diff options
Diffstat (limited to 'fs/f2fs')
| -rw-r--r-- | fs/f2fs/Kconfig | 107 | ||||
| -rw-r--r-- | fs/f2fs/Makefile | 7 | ||||
| -rw-r--r-- | fs/f2fs/acl.c | 357 | ||||
| -rw-r--r-- | fs/f2fs/acl.h | 22 | ||||
| -rw-r--r-- | fs/f2fs/checkpoint.c | 2143 | ||||
| -rw-r--r-- | fs/f2fs/compress.c | 2101 | ||||
| -rw-r--r-- | fs/f2fs/data.c | 4431 | ||||
| -rw-r--r-- | fs/f2fs/debug.c | 837 | ||||
| -rw-r--r-- | fs/f2fs/dir.c | 1227 | ||||
| -rw-r--r-- | fs/f2fs/extent_cache.c | 1255 | ||||
| -rw-r--r-- | fs/f2fs/f2fs.h | 4842 | ||||
| -rw-r--r-- | fs/f2fs/file.c | 5473 | ||||
| -rw-r--r-- | fs/f2fs/gc.c | 2250 | ||||
| -rw-r--r-- | fs/f2fs/gc.h | 190 | ||||
| -rw-r--r-- | fs/f2fs/hash.c | 72 | ||||
| -rw-r--r-- | fs/f2fs/inline.c | 834 | ||||
| -rw-r--r-- | fs/f2fs/inode.c | 1031 | ||||
| -rw-r--r-- | fs/f2fs/iostat.c | 315 | ||||
| -rw-r--r-- | fs/f2fs/iostat.h | 85 | ||||
| -rw-r--r-- | fs/f2fs/namei.c | 1399 | ||||
| -rw-r--r-- | fs/f2fs/node.c | 3554 | ||||
| -rw-r--r-- | fs/f2fs/node.h | 413 | ||||
| -rw-r--r-- | fs/f2fs/recovery.c | 1015 | ||||
| -rw-r--r-- | fs/f2fs/segment.c | 5683 | ||||
| -rw-r--r-- | fs/f2fs/segment.h | 897 | ||||
| -rw-r--r-- | fs/f2fs/shrinker.c | 246 | ||||
| -rw-r--r-- | fs/f2fs/super.c | 5423 | ||||
| -rw-r--r-- | fs/f2fs/sysfs.c | 2004 | ||||
| -rw-r--r-- | fs/f2fs/verity.c | 297 | ||||
| -rw-r--r-- | fs/f2fs/xattr.c | 871 | ||||
| -rw-r--r-- | fs/f2fs/xattr.h | 90 |
31 files changed, 43195 insertions, 6276 deletions
diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig index e06e0995e00f..5916a02fb46d 100644 --- a/fs/f2fs/Kconfig +++ b/fs/f2fs/Kconfig @@ -1,6 +1,20 @@ +# SPDX-License-Identifier: GPL-2.0-only config F2FS_FS - tristate "F2FS filesystem support (EXPERIMENTAL)" + tristate "F2FS filesystem support" depends on BLOCK + select BUFFER_HEAD + select NLS + select CRC32 + select F2FS_FS_XATTR if FS_ENCRYPTION + select FS_ENCRYPTION_ALGS if FS_ENCRYPTION + select FS_IOMAP + select LZ4_COMPRESS if F2FS_FS_LZ4 + select LZ4_DECOMPRESS if F2FS_FS_LZ4 + select LZ4HC_COMPRESS if F2FS_FS_LZ4HC + select LZO_COMPRESS if F2FS_FS_LZO + select LZO_DECOMPRESS if F2FS_FS_LZO + select ZSTD_COMPRESS if F2FS_FS_ZSTD + select ZSTD_DECOMPRESS if F2FS_FS_ZSTD help F2FS is based on Log-structured File System (LFS), which supports versatile "flash-friendly" features. The design has been focused on @@ -16,14 +30,14 @@ config F2FS_FS config F2FS_STAT_FS bool "F2FS Status Information" - depends on F2FS_FS && DEBUG_FS + depends on F2FS_FS default y help /sys/kernel/debug/f2fs/ contains information about all the partitions mounted as f2fs. Each file shows the whole f2fs information. /sys/kernel/debug/f2fs/status includes: - - major file system information managed by f2fs currently + - major filesystem information managed by f2fs currently - average SIT information about whole segments - current memory footprint consumed by f2fs. @@ -33,8 +47,7 @@ config F2FS_FS_XATTR default y help Extended attributes are name:value pairs associated with inodes by - the kernel or by users (see the attr(5) manual page, or visit - <http://acl.bestbits.at/> for details). + the kernel or by users (see the attr(5) manual page for details). If unsure, say N. @@ -45,10 +58,7 @@ config F2FS_FS_POSIX_ACL default y help Posix Access Control Lists (ACLs) support permissions for users and - gourps beyond the owner/group/world scheme. 
- - To learn more about Access Control Lists, visit the POSIX ACLs for - Linux website <http://acl.bestbits.at/>. + groups beyond the owner/group/world scheme. If you don't know what Access Control Lists are, say N @@ -60,6 +70,83 @@ config F2FS_FS_SECURITY Security Models (LSMs) accepted by AppArmor, SELinux, Smack and TOMOYO Linux. This option enables an extended attribute handler for file security labels in the f2fs filesystem, so that it requires enabling - the extended attribute support in advance. + the extended attribute support in advance. In particular you need this + option if you use the setcap command to assign initial process capabi- + lities to executables (the security.* extended attributes). If you are not using a security module, say N. + +config F2FS_CHECK_FS + bool "F2FS consistency checking feature" + depends on F2FS_FS + help + Enables BUG_ONs which check the filesystem consistency in runtime. + + If you want to improve the performance, say N. + +config F2FS_FAULT_INJECTION + bool "F2FS fault injection facility" + depends on F2FS_FS + help + Test F2FS to inject faults such as ENOMEM, ENOSPC, and so on. + + If unsure, say N. + +config F2FS_FS_COMPRESSION + bool "F2FS compression feature" + depends on F2FS_FS + help + Enable filesystem-level compression on f2fs regular files, + multiple back-end compression algorithms are supported. + +config F2FS_FS_LZO + bool "LZO compression support" + depends on F2FS_FS_COMPRESSION + default y + help + Support LZO compress algorithm, if unsure, say Y. + +config F2FS_FS_LZORLE + bool "LZO-RLE compression support" + depends on F2FS_FS_LZO + default y + help + Support LZO-RLE compress algorithm, if unsure, say Y. + +config F2FS_FS_LZ4 + bool "LZ4 compression support" + depends on F2FS_FS_COMPRESSION + default y + help + Support LZ4 compress algorithm, if unsure, say Y. 
+ +config F2FS_FS_LZ4HC + bool "LZ4HC compression support" + depends on F2FS_FS_LZ4 + default y + help + Support LZ4HC compress algorithm, LZ4HC has compatible on-disk + layout with LZ4, if unsure, say Y. + +config F2FS_FS_ZSTD + bool "ZSTD compression support" + depends on F2FS_FS_COMPRESSION + default y + help + Support ZSTD compress algorithm, if unsure, say Y. + +config F2FS_IOSTAT + bool "F2FS IO statistics information" + depends on F2FS_FS + default y + help + Support getting IO statistics through sysfs and printing out periodic + IO statistics tracepoint events. You have to turn on "iostat_enable" + sysfs node to enable this feature. + +config F2FS_UNFAIR_RWSEM + bool "F2FS unfair rw_semaphore" + depends on F2FS_FS && BLK_CGROUP + help + Use unfair rw_semaphore, if system configured IO priority by block + cgroup. diff --git a/fs/f2fs/Makefile b/fs/f2fs/Makefile index 27a0820340b9..8a7322d229e4 100644 --- a/fs/f2fs/Makefile +++ b/fs/f2fs/Makefile @@ -1,7 +1,12 @@ +# SPDX-License-Identifier: GPL-2.0 obj-$(CONFIG_F2FS_FS) += f2fs.o -f2fs-y := dir.o file.o inode.o namei.o hash.o super.o +f2fs-y := dir.o file.o inode.o namei.o hash.o super.o inline.o f2fs-y += checkpoint.o gc.o data.o node.o segment.o recovery.o +f2fs-y += shrinker.o extent_cache.o sysfs.o f2fs-$(CONFIG_F2FS_STAT_FS) += debug.o f2fs-$(CONFIG_F2FS_FS_XATTR) += xattr.o f2fs-$(CONFIG_F2FS_FS_POSIX_ACL) += acl.o +f2fs-$(CONFIG_FS_VERITY) += verity.o +f2fs-$(CONFIG_F2FS_FS_COMPRESSION) += compress.o +f2fs-$(CONFIG_F2FS_IOSTAT) += iostat.o diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c index b7826ec1b470..fa8d81a30fb9 100644 --- a/fs/f2fs/acl.c +++ b/fs/f2fs/acl.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * fs/f2fs/acl.c * @@ -7,19 +8,13 @@ * Portions of this code from linux/fs/ext2/acl.c * * Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 
2 as - * published by the Free Software Foundation. */ +#include <linux/fs_struct.h> #include <linux/f2fs_fs.h> #include "f2fs.h" #include "xattr.h" #include "acl.h" -#define get_inode_mode(i) ((is_inode_flag_set(F2FS_I(i), FI_ACL_MODE)) ? \ - (F2FS_I(i)->i_acl_mode) : ((i)->i_mode)) - static inline size_t f2fs_acl_size(int count) { if (count <= 4) { @@ -35,6 +30,7 @@ static inline size_t f2fs_acl_size(int count) static inline int f2fs_acl_count(size_t size) { ssize_t s; + size -= sizeof(struct f2fs_acl_header); s = size - 4 * sizeof(struct f2fs_acl_entry_short); if (s < 0) { @@ -56,6 +52,9 @@ static struct posix_acl *f2fs_acl_from_disk(const char *value, size_t size) struct f2fs_acl_entry *entry = (struct f2fs_acl_entry *)(hdr + 1); const char *end = value + size; + if (size < sizeof(struct f2fs_acl_header)) + return ERR_PTR(-EINVAL); + if (hdr->a_version != cpu_to_le32(F2FS_ACL_VERSION)) return ERR_PTR(-EINVAL); @@ -65,7 +64,7 @@ static struct posix_acl *f2fs_acl_from_disk(const char *value, size_t size) if (count == 0) return NULL; - acl = posix_acl_alloc(count, GFP_KERNEL); + acl = posix_acl_alloc(count, GFP_NOFS); if (!acl) return ERR_PTR(-ENOMEM); @@ -112,14 +111,16 @@ fail: return ERR_PTR(-EINVAL); } -static void *f2fs_acl_to_disk(const struct posix_acl *acl, size_t *size) +static void *f2fs_acl_to_disk(struct f2fs_sb_info *sbi, + const struct posix_acl *acl, size_t *size) { struct f2fs_acl_header *f2fs_acl; struct f2fs_acl_entry *entry; int i; - f2fs_acl = kmalloc(sizeof(struct f2fs_acl_header) + acl->a_count * - sizeof(struct f2fs_acl_entry), GFP_KERNEL); + f2fs_acl = f2fs_kmalloc(sbi, sizeof(struct f2fs_acl_header) + + acl->a_count * sizeof(struct f2fs_acl_entry), + GFP_NOFS); if (!f2fs_acl) return ERR_PTR(-ENOMEM); @@ -165,30 +166,24 @@ fail: return ERR_PTR(-EINVAL); } -struct posix_acl *f2fs_get_acl(struct inode *inode, int type) +static struct posix_acl *__f2fs_get_acl(struct inode *inode, int type, + struct folio *dfolio) { - struct f2fs_sb_info *sbi 
= F2FS_SB(inode->i_sb); int name_index = F2FS_XATTR_INDEX_POSIX_ACL_DEFAULT; void *value = NULL; struct posix_acl *acl; int retval; - if (!test_opt(sbi, POSIX_ACL)) - return NULL; - - acl = get_cached_acl(inode, type); - if (acl != ACL_NOT_CACHED) - return acl; - if (type == ACL_TYPE_ACCESS) name_index = F2FS_XATTR_INDEX_POSIX_ACL_ACCESS; - retval = f2fs_getxattr(inode, name_index, "", NULL, 0); + retval = f2fs_getxattr(inode, name_index, "", NULL, 0, dfolio); if (retval > 0) { - value = kmalloc(retval, GFP_KERNEL); + value = f2fs_kmalloc(F2FS_I_SB(inode), retval, GFP_F2FS_ZERO); if (!value) return ERR_PTR(-ENOMEM); - retval = f2fs_getxattr(inode, name_index, "", value, retval); + retval = f2fs_getxattr(inode, name_index, "", value, + retval, dfolio); } if (retval > 0) @@ -199,36 +194,56 @@ struct posix_acl *f2fs_get_acl(struct inode *inode, int type) acl = ERR_PTR(retval); kfree(value); - if (!IS_ERR(acl)) - set_cached_acl(inode, type, acl); - return acl; } -static int f2fs_set_acl(struct inode *inode, int type, struct posix_acl *acl) +struct posix_acl *f2fs_get_acl(struct inode *inode, int type, bool rcu) +{ + if (rcu) + return ERR_PTR(-ECHILD); + + return __f2fs_get_acl(inode, type, NULL); +} + +static int f2fs_acl_update_mode(struct mnt_idmap *idmap, + struct inode *inode, umode_t *mode_p, + struct posix_acl **acl) +{ + umode_t mode = inode->i_mode; + int error; + + if (is_inode_flag_set(inode, FI_ACL_MODE)) + mode = F2FS_I(inode)->i_acl_mode; + + error = posix_acl_equiv_mode(*acl, &mode); + if (error < 0) + return error; + if (error == 0) + *acl = NULL; + if (!in_group_or_capable(idmap, inode, i_gid_into_vfsgid(idmap, inode))) + mode &= ~S_ISGID; + *mode_p = mode; + return 0; +} + +static int __f2fs_set_acl(struct mnt_idmap *idmap, + struct inode *inode, int type, + struct posix_acl *acl, struct folio *ifolio) { - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); - struct f2fs_inode_info *fi = F2FS_I(inode); int name_index; void *value = NULL; size_t size = 0; 
int error; - - if (!test_opt(sbi, POSIX_ACL)) - return 0; - if (S_ISLNK(inode->i_mode)) - return -EOPNOTSUPP; + umode_t mode = inode->i_mode; switch (type) { case ACL_TYPE_ACCESS: name_index = F2FS_XATTR_INDEX_POSIX_ACL_ACCESS; - if (acl) { - error = posix_acl_equiv_mode(acl, &inode->i_mode); - if (error < 0) + if (acl && !ifolio) { + error = f2fs_acl_update_mode(idmap, inode, &mode, &acl); + if (error) return error; - set_acl_inode(fi, inode->i_mode); - if (error == 0) - acl = NULL; + set_acl_inode(inode, mode); } break; @@ -243,170 +258,184 @@ static int f2fs_set_acl(struct inode *inode, int type, struct posix_acl *acl) } if (acl) { - value = f2fs_acl_to_disk(acl, &size); + value = f2fs_acl_to_disk(F2FS_I_SB(inode), acl, &size); if (IS_ERR(value)) { - cond_clear_inode_flag(fi, FI_ACL_MODE); - return (int)PTR_ERR(value); + clear_inode_flag(inode, FI_ACL_MODE); + return PTR_ERR(value); } } - error = f2fs_setxattr(inode, name_index, "", value, size, NULL); + error = f2fs_setxattr(inode, name_index, "", value, size, ifolio, 0); kfree(value); if (!error) set_cached_acl(inode, type, acl); - cond_clear_inode_flag(fi, FI_ACL_MODE); + clear_inode_flag(inode, FI_ACL_MODE); return error; } -int f2fs_init_acl(struct inode *inode, struct inode *dir) +int f2fs_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, + struct posix_acl *acl, int type) { - struct posix_acl *acl = NULL; - struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); - int error = 0; - - if (!S_ISLNK(inode->i_mode)) { - if (test_opt(sbi, POSIX_ACL)) { - acl = f2fs_get_acl(dir, ACL_TYPE_DEFAULT); - if (IS_ERR(acl)) - return PTR_ERR(acl); - } - if (!acl) - inode->i_mode &= ~current_umask(); - } + struct inode *inode = d_inode(dentry); - if (test_opt(sbi, POSIX_ACL) && acl) { + if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) + return -EIO; - if (S_ISDIR(inode->i_mode)) { - error = f2fs_set_acl(inode, ACL_TYPE_DEFAULT, acl); - if (error) - goto cleanup; - } - error = posix_acl_create(&acl, GFP_KERNEL, &inode->i_mode); 
- if (error < 0) - return error; - if (error > 0) - error = f2fs_set_acl(inode, ACL_TYPE_ACCESS, acl); + return __f2fs_set_acl(idmap, inode, type, acl, NULL); +} + +/* + * Most part of f2fs_acl_clone, f2fs_acl_create_masq, f2fs_acl_create + * are copied from posix_acl.c + */ +static struct posix_acl *f2fs_acl_clone(const struct posix_acl *acl, + gfp_t flags) +{ + struct posix_acl *clone = NULL; + + if (acl) { + clone = kmemdup(acl, struct_size(acl, a_entries, acl->a_count), + flags); + if (clone) + refcount_set(&clone->a_refcount, 1); } -cleanup: - posix_acl_release(acl); - return error; + return clone; } -int f2fs_acl_chmod(struct inode *inode) +static int f2fs_acl_create_masq(struct posix_acl *acl, umode_t *mode_p) { - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); - struct posix_acl *acl; - int error; - umode_t mode = get_inode_mode(inode); + struct posix_acl_entry *pa, *pe; + struct posix_acl_entry *group_obj = NULL, *mask_obj = NULL; + umode_t mode = *mode_p; + int not_equiv = 0; - if (!test_opt(sbi, POSIX_ACL)) - return 0; - if (S_ISLNK(mode)) - return -EOPNOTSUPP; + /* assert(atomic_read(acl->a_refcount) == 1); */ - acl = f2fs_get_acl(inode, ACL_TYPE_ACCESS); - if (IS_ERR(acl) || !acl) - return PTR_ERR(acl); + FOREACH_ACL_ENTRY(pa, acl, pe) { + switch (pa->e_tag) { + case ACL_USER_OBJ: + pa->e_perm &= (mode >> 6) | ~S_IRWXO; + mode &= (pa->e_perm << 6) | ~S_IRWXU; + break; - error = posix_acl_chmod(&acl, GFP_KERNEL, mode); - if (error) - return error; - error = f2fs_set_acl(inode, ACL_TYPE_ACCESS, acl); - posix_acl_release(acl); - return error; + case ACL_USER: + case ACL_GROUP: + not_equiv = 1; + break; + + case ACL_GROUP_OBJ: + group_obj = pa; + break; + + case ACL_OTHER: + pa->e_perm &= mode | ~S_IRWXO; + mode &= pa->e_perm | ~S_IRWXO; + break; + + case ACL_MASK: + mask_obj = pa; + not_equiv = 1; + break; + + default: + return -EIO; + } + } + + if (mask_obj) { + mask_obj->e_perm &= (mode >> 3) | ~S_IRWXO; + mode &= (mask_obj->e_perm << 3) | ~S_IRWXG; + 
} else { + if (!group_obj) + return -EIO; + group_obj->e_perm &= (mode >> 3) | ~S_IRWXO; + mode &= (group_obj->e_perm << 3) | ~S_IRWXG; + } + + *mode_p = (*mode_p & ~S_IRWXUGO) | mode; + return not_equiv; } -static size_t f2fs_xattr_list_acl(struct dentry *dentry, char *list, - size_t list_size, const char *name, size_t name_len, int type) +static int f2fs_acl_create(struct inode *dir, umode_t *mode, + struct posix_acl **default_acl, struct posix_acl **acl, + struct folio *dfolio) { - struct f2fs_sb_info *sbi = F2FS_SB(dentry->d_sb); - const char *xname = POSIX_ACL_XATTR_DEFAULT; - size_t size; + struct posix_acl *p; + struct posix_acl *clone; + int ret; - if (!test_opt(sbi, POSIX_ACL)) + *acl = NULL; + *default_acl = NULL; + + if (S_ISLNK(*mode) || !IS_POSIXACL(dir)) return 0; - if (type == ACL_TYPE_ACCESS) - xname = POSIX_ACL_XATTR_ACCESS; + p = __f2fs_get_acl(dir, ACL_TYPE_DEFAULT, dfolio); + if (!p || p == ERR_PTR(-EOPNOTSUPP)) { + *mode &= ~current_umask(); + return 0; + } + if (IS_ERR(p)) + return PTR_ERR(p); - size = strlen(xname) + 1; - if (list && size <= list_size) - memcpy(list, xname, size); - return size; -} + clone = f2fs_acl_clone(p, GFP_NOFS); + if (!clone) { + ret = -ENOMEM; + goto release_acl; + } -static int f2fs_xattr_get_acl(struct dentry *dentry, const char *name, - void *buffer, size_t size, int type) -{ - struct f2fs_sb_info *sbi = F2FS_SB(dentry->d_sb); - struct posix_acl *acl; - int error; + ret = f2fs_acl_create_masq(clone, mode); + if (ret < 0) + goto release_clone; - if (strcmp(name, "") != 0) - return -EINVAL; - if (!test_opt(sbi, POSIX_ACL)) - return -EOPNOTSUPP; + if (ret == 0) + posix_acl_release(clone); + else + *acl = clone; - acl = f2fs_get_acl(dentry->d_inode, type); - if (IS_ERR(acl)) - return PTR_ERR(acl); - if (!acl) - return -ENODATA; - error = posix_acl_to_xattr(&init_user_ns, acl, buffer, size); - posix_acl_release(acl); + if (!S_ISDIR(*mode)) + posix_acl_release(p); + else + *default_acl = p; - return error; + return 0; + 
+release_clone: + posix_acl_release(clone); +release_acl: + posix_acl_release(p); + return ret; } -static int f2fs_xattr_set_acl(struct dentry *dentry, const char *name, - const void *value, size_t size, int flags, int type) +int f2fs_init_acl(struct inode *inode, struct inode *dir, struct folio *ifolio, + struct folio *dfolio) { - struct f2fs_sb_info *sbi = F2FS_SB(dentry->d_sb); - struct inode *inode = dentry->d_inode; - struct posix_acl *acl = NULL; + struct posix_acl *default_acl = NULL, *acl = NULL; int error; - if (strcmp(name, "") != 0) - return -EINVAL; - if (!test_opt(sbi, POSIX_ACL)) - return -EOPNOTSUPP; - if (!inode_owner_or_capable(inode)) - return -EPERM; - - if (value) { - acl = posix_acl_from_xattr(&init_user_ns, value, size); - if (IS_ERR(acl)) - return PTR_ERR(acl); - if (acl) { - error = posix_acl_valid(acl); - if (error) - goto release_and_out; - } + error = f2fs_acl_create(dir, &inode->i_mode, &default_acl, &acl, dfolio); + if (error) + return error; + + f2fs_mark_inode_dirty_sync(inode, true); + + if (default_acl) { + error = __f2fs_set_acl(NULL, inode, ACL_TYPE_DEFAULT, + default_acl, ifolio); + posix_acl_release(default_acl); } else { - acl = NULL; + inode->i_default_acl = NULL; + } + if (acl) { + if (!error) + error = __f2fs_set_acl(NULL, inode, ACL_TYPE_ACCESS, + acl, ifolio); + posix_acl_release(acl); + } else { + inode->i_acl = NULL; } - error = f2fs_set_acl(inode, type, acl); - -release_and_out: - posix_acl_release(acl); return error; } - -const struct xattr_handler f2fs_xattr_acl_default_handler = { - .prefix = POSIX_ACL_XATTR_DEFAULT, - .flags = ACL_TYPE_DEFAULT, - .list = f2fs_xattr_list_acl, - .get = f2fs_xattr_get_acl, - .set = f2fs_xattr_set_acl, -}; - -const struct xattr_handler f2fs_xattr_acl_access_handler = { - .prefix = POSIX_ACL_XATTR_ACCESS, - .flags = ACL_TYPE_ACCESS, - .list = f2fs_xattr_list_acl, - .get = f2fs_xattr_get_acl, - .set = f2fs_xattr_set_acl, -}; diff --git a/fs/f2fs/acl.h b/fs/f2fs/acl.h index 
80f430674417..20e87e63c089 100644 --- a/fs/f2fs/acl.h +++ b/fs/f2fs/acl.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * fs/f2fs/acl.h * @@ -7,10 +8,6 @@ * Portions of this code from linux/fs/ext2/acl.h * * Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. */ #ifndef __F2FS_ACL_H__ #define __F2FS_ACL_H__ @@ -36,20 +33,17 @@ struct f2fs_acl_header { #ifdef CONFIG_F2FS_FS_POSIX_ACL -extern struct posix_acl *f2fs_get_acl(struct inode *inode, int type); -extern int f2fs_acl_chmod(struct inode *inode); -extern int f2fs_init_acl(struct inode *inode, struct inode *dir); +struct posix_acl *f2fs_get_acl(struct inode *, int, bool); +int f2fs_set_acl(struct mnt_idmap *, struct dentry *, + struct posix_acl *, int); +int f2fs_init_acl(struct inode *, struct inode *, struct folio *ifolio, + struct folio *dfolio); #else -#define f2fs_check_acl NULL #define f2fs_get_acl NULL #define f2fs_set_acl NULL -static inline int f2fs_acl_chmod(struct inode *inode) -{ - return 0; -} - -static inline int f2fs_init_acl(struct inode *inode, struct inode *dir) +static inline int f2fs_init_acl(struct inode *inode, struct inode *dir, + struct folio *ifolio, struct folio *dfolio) { return 0; } diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 66a6b85a51d8..300664269eb6 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1,12 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0 /* * fs/f2fs/checkpoint.c * * Copyright (c) 2012 Samsung Electronics Co., Ltd. * http://www.samsung.com/ - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. 
*/ #include <linux/fs.h> #include <linux/bio.h> @@ -16,302 +13,794 @@ #include <linux/f2fs_fs.h> #include <linux/pagevec.h> #include <linux/swap.h> +#include <linux/kthread.h> #include "f2fs.h" #include "node.h" #include "segment.h" +#include "iostat.h" #include <trace/events/f2fs.h> -static struct kmem_cache *orphan_entry_slab; -static struct kmem_cache *inode_entry_slab; +#define DEFAULT_CHECKPOINT_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_RT, 3)) + +static struct kmem_cache *ino_entry_slab; +struct kmem_cache *f2fs_inode_entry_slab; + +void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi, bool end_io, + unsigned char reason) +{ + f2fs_build_fault_attr(sbi, 0, 0, FAULT_ALL); + if (!end_io) + f2fs_flush_merged_writes(sbi); + f2fs_handle_critical_error(sbi, reason); +} /* * We guarantee no failure on the returned page. */ -struct page *grab_meta_page(struct f2fs_sb_info *sbi, pgoff_t index) +struct folio *f2fs_grab_meta_folio(struct f2fs_sb_info *sbi, pgoff_t index) { - struct address_space *mapping = sbi->meta_inode->i_mapping; - struct page *page = NULL; + struct address_space *mapping = META_MAPPING(sbi); + struct folio *folio; repeat: - page = grab_cache_page(mapping, index); - if (!page) { + folio = f2fs_grab_cache_folio(mapping, index, false); + if (IS_ERR(folio)) { cond_resched(); goto repeat; } - - /* We wait writeback only inside grab_meta_page() */ - wait_on_page_writeback(page); - SetPageUptodate(page); - return page; + f2fs_folio_wait_writeback(folio, META, true, true); + if (!folio_test_uptodate(folio)) + folio_mark_uptodate(folio); + return folio; } -/* - * We guarantee no failure on the returned page. 
- */ -struct page *get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index) +static struct folio *__get_meta_folio(struct f2fs_sb_info *sbi, pgoff_t index, + bool is_meta) { - struct address_space *mapping = sbi->meta_inode->i_mapping; - struct page *page; + struct address_space *mapping = META_MAPPING(sbi); + struct folio *folio; + struct f2fs_io_info fio = { + .sbi = sbi, + .type = META, + .op = REQ_OP_READ, + .op_flags = REQ_META | REQ_PRIO, + .old_blkaddr = index, + .new_blkaddr = index, + .encrypted_page = NULL, + .is_por = !is_meta ? 1 : 0, + }; + int err; + + if (unlikely(!is_meta)) + fio.op_flags &= ~REQ_META; repeat: - page = grab_cache_page(mapping, index); - if (!page) { + folio = f2fs_grab_cache_folio(mapping, index, false); + if (IS_ERR(folio)) { cond_resched(); goto repeat; } - if (PageUptodate(page)) + if (folio_test_uptodate(folio)) goto out; - if (f2fs_readpage(sbi, page, index, READ_SYNC)) - goto repeat; + fio.folio = folio; + + err = f2fs_submit_page_bio(&fio); + if (err) { + f2fs_folio_put(folio, true); + return ERR_PTR(err); + } + + f2fs_update_iostat(sbi, NULL, FS_META_READ_IO, F2FS_BLKSIZE); - lock_page(page); - if (page->mapping != mapping) { - f2fs_put_page(page, 1); + folio_lock(folio); + if (unlikely(!is_meta_folio(folio))) { + f2fs_folio_put(folio, true); goto repeat; } + + if (unlikely(!folio_test_uptodate(folio))) { + f2fs_handle_page_eio(sbi, folio, META); + f2fs_folio_put(folio, true); + return ERR_PTR(-EIO); + } out: - mark_page_accessed(page); - return page; + return folio; } -static int f2fs_write_meta_page(struct page *page, - struct writeback_control *wbc) +struct folio *f2fs_get_meta_folio(struct f2fs_sb_info *sbi, pgoff_t index) { - struct inode *inode = page->mapping->host; - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + return __get_meta_folio(sbi, index, true); +} - /* Should not write any meta pages, if any IO error was occurred */ - if (wbc->for_reclaim || - is_set_ckpt_flags(F2FS_CKPT(sbi), CP_ERROR_FLAG)) { - 
dec_page_count(sbi, F2FS_DIRTY_META); - wbc->pages_skipped++; - set_page_dirty(page); - return AOP_WRITEPAGE_ACTIVATE; +struct folio *f2fs_get_meta_folio_retry(struct f2fs_sb_info *sbi, pgoff_t index) +{ + struct folio *folio; + int count = 0; + +retry: + folio = __get_meta_folio(sbi, index, true); + if (IS_ERR(folio)) { + if (PTR_ERR(folio) == -EIO && + ++count <= DEFAULT_RETRY_IO_COUNT) + goto retry; + f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_META_PAGE); } + return folio; +} + +/* for POR only */ +struct folio *f2fs_get_tmp_folio(struct f2fs_sb_info *sbi, pgoff_t index) +{ + return __get_meta_folio(sbi, index, false); +} - wait_on_page_writeback(page); +static bool __is_bitmap_valid(struct f2fs_sb_info *sbi, block_t blkaddr, + int type) +{ + struct seg_entry *se; + unsigned int segno, offset; + bool exist; + + if (type == DATA_GENERIC) + return true; + + segno = GET_SEGNO(sbi, blkaddr); + offset = GET_BLKOFF_FROM_SEG0(sbi, blkaddr); + se = get_seg_entry(sbi, segno); + + exist = f2fs_test_bit(offset, se->cur_valid_map); + + /* skip data, if we already have an error in checkpoint. 
*/ + if (unlikely(f2fs_cp_error(sbi))) + return exist; + + if ((exist && type == DATA_GENERIC_ENHANCE_UPDATE) || + (!exist && type == DATA_GENERIC_ENHANCE)) + goto out_err; + if (!exist && type != DATA_GENERIC_ENHANCE_UPDATE) + goto out_handle; + return exist; + +out_err: + f2fs_err(sbi, "Inconsistent error blkaddr:%u, sit bitmap:%d", + blkaddr, exist); + set_sbi_flag(sbi, SBI_NEED_FSCK); + dump_stack(); +out_handle: + f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR); + return exist; +} + +static bool __f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi, + block_t blkaddr, int type) +{ + switch (type) { + case META_NAT: + break; + case META_SIT: + if (unlikely(blkaddr >= SIT_BLK_CNT(sbi))) + goto check_only; + break; + case META_SSA: + if (unlikely(blkaddr >= MAIN_BLKADDR(sbi) || + blkaddr < SM_I(sbi)->ssa_blkaddr)) + goto check_only; + break; + case META_CP: + if (unlikely(blkaddr >= SIT_I(sbi)->sit_base_addr || + blkaddr < __start_cp_addr(sbi))) + goto check_only; + break; + case META_POR: + if (unlikely(blkaddr >= MAX_BLKADDR(sbi) || + blkaddr < MAIN_BLKADDR(sbi))) + goto check_only; + break; + case DATA_GENERIC: + case DATA_GENERIC_ENHANCE: + case DATA_GENERIC_ENHANCE_READ: + case DATA_GENERIC_ENHANCE_UPDATE: + if (unlikely(blkaddr >= MAX_BLKADDR(sbi) || + blkaddr < MAIN_BLKADDR(sbi))) { + + /* Skip to emit an error message. 
*/ + if (unlikely(f2fs_cp_error(sbi))) + return false; + + f2fs_warn(sbi, "access invalid blkaddr:%u", + blkaddr); + set_sbi_flag(sbi, SBI_NEED_FSCK); + dump_stack(); + goto err; + } else { + return __is_bitmap_valid(sbi, blkaddr, type); + } + break; + case META_GENERIC: + if (unlikely(blkaddr < SEG0_BLKADDR(sbi) || + blkaddr >= MAIN_BLKADDR(sbi))) + goto err; + break; + default: + BUG(); + } + + return true; +err: + f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR); +check_only: + return false; +} - write_meta_page(sbi, page); +bool f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi, + block_t blkaddr, int type) +{ + if (time_to_inject(sbi, FAULT_BLKADDR_VALIDITY)) + return false; + return __f2fs_is_valid_blkaddr(sbi, blkaddr, type); +} + +bool f2fs_is_valid_blkaddr_raw(struct f2fs_sb_info *sbi, + block_t blkaddr, int type) +{ + return __f2fs_is_valid_blkaddr(sbi, blkaddr, type); +} + +/* + * Readahead CP/NAT/SIT/SSA/POR pages + */ +int f2fs_ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, + int type, bool sync) +{ + block_t blkno = start; + struct f2fs_io_info fio = { + .sbi = sbi, + .type = META, + .op = REQ_OP_READ, + .op_flags = sync ? (REQ_META | REQ_PRIO) : REQ_RAHEAD, + .encrypted_page = NULL, + .in_list = 0, + .is_por = (type == META_POR) ? 
1 : 0, + }; + struct blk_plug plug; + int err; + + if (unlikely(type == META_POR)) + fio.op_flags &= ~REQ_META; + + blk_start_plug(&plug); + for (; nrpages-- > 0; blkno++) { + struct folio *folio; + + if (!f2fs_is_valid_blkaddr(sbi, blkno, type)) + goto out; + + switch (type) { + case META_NAT: + if (unlikely(blkno >= + NAT_BLOCK_OFFSET(NM_I(sbi)->max_nid))) + blkno = 0; + /* get nat block addr */ + fio.new_blkaddr = current_nat_addr(sbi, + blkno * NAT_ENTRY_PER_BLOCK); + break; + case META_SIT: + if (unlikely(blkno >= TOTAL_SEGS(sbi))) + goto out; + /* get sit block addr */ + fio.new_blkaddr = current_sit_addr(sbi, + blkno * SIT_ENTRY_PER_BLOCK); + break; + case META_SSA: + case META_CP: + case META_POR: + fio.new_blkaddr = blkno; + break; + default: + BUG(); + } + + folio = f2fs_grab_cache_folio(META_MAPPING(sbi), + fio.new_blkaddr, false); + if (IS_ERR(folio)) + continue; + if (folio_test_uptodate(folio)) { + f2fs_folio_put(folio, true); + continue; + } + + fio.folio = folio; + err = f2fs_submit_page_bio(&fio); + f2fs_folio_put(folio, err ? 
true : false); + + if (!err) + f2fs_update_iostat(sbi, NULL, FS_META_READ_IO, + F2FS_BLKSIZE); + } +out: + blk_finish_plug(&plug); + return blkno - start; +} + +void f2fs_ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index, + unsigned int ra_blocks) +{ + struct folio *folio; + bool readahead = false; + + if (ra_blocks == RECOVERY_MIN_RA_BLOCKS) + return; + + folio = filemap_get_folio(META_MAPPING(sbi), index); + if (IS_ERR(folio) || !folio_test_uptodate(folio)) + readahead = true; + f2fs_folio_put(folio, false); + + if (readahead) + f2fs_ra_meta_pages(sbi, index, ra_blocks, META_POR, true); +} + +static bool __f2fs_write_meta_folio(struct folio *folio, + struct writeback_control *wbc, + enum iostat_type io_type) +{ + struct f2fs_sb_info *sbi = F2FS_F_SB(folio); + + trace_f2fs_writepage(folio, META); + + if (unlikely(f2fs_cp_error(sbi))) { + if (is_sbi_flag_set(sbi, SBI_IS_CLOSE)) { + folio_clear_uptodate(folio); + dec_page_count(sbi, F2FS_DIRTY_META); + folio_unlock(folio); + return true; + } + goto redirty_out; + } + if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) + goto redirty_out; + + f2fs_do_write_meta_page(sbi, folio, io_type); dec_page_count(sbi, F2FS_DIRTY_META); - unlock_page(page); - return 0; + + folio_unlock(folio); + + if (unlikely(f2fs_cp_error(sbi))) + f2fs_submit_merged_write(sbi, META); + + return true; + +redirty_out: + folio_redirty_for_writepage(wbc, folio); + return false; } static int f2fs_write_meta_pages(struct address_space *mapping, struct writeback_control *wbc) { - struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb); - struct block_device *bdev = sbi->sb->s_bdev; - long written; - - if (wbc->for_kupdate) - return 0; - - if (get_pages(sbi, F2FS_DIRTY_META) == 0) - return 0; + struct f2fs_sb_info *sbi = F2FS_M_SB(mapping); + long diff, written; + + if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) + goto skip_write; + + /* collect a number of dirty meta pages and write together */ + if (wbc->sync_mode != WB_SYNC_ALL && + 
get_pages(sbi, F2FS_DIRTY_META) < + nr_pages_to_skip(sbi, META)) + goto skip_write; + + /* if locked failed, cp will flush dirty pages instead */ + if (!f2fs_down_write_trylock(&sbi->cp_global_sem)) + goto skip_write; + + trace_f2fs_writepages(mapping->host, wbc, META); + diff = nr_pages_to_write(sbi, META, wbc); + written = f2fs_sync_meta_pages(sbi, META, wbc->nr_to_write, FS_META_IO); + f2fs_up_write(&sbi->cp_global_sem); + wbc->nr_to_write = max((long)0, wbc->nr_to_write - written - diff); + return 0; - /* if mounting is failed, skip writing node pages */ - mutex_lock(&sbi->cp_mutex); - written = sync_meta_pages(sbi, META, bio_get_nr_vecs(bdev)); - mutex_unlock(&sbi->cp_mutex); - wbc->nr_to_write -= written; +skip_write: + wbc->pages_skipped += get_pages(sbi, F2FS_DIRTY_META); + trace_f2fs_writepages(mapping->host, wbc, META); return 0; } -long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type, - long nr_to_write) +long f2fs_sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type, + long nr_to_write, enum iostat_type io_type) { - struct address_space *mapping = sbi->meta_inode->i_mapping; - pgoff_t index = 0, end = LONG_MAX; - struct pagevec pvec; + struct address_space *mapping = META_MAPPING(sbi); + pgoff_t index = 0, prev = ULONG_MAX; + struct folio_batch fbatch; long nwritten = 0; - struct writeback_control wbc = { - .for_reclaim = 0, - }; + int nr_folios; + struct writeback_control wbc = {}; + struct blk_plug plug; - pagevec_init(&pvec, 0); + folio_batch_init(&fbatch); - while (index <= end) { - int i, nr_pages; - nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, - PAGECACHE_TAG_DIRTY, - min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); - if (nr_pages == 0) - break; + blk_start_plug(&plug); + + while ((nr_folios = filemap_get_folios_tag(mapping, &index, + (pgoff_t)-1, + PAGECACHE_TAG_DIRTY, &fbatch))) { + int i; + + for (i = 0; i < nr_folios; i++) { + struct folio *folio = fbatch.folios[i]; + + if (nr_to_write != LONG_MAX && i != 0 && + 
folio->index != prev + + folio_nr_pages(fbatch.folios[i-1])) { + folio_batch_release(&fbatch); + goto stop; + } + + folio_lock(folio); + + if (unlikely(!is_meta_folio(folio))) { +continue_unlock: + folio_unlock(folio); + continue; + } + if (!folio_test_dirty(folio)) { + /* someone wrote it for us */ + goto continue_unlock; + } - for (i = 0; i < nr_pages; i++) { - struct page *page = pvec.pages[i]; - lock_page(page); - BUG_ON(page->mapping != mapping); - BUG_ON(!PageDirty(page)); - clear_page_dirty_for_io(page); - if (f2fs_write_meta_page(page, &wbc)) { - unlock_page(page); + f2fs_folio_wait_writeback(folio, META, true, true); + + if (!folio_clear_dirty_for_io(folio)) + goto continue_unlock; + + if (!__f2fs_write_meta_folio(folio, &wbc, + io_type)) { + folio_unlock(folio); break; } - if (nwritten++ >= nr_to_write) + nwritten += folio_nr_pages(folio); + prev = folio->index; + if (unlikely(nwritten >= nr_to_write)) break; } - pagevec_release(&pvec); + folio_batch_release(&fbatch); cond_resched(); } - +stop: if (nwritten) - f2fs_submit_bio(sbi, type, nr_to_write == LONG_MAX); + f2fs_submit_merged_write(sbi, type); + + blk_finish_plug(&plug); return nwritten; } -static int f2fs_set_meta_page_dirty(struct page *page) +static bool f2fs_dirty_meta_folio(struct address_space *mapping, + struct folio *folio) { - struct address_space *mapping = page->mapping; - struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb); - - SetPageUptodate(page); - if (!PageDirty(page)) { - __set_page_dirty_nobuffers(page); - inc_page_count(sbi, F2FS_DIRTY_META); - return 1; + trace_f2fs_set_page_dirty(folio, META); + + if (!folio_test_uptodate(folio)) + folio_mark_uptodate(folio); + if (filemap_dirty_folio(mapping, folio)) { + inc_page_count(F2FS_M_SB(mapping), F2FS_DIRTY_META); + folio_set_f2fs_reference(folio); + return true; } - return 0; + return false; } const struct address_space_operations f2fs_meta_aops = { - .writepage = f2fs_write_meta_page, .writepages = f2fs_write_meta_pages, - 
.set_page_dirty = f2fs_set_meta_page_dirty, + .dirty_folio = f2fs_dirty_meta_folio, + .invalidate_folio = f2fs_invalidate_folio, + .release_folio = f2fs_release_folio, + .migrate_folio = filemap_migrate_folio, }; -int check_orphan_space(struct f2fs_sb_info *sbi) +static void __add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, + unsigned int devidx, int type) { - unsigned int max_orphans; - int err = 0; + struct inode_management *im = &sbi->im[type]; + struct ino_entry *e = NULL, *new = NULL; + int ret; + + if (type == FLUSH_INO) { + rcu_read_lock(); + e = radix_tree_lookup(&im->ino_root, ino); + rcu_read_unlock(); + } - /* - * considering 512 blocks in a segment 5 blocks are needed for cp - * and log segment summaries. Remaining blocks are used to keep - * orphan entries with the limitation one reserved segment - * for cp pack we can have max 1020*507 orphan entries - */ - max_orphans = (sbi->blocks_per_seg - 5) * F2FS_ORPHANS_PER_BLOCK; - mutex_lock(&sbi->orphan_inode_mutex); - if (sbi->n_orphans >= max_orphans) - err = -ENOSPC; - mutex_unlock(&sbi->orphan_inode_mutex); - return err; +retry: + if (!e) + new = f2fs_kmem_cache_alloc(ino_entry_slab, + GFP_NOFS, true, NULL); + + ret = radix_tree_preload(GFP_NOFS | __GFP_NOFAIL); + f2fs_bug_on(sbi, ret); + + spin_lock(&im->ino_lock); + e = radix_tree_lookup(&im->ino_root, ino); + if (!e) { + if (!new) { + spin_unlock(&im->ino_lock); + radix_tree_preload_end(); + goto retry; + } + e = new; + if (unlikely(radix_tree_insert(&im->ino_root, ino, e))) + f2fs_bug_on(sbi, 1); + + memset(e, 0, sizeof(struct ino_entry)); + e->ino = ino; + + list_add_tail(&e->list, &im->ino_list); + if (type != ORPHAN_INO) + im->ino_num++; + } + + if (type == FLUSH_INO) + f2fs_set_bit(devidx, (char *)&e->dirty_device); + + spin_unlock(&im->ino_lock); + radix_tree_preload_end(); + + if (new && e != new) + kmem_cache_free(ino_entry_slab, new); } -void add_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) +static void __remove_ino_entry(struct 
f2fs_sb_info *sbi, nid_t ino, int type) { - struct list_head *head, *this; - struct orphan_inode_entry *new = NULL, *orphan = NULL; + struct inode_management *im = &sbi->im[type]; + struct ino_entry *e; + + spin_lock(&im->ino_lock); + e = radix_tree_lookup(&im->ino_root, ino); + if (e) { + list_del(&e->list); + radix_tree_delete(&im->ino_root, ino); + im->ino_num--; + spin_unlock(&im->ino_lock); + kmem_cache_free(ino_entry_slab, e); + return; + } + spin_unlock(&im->ino_lock); +} - mutex_lock(&sbi->orphan_inode_mutex); - head = &sbi->orphan_inode_list; - list_for_each(this, head) { - orphan = list_entry(this, struct orphan_inode_entry, list); - if (orphan->ino == ino) - goto out; - if (orphan->ino > ino) - break; - orphan = NULL; +void f2fs_add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type) +{ + /* add new dirty ino entry into list */ + __add_ino_entry(sbi, ino, 0, type); +} + +void f2fs_remove_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type) +{ + /* remove dirty ino entry from list */ + __remove_ino_entry(sbi, ino, type); +} + +/* mode should be APPEND_INO, UPDATE_INO or TRANS_DIR_INO */ +bool f2fs_exist_written_data(struct f2fs_sb_info *sbi, nid_t ino, int mode) +{ + struct inode_management *im = &sbi->im[mode]; + struct ino_entry *e; + + spin_lock(&im->ino_lock); + e = radix_tree_lookup(&im->ino_root, ino); + spin_unlock(&im->ino_lock); + return e ? true : false; +} + +void f2fs_release_ino_entry(struct f2fs_sb_info *sbi, bool all) +{ + struct ino_entry *e, *tmp; + int i; + + for (i = all ? 
ORPHAN_INO : APPEND_INO; i < MAX_INO_ENTRY; i++) { + struct inode_management *im = &sbi->im[i]; + + spin_lock(&im->ino_lock); + list_for_each_entry_safe(e, tmp, &im->ino_list, list) { + list_del(&e->list); + radix_tree_delete(&im->ino_root, e->ino); + kmem_cache_free(ino_entry_slab, e); + im->ino_num--; + } + spin_unlock(&im->ino_lock); } -retry: - new = kmem_cache_alloc(orphan_entry_slab, GFP_ATOMIC); - if (!new) { - cond_resched(); - goto retry; +} + +void f2fs_set_dirty_device(struct f2fs_sb_info *sbi, nid_t ino, + unsigned int devidx, int type) +{ + __add_ino_entry(sbi, ino, devidx, type); +} + +bool f2fs_is_dirty_device(struct f2fs_sb_info *sbi, nid_t ino, + unsigned int devidx, int type) +{ + struct inode_management *im = &sbi->im[type]; + struct ino_entry *e; + bool is_dirty = false; + + spin_lock(&im->ino_lock); + e = radix_tree_lookup(&im->ino_root, ino); + if (e && f2fs_test_bit(devidx, (char *)&e->dirty_device)) + is_dirty = true; + spin_unlock(&im->ino_lock); + return is_dirty; +} + +int f2fs_acquire_orphan_inode(struct f2fs_sb_info *sbi) +{ + struct inode_management *im = &sbi->im[ORPHAN_INO]; + int err = 0; + + spin_lock(&im->ino_lock); + + if (time_to_inject(sbi, FAULT_ORPHAN)) { + spin_unlock(&im->ino_lock); + return -ENOSPC; } - new->ino = ino; - /* add new_oentry into list which is sorted by inode number */ - if (orphan) - list_add(&new->list, this->prev); + if (unlikely(im->ino_num >= sbi->max_orphans)) + err = -ENOSPC; else - list_add_tail(&new->list, head); + im->ino_num++; + spin_unlock(&im->ino_lock); - sbi->n_orphans++; -out: - mutex_unlock(&sbi->orphan_inode_mutex); + return err; } -void remove_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) +void f2fs_release_orphan_inode(struct f2fs_sb_info *sbi) { - struct list_head *this, *next, *head; - struct orphan_inode_entry *orphan; + struct inode_management *im = &sbi->im[ORPHAN_INO]; - mutex_lock(&sbi->orphan_inode_mutex); - head = &sbi->orphan_inode_list; - list_for_each_safe(this, next, 
head) { - orphan = list_entry(this, struct orphan_inode_entry, list); - if (orphan->ino == ino) { - list_del(&orphan->list); - kmem_cache_free(orphan_entry_slab, orphan); - sbi->n_orphans--; - break; - } - } - mutex_unlock(&sbi->orphan_inode_mutex); + spin_lock(&im->ino_lock); + f2fs_bug_on(sbi, im->ino_num == 0); + im->ino_num--; + spin_unlock(&im->ino_lock); +} + +void f2fs_add_orphan_inode(struct inode *inode) +{ + /* add new orphan ino entry into list */ + __add_ino_entry(F2FS_I_SB(inode), inode->i_ino, 0, ORPHAN_INO); + f2fs_update_inode_page(inode); +} + +void f2fs_remove_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) +{ + /* remove orphan entry from orphan list */ + __remove_ino_entry(sbi, ino, ORPHAN_INO); } -static void recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) +static int recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) { - struct inode *inode = f2fs_iget(sbi->sb, ino); - BUG_ON(IS_ERR(inode)); + struct inode *inode; + struct node_info ni; + int err; + + inode = f2fs_iget_retry(sbi->sb, ino); + if (IS_ERR(inode)) { + /* + * there should be a bug that we can't find the entry + * to orphan inode. + */ + f2fs_bug_on(sbi, PTR_ERR(inode) == -ENOENT); + return PTR_ERR(inode); + } + + err = f2fs_dquot_initialize(inode); + if (err) { + iput(inode); + goto err_out; + } + clear_nlink(inode); /* truncate all the data during iput */ iput(inode); + + err = f2fs_get_node_info(sbi, ino, &ni, false); + if (err) + goto err_out; + + /* ENOMEM was fully retried in f2fs_evict_inode. 
*/ + if (ni.blk_addr != NULL_ADDR) { + err = -EIO; + goto err_out; + } + return 0; + +err_out: + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_warn(sbi, "%s: orphan failed (ino=%x), run fsck to fix.", + __func__, ino); + return err; } -int recover_orphan_inodes(struct f2fs_sb_info *sbi) +int f2fs_recover_orphan_inodes(struct f2fs_sb_info *sbi) { - block_t start_blk, orphan_blkaddr, i, j; + block_t start_blk, orphan_blocks, i, j; + int err = 0; - if (!is_set_ckpt_flags(F2FS_CKPT(sbi), CP_ORPHAN_PRESENT_FLAG)) + if (!is_set_ckpt_flags(sbi, CP_ORPHAN_PRESENT_FLAG)) return 0; - sbi->por_doing = 1; - start_blk = __start_cp_addr(sbi) + 1; - orphan_blkaddr = __start_sum_addr(sbi) - 1; + if (f2fs_hw_is_readonly(sbi)) { + f2fs_info(sbi, "write access unavailable, skipping orphan cleanup"); + return 0; + } + + if (is_sbi_flag_set(sbi, SBI_IS_WRITABLE)) + f2fs_info(sbi, "orphan cleanup on readonly fs"); + + start_blk = __start_cp_addr(sbi) + 1 + __cp_payload(sbi); + orphan_blocks = __start_sum_addr(sbi) - 1 - __cp_payload(sbi); + + f2fs_ra_meta_pages(sbi, start_blk, orphan_blocks, META_CP, true); - for (i = 0; i < orphan_blkaddr; i++) { - struct page *page = get_meta_page(sbi, start_blk + i); + for (i = 0; i < orphan_blocks; i++) { + struct folio *folio; struct f2fs_orphan_block *orphan_blk; - orphan_blk = (struct f2fs_orphan_block *)page_address(page); + folio = f2fs_get_meta_folio(sbi, start_blk + i); + if (IS_ERR(folio)) { + err = PTR_ERR(folio); + goto out; + } + + orphan_blk = folio_address(folio); for (j = 0; j < le32_to_cpu(orphan_blk->entry_count); j++) { nid_t ino = le32_to_cpu(orphan_blk->ino[j]); - recover_orphan_inode(sbi, ino); + + err = recover_orphan_inode(sbi, ino); + if (err) { + f2fs_folio_put(folio, true); + goto out; + } } - f2fs_put_page(page, 1); + f2fs_folio_put(folio, true); } /* clear Orphan Flag */ - clear_ckpt_flags(F2FS_CKPT(sbi), CP_ORPHAN_PRESENT_FLAG); - sbi->por_doing = 0; - return 0; + clear_ckpt_flags(sbi, CP_ORPHAN_PRESENT_FLAG); +out: + 
set_sbi_flag(sbi, SBI_IS_RECOVERED); + + return err; } static void write_orphan_inodes(struct f2fs_sb_info *sbi, block_t start_blk) { - struct list_head *head, *this, *next; + struct list_head *head; struct f2fs_orphan_block *orphan_blk = NULL; - struct page *page = NULL; unsigned int nentries = 0; unsigned short index = 1; unsigned short orphan_blocks; + struct folio *folio = NULL; + struct ino_entry *orphan = NULL; + struct inode_management *im = &sbi->im[ORPHAN_INO]; - orphan_blocks = (unsigned short)((sbi->n_orphans + - (F2FS_ORPHANS_PER_BLOCK - 1)) / F2FS_ORPHANS_PER_BLOCK); - - mutex_lock(&sbi->orphan_inode_mutex); - head = &sbi->orphan_inode_list; + orphan_blocks = GET_ORPHAN_BLOCKS(im->ino_num); - /* loop for each orphan inode entry and write them in Jornal block */ - list_for_each_safe(this, next, head) { - struct orphan_inode_entry *orphan; + /* + * we don't need to do spin_lock(&im->ino_lock) here, since all the + * orphan inode operations are covered under f2fs_lock_op(). + * And, spin_lock should be avoided due to page operations below. 
+ */ + head = &im->ino_list; + + /* loop for each orphan inode entry and write them in journal block */ + list_for_each_entry(orphan, head, list) { + if (!folio) { + folio = f2fs_grab_meta_folio(sbi, start_blk++); + orphan_blk = folio_address(folio); + memset(orphan_blk, 0, sizeof(*orphan_blk)); + } - orphan = list_entry(this, struct orphan_inode_entry, list); + orphan_blk->ino[nentries++] = cpu_to_le32(orphan->ino); if (nentries == F2FS_ORPHANS_PER_BLOCK) { /* @@ -322,517 +811,1181 @@ static void write_orphan_inodes(struct f2fs_sb_info *sbi, block_t start_blk) orphan_blk->blk_addr = cpu_to_le16(index); orphan_blk->blk_count = cpu_to_le16(orphan_blocks); orphan_blk->entry_count = cpu_to_le32(nentries); - set_page_dirty(page); - f2fs_put_page(page, 1); + folio_mark_dirty(folio); + f2fs_folio_put(folio, true); index++; - start_blk++; nentries = 0; - page = NULL; + folio = NULL; } - if (page) - goto page_exist; + } - page = grab_meta_page(sbi, start_blk); - orphan_blk = (struct f2fs_orphan_block *)page_address(page); - memset(orphan_blk, 0, sizeof(*orphan_blk)); -page_exist: - orphan_blk->ino[nentries++] = cpu_to_le32(orphan->ino); + if (folio) { + orphan_blk->blk_addr = cpu_to_le16(index); + orphan_blk->blk_count = cpu_to_le16(orphan_blocks); + orphan_blk->entry_count = cpu_to_le32(nentries); + folio_mark_dirty(folio); + f2fs_folio_put(folio, true); } - if (!page) - goto end; +} - orphan_blk->blk_addr = cpu_to_le16(index); - orphan_blk->blk_count = cpu_to_le16(orphan_blocks); - orphan_blk->entry_count = cpu_to_le32(nentries); - set_page_dirty(page); - f2fs_put_page(page, 1); -end: - mutex_unlock(&sbi->orphan_inode_mutex); +static __u32 f2fs_checkpoint_chksum(struct f2fs_checkpoint *ckpt) +{ + unsigned int chksum_ofs = le32_to_cpu(ckpt->checksum_offset); + __u32 chksum; + + chksum = f2fs_crc32(ckpt, chksum_ofs); + if (chksum_ofs < CP_CHKSUM_OFFSET) { + chksum_ofs += sizeof(chksum); + chksum = f2fs_chksum(chksum, (__u8 *)ckpt + chksum_ofs, + F2FS_BLKSIZE - chksum_ofs); 
+ } + return chksum; } -static struct page *validate_checkpoint(struct f2fs_sb_info *sbi, - block_t cp_addr, unsigned long long *version) +static int get_checkpoint_version(struct f2fs_sb_info *sbi, block_t cp_addr, + struct f2fs_checkpoint **cp_block, struct folio **cp_folio, + unsigned long long *version) { - struct page *cp_page_1, *cp_page_2 = NULL; - unsigned long blk_size = sbi->blocksize; - struct f2fs_checkpoint *cp_block; - unsigned long long cur_version = 0, pre_version = 0; - size_t crc_offset; - __u32 crc = 0; + size_t crc_offset = 0; + __u32 crc; - /* Read the 1st cp block in this CP pack */ - cp_page_1 = get_meta_page(sbi, cp_addr); + *cp_folio = f2fs_get_meta_folio(sbi, cp_addr); + if (IS_ERR(*cp_folio)) + return PTR_ERR(*cp_folio); - /* get the version number */ - cp_block = (struct f2fs_checkpoint *)page_address(cp_page_1); - crc_offset = le32_to_cpu(cp_block->checksum_offset); - if (crc_offset >= blk_size) - goto invalid_cp1; + *cp_block = folio_address(*cp_folio); - crc = le32_to_cpu(*((__u32 *)((unsigned char *)cp_block + crc_offset))); - if (!f2fs_crc_valid(crc, cp_block, crc_offset)) - goto invalid_cp1; + crc_offset = le32_to_cpu((*cp_block)->checksum_offset); + if (crc_offset < CP_MIN_CHKSUM_OFFSET || + crc_offset > CP_CHKSUM_OFFSET) { + f2fs_folio_put(*cp_folio, true); + f2fs_warn(sbi, "invalid crc_offset: %zu", crc_offset); + return -EINVAL; + } - pre_version = le64_to_cpu(cp_block->checkpoint_ver); + crc = f2fs_checkpoint_chksum(*cp_block); + if (crc != cur_cp_crc(*cp_block)) { + f2fs_folio_put(*cp_folio, true); + f2fs_warn(sbi, "invalid crc value"); + return -EINVAL; + } - /* Read the 2nd cp block in this CP pack */ - cp_addr += le32_to_cpu(cp_block->cp_pack_total_block_count) - 1; - cp_page_2 = get_meta_page(sbi, cp_addr); + *version = cur_cp_version(*cp_block); + return 0; +} - cp_block = (struct f2fs_checkpoint *)page_address(cp_page_2); - crc_offset = le32_to_cpu(cp_block->checksum_offset); - if (crc_offset >= blk_size) - goto 
invalid_cp2; +static struct folio *validate_checkpoint(struct f2fs_sb_info *sbi, + block_t cp_addr, unsigned long long *version) +{ + struct folio *cp_folio_1 = NULL, *cp_folio_2 = NULL; + struct f2fs_checkpoint *cp_block = NULL; + unsigned long long cur_version = 0, pre_version = 0; + unsigned int cp_blocks; + int err; + + err = get_checkpoint_version(sbi, cp_addr, &cp_block, + &cp_folio_1, version); + if (err) + return NULL; - crc = le32_to_cpu(*((__u32 *)((unsigned char *)cp_block + crc_offset))); - if (!f2fs_crc_valid(crc, cp_block, crc_offset)) - goto invalid_cp2; + cp_blocks = le32_to_cpu(cp_block->cp_pack_total_block_count); + + if (cp_blocks > BLKS_PER_SEG(sbi) || cp_blocks <= F2FS_CP_PACKS) { + f2fs_warn(sbi, "invalid cp_pack_total_block_count:%u", + le32_to_cpu(cp_block->cp_pack_total_block_count)); + goto invalid_cp; + } + pre_version = *version; - cur_version = le64_to_cpu(cp_block->checkpoint_ver); + cp_addr += cp_blocks - 1; + err = get_checkpoint_version(sbi, cp_addr, &cp_block, + &cp_folio_2, version); + if (err) + goto invalid_cp; + cur_version = *version; if (cur_version == pre_version) { *version = cur_version; - f2fs_put_page(cp_page_2, 1); - return cp_page_1; + f2fs_folio_put(cp_folio_2, true); + return cp_folio_1; } -invalid_cp2: - f2fs_put_page(cp_page_2, 1); -invalid_cp1: - f2fs_put_page(cp_page_1, 1); + f2fs_folio_put(cp_folio_2, true); +invalid_cp: + f2fs_folio_put(cp_folio_1, true); return NULL; } -int get_valid_checkpoint(struct f2fs_sb_info *sbi) +int f2fs_get_valid_checkpoint(struct f2fs_sb_info *sbi) { struct f2fs_checkpoint *cp_block; struct f2fs_super_block *fsb = sbi->raw_super; - struct page *cp1, *cp2, *cur_page; + struct folio *cp1, *cp2, *cur_folio; unsigned long blk_size = sbi->blocksize; unsigned long long cp1_version = 0, cp2_version = 0; unsigned long long cp_start_blk_no; + unsigned int cp_blks = 1 + __cp_payload(sbi); + block_t cp_blk_no; + int i; + int err; - sbi->ckpt = kzalloc(blk_size, GFP_KERNEL); + sbi->ckpt = 
f2fs_kvzalloc(sbi, array_size(blk_size, cp_blks), + GFP_KERNEL); if (!sbi->ckpt) return -ENOMEM; /* * Finding out valid cp block involves read both - * sets( cp pack1 and cp pack 2) + * sets( cp pack 1 and cp pack 2) */ cp_start_blk_no = le32_to_cpu(fsb->cp_blkaddr); cp1 = validate_checkpoint(sbi, cp_start_blk_no, &cp1_version); /* The second checkpoint pack should start at the next segment */ - cp_start_blk_no += 1 << le32_to_cpu(fsb->log_blocks_per_seg); + cp_start_blk_no += ((unsigned long long)1) << + le32_to_cpu(fsb->log_blocks_per_seg); cp2 = validate_checkpoint(sbi, cp_start_blk_no, &cp2_version); if (cp1 && cp2) { if (ver_after(cp2_version, cp1_version)) - cur_page = cp2; + cur_folio = cp2; else - cur_page = cp1; + cur_folio = cp1; } else if (cp1) { - cur_page = cp1; + cur_folio = cp1; } else if (cp2) { - cur_page = cp2; + cur_folio = cp2; } else { + err = -EFSCORRUPTED; goto fail_no_cp; } - cp_block = (struct f2fs_checkpoint *)page_address(cur_page); + cp_block = folio_address(cur_folio); memcpy(sbi->ckpt, cp_block, blk_size); - f2fs_put_page(cp1, 1); - f2fs_put_page(cp2, 1); - return 0; + if (cur_folio == cp1) + sbi->cur_cp_pack = 1; + else + sbi->cur_cp_pack = 2; -fail_no_cp: - kfree(sbi->ckpt); - return -EINVAL; -} + /* Sanity checking of checkpoint */ + if (f2fs_sanity_check_ckpt(sbi)) { + err = -EFSCORRUPTED; + goto free_fail_no_cp; + } -static int __add_dirty_inode(struct inode *inode, struct dir_inode_entry *new) -{ - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); - struct list_head *head = &sbi->dir_inode_list; - struct list_head *this; + if (cp_blks <= 1) + goto done; + + cp_blk_no = le32_to_cpu(fsb->cp_blkaddr); + if (cur_folio == cp2) + cp_blk_no += BIT(le32_to_cpu(fsb->log_blocks_per_seg)); - list_for_each(this, head) { - struct dir_inode_entry *entry; - entry = list_entry(this, struct dir_inode_entry, list); - if (entry->inode == inode) - return -EEXIST; + for (i = 1; i < cp_blks; i++) { + void *sit_bitmap_ptr; + unsigned char *ckpt = 
(unsigned char *)sbi->ckpt; + + cur_folio = f2fs_get_meta_folio(sbi, cp_blk_no + i); + if (IS_ERR(cur_folio)) { + err = PTR_ERR(cur_folio); + goto free_fail_no_cp; + } + sit_bitmap_ptr = folio_address(cur_folio); + memcpy(ckpt + i * blk_size, sit_bitmap_ptr, blk_size); + f2fs_folio_put(cur_folio, true); } - list_add_tail(&new->list, head); -#ifdef CONFIG_F2FS_STAT_FS - sbi->n_dirty_dirs++; -#endif +done: + f2fs_folio_put(cp1, true); + f2fs_folio_put(cp2, true); return 0; + +free_fail_no_cp: + f2fs_folio_put(cp1, true); + f2fs_folio_put(cp2, true); +fail_no_cp: + kvfree(sbi->ckpt); + return err; } -void set_dirty_dir_page(struct inode *inode, struct page *page) +static void __add_dirty_inode(struct inode *inode, enum inode_type type) { - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); - struct dir_inode_entry *new; + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + int flag = (type == DIR_INODE) ? FI_DIRTY_DIR : FI_DIRTY_FILE; - if (!S_ISDIR(inode->i_mode)) + if (is_inode_flag_set(inode, flag)) return; -retry: - new = kmem_cache_alloc(inode_entry_slab, GFP_NOFS); - if (!new) { - cond_resched(); - goto retry; - } - new->inode = inode; - INIT_LIST_HEAD(&new->list); - - spin_lock(&sbi->dir_inode_lock); - if (__add_dirty_inode(inode, new)) - kmem_cache_free(inode_entry_slab, new); - inc_page_count(sbi, F2FS_DIRTY_DENTS); - inode_inc_dirty_dents(inode); - SetPagePrivate(page); - spin_unlock(&sbi->dir_inode_lock); + set_inode_flag(inode, flag); + list_add_tail(&F2FS_I(inode)->dirty_list, &sbi->inode_list[type]); + stat_inc_dirty_inode(sbi, type); } -void add_dirty_dir_inode(struct inode *inode) +static void __remove_dirty_inode(struct inode *inode, enum inode_type type) { - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); - struct dir_inode_entry *new; -retry: - new = kmem_cache_alloc(inode_entry_slab, GFP_NOFS); - if (!new) { - cond_resched(); - goto retry; - } - new->inode = inode; - INIT_LIST_HEAD(&new->list); + int flag = (type == DIR_INODE) ? 
FI_DIRTY_DIR : FI_DIRTY_FILE; - spin_lock(&sbi->dir_inode_lock); - if (__add_dirty_inode(inode, new)) - kmem_cache_free(inode_entry_slab, new); - spin_unlock(&sbi->dir_inode_lock); + if (get_dirty_pages(inode) || !is_inode_flag_set(inode, flag)) + return; + + list_del_init(&F2FS_I(inode)->dirty_list); + clear_inode_flag(inode, flag); + stat_dec_dirty_inode(F2FS_I_SB(inode), type); } -void remove_dirty_dir_inode(struct inode *inode) +void f2fs_update_dirty_folio(struct inode *inode, struct folio *folio) { - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); - struct list_head *head = &sbi->dir_inode_list; - struct list_head *this; - - if (!S_ISDIR(inode->i_mode)) - return; + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + enum inode_type type = S_ISDIR(inode->i_mode) ? DIR_INODE : FILE_INODE; - spin_lock(&sbi->dir_inode_lock); - if (atomic_read(&F2FS_I(inode)->dirty_dents)) { - spin_unlock(&sbi->dir_inode_lock); + if (!S_ISDIR(inode->i_mode) && !S_ISREG(inode->i_mode) && + !S_ISLNK(inode->i_mode)) return; - } - list_for_each(this, head) { - struct dir_inode_entry *entry; - entry = list_entry(this, struct dir_inode_entry, list); - if (entry->inode == inode) { - list_del(&entry->list); - kmem_cache_free(inode_entry_slab, entry); -#ifdef CONFIG_F2FS_STAT_FS - sbi->n_dirty_dirs--; -#endif - break; - } - } - spin_unlock(&sbi->dir_inode_lock); + spin_lock(&sbi->inode_lock[type]); + if (type != FILE_INODE || test_opt(sbi, DATA_FLUSH)) + __add_dirty_inode(inode, type); + inode_inc_dirty_pages(inode); + spin_unlock(&sbi->inode_lock[type]); - /* Only from the recovery routine */ - if (is_inode_flag_set(F2FS_I(inode), FI_DELAY_IPUT)) { - clear_inode_flag(F2FS_I(inode), FI_DELAY_IPUT); - iput(inode); - } + folio_set_f2fs_reference(folio); } -struct inode *check_dirty_dir_inode(struct f2fs_sb_info *sbi, nid_t ino) +void f2fs_remove_dirty_inode(struct inode *inode) { - struct list_head *head = &sbi->dir_inode_list; - struct list_head *this; - struct inode *inode = NULL; + struct 
f2fs_sb_info *sbi = F2FS_I_SB(inode); + enum inode_type type = S_ISDIR(inode->i_mode) ? DIR_INODE : FILE_INODE; - spin_lock(&sbi->dir_inode_lock); - list_for_each(this, head) { - struct dir_inode_entry *entry; - entry = list_entry(this, struct dir_inode_entry, list); - if (entry->inode->i_ino == ino) { - inode = entry->inode; - break; - } - } - spin_unlock(&sbi->dir_inode_lock); - return inode; + if (!S_ISDIR(inode->i_mode) && !S_ISREG(inode->i_mode) && + !S_ISLNK(inode->i_mode)) + return; + + if (type == FILE_INODE && !test_opt(sbi, DATA_FLUSH)) + return; + + spin_lock(&sbi->inode_lock[type]); + __remove_dirty_inode(inode, type); + spin_unlock(&sbi->inode_lock[type]); } -void sync_dirty_dir_inodes(struct f2fs_sb_info *sbi) +int f2fs_sync_dirty_inodes(struct f2fs_sb_info *sbi, enum inode_type type, + bool from_cp) { - struct list_head *head = &sbi->dir_inode_list; - struct dir_inode_entry *entry; + struct list_head *head; struct inode *inode; + struct f2fs_inode_info *fi; + bool is_dir = (type == DIR_INODE); + unsigned long ino = 0; + + trace_f2fs_sync_dirty_inodes_enter(sbi->sb, is_dir, + get_pages(sbi, is_dir ? + F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA)); retry: - spin_lock(&sbi->dir_inode_lock); + if (unlikely(f2fs_cp_error(sbi))) { + trace_f2fs_sync_dirty_inodes_exit(sbi->sb, is_dir, + get_pages(sbi, is_dir ? + F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA)); + return -EIO; + } + + spin_lock(&sbi->inode_lock[type]); + + head = &sbi->inode_list[type]; if (list_empty(head)) { - spin_unlock(&sbi->dir_inode_lock); - return; + spin_unlock(&sbi->inode_lock[type]); + trace_f2fs_sync_dirty_inodes_exit(sbi->sb, is_dir, + get_pages(sbi, is_dir ? 
+ F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA)); + return 0; } - entry = list_entry(head->next, struct dir_inode_entry, list); - inode = igrab(entry->inode); - spin_unlock(&sbi->dir_inode_lock); + fi = list_first_entry(head, struct f2fs_inode_info, dirty_list); + inode = igrab(&fi->vfs_inode); + spin_unlock(&sbi->inode_lock[type]); if (inode) { - filemap_flush(inode->i_mapping); + unsigned long cur_ino = inode->i_ino; + + if (from_cp) + F2FS_I(inode)->cp_task = current; + F2FS_I(inode)->wb_task = current; + + filemap_fdatawrite(inode->i_mapping); + + F2FS_I(inode)->wb_task = NULL; + if (from_cp) + F2FS_I(inode)->cp_task = NULL; + iput(inode); + /* We need to give cpu to another writers. */ + if (ino == cur_ino) + cond_resched(); + else + ino = cur_ino; } else { /* * We should submit bio, since it exists several - * wribacking dentry pages in the freeing inode. + * writebacking dentry pages in the freeing inode. */ - f2fs_submit_bio(sbi, DATA, true); + f2fs_submit_merged_write(sbi, DATA); + cond_resched(); } goto retry; } +static int f2fs_sync_inode_meta(struct f2fs_sb_info *sbi) +{ + struct list_head *head = &sbi->inode_list[DIRTY_META]; + struct inode *inode; + struct f2fs_inode_info *fi; + s64 total = get_pages(sbi, F2FS_DIRTY_IMETA); + + while (total--) { + if (unlikely(f2fs_cp_error(sbi))) + return -EIO; + + spin_lock(&sbi->inode_lock[DIRTY_META]); + if (list_empty(head)) { + spin_unlock(&sbi->inode_lock[DIRTY_META]); + return 0; + } + fi = list_first_entry(head, struct f2fs_inode_info, + gdirty_list); + inode = igrab(&fi->vfs_inode); + spin_unlock(&sbi->inode_lock[DIRTY_META]); + if (inode) { + sync_inode_metadata(inode, 0); + + /* it's on eviction */ + if (is_inode_flag_set(inode, FI_DIRTY_INODE)) + f2fs_update_inode_page(inode); + iput(inode); + } + } + return 0; +} + +static void __prepare_cp_block(struct f2fs_sb_info *sbi) +{ + struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); + struct f2fs_nm_info *nm_i = NM_I(sbi); + nid_t last_nid = nm_i->next_scan_nid; + + 
next_free_nid(sbi, &last_nid); + ckpt->valid_block_count = cpu_to_le64(valid_user_blocks(sbi)); + ckpt->valid_node_count = cpu_to_le32(valid_node_count(sbi)); + ckpt->valid_inode_count = cpu_to_le32(valid_inode_count(sbi)); + ckpt->next_free_nid = cpu_to_le32(last_nid); + + /* update user_block_counts */ + sbi->last_valid_block_count = sbi->total_valid_block_count; + percpu_counter_set(&sbi->alloc_valid_block_count, 0); + percpu_counter_set(&sbi->rf_node_block_count, 0); +} + +static bool __need_flush_quota(struct f2fs_sb_info *sbi) +{ + bool ret = false; + + if (!is_journalled_quota(sbi)) + return false; + + if (!f2fs_down_write_trylock(&sbi->quota_sem)) + return true; + if (is_sbi_flag_set(sbi, SBI_QUOTA_SKIP_FLUSH)) { + ret = false; + } else if (is_sbi_flag_set(sbi, SBI_QUOTA_NEED_REPAIR)) { + ret = false; + } else if (is_sbi_flag_set(sbi, SBI_QUOTA_NEED_FLUSH)) { + clear_sbi_flag(sbi, SBI_QUOTA_NEED_FLUSH); + ret = true; + } else if (get_pages(sbi, F2FS_DIRTY_QDATA)) { + ret = true; + } + f2fs_up_write(&sbi->quota_sem); + return ret; +} + /* * Freeze all the FS-operations for checkpoint. */ -static void block_operations(struct f2fs_sb_info *sbi) +static int block_operations(struct f2fs_sb_info *sbi) { struct writeback_control wbc = { .sync_mode = WB_SYNC_ALL, .nr_to_write = LONG_MAX, - .for_reclaim = 0, }; - struct blk_plug plug; + int err = 0, cnt = 0; - blk_start_plug(&plug); + /* + * Let's flush inline_data in dirty node pages. 
+ */ + f2fs_flush_inline_data(sbi); -retry_flush_dents: - mutex_lock_all(sbi); +retry_flush_quotas: + f2fs_lock_all(sbi); + if (__need_flush_quota(sbi)) { + bool need_lock = sbi->umount_lock_holder != current; + if (++cnt > DEFAULT_RETRY_QUOTA_FLUSH_COUNT) { + set_sbi_flag(sbi, SBI_QUOTA_SKIP_FLUSH); + set_sbi_flag(sbi, SBI_QUOTA_NEED_FLUSH); + goto retry_flush_dents; + } + f2fs_unlock_all(sbi); + + /* don't grab s_umount lock during mount/umount/remount/freeze/quotactl */ + if (!need_lock) { + f2fs_do_quota_sync(sbi->sb, -1); + } else if (down_read_trylock(&sbi->sb->s_umount)) { + f2fs_do_quota_sync(sbi->sb, -1); + up_read(&sbi->sb->s_umount); + } + cond_resched(); + goto retry_flush_quotas; + } + +retry_flush_dents: /* write all the dirty dentry pages */ if (get_pages(sbi, F2FS_DIRTY_DENTS)) { - mutex_unlock_all(sbi); - sync_dirty_dir_inodes(sbi); - goto retry_flush_dents; + f2fs_unlock_all(sbi); + err = f2fs_sync_dirty_inodes(sbi, DIR_INODE, true); + if (err) + return err; + cond_resched(); + goto retry_flush_quotas; } /* - * POR: we should ensure that there is no dirty node pages - * until finishing nat/sit flush. + * POR: we should ensure that there are no dirty node pages + * until finishing nat/sit flush. inode->i_blocks can be updated. 
*/ + f2fs_down_write(&sbi->node_change); + + if (get_pages(sbi, F2FS_DIRTY_IMETA)) { + f2fs_up_write(&sbi->node_change); + f2fs_unlock_all(sbi); + err = f2fs_sync_inode_meta(sbi); + if (err) + return err; + cond_resched(); + goto retry_flush_quotas; + } + retry_flush_nodes: - mutex_lock(&sbi->node_write); + f2fs_down_write(&sbi->node_write); if (get_pages(sbi, F2FS_DIRTY_NODES)) { - mutex_unlock(&sbi->node_write); - sync_node_pages(sbi, 0, &wbc); + f2fs_up_write(&sbi->node_write); + atomic_inc(&sbi->wb_sync_req[NODE]); + err = f2fs_sync_node_pages(sbi, &wbc, false, FS_CP_NODE_IO); + atomic_dec(&sbi->wb_sync_req[NODE]); + if (err) { + f2fs_up_write(&sbi->node_change); + f2fs_unlock_all(sbi); + return err; + } + cond_resched(); goto retry_flush_nodes; } - blk_finish_plug(&plug); + + /* + * sbi->node_change is used only for AIO write_begin path which produces + * dirty node blocks and some checkpoint values by block allocation. + */ + __prepare_cp_block(sbi); + f2fs_up_write(&sbi->node_change); + return err; } static void unblock_operations(struct f2fs_sb_info *sbi) { - mutex_unlock(&sbi->node_write); - mutex_unlock_all(sbi); + f2fs_up_write(&sbi->node_write); + f2fs_unlock_all(sbi); +} + +void f2fs_wait_on_all_pages(struct f2fs_sb_info *sbi, int type) +{ + DEFINE_WAIT(wait); + + for (;;) { + if (!get_pages(sbi, type)) + break; + + if (unlikely(f2fs_cp_error(sbi) && + !is_sbi_flag_set(sbi, SBI_IS_CLOSE))) + break; + + if (type == F2FS_DIRTY_META) + f2fs_sync_meta_pages(sbi, META, LONG_MAX, + FS_CP_META_IO); + else if (type == F2FS_WB_CP_DATA) + f2fs_submit_merged_write(sbi, DATA); + + prepare_to_wait(&sbi->cp_wait, &wait, TASK_UNINTERRUPTIBLE); + io_schedule_timeout(DEFAULT_SCHEDULE_TIMEOUT); + } + finish_wait(&sbi->cp_wait, &wait); } -static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount) +static void update_ckpt_flags(struct f2fs_sb_info *sbi, struct cp_control *cpc) { + unsigned long orphan_num = sbi->im[ORPHAN_INO].ino_num; struct f2fs_checkpoint 
*ckpt = F2FS_CKPT(sbi); - nid_t last_nid = 0; + unsigned long flags; + + spin_lock_irqsave(&sbi->cp_lock, flags); + + if ((cpc->reason & CP_UMOUNT) && + le32_to_cpu(ckpt->cp_pack_total_block_count) > + sbi->blocks_per_seg - NM_I(sbi)->nat_bits_blocks) + disable_nat_bits(sbi, false); + + if (cpc->reason & CP_TRIMMED) + __set_ckpt_flags(ckpt, CP_TRIMMED_FLAG); + else + __clear_ckpt_flags(ckpt, CP_TRIMMED_FLAG); + + if (cpc->reason & CP_UMOUNT) + __set_ckpt_flags(ckpt, CP_UMOUNT_FLAG); + else + __clear_ckpt_flags(ckpt, CP_UMOUNT_FLAG); + + if (cpc->reason & CP_FASTBOOT) + __set_ckpt_flags(ckpt, CP_FASTBOOT_FLAG); + else + __clear_ckpt_flags(ckpt, CP_FASTBOOT_FLAG); + + if (orphan_num) + __set_ckpt_flags(ckpt, CP_ORPHAN_PRESENT_FLAG); + else + __clear_ckpt_flags(ckpt, CP_ORPHAN_PRESENT_FLAG); + + if (is_sbi_flag_set(sbi, SBI_NEED_FSCK)) + __set_ckpt_flags(ckpt, CP_FSCK_FLAG); + + if (is_sbi_flag_set(sbi, SBI_IS_RESIZEFS)) + __set_ckpt_flags(ckpt, CP_RESIZEFS_FLAG); + else + __clear_ckpt_flags(ckpt, CP_RESIZEFS_FLAG); + + if (is_sbi_flag_set(sbi, SBI_CP_DISABLED)) + __set_ckpt_flags(ckpt, CP_DISABLED_FLAG); + else + __clear_ckpt_flags(ckpt, CP_DISABLED_FLAG); + + if (is_sbi_flag_set(sbi, SBI_CP_DISABLED_QUICK)) + __set_ckpt_flags(ckpt, CP_DISABLED_QUICK_FLAG); + else + __clear_ckpt_flags(ckpt, CP_DISABLED_QUICK_FLAG); + + if (is_sbi_flag_set(sbi, SBI_QUOTA_SKIP_FLUSH)) + __set_ckpt_flags(ckpt, CP_QUOTA_NEED_FSCK_FLAG); + else + __clear_ckpt_flags(ckpt, CP_QUOTA_NEED_FSCK_FLAG); + + if (is_sbi_flag_set(sbi, SBI_QUOTA_NEED_REPAIR)) + __set_ckpt_flags(ckpt, CP_QUOTA_NEED_FSCK_FLAG); + + /* set this flag to activate crc|cp_ver for recovery */ + __set_ckpt_flags(ckpt, CP_CRC_RECOVERY_FLAG); + __clear_ckpt_flags(ckpt, CP_NOCRC_RECOVERY_FLAG); + + spin_unlock_irqrestore(&sbi->cp_lock, flags); +} + +static void commit_checkpoint(struct f2fs_sb_info *sbi, + void *src, block_t blk_addr) +{ + struct writeback_control wbc = {}; + + /* + * filemap_get_folios_tag and folio_lock again 
will take + * some extra time. Therefore, f2fs_update_meta_pages and + * f2fs_sync_meta_pages are combined in this function. + */ + struct folio *folio = f2fs_grab_meta_folio(sbi, blk_addr); + + memcpy(folio_address(folio), src, PAGE_SIZE); + + folio_mark_dirty(folio); + if (unlikely(!folio_clear_dirty_for_io(folio))) + f2fs_bug_on(sbi, 1); + + /* writeout cp pack 2 page */ + if (unlikely(!__f2fs_write_meta_folio(folio, &wbc, FS_CP_META_IO))) { + if (f2fs_cp_error(sbi)) { + f2fs_folio_put(folio, true); + return; + } + f2fs_bug_on(sbi, true); + } + + f2fs_folio_put(folio, false); + + /* submit checkpoint (with barrier if NOBARRIER is not set) */ + f2fs_submit_merged_write(sbi, META_FLUSH); +} + +static inline u64 get_sectors_written(struct block_device *bdev) +{ + return (u64)part_stat_read(bdev, sectors[STAT_WRITE]); +} + +u64 f2fs_get_sectors_written(struct f2fs_sb_info *sbi) +{ + if (f2fs_is_multi_device(sbi)) { + u64 sectors = 0; + int i; + + for (i = 0; i < sbi->s_ndevs; i++) + sectors += get_sectors_written(FDEV(i).bdev); + + return sectors; + } + + return get_sectors_written(sbi->sb->s_bdev); +} + +static inline void stat_cp_time(struct cp_control *cpc, enum cp_time type) +{ + cpc->stats.times[type] = ktime_get(); +} + +static inline void check_cp_time(struct f2fs_sb_info *sbi, struct cp_control *cpc) +{ + unsigned long long sb_diff, cur_diff; + enum cp_time ct; + + sb_diff = (u64)ktime_ms_delta(sbi->cp_stats.times[CP_TIME_END], + sbi->cp_stats.times[CP_TIME_START]); + cur_diff = (u64)ktime_ms_delta(cpc->stats.times[CP_TIME_END], + cpc->stats.times[CP_TIME_START]); + + if (cur_diff > sb_diff) { + sbi->cp_stats = cpc->stats; + if (cur_diff < CP_LONG_LATENCY_THRESHOLD) + return; + + f2fs_warn(sbi, "checkpoint was blocked for %llu ms", cur_diff); + for (ct = CP_TIME_START; ct < CP_TIME_MAX - 1; ct++) + f2fs_warn(sbi, "Step#%d: %llu ms", ct, + (u64)ktime_ms_delta(cpc->stats.times[ct + 1], + cpc->stats.times[ct])); + } +} + +static int do_checkpoint(struct 
f2fs_sb_info *sbi, struct cp_control *cpc) +{ + struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); + struct f2fs_nm_info *nm_i = NM_I(sbi); + unsigned long orphan_num = sbi->im[ORPHAN_INO].ino_num, flags; block_t start_blk; - struct page *cp_page; unsigned int data_sum_blocks, orphan_blocks; __u32 crc32 = 0; - void *kaddr; int i; + int cp_payload_blks = __cp_payload(sbi); + struct curseg_info *seg_i = CURSEG_I(sbi, CURSEG_HOT_NODE); + u64 kbytes_written; + int err; /* Flush all the NAT/SIT pages */ - while (get_pages(sbi, F2FS_DIRTY_META)) - sync_meta_pages(sbi, META, LONG_MAX); + f2fs_sync_meta_pages(sbi, META, LONG_MAX, FS_CP_META_IO); - next_free_nid(sbi, &last_nid); + stat_cp_time(cpc, CP_TIME_SYNC_META); - /* - * modify checkpoint - * version number is already updated - */ - ckpt->elapsed_time = cpu_to_le64(get_mtime(sbi)); - ckpt->valid_block_count = cpu_to_le64(valid_user_blocks(sbi)); + /* start to update checkpoint, cp ver is already updated previously */ + ckpt->elapsed_time = cpu_to_le64(get_mtime(sbi, true)); ckpt->free_segment_count = cpu_to_le32(free_segments(sbi)); - for (i = 0; i < 3; i++) { - ckpt->cur_node_segno[i] = - cpu_to_le32(curseg_segno(sbi, i + CURSEG_HOT_NODE)); - ckpt->cur_node_blkoff[i] = - cpu_to_le16(curseg_blkoff(sbi, i + CURSEG_HOT_NODE)); - ckpt->alloc_type[i + CURSEG_HOT_NODE] = - curseg_alloc_type(sbi, i + CURSEG_HOT_NODE); - } - for (i = 0; i < 3; i++) { - ckpt->cur_data_segno[i] = - cpu_to_le32(curseg_segno(sbi, i + CURSEG_HOT_DATA)); - ckpt->cur_data_blkoff[i] = - cpu_to_le16(curseg_blkoff(sbi, i + CURSEG_HOT_DATA)); - ckpt->alloc_type[i + CURSEG_HOT_DATA] = - curseg_alloc_type(sbi, i + CURSEG_HOT_DATA); + for (i = 0; i < NR_CURSEG_NODE_TYPE; i++) { + struct curseg_info *curseg = CURSEG_I(sbi, i + CURSEG_HOT_NODE); + + ckpt->cur_node_segno[i] = cpu_to_le32(curseg->segno); + ckpt->cur_node_blkoff[i] = cpu_to_le16(curseg->next_blkoff); + ckpt->alloc_type[i + CURSEG_HOT_NODE] = curseg->alloc_type; } + for (i = 0; i < 
NR_CURSEG_DATA_TYPE; i++) { + struct curseg_info *curseg = CURSEG_I(sbi, i + CURSEG_HOT_DATA); - ckpt->valid_node_count = cpu_to_le32(valid_node_count(sbi)); - ckpt->valid_inode_count = cpu_to_le32(valid_inode_count(sbi)); - ckpt->next_free_nid = cpu_to_le32(last_nid); + ckpt->cur_data_segno[i] = cpu_to_le32(curseg->segno); + ckpt->cur_data_blkoff[i] = cpu_to_le16(curseg->next_blkoff); + ckpt->alloc_type[i + CURSEG_HOT_DATA] = curseg->alloc_type; + } - /* 2 cp + n data seg summary + orphan inode blocks */ - data_sum_blocks = npages_for_summary_flush(sbi); - if (data_sum_blocks < 3) - set_ckpt_flags(ckpt, CP_COMPACT_SUM_FLAG); + /* 2 cp + n data seg summary + orphan inode blocks */ + data_sum_blocks = f2fs_npages_for_summary_flush(sbi, false); + spin_lock_irqsave(&sbi->cp_lock, flags); + if (data_sum_blocks < NR_CURSEG_DATA_TYPE) + __set_ckpt_flags(ckpt, CP_COMPACT_SUM_FLAG); else - clear_ckpt_flags(ckpt, CP_COMPACT_SUM_FLAG); - - orphan_blocks = (sbi->n_orphans + F2FS_ORPHANS_PER_BLOCK - 1) - / F2FS_ORPHANS_PER_BLOCK; - ckpt->cp_pack_start_sum = cpu_to_le32(1 + orphan_blocks); + __clear_ckpt_flags(ckpt, CP_COMPACT_SUM_FLAG); + spin_unlock_irqrestore(&sbi->cp_lock, flags); - if (is_umount) { - set_ckpt_flags(ckpt, CP_UMOUNT_FLAG); - ckpt->cp_pack_total_block_count = cpu_to_le32(2 + - data_sum_blocks + orphan_blocks + NR_CURSEG_NODE_TYPE); - } else { - clear_ckpt_flags(ckpt, CP_UMOUNT_FLAG); - ckpt->cp_pack_total_block_count = cpu_to_le32(2 + - data_sum_blocks + orphan_blocks); - } + orphan_blocks = GET_ORPHAN_BLOCKS(orphan_num); + ckpt->cp_pack_start_sum = cpu_to_le32(1 + cp_payload_blks + + orphan_blocks); - if (sbi->n_orphans) - set_ckpt_flags(ckpt, CP_ORPHAN_PRESENT_FLAG); + if (__remain_node_summaries(cpc->reason)) + ckpt->cp_pack_total_block_count = cpu_to_le32(F2FS_CP_PACKS + + cp_payload_blks + data_sum_blocks + + orphan_blocks + NR_CURSEG_NODE_TYPE); else - clear_ckpt_flags(ckpt, CP_ORPHAN_PRESENT_FLAG); + ckpt->cp_pack_total_block_count = 
cpu_to_le32(F2FS_CP_PACKS + + cp_payload_blks + data_sum_blocks + + orphan_blocks); + + /* update ckpt flag for checkpoint */ + update_ckpt_flags(sbi, cpc); /* update SIT/NAT bitmap */ get_sit_bitmap(sbi, __bitmap_ptr(sbi, SIT_BITMAP)); get_nat_bitmap(sbi, __bitmap_ptr(sbi, NAT_BITMAP)); - crc32 = f2fs_crc32(ckpt, le32_to_cpu(ckpt->checksum_offset)); + crc32 = f2fs_checkpoint_chksum(ckpt); *((__le32 *)((unsigned char *)ckpt + le32_to_cpu(ckpt->checksum_offset))) = cpu_to_le32(crc32); - start_blk = __start_cp_addr(sbi); + start_blk = __start_cp_next_addr(sbi); + + /* write nat bits */ + if (enabled_nat_bits(sbi, cpc)) { + __u64 cp_ver = cur_cp_version(ckpt); + block_t blk; + + cp_ver |= ((__u64)crc32 << 32); + *(__le64 *)nm_i->nat_bits = cpu_to_le64(cp_ver); + + blk = start_blk + BLKS_PER_SEG(sbi) - nm_i->nat_bits_blocks; + for (i = 0; i < nm_i->nat_bits_blocks; i++) + f2fs_update_meta_page(sbi, nm_i->nat_bits + + F2FS_BLK_TO_BYTES(i), blk + i); + } /* write out checkpoint buffer at block 0 */ - cp_page = grab_meta_page(sbi, start_blk++); - kaddr = page_address(cp_page); - memcpy(kaddr, ckpt, (1 << sbi->log_blocksize)); - set_page_dirty(cp_page); - f2fs_put_page(cp_page, 1); + f2fs_update_meta_page(sbi, ckpt, start_blk++); - if (sbi->n_orphans) { + for (i = 1; i < 1 + cp_payload_blks; i++) + f2fs_update_meta_page(sbi, (char *)ckpt + i * F2FS_BLKSIZE, + start_blk++); + + if (orphan_num) { write_orphan_inodes(sbi, start_blk); start_blk += orphan_blocks; } - write_data_summaries(sbi, start_blk); + f2fs_write_data_summaries(sbi, start_blk); start_blk += data_sum_blocks; - if (is_umount) { - write_node_summaries(sbi, start_blk); + + /* Record write statistics in the hot node summary */ + kbytes_written = sbi->kbytes_written; + kbytes_written += (f2fs_get_sectors_written(sbi) - + sbi->sectors_written_start) >> 1; + seg_i->journal->info.kbytes_written = cpu_to_le64(kbytes_written); + + if (__remain_node_summaries(cpc->reason)) { + f2fs_write_node_summaries(sbi, start_blk); 
start_blk += NR_CURSEG_NODE_TYPE; } - /* writeout checkpoint block */ - cp_page = grab_meta_page(sbi, start_blk); - kaddr = page_address(cp_page); - memcpy(kaddr, ckpt, (1 << sbi->log_blocksize)); - set_page_dirty(cp_page); - f2fs_put_page(cp_page, 1); + /* Here, we have one bio having CP pack except cp pack 2 page */ + f2fs_sync_meta_pages(sbi, META, LONG_MAX, FS_CP_META_IO); + stat_cp_time(cpc, CP_TIME_SYNC_CP_META); - /* wait for previous submitted node/meta pages writeback */ - while (get_pages(sbi, F2FS_WRITEBACK)) - congestion_wait(BLK_RW_ASYNC, HZ / 50); + /* Wait for all dirty meta pages to be submitted for IO */ + f2fs_wait_on_all_pages(sbi, F2FS_DIRTY_META); + stat_cp_time(cpc, CP_TIME_WAIT_DIRTY_META); - filemap_fdatawait_range(sbi->node_inode->i_mapping, 0, LONG_MAX); - filemap_fdatawait_range(sbi->meta_inode->i_mapping, 0, LONG_MAX); + /* wait for previous submitted meta pages writeback */ + f2fs_wait_on_all_pages(sbi, F2FS_WB_CP_DATA); + stat_cp_time(cpc, CP_TIME_WAIT_CP_DATA); - /* update user_block_counts */ - sbi->last_valid_block_count = sbi->total_valid_block_count; - sbi->alloc_valid_block_count = 0; + /* flush all device cache */ + err = f2fs_flush_device_cache(sbi); + if (err) + return err; + stat_cp_time(cpc, CP_TIME_FLUSH_DEVICE); - /* Here, we only have one bio having CP pack */ - sync_meta_pages(sbi, META_FLUSH, LONG_MAX); + /* barrier and flush checkpoint cp pack 2 page if it can */ + commit_checkpoint(sbi, ckpt, start_blk); + f2fs_wait_on_all_pages(sbi, F2FS_WB_CP_DATA); + stat_cp_time(cpc, CP_TIME_WAIT_LAST_CP); - if (!is_set_ckpt_flags(ckpt, CP_ERROR_FLAG)) { - clear_prefree_segments(sbi); - F2FS_RESET_SB_DIRT(sbi); - } + /* + * invalidate intermediate page cache borrowed from meta inode which are + * used for migration of encrypted, verity or compressed inode's blocks. 
+ */ + if (f2fs_sb_has_encrypt(sbi) || f2fs_sb_has_verity(sbi) || + f2fs_sb_has_compression(sbi)) + f2fs_bug_on(sbi, + invalidate_inode_pages2_range(META_MAPPING(sbi), + MAIN_BLKADDR(sbi), MAX_BLKADDR(sbi) - 1)); + + f2fs_release_ino_entry(sbi, false); + + f2fs_reset_fsync_node_info(sbi); + + clear_sbi_flag(sbi, SBI_IS_DIRTY); + clear_sbi_flag(sbi, SBI_NEED_CP); + clear_sbi_flag(sbi, SBI_QUOTA_SKIP_FLUSH); + + spin_lock(&sbi->stat_lock); + sbi->unusable_block_count = 0; + spin_unlock(&sbi->stat_lock); + + __set_cp_next_pack(sbi); + + /* + * redirty superblock if metadata like node page or inode cache is + * updated during writing checkpoint. + */ + if (get_pages(sbi, F2FS_DIRTY_NODES) || + get_pages(sbi, F2FS_DIRTY_IMETA)) + set_sbi_flag(sbi, SBI_IS_DIRTY); + + f2fs_bug_on(sbi, get_pages(sbi, F2FS_DIRTY_DENTS)); + + return unlikely(f2fs_cp_error(sbi)) ? -EIO : 0; } -/* - * We guarantee that this checkpoint procedure should not fail. - */ -void write_checkpoint(struct f2fs_sb_info *sbi, bool is_umount) +int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) { struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); unsigned long long ckpt_ver; + int err = 0; + + stat_cp_time(cpc, CP_TIME_START); + + if (f2fs_readonly(sbi->sb) || f2fs_hw_is_readonly(sbi)) + return -EROFS; + + if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) { + if (cpc->reason != CP_PAUSE) + return 0; + f2fs_warn(sbi, "Start checkpoint disabled!"); + } + if (cpc->reason != CP_RESIZE) + f2fs_down_write(&sbi->cp_global_sem); + + stat_cp_time(cpc, CP_TIME_LOCK); + + if (!is_sbi_flag_set(sbi, SBI_IS_DIRTY) && + ((cpc->reason & CP_FASTBOOT) || (cpc->reason & CP_SYNC) || + ((cpc->reason & CP_DISCARD) && !sbi->discard_blks))) + goto out; + if (unlikely(f2fs_cp_error(sbi))) { + err = -EIO; + goto out; + } - trace_f2fs_write_checkpoint(sbi->sb, is_umount, "start block_ops"); + trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, CP_PHASE_START_BLOCK_OPS); - mutex_lock(&sbi->cp_mutex); - 
block_operations(sbi); + err = block_operations(sbi); + if (err) + goto out; + + stat_cp_time(cpc, CP_TIME_OP_LOCK); + + trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, CP_PHASE_FINISH_BLOCK_OPS); - trace_f2fs_write_checkpoint(sbi->sb, is_umount, "finish block_ops"); + f2fs_flush_merged_writes(sbi); - f2fs_submit_bio(sbi, DATA, true); - f2fs_submit_bio(sbi, NODE, true); - f2fs_submit_bio(sbi, META, true); + /* this is the case of multiple fstrims without any changes */ + if (cpc->reason & CP_DISCARD) { + if (!f2fs_exist_trim_candidates(sbi, cpc)) { + unblock_operations(sbi); + goto out; + } + + if (NM_I(sbi)->nat_cnt[DIRTY_NAT] == 0 && + SIT_I(sbi)->dirty_sentries == 0 && + prefree_segments(sbi) == 0) { + f2fs_flush_sit_entries(sbi, cpc); + f2fs_clear_prefree_segments(sbi, cpc); + unblock_operations(sbi); + goto out; + } + } /* * update checkpoint pack index * Increase the version number so that * SIT entries and seg summaries are written at correct place */ - ckpt_ver = le64_to_cpu(ckpt->checkpoint_ver); + ckpt_ver = cur_cp_version(ckpt); ckpt->checkpoint_ver = cpu_to_le64(++ckpt_ver); /* write cached NAT/SIT entries to NAT/SIT area */ - flush_nat_entries(sbi); - flush_sit_entries(sbi); + err = f2fs_flush_nat_entries(sbi, cpc); + if (err) { + f2fs_err(sbi, "f2fs_flush_nat_entries failed err:%d, stop checkpoint", err); + f2fs_bug_on(sbi, !f2fs_cp_error(sbi)); + goto stop; + } + + f2fs_flush_sit_entries(sbi, cpc); + + stat_cp_time(cpc, CP_TIME_FLUSH_META); + + /* save inmem log status */ + f2fs_save_inmem_curseg(sbi); - /* unlock all the fs_lock[] in do_checkpoint() */ - do_checkpoint(sbi, is_umount); + err = do_checkpoint(sbi, cpc); + if (err) { + f2fs_err(sbi, "do_checkpoint failed err:%d, stop checkpoint", err); + f2fs_bug_on(sbi, !f2fs_cp_error(sbi)); + f2fs_release_discard_addrs(sbi); + } else { + f2fs_clear_prefree_segments(sbi, cpc); + } + f2fs_restore_inmem_curseg(sbi); + f2fs_reinit_atgc_curseg(sbi); + stat_inc_cp_count(sbi); +stop: 
unblock_operations(sbi); - mutex_unlock(&sbi->cp_mutex); + stat_cp_time(cpc, CP_TIME_END); + check_cp_time(sbi, cpc); - trace_f2fs_write_checkpoint(sbi->sb, is_umount, "finish checkpoint"); + if (cpc->reason & CP_RECOVERY) + f2fs_notice(sbi, "checkpoint: version = %llx", ckpt_ver); + + /* update CP_TIME to trigger checkpoint periodically */ + f2fs_update_time(sbi, CP_TIME); + trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, CP_PHASE_FINISH_CHECKPOINT); +out: + if (cpc->reason != CP_RESIZE) + f2fs_up_write(&sbi->cp_global_sem); + return err; } -void init_orphan_info(struct f2fs_sb_info *sbi) +void f2fs_init_ino_entry_info(struct f2fs_sb_info *sbi) { - mutex_init(&sbi->orphan_inode_mutex); - INIT_LIST_HEAD(&sbi->orphan_inode_list); - sbi->n_orphans = 0; + int i; + + for (i = 0; i < MAX_INO_ENTRY; i++) { + struct inode_management *im = &sbi->im[i]; + + INIT_RADIX_TREE(&im->ino_root, GFP_ATOMIC); + spin_lock_init(&im->ino_lock); + INIT_LIST_HEAD(&im->ino_list); + im->ino_num = 0; + } + + sbi->max_orphans = (BLKS_PER_SEG(sbi) - F2FS_CP_PACKS - + NR_CURSEG_PERSIST_TYPE - __cp_payload(sbi)) * + F2FS_ORPHANS_PER_BLOCK; } -int __init create_checkpoint_caches(void) +int __init f2fs_create_checkpoint_caches(void) { - orphan_entry_slab = f2fs_kmem_cache_create("f2fs_orphan_entry", - sizeof(struct orphan_inode_entry), NULL); - if (unlikely(!orphan_entry_slab)) + ino_entry_slab = f2fs_kmem_cache_create("f2fs_ino_entry", + sizeof(struct ino_entry)); + if (!ino_entry_slab) return -ENOMEM; - inode_entry_slab = f2fs_kmem_cache_create("f2fs_dirty_dir_entry", - sizeof(struct dir_inode_entry), NULL); - if (unlikely(!inode_entry_slab)) { - kmem_cache_destroy(orphan_entry_slab); + f2fs_inode_entry_slab = f2fs_kmem_cache_create("f2fs_inode_entry", + sizeof(struct inode_entry)); + if (!f2fs_inode_entry_slab) { + kmem_cache_destroy(ino_entry_slab); return -ENOMEM; } return 0; } -void destroy_checkpoint_caches(void) +void f2fs_destroy_checkpoint_caches(void) +{ + 
kmem_cache_destroy(ino_entry_slab); + kmem_cache_destroy(f2fs_inode_entry_slab); +} + +static int __write_checkpoint_sync(struct f2fs_sb_info *sbi) +{ + struct cp_control cpc = { .reason = CP_SYNC, }; + int err; + + f2fs_down_write(&sbi->gc_lock); + err = f2fs_write_checkpoint(sbi, &cpc); + f2fs_up_write(&sbi->gc_lock); + + return err; +} + +static void __checkpoint_and_complete_reqs(struct f2fs_sb_info *sbi) +{ + struct ckpt_req_control *cprc = &sbi->cprc_info; + struct ckpt_req *req, *next; + struct llist_node *dispatch_list; + u64 sum_diff = 0, diff, count = 0; + int ret; + + dispatch_list = llist_del_all(&cprc->issue_list); + if (!dispatch_list) + return; + dispatch_list = llist_reverse_order(dispatch_list); + + ret = __write_checkpoint_sync(sbi); + atomic_inc(&cprc->issued_ckpt); + + llist_for_each_entry_safe(req, next, dispatch_list, llnode) { + diff = (u64)ktime_ms_delta(ktime_get(), req->queue_time); + req->ret = ret; + req->delta_time = diff; + complete(&req->wait); + + sum_diff += diff; + count++; + } + atomic_sub(count, &cprc->queued_ckpt); + atomic_add(count, &cprc->total_ckpt); + + spin_lock(&cprc->stat_lock); + cprc->cur_time = (unsigned int)div64_u64(sum_diff, count); + if (cprc->peak_time < cprc->cur_time) + cprc->peak_time = cprc->cur_time; + spin_unlock(&cprc->stat_lock); +} + +static int issue_checkpoint_thread(void *data) +{ + struct f2fs_sb_info *sbi = data; + struct ckpt_req_control *cprc = &sbi->cprc_info; + wait_queue_head_t *q = &cprc->ckpt_wait_queue; +repeat: + if (kthread_should_stop()) + return 0; + + if (!llist_empty(&cprc->issue_list)) + __checkpoint_and_complete_reqs(sbi); + + wait_event_interruptible(*q, + kthread_should_stop() || !llist_empty(&cprc->issue_list)); + goto repeat; +} + +static void flush_remained_ckpt_reqs(struct f2fs_sb_info *sbi, + struct ckpt_req *wait_req) +{ + struct ckpt_req_control *cprc = &sbi->cprc_info; + + if (!llist_empty(&cprc->issue_list)) { + __checkpoint_and_complete_reqs(sbi); + } else { + /* already 
dispatched by issue_checkpoint_thread */ + if (wait_req) + wait_for_completion(&wait_req->wait); + } +} + +static void init_ckpt_req(struct ckpt_req *req) +{ + memset(req, 0, sizeof(struct ckpt_req)); + + init_completion(&req->wait); + req->queue_time = ktime_get(); +} + +int f2fs_issue_checkpoint(struct f2fs_sb_info *sbi) +{ + struct ckpt_req_control *cprc = &sbi->cprc_info; + struct ckpt_req req; + struct cp_control cpc; + + cpc.reason = __get_cp_reason(sbi); + if (!test_opt(sbi, MERGE_CHECKPOINT) || cpc.reason != CP_SYNC || + sbi->umount_lock_holder == current) { + int ret; + + f2fs_down_write(&sbi->gc_lock); + ret = f2fs_write_checkpoint(sbi, &cpc); + f2fs_up_write(&sbi->gc_lock); + + return ret; + } + + if (!cprc->f2fs_issue_ckpt) + return __write_checkpoint_sync(sbi); + + init_ckpt_req(&req); + + llist_add(&req.llnode, &cprc->issue_list); + atomic_inc(&cprc->queued_ckpt); + + /* + * update issue_list before we wake up issue_checkpoint thread, + * this smp_mb() pairs with another barrier in ___wait_event(), + * see more details in comments of waitqueue_active(). 
+ */ + smp_mb(); + + if (waitqueue_active(&cprc->ckpt_wait_queue)) + wake_up(&cprc->ckpt_wait_queue); + + if (cprc->f2fs_issue_ckpt) + wait_for_completion(&req.wait); + else + flush_remained_ckpt_reqs(sbi, &req); + + if (unlikely(req.delta_time >= CP_LONG_LATENCY_THRESHOLD)) { + f2fs_warn_ratelimited(sbi, + "blocked on checkpoint for %u ms", cprc->peak_time); + dump_stack(); + } + + return req.ret; +} + +int f2fs_start_ckpt_thread(struct f2fs_sb_info *sbi) +{ + dev_t dev = sbi->sb->s_bdev->bd_dev; + struct ckpt_req_control *cprc = &sbi->cprc_info; + + if (cprc->f2fs_issue_ckpt) + return 0; + + cprc->f2fs_issue_ckpt = kthread_run(issue_checkpoint_thread, sbi, + "f2fs_ckpt-%u:%u", MAJOR(dev), MINOR(dev)); + if (IS_ERR(cprc->f2fs_issue_ckpt)) { + int err = PTR_ERR(cprc->f2fs_issue_ckpt); + + cprc->f2fs_issue_ckpt = NULL; + return err; + } + + set_task_ioprio(cprc->f2fs_issue_ckpt, cprc->ckpt_thread_ioprio); + + return 0; +} + +void f2fs_stop_ckpt_thread(struct f2fs_sb_info *sbi) +{ + struct ckpt_req_control *cprc = &sbi->cprc_info; + struct task_struct *ckpt_task; + + if (!cprc->f2fs_issue_ckpt) + return; + + ckpt_task = cprc->f2fs_issue_ckpt; + cprc->f2fs_issue_ckpt = NULL; + kthread_stop(ckpt_task); + + f2fs_flush_ckpt_thread(sbi); +} + +void f2fs_flush_ckpt_thread(struct f2fs_sb_info *sbi) +{ + struct ckpt_req_control *cprc = &sbi->cprc_info; + + flush_remained_ckpt_reqs(sbi, NULL); + + /* Let's wait for the previous dispatched checkpoint. 
*/ + while (atomic_read(&cprc->queued_ckpt)) + io_schedule_timeout(DEFAULT_SCHEDULE_TIMEOUT); +} + +void f2fs_init_ckpt_req_control(struct f2fs_sb_info *sbi) { - kmem_cache_destroy(orphan_entry_slab); - kmem_cache_destroy(inode_entry_slab); + struct ckpt_req_control *cprc = &sbi->cprc_info; + + atomic_set(&cprc->issued_ckpt, 0); + atomic_set(&cprc->total_ckpt, 0); + atomic_set(&cprc->queued_ckpt, 0); + cprc->ckpt_thread_ioprio = DEFAULT_CHECKPOINT_IOPRIO; + init_waitqueue_head(&cprc->ckpt_wait_queue); + init_llist_head(&cprc->issue_list); + spin_lock_init(&cprc->stat_lock); } diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c new file mode 100644 index 000000000000..7b68bf22989d --- /dev/null +++ b/fs/f2fs/compress.c @@ -0,0 +1,2101 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * f2fs compress support + * + * Copyright (c) 2019 Chao Yu <chao@kernel.org> + */ + +#include <linux/fs.h> +#include <linux/f2fs_fs.h> +#include <linux/moduleparam.h> +#include <linux/writeback.h> +#include <linux/backing-dev.h> +#include <linux/lzo.h> +#include <linux/lz4.h> +#include <linux/zstd.h> +#include <linux/pagevec.h> + +#include "f2fs.h" +#include "node.h" +#include "segment.h" +#include <trace/events/f2fs.h> + +static struct kmem_cache *cic_entry_slab; +static struct kmem_cache *dic_entry_slab; + +static void *page_array_alloc(struct f2fs_sb_info *sbi, int nr) +{ + unsigned int size = sizeof(struct page *) * nr; + + if (likely(size <= sbi->page_array_slab_size)) + return f2fs_kmem_cache_alloc(sbi->page_array_slab, + GFP_F2FS_ZERO, false, sbi); + return f2fs_kzalloc(sbi, size, GFP_NOFS); +} + +static void page_array_free(struct f2fs_sb_info *sbi, void *pages, int nr) +{ + unsigned int size = sizeof(struct page *) * nr; + + if (!pages) + return; + + if (likely(size <= sbi->page_array_slab_size)) + kmem_cache_free(sbi->page_array_slab, pages); + else + kfree(pages); +} + +struct f2fs_compress_ops { + int (*init_compress_ctx)(struct compress_ctx *cc); + void 
(*destroy_compress_ctx)(struct compress_ctx *cc); + int (*compress_pages)(struct compress_ctx *cc); + int (*init_decompress_ctx)(struct decompress_io_ctx *dic); + void (*destroy_decompress_ctx)(struct decompress_io_ctx *dic); + int (*decompress_pages)(struct decompress_io_ctx *dic); + bool (*is_level_valid)(int level); +}; + +static unsigned int offset_in_cluster(struct compress_ctx *cc, pgoff_t index) +{ + return index & (cc->cluster_size - 1); +} + +static pgoff_t cluster_idx(struct compress_ctx *cc, pgoff_t index) +{ + return index >> cc->log_cluster_size; +} + +static pgoff_t start_idx_of_cluster(struct compress_ctx *cc) +{ + return cc->cluster_idx << cc->log_cluster_size; +} + +bool f2fs_is_compressed_page(struct folio *folio) +{ + if (!folio->private) + return false; + if (folio_test_f2fs_nonpointer(folio)) + return false; + + f2fs_bug_on(F2FS_F_SB(folio), + *((u32 *)folio->private) != F2FS_COMPRESSED_PAGE_MAGIC); + return true; +} + +static void f2fs_set_compressed_page(struct page *page, + struct inode *inode, pgoff_t index, void *data) +{ + struct folio *folio = page_folio(page); + + folio_attach_private(folio, (void *)data); + + /* i_crypto_info and iv index */ + folio->index = index; + folio->mapping = inode->i_mapping; +} + +static void f2fs_drop_rpages(struct compress_ctx *cc, int len, bool unlock) +{ + int i; + + for (i = 0; i < len; i++) { + if (!cc->rpages[i]) + continue; + if (unlock) + unlock_page(cc->rpages[i]); + else + put_page(cc->rpages[i]); + } +} + +static void f2fs_put_rpages(struct compress_ctx *cc) +{ + f2fs_drop_rpages(cc, cc->cluster_size, false); +} + +static void f2fs_unlock_rpages(struct compress_ctx *cc, int len) +{ + f2fs_drop_rpages(cc, len, true); +} + +static void f2fs_put_rpages_wbc(struct compress_ctx *cc, + struct writeback_control *wbc, bool redirty, bool unlock) +{ + unsigned int i; + + for (i = 0; i < cc->cluster_size; i++) { + if (!cc->rpages[i]) + continue; + if (redirty) + redirty_page_for_writepage(wbc, 
cc->rpages[i]); + f2fs_put_page(cc->rpages[i], unlock); + } +} + +struct folio *f2fs_compress_control_folio(struct folio *folio) +{ + struct compress_io_ctx *ctx = folio->private; + + return page_folio(ctx->rpages[0]); +} + +int f2fs_init_compress_ctx(struct compress_ctx *cc) +{ + if (cc->rpages) + return 0; + + cc->rpages = page_array_alloc(F2FS_I_SB(cc->inode), cc->cluster_size); + return cc->rpages ? 0 : -ENOMEM; +} + +void f2fs_destroy_compress_ctx(struct compress_ctx *cc, bool reuse) +{ + page_array_free(F2FS_I_SB(cc->inode), cc->rpages, cc->cluster_size); + cc->rpages = NULL; + cc->nr_rpages = 0; + cc->nr_cpages = 0; + cc->valid_nr_cpages = 0; + if (!reuse) + cc->cluster_idx = NULL_CLUSTER; +} + +void f2fs_compress_ctx_add_page(struct compress_ctx *cc, struct folio *folio) +{ + unsigned int cluster_ofs; + + if (!f2fs_cluster_can_merge_page(cc, folio->index)) + f2fs_bug_on(F2FS_I_SB(cc->inode), 1); + + cluster_ofs = offset_in_cluster(cc, folio->index); + cc->rpages[cluster_ofs] = folio_page(folio, 0); + cc->nr_rpages++; + cc->cluster_idx = cluster_idx(cc, folio->index); +} + +#ifdef CONFIG_F2FS_FS_LZO +static int lzo_init_compress_ctx(struct compress_ctx *cc) +{ + cc->private = f2fs_vmalloc(F2FS_I_SB(cc->inode), + LZO1X_MEM_COMPRESS); + if (!cc->private) + return -ENOMEM; + + cc->clen = lzo1x_worst_compress(PAGE_SIZE << cc->log_cluster_size); + return 0; +} + +static void lzo_destroy_compress_ctx(struct compress_ctx *cc) +{ + vfree(cc->private); + cc->private = NULL; +} + +static int lzo_compress_pages(struct compress_ctx *cc) +{ + int ret; + + ret = lzo1x_1_compress(cc->rbuf, cc->rlen, cc->cbuf->cdata, + &cc->clen, cc->private); + if (ret != LZO_E_OK) { + f2fs_err_ratelimited(F2FS_I_SB(cc->inode), + "lzo compress failed, ret:%d", ret); + return -EIO; + } + return 0; +} + +static int lzo_decompress_pages(struct decompress_io_ctx *dic) +{ + int ret; + + ret = lzo1x_decompress_safe(dic->cbuf->cdata, dic->clen, + dic->rbuf, &dic->rlen); + if (ret != LZO_E_OK) { + 
f2fs_err_ratelimited(dic->sbi, + "lzo decompress failed, ret:%d", ret); + return -EIO; + } + + if (dic->rlen != PAGE_SIZE << dic->log_cluster_size) { + f2fs_err_ratelimited(dic->sbi, + "lzo invalid rlen:%zu, expected:%lu", + dic->rlen, PAGE_SIZE << dic->log_cluster_size); + return -EIO; + } + return 0; +} + +static const struct f2fs_compress_ops f2fs_lzo_ops = { + .init_compress_ctx = lzo_init_compress_ctx, + .destroy_compress_ctx = lzo_destroy_compress_ctx, + .compress_pages = lzo_compress_pages, + .decompress_pages = lzo_decompress_pages, +}; +#endif + +#ifdef CONFIG_F2FS_FS_LZ4 +static int lz4_init_compress_ctx(struct compress_ctx *cc) +{ + unsigned int size = LZ4_MEM_COMPRESS; + +#ifdef CONFIG_F2FS_FS_LZ4HC + if (F2FS_I(cc->inode)->i_compress_level) + size = LZ4HC_MEM_COMPRESS; +#endif + + cc->private = f2fs_vmalloc(F2FS_I_SB(cc->inode), size); + if (!cc->private) + return -ENOMEM; + + /* + * we do not change cc->clen to LZ4_compressBound(inputsize) to + * adapt worst compress case, because lz4 compressor can handle + * output budget properly. 
+ */ + cc->clen = cc->rlen - PAGE_SIZE - COMPRESS_HEADER_SIZE; + return 0; +} + +static void lz4_destroy_compress_ctx(struct compress_ctx *cc) +{ + vfree(cc->private); + cc->private = NULL; +} + +static int lz4_compress_pages(struct compress_ctx *cc) +{ + int len = -EINVAL; + unsigned char level = F2FS_I(cc->inode)->i_compress_level; + + if (!level) + len = LZ4_compress_default(cc->rbuf, cc->cbuf->cdata, cc->rlen, + cc->clen, cc->private); +#ifdef CONFIG_F2FS_FS_LZ4HC + else + len = LZ4_compress_HC(cc->rbuf, cc->cbuf->cdata, cc->rlen, + cc->clen, level, cc->private); +#endif + if (len < 0) + return len; + if (!len) + return -EAGAIN; + + cc->clen = len; + return 0; +} + +static int lz4_decompress_pages(struct decompress_io_ctx *dic) +{ + int ret; + + ret = LZ4_decompress_safe(dic->cbuf->cdata, dic->rbuf, + dic->clen, dic->rlen); + if (ret < 0) { + f2fs_err_ratelimited(dic->sbi, + "lz4 decompress failed, ret:%d", ret); + return -EIO; + } + + if (ret != PAGE_SIZE << dic->log_cluster_size) { + f2fs_err_ratelimited(dic->sbi, + "lz4 invalid ret:%d, expected:%lu", + ret, PAGE_SIZE << dic->log_cluster_size); + return -EIO; + } + return 0; +} + +static bool lz4_is_level_valid(int lvl) +{ +#ifdef CONFIG_F2FS_FS_LZ4HC + return !lvl || (lvl >= LZ4HC_MIN_CLEVEL && lvl <= LZ4HC_MAX_CLEVEL); +#else + return lvl == 0; +#endif +} + +static const struct f2fs_compress_ops f2fs_lz4_ops = { + .init_compress_ctx = lz4_init_compress_ctx, + .destroy_compress_ctx = lz4_destroy_compress_ctx, + .compress_pages = lz4_compress_pages, + .decompress_pages = lz4_decompress_pages, + .is_level_valid = lz4_is_level_valid, +}; +#endif + +#ifdef CONFIG_F2FS_FS_ZSTD +static int zstd_init_compress_ctx(struct compress_ctx *cc) +{ + zstd_parameters params; + zstd_cstream *stream; + void *workspace; + unsigned int workspace_size; + unsigned char level = F2FS_I(cc->inode)->i_compress_level; + + /* Need to remain this for backward compatibility */ + if (!level) + level = F2FS_ZSTD_DEFAULT_CLEVEL; + + params 
= zstd_get_params(level, cc->rlen); + workspace_size = zstd_cstream_workspace_bound(&params.cParams); + + workspace = f2fs_vmalloc(F2FS_I_SB(cc->inode), workspace_size); + if (!workspace) + return -ENOMEM; + + stream = zstd_init_cstream(&params, 0, workspace, workspace_size); + if (!stream) { + f2fs_err_ratelimited(F2FS_I_SB(cc->inode), + "%s zstd_init_cstream failed", __func__); + vfree(workspace); + return -EIO; + } + + cc->private = workspace; + cc->private2 = stream; + + cc->clen = cc->rlen - PAGE_SIZE - COMPRESS_HEADER_SIZE; + return 0; +} + +static void zstd_destroy_compress_ctx(struct compress_ctx *cc) +{ + vfree(cc->private); + cc->private = NULL; + cc->private2 = NULL; +} + +static int zstd_compress_pages(struct compress_ctx *cc) +{ + zstd_cstream *stream = cc->private2; + zstd_in_buffer inbuf; + zstd_out_buffer outbuf; + int src_size = cc->rlen; + int dst_size = src_size - PAGE_SIZE - COMPRESS_HEADER_SIZE; + int ret; + + inbuf.pos = 0; + inbuf.src = cc->rbuf; + inbuf.size = src_size; + + outbuf.pos = 0; + outbuf.dst = cc->cbuf->cdata; + outbuf.size = dst_size; + + ret = zstd_compress_stream(stream, &outbuf, &inbuf); + if (zstd_is_error(ret)) { + f2fs_err_ratelimited(F2FS_I_SB(cc->inode), + "%s zstd_compress_stream failed, ret: %d", + __func__, zstd_get_error_code(ret)); + return -EIO; + } + + ret = zstd_end_stream(stream, &outbuf); + if (zstd_is_error(ret)) { + f2fs_err_ratelimited(F2FS_I_SB(cc->inode), + "%s zstd_end_stream returned %d", + __func__, zstd_get_error_code(ret)); + return -EIO; + } + + /* + * there is compressed data remained in intermediate buffer due to + * no more space in cbuf.cdata + */ + if (ret) + return -EAGAIN; + + cc->clen = outbuf.pos; + return 0; +} + +static int zstd_init_decompress_ctx(struct decompress_io_ctx *dic) +{ + zstd_dstream *stream; + void *workspace; + unsigned int workspace_size; + unsigned int max_window_size = + MAX_COMPRESS_WINDOW_SIZE(dic->log_cluster_size); + + workspace_size =
zstd_dstream_workspace_bound(max_window_size); + + workspace = f2fs_vmalloc(dic->sbi, workspace_size); + if (!workspace) + return -ENOMEM; + + stream = zstd_init_dstream(max_window_size, workspace, workspace_size); + if (!stream) { + f2fs_err_ratelimited(dic->sbi, + "%s zstd_init_dstream failed", __func__); + vfree(workspace); + return -EIO; + } + + dic->private = workspace; + dic->private2 = stream; + + return 0; +} + +static void zstd_destroy_decompress_ctx(struct decompress_io_ctx *dic) +{ + vfree(dic->private); + dic->private = NULL; + dic->private2 = NULL; +} + +static int zstd_decompress_pages(struct decompress_io_ctx *dic) +{ + zstd_dstream *stream = dic->private2; + zstd_in_buffer inbuf; + zstd_out_buffer outbuf; + int ret; + + inbuf.pos = 0; + inbuf.src = dic->cbuf->cdata; + inbuf.size = dic->clen; + + outbuf.pos = 0; + outbuf.dst = dic->rbuf; + outbuf.size = dic->rlen; + + ret = zstd_decompress_stream(stream, &outbuf, &inbuf); + if (zstd_is_error(ret)) { + f2fs_err_ratelimited(dic->sbi, + "%s zstd_decompress_stream failed, ret: %d", + __func__, zstd_get_error_code(ret)); + return -EIO; + } + + if (dic->rlen != outbuf.pos) { + f2fs_err_ratelimited(dic->sbi, + "%s ZSTD invalid rlen:%zu, expected:%lu", + __func__, dic->rlen, + PAGE_SIZE << dic->log_cluster_size); + return -EIO; + } + + return 0; +} + +static bool zstd_is_level_valid(int lvl) +{ + return lvl >= zstd_min_clevel() && lvl <= zstd_max_clevel(); +} + +static const struct f2fs_compress_ops f2fs_zstd_ops = { + .init_compress_ctx = zstd_init_compress_ctx, + .destroy_compress_ctx = zstd_destroy_compress_ctx, + .compress_pages = zstd_compress_pages, + .init_decompress_ctx = zstd_init_decompress_ctx, + .destroy_decompress_ctx = zstd_destroy_decompress_ctx, + .decompress_pages = zstd_decompress_pages, + .is_level_valid = zstd_is_level_valid, +}; +#endif + +#ifdef CONFIG_F2FS_FS_LZO +#ifdef CONFIG_F2FS_FS_LZORLE +static int lzorle_compress_pages(struct compress_ctx *cc) +{ + int ret; + + ret = 
lzorle1x_1_compress(cc->rbuf, cc->rlen, cc->cbuf->cdata, + &cc->clen, cc->private); + if (ret != LZO_E_OK) { + f2fs_err_ratelimited(F2FS_I_SB(cc->inode), + "lzo-rle compress failed, ret:%d", ret); + return -EIO; + } + return 0; +} + +static const struct f2fs_compress_ops f2fs_lzorle_ops = { + .init_compress_ctx = lzo_init_compress_ctx, + .destroy_compress_ctx = lzo_destroy_compress_ctx, + .compress_pages = lzorle_compress_pages, + .decompress_pages = lzo_decompress_pages, +}; +#endif +#endif + +static const struct f2fs_compress_ops *f2fs_cops[COMPRESS_MAX] = { +#ifdef CONFIG_F2FS_FS_LZO + &f2fs_lzo_ops, +#else + NULL, +#endif +#ifdef CONFIG_F2FS_FS_LZ4 + &f2fs_lz4_ops, +#else + NULL, +#endif +#ifdef CONFIG_F2FS_FS_ZSTD + &f2fs_zstd_ops, +#else + NULL, +#endif +#if defined(CONFIG_F2FS_FS_LZO) && defined(CONFIG_F2FS_FS_LZORLE) + &f2fs_lzorle_ops, +#else + NULL, +#endif +}; + +bool f2fs_is_compress_backend_ready(struct inode *inode) +{ + if (!f2fs_compressed_file(inode)) + return true; + return f2fs_cops[F2FS_I(inode)->i_compress_algorithm]; +} + +bool f2fs_is_compress_level_valid(int alg, int lvl) +{ + const struct f2fs_compress_ops *cops = f2fs_cops[alg]; + + if (cops->is_level_valid) + return cops->is_level_valid(lvl); + + return lvl == 0; +} + +static mempool_t *compress_page_pool; +static int num_compress_pages = 512; +module_param(num_compress_pages, uint, 0444); +MODULE_PARM_DESC(num_compress_pages, + "Number of intermediate compress pages to preallocate"); + +int __init f2fs_init_compress_mempool(void) +{ + compress_page_pool = mempool_create_page_pool(num_compress_pages, 0); + return compress_page_pool ? 
0 : -ENOMEM; +} + +void f2fs_destroy_compress_mempool(void) +{ + mempool_destroy(compress_page_pool); +} + +static struct page *f2fs_compress_alloc_page(void) +{ + struct page *page; + + page = mempool_alloc(compress_page_pool, GFP_NOFS); + lock_page(page); + + return page; +} + +static void f2fs_compress_free_page(struct page *page) +{ + struct folio *folio; + + if (!page) + return; + folio = page_folio(page); + folio_detach_private(folio); + folio->mapping = NULL; + folio_unlock(folio); + mempool_free(page, compress_page_pool); +} + +#define MAX_VMAP_RETRIES 3 + +static void *f2fs_vmap(struct page **pages, unsigned int count) +{ + int i; + void *buf = NULL; + + for (i = 0; i < MAX_VMAP_RETRIES; i++) { + buf = vm_map_ram(pages, count, -1); + if (buf) + break; + vm_unmap_aliases(); + } + return buf; +} + +static int f2fs_compress_pages(struct compress_ctx *cc) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(cc->inode); + struct f2fs_inode_info *fi = F2FS_I(cc->inode); + const struct f2fs_compress_ops *cops = + f2fs_cops[fi->i_compress_algorithm]; + unsigned int max_len, new_nr_cpages; + u32 chksum = 0; + int i, ret; + + trace_f2fs_compress_pages_start(cc->inode, cc->cluster_idx, + cc->cluster_size, fi->i_compress_algorithm); + + if (cops->init_compress_ctx) { + ret = cops->init_compress_ctx(cc); + if (ret) + goto out; + } + + max_len = COMPRESS_HEADER_SIZE + cc->clen; + cc->nr_cpages = DIV_ROUND_UP(max_len, PAGE_SIZE); + cc->valid_nr_cpages = cc->nr_cpages; + + cc->cpages = page_array_alloc(sbi, cc->nr_cpages); + if (!cc->cpages) { + ret = -ENOMEM; + goto destroy_compress_ctx; + } + + for (i = 0; i < cc->nr_cpages; i++) + cc->cpages[i] = f2fs_compress_alloc_page(); + + cc->rbuf = f2fs_vmap(cc->rpages, cc->cluster_size); + if (!cc->rbuf) { + ret = -ENOMEM; + goto out_free_cpages; + } + + cc->cbuf = f2fs_vmap(cc->cpages, cc->nr_cpages); + if (!cc->cbuf) { + ret = -ENOMEM; + goto out_vunmap_rbuf; + } + + ret = cops->compress_pages(cc); + if (ret) + goto out_vunmap_cbuf; + + 
max_len = PAGE_SIZE * (cc->cluster_size - 1) - COMPRESS_HEADER_SIZE; + + if (cc->clen > max_len) { + ret = -EAGAIN; + goto out_vunmap_cbuf; + } + + cc->cbuf->clen = cpu_to_le32(cc->clen); + + if (fi->i_compress_flag & BIT(COMPRESS_CHKSUM)) + chksum = f2fs_crc32(cc->cbuf->cdata, cc->clen); + cc->cbuf->chksum = cpu_to_le32(chksum); + + for (i = 0; i < COMPRESS_DATA_RESERVED_SIZE; i++) + cc->cbuf->reserved[i] = cpu_to_le32(0); + + new_nr_cpages = DIV_ROUND_UP(cc->clen + COMPRESS_HEADER_SIZE, PAGE_SIZE); + + /* zero out any unused part of the last page */ + memset(&cc->cbuf->cdata[cc->clen], 0, + (new_nr_cpages * PAGE_SIZE) - + (cc->clen + COMPRESS_HEADER_SIZE)); + + vm_unmap_ram(cc->cbuf, cc->nr_cpages); + vm_unmap_ram(cc->rbuf, cc->cluster_size); + + for (i = new_nr_cpages; i < cc->nr_cpages; i++) { + f2fs_compress_free_page(cc->cpages[i]); + cc->cpages[i] = NULL; + } + + if (cops->destroy_compress_ctx) + cops->destroy_compress_ctx(cc); + + cc->valid_nr_cpages = new_nr_cpages; + + trace_f2fs_compress_pages_end(cc->inode, cc->cluster_idx, + cc->clen, ret); + return 0; + +out_vunmap_cbuf: + vm_unmap_ram(cc->cbuf, cc->nr_cpages); +out_vunmap_rbuf: + vm_unmap_ram(cc->rbuf, cc->cluster_size); +out_free_cpages: + for (i = 0; i < cc->nr_cpages; i++) { + if (cc->cpages[i]) + f2fs_compress_free_page(cc->cpages[i]); + } + page_array_free(sbi, cc->cpages, cc->nr_cpages); + cc->cpages = NULL; +destroy_compress_ctx: + if (cops->destroy_compress_ctx) + cops->destroy_compress_ctx(cc); +out: + trace_f2fs_compress_pages_end(cc->inode, cc->cluster_idx, + cc->clen, ret); + return ret; +} + +static int f2fs_prepare_decomp_mem(struct decompress_io_ctx *dic, + bool pre_alloc); +static void f2fs_release_decomp_mem(struct decompress_io_ctx *dic, + bool bypass_destroy_callback, bool pre_alloc); + +void f2fs_decompress_cluster(struct decompress_io_ctx *dic, bool in_task) +{ + struct f2fs_sb_info *sbi = dic->sbi; + struct f2fs_inode_info *fi = F2FS_I(dic->inode); + const struct 
f2fs_compress_ops *cops = + f2fs_cops[fi->i_compress_algorithm]; + bool bypass_callback = false; + int ret; + + trace_f2fs_decompress_pages_start(dic->inode, dic->cluster_idx, + dic->cluster_size, fi->i_compress_algorithm); + + if (dic->failed) { + ret = -EIO; + goto out_end_io; + } + + ret = f2fs_prepare_decomp_mem(dic, false); + if (ret) { + bypass_callback = true; + goto out_release; + } + + dic->clen = le32_to_cpu(dic->cbuf->clen); + dic->rlen = PAGE_SIZE << dic->log_cluster_size; + + if (dic->clen > PAGE_SIZE * dic->nr_cpages - COMPRESS_HEADER_SIZE) { + ret = -EFSCORRUPTED; + + /* Avoid f2fs_commit_super in irq context */ + f2fs_handle_error(sbi, ERROR_FAIL_DECOMPRESSION); + goto out_release; + } + + ret = cops->decompress_pages(dic); + + if (!ret && (fi->i_compress_flag & BIT(COMPRESS_CHKSUM))) { + u32 provided = le32_to_cpu(dic->cbuf->chksum); + u32 calculated = f2fs_crc32(dic->cbuf->cdata, dic->clen); + + if (provided != calculated) { + if (!is_inode_flag_set(dic->inode, FI_COMPRESS_CORRUPT)) { + set_inode_flag(dic->inode, FI_COMPRESS_CORRUPT); + f2fs_info_ratelimited(sbi, + "checksum invalid, nid = %lu, %x vs %x", + dic->inode->i_ino, + provided, calculated); + } + set_sbi_flag(sbi, SBI_NEED_FSCK); + } + } + +out_release: + f2fs_release_decomp_mem(dic, bypass_callback, false); + +out_end_io: + trace_f2fs_decompress_pages_end(dic->inode, dic->cluster_idx, + dic->clen, ret); + f2fs_decompress_end_io(dic, ret, in_task); +} + +static void f2fs_cache_compressed_page(struct f2fs_sb_info *sbi, + struct folio *folio, nid_t ino, block_t blkaddr); + +/* + * This is called when a page of a compressed cluster has been read from disk + * (or failed to be read from disk). It checks whether this page was the last + * page being waited on in the cluster, and if so, it decompresses the cluster + * (or in the case of a failure, cleans up without actually decompressing). 
+ */ +void f2fs_end_read_compressed_page(struct folio *folio, bool failed, + block_t blkaddr, bool in_task) +{ + struct decompress_io_ctx *dic = folio->private; + struct f2fs_sb_info *sbi = dic->sbi; + + dec_page_count(sbi, F2FS_RD_DATA); + + if (failed) + WRITE_ONCE(dic->failed, true); + else if (blkaddr && in_task) + f2fs_cache_compressed_page(sbi, folio, + dic->inode->i_ino, blkaddr); + + if (atomic_dec_and_test(&dic->remaining_pages)) + f2fs_decompress_cluster(dic, in_task); +} + +static bool is_page_in_cluster(struct compress_ctx *cc, pgoff_t index) +{ + if (cc->cluster_idx == NULL_CLUSTER) + return true; + return cc->cluster_idx == cluster_idx(cc, index); +} + +bool f2fs_cluster_is_empty(struct compress_ctx *cc) +{ + return cc->nr_rpages == 0; +} + +static bool f2fs_cluster_is_full(struct compress_ctx *cc) +{ + return cc->cluster_size == cc->nr_rpages; +} + +bool f2fs_cluster_can_merge_page(struct compress_ctx *cc, pgoff_t index) +{ + if (f2fs_cluster_is_empty(cc)) + return true; + return is_page_in_cluster(cc, index); +} + +bool f2fs_all_cluster_page_ready(struct compress_ctx *cc, struct page **pages, + int index, int nr_pages, bool uptodate) +{ + unsigned long pgidx = page_folio(pages[index])->index; + int i = uptodate ? 0 : 1; + + /* + * when uptodate set to true, try to check all pages in cluster is + * uptodate or not. 
+ */ + if (uptodate && (pgidx % cc->cluster_size)) + return false; + + if (nr_pages - index < cc->cluster_size) + return false; + + for (; i < cc->cluster_size; i++) { + struct folio *folio = page_folio(pages[index + i]); + + if (folio->index != pgidx + i) + return false; + if (uptodate && !folio_test_uptodate(folio)) + return false; + } + + return true; +} + +static bool cluster_has_invalid_data(struct compress_ctx *cc) +{ + loff_t i_size = i_size_read(cc->inode); + unsigned nr_pages = DIV_ROUND_UP(i_size, PAGE_SIZE); + int i; + + for (i = 0; i < cc->cluster_size; i++) { + struct page *page = cc->rpages[i]; + + f2fs_bug_on(F2FS_I_SB(cc->inode), !page); + + /* beyond EOF */ + if (page_folio(page)->index >= nr_pages) + return true; + } + return false; +} + +bool f2fs_sanity_check_cluster(struct dnode_of_data *dn) +{ +#ifdef CONFIG_F2FS_CHECK_FS + struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); + unsigned int cluster_size = F2FS_I(dn->inode)->i_cluster_size; + int cluster_end = 0; + unsigned int count; + int i; + char *reason = ""; + + if (dn->data_blkaddr != COMPRESS_ADDR) + return false; + + /* [..., COMPR_ADDR, ...] 
*/ + if (dn->ofs_in_node % cluster_size) { + reason = "[*|C|*|*]"; + goto out; + } + + for (i = 1, count = 1; i < cluster_size; i++, count++) { + block_t blkaddr = data_blkaddr(dn->inode, dn->node_folio, + dn->ofs_in_node + i); + + /* [COMPR_ADDR, ..., COMPR_ADDR] */ + if (blkaddr == COMPRESS_ADDR) { + reason = "[C|*|C|*]"; + goto out; + } + if (!__is_valid_data_blkaddr(blkaddr)) { + if (!cluster_end) + cluster_end = i; + continue; + } + /* [COMPR_ADDR, NULL_ADDR or NEW_ADDR, valid_blkaddr] */ + if (cluster_end) { + reason = "[C|N|N|V]"; + goto out; + } + } + + f2fs_bug_on(F2FS_I_SB(dn->inode), count != cluster_size && + !is_inode_flag_set(dn->inode, FI_COMPRESS_RELEASED)); + + return false; +out: + f2fs_warn(sbi, "access invalid cluster, ino:%lu, nid:%u, ofs_in_node:%u, reason:%s", + dn->inode->i_ino, dn->nid, dn->ofs_in_node, reason); + set_sbi_flag(sbi, SBI_NEED_FSCK); + return true; +#else + return false; +#endif +} + +static int __f2fs_get_cluster_blocks(struct inode *inode, + struct dnode_of_data *dn) +{ + unsigned int cluster_size = F2FS_I(inode)->i_cluster_size; + int count, i; + + for (i = 0, count = 0; i < cluster_size; i++) { + block_t blkaddr = data_blkaddr(dn->inode, dn->node_folio, + dn->ofs_in_node + i); + + if (__is_valid_data_blkaddr(blkaddr)) + count++; + } + + return count; +} + +static int __f2fs_cluster_blocks(struct inode *inode, unsigned int cluster_idx, + enum cluster_check_type type) +{ + struct dnode_of_data dn; + unsigned int start_idx = cluster_idx << + F2FS_I(inode)->i_log_cluster_size; + int ret; + + set_new_dnode(&dn, inode, NULL, NULL, 0); + ret = f2fs_get_dnode_of_data(&dn, start_idx, LOOKUP_NODE); + if (ret) { + if (ret == -ENOENT) + ret = 0; + goto fail; + } + + if (f2fs_sanity_check_cluster(&dn)) { + ret = -EFSCORRUPTED; + goto fail; + } + + if (dn.data_blkaddr == COMPRESS_ADDR) { + if (type == CLUSTER_COMPR_BLKS) + ret = 1 + __f2fs_get_cluster_blocks(inode, &dn); + else if (type == CLUSTER_IS_COMPR) + ret = 1; + } else if (type 
== CLUSTER_RAW_BLKS) { + ret = __f2fs_get_cluster_blocks(inode, &dn); + } +fail: + f2fs_put_dnode(&dn); + return ret; +} + +/* return # of compressed blocks in compressed cluster */ +static int f2fs_compressed_blocks(struct compress_ctx *cc) +{ + return __f2fs_cluster_blocks(cc->inode, cc->cluster_idx, + CLUSTER_COMPR_BLKS); +} + +/* return # of raw blocks in non-compressed cluster */ +static int f2fs_decompressed_blocks(struct inode *inode, + unsigned int cluster_idx) +{ + return __f2fs_cluster_blocks(inode, cluster_idx, + CLUSTER_RAW_BLKS); +} + +/* return whether cluster is compressed one or not */ +int f2fs_is_compressed_cluster(struct inode *inode, pgoff_t index) +{ + return __f2fs_cluster_blocks(inode, + index >> F2FS_I(inode)->i_log_cluster_size, + CLUSTER_IS_COMPR); +} + +/* return whether cluster contains non raw blocks or not */ +bool f2fs_is_sparse_cluster(struct inode *inode, pgoff_t index) +{ + unsigned int cluster_idx = index >> F2FS_I(inode)->i_log_cluster_size; + + return f2fs_decompressed_blocks(inode, cluster_idx) != + F2FS_I(inode)->i_cluster_size; +} + +static bool cluster_may_compress(struct compress_ctx *cc) +{ + if (!f2fs_need_compress_data(cc->inode)) + return false; + if (f2fs_is_atomic_file(cc->inode)) + return false; + if (!f2fs_cluster_is_full(cc)) + return false; + if (unlikely(f2fs_cp_error(F2FS_I_SB(cc->inode)))) + return false; + return !cluster_has_invalid_data(cc); +} + +static void set_cluster_writeback(struct compress_ctx *cc) +{ + int i; + + for (i = 0; i < cc->cluster_size; i++) { + if (cc->rpages[i]) + set_page_writeback(cc->rpages[i]); + } +} + +static void cancel_cluster_writeback(struct compress_ctx *cc, + struct compress_io_ctx *cic, int submitted) +{ + int i; + + /* Wait for submitted IOs. 
*/ + if (submitted > 1) { + f2fs_submit_merged_write(F2FS_I_SB(cc->inode), DATA); + while (atomic_read(&cic->pending_pages) != + (cc->valid_nr_cpages - submitted + 1)) + f2fs_io_schedule_timeout(DEFAULT_SCHEDULE_TIMEOUT); + } + + /* Cancel writeback and stay locked. */ + for (i = 0; i < cc->cluster_size; i++) { + if (i < submitted) { + inode_inc_dirty_pages(cc->inode); + lock_page(cc->rpages[i]); + } + clear_page_private_gcing(cc->rpages[i]); + if (folio_test_writeback(page_folio(cc->rpages[i]))) + end_page_writeback(cc->rpages[i]); + } +} + +static void set_cluster_dirty(struct compress_ctx *cc) +{ + int i; + + for (i = 0; i < cc->cluster_size; i++) + if (cc->rpages[i]) { + set_page_dirty(cc->rpages[i]); + set_page_private_gcing(cc->rpages[i]); + } +} + +static int prepare_compress_overwrite(struct compress_ctx *cc, + struct page **pagep, pgoff_t index, void **fsdata) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(cc->inode); + struct address_space *mapping = cc->inode->i_mapping; + struct folio *folio; + sector_t last_block_in_bio; + fgf_t fgp_flag = FGP_LOCK | FGP_WRITE | FGP_CREAT; + pgoff_t start_idx = start_idx_of_cluster(cc); + int i, ret; + +retry: + ret = f2fs_is_compressed_cluster(cc->inode, start_idx); + if (ret <= 0) + return ret; + + ret = f2fs_init_compress_ctx(cc); + if (ret) + return ret; + + /* keep folio reference to avoid page reclaim */ + for (i = 0; i < cc->cluster_size; i++) { + folio = f2fs_filemap_get_folio(mapping, start_idx + i, + fgp_flag, GFP_NOFS); + if (IS_ERR(folio)) { + ret = PTR_ERR(folio); + goto unlock_pages; + } + + if (folio_test_uptodate(folio)) + f2fs_folio_put(folio, true); + else + f2fs_compress_ctx_add_page(cc, folio); + } + + if (!f2fs_cluster_is_empty(cc)) { + struct bio *bio = NULL; + + ret = f2fs_read_multi_pages(cc, &bio, cc->cluster_size, + &last_block_in_bio, NULL, true); + f2fs_put_rpages(cc); + f2fs_destroy_compress_ctx(cc, true); + if (ret) + goto out; + if (bio) + f2fs_submit_read_bio(sbi, bio, DATA); + + ret = 
f2fs_init_compress_ctx(cc); + if (ret) + goto out; + } + + for (i = 0; i < cc->cluster_size; i++) { + f2fs_bug_on(sbi, cc->rpages[i]); + + folio = filemap_lock_folio(mapping, start_idx + i); + if (IS_ERR(folio)) { + /* folio could be truncated */ + goto release_and_retry; + } + + f2fs_folio_wait_writeback(folio, DATA, true, true); + f2fs_compress_ctx_add_page(cc, folio); + + if (!folio_test_uptodate(folio)) { + f2fs_handle_page_eio(sbi, folio, DATA); +release_and_retry: + f2fs_put_rpages(cc); + f2fs_unlock_rpages(cc, i + 1); + f2fs_destroy_compress_ctx(cc, true); + goto retry; + } + } + + if (likely(!ret)) { + *fsdata = cc->rpages; + *pagep = cc->rpages[offset_in_cluster(cc, index)]; + return cc->cluster_size; + } + +unlock_pages: + f2fs_put_rpages(cc); + f2fs_unlock_rpages(cc, i); + f2fs_destroy_compress_ctx(cc, true); +out: + return ret; +} + +int f2fs_prepare_compress_overwrite(struct inode *inode, + struct page **pagep, pgoff_t index, void **fsdata) +{ + struct compress_ctx cc = { + .inode = inode, + .log_cluster_size = F2FS_I(inode)->i_log_cluster_size, + .cluster_size = F2FS_I(inode)->i_cluster_size, + .cluster_idx = index >> F2FS_I(inode)->i_log_cluster_size, + .rpages = NULL, + .nr_rpages = 0, + }; + + return prepare_compress_overwrite(&cc, pagep, index, fsdata); +} + +bool f2fs_compress_write_end(struct inode *inode, void *fsdata, + pgoff_t index, unsigned copied) + +{ + struct compress_ctx cc = { + .inode = inode, + .log_cluster_size = F2FS_I(inode)->i_log_cluster_size, + .cluster_size = F2FS_I(inode)->i_cluster_size, + .rpages = fsdata, + }; + struct folio *folio = page_folio(cc.rpages[0]); + bool first_index = (index == folio->index); + + if (copied) + set_cluster_dirty(&cc); + + f2fs_put_rpages_wbc(&cc, NULL, false, true); + f2fs_destroy_compress_ctx(&cc, false); + + return first_index; +} + +int f2fs_truncate_partial_cluster(struct inode *inode, u64 from, bool lock) +{ + void *fsdata = NULL; + struct page *pagep; + struct page **rpages; + int 
log_cluster_size = F2FS_I(inode)->i_log_cluster_size; + pgoff_t start_idx = from >> (PAGE_SHIFT + log_cluster_size) << + log_cluster_size; + int i; + int err; + + err = f2fs_is_compressed_cluster(inode, start_idx); + if (err < 0) + return err; + + /* truncate normal cluster */ + if (!err) + return f2fs_do_truncate_blocks(inode, from, lock); + + /* truncate compressed cluster */ + err = f2fs_prepare_compress_overwrite(inode, &pagep, + start_idx, &fsdata); + + /* should not be a normal cluster */ + f2fs_bug_on(F2FS_I_SB(inode), err == 0); + + if (err <= 0) + return err; + + rpages = fsdata; + + for (i = (1 << log_cluster_size) - 1; i >= 0; i--) { + struct folio *folio = page_folio(rpages[i]); + loff_t start = (loff_t)folio->index << PAGE_SHIFT; + loff_t offset = from > start ? from - start : 0; + + folio_zero_segment(folio, offset, folio_size(folio)); + + if (from >= start) + break; + } + + f2fs_compress_write_end(inode, fsdata, start_idx, true); + + err = filemap_write_and_wait_range(inode->i_mapping, + round_down(from, 1 << log_cluster_size << PAGE_SHIFT), + LLONG_MAX); + if (err) + return err; + + truncate_pagecache(inode, from); + + return f2fs_do_truncate_blocks(inode, round_up(from, PAGE_SIZE), lock); +} + +static int f2fs_write_compressed_pages(struct compress_ctx *cc, + int *submitted, + struct writeback_control *wbc, + enum iostat_type io_type) +{ + struct inode *inode = cc->inode; + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct f2fs_inode_info *fi = F2FS_I(inode); + struct f2fs_io_info fio = { + .sbi = sbi, + .ino = cc->inode->i_ino, + .type = DATA, + .op = REQ_OP_WRITE, + .op_flags = wbc_to_write_flags(wbc), + .old_blkaddr = NEW_ADDR, + .page = NULL, + .encrypted_page = NULL, + .compressed_page = NULL, + .io_type = io_type, + .io_wbc = wbc, + .encrypted = fscrypt_inode_uses_fs_layer_crypto(cc->inode) ? 
+ 1 : 0, + }; + struct folio *folio; + struct dnode_of_data dn; + struct node_info ni; + struct compress_io_ctx *cic; + pgoff_t start_idx = start_idx_of_cluster(cc); + unsigned int last_index = cc->cluster_size - 1; + loff_t psize; + int i, err; + bool quota_inode = IS_NOQUOTA(inode); + + /* we should bypass data pages to proceed the kworker jobs */ + if (unlikely(f2fs_cp_error(sbi))) { + mapping_set_error(inode->i_mapping, -EIO); + goto out_free; + } + + if (quota_inode) { + /* + * We need to wait for node_write to avoid block allocation during + * checkpoint. This can only happen to quota writes which can cause + * the below discard race condition. + */ + f2fs_down_read(&sbi->node_write); + } else if (!f2fs_trylock_op(sbi)) { + goto out_free; + } + + set_new_dnode(&dn, cc->inode, NULL, NULL, 0); + + err = f2fs_get_dnode_of_data(&dn, start_idx, LOOKUP_NODE); + if (err) + goto out_unlock_op; + + for (i = 0; i < cc->cluster_size; i++) { + if (data_blkaddr(dn.inode, dn.node_folio, + dn.ofs_in_node + i) == NULL_ADDR) + goto out_put_dnode; + } + + folio = page_folio(cc->rpages[last_index]); + psize = folio_next_pos(folio); + + err = f2fs_get_node_info(fio.sbi, dn.nid, &ni, false); + if (err) + goto out_put_dnode; + + fio.version = ni.version; + + cic = f2fs_kmem_cache_alloc(cic_entry_slab, GFP_F2FS_ZERO, false, sbi); + if (!cic) + goto out_put_dnode; + + cic->magic = F2FS_COMPRESSED_PAGE_MAGIC; + cic->inode = inode; + atomic_set(&cic->pending_pages, cc->valid_nr_cpages); + cic->rpages = page_array_alloc(sbi, cc->cluster_size); + if (!cic->rpages) + goto out_put_cic; + + cic->nr_rpages = cc->cluster_size; + + for (i = 0; i < cc->valid_nr_cpages; i++) { + f2fs_set_compressed_page(cc->cpages[i], inode, + page_folio(cc->rpages[i + 1])->index, cic); + fio.compressed_page = cc->cpages[i]; + + fio.old_blkaddr = data_blkaddr(dn.inode, dn.node_folio, + dn.ofs_in_node + i + 1); + + /* wait for GCed page writeback via META_MAPPING */ + f2fs_wait_on_block_writeback(inode, 
fio.old_blkaddr); + + if (fio.encrypted) { + fio.page = cc->rpages[i + 1]; + err = f2fs_encrypt_one_page(&fio); + if (err) + goto out_destroy_crypt; + cc->cpages[i] = fio.encrypted_page; + } + } + + set_cluster_writeback(cc); + + for (i = 0; i < cc->cluster_size; i++) + cic->rpages[i] = cc->rpages[i]; + + for (i = 0; i < cc->cluster_size; i++, dn.ofs_in_node++) { + block_t blkaddr; + + blkaddr = f2fs_data_blkaddr(&dn); + fio.page = cc->rpages[i]; + fio.old_blkaddr = blkaddr; + + /* cluster header */ + if (i == 0) { + if (blkaddr == COMPRESS_ADDR) + fio.compr_blocks++; + if (__is_valid_data_blkaddr(blkaddr)) + f2fs_invalidate_blocks(sbi, blkaddr, 1); + f2fs_update_data_blkaddr(&dn, COMPRESS_ADDR); + goto unlock_continue; + } + + if (fio.compr_blocks && __is_valid_data_blkaddr(blkaddr)) + fio.compr_blocks++; + + if (i > cc->valid_nr_cpages) { + if (__is_valid_data_blkaddr(blkaddr)) { + f2fs_invalidate_blocks(sbi, blkaddr, 1); + f2fs_update_data_blkaddr(&dn, NEW_ADDR); + } + goto unlock_continue; + } + + f2fs_bug_on(fio.sbi, blkaddr == NULL_ADDR); + + if (fio.encrypted) + fio.encrypted_page = cc->cpages[i - 1]; + else + fio.compressed_page = cc->cpages[i - 1]; + + cc->cpages[i - 1] = NULL; + fio.submitted = 0; + f2fs_outplace_write_data(&dn, &fio); + if (unlikely(!fio.submitted)) { + cancel_cluster_writeback(cc, cic, i); + + /* To call fscrypt_finalize_bounce_page */ + i = cc->valid_nr_cpages; + *submitted = 0; + goto out_destroy_crypt; + } + (*submitted)++; +unlock_continue: + inode_dec_dirty_pages(cc->inode); + folio_unlock(fio.folio); + } + + if (fio.compr_blocks) + f2fs_i_compr_blocks_update(inode, fio.compr_blocks - 1, false); + f2fs_i_compr_blocks_update(inode, cc->valid_nr_cpages, true); + add_compr_block_stat(inode, cc->valid_nr_cpages); + + set_inode_flag(cc->inode, FI_APPEND_WRITE); + + f2fs_put_dnode(&dn); + if (quota_inode) + f2fs_up_read(&sbi->node_write); + else + f2fs_unlock_op(sbi); + + spin_lock(&fi->i_size_lock); + if (fi->last_disk_size < psize) + 
fi->last_disk_size = psize; + spin_unlock(&fi->i_size_lock); + + f2fs_put_rpages(cc); + page_array_free(sbi, cc->cpages, cc->nr_cpages); + cc->cpages = NULL; + f2fs_destroy_compress_ctx(cc, false); + return 0; + +out_destroy_crypt: + page_array_free(sbi, cic->rpages, cc->cluster_size); + + for (--i; i >= 0; i--) { + if (!cc->cpages[i]) + continue; + fscrypt_finalize_bounce_page(&cc->cpages[i]); + } +out_put_cic: + kmem_cache_free(cic_entry_slab, cic); +out_put_dnode: + f2fs_put_dnode(&dn); +out_unlock_op: + if (quota_inode) + f2fs_up_read(&sbi->node_write); + else + f2fs_unlock_op(sbi); +out_free: + for (i = 0; i < cc->valid_nr_cpages; i++) { + f2fs_compress_free_page(cc->cpages[i]); + cc->cpages[i] = NULL; + } + page_array_free(sbi, cc->cpages, cc->nr_cpages); + cc->cpages = NULL; + return -EAGAIN; +} + +void f2fs_compress_write_end_io(struct bio *bio, struct folio *folio) +{ + struct page *page = &folio->page; + struct f2fs_sb_info *sbi = bio->bi_private; + struct compress_io_ctx *cic = folio->private; + enum count_type type = WB_DATA_TYPE(folio, + f2fs_is_compressed_page(folio)); + int i; + + if (unlikely(bio->bi_status != BLK_STS_OK)) + mapping_set_error(cic->inode->i_mapping, -EIO); + + f2fs_compress_free_page(page); + + dec_page_count(sbi, type); + + if (atomic_dec_return(&cic->pending_pages)) + return; + + for (i = 0; i < cic->nr_rpages; i++) { + WARN_ON(!cic->rpages[i]); + clear_page_private_gcing(cic->rpages[i]); + end_page_writeback(cic->rpages[i]); + } + + page_array_free(sbi, cic->rpages, cic->nr_rpages); + kmem_cache_free(cic_entry_slab, cic); +} + +static int f2fs_write_raw_pages(struct compress_ctx *cc, + int *submitted_p, + struct writeback_control *wbc, + enum iostat_type io_type) +{ + struct address_space *mapping = cc->inode->i_mapping; + struct f2fs_sb_info *sbi = F2FS_M_SB(mapping); + int submitted, compr_blocks, i; + int ret = 0; + + compr_blocks = f2fs_compressed_blocks(cc); + + for (i = 0; i < cc->cluster_size; i++) { + if (!cc->rpages[i]) + 
continue; + + redirty_page_for_writepage(wbc, cc->rpages[i]); + unlock_page(cc->rpages[i]); + } + + if (compr_blocks < 0) + return compr_blocks; + + /* overwrite compressed cluster w/ normal cluster */ + if (compr_blocks > 0) + f2fs_lock_op(sbi); + + for (i = 0; i < cc->cluster_size; i++) { + struct folio *folio; + + if (!cc->rpages[i]) + continue; + folio = page_folio(cc->rpages[i]); +retry_write: + folio_lock(folio); + + if (folio->mapping != mapping) { +continue_unlock: + folio_unlock(folio); + continue; + } + + if (!folio_test_dirty(folio)) + goto continue_unlock; + + if (folio_test_writeback(folio)) { + if (wbc->sync_mode == WB_SYNC_NONE) + goto continue_unlock; + f2fs_folio_wait_writeback(folio, DATA, true, true); + } + + if (!folio_clear_dirty_for_io(folio)) + goto continue_unlock; + + submitted = 0; + ret = f2fs_write_single_data_page(folio, &submitted, + NULL, NULL, wbc, io_type, + compr_blocks, false); + if (ret) { + if (ret == 1) { + ret = 0; + } else if (ret == -EAGAIN) { + ret = 0; + /* + * for quota file, just redirty left pages to + * avoid deadlock caused by cluster update race + * from foreground operation. 
+ */ + if (IS_NOQUOTA(cc->inode)) + goto out; + f2fs_schedule_timeout(DEFAULT_SCHEDULE_TIMEOUT); + goto retry_write; + } + goto out; + } + + *submitted_p += submitted; + } + +out: + if (compr_blocks > 0) + f2fs_unlock_op(sbi); + + f2fs_balance_fs(sbi, true); + return ret; +} + +int f2fs_write_multi_pages(struct compress_ctx *cc, + int *submitted, + struct writeback_control *wbc, + enum iostat_type io_type) +{ + int err; + + *submitted = 0; + if (cluster_may_compress(cc)) { + err = f2fs_compress_pages(cc); + if (err == -EAGAIN) { + add_compr_block_stat(cc->inode, cc->cluster_size); + goto write; + } else if (err) { + f2fs_put_rpages_wbc(cc, wbc, true, true); + goto destroy_out; + } + + err = f2fs_write_compressed_pages(cc, submitted, + wbc, io_type); + if (!err) + return 0; + f2fs_bug_on(F2FS_I_SB(cc->inode), err != -EAGAIN); + } +write: + f2fs_bug_on(F2FS_I_SB(cc->inode), *submitted); + + err = f2fs_write_raw_pages(cc, submitted, wbc, io_type); + f2fs_put_rpages_wbc(cc, wbc, false, false); +destroy_out: + f2fs_destroy_compress_ctx(cc, false); + return err; +} + +static inline bool allow_memalloc_for_decomp(struct f2fs_sb_info *sbi, + bool pre_alloc) +{ + return pre_alloc ^ f2fs_low_mem_mode(sbi); +} + +static int f2fs_prepare_decomp_mem(struct decompress_io_ctx *dic, + bool pre_alloc) +{ + const struct f2fs_compress_ops *cops = f2fs_cops[dic->compress_algorithm]; + int i; + + if (!allow_memalloc_for_decomp(dic->sbi, pre_alloc)) + return 0; + + dic->tpages = page_array_alloc(dic->sbi, dic->cluster_size); + if (!dic->tpages) + return -ENOMEM; + + for (i = 0; i < dic->cluster_size; i++) { + if (dic->rpages[i]) { + dic->tpages[i] = dic->rpages[i]; + continue; + } + + dic->tpages[i] = f2fs_compress_alloc_page(); + } + + dic->rbuf = f2fs_vmap(dic->tpages, dic->cluster_size); + if (!dic->rbuf) + return -ENOMEM; + + dic->cbuf = f2fs_vmap(dic->cpages, dic->nr_cpages); + if (!dic->cbuf) + return -ENOMEM; + + if (cops->init_decompress_ctx) + return 
cops->init_decompress_ctx(dic); + + return 0; +} + +static void f2fs_release_decomp_mem(struct decompress_io_ctx *dic, + bool bypass_destroy_callback, bool pre_alloc) +{ + const struct f2fs_compress_ops *cops = f2fs_cops[dic->compress_algorithm]; + + if (!allow_memalloc_for_decomp(dic->sbi, pre_alloc)) + return; + + if (!bypass_destroy_callback && cops->destroy_decompress_ctx) + cops->destroy_decompress_ctx(dic); + + if (dic->cbuf) + vm_unmap_ram(dic->cbuf, dic->nr_cpages); + + if (dic->rbuf) + vm_unmap_ram(dic->rbuf, dic->cluster_size); +} + +static void f2fs_free_dic(struct decompress_io_ctx *dic, + bool bypass_destroy_callback); + +struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc) +{ + struct decompress_io_ctx *dic; + pgoff_t start_idx = start_idx_of_cluster(cc); + struct f2fs_sb_info *sbi = F2FS_I_SB(cc->inode); + int i, ret; + + dic = f2fs_kmem_cache_alloc(dic_entry_slab, GFP_F2FS_ZERO, false, sbi); + if (!dic) + return ERR_PTR(-ENOMEM); + + dic->rpages = page_array_alloc(sbi, cc->cluster_size); + if (!dic->rpages) { + kmem_cache_free(dic_entry_slab, dic); + return ERR_PTR(-ENOMEM); + } + + dic->magic = F2FS_COMPRESSED_PAGE_MAGIC; + dic->inode = cc->inode; + dic->sbi = sbi; + dic->compress_algorithm = F2FS_I(cc->inode)->i_compress_algorithm; + atomic_set(&dic->remaining_pages, cc->nr_cpages); + dic->cluster_idx = cc->cluster_idx; + dic->cluster_size = cc->cluster_size; + dic->log_cluster_size = cc->log_cluster_size; + dic->nr_cpages = cc->nr_cpages; + refcount_set(&dic->refcnt, 1); + dic->failed = false; + dic->need_verity = f2fs_need_verity(cc->inode, start_idx); + + for (i = 0; i < dic->cluster_size; i++) + dic->rpages[i] = cc->rpages[i]; + dic->nr_rpages = cc->cluster_size; + + dic->cpages = page_array_alloc(sbi, dic->nr_cpages); + if (!dic->cpages) { + ret = -ENOMEM; + goto out_free; + } + + for (i = 0; i < dic->nr_cpages; i++) { + struct page *page; + + page = f2fs_compress_alloc_page(); + f2fs_set_compressed_page(page, cc->inode, + 
start_idx + i + 1, dic); + dic->cpages[i] = page; + } + + ret = f2fs_prepare_decomp_mem(dic, true); + if (ret) + goto out_free; + + return dic; + +out_free: + f2fs_free_dic(dic, true); + return ERR_PTR(ret); +} + +static void f2fs_free_dic(struct decompress_io_ctx *dic, + bool bypass_destroy_callback) +{ + int i; + /* use sbi in dic to avoid UFA of dic->inode*/ + struct f2fs_sb_info *sbi = dic->sbi; + + f2fs_release_decomp_mem(dic, bypass_destroy_callback, true); + + if (dic->tpages) { + for (i = 0; i < dic->cluster_size; i++) { + if (dic->rpages[i]) + continue; + if (!dic->tpages[i]) + continue; + f2fs_compress_free_page(dic->tpages[i]); + } + page_array_free(sbi, dic->tpages, dic->cluster_size); + } + + if (dic->cpages) { + for (i = 0; i < dic->nr_cpages; i++) { + if (!dic->cpages[i]) + continue; + f2fs_compress_free_page(dic->cpages[i]); + } + page_array_free(sbi, dic->cpages, dic->nr_cpages); + } + + page_array_free(sbi, dic->rpages, dic->nr_rpages); + kmem_cache_free(dic_entry_slab, dic); +} + +static void f2fs_late_free_dic(struct work_struct *work) +{ + struct decompress_io_ctx *dic = + container_of(work, struct decompress_io_ctx, free_work); + + f2fs_free_dic(dic, false); +} + +static void f2fs_put_dic(struct decompress_io_ctx *dic, bool in_task) +{ + if (refcount_dec_and_test(&dic->refcnt)) { + if (in_task) { + f2fs_free_dic(dic, false); + } else { + INIT_WORK(&dic->free_work, f2fs_late_free_dic); + queue_work(dic->sbi->post_read_wq, &dic->free_work); + } + } +} + +static void f2fs_verify_cluster(struct work_struct *work) +{ + struct decompress_io_ctx *dic = + container_of(work, struct decompress_io_ctx, verity_work); + int i; + + /* Verify, update, and unlock the decompressed pages. 
*/ + for (i = 0; i < dic->cluster_size; i++) { + struct page *rpage = dic->rpages[i]; + + if (!rpage) + continue; + + if (fsverity_verify_page(rpage)) + SetPageUptodate(rpage); + else + ClearPageUptodate(rpage); + unlock_page(rpage); + } + + f2fs_put_dic(dic, true); +} + +/* + * This is called when a compressed cluster has been decompressed + * (or failed to be read and/or decompressed). + */ +void f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed, + bool in_task) +{ + int i; + + if (!failed && dic->need_verity) { + /* + * Note that to avoid deadlocks, the verity work can't be done + * on the decompression workqueue. This is because verifying + * the data pages can involve reading metadata pages from the + * file, and these metadata pages may be compressed. + */ + INIT_WORK(&dic->verity_work, f2fs_verify_cluster); + fsverity_enqueue_verify_work(&dic->verity_work); + return; + } + + /* Update and unlock the cluster's pagecache pages. */ + for (i = 0; i < dic->cluster_size; i++) { + struct page *rpage = dic->rpages[i]; + + if (!rpage) + continue; + + if (failed) + ClearPageUptodate(rpage); + else + SetPageUptodate(rpage); + unlock_page(rpage); + } + + /* + * Release the reference to the decompress_io_ctx that was being held + * for I/O completion. + */ + f2fs_put_dic(dic, in_task); +} + +/* + * Put a reference to a compressed folio's decompress_io_ctx. + * + * This is called when the folio is no longer needed and can be freed. + */ +void f2fs_put_folio_dic(struct folio *folio, bool in_task) +{ + struct decompress_io_ctx *dic = folio->private; + + f2fs_put_dic(dic, in_task); +} + +/* + * check whether cluster blocks are contiguous, and add extent cache entry + * only if cluster blocks are logically and physically contiguous. + */ +unsigned int f2fs_cluster_blocks_are_contiguous(struct dnode_of_data *dn, + unsigned int ofs_in_node) +{ + bool compressed = data_blkaddr(dn->inode, dn->node_folio, + ofs_in_node) == COMPRESS_ADDR; + int i = compressed ? 
1 : 0; + block_t first_blkaddr = data_blkaddr(dn->inode, dn->node_folio, + ofs_in_node + i); + + for (i += 1; i < F2FS_I(dn->inode)->i_cluster_size; i++) { + block_t blkaddr = data_blkaddr(dn->inode, dn->node_folio, + ofs_in_node + i); + + if (!__is_valid_data_blkaddr(blkaddr)) + break; + if (first_blkaddr + i - (compressed ? 1 : 0) != blkaddr) + return 0; + } + + return compressed ? i - 1 : i; +} + +const struct address_space_operations f2fs_compress_aops = { + .release_folio = f2fs_release_folio, + .invalidate_folio = f2fs_invalidate_folio, + .migrate_folio = filemap_migrate_folio, +}; + +struct address_space *COMPRESS_MAPPING(struct f2fs_sb_info *sbi) +{ + return sbi->compress_inode->i_mapping; +} + +void f2fs_invalidate_compress_pages_range(struct f2fs_sb_info *sbi, + block_t blkaddr, unsigned int len) +{ + if (!sbi->compress_inode) + return; + invalidate_mapping_pages(COMPRESS_MAPPING(sbi), blkaddr, blkaddr + len - 1); +} + +static void f2fs_cache_compressed_page(struct f2fs_sb_info *sbi, + struct folio *folio, nid_t ino, block_t blkaddr) +{ + struct folio *cfolio; + int ret; + + if (!test_opt(sbi, COMPRESS_CACHE)) + return; + + if (!f2fs_is_valid_blkaddr(sbi, blkaddr, DATA_GENERIC_ENHANCE_READ)) + return; + + if (!f2fs_available_free_memory(sbi, COMPRESS_PAGE)) + return; + + cfolio = filemap_get_folio(COMPRESS_MAPPING(sbi), blkaddr); + if (!IS_ERR(cfolio)) { + f2fs_folio_put(cfolio, false); + return; + } + + cfolio = filemap_alloc_folio(__GFP_NOWARN | __GFP_IO, 0, NULL); + if (!cfolio) + return; + + ret = filemap_add_folio(COMPRESS_MAPPING(sbi), cfolio, + blkaddr, GFP_NOFS); + if (ret) { + f2fs_folio_put(cfolio, false); + return; + } + + folio_set_f2fs_data(cfolio, ino); + + memcpy(folio_address(cfolio), folio_address(folio), PAGE_SIZE); + folio_mark_uptodate(cfolio); + f2fs_folio_put(cfolio, true); +} + +bool f2fs_load_compressed_folio(struct f2fs_sb_info *sbi, struct folio *folio, + block_t blkaddr) +{ + struct folio *cfolio; + bool hitted = false; + + if 
(!test_opt(sbi, COMPRESS_CACHE)) + return false; + + cfolio = f2fs_filemap_get_folio(COMPRESS_MAPPING(sbi), + blkaddr, FGP_LOCK | FGP_NOWAIT, GFP_NOFS); + if (!IS_ERR(cfolio)) { + if (folio_test_uptodate(cfolio)) { + atomic_inc(&sbi->compress_page_hit); + memcpy(folio_address(folio), + folio_address(cfolio), folio_size(folio)); + hitted = true; + } + f2fs_folio_put(cfolio, true); + } + + return hitted; +} + +void f2fs_invalidate_compress_pages(struct f2fs_sb_info *sbi, nid_t ino) +{ + struct address_space *mapping = COMPRESS_MAPPING(sbi); + struct folio_batch fbatch; + pgoff_t index = 0; + pgoff_t end = MAX_BLKADDR(sbi); + + if (!mapping->nrpages) + return; + + folio_batch_init(&fbatch); + + do { + unsigned int nr, i; + + nr = filemap_get_folios(mapping, &index, end - 1, &fbatch); + if (!nr) + break; + + for (i = 0; i < nr; i++) { + struct folio *folio = fbatch.folios[i]; + + folio_lock(folio); + if (folio->mapping != mapping) { + folio_unlock(folio); + continue; + } + + if (ino != folio_get_f2fs_data(folio)) { + folio_unlock(folio); + continue; + } + + generic_error_remove_folio(mapping, folio); + folio_unlock(folio); + } + folio_batch_release(&fbatch); + cond_resched(); + } while (index < end); +} + +int f2fs_init_compress_inode(struct f2fs_sb_info *sbi) +{ + struct inode *inode; + + if (!test_opt(sbi, COMPRESS_CACHE)) + return 0; + + inode = f2fs_iget(sbi->sb, F2FS_COMPRESS_INO(sbi)); + if (IS_ERR(inode)) + return PTR_ERR(inode); + sbi->compress_inode = inode; + + sbi->compress_percent = COMPRESS_PERCENT; + sbi->compress_watermark = COMPRESS_WATERMARK; + + atomic_set(&sbi->compress_page_hit, 0); + + return 0; +} + +void f2fs_destroy_compress_inode(struct f2fs_sb_info *sbi) +{ + if (!sbi->compress_inode) + return; + iput(sbi->compress_inode); + sbi->compress_inode = NULL; +} + +int f2fs_init_page_array_cache(struct f2fs_sb_info *sbi) +{ + dev_t dev = sbi->sb->s_bdev->bd_dev; + char slab_name[35]; + + if (!f2fs_sb_has_compression(sbi)) + return 0; + + 
sprintf(slab_name, "f2fs_page_array_entry-%u:%u", MAJOR(dev), MINOR(dev)); + + sbi->page_array_slab_size = sizeof(struct page *) << + F2FS_OPTION(sbi).compress_log_size; + + sbi->page_array_slab = f2fs_kmem_cache_create(slab_name, + sbi->page_array_slab_size); + return sbi->page_array_slab ? 0 : -ENOMEM; +} + +void f2fs_destroy_page_array_cache(struct f2fs_sb_info *sbi) +{ + kmem_cache_destroy(sbi->page_array_slab); +} + +int __init f2fs_init_compress_cache(void) +{ + cic_entry_slab = f2fs_kmem_cache_create("f2fs_cic_entry", + sizeof(struct compress_io_ctx)); + if (!cic_entry_slab) + return -ENOMEM; + dic_entry_slab = f2fs_kmem_cache_create("f2fs_dic_entry", + sizeof(struct decompress_io_ctx)); + if (!dic_entry_slab) + goto free_cic; + return 0; +free_cic: + kmem_cache_destroy(cic_entry_slab); + return -ENOMEM; +} + +void f2fs_destroy_compress_cache(void) +{ + kmem_cache_destroy(dic_entry_slab); + kmem_cache_destroy(cic_entry_slab); +} diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 035f9a345cdf..c30e69392a62 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1,788 +1,4247 @@ +// SPDX-License-Identifier: GPL-2.0 /* * fs/f2fs/data.c * * Copyright (c) 2012 Samsung Electronics Co., Ltd. * http://www.samsung.com/ - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. 
*/ #include <linux/fs.h> #include <linux/f2fs_fs.h> -#include <linux/buffer_head.h> +#include <linux/sched/mm.h> #include <linux/mpage.h> -#include <linux/aio.h> #include <linux/writeback.h> -#include <linux/backing-dev.h> +#include <linux/pagevec.h> #include <linux/blkdev.h> #include <linux/bio.h> +#include <linux/blk-crypto.h> +#include <linux/swap.h> #include <linux/prefetch.h> +#include <linux/uio.h> +#include <linux/sched/signal.h> +#include <linux/fiemap.h> +#include <linux/iomap.h> #include "f2fs.h" #include "node.h" #include "segment.h" +#include "iostat.h" #include <trace/events/f2fs.h> +#define NUM_PREALLOC_POST_READ_CTXS 128 + +static struct kmem_cache *bio_post_read_ctx_cache; +static struct kmem_cache *bio_entry_slab; +static mempool_t *bio_post_read_ctx_pool; +static struct bio_set f2fs_bioset; + +#define F2FS_BIO_POOL_SIZE NR_CURSEG_TYPE + +int __init f2fs_init_bioset(void) +{ + return bioset_init(&f2fs_bioset, F2FS_BIO_POOL_SIZE, + 0, BIOSET_NEED_BVECS); +} + +void f2fs_destroy_bioset(void) +{ + bioset_exit(&f2fs_bioset); +} + +bool f2fs_is_cp_guaranteed(const struct folio *folio) +{ + struct address_space *mapping = folio->mapping; + struct inode *inode; + struct f2fs_sb_info *sbi; + + if (fscrypt_is_bounce_folio(folio)) + return folio_test_f2fs_gcing(fscrypt_pagecache_folio(folio)); + + inode = mapping->host; + sbi = F2FS_I_SB(inode); + + if (inode->i_ino == F2FS_META_INO(sbi) || + inode->i_ino == F2FS_NODE_INO(sbi) || + S_ISDIR(inode->i_mode)) + return true; + + if ((S_ISREG(inode->i_mode) && IS_NOQUOTA(inode)) || + folio_test_f2fs_gcing(folio)) + return true; + return false; +} + +static enum count_type __read_io_type(struct folio *folio) +{ + struct address_space *mapping = folio->mapping; + + if (mapping) { + struct inode *inode = mapping->host; + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + + if (inode->i_ino == F2FS_META_INO(sbi)) + return F2FS_RD_META; + + if (inode->i_ino == F2FS_NODE_INO(sbi)) + return F2FS_RD_NODE; + } + return 
F2FS_RD_DATA; +} + +/* postprocessing steps for read bios */ +enum bio_post_read_step { +#ifdef CONFIG_FS_ENCRYPTION + STEP_DECRYPT = BIT(0), +#else + STEP_DECRYPT = 0, /* compile out the decryption-related code */ +#endif +#ifdef CONFIG_F2FS_FS_COMPRESSION + STEP_DECOMPRESS = BIT(1), +#else + STEP_DECOMPRESS = 0, /* compile out the decompression-related code */ +#endif +#ifdef CONFIG_FS_VERITY + STEP_VERITY = BIT(2), +#else + STEP_VERITY = 0, /* compile out the verity-related code */ +#endif +}; + +struct bio_post_read_ctx { + struct bio *bio; + struct f2fs_sb_info *sbi; + struct work_struct work; + unsigned int enabled_steps; + /* + * decompression_attempted keeps track of whether + * f2fs_end_read_compressed_page() has been called on the pages in the + * bio that belong to a compressed cluster yet. + */ + bool decompression_attempted; + block_t fs_blkaddr; +}; + /* - * Lock ordering for the change of data block address: - * ->data_page - * ->node_page - * update block addresses in the node page + * Update and unlock a bio's pages, and free the bio. + * + * This marks pages up-to-date only if there was no error in the bio (I/O error, + * decryption error, or verity error), as indicated by bio->bi_status. + * + * "Compressed pages" (pagecache pages backed by a compressed cluster on-disk) + * aren't marked up-to-date here, as decompression is done on a per-compression- + * cluster basis rather than a per-bio basis. Instead, we only must do two + * things for each compressed page here: call f2fs_end_read_compressed_page() + * with failed=true if an error occurred before it would have normally gotten + * called (i.e., I/O error or decryption error, but *not* verity error), and + * release the bio's reference to the decompress_io_ctx of the page's cluster. 
*/ -static void __set_data_blkaddr(struct dnode_of_data *dn, block_t new_addr) +static void f2fs_finish_read_bio(struct bio *bio, bool in_task) { - struct f2fs_node *rn; - __le32 *addr_array; - struct page *node_page = dn->node_page; - unsigned int ofs_in_node = dn->ofs_in_node; + struct folio_iter fi; + struct bio_post_read_ctx *ctx = bio->bi_private; - wait_on_page_writeback(node_page); + bio_for_each_folio_all(fi, bio) { + struct folio *folio = fi.folio; - rn = (struct f2fs_node *)page_address(node_page); + if (f2fs_is_compressed_page(folio)) { + if (ctx && !ctx->decompression_attempted) + f2fs_end_read_compressed_page(folio, true, 0, + in_task); + f2fs_put_folio_dic(folio, in_task); + continue; + } - /* Get physical address of data block */ - addr_array = blkaddr_in_node(rn); - addr_array[ofs_in_node] = cpu_to_le32(new_addr); - set_page_dirty(node_page); + dec_page_count(F2FS_F_SB(folio), __read_io_type(folio)); + folio_end_read(folio, bio->bi_status == BLK_STS_OK); + } + + if (ctx) + mempool_free(ctx, bio_post_read_ctx_pool); + bio_put(bio); } -int reserve_new_block(struct dnode_of_data *dn) +static void f2fs_verify_bio(struct work_struct *work) { - struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb); + struct bio_post_read_ctx *ctx = + container_of(work, struct bio_post_read_ctx, work); + struct bio *bio = ctx->bio; + bool may_have_compressed_pages = (ctx->enabled_steps & STEP_DECOMPRESS); - if (is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC)) - return -EPERM; - if (!inc_valid_block_count(sbi, dn->inode, 1)) - return -ENOSPC; + /* + * fsverity_verify_bio() may call readahead() again, and while verity + * will be disabled for this, decryption and/or decompression may still + * be needed, resulting in another bio_post_read_ctx being allocated. + * So to prevent deadlocks we need to release the current ctx to the + * mempool first. This assumes that verity is the last post-read step. 
+ */ + mempool_free(ctx, bio_post_read_ctx_pool); + bio->bi_private = NULL; + + /* + * Verify the bio's pages with fs-verity. Exclude compressed pages, + * as those were handled separately by f2fs_end_read_compressed_page(). + */ + if (may_have_compressed_pages) { + struct folio_iter fi; - trace_f2fs_reserve_new_block(dn->inode, dn->nid, dn->ofs_in_node); + bio_for_each_folio_all(fi, bio) { + struct folio *folio = fi.folio; - __set_data_blkaddr(dn, NEW_ADDR); - dn->data_blkaddr = NEW_ADDR; - sync_inode_page(dn); - return 0; + if (!f2fs_is_compressed_page(folio) && + !fsverity_verify_page(&folio->page)) { + bio->bi_status = BLK_STS_IOERR; + break; + } + } + } else { + fsverity_verify_bio(bio); + } + + f2fs_finish_read_bio(bio, true); } -static int check_extent_cache(struct inode *inode, pgoff_t pgofs, - struct buffer_head *bh_result) +/* + * If the bio's data needs to be verified with fs-verity, then enqueue the + * verity work for the bio. Otherwise finish the bio now. + * + * Note that to avoid deadlocks, the verity work can't be done on the + * decryption/decompression workqueue. This is because verifying the data pages + * can involve reading verity metadata pages from the file, and these verity + * metadata pages may be encrypted and/or compressed. + */ +static void f2fs_verify_and_finish_bio(struct bio *bio, bool in_task) { - struct f2fs_inode_info *fi = F2FS_I(inode); -#ifdef CONFIG_F2FS_STAT_FS - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + struct bio_post_read_ctx *ctx = bio->bi_private; + + if (ctx && (ctx->enabled_steps & STEP_VERITY)) { + INIT_WORK(&ctx->work, f2fs_verify_bio); + fsverity_enqueue_verify_work(&ctx->work); + } else { + f2fs_finish_read_bio(bio, in_task); + } +} + +/* + * Handle STEP_DECOMPRESS by decompressing any compressed clusters whose last + * remaining page was read by @ctx->bio. + * + * Note that a bio may span clusters (even a mix of compressed and uncompressed + * clusters) or be for just part of a cluster. 
STEP_DECOMPRESS just indicates + * that the bio includes at least one compressed page. The actual decompression + * is done on a per-cluster basis, not a per-bio basis. + */ +static void f2fs_handle_step_decompress(struct bio_post_read_ctx *ctx, + bool in_task) +{ + struct folio_iter fi; + bool all_compressed = true; + block_t blkaddr = ctx->fs_blkaddr; + + bio_for_each_folio_all(fi, ctx->bio) { + struct folio *folio = fi.folio; + + if (f2fs_is_compressed_page(folio)) + f2fs_end_read_compressed_page(folio, false, blkaddr, + in_task); + else + all_compressed = false; + + blkaddr++; + } + + ctx->decompression_attempted = true; + + /* + * Optimization: if all the bio's pages are compressed, then scheduling + * the per-bio verity work is unnecessary, as verity will be fully + * handled at the compression cluster level. + */ + if (all_compressed) + ctx->enabled_steps &= ~STEP_VERITY; +} + +static void f2fs_post_read_work(struct work_struct *work) +{ + struct bio_post_read_ctx *ctx = + container_of(work, struct bio_post_read_ctx, work); + struct bio *bio = ctx->bio; + + if ((ctx->enabled_steps & STEP_DECRYPT) && !fscrypt_decrypt_bio(bio)) { + f2fs_finish_read_bio(bio, true); + return; + } + + if (ctx->enabled_steps & STEP_DECOMPRESS) + f2fs_handle_step_decompress(ctx, true); + + f2fs_verify_and_finish_bio(bio, true); +} + +static void f2fs_read_end_io(struct bio *bio) +{ + struct f2fs_sb_info *sbi = F2FS_F_SB(bio_first_folio_all(bio)); + struct bio_post_read_ctx *ctx; + bool intask = in_task() && !irqs_disabled(); + + iostat_update_and_unbind_ctx(bio); + ctx = bio->bi_private; + + if (time_to_inject(sbi, FAULT_READ_IO)) + bio->bi_status = BLK_STS_IOERR; + + if (bio->bi_status != BLK_STS_OK) { + f2fs_finish_read_bio(bio, intask); + return; + } + + if (ctx) { + unsigned int enabled_steps = ctx->enabled_steps & + (STEP_DECRYPT | STEP_DECOMPRESS); + + /* + * If we have only decompression step between decompression and + * decrypt, we don't need post processing for this. 
+ */ + if (enabled_steps == STEP_DECOMPRESS && + !f2fs_low_mem_mode(sbi)) { + f2fs_handle_step_decompress(ctx, intask); + } else if (enabled_steps) { + INIT_WORK(&ctx->work, f2fs_post_read_work); + queue_work(ctx->sbi->post_read_wq, &ctx->work); + return; + } + } + + f2fs_verify_and_finish_bio(bio, intask); +} + +static void f2fs_write_end_io(struct bio *bio) +{ + struct f2fs_sb_info *sbi; + struct folio_iter fi; + + iostat_update_and_unbind_ctx(bio); + sbi = bio->bi_private; + + if (time_to_inject(sbi, FAULT_WRITE_IO)) + bio->bi_status = BLK_STS_IOERR; + + bio_for_each_folio_all(fi, bio) { + struct folio *folio = fi.folio; + enum count_type type; + + if (fscrypt_is_bounce_folio(folio)) { + struct folio *io_folio = folio; + + folio = fscrypt_pagecache_folio(io_folio); + fscrypt_free_bounce_page(&io_folio->page); + } + +#ifdef CONFIG_F2FS_FS_COMPRESSION + if (f2fs_is_compressed_page(folio)) { + f2fs_compress_write_end_io(bio, folio); + continue; + } #endif - pgoff_t start_fofs, end_fofs; - block_t start_blkaddr; - read_lock(&fi->ext.ext_lock); - if (fi->ext.len == 0) { - read_unlock(&fi->ext.ext_lock); - return 0; + type = WB_DATA_TYPE(folio, false); + + if (unlikely(bio->bi_status != BLK_STS_OK)) { + mapping_set_error(folio->mapping, -EIO); + if (type == F2FS_WB_CP_DATA) + f2fs_stop_checkpoint(sbi, true, + STOP_CP_REASON_WRITE_FAIL); + } + + f2fs_bug_on(sbi, is_node_folio(folio) && + folio->index != nid_of_node(folio)); + + dec_page_count(sbi, type); + if (f2fs_in_warm_node_list(sbi, folio)) + f2fs_del_fsync_node_entry(sbi, folio); + folio_clear_f2fs_gcing(folio); + folio_end_writeback(folio); } + if (!get_pages(sbi, F2FS_WB_CP_DATA) && + wq_has_sleeper(&sbi->cp_wait)) + wake_up(&sbi->cp_wait); + + bio_put(bio); +} + +#ifdef CONFIG_BLK_DEV_ZONED +static void f2fs_zone_write_end_io(struct bio *bio) +{ + struct f2fs_bio_info *io = (struct f2fs_bio_info *)bio->bi_private; -#ifdef CONFIG_F2FS_STAT_FS - sbi->total_hit_ext++; + bio->bi_private = io->bi_private; + 
complete(&io->zone_wait); + f2fs_write_end_io(bio); +} #endif - start_fofs = fi->ext.fofs; - end_fofs = fi->ext.fofs + fi->ext.len - 1; - start_blkaddr = fi->ext.blk_addr; - - if (pgofs >= start_fofs && pgofs <= end_fofs) { - unsigned int blkbits = inode->i_sb->s_blocksize_bits; - size_t count; - - clear_buffer_new(bh_result); - map_bh(bh_result, inode->i_sb, - start_blkaddr + pgofs - start_fofs); - count = end_fofs - pgofs + 1; - if (count < (UINT_MAX >> blkbits)) - bh_result->b_size = (count << blkbits); - else - bh_result->b_size = UINT_MAX; -#ifdef CONFIG_F2FS_STAT_FS - sbi->read_hit_ext++; +struct block_device *f2fs_target_device(struct f2fs_sb_info *sbi, + block_t blk_addr, sector_t *sector) +{ + struct block_device *bdev = sbi->sb->s_bdev; + int i; + + if (f2fs_is_multi_device(sbi)) { + for (i = 0; i < sbi->s_ndevs; i++) { + if (FDEV(i).start_blk <= blk_addr && + FDEV(i).end_blk >= blk_addr) { + blk_addr -= FDEV(i).start_blk; + bdev = FDEV(i).bdev; + break; + } + } + } + + if (sector) + *sector = SECTOR_FROM_BLOCK(blk_addr); + return bdev; +} + +int f2fs_target_device_index(struct f2fs_sb_info *sbi, block_t blkaddr) +{ + int i; + + if (!f2fs_is_multi_device(sbi)) + return 0; + + for (i = 0; i < sbi->s_ndevs; i++) + if (FDEV(i).start_blk <= blkaddr && FDEV(i).end_blk >= blkaddr) + return i; + return 0; +} + +static blk_opf_t f2fs_io_flags(struct f2fs_io_info *fio) +{ + unsigned int temp_mask = GENMASK(NR_TEMP_TYPE - 1, 0); + unsigned int fua_flag, meta_flag, io_flag; + blk_opf_t op_flags = 0; + + if (fio->op != REQ_OP_WRITE) + return 0; + if (fio->type == DATA) + io_flag = fio->sbi->data_io_flag; + else if (fio->type == NODE) + io_flag = fio->sbi->node_io_flag; + else + return 0; + + fua_flag = io_flag & temp_mask; + meta_flag = (io_flag >> NR_TEMP_TYPE) & temp_mask; + + /* + * data/node io flag bits per temp: + * REQ_META | REQ_FUA | + * 5 | 4 | 3 | 2 | 1 | 0 | + * Cold | Warm | Hot | Cold | Warm | Hot | + */ + if (BIT(fio->temp) & meta_flag) + op_flags |= 
REQ_META; + if (BIT(fio->temp) & fua_flag) + op_flags |= REQ_FUA; + + if (fio->type == DATA && + F2FS_I(fio->folio->mapping->host)->ioprio_hint == F2FS_IOPRIO_WRITE) + op_flags |= REQ_PRIO; + + return op_flags; +} + +static struct bio *__bio_alloc(struct f2fs_io_info *fio, int npages) +{ + struct f2fs_sb_info *sbi = fio->sbi; + struct block_device *bdev; + sector_t sector; + struct bio *bio; + + bdev = f2fs_target_device(sbi, fio->new_blkaddr, &sector); + bio = bio_alloc_bioset(bdev, npages, + fio->op | fio->op_flags | f2fs_io_flags(fio), + GFP_NOIO, &f2fs_bioset); + bio->bi_iter.bi_sector = sector; + if (is_read_io(fio->op)) { + bio->bi_end_io = f2fs_read_end_io; + bio->bi_private = NULL; + } else { + bio->bi_end_io = f2fs_write_end_io; + bio->bi_private = sbi; + bio->bi_write_hint = f2fs_io_type_to_rw_hint(sbi, + fio->type, fio->temp); + } + iostat_alloc_and_bind_ctx(sbi, bio, NULL); + + if (fio->io_wbc) + wbc_init_bio(fio->io_wbc, bio); + + return bio; +} + +static void f2fs_set_bio_crypt_ctx(struct bio *bio, const struct inode *inode, + pgoff_t first_idx, + const struct f2fs_io_info *fio, + gfp_t gfp_mask) +{ + /* + * The f2fs garbage collector sets ->encrypted_page when it wants to + * read/write raw data without encryption. + */ + if (!fio || !fio->encrypted_page) + fscrypt_set_bio_crypt_ctx(bio, inode, first_idx, gfp_mask); +} + +static bool f2fs_crypt_mergeable_bio(struct bio *bio, const struct inode *inode, + pgoff_t next_idx, + const struct f2fs_io_info *fio) +{ + /* + * The f2fs garbage collector sets ->encrypted_page when it wants to + * read/write raw data without encryption. 
+ */ + if (fio && fio->encrypted_page) + return !bio_has_crypt_ctx(bio); + + return fscrypt_mergeable_bio(bio, inode, next_idx); +} + +void f2fs_submit_read_bio(struct f2fs_sb_info *sbi, struct bio *bio, + enum page_type type) +{ + WARN_ON_ONCE(!is_read_io(bio_op(bio))); + trace_f2fs_submit_read_bio(sbi->sb, type, bio); + + iostat_update_submit_ctx(bio, type); + submit_bio(bio); +} + +static void f2fs_submit_write_bio(struct f2fs_sb_info *sbi, struct bio *bio, + enum page_type type) +{ + WARN_ON_ONCE(is_read_io(bio_op(bio))); + trace_f2fs_submit_write_bio(sbi->sb, type, bio); + iostat_update_submit_ctx(bio, type); + submit_bio(bio); +} + +static void __submit_merged_bio(struct f2fs_bio_info *io) +{ + struct f2fs_io_info *fio = &io->fio; + + if (!io->bio) + return; + + if (is_read_io(fio->op)) { + trace_f2fs_prepare_read_bio(io->sbi->sb, fio->type, io->bio); + f2fs_submit_read_bio(io->sbi, io->bio, fio->type); + } else { + trace_f2fs_prepare_write_bio(io->sbi->sb, fio->type, io->bio); + f2fs_submit_write_bio(io->sbi, io->bio, fio->type); + } + io->bio = NULL; +} + +static bool __has_merged_page(struct bio *bio, struct inode *inode, + struct folio *folio, nid_t ino) +{ + struct folio_iter fi; + + if (!bio) + return false; + + if (!inode && !folio && !ino) + return true; + + bio_for_each_folio_all(fi, bio) { + struct folio *target = fi.folio; + + if (fscrypt_is_bounce_folio(target)) { + target = fscrypt_pagecache_folio(target); + if (IS_ERR(target)) + continue; + } + if (f2fs_is_compressed_page(target)) { + target = f2fs_compress_control_folio(target); + if (IS_ERR(target)) + continue; + } + + if (inode && inode == target->mapping->host) + return true; + if (folio && folio == target) + return true; + if (ino && ino == ino_of_node(target)) + return true; + } + + return false; +} + +int f2fs_init_write_merge_io(struct f2fs_sb_info *sbi) +{ + int i; + + for (i = 0; i < NR_PAGE_TYPE; i++) { + int n = (i == META) ? 
1 : NR_TEMP_TYPE; + int j; + + sbi->write_io[i] = f2fs_kmalloc(sbi, + array_size(n, sizeof(struct f2fs_bio_info)), + GFP_KERNEL); + if (!sbi->write_io[i]) + return -ENOMEM; + + for (j = HOT; j < n; j++) { + struct f2fs_bio_info *io = &sbi->write_io[i][j]; + + init_f2fs_rwsem(&io->io_rwsem); + io->sbi = sbi; + io->bio = NULL; + io->last_block_in_bio = 0; + spin_lock_init(&io->io_lock); + INIT_LIST_HEAD(&io->io_list); + INIT_LIST_HEAD(&io->bio_list); + init_f2fs_rwsem(&io->bio_list_lock); +#ifdef CONFIG_BLK_DEV_ZONED + init_completion(&io->zone_wait); + io->zone_pending_bio = NULL; + io->bi_private = NULL; #endif - read_unlock(&fi->ext.ext_lock); - return 1; + } } - read_unlock(&fi->ext.ext_lock); + return 0; } -void update_extent_cache(block_t blk_addr, struct dnode_of_data *dn) +static void __f2fs_submit_merged_write(struct f2fs_sb_info *sbi, + enum page_type type, enum temp_type temp) { - struct f2fs_inode_info *fi = F2FS_I(dn->inode); - pgoff_t fofs, start_fofs, end_fofs; - block_t start_blkaddr, end_blkaddr; + enum page_type btype = PAGE_TYPE_OF_BIO(type); + struct f2fs_bio_info *io = sbi->write_io[btype] + temp; - BUG_ON(blk_addr == NEW_ADDR); - fofs = start_bidx_of_node(ofs_of_node(dn->node_page)) + dn->ofs_in_node; + f2fs_down_write(&io->io_rwsem); - /* Update the page address in the parent node */ - __set_data_blkaddr(dn, blk_addr); + if (!io->bio) + goto unlock_out; - write_lock(&fi->ext.ext_lock); + /* change META to META_FLUSH in the checkpoint procedure */ + if (type >= META_FLUSH) { + io->fio.type = META_FLUSH; + io->bio->bi_opf |= REQ_META | REQ_PRIO | REQ_SYNC; + if (!test_opt(sbi, NOBARRIER)) + io->bio->bi_opf |= REQ_PREFLUSH | REQ_FUA; + } + __submit_merged_bio(io); +unlock_out: + f2fs_up_write(&io->io_rwsem); +} - start_fofs = fi->ext.fofs; - end_fofs = fi->ext.fofs + fi->ext.len - 1; - start_blkaddr = fi->ext.blk_addr; - end_blkaddr = fi->ext.blk_addr + fi->ext.len - 1; +static void __submit_merged_write_cond(struct f2fs_sb_info *sbi, + struct 
inode *inode, struct folio *folio, + nid_t ino, enum page_type type, bool force) +{ + enum temp_type temp; + bool ret = true; - /* Drop and initialize the matched extent */ - if (fi->ext.len == 1 && fofs == start_fofs) - fi->ext.len = 0; + for (temp = HOT; temp < NR_TEMP_TYPE; temp++) { + if (!force) { + enum page_type btype = PAGE_TYPE_OF_BIO(type); + struct f2fs_bio_info *io = sbi->write_io[btype] + temp; - /* Initial extent */ - if (fi->ext.len == 0) { - if (blk_addr != NULL_ADDR) { - fi->ext.fofs = fofs; - fi->ext.blk_addr = blk_addr; - fi->ext.len = 1; + f2fs_down_read(&io->io_rwsem); + ret = __has_merged_page(io->bio, inode, folio, ino); + f2fs_up_read(&io->io_rwsem); } - goto end_update; + if (ret) + __f2fs_submit_merged_write(sbi, type, temp); + + /* TODO: use HOT temp only for meta pages now. */ + if (type >= META) + break; } +} + +void f2fs_submit_merged_write(struct f2fs_sb_info *sbi, enum page_type type) +{ + __submit_merged_write_cond(sbi, NULL, NULL, 0, type, true); +} - /* Front merge */ - if (fofs == start_fofs - 1 && blk_addr == start_blkaddr - 1) { - fi->ext.fofs--; - fi->ext.blk_addr--; - fi->ext.len++; - goto end_update; +void f2fs_submit_merged_write_cond(struct f2fs_sb_info *sbi, + struct inode *inode, struct folio *folio, + nid_t ino, enum page_type type) +{ + __submit_merged_write_cond(sbi, inode, folio, ino, type, false); +} + +void f2fs_flush_merged_writes(struct f2fs_sb_info *sbi) +{ + f2fs_submit_merged_write(sbi, DATA); + f2fs_submit_merged_write(sbi, NODE); + f2fs_submit_merged_write(sbi, META); +} + +/* + * Fill the locked page with data located in the block address. + * A caller needs to unlock the page on failure. + */ +int f2fs_submit_page_bio(struct f2fs_io_info *fio) +{ + struct bio *bio; + struct folio *fio_folio = fio->folio; + struct folio *data_folio = fio->encrypted_page ? + page_folio(fio->encrypted_page) : fio_folio; + + if (!f2fs_is_valid_blkaddr(fio->sbi, fio->new_blkaddr, + fio->is_por ? META_POR : (__is_meta_io(fio) ? 
+ META_GENERIC : DATA_GENERIC_ENHANCE))) + return -EFSCORRUPTED; + + trace_f2fs_submit_folio_bio(data_folio, fio); + + /* Allocate a new bio */ + bio = __bio_alloc(fio, 1); + + f2fs_set_bio_crypt_ctx(bio, fio_folio->mapping->host, + fio_folio->index, fio, GFP_NOIO); + bio_add_folio_nofail(bio, data_folio, folio_size(data_folio), 0); + + if (fio->io_wbc && !is_read_io(fio->op)) + wbc_account_cgroup_owner(fio->io_wbc, fio_folio, PAGE_SIZE); + + inc_page_count(fio->sbi, is_read_io(fio->op) ? + __read_io_type(data_folio) : WB_DATA_TYPE(fio->folio, false)); + + if (is_read_io(bio_op(bio))) + f2fs_submit_read_bio(fio->sbi, bio, fio->type); + else + f2fs_submit_write_bio(fio->sbi, bio, fio->type); + return 0; +} + +static bool page_is_mergeable(struct f2fs_sb_info *sbi, struct bio *bio, + block_t last_blkaddr, block_t cur_blkaddr) +{ + if (unlikely(sbi->max_io_bytes && + bio->bi_iter.bi_size >= sbi->max_io_bytes)) + return false; + if (last_blkaddr + 1 != cur_blkaddr) + return false; + return bio->bi_bdev == f2fs_target_device(sbi, cur_blkaddr, NULL); +} + +static bool io_type_is_mergeable(struct f2fs_bio_info *io, + struct f2fs_io_info *fio) +{ + blk_opf_t mask = ~(REQ_PREFLUSH | REQ_FUA); + + if (io->fio.op != fio->op) + return false; + return (io->fio.op_flags & mask) == (fio->op_flags & mask); +} + +static bool io_is_mergeable(struct f2fs_sb_info *sbi, struct bio *bio, + struct f2fs_bio_info *io, + struct f2fs_io_info *fio, + block_t last_blkaddr, + block_t cur_blkaddr) +{ + if (!page_is_mergeable(sbi, bio, last_blkaddr, cur_blkaddr)) + return false; + return io_type_is_mergeable(io, fio); +} + +static void add_bio_entry(struct f2fs_sb_info *sbi, struct bio *bio, + struct folio *folio, enum temp_type temp) +{ + struct f2fs_bio_info *io = sbi->write_io[DATA] + temp; + struct bio_entry *be; + + be = f2fs_kmem_cache_alloc(bio_entry_slab, GFP_NOFS, true, NULL); + be->bio = bio; + bio_get(bio); + + bio_add_folio_nofail(bio, folio, folio_size(folio), 0); + + 
f2fs_down_write(&io->bio_list_lock); + list_add_tail(&be->list, &io->bio_list); + f2fs_up_write(&io->bio_list_lock); +} + +static void del_bio_entry(struct bio_entry *be) +{ + list_del(&be->list); + kmem_cache_free(bio_entry_slab, be); +} + +static int add_ipu_page(struct f2fs_io_info *fio, struct bio **bio, + struct folio *folio) +{ + struct folio *fio_folio = fio->folio; + struct f2fs_sb_info *sbi = fio->sbi; + enum temp_type temp; + bool found = false; + int ret = -EAGAIN; + + for (temp = HOT; temp < NR_TEMP_TYPE && !found; temp++) { + struct f2fs_bio_info *io = sbi->write_io[DATA] + temp; + struct list_head *head = &io->bio_list; + struct bio_entry *be; + + f2fs_down_write(&io->bio_list_lock); + list_for_each_entry(be, head, list) { + if (be->bio != *bio) + continue; + + found = true; + + f2fs_bug_on(sbi, !page_is_mergeable(sbi, *bio, + *fio->last_block, + fio->new_blkaddr)); + if (f2fs_crypt_mergeable_bio(*bio, + fio_folio->mapping->host, + fio_folio->index, fio) && + bio_add_folio(*bio, folio, folio_size(folio), 0)) { + ret = 0; + break; + } + + /* page can't be merged into bio; submit the bio */ + del_bio_entry(be); + f2fs_submit_write_bio(sbi, *bio, DATA); + break; + } + f2fs_up_write(&io->bio_list_lock); } - /* Back merge */ - if (fofs == end_fofs + 1 && blk_addr == end_blkaddr + 1) { - fi->ext.len++; - goto end_update; + if (ret) { + bio_put(*bio); + *bio = NULL; } - /* Split the existing extent */ - if (fi->ext.len > 1 && - fofs >= start_fofs && fofs <= end_fofs) { - if ((end_fofs - fofs) < (fi->ext.len >> 1)) { - fi->ext.len = fofs - start_fofs; - } else { - fi->ext.fofs = fofs + 1; - fi->ext.blk_addr = start_blkaddr + - fofs - start_fofs + 1; - fi->ext.len -= fofs - start_fofs + 1; + return ret; +} + +void f2fs_submit_merged_ipu_write(struct f2fs_sb_info *sbi, + struct bio **bio, struct folio *folio) +{ + enum temp_type temp; + bool found = false; + struct bio *target = bio ? 
*bio : NULL; + + f2fs_bug_on(sbi, !target && !folio); + + for (temp = HOT; temp < NR_TEMP_TYPE && !found; temp++) { + struct f2fs_bio_info *io = sbi->write_io[DATA] + temp; + struct list_head *head = &io->bio_list; + struct bio_entry *be; + + if (list_empty(head)) + continue; + + f2fs_down_read(&io->bio_list_lock); + list_for_each_entry(be, head, list) { + if (target) + found = (target == be->bio); + else + found = __has_merged_page(be->bio, NULL, + folio, 0); + if (found) + break; } - goto end_update; + f2fs_up_read(&io->bio_list_lock); + + if (!found) + continue; + + found = false; + + f2fs_down_write(&io->bio_list_lock); + list_for_each_entry(be, head, list) { + if (target) + found = (target == be->bio); + else + found = __has_merged_page(be->bio, NULL, + folio, 0); + if (found) { + target = be->bio; + del_bio_entry(be); + break; + } + } + f2fs_up_write(&io->bio_list_lock); } - write_unlock(&fi->ext.ext_lock); - return; -end_update: - write_unlock(&fi->ext.ext_lock); - sync_inode_page(dn); - return; + if (found) + f2fs_submit_write_bio(sbi, target, DATA); + if (bio && *bio) { + bio_put(*bio); + *bio = NULL; + } } -struct page *find_data_page(struct inode *inode, pgoff_t index, bool sync) +int f2fs_merge_page_bio(struct f2fs_io_info *fio) { - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); - struct address_space *mapping = inode->i_mapping; - struct dnode_of_data dn; - struct page *page; - int err; + struct bio *bio = *fio->bio; + struct folio *data_folio = fio->encrypted_page ? + page_folio(fio->encrypted_page) : fio->folio; + struct folio *folio = fio->folio; - page = find_get_page(mapping, index); - if (page && PageUptodate(page)) - return page; - f2fs_put_page(page, 0); + if (!f2fs_is_valid_blkaddr(fio->sbi, fio->new_blkaddr, + __is_meta_io(fio) ? 
META_GENERIC : DATA_GENERIC)) + return -EFSCORRUPTED; - set_new_dnode(&dn, inode, NULL, NULL, 0); - err = get_dnode_of_data(&dn, index, LOOKUP_NODE); - if (err) - return ERR_PTR(err); - f2fs_put_dnode(&dn); + trace_f2fs_submit_folio_bio(data_folio, fio); - if (dn.data_blkaddr == NULL_ADDR) - return ERR_PTR(-ENOENT); + if (bio && !page_is_mergeable(fio->sbi, bio, *fio->last_block, + fio->new_blkaddr)) + f2fs_submit_merged_ipu_write(fio->sbi, &bio, NULL); +alloc_new: + if (!bio) { + bio = __bio_alloc(fio, BIO_MAX_VECS); + f2fs_set_bio_crypt_ctx(bio, folio->mapping->host, + folio->index, fio, GFP_NOIO); - /* By fallocate(), there is no cached page, but with NEW_ADDR */ - if (dn.data_blkaddr == NEW_ADDR) - return ERR_PTR(-EINVAL); + add_bio_entry(fio->sbi, bio, data_folio, fio->temp); + } else { + if (add_ipu_page(fio, &bio, data_folio)) + goto alloc_new; + } - page = grab_cache_page_write_begin(mapping, index, AOP_FLAG_NOFS); - if (!page) - return ERR_PTR(-ENOMEM); + if (fio->io_wbc) + wbc_account_cgroup_owner(fio->io_wbc, folio, folio_size(folio)); + + inc_page_count(fio->sbi, WB_DATA_TYPE(folio, false)); + + *fio->last_block = fio->new_blkaddr; + *fio->bio = bio; + + return 0; +} + +#ifdef CONFIG_BLK_DEV_ZONED +static bool is_end_zone_blkaddr(struct f2fs_sb_info *sbi, block_t blkaddr) +{ + struct block_device *bdev = sbi->sb->s_bdev; + int devi = 0; - if (PageUptodate(page)) { - unlock_page(page); - return page; + if (f2fs_is_multi_device(sbi)) { + devi = f2fs_target_device_index(sbi, blkaddr); + if (blkaddr < FDEV(devi).start_blk || + blkaddr > FDEV(devi).end_blk) { + f2fs_err(sbi, "Invalid block %x", blkaddr); + return false; + } + blkaddr -= FDEV(devi).start_blk; + bdev = FDEV(devi).bdev; } + return bdev_is_zoned(bdev) && + f2fs_blkz_is_seq(sbi, devi, blkaddr) && + (blkaddr % sbi->blocks_per_blkz == sbi->blocks_per_blkz - 1); +} +#endif - err = f2fs_readpage(sbi, page, dn.data_blkaddr, - sync ? 
READ_SYNC : READA); - if (sync) { - wait_on_page_locked(page); - if (!PageUptodate(page)) { - f2fs_put_page(page, 0); - return ERR_PTR(-EIO); +void f2fs_submit_page_write(struct f2fs_io_info *fio) +{ + struct f2fs_sb_info *sbi = fio->sbi; + enum page_type btype = PAGE_TYPE_OF_BIO(fio->type); + struct f2fs_bio_info *io = sbi->write_io[btype] + fio->temp; + struct folio *bio_folio; + enum count_type type; + + f2fs_bug_on(sbi, is_read_io(fio->op)); + + f2fs_down_write(&io->io_rwsem); +next: +#ifdef CONFIG_BLK_DEV_ZONED + if (f2fs_sb_has_blkzoned(sbi) && btype < META && io->zone_pending_bio) { + wait_for_completion_io(&io->zone_wait); + bio_put(io->zone_pending_bio); + io->zone_pending_bio = NULL; + io->bi_private = NULL; + } +#endif + + if (fio->in_list) { + spin_lock(&io->io_lock); + if (list_empty(&io->io_list)) { + spin_unlock(&io->io_lock); + goto out; } + fio = list_first_entry(&io->io_list, + struct f2fs_io_info, list); + list_del(&fio->list); + spin_unlock(&io->io_lock); + } + + verify_fio_blkaddr(fio); + + if (fio->encrypted_page) + bio_folio = page_folio(fio->encrypted_page); + else if (fio->compressed_page) + bio_folio = page_folio(fio->compressed_page); + else + bio_folio = fio->folio; + + /* set submitted = true as a return value */ + fio->submitted = 1; + + type = WB_DATA_TYPE(bio_folio, fio->compressed_page); + inc_page_count(sbi, type); + + if (io->bio && + (!io_is_mergeable(sbi, io->bio, io, fio, io->last_block_in_bio, + fio->new_blkaddr) || + !f2fs_crypt_mergeable_bio(io->bio, fio_inode(fio), + bio_folio->index, fio))) + __submit_merged_bio(io); +alloc_new: + if (io->bio == NULL) { + io->bio = __bio_alloc(fio, BIO_MAX_VECS); + f2fs_set_bio_crypt_ctx(io->bio, fio_inode(fio), + bio_folio->index, fio, GFP_NOIO); + io->fio = *fio; + } + + if (!bio_add_folio(io->bio, bio_folio, folio_size(bio_folio), 0)) { + __submit_merged_bio(io); + goto alloc_new; } - return page; + + if (fio->io_wbc) + wbc_account_cgroup_owner(fio->io_wbc, fio->folio, + 
folio_size(fio->folio)); + + io->last_block_in_bio = fio->new_blkaddr; + + trace_f2fs_submit_folio_write(fio->folio, fio); +#ifdef CONFIG_BLK_DEV_ZONED + if (f2fs_sb_has_blkzoned(sbi) && btype < META && + is_end_zone_blkaddr(sbi, fio->new_blkaddr)) { + bio_get(io->bio); + reinit_completion(&io->zone_wait); + io->bi_private = io->bio->bi_private; + io->bio->bi_private = io; + io->bio->bi_end_io = f2fs_zone_write_end_io; + io->zone_pending_bio = io->bio; + __submit_merged_bio(io); + } +#endif + if (fio->in_list) + goto next; +out: + if (is_sbi_flag_set(sbi, SBI_IS_SHUTDOWN) || + !f2fs_is_checkpoint_ready(sbi)) + __submit_merged_bio(io); + f2fs_up_write(&io->io_rwsem); +} + +static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr, + unsigned nr_pages, blk_opf_t op_flag, + pgoff_t first_idx, bool for_write) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct bio *bio; + struct bio_post_read_ctx *ctx = NULL; + unsigned int post_read_steps = 0; + sector_t sector; + struct block_device *bdev = f2fs_target_device(sbi, blkaddr, §or); + + bio = bio_alloc_bioset(bdev, bio_max_segs(nr_pages), + REQ_OP_READ | op_flag, + for_write ? GFP_NOIO : GFP_KERNEL, &f2fs_bioset); + bio->bi_iter.bi_sector = sector; + f2fs_set_bio_crypt_ctx(bio, inode, first_idx, NULL, GFP_NOFS); + bio->bi_end_io = f2fs_read_end_io; + + if (fscrypt_inode_uses_fs_layer_crypto(inode)) + post_read_steps |= STEP_DECRYPT; + + if (f2fs_need_verity(inode, first_idx)) + post_read_steps |= STEP_VERITY; + + /* + * STEP_DECOMPRESS is handled specially, since a compressed file might + * contain both compressed and uncompressed clusters. We'll allocate a + * bio_post_read_ctx if the file is compressed, but the caller is + * responsible for enabling STEP_DECOMPRESS if it's actually needed. + */ + + if (post_read_steps || f2fs_compressed_file(inode)) { + /* Due to the mempool, this never fails. 
*/ + ctx = mempool_alloc(bio_post_read_ctx_pool, GFP_NOFS); + ctx->bio = bio; + ctx->sbi = sbi; + ctx->enabled_steps = post_read_steps; + ctx->fs_blkaddr = blkaddr; + ctx->decompression_attempted = false; + bio->bi_private = ctx; + } + iostat_alloc_and_bind_ctx(sbi, bio, ctx); + + return bio; +} + +/* This can handle encryption stuffs */ +static void f2fs_submit_page_read(struct inode *inode, struct folio *folio, + block_t blkaddr, blk_opf_t op_flags, + bool for_write) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct bio *bio; + + bio = f2fs_grab_read_bio(inode, blkaddr, 1, op_flags, + folio->index, for_write); + + /* wait for GCed page writeback via META_MAPPING */ + f2fs_wait_on_block_writeback(inode, blkaddr); + + if (!bio_add_folio(bio, folio, PAGE_SIZE, 0)) + f2fs_bug_on(sbi, 1); + + inc_page_count(sbi, F2FS_RD_DATA); + f2fs_update_iostat(sbi, NULL, FS_DATA_READ_IO, F2FS_BLKSIZE); + f2fs_submit_read_bio(sbi, bio, DATA); +} + +static void __set_data_blkaddr(struct dnode_of_data *dn, block_t blkaddr) +{ + __le32 *addr = get_dnode_addr(dn->inode, dn->node_folio); + + dn->data_blkaddr = blkaddr; + addr[dn->ofs_in_node] = cpu_to_le32(dn->data_blkaddr); } /* - * If it tries to access a hole, return an error. - * Because, the callers, functions in dir.c and GC, should be able to know - * whether this page exists or not. 
+ * Lock ordering for the change of data block address: + * ->data_page + * ->node_folio + * update block addresses in the node page */ -struct page *get_lock_data_page(struct inode *inode, pgoff_t index) +void f2fs_set_data_blkaddr(struct dnode_of_data *dn, block_t blkaddr) +{ + f2fs_folio_wait_writeback(dn->node_folio, NODE, true, true); + __set_data_blkaddr(dn, blkaddr); + if (folio_mark_dirty(dn->node_folio)) + dn->node_changed = true; +} + +void f2fs_update_data_blkaddr(struct dnode_of_data *dn, block_t blkaddr) +{ + f2fs_set_data_blkaddr(dn, blkaddr); + f2fs_update_read_extent_cache(dn); +} + +/* dn->ofs_in_node will be returned with up-to-date last block pointer */ +int f2fs_reserve_new_blocks(struct dnode_of_data *dn, blkcnt_t count) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); + int err; + + if (!count) + return 0; + + if (unlikely(is_inode_flag_set(dn->inode, FI_NO_ALLOC))) + return -EPERM; + err = inc_valid_block_count(sbi, dn->inode, &count, true); + if (unlikely(err)) + return err; + + trace_f2fs_reserve_new_blocks(dn->inode, dn->nid, + dn->ofs_in_node, count); + + f2fs_folio_wait_writeback(dn->node_folio, NODE, true, true); + + for (; count > 0; dn->ofs_in_node++) { + block_t blkaddr = f2fs_data_blkaddr(dn); + + if (blkaddr == NULL_ADDR) { + __set_data_blkaddr(dn, NEW_ADDR); + count--; + } + } + + if (folio_mark_dirty(dn->node_folio)) + dn->node_changed = true; + return 0; +} + +/* Should keep dn->ofs_in_node unchanged */ +int f2fs_reserve_new_block(struct dnode_of_data *dn) +{ + unsigned int ofs_in_node = dn->ofs_in_node; + int ret; + + ret = f2fs_reserve_new_blocks(dn, 1); + dn->ofs_in_node = ofs_in_node; + return ret; +} + +int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index) +{ + bool need_put = dn->inode_folio ? 
false : true; + int err; + + err = f2fs_get_dnode_of_data(dn, index, ALLOC_NODE); + if (err) + return err; + + if (dn->data_blkaddr == NULL_ADDR) + err = f2fs_reserve_new_block(dn); + if (err || need_put) + f2fs_put_dnode(dn); + return err; +} + +struct folio *f2fs_get_read_data_folio(struct inode *inode, pgoff_t index, + blk_opf_t op_flags, bool for_write, pgoff_t *next_pgofs) { - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); struct address_space *mapping = inode->i_mapping; struct dnode_of_data dn; - struct page *page; + struct folio *folio; int err; -repeat: - page = grab_cache_page_write_begin(mapping, index, AOP_FLAG_NOFS); - if (!page) - return ERR_PTR(-ENOMEM); + folio = f2fs_grab_cache_folio(mapping, index, for_write); + if (IS_ERR(folio)) + return folio; + + if (f2fs_lookup_read_extent_cache_block(inode, index, + &dn.data_blkaddr)) { + if (!f2fs_is_valid_blkaddr(F2FS_I_SB(inode), dn.data_blkaddr, + DATA_GENERIC_ENHANCE_READ)) { + err = -EFSCORRUPTED; + goto put_err; + } + goto got_it; + } set_new_dnode(&dn, inode, NULL, NULL, 0); - err = get_dnode_of_data(&dn, index, LOOKUP_NODE); + err = f2fs_get_dnode_of_data(&dn, index, LOOKUP_NODE); if (err) { - f2fs_put_page(page, 1); - return ERR_PTR(err); + if (err == -ENOENT && next_pgofs) + *next_pgofs = f2fs_get_next_page_offset(&dn, index); + goto put_err; } f2fs_put_dnode(&dn); - if (dn.data_blkaddr == NULL_ADDR) { - f2fs_put_page(page, 1); - return ERR_PTR(-ENOENT); + if (unlikely(dn.data_blkaddr == NULL_ADDR)) { + err = -ENOENT; + if (next_pgofs) + *next_pgofs = index + 1; + goto put_err; + } + if (dn.data_blkaddr != NEW_ADDR && + !f2fs_is_valid_blkaddr(F2FS_I_SB(inode), + dn.data_blkaddr, + DATA_GENERIC_ENHANCE)) { + err = -EFSCORRUPTED; + goto put_err; + } +got_it: + if (folio_test_uptodate(folio)) { + folio_unlock(folio); + return folio; + } + + /* + * A new dentry page is allocated but not able to be written, since its + * new inode page couldn't be allocated due to -ENOSPC. 
+ * In such the case, its blkaddr can be remained as NEW_ADDR. + * see, f2fs_add_link -> f2fs_get_new_data_folio -> + * f2fs_init_inode_metadata. + */ + if (dn.data_blkaddr == NEW_ADDR) { + folio_zero_segment(folio, 0, folio_size(folio)); + if (!folio_test_uptodate(folio)) + folio_mark_uptodate(folio); + folio_unlock(folio); + return folio; } - if (PageUptodate(page)) - return page; + f2fs_submit_page_read(inode, folio, dn.data_blkaddr, + op_flags, for_write); + return folio; - BUG_ON(dn.data_blkaddr == NEW_ADDR); - BUG_ON(dn.data_blkaddr == NULL_ADDR); +put_err: + f2fs_folio_put(folio, true); + return ERR_PTR(err); +} - err = f2fs_readpage(sbi, page, dn.data_blkaddr, READ_SYNC); - if (err) - return ERR_PTR(err); +struct folio *f2fs_find_data_folio(struct inode *inode, pgoff_t index, + pgoff_t *next_pgofs) +{ + struct address_space *mapping = inode->i_mapping; + struct folio *folio; + + folio = f2fs_filemap_get_folio(mapping, index, FGP_ACCESSED, 0); + if (IS_ERR(folio)) + goto read; + if (folio_test_uptodate(folio)) + return folio; + f2fs_folio_put(folio, false); + +read: + folio = f2fs_get_read_data_folio(inode, index, 0, false, next_pgofs); + if (IS_ERR(folio)) + return folio; + + if (folio_test_uptodate(folio)) + return folio; - lock_page(page); - if (!PageUptodate(page)) { - f2fs_put_page(page, 1); + folio_wait_locked(folio); + if (unlikely(!folio_test_uptodate(folio))) { + f2fs_folio_put(folio, false); return ERR_PTR(-EIO); } - if (page->mapping != mapping) { - f2fs_put_page(page, 1); - goto repeat; + return folio; +} + +/* + * If it tries to access a hole, return an error. + * Because, the callers, functions in dir.c and GC, should be able to know + * whether this page exists or not. 
+ */ +struct folio *f2fs_get_lock_data_folio(struct inode *inode, pgoff_t index, + bool for_write) +{ + struct address_space *mapping = inode->i_mapping; + struct folio *folio; + + folio = f2fs_get_read_data_folio(inode, index, 0, for_write, NULL); + if (IS_ERR(folio)) + return folio; + + /* wait for read completion */ + folio_lock(folio); + if (unlikely(folio->mapping != mapping || !folio_test_uptodate(folio))) { + f2fs_folio_put(folio, true); + return ERR_PTR(-EIO); } - return page; + return folio; } /* * Caller ensures that this data page is never allocated. * A new zero-filled data page is allocated in the page cache. * - * Also, caller should grab and release a mutex by calling mutex_lock_op() and - * mutex_unlock_op(). - * Note that, npage is set only by make_empty_dir. + * Also, caller should grab and release a rwsem by calling f2fs_lock_op() and + * f2fs_unlock_op(). + * Note that, ifolio is set only by make_empty_dir, and if any error occur, + * ifolio should be released by this function. */ -struct page *get_new_data_page(struct inode *inode, - struct page *npage, pgoff_t index, bool new_i_size) +struct folio *f2fs_get_new_data_folio(struct inode *inode, + struct folio *ifolio, pgoff_t index, bool new_i_size) { - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); struct address_space *mapping = inode->i_mapping; - struct page *page; + struct folio *folio; struct dnode_of_data dn; int err; - set_new_dnode(&dn, inode, npage, npage, 0); - err = get_dnode_of_data(&dn, index, ALLOC_NODE); - if (err) - return ERR_PTR(err); + folio = f2fs_grab_cache_folio(mapping, index, true); + if (IS_ERR(folio)) { + /* + * before exiting, we should make sure ifolio will be released + * if any error occur. 
+ */ + f2fs_folio_put(ifolio, true); + return ERR_PTR(-ENOMEM); + } - if (dn.data_blkaddr == NULL_ADDR) { - if (reserve_new_block(&dn)) { - if (!npage) - f2fs_put_dnode(&dn); - return ERR_PTR(-ENOSPC); - } + set_new_dnode(&dn, inode, ifolio, NULL, 0); + err = f2fs_reserve_block(&dn, index); + if (err) { + f2fs_folio_put(folio, true); + return ERR_PTR(err); } - if (!npage) + if (!ifolio) f2fs_put_dnode(&dn); -repeat: - page = grab_cache_page(mapping, index); - if (!page) - return ERR_PTR(-ENOMEM); - if (PageUptodate(page)) - return page; + if (folio_test_uptodate(folio)) + goto got_it; if (dn.data_blkaddr == NEW_ADDR) { - zero_user_segment(page, 0, PAGE_CACHE_SIZE); - SetPageUptodate(page); + folio_zero_segment(folio, 0, folio_size(folio)); + if (!folio_test_uptodate(folio)) + folio_mark_uptodate(folio); } else { - err = f2fs_readpage(sbi, page, dn.data_blkaddr, READ_SYNC); - if (err) - return ERR_PTR(err); - lock_page(page); - if (!PageUptodate(page)) { - f2fs_put_page(page, 1); - return ERR_PTR(-EIO); - } - if (page->mapping != mapping) { - f2fs_put_page(page, 1); - goto repeat; - } + f2fs_folio_put(folio, true); + + /* if ifolio exists, blkaddr should be NEW_ADDR */ + f2fs_bug_on(F2FS_I_SB(inode), ifolio); + folio = f2fs_get_lock_data_folio(inode, index, true); + if (IS_ERR(folio)) + return folio; } +got_it: + if (new_i_size && i_size_read(inode) < + ((loff_t)(index + 1) << PAGE_SHIFT)) + f2fs_i_size_write(inode, ((loff_t)(index + 1) << PAGE_SHIFT)); + return folio; +} - if (new_i_size && - i_size_read(inode) < ((index + 1) << PAGE_CACHE_SHIFT)) { - i_size_write(inode, ((index + 1) << PAGE_CACHE_SHIFT)); - /* Only the directory inode sets new_i_size */ - set_inode_flag(F2FS_I(inode), FI_UPDATE_DIR); - mark_inode_dirty_sync(inode); +static int __allocate_data_block(struct dnode_of_data *dn, int seg_type) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); + struct f2fs_summary sum; + struct node_info ni; + block_t old_blkaddr; + blkcnt_t count = 1; + int err; + 
+ if (unlikely(is_inode_flag_set(dn->inode, FI_NO_ALLOC))) + return -EPERM; + + err = f2fs_get_node_info(sbi, dn->nid, &ni, false); + if (err) + return err; + + dn->data_blkaddr = f2fs_data_blkaddr(dn); + if (dn->data_blkaddr == NULL_ADDR) { + err = inc_valid_block_count(sbi, dn->inode, &count, true); + if (unlikely(err)) + return err; } - return page; + + set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version); + old_blkaddr = dn->data_blkaddr; + err = f2fs_allocate_data_block(sbi, NULL, old_blkaddr, + &dn->data_blkaddr, &sum, seg_type, NULL); + if (err) + return err; + + if (GET_SEGNO(sbi, old_blkaddr) != NULL_SEGNO) + f2fs_invalidate_internal_cache(sbi, old_blkaddr, 1); + + f2fs_update_data_blkaddr(dn, dn->data_blkaddr); + return 0; } -static void read_end_io(struct bio *bio, int err) +static void f2fs_map_lock(struct f2fs_sb_info *sbi, int flag) { - const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); - struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; + f2fs_down_read(&sbi->cp_enable_rwsem); + if (flag == F2FS_GET_BLOCK_PRE_AIO) + f2fs_down_read(&sbi->node_change); + else + f2fs_lock_op(sbi); +} - do { - struct page *page = bvec->bv_page; +static void f2fs_map_unlock(struct f2fs_sb_info *sbi, int flag) +{ + if (flag == F2FS_GET_BLOCK_PRE_AIO) + f2fs_up_read(&sbi->node_change); + else + f2fs_unlock_op(sbi); + f2fs_up_read(&sbi->cp_enable_rwsem); +} - if (--bvec >= bio->bi_io_vec) - prefetchw(&bvec->bv_page->flags); +int f2fs_get_block_locked(struct dnode_of_data *dn, pgoff_t index) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); + int err = 0; - if (uptodate) { - SetPageUptodate(page); - } else { - ClearPageUptodate(page); - SetPageError(page); - } - unlock_page(page); - } while (bvec >= bio->bi_io_vec); - kfree(bio->bi_private); - bio_put(bio); + f2fs_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO); + if (!f2fs_lookup_read_extent_cache_block(dn->inode, index, + &dn->data_blkaddr)) + err = f2fs_reserve_block(dn, index); + f2fs_map_unlock(sbi, 
F2FS_GET_BLOCK_PRE_AIO); + + return err; } -/* - * Fill the locked page with data located in the block address. - * Return unlocked page. - */ -int f2fs_readpage(struct f2fs_sb_info *sbi, struct page *page, - block_t blk_addr, int type) +static int f2fs_map_no_dnode(struct inode *inode, + struct f2fs_map_blocks *map, struct dnode_of_data *dn, + pgoff_t pgoff) { - struct block_device *bdev = sbi->sb->s_bdev; - struct bio *bio; + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - trace_f2fs_readpage(page, blk_addr, type); + /* + * There is one exceptional case that read_node_page() may return + * -ENOENT due to filesystem has been shutdown or cp_error, return + * -EIO in that case. + */ + if (map->m_may_create && + (is_sbi_flag_set(sbi, SBI_IS_SHUTDOWN) || f2fs_cp_error(sbi))) + return -EIO; - down_read(&sbi->bio_sem); + if (map->m_next_pgofs) + *map->m_next_pgofs = f2fs_get_next_page_offset(dn, pgoff); + if (map->m_next_extent) + *map->m_next_extent = f2fs_get_next_page_offset(dn, pgoff); + return 0; +} - /* Allocate a new bio */ - bio = f2fs_bio_alloc(bdev, 1); +static bool f2fs_map_blocks_cached(struct inode *inode, + struct f2fs_map_blocks *map, int flag) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + unsigned int maxblocks = map->m_len; + pgoff_t pgoff = (pgoff_t)map->m_lblk; + struct extent_info ei = {}; + + if (!f2fs_lookup_read_extent_cache(inode, pgoff, &ei)) + return false; + + map->m_pblk = ei.blk + pgoff - ei.fofs; + map->m_len = min((pgoff_t)maxblocks, ei.fofs + ei.len - pgoff); + map->m_flags = F2FS_MAP_MAPPED; + if (map->m_next_extent) + *map->m_next_extent = pgoff + map->m_len; - /* Initialize the bio */ - bio->bi_sector = SECTOR_FROM_BLOCK(sbi, blk_addr); - bio->bi_end_io = read_end_io; + /* for hardware encryption, but to avoid potential issue in future */ + if (flag == F2FS_GET_BLOCK_DIO) + f2fs_wait_on_block_writeback_range(inode, + map->m_pblk, map->m_len); - if (bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) < PAGE_CACHE_SIZE) { - 
kfree(bio->bi_private); - bio_put(bio); - up_read(&sbi->bio_sem); - f2fs_put_page(page, 1); - return -EFAULT; + if (f2fs_allow_multi_device_dio(sbi, flag)) { + int bidx = f2fs_target_device_index(sbi, map->m_pblk); + struct f2fs_dev_info *dev = &sbi->devs[bidx]; + + map->m_bdev = dev->bdev; + map->m_len = min(map->m_len, dev->end_blk + 1 - map->m_pblk); + map->m_pblk -= dev->start_blk; + } else { + map->m_bdev = inode->i_sb->s_bdev; } + return true; +} - submit_bio(type, bio); - up_read(&sbi->bio_sem); - return 0; +static bool map_is_mergeable(struct f2fs_sb_info *sbi, + struct f2fs_map_blocks *map, + block_t blkaddr, int flag, int bidx, + int ofs) +{ + if (map->m_multidev_dio && map->m_bdev != FDEV(bidx).bdev) + return false; + if (map->m_pblk != NEW_ADDR && blkaddr == (map->m_pblk + ofs)) + return true; + if (map->m_pblk == NEW_ADDR && blkaddr == NEW_ADDR) + return true; + if (flag == F2FS_GET_BLOCK_PRE_DIO) + return true; + if (flag == F2FS_GET_BLOCK_DIO && + map->m_pblk == NULL_ADDR && blkaddr == NULL_ADDR) + return true; + return false; } /* - * This function should be used by the data read flow only where it - * does not check the "create" flag that indicates block allocation. - * The reason for this special functionality is to exploit VFS readahead - * mechanism. + * f2fs_map_blocks() tries to find or build mapping relationship which + * maps continuous logical blocks to physical blocks, and return such + * info via f2fs_map_blocks structure. 
*/ -static int get_data_block_ro(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create) +int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, int flag) { - unsigned int blkbits = inode->i_sb->s_blocksize_bits; - unsigned maxblocks = bh_result->b_size >> blkbits; + unsigned int maxblocks = map->m_len; struct dnode_of_data dn; - pgoff_t pgofs; - int err; - - /* Get the page offset from the block offset(iblock) */ - pgofs = (pgoff_t)(iblock >> (PAGE_CACHE_SHIFT - blkbits)); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + int mode = map->m_may_create ? ALLOC_NODE : LOOKUP_NODE; + pgoff_t pgofs, end_offset, end; + int err = 0, ofs = 1; + unsigned int ofs_in_node, last_ofs_in_node; + blkcnt_t prealloc; + block_t blkaddr; + unsigned int start_pgofs; + int bidx = 0; + bool is_hole; + bool lfs_dio_write; - if (check_extent_cache(inode, pgofs, bh_result)) { - trace_f2fs_get_data_block(inode, iblock, bh_result, 0); + if (!maxblocks) return 0; + + lfs_dio_write = (flag == F2FS_GET_BLOCK_DIO && f2fs_lfs_mode(sbi) && + map->m_may_create); + + if (!map->m_may_create && f2fs_map_blocks_cached(inode, map, flag)) + goto out; + + map->m_bdev = inode->i_sb->s_bdev; + map->m_multidev_dio = + f2fs_allow_multi_device_dio(F2FS_I_SB(inode), flag); + + map->m_len = 0; + map->m_flags = 0; + + /* it only supports block size == page size */ + pgofs = (pgoff_t)map->m_lblk; + end = pgofs + maxblocks; + + if (flag == F2FS_GET_BLOCK_PRECACHE) + mode = LOOKUP_NODE_RA; + +next_dnode: + if (map->m_may_create) { + if (f2fs_lfs_mode(sbi)) + f2fs_balance_fs(sbi, true); + f2fs_map_lock(sbi, flag); } /* When reading holes, we need its node page */ set_new_dnode(&dn, inode, NULL, NULL, 0); - err = get_dnode_of_data(&dn, pgofs, LOOKUP_NODE_RA); + err = f2fs_get_dnode_of_data(&dn, pgofs, mode); if (err) { - trace_f2fs_get_data_block(inode, iblock, bh_result, err); - return (err == -ENOENT) ? 
0 : err; + if (flag == F2FS_GET_BLOCK_BMAP) + map->m_pblk = 0; + if (err == -ENOENT) + err = f2fs_map_no_dnode(inode, map, &dn, pgofs); + goto unlock_out; } - /* It does not support data allocation */ - BUG_ON(create); + start_pgofs = pgofs; + prealloc = 0; + last_ofs_in_node = ofs_in_node = dn.ofs_in_node; + end_offset = ADDRS_PER_PAGE(dn.node_folio, inode); + +next_block: + blkaddr = f2fs_data_blkaddr(&dn); + is_hole = !__is_valid_data_blkaddr(blkaddr); + if (!is_hole && + !f2fs_is_valid_blkaddr(sbi, blkaddr, DATA_GENERIC_ENHANCE)) { + err = -EFSCORRUPTED; + goto sync_out; + } - if (dn.data_blkaddr != NEW_ADDR && dn.data_blkaddr != NULL_ADDR) { - int i; - unsigned int end_offset; + /* use out-place-update for direct IO under LFS mode */ + if (map->m_may_create && (is_hole || + (flag == F2FS_GET_BLOCK_DIO && f2fs_lfs_mode(sbi) && + !f2fs_is_pinned_file(inode) && map->m_last_pblk != blkaddr))) { + if (unlikely(f2fs_cp_error(sbi))) { + err = -EIO; + goto sync_out; + } - end_offset = IS_INODE(dn.node_page) ? 
- ADDRS_PER_INODE : - ADDRS_PER_BLOCK; + switch (flag) { + case F2FS_GET_BLOCK_PRE_AIO: + if (blkaddr == NULL_ADDR) { + prealloc++; + last_ofs_in_node = dn.ofs_in_node; + } + break; + case F2FS_GET_BLOCK_PRE_DIO: + case F2FS_GET_BLOCK_DIO: + err = __allocate_data_block(&dn, map->m_seg_type); + if (err) + goto sync_out; + if (flag == F2FS_GET_BLOCK_PRE_DIO) + file_need_truncate(inode); + set_inode_flag(inode, FI_APPEND_WRITE); + break; + default: + WARN_ON_ONCE(1); + err = -EIO; + goto sync_out; + } - clear_buffer_new(bh_result); + blkaddr = dn.data_blkaddr; + if (is_hole) + map->m_flags |= F2FS_MAP_NEW; + } else if (is_hole) { + if (f2fs_compressed_file(inode) && + f2fs_sanity_check_cluster(&dn)) { + err = -EFSCORRUPTED; + f2fs_handle_error(sbi, + ERROR_CORRUPTED_CLUSTER); + goto sync_out; + } - /* Give more consecutive addresses for the read ahead */ - for (i = 0; i < end_offset - dn.ofs_in_node; i++) - if (((datablock_addr(dn.node_page, - dn.ofs_in_node + i)) - != (dn.data_blkaddr + i)) || maxblocks == i) - break; - map_bh(bh_result, inode->i_sb, dn.data_blkaddr); - bh_result->b_size = (i << blkbits); + switch (flag) { + case F2FS_GET_BLOCK_PRECACHE: + goto sync_out; + case F2FS_GET_BLOCK_BMAP: + map->m_pblk = 0; + goto sync_out; + case F2FS_GET_BLOCK_FIEMAP: + if (blkaddr == NULL_ADDR) { + if (map->m_next_pgofs) + *map->m_next_pgofs = pgofs + 1; + goto sync_out; + } + break; + case F2FS_GET_BLOCK_DIO: + if (map->m_next_pgofs) + *map->m_next_pgofs = pgofs + 1; + break; + default: + /* for defragment case */ + if (map->m_next_pgofs) + *map->m_next_pgofs = pgofs + 1; + goto sync_out; + } + } + + if (flag == F2FS_GET_BLOCK_PRE_AIO) + goto skip; + + if (map->m_multidev_dio) + bidx = f2fs_target_device_index(sbi, blkaddr); + + if (map->m_len == 0) { + /* reserved delalloc block should be mapped for fiemap. */ + if (blkaddr == NEW_ADDR) + map->m_flags |= F2FS_MAP_DELALLOC; + /* DIO READ and hole case, should not map the blocks. 
*/ + if (!(flag == F2FS_GET_BLOCK_DIO && is_hole && !map->m_may_create)) + map->m_flags |= F2FS_MAP_MAPPED; + + map->m_pblk = blkaddr; + map->m_len = 1; + + if (map->m_multidev_dio) + map->m_bdev = FDEV(bidx).bdev; + + if (lfs_dio_write) + map->m_last_pblk = NULL_ADDR; + } else if (map_is_mergeable(sbi, map, blkaddr, flag, bidx, ofs)) { + ofs++; + map->m_len++; + } else { + if (lfs_dio_write && !f2fs_is_pinned_file(inode)) + map->m_last_pblk = blkaddr; + goto sync_out; + } + +skip: + dn.ofs_in_node++; + pgofs++; + + /* preallocate blocks in batch for one dnode page */ + if (flag == F2FS_GET_BLOCK_PRE_AIO && + (pgofs == end || dn.ofs_in_node == end_offset)) { + + dn.ofs_in_node = ofs_in_node; + err = f2fs_reserve_new_blocks(&dn, prealloc); + if (err) + goto sync_out; + + map->m_len += dn.ofs_in_node - ofs_in_node; + if (prealloc && dn.ofs_in_node != last_ofs_in_node + 1) { + err = -ENOSPC; + goto sync_out; + } + dn.ofs_in_node = end_offset; + } + + if (pgofs >= end) + goto sync_out; + else if (dn.ofs_in_node < end_offset) + goto next_block; + + if (flag == F2FS_GET_BLOCK_PRECACHE) { + if (map->m_flags & F2FS_MAP_MAPPED) { + unsigned int ofs = start_pgofs - map->m_lblk; + + f2fs_update_read_extent_cache_range(&dn, + start_pgofs, map->m_pblk + ofs, + map->m_len - ofs); + } } + f2fs_put_dnode(&dn); - trace_f2fs_get_data_block(inode, iblock, bh_result, 0); + + if (map->m_may_create) { + f2fs_map_unlock(sbi, flag); + f2fs_balance_fs(sbi, dn.node_changed); + } + goto next_dnode; + +sync_out: + + if (flag == F2FS_GET_BLOCK_DIO && map->m_flags & F2FS_MAP_MAPPED) { + /* + * for hardware encryption, but to avoid potential issue + * in future + */ + f2fs_wait_on_block_writeback_range(inode, + map->m_pblk, map->m_len); + + if (map->m_multidev_dio) { + block_t blk_addr = map->m_pblk; + + bidx = f2fs_target_device_index(sbi, map->m_pblk); + + map->m_bdev = FDEV(bidx).bdev; + map->m_pblk -= FDEV(bidx).start_blk; + + if (map->m_may_create) + f2fs_update_device_state(sbi, 
inode->i_ino, + blk_addr, map->m_len); + + f2fs_bug_on(sbi, blk_addr + map->m_len > + FDEV(bidx).end_blk + 1); + } + } + + if (flag == F2FS_GET_BLOCK_PRECACHE) { + if (map->m_flags & F2FS_MAP_MAPPED) { + unsigned int ofs = start_pgofs - map->m_lblk; + + if (map->m_len > ofs) + f2fs_update_read_extent_cache_range(&dn, + start_pgofs, map->m_pblk + ofs, + map->m_len - ofs); + } + if (map->m_next_extent) + *map->m_next_extent = is_hole ? pgofs + 1 : pgofs; + } + f2fs_put_dnode(&dn); +unlock_out: + if (map->m_may_create) { + f2fs_map_unlock(sbi, flag); + f2fs_balance_fs(sbi, dn.node_changed); + } +out: + trace_f2fs_map_blocks(inode, map, flag, err); + return err; +} + +bool f2fs_overwrite_io(struct inode *inode, loff_t pos, size_t len) +{ + struct f2fs_map_blocks map; + block_t last_lblk; + int err; + + if (pos + len > i_size_read(inode)) + return false; + + map.m_lblk = F2FS_BYTES_TO_BLK(pos); + map.m_next_pgofs = NULL; + map.m_next_extent = NULL; + map.m_seg_type = NO_CHECK_TYPE; + map.m_may_create = false; + last_lblk = F2FS_BLK_ALIGN(pos + len); + + while (map.m_lblk < last_lblk) { + map.m_len = last_lblk - map.m_lblk; + err = f2fs_map_blocks(inode, &map, F2FS_GET_BLOCK_DEFAULT); + if (err || map.m_len == 0) + return false; + map.m_lblk += map.m_len; + } + return true; +} + +static int f2fs_xattr_fiemap(struct inode *inode, + struct fiemap_extent_info *fieinfo) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct node_info ni; + __u64 phys = 0, len; + __u32 flags; + nid_t xnid = F2FS_I(inode)->i_xattr_nid; + int err = 0; + + if (f2fs_has_inline_xattr(inode)) { + int offset; + struct folio *folio = f2fs_grab_cache_folio(NODE_MAPPING(sbi), + inode->i_ino, false); + + if (IS_ERR(folio)) + return PTR_ERR(folio); + + err = f2fs_get_node_info(sbi, inode->i_ino, &ni, false); + if (err) { + f2fs_folio_put(folio, true); + return err; + } + + phys = F2FS_BLK_TO_BYTES(ni.blk_addr); + offset = offsetof(struct f2fs_inode, i_addr) + + sizeof(__le32) * (DEF_ADDRS_PER_INODE - 
+ get_inline_xattr_addrs(inode)); + + phys += offset; + len = inline_xattr_size(inode); + + f2fs_folio_put(folio, true); + + flags = FIEMAP_EXTENT_DATA_INLINE | FIEMAP_EXTENT_NOT_ALIGNED; + + if (!xnid) + flags |= FIEMAP_EXTENT_LAST; + + err = fiemap_fill_next_extent(fieinfo, 0, phys, len, flags); + trace_f2fs_fiemap(inode, 0, phys, len, flags, err); + if (err) + return err; + } + + if (xnid) { + struct folio *folio = f2fs_grab_cache_folio(NODE_MAPPING(sbi), + xnid, false); + + if (IS_ERR(folio)) + return PTR_ERR(folio); + + err = f2fs_get_node_info(sbi, xnid, &ni, false); + if (err) { + f2fs_folio_put(folio, true); + return err; + } + + phys = F2FS_BLK_TO_BYTES(ni.blk_addr); + len = inode->i_sb->s_blocksize; + + f2fs_folio_put(folio, true); + + flags = FIEMAP_EXTENT_LAST; + } + + if (phys) { + err = fiemap_fill_next_extent(fieinfo, 0, phys, len, flags); + trace_f2fs_fiemap(inode, 0, phys, len, flags, err); + } + + return (err < 0 ? err : 0); +} + +int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + u64 start, u64 len) +{ + struct f2fs_map_blocks map; + sector_t start_blk, last_blk, blk_len, max_len; + pgoff_t next_pgofs; + u64 logical = 0, phys = 0, size = 0; + u32 flags = 0; + int ret = 0; + bool compr_cluster = false, compr_appended; + unsigned int cluster_size = F2FS_I(inode)->i_cluster_size; + unsigned int count_in_cluster = 0; + loff_t maxbytes; + + if (fieinfo->fi_flags & FIEMAP_FLAG_CACHE) { + ret = f2fs_precache_extents(inode); + if (ret) + return ret; + } + + ret = fiemap_prep(inode, fieinfo, start, &len, FIEMAP_FLAG_XATTR); + if (ret) + return ret; + + inode_lock_shared(inode); + + maxbytes = F2FS_BLK_TO_BYTES(max_file_blocks(inode)); + if (start > maxbytes) { + ret = -EFBIG; + goto out; + } + + if (len > maxbytes || (maxbytes - len) < start) + len = maxbytes - start; + + if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) { + ret = f2fs_xattr_fiemap(inode, fieinfo); + goto out; + } + + if (f2fs_has_inline_data(inode) || 
f2fs_has_inline_dentry(inode)) { + ret = f2fs_inline_data_fiemap(inode, fieinfo, start, len); + if (ret != -EAGAIN) + goto out; + } + + start_blk = F2FS_BYTES_TO_BLK(start); + last_blk = F2FS_BYTES_TO_BLK(start + len - 1); + blk_len = last_blk - start_blk + 1; + max_len = F2FS_BYTES_TO_BLK(maxbytes) - start_blk; + +next: + memset(&map, 0, sizeof(map)); + map.m_lblk = start_blk; + map.m_len = blk_len; + map.m_next_pgofs = &next_pgofs; + map.m_seg_type = NO_CHECK_TYPE; + + if (compr_cluster) { + map.m_lblk += 1; + map.m_len = cluster_size - count_in_cluster; + } + + ret = f2fs_map_blocks(inode, &map, F2FS_GET_BLOCK_FIEMAP); + if (ret) + goto out; + + /* HOLE */ + if (!compr_cluster && !(map.m_flags & F2FS_MAP_FLAGS)) { + start_blk = next_pgofs; + + if (F2FS_BLK_TO_BYTES(start_blk) < maxbytes) + goto prep_next; + + flags |= FIEMAP_EXTENT_LAST; + } + + /* + * current extent may cross boundary of inquiry, increase len to + * requery. + */ + if (!compr_cluster && (map.m_flags & F2FS_MAP_MAPPED) && + map.m_lblk + map.m_len - 1 == last_blk && + blk_len != max_len) { + blk_len = max_len; + goto next; + } + + compr_appended = false; + /* In a case of compressed cluster, append this to the last extent */ + if (compr_cluster && ((map.m_flags & F2FS_MAP_DELALLOC) || + !(map.m_flags & F2FS_MAP_FLAGS))) { + compr_appended = true; + goto skip_fill; + } + + if (size) { + flags |= FIEMAP_EXTENT_MERGED; + if (IS_ENCRYPTED(inode)) + flags |= FIEMAP_EXTENT_DATA_ENCRYPTED; + + ret = fiemap_fill_next_extent(fieinfo, logical, + phys, size, flags); + trace_f2fs_fiemap(inode, logical, phys, size, flags, ret); + if (ret) + goto out; + size = 0; + } + + if (start_blk > last_blk) + goto out; + +skip_fill: + if (map.m_pblk == COMPRESS_ADDR) { + compr_cluster = true; + count_in_cluster = 1; + } else if (compr_appended) { + unsigned int appended_blks = cluster_size - + count_in_cluster + 1; + size += F2FS_BLK_TO_BYTES(appended_blks); + start_blk += appended_blks; + compr_cluster = false; + } else 
{ + logical = F2FS_BLK_TO_BYTES(start_blk); + phys = __is_valid_data_blkaddr(map.m_pblk) ? + F2FS_BLK_TO_BYTES(map.m_pblk) : 0; + size = F2FS_BLK_TO_BYTES(map.m_len); + flags = 0; + + if (compr_cluster) { + flags = FIEMAP_EXTENT_ENCODED; + count_in_cluster += map.m_len; + if (count_in_cluster == cluster_size) { + compr_cluster = false; + size += F2FS_BLKSIZE; + } + } else if (map.m_flags & F2FS_MAP_DELALLOC) { + flags = FIEMAP_EXTENT_UNWRITTEN; + } + + start_blk += F2FS_BYTES_TO_BLK(size); + } + +prep_next: + cond_resched(); + if (fatal_signal_pending(current)) + ret = -EINTR; + else + goto next; +out: + if (ret == 1) + ret = 0; + + inode_unlock_shared(inode); + return ret; +} + +static inline loff_t f2fs_readpage_limit(struct inode *inode) +{ + if (IS_ENABLED(CONFIG_FS_VERITY) && IS_VERITY(inode)) + return F2FS_BLK_TO_BYTES(max_file_blocks(inode)); + + return i_size_read(inode); +} + +static inline blk_opf_t f2fs_ra_op_flags(struct readahead_control *rac) +{ + return rac ? REQ_RAHEAD : 0; +} + +static int f2fs_read_single_page(struct inode *inode, struct folio *folio, + unsigned nr_pages, + struct f2fs_map_blocks *map, + struct bio **bio_ret, + sector_t *last_block_in_bio, + struct readahead_control *rac) +{ + struct bio *bio = *bio_ret; + const unsigned int blocksize = F2FS_BLKSIZE; + sector_t block_in_file; + sector_t last_block; + sector_t last_block_in_file; + sector_t block_nr; + pgoff_t index = folio->index; + int ret = 0; + + block_in_file = (sector_t)index; + last_block = block_in_file + nr_pages; + last_block_in_file = F2FS_BYTES_TO_BLK(f2fs_readpage_limit(inode) + + blocksize - 1); + if (last_block > last_block_in_file) + last_block = last_block_in_file; + + /* just zeroing out page which is beyond EOF */ + if (block_in_file >= last_block) + goto zero_out; + /* + * Map blocks using the previous result first. 
+ */ + if ((map->m_flags & F2FS_MAP_MAPPED) && + block_in_file > map->m_lblk && + block_in_file < (map->m_lblk + map->m_len)) + goto got_it; + + /* + * Then do more f2fs_map_blocks() calls until we are + * done with this page. + */ + map->m_lblk = block_in_file; + map->m_len = last_block - block_in_file; + + ret = f2fs_map_blocks(inode, map, F2FS_GET_BLOCK_DEFAULT); + if (ret) + goto out; +got_it: + if ((map->m_flags & F2FS_MAP_MAPPED)) { + block_nr = map->m_pblk + block_in_file - map->m_lblk; + folio_set_mappedtodisk(folio); + + if (!f2fs_is_valid_blkaddr(F2FS_I_SB(inode), block_nr, + DATA_GENERIC_ENHANCE_READ)) { + ret = -EFSCORRUPTED; + goto out; + } + } else { +zero_out: + folio_zero_segment(folio, 0, folio_size(folio)); + if (f2fs_need_verity(inode, index) && + !fsverity_verify_folio(folio)) { + ret = -EIO; + goto out; + } + if (!folio_test_uptodate(folio)) + folio_mark_uptodate(folio); + folio_unlock(folio); + goto out; + } + + /* + * This page will go to BIO. Do we need to send this + * BIO off first? + */ + if (bio && (!page_is_mergeable(F2FS_I_SB(inode), bio, + *last_block_in_bio, block_nr) || + !f2fs_crypt_mergeable_bio(bio, inode, index, NULL))) { +submit_and_realloc: + f2fs_submit_read_bio(F2FS_I_SB(inode), bio, DATA); + bio = NULL; + } + if (bio == NULL) + bio = f2fs_grab_read_bio(inode, block_nr, nr_pages, + f2fs_ra_op_flags(rac), index, + false); + + /* + * If the page is under writeback, we need to wait for + * its completion to see the correct decrypted data. 
+ */ + f2fs_wait_on_block_writeback(inode, block_nr); + + if (!bio_add_folio(bio, folio, blocksize, 0)) + goto submit_and_realloc; + + inc_page_count(F2FS_I_SB(inode), F2FS_RD_DATA); + f2fs_update_iostat(F2FS_I_SB(inode), NULL, FS_DATA_READ_IO, + F2FS_BLKSIZE); + *last_block_in_bio = block_nr; +out: + *bio_ret = bio; + return ret; +} + +#ifdef CONFIG_F2FS_FS_COMPRESSION +int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, + unsigned nr_pages, sector_t *last_block_in_bio, + struct readahead_control *rac, bool for_write) +{ + struct dnode_of_data dn; + struct inode *inode = cc->inode; + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct bio *bio = *bio_ret; + unsigned int start_idx = cc->cluster_idx << cc->log_cluster_size; + sector_t last_block_in_file; + const unsigned int blocksize = F2FS_BLKSIZE; + struct decompress_io_ctx *dic = NULL; + struct extent_info ei = {}; + bool from_dnode = true; + int i; + int ret = 0; + + if (unlikely(f2fs_cp_error(sbi))) { + ret = -EIO; + from_dnode = false; + goto out_put_dnode; + } + + f2fs_bug_on(sbi, f2fs_cluster_is_empty(cc)); + + last_block_in_file = F2FS_BYTES_TO_BLK(f2fs_readpage_limit(inode) + + blocksize - 1); + + /* get rid of pages beyond EOF */ + for (i = 0; i < cc->cluster_size; i++) { + struct page *page = cc->rpages[i]; + struct folio *folio; + + if (!page) + continue; + + folio = page_folio(page); + if ((sector_t)folio->index >= last_block_in_file) { + folio_zero_segment(folio, 0, folio_size(folio)); + if (!folio_test_uptodate(folio)) + folio_mark_uptodate(folio); + } else if (!folio_test_uptodate(folio)) { + continue; + } + folio_unlock(folio); + if (for_write) + folio_put(folio); + cc->rpages[i] = NULL; + cc->nr_rpages--; + } + + /* we are done since all pages are beyond EOF */ + if (f2fs_cluster_is_empty(cc)) + goto out; + + if (f2fs_lookup_read_extent_cache(inode, start_idx, &ei)) + from_dnode = false; + + if (!from_dnode) + goto skip_reading_dnode; + + set_new_dnode(&dn, inode, NULL, 
NULL, 0); + ret = f2fs_get_dnode_of_data(&dn, start_idx, LOOKUP_NODE); + if (ret) + goto out; + + f2fs_bug_on(sbi, dn.data_blkaddr != COMPRESS_ADDR); + +skip_reading_dnode: + for (i = 1; i < cc->cluster_size; i++) { + block_t blkaddr; + + blkaddr = from_dnode ? data_blkaddr(dn.inode, dn.node_folio, + dn.ofs_in_node + i) : + ei.blk + i - 1; + + if (!__is_valid_data_blkaddr(blkaddr)) + break; + + if (!f2fs_is_valid_blkaddr(sbi, blkaddr, DATA_GENERIC)) { + ret = -EFAULT; + goto out_put_dnode; + } + cc->nr_cpages++; + + if (!from_dnode && i >= ei.c_len) + break; + } + + /* nothing to decompress */ + if (cc->nr_cpages == 0) { + ret = 0; + goto out_put_dnode; + } + + dic = f2fs_alloc_dic(cc); + if (IS_ERR(dic)) { + ret = PTR_ERR(dic); + goto out_put_dnode; + } + + for (i = 0; i < cc->nr_cpages; i++) { + struct folio *folio = page_folio(dic->cpages[i]); + block_t blkaddr; + struct bio_post_read_ctx *ctx; + + blkaddr = from_dnode ? data_blkaddr(dn.inode, dn.node_folio, + dn.ofs_in_node + i + 1) : + ei.blk + i; + + f2fs_wait_on_block_writeback(inode, blkaddr); + + if (f2fs_load_compressed_folio(sbi, folio, blkaddr)) { + if (atomic_dec_and_test(&dic->remaining_pages)) { + f2fs_decompress_cluster(dic, true); + break; + } + continue; + } + + if (bio && (!page_is_mergeable(sbi, bio, + *last_block_in_bio, blkaddr) || + !f2fs_crypt_mergeable_bio(bio, inode, folio->index, NULL))) { +submit_and_realloc: + f2fs_submit_read_bio(sbi, bio, DATA); + bio = NULL; + } + + if (!bio) + bio = f2fs_grab_read_bio(inode, blkaddr, nr_pages - i, + f2fs_ra_op_flags(rac), + folio->index, for_write); + + if (!bio_add_folio(bio, folio, blocksize, 0)) + goto submit_and_realloc; + + ctx = get_post_read_ctx(bio); + ctx->enabled_steps |= STEP_DECOMPRESS; + refcount_inc(&dic->refcnt); + + inc_page_count(sbi, F2FS_RD_DATA); + f2fs_update_iostat(sbi, inode, FS_DATA_READ_IO, F2FS_BLKSIZE); + *last_block_in_bio = blkaddr; + } + + if (from_dnode) + f2fs_put_dnode(&dn); + + *bio_ret = bio; return 0; + 
+out_put_dnode: + if (from_dnode) + f2fs_put_dnode(&dn); +out: + for (i = 0; i < cc->cluster_size; i++) { + if (cc->rpages[i]) { + ClearPageUptodate(cc->rpages[i]); + unlock_page(cc->rpages[i]); + } + } + *bio_ret = bio; + return ret; } +#endif -static int f2fs_read_data_page(struct file *file, struct page *page) +/* + * This function was originally taken from fs/mpage.c, and customized for f2fs. + * Major change was from block_size == page_size in f2fs by default. + */ +static int f2fs_mpage_readpages(struct inode *inode, + struct readahead_control *rac, struct folio *folio) { - return mpage_readpage(page, get_data_block_ro); + struct bio *bio = NULL; + sector_t last_block_in_bio = 0; + struct f2fs_map_blocks map; +#ifdef CONFIG_F2FS_FS_COMPRESSION + struct compress_ctx cc = { + .inode = inode, + .log_cluster_size = F2FS_I(inode)->i_log_cluster_size, + .cluster_size = F2FS_I(inode)->i_cluster_size, + .cluster_idx = NULL_CLUSTER, + .rpages = NULL, + .cpages = NULL, + .nr_rpages = 0, + .nr_cpages = 0, + }; + pgoff_t nc_cluster_idx = NULL_CLUSTER; + pgoff_t index; +#endif + unsigned nr_pages = rac ? readahead_count(rac) : 1; + unsigned max_nr_pages = nr_pages; + int ret = 0; + +#ifdef CONFIG_F2FS_FS_COMPRESSION + if (f2fs_compressed_file(inode)) { + index = rac ? 
readahead_index(rac) : folio->index; + max_nr_pages = round_up(index + nr_pages, cc.cluster_size) - + round_down(index, cc.cluster_size); + } +#endif + + map.m_pblk = 0; + map.m_lblk = 0; + map.m_len = 0; + map.m_flags = 0; + map.m_next_pgofs = NULL; + map.m_next_extent = NULL; + map.m_seg_type = NO_CHECK_TYPE; + map.m_may_create = false; + + for (; nr_pages; nr_pages--) { + if (rac) { + folio = readahead_folio(rac); + prefetchw(&folio->flags); + } + +#ifdef CONFIG_F2FS_FS_COMPRESSION + index = folio->index; + + if (!f2fs_compressed_file(inode)) + goto read_single_page; + + /* there are remained compressed pages, submit them */ + if (!f2fs_cluster_can_merge_page(&cc, index)) { + ret = f2fs_read_multi_pages(&cc, &bio, + max_nr_pages, + &last_block_in_bio, + rac, false); + f2fs_destroy_compress_ctx(&cc, false); + if (ret) + goto set_error_page; + } + if (cc.cluster_idx == NULL_CLUSTER) { + if (nc_cluster_idx == index >> cc.log_cluster_size) + goto read_single_page; + + ret = f2fs_is_compressed_cluster(inode, index); + if (ret < 0) + goto set_error_page; + else if (!ret) { + nc_cluster_idx = + index >> cc.log_cluster_size; + goto read_single_page; + } + + nc_cluster_idx = NULL_CLUSTER; + } + ret = f2fs_init_compress_ctx(&cc); + if (ret) + goto set_error_page; + + f2fs_compress_ctx_add_page(&cc, folio); + + goto next_page; +read_single_page: +#endif + + ret = f2fs_read_single_page(inode, folio, max_nr_pages, &map, + &bio, &last_block_in_bio, rac); + if (ret) { +#ifdef CONFIG_F2FS_FS_COMPRESSION +set_error_page: +#endif + folio_zero_segment(folio, 0, folio_size(folio)); + folio_unlock(folio); + } +#ifdef CONFIG_F2FS_FS_COMPRESSION +next_page: +#endif + +#ifdef CONFIG_F2FS_FS_COMPRESSION + if (f2fs_compressed_file(inode)) { + /* last page */ + if (nr_pages == 1 && !f2fs_cluster_is_empty(&cc)) { + ret = f2fs_read_multi_pages(&cc, &bio, + max_nr_pages, + &last_block_in_bio, + rac, false); + f2fs_destroy_compress_ctx(&cc, false); + } + } +#endif + } + if (bio) + 
f2fs_submit_read_bio(F2FS_I_SB(inode), bio, DATA); + return ret; } -static int f2fs_read_data_pages(struct file *file, - struct address_space *mapping, - struct list_head *pages, unsigned nr_pages) +static int f2fs_read_data_folio(struct file *file, struct folio *folio) +{ + struct inode *inode = folio->mapping->host; + int ret = -EAGAIN; + + trace_f2fs_readpage(folio, DATA); + + if (!f2fs_is_compress_backend_ready(inode)) { + folio_unlock(folio); + return -EOPNOTSUPP; + } + + /* If the file has inline data, try to read it directly */ + if (f2fs_has_inline_data(inode)) + ret = f2fs_read_inline_data(inode, folio); + if (ret == -EAGAIN) + ret = f2fs_mpage_readpages(inode, NULL, folio); + return ret; +} + +static void f2fs_readahead(struct readahead_control *rac) +{ + struct inode *inode = rac->mapping->host; + + trace_f2fs_readpages(inode, readahead_index(rac), readahead_count(rac)); + + if (!f2fs_is_compress_backend_ready(inode)) + return; + + /* If the file has inline data, skip readahead */ + if (f2fs_has_inline_data(inode)) + return; + + f2fs_mpage_readpages(inode, rac, NULL); +} + +int f2fs_encrypt_one_page(struct f2fs_io_info *fio) +{ + struct inode *inode = fio_inode(fio); + struct folio *mfolio; + struct page *page; + gfp_t gfp_flags = GFP_NOFS; + + if (!f2fs_encrypted_file(inode)) + return 0; + + page = fio->compressed_page ? 
fio->compressed_page : fio->page; + + if (fscrypt_inode_uses_inline_crypto(inode)) + return 0; + +retry_encrypt: + fio->encrypted_page = fscrypt_encrypt_pagecache_blocks(page_folio(page), + PAGE_SIZE, 0, gfp_flags); + if (IS_ERR(fio->encrypted_page)) { + /* flush pending IOs and wait for a while in the ENOMEM case */ + if (PTR_ERR(fio->encrypted_page) == -ENOMEM) { + f2fs_flush_merged_writes(fio->sbi); + memalloc_retry_wait(GFP_NOFS); + gfp_flags |= __GFP_NOFAIL; + goto retry_encrypt; + } + return PTR_ERR(fio->encrypted_page); + } + + mfolio = filemap_lock_folio(META_MAPPING(fio->sbi), fio->old_blkaddr); + if (!IS_ERR(mfolio)) { + if (folio_test_uptodate(mfolio)) + memcpy(folio_address(mfolio), + page_address(fio->encrypted_page), PAGE_SIZE); + f2fs_folio_put(mfolio, true); + } + return 0; +} + +static inline bool check_inplace_update_policy(struct inode *inode, + struct f2fs_io_info *fio) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + + if (IS_F2FS_IPU_HONOR_OPU_WRITE(sbi) && + is_inode_flag_set(inode, FI_OPU_WRITE)) + return false; + if (IS_F2FS_IPU_FORCE(sbi)) + return true; + if (IS_F2FS_IPU_SSR(sbi) && f2fs_need_SSR(sbi)) + return true; + if (IS_F2FS_IPU_UTIL(sbi) && utilization(sbi) > SM_I(sbi)->min_ipu_util) + return true; + if (IS_F2FS_IPU_SSR_UTIL(sbi) && f2fs_need_SSR(sbi) && + utilization(sbi) > SM_I(sbi)->min_ipu_util) + return true; + + /* + * IPU for rewrite async pages + */ + if (IS_F2FS_IPU_ASYNC(sbi) && fio && fio->op == REQ_OP_WRITE && + !(fio->op_flags & REQ_SYNC) && !IS_ENCRYPTED(inode)) + return true; + + /* this is only set during fdatasync */ + if (IS_F2FS_IPU_FSYNC(sbi) && is_inode_flag_set(inode, FI_NEED_IPU)) + return true; + + if (unlikely(fio && is_sbi_flag_set(sbi, SBI_CP_DISABLED) && + !f2fs_is_checkpointed_data(sbi, fio->old_blkaddr))) + return true; + + return false; +} + +bool f2fs_should_update_inplace(struct inode *inode, struct f2fs_io_info *fio) +{ + /* swap file is migrating in aligned write mode */ + if 
(is_inode_flag_set(inode, FI_ALIGNED_WRITE)) + return false; + + if (f2fs_is_pinned_file(inode)) + return true; + + /* if this is cold file, we should overwrite to avoid fragmentation */ + if (file_is_cold(inode) && !is_inode_flag_set(inode, FI_OPU_WRITE)) + return true; + + return check_inplace_update_policy(inode, fio); +} + +bool f2fs_should_update_outplace(struct inode *inode, struct f2fs_io_info *fio) { - return mpage_readpages(mapping, pages, nr_pages, get_data_block_ro); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + + /* The below cases were checked when setting it. */ + if (f2fs_is_pinned_file(inode)) + return false; + if (fio && is_sbi_flag_set(sbi, SBI_NEED_FSCK)) + return true; + if (f2fs_lfs_mode(sbi)) + return true; + if (S_ISDIR(inode->i_mode)) + return true; + if (IS_NOQUOTA(inode)) + return true; + if (f2fs_used_in_atomic_write(inode)) + return true; + /* rewrite low ratio compress data w/ OPU mode to avoid fragmentation */ + if (f2fs_compressed_file(inode) && + F2FS_OPTION(sbi).compress_mode == COMPR_MODE_USER && + is_inode_flag_set(inode, FI_ENABLE_COMPRESS)) + return true; + + /* swap file is migrating in aligned write mode */ + if (is_inode_flag_set(inode, FI_ALIGNED_WRITE)) + return true; + + if (is_inode_flag_set(inode, FI_OPU_WRITE)) + return true; + + if (fio) { + if (page_private_gcing(fio->page)) + return true; + if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED) && + f2fs_is_checkpointed_data(sbi, fio->old_blkaddr))) + return true; + } + return false; } -int do_write_data_page(struct page *page) +static inline bool need_inplace_update(struct f2fs_io_info *fio) { - struct inode *inode = page->mapping->host; - block_t old_blk_addr, new_blk_addr; + struct inode *inode = fio_inode(fio); + + if (f2fs_should_update_outplace(inode, fio)) + return false; + + return f2fs_should_update_inplace(inode, fio); +} + +int f2fs_do_write_data_page(struct f2fs_io_info *fio) +{ + struct folio *folio = fio->folio; + struct inode *inode = 
folio->mapping->host; struct dnode_of_data dn; + struct node_info ni; + bool ipu_force = false; + bool atomic_commit; int err = 0; - set_new_dnode(&dn, inode, NULL, NULL, 0); - err = get_dnode_of_data(&dn, page->index, LOOKUP_NODE); + /* Use COW inode to make dnode_of_data for atomic write */ + atomic_commit = f2fs_is_atomic_file(inode) && + folio_test_f2fs_atomic(folio); + if (atomic_commit) + set_new_dnode(&dn, F2FS_I(inode)->cow_inode, NULL, NULL, 0); + else + set_new_dnode(&dn, inode, NULL, NULL, 0); + + if (need_inplace_update(fio) && + f2fs_lookup_read_extent_cache_block(inode, folio->index, + &fio->old_blkaddr)) { + if (!f2fs_is_valid_blkaddr(fio->sbi, fio->old_blkaddr, + DATA_GENERIC_ENHANCE)) + return -EFSCORRUPTED; + + ipu_force = true; + fio->need_lock = LOCK_DONE; + goto got_it; + } + + /* Deadlock due to between page->lock and f2fs_lock_op */ + if (fio->need_lock == LOCK_REQ && !f2fs_trylock_op(fio->sbi)) + return -EAGAIN; + + err = f2fs_get_dnode_of_data(&dn, folio->index, LOOKUP_NODE); if (err) - return err; + goto out; - old_blk_addr = dn.data_blkaddr; + fio->old_blkaddr = dn.data_blkaddr; /* This page is already truncated */ - if (old_blk_addr == NULL_ADDR) + if (fio->old_blkaddr == NULL_ADDR) { + folio_clear_uptodate(folio); + folio_clear_f2fs_gcing(folio); goto out_writepage; + } +got_it: + if (__is_valid_data_blkaddr(fio->old_blkaddr) && + !f2fs_is_valid_blkaddr(fio->sbi, fio->old_blkaddr, + DATA_GENERIC_ENHANCE)) { + err = -EFSCORRUPTED; + goto out_writepage; + } - set_page_writeback(page); + /* wait for GCed page writeback via META_MAPPING */ + if (fio->meta_gc) + f2fs_wait_on_block_writeback(inode, fio->old_blkaddr); /* * If current allocation needs SSR, * it had better in-place writes for updated data. 
*/ - if (unlikely(old_blk_addr != NEW_ADDR && - !is_cold_data(page) && - need_inplace_update(inode))) { - rewrite_data_page(F2FS_SB(inode->i_sb), page, - old_blk_addr); - } else { - write_data_page(inode, page, &dn, - old_blk_addr, &new_blk_addr); - update_extent_cache(new_blk_addr, &dn); + if (ipu_force || + (__is_valid_data_blkaddr(fio->old_blkaddr) && + need_inplace_update(fio))) { + err = f2fs_encrypt_one_page(fio); + if (err) + goto out_writepage; + + folio_start_writeback(folio); + f2fs_put_dnode(&dn); + if (fio->need_lock == LOCK_REQ) + f2fs_unlock_op(fio->sbi); + err = f2fs_inplace_write_data(fio); + if (err) { + if (fscrypt_inode_uses_fs_layer_crypto(inode)) + fscrypt_finalize_bounce_page(&fio->encrypted_page); + folio_end_writeback(folio); + } else { + set_inode_flag(inode, FI_UPDATE_WRITE); + } + trace_f2fs_do_write_data_page(folio, IPU); + return err; } + + if (fio->need_lock == LOCK_RETRY) { + if (!f2fs_trylock_op(fio->sbi)) { + err = -EAGAIN; + goto out_writepage; + } + fio->need_lock = LOCK_REQ; + } + + err = f2fs_get_node_info(fio->sbi, dn.nid, &ni, false); + if (err) + goto out_writepage; + + fio->version = ni.version; + + err = f2fs_encrypt_one_page(fio); + if (err) + goto out_writepage; + + folio_start_writeback(folio); + + if (fio->compr_blocks && fio->old_blkaddr == COMPRESS_ADDR) + f2fs_i_compr_blocks_update(inode, fio->compr_blocks - 1, false); + + /* LFS mode write path */ + f2fs_outplace_write_data(&dn, fio); + trace_f2fs_do_write_data_page(folio, OPU); + set_inode_flag(inode, FI_APPEND_WRITE); + if (atomic_commit) + folio_clear_f2fs_atomic(folio); out_writepage: f2fs_put_dnode(&dn); +out: + if (fio->need_lock == LOCK_REQ) + f2fs_unlock_op(fio->sbi); return err; } -static int f2fs_write_data_page(struct page *page, - struct writeback_control *wbc) +int f2fs_write_single_data_page(struct folio *folio, int *submitted, + struct bio **bio, + sector_t *last_block, + struct writeback_control *wbc, + enum iostat_type io_type, + int compr_blocks, + 
bool allow_balance) { - struct inode *inode = page->mapping->host; - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + struct inode *inode = folio->mapping->host; + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); loff_t i_size = i_size_read(inode); - const pgoff_t end_index = ((unsigned long long) i_size) - >> PAGE_CACHE_SHIFT; - unsigned offset; + const pgoff_t end_index = ((unsigned long long)i_size) + >> PAGE_SHIFT; + loff_t psize = (loff_t)(folio->index + 1) << PAGE_SHIFT; + unsigned offset = 0; bool need_balance_fs = false; + bool quota_inode = IS_NOQUOTA(inode); int err = 0; + struct f2fs_io_info fio = { + .sbi = sbi, + .ino = inode->i_ino, + .type = DATA, + .op = REQ_OP_WRITE, + .op_flags = wbc_to_write_flags(wbc), + .old_blkaddr = NULL_ADDR, + .folio = folio, + .encrypted_page = NULL, + .submitted = 0, + .compr_blocks = compr_blocks, + .need_lock = compr_blocks ? LOCK_DONE : LOCK_RETRY, + .meta_gc = f2fs_meta_inode_gc_required(inode) ? 1 : 0, + .io_type = io_type, + .io_wbc = wbc, + .bio = bio, + .last_block = last_block, + }; + + trace_f2fs_writepage(folio, DATA); + + /* we should bypass data pages to proceed the kworker jobs */ + if (unlikely(f2fs_cp_error(sbi))) { + mapping_set_error(folio->mapping, -EIO); + /* + * don't drop any dirty dentry pages for keeping lastest + * directory structure. + */ + if (S_ISDIR(inode->i_mode) && + !is_sbi_flag_set(sbi, SBI_IS_CLOSE)) + goto redirty_out; + + /* keep data pages in remount-ro mode */ + if (F2FS_OPTION(sbi).errors == MOUNT_ERRORS_READONLY) + goto redirty_out; + goto out; + } - if (page->index < end_index) + if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) + goto redirty_out; + + if (folio->index < end_index || + f2fs_verity_in_progress(inode) || + compr_blocks) goto write; /* * If the offset is out-of-range of file size, * this page does not have to be written to disk. 
*/ - offset = i_size & (PAGE_CACHE_SIZE - 1); - if ((page->index >= end_index + 1) || !offset) { - if (S_ISDIR(inode->i_mode)) { - dec_page_count(sbi, F2FS_DIRTY_DENTS); - inode_dec_dirty_dents(inode); - } + offset = i_size & (PAGE_SIZE - 1); + if ((folio->index >= end_index + 1) || !offset) goto out; - } - zero_user_segment(page, offset, PAGE_CACHE_SIZE); + folio_zero_segment(folio, offset, folio_size(folio)); write: - if (sbi->por_doing) { - err = AOP_WRITEPAGE_ACTIVATE; - goto redirty_out; + /* Dentry/quota blocks are controlled by checkpoint */ + if (S_ISDIR(inode->i_mode) || quota_inode) { + /* + * We need to wait for node_write to avoid block allocation during + * checkpoint. This can only happen to quota writes which can cause + * the below discard race condition. + */ + if (quota_inode) + f2fs_down_read(&sbi->node_write); + + fio.need_lock = LOCK_DONE; + err = f2fs_do_write_data_page(&fio); + + if (quota_inode) + f2fs_up_read(&sbi->node_write); + + goto done; + } + + need_balance_fs = true; + err = -EAGAIN; + if (f2fs_has_inline_data(inode)) { + err = f2fs_write_inline_data(inode, folio); + if (!err) + goto out; } - /* Dentry blocks are controlled by checkpoint */ - if (S_ISDIR(inode->i_mode)) { - dec_page_count(sbi, F2FS_DIRTY_DENTS); - inode_dec_dirty_dents(inode); - err = do_write_data_page(page); + if (err == -EAGAIN) { + err = f2fs_do_write_data_page(&fio); + if (err == -EAGAIN) { + f2fs_bug_on(sbi, compr_blocks); + fio.need_lock = LOCK_REQ; + err = f2fs_do_write_data_page(&fio); + } + } + + if (err) { + file_set_keep_isize(inode); } else { - int ilock = mutex_lock_op(sbi); - err = do_write_data_page(page); - mutex_unlock_op(sbi, ilock); - need_balance_fs = true; + spin_lock(&F2FS_I(inode)->i_size_lock); + if (F2FS_I(inode)->last_disk_size < psize) + F2FS_I(inode)->last_disk_size = psize; + spin_unlock(&F2FS_I(inode)->i_size_lock); } - if (err == -ENOENT) - goto out; - else if (err) - goto redirty_out; - if (wbc->for_reclaim) - f2fs_submit_bio(sbi, 
DATA, true); +done: + if (err && err != -ENOENT) + goto redirty_out; - clear_cold_data(page); out: - unlock_page(page); - if (need_balance_fs) - f2fs_balance_fs(sbi); + inode_dec_dirty_pages(inode); + if (err) { + folio_clear_uptodate(folio); + folio_clear_f2fs_gcing(folio); + } + folio_unlock(folio); + if (!S_ISDIR(inode->i_mode) && !IS_NOQUOTA(inode) && + !F2FS_I(inode)->wb_task && allow_balance) + f2fs_balance_fs(sbi, need_balance_fs); + + if (unlikely(f2fs_cp_error(sbi))) { + f2fs_submit_merged_write(sbi, DATA); + if (bio && *bio) + f2fs_submit_merged_ipu_write(sbi, bio, NULL); + submitted = NULL; + } + + if (submitted) + *submitted = fio.submitted; + return 0; redirty_out: - wbc->pages_skipped++; - set_page_dirty(page); + folio_redirty_for_writepage(wbc, folio); + /* + * pageout() in MM translates EAGAIN, so calls handle_write_error() + * -> mapping_set_error() -> set_bit(AS_EIO, ...). + * file_write_and_wait_range() will see EIO error, which is critical + * to return value of fsync() followed by atomic_write failure to user. + */ + folio_unlock(folio); + if (!err) + return 1; return err; } -#define MAX_DESIRED_PAGES_WP 4096 - -static int __f2fs_writepage(struct page *page, struct writeback_control *wbc, - void *data) +/* + * This function was copied from write_cache_pages from mm/page-writeback.c. + * The major change is making write step of cold data page separately from + * warm/hot data page. 
+ */ +static int f2fs_write_cache_pages(struct address_space *mapping, + struct writeback_control *wbc, + enum iostat_type io_type) { - struct address_space *mapping = data; - int ret = mapping->a_ops->writepage(page, wbc); - mapping_set_error(mapping, ret); + int ret = 0; + int done = 0, retry = 0; + struct page *pages_local[F2FS_ONSTACK_PAGES]; + struct page **pages = pages_local; + struct folio_batch fbatch; + struct f2fs_sb_info *sbi = F2FS_M_SB(mapping); + struct bio *bio = NULL; + sector_t last_block; +#ifdef CONFIG_F2FS_FS_COMPRESSION + struct inode *inode = mapping->host; + struct compress_ctx cc = { + .inode = inode, + .log_cluster_size = F2FS_I(inode)->i_log_cluster_size, + .cluster_size = F2FS_I(inode)->i_cluster_size, + .cluster_idx = NULL_CLUSTER, + .rpages = NULL, + .nr_rpages = 0, + .cpages = NULL, + .valid_nr_cpages = 0, + .rbuf = NULL, + .cbuf = NULL, + .rlen = PAGE_SIZE * F2FS_I(inode)->i_cluster_size, + .private = NULL, + }; +#endif + int nr_folios, p, idx; + int nr_pages; + unsigned int max_pages = F2FS_ONSTACK_PAGES; + pgoff_t index; + pgoff_t end; /* Inclusive */ + pgoff_t done_index; + int range_whole = 0; + xa_mark_t tag; + int nwritten = 0; + int submitted = 0; + int i; + +#ifdef CONFIG_F2FS_FS_COMPRESSION + if (f2fs_compressed_file(inode) && + 1 << cc.log_cluster_size > F2FS_ONSTACK_PAGES) { + pages = f2fs_kzalloc(sbi, sizeof(struct page *) << + cc.log_cluster_size, GFP_NOFS | __GFP_NOFAIL); + max_pages = 1 << cc.log_cluster_size; + } +#endif + + folio_batch_init(&fbatch); + + if (get_dirty_pages(mapping->host) <= + SM_I(F2FS_M_SB(mapping))->min_hot_blocks) + set_inode_flag(mapping->host, FI_HOT_DATA); + else + clear_inode_flag(mapping->host, FI_HOT_DATA); + + if (wbc->range_cyclic) { + index = mapping->writeback_index; /* prev offset */ + end = -1; + } else { + index = wbc->range_start >> PAGE_SHIFT; + end = wbc->range_end >> PAGE_SHIFT; + if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) + range_whole = 1; + } + tag = 
wbc_to_tag(wbc); +retry: + retry = 0; + if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) + tag_pages_for_writeback(mapping, index, end); + done_index = index; + while (!done && !retry && (index <= end)) { + nr_pages = 0; +again: + nr_folios = filemap_get_folios_tag(mapping, &index, end, + tag, &fbatch); + if (nr_folios == 0) { + if (nr_pages) + goto write; + break; + } + + for (i = 0; i < nr_folios; i++) { + struct folio *folio = fbatch.folios[i]; + + idx = 0; + p = folio_nr_pages(folio); +add_more: + pages[nr_pages] = folio_page(folio, idx); + folio_get(folio); + if (++nr_pages == max_pages) { + index = folio->index + idx + 1; + folio_batch_release(&fbatch); + goto write; + } + if (++idx < p) + goto add_more; + } + folio_batch_release(&fbatch); + goto again; +write: + for (i = 0; i < nr_pages; i++) { + struct page *page = pages[i]; + struct folio *folio = page_folio(page); + bool need_readd; +readd: + need_readd = false; +#ifdef CONFIG_F2FS_FS_COMPRESSION + if (f2fs_compressed_file(inode)) { + void *fsdata = NULL; + struct page *pagep; + int ret2; + + ret = f2fs_init_compress_ctx(&cc); + if (ret) { + done = 1; + break; + } + + if (!f2fs_cluster_can_merge_page(&cc, + folio->index)) { + ret = f2fs_write_multi_pages(&cc, + &submitted, wbc, io_type); + if (!ret) + need_readd = true; + goto result; + } + + if (unlikely(f2fs_cp_error(sbi))) + goto lock_folio; + + if (!f2fs_cluster_is_empty(&cc)) + goto lock_folio; + + if (f2fs_all_cluster_page_ready(&cc, + pages, i, nr_pages, true)) + goto lock_folio; + + ret2 = f2fs_prepare_compress_overwrite( + inode, &pagep, + folio->index, &fsdata); + if (ret2 < 0) { + ret = ret2; + done = 1; + break; + } else if (ret2 && + (!f2fs_compress_write_end(inode, + fsdata, folio->index, 1) || + !f2fs_all_cluster_page_ready(&cc, + pages, i, nr_pages, + false))) { + retry = 1; + break; + } + } +#endif + /* give a priority to WB_SYNC threads */ + if (atomic_read(&sbi->wb_sync_req[DATA]) && + wbc->sync_mode == WB_SYNC_NONE) { + 
done = 1; + break; + } +#ifdef CONFIG_F2FS_FS_COMPRESSION +lock_folio: +#endif + done_index = folio->index; +retry_write: + folio_lock(folio); + + if (unlikely(folio->mapping != mapping)) { +continue_unlock: + folio_unlock(folio); + continue; + } + + if (!folio_test_dirty(folio)) { + /* someone wrote it for us */ + goto continue_unlock; + } + + if (folio_test_writeback(folio)) { + if (wbc->sync_mode == WB_SYNC_NONE) + goto continue_unlock; + f2fs_folio_wait_writeback(folio, DATA, true, true); + } + + if (!folio_clear_dirty_for_io(folio)) + goto continue_unlock; + +#ifdef CONFIG_F2FS_FS_COMPRESSION + if (f2fs_compressed_file(inode)) { + folio_get(folio); + f2fs_compress_ctx_add_page(&cc, folio); + continue; + } +#endif + submitted = 0; + ret = f2fs_write_single_data_page(folio, + &submitted, &bio, &last_block, + wbc, io_type, 0, true); +#ifdef CONFIG_F2FS_FS_COMPRESSION +result: +#endif + nwritten += submitted; + wbc->nr_to_write -= submitted; + + if (unlikely(ret)) { + /* + * keep nr_to_write, since vfs uses this to + * get # of written pages. 
+ */ + if (ret == 1) { + ret = 0; + goto next; + } else if (ret == -EAGAIN) { + ret = 0; + if (wbc->sync_mode == WB_SYNC_ALL) { + f2fs_schedule_timeout( + DEFAULT_SCHEDULE_TIMEOUT); + goto retry_write; + } + goto next; + } + done_index = folio_next_index(folio); + done = 1; + break; + } + + if (wbc->nr_to_write <= 0 && + wbc->sync_mode == WB_SYNC_NONE) { + done = 1; + break; + } +next: + if (need_readd) + goto readd; + } + release_pages(pages, nr_pages); + cond_resched(); + } +#ifdef CONFIG_F2FS_FS_COMPRESSION + /* flush remained pages in compress cluster */ + if (f2fs_compressed_file(inode) && !f2fs_cluster_is_empty(&cc)) { + ret = f2fs_write_multi_pages(&cc, &submitted, wbc, io_type); + nwritten += submitted; + wbc->nr_to_write -= submitted; + if (ret) { + done = 1; + retry = 0; + } + } + if (f2fs_compressed_file(inode)) + f2fs_destroy_compress_ctx(&cc, false); +#endif + if (retry) { + index = 0; + end = -1; + goto retry; + } + if (wbc->range_cyclic && !done) + done_index = 0; + if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) + mapping->writeback_index = done_index; + + if (nwritten) + f2fs_submit_merged_write_cond(F2FS_M_SB(mapping), mapping->host, + NULL, 0, DATA); + /* submit cached bio of IPU write */ + if (bio) + f2fs_submit_merged_ipu_write(sbi, &bio, NULL); + +#ifdef CONFIG_F2FS_FS_COMPRESSION + if (pages != pages_local) + kfree(pages); +#endif + return ret; } -static int f2fs_write_data_pages(struct address_space *mapping, - struct writeback_control *wbc) +static inline bool __should_serialize_io(struct inode *inode, + struct writeback_control *wbc) +{ + /* to avoid deadlock in path of data flush */ + if (F2FS_I(inode)->wb_task) + return false; + + if (!S_ISREG(inode->i_mode)) + return false; + if (IS_NOQUOTA(inode)) + return false; + + if (f2fs_need_compress_data(inode)) + return true; + if (wbc->sync_mode != WB_SYNC_ALL) + return true; + if (get_dirty_pages(inode) >= SM_I(F2FS_I_SB(inode))->min_seq_blocks) + return true; + return false; 
+} + +static inline void account_writeback(struct inode *inode, bool inc) +{ + if (!f2fs_sb_has_compression(F2FS_I_SB(inode))) + return; + + f2fs_down_read(&F2FS_I(inode)->i_sem); + if (inc) + atomic_inc(&F2FS_I(inode)->writeback); + else + atomic_dec(&F2FS_I(inode)->writeback); + f2fs_up_read(&F2FS_I(inode)->i_sem); +} + +static int __f2fs_write_data_pages(struct address_space *mapping, + struct writeback_control *wbc, + enum iostat_type io_type) { struct inode *inode = mapping->host; - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); - bool locked = false; + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct blk_plug plug; int ret; - long excess_nrtw = 0, desired_nrtw; + bool locked = false; - /* deal with chardevs and other special file */ - if (!mapping->a_ops->writepage) + /* skip writing if there is no dirty page in this inode */ + if (!get_dirty_pages(inode) && wbc->sync_mode == WB_SYNC_NONE) return 0; - if (wbc->nr_to_write < MAX_DESIRED_PAGES_WP) { - desired_nrtw = MAX_DESIRED_PAGES_WP; - excess_nrtw = desired_nrtw - wbc->nr_to_write; - wbc->nr_to_write = desired_nrtw; + /* during POR, we don't need to trigger writepage at all. 
*/ + if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) + goto skip_write; + + if ((S_ISDIR(inode->i_mode) || IS_NOQUOTA(inode)) && + wbc->sync_mode == WB_SYNC_NONE && + get_dirty_pages(inode) < nr_pages_to_skip(sbi, DATA) && + f2fs_available_free_memory(sbi, DIRTY_DENTS)) + goto skip_write; + + /* skip writing in file defragment preparing stage */ + if (is_inode_flag_set(inode, FI_SKIP_WRITES)) + goto skip_write; + + trace_f2fs_writepages(mapping->host, wbc, DATA); + + /* to avoid spliting IOs due to mixed WB_SYNC_ALL and WB_SYNC_NONE */ + if (wbc->sync_mode == WB_SYNC_ALL) + atomic_inc(&sbi->wb_sync_req[DATA]); + else if (atomic_read(&sbi->wb_sync_req[DATA])) { + /* to avoid potential deadlock */ + if (current->plug) + blk_finish_plug(current->plug); + goto skip_write; } - if (!S_ISDIR(inode->i_mode)) { + if (__should_serialize_io(inode, wbc)) { mutex_lock(&sbi->writepages); locked = true; } - ret = write_cache_pages(mapping, wbc, __f2fs_writepage, mapping); + + account_writeback(inode, true); + + blk_start_plug(&plug); + ret = f2fs_write_cache_pages(mapping, wbc, io_type); + blk_finish_plug(&plug); + + account_writeback(inode, false); + if (locked) mutex_unlock(&sbi->writepages); - f2fs_submit_bio(sbi, DATA, (wbc->sync_mode == WB_SYNC_ALL)); - remove_dirty_dir_inode(inode); + if (wbc->sync_mode == WB_SYNC_ALL) + atomic_dec(&sbi->wb_sync_req[DATA]); + /* + * if some pages were truncated, we cannot guarantee its mapping->host + * to detect pending bios. 
+ */ - wbc->nr_to_write -= excess_nrtw; + f2fs_remove_dirty_inode(inode); return ret; + +skip_write: + wbc->pages_skipped += get_dirty_pages(inode); + trace_f2fs_writepages(mapping->host, wbc, DATA); + return 0; } -static int f2fs_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned flags, - struct page **pagep, void **fsdata) +static int f2fs_write_data_pages(struct address_space *mapping, + struct writeback_control *wbc) { struct inode *inode = mapping->host; - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); - struct page *page; - pgoff_t index = ((unsigned long long) pos) >> PAGE_CACHE_SHIFT; + + return __f2fs_write_data_pages(mapping, wbc, + F2FS_I(inode)->cp_task == current ? + FS_CP_DATA_IO : FS_DATA_IO); +} + +void f2fs_write_failed(struct inode *inode, loff_t to) +{ + loff_t i_size = i_size_read(inode); + + if (IS_NOQUOTA(inode)) + return; + + /* In the fs-verity case, f2fs_end_enable_verity() does the truncate */ + if (to > i_size && !f2fs_verity_in_progress(inode)) { + f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + filemap_invalidate_lock(inode->i_mapping); + + truncate_pagecache(inode, i_size); + f2fs_truncate_blocks(inode, i_size, true); + + filemap_invalidate_unlock(inode->i_mapping); + f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + } +} + +static int prepare_write_begin(struct f2fs_sb_info *sbi, + struct folio *folio, loff_t pos, unsigned int len, + block_t *blk_addr, bool *node_changed) +{ + struct inode *inode = folio->mapping->host; + pgoff_t index = folio->index; struct dnode_of_data dn; + struct folio *ifolio; + bool locked = false; + int flag = F2FS_GET_BLOCK_PRE_AIO; int err = 0; - int ilock; - /* for nobh_write_end */ - *fsdata = NULL; + /* + * If a whole page is being written and we already preallocated all the + * blocks, then there is no need to get a block address now. 
+ */ + if (len == PAGE_SIZE && is_inode_flag_set(inode, FI_PREALLOCATED_ALL)) + return 0; - f2fs_balance_fs(sbi); -repeat: - page = grab_cache_page_write_begin(mapping, index, flags); - if (!page) - return -ENOMEM; - *pagep = page; + /* f2fs_lock_op avoids race between write CP and convert_inline_page */ + if (f2fs_has_inline_data(inode)) { + if (pos + len > MAX_INLINE_DATA(inode)) + flag = F2FS_GET_BLOCK_DEFAULT; + f2fs_map_lock(sbi, flag); + locked = true; + } else if ((pos & PAGE_MASK) >= i_size_read(inode)) { + f2fs_map_lock(sbi, flag); + locked = true; + } - ilock = mutex_lock_op(sbi); +restart: + /* check inline_data */ + ifolio = f2fs_get_inode_folio(sbi, inode->i_ino); + if (IS_ERR(ifolio)) { + err = PTR_ERR(ifolio); + goto unlock_out; + } - set_new_dnode(&dn, inode, NULL, NULL, 0); - err = get_dnode_of_data(&dn, index, ALLOC_NODE); - if (err) - goto err; + set_new_dnode(&dn, inode, ifolio, ifolio, 0); + + if (f2fs_has_inline_data(inode)) { + if (pos + len <= MAX_INLINE_DATA(inode)) { + f2fs_do_read_inline_data(folio, ifolio); + set_inode_flag(inode, FI_DATA_EXIST); + if (inode->i_nlink) + folio_set_f2fs_inline(ifolio); + goto out; + } + err = f2fs_convert_inline_folio(&dn, folio); + if (err || dn.data_blkaddr != NULL_ADDR) + goto out; + } + + if (!f2fs_lookup_read_extent_cache_block(inode, index, + &dn.data_blkaddr)) { + if (IS_DEVICE_ALIASING(inode)) { + err = -ENODATA; + goto out; + } + + if (locked) { + err = f2fs_reserve_block(&dn, index); + goto out; + } + + /* hole case */ + err = f2fs_get_dnode_of_data(&dn, index, LOOKUP_NODE); + if (!err && dn.data_blkaddr != NULL_ADDR) + goto out; + f2fs_put_dnode(&dn); + f2fs_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO); + WARN_ON(flag != F2FS_GET_BLOCK_PRE_AIO); + locked = true; + goto restart; + } +out: + if (!err) { + /* convert_inline_page can make node_changed */ + *blk_addr = dn.data_blkaddr; + *node_changed = dn.node_changed; + } + f2fs_put_dnode(&dn); +unlock_out: + if (locked) + f2fs_map_unlock(sbi, flag); + 
return err; +} + +static int __find_data_block(struct inode *inode, pgoff_t index, + block_t *blk_addr) +{ + struct dnode_of_data dn; + struct folio *ifolio; + int err = 0; - if (dn.data_blkaddr == NULL_ADDR) - err = reserve_new_block(&dn); + ifolio = f2fs_get_inode_folio(F2FS_I_SB(inode), inode->i_ino); + if (IS_ERR(ifolio)) + return PTR_ERR(ifolio); + set_new_dnode(&dn, inode, ifolio, ifolio, 0); + + if (!f2fs_lookup_read_extent_cache_block(inode, index, + &dn.data_blkaddr)) { + /* hole case */ + err = f2fs_get_dnode_of_data(&dn, index, LOOKUP_NODE); + if (err) { + dn.data_blkaddr = NULL_ADDR; + err = 0; + } + } + *blk_addr = dn.data_blkaddr; f2fs_put_dnode(&dn); - if (err) - goto err; + return err; +} - mutex_unlock_op(sbi, ilock); +static int __reserve_data_block(struct inode *inode, pgoff_t index, + block_t *blk_addr, bool *node_changed) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct dnode_of_data dn; + struct folio *ifolio; + int err = 0; + + f2fs_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO); + + ifolio = f2fs_get_inode_folio(sbi, inode->i_ino); + if (IS_ERR(ifolio)) { + err = PTR_ERR(ifolio); + goto unlock_out; + } + set_new_dnode(&dn, inode, ifolio, ifolio, 0); - if ((len == PAGE_CACHE_SIZE) || PageUptodate(page)) + if (!f2fs_lookup_read_extent_cache_block(dn.inode, index, + &dn.data_blkaddr)) + err = f2fs_reserve_block(&dn, index); + + *blk_addr = dn.data_blkaddr; + *node_changed = dn.node_changed; + f2fs_put_dnode(&dn); + +unlock_out: + f2fs_map_unlock(sbi, F2FS_GET_BLOCK_PRE_AIO); + return err; +} + +static int prepare_atomic_write_begin(struct f2fs_sb_info *sbi, + struct folio *folio, loff_t pos, unsigned int len, + block_t *blk_addr, bool *node_changed, bool *use_cow) +{ + struct inode *inode = folio->mapping->host; + struct inode *cow_inode = F2FS_I(inode)->cow_inode; + pgoff_t index = folio->index; + int err = 0; + block_t ori_blk_addr = NULL_ADDR; + + /* If pos is beyond the end of file, reserve a new block in COW inode */ + if ((pos & 
PAGE_MASK) >= i_size_read(inode)) + goto reserve_block; + + /* Look for the block in COW inode first */ + err = __find_data_block(cow_inode, index, blk_addr); + if (err) { + return err; + } else if (*blk_addr != NULL_ADDR) { + *use_cow = true; return 0; + } - if ((pos & PAGE_CACHE_MASK) >= i_size_read(inode)) { - unsigned start = pos & (PAGE_CACHE_SIZE - 1); - unsigned end = start + len; + if (is_inode_flag_set(inode, FI_ATOMIC_REPLACE)) + goto reserve_block; - /* Reading beyond i_size is simple: memset to zero */ - zero_user_segments(page, 0, start, end, PAGE_CACHE_SIZE); - goto out; + /* Look for the block in the original inode */ + err = __find_data_block(inode, index, &ori_blk_addr); + if (err) + return err; + +reserve_block: + /* Finally, we should reserve a new block in COW inode for the update */ + err = __reserve_data_block(cow_inode, index, blk_addr, node_changed); + if (err) + return err; + inc_atomic_write_cnt(inode); + + if (ori_blk_addr != NULL_ADDR) + *blk_addr = ori_blk_addr; + return 0; +} + +static int f2fs_write_begin(const struct kiocb *iocb, + struct address_space *mapping, + loff_t pos, unsigned len, struct folio **foliop, + void **fsdata) +{ + struct inode *inode = mapping->host; + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct folio *folio; + pgoff_t index = pos >> PAGE_SHIFT; + bool need_balance = false; + bool use_cow = false; + block_t blkaddr = NULL_ADDR; + int err = 0; + + trace_f2fs_write_begin(inode, pos, len); + + if (!f2fs_is_checkpoint_ready(sbi)) { + err = -ENOSPC; + goto fail; } - if (dn.data_blkaddr == NEW_ADDR) { - zero_user_segment(page, 0, PAGE_CACHE_SIZE); - } else { - err = f2fs_readpage(sbi, page, dn.data_blkaddr, READ_SYNC); + /* + * We should check this at this moment to avoid deadlock on inode page + * and #0 page. 
The locking rule for inline_data conversion should be: + * folio_lock(folio #0) -> folio_lock(inode_page) + */ + if (index != 0) { + err = f2fs_convert_inline_inode(inode); if (err) - return err; - lock_page(page); - if (!PageUptodate(page)) { - f2fs_put_page(page, 1); - return -EIO; + goto fail; + } + +#ifdef CONFIG_F2FS_FS_COMPRESSION + if (f2fs_compressed_file(inode)) { + int ret; + struct page *page; + + *fsdata = NULL; + + if (len == PAGE_SIZE && !(f2fs_is_atomic_file(inode))) + goto repeat; + + ret = f2fs_prepare_compress_overwrite(inode, &page, + index, fsdata); + if (ret < 0) { + err = ret; + goto fail; + } else if (ret) { + *foliop = page_folio(page); + return 0; } - if (page->mapping != mapping) { - f2fs_put_page(page, 1); + } +#endif + +repeat: + /* + * Do not use FGP_STABLE to avoid deadlock. + * Will wait that below with our IO control. + */ + folio = f2fs_filemap_get_folio(mapping, index, + FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_NOFS, + mapping_gfp_mask(mapping)); + if (IS_ERR(folio)) { + err = PTR_ERR(folio); + goto fail; + } + + /* TODO: cluster can be compressed due to race with .writepage */ + + *foliop = folio; + + if (f2fs_is_atomic_file(inode)) + err = prepare_atomic_write_begin(sbi, folio, pos, len, + &blkaddr, &need_balance, &use_cow); + else + err = prepare_write_begin(sbi, folio, pos, len, + &blkaddr, &need_balance); + if (err) + goto put_folio; + + if (need_balance && !IS_NOQUOTA(inode) && + has_not_enough_free_secs(sbi, 0, 0)) { + folio_unlock(folio); + f2fs_balance_fs(sbi, true); + folio_lock(folio); + if (folio->mapping != mapping) { + /* The folio got truncated from under us */ + folio_unlock(folio); + folio_put(folio); goto repeat; } } -out: - SetPageUptodate(page); - clear_cold_data(page); + + f2fs_folio_wait_writeback(folio, DATA, false, true); + + if (len == folio_size(folio) || folio_test_uptodate(folio)) + return 0; + + if (!(pos & (PAGE_SIZE - 1)) && (pos + len) >= i_size_read(inode) && + !f2fs_verity_in_progress(inode)) { + 
folio_zero_segment(folio, len, folio_size(folio)); + return 0; + } + + if (blkaddr == NEW_ADDR) { + folio_zero_segment(folio, 0, folio_size(folio)); + folio_mark_uptodate(folio); + } else { + if (!f2fs_is_valid_blkaddr(sbi, blkaddr, + DATA_GENERIC_ENHANCE_READ)) { + err = -EFSCORRUPTED; + goto put_folio; + } + f2fs_submit_page_read(use_cow ? + F2FS_I(inode)->cow_inode : inode, + folio, blkaddr, 0, true); + + folio_lock(folio); + if (unlikely(folio->mapping != mapping)) { + folio_unlock(folio); + folio_put(folio); + goto repeat; + } + if (unlikely(!folio_test_uptodate(folio))) { + err = -EIO; + goto put_folio; + } + } return 0; -err: - mutex_unlock_op(sbi, ilock); - f2fs_put_page(page, 1); +put_folio: + f2fs_folio_put(folio, true); +fail: + f2fs_write_failed(inode, pos + len); return err; } -static int f2fs_write_end(struct file *file, +static int f2fs_write_end(const struct kiocb *iocb, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) + struct folio *folio, void *fsdata) { - struct inode *inode = page->mapping->host; + struct inode *inode = folio->mapping->host; + + trace_f2fs_write_end(inode, pos, len, copied); + + /* + * This should be come from len == PAGE_SIZE, and we expect copied + * should be PAGE_SIZE. Otherwise, we treat it with zero copied and + * let generic_perform_write() try to copy data again through copied=0. 
+ */ + if (!folio_test_uptodate(folio)) { + if (unlikely(copied != len)) + copied = 0; + else + folio_mark_uptodate(folio); + } - SetPageUptodate(page); - set_page_dirty(page); +#ifdef CONFIG_F2FS_FS_COMPRESSION + /* overwrite compressed file */ + if (f2fs_compressed_file(inode) && fsdata) { + f2fs_compress_write_end(inode, fsdata, folio->index, copied); + f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); - if (pos + copied > i_size_read(inode)) { - i_size_write(inode, pos + copied); - mark_inode_dirty(inode); - update_inode_page(inode); + if (pos + copied > i_size_read(inode) && + !f2fs_verity_in_progress(inode)) + f2fs_i_size_write(inode, pos + copied); + return copied; } +#endif + + if (!copied) + goto unlock_out; + + folio_mark_dirty(folio); - unlock_page(page); - page_cache_release(page); + if (f2fs_is_atomic_file(inode)) + folio_set_f2fs_atomic(folio); + + if (pos + copied > i_size_read(inode) && + !f2fs_verity_in_progress(inode)) { + f2fs_i_size_write(inode, pos + copied); + if (f2fs_is_atomic_file(inode)) + f2fs_i_size_write(F2FS_I(inode)->cow_inode, + pos + copied); + } +unlock_out: + f2fs_folio_put(folio, true); + f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); return copied; } -static ssize_t f2fs_direct_IO(int rw, struct kiocb *iocb, - const struct iovec *iov, loff_t offset, unsigned long nr_segs) +void f2fs_invalidate_folio(struct folio *folio, size_t offset, size_t length) +{ + struct inode *inode = folio->mapping->host; + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + + if (inode->i_ino >= F2FS_ROOT_INO(sbi) && + (offset || length != folio_size(folio))) + return; + + if (folio_test_dirty(folio)) { + if (inode->i_ino == F2FS_META_INO(sbi)) { + dec_page_count(sbi, F2FS_DIRTY_META); + } else if (inode->i_ino == F2FS_NODE_INO(sbi)) { + dec_page_count(sbi, F2FS_DIRTY_NODES); + } else { + inode_dec_dirty_pages(inode); + f2fs_remove_dirty_inode(inode); + } + } + folio_detach_private(folio); +} + +bool f2fs_release_folio(struct folio *folio, gfp_t wait) +{ + /* If 
this is dirty folio, keep private data */ + if (folio_test_dirty(folio)) + return false; + + folio_detach_private(folio); + return true; +} + +static bool f2fs_dirty_data_folio(struct address_space *mapping, + struct folio *folio) +{ + struct inode *inode = mapping->host; + + trace_f2fs_set_page_dirty(folio, DATA); + + if (!folio_test_uptodate(folio)) + folio_mark_uptodate(folio); + BUG_ON(folio_test_swapcache(folio)); + + if (filemap_dirty_folio(mapping, folio)) { + f2fs_update_dirty_folio(inode, folio); + return true; + } + return false; +} + + +static sector_t f2fs_bmap_compress(struct inode *inode, sector_t block) { - struct file *file = iocb->ki_filp; - struct inode *inode = file->f_mapping->host; +#ifdef CONFIG_F2FS_FS_COMPRESSION + struct dnode_of_data dn; + sector_t start_idx, blknr = 0; + int ret; + + start_idx = round_down(block, F2FS_I(inode)->i_cluster_size); - if (rw == WRITE) + set_new_dnode(&dn, inode, NULL, NULL, 0); + ret = f2fs_get_dnode_of_data(&dn, start_idx, LOOKUP_NODE); + if (ret) return 0; - /* Needs synchronization with the cleaner */ - return blockdev_direct_IO(rw, iocb, inode, iov, offset, nr_segs, - get_data_block_ro); + if (dn.data_blkaddr != COMPRESS_ADDR) { + dn.ofs_in_node += block - start_idx; + blknr = f2fs_data_blkaddr(&dn); + if (!__is_valid_data_blkaddr(blknr)) + blknr = 0; + } + + f2fs_put_dnode(&dn); + return blknr; +#else + return 0; +#endif } -static void f2fs_invalidate_data_page(struct page *page, unsigned int offset, - unsigned int length) + +static sector_t f2fs_bmap(struct address_space *mapping, sector_t block) { - struct inode *inode = page->mapping->host; - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); - if (S_ISDIR(inode->i_mode) && PageDirty(page)) { - dec_page_count(sbi, F2FS_DIRTY_DENTS); - inode_dec_dirty_dents(inode); + struct inode *inode = mapping->host; + sector_t blknr = 0; + + if (f2fs_has_inline_data(inode)) + goto out; + + /* make sure allocating whole blocks */ + if (mapping_tagged(mapping, 
PAGECACHE_TAG_DIRTY)) + filemap_write_and_wait(mapping); + + /* Block number less than F2FS MAX BLOCKS */ + if (unlikely(block >= max_file_blocks(inode))) + goto out; + + if (f2fs_compressed_file(inode)) { + blknr = f2fs_bmap_compress(inode, block); + } else { + struct f2fs_map_blocks map; + + memset(&map, 0, sizeof(map)); + map.m_lblk = block; + map.m_len = 1; + map.m_next_pgofs = NULL; + map.m_seg_type = NO_CHECK_TYPE; + + if (!f2fs_map_blocks(inode, &map, F2FS_GET_BLOCK_BMAP)) + blknr = map.m_pblk; } - ClearPagePrivate(page); +out: + trace_f2fs_bmap(inode, block, blknr); + return blknr; } -static int f2fs_release_data_page(struct page *page, gfp_t wait) +#ifdef CONFIG_SWAP +static int f2fs_migrate_blocks(struct inode *inode, block_t start_blk, + unsigned int blkcnt) { - ClearPagePrivate(page); - return 1; + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + unsigned int blkofs; + unsigned int blk_per_sec = BLKS_PER_SEC(sbi); + unsigned int end_blk = start_blk + blkcnt - 1; + unsigned int secidx = start_blk / blk_per_sec; + unsigned int end_sec; + int ret = 0; + + if (!blkcnt) + return 0; + end_sec = end_blk / blk_per_sec; + + f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + filemap_invalidate_lock(inode->i_mapping); + + set_inode_flag(inode, FI_ALIGNED_WRITE); + set_inode_flag(inode, FI_OPU_WRITE); + + for (; secidx <= end_sec; secidx++) { + unsigned int blkofs_end = secidx == end_sec ? 
+ end_blk % blk_per_sec : blk_per_sec - 1; + + f2fs_down_write(&sbi->pin_sem); + + ret = f2fs_allocate_pinning_section(sbi); + if (ret) { + f2fs_up_write(&sbi->pin_sem); + break; + } + + set_inode_flag(inode, FI_SKIP_WRITES); + + for (blkofs = 0; blkofs <= blkofs_end; blkofs++) { + struct folio *folio; + unsigned int blkidx = secidx * blk_per_sec + blkofs; + + folio = f2fs_get_lock_data_folio(inode, blkidx, true); + if (IS_ERR(folio)) { + f2fs_up_write(&sbi->pin_sem); + ret = PTR_ERR(folio); + goto done; + } + + folio_mark_dirty(folio); + f2fs_folio_put(folio, true); + } + + clear_inode_flag(inode, FI_SKIP_WRITES); + + ret = filemap_fdatawrite(inode->i_mapping); + + f2fs_up_write(&sbi->pin_sem); + + if (ret) + break; + } + +done: + clear_inode_flag(inode, FI_SKIP_WRITES); + clear_inode_flag(inode, FI_OPU_WRITE); + clear_inode_flag(inode, FI_ALIGNED_WRITE); + + filemap_invalidate_unlock(inode->i_mapping); + f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + + return ret; } -static int f2fs_set_data_page_dirty(struct page *page) +static int check_swap_activate(struct swap_info_struct *sis, + struct file *swap_file, sector_t *span) { - struct address_space *mapping = page->mapping; + struct address_space *mapping = swap_file->f_mapping; struct inode *inode = mapping->host; + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + block_t cur_lblock; + block_t last_lblock; + block_t pblock; + block_t lowest_pblock = -1; + block_t highest_pblock = 0; + int nr_extents = 0; + unsigned int nr_pblocks; + unsigned int blks_per_sec = BLKS_PER_SEC(sbi); + unsigned int not_aligned = 0; + int ret = 0; - SetPageUptodate(page); - if (!PageDirty(page)) { - __set_page_dirty_nobuffers(page); - set_dirty_dir_page(inode, page); - return 1; + /* + * Map all the blocks into the extent list. This code doesn't try + * to be very smart. 
+ */ + cur_lblock = 0; + last_lblock = F2FS_BYTES_TO_BLK(i_size_read(inode)); + + while (cur_lblock < last_lblock && cur_lblock < sis->max) { + struct f2fs_map_blocks map; +retry: + cond_resched(); + + memset(&map, 0, sizeof(map)); + map.m_lblk = cur_lblock; + map.m_len = last_lblock - cur_lblock; + map.m_next_pgofs = NULL; + map.m_next_extent = NULL; + map.m_seg_type = NO_CHECK_TYPE; + map.m_may_create = false; + + ret = f2fs_map_blocks(inode, &map, F2FS_GET_BLOCK_FIEMAP); + if (ret) + goto out; + + /* hole */ + if (!(map.m_flags & F2FS_MAP_FLAGS)) { + f2fs_err(sbi, "Swapfile has holes"); + ret = -EINVAL; + goto out; + } + + pblock = map.m_pblk; + nr_pblocks = map.m_len; + + if ((pblock - SM_I(sbi)->main_blkaddr) % blks_per_sec || + nr_pblocks % blks_per_sec || + f2fs_is_sequential_zone_area(sbi, pblock)) { + bool last_extent = false; + + not_aligned++; + + nr_pblocks = roundup(nr_pblocks, blks_per_sec); + if (cur_lblock + nr_pblocks > sis->max) + nr_pblocks -= blks_per_sec; + + /* this extent is last one */ + if (!nr_pblocks) { + nr_pblocks = last_lblock - cur_lblock; + last_extent = true; + } + + ret = f2fs_migrate_blocks(inode, cur_lblock, + nr_pblocks); + if (ret) { + if (ret == -ENOENT) + ret = -EINVAL; + goto out; + } + + if (!last_extent) + goto retry; + } + + if (cur_lblock + nr_pblocks >= sis->max) + nr_pblocks = sis->max - cur_lblock; + + if (cur_lblock) { /* exclude the header page */ + if (pblock < lowest_pblock) + lowest_pblock = pblock; + if (pblock + nr_pblocks - 1 > highest_pblock) + highest_pblock = pblock + nr_pblocks - 1; + } + + /* + * We found a PAGE_SIZE-length, PAGE_SIZE-aligned run of blocks + */ + ret = add_swap_extent(sis, cur_lblock, nr_pblocks, pblock); + if (ret < 0) + goto out; + nr_extents += ret; + cur_lblock += nr_pblocks; } - return 0; + ret = nr_extents; + *span = 1 + highest_pblock - lowest_pblock; + if (cur_lblock == 0) + cur_lblock = 1; /* force Empty message */ + sis->max = cur_lblock; + sis->pages = cur_lblock - 1; +out: + 
if (not_aligned) + f2fs_warn(sbi, "Swapfile (%u) is not align to section: 1) creat(), 2) ioctl(F2FS_IOC_SET_PIN_FILE), 3) fallocate(%lu * N)", + not_aligned, blks_per_sec * F2FS_BLKSIZE); + return ret; } -static sector_t f2fs_bmap(struct address_space *mapping, sector_t block) +static int f2fs_swap_activate(struct swap_info_struct *sis, struct file *file, + sector_t *span) { - return generic_block_bmap(mapping, block, get_data_block_ro); + struct inode *inode = file_inode(file); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + int ret; + + if (!S_ISREG(inode->i_mode)) + return -EINVAL; + + if (f2fs_readonly(sbi->sb)) + return -EROFS; + + if (f2fs_lfs_mode(sbi) && !f2fs_sb_has_blkzoned(sbi)) { + f2fs_err(sbi, "Swapfile not supported in LFS mode"); + return -EINVAL; + } + + ret = f2fs_convert_inline_inode(inode); + if (ret) + return ret; + + if (!f2fs_disable_compressed_file(inode)) + return -EINVAL; + + ret = filemap_fdatawrite(inode->i_mapping); + if (ret < 0) + return ret; + + f2fs_precache_extents(inode); + + ret = check_swap_activate(sis, file, span); + if (ret < 0) + return ret; + + stat_inc_swapfile_inode(inode); + set_inode_flag(inode, FI_PIN_FILE); + f2fs_update_time(sbi, REQ_TIME); + return ret; +} + +static void f2fs_swap_deactivate(struct file *file) +{ + struct inode *inode = file_inode(file); + + stat_dec_swapfile_inode(inode); + clear_inode_flag(inode, FI_PIN_FILE); +} +#else +static int f2fs_swap_activate(struct swap_info_struct *sis, struct file *file, + sector_t *span) +{ + return -EOPNOTSUPP; } +static void f2fs_swap_deactivate(struct file *file) +{ +} +#endif + const struct address_space_operations f2fs_dblock_aops = { - .readpage = f2fs_read_data_page, - .readpages = f2fs_read_data_pages, - .writepage = f2fs_write_data_page, + .read_folio = f2fs_read_data_folio, + .readahead = f2fs_readahead, .writepages = f2fs_write_data_pages, .write_begin = f2fs_write_begin, .write_end = f2fs_write_end, - .set_page_dirty = f2fs_set_data_page_dirty, - 
.invalidatepage = f2fs_invalidate_data_page, - .releasepage = f2fs_release_data_page, - .direct_IO = f2fs_direct_IO, + .dirty_folio = f2fs_dirty_data_folio, + .migrate_folio = filemap_migrate_folio, + .invalidate_folio = f2fs_invalidate_folio, + .release_folio = f2fs_release_folio, .bmap = f2fs_bmap, + .swap_activate = f2fs_swap_activate, + .swap_deactivate = f2fs_swap_deactivate, +}; + +void f2fs_clear_page_cache_dirty_tag(struct folio *folio) +{ + struct address_space *mapping = folio->mapping; + unsigned long flags; + + xa_lock_irqsave(&mapping->i_pages, flags); + __xa_clear_mark(&mapping->i_pages, folio->index, + PAGECACHE_TAG_DIRTY); + xa_unlock_irqrestore(&mapping->i_pages, flags); +} + +int __init f2fs_init_post_read_processing(void) +{ + bio_post_read_ctx_cache = + kmem_cache_create("f2fs_bio_post_read_ctx", + sizeof(struct bio_post_read_ctx), 0, 0, NULL); + if (!bio_post_read_ctx_cache) + goto fail; + bio_post_read_ctx_pool = + mempool_create_slab_pool(NUM_PREALLOC_POST_READ_CTXS, + bio_post_read_ctx_cache); + if (!bio_post_read_ctx_pool) + goto fail_free_cache; + return 0; + +fail_free_cache: + kmem_cache_destroy(bio_post_read_ctx_cache); +fail: + return -ENOMEM; +} + +void f2fs_destroy_post_read_processing(void) +{ + mempool_destroy(bio_post_read_ctx_pool); + kmem_cache_destroy(bio_post_read_ctx_cache); +} + +int f2fs_init_post_read_wq(struct f2fs_sb_info *sbi) +{ + if (!f2fs_sb_has_encrypt(sbi) && + !f2fs_sb_has_verity(sbi) && + !f2fs_sb_has_compression(sbi)) + return 0; + + sbi->post_read_wq = alloc_workqueue("f2fs_post_read_wq", + WQ_UNBOUND | WQ_HIGHPRI, + num_online_cpus()); + return sbi->post_read_wq ? 0 : -ENOMEM; +} + +void f2fs_destroy_post_read_wq(struct f2fs_sb_info *sbi) +{ + if (sbi->post_read_wq) + destroy_workqueue(sbi->post_read_wq); +} + +int __init f2fs_init_bio_entry_cache(void) +{ + bio_entry_slab = f2fs_kmem_cache_create("f2fs_bio_entry_slab", + sizeof(struct bio_entry)); + return bio_entry_slab ? 
0 : -ENOMEM; +} + +void f2fs_destroy_bio_entry_cache(void) +{ + kmem_cache_destroy(bio_entry_slab); +} + +static int f2fs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, + unsigned int flags, struct iomap *iomap, + struct iomap *srcmap) +{ + struct f2fs_map_blocks map = { NULL, }; + pgoff_t next_pgofs = 0; + int err; + + map.m_lblk = F2FS_BYTES_TO_BLK(offset); + map.m_len = F2FS_BYTES_TO_BLK(offset + length - 1) - map.m_lblk + 1; + map.m_next_pgofs = &next_pgofs; + map.m_seg_type = f2fs_rw_hint_to_seg_type(F2FS_I_SB(inode), + inode->i_write_hint); + if (flags & IOMAP_WRITE && iomap->private) { + map.m_last_pblk = (unsigned long)iomap->private; + iomap->private = NULL; + } + + /* + * If the blocks being overwritten are already allocated, + * f2fs_map_lock and f2fs_balance_fs are not necessary. + */ + if ((flags & IOMAP_WRITE) && + !f2fs_overwrite_io(inode, offset, length)) + map.m_may_create = true; + + err = f2fs_map_blocks(inode, &map, F2FS_GET_BLOCK_DIO); + if (err) + return err; + + iomap->offset = F2FS_BLK_TO_BYTES(map.m_lblk); + + /* + * When inline encryption is enabled, sometimes I/O to an encrypted file + * has to be broken up to guarantee DUN contiguity. Handle this by + * limiting the length of the mapping returned. + */ + map.m_len = fscrypt_limit_io_blocks(inode, map.m_lblk, map.m_len); + + /* + * We should never see delalloc or compressed extents here based on + * prior flushing and checks. 
+ */ + if (WARN_ON_ONCE(map.m_pblk == COMPRESS_ADDR)) + return -EINVAL; + + if (map.m_flags & F2FS_MAP_MAPPED) { + if (WARN_ON_ONCE(map.m_pblk == NEW_ADDR)) + return -EINVAL; + + iomap->length = F2FS_BLK_TO_BYTES(map.m_len); + iomap->type = IOMAP_MAPPED; + iomap->flags |= IOMAP_F_MERGED; + iomap->bdev = map.m_bdev; + iomap->addr = F2FS_BLK_TO_BYTES(map.m_pblk); + + if (flags & IOMAP_WRITE && map.m_last_pblk) + iomap->private = (void *)map.m_last_pblk; + } else { + if (flags & IOMAP_WRITE) + return -ENOTBLK; + + if (map.m_pblk == NULL_ADDR) { + iomap->length = F2FS_BLK_TO_BYTES(next_pgofs) - + iomap->offset; + iomap->type = IOMAP_HOLE; + } else if (map.m_pblk == NEW_ADDR) { + iomap->length = F2FS_BLK_TO_BYTES(map.m_len); + iomap->type = IOMAP_UNWRITTEN; + } else { + f2fs_bug_on(F2FS_I_SB(inode), 1); + } + iomap->addr = IOMAP_NULL_ADDR; + } + + if (map.m_flags & F2FS_MAP_NEW) + iomap->flags |= IOMAP_F_NEW; + if ((inode_state_read_once(inode) & I_DIRTY_DATASYNC) || + offset + length > i_size_read(inode)) + iomap->flags |= IOMAP_F_DIRTY; + + return 0; +} + +const struct iomap_ops f2fs_iomap_ops = { + .iomap_begin = f2fs_iomap_begin, }; diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 0d6c6aafb235..032683835569 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * f2fs debugging statistics * @@ -5,10 +6,6 @@ * http://www.samsung.com/ * Copyright (c) 2012 Linux Foundation * Copyright (c) 2012 Greg Kroah-Hartman <gregkh@linuxfoundation.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. 
*/ #include <linux/fs.h> @@ -24,93 +21,276 @@ #include "gc.h" static LIST_HEAD(f2fs_stat_list); -static struct dentry *debugfs_root; -static DEFINE_MUTEX(f2fs_stat_mutex); +static DEFINE_SPINLOCK(f2fs_stat_lock); +#ifdef CONFIG_DEBUG_FS +static struct dentry *f2fs_debugfs_root; +#endif + +/* + * This function calculates BDF of every segments + */ +void f2fs_update_sit_info(struct f2fs_sb_info *sbi) +{ + struct f2fs_stat_info *si = F2FS_STAT(sbi); + unsigned long long blks_per_sec, hblks_per_sec, total_vblocks; + unsigned long long bimodal, dist; + unsigned int segno, vblocks; + int ndirty = 0; + + bimodal = 0; + total_vblocks = 0; + blks_per_sec = CAP_BLKS_PER_SEC(sbi); + hblks_per_sec = blks_per_sec / 2; + for (segno = 0; segno < MAIN_SEGS(sbi); segno += SEGS_PER_SEC(sbi)) { + vblocks = get_valid_blocks(sbi, segno, true); + dist = abs(vblocks - hblks_per_sec); + bimodal += dist * dist; + + if (vblocks > 0 && vblocks < blks_per_sec) { + total_vblocks += vblocks; + ndirty++; + } + } + dist = div_u64(MAIN_SECS(sbi) * hblks_per_sec * hblks_per_sec, 100); + si->bimodal = div64_u64(bimodal, dist); + if (si->dirty_count) + si->avg_vblocks = div_u64(total_vblocks, ndirty); + else + si->avg_vblocks = 0; +} + +#ifdef CONFIG_DEBUG_FS +static void update_multidevice_stats(struct f2fs_sb_info *sbi) +{ + struct f2fs_stat_info *si = F2FS_STAT(sbi); + struct f2fs_dev_stats *dev_stats = si->dev_stats; + int i, j; + + if (!f2fs_is_multi_device(sbi)) + return; + + memset(dev_stats, 0, sizeof(struct f2fs_dev_stats) * sbi->s_ndevs); + for (i = 0; i < sbi->s_ndevs; i++) { + unsigned int start_segno, end_segno; + block_t start_blk, end_blk; + + if (i == 0) { + start_blk = MAIN_BLKADDR(sbi); + end_blk = FDEV(i).end_blk + 1 - SEG0_BLKADDR(sbi); + } else { + start_blk = FDEV(i).start_blk; + end_blk = FDEV(i).end_blk + 1; + } + + start_segno = GET_SEGNO(sbi, start_blk); + end_segno = GET_SEGNO(sbi, end_blk); + + for (j = start_segno; j < end_segno; j++) { + unsigned int seg_blks, sec_blks; 
+ + seg_blks = get_seg_entry(sbi, j)->valid_blocks; + + /* update segment stats */ + if (is_curseg(sbi, j)) + dev_stats[i].devstats[0][DEVSTAT_INUSE]++; + else if (seg_blks == BLKS_PER_SEG(sbi)) + dev_stats[i].devstats[0][DEVSTAT_FULL]++; + else if (seg_blks != 0) + dev_stats[i].devstats[0][DEVSTAT_DIRTY]++; + else if (!test_bit(j, FREE_I(sbi)->free_segmap)) + dev_stats[i].devstats[0][DEVSTAT_FREE]++; + else + dev_stats[i].devstats[0][DEVSTAT_PREFREE]++; + + if (!__is_large_section(sbi) || + (j % SEGS_PER_SEC(sbi)) != 0) + continue; + + sec_blks = get_sec_entry(sbi, j)->valid_blocks; + + /* update section stats */ + if (is_cursec(sbi, GET_SEC_FROM_SEG(sbi, j))) + dev_stats[i].devstats[1][DEVSTAT_INUSE]++; + else if (sec_blks == BLKS_PER_SEC(sbi)) + dev_stats[i].devstats[1][DEVSTAT_FULL]++; + else if (sec_blks != 0) + dev_stats[i].devstats[1][DEVSTAT_DIRTY]++; + else if (!test_bit(GET_SEC_FROM_SEG(sbi, j), + FREE_I(sbi)->free_secmap)) + dev_stats[i].devstats[1][DEVSTAT_FREE]++; + else + dev_stats[i].devstats[1][DEVSTAT_PREFREE]++; + } + } +} static void update_general_status(struct f2fs_sb_info *sbi) { - struct f2fs_stat_info *si = sbi->stat_info; + struct f2fs_stat_info *si = F2FS_STAT(sbi); + struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); int i; - /* valid check of the segment numbers */ - si->hit_ext = sbi->read_hit_ext; - si->total_ext = sbi->total_hit_ext; + /* these will be changed if online resize is done */ + si->main_area_segs = le32_to_cpu(raw_super->segment_count_main); + si->main_area_sections = le32_to_cpu(raw_super->section_count); + si->main_area_zones = si->main_area_sections / + le32_to_cpu(raw_super->secs_per_zone); + + /* general extent cache stats */ + for (i = 0; i < NR_EXTENT_CACHES; i++) { + struct extent_tree_info *eti = &sbi->extent_tree[i]; + + si->hit_cached[i] = atomic64_read(&sbi->read_hit_cached[i]); + si->hit_rbtree[i] = atomic64_read(&sbi->read_hit_rbtree[i]); + si->total_ext[i] = atomic64_read(&sbi->total_hit_ext[i]); + 
si->hit_total[i] = si->hit_cached[i] + si->hit_rbtree[i]; + si->ext_tree[i] = atomic_read(&eti->total_ext_tree); + si->zombie_tree[i] = atomic_read(&eti->total_zombie_tree); + si->ext_node[i] = atomic_read(&eti->total_ext_node); + } + /* read extent_cache only */ + si->hit_largest = atomic64_read(&sbi->read_hit_largest); + si->hit_total[EX_READ] += si->hit_largest; + + /* block age extent_cache only */ + si->allocated_data_blocks = atomic64_read(&sbi->allocated_data_blocks); + + /* validation check of the segment numbers */ si->ndirty_node = get_pages(sbi, F2FS_DIRTY_NODES); si->ndirty_dent = get_pages(sbi, F2FS_DIRTY_DENTS); - si->ndirty_dirs = sbi->n_dirty_dirs; si->ndirty_meta = get_pages(sbi, F2FS_DIRTY_META); - si->total_count = (int)sbi->user_block_count / sbi->blocks_per_seg; + si->ndirty_data = get_pages(sbi, F2FS_DIRTY_DATA); + si->ndirty_qdata = get_pages(sbi, F2FS_DIRTY_QDATA); + si->ndirty_imeta = get_pages(sbi, F2FS_DIRTY_IMETA); + si->ndirty_dirs = sbi->ndirty_inode[DIR_INODE]; + si->ndirty_files = sbi->ndirty_inode[FILE_INODE]; + si->ndonate_files = sbi->donate_files; + si->nquota_files = sbi->nquota_files; + si->ndirty_all = sbi->ndirty_inode[DIRTY_META]; + si->aw_cnt = atomic_read(&sbi->atomic_files); + si->max_aw_cnt = atomic_read(&sbi->max_aw_cnt); + si->nr_dio_read = get_pages(sbi, F2FS_DIO_READ); + si->nr_dio_write = get_pages(sbi, F2FS_DIO_WRITE); + si->nr_wb_cp_data = get_pages(sbi, F2FS_WB_CP_DATA); + si->nr_wb_data = get_pages(sbi, F2FS_WB_DATA); + si->nr_rd_data = get_pages(sbi, F2FS_RD_DATA); + si->nr_rd_node = get_pages(sbi, F2FS_RD_NODE); + si->nr_rd_meta = get_pages(sbi, F2FS_RD_META); + if (SM_I(sbi)->fcc_info) { + si->nr_flushed = + atomic_read(&SM_I(sbi)->fcc_info->issued_flush); + si->nr_flushing = + atomic_read(&SM_I(sbi)->fcc_info->queued_flush); + si->flush_list_empty = + llist_empty(&SM_I(sbi)->fcc_info->issue_list); + } + if (SM_I(sbi)->dcc_info) { + si->nr_discarded = + atomic_read(&SM_I(sbi)->dcc_info->issued_discard); + 
si->nr_discarding = + atomic_read(&SM_I(sbi)->dcc_info->queued_discard); + si->nr_discard_cmd = + atomic_read(&SM_I(sbi)->dcc_info->discard_cmd_cnt); + si->undiscard_blks = SM_I(sbi)->dcc_info->undiscard_blks; + } + si->nr_issued_ckpt = atomic_read(&sbi->cprc_info.issued_ckpt); + si->nr_total_ckpt = atomic_read(&sbi->cprc_info.total_ckpt); + si->nr_queued_ckpt = atomic_read(&sbi->cprc_info.queued_ckpt); + spin_lock(&sbi->cprc_info.stat_lock); + si->cur_ckpt_time = sbi->cprc_info.cur_time; + si->peak_ckpt_time = sbi->cprc_info.peak_time; + spin_unlock(&sbi->cprc_info.stat_lock); + si->total_count = BLKS_TO_SEGS(sbi, (int)sbi->user_block_count); si->rsvd_segs = reserved_segments(sbi); si->overp_segs = overprovision_segments(sbi); si->valid_count = valid_user_blocks(sbi); + si->discard_blks = discard_blocks(sbi); si->valid_node_count = valid_node_count(sbi); si->valid_inode_count = valid_inode_count(sbi); + si->inline_xattr = atomic_read(&sbi->inline_xattr); + si->inline_inode = atomic_read(&sbi->inline_inode); + si->inline_dir = atomic_read(&sbi->inline_dir); + si->compr_inode = atomic_read(&sbi->compr_inode); + si->swapfile_inode = atomic_read(&sbi->swapfile_inode); + si->compr_blocks = atomic64_read(&sbi->compr_blocks); + si->append = sbi->im[APPEND_INO].ino_num; + si->update = sbi->im[UPDATE_INO].ino_num; + si->orphans = sbi->im[ORPHAN_INO].ino_num; si->utilization = utilization(sbi); si->free_segs = free_segments(sbi); si->free_secs = free_sections(sbi); si->prefree_count = prefree_segments(sbi); si->dirty_count = dirty_segments(sbi); - si->node_pages = sbi->node_inode->i_mapping->nrpages; - si->meta_pages = sbi->meta_inode->i_mapping->nrpages; - si->nats = NM_I(sbi)->nat_cnt; - si->sits = SIT_I(sbi)->dirty_sentries; - si->fnids = NM_I(sbi)->fcnt; - si->bg_gc = sbi->bg_gc; - si->util_free = (int)(free_user_blocks(sbi) >> sbi->log_blocks_per_seg) + if (sbi->node_inode) + si->node_pages = NODE_MAPPING(sbi)->nrpages; + if (sbi->meta_inode) + si->meta_pages = 
META_MAPPING(sbi)->nrpages; +#ifdef CONFIG_F2FS_FS_COMPRESSION + if (sbi->compress_inode) { + si->compress_pages = COMPRESS_MAPPING(sbi)->nrpages; + si->compress_page_hit = atomic_read(&sbi->compress_page_hit); + } +#endif + si->nats = NM_I(sbi)->nat_cnt[TOTAL_NAT]; + si->dirty_nats = NM_I(sbi)->nat_cnt[DIRTY_NAT]; + si->sits = MAIN_SEGS(sbi); + si->dirty_sits = SIT_I(sbi)->dirty_sentries; + si->free_nids = NM_I(sbi)->nid_cnt[FREE_NID]; + si->avail_nids = NM_I(sbi)->available_nids; + si->alloc_nids = NM_I(sbi)->nid_cnt[PREALLOC_NID]; + si->io_skip_bggc = sbi->io_skip_bggc; + si->other_skip_bggc = sbi->other_skip_bggc; + si->util_free = (int)(BLKS_TO_SEGS(sbi, free_user_blocks(sbi))) * 100 / (int)(sbi->user_block_count >> sbi->log_blocks_per_seg) / 2; - si->util_valid = (int)(written_block_count(sbi) >> - sbi->log_blocks_per_seg) + si->util_valid = (int)(BLKS_TO_SEGS(sbi, written_block_count(sbi))) * 100 / (int)(sbi->user_block_count >> sbi->log_blocks_per_seg) / 2; si->util_invalid = 50 - si->util_free - si->util_valid; - for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_NODE; i++) { + for (i = CURSEG_HOT_DATA; i < NO_CHECK_TYPE; i++) { struct curseg_info *curseg = CURSEG_I(sbi, i); + + si->blkoff[i] = curseg->next_blkoff; si->curseg[i] = curseg->segno; - si->cursec[i] = curseg->segno / sbi->segs_per_sec; - si->curzone[i] = si->cursec[i] / sbi->secs_per_zone; + si->cursec[i] = GET_SEC_FROM_SEG(sbi, curseg->segno); + si->curzone[i] = GET_ZONE_FROM_SEC(sbi, si->cursec[i]); } - for (i = 0; i < 2; i++) { - si->segment_count[i] = sbi->segment_count[i]; - si->block_count[i] = sbi->block_count[i]; + for (i = META_CP; i < META_MAX; i++) + si->meta_count[i] = atomic_read(&sbi->meta_count[i]); + + for (i = 0; i < NO_CHECK_TYPE; i++) { + si->dirty_seg[i] = 0; + si->full_seg[i] = 0; + si->valid_blks[i] = 0; } -} -/* - * This function calculates BDF of every segments - */ -static void update_sit_info(struct f2fs_sb_info *sbi) -{ - struct f2fs_stat_info *si = sbi->stat_info; - unsigned 
int blks_per_sec, hblks_per_sec, total_vblocks, bimodal, dist; - struct sit_info *sit_i = SIT_I(sbi); - unsigned int segno, vblocks; - int ndirty = 0; + for (i = 0; i < MAIN_SEGS(sbi); i++) { + int blks = get_seg_entry(sbi, i)->valid_blocks; + int type = get_seg_entry(sbi, i)->type; - bimodal = 0; - total_vblocks = 0; - blks_per_sec = sbi->segs_per_sec * (1 << sbi->log_blocks_per_seg); - hblks_per_sec = blks_per_sec / 2; - mutex_lock(&sit_i->sentry_lock); - for (segno = 0; segno < TOTAL_SEGS(sbi); segno += sbi->segs_per_sec) { - vblocks = get_valid_blocks(sbi, segno, sbi->segs_per_sec); - dist = abs(vblocks - hblks_per_sec); - bimodal += dist * dist; + if (!blks) + continue; - if (vblocks > 0 && vblocks < blks_per_sec) { - total_vblocks += vblocks; - ndirty++; - } + if (blks == BLKS_PER_SEG(sbi)) + si->full_seg[type]++; + else + si->dirty_seg[type]++; + si->valid_blks[type] += blks; } - mutex_unlock(&sit_i->sentry_lock); - dist = TOTAL_SECS(sbi) * hblks_per_sec * hblks_per_sec / 100; - si->bimodal = bimodal / dist; - if (si->dirty_count) - si->avg_vblocks = total_vblocks / ndirty; - else - si->avg_vblocks = 0; + + update_multidevice_stats(sbi); + + for (i = 0; i < MAX_CALL_TYPE; i++) + si->cp_call_count[i] = atomic_read(&sbi->cp_call_count[i]); + + for (i = 0; i < 2; i++) { + si->segment_count[i] = sbi->segment_count[i]; + si->block_count[i] = sbi->block_count[i]; + } + + si->inplace_count = atomic_read(&sbi->inplace_count); } /* @@ -118,13 +298,17 @@ static void update_sit_info(struct f2fs_sb_info *sbi) */ static void update_mem_info(struct f2fs_sb_info *sbi) { - struct f2fs_stat_info *si = sbi->stat_info; - unsigned npages; + struct f2fs_stat_info *si = F2FS_STAT(sbi); + int i; if (si->base_mem) goto get_cache; - si->base_mem = sizeof(struct f2fs_sb_info) + sbi->sb->s_blocksize; + /* build stat */ + si->base_mem = sizeof(struct f2fs_stat_info); + + /* build superblock */ + si->base_mem += sizeof(struct f2fs_sb_info) + sbi->sb->s_blocksize; si->base_mem += 2 * 
sizeof(struct f2fs_inode_info); si->base_mem += sizeof(*sbi->ckpt); @@ -133,183 +317,477 @@ static void update_mem_info(struct f2fs_sb_info *sbi) /* build sit */ si->base_mem += sizeof(struct sit_info); - si->base_mem += TOTAL_SEGS(sbi) * sizeof(struct seg_entry); - si->base_mem += f2fs_bitmap_size(TOTAL_SEGS(sbi)); - si->base_mem += 2 * SIT_VBLOCK_MAP_SIZE * TOTAL_SEGS(sbi); - if (sbi->segs_per_sec > 1) - si->base_mem += TOTAL_SECS(sbi) * sizeof(struct sec_entry); + si->base_mem += MAIN_SEGS(sbi) * sizeof(struct seg_entry); + si->base_mem += f2fs_bitmap_size(MAIN_SEGS(sbi)); + si->base_mem += 2 * SIT_VBLOCK_MAP_SIZE * MAIN_SEGS(sbi); + si->base_mem += SIT_VBLOCK_MAP_SIZE * MAIN_SEGS(sbi); + si->base_mem += SIT_VBLOCK_MAP_SIZE; + if (__is_large_section(sbi)) + si->base_mem += MAIN_SECS(sbi) * sizeof(struct sec_entry); si->base_mem += __bitmap_size(sbi, SIT_BITMAP); /* build free segmap */ si->base_mem += sizeof(struct free_segmap_info); - si->base_mem += f2fs_bitmap_size(TOTAL_SEGS(sbi)); - si->base_mem += f2fs_bitmap_size(TOTAL_SECS(sbi)); + si->base_mem += f2fs_bitmap_size(MAIN_SEGS(sbi)); + si->base_mem += f2fs_bitmap_size(MAIN_SECS(sbi)); /* build curseg */ si->base_mem += sizeof(struct curseg_info) * NR_CURSEG_TYPE; - si->base_mem += PAGE_CACHE_SIZE * NR_CURSEG_TYPE; + si->base_mem += PAGE_SIZE * NR_CURSEG_TYPE; /* build dirty segmap */ si->base_mem += sizeof(struct dirty_seglist_info); - si->base_mem += NR_DIRTY_TYPE * f2fs_bitmap_size(TOTAL_SEGS(sbi)); - si->base_mem += f2fs_bitmap_size(TOTAL_SECS(sbi)); + si->base_mem += NR_DIRTY_TYPE * f2fs_bitmap_size(MAIN_SEGS(sbi)); + si->base_mem += f2fs_bitmap_size(MAIN_SECS(sbi)); - /* buld nm */ + /* build nm */ si->base_mem += sizeof(struct f2fs_nm_info); si->base_mem += __bitmap_size(sbi, NAT_BITMAP); + si->base_mem += F2FS_BLK_TO_BYTES(NM_I(sbi)->nat_bits_blocks); + si->base_mem += NM_I(sbi)->nat_blocks * + f2fs_bitmap_size(NAT_ENTRY_PER_BLOCK); + si->base_mem += NM_I(sbi)->nat_blocks / 8; + si->base_mem += 
NM_I(sbi)->nat_blocks * sizeof(unsigned short); + +get_cache: + si->cache_mem = 0; /* build gc */ - si->base_mem += sizeof(struct f2fs_gc_kthread); + if (sbi->gc_thread) + si->cache_mem += sizeof(struct f2fs_gc_kthread); + + /* build merge flush thread */ + if (SM_I(sbi)->fcc_info) + si->cache_mem += sizeof(struct flush_cmd_control); + if (SM_I(sbi)->dcc_info) { + si->cache_mem += sizeof(struct discard_cmd_control); + si->cache_mem += sizeof(struct discard_cmd) * + atomic_read(&SM_I(sbi)->dcc_info->discard_cmd_cnt); + } -get_cache: /* free nids */ - si->cache_mem = NM_I(sbi)->fcnt; - si->cache_mem += NM_I(sbi)->nat_cnt; - npages = sbi->node_inode->i_mapping->nrpages; - si->cache_mem += npages << PAGE_CACHE_SHIFT; - npages = sbi->meta_inode->i_mapping->nrpages; - si->cache_mem += npages << PAGE_CACHE_SHIFT; - si->cache_mem += sbi->n_orphans * sizeof(struct orphan_inode_entry); - si->cache_mem += sbi->n_dirty_dirs * sizeof(struct dir_inode_entry); + si->cache_mem += (NM_I(sbi)->nid_cnt[FREE_NID] + + NM_I(sbi)->nid_cnt[PREALLOC_NID]) * + sizeof(struct free_nid); + si->cache_mem += NM_I(sbi)->nat_cnt[TOTAL_NAT] * + sizeof(struct nat_entry); + si->cache_mem += NM_I(sbi)->nat_cnt[DIRTY_NAT] * + sizeof(struct nat_entry_set); + for (i = 0; i < MAX_INO_ENTRY; i++) + si->cache_mem += sbi->im[i].ino_num * sizeof(struct ino_entry); + + for (i = 0; i < NR_EXTENT_CACHES; i++) { + struct extent_tree_info *eti = &sbi->extent_tree[i]; + + si->ext_mem[i] = atomic_read(&eti->total_ext_tree) * + sizeof(struct extent_tree); + si->ext_mem[i] += atomic_read(&eti->total_ext_node) * + sizeof(struct extent_node); + si->cache_mem += si->ext_mem[i]; + } + + si->page_mem = 0; + if (sbi->node_inode) { + unsigned long npages = NODE_MAPPING(sbi)->nrpages; + + si->page_mem += (unsigned long long)npages << PAGE_SHIFT; + } + if (sbi->meta_inode) { + unsigned long npages = META_MAPPING(sbi)->nrpages; + + si->page_mem += (unsigned long long)npages << PAGE_SHIFT; + } +#ifdef CONFIG_F2FS_FS_COMPRESSION 
+ if (sbi->compress_inode) { + unsigned long npages = COMPRESS_MAPPING(sbi)->nrpages; + + si->page_mem += (unsigned long long)npages << PAGE_SHIFT; + } +#endif } +static const char *s_flag[MAX_SBI_FLAG] = { + [SBI_IS_DIRTY] = "fs_dirty", + [SBI_IS_CLOSE] = "closing", + [SBI_NEED_FSCK] = "need_fsck", + [SBI_POR_DOING] = "recovering", + [SBI_NEED_SB_WRITE] = "sb_dirty", + [SBI_NEED_CP] = "need_cp", + [SBI_IS_SHUTDOWN] = "shutdown", + [SBI_IS_RECOVERED] = "recovered", + [SBI_CP_DISABLED] = "cp_disabled", + [SBI_CP_DISABLED_QUICK] = "cp_disabled_quick", + [SBI_QUOTA_NEED_FLUSH] = "quota_need_flush", + [SBI_QUOTA_SKIP_FLUSH] = "quota_skip_flush", + [SBI_QUOTA_NEED_REPAIR] = "quota_need_repair", + [SBI_IS_RESIZEFS] = "resizefs", + [SBI_IS_FREEZING] = "freezefs", + [SBI_IS_WRITABLE] = "writable", +}; + +static const char *ipu_mode_names[F2FS_IPU_MAX] = { + [F2FS_IPU_FORCE] = "FORCE", + [F2FS_IPU_SSR] = "SSR", + [F2FS_IPU_UTIL] = "UTIL", + [F2FS_IPU_SSR_UTIL] = "SSR_UTIL", + [F2FS_IPU_FSYNC] = "FSYNC", + [F2FS_IPU_ASYNC] = "ASYNC", + [F2FS_IPU_NOCACHE] = "NOCACHE", + [F2FS_IPU_HONOR_OPU_WRITE] = "HONOR_OPU_WRITE", +}; + static int stat_show(struct seq_file *s, void *v) { struct f2fs_stat_info *si; - int i = 0; - int j; + int i = 0, j = 0; - mutex_lock(&f2fs_stat_mutex); + spin_lock(&f2fs_stat_lock); list_for_each_entry(si, &f2fs_stat_list, stat_list) { - char devname[BDEVNAME_SIZE]; - - update_general_status(si->sbi); - - seq_printf(s, "\n=====[ partition info(%s). #%d ]=====\n", - bdevname(si->sbi->sb->s_bdev, devname), i++); + struct f2fs_sb_info *sbi = si->sbi; + + update_general_status(sbi); + + seq_printf(s, "\n=====[ partition info(%pg). #%d, %s, CP: %s]=====\n", + sbi->sb->s_bdev, i++, + f2fs_readonly(sbi->sb) ? "RO" : "RW", + is_set_ckpt_flags(sbi, CP_DISABLED_FLAG) ? + "Disabled" : (f2fs_cp_error(sbi) ? 
"Error" : "Good")); + if (sbi->s_flag) { + seq_puts(s, "[SBI:"); + for_each_set_bit(j, &sbi->s_flag, MAX_SBI_FLAG) + seq_printf(s, " %s", s_flag[j]); + seq_puts(s, "]\n"); + } seq_printf(s, "[SB: 1] [CP: 2] [SIT: %d] [NAT: %d] ", si->sit_area_segs, si->nat_area_segs); seq_printf(s, "[SSA: %d] [MAIN: %d", si->ssa_area_segs, si->main_area_segs); seq_printf(s, "(OverProv:%d Resv:%d)]\n\n", si->overp_segs, si->rsvd_segs); - seq_printf(s, "Utilization: %d%% (%d valid blocks)\n", - si->utilization, si->valid_count); + seq_printf(s, "Current Time Sec: %llu / Mounted Time Sec: %llu\n\n", + ktime_get_boottime_seconds(), + SIT_I(sbi)->mounted_time); + + seq_puts(s, "Policy:\n"); + seq_puts(s, " - IPU: ["); + if (IS_F2FS_IPU_DISABLE(sbi)) { + seq_puts(s, " DISABLE"); + } else { + unsigned long policy = SM_I(sbi)->ipu_policy; + + for_each_set_bit(j, &policy, F2FS_IPU_MAX) + seq_printf(s, " %s", ipu_mode_names[j]); + } + seq_puts(s, " ]\n\n"); + + if (test_opt(sbi, DISCARD)) + seq_printf(s, "Utilization: %u%% (%u valid blocks, %u discard blocks)\n", + si->utilization, si->valid_count, si->discard_blks); + else + seq_printf(s, "Utilization: %u%% (%u valid blocks)\n", + si->utilization, si->valid_count); + seq_printf(s, " - Node: %u (Inode: %u, ", si->valid_node_count, si->valid_inode_count); seq_printf(s, "Other: %u)\n - Data: %u\n", si->valid_node_count - si->valid_inode_count, si->valid_count - si->valid_node_count); + seq_printf(s, " - Inline_xattr Inode: %u\n", + si->inline_xattr); + seq_printf(s, " - Inline_data Inode: %u\n", + si->inline_inode); + seq_printf(s, " - Inline_dentry Inode: %u\n", + si->inline_dir); + seq_printf(s, " - Compressed Inode: %u, Blocks: %llu\n", + si->compr_inode, si->compr_blocks); + seq_printf(s, " - Swapfile Inode: %u\n", + si->swapfile_inode); + seq_printf(s, " - Donate Inode: %u\n", + si->ndonate_files); + seq_printf(s, " - Orphan/Append/Update Inode: %u, %u, %u\n", + si->orphans, si->append, si->update); seq_printf(s, "\nMain area: %d segs, %d 
secs %d zones\n", si->main_area_segs, si->main_area_sections, si->main_area_zones); - seq_printf(s, " - COLD data: %d, %d, %d\n", + seq_printf(s, " TYPE %8s %8s %8s %8s %10s %10s %10s\n", + "blkoff", "segno", "secno", "zoneno", "dirty_seg", "full_seg", "valid_blk"); + seq_printf(s, " - COLD data: %8d %8d %8d %8d %10u %10u %10u\n", + si->blkoff[CURSEG_COLD_DATA], si->curseg[CURSEG_COLD_DATA], si->cursec[CURSEG_COLD_DATA], - si->curzone[CURSEG_COLD_DATA]); - seq_printf(s, " - WARM data: %d, %d, %d\n", + si->curzone[CURSEG_COLD_DATA], + si->dirty_seg[CURSEG_COLD_DATA], + si->full_seg[CURSEG_COLD_DATA], + si->valid_blks[CURSEG_COLD_DATA]); + seq_printf(s, " - WARM data: %8d %8d %8d %8d %10u %10u %10u\n", + si->blkoff[CURSEG_WARM_DATA], si->curseg[CURSEG_WARM_DATA], si->cursec[CURSEG_WARM_DATA], - si->curzone[CURSEG_WARM_DATA]); - seq_printf(s, " - HOT data: %d, %d, %d\n", + si->curzone[CURSEG_WARM_DATA], + si->dirty_seg[CURSEG_WARM_DATA], + si->full_seg[CURSEG_WARM_DATA], + si->valid_blks[CURSEG_WARM_DATA]); + seq_printf(s, " - HOT data: %8d %8d %8d %8d %10u %10u %10u\n", + si->blkoff[CURSEG_HOT_DATA], si->curseg[CURSEG_HOT_DATA], si->cursec[CURSEG_HOT_DATA], - si->curzone[CURSEG_HOT_DATA]); - seq_printf(s, " - Dir dnode: %d, %d, %d\n", + si->curzone[CURSEG_HOT_DATA], + si->dirty_seg[CURSEG_HOT_DATA], + si->full_seg[CURSEG_HOT_DATA], + si->valid_blks[CURSEG_HOT_DATA]); + seq_printf(s, " - Dir dnode: %8d %8d %8d %8d %10u %10u %10u\n", + si->blkoff[CURSEG_HOT_NODE], si->curseg[CURSEG_HOT_NODE], si->cursec[CURSEG_HOT_NODE], - si->curzone[CURSEG_HOT_NODE]); - seq_printf(s, " - File dnode: %d, %d, %d\n", + si->curzone[CURSEG_HOT_NODE], + si->dirty_seg[CURSEG_HOT_NODE], + si->full_seg[CURSEG_HOT_NODE], + si->valid_blks[CURSEG_HOT_NODE]); + seq_printf(s, " - File dnode: %8d %8d %8d %8d %10u %10u %10u\n", + si->blkoff[CURSEG_WARM_NODE], si->curseg[CURSEG_WARM_NODE], si->cursec[CURSEG_WARM_NODE], - si->curzone[CURSEG_WARM_NODE]); - seq_printf(s, " - Indir nodes: %d, %d, %d\n", 
+ si->curzone[CURSEG_WARM_NODE], + si->dirty_seg[CURSEG_WARM_NODE], + si->full_seg[CURSEG_WARM_NODE], + si->valid_blks[CURSEG_WARM_NODE]); + seq_printf(s, " - Indir nodes: %8d %8d %8d %8d %10u %10u %10u\n", + si->blkoff[CURSEG_COLD_NODE], si->curseg[CURSEG_COLD_NODE], si->cursec[CURSEG_COLD_NODE], - si->curzone[CURSEG_COLD_NODE]); + si->curzone[CURSEG_COLD_NODE], + si->dirty_seg[CURSEG_COLD_NODE], + si->full_seg[CURSEG_COLD_NODE], + si->valid_blks[CURSEG_COLD_NODE]); + seq_printf(s, " - Pinned file: %8d %8d %8d %8d\n", + si->blkoff[CURSEG_COLD_DATA_PINNED], + si->curseg[CURSEG_COLD_DATA_PINNED], + si->cursec[CURSEG_COLD_DATA_PINNED], + si->curzone[CURSEG_COLD_DATA_PINNED]); + seq_printf(s, " - ATGC data: %8d %8d %8d %8d\n", + si->blkoff[CURSEG_ALL_DATA_ATGC], + si->curseg[CURSEG_ALL_DATA_ATGC], + si->cursec[CURSEG_ALL_DATA_ATGC], + si->curzone[CURSEG_ALL_DATA_ATGC]); seq_printf(s, "\n - Valid: %d\n - Dirty: %d\n", si->main_area_segs - si->dirty_count - si->prefree_count - si->free_segs, si->dirty_count); seq_printf(s, " - Prefree: %d\n - Free: %d (%d)\n\n", si->prefree_count, si->free_segs, si->free_secs); - seq_printf(s, "GC calls: %d (BG: %d)\n", - si->call_count, si->bg_gc); - seq_printf(s, " - data segments : %d\n", si->data_segs); - seq_printf(s, " - node segments : %d\n", si->node_segs); - seq_printf(s, "Try to move %d blocks\n", si->tot_blks); - seq_printf(s, " - data blocks : %d\n", si->data_blks); - seq_printf(s, " - node blocks : %d\n", si->node_blks); - seq_printf(s, "\nExtent Hit Ratio: %d / %d\n", - si->hit_ext, si->total_ext); - seq_printf(s, "\nBalancing F2FS Async:\n"); - seq_printf(s, " - nodes %4d in %4d\n", + if (f2fs_is_multi_device(sbi)) { + seq_puts(s, "Multidevice stats:\n"); + seq_printf(s, " [seg: %8s %8s %8s %8s %8s]", + "inuse", "dirty", "full", "free", "prefree"); + if (__is_large_section(sbi)) + seq_printf(s, " [sec: %8s %8s %8s %8s %8s]\n", + "inuse", "dirty", "full", "free", "prefree"); + else + seq_puts(s, "\n"); + + for (i = 0; i < 
sbi->s_ndevs; i++) { + seq_printf(s, " #%-2d %8u %8u %8u %8u %8u", i, + si->dev_stats[i].devstats[0][DEVSTAT_INUSE], + si->dev_stats[i].devstats[0][DEVSTAT_DIRTY], + si->dev_stats[i].devstats[0][DEVSTAT_FULL], + si->dev_stats[i].devstats[0][DEVSTAT_FREE], + si->dev_stats[i].devstats[0][DEVSTAT_PREFREE]); + if (!__is_large_section(sbi)) { + seq_puts(s, "\n"); + continue; + } + seq_printf(s, " %8u %8u %8u %8u %8u\n", + si->dev_stats[i].devstats[1][DEVSTAT_INUSE], + si->dev_stats[i].devstats[1][DEVSTAT_DIRTY], + si->dev_stats[i].devstats[1][DEVSTAT_FULL], + si->dev_stats[i].devstats[1][DEVSTAT_FREE], + si->dev_stats[i].devstats[1][DEVSTAT_PREFREE]); + } + seq_puts(s, "\n"); + } + seq_printf(s, "CP calls: %d (BG: %d)\n", + si->cp_call_count[TOTAL_CALL], + si->cp_call_count[BACKGROUND]); + seq_printf(s, "CP count: %d\n", si->cp_count); + seq_printf(s, " - cp blocks : %u\n", si->meta_count[META_CP]); + seq_printf(s, " - sit blocks : %u\n", + si->meta_count[META_SIT]); + seq_printf(s, " - nat blocks : %u\n", + si->meta_count[META_NAT]); + seq_printf(s, " - ssa blocks : %u\n", + si->meta_count[META_SSA]); + seq_puts(s, "CP merge:\n"); + seq_printf(s, " - Queued : %4d\n", si->nr_queued_ckpt); + seq_printf(s, " - Issued : %4d\n", si->nr_issued_ckpt); + seq_printf(s, " - Total : %4d\n", si->nr_total_ckpt); + seq_printf(s, " - Cur time : %4d(ms)\n", si->cur_ckpt_time); + seq_printf(s, " - Peak time : %4d(ms)\n", si->peak_ckpt_time); + seq_printf(s, "GC calls: %d (gc_thread: %d)\n", + si->gc_call_count[BACKGROUND] + + si->gc_call_count[FOREGROUND], + si->gc_call_count[BACKGROUND]); + if (__is_large_section(sbi)) { + seq_printf(s, " - data sections : %d (BG: %d)\n", + si->gc_secs[DATA][BG_GC] + si->gc_secs[DATA][FG_GC], + si->gc_secs[DATA][BG_GC]); + seq_printf(s, " - node sections : %d (BG: %d)\n", + si->gc_secs[NODE][BG_GC] + si->gc_secs[NODE][FG_GC], + si->gc_secs[NODE][BG_GC]); + } + seq_printf(s, " - data segments : %d (BG: %d)\n", + si->gc_segs[DATA][BG_GC] + 
si->gc_segs[DATA][FG_GC], + si->gc_segs[DATA][BG_GC]); + seq_printf(s, " - node segments : %d (BG: %d)\n", + si->gc_segs[NODE][BG_GC] + si->gc_segs[NODE][FG_GC], + si->gc_segs[NODE][BG_GC]); + seq_puts(s, " - Reclaimed segs :\n"); + seq_printf(s, " - Normal : %d\n", sbi->gc_reclaimed_segs[GC_NORMAL]); + seq_printf(s, " - Idle CB : %d\n", sbi->gc_reclaimed_segs[GC_IDLE_CB]); + seq_printf(s, " - Idle Greedy : %d\n", + sbi->gc_reclaimed_segs[GC_IDLE_GREEDY]); + seq_printf(s, " - Idle AT : %d\n", sbi->gc_reclaimed_segs[GC_IDLE_AT]); + seq_printf(s, " - Urgent High : %d\n", + sbi->gc_reclaimed_segs[GC_URGENT_HIGH]); + seq_printf(s, " - Urgent Mid : %d\n", sbi->gc_reclaimed_segs[GC_URGENT_MID]); + seq_printf(s, " - Urgent Low : %d\n", sbi->gc_reclaimed_segs[GC_URGENT_LOW]); + seq_printf(s, "Try to move %d blocks (BG: %d)\n", si->tot_blks, + si->bg_data_blks + si->bg_node_blks); + seq_printf(s, " - data blocks : %d (%d)\n", si->data_blks, + si->bg_data_blks); + seq_printf(s, " - node blocks : %d (%d)\n", si->node_blks, + si->bg_node_blks); + seq_printf(s, "BG skip : IO: %u, Other: %u\n", + si->io_skip_bggc, si->other_skip_bggc); + seq_puts(s, "\nExtent Cache (Read):\n"); + seq_printf(s, " - Hit Count: L1-1:%llu L1-2:%llu L2:%llu\n", + si->hit_largest, si->hit_cached[EX_READ], + si->hit_rbtree[EX_READ]); + seq_printf(s, " - Hit Ratio: %llu%% (%llu / %llu)\n", + !si->total_ext[EX_READ] ? 
0 : + div64_u64(si->hit_total[EX_READ] * 100, + si->total_ext[EX_READ]), + si->hit_total[EX_READ], si->total_ext[EX_READ]); + seq_printf(s, " - Inner Struct Count: tree: %d(%d), node: %d\n", + si->ext_tree[EX_READ], si->zombie_tree[EX_READ], + si->ext_node[EX_READ]); + seq_puts(s, "\nExtent Cache (Block Age):\n"); + seq_printf(s, " - Allocated Data Blocks: %llu\n", + si->allocated_data_blocks); + seq_printf(s, " - Hit Count: L1:%llu L2:%llu\n", + si->hit_cached[EX_BLOCK_AGE], + si->hit_rbtree[EX_BLOCK_AGE]); + seq_printf(s, " - Hit Ratio: %llu%% (%llu / %llu)\n", + !si->total_ext[EX_BLOCK_AGE] ? 0 : + div64_u64(si->hit_total[EX_BLOCK_AGE] * 100, + si->total_ext[EX_BLOCK_AGE]), + si->hit_total[EX_BLOCK_AGE], + si->total_ext[EX_BLOCK_AGE]); + seq_printf(s, " - Inner Struct Count: tree: %d(%d), node: %d\n", + si->ext_tree[EX_BLOCK_AGE], + si->zombie_tree[EX_BLOCK_AGE], + si->ext_node[EX_BLOCK_AGE]); + seq_puts(s, "\nBalancing F2FS Async:\n"); + seq_printf(s, " - DIO (R: %4d, W: %4d)\n", + si->nr_dio_read, si->nr_dio_write); + seq_printf(s, " - IO_R (Data: %4d, Node: %4d, Meta: %4d\n", + si->nr_rd_data, si->nr_rd_node, si->nr_rd_meta); + seq_printf(s, " - IO_W (CP: %4d, Data: %4d, Flush: (%4d %4d %4d), ", + si->nr_wb_cp_data, si->nr_wb_data, + si->nr_flushing, si->nr_flushed, + si->flush_list_empty); + seq_printf(s, "Discard: (%4d %4d)) cmd: %4d undiscard:%4u\n", + si->nr_discarding, si->nr_discarded, + si->nr_discard_cmd, si->undiscard_blks); + seq_printf(s, " - atomic IO: %4d (Max. 
%4d)\n", + si->aw_cnt, si->max_aw_cnt); + seq_printf(s, " - compress: %4d, hit:%8d\n", si->compress_pages, si->compress_page_hit); + seq_printf(s, " - nodes: %4d in %4d\n", si->ndirty_node, si->node_pages); - seq_printf(s, " - dents %4d in dirs:%4d\n", - si->ndirty_dent, si->ndirty_dirs); - seq_printf(s, " - meta %4d in %4d\n", + seq_printf(s, " - dents: %4d in dirs:%4d (%4d)\n", + si->ndirty_dent, si->ndirty_dirs, si->ndirty_all); + seq_printf(s, " - data: %4d in files:%4d\n", + si->ndirty_data, si->ndirty_files); + seq_printf(s, " - quota data: %4d in quota files:%4d\n", + si->ndirty_qdata, si->nquota_files); + seq_printf(s, " - meta: %4d in %4d\n", si->ndirty_meta, si->meta_pages); - seq_printf(s, " - NATs %5d > %lu\n", - si->nats, NM_WOUT_THRESHOLD); - seq_printf(s, " - SITs: %5d\n - free_nids: %5d\n", - si->sits, si->fnids); - seq_printf(s, "\nDistribution of User Blocks:"); - seq_printf(s, " [ valid | invalid | free ]\n"); - seq_printf(s, " ["); + seq_printf(s, " - imeta: %4d\n", + si->ndirty_imeta); + seq_printf(s, " - fsync mark: %4lld\n", + percpu_counter_sum_positive( + &sbi->rf_node_block_count)); + seq_printf(s, " - NATs: %9d/%9d\n - SITs: %9d/%9d\n", + si->dirty_nats, si->nats, si->dirty_sits, si->sits); + seq_printf(s, " - free_nids: %9d/%9d\n - alloc_nids: %9d\n", + si->free_nids, si->avail_nids, si->alloc_nids); + seq_puts(s, "\nDistribution of User Blocks:"); + seq_puts(s, " [ valid | invalid | free ]\n"); + seq_puts(s, " ["); for (j = 0; j < si->util_valid; j++) - seq_printf(s, "-"); - seq_printf(s, "|"); + seq_putc(s, '-'); + seq_putc(s, '|'); for (j = 0; j < si->util_invalid; j++) - seq_printf(s, "-"); - seq_printf(s, "|"); + seq_putc(s, '-'); + seq_putc(s, '|'); for (j = 0; j < si->util_free; j++) - seq_printf(s, "-"); - seq_printf(s, "]\n\n"); + seq_putc(s, '-'); + seq_puts(s, "]\n\n"); + seq_printf(s, "IPU: %u blocks\n", si->inplace_count); seq_printf(s, "SSR: %u blocks in %u segments\n", si->block_count[SSR], si->segment_count[SSR]); 
seq_printf(s, "LFS: %u blocks in %u segments\n", si->block_count[LFS], si->segment_count[LFS]); /* segment usage info */ - update_sit_info(si->sbi); + f2fs_update_sit_info(sbi); seq_printf(s, "\nBDF: %u, avg. vblocks: %u\n", si->bimodal, si->avg_vblocks); /* memory footprint */ - update_mem_info(si->sbi); - seq_printf(s, "\nMemory: %u KB = static: %u + cached: %u\n", - (si->base_mem + si->cache_mem) >> 10, - si->base_mem >> 10, si->cache_mem >> 10); + update_mem_info(sbi); + seq_printf(s, "\nMemory: %llu KB\n", + (si->base_mem + si->cache_mem + si->page_mem) >> 10); + seq_printf(s, " - static: %llu KB\n", + si->base_mem >> 10); + seq_printf(s, " - cached all: %llu KB\n", + si->cache_mem >> 10); + seq_printf(s, " - read extent cache: %llu KB\n", + si->ext_mem[EX_READ] >> 10); + seq_printf(s, " - block age extent cache: %llu KB\n", + si->ext_mem[EX_BLOCK_AGE] >> 10); + seq_printf(s, " - paged : %llu KB\n", + si->page_mem >> 10); } - mutex_unlock(&f2fs_stat_mutex); + spin_unlock(&f2fs_stat_lock); return 0; } -static int stat_open(struct inode *inode, struct file *file) -{ - return single_open(file, stat_show, inode->i_private); -} - -static const struct file_operations stat_fops = { - .open = stat_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; +DEFINE_SHOW_ATTRIBUTE(stat); +#endif int f2fs_build_stats(struct f2fs_sb_info *sbi) { struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); struct f2fs_stat_info *si; + struct f2fs_dev_stats *dev_stats; + int i; - sbi->stat_info = kzalloc(sizeof(struct f2fs_stat_info), GFP_KERNEL); - if (!sbi->stat_info) + si = f2fs_kzalloc(sbi, sizeof(struct f2fs_stat_info), GFP_KERNEL); + if (!si) return -ENOMEM; - si = sbi->stat_info; + dev_stats = f2fs_kzalloc(sbi, sizeof(struct f2fs_dev_stats) * + sbi->s_ndevs, GFP_KERNEL); + if (!dev_stats) { + kfree(si); + return -ENOMEM; + } + + si->dev_stats = dev_stats; + si->all_area_segs = le32_to_cpu(raw_super->segment_count); si->sit_area_segs = 
le32_to_cpu(raw_super->segment_count_sit); si->nat_area_segs = le32_to_cpu(raw_super->segment_count_nat); @@ -319,35 +797,66 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi) si->main_area_zones = si->main_area_sections / le32_to_cpu(raw_super->secs_per_zone); si->sbi = sbi; + sbi->stat_info = si; - mutex_lock(&f2fs_stat_mutex); + /* general extent cache stats */ + for (i = 0; i < NR_EXTENT_CACHES; i++) { + atomic64_set(&sbi->total_hit_ext[i], 0); + atomic64_set(&sbi->read_hit_rbtree[i], 0); + atomic64_set(&sbi->read_hit_cached[i], 0); + } + + /* read extent_cache only */ + atomic64_set(&sbi->read_hit_largest, 0); + + atomic_set(&sbi->inline_xattr, 0); + atomic_set(&sbi->inline_inode, 0); + atomic_set(&sbi->inline_dir, 0); + atomic_set(&sbi->compr_inode, 0); + atomic64_set(&sbi->compr_blocks, 0); + atomic_set(&sbi->swapfile_inode, 0); + atomic_set(&sbi->atomic_files, 0); + atomic_set(&sbi->inplace_count, 0); + for (i = META_CP; i < META_MAX; i++) + atomic_set(&sbi->meta_count[i], 0); + for (i = 0; i < MAX_CALL_TYPE; i++) + atomic_set(&sbi->cp_call_count[i], 0); + + atomic_set(&sbi->max_aw_cnt, 0); + + spin_lock(&f2fs_stat_lock); list_add_tail(&si->stat_list, &f2fs_stat_list); - mutex_unlock(&f2fs_stat_mutex); + spin_unlock(&f2fs_stat_lock); return 0; } void f2fs_destroy_stats(struct f2fs_sb_info *sbi) { - struct f2fs_stat_info *si = sbi->stat_info; + struct f2fs_stat_info *si = F2FS_STAT(sbi); - mutex_lock(&f2fs_stat_mutex); + spin_lock(&f2fs_stat_lock); list_del(&si->stat_list); - mutex_unlock(&f2fs_stat_mutex); + spin_unlock(&f2fs_stat_lock); - kfree(sbi->stat_info); + kfree(si->dev_stats); + kfree(si); } void __init f2fs_create_root_stats(void) { - debugfs_root = debugfs_create_dir("f2fs", NULL); - if (debugfs_root) - debugfs_create_file("status", S_IRUGO, debugfs_root, - NULL, &stat_fops); +#ifdef CONFIG_DEBUG_FS + f2fs_debugfs_root = debugfs_create_dir("f2fs", NULL); + + debugfs_create_file("status", 0444, f2fs_debugfs_root, NULL, + &stat_fops); +#endif } void 
f2fs_destroy_root_stats(void) { - debugfs_remove_recursive(debugfs_root); - debugfs_root = NULL; +#ifdef CONFIG_DEBUG_FS + debugfs_remove_recursive(f2fs_debugfs_root); + f2fs_debugfs_root = NULL; +#endif } diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 62f0d5977c64..48f4f98afb01 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -1,32 +1,52 @@ +// SPDX-License-Identifier: GPL-2.0 /* * fs/f2fs/dir.c * * Copyright (c) 2012 Samsung Electronics Co., Ltd. * http://www.samsung.com/ - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. */ +#include <linux/unaligned.h> #include <linux/fs.h> #include <linux/f2fs_fs.h> +#include <linux/sched/signal.h> +#include <linux/unicode.h> #include "f2fs.h" #include "node.h" #include "acl.h" #include "xattr.h" +#include <trace/events/f2fs.h> + +static inline bool f2fs_should_fallback_to_linear(struct inode *dir) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(dir); + + switch (F2FS_OPTION(sbi).lookup_mode) { + case LOOKUP_PERF: + return false; + case LOOKUP_COMPAT: + return true; + case LOOKUP_AUTO: + return !sb_no_casefold_compat_fallback(sbi->sb); + } + return false; +} + +#if IS_ENABLED(CONFIG_UNICODE) +extern struct kmem_cache *f2fs_cf_name_slab; +#endif static unsigned long dir_blocks(struct inode *inode) { - return ((unsigned long long) (i_size_read(inode) + PAGE_CACHE_SIZE - 1)) - >> PAGE_CACHE_SHIFT; + return ((unsigned long long) (i_size_read(inode) + PAGE_SIZE - 1)) + >> PAGE_SHIFT; } -static unsigned int dir_buckets(unsigned int level) +static unsigned int dir_buckets(unsigned int level, int dir_level) { - if (level < MAX_DIR_HASH_DEPTH / 2) - return 1 << level; + if (level + dir_level < MAX_DIR_HASH_DEPTH / 2) + return BIT(level + dir_level); else - return 1 << ((MAX_DIR_HASH_DEPTH / 2) - 1); + return MAX_DIR_BUCKETS; } static unsigned int bucket_blocks(unsigned int level) @@ -37,558 +57,900 
@@ static unsigned int bucket_blocks(unsigned int level) return 4; } -static unsigned char f2fs_filetype_table[F2FS_FT_MAX] = { - [F2FS_FT_UNKNOWN] = DT_UNKNOWN, - [F2FS_FT_REG_FILE] = DT_REG, - [F2FS_FT_DIR] = DT_DIR, - [F2FS_FT_CHRDEV] = DT_CHR, - [F2FS_FT_BLKDEV] = DT_BLK, - [F2FS_FT_FIFO] = DT_FIFO, - [F2FS_FT_SOCK] = DT_SOCK, - [F2FS_FT_SYMLINK] = DT_LNK, -}; +#if IS_ENABLED(CONFIG_UNICODE) +/* If @dir is casefolded, initialize @fname->cf_name from @fname->usr_fname. */ +int f2fs_init_casefolded_name(const struct inode *dir, + struct f2fs_filename *fname) +{ + struct super_block *sb = dir->i_sb; + unsigned char *buf; + int len; + + if (IS_CASEFOLDED(dir) && + !is_dot_dotdot(fname->usr_fname->name, fname->usr_fname->len)) { + buf = f2fs_kmem_cache_alloc(f2fs_cf_name_slab, + GFP_NOFS, false, F2FS_SB(sb)); + if (!buf) + return -ENOMEM; + + len = utf8_casefold(sb->s_encoding, fname->usr_fname, + buf, F2FS_NAME_LEN); + if (len <= 0) { + kmem_cache_free(f2fs_cf_name_slab, buf); + if (sb_has_strict_encoding(sb)) + return -EINVAL; + /* fall back to treating name as opaque byte sequence */ + return 0; + } + fname->cf_name.name = buf; + fname->cf_name.len = len; + } -#define S_SHIFT 12 -static unsigned char f2fs_type_by_mode[S_IFMT >> S_SHIFT] = { - [S_IFREG >> S_SHIFT] = F2FS_FT_REG_FILE, - [S_IFDIR >> S_SHIFT] = F2FS_FT_DIR, - [S_IFCHR >> S_SHIFT] = F2FS_FT_CHRDEV, - [S_IFBLK >> S_SHIFT] = F2FS_FT_BLKDEV, - [S_IFIFO >> S_SHIFT] = F2FS_FT_FIFO, - [S_IFSOCK >> S_SHIFT] = F2FS_FT_SOCK, - [S_IFLNK >> S_SHIFT] = F2FS_FT_SYMLINK, -}; + return 0; +} + +void f2fs_free_casefolded_name(struct f2fs_filename *fname) +{ + unsigned char *buf = (unsigned char *)fname->cf_name.name; + + if (buf) { + kmem_cache_free(f2fs_cf_name_slab, buf); + fname->cf_name.name = NULL; + } +} +#endif /* CONFIG_UNICODE */ + +static int __f2fs_setup_filename(const struct inode *dir, + const struct fscrypt_name *crypt_name, + struct f2fs_filename *fname) +{ + int err; + + memset(fname, 0, 
sizeof(*fname)); + + fname->usr_fname = crypt_name->usr_fname; + fname->disk_name = crypt_name->disk_name; +#ifdef CONFIG_FS_ENCRYPTION + fname->crypto_buf = crypt_name->crypto_buf; +#endif + if (crypt_name->is_nokey_name) { + /* hash was decoded from the no-key name */ + fname->hash = cpu_to_le32(crypt_name->hash); + } else { + err = f2fs_init_casefolded_name(dir, fname); + if (err) { + f2fs_free_filename(fname); + return err; + } + f2fs_hash_filename(dir, fname); + } + return 0; +} + +/* + * Prepare to search for @iname in @dir. This is similar to + * fscrypt_setup_filename(), but this also handles computing the casefolded name + * and the f2fs dirhash if needed, then packing all the information about this + * filename up into a 'struct f2fs_filename'. + */ +int f2fs_setup_filename(struct inode *dir, const struct qstr *iname, + int lookup, struct f2fs_filename *fname) +{ + struct fscrypt_name crypt_name; + int err; + + err = fscrypt_setup_filename(dir, iname, lookup, &crypt_name); + if (err) + return err; + + return __f2fs_setup_filename(dir, &crypt_name, fname); +} + +/* + * Prepare to look up @dentry in @dir. This is similar to + * fscrypt_prepare_lookup(), but this also handles computing the casefolded name + * and the f2fs dirhash if needed, then packing all the information about this + * filename up into a 'struct f2fs_filename'. 
+ */ +int f2fs_prepare_lookup(struct inode *dir, struct dentry *dentry, + struct f2fs_filename *fname) +{ + struct fscrypt_name crypt_name; + int err; -static void set_de_type(struct f2fs_dir_entry *de, struct inode *inode) + err = fscrypt_prepare_lookup(dir, dentry, &crypt_name); + if (err) + return err; + + return __f2fs_setup_filename(dir, &crypt_name, fname); +} + +void f2fs_free_filename(struct f2fs_filename *fname) { - umode_t mode = inode->i_mode; - de->file_type = f2fs_type_by_mode[(mode & S_IFMT) >> S_SHIFT]; +#ifdef CONFIG_FS_ENCRYPTION + kfree(fname->crypto_buf.name); + fname->crypto_buf.name = NULL; +#endif + f2fs_free_casefolded_name(fname); } -static unsigned long dir_block_index(unsigned int level, unsigned int idx) +static unsigned long dir_block_index(unsigned int level, + int dir_level, unsigned int idx) { unsigned long i; unsigned long bidx = 0; for (i = 0; i < level; i++) - bidx += dir_buckets(i) * bucket_blocks(i); + bidx += mul_u32_u32(dir_buckets(i, dir_level), + bucket_blocks(i)); bidx += idx * bucket_blocks(level); return bidx; } -static bool early_match_name(const char *name, size_t namelen, - f2fs_hash_t namehash, struct f2fs_dir_entry *de) +static struct f2fs_dir_entry *find_in_block(struct inode *dir, + struct folio *dentry_folio, + const struct f2fs_filename *fname, + int *max_slots, + bool use_hash) { - if (le16_to_cpu(de->name_len) != namelen) - return false; + struct f2fs_dentry_block *dentry_blk; + struct f2fs_dentry_ptr d; - if (de->hash_code != namehash) - return false; + dentry_blk = folio_address(dentry_folio); - return true; + make_dentry_ptr_block(dir, &d, dentry_blk); + return f2fs_find_target_dentry(&d, fname, max_slots, use_hash); } -static struct f2fs_dir_entry *find_in_block(struct page *dentry_page, - const char *name, size_t namelen, int *max_slots, - f2fs_hash_t namehash, struct page **res_page) +static inline int f2fs_match_name(const struct inode *dir, + const struct f2fs_filename *fname, + const u8 *de_name, u32 
de_name_len) +{ + struct fscrypt_name f; + +#if IS_ENABLED(CONFIG_UNICODE) + if (fname->cf_name.name) + return generic_ci_match(dir, fname->usr_fname, + &fname->cf_name, + de_name, de_name_len); + +#endif + f.usr_fname = fname->usr_fname; + f.disk_name = fname->disk_name; +#ifdef CONFIG_FS_ENCRYPTION + f.crypto_buf = fname->crypto_buf; +#endif + return fscrypt_match_name(&f, de_name, de_name_len); +} + +struct f2fs_dir_entry *f2fs_find_target_dentry(const struct f2fs_dentry_ptr *d, + const struct f2fs_filename *fname, int *max_slots, + bool use_hash) { struct f2fs_dir_entry *de; - unsigned long bit_pos, end_pos, next_pos; - struct f2fs_dentry_block *dentry_blk = kmap(dentry_page); - int slots; + unsigned long bit_pos = 0; + int max_len = 0; + int res = 0; + + if (max_slots) + *max_slots = 0; + while (bit_pos < d->max) { + if (!test_bit_le(bit_pos, d->bitmap)) { + bit_pos++; + max_len++; + continue; + } - bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap, - NR_DENTRY_IN_BLOCK, 0); - while (bit_pos < NR_DENTRY_IN_BLOCK) { - de = &dentry_blk->dentry[bit_pos]; - slots = GET_DENTRY_SLOTS(le16_to_cpu(de->name_len)); - - if (early_match_name(name, namelen, namehash, de)) { - if (!memcmp(dentry_blk->filename[bit_pos], - name, namelen)) { - *res_page = dentry_page; + de = &d->dentry[bit_pos]; + + if (unlikely(!de->name_len)) { + bit_pos++; + continue; + } + + if (!use_hash || de->hash_code == fname->hash) { + res = f2fs_match_name(d->inode, fname, + d->filename[bit_pos], + le16_to_cpu(de->name_len)); + if (res < 0) + return ERR_PTR(res); + if (res) goto found; - } } - next_pos = bit_pos + slots; - bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap, - NR_DENTRY_IN_BLOCK, next_pos); - if (bit_pos >= NR_DENTRY_IN_BLOCK) - end_pos = NR_DENTRY_IN_BLOCK; - else - end_pos = bit_pos; - if (*max_slots < end_pos - next_pos) - *max_slots = end_pos - next_pos; + + if (max_slots && max_len > *max_slots) + *max_slots = max_len; + max_len = 0; + + bit_pos += 
GET_DENTRY_SLOTS(le16_to_cpu(de->name_len)); } de = NULL; - kunmap(dentry_page); found: + if (max_slots && max_len > *max_slots) + *max_slots = max_len; return de; } static struct f2fs_dir_entry *find_in_level(struct inode *dir, - unsigned int level, const char *name, size_t namelen, - f2fs_hash_t namehash, struct page **res_page) + unsigned int level, + const struct f2fs_filename *fname, + struct folio **res_folio, + bool use_hash) { - int s = GET_DENTRY_SLOTS(namelen); + int s = GET_DENTRY_SLOTS(fname->disk_name.len); unsigned int nbucket, nblock; - unsigned int bidx, end_block; - struct page *dentry_page; + unsigned int bidx, end_block, bucket_no; struct f2fs_dir_entry *de = NULL; + pgoff_t next_pgofs; bool room = false; - int max_slots = 0; + int max_slots; - BUG_ON(level > MAX_DIR_HASH_DEPTH); - - nbucket = dir_buckets(level); + nbucket = dir_buckets(level, F2FS_I(dir)->i_dir_level); nblock = bucket_blocks(level); - bidx = dir_block_index(level, le32_to_cpu(namehash) % nbucket); + bucket_no = use_hash ? 
le32_to_cpu(fname->hash) % nbucket : 0; + +start_find_bucket: + bidx = dir_block_index(level, F2FS_I(dir)->i_dir_level, + bucket_no); end_block = bidx + nblock; - for (; bidx < end_block; bidx++) { + while (bidx < end_block) { /* no need to allocate new dentry pages to all the indices */ - dentry_page = find_data_page(dir, bidx, true); - if (IS_ERR(dentry_page)) { - room = true; - continue; + struct folio *dentry_folio; + dentry_folio = f2fs_find_data_folio(dir, bidx, &next_pgofs); + if (IS_ERR(dentry_folio)) { + if (PTR_ERR(dentry_folio) == -ENOENT) { + room = true; + bidx = next_pgofs; + continue; + } else { + *res_folio = dentry_folio; + break; + } } - de = find_in_block(dentry_page, name, namelen, - &max_slots, namehash, res_page); - if (de) + de = find_in_block(dir, dentry_folio, fname, &max_slots, use_hash); + if (IS_ERR(de)) { + *res_folio = ERR_CAST(de); + de = NULL; break; + } else if (de) { + *res_folio = dentry_folio; + break; + } if (max_slots >= s) room = true; - f2fs_put_page(dentry_page, 0); - } + f2fs_folio_put(dentry_folio, false); - if (!de && room && F2FS_I(dir)->chash != namehash) { - F2FS_I(dir)->chash = namehash; - F2FS_I(dir)->clevel = level; + bidx++; } - return de; + if (de) + return de; + + if (likely(use_hash)) { + if (room && F2FS_I(dir)->chash != fname->hash) { + F2FS_I(dir)->chash = fname->hash; + F2FS_I(dir)->clevel = level; + } + } else if (++bucket_no < nbucket) { + goto start_find_bucket; + } + return NULL; } -/* - * Find an entry in the specified directory with the wanted name. - * It returns the page where the entry was found (as a parameter - res_page), - * and the entry itself. Page is returned mapped and unlocked. - * Entry is guaranteed to be valid. 
- */ -struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir, - struct qstr *child, struct page **res_page) +struct f2fs_dir_entry *__f2fs_find_entry(struct inode *dir, + const struct f2fs_filename *fname, + struct folio **res_folio) { - const char *name = child->name; - size_t namelen = child->len; unsigned long npages = dir_blocks(dir); struct f2fs_dir_entry *de = NULL; - f2fs_hash_t name_hash; unsigned int max_depth; unsigned int level; + bool use_hash = true; - if (namelen > F2FS_NAME_LEN) - return NULL; + *res_folio = NULL; - if (npages == 0) - return NULL; +#if IS_ENABLED(CONFIG_UNICODE) +start_find_entry: +#endif + if (f2fs_has_inline_dentry(dir)) { + de = f2fs_find_in_inline_dir(dir, fname, res_folio, use_hash); + goto out; + } - *res_page = NULL; + if (npages == 0) + goto out; - name_hash = f2fs_dentry_hash(name, namelen); max_depth = F2FS_I(dir)->i_current_depth; + if (unlikely(max_depth > MAX_DIR_HASH_DEPTH)) { + f2fs_warn(F2FS_I_SB(dir), "Corrupted max_depth of %lu: %u", + dir->i_ino, max_depth); + max_depth = MAX_DIR_HASH_DEPTH; + f2fs_i_depth_write(dir, max_depth); + } for (level = 0; level < max_depth; level++) { - de = find_in_level(dir, level, name, - namelen, name_hash, res_page); - if (de) + de = find_in_level(dir, level, fname, res_folio, use_hash); + if (de || IS_ERR(*res_folio)) break; } - if (!de && F2FS_I(dir)->chash != name_hash) { - F2FS_I(dir)->chash = name_hash; - F2FS_I(dir)->clevel = level - 1; + +out: +#if IS_ENABLED(CONFIG_UNICODE) + if (f2fs_should_fallback_to_linear(dir) && + IS_CASEFOLDED(dir) && !de && use_hash) { + use_hash = false; + goto start_find_entry; } +#endif + /* This is to increase the speed of f2fs_create */ + if (!de) + F2FS_I(dir)->task = current; return de; } -struct f2fs_dir_entry *f2fs_parent_dir(struct inode *dir, struct page **p) +/* + * Find an entry in the specified directory with the wanted name. + * It returns the page where the entry was found (as a parameter - res_page), + * and the entry itself. 
Page is returned mapped and unlocked. + * Entry is guaranteed to be valid. + */ +struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir, + const struct qstr *child, struct folio **res_folio) { - struct page *page; - struct f2fs_dir_entry *de; - struct f2fs_dentry_block *dentry_blk; + struct f2fs_dir_entry *de = NULL; + struct f2fs_filename fname; + int err; - page = get_lock_data_page(dir, 0); - if (IS_ERR(page)) + err = f2fs_setup_filename(dir, child, 1, &fname); + if (err) { + if (err == -ENOENT) + *res_folio = NULL; + else + *res_folio = ERR_PTR(err); return NULL; + } + + de = __f2fs_find_entry(dir, &fname, res_folio); - dentry_blk = kmap(page); - de = &dentry_blk->dentry[1]; - *p = page; - unlock_page(page); + f2fs_free_filename(&fname); return de; } -ino_t f2fs_inode_by_name(struct inode *dir, struct qstr *qstr) +struct f2fs_dir_entry *f2fs_parent_dir(struct inode *dir, struct folio **f) +{ + return f2fs_find_entry(dir, &dotdot_name, f); +} + +ino_t f2fs_inode_by_name(struct inode *dir, const struct qstr *qstr, + struct folio **folio) { ino_t res = 0; struct f2fs_dir_entry *de; - struct page *page; - de = f2fs_find_entry(dir, qstr, &page); + de = f2fs_find_entry(dir, qstr, folio); if (de) { res = le32_to_cpu(de->ino); - kunmap(page); - f2fs_put_page(page, 0); + f2fs_folio_put(*folio, false); } return res; } void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de, - struct page *page, struct inode *inode) + struct folio *folio, struct inode *inode) { - lock_page(page); - wait_on_page_writeback(page); + enum page_type type = f2fs_has_inline_dentry(dir) ? 
NODE : DATA; + + folio_lock(folio); + f2fs_folio_wait_writeback(folio, type, true, true); de->ino = cpu_to_le32(inode->i_ino); - set_de_type(de, inode); - kunmap(page); - set_page_dirty(page); - dir->i_mtime = dir->i_ctime = CURRENT_TIME; - mark_inode_dirty(dir); + de->file_type = fs_umode_to_ftype(inode->i_mode); + folio_mark_dirty(folio); - /* update parent inode number before releasing dentry page */ - F2FS_I(inode)->i_pino = dir->i_ino; + inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); + f2fs_mark_inode_dirty_sync(dir, false); + f2fs_folio_put(folio, true); +} - f2fs_put_page(page, 1); +static void init_dent_inode(struct inode *dir, struct inode *inode, + const struct f2fs_filename *fname, + struct folio *ifolio) +{ + struct f2fs_inode *ri; + + if (!fname) /* tmpfile case? */ + return; + + f2fs_folio_wait_writeback(ifolio, NODE, true, true); + + /* copy name info. to this inode folio */ + ri = F2FS_INODE(ifolio); + ri->i_namelen = cpu_to_le32(fname->disk_name.len); + memcpy(ri->i_name, fname->disk_name.name, fname->disk_name.len); + if (IS_ENCRYPTED(dir)) { + file_set_enc_name(inode); + /* + * Roll-forward recovery doesn't have encryption keys available, + * so it can't compute the dirhash for encrypted+casefolded + * filenames. Append it to i_name if possible. Else, disable + * roll-forward recovery of the dentry (i.e., make fsync'ing the + * file force a checkpoint) by setting LOST_PINO. + */ + if (IS_CASEFOLDED(dir)) { + if (fname->disk_name.len + sizeof(f2fs_hash_t) <= + F2FS_NAME_LEN) + put_unaligned(fname->hash, (f2fs_hash_t *) + &ri->i_name[fname->disk_name.len]); + else + file_lost_pino(inode); + } + } + folio_mark_dirty(ifolio); } -static void init_dent_inode(const struct qstr *name, struct page *ipage) +void f2fs_do_make_empty_dir(struct inode *inode, struct inode *parent, + struct f2fs_dentry_ptr *d) { - struct f2fs_node *rn; + struct fscrypt_str dot = FSTR_INIT(".", 1); + struct fscrypt_str dotdot = FSTR_INIT("..", 2); - /* copy name info. 
to this inode page */ - rn = (struct f2fs_node *)page_address(ipage); - rn->i.i_namelen = cpu_to_le32(name->len); - memcpy(rn->i.i_name, name->name, name->len); - set_page_dirty(ipage); + /* update dirent of "." */ + f2fs_update_dentry(inode->i_ino, inode->i_mode, d, &dot, 0, 0); + + /* update dirent of ".." */ + f2fs_update_dentry(parent->i_ino, parent->i_mode, d, &dotdot, 0, 1); } static int make_empty_dir(struct inode *inode, - struct inode *parent, struct page *page) + struct inode *parent, struct folio *folio) { - struct page *dentry_page; + struct folio *dentry_folio; struct f2fs_dentry_block *dentry_blk; - struct f2fs_dir_entry *de; - void *kaddr; + struct f2fs_dentry_ptr d; - dentry_page = get_new_data_page(inode, page, 0, true); - if (IS_ERR(dentry_page)) - return PTR_ERR(dentry_page); + if (f2fs_has_inline_dentry(inode)) + return f2fs_make_empty_inline_dir(inode, parent, folio); - kaddr = kmap_atomic(dentry_page); - dentry_blk = (struct f2fs_dentry_block *)kaddr; + dentry_folio = f2fs_get_new_data_folio(inode, folio, 0, true); + if (IS_ERR(dentry_folio)) + return PTR_ERR(dentry_folio); - de = &dentry_blk->dentry[0]; - de->name_len = cpu_to_le16(1); - de->hash_code = 0; - de->ino = cpu_to_le32(inode->i_ino); - memcpy(dentry_blk->filename[0], ".", 1); - set_de_type(de, inode); - - de = &dentry_blk->dentry[1]; - de->hash_code = 0; - de->name_len = cpu_to_le16(2); - de->ino = cpu_to_le32(parent->i_ino); - memcpy(dentry_blk->filename[1], "..", 2); - set_de_type(de, inode); - - test_and_set_bit_le(0, &dentry_blk->dentry_bitmap); - test_and_set_bit_le(1, &dentry_blk->dentry_bitmap); - kunmap_atomic(kaddr); - - set_page_dirty(dentry_page); - f2fs_put_page(dentry_page, 1); + dentry_blk = folio_address(dentry_folio); + + make_dentry_ptr_block(NULL, &d, dentry_blk); + f2fs_do_make_empty_dir(inode, parent, &d); + + folio_mark_dirty(dentry_folio); + f2fs_folio_put(dentry_folio, true); return 0; } -static struct page *init_inode_metadata(struct inode *inode, - struct 
inode *dir, const struct qstr *name) +struct folio *f2fs_init_inode_metadata(struct inode *inode, struct inode *dir, + const struct f2fs_filename *fname, struct folio *dfolio) { - struct page *page; + struct folio *folio; int err; - if (is_inode_flag_set(F2FS_I(inode), FI_NEW_INODE)) { - page = new_inode_page(inode, name); - if (IS_ERR(page)) - return page; + if (is_inode_flag_set(inode, FI_NEW_INODE)) { + folio = f2fs_new_inode_folio(inode); + if (IS_ERR(folio)) + return folio; if (S_ISDIR(inode->i_mode)) { - err = make_empty_dir(inode, dir, page); - if (err) - goto error; + /* in order to handle error case */ + folio_get(folio); + err = make_empty_dir(inode, dir, folio); + if (err) { + folio_lock(folio); + goto put_error; + } + folio_put(folio); } - err = f2fs_init_acl(inode, dir); + err = f2fs_init_acl(inode, dir, folio, dfolio); if (err) - goto error; + goto put_error; - err = f2fs_init_security(inode, dir, name, page); + err = f2fs_init_security(inode, dir, + fname ? fname->usr_fname : NULL, + folio); if (err) - goto error; + goto put_error; - wait_on_page_writeback(page); + if (IS_ENCRYPTED(inode)) { + err = fscrypt_set_context(inode, folio); + if (err) + goto put_error; + } } else { - page = get_node_page(F2FS_SB(dir->i_sb), inode->i_ino); - if (IS_ERR(page)) - return page; - - wait_on_page_writeback(page); - set_cold_node(inode, page); + folio = f2fs_get_inode_folio(F2FS_I_SB(dir), inode->i_ino); + if (IS_ERR(folio)) + return folio; } - init_dent_inode(name, page); + init_dent_inode(dir, inode, fname, folio); /* * This file should be checkpointed during fsync. * We lost i_pino from now on. */ - if (is_inode_flag_set(F2FS_I(inode), FI_INC_LINK)) { - file_lost_pino(inode); - inc_nlink(inode); + if (is_inode_flag_set(inode, FI_INC_LINK)) { + if (!S_ISDIR(inode->i_mode)) + file_lost_pino(inode); + /* + * If link the tmpfile to alias through linkat path, + * we should remove this inode from orphan list. 
+ */ + if (inode->i_nlink == 0) + f2fs_remove_orphan_inode(F2FS_I_SB(dir), inode->i_ino); + f2fs_i_links_write(inode, true); } - return page; + return folio; -error: - f2fs_put_page(page, 1); - remove_inode_page(inode); +put_error: + clear_nlink(inode); + f2fs_update_inode(inode, folio); + f2fs_folio_put(folio, true); return ERR_PTR(err); } -static void update_parent_metadata(struct inode *dir, struct inode *inode, +void f2fs_update_parent_metadata(struct inode *dir, struct inode *inode, unsigned int current_depth) { - if (is_inode_flag_set(F2FS_I(inode), FI_NEW_INODE)) { - if (S_ISDIR(inode->i_mode)) { - inc_nlink(dir); - set_inode_flag(F2FS_I(dir), FI_UPDATE_DIR); - } - clear_inode_flag(F2FS_I(inode), FI_NEW_INODE); - } - dir->i_mtime = dir->i_ctime = CURRENT_TIME; - if (F2FS_I(dir)->i_current_depth != current_depth) { - F2FS_I(dir)->i_current_depth = current_depth; - set_inode_flag(F2FS_I(dir), FI_UPDATE_DIR); + if (inode && is_inode_flag_set(inode, FI_NEW_INODE)) { + if (S_ISDIR(inode->i_mode)) + f2fs_i_links_write(dir, true); + clear_inode_flag(inode, FI_NEW_INODE); } + inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); + f2fs_mark_inode_dirty_sync(dir, false); - if (is_inode_flag_set(F2FS_I(dir), FI_UPDATE_DIR)) - update_inode_page(dir); - else - mark_inode_dirty(dir); + if (F2FS_I(dir)->i_current_depth != current_depth) + f2fs_i_depth_write(dir, current_depth); - if (is_inode_flag_set(F2FS_I(inode), FI_INC_LINK)) - clear_inode_flag(F2FS_I(inode), FI_INC_LINK); + if (inode && is_inode_flag_set(inode, FI_INC_LINK)) + clear_inode_flag(inode, FI_INC_LINK); } -static int room_for_filename(struct f2fs_dentry_block *dentry_blk, int slots) +int f2fs_room_for_filename(const void *bitmap, int slots, int max_slots) { int bit_start = 0; int zero_start, zero_end; next: - zero_start = find_next_zero_bit_le(&dentry_blk->dentry_bitmap, - NR_DENTRY_IN_BLOCK, - bit_start); - if (zero_start >= NR_DENTRY_IN_BLOCK) - return NR_DENTRY_IN_BLOCK; + zero_start = 
find_next_zero_bit_le(bitmap, max_slots, bit_start); + if (zero_start >= max_slots) + return max_slots; - zero_end = find_next_bit_le(&dentry_blk->dentry_bitmap, - NR_DENTRY_IN_BLOCK, - zero_start); + zero_end = find_next_bit_le(bitmap, max_slots, zero_start); if (zero_end - zero_start >= slots) return zero_start; bit_start = zero_end + 1; - if (zero_end + 1 >= NR_DENTRY_IN_BLOCK) - return NR_DENTRY_IN_BLOCK; + if (zero_end + 1 >= max_slots) + return max_slots; goto next; } -/* - * Caller should grab and release a mutex by calling mutex_lock_op() and - * mutex_unlock_op(). - */ -int __f2fs_add_link(struct inode *dir, const struct qstr *name, struct inode *inode) +bool f2fs_has_enough_room(struct inode *dir, struct folio *ifolio, + const struct f2fs_filename *fname) +{ + struct f2fs_dentry_ptr d; + unsigned int bit_pos; + int slots = GET_DENTRY_SLOTS(fname->disk_name.len); + + make_dentry_ptr_inline(dir, &d, inline_data_addr(dir, ifolio)); + + bit_pos = f2fs_room_for_filename(d.bitmap, slots, d.max); + + return bit_pos < d.max; +} + +void f2fs_update_dentry(nid_t ino, umode_t mode, struct f2fs_dentry_ptr *d, + const struct fscrypt_str *name, f2fs_hash_t name_hash, + unsigned int bit_pos) +{ + struct f2fs_dir_entry *de; + int slots = GET_DENTRY_SLOTS(name->len); + int i; + + de = &d->dentry[bit_pos]; + de->hash_code = name_hash; + de->name_len = cpu_to_le16(name->len); + memcpy(d->filename[bit_pos], name->name, name->len); + de->ino = cpu_to_le32(ino); + de->file_type = fs_umode_to_ftype(mode); + for (i = 0; i < slots; i++) { + __set_bit_le(bit_pos + i, (void *)d->bitmap); + /* avoid wrong garbage data for readdir */ + if (i) + (de + i)->name_len = 0; + } +} + +int f2fs_add_regular_entry(struct inode *dir, const struct f2fs_filename *fname, + struct inode *inode, nid_t ino, umode_t mode) { unsigned int bit_pos; unsigned int level; unsigned int current_depth; unsigned long bidx, block; - f2fs_hash_t dentry_hash; - struct f2fs_dir_entry *de; unsigned int nbucket, 
nblock; - size_t namelen = name->len; - struct page *dentry_page = NULL; + struct folio *dentry_folio = NULL; struct f2fs_dentry_block *dentry_blk = NULL; - int slots = GET_DENTRY_SLOTS(namelen); - struct page *page; - int err = 0; - int i; + struct f2fs_dentry_ptr d; + struct folio *folio = NULL; + int slots, err = 0; - dentry_hash = f2fs_dentry_hash(name->name, name->len); level = 0; + slots = GET_DENTRY_SLOTS(fname->disk_name.len); + current_depth = F2FS_I(dir)->i_current_depth; - if (F2FS_I(dir)->chash == dentry_hash) { + if (F2FS_I(dir)->chash == fname->hash) { level = F2FS_I(dir)->clevel; F2FS_I(dir)->chash = 0; } start: - if (current_depth == MAX_DIR_HASH_DEPTH) + if (time_to_inject(F2FS_I_SB(dir), FAULT_DIR_DEPTH)) + return -ENOSPC; + + if (unlikely(current_depth == MAX_DIR_HASH_DEPTH)) return -ENOSPC; /* Increase the depth, if required */ if (level == current_depth) ++current_depth; - nbucket = dir_buckets(level); + nbucket = dir_buckets(level, F2FS_I(dir)->i_dir_level); nblock = bucket_blocks(level); - bidx = dir_block_index(level, (le32_to_cpu(dentry_hash) % nbucket)); + bidx = dir_block_index(level, F2FS_I(dir)->i_dir_level, + (le32_to_cpu(fname->hash) % nbucket)); for (block = bidx; block <= (bidx + nblock - 1); block++) { - dentry_page = get_new_data_page(dir, NULL, block, true); - if (IS_ERR(dentry_page)) - return PTR_ERR(dentry_page); + dentry_folio = f2fs_get_new_data_folio(dir, NULL, block, true); + if (IS_ERR(dentry_folio)) + return PTR_ERR(dentry_folio); - dentry_blk = kmap(dentry_page); - bit_pos = room_for_filename(dentry_blk, slots); + dentry_blk = folio_address(dentry_folio); + bit_pos = f2fs_room_for_filename(&dentry_blk->dentry_bitmap, + slots, NR_DENTRY_IN_BLOCK); if (bit_pos < NR_DENTRY_IN_BLOCK) goto add_dentry; - kunmap(dentry_page); - f2fs_put_page(dentry_page, 1); + f2fs_folio_put(dentry_folio, true); } /* Move to next level to find the empty slot for new dentry */ ++level; goto start; add_dentry: - 
wait_on_page_writeback(dentry_page); + f2fs_folio_wait_writeback(dentry_folio, DATA, true, true); - page = init_inode_metadata(inode, dir, name); - if (IS_ERR(page)) { - err = PTR_ERR(page); - goto fail; + if (inode) { + f2fs_down_write(&F2FS_I(inode)->i_sem); + folio = f2fs_init_inode_metadata(inode, dir, fname, NULL); + if (IS_ERR(folio)) { + err = PTR_ERR(folio); + goto fail; + } } - de = &dentry_blk->dentry[bit_pos]; - de->hash_code = dentry_hash; - de->name_len = cpu_to_le16(namelen); - memcpy(dentry_blk->filename[bit_pos], name->name, name->len); - de->ino = cpu_to_le32(inode->i_ino); - set_de_type(de, inode); - for (i = 0; i < slots; i++) - test_and_set_bit_le(bit_pos + i, &dentry_blk->dentry_bitmap); - set_page_dirty(dentry_page); - /* we don't need to mark_inode_dirty now */ - F2FS_I(inode)->i_pino = dir->i_ino; - update_inode(inode, page); - f2fs_put_page(page, 1); + make_dentry_ptr_block(NULL, &d, dentry_blk); + f2fs_update_dentry(ino, mode, &d, &fname->disk_name, fname->hash, + bit_pos); + + folio_mark_dirty(dentry_folio); + + if (inode) { + f2fs_i_pino_write(inode, dir->i_ino); + + /* synchronize inode page's data from inode cache */ + if (is_inode_flag_set(inode, FI_NEW_INODE)) + f2fs_update_inode(inode, folio); + + f2fs_folio_put(folio, true); + } + + f2fs_update_parent_metadata(dir, inode, current_depth); +fail: + if (inode) + f2fs_up_write(&F2FS_I(inode)->i_sem); + + f2fs_folio_put(dentry_folio, true); + + return err; +} + +int f2fs_add_dentry(struct inode *dir, const struct f2fs_filename *fname, + struct inode *inode, nid_t ino, umode_t mode) +{ + int err = -EAGAIN; + + if (f2fs_has_inline_dentry(dir)) { + /* + * Should get i_xattr_sem to keep the lock order: + * i_xattr_sem -> inode_page lock used by f2fs_setxattr. 
+ */ + f2fs_down_read(&F2FS_I(dir)->i_xattr_sem); + err = f2fs_add_inline_entry(dir, fname, inode, ino, mode); + f2fs_up_read(&F2FS_I(dir)->i_xattr_sem); + } + if (err == -EAGAIN) + err = f2fs_add_regular_entry(dir, fname, inode, ino, mode); + + f2fs_update_time(F2FS_I_SB(dir), REQ_TIME); + return err; +} + +/* + * Caller should grab and release a rwsem by calling f2fs_lock_op() and + * f2fs_unlock_op(). + */ +int f2fs_do_add_link(struct inode *dir, const struct qstr *name, + struct inode *inode, nid_t ino, umode_t mode) +{ + struct f2fs_filename fname; + struct folio *folio = NULL; + struct f2fs_dir_entry *de = NULL; + int err; + + err = f2fs_setup_filename(dir, name, 0, &fname); + if (err) + return err; - update_parent_metadata(dir, inode, current_depth); + /* + * An immature stackable filesystem shows a race condition between lookup + * and create. If we have same task when doing lookup and create, it's + * definitely fine as expected by VFS normally. Otherwise, let's just + * verify on-disk dentry one more time, which guarantees filesystem + * consistency more. 
+ */ + if (current != F2FS_I(dir)->task) { + de = __f2fs_find_entry(dir, &fname, &folio); + F2FS_I(dir)->task = NULL; + } + if (de) { + f2fs_folio_put(folio, false); + err = -EEXIST; + } else if (IS_ERR(folio)) { + err = PTR_ERR(folio); + } else { + err = f2fs_add_dentry(dir, &fname, inode, ino, mode); + } + f2fs_free_filename(&fname); + return err; +} + +int f2fs_do_tmpfile(struct inode *inode, struct inode *dir, + struct f2fs_filename *fname) +{ + struct folio *folio; + int err = 0; + + f2fs_down_write(&F2FS_I(inode)->i_sem); + folio = f2fs_init_inode_metadata(inode, dir, fname, NULL); + if (IS_ERR(folio)) { + err = PTR_ERR(folio); + goto fail; + } + f2fs_folio_put(folio, true); + + clear_inode_flag(inode, FI_NEW_INODE); + f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); fail: - clear_inode_flag(F2FS_I(dir), FI_UPDATE_DIR); - kunmap(dentry_page); - f2fs_put_page(dentry_page, 1); + f2fs_up_write(&F2FS_I(inode)->i_sem); return err; } +void f2fs_drop_nlink(struct inode *dir, struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(dir); + + f2fs_down_write(&F2FS_I(inode)->i_sem); + + if (S_ISDIR(inode->i_mode)) + f2fs_i_links_write(dir, false); + inode_set_ctime_current(inode); + + f2fs_i_links_write(inode, false); + if (S_ISDIR(inode->i_mode)) { + f2fs_i_links_write(inode, false); + f2fs_i_size_write(inode, 0); + } + f2fs_up_write(&F2FS_I(inode)->i_sem); + + if (inode->i_nlink == 0) + f2fs_add_orphan_inode(inode); + else + f2fs_release_orphan_inode(sbi); +} + /* - * It only removes the dentry from the dentry page,corresponding name + * It only removes the dentry from the dentry page, corresponding name * entry in name page does not need to be touched during deletion. 
*/ -void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, - struct inode *inode) +void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct folio *folio, + struct inode *dir, struct inode *inode) { - struct f2fs_dentry_block *dentry_blk; + struct f2fs_dentry_block *dentry_blk; unsigned int bit_pos; - struct address_space *mapping = page->mapping; - struct inode *dir = mapping->host; - struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); int slots = GET_DENTRY_SLOTS(le16_to_cpu(dentry->name_len)); - void *kaddr = page_address(page); + pgoff_t index = folio->index; int i; - lock_page(page); - wait_on_page_writeback(page); + f2fs_update_time(F2FS_I_SB(dir), REQ_TIME); + + if (F2FS_OPTION(F2FS_I_SB(dir)).fsync_mode == FSYNC_MODE_STRICT) + f2fs_add_ino_entry(F2FS_I_SB(dir), dir->i_ino, TRANS_DIR_INO); + + if (f2fs_has_inline_dentry(dir)) + return f2fs_delete_inline_entry(dentry, folio, dir, inode); - dentry_blk = (struct f2fs_dentry_block *)kaddr; - bit_pos = dentry - (struct f2fs_dir_entry *)dentry_blk->dentry; + folio_lock(folio); + f2fs_folio_wait_writeback(folio, DATA, true, true); + + dentry_blk = folio_address(folio); + bit_pos = dentry - dentry_blk->dentry; for (i = 0; i < slots; i++) - test_and_clear_bit_le(bit_pos + i, &dentry_blk->dentry_bitmap); + __clear_bit_le(bit_pos + i, &dentry_blk->dentry_bitmap); /* Let's check and deallocate this dentry page */ bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap, NR_DENTRY_IN_BLOCK, 0); - kunmap(page); /* kunmap - pair of f2fs_find_entry */ - set_page_dirty(page); + folio_mark_dirty(folio); - dir->i_ctime = dir->i_mtime = CURRENT_TIME; + if (bit_pos == NR_DENTRY_IN_BLOCK && + !f2fs_truncate_hole(dir, index, index + 1)) { + f2fs_clear_page_cache_dirty_tag(folio); + folio_clear_dirty_for_io(folio); + folio_clear_uptodate(folio); + folio_detach_private(folio); - if (inode && S_ISDIR(inode->i_mode)) { - drop_nlink(dir); - update_inode_page(dir); - } else { - mark_inode_dirty(dir); + 
inode_dec_dirty_pages(dir); + f2fs_remove_dirty_inode(dir); } + f2fs_folio_put(folio, true); - if (inode) { - inode->i_ctime = CURRENT_TIME; - drop_nlink(inode); - if (S_ISDIR(inode->i_mode)) { - drop_nlink(inode); - i_size_write(inode, 0); - } - update_inode_page(inode); - - if (inode->i_nlink == 0) - add_orphan_inode(sbi, inode->i_ino); - } + inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); + f2fs_mark_inode_dirty_sync(dir, false); - if (bit_pos == NR_DENTRY_IN_BLOCK) { - truncate_hole(dir, page->index, page->index + 1); - clear_page_dirty_for_io(page); - ClearPageUptodate(page); - dec_page_count(sbi, F2FS_DIRTY_DENTS); - inode_dec_dirty_dents(dir); - } - f2fs_put_page(page, 1); + if (inode) + f2fs_drop_nlink(dir, inode); } bool f2fs_empty_dir(struct inode *dir) { - unsigned long bidx; - struct page *dentry_page; + unsigned long bidx = 0; unsigned int bit_pos; - struct f2fs_dentry_block *dentry_blk; + struct f2fs_dentry_block *dentry_blk; unsigned long nblock = dir_blocks(dir); - for (bidx = 0; bidx < nblock; bidx++) { - void *kaddr; - dentry_page = get_lock_data_page(dir, bidx); - if (IS_ERR(dentry_page)) { - if (PTR_ERR(dentry_page) == -ENOENT) + if (f2fs_has_inline_dentry(dir)) + return f2fs_empty_inline_dir(dir); + + while (bidx < nblock) { + pgoff_t next_pgofs; + struct folio *dentry_folio; + + dentry_folio = f2fs_find_data_folio(dir, bidx, &next_pgofs); + if (IS_ERR(dentry_folio)) { + if (PTR_ERR(dentry_folio) == -ENOENT) { + bidx = next_pgofs; continue; - else + } else { return false; + } } - kaddr = kmap_atomic(dentry_page); - dentry_blk = (struct f2fs_dentry_block *)kaddr; + dentry_blk = folio_address(dentry_folio); if (bidx == 0) bit_pos = 2; else @@ -596,75 +958,182 @@ bool f2fs_empty_dir(struct inode *dir) bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap, NR_DENTRY_IN_BLOCK, bit_pos); - kunmap_atomic(kaddr); - f2fs_put_page(dentry_page, 1); + f2fs_folio_put(dentry_folio, false); if (bit_pos < NR_DENTRY_IN_BLOCK) return false; + + bidx++; 
} return true; } +int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, + unsigned int start_pos, struct fscrypt_str *fstr) +{ + unsigned char d_type = DT_UNKNOWN; + unsigned int bit_pos; + struct f2fs_dir_entry *de = NULL; + struct fscrypt_str de_name = FSTR_INIT(NULL, 0); + struct f2fs_sb_info *sbi = F2FS_I_SB(d->inode); + struct blk_plug plug; + bool readdir_ra = sbi->readdir_ra; + bool found_valid_dirent = false; + int err = 0; + + bit_pos = ((unsigned long)ctx->pos % d->max); + + if (readdir_ra) + blk_start_plug(&plug); + + while (bit_pos < d->max) { + bit_pos = find_next_bit_le(d->bitmap, d->max, bit_pos); + if (bit_pos >= d->max) + break; + + de = &d->dentry[bit_pos]; + if (de->name_len == 0) { + if (found_valid_dirent || !bit_pos) { + f2fs_warn_ratelimited(sbi, + "invalid namelen(0), ino:%u, run fsck to fix.", + le32_to_cpu(de->ino)); + set_sbi_flag(sbi, SBI_NEED_FSCK); + } + bit_pos++; + ctx->pos = start_pos + bit_pos; + continue; + } + + d_type = fs_ftype_to_dtype(de->file_type); + + de_name.name = d->filename[bit_pos]; + de_name.len = le16_to_cpu(de->name_len); + + /* check memory boundary before moving forward */ + bit_pos += GET_DENTRY_SLOTS(le16_to_cpu(de->name_len)); + if (unlikely(bit_pos > d->max || + le16_to_cpu(de->name_len) > F2FS_NAME_LEN)) { + f2fs_warn(sbi, "%s: corrupted namelen=%d, run fsck to fix.", + __func__, le16_to_cpu(de->name_len)); + set_sbi_flag(sbi, SBI_NEED_FSCK); + err = -EFSCORRUPTED; + f2fs_handle_error(sbi, ERROR_CORRUPTED_DIRENT); + goto out; + } + + if (IS_ENCRYPTED(d->inode)) { + int save_len = fstr->len; + + err = fscrypt_fname_disk_to_usr(d->inode, + (u32)le32_to_cpu(de->hash_code), + 0, &de_name, fstr); + if (err) + goto out; + + de_name = *fstr; + fstr->len = save_len; + } + + if (!dir_emit(ctx, de_name.name, de_name.len, + le32_to_cpu(de->ino), d_type)) { + err = 1; + goto out; + } + + if (readdir_ra) + f2fs_ra_node_page(sbi, le32_to_cpu(de->ino)); + + ctx->pos = start_pos + bit_pos; + 
found_valid_dirent = true; + } +out: + if (readdir_ra) + blk_finish_plug(&plug); + return err; +} + static int f2fs_readdir(struct file *file, struct dir_context *ctx) { struct inode *inode = file_inode(file); unsigned long npages = dir_blocks(inode); - unsigned int bit_pos = 0; struct f2fs_dentry_block *dentry_blk = NULL; - struct f2fs_dir_entry *de = NULL; - struct page *dentry_page = NULL; + struct file_ra_state *ra = &file->f_ra; + loff_t start_pos = ctx->pos; unsigned int n = ((unsigned long)ctx->pos / NR_DENTRY_IN_BLOCK); - unsigned char d_type = DT_UNKNOWN; + struct f2fs_dentry_ptr d; + struct fscrypt_str fstr = FSTR_INIT(NULL, 0); + int err = 0; - bit_pos = ((unsigned long)ctx->pos % NR_DENTRY_IN_BLOCK); + if (IS_ENCRYPTED(inode)) { + err = fscrypt_prepare_readdir(inode); + if (err) + goto out; - for ( ; n < npages; n++) { - dentry_page = get_lock_data_page(inode, n); - if (IS_ERR(dentry_page)) - continue; + err = fscrypt_fname_alloc_buffer(F2FS_NAME_LEN, &fstr); + if (err < 0) + goto out; + } - dentry_blk = kmap(dentry_page); - while (bit_pos < NR_DENTRY_IN_BLOCK) { - bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap, - NR_DENTRY_IN_BLOCK, - bit_pos); - if (bit_pos >= NR_DENTRY_IN_BLOCK) - break; + if (f2fs_has_inline_dentry(inode)) { + err = f2fs_read_inline_dir(file, ctx, &fstr); + goto out_free; + } - de = &dentry_blk->dentry[bit_pos]; - if (de->file_type < F2FS_FT_MAX) - d_type = f2fs_filetype_table[de->file_type]; - else - d_type = DT_UNKNOWN; - if (!dir_emit(ctx, - dentry_blk->filename[bit_pos], - le16_to_cpu(de->name_len), - le32_to_cpu(de->ino), d_type)) - goto stop; - - bit_pos += GET_DENTRY_SLOTS(le16_to_cpu(de->name_len)); - ctx->pos = n * NR_DENTRY_IN_BLOCK + bit_pos; + for (; n < npages; ctx->pos = n * NR_DENTRY_IN_BLOCK) { + struct folio *dentry_folio; + pgoff_t next_pgofs; + + /* allow readdir() to be interrupted */ + if (fatal_signal_pending(current)) { + err = -ERESTARTSYS; + goto out_free; + } + cond_resched(); + + /* readahead for 
multi pages of dir */ + if (npages - n > 1 && !ra_has_index(ra, n)) + page_cache_sync_readahead(inode->i_mapping, ra, file, n, + min(npages - n, (pgoff_t)MAX_DIR_RA_PAGES)); + + dentry_folio = f2fs_find_data_folio(inode, n, &next_pgofs); + if (IS_ERR(dentry_folio)) { + err = PTR_ERR(dentry_folio); + if (err == -ENOENT) { + err = 0; + n = next_pgofs; + continue; + } else { + goto out_free; + } } - bit_pos = 0; - ctx->pos = (n + 1) * NR_DENTRY_IN_BLOCK; - kunmap(dentry_page); - f2fs_put_page(dentry_page, 1); - dentry_page = NULL; - } -stop: - if (dentry_page && !IS_ERR(dentry_page)) { - kunmap(dentry_page); - f2fs_put_page(dentry_page, 1); - } - return 0; + dentry_blk = folio_address(dentry_folio); + + make_dentry_ptr_block(inode, &d, dentry_blk); + + err = f2fs_fill_dentries(ctx, &d, + n * NR_DENTRY_IN_BLOCK, &fstr); + f2fs_folio_put(dentry_folio, false); + if (err) + break; + + n++; + } +out_free: + fscrypt_fname_free_buffer(&fstr); +out: + trace_f2fs_readdir(inode, start_pos, ctx->pos, err); + return err < 0 ? err : 0; } const struct file_operations f2fs_dir_operations = { .llseek = generic_file_llseek, .read = generic_read_dir, - .iterate = f2fs_readdir, + .iterate_shared = f2fs_readdir, .fsync = f2fs_sync_file, .unlocked_ioctl = f2fs_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = f2fs_compat_ioctl, +#endif }; diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c new file mode 100644 index 000000000000..0ed84cc065a7 --- /dev/null +++ b/fs/f2fs/extent_cache.c @@ -0,0 +1,1255 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * f2fs extent cache support + * + * Copyright (c) 2015 Motorola Mobility + * Copyright (c) 2015 Samsung Electronics + * Authors: Jaegeuk Kim <jaegeuk@kernel.org> + * Chao Yu <chao2.yu@samsung.com> + * + * block_age-based extent cache added by: + * Copyright (c) 2022 xiaomi Co., Ltd. 
+ * http://www.xiaomi.com/ + */ + +#include <linux/fs.h> +#include <linux/f2fs_fs.h> + +#include "f2fs.h" +#include "node.h" +#include <trace/events/f2fs.h> + +bool sanity_check_extent_cache(struct inode *inode, struct folio *ifolio) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct f2fs_extent *i_ext = &F2FS_INODE(ifolio)->i_ext; + struct extent_info ei; + int devi; + + get_read_extent_info(&ei, i_ext); + + if (!ei.len) + return true; + + if (!f2fs_is_valid_blkaddr(sbi, ei.blk, DATA_GENERIC_ENHANCE) || + !f2fs_is_valid_blkaddr(sbi, ei.blk + ei.len - 1, + DATA_GENERIC_ENHANCE)) { + f2fs_warn(sbi, "%s: inode (ino=%lx) extent info [%u, %u, %u] is incorrect, run fsck to fix", + __func__, inode->i_ino, + ei.blk, ei.fofs, ei.len); + return false; + } + + if (!IS_DEVICE_ALIASING(inode)) + return true; + + for (devi = 0; devi < sbi->s_ndevs; devi++) { + if (FDEV(devi).start_blk != ei.blk || + FDEV(devi).end_blk != ei.blk + ei.len - 1) + continue; + + if (devi == 0) { + f2fs_warn(sbi, + "%s: inode (ino=%lx) is an alias of meta device", + __func__, inode->i_ino); + return false; + } + + if (bdev_is_zoned(FDEV(devi).bdev)) { + f2fs_warn(sbi, + "%s: device alias inode (ino=%lx)'s extent info " + "[%u, %u, %u] maps to zoned block device", + __func__, inode->i_ino, ei.blk, ei.fofs, ei.len); + return false; + } + return true; + } + + f2fs_warn(sbi, "%s: device alias inode (ino=%lx)'s extent info " + "[%u, %u, %u] is inconsistent w/ any devices", + __func__, inode->i_ino, ei.blk, ei.fofs, ei.len); + return false; +} + +static void __set_extent_info(struct extent_info *ei, + unsigned int fofs, unsigned int len, + block_t blk, bool keep_clen, + unsigned long age, unsigned long last_blocks, + enum extent_type type) +{ + ei->fofs = fofs; + ei->len = len; + + if (type == EX_READ) { + ei->blk = blk; + if (keep_clen) + return; +#ifdef CONFIG_F2FS_FS_COMPRESSION + ei->c_len = 0; +#endif + } else if (type == EX_BLOCK_AGE) { + ei->age = age; + ei->last_blocks = last_blocks; + } +} 
+ +static bool __init_may_extent_tree(struct inode *inode, enum extent_type type) +{ + if (type == EX_READ) + return test_opt(F2FS_I_SB(inode), READ_EXTENT_CACHE) && + S_ISREG(inode->i_mode); + if (type == EX_BLOCK_AGE) + return test_opt(F2FS_I_SB(inode), AGE_EXTENT_CACHE) && + (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)); + return false; +} + +static bool __may_extent_tree(struct inode *inode, enum extent_type type) +{ + if (IS_DEVICE_ALIASING(inode) && type == EX_READ) + return true; + + /* + * for recovered files during mount do not create extents + * if shrinker is not registered. + */ + if (list_empty(&F2FS_I_SB(inode)->s_list)) + return false; + + if (!__init_may_extent_tree(inode, type)) + return false; + + if (type == EX_READ) { + if (is_inode_flag_set(inode, FI_NO_EXTENT)) + return false; + if (is_inode_flag_set(inode, FI_COMPRESSED_FILE) && + !f2fs_sb_has_readonly(F2FS_I_SB(inode))) + return false; + } else if (type == EX_BLOCK_AGE) { + if (is_inode_flag_set(inode, FI_COMPRESSED_FILE)) + return false; + if (file_is_cold(inode)) + return false; + } + return true; +} + +static void __try_update_largest_extent(struct extent_tree *et, + struct extent_node *en) +{ + if (et->type != EX_READ) + return; + if (en->ei.len <= et->largest.len) + return; + + et->largest = en->ei; + et->largest_updated = true; +} + +static bool __is_extent_mergeable(struct extent_info *back, + struct extent_info *front, enum extent_type type) +{ + if (type == EX_READ) { +#ifdef CONFIG_F2FS_FS_COMPRESSION + if (back->c_len && back->len != back->c_len) + return false; + if (front->c_len && front->len != front->c_len) + return false; +#endif + return (back->fofs + back->len == front->fofs && + back->blk + back->len == front->blk); + } else if (type == EX_BLOCK_AGE) { + return (back->fofs + back->len == front->fofs && + abs(back->age - front->age) <= SAME_AGE_REGION && + abs(back->last_blocks - front->last_blocks) <= + SAME_AGE_REGION); + } + return false; +} + +static bool 
__is_back_mergeable(struct extent_info *cur, + struct extent_info *back, enum extent_type type) +{ + return __is_extent_mergeable(back, cur, type); +} + +static bool __is_front_mergeable(struct extent_info *cur, + struct extent_info *front, enum extent_type type) +{ + return __is_extent_mergeable(cur, front, type); +} + +static struct extent_node *__lookup_extent_node(struct rb_root_cached *root, + struct extent_node *cached_en, unsigned int fofs) +{ + struct rb_node *node = root->rb_root.rb_node; + struct extent_node *en; + + /* check a cached entry */ + if (cached_en && cached_en->ei.fofs <= fofs && + cached_en->ei.fofs + cached_en->ei.len > fofs) + return cached_en; + + /* check rb_tree */ + while (node) { + en = rb_entry(node, struct extent_node, rb_node); + + if (fofs < en->ei.fofs) + node = node->rb_left; + else if (fofs >= en->ei.fofs + en->ei.len) + node = node->rb_right; + else + return en; + } + return NULL; +} + +/* + * lookup rb entry in position of @fofs in rb-tree, + * if hit, return the entry, otherwise, return NULL + * @prev_ex: extent before fofs + * @next_ex: extent after fofs + * @insert_p: insert point for new extent at fofs + * in order to simplify the insertion after. + * tree must stay unchanged between lookup and insertion. 
+ */ +static struct extent_node *__lookup_extent_node_ret(struct rb_root_cached *root, + struct extent_node *cached_en, + unsigned int fofs, + struct extent_node **prev_entry, + struct extent_node **next_entry, + struct rb_node ***insert_p, + struct rb_node **insert_parent, + bool *leftmost) +{ + struct rb_node **pnode = &root->rb_root.rb_node; + struct rb_node *parent = NULL, *tmp_node; + struct extent_node *en = cached_en; + + *insert_p = NULL; + *insert_parent = NULL; + *prev_entry = NULL; + *next_entry = NULL; + + if (RB_EMPTY_ROOT(&root->rb_root)) + return NULL; + + if (en && en->ei.fofs <= fofs && en->ei.fofs + en->ei.len > fofs) + goto lookup_neighbors; + + *leftmost = true; + + while (*pnode) { + parent = *pnode; + en = rb_entry(*pnode, struct extent_node, rb_node); + + if (fofs < en->ei.fofs) { + pnode = &(*pnode)->rb_left; + } else if (fofs >= en->ei.fofs + en->ei.len) { + pnode = &(*pnode)->rb_right; + *leftmost = false; + } else { + goto lookup_neighbors; + } + } + + *insert_p = pnode; + *insert_parent = parent; + + en = rb_entry(parent, struct extent_node, rb_node); + tmp_node = parent; + if (parent && fofs > en->ei.fofs) + tmp_node = rb_next(parent); + *next_entry = rb_entry_safe(tmp_node, struct extent_node, rb_node); + + tmp_node = parent; + if (parent && fofs < en->ei.fofs) + tmp_node = rb_prev(parent); + *prev_entry = rb_entry_safe(tmp_node, struct extent_node, rb_node); + return NULL; + +lookup_neighbors: + if (fofs == en->ei.fofs) { + /* lookup prev node for merging backward later */ + tmp_node = rb_prev(&en->rb_node); + *prev_entry = rb_entry_safe(tmp_node, + struct extent_node, rb_node); + } + if (fofs == en->ei.fofs + en->ei.len - 1) { + /* lookup next node for merging frontward later */ + tmp_node = rb_next(&en->rb_node); + *next_entry = rb_entry_safe(tmp_node, + struct extent_node, rb_node); + } + return en; +} + +static struct kmem_cache *extent_tree_slab; +static struct kmem_cache *extent_node_slab; + +static struct extent_node 
*__attach_extent_node(struct f2fs_sb_info *sbi, + struct extent_tree *et, struct extent_info *ei, + struct rb_node *parent, struct rb_node **p, + bool leftmost) +{ + struct extent_tree_info *eti = &sbi->extent_tree[et->type]; + struct extent_node *en; + + en = f2fs_kmem_cache_alloc(extent_node_slab, GFP_ATOMIC, false, sbi); + if (!en) + return NULL; + + en->ei = *ei; + INIT_LIST_HEAD(&en->list); + en->et = et; + + rb_link_node(&en->rb_node, parent, p); + rb_insert_color_cached(&en->rb_node, &et->root, leftmost); + atomic_inc(&et->node_cnt); + atomic_inc(&eti->total_ext_node); + return en; +} + +static void __detach_extent_node(struct f2fs_sb_info *sbi, + struct extent_tree *et, struct extent_node *en) +{ + struct extent_tree_info *eti = &sbi->extent_tree[et->type]; + + rb_erase_cached(&en->rb_node, &et->root); + atomic_dec(&et->node_cnt); + atomic_dec(&eti->total_ext_node); + + if (et->cached_en == en) + et->cached_en = NULL; + kmem_cache_free(extent_node_slab, en); +} + +/* + * Flow to release an extent_node: + * 1. list_del_init + * 2. __detach_extent_node + * 3. kmem_cache_free. 
+ */ +static void __release_extent_node(struct f2fs_sb_info *sbi, + struct extent_tree *et, struct extent_node *en) +{ + struct extent_tree_info *eti = &sbi->extent_tree[et->type]; + + spin_lock(&eti->extent_lock); + f2fs_bug_on(sbi, list_empty(&en->list)); + list_del_init(&en->list); + spin_unlock(&eti->extent_lock); + + __detach_extent_node(sbi, et, en); +} + +static struct extent_tree *__grab_extent_tree(struct inode *inode, + enum extent_type type) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct extent_tree_info *eti = &sbi->extent_tree[type]; + struct extent_tree *et; + nid_t ino = inode->i_ino; + + mutex_lock(&eti->extent_tree_lock); + et = radix_tree_lookup(&eti->extent_tree_root, ino); + if (!et) { + et = f2fs_kmem_cache_alloc(extent_tree_slab, + GFP_NOFS, true, NULL); + f2fs_radix_tree_insert(&eti->extent_tree_root, ino, et); + memset(et, 0, sizeof(struct extent_tree)); + et->ino = ino; + et->type = type; + et->root = RB_ROOT_CACHED; + et->cached_en = NULL; + rwlock_init(&et->lock); + INIT_LIST_HEAD(&et->list); + atomic_set(&et->node_cnt, 0); + atomic_inc(&eti->total_ext_tree); + } else { + atomic_dec(&eti->total_zombie_tree); + list_del_init(&et->list); + } + mutex_unlock(&eti->extent_tree_lock); + + /* never died until evict_inode */ + F2FS_I(inode)->extent_tree[type] = et; + + return et; +} + +static unsigned int __free_extent_tree(struct f2fs_sb_info *sbi, + struct extent_tree *et, unsigned int nr_shrink) +{ + struct rb_node *node, *next; + struct extent_node *en; + unsigned int count; + + node = rb_first_cached(&et->root); + + for (count = 0; node && count < nr_shrink; count++) { + next = rb_next(node); + en = rb_entry(node, struct extent_node, rb_node); + __release_extent_node(sbi, et, en); + node = next; + } + + return count; +} + +static void __drop_largest_extent(struct extent_tree *et, + pgoff_t fofs, unsigned int len) +{ + if (fofs < (pgoff_t)et->largest.fofs + et->largest.len && + fofs + len > et->largest.fofs) { + et->largest.len = 
0; + et->largest_updated = true; + } +} + +void f2fs_init_read_extent_tree(struct inode *inode, struct folio *ifolio) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct extent_tree_info *eti = &sbi->extent_tree[EX_READ]; + struct f2fs_extent *i_ext = &F2FS_INODE(ifolio)->i_ext; + struct extent_tree *et; + struct extent_node *en; + struct extent_info ei = {0}; + + if (!__may_extent_tree(inode, EX_READ)) { + /* drop largest read extent */ + if (i_ext->len) { + f2fs_folio_wait_writeback(ifolio, NODE, true, true); + i_ext->len = 0; + folio_mark_dirty(ifolio); + } + set_inode_flag(inode, FI_NO_EXTENT); + return; + } + + et = __grab_extent_tree(inode, EX_READ); + + get_read_extent_info(&ei, i_ext); + + write_lock(&et->lock); + if (atomic_read(&et->node_cnt) || !ei.len) + goto skip; + + if (IS_DEVICE_ALIASING(inode)) { + et->largest = ei; + goto skip; + } + + en = __attach_extent_node(sbi, et, &ei, NULL, + &et->root.rb_root.rb_node, true); + if (en) { + et->largest = en->ei; + et->cached_en = en; + + spin_lock(&eti->extent_lock); + list_add_tail(&en->list, &eti->extent_list); + spin_unlock(&eti->extent_lock); + } +skip: + /* Let's drop, if checkpoint got corrupted. 
*/ + if (f2fs_cp_error(sbi)) { + et->largest.len = 0; + et->largest_updated = true; + } + write_unlock(&et->lock); +} + +void f2fs_init_age_extent_tree(struct inode *inode) +{ + if (!__init_may_extent_tree(inode, EX_BLOCK_AGE)) + return; + __grab_extent_tree(inode, EX_BLOCK_AGE); +} + +void f2fs_init_extent_tree(struct inode *inode) +{ + /* initialize read cache */ + if (__init_may_extent_tree(inode, EX_READ)) + __grab_extent_tree(inode, EX_READ); + + /* initialize block age cache */ + if (__init_may_extent_tree(inode, EX_BLOCK_AGE)) + __grab_extent_tree(inode, EX_BLOCK_AGE); +} + +static bool __lookup_extent_tree(struct inode *inode, pgoff_t pgofs, + struct extent_info *ei, enum extent_type type) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct extent_tree_info *eti = &sbi->extent_tree[type]; + struct extent_tree *et = F2FS_I(inode)->extent_tree[type]; + struct extent_node *en; + bool ret = false; + + if (!et) + return false; + + trace_f2fs_lookup_extent_tree_start(inode, pgofs, type); + + read_lock(&et->lock); + + if (type == EX_READ && + et->largest.fofs <= pgofs && + (pgoff_t)et->largest.fofs + et->largest.len > pgofs) { + *ei = et->largest; + ret = true; + stat_inc_largest_node_hit(sbi); + goto out; + } + + if (IS_DEVICE_ALIASING(inode)) { + ret = false; + goto out; + } + + en = __lookup_extent_node(&et->root, et->cached_en, pgofs); + if (!en) + goto out; + + if (en == et->cached_en) + stat_inc_cached_node_hit(sbi, type); + else + stat_inc_rbtree_node_hit(sbi, type); + + *ei = en->ei; + spin_lock(&eti->extent_lock); + if (!list_empty(&en->list)) { + list_move_tail(&en->list, &eti->extent_list); + et->cached_en = en; + } + spin_unlock(&eti->extent_lock); + ret = true; +out: + stat_inc_total_hit(sbi, type); + read_unlock(&et->lock); + + if (type == EX_READ) + trace_f2fs_lookup_read_extent_tree_end(inode, pgofs, ei); + else if (type == EX_BLOCK_AGE) + trace_f2fs_lookup_age_extent_tree_end(inode, pgofs, ei); + return ret; +} + +static struct extent_node 
*__try_merge_extent_node(struct f2fs_sb_info *sbi, + struct extent_tree *et, struct extent_info *ei, + struct extent_node *prev_ex, + struct extent_node *next_ex) +{ + struct extent_tree_info *eti = &sbi->extent_tree[et->type]; + struct extent_node *en = NULL; + + if (prev_ex && __is_back_mergeable(ei, &prev_ex->ei, et->type)) { + prev_ex->ei.len += ei->len; + ei = &prev_ex->ei; + en = prev_ex; + } + + if (next_ex && __is_front_mergeable(ei, &next_ex->ei, et->type)) { + next_ex->ei.fofs = ei->fofs; + next_ex->ei.len += ei->len; + if (et->type == EX_READ) + next_ex->ei.blk = ei->blk; + if (en) + __release_extent_node(sbi, et, prev_ex); + + en = next_ex; + } + + if (!en) + return NULL; + + __try_update_largest_extent(et, en); + + spin_lock(&eti->extent_lock); + if (!list_empty(&en->list)) { + list_move_tail(&en->list, &eti->extent_list); + et->cached_en = en; + } + spin_unlock(&eti->extent_lock); + return en; +} + +static struct extent_node *__insert_extent_tree(struct f2fs_sb_info *sbi, + struct extent_tree *et, struct extent_info *ei, + struct rb_node **insert_p, + struct rb_node *insert_parent, + bool leftmost) +{ + struct extent_tree_info *eti = &sbi->extent_tree[et->type]; + struct rb_node **p = &et->root.rb_root.rb_node; + struct rb_node *parent = NULL; + struct extent_node *en = NULL; + + if (insert_p && insert_parent) { + parent = insert_parent; + p = insert_p; + goto do_insert; + } + + leftmost = true; + + /* look up extent_node in the rb tree */ + while (*p) { + parent = *p; + en = rb_entry(parent, struct extent_node, rb_node); + + if (ei->fofs < en->ei.fofs) { + p = &(*p)->rb_left; + } else if (ei->fofs >= en->ei.fofs + en->ei.len) { + p = &(*p)->rb_right; + leftmost = false; + } else { + f2fs_err_ratelimited(sbi, "%s: corrupted extent, type: %d, " + "extent node in rb tree [%u, %u, %u], age [%llu, %llu], " + "extent node to insert [%u, %u, %u], age [%llu, %llu]", + __func__, et->type, en->ei.fofs, en->ei.blk, en->ei.len, en->ei.age, + en->ei.last_blocks, 
ei->fofs, ei->blk, ei->len, ei->age, ei->last_blocks); + f2fs_bug_on(sbi, 1); + return NULL; + } + } + +do_insert: + en = __attach_extent_node(sbi, et, ei, parent, p, leftmost); + if (!en) + return NULL; + + __try_update_largest_extent(et, en); + + /* update in global extent list */ + spin_lock(&eti->extent_lock); + list_add_tail(&en->list, &eti->extent_list); + et->cached_en = en; + spin_unlock(&eti->extent_lock); + return en; +} + +static unsigned int __destroy_extent_node(struct inode *inode, + enum extent_type type) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct extent_tree *et = F2FS_I(inode)->extent_tree[type]; + unsigned int nr_shrink = type == EX_READ ? + READ_EXTENT_CACHE_SHRINK_NUMBER : + AGE_EXTENT_CACHE_SHRINK_NUMBER; + unsigned int node_cnt = 0; + + if (!et || !atomic_read(&et->node_cnt)) + return 0; + + while (atomic_read(&et->node_cnt)) { + write_lock(&et->lock); + node_cnt += __free_extent_tree(sbi, et, nr_shrink); + write_unlock(&et->lock); + } + + f2fs_bug_on(sbi, atomic_read(&et->node_cnt)); + + return node_cnt; +} + +static void __update_extent_tree_range(struct inode *inode, + struct extent_info *tei, enum extent_type type) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct extent_tree *et = F2FS_I(inode)->extent_tree[type]; + struct extent_node *en = NULL, *en1 = NULL; + struct extent_node *prev_en = NULL, *next_en = NULL; + struct extent_info ei, dei, prev; + struct rb_node **insert_p = NULL, *insert_parent = NULL; + unsigned int fofs = tei->fofs, len = tei->len; + unsigned int end = fofs + len; + bool updated = false; + bool leftmost = false; + + if (!et) + return; + + if (unlikely(len == 0)) { + f2fs_err_ratelimited(sbi, "%s: extent len is zero, type: %d, " + "extent [%u, %u, %u], age [%llu, %llu]", + __func__, type, tei->fofs, tei->blk, tei->len, + tei->age, tei->last_blocks); + f2fs_bug_on(sbi, 1); + return; + } + + if (type == EX_READ) + trace_f2fs_update_read_extent_tree_range(inode, fofs, len, + tei->blk, 0); + else 
if (type == EX_BLOCK_AGE) + trace_f2fs_update_age_extent_tree_range(inode, fofs, len, + tei->age, tei->last_blocks); + + write_lock(&et->lock); + + if (type == EX_READ) { + if (is_inode_flag_set(inode, FI_NO_EXTENT)) { + write_unlock(&et->lock); + return; + } + + prev = et->largest; + dei.len = 0; + + /* + * drop largest extent before lookup, in case it's already + * been shrunk from extent tree + */ + __drop_largest_extent(et, fofs, len); + } + + /* 1. lookup first extent node in range [fofs, fofs + len - 1] */ + en = __lookup_extent_node_ret(&et->root, + et->cached_en, fofs, + &prev_en, &next_en, + &insert_p, &insert_parent, + &leftmost); + if (!en) + en = next_en; + + /* 2. invalidate all extent nodes in range [fofs, fofs + len - 1] */ + while (en && en->ei.fofs < end) { + unsigned int org_end; + int parts = 0; /* # of parts current extent split into */ + + next_en = en1 = NULL; + + dei = en->ei; + org_end = dei.fofs + dei.len; + f2fs_bug_on(sbi, fofs >= org_end); + + if (fofs > dei.fofs && (type != EX_READ || + fofs - dei.fofs >= F2FS_MIN_EXTENT_LEN)) { + en->ei.len = fofs - en->ei.fofs; + prev_en = en; + parts = 1; + } + + if (end < org_end && (type != EX_READ || + (org_end - end >= F2FS_MIN_EXTENT_LEN && + atomic_read(&et->node_cnt) < + sbi->max_read_extent_count))) { + if (parts) { + __set_extent_info(&ei, + end, org_end - end, + end - dei.fofs + dei.blk, false, + dei.age, dei.last_blocks, + type); + en1 = __insert_extent_tree(sbi, et, &ei, + NULL, NULL, true); + next_en = en1; + } else { + __set_extent_info(&en->ei, + end, en->ei.len - (end - dei.fofs), + en->ei.blk + (end - dei.fofs), true, + dei.age, dei.last_blocks, + type); + next_en = en; + } + parts++; + } + + if (!next_en) { + struct rb_node *node = rb_next(&en->rb_node); + + next_en = rb_entry_safe(node, struct extent_node, + rb_node); + } + + if (parts) + __try_update_largest_extent(et, en); + else + __release_extent_node(sbi, et, en); + + /* + * if original extent is split into zero or two parts, 
extent + * tree has been altered by deletion or insertion, therefore + * invalidate pointers regard to tree. + */ + if (parts != 1) { + insert_p = NULL; + insert_parent = NULL; + } + en = next_en; + } + + if (type == EX_BLOCK_AGE) + goto update_age_extent_cache; + + /* 3. update extent in read extent cache */ + BUG_ON(type != EX_READ); + + if (tei->blk) { + __set_extent_info(&ei, fofs, len, tei->blk, false, + 0, 0, EX_READ); + if (!__try_merge_extent_node(sbi, et, &ei, prev_en, next_en)) + __insert_extent_tree(sbi, et, &ei, + insert_p, insert_parent, leftmost); + + /* give up extent_cache, if split and small updates happen */ + if (dei.len >= 1 && + prev.len < F2FS_MIN_EXTENT_LEN && + et->largest.len < F2FS_MIN_EXTENT_LEN) { + et->largest.len = 0; + et->largest_updated = true; + set_inode_flag(inode, FI_NO_EXTENT); + } + } + + if (et->largest_updated) { + et->largest_updated = false; + updated = true; + } + goto out_read_extent_cache; +update_age_extent_cache: + if (tei->last_blocks == F2FS_EXTENT_AGE_INVALID) + goto out_read_extent_cache; + + __set_extent_info(&ei, fofs, len, 0, false, + tei->age, tei->last_blocks, EX_BLOCK_AGE); + if (!__try_merge_extent_node(sbi, et, &ei, prev_en, next_en)) + __insert_extent_tree(sbi, et, &ei, + insert_p, insert_parent, leftmost); +out_read_extent_cache: + write_unlock(&et->lock); + + if (is_inode_flag_set(inode, FI_NO_EXTENT)) + __destroy_extent_node(inode, EX_READ); + + if (updated) + f2fs_mark_inode_dirty_sync(inode, true); +} + +#ifdef CONFIG_F2FS_FS_COMPRESSION +void f2fs_update_read_extent_tree_range_compressed(struct inode *inode, + pgoff_t fofs, block_t blkaddr, unsigned int llen, + unsigned int c_len) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct extent_tree *et = F2FS_I(inode)->extent_tree[EX_READ]; + struct extent_node *en = NULL; + struct extent_node *prev_en = NULL, *next_en = NULL; + struct extent_info ei; + struct rb_node **insert_p = NULL, *insert_parent = NULL; + bool leftmost = false; + + 
trace_f2fs_update_read_extent_tree_range(inode, fofs, llen, + blkaddr, c_len); + + /* it is safe here to check FI_NO_EXTENT w/o et->lock in ro image */ + if (is_inode_flag_set(inode, FI_NO_EXTENT)) + return; + + write_lock(&et->lock); + + en = __lookup_extent_node_ret(&et->root, + et->cached_en, fofs, + &prev_en, &next_en, + &insert_p, &insert_parent, + &leftmost); + if (en) + goto unlock_out; + + __set_extent_info(&ei, fofs, llen, blkaddr, true, 0, 0, EX_READ); + ei.c_len = c_len; + + if (!__try_merge_extent_node(sbi, et, &ei, prev_en, next_en)) + __insert_extent_tree(sbi, et, &ei, + insert_p, insert_parent, leftmost); +unlock_out: + write_unlock(&et->lock); +} +#endif + +static unsigned long long __calculate_block_age(struct f2fs_sb_info *sbi, + unsigned long long new, + unsigned long long old) +{ + unsigned int rem_old, rem_new; + unsigned long long res; + unsigned int weight = sbi->last_age_weight; + + res = div_u64_rem(new, 100, &rem_new) * (100 - weight) + + div_u64_rem(old, 100, &rem_old) * weight; + + if (rem_new) + res += rem_new * (100 - weight) / 100; + if (rem_old) + res += rem_old * weight / 100; + + return res; +} + +/* This returns a new age and allocated blocks in ei */ +static int __get_new_block_age(struct inode *inode, struct extent_info *ei, + block_t blkaddr) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + loff_t f_size = i_size_read(inode); + unsigned long long cur_blocks = + atomic64_read(&sbi->allocated_data_blocks); + struct extent_info tei = *ei; /* only fofs and len are valid */ + + /* + * When I/O is not aligned to a PAGE_SIZE, update will happen to the last + * file block even in seq write. So don't record age for newly last file + * block here. 
+ */ + if ((f_size >> PAGE_SHIFT) == ei->fofs && f_size & (PAGE_SIZE - 1) && + blkaddr == NEW_ADDR) + return -EINVAL; + + if (__lookup_extent_tree(inode, ei->fofs, &tei, EX_BLOCK_AGE)) { + unsigned long long cur_age; + + if (cur_blocks >= tei.last_blocks) + cur_age = cur_blocks - tei.last_blocks; + else + /* allocated_data_blocks overflow */ + cur_age = (ULLONG_MAX - 1) - tei.last_blocks + cur_blocks; + + if (tei.age) + ei->age = __calculate_block_age(sbi, cur_age, tei.age); + else + ei->age = cur_age; + ei->last_blocks = cur_blocks; + WARN_ON(ei->age > cur_blocks); + return 0; + } + + f2fs_bug_on(sbi, blkaddr == NULL_ADDR); + + /* the data block was allocated for the first time */ + if (blkaddr == NEW_ADDR) + goto out; + + if (__is_valid_data_blkaddr(blkaddr) && + !f2fs_is_valid_blkaddr(sbi, blkaddr, DATA_GENERIC_ENHANCE)) + return -EINVAL; +out: + /* + * init block age with zero, this can happen when the block age extent + * was reclaimed due to memory constraint or system reboot + */ + ei->age = 0; + ei->last_blocks = cur_blocks; + return 0; +} + +static void __update_extent_cache(struct dnode_of_data *dn, enum extent_type type) +{ + struct extent_info ei = {}; + + if (!__may_extent_tree(dn->inode, type)) + return; + + ei.fofs = f2fs_start_bidx_of_node(ofs_of_node(dn->node_folio), dn->inode) + + dn->ofs_in_node; + ei.len = 1; + + if (type == EX_READ) { + if (dn->data_blkaddr == NEW_ADDR) + ei.blk = NULL_ADDR; + else + ei.blk = dn->data_blkaddr; + } else if (type == EX_BLOCK_AGE) { + if (__get_new_block_age(dn->inode, &ei, dn->data_blkaddr)) + return; + } + __update_extent_tree_range(dn->inode, &ei, type); +} + +static unsigned int __shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink, + enum extent_type type) +{ + struct extent_tree_info *eti = &sbi->extent_tree[type]; + struct extent_tree *et, *next; + struct extent_node *en; + unsigned int node_cnt = 0, tree_cnt = 0; + int remained; + + if (!atomic_read(&eti->total_zombie_tree)) + goto free_node; + + if 
(!mutex_trylock(&eti->extent_tree_lock)) + goto out; + + /* 1. remove unreferenced extent tree */ + list_for_each_entry_safe(et, next, &eti->zombie_list, list) { + if (atomic_read(&et->node_cnt)) { + write_lock(&et->lock); + node_cnt += __free_extent_tree(sbi, et, + nr_shrink - node_cnt - tree_cnt); + write_unlock(&et->lock); + } + + if (atomic_read(&et->node_cnt)) + goto unlock_out; + + list_del_init(&et->list); + radix_tree_delete(&eti->extent_tree_root, et->ino); + kmem_cache_free(extent_tree_slab, et); + atomic_dec(&eti->total_ext_tree); + atomic_dec(&eti->total_zombie_tree); + tree_cnt++; + + if (node_cnt + tree_cnt >= nr_shrink) + goto unlock_out; + cond_resched(); + } + mutex_unlock(&eti->extent_tree_lock); + +free_node: + /* 2. remove LRU extent entries */ + if (!mutex_trylock(&eti->extent_tree_lock)) + goto out; + + remained = nr_shrink - (node_cnt + tree_cnt); + + spin_lock(&eti->extent_lock); + for (; remained > 0; remained--) { + if (list_empty(&eti->extent_list)) + break; + en = list_first_entry(&eti->extent_list, + struct extent_node, list); + et = en->et; + if (!write_trylock(&et->lock)) { + /* refresh this extent node's position in extent list */ + list_move_tail(&en->list, &eti->extent_list); + continue; + } + + list_del_init(&en->list); + spin_unlock(&eti->extent_lock); + + __detach_extent_node(sbi, et, en); + + write_unlock(&et->lock); + node_cnt++; + spin_lock(&eti->extent_lock); + } + spin_unlock(&eti->extent_lock); + +unlock_out: + mutex_unlock(&eti->extent_tree_lock); +out: + trace_f2fs_shrink_extent_tree(sbi, node_cnt, tree_cnt, type); + + return node_cnt + tree_cnt; +} + +/* read extent cache operations */ +bool f2fs_lookup_read_extent_cache(struct inode *inode, pgoff_t pgofs, + struct extent_info *ei) +{ + if (!__may_extent_tree(inode, EX_READ)) + return false; + + return __lookup_extent_tree(inode, pgofs, ei, EX_READ); +} + +bool f2fs_lookup_read_extent_cache_block(struct inode *inode, pgoff_t index, + block_t *blkaddr) +{ + struct 
extent_info ei = {}; + + if (!f2fs_lookup_read_extent_cache(inode, index, &ei)) + return false; + *blkaddr = ei.blk + index - ei.fofs; + return true; +} + +void f2fs_update_read_extent_cache(struct dnode_of_data *dn) +{ + return __update_extent_cache(dn, EX_READ); +} + +void f2fs_update_read_extent_cache_range(struct dnode_of_data *dn, + pgoff_t fofs, block_t blkaddr, unsigned int len) +{ + struct extent_info ei = { + .fofs = fofs, + .len = len, + .blk = blkaddr, + }; + + if (!__may_extent_tree(dn->inode, EX_READ)) + return; + + __update_extent_tree_range(dn->inode, &ei, EX_READ); +} + +unsigned int f2fs_shrink_read_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink) +{ + if (!test_opt(sbi, READ_EXTENT_CACHE)) + return 0; + + return __shrink_extent_tree(sbi, nr_shrink, EX_READ); +} + +/* block age extent cache operations */ +bool f2fs_lookup_age_extent_cache(struct inode *inode, pgoff_t pgofs, + struct extent_info *ei) +{ + if (!__may_extent_tree(inode, EX_BLOCK_AGE)) + return false; + + return __lookup_extent_tree(inode, pgofs, ei, EX_BLOCK_AGE); +} + +void f2fs_update_age_extent_cache(struct dnode_of_data *dn) +{ + return __update_extent_cache(dn, EX_BLOCK_AGE); +} + +void f2fs_update_age_extent_cache_range(struct dnode_of_data *dn, + pgoff_t fofs, unsigned int len) +{ + struct extent_info ei = { + .fofs = fofs, + .len = len, + .last_blocks = F2FS_EXTENT_AGE_INVALID, + }; + + if (!__may_extent_tree(dn->inode, EX_BLOCK_AGE)) + return; + + __update_extent_tree_range(dn->inode, &ei, EX_BLOCK_AGE); +} + +unsigned int f2fs_shrink_age_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink) +{ + if (!test_opt(sbi, AGE_EXTENT_CACHE)) + return 0; + + return __shrink_extent_tree(sbi, nr_shrink, EX_BLOCK_AGE); +} + +void f2fs_destroy_extent_node(struct inode *inode) +{ + __destroy_extent_node(inode, EX_READ); + __destroy_extent_node(inode, EX_BLOCK_AGE); +} + +static void __drop_extent_tree(struct inode *inode, enum extent_type type) +{ + struct extent_tree *et = 
F2FS_I(inode)->extent_tree[type]; + bool updated = false; + + if (!__may_extent_tree(inode, type)) + return; + + write_lock(&et->lock); + if (type == EX_READ) { + set_inode_flag(inode, FI_NO_EXTENT); + if (et->largest.len) { + et->largest.len = 0; + updated = true; + } + } + write_unlock(&et->lock); + + __destroy_extent_node(inode, type); + + if (updated) + f2fs_mark_inode_dirty_sync(inode, true); +} + +void f2fs_drop_extent_tree(struct inode *inode) +{ + __drop_extent_tree(inode, EX_READ); + __drop_extent_tree(inode, EX_BLOCK_AGE); +} + +static void __destroy_extent_tree(struct inode *inode, enum extent_type type) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct extent_tree_info *eti = &sbi->extent_tree[type]; + struct extent_tree *et = F2FS_I(inode)->extent_tree[type]; + unsigned int node_cnt = 0; + + if (!et) + return; + + if (inode->i_nlink && !is_bad_inode(inode) && + atomic_read(&et->node_cnt)) { + mutex_lock(&eti->extent_tree_lock); + list_add_tail(&et->list, &eti->zombie_list); + atomic_inc(&eti->total_zombie_tree); + mutex_unlock(&eti->extent_tree_lock); + return; + } + + /* free all extent info belong to this extent tree */ + node_cnt = __destroy_extent_node(inode, type); + + /* delete extent tree entry in radix tree */ + mutex_lock(&eti->extent_tree_lock); + f2fs_bug_on(sbi, atomic_read(&et->node_cnt)); + radix_tree_delete(&eti->extent_tree_root, inode->i_ino); + kmem_cache_free(extent_tree_slab, et); + atomic_dec(&eti->total_ext_tree); + mutex_unlock(&eti->extent_tree_lock); + + F2FS_I(inode)->extent_tree[type] = NULL; + + trace_f2fs_destroy_extent_tree(inode, node_cnt, type); +} + +void f2fs_destroy_extent_tree(struct inode *inode) +{ + __destroy_extent_tree(inode, EX_READ); + __destroy_extent_tree(inode, EX_BLOCK_AGE); +} + +static void __init_extent_tree_info(struct extent_tree_info *eti) +{ + INIT_RADIX_TREE(&eti->extent_tree_root, GFP_NOIO); + mutex_init(&eti->extent_tree_lock); + INIT_LIST_HEAD(&eti->extent_list); + 
spin_lock_init(&eti->extent_lock); + atomic_set(&eti->total_ext_tree, 0); + INIT_LIST_HEAD(&eti->zombie_list); + atomic_set(&eti->total_zombie_tree, 0); + atomic_set(&eti->total_ext_node, 0); +} + +void f2fs_init_extent_cache_info(struct f2fs_sb_info *sbi) +{ + __init_extent_tree_info(&sbi->extent_tree[EX_READ]); + __init_extent_tree_info(&sbi->extent_tree[EX_BLOCK_AGE]); + + /* initialize for block age extents */ + atomic64_set(&sbi->allocated_data_blocks, 0); + sbi->hot_data_age_threshold = DEF_HOT_DATA_AGE_THRESHOLD; + sbi->warm_data_age_threshold = DEF_WARM_DATA_AGE_THRESHOLD; + sbi->last_age_weight = LAST_AGE_WEIGHT; + sbi->max_read_extent_count = DEF_MAX_READ_EXTENT_COUNT; +} + +int __init f2fs_create_extent_cache(void) +{ + extent_tree_slab = f2fs_kmem_cache_create("f2fs_extent_tree", + sizeof(struct extent_tree)); + if (!extent_tree_slab) + return -ENOMEM; + extent_node_slab = f2fs_kmem_cache_create("f2fs_extent_node", + sizeof(struct extent_node)); + if (!extent_node_slab) { + kmem_cache_destroy(extent_tree_slab); + return -ENOMEM; + } + return 0; +} + +void f2fs_destroy_extent_cache(void) +{ + kmem_cache_destroy(extent_node_slab); + kmem_cache_destroy(extent_tree_slab); +} diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 467d42d65c48..20edbb99b814 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1,37 +1,147 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * fs/f2fs/f2fs.h * * Copyright (c) 2012 Samsung Electronics Co., Ltd. * http://www.samsung.com/ - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. 
*/ #ifndef _LINUX_F2FS_H #define _LINUX_F2FS_H +#include <linux/uio.h> #include <linux/types.h> #include <linux/page-flags.h> -#include <linux/buffer_head.h> #include <linux/slab.h> #include <linux/crc32.h> #include <linux/magic.h> +#include <linux/kobject.h> +#include <linux/sched.h> +#include <linux/cred.h> +#include <linux/sched/mm.h> +#include <linux/vmalloc.h> +#include <linux/bio.h> +#include <linux/blkdev.h> +#include <linux/quotaops.h> +#include <linux/part_stat.h> +#include <linux/rw_hint.h> + +#include <linux/fscrypt.h> +#include <linux/fsverity.h> + +struct pagevec; + +#ifdef CONFIG_F2FS_CHECK_FS +#define f2fs_bug_on(sbi, condition) BUG_ON(condition) +#else +#define f2fs_bug_on(sbi, condition) \ + do { \ + if (WARN_ON(condition)) \ + set_sbi_flag(sbi, SBI_NEED_FSCK); \ + } while (0) +#endif + +enum { + FAULT_KMALLOC, + FAULT_KVMALLOC, + FAULT_PAGE_ALLOC, + FAULT_PAGE_GET, + FAULT_ALLOC_BIO, /* it's obsolete due to bio_alloc() will never fail */ + FAULT_ALLOC_NID, + FAULT_ORPHAN, + FAULT_BLOCK, + FAULT_DIR_DEPTH, + FAULT_EVICT_INODE, + FAULT_TRUNCATE, + FAULT_READ_IO, + FAULT_CHECKPOINT, + FAULT_DISCARD, + FAULT_WRITE_IO, + FAULT_SLAB_ALLOC, + FAULT_DQUOT_INIT, + FAULT_LOCK_OP, + FAULT_BLKADDR_VALIDITY, + FAULT_BLKADDR_CONSISTENCE, + FAULT_NO_SEGMENT, + FAULT_INCONSISTENT_FOOTER, + FAULT_TIMEOUT, + FAULT_VMALLOC, + FAULT_MAX, +}; + +/* indicate which option to update */ +enum fault_option { + FAULT_RATE = 1, /* only update fault rate */ + FAULT_TYPE = 2, /* only update fault type */ + FAULT_ALL = 4, /* reset all fault injection options/stats */ +}; + +#ifdef CONFIG_F2FS_FAULT_INJECTION +struct f2fs_fault_info { + atomic_t inject_ops; + int inject_rate; + unsigned int inject_type; + /* Used to account total count of injection for each type */ + unsigned int inject_count[FAULT_MAX]; +}; + +extern const char *f2fs_fault_name[FAULT_MAX]; +#define IS_FAULT_SET(fi, type) ((fi)->inject_type & BIT(type)) + +/* maximum retry count for injected failure */ +#define 
DEFAULT_FAILURE_RETRY_COUNT 8 +#else +#define DEFAULT_FAILURE_RETRY_COUNT 1 +#endif /* * For mount options */ -#define F2FS_MOUNT_BG_GC 0x00000001 -#define F2FS_MOUNT_DISABLE_ROLL_FORWARD 0x00000002 -#define F2FS_MOUNT_DISCARD 0x00000004 -#define F2FS_MOUNT_NOHEAP 0x00000008 -#define F2FS_MOUNT_XATTR_USER 0x00000010 -#define F2FS_MOUNT_POSIX_ACL 0x00000020 -#define F2FS_MOUNT_DISABLE_EXT_IDENTIFY 0x00000040 - -#define clear_opt(sbi, option) (sbi->mount_opt.opt &= ~F2FS_MOUNT_##option) -#define set_opt(sbi, option) (sbi->mount_opt.opt |= F2FS_MOUNT_##option) -#define test_opt(sbi, option) (sbi->mount_opt.opt & F2FS_MOUNT_##option) +enum f2fs_mount_opt { + F2FS_MOUNT_DISABLE_ROLL_FORWARD, + F2FS_MOUNT_DISCARD, + F2FS_MOUNT_NOHEAP, + F2FS_MOUNT_XATTR_USER, + F2FS_MOUNT_POSIX_ACL, + F2FS_MOUNT_DISABLE_EXT_IDENTIFY, + F2FS_MOUNT_INLINE_XATTR, + F2FS_MOUNT_INLINE_DATA, + F2FS_MOUNT_INLINE_DENTRY, + F2FS_MOUNT_FLUSH_MERGE, + F2FS_MOUNT_NOBARRIER, + F2FS_MOUNT_FASTBOOT, + F2FS_MOUNT_READ_EXTENT_CACHE, + F2FS_MOUNT_DATA_FLUSH, + F2FS_MOUNT_FAULT_INJECTION, + F2FS_MOUNT_USRQUOTA, + F2FS_MOUNT_GRPQUOTA, + F2FS_MOUNT_PRJQUOTA, + F2FS_MOUNT_QUOTA, + F2FS_MOUNT_INLINE_XATTR_SIZE, + F2FS_MOUNT_RESERVE_ROOT, + F2FS_MOUNT_DISABLE_CHECKPOINT, + F2FS_MOUNT_NORECOVERY, + F2FS_MOUNT_ATGC, + F2FS_MOUNT_MERGE_CHECKPOINT, + F2FS_MOUNT_GC_MERGE, + F2FS_MOUNT_COMPRESS_CACHE, + F2FS_MOUNT_AGE_EXTENT_CACHE, + F2FS_MOUNT_NAT_BITS, + F2FS_MOUNT_INLINECRYPT, + /* + * Some f2fs environments expect to be able to pass the "lazytime" option + * string rather than using the MS_LAZYTIME flag, so this must remain. 
+ */ + F2FS_MOUNT_LAZYTIME, + F2FS_MOUNT_RESERVE_NODE, +}; + +#define F2FS_OPTION(sbi) ((sbi)->mount_opt) +#define clear_opt(sbi, option) \ + (F2FS_OPTION(sbi).opt &= ~BIT(F2FS_MOUNT_##option)) +#define set_opt(sbi, option) \ + (F2FS_OPTION(sbi).opt |= BIT(F2FS_MOUNT_##option)) +#define test_opt(sbi, option) \ + (F2FS_OPTION(sbi).opt & BIT(F2FS_MOUNT_##option)) #define ver_after(a, b) (typecheck(unsigned long long, a) && \ typecheck(unsigned long long, b) && \ @@ -43,30 +153,114 @@ typedef u32 block_t; /* */ typedef u32 nid_t; -struct f2fs_mount_info { - unsigned int opt; +#define COMPRESS_EXT_NUM 16 + +enum blkzone_allocation_policy { + BLKZONE_ALLOC_PRIOR_SEQ, /* Prioritize writing to sequential zones */ + BLKZONE_ALLOC_ONLY_SEQ, /* Only allow writing to sequential zones */ + BLKZONE_ALLOC_PRIOR_CONV, /* Prioritize writing to conventional zones */ }; -#define CRCPOLY_LE 0xedb88320 +enum bggc_io_aware_policy { + AWARE_ALL_IO, /* skip background GC if there is any kind of pending IO */ + AWARE_READ_IO, /* skip background GC if there is pending read IO */ + AWARE_NONE, /* don't aware IO for background GC */ +}; -static inline __u32 f2fs_crc32(void *buf, size_t len) -{ - unsigned char *p = (unsigned char *)buf; - __u32 crc = F2FS_SUPER_MAGIC; - int i; +enum device_allocation_policy { + ALLOCATE_FORWARD_NOHINT, + ALLOCATE_FORWARD_WITHIN_HINT, + ALLOCATE_FORWARD_FROM_HINT, +}; - while (len--) { - crc ^= *p++; - for (i = 0; i < 8; i++) - crc = (crc >> 1) ^ ((crc & 1) ? CRCPOLY_LE : 0); - } - return crc; -} +/* + * An implementation of an rwsem that is explicitly unfair to readers. This + * prevents priority inversion when a low-priority reader acquires the read lock + * while sleeping on the write lock but the write lock is needed by + * higher-priority clients. 
+ */ -static inline bool f2fs_crc_valid(__u32 blk_crc, void *buf, size_t buf_size) -{ - return f2fs_crc32(buf, buf_size) == blk_crc; -} +struct f2fs_rwsem { + struct rw_semaphore internal_rwsem; +#ifdef CONFIG_F2FS_UNFAIR_RWSEM + wait_queue_head_t read_waiters; +#endif +}; + +struct f2fs_mount_info { + unsigned long long opt; + block_t root_reserved_blocks; /* root reserved blocks */ + block_t root_reserved_nodes; /* root reserved nodes */ + kuid_t s_resuid; /* reserved blocks for uid */ + kgid_t s_resgid; /* reserved blocks for gid */ + int active_logs; /* # of active logs */ + int inline_xattr_size; /* inline xattr size */ +#ifdef CONFIG_F2FS_FAULT_INJECTION + struct f2fs_fault_info fault_info; /* For fault injection */ +#endif +#ifdef CONFIG_QUOTA + /* Names of quota files with journalled quota */ + char *s_qf_names[MAXQUOTAS]; + int s_jquota_fmt; /* Format of quota to use */ +#endif + /* For which write hints are passed down to block layer */ + int alloc_mode; /* segment allocation policy */ + int fsync_mode; /* fsync policy */ + int fs_mode; /* fs mode: LFS or ADAPTIVE */ + int bggc_mode; /* bggc mode: off, on or sync */ + int memory_mode; /* memory mode */ + int errors; /* errors parameter */ + int discard_unit; /* + * discard command's offset/size should + * be aligned to this unit: block, + * segment or section + */ + struct fscrypt_dummy_policy dummy_enc_policy; /* test dummy encryption */ + block_t unusable_cap_perc; /* percentage for cap */ + block_t unusable_cap; /* Amount of space allowed to be + * unusable when disabling checkpoint + */ + + /* For compression */ + unsigned char compress_algorithm; /* algorithm type */ + unsigned char compress_log_size; /* cluster log size */ + unsigned char compress_level; /* compress level */ + bool compress_chksum; /* compressed data chksum */ + unsigned char compress_ext_cnt; /* extension count */ + unsigned char nocompress_ext_cnt; /* nocompress extension count */ + int compress_mode; /* compression mode */ + 
unsigned char extensions[COMPRESS_EXT_NUM][F2FS_EXTENSION_LEN]; /* extensions */ + unsigned char noextensions[COMPRESS_EXT_NUM][F2FS_EXTENSION_LEN]; /* extensions */ + unsigned int lookup_mode; +}; + +#define F2FS_FEATURE_ENCRYPT 0x00000001 +#define F2FS_FEATURE_BLKZONED 0x00000002 +#define F2FS_FEATURE_ATOMIC_WRITE 0x00000004 +#define F2FS_FEATURE_EXTRA_ATTR 0x00000008 +#define F2FS_FEATURE_PRJQUOTA 0x00000010 +#define F2FS_FEATURE_INODE_CHKSUM 0x00000020 +#define F2FS_FEATURE_FLEXIBLE_INLINE_XATTR 0x00000040 +#define F2FS_FEATURE_QUOTA_INO 0x00000080 +#define F2FS_FEATURE_INODE_CRTIME 0x00000100 +#define F2FS_FEATURE_LOST_FOUND 0x00000200 +#define F2FS_FEATURE_VERITY 0x00000400 +#define F2FS_FEATURE_SB_CHKSUM 0x00000800 +#define F2FS_FEATURE_CASEFOLD 0x00001000 +#define F2FS_FEATURE_COMPRESSION 0x00002000 +#define F2FS_FEATURE_RO 0x00004000 +#define F2FS_FEATURE_DEVICE_ALIAS 0x00008000 +#define F2FS_FEATURE_PACKED_SSA 0x00010000 + +#define __F2FS_HAS_FEATURE(raw_super, mask) \ + ((raw_super->feature & cpu_to_le32(mask)) != 0) +#define F2FS_HAS_FEATURE(sbi, mask) __F2FS_HAS_FEATURE(sbi->raw_super, mask) + +/* + * Default values for user and/or group using reserved blocks + */ +#define F2FS_DEF_RESUID 0 +#define F2FS_DEF_RESGID 0 /* * For checkpoint manager @@ -76,86 +270,548 @@ enum { SIT_BITMAP }; -/* for the list of orphan inodes */ -struct orphan_inode_entry { - struct list_head list; /* list head */ - nid_t ino; /* inode number */ +#define CP_UMOUNT 0x00000001 +#define CP_FASTBOOT 0x00000002 +#define CP_SYNC 0x00000004 +#define CP_RECOVERY 0x00000008 +#define CP_DISCARD 0x00000010 +#define CP_TRIMMED 0x00000020 +#define CP_PAUSE 0x00000040 +#define CP_RESIZE 0x00000080 + +#define DEF_MAX_DISCARD_REQUEST 8 /* issue 8 discards per round */ +#define DEF_MIN_DISCARD_ISSUE_TIME 50 /* 50 ms, if exists */ +#define DEF_MID_DISCARD_ISSUE_TIME 500 /* 500 ms, if device busy */ +#define DEF_MAX_DISCARD_ISSUE_TIME 60000 /* 60 s, if no candidates */ +#define 
DEF_DISCARD_URGENT_UTIL 80 /* do more discard over 80% */ +#define DEF_CP_INTERVAL 60 /* 60 secs */ +#define DEF_IDLE_INTERVAL 5 /* 5 secs */ +#define DEF_DISABLE_INTERVAL 5 /* 5 secs */ +#define DEF_ENABLE_INTERVAL 5 /* 5 secs */ +#define DEF_DISABLE_QUICK_INTERVAL 1 /* 1 secs */ +#define DEF_UMOUNT_DISCARD_TIMEOUT 5 /* 5 secs */ + +enum cp_time { + CP_TIME_START, /* begin */ + CP_TIME_LOCK, /* after cp_global_sem */ + CP_TIME_OP_LOCK, /* after block_operation */ + CP_TIME_FLUSH_META, /* after flush sit/nat */ + CP_TIME_SYNC_META, /* after sync_meta_pages */ + CP_TIME_SYNC_CP_META, /* after sync cp meta pages */ + CP_TIME_WAIT_DIRTY_META,/* after wait on dirty meta */ + CP_TIME_WAIT_CP_DATA, /* after wait on cp data */ + CP_TIME_FLUSH_DEVICE, /* after flush device cache */ + CP_TIME_WAIT_LAST_CP, /* after wait on last cp pack */ + CP_TIME_END, /* after unblock_operation */ + CP_TIME_MAX, +}; + +/* time cost stats of checkpoint */ +struct cp_stats { + ktime_t times[CP_TIME_MAX]; +}; + +struct cp_control { + int reason; + __u64 trim_start; + __u64 trim_end; + __u64 trim_minlen; + struct cp_stats stats; +}; + +enum f2fs_cp_phase { + CP_PHASE_START_BLOCK_OPS, + CP_PHASE_FINISH_BLOCK_OPS, + CP_PHASE_FINISH_CHECKPOINT, }; -/* for the list of directory inodes */ -struct dir_inode_entry { +/* + * indicate meta/data type + */ +enum { + META_CP, + META_NAT, + META_SIT, + META_SSA, + META_MAX, + META_POR, + DATA_GENERIC, /* check range only */ + DATA_GENERIC_ENHANCE, /* strong check on range and segment bitmap */ + DATA_GENERIC_ENHANCE_READ, /* + * strong check on range and segment + * bitmap but no warning due to race + * condition of read on truncated area + * by extent_cache + */ + DATA_GENERIC_ENHANCE_UPDATE, /* + * strong check on range and segment + * bitmap for update case + */ + META_GENERIC, +}; + +/* for the list of ino */ +enum { + ORPHAN_INO, /* for orphan ino list */ + APPEND_INO, /* for append ino list */ + UPDATE_INO, /* for update ino list */ + TRANS_DIR_INO, 
/* for transactions dir ino list */ + XATTR_DIR_INO, /* for xattr updated dir ino list */ + FLUSH_INO, /* for multiple device flushing */ + MAX_INO_ENTRY, /* max. list */ +}; + +struct ino_entry { + struct list_head list; /* list head */ + nid_t ino; /* inode number */ + unsigned int dirty_device; /* dirty device bitmap */ +}; + +/* for the list of inodes to be GCed */ +struct inode_entry { struct list_head list; /* list head */ struct inode *inode; /* vfs inode pointer */ }; +struct fsync_node_entry { + struct list_head list; /* list head */ + struct folio *folio; /* warm node folio pointer */ + unsigned int seq_id; /* sequence id */ +}; + +struct ckpt_req { + struct completion wait; /* completion for checkpoint done */ + struct llist_node llnode; /* llist_node to be linked in wait queue */ + int ret; /* return code of checkpoint */ + union { + ktime_t queue_time; /* request queued time */ + ktime_t delta_time; /* time in queue */ + }; +}; + +struct ckpt_req_control { + struct task_struct *f2fs_issue_ckpt; /* checkpoint task */ + int ckpt_thread_ioprio; /* checkpoint merge thread ioprio */ + wait_queue_head_t ckpt_wait_queue; /* waiting queue for wake-up */ + atomic_t issued_ckpt; /* # of actually issued ckpts */ + atomic_t total_ckpt; /* # of total ckpts */ + atomic_t queued_ckpt; /* # of queued ckpts */ + struct llist_head issue_list; /* list for command issue */ + spinlock_t stat_lock; /* lock for below checkpoint time stats */ + unsigned int cur_time; /* cur wait time in msec for currently issued checkpoint */ + unsigned int peak_time; /* peak wait time in msec until now */ +}; + +/* a time threshold that checkpoint was blocked for, unit: ms */ +#define CP_LONG_LATENCY_THRESHOLD 5000 + +/* for the bitmap indicate blocks to be discarded */ +struct discard_entry { + struct list_head list; /* list head */ + block_t start_blkaddr; /* start blockaddr of current segment */ + unsigned char discard_map[SIT_VBLOCK_MAP_SIZE]; /* segment discard bitmap */ +}; + +/* 
minimum discard granularity, unit: block count */ +#define MIN_DISCARD_GRANULARITY 1 +/* default discard granularity of inner discard thread, unit: block count */ +#define DEFAULT_DISCARD_GRANULARITY 16 +/* default maximum discard granularity of ordered discard, unit: block count */ +#define DEFAULT_MAX_ORDERED_DISCARD_GRANULARITY 16 +/* default interval of periodical discard submission */ +#define DEFAULT_DISCARD_INTERVAL (msecs_to_jiffies(20)) + +/* max discard pend list number */ +#define MAX_PLIST_NUM 512 +#define plist_idx(blk_num) ((blk_num) >= MAX_PLIST_NUM ? \ + (MAX_PLIST_NUM - 1) : ((blk_num) - 1)) + +enum { + D_PREP, /* initial */ + D_PARTIAL, /* partially submitted */ + D_SUBMIT, /* all submitted */ + D_DONE, /* finished */ +}; + +struct discard_info { + block_t lstart; /* logical start address */ + block_t len; /* length */ + block_t start; /* actual start address in dev */ +}; + +struct discard_cmd { + struct rb_node rb_node; /* rb node located in rb-tree */ + struct discard_info di; /* discard info */ + struct list_head list; /* command list */ + struct completion wait; /* completion */ + struct block_device *bdev; /* bdev */ + unsigned short ref; /* reference count */ + unsigned char state; /* state */ + unsigned char queued; /* queued discard */ + int error; /* bio error */ + spinlock_t lock; /* for state/bio_ref updating */ + unsigned short bio_ref; /* bio reference count */ +}; + +enum { + DPOLICY_BG, + DPOLICY_FORCE, + DPOLICY_FSTRIM, + DPOLICY_UMOUNT, + MAX_DPOLICY, +}; + +enum { + DPOLICY_IO_AWARE_DISABLE, /* force to not be aware of IO */ + DPOLICY_IO_AWARE_ENABLE, /* force to be aware of IO */ + DPOLICY_IO_AWARE_MAX, +}; + +struct discard_policy { + int type; /* type of discard */ + unsigned int min_interval; /* used for candidates exist */ + unsigned int mid_interval; /* used for device busy */ + unsigned int max_interval; /* used for candidates not exist */ + unsigned int max_requests; /* # of discards issued per round */ + unsigned int 
io_aware_gran; /* minimum granularity discard not be aware of I/O */ + bool io_aware; /* issue discard in idle time */ + bool sync; /* submit discard with REQ_SYNC flag */ + bool ordered; /* issue discard by lba order */ + bool timeout; /* discard timeout for put_super */ + unsigned int granularity; /* discard granularity */ +}; + +struct discard_cmd_control { + struct task_struct *f2fs_issue_discard; /* discard thread */ + struct list_head entry_list; /* 4KB discard entry list */ + struct list_head pend_list[MAX_PLIST_NUM];/* store pending entries */ + struct list_head wait_list; /* store on-flushing entries */ + struct list_head fstrim_list; /* in-flight discard from fstrim */ + wait_queue_head_t discard_wait_queue; /* waiting queue for wake-up */ + struct mutex cmd_lock; + unsigned int nr_discards; /* # of discards in the list */ + unsigned int max_discards; /* max. discards to be issued */ + unsigned int max_discard_request; /* max. discard request per round */ + unsigned int min_discard_issue_time; /* min. interval between discard issue */ + unsigned int mid_discard_issue_time; /* mid. interval between discard issue */ + unsigned int max_discard_issue_time; /* max. 
interval between discard issue */ + unsigned int discard_io_aware_gran; /* minimum discard granularity not be aware of I/O */ + unsigned int discard_urgent_util; /* utilization which issue discard proactively */ + unsigned int discard_granularity; /* discard granularity */ + unsigned int max_ordered_discard; /* maximum discard granularity issued by lba order */ + unsigned int discard_io_aware; /* io_aware policy */ + unsigned int undiscard_blks; /* # of undiscard blocks */ + unsigned int next_pos; /* next discard position */ + atomic_t issued_discard; /* # of issued discard */ + atomic_t queued_discard; /* # of queued discard */ + atomic_t discard_cmd_cnt; /* # of cached cmd count */ + struct rb_root_cached root; /* root of discard rb-tree */ + bool rbtree_check; /* config for consistence check */ + bool discard_wake; /* to wake up discard thread */ +}; + /* for the list of fsync inodes, used only during recovery */ struct fsync_inode_entry { struct list_head list; /* list head */ struct inode *inode; /* vfs inode pointer */ - block_t blkaddr; /* block address locating the last inode */ + block_t blkaddr; /* block address locating the last fsync */ + block_t last_dentry; /* block address locating the last dentry */ }; -#define nats_in_cursum(sum) (le16_to_cpu(sum->n_nats)) -#define sits_in_cursum(sum) (le16_to_cpu(sum->n_sits)) +#define nats_in_cursum(jnl) (le16_to_cpu((jnl)->n_nats)) +#define sits_in_cursum(jnl) (le16_to_cpu((jnl)->n_sits)) -#define nat_in_journal(sum, i) (sum->nat_j.entries[i].ne) -#define nid_in_journal(sum, i) (sum->nat_j.entries[i].nid) -#define sit_in_journal(sum, i) (sum->sit_j.entries[i].se) -#define segno_in_journal(sum, i) (sum->sit_j.entries[i].segno) +#define nat_in_journal(jnl, i) ((jnl)->nat_j.entries[i].ne) +#define nid_in_journal(jnl, i) ((jnl)->nat_j.entries[i].nid) +#define sit_in_journal(jnl, i) ((jnl)->sit_j.entries[i].se) +#define segno_in_journal(jnl, i) ((jnl)->sit_j.entries[i].segno) -static inline int 
update_nats_in_cursum(struct f2fs_summary_block *rs, int i) +#define MAX_NAT_JENTRIES(jnl) (NAT_JOURNAL_ENTRIES - nats_in_cursum(jnl)) +#define MAX_SIT_JENTRIES(jnl) (SIT_JOURNAL_ENTRIES - sits_in_cursum(jnl)) + +static inline int update_nats_in_cursum(struct f2fs_journal *journal, int i) { - int before = nats_in_cursum(rs); - rs->n_nats = cpu_to_le16(before + i); + int before = nats_in_cursum(journal); + + journal->n_nats = cpu_to_le16(before + i); return before; } -static inline int update_sits_in_cursum(struct f2fs_summary_block *rs, int i) +static inline int update_sits_in_cursum(struct f2fs_journal *journal, int i) { - int before = sits_in_cursum(rs); - rs->n_sits = cpu_to_le16(before + i); + int before = sits_in_cursum(journal); + + journal->n_sits = cpu_to_le16(before + i); return before; } -/* - * ioctl commands - */ -#define F2FS_IOC_GETFLAGS FS_IOC_GETFLAGS -#define F2FS_IOC_SETFLAGS FS_IOC_SETFLAGS +static inline bool __has_cursum_space(struct f2fs_journal *journal, + int size, int type) +{ + if (type == NAT_JOURNAL) + return size <= MAX_NAT_JENTRIES(journal); + return size <= MAX_SIT_JENTRIES(journal); +} + +/* for inline stuff */ +#define DEF_INLINE_RESERVED_SIZE 1 +static inline int get_extra_isize(struct inode *inode); +static inline int get_inline_xattr_addrs(struct inode *inode); +#define MAX_INLINE_DATA(inode) (sizeof(__le32) * \ + (CUR_ADDRS_PER_INODE(inode) - \ + get_inline_xattr_addrs(inode) - \ + DEF_INLINE_RESERVED_SIZE)) + +/* for inline dir */ +#define NR_INLINE_DENTRY(inode) (MAX_INLINE_DATA(inode) * BITS_PER_BYTE / \ + ((SIZE_OF_DIR_ENTRY + F2FS_SLOT_LEN) * \ + BITS_PER_BYTE + 1)) +#define INLINE_DENTRY_BITMAP_SIZE(inode) \ + DIV_ROUND_UP(NR_INLINE_DENTRY(inode), BITS_PER_BYTE) +#define INLINE_RESERVED_SIZE(inode) (MAX_INLINE_DATA(inode) - \ + ((SIZE_OF_DIR_ENTRY + F2FS_SLOT_LEN) * \ + NR_INLINE_DENTRY(inode) + \ + INLINE_DENTRY_BITMAP_SIZE(inode))) -#if defined(__KERNEL__) && defined(CONFIG_COMPAT) /* - * ioctl commands in 32 bit 
emulation + * For INODE and NODE manager */ -#define F2FS_IOC32_GETFLAGS FS_IOC32_GETFLAGS -#define F2FS_IOC32_SETFLAGS FS_IOC32_SETFLAGS +/* for directory operations */ + +struct f2fs_filename { + /* + * The filename the user specified. This is NULL for some + * filesystem-internal operations, e.g. converting an inline directory + * to a non-inline one, or roll-forward recovering an encrypted dentry. + */ + const struct qstr *usr_fname; + + /* + * The on-disk filename. For encrypted directories, this is encrypted. + * This may be NULL for lookups in an encrypted dir without the key. + */ + struct fscrypt_str disk_name; + + /* The dirhash of this filename */ + f2fs_hash_t hash; + +#ifdef CONFIG_FS_ENCRYPTION + /* + * For lookups in encrypted directories: either the buffer backing + * disk_name, or a buffer that holds the decoded no-key name. + */ + struct fscrypt_str crypto_buf; +#endif +#if IS_ENABLED(CONFIG_UNICODE) + /* + * For casefolded directories: the casefolded name, but it's left NULL + * if the original name is not valid Unicode, if the original name is + * "." or "..", if the directory is both casefolded and encrypted and + * its encryption key is unavailable, or if the filesystem is doing an + * internal operation where usr_fname is also NULL. In all these cases + * we fall back to treating the name as an opaque byte sequence. 
+ */ + struct qstr cf_name; #endif +}; + +struct f2fs_dentry_ptr { + struct inode *inode; + void *bitmap; + struct f2fs_dir_entry *dentry; + __u8 (*filename)[F2FS_SLOT_LEN]; + int max; + int nr_bitmap; +}; + +static inline void make_dentry_ptr_block(struct inode *inode, + struct f2fs_dentry_ptr *d, struct f2fs_dentry_block *t) +{ + d->inode = inode; + d->max = NR_DENTRY_IN_BLOCK; + d->nr_bitmap = SIZE_OF_DENTRY_BITMAP; + d->bitmap = t->dentry_bitmap; + d->dentry = t->dentry; + d->filename = t->filename; +} + +static inline void make_dentry_ptr_inline(struct inode *inode, + struct f2fs_dentry_ptr *d, void *t) +{ + int entry_cnt = NR_INLINE_DENTRY(inode); + int bitmap_size = INLINE_DENTRY_BITMAP_SIZE(inode); + int reserved_size = INLINE_RESERVED_SIZE(inode); + + d->inode = inode; + d->max = entry_cnt; + d->nr_bitmap = bitmap_size; + d->bitmap = t; + d->dentry = t + bitmap_size + reserved_size; + d->filename = t + bitmap_size + reserved_size + + SIZE_OF_DIR_ENTRY * entry_cnt; +} /* - * For INODE and NODE manager + * XATTR_NODE_OFFSET stores xattrs to one node block per file keeping -1 + * as its node offset to distinguish from index node blocks. + * But some bits are used to mark the node block. */ -#define XATTR_NODE_OFFSET (-1) /* - * store xattrs to one node block per - * file keeping -1 as its node offset to - * distinguish from index node blocks. - */ +#define XATTR_NODE_OFFSET ((((unsigned int)-1) << OFFSET_BIT_SHIFT) \ + >> OFFSET_BIT_SHIFT) enum { ALLOC_NODE, /* allocate a new node page if needed */ LOOKUP_NODE, /* look up a node without readahead */ LOOKUP_NODE_RA, /* * look up a node with readahead called - * by get_datablock_ro. + * by get_data_block. 
*/ }; -#define F2FS_LINK_MAX 32000 /* maximum link count per file */ +#define DEFAULT_RETRY_IO_COUNT 8 /* maximum retry read IO or flush count */ + +/* IO/non-IO congestion wait timeout value, default: 1ms */ +#define DEFAULT_SCHEDULE_TIMEOUT (msecs_to_jiffies(1)) + +/* timeout value injected, default: 1000ms */ +#define DEFAULT_FAULT_TIMEOUT (msecs_to_jiffies(1000)) + +/* maximum retry quota flush count */ +#define DEFAULT_RETRY_QUOTA_FLUSH_COUNT 8 + +/* maximum retry of EIO'ed page */ +#define MAX_RETRY_PAGE_EIO 100 + +#define F2FS_LINK_MAX 0xffffffff /* maximum link count per file */ + +#define MAX_DIR_RA_PAGES 4 /* maximum ra pages of dir */ + +/* dirty segments threshold for triggering CP */ +#define DEFAULT_DIRTY_THRESHOLD 4 + +#define RECOVERY_MAX_RA_BLOCKS BIO_MAX_VECS +#define RECOVERY_MIN_RA_BLOCKS 1 + +#define F2FS_ONSTACK_PAGES 16 /* nr of onstack pages */ /* for in-memory extent cache entry */ +#define F2FS_MIN_EXTENT_LEN 64 /* minimum extent length */ + +/* number of extent info in extent cache we try to shrink */ +#define READ_EXTENT_CACHE_SHRINK_NUMBER 128 + +/* number of age extent info in extent cache we try to shrink */ +#define AGE_EXTENT_CACHE_SHRINK_NUMBER 128 +#define LAST_AGE_WEIGHT 30 +#define SAME_AGE_REGION 1024 + +/* + * Define data block with age less than 1GB as hot data + * define data block with age less than 10GB but more than 1GB as warm data + */ +#define DEF_HOT_DATA_AGE_THRESHOLD 262144 +#define DEF_WARM_DATA_AGE_THRESHOLD 2621440 + +/* default max read extent count per inode */ +#define DEF_MAX_READ_EXTENT_COUNT 10240 + +/* extent cache type */ +enum extent_type { + EX_READ, + EX_BLOCK_AGE, + NR_EXTENT_CACHES, +}; + +/* + * Reserved value to mark invalid age extents, hence valid block range + * from 0 to ULLONG_MAX-1 + */ +#define F2FS_EXTENT_AGE_INVALID ULLONG_MAX + struct extent_info { - rwlock_t ext_lock; /* rwlock for consistency */ - unsigned int fofs; /* start offset in a file */ - u32 blk_addr; /* start block address of 
the extent */ - unsigned int len; /* length of the extent */ + unsigned int fofs; /* start offset in a file */ + unsigned int len; /* length of the extent */ + union { + /* read extent_cache */ + struct { + /* start block address of the extent */ + block_t blk; +#ifdef CONFIG_F2FS_FS_COMPRESSION + /* physical extent length of compressed blocks */ + unsigned int c_len; +#endif + }; + /* block age extent_cache */ + struct { + /* block age of the extent */ + unsigned long long age; + /* last total blocks allocated */ + unsigned long long last_blocks; + }; + }; +}; + +struct extent_node { + struct rb_node rb_node; /* rb node located in rb-tree */ + struct extent_info ei; /* extent info */ + struct list_head list; /* node in global extent list of sbi */ + struct extent_tree *et; /* extent tree pointer */ +}; + +struct extent_tree { + nid_t ino; /* inode number */ + enum extent_type type; /* keep the extent tree type */ + struct rb_root_cached root; /* root of extent info rb-tree */ + struct extent_node *cached_en; /* recently accessed extent node */ + struct list_head list; /* to be used by sbi->zombie_list */ + rwlock_t lock; /* protect extent info rb-tree */ + atomic_t node_cnt; /* # of extent node in rb-tree*/ + bool largest_updated; /* largest extent updated */ + struct extent_info largest; /* largest cached extent for EX_READ */ +}; + +struct extent_tree_info { + struct radix_tree_root extent_tree_root;/* cache extent cache entries */ + struct mutex extent_tree_lock; /* locking extent radix tree */ + struct list_head extent_list; /* lru list for shrinker */ + spinlock_t extent_lock; /* locking extent lru list */ + atomic_t total_ext_tree; /* extent tree count */ + struct list_head zombie_list; /* extent zombie tree list */ + atomic_t total_zombie_tree; /* extent zombie tree count */ + atomic_t total_ext_node; /* extent info count */ +}; + +/* + * State of block returned by f2fs_map_blocks. 
+ */ +#define F2FS_MAP_NEW (1U << 0) +#define F2FS_MAP_MAPPED (1U << 1) +#define F2FS_MAP_DELALLOC (1U << 2) +#define F2FS_MAP_FLAGS (F2FS_MAP_NEW | F2FS_MAP_MAPPED |\ + F2FS_MAP_DELALLOC) + +struct f2fs_map_blocks { + struct block_device *m_bdev; /* for multi-device dio */ + block_t m_pblk; + block_t m_lblk; + unsigned int m_len; + unsigned int m_flags; + unsigned long m_last_pblk; /* last allocated block, only used for DIO in LFS mode */ + pgoff_t *m_next_pgofs; /* point next possible non-hole pgofs */ + pgoff_t *m_next_extent; /* point to next possible extent */ + int m_seg_type; + bool m_may_create; /* indicate it is from write path */ + bool m_multidev_dio; /* indicate it allows multi-device dio */ +}; + +/* for flag in get_data_block */ +enum { + F2FS_GET_BLOCK_DEFAULT, + F2FS_GET_BLOCK_FIEMAP, + F2FS_GET_BLOCK_BMAP, + F2FS_GET_BLOCK_DIO, + F2FS_GET_BLOCK_PRE_DIO, + F2FS_GET_BLOCK_PRE_AIO, + F2FS_GET_BLOCK_PRECACHE, }; /* @@ -163,64 +819,256 @@ struct extent_info { */ #define FADVISE_COLD_BIT 0x01 #define FADVISE_LOST_PINO_BIT 0x02 +#define FADVISE_ENCRYPT_BIT 0x04 +#define FADVISE_ENC_NAME_BIT 0x08 +#define FADVISE_KEEP_SIZE_BIT 0x10 +#define FADVISE_HOT_BIT 0x20 +#define FADVISE_VERITY_BIT 0x40 +#define FADVISE_TRUNC_BIT 0x80 + +#define FADVISE_MODIFIABLE_BITS (FADVISE_COLD_BIT | FADVISE_HOT_BIT) + +#define file_is_cold(inode) is_file(inode, FADVISE_COLD_BIT) +#define file_set_cold(inode) set_file(inode, FADVISE_COLD_BIT) +#define file_clear_cold(inode) clear_file(inode, FADVISE_COLD_BIT) + +#define file_wrong_pino(inode) is_file(inode, FADVISE_LOST_PINO_BIT) +#define file_lost_pino(inode) set_file(inode, FADVISE_LOST_PINO_BIT) +#define file_got_pino(inode) clear_file(inode, FADVISE_LOST_PINO_BIT) + +#define file_is_encrypt(inode) is_file(inode, FADVISE_ENCRYPT_BIT) +#define file_set_encrypt(inode) set_file(inode, FADVISE_ENCRYPT_BIT) + +#define file_enc_name(inode) is_file(inode, FADVISE_ENC_NAME_BIT) +#define file_set_enc_name(inode) set_file(inode, 
FADVISE_ENC_NAME_BIT) + +#define file_keep_isize(inode) is_file(inode, FADVISE_KEEP_SIZE_BIT) +#define file_set_keep_isize(inode) set_file(inode, FADVISE_KEEP_SIZE_BIT) + +#define file_is_hot(inode) is_file(inode, FADVISE_HOT_BIT) +#define file_set_hot(inode) set_file(inode, FADVISE_HOT_BIT) +#define file_clear_hot(inode) clear_file(inode, FADVISE_HOT_BIT) + +#define file_is_verity(inode) is_file(inode, FADVISE_VERITY_BIT) +#define file_set_verity(inode) set_file(inode, FADVISE_VERITY_BIT) + +#define file_should_truncate(inode) is_file(inode, FADVISE_TRUNC_BIT) +#define file_need_truncate(inode) set_file(inode, FADVISE_TRUNC_BIT) +#define file_dont_truncate(inode) clear_file(inode, FADVISE_TRUNC_BIT) + +#define DEF_DIR_LEVEL 0 + +/* used for f2fs_inode_info->flags */ +enum { + FI_NEW_INODE, /* indicate newly allocated inode */ + FI_DIRTY_INODE, /* indicate inode is dirty or not */ + FI_AUTO_RECOVER, /* indicate inode is recoverable */ + FI_DIRTY_DIR, /* indicate directory has dirty pages */ + FI_INC_LINK, /* need to increment i_nlink */ + FI_ACL_MODE, /* indicate acl mode */ + FI_NO_ALLOC, /* should not allocate any blocks */ + FI_FREE_NID, /* free allocated nide */ + FI_NO_EXTENT, /* not to use the extent cache */ + FI_INLINE_XATTR, /* used for inline xattr */ + FI_INLINE_DATA, /* used for inline data*/ + FI_INLINE_DENTRY, /* used for inline dentry */ + FI_APPEND_WRITE, /* inode has appended data */ + FI_UPDATE_WRITE, /* inode has in-place-update data */ + FI_NEED_IPU, /* used for ipu per file */ + FI_ATOMIC_FILE, /* indicate atomic file */ + FI_DATA_EXIST, /* indicate data exists */ + FI_SKIP_WRITES, /* should skip data page writeback */ + FI_OPU_WRITE, /* used for opu per file */ + FI_DIRTY_FILE, /* indicate regular/symlink has dirty pages */ + FI_PREALLOCATED_ALL, /* all blocks for write were preallocated */ + FI_HOT_DATA, /* indicate file is hot */ + FI_EXTRA_ATTR, /* indicate file has extra attribute */ + FI_PROJ_INHERIT, /* indicate file inherits projectid 
*/ + FI_PIN_FILE, /* indicate file should not be gced */ + FI_VERITY_IN_PROGRESS, /* building fs-verity Merkle tree */ + FI_COMPRESSED_FILE, /* indicate file's data can be compressed */ + FI_COMPRESS_CORRUPT, /* indicate compressed cluster is corrupted */ + FI_MMAP_FILE, /* indicate file was mmapped */ + FI_ENABLE_COMPRESS, /* enable compression in "user" compression mode */ + FI_COMPRESS_RELEASED, /* compressed blocks were released */ + FI_ALIGNED_WRITE, /* enable aligned write */ + FI_COW_FILE, /* indicate COW file */ + FI_ATOMIC_COMMITTED, /* indicate atomic commit completed except disk sync */ + FI_ATOMIC_DIRTIED, /* indicate atomic file is dirtied */ + FI_ATOMIC_REPLACE, /* indicate atomic replace */ + FI_OPENED_FILE, /* indicate file has been opened */ + FI_DONATE_FINISHED, /* indicate page donation of file has been finished */ + FI_MAX, /* max flag, never be used */ +}; struct f2fs_inode_info { struct inode vfs_inode; /* serve a vfs inode */ unsigned long i_flags; /* keep an inode flags for ioctl */ unsigned char i_advise; /* use to give file attribute hints */ - unsigned int i_current_depth; /* use only in directory structure */ + unsigned char i_dir_level; /* use for dentry level for large dir */ + union { + unsigned int i_current_depth; /* only for directory depth */ + unsigned short i_gc_failures; /* for gc failure statistic */ + }; unsigned int i_pino; /* parent inode number */ umode_t i_acl_mode; /* keep file acl mode temporarily */ /* Use below internally in f2fs*/ - unsigned long flags; /* use to pass per-file flags */ - atomic_t dirty_dents; /* # of dirty dentry pages */ + unsigned long flags[BITS_TO_LONGS(FI_MAX)]; /* use to pass per-file flags */ + unsigned int ioprio_hint; /* hint for IO priority */ + struct f2fs_rwsem i_sem; /* protect fi info */ + atomic_t dirty_pages; /* # of dirty pages */ f2fs_hash_t chash; /* hash value of given file name */ unsigned int clevel; /* maximum level of given file name */ + struct task_struct *task; /* lookup 
and create consistency */ + struct task_struct *cp_task; /* separate cp/wb IO stats*/ + struct task_struct *wb_task; /* indicate inode is in context of writeback */ nid_t i_xattr_nid; /* node id that contains xattrs */ - struct extent_info ext; /* in-memory extent cache entry */ + loff_t last_disk_size; /* lastly written file size */ + spinlock_t i_size_lock; /* protect last_disk_size */ + +#ifdef CONFIG_QUOTA + struct dquot __rcu *i_dquot[MAXQUOTAS]; + + /* quota space reservation, managed internally by quota code */ + qsize_t i_reserved_quota; +#endif + struct list_head dirty_list; /* dirty list for dirs and files */ + struct list_head gdirty_list; /* linked in global dirty list */ + + /* linked in global inode list for cache donation */ + struct list_head gdonate_list; + pgoff_t donate_start, donate_end; /* inclusive */ + atomic_t open_count; /* # of open files */ + + struct task_struct *atomic_write_task; /* store atomic write task */ + struct extent_tree *extent_tree[NR_EXTENT_CACHES]; + /* cached extent_tree entry */ + union { + struct inode *cow_inode; /* copy-on-write inode for atomic write */ + struct inode *atomic_inode; + /* point to atomic_inode, available only for cow_inode */ + }; + + /* avoid racing between foreground op and gc */ + struct f2fs_rwsem i_gc_rwsem[2]; + struct f2fs_rwsem i_xattr_sem; /* avoid racing between reading and changing EAs */ + + int i_extra_isize; /* size of extra space located in i_addr */ + kprojid_t i_projid; /* id for project quota */ + int i_inline_xattr_size; /* inline xattr size */ + struct timespec64 i_crtime; /* inode creation time */ + struct timespec64 i_disk_time[3];/* inode disk times */ + + /* for file compress */ + atomic_t i_compr_blocks; /* # of compressed blocks */ + unsigned char i_compress_algorithm; /* algorithm type */ + unsigned char i_log_cluster_size; /* log of cluster size */ + unsigned char i_compress_level; /* compress level (lz4hc,zstd) */ + unsigned char i_compress_flag; /* compress flag */ + 
unsigned int i_cluster_size; /* cluster size */ + atomic_t writeback; /* count # of writeback thread */ + + unsigned int atomic_write_cnt; + loff_t original_i_size; /* original i_size before atomic write */ +#ifdef CONFIG_FS_ENCRYPTION + struct fscrypt_inode_info *i_crypt_info; /* filesystem encryption info */ +#endif +#ifdef CONFIG_FS_VERITY + struct fsverity_info *i_verity_info; /* filesystem verity info */ +#endif }; -static inline void get_extent_info(struct extent_info *ext, - struct f2fs_extent i_ext) +static inline void get_read_extent_info(struct extent_info *ext, + struct f2fs_extent *i_ext) { - write_lock(&ext->ext_lock); - ext->fofs = le32_to_cpu(i_ext.fofs); - ext->blk_addr = le32_to_cpu(i_ext.blk_addr); - ext->len = le32_to_cpu(i_ext.len); - write_unlock(&ext->ext_lock); + ext->fofs = le32_to_cpu(i_ext->fofs); + ext->blk = le32_to_cpu(i_ext->blk); + ext->len = le32_to_cpu(i_ext->len); } -static inline void set_raw_extent(struct extent_info *ext, +static inline void set_raw_read_extent(struct extent_info *ext, struct f2fs_extent *i_ext) { - read_lock(&ext->ext_lock); i_ext->fofs = cpu_to_le32(ext->fofs); - i_ext->blk_addr = cpu_to_le32(ext->blk_addr); + i_ext->blk = cpu_to_le32(ext->blk); i_ext->len = cpu_to_le32(ext->len); - read_unlock(&ext->ext_lock); } +static inline bool __is_discard_mergeable(struct discard_info *back, + struct discard_info *front, unsigned int max_len) +{ + return (back->lstart + back->len == front->lstart) && + (back->len + front->len <= max_len); +} + +static inline bool __is_discard_back_mergeable(struct discard_info *cur, + struct discard_info *back, unsigned int max_len) +{ + return __is_discard_mergeable(back, cur, max_len); +} + +static inline bool __is_discard_front_mergeable(struct discard_info *cur, + struct discard_info *front, unsigned int max_len) +{ + return __is_discard_mergeable(cur, front, max_len); +} + +/* + * For free nid management + */ +enum nid_state { + FREE_NID, /* newly added to free nid list */ + 
PREALLOC_NID, /* it is preallocated */ + MAX_NID_STATE, +}; + +enum nat_state { + TOTAL_NAT, + DIRTY_NAT, + RECLAIMABLE_NAT, + MAX_NAT_STATE, +}; + struct f2fs_nm_info { block_t nat_blkaddr; /* base disk address of NAT */ nid_t max_nid; /* maximum possible node ids */ + nid_t available_nids; /* # of available node ids */ nid_t next_scan_nid; /* the next nid to be scanned */ + nid_t max_rf_node_blocks; /* max # of nodes for recovery */ + unsigned int ram_thresh; /* control the memory footprint */ + unsigned int ra_nid_pages; /* # of nid pages to be readaheaded */ + unsigned int dirty_nats_ratio; /* control dirty nats ratio threshold */ /* NAT cache management */ struct radix_tree_root nat_root;/* root of the nat entry cache */ - rwlock_t nat_tree_lock; /* protect nat_tree_lock */ - unsigned int nat_cnt; /* the # of cached nat entries */ + struct radix_tree_root nat_set_root;/* root of the nat set cache */ + struct f2fs_rwsem nat_tree_lock; /* protect nat entry tree */ struct list_head nat_entries; /* cached nat entry list (clean) */ - struct list_head dirty_nat_entries; /* cached nat entry list (dirty) */ + spinlock_t nat_list_lock; /* protect clean nat entry list */ + unsigned int nat_cnt[MAX_NAT_STATE]; /* the # of cached nat entries */ + unsigned int nat_blocks; /* # of nat blocks */ /* free node ids management */ - struct list_head free_nid_list; /* a list for free nids */ - spinlock_t free_nid_list_lock; /* protect free nid list */ - unsigned int fcnt; /* the number of free node id */ + struct radix_tree_root free_nid_root;/* root of the free_nid cache */ + struct list_head free_nid_list; /* list for free nids excluding preallocated nids */ + unsigned int nid_cnt[MAX_NID_STATE]; /* the number of free node id */ + spinlock_t nid_list_lock; /* protect nid lists ops */ struct mutex build_lock; /* lock for build free nids */ + unsigned char **free_nid_bitmap; + unsigned char *nat_block_bitmap; + unsigned short *free_nid_count; /* free nid count of NAT block */ /* 
for checkpoint */ char *nat_bitmap; /* NAT bitmap pointer */ + + unsigned int nat_bits_blocks; /* # of nat bits blocks */ + unsigned char *nat_bits; /* NAT bits blocks */ + unsigned char *full_nat_bits; /* full NAT pages */ + unsigned char *empty_nat_bits; /* empty NAT pages */ +#ifdef CONFIG_F2FS_CHECK_FS + char *nat_bitmap_mir; /* NAT bitmap mirror */ +#endif int bitmap_size; /* bitmap size */ }; @@ -231,21 +1079,24 @@ struct f2fs_nm_info { */ struct dnode_of_data { struct inode *inode; /* vfs inode pointer */ - struct page *inode_page; /* its inode page, NULL is possible */ - struct page *node_page; /* cached direct node page */ + struct folio *inode_folio; /* its inode folio, NULL is possible */ + struct folio *node_folio; /* cached direct node folio */ nid_t nid; /* node id of the direct node block */ unsigned int ofs_in_node; /* data offset in the node page */ - bool inode_page_locked; /* inode page is locked or not */ + bool inode_folio_locked; /* inode folio is locked or not */ + bool node_changed; /* is node block changed */ + char cur_level; /* level of hole node page */ + char max_level; /* level of current page located */ block_t data_blkaddr; /* block address of the node block */ }; static inline void set_new_dnode(struct dnode_of_data *dn, struct inode *inode, - struct page *ipage, struct page *npage, nid_t nid) + struct folio *ifolio, struct folio *nfolio, nid_t nid) { memset(dn, 0, sizeof(*dn)); dn->inode = inode; - dn->inode_page = ipage; - dn->node_page = npage; + dn->inode_folio = ifolio; + dn->node_folio = nfolio; dn->nid = nid; } @@ -264,16 +1115,39 @@ static inline void set_new_dnode(struct dnode_of_data *dn, struct inode *inode, */ #define NR_CURSEG_DATA_TYPE (3) #define NR_CURSEG_NODE_TYPE (3) -#define NR_CURSEG_TYPE (NR_CURSEG_DATA_TYPE + NR_CURSEG_NODE_TYPE) +#define NR_CURSEG_INMEM_TYPE (2) +#define NR_CURSEG_RO_TYPE (2) +#define NR_CURSEG_PERSIST_TYPE (NR_CURSEG_DATA_TYPE + NR_CURSEG_NODE_TYPE) +#define NR_CURSEG_TYPE 
(NR_CURSEG_INMEM_TYPE + NR_CURSEG_PERSIST_TYPE) -enum { +enum log_type { CURSEG_HOT_DATA = 0, /* directory entry blocks */ CURSEG_WARM_DATA, /* data blocks */ CURSEG_COLD_DATA, /* multimedia or GCed data blocks */ CURSEG_HOT_NODE, /* direct node blocks of directory files */ CURSEG_WARM_NODE, /* direct node blocks of normal files */ CURSEG_COLD_NODE, /* indirect node blocks */ - NO_CHECK_TYPE + NR_PERSISTENT_LOG, /* number of persistent log */ + CURSEG_COLD_DATA_PINNED = NR_PERSISTENT_LOG, + /* pinned file that needs consecutive block address */ + CURSEG_ALL_DATA_ATGC, /* SSR alloctor in hot/warm/cold data area */ + NO_CHECK_TYPE, /* number of persistent & inmem log */ +}; + +struct flush_cmd { + struct completion wait; + struct llist_node llnode; + nid_t ino; + int ret; +}; + +struct flush_cmd_control { + struct task_struct *f2fs_issue_flush; /* flush thread */ + wait_queue_head_t flush_wait_queue; /* waiting queue for wake-up */ + atomic_t issued_flush; /* # of issued flushes */ + atomic_t queued_flush; /* # of queued flushes */ + struct llist_head issue_list; /* list for command issue */ + struct llist_node *dispatch_list; /* list for command dispatch */ }; struct f2fs_sm_info { @@ -282,8 +1156,7 @@ struct f2fs_sm_info { struct dirty_seglist_info *dirty_info; /* dirty segment information */ struct curseg_info *curseg_array; /* active segment information */ - struct list_head wblist_head; /* list of under-writeback pages */ - spinlock_t wblist_lock; /* lock for checkpoint */ + struct f2fs_rwsem curseg_lock; /* for preventing curseg change */ block_t seg0_blkaddr; /* block address of 0'th segment */ block_t main_blkaddr; /* start block address of main area */ @@ -293,16 +1166,25 @@ struct f2fs_sm_info { unsigned int main_segments; /* # of segments in main area */ unsigned int reserved_segments; /* # of reserved segments */ unsigned int ovp_segments; /* # of overprovision segments */ -}; -/* - * For directory operation - */ -#define NODE_DIR1_BLOCK (ADDRS_PER_INODE 
+ 1) -#define NODE_DIR2_BLOCK (ADDRS_PER_INODE + 2) -#define NODE_IND1_BLOCK (ADDRS_PER_INODE + 3) -#define NODE_IND2_BLOCK (ADDRS_PER_INODE + 4) -#define NODE_DIND_BLOCK (ADDRS_PER_INODE + 5) + /* a threshold to reclaim prefree segments */ + unsigned int rec_prefree_segments; + + struct list_head sit_entry_set; /* sit entry set list */ + + unsigned int ipu_policy; /* in-place-update policy */ + unsigned int min_ipu_util; /* in-place-update threshold */ + unsigned int min_fsync_blocks; /* threshold for fsync */ + unsigned int min_seq_blocks; /* threshold for sequential blocks */ + unsigned int min_hot_blocks; /* threshold for hot block allocation */ + unsigned int min_ssr_sections; /* threshold to trigger SSR allocation */ + + /* for flush command control */ + struct flush_cmd_control *fcc_info; + + /* for discard command control */ + struct discard_cmd_control *dcc_info; +}; /* * For superblock @@ -313,24 +1195,27 @@ struct f2fs_sm_info { * f2fs monitors the number of several block types such as on-writeback, * dirty dentry blocks, dirty node blocks, and dirty meta blocks. */ +#define WB_DATA_TYPE(folio, f) \ + (f || f2fs_is_cp_guaranteed(folio) ? F2FS_WB_CP_DATA : F2FS_WB_DATA) enum count_type { - F2FS_WRITEBACK, F2FS_DIRTY_DENTS, + F2FS_DIRTY_DATA, + F2FS_DIRTY_QDATA, F2FS_DIRTY_NODES, F2FS_DIRTY_META, + F2FS_DIRTY_IMETA, + F2FS_WB_CP_DATA, + F2FS_WB_DATA, + F2FS_RD_DATA, + F2FS_RD_NODE, + F2FS_RD_META, + F2FS_DIO_WRITE, + F2FS_DIO_READ, NR_COUNT_TYPE, }; /* - * Uses as sbi->fs_lock[NR_GLOBAL_LOCKS]. - * The checkpoint procedure blocks all the locks in this fs_lock array. - * Some FS operations grab free locks, and if there is no free lock, - * then wait to grab a lock in a round-robin manner. - */ -#define NR_GLOBAL_LOCKS 8 - -/* - * The below are the page types of bios used in submti_bio(). + * The below are the page types of bios used in submit_bio(). * The available types are: * DATA User data pages. It operates as async mode. * NODE Node pages. 
It operates as async mode. @@ -340,19 +1225,468 @@ enum count_type { * with waiting the bio's completion * ... Only can be used with META. */ +#define PAGE_TYPE_OF_BIO(type) ((type) > META ? META : (type)) +#define PAGE_TYPE_ON_MAIN(type) ((type) == DATA || (type) == NODE) enum page_type { - DATA, - NODE, + DATA = 0, + NODE = 1, /* should not change this */ META, NR_PAGE_TYPE, META_FLUSH, + IPU, /* the below types are used by tracepoints only. */ + OPU, +}; + +enum temp_type { + HOT = 0, /* must be zero for meta bio */ + WARM, + COLD, + NR_TEMP_TYPE, +}; + +enum need_lock_type { + LOCK_REQ = 0, + LOCK_DONE, + LOCK_RETRY, +}; + +enum cp_reason_type { + CP_NO_NEEDED, + CP_NON_REGULAR, + CP_COMPRESSED, + CP_HARDLINK, + CP_SB_NEED_CP, + CP_WRONG_PINO, + CP_NO_SPC_ROLL, + CP_NODE_NEED_CP, + CP_FASTBOOT_MODE, + CP_SPEC_LOG_NUM, + CP_RECOVER_DIR, + CP_XATTR_DIR, +}; + +enum iostat_type { + /* WRITE IO */ + APP_DIRECT_IO, /* app direct write IOs */ + APP_BUFFERED_IO, /* app buffered write IOs */ + APP_WRITE_IO, /* app write IOs */ + APP_MAPPED_IO, /* app mapped IOs */ + APP_BUFFERED_CDATA_IO, /* app buffered write IOs on compressed file */ + APP_MAPPED_CDATA_IO, /* app mapped write IOs on compressed file */ + FS_DATA_IO, /* data IOs from kworker/fsync/reclaimer */ + FS_CDATA_IO, /* data IOs from kworker/fsync/reclaimer on compressed file */ + FS_NODE_IO, /* node IOs from kworker/fsync/reclaimer */ + FS_META_IO, /* meta IOs from kworker/reclaimer */ + FS_GC_DATA_IO, /* data IOs from forground gc */ + FS_GC_NODE_IO, /* node IOs from forground gc */ + FS_CP_DATA_IO, /* data IOs from checkpoint */ + FS_CP_NODE_IO, /* node IOs from checkpoint */ + FS_CP_META_IO, /* meta IOs from checkpoint */ + + /* READ IO */ + APP_DIRECT_READ_IO, /* app direct read IOs */ + APP_BUFFERED_READ_IO, /* app buffered read IOs */ + APP_READ_IO, /* app read IOs */ + APP_MAPPED_READ_IO, /* app mapped read IOs */ + APP_BUFFERED_CDATA_READ_IO, /* app buffered read IOs on compressed file */ + 
APP_MAPPED_CDATA_READ_IO, /* app mapped read IOs on compressed file */ + FS_DATA_READ_IO, /* data read IOs */ + FS_GDATA_READ_IO, /* data read IOs from background gc */ + FS_CDATA_READ_IO, /* compressed data read IOs */ + FS_NODE_READ_IO, /* node read IOs */ + FS_META_READ_IO, /* meta read IOs */ + + /* other */ + FS_DISCARD_IO, /* discard */ + FS_FLUSH_IO, /* flush */ + FS_ZONE_RESET_IO, /* zone reset */ + NR_IO_TYPE, +}; + +struct f2fs_io_info { + struct f2fs_sb_info *sbi; /* f2fs_sb_info pointer */ + nid_t ino; /* inode number */ + enum page_type type; /* contains DATA/NODE/META/META_FLUSH */ + enum temp_type temp; /* contains HOT/WARM/COLD */ + enum req_op op; /* contains REQ_OP_ */ + blk_opf_t op_flags; /* req_flag_bits */ + block_t new_blkaddr; /* new block address to be written */ + block_t old_blkaddr; /* old block address before Cow */ + union { + struct page *page; /* page to be written */ + struct folio *folio; + }; + struct page *encrypted_page; /* encrypted page */ + struct page *compressed_page; /* compressed page */ + struct list_head list; /* serialize IOs */ + unsigned int compr_blocks; /* # of compressed block addresses */ + unsigned int need_lock:8; /* indicate we need to lock cp_rwsem */ + unsigned int version:8; /* version of the node */ + unsigned int submitted:1; /* indicate IO submission */ + unsigned int in_list:1; /* indicate fio is in io_list */ + unsigned int is_por:1; /* indicate IO is from recovery or not */ + unsigned int encrypted:1; /* indicate file is encrypted */ + unsigned int meta_gc:1; /* require meta inode GC */ + enum iostat_type io_type; /* io type */ + struct writeback_control *io_wbc; /* writeback control */ + struct bio **bio; /* bio for ipu */ + sector_t *last_block; /* last block number in bio */ +}; + +struct bio_entry { + struct bio *bio; + struct list_head list; +}; + +#define is_read_io(rw) ((rw) == READ) +struct f2fs_bio_info { + struct f2fs_sb_info *sbi; /* f2fs superblock */ + struct bio *bio; /* bios to merge */ 
+ sector_t last_block_in_bio; /* last block number */ + struct f2fs_io_info fio; /* store buffered io info. */ +#ifdef CONFIG_BLK_DEV_ZONED + struct completion zone_wait; /* condition value for the previous open zone to close */ + struct bio *zone_pending_bio; /* pending bio for the previous zone */ + void *bi_private; /* previous bi_private for pending bio */ +#endif + struct f2fs_rwsem io_rwsem; /* blocking op for bio */ + spinlock_t io_lock; /* serialize DATA/NODE IOs */ + struct list_head io_list; /* track fios */ + struct list_head bio_list; /* bio entry list head */ + struct f2fs_rwsem bio_list_lock; /* lock to protect bio entry list */ +}; + +#define FDEV(i) (sbi->devs[i]) +#define RDEV(i) (raw_super->devs[i]) +struct f2fs_dev_info { + struct file *bdev_file; + struct block_device *bdev; + char path[MAX_PATH_LEN + 1]; + unsigned int total_segments; + block_t start_blk; + block_t end_blk; +#ifdef CONFIG_BLK_DEV_ZONED + unsigned int nr_blkz; /* Total number of zones */ + unsigned long *blkz_seq; /* Bitmap indicating sequential zones */ +#endif +}; + +enum inode_type { + DIR_INODE, /* for dirty dir inode */ + FILE_INODE, /* for dirty regular/symlink inode */ + DIRTY_META, /* for all dirtied inode metadata */ + DONATE_INODE, /* for all inode to donate pages */ + NR_INODE_TYPE, +}; + +/* for inner inode cache management */ +struct inode_management { + struct radix_tree_root ino_root; /* ino entry array */ + spinlock_t ino_lock; /* for ino entry lock */ + struct list_head ino_list; /* inode list head */ + unsigned long ino_num; /* number of entries */ +}; + +/* for GC_AT */ +struct atgc_management { + bool atgc_enabled; /* ATGC is enabled or not */ + struct rb_root_cached root; /* root of victim rb-tree */ + struct list_head victim_list; /* linked with all victim entries */ + unsigned int victim_count; /* victim count in rb-tree */ + unsigned int candidate_ratio; /* candidate ratio */ + unsigned int max_candidate_count; /* max candidate count */ + unsigned int 
age_weight; /* age weight, vblock_weight = 100 - age_weight */ + unsigned long long age_threshold; /* age threshold */ +}; + +struct f2fs_gc_control { + unsigned int victim_segno; /* target victim segment number */ + int init_gc_type; /* FG_GC or BG_GC */ + bool no_bg_gc; /* check the space and stop bg_gc */ + bool should_migrate_blocks; /* should migrate blocks */ + bool err_gc_skipped; /* return EAGAIN if GC skipped */ + bool one_time; /* require one time GC in one migration unit */ + unsigned int nr_free_secs; /* # of free sections to do GC */ +}; + +/* + * For s_flag in struct f2fs_sb_info + * Modification on enum should be synchronized with s_flag array + */ +enum { + SBI_IS_DIRTY, /* dirty flag for checkpoint */ + SBI_IS_CLOSE, /* specify unmounting */ + SBI_NEED_FSCK, /* need fsck.f2fs to fix */ + SBI_POR_DOING, /* recovery is doing or not */ + SBI_NEED_SB_WRITE, /* need to recover superblock */ + SBI_NEED_CP, /* need to checkpoint */ + SBI_IS_SHUTDOWN, /* shutdown by ioctl */ + SBI_IS_RECOVERED, /* recovered orphan/data */ + SBI_CP_DISABLED, /* CP was disabled last mount */ + SBI_CP_DISABLED_QUICK, /* CP was disabled quickly */ + SBI_QUOTA_NEED_FLUSH, /* need to flush quota info in CP */ + SBI_QUOTA_SKIP_FLUSH, /* skip flushing quota in current CP */ + SBI_QUOTA_NEED_REPAIR, /* quota file may be corrupted */ + SBI_IS_RESIZEFS, /* resizefs is in process */ + SBI_IS_FREEZING, /* freezefs is in process */ + SBI_IS_WRITABLE, /* remove ro mountoption transiently */ + MAX_SBI_FLAG, +}; + +enum { + CP_TIME, + REQ_TIME, + DISCARD_TIME, + GC_TIME, + DISABLE_TIME, + ENABLE_TIME, + UMOUNT_DISCARD_TIMEOUT, + MAX_TIME, +}; + +/* Note that you need to keep synchronization with this gc_mode_names array */ +enum { + GC_NORMAL, + GC_IDLE_CB, + GC_IDLE_GREEDY, + GC_IDLE_AT, + GC_URGENT_HIGH, + GC_URGENT_LOW, + GC_URGENT_MID, + MAX_GC_MODE, +}; + +enum { + BGGC_MODE_ON, /* background gc is on */ + BGGC_MODE_OFF, /* background gc is off */ + BGGC_MODE_SYNC, /* + * background 
gc is on, migrating blocks + * like foreground gc + */ +}; + +enum { + FS_MODE_ADAPTIVE, /* use both lfs/ssr allocation */ + FS_MODE_LFS, /* use lfs allocation only */ + FS_MODE_FRAGMENT_SEG, /* segment fragmentation mode */ + FS_MODE_FRAGMENT_BLK, /* block fragmentation mode */ +}; + +enum { + ALLOC_MODE_DEFAULT, /* stay default */ + ALLOC_MODE_REUSE, /* reuse segments as much as possible */ +}; + +enum fsync_mode { + FSYNC_MODE_POSIX, /* fsync follows posix semantics */ + FSYNC_MODE_STRICT, /* fsync behaves in line with ext4 */ + FSYNC_MODE_NOBARRIER, /* fsync behaves nobarrier based on posix */ +}; + +enum { + COMPR_MODE_FS, /* + * automatically compress compression + * enabled files + */ + COMPR_MODE_USER, /* + * automatical compression is disabled. + * user can control the file compression + * using ioctls + */ +}; + +enum { + DISCARD_UNIT_BLOCK, /* basic discard unit is block */ + DISCARD_UNIT_SEGMENT, /* basic discard unit is segment */ + DISCARD_UNIT_SECTION, /* basic discard unit is section */ +}; + +enum { + MEMORY_MODE_NORMAL, /* memory mode for normal devices */ + MEMORY_MODE_LOW, /* memory mode for low memory devices */ +}; + +enum errors_option { + MOUNT_ERRORS_READONLY, /* remount fs ro on errors */ + MOUNT_ERRORS_CONTINUE, /* continue on errors */ + MOUNT_ERRORS_PANIC, /* panic on errors */ }; +enum { + BACKGROUND, + FOREGROUND, + MAX_CALL_TYPE, + TOTAL_CALL = FOREGROUND, +}; + +enum f2fs_lookup_mode { + LOOKUP_PERF, + LOOKUP_COMPAT, + LOOKUP_AUTO, +}; + +static inline int f2fs_test_bit(unsigned int nr, char *addr); +static inline void f2fs_set_bit(unsigned int nr, char *addr); +static inline void f2fs_clear_bit(unsigned int nr, char *addr); + +/* + * Layout of f2fs page.private: + * + * Layout A: lowest bit should be 1 + * | bit0 = 1 | bit1 | bit2 | ... | bit MAX | private data .... 
| + * bit 0 PAGE_PRIVATE_NOT_POINTER + * bit 1 PAGE_PRIVATE_ONGOING_MIGRATION + * bit 2 PAGE_PRIVATE_INLINE_INODE + * bit 3 PAGE_PRIVATE_REF_RESOURCE + * bit 4 PAGE_PRIVATE_ATOMIC_WRITE + * bit 5- f2fs private data + * + * Layout B: lowest bit should be 0 + * page.private is a wrapped pointer. + */ +enum { + PAGE_PRIVATE_NOT_POINTER, /* private contains non-pointer data */ + PAGE_PRIVATE_ONGOING_MIGRATION, /* data page which is on-going migrating */ + PAGE_PRIVATE_INLINE_INODE, /* inode page contains inline data */ + PAGE_PRIVATE_REF_RESOURCE, /* dirty page has referenced resources */ + PAGE_PRIVATE_ATOMIC_WRITE, /* data page from atomic write path */ + PAGE_PRIVATE_MAX +}; + +/* For compression */ +enum compress_algorithm_type { + COMPRESS_LZO, + COMPRESS_LZ4, + COMPRESS_ZSTD, + COMPRESS_LZORLE, + COMPRESS_MAX, +}; + +enum compress_flag { + COMPRESS_CHKSUM, + COMPRESS_MAX_FLAG, +}; + +#define COMPRESS_WATERMARK 20 +#define COMPRESS_PERCENT 20 + +#define COMPRESS_DATA_RESERVED_SIZE 4 +struct compress_data { + __le32 clen; /* compressed data size */ + __le32 chksum; /* compressed data checksum */ + __le32 reserved[COMPRESS_DATA_RESERVED_SIZE]; /* reserved */ + u8 cdata[]; /* compressed data */ +}; + +#define COMPRESS_HEADER_SIZE (sizeof(struct compress_data)) + +#define F2FS_COMPRESSED_PAGE_MAGIC 0xF5F2C000 + +#define F2FS_ZSTD_DEFAULT_CLEVEL 1 + +#define COMPRESS_LEVEL_OFFSET 8 + +/* compress context */ +struct compress_ctx { + struct inode *inode; /* inode the context belong to */ + pgoff_t cluster_idx; /* cluster index number */ + unsigned int cluster_size; /* page count in cluster */ + unsigned int log_cluster_size; /* log of cluster size */ + struct page **rpages; /* pages store raw data in cluster */ + unsigned int nr_rpages; /* total page number in rpages */ + struct page **cpages; /* pages store compressed data in cluster */ + unsigned int nr_cpages; /* total page number in cpages */ + unsigned int valid_nr_cpages; /* valid page number in cpages */ + void 
*rbuf; /* virtual mapped address on rpages */ + struct compress_data *cbuf; /* virtual mapped address on cpages */ + size_t rlen; /* valid data length in rbuf */ + size_t clen; /* valid data length in cbuf */ + void *private; /* payload buffer for specified compression algorithm */ + void *private2; /* extra payload buffer */ +}; + +/* compress context for write IO path */ +struct compress_io_ctx { + u32 magic; /* magic number to indicate page is compressed */ + struct inode *inode; /* inode the context belong to */ + struct page **rpages; /* pages store raw data in cluster */ + unsigned int nr_rpages; /* total page number in rpages */ + atomic_t pending_pages; /* in-flight compressed page count */ +}; + +/* Context for decompressing one cluster on the read IO path */ +struct decompress_io_ctx { + u32 magic; /* magic number to indicate page is compressed */ + struct inode *inode; /* inode the context belong to */ + struct f2fs_sb_info *sbi; /* f2fs_sb_info pointer */ + pgoff_t cluster_idx; /* cluster index number */ + unsigned int cluster_size; /* page count in cluster */ + unsigned int log_cluster_size; /* log of cluster size */ + struct page **rpages; /* pages store raw data in cluster */ + unsigned int nr_rpages; /* total page number in rpages */ + struct page **cpages; /* pages store compressed data in cluster */ + unsigned int nr_cpages; /* total page number in cpages */ + struct page **tpages; /* temp pages to pad holes in cluster */ + void *rbuf; /* virtual mapped address on rpages */ + struct compress_data *cbuf; /* virtual mapped address on cpages */ + size_t rlen; /* valid data length in rbuf */ + size_t clen; /* valid data length in cbuf */ + + /* + * The number of compressed pages remaining to be read in this cluster. + * This is initially nr_cpages. It is decremented by 1 each time a page + * has been read (or failed to be read). When it reaches 0, the cluster + * is decompressed (or an error is reported). 
+ * + * If an error occurs before all the pages have been submitted for I/O, + * then this will never reach 0. In this case the I/O submitter is + * responsible for calling f2fs_decompress_end_io() instead. + */ + atomic_t remaining_pages; + + /* + * Number of references to this decompress_io_ctx. + * + * One reference is held for I/O completion. This reference is dropped + * after the pagecache pages are updated and unlocked -- either after + * decompression (and verity if enabled), or after an error. + * + * In addition, each compressed page holds a reference while it is in a + * bio. These references are necessary prevent compressed pages from + * being freed while they are still in a bio. + */ + refcount_t refcnt; + + bool failed; /* IO error occurred before decompression? */ + bool need_verity; /* need fs-verity verification after decompression? */ + unsigned char compress_algorithm; /* backup algorithm type */ + void *private; /* payload buffer for specified decompression algorithm */ + void *private2; /* extra payload buffer */ + struct work_struct verity_work; /* work to verify the decompressed pages */ + struct work_struct free_work; /* work for late free this structure itself */ +}; + +#define NULL_CLUSTER ((unsigned int)(~0)) +#define MIN_COMPRESS_LOG_SIZE 2 +#define MAX_COMPRESS_LOG_SIZE 8 +#define MAX_COMPRESS_WINDOW_SIZE(log_size) ((PAGE_SIZE) << (log_size)) + struct f2fs_sb_info { struct super_block *sb; /* pointer to VFS super block */ - struct buffer_head *raw_super_buf; /* buffer head of raw sb */ + struct proc_dir_entry *s_proc; /* proc entry */ struct f2fs_super_block *raw_super; /* raw super block pointer */ - int s_dirty; /* dirty flag for checkpoint */ + struct f2fs_rwsem sb_lock; /* lock for raw super block */ + int valid_super_block; /* valid super block no */ + unsigned long s_flag; /* flags for sbi */ + struct mutex writepages; /* mutex for writepages() */ + +#ifdef CONFIG_BLK_DEV_ZONED + unsigned int blocks_per_blkz; /* F2FS blocks per 
zone */ + unsigned int unusable_blocks_per_sec; /* unusable blocks per section */ + unsigned int max_open_zones; /* max open zone resources of the zoned device */ + /* For adjust the priority writing position of data in zone UFS */ + unsigned int blkzone_alloc_policy; +#endif /* for node-related operations */ struct f2fs_nm_info *nm_info; /* node manager */ @@ -360,31 +1694,59 @@ struct f2fs_sb_info { /* for segment-related operations */ struct f2fs_sm_info *sm_info; /* segment manager */ - struct bio *bio[NR_PAGE_TYPE]; /* bios to merge */ - sector_t last_block_in_bio[NR_PAGE_TYPE]; /* last block number */ - struct rw_semaphore bio_sem; /* IO semaphore */ + + /* for bio operations */ + struct f2fs_bio_info *write_io[NR_PAGE_TYPE]; /* for write bios */ + /* keep migration IO order for LFS mode */ + struct f2fs_rwsem io_order_lock; + pgoff_t page_eio_ofs[NR_PAGE_TYPE]; /* EIO page offset */ + int page_eio_cnt[NR_PAGE_TYPE]; /* EIO count */ /* for checkpoint */ struct f2fs_checkpoint *ckpt; /* raw checkpoint pointer */ + int cur_cp_pack; /* remain current cp pack */ + spinlock_t cp_lock; /* for flag in ckpt */ struct inode *meta_inode; /* cache meta blocks */ - struct mutex cp_mutex; /* checkpoint procedure lock */ - struct mutex fs_lock[NR_GLOBAL_LOCKS]; /* blocking FS operations */ - struct mutex node_write; /* locking node writes */ - struct mutex writepages; /* mutex for writepages() */ - unsigned char next_lock_num; /* round-robin global locks */ - int por_doing; /* recovery is doing or not */ - int on_build_free_nids; /* build_free_nids is doing */ - - /* for orphan inode management */ - struct list_head orphan_inode_list; /* orphan inode list */ - struct mutex orphan_inode_mutex; /* for orphan inode list */ - unsigned int n_orphans; /* # of orphan inodes */ - - /* for directory inode management */ - struct list_head dir_inode_list; /* dir inode list */ - spinlock_t dir_inode_lock; /* for dir inode list lock */ - - /* basic file system units */ + struct 
f2fs_rwsem cp_global_sem; /* checkpoint procedure lock */ + struct f2fs_rwsem cp_rwsem; /* blocking FS operations */ + struct f2fs_rwsem node_write; /* locking node writes */ + struct f2fs_rwsem node_change; /* locking node change */ + wait_queue_head_t cp_wait; + unsigned long last_time[MAX_TIME]; /* to store time in jiffies */ + long interval_time[MAX_TIME]; /* to store thresholds */ + struct ckpt_req_control cprc_info; /* for checkpoint request control */ + struct cp_stats cp_stats; /* for time stat of checkpoint */ + struct f2fs_rwsem cp_enable_rwsem; /* block cache/dio write */ + + struct inode_management im[MAX_INO_ENTRY]; /* manage inode cache */ + + spinlock_t fsync_node_lock; /* for node entry lock */ + struct list_head fsync_node_list; /* node list head */ + unsigned int fsync_seg_id; /* sequence id */ + unsigned int fsync_node_num; /* number of node entries */ + + /* for orphan inode, use 0'th array */ + unsigned int max_orphans; /* max orphan inodes */ + + /* for inode management */ + struct list_head inode_list[NR_INODE_TYPE]; /* dirty inode list */ + spinlock_t inode_lock[NR_INODE_TYPE]; /* for dirty inode list lock */ + struct mutex flush_lock; /* for flush exclusion */ + + /* for extent tree cache */ + struct extent_tree_info extent_tree[NR_EXTENT_CACHES]; + atomic64_t allocated_data_blocks; /* for block age extent_cache */ + unsigned int max_read_extent_count; /* max read extent count per inode */ + + /* The threshold used for hot and warm data seperation*/ + unsigned int hot_data_age_threshold; + unsigned int warm_data_age_threshold; + unsigned int last_age_weight; + + /* control donate caches */ + unsigned int donate_files; + + /* basic filesystem units */ unsigned int log_sectors_per_block; /* log2 sectors per block */ unsigned int log_blocksize; /* log2 block size */ unsigned int blocksize; /* block size */ @@ -398,22 +1760,69 @@ struct f2fs_sb_info { unsigned int total_sections; /* total section count */ unsigned int total_node_count; /* total 
node block count */ unsigned int total_valid_node_count; /* valid node block count */ - unsigned int total_valid_inode_count; /* valid inode count */ - int active_logs; /* # of active logs */ + int dir_level; /* directory level */ + bool readdir_ra; /* readahead inode in readdir */ + u64 max_io_bytes; /* max io bytes to merge IOs */ block_t user_block_count; /* # of user blocks */ block_t total_valid_block_count; /* # of valid blocks */ - block_t alloc_valid_block_count; /* # of allocated blocks */ + block_t discard_blks; /* discard command candidats */ block_t last_valid_block_count; /* for recovery */ - u32 s_next_generation; /* for NFS support */ - atomic_t nr_pages[NR_COUNT_TYPE]; /* # of pages, see count_type */ + block_t reserved_blocks; /* configurable reserved blocks */ + block_t current_reserved_blocks; /* current reserved blocks */ + + /* Additional tracking for no checkpoint mode */ + block_t unusable_block_count; /* # of blocks saved by last cp */ + + unsigned int nquota_files; /* # of quota sysfile */ + struct f2fs_rwsem quota_sem; /* blocking cp for flags */ + struct task_struct *umount_lock_holder; /* s_umount lock holder */ + + /* # of pages, see count_type */ + atomic_t nr_pages[NR_COUNT_TYPE]; + /* # of allocated blocks */ + struct percpu_counter alloc_valid_block_count; + /* # of node block writes as roll forward recovery */ + struct percpu_counter rf_node_block_count; + + /* writeback control */ + atomic_t wb_sync_req[META]; /* count # of WB_SYNC threads */ + + /* valid inode count */ + struct percpu_counter total_valid_inode_count; struct f2fs_mount_info mount_opt; /* mount options */ /* for cleaning operations */ - struct mutex gc_mutex; /* mutex for GC */ + struct f2fs_rwsem gc_lock; /* + * semaphore for GC, avoid + * race between GC and GC or CP + */ struct f2fs_gc_kthread *gc_thread; /* GC thread */ + struct atgc_management am; /* atgc management */ unsigned int cur_victim_sec; /* current victim section num */ + unsigned int gc_mode; /* 
current GC state */ + unsigned int next_victim_seg[2]; /* next segment in victim section */ + spinlock_t gc_remaining_trials_lock; + /* remaining trial count for GC_URGENT_* and GC_IDLE_* */ + unsigned int gc_remaining_trials; + + /* for skip statistic */ + unsigned long long skipped_gc_rwsem; /* FG_GC only */ + + /* free sections reserved for pinned file */ + unsigned int reserved_pin_section; + + /* threshold for gc trials on pinned files */ + unsigned short gc_pin_file_threshold; + struct f2fs_rwsem pin_sem; + + /* maximum # of trials to find a victim segment for SSR and GC */ + unsigned int max_victim_search; + /* migration granularity of garbage collection, unit: segment */ + unsigned int migration_granularity; + /* migration window granularity of garbage collection, unit: segment */ + unsigned int migration_window_granularity; /* * for stat information. @@ -421,19 +1830,258 @@ struct f2fs_sb_info { */ #ifdef CONFIG_F2FS_STAT_FS struct f2fs_stat_info *stat_info; /* FS status information */ + atomic_t meta_count[META_MAX]; /* # of meta blocks */ unsigned int segment_count[2]; /* # of allocated segments */ unsigned int block_count[2]; /* # of allocated blocks */ - int total_hit_ext, read_hit_ext; /* extent cache hit ratio */ - int bg_gc; /* background gc calls */ - unsigned int n_dirty_dirs; /* # of dir inodes */ + atomic_t inplace_count; /* # of inplace update */ + /* # of lookup extent cache */ + atomic64_t total_hit_ext[NR_EXTENT_CACHES]; + /* # of hit rbtree extent node */ + atomic64_t read_hit_rbtree[NR_EXTENT_CACHES]; + /* # of hit cached extent node */ + atomic64_t read_hit_cached[NR_EXTENT_CACHES]; + /* # of hit largest extent node in read extent cache */ + atomic64_t read_hit_largest; + atomic_t inline_xattr; /* # of inline_xattr inodes */ + atomic_t inline_inode; /* # of inline_data inodes */ + atomic_t inline_dir; /* # of inline_dentry inodes */ + atomic_t compr_inode; /* # of compressed inodes */ + atomic64_t compr_blocks; /* # of compressed blocks 
*/ + atomic_t swapfile_inode; /* # of swapfile inodes */ + atomic_t atomic_files; /* # of opened atomic file */ + atomic_t max_aw_cnt; /* max # of atomic writes */ + unsigned int io_skip_bggc; /* skip background gc for in-flight IO */ + unsigned int other_skip_bggc; /* skip background gc for other reasons */ + unsigned int ndirty_inode[NR_INODE_TYPE]; /* # of dirty inodes */ + atomic_t cp_call_count[MAX_CALL_TYPE]; /* # of cp call */ #endif - unsigned int last_victim[2]; /* last victim segment # */ spinlock_t stat_lock; /* lock for stat operations */ + + /* to attach REQ_META|REQ_FUA flags */ + unsigned int data_io_flag; + unsigned int node_io_flag; + + /* For sysfs support */ + struct kobject s_kobj; /* /sys/fs/f2fs/<devname> */ + struct completion s_kobj_unregister; + + struct kobject s_stat_kobj; /* /sys/fs/f2fs/<devname>/stat */ + struct completion s_stat_kobj_unregister; + + struct kobject s_feature_list_kobj; /* /sys/fs/f2fs/<devname>/feature_list */ + struct completion s_feature_list_kobj_unregister; + + /* For shrinker support */ + struct list_head s_list; + struct mutex umount_mutex; + unsigned int shrinker_run_no; + + /* For multi devices */ + int s_ndevs; /* number of devices */ + struct f2fs_dev_info *devs; /* for device list */ + unsigned int dirty_device; /* for checkpoint data flush */ + spinlock_t dev_lock; /* protect dirty_device */ + bool aligned_blksize; /* all devices has the same logical blksize */ + unsigned int first_seq_zone_segno; /* first segno in sequential zone */ + unsigned int bggc_io_aware; /* For adjust the BG_GC priority when pending IO */ + unsigned int allocate_section_hint; /* the boundary position between devices */ + unsigned int allocate_section_policy; /* determine the section writing priority */ + + /* For write statistics */ + u64 sectors_written_start; + u64 kbytes_written; + + /* Precomputed FS UUID checksum for seeding other checksums */ + __u32 s_chksum_seed; + + struct workqueue_struct *post_read_wq; /* post read 
workqueue */ + + /* + * If we are in irq context, let's update error information into + * on-disk superblock in the work. + */ + struct work_struct s_error_work; + unsigned char errors[MAX_F2FS_ERRORS]; /* error flags */ + unsigned char stop_reason[MAX_STOP_REASON]; /* stop reason */ + spinlock_t error_lock; /* protect errors/stop_reason array */ + bool error_dirty; /* errors of sb is dirty */ + + /* For reclaimed segs statistics per each GC mode */ + unsigned int gc_segment_mode; /* GC state for reclaimed segments */ + unsigned int gc_reclaimed_segs[MAX_GC_MODE]; /* Reclaimed segs for each mode */ + + unsigned long seq_file_ra_mul; /* multiplier for ra_pages of seq. files in fadvise */ + + int max_fragment_chunk; /* max chunk size for block fragmentation mode */ + int max_fragment_hole; /* max hole size for block fragmentation mode */ + + /* For atomic write statistics */ + atomic64_t current_atomic_write; + s64 peak_atomic_write; + u64 committed_atomic_block; + u64 revoked_atomic_block; + + /* carve out reserved_blocks from total blocks */ + bool carve_out; + +#ifdef CONFIG_F2FS_FS_COMPRESSION + struct kmem_cache *page_array_slab; /* page array entry */ + unsigned int page_array_slab_size; /* default page array slab size */ + + /* For runtime compression statistics */ + u64 compr_written_block; + u64 compr_saved_block; + u32 compr_new_inode; + + /* For compressed block cache */ + struct inode *compress_inode; /* cache compressed blocks */ + unsigned int compress_percent; /* cache page percentage */ + unsigned int compress_watermark; /* cache page watermark */ + atomic_t compress_page_hit; /* cache hit count */ +#endif + +#ifdef CONFIG_F2FS_IOSTAT + /* For app/fs IO statistics */ + spinlock_t iostat_lock; + unsigned long long iostat_count[NR_IO_TYPE]; + unsigned long long iostat_bytes[NR_IO_TYPE]; + unsigned long long prev_iostat_bytes[NR_IO_TYPE]; + bool iostat_enable; + unsigned long iostat_next_period; + unsigned int iostat_period_ms; + + /* For io latency 
related statistics info in one iostat period */ + spinlock_t iostat_lat_lock; + struct iostat_lat_info *iostat_io_lat; +#endif }; +/* Definitions to access f2fs_sb_info */ +#define SEGS_TO_BLKS(sbi, segs) \ + ((segs) << (sbi)->log_blocks_per_seg) +#define BLKS_TO_SEGS(sbi, blks) \ + ((blks) >> (sbi)->log_blocks_per_seg) + +#define BLKS_PER_SEG(sbi) ((sbi)->blocks_per_seg) +#define BLKS_PER_SEC(sbi) (SEGS_TO_BLKS(sbi, (sbi)->segs_per_sec)) +#define SEGS_PER_SEC(sbi) ((sbi)->segs_per_sec) + +__printf(3, 4) +void f2fs_printk(struct f2fs_sb_info *sbi, bool limit_rate, const char *fmt, ...); + +#define f2fs_err(sbi, fmt, ...) \ + f2fs_printk(sbi, false, KERN_ERR fmt, ##__VA_ARGS__) +#define f2fs_warn(sbi, fmt, ...) \ + f2fs_printk(sbi, false, KERN_WARNING fmt, ##__VA_ARGS__) +#define f2fs_notice(sbi, fmt, ...) \ + f2fs_printk(sbi, false, KERN_NOTICE fmt, ##__VA_ARGS__) +#define f2fs_info(sbi, fmt, ...) \ + f2fs_printk(sbi, false, KERN_INFO fmt, ##__VA_ARGS__) +#define f2fs_debug(sbi, fmt, ...) \ + f2fs_printk(sbi, false, KERN_DEBUG fmt, ##__VA_ARGS__) + +#define f2fs_err_ratelimited(sbi, fmt, ...) \ + f2fs_printk(sbi, true, KERN_ERR fmt, ##__VA_ARGS__) +#define f2fs_warn_ratelimited(sbi, fmt, ...) \ + f2fs_printk(sbi, true, KERN_WARNING fmt, ##__VA_ARGS__) +#define f2fs_info_ratelimited(sbi, fmt, ...) 
\ + f2fs_printk(sbi, true, KERN_INFO fmt, ##__VA_ARGS__) + +#ifdef CONFIG_F2FS_FAULT_INJECTION +#define time_to_inject(sbi, type) __time_to_inject(sbi, type, __func__, \ + __builtin_return_address(0)) +static inline bool __time_to_inject(struct f2fs_sb_info *sbi, int type, + const char *func, const char *parent_func) +{ + struct f2fs_fault_info *ffi = &F2FS_OPTION(sbi).fault_info; + + if (!ffi->inject_rate) + return false; + + if (!IS_FAULT_SET(ffi, type)) + return false; + + atomic_inc(&ffi->inject_ops); + if (atomic_read(&ffi->inject_ops) >= ffi->inject_rate) { + atomic_set(&ffi->inject_ops, 0); + ffi->inject_count[type]++; + f2fs_info_ratelimited(sbi, "inject %s in %s of %pS", + f2fs_fault_name[type], func, parent_func); + return true; + } + return false; +} +#else +static inline bool time_to_inject(struct f2fs_sb_info *sbi, int type) +{ + return false; +} +#endif + +/* + * Test if the mounted volume is a multi-device volume. + * - For a single regular disk volume, sbi->s_ndevs is 0. + * - For a single zoned disk volume, sbi->s_ndevs is 1. + * - For a multi-device volume, sbi->s_ndevs is always 2 or more. 
+ */ +static inline bool f2fs_is_multi_device(struct f2fs_sb_info *sbi) +{ + return sbi->s_ndevs > 1; +} + +static inline void f2fs_update_time(struct f2fs_sb_info *sbi, int type) +{ + unsigned long now = jiffies; + + sbi->last_time[type] = now; + + /* DISCARD_TIME and GC_TIME are based on REQ_TIME */ + if (type == REQ_TIME) { + sbi->last_time[DISCARD_TIME] = now; + sbi->last_time[GC_TIME] = now; + } +} + +static inline bool f2fs_time_over(struct f2fs_sb_info *sbi, int type) +{ + unsigned long interval = sbi->interval_time[type] * HZ; + + return time_after(jiffies, sbi->last_time[type] + interval); +} + +static inline unsigned int f2fs_time_to_wait(struct f2fs_sb_info *sbi, + int type) +{ + unsigned long interval = sbi->interval_time[type] * HZ; + unsigned int wait_ms = 0; + long delta; + + delta = (sbi->last_time[type] + interval) - jiffies; + if (delta > 0) + wait_ms = jiffies_to_msecs(delta); + + return wait_ms; +} + /* * Inline functions */ +static inline u32 __f2fs_crc32(u32 crc, const void *address, + unsigned int length) +{ + return crc32(crc, address, length); +} + +static inline u32 f2fs_crc32(const void *address, unsigned int length) +{ + return __f2fs_crc32(F2FS_SUPER_MAGIC, address, length); +} + +static inline u32 f2fs_chksum(u32 crc, const void *address, unsigned int length) +{ + return __f2fs_crc32(crc, address, length); +} + static inline struct f2fs_inode_info *F2FS_I(struct inode *inode) { return container_of(inode, struct f2fs_inode_info, vfs_inode); @@ -444,16 +2092,51 @@ static inline struct f2fs_sb_info *F2FS_SB(struct super_block *sb) return sb->s_fs_info; } +static inline struct f2fs_sb_info *F2FS_I_SB(struct inode *inode) +{ + return F2FS_SB(inode->i_sb); +} + +static inline struct f2fs_sb_info *F2FS_M_SB(struct address_space *mapping) +{ + return F2FS_I_SB(mapping->host); +} + +static inline struct f2fs_sb_info *F2FS_F_SB(const struct folio *folio) +{ + return F2FS_M_SB(folio->mapping); +} + static inline struct f2fs_super_block 
*F2FS_RAW_SUPER(struct f2fs_sb_info *sbi) { return (struct f2fs_super_block *)(sbi->raw_super); } +static inline struct f2fs_super_block *F2FS_SUPER_BLOCK(struct folio *folio, + pgoff_t index) +{ + pgoff_t idx_in_folio = index % folio_nr_pages(folio); + + return (struct f2fs_super_block *) + (page_address(folio_page(folio, idx_in_folio)) + + F2FS_SUPER_OFFSET); +} + static inline struct f2fs_checkpoint *F2FS_CKPT(struct f2fs_sb_info *sbi) { return (struct f2fs_checkpoint *)(sbi->ckpt); } +static inline struct f2fs_node *F2FS_NODE(const struct folio *folio) +{ + return (struct f2fs_node *)folio_address(folio); +} + +static inline struct f2fs_inode *F2FS_INODE(const struct folio *folio) +{ + return &((struct f2fs_node *)folio_address(folio))->i; +} + static inline struct f2fs_nm_info *NM_I(struct f2fs_sb_info *sbi) { return (struct f2fs_nm_info *)(sbi->nm_info); @@ -479,145 +2162,515 @@ static inline struct dirty_seglist_info *DIRTY_I(struct f2fs_sb_info *sbi) return (struct dirty_seglist_info *)(SM_I(sbi)->dirty_info); } -static inline void F2FS_SET_SB_DIRT(struct f2fs_sb_info *sbi) +static inline struct address_space *META_MAPPING(struct f2fs_sb_info *sbi) +{ + return sbi->meta_inode->i_mapping; +} + +static inline struct address_space *NODE_MAPPING(struct f2fs_sb_info *sbi) +{ + return sbi->node_inode->i_mapping; +} + +static inline bool is_meta_folio(struct folio *folio) +{ + return folio->mapping == META_MAPPING(F2FS_F_SB(folio)); +} + +static inline bool is_node_folio(struct folio *folio) +{ + return folio->mapping == NODE_MAPPING(F2FS_F_SB(folio)); +} + +static inline bool is_sbi_flag_set(struct f2fs_sb_info *sbi, unsigned int type) +{ + return test_bit(type, &sbi->s_flag); +} + +static inline void set_sbi_flag(struct f2fs_sb_info *sbi, unsigned int type) { - sbi->s_dirty = 1; + set_bit(type, &sbi->s_flag); } -static inline void F2FS_RESET_SB_DIRT(struct f2fs_sb_info *sbi) +static inline void clear_sbi_flag(struct f2fs_sb_info *sbi, unsigned int type) { - 
sbi->s_dirty = 0; + clear_bit(type, &sbi->s_flag); } -static inline bool is_set_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f) +static inline unsigned long long cur_cp_version(struct f2fs_checkpoint *cp) +{ + return le64_to_cpu(cp->checkpoint_ver); +} + +static inline unsigned long f2fs_qf_ino(struct super_block *sb, int type) +{ + if (type < F2FS_MAX_QUOTAS) + return le32_to_cpu(F2FS_SB(sb)->raw_super->qf_ino[type]); + return 0; +} + +static inline __u64 cur_cp_crc(struct f2fs_checkpoint *cp) +{ + size_t crc_offset = le32_to_cpu(cp->checksum_offset); + return le32_to_cpu(*((__le32 *)((unsigned char *)cp + crc_offset))); +} + +static inline bool __is_set_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f) { unsigned int ckpt_flags = le32_to_cpu(cp->ckpt_flags); + return ckpt_flags & f; } -static inline void set_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f) +static inline bool is_set_ckpt_flags(struct f2fs_sb_info *sbi, unsigned int f) { - unsigned int ckpt_flags = le32_to_cpu(cp->ckpt_flags); + return __is_set_ckpt_flags(F2FS_CKPT(sbi), f); +} + +static inline void __set_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f) +{ + unsigned int ckpt_flags; + + ckpt_flags = le32_to_cpu(cp->ckpt_flags); ckpt_flags |= f; cp->ckpt_flags = cpu_to_le32(ckpt_flags); } -static inline void clear_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f) +static inline void set_ckpt_flags(struct f2fs_sb_info *sbi, unsigned int f) { - unsigned int ckpt_flags = le32_to_cpu(cp->ckpt_flags); + unsigned long flags; + + spin_lock_irqsave(&sbi->cp_lock, flags); + __set_ckpt_flags(F2FS_CKPT(sbi), f); + spin_unlock_irqrestore(&sbi->cp_lock, flags); +} + +static inline void __clear_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f) +{ + unsigned int ckpt_flags; + + ckpt_flags = le32_to_cpu(cp->ckpt_flags); ckpt_flags &= (~f); cp->ckpt_flags = cpu_to_le32(ckpt_flags); } -static inline void mutex_lock_all(struct f2fs_sb_info *sbi) +static inline void 
clear_ckpt_flags(struct f2fs_sb_info *sbi, unsigned int f) { - int i; + unsigned long flags; - for (i = 0; i < NR_GLOBAL_LOCKS; i++) { - /* - * This is the only time we take multiple fs_lock[] - * instances; the order is immaterial since we - * always hold cp_mutex, which serializes multiple - * such operations. - */ - mutex_lock_nest_lock(&sbi->fs_lock[i], &sbi->cp_mutex); - } + spin_lock_irqsave(&sbi->cp_lock, flags); + __clear_ckpt_flags(F2FS_CKPT(sbi), f); + spin_unlock_irqrestore(&sbi->cp_lock, flags); } -static inline void mutex_unlock_all(struct f2fs_sb_info *sbi) +#define init_f2fs_rwsem(sem) \ +do { \ + static struct lock_class_key __key; \ + \ + __init_f2fs_rwsem((sem), #sem, &__key); \ +} while (0) + +static inline void __init_f2fs_rwsem(struct f2fs_rwsem *sem, + const char *sem_name, struct lock_class_key *key) { - int i = 0; - for (; i < NR_GLOBAL_LOCKS; i++) - mutex_unlock(&sbi->fs_lock[i]); + __init_rwsem(&sem->internal_rwsem, sem_name, key); +#ifdef CONFIG_F2FS_UNFAIR_RWSEM + init_waitqueue_head(&sem->read_waiters); +#endif } -static inline int mutex_lock_op(struct f2fs_sb_info *sbi) +static inline int f2fs_rwsem_is_locked(struct f2fs_rwsem *sem) { - unsigned char next_lock = sbi->next_lock_num % NR_GLOBAL_LOCKS; - int i = 0; + return rwsem_is_locked(&sem->internal_rwsem); +} - for (; i < NR_GLOBAL_LOCKS; i++) - if (mutex_trylock(&sbi->fs_lock[i])) - return i; +static inline int f2fs_rwsem_is_contended(struct f2fs_rwsem *sem) +{ + return rwsem_is_contended(&sem->internal_rwsem); +} - mutex_lock(&sbi->fs_lock[next_lock]); - sbi->next_lock_num++; - return next_lock; +static inline void f2fs_down_read(struct f2fs_rwsem *sem) +{ +#ifdef CONFIG_F2FS_UNFAIR_RWSEM + wait_event(sem->read_waiters, down_read_trylock(&sem->internal_rwsem)); +#else + down_read(&sem->internal_rwsem); +#endif } -static inline void mutex_unlock_op(struct f2fs_sb_info *sbi, int ilock) +static inline int f2fs_down_read_trylock(struct f2fs_rwsem *sem) { - if (ilock < 0) - return; - 
BUG_ON(ilock >= NR_GLOBAL_LOCKS); - mutex_unlock(&sbi->fs_lock[ilock]); + return down_read_trylock(&sem->internal_rwsem); } -/* - * Check whether the given nid is within node id range. - */ -static inline int check_nid_range(struct f2fs_sb_info *sbi, nid_t nid) +static inline void f2fs_up_read(struct f2fs_rwsem *sem) { - WARN_ON((nid >= NM_I(sbi)->max_nid)); - if (nid >= NM_I(sbi)->max_nid) - return -EINVAL; - return 0; + up_read(&sem->internal_rwsem); +} + +static inline void f2fs_down_write(struct f2fs_rwsem *sem) +{ + down_write(&sem->internal_rwsem); +} + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +static inline void f2fs_down_read_nested(struct f2fs_rwsem *sem, int subclass) +{ + down_read_nested(&sem->internal_rwsem, subclass); } -#define F2FS_DEFAULT_ALLOCATED_BLOCKS 1 +static inline void f2fs_down_write_nested(struct f2fs_rwsem *sem, int subclass) +{ + down_write_nested(&sem->internal_rwsem, subclass); +} +#else +#define f2fs_down_read_nested(sem, subclass) f2fs_down_read(sem) +#define f2fs_down_write_nested(sem, subclass) f2fs_down_write(sem) +#endif + +static inline int f2fs_down_write_trylock(struct f2fs_rwsem *sem) +{ + return down_write_trylock(&sem->internal_rwsem); +} + +static inline void f2fs_up_write(struct f2fs_rwsem *sem) +{ + up_write(&sem->internal_rwsem); +#ifdef CONFIG_F2FS_UNFAIR_RWSEM + wake_up_all(&sem->read_waiters); +#endif +} + +static inline void disable_nat_bits(struct f2fs_sb_info *sbi, bool lock) +{ + unsigned long flags; + unsigned char *nat_bits; + + /* + * In order to re-enable nat_bits we need to call fsck.f2fs by + * set_sbi_flag(sbi, SBI_NEED_FSCK). But it may give huge cost, + * so let's rely on regular fsck or unclean shutdown. 
+ */ + + if (lock) + spin_lock_irqsave(&sbi->cp_lock, flags); + __clear_ckpt_flags(F2FS_CKPT(sbi), CP_NAT_BITS_FLAG); + nat_bits = NM_I(sbi)->nat_bits; + NM_I(sbi)->nat_bits = NULL; + if (lock) + spin_unlock_irqrestore(&sbi->cp_lock, flags); + + kvfree(nat_bits); +} + +static inline bool enabled_nat_bits(struct f2fs_sb_info *sbi, + struct cp_control *cpc) +{ + bool set = is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG); + + return (cpc) ? (cpc->reason & CP_UMOUNT) && set : set; +} + +static inline void f2fs_lock_op(struct f2fs_sb_info *sbi) +{ + f2fs_down_read(&sbi->cp_rwsem); +} + +static inline int f2fs_trylock_op(struct f2fs_sb_info *sbi) +{ + if (time_to_inject(sbi, FAULT_LOCK_OP)) + return 0; + return f2fs_down_read_trylock(&sbi->cp_rwsem); +} + +static inline void f2fs_unlock_op(struct f2fs_sb_info *sbi) +{ + f2fs_up_read(&sbi->cp_rwsem); +} + +static inline void f2fs_lock_all(struct f2fs_sb_info *sbi) +{ + f2fs_down_write(&sbi->cp_rwsem); +} + +static inline void f2fs_unlock_all(struct f2fs_sb_info *sbi) +{ + f2fs_up_write(&sbi->cp_rwsem); +} + +static inline int __get_cp_reason(struct f2fs_sb_info *sbi) +{ + int reason = CP_SYNC; + + if (test_opt(sbi, FASTBOOT)) + reason = CP_FASTBOOT; + if (is_sbi_flag_set(sbi, SBI_IS_CLOSE)) + reason = CP_UMOUNT; + return reason; +} + +static inline bool __remain_node_summaries(int reason) +{ + return (reason & (CP_UMOUNT | CP_FASTBOOT)); +} + +static inline bool __exist_node_summaries(struct f2fs_sb_info *sbi) +{ + return (is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG) || + is_set_ckpt_flags(sbi, CP_FASTBOOT_FLAG)); +} /* * Check whether the inode has blocks or not */ static inline int F2FS_HAS_BLOCKS(struct inode *inode) { - if (F2FS_I(inode)->i_xattr_nid) - return (inode->i_blocks > F2FS_DEFAULT_ALLOCATED_BLOCKS + 1); - else - return (inode->i_blocks > F2FS_DEFAULT_ALLOCATED_BLOCKS); + block_t xattr_block = F2FS_I(inode)->i_xattr_nid ? 
1 : 0; + + return (inode->i_blocks >> F2FS_LOG_SECTORS_PER_BLOCK) > xattr_block; } -static inline bool inc_valid_block_count(struct f2fs_sb_info *sbi, - struct inode *inode, blkcnt_t count) +static inline bool f2fs_has_xattr_block(unsigned int ofs) { - block_t valid_block_count; + return ofs == XATTR_NODE_OFFSET; +} + +static inline bool __allow_reserved_root(struct f2fs_sb_info *sbi, + struct inode *inode, bool cap) +{ + if (!inode) + return true; + if (IS_NOQUOTA(inode)) + return true; + if (uid_eq(F2FS_OPTION(sbi).s_resuid, current_fsuid())) + return true; + if (!gid_eq(F2FS_OPTION(sbi).s_resgid, GLOBAL_ROOT_GID) && + in_group_p(F2FS_OPTION(sbi).s_resgid)) + return true; + if (cap && capable(CAP_SYS_RESOURCE)) + return true; + return false; +} + +static inline unsigned int get_available_block_count(struct f2fs_sb_info *sbi, + struct inode *inode, bool cap) +{ + block_t avail_user_block_count; + + avail_user_block_count = sbi->user_block_count - + sbi->current_reserved_blocks; + + if (test_opt(sbi, RESERVE_ROOT) && !__allow_reserved_root(sbi, inode, cap)) + avail_user_block_count -= F2FS_OPTION(sbi).root_reserved_blocks; + + if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) { + if (avail_user_block_count > sbi->unusable_block_count) + avail_user_block_count -= sbi->unusable_block_count; + else + avail_user_block_count = 0; + } + + return avail_user_block_count; +} + +static inline void f2fs_i_blocks_write(struct inode *, block_t, bool, bool); +static inline int inc_valid_block_count(struct f2fs_sb_info *sbi, + struct inode *inode, blkcnt_t *count, bool partial) +{ + long long diff = 0, release = 0; + block_t avail_user_block_count; + int ret; + + ret = dquot_reserve_block(inode, *count); + if (ret) + return ret; + + if (time_to_inject(sbi, FAULT_BLOCK)) { + release = *count; + goto release_quota; + } + + /* + * let's increase this in prior to actual block count change in order + * for f2fs_sync_file to avoid data races when deciding checkpoint. 
+ */ + percpu_counter_add(&sbi->alloc_valid_block_count, (*count)); spin_lock(&sbi->stat_lock); - valid_block_count = - sbi->total_valid_block_count + (block_t)count; - if (valid_block_count > sbi->user_block_count) { - spin_unlock(&sbi->stat_lock); - return false; + + avail_user_block_count = get_available_block_count(sbi, inode, true); + diff = (long long)sbi->total_valid_block_count + *count - + avail_user_block_count; + if (unlikely(diff > 0)) { + if (!partial) { + spin_unlock(&sbi->stat_lock); + release = *count; + goto enospc; + } + if (diff > *count) + diff = *count; + *count -= diff; + release = diff; + if (!*count) { + spin_unlock(&sbi->stat_lock); + goto enospc; + } } - inode->i_blocks += count; - sbi->total_valid_block_count = valid_block_count; - sbi->alloc_valid_block_count += (block_t)count; + sbi->total_valid_block_count += (block_t)(*count); + spin_unlock(&sbi->stat_lock); - return true; + + if (unlikely(release)) { + percpu_counter_sub(&sbi->alloc_valid_block_count, release); + dquot_release_reservation_block(inode, release); + } + f2fs_i_blocks_write(inode, *count, true, true); + return 0; + +enospc: + percpu_counter_sub(&sbi->alloc_valid_block_count, release); +release_quota: + dquot_release_reservation_block(inode, release); + return -ENOSPC; +} + +#define PAGE_PRIVATE_GET_FUNC(name, flagname) \ +static inline bool folio_test_f2fs_##name(const struct folio *folio) \ +{ \ + unsigned long priv = (unsigned long)folio->private; \ + unsigned long v = (1UL << PAGE_PRIVATE_NOT_POINTER) | \ + (1UL << PAGE_PRIVATE_##flagname); \ + return (priv & v) == v; \ +} \ +static inline bool page_private_##name(struct page *page) \ +{ \ + return PagePrivate(page) && \ + test_bit(PAGE_PRIVATE_NOT_POINTER, &page_private(page)) && \ + test_bit(PAGE_PRIVATE_##flagname, &page_private(page)); \ +} + +#define PAGE_PRIVATE_SET_FUNC(name, flagname) \ +static inline void folio_set_f2fs_##name(struct folio *folio) \ +{ \ + unsigned long v = (1UL << PAGE_PRIVATE_NOT_POINTER) | 
\ + (1UL << PAGE_PRIVATE_##flagname); \ + if (!folio->private) \ + folio_attach_private(folio, (void *)v); \ + else { \ + v |= (unsigned long)folio->private; \ + folio->private = (void *)v; \ + } \ +} \ +static inline void set_page_private_##name(struct page *page) \ +{ \ + if (!PagePrivate(page)) \ + attach_page_private(page, (void *)0); \ + set_bit(PAGE_PRIVATE_NOT_POINTER, &page_private(page)); \ + set_bit(PAGE_PRIVATE_##flagname, &page_private(page)); \ +} + +#define PAGE_PRIVATE_CLEAR_FUNC(name, flagname) \ +static inline void folio_clear_f2fs_##name(struct folio *folio) \ +{ \ + unsigned long v = (unsigned long)folio->private; \ + \ + v &= ~(1UL << PAGE_PRIVATE_##flagname); \ + if (v == (1UL << PAGE_PRIVATE_NOT_POINTER)) \ + folio_detach_private(folio); \ + else \ + folio->private = (void *)v; \ +} \ +static inline void clear_page_private_##name(struct page *page) \ +{ \ + clear_bit(PAGE_PRIVATE_##flagname, &page_private(page)); \ + if (page_private(page) == BIT(PAGE_PRIVATE_NOT_POINTER)) \ + detach_page_private(page); \ +} + +PAGE_PRIVATE_GET_FUNC(nonpointer, NOT_POINTER); +PAGE_PRIVATE_GET_FUNC(inline, INLINE_INODE); +PAGE_PRIVATE_GET_FUNC(gcing, ONGOING_MIGRATION); +PAGE_PRIVATE_GET_FUNC(atomic, ATOMIC_WRITE); + +PAGE_PRIVATE_SET_FUNC(reference, REF_RESOURCE); +PAGE_PRIVATE_SET_FUNC(inline, INLINE_INODE); +PAGE_PRIVATE_SET_FUNC(gcing, ONGOING_MIGRATION); +PAGE_PRIVATE_SET_FUNC(atomic, ATOMIC_WRITE); + +PAGE_PRIVATE_CLEAR_FUNC(reference, REF_RESOURCE); +PAGE_PRIVATE_CLEAR_FUNC(inline, INLINE_INODE); +PAGE_PRIVATE_CLEAR_FUNC(gcing, ONGOING_MIGRATION); +PAGE_PRIVATE_CLEAR_FUNC(atomic, ATOMIC_WRITE); + +static inline unsigned long folio_get_f2fs_data(struct folio *folio) +{ + unsigned long data = (unsigned long)folio->private; + + if (!test_bit(PAGE_PRIVATE_NOT_POINTER, &data)) + return 0; + return data >> PAGE_PRIVATE_MAX; } -static inline int dec_valid_block_count(struct f2fs_sb_info *sbi, +static inline void folio_set_f2fs_data(struct folio *folio, unsigned 
long data) +{ + data = (1UL << PAGE_PRIVATE_NOT_POINTER) | (data << PAGE_PRIVATE_MAX); + + if (!folio_test_private(folio)) + folio_attach_private(folio, (void *)data); + else + folio->private = (void *)((unsigned long)folio->private | data); +} + +static inline void dec_valid_block_count(struct f2fs_sb_info *sbi, struct inode *inode, - blkcnt_t count) + block_t count) { + blkcnt_t sectors = count << F2FS_LOG_SECTORS_PER_BLOCK; + spin_lock(&sbi->stat_lock); - BUG_ON(sbi->total_valid_block_count < (block_t) count); - BUG_ON(inode->i_blocks < count); - inode->i_blocks -= count; - sbi->total_valid_block_count -= (block_t)count; + if (unlikely(sbi->total_valid_block_count < count)) { + f2fs_warn(sbi, "Inconsistent total_valid_block_count:%u, ino:%lu, count:%u", + sbi->total_valid_block_count, inode->i_ino, count); + sbi->total_valid_block_count = 0; + set_sbi_flag(sbi, SBI_NEED_FSCK); + } else { + sbi->total_valid_block_count -= count; + } + if (sbi->reserved_blocks && + sbi->current_reserved_blocks < sbi->reserved_blocks) + sbi->current_reserved_blocks = min(sbi->reserved_blocks, + sbi->current_reserved_blocks + count); spin_unlock(&sbi->stat_lock); - return 0; + if (unlikely(inode->i_blocks < sectors)) { + f2fs_warn(sbi, "Inconsistent i_blocks, ino:%lu, iblocks:%llu, sectors:%llu", + inode->i_ino, + (unsigned long long)inode->i_blocks, + (unsigned long long)sectors); + set_sbi_flag(sbi, SBI_NEED_FSCK); + return; + } + f2fs_i_blocks_write(inode, count, false, true); } static inline void inc_page_count(struct f2fs_sb_info *sbi, int count_type) { atomic_inc(&sbi->nr_pages[count_type]); - F2FS_SET_SB_DIRT(sbi); + + if (count_type == F2FS_DIRTY_DENTS || + count_type == F2FS_DIRTY_NODES || + count_type == F2FS_DIRTY_META || + count_type == F2FS_DIRTY_QDATA || + count_type == F2FS_DIRTY_IMETA) + set_sbi_flag(sbi, SBI_IS_DIRTY); } -static inline void inode_inc_dirty_dents(struct inode *inode) +static inline void inode_inc_dirty_pages(struct inode *inode) { - 
atomic_inc(&F2FS_I(inode)->dirty_dents); + atomic_inc(&F2FS_I(inode)->dirty_pages); + inc_page_count(F2FS_I_SB(inode), S_ISDIR(inode->i_mode) ? + F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA); + if (IS_NOQUOTA(inode)) + inc_page_count(F2FS_I_SB(inode), F2FS_DIRTY_QDATA); } static inline void dec_page_count(struct f2fs_sb_info *sbi, int count_type) @@ -625,31 +2678,65 @@ static inline void dec_page_count(struct f2fs_sb_info *sbi, int count_type) atomic_dec(&sbi->nr_pages[count_type]); } -static inline void inode_dec_dirty_dents(struct inode *inode) +static inline void inode_dec_dirty_pages(struct inode *inode) +{ + if (!S_ISDIR(inode->i_mode) && !S_ISREG(inode->i_mode) && + !S_ISLNK(inode->i_mode)) + return; + + atomic_dec(&F2FS_I(inode)->dirty_pages); + dec_page_count(F2FS_I_SB(inode), S_ISDIR(inode->i_mode) ? + F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA); + if (IS_NOQUOTA(inode)) + dec_page_count(F2FS_I_SB(inode), F2FS_DIRTY_QDATA); +} + +static inline void inc_atomic_write_cnt(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct f2fs_inode_info *fi = F2FS_I(inode); + u64 current_write; + + fi->atomic_write_cnt++; + atomic64_inc(&sbi->current_atomic_write); + current_write = atomic64_read(&sbi->current_atomic_write); + if (current_write > sbi->peak_atomic_write) + sbi->peak_atomic_write = current_write; +} + +static inline void release_atomic_write_cnt(struct inode *inode) { - atomic_dec(&F2FS_I(inode)->dirty_dents); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct f2fs_inode_info *fi = F2FS_I(inode); + + atomic64_sub(fi->atomic_write_cnt, &sbi->current_atomic_write); + fi->atomic_write_cnt = 0; } -static inline int get_pages(struct f2fs_sb_info *sbi, int count_type) +static inline s64 get_pages(struct f2fs_sb_info *sbi, int count_type) { return atomic_read(&sbi->nr_pages[count_type]); } +static inline int get_dirty_pages(struct inode *inode) +{ + return atomic_read(&F2FS_I(inode)->dirty_pages); +} + static inline int get_blocktype_secs(struct 
f2fs_sb_info *sbi, int block_type) { - unsigned int pages_per_sec = sbi->segs_per_sec * - (1 << sbi->log_blocks_per_seg); - return ((get_pages(sbi, block_type) + pages_per_sec - 1) - >> sbi->log_blocks_per_seg) / sbi->segs_per_sec; + return div_u64(get_pages(sbi, block_type) + BLKS_PER_SEC(sbi) - 1, + BLKS_PER_SEC(sbi)); } static inline block_t valid_user_blocks(struct f2fs_sb_info *sbi) { - block_t ret; - spin_lock(&sbi->stat_lock); - ret = sbi->total_valid_block_count; - spin_unlock(&sbi->stat_lock); - return ret; + return sbi->total_valid_block_count; +} + +static inline block_t discard_blocks(struct f2fs_sb_info *sbi) +{ + return sbi->discard_blks; } static inline unsigned long __bitmap_size(struct f2fs_sb_info *sbi, int flag) @@ -665,170 +2752,396 @@ static inline unsigned long __bitmap_size(struct f2fs_sb_info *sbi, int flag) return 0; } +static inline block_t __cp_payload(struct f2fs_sb_info *sbi) +{ + return le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_payload); +} + static inline void *__bitmap_ptr(struct f2fs_sb_info *sbi, int flag) { struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); - int offset = (flag == NAT_BITMAP) ? + void *tmp_ptr = &ckpt->sit_nat_version_bitmap; + int offset; + + if (is_set_ckpt_flags(sbi, CP_LARGE_NAT_BITMAP_FLAG)) { + offset = (flag == SIT_BITMAP) ? + le32_to_cpu(ckpt->nat_ver_bitmap_bytesize) : 0; + /* + * if large_nat_bitmap feature is enabled, leave checksum + * protection for all nat/sit bitmaps. + */ + return tmp_ptr + offset + sizeof(__le32); + } + + if (__cp_payload(sbi) > 0) { + if (flag == NAT_BITMAP) + return tmp_ptr; + else + return (unsigned char *)ckpt + F2FS_BLKSIZE; + } else { + offset = (flag == NAT_BITMAP) ? 
le32_to_cpu(ckpt->sit_ver_bitmap_bytesize) : 0; - return &ckpt->sit_nat_version_bitmap + offset; + return tmp_ptr + offset; + } } static inline block_t __start_cp_addr(struct f2fs_sb_info *sbi) { - block_t start_addr; - struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); - unsigned long long ckpt_version = le64_to_cpu(ckpt->checkpoint_ver); + block_t start_addr = le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_blkaddr); - start_addr = le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_blkaddr); + if (sbi->cur_cp_pack == 2) + start_addr += BLKS_PER_SEG(sbi); + return start_addr; +} - /* - * odd numbered checkpoint should at cp segment 0 - * and even segent must be at cp segment 1 - */ - if (!(ckpt_version & 1)) - start_addr += sbi->blocks_per_seg; +static inline block_t __start_cp_next_addr(struct f2fs_sb_info *sbi) +{ + block_t start_addr = le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_blkaddr); + if (sbi->cur_cp_pack == 1) + start_addr += BLKS_PER_SEG(sbi); return start_addr; } +static inline void __set_cp_next_pack(struct f2fs_sb_info *sbi) +{ + sbi->cur_cp_pack = (sbi->cur_cp_pack == 1) ? 
2 : 1; +} + static inline block_t __start_sum_addr(struct f2fs_sb_info *sbi) { return le32_to_cpu(F2FS_CKPT(sbi)->cp_pack_start_sum); } -static inline bool inc_valid_node_count(struct f2fs_sb_info *sbi, - struct inode *inode, - unsigned int count) +extern void f2fs_mark_inode_dirty_sync(struct inode *inode, bool sync); +static inline int inc_valid_node_count(struct f2fs_sb_info *sbi, + struct inode *inode, bool is_inode) { block_t valid_block_count; - unsigned int valid_node_count; + unsigned int valid_node_count, avail_user_node_count; + unsigned int avail_user_block_count; + int err; + + if (is_inode) { + if (inode) { + err = dquot_alloc_inode(inode); + if (err) + return err; + } + } else { + err = dquot_reserve_block(inode, 1); + if (err) + return err; + } + + if (time_to_inject(sbi, FAULT_BLOCK)) + goto enospc; spin_lock(&sbi->stat_lock); - valid_block_count = sbi->total_valid_block_count + (block_t)count; - sbi->alloc_valid_block_count += (block_t)count; - valid_node_count = sbi->total_valid_node_count + count; + valid_block_count = sbi->total_valid_block_count + 1; + avail_user_block_count = get_available_block_count(sbi, inode, + test_opt(sbi, RESERVE_NODE)); - if (valid_block_count > sbi->user_block_count) { + if (unlikely(valid_block_count > avail_user_block_count)) { spin_unlock(&sbi->stat_lock); - return false; + goto enospc; } - if (valid_node_count > sbi->total_node_count) { + avail_user_node_count = sbi->total_node_count - F2FS_RESERVED_NODE_NUM; + if (test_opt(sbi, RESERVE_NODE) && + !__allow_reserved_root(sbi, inode, true)) + avail_user_node_count -= F2FS_OPTION(sbi).root_reserved_nodes; + valid_node_count = sbi->total_valid_node_count + 1; + if (unlikely(valid_node_count > avail_user_node_count)) { spin_unlock(&sbi->stat_lock); - return false; + goto enospc; } - if (inode) - inode->i_blocks += count; - sbi->total_valid_node_count = valid_node_count; - sbi->total_valid_block_count = valid_block_count; + sbi->total_valid_node_count++; + 
sbi->total_valid_block_count++; spin_unlock(&sbi->stat_lock); - return true; + if (inode) { + if (is_inode) + f2fs_mark_inode_dirty_sync(inode, true); + else + f2fs_i_blocks_write(inode, 1, true, true); + } + + percpu_counter_inc(&sbi->alloc_valid_block_count); + return 0; + +enospc: + if (is_inode) { + if (inode) + dquot_free_inode(inode); + } else { + dquot_release_reservation_block(inode, 1); + } + return -ENOSPC; } static inline void dec_valid_node_count(struct f2fs_sb_info *sbi, - struct inode *inode, - unsigned int count) + struct inode *inode, bool is_inode) { spin_lock(&sbi->stat_lock); - BUG_ON(sbi->total_valid_block_count < count); - BUG_ON(sbi->total_valid_node_count < count); - BUG_ON(inode->i_blocks < count); + if (unlikely(!sbi->total_valid_block_count || + !sbi->total_valid_node_count)) { + f2fs_warn(sbi, "dec_valid_node_count: inconsistent block counts, total_valid_block:%u, total_valid_node:%u", + sbi->total_valid_block_count, + sbi->total_valid_node_count); + set_sbi_flag(sbi, SBI_NEED_FSCK); + } else { + sbi->total_valid_block_count--; + sbi->total_valid_node_count--; + } - inode->i_blocks -= count; - sbi->total_valid_node_count -= count; - sbi->total_valid_block_count -= (block_t)count; + if (sbi->reserved_blocks && + sbi->current_reserved_blocks < sbi->reserved_blocks) + sbi->current_reserved_blocks++; spin_unlock(&sbi->stat_lock); + + if (is_inode) { + dquot_free_inode(inode); + } else { + if (unlikely(inode->i_blocks == 0)) { + f2fs_warn(sbi, "dec_valid_node_count: inconsistent i_blocks, ino:%lu, iblocks:%llu", + inode->i_ino, + (unsigned long long)inode->i_blocks); + set_sbi_flag(sbi, SBI_NEED_FSCK); + return; + } + f2fs_i_blocks_write(inode, 1, false, true); + } } static inline unsigned int valid_node_count(struct f2fs_sb_info *sbi) { - unsigned int ret; - spin_lock(&sbi->stat_lock); - ret = sbi->total_valid_node_count; - spin_unlock(&sbi->stat_lock); - return ret; + return sbi->total_valid_node_count; } static inline void 
inc_valid_inode_count(struct f2fs_sb_info *sbi) { - spin_lock(&sbi->stat_lock); - BUG_ON(sbi->total_valid_inode_count == sbi->total_node_count); - sbi->total_valid_inode_count++; - spin_unlock(&sbi->stat_lock); + percpu_counter_inc(&sbi->total_valid_inode_count); } -static inline int dec_valid_inode_count(struct f2fs_sb_info *sbi) +static inline void dec_valid_inode_count(struct f2fs_sb_info *sbi) { - spin_lock(&sbi->stat_lock); - BUG_ON(!sbi->total_valid_inode_count); - sbi->total_valid_inode_count--; - spin_unlock(&sbi->stat_lock); - return 0; + percpu_counter_dec(&sbi->total_valid_inode_count); } -static inline unsigned int valid_inode_count(struct f2fs_sb_info *sbi) +static inline s64 valid_inode_count(struct f2fs_sb_info *sbi) { - unsigned int ret; - spin_lock(&sbi->stat_lock); - ret = sbi->total_valid_inode_count; - spin_unlock(&sbi->stat_lock); - return ret; + return percpu_counter_sum_positive(&sbi->total_valid_inode_count); +} + +static inline struct folio *f2fs_grab_cache_folio(struct address_space *mapping, + pgoff_t index, bool for_write) +{ + struct folio *folio; + unsigned int flags; + + if (IS_ENABLED(CONFIG_F2FS_FAULT_INJECTION)) { + fgf_t fgf_flags; + + if (!for_write) + fgf_flags = FGP_LOCK | FGP_ACCESSED; + else + fgf_flags = FGP_LOCK; + folio = __filemap_get_folio(mapping, index, fgf_flags, 0); + if (!IS_ERR(folio)) + return folio; + + if (time_to_inject(F2FS_M_SB(mapping), FAULT_PAGE_ALLOC)) + return ERR_PTR(-ENOMEM); + } + + if (!for_write) + return filemap_grab_folio(mapping, index); + + flags = memalloc_nofs_save(); + folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN, + mapping_gfp_mask(mapping)); + memalloc_nofs_restore(flags); + + return folio; } -static inline void f2fs_put_page(struct page *page, int unlock) +static inline struct folio *f2fs_filemap_get_folio( + struct address_space *mapping, pgoff_t index, + fgf_t fgp_flags, gfp_t gfp_mask) { - if (!page || IS_ERR(page)) + if (time_to_inject(F2FS_M_SB(mapping), FAULT_PAGE_GET)) 
+ return ERR_PTR(-ENOMEM); + + return __filemap_get_folio(mapping, index, fgp_flags, gfp_mask); +} + +static inline void f2fs_folio_put(struct folio *folio, bool unlock) +{ + if (IS_ERR_OR_NULL(folio)) return; if (unlock) { - BUG_ON(!PageLocked(page)); - unlock_page(page); + f2fs_bug_on(F2FS_F_SB(folio), !folio_test_locked(folio)); + folio_unlock(folio); } - page_cache_release(page); + folio_put(folio); +} + +static inline void f2fs_put_page(struct page *page, bool unlock) +{ + if (!page) + return; + f2fs_folio_put(page_folio(page), unlock); } static inline void f2fs_put_dnode(struct dnode_of_data *dn) { - if (dn->node_page) - f2fs_put_page(dn->node_page, 1); - if (dn->inode_page && dn->node_page != dn->inode_page) - f2fs_put_page(dn->inode_page, 0); - dn->node_page = NULL; - dn->inode_page = NULL; + if (dn->node_folio) + f2fs_folio_put(dn->node_folio, true); + if (dn->inode_folio && dn->node_folio != dn->inode_folio) + f2fs_folio_put(dn->inode_folio, false); + dn->node_folio = NULL; + dn->inode_folio = NULL; } static inline struct kmem_cache *f2fs_kmem_cache_create(const char *name, - size_t size, void (*ctor)(void *)) + size_t size) +{ + return kmem_cache_create(name, size, 0, SLAB_RECLAIM_ACCOUNT, NULL); +} + +static inline void *f2fs_kmem_cache_alloc_nofail(struct kmem_cache *cachep, + gfp_t flags) +{ + void *entry; + + entry = kmem_cache_alloc(cachep, flags); + if (!entry) + entry = kmem_cache_alloc(cachep, flags | __GFP_NOFAIL); + return entry; +} + +static inline void *f2fs_kmem_cache_alloc(struct kmem_cache *cachep, + gfp_t flags, bool nofail, struct f2fs_sb_info *sbi) +{ + if (nofail) + return f2fs_kmem_cache_alloc_nofail(cachep, flags); + + if (time_to_inject(sbi, FAULT_SLAB_ALLOC)) + return NULL; + + return kmem_cache_alloc(cachep, flags); +} + +static inline bool is_inflight_io(struct f2fs_sb_info *sbi, int type) +{ + if (get_pages(sbi, F2FS_RD_DATA) || get_pages(sbi, F2FS_RD_NODE) || + get_pages(sbi, F2FS_RD_META) || get_pages(sbi, F2FS_WB_DATA) || + 
get_pages(sbi, F2FS_WB_CP_DATA) || + get_pages(sbi, F2FS_DIO_READ) || + get_pages(sbi, F2FS_DIO_WRITE)) + return true; + + if (type != DISCARD_TIME && SM_I(sbi) && SM_I(sbi)->dcc_info && + atomic_read(&SM_I(sbi)->dcc_info->queued_discard)) + return true; + + if (SM_I(sbi) && SM_I(sbi)->fcc_info && + atomic_read(&SM_I(sbi)->fcc_info->queued_flush)) + return true; + return false; +} + +static inline bool is_inflight_read_io(struct f2fs_sb_info *sbi) { - return kmem_cache_create(name, size, 0, SLAB_RECLAIM_ACCOUNT, ctor); + return get_pages(sbi, F2FS_RD_DATA) || get_pages(sbi, F2FS_DIO_READ); +} + +static inline bool is_idle(struct f2fs_sb_info *sbi, int type) +{ + bool zoned_gc = (type == GC_TIME && + F2FS_HAS_FEATURE(sbi, F2FS_FEATURE_BLKZONED)); + + if (sbi->gc_mode == GC_URGENT_HIGH) + return true; + + if (sbi->bggc_io_aware == AWARE_READ_IO && is_inflight_read_io(sbi)) + return false; + if (sbi->bggc_io_aware == AWARE_ALL_IO && is_inflight_io(sbi, type)) + return false; + + if (sbi->gc_mode == GC_URGENT_MID) + return true; + + if (sbi->gc_mode == GC_URGENT_LOW && + (type == DISCARD_TIME || type == GC_TIME)) + return true; + + if (zoned_gc) + return true; + + return f2fs_time_over(sbi, type); +} + +static inline void f2fs_radix_tree_insert(struct radix_tree_root *root, + unsigned long index, void *item) +{ + while (radix_tree_insert(root, index, item)) + cond_resched(); } #define RAW_IS_INODE(p) ((p)->footer.nid == (p)->footer.ino) -static inline bool IS_INODE(struct page *page) +static inline bool IS_INODE(const struct folio *folio) { - struct f2fs_node *p = (struct f2fs_node *)page_address(page); + struct f2fs_node *p = F2FS_NODE(folio); + return RAW_IS_INODE(p); } +static inline int offset_in_addr(struct f2fs_inode *i) +{ + return (i->i_inline & F2FS_EXTRA_ATTR) ? + (le16_to_cpu(i->i_extra_isize) / sizeof(__le32)) : 0; +} + static inline __le32 *blkaddr_in_node(struct f2fs_node *node) { return RAW_IS_INODE(node) ? 
node->i.i_addr : node->dn.addr; } -static inline block_t datablock_addr(struct page *node_page, - unsigned int offset) +static inline int f2fs_has_extra_attr(struct inode *inode); +static inline unsigned int get_dnode_base(struct inode *inode, + struct folio *node_folio) +{ + if (!IS_INODE(node_folio)) + return 0; + + return inode ? get_extra_isize(inode) : + offset_in_addr(&F2FS_NODE(node_folio)->i); +} + +static inline __le32 *get_dnode_addr(struct inode *inode, + struct folio *node_folio) { - struct f2fs_node *raw_node; - __le32 *addr_array; - raw_node = (struct f2fs_node *)page_address(node_page); - addr_array = blkaddr_in_node(raw_node); - return le32_to_cpu(addr_array[offset]); + return blkaddr_in_node(F2FS_NODE(node_folio)) + + get_dnode_base(inode, node_folio); +} + +static inline block_t data_blkaddr(struct inode *inode, + struct folio *node_folio, unsigned int offset) +{ + return le32_to_cpu(*(get_dnode_addr(inode, node_folio) + offset)); +} + +static inline block_t f2fs_data_blkaddr(struct dnode_of_data *dn) +{ + return data_blkaddr(dn->inode, dn->node_folio, dn->ofs_in_node); } static inline int f2fs_test_bit(unsigned int nr, char *addr) @@ -836,326 +3149,1232 @@ static inline int f2fs_test_bit(unsigned int nr, char *addr) int mask; addr += (nr >> 3); - mask = 1 << (7 - (nr & 0x07)); + mask = BIT(7 - (nr & 0x07)); return mask & *addr; } -static inline int f2fs_set_bit(unsigned int nr, char *addr) +static inline void f2fs_set_bit(unsigned int nr, char *addr) +{ + int mask; + + addr += (nr >> 3); + mask = BIT(7 - (nr & 0x07)); + *addr |= mask; +} + +static inline void f2fs_clear_bit(unsigned int nr, char *addr) +{ + int mask; + + addr += (nr >> 3); + mask = BIT(7 - (nr & 0x07)); + *addr &= ~mask; +} + +static inline int f2fs_test_and_set_bit(unsigned int nr, char *addr) { int mask; int ret; addr += (nr >> 3); - mask = 1 << (7 - (nr & 0x07)); + mask = BIT(7 - (nr & 0x07)); ret = mask & *addr; *addr |= mask; return ret; } -static inline int 
f2fs_clear_bit(unsigned int nr, char *addr) +static inline int f2fs_test_and_clear_bit(unsigned int nr, char *addr) { int mask; int ret; addr += (nr >> 3); - mask = 1 << (7 - (nr & 0x07)); + mask = BIT(7 - (nr & 0x07)); ret = mask & *addr; *addr &= ~mask; return ret; } -/* used for f2fs_inode_info->flags */ -enum { - FI_NEW_INODE, /* indicate newly allocated inode */ - FI_DIRTY_INODE, /* indicate inode is dirty or not */ - FI_INC_LINK, /* need to increment i_nlink */ - FI_ACL_MODE, /* indicate acl mode */ - FI_NO_ALLOC, /* should not allocate any blocks */ - FI_UPDATE_DIR, /* should update inode block for consistency */ - FI_DELAY_IPUT, /* used for the recovery */ -}; +static inline void f2fs_change_bit(unsigned int nr, char *addr) +{ + int mask; + + addr += (nr >> 3); + mask = BIT(7 - (nr & 0x07)); + *addr ^= mask; +} + +/* + * On-disk inode flags (f2fs_inode::i_flags) + */ +#define F2FS_COMPR_FL 0x00000004 /* Compress file */ +#define F2FS_SYNC_FL 0x00000008 /* Synchronous updates */ +#define F2FS_IMMUTABLE_FL 0x00000010 /* Immutable file */ +#define F2FS_APPEND_FL 0x00000020 /* writes to file may only append */ +#define F2FS_NODUMP_FL 0x00000040 /* do not dump file */ +#define F2FS_NOATIME_FL 0x00000080 /* do not update atime */ +#define F2FS_NOCOMP_FL 0x00000400 /* Don't compress */ +#define F2FS_INDEX_FL 0x00001000 /* hash-indexed directory */ +#define F2FS_DIRSYNC_FL 0x00010000 /* dirsync behaviour (directories only) */ +#define F2FS_PROJINHERIT_FL 0x20000000 /* Create with parents projid */ +#define F2FS_CASEFOLD_FL 0x40000000 /* Casefolded file */ +#define F2FS_DEVICE_ALIAS_FL 0x80000000 /* File for aliasing a device */ + +#define F2FS_QUOTA_DEFAULT_FL (F2FS_NOATIME_FL | F2FS_IMMUTABLE_FL) + +/* Flags that should be inherited by new inodes from their parent. 
*/ +#define F2FS_FL_INHERITED (F2FS_SYNC_FL | F2FS_NODUMP_FL | F2FS_NOATIME_FL | \ + F2FS_DIRSYNC_FL | F2FS_PROJINHERIT_FL | \ + F2FS_CASEFOLD_FL) + +/* Flags that are appropriate for regular files (all but dir-specific ones). */ +#define F2FS_REG_FLMASK (~(F2FS_DIRSYNC_FL | F2FS_PROJINHERIT_FL | \ + F2FS_CASEFOLD_FL)) + +/* Flags that are appropriate for non-directories/regular files. */ +#define F2FS_OTHER_FLMASK (F2FS_NODUMP_FL | F2FS_NOATIME_FL) + +#define IS_DEVICE_ALIASING(inode) (F2FS_I(inode)->i_flags & F2FS_DEVICE_ALIAS_FL) + +static inline __u32 f2fs_mask_flags(umode_t mode, __u32 flags) +{ + if (S_ISDIR(mode)) + return flags; + else if (S_ISREG(mode)) + return flags & F2FS_REG_FLMASK; + else + return flags & F2FS_OTHER_FLMASK; +} + +static inline void __mark_inode_dirty_flag(struct inode *inode, + int flag, bool set) +{ + switch (flag) { + case FI_INLINE_XATTR: + case FI_INLINE_DATA: + case FI_INLINE_DENTRY: + case FI_NEW_INODE: + if (set) + return; + fallthrough; + case FI_DATA_EXIST: + case FI_PIN_FILE: + case FI_COMPRESS_RELEASED: + f2fs_mark_inode_dirty_sync(inode, true); + } +} + +static inline void set_inode_flag(struct inode *inode, int flag) +{ + set_bit(flag, F2FS_I(inode)->flags); + __mark_inode_dirty_flag(inode, flag, true); +} -static inline void set_inode_flag(struct f2fs_inode_info *fi, int flag) +static inline int is_inode_flag_set(struct inode *inode, int flag) { - set_bit(flag, &fi->flags); + return test_bit(flag, F2FS_I(inode)->flags); } -static inline int is_inode_flag_set(struct f2fs_inode_info *fi, int flag) +static inline void clear_inode_flag(struct inode *inode, int flag) { - return test_bit(flag, &fi->flags); + clear_bit(flag, F2FS_I(inode)->flags); + __mark_inode_dirty_flag(inode, flag, false); } -static inline void clear_inode_flag(struct f2fs_inode_info *fi, int flag) +static inline bool f2fs_verity_in_progress(struct inode *inode) { - clear_bit(flag, &fi->flags); + return IS_ENABLED(CONFIG_FS_VERITY) && + 
is_inode_flag_set(inode, FI_VERITY_IN_PROGRESS); } -static inline void set_acl_inode(struct f2fs_inode_info *fi, umode_t mode) +static inline void set_acl_inode(struct inode *inode, umode_t mode) { - fi->i_acl_mode = mode; - set_inode_flag(fi, FI_ACL_MODE); + F2FS_I(inode)->i_acl_mode = mode; + set_inode_flag(inode, FI_ACL_MODE); + f2fs_mark_inode_dirty_sync(inode, false); } -static inline int cond_clear_inode_flag(struct f2fs_inode_info *fi, int flag) +static inline void f2fs_i_links_write(struct inode *inode, bool inc) { - if (is_inode_flag_set(fi, FI_ACL_MODE)) { - clear_inode_flag(fi, FI_ACL_MODE); - return 1; + if (inc) + inc_nlink(inode); + else + drop_nlink(inode); + f2fs_mark_inode_dirty_sync(inode, true); +} + +static inline void f2fs_i_blocks_write(struct inode *inode, + block_t diff, bool add, bool claim) +{ + bool clean = !is_inode_flag_set(inode, FI_DIRTY_INODE); + bool recover = is_inode_flag_set(inode, FI_AUTO_RECOVER); + + /* add = 1, claim = 1 should be dquot_reserve_block in pair */ + if (add) { + if (claim) + dquot_claim_block(inode, diff); + else + dquot_alloc_block_nofail(inode, diff); + } else { + dquot_free_block(inode, diff); } + + f2fs_mark_inode_dirty_sync(inode, true); + if (clean || recover) + set_inode_flag(inode, FI_AUTO_RECOVER); +} + +static inline bool f2fs_is_atomic_file(struct inode *inode); + +static inline void f2fs_i_size_write(struct inode *inode, loff_t i_size) +{ + bool clean = !is_inode_flag_set(inode, FI_DIRTY_INODE); + bool recover = is_inode_flag_set(inode, FI_AUTO_RECOVER); + + if (i_size_read(inode) == i_size) + return; + + i_size_write(inode, i_size); + + if (f2fs_is_atomic_file(inode)) + return; + + f2fs_mark_inode_dirty_sync(inode, true); + if (clean || recover) + set_inode_flag(inode, FI_AUTO_RECOVER); +} + +static inline void f2fs_i_depth_write(struct inode *inode, unsigned int depth) +{ + F2FS_I(inode)->i_current_depth = depth; + f2fs_mark_inode_dirty_sync(inode, true); +} + +static inline void 
f2fs_i_gc_failures_write(struct inode *inode, + unsigned int count) +{ + F2FS_I(inode)->i_gc_failures = count; + f2fs_mark_inode_dirty_sync(inode, true); +} + +static inline void f2fs_i_xnid_write(struct inode *inode, nid_t xnid) +{ + F2FS_I(inode)->i_xattr_nid = xnid; + f2fs_mark_inode_dirty_sync(inode, true); +} + +static inline void f2fs_i_pino_write(struct inode *inode, nid_t pino) +{ + F2FS_I(inode)->i_pino = pino; + f2fs_mark_inode_dirty_sync(inode, true); +} + +static inline void get_inline_info(struct inode *inode, struct f2fs_inode *ri) +{ + struct f2fs_inode_info *fi = F2FS_I(inode); + + if (ri->i_inline & F2FS_INLINE_XATTR) + set_bit(FI_INLINE_XATTR, fi->flags); + if (ri->i_inline & F2FS_INLINE_DATA) + set_bit(FI_INLINE_DATA, fi->flags); + if (ri->i_inline & F2FS_INLINE_DENTRY) + set_bit(FI_INLINE_DENTRY, fi->flags); + if (ri->i_inline & F2FS_DATA_EXIST) + set_bit(FI_DATA_EXIST, fi->flags); + if (ri->i_inline & F2FS_EXTRA_ATTR) + set_bit(FI_EXTRA_ATTR, fi->flags); + if (ri->i_inline & F2FS_PIN_FILE) + set_bit(FI_PIN_FILE, fi->flags); + if (ri->i_inline & F2FS_COMPRESS_RELEASED) + set_bit(FI_COMPRESS_RELEASED, fi->flags); +} + +static inline void set_raw_inline(struct inode *inode, struct f2fs_inode *ri) +{ + ri->i_inline = 0; + + if (is_inode_flag_set(inode, FI_INLINE_XATTR)) + ri->i_inline |= F2FS_INLINE_XATTR; + if (is_inode_flag_set(inode, FI_INLINE_DATA)) + ri->i_inline |= F2FS_INLINE_DATA; + if (is_inode_flag_set(inode, FI_INLINE_DENTRY)) + ri->i_inline |= F2FS_INLINE_DENTRY; + if (is_inode_flag_set(inode, FI_DATA_EXIST)) + ri->i_inline |= F2FS_DATA_EXIST; + if (is_inode_flag_set(inode, FI_EXTRA_ATTR)) + ri->i_inline |= F2FS_EXTRA_ATTR; + if (is_inode_flag_set(inode, FI_PIN_FILE)) + ri->i_inline |= F2FS_PIN_FILE; + if (is_inode_flag_set(inode, FI_COMPRESS_RELEASED)) + ri->i_inline |= F2FS_COMPRESS_RELEASED; +} + +static inline int f2fs_has_extra_attr(struct inode *inode) +{ + return is_inode_flag_set(inode, FI_EXTRA_ATTR); +} + +static inline int 
f2fs_has_inline_xattr(struct inode *inode) +{ + return is_inode_flag_set(inode, FI_INLINE_XATTR); +} + +static inline int f2fs_compressed_file(struct inode *inode) +{ + return S_ISREG(inode->i_mode) && + is_inode_flag_set(inode, FI_COMPRESSED_FILE); +} + +static inline bool f2fs_need_compress_data(struct inode *inode) +{ + int compress_mode = F2FS_OPTION(F2FS_I_SB(inode)).compress_mode; + + if (!f2fs_compressed_file(inode)) + return false; + + if (compress_mode == COMPR_MODE_FS) + return true; + else if (compress_mode == COMPR_MODE_USER && + is_inode_flag_set(inode, FI_ENABLE_COMPRESS)) + return true; + + return false; +} + +static inline unsigned int addrs_per_page(struct inode *inode, + bool is_inode) +{ + unsigned int addrs = is_inode ? (CUR_ADDRS_PER_INODE(inode) - + get_inline_xattr_addrs(inode)) : DEF_ADDRS_PER_BLOCK; + + if (f2fs_compressed_file(inode)) + return ALIGN_DOWN(addrs, F2FS_I(inode)->i_cluster_size); + return addrs; +} + +static inline +void *inline_xattr_addr(struct inode *inode, const struct folio *folio) +{ + struct f2fs_inode *ri = F2FS_INODE(folio); + + return (void *)&(ri->i_addr[DEF_ADDRS_PER_INODE - + get_inline_xattr_addrs(inode)]); +} + +static inline int inline_xattr_size(struct inode *inode) +{ + if (f2fs_has_inline_xattr(inode)) + return get_inline_xattr_addrs(inode) * sizeof(__le32); return 0; } -static inline int f2fs_readonly(struct super_block *sb) +/* + * Notice: check inline_data flag without inode page lock is unsafe. + * It could change at any time by f2fs_convert_inline_folio(). 
+ */ +static inline int f2fs_has_inline_data(struct inode *inode) +{ + return is_inode_flag_set(inode, FI_INLINE_DATA); +} + +static inline int f2fs_exist_data(struct inode *inode) +{ + return is_inode_flag_set(inode, FI_DATA_EXIST); +} + +static inline int f2fs_is_mmap_file(struct inode *inode) +{ + return is_inode_flag_set(inode, FI_MMAP_FILE); +} + +static inline bool f2fs_is_pinned_file(struct inode *inode) +{ + return is_inode_flag_set(inode, FI_PIN_FILE); +} + +static inline bool f2fs_is_atomic_file(struct inode *inode) +{ + return is_inode_flag_set(inode, FI_ATOMIC_FILE); +} + +static inline bool f2fs_is_cow_file(struct inode *inode) { - return sb->s_flags & MS_RDONLY; + return is_inode_flag_set(inode, FI_COW_FILE); +} + +static inline void *inline_data_addr(struct inode *inode, struct folio *folio) +{ + __le32 *addr = get_dnode_addr(inode, folio); + + return (void *)(addr + DEF_INLINE_RESERVED_SIZE); +} + +static inline int f2fs_has_inline_dentry(struct inode *inode) +{ + return is_inode_flag_set(inode, FI_INLINE_DENTRY); +} + +static inline int is_file(struct inode *inode, int type) +{ + return F2FS_I(inode)->i_advise & type; +} + +static inline void set_file(struct inode *inode, int type) +{ + if (is_file(inode, type)) + return; + F2FS_I(inode)->i_advise |= type; + f2fs_mark_inode_dirty_sync(inode, true); +} + +static inline void clear_file(struct inode *inode, int type) +{ + if (!is_file(inode, type)) + return; + F2FS_I(inode)->i_advise &= ~type; + f2fs_mark_inode_dirty_sync(inode, true); +} + +static inline bool f2fs_is_time_consistent(struct inode *inode) +{ + struct timespec64 ts = inode_get_atime(inode); + + if (!timespec64_equal(F2FS_I(inode)->i_disk_time, &ts)) + return false; + ts = inode_get_ctime(inode); + if (!timespec64_equal(F2FS_I(inode)->i_disk_time + 1, &ts)) + return false; + ts = inode_get_mtime(inode); + if (!timespec64_equal(F2FS_I(inode)->i_disk_time + 2, &ts)) + return false; + return true; +} + +static inline bool 
f2fs_skip_inode_update(struct inode *inode, int dsync) +{ + bool ret; + + if (dsync) { + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + + spin_lock(&sbi->inode_lock[DIRTY_META]); + ret = list_empty(&F2FS_I(inode)->gdirty_list); + spin_unlock(&sbi->inode_lock[DIRTY_META]); + return ret; + } + if (!is_inode_flag_set(inode, FI_AUTO_RECOVER) || + file_keep_isize(inode) || + i_size_read(inode) & ~PAGE_MASK) + return false; + + if (!f2fs_is_time_consistent(inode)) + return false; + + spin_lock(&F2FS_I(inode)->i_size_lock); + ret = F2FS_I(inode)->last_disk_size == i_size_read(inode); + spin_unlock(&F2FS_I(inode)->i_size_lock); + + return ret; +} + +static inline bool f2fs_readonly(struct super_block *sb) +{ + return sb_rdonly(sb); +} + +static inline bool f2fs_cp_error(struct f2fs_sb_info *sbi) +{ + return is_set_ckpt_flags(sbi, CP_ERROR_FLAG); +} + +static inline void *f2fs_kmalloc(struct f2fs_sb_info *sbi, + size_t size, gfp_t flags) +{ + if (time_to_inject(sbi, FAULT_KMALLOC)) + return NULL; + + return kmalloc(size, flags); +} + +static inline void *f2fs_getname(struct f2fs_sb_info *sbi) +{ + if (time_to_inject(sbi, FAULT_KMALLOC)) + return NULL; + + return __getname(); +} + +static inline void f2fs_putname(char *buf) +{ + __putname(buf); +} + +static inline void *f2fs_kzalloc(struct f2fs_sb_info *sbi, + size_t size, gfp_t flags) +{ + return f2fs_kmalloc(sbi, size, flags | __GFP_ZERO); +} + +static inline void *f2fs_kvmalloc(struct f2fs_sb_info *sbi, + size_t size, gfp_t flags) +{ + if (time_to_inject(sbi, FAULT_KVMALLOC)) + return NULL; + + return kvmalloc(size, flags); +} + +static inline void *f2fs_kvzalloc(struct f2fs_sb_info *sbi, + size_t size, gfp_t flags) +{ + return f2fs_kvmalloc(sbi, size, flags | __GFP_ZERO); +} + +static inline void *f2fs_vmalloc(struct f2fs_sb_info *sbi, size_t size) +{ + if (time_to_inject(sbi, FAULT_VMALLOC)) + return NULL; + + return vmalloc(size); +} + +static inline int get_extra_isize(struct inode *inode) +{ + return 
F2FS_I(inode)->i_extra_isize / sizeof(__le32); +} + +static inline int get_inline_xattr_addrs(struct inode *inode) +{ + return F2FS_I(inode)->i_inline_xattr_size; +} + +#define f2fs_get_inode_mode(i) \ + ((is_inode_flag_set(i, FI_ACL_MODE)) ? \ + (F2FS_I(i)->i_acl_mode) : ((i)->i_mode)) + +#define F2FS_MIN_EXTRA_ATTR_SIZE (sizeof(__le32)) + +#define F2FS_TOTAL_EXTRA_ATTR_SIZE \ + (offsetof(struct f2fs_inode, i_extra_end) - \ + offsetof(struct f2fs_inode, i_extra_isize)) \ + +#define F2FS_OLD_ATTRIBUTE_SIZE (offsetof(struct f2fs_inode, i_addr)) +#define F2FS_FITS_IN_INODE(f2fs_inode, extra_isize, field) \ + ((offsetof(typeof(*(f2fs_inode)), field) + \ + sizeof((f2fs_inode)->field)) \ + <= (F2FS_OLD_ATTRIBUTE_SIZE + (extra_isize))) \ + +#define __is_large_section(sbi) (SEGS_PER_SEC(sbi) > 1) + +#define __is_meta_io(fio) (PAGE_TYPE_OF_BIO((fio)->type) == META) + +bool f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi, + block_t blkaddr, int type); +static inline void verify_blkaddr(struct f2fs_sb_info *sbi, + block_t blkaddr, int type) +{ + if (!f2fs_is_valid_blkaddr(sbi, blkaddr, type)) + f2fs_err(sbi, "invalid blkaddr: %u, type: %d, run fsck to fix.", + blkaddr, type); +} + +static inline bool __is_valid_data_blkaddr(block_t blkaddr) +{ + if (blkaddr == NEW_ADDR || blkaddr == NULL_ADDR || + blkaddr == COMPRESS_ADDR) + return false; + return true; } /* * file.c */ -int f2fs_sync_file(struct file *, loff_t, loff_t, int); -void truncate_data_blocks(struct dnode_of_data *); -void f2fs_truncate(struct inode *); -int f2fs_getattr(struct vfsmount *, struct dentry *, struct kstat *); -int f2fs_setattr(struct dentry *, struct iattr *); -int truncate_hole(struct inode *, pgoff_t, pgoff_t); -int truncate_data_blocks_range(struct dnode_of_data *, int); -long f2fs_ioctl(struct file *, unsigned int, unsigned long); -long f2fs_compat_ioctl(struct file *, unsigned int, unsigned long); +int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync); +int 
f2fs_do_truncate_blocks(struct inode *inode, u64 from, bool lock); +int f2fs_truncate_blocks(struct inode *inode, u64 from, bool lock); +int f2fs_truncate(struct inode *inode); +int f2fs_getattr(struct mnt_idmap *idmap, const struct path *path, + struct kstat *stat, u32 request_mask, unsigned int flags); +int f2fs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, + struct iattr *attr); +int f2fs_truncate_hole(struct inode *inode, pgoff_t pg_start, pgoff_t pg_end); +void f2fs_truncate_data_blocks_range(struct dnode_of_data *dn, int count); +int f2fs_do_shutdown(struct f2fs_sb_info *sbi, unsigned int flag, + bool readonly, bool need_lock); +int f2fs_precache_extents(struct inode *inode); +int f2fs_fileattr_get(struct dentry *dentry, struct file_kattr *fa); +int f2fs_fileattr_set(struct mnt_idmap *idmap, + struct dentry *dentry, struct file_kattr *fa); +long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg); +long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg); +int f2fs_transfer_project_quota(struct inode *inode, kprojid_t kprojid); +int f2fs_pin_file_control(struct inode *inode, bool inc); /* * inode.c */ -void f2fs_set_inode_flags(struct inode *); -struct inode *f2fs_iget(struct super_block *, unsigned long); -void update_inode(struct inode *, struct page *); -int update_inode_page(struct inode *); -int f2fs_write_inode(struct inode *, struct writeback_control *); -void f2fs_evict_inode(struct inode *); +void f2fs_set_inode_flags(struct inode *inode); +bool f2fs_inode_chksum_verify(struct f2fs_sb_info *sbi, struct folio *folio); +void f2fs_inode_chksum_set(struct f2fs_sb_info *sbi, struct folio *folio); +struct inode *f2fs_iget(struct super_block *sb, unsigned long ino); +struct inode *f2fs_iget_retry(struct super_block *sb, unsigned long ino); +int f2fs_try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink); +void f2fs_update_inode(struct inode *inode, struct folio *node_folio); +void 
f2fs_update_inode_page(struct inode *inode); +int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc); +void f2fs_remove_donate_inode(struct inode *inode); +void f2fs_evict_inode(struct inode *inode); +void f2fs_handle_failed_inode(struct inode *inode); /* * namei.c */ +int f2fs_update_extension_list(struct f2fs_sb_info *sbi, const char *name, + bool hot, bool set); struct dentry *f2fs_get_parent(struct dentry *child); +int f2fs_get_tmpfile(struct mnt_idmap *idmap, struct inode *dir, + struct inode **new_inode); /* * dir.c */ -struct f2fs_dir_entry *f2fs_find_entry(struct inode *, struct qstr *, - struct page **); -struct f2fs_dir_entry *f2fs_parent_dir(struct inode *, struct page **); -ino_t f2fs_inode_by_name(struct inode *, struct qstr *); -void f2fs_set_link(struct inode *, struct f2fs_dir_entry *, - struct page *, struct inode *); -int __f2fs_add_link(struct inode *, const struct qstr *, struct inode *); -void f2fs_delete_entry(struct f2fs_dir_entry *, struct page *, struct inode *); -int f2fs_make_empty(struct inode *, struct inode *); -bool f2fs_empty_dir(struct inode *); +#if IS_ENABLED(CONFIG_UNICODE) +int f2fs_init_casefolded_name(const struct inode *dir, + struct f2fs_filename *fname); +void f2fs_free_casefolded_name(struct f2fs_filename *fname); +#else +static inline int f2fs_init_casefolded_name(const struct inode *dir, + struct f2fs_filename *fname) +{ + return 0; +} + +static inline void f2fs_free_casefolded_name(struct f2fs_filename *fname) +{ +} +#endif /* CONFIG_UNICODE */ + +int f2fs_setup_filename(struct inode *dir, const struct qstr *iname, + int lookup, struct f2fs_filename *fname); +int f2fs_prepare_lookup(struct inode *dir, struct dentry *dentry, + struct f2fs_filename *fname); +void f2fs_free_filename(struct f2fs_filename *fname); +struct f2fs_dir_entry *f2fs_find_target_dentry(const struct f2fs_dentry_ptr *d, + const struct f2fs_filename *fname, int *max_slots, + bool use_hash); +int f2fs_fill_dentries(struct dir_context 
*ctx, struct f2fs_dentry_ptr *d, + unsigned int start_pos, struct fscrypt_str *fstr); +void f2fs_do_make_empty_dir(struct inode *inode, struct inode *parent, + struct f2fs_dentry_ptr *d); +struct folio *f2fs_init_inode_metadata(struct inode *inode, struct inode *dir, + const struct f2fs_filename *fname, struct folio *dfolio); +void f2fs_update_parent_metadata(struct inode *dir, struct inode *inode, + unsigned int current_depth); +int f2fs_room_for_filename(const void *bitmap, int slots, int max_slots); +void f2fs_drop_nlink(struct inode *dir, struct inode *inode); +struct f2fs_dir_entry *__f2fs_find_entry(struct inode *dir, + const struct f2fs_filename *fname, struct folio **res_folio); +struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir, + const struct qstr *child, struct folio **res_folio); +struct f2fs_dir_entry *f2fs_parent_dir(struct inode *dir, struct folio **f); +ino_t f2fs_inode_by_name(struct inode *dir, const struct qstr *qstr, + struct folio **folio); +void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de, + struct folio *folio, struct inode *inode); +bool f2fs_has_enough_room(struct inode *dir, struct folio *ifolio, + const struct f2fs_filename *fname); +void f2fs_update_dentry(nid_t ino, umode_t mode, struct f2fs_dentry_ptr *d, + const struct fscrypt_str *name, f2fs_hash_t name_hash, + unsigned int bit_pos); +int f2fs_add_regular_entry(struct inode *dir, const struct f2fs_filename *fname, + struct inode *inode, nid_t ino, umode_t mode); +int f2fs_add_dentry(struct inode *dir, const struct f2fs_filename *fname, + struct inode *inode, nid_t ino, umode_t mode); +int f2fs_do_add_link(struct inode *dir, const struct qstr *name, + struct inode *inode, nid_t ino, umode_t mode); +void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct folio *folio, + struct inode *dir, struct inode *inode); +int f2fs_do_tmpfile(struct inode *inode, struct inode *dir, + struct f2fs_filename *fname); +bool f2fs_empty_dir(struct inode *dir); static inline int 
f2fs_add_link(struct dentry *dentry, struct inode *inode) { - return __f2fs_add_link(dentry->d_parent->d_inode, &dentry->d_name, - inode); + if (fscrypt_is_nokey_name(dentry)) + return -ENOKEY; + return f2fs_do_add_link(d_inode(dentry->d_parent), &dentry->d_name, + inode, inode->i_ino, inode->i_mode); } /* * super.c */ -int f2fs_sync_fs(struct super_block *, int); -extern __printf(3, 4) -void f2fs_msg(struct super_block *, const char *, const char *, ...); +int f2fs_inode_dirtied(struct inode *inode, bool sync); +void f2fs_inode_synced(struct inode *inode); +int f2fs_dquot_initialize(struct inode *inode); +int f2fs_enable_quota_files(struct f2fs_sb_info *sbi, bool rdonly); +int f2fs_do_quota_sync(struct super_block *sb, int type); +loff_t max_file_blocks(struct inode *inode); +void f2fs_quota_off_umount(struct super_block *sb); +void f2fs_save_errors(struct f2fs_sb_info *sbi, unsigned char flag); +void f2fs_handle_critical_error(struct f2fs_sb_info *sbi, unsigned char reason); +void f2fs_handle_error(struct f2fs_sb_info *sbi, unsigned char error); +int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover); +int f2fs_sync_fs(struct super_block *sb, int sync); +int f2fs_sanity_check_ckpt(struct f2fs_sb_info *sbi); /* * hash.c */ -f2fs_hash_t f2fs_dentry_hash(const char *, size_t); +void f2fs_hash_filename(const struct inode *dir, struct f2fs_filename *fname); /* * node.c */ -struct dnode_of_data; struct node_info; - -int is_checkpointed_node(struct f2fs_sb_info *, nid_t); -void get_node_info(struct f2fs_sb_info *, nid_t, struct node_info *); -int get_dnode_of_data(struct dnode_of_data *, pgoff_t, int); -int truncate_inode_blocks(struct inode *, pgoff_t); -int remove_inode_page(struct inode *); -struct page *new_inode_page(struct inode *, const struct qstr *); -struct page *new_node_page(struct dnode_of_data *, unsigned int, struct page *); -void ra_node_page(struct f2fs_sb_info *, nid_t); -struct page *get_node_page(struct f2fs_sb_info *, pgoff_t); -struct page 
*get_node_page_ra(struct page *, int); -void sync_inode_page(struct dnode_of_data *); -int sync_node_pages(struct f2fs_sb_info *, nid_t, struct writeback_control *); -bool alloc_nid(struct f2fs_sb_info *, nid_t *); -void alloc_nid_done(struct f2fs_sb_info *, nid_t); -void alloc_nid_failed(struct f2fs_sb_info *, nid_t); -void recover_node_page(struct f2fs_sb_info *, struct page *, - struct f2fs_summary *, struct node_info *, block_t); -int recover_inode_page(struct f2fs_sb_info *, struct page *); -int restore_node_summary(struct f2fs_sb_info *, unsigned int, - struct f2fs_summary_block *); -void flush_nat_entries(struct f2fs_sb_info *); -int build_node_manager(struct f2fs_sb_info *); -void destroy_node_manager(struct f2fs_sb_info *); -int __init create_node_manager_caches(void); -void destroy_node_manager_caches(void); +enum node_type; + +int f2fs_check_nid_range(struct f2fs_sb_info *sbi, nid_t nid); +bool f2fs_available_free_memory(struct f2fs_sb_info *sbi, int type); +bool f2fs_in_warm_node_list(struct f2fs_sb_info *sbi, struct folio *folio); +void f2fs_init_fsync_node_info(struct f2fs_sb_info *sbi); +void f2fs_del_fsync_node_entry(struct f2fs_sb_info *sbi, struct folio *folio); +void f2fs_reset_fsync_node_info(struct f2fs_sb_info *sbi); +int f2fs_need_dentry_mark(struct f2fs_sb_info *sbi, nid_t nid); +bool f2fs_is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid); +bool f2fs_need_inode_block_update(struct f2fs_sb_info *sbi, nid_t ino); +int f2fs_get_node_info(struct f2fs_sb_info *sbi, nid_t nid, + struct node_info *ni, bool checkpoint_context); +pgoff_t f2fs_get_next_page_offset(struct dnode_of_data *dn, pgoff_t pgofs); +int f2fs_get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode); +int f2fs_truncate_inode_blocks(struct inode *inode, pgoff_t from); +int f2fs_truncate_xattr_node(struct inode *inode); +int f2fs_wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, + unsigned int seq_id); +int f2fs_remove_inode_page(struct inode *inode); 
+struct folio *f2fs_new_inode_folio(struct inode *inode); +struct folio *f2fs_new_node_folio(struct dnode_of_data *dn, unsigned int ofs); +void f2fs_ra_node_page(struct f2fs_sb_info *sbi, nid_t nid); +struct folio *f2fs_get_node_folio(struct f2fs_sb_info *sbi, pgoff_t nid, + enum node_type node_type); +struct folio *f2fs_get_inode_folio(struct f2fs_sb_info *sbi, pgoff_t ino); +struct folio *f2fs_get_xnode_folio(struct f2fs_sb_info *sbi, pgoff_t xnid); +int f2fs_move_node_folio(struct folio *node_folio, int gc_type); +void f2fs_flush_inline_data(struct f2fs_sb_info *sbi); +int f2fs_fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, + struct writeback_control *wbc, bool atomic, + unsigned int *seq_id); +int f2fs_sync_node_pages(struct f2fs_sb_info *sbi, + struct writeback_control *wbc, + bool do_balance, enum iostat_type io_type); +int f2fs_build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount); +bool f2fs_alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid); +void f2fs_alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid); +void f2fs_alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid); +int f2fs_try_to_free_nids(struct f2fs_sb_info *sbi, int nr_shrink); +int f2fs_recover_inline_xattr(struct inode *inode, struct folio *folio); +int f2fs_recover_xattr_data(struct inode *inode, struct folio *folio); +int f2fs_recover_inode_page(struct f2fs_sb_info *sbi, struct folio *folio); +int f2fs_restore_node_summary(struct f2fs_sb_info *sbi, + unsigned int segno, struct f2fs_summary_block *sum); +int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc); +int f2fs_build_node_manager(struct f2fs_sb_info *sbi); +void f2fs_destroy_node_manager(struct f2fs_sb_info *sbi); +int __init f2fs_create_node_manager_caches(void); +void f2fs_destroy_node_manager_caches(void); /* * segment.c */ -void f2fs_balance_fs(struct f2fs_sb_info *); -void invalidate_blocks(struct f2fs_sb_info *, block_t); -void clear_prefree_segments(struct f2fs_sb_info *); -int 
npages_for_summary_flush(struct f2fs_sb_info *); -void allocate_new_segments(struct f2fs_sb_info *); -struct page *get_sum_page(struct f2fs_sb_info *, unsigned int); -struct bio *f2fs_bio_alloc(struct block_device *, int); -void f2fs_submit_bio(struct f2fs_sb_info *, enum page_type, bool sync); -void write_meta_page(struct f2fs_sb_info *, struct page *); -void write_node_page(struct f2fs_sb_info *, struct page *, unsigned int, - block_t, block_t *); -void write_data_page(struct inode *, struct page *, struct dnode_of_data*, - block_t, block_t *); -void rewrite_data_page(struct f2fs_sb_info *, struct page *, block_t); -void recover_data_page(struct f2fs_sb_info *, struct page *, - struct f2fs_summary *, block_t, block_t); -void rewrite_node_page(struct f2fs_sb_info *, struct page *, - struct f2fs_summary *, block_t, block_t); -void write_data_summaries(struct f2fs_sb_info *, block_t); -void write_node_summaries(struct f2fs_sb_info *, block_t); -int lookup_journal_in_cursum(struct f2fs_summary_block *, - int, unsigned int, int); -void flush_sit_entries(struct f2fs_sb_info *); -int build_segment_manager(struct f2fs_sb_info *); -void destroy_segment_manager(struct f2fs_sb_info *); +bool f2fs_need_SSR(struct f2fs_sb_info *sbi); +int f2fs_commit_atomic_write(struct inode *inode); +void f2fs_abort_atomic_write(struct inode *inode, bool clean); +void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need); +void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi, bool from_bg); +int f2fs_issue_flush(struct f2fs_sb_info *sbi, nid_t ino); +int f2fs_create_flush_cmd_control(struct f2fs_sb_info *sbi); +int f2fs_flush_device_cache(struct f2fs_sb_info *sbi); +void f2fs_destroy_flush_cmd_control(struct f2fs_sb_info *sbi, bool free); +void f2fs_invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr, + unsigned int len); +bool f2fs_is_checkpointed_data(struct f2fs_sb_info *sbi, block_t blkaddr); +int f2fs_start_discard_thread(struct f2fs_sb_info *sbi); +void f2fs_drop_discard_cmd(struct 
f2fs_sb_info *sbi); +void f2fs_stop_discard_thread(struct f2fs_sb_info *sbi); +bool f2fs_issue_discard_timeout(struct f2fs_sb_info *sbi); +void f2fs_clear_prefree_segments(struct f2fs_sb_info *sbi, + struct cp_control *cpc); +void f2fs_dirty_to_prefree(struct f2fs_sb_info *sbi); +block_t f2fs_get_unusable_blocks(struct f2fs_sb_info *sbi); +int f2fs_disable_cp_again(struct f2fs_sb_info *sbi, block_t unusable); +void f2fs_release_discard_addrs(struct f2fs_sb_info *sbi); +int f2fs_npages_for_summary_flush(struct f2fs_sb_info *sbi, bool for_ra); +bool f2fs_segment_has_free_slot(struct f2fs_sb_info *sbi, int segno); +int f2fs_init_inmem_curseg(struct f2fs_sb_info *sbi); +int f2fs_reinit_atgc_curseg(struct f2fs_sb_info *sbi); +void f2fs_save_inmem_curseg(struct f2fs_sb_info *sbi); +void f2fs_restore_inmem_curseg(struct f2fs_sb_info *sbi); +int f2fs_allocate_segment_for_resize(struct f2fs_sb_info *sbi, int type, + unsigned int start, unsigned int end); +int f2fs_allocate_new_section(struct f2fs_sb_info *sbi, int type, bool force); +int f2fs_allocate_pinning_section(struct f2fs_sb_info *sbi); +int f2fs_allocate_new_segments(struct f2fs_sb_info *sbi); +int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range); +bool f2fs_exist_trim_candidates(struct f2fs_sb_info *sbi, + struct cp_control *cpc); +struct folio *f2fs_get_sum_folio(struct f2fs_sb_info *sbi, unsigned int segno); +void f2fs_update_meta_page(struct f2fs_sb_info *sbi, void *src, + block_t blk_addr); +void f2fs_do_write_meta_page(struct f2fs_sb_info *sbi, struct folio *folio, + enum iostat_type io_type); +void f2fs_do_write_node_page(unsigned int nid, struct f2fs_io_info *fio); +void f2fs_outplace_write_data(struct dnode_of_data *dn, + struct f2fs_io_info *fio); +int f2fs_inplace_write_data(struct f2fs_io_info *fio); +void f2fs_do_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, + block_t old_blkaddr, block_t new_blkaddr, + bool recover_curseg, bool recover_newaddr, + bool from_gc); 
+void f2fs_replace_block(struct f2fs_sb_info *sbi, struct dnode_of_data *dn, + block_t old_addr, block_t new_addr, + unsigned char version, bool recover_curseg, + bool recover_newaddr); +enum temp_type f2fs_get_segment_temp(struct f2fs_sb_info *sbi, + enum log_type seg_type); +int f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct folio *folio, + block_t old_blkaddr, block_t *new_blkaddr, + struct f2fs_summary *sum, int type, + struct f2fs_io_info *fio); +void f2fs_update_device_state(struct f2fs_sb_info *sbi, nid_t ino, + block_t blkaddr, unsigned int blkcnt); +void f2fs_folio_wait_writeback(struct folio *folio, enum page_type type, + bool ordered, bool locked); +#define f2fs_wait_on_page_writeback(page, type, ordered, locked) \ + f2fs_folio_wait_writeback(page_folio(page), type, ordered, locked) +void f2fs_wait_on_block_writeback(struct inode *inode, block_t blkaddr); +void f2fs_wait_on_block_writeback_range(struct inode *inode, block_t blkaddr, + block_t len); +void f2fs_write_data_summaries(struct f2fs_sb_info *sbi, block_t start_blk); +void f2fs_write_node_summaries(struct f2fs_sb_info *sbi, block_t start_blk); +int f2fs_lookup_journal_in_cursum(struct f2fs_journal *journal, int type, + unsigned int val, int alloc); +void f2fs_flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc); +int f2fs_check_and_fix_write_pointer(struct f2fs_sb_info *sbi); +int f2fs_build_segment_manager(struct f2fs_sb_info *sbi); +void f2fs_destroy_segment_manager(struct f2fs_sb_info *sbi); +int __init f2fs_create_segment_manager_caches(void); +void f2fs_destroy_segment_manager_caches(void); +int f2fs_rw_hint_to_seg_type(struct f2fs_sb_info *sbi, enum rw_hint hint); +enum rw_hint f2fs_io_type_to_rw_hint(struct f2fs_sb_info *sbi, + enum page_type type, enum temp_type temp); +unsigned int f2fs_usable_segs_in_sec(struct f2fs_sb_info *sbi); +unsigned int f2fs_usable_blks_in_seg(struct f2fs_sb_info *sbi, + unsigned int segno); +unsigned long long 
f2fs_get_section_mtime(struct f2fs_sb_info *sbi, + unsigned int segno); + +static inline struct inode *fio_inode(struct f2fs_io_info *fio) +{ + return fio->folio->mapping->host; +} + +#define DEF_FRAGMENT_SIZE 4 +#define MIN_FRAGMENT_SIZE 1 +#define MAX_FRAGMENT_SIZE 512 + +static inline bool f2fs_need_rand_seg(struct f2fs_sb_info *sbi) +{ + return F2FS_OPTION(sbi).fs_mode == FS_MODE_FRAGMENT_SEG || + F2FS_OPTION(sbi).fs_mode == FS_MODE_FRAGMENT_BLK; +} /* * checkpoint.c */ -struct page *grab_meta_page(struct f2fs_sb_info *, pgoff_t); -struct page *get_meta_page(struct f2fs_sb_info *, pgoff_t); -long sync_meta_pages(struct f2fs_sb_info *, enum page_type, long); -int check_orphan_space(struct f2fs_sb_info *); -void add_orphan_inode(struct f2fs_sb_info *, nid_t); -void remove_orphan_inode(struct f2fs_sb_info *, nid_t); -int recover_orphan_inodes(struct f2fs_sb_info *); -int get_valid_checkpoint(struct f2fs_sb_info *); -void set_dirty_dir_page(struct inode *, struct page *); -void add_dirty_dir_inode(struct inode *); -void remove_dirty_dir_inode(struct inode *); -struct inode *check_dirty_dir_inode(struct f2fs_sb_info *, nid_t); -void sync_dirty_dir_inodes(struct f2fs_sb_info *); -void write_checkpoint(struct f2fs_sb_info *, bool); -void init_orphan_info(struct f2fs_sb_info *); -int __init create_checkpoint_caches(void); -void destroy_checkpoint_caches(void); +void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi, bool end_io, + unsigned char reason); +void f2fs_flush_ckpt_thread(struct f2fs_sb_info *sbi); +struct folio *f2fs_grab_meta_folio(struct f2fs_sb_info *sbi, pgoff_t index); +struct folio *f2fs_get_meta_folio(struct f2fs_sb_info *sbi, pgoff_t index); +struct folio *f2fs_get_meta_folio_retry(struct f2fs_sb_info *sbi, pgoff_t index); +struct folio *f2fs_get_tmp_folio(struct f2fs_sb_info *sbi, pgoff_t index); +bool f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi, + block_t blkaddr, int type); +bool f2fs_is_valid_blkaddr_raw(struct f2fs_sb_info *sbi, + block_t 
blkaddr, int type); +int f2fs_ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, + int type, bool sync); +void f2fs_ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index, + unsigned int ra_blocks); +long f2fs_sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type, + long nr_to_write, enum iostat_type io_type); +void f2fs_add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type); +void f2fs_remove_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type); +void f2fs_release_ino_entry(struct f2fs_sb_info *sbi, bool all); +bool f2fs_exist_written_data(struct f2fs_sb_info *sbi, nid_t ino, int mode); +void f2fs_set_dirty_device(struct f2fs_sb_info *sbi, nid_t ino, + unsigned int devidx, int type); +bool f2fs_is_dirty_device(struct f2fs_sb_info *sbi, nid_t ino, + unsigned int devidx, int type); +int f2fs_acquire_orphan_inode(struct f2fs_sb_info *sbi); +void f2fs_release_orphan_inode(struct f2fs_sb_info *sbi); +void f2fs_add_orphan_inode(struct inode *inode); +void f2fs_remove_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino); +int f2fs_recover_orphan_inodes(struct f2fs_sb_info *sbi); +int f2fs_get_valid_checkpoint(struct f2fs_sb_info *sbi); +void f2fs_update_dirty_folio(struct inode *inode, struct folio *folio); +void f2fs_remove_dirty_inode(struct inode *inode); +int f2fs_sync_dirty_inodes(struct f2fs_sb_info *sbi, enum inode_type type, + bool from_cp); +void f2fs_wait_on_all_pages(struct f2fs_sb_info *sbi, int type); +u64 f2fs_get_sectors_written(struct f2fs_sb_info *sbi); +int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc); +void f2fs_init_ino_entry_info(struct f2fs_sb_info *sbi); +int __init f2fs_create_checkpoint_caches(void); +void f2fs_destroy_checkpoint_caches(void); +int f2fs_issue_checkpoint(struct f2fs_sb_info *sbi); +int f2fs_start_ckpt_thread(struct f2fs_sb_info *sbi); +void f2fs_stop_ckpt_thread(struct f2fs_sb_info *sbi); +void f2fs_init_ckpt_req_control(struct f2fs_sb_info *sbi); /* * data.c */ -int 
reserve_new_block(struct dnode_of_data *); -void update_extent_cache(block_t, struct dnode_of_data *); -struct page *find_data_page(struct inode *, pgoff_t, bool); -struct page *get_lock_data_page(struct inode *, pgoff_t); -struct page *get_new_data_page(struct inode *, struct page *, pgoff_t, bool); -int f2fs_readpage(struct f2fs_sb_info *, struct page *, block_t, int); -int do_write_data_page(struct page *); +int __init f2fs_init_bioset(void); +void f2fs_destroy_bioset(void); +bool f2fs_is_cp_guaranteed(const struct folio *folio); +int f2fs_init_bio_entry_cache(void); +void f2fs_destroy_bio_entry_cache(void); +void f2fs_submit_read_bio(struct f2fs_sb_info *sbi, struct bio *bio, + enum page_type type); +int f2fs_init_write_merge_io(struct f2fs_sb_info *sbi); +void f2fs_submit_merged_write(struct f2fs_sb_info *sbi, enum page_type type); +void f2fs_submit_merged_write_cond(struct f2fs_sb_info *sbi, + struct inode *inode, struct folio *folio, + nid_t ino, enum page_type type); +void f2fs_submit_merged_ipu_write(struct f2fs_sb_info *sbi, + struct bio **bio, struct folio *folio); +void f2fs_flush_merged_writes(struct f2fs_sb_info *sbi); +int f2fs_submit_page_bio(struct f2fs_io_info *fio); +int f2fs_merge_page_bio(struct f2fs_io_info *fio); +void f2fs_submit_page_write(struct f2fs_io_info *fio); +struct block_device *f2fs_target_device(struct f2fs_sb_info *sbi, + block_t blk_addr, sector_t *sector); +int f2fs_target_device_index(struct f2fs_sb_info *sbi, block_t blkaddr); +void f2fs_set_data_blkaddr(struct dnode_of_data *dn, block_t blkaddr); +void f2fs_update_data_blkaddr(struct dnode_of_data *dn, block_t blkaddr); +int f2fs_reserve_new_blocks(struct dnode_of_data *dn, blkcnt_t count); +int f2fs_reserve_new_block(struct dnode_of_data *dn); +int f2fs_get_block_locked(struct dnode_of_data *dn, pgoff_t index); +int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index); +struct folio *f2fs_get_read_data_folio(struct inode *inode, pgoff_t index, + blk_opf_t op_flags, 
bool for_write, pgoff_t *next_pgofs); +struct folio *f2fs_find_data_folio(struct inode *inode, pgoff_t index, + pgoff_t *next_pgofs); +struct folio *f2fs_get_lock_data_folio(struct inode *inode, pgoff_t index, + bool for_write); +struct folio *f2fs_get_new_data_folio(struct inode *inode, + struct folio *ifolio, pgoff_t index, bool new_i_size); +int f2fs_do_write_data_page(struct f2fs_io_info *fio); +int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, int flag); +int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + u64 start, u64 len); +int f2fs_encrypt_one_page(struct f2fs_io_info *fio); +bool f2fs_should_update_inplace(struct inode *inode, struct f2fs_io_info *fio); +bool f2fs_should_update_outplace(struct inode *inode, struct f2fs_io_info *fio); +int f2fs_write_single_data_page(struct folio *folio, int *submitted, + struct bio **bio, sector_t *last_block, + struct writeback_control *wbc, + enum iostat_type io_type, + int compr_blocks, bool allow_balance); +void f2fs_write_failed(struct inode *inode, loff_t to); +void f2fs_invalidate_folio(struct folio *folio, size_t offset, size_t length); +bool f2fs_release_folio(struct folio *folio, gfp_t wait); +bool f2fs_overwrite_io(struct inode *inode, loff_t pos, size_t len); +void f2fs_clear_page_cache_dirty_tag(struct folio *folio); +int f2fs_init_post_read_processing(void); +void f2fs_destroy_post_read_processing(void); +int f2fs_init_post_read_wq(struct f2fs_sb_info *sbi); +void f2fs_destroy_post_read_wq(struct f2fs_sb_info *sbi); +extern const struct iomap_ops f2fs_iomap_ops; /* * gc.c */ -int start_gc_thread(struct f2fs_sb_info *); -void stop_gc_thread(struct f2fs_sb_info *); -block_t start_bidx_of_node(unsigned int); -int f2fs_gc(struct f2fs_sb_info *); -void build_gc_manager(struct f2fs_sb_info *); -int __init create_gc_caches(void); -void destroy_gc_caches(void); +int f2fs_start_gc_thread(struct f2fs_sb_info *sbi); +void f2fs_stop_gc_thread(struct f2fs_sb_info *sbi); 
+block_t f2fs_start_bidx_of_node(unsigned int node_ofs, struct inode *inode); +int f2fs_gc(struct f2fs_sb_info *sbi, struct f2fs_gc_control *gc_control); +void f2fs_build_gc_manager(struct f2fs_sb_info *sbi); +int f2fs_gc_range(struct f2fs_sb_info *sbi, + unsigned int start_seg, unsigned int end_seg, + bool dry_run, unsigned int dry_run_sections); +int f2fs_resize_fs(struct file *filp, __u64 block_count); +int __init f2fs_create_garbage_collection_cache(void); +void f2fs_destroy_garbage_collection_cache(void); +/* victim selection function for cleaning and SSR */ +int f2fs_get_victim(struct f2fs_sb_info *sbi, unsigned int *result, + int gc_type, int type, char alloc_mode, + unsigned long long age, bool one_time); /* * recovery.c */ -int recover_fsync_data(struct f2fs_sb_info *); -bool space_for_roll_forward(struct f2fs_sb_info *); +int f2fs_recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only); +bool f2fs_space_for_roll_forward(struct f2fs_sb_info *sbi); +int __init f2fs_create_recovery_cache(void); +void f2fs_destroy_recovery_cache(void); /* * debug.c */ #ifdef CONFIG_F2FS_STAT_FS +enum { + DEVSTAT_INUSE, + DEVSTAT_DIRTY, + DEVSTAT_FULL, + DEVSTAT_FREE, + DEVSTAT_PREFREE, + DEVSTAT_MAX, +}; + +struct f2fs_dev_stats { + unsigned int devstats[2][DEVSTAT_MAX]; /* 0: segs, 1: secs */ +}; + struct f2fs_stat_info { struct list_head stat_list; struct f2fs_sb_info *sbi; - struct mutex stat_lock; int all_area_segs, sit_area_segs, nat_area_segs, ssa_area_segs; int main_area_segs, main_area_sections, main_area_zones; - int hit_ext, total_ext; - int ndirty_node, ndirty_dent, ndirty_dirs, ndirty_meta; - int nats, sits, fnids; + unsigned long long hit_cached[NR_EXTENT_CACHES]; + unsigned long long hit_rbtree[NR_EXTENT_CACHES]; + unsigned long long total_ext[NR_EXTENT_CACHES]; + unsigned long long hit_total[NR_EXTENT_CACHES]; + int ext_tree[NR_EXTENT_CACHES]; + int zombie_tree[NR_EXTENT_CACHES]; + int ext_node[NR_EXTENT_CACHES]; + /* to count memory footprint */ + 
unsigned long long ext_mem[NR_EXTENT_CACHES]; + /* for read extent cache */ + unsigned long long hit_largest; + /* for block age extent cache */ + unsigned long long allocated_data_blocks; + int ndirty_node, ndirty_dent, ndirty_meta, ndirty_imeta; + int ndirty_data, ndirty_qdata; + unsigned int ndirty_dirs, ndirty_files, ndirty_all; + unsigned int nquota_files, ndonate_files; + int nats, dirty_nats, sits, dirty_sits; + int free_nids, avail_nids, alloc_nids; int total_count, utilization; - int bg_gc; - unsigned int valid_count, valid_node_count, valid_inode_count; + int nr_wb_cp_data, nr_wb_data; + int nr_rd_data, nr_rd_node, nr_rd_meta; + int nr_dio_read, nr_dio_write; + unsigned int io_skip_bggc, other_skip_bggc; + int nr_flushing, nr_flushed, flush_list_empty; + int nr_discarding, nr_discarded; + int nr_discard_cmd; + unsigned int undiscard_blks; + int nr_issued_ckpt, nr_total_ckpt, nr_queued_ckpt; + unsigned int cur_ckpt_time, peak_ckpt_time; + int inline_xattr, inline_inode, inline_dir, append, update, orphans; + int compr_inode, swapfile_inode; + unsigned long long compr_blocks; + int aw_cnt, max_aw_cnt; + unsigned int valid_count, valid_node_count, valid_inode_count, discard_blks; unsigned int bimodal, avg_vblocks; int util_free, util_valid, util_invalid; int rsvd_segs, overp_segs; - int dirty_count, node_pages, meta_pages; - int prefree_count, call_count; - int tot_segs, node_segs, data_segs, free_segs, free_secs; + int dirty_count, node_pages, meta_pages, compress_pages; + int compress_page_hit; + int prefree_count, free_segs, free_secs; + int cp_call_count[MAX_CALL_TYPE], cp_count; + int gc_call_count[MAX_CALL_TYPE]; + int gc_segs[2][2]; + int gc_secs[2][2]; int tot_blks, data_blks, node_blks; + int bg_data_blks, bg_node_blks; + int blkoff[NR_CURSEG_TYPE]; int curseg[NR_CURSEG_TYPE]; int cursec[NR_CURSEG_TYPE]; int curzone[NR_CURSEG_TYPE]; + unsigned int dirty_seg[NR_CURSEG_TYPE]; + unsigned int full_seg[NR_CURSEG_TYPE]; + unsigned int 
valid_blks[NR_CURSEG_TYPE]; + unsigned int meta_count[META_MAX]; unsigned int segment_count[2]; unsigned int block_count[2]; - unsigned base_mem, cache_mem; + unsigned int inplace_count; + unsigned long long base_mem, cache_mem, page_mem; + struct f2fs_dev_stats *dev_stats; }; -#define stat_inc_call_count(si) ((si)->call_count++) +static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) +{ + return (struct f2fs_stat_info *)sbi->stat_info; +} -#define stat_inc_seg_count(sbi, type) \ +#define stat_inc_cp_call_count(sbi, foreground) \ + atomic_inc(&sbi->cp_call_count[(foreground)]) +#define stat_inc_cp_count(sbi) (F2FS_STAT(sbi)->cp_count++) +#define stat_io_skip_bggc_count(sbi) ((sbi)->io_skip_bggc++) +#define stat_other_skip_bggc_count(sbi) ((sbi)->other_skip_bggc++) +#define stat_inc_dirty_inode(sbi, type) ((sbi)->ndirty_inode[type]++) +#define stat_dec_dirty_inode(sbi, type) ((sbi)->ndirty_inode[type]--) +#define stat_inc_total_hit(sbi, type) (atomic64_inc(&(sbi)->total_hit_ext[type])) +#define stat_inc_rbtree_node_hit(sbi, type) (atomic64_inc(&(sbi)->read_hit_rbtree[type])) +#define stat_inc_largest_node_hit(sbi) (atomic64_inc(&(sbi)->read_hit_largest)) +#define stat_inc_cached_node_hit(sbi, type) (atomic64_inc(&(sbi)->read_hit_cached[type])) +#define stat_inc_inline_xattr(inode) \ + do { \ + if (f2fs_has_inline_xattr(inode)) \ + (atomic_inc(&F2FS_I_SB(inode)->inline_xattr)); \ + } while (0) +#define stat_dec_inline_xattr(inode) \ + do { \ + if (f2fs_has_inline_xattr(inode)) \ + (atomic_dec(&F2FS_I_SB(inode)->inline_xattr)); \ + } while (0) +#define stat_inc_inline_inode(inode) \ + do { \ + if (f2fs_has_inline_data(inode)) \ + (atomic_inc(&F2FS_I_SB(inode)->inline_inode)); \ + } while (0) +#define stat_dec_inline_inode(inode) \ + do { \ + if (f2fs_has_inline_data(inode)) \ + (atomic_dec(&F2FS_I_SB(inode)->inline_inode)); \ + } while (0) +#define stat_inc_inline_dir(inode) \ + do { \ + if (f2fs_has_inline_dentry(inode)) \ + 
(atomic_inc(&F2FS_I_SB(inode)->inline_dir)); \ + } while (0) +#define stat_dec_inline_dir(inode) \ + do { \ + if (f2fs_has_inline_dentry(inode)) \ + (atomic_dec(&F2FS_I_SB(inode)->inline_dir)); \ + } while (0) +#define stat_inc_compr_inode(inode) \ + do { \ + if (f2fs_compressed_file(inode)) \ + (atomic_inc(&F2FS_I_SB(inode)->compr_inode)); \ + } while (0) +#define stat_dec_compr_inode(inode) \ do { \ - struct f2fs_stat_info *si = sbi->stat_info; \ - (si)->tot_segs++; \ - if (type == SUM_TYPE_DATA) \ - si->data_segs++; \ - else \ - si->node_segs++; \ + if (f2fs_compressed_file(inode)) \ + (atomic_dec(&F2FS_I_SB(inode)->compr_inode)); \ } while (0) +#define stat_add_compr_blocks(inode, blocks) \ + (atomic64_add(blocks, &F2FS_I_SB(inode)->compr_blocks)) +#define stat_sub_compr_blocks(inode, blocks) \ + (atomic64_sub(blocks, &F2FS_I_SB(inode)->compr_blocks)) +#define stat_inc_swapfile_inode(inode) \ + (atomic_inc(&F2FS_I_SB(inode)->swapfile_inode)) +#define stat_dec_swapfile_inode(inode) \ + (atomic_dec(&F2FS_I_SB(inode)->swapfile_inode)) +#define stat_inc_atomic_inode(inode) \ + (atomic_inc(&F2FS_I_SB(inode)->atomic_files)) +#define stat_dec_atomic_inode(inode) \ + (atomic_dec(&F2FS_I_SB(inode)->atomic_files)) +#define stat_inc_meta_count(sbi, blkaddr) \ + do { \ + if (blkaddr < SIT_I(sbi)->sit_base_addr) \ + atomic_inc(&(sbi)->meta_count[META_CP]); \ + else if (blkaddr < NM_I(sbi)->nat_blkaddr) \ + atomic_inc(&(sbi)->meta_count[META_SIT]); \ + else if (blkaddr < SM_I(sbi)->ssa_blkaddr) \ + atomic_inc(&(sbi)->meta_count[META_NAT]); \ + else if (blkaddr < SM_I(sbi)->main_blkaddr) \ + atomic_inc(&(sbi)->meta_count[META_SSA]); \ + } while (0) +#define stat_inc_seg_type(sbi, curseg) \ + ((sbi)->segment_count[(curseg)->alloc_type]++) +#define stat_inc_block_count(sbi, curseg) \ + ((sbi)->block_count[(curseg)->alloc_type]++) +#define stat_inc_inplace_blocks(sbi) \ + (atomic_inc(&(sbi)->inplace_count)) +#define stat_update_max_atomic_write(inode) \ + do { \ + int cur = 
atomic_read(&F2FS_I_SB(inode)->atomic_files); \ + int max = atomic_read(&F2FS_I_SB(inode)->max_aw_cnt); \ + if (cur > max) \ + atomic_set(&F2FS_I_SB(inode)->max_aw_cnt, cur); \ + } while (0) +#define stat_inc_gc_call_count(sbi, foreground) \ + (F2FS_STAT(sbi)->gc_call_count[(foreground)]++) +#define stat_inc_gc_sec_count(sbi, type, gc_type) \ + (F2FS_STAT(sbi)->gc_secs[(type)][(gc_type)]++) +#define stat_inc_gc_seg_count(sbi, type, gc_type) \ + (F2FS_STAT(sbi)->gc_segs[(type)][(gc_type)]++) #define stat_inc_tot_blk_count(si, blks) \ - (si->tot_blks += (blks)) + ((si)->tot_blks += (blks)) -#define stat_inc_data_blk_count(sbi, blks) \ +#define stat_inc_data_blk_count(sbi, blks, gc_type) \ do { \ - struct f2fs_stat_info *si = sbi->stat_info; \ + struct f2fs_stat_info *si = F2FS_STAT(sbi); \ stat_inc_tot_blk_count(si, blks); \ si->data_blks += (blks); \ + si->bg_data_blks += ((gc_type) == BG_GC) ? (blks) : 0; \ } while (0) -#define stat_inc_node_blk_count(sbi, blks) \ +#define stat_inc_node_blk_count(sbi, blks, gc_type) \ do { \ - struct f2fs_stat_info *si = sbi->stat_info; \ + struct f2fs_stat_info *si = F2FS_STAT(sbi); \ stat_inc_tot_blk_count(si, blks); \ si->node_blks += (blks); \ + si->bg_node_blks += ((gc_type) == BG_GC) ? 
(blks) : 0; \ } while (0) -int f2fs_build_stats(struct f2fs_sb_info *); -void f2fs_destroy_stats(struct f2fs_sb_info *); +int f2fs_build_stats(struct f2fs_sb_info *sbi); +void f2fs_destroy_stats(struct f2fs_sb_info *sbi); void __init f2fs_create_root_stats(void); void f2fs_destroy_root_stats(void); +void f2fs_update_sit_info(struct f2fs_sb_info *sbi); #else -#define stat_inc_call_count(si) -#define stat_inc_seg_count(si, type) -#define stat_inc_tot_blk_count(si, blks) -#define stat_inc_data_blk_count(si, blks) -#define stat_inc_node_blk_count(sbi, blks) +#define stat_inc_cp_call_count(sbi, foreground) do { } while (0) +#define stat_inc_cp_count(sbi) do { } while (0) +#define stat_io_skip_bggc_count(sbi) do { } while (0) +#define stat_other_skip_bggc_count(sbi) do { } while (0) +#define stat_inc_dirty_inode(sbi, type) do { } while (0) +#define stat_dec_dirty_inode(sbi, type) do { } while (0) +#define stat_inc_total_hit(sbi, type) do { } while (0) +#define stat_inc_rbtree_node_hit(sbi, type) do { } while (0) +#define stat_inc_largest_node_hit(sbi) do { } while (0) +#define stat_inc_cached_node_hit(sbi, type) do { } while (0) +#define stat_inc_inline_xattr(inode) do { } while (0) +#define stat_dec_inline_xattr(inode) do { } while (0) +#define stat_inc_inline_inode(inode) do { } while (0) +#define stat_dec_inline_inode(inode) do { } while (0) +#define stat_inc_inline_dir(inode) do { } while (0) +#define stat_dec_inline_dir(inode) do { } while (0) +#define stat_inc_compr_inode(inode) do { } while (0) +#define stat_dec_compr_inode(inode) do { } while (0) +#define stat_add_compr_blocks(inode, blocks) do { } while (0) +#define stat_sub_compr_blocks(inode, blocks) do { } while (0) +#define stat_inc_swapfile_inode(inode) do { } while (0) +#define stat_dec_swapfile_inode(inode) do { } while (0) +#define stat_inc_atomic_inode(inode) do { } while (0) +#define stat_dec_atomic_inode(inode) do { } while (0) +#define stat_update_max_atomic_write(inode) do { } while (0) +#define 
stat_inc_meta_count(sbi, blkaddr) do { } while (0) +#define stat_inc_seg_type(sbi, curseg) do { } while (0) +#define stat_inc_block_count(sbi, curseg) do { } while (0) +#define stat_inc_inplace_blocks(sbi) do { } while (0) +#define stat_inc_gc_call_count(sbi, foreground) do { } while (0) +#define stat_inc_gc_sec_count(sbi, type, gc_type) do { } while (0) +#define stat_inc_gc_seg_count(sbi, type, gc_type) do { } while (0) +#define stat_inc_tot_blk_count(si, blks) do { } while (0) +#define stat_inc_data_blk_count(sbi, blks, gc_type) do { } while (0) +#define stat_inc_node_blk_count(sbi, blks, gc_type) do { } while (0) static inline int f2fs_build_stats(struct f2fs_sb_info *sbi) { return 0; } static inline void f2fs_destroy_stats(struct f2fs_sb_info *sbi) { } static inline void __init f2fs_create_root_stats(void) { } static inline void f2fs_destroy_root_stats(void) { } +static inline void f2fs_update_sit_info(struct f2fs_sb_info *sbi) {} #endif extern const struct file_operations f2fs_dir_operations; @@ -1166,5 +4385,626 @@ extern const struct address_space_operations f2fs_node_aops; extern const struct address_space_operations f2fs_meta_aops; extern const struct inode_operations f2fs_dir_inode_operations; extern const struct inode_operations f2fs_symlink_inode_operations; +extern const struct inode_operations f2fs_encrypted_symlink_inode_operations; extern const struct inode_operations f2fs_special_inode_operations; +extern struct kmem_cache *f2fs_inode_entry_slab; + +/* + * inline.c + */ +bool f2fs_may_inline_data(struct inode *inode); +bool f2fs_sanity_check_inline_data(struct inode *inode, struct folio *ifolio); +bool f2fs_may_inline_dentry(struct inode *inode); +void f2fs_do_read_inline_data(struct folio *folio, struct folio *ifolio); +void f2fs_truncate_inline_inode(struct inode *inode, struct folio *ifolio, + u64 from); +int f2fs_read_inline_data(struct inode *inode, struct folio *folio); +int f2fs_convert_inline_folio(struct dnode_of_data *dn, struct folio 
*folio); +int f2fs_convert_inline_inode(struct inode *inode); +int f2fs_try_convert_inline_dir(struct inode *dir, struct dentry *dentry); +int f2fs_write_inline_data(struct inode *inode, struct folio *folio); +int f2fs_recover_inline_data(struct inode *inode, struct folio *nfolio); +struct f2fs_dir_entry *f2fs_find_in_inline_dir(struct inode *dir, + const struct f2fs_filename *fname, struct folio **res_folio, + bool use_hash); +int f2fs_make_empty_inline_dir(struct inode *inode, struct inode *parent, + struct folio *ifolio); +int f2fs_add_inline_entry(struct inode *dir, const struct f2fs_filename *fname, + struct inode *inode, nid_t ino, umode_t mode); +void f2fs_delete_inline_entry(struct f2fs_dir_entry *dentry, + struct folio *folio, struct inode *dir, struct inode *inode); +bool f2fs_empty_inline_dir(struct inode *dir); +int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx, + struct fscrypt_str *fstr); +int f2fs_inline_data_fiemap(struct inode *inode, + struct fiemap_extent_info *fieinfo, + __u64 start, __u64 len); + +/* + * shrinker.c + */ +unsigned long f2fs_shrink_count(struct shrinker *shrink, + struct shrink_control *sc); +unsigned long f2fs_shrink_scan(struct shrinker *shrink, + struct shrink_control *sc); +unsigned int f2fs_donate_files(void); +void f2fs_reclaim_caches(unsigned int reclaim_caches_kb); +void f2fs_join_shrinker(struct f2fs_sb_info *sbi); +void f2fs_leave_shrinker(struct f2fs_sb_info *sbi); + +/* + * extent_cache.c + */ +bool sanity_check_extent_cache(struct inode *inode, struct folio *ifolio); +void f2fs_init_extent_tree(struct inode *inode); +void f2fs_drop_extent_tree(struct inode *inode); +void f2fs_destroy_extent_node(struct inode *inode); +void f2fs_destroy_extent_tree(struct inode *inode); +void f2fs_init_extent_cache_info(struct f2fs_sb_info *sbi); +int __init f2fs_create_extent_cache(void); +void f2fs_destroy_extent_cache(void); + +/* read extent cache ops */ +void f2fs_init_read_extent_tree(struct inode *inode, 
struct folio *ifolio); +bool f2fs_lookup_read_extent_cache(struct inode *inode, pgoff_t pgofs, + struct extent_info *ei); +bool f2fs_lookup_read_extent_cache_block(struct inode *inode, pgoff_t index, + block_t *blkaddr); +void f2fs_update_read_extent_cache(struct dnode_of_data *dn); +void f2fs_update_read_extent_cache_range(struct dnode_of_data *dn, + pgoff_t fofs, block_t blkaddr, unsigned int len); +unsigned int f2fs_shrink_read_extent_tree(struct f2fs_sb_info *sbi, + int nr_shrink); + +/* block age extent cache ops */ +void f2fs_init_age_extent_tree(struct inode *inode); +bool f2fs_lookup_age_extent_cache(struct inode *inode, pgoff_t pgofs, + struct extent_info *ei); +void f2fs_update_age_extent_cache(struct dnode_of_data *dn); +void f2fs_update_age_extent_cache_range(struct dnode_of_data *dn, + pgoff_t fofs, unsigned int len); +unsigned int f2fs_shrink_age_extent_tree(struct f2fs_sb_info *sbi, + int nr_shrink); + +/* + * sysfs.c + */ +#define MIN_RA_MUL 2 +#define MAX_RA_MUL 256 + +int __init f2fs_init_sysfs(void); +void f2fs_exit_sysfs(void); +int f2fs_register_sysfs(struct f2fs_sb_info *sbi); +void f2fs_unregister_sysfs(struct f2fs_sb_info *sbi); + +/* verity.c */ +extern const struct fsverity_operations f2fs_verityops; + +/* + * crypto support + */ +static inline bool f2fs_encrypted_file(struct inode *inode) +{ + return IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode); +} + +static inline void f2fs_set_encrypted_inode(struct inode *inode) +{ +#ifdef CONFIG_FS_ENCRYPTION + file_set_encrypt(inode); + f2fs_set_inode_flags(inode); +#endif +} + +/* + * Returns true if the reads of the inode's data need to undergo some + * postprocessing step, like decryption or authenticity verification. 
+ */ +static inline bool f2fs_post_read_required(struct inode *inode) +{ + return f2fs_encrypted_file(inode) || fsverity_active(inode) || + f2fs_compressed_file(inode); +} + +static inline bool f2fs_used_in_atomic_write(struct inode *inode) +{ + return f2fs_is_atomic_file(inode) || f2fs_is_cow_file(inode); +} + +static inline bool f2fs_meta_inode_gc_required(struct inode *inode) +{ + return f2fs_post_read_required(inode) || f2fs_used_in_atomic_write(inode); +} + +/* + * compress.c + */ +#ifdef CONFIG_F2FS_FS_COMPRESSION +enum cluster_check_type { + CLUSTER_IS_COMPR, /* check only if compressed cluster */ + CLUSTER_COMPR_BLKS, /* return # of compressed blocks in a cluster */ + CLUSTER_RAW_BLKS /* return # of raw blocks in a cluster */ +}; +bool f2fs_is_compressed_page(struct folio *folio); +struct folio *f2fs_compress_control_folio(struct folio *folio); +int f2fs_prepare_compress_overwrite(struct inode *inode, + struct page **pagep, pgoff_t index, void **fsdata); +bool f2fs_compress_write_end(struct inode *inode, void *fsdata, + pgoff_t index, unsigned copied); +int f2fs_truncate_partial_cluster(struct inode *inode, u64 from, bool lock); +void f2fs_compress_write_end_io(struct bio *bio, struct folio *folio); +bool f2fs_is_compress_backend_ready(struct inode *inode); +bool f2fs_is_compress_level_valid(int alg, int lvl); +int __init f2fs_init_compress_mempool(void); +void f2fs_destroy_compress_mempool(void); +void f2fs_decompress_cluster(struct decompress_io_ctx *dic, bool in_task); +void f2fs_end_read_compressed_page(struct folio *folio, bool failed, + block_t blkaddr, bool in_task); +bool f2fs_cluster_is_empty(struct compress_ctx *cc); +bool f2fs_cluster_can_merge_page(struct compress_ctx *cc, pgoff_t index); +bool f2fs_all_cluster_page_ready(struct compress_ctx *cc, struct page **pages, + int index, int nr_pages, bool uptodate); +bool f2fs_sanity_check_cluster(struct dnode_of_data *dn); +void f2fs_compress_ctx_add_page(struct compress_ctx *cc, struct folio *folio); 
+int f2fs_write_multi_pages(struct compress_ctx *cc, + int *submitted, + struct writeback_control *wbc, + enum iostat_type io_type); +int f2fs_is_compressed_cluster(struct inode *inode, pgoff_t index); +bool f2fs_is_sparse_cluster(struct inode *inode, pgoff_t index); +void f2fs_update_read_extent_tree_range_compressed(struct inode *inode, + pgoff_t fofs, block_t blkaddr, + unsigned int llen, unsigned int c_len); +int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, + unsigned nr_pages, sector_t *last_block_in_bio, + struct readahead_control *rac, bool for_write); +struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc); +void f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed, + bool in_task); +void f2fs_put_folio_dic(struct folio *folio, bool in_task); +unsigned int f2fs_cluster_blocks_are_contiguous(struct dnode_of_data *dn, + unsigned int ofs_in_node); +int f2fs_init_compress_ctx(struct compress_ctx *cc); +void f2fs_destroy_compress_ctx(struct compress_ctx *cc, bool reuse); +void f2fs_init_compress_info(struct f2fs_sb_info *sbi); +int f2fs_init_compress_inode(struct f2fs_sb_info *sbi); +void f2fs_destroy_compress_inode(struct f2fs_sb_info *sbi); +int f2fs_init_page_array_cache(struct f2fs_sb_info *sbi); +void f2fs_destroy_page_array_cache(struct f2fs_sb_info *sbi); +int __init f2fs_init_compress_cache(void); +void f2fs_destroy_compress_cache(void); +struct address_space *COMPRESS_MAPPING(struct f2fs_sb_info *sbi); +void f2fs_invalidate_compress_pages_range(struct f2fs_sb_info *sbi, + block_t blkaddr, unsigned int len); +bool f2fs_load_compressed_folio(struct f2fs_sb_info *sbi, struct folio *folio, + block_t blkaddr); +void f2fs_invalidate_compress_pages(struct f2fs_sb_info *sbi, nid_t ino); +#define inc_compr_inode_stat(inode) \ + do { \ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); \ + sbi->compr_new_inode++; \ + } while (0) +#define add_compr_block_stat(inode, blocks) \ + do { \ + struct f2fs_sb_info *sbi = 
F2FS_I_SB(inode); \ + int diff = F2FS_I(inode)->i_cluster_size - blocks; \ + sbi->compr_written_block += blocks; \ + sbi->compr_saved_block += diff; \ + } while (0) +#else +static inline bool f2fs_is_compressed_page(struct folio *folio) { return false; } +static inline bool f2fs_is_compress_backend_ready(struct inode *inode) +{ + if (!f2fs_compressed_file(inode)) + return true; + /* not support compression */ + return false; +} +static inline bool f2fs_is_compress_level_valid(int alg, int lvl) { return false; } +static inline struct folio *f2fs_compress_control_folio(struct folio *folio) +{ + WARN_ON_ONCE(1); + return ERR_PTR(-EINVAL); +} +static inline int __init f2fs_init_compress_mempool(void) { return 0; } +static inline void f2fs_destroy_compress_mempool(void) { } +static inline void f2fs_decompress_cluster(struct decompress_io_ctx *dic, + bool in_task) { } +static inline void f2fs_end_read_compressed_page(struct folio *folio, + bool failed, block_t blkaddr, bool in_task) +{ + WARN_ON_ONCE(1); +} +static inline void f2fs_put_folio_dic(struct folio *folio, bool in_task) +{ + WARN_ON_ONCE(1); +} +static inline unsigned int f2fs_cluster_blocks_are_contiguous( + struct dnode_of_data *dn, unsigned int ofs_in_node) { return 0; } +static inline bool f2fs_sanity_check_cluster(struct dnode_of_data *dn) { return false; } +static inline int f2fs_init_compress_inode(struct f2fs_sb_info *sbi) { return 0; } +static inline void f2fs_destroy_compress_inode(struct f2fs_sb_info *sbi) { } +static inline int f2fs_init_page_array_cache(struct f2fs_sb_info *sbi) { return 0; } +static inline void f2fs_destroy_page_array_cache(struct f2fs_sb_info *sbi) { } +static inline int __init f2fs_init_compress_cache(void) { return 0; } +static inline void f2fs_destroy_compress_cache(void) { } +static inline void f2fs_invalidate_compress_pages_range(struct f2fs_sb_info *sbi, + block_t blkaddr, unsigned int len) { } +static inline bool f2fs_load_compressed_folio(struct f2fs_sb_info *sbi, + 
struct folio *folio, block_t blkaddr) { return false; } +static inline void f2fs_invalidate_compress_pages(struct f2fs_sb_info *sbi, + nid_t ino) { } +#define inc_compr_inode_stat(inode) do { } while (0) +static inline int f2fs_is_compressed_cluster( + struct inode *inode, + pgoff_t index) { return 0; } +static inline bool f2fs_is_sparse_cluster( + struct inode *inode, + pgoff_t index) { return true; } +static inline void f2fs_update_read_extent_tree_range_compressed( + struct inode *inode, + pgoff_t fofs, block_t blkaddr, + unsigned int llen, unsigned int c_len) { } +#endif + +static inline int set_compress_context(struct inode *inode) +{ +#ifdef CONFIG_F2FS_FS_COMPRESSION + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct f2fs_inode_info *fi = F2FS_I(inode); + + fi->i_compress_algorithm = F2FS_OPTION(sbi).compress_algorithm; + fi->i_log_cluster_size = F2FS_OPTION(sbi).compress_log_size; + fi->i_compress_flag = F2FS_OPTION(sbi).compress_chksum ? + BIT(COMPRESS_CHKSUM) : 0; + fi->i_cluster_size = BIT(fi->i_log_cluster_size); + if ((fi->i_compress_algorithm == COMPRESS_LZ4 || + fi->i_compress_algorithm == COMPRESS_ZSTD) && + F2FS_OPTION(sbi).compress_level) + fi->i_compress_level = F2FS_OPTION(sbi).compress_level; + fi->i_flags |= F2FS_COMPR_FL; + set_inode_flag(inode, FI_COMPRESSED_FILE); + stat_inc_compr_inode(inode); + inc_compr_inode_stat(inode); + f2fs_mark_inode_dirty_sync(inode, true); + return 0; +#else + return -EOPNOTSUPP; +#endif +} + +static inline bool f2fs_disable_compressed_file(struct inode *inode) +{ + struct f2fs_inode_info *fi = F2FS_I(inode); + + f2fs_down_write(&fi->i_sem); + + if (!f2fs_compressed_file(inode)) { + f2fs_up_write(&fi->i_sem); + return true; + } + if (f2fs_is_mmap_file(inode) || atomic_read(&fi->writeback) || + (S_ISREG(inode->i_mode) && F2FS_HAS_BLOCKS(inode))) { + f2fs_up_write(&fi->i_sem); + return false; + } + + fi->i_flags &= ~F2FS_COMPR_FL; + stat_dec_compr_inode(inode); + clear_inode_flag(inode, FI_COMPRESSED_FILE); + 
f2fs_mark_inode_dirty_sync(inode, true); + + f2fs_up_write(&fi->i_sem); + return true; +} + +#define F2FS_FEATURE_FUNCS(name, flagname) \ +static inline bool f2fs_sb_has_##name(struct f2fs_sb_info *sbi) \ +{ \ + return F2FS_HAS_FEATURE(sbi, F2FS_FEATURE_##flagname); \ +} + +F2FS_FEATURE_FUNCS(encrypt, ENCRYPT); +F2FS_FEATURE_FUNCS(blkzoned, BLKZONED); +F2FS_FEATURE_FUNCS(extra_attr, EXTRA_ATTR); +F2FS_FEATURE_FUNCS(project_quota, PRJQUOTA); +F2FS_FEATURE_FUNCS(inode_chksum, INODE_CHKSUM); +F2FS_FEATURE_FUNCS(flexible_inline_xattr, FLEXIBLE_INLINE_XATTR); +F2FS_FEATURE_FUNCS(quota_ino, QUOTA_INO); +F2FS_FEATURE_FUNCS(inode_crtime, INODE_CRTIME); +F2FS_FEATURE_FUNCS(lost_found, LOST_FOUND); +F2FS_FEATURE_FUNCS(verity, VERITY); +F2FS_FEATURE_FUNCS(sb_chksum, SB_CHKSUM); +F2FS_FEATURE_FUNCS(casefold, CASEFOLD); +F2FS_FEATURE_FUNCS(compression, COMPRESSION); +F2FS_FEATURE_FUNCS(readonly, RO); +F2FS_FEATURE_FUNCS(device_alias, DEVICE_ALIAS); +F2FS_FEATURE_FUNCS(packed_ssa, PACKED_SSA); + +#ifdef CONFIG_BLK_DEV_ZONED +static inline bool f2fs_zone_is_seq(struct f2fs_sb_info *sbi, int devi, + unsigned int zone) +{ + return test_bit(zone, FDEV(devi).blkz_seq); +} + +static inline bool f2fs_blkz_is_seq(struct f2fs_sb_info *sbi, int devi, + block_t blkaddr) +{ + return f2fs_zone_is_seq(sbi, devi, blkaddr / sbi->blocks_per_blkz); +} #endif + +static inline int f2fs_bdev_index(struct f2fs_sb_info *sbi, + struct block_device *bdev) +{ + int i; + + if (!f2fs_is_multi_device(sbi)) + return 0; + + for (i = 0; i < sbi->s_ndevs; i++) + if (FDEV(i).bdev == bdev) + return i; + + WARN_ON(1); + return -1; +} + +static inline bool f2fs_hw_should_discard(struct f2fs_sb_info *sbi) +{ + return f2fs_sb_has_blkzoned(sbi); +} + +static inline bool f2fs_bdev_support_discard(struct block_device *bdev) +{ + return bdev_max_discard_sectors(bdev) || bdev_is_zoned(bdev); +} + +static inline bool f2fs_hw_support_discard(struct f2fs_sb_info *sbi) +{ + int i; + + if (!f2fs_is_multi_device(sbi)) + return 
f2fs_bdev_support_discard(sbi->sb->s_bdev); + + for (i = 0; i < sbi->s_ndevs; i++) + if (f2fs_bdev_support_discard(FDEV(i).bdev)) + return true; + return false; +} + +static inline unsigned int f2fs_hw_discard_granularity(struct f2fs_sb_info *sbi) +{ + int i = 1; + unsigned int discard_granularity = bdev_discard_granularity(sbi->sb->s_bdev); + + if (f2fs_is_multi_device(sbi)) + for (; i < sbi->s_ndevs && !bdev_is_zoned(FDEV(i).bdev); i++) + discard_granularity = max_t(unsigned int, discard_granularity, + bdev_discard_granularity(FDEV(i).bdev)); + return discard_granularity; +} + +static inline bool f2fs_realtime_discard_enable(struct f2fs_sb_info *sbi) +{ + return (test_opt(sbi, DISCARD) && f2fs_hw_support_discard(sbi)) || + f2fs_hw_should_discard(sbi); +} + +static inline bool f2fs_hw_is_readonly(struct f2fs_sb_info *sbi) +{ + int i; + + if (!f2fs_is_multi_device(sbi)) + return bdev_read_only(sbi->sb->s_bdev); + + for (i = 0; i < sbi->s_ndevs; i++) + if (bdev_read_only(FDEV(i).bdev)) + return true; + return false; +} + +static inline bool f2fs_dev_is_readonly(struct f2fs_sb_info *sbi) +{ + return f2fs_sb_has_readonly(sbi) || f2fs_hw_is_readonly(sbi); +} + +static inline bool f2fs_lfs_mode(struct f2fs_sb_info *sbi) +{ + return F2FS_OPTION(sbi).fs_mode == FS_MODE_LFS; +} + +static inline bool f2fs_is_sequential_zone_area(struct f2fs_sb_info *sbi, + block_t blkaddr) +{ + if (f2fs_sb_has_blkzoned(sbi)) { +#ifdef CONFIG_BLK_DEV_ZONED + int devi = f2fs_target_device_index(sbi, blkaddr); + + if (!bdev_is_zoned(FDEV(devi).bdev)) + return false; + + if (f2fs_is_multi_device(sbi)) { + if (blkaddr < FDEV(devi).start_blk || + blkaddr > FDEV(devi).end_blk) { + f2fs_err(sbi, "Invalid block %x", blkaddr); + return false; + } + blkaddr -= FDEV(devi).start_blk; + } + + return f2fs_blkz_is_seq(sbi, devi, blkaddr); +#else + return false; +#endif + } + return false; +} + +static inline bool f2fs_low_mem_mode(struct f2fs_sb_info *sbi) +{ + return F2FS_OPTION(sbi).memory_mode == 
MEMORY_MODE_LOW; +} + +static inline bool f2fs_may_compress(struct inode *inode) +{ + if (IS_SWAPFILE(inode) || f2fs_is_pinned_file(inode) || + f2fs_is_atomic_file(inode) || f2fs_has_inline_data(inode) || + f2fs_is_mmap_file(inode)) + return false; + return S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode); +} + +static inline void f2fs_i_compr_blocks_update(struct inode *inode, + u64 blocks, bool add) +{ + struct f2fs_inode_info *fi = F2FS_I(inode); + int diff = fi->i_cluster_size - blocks; + + /* don't update i_compr_blocks if saved blocks were released */ + if (!add && !atomic_read(&fi->i_compr_blocks)) + return; + + if (add) { + atomic_add(diff, &fi->i_compr_blocks); + stat_add_compr_blocks(inode, diff); + } else { + atomic_sub(diff, &fi->i_compr_blocks); + stat_sub_compr_blocks(inode, diff); + } + f2fs_mark_inode_dirty_sync(inode, true); +} + +static inline bool f2fs_allow_multi_device_dio(struct f2fs_sb_info *sbi, + int flag) +{ + if (!f2fs_is_multi_device(sbi)) + return false; + if (flag != F2FS_GET_BLOCK_DIO) + return false; + return sbi->aligned_blksize; +} + +static inline bool f2fs_need_verity(const struct inode *inode, pgoff_t idx) +{ + return fsverity_active(inode) && + idx < DIV_ROUND_UP(inode->i_size, PAGE_SIZE); +} + +#ifdef CONFIG_F2FS_FAULT_INJECTION +extern int f2fs_build_fault_attr(struct f2fs_sb_info *sbi, unsigned long rate, + unsigned long type, enum fault_option fo); +#else +static inline int f2fs_build_fault_attr(struct f2fs_sb_info *sbi, + unsigned long rate, unsigned long type, + enum fault_option fo) +{ + return 0; +} +#endif + +static inline bool is_journalled_quota(struct f2fs_sb_info *sbi) +{ +#ifdef CONFIG_QUOTA + if (f2fs_sb_has_quota_ino(sbi)) + return true; + if (F2FS_OPTION(sbi).s_qf_names[USRQUOTA] || + F2FS_OPTION(sbi).s_qf_names[GRPQUOTA] || + F2FS_OPTION(sbi).s_qf_names[PRJQUOTA]) + return true; +#endif + return false; +} + +static inline bool f2fs_block_unit_discard(struct f2fs_sb_info *sbi) +{ + return 
F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_BLOCK; +} + +static inline void __f2fs_schedule_timeout(long timeout, bool io) +{ + set_current_state(TASK_UNINTERRUPTIBLE); + if (io) + io_schedule_timeout(timeout); + else + schedule_timeout(timeout); +} + +#define f2fs_io_schedule_timeout(timeout) \ + __f2fs_schedule_timeout(timeout, true) +#define f2fs_schedule_timeout(timeout) \ + __f2fs_schedule_timeout(timeout, false) + +static inline void f2fs_io_schedule_timeout_killable(long timeout) +{ + while (timeout) { + if (fatal_signal_pending(current)) + return; + set_current_state(TASK_UNINTERRUPTIBLE); + io_schedule_timeout(DEFAULT_SCHEDULE_TIMEOUT); + if (timeout <= DEFAULT_SCHEDULE_TIMEOUT) + return; + timeout -= DEFAULT_SCHEDULE_TIMEOUT; + } +} + +static inline void f2fs_handle_page_eio(struct f2fs_sb_info *sbi, + struct folio *folio, enum page_type type) +{ + pgoff_t ofs = folio->index; + + if (unlikely(f2fs_cp_error(sbi))) + return; + + if (ofs == sbi->page_eio_ofs[type]) { + if (sbi->page_eio_cnt[type]++ == MAX_RETRY_PAGE_EIO) + set_ckpt_flags(sbi, CP_ERROR_FLAG); + } else { + sbi->page_eio_ofs[type] = ofs; + sbi->page_eio_cnt[type] = 0; + } +} + +static inline bool f2fs_is_readonly(struct f2fs_sb_info *sbi) +{ + return f2fs_sb_has_readonly(sbi) || f2fs_readonly(sbi->sb); +} + +static inline void f2fs_truncate_meta_inode_pages(struct f2fs_sb_info *sbi, + block_t blkaddr, unsigned int cnt) +{ + bool need_submit = false; + int i = 0; + + do { + struct folio *folio; + + folio = filemap_get_folio(META_MAPPING(sbi), blkaddr + i); + if (!IS_ERR(folio)) { + if (folio_test_writeback(folio)) + need_submit = true; + f2fs_folio_put(folio, false); + } + } while (++i < cnt && !need_submit); + + if (need_submit) + f2fs_submit_merged_write_cond(sbi, sbi->meta_inode, + NULL, 0, DATA); + + truncate_inode_pages_range(META_MAPPING(sbi), + F2FS_BLK_TO_BYTES((loff_t)blkaddr), + F2FS_BLK_END_BYTES((loff_t)(blkaddr + cnt - 1))); +} + +static inline void 
f2fs_invalidate_internal_cache(struct f2fs_sb_info *sbi, + block_t blkaddr, unsigned int len) +{ + f2fs_truncate_meta_inode_pages(sbi, blkaddr, len); + f2fs_invalidate_compress_pages_range(sbi, blkaddr, len); +} + +#define EFSBADCRC EBADMSG /* Bad CRC detected */ +#define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ + +#endif /* _LINUX_F2FS_H */ diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index d2d2b7dbdcc1..d7047ca6b98d 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1,17 +1,13 @@ +// SPDX-License-Identifier: GPL-2.0 /* * fs/f2fs/file.c * * Copyright (c) 2012 Samsung Electronics Co., Ltd. * http://www.samsung.com/ - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. */ #include <linux/fs.h> #include <linux/f2fs_fs.h> #include <linux/stat.h> -#include <linux/buffer_head.h> #include <linux/writeback.h> #include <linux/blkdev.h> #include <linux/falloc.h> @@ -19,492 +15,1659 @@ #include <linux/compat.h> #include <linux/uaccess.h> #include <linux/mount.h> +#include <linux/pagevec.h> +#include <linux/uio.h> +#include <linux/uuid.h> +#include <linux/file.h> +#include <linux/nls.h> +#include <linux/sched/signal.h> +#include <linux/fileattr.h> +#include <linux/fadvise.h> +#include <linux/iomap.h> #include "f2fs.h" #include "node.h" #include "segment.h" #include "xattr.h" #include "acl.h" +#include "gc.h" +#include "iostat.h" #include <trace/events/f2fs.h> +#include <uapi/linux/f2fs.h> + +static void f2fs_zero_post_eof_page(struct inode *inode, + loff_t new_size, bool lock) +{ + loff_t old_size = i_size_read(inode); + + if (old_size >= new_size) + return; + + if (mapping_empty(inode->i_mapping)) + return; + + if (lock) + filemap_invalidate_lock(inode->i_mapping); + /* zero or drop pages only in range of [old_size, new_size] */ + truncate_inode_pages_range(inode->i_mapping, old_size, new_size); + if (lock) + 
filemap_invalidate_unlock(inode->i_mapping); +} + +static vm_fault_t f2fs_filemap_fault(struct vm_fault *vmf) +{ + struct inode *inode = file_inode(vmf->vma->vm_file); + vm_flags_t flags = vmf->vma->vm_flags; + vm_fault_t ret; + + ret = filemap_fault(vmf); + if (ret & VM_FAULT_LOCKED) + f2fs_update_iostat(F2FS_I_SB(inode), inode, + APP_MAPPED_READ_IO, F2FS_BLKSIZE); + + trace_f2fs_filemap_fault(inode, vmf->pgoff, flags, ret); + + return ret; +} -static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma, - struct vm_fault *vmf) +static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf) { - struct page *page = vmf->page; - struct inode *inode = file_inode(vma->vm_file); - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); - block_t old_blk_addr; + struct folio *folio = page_folio(vmf->page); + struct inode *inode = file_inode(vmf->vma->vm_file); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct dnode_of_data dn; - int err, ilock; + bool need_alloc = !f2fs_is_pinned_file(inode); + int err = 0; + vm_fault_t ret; - f2fs_balance_fs(sbi); + if (unlikely(IS_IMMUTABLE(inode))) + return VM_FAULT_SIGBUS; - sb_start_pagefault(inode->i_sb); + if (is_inode_flag_set(inode, FI_COMPRESS_RELEASED)) { + err = -EIO; + goto out; + } - /* block allocation */ - ilock = mutex_lock_op(sbi); - set_new_dnode(&dn, inode, NULL, NULL, 0); - err = get_dnode_of_data(&dn, page->index, ALLOC_NODE); - if (err) { - mutex_unlock_op(sbi, ilock); + if (unlikely(f2fs_cp_error(sbi))) { + err = -EIO; goto out; } - old_blk_addr = dn.data_blkaddr; + if (!f2fs_is_checkpoint_ready(sbi)) { + err = -ENOSPC; + goto out; + } - if (old_blk_addr == NULL_ADDR) { - err = reserve_new_block(&dn); - if (err) { - f2fs_put_dnode(&dn); - mutex_unlock_op(sbi, ilock); + err = f2fs_convert_inline_inode(inode); + if (err) + goto out; + +#ifdef CONFIG_F2FS_FS_COMPRESSION + if (f2fs_compressed_file(inode)) { + int ret = f2fs_is_compressed_cluster(inode, folio->index); + + if (ret < 0) { + err = ret; goto out; + } else if 
(ret) { + need_alloc = false; } } - f2fs_put_dnode(&dn); - mutex_unlock_op(sbi, ilock); - - file_update_time(vma->vm_file); - lock_page(page); - if (page->mapping != inode->i_mapping || - page_offset(page) > i_size_read(inode) || - !PageUptodate(page)) { - unlock_page(page); +#endif + /* should do out of any locked page */ + if (need_alloc) + f2fs_balance_fs(sbi, true); + + sb_start_pagefault(inode->i_sb); + + f2fs_bug_on(sbi, f2fs_has_inline_data(inode)); + + f2fs_zero_post_eof_page(inode, (folio->index + 1) << PAGE_SHIFT, true); + + file_update_time(vmf->vma->vm_file); + filemap_invalidate_lock_shared(inode->i_mapping); + + folio_lock(folio); + if (unlikely(folio->mapping != inode->i_mapping || + folio_pos(folio) > i_size_read(inode) || + !folio_test_uptodate(folio))) { + folio_unlock(folio); err = -EFAULT; - goto out; + goto out_sem; + } + + set_new_dnode(&dn, inode, NULL, NULL, 0); + if (need_alloc) { + /* block allocation */ + err = f2fs_get_block_locked(&dn, folio->index); + } else { + err = f2fs_get_dnode_of_data(&dn, folio->index, LOOKUP_NODE); + f2fs_put_dnode(&dn); + if (f2fs_is_pinned_file(inode) && + !__is_valid_data_blkaddr(dn.data_blkaddr)) + err = -EIO; } + if (err) { + folio_unlock(folio); + goto out_sem; + } + + f2fs_folio_wait_writeback(folio, DATA, false, true); + + /* wait for GCed page writeback via META_MAPPING */ + f2fs_wait_on_block_writeback(inode, dn.data_blkaddr); + /* * check to see if the page is mapped already (no holes) */ - if (PageMappedToDisk(page)) - goto mapped; + if (folio_test_mappedtodisk(folio)) + goto out_sem; /* page is wholly or partially inside EOF */ - if (((page->index + 1) << PAGE_CACHE_SHIFT) > i_size_read(inode)) { - unsigned offset; - offset = i_size_read(inode) & ~PAGE_CACHE_MASK; - zero_user_segment(page, offset, PAGE_CACHE_SIZE); + if (((loff_t)(folio->index + 1) << PAGE_SHIFT) > + i_size_read(inode)) { + loff_t offset; + + offset = i_size_read(inode) & ~PAGE_MASK; + folio_zero_segment(folio, offset, 
folio_size(folio)); } - set_page_dirty(page); - SetPageUptodate(page); + folio_mark_dirty(folio); + + f2fs_update_iostat(sbi, inode, APP_MAPPED_IO, F2FS_BLKSIZE); + f2fs_update_time(sbi, REQ_TIME); + +out_sem: + filemap_invalidate_unlock_shared(inode->i_mapping); -mapped: - /* fill the page */ - wait_on_page_writeback(page); -out: sb_end_pagefault(inode->i_sb); - return block_page_mkwrite_return(err); +out: + ret = vmf_fs_error(err); + + trace_f2fs_vm_page_mkwrite(inode, folio->index, vmf->vma->vm_flags, ret); + return ret; } static const struct vm_operations_struct f2fs_file_vm_ops = { - .fault = filemap_fault, + .fault = f2fs_filemap_fault, + .map_pages = filemap_map_pages, .page_mkwrite = f2fs_vm_page_mkwrite, - .remap_pages = generic_file_remap_pages, }; static int get_parent_ino(struct inode *inode, nid_t *pino) { struct dentry *dentry; - inode = igrab(inode); - dentry = d_find_any_alias(inode); - iput(inode); + /* + * Make sure to get the non-deleted alias. The alias associated with + * the open file descriptor being fsync()'ed may be deleted already. 
+ */ + dentry = d_find_alias(inode); if (!dentry) return 0; - inode = igrab(dentry->d_parent->d_inode); + *pino = d_parent_ino(dentry); dput(dentry); - - *pino = inode->i_ino; - iput(inode); return 1; } -int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) +static inline enum cp_reason_type need_do_checkpoint(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + enum cp_reason_type cp_reason = CP_NO_NEEDED; + + if (!S_ISREG(inode->i_mode)) + cp_reason = CP_NON_REGULAR; + else if (f2fs_compressed_file(inode)) + cp_reason = CP_COMPRESSED; + else if (inode->i_nlink != 1) + cp_reason = CP_HARDLINK; + else if (is_sbi_flag_set(sbi, SBI_NEED_CP)) + cp_reason = CP_SB_NEED_CP; + else if (file_wrong_pino(inode)) + cp_reason = CP_WRONG_PINO; + else if (!f2fs_space_for_roll_forward(sbi)) + cp_reason = CP_NO_SPC_ROLL; + else if (!f2fs_is_checkpointed_node(sbi, F2FS_I(inode)->i_pino)) + cp_reason = CP_NODE_NEED_CP; + else if (test_opt(sbi, FASTBOOT)) + cp_reason = CP_FASTBOOT_MODE; + else if (F2FS_OPTION(sbi).active_logs == 2) + cp_reason = CP_SPEC_LOG_NUM; + else if (F2FS_OPTION(sbi).fsync_mode == FSYNC_MODE_STRICT && + f2fs_need_dentry_mark(sbi, inode->i_ino) && + f2fs_exist_written_data(sbi, F2FS_I(inode)->i_pino, + TRANS_DIR_INO)) + cp_reason = CP_RECOVER_DIR; + else if (f2fs_exist_written_data(sbi, F2FS_I(inode)->i_pino, + XATTR_DIR_INO)) + cp_reason = CP_XATTR_DIR; + + return cp_reason; +} + +static bool need_inode_page_update(struct f2fs_sb_info *sbi, nid_t ino) +{ + struct folio *i = filemap_get_folio(NODE_MAPPING(sbi), ino); + bool ret = false; + /* But we need to avoid that there are some inode updates */ + if ((!IS_ERR(i) && folio_test_dirty(i)) || + f2fs_need_inode_block_update(sbi, ino)) + ret = true; + f2fs_folio_put(i, false); + return ret; +} + +static void try_to_fix_pino(struct inode *inode) +{ + struct f2fs_inode_info *fi = F2FS_I(inode); + nid_t pino; + + f2fs_down_write(&fi->i_sem); + if (file_wrong_pino(inode) && 
inode->i_nlink == 1 && + get_parent_ino(inode, &pino)) { + f2fs_i_pino_write(inode, pino); + file_got_pino(inode); + } + f2fs_up_write(&fi->i_sem); +} + +static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end, + int datasync, bool atomic) { struct inode *inode = file->f_mapping->host; - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + nid_t ino = inode->i_ino; int ret = 0; - bool need_cp = false; + enum cp_reason_type cp_reason = 0; struct writeback_control wbc = { .sync_mode = WB_SYNC_ALL, .nr_to_write = LONG_MAX, - .for_reclaim = 0, }; + unsigned int seq_id = 0; - if (f2fs_readonly(inode->i_sb)) + if (unlikely(f2fs_readonly(inode->i_sb))) return 0; trace_f2fs_sync_file_enter(inode); - ret = filemap_write_and_wait_range(inode->i_mapping, start, end); - if (ret) { - trace_f2fs_sync_file_exit(inode, need_cp, datasync, ret); + + if (S_ISDIR(inode->i_mode)) + goto go_write; + + /* if fdatasync is triggered, let's do in-place-update */ + if (datasync || get_dirty_pages(inode) <= SM_I(sbi)->min_fsync_blocks) + set_inode_flag(inode, FI_NEED_IPU); + ret = file_write_and_wait_range(file, start, end); + clear_inode_flag(inode, FI_NEED_IPU); + + if (ret || is_sbi_flag_set(sbi, SBI_CP_DISABLED)) { + trace_f2fs_sync_file_exit(inode, cp_reason, datasync, ret); return ret; } - /* guarantee free sections for fsync */ - f2fs_balance_fs(sbi); - - mutex_lock(&inode->i_mutex); + /* if the inode is dirty, let's recover all the time */ + if (!f2fs_skip_inode_update(inode, datasync)) { + f2fs_write_inode(inode, NULL); + goto go_write; + } - if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) - goto out; + /* + * if there is no written data, don't waste time to write recovery info. 
+ */ + if (!is_inode_flag_set(inode, FI_APPEND_WRITE) && + !f2fs_exist_written_data(sbi, ino, APPEND_INO)) { - if (!S_ISREG(inode->i_mode) || inode->i_nlink != 1) - need_cp = true; - else if (file_wrong_pino(inode)) - need_cp = true; - else if (!space_for_roll_forward(sbi)) - need_cp = true; - else if (!is_checkpointed_node(sbi, F2FS_I(inode)->i_pino)) - need_cp = true; + /* it may call write_inode just prior to fsync */ + if (need_inode_page_update(sbi, ino)) + goto go_write; - if (need_cp) { - nid_t pino; + if (is_inode_flag_set(inode, FI_UPDATE_WRITE) || + f2fs_exist_written_data(sbi, ino, UPDATE_INO)) + goto flush_out; + goto out; + } else { + /* + * for OPU case, during fsync(), node can be persisted before + * data when lower device doesn't support write barrier, result + * in data corruption after SPO. + * So for strict fsync mode, force to use atomic write semantics + * to keep write order in between data/node and last node to + * avoid potential data corruption. + */ + if (F2FS_OPTION(sbi).fsync_mode == + FSYNC_MODE_STRICT && !atomic) + atomic = true; + } +go_write: + /* + * Both of fdatasync() and fsync() are able to be recovered from + * sudden-power-off. 
+ */ + f2fs_down_read(&F2FS_I(inode)->i_sem); + cp_reason = need_do_checkpoint(inode); + f2fs_up_read(&F2FS_I(inode)->i_sem); + if (cp_reason) { /* all the dirty node pages should be flushed for POR */ ret = f2fs_sync_fs(inode->i_sb, 1); - if (file_wrong_pino(inode) && inode->i_nlink == 1 && - get_parent_ino(inode, &pino)) { - F2FS_I(inode)->i_pino = pino; - file_got_pino(inode); - mark_inode_dirty_sync(inode); - ret = f2fs_write_inode(inode, NULL); - if (ret) - goto out; - } - } else { - /* if there is no written node page, write its inode page */ - while (!sync_node_pages(sbi, inode->i_ino, &wbc)) { - mark_inode_dirty_sync(inode); - ret = f2fs_write_inode(inode, NULL); - if (ret) - goto out; - } - filemap_fdatawait_range(sbi->node_inode->i_mapping, - 0, LONG_MAX); - ret = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); + + /* + * We've secured consistency through sync_fs. Following pino + * will be used only for fsynced inodes after checkpoint. + */ + try_to_fix_pino(inode); + clear_inode_flag(inode, FI_APPEND_WRITE); + clear_inode_flag(inode, FI_UPDATE_WRITE); + goto out; + } +sync_nodes: + atomic_inc(&sbi->wb_sync_req[NODE]); + ret = f2fs_fsync_node_pages(sbi, inode, &wbc, atomic, &seq_id); + atomic_dec(&sbi->wb_sync_req[NODE]); + if (ret) + goto out; + + /* if cp_error was enabled, we should avoid infinite loop */ + if (unlikely(f2fs_cp_error(sbi))) { + ret = -EIO; + goto out; + } + + if (f2fs_need_inode_block_update(sbi, ino)) { + f2fs_mark_inode_dirty_sync(inode, true); + f2fs_write_inode(inode, NULL); + goto sync_nodes; } + + /* + * If it's atomic_write, it's just fine to keep write ordering. So + * here we don't need to wait for node write completion, since we use + * node chain which serializes node blocks. If one of node writes are + * reordered, we can see simply broken chain, resulting in stopping + * roll-forward recovery. It means we'll recover all or none node blocks + * given fsync mark. 
+ */ + if (!atomic) { + ret = f2fs_wait_on_node_pages_writeback(sbi, seq_id); + if (ret) + goto out; + } + + /* once recovery info is written, don't need to tack this */ + f2fs_remove_ino_entry(sbi, ino, APPEND_INO); + clear_inode_flag(inode, FI_APPEND_WRITE); +flush_out: + if (!atomic && F2FS_OPTION(sbi).fsync_mode != FSYNC_MODE_NOBARRIER) + ret = f2fs_issue_flush(sbi, inode->i_ino); + if (!ret) { + f2fs_remove_ino_entry(sbi, ino, UPDATE_INO); + clear_inode_flag(inode, FI_UPDATE_WRITE); + f2fs_remove_ino_entry(sbi, ino, FLUSH_INO); + } + f2fs_update_time(sbi, REQ_TIME); out: - mutex_unlock(&inode->i_mutex); - trace_f2fs_sync_file_exit(inode, need_cp, datasync, ret); + trace_f2fs_sync_file_exit(inode, cp_reason, datasync, ret); return ret; } -static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma) +int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) +{ + if (unlikely(f2fs_cp_error(F2FS_I_SB(file_inode(file))))) + return -EIO; + return f2fs_do_sync_file(file, start, end, datasync, false); +} + +static bool __found_offset(struct address_space *mapping, + struct dnode_of_data *dn, pgoff_t index, int whence) +{ + block_t blkaddr = f2fs_data_blkaddr(dn); + struct inode *inode = mapping->host; + bool compressed_cluster = false; + + if (f2fs_compressed_file(inode)) { + block_t first_blkaddr = data_blkaddr(dn->inode, dn->node_folio, + ALIGN_DOWN(dn->ofs_in_node, F2FS_I(inode)->i_cluster_size)); + + compressed_cluster = first_blkaddr == COMPRESS_ADDR; + } + + switch (whence) { + case SEEK_DATA: + if (__is_valid_data_blkaddr(blkaddr)) + return true; + if (blkaddr == NEW_ADDR && + xa_get_mark(&mapping->i_pages, index, PAGECACHE_TAG_DIRTY)) + return true; + if (compressed_cluster) + return true; + break; + case SEEK_HOLE: + if (compressed_cluster) + return false; + if (blkaddr == NULL_ADDR) + return true; + break; + } + return false; +} + +static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence) +{ + struct inode 
*inode = file->f_mapping->host; + loff_t maxbytes = F2FS_BLK_TO_BYTES(max_file_blocks(inode)); + struct dnode_of_data dn; + pgoff_t pgofs, end_offset; + loff_t data_ofs = offset; + loff_t isize; + int err = 0; + + inode_lock_shared(inode); + + isize = i_size_read(inode); + if (offset >= isize) + goto fail; + + /* handle inline data case */ + if (f2fs_has_inline_data(inode)) { + if (whence == SEEK_HOLE) { + data_ofs = isize; + goto found; + } else if (whence == SEEK_DATA) { + data_ofs = offset; + goto found; + } + } + + pgofs = (pgoff_t)(offset >> PAGE_SHIFT); + + for (; data_ofs < isize; data_ofs = (loff_t)pgofs << PAGE_SHIFT) { + set_new_dnode(&dn, inode, NULL, NULL, 0); + err = f2fs_get_dnode_of_data(&dn, pgofs, LOOKUP_NODE); + if (err && err != -ENOENT) { + goto fail; + } else if (err == -ENOENT) { + /* direct node does not exists */ + if (whence == SEEK_DATA) { + pgofs = f2fs_get_next_page_offset(&dn, pgofs); + continue; + } else { + goto found; + } + } + + end_offset = ADDRS_PER_PAGE(dn.node_folio, inode); + + /* find data/hole in dnode block */ + for (; dn.ofs_in_node < end_offset; + dn.ofs_in_node++, pgofs++, + data_ofs = (loff_t)pgofs << PAGE_SHIFT) { + block_t blkaddr; + + blkaddr = f2fs_data_blkaddr(&dn); + + if (__is_valid_data_blkaddr(blkaddr) && + !f2fs_is_valid_blkaddr(F2FS_I_SB(inode), + blkaddr, DATA_GENERIC_ENHANCE)) { + f2fs_put_dnode(&dn); + goto fail; + } + + if (__found_offset(file->f_mapping, &dn, + pgofs, whence)) { + f2fs_put_dnode(&dn); + goto found; + } + } + f2fs_put_dnode(&dn); + } + + if (whence == SEEK_DATA) + goto fail; +found: + if (whence == SEEK_HOLE && data_ofs > isize) + data_ofs = isize; + inode_unlock_shared(inode); + return vfs_setpos(file, data_ofs, maxbytes); +fail: + inode_unlock_shared(inode); + return -ENXIO; +} + +static loff_t f2fs_llseek(struct file *file, loff_t offset, int whence) +{ + struct inode *inode = file->f_mapping->host; + loff_t maxbytes = F2FS_BLK_TO_BYTES(max_file_blocks(inode)); + + switch (whence) { + 
case SEEK_SET: + case SEEK_CUR: + case SEEK_END: + return generic_file_llseek_size(file, offset, whence, + maxbytes, i_size_read(inode)); + case SEEK_DATA: + case SEEK_HOLE: + if (offset < 0) + return -ENXIO; + return f2fs_seek_block(file, offset, whence); + } + + return -EINVAL; +} + +static int f2fs_file_mmap_prepare(struct vm_area_desc *desc) { + struct file *file = desc->file; + struct inode *inode = file_inode(file); + + if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) + return -EIO; + + if (!f2fs_is_compress_backend_ready(inode)) + return -EOPNOTSUPP; + file_accessed(file); - vma->vm_ops = &f2fs_file_vm_ops; + desc->vm_ops = &f2fs_file_vm_ops; + + f2fs_down_read(&F2FS_I(inode)->i_sem); + set_inode_flag(inode, FI_MMAP_FILE); + f2fs_up_read(&F2FS_I(inode)->i_sem); + return 0; } -int truncate_data_blocks_range(struct dnode_of_data *dn, int count) +static int finish_preallocate_blocks(struct inode *inode) +{ + int ret = 0; + bool opened; + + f2fs_down_read(&F2FS_I(inode)->i_sem); + opened = is_inode_flag_set(inode, FI_OPENED_FILE); + f2fs_up_read(&F2FS_I(inode)->i_sem); + if (opened) + return 0; + + inode_lock(inode); + if (is_inode_flag_set(inode, FI_OPENED_FILE)) + goto out_unlock; + + if (!file_should_truncate(inode)) + goto out_update; + + f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + filemap_invalidate_lock(inode->i_mapping); + + truncate_setsize(inode, i_size_read(inode)); + ret = f2fs_truncate(inode); + + filemap_invalidate_unlock(inode->i_mapping); + f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + if (ret) + goto out_unlock; + + file_dont_truncate(inode); +out_update: + f2fs_down_write(&F2FS_I(inode)->i_sem); + set_inode_flag(inode, FI_OPENED_FILE); + f2fs_up_write(&F2FS_I(inode)->i_sem); +out_unlock: + inode_unlock(inode); + return ret; +} + +static int f2fs_file_open(struct inode *inode, struct file *filp) { - int nr_free = 0, ofs = dn->ofs_in_node; - struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb); - struct f2fs_node *raw_node; + int err 
= fscrypt_file_open(inode, filp); + + if (err) + return err; + + if (!f2fs_is_compress_backend_ready(inode)) + return -EOPNOTSUPP; + + err = fsverity_file_open(inode, filp); + if (err) + return err; + + filp->f_mode |= FMODE_NOWAIT; + filp->f_mode |= FMODE_CAN_ODIRECT; + + err = dquot_file_open(inode, filp); + if (err) + return err; + + err = finish_preallocate_blocks(inode); + if (!err) + atomic_inc(&F2FS_I(inode)->open_count); + return err; +} + +void f2fs_truncate_data_blocks_range(struct dnode_of_data *dn, int count) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); + int nr_free = 0, ofs = dn->ofs_in_node, len = count; __le32 *addr; + bool compressed_cluster = false; + int cluster_index = 0, valid_blocks = 0; + int cluster_size = F2FS_I(dn->inode)->i_cluster_size; + bool released = !atomic_read(&F2FS_I(dn->inode)->i_compr_blocks); + block_t blkstart; + int blklen = 0; - raw_node = page_address(dn->node_page); - addr = blkaddr_in_node(raw_node) + ofs; + addr = get_dnode_addr(dn->inode, dn->node_folio) + ofs; + blkstart = le32_to_cpu(*addr); - for ( ; count > 0; count--, addr++, dn->ofs_in_node++) { + /* Assumption: truncation starts with cluster */ + for (; count > 0; count--, addr++, dn->ofs_in_node++, cluster_index++) { block_t blkaddr = le32_to_cpu(*addr); + + if (f2fs_compressed_file(dn->inode) && + !(cluster_index & (cluster_size - 1))) { + if (compressed_cluster) + f2fs_i_compr_blocks_update(dn->inode, + valid_blocks, false); + compressed_cluster = (blkaddr == COMPRESS_ADDR); + valid_blocks = 0; + } + if (blkaddr == NULL_ADDR) - continue; + goto next; - update_extent_cache(NULL_ADDR, dn); - invalidate_blocks(sbi, blkaddr); - nr_free++; + f2fs_set_data_blkaddr(dn, NULL_ADDR); + + if (__is_valid_data_blkaddr(blkaddr)) { + if (time_to_inject(sbi, FAULT_BLKADDR_CONSISTENCE)) + goto next; + if (!f2fs_is_valid_blkaddr_raw(sbi, blkaddr, + DATA_GENERIC_ENHANCE)) + goto next; + if (compressed_cluster) + valid_blocks++; + } + + if (blkstart + blklen == blkaddr) 
{ + blklen++; + } else { + f2fs_invalidate_blocks(sbi, blkstart, blklen); + blkstart = blkaddr; + blklen = 1; + } + + if (!released || blkaddr != COMPRESS_ADDR) + nr_free++; + + continue; + +next: + if (blklen) + f2fs_invalidate_blocks(sbi, blkstart, blklen); + + blkstart = le32_to_cpu(*(addr + 1)); + blklen = 0; } + + if (blklen) + f2fs_invalidate_blocks(sbi, blkstart, blklen); + + if (compressed_cluster) + f2fs_i_compr_blocks_update(dn->inode, valid_blocks, false); + if (nr_free) { + pgoff_t fofs; + /* + * once we invalidate valid blkaddr in range [ofs, ofs + count], + * we will invalidate all blkaddr in the whole range. + */ + fofs = f2fs_start_bidx_of_node(ofs_of_node(dn->node_folio), + dn->inode) + ofs; + f2fs_update_read_extent_cache_range(dn, fofs, 0, len); + f2fs_update_age_extent_cache_range(dn, fofs, len); dec_valid_block_count(sbi, dn->inode, nr_free); - set_page_dirty(dn->node_page); - sync_inode_page(dn); } dn->ofs_in_node = ofs; + f2fs_update_time(sbi, REQ_TIME); trace_f2fs_truncate_data_blocks_range(dn->inode, dn->nid, dn->ofs_in_node, nr_free); - return nr_free; } -void truncate_data_blocks(struct dnode_of_data *dn) +static int truncate_partial_data_page(struct inode *inode, u64 from, + bool cache_only) { - truncate_data_blocks_range(dn, ADDRS_PER_BLOCK); -} + loff_t offset = from & (PAGE_SIZE - 1); + pgoff_t index = from >> PAGE_SHIFT; + struct address_space *mapping = inode->i_mapping; + struct folio *folio; -static void truncate_partial_data_page(struct inode *inode, u64 from) -{ - unsigned offset = from & (PAGE_CACHE_SIZE - 1); - struct page *page; + if (!offset && !cache_only) + return 0; - if (!offset) - return; + if (cache_only) { + folio = filemap_lock_folio(mapping, index); + if (IS_ERR(folio)) + return 0; + if (folio_test_uptodate(folio)) + goto truncate_out; + f2fs_folio_put(folio, true); + return 0; + } - page = find_data_page(inode, from >> PAGE_CACHE_SHIFT, false); - if (IS_ERR(page)) - return; + folio = f2fs_get_lock_data_folio(inode, 
index, true); + if (IS_ERR(folio)) + return PTR_ERR(folio) == -ENOENT ? 0 : PTR_ERR(folio); +truncate_out: + f2fs_folio_wait_writeback(folio, DATA, true, true); + folio_zero_segment(folio, offset, folio_size(folio)); - lock_page(page); - if (page->mapping != inode->i_mapping) { - f2fs_put_page(page, 1); - return; - } - wait_on_page_writeback(page); - zero_user(page, offset, PAGE_CACHE_SIZE - offset); - set_page_dirty(page); - f2fs_put_page(page, 1); + /* An encrypted inode should have a key and truncate the last page. */ + f2fs_bug_on(F2FS_I_SB(inode), cache_only && IS_ENCRYPTED(inode)); + if (!cache_only) + folio_mark_dirty(folio); + f2fs_folio_put(folio, true); + return 0; } -static int truncate_blocks(struct inode *inode, u64 from) +int f2fs_do_truncate_blocks(struct inode *inode, u64 from, bool lock) { - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); - unsigned int blocksize = inode->i_sb->s_blocksize; + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct dnode_of_data dn; pgoff_t free_from; - int count = 0, ilock = -1; - int err; + int count = 0, err = 0; + struct folio *ifolio; + bool truncate_page = false; trace_f2fs_truncate_blocks_enter(inode, from); - free_from = (pgoff_t) - ((from + blocksize - 1) >> (sbi->log_blocksize)); + if (IS_DEVICE_ALIASING(inode) && from) { + err = -EINVAL; + goto out_err; + } + + free_from = (pgoff_t)F2FS_BLK_ALIGN(from); - ilock = mutex_lock_op(sbi); - set_new_dnode(&dn, inode, NULL, NULL, 0); - err = get_dnode_of_data(&dn, free_from, LOOKUP_NODE); + if (free_from >= max_file_blocks(inode)) + goto free_partial; + + if (lock) + f2fs_lock_op(sbi); + + ifolio = f2fs_get_inode_folio(sbi, inode->i_ino); + if (IS_ERR(ifolio)) { + err = PTR_ERR(ifolio); + goto out; + } + + if (IS_DEVICE_ALIASING(inode)) { + struct extent_tree *et = F2FS_I(inode)->extent_tree[EX_READ]; + struct extent_info ei = et->largest; + + f2fs_invalidate_blocks(sbi, ei.blk, ei.len); + + dec_valid_block_count(sbi, inode, ei.len); + f2fs_update_time(sbi, 
REQ_TIME); + + f2fs_folio_put(ifolio, true); + goto out; + } + + if (f2fs_has_inline_data(inode)) { + f2fs_truncate_inline_inode(inode, ifolio, from); + f2fs_folio_put(ifolio, true); + truncate_page = true; + goto out; + } + + set_new_dnode(&dn, inode, ifolio, NULL, 0); + err = f2fs_get_dnode_of_data(&dn, free_from, LOOKUP_NODE_RA); if (err) { if (err == -ENOENT) goto free_next; - mutex_unlock_op(sbi, ilock); - trace_f2fs_truncate_blocks_exit(inode, err); - return err; + goto out; } - if (IS_INODE(dn.node_page)) - count = ADDRS_PER_INODE; - else - count = ADDRS_PER_BLOCK; + count = ADDRS_PER_PAGE(dn.node_folio, inode); count -= dn.ofs_in_node; - BUG_ON(count < 0); + f2fs_bug_on(sbi, count < 0); - if (dn.ofs_in_node || IS_INODE(dn.node_page)) { - truncate_data_blocks_range(&dn, count); + if (dn.ofs_in_node || IS_INODE(dn.node_folio)) { + f2fs_truncate_data_blocks_range(&dn, count); free_from += count; } f2fs_put_dnode(&dn); free_next: - err = truncate_inode_blocks(inode, free_from); - mutex_unlock_op(sbi, ilock); - + err = f2fs_truncate_inode_blocks(inode, free_from); +out: + if (lock) + f2fs_unlock_op(sbi); +free_partial: /* lastly zero out the first data page */ - truncate_partial_data_page(inode, from); - + if (!err) + err = truncate_partial_data_page(inode, from, truncate_page); +out_err: trace_f2fs_truncate_blocks_exit(inode, err); return err; } -void f2fs_truncate(struct inode *inode) +int f2fs_truncate_blocks(struct inode *inode, u64 from, bool lock) { + u64 free_from = from; + int err; + +#ifdef CONFIG_F2FS_FS_COMPRESSION + /* + * for compressed file, only support cluster size + * aligned truncation. 
+ */ + if (f2fs_compressed_file(inode)) + free_from = round_up(from, + F2FS_I(inode)->i_cluster_size << PAGE_SHIFT); +#endif + + err = f2fs_do_truncate_blocks(inode, free_from, lock); + if (err) + return err; + +#ifdef CONFIG_F2FS_FS_COMPRESSION + /* + * For compressed file, after release compress blocks, don't allow write + * direct, but we should allow write direct after truncate to zero. + */ + if (f2fs_compressed_file(inode) && !free_from + && is_inode_flag_set(inode, FI_COMPRESS_RELEASED)) + clear_inode_flag(inode, FI_COMPRESS_RELEASED); + + if (from != free_from) { + err = f2fs_truncate_partial_cluster(inode, from, lock); + if (err) + return err; + } +#endif + + return 0; +} + +int f2fs_truncate(struct inode *inode) +{ + int err; + + if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) + return -EIO; + if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))) - return; + return 0; trace_f2fs_truncate(inode); - if (!truncate_blocks(inode, i_size_read(inode))) { - inode->i_mtime = inode->i_ctime = CURRENT_TIME; - mark_inode_dirty(inode); + if (time_to_inject(F2FS_I_SB(inode), FAULT_TRUNCATE)) + return -EIO; + + err = f2fs_dquot_initialize(inode); + if (err) + return err; + + /* we should check inline_data size */ + if (!f2fs_may_inline_data(inode)) { + err = f2fs_convert_inline_inode(inode); + if (err) { + /* + * Always truncate page #0 to avoid page cache + * leak in evict() path. 
+ */ + truncate_inode_pages_range(inode->i_mapping, + F2FS_BLK_TO_BYTES(0), + F2FS_BLK_END_BYTES(0)); + return err; + } } + + err = f2fs_truncate_blocks(inode, i_size_read(inode), true); + if (err) + return err; + + inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); + f2fs_mark_inode_dirty_sync(inode, false); + return 0; } -int f2fs_getattr(struct vfsmount *mnt, - struct dentry *dentry, struct kstat *stat) +static bool f2fs_force_buffered_io(struct inode *inode, int rw) { - struct inode *inode = dentry->d_inode; - generic_fillattr(inode, stat); - stat->blocks <<= 3; + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + + if (!fscrypt_dio_supported(inode)) + return true; + if (fsverity_active(inode)) + return true; + if (f2fs_compressed_file(inode)) + return true; + /* + * only force direct read to use buffered IO, for direct write, + * it expects inline data conversion before committing IO. + */ + if (f2fs_has_inline_data(inode) && rw == READ) + return true; + + /* disallow direct IO if any of devices has unaligned blksize */ + if (f2fs_is_multi_device(sbi) && !sbi->aligned_blksize) + return true; + /* + * for blkzoned device, fallback direct IO to buffered IO, so + * all IOs can be serialized by log-structured write. 
+ */ + if (f2fs_sb_has_blkzoned(sbi) && (rw == WRITE) && + !f2fs_is_pinned_file(inode)) + return true; + if (is_sbi_flag_set(sbi, SBI_CP_DISABLED)) + return true; + + return false; +} + +int f2fs_getattr(struct mnt_idmap *idmap, const struct path *path, + struct kstat *stat, u32 request_mask, unsigned int query_flags) +{ + struct inode *inode = d_inode(path->dentry); + struct f2fs_inode_info *fi = F2FS_I(inode); + struct f2fs_inode *ri = NULL; + unsigned int flags; + + if (f2fs_has_extra_attr(inode) && + f2fs_sb_has_inode_crtime(F2FS_I_SB(inode)) && + F2FS_FITS_IN_INODE(ri, fi->i_extra_isize, i_crtime)) { + stat->result_mask |= STATX_BTIME; + stat->btime.tv_sec = fi->i_crtime.tv_sec; + stat->btime.tv_nsec = fi->i_crtime.tv_nsec; + } + + /* + * Return the DIO alignment restrictions if requested. We only return + * this information when requested, since on encrypted files it might + * take a fair bit of work to get if the file wasn't opened recently. + * + * f2fs sometimes supports DIO reads but not DIO writes. STATX_DIOALIGN + * cannot represent that, so in that case we report no DIO support. 
+ */ + if ((request_mask & STATX_DIOALIGN) && S_ISREG(inode->i_mode)) { + unsigned int bsize = i_blocksize(inode); + + stat->result_mask |= STATX_DIOALIGN; + if (!f2fs_force_buffered_io(inode, WRITE)) { + stat->dio_mem_align = bsize; + stat->dio_offset_align = bsize; + } + } + + flags = fi->i_flags; + if (flags & F2FS_COMPR_FL) + stat->attributes |= STATX_ATTR_COMPRESSED; + if (flags & F2FS_APPEND_FL) + stat->attributes |= STATX_ATTR_APPEND; + if (IS_ENCRYPTED(inode)) + stat->attributes |= STATX_ATTR_ENCRYPTED; + if (flags & F2FS_IMMUTABLE_FL) + stat->attributes |= STATX_ATTR_IMMUTABLE; + if (flags & F2FS_NODUMP_FL) + stat->attributes |= STATX_ATTR_NODUMP; + if (IS_VERITY(inode)) + stat->attributes |= STATX_ATTR_VERITY; + + stat->attributes_mask |= (STATX_ATTR_COMPRESSED | + STATX_ATTR_APPEND | + STATX_ATTR_ENCRYPTED | + STATX_ATTR_IMMUTABLE | + STATX_ATTR_NODUMP | + STATX_ATTR_VERITY); + + generic_fillattr(idmap, request_mask, inode, stat); + + /* we need to show initial sectors used for inline_data/dentries */ + if ((S_ISREG(inode->i_mode) && f2fs_has_inline_data(inode)) || + f2fs_has_inline_dentry(inode)) + stat->blocks += (stat->size + 511) >> 9; + return 0; } #ifdef CONFIG_F2FS_FS_POSIX_ACL -static void __setattr_copy(struct inode *inode, const struct iattr *attr) +static void __setattr_copy(struct mnt_idmap *idmap, + struct inode *inode, const struct iattr *attr) { - struct f2fs_inode_info *fi = F2FS_I(inode); unsigned int ia_valid = attr->ia_valid; - if (ia_valid & ATTR_UID) - inode->i_uid = attr->ia_uid; - if (ia_valid & ATTR_GID) - inode->i_gid = attr->ia_gid; + i_uid_update(idmap, attr, inode); + i_gid_update(idmap, attr, inode); if (ia_valid & ATTR_ATIME) - inode->i_atime = timespec_trunc(attr->ia_atime, - inode->i_sb->s_time_gran); + inode_set_atime_to_ts(inode, attr->ia_atime); if (ia_valid & ATTR_MTIME) - inode->i_mtime = timespec_trunc(attr->ia_mtime, - inode->i_sb->s_time_gran); + inode_set_mtime_to_ts(inode, attr->ia_mtime); if (ia_valid & 
ATTR_CTIME) - inode->i_ctime = timespec_trunc(attr->ia_ctime, - inode->i_sb->s_time_gran); + inode_set_ctime_to_ts(inode, attr->ia_ctime); if (ia_valid & ATTR_MODE) { umode_t mode = attr->ia_mode; - if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID)) + if (!in_group_or_capable(idmap, inode, i_gid_into_vfsgid(idmap, inode))) mode &= ~S_ISGID; - set_acl_inode(fi, mode); + set_acl_inode(inode, mode); } } #else #define __setattr_copy setattr_copy #endif -int f2fs_setattr(struct dentry *dentry, struct iattr *attr) +int f2fs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, + struct iattr *attr) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct f2fs_inode_info *fi = F2FS_I(inode); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); int err; - err = inode_change_ok(inode, attr); + if (unlikely(f2fs_cp_error(sbi))) + return -EIO; + + err = setattr_prepare(idmap, dentry, attr); + if (err) + return err; + + err = fscrypt_prepare_setattr(dentry, attr); + if (err) + return err; + + err = fsverity_prepare_setattr(dentry, attr); if (err) return err; - if ((attr->ia_valid & ATTR_SIZE) && - attr->ia_size != i_size_read(inode)) { + if (unlikely(IS_IMMUTABLE(inode))) + return -EPERM; + + if (unlikely(IS_APPEND(inode) && + (attr->ia_valid & (ATTR_MODE | ATTR_UID | + ATTR_GID | ATTR_TIMES_SET)))) + return -EPERM; + + if ((attr->ia_valid & ATTR_SIZE)) { + if (!f2fs_is_compress_backend_ready(inode) || + IS_DEVICE_ALIASING(inode)) + return -EOPNOTSUPP; + if (is_inode_flag_set(inode, FI_COMPRESS_RELEASED) && + !IS_ALIGNED(attr->ia_size, + F2FS_BLK_TO_BYTES(fi->i_cluster_size))) + return -EINVAL; + /* + * To prevent scattered pin block generation, we don't allow + * smaller/equal size unaligned truncation for pinned file. + * We only support overwrite IO to pinned file, so don't + * care about larger size truncation. 
+ */ + if (f2fs_is_pinned_file(inode) && + attr->ia_size <= i_size_read(inode) && + !IS_ALIGNED(attr->ia_size, + F2FS_BLK_TO_BYTES(CAP_BLKS_PER_SEC(sbi)))) + return -EINVAL; + } + + if (is_quota_modification(idmap, inode, attr)) { + err = f2fs_dquot_initialize(inode); + if (err) + return err; + } + if (i_uid_needs_update(idmap, attr, inode) || + i_gid_needs_update(idmap, attr, inode)) { + f2fs_lock_op(sbi); + err = dquot_transfer(idmap, inode, attr); + if (err) { + set_sbi_flag(sbi, SBI_QUOTA_NEED_REPAIR); + f2fs_unlock_op(sbi); + return err; + } + /* + * update uid/gid under lock_op(), so that dquot and inode can + * be updated atomically. + */ + i_uid_update(idmap, attr, inode); + i_gid_update(idmap, attr, inode); + f2fs_mark_inode_dirty_sync(inode, true); + f2fs_unlock_op(sbi); + } + + if (attr->ia_valid & ATTR_SIZE) { + loff_t old_size = i_size_read(inode); + + if (attr->ia_size > MAX_INLINE_DATA(inode)) { + /* + * should convert inline inode before i_size_write to + * keep smaller than inline_data size with inline flag. + */ + err = f2fs_convert_inline_inode(inode); + if (err) + return err; + } + + /* + * wait for inflight dio, blocks should be removed after + * IO completion. + */ + if (attr->ia_size < old_size) + inode_dio_wait(inode); + + f2fs_down_write(&fi->i_gc_rwsem[WRITE]); + filemap_invalidate_lock(inode->i_mapping); + + if (attr->ia_size > old_size) + f2fs_zero_post_eof_page(inode, attr->ia_size, false); truncate_setsize(inode, attr->ia_size); - f2fs_truncate(inode); - f2fs_balance_fs(F2FS_SB(inode->i_sb)); + + if (attr->ia_size <= old_size) + err = f2fs_truncate(inode); + /* + * do not trim all blocks after i_size if target size is + * larger than i_size. 
+ */ + filemap_invalidate_unlock(inode->i_mapping); + f2fs_up_write(&fi->i_gc_rwsem[WRITE]); + if (err) + return err; + + spin_lock(&fi->i_size_lock); + inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); + fi->last_disk_size = i_size_read(inode); + spin_unlock(&fi->i_size_lock); } - __setattr_copy(inode, attr); + __setattr_copy(idmap, inode, attr); if (attr->ia_valid & ATTR_MODE) { - err = f2fs_acl_chmod(inode); - if (err || is_inode_flag_set(fi, FI_ACL_MODE)) { - inode->i_mode = fi->i_acl_mode; - clear_inode_flag(fi, FI_ACL_MODE); + err = posix_acl_chmod(idmap, dentry, f2fs_get_inode_mode(inode)); + + if (is_inode_flag_set(inode, FI_ACL_MODE)) { + if (!err) + inode->i_mode = fi->i_acl_mode; + clear_inode_flag(inode, FI_ACL_MODE); } } - mark_inode_dirty(inode); + /* file size may changed here */ + f2fs_mark_inode_dirty_sync(inode, true); + + /* inode change will produce dirty node pages flushed by checkpoint */ + f2fs_balance_fs(sbi, true); + return err; } const struct inode_operations f2fs_file_inode_operations = { .getattr = f2fs_getattr, .setattr = f2fs_setattr, - .get_acl = f2fs_get_acl, -#ifdef CONFIG_F2FS_FS_XATTR - .setxattr = generic_setxattr, - .getxattr = generic_getxattr, + .get_inode_acl = f2fs_get_acl, + .set_acl = f2fs_set_acl, .listxattr = f2fs_listxattr, - .removexattr = generic_removexattr, -#endif + .fiemap = f2fs_fiemap, + .fileattr_get = f2fs_fileattr_get, + .fileattr_set = f2fs_fileattr_set, }; -static void fill_zero(struct inode *inode, pgoff_t index, +static int fill_zero(struct inode *inode, pgoff_t index, loff_t start, loff_t len) { - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); - struct page *page; - int ilock; + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct folio *folio; if (!len) - return; + return 0; - f2fs_balance_fs(sbi); + f2fs_balance_fs(sbi, true); - ilock = mutex_lock_op(sbi); - page = get_new_data_page(inode, NULL, index, false); - mutex_unlock_op(sbi, ilock); + f2fs_lock_op(sbi); + folio = 
f2fs_get_new_data_folio(inode, NULL, index, false); + f2fs_unlock_op(sbi); - if (!IS_ERR(page)) { - wait_on_page_writeback(page); - zero_user(page, start, len); - set_page_dirty(page); - f2fs_put_page(page, 1); - } + if (IS_ERR(folio)) + return PTR_ERR(folio); + + f2fs_folio_wait_writeback(folio, DATA, true, true); + folio_zero_range(folio, start, len); + folio_mark_dirty(folio); + f2fs_folio_put(folio, true); + return 0; } -int truncate_hole(struct inode *inode, pgoff_t pg_start, pgoff_t pg_end) +int f2fs_truncate_hole(struct inode *inode, pgoff_t pg_start, pgoff_t pg_end) { - pgoff_t index; int err; - for (index = pg_start; index < pg_end; index++) { + while (pg_start < pg_end) { struct dnode_of_data dn; + pgoff_t end_offset, count; set_new_dnode(&dn, inode, NULL, NULL, 0); - err = get_dnode_of_data(&dn, index, LOOKUP_NODE); + err = f2fs_get_dnode_of_data(&dn, pg_start, LOOKUP_NODE); if (err) { - if (err == -ENOENT) + if (err == -ENOENT) { + pg_start = f2fs_get_next_page_offset(&dn, + pg_start); continue; + } return err; } - if (dn.data_blkaddr != NULL_ADDR) - truncate_data_blocks_range(&dn, 1); + end_offset = ADDRS_PER_PAGE(dn.node_folio, inode); + count = min(end_offset - dn.ofs_in_node, pg_end - pg_start); + + f2fs_bug_on(F2FS_I_SB(inode), count == 0 || count > end_offset); + + f2fs_truncate_data_blocks_range(&dn, count); f2fs_put_dnode(&dn); + + pg_start += count; } return 0; } -static int punch_hole(struct inode *inode, loff_t offset, loff_t len, int mode) +static int f2fs_punch_hole(struct inode *inode, loff_t offset, loff_t len) { pgoff_t pg_start, pg_end; loff_t off_start, off_end; - int ret = 0; + int ret; - pg_start = ((unsigned long long) offset) >> PAGE_CACHE_SHIFT; - pg_end = ((unsigned long long) offset + len) >> PAGE_CACHE_SHIFT; + ret = f2fs_convert_inline_inode(inode); + if (ret) + return ret; + + f2fs_zero_post_eof_page(inode, offset + len, true); + + pg_start = ((unsigned long long) offset) >> PAGE_SHIFT; + pg_end = ((unsigned long long) offset 
+ len) >> PAGE_SHIFT; - off_start = offset & (PAGE_CACHE_SIZE - 1); - off_end = (offset + len) & (PAGE_CACHE_SIZE - 1); + off_start = offset & (PAGE_SIZE - 1); + off_end = (offset + len) & (PAGE_SIZE - 1); if (pg_start == pg_end) { - fill_zero(inode, pg_start, off_start, + ret = fill_zero(inode, pg_start, off_start, off_end - off_start); + if (ret) + return ret; } else { - if (off_start) - fill_zero(inode, pg_start++, off_start, - PAGE_CACHE_SIZE - off_start); - if (off_end) - fill_zero(inode, pg_end, 0, off_end); + if (off_start) { + ret = fill_zero(inode, pg_start++, off_start, + PAGE_SIZE - off_start); + if (ret) + return ret; + } + if (off_end) { + ret = fill_zero(inode, pg_end, 0, off_end); + if (ret) + return ret; + } if (pg_start < pg_end) { - struct address_space *mapping = inode->i_mapping; loff_t blk_start, blk_end; - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); - int ilock; + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + + f2fs_balance_fs(sbi, true); + + blk_start = (loff_t)pg_start << PAGE_SHIFT; + blk_end = (loff_t)pg_end << PAGE_SHIFT; + + f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + filemap_invalidate_lock(inode->i_mapping); + + truncate_pagecache_range(inode, blk_start, blk_end - 1); + + f2fs_lock_op(sbi); + ret = f2fs_truncate_hole(inode, pg_start, pg_end); + f2fs_unlock_op(sbi); - f2fs_balance_fs(sbi); + filemap_invalidate_unlock(inode->i_mapping); + f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + } + } + + return ret; +} + +static int __read_out_blkaddrs(struct inode *inode, block_t *blkaddr, + int *do_replace, pgoff_t off, pgoff_t len) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct dnode_of_data dn; + int ret, done, i; + +next_dnode: + set_new_dnode(&dn, inode, NULL, NULL, 0); + ret = f2fs_get_dnode_of_data(&dn, off, LOOKUP_NODE_RA); + if (ret && ret != -ENOENT) { + return ret; + } else if (ret == -ENOENT) { + if (dn.max_level == 0) + return -ENOENT; + done = min((pgoff_t)ADDRS_PER_BLOCK(inode) - + dn.ofs_in_node, 
len); + blkaddr += done; + do_replace += done; + goto next; + } + + done = min((pgoff_t)ADDRS_PER_PAGE(dn.node_folio, inode) - + dn.ofs_in_node, len); + for (i = 0; i < done; i++, blkaddr++, do_replace++, dn.ofs_in_node++) { + *blkaddr = f2fs_data_blkaddr(&dn); + + if (__is_valid_data_blkaddr(*blkaddr) && + !f2fs_is_valid_blkaddr(sbi, *blkaddr, + DATA_GENERIC_ENHANCE)) { + f2fs_put_dnode(&dn); + return -EFSCORRUPTED; + } + + if (!f2fs_is_checkpointed_data(sbi, *blkaddr)) { + + if (f2fs_lfs_mode(sbi)) { + f2fs_put_dnode(&dn); + return -EOPNOTSUPP; + } + + /* do not invalidate this block address */ + f2fs_update_data_blkaddr(&dn, NULL_ADDR); + *do_replace = 1; + } + } + f2fs_put_dnode(&dn); +next: + len -= done; + off += done; + if (len) + goto next_dnode; + return 0; +} + +static int __roll_back_blkaddrs(struct inode *inode, block_t *blkaddr, + int *do_replace, pgoff_t off, int len) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct dnode_of_data dn; + int ret, i; + + for (i = 0; i < len; i++, do_replace++, blkaddr++) { + if (*do_replace == 0) + continue; - blk_start = pg_start << PAGE_CACHE_SHIFT; - blk_end = pg_end << PAGE_CACHE_SHIFT; - truncate_inode_pages_range(mapping, blk_start, - blk_end - 1); + set_new_dnode(&dn, inode, NULL, NULL, 0); + ret = f2fs_get_dnode_of_data(&dn, off + i, LOOKUP_NODE_RA); + if (ret) { + dec_valid_block_count(sbi, inode, 1); + f2fs_invalidate_blocks(sbi, *blkaddr, 1); + } else { + f2fs_update_data_blkaddr(&dn, *blkaddr); + } + f2fs_put_dnode(&dn); + } + return 0; +} + +static int __clone_blkaddrs(struct inode *src_inode, struct inode *dst_inode, + block_t *blkaddr, int *do_replace, + pgoff_t src, pgoff_t dst, pgoff_t len, bool full) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(src_inode); + pgoff_t i = 0; + int ret; - ilock = mutex_lock_op(sbi); - ret = truncate_hole(inode, pg_start, pg_end); - mutex_unlock_op(sbi, ilock); + while (i < len) { + if (blkaddr[i] == NULL_ADDR && !full) { + i++; + continue; + } + + if (do_replace[i] 
|| blkaddr[i] == NULL_ADDR) { + struct dnode_of_data dn; + struct node_info ni; + size_t new_size; + pgoff_t ilen; + + set_new_dnode(&dn, dst_inode, NULL, NULL, 0); + ret = f2fs_get_dnode_of_data(&dn, dst + i, ALLOC_NODE); + if (ret) + return ret; + + ret = f2fs_get_node_info(sbi, dn.nid, &ni, false); + if (ret) { + f2fs_put_dnode(&dn); + return ret; + } + + ilen = min((pgoff_t) + ADDRS_PER_PAGE(dn.node_folio, dst_inode) - + dn.ofs_in_node, len - i); + do { + dn.data_blkaddr = f2fs_data_blkaddr(&dn); + f2fs_truncate_data_blocks_range(&dn, 1); + + if (do_replace[i]) { + f2fs_i_blocks_write(src_inode, + 1, false, false); + f2fs_i_blocks_write(dst_inode, + 1, true, false); + f2fs_replace_block(sbi, &dn, dn.data_blkaddr, + blkaddr[i], ni.version, true, false); + + do_replace[i] = 0; + } + dn.ofs_in_node++; + i++; + new_size = (loff_t)(dst + i) << PAGE_SHIFT; + if (dst_inode->i_size < new_size) + f2fs_i_size_write(dst_inode, new_size); + } while (--ilen && (do_replace[i] || blkaddr[i] == NULL_ADDR)); + + f2fs_put_dnode(&dn); + } else { + struct folio *fsrc, *fdst; + + fsrc = f2fs_get_lock_data_folio(src_inode, + src + i, true); + if (IS_ERR(fsrc)) + return PTR_ERR(fsrc); + fdst = f2fs_get_new_data_folio(dst_inode, NULL, dst + i, + true); + if (IS_ERR(fdst)) { + f2fs_folio_put(fsrc, true); + return PTR_ERR(fdst); + } + + f2fs_folio_wait_writeback(fdst, DATA, true, true); + + memcpy_folio(fdst, 0, fsrc, 0, PAGE_SIZE); + folio_mark_dirty(fdst); + folio_set_f2fs_gcing(fdst); + f2fs_folio_put(fdst, true); + f2fs_folio_put(fsrc, true); + + ret = f2fs_truncate_hole(src_inode, + src + i, src + i + 1); + if (ret) + return ret; + i++; } } + return 0; +} + +static int __exchange_data_block(struct inode *src_inode, + struct inode *dst_inode, pgoff_t src, pgoff_t dst, + pgoff_t len, bool full) +{ + block_t *src_blkaddr; + int *do_replace; + pgoff_t olen; + int ret; + + while (len) { + olen = min((pgoff_t)4 * ADDRS_PER_BLOCK(src_inode), len); + + src_blkaddr = 
f2fs_kvzalloc(F2FS_I_SB(src_inode), + array_size(olen, sizeof(block_t)), + GFP_NOFS); + if (!src_blkaddr) + return -ENOMEM; + + do_replace = f2fs_kvzalloc(F2FS_I_SB(src_inode), + array_size(olen, sizeof(int)), + GFP_NOFS); + if (!do_replace) { + kvfree(src_blkaddr); + return -ENOMEM; + } - if (!(mode & FALLOC_FL_KEEP_SIZE) && - i_size_read(inode) <= (offset + len)) { - i_size_write(inode, offset); - mark_inode_dirty(inode); + ret = __read_out_blkaddrs(src_inode, src_blkaddr, + do_replace, src, olen); + if (ret) + goto roll_back; + + ret = __clone_blkaddrs(src_inode, dst_inode, src_blkaddr, + do_replace, src, dst, olen, full); + if (ret) + goto roll_back; + + src += olen; + dst += olen; + len -= olen; + + kvfree(src_blkaddr); + kvfree(do_replace); } + return 0; +roll_back: + __roll_back_blkaddrs(src_inode, src_blkaddr, do_replace, src, olen); + kvfree(src_blkaddr); + kvfree(do_replace); return ret; } -static int expand_inode_data(struct inode *inode, loff_t offset, - loff_t len, int mode) +static int f2fs_do_collapse(struct inode *inode, loff_t offset, loff_t len) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + pgoff_t nrpages = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); + pgoff_t start = offset >> PAGE_SHIFT; + pgoff_t end = (offset + len) >> PAGE_SHIFT; + int ret; + + f2fs_balance_fs(sbi, true); + + /* avoid gc operation during block exchange */ + f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + filemap_invalidate_lock(inode->i_mapping); + + f2fs_zero_post_eof_page(inode, offset + len, false); + + f2fs_lock_op(sbi); + f2fs_drop_extent_tree(inode); + truncate_pagecache(inode, offset); + ret = __exchange_data_block(inode, inode, end, start, nrpages - end, true); + f2fs_unlock_op(sbi); + + filemap_invalidate_unlock(inode->i_mapping); + f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + return ret; +} + +static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) +{ + loff_t new_size; + int ret; + + if (offset + len >= 
i_size_read(inode)) + return -EINVAL; + + /* collapse range should be aligned to block size of f2fs. */ + if (offset & (F2FS_BLKSIZE - 1) || len & (F2FS_BLKSIZE - 1)) + return -EINVAL; + + ret = f2fs_convert_inline_inode(inode); + if (ret) + return ret; + + /* write out all dirty pages from offset */ + ret = filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX); + if (ret) + return ret; + + ret = f2fs_do_collapse(inode, offset, len); + if (ret) + return ret; + + /* write out all moved pages, if possible */ + filemap_invalidate_lock(inode->i_mapping); + filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX); + truncate_pagecache(inode, offset); + + new_size = i_size_read(inode) - len; + ret = f2fs_truncate_blocks(inode, new_size, true); + filemap_invalidate_unlock(inode->i_mapping); + if (!ret) + f2fs_i_size_write(inode, new_size); + return ret; +} + +static int f2fs_do_zero_range(struct dnode_of_data *dn, pgoff_t start, + pgoff_t end) { - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); + pgoff_t index = start; + unsigned int ofs_in_node = dn->ofs_in_node; + blkcnt_t count = 0; + int ret; + + for (; index < end; index++, dn->ofs_in_node++) { + if (f2fs_data_blkaddr(dn) == NULL_ADDR) + count++; + } + + dn->ofs_in_node = ofs_in_node; + ret = f2fs_reserve_new_blocks(dn, count); + if (ret) + return ret; + + dn->ofs_in_node = ofs_in_node; + for (index = start; index < end; index++, dn->ofs_in_node++) { + dn->data_blkaddr = f2fs_data_blkaddr(dn); + /* + * f2fs_reserve_new_blocks will not guarantee entire block + * allocation. 
+ */ + if (dn->data_blkaddr == NULL_ADDR) { + ret = -ENOSPC; + break; + } + + if (dn->data_blkaddr == NEW_ADDR) + continue; + + if (!f2fs_is_valid_blkaddr(sbi, dn->data_blkaddr, + DATA_GENERIC_ENHANCE)) { + ret = -EFSCORRUPTED; + break; + } + + f2fs_invalidate_blocks(sbi, dn->data_blkaddr, 1); + f2fs_set_data_blkaddr(dn, NEW_ADDR); + } + + if (index > start) { + f2fs_update_read_extent_cache_range(dn, start, 0, + index - start); + f2fs_update_age_extent_cache_range(dn, start, index - start); + } + + return ret; +} + +static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, + int mode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct address_space *mapping = inode->i_mapping; pgoff_t index, pg_start, pg_end; loff_t new_size = i_size_read(inode); loff_t off_start, off_end; @@ -514,180 +1677,3784 @@ static int expand_inode_data(struct inode *inode, loff_t offset, if (ret) return ret; - pg_start = ((unsigned long long) offset) >> PAGE_CACHE_SHIFT; - pg_end = ((unsigned long long) offset + len) >> PAGE_CACHE_SHIFT; + ret = f2fs_convert_inline_inode(inode); + if (ret) + return ret; + + ret = filemap_write_and_wait_range(mapping, offset, offset + len - 1); + if (ret) + return ret; + + f2fs_zero_post_eof_page(inode, offset + len, true); - off_start = offset & (PAGE_CACHE_SIZE - 1); - off_end = (offset + len) & (PAGE_CACHE_SIZE - 1); + pg_start = ((unsigned long long) offset) >> PAGE_SHIFT; + pg_end = ((unsigned long long) offset + len) >> PAGE_SHIFT; - for (index = pg_start; index <= pg_end; index++) { - struct dnode_of_data dn; - int ilock; + off_start = offset & (PAGE_SIZE - 1); + off_end = (offset + len) & (PAGE_SIZE - 1); - ilock = mutex_lock_op(sbi); - set_new_dnode(&dn, inode, NULL, NULL, 0); - ret = get_dnode_of_data(&dn, index, ALLOC_NODE); - if (ret) { - mutex_unlock_op(sbi, ilock); - break; + if (pg_start == pg_end) { + ret = fill_zero(inode, pg_start, off_start, + off_end - off_start); + if (ret) + return ret; + + new_size = 
max_t(loff_t, new_size, offset + len); + } else { + if (off_start) { + ret = fill_zero(inode, pg_start++, off_start, + PAGE_SIZE - off_start); + if (ret) + return ret; + + new_size = max_t(loff_t, new_size, + (loff_t)pg_start << PAGE_SHIFT); } - if (dn.data_blkaddr == NULL_ADDR) { - ret = reserve_new_block(&dn); + for (index = pg_start; index < pg_end;) { + struct dnode_of_data dn; + unsigned int end_offset; + pgoff_t end; + + f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + filemap_invalidate_lock(mapping); + + truncate_pagecache_range(inode, + (loff_t)index << PAGE_SHIFT, + ((loff_t)pg_end << PAGE_SHIFT) - 1); + + f2fs_lock_op(sbi); + + set_new_dnode(&dn, inode, NULL, NULL, 0); + ret = f2fs_get_dnode_of_data(&dn, index, ALLOC_NODE); if (ret) { - f2fs_put_dnode(&dn); - mutex_unlock_op(sbi, ilock); - break; + f2fs_unlock_op(sbi); + filemap_invalidate_unlock(mapping); + f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + goto out; } + + end_offset = ADDRS_PER_PAGE(dn.node_folio, inode); + end = min(pg_end, end_offset - dn.ofs_in_node + index); + + ret = f2fs_do_zero_range(&dn, index, end); + f2fs_put_dnode(&dn); + + f2fs_unlock_op(sbi); + filemap_invalidate_unlock(mapping); + f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + + f2fs_balance_fs(sbi, dn.node_changed); + + if (ret) + goto out; + + index = end; + new_size = max_t(loff_t, new_size, + (loff_t)index << PAGE_SHIFT); } - f2fs_put_dnode(&dn); - mutex_unlock_op(sbi, ilock); - - if (pg_start == pg_end) - new_size = offset + len; - else if (index == pg_start && off_start) - new_size = (index + 1) << PAGE_CACHE_SHIFT; - else if (index == pg_end) - new_size = (index << PAGE_CACHE_SHIFT) + off_end; + + if (off_end) { + ret = fill_zero(inode, pg_end, 0, off_end); + if (ret) + goto out; + + new_size = max_t(loff_t, new_size, offset + len); + } + } + +out: + if (new_size > i_size_read(inode)) { + if (mode & FALLOC_FL_KEEP_SIZE) + file_set_keep_isize(inode); else - new_size += PAGE_CACHE_SIZE; + 
f2fs_i_size_write(inode, new_size); } + return ret; +} + +static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct address_space *mapping = inode->i_mapping; + pgoff_t nr, pg_start, pg_end, delta, idx; + loff_t new_size; + int ret = 0; + + new_size = i_size_read(inode) + len; + ret = inode_newsize_ok(inode, new_size); + if (ret) + return ret; + + if (offset >= i_size_read(inode)) + return -EINVAL; + + /* insert range should be aligned to block size of f2fs. */ + if (offset & (F2FS_BLKSIZE - 1) || len & (F2FS_BLKSIZE - 1)) + return -EINVAL; + + ret = f2fs_convert_inline_inode(inode); + if (ret) + return ret; + + f2fs_balance_fs(sbi, true); + + filemap_invalidate_lock(mapping); + ret = f2fs_truncate_blocks(inode, i_size_read(inode), true); + filemap_invalidate_unlock(mapping); + if (ret) + return ret; + + /* write out all dirty pages from offset */ + ret = filemap_write_and_wait_range(mapping, offset, LLONG_MAX); + if (ret) + return ret; + + pg_start = offset >> PAGE_SHIFT; + pg_end = (offset + len) >> PAGE_SHIFT; + delta = pg_end - pg_start; + idx = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); + + /* avoid gc operation during block exchange */ + f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + filemap_invalidate_lock(mapping); + + f2fs_zero_post_eof_page(inode, offset + len, false); + truncate_pagecache(inode, offset); - if (!(mode & FALLOC_FL_KEEP_SIZE) && - i_size_read(inode) < new_size) { - i_size_write(inode, new_size); - mark_inode_dirty(inode); + while (!ret && idx > pg_start) { + nr = idx - pg_start; + if (nr > delta) + nr = delta; + idx -= nr; + + f2fs_lock_op(sbi); + f2fs_drop_extent_tree(inode); + + ret = __exchange_data_block(inode, inode, idx, + idx + delta, nr, false); + f2fs_unlock_op(sbi); } + filemap_invalidate_unlock(mapping); + f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + if (ret) + return ret; + /* write out all moved pages, if possible */ + 
filemap_invalidate_lock(mapping); + ret = filemap_write_and_wait_range(mapping, offset, LLONG_MAX); + truncate_pagecache(inode, offset); + filemap_invalidate_unlock(mapping); + + if (!ret) + f2fs_i_size_write(inode, new_size); return ret; } +static int f2fs_expand_inode_data(struct inode *inode, loff_t offset, + loff_t len, int mode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct f2fs_map_blocks map = { .m_next_pgofs = NULL, + .m_next_extent = NULL, .m_seg_type = NO_CHECK_TYPE, + .m_may_create = true }; + struct f2fs_gc_control gc_control = { .victim_segno = NULL_SEGNO, + .init_gc_type = FG_GC, + .should_migrate_blocks = false, + .err_gc_skipped = true, + .nr_free_secs = 0 }; + pgoff_t pg_start, pg_end; + loff_t new_size; + loff_t off_end; + block_t expanded = 0; + int err; + + err = inode_newsize_ok(inode, (len + offset)); + if (err) + return err; + + err = f2fs_convert_inline_inode(inode); + if (err) + return err; + + f2fs_zero_post_eof_page(inode, offset + len, true); + + f2fs_balance_fs(sbi, true); + + pg_start = ((unsigned long long)offset) >> PAGE_SHIFT; + pg_end = ((unsigned long long)offset + len) >> PAGE_SHIFT; + off_end = (offset + len) & (PAGE_SIZE - 1); + + map.m_lblk = pg_start; + map.m_len = pg_end - pg_start; + if (off_end) + map.m_len++; + + if (!map.m_len) + return 0; + + if (f2fs_is_pinned_file(inode)) { + block_t sec_blks = CAP_BLKS_PER_SEC(sbi); + block_t sec_len = roundup(map.m_len, sec_blks); + + map.m_len = sec_blks; +next_alloc: + f2fs_down_write(&sbi->pin_sem); + + if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) { + if (has_not_enough_free_secs(sbi, 0, 0)) { + f2fs_up_write(&sbi->pin_sem); + err = -ENOSPC; + f2fs_warn_ratelimited(sbi, + "ino:%lu, start:%lu, end:%lu, need to trigger GC to " + "reclaim enough free segment when checkpoint is enabled", + inode->i_ino, pg_start, pg_end); + goto out_err; + } + } + + if (has_not_enough_free_secs(sbi, 0, + sbi->reserved_pin_section)) { + f2fs_down_write(&sbi->gc_lock); + 
stat_inc_gc_call_count(sbi, FOREGROUND); + err = f2fs_gc(sbi, &gc_control); + if (err && err != -ENODATA) { + f2fs_up_write(&sbi->pin_sem); + goto out_err; + } + } + + err = f2fs_allocate_pinning_section(sbi); + if (err) { + f2fs_up_write(&sbi->pin_sem); + goto out_err; + } + + map.m_seg_type = CURSEG_COLD_DATA_PINNED; + err = f2fs_map_blocks(inode, &map, F2FS_GET_BLOCK_PRE_DIO); + file_dont_truncate(inode); + + f2fs_up_write(&sbi->pin_sem); + + expanded += map.m_len; + sec_len -= map.m_len; + map.m_lblk += map.m_len; + if (!err && sec_len) + goto next_alloc; + + map.m_len = expanded; + } else { + err = f2fs_map_blocks(inode, &map, F2FS_GET_BLOCK_PRE_AIO); + expanded = map.m_len; + } +out_err: + if (err) { + pgoff_t last_off; + + if (!expanded) + return err; + + last_off = pg_start + expanded - 1; + + /* update new size to the failed position */ + new_size = (last_off == pg_end) ? offset + len : + (loff_t)(last_off + 1) << PAGE_SHIFT; + } else { + new_size = ((loff_t)pg_end << PAGE_SHIFT) + off_end; + } + + if (new_size > i_size_read(inode)) { + if (mode & FALLOC_FL_KEEP_SIZE) + file_set_keep_isize(inode); + else + f2fs_i_size_write(inode, new_size); + } + + return err; +} + static long f2fs_fallocate(struct file *file, int mode, loff_t offset, loff_t len) { struct inode *inode = file_inode(file); - long ret; + long ret = 0; - if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) + if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) + return -EIO; + if (!f2fs_is_checkpoint_ready(F2FS_I_SB(inode))) + return -ENOSPC; + if (!f2fs_is_compress_backend_ready(inode) || IS_DEVICE_ALIASING(inode)) return -EOPNOTSUPP; - if (mode & FALLOC_FL_PUNCH_HOLE) - ret = punch_hole(inode, offset, len, mode); - else - ret = expand_inode_data(inode, offset, len, mode); + /* f2fs only support ->fallocate for regular file */ + if (!S_ISREG(inode->i_mode)) + return -EINVAL; + + if (IS_ENCRYPTED(inode) && + (mode & (FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_INSERT_RANGE))) + return -EOPNOTSUPP; + 
+ if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | + FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE | + FALLOC_FL_INSERT_RANGE)) + return -EOPNOTSUPP; + + inode_lock(inode); + + /* + * Pinned file should not support partial truncation since the block + * can be used by applications. + */ + if ((f2fs_compressed_file(inode) || f2fs_is_pinned_file(inode)) && + (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_COLLAPSE_RANGE | + FALLOC_FL_ZERO_RANGE | FALLOC_FL_INSERT_RANGE))) { + ret = -EOPNOTSUPP; + goto out; + } + + ret = file_modified(file); + if (ret) + goto out; + + /* + * wait for inflight dio, blocks should be removed after IO + * completion. + */ + inode_dio_wait(inode); + + if (mode & FALLOC_FL_PUNCH_HOLE) { + if (offset >= inode->i_size) + goto out; + + ret = f2fs_punch_hole(inode, offset, len); + } else if (mode & FALLOC_FL_COLLAPSE_RANGE) { + ret = f2fs_collapse_range(inode, offset, len); + } else if (mode & FALLOC_FL_ZERO_RANGE) { + ret = f2fs_zero_range(inode, offset, len, mode); + } else if (mode & FALLOC_FL_INSERT_RANGE) { + ret = f2fs_insert_range(inode, offset, len); + } else { + ret = f2fs_expand_inode_data(inode, offset, len, mode); + } if (!ret) { - inode->i_mtime = inode->i_ctime = CURRENT_TIME; - mark_inode_dirty(inode); + inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); + f2fs_mark_inode_dirty_sync(inode, false); + f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); } + +out: + inode_unlock(inode); + trace_f2fs_fallocate(inode, mode, offset, len, ret); return ret; } -#define F2FS_REG_FLMASK (~(FS_DIRSYNC_FL | FS_TOPDIR_FL)) -#define F2FS_OTHER_FLMASK (FS_NODUMP_FL | FS_NOATIME_FL) +static int f2fs_release_file(struct inode *inode, struct file *filp) +{ + if (atomic_dec_and_test(&F2FS_I(inode)->open_count)) + f2fs_remove_donate_inode(inode); + + /* + * f2fs_release_file is called at every close calls. So we should + * not drop any inmemory pages by close called by other process. 
+ */ + if (!(filp->f_mode & FMODE_WRITE) || + atomic_read(&inode->i_writecount) != 1) + return 0; + + inode_lock(inode); + f2fs_abort_atomic_write(inode, true); + inode_unlock(inode); + + return 0; +} +/* Rolls back an atomic write whose owning task is exiting; always returns 0 */ +static int f2fs_file_flush(struct file *file, fl_owner_t id) +{ + struct inode *inode = file_inode(file); + + /* + * If the process doing a transaction has crashed, we should roll + * back. Otherwise, other readers/writers can see a corrupted database + * until all the writers close the file. Since this should be done + * before dropping the file lock, it needs to be done in ->flush. + */ + if (F2FS_I(inode)->atomic_write_task == current && + (current->flags & PF_EXITING)) { + inode_lock(inode); + f2fs_abort_atomic_write(inode, true); + inode_unlock(inode); + } + + return 0; +} -static inline __u32 f2fs_mask_flags(umode_t mode, __u32 flags) +static int f2fs_setflags_common(struct inode *inode, u32 iflags, u32 mask) { - if (S_ISDIR(mode)) - return flags; - else if (S_ISREG(mode)) - return flags & F2FS_REG_FLMASK; + struct f2fs_inode_info *fi = F2FS_I(inode); + u32 masked_flags = fi->i_flags & mask; + + /* mask can be shrunk by flags_valid selector; drop requested bits outside it */ + iflags &= mask; + + /* Is it quota file? 
Do not allow user to mess with it */ + if (IS_NOQUOTA(inode)) + return -EPERM; + /* casefold may only be toggled if the sb supports it and the directory is empty */ + if ((iflags ^ masked_flags) & F2FS_CASEFOLD_FL) { + if (!f2fs_sb_has_casefold(F2FS_I_SB(inode))) + return -EOPNOTSUPP; + if (!f2fs_empty_dir(inode)) + return -ENOTEMPTY; + } + /* COMPR and NOCOMP need sb compression support and are mutually exclusive */ + if (iflags & (F2FS_COMPR_FL | F2FS_NOCOMP_FL)) { + if (!f2fs_sb_has_compression(F2FS_I_SB(inode))) + return -EOPNOTSUPP; + if ((iflags & F2FS_COMPR_FL) && (iflags & F2FS_NOCOMP_FL)) + return -EINVAL; + } + + if ((iflags ^ masked_flags) & F2FS_COMPR_FL) { + if (masked_flags & F2FS_COMPR_FL) { + if (!f2fs_disable_compressed_file(inode)) + return -EINVAL; + } else { + /* inline data is converted first so that compression can be enabled */ + int err = f2fs_convert_inline_inode(inode); + if (err) + return err; + + f2fs_down_write(&fi->i_sem); + if (!f2fs_may_compress(inode) || + atomic_read(&fi->writeback) || + (S_ISREG(inode->i_mode) && + F2FS_HAS_BLOCKS(inode))) { + f2fs_up_write(&fi->i_sem); + return -EINVAL; + } + err = set_compress_context(inode); + f2fs_up_write(&fi->i_sem); + + if (err) + return err; + } + } + /* replace the bits covered by mask with the requested ones, keep the rest */ + fi->i_flags = iflags | (fi->i_flags & ~mask); + f2fs_bug_on(F2FS_I_SB(inode), (fi->i_flags & F2FS_COMPR_FL) && + (fi->i_flags & F2FS_NOCOMP_FL)); + + if (fi->i_flags & F2FS_PROJINHERIT_FL) + set_inode_flag(inode, FI_PROJ_INHERIT); else - return flags & F2FS_OTHER_FLMASK; + clear_inode_flag(inode, FI_PROJ_INHERIT); + + inode_set_ctime_current(inode); + f2fs_set_inode_flags(inode); + f2fs_mark_inode_dirty_sync(inode, true); + return 0; } -long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +/* FS_IOC_[GS]ETFLAGS and FS_IOC_FS[GS]ETXATTR support */ + +/* + * To make a new on-disk f2fs i_flag gettable via FS_IOC_GETFLAGS, add an entry + * for it to f2fs_fsflags_map[], and add its FS_*_FL equivalent to + * F2FS_GETTABLE_FS_FL. To also make it settable via FS_IOC_SETFLAGS, also add + * its FS_*_FL equivalent to F2FS_SETTABLE_FS_FL. 
+ * + * Translating flags to fsx_flags value used by FS_IOC_FSGETXATTR and + * FS_IOC_FSSETXATTR is done by the VFS. + */ + +static const struct { + u32 iflag; + u32 fsflag; +} f2fs_fsflags_map[] = { + { F2FS_COMPR_FL, FS_COMPR_FL }, + { F2FS_SYNC_FL, FS_SYNC_FL }, + { F2FS_IMMUTABLE_FL, FS_IMMUTABLE_FL }, + { F2FS_APPEND_FL, FS_APPEND_FL }, + { F2FS_NODUMP_FL, FS_NODUMP_FL }, + { F2FS_NOATIME_FL, FS_NOATIME_FL }, + { F2FS_NOCOMP_FL, FS_NOCOMP_FL }, + { F2FS_INDEX_FL, FS_INDEX_FL }, + { F2FS_DIRSYNC_FL, FS_DIRSYNC_FL }, + { F2FS_PROJINHERIT_FL, FS_PROJINHERIT_FL }, + { F2FS_CASEFOLD_FL, FS_CASEFOLD_FL }, +}; +/* + * Gettable flags: every flag in f2fs_fsflags_map[] plus FS_ENCRYPT_FL, + * FS_INLINE_DATA_FL, FS_NOCOW_FL and FS_VERITY_FL, which have no map entry. + */ +#define F2FS_GETTABLE_FS_FL ( \ + FS_COMPR_FL | \ + FS_SYNC_FL | \ + FS_IMMUTABLE_FL | \ + FS_APPEND_FL | \ + FS_NODUMP_FL | \ + FS_NOATIME_FL | \ + FS_NOCOMP_FL | \ + FS_INDEX_FL | \ + FS_DIRSYNC_FL | \ + FS_PROJINHERIT_FL | \ + FS_ENCRYPT_FL | \ + FS_INLINE_DATA_FL | \ + FS_NOCOW_FL | \ + FS_VERITY_FL | \ + FS_CASEFOLD_FL) +/* Settable flags: every flag in f2fs_fsflags_map[] except FS_INDEX_FL */ +#define F2FS_SETTABLE_FS_FL ( \ + FS_COMPR_FL | \ + FS_SYNC_FL | \ + FS_IMMUTABLE_FL | \ + FS_APPEND_FL | \ + FS_NODUMP_FL | \ + FS_NOATIME_FL | \ + FS_NOCOMP_FL | \ + FS_DIRSYNC_FL | \ + FS_PROJINHERIT_FL | \ + FS_CASEFOLD_FL) + +/* + * Convert f2fs on-disk i_flags to FS_IOC_{GET,SET}FLAGS flags; bits without + * an f2fs_fsflags_map[] entry are dropped. + */ +static inline u32 f2fs_iflags_to_fsflags(u32 iflags) +{ + u32 fsflags = 0; + int i; + + for (i = 0; i < ARRAY_SIZE(f2fs_fsflags_map); i++) + if (iflags & f2fs_fsflags_map[i].iflag) + fsflags |= f2fs_fsflags_map[i].fsflag; + + return fsflags; +} + +/* + * Convert FS_IOC_{GET,SET}FLAGS flags to f2fs on-disk i_flags; bits without + * an f2fs_fsflags_map[] entry are dropped. + */ +static inline u32 f2fs_fsflags_to_iflags(u32 fsflags) +{ + u32 iflags = 0; + int i; + + for (i = 0; i < ARRAY_SIZE(f2fs_fsflags_map); i++) + if (fsflags & f2fs_fsflags_map[i].fsflag) + iflags |= f2fs_fsflags_map[i].iflag; + + return iflags; +} +/* Store inode->i_generation into the user int that arg points to */ +static int f2fs_ioc_getversion(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + + return put_user(inode->i_generation, (int __user *)arg); +} + +static 
int f2fs_ioc_start_atomic_write(struct file *filp, bool truncate) { struct inode *inode = file_inode(filp); + struct mnt_idmap *idmap = file_mnt_idmap(filp); struct f2fs_inode_info *fi = F2FS_I(inode); - unsigned int flags; + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + loff_t isize; int ret; - switch (cmd) { - case F2FS_IOC_GETFLAGS: - flags = fi->i_flags & FS_FL_USER_VISIBLE; - return put_user(flags, (int __user *) arg); - case F2FS_IOC_SETFLAGS: - { - unsigned int oldflags; + if (!(filp->f_mode & FMODE_WRITE)) + return -EBADF; - ret = mnt_want_write_file(filp); + if (!inode_owner_or_capable(idmap, inode)) + return -EACCES; + + if (!S_ISREG(inode->i_mode)) + return -EINVAL; + /* atomic writes are refused on files opened with O_DIRECT */ + if (filp->f_flags & O_DIRECT) + return -EINVAL; + + ret = mnt_want_write_file(filp); + if (ret) + return ret; + + inode_lock(inode); + + if (!f2fs_disable_compressed_file(inode) || + f2fs_is_pinned_file(inode)) { + ret = -EINVAL; + goto out; + } + /* already an atomic file: nothing to do, ret is still 0 */ + if (f2fs_is_atomic_file(inode)) + goto out; + + ret = f2fs_convert_inline_inode(inode); + if (ret) + goto out; + + f2fs_down_write(&fi->i_gc_rwsem[WRITE]); + f2fs_down_write(&fi->i_gc_rwsem[READ]); + + /* + * Should wait end_io to count F2FS_WB_CP_DATA correctly by + * f2fs_is_atomic_file. 
+ */ + if (get_dirty_pages(inode)) + f2fs_warn(sbi, "Unexpected flush for atomic writes: ino=%lu, npages=%u", + inode->i_ino, get_dirty_pages(inode)); + ret = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX); + if (ret) + goto out_unlock; + + /* Check if the inode already has a COW inode */ + if (fi->cow_inode == NULL) { + /* Create a COW inode via f2fs_get_tmpfile() on the file's parent directory */ + struct dentry *dentry = file_dentry(filp); + struct inode *dir = d_inode(dentry->d_parent); + + ret = f2fs_get_tmpfile(idmap, dir, &fi->cow_inode); if (ret) - return ret; + goto out_unlock; + + set_inode_flag(fi->cow_inode, FI_COW_FILE); + clear_inode_flag(fi->cow_inode, FI_INLINE_DATA); + + /* Set the COW inode's atomic_inode to the atomic inode */ + F2FS_I(fi->cow_inode)->atomic_inode = inode; + } else { + /* Reuse the existing COW inode: it must have no dirty pages; drop its cache and truncate from 0 */ + f2fs_bug_on(sbi, get_dirty_pages(fi->cow_inode)); + + invalidate_mapping_pages(fi->cow_inode->i_mapping, 0, -1); + + ret = f2fs_do_truncate_blocks(fi->cow_inode, 0, true); + if (ret) + goto out_unlock; + } + + f2fs_write_inode(inode, NULL); + + stat_inc_atomic_inode(inode); + + set_inode_flag(inode, FI_ATOMIC_FILE); + /* remember the current size; with truncate set, both inodes restart at size 0 */ + isize = i_size_read(inode); + fi->original_i_size = isize; + if (truncate) { + set_inode_flag(inode, FI_ATOMIC_REPLACE); + truncate_inode_pages_final(inode->i_mapping); + f2fs_i_size_write(inode, 0); + isize = 0; + } + f2fs_i_size_write(fi->cow_inode, isize); + +out_unlock: + f2fs_up_write(&fi->i_gc_rwsem[READ]); + f2fs_up_write(&fi->i_gc_rwsem[WRITE]); + if (ret) + goto out; + + f2fs_update_time(sbi, REQ_TIME); + fi->atomic_write_task = current; + stat_update_max_atomic_write(inode); + fi->atomic_write_cnt = 0; +out: + inode_unlock(inode); + mnt_drop_write_file(filp); + return ret; +} +/* Commit a pending atomic write, or just sync the file if it is not an atomic file */ +static int f2fs_ioc_commit_atomic_write(struct file *filp) +{ + struct inode *inode = file_inode(filp); + struct mnt_idmap *idmap = file_mnt_idmap(filp); + int ret; + + if (!(filp->f_mode & FMODE_WRITE)) + return -EBADF; + + if 
(!inode_owner_or_capable(idmap, inode)) + return -EACCES; + + ret = mnt_want_write_file(filp); + if (ret) + return ret; + + f2fs_balance_fs(F2FS_I_SB(inode), true); + + inode_lock(inode); + + if (f2fs_is_atomic_file(inode)) { + ret = f2fs_commit_atomic_write(inode); + if (!ret) + ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true); + + f2fs_abort_atomic_write(inode, ret); + } else { + ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 1, false); + } + + inode_unlock(inode); + mnt_drop_write_file(filp); + return ret; +} +/* + * Abort the inode's atomic write. Needs FMODE_WRITE (-EBADF) and inode + * ownership or capability (-EACCES); returns 0 once write access is granted. + */ +static int f2fs_ioc_abort_atomic_write(struct file *filp) +{ + struct inode *inode = file_inode(filp); + struct mnt_idmap *idmap = file_mnt_idmap(filp); + int ret; + + if (!(filp->f_mode & FMODE_WRITE)) + return -EBADF; + + if (!inode_owner_or_capable(idmap, inode)) + return -EACCES; + + ret = mnt_want_write_file(filp); + if (ret) + return ret; + + inode_lock(inode); + + f2fs_abort_atomic_write(inode, true); + + inode_unlock(inode); + + mnt_drop_write_file(filp); + f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); + return ret; +} +/* + * Shut the filesystem down in the way selected by flag (F2FS_GOING_DOWN_*); + * an unknown flag yields -EINVAL. The result is traced before returning. + */ +int f2fs_do_shutdown(struct f2fs_sb_info *sbi, unsigned int flag, + bool readonly, bool need_lock) +{ + struct super_block *sb = sbi->sb; + int ret = 0; + + switch (flag) { + case F2FS_GOING_DOWN_FULLSYNC: + ret = bdev_freeze(sb->s_bdev); + if (ret) + goto out; + f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_SHUTDOWN); + bdev_thaw(sb->s_bdev); + break; + case F2FS_GOING_DOWN_METASYNC: + /* do checkpoint only; on failure bail out, reporting -EIO as success */ + ret = f2fs_sync_fs(sb, 1); + if (ret) { + if (ret == -EIO) + ret = 0; + goto out; + } + f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_SHUTDOWN); + break; + case F2FS_GOING_DOWN_NOSYNC: + f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_SHUTDOWN); + break; + case F2FS_GOING_DOWN_METAFLUSH: + f2fs_sync_meta_pages(sbi, META, LONG_MAX, FS_META_IO); + f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_SHUTDOWN); + break; + case F2FS_GOING_DOWN_NEED_FSCK: + set_sbi_flag(sbi, SBI_NEED_FSCK); + set_sbi_flag(sbi, 
SBI_CP_DISABLED_QUICK); + set_sbi_flag(sbi, SBI_IS_DIRTY); + /* do checkpoint only: no f2fs_stop_checkpoint() in this case, and -EIO is reported as success */ + ret = f2fs_sync_fs(sb, 1); + if (ret == -EIO) + ret = 0; + goto out; + default: + ret = -EINVAL; + goto out; + } + /* a read-only shutdown leaves the GC and discard threads alone */ + if (readonly) + goto out; + + /* + * Grab sb->s_umount to avoid racing with remount() and other shutdown + * paths. + */ + if (need_lock) + down_write(&sbi->sb->s_umount); + + f2fs_stop_gc_thread(sbi); + f2fs_stop_discard_thread(sbi); + + f2fs_drop_discard_cmd(sbi); + clear_opt(sbi, DISCARD); + + if (need_lock) + up_write(&sbi->sb->s_umount); + + f2fs_update_time(sbi, REQ_TIME); +out: + + trace_f2fs_shutdown(sbi, flag, ret); + + return ret; +} +/* + * Shutdown ioctl: CAP_SYS_ADMIN only; arg points to a __u32 shutdown mode. + * Write access is taken for every mode except F2FS_GOING_DOWN_FULLSYNC. + */ +static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + __u32 in; + int ret; + bool need_drop = false, readonly = false; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (get_user(in, (__u32 __user *)arg)) + return -EFAULT; + + if (in != F2FS_GOING_DOWN_FULLSYNC) { + ret = mnt_want_write_file(filp); + if (ret) { + if (ret != -EROFS) + return ret; + + /* fall back to a nosync shutdown on a read-only fs */ + in = F2FS_GOING_DOWN_NOSYNC; + readonly = true; + } else { + need_drop = true; + } + } + + ret = f2fs_do_shutdown(sbi, in, readonly, true); + + if (need_drop) + mnt_drop_write_file(filp); + + return ret; +} +/* + * Record [offset, offset + len) as the inode's donate range (in page units) + * and queue the inode on the DONATE_INODE list; len == 0 removes it instead. + * Non-regular files, ranges beyond the maximum file size and atomic files + * are silently ignored (return 0). + */ +static int f2fs_keep_noreuse_range(struct inode *inode, + loff_t offset, loff_t len) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + u64 max_bytes = F2FS_BLK_TO_BYTES(max_file_blocks(inode)); + u64 start, end; + int ret = 0; + + if (!S_ISREG(inode->i_mode)) + return 0; + + if (offset >= max_bytes || len > max_bytes || + (offset + len) > max_bytes) + return 0; + + start = offset >> PAGE_SHIFT; + end = DIV_ROUND_UP(offset + len, PAGE_SIZE); + + inode_lock(inode); + if (f2fs_is_atomic_file(inode)) { + inode_unlock(inode); + return 0; + } + + spin_lock(&sbi->inode_lock[DONATE_INODE]); + /* let's remove the 
range, if len = 0 */ + if (!len) { + if (!list_empty(&F2FS_I(inode)->gdonate_list)) { + list_del_init(&F2FS_I(inode)->gdonate_list); + sbi->donate_files--; + if (is_inode_flag_set(inode, FI_DONATE_FINISHED)) + ret = -EALREADY; + else + set_inode_flag(inode, FI_DONATE_FINISHED); + } else + ret = -ENOENT; + } else { + if (list_empty(&F2FS_I(inode)->gdonate_list)) { + list_add_tail(&F2FS_I(inode)->gdonate_list, + &sbi->inode_list[DONATE_INODE]); + sbi->donate_files++; + } else { + list_move_tail(&F2FS_I(inode)->gdonate_list, + &sbi->inode_list[DONATE_INODE]); + } + F2FS_I(inode)->donate_start = start; + F2FS_I(inode)->donate_end = end - 1; + clear_inode_flag(inode, FI_DONATE_FINISHED); + } + spin_unlock(&sbi->inode_lock[DONATE_INODE]); + inode_unlock(inode); + + return ret; +} +/* + * Trim handler: CAP_SYS_ADMIN only, -EOPNOTSUPP without hardware discard + * support. On success the fstrim_range is copied back to userspace. + */ +static int f2fs_ioc_fitrim(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct fstrim_range range; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (!f2fs_hw_support_discard(sbi)) + return -EOPNOTSUPP; + + if (copy_from_user(&range, (struct fstrim_range __user *)arg, + sizeof(range))) + return -EFAULT; + + ret = mnt_want_write_file(filp); + if (ret) + return ret; + /* raise minlen to at least the hardware discard granularity */ + range.minlen = max_t(unsigned int, range.minlen, + f2fs_hw_discard_granularity(sbi)); + ret = f2fs_trim_fs(sbi, &range); + mnt_drop_write_file(filp); + if (ret < 0) + return ret; + + if (copy_to_user((struct fstrim_range __user *)arg, &range, + sizeof(range))) + return -EFAULT; + f2fs_update_time(sbi, REQ_TIME); + return 0; +} +/* Return true if any of the 16 bytes of u is non-zero */ +static bool uuid_is_nonzero(__u8 u[16]) +{ + int i; + + for (i = 0; i < 16; i++) + if (u[i]) + return true; + return false; +} + +static int f2fs_ioc_set_encryption_policy(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + int ret; + + if (!f2fs_sb_has_encrypt(F2FS_I_SB(inode))) + return -EOPNOTSUPP; + + ret = fscrypt_ioctl_set_policy(filp, (const void __user *)arg); 
+ f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); + return ret; +} +/* Forward to fscrypt; -EOPNOTSUPP unless the superblock has the encrypt feature */ +static int f2fs_ioc_get_encryption_policy(struct file *filp, unsigned long arg) +{ + if (!f2fs_sb_has_encrypt(F2FS_I_SB(file_inode(filp)))) + return -EOPNOTSUPP; + return fscrypt_ioctl_get_policy(filp, (void __user *)arg); +} +/* + * Copy the 16-byte password salt from the raw superblock to userspace, + * generating and committing a random one first if it is still all zero. + */ +static int f2fs_ioc_get_encryption_pwsalt(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + u8 encrypt_pw_salt[16]; + int err; + + if (!f2fs_sb_has_encrypt(sbi)) + return -EOPNOTSUPP; + + err = mnt_want_write_file(filp); + if (err) + return err; + + f2fs_down_write(&sbi->sb_lock); + + if (uuid_is_nonzero(sbi->raw_super->encrypt_pw_salt)) + goto got_it; + + /* no salt yet: generate a random one and commit the superblock */ + generate_random_uuid(sbi->raw_super->encrypt_pw_salt); + + err = f2fs_commit_super(sbi, false); + if (err) { + /* commit failed: clear the uncommitted salt again */ + memset(sbi->raw_super->encrypt_pw_salt, 0, 16); + goto out_err; + } +got_it: + memcpy(encrypt_pw_salt, sbi->raw_super->encrypt_pw_salt, 16); +out_err: + f2fs_up_write(&sbi->sb_lock); + mnt_drop_write_file(filp); + + if (!err && copy_to_user((__u8 __user *)arg, encrypt_pw_salt, 16)) + err = -EFAULT; + + return err; +} + +static int f2fs_ioc_get_encryption_policy_ex(struct file *filp, + unsigned long arg) +{ + if (!f2fs_sb_has_encrypt(F2FS_I_SB(file_inode(filp)))) + return -EOPNOTSUPP; + + return fscrypt_ioctl_get_policy_ex(filp, (void __user *)arg); +} + +static int f2fs_ioc_add_encryption_key(struct file *filp, unsigned long arg) +{ + if (!f2fs_sb_has_encrypt(F2FS_I_SB(file_inode(filp)))) + return -EOPNOTSUPP; + + return fscrypt_ioctl_add_key(filp, (void __user *)arg); +} + +static int f2fs_ioc_remove_encryption_key(struct file *filp, unsigned long arg) +{ + if (!f2fs_sb_has_encrypt(F2FS_I_SB(file_inode(filp)))) + return -EOPNOTSUPP; + + return fscrypt_ioctl_remove_key(filp, (void __user *)arg); +} + +static int f2fs_ioc_remove_encryption_key_all_users(struct file *filp, + 
unsigned long arg) +{ + if (!f2fs_sb_has_encrypt(F2FS_I_SB(file_inode(filp)))) + return -EOPNOTSUPP; + + return fscrypt_ioctl_remove_key_all_users(filp, (void __user *)arg); +} + +static int f2fs_ioc_get_encryption_key_status(struct file *filp, + unsigned long arg) +{ + if (!f2fs_sb_has_encrypt(F2FS_I_SB(file_inode(filp)))) + return -EOPNOTSUPP; + + return fscrypt_ioctl_get_key_status(filp, (void __user *)arg); +} + +static int f2fs_ioc_get_encryption_nonce(struct file *filp, unsigned long arg) +{ + if (!f2fs_sb_has_encrypt(F2FS_I_SB(file_inode(filp)))) + return -EOPNOTSUPP; + + return fscrypt_ioctl_get_nonce(filp, (void __user *)arg); +} +/* + * GC ioctl: CAP_SYS_ADMIN only; arg points to a __u32 "sync" value. Non-zero + * requests FG_GC and waits for gc_lock, zero requests BG_GC and trylocks. + */ +static int f2fs_ioc_gc(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct f2fs_gc_control gc_control = { .victim_segno = NULL_SEGNO, + .no_bg_gc = false, + .should_migrate_blocks = false, + .nr_free_secs = 0 }; + __u32 sync; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (get_user(sync, (__u32 __user *)arg)) + return -EFAULT; + + if (f2fs_readonly(sbi->sb)) + return -EROFS; + + ret = mnt_want_write_file(filp); + if (ret) + return ret; + /* NOTE(review): gc_lock is not released in this function; presumably f2fs_gc() drops it -- confirm */ + if (!sync) { + if (!f2fs_down_write_trylock(&sbi->gc_lock)) { + ret = -EBUSY; + goto out; + } + } else { + f2fs_down_write(&sbi->gc_lock); + } + + gc_control.init_gc_type = sync ? FG_GC : BG_GC; + gc_control.err_gc_skipped = sync; + stat_inc_gc_call_count(sbi, FOREGROUND); + ret = f2fs_gc(sbi, &gc_control); +out: + mnt_drop_write_file(filp); + return ret; +} + +static int __f2fs_ioc_gc_range(struct file *filp, struct f2fs_gc_range *range) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(file_inode(filp)); + struct f2fs_gc_control gc_control = { + .init_gc_type = range->sync ? 
FG_GC : BG_GC, + .no_bg_gc = false, + .should_migrate_blocks = false, + .err_gc_skipped = range->sync, + .nr_free_secs = 0 }; + u64 end; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (f2fs_readonly(sbi->sb)) + return -EROFS; + /* reject wrapped ranges and ranges outside [MAIN_BLKADDR, MAX_BLKADDR) */ + end = range->start + range->len; + if (end < range->start || range->start < MAIN_BLKADDR(sbi) || + end >= MAX_BLKADDR(sbi)) + return -EINVAL; + + ret = mnt_want_write_file(filp); + if (ret) + return ret; + /* one f2fs_gc() call per pass; start advances by CAP_BLKS_PER_SEC until it passes end */ +do_more: + if (!range->sync) { + if (!f2fs_down_write_trylock(&sbi->gc_lock)) { + ret = -EBUSY; + goto out; + } + } else { + f2fs_down_write(&sbi->gc_lock); + } + + gc_control.victim_segno = GET_SEGNO(sbi, range->start); + stat_inc_gc_call_count(sbi, FOREGROUND); + ret = f2fs_gc(sbi, &gc_control); + if (ret) { + if (ret == -EBUSY) + ret = -EAGAIN; + goto out; + } + range->start += CAP_BLKS_PER_SEC(sbi); + if (range->start <= end) + goto do_more; +out: + mnt_drop_write_file(filp); + return ret; +} +/* Copy the f2fs_gc_range from userspace and run __f2fs_ioc_gc_range() on it */ +static int f2fs_ioc_gc_range(struct file *filp, unsigned long arg) +{ + struct f2fs_gc_range range; + + if (copy_from_user(&range, (struct f2fs_gc_range __user *)arg, + sizeof(range))) + return -EFAULT; + return __f2fs_ioc_gc_range(filp, &range); +} +/* + * Checkpoint ioctl: CAP_SYS_ADMIN only; -EROFS on a read-only fs and -EINVAL + * while checkpoints are disabled, otherwise the result of f2fs_sync_fs(). + */ +static int f2fs_ioc_write_checkpoint(struct file *filp) +{ + struct inode *inode = file_inode(filp); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (f2fs_readonly(sbi->sb)) + return -EROFS; + + if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) { + f2fs_info(sbi, "Skipping Checkpoint. 
Checkpoints currently disabled."); + return -EINVAL; + } + + ret = mnt_want_write_file(filp); + if (ret) + return ret; + + ret = f2fs_sync_fs(sbi->sb, 1); + + mnt_drop_write_file(filp); + return ret; +} + +static int f2fs_defragment_range(struct f2fs_sb_info *sbi, + struct file *filp, + struct f2fs_defragment *range) +{ + struct inode *inode = file_inode(filp); + struct f2fs_map_blocks map = { .m_next_extent = NULL, + .m_seg_type = NO_CHECK_TYPE, + .m_may_create = false }; + struct extent_info ei = {}; + pgoff_t pg_start, pg_end, next_pgofs; + unsigned int total = 0, sec_num; + block_t blk_end = 0; + bool fragmented = false; + int err; + + f2fs_balance_fs(sbi, true); + + inode_lock(inode); + pg_start = range->start >> PAGE_SHIFT; + pg_end = min_t(pgoff_t, + (range->start + range->len) >> PAGE_SHIFT, + DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE)); + + if (is_inode_flag_set(inode, FI_COMPRESS_RELEASED) || + f2fs_is_atomic_file(inode)) { + err = -EINVAL; + goto unlock_out; + } + + /* if in-place-update policy is enabled, don't waste time here */ + set_inode_flag(inode, FI_OPU_WRITE); + if (f2fs_should_update_inplace(inode, NULL)) { + err = -EINVAL; + goto out; + } + + /* writeback all dirty pages in the range */ + err = filemap_write_and_wait_range(inode->i_mapping, + pg_start << PAGE_SHIFT, + (pg_end << PAGE_SHIFT) - 1); + if (err) + goto out; - if (!inode_owner_or_capable(inode)) { - ret = -EACCES; + /* + * lookup mapping info in extent cache, skip defragmenting if physical + * block addresses are continuous. + */ + if (f2fs_lookup_read_extent_cache(inode, pg_start, &ei)) { + if ((pgoff_t)ei.fofs + ei.len >= pg_end) goto out; + } + + map.m_lblk = pg_start; + map.m_next_pgofs = &next_pgofs; + + /* + * lookup mapping info in dnode page cache, skip defragmenting if all + * physical block addresses are continuous even if there are hole(s) + * in logical blocks. 
+ */ + while (map.m_lblk < pg_end) { + map.m_len = pg_end - map.m_lblk; + err = f2fs_map_blocks(inode, &map, F2FS_GET_BLOCK_DEFAULT); + if (err) + goto out; + /* nothing mapped at m_lblk: skip ahead to next_pgofs */ + if (!(map.m_flags & F2FS_MAP_FLAGS)) { + map.m_lblk = next_pgofs; + continue; + } + /* this extent does not start where the previous one ended */ + if (blk_end && blk_end != map.m_pblk) + fragmented = true; + + /* record the total number of blocks that we're going to move */ + total += map.m_len; + + blk_end = map.m_pblk + map.m_len; + + map.m_lblk += map.m_len; + } + + if (!fragmented) { + total = 0; + goto out; + } + + sec_num = DIV_ROUND_UP(total, CAP_BLKS_PER_SEC(sbi)); + + /* + * Make sure there are enough free sections for LFS allocation; this + * avoids running defragmentation in SSR mode when free sections are + * being allocated intensively. + */ + if (has_not_enough_free_secs(sbi, 0, sec_num)) { + err = -EAGAIN; + goto out; + } + /* second pass: re-dirty the mapped pages in batches of at most BLKS_PER_SEG and write them out */ + map.m_lblk = pg_start; + map.m_len = pg_end - pg_start; + total = 0; + + while (map.m_lblk < pg_end) { + pgoff_t idx; + int cnt = 0; + +do_map: + map.m_len = pg_end - map.m_lblk; + err = f2fs_map_blocks(inode, &map, F2FS_GET_BLOCK_DEFAULT); + if (err) + goto clear_out; + + if (!(map.m_flags & F2FS_MAP_FLAGS)) { + map.m_lblk = next_pgofs; + goto check; } - if (get_user(flags, (int __user *) arg)) { - ret = -EFAULT; + set_inode_flag(inode, FI_SKIP_WRITES); + + idx = map.m_lblk; + while (idx < map.m_lblk + map.m_len && + cnt < BLKS_PER_SEG(sbi)) { + struct folio *folio; + + folio = f2fs_get_lock_data_folio(inode, idx, true); + if (IS_ERR(folio)) { + err = PTR_ERR(folio); + goto clear_out; + } + + f2fs_folio_wait_writeback(folio, DATA, true, true); + + folio_mark_dirty(folio); + folio_set_f2fs_gcing(folio); + f2fs_folio_put(folio, true); + + idx++; + cnt++; + total++; + } + + map.m_lblk = idx; +check: + if (map.m_lblk < pg_end && cnt < BLKS_PER_SEG(sbi)) + goto do_map; + + clear_inode_flag(inode, FI_SKIP_WRITES); + + err = filemap_fdatawrite(inode->i_mapping); + if (err) + goto out; + } +clear_out: + clear_inode_flag(inode, FI_SKIP_WRITES); +out: + 
clear_inode_flag(inode, FI_OPU_WRITE); +unlock_out: + inode_unlock(inode); + if (!err) + range->len = (u64)total << PAGE_SHIFT; + return err; +} +/* + * Defragment ioctl: CAP_SYS_ADMIN only, regular files on a writable fs. + * On success the f2fs_defragment range is copied back to userspace. + */ +static int f2fs_ioc_defragment(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct f2fs_defragment range; + int err; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (!S_ISREG(inode->i_mode)) + return -EINVAL; + + if (f2fs_readonly(sbi->sb)) + return -EROFS; + + if (copy_from_user(&range, (struct f2fs_defragment __user *)arg, + sizeof(range))) + return -EFAULT; + + /* start and len must both be multiples of F2FS_BLKSIZE */ + if (range.start & (F2FS_BLKSIZE - 1) || range.len & (F2FS_BLKSIZE - 1)) + return -EINVAL; + /* the end of the range must not exceed max_file_blocks() */ + if (unlikely((range.start + range.len) >> PAGE_SHIFT > + max_file_blocks(inode))) + return -EINVAL; + + err = mnt_want_write_file(filp); + if (err) + return err; + + err = f2fs_defragment_range(sbi, filp, &range); + mnt_drop_write_file(filp); + + if (range.len) + f2fs_update_time(sbi, REQ_TIME); + if (err < 0) + return err; + + if (copy_to_user((struct f2fs_defragment __user *)arg, &range, + sizeof(range))) + return -EFAULT; + + return 0; +} +/* + * Move len bytes of blocks from file_in at pos_in to file_out at pos_out. + * Both files must be unencrypted regular files on the same mount and sb. + */ +static int f2fs_move_file_range(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, size_t len) +{ + struct inode *src = file_inode(file_in); + struct inode *dst = file_inode(file_out); + struct f2fs_sb_info *sbi = F2FS_I_SB(src); + size_t olen = len, dst_max_i_size = 0; + size_t dst_osize; + int ret; + + if (file_in->f_path.mnt != file_out->f_path.mnt || + src->i_sb != dst->i_sb) + return -EXDEV; + + if (unlikely(f2fs_readonly(src->i_sb))) + return -EROFS; + + if (!S_ISREG(src->i_mode) || !S_ISREG(dst->i_mode)) + return -EINVAL; + + if (IS_ENCRYPTED(src) || IS_ENCRYPTED(dst)) + return -EOPNOTSUPP; + + if (pos_out < 0 || pos_in < 0) + return -EINVAL; + /* same inode: equal positions are a no-op; a destination starting inside the source range is rejected */ + if (src == dst) { + if (pos_in == pos_out) + return 0; + if (pos_out > pos_in && pos_out < pos_in + len) + return 
-EINVAL; + } + /* dst is only trylocked: -EBUSY is returned instead of waiting for it */ + inode_lock(src); + if (src != dst) { + ret = -EBUSY; + if (!inode_trylock(dst)) + goto out; + } + + if (f2fs_compressed_file(src) || f2fs_compressed_file(dst) || + f2fs_is_pinned_file(src) || f2fs_is_pinned_file(dst)) { + ret = -EOPNOTSUPP; + goto out_unlock; + } + + if (f2fs_is_atomic_file(src) || f2fs_is_atomic_file(dst)) { + ret = -EINVAL; + goto out_unlock; + } + /* len == 0 means "up to EOF"; a range ending at EOF is rounded up to F2FS_BLKSIZE */ + ret = -EINVAL; + if (pos_in + len > src->i_size || pos_in + len < pos_in) + goto out_unlock; + if (len == 0) + olen = len = src->i_size - pos_in; + if (pos_in + len == src->i_size) + len = ALIGN(src->i_size, F2FS_BLKSIZE) - pos_in; + if (len == 0) { + ret = 0; + goto out_unlock; + } + + dst_osize = dst->i_size; + if (pos_out + olen > dst->i_size) + dst_max_i_size = pos_out + olen; + + /* pos_in, pos_in + len and pos_out must be F2FS_BLKSIZE aligned (ret is still -EINVAL) */ + if (!IS_ALIGNED(pos_in, F2FS_BLKSIZE) || + !IS_ALIGNED(pos_in + len, F2FS_BLKSIZE) || + !IS_ALIGNED(pos_out, F2FS_BLKSIZE)) + goto out_unlock; + + ret = f2fs_convert_inline_inode(src); + if (ret) + goto out_unlock; + + ret = f2fs_convert_inline_inode(dst); + if (ret) + goto out_unlock; + + /* write back dirty pages of both ranges before exchanging blocks */ + ret = filemap_write_and_wait_range(src->i_mapping, + pos_in, pos_in + len); + if (ret) + goto out_unlock; + + ret = filemap_write_and_wait_range(dst->i_mapping, + pos_out, pos_out + len); + if (ret) + goto out_unlock; + + f2fs_balance_fs(sbi, true); + + f2fs_down_write(&F2FS_I(src)->i_gc_rwsem[WRITE]); + if (src != dst) { + ret = -EBUSY; + if (!f2fs_down_write_trylock(&F2FS_I(dst)->i_gc_rwsem[WRITE])) + goto out_src; + } + + f2fs_lock_op(sbi); + ret = __exchange_data_block(src, dst, F2FS_BYTES_TO_BLK(pos_in), + F2FS_BYTES_TO_BLK(pos_out), + F2FS_BYTES_TO_BLK(len), false); + + if (!ret) { + if (dst_max_i_size) + f2fs_i_size_write(dst, dst_max_i_size); + else if (dst_osize != dst->i_size) + f2fs_i_size_write(dst, dst_osize); + } + f2fs_unlock_op(sbi); + + if (src != dst) + f2fs_up_write(&F2FS_I(dst)->i_gc_rwsem[WRITE]); 
+out_src: + f2fs_up_write(&F2FS_I(src)->i_gc_rwsem[WRITE]); + if (ret) + goto out_unlock; + + inode_set_mtime_to_ts(src, inode_set_ctime_current(src)); + f2fs_mark_inode_dirty_sync(src, false); + if (src != dst) { + inode_set_mtime_to_ts(dst, inode_set_ctime_current(dst)); + f2fs_mark_inode_dirty_sync(dst, false); + } + f2fs_update_time(sbi, REQ_TIME); + +out_unlock: + if (src != dst) + inode_unlock(dst); +out: + inode_unlock(src); + return ret; +} +/* + * Source file must be open for read and write, and range->dst_fd must be a + * valid descriptor open for write; -EBADF otherwise. + */ +static int __f2fs_ioc_move_range(struct file *filp, + struct f2fs_move_range *range) +{ + int err; + + if (!(filp->f_mode & FMODE_READ) || + !(filp->f_mode & FMODE_WRITE)) + return -EBADF; + /* NOTE(review): CLASS(fd, ...) presumably drops the fd reference at scope exit -- confirm */ + CLASS(fd, dst)(range->dst_fd); + if (fd_empty(dst)) + return -EBADF; + + if (!(fd_file(dst)->f_mode & FMODE_WRITE)) + return -EBADF; + + err = mnt_want_write_file(filp); + if (err) + return err; + + err = f2fs_move_file_range(filp, range->pos_in, fd_file(dst), + range->pos_out, range->len); + + mnt_drop_write_file(filp); + return err; +} +/* Copy the f2fs_move_range from userspace and run __f2fs_ioc_move_range() on it */ +static int f2fs_ioc_move_range(struct file *filp, unsigned long arg) +{ + struct f2fs_move_range range; + + if (copy_from_user(&range, (struct f2fs_move_range __user *)arg, + sizeof(range))) + return -EFAULT; + return __f2fs_ioc_move_range(filp, &range); +} +/* + * Flush-device ioctl: CAP_SYS_ADMIN only; runs foreground GC with block + * migration over segments of device range.dev_num. + */ +static int f2fs_ioc_flush_device(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct sit_info *sm = SIT_I(sbi); + unsigned int start_segno = 0, end_segno = 0; + unsigned int dev_start_segno = 0, dev_end_segno = 0; + struct f2fs_flush_device range; + struct f2fs_gc_control gc_control = { + .init_gc_type = FG_GC, + .should_migrate_blocks = true, + .err_gc_skipped = true, + .nr_free_secs = 0 }; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (f2fs_readonly(sbi->sb)) + return -EROFS; + + if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) + return -EINVAL; + + if (copy_from_user(&range, (struct f2fs_flush_device __user *)arg, + 
sizeof(range))) + return -EFAULT; + /* needs a multi-device fs, a dev_num below the last device, and no large sections */ + if (!f2fs_is_multi_device(sbi) || sbi->s_ndevs - 1 <= range.dev_num || + __is_large_section(sbi)) { + f2fs_warn(sbi, "Can't flush %u in %d for SEGS_PER_SEC %u != 1", + range.dev_num, sbi->s_ndevs, SEGS_PER_SEC(sbi)); + return -EINVAL; + } + + ret = mnt_want_write_file(filp); + if (ret) + return ret; + + if (range.dev_num != 0) + dev_start_segno = GET_SEGNO(sbi, FDEV(range.dev_num).start_blk); + dev_end_segno = GET_SEGNO(sbi, FDEV(range.dev_num).end_blk); + /* resume from the recorded FLUSH_DEVICE victim if it lies inside this device, else from the device start */ + start_segno = sm->last_victim[FLUSH_DEVICE]; + if (start_segno < dev_start_segno || start_segno >= dev_end_segno) + start_segno = dev_start_segno; + end_segno = min(start_segno + range.segments, dev_end_segno); + + while (start_segno < end_segno) { + if (!f2fs_down_write_trylock(&sbi->gc_lock)) { + ret = -EBUSY; goto out; } + sm->last_victim[GC_CB] = end_segno + 1; + sm->last_victim[GC_GREEDY] = end_segno + 1; + sm->last_victim[ALLOC_NEXT] = end_segno + 1; + + gc_control.victim_segno = start_segno; + stat_inc_gc_call_count(sbi, FOREGROUND); + ret = f2fs_gc(sbi, &gc_control); + if (ret == -EAGAIN) + ret = 0; + else if (ret < 0) + break; + start_segno++; + } +out: + mnt_drop_write_file(filp); + return ret; +} +/* Report the raw superblock feature word to userspace, with F2FS_FEATURE_ATOMIC_WRITE always set */ +static int f2fs_ioc_get_features(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + u32 sb_feature = le32_to_cpu(F2FS_I_SB(inode)->raw_super->feature); + + /* Must validate to set it with SQLite behavior in Android. 
*/ + sb_feature |= F2FS_FEATURE_ATOMIC_WRITE; + + return put_user(sb_feature, (u32 __user *)arg); +} + +#ifdef CONFIG_QUOTA +int f2fs_transfer_project_quota(struct inode *inode, kprojid_t kprojid) +{ + struct dquot *transfer_to[MAXQUOTAS] = {}; + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct super_block *sb = sbi->sb; + int err; + /* get the dquot for kprojid; only the PRJQUOTA slot of transfer_to is filled */ + transfer_to[PRJQUOTA] = dqget(sb, make_kqid_projid(kprojid)); + if (IS_ERR(transfer_to[PRJQUOTA])) + return PTR_ERR(transfer_to[PRJQUOTA]); + /* a failed transfer flags the sb with SBI_QUOTA_NEED_REPAIR; the dquot is put either way */ + err = __dquot_transfer(inode, transfer_to); + if (err) + set_sbi_flag(sbi, SBI_QUOTA_NEED_REPAIR); + dqput(transfer_to[PRJQUOTA]); + return err; +} +/* Change the inode's project id to projid; returns 0 if it already matches */ +static int f2fs_ioc_setproject(struct inode *inode, __u32 projid) +{ + struct f2fs_inode_info *fi = F2FS_I(inode); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct f2fs_inode *ri = NULL; + kprojid_t kprojid; + int err; + /* without the project quota feature only F2FS_DEF_PROJID is accepted */ + if (!f2fs_sb_has_project_quota(sbi)) { + if (projid != F2FS_DEF_PROJID) + return -EOPNOTSUPP; + else + return 0; + } + + if (!f2fs_has_extra_attr(inode)) + return -EOPNOTSUPP; + + kprojid = make_kprojid(&init_user_ns, (projid_t)projid); + + if (projid_eq(kprojid, fi->i_projid)) + return 0; + + err = -EPERM; + /* Is it quota file? 
Do not allow user to mess with it */ + if (IS_NOQUOTA(inode)) + return err; + + if (!F2FS_FITS_IN_INODE(ri, fi->i_extra_isize, i_projid)) + return -EOVERFLOW; + + err = f2fs_dquot_initialize(inode); + if (err) + return err; + + f2fs_lock_op(sbi); + err = f2fs_transfer_project_quota(inode, kprojid); + if (err) + goto out_unlock; + + fi->i_projid = kprojid; + inode_set_ctime_current(inode); + f2fs_mark_inode_dirty_sync(inode, true); +out_unlock: + f2fs_unlock_op(sbi); + return err; +} +#else +int f2fs_transfer_project_quota(struct inode *inode, kprojid_t kprojid) +{ + return 0; +} + +static int f2fs_ioc_setproject(struct inode *inode, __u32 projid) +{ + if (projid != F2FS_DEF_PROJID) + return -EOPNOTSUPP; + return 0; +} +#endif + +int f2fs_fileattr_get(struct dentry *dentry, struct file_kattr *fa) +{ + struct inode *inode = d_inode(dentry); + struct f2fs_inode_info *fi = F2FS_I(inode); + u32 fsflags = f2fs_iflags_to_fsflags(fi->i_flags); + + if (IS_ENCRYPTED(inode)) + fsflags |= FS_ENCRYPT_FL; + if (IS_VERITY(inode)) + fsflags |= FS_VERITY_FL; + if (f2fs_has_inline_data(inode) || f2fs_has_inline_dentry(inode)) + fsflags |= FS_INLINE_DATA_FL; + if (is_inode_flag_set(inode, FI_PIN_FILE)) + fsflags |= FS_NOCOW_FL; + + fileattr_fill_flags(fa, fsflags & F2FS_GETTABLE_FS_FL); + + if (f2fs_sb_has_project_quota(F2FS_I_SB(inode))) + fa->fsx_projid = from_kprojid(&init_user_ns, fi->i_projid); + + return 0; +} + +int f2fs_fileattr_set(struct mnt_idmap *idmap, + struct dentry *dentry, struct file_kattr *fa) +{ + struct inode *inode = d_inode(dentry); + u32 fsflags = fa->flags, mask = F2FS_SETTABLE_FS_FL; + u32 iflags; + int err; + + if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) + return -EIO; + if (!f2fs_is_checkpoint_ready(F2FS_I_SB(inode))) + return -ENOSPC; + if (fsflags & ~F2FS_GETTABLE_FS_FL) + return -EOPNOTSUPP; + fsflags &= F2FS_SETTABLE_FS_FL; + if (!fa->flags_valid) + mask &= FS_COMMON_FL; + + iflags = f2fs_fsflags_to_iflags(fsflags); + if 
(f2fs_mask_flags(inode->i_mode, iflags) != iflags) + return -EOPNOTSUPP; + + err = f2fs_setflags_common(inode, iflags, f2fs_fsflags_to_iflags(mask)); + if (!err) + err = f2fs_ioc_setproject(inode, fa->fsx_projid); + + return err; +} + +int f2fs_pin_file_control(struct inode *inode, bool inc) +{ + struct f2fs_inode_info *fi = F2FS_I(inode); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + + if (IS_DEVICE_ALIASING(inode)) + return -EINVAL; + + if (fi->i_gc_failures >= sbi->gc_pin_file_threshold) { + f2fs_warn(sbi, "%s: Enable GC = ino %lx after %x GC trials", + __func__, inode->i_ino, fi->i_gc_failures); + clear_inode_flag(inode, FI_PIN_FILE); + return -EAGAIN; + } + + /* Use i_gc_failures for normal file as a risk signal. */ + if (inc) + f2fs_i_gc_failures_write(inode, fi->i_gc_failures + 1); + + return 0; +} + +static int f2fs_ioc_set_pin_file(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + __u32 pin; + int ret = 0; + + if (get_user(pin, (__u32 __user *)arg)) + return -EFAULT; + + if (!S_ISREG(inode->i_mode)) + return -EINVAL; + + if (f2fs_readonly(sbi->sb)) + return -EROFS; + + if (!pin && IS_DEVICE_ALIASING(inode)) + return -EOPNOTSUPP; + + ret = mnt_want_write_file(filp); + if (ret) + return ret; + + inode_lock(inode); + + if (f2fs_is_atomic_file(inode)) { + ret = -EINVAL; + goto out; + } + + if (!pin) { + clear_inode_flag(inode, FI_PIN_FILE); + f2fs_i_gc_failures_write(inode, 0); + goto done; + } else if (f2fs_is_pinned_file(inode)) { + goto done; + } + + if (F2FS_HAS_BLOCKS(inode)) { + ret = -EFBIG; + goto out; + } + + /* Let's allow file pinning on zoned device. 
*/ + if (!f2fs_sb_has_blkzoned(sbi) && + f2fs_should_update_outplace(inode, NULL)) { + ret = -EINVAL; + goto out; + } + + if (f2fs_pin_file_control(inode, false)) { + ret = -EAGAIN; + goto out; + } + + ret = f2fs_convert_inline_inode(inode); + if (ret) + goto out; + + if (!f2fs_disable_compressed_file(inode)) { + ret = -EOPNOTSUPP; + goto out; + } + + set_inode_flag(inode, FI_PIN_FILE); + ret = F2FS_I(inode)->i_gc_failures; +done: + f2fs_update_time(sbi, REQ_TIME); +out: + inode_unlock(inode); + mnt_drop_write_file(filp); + return ret; +} + +static int f2fs_ioc_get_pin_file(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + __u32 pin = 0; + + if (is_inode_flag_set(inode, FI_PIN_FILE)) + pin = F2FS_I(inode)->i_gc_failures; + return put_user(pin, (u32 __user *)arg); +} + +static int f2fs_ioc_get_dev_alias_file(struct file *filp, unsigned long arg) +{ + return put_user(IS_DEVICE_ALIASING(file_inode(filp)) ? 1 : 0, + (u32 __user *)arg); +} + +static int f2fs_ioc_io_prio(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + __u32 level; + + if (get_user(level, (__u32 __user *)arg)) + return -EFAULT; + + if (!S_ISREG(inode->i_mode) || level >= F2FS_IOPRIO_MAX) + return -EINVAL; + + inode_lock(inode); + F2FS_I(inode)->ioprio_hint = level; + inode_unlock(inode); + return 0; +} + +int f2fs_precache_extents(struct inode *inode) +{ + struct f2fs_inode_info *fi = F2FS_I(inode); + struct f2fs_map_blocks map; + pgoff_t m_next_extent; + loff_t end; + int err; + + if (is_inode_flag_set(inode, FI_NO_EXTENT)) + return -EOPNOTSUPP; + + map.m_lblk = 0; + map.m_pblk = 0; + map.m_next_pgofs = NULL; + map.m_next_extent = &m_next_extent; + map.m_seg_type = NO_CHECK_TYPE; + map.m_may_create = false; + end = F2FS_BLK_ALIGN(i_size_read(inode)); + + while (map.m_lblk < end) { + map.m_len = end - map.m_lblk; + + f2fs_down_write(&fi->i_gc_rwsem[WRITE]); + err = f2fs_map_blocks(inode, &map, F2FS_GET_BLOCK_PRECACHE); + 
f2fs_up_write(&fi->i_gc_rwsem[WRITE]); + if (err || !map.m_len) + return err; + + map.m_lblk = m_next_extent; + } + + return 0; +} + +static int f2fs_ioc_precache_extents(struct file *filp) +{ + return f2fs_precache_extents(file_inode(filp)); +} + +static int f2fs_ioc_resize_fs(struct file *filp, unsigned long arg) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(file_inode(filp)); + __u64 block_count; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (f2fs_readonly(sbi->sb)) + return -EROFS; + + if (copy_from_user(&block_count, (void __user *)arg, + sizeof(block_count))) + return -EFAULT; + + return f2fs_resize_fs(filp, block_count); +} + +static int f2fs_ioc_enable_verity(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + + f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); + + if (!f2fs_sb_has_verity(F2FS_I_SB(inode))) { + f2fs_warn(F2FS_I_SB(inode), + "Can't enable fs-verity on inode %lu: the verity feature is not enabled on this filesystem", + inode->i_ino); + return -EOPNOTSUPP; + } + + return fsverity_ioctl_enable(filp, (const void __user *)arg); +} + +static int f2fs_ioc_measure_verity(struct file *filp, unsigned long arg) +{ + if (!f2fs_sb_has_verity(F2FS_I_SB(file_inode(filp)))) + return -EOPNOTSUPP; + + return fsverity_ioctl_measure(filp, (void __user *)arg); +} + +static int f2fs_ioc_read_verity_metadata(struct file *filp, unsigned long arg) +{ + if (!f2fs_sb_has_verity(F2FS_I_SB(file_inode(filp)))) + return -EOPNOTSUPP; + + return fsverity_ioctl_read_metadata(filp, (const void __user *)arg); +} + +static int f2fs_ioc_getfslabel(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + char *vbuf; + int count; + int err = 0; + + vbuf = f2fs_kzalloc(sbi, MAX_VOLUME_NAME, GFP_KERNEL); + if (!vbuf) + return -ENOMEM; + + f2fs_down_read(&sbi->sb_lock); + count = utf16s_to_utf8s(sbi->raw_super->volume_name, + ARRAY_SIZE(sbi->raw_super->volume_name), + 
UTF16_LITTLE_ENDIAN, vbuf, MAX_VOLUME_NAME); + f2fs_up_read(&sbi->sb_lock); + + if (copy_to_user((char __user *)arg, vbuf, + min(FSLABEL_MAX, count))) + err = -EFAULT; + + kfree(vbuf); + return err; +} + +static int f2fs_ioc_setfslabel(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + char *vbuf; + int err = 0; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + vbuf = strndup_user((const char __user *)arg, FSLABEL_MAX); + if (IS_ERR(vbuf)) + return PTR_ERR(vbuf); + + err = mnt_want_write_file(filp); + if (err) + goto out; + + f2fs_down_write(&sbi->sb_lock); + + memset(sbi->raw_super->volume_name, 0, + sizeof(sbi->raw_super->volume_name)); + utf8s_to_utf16s(vbuf, strlen(vbuf), UTF16_LITTLE_ENDIAN, + sbi->raw_super->volume_name, + ARRAY_SIZE(sbi->raw_super->volume_name)); + + err = f2fs_commit_super(sbi, false); + + f2fs_up_write(&sbi->sb_lock); + + mnt_drop_write_file(filp); +out: + kfree(vbuf); + return err; +} + +static int f2fs_get_compress_blocks(struct inode *inode, __u64 *blocks) +{ + if (!f2fs_sb_has_compression(F2FS_I_SB(inode))) + return -EOPNOTSUPP; + + if (!f2fs_compressed_file(inode)) + return -EINVAL; + + *blocks = atomic_read(&F2FS_I(inode)->i_compr_blocks); + + return 0; +} + +static int f2fs_ioc_get_compress_blocks(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + __u64 blocks; + int ret; + + ret = f2fs_get_compress_blocks(inode, &blocks); + if (ret < 0) + return ret; + + return put_user(blocks, (u64 __user *)arg); +} + +static int release_compress_blocks(struct dnode_of_data *dn, pgoff_t count) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); + unsigned int released_blocks = 0; + int cluster_size = F2FS_I(dn->inode)->i_cluster_size; + block_t blkaddr; + int i; + + for (i = 0; i < count; i++) { + blkaddr = data_blkaddr(dn->inode, dn->node_folio, + dn->ofs_in_node + i); + + if (!__is_valid_data_blkaddr(blkaddr)) + continue; + if 
(unlikely(!f2fs_is_valid_blkaddr(sbi, blkaddr, + DATA_GENERIC_ENHANCE))) + return -EFSCORRUPTED; + } + + while (count) { + int compr_blocks = 0; + + for (i = 0; i < cluster_size; i++, dn->ofs_in_node++) { + blkaddr = f2fs_data_blkaddr(dn); + + if (i == 0) { + if (blkaddr == COMPRESS_ADDR) + continue; + dn->ofs_in_node += cluster_size; + goto next; + } + + if (__is_valid_data_blkaddr(blkaddr)) + compr_blocks++; + + if (blkaddr != NEW_ADDR) + continue; + + f2fs_set_data_blkaddr(dn, NULL_ADDR); + } + + f2fs_i_compr_blocks_update(dn->inode, compr_blocks, false); + dec_valid_block_count(sbi, dn->inode, + cluster_size - compr_blocks); + + released_blocks += cluster_size - compr_blocks; +next: + count -= cluster_size; + } + + return released_blocks; +} + +static int f2fs_release_compress_blocks(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + struct f2fs_inode_info *fi = F2FS_I(inode); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + pgoff_t page_idx = 0, last_idx; + unsigned int released_blocks = 0; + int ret; + int writecount; + + if (!f2fs_sb_has_compression(sbi)) + return -EOPNOTSUPP; + + if (f2fs_readonly(sbi->sb)) + return -EROFS; + + ret = mnt_want_write_file(filp); + if (ret) + return ret; + + f2fs_balance_fs(sbi, true); + + inode_lock(inode); + + writecount = atomic_read(&inode->i_writecount); + if ((filp->f_mode & FMODE_WRITE && writecount != 1) || + (!(filp->f_mode & FMODE_WRITE) && writecount)) { + ret = -EBUSY; + goto out; + } + + if (!f2fs_compressed_file(inode) || + is_inode_flag_set(inode, FI_COMPRESS_RELEASED)) { + ret = -EINVAL; + goto out; + } + + ret = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX); + if (ret) + goto out; + + if (!atomic_read(&fi->i_compr_blocks)) { + ret = -EPERM; + goto out; + } + + set_inode_flag(inode, FI_COMPRESS_RELEASED); + inode_set_ctime_current(inode); + f2fs_mark_inode_dirty_sync(inode, true); + + f2fs_down_write(&fi->i_gc_rwsem[WRITE]); + 
filemap_invalidate_lock(inode->i_mapping); + + last_idx = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); + + while (page_idx < last_idx) { + struct dnode_of_data dn; + pgoff_t end_offset, count; + + f2fs_lock_op(sbi); + + set_new_dnode(&dn, inode, NULL, NULL, 0); + ret = f2fs_get_dnode_of_data(&dn, page_idx, LOOKUP_NODE); + if (ret) { + f2fs_unlock_op(sbi); + if (ret == -ENOENT) { + page_idx = f2fs_get_next_page_offset(&dn, + page_idx); + ret = 0; + continue; + } + break; + } + + end_offset = ADDRS_PER_PAGE(dn.node_folio, inode); + count = min(end_offset - dn.ofs_in_node, last_idx - page_idx); + count = round_up(count, fi->i_cluster_size); + + ret = release_compress_blocks(&dn, count); + + f2fs_put_dnode(&dn); + + f2fs_unlock_op(sbi); + + if (ret < 0) + break; + + page_idx += count; + released_blocks += ret; + } + + filemap_invalidate_unlock(inode->i_mapping); + f2fs_up_write(&fi->i_gc_rwsem[WRITE]); +out: + if (released_blocks) + f2fs_update_time(sbi, REQ_TIME); + inode_unlock(inode); + + mnt_drop_write_file(filp); + + if (ret >= 0) { + ret = put_user(released_blocks, (u64 __user *)arg); + } else if (released_blocks && + atomic_read(&fi->i_compr_blocks)) { + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_warn(sbi, "%s: partial blocks were released i_ino=%lx " + "iblocks=%llu, released=%u, compr_blocks=%u, " + "run fsck to fix.", + __func__, inode->i_ino, inode->i_blocks, + released_blocks, + atomic_read(&fi->i_compr_blocks)); + } + + return ret; +} + +static int reserve_compress_blocks(struct dnode_of_data *dn, pgoff_t count, + unsigned int *reserved_blocks) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); + int cluster_size = F2FS_I(dn->inode)->i_cluster_size; + block_t blkaddr; + int i; + + for (i = 0; i < count; i++) { + blkaddr = data_blkaddr(dn->inode, dn->node_folio, + dn->ofs_in_node + i); + + if (!__is_valid_data_blkaddr(blkaddr)) + continue; + if (unlikely(!f2fs_is_valid_blkaddr(sbi, blkaddr, + DATA_GENERIC_ENHANCE))) + return -EFSCORRUPTED; + } + + while 
(count) { + int compr_blocks = 0; + blkcnt_t reserved = 0; + blkcnt_t to_reserved; + int ret; + + for (i = 0; i < cluster_size; i++) { + blkaddr = data_blkaddr(dn->inode, dn->node_folio, + dn->ofs_in_node + i); + + if (i == 0) { + if (blkaddr != COMPRESS_ADDR) { + dn->ofs_in_node += cluster_size; + goto next; + } + continue; + } + + /* + * compressed cluster was not released due to it + * fails in release_compress_blocks(), so NEW_ADDR + * is a possible case. + */ + if (blkaddr == NEW_ADDR) { + reserved++; + continue; + } + if (__is_valid_data_blkaddr(blkaddr)) { + compr_blocks++; + continue; + } + } + + to_reserved = cluster_size - compr_blocks - reserved; + + /* for the case all blocks in cluster were reserved */ + if (reserved && to_reserved == 1) { + dn->ofs_in_node += cluster_size; + goto next; + } + + ret = inc_valid_block_count(sbi, dn->inode, + &to_reserved, false); + if (unlikely(ret)) + return ret; + + for (i = 0; i < cluster_size; i++, dn->ofs_in_node++) { + if (f2fs_data_blkaddr(dn) == NULL_ADDR) + f2fs_set_data_blkaddr(dn, NEW_ADDR); + } + + f2fs_i_compr_blocks_update(dn->inode, compr_blocks, true); - flags = f2fs_mask_flags(inode->i_mode, flags); + *reserved_blocks += to_reserved; +next: + count -= cluster_size; + } + + return 0; +} + +static int f2fs_reserve_compress_blocks(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + struct f2fs_inode_info *fi = F2FS_I(inode); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + pgoff_t page_idx = 0, last_idx; + unsigned int reserved_blocks = 0; + int ret; + + if (!f2fs_sb_has_compression(sbi)) + return -EOPNOTSUPP; + + if (f2fs_readonly(sbi->sb)) + return -EROFS; + + ret = mnt_want_write_file(filp); + if (ret) + return ret; + + f2fs_balance_fs(sbi, true); + + inode_lock(inode); + + if (!f2fs_compressed_file(inode) || + !is_inode_flag_set(inode, FI_COMPRESS_RELEASED)) { + ret = -EINVAL; + goto unlock_inode; + } - mutex_lock(&inode->i_mutex); + if (atomic_read(&fi->i_compr_blocks)) 
+ goto unlock_inode; - oldflags = fi->i_flags; + f2fs_down_write(&fi->i_gc_rwsem[WRITE]); + filemap_invalidate_lock(inode->i_mapping); - if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) { - if (!capable(CAP_LINUX_IMMUTABLE)) { - mutex_unlock(&inode->i_mutex); - ret = -EPERM; + last_idx = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); + + while (page_idx < last_idx) { + struct dnode_of_data dn; + pgoff_t end_offset, count; + + f2fs_lock_op(sbi); + + set_new_dnode(&dn, inode, NULL, NULL, 0); + ret = f2fs_get_dnode_of_data(&dn, page_idx, LOOKUP_NODE); + if (ret) { + f2fs_unlock_op(sbi); + if (ret == -ENOENT) { + page_idx = f2fs_get_next_page_offset(&dn, + page_idx); + ret = 0; + continue; + } + break; + } + + end_offset = ADDRS_PER_PAGE(dn.node_folio, inode); + count = min(end_offset - dn.ofs_in_node, last_idx - page_idx); + count = round_up(count, fi->i_cluster_size); + + ret = reserve_compress_blocks(&dn, count, &reserved_blocks); + + f2fs_put_dnode(&dn); + + f2fs_unlock_op(sbi); + + if (ret < 0) + break; + + page_idx += count; + } + + filemap_invalidate_unlock(inode->i_mapping); + f2fs_up_write(&fi->i_gc_rwsem[WRITE]); + + if (!ret) { + clear_inode_flag(inode, FI_COMPRESS_RELEASED); + inode_set_ctime_current(inode); + f2fs_mark_inode_dirty_sync(inode, true); + } +unlock_inode: + if (reserved_blocks) + f2fs_update_time(sbi, REQ_TIME); + inode_unlock(inode); + mnt_drop_write_file(filp); + + if (!ret) { + ret = put_user(reserved_blocks, (u64 __user *)arg); + } else if (reserved_blocks && + atomic_read(&fi->i_compr_blocks)) { + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_warn(sbi, "%s: partial blocks were reserved i_ino=%lx " + "iblocks=%llu, reserved=%u, compr_blocks=%u, " + "run fsck to fix.", + __func__, inode->i_ino, inode->i_blocks, + reserved_blocks, + atomic_read(&fi->i_compr_blocks)); + } + + return ret; +} + +static int f2fs_secure_erase(struct block_device *bdev, struct inode *inode, + pgoff_t off, block_t block, block_t len, u32 flags) +{ + sector_t 
sector = SECTOR_FROM_BLOCK(block); + sector_t nr_sects = SECTOR_FROM_BLOCK(len); + int ret = 0; + + if (flags & F2FS_TRIM_FILE_DISCARD) { + if (bdev_max_secure_erase_sectors(bdev)) + ret = blkdev_issue_secure_erase(bdev, sector, nr_sects, + GFP_NOFS); + else + ret = blkdev_issue_discard(bdev, sector, nr_sects, + GFP_NOFS); + } + + if (!ret && (flags & F2FS_TRIM_FILE_ZEROOUT)) { + if (IS_ENCRYPTED(inode)) + ret = fscrypt_zeroout_range(inode, off, block, len); + else + ret = blkdev_issue_zeroout(bdev, sector, nr_sects, + GFP_NOFS, 0); + } + + return ret; +} + +static int f2fs_sec_trim_file(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct address_space *mapping = inode->i_mapping; + struct block_device *prev_bdev = NULL; + struct f2fs_sectrim_range range; + pgoff_t index, pg_end, prev_index = 0; + block_t prev_block = 0, len = 0; + loff_t end_addr; + bool to_end = false; + int ret = 0; + + if (!(filp->f_mode & FMODE_WRITE)) + return -EBADF; + + if (copy_from_user(&range, (struct f2fs_sectrim_range __user *)arg, + sizeof(range))) + return -EFAULT; + + if (range.flags == 0 || (range.flags & ~F2FS_TRIM_FILE_MASK) || + !S_ISREG(inode->i_mode)) + return -EINVAL; + + if (((range.flags & F2FS_TRIM_FILE_DISCARD) && + !f2fs_hw_support_discard(sbi)) || + ((range.flags & F2FS_TRIM_FILE_ZEROOUT) && + IS_ENCRYPTED(inode) && f2fs_is_multi_device(sbi))) + return -EOPNOTSUPP; + + ret = mnt_want_write_file(filp); + if (ret) + return ret; + inode_lock(inode); + + if (f2fs_is_atomic_file(inode) || f2fs_compressed_file(inode) || + range.start >= inode->i_size) { + ret = -EINVAL; + goto err; + } + + if (range.len == 0) + goto err; + + if (inode->i_size - range.start > range.len) { + end_addr = range.start + range.len; + } else { + end_addr = range.len == (u64)-1 ? 
+ sbi->sb->s_maxbytes : inode->i_size; + to_end = true; + } + + if (!IS_ALIGNED(range.start, F2FS_BLKSIZE) || + (!to_end && !IS_ALIGNED(end_addr, F2FS_BLKSIZE))) { + ret = -EINVAL; + goto err; + } + + index = F2FS_BYTES_TO_BLK(range.start); + pg_end = DIV_ROUND_UP(end_addr, F2FS_BLKSIZE); + + ret = f2fs_convert_inline_inode(inode); + if (ret) + goto err; + + f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + filemap_invalidate_lock(mapping); + + ret = filemap_write_and_wait_range(mapping, range.start, + to_end ? LLONG_MAX : end_addr - 1); + if (ret) + goto out; + + truncate_inode_pages_range(mapping, range.start, + to_end ? -1 : end_addr - 1); + + while (index < pg_end) { + struct dnode_of_data dn; + pgoff_t end_offset, count; + int i; + + set_new_dnode(&dn, inode, NULL, NULL, 0); + ret = f2fs_get_dnode_of_data(&dn, index, LOOKUP_NODE); + if (ret) { + if (ret == -ENOENT) { + index = f2fs_get_next_page_offset(&dn, index); + continue; + } + goto out; + } + + end_offset = ADDRS_PER_PAGE(dn.node_folio, inode); + count = min(end_offset - dn.ofs_in_node, pg_end - index); + for (i = 0; i < count; i++, index++, dn.ofs_in_node++) { + struct block_device *cur_bdev; + block_t blkaddr = f2fs_data_blkaddr(&dn); + + if (!__is_valid_data_blkaddr(blkaddr)) + continue; + + if (!f2fs_is_valid_blkaddr(sbi, blkaddr, + DATA_GENERIC_ENHANCE)) { + ret = -EFSCORRUPTED; + f2fs_put_dnode(&dn); goto out; } + + cur_bdev = f2fs_target_device(sbi, blkaddr, NULL); + if (f2fs_is_multi_device(sbi)) { + int di = f2fs_target_device_index(sbi, blkaddr); + + blkaddr -= FDEV(di).start_blk; + } + + if (len) { + if (prev_bdev == cur_bdev && + index == prev_index + len && + blkaddr == prev_block + len) { + len++; + } else { + ret = f2fs_secure_erase(prev_bdev, + inode, prev_index, prev_block, + len, range.flags); + if (ret) { + f2fs_put_dnode(&dn); + goto out; + } + + len = 0; + } + } + + if (!len) { + prev_bdev = cur_bdev; + prev_index = index; + prev_block = blkaddr; + len = 1; + } } - flags = flags 
& FS_FL_USER_MODIFIABLE; - flags |= oldflags & ~FS_FL_USER_MODIFIABLE; - fi->i_flags = flags; - mutex_unlock(&inode->i_mutex); + f2fs_put_dnode(&dn); + + if (fatal_signal_pending(current)) { + ret = -EINTR; + goto out; + } + cond_resched(); + } - f2fs_set_inode_flags(inode); - inode->i_ctime = CURRENT_TIME; - mark_inode_dirty(inode); + if (len) + ret = f2fs_secure_erase(prev_bdev, inode, prev_index, + prev_block, len, range.flags); + f2fs_update_time(sbi, REQ_TIME); out: - mnt_drop_write_file(filp); + filemap_invalidate_unlock(mapping); + f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); +err: + inode_unlock(inode); + mnt_drop_write_file(filp); + + return ret; +} + +static int f2fs_ioc_get_compress_option(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + struct f2fs_comp_option option; + + if (!f2fs_sb_has_compression(F2FS_I_SB(inode))) + return -EOPNOTSUPP; + + inode_lock_shared(inode); + + if (!f2fs_compressed_file(inode)) { + inode_unlock_shared(inode); + return -ENODATA; + } + + option.algorithm = F2FS_I(inode)->i_compress_algorithm; + option.log_cluster_size = F2FS_I(inode)->i_log_cluster_size; + + inode_unlock_shared(inode); + + if (copy_to_user((struct f2fs_comp_option __user *)arg, &option, + sizeof(option))) + return -EFAULT; + + return 0; +} + +static int f2fs_ioc_set_compress_option(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + struct f2fs_inode_info *fi = F2FS_I(inode); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct f2fs_comp_option option; + int ret = 0; + + if (!f2fs_sb_has_compression(sbi)) + return -EOPNOTSUPP; + + if (!(filp->f_mode & FMODE_WRITE)) + return -EBADF; + + if (copy_from_user(&option, (struct f2fs_comp_option __user *)arg, + sizeof(option))) + return -EFAULT; + + if (option.log_cluster_size < MIN_COMPRESS_LOG_SIZE || + option.log_cluster_size > MAX_COMPRESS_LOG_SIZE || + option.algorithm >= COMPRESS_MAX) + return -EINVAL; + + ret = 
mnt_want_write_file(filp); + if (ret) + return ret; + inode_lock(inode); + + f2fs_down_write(&F2FS_I(inode)->i_sem); + if (!f2fs_compressed_file(inode)) { + ret = -EINVAL; + goto out; + } + + if (f2fs_is_mmap_file(inode) || get_dirty_pages(inode)) { + ret = -EBUSY; + goto out; + } + + if (F2FS_HAS_BLOCKS(inode)) { + ret = -EFBIG; + goto out; + } + + fi->i_compress_algorithm = option.algorithm; + fi->i_log_cluster_size = option.log_cluster_size; + fi->i_cluster_size = BIT(option.log_cluster_size); + /* Set default level */ + if (fi->i_compress_algorithm == COMPRESS_ZSTD) + fi->i_compress_level = F2FS_ZSTD_DEFAULT_CLEVEL; + else + fi->i_compress_level = 0; + /* Adjust mount option level */ + if (option.algorithm == F2FS_OPTION(sbi).compress_algorithm && + F2FS_OPTION(sbi).compress_level) + fi->i_compress_level = F2FS_OPTION(sbi).compress_level; + f2fs_mark_inode_dirty_sync(inode, true); + + if (!f2fs_is_compress_backend_ready(inode)) + f2fs_warn(sbi, "compression algorithm is successfully set, " + "but current kernel doesn't support this algorithm."); +out: + f2fs_up_write(&fi->i_sem); + inode_unlock(inode); + mnt_drop_write_file(filp); + + return ret; +} + +static int redirty_blocks(struct inode *inode, pgoff_t page_idx, int len) +{ + DEFINE_READAHEAD(ractl, NULL, NULL, inode->i_mapping, page_idx); + struct address_space *mapping = inode->i_mapping; + struct folio *folio; + pgoff_t redirty_idx = page_idx; + int page_len = 0, ret = 0; + + page_cache_ra_unbounded(&ractl, len, 0); + + do { + folio = read_cache_folio(mapping, page_idx, NULL, NULL); + if (IS_ERR(folio)) { + ret = PTR_ERR(folio); + break; + } + page_len += folio_nr_pages(folio) - (page_idx - folio->index); + page_idx = folio_next_index(folio); + } while (page_len < len); + + do { + folio = filemap_lock_folio(mapping, redirty_idx); + + /* It will never fail, when folio has pinned above */ + f2fs_bug_on(F2FS_I_SB(inode), IS_ERR(folio)); + + f2fs_folio_wait_writeback(folio, DATA, true, true); + + 
folio_mark_dirty(folio); + folio_set_f2fs_gcing(folio); + redirty_idx = folio_next_index(folio); + folio_unlock(folio); + folio_put_refs(folio, 2); + } while (redirty_idx < page_idx); + + return ret; +} + +static int f2fs_ioc_decompress_file(struct file *filp) +{ + struct inode *inode = file_inode(filp); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct f2fs_inode_info *fi = F2FS_I(inode); + pgoff_t page_idx = 0, last_idx, cluster_idx; + int ret; + + if (!f2fs_sb_has_compression(sbi) || + F2FS_OPTION(sbi).compress_mode != COMPR_MODE_USER) + return -EOPNOTSUPP; + + if (!(filp->f_mode & FMODE_WRITE)) + return -EBADF; + + f2fs_balance_fs(sbi, true); + + ret = mnt_want_write_file(filp); + if (ret) + return ret; + inode_lock(inode); + + if (!f2fs_is_compress_backend_ready(inode)) { + ret = -EOPNOTSUPP; + goto out; + } + + if (!f2fs_compressed_file(inode) || + is_inode_flag_set(inode, FI_COMPRESS_RELEASED)) { + ret = -EINVAL; + goto out; + } + + ret = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX); + if (ret) + goto out; + + if (!atomic_read(&fi->i_compr_blocks)) + goto out; + + last_idx = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); + last_idx >>= fi->i_log_cluster_size; + + for (cluster_idx = 0; cluster_idx < last_idx; cluster_idx++) { + page_idx = cluster_idx << fi->i_log_cluster_size; + + if (!f2fs_is_compressed_cluster(inode, page_idx)) + continue; + + ret = redirty_blocks(inode, page_idx, fi->i_cluster_size); + if (ret < 0) + break; + + if (get_dirty_pages(inode) >= BLKS_PER_SEG(sbi)) { + ret = filemap_fdatawrite(inode->i_mapping); + if (ret < 0) + break; + } + + cond_resched(); + if (fatal_signal_pending(current)) { + ret = -EINTR; + break; + } + } + + if (!ret) + ret = filemap_write_and_wait_range(inode->i_mapping, 0, + LLONG_MAX); + + if (ret) + f2fs_warn(sbi, "%s: The file might be partially decompressed (errno=%d). 
Please delete the file.", + __func__, ret); + f2fs_update_time(sbi, REQ_TIME); +out: + inode_unlock(inode); + mnt_drop_write_file(filp); + + return ret; +} + +static int f2fs_ioc_compress_file(struct file *filp) +{ + struct inode *inode = file_inode(filp); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct f2fs_inode_info *fi = F2FS_I(inode); + pgoff_t page_idx = 0, last_idx, cluster_idx; + int ret; + + if (!f2fs_sb_has_compression(sbi) || + F2FS_OPTION(sbi).compress_mode != COMPR_MODE_USER) + return -EOPNOTSUPP; + + if (!(filp->f_mode & FMODE_WRITE)) + return -EBADF; + + f2fs_balance_fs(sbi, true); + + ret = mnt_want_write_file(filp); + if (ret) return ret; + inode_lock(inode); + + if (!f2fs_is_compress_backend_ready(inode)) { + ret = -EOPNOTSUPP; + goto out; + } + + if (!f2fs_compressed_file(inode) || + is_inode_flag_set(inode, FI_COMPRESS_RELEASED)) { + ret = -EINVAL; + goto out; + } + + ret = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX); + if (ret) + goto out; + + set_inode_flag(inode, FI_ENABLE_COMPRESS); + + last_idx = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); + last_idx >>= fi->i_log_cluster_size; + + for (cluster_idx = 0; cluster_idx < last_idx; cluster_idx++) { + page_idx = cluster_idx << fi->i_log_cluster_size; + + if (f2fs_is_sparse_cluster(inode, page_idx)) + continue; + + ret = redirty_blocks(inode, page_idx, fi->i_cluster_size); + if (ret < 0) + break; + + if (get_dirty_pages(inode) >= BLKS_PER_SEG(sbi)) { + ret = filemap_fdatawrite(inode->i_mapping); + if (ret < 0) + break; + } + + cond_resched(); + if (fatal_signal_pending(current)) { + ret = -EINTR; + break; + } } + + if (!ret) + ret = filemap_write_and_wait_range(inode->i_mapping, 0, + LLONG_MAX); + + clear_inode_flag(inode, FI_ENABLE_COMPRESS); + + if (ret) + f2fs_warn(sbi, "%s: The file might be partially compressed (errno=%d). 
Please delete the file.", + __func__, ret); + f2fs_update_time(sbi, REQ_TIME); +out: + inode_unlock(inode); + mnt_drop_write_file(filp); + + return ret; +} + +static long __f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + switch (cmd) { + case FS_IOC_GETVERSION: + return f2fs_ioc_getversion(filp, arg); + case F2FS_IOC_START_ATOMIC_WRITE: + return f2fs_ioc_start_atomic_write(filp, false); + case F2FS_IOC_START_ATOMIC_REPLACE: + return f2fs_ioc_start_atomic_write(filp, true); + case F2FS_IOC_COMMIT_ATOMIC_WRITE: + return f2fs_ioc_commit_atomic_write(filp); + case F2FS_IOC_ABORT_ATOMIC_WRITE: + return f2fs_ioc_abort_atomic_write(filp); + case F2FS_IOC_START_VOLATILE_WRITE: + case F2FS_IOC_RELEASE_VOLATILE_WRITE: + return -EOPNOTSUPP; + case F2FS_IOC_SHUTDOWN: + return f2fs_ioc_shutdown(filp, arg); + case FITRIM: + return f2fs_ioc_fitrim(filp, arg); + case FS_IOC_SET_ENCRYPTION_POLICY: + return f2fs_ioc_set_encryption_policy(filp, arg); + case FS_IOC_GET_ENCRYPTION_POLICY: + return f2fs_ioc_get_encryption_policy(filp, arg); + case FS_IOC_GET_ENCRYPTION_PWSALT: + return f2fs_ioc_get_encryption_pwsalt(filp, arg); + case FS_IOC_GET_ENCRYPTION_POLICY_EX: + return f2fs_ioc_get_encryption_policy_ex(filp, arg); + case FS_IOC_ADD_ENCRYPTION_KEY: + return f2fs_ioc_add_encryption_key(filp, arg); + case FS_IOC_REMOVE_ENCRYPTION_KEY: + return f2fs_ioc_remove_encryption_key(filp, arg); + case FS_IOC_REMOVE_ENCRYPTION_KEY_ALL_USERS: + return f2fs_ioc_remove_encryption_key_all_users(filp, arg); + case FS_IOC_GET_ENCRYPTION_KEY_STATUS: + return f2fs_ioc_get_encryption_key_status(filp, arg); + case FS_IOC_GET_ENCRYPTION_NONCE: + return f2fs_ioc_get_encryption_nonce(filp, arg); + case F2FS_IOC_GARBAGE_COLLECT: + return f2fs_ioc_gc(filp, arg); + case F2FS_IOC_GARBAGE_COLLECT_RANGE: + return f2fs_ioc_gc_range(filp, arg); + case F2FS_IOC_WRITE_CHECKPOINT: + return f2fs_ioc_write_checkpoint(filp); + case F2FS_IOC_DEFRAGMENT: + return f2fs_ioc_defragment(filp, arg); + 
case F2FS_IOC_MOVE_RANGE: + return f2fs_ioc_move_range(filp, arg); + case F2FS_IOC_FLUSH_DEVICE: + return f2fs_ioc_flush_device(filp, arg); + case F2FS_IOC_GET_FEATURES: + return f2fs_ioc_get_features(filp, arg); + case F2FS_IOC_GET_PIN_FILE: + return f2fs_ioc_get_pin_file(filp, arg); + case F2FS_IOC_SET_PIN_FILE: + return f2fs_ioc_set_pin_file(filp, arg); + case F2FS_IOC_PRECACHE_EXTENTS: + return f2fs_ioc_precache_extents(filp); + case F2FS_IOC_RESIZE_FS: + return f2fs_ioc_resize_fs(filp, arg); + case FS_IOC_ENABLE_VERITY: + return f2fs_ioc_enable_verity(filp, arg); + case FS_IOC_MEASURE_VERITY: + return f2fs_ioc_measure_verity(filp, arg); + case FS_IOC_READ_VERITY_METADATA: + return f2fs_ioc_read_verity_metadata(filp, arg); + case FS_IOC_GETFSLABEL: + return f2fs_ioc_getfslabel(filp, arg); + case FS_IOC_SETFSLABEL: + return f2fs_ioc_setfslabel(filp, arg); + case F2FS_IOC_GET_COMPRESS_BLOCKS: + return f2fs_ioc_get_compress_blocks(filp, arg); + case F2FS_IOC_RELEASE_COMPRESS_BLOCKS: + return f2fs_release_compress_blocks(filp, arg); + case F2FS_IOC_RESERVE_COMPRESS_BLOCKS: + return f2fs_reserve_compress_blocks(filp, arg); + case F2FS_IOC_SEC_TRIM_FILE: + return f2fs_sec_trim_file(filp, arg); + case F2FS_IOC_GET_COMPRESS_OPTION: + return f2fs_ioc_get_compress_option(filp, arg); + case F2FS_IOC_SET_COMPRESS_OPTION: + return f2fs_ioc_set_compress_option(filp, arg); + case F2FS_IOC_DECOMPRESS_FILE: + return f2fs_ioc_decompress_file(filp); + case F2FS_IOC_COMPRESS_FILE: + return f2fs_ioc_compress_file(filp); + case F2FS_IOC_GET_DEV_ALIAS_FILE: + return f2fs_ioc_get_dev_alias_file(filp, arg); + case F2FS_IOC_IO_PRIO: + return f2fs_ioc_io_prio(filp, arg); default: return -ENOTTY; } } +long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + if (unlikely(f2fs_cp_error(F2FS_I_SB(file_inode(filp))))) + return -EIO; + if (!f2fs_is_checkpoint_ready(F2FS_I_SB(file_inode(filp)))) + return -ENOSPC; + + return __f2fs_ioctl(filp, cmd, arg); +} + +/* + * Return 
%true if the given read or write request should use direct I/O, or + * %false if it should use buffered I/O. + */ +static bool f2fs_should_use_dio(struct inode *inode, struct kiocb *iocb, + struct iov_iter *iter) +{ + unsigned int align; + + if (!(iocb->ki_flags & IOCB_DIRECT)) + return false; + + if (f2fs_force_buffered_io(inode, iov_iter_rw(iter))) + return false; + + /* + * Direct I/O not aligned to the disk's logical_block_size will be + * attempted, but will fail with -EINVAL. + * + * f2fs additionally requires that direct I/O be aligned to the + * filesystem block size, which is often a stricter requirement. + * However, f2fs traditionally falls back to buffered I/O on requests + * that are logical_block_size-aligned but not fs-block aligned. + * + * The below logic implements this behavior. + */ + align = iocb->ki_pos | iov_iter_alignment(iter); + if (!IS_ALIGNED(align, i_blocksize(inode)) && + IS_ALIGNED(align, bdev_logical_block_size(inode->i_sb->s_bdev))) + return false; + + return true; +} + +static int f2fs_dio_read_end_io(struct kiocb *iocb, ssize_t size, int error, + unsigned int flags) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(file_inode(iocb->ki_filp)); + + dec_page_count(sbi, F2FS_DIO_READ); + if (error) + return error; + f2fs_update_iostat(sbi, NULL, APP_DIRECT_READ_IO, size); + return 0; +} + +static const struct iomap_dio_ops f2fs_iomap_dio_read_ops = { + .end_io = f2fs_dio_read_end_io, +}; + +static ssize_t f2fs_dio_read_iter(struct kiocb *iocb, struct iov_iter *to) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file_inode(file); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct f2fs_inode_info *fi = F2FS_I(inode); + const loff_t pos = iocb->ki_pos; + const size_t count = iov_iter_count(to); + struct iomap_dio *dio; + ssize_t ret; + + if (count == 0) + return 0; /* skip atime update */ + + trace_f2fs_direct_IO_enter(inode, iocb, count, READ); + + if (iocb->ki_flags & IOCB_NOWAIT) { + if 
(!f2fs_down_read_trylock(&fi->i_gc_rwsem[READ])) { + ret = -EAGAIN; + goto out; + } + } else { + f2fs_down_read(&fi->i_gc_rwsem[READ]); + } + + /* dio is not compatible w/ atomic file */ + if (f2fs_is_atomic_file(inode)) { + f2fs_up_read(&fi->i_gc_rwsem[READ]); + ret = -EOPNOTSUPP; + goto out; + } + + /* + * We have to use __iomap_dio_rw() and iomap_dio_complete() instead of + * the higher-level function iomap_dio_rw() in order to ensure that the + * F2FS_DIO_READ counter will be decremented correctly in all cases. + */ + inc_page_count(sbi, F2FS_DIO_READ); + dio = __iomap_dio_rw(iocb, to, &f2fs_iomap_ops, + &f2fs_iomap_dio_read_ops, 0, NULL, 0); + if (IS_ERR_OR_NULL(dio)) { + ret = PTR_ERR_OR_ZERO(dio); + if (ret != -EIOCBQUEUED) + dec_page_count(sbi, F2FS_DIO_READ); + } else { + ret = iomap_dio_complete(dio); + } + + f2fs_up_read(&fi->i_gc_rwsem[READ]); + + file_accessed(file); +out: + trace_f2fs_direct_IO_exit(inode, pos, count, READ, ret); + return ret; +} + +static void f2fs_trace_rw_file_path(struct file *file, loff_t pos, size_t count, + int rw) +{ + struct inode *inode = file_inode(file); + char *buf, *path; + + buf = f2fs_getname(F2FS_I_SB(inode)); + if (!buf) + return; + path = dentry_path_raw(file_dentry(file), buf, PATH_MAX); + if (IS_ERR(path)) + goto free_buf; + if (rw == WRITE) + trace_f2fs_datawrite_start(inode, pos, count, + current->pid, path, current->comm); + else + trace_f2fs_dataread_start(inode, pos, count, + current->pid, path, current->comm); +free_buf: + f2fs_putname(buf); +} + +static ssize_t f2fs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) +{ + struct inode *inode = file_inode(iocb->ki_filp); + const loff_t pos = iocb->ki_pos; + ssize_t ret; + bool dio; + + if (!f2fs_is_compress_backend_ready(inode)) + return -EOPNOTSUPP; + + if (trace_f2fs_dataread_start_enabled()) + f2fs_trace_rw_file_path(iocb->ki_filp, iocb->ki_pos, + iov_iter_count(to), READ); + + dio = f2fs_should_use_dio(inode, iocb, to); + + /* In LFS mode, if there 
is inflight dio, wait for its completion */ + if (f2fs_lfs_mode(F2FS_I_SB(inode)) && + get_pages(F2FS_I_SB(inode), F2FS_DIO_WRITE) && + (!f2fs_is_pinned_file(inode) || !dio)) + inode_dio_wait(inode); + + if (dio) { + ret = f2fs_dio_read_iter(iocb, to); + } else { + ret = filemap_read(iocb, to, 0); + if (ret > 0) + f2fs_update_iostat(F2FS_I_SB(inode), inode, + APP_BUFFERED_READ_IO, ret); + } + trace_f2fs_dataread_end(inode, pos, ret); + return ret; +} + +static ssize_t f2fs_file_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, + size_t len, unsigned int flags) +{ + struct inode *inode = file_inode(in); + const loff_t pos = *ppos; + ssize_t ret; + + if (!f2fs_is_compress_backend_ready(inode)) + return -EOPNOTSUPP; + + if (trace_f2fs_dataread_start_enabled()) + f2fs_trace_rw_file_path(in, pos, len, READ); + + ret = filemap_splice_read(in, ppos, pipe, len, flags); + if (ret > 0) + f2fs_update_iostat(F2FS_I_SB(inode), inode, + APP_BUFFERED_READ_IO, ret); + + trace_f2fs_dataread_end(inode, pos, ret); + return ret; +} + +static ssize_t f2fs_write_checks(struct kiocb *iocb, struct iov_iter *from) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file_inode(file); + ssize_t count; + int err; + + if (IS_IMMUTABLE(inode)) + return -EPERM; + + if (is_inode_flag_set(inode, FI_COMPRESS_RELEASED)) + return -EPERM; + + count = generic_write_checks(iocb, from); + if (count <= 0) + return count; + + err = file_modified(file); + if (err) + return err; + + f2fs_zero_post_eof_page(inode, + iocb->ki_pos + iov_iter_count(from), true); + return count; +} + +/* + * Preallocate blocks for a write request, if it is possible and helpful to do + * so. Returns a positive number if blocks may have been preallocated, 0 if no + * blocks were preallocated, or a negative errno value if something went + * seriously wrong. Also sets FI_PREALLOCATED_ALL on the inode if *all* the + * requested blocks (not just some of them) have been allocated. 
+ */ +static int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *iter, + bool dio) +{ + struct inode *inode = file_inode(iocb->ki_filp); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + const loff_t pos = iocb->ki_pos; + const size_t count = iov_iter_count(iter); + struct f2fs_map_blocks map = {}; + int flag; + int ret; + + /* If it will be an out-of-place direct write, don't bother. */ + if (dio && f2fs_lfs_mode(sbi)) + return 0; + /* + * Don't preallocate holes aligned to DIO_SKIP_HOLES which turns into + * buffered IO, if DIO meets any holes. + */ + if (dio && i_size_read(inode) && + (F2FS_BYTES_TO_BLK(pos) < F2FS_BLK_ALIGN(i_size_read(inode)))) + return 0; + + /* No-wait I/O can't allocate blocks. */ + if (iocb->ki_flags & IOCB_NOWAIT) + return 0; + + /* If it will be a short write, don't bother. */ + if (fault_in_iov_iter_readable(iter, count)) + return 0; + + if (f2fs_has_inline_data(inode)) { + /* If the data will fit inline, don't bother. */ + if (pos + count <= MAX_INLINE_DATA(inode)) + return 0; + ret = f2fs_convert_inline_inode(inode); + if (ret) + return ret; + } + + /* Do not preallocate blocks that will be written partially in 4KB. */ + map.m_lblk = F2FS_BLK_ALIGN(pos); + map.m_len = F2FS_BYTES_TO_BLK(pos + count); + if (map.m_len > map.m_lblk) + map.m_len -= map.m_lblk; + else + return 0; + + if (!IS_DEVICE_ALIASING(inode)) + map.m_may_create = true; + if (dio) { + map.m_seg_type = f2fs_rw_hint_to_seg_type(sbi, + inode->i_write_hint); + flag = F2FS_GET_BLOCK_PRE_DIO; + } else { + map.m_seg_type = NO_CHECK_TYPE; + flag = F2FS_GET_BLOCK_PRE_AIO; + } + + ret = f2fs_map_blocks(inode, &map, flag); + /* -ENOSPC|-EDQUOT are fine to report the number of allocated blocks. 
*/ + if (ret < 0 && !((ret == -ENOSPC || ret == -EDQUOT) && map.m_len > 0)) + return ret; + if (ret == 0) + set_inode_flag(inode, FI_PREALLOCATED_ALL); + return map.m_len; +} + +static ssize_t f2fs_buffered_write_iter(struct kiocb *iocb, + struct iov_iter *from) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file_inode(file); + ssize_t ret; + + if (iocb->ki_flags & IOCB_NOWAIT) + return -EOPNOTSUPP; + + ret = generic_perform_write(iocb, from); + + if (ret > 0) { + f2fs_update_iostat(F2FS_I_SB(inode), inode, + APP_BUFFERED_IO, ret); + } + return ret; +} + +static int f2fs_dio_write_end_io(struct kiocb *iocb, ssize_t size, int error, + unsigned int flags) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(file_inode(iocb->ki_filp)); + + dec_page_count(sbi, F2FS_DIO_WRITE); + if (error) + return error; + f2fs_update_time(sbi, REQ_TIME); + f2fs_update_iostat(sbi, NULL, APP_DIRECT_IO, size); + return 0; +} + +static void f2fs_dio_write_submit_io(const struct iomap_iter *iter, + struct bio *bio, loff_t file_offset) +{ + struct inode *inode = iter->inode; + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + enum log_type type = f2fs_rw_hint_to_seg_type(sbi, inode->i_write_hint); + enum temp_type temp = f2fs_get_segment_temp(sbi, type); + + bio->bi_write_hint = f2fs_io_type_to_rw_hint(sbi, DATA, temp); + submit_bio(bio); +} + +static const struct iomap_dio_ops f2fs_iomap_dio_write_ops = { + .end_io = f2fs_dio_write_end_io, + .submit_io = f2fs_dio_write_submit_io, +}; + +static void f2fs_flush_buffered_write(struct address_space *mapping, + loff_t start_pos, loff_t end_pos) +{ + int ret; + + ret = filemap_write_and_wait_range(mapping, start_pos, end_pos); + if (ret < 0) + return; + invalidate_mapping_pages(mapping, + start_pos >> PAGE_SHIFT, + end_pos >> PAGE_SHIFT); +} + +static ssize_t f2fs_dio_write_iter(struct kiocb *iocb, struct iov_iter *from, + bool *may_need_sync) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file_inode(file); + struct 
f2fs_inode_info *fi = F2FS_I(inode); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + const bool do_opu = f2fs_lfs_mode(sbi); + const loff_t pos = iocb->ki_pos; + const ssize_t count = iov_iter_count(from); + unsigned int dio_flags; + struct iomap_dio *dio; + ssize_t ret; + + trace_f2fs_direct_IO_enter(inode, iocb, count, WRITE); + + if (iocb->ki_flags & IOCB_NOWAIT) { + /* f2fs_convert_inline_inode() and block allocation can block */ + if (f2fs_has_inline_data(inode) || + !f2fs_overwrite_io(inode, pos, count)) { + ret = -EAGAIN; + goto out; + } + + if (!f2fs_down_read_trylock(&fi->i_gc_rwsem[WRITE])) { + ret = -EAGAIN; + goto out; + } + if (do_opu && !f2fs_down_read_trylock(&fi->i_gc_rwsem[READ])) { + f2fs_up_read(&fi->i_gc_rwsem[WRITE]); + ret = -EAGAIN; + goto out; + } + } else { + ret = f2fs_convert_inline_inode(inode); + if (ret) + goto out; + + f2fs_down_read(&fi->i_gc_rwsem[WRITE]); + if (do_opu) + f2fs_down_read(&fi->i_gc_rwsem[READ]); + } + + /* + * We have to use __iomap_dio_rw() and iomap_dio_complete() instead of + * the higher-level function iomap_dio_rw() in order to ensure that the + * F2FS_DIO_WRITE counter will be decremented correctly in all cases. 
+ */ + inc_page_count(sbi, F2FS_DIO_WRITE); + dio_flags = 0; + if (pos + count > inode->i_size) + dio_flags |= IOMAP_DIO_FORCE_WAIT; + dio = __iomap_dio_rw(iocb, from, &f2fs_iomap_ops, + &f2fs_iomap_dio_write_ops, dio_flags, NULL, 0); + if (IS_ERR_OR_NULL(dio)) { + ret = PTR_ERR_OR_ZERO(dio); + if (ret == -ENOTBLK) + ret = 0; + if (ret != -EIOCBQUEUED) + dec_page_count(sbi, F2FS_DIO_WRITE); + } else { + ret = iomap_dio_complete(dio); + } + + if (do_opu) + f2fs_up_read(&fi->i_gc_rwsem[READ]); + f2fs_up_read(&fi->i_gc_rwsem[WRITE]); + + if (ret < 0) + goto out; + if (pos + ret > inode->i_size) + f2fs_i_size_write(inode, pos + ret); + if (!do_opu) + set_inode_flag(inode, FI_UPDATE_WRITE); + + if (iov_iter_count(from)) { + ssize_t ret2; + loff_t bufio_start_pos = iocb->ki_pos; + + /* + * The direct write was partial, so we need to fall back to a + * buffered write for the remainder. + */ + + ret2 = f2fs_buffered_write_iter(iocb, from); + if (iov_iter_count(from)) + f2fs_write_failed(inode, iocb->ki_pos); + if (ret2 < 0) + goto out; + + /* + * Ensure that the pagecache pages are written to disk and + * invalidated to preserve the expected O_DIRECT semantics. + */ + if (ret2 > 0) { + loff_t bufio_end_pos = bufio_start_pos + ret2 - 1; + + ret += ret2; + + f2fs_flush_buffered_write(file->f_mapping, + bufio_start_pos, + bufio_end_pos); + } + } else { + /* iomap_dio_rw() already handled the generic_write_sync(). 
*/ + *may_need_sync = false; + } +out: + trace_f2fs_direct_IO_exit(inode, pos, count, WRITE, ret); + return ret; +} + +static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) +{ + struct inode *inode = file_inode(iocb->ki_filp); + const loff_t orig_pos = iocb->ki_pos; + const size_t orig_count = iov_iter_count(from); + loff_t target_size; + bool dio; + bool may_need_sync = true; + int preallocated; + const loff_t pos = iocb->ki_pos; + const ssize_t count = iov_iter_count(from); + ssize_t ret; + + if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) { + ret = -EIO; + goto out; + } + + if (!f2fs_is_compress_backend_ready(inode)) { + ret = -EOPNOTSUPP; + goto out; + } + + if (iocb->ki_flags & IOCB_NOWAIT) { + if (!inode_trylock(inode)) { + ret = -EAGAIN; + goto out; + } + } else { + inode_lock(inode); + } + + if (f2fs_is_pinned_file(inode) && + !f2fs_overwrite_io(inode, pos, count)) { + ret = -EIO; + goto out_unlock; + } + + ret = f2fs_write_checks(iocb, from); + if (ret <= 0) + goto out_unlock; + + /* Determine whether we will do a direct write or a buffered write. */ + dio = f2fs_should_use_dio(inode, iocb, from); + + /* dio is not compatible w/ atomic write */ + if (dio && f2fs_is_atomic_file(inode)) { + ret = -EOPNOTSUPP; + goto out_unlock; + } + + /* Possibly preallocate the blocks for the write. */ + target_size = iocb->ki_pos + iov_iter_count(from); + preallocated = f2fs_preallocate_blocks(iocb, from, dio); + if (preallocated < 0) { + ret = preallocated; + } else { + if (trace_f2fs_datawrite_start_enabled()) + f2fs_trace_rw_file_path(iocb->ki_filp, iocb->ki_pos, + orig_count, WRITE); + + /* Do the actual write. */ + ret = dio ? + f2fs_dio_write_iter(iocb, from, &may_need_sync) : + f2fs_buffered_write_iter(iocb, from); + + trace_f2fs_datawrite_end(inode, orig_pos, ret); + } + + /* Don't leave any preallocated blocks around past i_size. 
*/ + if (preallocated && i_size_read(inode) < target_size) { + f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + filemap_invalidate_lock(inode->i_mapping); + if (!f2fs_truncate(inode)) + file_dont_truncate(inode); + filemap_invalidate_unlock(inode->i_mapping); + f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + } else { + file_dont_truncate(inode); + } + + clear_inode_flag(inode, FI_PREALLOCATED_ALL); +out_unlock: + inode_unlock(inode); +out: + trace_f2fs_file_write_iter(inode, orig_pos, orig_count, ret); + + if (ret > 0 && may_need_sync) + ret = generic_write_sync(iocb, ret); + + /* If buffered IO was forced, flush and drop the data from + * the page cache to preserve O_DIRECT semantics + */ + if (ret > 0 && !dio && (iocb->ki_flags & IOCB_DIRECT)) + f2fs_flush_buffered_write(iocb->ki_filp->f_mapping, + orig_pos, + orig_pos + ret - 1); + + return ret; +} + +static int f2fs_file_fadvise(struct file *filp, loff_t offset, loff_t len, + int advice) +{ + struct address_space *mapping; + struct backing_dev_info *bdi; + struct inode *inode = file_inode(filp); + int err; + + trace_f2fs_fadvise(inode, offset, len, advice); + + if (advice == POSIX_FADV_SEQUENTIAL) { + if (S_ISFIFO(inode->i_mode)) + return -ESPIPE; + + mapping = filp->f_mapping; + if (!mapping || len < 0) + return -EINVAL; + + bdi = inode_to_bdi(mapping->host); + filp->f_ra.ra_pages = bdi->ra_pages * + F2FS_I_SB(inode)->seq_file_ra_mul; + spin_lock(&filp->f_lock); + filp->f_mode &= ~FMODE_RANDOM; + spin_unlock(&filp->f_lock); + return 0; + } else if (advice == POSIX_FADV_WILLNEED && offset == 0) { + /* Load extent cache at the first readahead. 
*/ + f2fs_precache_extents(inode); + } + + err = generic_fadvise(filp, offset, len, advice); + if (err) + return err; + + if (advice == POSIX_FADV_DONTNEED && + (test_opt(F2FS_I_SB(inode), COMPRESS_CACHE) && + f2fs_compressed_file(inode))) + f2fs_invalidate_compress_pages(F2FS_I_SB(inode), inode->i_ino); + else if (advice == POSIX_FADV_NOREUSE) + err = f2fs_keep_noreuse_range(inode, offset, len); + return err; +} + #ifdef CONFIG_COMPAT +struct compat_f2fs_gc_range { + u32 sync; + compat_u64 start; + compat_u64 len; +}; +#define F2FS_IOC32_GARBAGE_COLLECT_RANGE _IOW(F2FS_IOCTL_MAGIC, 11,\ + struct compat_f2fs_gc_range) + +static int f2fs_compat_ioc_gc_range(struct file *file, unsigned long arg) +{ + struct compat_f2fs_gc_range __user *urange; + struct f2fs_gc_range range; + int err; + + urange = compat_ptr(arg); + err = get_user(range.sync, &urange->sync); + err |= get_user(range.start, &urange->start); + err |= get_user(range.len, &urange->len); + if (err) + return -EFAULT; + + return __f2fs_ioc_gc_range(file, &range); +} + +struct compat_f2fs_move_range { + u32 dst_fd; + compat_u64 pos_in; + compat_u64 pos_out; + compat_u64 len; +}; +#define F2FS_IOC32_MOVE_RANGE _IOWR(F2FS_IOCTL_MAGIC, 9, \ + struct compat_f2fs_move_range) + +static int f2fs_compat_ioc_move_range(struct file *file, unsigned long arg) +{ + struct compat_f2fs_move_range __user *urange; + struct f2fs_move_range range; + int err; + + urange = compat_ptr(arg); + err = get_user(range.dst_fd, &urange->dst_fd); + err |= get_user(range.pos_in, &urange->pos_in); + err |= get_user(range.pos_out, &urange->pos_out); + err |= get_user(range.len, &urange->len); + if (err) + return -EFAULT; + + return __f2fs_ioc_move_range(file, &range); +} + long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { + if (unlikely(f2fs_cp_error(F2FS_I_SB(file_inode(file))))) + return -EIO; + if (!f2fs_is_checkpoint_ready(F2FS_I_SB(file_inode(file)))) + return -ENOSPC; + switch (cmd) { - case 
F2FS_IOC32_GETFLAGS: - cmd = F2FS_IOC_GETFLAGS; + case FS_IOC32_GETVERSION: + cmd = FS_IOC_GETVERSION; break; - case F2FS_IOC32_SETFLAGS: - cmd = F2FS_IOC_SETFLAGS; + case F2FS_IOC32_GARBAGE_COLLECT_RANGE: + return f2fs_compat_ioc_gc_range(file, arg); + case F2FS_IOC32_MOVE_RANGE: + return f2fs_compat_ioc_move_range(file, arg); + case F2FS_IOC_START_ATOMIC_WRITE: + case F2FS_IOC_START_ATOMIC_REPLACE: + case F2FS_IOC_COMMIT_ATOMIC_WRITE: + case F2FS_IOC_START_VOLATILE_WRITE: + case F2FS_IOC_RELEASE_VOLATILE_WRITE: + case F2FS_IOC_ABORT_ATOMIC_WRITE: + case F2FS_IOC_SHUTDOWN: + case FITRIM: + case FS_IOC_SET_ENCRYPTION_POLICY: + case FS_IOC_GET_ENCRYPTION_PWSALT: + case FS_IOC_GET_ENCRYPTION_POLICY: + case FS_IOC_GET_ENCRYPTION_POLICY_EX: + case FS_IOC_ADD_ENCRYPTION_KEY: + case FS_IOC_REMOVE_ENCRYPTION_KEY: + case FS_IOC_REMOVE_ENCRYPTION_KEY_ALL_USERS: + case FS_IOC_GET_ENCRYPTION_KEY_STATUS: + case FS_IOC_GET_ENCRYPTION_NONCE: + case F2FS_IOC_GARBAGE_COLLECT: + case F2FS_IOC_WRITE_CHECKPOINT: + case F2FS_IOC_DEFRAGMENT: + case F2FS_IOC_FLUSH_DEVICE: + case F2FS_IOC_GET_FEATURES: + case F2FS_IOC_GET_PIN_FILE: + case F2FS_IOC_SET_PIN_FILE: + case F2FS_IOC_PRECACHE_EXTENTS: + case F2FS_IOC_RESIZE_FS: + case FS_IOC_ENABLE_VERITY: + case FS_IOC_MEASURE_VERITY: + case FS_IOC_READ_VERITY_METADATA: + case FS_IOC_GETFSLABEL: + case FS_IOC_SETFSLABEL: + case F2FS_IOC_GET_COMPRESS_BLOCKS: + case F2FS_IOC_RELEASE_COMPRESS_BLOCKS: + case F2FS_IOC_RESERVE_COMPRESS_BLOCKS: + case F2FS_IOC_SEC_TRIM_FILE: + case F2FS_IOC_GET_COMPRESS_OPTION: + case F2FS_IOC_SET_COMPRESS_OPTION: + case F2FS_IOC_DECOMPRESS_FILE: + case F2FS_IOC_COMPRESS_FILE: + case F2FS_IOC_GET_DEV_ALIAS_FILE: + case F2FS_IOC_IO_PRIO: break; default: return -ENOIOCTLCMD; } - return f2fs_ioctl(file, cmd, (unsigned long) compat_ptr(arg)); + return __f2fs_ioctl(file, cmd, (unsigned long) compat_ptr(arg)); } #endif const struct file_operations f2fs_file_operations = { - .llseek = generic_file_llseek, - .read = 
do_sync_read, - .write = do_sync_write, - .aio_read = generic_file_aio_read, - .aio_write = generic_file_aio_write, - .open = generic_file_open, - .mmap = f2fs_file_mmap, + .llseek = f2fs_llseek, + .read_iter = f2fs_file_read_iter, + .write_iter = f2fs_file_write_iter, + .iopoll = iocb_bio_iopoll, + .open = f2fs_file_open, + .release = f2fs_release_file, + .mmap_prepare = f2fs_file_mmap_prepare, + .flush = f2fs_file_flush, .fsync = f2fs_sync_file, .fallocate = f2fs_fallocate, .unlocked_ioctl = f2fs_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = f2fs_compat_ioctl, #endif - .splice_read = generic_file_splice_read, - .splice_write = generic_file_splice_write, + .splice_read = f2fs_file_splice_read, + .splice_write = iter_file_splice_write, + .fadvise = f2fs_file_fadvise, + .fop_flags = FOP_BUFFER_RASYNC, }; diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 35f9b1a196aa..384fa7e2085b 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -1,51 +1,88 @@ +// SPDX-License-Identifier: GPL-2.0 /* * fs/f2fs/gc.c * * Copyright (c) 2012 Samsung Electronics Co., Ltd. * http://www.samsung.com/ - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. 
*/ #include <linux/fs.h> #include <linux/module.h> -#include <linux/backing-dev.h> #include <linux/init.h> #include <linux/f2fs_fs.h> #include <linux/kthread.h> #include <linux/delay.h> #include <linux/freezer.h> -#include <linux/blkdev.h> +#include <linux/sched/signal.h> +#include <linux/random.h> +#include <linux/sched/mm.h> #include "f2fs.h" #include "node.h" #include "segment.h" #include "gc.h" +#include "iostat.h" #include <trace/events/f2fs.h> -static struct kmem_cache *winode_slab; +static struct kmem_cache *victim_entry_slab; + +static unsigned int count_bits(const unsigned long *addr, + unsigned int offset, unsigned int len); static int gc_thread_func(void *data) { struct f2fs_sb_info *sbi = data; + struct f2fs_gc_kthread *gc_th = sbi->gc_thread; wait_queue_head_t *wq = &sbi->gc_thread->gc_wait_queue_head; - long wait_ms; + wait_queue_head_t *fggc_wq = &sbi->gc_thread->fggc_wq; + unsigned int wait_ms; + struct f2fs_gc_control gc_control = { + .victim_segno = NULL_SEGNO, + .should_migrate_blocks = false, + .err_gc_skipped = false, + .one_time = false }; - wait_ms = GC_THREAD_MIN_SLEEP_TIME; + wait_ms = gc_th->min_sleep_time; + set_freezable(); do { - if (try_to_freeze()) + bool sync_mode, foreground = false, gc_boost = false; + + wait_event_freezable_timeout(*wq, + kthread_should_stop() || + waitqueue_active(fggc_wq) || + gc_th->gc_wake, + msecs_to_jiffies(wait_ms)); + + if (test_opt(sbi, GC_MERGE) && waitqueue_active(fggc_wq)) { + foreground = true; + gc_control.one_time = false; + } else if (f2fs_sb_has_blkzoned(sbi)) { + gc_control.one_time = true; + } + + /* give it a try one time */ + if (gc_th->gc_wake) + gc_th->gc_wake = false; + + if (f2fs_readonly(sbi->sb)) { + stat_other_skip_bggc_count(sbi); continue; - else - wait_event_interruptible_timeout(*wq, - kthread_should_stop(), - msecs_to_jiffies(wait_ms)); + } if (kthread_should_stop()) break; if (sbi->sb->s_writers.frozen >= SB_FREEZE_WRITE) { - wait_ms = GC_THREAD_MAX_SLEEP_TIME; + 
increase_sleep_time(gc_th, &wait_ms); + stat_other_skip_bggc_count(sbi); + continue; + } + + if (time_to_inject(sbi, FAULT_CHECKPOINT)) + f2fs_stop_checkpoint(sbi, false, + STOP_CP_REASON_FAULT_INJECT); + + if (!sb_start_write_trylock(sbi->sb)) { + stat_other_skip_bggc_count(sbi); continue; } @@ -57,77 +94,188 @@ static int gc_thread_func(void *data) * 3. IO subsystem is idle by checking the # of requests in * bdev's request list. * - * Note) We have to avoid triggering GCs too much frequently. + * Note) We have to avoid triggering GCs frequently. * Because it is possible that some segments can be * invalidated soon after by user update or deletion. * So, I'd like to wait some time to collect dirty segments. */ - if (!mutex_trylock(&sbi->gc_mutex)) - continue; + if (sbi->gc_mode == GC_URGENT_HIGH || + sbi->gc_mode == GC_URGENT_MID) { + wait_ms = gc_th->urgent_sleep_time; + f2fs_down_write(&sbi->gc_lock); + goto do_gc; + } - if (!is_idle(sbi)) { - wait_ms = increase_sleep_time(wait_ms); - mutex_unlock(&sbi->gc_mutex); - continue; + if (foreground) { + f2fs_down_write(&sbi->gc_lock); + goto do_gc; + } else if (!f2fs_down_write_trylock(&sbi->gc_lock)) { + stat_other_skip_bggc_count(sbi); + goto next; } - if (has_enough_invalid_blocks(sbi)) - wait_ms = decrease_sleep_time(wait_ms); - else - wait_ms = increase_sleep_time(wait_ms); + if (!is_idle(sbi, GC_TIME)) { + increase_sleep_time(gc_th, &wait_ms); + f2fs_up_write(&sbi->gc_lock); + stat_io_skip_bggc_count(sbi); + goto next; + } -#ifdef CONFIG_F2FS_STAT_FS - sbi->bg_gc++; -#endif + if (f2fs_sb_has_blkzoned(sbi)) { + if (has_enough_free_blocks(sbi, + gc_th->no_zoned_gc_percent)) { + wait_ms = gc_th->no_gc_sleep_time; + f2fs_up_write(&sbi->gc_lock); + goto next; + } + if (wait_ms == gc_th->no_gc_sleep_time) + wait_ms = gc_th->max_sleep_time; + } + + if (need_to_boost_gc(sbi)) { + decrease_sleep_time(gc_th, &wait_ms); + if (f2fs_sb_has_blkzoned(sbi)) + gc_boost = true; + } else { + increase_sleep_time(gc_th, &wait_ms); + 
} +do_gc: + stat_inc_gc_call_count(sbi, foreground ? + FOREGROUND : BACKGROUND); + + sync_mode = (F2FS_OPTION(sbi).bggc_mode == BGGC_MODE_SYNC) || + (gc_boost && gc_th->boost_gc_greedy); + + /* foreground GC was been triggered via f2fs_balance_fs() */ + if (foreground && !f2fs_sb_has_blkzoned(sbi)) + sync_mode = false; + + gc_control.init_gc_type = sync_mode ? FG_GC : BG_GC; + gc_control.no_bg_gc = foreground; + gc_control.nr_free_secs = foreground ? 1 : 0; /* if return value is not zero, no victim was selected */ - if (f2fs_gc(sbi)) - wait_ms = GC_THREAD_NOGC_SLEEP_TIME; + if (f2fs_gc(sbi, &gc_control)) { + /* don't bother wait_ms by foreground gc */ + if (!foreground) + wait_ms = gc_th->no_gc_sleep_time; + } else { + /* reset wait_ms to default sleep time */ + if (wait_ms == gc_th->no_gc_sleep_time) + wait_ms = gc_th->min_sleep_time; + } + + if (foreground) + wake_up_all(&gc_th->fggc_wq); + + trace_f2fs_background_gc(sbi->sb, wait_ms, + prefree_segments(sbi), free_segments(sbi)); + + /* balancing f2fs's metadata periodically */ + f2fs_balance_fs_bg(sbi, true); +next: + if (sbi->gc_mode != GC_NORMAL) { + spin_lock(&sbi->gc_remaining_trials_lock); + if (sbi->gc_remaining_trials) { + sbi->gc_remaining_trials--; + if (!sbi->gc_remaining_trials) + sbi->gc_mode = GC_NORMAL; + } + spin_unlock(&sbi->gc_remaining_trials_lock); + } + sb_end_write(sbi->sb); + } while (!kthread_should_stop()); return 0; } -int start_gc_thread(struct f2fs_sb_info *sbi) +int f2fs_start_gc_thread(struct f2fs_sb_info *sbi) { struct f2fs_gc_kthread *gc_th; dev_t dev = sbi->sb->s_bdev->bd_dev; - int err = 0; - if (!test_opt(sbi, BG_GC)) - goto out; - gc_th = kmalloc(sizeof(struct f2fs_gc_kthread), GFP_KERNEL); - if (!gc_th) { - err = -ENOMEM; - goto out; + gc_th = f2fs_kmalloc(sbi, sizeof(struct f2fs_gc_kthread), GFP_KERNEL); + if (!gc_th) + return -ENOMEM; + + gc_th->urgent_sleep_time = DEF_GC_THREAD_URGENT_SLEEP_TIME; + gc_th->valid_thresh_ratio = DEF_GC_THREAD_VALID_THRESH_RATIO; + 
gc_th->boost_gc_multiple = BOOST_GC_MULTIPLE; + gc_th->boost_gc_greedy = GC_GREEDY; + + if (f2fs_sb_has_blkzoned(sbi)) { + gc_th->min_sleep_time = DEF_GC_THREAD_MIN_SLEEP_TIME_ZONED; + gc_th->max_sleep_time = DEF_GC_THREAD_MAX_SLEEP_TIME_ZONED; + gc_th->no_gc_sleep_time = DEF_GC_THREAD_NOGC_SLEEP_TIME_ZONED; + gc_th->no_zoned_gc_percent = LIMIT_NO_ZONED_GC; + gc_th->boost_zoned_gc_percent = LIMIT_BOOST_ZONED_GC; + } else { + gc_th->min_sleep_time = DEF_GC_THREAD_MIN_SLEEP_TIME; + gc_th->max_sleep_time = DEF_GC_THREAD_MAX_SLEEP_TIME; + gc_th->no_gc_sleep_time = DEF_GC_THREAD_NOGC_SLEEP_TIME; + gc_th->no_zoned_gc_percent = 0; + gc_th->boost_zoned_gc_percent = 0; } + gc_th->gc_wake = false; + sbi->gc_thread = gc_th; init_waitqueue_head(&sbi->gc_thread->gc_wait_queue_head); + init_waitqueue_head(&sbi->gc_thread->fggc_wq); sbi->gc_thread->f2fs_gc_task = kthread_run(gc_thread_func, sbi, "f2fs_gc-%u:%u", MAJOR(dev), MINOR(dev)); if (IS_ERR(gc_th->f2fs_gc_task)) { - err = PTR_ERR(gc_th->f2fs_gc_task); + int err = PTR_ERR(gc_th->f2fs_gc_task); + kfree(gc_th); sbi->gc_thread = NULL; + return err; } -out: - return err; + return 0; } -void stop_gc_thread(struct f2fs_sb_info *sbi) +void f2fs_stop_gc_thread(struct f2fs_sb_info *sbi) { struct f2fs_gc_kthread *gc_th = sbi->gc_thread; + if (!gc_th) return; kthread_stop(gc_th->f2fs_gc_task); + wake_up_all(&gc_th->fggc_wq); kfree(gc_th); sbi->gc_thread = NULL; } -static int select_gc_type(int gc_type) +static int select_gc_type(struct f2fs_sb_info *sbi, int gc_type) { - return (gc_type == BG_GC) ? 
GC_CB : GC_GREEDY; + int gc_mode; + + if (gc_type == BG_GC) { + if (sbi->am.atgc_enabled) + gc_mode = GC_AT; + else + gc_mode = GC_CB; + } else { + gc_mode = GC_GREEDY; + } + + switch (sbi->gc_mode) { + case GC_IDLE_CB: + case GC_URGENT_LOW: + case GC_URGENT_MID: + gc_mode = GC_CB; + break; + case GC_IDLE_GREEDY: + case GC_URGENT_HIGH: + gc_mode = GC_GREEDY; + break; + case GC_IDLE_AT: + gc_mode = GC_AT; + break; + } + + return gc_mode; } static void select_policy(struct f2fs_sb_info *sbi, int gc_type, @@ -135,16 +283,42 @@ static void select_policy(struct f2fs_sb_info *sbi, int gc_type, { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); - if (p->alloc_mode == SSR) { + if (p->alloc_mode == SSR || p->alloc_mode == AT_SSR) { p->gc_mode = GC_GREEDY; - p->dirty_segmap = dirty_i->dirty_segmap[type]; + p->dirty_bitmap = dirty_i->dirty_segmap[type]; + p->max_search = dirty_i->nr_dirty[type]; p->ofs_unit = 1; } else { - p->gc_mode = select_gc_type(gc_type); - p->dirty_segmap = dirty_i->dirty_segmap[DIRTY]; - p->ofs_unit = sbi->segs_per_sec; + p->gc_mode = select_gc_type(sbi, gc_type); + p->ofs_unit = SEGS_PER_SEC(sbi); + if (__is_large_section(sbi)) { + p->dirty_bitmap = dirty_i->dirty_secmap; + p->max_search = count_bits(p->dirty_bitmap, + 0, MAIN_SECS(sbi)); + } else { + p->dirty_bitmap = dirty_i->dirty_segmap[DIRTY]; + p->max_search = dirty_i->nr_dirty[DIRTY]; + } } - p->offset = sbi->last_victim[p->gc_mode]; + + /* + * adjust candidates range, should select all dirty segments for + * foreground GC and urgent GC cases. + */ + if (gc_type != FG_GC && + (sbi->gc_mode != GC_URGENT_HIGH) && + (p->gc_mode != GC_AT && p->alloc_mode != AT_SSR) && + p->max_search > sbi->max_victim_search) + p->max_search = sbi->max_victim_search; + + /* let's select beginning hot/small space first. 
*/ + if (f2fs_need_rand_seg(sbi)) + p->offset = get_random_u32_below(MAIN_SECS(sbi) * + SEGS_PER_SEC(sbi)); + else if (type == CURSEG_HOT_DATA || IS_NODESEG(type)) + p->offset = 0; + else + p->offset = SIT_I(sbi)->last_victim[p->gc_mode]; } static unsigned int get_max_cost(struct f2fs_sb_info *sbi, @@ -152,11 +326,17 @@ static unsigned int get_max_cost(struct f2fs_sb_info *sbi, { /* SSR allocates in a segment unit */ if (p->alloc_mode == SSR) - return 1 << sbi->log_blocks_per_seg; + return BLKS_PER_SEG(sbi); + else if (p->alloc_mode == AT_SSR) + return UINT_MAX; + + /* LFS */ if (p->gc_mode == GC_GREEDY) - return (1 << sbi->log_blocks_per_seg) * p->ofs_unit; + return SEGS_TO_BLKS(sbi, 2 * p->ofs_unit); else if (p->gc_mode == GC_CB) return UINT_MAX; + else if (p->gc_mode == GC_AT) + return UINT_MAX; else /* No other gc_mode */ return 0; } @@ -164,7 +344,6 @@ static unsigned int get_max_cost(struct f2fs_sb_info *sbi, static unsigned int check_bg_victims(struct f2fs_sb_info *sbi) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); - unsigned int hint = 0; unsigned int secno; /* @@ -172,13 +351,11 @@ static unsigned int check_bg_victims(struct f2fs_sb_info *sbi) * selected by background GC before. * Those segments guarantee they have small valid blocks. 
*/ -next: - secno = find_next_bit(dirty_i->victim_secmap, TOTAL_SECS(sbi), hint++); - if (secno < TOTAL_SECS(sbi)) { + for_each_set_bit(secno, dirty_i->victim_secmap, MAIN_SECS(sbi)) { if (sec_usage_check(sbi, secno)) - goto next; + continue; clear_bit(secno, dirty_i->victim_secmap); - return secno * sbi->segs_per_sec; + return GET_SEG_FROM_SEC(sbi, secno); } return NULL_SEGNO; } @@ -186,24 +363,20 @@ next: static unsigned int get_cb_cost(struct f2fs_sb_info *sbi, unsigned int segno) { struct sit_info *sit_i = SIT_I(sbi); - unsigned int secno = GET_SECNO(sbi, segno); - unsigned int start = secno * sbi->segs_per_sec; unsigned long long mtime = 0; unsigned int vblocks; unsigned char age = 0; unsigned char u; - unsigned int i; + unsigned int usable_segs_per_sec = f2fs_usable_segs_in_sec(sbi); - for (i = 0; i < sbi->segs_per_sec; i++) - mtime += get_seg_entry(sbi, start + i)->mtime; - vblocks = get_valid_blocks(sbi, segno, sbi->segs_per_sec); + mtime = f2fs_get_section_mtime(sbi, segno); + f2fs_bug_on(sbi, mtime == INVALID_MTIME); + vblocks = get_valid_blocks(sbi, segno, true); + vblocks = div_u64(vblocks, usable_segs_per_sec); - mtime = div_u64(mtime, sbi->segs_per_sec); - vblocks = div_u64(vblocks, sbi->segs_per_sec); + u = BLKS_TO_SEGS(sbi, vblocks * 100); - u = (vblocks * 100) >> sbi->log_blocks_per_seg; - - /* Handle if the system time is changed by user */ + /* Handle if the system time has changed by the user */ if (mtime < sit_i->min_mtime) sit_i->min_mtime = mtime; if (mtime > sit_i->max_mtime) @@ -215,17 +388,376 @@ static unsigned int get_cb_cost(struct f2fs_sb_info *sbi, unsigned int segno) return UINT_MAX - ((100 * (100 - u) * age) / (100 + u)); } -static unsigned int get_gc_cost(struct f2fs_sb_info *sbi, unsigned int segno, - struct victim_sel_policy *p) +static inline unsigned int get_gc_cost(struct f2fs_sb_info *sbi, + unsigned int segno, struct victim_sel_policy *p, + unsigned int valid_thresh_ratio) { if (p->alloc_mode == SSR) return 
get_seg_entry(sbi, segno)->ckpt_valid_blocks; + if (p->one_time_gc && (valid_thresh_ratio < 100) && + (get_valid_blocks(sbi, segno, true) >= + CAP_BLKS_PER_SEC(sbi) * valid_thresh_ratio / 100)) + return UINT_MAX; + /* alloc_mode == LFS */ if (p->gc_mode == GC_GREEDY) - return get_valid_blocks(sbi, segno, sbi->segs_per_sec); - else + return get_valid_blocks(sbi, segno, true); + else if (p->gc_mode == GC_CB) return get_cb_cost(sbi, segno); + + f2fs_bug_on(sbi, 1); + return 0; +} + +static unsigned int count_bits(const unsigned long *addr, + unsigned int offset, unsigned int len) +{ + unsigned int end = offset + len, sum = 0; + + while (offset < end) { + if (test_bit(offset++, addr)) + ++sum; + } + return sum; +} + +static bool f2fs_check_victim_tree(struct f2fs_sb_info *sbi, + struct rb_root_cached *root) +{ +#ifdef CONFIG_F2FS_CHECK_FS + struct rb_node *cur = rb_first_cached(root), *next; + struct victim_entry *cur_ve, *next_ve; + + while (cur) { + next = rb_next(cur); + if (!next) + return true; + + cur_ve = rb_entry(cur, struct victim_entry, rb_node); + next_ve = rb_entry(next, struct victim_entry, rb_node); + + if (cur_ve->mtime > next_ve->mtime) { + f2fs_info(sbi, "broken victim_rbtree, " + "cur_mtime(%llu) next_mtime(%llu)", + cur_ve->mtime, next_ve->mtime); + return false; + } + cur = next; + } +#endif + return true; +} + +static struct victim_entry *__lookup_victim_entry(struct f2fs_sb_info *sbi, + unsigned long long mtime) +{ + struct atgc_management *am = &sbi->am; + struct rb_node *node = am->root.rb_root.rb_node; + struct victim_entry *ve = NULL; + + while (node) { + ve = rb_entry(node, struct victim_entry, rb_node); + + if (mtime < ve->mtime) + node = node->rb_left; + else + node = node->rb_right; + } + return ve; +} + +static struct victim_entry *__create_victim_entry(struct f2fs_sb_info *sbi, + unsigned long long mtime, unsigned int segno) +{ + struct atgc_management *am = &sbi->am; + struct victim_entry *ve; + + ve = 
f2fs_kmem_cache_alloc(victim_entry_slab, GFP_NOFS, true, NULL); + + ve->mtime = mtime; + ve->segno = segno; + + list_add_tail(&ve->list, &am->victim_list); + am->victim_count++; + + return ve; +} + +static void __insert_victim_entry(struct f2fs_sb_info *sbi, + unsigned long long mtime, unsigned int segno) +{ + struct atgc_management *am = &sbi->am; + struct rb_root_cached *root = &am->root; + struct rb_node **p = &root->rb_root.rb_node; + struct rb_node *parent = NULL; + struct victim_entry *ve; + bool left_most = true; + + /* look up rb tree to find parent node */ + while (*p) { + parent = *p; + ve = rb_entry(parent, struct victim_entry, rb_node); + + if (mtime < ve->mtime) { + p = &(*p)->rb_left; + } else { + p = &(*p)->rb_right; + left_most = false; + } + } + + ve = __create_victim_entry(sbi, mtime, segno); + + rb_link_node(&ve->rb_node, parent, p); + rb_insert_color_cached(&ve->rb_node, root, left_most); +} + +static void add_victim_entry(struct f2fs_sb_info *sbi, + struct victim_sel_policy *p, unsigned int segno) +{ + struct sit_info *sit_i = SIT_I(sbi); + unsigned long long mtime = 0; + + if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) { + if (p->gc_mode == GC_AT && + get_valid_blocks(sbi, segno, true) == 0) + return; + } + + mtime = f2fs_get_section_mtime(sbi, segno); + f2fs_bug_on(sbi, mtime == INVALID_MTIME); + + /* Handle if the system time has changed by the user */ + if (mtime < sit_i->min_mtime) + sit_i->min_mtime = mtime; + if (mtime > sit_i->max_mtime) + sit_i->max_mtime = mtime; + if (mtime < sit_i->dirty_min_mtime) + sit_i->dirty_min_mtime = mtime; + if (mtime > sit_i->dirty_max_mtime) + sit_i->dirty_max_mtime = mtime; + + /* don't choose young section as candidate */ + if (sit_i->dirty_max_mtime - mtime < p->age_threshold) + return; + + __insert_victim_entry(sbi, mtime, segno); +} + +static void atgc_lookup_victim(struct f2fs_sb_info *sbi, + struct victim_sel_policy *p) +{ + struct sit_info *sit_i = SIT_I(sbi); + struct atgc_management *am = 
&sbi->am; + struct rb_root_cached *root = &am->root; + struct rb_node *node; + struct victim_entry *ve; + unsigned long long total_time; + unsigned long long age, u, accu; + unsigned long long max_mtime = sit_i->dirty_max_mtime; + unsigned long long min_mtime = sit_i->dirty_min_mtime; + unsigned int sec_blocks = CAP_BLKS_PER_SEC(sbi); + unsigned int vblocks; + unsigned int dirty_threshold = max(am->max_candidate_count, + am->candidate_ratio * + am->victim_count / 100); + unsigned int age_weight = am->age_weight; + unsigned int cost; + unsigned int iter = 0; + + if (max_mtime < min_mtime) + return; + + max_mtime += 1; + total_time = max_mtime - min_mtime; + + accu = div64_u64(ULLONG_MAX, total_time); + accu = min_t(unsigned long long, div_u64(accu, 100), + DEFAULT_ACCURACY_CLASS); + + node = rb_first_cached(root); +next: + ve = rb_entry_safe(node, struct victim_entry, rb_node); + if (!ve) + return; + + if (ve->mtime >= max_mtime || ve->mtime < min_mtime) + goto skip; + + /* age = 10000 * x% * 60 */ + age = div64_u64(accu * (max_mtime - ve->mtime), total_time) * + age_weight; + + vblocks = get_valid_blocks(sbi, ve->segno, true); + f2fs_bug_on(sbi, !vblocks || vblocks == sec_blocks); + + /* u = 10000 * x% * 40 */ + u = div64_u64(accu * (sec_blocks - vblocks), sec_blocks) * + (100 - age_weight); + + f2fs_bug_on(sbi, age + u >= UINT_MAX); + + cost = UINT_MAX - (age + u); + iter++; + + if (cost < p->min_cost || + (cost == p->min_cost && age > p->oldest_age)) { + p->min_cost = cost; + p->oldest_age = age; + p->min_segno = ve->segno; + } +skip: + if (iter < dirty_threshold) { + node = rb_next(node); + goto next; + } +} + +/* + * select candidates around source section in range of + * [target - dirty_threshold, target + dirty_threshold] + */ +static void atssr_lookup_victim(struct f2fs_sb_info *sbi, + struct victim_sel_policy *p) +{ + struct sit_info *sit_i = SIT_I(sbi); + struct atgc_management *am = &sbi->am; + struct victim_entry *ve; + unsigned long long age; + unsigned 
long long max_mtime = sit_i->dirty_max_mtime; + unsigned long long min_mtime = sit_i->dirty_min_mtime; + unsigned int vblocks; + unsigned int dirty_threshold = max(am->max_candidate_count, + am->candidate_ratio * + am->victim_count / 100); + unsigned int cost, iter; + int stage = 0; + + if (max_mtime < min_mtime) + return; + max_mtime += 1; +next_stage: + iter = 0; + ve = __lookup_victim_entry(sbi, p->age); +next_node: + if (!ve) { + if (stage++ == 0) + goto next_stage; + return; + } + + if (ve->mtime >= max_mtime || ve->mtime < min_mtime) + goto skip_node; + + age = max_mtime - ve->mtime; + + vblocks = get_seg_entry(sbi, ve->segno)->ckpt_valid_blocks; + f2fs_bug_on(sbi, !vblocks); + + /* rare case */ + if (vblocks == BLKS_PER_SEG(sbi)) + goto skip_node; + + iter++; + + age = max_mtime - abs(p->age - age); + cost = UINT_MAX - vblocks; + + if (cost < p->min_cost || + (cost == p->min_cost && age > p->oldest_age)) { + p->min_cost = cost; + p->oldest_age = age; + p->min_segno = ve->segno; + } +skip_node: + if (iter < dirty_threshold) { + ve = rb_entry(stage == 0 ? 
rb_prev(&ve->rb_node) : + rb_next(&ve->rb_node), + struct victim_entry, rb_node); + goto next_node; + } + + if (stage++ == 0) + goto next_stage; +} + +static void lookup_victim_by_age(struct f2fs_sb_info *sbi, + struct victim_sel_policy *p) +{ + f2fs_bug_on(sbi, !f2fs_check_victim_tree(sbi, &sbi->am.root)); + + if (p->gc_mode == GC_AT) + atgc_lookup_victim(sbi, p); + else if (p->alloc_mode == AT_SSR) + atssr_lookup_victim(sbi, p); + else + f2fs_bug_on(sbi, 1); +} + +static void release_victim_entry(struct f2fs_sb_info *sbi) +{ + struct atgc_management *am = &sbi->am; + struct victim_entry *ve, *tmp; + + list_for_each_entry_safe(ve, tmp, &am->victim_list, list) { + list_del(&ve->list); + kmem_cache_free(victim_entry_slab, ve); + am->victim_count--; + } + + am->root = RB_ROOT_CACHED; + + f2fs_bug_on(sbi, am->victim_count); + f2fs_bug_on(sbi, !list_empty(&am->victim_list)); +} + +static bool f2fs_pin_section(struct f2fs_sb_info *sbi, unsigned int segno) +{ + struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); + unsigned int secno = GET_SEC_FROM_SEG(sbi, segno); + + if (!dirty_i->enable_pin_section) + return false; + if (!test_and_set_bit(secno, dirty_i->pinned_secmap)) + dirty_i->pinned_secmap_cnt++; + return true; +} + +static bool f2fs_pinned_section_exists(struct dirty_seglist_info *dirty_i) +{ + return dirty_i->pinned_secmap_cnt; +} + +static bool f2fs_section_is_pinned(struct dirty_seglist_info *dirty_i, + unsigned int secno) +{ + return dirty_i->enable_pin_section && + f2fs_pinned_section_exists(dirty_i) && + test_bit(secno, dirty_i->pinned_secmap); +} + +static void f2fs_unpin_all_sections(struct f2fs_sb_info *sbi, bool enable) +{ + unsigned int bitmap_size = f2fs_bitmap_size(MAIN_SECS(sbi)); + + if (f2fs_pinned_section_exists(DIRTY_I(sbi))) { + memset(DIRTY_I(sbi)->pinned_secmap, 0, bitmap_size); + DIRTY_I(sbi)->pinned_secmap_cnt = 0; + } + DIRTY_I(sbi)->enable_pin_section = enable; +} + +static int f2fs_gc_pinned_control(struct inode *inode, int gc_type, + 
unsigned int segno) +{ + if (!f2fs_is_pinned_file(inode)) + return 0; + if (gc_type != FG_GC) + return -EBUSY; + if (!f2fs_pin_section(F2FS_I_SB(inode), segno)) + f2fs_pin_file_control(inode, true); + return -EAGAIN; } /* @@ -236,22 +768,81 @@ static unsigned int get_gc_cost(struct f2fs_sb_info *sbi, unsigned int segno, * When it is called from SSR segment selection, it finds a segment * which has minimum valid blocks and removes it from dirty seglist. */ -static int get_victim_by_default(struct f2fs_sb_info *sbi, - unsigned int *result, int gc_type, int type, char alloc_mode) +int f2fs_get_victim(struct f2fs_sb_info *sbi, unsigned int *result, + int gc_type, int type, char alloc_mode, + unsigned long long age, bool one_time) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); - struct victim_sel_policy p; - unsigned int secno, max_cost; - int nsearched = 0; + struct sit_info *sm = SIT_I(sbi); + struct victim_sel_policy p = {0}; + unsigned int secno, last_victim; + unsigned int last_segment; + unsigned int nsearched; + unsigned int valid_thresh_ratio = 100; + bool is_atgc; + int ret = 0; + + mutex_lock(&dirty_i->seglist_lock); + last_segment = MAIN_SECS(sbi) * SEGS_PER_SEC(sbi); p.alloc_mode = alloc_mode; - select_policy(sbi, gc_type, type, &p); + p.age = age; + p.age_threshold = sbi->am.age_threshold; + if (one_time) { + p.one_time_gc = one_time; + if (has_enough_free_secs(sbi, 0, NR_PERSISTENT_LOG)) + valid_thresh_ratio = sbi->gc_thread->valid_thresh_ratio; + } +retry: + select_policy(sbi, gc_type, type, &p); p.min_segno = NULL_SEGNO; - p.min_cost = max_cost = get_max_cost(sbi, &p); + p.oldest_age = 0; + p.min_cost = get_max_cost(sbi, &p); - mutex_lock(&dirty_i->seglist_lock); + is_atgc = (p.gc_mode == GC_AT || p.alloc_mode == AT_SSR); + nsearched = 0; + + if (is_atgc) + SIT_I(sbi)->dirty_min_mtime = ULLONG_MAX; + + if (*result != NULL_SEGNO) { + if (!get_valid_blocks(sbi, *result, false)) { + ret = -ENODATA; + goto out; + } + + if (sec_usage_check(sbi, 
GET_SEC_FROM_SEG(sbi, *result))) { + ret = -EBUSY; + goto out; + } + if (gc_type == FG_GC) + clear_bit(GET_SEC_FROM_SEG(sbi, *result), dirty_i->victim_secmap); + p.min_segno = *result; + goto got_result; + } + + ret = -ENODATA; + if (p.max_search == 0) + goto out; + if (__is_large_section(sbi) && p.alloc_mode == LFS) { + if (sbi->next_victim_seg[BG_GC] != NULL_SEGNO) { + p.min_segno = sbi->next_victim_seg[BG_GC]; + *result = p.min_segno; + sbi->next_victim_seg[BG_GC] = NULL_SEGNO; + goto got_result; + } + if (gc_type == FG_GC && + sbi->next_victim_seg[FG_GC] != NULL_SEGNO) { + p.min_segno = sbi->next_victim_seg[FG_GC]; + *result = p.min_segno; + sbi->next_victim_seg[FG_GC] = NULL_SEGNO; + goto got_result; + } + } + + last_victim = sm->last_victim[p.gc_mode]; if (p.alloc_mode == LFS && gc_type == FG_GC) { p.min_segno = check_bg_victims(sbi); if (p.min_segno != NULL_SEGNO) @@ -259,101 +850,164 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, } while (1) { - unsigned long cost; - unsigned int segno; - - segno = find_next_bit(p.dirty_segmap, - TOTAL_SEGS(sbi), p.offset); - if (segno >= TOTAL_SEGS(sbi)) { - if (sbi->last_victim[p.gc_mode]) { - sbi->last_victim[p.gc_mode] = 0; + unsigned long cost, *dirty_bitmap; + unsigned int unit_no, segno; + + dirty_bitmap = p.dirty_bitmap; + unit_no = find_next_bit(dirty_bitmap, + last_segment / p.ofs_unit, + p.offset / p.ofs_unit); + segno = unit_no * p.ofs_unit; + if (segno >= last_segment) { + if (sm->last_victim[p.gc_mode]) { + last_segment = + sm->last_victim[p.gc_mode]; + sm->last_victim[p.gc_mode] = 0; p.offset = 0; continue; } break; } - p.offset = ((segno / p.ofs_unit) * p.ofs_unit) + p.ofs_unit; - secno = GET_SECNO(sbi, segno); + + p.offset = segno + p.ofs_unit; + nsearched++; + +#ifdef CONFIG_F2FS_CHECK_FS + /* + * skip selecting the invalid segno (that is failed due to block + * validity check failure during GC) to avoid endless GC loop in + * such cases. 
+ */ + if (test_bit(segno, sm->invalid_segmap)) + goto next; +#endif + + secno = GET_SEC_FROM_SEG(sbi, segno); if (sec_usage_check(sbi, secno)) - continue; + goto next; + + /* Don't touch checkpointed data */ + if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) { + if (p.alloc_mode == LFS) { + /* + * LFS is set to find source section during GC. + * The victim should have no checkpointed data. + */ + if (get_ckpt_valid_blocks(sbi, segno, true)) + goto next; + } else { + /* + * SSR | AT_SSR are set to find target segment + * for writes which can be full by checkpointed + * and newly written blocks. + */ + if (!f2fs_segment_has_free_slot(sbi, segno)) + goto next; + } + } + if (gc_type == BG_GC && test_bit(secno, dirty_i->victim_secmap)) - continue; + goto next; - cost = get_gc_cost(sbi, segno, &p); + if (gc_type == FG_GC && f2fs_section_is_pinned(dirty_i, secno)) + goto next; + + if (is_atgc) { + add_victim_entry(sbi, &p, segno); + goto next; + } + + cost = get_gc_cost(sbi, segno, &p, valid_thresh_ratio); if (p.min_cost > cost) { p.min_segno = segno; p.min_cost = cost; } - - if (cost == max_cost) - continue; - - if (nsearched++ >= MAX_VICTIM_SEARCH) { - sbi->last_victim[p.gc_mode] = segno; +next: + if (nsearched >= p.max_search) { + if (!sm->last_victim[p.gc_mode] && segno <= last_victim) + sm->last_victim[p.gc_mode] = + last_victim + p.ofs_unit; + else + sm->last_victim[p.gc_mode] = segno + p.ofs_unit; + sm->last_victim[p.gc_mode] %= + (MAIN_SECS(sbi) * SEGS_PER_SEC(sbi)); break; } } + + /* get victim for GC_AT/AT_SSR */ + if (is_atgc) { + lookup_victim_by_age(sbi, &p); + release_victim_entry(sbi); + } + + if (is_atgc && p.min_segno == NULL_SEGNO && + sm->elapsed_time < p.age_threshold) { + p.age_threshold = 0; + goto retry; + } + if (p.min_segno != NULL_SEGNO) { got_it: + *result = (p.min_segno / p.ofs_unit) * p.ofs_unit; +got_result: if (p.alloc_mode == LFS) { - secno = GET_SECNO(sbi, p.min_segno); + secno = GET_SEC_FROM_SEG(sbi, p.min_segno); if (gc_type == 
FG_GC) sbi->cur_victim_sec = secno; else set_bit(secno, dirty_i->victim_secmap); } - *result = (p.min_segno / p.ofs_unit) * p.ofs_unit; + ret = 0; + } +out: + if (p.min_segno != NULL_SEGNO) trace_f2fs_get_victim(sbi->sb, type, gc_type, &p, sbi->cur_victim_sec, prefree_segments(sbi), free_segments(sbi)); - } mutex_unlock(&dirty_i->seglist_lock); - return (p.min_segno == NULL_SEGNO) ? 0 : 1; + return ret; } -static const struct victim_selection default_v_ops = { - .get_victim = get_victim_by_default, -}; - -static struct inode *find_gc_inode(nid_t ino, struct list_head *ilist) +static struct inode *find_gc_inode(struct gc_inode_list *gc_list, nid_t ino) { struct inode_entry *ie; - list_for_each_entry(ie, ilist, list) - if (ie->inode->i_ino == ino) - return ie->inode; + ie = radix_tree_lookup(&gc_list->iroot, ino); + if (ie) + return ie->inode; return NULL; } -static void add_gc_inode(struct inode *inode, struct list_head *ilist) +static void add_gc_inode(struct gc_inode_list *gc_list, struct inode *inode) { struct inode_entry *new_ie; - if (inode == find_gc_inode(inode->i_ino, ilist)) { + if (inode == find_gc_inode(gc_list, inode->i_ino)) { iput(inode); return; } -repeat: - new_ie = kmem_cache_alloc(winode_slab, GFP_NOFS); - if (!new_ie) { - cond_resched(); - goto repeat; - } + new_ie = f2fs_kmem_cache_alloc(f2fs_inode_entry_slab, + GFP_NOFS, true, NULL); new_ie->inode = inode; - list_add_tail(&new_ie->list, ilist); + + f2fs_radix_tree_insert(&gc_list->iroot, inode->i_ino, new_ie); + list_add_tail(&new_ie->list, &gc_list->ilist); } -static void put_gc_inode(struct list_head *ilist) +static void put_gc_inode(struct gc_inode_list *gc_list) { struct inode_entry *ie, *next_ie; - list_for_each_entry_safe(ie, next_ie, ilist, list) { + + list_for_each_entry_safe(ie, next_ie, &gc_list->ilist, list) { + radix_tree_delete(&gc_list->iroot, ie->inode->i_ino); iput(ie->inode); list_del(&ie->list); - kmem_cache_free(winode_slab, ie); + kmem_cache_free(f2fs_inode_entry_slab, ie); } 
} @@ -364,10 +1018,10 @@ static int check_valid_map(struct f2fs_sb_info *sbi, struct seg_entry *sentry; int ret; - mutex_lock(&sit_i->sentry_lock); + down_read(&sit_i->sentry_lock); sentry = get_seg_entry(sbi, segno); ret = f2fs_test_bit(offset, sentry->cur_valid_map); - mutex_unlock(&sit_i->sentry_lock); + up_read(&sit_i->sentry_lock); return ret; } @@ -376,68 +1030,82 @@ static int check_valid_map(struct f2fs_sb_info *sbi, * On validity, copy that node with cold status, otherwise (invalid node) * ignore that. */ -static void gc_node_segment(struct f2fs_sb_info *sbi, +static int gc_node_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, unsigned int segno, int gc_type) { - bool initial = true; struct f2fs_summary *entry; + block_t start_addr; int off; + int phase = 0; + bool fggc = (gc_type == FG_GC); + int submitted = 0; + unsigned int usable_blks_in_seg = f2fs_usable_blks_in_seg(sbi, segno); + + start_addr = START_BLOCK(sbi, segno); next_step: entry = sum; - for (off = 0; off < sbi->blocks_per_seg; off++, entry++) { + if (fggc && phase == 2) + atomic_inc(&sbi->wb_sync_req[NODE]); + + for (off = 0; off < usable_blks_in_seg; off++, entry++) { nid_t nid = le32_to_cpu(entry->nid); - struct page *node_page; + struct folio *node_folio; + struct node_info ni; + int err; /* stop BG_GC if there is not enough free sections. 
*/ - if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0)) - return; + if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0, 0)) + return submitted; if (check_valid_map(sbi, segno, off) == 0) continue; - if (initial) { - ra_node_page(sbi, nid); + if (phase == 0) { + f2fs_ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), 1, + META_NAT, true); continue; } - node_page = get_node_page(sbi, nid); - if (IS_ERR(node_page)) + + if (phase == 1) { + f2fs_ra_node_page(sbi, nid); continue; + } - /* set page dirty and write it */ - if (gc_type == FG_GC) { - f2fs_submit_bio(sbi, NODE, true); - wait_on_page_writeback(node_page); - set_page_dirty(node_page); - } else { - if (!PageWriteback(node_page)) - set_page_dirty(node_page); + /* phase == 2 */ + node_folio = f2fs_get_node_folio(sbi, nid, NODE_TYPE_REGULAR); + if (IS_ERR(node_folio)) + continue; + + /* block may become invalid during f2fs_get_node_folio */ + if (check_valid_map(sbi, segno, off) == 0) { + f2fs_folio_put(node_folio, true); + continue; } - f2fs_put_page(node_page, 1); - stat_inc_node_blk_count(sbi, 1); - } - if (initial) { - initial = false; - goto next_step; - } + if (f2fs_get_node_info(sbi, nid, &ni, false)) { + f2fs_folio_put(node_folio, true); + continue; + } - if (gc_type == FG_GC) { - struct writeback_control wbc = { - .sync_mode = WB_SYNC_ALL, - .nr_to_write = LONG_MAX, - .for_reclaim = 0, - }; - sync_node_pages(sbi, 0, &wbc); + if (ni.blk_addr != start_addr + off) { + f2fs_folio_put(node_folio, true); + continue; + } - /* - * In the case of FG_GC, it'd be better to reclaim this victim - * completely. 
- */ - if (get_valid_blocks(sbi, segno, 1) != 0) - goto next_step; + err = f2fs_move_node_folio(node_folio, gc_type); + if (!err && gc_type == FG_GC) + submitted++; + stat_inc_node_blk_count(sbi, 1, gc_type); } + + if (++phase < 3) + goto next_step; + + if (fggc) + atomic_dec(&sbi->wb_sync_req[NODE]); + return submitted; } /* @@ -447,7 +1115,7 @@ next_step: * as indirect or double indirect node blocks, are given, it must be a caller's * bug. */ -block_t start_bidx_of_node(unsigned int node_ofs) +block_t f2fs_start_bidx_of_node(unsigned int node_ofs, struct inode *inode) { unsigned int indirect_blks = 2 * NIDS_PER_BLOCK + 4; unsigned int bidx; @@ -459,71 +1127,403 @@ block_t start_bidx_of_node(unsigned int node_ofs) bidx = node_ofs - 1; } else if (node_ofs <= indirect_blks) { int dec = (node_ofs - 4) / (NIDS_PER_BLOCK + 1); + bidx = node_ofs - 2 - dec; } else { int dec = (node_ofs - indirect_blks - 3) / (NIDS_PER_BLOCK + 1); + bidx = node_ofs - 5 - dec; } - return bidx * ADDRS_PER_BLOCK + ADDRS_PER_INODE; + return bidx * ADDRS_PER_BLOCK(inode) + ADDRS_PER_INODE(inode); } -static int check_dnode(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, +static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, struct node_info *dni, block_t blkaddr, unsigned int *nofs) { - struct page *node_page; + struct folio *node_folio; nid_t nid; - unsigned int ofs_in_node; + unsigned int ofs_in_node, max_addrs, base; block_t source_blkaddr; nid = le32_to_cpu(sum->nid); ofs_in_node = le16_to_cpu(sum->ofs_in_node); - node_page = get_node_page(sbi, nid); - if (IS_ERR(node_page)) - return 0; + node_folio = f2fs_get_node_folio(sbi, nid, NODE_TYPE_REGULAR); + if (IS_ERR(node_folio)) + return false; - get_node_info(sbi, nid, dni); + if (f2fs_get_node_info(sbi, nid, dni, false)) { + f2fs_folio_put(node_folio, true); + return false; + } if (sum->version != dni->version) { - f2fs_put_page(node_page, 1); - return 0; + f2fs_warn(sbi, "%s: valid data with mismatched node version.", 
+ __func__); + set_sbi_flag(sbi, SBI_NEED_FSCK); + } + + if (f2fs_check_nid_range(sbi, dni->ino)) { + f2fs_folio_put(node_folio, true); + return false; } - *nofs = ofs_of_node(node_page); - source_blkaddr = datablock_addr(node_page, ofs_in_node); - f2fs_put_page(node_page, 1); + if (IS_INODE(node_folio)) { + base = offset_in_addr(F2FS_INODE(node_folio)); + max_addrs = DEF_ADDRS_PER_INODE; + } else { + base = 0; + max_addrs = DEF_ADDRS_PER_BLOCK; + } - if (source_blkaddr != blkaddr) - return 0; - return 1; + if (base + ofs_in_node >= max_addrs) { + f2fs_err(sbi, "Inconsistent blkaddr offset: base:%u, ofs_in_node:%u, max:%u, ino:%u, nid:%u", + base, ofs_in_node, max_addrs, dni->ino, dni->nid); + f2fs_folio_put(node_folio, true); + return false; + } + + *nofs = ofs_of_node(node_folio); + source_blkaddr = data_blkaddr(NULL, node_folio, ofs_in_node); + f2fs_folio_put(node_folio, true); + + if (source_blkaddr != blkaddr) { +#ifdef CONFIG_F2FS_CHECK_FS + unsigned int segno = GET_SEGNO(sbi, blkaddr); + unsigned long offset = GET_BLKOFF_FROM_SEG0(sbi, blkaddr); + + if (unlikely(check_valid_map(sbi, segno, offset))) { + if (!test_and_set_bit(segno, SIT_I(sbi)->invalid_segmap)) { + f2fs_err(sbi, "mismatched blkaddr %u (source_blkaddr %u) in seg %u", + blkaddr, source_blkaddr, segno); + set_sbi_flag(sbi, SBI_NEED_FSCK); + } + } +#endif + return false; + } + return true; +} + +static int ra_data_block(struct inode *inode, pgoff_t index) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct address_space *mapping = f2fs_is_cow_file(inode) ? 
+ F2FS_I(inode)->atomic_inode->i_mapping : inode->i_mapping; + struct dnode_of_data dn; + struct folio *folio, *efolio; + struct f2fs_io_info fio = { + .sbi = sbi, + .ino = inode->i_ino, + .type = DATA, + .temp = COLD, + .op = REQ_OP_READ, + .op_flags = 0, + .encrypted_page = NULL, + .in_list = 0, + }; + int err; + + folio = f2fs_grab_cache_folio(mapping, index, true); + if (IS_ERR(folio)) + return PTR_ERR(folio); + + if (f2fs_lookup_read_extent_cache_block(inode, index, + &dn.data_blkaddr)) { + if (unlikely(!f2fs_is_valid_blkaddr(sbi, dn.data_blkaddr, + DATA_GENERIC_ENHANCE_READ))) { + err = -EFSCORRUPTED; + goto put_folio; + } + goto got_it; + } + + set_new_dnode(&dn, inode, NULL, NULL, 0); + err = f2fs_get_dnode_of_data(&dn, index, LOOKUP_NODE); + if (err) + goto put_folio; + f2fs_put_dnode(&dn); + + if (!__is_valid_data_blkaddr(dn.data_blkaddr)) { + err = -ENOENT; + goto put_folio; + } + if (unlikely(!f2fs_is_valid_blkaddr(sbi, dn.data_blkaddr, + DATA_GENERIC_ENHANCE))) { + err = -EFSCORRUPTED; + goto put_folio; + } +got_it: + /* read folio */ + fio.folio = folio; + fio.new_blkaddr = fio.old_blkaddr = dn.data_blkaddr; + + /* + * don't cache encrypted data into meta inode until previous dirty + * data were writebacked to avoid racing between GC and flush. 
+ */ + f2fs_folio_wait_writeback(folio, DATA, true, true); + + f2fs_wait_on_block_writeback(inode, dn.data_blkaddr); + + efolio = f2fs_filemap_get_folio(META_MAPPING(sbi), dn.data_blkaddr, + FGP_LOCK | FGP_CREAT, GFP_NOFS); + if (IS_ERR(efolio)) { + err = PTR_ERR(efolio); + goto put_folio; + } + + fio.encrypted_page = &efolio->page; + + err = f2fs_submit_page_bio(&fio); + if (err) + goto put_encrypted_page; + f2fs_put_page(fio.encrypted_page, false); + f2fs_folio_put(folio, true); + + f2fs_update_iostat(sbi, inode, FS_DATA_READ_IO, F2FS_BLKSIZE); + f2fs_update_iostat(sbi, NULL, FS_GDATA_READ_IO, F2FS_BLKSIZE); + + return 0; +put_encrypted_page: + f2fs_put_page(fio.encrypted_page, true); +put_folio: + f2fs_folio_put(folio, true); + return err; +} + +/* + * Move data block via META_MAPPING while keeping locked data page. + * This can be used to move blocks, aka LBAs, directly on disk. + */ +static int move_data_block(struct inode *inode, block_t bidx, + int gc_type, unsigned int segno, int off) +{ + struct address_space *mapping = f2fs_is_cow_file(inode) ? + F2FS_I(inode)->atomic_inode->i_mapping : inode->i_mapping; + struct f2fs_io_info fio = { + .sbi = F2FS_I_SB(inode), + .ino = inode->i_ino, + .type = DATA, + .temp = COLD, + .op = REQ_OP_READ, + .op_flags = 0, + .encrypted_page = NULL, + .in_list = 0, + }; + struct dnode_of_data dn; + struct f2fs_summary sum; + struct node_info ni; + struct folio *folio, *mfolio, *efolio; + block_t newaddr; + int err = 0; + bool lfs_mode = f2fs_lfs_mode(fio.sbi); + int type = fio.sbi->am.atgc_enabled && (gc_type == BG_GC) && + (fio.sbi->gc_mode != GC_URGENT_HIGH) ? 
+ CURSEG_ALL_DATA_ATGC : CURSEG_COLD_DATA; + + /* do not read out */ + folio = f2fs_grab_cache_folio(mapping, bidx, false); + if (IS_ERR(folio)) + return PTR_ERR(folio); + + if (!check_valid_map(F2FS_I_SB(inode), segno, off)) { + err = -ENOENT; + goto out; + } + + err = f2fs_gc_pinned_control(inode, gc_type, segno); + if (err) + goto out; + + set_new_dnode(&dn, inode, NULL, NULL, 0); + err = f2fs_get_dnode_of_data(&dn, bidx, LOOKUP_NODE); + if (err) + goto out; + + if (unlikely(dn.data_blkaddr == NULL_ADDR)) { + folio_clear_uptodate(folio); + err = -ENOENT; + goto put_out; + } + + /* + * don't cache encrypted data into meta inode until previous dirty + * data were writebacked to avoid racing between GC and flush. + */ + f2fs_folio_wait_writeback(folio, DATA, true, true); + + f2fs_wait_on_block_writeback(inode, dn.data_blkaddr); + + err = f2fs_get_node_info(fio.sbi, dn.nid, &ni, false); + if (err) + goto put_out; + + /* read page */ + fio.folio = folio; + fio.new_blkaddr = fio.old_blkaddr = dn.data_blkaddr; + + if (lfs_mode) + f2fs_down_write(&fio.sbi->io_order_lock); + + mfolio = f2fs_grab_cache_folio(META_MAPPING(fio.sbi), + fio.old_blkaddr, false); + if (IS_ERR(mfolio)) { + err = PTR_ERR(mfolio); + goto up_out; + } + + fio.encrypted_page = folio_file_page(mfolio, fio.old_blkaddr); + + /* read source block in mfolio */ + if (!folio_test_uptodate(mfolio)) { + err = f2fs_submit_page_bio(&fio); + if (err) { + f2fs_folio_put(mfolio, true); + goto up_out; + } + + f2fs_update_iostat(fio.sbi, inode, FS_DATA_READ_IO, + F2FS_BLKSIZE); + f2fs_update_iostat(fio.sbi, NULL, FS_GDATA_READ_IO, + F2FS_BLKSIZE); + + folio_lock(mfolio); + if (unlikely(!is_meta_folio(mfolio) || + !folio_test_uptodate(mfolio))) { + err = -EIO; + f2fs_folio_put(mfolio, true); + goto up_out; + } + } + + set_summary(&sum, dn.nid, dn.ofs_in_node, ni.version); + + /* allocate block address */ + err = f2fs_allocate_data_block(fio.sbi, NULL, fio.old_blkaddr, &newaddr, + &sum, type, NULL); + if (err) { + 
f2fs_folio_put(mfolio, true); + /* filesystem should shutdown, no need to recovery block */ + goto up_out; + } + + efolio = f2fs_filemap_get_folio(META_MAPPING(fio.sbi), newaddr, + FGP_LOCK | FGP_CREAT, GFP_NOFS); + if (IS_ERR(efolio)) { + err = PTR_ERR(efolio); + f2fs_folio_put(mfolio, true); + goto recover_block; + } + + fio.encrypted_page = &efolio->page; + + /* write target block */ + f2fs_wait_on_page_writeback(fio.encrypted_page, DATA, true, true); + memcpy(page_address(fio.encrypted_page), + folio_address(mfolio), PAGE_SIZE); + f2fs_folio_put(mfolio, true); + + f2fs_invalidate_internal_cache(fio.sbi, fio.old_blkaddr, 1); + + set_page_dirty(fio.encrypted_page); + if (clear_page_dirty_for_io(fio.encrypted_page)) + dec_page_count(fio.sbi, F2FS_DIRTY_META); + + set_page_writeback(fio.encrypted_page); + + fio.op = REQ_OP_WRITE; + fio.op_flags = REQ_SYNC; + fio.new_blkaddr = newaddr; + f2fs_submit_page_write(&fio); + + f2fs_update_iostat(fio.sbi, NULL, FS_GC_DATA_IO, F2FS_BLKSIZE); + + f2fs_update_data_blkaddr(&dn, newaddr); + set_inode_flag(inode, FI_APPEND_WRITE); + + f2fs_put_page(fio.encrypted_page, true); +recover_block: + if (err) + f2fs_do_replace_block(fio.sbi, &sum, newaddr, fio.old_blkaddr, + true, true, true); +up_out: + if (lfs_mode) + f2fs_up_write(&fio.sbi->io_order_lock); +put_out: + f2fs_put_dnode(&dn); +out: + f2fs_folio_put(folio, true); + return err; } -static void move_data_page(struct inode *inode, struct page *page, int gc_type) +static int move_data_page(struct inode *inode, block_t bidx, int gc_type, + unsigned int segno, int off) { + struct folio *folio; + int err = 0; + + folio = f2fs_get_lock_data_folio(inode, bidx, true); + if (IS_ERR(folio)) + return PTR_ERR(folio); + + if (!check_valid_map(F2FS_I_SB(inode), segno, off)) { + err = -ENOENT; + goto out; + } + + err = f2fs_gc_pinned_control(inode, gc_type, segno); + if (err) + goto out; + if (gc_type == BG_GC) { - if (PageWriteback(page)) + if (folio_test_writeback(folio)) { + err = 
-EAGAIN; goto out; - set_page_dirty(page); - set_cold_data(page); + } + folio_mark_dirty(folio); + folio_set_f2fs_gcing(folio); } else { - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + struct f2fs_io_info fio = { + .sbi = F2FS_I_SB(inode), + .ino = inode->i_ino, + .type = DATA, + .temp = COLD, + .op = REQ_OP_WRITE, + .op_flags = REQ_SYNC, + .old_blkaddr = NULL_ADDR, + .folio = folio, + .encrypted_page = NULL, + .need_lock = LOCK_REQ, + .io_type = FS_GC_DATA_IO, + }; + bool is_dirty = folio_test_dirty(folio); + +retry: + f2fs_folio_wait_writeback(folio, DATA, true, true); - if (PageWriteback(page)) { - f2fs_submit_bio(sbi, DATA, true); - wait_on_page_writeback(page); + folio_mark_dirty(folio); + if (folio_clear_dirty_for_io(folio)) { + inode_dec_dirty_pages(inode); + f2fs_remove_dirty_inode(inode); } - if (clear_page_dirty_for_io(page) && - S_ISDIR(inode->i_mode)) { - dec_page_count(sbi, F2FS_DIRTY_DENTS); - inode_dec_dirty_dents(inode); + folio_set_f2fs_gcing(folio); + + err = f2fs_do_write_data_page(&fio); + if (err) { + folio_clear_f2fs_gcing(folio); + if (err == -ENOMEM) { + memalloc_retry_wait(GFP_NOFS); + goto retry; + } + if (is_dirty) + folio_mark_dirty(folio); } - set_cold_data(page); - do_write_data_page(page); - clear_cold_data(page); } out: - f2fs_put_page(page, 1); + f2fs_folio_put(folio, true); + return err; } /* @@ -533,197 +1533,857 @@ out: * If the parent node is not valid or the data block address is different, * the victim data block is ignored. 
*/ -static void gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, - struct list_head *ilist, unsigned int segno, int gc_type) +static int gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, + struct gc_inode_list *gc_list, unsigned int segno, int gc_type, + bool force_migrate) { struct super_block *sb = sbi->sb; struct f2fs_summary *entry; block_t start_addr; int off; int phase = 0; + int submitted = 0; + unsigned int usable_blks_in_seg = f2fs_usable_blks_in_seg(sbi, segno); start_addr = START_BLOCK(sbi, segno); next_step: entry = sum; - for (off = 0; off < sbi->blocks_per_seg; off++, entry++) { - struct page *data_page; + for (off = 0; off < usable_blks_in_seg; off++, entry++) { struct inode *inode; struct node_info dni; /* dnode info for the data */ unsigned int ofs_in_node, nofs; block_t start_bidx; + nid_t nid = le32_to_cpu(entry->nid); - /* stop BG_GC if there is not enough free sections. */ - if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0)) - return; + /* + * stop BG_GC if there is not enough free sections. + * Or, stop GC if the segment becomes fully valid caused by + * race condition along with SSR block allocation. 
+ */ + if ((gc_type == BG_GC && has_not_enough_free_secs(sbi, 0, 0)) || + (!force_migrate && get_valid_blocks(sbi, segno, true) == + CAP_BLKS_PER_SEC(sbi))) + return submitted; if (check_valid_map(sbi, segno, off) == 0) continue; if (phase == 0) { - ra_node_page(sbi, le32_to_cpu(entry->nid)); + f2fs_ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), 1, + META_NAT, true); + continue; + } + + if (phase == 1) { + f2fs_ra_node_page(sbi, nid); continue; } /* Get an inode by ino with checking validity */ - if (check_dnode(sbi, entry, &dni, start_addr + off, &nofs) == 0) + if (!is_alive(sbi, entry, &dni, start_addr + off, &nofs)) continue; - if (phase == 1) { - ra_node_page(sbi, dni.ino); + if (phase == 2) { + f2fs_ra_node_page(sbi, dni.ino); continue; } - start_bidx = start_bidx_of_node(nofs); ofs_in_node = le16_to_cpu(entry->ofs_in_node); - if (phase == 2) { + if (phase == 3) { + struct folio *data_folio; + int err; + inode = f2fs_iget(sb, dni.ino); if (IS_ERR(inode)) continue; - data_page = find_data_page(inode, - start_bidx + ofs_in_node, false); - if (IS_ERR(data_page)) - goto next_iput; + if (is_bad_inode(inode) || + special_file(inode->i_mode)) { + iput(inode); + continue; + } - f2fs_put_page(data_page, 0); - add_gc_inode(inode, ilist); - } else { - inode = find_gc_inode(dni.ino, ilist); - if (inode) { - data_page = get_lock_data_page(inode, - start_bidx + ofs_in_node); - if (IS_ERR(data_page)) + if (f2fs_has_inline_data(inode)) { + iput(inode); + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_err_ratelimited(sbi, + "inode %lx has both inline_data flag and " + "data block, nid=%u, ofs_in_node=%u", + inode->i_ino, dni.nid, ofs_in_node); + continue; + } + + err = f2fs_gc_pinned_control(inode, gc_type, segno); + if (err == -EAGAIN) { + iput(inode); + return submitted; + } + + if (!f2fs_down_write_trylock( + &F2FS_I(inode)->i_gc_rwsem[WRITE])) { + iput(inode); + sbi->skipped_gc_rwsem++; + continue; + } + + start_bidx = f2fs_start_bidx_of_node(nofs, inode) + + ofs_in_node; + + if 
(f2fs_meta_inode_gc_required(inode)) { + int err = ra_data_block(inode, start_bidx); + + f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + if (err) { + iput(inode); continue; - move_data_page(inode, data_page, gc_type); - stat_inc_data_blk_count(sbi, 1); + } + add_gc_inode(gc_list, inode); + continue; + } + + data_folio = f2fs_get_read_data_folio(inode, start_bidx, + REQ_RAHEAD, true, NULL); + f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + if (IS_ERR(data_folio)) { + iput(inode); + continue; } + + f2fs_folio_put(data_folio, false); + add_gc_inode(gc_list, inode); + continue; } - continue; -next_iput: - iput(inode); - } - if (++phase < 4) - goto next_step; + /* phase 4 */ + inode = find_gc_inode(gc_list, dni.ino); + if (inode) { + struct f2fs_inode_info *fi = F2FS_I(inode); + bool locked = false; + int err; - if (gc_type == FG_GC) { - f2fs_submit_bio(sbi, DATA, true); + if (S_ISREG(inode->i_mode)) { + if (!f2fs_down_write_trylock(&fi->i_gc_rwsem[WRITE])) { + sbi->skipped_gc_rwsem++; + continue; + } + if (!f2fs_down_write_trylock( + &fi->i_gc_rwsem[READ])) { + sbi->skipped_gc_rwsem++; + f2fs_up_write(&fi->i_gc_rwsem[WRITE]); + continue; + } + locked = true; - /* - * In the case of FG_GC, it'd be better to reclaim this victim - * completely. 
- */ - if (get_valid_blocks(sbi, segno, 1) != 0) { - phase = 2; - goto next_step; + /* wait for all inflight aio data */ + inode_dio_wait(inode); + } + + start_bidx = f2fs_start_bidx_of_node(nofs, inode) + + ofs_in_node; + if (f2fs_meta_inode_gc_required(inode)) + err = move_data_block(inode, start_bidx, + gc_type, segno, off); + else + err = move_data_page(inode, start_bidx, gc_type, + segno, off); + + if (!err && (gc_type == FG_GC || + f2fs_meta_inode_gc_required(inode))) + submitted++; + + if (locked) { + f2fs_up_write(&fi->i_gc_rwsem[READ]); + f2fs_up_write(&fi->i_gc_rwsem[WRITE]); + } + + stat_inc_data_blk_count(sbi, 1, gc_type); } } + + if (++phase < 5) + goto next_step; + + return submitted; } static int __get_victim(struct f2fs_sb_info *sbi, unsigned int *victim, - int gc_type, int type) + int gc_type, bool one_time) { struct sit_info *sit_i = SIT_I(sbi); int ret; - mutex_lock(&sit_i->sentry_lock); - ret = DIRTY_I(sbi)->v_ops->get_victim(sbi, victim, gc_type, type, LFS); - mutex_unlock(&sit_i->sentry_lock); + + down_write(&sit_i->sentry_lock); + ret = f2fs_get_victim(sbi, victim, gc_type, NO_CHECK_TYPE, + LFS, 0, one_time); + up_write(&sit_i->sentry_lock); return ret; } -static void do_garbage_collect(struct f2fs_sb_info *sbi, unsigned int segno, - struct list_head *ilist, int gc_type) +static int do_garbage_collect(struct f2fs_sb_info *sbi, + unsigned int start_segno, + struct gc_inode_list *gc_list, int gc_type, + bool force_migrate, bool one_time) { - struct page *sum_page; - struct f2fs_summary_block *sum; struct blk_plug plug; + unsigned int segno = start_segno; + unsigned int end_segno = start_segno + SEGS_PER_SEC(sbi); + unsigned int sec_end_segno; + int seg_freed = 0, migrated = 0; + unsigned char type = IS_DATASEG(get_seg_entry(sbi, segno)->type) ? + SUM_TYPE_DATA : SUM_TYPE_NODE; + unsigned char data_type = (type == SUM_TYPE_DATA) ? 
DATA : NODE; + int submitted = 0, sum_blk_cnt; + + if (__is_large_section(sbi)) { + sec_end_segno = rounddown(end_segno, SEGS_PER_SEC(sbi)); - /* read segment summary of victim */ - sum_page = get_sum_page(sbi, segno); - if (IS_ERR(sum_page)) - return; + /* + * zone-capacity can be less than zone-size in zoned devices, + * resulting in less than expected usable segments in the zone, + * calculate the end segno in the zone which can be garbage + * collected + */ + if (f2fs_sb_has_blkzoned(sbi)) + sec_end_segno -= SEGS_PER_SEC(sbi) - + f2fs_usable_segs_in_sec(sbi); + + if (gc_type == BG_GC || one_time) { + unsigned int window_granularity = + sbi->migration_window_granularity; + + if (f2fs_sb_has_blkzoned(sbi) && + !has_enough_free_blocks(sbi, + sbi->gc_thread->boost_zoned_gc_percent)) + window_granularity *= + sbi->gc_thread->boost_gc_multiple; + + end_segno = start_segno + window_granularity; + } + + if (end_segno > sec_end_segno) + end_segno = sec_end_segno; + } + + sanity_check_seg_type(sbi, get_seg_entry(sbi, segno)->type); + + segno = rounddown(segno, SUMS_PER_BLOCK); + sum_blk_cnt = DIV_ROUND_UP(end_segno - segno, SUMS_PER_BLOCK); + /* readahead multi ssa blocks those have contiguous address */ + if (__is_large_section(sbi)) + f2fs_ra_meta_pages(sbi, GET_SUM_BLOCK(sbi, segno), + sum_blk_cnt, META_SSA, true); + + /* reference all summary page */ + while (segno < end_segno) { + struct folio *sum_folio = f2fs_get_sum_folio(sbi, segno); + + segno += SUMS_PER_BLOCK; + if (IS_ERR(sum_folio)) { + int err = PTR_ERR(sum_folio); + + end_segno = segno - SUMS_PER_BLOCK; + segno = rounddown(start_segno, SUMS_PER_BLOCK); + while (segno < end_segno) { + sum_folio = filemap_get_folio(META_MAPPING(sbi), + GET_SUM_BLOCK(sbi, segno)); + folio_put_refs(sum_folio, 2); + segno += SUMS_PER_BLOCK; + } + return err; + } + folio_unlock(sum_folio); + } blk_start_plug(&plug); - sum = page_address(sum_page); + segno = start_segno; + while (segno < end_segno) { + unsigned int cur_segno; - 
switch (GET_SUM_TYPE((&sum->footer))) { - case SUM_TYPE_NODE: - gc_node_segment(sbi, sum->entries, segno, gc_type); - break; - case SUM_TYPE_DATA: - gc_data_segment(sbi, sum->entries, ilist, segno, gc_type); - break; + /* find segment summary of victim */ + struct folio *sum_folio = filemap_get_folio(META_MAPPING(sbi), + GET_SUM_BLOCK(sbi, segno)); + unsigned int block_end_segno = rounddown(segno, SUMS_PER_BLOCK) + + SUMS_PER_BLOCK; + + if (block_end_segno > end_segno) + block_end_segno = end_segno; + + if (is_cursec(sbi, GET_SEC_FROM_SEG(sbi, segno))) { + f2fs_err(sbi, "%s: segment %u is used by log", + __func__, segno); + f2fs_bug_on(sbi, 1); + goto next_block; + } + + if (!folio_test_uptodate(sum_folio) || + unlikely(f2fs_cp_error(sbi))) + goto next_block; + + for (cur_segno = segno; cur_segno < block_end_segno; + cur_segno++) { + struct f2fs_summary_block *sum; + + if (get_valid_blocks(sbi, cur_segno, false) == 0) + goto freed; + if (gc_type == BG_GC && __is_large_section(sbi) && + migrated >= sbi->migration_granularity) + continue; + + sum = SUM_BLK_PAGE_ADDR(sum_folio, cur_segno); + if (type != GET_SUM_TYPE((&sum->footer))) { + f2fs_err(sbi, "Inconsistent segment (%u) type " + "[%d, %d] in SSA and SIT", + cur_segno, type, + GET_SUM_TYPE((&sum->footer))); + f2fs_stop_checkpoint(sbi, false, + STOP_CP_REASON_CORRUPTED_SUMMARY); + continue; + } + + /* + * this is to avoid deadlock: + * - lock_page(sum_page) - f2fs_replace_block + * - check_valid_map() - down_write(sentry_lock) + * - down_read(sentry_lock) - change_curseg() + * - lock_page(sum_page) + */ + if (type == SUM_TYPE_NODE) + submitted += gc_node_segment(sbi, sum->entries, + cur_segno, gc_type); + else + submitted += gc_data_segment(sbi, sum->entries, + gc_list, cur_segno, + gc_type, force_migrate); + + stat_inc_gc_seg_count(sbi, data_type, gc_type); + sbi->gc_reclaimed_segs[sbi->gc_mode]++; + migrated++; + +freed: + if (gc_type == FG_GC && + get_valid_blocks(sbi, cur_segno, false) == 0) + seg_freed++; + 
+ if (__is_large_section(sbi)) + sbi->next_victim_seg[gc_type] = + (cur_segno + 1 < sec_end_segno) ? + cur_segno + 1 : NULL_SEGNO; + } +next_block: + folio_put_refs(sum_folio, 2); + segno = block_end_segno; } + + if (submitted) + f2fs_submit_merged_write(sbi, data_type); + blk_finish_plug(&plug); - stat_inc_seg_count(sbi, GET_SUM_TYPE((&sum->footer))); - stat_inc_call_count(sbi->stat_info); + if (migrated) + stat_inc_gc_sec_count(sbi, data_type, gc_type); - f2fs_put_page(sum_page, 1); + return seg_freed; } -int f2fs_gc(struct f2fs_sb_info *sbi) +int f2fs_gc(struct f2fs_sb_info *sbi, struct f2fs_gc_control *gc_control) { - struct list_head ilist; - unsigned int segno, i; - int gc_type = BG_GC; - int nfree = 0; - int ret = -1; - - INIT_LIST_HEAD(&ilist); + int gc_type = gc_control->init_gc_type; + unsigned int segno = gc_control->victim_segno; + int sec_freed = 0, seg_freed = 0, total_freed = 0, total_sec_freed = 0; + int ret = 0; + struct cp_control cpc; + struct gc_inode_list gc_list = { + .ilist = LIST_HEAD_INIT(gc_list.ilist), + .iroot = RADIX_TREE_INIT(gc_list.iroot, GFP_NOFS), + }; + unsigned int skipped_round = 0, round = 0; + unsigned int upper_secs; + + trace_f2fs_gc_begin(sbi->sb, gc_type, gc_control->no_bg_gc, + gc_control->nr_free_secs, + get_pages(sbi, F2FS_DIRTY_NODES), + get_pages(sbi, F2FS_DIRTY_DENTS), + get_pages(sbi, F2FS_DIRTY_IMETA), + free_sections(sbi), + free_segments(sbi), + reserved_segments(sbi), + prefree_segments(sbi)); + + cpc.reason = __get_cp_reason(sbi); gc_more: - if (!(sbi->sb->s_flags & MS_ACTIVE)) + sbi->skipped_gc_rwsem = 0; + if (unlikely(!(sbi->sb->s_flags & SB_ACTIVE))) { + ret = -EINVAL; + goto stop; + } + if (unlikely(f2fs_cp_error(sbi))) { + ret = -EIO; goto stop; + } - if (gc_type == BG_GC && has_not_enough_free_secs(sbi, nfree)) { + /* Let's run FG_GC, if we don't have enough space. 
*/ + if (has_not_enough_free_secs(sbi, 0, 0)) { gc_type = FG_GC; - write_checkpoint(sbi, false); + gc_control->one_time = false; + + /* + * For example, if there are many prefree_segments below given + * threshold, we can make them free by checkpoint. Then, we + * secure free segments which doesn't need fggc any more. + */ + if (prefree_segments(sbi)) { + stat_inc_cp_call_count(sbi, TOTAL_CALL); + ret = f2fs_write_checkpoint(sbi, &cpc); + if (ret) + goto stop; + /* Reset due to checkpoint */ + sec_freed = 0; + } + } + + /* f2fs_balance_fs doesn't need to do BG_GC in critical path. */ + if (gc_type == BG_GC && gc_control->no_bg_gc) { + ret = -EINVAL; + goto stop; + } +retry: + ret = __get_victim(sbi, &segno, gc_type, gc_control->one_time); + if (ret) { + /* allow to search victim from sections has pinned data */ + if (ret == -ENODATA && gc_type == FG_GC && + f2fs_pinned_section_exists(DIRTY_I(sbi))) { + f2fs_unpin_all_sections(sbi, false); + goto retry; + } + goto stop; } - if (!__get_victim(sbi, &segno, gc_type, NO_CHECK_TYPE)) + seg_freed = do_garbage_collect(sbi, segno, &gc_list, gc_type, + gc_control->should_migrate_blocks, + gc_control->one_time); + if (seg_freed < 0) goto stop; - ret = 0; - for (i = 0; i < sbi->segs_per_sec; i++) - do_garbage_collect(sbi, segno + i, &ilist, gc_type); + total_freed += seg_freed; + + if (seg_freed == f2fs_usable_segs_in_sec(sbi)) { + sec_freed++; + total_sec_freed++; + } + + if (gc_control->one_time) + goto stop; if (gc_type == FG_GC) { sbi->cur_victim_sec = NULL_SEGNO; - nfree++; - WARN_ON(get_valid_blocks(sbi, segno, sbi->segs_per_sec)); + + if (has_enough_free_secs(sbi, sec_freed, 0)) { + if (!gc_control->no_bg_gc && + total_sec_freed < gc_control->nr_free_secs) + goto go_gc_more; + goto stop; + } + if (sbi->skipped_gc_rwsem) + skipped_round++; + round++; + if (skipped_round > MAX_SKIP_GC_COUNT && + skipped_round * 2 >= round) { + stat_inc_cp_call_count(sbi, TOTAL_CALL); + ret = f2fs_write_checkpoint(sbi, &cpc); + goto stop; 
+ } + } else if (has_enough_free_secs(sbi, 0, 0)) { + goto stop; } - if (has_not_enough_free_secs(sbi, nfree)) - goto gc_more; + __get_secs_required(sbi, NULL, &upper_secs, NULL); + + /* + * Write checkpoint to reclaim prefree segments. + * We need more three extra sections for writer's data/node/dentry. + */ + if (free_sections(sbi) <= upper_secs + NR_GC_CHECKPOINT_SECS && + prefree_segments(sbi)) { + stat_inc_cp_call_count(sbi, TOTAL_CALL); + ret = f2fs_write_checkpoint(sbi, &cpc); + if (ret) + goto stop; + /* Reset due to checkpoint */ + sec_freed = 0; + } +go_gc_more: + segno = NULL_SEGNO; + goto gc_more; - if (gc_type == FG_GC) - write_checkpoint(sbi, false); stop: - mutex_unlock(&sbi->gc_mutex); + SIT_I(sbi)->last_victim[ALLOC_NEXT] = 0; + SIT_I(sbi)->last_victim[FLUSH_DEVICE] = gc_control->victim_segno; + + if (gc_type == FG_GC) + f2fs_unpin_all_sections(sbi, true); + + trace_f2fs_gc_end(sbi->sb, ret, total_freed, total_sec_freed, + get_pages(sbi, F2FS_DIRTY_NODES), + get_pages(sbi, F2FS_DIRTY_DENTS), + get_pages(sbi, F2FS_DIRTY_IMETA), + free_sections(sbi), + free_segments(sbi), + reserved_segments(sbi), + prefree_segments(sbi)); + + f2fs_up_write(&sbi->gc_lock); - put_gc_inode(&ilist); + put_gc_inode(&gc_list); + + if (gc_control->err_gc_skipped && !ret) + ret = total_sec_freed ? 0 : -EAGAIN; return ret; } -void build_gc_manager(struct f2fs_sb_info *sbi) +int __init f2fs_create_garbage_collection_cache(void) { - DIRTY_I(sbi)->v_ops = &default_v_ops; + victim_entry_slab = f2fs_kmem_cache_create("f2fs_victim_entry", + sizeof(struct victim_entry)); + return victim_entry_slab ? 
0 : -ENOMEM; } -int __init create_gc_caches(void) +void f2fs_destroy_garbage_collection_cache(void) { - winode_slab = f2fs_kmem_cache_create("f2fs_gc_inodes", - sizeof(struct inode_entry), NULL); - if (!winode_slab) - return -ENOMEM; + kmem_cache_destroy(victim_entry_slab); +} + +static void init_atgc_management(struct f2fs_sb_info *sbi) +{ + struct atgc_management *am = &sbi->am; + + if (test_opt(sbi, ATGC) && + SIT_I(sbi)->elapsed_time >= DEF_GC_THREAD_AGE_THRESHOLD) + am->atgc_enabled = true; + + am->root = RB_ROOT_CACHED; + INIT_LIST_HEAD(&am->victim_list); + am->victim_count = 0; + + am->candidate_ratio = DEF_GC_THREAD_CANDIDATE_RATIO; + am->max_candidate_count = DEF_GC_THREAD_MAX_CANDIDATE_COUNT; + am->age_weight = DEF_GC_THREAD_AGE_WEIGHT; + am->age_threshold = DEF_GC_THREAD_AGE_THRESHOLD; +} + +void f2fs_build_gc_manager(struct f2fs_sb_info *sbi) +{ + sbi->gc_pin_file_threshold = DEF_GC_FAILED_PINNED_FILES; + + /* give warm/cold data area from slower device */ + if (f2fs_is_multi_device(sbi) && !__is_large_section(sbi)) + SIT_I(sbi)->last_victim[ALLOC_NEXT] = + GET_SEGNO(sbi, FDEV(0).end_blk) + 1; + + init_atgc_management(sbi); +} + +int f2fs_gc_range(struct f2fs_sb_info *sbi, + unsigned int start_seg, unsigned int end_seg, + bool dry_run, unsigned int dry_run_sections) +{ + unsigned int segno; + unsigned int gc_secs = dry_run_sections; + + if (unlikely(f2fs_cp_error(sbi))) + return -EIO; + + for (segno = start_seg; segno <= end_seg; segno += SEGS_PER_SEC(sbi)) { + struct gc_inode_list gc_list = { + .ilist = LIST_HEAD_INIT(gc_list.ilist), + .iroot = RADIX_TREE_INIT(gc_list.iroot, GFP_NOFS), + }; + + /* + * avoid migrating empty section, as it can be allocated by + * log in parallel. 
+ */ + if (!get_valid_blocks(sbi, segno, true)) + continue; + + if (is_cursec(sbi, GET_SEC_FROM_SEG(sbi, segno))) + continue; + + do_garbage_collect(sbi, segno, &gc_list, FG_GC, true, false); + put_gc_inode(&gc_list); + + if (!dry_run && get_valid_blocks(sbi, segno, true)) + return -EAGAIN; + if (dry_run && dry_run_sections && + !get_valid_blocks(sbi, segno, true) && --gc_secs == 0) + break; + + if (fatal_signal_pending(current)) + return -ERESTARTSYS; + } + return 0; } -void destroy_gc_caches(void) +static int free_segment_range(struct f2fs_sb_info *sbi, + unsigned int secs, bool dry_run) +{ + unsigned int next_inuse, start, end; + struct cp_control cpc = { CP_RESIZE, 0, 0, 0 }; + int gc_mode, gc_type; + int err = 0; + int type; + + /* Force block allocation for GC */ + MAIN_SECS(sbi) -= secs; + start = MAIN_SECS(sbi) * SEGS_PER_SEC(sbi); + end = MAIN_SEGS(sbi) - 1; + + mutex_lock(&DIRTY_I(sbi)->seglist_lock); + for (gc_mode = 0; gc_mode < MAX_GC_POLICY; gc_mode++) + if (SIT_I(sbi)->last_victim[gc_mode] >= start) + SIT_I(sbi)->last_victim[gc_mode] = 0; + + for (gc_type = BG_GC; gc_type <= FG_GC; gc_type++) + if (sbi->next_victim_seg[gc_type] >= start) + sbi->next_victim_seg[gc_type] = NULL_SEGNO; + mutex_unlock(&DIRTY_I(sbi)->seglist_lock); + + /* Move out cursegs from the target range */ + for (type = CURSEG_HOT_DATA; type < NR_CURSEG_PERSIST_TYPE; type++) { + err = f2fs_allocate_segment_for_resize(sbi, type, start, end); + if (err) + goto out; + } + + /* do GC to move out valid blocks in the range */ + err = f2fs_gc_range(sbi, start, end, dry_run, 0); + if (err || dry_run) + goto out; + + stat_inc_cp_call_count(sbi, TOTAL_CALL); + err = f2fs_write_checkpoint(sbi, &cpc); + if (err) + goto out; + + next_inuse = find_next_inuse(FREE_I(sbi), end + 1, start); + if (next_inuse <= end) { + f2fs_err(sbi, "segno %u should be free but still inuse!", + next_inuse); + f2fs_bug_on(sbi, 1); + } +out: + MAIN_SECS(sbi) += secs; + return err; +} + +static void 
update_sb_metadata(struct f2fs_sb_info *sbi, int secs) { - kmem_cache_destroy(winode_slab); + struct f2fs_super_block *raw_sb = F2FS_RAW_SUPER(sbi); + int section_count; + int segment_count; + int segment_count_main; + long long block_count; + int segs = secs * SEGS_PER_SEC(sbi); + + f2fs_down_write(&sbi->sb_lock); + + section_count = le32_to_cpu(raw_sb->section_count); + segment_count = le32_to_cpu(raw_sb->segment_count); + segment_count_main = le32_to_cpu(raw_sb->segment_count_main); + block_count = le64_to_cpu(raw_sb->block_count); + + raw_sb->section_count = cpu_to_le32(section_count + secs); + raw_sb->segment_count = cpu_to_le32(segment_count + segs); + raw_sb->segment_count_main = cpu_to_le32(segment_count_main + segs); + raw_sb->block_count = cpu_to_le64(block_count + + (long long)SEGS_TO_BLKS(sbi, segs)); + if (f2fs_is_multi_device(sbi)) { + int last_dev = sbi->s_ndevs - 1; + int dev_segs = + le32_to_cpu(raw_sb->devs[last_dev].total_segments); + + raw_sb->devs[last_dev].total_segments = + cpu_to_le32(dev_segs + segs); + } + + f2fs_up_write(&sbi->sb_lock); +} + +static void update_fs_metadata(struct f2fs_sb_info *sbi, int secs) +{ + int segs = secs * SEGS_PER_SEC(sbi); + long long blks = SEGS_TO_BLKS(sbi, segs); + long long user_block_count = + le64_to_cpu(F2FS_CKPT(sbi)->user_block_count); + + SM_I(sbi)->segment_count = (int)SM_I(sbi)->segment_count + segs; + MAIN_SEGS(sbi) = (int)MAIN_SEGS(sbi) + segs; + MAIN_SECS(sbi) += secs; + if (sbi->allocate_section_hint > MAIN_SECS(sbi)) + sbi->allocate_section_hint = MAIN_SECS(sbi); + FREE_I(sbi)->free_sections = (int)FREE_I(sbi)->free_sections + secs; + FREE_I(sbi)->free_segments = (int)FREE_I(sbi)->free_segments + segs; + F2FS_CKPT(sbi)->user_block_count = cpu_to_le64(user_block_count + blks); + + if (f2fs_is_multi_device(sbi)) { + int last_dev = sbi->s_ndevs - 1; + + sbi->allocate_section_hint = FDEV(0).total_segments / + SEGS_PER_SEC(sbi); + + FDEV(last_dev).total_segments = + (int)FDEV(last_dev).total_segments 
+ segs; + FDEV(last_dev).end_blk = + (long long)FDEV(last_dev).end_blk + blks; +#ifdef CONFIG_BLK_DEV_ZONED + FDEV(last_dev).nr_blkz = FDEV(last_dev).nr_blkz + + div_u64(blks, sbi->blocks_per_blkz); +#endif + } +} + +int f2fs_resize_fs(struct file *filp, __u64 block_count) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(file_inode(filp)); + __u64 old_block_count, shrunk_blocks; + struct cp_control cpc = { CP_RESIZE, 0, 0, 0 }; + unsigned int secs; + int err = 0; + __u32 rem; + + old_block_count = le64_to_cpu(F2FS_RAW_SUPER(sbi)->block_count); + if (block_count > old_block_count) + return -EINVAL; + + if (f2fs_is_multi_device(sbi)) { + int last_dev = sbi->s_ndevs - 1; + __u64 last_segs = FDEV(last_dev).total_segments; + + if (block_count + SEGS_TO_BLKS(sbi, last_segs) <= + old_block_count) + return -EINVAL; + } + + /* new fs size should align to section size */ + div_u64_rem(block_count, BLKS_PER_SEC(sbi), &rem); + if (rem) + return -EINVAL; + + if (block_count == old_block_count) + return 0; + + if (is_sbi_flag_set(sbi, SBI_NEED_FSCK)) { + f2fs_err(sbi, "Should run fsck to repair first."); + return -EFSCORRUPTED; + } + + if (test_opt(sbi, DISABLE_CHECKPOINT)) { + f2fs_err(sbi, "Checkpoint should be enabled."); + return -EINVAL; + } + + err = mnt_want_write_file(filp); + if (err) + return err; + + shrunk_blocks = old_block_count - block_count; + secs = div_u64(shrunk_blocks, BLKS_PER_SEC(sbi)); + + /* stop other GC */ + if (!f2fs_down_write_trylock(&sbi->gc_lock)) { + err = -EAGAIN; + goto out_drop_write; + } + + /* stop CP to protect MAIN_SEC in free_segment_range */ + f2fs_lock_op(sbi); + + spin_lock(&sbi->stat_lock); + if (shrunk_blocks + valid_user_blocks(sbi) + + sbi->current_reserved_blocks + sbi->unusable_block_count + + F2FS_OPTION(sbi).root_reserved_blocks > sbi->user_block_count) + err = -ENOSPC; + spin_unlock(&sbi->stat_lock); + + if (err) + goto out_unlock; + + err = free_segment_range(sbi, secs, true); + +out_unlock: + f2fs_unlock_op(sbi); + 
f2fs_up_write(&sbi->gc_lock); +out_drop_write: + mnt_drop_write_file(filp); + if (err) + return err; + + err = freeze_super(sbi->sb, FREEZE_HOLDER_KERNEL, NULL); + if (err) + return err; + + if (f2fs_readonly(sbi->sb)) { + err = thaw_super(sbi->sb, FREEZE_HOLDER_KERNEL, NULL); + if (err) + return err; + return -EROFS; + } + + f2fs_down_write(&sbi->gc_lock); + f2fs_down_write(&sbi->cp_global_sem); + + spin_lock(&sbi->stat_lock); + if (shrunk_blocks + valid_user_blocks(sbi) + + sbi->current_reserved_blocks + sbi->unusable_block_count + + F2FS_OPTION(sbi).root_reserved_blocks > sbi->user_block_count) + err = -ENOSPC; + else + sbi->user_block_count -= shrunk_blocks; + spin_unlock(&sbi->stat_lock); + if (err) + goto out_err; + + set_sbi_flag(sbi, SBI_IS_RESIZEFS); + err = free_segment_range(sbi, secs, false); + if (err) + goto recover_out; + + update_sb_metadata(sbi, -secs); + + err = f2fs_commit_super(sbi, false); + if (err) { + update_sb_metadata(sbi, secs); + goto recover_out; + } + + update_fs_metadata(sbi, -secs); + clear_sbi_flag(sbi, SBI_IS_RESIZEFS); + set_sbi_flag(sbi, SBI_IS_DIRTY); + + stat_inc_cp_call_count(sbi, TOTAL_CALL); + err = f2fs_write_checkpoint(sbi, &cpc); + if (err) { + update_fs_metadata(sbi, secs); + update_sb_metadata(sbi, secs); + f2fs_commit_super(sbi, false); + } +recover_out: + clear_sbi_flag(sbi, SBI_IS_RESIZEFS); + if (err) { + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_err(sbi, "resize_fs failed, should run fsck to repair!"); + + spin_lock(&sbi->stat_lock); + sbi->user_block_count += shrunk_blocks; + spin_unlock(&sbi->stat_lock); + } +out_err: + f2fs_up_write(&sbi->cp_global_sem); + f2fs_up_write(&sbi->gc_lock); + thaw_super(sbi->sb, FREEZE_HOLDER_KERNEL, NULL); + return err; } diff --git a/fs/f2fs/gc.h b/fs/f2fs/gc.h index 2c6a6bd08322..6c4d4567571e 100644 --- a/fs/f2fs/gc.h +++ b/fs/f2fs/gc.h @@ -1,102 +1,202 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * fs/f2fs/gc.h * * Copyright (c) 2012 Samsung Electronics Co., Ltd. 
* http://www.samsung.com/ - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. */ #define GC_THREAD_MIN_WB_PAGES 1 /* * a threshold to determine * whether IO subsystem is idle * or not */ -#define GC_THREAD_MIN_SLEEP_TIME 30000 /* milliseconds */ -#define GC_THREAD_MAX_SLEEP_TIME 60000 -#define GC_THREAD_NOGC_SLEEP_TIME 300000 /* wait 5 min */ +#define DEF_GC_THREAD_URGENT_SLEEP_TIME 500 /* 500 ms */ +#define DEF_GC_THREAD_MIN_SLEEP_TIME 30000 /* milliseconds */ +#define DEF_GC_THREAD_MAX_SLEEP_TIME 60000 +#define DEF_GC_THREAD_NOGC_SLEEP_TIME 300000 /* wait 5 min */ + +/* GC sleep parameters for zoned deivces */ +#define DEF_GC_THREAD_MIN_SLEEP_TIME_ZONED 10 +#define DEF_GC_THREAD_MAX_SLEEP_TIME_ZONED 20 +#define DEF_GC_THREAD_NOGC_SLEEP_TIME_ZONED 60000 + +/* choose candidates from sections which has age of more than 7 days */ +#define DEF_GC_THREAD_AGE_THRESHOLD (60 * 60 * 24 * 7) +#define DEF_GC_THREAD_CANDIDATE_RATIO 20 /* select 20% oldest sections as candidates */ +#define DEF_GC_THREAD_MAX_CANDIDATE_COUNT 10 /* select at most 10 sections as candidates */ +#define DEF_GC_THREAD_AGE_WEIGHT 60 /* age weight */ +#define DEF_GC_THREAD_VALID_THRESH_RATIO 80 /* do not GC over 80% valid block ratio for one time GC */ +#define DEFAULT_ACCURACY_CLASS 10000 /* accuracy class */ + #define LIMIT_INVALID_BLOCK 40 /* percentage over total user space */ #define LIMIT_FREE_BLOCK 40 /* percentage over invalid + free space */ +#define LIMIT_NO_ZONED_GC 60 /* percentage over total user space of no gc for zoned devices */ +#define LIMIT_BOOST_ZONED_GC 25 /* percentage over total user space of boosted gc for zoned devices */ +#define DEF_MIGRATION_WINDOW_GRANULARITY_ZONED 3 +#define BOOST_GC_MULTIPLE 5 +#define ZONED_PIN_SEC_REQUIRED_COUNT 1 + +#define DEF_GC_FAILED_PINNED_FILES 2048 +#define MAX_GC_FAILED_PINNED_FILES USHRT_MAX + /* Search 
max. number of dirty segments to select a victim segment */ -#define MAX_VICTIM_SEARCH 20 +#define DEF_MAX_VICTIM_SEARCH 4096 /* covers 8GB */ + +#define NR_GC_CHECKPOINT_SECS (3) /* data/node/dentry sections */ struct f2fs_gc_kthread { struct task_struct *f2fs_gc_task; wait_queue_head_t gc_wait_queue_head; + + /* for gc sleep time */ + unsigned int urgent_sleep_time; + unsigned int min_sleep_time; + unsigned int max_sleep_time; + unsigned int no_gc_sleep_time; + + /* for changing gc mode */ + bool gc_wake; + + /* for GC_MERGE mount option */ + wait_queue_head_t fggc_wq; /* + * caller of f2fs_balance_fs() + * will wait on this wait queue. + */ + + /* for gc control for zoned devices */ + unsigned int no_zoned_gc_percent; + unsigned int boost_zoned_gc_percent; + unsigned int valid_thresh_ratio; + unsigned int boost_gc_multiple; + unsigned int boost_gc_greedy; }; -struct inode_entry { +struct gc_inode_list { + struct list_head ilist; + struct radix_tree_root iroot; +}; + +struct victim_entry { + struct rb_node rb_node; /* rb node located in rb-tree */ + unsigned long long mtime; /* mtime of section */ + unsigned int segno; /* segment No. */ struct list_head list; - struct inode *inode; }; /* * inline functions */ + +/* + * On a Zoned device zone-capacity can be less than zone-size and if + * zone-capacity is not aligned to f2fs segment size(2MB), then the segment + * starting just before zone-capacity has some blocks spanning across the + * zone-capacity, these blocks are not usable. + * Such spanning segments can be in free list so calculate the sum of usable + * blocks in currently free segments including normal and spanning segments. 
+ */ +static inline block_t free_segs_blk_count_zoned(struct f2fs_sb_info *sbi) +{ + block_t free_seg_blks = 0; + struct free_segmap_info *free_i = FREE_I(sbi); + int j; + + spin_lock(&free_i->segmap_lock); + for (j = 0; j < MAIN_SEGS(sbi); j++) + if (!test_bit(j, free_i->free_segmap)) + free_seg_blks += f2fs_usable_blks_in_seg(sbi, j); + spin_unlock(&free_i->segmap_lock); + + return free_seg_blks; +} + +static inline block_t free_segs_blk_count(struct f2fs_sb_info *sbi) +{ + if (f2fs_sb_has_blkzoned(sbi)) + return free_segs_blk_count_zoned(sbi); + + return SEGS_TO_BLKS(sbi, free_segments(sbi)); +} + static inline block_t free_user_blocks(struct f2fs_sb_info *sbi) { - if (free_segments(sbi) < overprovision_segments(sbi)) + block_t free_blks, ovp_blks; + + free_blks = free_segs_blk_count(sbi); + ovp_blks = SEGS_TO_BLKS(sbi, overprovision_segments(sbi)); + + if (free_blks < ovp_blks) return 0; - else - return (free_segments(sbi) - overprovision_segments(sbi)) - << sbi->log_blocks_per_seg; + + return free_blks - ovp_blks; } -static inline block_t limit_invalid_user_blocks(struct f2fs_sb_info *sbi) +static inline block_t limit_invalid_user_blocks(block_t user_block_count) { - return (long)(sbi->user_block_count * LIMIT_INVALID_BLOCK) / 100; + return (long)(user_block_count * LIMIT_INVALID_BLOCK) / 100; } -static inline block_t limit_free_user_blocks(struct f2fs_sb_info *sbi) +static inline block_t limit_free_user_blocks(block_t reclaimable_user_blocks) { - block_t reclaimable_user_blocks = sbi->user_block_count - - written_block_count(sbi); return (long)(reclaimable_user_blocks * LIMIT_FREE_BLOCK) / 100; } -static inline long increase_sleep_time(long wait) +static inline void increase_sleep_time(struct f2fs_gc_kthread *gc_th, + unsigned int *wait) { - if (wait == GC_THREAD_NOGC_SLEEP_TIME) - return wait; + unsigned int min_time = gc_th->min_sleep_time; + unsigned int max_time = gc_th->max_sleep_time; + + if (*wait == gc_th->no_gc_sleep_time) + return; - wait += 
GC_THREAD_MIN_SLEEP_TIME; - if (wait > GC_THREAD_MAX_SLEEP_TIME) - wait = GC_THREAD_MAX_SLEEP_TIME; - return wait; + if ((long long)*wait + (long long)min_time > (long long)max_time) + *wait = max_time; + else + *wait += min_time; } -static inline long decrease_sleep_time(long wait) +static inline void decrease_sleep_time(struct f2fs_gc_kthread *gc_th, + unsigned int *wait) { - if (wait == GC_THREAD_NOGC_SLEEP_TIME) - wait = GC_THREAD_MAX_SLEEP_TIME; + unsigned int min_time = gc_th->min_sleep_time; - wait -= GC_THREAD_MIN_SLEEP_TIME; - if (wait <= GC_THREAD_MIN_SLEEP_TIME) - wait = GC_THREAD_MIN_SLEEP_TIME; - return wait; + if (*wait == gc_th->no_gc_sleep_time) + *wait = gc_th->max_sleep_time; + + if ((long long)*wait - (long long)min_time < (long long)min_time) + *wait = min_time; + else + *wait -= min_time; +} + +static inline bool has_enough_free_blocks(struct f2fs_sb_info *sbi, + unsigned int limit_perc) +{ + return free_sections(sbi) > ((sbi->total_sections * limit_perc) / 100); } static inline bool has_enough_invalid_blocks(struct f2fs_sb_info *sbi) { - block_t invalid_user_blocks = sbi->user_block_count - - written_block_count(sbi); + block_t user_block_count = sbi->user_block_count; + block_t invalid_user_blocks = user_block_count - + written_block_count(sbi); /* - * Background GC is triggered with the following condition. + * Background GC is triggered with the following conditions. * 1. There are a number of invalid blocks. * 2. There is not enough free space. 
*/ - if (invalid_user_blocks > limit_invalid_user_blocks(sbi) && - free_user_blocks(sbi) < limit_free_user_blocks(sbi)) - return true; - return false; + return (invalid_user_blocks > + limit_invalid_user_blocks(user_block_count) && + free_user_blocks(sbi) < + limit_free_user_blocks(invalid_user_blocks)); } -static inline int is_idle(struct f2fs_sb_info *sbi) +static inline bool need_to_boost_gc(struct f2fs_sb_info *sbi) { - struct block_device *bdev = sbi->sb->s_bdev; - struct request_queue *q = bdev_get_queue(bdev); - struct request_list *rl = &q->root_rl; - return !(rl->count[BLK_RW_SYNC]) && !(rl->count[BLK_RW_ASYNC]); + if (f2fs_sb_has_blkzoned(sbi)) + return !has_enough_free_blocks(sbi, + sbi->gc_thread->boost_zoned_gc_percent); + return has_enough_invalid_blocks(sbi); } diff --git a/fs/f2fs/hash.c b/fs/f2fs/hash.c index 6eb8d269b53b..049ce50cec9b 100644 --- a/fs/f2fs/hash.c +++ b/fs/f2fs/hash.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * fs/f2fs/hash.c * @@ -7,16 +8,12 @@ * Portions of this code from linux/fs/ext3/hash.c * * Copyright (C) 2002 by Theodore Ts'o - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. 
*/ #include <linux/types.h> #include <linux/fs.h> #include <linux/f2fs_fs.h> -#include <linux/cryptohash.h> #include <linux/pagemap.h> +#include <linux/unicode.h> #include "f2fs.h" @@ -42,7 +39,8 @@ static void TEA_transform(unsigned int buf[4], unsigned int const in[]) buf[1] += b1; } -static void str2hashbuf(const char *msg, size_t len, unsigned int *buf, int num) +static void str2hashbuf(const unsigned char *msg, size_t len, + unsigned int *buf, int num) { unsigned pad, val; int i; @@ -69,24 +67,16 @@ static void str2hashbuf(const char *msg, size_t len, unsigned int *buf, int num) *buf++ = pad; } -f2fs_hash_t f2fs_dentry_hash(const char *name, size_t len) +static u32 TEA_hash_name(const u8 *p, size_t len) { - __u32 hash; - f2fs_hash_t f2fs_hash; - const char *p; __u32 in[8], buf[4]; - if ((len <= 2) && (name[0] == '.') && - (name[1] == '.' || name[1] == '\0')) - return 0; - /* Initialize the default seed for the hash checksum functions */ buf[0] = 0x67452301; buf[1] = 0xefcdab89; buf[2] = 0x98badcfe; buf[3] = 0x10325476; - p = name; while (1) { str2hashbuf(p, len, in, 4); TEA_transform(buf, in); @@ -95,7 +85,53 @@ f2fs_hash_t f2fs_dentry_hash(const char *name, size_t len) break; len -= 16; } - hash = buf[0]; - f2fs_hash = cpu_to_le32(hash & ~F2FS_HASH_COL_BIT); - return f2fs_hash; + return buf[0] & ~F2FS_HASH_COL_BIT; +} + +/* + * Compute @fname->hash. For all directories, @fname->disk_name must be set. + * For casefolded directories, @fname->usr_fname must be set, and also + * @fname->cf_name if the filename is valid Unicode and is not "." or "..". + */ +void f2fs_hash_filename(const struct inode *dir, struct f2fs_filename *fname) +{ + const u8 *name = fname->disk_name.name; + size_t len = fname->disk_name.len; + + WARN_ON_ONCE(!name); + + if (is_dot_dotdot(name, len)) { + fname->hash = 0; + return; + } + +#if IS_ENABLED(CONFIG_UNICODE) + if (IS_CASEFOLDED(dir)) { + /* + * If the casefolded name is provided, hash it instead of the + * on-disk name. 
If the casefolded name is *not* provided, that + * should only be because the name wasn't valid Unicode or was + * "." or "..", so fall back to treating the name as an opaque + * byte sequence. Note that to handle encrypted directories, + * the fallback must use usr_fname (plaintext) rather than + * disk_name (ciphertext). + */ + WARN_ON_ONCE(!fname->usr_fname->name); + if (fname->cf_name.name) { + name = fname->cf_name.name; + len = fname->cf_name.len; + } else { + name = fname->usr_fname->name; + len = fname->usr_fname->len; + } + if (IS_ENCRYPTED(dir)) { + struct qstr tmp = QSTR_INIT(name, len); + + fname->hash = + cpu_to_le32(fscrypt_fname_siphash(dir, &tmp)); + return; + } + } +#endif + fname->hash = cpu_to_le32(TEA_hash_name(name, len)); } diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c new file mode 100644 index 000000000000..e5c6a08b7e4f --- /dev/null +++ b/fs/f2fs/inline.c @@ -0,0 +1,834 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * fs/f2fs/inline.c + * Copyright (c) 2013, Intel Corporation + * Authors: Huajun Li <huajun.li@intel.com> + * Haicheng Li <haicheng.li@intel.com> + */ + +#include <linux/fs.h> +#include <linux/f2fs_fs.h> +#include <linux/fiemap.h> + +#include "f2fs.h" +#include "node.h" +#include <trace/events/f2fs.h> + +static bool support_inline_data(struct inode *inode) +{ + if (f2fs_used_in_atomic_write(inode)) + return false; + if (!S_ISREG(inode->i_mode) && !S_ISLNK(inode->i_mode)) + return false; + if (i_size_read(inode) > MAX_INLINE_DATA(inode)) + return false; + return true; +} + +bool f2fs_may_inline_data(struct inode *inode) +{ + if (!support_inline_data(inode)) + return false; + + return !f2fs_post_read_required(inode); +} + +static bool inode_has_blocks(struct inode *inode, struct folio *ifolio) +{ + struct f2fs_inode *ri = F2FS_INODE(ifolio); + int i; + + if (F2FS_HAS_BLOCKS(inode)) + return true; + + for (i = 0; i < DEF_NIDS_PER_INODE; i++) { + if (ri->i_nid[i]) + return true; + } + return false; +} + +bool 
f2fs_sanity_check_inline_data(struct inode *inode, struct folio *ifolio) +{ + if (!f2fs_has_inline_data(inode)) + return false; + + if (inode_has_blocks(inode, ifolio)) + return false; + + if (!support_inline_data(inode)) + return true; + + /* + * used by sanity_check_inode(), when disk layout fields has not + * been synchronized to inmem fields. + */ + return (S_ISREG(inode->i_mode) && + (file_is_encrypt(inode) || file_is_verity(inode) || + (F2FS_I(inode)->i_flags & F2FS_COMPR_FL))); +} + +bool f2fs_may_inline_dentry(struct inode *inode) +{ + if (!test_opt(F2FS_I_SB(inode), INLINE_DENTRY)) + return false; + + if (!S_ISDIR(inode->i_mode)) + return false; + + return true; +} + +void f2fs_do_read_inline_data(struct folio *folio, struct folio *ifolio) +{ + struct inode *inode = folio->mapping->host; + + if (folio_test_uptodate(folio)) + return; + + f2fs_bug_on(F2FS_I_SB(inode), folio->index); + + folio_zero_segment(folio, MAX_INLINE_DATA(inode), folio_size(folio)); + + /* Copy the whole inline data block */ + memcpy_to_folio(folio, 0, inline_data_addr(inode, ifolio), + MAX_INLINE_DATA(inode)); + if (!folio_test_uptodate(folio)) + folio_mark_uptodate(folio); +} + +void f2fs_truncate_inline_inode(struct inode *inode, struct folio *ifolio, + u64 from) +{ + void *addr; + + if (from >= MAX_INLINE_DATA(inode)) + return; + + addr = inline_data_addr(inode, ifolio); + + f2fs_folio_wait_writeback(ifolio, NODE, true, true); + memset(addr + from, 0, MAX_INLINE_DATA(inode) - from); + folio_mark_dirty(ifolio); + + if (from == 0) + clear_inode_flag(inode, FI_DATA_EXIST); +} + +int f2fs_read_inline_data(struct inode *inode, struct folio *folio) +{ + struct folio *ifolio; + + ifolio = f2fs_get_inode_folio(F2FS_I_SB(inode), inode->i_ino); + if (IS_ERR(ifolio)) { + folio_unlock(folio); + return PTR_ERR(ifolio); + } + + if (!f2fs_has_inline_data(inode)) { + f2fs_folio_put(ifolio, true); + return -EAGAIN; + } + + if (folio->index) + folio_zero_segment(folio, 0, folio_size(folio)); + else 
+ f2fs_do_read_inline_data(folio, ifolio); + + if (!folio_test_uptodate(folio)) + folio_mark_uptodate(folio); + f2fs_folio_put(ifolio, true); + folio_unlock(folio); + return 0; +} + +int f2fs_convert_inline_folio(struct dnode_of_data *dn, struct folio *folio) +{ + struct f2fs_io_info fio = { + .sbi = F2FS_I_SB(dn->inode), + .ino = dn->inode->i_ino, + .type = DATA, + .op = REQ_OP_WRITE, + .op_flags = REQ_SYNC | REQ_PRIO, + .folio = folio, + .encrypted_page = NULL, + .io_type = FS_DATA_IO, + }; + struct node_info ni; + int dirty, err; + + if (!f2fs_exist_data(dn->inode)) + goto clear_out; + + err = f2fs_reserve_block(dn, 0); + if (err) + return err; + + err = f2fs_get_node_info(fio.sbi, dn->nid, &ni, false); + if (err) { + f2fs_truncate_data_blocks_range(dn, 1); + f2fs_put_dnode(dn); + return err; + } + + fio.version = ni.version; + + if (unlikely(dn->data_blkaddr != NEW_ADDR)) { + f2fs_put_dnode(dn); + set_sbi_flag(fio.sbi, SBI_NEED_FSCK); + f2fs_warn(fio.sbi, "%s: corrupted inline inode ino=%lx, i_addr[0]:0x%x, run fsck to fix.", + __func__, dn->inode->i_ino, dn->data_blkaddr); + f2fs_handle_error(fio.sbi, ERROR_INVALID_BLKADDR); + return -EFSCORRUPTED; + } + + f2fs_bug_on(F2FS_F_SB(folio), folio_test_writeback(folio)); + + f2fs_do_read_inline_data(folio, dn->inode_folio); + folio_mark_dirty(folio); + + /* clear dirty state */ + dirty = folio_clear_dirty_for_io(folio); + + /* write data page to try to make data consistent */ + folio_start_writeback(folio); + fio.old_blkaddr = dn->data_blkaddr; + set_inode_flag(dn->inode, FI_HOT_DATA); + f2fs_outplace_write_data(dn, &fio); + f2fs_folio_wait_writeback(folio, DATA, true, true); + if (dirty) { + inode_dec_dirty_pages(dn->inode); + f2fs_remove_dirty_inode(dn->inode); + } + + /* this converted inline_data should be recovered. 
*/ + set_inode_flag(dn->inode, FI_APPEND_WRITE); + + /* clear inline data and flag after data writeback */ + f2fs_truncate_inline_inode(dn->inode, dn->inode_folio, 0); + folio_clear_f2fs_inline(dn->inode_folio); +clear_out: + stat_dec_inline_inode(dn->inode); + clear_inode_flag(dn->inode, FI_INLINE_DATA); + f2fs_put_dnode(dn); + return 0; +} + +int f2fs_convert_inline_inode(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct dnode_of_data dn; + struct folio *ifolio, *folio; + int err = 0; + + if (f2fs_hw_is_readonly(sbi) || f2fs_readonly(sbi->sb)) + return -EROFS; + + if (!f2fs_has_inline_data(inode)) + return 0; + + err = f2fs_dquot_initialize(inode); + if (err) + return err; + + folio = f2fs_grab_cache_folio(inode->i_mapping, 0, false); + if (IS_ERR(folio)) + return PTR_ERR(folio); + + f2fs_lock_op(sbi); + + ifolio = f2fs_get_inode_folio(sbi, inode->i_ino); + if (IS_ERR(ifolio)) { + err = PTR_ERR(ifolio); + goto out; + } + + set_new_dnode(&dn, inode, ifolio, ifolio, 0); + + if (f2fs_has_inline_data(inode)) + err = f2fs_convert_inline_folio(&dn, folio); + + f2fs_put_dnode(&dn); +out: + f2fs_unlock_op(sbi); + + f2fs_folio_put(folio, true); + + if (!err) + f2fs_balance_fs(sbi, dn.node_changed); + + return err; +} + +int f2fs_write_inline_data(struct inode *inode, struct folio *folio) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct folio *ifolio; + + ifolio = f2fs_get_inode_folio(sbi, inode->i_ino); + if (IS_ERR(ifolio)) + return PTR_ERR(ifolio); + + if (!f2fs_has_inline_data(inode)) { + f2fs_folio_put(ifolio, true); + return -EAGAIN; + } + + f2fs_bug_on(F2FS_I_SB(inode), folio->index); + + f2fs_folio_wait_writeback(ifolio, NODE, true, true); + memcpy_from_folio(inline_data_addr(inode, ifolio), + folio, 0, MAX_INLINE_DATA(inode)); + folio_mark_dirty(ifolio); + + f2fs_clear_page_cache_dirty_tag(folio); + + set_inode_flag(inode, FI_APPEND_WRITE); + set_inode_flag(inode, FI_DATA_EXIST); + + folio_clear_f2fs_inline(ifolio); + 
f2fs_folio_put(ifolio, true); + return 0; +} + +int f2fs_recover_inline_data(struct inode *inode, struct folio *nfolio) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct f2fs_inode *ri = NULL; + void *src_addr, *dst_addr; + + /* + * The inline_data recovery policy is as follows. + * [prev.] [next] of inline_data flag + * o o -> recover inline_data + * o x -> remove inline_data, and then recover data blocks + * x o -> remove data blocks, and then recover inline_data + * x x -> recover data blocks + */ + if (IS_INODE(nfolio)) + ri = F2FS_INODE(nfolio); + + if (f2fs_has_inline_data(inode) && + ri && (ri->i_inline & F2FS_INLINE_DATA)) { + struct folio *ifolio; +process_inline: + ifolio = f2fs_get_inode_folio(sbi, inode->i_ino); + if (IS_ERR(ifolio)) + return PTR_ERR(ifolio); + + f2fs_folio_wait_writeback(ifolio, NODE, true, true); + + src_addr = inline_data_addr(inode, nfolio); + dst_addr = inline_data_addr(inode, ifolio); + memcpy(dst_addr, src_addr, MAX_INLINE_DATA(inode)); + + set_inode_flag(inode, FI_INLINE_DATA); + set_inode_flag(inode, FI_DATA_EXIST); + + folio_mark_dirty(ifolio); + f2fs_folio_put(ifolio, true); + return 1; + } + + if (f2fs_has_inline_data(inode)) { + struct folio *ifolio = f2fs_get_inode_folio(sbi, inode->i_ino); + if (IS_ERR(ifolio)) + return PTR_ERR(ifolio); + f2fs_truncate_inline_inode(inode, ifolio, 0); + stat_dec_inline_inode(inode); + clear_inode_flag(inode, FI_INLINE_DATA); + f2fs_folio_put(ifolio, true); + } else if (ri && (ri->i_inline & F2FS_INLINE_DATA)) { + int ret; + + ret = f2fs_truncate_blocks(inode, 0, false); + if (ret) + return ret; + stat_inc_inline_inode(inode); + goto process_inline; + } + return 0; +} + +struct f2fs_dir_entry *f2fs_find_in_inline_dir(struct inode *dir, + const struct f2fs_filename *fname, + struct folio **res_folio, + bool use_hash) +{ + struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); + struct f2fs_dir_entry *de; + struct f2fs_dentry_ptr d; + struct folio *ifolio; + void *inline_dentry; + + ifolio = 
f2fs_get_inode_folio(sbi, dir->i_ino); + if (IS_ERR(ifolio)) { + *res_folio = ifolio; + return NULL; + } + + inline_dentry = inline_data_addr(dir, ifolio); + + make_dentry_ptr_inline(dir, &d, inline_dentry); + de = f2fs_find_target_dentry(&d, fname, NULL, use_hash); + folio_unlock(ifolio); + if (IS_ERR(de)) { + *res_folio = ERR_CAST(de); + de = NULL; + } + if (de) + *res_folio = ifolio; + else + f2fs_folio_put(ifolio, false); + + return de; +} + +int f2fs_make_empty_inline_dir(struct inode *inode, struct inode *parent, + struct folio *ifolio) +{ + struct f2fs_dentry_ptr d; + void *inline_dentry; + + inline_dentry = inline_data_addr(inode, ifolio); + + make_dentry_ptr_inline(inode, &d, inline_dentry); + f2fs_do_make_empty_dir(inode, parent, &d); + + folio_mark_dirty(ifolio); + + /* update i_size to MAX_INLINE_DATA */ + if (i_size_read(inode) < MAX_INLINE_DATA(inode)) + f2fs_i_size_write(inode, MAX_INLINE_DATA(inode)); + return 0; +} + +/* + * NOTE: ipage is grabbed by caller, but if any error occurs, we should + * release ipage in this function. 
+ */ +static int f2fs_move_inline_dirents(struct inode *dir, struct folio *ifolio, + void *inline_dentry) +{ + struct folio *folio; + struct dnode_of_data dn; + struct f2fs_dentry_block *dentry_blk; + struct f2fs_dentry_ptr src, dst; + int err; + + folio = f2fs_grab_cache_folio(dir->i_mapping, 0, true); + if (IS_ERR(folio)) { + f2fs_folio_put(ifolio, true); + return PTR_ERR(folio); + } + + set_new_dnode(&dn, dir, ifolio, NULL, 0); + err = f2fs_reserve_block(&dn, 0); + if (err) + goto out; + + if (unlikely(dn.data_blkaddr != NEW_ADDR)) { + f2fs_put_dnode(&dn); + set_sbi_flag(F2FS_F_SB(folio), SBI_NEED_FSCK); + f2fs_warn(F2FS_F_SB(folio), "%s: corrupted inline inode ino=%lx, i_addr[0]:0x%x, run fsck to fix.", + __func__, dir->i_ino, dn.data_blkaddr); + f2fs_handle_error(F2FS_F_SB(folio), ERROR_INVALID_BLKADDR); + err = -EFSCORRUPTED; + goto out; + } + + f2fs_folio_wait_writeback(folio, DATA, true, true); + + dentry_blk = folio_address(folio); + + /* + * Start by zeroing the full block, to ensure that all unused space is + * zeroed and no uninitialized memory is leaked to disk. + */ + memset(dentry_blk, 0, F2FS_BLKSIZE); + + make_dentry_ptr_inline(dir, &src, inline_dentry); + make_dentry_ptr_block(dir, &dst, dentry_blk); + + /* copy data from inline dentry block to new dentry block */ + memcpy(dst.bitmap, src.bitmap, src.nr_bitmap); + memcpy(dst.dentry, src.dentry, SIZE_OF_DIR_ENTRY * src.max); + memcpy(dst.filename, src.filename, src.max * F2FS_SLOT_LEN); + + if (!folio_test_uptodate(folio)) + folio_mark_uptodate(folio); + folio_mark_dirty(folio); + + /* clear inline dir and flag after data writeback */ + f2fs_truncate_inline_inode(dir, ifolio, 0); + + stat_dec_inline_dir(dir); + clear_inode_flag(dir, FI_INLINE_DENTRY); + + /* + * should retrieve reserved space which was used to keep + * inline_dentry's structure for backward compatibility. 
+ */ + if (!f2fs_sb_has_flexible_inline_xattr(F2FS_I_SB(dir)) && + !f2fs_has_inline_xattr(dir)) + F2FS_I(dir)->i_inline_xattr_size = 0; + + f2fs_i_depth_write(dir, 1); + if (i_size_read(dir) < PAGE_SIZE) + f2fs_i_size_write(dir, PAGE_SIZE); +out: + f2fs_folio_put(folio, true); + return err; +} + +static int f2fs_add_inline_entries(struct inode *dir, void *inline_dentry) +{ + struct f2fs_dentry_ptr d; + unsigned long bit_pos = 0; + int err = 0; + + make_dentry_ptr_inline(dir, &d, inline_dentry); + + while (bit_pos < d.max) { + struct f2fs_dir_entry *de; + struct f2fs_filename fname; + nid_t ino; + umode_t fake_mode; + + if (!test_bit_le(bit_pos, d.bitmap)) { + bit_pos++; + continue; + } + + de = &d.dentry[bit_pos]; + + if (unlikely(!de->name_len)) { + bit_pos++; + continue; + } + + /* + * We only need the disk_name and hash to move the dentry. + * We don't need the original or casefolded filenames. + */ + memset(&fname, 0, sizeof(fname)); + fname.disk_name.name = d.filename[bit_pos]; + fname.disk_name.len = le16_to_cpu(de->name_len); + fname.hash = de->hash_code; + + ino = le32_to_cpu(de->ino); + fake_mode = fs_ftype_to_dtype(de->file_type) << S_DT_SHIFT; + + err = f2fs_add_regular_entry(dir, &fname, NULL, ino, fake_mode); + if (err) + goto punch_dentry_pages; + + bit_pos += GET_DENTRY_SLOTS(le16_to_cpu(de->name_len)); + } + return 0; +punch_dentry_pages: + truncate_inode_pages(&dir->i_data, 0); + f2fs_truncate_blocks(dir, 0, false); + f2fs_remove_dirty_inode(dir); + return err; +} + +static int f2fs_move_rehashed_dirents(struct inode *dir, struct folio *ifolio, + void *inline_dentry) +{ + void *backup_dentry; + int err; + + backup_dentry = f2fs_kmalloc(F2FS_I_SB(dir), + MAX_INLINE_DATA(dir), GFP_F2FS_ZERO); + if (!backup_dentry) { + f2fs_folio_put(ifolio, true); + return -ENOMEM; + } + + memcpy(backup_dentry, inline_dentry, MAX_INLINE_DATA(dir)); + f2fs_truncate_inline_inode(dir, ifolio, 0); + + folio_unlock(ifolio); + + err = f2fs_add_inline_entries(dir, 
backup_dentry); + if (err) + goto recover; + + folio_lock(ifolio); + + stat_dec_inline_dir(dir); + clear_inode_flag(dir, FI_INLINE_DENTRY); + + /* + * should retrieve reserved space which was used to keep + * inline_dentry's structure for backward compatibility. + */ + if (!f2fs_sb_has_flexible_inline_xattr(F2FS_I_SB(dir)) && + !f2fs_has_inline_xattr(dir)) + F2FS_I(dir)->i_inline_xattr_size = 0; + + kfree(backup_dentry); + return 0; +recover: + folio_lock(ifolio); + f2fs_folio_wait_writeback(ifolio, NODE, true, true); + memcpy(inline_dentry, backup_dentry, MAX_INLINE_DATA(dir)); + f2fs_i_depth_write(dir, 0); + f2fs_i_size_write(dir, MAX_INLINE_DATA(dir)); + folio_mark_dirty(ifolio); + f2fs_folio_put(ifolio, true); + + kfree(backup_dentry); + return err; +} + +static int do_convert_inline_dir(struct inode *dir, struct folio *ifolio, + void *inline_dentry) +{ + if (!F2FS_I(dir)->i_dir_level) + return f2fs_move_inline_dirents(dir, ifolio, inline_dentry); + else + return f2fs_move_rehashed_dirents(dir, ifolio, inline_dentry); +} + +int f2fs_try_convert_inline_dir(struct inode *dir, struct dentry *dentry) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(dir); + struct folio *ifolio; + struct f2fs_filename fname; + void *inline_dentry = NULL; + int err = 0; + + if (!f2fs_has_inline_dentry(dir)) + return 0; + + f2fs_lock_op(sbi); + + err = f2fs_setup_filename(dir, &dentry->d_name, 0, &fname); + if (err) + goto out; + + ifolio = f2fs_get_inode_folio(sbi, dir->i_ino); + if (IS_ERR(ifolio)) { + err = PTR_ERR(ifolio); + goto out_fname; + } + + if (f2fs_has_enough_room(dir, ifolio, &fname)) { + f2fs_folio_put(ifolio, true); + goto out_fname; + } + + inline_dentry = inline_data_addr(dir, ifolio); + + err = do_convert_inline_dir(dir, ifolio, inline_dentry); + if (!err) + f2fs_folio_put(ifolio, true); +out_fname: + f2fs_free_filename(&fname); +out: + f2fs_unlock_op(sbi); + return err; +} + +int f2fs_add_inline_entry(struct inode *dir, const struct f2fs_filename *fname, + struct inode 
*inode, nid_t ino, umode_t mode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(dir); + struct folio *ifolio; + unsigned int bit_pos; + void *inline_dentry = NULL; + struct f2fs_dentry_ptr d; + int slots = GET_DENTRY_SLOTS(fname->disk_name.len); + struct folio *folio = NULL; + int err = 0; + + ifolio = f2fs_get_inode_folio(sbi, dir->i_ino); + if (IS_ERR(ifolio)) + return PTR_ERR(ifolio); + + inline_dentry = inline_data_addr(dir, ifolio); + make_dentry_ptr_inline(dir, &d, inline_dentry); + + bit_pos = f2fs_room_for_filename(d.bitmap, slots, d.max); + if (bit_pos >= d.max) { + err = do_convert_inline_dir(dir, ifolio, inline_dentry); + if (err) + return err; + err = -EAGAIN; + goto out; + } + + if (inode) { + f2fs_down_write_nested(&F2FS_I(inode)->i_sem, + SINGLE_DEPTH_NESTING); + folio = f2fs_init_inode_metadata(inode, dir, fname, ifolio); + if (IS_ERR(folio)) { + err = PTR_ERR(folio); + goto fail; + } + } + + f2fs_folio_wait_writeback(ifolio, NODE, true, true); + + f2fs_update_dentry(ino, mode, &d, &fname->disk_name, fname->hash, + bit_pos); + + folio_mark_dirty(ifolio); + + /* we don't need to mark_inode_dirty now */ + if (inode) { + f2fs_i_pino_write(inode, dir->i_ino); + + /* synchronize inode page's data from inode cache */ + if (is_inode_flag_set(inode, FI_NEW_INODE)) + f2fs_update_inode(inode, folio); + + f2fs_folio_put(folio, true); + } + + f2fs_update_parent_metadata(dir, inode, 0); +fail: + if (inode) + f2fs_up_write(&F2FS_I(inode)->i_sem); +out: + f2fs_folio_put(ifolio, true); + return err; +} + +void f2fs_delete_inline_entry(struct f2fs_dir_entry *dentry, + struct folio *folio, struct inode *dir, struct inode *inode) +{ + struct f2fs_dentry_ptr d; + void *inline_dentry; + int slots = GET_DENTRY_SLOTS(le16_to_cpu(dentry->name_len)); + unsigned int bit_pos; + int i; + + folio_lock(folio); + f2fs_folio_wait_writeback(folio, NODE, true, true); + + inline_dentry = inline_data_addr(dir, folio); + make_dentry_ptr_inline(dir, &d, inline_dentry); + + bit_pos = dentry - 
d.dentry; + for (i = 0; i < slots; i++) + __clear_bit_le(bit_pos + i, d.bitmap); + + folio_mark_dirty(folio); + f2fs_folio_put(folio, true); + + inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); + f2fs_mark_inode_dirty_sync(dir, false); + + if (inode) + f2fs_drop_nlink(dir, inode); +} + +bool f2fs_empty_inline_dir(struct inode *dir) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(dir); + struct folio *ifolio; + unsigned int bit_pos = 2; + void *inline_dentry; + struct f2fs_dentry_ptr d; + + ifolio = f2fs_get_inode_folio(sbi, dir->i_ino); + if (IS_ERR(ifolio)) + return false; + + inline_dentry = inline_data_addr(dir, ifolio); + make_dentry_ptr_inline(dir, &d, inline_dentry); + + bit_pos = find_next_bit_le(d.bitmap, d.max, bit_pos); + + f2fs_folio_put(ifolio, true); + + if (bit_pos < d.max) + return false; + + return true; +} + +int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx, + struct fscrypt_str *fstr) +{ + struct inode *inode = file_inode(file); + struct folio *ifolio = NULL; + struct f2fs_dentry_ptr d; + void *inline_dentry = NULL; + int err; + + make_dentry_ptr_inline(inode, &d, inline_dentry); + + if (ctx->pos == d.max) + return 0; + + ifolio = f2fs_get_inode_folio(F2FS_I_SB(inode), inode->i_ino); + if (IS_ERR(ifolio)) + return PTR_ERR(ifolio); + + /* + * f2fs_readdir was protected by inode.i_rwsem, it is safe to access + * ipage without page's lock held. + */ + folio_unlock(ifolio); + + inline_dentry = inline_data_addr(inode, ifolio); + + make_dentry_ptr_inline(inode, &d, inline_dentry); + + err = f2fs_fill_dentries(ctx, &d, 0, fstr); + if (!err) + ctx->pos = d.max; + + f2fs_folio_put(ifolio, false); + return err < 0 ? 
err : 0; +} + +int f2fs_inline_data_fiemap(struct inode *inode, + struct fiemap_extent_info *fieinfo, __u64 start, __u64 len) +{ + __u64 byteaddr, ilen; + __u32 flags = FIEMAP_EXTENT_DATA_INLINE | FIEMAP_EXTENT_NOT_ALIGNED | + FIEMAP_EXTENT_LAST; + struct node_info ni; + struct folio *ifolio; + int err = 0; + + ifolio = f2fs_get_inode_folio(F2FS_I_SB(inode), inode->i_ino); + if (IS_ERR(ifolio)) + return PTR_ERR(ifolio); + + if ((S_ISREG(inode->i_mode) || S_ISLNK(inode->i_mode)) && + !f2fs_has_inline_data(inode)) { + err = -EAGAIN; + goto out; + } + + if (S_ISDIR(inode->i_mode) && !f2fs_has_inline_dentry(inode)) { + err = -EAGAIN; + goto out; + } + + ilen = min_t(size_t, MAX_INLINE_DATA(inode), i_size_read(inode)); + if (start >= ilen) + goto out; + if (start + len < ilen) + ilen = start + len; + ilen -= start; + + err = f2fs_get_node_info(F2FS_I_SB(inode), inode->i_ino, &ni, false); + if (err) + goto out; + + byteaddr = (__u64)ni.blk_addr << inode->i_sb->s_blocksize_bits; + byteaddr += (char *)inline_data_addr(inode, ifolio) - + (char *)F2FS_INODE(ifolio); + err = fiemap_fill_next_extent(fieinfo, start, byteaddr, ilen, flags); + trace_f2fs_fiemap(inode, start, byteaddr, ilen, flags, err); +out: + f2fs_folio_put(ifolio, true); + return err; +} diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 2b2d45d19e3e..38b8994bc1b2 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -1,94 +1,570 @@ +// SPDX-License-Identifier: GPL-2.0 /* * fs/f2fs/inode.c * * Copyright (c) 2012 Samsung Electronics Co., Ltd. * http://www.samsung.com/ - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. 
*/ #include <linux/fs.h> #include <linux/f2fs_fs.h> -#include <linux/buffer_head.h> #include <linux/writeback.h> +#include <linux/sched/mm.h> +#include <linux/lz4.h> +#include <linux/zstd.h> #include "f2fs.h" #include "node.h" +#include "segment.h" +#include "xattr.h" #include <trace/events/f2fs.h> +#ifdef CONFIG_F2FS_FS_COMPRESSION +extern const struct address_space_operations f2fs_compress_aops; +#endif + +void f2fs_mark_inode_dirty_sync(struct inode *inode, bool sync) +{ + if (is_inode_flag_set(inode, FI_NEW_INODE)) + return; + + if (f2fs_readonly(F2FS_I_SB(inode)->sb)) + return; + + if (f2fs_inode_dirtied(inode, sync)) + return; + + /* only atomic file w/ FI_ATOMIC_COMMITTED can be set vfs dirty */ + if (f2fs_is_atomic_file(inode) && + !is_inode_flag_set(inode, FI_ATOMIC_COMMITTED)) + return; + + mark_inode_dirty_sync(inode); +} + void f2fs_set_inode_flags(struct inode *inode) { unsigned int flags = F2FS_I(inode)->i_flags; + unsigned int new_fl = 0; + + if (flags & F2FS_SYNC_FL) + new_fl |= S_SYNC; + if (flags & F2FS_APPEND_FL) + new_fl |= S_APPEND; + if (flags & F2FS_IMMUTABLE_FL) + new_fl |= S_IMMUTABLE; + if (flags & F2FS_NOATIME_FL) + new_fl |= S_NOATIME; + if (flags & F2FS_DIRSYNC_FL) + new_fl |= S_DIRSYNC; + if (file_is_encrypt(inode)) + new_fl |= S_ENCRYPTED; + if (file_is_verity(inode)) + new_fl |= S_VERITY; + if (flags & F2FS_CASEFOLD_FL) + new_fl |= S_CASEFOLD; + inode_set_flags(inode, new_fl, + S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC| + S_ENCRYPTED|S_VERITY|S_CASEFOLD); +} + +static void __get_inode_rdev(struct inode *inode, struct folio *node_folio) +{ + __le32 *addr = get_dnode_addr(inode, node_folio); + + if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) || + S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) { + if (addr[0]) + inode->i_rdev = old_decode_dev(le32_to_cpu(addr[0])); + else + inode->i_rdev = new_decode_dev(le32_to_cpu(addr[1])); + } +} + +static void __set_inode_rdev(struct inode *inode, struct folio *node_folio) +{ + 
__le32 *addr = get_dnode_addr(inode, node_folio); + + if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { + if (old_valid_dev(inode->i_rdev)) { + addr[0] = cpu_to_le32(old_encode_dev(inode->i_rdev)); + addr[1] = 0; + } else { + addr[0] = 0; + addr[1] = cpu_to_le32(new_encode_dev(inode->i_rdev)); + addr[2] = 0; + } + } +} + +static void __recover_inline_status(struct inode *inode, struct folio *ifolio) +{ + void *inline_data = inline_data_addr(inode, ifolio); + __le32 *start = inline_data; + __le32 *end = start + MAX_INLINE_DATA(inode) / sizeof(__le32); + + while (start < end) { + if (*start++) { + f2fs_folio_wait_writeback(ifolio, NODE, true, true); + + set_inode_flag(inode, FI_DATA_EXIST); + set_raw_inline(inode, F2FS_INODE(ifolio)); + folio_mark_dirty(ifolio); + return; + } + } + return; +} + +static +bool f2fs_enable_inode_chksum(struct f2fs_sb_info *sbi, struct folio *folio) +{ + struct f2fs_inode *ri = &F2FS_NODE(folio)->i; + + if (!f2fs_sb_has_inode_chksum(sbi)) + return false; + + if (!IS_INODE(folio) || !(ri->i_inline & F2FS_EXTRA_ATTR)) + return false; + + if (!F2FS_FITS_IN_INODE(ri, le16_to_cpu(ri->i_extra_isize), + i_inode_checksum)) + return false; + + return true; +} + +static __u32 f2fs_inode_chksum(struct f2fs_sb_info *sbi, struct folio *folio) +{ + struct f2fs_node *node = F2FS_NODE(folio); + struct f2fs_inode *ri = &node->i; + __le32 ino = node->footer.ino; + __le32 gen = ri->i_generation; + __u32 chksum, chksum_seed; + __u32 dummy_cs = 0; + unsigned int offset = offsetof(struct f2fs_inode, i_inode_checksum); + unsigned int cs_size = sizeof(dummy_cs); + + chksum = f2fs_chksum(sbi->s_chksum_seed, (__u8 *)&ino, sizeof(ino)); + chksum_seed = f2fs_chksum(chksum, (__u8 *)&gen, sizeof(gen)); + + chksum = f2fs_chksum(chksum_seed, (__u8 *)ri, offset); + chksum = f2fs_chksum(chksum, (__u8 *)&dummy_cs, cs_size); + offset += cs_size; + chksum = f2fs_chksum(chksum, (__u8 *)ri + offset, + F2FS_BLKSIZE - offset); + return chksum; +} + +bool 
f2fs_inode_chksum_verify(struct f2fs_sb_info *sbi, struct folio *folio) +{ + struct f2fs_inode *ri; + __u32 provided, calculated; + + if (unlikely(is_sbi_flag_set(sbi, SBI_IS_SHUTDOWN))) + return true; + +#ifdef CONFIG_F2FS_CHECK_FS + if (!f2fs_enable_inode_chksum(sbi, folio)) +#else + if (!f2fs_enable_inode_chksum(sbi, folio) || + folio_test_dirty(folio) || + folio_test_writeback(folio)) +#endif + return true; + + ri = &F2FS_NODE(folio)->i; + provided = le32_to_cpu(ri->i_inode_checksum); + calculated = f2fs_inode_chksum(sbi, folio); + + if (provided != calculated) + f2fs_warn(sbi, "checksum invalid, nid = %lu, ino_of_node = %x, %x vs. %x", + folio->index, ino_of_node(folio), + provided, calculated); + + return provided == calculated; +} + +void f2fs_inode_chksum_set(struct f2fs_sb_info *sbi, struct folio *folio) +{ + struct f2fs_inode *ri = &F2FS_NODE(folio)->i; + + if (!f2fs_enable_inode_chksum(sbi, folio)) + return; + + ri->i_inode_checksum = cpu_to_le32(f2fs_inode_chksum(sbi, folio)); +} + +static bool sanity_check_compress_inode(struct inode *inode, + struct f2fs_inode *ri) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + unsigned char clevel; + + if (ri->i_compress_algorithm >= COMPRESS_MAX) { + f2fs_warn(sbi, + "%s: inode (ino=%lx) has unsupported compress algorithm: %u, run fsck to fix", + __func__, inode->i_ino, ri->i_compress_algorithm); + return false; + } + if (le64_to_cpu(ri->i_compr_blocks) > + SECTOR_TO_BLOCK(inode->i_blocks)) { + f2fs_warn(sbi, + "%s: inode (ino=%lx) has inconsistent i_compr_blocks:%llu, i_blocks:%llu, run fsck to fix", + __func__, inode->i_ino, le64_to_cpu(ri->i_compr_blocks), + SECTOR_TO_BLOCK(inode->i_blocks)); + return false; + } + if (ri->i_log_cluster_size < MIN_COMPRESS_LOG_SIZE || + ri->i_log_cluster_size > MAX_COMPRESS_LOG_SIZE) { + f2fs_warn(sbi, + "%s: inode (ino=%lx) has unsupported log cluster size: %u, run fsck to fix", + __func__, inode->i_ino, ri->i_log_cluster_size); + return false; + } + + clevel = 
le16_to_cpu(ri->i_compress_flag) >> + COMPRESS_LEVEL_OFFSET; + switch (ri->i_compress_algorithm) { + case COMPRESS_LZO: +#ifdef CONFIG_F2FS_FS_LZO + if (clevel) + goto err_level; +#endif + break; + case COMPRESS_LZORLE: +#ifdef CONFIG_F2FS_FS_LZORLE + if (clevel) + goto err_level; +#endif + break; + case COMPRESS_LZ4: +#ifdef CONFIG_F2FS_FS_LZ4 +#ifdef CONFIG_F2FS_FS_LZ4HC + if (clevel && + (clevel < LZ4HC_MIN_CLEVEL || clevel > LZ4HC_MAX_CLEVEL)) + goto err_level; +#else + if (clevel) + goto err_level; +#endif +#endif + break; + case COMPRESS_ZSTD: +#ifdef CONFIG_F2FS_FS_ZSTD + if (clevel < zstd_min_clevel() || clevel > zstd_max_clevel()) + goto err_level; +#endif + break; + default: + goto err_level; + } + + return true; +err_level: + f2fs_warn(sbi, "%s: inode (ino=%lx) has unsupported compress level: %u, run fsck to fix", + __func__, inode->i_ino, clevel); + return false; +} + +static bool sanity_check_inode(struct inode *inode, struct folio *node_folio) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct f2fs_inode_info *fi = F2FS_I(inode); + struct f2fs_inode *ri = F2FS_INODE(node_folio); + unsigned long long iblocks; + + iblocks = le64_to_cpu(F2FS_INODE(node_folio)->i_blocks); + if (!iblocks) { + f2fs_warn(sbi, "%s: corrupted inode i_blocks i_ino=%lx iblocks=%llu, run fsck to fix.", + __func__, inode->i_ino, iblocks); + return false; + } + + if (ino_of_node(node_folio) != nid_of_node(node_folio)) { + f2fs_warn(sbi, "%s: corrupted inode footer i_ino=%lx, ino,nid: [%u, %u] run fsck to fix.", + __func__, inode->i_ino, + ino_of_node(node_folio), nid_of_node(node_folio)); + return false; + } + + if (ino_of_node(node_folio) == fi->i_xattr_nid) { + f2fs_warn(sbi, "%s: corrupted inode i_ino=%lx, xnid=%x, run fsck to fix.", + __func__, inode->i_ino, fi->i_xattr_nid); + return false; + } + + if (S_ISDIR(inode->i_mode) && unlikely(inode->i_nlink == 1)) { + f2fs_warn(sbi, "%s: directory inode (ino=%lx) has a single i_nlink", + __func__, inode->i_ino); + return 
false; + } + + if (f2fs_has_extra_attr(inode)) { + if (!f2fs_sb_has_extra_attr(sbi)) { + f2fs_warn(sbi, "%s: inode (ino=%lx) is with extra_attr, but extra_attr feature is off", + __func__, inode->i_ino); + return false; + } + if (fi->i_extra_isize > F2FS_TOTAL_EXTRA_ATTR_SIZE || + fi->i_extra_isize < F2FS_MIN_EXTRA_ATTR_SIZE || + fi->i_extra_isize % sizeof(__le32)) { + f2fs_warn(sbi, "%s: inode (ino=%lx) has corrupted i_extra_isize: %d, max: %zu", + __func__, inode->i_ino, fi->i_extra_isize, + F2FS_TOTAL_EXTRA_ATTR_SIZE); + return false; + } + if (f2fs_sb_has_compression(sbi) && + fi->i_flags & F2FS_COMPR_FL && + F2FS_FITS_IN_INODE(ri, fi->i_extra_isize, + i_compress_flag)) { + if (!sanity_check_compress_inode(inode, ri)) + return false; + } + } + + if (f2fs_sb_has_flexible_inline_xattr(sbi) && + f2fs_has_inline_xattr(inode) && + (fi->i_inline_xattr_size < MIN_INLINE_XATTR_SIZE || + fi->i_inline_xattr_size > MAX_INLINE_XATTR_SIZE)) { + f2fs_warn(sbi, "%s: inode (ino=%lx) has corrupted i_inline_xattr_size: %d, min: %zu, max: %lu", + __func__, inode->i_ino, fi->i_inline_xattr_size, + MIN_INLINE_XATTR_SIZE, MAX_INLINE_XATTR_SIZE); + return false; + } + + if (!f2fs_sb_has_extra_attr(sbi)) { + if (f2fs_sb_has_project_quota(sbi)) { + f2fs_warn(sbi, "%s: corrupted inode ino=%lx, wrong feature flag: %u, run fsck to fix.", + __func__, inode->i_ino, F2FS_FEATURE_PRJQUOTA); + return false; + } + if (f2fs_sb_has_inode_chksum(sbi)) { + f2fs_warn(sbi, "%s: corrupted inode ino=%lx, wrong feature flag: %u, run fsck to fix.", + __func__, inode->i_ino, F2FS_FEATURE_INODE_CHKSUM); + return false; + } + if (f2fs_sb_has_flexible_inline_xattr(sbi)) { + f2fs_warn(sbi, "%s: corrupted inode ino=%lx, wrong feature flag: %u, run fsck to fix.", + __func__, inode->i_ino, F2FS_FEATURE_FLEXIBLE_INLINE_XATTR); + return false; + } + if (f2fs_sb_has_inode_crtime(sbi)) { + f2fs_warn(sbi, "%s: corrupted inode ino=%lx, wrong feature flag: %u, run fsck to fix.", + __func__, inode->i_ino, 
F2FS_FEATURE_INODE_CRTIME); + return false; + } + if (f2fs_sb_has_compression(sbi)) { + f2fs_warn(sbi, "%s: corrupted inode ino=%lx, wrong feature flag: %u, run fsck to fix.", + __func__, inode->i_ino, F2FS_FEATURE_COMPRESSION); + return false; + } + } + + if (f2fs_sanity_check_inline_data(inode, node_folio)) { + f2fs_warn(sbi, "%s: inode (ino=%lx, mode=%u) should not have inline_data, run fsck to fix", + __func__, inode->i_ino, inode->i_mode); + return false; + } + + if (f2fs_has_inline_dentry(inode) && !S_ISDIR(inode->i_mode)) { + f2fs_warn(sbi, "%s: inode (ino=%lx, mode=%u) should not have inline_dentry, run fsck to fix", + __func__, inode->i_ino, inode->i_mode); + return false; + } + + if ((fi->i_flags & F2FS_CASEFOLD_FL) && !f2fs_sb_has_casefold(sbi)) { + f2fs_warn(sbi, "%s: inode (ino=%lx) has casefold flag, but casefold feature is off", + __func__, inode->i_ino); + return false; + } + + if (fi->i_xattr_nid && f2fs_check_nid_range(sbi, fi->i_xattr_nid)) { + f2fs_warn(sbi, "%s: inode (ino=%lx) has corrupted i_xattr_nid: %u, run fsck to fix.", + __func__, inode->i_ino, fi->i_xattr_nid); + return false; + } + + if (IS_DEVICE_ALIASING(inode)) { + if (!f2fs_sb_has_device_alias(sbi)) { + f2fs_warn(sbi, "%s: inode (ino=%lx) has device alias flag, but the feature is off", + __func__, inode->i_ino); + return false; + } + if (!f2fs_is_pinned_file(inode)) { + f2fs_warn(sbi, "%s: inode (ino=%lx) has device alias flag, but is not pinned", + __func__, inode->i_ino); + return false; + } + } - inode->i_flags &= ~(S_SYNC | S_APPEND | S_IMMUTABLE | - S_NOATIME | S_DIRSYNC); - - if (flags & FS_SYNC_FL) - inode->i_flags |= S_SYNC; - if (flags & FS_APPEND_FL) - inode->i_flags |= S_APPEND; - if (flags & FS_IMMUTABLE_FL) - inode->i_flags |= S_IMMUTABLE; - if (flags & FS_NOATIME_FL) - inode->i_flags |= S_NOATIME; - if (flags & FS_DIRSYNC_FL) - inode->i_flags |= S_DIRSYNC; + return true; +} + +static void init_idisk_time(struct inode *inode) +{ + struct f2fs_inode_info *fi = 
F2FS_I(inode); + + fi->i_disk_time[0] = inode_get_atime(inode); + fi->i_disk_time[1] = inode_get_ctime(inode); + fi->i_disk_time[2] = inode_get_mtime(inode); } static int do_read_inode(struct inode *inode) { - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_inode_info *fi = F2FS_I(inode); - struct page *node_page; - struct f2fs_node *rn; + struct folio *node_folio; struct f2fs_inode *ri; + projid_t i_projid; /* Check if ino is within scope */ - if (check_nid_range(sbi, inode->i_ino)) { - f2fs_msg(inode->i_sb, KERN_ERR, "bad inode number: %lu", - (unsigned long) inode->i_ino); + if (f2fs_check_nid_range(sbi, inode->i_ino)) return -EINVAL; - } - node_page = get_node_page(sbi, inode->i_ino); - if (IS_ERR(node_page)) - return PTR_ERR(node_page); + node_folio = f2fs_get_inode_folio(sbi, inode->i_ino); + if (IS_ERR(node_folio)) + return PTR_ERR(node_folio); - rn = page_address(node_page); - ri = &(rn->i); + ri = F2FS_INODE(node_folio); inode->i_mode = le16_to_cpu(ri->i_mode); i_uid_write(inode, le32_to_cpu(ri->i_uid)); i_gid_write(inode, le32_to_cpu(ri->i_gid)); set_nlink(inode, le32_to_cpu(ri->i_links)); inode->i_size = le64_to_cpu(ri->i_size); - inode->i_blocks = le64_to_cpu(ri->i_blocks); - - inode->i_atime.tv_sec = le64_to_cpu(ri->i_atime); - inode->i_ctime.tv_sec = le64_to_cpu(ri->i_ctime); - inode->i_mtime.tv_sec = le64_to_cpu(ri->i_mtime); - inode->i_atime.tv_nsec = le32_to_cpu(ri->i_atime_nsec); - inode->i_ctime.tv_nsec = le32_to_cpu(ri->i_ctime_nsec); - inode->i_mtime.tv_nsec = le32_to_cpu(ri->i_mtime_nsec); - inode->i_generation = le32_to_cpu(ri->i_generation); - if (ri->i_addr[0]) - inode->i_rdev = old_decode_dev(le32_to_cpu(ri->i_addr[0])); - else - inode->i_rdev = new_decode_dev(le32_to_cpu(ri->i_addr[1])); + inode->i_blocks = SECTOR_FROM_BLOCK(le64_to_cpu(ri->i_blocks) - 1); - fi->i_current_depth = le32_to_cpu(ri->i_current_depth); + inode_set_atime(inode, le64_to_cpu(ri->i_atime), + 
le32_to_cpu(ri->i_atime_nsec)); + inode_set_ctime(inode, le64_to_cpu(ri->i_ctime), + le32_to_cpu(ri->i_ctime_nsec)); + inode_set_mtime(inode, le64_to_cpu(ri->i_mtime), + le32_to_cpu(ri->i_mtime_nsec)); + inode->i_generation = le32_to_cpu(ri->i_generation); + if (S_ISDIR(inode->i_mode)) + fi->i_current_depth = le32_to_cpu(ri->i_current_depth); + else if (S_ISREG(inode->i_mode)) + fi->i_gc_failures = le16_to_cpu(ri->i_gc_failures); fi->i_xattr_nid = le32_to_cpu(ri->i_xattr_nid); fi->i_flags = le32_to_cpu(ri->i_flags); - fi->flags = 0; + if (S_ISREG(inode->i_mode)) + fi->i_flags &= ~F2FS_PROJINHERIT_FL; + bitmap_zero(fi->flags, FI_MAX); fi->i_advise = ri->i_advise; fi->i_pino = le32_to_cpu(ri->i_pino); - get_extent_info(&fi->ext, ri->i_ext); - f2fs_put_page(node_page, 1); + fi->i_dir_level = ri->i_dir_level; + + get_inline_info(inode, ri); + + fi->i_extra_isize = f2fs_has_extra_attr(inode) ? + le16_to_cpu(ri->i_extra_isize) : 0; + + if (f2fs_sb_has_flexible_inline_xattr(sbi)) { + fi->i_inline_xattr_size = le16_to_cpu(ri->i_inline_xattr_size); + } else if (f2fs_has_inline_xattr(inode) || + f2fs_has_inline_dentry(inode)) { + fi->i_inline_xattr_size = DEFAULT_INLINE_XATTR_ADDRS; + } else { + + /* + * Previous inline data or directory always reserved 200 bytes + * in inode layout, even if inline_xattr is disabled. In order + * to keep inline_dentry's structure for backward compatibility, + * we get the space back only from inline_data. 
+ */ + fi->i_inline_xattr_size = 0; + } + + if (!sanity_check_inode(inode, node_folio)) { + f2fs_folio_put(node_folio, true); + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_handle_error(sbi, ERROR_CORRUPTED_INODE); + return -EFSCORRUPTED; + } + + /* check data exist */ + if (f2fs_has_inline_data(inode) && !f2fs_exist_data(inode)) + __recover_inline_status(inode, node_folio); + + /* try to recover cold bit for non-dir inode */ + if (!S_ISDIR(inode->i_mode) && !is_cold_node(node_folio)) { + f2fs_folio_wait_writeback(node_folio, NODE, true, true); + set_cold_node(node_folio, false); + folio_mark_dirty(node_folio); + } + + /* get rdev by using inline_info */ + __get_inode_rdev(inode, node_folio); + + if (!f2fs_need_inode_block_update(sbi, inode->i_ino)) + fi->last_disk_size = inode->i_size; + + if (fi->i_flags & F2FS_PROJINHERIT_FL) + set_inode_flag(inode, FI_PROJ_INHERIT); + + if (f2fs_has_extra_attr(inode) && f2fs_sb_has_project_quota(sbi) && + F2FS_FITS_IN_INODE(ri, fi->i_extra_isize, i_projid)) + i_projid = (projid_t)le32_to_cpu(ri->i_projid); + else + i_projid = F2FS_DEF_PROJID; + fi->i_projid = make_kprojid(&init_user_ns, i_projid); + + if (f2fs_has_extra_attr(inode) && f2fs_sb_has_inode_crtime(sbi) && + F2FS_FITS_IN_INODE(ri, fi->i_extra_isize, i_crtime)) { + fi->i_crtime.tv_sec = le64_to_cpu(ri->i_crtime); + fi->i_crtime.tv_nsec = le32_to_cpu(ri->i_crtime_nsec); + } + + if (f2fs_has_extra_attr(inode) && f2fs_sb_has_compression(sbi) && + (fi->i_flags & F2FS_COMPR_FL)) { + if (F2FS_FITS_IN_INODE(ri, fi->i_extra_isize, + i_compress_flag)) { + unsigned short compress_flag; + + atomic_set(&fi->i_compr_blocks, + le64_to_cpu(ri->i_compr_blocks)); + fi->i_compress_algorithm = ri->i_compress_algorithm; + fi->i_log_cluster_size = ri->i_log_cluster_size; + compress_flag = le16_to_cpu(ri->i_compress_flag); + fi->i_compress_level = compress_flag >> + COMPRESS_LEVEL_OFFSET; + fi->i_compress_flag = compress_flag & + GENMASK(COMPRESS_LEVEL_OFFSET - 1, 0); + fi->i_cluster_size = 
BIT(fi->i_log_cluster_size); + set_inode_flag(inode, FI_COMPRESSED_FILE); + } + } + + init_idisk_time(inode); + + if (!sanity_check_extent_cache(inode, node_folio)) { + f2fs_folio_put(node_folio, true); + f2fs_handle_error(sbi, ERROR_CORRUPTED_INODE); + return -EFSCORRUPTED; + } + + /* Need all the flag bits */ + f2fs_init_read_extent_tree(inode, node_folio); + f2fs_init_age_extent_tree(inode); + + f2fs_folio_put(node_folio, true); + + stat_inc_inline_xattr(inode); + stat_inc_inline_inode(inode); + stat_inc_inline_dir(inode); + stat_inc_compr_inode(inode); + stat_add_compr_blocks(inode, atomic_read(&fi->i_compr_blocks)); + return 0; } +static bool is_meta_ino(struct f2fs_sb_info *sbi, unsigned int ino) +{ + return ino == F2FS_NODE_INO(sbi) || ino == F2FS_META_INO(sbi) || + ino == F2FS_COMPRESS_INO(sbi); +} + struct inode *f2fs_iget(struct super_block *sb, unsigned long ino) { struct f2fs_sb_info *sbi = F2FS_SB(sb); @@ -99,11 +575,22 @@ struct inode *f2fs_iget(struct super_block *sb, unsigned long ino) if (!inode) return ERR_PTR(-ENOMEM); - if (!(inode->i_state & I_NEW)) { + if (!(inode_state_read_once(inode) & I_NEW)) { + if (is_meta_ino(sbi, ino)) { + f2fs_err(sbi, "inaccessible inode: %lu, run fsck to repair", ino); + set_sbi_flag(sbi, SBI_NEED_FSCK); + ret = -EFSCORRUPTED; + trace_f2fs_iget_exit(inode, ret); + iput(inode); + f2fs_handle_error(sbi, ERROR_CORRUPTED_INODE); + return ERR_PTR(ret); + } + trace_f2fs_iget(inode); return inode; } - if (ino == F2FS_NODE_INO(sbi) || ino == F2FS_META_INO(sbi)) + + if (is_meta_ino(sbi, ino)) goto make_now; ret = do_read_inode(inode); @@ -112,10 +599,21 @@ struct inode *f2fs_iget(struct super_block *sb, unsigned long ino) make_now: if (ino == F2FS_NODE_INO(sbi)) { inode->i_mapping->a_ops = &f2fs_node_aops; - mapping_set_gfp_mask(inode->i_mapping, GFP_F2FS_ZERO); + mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS); } else if (ino == F2FS_META_INO(sbi)) { inode->i_mapping->a_ops = &f2fs_meta_aops; - 
mapping_set_gfp_mask(inode->i_mapping, GFP_F2FS_ZERO); + mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS); + } else if (ino == F2FS_COMPRESS_INO(sbi)) { +#ifdef CONFIG_F2FS_FS_COMPRESSION + inode->i_mapping->a_ops = &f2fs_compress_aops; + /* + * generic_error_remove_folio only truncates pages of regular + * inode + */ + inode->i_mode |= S_IFREG; +#endif + mapping_set_gfp_mask(inode->i_mapping, + GFP_NOFS | __GFP_HIGHMEM | __GFP_MOVABLE); } else if (S_ISREG(inode->i_mode)) { inode->i_op = &f2fs_file_inode_operations; inode->i_fop = &f2fs_file_operations; @@ -124,9 +622,13 @@ make_now: inode->i_op = &f2fs_dir_inode_operations; inode->i_fop = &f2fs_dir_operations; inode->i_mapping->a_ops = &f2fs_dblock_aops; - mapping_set_gfp_mask(inode->i_mapping, GFP_F2FS_ZERO); + mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS); } else if (S_ISLNK(inode->i_mode)) { - inode->i_op = &f2fs_symlink_inode_operations; + if (file_is_encrypt(inode)) + inode->i_op = &f2fs_encrypted_symlink_inode_operations; + else + inode->i_op = &f2fs_symlink_inode_operations; + inode_nohighmem(inode); inode->i_mapping->a_ops = &f2fs_dblock_aops; } else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) || S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) { @@ -136,102 +638,206 @@ make_now: ret = -EIO; goto bad_inode; } + f2fs_set_inode_flags(inode); + unlock_new_inode(inode); trace_f2fs_iget(inode); return inode; bad_inode: + f2fs_inode_synced(inode); iget_failed(inode); trace_f2fs_iget_exit(inode, ret); return ERR_PTR(ret); } -void update_inode(struct inode *inode, struct page *node_page) +struct inode *f2fs_iget_retry(struct super_block *sb, unsigned long ino) +{ + struct inode *inode; +retry: + inode = f2fs_iget(sb, ino); + if (IS_ERR(inode)) { + if (PTR_ERR(inode) == -ENOMEM) { + memalloc_retry_wait(GFP_NOFS); + goto retry; + } + } + return inode; +} + +void f2fs_update_inode(struct inode *inode, struct folio *node_folio) { - struct f2fs_node *rn; + struct f2fs_inode_info *fi = F2FS_I(inode); 
struct f2fs_inode *ri; + struct extent_tree *et = fi->extent_tree[EX_READ]; - wait_on_page_writeback(node_page); + f2fs_folio_wait_writeback(node_folio, NODE, true, true); + folio_mark_dirty(node_folio); - rn = page_address(node_page); - ri = &(rn->i); + f2fs_inode_synced(inode); + + ri = F2FS_INODE(node_folio); ri->i_mode = cpu_to_le16(inode->i_mode); - ri->i_advise = F2FS_I(inode)->i_advise; + ri->i_advise = fi->i_advise; ri->i_uid = cpu_to_le32(i_uid_read(inode)); ri->i_gid = cpu_to_le32(i_gid_read(inode)); ri->i_links = cpu_to_le32(inode->i_nlink); - ri->i_size = cpu_to_le64(i_size_read(inode)); - ri->i_blocks = cpu_to_le64(inode->i_blocks); - set_raw_extent(&F2FS_I(inode)->ext, &ri->i_ext); - - ri->i_atime = cpu_to_le64(inode->i_atime.tv_sec); - ri->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); - ri->i_mtime = cpu_to_le64(inode->i_mtime.tv_sec); - ri->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec); - ri->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); - ri->i_mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); - ri->i_current_depth = cpu_to_le32(F2FS_I(inode)->i_current_depth); - ri->i_xattr_nid = cpu_to_le32(F2FS_I(inode)->i_xattr_nid); - ri->i_flags = cpu_to_le32(F2FS_I(inode)->i_flags); - ri->i_pino = cpu_to_le32(F2FS_I(inode)->i_pino); + ri->i_blocks = cpu_to_le64(SECTOR_TO_BLOCK(inode->i_blocks) + 1); + + if (!f2fs_is_atomic_file(inode) || + is_inode_flag_set(inode, FI_ATOMIC_COMMITTED)) + ri->i_size = cpu_to_le64(i_size_read(inode)); + + if (et) { + read_lock(&et->lock); + set_raw_read_extent(&et->largest, &ri->i_ext); + read_unlock(&et->lock); + } else { + memset(&ri->i_ext, 0, sizeof(ri->i_ext)); + } + set_raw_inline(inode, ri); + + ri->i_atime = cpu_to_le64(inode_get_atime_sec(inode)); + ri->i_ctime = cpu_to_le64(inode_get_ctime_sec(inode)); + ri->i_mtime = cpu_to_le64(inode_get_mtime_sec(inode)); + ri->i_atime_nsec = cpu_to_le32(inode_get_atime_nsec(inode)); + ri->i_ctime_nsec = cpu_to_le32(inode_get_ctime_nsec(inode)); + ri->i_mtime_nsec = 
cpu_to_le32(inode_get_mtime_nsec(inode)); + if (S_ISDIR(inode->i_mode)) + ri->i_current_depth = cpu_to_le32(fi->i_current_depth); + else if (S_ISREG(inode->i_mode)) + ri->i_gc_failures = cpu_to_le16(fi->i_gc_failures); + ri->i_xattr_nid = cpu_to_le32(fi->i_xattr_nid); + ri->i_flags = cpu_to_le32(fi->i_flags); + ri->i_pino = cpu_to_le32(fi->i_pino); ri->i_generation = cpu_to_le32(inode->i_generation); + ri->i_dir_level = fi->i_dir_level; - if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { - if (old_valid_dev(inode->i_rdev)) { - ri->i_addr[0] = - cpu_to_le32(old_encode_dev(inode->i_rdev)); - ri->i_addr[1] = 0; - } else { - ri->i_addr[0] = 0; - ri->i_addr[1] = - cpu_to_le32(new_encode_dev(inode->i_rdev)); - ri->i_addr[2] = 0; + if (f2fs_has_extra_attr(inode)) { + ri->i_extra_isize = cpu_to_le16(fi->i_extra_isize); + + if (f2fs_sb_has_flexible_inline_xattr(F2FS_I_SB(inode))) + ri->i_inline_xattr_size = + cpu_to_le16(fi->i_inline_xattr_size); + + if (f2fs_sb_has_project_quota(F2FS_I_SB(inode)) && + F2FS_FITS_IN_INODE(ri, fi->i_extra_isize, i_projid)) { + projid_t i_projid; + + i_projid = from_kprojid(&init_user_ns, fi->i_projid); + ri->i_projid = cpu_to_le32(i_projid); + } + + if (f2fs_sb_has_inode_crtime(F2FS_I_SB(inode)) && + F2FS_FITS_IN_INODE(ri, fi->i_extra_isize, i_crtime)) { + ri->i_crtime = cpu_to_le64(fi->i_crtime.tv_sec); + ri->i_crtime_nsec = cpu_to_le32(fi->i_crtime.tv_nsec); + } + + if (f2fs_sb_has_compression(F2FS_I_SB(inode)) && + F2FS_FITS_IN_INODE(ri, fi->i_extra_isize, + i_compress_flag)) { + unsigned short compress_flag; + + ri->i_compr_blocks = cpu_to_le64( + atomic_read(&fi->i_compr_blocks)); + ri->i_compress_algorithm = fi->i_compress_algorithm; + compress_flag = fi->i_compress_flag | + fi->i_compress_level << + COMPRESS_LEVEL_OFFSET; + ri->i_compress_flag = cpu_to_le16(compress_flag); + ri->i_log_cluster_size = fi->i_log_cluster_size; } } - set_cold_node(inode, node_page); - set_page_dirty(node_page); - clear_inode_flag(F2FS_I(inode), 
FI_DIRTY_INODE); + __set_inode_rdev(inode, node_folio); + + /* deleted inode */ + if (inode->i_nlink == 0) + folio_clear_f2fs_inline(node_folio); + + init_idisk_time(inode); +#ifdef CONFIG_F2FS_CHECK_FS + f2fs_inode_chksum_set(F2FS_I_SB(inode), node_folio); +#endif } -int update_inode_page(struct inode *inode) +void f2fs_update_inode_page(struct inode *inode) { - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); - struct page *node_page; + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct folio *node_folio; + int count = 0; +retry: + node_folio = f2fs_get_inode_folio(sbi, inode->i_ino); + if (IS_ERR(node_folio)) { + int err = PTR_ERR(node_folio); - node_page = get_node_page(sbi, inode->i_ino); - if (IS_ERR(node_page)) - return PTR_ERR(node_page); + /* The node block was truncated. */ + if (err == -ENOENT) + return; - update_inode(inode, node_page); - f2fs_put_page(node_page, 1); - return 0; + if (err == -EFSCORRUPTED) + goto stop_checkpoint; + + if (err == -ENOMEM || ++count <= DEFAULT_RETRY_IO_COUNT) + goto retry; +stop_checkpoint: + f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_UPDATE_INODE); + return; + } + f2fs_update_inode(inode, node_folio); + f2fs_folio_put(node_folio, true); } int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc) { - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); - int ret, ilock; + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); if (inode->i_ino == F2FS_NODE_INO(sbi) || inode->i_ino == F2FS_META_INO(sbi)) return 0; - if (!is_inode_flag_set(F2FS_I(inode), FI_DIRTY_INODE)) + /* + * atime could be updated without dirtying f2fs inode in lazytime mode + */ + if (f2fs_is_time_consistent(inode) && + !is_inode_flag_set(inode, FI_DIRTY_INODE)) return 0; - if (wbc) - f2fs_balance_fs(sbi); + /* + * no need to update inode page, ultimately f2fs_evict_inode() will + * clear dirty status of inode. 
+ */ + if (f2fs_cp_error(sbi)) + return -EIO; + + if (!f2fs_is_checkpoint_ready(sbi)) { + f2fs_mark_inode_dirty_sync(inode, true); + return -ENOSPC; + } /* - * We need to lock here to prevent from producing dirty node pages - * during the urgent cleaning time when runing out of free sections. + * We need to balance fs here to prevent from producing dirty node pages + * during the urgent cleaning time when running out of free sections. */ - ilock = mutex_lock_op(sbi); - ret = update_inode_page(inode); - mutex_unlock_op(sbi, ilock); - return ret; + f2fs_update_inode_page(inode); + if (wbc && wbc->nr_to_write) + f2fs_balance_fs(sbi, true); + return 0; +} + +void f2fs_remove_donate_inode(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + + if (list_empty(&F2FS_I(inode)->gdonate_list)) + return; + + spin_lock(&sbi->inode_lock[DONATE_INODE]); + list_del_init(&F2FS_I(inode)->gdonate_list); + sbi->donate_files--; + spin_unlock(&sbi->inode_lock[DONATE_INODE]); } /* @@ -239,34 +845,217 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc) */ void f2fs_evict_inode(struct inode *inode) { - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); - int ilock; + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct f2fs_inode_info *fi = F2FS_I(inode); + nid_t xnid = fi->i_xattr_nid; + int err = 0; + bool freeze_protected = false; + + f2fs_abort_atomic_write(inode, true); + + if (fi->cow_inode && f2fs_is_cow_file(fi->cow_inode)) { + clear_inode_flag(fi->cow_inode, FI_COW_FILE); + F2FS_I(fi->cow_inode)->atomic_inode = NULL; + iput(fi->cow_inode); + fi->cow_inode = NULL; + } trace_f2fs_evict_inode(inode); - truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages_final(&inode->i_data); + + if ((inode->i_nlink || is_bad_inode(inode)) && + test_opt(sbi, COMPRESS_CACHE) && f2fs_compressed_file(inode)) + f2fs_invalidate_compress_pages(sbi, inode->i_ino); if (inode->i_ino == F2FS_NODE_INO(sbi) || - inode->i_ino == F2FS_META_INO(sbi)) - goto 
no_delete; + inode->i_ino == F2FS_META_INO(sbi) || + inode->i_ino == F2FS_COMPRESS_INO(sbi)) + goto out_clear; - BUG_ON(atomic_read(&F2FS_I(inode)->dirty_dents)); - remove_dirty_dir_inode(inode); + f2fs_bug_on(sbi, get_dirty_pages(inode)); + f2fs_remove_dirty_inode(inode); + f2fs_remove_donate_inode(inode); + + if (!IS_DEVICE_ALIASING(inode)) + f2fs_destroy_extent_tree(inode); if (inode->i_nlink || is_bad_inode(inode)) goto no_delete; - sb_start_intwrite(inode->i_sb); - set_inode_flag(F2FS_I(inode), FI_NO_ALLOC); - i_size_write(inode, 0); + err = f2fs_dquot_initialize(inode); + if (err) { + err = 0; + set_sbi_flag(sbi, SBI_QUOTA_NEED_REPAIR); + } + + f2fs_remove_ino_entry(sbi, inode->i_ino, APPEND_INO); + f2fs_remove_ino_entry(sbi, inode->i_ino, UPDATE_INO); + f2fs_remove_ino_entry(sbi, inode->i_ino, FLUSH_INO); + if (!is_sbi_flag_set(sbi, SBI_IS_FREEZING)) { + sb_start_intwrite(inode->i_sb); + freeze_protected = true; + } + set_inode_flag(inode, FI_NO_ALLOC); + i_size_write(inode, 0); +retry: if (F2FS_HAS_BLOCKS(inode)) - f2fs_truncate(inode); + err = f2fs_truncate(inode); - ilock = mutex_lock_op(sbi); - remove_inode_page(inode); - mutex_unlock_op(sbi, ilock); + if (time_to_inject(sbi, FAULT_EVICT_INODE)) + err = -EIO; + + if (!err) { + f2fs_lock_op(sbi); + err = f2fs_remove_inode_page(inode); + f2fs_unlock_op(sbi); + if (err == -ENOENT) { + err = 0; + + /* + * in fuzzed image, another node may has the same + * block address as inode's, if it was truncated + * previously, truncation of inode node will fail. 
+ */ + if (is_inode_flag_set(inode, FI_DIRTY_INODE)) { + f2fs_warn(F2FS_I_SB(inode), + "f2fs_evict_inode: inconsistent node id, ino:%lu", + inode->i_ino); + f2fs_inode_synced(inode); + set_sbi_flag(sbi, SBI_NEED_FSCK); + } + } + } - sb_end_intwrite(inode->i_sb); + /* give more chances, if ENOMEM case */ + if (err == -ENOMEM) { + err = 0; + goto retry; + } + + if (IS_DEVICE_ALIASING(inode)) + f2fs_destroy_extent_tree(inode); + + if (err) { + f2fs_update_inode_page(inode); + if (dquot_initialize_needed(inode)) + set_sbi_flag(sbi, SBI_QUOTA_NEED_REPAIR); + + /* + * If both f2fs_truncate() and f2fs_update_inode_page() failed + * due to fuzzed corrupted inode, call f2fs_inode_synced() to + * avoid triggering later f2fs_bug_on(). + */ + if (is_inode_flag_set(inode, FI_DIRTY_INODE)) { + f2fs_warn(sbi, + "f2fs_evict_inode: inode is dirty, ino:%lu", + inode->i_ino); + f2fs_inode_synced(inode); + set_sbi_flag(sbi, SBI_NEED_FSCK); + } + } + if (freeze_protected) + sb_end_intwrite(inode->i_sb); no_delete: + dquot_drop(inode); + + stat_dec_inline_xattr(inode); + stat_dec_inline_dir(inode); + stat_dec_inline_inode(inode); + stat_dec_compr_inode(inode); + stat_sub_compr_blocks(inode, + atomic_read(&fi->i_compr_blocks)); + + if (likely(!f2fs_cp_error(sbi) && + !is_sbi_flag_set(sbi, SBI_CP_DISABLED))) + f2fs_bug_on(sbi, is_inode_flag_set(inode, FI_DIRTY_INODE)); + + /* + * anyway, it needs to remove the inode from sbi->inode_list[DIRTY_META] + * list to avoid UAF in f2fs_sync_inode_meta() during checkpoint. 
+ */ + f2fs_inode_synced(inode); + + /* for the case f2fs_new_inode() was failed, .i_ino is zero, skip it */ + if (inode->i_ino) + invalidate_mapping_pages(NODE_MAPPING(sbi), inode->i_ino, + inode->i_ino); + if (xnid) + invalidate_mapping_pages(NODE_MAPPING(sbi), xnid, xnid); + if (inode->i_nlink) { + if (is_inode_flag_set(inode, FI_APPEND_WRITE)) + f2fs_add_ino_entry(sbi, inode->i_ino, APPEND_INO); + if (is_inode_flag_set(inode, FI_UPDATE_WRITE)) + f2fs_add_ino_entry(sbi, inode->i_ino, UPDATE_INO); + } + if (is_inode_flag_set(inode, FI_FREE_NID)) { + f2fs_alloc_nid_failed(sbi, inode->i_ino); + clear_inode_flag(inode, FI_FREE_NID); + } else { + /* + * If xattr nid is corrupted, we can reach out error condition, + * err & !f2fs_exist_written_data(sbi, inode->i_ino, ORPHAN_INO)). + * In that case, f2fs_check_nid_range() is enough to give a clue. + */ + } +out_clear: + fscrypt_put_encryption_info(inode); + fsverity_cleanup_inode(inode); clear_inode(inode); } + +/* caller should call f2fs_lock_op() */ +void f2fs_handle_failed_inode(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct node_info ni; + int err; + + /* + * clear nlink of inode in order to release resource of inode + * immediately. + */ + clear_nlink(inode); + + /* + * we must call this to avoid inode being remained as dirty, resulting + * in a panic when flushing dirty inodes in gdirty_list. + */ + f2fs_update_inode_page(inode); + f2fs_inode_synced(inode); + + /* don't make bad inode, since it becomes a regular file. */ + unlock_new_inode(inode); + + /* + * Note: we should add inode to orphan list before f2fs_unlock_op() + * so we can prevent losing this orphan when encoutering checkpoint + * and following suddenly power-off. 
+ */ + err = f2fs_get_node_info(sbi, inode->i_ino, &ni, false); + if (err) { + set_sbi_flag(sbi, SBI_NEED_FSCK); + set_inode_flag(inode, FI_FREE_NID); + f2fs_warn(sbi, "May loss orphan inode, run fsck to fix."); + goto out; + } + + if (ni.blk_addr != NULL_ADDR) { + err = f2fs_acquire_orphan_inode(sbi); + if (err) { + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_warn(sbi, "Too many orphan inodes, run fsck to fix."); + } else { + f2fs_add_orphan_inode(inode); + } + f2fs_alloc_nid_done(sbi, inode->i_ino); + } else { + set_inode_flag(inode, FI_FREE_NID); + } + +out: + f2fs_unlock_op(sbi); + + /* iput will drop the inode object */ + iput(inode); +} diff --git a/fs/f2fs/iostat.c b/fs/f2fs/iostat.c new file mode 100644 index 000000000000..f8703038e1d8 --- /dev/null +++ b/fs/f2fs/iostat.c @@ -0,0 +1,315 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * f2fs iostat support + * + * Copyright 2021 Google LLC + * Author: Daeho Jeong <daehojeong@google.com> + */ + +#include <linux/fs.h> +#include <linux/f2fs_fs.h> +#include <linux/seq_file.h> + +#include "f2fs.h" +#include "iostat.h" +#include <trace/events/f2fs.h> + +static struct kmem_cache *bio_iostat_ctx_cache; +static mempool_t *bio_iostat_ctx_pool; + +static inline unsigned long long iostat_get_avg_bytes(struct f2fs_sb_info *sbi, + enum iostat_type type) +{ + return sbi->iostat_count[type] ? 
div64_u64(sbi->iostat_bytes[type], + sbi->iostat_count[type]) : 0; +} + +#define IOSTAT_INFO_SHOW(name, type) \ + seq_printf(seq, "%-23s %-16llu %-16llu %-16llu\n", \ + name":", sbi->iostat_bytes[type], \ + sbi->iostat_count[type], \ + iostat_get_avg_bytes(sbi, type)) + +int __maybe_unused iostat_info_seq_show(struct seq_file *seq, void *offset) +{ + struct super_block *sb = seq->private; + struct f2fs_sb_info *sbi = F2FS_SB(sb); + + if (!sbi->iostat_enable) + return 0; + + seq_printf(seq, "time: %-16llu\n", ktime_get_real_seconds()); + seq_printf(seq, "\t\t\t%-16s %-16s %-16s\n", + "io_bytes", "count", "avg_bytes"); + + /* print app write IOs */ + seq_puts(seq, "[WRITE]\n"); + IOSTAT_INFO_SHOW("app buffered data", APP_BUFFERED_IO); + IOSTAT_INFO_SHOW("app direct data", APP_DIRECT_IO); + IOSTAT_INFO_SHOW("app mapped data", APP_MAPPED_IO); + IOSTAT_INFO_SHOW("app buffered cdata", APP_BUFFERED_CDATA_IO); + IOSTAT_INFO_SHOW("app mapped cdata", APP_MAPPED_CDATA_IO); + + /* print fs write IOs */ + IOSTAT_INFO_SHOW("fs data", FS_DATA_IO); + IOSTAT_INFO_SHOW("fs cdata", FS_CDATA_IO); + IOSTAT_INFO_SHOW("fs node", FS_NODE_IO); + IOSTAT_INFO_SHOW("fs meta", FS_META_IO); + IOSTAT_INFO_SHOW("fs gc data", FS_GC_DATA_IO); + IOSTAT_INFO_SHOW("fs gc node", FS_GC_NODE_IO); + IOSTAT_INFO_SHOW("fs cp data", FS_CP_DATA_IO); + IOSTAT_INFO_SHOW("fs cp node", FS_CP_NODE_IO); + IOSTAT_INFO_SHOW("fs cp meta", FS_CP_META_IO); + + /* print app read IOs */ + seq_puts(seq, "[READ]\n"); + IOSTAT_INFO_SHOW("app buffered data", APP_BUFFERED_READ_IO); + IOSTAT_INFO_SHOW("app direct data", APP_DIRECT_READ_IO); + IOSTAT_INFO_SHOW("app mapped data", APP_MAPPED_READ_IO); + IOSTAT_INFO_SHOW("app buffered cdata", APP_BUFFERED_CDATA_READ_IO); + IOSTAT_INFO_SHOW("app mapped cdata", APP_MAPPED_CDATA_READ_IO); + + /* print fs read IOs */ + IOSTAT_INFO_SHOW("fs data", FS_DATA_READ_IO); + IOSTAT_INFO_SHOW("fs gc data", FS_GDATA_READ_IO); + IOSTAT_INFO_SHOW("fs cdata", FS_CDATA_READ_IO); + 
IOSTAT_INFO_SHOW("fs node", FS_NODE_READ_IO); + IOSTAT_INFO_SHOW("fs meta", FS_META_READ_IO); + + /* print other IOs */ + seq_puts(seq, "[OTHER]\n"); + IOSTAT_INFO_SHOW("fs discard", FS_DISCARD_IO); + IOSTAT_INFO_SHOW("fs flush", FS_FLUSH_IO); + IOSTAT_INFO_SHOW("fs zone reset", FS_ZONE_RESET_IO); + + return 0; +} + +static inline void __record_iostat_latency(struct f2fs_sb_info *sbi) +{ + int io, idx; + struct f2fs_iostat_latency iostat_lat[MAX_IO_TYPE][NR_PAGE_TYPE]; + struct iostat_lat_info *io_lat = sbi->iostat_io_lat; + unsigned long flags; + + spin_lock_irqsave(&sbi->iostat_lat_lock, flags); + for (idx = 0; idx < MAX_IO_TYPE; idx++) { + for (io = 0; io < NR_PAGE_TYPE; io++) { + iostat_lat[idx][io].peak_lat = + jiffies_to_msecs(io_lat->peak_lat[idx][io]); + iostat_lat[idx][io].cnt = io_lat->bio_cnt[idx][io]; + iostat_lat[idx][io].avg_lat = iostat_lat[idx][io].cnt ? + jiffies_to_msecs(io_lat->sum_lat[idx][io]) / iostat_lat[idx][io].cnt : 0; + io_lat->sum_lat[idx][io] = 0; + io_lat->peak_lat[idx][io] = 0; + io_lat->bio_cnt[idx][io] = 0; + } + } + spin_unlock_irqrestore(&sbi->iostat_lat_lock, flags); + + trace_f2fs_iostat_latency(sbi, iostat_lat); +} + +static inline void f2fs_record_iostat(struct f2fs_sb_info *sbi) +{ + unsigned long long iostat_diff[NR_IO_TYPE]; + int i; + unsigned long flags; + + if (time_is_after_jiffies(sbi->iostat_next_period)) + return; + + /* Need double check under the lock */ + spin_lock_irqsave(&sbi->iostat_lock, flags); + if (time_is_after_jiffies(sbi->iostat_next_period)) { + spin_unlock_irqrestore(&sbi->iostat_lock, flags); + return; + } + sbi->iostat_next_period = jiffies + + msecs_to_jiffies(sbi->iostat_period_ms); + + for (i = 0; i < NR_IO_TYPE; i++) { + iostat_diff[i] = sbi->iostat_bytes[i] - + sbi->prev_iostat_bytes[i]; + sbi->prev_iostat_bytes[i] = sbi->iostat_bytes[i]; + } + spin_unlock_irqrestore(&sbi->iostat_lock, flags); + + trace_f2fs_iostat(sbi, iostat_diff); + + __record_iostat_latency(sbi); +} + +void 
f2fs_reset_iostat(struct f2fs_sb_info *sbi) +{ + struct iostat_lat_info *io_lat = sbi->iostat_io_lat; + int i; + + spin_lock_irq(&sbi->iostat_lock); + for (i = 0; i < NR_IO_TYPE; i++) { + sbi->iostat_count[i] = 0; + sbi->iostat_bytes[i] = 0; + sbi->prev_iostat_bytes[i] = 0; + } + spin_unlock_irq(&sbi->iostat_lock); + + spin_lock_irq(&sbi->iostat_lat_lock); + memset(io_lat, 0, sizeof(struct iostat_lat_info)); + spin_unlock_irq(&sbi->iostat_lat_lock); +} + +static inline void __f2fs_update_iostat(struct f2fs_sb_info *sbi, + enum iostat_type type, unsigned long long io_bytes) +{ + sbi->iostat_bytes[type] += io_bytes; + sbi->iostat_count[type]++; +} + +void f2fs_update_iostat(struct f2fs_sb_info *sbi, struct inode *inode, + enum iostat_type type, unsigned long long io_bytes) +{ + unsigned long flags; + + if (!sbi->iostat_enable) + return; + + spin_lock_irqsave(&sbi->iostat_lock, flags); + __f2fs_update_iostat(sbi, type, io_bytes); + + if (type == APP_BUFFERED_IO || type == APP_DIRECT_IO) + __f2fs_update_iostat(sbi, APP_WRITE_IO, io_bytes); + + if (type == APP_BUFFERED_READ_IO || type == APP_DIRECT_READ_IO) + __f2fs_update_iostat(sbi, APP_READ_IO, io_bytes); + +#ifdef CONFIG_F2FS_FS_COMPRESSION + if (inode && f2fs_compressed_file(inode)) { + if (type == APP_BUFFERED_IO) + __f2fs_update_iostat(sbi, APP_BUFFERED_CDATA_IO, io_bytes); + + if (type == APP_BUFFERED_READ_IO) + __f2fs_update_iostat(sbi, APP_BUFFERED_CDATA_READ_IO, io_bytes); + + if (type == APP_MAPPED_READ_IO) + __f2fs_update_iostat(sbi, APP_MAPPED_CDATA_READ_IO, io_bytes); + + if (type == APP_MAPPED_IO) + __f2fs_update_iostat(sbi, APP_MAPPED_CDATA_IO, io_bytes); + + if (type == FS_DATA_READ_IO) + __f2fs_update_iostat(sbi, FS_CDATA_READ_IO, io_bytes); + + if (type == FS_DATA_IO) + __f2fs_update_iostat(sbi, FS_CDATA_IO, io_bytes); + } +#endif + + spin_unlock_irqrestore(&sbi->iostat_lock, flags); + + f2fs_record_iostat(sbi); +} + +static inline void __update_iostat_latency(struct bio_iostat_ctx *iostat_ctx, + 
enum iostat_lat_type lat_type) +{ + unsigned long ts_diff; + unsigned int page_type = iostat_ctx->type; + struct f2fs_sb_info *sbi = iostat_ctx->sbi; + struct iostat_lat_info *io_lat = sbi->iostat_io_lat; + unsigned long flags; + + if (!sbi->iostat_enable) + return; + + ts_diff = jiffies - iostat_ctx->submit_ts; + if (page_type == META_FLUSH) { + page_type = META; + } else if (page_type >= NR_PAGE_TYPE) { + f2fs_warn(sbi, "%s: %d over NR_PAGE_TYPE", __func__, page_type); + return; + } + + spin_lock_irqsave(&sbi->iostat_lat_lock, flags); + io_lat->sum_lat[lat_type][page_type] += ts_diff; + io_lat->bio_cnt[lat_type][page_type]++; + if (ts_diff > io_lat->peak_lat[lat_type][page_type]) + io_lat->peak_lat[lat_type][page_type] = ts_diff; + spin_unlock_irqrestore(&sbi->iostat_lat_lock, flags); +} + +void iostat_update_and_unbind_ctx(struct bio *bio) +{ + struct bio_iostat_ctx *iostat_ctx = bio->bi_private; + enum iostat_lat_type lat_type; + + if (op_is_write(bio_op(bio))) { + lat_type = bio->bi_opf & REQ_SYNC ? + WRITE_SYNC_IO : WRITE_ASYNC_IO; + bio->bi_private = iostat_ctx->sbi; + } else { + lat_type = READ_IO; + bio->bi_private = iostat_ctx->post_read_ctx; + } + + __update_iostat_latency(iostat_ctx, lat_type); + mempool_free(iostat_ctx, bio_iostat_ctx_pool); +} + +void iostat_alloc_and_bind_ctx(struct f2fs_sb_info *sbi, + struct bio *bio, struct bio_post_read_ctx *ctx) +{ + struct bio_iostat_ctx *iostat_ctx; + /* Due to the mempool, this never fails. 
*/ + iostat_ctx = mempool_alloc(bio_iostat_ctx_pool, GFP_NOFS); + iostat_ctx->sbi = sbi; + iostat_ctx->submit_ts = 0; + iostat_ctx->type = 0; + iostat_ctx->post_read_ctx = ctx; + bio->bi_private = iostat_ctx; +} + +int __init f2fs_init_iostat_processing(void) +{ + bio_iostat_ctx_cache = + kmem_cache_create("f2fs_bio_iostat_ctx", + sizeof(struct bio_iostat_ctx), 0, 0, NULL); + if (!bio_iostat_ctx_cache) + goto fail; + bio_iostat_ctx_pool = + mempool_create_slab_pool(NUM_PREALLOC_IOSTAT_CTXS, + bio_iostat_ctx_cache); + if (!bio_iostat_ctx_pool) + goto fail_free_cache; + return 0; + +fail_free_cache: + kmem_cache_destroy(bio_iostat_ctx_cache); +fail: + return -ENOMEM; +} + +void f2fs_destroy_iostat_processing(void) +{ + mempool_destroy(bio_iostat_ctx_pool); + kmem_cache_destroy(bio_iostat_ctx_cache); +} + +int f2fs_init_iostat(struct f2fs_sb_info *sbi) +{ + /* init iostat info */ + spin_lock_init(&sbi->iostat_lock); + spin_lock_init(&sbi->iostat_lat_lock); + sbi->iostat_enable = false; + sbi->iostat_period_ms = DEFAULT_IOSTAT_PERIOD_MS; + sbi->iostat_io_lat = f2fs_kzalloc(sbi, sizeof(struct iostat_lat_info), + GFP_KERNEL); + if (!sbi->iostat_io_lat) + return -ENOMEM; + + return 0; +} + +void f2fs_destroy_iostat(struct f2fs_sb_info *sbi) +{ + kfree(sbi->iostat_io_lat); +} diff --git a/fs/f2fs/iostat.h b/fs/f2fs/iostat.h new file mode 100644 index 000000000000..eb99d05cf272 --- /dev/null +++ b/fs/f2fs/iostat.h @@ -0,0 +1,85 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright 2021 Google LLC + * Author: Daeho Jeong <daehojeong@google.com> + */ +#ifndef __F2FS_IOSTAT_H__ +#define __F2FS_IOSTAT_H__ + +struct bio_post_read_ctx; + +enum iostat_lat_type { + READ_IO = 0, + WRITE_SYNC_IO, + WRITE_ASYNC_IO, + MAX_IO_TYPE, +}; + +#ifdef CONFIG_F2FS_IOSTAT + +#define NUM_PREALLOC_IOSTAT_CTXS 128 +#define DEFAULT_IOSTAT_PERIOD_MS 3000 +#define MIN_IOSTAT_PERIOD_MS 100 +/* maximum period of iostat tracing is 1 day */ +#define MAX_IOSTAT_PERIOD_MS 8640000 + +struct 
iostat_lat_info { + unsigned long sum_lat[MAX_IO_TYPE][NR_PAGE_TYPE]; /* sum of io latencies */ + unsigned long peak_lat[MAX_IO_TYPE][NR_PAGE_TYPE]; /* peak io latency */ + unsigned int bio_cnt[MAX_IO_TYPE][NR_PAGE_TYPE]; /* bio count */ +}; + +extern int __maybe_unused iostat_info_seq_show(struct seq_file *seq, + void *offset); +extern void f2fs_reset_iostat(struct f2fs_sb_info *sbi); +extern void f2fs_update_iostat(struct f2fs_sb_info *sbi, struct inode *inode, + enum iostat_type type, unsigned long long io_bytes); + +struct bio_iostat_ctx { + struct f2fs_sb_info *sbi; + unsigned long submit_ts; + enum page_type type; + struct bio_post_read_ctx *post_read_ctx; +}; + +static inline void iostat_update_submit_ctx(struct bio *bio, + enum page_type type) +{ + struct bio_iostat_ctx *iostat_ctx = bio->bi_private; + + iostat_ctx->submit_ts = jiffies; + iostat_ctx->type = type; +} + +static inline struct bio_post_read_ctx *get_post_read_ctx(struct bio *bio) +{ + struct bio_iostat_ctx *iostat_ctx = bio->bi_private; + + return iostat_ctx->post_read_ctx; +} + +extern void iostat_update_and_unbind_ctx(struct bio *bio); +extern void iostat_alloc_and_bind_ctx(struct f2fs_sb_info *sbi, + struct bio *bio, struct bio_post_read_ctx *ctx); +extern int f2fs_init_iostat_processing(void); +extern void f2fs_destroy_iostat_processing(void); +extern int f2fs_init_iostat(struct f2fs_sb_info *sbi); +extern void f2fs_destroy_iostat(struct f2fs_sb_info *sbi); +#else +static inline void f2fs_update_iostat(struct f2fs_sb_info *sbi, struct inode *inode, + enum iostat_type type, unsigned long long io_bytes) {} +static inline void iostat_update_and_unbind_ctx(struct bio *bio) {} +static inline void iostat_alloc_and_bind_ctx(struct f2fs_sb_info *sbi, + struct bio *bio, struct bio_post_read_ctx *ctx) {} +static inline void iostat_update_submit_ctx(struct bio *bio, + enum page_type type) {} +static inline struct bio_post_read_ctx *get_post_read_ctx(struct bio *bio) +{ + return bio->bi_private; +} 
+static inline int f2fs_init_iostat_processing(void) { return 0; } +static inline void f2fs_destroy_iostat_processing(void) {} +static inline int f2fs_init_iostat(struct f2fs_sb_info *sbi) { return 0; } +static inline void f2fs_destroy_iostat(struct f2fs_sb_info *sbi) {} +#endif +#endif /* __F2FS_IOSTAT_H__ */ diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 64c07169df05..043d20516a21 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -1,201 +1,461 @@ +// SPDX-License-Identifier: GPL-2.0 /* * fs/f2fs/namei.c * * Copyright (c) 2012 Samsung Electronics Co., Ltd. * http://www.samsung.com/ - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. */ #include <linux/fs.h> #include <linux/f2fs_fs.h> #include <linux/pagemap.h> #include <linux/sched.h> #include <linux/ctype.h> +#include <linux/random.h> +#include <linux/dcache.h> +#include <linux/namei.h> +#include <linux/quotaops.h> #include "f2fs.h" #include "node.h" +#include "segment.h" #include "xattr.h" #include "acl.h" #include <trace/events/f2fs.h> -static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) +static inline bool is_extension_exist(const unsigned char *s, const char *sub, + bool tmp_ext, bool tmp_dot) +{ + size_t slen = strlen(s); + size_t sublen = strlen(sub); + int i; + + if (sublen == 1 && *sub == '*') + return true; + + /* + * filename format of multimedia file should be defined as: + * "filename + '.' + extension + (optional: '.' + temp extension)". 
+ */ + if (slen < sublen + 2) + return false; + + if (!tmp_ext) { + /* file has no temp extension */ + if (s[slen - sublen - 1] != '.') + return false; + return !strncasecmp(s + slen - sublen, sub, sublen); + } + + for (i = 1; i < slen - sublen; i++) { + if (s[i] != '.') + continue; + if (!strncasecmp(s + i + 1, sub, sublen)) { + if (!tmp_dot) + return true; + if (i == slen - sublen - 1 || s[i + 1 + sublen] == '.') + return true; + } + } + + return false; +} + +static inline bool is_temperature_extension(const unsigned char *s, const char *sub) +{ + return is_extension_exist(s, sub, true, false); +} + +static inline bool is_compress_extension(const unsigned char *s, const char *sub) +{ + return is_extension_exist(s, sub, true, true); +} + +int f2fs_update_extension_list(struct f2fs_sb_info *sbi, const char *name, + bool hot, bool set) +{ + __u8 (*extlist)[F2FS_EXTENSION_LEN] = sbi->raw_super->extension_list; + int cold_count = le32_to_cpu(sbi->raw_super->extension_count); + int hot_count = sbi->raw_super->hot_ext_count; + int total_count = cold_count + hot_count; + int start, count; + int i; + + if (set) { + if (total_count == F2FS_MAX_EXTENSION) + return -EINVAL; + } else { + if (!hot && !cold_count) + return -EINVAL; + if (hot && !hot_count) + return -EINVAL; + } + + if (hot) { + start = cold_count; + count = total_count; + } else { + start = 0; + count = cold_count; + } + + for (i = start; i < count; i++) { + if (strcmp(name, extlist[i])) + continue; + + if (set) + return -EINVAL; + + memcpy(extlist[i], extlist[i + 1], + F2FS_EXTENSION_LEN * (total_count - i - 1)); + memset(extlist[total_count - 1], 0, F2FS_EXTENSION_LEN); + if (hot) + sbi->raw_super->hot_ext_count = hot_count - 1; + else + sbi->raw_super->extension_count = + cpu_to_le32(cold_count - 1); + return 0; + } + + if (!set) + return -EINVAL; + + if (hot) { + memcpy(extlist[count], name, strlen(name)); + sbi->raw_super->hot_ext_count = hot_count + 1; + } else { + char 
buf[F2FS_MAX_EXTENSION][F2FS_EXTENSION_LEN]; + + memcpy(buf, &extlist[cold_count], + F2FS_EXTENSION_LEN * hot_count); + memset(extlist[cold_count], 0, F2FS_EXTENSION_LEN); + memcpy(extlist[cold_count], name, strlen(name)); + memcpy(&extlist[cold_count + 1], buf, + F2FS_EXTENSION_LEN * hot_count); + sbi->raw_super->extension_count = cpu_to_le32(cold_count + 1); + } + return 0; +} + +static void set_compress_new_inode(struct f2fs_sb_info *sbi, struct inode *dir, + struct inode *inode, const unsigned char *name) +{ + __u8 (*extlist)[F2FS_EXTENSION_LEN] = sbi->raw_super->extension_list; + unsigned char (*noext)[F2FS_EXTENSION_LEN] = + F2FS_OPTION(sbi).noextensions; + unsigned char (*ext)[F2FS_EXTENSION_LEN] = F2FS_OPTION(sbi).extensions; + unsigned char ext_cnt = F2FS_OPTION(sbi).compress_ext_cnt; + unsigned char noext_cnt = F2FS_OPTION(sbi).nocompress_ext_cnt; + int i, cold_count, hot_count; + + if (!f2fs_sb_has_compression(sbi)) + return; + + if (S_ISDIR(inode->i_mode)) + goto inherit_comp; + + /* This name comes only from normal files. */ + if (!name) + return; + + /* Don't compress hot files. */ + f2fs_down_read(&sbi->sb_lock); + cold_count = le32_to_cpu(sbi->raw_super->extension_count); + hot_count = sbi->raw_super->hot_ext_count; + for (i = cold_count; i < cold_count + hot_count; i++) + if (is_temperature_extension(name, extlist[i])) + break; + f2fs_up_read(&sbi->sb_lock); + if (i < (cold_count + hot_count)) + return; + + /* Don't compress unallowed extension. */ + for (i = 0; i < noext_cnt; i++) + if (is_compress_extension(name, noext[i])) + return; + + /* Compress wanting extension. 
*/ + for (i = 0; i < ext_cnt; i++) { + if (is_compress_extension(name, ext[i])) { + set_compress_context(inode); + return; + } + } +inherit_comp: + /* Inherit the {no-}compression flag in directory */ + if (F2FS_I(dir)->i_flags & F2FS_NOCOMP_FL) { + F2FS_I(inode)->i_flags |= F2FS_NOCOMP_FL; + f2fs_mark_inode_dirty_sync(inode, true); + } else if (F2FS_I(dir)->i_flags & F2FS_COMPR_FL) { + set_compress_context(inode); + } +} + +/* + * Set file's temperature for hot/cold data separation + */ +static void set_file_temperature(struct f2fs_sb_info *sbi, struct inode *inode, + const unsigned char *name) +{ + __u8 (*extlist)[F2FS_EXTENSION_LEN] = sbi->raw_super->extension_list; + int i, cold_count, hot_count; + + f2fs_down_read(&sbi->sb_lock); + cold_count = le32_to_cpu(sbi->raw_super->extension_count); + hot_count = sbi->raw_super->hot_ext_count; + for (i = 0; i < cold_count + hot_count; i++) + if (is_temperature_extension(name, extlist[i])) + break; + f2fs_up_read(&sbi->sb_lock); + + if (i == cold_count + hot_count) + return; + + if (i < cold_count) + file_set_cold(inode); + else + file_set_hot(inode); +} + +static struct inode *f2fs_new_inode(struct mnt_idmap *idmap, + struct inode *dir, umode_t mode, + const char *name) { - struct super_block *sb = dir->i_sb; - struct f2fs_sb_info *sbi = F2FS_SB(sb); + struct f2fs_sb_info *sbi = F2FS_I_SB(dir); + struct f2fs_inode_info *fi; nid_t ino; struct inode *inode; bool nid_free = false; - int err, ilock; + bool encrypt = false; + int xattr_size = 0; + int err; - inode = new_inode(sb); + inode = new_inode(dir->i_sb); if (!inode) return ERR_PTR(-ENOMEM); - ilock = mutex_lock_op(sbi); - if (!alloc_nid(sbi, &ino)) { - mutex_unlock_op(sbi, ilock); + if (!f2fs_alloc_nid(sbi, &ino)) { err = -ENOSPC; goto fail; } - mutex_unlock_op(sbi, ilock); - inode->i_uid = current_fsuid(); + nid_free = true; - if (dir->i_mode & S_ISGID) { - inode->i_gid = dir->i_gid; - if (S_ISDIR(mode)) - mode |= S_ISGID; - } else { - inode->i_gid = 
current_fsgid(); - } + inode_init_owner(idmap, inode, dir, mode); + fi = F2FS_I(inode); inode->i_ino = ino; - inode->i_mode = mode; inode->i_blocks = 0; - inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; - inode->i_generation = sbi->s_next_generation++; + simple_inode_init_ts(inode); + fi->i_crtime = inode_get_mtime(inode); + inode->i_generation = get_random_u32(); + + if (S_ISDIR(inode->i_mode)) + fi->i_current_depth = 1; err = insert_inode_locked(inode); if (err) { err = -EINVAL; - nid_free = true; - goto out; + goto fail; } - trace_f2fs_new_inode(inode, 0); - mark_inode_dirty(inode); - return inode; -out: - clear_nlink(inode); - unlock_new_inode(inode); -fail: - trace_f2fs_new_inode(inode, err); - make_bad_inode(inode); - iput(inode); - if (nid_free) - alloc_nid_failed(sbi, ino); - return ERR_PTR(err); -} + if (f2fs_sb_has_project_quota(sbi) && + (F2FS_I(dir)->i_flags & F2FS_PROJINHERIT_FL)) + fi->i_projid = F2FS_I(dir)->i_projid; + else + fi->i_projid = make_kprojid(&init_user_ns, + F2FS_DEF_PROJID); -static int is_multimedia_file(const unsigned char *s, const char *sub) -{ - size_t slen = strlen(s); - size_t sublen = strlen(sub); - int ret; + err = fscrypt_prepare_new_inode(dir, inode, &encrypt); + if (err) + goto fail_drop; - if (sublen > slen) - return 0; + err = f2fs_dquot_initialize(inode); + if (err) + goto fail_drop; + + set_inode_flag(inode, FI_NEW_INODE); + + if (encrypt) + f2fs_set_encrypted_inode(inode); - ret = memcmp(s + slen - sublen, sub, sublen); - if (ret) { /* compare upper case */ - int i; - char upper_sub[8]; - for (i = 0; i < sublen && i < sizeof(upper_sub); i++) - upper_sub[i] = toupper(sub[i]); - return !memcmp(s + slen - sublen, upper_sub, sublen); + if (f2fs_sb_has_extra_attr(sbi)) { + set_inode_flag(inode, FI_EXTRA_ATTR); + fi->i_extra_isize = F2FS_TOTAL_EXTRA_ATTR_SIZE; } - return !ret; -} + if (test_opt(sbi, INLINE_XATTR)) + set_inode_flag(inode, FI_INLINE_XATTR); -/* - * Set multimedia files as cold files for 
hot/cold data separation - */ -static inline void set_cold_files(struct f2fs_sb_info *sbi, struct inode *inode, - const unsigned char *name) -{ - int i; - __u8 (*extlist)[8] = sbi->raw_super->extension_list; + if (f2fs_may_inline_dentry(inode)) + set_inode_flag(inode, FI_INLINE_DENTRY); - int count = le32_to_cpu(sbi->raw_super->extension_count); - for (i = 0; i < count; i++) { - if (is_multimedia_file(name, extlist[i])) { - file_set_cold(inode); - break; - } + if (f2fs_sb_has_flexible_inline_xattr(sbi)) { + f2fs_bug_on(sbi, !f2fs_has_extra_attr(inode)); + if (f2fs_has_inline_xattr(inode)) + xattr_size = F2FS_OPTION(sbi).inline_xattr_size; + /* Otherwise, will be 0 */ + } else if (f2fs_has_inline_xattr(inode) || + f2fs_has_inline_dentry(inode)) { + xattr_size = DEFAULT_INLINE_XATTR_ADDRS; } + fi->i_inline_xattr_size = xattr_size; + + fi->i_flags = + f2fs_mask_flags(mode, F2FS_I(dir)->i_flags & F2FS_FL_INHERITED); + + if (S_ISDIR(inode->i_mode)) + fi->i_flags |= F2FS_INDEX_FL; + + if (fi->i_flags & F2FS_PROJINHERIT_FL) + set_inode_flag(inode, FI_PROJ_INHERIT); + + /* Check compression first. 
*/ + set_compress_new_inode(sbi, dir, inode, name); + + /* Should enable inline_data after compression set */ + if (test_opt(sbi, INLINE_DATA) && f2fs_may_inline_data(inode)) + set_inode_flag(inode, FI_INLINE_DATA); + + if (name && !test_opt(sbi, DISABLE_EXT_IDENTIFY)) + set_file_temperature(sbi, inode, name); + + stat_inc_inline_xattr(inode); + stat_inc_inline_inode(inode); + stat_inc_inline_dir(inode); + + f2fs_set_inode_flags(inode); + + f2fs_init_extent_tree(inode); + + trace_f2fs_new_inode(inode, 0); + return inode; + +fail: + trace_f2fs_new_inode(inode, err); + make_bad_inode(inode); + if (nid_free) + set_inode_flag(inode, FI_FREE_NID); + iput(inode); + return ERR_PTR(err); +fail_drop: + trace_f2fs_new_inode(inode, err); + dquot_drop(inode); + inode->i_flags |= S_NOQUOTA; + make_bad_inode(inode); + if (nid_free) + set_inode_flag(inode, FI_FREE_NID); + clear_nlink(inode); + unlock_new_inode(inode); + iput(inode); + return ERR_PTR(err); } -static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode, - bool excl) +static int f2fs_create(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode, bool excl) { - struct super_block *sb = dir->i_sb; - struct f2fs_sb_info *sbi = F2FS_SB(sb); + struct f2fs_sb_info *sbi = F2FS_I_SB(dir); struct inode *inode; nid_t ino = 0; - int err, ilock; + int err; - f2fs_balance_fs(sbi); + if (unlikely(f2fs_cp_error(sbi))) + return -EIO; + if (!f2fs_is_checkpoint_ready(sbi)) + return -ENOSPC; - inode = f2fs_new_inode(dir, mode); + err = f2fs_dquot_initialize(dir); + if (err) + return err; + + inode = f2fs_new_inode(idmap, dir, mode, dentry->d_name.name); if (IS_ERR(inode)) return PTR_ERR(inode); - if (!test_opt(sbi, DISABLE_EXT_IDENTIFY)) - set_cold_files(sbi, inode, dentry->d_name.name); - inode->i_op = &f2fs_file_inode_operations; inode->i_fop = &f2fs_file_operations; inode->i_mapping->a_ops = &f2fs_dblock_aops; ino = inode->i_ino; - ilock = mutex_lock_op(sbi); + f2fs_lock_op(sbi); err = 
f2fs_add_link(dentry, inode); - mutex_unlock_op(sbi, ilock); if (err) goto out; + f2fs_unlock_op(sbi); - alloc_nid_done(sbi, ino); + f2fs_alloc_nid_done(sbi, ino); - d_instantiate(dentry, inode); - unlock_new_inode(inode); + d_instantiate_new(dentry, inode); + + if (IS_DIRSYNC(dir)) + f2fs_sync_fs(sbi->sb, 1); + + f2fs_balance_fs(sbi, true); return 0; out: - clear_nlink(inode); - unlock_new_inode(inode); - make_bad_inode(inode); - iput(inode); - alloc_nid_failed(sbi, ino); + f2fs_handle_failed_inode(inode); return err; } static int f2fs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) { - struct inode *inode = old_dentry->d_inode; - struct super_block *sb = dir->i_sb; - struct f2fs_sb_info *sbi = F2FS_SB(sb); - int err, ilock; + struct inode *inode = d_inode(old_dentry); + struct f2fs_sb_info *sbi = F2FS_I_SB(dir); + int err; - f2fs_balance_fs(sbi); + if (unlikely(f2fs_cp_error(sbi))) + return -EIO; + if (!f2fs_is_checkpoint_ready(sbi)) + return -ENOSPC; - inode->i_ctime = CURRENT_TIME; + err = fscrypt_prepare_link(old_dentry, dir, dentry); + if (err) + return err; + + if (is_inode_flag_set(dir, FI_PROJ_INHERIT) && + (!projid_eq(F2FS_I(dir)->i_projid, + F2FS_I(inode)->i_projid))) + return -EXDEV; + + err = f2fs_dquot_initialize(dir); + if (err) + return err; + + f2fs_balance_fs(sbi, true); + + inode_set_ctime_current(inode); ihold(inode); - set_inode_flag(F2FS_I(inode), FI_INC_LINK); - ilock = mutex_lock_op(sbi); + set_inode_flag(inode, FI_INC_LINK); + f2fs_lock_op(sbi); err = f2fs_add_link(dentry, inode); - mutex_unlock_op(sbi, ilock); if (err) goto out; + f2fs_unlock_op(sbi); d_instantiate(dentry, inode); + + if (IS_DIRSYNC(dir)) + f2fs_sync_fs(sbi->sb, 1); return 0; out: - clear_inode_flag(F2FS_I(inode), FI_INC_LINK); + clear_inode_flag(inode, FI_INC_LINK); iput(inode); + f2fs_unlock_op(sbi); return err; } struct dentry *f2fs_get_parent(struct dentry *child) { - struct qstr dotdot = QSTR_INIT("..", 2); - unsigned long ino = 
f2fs_inode_by_name(child->d_inode, &dotdot); - if (!ino) + struct folio *folio; + unsigned long ino = f2fs_inode_by_name(d_inode(child), &dotdot_name, &folio); + + if (!ino) { + if (IS_ERR(folio)) + return ERR_CAST(folio); return ERR_PTR(-ENOENT); - return d_obtain_alias(f2fs_iget(child->d_inode->i_sb, ino)); + } + return d_obtain_alias(f2fs_iget(child->d_sb, ino)); } static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry, @@ -203,283 +463,883 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry, { struct inode *inode = NULL; struct f2fs_dir_entry *de; - struct page *page; + struct folio *folio; + struct dentry *new; + nid_t ino = -1; + int err = 0; + struct f2fs_filename fname; + + trace_f2fs_lookup_start(dir, dentry, flags); - if (dentry->d_name.len > F2FS_NAME_LEN) - return ERR_PTR(-ENAMETOOLONG); + if (dentry->d_name.len > F2FS_NAME_LEN) { + err = -ENAMETOOLONG; + goto out; + } + + err = f2fs_prepare_lookup(dir, dentry, &fname); + if (err == -ENOENT) + goto out_splice; + if (err) + goto out; + de = __f2fs_find_entry(dir, &fname, &folio); + f2fs_free_filename(&fname); + + if (!de) { + if (IS_ERR(folio)) { + err = PTR_ERR(folio); + goto out; + } + err = -ENOENT; + goto out_splice; + } - de = f2fs_find_entry(dir, &dentry->d_name, &page); - if (de) { - nid_t ino = le32_to_cpu(de->ino); - kunmap(page); - f2fs_put_page(page, 0); + ino = le32_to_cpu(de->ino); + f2fs_folio_put(folio, false); + + inode = f2fs_iget(dir->i_sb, ino); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto out; + } - inode = f2fs_iget(dir->i_sb, ino); - if (IS_ERR(inode)) - return ERR_CAST(inode); + if (inode->i_nlink == 0) { + f2fs_warn(F2FS_I_SB(inode), "%s: inode (ino=%lx) has zero i_nlink", + __func__, inode->i_ino); + err = -EFSCORRUPTED; + set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_FSCK); + goto out_iput; } - return d_splice_alias(inode, dentry); + if (IS_ENCRYPTED(dir) && + (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) && + 
!fscrypt_has_permitted_context(dir, inode)) { + f2fs_warn(F2FS_I_SB(inode), "Inconsistent encryption contexts: %lu/%lu", + dir->i_ino, inode->i_ino); + err = -EPERM; + goto out_iput; + } +out_splice: + if (IS_ENABLED(CONFIG_UNICODE) && !inode && IS_CASEFOLDED(dir)) { + /* Eventually we want to call d_add_ci(dentry, NULL) + * for negative dentries in the encoding case as + * well. For now, prevent the negative dentry + * from being cached. + */ + trace_f2fs_lookup_end(dir, dentry, ino, err); + return NULL; + } + + new = d_splice_alias(inode, dentry); + trace_f2fs_lookup_end(dir, !IS_ERR_OR_NULL(new) ? new : dentry, + ino, IS_ERR(new) ? PTR_ERR(new) : err); + return new; +out_iput: + iput(inode); +out: + trace_f2fs_lookup_end(dir, dentry, ino, err); + return ERR_PTR(err); } static int f2fs_unlink(struct inode *dir, struct dentry *dentry) { - struct super_block *sb = dir->i_sb; - struct f2fs_sb_info *sbi = F2FS_SB(sb); - struct inode *inode = dentry->d_inode; + struct f2fs_sb_info *sbi = F2FS_I_SB(dir); + struct inode *inode = d_inode(dentry); struct f2fs_dir_entry *de; - struct page *page; - int err = -ENOENT; - int ilock; + struct folio *folio; + int err; trace_f2fs_unlink_enter(dir, dentry); - f2fs_balance_fs(sbi); - de = f2fs_find_entry(dir, &dentry->d_name, &page); - if (!de) - goto fail; + if (unlikely(f2fs_cp_error(sbi))) { + err = -EIO; + goto out; + } - err = check_orphan_space(sbi); - if (err) { - kunmap(page); - f2fs_put_page(page, 0); - goto fail; + err = f2fs_dquot_initialize(dir); + if (err) + goto out; + err = f2fs_dquot_initialize(inode); + if (err) + goto out; + + de = f2fs_find_entry(dir, &dentry->d_name, &folio); + if (!de) { + if (IS_ERR(folio)) + err = PTR_ERR(folio); + goto out; } - ilock = mutex_lock_op(sbi); - f2fs_delete_entry(de, page, inode); - mutex_unlock_op(sbi, ilock); + if (unlikely(inode->i_nlink == 0)) { + f2fs_warn(sbi, "%s: inode (ino=%lx) has zero i_nlink", + __func__, inode->i_ino); + goto corrupted; + } else if 
(S_ISDIR(inode->i_mode) && unlikely(inode->i_nlink == 1)) { + f2fs_warn(sbi, "%s: directory inode (ino=%lx) has a single i_nlink", + __func__, inode->i_ino); + goto corrupted; + } - /* In order to evict this inode, we set it dirty */ - mark_inode_dirty(inode); -fail: + f2fs_balance_fs(sbi, true); + + f2fs_lock_op(sbi); + err = f2fs_acquire_orphan_inode(sbi); + if (err) { + f2fs_unlock_op(sbi); + f2fs_folio_put(folio, false); + goto out; + } + f2fs_delete_entry(de, folio, dir, inode); + f2fs_unlock_op(sbi); + + /* VFS negative dentries are incompatible with Encoding and + * Case-insensitiveness. Eventually we'll want avoid + * invalidating the dentries here, alongside with returning the + * negative dentries at f2fs_lookup(), when it is better + * supported by the VFS for the CI case. + */ + if (IS_ENABLED(CONFIG_UNICODE) && IS_CASEFOLDED(dir)) + d_invalidate(dentry); + + if (IS_DIRSYNC(dir)) + f2fs_sync_fs(sbi->sb, 1); + + goto out; +corrupted: + err = -EFSCORRUPTED; + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_folio_put(folio, false); +out: trace_f2fs_unlink_exit(inode, err); return err; } -static int f2fs_symlink(struct inode *dir, struct dentry *dentry, - const char *symname) +static const char *f2fs_get_link(struct dentry *dentry, + struct inode *inode, + struct delayed_call *done) { - struct super_block *sb = dir->i_sb; - struct f2fs_sb_info *sbi = F2FS_SB(sb); + const char *link = page_get_link(dentry, inode, done); + + if (!IS_ERR(link) && !*link) { + /* this is broken symlink case */ + do_delayed_call(done); + clear_delayed_call(done); + link = ERR_PTR(-ENOENT); + } + return link; +} + +static int f2fs_symlink(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, const char *symname) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(dir); struct inode *inode; - size_t symlen = strlen(symname) + 1; - int err, ilock; + size_t len = strlen(symname); + struct fscrypt_str disk_link; + int err; + + if (unlikely(f2fs_cp_error(sbi))) + return -EIO; + if 
(!f2fs_is_checkpoint_ready(sbi)) + return -ENOSPC; + + err = fscrypt_prepare_symlink(dir, symname, len, dir->i_sb->s_blocksize, + &disk_link); + if (err) + return err; - f2fs_balance_fs(sbi); + err = f2fs_dquot_initialize(dir); + if (err) + return err; - inode = f2fs_new_inode(dir, S_IFLNK | S_IRWXUGO); + inode = f2fs_new_inode(idmap, dir, S_IFLNK | S_IRWXUGO, NULL); if (IS_ERR(inode)) return PTR_ERR(inode); - inode->i_op = &f2fs_symlink_inode_operations; + if (IS_ENCRYPTED(inode)) + inode->i_op = &f2fs_encrypted_symlink_inode_operations; + else + inode->i_op = &f2fs_symlink_inode_operations; + inode_nohighmem(inode); inode->i_mapping->a_ops = &f2fs_dblock_aops; - ilock = mutex_lock_op(sbi); + f2fs_lock_op(sbi); err = f2fs_add_link(dentry, inode); - mutex_unlock_op(sbi, ilock); if (err) - goto out; + goto out_f2fs_handle_failed_inode; + f2fs_unlock_op(sbi); + f2fs_alloc_nid_done(sbi, inode->i_ino); - err = page_symlink(inode, symname, symlen); - alloc_nid_done(sbi, inode->i_ino); + err = fscrypt_encrypt_symlink(inode, symname, len, &disk_link); + if (err) + goto err_out; + + err = page_symlink(inode, disk_link.name, disk_link.len); + +err_out: + d_instantiate_new(dentry, inode); + + /* + * Let's flush symlink data in order to avoid broken symlink as much as + * possible. Nevertheless, fsyncing is the best way, but there is no + * way to get a file descriptor in order to flush that. + * + * Note that, it needs to do dir->fsync to make this recoverable. + * If the symlink path is stored into inline_data, there is no + * performance regression. 
+ */ + if (!err) { + filemap_write_and_wait_range(inode->i_mapping, 0, + disk_link.len - 1); + + if (IS_DIRSYNC(dir)) + f2fs_sync_fs(sbi->sb, 1); + } else { + f2fs_unlink(dir, dentry); + } - d_instantiate(dentry, inode); - unlock_new_inode(inode); - return err; -out: - clear_nlink(inode); - unlock_new_inode(inode); - make_bad_inode(inode); - iput(inode); - alloc_nid_failed(sbi, inode->i_ino); + f2fs_balance_fs(sbi, true); + goto out_free_encrypted_link; + +out_f2fs_handle_failed_inode: + f2fs_handle_failed_inode(inode); +out_free_encrypted_link: + if (disk_link.name != (unsigned char *)symname) + kfree(disk_link.name); return err; } -static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) +static struct dentry *f2fs_mkdir(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode) { - struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); + struct f2fs_sb_info *sbi = F2FS_I_SB(dir); struct inode *inode; - int err, ilock; + int err; + + if (unlikely(f2fs_cp_error(sbi))) + return ERR_PTR(-EIO); - f2fs_balance_fs(sbi); + err = f2fs_dquot_initialize(dir); + if (err) + return ERR_PTR(err); - inode = f2fs_new_inode(dir, S_IFDIR | mode); + inode = f2fs_new_inode(idmap, dir, S_IFDIR | mode, NULL); if (IS_ERR(inode)) - return PTR_ERR(inode); + return ERR_CAST(inode); inode->i_op = &f2fs_dir_inode_operations; inode->i_fop = &f2fs_dir_operations; inode->i_mapping->a_ops = &f2fs_dblock_aops; - mapping_set_gfp_mask(inode->i_mapping, GFP_F2FS_ZERO); + mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS); - set_inode_flag(F2FS_I(inode), FI_INC_LINK); - ilock = mutex_lock_op(sbi); + set_inode_flag(inode, FI_INC_LINK); + f2fs_lock_op(sbi); err = f2fs_add_link(dentry, inode); - mutex_unlock_op(sbi, ilock); if (err) goto out_fail; + f2fs_unlock_op(sbi); - alloc_nid_done(sbi, inode->i_ino); + f2fs_alloc_nid_done(sbi, inode->i_ino); - d_instantiate(dentry, inode); - unlock_new_inode(inode); + d_instantiate_new(dentry, inode); - return 0; + if 
(IS_DIRSYNC(dir)) + f2fs_sync_fs(sbi->sb, 1); + + f2fs_balance_fs(sbi, true); + return NULL; out_fail: - clear_inode_flag(F2FS_I(inode), FI_INC_LINK); - clear_nlink(inode); - unlock_new_inode(inode); - make_bad_inode(inode); - iput(inode); - alloc_nid_failed(sbi, inode->i_ino); - return err; + clear_inode_flag(inode, FI_INC_LINK); + f2fs_handle_failed_inode(inode); + return ERR_PTR(err); } static int f2fs_rmdir(struct inode *dir, struct dentry *dentry) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); + if (f2fs_empty_dir(inode)) return f2fs_unlink(dir, dentry); return -ENOTEMPTY; } -static int f2fs_mknod(struct inode *dir, struct dentry *dentry, - umode_t mode, dev_t rdev) +static int f2fs_mknod(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode, dev_t rdev) { - struct super_block *sb = dir->i_sb; - struct f2fs_sb_info *sbi = F2FS_SB(sb); + struct f2fs_sb_info *sbi = F2FS_I_SB(dir); struct inode *inode; int err = 0; - int ilock; - if (!new_valid_dev(rdev)) - return -EINVAL; + if (unlikely(f2fs_cp_error(sbi))) + return -EIO; + if (!f2fs_is_checkpoint_ready(sbi)) + return -ENOSPC; - f2fs_balance_fs(sbi); + err = f2fs_dquot_initialize(dir); + if (err) + return err; - inode = f2fs_new_inode(dir, mode); + inode = f2fs_new_inode(idmap, dir, mode, NULL); if (IS_ERR(inode)) return PTR_ERR(inode); init_special_inode(inode, inode->i_mode, rdev); inode->i_op = &f2fs_special_inode_operations; - ilock = mutex_lock_op(sbi); + f2fs_lock_op(sbi); err = f2fs_add_link(dentry, inode); - mutex_unlock_op(sbi, ilock); if (err) goto out; + f2fs_unlock_op(sbi); - alloc_nid_done(sbi, inode->i_ino); - d_instantiate(dentry, inode); - unlock_new_inode(inode); + f2fs_alloc_nid_done(sbi, inode->i_ino); + + d_instantiate_new(dentry, inode); + + if (IS_DIRSYNC(dir)) + f2fs_sync_fs(sbi->sb, 1); + + f2fs_balance_fs(sbi, true); return 0; out: - clear_nlink(inode); + f2fs_handle_failed_inode(inode); + return err; +} + +static int 
__f2fs_tmpfile(struct mnt_idmap *idmap, struct inode *dir, + struct file *file, umode_t mode, bool is_whiteout, + struct inode **new_inode, struct f2fs_filename *fname) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(dir); + struct inode *inode; + int err; + + err = f2fs_dquot_initialize(dir); + if (err) + return err; + + inode = f2fs_new_inode(idmap, dir, mode, NULL); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + if (is_whiteout) { + init_special_inode(inode, inode->i_mode, WHITEOUT_DEV); + inode->i_op = &f2fs_special_inode_operations; + } else { + inode->i_op = &f2fs_file_inode_operations; + inode->i_fop = &f2fs_file_operations; + inode->i_mapping->a_ops = &f2fs_dblock_aops; + } + + f2fs_lock_op(sbi); + err = f2fs_acquire_orphan_inode(sbi); + if (err) + goto out; + + err = f2fs_do_tmpfile(inode, dir, fname); + if (err) + goto release_out; + + /* + * add this non-linked tmpfile to orphan list, in this way we could + * remove all unused data of tmpfile after abnormal power-off. + */ + f2fs_add_orphan_inode(inode); + f2fs_alloc_nid_done(sbi, inode->i_ino); + + if (is_whiteout) { + f2fs_i_links_write(inode, false); + + spin_lock(&inode->i_lock); + inode_state_set(inode, I_LINKABLE); + spin_unlock(&inode->i_lock); + } else { + if (file) + d_tmpfile(file, inode); + else + f2fs_i_links_write(inode, false); + } + /* link_count was changed by d_tmpfile as well. 
*/ + f2fs_unlock_op(sbi); unlock_new_inode(inode); - make_bad_inode(inode); - iput(inode); - alloc_nid_failed(sbi, inode->i_ino); + + if (new_inode) + *new_inode = inode; + + f2fs_balance_fs(sbi, true); + return 0; + +release_out: + f2fs_release_orphan_inode(sbi); +out: + f2fs_handle_failed_inode(inode); return err; } -static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry) +static int f2fs_tmpfile(struct mnt_idmap *idmap, struct inode *dir, + struct file *file, umode_t mode) { - struct super_block *sb = old_dir->i_sb; - struct f2fs_sb_info *sbi = F2FS_SB(sb); - struct inode *old_inode = old_dentry->d_inode; - struct inode *new_inode = new_dentry->d_inode; - struct page *old_dir_page; - struct page *old_page; + struct f2fs_sb_info *sbi = F2FS_I_SB(dir); + int err; + + if (unlikely(f2fs_cp_error(sbi))) + return -EIO; + if (!f2fs_is_checkpoint_ready(sbi)) + return -ENOSPC; + + err = __f2fs_tmpfile(idmap, dir, file, mode, false, NULL, NULL); + + return finish_open_simple(file, err); +} + +static int f2fs_create_whiteout(struct mnt_idmap *idmap, + struct inode *dir, struct inode **whiteout, + struct f2fs_filename *fname) +{ + return __f2fs_tmpfile(idmap, dir, NULL, S_IFCHR | WHITEOUT_MODE, + true, whiteout, fname); +} + +int f2fs_get_tmpfile(struct mnt_idmap *idmap, struct inode *dir, + struct inode **new_inode) +{ + return __f2fs_tmpfile(idmap, dir, NULL, S_IFREG, + false, new_inode, NULL); +} + +static int f2fs_rename(struct mnt_idmap *idmap, struct inode *old_dir, + struct dentry *old_dentry, struct inode *new_dir, + struct dentry *new_dentry, unsigned int flags) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(old_dir); + struct inode *old_inode = d_inode(old_dentry); + struct inode *new_inode = d_inode(new_dentry); + struct inode *whiteout = NULL; + struct folio *old_dir_folio = NULL; + struct folio *old_folio, *new_folio = NULL; struct f2fs_dir_entry *old_dir_entry = NULL; struct f2fs_dir_entry 
*old_entry; struct f2fs_dir_entry *new_entry; - int err = -ENOENT, ilock = -1; + bool old_is_dir = S_ISDIR(old_inode->i_mode); + int err; + + if (unlikely(f2fs_cp_error(sbi))) + return -EIO; + if (!f2fs_is_checkpoint_ready(sbi)) + return -ENOSPC; + + if (is_inode_flag_set(new_dir, FI_PROJ_INHERIT) && + (!projid_eq(F2FS_I(new_dir)->i_projid, + F2FS_I(old_inode)->i_projid))) + return -EXDEV; + + /* + * If new_inode is null, the below renaming flow will + * add a link in old_dir which can convert inline_dir. + * After then, if we failed to get the entry due to other + * reasons like ENOMEM, we had to remove the new entry. + * Instead of adding such the error handling routine, let's + * simply convert first here. + */ + if (old_dir == new_dir && !new_inode) { + err = f2fs_try_convert_inline_dir(old_dir, new_dentry); + if (err) + return err; + } + + if (flags & RENAME_WHITEOUT) { + struct f2fs_filename fname; + + err = f2fs_setup_filename(old_dir, &old_dentry->d_name, + 0, &fname); + if (err) + return err; - f2fs_balance_fs(sbi); + err = f2fs_create_whiteout(idmap, old_dir, &whiteout, &fname); + if (err) + return err; + } - old_entry = f2fs_find_entry(old_dir, &old_dentry->d_name, &old_page); - if (!old_entry) + err = f2fs_dquot_initialize(old_dir); + if (err) goto out; - if (S_ISDIR(old_inode->i_mode)) { - err = -EIO; - old_dir_entry = f2fs_parent_dir(old_inode, &old_dir_page); - if (!old_dir_entry) - goto out_old; + err = f2fs_dquot_initialize(new_dir); + if (err) + goto out; + + if (new_inode) { + err = f2fs_dquot_initialize(new_inode); + if (err) + goto out; + } + + err = -ENOENT; + old_entry = f2fs_find_entry(old_dir, &old_dentry->d_name, &old_folio); + if (!old_entry) { + if (IS_ERR(old_folio)) + err = PTR_ERR(old_folio); + goto out; } - ilock = mutex_lock_op(sbi); + if (old_is_dir && old_dir != new_dir) { + old_dir_entry = f2fs_parent_dir(old_inode, &old_dir_folio); + if (!old_dir_entry) { + if (IS_ERR(old_dir_folio)) + err = PTR_ERR(old_dir_folio); + goto 
out_old; + } + } if (new_inode) { - struct page *new_page; err = -ENOTEMPTY; - if (old_dir_entry && !f2fs_empty_dir(new_inode)) + if (old_is_dir && !f2fs_empty_dir(new_inode)) goto out_dir; err = -ENOENT; new_entry = f2fs_find_entry(new_dir, &new_dentry->d_name, - &new_page); - if (!new_entry) + &new_folio); + if (!new_entry) { + if (IS_ERR(new_folio)) + err = PTR_ERR(new_folio); goto out_dir; + } + + f2fs_balance_fs(sbi, true); - f2fs_set_link(new_dir, new_entry, new_page, old_inode); + f2fs_lock_op(sbi); + + err = f2fs_acquire_orphan_inode(sbi); + if (err) + goto put_out_dir; + + f2fs_set_link(new_dir, new_entry, new_folio, old_inode); + new_folio = NULL; + + inode_set_ctime_current(new_inode); + f2fs_down_write(&F2FS_I(new_inode)->i_sem); + if (old_is_dir) + f2fs_i_links_write(new_inode, false); + f2fs_i_links_write(new_inode, false); + f2fs_up_write(&F2FS_I(new_inode)->i_sem); - new_inode->i_ctime = CURRENT_TIME; - if (old_dir_entry) - drop_nlink(new_inode); - drop_nlink(new_inode); if (!new_inode->i_nlink) - add_orphan_inode(sbi, new_inode->i_ino); - update_inode_page(new_inode); + f2fs_add_orphan_inode(new_inode); + else + f2fs_release_orphan_inode(sbi); } else { + f2fs_balance_fs(sbi, true); + + f2fs_lock_op(sbi); + err = f2fs_add_link(new_dentry, old_inode); - if (err) + if (err) { + f2fs_unlock_op(sbi); goto out_dir; - - if (old_dir_entry) { - inc_nlink(new_dir); - update_inode_page(new_dir); } + + if (old_is_dir) + f2fs_i_links_write(new_dir, true); } - old_inode->i_ctime = CURRENT_TIME; - mark_inode_dirty(old_inode); + f2fs_down_write(&F2FS_I(old_inode)->i_sem); + if (!old_is_dir || whiteout) + file_lost_pino(old_inode); + else + /* adjust dir's i_pino to pass fsck check */ + f2fs_i_pino_write(old_inode, new_dir->i_ino); + f2fs_up_write(&F2FS_I(old_inode)->i_sem); + + inode_set_ctime_current(old_inode); + f2fs_mark_inode_dirty_sync(old_inode, false); + + f2fs_delete_entry(old_entry, old_folio, old_dir, NULL); + old_folio = NULL; + + if (whiteout) { + 
set_inode_flag(whiteout, FI_INC_LINK); + err = f2fs_add_link(old_dentry, whiteout); + if (err) { + d_invalidate(old_dentry); + d_invalidate(new_dentry); + goto put_out_dir; + } + spin_lock(&whiteout->i_lock); + inode_state_clear(whiteout, I_LINKABLE); + spin_unlock(&whiteout->i_lock); - f2fs_delete_entry(old_entry, old_page, NULL); + iput(whiteout); + } - if (old_dir_entry) { - if (old_dir != new_dir) { - f2fs_set_link(old_inode, old_dir_entry, - old_dir_page, new_dir); - } else { - kunmap(old_dir_page); - f2fs_put_page(old_dir_page, 0); - } - drop_nlink(old_dir); - update_inode_page(old_dir); + if (old_dir_entry) + f2fs_set_link(old_inode, old_dir_entry, old_dir_folio, new_dir); + if (old_is_dir) + f2fs_i_links_write(old_dir, false); + + if (F2FS_OPTION(sbi).fsync_mode == FSYNC_MODE_STRICT) { + f2fs_add_ino_entry(sbi, new_dir->i_ino, TRANS_DIR_INO); + if (S_ISDIR(old_inode->i_mode)) + f2fs_add_ino_entry(sbi, old_inode->i_ino, + TRANS_DIR_INO); } - mutex_unlock_op(sbi, ilock); + f2fs_unlock_op(sbi); + + if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir)) + f2fs_sync_fs(sbi->sb, 1); + + f2fs_update_time(sbi, REQ_TIME); return 0; +put_out_dir: + f2fs_unlock_op(sbi); + f2fs_folio_put(new_folio, false); out_dir: + if (old_dir_entry) + f2fs_folio_put(old_dir_folio, false); +out_old: + f2fs_folio_put(old_folio, false); +out: + iput(whiteout); + return err; +} + +static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(old_dir); + struct inode *old_inode = d_inode(old_dentry); + struct inode *new_inode = d_inode(new_dentry); + struct folio *old_dir_folio, *new_dir_folio; + struct folio *old_folio, *new_folio; + struct f2fs_dir_entry *old_dir_entry = NULL, *new_dir_entry = NULL; + struct f2fs_dir_entry *old_entry, *new_entry; + int old_nlink = 0, new_nlink = 0; + int err; + + if (unlikely(f2fs_cp_error(sbi))) + return -EIO; + if (!f2fs_is_checkpoint_ready(sbi)) + 
return -ENOSPC; + + if ((is_inode_flag_set(new_dir, FI_PROJ_INHERIT) && + !projid_eq(F2FS_I(new_dir)->i_projid, + F2FS_I(old_inode)->i_projid)) || + (is_inode_flag_set(old_dir, FI_PROJ_INHERIT) && + !projid_eq(F2FS_I(old_dir)->i_projid, + F2FS_I(new_inode)->i_projid))) + return -EXDEV; + + err = f2fs_dquot_initialize(old_dir); + if (err) + goto out; + + err = f2fs_dquot_initialize(new_dir); + if (err) + goto out; + + err = -ENOENT; + old_entry = f2fs_find_entry(old_dir, &old_dentry->d_name, &old_folio); + if (!old_entry) { + if (IS_ERR(old_folio)) + err = PTR_ERR(old_folio); + goto out; + } + + new_entry = f2fs_find_entry(new_dir, &new_dentry->d_name, &new_folio); + if (!new_entry) { + if (IS_ERR(new_folio)) + err = PTR_ERR(new_folio); + goto out_old; + } + + /* prepare for updating ".." directory entry info later */ + if (old_dir != new_dir) { + if (S_ISDIR(old_inode->i_mode)) { + old_dir_entry = f2fs_parent_dir(old_inode, + &old_dir_folio); + if (!old_dir_entry) { + if (IS_ERR(old_dir_folio)) + err = PTR_ERR(old_dir_folio); + goto out_new; + } + } + + if (S_ISDIR(new_inode->i_mode)) { + new_dir_entry = f2fs_parent_dir(new_inode, + &new_dir_folio); + if (!new_dir_entry) { + if (IS_ERR(new_dir_folio)) + err = PTR_ERR(new_dir_folio); + goto out_old_dir; + } + } + } + + /* + * If cross rename between file and directory those are not + * in the same directory, we will inc nlink of file's parent + * later, so we should check upper boundary of its nlink. + */ + if ((!old_dir_entry || !new_dir_entry) && + old_dir_entry != new_dir_entry) { + old_nlink = old_dir_entry ? -1 : 1; + new_nlink = -old_nlink; + err = -EMLINK; + if ((old_nlink > 0 && old_dir->i_nlink >= F2FS_LINK_MAX) || + (new_nlink > 0 && new_dir->i_nlink >= F2FS_LINK_MAX)) + goto out_new_dir; + } + + f2fs_balance_fs(sbi, true); + + f2fs_lock_op(sbi); + + /* update ".." 
directory entry info of old dentry */ + if (old_dir_entry) + f2fs_set_link(old_inode, old_dir_entry, old_dir_folio, new_dir); + + /* update ".." directory entry info of new dentry */ + if (new_dir_entry) + f2fs_set_link(new_inode, new_dir_entry, new_dir_folio, old_dir); + + /* update directory entry info of old dir inode */ + f2fs_set_link(old_dir, old_entry, old_folio, new_inode); + + f2fs_down_write(&F2FS_I(old_inode)->i_sem); + if (!old_dir_entry) + file_lost_pino(old_inode); + else + /* adjust dir's i_pino to pass fsck check */ + f2fs_i_pino_write(old_inode, new_dir->i_ino); + f2fs_up_write(&F2FS_I(old_inode)->i_sem); + + inode_set_ctime_current(old_dir); + if (old_nlink) { + f2fs_down_write(&F2FS_I(old_dir)->i_sem); + f2fs_i_links_write(old_dir, old_nlink > 0); + f2fs_up_write(&F2FS_I(old_dir)->i_sem); + } + f2fs_mark_inode_dirty_sync(old_dir, false); + + /* update directory entry info of new dir inode */ + f2fs_set_link(new_dir, new_entry, new_folio, old_inode); + + f2fs_down_write(&F2FS_I(new_inode)->i_sem); + if (!new_dir_entry) + file_lost_pino(new_inode); + else + /* adjust dir's i_pino to pass fsck check */ + f2fs_i_pino_write(new_inode, old_dir->i_ino); + f2fs_up_write(&F2FS_I(new_inode)->i_sem); + + inode_set_ctime_current(new_dir); + if (new_nlink) { + f2fs_down_write(&F2FS_I(new_dir)->i_sem); + f2fs_i_links_write(new_dir, new_nlink > 0); + f2fs_up_write(&F2FS_I(new_dir)->i_sem); + } + f2fs_mark_inode_dirty_sync(new_dir, false); + + if (F2FS_OPTION(sbi).fsync_mode == FSYNC_MODE_STRICT) { + f2fs_add_ino_entry(sbi, old_dir->i_ino, TRANS_DIR_INO); + f2fs_add_ino_entry(sbi, new_dir->i_ino, TRANS_DIR_INO); + } + + f2fs_unlock_op(sbi); + + if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir)) + f2fs_sync_fs(sbi->sb, 1); + + f2fs_update_time(sbi, REQ_TIME); + return 0; +out_new_dir: + if (new_dir_entry) { + f2fs_folio_put(new_dir_folio, false); + } +out_old_dir: if (old_dir_entry) { - kunmap(old_dir_page); - f2fs_put_page(old_dir_page, 0); + 
f2fs_folio_put(old_dir_folio, false); } - mutex_unlock_op(sbi, ilock); +out_new: + f2fs_folio_put(new_folio, false); out_old: - kunmap(old_page); - f2fs_put_page(old_page, 0); + f2fs_folio_put(old_folio, false); out: return err; } +static int f2fs_rename2(struct mnt_idmap *idmap, + struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry, + unsigned int flags) +{ + int err; + + if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) + return -EINVAL; + + trace_f2fs_rename_start(old_dir, old_dentry, new_dir, new_dentry, + flags); + + err = fscrypt_prepare_rename(old_dir, old_dentry, new_dir, new_dentry, + flags); + if (err) + return err; + + if (flags & RENAME_EXCHANGE) + err = f2fs_cross_rename(old_dir, old_dentry, + new_dir, new_dentry); + else + /* + * VFS has already handled the new dentry existence case, + * here, we just deal with "RENAME_NOREPLACE" as regular rename. + */ + err = f2fs_rename(idmap, old_dir, old_dentry, + new_dir, new_dentry, flags); + + trace_f2fs_rename_end(old_dentry, new_dentry, flags, err); + return err; +} + +static const char *f2fs_encrypted_get_link(struct dentry *dentry, + struct inode *inode, + struct delayed_call *done) +{ + struct folio *folio; + const char *target; + + if (!dentry) + return ERR_PTR(-ECHILD); + + folio = read_mapping_folio(inode->i_mapping, 0, NULL); + if (IS_ERR(folio)) + return ERR_CAST(folio); + + target = fscrypt_get_symlink(inode, folio_address(folio), + inode->i_sb->s_blocksize, done); + folio_put(folio); + return target; +} + +static int f2fs_encrypted_symlink_getattr(struct mnt_idmap *idmap, + const struct path *path, + struct kstat *stat, u32 request_mask, + unsigned int query_flags) +{ + f2fs_getattr(idmap, path, stat, request_mask, query_flags); + + return fscrypt_symlink_getattr(path, stat); +} + +const struct inode_operations f2fs_encrypted_symlink_inode_operations = { + .get_link = f2fs_encrypted_get_link, + .getattr = 
f2fs_encrypted_symlink_getattr, + .setattr = f2fs_setattr, + .listxattr = f2fs_listxattr, +}; + const struct inode_operations f2fs_dir_inode_operations = { .create = f2fs_create, .lookup = f2fs_lookup, @@ -489,40 +1349,29 @@ const struct inode_operations f2fs_dir_inode_operations = { .mkdir = f2fs_mkdir, .rmdir = f2fs_rmdir, .mknod = f2fs_mknod, - .rename = f2fs_rename, + .rename = f2fs_rename2, + .tmpfile = f2fs_tmpfile, .getattr = f2fs_getattr, .setattr = f2fs_setattr, - .get_acl = f2fs_get_acl, -#ifdef CONFIG_F2FS_FS_XATTR - .setxattr = generic_setxattr, - .getxattr = generic_getxattr, + .get_inode_acl = f2fs_get_acl, + .set_acl = f2fs_set_acl, .listxattr = f2fs_listxattr, - .removexattr = generic_removexattr, -#endif + .fiemap = f2fs_fiemap, + .fileattr_get = f2fs_fileattr_get, + .fileattr_set = f2fs_fileattr_set, }; const struct inode_operations f2fs_symlink_inode_operations = { - .readlink = generic_readlink, - .follow_link = page_follow_link_light, - .put_link = page_put_link, + .get_link = f2fs_get_link, .getattr = f2fs_getattr, .setattr = f2fs_setattr, -#ifdef CONFIG_F2FS_FS_XATTR - .setxattr = generic_setxattr, - .getxattr = generic_getxattr, .listxattr = f2fs_listxattr, - .removexattr = generic_removexattr, -#endif }; const struct inode_operations f2fs_special_inode_operations = { .getattr = f2fs_getattr, - .setattr = f2fs_setattr, - .get_acl = f2fs_get_acl, -#ifdef CONFIG_F2FS_FS_XATTR - .setxattr = generic_setxattr, - .getxattr = generic_getxattr, + .setattr = f2fs_setattr, + .get_inode_acl = f2fs_get_acl, + .set_acl = f2fs_set_acl, .listxattr = f2fs_listxattr, - .removexattr = generic_removexattr, -#endif }; diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index b418aee09573..482a362f2625 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1,17 +1,14 @@ +// SPDX-License-Identifier: GPL-2.0 /* * fs/f2fs/node.c * * Copyright (c) 2012 Samsung Electronics Co., Ltd. 
* http://www.samsung.com/ - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. */ #include <linux/fs.h> #include <linux/f2fs_fs.h> #include <linux/mpage.h> -#include <linux/backing-dev.h> +#include <linux/sched/mm.h> #include <linux/blkdev.h> #include <linux/pagevec.h> #include <linux/swap.h> @@ -19,106 +16,223 @@ #include "f2fs.h" #include "node.h" #include "segment.h" +#include "xattr.h" +#include "iostat.h" #include <trace/events/f2fs.h> +#define on_f2fs_build_free_nids(nm_i) mutex_is_locked(&(nm_i)->build_lock) + static struct kmem_cache *nat_entry_slab; static struct kmem_cache *free_nid_slab; +static struct kmem_cache *nat_entry_set_slab; +static struct kmem_cache *fsync_node_entry_slab; -static void clear_node_page_dirty(struct page *page) +static inline bool is_invalid_nid(struct f2fs_sb_info *sbi, nid_t nid) { - struct address_space *mapping = page->mapping; - struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb); - unsigned int long flags; + return nid < F2FS_ROOT_INO(sbi) || nid >= NM_I(sbi)->max_nid; +} - if (PageDirty(page)) { - spin_lock_irqsave(&mapping->tree_lock, flags); - radix_tree_tag_clear(&mapping->page_tree, - page_index(page), - PAGECACHE_TAG_DIRTY); - spin_unlock_irqrestore(&mapping->tree_lock, flags); +/* + * Check whether the given nid is within node id range. 
+ */ +int f2fs_check_nid_range(struct f2fs_sb_info *sbi, nid_t nid) +{ + if (unlikely(is_invalid_nid(sbi, nid))) { + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_warn(sbi, "%s: out-of-range nid=%x, run fsck to fix.", + __func__, nid); + f2fs_handle_error(sbi, ERROR_CORRUPTED_INODE); + return -EFSCORRUPTED; + } + return 0; +} - clear_page_dirty_for_io(page); - dec_page_count(sbi, F2FS_DIRTY_NODES); +bool f2fs_available_free_memory(struct f2fs_sb_info *sbi, int type) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct sysinfo val; + unsigned long avail_ram; + unsigned long mem_size = 0; + bool res = false; + + if (!nm_i) + return true; + + si_meminfo(&val); + + /* only uses low memory */ + avail_ram = val.totalram - val.totalhigh; + + /* + * give 25%, 25%, 50%, 50%, 25%, 25% memory for each components respectively + */ + if (type == FREE_NIDS) { + mem_size = (nm_i->nid_cnt[FREE_NID] * + sizeof(struct free_nid)) >> PAGE_SHIFT; + res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 2); + } else if (type == NAT_ENTRIES) { + mem_size = (nm_i->nat_cnt[TOTAL_NAT] * + sizeof(struct nat_entry)) >> PAGE_SHIFT; + res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 2); + if (excess_cached_nats(sbi)) + res = false; + } else if (type == DIRTY_DENTS) { + if (sbi->sb->s_bdi->wb.dirty_exceeded) + return false; + mem_size = get_pages(sbi, F2FS_DIRTY_DENTS); + res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1); + } else if (type == INO_ENTRIES) { + int i; + + for (i = 0; i < MAX_INO_ENTRY; i++) + mem_size += sbi->im[i].ino_num * + sizeof(struct ino_entry); + mem_size >>= PAGE_SHIFT; + res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1); + } else if (type == READ_EXTENT_CACHE || type == AGE_EXTENT_CACHE) { + enum extent_type etype = type == READ_EXTENT_CACHE ? 
+ EX_READ : EX_BLOCK_AGE; + struct extent_tree_info *eti = &sbi->extent_tree[etype]; + + mem_size = (atomic_read(&eti->total_ext_tree) * + sizeof(struct extent_tree) + + atomic_read(&eti->total_ext_node) * + sizeof(struct extent_node)) >> PAGE_SHIFT; + res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 2); + } else if (type == DISCARD_CACHE) { + mem_size = (atomic_read(&dcc->discard_cmd_cnt) * + sizeof(struct discard_cmd)) >> PAGE_SHIFT; + res = mem_size < (avail_ram * nm_i->ram_thresh / 100); + } else if (type == COMPRESS_PAGE) { +#ifdef CONFIG_F2FS_FS_COMPRESSION + unsigned long free_ram = val.freeram; + + /* + * free memory is lower than watermark or cached page count + * exceed threshold, deny caching compress page. + */ + res = (free_ram > avail_ram * sbi->compress_watermark / 100) && + (COMPRESS_MAPPING(sbi)->nrpages < + free_ram * sbi->compress_percent / 100); +#else + res = false; +#endif + } else { + if (!sbi->sb->s_bdi->wb.dirty_exceeded) + return true; + } + return res; +} + +static void clear_node_folio_dirty(struct folio *folio) +{ + if (folio_test_dirty(folio)) { + f2fs_clear_page_cache_dirty_tag(folio); + folio_clear_dirty_for_io(folio); + dec_page_count(F2FS_F_SB(folio), F2FS_DIRTY_NODES); } - ClearPageUptodate(page); + folio_clear_uptodate(folio); } -static struct page *get_current_nat_page(struct f2fs_sb_info *sbi, nid_t nid) +static struct folio *get_current_nat_folio(struct f2fs_sb_info *sbi, nid_t nid) { - pgoff_t index = current_nat_addr(sbi, nid); - return get_meta_page(sbi, index); + return f2fs_get_meta_folio_retry(sbi, current_nat_addr(sbi, nid)); } -static struct page *get_next_nat_page(struct f2fs_sb_info *sbi, nid_t nid) +static struct folio *get_next_nat_folio(struct f2fs_sb_info *sbi, nid_t nid) { - struct page *src_page; - struct page *dst_page; - pgoff_t src_off; + struct folio *src_folio; + struct folio *dst_folio; pgoff_t dst_off; void *src_addr; void *dst_addr; struct f2fs_nm_info *nm_i = NM_I(sbi); - src_off = 
current_nat_addr(sbi, nid); - dst_off = next_nat_addr(sbi, src_off); + dst_off = next_nat_addr(sbi, current_nat_addr(sbi, nid)); /* get current nat block page with lock */ - src_page = get_meta_page(sbi, src_off); + src_folio = get_current_nat_folio(sbi, nid); + if (IS_ERR(src_folio)) + return src_folio; + dst_folio = f2fs_grab_meta_folio(sbi, dst_off); + f2fs_bug_on(sbi, folio_test_dirty(src_folio)); + + src_addr = folio_address(src_folio); + dst_addr = folio_address(dst_folio); + memcpy(dst_addr, src_addr, PAGE_SIZE); + folio_mark_dirty(dst_folio); + f2fs_folio_put(src_folio, true); - /* Dirty src_page means that it is already the new target NAT page. */ - if (PageDirty(src_page)) - return src_page; + set_to_next_nat(nm_i, nid); - dst_page = grab_meta_page(sbi, dst_off); + return dst_folio; +} - src_addr = page_address(src_page); - dst_addr = page_address(dst_page); - memcpy(dst_addr, src_addr, PAGE_CACHE_SIZE); - set_page_dirty(dst_page); - f2fs_put_page(src_page, 1); +static struct nat_entry *__alloc_nat_entry(struct f2fs_sb_info *sbi, + nid_t nid, bool no_fail) +{ + struct nat_entry *new; - set_to_next_nat(nm_i, nid); + new = f2fs_kmem_cache_alloc(nat_entry_slab, + GFP_F2FS_ZERO, no_fail, sbi); + if (new) { + nat_set_nid(new, nid); + nat_reset_flag(new); + } + return new; +} - return dst_page; +static void __free_nat_entry(struct nat_entry *e) +{ + kmem_cache_free(nat_entry_slab, e); } -/* - * Readahead NAT pages - */ -static void ra_nat_pages(struct f2fs_sb_info *sbi, int nid) +/* must be locked by nat_tree_lock */ +static struct nat_entry *__init_nat_entry(struct f2fs_nm_info *nm_i, + struct nat_entry *ne, struct f2fs_nat_entry *raw_ne, bool no_fail, bool init_dirty) { - struct address_space *mapping = sbi->meta_inode->i_mapping; - struct f2fs_nm_info *nm_i = NM_I(sbi); - struct blk_plug plug; - struct page *page; - pgoff_t index; - int i; + if (no_fail) + f2fs_radix_tree_insert(&nm_i->nat_root, nat_get_nid(ne), ne); + else if 
(radix_tree_insert(&nm_i->nat_root, nat_get_nid(ne), ne)) + return NULL; - blk_start_plug(&plug); + if (raw_ne) + node_info_from_raw_nat(&ne->ni, raw_ne); - for (i = 0; i < FREE_NID_PAGES; i++, nid += NAT_ENTRY_PER_BLOCK) { - if (nid >= nm_i->max_nid) - nid = 0; - index = current_nat_addr(sbi, nid); + if (init_dirty) { + INIT_LIST_HEAD(&ne->list); + nm_i->nat_cnt[TOTAL_NAT]++; + return ne; + } - page = grab_cache_page(mapping, index); - if (!page) - continue; - if (PageUptodate(page)) { - f2fs_put_page(page, 1); - continue; - } - if (f2fs_readpage(sbi, page, index, READ)) - continue; + spin_lock(&nm_i->nat_list_lock); + list_add_tail(&ne->list, &nm_i->nat_entries); + spin_unlock(&nm_i->nat_list_lock); - f2fs_put_page(page, 0); - } - blk_finish_plug(&plug); + nm_i->nat_cnt[TOTAL_NAT]++; + nm_i->nat_cnt[RECLAIMABLE_NAT]++; + return ne; } -static struct nat_entry *__lookup_nat_cache(struct f2fs_nm_info *nm_i, nid_t n) +static struct nat_entry *__lookup_nat_cache(struct f2fs_nm_info *nm_i, nid_t n, bool for_dirty) { - return radix_tree_lookup(&nm_i->nat_root, n); + struct nat_entry *ne; + + ne = radix_tree_lookup(&nm_i->nat_root, n); + + /* + * for recent accessed nat entry which will not be dirtied soon + * later, move it to tail of lru list. 
+ */ + if (ne && !get_nat_flag(ne, IS_DIRTY) && !for_dirty) { + spin_lock(&nm_i->nat_list_lock); + if (!list_empty(&ne->list)) + list_move_tail(&ne->list, &nm_i->nat_entries); + spin_unlock(&nm_i->nat_list_lock); + } + + return ne; } static unsigned int __gang_lookup_nat_cache(struct f2fs_nm_info *nm_i, @@ -129,198 +243,479 @@ static unsigned int __gang_lookup_nat_cache(struct f2fs_nm_info *nm_i, static void __del_from_nat_cache(struct f2fs_nm_info *nm_i, struct nat_entry *e) { - list_del(&e->list); radix_tree_delete(&nm_i->nat_root, nat_get_nid(e)); - nm_i->nat_cnt--; - kmem_cache_free(nat_entry_slab, e); + nm_i->nat_cnt[TOTAL_NAT]--; + nm_i->nat_cnt[RECLAIMABLE_NAT]--; + __free_nat_entry(e); } -int is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid) +static struct nat_entry_set *__grab_nat_entry_set(struct f2fs_nm_info *nm_i, + struct nat_entry *ne) { - struct f2fs_nm_info *nm_i = NM_I(sbi); - struct nat_entry *e; - int is_cp = 1; + nid_t set = NAT_BLOCK_OFFSET(ne->ni.nid); + struct nat_entry_set *head; + + head = radix_tree_lookup(&nm_i->nat_set_root, set); + if (!head) { + head = f2fs_kmem_cache_alloc(nat_entry_set_slab, + GFP_NOFS, true, NULL); + + INIT_LIST_HEAD(&head->entry_list); + INIT_LIST_HEAD(&head->set_list); + head->set = set; + head->entry_cnt = 0; + f2fs_radix_tree_insert(&nm_i->nat_set_root, set, head); + } + return head; +} - read_lock(&nm_i->nat_tree_lock); - e = __lookup_nat_cache(nm_i, nid); - if (e && !e->checkpointed) - is_cp = 0; - read_unlock(&nm_i->nat_tree_lock); - return is_cp; +static void __set_nat_cache_dirty(struct f2fs_nm_info *nm_i, + struct nat_entry *ne, bool init_dirty) +{ + struct nat_entry_set *head; + bool new_ne = nat_get_blkaddr(ne) == NEW_ADDR; + + if (!new_ne) + head = __grab_nat_entry_set(nm_i, ne); + + /* + * update entry_cnt in below condition: + * 1. update NEW_ADDR to valid block address; + * 2. 
update old block address to new one; + */ + if (!new_ne && (get_nat_flag(ne, IS_PREALLOC) || + !get_nat_flag(ne, IS_DIRTY))) + head->entry_cnt++; + + set_nat_flag(ne, IS_PREALLOC, new_ne); + + if (get_nat_flag(ne, IS_DIRTY)) + goto refresh_list; + + nm_i->nat_cnt[DIRTY_NAT]++; + if (!init_dirty) + nm_i->nat_cnt[RECLAIMABLE_NAT]--; + set_nat_flag(ne, IS_DIRTY, true); +refresh_list: + spin_lock(&nm_i->nat_list_lock); + if (new_ne) + list_del_init(&ne->list); + else + list_move_tail(&ne->list, &head->entry_list); + spin_unlock(&nm_i->nat_list_lock); } -static struct nat_entry *grab_nat_entry(struct f2fs_nm_info *nm_i, nid_t nid) +static void __clear_nat_cache_dirty(struct f2fs_nm_info *nm_i, + struct nat_entry_set *set, struct nat_entry *ne) { - struct nat_entry *new; + spin_lock(&nm_i->nat_list_lock); + list_move_tail(&ne->list, &nm_i->nat_entries); + spin_unlock(&nm_i->nat_list_lock); + + set_nat_flag(ne, IS_DIRTY, false); + set->entry_cnt--; + nm_i->nat_cnt[DIRTY_NAT]--; + nm_i->nat_cnt[RECLAIMABLE_NAT]++; +} - new = kmem_cache_alloc(nat_entry_slab, GFP_ATOMIC); - if (!new) - return NULL; - if (radix_tree_insert(&nm_i->nat_root, nid, new)) { - kmem_cache_free(nat_entry_slab, new); - return NULL; +static unsigned int __gang_lookup_nat_set(struct f2fs_nm_info *nm_i, + nid_t start, unsigned int nr, struct nat_entry_set **ep) +{ + return radix_tree_gang_lookup(&nm_i->nat_set_root, (void **)ep, + start, nr); +} + +bool f2fs_in_warm_node_list(struct f2fs_sb_info *sbi, struct folio *folio) +{ + return is_node_folio(folio) && IS_DNODE(folio) && is_cold_node(folio); +} + +void f2fs_init_fsync_node_info(struct f2fs_sb_info *sbi) +{ + spin_lock_init(&sbi->fsync_node_lock); + INIT_LIST_HEAD(&sbi->fsync_node_list); + sbi->fsync_seg_id = 0; + sbi->fsync_node_num = 0; +} + +static unsigned int f2fs_add_fsync_node_entry(struct f2fs_sb_info *sbi, + struct folio *folio) +{ + struct fsync_node_entry *fn; + unsigned long flags; + unsigned int seq_id; + + fn = 
f2fs_kmem_cache_alloc(fsync_node_entry_slab, + GFP_NOFS, true, NULL); + + folio_get(folio); + fn->folio = folio; + INIT_LIST_HEAD(&fn->list); + + spin_lock_irqsave(&sbi->fsync_node_lock, flags); + list_add_tail(&fn->list, &sbi->fsync_node_list); + fn->seq_id = sbi->fsync_seg_id++; + seq_id = fn->seq_id; + sbi->fsync_node_num++; + spin_unlock_irqrestore(&sbi->fsync_node_lock, flags); + + return seq_id; +} + +void f2fs_del_fsync_node_entry(struct f2fs_sb_info *sbi, struct folio *folio) +{ + struct fsync_node_entry *fn; + unsigned long flags; + + spin_lock_irqsave(&sbi->fsync_node_lock, flags); + list_for_each_entry(fn, &sbi->fsync_node_list, list) { + if (fn->folio == folio) { + list_del(&fn->list); + sbi->fsync_node_num--; + spin_unlock_irqrestore(&sbi->fsync_node_lock, flags); + kmem_cache_free(fsync_node_entry_slab, fn); + folio_put(folio); + return; + } } - memset(new, 0, sizeof(struct nat_entry)); - nat_set_nid(new, nid); - list_add_tail(&new->list, &nm_i->nat_entries); - nm_i->nat_cnt++; - return new; + spin_unlock_irqrestore(&sbi->fsync_node_lock, flags); + f2fs_bug_on(sbi, 1); } -static void cache_nat_entry(struct f2fs_nm_info *nm_i, nid_t nid, - struct f2fs_nat_entry *ne) +void f2fs_reset_fsync_node_info(struct f2fs_sb_info *sbi) { + unsigned long flags; + + spin_lock_irqsave(&sbi->fsync_node_lock, flags); + sbi->fsync_seg_id = 0; + spin_unlock_irqrestore(&sbi->fsync_node_lock, flags); +} + +int f2fs_need_dentry_mark(struct f2fs_sb_info *sbi, nid_t nid) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); struct nat_entry *e; -retry: - write_lock(&nm_i->nat_tree_lock); - e = __lookup_nat_cache(nm_i, nid); - if (!e) { - e = grab_nat_entry(nm_i, nid); - if (!e) { - write_unlock(&nm_i->nat_tree_lock); - goto retry; - } - nat_set_blkaddr(e, le32_to_cpu(ne->block_addr)); - nat_set_ino(e, le32_to_cpu(ne->ino)); - nat_set_version(e, ne->version); - e->checkpointed = true; + bool need = false; + + f2fs_down_read(&nm_i->nat_tree_lock); + e = __lookup_nat_cache(nm_i, nid, 
false); + if (e) { + if (!get_nat_flag(e, IS_CHECKPOINTED) && + !get_nat_flag(e, HAS_FSYNCED_INODE)) + need = true; } - write_unlock(&nm_i->nat_tree_lock); + f2fs_up_read(&nm_i->nat_tree_lock); + return need; +} + +bool f2fs_is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + struct nat_entry *e; + bool is_cp = true; + + f2fs_down_read(&nm_i->nat_tree_lock); + e = __lookup_nat_cache(nm_i, nid, false); + if (e && !get_nat_flag(e, IS_CHECKPOINTED)) + is_cp = false; + f2fs_up_read(&nm_i->nat_tree_lock); + return is_cp; +} + +bool f2fs_need_inode_block_update(struct f2fs_sb_info *sbi, nid_t ino) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + struct nat_entry *e; + bool need_update = true; + + f2fs_down_read(&nm_i->nat_tree_lock); + e = __lookup_nat_cache(nm_i, ino, false); + if (e && get_nat_flag(e, HAS_LAST_FSYNC) && + (get_nat_flag(e, IS_CHECKPOINTED) || + get_nat_flag(e, HAS_FSYNCED_INODE))) + need_update = false; + f2fs_up_read(&nm_i->nat_tree_lock); + return need_update; +} + +/* must be locked by nat_tree_lock */ +static void cache_nat_entry(struct f2fs_sb_info *sbi, nid_t nid, + struct f2fs_nat_entry *ne) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + struct nat_entry *new, *e; + + /* Let's mitigate lock contention of nat_tree_lock during checkpoint */ + if (f2fs_rwsem_is_locked(&sbi->cp_global_sem)) + return; + + new = __alloc_nat_entry(sbi, nid, false); + if (!new) + return; + + f2fs_down_write(&nm_i->nat_tree_lock); + e = __lookup_nat_cache(nm_i, nid, false); + if (!e) + e = __init_nat_entry(nm_i, new, ne, false, false); + else + f2fs_bug_on(sbi, nat_get_ino(e) != le32_to_cpu(ne->ino) || + nat_get_blkaddr(e) != + le32_to_cpu(ne->block_addr) || + nat_get_version(e) != ne->version); + f2fs_up_write(&nm_i->nat_tree_lock); + if (e != new) + __free_nat_entry(new); } static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni, - block_t new_blkaddr) + block_t new_blkaddr, bool fsync_done) { struct 
f2fs_nm_info *nm_i = NM_I(sbi); struct nat_entry *e; -retry: - write_lock(&nm_i->nat_tree_lock); - e = __lookup_nat_cache(nm_i, ni->nid); + struct nat_entry *new = __alloc_nat_entry(sbi, ni->nid, true); + bool init_dirty = false; + + f2fs_down_write(&nm_i->nat_tree_lock); + e = __lookup_nat_cache(nm_i, ni->nid, true); if (!e) { - e = grab_nat_entry(nm_i, ni->nid); - if (!e) { - write_unlock(&nm_i->nat_tree_lock); - goto retry; - } - e->ni = *ni; - e->checkpointed = true; - BUG_ON(ni->blk_addr == NEW_ADDR); + init_dirty = true; + e = __init_nat_entry(nm_i, new, NULL, true, true); + copy_node_info(&e->ni, ni); + f2fs_bug_on(sbi, ni->blk_addr == NEW_ADDR); } else if (new_blkaddr == NEW_ADDR) { /* * when nid is reallocated, * previous nat entry can be remained in nat cache. * So, reinitialize it with new information. */ - e->ni = *ni; - BUG_ON(ni->blk_addr != NULL_ADDR); + copy_node_info(&e->ni, ni); + f2fs_bug_on(sbi, ni->blk_addr != NULL_ADDR); } - - if (new_blkaddr == NEW_ADDR) - e->checkpointed = false; + /* let's free early to reduce memory consumption */ + if (e != new) + __free_nat_entry(new); /* sanity check */ - BUG_ON(nat_get_blkaddr(e) != ni->blk_addr); - BUG_ON(nat_get_blkaddr(e) == NULL_ADDR && + f2fs_bug_on(sbi, nat_get_blkaddr(e) != ni->blk_addr); + f2fs_bug_on(sbi, nat_get_blkaddr(e) == NULL_ADDR && new_blkaddr == NULL_ADDR); - BUG_ON(nat_get_blkaddr(e) == NEW_ADDR && + f2fs_bug_on(sbi, nat_get_blkaddr(e) == NEW_ADDR && new_blkaddr == NEW_ADDR); - BUG_ON(nat_get_blkaddr(e) != NEW_ADDR && - nat_get_blkaddr(e) != NULL_ADDR && + f2fs_bug_on(sbi, __is_valid_data_blkaddr(nat_get_blkaddr(e)) && new_blkaddr == NEW_ADDR); - /* increament version no as node is removed */ + /* increment version no as node is removed */ if (nat_get_blkaddr(e) != NEW_ADDR && new_blkaddr == NULL_ADDR) { unsigned char version = nat_get_version(e); + nat_set_version(e, inc_node_version(version)); } /* change address */ nat_set_blkaddr(e, new_blkaddr); - __set_nat_cache_dirty(nm_i, e); 
- write_unlock(&nm_i->nat_tree_lock); + if (!__is_valid_data_blkaddr(new_blkaddr)) + set_nat_flag(e, IS_CHECKPOINTED, false); + __set_nat_cache_dirty(nm_i, e, init_dirty); + + /* update fsync_mark if its inode nat entry is still alive */ + if (ni->nid != ni->ino) + e = __lookup_nat_cache(nm_i, ni->ino, false); + if (e) { + if (fsync_done && ni->nid == ni->ino) + set_nat_flag(e, HAS_FSYNCED_INODE, true); + set_nat_flag(e, HAS_LAST_FSYNC, fsync_done); + } + f2fs_up_write(&nm_i->nat_tree_lock); } -static int try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink) +int f2fs_try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink) { struct f2fs_nm_info *nm_i = NM_I(sbi); + int nr = nr_shrink; - if (nm_i->nat_cnt <= NM_WOUT_THRESHOLD) + if (!f2fs_down_write_trylock(&nm_i->nat_tree_lock)) return 0; - write_lock(&nm_i->nat_tree_lock); - while (nr_shrink && !list_empty(&nm_i->nat_entries)) { + spin_lock(&nm_i->nat_list_lock); + while (nr_shrink) { struct nat_entry *ne; + + if (list_empty(&nm_i->nat_entries)) + break; + ne = list_first_entry(&nm_i->nat_entries, struct nat_entry, list); + list_del(&ne->list); + spin_unlock(&nm_i->nat_list_lock); + __del_from_nat_cache(nm_i, ne); nr_shrink--; + + spin_lock(&nm_i->nat_list_lock); } - write_unlock(&nm_i->nat_tree_lock); - return nr_shrink; + spin_unlock(&nm_i->nat_list_lock); + + f2fs_up_write(&nm_i->nat_tree_lock); + return nr - nr_shrink; } -/* - * This function returns always success - */ -void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni) +int f2fs_get_node_info(struct f2fs_sb_info *sbi, nid_t nid, + struct node_info *ni, bool checkpoint_context) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); - struct f2fs_summary_block *sum = curseg->sum_blk; + struct f2fs_journal *journal = curseg->journal; nid_t start_nid = START_NID(nid); struct f2fs_nat_block *nat_blk; - struct page *page = NULL; + struct folio *folio = NULL; struct f2fs_nat_entry ne; 
struct nat_entry *e; + pgoff_t index; int i; + bool need_cache = true; - memset(&ne, 0, sizeof(struct f2fs_nat_entry)); + ni->flag = 0; ni->nid = nid; - +retry: /* Check nat cache */ - read_lock(&nm_i->nat_tree_lock); - e = __lookup_nat_cache(nm_i, nid); + f2fs_down_read(&nm_i->nat_tree_lock); + e = __lookup_nat_cache(nm_i, nid, false); if (e) { ni->ino = nat_get_ino(e); ni->blk_addr = nat_get_blkaddr(e); ni->version = nat_get_version(e); + f2fs_up_read(&nm_i->nat_tree_lock); + if (IS_ENABLED(CONFIG_F2FS_CHECK_FS)) { + need_cache = false; + goto sanity_check; + } + return 0; } - read_unlock(&nm_i->nat_tree_lock); - if (e) - return; - /* Check current segment summary */ - mutex_lock(&curseg->curseg_mutex); - i = lookup_journal_in_cursum(sum, NAT_JOURNAL, nid, 0); + /* + * Check current segment summary by trying to grab journal_rwsem first. + * This sem is on the critical path on the checkpoint requiring the above + * nat_tree_lock. Therefore, we should retry, if we failed to grab here + * while not bothering checkpoint. 
+ */ + if (!f2fs_rwsem_is_locked(&sbi->cp_global_sem) || checkpoint_context) { + down_read(&curseg->journal_rwsem); + } else if (f2fs_rwsem_is_contended(&nm_i->nat_tree_lock) || + !down_read_trylock(&curseg->journal_rwsem)) { + f2fs_up_read(&nm_i->nat_tree_lock); + goto retry; + } + + i = f2fs_lookup_journal_in_cursum(journal, NAT_JOURNAL, nid, 0); if (i >= 0) { - ne = nat_in_journal(sum, i); + ne = nat_in_journal(journal, i); node_info_from_raw_nat(ni, &ne); } - mutex_unlock(&curseg->curseg_mutex); - if (i >= 0) - goto cache; + up_read(&curseg->journal_rwsem); + if (i >= 0) { + f2fs_up_read(&nm_i->nat_tree_lock); + goto sanity_check; + } /* Fill node_info from nat page */ - page = get_current_nat_page(sbi, start_nid); - nat_blk = (struct f2fs_nat_block *)page_address(page); + index = current_nat_addr(sbi, nid); + f2fs_up_read(&nm_i->nat_tree_lock); + + folio = f2fs_get_meta_folio(sbi, index); + if (IS_ERR(folio)) + return PTR_ERR(folio); + + nat_blk = folio_address(folio); ne = nat_blk->entries[nid - start_nid]; node_info_from_raw_nat(ni, &ne); - f2fs_put_page(page, 1); -cache: + f2fs_folio_put(folio, true); +sanity_check: + if (__is_valid_data_blkaddr(ni->blk_addr) && + !f2fs_is_valid_blkaddr(sbi, ni->blk_addr, + DATA_GENERIC_ENHANCE)) { + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_err_ratelimited(sbi, + "f2fs_get_node_info of %pS: inconsistent nat entry, " + "ino:%u, nid:%u, blkaddr:%u, ver:%u, flag:%u", + __builtin_return_address(0), + ni->ino, ni->nid, ni->blk_addr, ni->version, ni->flag); + f2fs_handle_error(sbi, ERROR_INCONSISTENT_NAT); + return -EFSCORRUPTED; + } + /* cache nat entry */ - cache_nat_entry(NM_I(sbi), nid, &ne); + if (need_cache) + cache_nat_entry(sbi, nid, &ne); + return 0; +} + +/* + * readahead MAX_RA_NODE number of node pages. 
+ */ +static void f2fs_ra_node_pages(struct folio *parent, int start, int n) +{ + struct f2fs_sb_info *sbi = F2FS_F_SB(parent); + struct blk_plug plug; + int i, end; + nid_t nid; + + blk_start_plug(&plug); + + /* Then, try readahead for siblings of the desired node */ + end = start + n; + end = min(end, (int)NIDS_PER_BLOCK); + for (i = start; i < end; i++) { + nid = get_nid(parent, i, false); + f2fs_ra_node_page(sbi, nid); + } + + blk_finish_plug(&plug); +} + +pgoff_t f2fs_get_next_page_offset(struct dnode_of_data *dn, pgoff_t pgofs) +{ + const long direct_index = ADDRS_PER_INODE(dn->inode); + const long direct_blks = ADDRS_PER_BLOCK(dn->inode); + const long indirect_blks = ADDRS_PER_BLOCK(dn->inode) * NIDS_PER_BLOCK; + unsigned int skipped_unit = ADDRS_PER_BLOCK(dn->inode); + int cur_level = dn->cur_level; + int max_level = dn->max_level; + pgoff_t base = 0; + + if (!dn->max_level) + return pgofs + 1; + + while (max_level-- > cur_level) + skipped_unit *= NIDS_PER_BLOCK; + + switch (dn->max_level) { + case 3: + base += 2 * indirect_blks; + fallthrough; + case 2: + base += 2 * direct_blks; + fallthrough; + case 1: + base += direct_index; + break; + default: + f2fs_bug_on(F2FS_I_SB(dn->inode), 1); + } + + return ((pgofs - base) / skipped_unit + 1) * skipped_unit + base; } /* * The maximum depth is four. * Offset[0] will have raw inode offset. 
*/ -static int get_node_path(long block, int offset[4], unsigned int noffset[4]) +static int get_node_path(struct inode *inode, long block, + int offset[4], unsigned int noffset[4]) { - const long direct_index = ADDRS_PER_INODE; - const long direct_blks = ADDRS_PER_BLOCK; + const long direct_index = ADDRS_PER_INODE(inode); + const long direct_blks = ADDRS_PER_BLOCK(inode); const long dptrs_per_blk = NIDS_PER_BLOCK; - const long indirect_blks = ADDRS_PER_BLOCK * NIDS_PER_BLOCK; + const long indirect_blks = ADDRS_PER_BLOCK(inode) * NIDS_PER_BLOCK; const long dindirect_blks = indirect_blks * NIDS_PER_BLOCK; int n = 0; int level = 0; @@ -382,172 +777,257 @@ static int get_node_path(long block, int offset[4], unsigned int noffset[4]) level = 3; goto got; } else { - BUG(); + return -E2BIG; } got: return level; } +static struct folio *f2fs_get_node_folio_ra(struct folio *parent, int start); + /* * Caller should call f2fs_put_dnode(dn). - * Also, it should grab and release a mutex by calling mutex_lock_op() and - * mutex_unlock_op() only if ro is not set RDONLY_NODE. - * In the case of RDONLY_NODE, we don't need to care about mutex. + * Also, it should grab and release a rwsem by calling f2fs_lock_op() and + * f2fs_unlock_op() only if mode is set with ALLOC_NODE. 
*/ -int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode) +int f2fs_get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode) { - struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb); - struct page *npage[4]; - struct page *parent; + struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); + struct folio *nfolio[4]; + struct folio *parent = NULL; int offset[4]; unsigned int noffset[4]; nid_t nids[4]; - int level, i; + int level, i = 0; int err = 0; - level = get_node_path(index, offset, noffset); + level = get_node_path(dn->inode, index, offset, noffset); + if (level < 0) + return level; nids[0] = dn->inode->i_ino; - npage[0] = dn->inode_page; - if (!npage[0]) { - npage[0] = get_node_page(sbi, nids[0]); - if (IS_ERR(npage[0])) - return PTR_ERR(npage[0]); + if (!dn->inode_folio) { + nfolio[0] = f2fs_get_inode_folio(sbi, nids[0]); + if (IS_ERR(nfolio[0])) + return PTR_ERR(nfolio[0]); + } else { + nfolio[0] = dn->inode_folio; + } + + /* if inline_data is set, should not report any block indices */ + if (f2fs_has_inline_data(dn->inode) && index) { + err = -ENOENT; + f2fs_folio_put(nfolio[0], true); + goto release_out; } - parent = npage[0]; + + parent = nfolio[0]; if (level != 0) nids[1] = get_nid(parent, offset[0], true); - dn->inode_page = npage[0]; - dn->inode_page_locked = true; + dn->inode_folio = nfolio[0]; + dn->inode_folio_locked = true; /* get indirect or direct nodes */ for (i = 1; i <= level; i++) { bool done = false; + if (nids[i] && nids[i] == dn->inode->i_ino) { + err = -EFSCORRUPTED; + f2fs_err_ratelimited(sbi, + "inode mapping table is corrupted, run fsck to fix it, " + "ino:%lu, nid:%u, level:%d, offset:%d", + dn->inode->i_ino, nids[i], level, offset[level]); + set_sbi_flag(sbi, SBI_NEED_FSCK); + goto release_pages; + } + if (!nids[i] && mode == ALLOC_NODE) { /* alloc new node */ - if (!alloc_nid(sbi, &(nids[i]))) { + if (!f2fs_alloc_nid(sbi, &(nids[i]))) { err = -ENOSPC; goto release_pages; } dn->nid = nids[i]; - npage[i] = 
new_node_page(dn, noffset[i], NULL); - if (IS_ERR(npage[i])) { - alloc_nid_failed(sbi, nids[i]); - err = PTR_ERR(npage[i]); + nfolio[i] = f2fs_new_node_folio(dn, noffset[i]); + if (IS_ERR(nfolio[i])) { + f2fs_alloc_nid_failed(sbi, nids[i]); + err = PTR_ERR(nfolio[i]); goto release_pages; } set_nid(parent, offset[i - 1], nids[i], i == 1); - alloc_nid_done(sbi, nids[i]); + f2fs_alloc_nid_done(sbi, nids[i]); done = true; } else if (mode == LOOKUP_NODE_RA && i == level && level > 1) { - npage[i] = get_node_page_ra(parent, offset[i - 1]); - if (IS_ERR(npage[i])) { - err = PTR_ERR(npage[i]); + nfolio[i] = f2fs_get_node_folio_ra(parent, offset[i - 1]); + if (IS_ERR(nfolio[i])) { + err = PTR_ERR(nfolio[i]); goto release_pages; } done = true; } if (i == 1) { - dn->inode_page_locked = false; - unlock_page(parent); + dn->inode_folio_locked = false; + folio_unlock(parent); } else { - f2fs_put_page(parent, 1); + f2fs_folio_put(parent, true); } if (!done) { - npage[i] = get_node_page(sbi, nids[i]); - if (IS_ERR(npage[i])) { - err = PTR_ERR(npage[i]); - f2fs_put_page(npage[0], 0); + nfolio[i] = f2fs_get_node_folio(sbi, nids[i], + NODE_TYPE_NON_INODE); + if (IS_ERR(nfolio[i])) { + err = PTR_ERR(nfolio[i]); + f2fs_folio_put(nfolio[0], false); goto release_out; } } if (i < level) { - parent = npage[i]; + parent = nfolio[i]; nids[i + 1] = get_nid(parent, offset[i], false); } } dn->nid = nids[level]; dn->ofs_in_node = offset[level]; - dn->node_page = npage[level]; - dn->data_blkaddr = datablock_addr(dn->node_page, dn->ofs_in_node); + dn->node_folio = nfolio[level]; + dn->data_blkaddr = f2fs_data_blkaddr(dn); + + if (is_inode_flag_set(dn->inode, FI_COMPRESSED_FILE) && + f2fs_sb_has_readonly(sbi)) { + unsigned int cluster_size = F2FS_I(dn->inode)->i_cluster_size; + unsigned int ofs_in_node = dn->ofs_in_node; + pgoff_t fofs = index; + unsigned int c_len; + block_t blkaddr; + + /* should align fofs and ofs_in_node to cluster_size */ + if (fofs % cluster_size) { + fofs = round_down(fofs, 
cluster_size); + ofs_in_node = round_down(ofs_in_node, cluster_size); + } + + c_len = f2fs_cluster_blocks_are_contiguous(dn, ofs_in_node); + if (!c_len) + goto out; + + blkaddr = data_blkaddr(dn->inode, dn->node_folio, ofs_in_node); + if (blkaddr == COMPRESS_ADDR) + blkaddr = data_blkaddr(dn->inode, dn->node_folio, + ofs_in_node + 1); + + f2fs_update_read_extent_tree_range_compressed(dn->inode, + fofs, blkaddr, cluster_size, c_len); + } +out: return 0; release_pages: - f2fs_put_page(parent, 1); + f2fs_folio_put(parent, true); if (i > 1) - f2fs_put_page(npage[0], 0); + f2fs_folio_put(nfolio[0], false); release_out: - dn->inode_page = NULL; - dn->node_page = NULL; + dn->inode_folio = NULL; + dn->node_folio = NULL; + if (err == -ENOENT) { + dn->cur_level = i; + dn->max_level = level; + dn->ofs_in_node = offset[level]; + } return err; } -static void truncate_node(struct dnode_of_data *dn) +static int truncate_node(struct dnode_of_data *dn) { - struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb); + struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); struct node_info ni; + int err; + pgoff_t index; - get_node_info(sbi, dn->nid, &ni); - if (dn->inode->i_blocks == 0) { - BUG_ON(ni.blk_addr != NULL_ADDR); - goto invalidate; + err = f2fs_get_node_info(sbi, dn->nid, &ni, false); + if (err) + return err; + + if (ni.blk_addr != NEW_ADDR && + !f2fs_is_valid_blkaddr(sbi, ni.blk_addr, DATA_GENERIC_ENHANCE)) { + f2fs_err_ratelimited(sbi, + "nat entry is corrupted, run fsck to fix it, ino:%u, " + "nid:%u, blkaddr:%u", ni.ino, ni.nid, ni.blk_addr); + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_handle_error(sbi, ERROR_INCONSISTENT_NAT); + return -EFSCORRUPTED; } - BUG_ON(ni.blk_addr == NULL_ADDR); /* Deallocate node address */ - invalidate_blocks(sbi, ni.blk_addr); - dec_valid_node_count(sbi, dn->inode, 1); - set_node_addr(sbi, &ni, NULL_ADDR); + f2fs_invalidate_blocks(sbi, ni.blk_addr, 1); + dec_valid_node_count(sbi, dn->inode, dn->nid == dn->inode->i_ino); + set_node_addr(sbi, &ni, 
NULL_ADDR, false); if (dn->nid == dn->inode->i_ino) { - remove_orphan_inode(sbi, dn->nid); + f2fs_remove_orphan_inode(sbi, dn->nid); dec_valid_inode_count(sbi); - } else { - sync_inode_page(dn); + f2fs_inode_synced(dn->inode); } -invalidate: - clear_node_page_dirty(dn->node_page); - F2FS_SET_SB_DIRT(sbi); - f2fs_put_page(dn->node_page, 1); - dn->node_page = NULL; + clear_node_folio_dirty(dn->node_folio); + set_sbi_flag(sbi, SBI_IS_DIRTY); + + index = dn->node_folio->index; + f2fs_folio_put(dn->node_folio, true); + + invalidate_mapping_pages(NODE_MAPPING(sbi), + index, index); + + dn->node_folio = NULL; trace_f2fs_truncate_node(dn->inode, dn->nid, ni.blk_addr); + + return 0; } static int truncate_dnode(struct dnode_of_data *dn) { - struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb); - struct page *page; + struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); + struct folio *folio; + int err; if (dn->nid == 0) return 1; /* get direct node */ - page = get_node_page(sbi, dn->nid); - if (IS_ERR(page) && PTR_ERR(page) == -ENOENT) + folio = f2fs_get_node_folio(sbi, dn->nid, NODE_TYPE_NON_INODE); + if (PTR_ERR(folio) == -ENOENT) return 1; - else if (IS_ERR(page)) - return PTR_ERR(page); + else if (IS_ERR(folio)) + return PTR_ERR(folio); + + if (IS_INODE(folio) || ino_of_node(folio) != dn->inode->i_ino) { + f2fs_err(sbi, "incorrect node reference, ino: %lu, nid: %u, ino_of_node: %u", + dn->inode->i_ino, dn->nid, ino_of_node(folio)); + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_handle_error(sbi, ERROR_INVALID_NODE_REFERENCE); + f2fs_folio_put(folio, true); + return -EFSCORRUPTED; + } /* Make dnode_of_data for parameter */ - dn->node_page = page; + dn->node_folio = folio; dn->ofs_in_node = 0; - truncate_data_blocks(dn); - truncate_node(dn); + f2fs_truncate_data_blocks_range(dn, ADDRS_PER_BLOCK(dn->inode)); + err = truncate_node(dn); + if (err) { + f2fs_folio_put(folio, true); + return err; + } + return 1; } static int truncate_nodes(struct dnode_of_data *dn, unsigned int nofs, int 
ofs, int depth) { - struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb); struct dnode_of_data rdn = *dn; - struct page *page; + struct folio *folio; struct f2fs_node *rn; nid_t child_nid; unsigned int child_nofs; @@ -559,13 +1039,16 @@ static int truncate_nodes(struct dnode_of_data *dn, unsigned int nofs, trace_f2fs_truncate_nodes_enter(dn->inode, dn->nid, dn->data_blkaddr); - page = get_node_page(sbi, dn->nid); - if (IS_ERR(page)) { - trace_f2fs_truncate_nodes_exit(dn->inode, PTR_ERR(page)); - return PTR_ERR(page); + folio = f2fs_get_node_folio(F2FS_I_SB(dn->inode), dn->nid, + NODE_TYPE_NON_INODE); + if (IS_ERR(folio)) { + trace_f2fs_truncate_nodes_exit(dn->inode, PTR_ERR(folio)); + return PTR_ERR(folio); } - rn = (struct f2fs_node *)page_address(page); + f2fs_ra_node_pages(folio, ofs, NIDS_PER_BLOCK); + + rn = F2FS_NODE(folio); if (depth < 3) { for (i = ofs; i < NIDS_PER_BLOCK; i++, freed++) { child_nid = le32_to_cpu(rn->in.nid[i]); @@ -575,7 +1058,8 @@ static int truncate_nodes(struct dnode_of_data *dn, unsigned int nofs, ret = truncate_dnode(&rdn); if (ret < 0) goto out_err; - set_nid(page, i, 0, false); + if (set_nid(folio, i, 0, false)) + dn->node_changed = true; } } else { child_nofs = nofs + ofs * (NIDS_PER_BLOCK + 1) + 1; @@ -588,7 +1072,8 @@ static int truncate_nodes(struct dnode_of_data *dn, unsigned int nofs, rdn.nid = child_nid; ret = truncate_nodes(&rdn, child_nofs, 0, depth - 1); if (ret == (NIDS_PER_BLOCK + 1)) { - set_nid(page, i, 0, false); + if (set_nid(folio, i, 0, false)) + dn->node_changed = true; child_nofs += ret; } else if (ret < 0 && ret != -ENOENT) { goto out_err; @@ -599,17 +1084,19 @@ static int truncate_nodes(struct dnode_of_data *dn, unsigned int nofs, if (!ofs) { /* remove current indirect node */ - dn->node_page = page; - truncate_node(dn); + dn->node_folio = folio; + ret = truncate_node(dn); + if (ret) + goto out_err; freed++; } else { - f2fs_put_page(page, 1); + f2fs_folio_put(folio, true); } 
trace_f2fs_truncate_nodes_exit(dn->inode, freed); return freed; out_err: - f2fs_put_page(page, 1); + f2fs_folio_put(folio, true); trace_f2fs_truncate_nodes_exit(dn->inode, ret); return ret; } @@ -617,54 +1104,60 @@ out_err: static int truncate_partial_nodes(struct dnode_of_data *dn, struct f2fs_inode *ri, int *offset, int depth) { - struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb); - struct page *pages[2]; + struct folio *folios[2]; nid_t nid[3]; nid_t child_nid; int err = 0; int i; int idx = depth - 2; - nid[0] = le32_to_cpu(ri->i_nid[offset[0] - NODE_DIR1_BLOCK]); + nid[0] = get_nid(dn->inode_folio, offset[0], true); if (!nid[0]) return 0; /* get indirect nodes in the path */ - for (i = 0; i < depth - 1; i++) { - /* refernece count'll be increased */ - pages[i] = get_node_page(sbi, nid[i]); - if (IS_ERR(pages[i])) { - depth = i + 1; - err = PTR_ERR(pages[i]); + for (i = 0; i < idx + 1; i++) { + /* reference count'll be increased */ + folios[i] = f2fs_get_node_folio(F2FS_I_SB(dn->inode), nid[i], + NODE_TYPE_NON_INODE); + if (IS_ERR(folios[i])) { + err = PTR_ERR(folios[i]); + idx = i - 1; goto fail; } - nid[i + 1] = get_nid(pages[i], offset[i + 1], false); + nid[i + 1] = get_nid(folios[i], offset[i + 1], false); } + f2fs_ra_node_pages(folios[idx], offset[idx + 1], NIDS_PER_BLOCK); + /* free direct nodes linked to a partial indirect node */ - for (i = offset[depth - 1]; i < NIDS_PER_BLOCK; i++) { - child_nid = get_nid(pages[idx], i, false); + for (i = offset[idx + 1]; i < NIDS_PER_BLOCK; i++) { + child_nid = get_nid(folios[idx], i, false); if (!child_nid) continue; dn->nid = child_nid; err = truncate_dnode(dn); if (err < 0) goto fail; - set_nid(pages[idx], i, 0, false); + if (set_nid(folios[idx], i, 0, false)) + dn->node_changed = true; } - if (offset[depth - 1] == 0) { - dn->node_page = pages[idx]; + if (offset[idx + 1] == 0) { + dn->node_folio = folios[idx]; dn->nid = nid[idx]; - truncate_node(dn); + err = truncate_node(dn); + if (err) + goto fail; } else { - 
f2fs_put_page(pages[idx], 1); + f2fs_folio_put(folios[idx], true); } offset[idx]++; - offset[depth - 1] = 0; + offset[idx + 1] = 0; + idx--; fail: - for (i = depth - 3; i >= 0; i--) - f2fs_put_page(pages[i], 1); + for (i = idx; i >= 0; i--) + f2fs_folio_put(folios[i], true); trace_f2fs_truncate_partial_nodes(dn->inode, nid, depth, err); @@ -674,31 +1167,41 @@ fail: /* * All the block addresses of data and nodes should be nullified. */ -int truncate_inode_blocks(struct inode *inode, pgoff_t from) +int f2fs_truncate_inode_blocks(struct inode *inode, pgoff_t from) { - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); - struct address_space *node_mapping = sbi->node_inode->i_mapping; + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); int err = 0, cont = 1; int level, offset[4], noffset[4]; unsigned int nofs = 0; - struct f2fs_node *rn; + struct f2fs_inode *ri; struct dnode_of_data dn; - struct page *page; + struct folio *folio; trace_f2fs_truncate_inode_blocks_enter(inode, from); - level = get_node_path(from, offset, noffset); -restart: - page = get_node_page(sbi, inode->i_ino); - if (IS_ERR(page)) { - trace_f2fs_truncate_inode_blocks_exit(inode, PTR_ERR(page)); - return PTR_ERR(page); + level = get_node_path(inode, from, offset, noffset); + if (level <= 0) { + if (!level) { + level = -EFSCORRUPTED; + f2fs_err(sbi, "%s: inode ino=%lx has corrupted node block, from:%lu addrs:%u", + __func__, inode->i_ino, + from, ADDRS_PER_INODE(inode)); + set_sbi_flag(sbi, SBI_NEED_FSCK); + } + trace_f2fs_truncate_inode_blocks_exit(inode, level); + return level; + } + + folio = f2fs_get_inode_folio(sbi, inode->i_ino); + if (IS_ERR(folio)) { + trace_f2fs_truncate_inode_blocks_exit(inode, PTR_ERR(folio)); + return PTR_ERR(folio); } - set_new_dnode(&dn, inode, page, NULL, 0); - unlock_page(page); + set_new_dnode(&dn, inode, folio, NULL, 0); + folio_unlock(folio); - rn = page_address(page); + ri = F2FS_INODE(folio); switch (level) { case 0: case 1: @@ -708,7 +1211,7 @@ restart: nofs = 
noffset[1]; if (!offset[level - 1]) goto skip_partial; - err = truncate_partial_nodes(&dn, &rn->i, offset, level); + err = truncate_partial_nodes(&dn, ri, offset, level); if (err < 0 && err != -ENOENT) goto fail; nofs += 1 + NIDS_PER_BLOCK; @@ -717,7 +1220,7 @@ restart: nofs = 5 + 2 * NIDS_PER_BLOCK; if (!offset[level - 1]) goto skip_partial; - err = truncate_partial_nodes(&dn, &rn->i, offset, level); + err = truncate_partial_nodes(&dn, ri, offset, level); if (err < 0 && err != -ENOENT) goto fail; break; @@ -727,7 +1230,7 @@ restart: skip_partial: while (cont) { - dn.nid = le32_to_cpu(rn->i.i_nid[offset[0] - NODE_DIR1_BLOCK]); + dn.nid = get_nid(folio, offset[0], true); switch (offset[0]) { case NODE_DIR1_BLOCK: case NODE_DIR2_BLOCK: @@ -747,305 +1250,833 @@ skip_partial: default: BUG(); } - if (err < 0 && err != -ENOENT) + if (err == -ENOENT) { + set_sbi_flag(F2FS_F_SB(folio), SBI_NEED_FSCK); + f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR); + f2fs_err_ratelimited(sbi, + "truncate node fail, ino:%lu, nid:%u, " + "offset[0]:%d, offset[1]:%d, nofs:%d", + inode->i_ino, dn.nid, offset[0], + offset[1], nofs); + err = 0; + } + if (err < 0) goto fail; - if (offset[1] == 0 && - rn->i.i_nid[offset[0] - NODE_DIR1_BLOCK]) { - lock_page(page); - if (page->mapping != node_mapping) { - f2fs_put_page(page, 1); - goto restart; - } - wait_on_page_writeback(page); - rn->i.i_nid[offset[0] - NODE_DIR1_BLOCK] = 0; - set_page_dirty(page); - unlock_page(page); + if (offset[1] == 0 && get_nid(folio, offset[0], true)) { + folio_lock(folio); + BUG_ON(!is_node_folio(folio)); + set_nid(folio, offset[0], 0, true); + folio_unlock(folio); } offset[1] = 0; offset[0]++; nofs += err; } fail: - f2fs_put_page(page, 0); + f2fs_folio_put(folio, false); trace_f2fs_truncate_inode_blocks_exit(inode, err); return err > 0 ? 
0 : err; } +/* caller must lock inode page */ +int f2fs_truncate_xattr_node(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + nid_t nid = F2FS_I(inode)->i_xattr_nid; + struct dnode_of_data dn; + struct folio *nfolio; + int err; + + if (!nid) + return 0; + + nfolio = f2fs_get_xnode_folio(sbi, nid); + if (IS_ERR(nfolio)) + return PTR_ERR(nfolio); + + set_new_dnode(&dn, inode, NULL, nfolio, nid); + err = truncate_node(&dn); + if (err) { + f2fs_folio_put(nfolio, true); + return err; + } + + f2fs_i_xnid_write(inode, 0); + + return 0; +} + /* - * Caller should grab and release a mutex by calling mutex_lock_op() and - * mutex_unlock_op(). + * Caller should grab and release a rwsem by calling f2fs_lock_op() and + * f2fs_unlock_op(). */ -int remove_inode_page(struct inode *inode) +int f2fs_remove_inode_page(struct inode *inode) { - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); - struct page *page; - nid_t ino = inode->i_ino; struct dnode_of_data dn; + int err; - page = get_node_page(sbi, ino); - if (IS_ERR(page)) - return PTR_ERR(page); + set_new_dnode(&dn, inode, NULL, NULL, inode->i_ino); + err = f2fs_get_dnode_of_data(&dn, 0, LOOKUP_NODE); + if (err) + return err; - if (F2FS_I(inode)->i_xattr_nid) { - nid_t nid = F2FS_I(inode)->i_xattr_nid; - struct page *npage = get_node_page(sbi, nid); + err = f2fs_truncate_xattr_node(inode); + if (err) { + f2fs_put_dnode(&dn); + return err; + } - if (IS_ERR(npage)) - return PTR_ERR(npage); + /* remove potential inline_data blocks */ + if (!IS_DEVICE_ALIASING(inode) && + (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || + S_ISLNK(inode->i_mode))) + f2fs_truncate_data_blocks_range(&dn, 1); - F2FS_I(inode)->i_xattr_nid = 0; - set_new_dnode(&dn, inode, page, npage, nid); - dn.inode_page_locked = 1; - truncate_node(&dn); + /* 0 is possible, after f2fs_new_inode() has failed */ + if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) { + f2fs_put_dnode(&dn); + return -EIO; } - /* 0 is possible, after f2fs_new_inode() is 
failed */ - BUG_ON(inode->i_blocks != 0 && inode->i_blocks != 1); - set_new_dnode(&dn, inode, page, page, ino); - truncate_node(&dn); + if (unlikely(inode->i_blocks != 0 && inode->i_blocks != 8)) { + f2fs_warn(F2FS_I_SB(inode), + "f2fs_remove_inode_page: inconsistent i_blocks, ino:%lu, iblocks:%llu", + inode->i_ino, (unsigned long long)inode->i_blocks); + set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_FSCK); + } + + /* will put inode & node pages */ + err = truncate_node(&dn); + if (err) { + f2fs_put_dnode(&dn); + return err; + } return 0; } -struct page *new_inode_page(struct inode *inode, const struct qstr *name) +struct folio *f2fs_new_inode_folio(struct inode *inode) { struct dnode_of_data dn; /* allocate inode page for new inode */ set_new_dnode(&dn, inode, NULL, NULL, inode->i_ino); - /* caller should f2fs_put_page(page, 1); */ - return new_node_page(&dn, 0, NULL); + /* caller should f2fs_folio_put(folio, true); */ + return f2fs_new_node_folio(&dn, 0); } -struct page *new_node_page(struct dnode_of_data *dn, - unsigned int ofs, struct page *ipage) +struct folio *f2fs_new_node_folio(struct dnode_of_data *dn, unsigned int ofs) { - struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb); - struct address_space *mapping = sbi->node_inode->i_mapping; - struct node_info old_ni, new_ni; - struct page *page; + struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); + struct node_info new_ni; + struct folio *folio; int err; - if (is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC)) + if (unlikely(is_inode_flag_set(dn->inode, FI_NO_ALLOC))) return ERR_PTR(-EPERM); - page = grab_cache_page(mapping, dn->nid); - if (!page) - return ERR_PTR(-ENOMEM); - - get_node_info(sbi, dn->nid, &old_ni); - - SetPageUptodate(page); - fill_node_footer(page, dn->nid, dn->inode->i_ino, ofs, true); + folio = f2fs_grab_cache_folio(NODE_MAPPING(sbi), dn->nid, false); + if (IS_ERR(folio)) + return folio; - /* Reinitialize old_ni with new node page */ - BUG_ON(old_ni.blk_addr != NULL_ADDR); - new_ni = old_ni; - 
new_ni.ino = dn->inode->i_ino; + if (unlikely((err = inc_valid_node_count(sbi, dn->inode, !ofs)))) + goto fail; - if (!inc_valid_node_count(sbi, dn->inode, 1)) { - err = -ENOSPC; +#ifdef CONFIG_F2FS_CHECK_FS + err = f2fs_get_node_info(sbi, dn->nid, &new_ni, false); + if (err) { + dec_valid_node_count(sbi, dn->inode, !ofs); goto fail; } - set_node_addr(sbi, &new_ni, NEW_ADDR); - set_cold_node(dn->inode, page); + if (unlikely(new_ni.blk_addr != NULL_ADDR)) { + err = -EFSCORRUPTED; + dec_valid_node_count(sbi, dn->inode, !ofs); + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_warn_ratelimited(sbi, + "f2fs_new_node_folio: inconsistent nat entry, " + "ino:%u, nid:%u, blkaddr:%u, ver:%u, flag:%u", + new_ni.ino, new_ni.nid, new_ni.blk_addr, + new_ni.version, new_ni.flag); + f2fs_handle_error(sbi, ERROR_INCONSISTENT_NAT); + goto fail; + } +#endif + new_ni.nid = dn->nid; + new_ni.ino = dn->inode->i_ino; + new_ni.blk_addr = NULL_ADDR; + new_ni.flag = 0; + new_ni.version = 0; + set_node_addr(sbi, &new_ni, NEW_ADDR, false); + + f2fs_folio_wait_writeback(folio, NODE, true, true); + fill_node_footer(folio, dn->nid, dn->inode->i_ino, ofs, true); + set_cold_node(folio, S_ISDIR(dn->inode->i_mode)); + if (!folio_test_uptodate(folio)) + folio_mark_uptodate(folio); + if (folio_mark_dirty(folio)) + dn->node_changed = true; + + if (f2fs_has_xattr_block(ofs)) + f2fs_i_xnid_write(dn->inode, dn->nid); - dn->node_page = page; - if (ipage) - update_inode(dn->inode, ipage); - else - sync_inode_page(dn); - set_page_dirty(page); if (ofs == 0) inc_valid_inode_count(sbi); - - return page; - + return folio; fail: - clear_node_page_dirty(page); - f2fs_put_page(page, 1); + clear_node_folio_dirty(folio); + f2fs_folio_put(folio, true); return ERR_PTR(err); } /* * Caller should do after getting the following values. 
- * 0: f2fs_put_page(page, 0) - * LOCKED_PAGE: f2fs_put_page(page, 1) - * error: nothing + * 0: f2fs_folio_put(folio, false) + * LOCKED_PAGE or error: f2fs_folio_put(folio, true) */ -static int read_node_page(struct page *page, int type) +static int read_node_folio(struct folio *folio, blk_opf_t op_flags) { - struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb); + struct f2fs_sb_info *sbi = F2FS_F_SB(folio); struct node_info ni; + struct f2fs_io_info fio = { + .sbi = sbi, + .type = NODE, + .op = REQ_OP_READ, + .op_flags = op_flags, + .folio = folio, + .encrypted_page = NULL, + }; + int err; - get_node_info(sbi, page->index, &ni); + if (folio_test_uptodate(folio)) { + if (!f2fs_inode_chksum_verify(sbi, folio)) { + folio_clear_uptodate(folio); + return -EFSBADCRC; + } + return LOCKED_PAGE; + } - if (ni.blk_addr == NULL_ADDR) { - f2fs_put_page(page, 1); + err = f2fs_get_node_info(sbi, folio->index, &ni, false); + if (err) + return err; + + /* NEW_ADDR can be seen, after cp_error drops some dirty node pages */ + if (unlikely(ni.blk_addr == NULL_ADDR || ni.blk_addr == NEW_ADDR)) { + folio_clear_uptodate(folio); return -ENOENT; } - if (PageUptodate(page)) - return LOCKED_PAGE; + fio.new_blkaddr = fio.old_blkaddr = ni.blk_addr; - return f2fs_readpage(sbi, page, ni.blk_addr, type); + err = f2fs_submit_page_bio(&fio); + + if (!err) + f2fs_update_iostat(sbi, NULL, FS_NODE_READ_IO, F2FS_BLKSIZE); + + return err; } /* * Readahead a node page */ -void ra_node_page(struct f2fs_sb_info *sbi, nid_t nid) +void f2fs_ra_node_page(struct f2fs_sb_info *sbi, nid_t nid) { - struct address_space *mapping = sbi->node_inode->i_mapping; - struct page *apage; + struct folio *afolio; int err; - apage = find_get_page(mapping, nid); - if (apage && PageUptodate(apage)) { - f2fs_put_page(apage, 0); + if (!nid) + return; + if (f2fs_check_nid_range(sbi, nid)) + return; + + afolio = xa_load(&NODE_MAPPING(sbi)->i_pages, nid); + if (afolio) return; - } - f2fs_put_page(apage, 0); - apage = 
grab_cache_page(mapping, nid); - if (!apage) + afolio = f2fs_grab_cache_folio(NODE_MAPPING(sbi), nid, false); + if (IS_ERR(afolio)) return; - err = read_node_page(apage, READA); - if (err == 0) - f2fs_put_page(apage, 0); - else if (err == LOCKED_PAGE) - f2fs_put_page(apage, 1); - return; + err = read_node_folio(afolio, REQ_RAHEAD); + f2fs_folio_put(afolio, err ? true : false); } -struct page *get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid) +static int sanity_check_node_footer(struct f2fs_sb_info *sbi, + struct folio *folio, pgoff_t nid, + enum node_type ntype) { - struct address_space *mapping = sbi->node_inode->i_mapping; - struct page *page; - int err; -repeat: - page = grab_cache_page(mapping, nid); - if (!page) - return ERR_PTR(-ENOMEM); + if (unlikely(nid != nid_of_node(folio))) + goto out_err; - err = read_node_page(page, READ_SYNC); - if (err < 0) - return ERR_PTR(err); - else if (err == LOCKED_PAGE) - goto got_it; - - lock_page(page); - if (!PageUptodate(page)) { - f2fs_put_page(page, 1); - return ERR_PTR(-EIO); - } - if (page->mapping != mapping) { - f2fs_put_page(page, 1); - goto repeat; + switch (ntype) { + case NODE_TYPE_INODE: + if (!IS_INODE(folio)) + goto out_err; + break; + case NODE_TYPE_XATTR: + if (!f2fs_has_xattr_block(ofs_of_node(folio))) + goto out_err; + break; + case NODE_TYPE_NON_INODE: + if (IS_INODE(folio)) + goto out_err; + break; + default: + break; } -got_it: - BUG_ON(nid != nid_of_node(page)); - mark_page_accessed(page); - return page; + if (time_to_inject(sbi, FAULT_INCONSISTENT_FOOTER)) + goto out_err; + return 0; +out_err: + f2fs_warn(sbi, "inconsistent node block, node_type:%d, nid:%lu, " + "node_footer[nid:%u,ino:%u,ofs:%u,cpver:%llu,blkaddr:%u]", + ntype, nid, nid_of_node(folio), ino_of_node(folio), + ofs_of_node(folio), cpver_of_node(folio), + next_blkaddr_of_node(folio)); + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_handle_error(sbi, ERROR_INCONSISTENT_FOOTER); + return -EFSCORRUPTED; } -/* - * Return a locked page for the 
desired node page. - * And, readahead MAX_RA_NODE number of node pages. - */ -struct page *get_node_page_ra(struct page *parent, int start) +static struct folio *__get_node_folio(struct f2fs_sb_info *sbi, pgoff_t nid, + struct folio *parent, int start, enum node_type ntype) { - struct f2fs_sb_info *sbi = F2FS_SB(parent->mapping->host->i_sb); - struct address_space *mapping = sbi->node_inode->i_mapping; - struct blk_plug plug; - struct page *page; - int err, i, end; - nid_t nid; + struct folio *folio; + int err; - /* First, try getting the desired direct node. */ - nid = get_nid(parent, start, false); if (!nid) return ERR_PTR(-ENOENT); + if (f2fs_check_nid_range(sbi, nid)) + return ERR_PTR(-EINVAL); repeat: - page = grab_cache_page(mapping, nid); - if (!page) - return ERR_PTR(-ENOMEM); + folio = f2fs_grab_cache_folio(NODE_MAPPING(sbi), nid, false); + if (IS_ERR(folio)) + return folio; - err = read_node_page(page, READ_SYNC); + err = read_node_folio(folio, 0); if (err < 0) - return ERR_PTR(err); - else if (err == LOCKED_PAGE) + goto out_put_err; + if (err == LOCKED_PAGE) goto page_hit; - blk_start_plug(&plug); + if (parent) + f2fs_ra_node_pages(parent, start + 1, MAX_RA_NODE); - /* Then, try readahead for siblings of the desired node */ - end = start + MAX_RA_NODE; - end = min(end, NIDS_PER_BLOCK); - for (i = start + 1; i < end; i++) { - nid = get_nid(parent, i, false); - if (!nid) - continue; - ra_node_page(sbi, nid); + folio_lock(folio); + + if (unlikely(!is_node_folio(folio))) { + f2fs_folio_put(folio, true); + goto repeat; } - blk_finish_plug(&plug); + if (unlikely(!folio_test_uptodate(folio))) { + err = -EIO; + goto out_put_err; + } - lock_page(page); - if (page->mapping != mapping) { - f2fs_put_page(page, 1); - goto repeat; + if (!f2fs_inode_chksum_verify(sbi, folio)) { + err = -EFSBADCRC; + goto out_err; } page_hit: - if (!PageUptodate(page)) { - f2fs_put_page(page, 1); - return ERR_PTR(-EIO); + err = sanity_check_node_footer(sbi, folio, nid, ntype); + if 
(!err) + return folio; +out_err: + folio_clear_uptodate(folio); +out_put_err: + /* ENOENT comes from read_node_folio which is not an error. */ + if (err != -ENOENT) + f2fs_handle_page_eio(sbi, folio, NODE); + f2fs_folio_put(folio, true); + return ERR_PTR(err); +} + +struct folio *f2fs_get_node_folio(struct f2fs_sb_info *sbi, pgoff_t nid, + enum node_type node_type) +{ + return __get_node_folio(sbi, nid, NULL, 0, node_type); +} + +struct folio *f2fs_get_inode_folio(struct f2fs_sb_info *sbi, pgoff_t ino) +{ + return __get_node_folio(sbi, ino, NULL, 0, NODE_TYPE_INODE); +} + +struct folio *f2fs_get_xnode_folio(struct f2fs_sb_info *sbi, pgoff_t xnid) +{ + return __get_node_folio(sbi, xnid, NULL, 0, NODE_TYPE_XATTR); +} + +static struct folio *f2fs_get_node_folio_ra(struct folio *parent, int start) +{ + struct f2fs_sb_info *sbi = F2FS_F_SB(parent); + nid_t nid = get_nid(parent, start, false); + + return __get_node_folio(sbi, nid, parent, start, NODE_TYPE_REGULAR); +} + +static void flush_inline_data(struct f2fs_sb_info *sbi, nid_t ino) +{ + struct inode *inode; + struct folio *folio; + int ret; + + /* should flush inline_data before evict_inode */ + inode = ilookup(sbi->sb, ino); + if (!inode) + return; + + folio = f2fs_filemap_get_folio(inode->i_mapping, 0, + FGP_LOCK|FGP_NOWAIT, 0); + if (IS_ERR(folio)) + goto iput_out; + + if (!folio_test_uptodate(folio)) + goto folio_out; + + if (!folio_test_dirty(folio)) + goto folio_out; + + if (!folio_clear_dirty_for_io(folio)) + goto folio_out; + + ret = f2fs_write_inline_data(inode, folio); + inode_dec_dirty_pages(inode); + f2fs_remove_dirty_inode(inode); + if (ret) + folio_mark_dirty(folio); +folio_out: + f2fs_folio_put(folio, true); +iput_out: + iput(inode); +} + +static struct folio *last_fsync_dnode(struct f2fs_sb_info *sbi, nid_t ino) +{ + pgoff_t index; + struct folio_batch fbatch; + struct folio *last_folio = NULL; + int nr_folios; + + folio_batch_init(&fbatch); + index = 0; + + while ((nr_folios = 
filemap_get_folios_tag(NODE_MAPPING(sbi), &index, + (pgoff_t)-1, PAGECACHE_TAG_DIRTY, + &fbatch))) { + int i; + + for (i = 0; i < nr_folios; i++) { + struct folio *folio = fbatch.folios[i]; + + if (unlikely(f2fs_cp_error(sbi))) { + f2fs_folio_put(last_folio, false); + folio_batch_release(&fbatch); + return ERR_PTR(-EIO); + } + + if (!IS_DNODE(folio) || !is_cold_node(folio)) + continue; + if (ino_of_node(folio) != ino) + continue; + + folio_lock(folio); + + if (unlikely(!is_node_folio(folio))) { +continue_unlock: + folio_unlock(folio); + continue; + } + if (ino_of_node(folio) != ino) + goto continue_unlock; + + if (!folio_test_dirty(folio)) { + /* someone wrote it for us */ + goto continue_unlock; + } + + if (last_folio) + f2fs_folio_put(last_folio, false); + + folio_get(folio); + last_folio = folio; + folio_unlock(folio); + } + folio_batch_release(&fbatch); + cond_resched(); } - mark_page_accessed(page); - return page; + return last_folio; } -void sync_inode_page(struct dnode_of_data *dn) +static bool __write_node_folio(struct folio *folio, bool atomic, bool *submitted, + struct writeback_control *wbc, bool do_balance, + enum iostat_type io_type, unsigned int *seq_id) { - if (IS_INODE(dn->node_page) || dn->inode_page == dn->node_page) { - update_inode(dn->inode, dn->node_page); - } else if (dn->inode_page) { - if (!dn->inode_page_locked) - lock_page(dn->inode_page); - update_inode(dn->inode, dn->inode_page); - if (!dn->inode_page_locked) - unlock_page(dn->inode_page); + struct f2fs_sb_info *sbi = F2FS_F_SB(folio); + nid_t nid; + struct node_info ni; + struct f2fs_io_info fio = { + .sbi = sbi, + .ino = ino_of_node(folio), + .type = NODE, + .op = REQ_OP_WRITE, + .op_flags = wbc_to_write_flags(wbc), + .folio = folio, + .encrypted_page = NULL, + .submitted = 0, + .io_type = io_type, + .io_wbc = wbc, + }; + unsigned int seq; + + trace_f2fs_writepage(folio, NODE); + + if (unlikely(f2fs_cp_error(sbi))) { + /* keep node pages in remount-ro mode */ + if 
(F2FS_OPTION(sbi).errors == MOUNT_ERRORS_READONLY) + goto redirty_out; + folio_clear_uptodate(folio); + dec_page_count(sbi, F2FS_DIRTY_NODES); + folio_unlock(folio); + return true; + } + + if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) + goto redirty_out; + + if (!is_sbi_flag_set(sbi, SBI_CP_DISABLED) && + wbc->sync_mode == WB_SYNC_NONE && + IS_DNODE(folio) && is_cold_node(folio)) + goto redirty_out; + + /* get old block addr of this node page */ + nid = nid_of_node(folio); + f2fs_bug_on(sbi, folio->index != nid); + + if (f2fs_get_node_info(sbi, nid, &ni, !do_balance)) + goto redirty_out; + + f2fs_down_read(&sbi->node_write); + + /* This page is already truncated */ + if (unlikely(ni.blk_addr == NULL_ADDR)) { + folio_clear_uptodate(folio); + dec_page_count(sbi, F2FS_DIRTY_NODES); + f2fs_up_read(&sbi->node_write); + folio_unlock(folio); + return true; + } + + if (__is_valid_data_blkaddr(ni.blk_addr) && + !f2fs_is_valid_blkaddr(sbi, ni.blk_addr, + DATA_GENERIC_ENHANCE)) { + f2fs_up_read(&sbi->node_write); + goto redirty_out; + } + + if (atomic && !test_opt(sbi, NOBARRIER)) + fio.op_flags |= REQ_PREFLUSH | REQ_FUA; + + /* should add to global list before clearing PAGECACHE status */ + if (f2fs_in_warm_node_list(sbi, folio)) { + seq = f2fs_add_fsync_node_entry(sbi, folio); + if (seq_id) + *seq_id = seq; + } + + folio_start_writeback(folio); + + fio.old_blkaddr = ni.blk_addr; + f2fs_do_write_node_page(nid, &fio); + set_node_addr(sbi, &ni, fio.new_blkaddr, is_fsync_dnode(folio)); + dec_page_count(sbi, F2FS_DIRTY_NODES); + f2fs_up_read(&sbi->node_write); + + folio_unlock(folio); + + if (unlikely(f2fs_cp_error(sbi))) { + f2fs_submit_merged_write(sbi, NODE); + submitted = NULL; + } + if (submitted) + *submitted = fio.submitted; + + if (do_balance) + f2fs_balance_fs(sbi, false); + return true; + +redirty_out: + folio_redirty_for_writepage(wbc, folio); + folio_unlock(folio); + return false; +} + +int f2fs_move_node_folio(struct folio *node_folio, int gc_type) +{ + int err 
= 0; + + if (gc_type == FG_GC) { + struct writeback_control wbc = { + .sync_mode = WB_SYNC_ALL, + .nr_to_write = 1, + }; + + f2fs_folio_wait_writeback(node_folio, NODE, true, true); + + folio_mark_dirty(node_folio); + + if (!folio_clear_dirty_for_io(node_folio)) { + err = -EAGAIN; + goto out_page; + } + + if (!__write_node_folio(node_folio, false, NULL, + &wbc, false, FS_GC_NODE_IO, NULL)) + err = -EAGAIN; + goto release_page; } else { - update_inode_page(dn->inode); + /* set page dirty and write it */ + if (!folio_test_writeback(node_folio)) + folio_mark_dirty(node_folio); + } +out_page: + folio_unlock(node_folio); +release_page: + f2fs_folio_put(node_folio, false); + return err; +} + +int f2fs_fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, + struct writeback_control *wbc, bool atomic, + unsigned int *seq_id) +{ + pgoff_t index; + struct folio_batch fbatch; + int ret = 0; + struct folio *last_folio = NULL; + bool marked = false; + nid_t ino = inode->i_ino; + int nr_folios; + int nwritten = 0; + + if (atomic) { + last_folio = last_fsync_dnode(sbi, ino); + if (IS_ERR_OR_NULL(last_folio)) + return PTR_ERR_OR_ZERO(last_folio); + } +retry: + folio_batch_init(&fbatch); + index = 0; + + while ((nr_folios = filemap_get_folios_tag(NODE_MAPPING(sbi), &index, + (pgoff_t)-1, PAGECACHE_TAG_DIRTY, + &fbatch))) { + int i; + + for (i = 0; i < nr_folios; i++) { + struct folio *folio = fbatch.folios[i]; + bool submitted = false; + + if (unlikely(f2fs_cp_error(sbi))) { + f2fs_folio_put(last_folio, false); + folio_batch_release(&fbatch); + ret = -EIO; + goto out; + } + + if (!IS_DNODE(folio) || !is_cold_node(folio)) + continue; + if (ino_of_node(folio) != ino) + continue; + + folio_lock(folio); + + if (unlikely(!is_node_folio(folio))) { +continue_unlock: + folio_unlock(folio); + continue; + } + if (ino_of_node(folio) != ino) + goto continue_unlock; + + if (!folio_test_dirty(folio) && folio != last_folio) { + /* someone wrote it for us */ + goto continue_unlock; + } + 
+ f2fs_folio_wait_writeback(folio, NODE, true, true); + + set_fsync_mark(folio, 0); + set_dentry_mark(folio, 0); + + if (!atomic || folio == last_folio) { + set_fsync_mark(folio, 1); + percpu_counter_inc(&sbi->rf_node_block_count); + if (IS_INODE(folio)) { + if (is_inode_flag_set(inode, + FI_DIRTY_INODE)) + f2fs_update_inode(inode, folio); + set_dentry_mark(folio, + f2fs_need_dentry_mark(sbi, ino)); + } + /* may be written by other thread */ + if (!folio_test_dirty(folio)) + folio_mark_dirty(folio); + } + + if (!folio_clear_dirty_for_io(folio)) + goto continue_unlock; + + if (!__write_node_folio(folio, atomic && + folio == last_folio, + &submitted, wbc, true, + FS_NODE_IO, seq_id)) { + f2fs_folio_put(last_folio, false); + folio_batch_release(&fbatch); + ret = -EIO; + goto out; + } + if (submitted) + nwritten++; + + if (folio == last_folio) { + f2fs_folio_put(folio, false); + folio_batch_release(&fbatch); + marked = true; + goto out; + } + } + folio_batch_release(&fbatch); + cond_resched(); + } + if (atomic && !marked) { + f2fs_debug(sbi, "Retry to write fsync mark: ino=%u, idx=%lx", + ino, last_folio->index); + folio_lock(last_folio); + f2fs_folio_wait_writeback(last_folio, NODE, true, true); + folio_mark_dirty(last_folio); + folio_unlock(last_folio); + goto retry; + } +out: + if (nwritten) + f2fs_submit_merged_write_cond(sbi, NULL, NULL, ino, NODE); + return ret; +} + +static int f2fs_match_ino(struct inode *inode, unsigned long ino, void *data) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + bool clean; + + if (inode->i_ino != ino) + return 0; + + if (!is_inode_flag_set(inode, FI_DIRTY_INODE)) + return 0; + + spin_lock(&sbi->inode_lock[DIRTY_META]); + clean = list_empty(&F2FS_I(inode)->gdirty_list); + spin_unlock(&sbi->inode_lock[DIRTY_META]); + + if (clean) + return 0; + + inode = igrab(inode); + if (!inode) + return 0; + return 1; +} + +static bool flush_dirty_inode(struct folio *folio) +{ + struct f2fs_sb_info *sbi = F2FS_F_SB(folio); + struct inode 
*inode; + nid_t ino = ino_of_node(folio); + + inode = find_inode_nowait(sbi->sb, ino, f2fs_match_ino, NULL); + if (!inode) + return false; + + f2fs_update_inode(inode, folio); + folio_unlock(folio); + + iput(inode); + return true; +} + +void f2fs_flush_inline_data(struct f2fs_sb_info *sbi) +{ + pgoff_t index = 0; + struct folio_batch fbatch; + int nr_folios; + + folio_batch_init(&fbatch); + + while ((nr_folios = filemap_get_folios_tag(NODE_MAPPING(sbi), &index, + (pgoff_t)-1, PAGECACHE_TAG_DIRTY, + &fbatch))) { + int i; + + for (i = 0; i < nr_folios; i++) { + struct folio *folio = fbatch.folios[i]; + + if (!IS_INODE(folio)) + continue; + + folio_lock(folio); + + if (unlikely(!is_node_folio(folio))) + goto unlock; + if (!folio_test_dirty(folio)) + goto unlock; + + /* flush inline_data, if it's async context. */ + if (folio_test_f2fs_inline(folio)) { + folio_clear_f2fs_inline(folio); + folio_unlock(folio); + flush_inline_data(sbi, ino_of_node(folio)); + continue; + } +unlock: + folio_unlock(folio); + } + folio_batch_release(&fbatch); + cond_resched(); } } -int sync_node_pages(struct f2fs_sb_info *sbi, nid_t ino, - struct writeback_control *wbc) +int f2fs_sync_node_pages(struct f2fs_sb_info *sbi, + struct writeback_control *wbc, + bool do_balance, enum iostat_type io_type) { - struct address_space *mapping = sbi->node_inode->i_mapping; - pgoff_t index, end; - struct pagevec pvec; - int step = ino ? 
2 : 0; - int nwritten = 0, wrote = 0; + pgoff_t index; + struct folio_batch fbatch; + int step = 0; + int nwritten = 0; + int ret = 0; + int nr_folios, done = 0; - pagevec_init(&pvec, 0); + folio_batch_init(&fbatch); next_step: index = 0; - end = LONG_MAX; - - while (index <= end) { - int i, nr_pages; - nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, - PAGECACHE_TAG_DIRTY, - min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); - if (nr_pages == 0) - break; - for (i = 0; i < nr_pages; i++) { - struct page *page = pvec.pages[i]; + while (!done && (nr_folios = filemap_get_folios_tag(NODE_MAPPING(sbi), + &index, (pgoff_t)-1, PAGECACHE_TAG_DIRTY, + &fbatch))) { + int i; + + for (i = 0; i < nr_folios; i++) { + struct folio *folio = fbatch.folios[i]; + bool submitted = false; + + /* give a priority to WB_SYNC threads */ + if (atomic_read(&sbi->wb_sync_req[NODE]) && + wbc->sync_mode == WB_SYNC_NONE) { + done = 1; + break; + } /* * flushing sequence with step: @@ -1053,58 +2084,68 @@ next_step: * 1. dentry dnodes * 2. file dnodes */ - if (step == 0 && IS_DNODE(page)) + if (step == 0 && IS_DNODE(folio)) continue; - if (step == 1 && (!IS_DNODE(page) || - is_cold_node(page))) + if (step == 1 && (!IS_DNODE(folio) || + is_cold_node(folio))) continue; - if (step == 2 && (!IS_DNODE(page) || - !is_cold_node(page))) + if (step == 2 && (!IS_DNODE(folio) || + !is_cold_node(folio))) continue; - - /* - * If an fsync mode, - * we should not skip writing node pages. 
- */ - if (ino && ino_of_node(page) == ino) - lock_page(page); - else if (!trylock_page(page)) +lock_node: + if (wbc->sync_mode == WB_SYNC_ALL) + folio_lock(folio); + else if (!folio_trylock(folio)) continue; - if (unlikely(page->mapping != mapping)) { + if (unlikely(!is_node_folio(folio))) { continue_unlock: - unlock_page(page); + folio_unlock(folio); continue; } - if (ino && ino_of_node(page) != ino) - goto continue_unlock; - if (!PageDirty(page)) { + if (!folio_test_dirty(folio)) { /* someone wrote it for us */ goto continue_unlock; } - if (!clear_page_dirty_for_io(page)) + /* flush inline_data/inode, if it's async context. */ + if (!do_balance) + goto write_node; + + /* flush inline_data */ + if (folio_test_f2fs_inline(folio)) { + folio_clear_f2fs_inline(folio); + folio_unlock(folio); + flush_inline_data(sbi, ino_of_node(folio)); + goto lock_node; + } + + /* flush dirty inode */ + if (IS_INODE(folio) && flush_dirty_inode(folio)) + goto lock_node; +write_node: + f2fs_folio_wait_writeback(folio, NODE, true, true); + + if (!folio_clear_dirty_for_io(folio)) goto continue_unlock; - /* called by fsync() */ - if (ino && IS_DNODE(page)) { - int mark = !is_checkpointed_node(sbi, ino); - set_fsync_mark(page, 1); - if (IS_INODE(page)) - set_dentry_mark(page, mark); - nwritten++; - } else { - set_fsync_mark(page, 0); - set_dentry_mark(page, 0); + set_fsync_mark(folio, 0); + set_dentry_mark(folio, 0); + + if (!__write_node_folio(folio, false, &submitted, + wbc, do_balance, io_type, NULL)) { + folio_batch_release(&fbatch); + ret = -EIO; + goto out; } - mapping->a_ops->writepage(page, wbc); - wrote++; + if (submitted) + nwritten++; if (--wbc->nr_to_write == 0) break; } - pagevec_release(&pvec); + folio_batch_release(&fbatch); cond_resched(); if (wbc->nr_to_write == 0) { @@ -1114,251 +2155,457 @@ continue_unlock: } if (step < 2) { + if (!is_sbi_flag_set(sbi, SBI_CP_DISABLED) && + wbc->sync_mode == WB_SYNC_NONE && step == 1) + goto out; step++; goto next_step; } +out: + if 
(nwritten) + f2fs_submit_merged_write(sbi, NODE); - if (wrote) - f2fs_submit_bio(sbi, NODE, wbc->sync_mode == WB_SYNC_ALL); - - return nwritten; + if (unlikely(f2fs_cp_error(sbi))) + return -EIO; + return ret; } -static int f2fs_write_node_page(struct page *page, - struct writeback_control *wbc) +int f2fs_wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, + unsigned int seq_id) { - struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb); - nid_t nid; - block_t new_addr; - struct node_info ni; - - wait_on_page_writeback(page); + struct fsync_node_entry *fn; + struct list_head *head = &sbi->fsync_node_list; + unsigned long flags; + unsigned int cur_seq_id = 0; - /* get old block addr of this node page */ - nid = nid_of_node(page); - BUG_ON(page->index != nid); + while (seq_id && cur_seq_id < seq_id) { + struct folio *folio; - get_node_info(sbi, nid, &ni); + spin_lock_irqsave(&sbi->fsync_node_lock, flags); + if (list_empty(head)) { + spin_unlock_irqrestore(&sbi->fsync_node_lock, flags); + break; + } + fn = list_first_entry(head, struct fsync_node_entry, list); + if (fn->seq_id > seq_id) { + spin_unlock_irqrestore(&sbi->fsync_node_lock, flags); + break; + } + cur_seq_id = fn->seq_id; + folio = fn->folio; + folio_get(folio); + spin_unlock_irqrestore(&sbi->fsync_node_lock, flags); - /* This page is already truncated */ - if (ni.blk_addr == NULL_ADDR) { - dec_page_count(sbi, F2FS_DIRTY_NODES); - unlock_page(page); - return 0; - } + f2fs_folio_wait_writeback(folio, NODE, true, false); - if (wbc->for_reclaim) { - dec_page_count(sbi, F2FS_DIRTY_NODES); - wbc->pages_skipped++; - set_page_dirty(page); - return AOP_WRITEPAGE_ACTIVATE; + folio_put(folio); } - mutex_lock(&sbi->node_write); - set_page_writeback(page); - write_node_page(sbi, page, nid, ni.blk_addr, &new_addr); - set_node_addr(sbi, &ni, new_addr); - dec_page_count(sbi, F2FS_DIRTY_NODES); - mutex_unlock(&sbi->node_write); - unlock_page(page); - return 0; + return filemap_check_errors(NODE_MAPPING(sbi)); } 
-/* - * It is very important to gather dirty pages and write at once, so that we can - * submit a big bio without interfering other data writes. - * Be default, 512 pages (2MB), a segment size, is quite reasonable. - */ -#define COLLECT_DIRTY_NODES 512 static int f2fs_write_node_pages(struct address_space *mapping, struct writeback_control *wbc) { - struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb); - long nr_to_write = wbc->nr_to_write; + struct f2fs_sb_info *sbi = F2FS_M_SB(mapping); + struct blk_plug plug; + long diff; - /* First check balancing cached NAT entries */ - if (try_to_free_nats(sbi, NAT_ENTRY_PER_BLOCK)) { - f2fs_sync_fs(sbi->sb, true); - return 0; - } + if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) + goto skip_write; + + /* balancing f2fs's metadata in background */ + f2fs_balance_fs_bg(sbi, true); /* collect a number of dirty node pages and write together */ - if (get_pages(sbi, F2FS_DIRTY_NODES) < COLLECT_DIRTY_NODES) - return 0; + if (wbc->sync_mode != WB_SYNC_ALL && + get_pages(sbi, F2FS_DIRTY_NODES) < + nr_pages_to_skip(sbi, NODE)) + goto skip_write; + + if (wbc->sync_mode == WB_SYNC_ALL) + atomic_inc(&sbi->wb_sync_req[NODE]); + else if (atomic_read(&sbi->wb_sync_req[NODE])) { + /* to avoid potential deadlock */ + if (current->plug) + blk_finish_plug(current->plug); + goto skip_write; + } - /* if mounting is failed, skip writing node pages */ - wbc->nr_to_write = max_hw_blocks(sbi); - sync_node_pages(sbi, 0, wbc); - wbc->nr_to_write = nr_to_write - (max_hw_blocks(sbi) - wbc->nr_to_write); - return 0; -} + trace_f2fs_writepages(mapping->host, wbc, NODE); -static int f2fs_set_node_page_dirty(struct page *page) -{ - struct address_space *mapping = page->mapping; - struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb); + diff = nr_pages_to_write(sbi, NODE, wbc); + blk_start_plug(&plug); + f2fs_sync_node_pages(sbi, wbc, true, FS_NODE_IO); + blk_finish_plug(&plug); + wbc->nr_to_write = max((long)0, wbc->nr_to_write - diff); - 
SetPageUptodate(page); - if (!PageDirty(page)) { - __set_page_dirty_nobuffers(page); - inc_page_count(sbi, F2FS_DIRTY_NODES); - SetPagePrivate(page); - return 1; - } + if (wbc->sync_mode == WB_SYNC_ALL) + atomic_dec(&sbi->wb_sync_req[NODE]); return 0; -} -static void f2fs_invalidate_node_page(struct page *page, unsigned int offset, - unsigned int length) -{ - struct inode *inode = page->mapping->host; - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); - if (PageDirty(page)) - dec_page_count(sbi, F2FS_DIRTY_NODES); - ClearPagePrivate(page); +skip_write: + wbc->pages_skipped += get_pages(sbi, F2FS_DIRTY_NODES); + trace_f2fs_writepages(mapping->host, wbc, NODE); + return 0; } -static int f2fs_release_node_page(struct page *page, gfp_t wait) +static bool f2fs_dirty_node_folio(struct address_space *mapping, + struct folio *folio) { - ClearPagePrivate(page); - return 1; + trace_f2fs_set_page_dirty(folio, NODE); + + if (!folio_test_uptodate(folio)) + folio_mark_uptodate(folio); +#ifdef CONFIG_F2FS_CHECK_FS + if (IS_INODE(folio)) + f2fs_inode_chksum_set(F2FS_M_SB(mapping), folio); +#endif + if (filemap_dirty_folio(mapping, folio)) { + inc_page_count(F2FS_M_SB(mapping), F2FS_DIRTY_NODES); + folio_set_f2fs_reference(folio); + return true; + } + return false; } /* * Structure of the f2fs node operations */ const struct address_space_operations f2fs_node_aops = { - .writepage = f2fs_write_node_page, .writepages = f2fs_write_node_pages, - .set_page_dirty = f2fs_set_node_page_dirty, - .invalidatepage = f2fs_invalidate_node_page, - .releasepage = f2fs_release_node_page, + .dirty_folio = f2fs_dirty_node_folio, + .invalidate_folio = f2fs_invalidate_folio, + .release_folio = f2fs_release_folio, + .migrate_folio = filemap_migrate_folio, }; -static struct free_nid *__lookup_free_nid_list(nid_t n, struct list_head *head) +static struct free_nid *__lookup_free_nid_list(struct f2fs_nm_info *nm_i, + nid_t n) { - struct list_head *this; - struct free_nid *i; - list_for_each(this, head) { - 
i = list_entry(this, struct free_nid, list); - if (i->nid == n) - return i; + return radix_tree_lookup(&nm_i->free_nid_root, n); +} + +static int __insert_free_nid(struct f2fs_sb_info *sbi, + struct free_nid *i) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + int err = radix_tree_insert(&nm_i->free_nid_root, i->nid, i); + + if (err) + return err; + + nm_i->nid_cnt[FREE_NID]++; + list_add_tail(&i->list, &nm_i->free_nid_list); + return 0; +} + +static void __remove_free_nid(struct f2fs_sb_info *sbi, + struct free_nid *i, enum nid_state state) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + + f2fs_bug_on(sbi, state != i->state); + nm_i->nid_cnt[state]--; + if (state == FREE_NID) + list_del(&i->list); + radix_tree_delete(&nm_i->free_nid_root, i->nid); +} + +static void __move_free_nid(struct f2fs_sb_info *sbi, struct free_nid *i, + enum nid_state org_state, enum nid_state dst_state) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + + f2fs_bug_on(sbi, org_state != i->state); + i->state = dst_state; + nm_i->nid_cnt[org_state]--; + nm_i->nid_cnt[dst_state]++; + + switch (dst_state) { + case PREALLOC_NID: + list_del(&i->list); + break; + case FREE_NID: + list_add_tail(&i->list, &nm_i->free_nid_list); + break; + default: + BUG_ON(1); } - return NULL; } -static void __del_from_free_nid_list(struct free_nid *i) +static void update_free_nid_bitmap(struct f2fs_sb_info *sbi, nid_t nid, + bool set, bool build) { - list_del(&i->list); - kmem_cache_free(free_nid_slab, i); + struct f2fs_nm_info *nm_i = NM_I(sbi); + unsigned int nat_ofs = NAT_BLOCK_OFFSET(nid); + unsigned int nid_ofs = nid - START_NID(nid); + + if (!test_bit_le(nat_ofs, nm_i->nat_block_bitmap)) + return; + + if (set) { + if (test_bit_le(nid_ofs, nm_i->free_nid_bitmap[nat_ofs])) + return; + __set_bit_le(nid_ofs, nm_i->free_nid_bitmap[nat_ofs]); + nm_i->free_nid_count[nat_ofs]++; + } else { + if (!test_bit_le(nid_ofs, nm_i->free_nid_bitmap[nat_ofs])) + return; + __clear_bit_le(nid_ofs, nm_i->free_nid_bitmap[nat_ofs]); + if 
(!build) + nm_i->free_nid_count[nat_ofs]--; + } } -static int add_free_nid(struct f2fs_nm_info *nm_i, nid_t nid, bool build) +/* return if the nid is recognized as free */ +static bool add_free_nid(struct f2fs_sb_info *sbi, + nid_t nid, bool build, bool update) { - struct free_nid *i; + struct f2fs_nm_info *nm_i = NM_I(sbi); + struct free_nid *i, *e; struct nat_entry *ne; - bool allocated = false; - - if (nm_i->fcnt > 2 * MAX_FREE_NIDS) - return -1; + int err; + bool ret = false; /* 0 nid should not be used */ - if (nid == 0) - return 0; + if (unlikely(nid == 0)) + return false; - if (!build) - goto retry; + if (unlikely(f2fs_check_nid_range(sbi, nid))) + return false; - /* do not add allocated nids */ - read_lock(&nm_i->nat_tree_lock); - ne = __lookup_nat_cache(nm_i, nid); - if (ne && nat_get_blkaddr(ne) != NULL_ADDR) - allocated = true; - read_unlock(&nm_i->nat_tree_lock); - if (allocated) - return 0; -retry: - i = kmem_cache_alloc(free_nid_slab, GFP_NOFS); - if (!i) { - cond_resched(); - goto retry; - } + i = f2fs_kmem_cache_alloc(free_nid_slab, GFP_NOFS, true, NULL); i->nid = nid; - i->state = NID_NEW; + i->state = FREE_NID; - spin_lock(&nm_i->free_nid_list_lock); - if (__lookup_free_nid_list(nid, &nm_i->free_nid_list)) { - spin_unlock(&nm_i->free_nid_list_lock); - kmem_cache_free(free_nid_slab, i); - return 0; + err = radix_tree_preload(GFP_NOFS | __GFP_NOFAIL); + f2fs_bug_on(sbi, err); + + err = -EINVAL; + + spin_lock(&nm_i->nid_list_lock); + + if (build) { + /* + * Thread A Thread B + * - f2fs_create + * - f2fs_new_inode + * - f2fs_alloc_nid + * - __insert_nid_to_list(PREALLOC_NID) + * - f2fs_balance_fs_bg + * - f2fs_build_free_nids + * - __f2fs_build_free_nids + * - scan_nat_page + * - add_free_nid + * - __lookup_nat_cache + * - f2fs_add_link + * - f2fs_init_inode_metadata + * - f2fs_new_inode_folio + * - f2fs_new_node_folio + * - set_node_addr + * - f2fs_alloc_nid_done + * - __remove_nid_from_list(PREALLOC_NID) + * - __insert_nid_to_list(FREE_NID) + */ + 
ne = __lookup_nat_cache(nm_i, nid, false); + if (ne && (!get_nat_flag(ne, IS_CHECKPOINTED) || + nat_get_blkaddr(ne) != NULL_ADDR)) + goto err_out; + + e = __lookup_free_nid_list(nm_i, nid); + if (e) { + if (e->state == FREE_NID) + ret = true; + goto err_out; + } } - list_add_tail(&i->list, &nm_i->free_nid_list); - nm_i->fcnt++; - spin_unlock(&nm_i->free_nid_list_lock); - return 1; + ret = true; + err = __insert_free_nid(sbi, i); +err_out: + if (update) { + update_free_nid_bitmap(sbi, nid, ret, build); + if (!build) + nm_i->available_nids++; + } + spin_unlock(&nm_i->nid_list_lock); + radix_tree_preload_end(); + + if (err) + kmem_cache_free(free_nid_slab, i); + return ret; } -static void remove_free_nid(struct f2fs_nm_info *nm_i, nid_t nid) +static void remove_free_nid(struct f2fs_sb_info *sbi, nid_t nid) { + struct f2fs_nm_info *nm_i = NM_I(sbi); struct free_nid *i; - spin_lock(&nm_i->free_nid_list_lock); - i = __lookup_free_nid_list(nid, &nm_i->free_nid_list); - if (i && i->state == NID_NEW) { - __del_from_free_nid_list(i); - nm_i->fcnt--; + bool need_free = false; + + spin_lock(&nm_i->nid_list_lock); + i = __lookup_free_nid_list(nm_i, nid); + if (i && i->state == FREE_NID) { + __remove_free_nid(sbi, i, FREE_NID); + need_free = true; } - spin_unlock(&nm_i->free_nid_list_lock); + spin_unlock(&nm_i->nid_list_lock); + + if (need_free) + kmem_cache_free(free_nid_slab, i); } -static void scan_nat_page(struct f2fs_nm_info *nm_i, - struct page *nat_page, nid_t start_nid) +static int scan_nat_page(struct f2fs_sb_info *sbi, + struct f2fs_nat_block *nat_blk, nid_t start_nid) { - struct f2fs_nat_block *nat_blk = page_address(nat_page); + struct f2fs_nm_info *nm_i = NM_I(sbi); block_t blk_addr; + unsigned int nat_ofs = NAT_BLOCK_OFFSET(start_nid); int i; + __set_bit_le(nat_ofs, nm_i->nat_block_bitmap); + i = start_nid % NAT_ENTRY_PER_BLOCK; for (; i < NAT_ENTRY_PER_BLOCK; i++, start_nid++) { - - if (start_nid >= nm_i->max_nid) + if (unlikely(start_nid >= nm_i->max_nid)) break; 
blk_addr = le32_to_cpu(nat_blk->entries[i].block_addr); - BUG_ON(blk_addr == NEW_ADDR); + + if (blk_addr == NEW_ADDR) + return -EFSCORRUPTED; + if (blk_addr == NULL_ADDR) { - if (add_free_nid(nm_i, start_nid, true) < 0) + add_free_nid(sbi, start_nid, true, true); + } else { + spin_lock(&NM_I(sbi)->nid_list_lock); + update_free_nid_bitmap(sbi, start_nid, false, true); + spin_unlock(&NM_I(sbi)->nid_list_lock); + } + } + + return 0; +} + +static void scan_curseg_cache(struct f2fs_sb_info *sbi) +{ + struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); + struct f2fs_journal *journal = curseg->journal; + int i; + + down_read(&curseg->journal_rwsem); + for (i = 0; i < nats_in_cursum(journal); i++) { + block_t addr; + nid_t nid; + + addr = le32_to_cpu(nat_in_journal(journal, i).block_addr); + nid = le32_to_cpu(nid_in_journal(journal, i)); + if (addr == NULL_ADDR) + add_free_nid(sbi, nid, true, false); + else + remove_free_nid(sbi, nid); + } + up_read(&curseg->journal_rwsem); +} + +static void scan_free_nid_bits(struct f2fs_sb_info *sbi) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + unsigned int i, idx; + nid_t nid; + + f2fs_down_read(&nm_i->nat_tree_lock); + + for (i = 0; i < nm_i->nat_blocks; i++) { + if (!test_bit_le(i, nm_i->nat_block_bitmap)) + continue; + if (!nm_i->free_nid_count[i]) + continue; + for (idx = 0; idx < NAT_ENTRY_PER_BLOCK; idx++) { + idx = find_next_bit_le(nm_i->free_nid_bitmap[i], + NAT_ENTRY_PER_BLOCK, idx); + if (idx >= NAT_ENTRY_PER_BLOCK) break; + + nid = i * NAT_ENTRY_PER_BLOCK + idx; + add_free_nid(sbi, nid, true, false); + + if (nm_i->nid_cnt[FREE_NID] >= MAX_FREE_NIDS) + goto out; } } +out: + scan_curseg_cache(sbi); + + f2fs_up_read(&nm_i->nat_tree_lock); } -static void build_free_nids(struct f2fs_sb_info *sbi) +static int __f2fs_build_free_nids(struct f2fs_sb_info *sbi, + bool sync, bool mount) { struct f2fs_nm_info *nm_i = NM_I(sbi); - struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); - struct f2fs_summary_block *sum = 
curseg->sum_blk; - int i = 0; + int i = 0, ret; nid_t nid = nm_i->next_scan_nid; + if (unlikely(nid >= nm_i->max_nid)) + nid = 0; + + if (unlikely(nid % NAT_ENTRY_PER_BLOCK)) + nid = NAT_BLOCK_OFFSET(nid) * NAT_ENTRY_PER_BLOCK; + /* Enough entries */ - if (nm_i->fcnt > NAT_ENTRY_PER_BLOCK) - return; + if (nm_i->nid_cnt[FREE_NID] >= NAT_ENTRY_PER_BLOCK) + return 0; + + if (!sync && !f2fs_available_free_memory(sbi, FREE_NIDS)) + return 0; + + if (!mount) { + /* try to find free nids in free_nid_bitmap */ + scan_free_nid_bits(sbi); + + if (nm_i->nid_cnt[FREE_NID] >= NAT_ENTRY_PER_BLOCK) + return 0; + } /* readahead nat pages to be scanned */ - ra_nat_pages(sbi, nid); + f2fs_ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), FREE_NID_PAGES, + META_NAT, true); + + f2fs_down_read(&nm_i->nat_tree_lock); while (1) { - struct page *page = get_current_nat_page(sbi, nid); + if (!test_bit_le(NAT_BLOCK_OFFSET(nid), + nm_i->nat_block_bitmap)) { + struct folio *folio = get_current_nat_folio(sbi, nid); + + if (IS_ERR(folio)) { + ret = PTR_ERR(folio); + } else { + ret = scan_nat_page(sbi, folio_address(folio), + nid); + f2fs_folio_put(folio, true); + } - scan_nat_page(nm_i, page, nid); - f2fs_put_page(page, 1); + if (ret) { + f2fs_up_read(&nm_i->nat_tree_lock); + + if (ret == -EFSCORRUPTED) { + f2fs_err(sbi, "NAT is corrupt, run fsck to fix it"); + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_handle_error(sbi, + ERROR_INCONSISTENT_NAT); + } + + return ret; + } + } nid += (NAT_ENTRY_PER_BLOCK - (nid % NAT_ENTRY_PER_BLOCK)); - if (nid >= nm_i->max_nid) + if (unlikely(nid >= nm_i->max_nid)) nid = 0; - if (i++ == FREE_NID_PAGES) + if (++i >= FREE_NID_PAGES) break; } @@ -1366,16 +2613,25 @@ static void build_free_nids(struct f2fs_sb_info *sbi) nm_i->next_scan_nid = nid; /* find free nids from current sum_pages */ - mutex_lock(&curseg->curseg_mutex); - for (i = 0; i < nats_in_cursum(sum); i++) { - block_t addr = le32_to_cpu(nat_in_journal(sum, i).block_addr); - nid = le32_to_cpu(nid_in_journal(sum, 
i)); - if (addr == NULL_ADDR) - add_free_nid(nm_i, nid, true); - else - remove_free_nid(nm_i, nid); - } - mutex_unlock(&curseg->curseg_mutex); + scan_curseg_cache(sbi); + + f2fs_up_read(&nm_i->nat_tree_lock); + + f2fs_ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nm_i->next_scan_nid), + nm_i->ra_nid_pages, META_NAT, false); + + return 0; +} + +int f2fs_build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount) +{ + int ret; + + mutex_lock(&NM_I(sbi)->build_lock); + ret = __f2fs_build_free_nids(sbi, sync, mount); + mutex_unlock(&NM_I(sbi)->build_lock); + + return ret; } /* @@ -1383,306 +2639,643 @@ static void build_free_nids(struct f2fs_sb_info *sbi) * from second parameter of this function. * The returned nid could be used ino as well as nid when inode is created. */ -bool alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid) +bool f2fs_alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct free_nid *i = NULL; - struct list_head *this; retry: - if (sbi->total_valid_node_count + 1 >= nm_i->max_nid) + if (time_to_inject(sbi, FAULT_ALLOC_NID)) return false; - spin_lock(&nm_i->free_nid_list_lock); + spin_lock(&nm_i->nid_list_lock); - /* We should not use stale free nids created by build_free_nids */ - if (nm_i->fcnt && !sbi->on_build_free_nids) { - BUG_ON(list_empty(&nm_i->free_nid_list)); - list_for_each(this, &nm_i->free_nid_list) { - i = list_entry(this, struct free_nid, list); - if (i->state == NID_NEW) - break; + if (unlikely(nm_i->available_nids == 0)) { + spin_unlock(&nm_i->nid_list_lock); + return false; + } + + /* We should not use stale free nids created by f2fs_build_free_nids */ + if (nm_i->nid_cnt[FREE_NID] && !on_f2fs_build_free_nids(nm_i)) { + f2fs_bug_on(sbi, list_empty(&nm_i->free_nid_list)); + i = list_first_entry(&nm_i->free_nid_list, + struct free_nid, list); + + if (unlikely(is_invalid_nid(sbi, i->nid))) { + spin_unlock(&nm_i->nid_list_lock); + f2fs_err(sbi, "Corrupted nid %u in free_nid_list", + i->nid); + 
f2fs_stop_checkpoint(sbi, false, + STOP_CP_REASON_CORRUPTED_NID); + return false; } - BUG_ON(i->state != NID_NEW); *nid = i->nid; - i->state = NID_ALLOC; - nm_i->fcnt--; - spin_unlock(&nm_i->free_nid_list_lock); + + __move_free_nid(sbi, i, FREE_NID, PREALLOC_NID); + nm_i->available_nids--; + + update_free_nid_bitmap(sbi, *nid, false, false); + + spin_unlock(&nm_i->nid_list_lock); return true; } - spin_unlock(&nm_i->free_nid_list_lock); + spin_unlock(&nm_i->nid_list_lock); /* Let's scan nat pages and its caches to get free nids */ - mutex_lock(&nm_i->build_lock); - sbi->on_build_free_nids = 1; - build_free_nids(sbi); - sbi->on_build_free_nids = 0; - mutex_unlock(&nm_i->build_lock); - goto retry; + if (!f2fs_build_free_nids(sbi, true, false)) + goto retry; + return false; } /* - * alloc_nid() should be called prior to this function. + * f2fs_alloc_nid() should be called prior to this function. */ -void alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid) +void f2fs_alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct free_nid *i; - spin_lock(&nm_i->free_nid_list_lock); - i = __lookup_free_nid_list(nid, &nm_i->free_nid_list); - BUG_ON(!i || i->state != NID_ALLOC); - __del_from_free_nid_list(i); - spin_unlock(&nm_i->free_nid_list_lock); + spin_lock(&nm_i->nid_list_lock); + i = __lookup_free_nid_list(nm_i, nid); + f2fs_bug_on(sbi, !i); + __remove_free_nid(sbi, i, PREALLOC_NID); + spin_unlock(&nm_i->nid_list_lock); + + kmem_cache_free(free_nid_slab, i); } /* - * alloc_nid() should be called prior to this function. + * f2fs_alloc_nid() should be called prior to this function. 
*/ -void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid) +void f2fs_alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct free_nid *i; + bool need_free = false; - spin_lock(&nm_i->free_nid_list_lock); - i = __lookup_free_nid_list(nid, &nm_i->free_nid_list); - BUG_ON(!i || i->state != NID_ALLOC); - if (nm_i->fcnt > 2 * MAX_FREE_NIDS) { - __del_from_free_nid_list(i); + if (!nid) + return; + + spin_lock(&nm_i->nid_list_lock); + i = __lookup_free_nid_list(nm_i, nid); + f2fs_bug_on(sbi, !i); + + if (!f2fs_available_free_memory(sbi, FREE_NIDS)) { + __remove_free_nid(sbi, i, PREALLOC_NID); + need_free = true; } else { - i->state = NID_NEW; - nm_i->fcnt++; + __move_free_nid(sbi, i, PREALLOC_NID, FREE_NID); } - spin_unlock(&nm_i->free_nid_list_lock); + + nm_i->available_nids++; + + update_free_nid_bitmap(sbi, nid, true, false); + + spin_unlock(&nm_i->nid_list_lock); + + if (need_free) + kmem_cache_free(free_nid_slab, i); } -void recover_node_page(struct f2fs_sb_info *sbi, struct page *page, - struct f2fs_summary *sum, struct node_info *ni, - block_t new_blkaddr) +int f2fs_try_to_free_nids(struct f2fs_sb_info *sbi, int nr_shrink) { - rewrite_node_page(sbi, page, sum, ni->blk_addr, new_blkaddr); - set_node_addr(sbi, ni, new_blkaddr); - clear_node_page_dirty(page); + struct f2fs_nm_info *nm_i = NM_I(sbi); + int nr = nr_shrink; + + if (nm_i->nid_cnt[FREE_NID] <= MAX_FREE_NIDS) + return 0; + + if (!mutex_trylock(&nm_i->build_lock)) + return 0; + + while (nr_shrink && nm_i->nid_cnt[FREE_NID] > MAX_FREE_NIDS) { + struct free_nid *i, *next; + unsigned int batch = SHRINK_NID_BATCH_SIZE; + + spin_lock(&nm_i->nid_list_lock); + list_for_each_entry_safe(i, next, &nm_i->free_nid_list, list) { + if (!nr_shrink || !batch || + nm_i->nid_cnt[FREE_NID] <= MAX_FREE_NIDS) + break; + __remove_free_nid(sbi, i, FREE_NID); + kmem_cache_free(free_nid_slab, i); + nr_shrink--; + batch--; + } + spin_unlock(&nm_i->nid_list_lock); + } + + 
mutex_unlock(&nm_i->build_lock); + + return nr - nr_shrink; } -int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page) +int f2fs_recover_inline_xattr(struct inode *inode, struct folio *folio) { - struct address_space *mapping = sbi->node_inode->i_mapping; - struct f2fs_node *src, *dst; - nid_t ino = ino_of_node(page); - struct node_info old_ni, new_ni; - struct page *ipage; + void *src_addr, *dst_addr; + size_t inline_size; + struct folio *ifolio; + struct f2fs_inode *ri; + + ifolio = f2fs_get_inode_folio(F2FS_I_SB(inode), inode->i_ino); + if (IS_ERR(ifolio)) + return PTR_ERR(ifolio); + + ri = F2FS_INODE(folio); + if (ri->i_inline & F2FS_INLINE_XATTR) { + if (!f2fs_has_inline_xattr(inode)) { + set_inode_flag(inode, FI_INLINE_XATTR); + stat_inc_inline_xattr(inode); + } + } else { + if (f2fs_has_inline_xattr(inode)) { + stat_dec_inline_xattr(inode); + clear_inode_flag(inode, FI_INLINE_XATTR); + } + goto update_inode; + } - ipage = grab_cache_page(mapping, ino); - if (!ipage) - return -ENOMEM; + dst_addr = inline_xattr_addr(inode, ifolio); + src_addr = inline_xattr_addr(inode, folio); + inline_size = inline_xattr_size(inode); - /* Should not use this inode from free nid list */ - remove_free_nid(NM_I(sbi), ino); + f2fs_folio_wait_writeback(ifolio, NODE, true, true); + memcpy(dst_addr, src_addr, inline_size); +update_inode: + f2fs_update_inode(inode, ifolio); + f2fs_folio_put(ifolio, true); + return 0; +} - get_node_info(sbi, ino, &old_ni); - SetPageUptodate(ipage); - fill_node_footer(ipage, ino, ino, 0, true); +int f2fs_recover_xattr_data(struct inode *inode, struct folio *folio) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + nid_t prev_xnid = F2FS_I(inode)->i_xattr_nid; + nid_t new_xnid; + struct dnode_of_data dn; + struct node_info ni; + struct folio *xfolio; + int err; - src = (struct f2fs_node *)page_address(page); - dst = (struct f2fs_node *)page_address(ipage); + if (!prev_xnid) + goto recover_xnid; - memcpy(dst, src, (unsigned 
long)&src->i.i_ext - (unsigned long)&src->i); - dst->i.i_size = 0; - dst->i.i_blocks = cpu_to_le64(1); - dst->i.i_links = cpu_to_le32(1); - dst->i.i_xattr_nid = 0; + /* 1: invalidate the previous xattr nid */ + err = f2fs_get_node_info(sbi, prev_xnid, &ni, false); + if (err) + return err; + + f2fs_invalidate_blocks(sbi, ni.blk_addr, 1); + dec_valid_node_count(sbi, inode, false); + set_node_addr(sbi, &ni, NULL_ADDR, false); + +recover_xnid: + /* 2: update xattr nid in inode */ + if (!f2fs_alloc_nid(sbi, &new_xnid)) + return -ENOSPC; + + set_new_dnode(&dn, inode, NULL, NULL, new_xnid); + xfolio = f2fs_new_node_folio(&dn, XATTR_NODE_OFFSET); + if (IS_ERR(xfolio)) { + f2fs_alloc_nid_failed(sbi, new_xnid); + return PTR_ERR(xfolio); + } + + f2fs_alloc_nid_done(sbi, new_xnid); + f2fs_update_inode_page(inode); + + /* 3: update and set xattr node page dirty */ + if (folio) { + memcpy(F2FS_NODE(xfolio), F2FS_NODE(folio), + VALID_XATTR_BLOCK_SIZE); + folio_mark_dirty(xfolio); + } + f2fs_folio_put(xfolio, true); + + return 0; +} + +int f2fs_recover_inode_page(struct f2fs_sb_info *sbi, struct folio *folio) +{ + struct f2fs_inode *src, *dst; + nid_t ino = ino_of_node(folio); + struct node_info old_ni, new_ni; + struct folio *ifolio; + int err; + + err = f2fs_get_node_info(sbi, ino, &old_ni, false); + if (err) + return err; + + if (unlikely(old_ni.blk_addr != NULL_ADDR)) + return -EINVAL; +retry: + ifolio = f2fs_grab_cache_folio(NODE_MAPPING(sbi), ino, false); + if (IS_ERR(ifolio)) { + memalloc_retry_wait(GFP_NOFS); + goto retry; + } + + /* Should not use this inode from free nid list */ + remove_free_nid(sbi, ino); + + if (!folio_test_uptodate(ifolio)) + folio_mark_uptodate(ifolio); + fill_node_footer(ifolio, ino, ino, 0, true); + set_cold_node(ifolio, false); + + src = F2FS_INODE(folio); + dst = F2FS_INODE(ifolio); + + memcpy(dst, src, offsetof(struct f2fs_inode, i_ext)); + dst->i_size = 0; + dst->i_blocks = cpu_to_le64(1); + dst->i_links = cpu_to_le32(1); + dst->i_xattr_nid = 
0; + dst->i_inline = src->i_inline & (F2FS_INLINE_XATTR | F2FS_EXTRA_ATTR); + if (dst->i_inline & F2FS_EXTRA_ATTR) { + dst->i_extra_isize = src->i_extra_isize; + + if (f2fs_sb_has_flexible_inline_xattr(sbi) && + F2FS_FITS_IN_INODE(src, le16_to_cpu(src->i_extra_isize), + i_inline_xattr_size)) + dst->i_inline_xattr_size = src->i_inline_xattr_size; + + if (f2fs_sb_has_project_quota(sbi) && + F2FS_FITS_IN_INODE(src, le16_to_cpu(src->i_extra_isize), + i_projid)) + dst->i_projid = src->i_projid; + + if (f2fs_sb_has_inode_crtime(sbi) && + F2FS_FITS_IN_INODE(src, le16_to_cpu(src->i_extra_isize), + i_crtime_nsec)) { + dst->i_crtime = src->i_crtime; + dst->i_crtime_nsec = src->i_crtime_nsec; + } + } new_ni = old_ni; new_ni.ino = ino; - if (!inc_valid_node_count(sbi, NULL, 1)) + if (unlikely(inc_valid_node_count(sbi, NULL, true))) WARN_ON(1); - set_node_addr(sbi, &new_ni, NEW_ADDR); + set_node_addr(sbi, &new_ni, NEW_ADDR, false); inc_valid_inode_count(sbi); - f2fs_put_page(ipage, 1); + folio_mark_dirty(ifolio); + f2fs_folio_put(ifolio, true); return 0; } -int restore_node_summary(struct f2fs_sb_info *sbi, +int f2fs_restore_node_summary(struct f2fs_sb_info *sbi, unsigned int segno, struct f2fs_summary_block *sum) { struct f2fs_node *rn; struct f2fs_summary *sum_entry; - struct page *page; block_t addr; - int i, last_offset; - - /* alloc temporal page for read node */ - page = alloc_page(GFP_NOFS | __GFP_ZERO); - if (IS_ERR(page)) - return PTR_ERR(page); - lock_page(page); + int i, idx, last_offset, nrpages; /* scan the node segment */ - last_offset = sbi->blocks_per_seg; + last_offset = BLKS_PER_SEG(sbi); addr = START_BLOCK(sbi, segno); sum_entry = &sum->entries[0]; - for (i = 0; i < last_offset; i++, sum_entry++) { - /* - * In order to read next node page, - * we must clear PageUptodate flag. 
- */ - ClearPageUptodate(page); + for (i = 0; i < last_offset; i += nrpages, addr += nrpages) { + nrpages = bio_max_segs(last_offset - i); - if (f2fs_readpage(sbi, page, addr, READ_SYNC)) - goto out; + /* readahead node pages */ + f2fs_ra_meta_pages(sbi, addr, nrpages, META_POR, true); - lock_page(page); - rn = (struct f2fs_node *)page_address(page); - sum_entry->nid = rn->footer.nid; - sum_entry->version = 0; - sum_entry->ofs_in_node = 0; - addr++; + for (idx = addr; idx < addr + nrpages; idx++) { + struct folio *folio = f2fs_get_tmp_folio(sbi, idx); + + if (IS_ERR(folio)) + return PTR_ERR(folio); + + rn = F2FS_NODE(folio); + sum_entry->nid = rn->footer.nid; + sum_entry->version = 0; + sum_entry->ofs_in_node = 0; + sum_entry++; + f2fs_folio_put(folio, true); + } + + invalidate_mapping_pages(META_MAPPING(sbi), addr, + addr + nrpages); } - unlock_page(page); -out: - __free_pages(page, 0); return 0; } -static bool flush_nats_in_journal(struct f2fs_sb_info *sbi) +static void remove_nats_in_journal(struct f2fs_sb_info *sbi) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); - struct f2fs_summary_block *sum = curseg->sum_blk; + struct f2fs_journal *journal = curseg->journal; int i; + bool init_dirty; - mutex_lock(&curseg->curseg_mutex); - - if (nats_in_cursum(sum) < NAT_JOURNAL_ENTRIES) { - mutex_unlock(&curseg->curseg_mutex); - return false; - } - - for (i = 0; i < nats_in_cursum(sum); i++) { + down_write(&curseg->journal_rwsem); + for (i = 0; i < nats_in_cursum(journal); i++) { struct nat_entry *ne; struct f2fs_nat_entry raw_ne; - nid_t nid = le32_to_cpu(nid_in_journal(sum, i)); + nid_t nid = le32_to_cpu(nid_in_journal(journal, i)); - raw_ne = nat_in_journal(sum, i); -retry: - write_lock(&nm_i->nat_tree_lock); - ne = __lookup_nat_cache(nm_i, nid); - if (ne) { - __set_nat_cache_dirty(nm_i, ne); - write_unlock(&nm_i->nat_tree_lock); + if (f2fs_check_nid_range(sbi, nid)) continue; - } - ne = grab_nat_entry(nm_i, nid); + 
+ init_dirty = false; + + raw_ne = nat_in_journal(journal, i); + + ne = __lookup_nat_cache(nm_i, nid, true); if (!ne) { - write_unlock(&nm_i->nat_tree_lock); - goto retry; + init_dirty = true; + ne = __alloc_nat_entry(sbi, nid, true); + __init_nat_entry(nm_i, ne, &raw_ne, true, true); } - nat_set_blkaddr(ne, le32_to_cpu(raw_ne.block_addr)); - nat_set_ino(ne, le32_to_cpu(raw_ne.ino)); - nat_set_version(ne, raw_ne.version); - __set_nat_cache_dirty(nm_i, ne); - write_unlock(&nm_i->nat_tree_lock); - } - update_nats_in_cursum(sum, -i); - mutex_unlock(&curseg->curseg_mutex); - return true; + + /* + * if a free nat in journal has not been used after last + * checkpoint, we should remove it from available nids, + * since later we will add it again. + */ + if (!get_nat_flag(ne, IS_DIRTY) && + le32_to_cpu(raw_ne.block_addr) == NULL_ADDR) { + spin_lock(&nm_i->nid_list_lock); + nm_i->available_nids--; + spin_unlock(&nm_i->nid_list_lock); + } + + __set_nat_cache_dirty(nm_i, ne, init_dirty); + } + update_nats_in_cursum(journal, -i); + up_write(&curseg->journal_rwsem); +} + +static void __adjust_nat_entry_set(struct nat_entry_set *nes, + struct list_head *head, int max) +{ + struct nat_entry_set *cur; + + if (nes->entry_cnt >= max) + goto add_out; + + list_for_each_entry(cur, head, set_list) { + if (cur->entry_cnt >= nes->entry_cnt) { + list_add(&nes->set_list, cur->set_list.prev); + return; + } + } +add_out: + list_add_tail(&nes->set_list, head); +} + +static void __update_nat_bits(struct f2fs_sb_info *sbi, nid_t start_nid, + const struct f2fs_nat_block *nat_blk) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + unsigned int nat_index = start_nid / NAT_ENTRY_PER_BLOCK; + int valid = 0; + int i = 0; + + if (!enabled_nat_bits(sbi, NULL)) + return; + + if (nat_index == 0) { + valid = 1; + i = 1; + } + for (; i < NAT_ENTRY_PER_BLOCK; i++) { + if (le32_to_cpu(nat_blk->entries[i].block_addr) != NULL_ADDR) + valid++; + } + if (valid == 0) { + __set_bit_le(nat_index, nm_i->empty_nat_bits); 
+ __clear_bit_le(nat_index, nm_i->full_nat_bits); + return; + } + + __clear_bit_le(nat_index, nm_i->empty_nat_bits); + if (valid == NAT_ENTRY_PER_BLOCK) + __set_bit_le(nat_index, nm_i->full_nat_bits); + else + __clear_bit_le(nat_index, nm_i->full_nat_bits); +} + +static int __flush_nat_entry_set(struct f2fs_sb_info *sbi, + struct nat_entry_set *set, struct cp_control *cpc) +{ + struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); + struct f2fs_journal *journal = curseg->journal; + nid_t start_nid = set->set * NAT_ENTRY_PER_BLOCK; + bool to_journal = true; + struct f2fs_nat_block *nat_blk; + struct nat_entry *ne, *cur; + struct folio *folio = NULL; + + /* + * there are two steps to flush nat entries: + * #1, flush nat entries to journal in current hot data summary block. + * #2, flush nat entries to nat page. + */ + if (enabled_nat_bits(sbi, cpc) || + !__has_cursum_space(journal, set->entry_cnt, NAT_JOURNAL)) + to_journal = false; + + if (to_journal) { + down_write(&curseg->journal_rwsem); + } else { + folio = get_next_nat_folio(sbi, start_nid); + if (IS_ERR(folio)) + return PTR_ERR(folio); + + nat_blk = folio_address(folio); + f2fs_bug_on(sbi, !nat_blk); + } + + /* flush dirty nats in nat entry set */ + list_for_each_entry_safe(ne, cur, &set->entry_list, list) { + struct f2fs_nat_entry *raw_ne; + nid_t nid = nat_get_nid(ne); + int offset; + + f2fs_bug_on(sbi, nat_get_blkaddr(ne) == NEW_ADDR); + + if (to_journal) { + offset = f2fs_lookup_journal_in_cursum(journal, + NAT_JOURNAL, nid, 1); + f2fs_bug_on(sbi, offset < 0); + raw_ne = &nat_in_journal(journal, offset); + nid_in_journal(journal, offset) = cpu_to_le32(nid); + } else { + raw_ne = &nat_blk->entries[nid - start_nid]; + } + raw_nat_from_node_info(raw_ne, &ne->ni); + nat_reset_flag(ne); + __clear_nat_cache_dirty(NM_I(sbi), set, ne); + if (nat_get_blkaddr(ne) == NULL_ADDR) { + add_free_nid(sbi, nid, false, true); + } else { + spin_lock(&NM_I(sbi)->nid_list_lock); + update_free_nid_bitmap(sbi, nid, false, 
false); + spin_unlock(&NM_I(sbi)->nid_list_lock); + } + } + + if (to_journal) { + up_write(&curseg->journal_rwsem); + } else { + __update_nat_bits(sbi, start_nid, nat_blk); + f2fs_folio_put(folio, true); + } + + /* Allow dirty nats by node block allocation in write_begin */ + if (!set->entry_cnt) { + radix_tree_delete(&NM_I(sbi)->nat_set_root, set->set); + kmem_cache_free(nat_entry_set_slab, set); + } + return 0; } /* * This function is called during the checkpointing process. */ -void flush_nat_entries(struct f2fs_sb_info *sbi) +int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); - struct f2fs_summary_block *sum = curseg->sum_blk; - struct list_head *cur, *n; - struct page *page = NULL; - struct f2fs_nat_block *nat_blk = NULL; - nid_t start_nid = 0, end_nid = 0; - bool flushed; + struct f2fs_journal *journal = curseg->journal; + struct nat_entry_set *setvec[NAT_VEC_SIZE]; + struct nat_entry_set *set, *tmp; + unsigned int found; + nid_t set_idx = 0; + LIST_HEAD(sets); + int err = 0; - flushed = flush_nats_in_journal(sbi); + /* + * during unmount, let's flush nat_bits before checking + * nat_cnt[DIRTY_NAT]. + */ + if (enabled_nat_bits(sbi, cpc)) { + f2fs_down_write(&nm_i->nat_tree_lock); + remove_nats_in_journal(sbi); + f2fs_up_write(&nm_i->nat_tree_lock); + } - if (!flushed) - mutex_lock(&curseg->curseg_mutex); + if (!nm_i->nat_cnt[DIRTY_NAT]) + return 0; - /* 1) flush dirty nat caches */ - list_for_each_safe(cur, n, &nm_i->dirty_nat_entries) { - struct nat_entry *ne; - nid_t nid; - struct f2fs_nat_entry raw_ne; - int offset = -1; - block_t new_blkaddr; + f2fs_down_write(&nm_i->nat_tree_lock); - ne = list_entry(cur, struct nat_entry, list); - nid = nat_get_nid(ne); + /* + * if there are no enough space in journal to store dirty nat + * entries, remove all entries from journal and merge them + * into nat entry set. 
+ */ + if (enabled_nat_bits(sbi, cpc) || + !__has_cursum_space(journal, + nm_i->nat_cnt[DIRTY_NAT], NAT_JOURNAL)) + remove_nats_in_journal(sbi); - if (nat_get_blkaddr(ne) == NEW_ADDR) - continue; - if (flushed) - goto to_nat_page; - - /* if there is room for nat enries in curseg->sumpage */ - offset = lookup_journal_in_cursum(sum, NAT_JOURNAL, nid, 1); - if (offset >= 0) { - raw_ne = nat_in_journal(sum, offset); - goto flush_now; - } -to_nat_page: - if (!page || (start_nid > nid || nid > end_nid)) { - if (page) { - f2fs_put_page(page, 1); - page = NULL; - } - start_nid = START_NID(nid); - end_nid = start_nid + NAT_ENTRY_PER_BLOCK - 1; + while ((found = __gang_lookup_nat_set(nm_i, + set_idx, NAT_VEC_SIZE, setvec))) { + unsigned idx; - /* - * get nat block with dirty flag, increased reference - * count, mapped and lock - */ - page = get_next_nat_page(sbi, start_nid); - nat_blk = page_address(page); - } + set_idx = setvec[found - 1]->set + 1; + for (idx = 0; idx < found; idx++) + __adjust_nat_entry_set(setvec[idx], &sets, + MAX_NAT_JENTRIES(journal)); + } - BUG_ON(!nat_blk); - raw_ne = nat_blk->entries[nid - start_nid]; -flush_now: - new_blkaddr = nat_get_blkaddr(ne); + /* flush dirty nats in nat entry set */ + list_for_each_entry_safe(set, tmp, &sets, set_list) { + err = __flush_nat_entry_set(sbi, set, cpc); + if (err) + break; + } - raw_ne.ino = cpu_to_le32(nat_get_ino(ne)); - raw_ne.block_addr = cpu_to_le32(new_blkaddr); - raw_ne.version = nat_get_version(ne); + f2fs_up_write(&nm_i->nat_tree_lock); + /* Allow dirty nats by node block allocation in write_begin */ - if (offset < 0) { - nat_blk->entries[nid - start_nid] = raw_ne; - } else { - nat_in_journal(sum, offset) = raw_ne; - nid_in_journal(sum, offset) = cpu_to_le32(nid); - } + return err; +} - if (nat_get_blkaddr(ne) == NULL_ADDR && - add_free_nid(NM_I(sbi), nid, false) <= 0) { - write_lock(&nm_i->nat_tree_lock); - __del_from_nat_cache(nm_i, ne); - write_unlock(&nm_i->nat_tree_lock); - } else { - 
write_lock(&nm_i->nat_tree_lock); - __clear_nat_cache_dirty(nm_i, ne); - ne->checkpointed = true; - write_unlock(&nm_i->nat_tree_lock); - } +static int __get_nat_bitmaps(struct f2fs_sb_info *sbi) +{ + struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); + struct f2fs_nm_info *nm_i = NM_I(sbi); + unsigned int nat_bits_bytes = nm_i->nat_blocks / BITS_PER_BYTE; + unsigned int i; + __u64 cp_ver = cur_cp_version(ckpt); + block_t nat_bits_addr; + + if (!enabled_nat_bits(sbi, NULL)) + return 0; + + nm_i->nat_bits_blocks = F2FS_BLK_ALIGN((nat_bits_bytes << 1) + 8); + nm_i->nat_bits = f2fs_kvzalloc(sbi, + F2FS_BLK_TO_BYTES(nm_i->nat_bits_blocks), GFP_KERNEL); + if (!nm_i->nat_bits) + return -ENOMEM; + + nat_bits_addr = __start_cp_addr(sbi) + BLKS_PER_SEG(sbi) - + nm_i->nat_bits_blocks; + for (i = 0; i < nm_i->nat_bits_blocks; i++) { + struct folio *folio; + + folio = f2fs_get_meta_folio(sbi, nat_bits_addr++); + if (IS_ERR(folio)) + return PTR_ERR(folio); + + memcpy(nm_i->nat_bits + F2FS_BLK_TO_BYTES(i), + folio_address(folio), F2FS_BLKSIZE); + f2fs_folio_put(folio, true); + } + + cp_ver |= (cur_cp_crc(ckpt) << 32); + if (cpu_to_le64(cp_ver) != *(__le64 *)nm_i->nat_bits) { + disable_nat_bits(sbi, true); + return 0; } - if (!flushed) - mutex_unlock(&curseg->curseg_mutex); - f2fs_put_page(page, 1); - /* 2) shrink nat caches if necessary */ - try_to_free_nats(sbi, nm_i->nat_cnt - NM_WOUT_THRESHOLD); + nm_i->full_nat_bits = nm_i->nat_bits + 8; + nm_i->empty_nat_bits = nm_i->full_nat_bits + nat_bits_bytes; + + f2fs_notice(sbi, "Found nat_bits in checkpoint"); + return 0; +} + +static inline void load_free_nid_bitmap(struct f2fs_sb_info *sbi) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + unsigned int i = 0; + nid_t nid, last_nid; + + if (!enabled_nat_bits(sbi, NULL)) + return; + + for (i = 0; i < nm_i->nat_blocks; i++) { + i = find_next_bit_le(nm_i->empty_nat_bits, nm_i->nat_blocks, i); + if (i >= nm_i->nat_blocks) + break; + + __set_bit_le(i, nm_i->nat_block_bitmap); + + nid = i * 
NAT_ENTRY_PER_BLOCK; + last_nid = nid + NAT_ENTRY_PER_BLOCK; + + spin_lock(&NM_I(sbi)->nid_list_lock); + for (; nid < last_nid; nid++) + update_free_nid_bitmap(sbi, nid, true, true); + spin_unlock(&NM_I(sbi)->nid_list_lock); + } + + for (i = 0; i < nm_i->nat_blocks; i++) { + i = find_next_bit_le(nm_i->full_nat_bits, nm_i->nat_blocks, i); + if (i >= nm_i->nat_blocks) + break; + + __set_bit_le(i, nm_i->nat_block_bitmap); + } } static int init_node_manager(struct f2fs_sb_info *sbi) @@ -1690,44 +3283,101 @@ static int init_node_manager(struct f2fs_sb_info *sbi) struct f2fs_super_block *sb_raw = F2FS_RAW_SUPER(sbi); struct f2fs_nm_info *nm_i = NM_I(sbi); unsigned char *version_bitmap; - unsigned int nat_segs, nat_blocks; + unsigned int nat_segs; + int err; nm_i->nat_blkaddr = le32_to_cpu(sb_raw->nat_blkaddr); /* segment_count_nat includes pair segment so divide to 2. */ nat_segs = le32_to_cpu(sb_raw->segment_count_nat) >> 1; - nat_blocks = nat_segs << le32_to_cpu(sb_raw->log_blocks_per_seg); - nm_i->max_nid = NAT_ENTRY_PER_BLOCK * nat_blocks; - nm_i->fcnt = 0; - nm_i->nat_cnt = 0; - + nm_i->nat_blocks = nat_segs << le32_to_cpu(sb_raw->log_blocks_per_seg); + nm_i->max_nid = NAT_ENTRY_PER_BLOCK * nm_i->nat_blocks; + + /* not used nids: 0, node, meta, (and root counted as valid node) */ + nm_i->available_nids = nm_i->max_nid - sbi->total_valid_node_count - + F2FS_RESERVED_NODE_NUM; + nm_i->nid_cnt[FREE_NID] = 0; + nm_i->nid_cnt[PREALLOC_NID] = 0; + nm_i->ram_thresh = DEF_RAM_THRESHOLD; + nm_i->ra_nid_pages = DEF_RA_NID_PAGES; + nm_i->dirty_nats_ratio = DEF_DIRTY_NAT_RATIO_THRESHOLD; + nm_i->max_rf_node_blocks = DEF_RF_NODE_BLOCKS; + + INIT_RADIX_TREE(&nm_i->free_nid_root, GFP_ATOMIC); INIT_LIST_HEAD(&nm_i->free_nid_list); - INIT_RADIX_TREE(&nm_i->nat_root, GFP_ATOMIC); + INIT_RADIX_TREE(&nm_i->nat_root, GFP_NOIO); + INIT_RADIX_TREE(&nm_i->nat_set_root, GFP_NOIO); INIT_LIST_HEAD(&nm_i->nat_entries); - INIT_LIST_HEAD(&nm_i->dirty_nat_entries); + 
spin_lock_init(&nm_i->nat_list_lock); mutex_init(&nm_i->build_lock); - spin_lock_init(&nm_i->free_nid_list_lock); - rwlock_init(&nm_i->nat_tree_lock); + spin_lock_init(&nm_i->nid_list_lock); + init_f2fs_rwsem(&nm_i->nat_tree_lock); nm_i->next_scan_nid = le32_to_cpu(sbi->ckpt->next_free_nid); nm_i->bitmap_size = __bitmap_size(sbi, NAT_BITMAP); version_bitmap = __bitmap_ptr(sbi, NAT_BITMAP); - if (!version_bitmap) - return -EFAULT; - nm_i->nat_bitmap = kmemdup(version_bitmap, nm_i->bitmap_size, GFP_KERNEL); if (!nm_i->nat_bitmap) return -ENOMEM; + + if (!test_opt(sbi, NAT_BITS)) + disable_nat_bits(sbi, true); + + err = __get_nat_bitmaps(sbi); + if (err) + return err; + +#ifdef CONFIG_F2FS_CHECK_FS + nm_i->nat_bitmap_mir = kmemdup(version_bitmap, nm_i->bitmap_size, + GFP_KERNEL); + if (!nm_i->nat_bitmap_mir) + return -ENOMEM; +#endif + + return 0; +} + +static int init_free_nid_cache(struct f2fs_sb_info *sbi) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + int i; + + nm_i->free_nid_bitmap = + f2fs_kvzalloc(sbi, array_size(sizeof(unsigned char *), + nm_i->nat_blocks), + GFP_KERNEL); + if (!nm_i->free_nid_bitmap) + return -ENOMEM; + + for (i = 0; i < nm_i->nat_blocks; i++) { + nm_i->free_nid_bitmap[i] = f2fs_kvzalloc(sbi, + f2fs_bitmap_size(NAT_ENTRY_PER_BLOCK), GFP_KERNEL); + if (!nm_i->free_nid_bitmap[i]) + return -ENOMEM; + } + + nm_i->nat_block_bitmap = f2fs_kvzalloc(sbi, nm_i->nat_blocks / 8, + GFP_KERNEL); + if (!nm_i->nat_block_bitmap) + return -ENOMEM; + + nm_i->free_nid_count = + f2fs_kvzalloc(sbi, array_size(sizeof(unsigned short), + nm_i->nat_blocks), + GFP_KERNEL); + if (!nm_i->free_nid_count) + return -ENOMEM; return 0; } -int build_node_manager(struct f2fs_sb_info *sbi) +int f2fs_build_node_manager(struct f2fs_sb_info *sbi) { int err; - sbi->nm_info = kzalloc(sizeof(struct f2fs_nm_info), GFP_KERNEL); + sbi->nm_info = f2fs_kzalloc(sbi, sizeof(struct f2fs_nm_info), + GFP_KERNEL); if (!sbi->nm_info) return -ENOMEM; @@ -1735,15 +3385,23 @@ int 
build_node_manager(struct f2fs_sb_info *sbi) if (err) return err; - build_free_nids(sbi); - return 0; + err = init_free_nid_cache(sbi); + if (err) + return err; + + /* load free nid status from nat_bits table */ + load_free_nid_bitmap(sbi); + + return f2fs_build_free_nids(sbi, true, true); } -void destroy_node_manager(struct f2fs_sb_info *sbi) +void f2fs_destroy_node_manager(struct f2fs_sb_info *sbi) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct free_nid *i, *next_i; - struct nat_entry *natvec[NATVEC_SIZE]; + void *vec[NAT_VEC_SIZE]; + struct nat_entry **natvec = (struct nat_entry **)vec; + struct nat_entry_set **setvec = (struct nat_entry_set **)vec; nid_t nid = 0; unsigned int found; @@ -1751,52 +3409,108 @@ void destroy_node_manager(struct f2fs_sb_info *sbi) return; /* destroy free nid list */ - spin_lock(&nm_i->free_nid_list_lock); + spin_lock(&nm_i->nid_list_lock); list_for_each_entry_safe(i, next_i, &nm_i->free_nid_list, list) { - BUG_ON(i->state == NID_ALLOC); - __del_from_free_nid_list(i); - nm_i->fcnt--; + __remove_free_nid(sbi, i, FREE_NID); + spin_unlock(&nm_i->nid_list_lock); + kmem_cache_free(free_nid_slab, i); + spin_lock(&nm_i->nid_list_lock); } - BUG_ON(nm_i->fcnt); - spin_unlock(&nm_i->free_nid_list_lock); + f2fs_bug_on(sbi, nm_i->nid_cnt[FREE_NID]); + f2fs_bug_on(sbi, nm_i->nid_cnt[PREALLOC_NID]); + f2fs_bug_on(sbi, !list_empty(&nm_i->free_nid_list)); + spin_unlock(&nm_i->nid_list_lock); /* destroy nat cache */ - write_lock(&nm_i->nat_tree_lock); + f2fs_down_write(&nm_i->nat_tree_lock); while ((found = __gang_lookup_nat_cache(nm_i, - nid, NATVEC_SIZE, natvec))) { + nid, NAT_VEC_SIZE, natvec))) { + unsigned idx; + + nid = nat_get_nid(natvec[found - 1]) + 1; + for (idx = 0; idx < found; idx++) { + spin_lock(&nm_i->nat_list_lock); + list_del(&natvec[idx]->list); + spin_unlock(&nm_i->nat_list_lock); + + __del_from_nat_cache(nm_i, natvec[idx]); + } + } + f2fs_bug_on(sbi, nm_i->nat_cnt[TOTAL_NAT]); + + /* destroy nat set cache */ + nid = 0; + 
memset(vec, 0, sizeof(void *) * NAT_VEC_SIZE); + while ((found = __gang_lookup_nat_set(nm_i, + nid, NAT_VEC_SIZE, setvec))) { unsigned idx; + + nid = setvec[found - 1]->set + 1; for (idx = 0; idx < found; idx++) { - struct nat_entry *e = natvec[idx]; - nid = nat_get_nid(e) + 1; - __del_from_nat_cache(nm_i, e); + /* entry_cnt is not zero, when cp_error was occurred */ + f2fs_bug_on(sbi, !list_empty(&setvec[idx]->entry_list)); + radix_tree_delete(&nm_i->nat_set_root, setvec[idx]->set); + kmem_cache_free(nat_entry_set_slab, setvec[idx]); } } - BUG_ON(nm_i->nat_cnt); - write_unlock(&nm_i->nat_tree_lock); + f2fs_up_write(&nm_i->nat_tree_lock); + + kvfree(nm_i->nat_block_bitmap); + if (nm_i->free_nid_bitmap) { + int i; + + for (i = 0; i < nm_i->nat_blocks; i++) + kvfree(nm_i->free_nid_bitmap[i]); + kvfree(nm_i->free_nid_bitmap); + } + kvfree(nm_i->free_nid_count); kfree(nm_i->nat_bitmap); + kvfree(nm_i->nat_bits); +#ifdef CONFIG_F2FS_CHECK_FS + kfree(nm_i->nat_bitmap_mir); +#endif sbi->nm_info = NULL; kfree(nm_i); } -int __init create_node_manager_caches(void) +int __init f2fs_create_node_manager_caches(void) { - nat_entry_slab = f2fs_kmem_cache_create("nat_entry", - sizeof(struct nat_entry), NULL); + nat_entry_slab = f2fs_kmem_cache_create("f2fs_nat_entry", + sizeof(struct nat_entry)); if (!nat_entry_slab) - return -ENOMEM; + goto fail; - free_nid_slab = f2fs_kmem_cache_create("free_nid", - sizeof(struct free_nid), NULL); - if (!free_nid_slab) { - kmem_cache_destroy(nat_entry_slab); - return -ENOMEM; - } + free_nid_slab = f2fs_kmem_cache_create("f2fs_free_nid", + sizeof(struct free_nid)); + if (!free_nid_slab) + goto destroy_nat_entry; + + nat_entry_set_slab = f2fs_kmem_cache_create("f2fs_nat_entry_set", + sizeof(struct nat_entry_set)); + if (!nat_entry_set_slab) + goto destroy_free_nid; + + fsync_node_entry_slab = f2fs_kmem_cache_create("f2fs_fsync_node_entry", + sizeof(struct fsync_node_entry)); + if (!fsync_node_entry_slab) + goto destroy_nat_entry_set; return 0; + 
+destroy_nat_entry_set: + kmem_cache_destroy(nat_entry_set_slab); +destroy_free_nid: + kmem_cache_destroy(free_nid_slab); +destroy_nat_entry: + kmem_cache_destroy(nat_entry_slab); +fail: + return -ENOMEM; } -void destroy_node_manager_caches(void) +void f2fs_destroy_node_manager_caches(void) { + kmem_cache_destroy(fsync_node_entry_slab); + kmem_cache_destroy(nat_entry_set_slab); kmem_cache_destroy(free_nid_slab); kmem_cache_destroy(nat_entry_slab); } diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index c65fb4f4230f..9cb8dcf8d417 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -1,37 +1,65 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * fs/f2fs/node.h * * Copyright (c) 2012 Samsung Electronics Co., Ltd. * http://www.samsung.com/ - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. */ /* start node id of a node block dedicated to the given node id */ -#define START_NID(nid) ((nid / NAT_ENTRY_PER_BLOCK) * NAT_ENTRY_PER_BLOCK) +#define START_NID(nid) (((nid) / NAT_ENTRY_PER_BLOCK) * NAT_ENTRY_PER_BLOCK) /* node block offset on the NAT area dedicated to the given start node id */ -#define NAT_BLOCK_OFFSET(start_nid) (start_nid / NAT_ENTRY_PER_BLOCK) +#define NAT_BLOCK_OFFSET(start_nid) ((start_nid) / NAT_ENTRY_PER_BLOCK) + +/* # of pages to perform synchronous readahead before building free nids */ +#define FREE_NID_PAGES 8 +#define MAX_FREE_NIDS (NAT_ENTRY_PER_BLOCK * FREE_NID_PAGES) -/* # of pages to perform readahead before building free nids */ -#define FREE_NID_PAGES 4 +/* size of free nid batch when shrinking */ +#define SHRINK_NID_BATCH_SIZE 8 -/* maximum # of free node ids to produce during build_free_nids */ -#define MAX_FREE_NIDS (NAT_ENTRY_PER_BLOCK * FREE_NID_PAGES) +#define DEF_RA_NID_PAGES 0 /* # of nid pages to be readaheaded */ /* maximum readahead size for node during getting data blocks */ #define MAX_RA_NODE 128 
-/* maximum cached nat entries to manage memory footprint */ -#define NM_WOUT_THRESHOLD (64 * NAT_ENTRY_PER_BLOCK) +/* control the memory footprint threshold (10MB per 1GB ram) */ +#define DEF_RAM_THRESHOLD 1 + +/* control dirty nats ratio threshold (default: 10% over max nid count) */ +#define DEF_DIRTY_NAT_RATIO_THRESHOLD 10 +/* control total # of nats */ +#define DEF_NAT_CACHE_THRESHOLD 100000 + +/* control total # of node writes used for roll-forward recovery */ +#define DEF_RF_NODE_BLOCKS 0 /* vector size for gang look-up from nat cache that consists of radix tree */ -#define NATVEC_SIZE 64 +#define NAT_VEC_SIZE 32 /* return value for read_node_page */ #define LOCKED_PAGE 1 +/* check pinned file's alignment status of physical blocks */ +#define FILE_NOT_ALIGNED 1 + +/* For flag in struct node_info */ +enum { + IS_CHECKPOINTED, /* is it checkpointed before? */ + HAS_FSYNCED_INODE, /* is the inode fsynced before? */ + HAS_LAST_FSYNC, /* has the latest node fsync mark? */ + IS_DIRTY, /* this nat entry is dirty? 
*/ + IS_PREALLOC, /* nat entry is preallocated */ +}; + +/* For node type in __get_node_folio() */ +enum node_type { + NODE_TYPE_REGULAR, + NODE_TYPE_INODE, + NODE_TYPE_XATTR, + NODE_TYPE_NON_INODE, +}; + /* * For node information */ @@ -40,28 +68,56 @@ struct node_info { nid_t ino; /* inode number of the node's owner */ block_t blk_addr; /* block address of the node */ unsigned char version; /* version of the node */ + unsigned char flag; /* for node information bits */ }; struct nat_entry { struct list_head list; /* for clean or dirty nat list */ - bool checkpointed; /* whether it is checkpointed or not */ struct node_info ni; /* in-memory node information */ }; -#define nat_get_nid(nat) (nat->ni.nid) -#define nat_set_nid(nat, n) (nat->ni.nid = n) -#define nat_get_blkaddr(nat) (nat->ni.blk_addr) -#define nat_set_blkaddr(nat, b) (nat->ni.blk_addr = b) -#define nat_get_ino(nat) (nat->ni.ino) -#define nat_set_ino(nat, i) (nat->ni.ino = i) -#define nat_get_version(nat) (nat->ni.version) -#define nat_set_version(nat, v) (nat->ni.version = v) - -#define __set_nat_cache_dirty(nm_i, ne) \ - list_move_tail(&ne->list, &nm_i->dirty_nat_entries); -#define __clear_nat_cache_dirty(nm_i, ne) \ - list_move_tail(&ne->list, &nm_i->nat_entries); -#define inc_node_version(version) (++version) +#define nat_get_nid(nat) ((nat)->ni.nid) +#define nat_set_nid(nat, n) ((nat)->ni.nid = (n)) +#define nat_get_blkaddr(nat) ((nat)->ni.blk_addr) +#define nat_set_blkaddr(nat, b) ((nat)->ni.blk_addr = (b)) +#define nat_get_ino(nat) ((nat)->ni.ino) +#define nat_set_ino(nat, i) ((nat)->ni.ino = (i)) +#define nat_get_version(nat) ((nat)->ni.version) +#define nat_set_version(nat, v) ((nat)->ni.version = (v)) + +#define inc_node_version(version) (++(version)) + +static inline void copy_node_info(struct node_info *dst, + struct node_info *src) +{ + dst->nid = src->nid; + dst->ino = src->ino; + dst->blk_addr = src->blk_addr; + dst->version = src->version; + /* should not copy flag here */ +} + +static 
inline void set_nat_flag(struct nat_entry *ne, + unsigned int type, bool set) +{ + if (set) + ne->ni.flag |= BIT(type); + else + ne->ni.flag &= ~BIT(type); +} + +static inline bool get_nat_flag(struct nat_entry *ne, unsigned int type) +{ + return ne->ni.flag & BIT(type); +} + +static inline void nat_reset_flag(struct nat_entry *ne) +{ + /* these states can be set only after checkpoint was done */ + set_nat_flag(ne, IS_CHECKPOINTED, true); + set_nat_flag(ne, HAS_FSYNCED_INODE, false); + set_nat_flag(ne, HAS_LAST_FSYNC, true); +} static inline void node_info_from_raw_nat(struct node_info *ni, struct f2fs_nat_entry *raw_ne) @@ -71,32 +127,63 @@ static inline void node_info_from_raw_nat(struct node_info *ni, ni->version = raw_ne->version; } -/* - * For free nid mangement - */ -enum nid_state { - NID_NEW, /* newly added to free nid list */ - NID_ALLOC /* it is allocated */ +static inline void raw_nat_from_node_info(struct f2fs_nat_entry *raw_ne, + struct node_info *ni) +{ + raw_ne->ino = cpu_to_le32(ni->ino); + raw_ne->block_addr = cpu_to_le32(ni->blk_addr); + raw_ne->version = ni->version; +} + +static inline bool excess_dirty_nats(struct f2fs_sb_info *sbi) +{ + return NM_I(sbi)->nat_cnt[DIRTY_NAT] >= NM_I(sbi)->max_nid * + NM_I(sbi)->dirty_nats_ratio / 100; +} + +static inline bool excess_cached_nats(struct f2fs_sb_info *sbi) +{ + return NM_I(sbi)->nat_cnt[TOTAL_NAT] >= DEF_NAT_CACHE_THRESHOLD; +} + +enum mem_type { + FREE_NIDS, /* indicates the free nid list */ + NAT_ENTRIES, /* indicates the cached nat entry */ + DIRTY_DENTS, /* indicates dirty dentry pages */ + INO_ENTRIES, /* indicates inode entries */ + READ_EXTENT_CACHE, /* indicates read extent cache */ + AGE_EXTENT_CACHE, /* indicates age extent cache */ + DISCARD_CACHE, /* indicates memory of cached discard cmds */ + COMPRESS_PAGE, /* indicates memory of cached compressed pages */ + BASE_CHECK, /* check kernel status */ +}; + +struct nat_entry_set { + struct list_head set_list; /* link with other nat sets */ 
+ struct list_head entry_list; /* link with dirty nat entries */ + nid_t set; /* set number*/ + unsigned int entry_cnt; /* the # of nat entries in set */ }; struct free_nid { struct list_head list; /* for free node id list */ nid_t nid; /* node id */ - int state; /* in use or not: NID_NEW or NID_ALLOC */ + int state; /* in use or not: FREE_NID or PREALLOC_NID */ }; -static inline int next_free_nid(struct f2fs_sb_info *sbi, nid_t *nid) +static inline void next_free_nid(struct f2fs_sb_info *sbi, nid_t *nid) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct free_nid *fnid; - if (nm_i->fcnt <= 0) - return -1; - spin_lock(&nm_i->free_nid_list_lock); - fnid = list_entry(nm_i->free_nid_list.next, struct free_nid, list); + spin_lock(&nm_i->nid_list_lock); + if (nm_i->nid_cnt[FREE_NID] <= 0) { + spin_unlock(&nm_i->nid_list_lock); + return; + } + fnid = list_first_entry(&nm_i->free_nid_list, struct free_nid, list); *nid = fnid->nid; - spin_unlock(&nm_i->free_nid_list_lock); - return 0; + spin_unlock(&nm_i->nid_list_lock); } /* @@ -105,6 +192,12 @@ static inline int next_free_nid(struct f2fs_sb_info *sbi, nid_t *nid) static inline void get_nat_bitmap(struct f2fs_sb_info *sbi, void *addr) { struct f2fs_nm_info *nm_i = NM_I(sbi); + +#ifdef CONFIG_F2FS_CHECK_FS + if (memcmp(nm_i->nat_bitmap, nm_i->nat_bitmap_mir, + nm_i->bitmap_size)) + f2fs_bug_on(sbi, 1); +#endif memcpy(addr, nm_i->nat_bitmap, nm_i->bitmap_size); } @@ -113,17 +206,20 @@ static inline pgoff_t current_nat_addr(struct f2fs_sb_info *sbi, nid_t start) struct f2fs_nm_info *nm_i = NM_I(sbi); pgoff_t block_off; pgoff_t block_addr; - int seg_off; + /* + * block_off = segment_off * 512 + off_in_segment + * OLD = (segment_off * 512) * 2 + off_in_segment + * NEW = 2 * (segment_off * 512 + off_in_segment) - off_in_segment + */ block_off = NAT_BLOCK_OFFSET(start); - seg_off = block_off >> sbi->log_blocks_per_seg; block_addr = (pgoff_t)(nm_i->nat_blkaddr + - (seg_off << sbi->log_blocks_per_seg << 1) + - (block_off & ((1 << 
sbi->log_blocks_per_seg) - 1))); + (block_off << 1) - + (block_off & (BLKS_PER_SEG(sbi) - 1))); if (f2fs_test_bit(block_off, nm_i->nat_bitmap)) - block_addr += sbi->blocks_per_seg; + block_addr += BLKS_PER_SEG(sbi); return block_addr; } @@ -134,11 +230,7 @@ static inline pgoff_t next_nat_addr(struct f2fs_sb_info *sbi, struct f2fs_nm_info *nm_i = NM_I(sbi); block_addr -= nm_i->nat_blkaddr; - if ((block_addr >> sbi->log_blocks_per_seg) % 2) - block_addr -= sbi->blocks_per_seg; - else - block_addr += sbi->blocks_per_seg; - + block_addr ^= BIT(sbi->log_blocks_per_seg); return block_addr + nm_i->nat_blkaddr; } @@ -146,77 +238,96 @@ static inline void set_to_next_nat(struct f2fs_nm_info *nm_i, nid_t start_nid) { unsigned int block_off = NAT_BLOCK_OFFSET(start_nid); - if (f2fs_test_bit(block_off, nm_i->nat_bitmap)) - f2fs_clear_bit(block_off, nm_i->nat_bitmap); - else - f2fs_set_bit(block_off, nm_i->nat_bitmap); + f2fs_change_bit(block_off, nm_i->nat_bitmap); +#ifdef CONFIG_F2FS_CHECK_FS + f2fs_change_bit(block_off, nm_i->nat_bitmap_mir); +#endif } -static inline void fill_node_footer(struct page *page, nid_t nid, - nid_t ino, unsigned int ofs, bool reset) +static inline nid_t ino_of_node(const struct folio *node_folio) { - void *kaddr = page_address(page); - struct f2fs_node *rn = (struct f2fs_node *)kaddr; - if (reset) - memset(rn, 0, sizeof(*rn)); - rn->footer.nid = cpu_to_le32(nid); - rn->footer.ino = cpu_to_le32(ino); - rn->footer.flag = cpu_to_le32(ofs << OFFSET_BIT_SHIFT); + struct f2fs_node *rn = F2FS_NODE(node_folio); + return le32_to_cpu(rn->footer.ino); } -static inline void copy_node_footer(struct page *dst, struct page *src) +static inline nid_t nid_of_node(const struct folio *node_folio) { - void *src_addr = page_address(src); - void *dst_addr = page_address(dst); - struct f2fs_node *src_rn = (struct f2fs_node *)src_addr; - struct f2fs_node *dst_rn = (struct f2fs_node *)dst_addr; - memcpy(&dst_rn->footer, &src_rn->footer, sizeof(struct node_footer)); + 
struct f2fs_node *rn = F2FS_NODE(node_folio); + return le32_to_cpu(rn->footer.nid); } -static inline void fill_node_footer_blkaddr(struct page *page, block_t blkaddr) +static inline unsigned int ofs_of_node(const struct folio *node_folio) { - struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb); - struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); - void *kaddr = page_address(page); - struct f2fs_node *rn = (struct f2fs_node *)kaddr; - rn->footer.cp_ver = ckpt->checkpoint_ver; - rn->footer.next_blkaddr = cpu_to_le32(blkaddr); + struct f2fs_node *rn = F2FS_NODE(node_folio); + unsigned flag = le32_to_cpu(rn->footer.flag); + return flag >> OFFSET_BIT_SHIFT; } -static inline nid_t ino_of_node(struct page *node_page) +static inline __u64 cpver_of_node(const struct folio *node_folio) { - void *kaddr = page_address(node_page); - struct f2fs_node *rn = (struct f2fs_node *)kaddr; - return le32_to_cpu(rn->footer.ino); + struct f2fs_node *rn = F2FS_NODE(node_folio); + return le64_to_cpu(rn->footer.cp_ver); } -static inline nid_t nid_of_node(struct page *node_page) +static inline block_t next_blkaddr_of_node(const struct folio *node_folio) { - void *kaddr = page_address(node_page); - struct f2fs_node *rn = (struct f2fs_node *)kaddr; - return le32_to_cpu(rn->footer.nid); + struct f2fs_node *rn = F2FS_NODE(node_folio); + return le32_to_cpu(rn->footer.next_blkaddr); } -static inline unsigned int ofs_of_node(struct page *node_page) +static inline void fill_node_footer(const struct folio *folio, nid_t nid, + nid_t ino, unsigned int ofs, bool reset) { - void *kaddr = page_address(node_page); - struct f2fs_node *rn = (struct f2fs_node *)kaddr; - unsigned flag = le32_to_cpu(rn->footer.flag); - return flag >> OFFSET_BIT_SHIFT; + struct f2fs_node *rn = F2FS_NODE(folio); + unsigned int old_flag = 0; + + if (reset) + memset(rn, 0, sizeof(*rn)); + else + old_flag = le32_to_cpu(rn->footer.flag); + + rn->footer.nid = cpu_to_le32(nid); + rn->footer.ino = cpu_to_le32(ino); + + /* should 
remain old flag bits such as COLD_BIT_SHIFT */ + rn->footer.flag = cpu_to_le32((ofs << OFFSET_BIT_SHIFT) | + (old_flag & OFFSET_BIT_MASK)); } -static inline unsigned long long cpver_of_node(struct page *node_page) +static inline void copy_node_footer(const struct folio *dst, + const struct folio *src) { - void *kaddr = page_address(node_page); - struct f2fs_node *rn = (struct f2fs_node *)kaddr; - return le64_to_cpu(rn->footer.cp_ver); + struct f2fs_node *src_rn = F2FS_NODE(src); + struct f2fs_node *dst_rn = F2FS_NODE(dst); + memcpy(&dst_rn->footer, &src_rn->footer, sizeof(struct node_footer)); } -static inline block_t next_blkaddr_of_node(struct page *node_page) +static inline void fill_node_footer_blkaddr(struct folio *folio, block_t blkaddr) { - void *kaddr = page_address(node_page); - struct f2fs_node *rn = (struct f2fs_node *)kaddr; - return le32_to_cpu(rn->footer.next_blkaddr); + struct f2fs_checkpoint *ckpt = F2FS_CKPT(F2FS_F_SB(folio)); + struct f2fs_node *rn = F2FS_NODE(folio); + __u64 cp_ver = cur_cp_version(ckpt); + + if (__is_set_ckpt_flags(ckpt, CP_CRC_RECOVERY_FLAG)) + cp_ver |= (cur_cp_crc(ckpt) << 32); + + rn->footer.cp_ver = cpu_to_le64(cp_ver); + rn->footer.next_blkaddr = cpu_to_le32(blkaddr); +} + +static inline bool is_recoverable_dnode(const struct folio *folio) +{ + struct f2fs_checkpoint *ckpt = F2FS_CKPT(F2FS_F_SB(folio)); + __u64 cp_ver = cur_cp_version(ckpt); + + /* Don't care crc part, if fsck.f2fs sets it. */ + if (__is_set_ckpt_flags(ckpt, CP_NOCRC_RECOVERY_FLAG)) + return (cp_ver << 32) == (cpver_of_node(folio) << 32); + + if (__is_set_ckpt_flags(ckpt, CP_CRC_RECOVERY_FLAG)) + cp_ver |= (cur_cp_crc(ckpt) << 32); + + return cp_ver == cpver_of_node(folio); } /* @@ -232,11 +343,21 @@ static inline block_t next_blkaddr_of_node(struct page *node_page) * | `- direct node (5 + N => 5 + 2N - 1) * `- double indirect node (5 + 2N) * `- indirect node (6 + 2N) - * `- direct node (x(N + 1)) + * `- direct node + * ...... 
+ * `- indirect node ((6 + 2N) + x(N + 1)) + * `- direct node + * ...... + * `- indirect node ((6 + 2N) + (N - 1)(N + 1)) + * `- direct node */ -static inline bool IS_DNODE(struct page *node_page) +static inline bool IS_DNODE(const struct folio *node_folio) { - unsigned int ofs = ofs_of_node(node_page); + unsigned int ofs = ofs_of_node(node_folio); + + if (f2fs_has_xattr_block(ofs)) + return true; + if (ofs == 3 || ofs == 4 + NIDS_PER_BLOCK || ofs == 5 + 2 * NIDS_PER_BLOCK) return false; @@ -248,22 +369,23 @@ static inline bool IS_DNODE(struct page *node_page) return true; } -static inline void set_nid(struct page *p, int off, nid_t nid, bool i) +static inline int set_nid(struct folio *folio, int off, nid_t nid, bool i) { - struct f2fs_node *rn = (struct f2fs_node *)page_address(p); + struct f2fs_node *rn = F2FS_NODE(folio); - wait_on_page_writeback(p); + f2fs_folio_wait_writeback(folio, NODE, true, true); if (i) rn->i.i_nid[off - NODE_DIR1_BLOCK] = cpu_to_le32(nid); else rn->in.nid[off] = cpu_to_le32(nid); - set_page_dirty(p); + return folio_mark_dirty(folio); } -static inline nid_t get_nid(struct page *p, int off, bool i) +static inline nid_t get_nid(const struct folio *folio, int off, bool i) { - struct f2fs_node *rn = (struct f2fs_node *)page_address(p); + struct f2fs_node *rn = F2FS_NODE(folio); + if (i) return le32_to_cpu(rn->i.i_nid[off - NODE_DIR1_BLOCK]); return le32_to_cpu(rn->in.nid[off]); @@ -275,75 +397,42 @@ static inline nid_t get_nid(struct page *p, int off, bool i) * - Mark cold node blocks in their node footer * - Mark cold data pages in page cache */ -static inline int is_file(struct inode *inode, int type) -{ - return F2FS_I(inode)->i_advise & type; -} - -static inline void set_file(struct inode *inode, int type) -{ - F2FS_I(inode)->i_advise |= type; -} -static inline void clear_file(struct inode *inode, int type) +static inline int is_node(const struct folio *folio, int type) { - F2FS_I(inode)->i_advise &= ~type; + struct f2fs_node *rn = 
F2FS_NODE(folio); + return le32_to_cpu(rn->footer.flag) & BIT(type); } -#define file_is_cold(inode) is_file(inode, FADVISE_COLD_BIT) -#define file_wrong_pino(inode) is_file(inode, FADVISE_LOST_PINO_BIT) -#define file_set_cold(inode) set_file(inode, FADVISE_COLD_BIT) -#define file_lost_pino(inode) set_file(inode, FADVISE_LOST_PINO_BIT) -#define file_clear_cold(inode) clear_file(inode, FADVISE_COLD_BIT) -#define file_got_pino(inode) clear_file(inode, FADVISE_LOST_PINO_BIT) +#define is_cold_node(folio) is_node(folio, COLD_BIT_SHIFT) +#define is_fsync_dnode(folio) is_node(folio, FSYNC_BIT_SHIFT) +#define is_dent_dnode(folio) is_node(folio, DENT_BIT_SHIFT) -static inline int is_cold_data(struct page *page) +static inline void set_cold_node(const struct folio *folio, bool is_dir) { - return PageChecked(page); -} - -static inline void set_cold_data(struct page *page) -{ - SetPageChecked(page); -} - -static inline void clear_cold_data(struct page *page) -{ - ClearPageChecked(page); -} - -static inline int is_node(struct page *page, int type) -{ - void *kaddr = page_address(page); - struct f2fs_node *rn = (struct f2fs_node *)kaddr; - return le32_to_cpu(rn->footer.flag) & (1 << type); -} - -#define is_cold_node(page) is_node(page, COLD_BIT_SHIFT) -#define is_fsync_dnode(page) is_node(page, FSYNC_BIT_SHIFT) -#define is_dent_dnode(page) is_node(page, DENT_BIT_SHIFT) - -static inline void set_cold_node(struct inode *inode, struct page *page) -{ - struct f2fs_node *rn = (struct f2fs_node *)page_address(page); + struct f2fs_node *rn = F2FS_NODE(folio); unsigned int flag = le32_to_cpu(rn->footer.flag); - if (S_ISDIR(inode->i_mode)) - flag &= ~(0x1 << COLD_BIT_SHIFT); + if (is_dir) + flag &= ~BIT(COLD_BIT_SHIFT); else - flag |= (0x1 << COLD_BIT_SHIFT); + flag |= BIT(COLD_BIT_SHIFT); rn->footer.flag = cpu_to_le32(flag); } -static inline void set_mark(struct page *page, int mark, int type) +static inline void set_mark(struct folio *folio, int mark, int type) { - struct f2fs_node *rn 
= (struct f2fs_node *)page_address(page); + struct f2fs_node *rn = F2FS_NODE(folio); unsigned int flag = le32_to_cpu(rn->footer.flag); if (mark) - flag |= (0x1 << type); + flag |= BIT(type); else - flag &= ~(0x1 << type); + flag &= ~BIT(type); rn->footer.flag = cpu_to_le32(flag); + +#ifdef CONFIG_F2FS_CHECK_FS + f2fs_inode_chksum_set(F2FS_F_SB(folio), folio); +#endif } -#define set_dentry_mark(page, mark) set_mark(page, mark, DENT_BIT_SHIFT) -#define set_fsync_mark(page, mark) set_mark(page, mark, FSYNC_BIT_SHIFT) +#define set_dentry_mark(folio, mark) set_mark(folio, mark, DENT_BIT_SHIFT) +#define set_fsync_mark(folio, mark) set_mark(folio, mark, FSYNC_BIT_SHIFT) diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index d56d951c2253..c3415ebb9f50 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -1,25 +1,60 @@ +// SPDX-License-Identifier: GPL-2.0 /* * fs/f2fs/recovery.c * * Copyright (c) 2012 Samsung Electronics Co., Ltd. * http://www.samsung.com/ - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. */ +#include <linux/unaligned.h> #include <linux/fs.h> #include <linux/f2fs_fs.h> +#include <linux/sched/mm.h> #include "f2fs.h" #include "node.h" #include "segment.h" +/* + * Roll forward recovery scenarios. + * + * [Term] F: fsync_mark, D: dentry_mark + * + * 1. inode(x) | CP | inode(x) | dnode(F) + * -> Update the latest inode(x). + * + * 2. inode(x) | CP | inode(F) | dnode(F) + * -> No problem. + * + * 3. inode(x) | CP | dnode(F) | inode(x) + * -> Recover to the latest dnode(F), and drop the last inode(x) + * + * 4. inode(x) | CP | dnode(F) | inode(F) + * -> No problem. + * + * 5. CP | inode(x) | dnode(F) + * -> The inode(DF) was missing. Should drop this dnode(F). + * + * 6. CP | inode(DF) | dnode(F) + * -> No problem. + * + * 7. CP | dnode(F) | inode(DF) + * -> If f2fs_iget fails, then goto next to find inode(DF). 
+ * + * 8. CP | dnode(F) | inode(x) + * -> If f2fs_iget fails, then goto next to find inode(DF). + * But it will fail due to no inode(DF). + */ + static struct kmem_cache *fsync_entry_slab; -bool space_for_roll_forward(struct f2fs_sb_info *sbi) +bool f2fs_space_for_roll_forward(struct f2fs_sb_info *sbi) { - if (sbi->last_valid_block_count + sbi->alloc_valid_block_count - > sbi->user_block_count) + s64 nalloc = percpu_counter_sum_positive(&sbi->alloc_valid_block_count); + + if (sbi->last_valid_block_count + nalloc > sbi->user_block_count) + return false; + if (NM_I(sbi)->max_rf_node_blocks && + percpu_counter_sum_positive(&sbi->rf_node_block_count) >= + NM_I(sbi)->max_rf_node_blocks) return false; return true; } @@ -27,180 +62,431 @@ bool space_for_roll_forward(struct f2fs_sb_info *sbi) static struct fsync_inode_entry *get_fsync_inode(struct list_head *head, nid_t ino) { - struct list_head *this; struct fsync_inode_entry *entry; - list_for_each(this, head) { - entry = list_entry(this, struct fsync_inode_entry, list); + list_for_each_entry(entry, head, list) if (entry->inode->i_ino == ino) return entry; - } + return NULL; } -static int recover_dentry(struct page *ipage, struct inode *inode) +static struct fsync_inode_entry *add_fsync_inode(struct f2fs_sb_info *sbi, + struct list_head *head, nid_t ino, bool quota_inode) { - void *kaddr = page_address(ipage); - struct f2fs_node *raw_node = (struct f2fs_node *)kaddr; - struct f2fs_inode *raw_inode = &(raw_node->i); + struct inode *inode; + struct fsync_inode_entry *entry; + int err; + + inode = f2fs_iget_retry(sbi->sb, ino); + if (IS_ERR(inode)) + return ERR_CAST(inode); + + err = f2fs_dquot_initialize(inode); + if (err) + goto err_out; + + if (quota_inode) { + err = dquot_alloc_inode(inode); + if (err) + goto err_out; + } + + entry = f2fs_kmem_cache_alloc(fsync_entry_slab, + GFP_F2FS_ZERO, true, NULL); + entry->inode = inode; + list_add_tail(&entry->list, head); + + return entry; +err_out: + iput(inode); + return 
ERR_PTR(err); +} + +static void del_fsync_inode(struct fsync_inode_entry *entry, int drop) +{ + if (drop) { + /* inode should not be recovered, drop it */ + f2fs_inode_synced(entry->inode); + } + iput(entry->inode); + list_del(&entry->list); + kmem_cache_free(fsync_entry_slab, entry); +} + +static int init_recovered_filename(const struct inode *dir, + struct f2fs_inode *raw_inode, + struct f2fs_filename *fname, + struct qstr *usr_fname) +{ + int err; + + memset(fname, 0, sizeof(*fname)); + fname->disk_name.len = le32_to_cpu(raw_inode->i_namelen); + fname->disk_name.name = raw_inode->i_name; + + if (WARN_ON(fname->disk_name.len > F2FS_NAME_LEN)) + return -ENAMETOOLONG; + + if (!IS_ENCRYPTED(dir)) { + usr_fname->name = fname->disk_name.name; + usr_fname->len = fname->disk_name.len; + fname->usr_fname = usr_fname; + } + + /* Compute the hash of the filename */ + if (IS_ENCRYPTED(dir) && IS_CASEFOLDED(dir)) { + /* + * In this case the hash isn't computable without the key, so it + * was saved on-disk. 
+ */ + if (fname->disk_name.len + sizeof(f2fs_hash_t) > F2FS_NAME_LEN) + return -EINVAL; + fname->hash = get_unaligned((f2fs_hash_t *) + &raw_inode->i_name[fname->disk_name.len]); + } else if (IS_CASEFOLDED(dir)) { + err = f2fs_init_casefolded_name(dir, fname); + if (err) + return err; + f2fs_hash_filename(dir, fname); + /* Case-sensitive match is fine for recovery */ + f2fs_free_casefolded_name(fname); + } else { + f2fs_hash_filename(dir, fname); + } + return 0; +} + +static int recover_dentry(struct inode *inode, struct folio *ifolio, + struct list_head *dir_list) +{ + struct f2fs_inode *raw_inode = F2FS_INODE(ifolio); nid_t pino = le32_to_cpu(raw_inode->i_pino); struct f2fs_dir_entry *de; - struct qstr name; - struct page *page; + struct f2fs_filename fname; + struct qstr usr_fname; + struct folio *folio; struct inode *dir, *einode; + struct fsync_inode_entry *entry; int err = 0; - - dir = check_dirty_dir_inode(F2FS_SB(inode->i_sb), pino); - if (!dir) { - dir = f2fs_iget(inode->i_sb, pino); - if (IS_ERR(dir)) { - err = PTR_ERR(dir); + char *name; + + entry = get_fsync_inode(dir_list, pino); + if (!entry) { + entry = add_fsync_inode(F2FS_I_SB(inode), dir_list, + pino, false); + if (IS_ERR(entry)) { + dir = ERR_CAST(entry); + err = PTR_ERR(entry); goto out; } - set_inode_flag(F2FS_I(dir), FI_DELAY_IPUT); - add_dirty_dir_inode(dir); } - name.len = le32_to_cpu(raw_inode->i_namelen); - name.name = raw_inode->i_name; -retry: - de = f2fs_find_entry(dir, &name, &page); - if (de && inode->i_ino == le32_to_cpu(de->ino)) { - kunmap(page); - f2fs_put_page(page, 0); + dir = entry->inode; + err = init_recovered_filename(dir, raw_inode, &fname, &usr_fname); + if (err) goto out; - } +retry: + de = __f2fs_find_entry(dir, &fname, &folio); + if (de && inode->i_ino == le32_to_cpu(de->ino)) + goto out_put; + if (de) { - einode = f2fs_iget(inode->i_sb, le32_to_cpu(de->ino)); + einode = f2fs_iget_retry(inode->i_sb, le32_to_cpu(de->ino)); if (IS_ERR(einode)) { WARN_ON(1); - if 
(PTR_ERR(einode) == -ENOENT) + err = PTR_ERR(einode); + if (err == -ENOENT) err = -EEXIST; - goto out; + goto out_put; + } + + err = f2fs_dquot_initialize(einode); + if (err) { + iput(einode); + goto out_put; } - f2fs_delete_entry(de, page, einode); + + err = f2fs_acquire_orphan_inode(F2FS_I_SB(inode)); + if (err) { + iput(einode); + goto out_put; + } + f2fs_delete_entry(de, folio, dir, einode); iput(einode); goto retry; + } else if (IS_ERR(folio)) { + err = PTR_ERR(folio); + } else { + err = f2fs_add_dentry(dir, &fname, inode, + inode->i_ino, inode->i_mode); } - err = __f2fs_add_link(dir, &name, inode); + if (err == -ENOMEM) + goto retry; + goto out; + +out_put: + f2fs_folio_put(folio, false); out: - f2fs_msg(inode->i_sb, KERN_NOTICE, "recover_inode and its dentry: " - "ino = %x, name = %s, dir = %lx, err = %d", - ino_of_node(ipage), raw_inode->i_name, - IS_ERR(dir) ? 0 : dir->i_ino, err); + if (file_enc_name(inode)) + name = "<encrypted>"; + else + name = raw_inode->i_name; + f2fs_notice(F2FS_I_SB(inode), "%s: ino = %x, name = %s, dir = %lx, err = %d", + __func__, ino_of_node(ifolio), name, + IS_ERR(dir) ? 
0 : dir->i_ino, err); + return err; +} + +static int recover_quota_data(struct inode *inode, struct folio *folio) +{ + struct f2fs_inode *raw = F2FS_INODE(folio); + struct iattr attr; + uid_t i_uid = le32_to_cpu(raw->i_uid); + gid_t i_gid = le32_to_cpu(raw->i_gid); + int err; + + memset(&attr, 0, sizeof(attr)); + + attr.ia_vfsuid = VFSUIDT_INIT(make_kuid(inode->i_sb->s_user_ns, i_uid)); + attr.ia_vfsgid = VFSGIDT_INIT(make_kgid(inode->i_sb->s_user_ns, i_gid)); + + if (!vfsuid_eq(attr.ia_vfsuid, i_uid_into_vfsuid(&nop_mnt_idmap, inode))) + attr.ia_valid |= ATTR_UID; + if (!vfsgid_eq(attr.ia_vfsgid, i_gid_into_vfsgid(&nop_mnt_idmap, inode))) + attr.ia_valid |= ATTR_GID; + + if (!attr.ia_valid) + return 0; + + err = dquot_transfer(&nop_mnt_idmap, inode, &attr); + if (err) + set_sbi_flag(F2FS_I_SB(inode), SBI_QUOTA_NEED_REPAIR); return err; } -static int recover_inode(struct inode *inode, struct page *node_page) +static void recover_inline_flags(struct inode *inode, struct f2fs_inode *ri) { - void *kaddr = page_address(node_page); - struct f2fs_node *raw_node = (struct f2fs_node *)kaddr; - struct f2fs_inode *raw_inode = &(raw_node->i); + if (ri->i_inline & F2FS_PIN_FILE) + set_inode_flag(inode, FI_PIN_FILE); + else + clear_inode_flag(inode, FI_PIN_FILE); + if (ri->i_inline & F2FS_DATA_EXIST) + set_inode_flag(inode, FI_DATA_EXIST); + else + clear_inode_flag(inode, FI_DATA_EXIST); +} + +static int recover_inode(struct inode *inode, struct folio *folio) +{ + struct f2fs_inode *raw = F2FS_INODE(folio); + struct f2fs_inode_info *fi = F2FS_I(inode); + char *name; + int err; + + inode->i_mode = le16_to_cpu(raw->i_mode); + + err = recover_quota_data(inode, folio); + if (err) + return err; + + i_uid_write(inode, le32_to_cpu(raw->i_uid)); + i_gid_write(inode, le32_to_cpu(raw->i_gid)); + + if (raw->i_inline & F2FS_EXTRA_ATTR) { + if (f2fs_sb_has_project_quota(F2FS_I_SB(inode)) && + F2FS_FITS_IN_INODE(raw, le16_to_cpu(raw->i_extra_isize), + i_projid)) { + projid_t i_projid; + 
kprojid_t kprojid; + + i_projid = (projid_t)le32_to_cpu(raw->i_projid); + kprojid = make_kprojid(&init_user_ns, i_projid); - if (!IS_INODE(node_page)) + if (!projid_eq(kprojid, fi->i_projid)) { + err = f2fs_transfer_project_quota(inode, + kprojid); + if (err) + return err; + fi->i_projid = kprojid; + } + } + } + + f2fs_i_size_write(inode, le64_to_cpu(raw->i_size)); + inode_set_atime(inode, le64_to_cpu(raw->i_atime), + le32_to_cpu(raw->i_atime_nsec)); + inode_set_ctime(inode, le64_to_cpu(raw->i_ctime), + le32_to_cpu(raw->i_ctime_nsec)); + inode_set_mtime(inode, le64_to_cpu(raw->i_mtime), + le32_to_cpu(raw->i_mtime_nsec)); + + fi->i_advise = raw->i_advise; + fi->i_flags = le32_to_cpu(raw->i_flags); + f2fs_set_inode_flags(inode); + fi->i_gc_failures = le16_to_cpu(raw->i_gc_failures); + + recover_inline_flags(inode, raw); + + f2fs_mark_inode_dirty_sync(inode, true); + + if (file_enc_name(inode)) + name = "<encrypted>"; + else + name = F2FS_INODE(folio)->i_name; + + f2fs_notice(F2FS_I_SB(inode), "recover_inode: ino = %x, name = %s, inline = %x", + ino_of_node(folio), name, raw->i_inline); + return 0; +} + +static unsigned int adjust_por_ra_blocks(struct f2fs_sb_info *sbi, + unsigned int ra_blocks, unsigned int blkaddr, + unsigned int next_blkaddr) +{ + if (blkaddr + 1 == next_blkaddr) + ra_blocks = min_t(unsigned int, RECOVERY_MAX_RA_BLOCKS, + ra_blocks * 2); + else if (next_blkaddr % BLKS_PER_SEG(sbi)) + ra_blocks = max_t(unsigned int, RECOVERY_MIN_RA_BLOCKS, + ra_blocks / 2); + return ra_blocks; +} + +/* Detect looped node chain with Floyd's cycle detection algorithm. 
*/ +static int sanity_check_node_chain(struct f2fs_sb_info *sbi, block_t blkaddr, + block_t *blkaddr_fast, bool *is_detecting) +{ + unsigned int ra_blocks = RECOVERY_MAX_RA_BLOCKS; + int i; + + if (!*is_detecting) return 0; - inode->i_mode = le16_to_cpu(raw_inode->i_mode); - i_size_write(inode, le64_to_cpu(raw_inode->i_size)); - inode->i_atime.tv_sec = le64_to_cpu(raw_inode->i_mtime); - inode->i_ctime.tv_sec = le64_to_cpu(raw_inode->i_ctime); - inode->i_mtime.tv_sec = le64_to_cpu(raw_inode->i_mtime); - inode->i_atime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec); - inode->i_ctime.tv_nsec = le32_to_cpu(raw_inode->i_ctime_nsec); - inode->i_mtime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec); + for (i = 0; i < 2; i++) { + struct folio *folio; - if (is_dent_dnode(node_page)) - return recover_dentry(node_page, inode); + if (!f2fs_is_valid_blkaddr(sbi, *blkaddr_fast, META_POR)) { + *is_detecting = false; + return 0; + } - f2fs_msg(inode->i_sb, KERN_NOTICE, "recover_inode: ino = %x, name = %s", - ino_of_node(node_page), raw_inode->i_name); + folio = f2fs_get_tmp_folio(sbi, *blkaddr_fast); + if (IS_ERR(folio)) + return PTR_ERR(folio); + + if (!is_recoverable_dnode(folio)) { + f2fs_folio_put(folio, true); + *is_detecting = false; + return 0; + } + + ra_blocks = adjust_por_ra_blocks(sbi, ra_blocks, *blkaddr_fast, + next_blkaddr_of_node(folio)); + + *blkaddr_fast = next_blkaddr_of_node(folio); + f2fs_folio_put(folio, true); + + f2fs_ra_meta_pages_cond(sbi, *blkaddr_fast, ra_blocks); + } + + if (*blkaddr_fast == blkaddr) { + f2fs_notice(sbi, "%s: Detect looped node chain on blkaddr:%u." 
+ " Run fsck to fix it.", __func__, blkaddr); + return -EINVAL; + } return 0; } -static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) +static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head, + bool check_only, bool *new_inode) { - unsigned long long cp_ver = le64_to_cpu(sbi->ckpt->checkpoint_ver); struct curseg_info *curseg; - struct page *page; - block_t blkaddr; + block_t blkaddr, blkaddr_fast; + bool is_detecting = true; int err = 0; /* get node pages in the current segment */ curseg = CURSEG_I(sbi, CURSEG_WARM_NODE); - blkaddr = START_BLOCK(sbi, curseg->segno) + curseg->next_blkoff; - - /* read node page */ - page = alloc_page(GFP_F2FS_ZERO); - if (IS_ERR(page)) - return PTR_ERR(page); - lock_page(page); + blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); + blkaddr_fast = blkaddr; while (1) { struct fsync_inode_entry *entry; + struct folio *folio; - err = f2fs_readpage(sbi, page, blkaddr, READ_SYNC); - if (err) - goto out; + if (!f2fs_is_valid_blkaddr(sbi, blkaddr, META_POR)) + return 0; - lock_page(page); + folio = f2fs_get_tmp_folio(sbi, blkaddr); + if (IS_ERR(folio)) { + err = PTR_ERR(folio); + break; + } - if (cp_ver != cpver_of_node(page)) + if (!is_recoverable_dnode(folio)) { + f2fs_folio_put(folio, true); break; + } - if (!is_fsync_dnode(page)) + if (!is_fsync_dnode(folio)) goto next; - entry = get_fsync_inode(head, ino_of_node(page)); - if (entry) { - if (IS_INODE(page) && is_dent_dnode(page)) - set_inode_flag(F2FS_I(entry->inode), - FI_INC_LINK); - } else { - if (IS_INODE(page) && is_dent_dnode(page)) { - err = recover_inode_page(sbi, page); - if (err) - break; - } + entry = get_fsync_inode(head, ino_of_node(folio)); + if (!entry) { + bool quota_inode = false; - /* add this fsync inode to the list */ - entry = kmem_cache_alloc(fsync_entry_slab, GFP_NOFS); - if (!entry) { - err = -ENOMEM; - break; + if (!check_only && + IS_INODE(folio) && + is_dent_dnode(folio)) { + err = f2fs_recover_inode_page(sbi, folio); + if 
(err) { + f2fs_folio_put(folio, true); + break; + } + quota_inode = true; } - entry->inode = f2fs_iget(sbi->sb, ino_of_node(page)); - if (IS_ERR(entry->inode)) { - err = PTR_ERR(entry->inode); - kmem_cache_free(fsync_entry_slab, entry); + entry = add_fsync_inode(sbi, head, ino_of_node(folio), + quota_inode); + if (IS_ERR(entry)) { + err = PTR_ERR(entry); + /* + * CP | dnode(F) | inode(DF) + * For this case, we should not give up now. + */ + if (err == -ENOENT) { + if (check_only) + *new_inode = true; + goto next; + } + f2fs_folio_put(folio, true); break; } - list_add_tail(&entry->list, head); } entry->blkaddr = blkaddr; - err = recover_inode(entry->inode, page); - if (err && err != -ENOENT) - break; + if (IS_INODE(folio) && is_dent_dnode(folio)) + entry->last_dentry = blkaddr; next: /* check next segment */ - blkaddr = next_blkaddr_of_node(page); + blkaddr = next_blkaddr_of_node(folio); + f2fs_folio_put(folio, true); + + err = sanity_check_node_chain(sbi, blkaddr, &blkaddr_fast, + &is_detecting); + if (err) + break; } - unlock_page(page); -out: - __free_pages(page, 0); return err; } -static void destroy_fsync_dnodes(struct list_head *head) +static void destroy_fsync_dnodes(struct list_head *head, int drop) { struct fsync_inode_entry *entry, *tmp; - list_for_each_entry_safe(entry, tmp, head, list) { - iput(entry->inode); - list_del(&entry->list); - kmem_cache_free(fsync_entry_slab, entry); - } + list_for_each_entry_safe(entry, tmp, head, list) + del_fsync_inode(entry, drop); } static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi, @@ -208,13 +494,14 @@ static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi, { struct seg_entry *sentry; unsigned int segno = GET_SEGNO(sbi, blkaddr); - unsigned short blkoff = GET_SEGOFF_FROM_SEG0(sbi, blkaddr) & - (sbi->blocks_per_seg - 1); + unsigned short blkoff = GET_BLKOFF_FROM_SEG0(sbi, blkaddr); + struct f2fs_summary_block *sum_node; struct f2fs_summary sum; + struct folio *sum_folio, *node_folio; + struct 
dnode_of_data tdn = *dn; nid_t ino, nid; - void *kaddr; struct inode *inode; - struct page *node_page; + unsigned int offset, ofs_in_node, max_addrs; block_t bidx; int i; @@ -223,224 +510,458 @@ static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi, return 0; /* Get the previous summary */ - for (i = CURSEG_WARM_DATA; i <= CURSEG_COLD_DATA; i++) { + for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) { struct curseg_info *curseg = CURSEG_I(sbi, i); + if (curseg->segno == segno) { sum = curseg->sum_blk->entries[blkoff]; - break; + goto got_it; } } - if (i > CURSEG_COLD_DATA) { - struct page *sum_page = get_sum_page(sbi, segno); - struct f2fs_summary_block *sum_node; - kaddr = page_address(sum_page); - sum_node = (struct f2fs_summary_block *)kaddr; - sum = sum_node->entries[blkoff]; - f2fs_put_page(sum_page, 1); - } + sum_folio = f2fs_get_sum_folio(sbi, segno); + if (IS_ERR(sum_folio)) + return PTR_ERR(sum_folio); + sum_node = SUM_BLK_PAGE_ADDR(sum_folio, segno); + sum = sum_node->entries[blkoff]; + f2fs_folio_put(sum_folio, true); +got_it: /* Use the locked dnode page and inode */ nid = le32_to_cpu(sum.nid); + ofs_in_node = le16_to_cpu(sum.ofs_in_node); + + max_addrs = ADDRS_PER_PAGE(dn->node_folio, dn->inode); + if (ofs_in_node >= max_addrs) { + f2fs_err(sbi, "Inconsistent ofs_in_node:%u in summary, ino:%lu, nid:%u, max:%u", + ofs_in_node, dn->inode->i_ino, nid, max_addrs); + f2fs_handle_error(sbi, ERROR_INCONSISTENT_SUMMARY); + return -EFSCORRUPTED; + } + if (dn->inode->i_ino == nid) { - struct dnode_of_data tdn = *dn; tdn.nid = nid; - tdn.node_page = dn->inode_page; - tdn.ofs_in_node = le16_to_cpu(sum.ofs_in_node); - truncate_data_blocks_range(&tdn, 1); - return 0; + if (!dn->inode_folio_locked) + folio_lock(dn->inode_folio); + tdn.node_folio = dn->inode_folio; + tdn.ofs_in_node = ofs_in_node; + goto truncate_out; } else if (dn->nid == nid) { - struct dnode_of_data tdn = *dn; - tdn.ofs_in_node = le16_to_cpu(sum.ofs_in_node); - 
truncate_data_blocks_range(&tdn, 1); - return 0; + tdn.ofs_in_node = ofs_in_node; + goto truncate_out; } /* Get the node page */ - node_page = get_node_page(sbi, nid); - if (IS_ERR(node_page)) - return PTR_ERR(node_page); - bidx = start_bidx_of_node(ofs_of_node(node_page)) + - le16_to_cpu(sum.ofs_in_node); - ino = ino_of_node(node_page); - f2fs_put_page(node_page, 1); - - /* Deallocate previous index in the node page */ - inode = f2fs_iget(sbi->sb, ino); - if (IS_ERR(inode)) - return PTR_ERR(inode); + node_folio = f2fs_get_node_folio(sbi, nid, NODE_TYPE_REGULAR); + if (IS_ERR(node_folio)) + return PTR_ERR(node_folio); + + offset = ofs_of_node(node_folio); + ino = ino_of_node(node_folio); + f2fs_folio_put(node_folio, true); + + if (ino != dn->inode->i_ino) { + int ret; + + /* Deallocate previous index in the node page */ + inode = f2fs_iget_retry(sbi->sb, ino); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + ret = f2fs_dquot_initialize(inode); + if (ret) { + iput(inode); + return ret; + } + } else { + inode = dn->inode; + } - truncate_hole(inode, bidx, bidx + 1); - iput(inode); + bidx = f2fs_start_bidx_of_node(offset, inode) + + le16_to_cpu(sum.ofs_in_node); + + /* + * if inode page is locked, unlock temporarily, but its reference + * count keeps alive. 
+ */ + if (ino == dn->inode->i_ino && dn->inode_folio_locked) + folio_unlock(dn->inode_folio); + + set_new_dnode(&tdn, inode, NULL, NULL, 0); + if (f2fs_get_dnode_of_data(&tdn, bidx, LOOKUP_NODE)) + goto out; + + if (tdn.data_blkaddr == blkaddr) + f2fs_truncate_data_blocks_range(&tdn, 1); + + f2fs_put_dnode(&tdn); +out: + if (ino != dn->inode->i_ino) + iput(inode); + else if (dn->inode_folio_locked) + folio_lock(dn->inode_folio); + return 0; + +truncate_out: + if (f2fs_data_blkaddr(&tdn) == blkaddr) + f2fs_truncate_data_blocks_range(&tdn, 1); + if (dn->inode->i_ino == nid && !dn->inode_folio_locked) + folio_unlock(dn->inode_folio); return 0; } +static int f2fs_reserve_new_block_retry(struct dnode_of_data *dn) +{ + int i, err = 0; + + for (i = DEFAULT_FAILURE_RETRY_COUNT; i > 0; i--) { + err = f2fs_reserve_new_block(dn); + if (!err) + break; + } + + return err; +} + static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, - struct page *page, block_t blkaddr) + struct folio *folio) { - unsigned int start, end; struct dnode_of_data dn; - struct f2fs_summary sum; struct node_info ni; + unsigned int start = 0, end = 0, index; int err = 0, recovered = 0; - int ilock; - start = start_bidx_of_node(ofs_of_node(page)); - if (IS_INODE(page)) - end = start + ADDRS_PER_INODE; - else - end = start + ADDRS_PER_BLOCK; + /* step 1: recover xattr */ + if (IS_INODE(folio)) { + err = f2fs_recover_inline_xattr(inode, folio); + if (err) + goto out; + } else if (f2fs_has_xattr_block(ofs_of_node(folio))) { + err = f2fs_recover_xattr_data(inode, folio); + if (!err) + recovered++; + goto out; + } - ilock = mutex_lock_op(sbi); - set_new_dnode(&dn, inode, NULL, NULL, 0); + /* step 2: recover inline data */ + err = f2fs_recover_inline_data(inode, folio); + if (err) { + if (err == 1) + err = 0; + goto out; + } - err = get_dnode_of_data(&dn, start, ALLOC_NODE); + /* step 3: recover data indices */ + start = f2fs_start_bidx_of_node(ofs_of_node(folio), inode); + end = start + 
ADDRS_PER_PAGE(folio, inode); + + set_new_dnode(&dn, inode, NULL, NULL, 0); +retry_dn: + err = f2fs_get_dnode_of_data(&dn, start, ALLOC_NODE); if (err) { - mutex_unlock_op(sbi, ilock); - return err; + if (err == -ENOMEM) { + memalloc_retry_wait(GFP_NOFS); + goto retry_dn; + } + goto out; } - wait_on_page_writeback(dn.node_page); + f2fs_folio_wait_writeback(dn.node_folio, NODE, true, true); + + err = f2fs_get_node_info(sbi, dn.nid, &ni, false); + if (err) + goto err; + + f2fs_bug_on(sbi, ni.ino != ino_of_node(folio)); - get_node_info(sbi, dn.nid, &ni); - BUG_ON(ni.ino != ino_of_node(page)); - BUG_ON(ofs_of_node(dn.node_page) != ofs_of_node(page)); + if (ofs_of_node(dn.node_folio) != ofs_of_node(folio)) { + f2fs_warn(sbi, "Inconsistent ofs_of_node, ino:%lu, ofs:%u, %u", + inode->i_ino, ofs_of_node(dn.node_folio), + ofs_of_node(folio)); + err = -EFSCORRUPTED; + f2fs_handle_error(sbi, ERROR_INCONSISTENT_FOOTER); + goto err; + } - for (; start < end; start++) { + for (index = start; index < end; index++, dn.ofs_in_node++) { block_t src, dest; - src = datablock_addr(dn.node_page, dn.ofs_in_node); - dest = datablock_addr(page, dn.ofs_in_node); + src = f2fs_data_blkaddr(&dn); + dest = data_blkaddr(dn.inode, folio, dn.ofs_in_node); - if (src != dest && dest != NEW_ADDR && dest != NULL_ADDR) { + if (__is_valid_data_blkaddr(src) && + !f2fs_is_valid_blkaddr(sbi, src, META_POR)) { + err = -EFSCORRUPTED; + goto err; + } + + if (__is_valid_data_blkaddr(dest) && + !f2fs_is_valid_blkaddr(sbi, dest, META_POR)) { + err = -EFSCORRUPTED; + goto err; + } + + /* skip recovering if dest is the same as src */ + if (src == dest) + continue; + + /* dest is invalid, just invalidate src block */ + if (dest == NULL_ADDR) { + f2fs_truncate_data_blocks_range(&dn, 1); + continue; + } + + if (!file_keep_isize(inode) && + (i_size_read(inode) <= ((loff_t)index << PAGE_SHIFT))) + f2fs_i_size_write(inode, + (loff_t)(index + 1) << PAGE_SHIFT); + + /* + * dest is reserved block, invalidate src block + * 
and then reserve one new block in dnode page. + */ + if (dest == NEW_ADDR) { + f2fs_truncate_data_blocks_range(&dn, 1); + + err = f2fs_reserve_new_block_retry(&dn); + if (err) + goto err; + continue; + } + + /* dest is valid block, try to recover from src to dest */ + if (f2fs_is_valid_blkaddr(sbi, dest, META_POR)) { if (src == NULL_ADDR) { - int err = reserve_new_block(&dn); - /* We should not get -ENOSPC */ - BUG_ON(err); + err = f2fs_reserve_new_block_retry(&dn); + if (err) + goto err; } - +retry_prev: /* Check the previous node page having this index */ err = check_index_in_prev_nodes(sbi, dest, &dn); - if (err) + if (err) { + if (err == -ENOMEM) { + memalloc_retry_wait(GFP_NOFS); + goto retry_prev; + } goto err; + } - set_summary(&sum, dn.nid, dn.ofs_in_node, ni.version); + if (f2fs_is_valid_blkaddr(sbi, dest, + DATA_GENERIC_ENHANCE_UPDATE)) { + f2fs_err(sbi, "Inconsistent dest blkaddr:%u, ino:%lu, ofs:%u", + dest, inode->i_ino, dn.ofs_in_node); + err = -EFSCORRUPTED; + goto err; + } /* write dummy data page */ - recover_data_page(sbi, NULL, &sum, src, dest); - update_extent_cache(dest, &dn); + f2fs_replace_block(sbi, &dn, src, dest, + ni.version, false, false); recovered++; } - dn.ofs_in_node++; } - /* write node page in place */ - set_summary(&sum, dn.nid, 0, 0); - if (IS_INODE(dn.node_page)) - sync_inode_page(&dn); - - copy_node_footer(dn.node_page, page); - fill_node_footer(dn.node_page, dn.nid, ni.ino, - ofs_of_node(page), false); - set_page_dirty(dn.node_page); - - recover_node_page(sbi, dn.node_page, &sum, &ni, blkaddr); + copy_node_footer(dn.node_folio, folio); + fill_node_footer(dn.node_folio, dn.nid, ni.ino, + ofs_of_node(folio), false); + folio_mark_dirty(dn.node_folio); err: f2fs_put_dnode(&dn); - mutex_unlock_op(sbi, ilock); - - f2fs_msg(sbi->sb, KERN_NOTICE, "recover_data: ino = %lx, " - "recovered_data = %d blocks, err = %d", - inode->i_ino, recovered, err); +out: + f2fs_notice(sbi, "recover_data: ino = %lx, nid = %x (i_size: %s), " + "range 
(%u, %u), recovered = %d, err = %d", + inode->i_ino, nid_of_node(folio), + file_keep_isize(inode) ? "keep" : "recover", + start, end, recovered, err); return err; } -static int recover_data(struct f2fs_sb_info *sbi, - struct list_head *head, int type) +static int recover_data(struct f2fs_sb_info *sbi, struct list_head *inode_list, + struct list_head *tmp_inode_list, struct list_head *dir_list) { - unsigned long long cp_ver = le64_to_cpu(sbi->ckpt->checkpoint_ver); struct curseg_info *curseg; - struct page *page; int err = 0; block_t blkaddr; + unsigned int ra_blocks = RECOVERY_MAX_RA_BLOCKS; + unsigned int recoverable_dnode = 0; + unsigned int fsynced_dnode = 0; + unsigned int total_dnode = 0; + unsigned int recovered_inode = 0; + unsigned int recovered_dentry = 0; + unsigned int recovered_dnode = 0; + + f2fs_notice(sbi, "do_recover_data: start to recover dnode"); /* get node pages in the current segment */ - curseg = CURSEG_I(sbi, type); + curseg = CURSEG_I(sbi, CURSEG_WARM_NODE); blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); - /* read node page */ - page = alloc_page(GFP_NOFS | __GFP_ZERO); - if (IS_ERR(page)) - return -ENOMEM; - - lock_page(page); - while (1) { struct fsync_inode_entry *entry; + struct folio *folio; - err = f2fs_readpage(sbi, page, blkaddr, READ_SYNC); - if (err) - goto out; + if (!f2fs_is_valid_blkaddr(sbi, blkaddr, META_POR)) + break; - lock_page(page); + folio = f2fs_get_tmp_folio(sbi, blkaddr); + if (IS_ERR(folio)) { + err = PTR_ERR(folio); + break; + } - if (cp_ver != cpver_of_node(page)) + if (!is_recoverable_dnode(folio)) { + f2fs_folio_put(folio, true); break; + } + recoverable_dnode++; - entry = get_fsync_inode(head, ino_of_node(page)); + entry = get_fsync_inode(inode_list, ino_of_node(folio)); if (!entry) goto next; - - err = do_recover_data(sbi, entry->inode, page, blkaddr); - if (err) + fsynced_dnode++; + /* + * inode(x) | CP | inode(x) | dnode(F) + * In this case, we can lose the latest inode(x). 
+ * So, call recover_inode for the inode update. + */ + if (IS_INODE(folio)) { + err = recover_inode(entry->inode, folio); + if (err) { + f2fs_folio_put(folio, true); + break; + } + recovered_inode++; + } + if (entry->last_dentry == blkaddr) { + err = recover_dentry(entry->inode, folio, dir_list); + if (err) { + f2fs_folio_put(folio, true); + break; + } + recovered_dentry++; + } + err = do_recover_data(sbi, entry->inode, folio); + if (err) { + f2fs_folio_put(folio, true); break; - - if (entry->blkaddr == blkaddr) { - iput(entry->inode); - list_del(&entry->list); - kmem_cache_free(fsync_entry_slab, entry); } + recovered_dnode++; + + if (entry->blkaddr == blkaddr) + list_move_tail(&entry->list, tmp_inode_list); next: + ra_blocks = adjust_por_ra_blocks(sbi, ra_blocks, blkaddr, + next_blkaddr_of_node(folio)); + /* check next segment */ - blkaddr = next_blkaddr_of_node(page); - } - unlock_page(page); -out: - __free_pages(page, 0); + blkaddr = next_blkaddr_of_node(folio); + f2fs_folio_put(folio, true); + f2fs_ra_meta_pages_cond(sbi, blkaddr, ra_blocks); + total_dnode++; + } if (!err) - allocate_new_segments(sbi); + err = f2fs_allocate_new_segments(sbi); + + f2fs_notice(sbi, "do_recover_data: dnode: (recoverable: %u, fsynced: %u, " + "total: %u), recovered: (inode: %u, dentry: %u, dnode: %u), err: %d", + recoverable_dnode, fsynced_dnode, total_dnode, recovered_inode, + recovered_dentry, recovered_dnode, err); return err; } -int recover_fsync_data(struct f2fs_sb_info *sbi) +int f2fs_recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) { - struct list_head inode_list; + LIST_HEAD(inode_list); + LIST_HEAD(tmp_inode_list); + LIST_HEAD(dir_list); int err; + int ret = 0; + unsigned long s_flags = sbi->sb->s_flags; + bool need_writecp = false; + bool new_inode = false; - fsync_entry_slab = f2fs_kmem_cache_create("f2fs_fsync_inode_entry", - sizeof(struct fsync_inode_entry), NULL); - if (unlikely(!fsync_entry_slab)) - return -ENOMEM; + f2fs_notice(sbi, 
"f2fs_recover_fsync_data: recovery fsync data, " + "check_only: %d", check_only); + + if (is_sbi_flag_set(sbi, SBI_IS_WRITABLE)) + f2fs_info(sbi, "recover fsync data on readonly fs"); - INIT_LIST_HEAD(&inode_list); + /* prevent checkpoint */ + f2fs_down_write(&sbi->cp_global_sem); /* step #1: find fsynced inode numbers */ - sbi->por_doing = 1; - err = find_fsync_dnodes(sbi, &inode_list); - if (err) - goto out; + err = find_fsync_dnodes(sbi, &inode_list, check_only, &new_inode); + if (err < 0 || (list_empty(&inode_list) && (!check_only || !new_inode))) + goto skip; - if (list_empty(&inode_list)) - goto out; + if (check_only) { + ret = 1; + goto skip; + } + + need_writecp = true; /* step #2: recover data */ - err = recover_data(sbi, &inode_list, CURSEG_WARM_NODE); - BUG_ON(!list_empty(&inode_list)); -out: - destroy_fsync_dnodes(&inode_list); - kmem_cache_destroy(fsync_entry_slab); - sbi->por_doing = 0; + err = recover_data(sbi, &inode_list, &tmp_inode_list, &dir_list); if (!err) - write_checkpoint(sbi, false); - return err; + f2fs_bug_on(sbi, !list_empty(&inode_list)); + else + f2fs_bug_on(sbi, sbi->sb->s_flags & SB_ACTIVE); +skip: + destroy_fsync_dnodes(&inode_list, err); + destroy_fsync_dnodes(&tmp_inode_list, err); + + /* truncate meta pages to be used by the recovery */ + truncate_inode_pages_range(META_MAPPING(sbi), + (loff_t)MAIN_BLKADDR(sbi) << PAGE_SHIFT, -1); + + if (err) { + truncate_inode_pages_final(NODE_MAPPING(sbi)); + truncate_inode_pages_final(META_MAPPING(sbi)); + } + + /* + * If fsync data succeeds or there is no fsync data to recover, + * and the f2fs is not read only, check and fix zoned block devices' + * write pointer consistency. 
+ */ + if (!err) + err = f2fs_check_and_fix_write_pointer(sbi); + + if (!err) + clear_sbi_flag(sbi, SBI_POR_DOING); + + f2fs_up_write(&sbi->cp_global_sem); + + /* let's drop all the directory inodes for clean checkpoint */ + destroy_fsync_dnodes(&dir_list, err); + + if (need_writecp) { + set_sbi_flag(sbi, SBI_IS_RECOVERED); + + if (!err) { + struct cp_control cpc = { + .reason = CP_RECOVERY, + }; + stat_inc_cp_call_count(sbi, TOTAL_CALL); + err = f2fs_write_checkpoint(sbi, &cpc); + } + } + + sbi->sb->s_flags = s_flags; /* Restore SB_RDONLY status */ + + return ret ? ret : err; +} + +int __init f2fs_create_recovery_cache(void) +{ + fsync_entry_slab = f2fs_kmem_cache_create("f2fs_fsync_inode_entry", + sizeof(struct fsync_inode_entry)); + return fsync_entry_slab ? 0 : -ENOMEM; +} + +void f2fs_destroy_recovery_cache(void) +{ + kmem_cache_destroy(fsync_entry_slab); } diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index a86d125a9885..c26424f47686 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1,39 +1,770 @@ +// SPDX-License-Identifier: GPL-2.0 /* * fs/f2fs/segment.c * * Copyright (c) 2012 Samsung Electronics Co., Ltd. * http://www.samsung.com/ - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. 
*/ #include <linux/fs.h> #include <linux/f2fs_fs.h> #include <linux/bio.h> #include <linux/blkdev.h> +#include <linux/sched/mm.h> #include <linux/prefetch.h> -#include <linux/vmalloc.h> +#include <linux/kthread.h> +#include <linux/swap.h> +#include <linux/timer.h> +#include <linux/freezer.h> +#include <linux/sched/signal.h> +#include <linux/random.h> #include "f2fs.h" #include "segment.h" #include "node.h" +#include "gc.h" +#include "iostat.h" #include <trace/events/f2fs.h> +#define __reverse_ffz(x) __reverse_ffs(~(x)) + +static struct kmem_cache *discard_entry_slab; +static struct kmem_cache *discard_cmd_slab; +static struct kmem_cache *sit_entry_set_slab; +static struct kmem_cache *revoke_entry_slab; + +static unsigned long __reverse_ulong(unsigned char *str) +{ + unsigned long tmp = 0; + int shift = 24, idx = 0; + +#if BITS_PER_LONG == 64 + shift = 56; +#endif + while (shift >= 0) { + tmp |= (unsigned long)str[idx++] << shift; + shift -= BITS_PER_BYTE; + } + return tmp; +} + +/* + * __reverse_ffs is copied from include/asm-generic/bitops/__ffs.h since + * MSB and LSB are reversed in a byte by f2fs_set_bit. + */ +static inline unsigned long __reverse_ffs(unsigned long word) +{ + int num = 0; + +#if BITS_PER_LONG == 64 + if ((word & 0xffffffff00000000UL) == 0) + num += 32; + else + word >>= 32; +#endif + if ((word & 0xffff0000) == 0) + num += 16; + else + word >>= 16; + + if ((word & 0xff00) == 0) + num += 8; + else + word >>= 8; + + if ((word & 0xf0) == 0) + num += 4; + else + word >>= 4; + + if ((word & 0xc) == 0) + num += 2; + else + word >>= 2; + + if ((word & 0x2) == 0) + num += 1; + return num; +} + +/* + * __find_rev_next(_zero)_bit is copied from lib/find_next_bit.c because + * f2fs_set_bit makes MSB and LSB reversed in a byte. + * @size must be integral times of unsigned long. 
+ * Example: + * MSB <--> LSB + * f2fs_set_bit(0, bitmap) => 1000 0000 + * f2fs_set_bit(7, bitmap) => 0000 0001 + */ +static unsigned long __find_rev_next_bit(const unsigned long *addr, + unsigned long size, unsigned long offset) +{ + const unsigned long *p = addr + BIT_WORD(offset); + unsigned long result = size; + unsigned long tmp; + + if (offset >= size) + return size; + + size -= (offset & ~(BITS_PER_LONG - 1)); + offset %= BITS_PER_LONG; + + while (1) { + if (*p == 0) + goto pass; + + tmp = __reverse_ulong((unsigned char *)p); + + tmp &= ~0UL >> offset; + if (size < BITS_PER_LONG) + tmp &= (~0UL << (BITS_PER_LONG - size)); + if (tmp) + goto found; +pass: + if (size <= BITS_PER_LONG) + break; + size -= BITS_PER_LONG; + offset = 0; + p++; + } + return result; +found: + return result - size + __reverse_ffs(tmp); +} + +static unsigned long __find_rev_next_zero_bit(const unsigned long *addr, + unsigned long size, unsigned long offset) +{ + const unsigned long *p = addr + BIT_WORD(offset); + unsigned long result = size; + unsigned long tmp; + + if (offset >= size) + return size; + + size -= (offset & ~(BITS_PER_LONG - 1)); + offset %= BITS_PER_LONG; + + while (1) { + if (*p == ~0UL) + goto pass; + + tmp = __reverse_ulong((unsigned char *)p); + + if (offset) + tmp |= ~0UL << (BITS_PER_LONG - offset); + if (size < BITS_PER_LONG) + tmp |= ~0UL >> size; + if (tmp != ~0UL) + goto found; +pass: + if (size <= BITS_PER_LONG) + break; + size -= BITS_PER_LONG; + offset = 0; + p++; + } + return result; +found: + return result - size + __reverse_ffz(tmp); +} + +bool f2fs_need_SSR(struct f2fs_sb_info *sbi) +{ + int node_secs = get_blocktype_secs(sbi, F2FS_DIRTY_NODES); + int dent_secs = get_blocktype_secs(sbi, F2FS_DIRTY_DENTS); + int imeta_secs = get_blocktype_secs(sbi, F2FS_DIRTY_IMETA); + + if (f2fs_lfs_mode(sbi)) + return false; + if (sbi->gc_mode == GC_URGENT_HIGH) + return true; + if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) + return true; + + return 
free_sections(sbi) <= (node_secs + 2 * dent_secs + imeta_secs + + SM_I(sbi)->min_ssr_sections + reserved_sections(sbi)); +} + +void f2fs_abort_atomic_write(struct inode *inode, bool clean) +{ + struct f2fs_inode_info *fi = F2FS_I(inode); + + if (!f2fs_is_atomic_file(inode)) + return; + + if (clean) + truncate_inode_pages_final(inode->i_mapping); + + release_atomic_write_cnt(inode); + clear_inode_flag(inode, FI_ATOMIC_COMMITTED); + clear_inode_flag(inode, FI_ATOMIC_REPLACE); + clear_inode_flag(inode, FI_ATOMIC_FILE); + if (is_inode_flag_set(inode, FI_ATOMIC_DIRTIED)) { + clear_inode_flag(inode, FI_ATOMIC_DIRTIED); + /* + * The vfs inode keeps clean during commit, but the f2fs inode + * doesn't. So clear the dirty state after commit and let + * f2fs_mark_inode_dirty_sync ensure a consistent dirty state. + */ + f2fs_inode_synced(inode); + f2fs_mark_inode_dirty_sync(inode, true); + } + stat_dec_atomic_inode(inode); + + F2FS_I(inode)->atomic_write_task = NULL; + + if (clean) { + f2fs_i_size_write(inode, fi->original_i_size); + fi->original_i_size = 0; + } + /* avoid stale dirty inode during eviction */ + sync_inode_metadata(inode, 0); +} + +static int __replace_atomic_write_block(struct inode *inode, pgoff_t index, + block_t new_addr, block_t *old_addr, bool recover) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct dnode_of_data dn; + struct node_info ni; + int err; + +retry: + set_new_dnode(&dn, inode, NULL, NULL, 0); + err = f2fs_get_dnode_of_data(&dn, index, ALLOC_NODE); + if (err) { + if (err == -ENOMEM) { + memalloc_retry_wait(GFP_NOFS); + goto retry; + } + return err; + } + + err = f2fs_get_node_info(sbi, dn.nid, &ni, false); + if (err) { + f2fs_put_dnode(&dn); + return err; + } + + if (recover) { + /* dn.data_blkaddr is always valid */ + if (!__is_valid_data_blkaddr(new_addr)) { + if (new_addr == NULL_ADDR) + dec_valid_block_count(sbi, inode, 1); + f2fs_invalidate_blocks(sbi, dn.data_blkaddr, 1); + f2fs_update_data_blkaddr(&dn, new_addr); + } else { + 
f2fs_replace_block(sbi, &dn, dn.data_blkaddr, + new_addr, ni.version, true, true); + } + } else { + blkcnt_t count = 1; + + err = inc_valid_block_count(sbi, inode, &count, true); + if (err) { + f2fs_put_dnode(&dn); + return err; + } + + *old_addr = dn.data_blkaddr; + f2fs_truncate_data_blocks_range(&dn, 1); + dec_valid_block_count(sbi, F2FS_I(inode)->cow_inode, count); + + f2fs_replace_block(sbi, &dn, dn.data_blkaddr, new_addr, + ni.version, true, false); + } + + f2fs_put_dnode(&dn); + + trace_f2fs_replace_atomic_write_block(inode, F2FS_I(inode)->cow_inode, + index, old_addr ? *old_addr : 0, new_addr, recover); + return 0; +} + +static void __complete_revoke_list(struct inode *inode, struct list_head *head, + bool revoke) +{ + struct revoke_entry *cur, *tmp; + pgoff_t start_index = 0; + bool truncate = is_inode_flag_set(inode, FI_ATOMIC_REPLACE); + + list_for_each_entry_safe(cur, tmp, head, list) { + if (revoke) { + __replace_atomic_write_block(inode, cur->index, + cur->old_addr, NULL, true); + } else if (truncate) { + f2fs_truncate_hole(inode, start_index, cur->index); + start_index = cur->index + 1; + } + + list_del(&cur->list); + kmem_cache_free(revoke_entry_slab, cur); + } + + if (!revoke && truncate) + f2fs_do_truncate_blocks(inode, start_index * PAGE_SIZE, false); +} + +static int __f2fs_commit_atomic_write(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct f2fs_inode_info *fi = F2FS_I(inode); + struct inode *cow_inode = fi->cow_inode; + struct revoke_entry *new; + struct list_head revoke_list; + block_t blkaddr; + struct dnode_of_data dn; + pgoff_t len = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); + pgoff_t off = 0, blen, index; + int ret = 0, i; + + INIT_LIST_HEAD(&revoke_list); + + while (len) { + blen = min_t(pgoff_t, ADDRS_PER_BLOCK(cow_inode), len); + + set_new_dnode(&dn, cow_inode, NULL, NULL, 0); + ret = f2fs_get_dnode_of_data(&dn, off, LOOKUP_NODE_RA); + if (ret && ret != -ENOENT) { + goto out; + } else if (ret == 
-ENOENT) { + ret = 0; + if (dn.max_level == 0) + goto out; + goto next; + } + + blen = min((pgoff_t)ADDRS_PER_PAGE(dn.node_folio, cow_inode), + len); + index = off; + for (i = 0; i < blen; i++, dn.ofs_in_node++, index++) { + blkaddr = f2fs_data_blkaddr(&dn); + + if (!__is_valid_data_blkaddr(blkaddr)) { + continue; + } else if (!f2fs_is_valid_blkaddr(sbi, blkaddr, + DATA_GENERIC_ENHANCE)) { + f2fs_put_dnode(&dn); + ret = -EFSCORRUPTED; + goto out; + } + + new = f2fs_kmem_cache_alloc(revoke_entry_slab, GFP_NOFS, + true, NULL); + + ret = __replace_atomic_write_block(inode, index, blkaddr, + &new->old_addr, false); + if (ret) { + f2fs_put_dnode(&dn); + kmem_cache_free(revoke_entry_slab, new); + goto out; + } + + f2fs_update_data_blkaddr(&dn, NULL_ADDR); + new->index = index; + list_add_tail(&new->list, &revoke_list); + } + f2fs_put_dnode(&dn); +next: + off += blen; + len -= blen; + } + +out: + if (time_to_inject(sbi, FAULT_TIMEOUT)) + f2fs_io_schedule_timeout_killable(DEFAULT_FAULT_TIMEOUT); + + if (ret) { + sbi->revoked_atomic_block += fi->atomic_write_cnt; + } else { + sbi->committed_atomic_block += fi->atomic_write_cnt; + set_inode_flag(inode, FI_ATOMIC_COMMITTED); + + /* + * inode may has no FI_ATOMIC_DIRTIED flag due to no write + * before commit. + */ + if (is_inode_flag_set(inode, FI_ATOMIC_DIRTIED)) { + /* clear atomic dirty status and set vfs dirty status */ + clear_inode_flag(inode, FI_ATOMIC_DIRTIED); + f2fs_mark_inode_dirty_sync(inode, true); + } + } + + __complete_revoke_list(inode, &revoke_list, ret ? 
true : false); + + return ret; +} + +int f2fs_commit_atomic_write(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct f2fs_inode_info *fi = F2FS_I(inode); + int err; + + err = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX); + if (err) + return err; + + f2fs_down_write(&fi->i_gc_rwsem[WRITE]); + f2fs_lock_op(sbi); + + err = __f2fs_commit_atomic_write(inode); + + f2fs_unlock_op(sbi); + f2fs_up_write(&fi->i_gc_rwsem[WRITE]); + + return err; +} + /* * This function balances dirty node and dentry pages. * In addition, it controls garbage collection. */ -void f2fs_balance_fs(struct f2fs_sb_info *sbi) +void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need) { + if (f2fs_cp_error(sbi)) + return; + + if (time_to_inject(sbi, FAULT_CHECKPOINT)) + f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_FAULT_INJECT); + + /* balance_fs_bg is able to be pending */ + if (need && excess_cached_nats(sbi)) + f2fs_balance_fs_bg(sbi, false); + + if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) + return; + /* * We should do GC or end up with checkpoint, if there are so many dirty * dir/node pages without enough free segments. */ - if (has_not_enough_free_secs(sbi, 0)) { - mutex_lock(&sbi->gc_mutex); - f2fs_gc(sbi); + if (has_enough_free_secs(sbi, 0, 0)) + return; + + if (test_opt(sbi, GC_MERGE) && sbi->gc_thread && + sbi->gc_thread->f2fs_gc_task) { + DEFINE_WAIT(wait); + + prepare_to_wait(&sbi->gc_thread->fggc_wq, &wait, + TASK_UNINTERRUPTIBLE); + wake_up(&sbi->gc_thread->gc_wait_queue_head); + io_schedule(); + finish_wait(&sbi->gc_thread->fggc_wq, &wait); + } else { + struct f2fs_gc_control gc_control = { + .victim_segno = NULL_SEGNO, + .init_gc_type = f2fs_sb_has_blkzoned(sbi) ? 
+ FG_GC : BG_GC, + .no_bg_gc = true, + .should_migrate_blocks = false, + .err_gc_skipped = false, + .nr_free_secs = 1 }; + f2fs_down_write(&sbi->gc_lock); + stat_inc_gc_call_count(sbi, FOREGROUND); + f2fs_gc(sbi, &gc_control); + } +} + +static inline bool excess_dirty_threshold(struct f2fs_sb_info *sbi) +{ + int factor = f2fs_rwsem_is_locked(&sbi->cp_rwsem) ? 3 : 2; + unsigned int dents = get_pages(sbi, F2FS_DIRTY_DENTS); + unsigned int qdata = get_pages(sbi, F2FS_DIRTY_QDATA); + unsigned int nodes = get_pages(sbi, F2FS_DIRTY_NODES); + unsigned int meta = get_pages(sbi, F2FS_DIRTY_META); + unsigned int imeta = get_pages(sbi, F2FS_DIRTY_IMETA); + unsigned int threshold = + SEGS_TO_BLKS(sbi, (factor * DEFAULT_DIRTY_THRESHOLD)); + unsigned int global_threshold = threshold * 3 / 2; + + if (dents >= threshold || qdata >= threshold || + nodes >= threshold || meta >= threshold || + imeta >= threshold) + return true; + return dents + qdata + nodes + meta + imeta > global_threshold; +} + +void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi, bool from_bg) +{ + if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) + return; + + /* try to shrink extent cache when there is no enough memory */ + if (!f2fs_available_free_memory(sbi, READ_EXTENT_CACHE)) + f2fs_shrink_read_extent_tree(sbi, + READ_EXTENT_CACHE_SHRINK_NUMBER); + + /* try to shrink age extent cache when there is no enough memory */ + if (!f2fs_available_free_memory(sbi, AGE_EXTENT_CACHE)) + f2fs_shrink_age_extent_tree(sbi, + AGE_EXTENT_CACHE_SHRINK_NUMBER); + + /* check the # of cached NAT entries */ + if (!f2fs_available_free_memory(sbi, NAT_ENTRIES)) + f2fs_try_to_free_nats(sbi, NAT_ENTRY_PER_BLOCK); + + if (!f2fs_available_free_memory(sbi, FREE_NIDS)) + f2fs_try_to_free_nids(sbi, MAX_FREE_NIDS); + else + f2fs_build_free_nids(sbi, false, false); + + if (excess_dirty_nats(sbi) || excess_dirty_threshold(sbi) || + excess_prefree_segs(sbi) || !f2fs_space_for_roll_forward(sbi)) + goto do_sync; + + /* there is background 
inflight IO or foreground operation recently */ + if (is_inflight_io(sbi, REQ_TIME) || + (!f2fs_time_over(sbi, REQ_TIME) && f2fs_rwsem_is_locked(&sbi->cp_rwsem))) + return; + + /* exceed periodical checkpoint timeout threshold */ + if (f2fs_time_over(sbi, CP_TIME)) + goto do_sync; + + /* checkpoint is the only way to shrink partial cached entries */ + if (f2fs_available_free_memory(sbi, NAT_ENTRIES) && + f2fs_available_free_memory(sbi, INO_ENTRIES)) + return; + +do_sync: + if (test_opt(sbi, DATA_FLUSH) && from_bg) { + struct blk_plug plug; + + mutex_lock(&sbi->flush_lock); + + blk_start_plug(&plug); + f2fs_sync_dirty_inodes(sbi, FILE_INODE, false); + blk_finish_plug(&plug); + + mutex_unlock(&sbi->flush_lock); + } + stat_inc_cp_call_count(sbi, BACKGROUND); + f2fs_sync_fs(sbi->sb, 1); +} + +static int __submit_flush_wait(struct f2fs_sb_info *sbi, + struct block_device *bdev) +{ + int ret = blkdev_issue_flush(bdev); + + trace_f2fs_issue_flush(bdev, test_opt(sbi, NOBARRIER), + test_opt(sbi, FLUSH_MERGE), ret); + if (!ret) + f2fs_update_iostat(sbi, NULL, FS_FLUSH_IO, 0); + return ret; +} + +static int submit_flush_wait(struct f2fs_sb_info *sbi, nid_t ino) +{ + int ret = 0; + int i; + + if (!f2fs_is_multi_device(sbi)) + return __submit_flush_wait(sbi, sbi->sb->s_bdev); + + for (i = 0; i < sbi->s_ndevs; i++) { + if (!f2fs_is_dirty_device(sbi, ino, i, FLUSH_INO)) + continue; + ret = __submit_flush_wait(sbi, FDEV(i).bdev); + if (ret) + break; + } + return ret; +} + +static int issue_flush_thread(void *data) +{ + struct f2fs_sb_info *sbi = data; + struct flush_cmd_control *fcc = SM_I(sbi)->fcc_info; + wait_queue_head_t *q = &fcc->flush_wait_queue; +repeat: + if (kthread_should_stop()) + return 0; + + if (!llist_empty(&fcc->issue_list)) { + struct flush_cmd *cmd, *next; + int ret; + + fcc->dispatch_list = llist_del_all(&fcc->issue_list); + fcc->dispatch_list = llist_reverse_order(fcc->dispatch_list); + + cmd = llist_entry(fcc->dispatch_list, struct flush_cmd, llnode); + + ret 
= submit_flush_wait(sbi, cmd->ino); + atomic_inc(&fcc->issued_flush); + + llist_for_each_entry_safe(cmd, next, + fcc->dispatch_list, llnode) { + cmd->ret = ret; + complete(&cmd->wait); + } + fcc->dispatch_list = NULL; + } + + wait_event_interruptible(*q, + kthread_should_stop() || !llist_empty(&fcc->issue_list)); + goto repeat; +} + +int f2fs_issue_flush(struct f2fs_sb_info *sbi, nid_t ino) +{ + struct flush_cmd_control *fcc = SM_I(sbi)->fcc_info; + struct flush_cmd cmd; + int ret; + + if (test_opt(sbi, NOBARRIER)) + return 0; + + if (!test_opt(sbi, FLUSH_MERGE)) { + atomic_inc(&fcc->queued_flush); + ret = submit_flush_wait(sbi, ino); + atomic_dec(&fcc->queued_flush); + atomic_inc(&fcc->issued_flush); + return ret; + } + + if (atomic_inc_return(&fcc->queued_flush) == 1 || + f2fs_is_multi_device(sbi)) { + ret = submit_flush_wait(sbi, ino); + atomic_dec(&fcc->queued_flush); + + atomic_inc(&fcc->issued_flush); + return ret; + } + + cmd.ino = ino; + init_completion(&cmd.wait); + + llist_add(&cmd.llnode, &fcc->issue_list); + + /* + * update issue_list before we wake up issue_flush thread, this + * smp_mb() pairs with another barrier in ___wait_event(), see + * more details in comments of waitqueue_active(). 
+ */ + smp_mb(); + + if (waitqueue_active(&fcc->flush_wait_queue)) + wake_up(&fcc->flush_wait_queue); + + if (fcc->f2fs_issue_flush) { + wait_for_completion(&cmd.wait); + atomic_dec(&fcc->queued_flush); + } else { + struct llist_node *list; + + list = llist_del_all(&fcc->issue_list); + if (!list) { + wait_for_completion(&cmd.wait); + atomic_dec(&fcc->queued_flush); + } else { + struct flush_cmd *tmp, *next; + + ret = submit_flush_wait(sbi, ino); + + llist_for_each_entry_safe(tmp, next, list, llnode) { + if (tmp == &cmd) { + cmd.ret = ret; + atomic_dec(&fcc->queued_flush); + continue; + } + tmp->ret = ret; + complete(&tmp->wait); + } + } + } + + return cmd.ret; +} + +int f2fs_create_flush_cmd_control(struct f2fs_sb_info *sbi) +{ + dev_t dev = sbi->sb->s_bdev->bd_dev; + struct flush_cmd_control *fcc; + + if (SM_I(sbi)->fcc_info) { + fcc = SM_I(sbi)->fcc_info; + if (fcc->f2fs_issue_flush) + return 0; + goto init_thread; + } + + fcc = f2fs_kzalloc(sbi, sizeof(struct flush_cmd_control), GFP_KERNEL); + if (!fcc) + return -ENOMEM; + atomic_set(&fcc->issued_flush, 0); + atomic_set(&fcc->queued_flush, 0); + init_waitqueue_head(&fcc->flush_wait_queue); + init_llist_head(&fcc->issue_list); + SM_I(sbi)->fcc_info = fcc; + if (!test_opt(sbi, FLUSH_MERGE)) + return 0; + +init_thread: + fcc->f2fs_issue_flush = kthread_run(issue_flush_thread, sbi, + "f2fs_flush-%u:%u", MAJOR(dev), MINOR(dev)); + if (IS_ERR(fcc->f2fs_issue_flush)) { + int err = PTR_ERR(fcc->f2fs_issue_flush); + + fcc->f2fs_issue_flush = NULL; + return err; } + + return 0; +} + +void f2fs_destroy_flush_cmd_control(struct f2fs_sb_info *sbi, bool free) +{ + struct flush_cmd_control *fcc = SM_I(sbi)->fcc_info; + + if (fcc && fcc->f2fs_issue_flush) { + struct task_struct *flush_thread = fcc->f2fs_issue_flush; + + fcc->f2fs_issue_flush = NULL; + kthread_stop(flush_thread); + } + if (free) { + kfree(fcc); + SM_I(sbi)->fcc_info = NULL; + } +} + +int f2fs_flush_device_cache(struct f2fs_sb_info *sbi) +{ + int ret = 0, i; + + 
if (!f2fs_is_multi_device(sbi)) + return 0; + + if (test_opt(sbi, NOBARRIER)) + return 0; + + for (i = 1; i < sbi->s_ndevs; i++) { + int count = DEFAULT_RETRY_IO_COUNT; + + if (!f2fs_test_bit(i, (char *)&sbi->dirty_device)) + continue; + + do { + ret = __submit_flush_wait(sbi, FDEV(i).bdev); + if (ret) + f2fs_schedule_timeout(DEFAULT_SCHEDULE_TIMEOUT); + } while (ret && --count); + + if (ret) { + f2fs_stop_checkpoint(sbi, false, + STOP_CP_REASON_FLUSH_FAIL); + break; + } + + spin_lock(&sbi->dev_lock); + f2fs_clear_bit(i, (char *)&sbi->dirty_device); + spin_unlock(&sbi->dev_lock); + } + + return ret; } static void __locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno, @@ -42,7 +773,7 @@ static void __locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno, struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); /* need not be added */ - if (IS_CURSEG(sbi, segno)) + if (is_curseg(sbi, segno)) return; if (!test_and_set_bit(segno, dirty_i->dirty_segmap[dirty_type])) @@ -50,19 +781,27 @@ static void __locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno, if (dirty_type == DIRTY) { struct seg_entry *sentry = get_seg_entry(sbi, segno); - enum dirty_type t = DIRTY_HOT_DATA; + enum dirty_type t = sentry->type; - dirty_type = sentry->type; + if (unlikely(t >= DIRTY)) { + f2fs_bug_on(sbi, 1); + return; + } + if (!test_and_set_bit(segno, dirty_i->dirty_segmap[t])) + dirty_i->nr_dirty[t]++; - if (!test_and_set_bit(segno, dirty_i->dirty_segmap[dirty_type])) - dirty_i->nr_dirty[dirty_type]++; + if (__is_large_section(sbi)) { + unsigned int secno = GET_SEC_FROM_SEG(sbi, segno); + block_t valid_blocks = + get_valid_blocks(sbi, segno, true); - /* Only one bitmap should be set */ - for (; t <= DIRTY_COLD_NODE; t++) { - if (t == dirty_type) - continue; - if (test_and_clear_bit(segno, dirty_i->dirty_segmap[t])) - dirty_i->nr_dirty[t]--; + f2fs_bug_on(sbi, + (!is_sbi_flag_set(sbi, SBI_CP_DISABLED) && + !valid_blocks) || + valid_blocks == 
CAP_BLKS_PER_SEC(sbi)); + + if (!is_cursec(sbi, secno)) + set_bit(secno, dirty_i->dirty_secmap); } } } @@ -71,21 +810,38 @@ static void __remove_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno, enum dirty_type dirty_type) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); + block_t valid_blocks; if (test_and_clear_bit(segno, dirty_i->dirty_segmap[dirty_type])) dirty_i->nr_dirty[dirty_type]--; if (dirty_type == DIRTY) { - enum dirty_type t = DIRTY_HOT_DATA; + struct seg_entry *sentry = get_seg_entry(sbi, segno); + enum dirty_type t = sentry->type; - /* clear all the bitmaps */ - for (; t <= DIRTY_COLD_NODE; t++) - if (test_and_clear_bit(segno, dirty_i->dirty_segmap[t])) - dirty_i->nr_dirty[t]--; + if (test_and_clear_bit(segno, dirty_i->dirty_segmap[t])) + dirty_i->nr_dirty[t]--; - if (get_valid_blocks(sbi, segno, sbi->segs_per_sec) == 0) - clear_bit(GET_SECNO(sbi, segno), + valid_blocks = get_valid_blocks(sbi, segno, true); + if (valid_blocks == 0) { + clear_bit(GET_SEC_FROM_SEG(sbi, segno), dirty_i->victim_secmap); +#ifdef CONFIG_F2FS_CHECK_FS + clear_bit(segno, SIT_I(sbi)->invalid_segmap); +#endif + } + if (__is_large_section(sbi)) { + unsigned int secno = GET_SEC_FROM_SEG(sbi, segno); + + if (!valid_blocks || + valid_blocks == CAP_BLKS_PER_SEC(sbi)) { + clear_bit(secno, dirty_i->dirty_secmap); + return; + } + + if (!is_cursec(sbi, secno)) + set_bit(secno, dirty_i->dirty_secmap); + } } } @@ -97,19 +853,23 @@ static void __remove_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno, static void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); - unsigned short valid_blocks; + unsigned short valid_blocks, ckpt_valid_blocks; + unsigned int usable_blocks; - if (segno == NULL_SEGNO || IS_CURSEG(sbi, segno)) + if (segno == NULL_SEGNO || is_curseg(sbi, segno)) return; + usable_blocks = f2fs_usable_blks_in_seg(sbi, segno); mutex_lock(&dirty_i->seglist_lock); - valid_blocks = 
get_valid_blocks(sbi, segno, 0); + valid_blocks = get_valid_blocks(sbi, segno, false); + ckpt_valid_blocks = get_ckpt_valid_blocks(sbi, segno, false); - if (valid_blocks == 0) { + if (valid_blocks == 0 && (!is_sbi_flag_set(sbi, SBI_CP_DISABLED) || + ckpt_valid_blocks == usable_blocks)) { __locate_dirty_segment(sbi, segno, PRE); __remove_dirty_segment(sbi, segno, DIRTY); - } else if (valid_blocks < sbi->blocks_per_seg) { + } else if (valid_blocks < usable_blocks) { __locate_dirty_segment(sbi, segno, DIRTY); } else { /* Recovery routine with SSR needs this */ @@ -117,73 +877,1689 @@ static void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno) } mutex_unlock(&dirty_i->seglist_lock); - return; } -/* - * Should call clear_prefree_segments after checkpoint is done. - */ -static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi) +/* This moves currently empty dirty blocks to prefree. Must hold seglist_lock */ +void f2fs_dirty_to_prefree(struct f2fs_sb_info *sbi) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); - unsigned int segno = -1; - unsigned int total_segs = TOTAL_SEGS(sbi); + unsigned int segno; mutex_lock(&dirty_i->seglist_lock); + for_each_set_bit(segno, dirty_i->dirty_segmap[DIRTY], MAIN_SEGS(sbi)) { + if (get_valid_blocks(sbi, segno, false)) + continue; + if (is_curseg(sbi, segno)) + continue; + __locate_dirty_segment(sbi, segno, PRE); + __remove_dirty_segment(sbi, segno, DIRTY); + } + mutex_unlock(&dirty_i->seglist_lock); +} + +block_t f2fs_get_unusable_blocks(struct f2fs_sb_info *sbi) +{ + int ovp_hole_segs = + (overprovision_segments(sbi) - reserved_segments(sbi)); + block_t ovp_holes = SEGS_TO_BLKS(sbi, ovp_hole_segs); + struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); + block_t holes[2] = {0, 0}; /* DATA and NODE */ + block_t unusable; + struct seg_entry *se; + unsigned int segno; + + mutex_lock(&dirty_i->seglist_lock); + for_each_set_bit(segno, dirty_i->dirty_segmap[DIRTY], MAIN_SEGS(sbi)) { + se = get_seg_entry(sbi, 
segno); + if (IS_NODESEG(se->type)) + holes[NODE] += f2fs_usable_blks_in_seg(sbi, segno) - + se->valid_blocks; + else + holes[DATA] += f2fs_usable_blks_in_seg(sbi, segno) - + se->valid_blocks; + } + mutex_unlock(&dirty_i->seglist_lock); + + unusable = max(holes[DATA], holes[NODE]); + if (unusable > ovp_holes) + return unusable - ovp_holes; + return 0; +} + +int f2fs_disable_cp_again(struct f2fs_sb_info *sbi, block_t unusable) +{ + int ovp_hole_segs = + (overprovision_segments(sbi) - reserved_segments(sbi)); + + if (F2FS_OPTION(sbi).unusable_cap_perc == 100) + return 0; + if (unusable > F2FS_OPTION(sbi).unusable_cap) + return -EAGAIN; + if (is_sbi_flag_set(sbi, SBI_CP_DISABLED_QUICK) && + dirty_segments(sbi) > ovp_hole_segs) + return -EAGAIN; + if (has_not_enough_free_secs(sbi, 0, 0)) + return -EAGAIN; + return 0; +} + +/* This is only used by SBI_CP_DISABLED */ +static unsigned int get_free_segment(struct f2fs_sb_info *sbi) +{ + struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); + unsigned int segno = 0; + + mutex_lock(&dirty_i->seglist_lock); + for_each_set_bit(segno, dirty_i->dirty_segmap[DIRTY], MAIN_SEGS(sbi)) { + if (get_valid_blocks(sbi, segno, false)) + continue; + if (get_ckpt_valid_blocks(sbi, segno, false)) + continue; + mutex_unlock(&dirty_i->seglist_lock); + return segno; + } + mutex_unlock(&dirty_i->seglist_lock); + return NULL_SEGNO; +} + +static struct discard_cmd *__create_discard_cmd(struct f2fs_sb_info *sbi, + struct block_device *bdev, block_t lstart, + block_t start, block_t len) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct list_head *pend_list; + struct discard_cmd *dc; + + f2fs_bug_on(sbi, !len); + + pend_list = &dcc->pend_list[plist_idx(len)]; + + dc = f2fs_kmem_cache_alloc(discard_cmd_slab, GFP_NOFS, true, NULL); + INIT_LIST_HEAD(&dc->list); + dc->bdev = bdev; + dc->di.lstart = lstart; + dc->di.start = start; + dc->di.len = len; + dc->ref = 0; + dc->state = D_PREP; + dc->queued = 0; + dc->error = 0; + 
init_completion(&dc->wait); + list_add_tail(&dc->list, pend_list); + spin_lock_init(&dc->lock); + dc->bio_ref = 0; + atomic_inc(&dcc->discard_cmd_cnt); + dcc->undiscard_blks += len; + + return dc; +} + +static bool f2fs_check_discard_tree(struct f2fs_sb_info *sbi) +{ +#ifdef CONFIG_F2FS_CHECK_FS + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct rb_node *cur = rb_first_cached(&dcc->root), *next; + struct discard_cmd *cur_dc, *next_dc; + + while (cur) { + next = rb_next(cur); + if (!next) + return true; + + cur_dc = rb_entry(cur, struct discard_cmd, rb_node); + next_dc = rb_entry(next, struct discard_cmd, rb_node); + + if (cur_dc->di.lstart + cur_dc->di.len > next_dc->di.lstart) { + f2fs_info(sbi, "broken discard_rbtree, " + "cur(%u, %u) next(%u, %u)", + cur_dc->di.lstart, cur_dc->di.len, + next_dc->di.lstart, next_dc->di.len); + return false; + } + cur = next; + } +#endif + return true; +} + +static struct discard_cmd *__lookup_discard_cmd(struct f2fs_sb_info *sbi, + block_t blkaddr) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct rb_node *node = dcc->root.rb_root.rb_node; + struct discard_cmd *dc; + + while (node) { + dc = rb_entry(node, struct discard_cmd, rb_node); + + if (blkaddr < dc->di.lstart) + node = node->rb_left; + else if (blkaddr >= dc->di.lstart + dc->di.len) + node = node->rb_right; + else + return dc; + } + return NULL; +} + +static struct discard_cmd *__lookup_discard_cmd_ret(struct rb_root_cached *root, + block_t blkaddr, + struct discard_cmd **prev_entry, + struct discard_cmd **next_entry, + struct rb_node ***insert_p, + struct rb_node **insert_parent) +{ + struct rb_node **pnode = &root->rb_root.rb_node; + struct rb_node *parent = NULL, *tmp_node; + struct discard_cmd *dc; + + *insert_p = NULL; + *insert_parent = NULL; + *prev_entry = NULL; + *next_entry = NULL; + + if (RB_EMPTY_ROOT(&root->rb_root)) + return NULL; + + while (*pnode) { + parent = *pnode; + dc = rb_entry(*pnode, struct discard_cmd, rb_node); + 
+ if (blkaddr < dc->di.lstart) + pnode = &(*pnode)->rb_left; + else if (blkaddr >= dc->di.lstart + dc->di.len) + pnode = &(*pnode)->rb_right; + else + goto lookup_neighbors; + } + + *insert_p = pnode; + *insert_parent = parent; + + dc = rb_entry(parent, struct discard_cmd, rb_node); + tmp_node = parent; + if (parent && blkaddr > dc->di.lstart) + tmp_node = rb_next(parent); + *next_entry = rb_entry_safe(tmp_node, struct discard_cmd, rb_node); + + tmp_node = parent; + if (parent && blkaddr < dc->di.lstart) + tmp_node = rb_prev(parent); + *prev_entry = rb_entry_safe(tmp_node, struct discard_cmd, rb_node); + return NULL; + +lookup_neighbors: + /* lookup prev node for merging backward later */ + tmp_node = rb_prev(&dc->rb_node); + *prev_entry = rb_entry_safe(tmp_node, struct discard_cmd, rb_node); + + /* lookup next node for merging frontward later */ + tmp_node = rb_next(&dc->rb_node); + *next_entry = rb_entry_safe(tmp_node, struct discard_cmd, rb_node); + return dc; +} + +static void __detach_discard_cmd(struct discard_cmd_control *dcc, + struct discard_cmd *dc) +{ + if (dc->state == D_DONE) + atomic_sub(dc->queued, &dcc->queued_discard); + + list_del(&dc->list); + rb_erase_cached(&dc->rb_node, &dcc->root); + dcc->undiscard_blks -= dc->di.len; + + kmem_cache_free(discard_cmd_slab, dc); + + atomic_dec(&dcc->discard_cmd_cnt); +} + +static void __remove_discard_cmd(struct f2fs_sb_info *sbi, + struct discard_cmd *dc) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + unsigned long flags; + + trace_f2fs_remove_discard(dc->bdev, dc->di.start, dc->di.len); + + spin_lock_irqsave(&dc->lock, flags); + if (dc->bio_ref) { + spin_unlock_irqrestore(&dc->lock, flags); + return; + } + spin_unlock_irqrestore(&dc->lock, flags); + + f2fs_bug_on(sbi, dc->ref); + + if (dc->error == -EOPNOTSUPP) + dc->error = 0; + + if (dc->error) + f2fs_info_ratelimited(sbi, + "Issue discard(%u, %u, %u) failed, ret: %d", + dc->di.lstart, dc->di.start, dc->di.len, dc->error); + 
__detach_discard_cmd(dcc, dc); +} + +static void f2fs_submit_discard_endio(struct bio *bio) +{ + struct discard_cmd *dc = (struct discard_cmd *)bio->bi_private; + unsigned long flags; + + spin_lock_irqsave(&dc->lock, flags); + if (!dc->error) + dc->error = blk_status_to_errno(bio->bi_status); + dc->bio_ref--; + if (!dc->bio_ref && dc->state == D_SUBMIT) { + dc->state = D_DONE; + complete_all(&dc->wait); + } + spin_unlock_irqrestore(&dc->lock, flags); + bio_put(bio); +} + +static void __check_sit_bitmap(struct f2fs_sb_info *sbi, + block_t start, block_t end) +{ +#ifdef CONFIG_F2FS_CHECK_FS + struct seg_entry *sentry; + unsigned int segno; + block_t blk = start; + unsigned long offset, size, *map; + + while (blk < end) { + segno = GET_SEGNO(sbi, blk); + sentry = get_seg_entry(sbi, segno); + offset = GET_BLKOFF_FROM_SEG0(sbi, blk); + + if (end < START_BLOCK(sbi, segno + 1)) + size = GET_BLKOFF_FROM_SEG0(sbi, end); + else + size = BLKS_PER_SEG(sbi); + map = (unsigned long *)(sentry->cur_valid_map); + offset = __find_rev_next_bit(map, size, offset); + f2fs_bug_on(sbi, offset != size); + blk = START_BLOCK(sbi, segno + 1); + } +#endif +} + +static void __init_discard_policy(struct f2fs_sb_info *sbi, + struct discard_policy *dpolicy, + int discard_type, unsigned int granularity) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + + /* common policy */ + dpolicy->type = discard_type; + dpolicy->sync = true; + dpolicy->ordered = false; + dpolicy->granularity = granularity; + + dpolicy->max_requests = dcc->max_discard_request; + dpolicy->io_aware_gran = dcc->discard_io_aware_gran; + dpolicy->timeout = false; + + if (discard_type == DPOLICY_BG) { + dpolicy->min_interval = dcc->min_discard_issue_time; + dpolicy->mid_interval = dcc->mid_discard_issue_time; + dpolicy->max_interval = dcc->max_discard_issue_time; + if (dcc->discard_io_aware == DPOLICY_IO_AWARE_ENABLE) + dpolicy->io_aware = true; + else if (dcc->discard_io_aware == DPOLICY_IO_AWARE_DISABLE) + 
dpolicy->io_aware = false; + dpolicy->sync = false; + dpolicy->ordered = true; + if (utilization(sbi) > dcc->discard_urgent_util) { + dpolicy->granularity = MIN_DISCARD_GRANULARITY; + if (atomic_read(&dcc->discard_cmd_cnt)) + dpolicy->max_interval = + dcc->min_discard_issue_time; + } + } else if (discard_type == DPOLICY_FORCE) { + dpolicy->min_interval = dcc->min_discard_issue_time; + dpolicy->mid_interval = dcc->mid_discard_issue_time; + dpolicy->max_interval = dcc->max_discard_issue_time; + dpolicy->io_aware = false; + } else if (discard_type == DPOLICY_FSTRIM) { + dpolicy->io_aware = false; + } else if (discard_type == DPOLICY_UMOUNT) { + dpolicy->io_aware = false; + /* we need to issue all to keep CP_TRIMMED_FLAG */ + dpolicy->granularity = MIN_DISCARD_GRANULARITY; + dpolicy->timeout = true; + } +} + +static void __update_discard_tree_range(struct f2fs_sb_info *sbi, + struct block_device *bdev, block_t lstart, + block_t start, block_t len); + +#ifdef CONFIG_BLK_DEV_ZONED +static void __submit_zone_reset_cmd(struct f2fs_sb_info *sbi, + struct discard_cmd *dc, blk_opf_t flag, + struct list_head *wait_list, + unsigned int *issued) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct block_device *bdev = dc->bdev; + struct bio *bio = bio_alloc(bdev, 0, REQ_OP_ZONE_RESET | flag, GFP_NOFS); + unsigned long flags; + + trace_f2fs_issue_reset_zone(bdev, dc->di.start); + + spin_lock_irqsave(&dc->lock, flags); + dc->state = D_SUBMIT; + dc->bio_ref++; + spin_unlock_irqrestore(&dc->lock, flags); + + if (issued) + (*issued)++; + + atomic_inc(&dcc->queued_discard); + dc->queued++; + list_move_tail(&dc->list, wait_list); + + /* sanity check on discard range */ + __check_sit_bitmap(sbi, dc->di.lstart, dc->di.lstart + dc->di.len); + + bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(dc->di.start); + bio->bi_private = dc; + bio->bi_end_io = f2fs_submit_discard_endio; + submit_bio(bio); + + atomic_inc(&dcc->issued_discard); + f2fs_update_iostat(sbi, NULL, 
FS_ZONE_RESET_IO, dc->di.len * F2FS_BLKSIZE); +} +#endif + +/* this function is copied from blkdev_issue_discard from block/blk-lib.c */ +static int __submit_discard_cmd(struct f2fs_sb_info *sbi, + struct discard_policy *dpolicy, + struct discard_cmd *dc, int *issued) +{ + struct block_device *bdev = dc->bdev; + unsigned int max_discard_blocks = + SECTOR_TO_BLOCK(bdev_max_discard_sectors(bdev)); + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct list_head *wait_list = (dpolicy->type == DPOLICY_FSTRIM) ? + &(dcc->fstrim_list) : &(dcc->wait_list); + blk_opf_t flag = dpolicy->sync ? REQ_SYNC : 0; + block_t lstart, start, len, total_len; + int err = 0; + + if (dc->state != D_PREP) + return 0; + + if (is_sbi_flag_set(sbi, SBI_NEED_FSCK)) + return 0; + +#ifdef CONFIG_BLK_DEV_ZONED + if (f2fs_sb_has_blkzoned(sbi) && bdev_is_zoned(bdev)) { + int devi = f2fs_bdev_index(sbi, bdev); + + if (devi < 0) + return -EINVAL; + + if (f2fs_blkz_is_seq(sbi, devi, dc->di.start)) { + __submit_zone_reset_cmd(sbi, dc, flag, + wait_list, issued); + return 0; + } + } +#endif + + /* + * stop issuing discard for any of below cases: + * 1. device is conventional zone, but it doesn't support discard. + * 2. device is regulare device, after snapshot it doesn't support + * discard. 
+ */ + if (!bdev_max_discard_sectors(bdev)) + return -EOPNOTSUPP; + + trace_f2fs_issue_discard(bdev, dc->di.start, dc->di.len); + + lstart = dc->di.lstart; + start = dc->di.start; + len = dc->di.len; + total_len = len; + + dc->di.len = 0; + + while (total_len && *issued < dpolicy->max_requests && !err) { + struct bio *bio = NULL; + unsigned long flags; + bool last = true; + + if (len > max_discard_blocks) { + len = max_discard_blocks; + last = false; + } + + (*issued)++; + if (*issued == dpolicy->max_requests) + last = true; + + dc->di.len += len; + + err = 0; + if (time_to_inject(sbi, FAULT_DISCARD)) { + err = -EIO; + spin_lock_irqsave(&dc->lock, flags); + if (dc->state == D_PARTIAL) + dc->state = D_SUBMIT; + spin_unlock_irqrestore(&dc->lock, flags); + + break; + } + + __blkdev_issue_discard(bdev, SECTOR_FROM_BLOCK(start), + SECTOR_FROM_BLOCK(len), GFP_NOFS, &bio); + f2fs_bug_on(sbi, !bio); + + /* + * should keep before submission to avoid D_DONE + * right away + */ + spin_lock_irqsave(&dc->lock, flags); + if (last) + dc->state = D_SUBMIT; + else + dc->state = D_PARTIAL; + dc->bio_ref++; + spin_unlock_irqrestore(&dc->lock, flags); + + atomic_inc(&dcc->queued_discard); + dc->queued++; + list_move_tail(&dc->list, wait_list); + + /* sanity check on discard range */ + __check_sit_bitmap(sbi, lstart, lstart + len); + + bio->bi_private = dc; + bio->bi_end_io = f2fs_submit_discard_endio; + bio->bi_opf |= flag; + submit_bio(bio); + + atomic_inc(&dcc->issued_discard); + + f2fs_update_iostat(sbi, NULL, FS_DISCARD_IO, len * F2FS_BLKSIZE); + + lstart += len; + start += len; + total_len -= len; + len = total_len; + } + + if (!err && len) { + dcc->undiscard_blks -= len; + __update_discard_tree_range(sbi, bdev, lstart, start, len); + } + return err; +} + +static void __insert_discard_cmd(struct f2fs_sb_info *sbi, + struct block_device *bdev, block_t lstart, + block_t start, block_t len) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct rb_node **p = 
&dcc->root.rb_root.rb_node; + struct rb_node *parent = NULL; + struct discard_cmd *dc; + bool leftmost = true; + + /* look up rb tree to find parent node */ + while (*p) { + parent = *p; + dc = rb_entry(parent, struct discard_cmd, rb_node); + + if (lstart < dc->di.lstart) { + p = &(*p)->rb_left; + } else if (lstart >= dc->di.lstart + dc->di.len) { + p = &(*p)->rb_right; + leftmost = false; + } else { + /* Let's skip to add, if exists */ + return; + } + } + + dc = __create_discard_cmd(sbi, bdev, lstart, start, len); + + rb_link_node(&dc->rb_node, parent, p); + rb_insert_color_cached(&dc->rb_node, &dcc->root, leftmost); +} + +static void __relocate_discard_cmd(struct discard_cmd_control *dcc, + struct discard_cmd *dc) +{ + list_move_tail(&dc->list, &dcc->pend_list[plist_idx(dc->di.len)]); +} + +static void __punch_discard_cmd(struct f2fs_sb_info *sbi, + struct discard_cmd *dc, block_t blkaddr) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct discard_info di = dc->di; + bool modified = false; + + if (dc->state == D_DONE || dc->di.len == 1) { + __remove_discard_cmd(sbi, dc); + return; + } + + dcc->undiscard_blks -= di.len; + + if (blkaddr > di.lstart) { + dc->di.len = blkaddr - dc->di.lstart; + dcc->undiscard_blks += dc->di.len; + __relocate_discard_cmd(dcc, dc); + modified = true; + } + + if (blkaddr < di.lstart + di.len - 1) { + if (modified) { + __insert_discard_cmd(sbi, dc->bdev, blkaddr + 1, + di.start + blkaddr + 1 - di.lstart, + di.lstart + di.len - 1 - blkaddr); + } else { + dc->di.lstart++; + dc->di.len--; + dc->di.start++; + dcc->undiscard_blks += dc->di.len; + __relocate_discard_cmd(dcc, dc); + } + } +} + +static void __update_discard_tree_range(struct f2fs_sb_info *sbi, + struct block_device *bdev, block_t lstart, + block_t start, block_t len) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct discard_cmd *prev_dc = NULL, *next_dc = NULL; + struct discard_cmd *dc; + struct discard_info di = {0}; + struct rb_node 
**insert_p = NULL, *insert_parent = NULL; + unsigned int max_discard_blocks = + SECTOR_TO_BLOCK(bdev_max_discard_sectors(bdev)); + block_t end = lstart + len; + + dc = __lookup_discard_cmd_ret(&dcc->root, lstart, + &prev_dc, &next_dc, &insert_p, &insert_parent); + if (dc) + prev_dc = dc; + + if (!prev_dc) { + di.lstart = lstart; + di.len = next_dc ? next_dc->di.lstart - lstart : len; + di.len = min(di.len, len); + di.start = start; + } + while (1) { - segno = find_next_bit(dirty_i->dirty_segmap[PRE], total_segs, - segno + 1); - if (segno >= total_segs) + struct rb_node *node; + bool merged = false; + struct discard_cmd *tdc = NULL; + + if (prev_dc) { + di.lstart = prev_dc->di.lstart + prev_dc->di.len; + if (di.lstart < lstart) + di.lstart = lstart; + if (di.lstart >= end) + break; + + if (!next_dc || next_dc->di.lstart > end) + di.len = end - di.lstart; + else + di.len = next_dc->di.lstart - di.lstart; + di.start = start + di.lstart - lstart; + } + + if (!di.len) + goto next; + + if (prev_dc && prev_dc->state == D_PREP && + prev_dc->bdev == bdev && + __is_discard_back_mergeable(&di, &prev_dc->di, + max_discard_blocks)) { + prev_dc->di.len += di.len; + dcc->undiscard_blks += di.len; + __relocate_discard_cmd(dcc, prev_dc); + di = prev_dc->di; + tdc = prev_dc; + merged = true; + } + + if (next_dc && next_dc->state == D_PREP && + next_dc->bdev == bdev && + __is_discard_front_mergeable(&di, &next_dc->di, + max_discard_blocks)) { + next_dc->di.lstart = di.lstart; + next_dc->di.len += di.len; + next_dc->di.start = di.start; + dcc->undiscard_blks += di.len; + __relocate_discard_cmd(dcc, next_dc); + if (tdc) + __remove_discard_cmd(sbi, tdc); + merged = true; + } + + if (!merged) + __insert_discard_cmd(sbi, bdev, + di.lstart, di.start, di.len); + next: + prev_dc = next_dc; + if (!prev_dc) + break; + + node = rb_next(&prev_dc->rb_node); + next_dc = rb_entry_safe(node, struct discard_cmd, rb_node); + } +} + +#ifdef CONFIG_BLK_DEV_ZONED +static void 
__queue_zone_reset_cmd(struct f2fs_sb_info *sbi, + struct block_device *bdev, block_t blkstart, block_t lblkstart, + block_t blklen) +{ + trace_f2fs_queue_reset_zone(bdev, blkstart); + + mutex_lock(&SM_I(sbi)->dcc_info->cmd_lock); + __insert_discard_cmd(sbi, bdev, lblkstart, blkstart, blklen); + mutex_unlock(&SM_I(sbi)->dcc_info->cmd_lock); +} +#endif + +static void __queue_discard_cmd(struct f2fs_sb_info *sbi, + struct block_device *bdev, block_t blkstart, block_t blklen) +{ + block_t lblkstart = blkstart; + + if (!f2fs_bdev_support_discard(bdev)) + return; + + trace_f2fs_queue_discard(bdev, blkstart, blklen); + + if (f2fs_is_multi_device(sbi)) { + int devi = f2fs_target_device_index(sbi, blkstart); + + blkstart -= FDEV(devi).start_blk; + } + mutex_lock(&SM_I(sbi)->dcc_info->cmd_lock); + __update_discard_tree_range(sbi, bdev, lblkstart, blkstart, blklen); + mutex_unlock(&SM_I(sbi)->dcc_info->cmd_lock); +} + +static void __issue_discard_cmd_orderly(struct f2fs_sb_info *sbi, + struct discard_policy *dpolicy, int *issued) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct discard_cmd *prev_dc = NULL, *next_dc = NULL; + struct rb_node **insert_p = NULL, *insert_parent = NULL; + struct discard_cmd *dc; + struct blk_plug plug; + bool io_interrupted = false; + + mutex_lock(&dcc->cmd_lock); + dc = __lookup_discard_cmd_ret(&dcc->root, dcc->next_pos, + &prev_dc, &next_dc, &insert_p, &insert_parent); + if (!dc) + dc = next_dc; + + blk_start_plug(&plug); + + while (dc) { + struct rb_node *node; + int err = 0; + + if (dc->state != D_PREP) + goto next; + + if (dpolicy->io_aware && !is_idle(sbi, DISCARD_TIME)) { + io_interrupted = true; + break; + } + + dcc->next_pos = dc->di.lstart + dc->di.len; + err = __submit_discard_cmd(sbi, dpolicy, dc, issued); + + if (*issued >= dpolicy->max_requests) break; - __set_test_and_free(sbi, segno); +next: + node = rb_next(&dc->rb_node); + if (err) + __remove_discard_cmd(sbi, dc); + dc = rb_entry_safe(node, struct discard_cmd, 
rb_node); } + + blk_finish_plug(&plug); + + if (!dc) + dcc->next_pos = 0; + + mutex_unlock(&dcc->cmd_lock); + + if (!(*issued) && io_interrupted) + *issued = -1; +} +static unsigned int __wait_all_discard_cmd(struct f2fs_sb_info *sbi, + struct discard_policy *dpolicy); + +static int __issue_discard_cmd(struct f2fs_sb_info *sbi, + struct discard_policy *dpolicy) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct list_head *pend_list; + struct discard_cmd *dc, *tmp; + struct blk_plug plug; + int i, issued; + bool io_interrupted = false; + + if (dpolicy->timeout) + f2fs_update_time(sbi, UMOUNT_DISCARD_TIMEOUT); + +retry: + issued = 0; + for (i = MAX_PLIST_NUM - 1; i >= 0; i--) { + if (dpolicy->timeout && + f2fs_time_over(sbi, UMOUNT_DISCARD_TIMEOUT)) + break; + + if (i + 1 < dpolicy->granularity) + break; + + if (i + 1 < dcc->max_ordered_discard && dpolicy->ordered) { + __issue_discard_cmd_orderly(sbi, dpolicy, &issued); + return issued; + } + + pend_list = &dcc->pend_list[i]; + + mutex_lock(&dcc->cmd_lock); + if (list_empty(pend_list)) + goto next; + if (unlikely(dcc->rbtree_check)) + f2fs_bug_on(sbi, !f2fs_check_discard_tree(sbi)); + blk_start_plug(&plug); + list_for_each_entry_safe(dc, tmp, pend_list, list) { + f2fs_bug_on(sbi, dc->state != D_PREP); + + if (dpolicy->timeout && + f2fs_time_over(sbi, UMOUNT_DISCARD_TIMEOUT)) + break; + + if (dpolicy->io_aware && i < dpolicy->io_aware_gran && + !is_idle(sbi, DISCARD_TIME)) { + io_interrupted = true; + break; + } + + __submit_discard_cmd(sbi, dpolicy, dc, &issued); + + if (issued >= dpolicy->max_requests) + break; + } + blk_finish_plug(&plug); +next: + mutex_unlock(&dcc->cmd_lock); + + if (issued >= dpolicy->max_requests || io_interrupted) + break; + } + + if (dpolicy->type == DPOLICY_UMOUNT && issued) { + __wait_all_discard_cmd(sbi, dpolicy); + goto retry; + } + + if (!issued && io_interrupted) + issued = -1; + + return issued; +} + +static bool __drop_discard_cmd(struct f2fs_sb_info *sbi) +{ + 
struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct list_head *pend_list; + struct discard_cmd *dc, *tmp; + int i; + bool dropped = false; + + mutex_lock(&dcc->cmd_lock); + for (i = MAX_PLIST_NUM - 1; i >= 0; i--) { + pend_list = &dcc->pend_list[i]; + list_for_each_entry_safe(dc, tmp, pend_list, list) { + f2fs_bug_on(sbi, dc->state != D_PREP); + __remove_discard_cmd(sbi, dc); + dropped = true; + } + } + mutex_unlock(&dcc->cmd_lock); + + return dropped; +} + +void f2fs_drop_discard_cmd(struct f2fs_sb_info *sbi) +{ + __drop_discard_cmd(sbi); +} + +static unsigned int __wait_one_discard_bio(struct f2fs_sb_info *sbi, + struct discard_cmd *dc) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + unsigned int len = 0; + + wait_for_completion_io(&dc->wait); + mutex_lock(&dcc->cmd_lock); + f2fs_bug_on(sbi, dc->state != D_DONE); + dc->ref--; + if (!dc->ref) { + if (!dc->error) + len = dc->di.len; + __remove_discard_cmd(sbi, dc); + } + mutex_unlock(&dcc->cmd_lock); + + return len; +} + +static unsigned int __wait_discard_cmd_range(struct f2fs_sb_info *sbi, + struct discard_policy *dpolicy, + block_t start, block_t end) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct list_head *wait_list = (dpolicy->type == DPOLICY_FSTRIM) ? 
+ &(dcc->fstrim_list) : &(dcc->wait_list); + struct discard_cmd *dc = NULL, *iter, *tmp; + unsigned int trimmed = 0; + +next: + dc = NULL; + + mutex_lock(&dcc->cmd_lock); + list_for_each_entry_safe(iter, tmp, wait_list, list) { + if (iter->di.lstart + iter->di.len <= start || + end <= iter->di.lstart) + continue; + if (iter->di.len < dpolicy->granularity) + continue; + if (iter->state == D_DONE && !iter->ref) { + wait_for_completion_io(&iter->wait); + if (!iter->error) + trimmed += iter->di.len; + __remove_discard_cmd(sbi, iter); + } else { + iter->ref++; + dc = iter; + break; + } + } + mutex_unlock(&dcc->cmd_lock); + + if (dc) { + trimmed += __wait_one_discard_bio(sbi, dc); + goto next; + } + + return trimmed; +} + +static unsigned int __wait_all_discard_cmd(struct f2fs_sb_info *sbi, + struct discard_policy *dpolicy) +{ + struct discard_policy dp; + unsigned int discard_blks; + + if (dpolicy) + return __wait_discard_cmd_range(sbi, dpolicy, 0, UINT_MAX); + + /* wait all */ + __init_discard_policy(sbi, &dp, DPOLICY_FSTRIM, MIN_DISCARD_GRANULARITY); + discard_blks = __wait_discard_cmd_range(sbi, &dp, 0, UINT_MAX); + __init_discard_policy(sbi, &dp, DPOLICY_UMOUNT, MIN_DISCARD_GRANULARITY); + discard_blks += __wait_discard_cmd_range(sbi, &dp, 0, UINT_MAX); + + return discard_blks; +} + +/* This should be covered by global mutex, &sit_i->sentry_lock */ +static void f2fs_wait_discard_bio(struct f2fs_sb_info *sbi, block_t blkaddr) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct discard_cmd *dc; + bool need_wait = false; + + mutex_lock(&dcc->cmd_lock); + dc = __lookup_discard_cmd(sbi, blkaddr); +#ifdef CONFIG_BLK_DEV_ZONED + if (dc && f2fs_sb_has_blkzoned(sbi) && bdev_is_zoned(dc->bdev)) { + int devi = f2fs_bdev_index(sbi, dc->bdev); + + if (devi < 0) { + mutex_unlock(&dcc->cmd_lock); + return; + } + + if (f2fs_blkz_is_seq(sbi, devi, dc->di.start)) { + /* force submit zone reset */ + if (dc->state == D_PREP) + __submit_zone_reset_cmd(sbi, dc, REQ_SYNC, 
+ &dcc->wait_list, NULL); + dc->ref++; + mutex_unlock(&dcc->cmd_lock); + /* wait zone reset */ + __wait_one_discard_bio(sbi, dc); + return; + } + } +#endif + if (dc) { + if (dc->state == D_PREP) { + __punch_discard_cmd(sbi, dc, blkaddr); + } else { + dc->ref++; + need_wait = true; + } + } + mutex_unlock(&dcc->cmd_lock); + + if (need_wait) + __wait_one_discard_bio(sbi, dc); +} + +void f2fs_stop_discard_thread(struct f2fs_sb_info *sbi) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + + if (dcc && dcc->f2fs_issue_discard) { + struct task_struct *discard_thread = dcc->f2fs_issue_discard; + + dcc->f2fs_issue_discard = NULL; + kthread_stop(discard_thread); + } +} + +/** + * f2fs_issue_discard_timeout() - Issue all discard cmd within UMOUNT_DISCARD_TIMEOUT + * @sbi: the f2fs_sb_info data for discard cmd to issue + * + * When UMOUNT_DISCARD_TIMEOUT is exceeded, all remaining discard commands will be dropped + * + * Return true if issued all discard cmd or no discard cmd need issue, otherwise return false. 
+ */ +bool f2fs_issue_discard_timeout(struct f2fs_sb_info *sbi) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct discard_policy dpolicy; + bool dropped; + + if (!atomic_read(&dcc->discard_cmd_cnt)) + return true; + + __init_discard_policy(sbi, &dpolicy, DPOLICY_UMOUNT, + dcc->discard_granularity); + __issue_discard_cmd(sbi, &dpolicy); + dropped = __drop_discard_cmd(sbi); + + /* just to make sure there is no pending discard commands */ + __wait_all_discard_cmd(sbi, NULL); + + f2fs_bug_on(sbi, atomic_read(&dcc->discard_cmd_cnt)); + return !dropped; +} + +static int issue_discard_thread(void *data) +{ + struct f2fs_sb_info *sbi = data; + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + wait_queue_head_t *q = &dcc->discard_wait_queue; + struct discard_policy dpolicy; + unsigned int wait_ms = dcc->min_discard_issue_time; + int issued; + + set_freezable(); + + do { + wait_event_freezable_timeout(*q, + kthread_should_stop() || dcc->discard_wake, + msecs_to_jiffies(wait_ms)); + + if (sbi->gc_mode == GC_URGENT_HIGH || + !f2fs_available_free_memory(sbi, DISCARD_CACHE)) + __init_discard_policy(sbi, &dpolicy, DPOLICY_FORCE, + MIN_DISCARD_GRANULARITY); + else + __init_discard_policy(sbi, &dpolicy, DPOLICY_BG, + dcc->discard_granularity); + + if (dcc->discard_wake) + dcc->discard_wake = false; + + /* clean up pending candidates before going to sleep */ + if (atomic_read(&dcc->queued_discard)) + __wait_all_discard_cmd(sbi, NULL); + + if (f2fs_readonly(sbi->sb)) + continue; + if (kthread_should_stop()) + return 0; + if (is_sbi_flag_set(sbi, SBI_NEED_FSCK) || + !atomic_read(&dcc->discard_cmd_cnt)) { + wait_ms = dpolicy.max_interval; + continue; + } + + sb_start_intwrite(sbi->sb); + + issued = __issue_discard_cmd(sbi, &dpolicy); + if (issued > 0) { + __wait_all_discard_cmd(sbi, &dpolicy); + wait_ms = dpolicy.min_interval; + } else if (issued == -1) { + wait_ms = f2fs_time_to_wait(sbi, DISCARD_TIME); + if (!wait_ms) + wait_ms = dpolicy.mid_interval; + } 
else { + wait_ms = dpolicy.max_interval; + } + if (!atomic_read(&dcc->discard_cmd_cnt)) + wait_ms = dpolicy.max_interval; + + sb_end_intwrite(sbi->sb); + + } while (!kthread_should_stop()); + return 0; +} + +#ifdef CONFIG_BLK_DEV_ZONED +static int __f2fs_issue_discard_zone(struct f2fs_sb_info *sbi, + struct block_device *bdev, block_t blkstart, block_t blklen) +{ + sector_t sector, nr_sects; + block_t lblkstart = blkstart; + int devi = 0; + u64 remainder = 0; + + if (f2fs_is_multi_device(sbi)) { + devi = f2fs_target_device_index(sbi, blkstart); + if (blkstart < FDEV(devi).start_blk || + blkstart > FDEV(devi).end_blk) { + f2fs_err(sbi, "Invalid block %x", blkstart); + return -EIO; + } + blkstart -= FDEV(devi).start_blk; + } + + /* For sequential zones, reset the zone write pointer */ + if (f2fs_blkz_is_seq(sbi, devi, blkstart)) { + sector = SECTOR_FROM_BLOCK(blkstart); + nr_sects = SECTOR_FROM_BLOCK(blklen); + div64_u64_rem(sector, bdev_zone_sectors(bdev), &remainder); + + if (remainder || nr_sects != bdev_zone_sectors(bdev)) { + f2fs_err(sbi, "(%d) %s: Unaligned zone reset attempted (block %x + %x)", + devi, sbi->s_ndevs ? 
FDEV(devi).path : "", + blkstart, blklen); + return -EIO; + } + + if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) { + unsigned int nofs_flags; + int ret; + + trace_f2fs_issue_reset_zone(bdev, blkstart); + nofs_flags = memalloc_nofs_save(); + ret = blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET, + sector, nr_sects); + memalloc_nofs_restore(nofs_flags); + return ret; + } + + __queue_zone_reset_cmd(sbi, bdev, blkstart, lblkstart, blklen); + return 0; + } + + /* For conventional zones, use regular discard if supported */ + __queue_discard_cmd(sbi, bdev, lblkstart, blklen); + return 0; +} +#endif + +static int __issue_discard_async(struct f2fs_sb_info *sbi, + struct block_device *bdev, block_t blkstart, block_t blklen) +{ +#ifdef CONFIG_BLK_DEV_ZONED + if (f2fs_sb_has_blkzoned(sbi) && bdev_is_zoned(bdev)) + return __f2fs_issue_discard_zone(sbi, bdev, blkstart, blklen); +#endif + __queue_discard_cmd(sbi, bdev, blkstart, blklen); + return 0; +} + +static int f2fs_issue_discard(struct f2fs_sb_info *sbi, + block_t blkstart, block_t blklen) +{ + sector_t start = blkstart, len = 0; + struct block_device *bdev; + struct seg_entry *se; + unsigned int offset; + block_t i; + int err = 0; + + bdev = f2fs_target_device(sbi, blkstart, NULL); + + for (i = blkstart; i < blkstart + blklen; i++, len++) { + if (i != start) { + struct block_device *bdev2 = + f2fs_target_device(sbi, i, NULL); + + if (bdev2 != bdev) { + err = __issue_discard_async(sbi, bdev, + start, len); + if (err) + return err; + bdev = bdev2; + start = i; + len = 0; + } + } + + se = get_seg_entry(sbi, GET_SEGNO(sbi, i)); + offset = GET_BLKOFF_FROM_SEG0(sbi, i); + + if (f2fs_block_unit_discard(sbi) && + !f2fs_test_and_set_bit(offset, se->discard_map)) + sbi->discard_blks--; + } + + if (len) + err = __issue_discard_async(sbi, bdev, start, len); + return err; +} + +static bool add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc, + bool check_only) +{ + int entries = SIT_VBLOCK_MAP_SIZE / sizeof(unsigned long); 
+ struct seg_entry *se = get_seg_entry(sbi, cpc->trim_start); + unsigned long *cur_map = (unsigned long *)se->cur_valid_map; + unsigned long *ckpt_map = (unsigned long *)se->ckpt_valid_map; + unsigned long *discard_map = (unsigned long *)se->discard_map; + unsigned long *dmap = SIT_I(sbi)->tmp_map; + unsigned int start = 0, end = -1; + bool force = (cpc->reason & CP_DISCARD); + struct discard_entry *de = NULL; + struct list_head *head = &SM_I(sbi)->dcc_info->entry_list; + int i; + + if (se->valid_blocks == BLKS_PER_SEG(sbi) || + !f2fs_hw_support_discard(sbi) || + !f2fs_block_unit_discard(sbi)) + return false; + + if (!force) { + if (!f2fs_realtime_discard_enable(sbi) || + (!se->valid_blocks && + !is_curseg(sbi, cpc->trim_start)) || + SM_I(sbi)->dcc_info->nr_discards >= + SM_I(sbi)->dcc_info->max_discards) + return false; + } + + /* SIT_VBLOCK_MAP_SIZE should be multiple of sizeof(unsigned long) */ + for (i = 0; i < entries; i++) + dmap[i] = force ? ~ckpt_map[i] & ~discard_map[i] : + (cur_map[i] ^ ckpt_map[i]) & ckpt_map[i]; + + while (force || SM_I(sbi)->dcc_info->nr_discards <= + SM_I(sbi)->dcc_info->max_discards) { + start = __find_rev_next_bit(dmap, BLKS_PER_SEG(sbi), end + 1); + if (start >= BLKS_PER_SEG(sbi)) + break; + + end = __find_rev_next_zero_bit(dmap, + BLKS_PER_SEG(sbi), start + 1); + if (force && start && end != BLKS_PER_SEG(sbi) && + (end - start) < cpc->trim_minlen) + continue; + + if (check_only) + return true; + + if (!de) { + de = f2fs_kmem_cache_alloc(discard_entry_slab, + GFP_F2FS_ZERO, true, NULL); + de->start_blkaddr = START_BLOCK(sbi, cpc->trim_start); + list_add_tail(&de->list, head); + } + + for (i = start; i < end; i++) + __set_bit_le(i, (void *)de->discard_map); + + SM_I(sbi)->dcc_info->nr_discards += end - start; + } + return false; +} + +static void release_discard_addr(struct discard_entry *entry) +{ + list_del(&entry->list); + kmem_cache_free(discard_entry_slab, entry); +} + +void f2fs_release_discard_addrs(struct f2fs_sb_info *sbi) 
+{ + struct list_head *head = &(SM_I(sbi)->dcc_info->entry_list); + struct discard_entry *entry, *this; + + /* drop caches */ + list_for_each_entry_safe(entry, this, head, list) + release_discard_addr(entry); +} + +/* + * Should call f2fs_clear_prefree_segments after checkpoint is done. + */ +static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi) +{ + struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); + unsigned int segno; + + mutex_lock(&dirty_i->seglist_lock); + for_each_set_bit(segno, dirty_i->dirty_segmap[PRE], MAIN_SEGS(sbi)) + __set_test_and_free(sbi, segno, false); mutex_unlock(&dirty_i->seglist_lock); } -void clear_prefree_segments(struct f2fs_sb_info *sbi) +void f2fs_clear_prefree_segments(struct f2fs_sb_info *sbi, + struct cp_control *cpc) { + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct list_head *head = &dcc->entry_list; + struct discard_entry *entry, *this; struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); - unsigned int segno = -1; - unsigned int total_segs = TOTAL_SEGS(sbi); + unsigned long *prefree_map = dirty_i->dirty_segmap[PRE]; + unsigned int start = 0, end = -1; + unsigned int secno, start_segno; + bool force = (cpc->reason & CP_DISCARD); + bool section_alignment = F2FS_OPTION(sbi).discard_unit == + DISCARD_UNIT_SECTION; + + if (f2fs_lfs_mode(sbi) && __is_large_section(sbi)) + section_alignment = true; mutex_lock(&dirty_i->seglist_lock); + while (1) { - segno = find_next_bit(dirty_i->dirty_segmap[PRE], total_segs, - segno + 1); - if (segno >= total_segs) + int i; + + if (section_alignment && end != -1) + end--; + start = find_next_bit(prefree_map, MAIN_SEGS(sbi), end + 1); + if (start >= MAIN_SEGS(sbi)) break; + end = find_next_zero_bit(prefree_map, MAIN_SEGS(sbi), + start + 1); - if (test_and_clear_bit(segno, dirty_i->dirty_segmap[PRE])) - dirty_i->nr_dirty[PRE]--; + if (section_alignment) { + start = rounddown(start, SEGS_PER_SEC(sbi)); + end = roundup(end, SEGS_PER_SEC(sbi)); + } - /* Let's use trim */ - if 
(test_opt(sbi, DISCARD)) - blkdev_issue_discard(sbi->sb->s_bdev, - START_BLOCK(sbi, segno) << - sbi->log_sectors_per_block, - 1 << (sbi->log_sectors_per_block + - sbi->log_blocks_per_seg), - GFP_NOFS, 0); + for (i = start; i < end; i++) { + if (test_and_clear_bit(i, prefree_map)) + dirty_i->nr_dirty[PRE]--; + } + + if (!f2fs_realtime_discard_enable(sbi)) + continue; + + if (force && start >= cpc->trim_start && + (end - 1) <= cpc->trim_end) + continue; + + /* Should cover 2MB zoned device for zone-based reset */ + if (!f2fs_sb_has_blkzoned(sbi) && + (!f2fs_lfs_mode(sbi) || !__is_large_section(sbi))) { + f2fs_issue_discard(sbi, START_BLOCK(sbi, start), + SEGS_TO_BLKS(sbi, end - start)); + continue; + } +next: + secno = GET_SEC_FROM_SEG(sbi, start); + start_segno = GET_SEG_FROM_SEC(sbi, secno); + if (!is_cursec(sbi, secno) && + !get_valid_blocks(sbi, start, true)) + f2fs_issue_discard(sbi, START_BLOCK(sbi, start_segno), + BLKS_PER_SEC(sbi)); + + start = start_segno + SEGS_PER_SEC(sbi); + if (start < end) + goto next; + else + end = start - 1; } mutex_unlock(&dirty_i->seglist_lock); + + if (!f2fs_block_unit_discard(sbi)) + goto wakeup; + + /* send small discards */ + list_for_each_entry_safe(entry, this, head, list) { + unsigned int cur_pos = 0, next_pos, len, total_len = 0; + bool is_valid = test_bit_le(0, entry->discard_map); + +find_next: + if (is_valid) { + next_pos = find_next_zero_bit_le(entry->discard_map, + BLKS_PER_SEG(sbi), cur_pos); + len = next_pos - cur_pos; + + if (f2fs_sb_has_blkzoned(sbi) || + (force && len < cpc->trim_minlen)) + goto skip; + + f2fs_issue_discard(sbi, entry->start_blkaddr + cur_pos, + len); + total_len += len; + } else { + next_pos = find_next_bit_le(entry->discard_map, + BLKS_PER_SEG(sbi), cur_pos); + } +skip: + cur_pos = next_pos; + is_valid = !is_valid; + + if (cur_pos < BLKS_PER_SEG(sbi)) + goto find_next; + + release_discard_addr(entry); + dcc->nr_discards -= total_len; + } + +wakeup: + wake_up_discard_thread(sbi, false); +} + +int 
f2fs_start_discard_thread(struct f2fs_sb_info *sbi) +{ + dev_t dev = sbi->sb->s_bdev->bd_dev; + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + int err = 0; + + if (f2fs_sb_has_readonly(sbi)) { + f2fs_info(sbi, + "Skip to start discard thread for readonly image"); + return 0; + } + + if (!f2fs_realtime_discard_enable(sbi)) + return 0; + + dcc->f2fs_issue_discard = kthread_run(issue_discard_thread, sbi, + "f2fs_discard-%u:%u", MAJOR(dev), MINOR(dev)); + if (IS_ERR(dcc->f2fs_issue_discard)) { + err = PTR_ERR(dcc->f2fs_issue_discard); + dcc->f2fs_issue_discard = NULL; + } + + return err; +} + +static int create_discard_cmd_control(struct f2fs_sb_info *sbi) +{ + struct discard_cmd_control *dcc; + int err = 0, i; + + if (SM_I(sbi)->dcc_info) { + dcc = SM_I(sbi)->dcc_info; + goto init_thread; + } + + dcc = f2fs_kzalloc(sbi, sizeof(struct discard_cmd_control), GFP_KERNEL); + if (!dcc) + return -ENOMEM; + + dcc->discard_io_aware_gran = MAX_PLIST_NUM; + dcc->discard_granularity = DEFAULT_DISCARD_GRANULARITY; + dcc->max_ordered_discard = DEFAULT_MAX_ORDERED_DISCARD_GRANULARITY; + dcc->discard_io_aware = DPOLICY_IO_AWARE_ENABLE; + if (F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_SEGMENT || + F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_SECTION) + dcc->discard_granularity = BLKS_PER_SEG(sbi); + + INIT_LIST_HEAD(&dcc->entry_list); + for (i = 0; i < MAX_PLIST_NUM; i++) + INIT_LIST_HEAD(&dcc->pend_list[i]); + INIT_LIST_HEAD(&dcc->wait_list); + INIT_LIST_HEAD(&dcc->fstrim_list); + mutex_init(&dcc->cmd_lock); + atomic_set(&dcc->issued_discard, 0); + atomic_set(&dcc->queued_discard, 0); + atomic_set(&dcc->discard_cmd_cnt, 0); + dcc->nr_discards = 0; + dcc->max_discards = SEGS_TO_BLKS(sbi, MAIN_SEGS(sbi)); + dcc->max_discard_request = DEF_MAX_DISCARD_REQUEST; + dcc->min_discard_issue_time = DEF_MIN_DISCARD_ISSUE_TIME; + dcc->mid_discard_issue_time = DEF_MID_DISCARD_ISSUE_TIME; + dcc->max_discard_issue_time = DEF_MAX_DISCARD_ISSUE_TIME; + dcc->discard_urgent_util = 
DEF_DISCARD_URGENT_UTIL; + dcc->undiscard_blks = 0; + dcc->next_pos = 0; + dcc->root = RB_ROOT_CACHED; + dcc->rbtree_check = false; + + init_waitqueue_head(&dcc->discard_wait_queue); + SM_I(sbi)->dcc_info = dcc; +init_thread: + err = f2fs_start_discard_thread(sbi); + if (err) { + kfree(dcc); + SM_I(sbi)->dcc_info = NULL; + } + + return err; +} + +static void destroy_discard_cmd_control(struct f2fs_sb_info *sbi) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + + if (!dcc) + return; + + f2fs_stop_discard_thread(sbi); + + /* + * Recovery can cache discard commands, so in error path of + * fill_super(), it needs to give a chance to handle them. + */ + f2fs_issue_discard_timeout(sbi); + + kfree(dcc); + SM_I(sbi)->dcc_info = NULL; } -static void __mark_sit_entry_dirty(struct f2fs_sb_info *sbi, unsigned int segno) +static bool __mark_sit_entry_dirty(struct f2fs_sb_info *sbi, unsigned int segno) { struct sit_info *sit_i = SIT_I(sbi); - if (!__test_and_set_bit(segno, sit_i->dirty_sentries_bitmap)) + + if (!__test_and_set_bit(segno, sit_i->dirty_sentries_bitmap)) { sit_i->dirty_sentries++; + return false; + } + + return true; } static void __set_sit_entry_type(struct f2fs_sb_info *sbi, int type, unsigned int segno, int modified) { struct seg_entry *se = get_seg_entry(sbi, segno); + se->type = type; if (modified) __mark_sit_entry_dirty(sbi, segno); } +static inline unsigned long long get_segment_mtime(struct f2fs_sb_info *sbi, + block_t blkaddr) +{ + unsigned int segno = GET_SEGNO(sbi, blkaddr); + + if (segno == NULL_SEGNO) + return 0; + return get_seg_entry(sbi, segno)->mtime; +} + +static void update_segment_mtime(struct f2fs_sb_info *sbi, block_t blkaddr, + unsigned long long old_mtime) +{ + struct seg_entry *se; + unsigned int segno = GET_SEGNO(sbi, blkaddr); + unsigned long long ctime = get_mtime(sbi, false); + unsigned long long mtime = old_mtime ? 
old_mtime : ctime; + + if (segno == NULL_SEGNO) + return; + + se = get_seg_entry(sbi, segno); + + if (!se->mtime) + se->mtime = mtime; + else + se->mtime = div_u64(se->mtime * se->valid_blocks + mtime, + se->valid_blocks + 1); + + if (ctime > SIT_I(sbi)->max_mtime) + SIT_I(sbi)->max_mtime = ctime; +} + +/* + * NOTE: when updating multiple blocks at the same time, please ensure + * that the consecutive input blocks belong to the same segment. + */ +static int update_sit_entry_for_release(struct f2fs_sb_info *sbi, struct seg_entry *se, + unsigned int segno, block_t blkaddr, unsigned int offset, int del) +{ + bool exist; +#ifdef CONFIG_F2FS_CHECK_FS + bool mir_exist; +#endif + int i; + int del_count = -del; + + f2fs_bug_on(sbi, GET_SEGNO(sbi, blkaddr) != GET_SEGNO(sbi, blkaddr + del_count - 1)); + + for (i = 0; i < del_count; i++) { + exist = f2fs_test_and_clear_bit(offset + i, se->cur_valid_map); +#ifdef CONFIG_F2FS_CHECK_FS + mir_exist = f2fs_test_and_clear_bit(offset + i, + se->cur_valid_map_mir); + if (unlikely(exist != mir_exist)) { + f2fs_err(sbi, "Inconsistent error when clearing bitmap, blk:%u, old bit:%d", + blkaddr + i, exist); + f2fs_bug_on(sbi, 1); + } +#endif + if (unlikely(!exist)) { + f2fs_err(sbi, "Bitmap was wrongly cleared, blk:%u", blkaddr + i); + f2fs_bug_on(sbi, 1); + se->valid_blocks++; + del += 1; + } else if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) { + /* + * If checkpoints are off, we must not reuse data that + * was used in the previous checkpoint. If it was used + * before, we must track that to know how much space we + * really have. 
+ */ + if (f2fs_test_bit(offset + i, se->ckpt_valid_map)) { + spin_lock(&sbi->stat_lock); + sbi->unusable_block_count++; + spin_unlock(&sbi->stat_lock); + } + } + + if (f2fs_block_unit_discard(sbi) && + f2fs_test_and_clear_bit(offset + i, se->discard_map)) + sbi->discard_blks++; + + if (!f2fs_test_bit(offset + i, se->ckpt_valid_map)) { + se->ckpt_valid_blocks -= 1; + if (__is_large_section(sbi)) + get_sec_entry(sbi, segno)->ckpt_valid_blocks -= 1; + } + } + + if (__is_large_section(sbi)) + sanity_check_valid_blocks(sbi, segno); + + return del; +} + +static int update_sit_entry_for_alloc(struct f2fs_sb_info *sbi, struct seg_entry *se, + unsigned int segno, block_t blkaddr, unsigned int offset, int del) +{ + bool exist; +#ifdef CONFIG_F2FS_CHECK_FS + bool mir_exist; +#endif + + exist = f2fs_test_and_set_bit(offset, se->cur_valid_map); +#ifdef CONFIG_F2FS_CHECK_FS + mir_exist = f2fs_test_and_set_bit(offset, + se->cur_valid_map_mir); + if (unlikely(exist != mir_exist)) { + f2fs_err(sbi, "Inconsistent error when setting bitmap, blk:%u, old bit:%d", + blkaddr, exist); + f2fs_bug_on(sbi, 1); + } +#endif + if (unlikely(exist)) { + f2fs_err(sbi, "Bitmap was wrongly set, blk:%u", blkaddr); + f2fs_bug_on(sbi, 1); + se->valid_blocks--; + del = 0; + } + + if (f2fs_block_unit_discard(sbi) && + !f2fs_test_and_set_bit(offset, se->discard_map)) + sbi->discard_blks--; + + /* + * SSR should never reuse block which is checkpointed + * or newly invalidated. 
+ */ + if (!is_sbi_flag_set(sbi, SBI_CP_DISABLED)) { + if (!f2fs_test_and_set_bit(offset, se->ckpt_valid_map)) { + se->ckpt_valid_blocks++; + if (__is_large_section(sbi)) + get_sec_entry(sbi, segno)->ckpt_valid_blocks++; + } + } + + if (!f2fs_test_bit(offset, se->ckpt_valid_map)) { + se->ckpt_valid_blocks += del; + if (__is_large_section(sbi)) + get_sec_entry(sbi, segno)->ckpt_valid_blocks += del; + } + + if (__is_large_section(sbi)) + sanity_check_valid_blocks(sbi, segno); + + return del; +} + +/* + * If releasing blocks, this function supports updating multiple consecutive blocks + * at one time, but please note that these consecutive blocks need to belong to the + * same segment. + */ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del) { struct seg_entry *se; @@ -191,131 +2567,213 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del) long int new_vblocks; segno = GET_SEGNO(sbi, blkaddr); + if (segno == NULL_SEGNO) + return; se = get_seg_entry(sbi, segno); new_vblocks = se->valid_blocks + del; - offset = GET_SEGOFF_FROM_SEG0(sbi, blkaddr) & (sbi->blocks_per_seg - 1); + offset = GET_BLKOFF_FROM_SEG0(sbi, blkaddr); - BUG_ON((new_vblocks >> (sizeof(unsigned short) << 3) || - (new_vblocks > sbi->blocks_per_seg))); + f2fs_bug_on(sbi, (new_vblocks < 0 || + (new_vblocks > f2fs_usable_blks_in_seg(sbi, segno)))); se->valid_blocks = new_vblocks; - se->mtime = get_mtime(sbi); - SIT_I(sbi)->max_mtime = se->mtime; /* Update valid block bitmap */ if (del > 0) { - if (f2fs_set_bit(offset, se->cur_valid_map)) - BUG(); + del = update_sit_entry_for_alloc(sbi, se, segno, blkaddr, offset, del); } else { - if (!f2fs_clear_bit(offset, se->cur_valid_map)) - BUG(); + del = update_sit_entry_for_release(sbi, se, segno, blkaddr, offset, del); } - if (!f2fs_test_bit(offset, se->ckpt_valid_map)) - se->ckpt_valid_blocks += del; __mark_sit_entry_dirty(sbi, segno); /* update total number of valid blocks to be written in ckpt area */ 
SIT_I(sbi)->written_valid_blocks += del; - if (sbi->segs_per_sec > 1) + if (__is_large_section(sbi)) get_sec_entry(sbi, segno)->valid_blocks += del; } -static void refresh_sit_entry(struct f2fs_sb_info *sbi, - block_t old_blkaddr, block_t new_blkaddr) -{ - update_sit_entry(sbi, new_blkaddr, 1); - if (GET_SEGNO(sbi, old_blkaddr) != NULL_SEGNO) - update_sit_entry(sbi, old_blkaddr, -1); -} - -void invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr) +void f2fs_invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr, + unsigned int len) { unsigned int segno = GET_SEGNO(sbi, addr); struct sit_info *sit_i = SIT_I(sbi); + block_t addr_start = addr, addr_end = addr + len - 1; + unsigned int seg_num = GET_SEGNO(sbi, addr_end) - segno + 1; + unsigned int i = 1, max_blocks = sbi->blocks_per_seg, cnt; - BUG_ON(addr == NULL_ADDR); - if (addr == NEW_ADDR) + f2fs_bug_on(sbi, addr == NULL_ADDR); + if (addr == NEW_ADDR || addr == COMPRESS_ADDR) return; + f2fs_invalidate_internal_cache(sbi, addr, len); + /* add it into sit main buffer */ - mutex_lock(&sit_i->sentry_lock); + down_write(&sit_i->sentry_lock); + + if (seg_num == 1) + cnt = len; + else + cnt = max_blocks - GET_BLKOFF_FROM_SEG0(sbi, addr); - update_sit_entry(sbi, addr, -1); + do { + update_segment_mtime(sbi, addr_start, 0); + update_sit_entry(sbi, addr_start, -cnt); - /* add it into dirty seglist */ - locate_dirty_segment(sbi, segno); + /* add it into dirty seglist */ + locate_dirty_segment(sbi, segno); - mutex_unlock(&sit_i->sentry_lock); + /* update @addr_start and @cnt and @segno */ + addr_start = START_BLOCK(sbi, ++segno); + if (++i == seg_num) + cnt = GET_BLKOFF_FROM_SEG0(sbi, addr_end) + 1; + else + cnt = max_blocks; + } while (i <= seg_num); + + up_write(&sit_i->sentry_lock); } -/* - * This function should be resided under the curseg_mutex lock - */ -static void __add_sum_entry(struct f2fs_sb_info *sbi, int type, - struct f2fs_summary *sum) +bool f2fs_is_checkpointed_data(struct f2fs_sb_info *sbi, block_t 
blkaddr) +{ + struct sit_info *sit_i = SIT_I(sbi); + unsigned int segno, offset; + struct seg_entry *se; + bool is_cp = false; + + if (!__is_valid_data_blkaddr(blkaddr)) + return true; + + down_read(&sit_i->sentry_lock); + + segno = GET_SEGNO(sbi, blkaddr); + se = get_seg_entry(sbi, segno); + offset = GET_BLKOFF_FROM_SEG0(sbi, blkaddr); + + if (f2fs_test_bit(offset, se->ckpt_valid_map)) + is_cp = true; + + up_read(&sit_i->sentry_lock); + + return is_cp; +} + +static unsigned short f2fs_curseg_valid_blocks(struct f2fs_sb_info *sbi, int type) { struct curseg_info *curseg = CURSEG_I(sbi, type); - void *addr = curseg->sum_blk; - addr += curseg->next_blkoff * sizeof(struct f2fs_summary); - memcpy(addr, sum, sizeof(struct f2fs_summary)); - return; + + if (sbi->ckpt->alloc_type[type] == SSR) + return BLKS_PER_SEG(sbi); + return curseg->next_blkoff; } /* * Calculate the number of current summary pages for writing */ -int npages_for_summary_flush(struct f2fs_sb_info *sbi) +int f2fs_npages_for_summary_flush(struct f2fs_sb_info *sbi, bool for_ra) { - int total_size_bytes = 0; int valid_sum_count = 0; - int i, sum_space; + int i, sum_in_page; for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) { - if (sbi->ckpt->alloc_type[i] == SSR) - valid_sum_count += sbi->blocks_per_seg; + if (sbi->ckpt->alloc_type[i] != SSR && for_ra) + valid_sum_count += + le16_to_cpu(F2FS_CKPT(sbi)->cur_data_blkoff[i]); else - valid_sum_count += curseg_blkoff(sbi, i); + valid_sum_count += f2fs_curseg_valid_blocks(sbi, i); } - total_size_bytes = valid_sum_count * (SUMMARY_SIZE + 1) - + sizeof(struct nat_journal) + 2 - + sizeof(struct sit_journal) + 2; - sum_space = PAGE_CACHE_SIZE - SUM_FOOTER_SIZE; - if (total_size_bytes < sum_space) + sum_in_page = (PAGE_SIZE - 2 * SUM_JOURNAL_SIZE - + SUM_FOOTER_SIZE) / SUMMARY_SIZE; + if (valid_sum_count <= sum_in_page) return 1; - else if (total_size_bytes < 2 * sum_space) + else if ((valid_sum_count - sum_in_page) <= + (PAGE_SIZE - SUM_FOOTER_SIZE) / 
SUMMARY_SIZE) return 2; return 3; } /* - * Caller should put this summary page + * Caller should put this summary folio */ -struct page *get_sum_page(struct f2fs_sb_info *sbi, unsigned int segno) +struct folio *f2fs_get_sum_folio(struct f2fs_sb_info *sbi, unsigned int segno) +{ + if (unlikely(f2fs_cp_error(sbi))) + return ERR_PTR(-EIO); + return f2fs_get_meta_folio_retry(sbi, GET_SUM_BLOCK(sbi, segno)); +} + +void f2fs_update_meta_page(struct f2fs_sb_info *sbi, + void *src, block_t blk_addr) { - return get_meta_page(sbi, GET_SUM_BLOCK(sbi, segno)); + struct folio *folio; + + if (SUMS_PER_BLOCK == 1) + folio = f2fs_grab_meta_folio(sbi, blk_addr); + else + folio = f2fs_get_meta_folio_retry(sbi, blk_addr); + + if (IS_ERR(folio)) + return; + + memcpy(folio_address(folio), src, PAGE_SIZE); + folio_mark_dirty(folio); + f2fs_folio_put(folio, true); } static void write_sum_page(struct f2fs_sb_info *sbi, - struct f2fs_summary_block *sum_blk, block_t blk_addr) + struct f2fs_summary_block *sum_blk, unsigned int segno) { - struct page *page = grab_meta_page(sbi, blk_addr); - void *kaddr = page_address(page); - memcpy(kaddr, sum_blk, PAGE_CACHE_SIZE); - set_page_dirty(page); - f2fs_put_page(page, 1); + struct folio *folio; + + if (SUMS_PER_BLOCK == 1) + return f2fs_update_meta_page(sbi, (void *)sum_blk, + GET_SUM_BLOCK(sbi, segno)); + + folio = f2fs_get_sum_folio(sbi, segno); + if (IS_ERR(folio)) + return; + + memcpy(SUM_BLK_PAGE_ADDR(folio, segno), sum_blk, sizeof(*sum_blk)); + folio_mark_dirty(folio); + f2fs_folio_put(folio, true); } -static int is_next_segment_free(struct f2fs_sb_info *sbi, int type) +static void write_current_sum_page(struct f2fs_sb_info *sbi, + int type, block_t blk_addr) { struct curseg_info *curseg = CURSEG_I(sbi, type); + struct folio *folio = f2fs_grab_meta_folio(sbi, blk_addr); + struct f2fs_summary_block *src = curseg->sum_blk; + struct f2fs_summary_block *dst; + + dst = folio_address(folio); + memset(dst, 0, PAGE_SIZE); + + 
mutex_lock(&curseg->curseg_mutex); + + down_read(&curseg->journal_rwsem); + memcpy(&dst->journal, curseg->journal, SUM_JOURNAL_SIZE); + up_read(&curseg->journal_rwsem); + + memcpy(dst->entries, src->entries, SUM_ENTRY_SIZE); + memcpy(&dst->footer, &src->footer, SUM_FOOTER_SIZE); + + mutex_unlock(&curseg->curseg_mutex); + + folio_mark_dirty(folio); + f2fs_folio_put(folio, true); +} + +static int is_next_segment_free(struct f2fs_sb_info *sbi, + struct curseg_info *curseg) +{ unsigned int segno = curseg->segno + 1; struct free_segmap_info *free_i = FREE_I(sbi); - if (segno < TOTAL_SEGS(sbi) && segno % sbi->segs_per_sec) + if (segno < MAIN_SEGS(sbi) && segno % SEGS_PER_SEC(sbi)) return !test_bit(segno, free_i->free_segmap); return 0; } @@ -324,58 +2782,95 @@ static int is_next_segment_free(struct f2fs_sb_info *sbi, int type) * Find a new segment from the free segments bitmap to right order * This function should be returned with success, otherwise BUG */ -static void get_new_segment(struct f2fs_sb_info *sbi, - unsigned int *newseg, bool new_sec, int dir) +static int get_new_segment(struct f2fs_sb_info *sbi, + unsigned int *newseg, bool new_sec, bool pinning) { struct free_segmap_info *free_i = FREE_I(sbi); unsigned int segno, secno, zoneno; - unsigned int total_zones = TOTAL_SECS(sbi) / sbi->secs_per_zone; - unsigned int hint = *newseg / sbi->segs_per_sec; - unsigned int old_zoneno = GET_ZONENO_FROM_SEGNO(sbi, *newseg); - unsigned int left_start = hint; + unsigned int total_zones = MAIN_SECS(sbi) / sbi->secs_per_zone; + unsigned int hint = GET_SEC_FROM_SEG(sbi, *newseg); + unsigned int old_zoneno = GET_ZONE_FROM_SEG(sbi, *newseg); + unsigned int alloc_policy = sbi->allocate_section_policy; + unsigned int alloc_hint = sbi->allocate_section_hint; bool init = true; - int go_left = 0; int i; + int ret = 0; + + spin_lock(&free_i->segmap_lock); - write_lock(&free_i->segmap_lock); + if (time_to_inject(sbi, FAULT_NO_SEGMENT)) { + ret = -ENOSPC; + goto out_unlock; + } - if 
(!new_sec && ((*newseg + 1) % sbi->segs_per_sec)) { + if (!new_sec && ((*newseg + 1) % SEGS_PER_SEC(sbi))) { segno = find_next_zero_bit(free_i->free_segmap, - TOTAL_SEGS(sbi), *newseg + 1); - if (segno - *newseg < sbi->segs_per_sec - - (*newseg % sbi->segs_per_sec)) + GET_SEG_FROM_SEC(sbi, hint + 1), *newseg + 1); + if (segno < GET_SEG_FROM_SEC(sbi, hint + 1)) goto got_it; } + +#ifdef CONFIG_BLK_DEV_ZONED + /* + * If we format f2fs on zoned storage, let's try to get pinned sections + * from beginning of the storage, which should be a conventional one. + */ + if (f2fs_sb_has_blkzoned(sbi)) { + /* Prioritize writing to conventional zones */ + if (sbi->blkzone_alloc_policy == BLKZONE_ALLOC_PRIOR_CONV || pinning) + segno = 0; + else + segno = max(sbi->first_seq_zone_segno, *newseg); + hint = GET_SEC_FROM_SEG(sbi, segno); + } +#endif + + /* + * Prevent allocate_section_hint from exceeding MAIN_SECS() + * due to desynchronization. + */ + if (alloc_policy != ALLOCATE_FORWARD_NOHINT && + alloc_hint > MAIN_SECS(sbi)) + alloc_hint = MAIN_SECS(sbi); + + if (alloc_policy == ALLOCATE_FORWARD_FROM_HINT && + hint < alloc_hint) + hint = alloc_hint; + else if (alloc_policy == ALLOCATE_FORWARD_WITHIN_HINT && + hint >= alloc_hint) + hint = 0; + find_other_zone: - secno = find_next_zero_bit(free_i->free_secmap, TOTAL_SECS(sbi), hint); - if (secno >= TOTAL_SECS(sbi)) { - if (dir == ALLOC_RIGHT) { - secno = find_next_zero_bit(free_i->free_secmap, - TOTAL_SECS(sbi), 0); - BUG_ON(secno >= TOTAL_SECS(sbi)); - } else { - go_left = 1; - left_start = hint - 1; + secno = find_next_zero_bit(free_i->free_secmap, MAIN_SECS(sbi), hint); + +#ifdef CONFIG_BLK_DEV_ZONED + if (secno >= MAIN_SECS(sbi) && f2fs_sb_has_blkzoned(sbi)) { + /* Write only to sequential zones */ + if (sbi->blkzone_alloc_policy == BLKZONE_ALLOC_ONLY_SEQ) { + hint = GET_SEC_FROM_SEG(sbi, sbi->first_seq_zone_segno); + secno = find_next_zero_bit(free_i->free_secmap, MAIN_SECS(sbi), hint); + } else + secno = 
find_first_zero_bit(free_i->free_secmap, + MAIN_SECS(sbi)); + if (secno >= MAIN_SECS(sbi)) { + ret = -ENOSPC; + f2fs_bug_on(sbi, 1); + goto out_unlock; } } - if (go_left == 0) - goto skip_left; +#endif - while (test_bit(left_start, free_i->free_secmap)) { - if (left_start > 0) { - left_start--; - continue; + if (secno >= MAIN_SECS(sbi)) { + secno = find_first_zero_bit(free_i->free_secmap, + MAIN_SECS(sbi)); + if (secno >= MAIN_SECS(sbi)) { + ret = -ENOSPC; + f2fs_bug_on(sbi, !pinning); + goto out_unlock; } - left_start = find_next_zero_bit(free_i->free_secmap, - TOTAL_SECS(sbi), 0); - BUG_ON(left_start >= TOTAL_SECS(sbi)); - break; } - secno = left_start; -skip_left: - hint = secno; - segno = secno * sbi->segs_per_sec; - zoneno = secno / sbi->secs_per_zone; + segno = GET_SEG_FROM_SEC(sbi, secno); + zoneno = GET_ZONE_FROM_SEC(sbi, secno); /* give up on finding another zone */ if (!init) @@ -384,21 +2879,13 @@ skip_left: goto got_it; if (zoneno == old_zoneno) goto got_it; - if (dir == ALLOC_LEFT) { - if (!go_left && zoneno + 1 >= total_zones) - goto got_it; - if (go_left && zoneno == 0) - goto got_it; - } for (i = 0; i < NR_CURSEG_TYPE; i++) if (CURSEG_I(sbi, i)->zone == zoneno) break; if (i < NR_CURSEG_TYPE) { /* zone is in user, try another */ - if (go_left) - hint = zoneno * sbi->secs_per_zone - 1; - else if (zoneno + 1 >= total_zones) + if (zoneno + 1 >= total_zones) hint = 0; else hint = (zoneno + 1) * sbi->secs_per_zone; @@ -407,96 +2894,176 @@ skip_left: } got_it: /* set it as dirty segment in free segmap */ - BUG_ON(test_bit(segno, free_i->free_segmap)); + if (test_bit(segno, free_i->free_segmap)) { + ret = -EFSCORRUPTED; + f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_CORRUPTED_FREE_BITMAP); + goto out_unlock; + } + + /* no free section in conventional device or conventional zone */ + if (new_sec && pinning && + f2fs_is_sequential_zone_area(sbi, START_BLOCK(sbi, segno))) { + ret = -EAGAIN; + goto out_unlock; + } __set_inuse(sbi, segno); *newseg = segno; - 
write_unlock(&free_i->segmap_lock); +out_unlock: + spin_unlock(&free_i->segmap_lock); + + if (ret == -ENOSPC && !pinning) + f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_NO_SEGMENT); + return ret; } static void reset_curseg(struct f2fs_sb_info *sbi, int type, int modified) { struct curseg_info *curseg = CURSEG_I(sbi, type); struct summary_footer *sum_footer; + unsigned short seg_type = curseg->seg_type; + + /* only happen when get_new_segment() fails */ + if (curseg->next_segno == NULL_SEGNO) + return; + curseg->inited = true; curseg->segno = curseg->next_segno; - curseg->zone = GET_ZONENO_FROM_SEGNO(sbi, curseg->segno); + curseg->zone = GET_ZONE_FROM_SEG(sbi, curseg->segno); curseg->next_blkoff = 0; curseg->next_segno = NULL_SEGNO; sum_footer = &(curseg->sum_blk->footer); memset(sum_footer, 0, sizeof(struct summary_footer)); - if (IS_DATASEG(type)) + + sanity_check_seg_type(sbi, seg_type); + + if (IS_DATASEG(seg_type)) SET_SUM_TYPE(sum_footer, SUM_TYPE_DATA); - if (IS_NODESEG(type)) + if (IS_NODESEG(seg_type)) SET_SUM_TYPE(sum_footer, SUM_TYPE_NODE); - __set_sit_entry_type(sbi, type, curseg->segno, modified); + __set_sit_entry_type(sbi, seg_type, curseg->segno, modified); +} + +static unsigned int __get_next_segno(struct f2fs_sb_info *sbi, int type) +{ + struct curseg_info *curseg = CURSEG_I(sbi, type); + unsigned short seg_type = curseg->seg_type; + + sanity_check_seg_type(sbi, seg_type); + if (__is_large_section(sbi)) { + if (f2fs_need_rand_seg(sbi)) { + unsigned int hint = GET_SEC_FROM_SEG(sbi, curseg->segno); + + if (GET_SEC_FROM_SEG(sbi, curseg->segno + 1) != hint) + return curseg->segno; + return get_random_u32_inclusive(curseg->segno + 1, + GET_SEG_FROM_SEC(sbi, hint + 1) - 1); + } + return curseg->segno; + } else if (f2fs_need_rand_seg(sbi)) { + return get_random_u32_below(MAIN_SECS(sbi) * SEGS_PER_SEC(sbi)); + } + + /* inmem log may not locate on any segment after mount */ + if (!curseg->inited) + return 0; + + if (unlikely(is_sbi_flag_set(sbi, 
SBI_CP_DISABLED))) + return 0; + + if (seg_type == CURSEG_HOT_DATA || IS_NODESEG(seg_type)) + return 0; + + if (SIT_I(sbi)->last_victim[ALLOC_NEXT]) + return SIT_I(sbi)->last_victim[ALLOC_NEXT]; + + /* find segments from 0 to reuse freed segments */ + if (F2FS_OPTION(sbi).alloc_mode == ALLOC_MODE_REUSE) + return 0; + + return curseg->segno; +} + +static void reset_curseg_fields(struct curseg_info *curseg) +{ + curseg->inited = false; + curseg->segno = NULL_SEGNO; + curseg->next_segno = 0; } /* * Allocate a current working segment. * This function always allocates a free segment in LFS manner. */ -static void new_curseg(struct f2fs_sb_info *sbi, int type, bool new_sec) +static int new_curseg(struct f2fs_sb_info *sbi, int type, bool new_sec) { struct curseg_info *curseg = CURSEG_I(sbi, type); unsigned int segno = curseg->segno; - int dir = ALLOC_LEFT; - - write_sum_page(sbi, curseg->sum_blk, - GET_SUM_BLOCK(sbi, segno)); - if (type == CURSEG_WARM_DATA || type == CURSEG_COLD_DATA) - dir = ALLOC_RIGHT; - - if (test_opt(sbi, NOHEAP)) - dir = ALLOC_RIGHT; + bool pinning = type == CURSEG_COLD_DATA_PINNED; + int ret; + + if (curseg->inited) + write_sum_page(sbi, curseg->sum_blk, segno); + + segno = __get_next_segno(sbi, type); + ret = get_new_segment(sbi, &segno, new_sec, pinning); + if (ret) { + if (ret == -ENOSPC) + reset_curseg_fields(curseg); + return ret; + } - get_new_segment(sbi, &segno, new_sec, dir); curseg->next_segno = segno; reset_curseg(sbi, type, 1); curseg->alloc_type = LFS; + if (F2FS_OPTION(sbi).fs_mode == FS_MODE_FRAGMENT_BLK) + curseg->fragment_remained_chunk = + get_random_u32_inclusive(1, sbi->max_fragment_chunk); + return 0; } -static void __next_free_blkoff(struct f2fs_sb_info *sbi, - struct curseg_info *seg, block_t start) +static int __next_free_blkoff(struct f2fs_sb_info *sbi, + int segno, block_t start) { - struct seg_entry *se = get_seg_entry(sbi, seg->segno); - block_t ofs; - for (ofs = start; ofs < sbi->blocks_per_seg; ofs++) { - if 
(!f2fs_test_bit(ofs, se->ckpt_valid_map) - && !f2fs_test_bit(ofs, se->cur_valid_map)) - break; - } - seg->next_blkoff = ofs; + struct seg_entry *se = get_seg_entry(sbi, segno); + int entries = SIT_VBLOCK_MAP_SIZE / sizeof(unsigned long); + unsigned long *target_map = SIT_I(sbi)->tmp_map; + unsigned long *ckpt_map = (unsigned long *)se->ckpt_valid_map; + unsigned long *cur_map = (unsigned long *)se->cur_valid_map; + int i; + + for (i = 0; i < entries; i++) + target_map[i] = ckpt_map[i] | cur_map[i]; + + return __find_rev_next_zero_bit(target_map, BLKS_PER_SEG(sbi), start); } -/* - * If a segment is written by LFS manner, next block offset is just obtained - * by increasing the current block offset. However, if a segment is written by - * SSR manner, next block offset obtained by calling __next_free_blkoff - */ -static void __refresh_next_blkoff(struct f2fs_sb_info *sbi, - struct curseg_info *seg) +static int f2fs_find_next_ssr_block(struct f2fs_sb_info *sbi, + struct curseg_info *seg) { - if (seg->alloc_type == SSR) - __next_free_blkoff(sbi, seg, seg->next_blkoff + 1); - else - seg->next_blkoff++; + return __next_free_blkoff(sbi, seg->segno, seg->next_blkoff + 1); +} + +bool f2fs_segment_has_free_slot(struct f2fs_sb_info *sbi, int segno) +{ + return __next_free_blkoff(sbi, segno, 0) < BLKS_PER_SEG(sbi); } /* - * This function always allocates a used segment (from dirty seglist) by SSR + * This function always allocates a used segment(from dirty seglist) by SSR * manner, so it should recover the existing segment information of valid blocks */ -static void change_curseg(struct f2fs_sb_info *sbi, int type, bool reuse) +static int change_curseg(struct f2fs_sb_info *sbi, int type) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, type); unsigned int new_segno = curseg->next_segno; struct f2fs_summary_block *sum_node; - struct page *sum_page; + struct folio *sum_folio; + + if (curseg->inited) + write_sum_page(sbi, 
curseg->sum_blk, curseg->segno); - write_sum_page(sbi, curseg->sum_blk, - GET_SUM_BLOCK(sbi, curseg->segno)); __set_test_and_inuse(sbi, new_segno); mutex_lock(&dirty_i->seglist_lock); @@ -506,450 +3073,1216 @@ static void change_curseg(struct f2fs_sb_info *sbi, int type, bool reuse) reset_curseg(sbi, type, 1); curseg->alloc_type = SSR; - __next_free_blkoff(sbi, curseg, 0); + curseg->next_blkoff = __next_free_blkoff(sbi, curseg->segno, 0); - if (reuse) { - sum_page = get_sum_page(sbi, new_segno); - sum_node = (struct f2fs_summary_block *)page_address(sum_page); - memcpy(curseg->sum_blk, sum_node, SUM_ENTRY_SIZE); - f2fs_put_page(sum_page, 1); + sum_folio = f2fs_get_sum_folio(sbi, new_segno); + if (IS_ERR(sum_folio)) { + /* GC won't be able to use stale summary pages by cp_error */ + memset(curseg->sum_blk, 0, SUM_ENTRY_SIZE); + return PTR_ERR(sum_folio); } + sum_node = SUM_BLK_PAGE_ADDR(sum_folio, new_segno); + memcpy(curseg->sum_blk, sum_node, SUM_ENTRY_SIZE); + f2fs_folio_put(sum_folio, true); + return 0; } -static int get_ssr_segment(struct f2fs_sb_info *sbi, int type) +static int get_ssr_segment(struct f2fs_sb_info *sbi, int type, + int alloc_mode, unsigned long long age); + +static int get_atssr_segment(struct f2fs_sb_info *sbi, int type, + int target_type, int alloc_mode, + unsigned long long age) { struct curseg_info *curseg = CURSEG_I(sbi, type); - const struct victim_selection *v_ops = DIRTY_I(sbi)->v_ops; + int ret = 0; - if (IS_NODESEG(type) || !has_not_enough_free_secs(sbi, 0)) - return v_ops->get_victim(sbi, - &(curseg)->next_segno, BG_GC, type, SSR); + curseg->seg_type = target_type; - /* For data segments, let's do SSR more intensively */ - for (; type >= CURSEG_HOT_DATA; type--) - if (v_ops->get_victim(sbi, &(curseg)->next_segno, - BG_GC, type, SSR)) - return 1; - return 0; + if (get_ssr_segment(sbi, type, alloc_mode, age)) { + struct seg_entry *se = get_seg_entry(sbi, curseg->next_segno); + + curseg->seg_type = se->type; + ret = change_curseg(sbi, 
type); + } else { + /* allocate cold segment by default */ + curseg->seg_type = CURSEG_COLD_DATA; + ret = new_curseg(sbi, type, true); + } + stat_inc_seg_type(sbi, curseg); + return ret; } -/* - * flush out current segment and replace it with new segment - * This function should be returned with success, otherwise BUG - */ -static void allocate_segment_by_default(struct f2fs_sb_info *sbi, - int type, bool force) +static int __f2fs_init_atgc_curseg(struct f2fs_sb_info *sbi, bool force) +{ + struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_ALL_DATA_ATGC); + int ret = 0; + + if (!sbi->am.atgc_enabled && !force) + return 0; + + f2fs_down_read(&SM_I(sbi)->curseg_lock); + + mutex_lock(&curseg->curseg_mutex); + down_write(&SIT_I(sbi)->sentry_lock); + + ret = get_atssr_segment(sbi, CURSEG_ALL_DATA_ATGC, + CURSEG_COLD_DATA, SSR, 0); + + up_write(&SIT_I(sbi)->sentry_lock); + mutex_unlock(&curseg->curseg_mutex); + + f2fs_up_read(&SM_I(sbi)->curseg_lock); + return ret; +} + +int f2fs_init_inmem_curseg(struct f2fs_sb_info *sbi) +{ + return __f2fs_init_atgc_curseg(sbi, false); +} + +int f2fs_reinit_atgc_curseg(struct f2fs_sb_info *sbi) +{ + int ret; + + if (!test_opt(sbi, ATGC)) + return 0; + if (sbi->am.atgc_enabled) + return 0; + if (le64_to_cpu(F2FS_CKPT(sbi)->elapsed_time) < + sbi->am.age_threshold) + return 0; + + ret = __f2fs_init_atgc_curseg(sbi, true); + if (!ret) { + sbi->am.atgc_enabled = true; + f2fs_info(sbi, "reenabled age threshold GC"); + } + return ret; +} + +static void __f2fs_save_inmem_curseg(struct f2fs_sb_info *sbi, int type) { struct curseg_info *curseg = CURSEG_I(sbi, type); - if (force) { - new_curseg(sbi, type, true); + mutex_lock(&curseg->curseg_mutex); + if (!curseg->inited) goto out; + + if (get_valid_blocks(sbi, curseg->segno, false)) { + write_sum_page(sbi, curseg->sum_blk, curseg->segno); + } else { + mutex_lock(&DIRTY_I(sbi)->seglist_lock); + __set_test_and_free(sbi, curseg->segno, true); + mutex_unlock(&DIRTY_I(sbi)->seglist_lock); } +out: + 
mutex_unlock(&curseg->curseg_mutex); +} - if (type == CURSEG_WARM_NODE) - new_curseg(sbi, type, false); - else if (curseg->alloc_type == LFS && is_next_segment_free(sbi, type)) - new_curseg(sbi, type, false); - else if (need_SSR(sbi) && get_ssr_segment(sbi, type)) - change_curseg(sbi, type, true); - else - new_curseg(sbi, type, false); +void f2fs_save_inmem_curseg(struct f2fs_sb_info *sbi) +{ + __f2fs_save_inmem_curseg(sbi, CURSEG_COLD_DATA_PINNED); + + if (sbi->am.atgc_enabled) + __f2fs_save_inmem_curseg(sbi, CURSEG_ALL_DATA_ATGC); +} + +static void __f2fs_restore_inmem_curseg(struct f2fs_sb_info *sbi, int type) +{ + struct curseg_info *curseg = CURSEG_I(sbi, type); + + mutex_lock(&curseg->curseg_mutex); + if (!curseg->inited) + goto out; + if (get_valid_blocks(sbi, curseg->segno, false)) + goto out; + + mutex_lock(&DIRTY_I(sbi)->seglist_lock); + __set_test_and_inuse(sbi, curseg->segno); + mutex_unlock(&DIRTY_I(sbi)->seglist_lock); out: -#ifdef CONFIG_F2FS_STAT_FS - sbi->segment_count[curseg->alloc_type]++; -#endif - return; + mutex_unlock(&curseg->curseg_mutex); } -void allocate_new_segments(struct f2fs_sb_info *sbi) +void f2fs_restore_inmem_curseg(struct f2fs_sb_info *sbi) { - struct curseg_info *curseg; - unsigned int old_curseg; - int i; + __f2fs_restore_inmem_curseg(sbi, CURSEG_COLD_DATA_PINNED); - for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) { - curseg = CURSEG_I(sbi, i); - old_curseg = curseg->segno; - SIT_I(sbi)->s_ops->allocate_segment(sbi, i, true); - locate_dirty_segment(sbi, old_curseg); + if (sbi->am.atgc_enabled) + __f2fs_restore_inmem_curseg(sbi, CURSEG_ALL_DATA_ATGC); +} + +static int get_ssr_segment(struct f2fs_sb_info *sbi, int type, + int alloc_mode, unsigned long long age) +{ + struct curseg_info *curseg = CURSEG_I(sbi, type); + unsigned segno = NULL_SEGNO; + unsigned short seg_type = curseg->seg_type; + int i, cnt; + bool reversed = false; + + sanity_check_seg_type(sbi, seg_type); + + /* f2fs_need_SSR() already forces to do this */ + 
if (!f2fs_get_victim(sbi, &segno, BG_GC, seg_type, + alloc_mode, age, false)) { + curseg->next_segno = segno; + return 1; } + + /* For node segments, let's do SSR more intensively */ + if (IS_NODESEG(seg_type)) { + if (seg_type >= CURSEG_WARM_NODE) { + reversed = true; + i = CURSEG_COLD_NODE; + } else { + i = CURSEG_HOT_NODE; + } + cnt = NR_CURSEG_NODE_TYPE; + } else { + if (seg_type >= CURSEG_WARM_DATA) { + reversed = true; + i = CURSEG_COLD_DATA; + } else { + i = CURSEG_HOT_DATA; + } + cnt = NR_CURSEG_DATA_TYPE; + } + + for (; cnt-- > 0; reversed ? i-- : i++) { + if (i == seg_type) + continue; + if (!f2fs_get_victim(sbi, &segno, BG_GC, i, + alloc_mode, age, false)) { + curseg->next_segno = segno; + return 1; + } + } + + /* find valid_blocks=0 in dirty list */ + if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) { + segno = get_free_segment(sbi); + if (segno != NULL_SEGNO) { + curseg->next_segno = segno; + return 1; + } + } + return 0; } -static const struct segment_allocation default_salloc_ops = { - .allocate_segment = allocate_segment_by_default, -}; +static bool need_new_seg(struct f2fs_sb_info *sbi, int type) +{ + struct curseg_info *curseg = CURSEG_I(sbi, type); + + if (!is_set_ckpt_flags(sbi, CP_CRC_RECOVERY_FLAG) && + curseg->seg_type == CURSEG_WARM_NODE) + return true; + if (curseg->alloc_type == LFS && is_next_segment_free(sbi, curseg) && + likely(!is_sbi_flag_set(sbi, SBI_CP_DISABLED))) + return true; + if (!f2fs_need_SSR(sbi) || !get_ssr_segment(sbi, type, SSR, 0)) + return true; + return false; +} + +int f2fs_allocate_segment_for_resize(struct f2fs_sb_info *sbi, int type, + unsigned int start, unsigned int end) +{ + struct curseg_info *curseg = CURSEG_I(sbi, type); + unsigned int segno; + int ret = 0; + + f2fs_down_read(&SM_I(sbi)->curseg_lock); + mutex_lock(&curseg->curseg_mutex); + down_write(&SIT_I(sbi)->sentry_lock); + + segno = CURSEG_I(sbi, type)->segno; + if (segno < start || segno > end) + goto unlock; + + if (f2fs_need_SSR(sbi) && 
get_ssr_segment(sbi, type, SSR, 0)) + ret = change_curseg(sbi, type); + else + ret = new_curseg(sbi, type, true); + + stat_inc_seg_type(sbi, curseg); + + locate_dirty_segment(sbi, segno); +unlock: + up_write(&SIT_I(sbi)->sentry_lock); + + if (segno != curseg->segno) + f2fs_notice(sbi, "For resize: curseg of type %d: %u ==> %u", + type, segno, curseg->segno); + + mutex_unlock(&curseg->curseg_mutex); + f2fs_up_read(&SM_I(sbi)->curseg_lock); + return ret; +} -static void f2fs_end_io_write(struct bio *bio, int err) +static int __allocate_new_segment(struct f2fs_sb_info *sbi, int type, + bool new_sec, bool force) { - const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); - struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; - struct bio_private *p = bio->bi_private; + struct curseg_info *curseg = CURSEG_I(sbi, type); + unsigned int old_segno; + int err = 0; - do { - struct page *page = bvec->bv_page; - - if (--bvec >= bio->bi_io_vec) - prefetchw(&bvec->bv_page->flags); - if (!uptodate) { - SetPageError(page); - if (page->mapping) - set_bit(AS_EIO, &page->mapping->flags); - set_ckpt_flags(p->sbi->ckpt, CP_ERROR_FLAG); - p->sbi->sb->s_flags |= MS_RDONLY; - } - end_page_writeback(page); - dec_page_count(p->sbi, F2FS_WRITEBACK); - } while (bvec >= bio->bi_io_vec); - - if (p->is_sync) - complete(p->wait); - kfree(p); - bio_put(bio); + if (type == CURSEG_COLD_DATA_PINNED && !curseg->inited) + goto allocate; + + if (!force && curseg->inited && + !curseg->next_blkoff && + !get_valid_blocks(sbi, curseg->segno, new_sec) && + !get_ckpt_valid_blocks(sbi, curseg->segno, new_sec)) + return 0; + +allocate: + old_segno = curseg->segno; + err = new_curseg(sbi, type, true); + if (err) + return err; + stat_inc_seg_type(sbi, curseg); + locate_dirty_segment(sbi, old_segno); + return 0; +} + +int f2fs_allocate_new_section(struct f2fs_sb_info *sbi, int type, bool force) +{ + int ret; + + f2fs_down_read(&SM_I(sbi)->curseg_lock); + down_write(&SIT_I(sbi)->sentry_lock); + ret = 
__allocate_new_segment(sbi, type, true, force); + up_write(&SIT_I(sbi)->sentry_lock); + f2fs_up_read(&SM_I(sbi)->curseg_lock); + + return ret; } -struct bio *f2fs_bio_alloc(struct block_device *bdev, int npages) +int f2fs_allocate_pinning_section(struct f2fs_sb_info *sbi) { - struct bio *bio; - struct bio_private *priv; + int err; + bool gc_required = true; + retry: - priv = kmalloc(sizeof(struct bio_private), GFP_NOFS); - if (!priv) { - cond_resched(); - goto retry; + f2fs_lock_op(sbi); + err = f2fs_allocate_new_section(sbi, CURSEG_COLD_DATA_PINNED, false); + f2fs_unlock_op(sbi); + + if (f2fs_sb_has_blkzoned(sbi) && err == -EAGAIN && gc_required) { + f2fs_down_write(&sbi->gc_lock); + err = f2fs_gc_range(sbi, 0, sbi->first_seq_zone_segno - 1, + true, ZONED_PIN_SEC_REQUIRED_COUNT); + f2fs_up_write(&sbi->gc_lock); + + gc_required = false; + if (!err) + goto retry; } - /* No failure on bio allocation */ - bio = bio_alloc(GFP_NOIO, npages); - bio->bi_bdev = bdev; - bio->bi_private = priv; - return bio; + return err; } -static void do_submit_bio(struct f2fs_sb_info *sbi, - enum page_type type, bool sync) +int f2fs_allocate_new_segments(struct f2fs_sb_info *sbi) { - int rw = sync ? WRITE_SYNC : WRITE; - enum page_type btype = type > META ? 
META : type; - - if (type >= META_FLUSH) - rw = WRITE_FLUSH_FUA; + int i; + int err = 0; - if (btype == META) - rw |= REQ_META; + f2fs_down_read(&SM_I(sbi)->curseg_lock); + down_write(&SIT_I(sbi)->sentry_lock); + for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) + err += __allocate_new_segment(sbi, i, false, false); + up_write(&SIT_I(sbi)->sentry_lock); + f2fs_up_read(&SM_I(sbi)->curseg_lock); - if (sbi->bio[btype]) { - struct bio_private *p = sbi->bio[btype]->bi_private; - p->sbi = sbi; - sbi->bio[btype]->bi_end_io = f2fs_end_io_write; + return err; +} - trace_f2fs_do_submit_bio(sbi->sb, btype, sync, sbi->bio[btype]); +bool f2fs_exist_trim_candidates(struct f2fs_sb_info *sbi, + struct cp_control *cpc) +{ + __u64 trim_start = cpc->trim_start; + bool has_candidate = false; - if (type == META_FLUSH) { - DECLARE_COMPLETION_ONSTACK(wait); - p->is_sync = true; - p->wait = &wait; - submit_bio(rw, sbi->bio[btype]); - wait_for_completion(&wait); - } else { - p->is_sync = false; - submit_bio(rw, sbi->bio[btype]); + down_write(&SIT_I(sbi)->sentry_lock); + for (; cpc->trim_start <= cpc->trim_end; cpc->trim_start++) { + if (add_discard_addrs(sbi, cpc, true)) { + has_candidate = true; + break; } - sbi->bio[btype] = NULL; } -} + up_write(&SIT_I(sbi)->sentry_lock); -void f2fs_submit_bio(struct f2fs_sb_info *sbi, enum page_type type, bool sync) -{ - down_write(&sbi->bio_sem); - do_submit_bio(sbi, type, sync); - up_write(&sbi->bio_sem); + cpc->trim_start = trim_start; + return has_candidate; } -static void submit_write_page(struct f2fs_sb_info *sbi, struct page *page, - block_t blk_addr, enum page_type type) +static unsigned int __issue_discard_cmd_range(struct f2fs_sb_info *sbi, + struct discard_policy *dpolicy, + unsigned int start, unsigned int end) { - struct block_device *bdev = sbi->sb->s_bdev; + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct discard_cmd *prev_dc = NULL, *next_dc = NULL; + struct rb_node **insert_p = NULL, *insert_parent = NULL; + struct 
discard_cmd *dc; + struct blk_plug plug; + int issued; + unsigned int trimmed = 0; + +next: + issued = 0; + + mutex_lock(&dcc->cmd_lock); + if (unlikely(dcc->rbtree_check)) + f2fs_bug_on(sbi, !f2fs_check_discard_tree(sbi)); + + dc = __lookup_discard_cmd_ret(&dcc->root, start, + &prev_dc, &next_dc, &insert_p, &insert_parent); + if (!dc) + dc = next_dc; + + blk_start_plug(&plug); + + while (dc && dc->di.lstart <= end) { + struct rb_node *node; + int err = 0; + + if (dc->di.len < dpolicy->granularity) + goto skip; + + if (dc->state != D_PREP) { + list_move_tail(&dc->list, &dcc->fstrim_list); + goto skip; + } - verify_block_addr(sbi, blk_addr); + err = __submit_discard_cmd(sbi, dpolicy, dc, &issued); - down_write(&sbi->bio_sem); + if (issued >= dpolicy->max_requests) { + start = dc->di.lstart + dc->di.len; - inc_page_count(sbi, F2FS_WRITEBACK); + if (err) + __remove_discard_cmd(sbi, dc); - if (sbi->bio[type] && sbi->last_block_in_bio[type] != blk_addr - 1) - do_submit_bio(sbi, type, false); -alloc_new: - if (sbi->bio[type] == NULL) { - sbi->bio[type] = f2fs_bio_alloc(bdev, max_hw_blocks(sbi)); - sbi->bio[type]->bi_sector = SECTOR_FROM_BLOCK(sbi, blk_addr); - /* - * The end_io will be assigned at the sumbission phase. - * Until then, let bio_add_page() merge consecutive IOs as much - * as possible. 
- */ + blk_finish_plug(&plug); + mutex_unlock(&dcc->cmd_lock); + trimmed += __wait_all_discard_cmd(sbi, NULL); + f2fs_schedule_timeout(DEFAULT_DISCARD_INTERVAL); + goto next; + } +skip: + node = rb_next(&dc->rb_node); + if (err) + __remove_discard_cmd(sbi, dc); + dc = rb_entry_safe(node, struct discard_cmd, rb_node); + + if (fatal_signal_pending(current)) + break; } - if (bio_add_page(sbi->bio[type], page, PAGE_CACHE_SIZE, 0) < - PAGE_CACHE_SIZE) { - do_submit_bio(sbi, type, false); - goto alloc_new; + blk_finish_plug(&plug); + mutex_unlock(&dcc->cmd_lock); + + return trimmed; +} + +int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) +{ + __u64 start = F2FS_BYTES_TO_BLK(range->start); + __u64 end = start + F2FS_BYTES_TO_BLK(range->len) - 1; + unsigned int start_segno, end_segno; + block_t start_block, end_block; + struct cp_control cpc; + struct discard_policy dpolicy; + unsigned long long trimmed = 0; + int err = 0; + bool need_align = f2fs_lfs_mode(sbi) && __is_large_section(sbi); + + if (start >= MAX_BLKADDR(sbi) || range->len < sbi->blocksize) + return -EINVAL; + + if (end < MAIN_BLKADDR(sbi)) + goto out; + + if (is_sbi_flag_set(sbi, SBI_NEED_FSCK)) { + f2fs_warn(sbi, "Found FS corruption, run fsck to fix."); + return -EFSCORRUPTED; + } + + /* start/end segment number in main_area */ + start_segno = (start <= MAIN_BLKADDR(sbi)) ? 0 : GET_SEGNO(sbi, start); + end_segno = (end >= MAX_BLKADDR(sbi)) ? 
MAIN_SEGS(sbi) - 1 : + GET_SEGNO(sbi, end); + if (need_align) { + start_segno = rounddown(start_segno, SEGS_PER_SEC(sbi)); + end_segno = roundup(end_segno + 1, SEGS_PER_SEC(sbi)) - 1; } - sbi->last_block_in_bio[type] = blk_addr; + cpc.reason = CP_DISCARD; + cpc.trim_minlen = max_t(__u64, 1, F2FS_BYTES_TO_BLK(range->minlen)); + cpc.trim_start = start_segno; + cpc.trim_end = end_segno; + + if (sbi->discard_blks == 0) + goto out; + + f2fs_down_write(&sbi->gc_lock); + stat_inc_cp_call_count(sbi, TOTAL_CALL); + err = f2fs_write_checkpoint(sbi, &cpc); + f2fs_up_write(&sbi->gc_lock); + if (err) + goto out; + + /* + * We filed discard candidates, but actually we don't need to wait for + * all of them, since they'll be issued in idle time along with runtime + * discard option. User configuration looks like using runtime discard + * or periodic fstrim instead of it. + */ + if (f2fs_realtime_discard_enable(sbi)) + goto out; + + start_block = START_BLOCK(sbi, start_segno); + end_block = START_BLOCK(sbi, end_segno + 1); + + __init_discard_policy(sbi, &dpolicy, DPOLICY_FSTRIM, cpc.trim_minlen); + trimmed = __issue_discard_cmd_range(sbi, &dpolicy, + start_block, end_block); - up_write(&sbi->bio_sem); - trace_f2fs_submit_write_page(page, blk_addr, type); + trimmed += __wait_discard_cmd_range(sbi, &dpolicy, + start_block, end_block); +out: + if (!err) + range->len = F2FS_BLK_TO_BYTES(trimmed); + return err; } -static bool __has_curseg_space(struct f2fs_sb_info *sbi, int type) +int f2fs_rw_hint_to_seg_type(struct f2fs_sb_info *sbi, enum rw_hint hint) { - struct curseg_info *curseg = CURSEG_I(sbi, type); - if (curseg->next_blkoff < sbi->blocks_per_seg) - return true; - return false; + if (F2FS_OPTION(sbi).active_logs == 2) + return CURSEG_HOT_DATA; + else if (F2FS_OPTION(sbi).active_logs == 4) + return CURSEG_COLD_DATA; + + /* active_log == 6 */ + switch (hint) { + case WRITE_LIFE_SHORT: + return CURSEG_HOT_DATA; + case WRITE_LIFE_EXTREME: + return CURSEG_COLD_DATA; + default: + 
return CURSEG_WARM_DATA; + } +} + +/* + * This returns write hints for each segment type. This hints will be + * passed down to block layer as below by default. + * + * User F2FS Block + * ---- ---- ----- + * META WRITE_LIFE_NONE|REQ_META + * HOT_NODE WRITE_LIFE_NONE + * WARM_NODE WRITE_LIFE_MEDIUM + * COLD_NODE WRITE_LIFE_LONG + * ioctl(COLD) COLD_DATA WRITE_LIFE_EXTREME + * extension list " " + * + * -- buffered io + * COLD_DATA WRITE_LIFE_EXTREME + * HOT_DATA WRITE_LIFE_SHORT + * WARM_DATA WRITE_LIFE_NOT_SET + * + * -- direct io + * WRITE_LIFE_EXTREME COLD_DATA WRITE_LIFE_EXTREME + * WRITE_LIFE_SHORT HOT_DATA WRITE_LIFE_SHORT + * WRITE_LIFE_NOT_SET WARM_DATA WRITE_LIFE_NOT_SET + * WRITE_LIFE_NONE " WRITE_LIFE_NONE + * WRITE_LIFE_MEDIUM " WRITE_LIFE_MEDIUM + * WRITE_LIFE_LONG " WRITE_LIFE_LONG + */ +enum rw_hint f2fs_io_type_to_rw_hint(struct f2fs_sb_info *sbi, + enum page_type type, enum temp_type temp) +{ + switch (type) { + case DATA: + switch (temp) { + case WARM: + return WRITE_LIFE_NOT_SET; + case HOT: + return WRITE_LIFE_SHORT; + case COLD: + return WRITE_LIFE_EXTREME; + default: + return WRITE_LIFE_NONE; + } + case NODE: + switch (temp) { + case WARM: + return WRITE_LIFE_MEDIUM; + case HOT: + return WRITE_LIFE_NONE; + case COLD: + return WRITE_LIFE_LONG; + default: + return WRITE_LIFE_NONE; + } + case META: + return WRITE_LIFE_NONE; + default: + return WRITE_LIFE_NONE; + } } -static int __get_segment_type_2(struct page *page, enum page_type p_type) +static int __get_segment_type_2(struct f2fs_io_info *fio) { - if (p_type == DATA) + if (fio->type == DATA) return CURSEG_HOT_DATA; else return CURSEG_HOT_NODE; } -static int __get_segment_type_4(struct page *page, enum page_type p_type) +static int __get_segment_type_4(struct f2fs_io_info *fio) { - if (p_type == DATA) { - struct inode *inode = page->mapping->host; + if (fio->type == DATA) { + struct inode *inode = fio_inode(fio); if (S_ISDIR(inode->i_mode)) return CURSEG_HOT_DATA; else return CURSEG_COLD_DATA; 
} else { - if (IS_DNODE(page) && !is_cold_node(page)) - return CURSEG_HOT_NODE; + if (IS_DNODE(fio->folio) && is_cold_node(fio->folio)) + return CURSEG_WARM_NODE; else return CURSEG_COLD_NODE; } } -static int __get_segment_type_6(struct page *page, enum page_type p_type) +static int __get_age_segment_type(struct inode *inode, pgoff_t pgofs) { - if (p_type == DATA) { - struct inode *inode = page->mapping->host; + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct extent_info ei = {}; - if (S_ISDIR(inode->i_mode)) + if (f2fs_lookup_age_extent_cache(inode, pgofs, &ei)) { + if (!ei.age) + return NO_CHECK_TYPE; + if (ei.age <= sbi->hot_data_age_threshold) return CURSEG_HOT_DATA; - else if (is_cold_data(page) || file_is_cold(inode)) - return CURSEG_COLD_DATA; - else + if (ei.age <= sbi->warm_data_age_threshold) return CURSEG_WARM_DATA; + return CURSEG_COLD_DATA; + } + return NO_CHECK_TYPE; +} + +static int __get_segment_type_6(struct f2fs_io_info *fio) +{ + if (fio->type == DATA) { + struct inode *inode = fio_inode(fio); + int type; + + if (is_inode_flag_set(inode, FI_ALIGNED_WRITE)) + return CURSEG_COLD_DATA_PINNED; + + if (page_private_gcing(fio->page)) { + if (fio->sbi->am.atgc_enabled && + (fio->io_type == FS_DATA_IO) && + (fio->sbi->gc_mode != GC_URGENT_HIGH) && + __is_valid_data_blkaddr(fio->old_blkaddr) && + !is_inode_flag_set(inode, FI_OPU_WRITE)) + return CURSEG_ALL_DATA_ATGC; + else + return CURSEG_COLD_DATA; + } + if (file_is_cold(inode) || f2fs_need_compress_data(inode)) + return CURSEG_COLD_DATA; + + type = __get_age_segment_type(inode, fio->folio->index); + if (type != NO_CHECK_TYPE) + return type; + + if (file_is_hot(inode) || + is_inode_flag_set(inode, FI_HOT_DATA) || + f2fs_is_cow_file(inode) || + is_inode_flag_set(inode, FI_NEED_IPU)) + return CURSEG_HOT_DATA; + return f2fs_rw_hint_to_seg_type(F2FS_I_SB(inode), + inode->i_write_hint); } else { - if (IS_DNODE(page)) - return is_cold_node(page) ? 
CURSEG_WARM_NODE : + if (IS_DNODE(fio->folio)) + return is_cold_node(fio->folio) ? CURSEG_WARM_NODE : CURSEG_HOT_NODE; - else - return CURSEG_COLD_NODE; + return CURSEG_COLD_NODE; } } -static int __get_segment_type(struct page *page, enum page_type p_type) +enum temp_type f2fs_get_segment_temp(struct f2fs_sb_info *sbi, + enum log_type type) { - struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb); - switch (sbi->active_logs) { + struct curseg_info *curseg = CURSEG_I(sbi, type); + enum temp_type temp = COLD; + + switch (curseg->seg_type) { + case CURSEG_HOT_NODE: + case CURSEG_HOT_DATA: + temp = HOT; + break; + case CURSEG_WARM_NODE: + case CURSEG_WARM_DATA: + temp = WARM; + break; + case CURSEG_COLD_NODE: + case CURSEG_COLD_DATA: + temp = COLD; + break; + default: + f2fs_bug_on(sbi, 1); + } + + return temp; +} + +static int __get_segment_type(struct f2fs_io_info *fio) +{ + enum log_type type = CURSEG_HOT_DATA; + + switch (F2FS_OPTION(fio->sbi).active_logs) { case 2: - return __get_segment_type_2(page, p_type); + type = __get_segment_type_2(fio); + break; case 4: - return __get_segment_type_4(page, p_type); + type = __get_segment_type_4(fio); + break; + case 6: + type = __get_segment_type_6(fio); + break; + default: + f2fs_bug_on(fio->sbi, true); } - /* NR_CURSEG_TYPE(6) logs by default */ - BUG_ON(sbi->active_logs != NR_CURSEG_TYPE); - return __get_segment_type_6(page, p_type); + + fio->temp = f2fs_get_segment_temp(fio->sbi, type); + + return type; +} + +static void f2fs_randomize_chunk(struct f2fs_sb_info *sbi, + struct curseg_info *seg) +{ + /* To allocate block chunks in different sizes, use random number */ + if (--seg->fragment_remained_chunk > 0) + return; + + seg->fragment_remained_chunk = + get_random_u32_inclusive(1, sbi->max_fragment_chunk); + seg->next_blkoff += + get_random_u32_inclusive(1, sbi->max_fragment_hole); } -static void do_write_page(struct f2fs_sb_info *sbi, struct page *page, - block_t old_blkaddr, block_t *new_blkaddr, - struct 
f2fs_summary *sum, enum page_type p_type) +int f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct folio *folio, + block_t old_blkaddr, block_t *new_blkaddr, + struct f2fs_summary *sum, int type, + struct f2fs_io_info *fio) { struct sit_info *sit_i = SIT_I(sbi); - struct curseg_info *curseg; - unsigned int old_cursegno; - int type; + struct curseg_info *curseg = CURSEG_I(sbi, type); + unsigned long long old_mtime; + bool from_gc = (type == CURSEG_ALL_DATA_ATGC); + struct seg_entry *se = NULL; + bool segment_full = false; + int ret = 0; - type = __get_segment_type(page, p_type); - curseg = CURSEG_I(sbi, type); + f2fs_down_read(&SM_I(sbi)->curseg_lock); mutex_lock(&curseg->curseg_mutex); + down_write(&sit_i->sentry_lock); + if (curseg->segno == NULL_SEGNO) { + ret = -ENOSPC; + goto out_err; + } + + if (from_gc) { + f2fs_bug_on(sbi, GET_SEGNO(sbi, old_blkaddr) == NULL_SEGNO); + se = get_seg_entry(sbi, GET_SEGNO(sbi, old_blkaddr)); + sanity_check_seg_type(sbi, se->type); + f2fs_bug_on(sbi, IS_NODESEG(se->type)); + } *new_blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); - old_cursegno = curseg->segno; - /* - * __add_sum_entry should be resided under the curseg_mutex - * because, this function updates a summary entry in the - * current summary block. 
- */ - __add_sum_entry(sbi, type, sum); + f2fs_bug_on(sbi, curseg->next_blkoff >= BLKS_PER_SEG(sbi)); - mutex_lock(&sit_i->sentry_lock); - __refresh_next_blkoff(sbi, curseg); -#ifdef CONFIG_F2FS_STAT_FS - sbi->block_count[curseg->alloc_type]++; -#endif + f2fs_wait_discard_bio(sbi, *new_blkaddr); + + curseg->sum_blk->entries[curseg->next_blkoff] = *sum; + if (curseg->alloc_type == SSR) { + curseg->next_blkoff = f2fs_find_next_ssr_block(sbi, curseg); + } else { + curseg->next_blkoff++; + if (F2FS_OPTION(sbi).fs_mode == FS_MODE_FRAGMENT_BLK) + f2fs_randomize_chunk(sbi, curseg); + } + if (curseg->next_blkoff >= f2fs_usable_blks_in_seg(sbi, curseg->segno)) + segment_full = true; + stat_inc_block_count(sbi, curseg); + + if (from_gc) { + old_mtime = get_segment_mtime(sbi, old_blkaddr); + } else { + update_segment_mtime(sbi, old_blkaddr, 0); + old_mtime = 0; + } + update_segment_mtime(sbi, *new_blkaddr, old_mtime); /* * SIT information should be updated before segment allocation, * since SSR needs latest valid block information. */ - refresh_sit_entry(sbi, old_blkaddr, *new_blkaddr); + update_sit_entry(sbi, *new_blkaddr, 1); + update_sit_entry(sbi, old_blkaddr, -1); - if (!__has_curseg_space(sbi, type)) - sit_i->s_ops->allocate_segment(sbi, type, false); + /* + * If the current segment is full, flush it out and replace it with a + * new segment. 
+ */ + if (segment_full) { + if (type == CURSEG_COLD_DATA_PINNED && + !((curseg->segno + 1) % sbi->segs_per_sec)) { + write_sum_page(sbi, curseg->sum_blk, curseg->segno); + reset_curseg_fields(curseg); + goto skip_new_segment; + } - locate_dirty_segment(sbi, old_cursegno); + if (from_gc) { + ret = get_atssr_segment(sbi, type, se->type, + AT_SSR, se->mtime); + } else { + if (need_new_seg(sbi, type)) + ret = new_curseg(sbi, type, false); + else + ret = change_curseg(sbi, type); + stat_inc_seg_type(sbi, curseg); + } + + if (ret) + goto out_err; + } + +skip_new_segment: + /* + * segment dirty status should be updated after segment allocation, + * so we just need to update status only one time after previous + * segment being closed. + */ locate_dirty_segment(sbi, GET_SEGNO(sbi, old_blkaddr)); - mutex_unlock(&sit_i->sentry_lock); + locate_dirty_segment(sbi, GET_SEGNO(sbi, *new_blkaddr)); - if (p_type == NODE) - fill_node_footer_blkaddr(page, NEXT_FREE_BLKADDR(sbi, curseg)); + if (IS_DATASEG(curseg->seg_type)) { + unsigned long long new_val; - /* writeout dirty page into bdev */ - submit_write_page(sbi, page, *new_blkaddr, p_type); + new_val = atomic64_inc_return(&sbi->allocated_data_blocks); + if (unlikely(new_val == ULLONG_MAX)) + atomic64_set(&sbi->allocated_data_blocks, 0); + } + up_write(&sit_i->sentry_lock); + + if (folio && IS_NODESEG(curseg->seg_type)) { + fill_node_footer_blkaddr(folio, NEXT_FREE_BLKADDR(sbi, curseg)); + + f2fs_inode_chksum_set(sbi, folio); + } + + if (fio) { + struct f2fs_bio_info *io; + + INIT_LIST_HEAD(&fio->list); + fio->in_list = 1; + io = sbi->write_io[fio->type] + fio->temp; + spin_lock(&io->io_lock); + list_add_tail(&fio->list, &io->io_list); + spin_unlock(&io->io_lock); + } + + mutex_unlock(&curseg->curseg_mutex); + f2fs_up_read(&SM_I(sbi)->curseg_lock); + return 0; + +out_err: + *new_blkaddr = NULL_ADDR; + up_write(&sit_i->sentry_lock); mutex_unlock(&curseg->curseg_mutex); + f2fs_up_read(&SM_I(sbi)->curseg_lock); + return ret; } -void 
write_meta_page(struct f2fs_sb_info *sbi, struct page *page) +void f2fs_update_device_state(struct f2fs_sb_info *sbi, nid_t ino, + block_t blkaddr, unsigned int blkcnt) { - set_page_writeback(page); - submit_write_page(sbi, page, page->index, META); + if (!f2fs_is_multi_device(sbi)) + return; + + while (1) { + unsigned int devidx = f2fs_target_device_index(sbi, blkaddr); + unsigned int blks = FDEV(devidx).end_blk - blkaddr + 1; + + /* update device state for fsync */ + f2fs_set_dirty_device(sbi, ino, devidx, FLUSH_INO); + + /* update device state for checkpoint */ + if (!f2fs_test_bit(devidx, (char *)&sbi->dirty_device)) { + spin_lock(&sbi->dev_lock); + f2fs_set_bit(devidx, (char *)&sbi->dirty_device); + spin_unlock(&sbi->dev_lock); + } + + if (blkcnt <= blks) + break; + blkcnt -= blks; + blkaddr += blks; + } } -void write_node_page(struct f2fs_sb_info *sbi, struct page *page, - unsigned int nid, block_t old_blkaddr, block_t *new_blkaddr) +static int log_type_to_seg_type(enum log_type type) +{ + int seg_type = CURSEG_COLD_DATA; + + switch (type) { + case CURSEG_HOT_DATA: + case CURSEG_WARM_DATA: + case CURSEG_COLD_DATA: + case CURSEG_HOT_NODE: + case CURSEG_WARM_NODE: + case CURSEG_COLD_NODE: + seg_type = (int)type; + break; + case CURSEG_COLD_DATA_PINNED: + case CURSEG_ALL_DATA_ATGC: + seg_type = CURSEG_COLD_DATA; + break; + default: + break; + } + return seg_type; +} + +static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio) +{ + struct folio *folio = fio->folio; + enum log_type type = __get_segment_type(fio); + int seg_type = log_type_to_seg_type(type); + bool keep_order = (f2fs_lfs_mode(fio->sbi) && + seg_type == CURSEG_COLD_DATA); + int err; + + if (keep_order) + f2fs_down_read(&fio->sbi->io_order_lock); + + err = f2fs_allocate_data_block(fio->sbi, folio, fio->old_blkaddr, + &fio->new_blkaddr, sum, type, fio); + if (unlikely(err)) { + f2fs_err_ratelimited(fio->sbi, + "%s Failed to allocate data block, ino:%u, index:%lu, type:%d, 
old_blkaddr:0x%x, new_blkaddr:0x%x, err:%d", + __func__, fio->ino, folio->index, type, + fio->old_blkaddr, fio->new_blkaddr, err); + if (fscrypt_inode_uses_fs_layer_crypto(folio->mapping->host)) + fscrypt_finalize_bounce_page(&fio->encrypted_page); + folio_end_writeback(folio); + if (f2fs_in_warm_node_list(fio->sbi, folio)) + f2fs_del_fsync_node_entry(fio->sbi, folio); + f2fs_bug_on(fio->sbi, !is_set_ckpt_flags(fio->sbi, + CP_ERROR_FLAG)); + goto out; + } + + f2fs_bug_on(fio->sbi, !f2fs_is_valid_blkaddr_raw(fio->sbi, + fio->new_blkaddr, DATA_GENERIC_ENHANCE)); + + if (GET_SEGNO(fio->sbi, fio->old_blkaddr) != NULL_SEGNO) + f2fs_invalidate_internal_cache(fio->sbi, fio->old_blkaddr, 1); + + /* writeout dirty page into bdev */ + f2fs_submit_page_write(fio); + + f2fs_update_device_state(fio->sbi, fio->ino, fio->new_blkaddr, 1); +out: + if (keep_order) + f2fs_up_read(&fio->sbi->io_order_lock); +} + +void f2fs_do_write_meta_page(struct f2fs_sb_info *sbi, struct folio *folio, + enum iostat_type io_type) +{ + struct f2fs_io_info fio = { + .sbi = sbi, + .type = META, + .temp = HOT, + .op = REQ_OP_WRITE, + .op_flags = REQ_SYNC | REQ_META | REQ_PRIO, + .old_blkaddr = folio->index, + .new_blkaddr = folio->index, + .folio = folio, + .encrypted_page = NULL, + .in_list = 0, + }; + + if (unlikely(folio->index >= MAIN_BLKADDR(sbi))) + fio.op_flags &= ~REQ_META; + + folio_start_writeback(folio); + f2fs_submit_page_write(&fio); + + stat_inc_meta_count(sbi, folio->index); + f2fs_update_iostat(sbi, NULL, io_type, F2FS_BLKSIZE); +} + +void f2fs_do_write_node_page(unsigned int nid, struct f2fs_io_info *fio) { struct f2fs_summary sum; + set_summary(&sum, nid, 0, 0); - do_write_page(sbi, page, old_blkaddr, new_blkaddr, &sum, NODE); + do_write_page(&sum, fio); + + f2fs_update_iostat(fio->sbi, NULL, fio->io_type, F2FS_BLKSIZE); } -void write_data_page(struct inode *inode, struct page *page, - struct dnode_of_data *dn, block_t old_blkaddr, - block_t *new_blkaddr) +void 
f2fs_outplace_write_data(struct dnode_of_data *dn, + struct f2fs_io_info *fio) { - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + struct f2fs_sb_info *sbi = fio->sbi; struct f2fs_summary sum; - struct node_info ni; - BUG_ON(old_blkaddr == NULL_ADDR); - get_node_info(sbi, dn->nid, &ni); - set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version); + f2fs_bug_on(sbi, dn->data_blkaddr == NULL_ADDR); + if (fio->io_type == FS_DATA_IO || fio->io_type == FS_CP_DATA_IO) + f2fs_update_age_extent_cache(dn); + set_summary(&sum, dn->nid, dn->ofs_in_node, fio->version); + do_write_page(&sum, fio); + f2fs_update_data_blkaddr(dn, fio->new_blkaddr); + + f2fs_update_iostat(sbi, dn->inode, fio->io_type, F2FS_BLKSIZE); +} + +int f2fs_inplace_write_data(struct f2fs_io_info *fio) +{ + int err; + struct f2fs_sb_info *sbi = fio->sbi; + unsigned int segno; + + fio->new_blkaddr = fio->old_blkaddr; + /* i/o temperature is needed for passing down write hints */ + __get_segment_type(fio); + + segno = GET_SEGNO(sbi, fio->new_blkaddr); + + if (!IS_DATASEG(get_seg_entry(sbi, segno)->type)) { + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_warn(sbi, "%s: incorrect segment(%u) type, run fsck to fix.", + __func__, segno); + err = -EFSCORRUPTED; + f2fs_handle_error(sbi, ERROR_INCONSISTENT_SUM_TYPE); + goto drop_bio; + } + + if (f2fs_cp_error(sbi)) { + err = -EIO; + goto drop_bio; + } + + if (fio->meta_gc) + f2fs_truncate_meta_inode_pages(sbi, fio->new_blkaddr, 1); + + stat_inc_inplace_blocks(fio->sbi); + + if (fio->bio && !IS_F2FS_IPU_NOCACHE(sbi)) + err = f2fs_merge_page_bio(fio); + else + err = f2fs_submit_page_bio(fio); + if (!err) { + f2fs_update_device_state(fio->sbi, fio->ino, + fio->new_blkaddr, 1); + f2fs_update_iostat(fio->sbi, fio_inode(fio), + fio->io_type, F2FS_BLKSIZE); + } + + return err; +drop_bio: + if (fio->bio && *(fio->bio)) { + struct bio *bio = *(fio->bio); - do_write_page(sbi, page, old_blkaddr, - new_blkaddr, &sum, DATA); + bio->bi_status = BLK_STS_IOERR; + bio_endio(bio); + 
*(fio->bio) = NULL; + } + return err; } -void rewrite_data_page(struct f2fs_sb_info *sbi, struct page *page, - block_t old_blk_addr) +static inline int __f2fs_get_curseg(struct f2fs_sb_info *sbi, + unsigned int segno) { - submit_write_page(sbi, page, old_blk_addr, DATA); + int i; + + for (i = CURSEG_HOT_DATA; i < NO_CHECK_TYPE; i++) { + if (CURSEG_I(sbi, i)->segno == segno) + break; + } + return i; } -void recover_data_page(struct f2fs_sb_info *sbi, - struct page *page, struct f2fs_summary *sum, - block_t old_blkaddr, block_t new_blkaddr) +void f2fs_do_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, + block_t old_blkaddr, block_t new_blkaddr, + bool recover_curseg, bool recover_newaddr, + bool from_gc) { struct sit_info *sit_i = SIT_I(sbi); struct curseg_info *curseg; unsigned int segno, old_cursegno; struct seg_entry *se; int type; + unsigned short old_blkoff; + unsigned char old_alloc_type; segno = GET_SEGNO(sbi, new_blkaddr); se = get_seg_entry(sbi, segno); type = se->type; - if (se->valid_blocks == 0 && !IS_CURSEG(sbi, segno)) { - if (old_blkaddr == NULL_ADDR) - type = CURSEG_COLD_DATA; - else + f2fs_down_write(&SM_I(sbi)->curseg_lock); + + if (!recover_curseg) { + /* for recovery flow */ + if (se->valid_blocks == 0 && !is_curseg(sbi, segno)) { + if (old_blkaddr == NULL_ADDR) + type = CURSEG_COLD_DATA; + else + type = CURSEG_WARM_DATA; + } + } else { + if (is_curseg(sbi, segno)) { + /* se->type is volatile as SSR allocation */ + type = __f2fs_get_curseg(sbi, segno); + f2fs_bug_on(sbi, type == NO_CHECK_TYPE); + } else { type = CURSEG_WARM_DATA; + } } + curseg = CURSEG_I(sbi, type); + f2fs_bug_on(sbi, !IS_DATASEG(curseg->seg_type)); mutex_lock(&curseg->curseg_mutex); - mutex_lock(&sit_i->sentry_lock); + down_write(&sit_i->sentry_lock); old_cursegno = curseg->segno; + old_blkoff = curseg->next_blkoff; + old_alloc_type = curseg->alloc_type; /* change the current segment */ if (segno != curseg->segno) { curseg->next_segno = segno; - 
change_curseg(sbi, type, true); + if (change_curseg(sbi, type)) + goto out_unlock; } - curseg->next_blkoff = GET_SEGOFF_FROM_SEG0(sbi, new_blkaddr) & - (sbi->blocks_per_seg - 1); - __add_sum_entry(sbi, type, sum); + curseg->next_blkoff = GET_BLKOFF_FROM_SEG0(sbi, new_blkaddr); + curseg->sum_blk->entries[curseg->next_blkoff] = *sum; - refresh_sit_entry(sbi, old_blkaddr, new_blkaddr); + if (!recover_curseg || recover_newaddr) { + if (!from_gc) + update_segment_mtime(sbi, new_blkaddr, 0); + update_sit_entry(sbi, new_blkaddr, 1); + } + if (GET_SEGNO(sbi, old_blkaddr) != NULL_SEGNO) { + f2fs_invalidate_internal_cache(sbi, old_blkaddr, 1); + if (!from_gc) + update_segment_mtime(sbi, old_blkaddr, 0); + update_sit_entry(sbi, old_blkaddr, -1); + } - locate_dirty_segment(sbi, old_cursegno); locate_dirty_segment(sbi, GET_SEGNO(sbi, old_blkaddr)); + locate_dirty_segment(sbi, GET_SEGNO(sbi, new_blkaddr)); - mutex_unlock(&sit_i->sentry_lock); + locate_dirty_segment(sbi, old_cursegno); + + if (recover_curseg) { + if (old_cursegno != curseg->segno) { + curseg->next_segno = old_cursegno; + if (change_curseg(sbi, type)) + goto out_unlock; + } + curseg->next_blkoff = old_blkoff; + curseg->alloc_type = old_alloc_type; + } + +out_unlock: + up_write(&sit_i->sentry_lock); mutex_unlock(&curseg->curseg_mutex); + f2fs_up_write(&SM_I(sbi)->curseg_lock); } -void rewrite_node_page(struct f2fs_sb_info *sbi, - struct page *page, struct f2fs_summary *sum, - block_t old_blkaddr, block_t new_blkaddr) +void f2fs_replace_block(struct f2fs_sb_info *sbi, struct dnode_of_data *dn, + block_t old_addr, block_t new_addr, + unsigned char version, bool recover_curseg, + bool recover_newaddr) { - struct sit_info *sit_i = SIT_I(sbi); - int type = CURSEG_WARM_NODE; - struct curseg_info *curseg; - unsigned int segno, old_cursegno; - block_t next_blkaddr = next_blkaddr_of_node(page); - unsigned int next_segno = GET_SEGNO(sbi, next_blkaddr); + struct f2fs_summary sum; - curseg = CURSEG_I(sbi, type); + 
set_summary(&sum, dn->nid, dn->ofs_in_node, version); - mutex_lock(&curseg->curseg_mutex); - mutex_lock(&sit_i->sentry_lock); + f2fs_do_replace_block(sbi, &sum, old_addr, new_addr, + recover_curseg, recover_newaddr, false); - segno = GET_SEGNO(sbi, new_blkaddr); - old_cursegno = curseg->segno; + f2fs_update_data_blkaddr(dn, new_addr); +} - /* change the current segment */ - if (segno != curseg->segno) { - curseg->next_segno = segno; - change_curseg(sbi, type, true); +void f2fs_folio_wait_writeback(struct folio *folio, enum page_type type, + bool ordered, bool locked) +{ + if (folio_test_writeback(folio)) { + struct f2fs_sb_info *sbi = F2FS_F_SB(folio); + + /* submit cached LFS IO */ + f2fs_submit_merged_write_cond(sbi, NULL, folio, 0, type); + /* submit cached IPU IO */ + f2fs_submit_merged_ipu_write(sbi, NULL, folio); + if (ordered) { + folio_wait_writeback(folio); + f2fs_bug_on(sbi, locked && folio_test_writeback(folio)); + } else { + folio_wait_stable(folio); + } } - curseg->next_blkoff = GET_SEGOFF_FROM_SEG0(sbi, new_blkaddr) & - (sbi->blocks_per_seg - 1); - __add_sum_entry(sbi, type, sum); +} + +void f2fs_wait_on_block_writeback(struct inode *inode, block_t blkaddr) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct folio *cfolio; + + if (!f2fs_meta_inode_gc_required(inode)) + return; + + if (!__is_valid_data_blkaddr(blkaddr)) + return; - /* change the current log to the next block addr in advance */ - if (next_segno != segno) { - curseg->next_segno = next_segno; - change_curseg(sbi, type, true); + cfolio = filemap_lock_folio(META_MAPPING(sbi), blkaddr); + if (!IS_ERR(cfolio)) { + f2fs_folio_wait_writeback(cfolio, DATA, true, true); + f2fs_folio_put(cfolio, true); } - curseg->next_blkoff = GET_SEGOFF_FROM_SEG0(sbi, next_blkaddr) & - (sbi->blocks_per_seg - 1); +} - /* rewrite node page */ - set_page_writeback(page); - submit_write_page(sbi, page, new_blkaddr, NODE); - f2fs_submit_bio(sbi, NODE, true); - refresh_sit_entry(sbi, old_blkaddr, new_blkaddr); 
+void f2fs_wait_on_block_writeback_range(struct inode *inode, block_t blkaddr, + block_t len) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + block_t i; - locate_dirty_segment(sbi, old_cursegno); - locate_dirty_segment(sbi, GET_SEGNO(sbi, old_blkaddr)); + if (!f2fs_meta_inode_gc_required(inode)) + return; - mutex_unlock(&sit_i->sentry_lock); - mutex_unlock(&curseg->curseg_mutex); + for (i = 0; i < len; i++) + f2fs_wait_on_block_writeback(inode, blkaddr + i); + + f2fs_truncate_meta_inode_pages(sbi, blkaddr, len); } static int read_compacted_summaries(struct f2fs_sb_info *sbi) @@ -957,23 +4290,24 @@ static int read_compacted_summaries(struct f2fs_sb_info *sbi) struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); struct curseg_info *seg_i; unsigned char *kaddr; - struct page *page; + struct folio *folio; block_t start; int i, j, offset; start = start_sum_block(sbi); - page = get_meta_page(sbi, start++); - kaddr = (unsigned char *)page_address(page); + folio = f2fs_get_meta_folio(sbi, start++); + if (IS_ERR(folio)) + return PTR_ERR(folio); + kaddr = folio_address(folio); /* Step 1: restore nat cache */ seg_i = CURSEG_I(sbi, CURSEG_HOT_DATA); - memcpy(&seg_i->sum_blk->n_nats, kaddr, SUM_JOURNAL_SIZE); + memcpy(seg_i->journal, kaddr, SUM_JOURNAL_SIZE); /* Step 2: restore sit cache */ seg_i = CURSEG_I(sbi, CURSEG_COLD_DATA); - memcpy(&seg_i->sum_blk->n_sits, kaddr + SUM_JOURNAL_SIZE, - SUM_JOURNAL_SIZE); + memcpy(seg_i->journal, kaddr + SUM_JOURNAL_SIZE, SUM_JOURNAL_SIZE); offset = 2 * SUM_JOURNAL_SIZE; /* Step 3: restore summary entries */ @@ -990,26 +4324,28 @@ static int read_compacted_summaries(struct f2fs_sb_info *sbi) seg_i->next_blkoff = blk_off; if (seg_i->alloc_type == SSR) - blk_off = sbi->blocks_per_seg; + blk_off = BLKS_PER_SEG(sbi); for (j = 0; j < blk_off; j++) { struct f2fs_summary *s; + s = (struct f2fs_summary *)(kaddr + offset); seg_i->sum_blk->entries[j] = *s; offset += SUMMARY_SIZE; - if (offset + SUMMARY_SIZE <= PAGE_CACHE_SIZE - + if (offset + 
SUMMARY_SIZE <= PAGE_SIZE - SUM_FOOTER_SIZE) continue; - f2fs_put_page(page, 1); - page = NULL; + f2fs_folio_put(folio, true); - page = get_meta_page(sbi, start++); - kaddr = (unsigned char *)page_address(page); + folio = f2fs_get_meta_folio(sbi, start++); + if (IS_ERR(folio)) + return PTR_ERR(folio); + kaddr = folio_address(folio); offset = 0; } } - f2fs_put_page(page, 1); + f2fs_folio_put(folio, true); return 0; } @@ -1018,18 +4354,19 @@ static int read_normal_summaries(struct f2fs_sb_info *sbi, int type) struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); struct f2fs_summary_block *sum; struct curseg_info *curseg; - struct page *new; + struct folio *new; unsigned short blk_off; unsigned int segno = 0; block_t blk_addr = 0; + int err = 0; /* get segment number and block addr */ if (IS_DATASEG(type)) { segno = le32_to_cpu(ckpt->cur_data_segno[type]); blk_off = le16_to_cpu(ckpt->cur_data_blkoff[type - CURSEG_HOT_DATA]); - if (is_set_ckpt_flags(ckpt, CP_UMOUNT_FLAG)) - blk_addr = sum_blk_addr(sbi, NR_CURSEG_TYPE, type); + if (__exist_node_summaries(sbi)) + blk_addr = sum_blk_addr(sbi, NR_CURSEG_PERSIST_TYPE, type); else blk_addr = sum_blk_addr(sbi, NR_CURSEG_DATA_TYPE, type); } else { @@ -1037,307 +4374,436 @@ static int read_normal_summaries(struct f2fs_sb_info *sbi, int type) CURSEG_HOT_NODE]); blk_off = le16_to_cpu(ckpt->cur_node_blkoff[type - CURSEG_HOT_NODE]); - if (is_set_ckpt_flags(ckpt, CP_UMOUNT_FLAG)) + if (__exist_node_summaries(sbi)) blk_addr = sum_blk_addr(sbi, NR_CURSEG_NODE_TYPE, type - CURSEG_HOT_NODE); else blk_addr = GET_SUM_BLOCK(sbi, segno); } - new = get_meta_page(sbi, blk_addr); - sum = (struct f2fs_summary_block *)page_address(new); + new = f2fs_get_meta_folio(sbi, blk_addr); + if (IS_ERR(new)) + return PTR_ERR(new); + sum = folio_address(new); if (IS_NODESEG(type)) { - if (is_set_ckpt_flags(ckpt, CP_UMOUNT_FLAG)) { + if (__exist_node_summaries(sbi)) { struct f2fs_summary *ns = &sum->entries[0]; int i; - for (i = 0; i < sbi->blocks_per_seg; i++, 
ns++) { + + for (i = 0; i < BLKS_PER_SEG(sbi); i++, ns++) { ns->version = 0; ns->ofs_in_node = 0; } } else { - if (restore_node_summary(sbi, segno, sum)) { - f2fs_put_page(new, 1); - return -EINVAL; - } + err = f2fs_restore_node_summary(sbi, segno, sum); + if (err) + goto out; } } /* set uncompleted segment to curseg */ curseg = CURSEG_I(sbi, type); mutex_lock(&curseg->curseg_mutex); - memcpy(curseg->sum_blk, sum, PAGE_CACHE_SIZE); + + /* update journal info */ + down_write(&curseg->journal_rwsem); + memcpy(curseg->journal, &sum->journal, SUM_JOURNAL_SIZE); + up_write(&curseg->journal_rwsem); + + memcpy(curseg->sum_blk->entries, sum->entries, SUM_ENTRY_SIZE); + memcpy(&curseg->sum_blk->footer, &sum->footer, SUM_FOOTER_SIZE); curseg->next_segno = segno; reset_curseg(sbi, type, 0); curseg->alloc_type = ckpt->alloc_type[type]; curseg->next_blkoff = blk_off; mutex_unlock(&curseg->curseg_mutex); - f2fs_put_page(new, 1); - return 0; +out: + f2fs_folio_put(new, true); + return err; } static int restore_curseg_summaries(struct f2fs_sb_info *sbi) { + struct f2fs_journal *sit_j = CURSEG_I(sbi, CURSEG_COLD_DATA)->journal; + struct f2fs_journal *nat_j = CURSEG_I(sbi, CURSEG_HOT_DATA)->journal; int type = CURSEG_HOT_DATA; + int err; + + if (is_set_ckpt_flags(sbi, CP_COMPACT_SUM_FLAG)) { + int npages = f2fs_npages_for_summary_flush(sbi, true); + + if (npages >= 2) + f2fs_ra_meta_pages(sbi, start_sum_block(sbi), npages, + META_CP, true); - if (is_set_ckpt_flags(F2FS_CKPT(sbi), CP_COMPACT_SUM_FLAG)) { /* restore for compacted data summary */ - if (read_compacted_summaries(sbi)) - return -EINVAL; + err = read_compacted_summaries(sbi); + if (err) + return err; type = CURSEG_HOT_NODE; } - for (; type <= CURSEG_COLD_NODE; type++) - if (read_normal_summaries(sbi, type)) - return -EINVAL; + if (__exist_node_summaries(sbi)) + f2fs_ra_meta_pages(sbi, + sum_blk_addr(sbi, NR_CURSEG_PERSIST_TYPE, type), + NR_CURSEG_PERSIST_TYPE - type, META_CP, true); + + for (; type <= CURSEG_COLD_NODE; 
type++) { + err = read_normal_summaries(sbi, type); + if (err) + return err; + } + + /* sanity check for summary blocks */ + if (nats_in_cursum(nat_j) > NAT_JOURNAL_ENTRIES || + sits_in_cursum(sit_j) > SIT_JOURNAL_ENTRIES) { + f2fs_err(sbi, "invalid journal entries nats %u sits %u", + nats_in_cursum(nat_j), sits_in_cursum(sit_j)); + return -EINVAL; + } + return 0; } static void write_compacted_summaries(struct f2fs_sb_info *sbi, block_t blkaddr) { - struct page *page; + struct folio *folio; unsigned char *kaddr; struct f2fs_summary *summary; struct curseg_info *seg_i; int written_size = 0; int i, j; - page = grab_meta_page(sbi, blkaddr++); - kaddr = (unsigned char *)page_address(page); + folio = f2fs_grab_meta_folio(sbi, blkaddr++); + kaddr = folio_address(folio); + memset(kaddr, 0, PAGE_SIZE); /* Step 1: write nat cache */ seg_i = CURSEG_I(sbi, CURSEG_HOT_DATA); - memcpy(kaddr, &seg_i->sum_blk->n_nats, SUM_JOURNAL_SIZE); + memcpy(kaddr, seg_i->journal, SUM_JOURNAL_SIZE); written_size += SUM_JOURNAL_SIZE; /* Step 2: write sit cache */ seg_i = CURSEG_I(sbi, CURSEG_COLD_DATA); - memcpy(kaddr + written_size, &seg_i->sum_blk->n_sits, - SUM_JOURNAL_SIZE); + memcpy(kaddr + written_size, seg_i->journal, SUM_JOURNAL_SIZE); written_size += SUM_JOURNAL_SIZE; - set_page_dirty(page); - /* Step 3: write summary entries */ for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) { - unsigned short blkoff; seg_i = CURSEG_I(sbi, i); - if (sbi->ckpt->alloc_type[i] == SSR) - blkoff = sbi->blocks_per_seg; - else - blkoff = curseg_blkoff(sbi, i); - - for (j = 0; j < blkoff; j++) { - if (!page) { - page = grab_meta_page(sbi, blkaddr++); - kaddr = (unsigned char *)page_address(page); + for (j = 0; j < f2fs_curseg_valid_blocks(sbi, i); j++) { + if (!folio) { + folio = f2fs_grab_meta_folio(sbi, blkaddr++); + kaddr = folio_address(folio); + memset(kaddr, 0, PAGE_SIZE); written_size = 0; } summary = (struct f2fs_summary *)(kaddr + written_size); *summary = seg_i->sum_blk->entries[j]; 
written_size += SUMMARY_SIZE; - set_page_dirty(page); - if (written_size + SUMMARY_SIZE <= PAGE_CACHE_SIZE - + if (written_size + SUMMARY_SIZE <= PAGE_SIZE - SUM_FOOTER_SIZE) continue; - f2fs_put_page(page, 1); - page = NULL; + folio_mark_dirty(folio); + f2fs_folio_put(folio, true); + folio = NULL; } } - if (page) - f2fs_put_page(page, 1); + if (folio) { + folio_mark_dirty(folio); + f2fs_folio_put(folio, true); + } } static void write_normal_summaries(struct f2fs_sb_info *sbi, block_t blkaddr, int type) { int i, end; + if (IS_DATASEG(type)) end = type + NR_CURSEG_DATA_TYPE; else end = type + NR_CURSEG_NODE_TYPE; - for (i = type; i < end; i++) { - struct curseg_info *sum = CURSEG_I(sbi, i); - mutex_lock(&sum->curseg_mutex); - write_sum_page(sbi, sum->sum_blk, blkaddr + (i - type)); - mutex_unlock(&sum->curseg_mutex); - } + for (i = type; i < end; i++) + write_current_sum_page(sbi, i, blkaddr + (i - type)); } -void write_data_summaries(struct f2fs_sb_info *sbi, block_t start_blk) +void f2fs_write_data_summaries(struct f2fs_sb_info *sbi, block_t start_blk) { - if (is_set_ckpt_flags(F2FS_CKPT(sbi), CP_COMPACT_SUM_FLAG)) + if (is_set_ckpt_flags(sbi, CP_COMPACT_SUM_FLAG)) write_compacted_summaries(sbi, start_blk); else write_normal_summaries(sbi, start_blk, CURSEG_HOT_DATA); } -void write_node_summaries(struct f2fs_sb_info *sbi, block_t start_blk) +void f2fs_write_node_summaries(struct f2fs_sb_info *sbi, block_t start_blk) { - if (is_set_ckpt_flags(F2FS_CKPT(sbi), CP_UMOUNT_FLAG)) - write_normal_summaries(sbi, start_blk, CURSEG_HOT_NODE); - return; + write_normal_summaries(sbi, start_blk, CURSEG_HOT_NODE); } -int lookup_journal_in_cursum(struct f2fs_summary_block *sum, int type, +int f2fs_lookup_journal_in_cursum(struct f2fs_journal *journal, int type, unsigned int val, int alloc) { int i; if (type == NAT_JOURNAL) { - for (i = 0; i < nats_in_cursum(sum); i++) { - if (le32_to_cpu(nid_in_journal(sum, i)) == val) + for (i = 0; i < nats_in_cursum(journal); i++) { + if 
(le32_to_cpu(nid_in_journal(journal, i)) == val) return i; } - if (alloc && nats_in_cursum(sum) < NAT_JOURNAL_ENTRIES) - return update_nats_in_cursum(sum, 1); + if (alloc && __has_cursum_space(journal, 1, NAT_JOURNAL)) + return update_nats_in_cursum(journal, 1); } else if (type == SIT_JOURNAL) { - for (i = 0; i < sits_in_cursum(sum); i++) - if (le32_to_cpu(segno_in_journal(sum, i)) == val) + for (i = 0; i < sits_in_cursum(journal); i++) + if (le32_to_cpu(segno_in_journal(journal, i)) == val) return i; - if (alloc && sits_in_cursum(sum) < SIT_JOURNAL_ENTRIES) - return update_sits_in_cursum(sum, 1); + if (alloc && __has_cursum_space(journal, 1, SIT_JOURNAL)) + return update_sits_in_cursum(journal, 1); } return -1; } -static struct page *get_current_sit_page(struct f2fs_sb_info *sbi, +static struct folio *get_current_sit_folio(struct f2fs_sb_info *sbi, unsigned int segno) { - struct sit_info *sit_i = SIT_I(sbi); - unsigned int offset = SIT_BLOCK_OFFSET(sit_i, segno); - block_t blk_addr = sit_i->sit_base_addr + offset; - - check_seg_range(sbi, segno); - - /* calculate sit block address */ - if (f2fs_test_bit(offset, sit_i->sit_bitmap)) - blk_addr += sit_i->sit_blocks; - - return get_meta_page(sbi, blk_addr); + return f2fs_get_meta_folio(sbi, current_sit_addr(sbi, segno)); } -static struct page *get_next_sit_page(struct f2fs_sb_info *sbi, +static struct folio *get_next_sit_folio(struct f2fs_sb_info *sbi, unsigned int start) { struct sit_info *sit_i = SIT_I(sbi); - struct page *src_page, *dst_page; + struct folio *folio; pgoff_t src_off, dst_off; - void *src_addr, *dst_addr; src_off = current_sit_addr(sbi, start); dst_off = next_sit_addr(sbi, src_off); - /* get current sit block page without lock */ - src_page = get_meta_page(sbi, src_off); - dst_page = grab_meta_page(sbi, dst_off); - BUG_ON(PageDirty(src_page)); + folio = f2fs_grab_meta_folio(sbi, dst_off); + seg_info_to_sit_folio(sbi, folio, start); - src_addr = page_address(src_page); - dst_addr = 
page_address(dst_page); - memcpy(dst_addr, src_addr, PAGE_CACHE_SIZE); + folio_mark_dirty(folio); + set_to_next_sit(sit_i, start); - set_page_dirty(dst_page); - f2fs_put_page(src_page, 1); + return folio; +} - set_to_next_sit(sit_i, start); +static struct sit_entry_set *grab_sit_entry_set(void) +{ + struct sit_entry_set *ses = + f2fs_kmem_cache_alloc(sit_entry_set_slab, + GFP_NOFS, true, NULL); - return dst_page; + ses->entry_cnt = 0; + INIT_LIST_HEAD(&ses->set_list); + return ses; } -static bool flush_sits_in_journal(struct f2fs_sb_info *sbi) +static void release_sit_entry_set(struct sit_entry_set *ses) +{ + list_del(&ses->set_list); + kmem_cache_free(sit_entry_set_slab, ses); +} + +static void adjust_sit_entry_set(struct sit_entry_set *ses, + struct list_head *head) +{ + struct sit_entry_set *next = ses; + + if (list_is_last(&ses->set_list, head)) + return; + + list_for_each_entry_continue(next, head, set_list) + if (ses->entry_cnt <= next->entry_cnt) { + list_move_tail(&ses->set_list, &next->set_list); + return; + } + + list_move_tail(&ses->set_list, head); +} + +static void add_sit_entry(unsigned int segno, struct list_head *head) +{ + struct sit_entry_set *ses; + unsigned int start_segno = START_SEGNO(segno); + + list_for_each_entry(ses, head, set_list) { + if (ses->start_segno == start_segno) { + ses->entry_cnt++; + adjust_sit_entry_set(ses, head); + return; + } + } + + ses = grab_sit_entry_set(); + + ses->start_segno = start_segno; + ses->entry_cnt++; + list_add(&ses->set_list, head); +} + +static void add_sits_in_set(struct f2fs_sb_info *sbi) +{ + struct f2fs_sm_info *sm_info = SM_I(sbi); + struct list_head *set_list = &sm_info->sit_entry_set; + unsigned long *bitmap = SIT_I(sbi)->dirty_sentries_bitmap; + unsigned int segno; + + for_each_set_bit(segno, bitmap, MAIN_SEGS(sbi)) + add_sit_entry(segno, set_list); +} + +static void remove_sits_in_journal(struct f2fs_sb_info *sbi) { struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA); - struct 
f2fs_summary_block *sum = curseg->sum_blk; + struct f2fs_journal *journal = curseg->journal; int i; - /* - * If the journal area in the current summary is full of sit entries, - * all the sit entries will be flushed. Otherwise the sit entries - * are not able to replace with newly hot sit entries. - */ - if (sits_in_cursum(sum) >= SIT_JOURNAL_ENTRIES) { - for (i = sits_in_cursum(sum) - 1; i >= 0; i--) { - unsigned int segno; - segno = le32_to_cpu(segno_in_journal(sum, i)); - __mark_sit_entry_dirty(sbi, segno); - } - update_sits_in_cursum(sum, -sits_in_cursum(sum)); - return 1; + down_write(&curseg->journal_rwsem); + for (i = 0; i < sits_in_cursum(journal); i++) { + unsigned int segno; + bool dirtied; + + segno = le32_to_cpu(segno_in_journal(journal, i)); + dirtied = __mark_sit_entry_dirty(sbi, segno); + + if (!dirtied) + add_sit_entry(segno, &SM_I(sbi)->sit_entry_set); } - return 0; + update_sits_in_cursum(journal, -i); + up_write(&curseg->journal_rwsem); } /* * CP calls this function, which flushes SIT entries including sit_journal, * and moves prefree segs to free segs. 
*/ -void flush_sit_entries(struct f2fs_sb_info *sbi) +void f2fs_flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) { struct sit_info *sit_i = SIT_I(sbi); unsigned long *bitmap = sit_i->dirty_sentries_bitmap; struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA); - struct f2fs_summary_block *sum = curseg->sum_blk; - unsigned long nsegs = TOTAL_SEGS(sbi); - struct page *page = NULL; - struct f2fs_sit_block *raw_sit = NULL; - unsigned int start = 0, end = 0; - unsigned int segno = -1; - bool flushed; + struct f2fs_journal *journal = curseg->journal; + struct sit_entry_set *ses, *tmp; + struct list_head *head = &SM_I(sbi)->sit_entry_set; + bool to_journal = !is_sbi_flag_set(sbi, SBI_IS_RESIZEFS); + struct seg_entry *se; - mutex_lock(&curseg->curseg_mutex); - mutex_lock(&sit_i->sentry_lock); + down_write(&sit_i->sentry_lock); + + if (!sit_i->dirty_sentries) + goto out; /* - * "flushed" indicates whether sit entries in journal are flushed - * to the SIT area or not. + * add and account sit entries of dirty bitmap in sit entry + * set temporarily */ - flushed = flush_sits_in_journal(sbi); + add_sits_in_set(sbi); - while ((segno = find_next_bit(bitmap, nsegs, segno + 1)) < nsegs) { - struct seg_entry *se = get_seg_entry(sbi, segno); - int sit_offset, offset; + /* + * if there are no enough space in journal to store dirty sit + * entries, remove all entries from journal and add and account + * them in sit entry set. + */ + if (!__has_cursum_space(journal, sit_i->dirty_sentries, SIT_JOURNAL) || + !to_journal) + remove_sits_in_journal(sbi); - sit_offset = SIT_ENTRY_OFFSET(sit_i, segno); + /* + * there are two steps to flush sit entries: + * #1, flush sit entries to journal in current cold data summary block. + * #2, flush sit entries to sit page. 
+ */ + list_for_each_entry_safe(ses, tmp, head, set_list) { + struct folio *folio = NULL; + struct f2fs_sit_block *raw_sit = NULL; + unsigned int start_segno = ses->start_segno; + unsigned int end = min(start_segno + SIT_ENTRY_PER_BLOCK, + (unsigned long)MAIN_SEGS(sbi)); + unsigned int segno = start_segno; + + if (to_journal && + !__has_cursum_space(journal, ses->entry_cnt, SIT_JOURNAL)) + to_journal = false; + + if (to_journal) { + down_write(&curseg->journal_rwsem); + } else { + folio = get_next_sit_folio(sbi, start_segno); + raw_sit = folio_address(folio); + } - if (flushed) - goto to_sit_page; + /* flush dirty sit entries in region of current sit set */ + for_each_set_bit_from(segno, bitmap, end) { + int offset, sit_offset; - offset = lookup_journal_in_cursum(sum, SIT_JOURNAL, segno, 1); - if (offset >= 0) { - segno_in_journal(sum, offset) = cpu_to_le32(segno); - seg_info_to_raw_sit(se, &sit_in_journal(sum, offset)); - goto flush_done; - } -to_sit_page: - if (!page || (start > segno) || (segno > end)) { - if (page) { - f2fs_put_page(page, 1); - page = NULL; + se = get_seg_entry(sbi, segno); +#ifdef CONFIG_F2FS_CHECK_FS + if (memcmp(se->cur_valid_map, se->cur_valid_map_mir, + SIT_VBLOCK_MAP_SIZE)) + f2fs_bug_on(sbi, 1); +#endif + + /* add discard candidates */ + if (!(cpc->reason & CP_DISCARD)) { + cpc->trim_start = segno; + add_discard_addrs(sbi, cpc, false); } - start = START_SEGNO(sit_i, segno); - end = start + SIT_ENTRY_PER_BLOCK - 1; + if (to_journal) { + offset = f2fs_lookup_journal_in_cursum(journal, + SIT_JOURNAL, segno, 1); + f2fs_bug_on(sbi, offset < 0); + segno_in_journal(journal, offset) = + cpu_to_le32(segno); + seg_info_to_raw_sit(se, + &sit_in_journal(journal, offset)); + check_block_count(sbi, segno, + &sit_in_journal(journal, offset)); + } else { + sit_offset = SIT_ENTRY_OFFSET(sit_i, segno); + seg_info_to_raw_sit(se, + &raw_sit->entries[sit_offset]); + check_block_count(sbi, segno, + &raw_sit->entries[sit_offset]); + } + + /* update 
ckpt_valid_block */ + if (__is_large_section(sbi)) { + set_ckpt_valid_blocks(sbi, segno); + sanity_check_valid_blocks(sbi, segno); + } - /* read sit block that will be updated */ - page = get_next_sit_page(sbi, start); - raw_sit = page_address(page); + __clear_bit(segno, bitmap); + sit_i->dirty_sentries--; + ses->entry_cnt--; } - /* udpate entry in SIT block */ - seg_info_to_raw_sit(se, &raw_sit->entries[sit_offset]); -flush_done: - __clear_bit(segno, bitmap); - sit_i->dirty_sentries--; + if (to_journal) + up_write(&curseg->journal_rwsem); + else + f2fs_folio_put(folio, true); + + f2fs_bug_on(sbi, ses->entry_cnt); + release_sit_entry_set(ses); } - mutex_unlock(&sit_i->sentry_lock); - mutex_unlock(&curseg->curseg_mutex); - /* writeout last modified SIT block */ - f2fs_put_page(page, 1); + f2fs_bug_on(sbi, !list_empty(head)); + f2fs_bug_on(sbi, sit_i->dirty_sentries); +out: + if (cpc->reason & CP_DISCARD) { + __u64 trim_start = cpc->trim_start; + + for (; cpc->trim_start <= cpc->trim_end; cpc->trim_start++) + add_discard_addrs(sbi, cpc, false); + + cpc->trim_start = trim_start; + } + up_write(&sit_i->sentry_lock); set_prefree_as_free_segments(sbi); } @@ -1345,41 +4811,70 @@ flush_done: static int build_sit_info(struct f2fs_sb_info *sbi) { struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); - struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); struct sit_info *sit_i; unsigned int sit_segs, start; - char *src_bitmap, *dst_bitmap; - unsigned int bitmap_size; + char *src_bitmap, *bitmap; + unsigned int bitmap_size, main_bitmap_size, sit_bitmap_size; + unsigned int discard_map = f2fs_block_unit_discard(sbi) ? 
1 : 0; /* allocate memory for SIT information */ - sit_i = kzalloc(sizeof(struct sit_info), GFP_KERNEL); + sit_i = f2fs_kzalloc(sbi, sizeof(struct sit_info), GFP_KERNEL); if (!sit_i) return -ENOMEM; SM_I(sbi)->sit_info = sit_i; - sit_i->sentries = vzalloc(TOTAL_SEGS(sbi) * sizeof(struct seg_entry)); + sit_i->sentries = + f2fs_kvzalloc(sbi, array_size(sizeof(struct seg_entry), + MAIN_SEGS(sbi)), + GFP_KERNEL); if (!sit_i->sentries) return -ENOMEM; - bitmap_size = f2fs_bitmap_size(TOTAL_SEGS(sbi)); - sit_i->dirty_sentries_bitmap = kzalloc(bitmap_size, GFP_KERNEL); + main_bitmap_size = f2fs_bitmap_size(MAIN_SEGS(sbi)); + sit_i->dirty_sentries_bitmap = f2fs_kvzalloc(sbi, main_bitmap_size, + GFP_KERNEL); if (!sit_i->dirty_sentries_bitmap) return -ENOMEM; - for (start = 0; start < TOTAL_SEGS(sbi); start++) { - sit_i->sentries[start].cur_valid_map - = kzalloc(SIT_VBLOCK_MAP_SIZE, GFP_KERNEL); - sit_i->sentries[start].ckpt_valid_map - = kzalloc(SIT_VBLOCK_MAP_SIZE, GFP_KERNEL); - if (!sit_i->sentries[start].cur_valid_map - || !sit_i->sentries[start].ckpt_valid_map) - return -ENOMEM; +#ifdef CONFIG_F2FS_CHECK_FS + bitmap_size = MAIN_SEGS(sbi) * SIT_VBLOCK_MAP_SIZE * (3 + discard_map); +#else + bitmap_size = MAIN_SEGS(sbi) * SIT_VBLOCK_MAP_SIZE * (2 + discard_map); +#endif + sit_i->bitmap = f2fs_kvzalloc(sbi, bitmap_size, GFP_KERNEL); + if (!sit_i->bitmap) + return -ENOMEM; + + bitmap = sit_i->bitmap; + + for (start = 0; start < MAIN_SEGS(sbi); start++) { + sit_i->sentries[start].cur_valid_map = bitmap; + bitmap += SIT_VBLOCK_MAP_SIZE; + + sit_i->sentries[start].ckpt_valid_map = bitmap; + bitmap += SIT_VBLOCK_MAP_SIZE; + +#ifdef CONFIG_F2FS_CHECK_FS + sit_i->sentries[start].cur_valid_map_mir = bitmap; + bitmap += SIT_VBLOCK_MAP_SIZE; +#endif + + if (discard_map) { + sit_i->sentries[start].discard_map = bitmap; + bitmap += SIT_VBLOCK_MAP_SIZE; + } } - if (sbi->segs_per_sec > 1) { - sit_i->sec_entries = vzalloc(TOTAL_SECS(sbi) * - sizeof(struct sec_entry)); + sit_i->tmp_map = 
f2fs_kzalloc(sbi, SIT_VBLOCK_MAP_SIZE, GFP_KERNEL); + if (!sit_i->tmp_map) + return -ENOMEM; + + if (__is_large_section(sbi)) { + sit_i->sec_entries = + f2fs_kvzalloc(sbi, array_size(sizeof(struct sec_entry), + MAIN_SECS(sbi)), + GFP_KERNEL); if (!sit_i->sec_entries) return -ENOMEM; } @@ -1388,49 +4883,56 @@ static int build_sit_info(struct f2fs_sb_info *sbi) sit_segs = le32_to_cpu(raw_super->segment_count_sit) >> 1; /* setup SIT bitmap from ckeckpoint pack */ - bitmap_size = __bitmap_size(sbi, SIT_BITMAP); + sit_bitmap_size = __bitmap_size(sbi, SIT_BITMAP); src_bitmap = __bitmap_ptr(sbi, SIT_BITMAP); - dst_bitmap = kmemdup(src_bitmap, bitmap_size, GFP_KERNEL); - if (!dst_bitmap) + sit_i->sit_bitmap = kmemdup(src_bitmap, sit_bitmap_size, GFP_KERNEL); + if (!sit_i->sit_bitmap) + return -ENOMEM; + +#ifdef CONFIG_F2FS_CHECK_FS + sit_i->sit_bitmap_mir = kmemdup(src_bitmap, + sit_bitmap_size, GFP_KERNEL); + if (!sit_i->sit_bitmap_mir) return -ENOMEM; - /* init SIT information */ - sit_i->s_ops = &default_salloc_ops; + sit_i->invalid_segmap = f2fs_kvzalloc(sbi, + main_bitmap_size, GFP_KERNEL); + if (!sit_i->invalid_segmap) + return -ENOMEM; +#endif sit_i->sit_base_addr = le32_to_cpu(raw_super->sit_blkaddr); - sit_i->sit_blocks = sit_segs << sbi->log_blocks_per_seg; - sit_i->written_valid_blocks = le64_to_cpu(ckpt->valid_block_count); - sit_i->sit_bitmap = dst_bitmap; - sit_i->bitmap_size = bitmap_size; + sit_i->sit_blocks = SEGS_TO_BLKS(sbi, sit_segs); + sit_i->written_valid_blocks = 0; + sit_i->bitmap_size = sit_bitmap_size; sit_i->dirty_sentries = 0; sit_i->sents_per_block = SIT_ENTRY_PER_BLOCK; sit_i->elapsed_time = le64_to_cpu(sbi->ckpt->elapsed_time); - sit_i->mounted_time = CURRENT_TIME_SEC.tv_sec; - mutex_init(&sit_i->sentry_lock); + sit_i->mounted_time = ktime_get_boottime_seconds(); + init_rwsem(&sit_i->sentry_lock); return 0; } static int build_free_segmap(struct f2fs_sb_info *sbi) { - struct f2fs_sm_info *sm_info = SM_I(sbi); struct free_segmap_info *free_i; 
unsigned int bitmap_size, sec_bitmap_size; /* allocate memory for free segmap information */ - free_i = kzalloc(sizeof(struct free_segmap_info), GFP_KERNEL); + free_i = f2fs_kzalloc(sbi, sizeof(struct free_segmap_info), GFP_KERNEL); if (!free_i) return -ENOMEM; SM_I(sbi)->free_info = free_i; - bitmap_size = f2fs_bitmap_size(TOTAL_SEGS(sbi)); - free_i->free_segmap = kmalloc(bitmap_size, GFP_KERNEL); + bitmap_size = f2fs_bitmap_size(MAIN_SEGS(sbi)); + free_i->free_segmap = f2fs_kvmalloc(sbi, bitmap_size, GFP_KERNEL); if (!free_i->free_segmap) return -ENOMEM; - sec_bitmap_size = f2fs_bitmap_size(TOTAL_SECS(sbi)); - free_i->free_secmap = kmalloc(sec_bitmap_size, GFP_KERNEL); + sec_bitmap_size = f2fs_bitmap_size(MAIN_SECS(sbi)); + free_i->free_secmap = f2fs_kvmalloc(sbi, sec_bitmap_size, GFP_KERNEL); if (!free_i->free_secmap) return -ENOMEM; @@ -1439,11 +4941,10 @@ static int build_free_segmap(struct f2fs_sb_info *sbi) memset(free_i->free_secmap, 0xff, sec_bitmap_size); /* init free segmap information */ - free_i->start_segno = - (unsigned int) GET_SEGNO_FROM_SEG0(sbi, sm_info->main_blkaddr); + free_i->start_segno = GET_SEGNO_FROM_SEG0(sbi, MAIN_BLKADDR(sbi)); free_i->free_segments = 0; free_i->free_sections = 0; - rwlock_init(&free_i->segmap_lock); + spin_lock_init(&free_i->segmap_lock); return 0; } @@ -1452,74 +4953,205 @@ static int build_curseg(struct f2fs_sb_info *sbi) struct curseg_info *array; int i; - array = kzalloc(sizeof(*array) * NR_CURSEG_TYPE, GFP_KERNEL); + array = f2fs_kzalloc(sbi, array_size(NR_CURSEG_TYPE, + sizeof(*array)), GFP_KERNEL); if (!array) return -ENOMEM; SM_I(sbi)->curseg_array = array; - for (i = 0; i < NR_CURSEG_TYPE; i++) { + for (i = 0; i < NO_CHECK_TYPE; i++) { mutex_init(&array[i].curseg_mutex); - array[i].sum_blk = kzalloc(PAGE_CACHE_SIZE, GFP_KERNEL); + array[i].sum_blk = f2fs_kzalloc(sbi, PAGE_SIZE, GFP_KERNEL); if (!array[i].sum_blk) return -ENOMEM; - array[i].segno = NULL_SEGNO; - array[i].next_blkoff = 0; + 
init_rwsem(&array[i].journal_rwsem); + array[i].journal = f2fs_kzalloc(sbi, + sizeof(struct f2fs_journal), GFP_KERNEL); + if (!array[i].journal) + return -ENOMEM; + array[i].seg_type = log_type_to_seg_type(i); + reset_curseg_fields(&array[i]); } return restore_curseg_summaries(sbi); } -static void build_sit_entries(struct f2fs_sb_info *sbi) +static int build_sit_entries(struct f2fs_sb_info *sbi) { struct sit_info *sit_i = SIT_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA); - struct f2fs_summary_block *sum = curseg->sum_blk; - unsigned int start; + struct f2fs_journal *journal = curseg->journal; + struct seg_entry *se; + struct f2fs_sit_entry sit; + int sit_blk_cnt = SIT_BLK_CNT(sbi); + unsigned int i, start, end; + unsigned int readed, start_blk = 0; + int err = 0; + block_t sit_valid_blocks[2] = {0, 0}; - for (start = 0; start < TOTAL_SEGS(sbi); start++) { - struct seg_entry *se = &sit_i->sentries[start]; - struct f2fs_sit_block *sit_blk; - struct f2fs_sit_entry sit; - struct page *page; - int i; + do { + readed = f2fs_ra_meta_pages(sbi, start_blk, BIO_MAX_VECS, + META_SIT, true); + + start = start_blk * sit_i->sents_per_block; + end = (start_blk + readed) * sit_i->sents_per_block; + + for (; start < end && start < MAIN_SEGS(sbi); start++) { + struct f2fs_sit_block *sit_blk; + struct folio *folio; + + se = &sit_i->sentries[start]; + folio = get_current_sit_folio(sbi, start); + if (IS_ERR(folio)) + return PTR_ERR(folio); + sit_blk = folio_address(folio); + sit = sit_blk->entries[SIT_ENTRY_OFFSET(sit_i, start)]; + f2fs_folio_put(folio, true); + + err = check_block_count(sbi, start, &sit); + if (err) + return err; + seg_info_from_raw_sit(se, &sit); + + if (se->type >= NR_PERSISTENT_LOG) { + f2fs_err(sbi, "Invalid segment type: %u, segno: %u", + se->type, start); + f2fs_handle_error(sbi, + ERROR_INCONSISTENT_SUM_TYPE); + return -EFSCORRUPTED; + } + + sit_valid_blocks[SE_PAGETYPE(se)] += se->valid_blocks; - mutex_lock(&curseg->curseg_mutex); - for 
(i = 0; i < sits_in_cursum(sum); i++) { - if (le32_to_cpu(segno_in_journal(sum, i)) == start) { - sit = sit_in_journal(sum, i); - mutex_unlock(&curseg->curseg_mutex); - goto got_it; + if (!f2fs_block_unit_discard(sbi)) + goto init_discard_map_done; + + /* build discard map only one time */ + if (is_set_ckpt_flags(sbi, CP_TRIMMED_FLAG)) { + memset(se->discard_map, 0xff, + SIT_VBLOCK_MAP_SIZE); + goto init_discard_map_done; } + memcpy(se->discard_map, se->cur_valid_map, + SIT_VBLOCK_MAP_SIZE); + sbi->discard_blks += BLKS_PER_SEG(sbi) - + se->valid_blocks; +init_discard_map_done: + if (__is_large_section(sbi)) + get_sec_entry(sbi, start)->valid_blocks += + se->valid_blocks; } - mutex_unlock(&curseg->curseg_mutex); - page = get_current_sit_page(sbi, start); - sit_blk = (struct f2fs_sit_block *)page_address(page); - sit = sit_blk->entries[SIT_ENTRY_OFFSET(sit_i, start)]; - f2fs_put_page(page, 1); -got_it: - check_block_count(sbi, start, &sit); + start_blk += readed; + } while (start_blk < sit_blk_cnt); + + down_read(&curseg->journal_rwsem); + for (i = 0; i < sits_in_cursum(journal); i++) { + unsigned int old_valid_blocks; + + start = le32_to_cpu(segno_in_journal(journal, i)); + if (start >= MAIN_SEGS(sbi)) { + f2fs_err(sbi, "Wrong journal entry on segno %u", + start); + err = -EFSCORRUPTED; + f2fs_handle_error(sbi, ERROR_CORRUPTED_JOURNAL); + break; + } + + se = &sit_i->sentries[start]; + sit = sit_in_journal(journal, i); + + old_valid_blocks = se->valid_blocks; + + sit_valid_blocks[SE_PAGETYPE(se)] -= old_valid_blocks; + + err = check_block_count(sbi, start, &sit); + if (err) + break; seg_info_from_raw_sit(se, &sit); - if (sbi->segs_per_sec > 1) { - struct sec_entry *e = get_sec_entry(sbi, start); - e->valid_blocks += se->valid_blocks; + + if (se->type >= NR_PERSISTENT_LOG) { + f2fs_err(sbi, "Invalid segment type: %u, segno: %u", + se->type, start); + err = -EFSCORRUPTED; + f2fs_handle_error(sbi, ERROR_INCONSISTENT_SUM_TYPE); + break; + } + + 
sit_valid_blocks[SE_PAGETYPE(se)] += se->valid_blocks; + + if (f2fs_block_unit_discard(sbi)) { + if (is_set_ckpt_flags(sbi, CP_TRIMMED_FLAG)) { + memset(se->discard_map, 0xff, SIT_VBLOCK_MAP_SIZE); + } else { + memcpy(se->discard_map, se->cur_valid_map, + SIT_VBLOCK_MAP_SIZE); + sbi->discard_blks += old_valid_blocks; + sbi->discard_blks -= se->valid_blocks; + } + } + + if (__is_large_section(sbi)) { + get_sec_entry(sbi, start)->valid_blocks += + se->valid_blocks; + get_sec_entry(sbi, start)->valid_blocks -= + old_valid_blocks; + } + } + up_read(&curseg->journal_rwsem); + + /* update ckpt_valid_block */ + if (__is_large_section(sbi)) { + unsigned int segno; + + for (segno = 0; segno < MAIN_SEGS(sbi); segno += SEGS_PER_SEC(sbi)) { + set_ckpt_valid_blocks(sbi, segno); + sanity_check_valid_blocks(sbi, segno); } } + + if (err) + return err; + + if (sit_valid_blocks[NODE] != valid_node_count(sbi)) { + f2fs_err(sbi, "SIT is corrupted node# %u vs %u", + sit_valid_blocks[NODE], valid_node_count(sbi)); + f2fs_handle_error(sbi, ERROR_INCONSISTENT_NODE_COUNT); + return -EFSCORRUPTED; + } + + if (sit_valid_blocks[DATA] + sit_valid_blocks[NODE] > + valid_user_blocks(sbi)) { + f2fs_err(sbi, "SIT is corrupted data# %u %u vs %u", + sit_valid_blocks[DATA], sit_valid_blocks[NODE], + valid_user_blocks(sbi)); + f2fs_handle_error(sbi, ERROR_INCONSISTENT_BLOCK_COUNT); + return -EFSCORRUPTED; + } + + return 0; } static void init_free_segmap(struct f2fs_sb_info *sbi) { unsigned int start; int type; + struct seg_entry *sentry; - for (start = 0; start < TOTAL_SEGS(sbi); start++) { - struct seg_entry *sentry = get_seg_entry(sbi, start); + for (start = 0; start < MAIN_SEGS(sbi); start++) { + if (f2fs_usable_blks_in_seg(sbi, start) == 0) + continue; + sentry = get_seg_entry(sbi, start); if (!sentry->valid_blocks) __set_free(sbi, start); + else + SIT_I(sbi)->written_valid_blocks += + sentry->valid_blocks; } /* set use the current segments */ for (type = CURSEG_HOT_DATA; type <= CURSEG_COLD_NODE; 
type++) { struct curseg_info *curseg_t = CURSEG_I(sbi, type); + __set_test_and_inuse(sbi, curseg_t->segno); } } @@ -1528,32 +5160,60 @@ static void init_dirty_segmap(struct f2fs_sb_info *sbi) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); struct free_segmap_info *free_i = FREE_I(sbi); - unsigned int segno = 0, offset = 0, total_segs = TOTAL_SEGS(sbi); - unsigned short valid_blocks; + unsigned int segno = 0, offset = 0, secno; + block_t valid_blocks, usable_blks_in_seg; while (1) { /* find dirty segment based on free segmap */ - segno = find_next_inuse(free_i, total_segs, offset); - if (segno >= total_segs) + segno = find_next_inuse(free_i, MAIN_SEGS(sbi), offset); + if (segno >= MAIN_SEGS(sbi)) break; offset = segno + 1; - valid_blocks = get_valid_blocks(sbi, segno, 0); - if (valid_blocks >= sbi->blocks_per_seg || !valid_blocks) + valid_blocks = get_valid_blocks(sbi, segno, false); + usable_blks_in_seg = f2fs_usable_blks_in_seg(sbi, segno); + if (valid_blocks == usable_blks_in_seg || !valid_blocks) + continue; + if (valid_blocks > usable_blks_in_seg) { + f2fs_bug_on(sbi, 1); continue; + } mutex_lock(&dirty_i->seglist_lock); __locate_dirty_segment(sbi, segno, DIRTY); mutex_unlock(&dirty_i->seglist_lock); } + + if (!__is_large_section(sbi)) + return; + + mutex_lock(&dirty_i->seglist_lock); + for (segno = 0; segno < MAIN_SEGS(sbi); segno += SEGS_PER_SEC(sbi)) { + valid_blocks = get_valid_blocks(sbi, segno, true); + secno = GET_SEC_FROM_SEG(sbi, segno); + + if (!valid_blocks || valid_blocks == CAP_BLKS_PER_SEC(sbi)) + continue; + if (is_cursec(sbi, secno)) + continue; + set_bit(secno, dirty_i->dirty_secmap); + } + mutex_unlock(&dirty_i->seglist_lock); } static int init_victim_secmap(struct f2fs_sb_info *sbi) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); - unsigned int bitmap_size = f2fs_bitmap_size(TOTAL_SECS(sbi)); + unsigned int bitmap_size = f2fs_bitmap_size(MAIN_SECS(sbi)); - dirty_i->victim_secmap = kzalloc(bitmap_size, GFP_KERNEL); + 
dirty_i->victim_secmap = f2fs_kvzalloc(sbi, bitmap_size, GFP_KERNEL); if (!dirty_i->victim_secmap) return -ENOMEM; + + dirty_i->pinned_secmap = f2fs_kvzalloc(sbi, bitmap_size, GFP_KERNEL); + if (!dirty_i->pinned_secmap) + return -ENOMEM; + + dirty_i->pinned_secmap_cnt = 0; + dirty_i->enable_pin_section = true; return 0; } @@ -1563,25 +5223,455 @@ static int build_dirty_segmap(struct f2fs_sb_info *sbi) unsigned int bitmap_size, i; /* allocate memory for dirty segments list information */ - dirty_i = kzalloc(sizeof(struct dirty_seglist_info), GFP_KERNEL); + dirty_i = f2fs_kzalloc(sbi, sizeof(struct dirty_seglist_info), + GFP_KERNEL); if (!dirty_i) return -ENOMEM; SM_I(sbi)->dirty_info = dirty_i; mutex_init(&dirty_i->seglist_lock); - bitmap_size = f2fs_bitmap_size(TOTAL_SEGS(sbi)); + bitmap_size = f2fs_bitmap_size(MAIN_SEGS(sbi)); for (i = 0; i < NR_DIRTY_TYPE; i++) { - dirty_i->dirty_segmap[i] = kzalloc(bitmap_size, GFP_KERNEL); + dirty_i->dirty_segmap[i] = f2fs_kvzalloc(sbi, bitmap_size, + GFP_KERNEL); if (!dirty_i->dirty_segmap[i]) return -ENOMEM; } + if (__is_large_section(sbi)) { + bitmap_size = f2fs_bitmap_size(MAIN_SECS(sbi)); + dirty_i->dirty_secmap = f2fs_kvzalloc(sbi, + bitmap_size, GFP_KERNEL); + if (!dirty_i->dirty_secmap) + return -ENOMEM; + } + init_dirty_segmap(sbi); return init_victim_secmap(sbi); } +static int sanity_check_curseg(struct f2fs_sb_info *sbi) +{ + int i; + + /* + * In LFS/SSR curseg, .next_blkoff should point to an unused blkaddr; + * In LFS curseg, all blkaddr after .next_blkoff should be unused. 
+ */ + for (i = 0; i < NR_PERSISTENT_LOG; i++) { + struct curseg_info *curseg = CURSEG_I(sbi, i); + struct seg_entry *se = get_seg_entry(sbi, curseg->segno); + unsigned int blkofs = curseg->next_blkoff; + + if (f2fs_sb_has_readonly(sbi) && + i != CURSEG_HOT_DATA && i != CURSEG_HOT_NODE) + continue; + + sanity_check_seg_type(sbi, curseg->seg_type); + + if (curseg->alloc_type != LFS && curseg->alloc_type != SSR) { + f2fs_err(sbi, + "Current segment has invalid alloc_type:%d", + curseg->alloc_type); + f2fs_handle_error(sbi, ERROR_INVALID_CURSEG); + return -EFSCORRUPTED; + } + + if (f2fs_test_bit(blkofs, se->cur_valid_map)) + goto out; + + if (curseg->alloc_type == SSR) + continue; + + for (blkofs += 1; blkofs < BLKS_PER_SEG(sbi); blkofs++) { + if (!f2fs_test_bit(blkofs, se->cur_valid_map)) + continue; +out: + f2fs_err(sbi, + "Current segment's next free block offset is inconsistent with bitmap, logtype:%u, segno:%u, type:%u, next_blkoff:%u, blkofs:%u", + i, curseg->segno, curseg->alloc_type, + curseg->next_blkoff, blkofs); + f2fs_handle_error(sbi, ERROR_INVALID_CURSEG); + return -EFSCORRUPTED; + } + } + return 0; +} + +#ifdef CONFIG_BLK_DEV_ZONED +static int check_zone_write_pointer(struct f2fs_sb_info *sbi, + struct f2fs_dev_info *fdev, + struct blk_zone *zone) +{ + unsigned int zone_segno; + block_t zone_block, valid_block_cnt; + unsigned int log_sectors_per_block = sbi->log_blocksize - SECTOR_SHIFT; + int ret; + unsigned int nofs_flags; + + if (zone->type != BLK_ZONE_TYPE_SEQWRITE_REQ) + return 0; + + zone_block = fdev->start_blk + (zone->start >> log_sectors_per_block); + zone_segno = GET_SEGNO(sbi, zone_block); + + /* + * Skip check of zones cursegs point to, since + * fix_curseg_write_pointer() checks them. + */ + if (zone_segno >= MAIN_SEGS(sbi)) + return 0; + + /* + * Get # of valid block of the zone. 
+ */ + valid_block_cnt = get_valid_blocks(sbi, zone_segno, true); + if (is_cursec(sbi, GET_SEC_FROM_SEG(sbi, zone_segno))) { + f2fs_notice(sbi, "Open zones: valid block[0x%x,0x%x] cond[%s]", + zone_segno, valid_block_cnt, + blk_zone_cond_str(zone->cond)); + return 0; + } + + if ((!valid_block_cnt && zone->cond == BLK_ZONE_COND_EMPTY) || + (valid_block_cnt && zone->cond == BLK_ZONE_COND_FULL)) + return 0; + + if (!valid_block_cnt) { + f2fs_notice(sbi, "Zone without valid block has non-zero write " + "pointer. Reset the write pointer: cond[%s]", + blk_zone_cond_str(zone->cond)); + ret = __f2fs_issue_discard_zone(sbi, fdev->bdev, zone_block, + zone->len >> log_sectors_per_block); + if (ret) + f2fs_err(sbi, "Discard zone failed: %s (errno=%d)", + fdev->path, ret); + return ret; + } + + /* + * If there are valid blocks and the write pointer doesn't match + * with them, we need to report the inconsistency and fill + * the zone till the end to close the zone. This inconsistency + * does not cause write error because the zone will not be + * selected for write operation until it get discarded. 
+ */ + f2fs_notice(sbi, "Valid blocks are not aligned with write " + "pointer: valid block[0x%x,0x%x] cond[%s]", + zone_segno, valid_block_cnt, blk_zone_cond_str(zone->cond)); + + nofs_flags = memalloc_nofs_save(); + ret = blkdev_zone_mgmt(fdev->bdev, REQ_OP_ZONE_FINISH, + zone->start, zone->len); + memalloc_nofs_restore(nofs_flags); + if (ret == -EOPNOTSUPP) { + ret = blkdev_issue_zeroout(fdev->bdev, zone->wp, + zone->len - (zone->wp - zone->start), + GFP_NOFS, 0); + if (ret) + f2fs_err(sbi, "Fill up zone failed: %s (errno=%d)", + fdev->path, ret); + } else if (ret) { + f2fs_err(sbi, "Finishing zone failed: %s (errno=%d)", + fdev->path, ret); + } + + return ret; +} + +static struct f2fs_dev_info *get_target_zoned_dev(struct f2fs_sb_info *sbi, + block_t zone_blkaddr) +{ + int i; + + for (i = 0; i < sbi->s_ndevs; i++) { + if (!bdev_is_zoned(FDEV(i).bdev)) + continue; + if (sbi->s_ndevs == 1 || (FDEV(i).start_blk <= zone_blkaddr && + zone_blkaddr <= FDEV(i).end_blk)) + return &FDEV(i); + } + + return NULL; +} + +static int report_one_zone_cb(struct blk_zone *zone, unsigned int idx, + void *data) +{ + memcpy(data, zone, sizeof(struct blk_zone)); + return 0; +} + +static int do_fix_curseg_write_pointer(struct f2fs_sb_info *sbi, int type) +{ + struct curseg_info *cs = CURSEG_I(sbi, type); + struct f2fs_dev_info *zbd; + struct blk_zone zone; + unsigned int cs_section, wp_segno, wp_blkoff, wp_sector_off; + block_t cs_zone_block, wp_block; + unsigned int log_sectors_per_block = sbi->log_blocksize - SECTOR_SHIFT; + sector_t zone_sector; + int err; + + cs_section = GET_SEC_FROM_SEG(sbi, cs->segno); + cs_zone_block = START_BLOCK(sbi, GET_SEG_FROM_SEC(sbi, cs_section)); + + zbd = get_target_zoned_dev(sbi, cs_zone_block); + if (!zbd) + return 0; + + /* report zone for the sector the curseg points to */ + zone_sector = (sector_t)(cs_zone_block - zbd->start_blk) + << log_sectors_per_block; + err = blkdev_report_zones(zbd->bdev, zone_sector, 1, + report_one_zone_cb, &zone); + if 
(err != 1) { + f2fs_err(sbi, "Report zone failed: %s errno=(%d)", + zbd->path, err); + return err; + } + + if (zone.type != BLK_ZONE_TYPE_SEQWRITE_REQ) + return 0; + + /* + * When safely unmounted in the previous mount, we could use current + * segments. Otherwise, allocate new sections. + */ + if (is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG)) { + wp_block = zbd->start_blk + (zone.wp >> log_sectors_per_block); + wp_segno = GET_SEGNO(sbi, wp_block); + wp_blkoff = wp_block - START_BLOCK(sbi, wp_segno); + wp_sector_off = zone.wp & GENMASK(log_sectors_per_block - 1, 0); + + if (cs->segno == wp_segno && cs->next_blkoff == wp_blkoff && + wp_sector_off == 0) + return 0; + + f2fs_notice(sbi, "Unaligned curseg[%d] with write pointer: " + "curseg[0x%x,0x%x] wp[0x%x,0x%x]", type, cs->segno, + cs->next_blkoff, wp_segno, wp_blkoff); + } + + /* Allocate a new section if it's not new. */ + if (cs->next_blkoff || + cs->segno != GET_SEG_FROM_SEC(sbi, GET_ZONE_FROM_SEC(sbi, cs_section))) { + unsigned int old_segno = cs->segno, old_blkoff = cs->next_blkoff; + + f2fs_allocate_new_section(sbi, type, true); + f2fs_notice(sbi, "Assign new section to curseg[%d]: " + "[0x%x,0x%x] -> [0x%x,0x%x]", + type, old_segno, old_blkoff, + cs->segno, cs->next_blkoff); + } + + /* check consistency of the zone curseg pointed to */ + if (check_zone_write_pointer(sbi, zbd, &zone)) + return -EIO; + + /* check newly assigned zone */ + cs_section = GET_SEC_FROM_SEG(sbi, cs->segno); + cs_zone_block = START_BLOCK(sbi, GET_SEG_FROM_SEC(sbi, cs_section)); + + zbd = get_target_zoned_dev(sbi, cs_zone_block); + if (!zbd) + return 0; + + zone_sector = (sector_t)(cs_zone_block - zbd->start_blk) + << log_sectors_per_block; + err = blkdev_report_zones(zbd->bdev, zone_sector, 1, + report_one_zone_cb, &zone); + if (err != 1) { + f2fs_err(sbi, "Report zone failed: %s errno=(%d)", + zbd->path, err); + return err; + } + + if (zone.type != BLK_ZONE_TYPE_SEQWRITE_REQ) + return 0; + + if (zone.wp != zone.start) { + f2fs_notice(sbi, 
+ "New zone for curseg[%d] is not yet discarded. " + "Reset the zone: curseg[0x%x,0x%x]", + type, cs->segno, cs->next_blkoff); + err = __f2fs_issue_discard_zone(sbi, zbd->bdev, cs_zone_block, + zone.len >> log_sectors_per_block); + if (err) { + f2fs_err(sbi, "Discard zone failed: %s (errno=%d)", + zbd->path, err); + return err; + } + } + + return 0; +} + +static int fix_curseg_write_pointer(struct f2fs_sb_info *sbi) +{ + int i, ret; + + for (i = 0; i < NR_PERSISTENT_LOG; i++) { + ret = do_fix_curseg_write_pointer(sbi, i); + if (ret) + return ret; + } + + return 0; +} + +struct check_zone_write_pointer_args { + struct f2fs_sb_info *sbi; + struct f2fs_dev_info *fdev; +}; + +static int check_zone_write_pointer_cb(struct blk_zone *zone, unsigned int idx, + void *data) +{ + struct check_zone_write_pointer_args *args; + + args = (struct check_zone_write_pointer_args *)data; + + return check_zone_write_pointer(args->sbi, args->fdev, zone); +} + +static int check_write_pointer(struct f2fs_sb_info *sbi) +{ + int i, ret; + struct check_zone_write_pointer_args args; + + for (i = 0; i < sbi->s_ndevs; i++) { + if (!bdev_is_zoned(FDEV(i).bdev)) + continue; + + args.sbi = sbi; + args.fdev = &FDEV(i); + ret = blkdev_report_zones(FDEV(i).bdev, 0, BLK_ALL_ZONES, + check_zone_write_pointer_cb, &args); + if (ret < 0) + return ret; + } + + return 0; +} + +int f2fs_check_and_fix_write_pointer(struct f2fs_sb_info *sbi) +{ + int ret; + + if (!f2fs_sb_has_blkzoned(sbi) || f2fs_readonly(sbi->sb) || + f2fs_hw_is_readonly(sbi)) + return 0; + + f2fs_notice(sbi, "Checking entire write pointers"); + ret = fix_curseg_write_pointer(sbi); + if (!ret) + ret = check_write_pointer(sbi); + return ret; +} + +/* + * Return the number of usable blocks in a segment. The number of blocks + * returned is always equal to the number of blocks in a segment for + * segments fully contained within a sequential zone capacity or a + * conventional zone. 
For segments partially contained in a sequential + * zone capacity, the number of usable blocks up to the zone capacity + * is returned. 0 is returned in all other cases. + */ +static inline unsigned int f2fs_usable_zone_blks_in_seg( + struct f2fs_sb_info *sbi, unsigned int segno) +{ + block_t seg_start, sec_start_blkaddr, sec_cap_blkaddr; + unsigned int secno; + + if (!sbi->unusable_blocks_per_sec) + return BLKS_PER_SEG(sbi); + + secno = GET_SEC_FROM_SEG(sbi, segno); + seg_start = START_BLOCK(sbi, segno); + sec_start_blkaddr = START_BLOCK(sbi, GET_SEG_FROM_SEC(sbi, secno)); + sec_cap_blkaddr = sec_start_blkaddr + CAP_BLKS_PER_SEC(sbi); + + /* + * If segment starts before zone capacity and spans beyond + * zone capacity, then usable blocks are from seg start to + * zone capacity. If the segment starts after the zone capacity, + * then there are no usable blocks. + */ + if (seg_start >= sec_cap_blkaddr) + return 0; + if (seg_start + BLKS_PER_SEG(sbi) > sec_cap_blkaddr) + return sec_cap_blkaddr - seg_start; + + return BLKS_PER_SEG(sbi); +} +#else +int f2fs_check_and_fix_write_pointer(struct f2fs_sb_info *sbi) +{ + return 0; +} + +static inline unsigned int f2fs_usable_zone_blks_in_seg(struct f2fs_sb_info *sbi, + unsigned int segno) +{ + return 0; +} + +#endif +unsigned int f2fs_usable_blks_in_seg(struct f2fs_sb_info *sbi, + unsigned int segno) +{ + if (f2fs_sb_has_blkzoned(sbi)) + return f2fs_usable_zone_blks_in_seg(sbi, segno); + + return BLKS_PER_SEG(sbi); +} + +unsigned int f2fs_usable_segs_in_sec(struct f2fs_sb_info *sbi) +{ + if (f2fs_sb_has_blkzoned(sbi)) + return CAP_SEGS_PER_SEC(sbi); + + return SEGS_PER_SEC(sbi); +} + +unsigned long long f2fs_get_section_mtime(struct f2fs_sb_info *sbi, + unsigned int segno) +{ + unsigned int usable_segs_per_sec = f2fs_usable_segs_in_sec(sbi); + unsigned int secno = 0, start = 0; + unsigned int total_valid_blocks = 0; + unsigned long long mtime = 0; + unsigned int i = 0; + + secno = GET_SEC_FROM_SEG(sbi, segno); + start = 
GET_SEG_FROM_SEC(sbi, secno); + + if (!__is_large_section(sbi)) { + mtime = get_seg_entry(sbi, start + i)->mtime; + goto out; + } + + for (i = 0; i < usable_segs_per_sec; i++) { + /* for large section, only check the mtime of valid segments */ + struct seg_entry *se = get_seg_entry(sbi, start+i); + + mtime += se->mtime * se->valid_blocks; + total_valid_blocks += se->valid_blocks; + } + + if (total_valid_blocks == 0) + return INVALID_MTIME; + + mtime = div_u64(mtime, total_valid_blocks); +out: + if (unlikely(mtime == INVALID_MTIME)) + mtime -= 1; + return mtime; +} + /* * Update min, max modified time for cost-benefit GC algorithm */ @@ -1590,41 +5680,36 @@ static void init_min_max_mtime(struct f2fs_sb_info *sbi) struct sit_info *sit_i = SIT_I(sbi); unsigned int segno; - mutex_lock(&sit_i->sentry_lock); + down_write(&sit_i->sentry_lock); - sit_i->min_mtime = LLONG_MAX; + sit_i->min_mtime = ULLONG_MAX; - for (segno = 0; segno < TOTAL_SEGS(sbi); segno += sbi->segs_per_sec) { - unsigned int i; + for (segno = 0; segno < MAIN_SEGS(sbi); segno += SEGS_PER_SEC(sbi)) { unsigned long long mtime = 0; - for (i = 0; i < sbi->segs_per_sec; i++) - mtime += get_seg_entry(sbi, segno + i)->mtime; - - mtime = div_u64(mtime, sbi->segs_per_sec); + mtime = f2fs_get_section_mtime(sbi, segno); if (sit_i->min_mtime > mtime) sit_i->min_mtime = mtime; } - sit_i->max_mtime = get_mtime(sbi); - mutex_unlock(&sit_i->sentry_lock); + sit_i->max_mtime = get_mtime(sbi, false); + sit_i->dirty_max_mtime = 0; + up_write(&sit_i->sentry_lock); } -int build_segment_manager(struct f2fs_sb_info *sbi) +int f2fs_build_segment_manager(struct f2fs_sb_info *sbi) { struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); struct f2fs_sm_info *sm_info; int err; - sm_info = kzalloc(sizeof(struct f2fs_sm_info), GFP_KERNEL); + sm_info = f2fs_kzalloc(sbi, sizeof(struct f2fs_sm_info), GFP_KERNEL); if (!sm_info) return -ENOMEM; /* init sm info */ sbi->sm_info = sm_info; - 
INIT_LIST_HEAD(&sm_info->wblist_head); - spin_lock_init(&sm_info->wblist_lock); sm_info->seg0_blkaddr = le32_to_cpu(raw_super->segment0_blkaddr); sm_info->main_blkaddr = le32_to_cpu(raw_super->main_blkaddr); sm_info->segment_count = le32_to_cpu(raw_super->segment_count); @@ -1632,6 +5717,30 @@ int build_segment_manager(struct f2fs_sb_info *sbi) sm_info->ovp_segments = le32_to_cpu(ckpt->overprov_segment_count); sm_info->main_segments = le32_to_cpu(raw_super->segment_count_main); sm_info->ssa_blkaddr = le32_to_cpu(raw_super->ssa_blkaddr); + sm_info->rec_prefree_segments = sm_info->main_segments * + DEF_RECLAIM_PREFREE_SEGMENTS / 100; + if (sm_info->rec_prefree_segments > DEF_MAX_RECLAIM_PREFREE_SEGMENTS) + sm_info->rec_prefree_segments = DEF_MAX_RECLAIM_PREFREE_SEGMENTS; + + if (!f2fs_lfs_mode(sbi)) + sm_info->ipu_policy = BIT(F2FS_IPU_FSYNC); + sm_info->min_ipu_util = DEF_MIN_IPU_UTIL; + sm_info->min_fsync_blocks = DEF_MIN_FSYNC_BLOCKS; + sm_info->min_seq_blocks = BLKS_PER_SEG(sbi); + sm_info->min_hot_blocks = DEF_MIN_HOT_BLOCKS; + sm_info->min_ssr_sections = reserved_sections(sbi); + + INIT_LIST_HEAD(&sm_info->sit_entry_set); + + init_f2fs_rwsem(&sm_info->curseg_lock); + + err = f2fs_create_flush_cmd_control(sbi); + if (err) + return err; + + err = create_discard_cmd_control(sbi); + if (err) + return err; err = build_sit_info(sbi); if (err) @@ -1644,13 +5753,19 @@ int build_segment_manager(struct f2fs_sb_info *sbi) return err; /* reinit free segmap based on SIT */ - build_sit_entries(sbi); + err = build_sit_entries(sbi); + if (err) + return err; init_free_segmap(sbi); err = build_dirty_segmap(sbi); if (err) return err; + err = sanity_check_curseg(sbi); + if (err) + return err; + init_min_max_mtime(sbi); return 0; } @@ -1661,7 +5776,7 @@ static void discard_dirty_segmap(struct f2fs_sb_info *sbi, struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); mutex_lock(&dirty_i->seglist_lock); - kfree(dirty_i->dirty_segmap[dirty_type]); + 
kvfree(dirty_i->dirty_segmap[dirty_type]); dirty_i->nr_dirty[dirty_type] = 0; mutex_unlock(&dirty_i->seglist_lock); } @@ -1669,7 +5784,9 @@ static void discard_dirty_segmap(struct f2fs_sb_info *sbi, static void destroy_victim_secmap(struct f2fs_sb_info *sbi) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); - kfree(dirty_i->victim_secmap); + + kvfree(dirty_i->pinned_secmap); + kvfree(dirty_i->victim_secmap); } static void destroy_dirty_segmap(struct f2fs_sb_info *sbi) @@ -1684,6 +5801,12 @@ static void destroy_dirty_segmap(struct f2fs_sb_info *sbi) for (i = 0; i < NR_DIRTY_TYPE; i++) discard_dirty_segmap(sbi, i); + if (__is_large_section(sbi)) { + mutex_lock(&dirty_i->seglist_lock); + kvfree(dirty_i->dirty_secmap); + mutex_unlock(&dirty_i->seglist_lock); + } + destroy_victim_secmap(sbi); SM_I(sbi)->dirty_info = NULL; kfree(dirty_i); @@ -1697,48 +5820,57 @@ static void destroy_curseg(struct f2fs_sb_info *sbi) if (!array) return; SM_I(sbi)->curseg_array = NULL; - for (i = 0; i < NR_CURSEG_TYPE; i++) + for (i = 0; i < NR_CURSEG_TYPE; i++) { kfree(array[i].sum_blk); + kfree(array[i].journal); + } kfree(array); } static void destroy_free_segmap(struct f2fs_sb_info *sbi) { struct free_segmap_info *free_i = SM_I(sbi)->free_info; + if (!free_i) return; SM_I(sbi)->free_info = NULL; - kfree(free_i->free_segmap); - kfree(free_i->free_secmap); + kvfree(free_i->free_segmap); + kvfree(free_i->free_secmap); kfree(free_i); } static void destroy_sit_info(struct f2fs_sb_info *sbi) { struct sit_info *sit_i = SIT_I(sbi); - unsigned int start; if (!sit_i) return; - if (sit_i->sentries) { - for (start = 0; start < TOTAL_SEGS(sbi); start++) { - kfree(sit_i->sentries[start].cur_valid_map); - kfree(sit_i->sentries[start].ckpt_valid_map); - } - } - vfree(sit_i->sentries); - vfree(sit_i->sec_entries); - kfree(sit_i->dirty_sentries_bitmap); + if (sit_i->sentries) + kvfree(sit_i->bitmap); + kfree(sit_i->tmp_map); + + kvfree(sit_i->sentries); + kvfree(sit_i->sec_entries); + 
kvfree(sit_i->dirty_sentries_bitmap); SM_I(sbi)->sit_info = NULL; kfree(sit_i->sit_bitmap); +#ifdef CONFIG_F2FS_CHECK_FS + kfree(sit_i->sit_bitmap_mir); + kvfree(sit_i->invalid_segmap); +#endif kfree(sit_i); } -void destroy_segment_manager(struct f2fs_sb_info *sbi) +void f2fs_destroy_segment_manager(struct f2fs_sb_info *sbi) { struct f2fs_sm_info *sm_info = SM_I(sbi); + + if (!sm_info) + return; + f2fs_destroy_flush_cmd_control(sbi, true); + destroy_discard_cmd_control(sbi); destroy_dirty_segmap(sbi); destroy_curseg(sbi); destroy_free_segmap(sbi); @@ -1746,3 +5878,44 @@ void destroy_segment_manager(struct f2fs_sb_info *sbi) sbi->sm_info = NULL; kfree(sm_info); } + +int __init f2fs_create_segment_manager_caches(void) +{ + discard_entry_slab = f2fs_kmem_cache_create("f2fs_discard_entry", + sizeof(struct discard_entry)); + if (!discard_entry_slab) + goto fail; + + discard_cmd_slab = f2fs_kmem_cache_create("f2fs_discard_cmd", + sizeof(struct discard_cmd)); + if (!discard_cmd_slab) + goto destroy_discard_entry; + + sit_entry_set_slab = f2fs_kmem_cache_create("f2fs_sit_entry_set", + sizeof(struct sit_entry_set)); + if (!sit_entry_set_slab) + goto destroy_discard_cmd; + + revoke_entry_slab = f2fs_kmem_cache_create("f2fs_revoke_entry", + sizeof(struct revoke_entry)); + if (!revoke_entry_slab) + goto destroy_sit_entry_set; + return 0; + +destroy_sit_entry_set: + kmem_cache_destroy(sit_entry_set_slab); +destroy_discard_cmd: + kmem_cache_destroy(discard_cmd_slab); +destroy_discard_entry: + kmem_cache_destroy(discard_entry_slab); +fail: + return -ENOMEM; +} + +void f2fs_destroy_segment_manager_caches(void) +{ + kmem_cache_destroy(sit_entry_set_slab); + kmem_cache_destroy(discard_cmd_slab); + kmem_cache_destroy(discard_entry_slab); + kmem_cache_destroy(revoke_entry_slab); +} diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 062424a0e4c3..07dcbcbeb7c6 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -1,131 +1,147 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * 
fs/f2fs/segment.h * * Copyright (c) 2012 Samsung Electronics Co., Ltd. * http://www.samsung.com/ - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. */ #include <linux/blkdev.h> +#include <linux/backing-dev.h> /* constant macro */ #define NULL_SEGNO ((unsigned int)(~0)) #define NULL_SECNO ((unsigned int)(~0)) +#define DEF_RECLAIM_PREFREE_SEGMENTS 5 /* 5% over total segments */ +#define DEF_MAX_RECLAIM_PREFREE_SEGMENTS 4096 /* 8GB in maximum */ + +#define F2FS_MIN_SEGMENTS 9 /* SB + 2 (CP + SIT + NAT) + SSA + MAIN */ +#define F2FS_MIN_META_SEGMENTS 8 /* SB + 2 (CP + SIT + NAT) + SSA */ + +#define INVALID_MTIME ULLONG_MAX /* no valid blocks in a segment/section */ + /* L: Logical segment # in volume, R: Relative segment # in main area */ -#define GET_L2R_SEGNO(free_i, segno) (segno - free_i->start_segno) -#define GET_R2L_SEGNO(free_i, segno) (segno + free_i->start_segno) - -#define IS_DATASEG(t) \ - ((t == CURSEG_HOT_DATA) || (t == CURSEG_COLD_DATA) || \ - (t == CURSEG_WARM_DATA)) - -#define IS_NODESEG(t) \ - ((t == CURSEG_HOT_NODE) || (t == CURSEG_COLD_NODE) || \ - (t == CURSEG_WARM_NODE)) - -#define IS_CURSEG(sbi, seg) \ - ((seg == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno) || \ - (seg == CURSEG_I(sbi, CURSEG_WARM_DATA)->segno) || \ - (seg == CURSEG_I(sbi, CURSEG_COLD_DATA)->segno) || \ - (seg == CURSEG_I(sbi, CURSEG_HOT_NODE)->segno) || \ - (seg == CURSEG_I(sbi, CURSEG_WARM_NODE)->segno) || \ - (seg == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno)) - -#define IS_CURSEC(sbi, secno) \ - ((secno == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno / \ - sbi->segs_per_sec) || \ - (secno == CURSEG_I(sbi, CURSEG_WARM_DATA)->segno / \ - sbi->segs_per_sec) || \ - (secno == CURSEG_I(sbi, CURSEG_COLD_DATA)->segno / \ - sbi->segs_per_sec) || \ - (secno == CURSEG_I(sbi, CURSEG_HOT_NODE)->segno / \ - sbi->segs_per_sec) || \ - (secno == CURSEG_I(sbi, 
CURSEG_WARM_NODE)->segno / \ - sbi->segs_per_sec) || \ - (secno == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno / \ - sbi->segs_per_sec)) \ - -#define START_BLOCK(sbi, segno) \ - (SM_I(sbi)->seg0_blkaddr + \ - (GET_R2L_SEGNO(FREE_I(sbi), segno) << sbi->log_blocks_per_seg)) -#define NEXT_FREE_BLKADDR(sbi, curseg) \ - (START_BLOCK(sbi, curseg->segno) + curseg->next_blkoff) +#define GET_L2R_SEGNO(free_i, segno) ((segno) - (free_i)->start_segno) +#define GET_R2L_SEGNO(free_i, segno) ((segno) + (free_i)->start_segno) + +#define IS_DATASEG(t) ((t) <= CURSEG_COLD_DATA) +#define IS_NODESEG(t) ((t) >= CURSEG_HOT_NODE && (t) <= CURSEG_COLD_NODE) +#define SE_PAGETYPE(se) ((IS_NODESEG((se)->type) ? NODE : DATA)) + +static inline void sanity_check_seg_type(struct f2fs_sb_info *sbi, + unsigned short seg_type) +{ + f2fs_bug_on(sbi, seg_type >= NR_PERSISTENT_LOG); +} + +#define MAIN_BLKADDR(sbi) \ + (SM_I(sbi) ? SM_I(sbi)->main_blkaddr : \ + le32_to_cpu(F2FS_RAW_SUPER(sbi)->main_blkaddr)) +#define SEG0_BLKADDR(sbi) \ + (SM_I(sbi) ? SM_I(sbi)->seg0_blkaddr : \ + le32_to_cpu(F2FS_RAW_SUPER(sbi)->segment0_blkaddr)) + +#define MAIN_SEGS(sbi) (SM_I(sbi)->main_segments) +#define MAIN_SECS(sbi) ((sbi)->total_sections) + +#define TOTAL_SEGS(sbi) \ + (SM_I(sbi) ? 
SM_I(sbi)->segment_count : \ + le32_to_cpu(F2FS_RAW_SUPER(sbi)->segment_count)) +#define TOTAL_BLKS(sbi) (SEGS_TO_BLKS(sbi, TOTAL_SEGS(sbi))) + +#define MAX_BLKADDR(sbi) (SEG0_BLKADDR(sbi) + TOTAL_BLKS(sbi)) +#define SEGMENT_SIZE(sbi) (1ULL << ((sbi)->log_blocksize + \ + (sbi)->log_blocks_per_seg)) -#define MAIN_BASE_BLOCK(sbi) (SM_I(sbi)->main_blkaddr) +#define START_BLOCK(sbi, segno) (SEG0_BLKADDR(sbi) + \ + (SEGS_TO_BLKS(sbi, GET_R2L_SEGNO(FREE_I(sbi), segno)))) -#define GET_SEGOFF_FROM_SEG0(sbi, blk_addr) \ - ((blk_addr) - SM_I(sbi)->seg0_blkaddr) +#define NEXT_FREE_BLKADDR(sbi, curseg) \ + (START_BLOCK(sbi, (curseg)->segno) + (curseg)->next_blkoff) + +#define GET_SEGOFF_FROM_SEG0(sbi, blk_addr) ((blk_addr) - SEG0_BLKADDR(sbi)) #define GET_SEGNO_FROM_SEG0(sbi, blk_addr) \ - (GET_SEGOFF_FROM_SEG0(sbi, blk_addr) >> sbi->log_blocks_per_seg) + (BLKS_TO_SEGS(sbi, GET_SEGOFF_FROM_SEG0(sbi, blk_addr))) +#define GET_BLKOFF_FROM_SEG0(sbi, blk_addr) \ + (GET_SEGOFF_FROM_SEG0(sbi, blk_addr) & (BLKS_PER_SEG(sbi) - 1)) + #define GET_SEGNO(sbi, blk_addr) \ - (((blk_addr == NULL_ADDR) || (blk_addr == NEW_ADDR)) ? \ + ((!__is_valid_data_blkaddr(blk_addr)) ? \ NULL_SEGNO : GET_L2R_SEGNO(FREE_I(sbi), \ GET_SEGNO_FROM_SEG0(sbi, blk_addr))) -#define GET_SECNO(sbi, segno) \ - ((segno) / sbi->segs_per_sec) -#define GET_ZONENO_FROM_SEGNO(sbi, segno) \ - ((segno / sbi->segs_per_sec) / sbi->secs_per_zone) - -#define GET_SUM_BLOCK(sbi, segno) \ - ((sbi->sm_info->ssa_blkaddr) + segno) +#ifdef CONFIG_BLK_DEV_ZONED +#define CAP_BLKS_PER_SEC(sbi) \ + (BLKS_PER_SEC(sbi) - (sbi)->unusable_blocks_per_sec) +#define CAP_SEGS_PER_SEC(sbi) \ + (SEGS_PER_SEC(sbi) - \ + BLKS_TO_SEGS(sbi, (sbi)->unusable_blocks_per_sec)) +#else +#define CAP_BLKS_PER_SEC(sbi) BLKS_PER_SEC(sbi) +#define CAP_SEGS_PER_SEC(sbi) SEGS_PER_SEC(sbi) +#endif +#define GET_START_SEG_FROM_SEC(sbi, segno) \ + (rounddown(segno, SEGS_PER_SEC(sbi))) +#define GET_SEC_FROM_SEG(sbi, segno) \ + (((segno) == -1) ? 
-1 : (segno) / SEGS_PER_SEC(sbi)) +#define GET_SEG_FROM_SEC(sbi, secno) \ + ((secno) * SEGS_PER_SEC(sbi)) +#define GET_ZONE_FROM_SEC(sbi, secno) \ + (((secno) == -1) ? -1 : (secno) / (sbi)->secs_per_zone) +#define GET_ZONE_FROM_SEG(sbi, segno) \ + GET_ZONE_FROM_SEC(sbi, GET_SEC_FROM_SEG(sbi, segno)) + +#define SUMS_PER_BLOCK (F2FS_BLKSIZE / F2FS_SUM_BLKSIZE) +#define GET_SUM_BLOCK(sbi, segno) \ + (SM_I(sbi)->ssa_blkaddr + (segno / SUMS_PER_BLOCK)) +#define GET_SUM_BLKOFF(segno) (segno % SUMS_PER_BLOCK) +#define SUM_BLK_PAGE_ADDR(folio, segno) \ + (folio_address(folio) + GET_SUM_BLKOFF(segno) * F2FS_SUM_BLKSIZE) #define GET_SUM_TYPE(footer) ((footer)->entry_type) -#define SET_SUM_TYPE(footer, type) ((footer)->entry_type = type) +#define SET_SUM_TYPE(footer, type) ((footer)->entry_type = (type)) #define SIT_ENTRY_OFFSET(sit_i, segno) \ - (segno % sit_i->sents_per_block) -#define SIT_BLOCK_OFFSET(sit_i, segno) \ - (segno / SIT_ENTRY_PER_BLOCK) -#define START_SEGNO(sit_i, segno) \ - (SIT_BLOCK_OFFSET(sit_i, segno) * SIT_ENTRY_PER_BLOCK) + ((segno) % (sit_i)->sents_per_block) +#define SIT_BLOCK_OFFSET(segno) \ + ((segno) / SIT_ENTRY_PER_BLOCK) +#define START_SEGNO(segno) \ + (SIT_BLOCK_OFFSET(segno) * SIT_ENTRY_PER_BLOCK) +#define SIT_BLK_CNT(sbi) \ + DIV_ROUND_UP(MAIN_SEGS(sbi), SIT_ENTRY_PER_BLOCK) #define f2fs_bitmap_size(nr) \ (BITS_TO_LONGS(nr) * sizeof(unsigned long)) -#define TOTAL_SEGS(sbi) (SM_I(sbi)->main_segments) -#define TOTAL_SECS(sbi) (sbi->total_sections) - -#define SECTOR_FROM_BLOCK(sbi, blk_addr) \ - (blk_addr << ((sbi)->log_blocksize - F2FS_LOG_SECTOR_SIZE)) -#define SECTOR_TO_BLOCK(sbi, sectors) \ - (sectors >> ((sbi)->log_blocksize - F2FS_LOG_SECTOR_SIZE)) - -/* during checkpoint, bio_private is used to synchronize the last bio */ -struct bio_private { - struct f2fs_sb_info *sbi; - bool is_sync; - void *wait; -}; -/* - * indicate a block allocation direction: RIGHT and LEFT. - * RIGHT means allocating new sections towards the end of volume. 
- * LEFT means the opposite direction. - */ -enum { - ALLOC_RIGHT = 0, - ALLOC_LEFT -}; +#define SECTOR_FROM_BLOCK(blk_addr) \ + (((sector_t)blk_addr) << F2FS_LOG_SECTORS_PER_BLOCK) +#define SECTOR_TO_BLOCK(sectors) \ + ((sectors) >> F2FS_LOG_SECTORS_PER_BLOCK) /* - * In the victim_sel_policy->alloc_mode, there are two block allocation modes. + * In the victim_sel_policy->alloc_mode, there are three block allocation modes. * LFS writes data sequentially with cleaning operations. * SSR (Slack Space Recycle) reuses obsolete space without cleaning operations. + * AT_SSR (Age Threshold based Slack Space Recycle) merges fragments into + * fragmented segment which has similar aging degree. */ enum { LFS = 0, - SSR + SSR, + AT_SSR, }; /* - * In the victim_sel_policy->gc_mode, there are two gc, aka cleaning, modes. + * In the victim_sel_policy->gc_mode, there are three gc, aka cleaning, modes. * GC_CB is based on cost-benefit algorithm. * GC_GREEDY is based on greedy algorithm. + * GC_AT is based on age-threshold algorithm. */ enum { GC_CB = 0, - GC_GREEDY + GC_GREEDY, + GC_AT, + ALLOC_NEXT, + FLUSH_DEVICE, + MAX_GC_POLICY, }; /* @@ -134,54 +150,78 @@ enum { */ enum { BG_GC = 0, - FG_GC + FG_GC, }; /* for a function parameter to select a victim segment */ struct victim_sel_policy { int alloc_mode; /* LFS or SSR */ int gc_mode; /* GC_CB or GC_GREEDY */ - unsigned long *dirty_segmap; /* dirty segment bitmap */ + unsigned long *dirty_bitmap; /* dirty segment/section bitmap */ + unsigned int max_search; /* + * maximum # of segments/sections + * to search + */ unsigned int offset; /* last scanned bitmap offset */ unsigned int ofs_unit; /* bitmap search unit */ unsigned int min_cost; /* minimum cost */ + unsigned long long oldest_age; /* oldest age of segments having the same min cost */ unsigned int min_segno; /* segment # having min. 
cost */ + unsigned long long age; /* mtime of GCed section*/ + unsigned long long age_threshold;/* age threshold */ + bool one_time_gc; /* one time GC */ }; struct seg_entry { - unsigned short valid_blocks; /* # of valid blocks */ + unsigned int type:6; /* segment type like CURSEG_XXX_TYPE */ + unsigned int valid_blocks:10; /* # of valid blocks */ + unsigned int ckpt_valid_blocks:10; /* # of valid blocks last cp */ + unsigned int padding:6; /* padding */ unsigned char *cur_valid_map; /* validity bitmap of blocks */ +#ifdef CONFIG_F2FS_CHECK_FS + unsigned char *cur_valid_map_mir; /* mirror of current valid bitmap */ +#endif /* - * # of valid blocks and the validity bitmap stored in the the last + * # of valid blocks and the validity bitmap stored in the last * checkpoint pack. This information is used by the SSR mode. */ - unsigned short ckpt_valid_blocks; - unsigned char *ckpt_valid_map; - unsigned char type; /* segment type like CURSEG_XXX_TYPE */ + unsigned char *ckpt_valid_map; /* validity bitmap of blocks last cp */ + unsigned char *discard_map; unsigned long long mtime; /* modification time of the segment */ }; struct sec_entry { unsigned int valid_blocks; /* # of valid blocks in a section */ + unsigned int ckpt_valid_blocks; /* # of valid blocks last cp in a section */ }; -struct segment_allocation { - void (*allocate_segment)(struct f2fs_sb_info *, int, bool); +#define MAX_SKIP_GC_COUNT 16 + +struct revoke_entry { + struct list_head list; + block_t old_addr; /* for revoking when fail to commit */ + pgoff_t index; }; struct sit_info { - const struct segment_allocation *s_ops; - block_t sit_base_addr; /* start block address of SIT area */ block_t sit_blocks; /* # of blocks used by SIT area */ block_t written_valid_blocks; /* # of valid blocks in main area */ + char *bitmap; /* all bitmaps pointer */ char *sit_bitmap; /* SIT bitmap pointer */ +#ifdef CONFIG_F2FS_CHECK_FS + char *sit_bitmap_mir; /* SIT bitmap mirror */ + + /* bitmap of segments to be ignored by 
GC in case of errors */ + unsigned long *invalid_segmap; +#endif unsigned int bitmap_size; /* SIT bitmap size */ + unsigned long *tmp_map; /* bitmap for temporal use */ unsigned long *dirty_sentries_bitmap; /* bitmap for dirty sentries */ unsigned int dirty_sentries; /* # of dirty sentries */ unsigned int sents_per_block; /* # of SIT entries per block */ - struct mutex sentry_lock; /* to protect SIT cache */ + struct rw_semaphore sentry_lock; /* to protect SIT cache */ struct seg_entry *sentries; /* SIT segment-level cache */ struct sec_entry *sec_entries; /* SIT section-level cache */ @@ -190,13 +230,17 @@ struct sit_info { unsigned long long mounted_time; /* mount time */ unsigned long long min_mtime; /* min. modification time */ unsigned long long max_mtime; /* max. modification time */ + unsigned long long dirty_min_mtime; /* rerange candidates in GC_AT */ + unsigned long long dirty_max_mtime; /* rerange candidates in GC_AT */ + + unsigned int last_victim[MAX_GC_POLICY]; /* last victim segment # */ }; struct free_segmap_info { unsigned int start_segno; /* start segment number logically */ unsigned int free_segments; /* # of free segments */ unsigned int free_sections; /* # of free sections */ - rwlock_t segmap_lock; /* free segmap lock */ + spinlock_t segmap_lock; /* free segmap lock */ unsigned long *free_segmap; /* free segment bitmap */ unsigned long *free_secmap; /* free section bitmap */ }; @@ -215,28 +259,36 @@ enum dirty_type { }; struct dirty_seglist_info { - const struct victim_selection *v_ops; /* victim selction operation */ unsigned long *dirty_segmap[NR_DIRTY_TYPE]; + unsigned long *dirty_secmap; struct mutex seglist_lock; /* lock for segment bitmaps */ int nr_dirty[NR_DIRTY_TYPE]; /* # of dirty segments */ unsigned long *victim_secmap; /* background GC victims */ -}; - -/* victim selection function for cleaning and SSR */ -struct victim_selection { - int (*get_victim)(struct f2fs_sb_info *, unsigned int *, - int, int, char); + unsigned long 
*pinned_secmap; /* pinned victims from foreground GC */ + unsigned int pinned_secmap_cnt; /* count of victims which has pinned data */ + bool enable_pin_section; /* enable pinning section */ }; /* for active log information */ struct curseg_info { struct mutex curseg_mutex; /* lock for consistency */ struct f2fs_summary_block *sum_blk; /* cached summary block */ + struct rw_semaphore journal_rwsem; /* protect journal area */ + struct f2fs_journal *journal; /* cached journal info */ unsigned char alloc_type; /* current allocation type */ + unsigned short seg_type; /* segment type like CURSEG_XXX_TYPE */ unsigned int segno; /* current segment number */ unsigned short next_blkoff; /* next block offset to write */ unsigned int zone; /* current zone number */ unsigned int next_segno; /* preallocated segment */ + int fragment_remained_chunk; /* remained block size in a chunk for block fragmentation mode */ + bool inited; /* indicate inmem log is inited */ +}; + +struct sit_entry_set { + struct list_head set_list; /* link with all sit sets */ + unsigned int start_segno; /* start segno of sits in set */ + unsigned int entry_cnt; /* the # of sit entries in set */ }; /* @@ -247,6 +299,28 @@ static inline struct curseg_info *CURSEG_I(struct f2fs_sb_info *sbi, int type) return (struct curseg_info *)(SM_I(sbi)->curseg_array + type); } +static inline bool is_curseg(struct f2fs_sb_info *sbi, unsigned int segno) +{ + int i; + + for (i = CURSEG_HOT_DATA; i < NO_CHECK_TYPE; i++) { + if (segno == CURSEG_I(sbi, i)->segno) + return true; + } + return false; +} + +static inline bool is_cursec(struct f2fs_sb_info *sbi, unsigned int secno) +{ + int i; + + for (i = CURSEG_HOT_DATA; i < NO_CHECK_TYPE; i++) { + if (secno == GET_SEC_FROM_SEG(sbi, CURSEG_I(sbi, i)->segno)) + return true; + } + return false; +} + static inline struct seg_entry *get_seg_entry(struct f2fs_sb_info *sbi, unsigned int segno) { @@ -258,22 +332,76 @@ static inline struct sec_entry *get_sec_entry(struct f2fs_sb_info 
*sbi, unsigned int segno) { struct sit_info *sit_i = SIT_I(sbi); - return &sit_i->sec_entries[GET_SECNO(sbi, segno)]; + return &sit_i->sec_entries[GET_SEC_FROM_SEG(sbi, segno)]; } static inline unsigned int get_valid_blocks(struct f2fs_sb_info *sbi, - unsigned int segno, int section) + unsigned int segno, bool use_section) { /* * In order to get # of valid blocks in a section instantly from many * segments, f2fs manages two counting structures separately. */ - if (section > 1) + if (use_section && __is_large_section(sbi)) return get_sec_entry(sbi, segno)->valid_blocks; else return get_seg_entry(sbi, segno)->valid_blocks; } +static inline unsigned int get_ckpt_valid_blocks(struct f2fs_sb_info *sbi, + unsigned int segno, bool use_section) +{ + if (use_section && __is_large_section(sbi)) + return get_sec_entry(sbi, segno)->ckpt_valid_blocks; + else + return get_seg_entry(sbi, segno)->ckpt_valid_blocks; +} + +static inline void set_ckpt_valid_blocks(struct f2fs_sb_info *sbi, + unsigned int segno) +{ + unsigned int secno = GET_SEC_FROM_SEG(sbi, segno); + unsigned int start_segno = GET_SEG_FROM_SEC(sbi, secno); + unsigned int blocks = 0; + int i; + + for (i = 0; i < SEGS_PER_SEC(sbi); i++, start_segno++) { + struct seg_entry *se = get_seg_entry(sbi, start_segno); + + blocks += se->ckpt_valid_blocks; + } + get_sec_entry(sbi, segno)->ckpt_valid_blocks = blocks; +} + +#ifdef CONFIG_F2FS_CHECK_FS +static inline void sanity_check_valid_blocks(struct f2fs_sb_info *sbi, + unsigned int segno) +{ + unsigned int secno = GET_SEC_FROM_SEG(sbi, segno); + unsigned int start_segno = GET_SEG_FROM_SEC(sbi, secno); + unsigned int blocks = 0; + int i; + + for (i = 0; i < SEGS_PER_SEC(sbi); i++, start_segno++) { + struct seg_entry *se = get_seg_entry(sbi, start_segno); + + blocks += se->ckpt_valid_blocks; + } + + if (blocks != get_sec_entry(sbi, segno)->ckpt_valid_blocks) { + f2fs_err(sbi, + "Inconsistent ckpt valid blocks: " + "seg entry(%d) vs sec entry(%d) at secno %d", + blocks, 
get_sec_entry(sbi, segno)->ckpt_valid_blocks, secno); + f2fs_bug_on(sbi, 1); + } +} +#else +static inline void sanity_check_valid_blocks(struct f2fs_sb_info *sbi, + unsigned int segno) +{ +} +#endif static inline void seg_info_from_raw_sit(struct seg_entry *se, struct f2fs_sit_entry *rs) { @@ -281,56 +409,87 @@ static inline void seg_info_from_raw_sit(struct seg_entry *se, se->ckpt_valid_blocks = GET_SIT_VBLOCKS(rs); memcpy(se->cur_valid_map, rs->valid_map, SIT_VBLOCK_MAP_SIZE); memcpy(se->ckpt_valid_map, rs->valid_map, SIT_VBLOCK_MAP_SIZE); +#ifdef CONFIG_F2FS_CHECK_FS + memcpy(se->cur_valid_map_mir, rs->valid_map, SIT_VBLOCK_MAP_SIZE); +#endif se->type = GET_SIT_TYPE(rs); se->mtime = le64_to_cpu(rs->mtime); } -static inline void seg_info_to_raw_sit(struct seg_entry *se, +static inline void __seg_info_to_raw_sit(struct seg_entry *se, struct f2fs_sit_entry *rs) { unsigned short raw_vblocks = (se->type << SIT_VBLOCKS_SHIFT) | se->valid_blocks; rs->vblocks = cpu_to_le16(raw_vblocks); memcpy(rs->valid_map, se->cur_valid_map, SIT_VBLOCK_MAP_SIZE); + rs->mtime = cpu_to_le64(se->mtime); +} + +static inline void seg_info_to_sit_folio(struct f2fs_sb_info *sbi, + struct folio *folio, unsigned int start) +{ + struct f2fs_sit_block *raw_sit; + struct seg_entry *se; + struct f2fs_sit_entry *rs; + unsigned int end = min(start + SIT_ENTRY_PER_BLOCK, + (unsigned long)MAIN_SEGS(sbi)); + int i; + + raw_sit = folio_address(folio); + memset(raw_sit, 0, PAGE_SIZE); + for (i = 0; i < end - start; i++) { + rs = &raw_sit->entries[i]; + se = get_seg_entry(sbi, start + i); + __seg_info_to_raw_sit(se, rs); + } +} + +static inline void seg_info_to_raw_sit(struct seg_entry *se, + struct f2fs_sit_entry *rs) +{ + __seg_info_to_raw_sit(se, rs); + memcpy(se->ckpt_valid_map, rs->valid_map, SIT_VBLOCK_MAP_SIZE); se->ckpt_valid_blocks = se->valid_blocks; - rs->mtime = cpu_to_le64(se->mtime); } static inline unsigned int find_next_inuse(struct free_segmap_info *free_i, unsigned int max, unsigned int 
segno) { unsigned int ret; - read_lock(&free_i->segmap_lock); + spin_lock(&free_i->segmap_lock); ret = find_next_bit(free_i->free_segmap, max, segno); - read_unlock(&free_i->segmap_lock); + spin_unlock(&free_i->segmap_lock); return ret; } static inline void __set_free(struct f2fs_sb_info *sbi, unsigned int segno) { struct free_segmap_info *free_i = FREE_I(sbi); - unsigned int secno = segno / sbi->segs_per_sec; - unsigned int start_segno = secno * sbi->segs_per_sec; + unsigned int secno = GET_SEC_FROM_SEG(sbi, segno); + unsigned int start_segno = GET_SEG_FROM_SEC(sbi, secno); unsigned int next; - write_lock(&free_i->segmap_lock); + spin_lock(&free_i->segmap_lock); clear_bit(segno, free_i->free_segmap); free_i->free_segments++; - next = find_next_bit(free_i->free_segmap, TOTAL_SEGS(sbi), start_segno); - if (next >= start_segno + sbi->segs_per_sec) { + next = find_next_bit(free_i->free_segmap, + start_segno + SEGS_PER_SEC(sbi), start_segno); + if (next >= start_segno + f2fs_usable_segs_in_sec(sbi)) { clear_bit(secno, free_i->free_secmap); free_i->free_sections++; } - write_unlock(&free_i->segmap_lock); + spin_unlock(&free_i->segmap_lock); } static inline void __set_inuse(struct f2fs_sb_info *sbi, unsigned int segno) { struct free_segmap_info *free_i = FREE_I(sbi); - unsigned int secno = segno / sbi->segs_per_sec; + unsigned int secno = GET_SEC_FROM_SEG(sbi, segno); + set_bit(segno, free_i->free_segmap); free_i->free_segments--; if (!test_and_set_bit(secno, free_i->free_secmap)) @@ -338,87 +497,91 @@ static inline void __set_inuse(struct f2fs_sb_info *sbi, } static inline void __set_test_and_free(struct f2fs_sb_info *sbi, - unsigned int segno) + unsigned int segno, bool inmem) { struct free_segmap_info *free_i = FREE_I(sbi); - unsigned int secno = segno / sbi->segs_per_sec; - unsigned int start_segno = secno * sbi->segs_per_sec; + unsigned int secno = GET_SEC_FROM_SEG(sbi, segno); + unsigned int start_segno = GET_SEG_FROM_SEC(sbi, secno); unsigned int next; + bool ret; 
- write_lock(&free_i->segmap_lock); - if (test_and_clear_bit(segno, free_i->free_segmap)) { - free_i->free_segments++; + spin_lock(&free_i->segmap_lock); + ret = test_and_clear_bit(segno, free_i->free_segmap); + if (!ret) + goto unlock_out; - next = find_next_bit(free_i->free_segmap, TOTAL_SEGS(sbi), - start_segno); - if (next >= start_segno + sbi->segs_per_sec) { - if (test_and_clear_bit(secno, free_i->free_secmap)) - free_i->free_sections++; - } - } - write_unlock(&free_i->segmap_lock); + free_i->free_segments++; + + if (!inmem && is_cursec(sbi, secno)) + goto unlock_out; + + /* check large section */ + next = find_next_bit(free_i->free_segmap, + start_segno + SEGS_PER_SEC(sbi), start_segno); + if (next < start_segno + f2fs_usable_segs_in_sec(sbi)) + goto unlock_out; + + ret = test_and_clear_bit(secno, free_i->free_secmap); + if (!ret) + goto unlock_out; + + free_i->free_sections++; + + if (GET_SEC_FROM_SEG(sbi, sbi->next_victim_seg[BG_GC]) == secno) + sbi->next_victim_seg[BG_GC] = NULL_SEGNO; + if (GET_SEC_FROM_SEG(sbi, sbi->next_victim_seg[FG_GC]) == secno) + sbi->next_victim_seg[FG_GC] = NULL_SEGNO; + +unlock_out: + spin_unlock(&free_i->segmap_lock); } static inline void __set_test_and_inuse(struct f2fs_sb_info *sbi, unsigned int segno) { struct free_segmap_info *free_i = FREE_I(sbi); - unsigned int secno = segno / sbi->segs_per_sec; - write_lock(&free_i->segmap_lock); + unsigned int secno = GET_SEC_FROM_SEG(sbi, segno); + + spin_lock(&free_i->segmap_lock); if (!test_and_set_bit(segno, free_i->free_segmap)) { free_i->free_segments--; if (!test_and_set_bit(secno, free_i->free_secmap)) free_i->free_sections--; } - write_unlock(&free_i->segmap_lock); + spin_unlock(&free_i->segmap_lock); } static inline void get_sit_bitmap(struct f2fs_sb_info *sbi, void *dst_addr) { struct sit_info *sit_i = SIT_I(sbi); + +#ifdef CONFIG_F2FS_CHECK_FS + if (memcmp(sit_i->sit_bitmap, sit_i->sit_bitmap_mir, + sit_i->bitmap_size)) + f2fs_bug_on(sbi, 1); +#endif memcpy(dst_addr, 
sit_i->sit_bitmap, sit_i->bitmap_size); } static inline block_t written_block_count(struct f2fs_sb_info *sbi) { - struct sit_info *sit_i = SIT_I(sbi); - block_t vblocks; - - mutex_lock(&sit_i->sentry_lock); - vblocks = sit_i->written_valid_blocks; - mutex_unlock(&sit_i->sentry_lock); - - return vblocks; + return SIT_I(sbi)->written_valid_blocks; } static inline unsigned int free_segments(struct f2fs_sb_info *sbi) { - struct free_segmap_info *free_i = FREE_I(sbi); - unsigned int free_segs; - - read_lock(&free_i->segmap_lock); - free_segs = free_i->free_segments; - read_unlock(&free_i->segmap_lock); - - return free_segs; + return FREE_I(sbi)->free_segments; } -static inline int reserved_segments(struct f2fs_sb_info *sbi) +static inline unsigned int reserved_segments(struct f2fs_sb_info *sbi) { return SM_I(sbi)->reserved_segments; } static inline unsigned int free_sections(struct f2fs_sb_info *sbi) { - struct free_segmap_info *free_i = FREE_I(sbi); - unsigned int free_secs; - - read_lock(&free_i->segmap_lock); - free_secs = free_i->free_sections; - read_unlock(&free_i->segmap_lock); - - return free_secs; + return FREE_I(sbi)->free_sections; } static inline unsigned int prefree_segments(struct f2fs_sb_info *sbi) @@ -441,56 +604,219 @@ static inline int overprovision_segments(struct f2fs_sb_info *sbi) return SM_I(sbi)->ovp_segments; } -static inline int overprovision_sections(struct f2fs_sb_info *sbi) +static inline int reserved_sections(struct f2fs_sb_info *sbi) { - return ((unsigned int) overprovision_segments(sbi)) / sbi->segs_per_sec; + return GET_SEC_FROM_SEG(sbi, reserved_segments(sbi)); } -static inline int reserved_sections(struct f2fs_sb_info *sbi) +static inline unsigned int get_left_section_blocks(struct f2fs_sb_info *sbi, + enum log_type type, unsigned int segno) { - return ((unsigned int) reserved_segments(sbi)) / sbi->segs_per_sec; + if (f2fs_lfs_mode(sbi)) { + unsigned int used_blocks = __is_large_section(sbi) ? 
SEGS_TO_BLKS(sbi, + (segno - GET_START_SEG_FROM_SEC(sbi, segno))) : 0; + return CAP_BLKS_PER_SEC(sbi) - used_blocks - + CURSEG_I(sbi, type)->next_blkoff; + } + return CAP_BLKS_PER_SEC(sbi) - get_ckpt_valid_blocks(sbi, segno, true); } -static inline bool need_SSR(struct f2fs_sb_info *sbi) +static inline bool has_curseg_enough_space(struct f2fs_sb_info *sbi, + unsigned int node_blocks, unsigned int data_blocks, + unsigned int dent_blocks) +{ + unsigned int segno, left_blocks, blocks; + int i; + + /* check current data/node sections in the worst case. */ + for (i = CURSEG_HOT_DATA; i < NR_PERSISTENT_LOG; i++) { + segno = CURSEG_I(sbi, i)->segno; + + if (unlikely(segno == NULL_SEGNO)) + return false; + + left_blocks = get_left_section_blocks(sbi, i, segno); + + blocks = i <= CURSEG_COLD_DATA ? data_blocks : node_blocks; + if (blocks > left_blocks) + return false; + } + + /* check current data section for dentry blocks. */ + segno = CURSEG_I(sbi, CURSEG_HOT_DATA)->segno; + + if (unlikely(segno == NULL_SEGNO)) + return false; + + left_blocks = get_left_section_blocks(sbi, CURSEG_HOT_DATA, segno); + + if (dent_blocks > left_blocks) + return false; + return true; +} + +/* + * calculate needed sections for dirty node/dentry and call + * has_curseg_enough_space, please note that, it needs to account + * dirty data as well in lfs mode when checkpoint is disabled. 
+ */ +static inline void __get_secs_required(struct f2fs_sb_info *sbi, + unsigned int *lower_p, unsigned int *upper_p, bool *curseg_p) { - return (free_sections(sbi) < overprovision_sections(sbi)); + unsigned int total_node_blocks = get_pages(sbi, F2FS_DIRTY_NODES) + + get_pages(sbi, F2FS_DIRTY_DENTS) + + get_pages(sbi, F2FS_DIRTY_IMETA); + unsigned int total_dent_blocks = get_pages(sbi, F2FS_DIRTY_DENTS); + unsigned int total_data_blocks = 0; + unsigned int node_secs = total_node_blocks / CAP_BLKS_PER_SEC(sbi); + unsigned int dent_secs = total_dent_blocks / CAP_BLKS_PER_SEC(sbi); + unsigned int data_secs = 0; + unsigned int node_blocks = total_node_blocks % CAP_BLKS_PER_SEC(sbi); + unsigned int dent_blocks = total_dent_blocks % CAP_BLKS_PER_SEC(sbi); + unsigned int data_blocks = 0; + + if (f2fs_lfs_mode(sbi)) { + total_data_blocks = get_pages(sbi, F2FS_DIRTY_DATA); + data_secs = total_data_blocks / CAP_BLKS_PER_SEC(sbi); + data_blocks = total_data_blocks % CAP_BLKS_PER_SEC(sbi); + } + + if (lower_p) + *lower_p = node_secs + dent_secs + data_secs; + if (upper_p) + *upper_p = node_secs + dent_secs + data_secs + + (node_blocks ? 1 : 0) + (dent_blocks ? 1 : 0) + + (data_blocks ? 
1 : 0); + if (curseg_p) + *curseg_p = has_curseg_enough_space(sbi, + node_blocks, data_blocks, dent_blocks); } -static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, int freed) +static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, + int freed, int needed) { - int node_secs = get_blocktype_secs(sbi, F2FS_DIRTY_NODES); - int dent_secs = get_blocktype_secs(sbi, F2FS_DIRTY_DENTS); + unsigned int free_secs, lower_secs, upper_secs; + bool curseg_space; - if (sbi->por_doing) + if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) return false; - return ((free_sections(sbi) + freed) <= (node_secs + 2 * dent_secs + - reserved_sections(sbi))); + __get_secs_required(sbi, &lower_secs, &upper_secs, &curseg_space); + + free_secs = free_sections(sbi) + freed; + lower_secs += needed + reserved_sections(sbi); + upper_secs += needed + reserved_sections(sbi); + + if (free_secs > upper_secs) + return false; + if (free_secs <= lower_secs) + return true; + return !curseg_space; +} + +static inline bool has_enough_free_secs(struct f2fs_sb_info *sbi, + int freed, int needed) +{ + return !has_not_enough_free_secs(sbi, freed, needed); +} + +static inline bool has_enough_free_blks(struct f2fs_sb_info *sbi) +{ + unsigned int total_free_blocks = 0; + unsigned int avail_user_block_count; + + spin_lock(&sbi->stat_lock); + + avail_user_block_count = get_available_block_count(sbi, NULL, true); + total_free_blocks = avail_user_block_count - (unsigned int)valid_user_blocks(sbi); + + spin_unlock(&sbi->stat_lock); + + return total_free_blocks > 0; +} + +static inline bool f2fs_is_checkpoint_ready(struct f2fs_sb_info *sbi) +{ + if (likely(!is_sbi_flag_set(sbi, SBI_CP_DISABLED))) + return true; + if (likely(has_enough_free_secs(sbi, 0, 0))) + return true; + if (!f2fs_lfs_mode(sbi) && + likely(has_enough_free_blks(sbi))) + return true; + return false; +} + +static inline bool excess_prefree_segs(struct f2fs_sb_info *sbi) +{ + return prefree_segments(sbi) > 
SM_I(sbi)->rec_prefree_segments; } static inline int utilization(struct f2fs_sb_info *sbi) { - return div_u64(valid_user_blocks(sbi) * 100, sbi->user_block_count); + return div_u64((u64)valid_user_blocks(sbi) * 100, + sbi->user_block_count); } /* * Sometimes f2fs may be better to drop out-of-place update policy. - * So, if fs utilization is over MIN_IPU_UTIL, then f2fs tries to write - * data in the original place likewise other traditional file systems. - * But, currently set 100 in percentage, which means it is disabled. - * See below need_inplace_update(). + * And, users can control the policy through sysfs entries. + * There are five policies with triggering conditions as follows. + * F2FS_IPU_FORCE - all the time, + * F2FS_IPU_SSR - if SSR mode is activated, + * F2FS_IPU_UTIL - if FS utilization is over threashold, + * F2FS_IPU_SSR_UTIL - if SSR mode is activated and FS utilization is over + * threashold, + * F2FS_IPU_FSYNC - activated in fsync path only for high performance flash + * storages. IPU will be triggered only if the # of dirty + * pages over min_fsync_blocks. (=default option) + * F2FS_IPU_ASYNC - do IPU given by asynchronous write requests. + * F2FS_IPU_NOCACHE - disable IPU bio cache. + * F2FS_IPU_HONOR_OPU_WRITE - use OPU write prior to IPU write if inode has + * FI_OPU_WRITE flag. + * F2FS_IPU_DISABLE - disable IPU. 
(=default option in LFS mode) */ -#define MIN_IPU_UTIL 100 -static inline bool need_inplace_update(struct inode *inode) +#define DEF_MIN_IPU_UTIL 70 +#define DEF_MIN_FSYNC_BLOCKS 8 +#define DEF_MIN_HOT_BLOCKS 16 + +#define SMALL_VOLUME_SEGMENTS (16 * 512) /* 16GB */ + +#define F2FS_IPU_DISABLE 0 + +/* Modification on enum should be synchronized with ipu_mode_names array */ +enum { + F2FS_IPU_FORCE, + F2FS_IPU_SSR, + F2FS_IPU_UTIL, + F2FS_IPU_SSR_UTIL, + F2FS_IPU_FSYNC, + F2FS_IPU_ASYNC, + F2FS_IPU_NOCACHE, + F2FS_IPU_HONOR_OPU_WRITE, + F2FS_IPU_MAX, +}; + +static inline bool IS_F2FS_IPU_DISABLE(struct f2fs_sb_info *sbi) { - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); - if (S_ISDIR(inode->i_mode)) - return false; - if (need_SSR(sbi) && utilization(sbi) > MIN_IPU_UTIL) - return true; - return false; + return SM_I(sbi)->ipu_policy == F2FS_IPU_DISABLE; } +#define F2FS_IPU_POLICY(name) \ +static inline bool IS_##name(struct f2fs_sb_info *sbi) \ +{ \ + return SM_I(sbi)->ipu_policy & BIT(name); \ +} + +F2FS_IPU_POLICY(F2FS_IPU_FORCE); +F2FS_IPU_POLICY(F2FS_IPU_SSR); +F2FS_IPU_POLICY(F2FS_IPU_UTIL); +F2FS_IPU_POLICY(F2FS_IPU_SSR_UTIL); +F2FS_IPU_POLICY(F2FS_IPU_FSYNC); +F2FS_IPU_POLICY(F2FS_IPU_ASYNC); +F2FS_IPU_POLICY(F2FS_IPU_NOCACHE); +F2FS_IPU_POLICY(F2FS_IPU_HONOR_OPU_WRITE); + static inline unsigned int curseg_segno(struct f2fs_sb_info *sbi, int type) { @@ -505,64 +831,88 @@ static inline unsigned char curseg_alloc_type(struct f2fs_sb_info *sbi, return curseg->alloc_type; } -static inline unsigned short curseg_blkoff(struct f2fs_sb_info *sbi, int type) +static inline bool valid_main_segno(struct f2fs_sb_info *sbi, + unsigned int segno) { - struct curseg_info *curseg = CURSEG_I(sbi, type); - return curseg->next_blkoff; + return segno <= (MAIN_SEGS(sbi) - 1); } -static inline void check_seg_range(struct f2fs_sb_info *sbi, unsigned int segno) +static inline void verify_fio_blkaddr(struct f2fs_io_info *fio) { - unsigned int end_segno = SM_I(sbi)->segment_count - 1; 
- BUG_ON(segno > end_segno); -} + struct f2fs_sb_info *sbi = fio->sbi; -/* - * This function is used for only debugging. - * NOTE: In future, we have to remove this function. - */ -static inline void verify_block_addr(struct f2fs_sb_info *sbi, block_t blk_addr) -{ - struct f2fs_sm_info *sm_info = SM_I(sbi); - block_t total_blks = sm_info->segment_count << sbi->log_blocks_per_seg; - block_t start_addr = sm_info->seg0_blkaddr; - block_t end_addr = start_addr + total_blks - 1; - BUG_ON(blk_addr < start_addr); - BUG_ON(blk_addr > end_addr); + if (__is_valid_data_blkaddr(fio->old_blkaddr)) + verify_blkaddr(sbi, fio->old_blkaddr, __is_meta_io(fio) ? + META_GENERIC : DATA_GENERIC); + verify_blkaddr(sbi, fio->new_blkaddr, __is_meta_io(fio) ? + META_GENERIC : DATA_GENERIC_ENHANCE); } /* - * Summary block is always treated as invalid block + * Summary block is always treated as an invalid block */ -static inline void check_block_count(struct f2fs_sb_info *sbi, +static inline int check_block_count(struct f2fs_sb_info *sbi, int segno, struct f2fs_sit_entry *raw_sit) { - struct f2fs_sm_info *sm_info = SM_I(sbi); - unsigned int end_segno = sm_info->segment_count - 1; + bool is_valid = test_bit_le(0, raw_sit->valid_map) ? 
true : false; int valid_blocks = 0; - int i; - - /* check segment usage */ - BUG_ON(GET_SIT_VBLOCKS(raw_sit) > sbi->blocks_per_seg); - - /* check boundary of a given segment number */ - BUG_ON(segno > end_segno); + int cur_pos = 0, next_pos; + unsigned int usable_blks_per_seg = f2fs_usable_blks_in_seg(sbi, segno); /* check bitmap with valid block count */ - for (i = 0; i < sbi->blocks_per_seg; i++) - if (f2fs_test_bit(i, raw_sit->valid_map)) - valid_blocks++; - BUG_ON(GET_SIT_VBLOCKS(raw_sit) != valid_blocks); + do { + if (is_valid) { + next_pos = find_next_zero_bit_le(&raw_sit->valid_map, + usable_blks_per_seg, + cur_pos); + valid_blocks += next_pos - cur_pos; + } else + next_pos = find_next_bit_le(&raw_sit->valid_map, + usable_blks_per_seg, + cur_pos); + cur_pos = next_pos; + is_valid = !is_valid; + } while (cur_pos < usable_blks_per_seg); + + if (unlikely(GET_SIT_VBLOCKS(raw_sit) != valid_blocks)) { + f2fs_err(sbi, "Mismatch valid blocks %d vs. %d", + GET_SIT_VBLOCKS(raw_sit), valid_blocks); + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_handle_error(sbi, ERROR_INCONSISTENT_SIT); + return -EFSCORRUPTED; + } + + if (usable_blks_per_seg < BLKS_PER_SEG(sbi)) + f2fs_bug_on(sbi, find_next_bit_le(&raw_sit->valid_map, + BLKS_PER_SEG(sbi), + usable_blks_per_seg) != BLKS_PER_SEG(sbi)); + + /* check segment usage, and check boundary of a given segment number */ + if (unlikely(GET_SIT_VBLOCKS(raw_sit) > usable_blks_per_seg + || !valid_main_segno(sbi, segno))) { + f2fs_err(sbi, "Wrong valid blocks %d or segno %u", + GET_SIT_VBLOCKS(raw_sit), segno); + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_handle_error(sbi, ERROR_INCONSISTENT_SIT); + return -EFSCORRUPTED; + } + return 0; } static inline pgoff_t current_sit_addr(struct f2fs_sb_info *sbi, unsigned int start) { struct sit_info *sit_i = SIT_I(sbi); - unsigned int offset = SIT_BLOCK_OFFSET(sit_i, start); + unsigned int offset = SIT_BLOCK_OFFSET(start); block_t blk_addr = sit_i->sit_base_addr + offset; - check_seg_range(sbi, 
start); + f2fs_bug_on(sbi, !valid_main_segno(sbi, start)); + +#ifdef CONFIG_F2FS_CHECK_FS + if (f2fs_test_bit(offset, sit_i->sit_bitmap) != + f2fs_test_bit(offset, sit_i->sit_bitmap_mir)) + f2fs_bug_on(sbi, 1); +#endif /* calculate sit block address */ if (f2fs_test_bit(offset, sit_i->sit_bitmap)) @@ -586,19 +936,31 @@ static inline pgoff_t next_sit_addr(struct f2fs_sb_info *sbi, static inline void set_to_next_sit(struct sit_info *sit_i, unsigned int start) { - unsigned int block_off = SIT_BLOCK_OFFSET(sit_i, start); + unsigned int block_off = SIT_BLOCK_OFFSET(start); - if (f2fs_test_bit(block_off, sit_i->sit_bitmap)) - f2fs_clear_bit(block_off, sit_i->sit_bitmap); - else - f2fs_set_bit(block_off, sit_i->sit_bitmap); + f2fs_change_bit(block_off, sit_i->sit_bitmap); +#ifdef CONFIG_F2FS_CHECK_FS + f2fs_change_bit(block_off, sit_i->sit_bitmap_mir); +#endif } -static inline unsigned long long get_mtime(struct f2fs_sb_info *sbi) +static inline unsigned long long get_mtime(struct f2fs_sb_info *sbi, + bool base_time) { struct sit_info *sit_i = SIT_I(sbi); - return sit_i->elapsed_time + CURRENT_TIME_SEC.tv_sec - - sit_i->mounted_time; + time64_t diff, now = ktime_get_boottime_seconds(); + + if (now >= sit_i->mounted_time) + return sit_i->elapsed_time + now - sit_i->mounted_time; + + /* system time is set to the past */ + if (!base_time) { + diff = sit_i->mounted_time - now; + if (sit_i->elapsed_time >= diff) + return sit_i->elapsed_time - diff; + return 0; + } + return sit_i->elapsed_time; } static inline void set_summary(struct f2fs_summary *sum, nid_t nid, @@ -624,14 +986,75 @@ static inline block_t sum_blk_addr(struct f2fs_sb_info *sbi, int base, int type) static inline bool sec_usage_check(struct f2fs_sb_info *sbi, unsigned int secno) { - if (IS_CURSEC(sbi, secno) || (sbi->cur_victim_sec == secno)) + if (is_cursec(sbi, secno) || (sbi->cur_victim_sec == secno)) return true; return false; } -static inline unsigned int max_hw_blocks(struct f2fs_sb_info *sbi) +/* + * It is 
very important to gather dirty pages and write at once, so that we can + * submit a big bio without interfering other data writes. + * By default, 512 pages for directory data, + * 512 pages (2MB) * 8 for nodes, and + * 256 pages * 8 for meta are set. + */ +static inline int nr_pages_to_skip(struct f2fs_sb_info *sbi, int type) +{ + if (sbi->sb->s_bdi->wb.dirty_exceeded) + return 0; + + if (type == DATA) + return BLKS_PER_SEG(sbi); + else if (type == NODE) + return SEGS_TO_BLKS(sbi, 8); + else if (type == META) + return 8 * BIO_MAX_VECS; + else + return 0; +} + +/* + * When writing pages, it'd better align nr_to_write for segment size. + */ +static inline long nr_pages_to_write(struct f2fs_sb_info *sbi, int type, + struct writeback_control *wbc) +{ + long nr_to_write, desired; + + if (wbc->sync_mode != WB_SYNC_NONE) + return 0; + + nr_to_write = wbc->nr_to_write; + desired = BIO_MAX_VECS; + if (type == NODE) + desired <<= 1; + + wbc->nr_to_write = desired; + return desired - nr_to_write; +} + +static inline void wake_up_discard_thread(struct f2fs_sb_info *sbi, bool force) { - struct block_device *bdev = sbi->sb->s_bdev; - struct request_queue *q = bdev_get_queue(bdev); - return SECTOR_TO_BLOCK(sbi, queue_max_sectors(q)); + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + bool wakeup = false; + int i; + + if (force) + goto wake_up; + + mutex_lock(&dcc->cmd_lock); + for (i = MAX_PLIST_NUM - 1; i >= 0; i--) { + if (i + 1 < dcc->discard_granularity) + break; + if (!list_empty(&dcc->pend_list[i])) { + wakeup = true; + break; + } + } + mutex_unlock(&dcc->cmd_lock); + if (!wakeup || !is_idle(sbi, DISCARD_TIME)) + return; +wake_up: + dcc->discard_wake = true; + wake_up_interruptible_all(&dcc->discard_wait_queue); } diff --git a/fs/f2fs/shrinker.c b/fs/f2fs/shrinker.c new file mode 100644 index 000000000000..b88babcf6ab4 --- /dev/null +++ b/fs/f2fs/shrinker.c @@ -0,0 +1,246 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * f2fs shrinker support + * the basic infra was 
copied from fs/ubifs/shrinker.c + * + * Copyright (c) 2015 Motorola Mobility + * Copyright (c) 2015 Jaegeuk Kim <jaegeuk@kernel.org> + */ +#include <linux/fs.h> +#include <linux/f2fs_fs.h> + +#include "f2fs.h" +#include "node.h" + +static LIST_HEAD(f2fs_list); +static DEFINE_SPINLOCK(f2fs_list_lock); +static unsigned int shrinker_run_no; + +static unsigned long __count_nat_entries(struct f2fs_sb_info *sbi) +{ + return NM_I(sbi)->nat_cnt[RECLAIMABLE_NAT]; +} + +static unsigned long __count_free_nids(struct f2fs_sb_info *sbi) +{ + long count = NM_I(sbi)->nid_cnt[FREE_NID] - MAX_FREE_NIDS; + + return count > 0 ? count : 0; +} + +static unsigned long __count_extent_cache(struct f2fs_sb_info *sbi, + enum extent_type type) +{ + struct extent_tree_info *eti = &sbi->extent_tree[type]; + + return atomic_read(&eti->total_zombie_tree) + + atomic_read(&eti->total_ext_node); +} + +unsigned long f2fs_shrink_count(struct shrinker *shrink, + struct shrink_control *sc) +{ + struct f2fs_sb_info *sbi; + struct list_head *p; + unsigned long count = 0; + + spin_lock(&f2fs_list_lock); + p = f2fs_list.next; + while (p != &f2fs_list) { + sbi = list_entry(p, struct f2fs_sb_info, s_list); + + /* stop f2fs_put_super */ + if (!mutex_trylock(&sbi->umount_mutex)) { + p = p->next; + continue; + } + spin_unlock(&f2fs_list_lock); + + /* count read extent cache entries */ + count += __count_extent_cache(sbi, EX_READ); + + /* count block age extent cache entries */ + count += __count_extent_cache(sbi, EX_BLOCK_AGE); + + /* count clean nat cache entries */ + count += __count_nat_entries(sbi); + + /* count free nids cache entries */ + count += __count_free_nids(sbi); + + spin_lock(&f2fs_list_lock); + p = p->next; + mutex_unlock(&sbi->umount_mutex); + } + spin_unlock(&f2fs_list_lock); + return count ?: SHRINK_EMPTY; +} + +unsigned long f2fs_shrink_scan(struct shrinker *shrink, + struct shrink_control *sc) +{ + unsigned long nr = sc->nr_to_scan; + struct f2fs_sb_info *sbi; + struct list_head *p; + 
unsigned int run_no; + unsigned long freed = 0; + + spin_lock(&f2fs_list_lock); + do { + run_no = ++shrinker_run_no; + } while (run_no == 0); + p = f2fs_list.next; + while (p != &f2fs_list) { + sbi = list_entry(p, struct f2fs_sb_info, s_list); + + if (sbi->shrinker_run_no == run_no) + break; + + /* stop f2fs_put_super */ + if (!mutex_trylock(&sbi->umount_mutex)) { + p = p->next; + continue; + } + spin_unlock(&f2fs_list_lock); + + sbi->shrinker_run_no = run_no; + + /* shrink extent cache entries */ + freed += f2fs_shrink_age_extent_tree(sbi, nr >> 2); + + /* shrink read extent cache entries */ + freed += f2fs_shrink_read_extent_tree(sbi, nr >> 2); + + /* shrink clean nat cache entries */ + if (freed < nr) + freed += f2fs_try_to_free_nats(sbi, nr - freed); + + /* shrink free nids cache entries */ + if (freed < nr) + freed += f2fs_try_to_free_nids(sbi, nr - freed); + + spin_lock(&f2fs_list_lock); + p = p->next; + list_move_tail(&sbi->s_list, &f2fs_list); + mutex_unlock(&sbi->umount_mutex); + if (freed >= nr) + break; + } + spin_unlock(&f2fs_list_lock); + return freed; +} + +unsigned int f2fs_donate_files(void) +{ + struct f2fs_sb_info *sbi; + struct list_head *p; + unsigned int donate_files = 0; + + spin_lock(&f2fs_list_lock); + p = f2fs_list.next; + while (p != &f2fs_list) { + sbi = list_entry(p, struct f2fs_sb_info, s_list); + + /* stop f2fs_put_super */ + if (!mutex_trylock(&sbi->umount_mutex)) { + p = p->next; + continue; + } + spin_unlock(&f2fs_list_lock); + + donate_files += sbi->donate_files; + + spin_lock(&f2fs_list_lock); + p = p->next; + mutex_unlock(&sbi->umount_mutex); + } + spin_unlock(&f2fs_list_lock); + + return donate_files; +} + +static unsigned int do_reclaim_caches(struct f2fs_sb_info *sbi, + unsigned int reclaim_caches_kb) +{ + struct inode *inode; + struct f2fs_inode_info *fi; + unsigned int nfiles = sbi->donate_files; + pgoff_t npages = reclaim_caches_kb >> (PAGE_SHIFT - 10); + + while (npages && nfiles--) { + pgoff_t len; + + 
spin_lock(&sbi->inode_lock[DONATE_INODE]); + if (list_empty(&sbi->inode_list[DONATE_INODE])) { + spin_unlock(&sbi->inode_lock[DONATE_INODE]); + break; + } + fi = list_first_entry(&sbi->inode_list[DONATE_INODE], + struct f2fs_inode_info, gdonate_list); + list_move_tail(&fi->gdonate_list, &sbi->inode_list[DONATE_INODE]); + inode = igrab(&fi->vfs_inode); + spin_unlock(&sbi->inode_lock[DONATE_INODE]); + + if (!inode) + continue; + + inode_lock(inode); + if (!is_inode_flag_set(inode, FI_DONATE_FINISHED)) { + len = fi->donate_end - fi->donate_start + 1; + npages = npages < len ? 0 : npages - len; + + invalidate_inode_pages2_range(inode->i_mapping, + fi->donate_start, fi->donate_end); + set_inode_flag(inode, FI_DONATE_FINISHED); + } + inode_unlock(inode); + + iput(inode); + cond_resched(); + } + return npages << (PAGE_SHIFT - 10); +} + +void f2fs_reclaim_caches(unsigned int reclaim_caches_kb) +{ + struct f2fs_sb_info *sbi; + struct list_head *p; + + spin_lock(&f2fs_list_lock); + p = f2fs_list.next; + while (p != &f2fs_list && reclaim_caches_kb) { + sbi = list_entry(p, struct f2fs_sb_info, s_list); + + /* stop f2fs_put_super */ + if (!mutex_trylock(&sbi->umount_mutex)) { + p = p->next; + continue; + } + spin_unlock(&f2fs_list_lock); + + reclaim_caches_kb = do_reclaim_caches(sbi, reclaim_caches_kb); + + spin_lock(&f2fs_list_lock); + p = p->next; + mutex_unlock(&sbi->umount_mutex); + } + spin_unlock(&f2fs_list_lock); +} + +void f2fs_join_shrinker(struct f2fs_sb_info *sbi) +{ + spin_lock(&f2fs_list_lock); + list_add_tail(&sbi->s_list, &f2fs_list); + spin_unlock(&f2fs_list_lock); +} + +void f2fs_leave_shrinker(struct f2fs_sb_info *sbi) +{ + f2fs_shrink_read_extent_tree(sbi, __count_extent_cache(sbi, EX_READ)); + f2fs_shrink_age_extent_tree(sbi, + __count_extent_cache(sbi, EX_BLOCK_AGE)); + + spin_lock(&f2fs_list_lock); + list_del_init(&sbi->s_list); + spin_unlock(&f2fs_list_lock); +} diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 75c7dc363e92..c4c225e09dc4 100644 --- 
a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1,163 +1,1744 @@ +// SPDX-License-Identifier: GPL-2.0 /* * fs/f2fs/super.c * * Copyright (c) 2012 Samsung Electronics Co., Ltd. * http://www.samsung.com/ - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. */ #include <linux/module.h> #include <linux/init.h> #include <linux/fs.h> +#include <linux/fs_context.h> +#include <linux/sched/mm.h> #include <linux/statfs.h> -#include <linux/buffer_head.h> -#include <linux/backing-dev.h> #include <linux/kthread.h> #include <linux/parser.h> #include <linux/mount.h> #include <linux/seq_file.h> +#include <linux/proc_fs.h> #include <linux/random.h> #include <linux/exportfs.h> #include <linux/blkdev.h> +#include <linux/quotaops.h> #include <linux/f2fs_fs.h> +#include <linux/sysfs.h> +#include <linux/quota.h> +#include <linux/unicode.h> +#include <linux/part_stat.h> +#include <linux/zstd.h> +#include <linux/lz4.h> +#include <linux/ctype.h> +#include <linux/fs_parser.h> #include "f2fs.h" #include "node.h" #include "segment.h" #include "xattr.h" +#include "gc.h" +#include "iostat.h" #define CREATE_TRACE_POINTS #include <trace/events/f2fs.h> static struct kmem_cache *f2fs_inode_cachep; +#ifdef CONFIG_F2FS_FAULT_INJECTION + +const char *f2fs_fault_name[FAULT_MAX] = { + [FAULT_KMALLOC] = "kmalloc", + [FAULT_KVMALLOC] = "kvmalloc", + [FAULT_PAGE_ALLOC] = "page alloc", + [FAULT_PAGE_GET] = "page get", + [FAULT_ALLOC_BIO] = "alloc bio(obsolete)", + [FAULT_ALLOC_NID] = "alloc nid", + [FAULT_ORPHAN] = "orphan", + [FAULT_BLOCK] = "no more block", + [FAULT_DIR_DEPTH] = "too big dir depth", + [FAULT_EVICT_INODE] = "evict_inode fail", + [FAULT_TRUNCATE] = "truncate fail", + [FAULT_READ_IO] = "read IO error", + [FAULT_CHECKPOINT] = "checkpoint error", + [FAULT_DISCARD] = "discard error", + [FAULT_WRITE_IO] = "write IO error", + [FAULT_SLAB_ALLOC] = "slab alloc", + 
[FAULT_DQUOT_INIT] = "dquot initialize", + [FAULT_LOCK_OP] = "lock_op", + [FAULT_BLKADDR_VALIDITY] = "invalid blkaddr", + [FAULT_BLKADDR_CONSISTENCE] = "inconsistent blkaddr", + [FAULT_NO_SEGMENT] = "no free segment", + [FAULT_INCONSISTENT_FOOTER] = "inconsistent footer", + [FAULT_TIMEOUT] = "timeout", + [FAULT_VMALLOC] = "vmalloc", +}; + +int f2fs_build_fault_attr(struct f2fs_sb_info *sbi, unsigned long rate, + unsigned long type, enum fault_option fo) +{ + struct f2fs_fault_info *ffi = &F2FS_OPTION(sbi).fault_info; + + if (fo & FAULT_ALL) { + memset(ffi, 0, sizeof(struct f2fs_fault_info)); + return 0; + } + + if (fo & FAULT_RATE) { + if (rate > INT_MAX) + return -EINVAL; + atomic_set(&ffi->inject_ops, 0); + ffi->inject_rate = (int)rate; + f2fs_info(sbi, "build fault injection rate: %lu", rate); + } + + if (fo & FAULT_TYPE) { + if (type >= BIT(FAULT_MAX)) + return -EINVAL; + ffi->inject_type = (unsigned int)type; + f2fs_info(sbi, "build fault injection type: 0x%lx", type); + } + + return 0; +} +#endif + +/* f2fs-wide shrinker description */ +static struct shrinker *f2fs_shrinker_info; + +static int __init f2fs_init_shrinker(void) +{ + f2fs_shrinker_info = shrinker_alloc(0, "f2fs-shrinker"); + if (!f2fs_shrinker_info) + return -ENOMEM; + + f2fs_shrinker_info->count_objects = f2fs_shrink_count; + f2fs_shrinker_info->scan_objects = f2fs_shrink_scan; + + shrinker_register(f2fs_shrinker_info); + + return 0; +} + +static void f2fs_exit_shrinker(void) +{ + shrinker_free(f2fs_shrinker_info); +} + enum { Opt_gc_background, Opt_disable_roll_forward, + Opt_norecovery, Opt_discard, Opt_noheap, - Opt_nouser_xattr, - Opt_noacl, + Opt_heap, + Opt_user_xattr, + Opt_acl, Opt_active_logs, Opt_disable_ext_identify, + Opt_inline_xattr, + Opt_inline_xattr_size, + Opt_inline_data, + Opt_inline_dentry, + Opt_flush_merge, + Opt_barrier, + Opt_fastboot, + Opt_extent_cache, + Opt_data_flush, + Opt_reserve_root, + Opt_reserve_node, + Opt_resgid, + Opt_resuid, + Opt_mode, + 
Opt_fault_injection, + Opt_fault_type, + Opt_lazytime, + Opt_quota, + Opt_usrquota, + Opt_grpquota, + Opt_prjquota, + Opt_usrjquota, + Opt_grpjquota, + Opt_prjjquota, + Opt_alloc, + Opt_fsync, + Opt_test_dummy_encryption, + Opt_inlinecrypt, + Opt_checkpoint_disable, + Opt_checkpoint_disable_cap, + Opt_checkpoint_disable_cap_perc, + Opt_checkpoint_enable, + Opt_checkpoint_merge, + Opt_compress_algorithm, + Opt_compress_log_size, + Opt_nocompress_extension, + Opt_compress_extension, + Opt_compress_chksum, + Opt_compress_mode, + Opt_compress_cache, + Opt_atgc, + Opt_gc_merge, + Opt_discard_unit, + Opt_memory_mode, + Opt_age_extent_cache, + Opt_errors, + Opt_nat_bits, + Opt_jqfmt, + Opt_checkpoint, + Opt_lookup_mode, Opt_err, }; -static match_table_t f2fs_tokens = { - {Opt_gc_background, "background_gc=%s"}, - {Opt_disable_roll_forward, "disable_roll_forward"}, - {Opt_discard, "discard"}, - {Opt_noheap, "no_heap"}, - {Opt_nouser_xattr, "nouser_xattr"}, - {Opt_noacl, "noacl"}, - {Opt_active_logs, "active_logs=%u"}, - {Opt_disable_ext_identify, "disable_ext_identify"}, +static const struct constant_table f2fs_param_background_gc[] = { + {"on", BGGC_MODE_ON}, + {"off", BGGC_MODE_OFF}, + {"sync", BGGC_MODE_SYNC}, + {} +}; + +static const struct constant_table f2fs_param_mode[] = { + {"adaptive", FS_MODE_ADAPTIVE}, + {"lfs", FS_MODE_LFS}, + {"fragment:segment", FS_MODE_FRAGMENT_SEG}, + {"fragment:block", FS_MODE_FRAGMENT_BLK}, + {} +}; + +static const struct constant_table f2fs_param_jqfmt[] = { + {"vfsold", QFMT_VFS_OLD}, + {"vfsv0", QFMT_VFS_V0}, + {"vfsv1", QFMT_VFS_V1}, + {} +}; + +static const struct constant_table f2fs_param_alloc_mode[] = { + {"default", ALLOC_MODE_DEFAULT}, + {"reuse", ALLOC_MODE_REUSE}, + {} +}; +static const struct constant_table f2fs_param_fsync_mode[] = { + {"posix", FSYNC_MODE_POSIX}, + {"strict", FSYNC_MODE_STRICT}, + {"nobarrier", FSYNC_MODE_NOBARRIER}, + {} +}; + +static const struct constant_table f2fs_param_compress_mode[] = { + {"fs", 
COMPR_MODE_FS}, + {"user", COMPR_MODE_USER}, + {} +}; + +static const struct constant_table f2fs_param_discard_unit[] = { + {"block", DISCARD_UNIT_BLOCK}, + {"segment", DISCARD_UNIT_SEGMENT}, + {"section", DISCARD_UNIT_SECTION}, + {} +}; + +static const struct constant_table f2fs_param_memory_mode[] = { + {"normal", MEMORY_MODE_NORMAL}, + {"low", MEMORY_MODE_LOW}, + {} +}; + +static const struct constant_table f2fs_param_errors[] = { + {"remount-ro", MOUNT_ERRORS_READONLY}, + {"continue", MOUNT_ERRORS_CONTINUE}, + {"panic", MOUNT_ERRORS_PANIC}, + {} +}; + +static const struct constant_table f2fs_param_lookup_mode[] = { + {"perf", LOOKUP_PERF}, + {"compat", LOOKUP_COMPAT}, + {"auto", LOOKUP_AUTO}, + {} +}; + +static const struct fs_parameter_spec f2fs_param_specs[] = { + fsparam_enum("background_gc", Opt_gc_background, f2fs_param_background_gc), + fsparam_flag("disable_roll_forward", Opt_disable_roll_forward), + fsparam_flag("norecovery", Opt_norecovery), + fsparam_flag_no("discard", Opt_discard), + fsparam_flag("no_heap", Opt_noheap), + fsparam_flag("heap", Opt_heap), + fsparam_flag_no("user_xattr", Opt_user_xattr), + fsparam_flag_no("acl", Opt_acl), + fsparam_s32("active_logs", Opt_active_logs), + fsparam_flag("disable_ext_identify", Opt_disable_ext_identify), + fsparam_flag_no("inline_xattr", Opt_inline_xattr), + fsparam_s32("inline_xattr_size", Opt_inline_xattr_size), + fsparam_flag_no("inline_data", Opt_inline_data), + fsparam_flag_no("inline_dentry", Opt_inline_dentry), + fsparam_flag_no("flush_merge", Opt_flush_merge), + fsparam_flag_no("barrier", Opt_barrier), + fsparam_flag("fastboot", Opt_fastboot), + fsparam_flag_no("extent_cache", Opt_extent_cache), + fsparam_flag("data_flush", Opt_data_flush), + fsparam_u32("reserve_root", Opt_reserve_root), + fsparam_u32("reserve_node", Opt_reserve_node), + fsparam_gid("resgid", Opt_resgid), + fsparam_uid("resuid", Opt_resuid), + fsparam_enum("mode", Opt_mode, f2fs_param_mode), + fsparam_s32("fault_injection", 
Opt_fault_injection), + fsparam_u32("fault_type", Opt_fault_type), + fsparam_flag_no("lazytime", Opt_lazytime), + fsparam_flag_no("quota", Opt_quota), + fsparam_flag("usrquota", Opt_usrquota), + fsparam_flag("grpquota", Opt_grpquota), + fsparam_flag("prjquota", Opt_prjquota), + fsparam_string_empty("usrjquota", Opt_usrjquota), + fsparam_string_empty("grpjquota", Opt_grpjquota), + fsparam_string_empty("prjjquota", Opt_prjjquota), + fsparam_flag("nat_bits", Opt_nat_bits), + fsparam_enum("jqfmt", Opt_jqfmt, f2fs_param_jqfmt), + fsparam_enum("alloc_mode", Opt_alloc, f2fs_param_alloc_mode), + fsparam_enum("fsync_mode", Opt_fsync, f2fs_param_fsync_mode), + fsparam_string("test_dummy_encryption", Opt_test_dummy_encryption), + fsparam_flag("test_dummy_encryption", Opt_test_dummy_encryption), + fsparam_flag("inlinecrypt", Opt_inlinecrypt), + fsparam_string("checkpoint", Opt_checkpoint), + fsparam_flag_no("checkpoint_merge", Opt_checkpoint_merge), + fsparam_string("compress_algorithm", Opt_compress_algorithm), + fsparam_u32("compress_log_size", Opt_compress_log_size), + fsparam_string("compress_extension", Opt_compress_extension), + fsparam_string("nocompress_extension", Opt_nocompress_extension), + fsparam_flag("compress_chksum", Opt_compress_chksum), + fsparam_enum("compress_mode", Opt_compress_mode, f2fs_param_compress_mode), + fsparam_flag("compress_cache", Opt_compress_cache), + fsparam_flag("atgc", Opt_atgc), + fsparam_flag_no("gc_merge", Opt_gc_merge), + fsparam_enum("discard_unit", Opt_discard_unit, f2fs_param_discard_unit), + fsparam_enum("memory", Opt_memory_mode, f2fs_param_memory_mode), + fsparam_flag("age_extent_cache", Opt_age_extent_cache), + fsparam_enum("errors", Opt_errors, f2fs_param_errors), + fsparam_enum("lookup_mode", Opt_lookup_mode, f2fs_param_lookup_mode), + {} +}; + +/* Resort to a match_table for this interestingly formatted option */ +static match_table_t f2fs_checkpoint_tokens = { + {Opt_checkpoint_disable, "disable"}, + 
{Opt_checkpoint_disable_cap, "disable:%u"}, + {Opt_checkpoint_disable_cap_perc, "disable:%u%%"}, + {Opt_checkpoint_enable, "enable"}, {Opt_err, NULL}, }; -void f2fs_msg(struct super_block *sb, const char *level, const char *fmt, ...) +#define F2FS_SPEC_background_gc (1 << 0) +#define F2FS_SPEC_inline_xattr_size (1 << 1) +#define F2FS_SPEC_active_logs (1 << 2) +#define F2FS_SPEC_reserve_root (1 << 3) +#define F2FS_SPEC_resgid (1 << 4) +#define F2FS_SPEC_resuid (1 << 5) +#define F2FS_SPEC_mode (1 << 6) +#define F2FS_SPEC_fault_injection (1 << 7) +#define F2FS_SPEC_fault_type (1 << 8) +#define F2FS_SPEC_jqfmt (1 << 9) +#define F2FS_SPEC_alloc_mode (1 << 10) +#define F2FS_SPEC_fsync_mode (1 << 11) +#define F2FS_SPEC_checkpoint_disable_cap (1 << 12) +#define F2FS_SPEC_checkpoint_disable_cap_perc (1 << 13) +#define F2FS_SPEC_compress_level (1 << 14) +#define F2FS_SPEC_compress_algorithm (1 << 15) +#define F2FS_SPEC_compress_log_size (1 << 16) +#define F2FS_SPEC_compress_extension (1 << 17) +#define F2FS_SPEC_nocompress_extension (1 << 18) +#define F2FS_SPEC_compress_chksum (1 << 19) +#define F2FS_SPEC_compress_mode (1 << 20) +#define F2FS_SPEC_discard_unit (1 << 21) +#define F2FS_SPEC_memory_mode (1 << 22) +#define F2FS_SPEC_errors (1 << 23) +#define F2FS_SPEC_lookup_mode (1 << 24) +#define F2FS_SPEC_reserve_node (1 << 25) + +struct f2fs_fs_context { + struct f2fs_mount_info info; + unsigned long long opt_mask; /* Bits changed */ + unsigned int spec_mask; + unsigned short qname_mask; +}; + +#define F2FS_CTX_INFO(ctx) ((ctx)->info) + +static inline void ctx_set_opt(struct f2fs_fs_context *ctx, + enum f2fs_mount_opt flag) +{ + ctx->info.opt |= BIT(flag); + ctx->opt_mask |= BIT(flag); +} + +static inline void ctx_clear_opt(struct f2fs_fs_context *ctx, + enum f2fs_mount_opt flag) +{ + ctx->info.opt &= ~BIT(flag); + ctx->opt_mask |= BIT(flag); +} + +static inline bool ctx_test_opt(struct f2fs_fs_context *ctx, + enum f2fs_mount_opt flag) +{ + return ctx->info.opt & BIT(flag); 
+} + +void f2fs_printk(struct f2fs_sb_info *sbi, bool limit_rate, + const char *fmt, ...) { struct va_format vaf; va_list args; + int level; va_start(args, fmt); - vaf.fmt = fmt; + + level = printk_get_level(fmt); + vaf.fmt = printk_skip_level(fmt); vaf.va = &args; - printk("%sF2FS-fs (%s): %pV\n", level, sb->s_id, &vaf); + if (limit_rate) + if (sbi) + printk_ratelimited("%c%cF2FS-fs (%s): %pV\n", + KERN_SOH_ASCII, level, sbi->sb->s_id, &vaf); + else + printk_ratelimited("%c%cF2FS-fs: %pV\n", + KERN_SOH_ASCII, level, &vaf); + else + if (sbi) + printk("%c%cF2FS-fs (%s): %pV\n", + KERN_SOH_ASCII, level, sbi->sb->s_id, &vaf); + else + printk("%c%cF2FS-fs: %pV\n", + KERN_SOH_ASCII, level, &vaf); + va_end(args); } +#if IS_ENABLED(CONFIG_UNICODE) +static const struct f2fs_sb_encodings { + __u16 magic; + char *name; + unsigned int version; +} f2fs_sb_encoding_map[] = { + {F2FS_ENC_UTF8_12_1, "utf8", UNICODE_AGE(12, 1, 0)}, +}; + +static const struct f2fs_sb_encodings * +f2fs_sb_read_encoding(const struct f2fs_super_block *sb) +{ + __u16 magic = le16_to_cpu(sb->s_encoding); + int i; + + for (i = 0; i < ARRAY_SIZE(f2fs_sb_encoding_map); i++) + if (magic == f2fs_sb_encoding_map[i].magic) + return &f2fs_sb_encoding_map[i]; + + return NULL; +} + +struct kmem_cache *f2fs_cf_name_slab; +static int __init f2fs_create_casefold_cache(void) +{ + f2fs_cf_name_slab = f2fs_kmem_cache_create("f2fs_casefolded_name", + F2FS_NAME_LEN); + return f2fs_cf_name_slab ? 
0 : -ENOMEM; +} + +static void f2fs_destroy_casefold_cache(void) +{ + kmem_cache_destroy(f2fs_cf_name_slab); +} +#else +static int __init f2fs_create_casefold_cache(void) { return 0; } +static void f2fs_destroy_casefold_cache(void) { } +#endif + +static inline void limit_reserve_root(struct f2fs_sb_info *sbi) +{ + block_t block_limit = min((sbi->user_block_count >> 3), + sbi->user_block_count - sbi->reserved_blocks); + block_t node_limit = sbi->total_node_count >> 3; + + /* limit is 12.5% */ + if (test_opt(sbi, RESERVE_ROOT) && + F2FS_OPTION(sbi).root_reserved_blocks > block_limit) { + F2FS_OPTION(sbi).root_reserved_blocks = block_limit; + f2fs_info(sbi, "Reduce reserved blocks for root = %u", + F2FS_OPTION(sbi).root_reserved_blocks); + } + if (test_opt(sbi, RESERVE_NODE) && + F2FS_OPTION(sbi).root_reserved_nodes > node_limit) { + F2FS_OPTION(sbi).root_reserved_nodes = node_limit; + f2fs_info(sbi, "Reduce reserved nodes for root = %u", + F2FS_OPTION(sbi).root_reserved_nodes); + } + if (!test_opt(sbi, RESERVE_ROOT) && !test_opt(sbi, RESERVE_NODE) && + (!uid_eq(F2FS_OPTION(sbi).s_resuid, + make_kuid(&init_user_ns, F2FS_DEF_RESUID)) || + !gid_eq(F2FS_OPTION(sbi).s_resgid, + make_kgid(&init_user_ns, F2FS_DEF_RESGID)))) + f2fs_info(sbi, "Ignore s_resuid=%u, s_resgid=%u w/o reserve_root" + " and reserve_node", + from_kuid_munged(&init_user_ns, + F2FS_OPTION(sbi).s_resuid), + from_kgid_munged(&init_user_ns, + F2FS_OPTION(sbi).s_resgid)); +} + +static inline void adjust_unusable_cap_perc(struct f2fs_sb_info *sbi) +{ + if (!F2FS_OPTION(sbi).unusable_cap_perc) + return; + + if (F2FS_OPTION(sbi).unusable_cap_perc == 100) + F2FS_OPTION(sbi).unusable_cap = sbi->user_block_count; + else + F2FS_OPTION(sbi).unusable_cap = (sbi->user_block_count / 100) * + F2FS_OPTION(sbi).unusable_cap_perc; + + f2fs_info(sbi, "Adjust unusable cap for checkpoint=disable = %u / %u%%", + F2FS_OPTION(sbi).unusable_cap, + F2FS_OPTION(sbi).unusable_cap_perc); +} + static void init_once(void *foo) { 
struct f2fs_inode_info *fi = (struct f2fs_inode_info *) foo; inode_init_once(&fi->vfs_inode); +#ifdef CONFIG_FS_ENCRYPTION + fi->i_crypt_info = NULL; +#endif +#ifdef CONFIG_FS_VERITY + fi->i_verity_info = NULL; +#endif } -static int parse_options(struct super_block *sb, char *options) +#ifdef CONFIG_QUOTA +static const char * const quotatypes[] = INITQFNAMES; +#define QTYPE2NAME(t) (quotatypes[t]) +/* + * Note the name of the specified quota file. + */ +static int f2fs_note_qf_name(struct fs_context *fc, int qtype, + struct fs_parameter *param) { - struct f2fs_sb_info *sbi = F2FS_SB(sb); - substring_t args[MAX_OPT_ARGS]; - char *p, *name; - int arg = 0; + struct f2fs_fs_context *ctx = fc->fs_private; + char *qname; - if (!options) + if (param->size < 1) { + f2fs_err(NULL, "Missing quota name"); + return -EINVAL; + } + if (strchr(param->string, '/')) { + f2fs_err(NULL, "quotafile must be on filesystem root"); + return -EINVAL; + } + if (ctx->info.s_qf_names[qtype]) { + if (strcmp(ctx->info.s_qf_names[qtype], param->string) != 0) { + f2fs_err(NULL, "Quota file already specified"); + return -EINVAL; + } return 0; + } + + qname = kmemdup_nul(param->string, param->size, GFP_KERNEL); + if (!qname) { + f2fs_err(NULL, "Not enough memory for storing quotafile name"); + return -ENOMEM; + } + F2FS_CTX_INFO(ctx).s_qf_names[qtype] = qname; + ctx->qname_mask |= 1 << qtype; + return 0; +} + +/* + * Clear the name of the specified quota file. 
+ */ +static int f2fs_unnote_qf_name(struct fs_context *fc, int qtype) +{ + struct f2fs_fs_context *ctx = fc->fs_private; + + kfree(ctx->info.s_qf_names[qtype]); + ctx->info.s_qf_names[qtype] = NULL; + ctx->qname_mask |= 1 << qtype; + return 0; +} + +static void f2fs_unnote_qf_name_all(struct fs_context *fc) +{ + int i; - while ((p = strsep(&options, ",")) != NULL) { - int token; - if (!*p) + for (i = 0; i < MAXQUOTAS; i++) + f2fs_unnote_qf_name(fc, i); +} +#endif + +static int f2fs_parse_test_dummy_encryption(const struct fs_parameter *param, + struct f2fs_fs_context *ctx) +{ + int err; + + if (!IS_ENABLED(CONFIG_FS_ENCRYPTION)) { + f2fs_warn(NULL, "test_dummy_encryption option not supported"); + return -EINVAL; + } + err = fscrypt_parse_test_dummy_encryption(param, + &ctx->info.dummy_enc_policy); + if (err) { + if (err == -EINVAL) + f2fs_warn(NULL, "Value of option \"%s\" is unrecognized", + param->key); + else if (err == -EEXIST) + f2fs_warn(NULL, "Conflicting test_dummy_encryption options"); + else + f2fs_warn(NULL, "Error processing option \"%s\" [%d]", + param->key, err); + return -EINVAL; + } + return 0; +} + +#ifdef CONFIG_F2FS_FS_COMPRESSION +static bool is_compress_extension_exist(struct f2fs_mount_info *info, + const char *new_ext, bool is_ext) +{ + unsigned char (*ext)[F2FS_EXTENSION_LEN]; + int ext_cnt; + int i; + + if (is_ext) { + ext = info->extensions; + ext_cnt = info->compress_ext_cnt; + } else { + ext = info->noextensions; + ext_cnt = info->nocompress_ext_cnt; + } + + for (i = 0; i < ext_cnt; i++) { + if (!strcasecmp(new_ext, ext[i])) + return true; + } + + return false; +} + +/* + * 1. The same extension name cannot not appear in both compress and non-compress extension + * at the same time. + * 2. If the compress extension specifies all files, the types specified by the non-compress + * extension will be treated as special cases and will not be compressed. + * 3. Don't allow the non-compress extension specifies all files. 
+ */ +static int f2fs_test_compress_extension(unsigned char (*noext)[F2FS_EXTENSION_LEN], + int noext_cnt, + unsigned char (*ext)[F2FS_EXTENSION_LEN], + int ext_cnt) +{ + int index = 0, no_index = 0; + + if (!noext_cnt) + return 0; + + for (no_index = 0; no_index < noext_cnt; no_index++) { + if (strlen(noext[no_index]) == 0) continue; + if (!strcasecmp("*", noext[no_index])) { + f2fs_info(NULL, "Don't allow the nocompress extension specifies all files"); + return -EINVAL; + } + for (index = 0; index < ext_cnt; index++) { + if (strlen(ext[index]) == 0) + continue; + if (!strcasecmp(ext[index], noext[no_index])) { + f2fs_info(NULL, "Don't allow the same extension %s appear in both compress and nocompress extension", + ext[index]); + return -EINVAL; + } + } + } + return 0; +} + +#ifdef CONFIG_F2FS_FS_LZ4 +static int f2fs_set_lz4hc_level(struct f2fs_fs_context *ctx, const char *str) +{ +#ifdef CONFIG_F2FS_FS_LZ4HC + unsigned int level; + + if (strlen(str) == 3) { + F2FS_CTX_INFO(ctx).compress_level = 0; + ctx->spec_mask |= F2FS_SPEC_compress_level; + return 0; + } + + str += 3; + + if (str[0] != ':') { + f2fs_info(NULL, "wrong format, e.g. 
<alg_name>:<compr_level>"); + return -EINVAL; + } + if (kstrtouint(str + 1, 10, &level)) + return -EINVAL; + + if (!f2fs_is_compress_level_valid(COMPRESS_LZ4, level)) { + f2fs_info(NULL, "invalid lz4hc compress level: %d", level); + return -EINVAL; + } + + F2FS_CTX_INFO(ctx).compress_level = level; + ctx->spec_mask |= F2FS_SPEC_compress_level; + return 0; +#else + if (strlen(str) == 3) { + F2FS_CTX_INFO(ctx).compress_level = 0; + ctx->spec_mask |= F2FS_SPEC_compress_level; + return 0; + } + f2fs_info(NULL, "kernel doesn't support lz4hc compression"); + return -EINVAL; +#endif +} +#endif + +#ifdef CONFIG_F2FS_FS_ZSTD +static int f2fs_set_zstd_level(struct f2fs_fs_context *ctx, const char *str) +{ + int level; + int len = 4; + + if (strlen(str) == len) { + F2FS_CTX_INFO(ctx).compress_level = F2FS_ZSTD_DEFAULT_CLEVEL; + ctx->spec_mask |= F2FS_SPEC_compress_level; + return 0; + } + + str += len; + + if (str[0] != ':') { + f2fs_info(NULL, "wrong format, e.g. <alg_name>:<compr_level>"); + return -EINVAL; + } + if (kstrtoint(str + 1, 10, &level)) + return -EINVAL; + + /* f2fs does not support negative compress level now */ + if (level < 0) { + f2fs_info(NULL, "do not support negative compress level: %d", level); + return -ERANGE; + } + + if (!f2fs_is_compress_level_valid(COMPRESS_ZSTD, level)) { + f2fs_info(NULL, "invalid zstd compress level: %d", level); + return -EINVAL; + } + + F2FS_CTX_INFO(ctx).compress_level = level; + ctx->spec_mask |= F2FS_SPEC_compress_level; + return 0; +} +#endif +#endif + +static int f2fs_parse_param(struct fs_context *fc, struct fs_parameter *param) +{ + struct f2fs_fs_context *ctx = fc->fs_private; +#ifdef CONFIG_F2FS_FS_COMPRESSION + unsigned char (*ext)[F2FS_EXTENSION_LEN]; + unsigned char (*noext)[F2FS_EXTENSION_LEN]; + int ext_cnt, noext_cnt; + char *name; +#endif + substring_t args[MAX_OPT_ARGS]; + struct fs_parse_result result; + int token, ret, arg; + + token = fs_parse(fc, f2fs_param_specs, param, &result); + if (token < 0) + return 
token; + + switch (token) { + case Opt_gc_background: + F2FS_CTX_INFO(ctx).bggc_mode = result.uint_32; + ctx->spec_mask |= F2FS_SPEC_background_gc; + break; + case Opt_disable_roll_forward: + ctx_set_opt(ctx, F2FS_MOUNT_DISABLE_ROLL_FORWARD); + break; + case Opt_norecovery: + /* requires ro mount, checked in f2fs_validate_options */ + ctx_set_opt(ctx, F2FS_MOUNT_NORECOVERY); + break; + case Opt_discard: + if (result.negated) + ctx_clear_opt(ctx, F2FS_MOUNT_DISCARD); + else + ctx_set_opt(ctx, F2FS_MOUNT_DISCARD); + break; + case Opt_noheap: + case Opt_heap: + f2fs_warn(NULL, "heap/no_heap options were deprecated"); + break; +#ifdef CONFIG_F2FS_FS_XATTR + case Opt_user_xattr: + if (result.negated) + ctx_clear_opt(ctx, F2FS_MOUNT_XATTR_USER); + else + ctx_set_opt(ctx, F2FS_MOUNT_XATTR_USER); + break; + case Opt_inline_xattr: + if (result.negated) + ctx_clear_opt(ctx, F2FS_MOUNT_INLINE_XATTR); + else + ctx_set_opt(ctx, F2FS_MOUNT_INLINE_XATTR); + break; + case Opt_inline_xattr_size: + if (result.int_32 < MIN_INLINE_XATTR_SIZE || + result.int_32 > MAX_INLINE_XATTR_SIZE) { + f2fs_err(NULL, "inline xattr size is out of range: %u ~ %u", + (u32)MIN_INLINE_XATTR_SIZE, (u32)MAX_INLINE_XATTR_SIZE); + return -EINVAL; + } + ctx_set_opt(ctx, F2FS_MOUNT_INLINE_XATTR_SIZE); + F2FS_CTX_INFO(ctx).inline_xattr_size = result.int_32; + ctx->spec_mask |= F2FS_SPEC_inline_xattr_size; + break; +#else + case Opt_user_xattr: + case Opt_inline_xattr: + case Opt_inline_xattr_size: + f2fs_info(NULL, "%s options not supported", param->key); + break; +#endif +#ifdef CONFIG_F2FS_FS_POSIX_ACL + case Opt_acl: + if (result.negated) + ctx_clear_opt(ctx, F2FS_MOUNT_POSIX_ACL); + else + ctx_set_opt(ctx, F2FS_MOUNT_POSIX_ACL); + break; +#else + case Opt_acl: + f2fs_info(NULL, "%s options not supported", param->key); + break; +#endif + case Opt_active_logs: + if (result.int_32 != 2 && result.int_32 != 4 && + result.int_32 != NR_CURSEG_PERSIST_TYPE) + return -EINVAL; + ctx->spec_mask |= 
F2FS_SPEC_active_logs; + F2FS_CTX_INFO(ctx).active_logs = result.int_32; + break; + case Opt_disable_ext_identify: + ctx_set_opt(ctx, F2FS_MOUNT_DISABLE_EXT_IDENTIFY); + break; + case Opt_inline_data: + if (result.negated) + ctx_clear_opt(ctx, F2FS_MOUNT_INLINE_DATA); + else + ctx_set_opt(ctx, F2FS_MOUNT_INLINE_DATA); + break; + case Opt_inline_dentry: + if (result.negated) + ctx_clear_opt(ctx, F2FS_MOUNT_INLINE_DENTRY); + else + ctx_set_opt(ctx, F2FS_MOUNT_INLINE_DENTRY); + break; + case Opt_flush_merge: + if (result.negated) + ctx_clear_opt(ctx, F2FS_MOUNT_FLUSH_MERGE); + else + ctx_set_opt(ctx, F2FS_MOUNT_FLUSH_MERGE); + break; + case Opt_barrier: + if (result.negated) + ctx_set_opt(ctx, F2FS_MOUNT_NOBARRIER); + else + ctx_clear_opt(ctx, F2FS_MOUNT_NOBARRIER); + break; + case Opt_fastboot: + ctx_set_opt(ctx, F2FS_MOUNT_FASTBOOT); + break; + case Opt_extent_cache: + if (result.negated) + ctx_clear_opt(ctx, F2FS_MOUNT_READ_EXTENT_CACHE); + else + ctx_set_opt(ctx, F2FS_MOUNT_READ_EXTENT_CACHE); + break; + case Opt_data_flush: + ctx_set_opt(ctx, F2FS_MOUNT_DATA_FLUSH); + break; + case Opt_reserve_root: + ctx_set_opt(ctx, F2FS_MOUNT_RESERVE_ROOT); + F2FS_CTX_INFO(ctx).root_reserved_blocks = result.uint_32; + ctx->spec_mask |= F2FS_SPEC_reserve_root; + break; + case Opt_reserve_node: + ctx_set_opt(ctx, F2FS_MOUNT_RESERVE_NODE); + F2FS_CTX_INFO(ctx).root_reserved_nodes = result.uint_32; + ctx->spec_mask |= F2FS_SPEC_reserve_node; + break; + case Opt_resuid: + F2FS_CTX_INFO(ctx).s_resuid = result.uid; + ctx->spec_mask |= F2FS_SPEC_resuid; + break; + case Opt_resgid: + F2FS_CTX_INFO(ctx).s_resgid = result.gid; + ctx->spec_mask |= F2FS_SPEC_resgid; + break; + case Opt_mode: + F2FS_CTX_INFO(ctx).fs_mode = result.uint_32; + ctx->spec_mask |= F2FS_SPEC_mode; + break; +#ifdef CONFIG_F2FS_FAULT_INJECTION + case Opt_fault_injection: + F2FS_CTX_INFO(ctx).fault_info.inject_rate = result.int_32; + ctx->spec_mask |= F2FS_SPEC_fault_injection; + ctx_set_opt(ctx, 
F2FS_MOUNT_FAULT_INJECTION); + break; + + case Opt_fault_type: + if (result.uint_32 > BIT(FAULT_MAX)) + return -EINVAL; + F2FS_CTX_INFO(ctx).fault_info.inject_type = result.uint_32; + ctx->spec_mask |= F2FS_SPEC_fault_type; + ctx_set_opt(ctx, F2FS_MOUNT_FAULT_INJECTION); + break; +#else + case Opt_fault_injection: + case Opt_fault_type: + f2fs_info(NULL, "%s options not supported", param->key); + break; +#endif + case Opt_lazytime: + if (result.negated) + ctx_clear_opt(ctx, F2FS_MOUNT_LAZYTIME); + else + ctx_set_opt(ctx, F2FS_MOUNT_LAZYTIME); + break; +#ifdef CONFIG_QUOTA + case Opt_quota: + if (result.negated) { + ctx_clear_opt(ctx, F2FS_MOUNT_QUOTA); + ctx_clear_opt(ctx, F2FS_MOUNT_USRQUOTA); + ctx_clear_opt(ctx, F2FS_MOUNT_GRPQUOTA); + ctx_clear_opt(ctx, F2FS_MOUNT_PRJQUOTA); + } else + ctx_set_opt(ctx, F2FS_MOUNT_USRQUOTA); + break; + case Opt_usrquota: + ctx_set_opt(ctx, F2FS_MOUNT_USRQUOTA); + break; + case Opt_grpquota: + ctx_set_opt(ctx, F2FS_MOUNT_GRPQUOTA); + break; + case Opt_prjquota: + ctx_set_opt(ctx, F2FS_MOUNT_PRJQUOTA); + break; + case Opt_usrjquota: + if (!*param->string) + ret = f2fs_unnote_qf_name(fc, USRQUOTA); + else + ret = f2fs_note_qf_name(fc, USRQUOTA, param); + if (ret) + return ret; + break; + case Opt_grpjquota: + if (!*param->string) + ret = f2fs_unnote_qf_name(fc, GRPQUOTA); + else + ret = f2fs_note_qf_name(fc, GRPQUOTA, param); + if (ret) + return ret; + break; + case Opt_prjjquota: + if (!*param->string) + ret = f2fs_unnote_qf_name(fc, PRJQUOTA); + else + ret = f2fs_note_qf_name(fc, PRJQUOTA, param); + if (ret) + return ret; + break; + case Opt_jqfmt: + F2FS_CTX_INFO(ctx).s_jquota_fmt = result.int_32; + ctx->spec_mask |= F2FS_SPEC_jqfmt; + break; +#else + case Opt_quota: + case Opt_usrquota: + case Opt_grpquota: + case Opt_prjquota: + case Opt_usrjquota: + case Opt_grpjquota: + case Opt_prjjquota: + f2fs_info(NULL, "quota operations not supported"); + break; +#endif + case Opt_alloc: + F2FS_CTX_INFO(ctx).alloc_mode = result.uint_32; 
+ ctx->spec_mask |= F2FS_SPEC_alloc_mode; + break; + case Opt_fsync: + F2FS_CTX_INFO(ctx).fsync_mode = result.uint_32; + ctx->spec_mask |= F2FS_SPEC_fsync_mode; + break; + case Opt_test_dummy_encryption: + ret = f2fs_parse_test_dummy_encryption(param, ctx); + if (ret) + return ret; + break; + case Opt_inlinecrypt: +#ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT + ctx_set_opt(ctx, F2FS_MOUNT_INLINECRYPT); +#else + f2fs_info(NULL, "inline encryption not supported"); +#endif + break; + case Opt_checkpoint: /* * Initialize args struct so we know whether arg was * found; some options take optional arguments. */ - args[0].to = args[0].from = NULL; - token = match_token(p, f2fs_tokens, args); + args[0].from = args[0].to = NULL; + arg = 0; + /* revert to match_table for checkpoint= options */ + token = match_token(param->string, f2fs_checkpoint_tokens, args); switch (token) { - case Opt_gc_background: - name = match_strdup(&args[0]); - - if (!name) - return -ENOMEM; - if (!strncmp(name, "on", 2)) - set_opt(sbi, BG_GC); - else if (!strncmp(name, "off", 3)) - clear_opt(sbi, BG_GC); - else { - kfree(name); + case Opt_checkpoint_disable_cap_perc: + if (args->from && match_int(args, &arg)) return -EINVAL; - } - kfree(name); - break; - case Opt_disable_roll_forward: - set_opt(sbi, DISABLE_ROLL_FORWARD); + if (arg < 0 || arg > 100) + return -EINVAL; + F2FS_CTX_INFO(ctx).unusable_cap_perc = arg; + ctx->spec_mask |= F2FS_SPEC_checkpoint_disable_cap_perc; + ctx_set_opt(ctx, F2FS_MOUNT_DISABLE_CHECKPOINT); break; - case Opt_discard: - set_opt(sbi, DISCARD); + case Opt_checkpoint_disable_cap: + if (args->from && match_int(args, &arg)) + return -EINVAL; + F2FS_CTX_INFO(ctx).unusable_cap = arg; + ctx->spec_mask |= F2FS_SPEC_checkpoint_disable_cap; + ctx_set_opt(ctx, F2FS_MOUNT_DISABLE_CHECKPOINT); break; - case Opt_noheap: - set_opt(sbi, NOHEAP); + case Opt_checkpoint_disable: + ctx_set_opt(ctx, F2FS_MOUNT_DISABLE_CHECKPOINT); break; -#ifdef CONFIG_F2FS_FS_XATTR - case Opt_nouser_xattr: - 
clear_opt(sbi, XATTR_USER); + case Opt_checkpoint_enable: + F2FS_CTX_INFO(ctx).unusable_cap_perc = 0; + ctx->spec_mask |= F2FS_SPEC_checkpoint_disable_cap_perc; + F2FS_CTX_INFO(ctx).unusable_cap = 0; + ctx->spec_mask |= F2FS_SPEC_checkpoint_disable_cap; + ctx_clear_opt(ctx, F2FS_MOUNT_DISABLE_CHECKPOINT); break; + default: + return -EINVAL; + } + break; + case Opt_checkpoint_merge: + if (result.negated) + ctx_clear_opt(ctx, F2FS_MOUNT_MERGE_CHECKPOINT); + else + ctx_set_opt(ctx, F2FS_MOUNT_MERGE_CHECKPOINT); + break; +#ifdef CONFIG_F2FS_FS_COMPRESSION + case Opt_compress_algorithm: + name = param->string; + if (!strcmp(name, "lzo")) { +#ifdef CONFIG_F2FS_FS_LZO + F2FS_CTX_INFO(ctx).compress_level = 0; + F2FS_CTX_INFO(ctx).compress_algorithm = COMPRESS_LZO; + ctx->spec_mask |= F2FS_SPEC_compress_level; + ctx->spec_mask |= F2FS_SPEC_compress_algorithm; #else - case Opt_nouser_xattr: - f2fs_msg(sb, KERN_INFO, - "nouser_xattr options not supported"); - break; + f2fs_info(NULL, "kernel doesn't support lzo compression"); #endif -#ifdef CONFIG_F2FS_FS_POSIX_ACL - case Opt_noacl: - clear_opt(sbi, POSIX_ACL); - break; + } else if (!strncmp(name, "lz4", 3)) { +#ifdef CONFIG_F2FS_FS_LZ4 + ret = f2fs_set_lz4hc_level(ctx, name); + if (ret) + return -EINVAL; + F2FS_CTX_INFO(ctx).compress_algorithm = COMPRESS_LZ4; + ctx->spec_mask |= F2FS_SPEC_compress_algorithm; #else - case Opt_noacl: - f2fs_msg(sb, KERN_INFO, "noacl options not supported"); - break; + f2fs_info(NULL, "kernel doesn't support lz4 compression"); #endif - case Opt_active_logs: - if (args->from && match_int(args, &arg)) - return -EINVAL; - if (arg != 2 && arg != 4 && arg != NR_CURSEG_TYPE) + } else if (!strncmp(name, "zstd", 4)) { +#ifdef CONFIG_F2FS_FS_ZSTD + ret = f2fs_set_zstd_level(ctx, name); + if (ret) return -EINVAL; - sbi->active_logs = arg; + F2FS_CTX_INFO(ctx).compress_algorithm = COMPRESS_ZSTD; + ctx->spec_mask |= F2FS_SPEC_compress_algorithm; +#else + f2fs_info(NULL, "kernel doesn't support zstd 
compression"); +#endif + } else if (!strcmp(name, "lzo-rle")) { +#ifdef CONFIG_F2FS_FS_LZORLE + F2FS_CTX_INFO(ctx).compress_level = 0; + F2FS_CTX_INFO(ctx).compress_algorithm = COMPRESS_LZORLE; + ctx->spec_mask |= F2FS_SPEC_compress_level; + ctx->spec_mask |= F2FS_SPEC_compress_algorithm; +#else + f2fs_info(NULL, "kernel doesn't support lzorle compression"); +#endif + } else + return -EINVAL; + break; + case Opt_compress_log_size: + if (result.uint_32 < MIN_COMPRESS_LOG_SIZE || + result.uint_32 > MAX_COMPRESS_LOG_SIZE) { + f2fs_err(NULL, + "Compress cluster log size is out of range"); + return -EINVAL; + } + F2FS_CTX_INFO(ctx).compress_log_size = result.uint_32; + ctx->spec_mask |= F2FS_SPEC_compress_log_size; + break; + case Opt_compress_extension: + name = param->string; + ext = F2FS_CTX_INFO(ctx).extensions; + ext_cnt = F2FS_CTX_INFO(ctx).compress_ext_cnt; + + if (strlen(name) >= F2FS_EXTENSION_LEN || + ext_cnt >= COMPRESS_EXT_NUM) { + f2fs_err(NULL, "invalid extension length/number"); + return -EINVAL; + } + + if (is_compress_extension_exist(&ctx->info, name, true)) break; - case Opt_disable_ext_identify: - set_opt(sbi, DISABLE_EXT_IDENTIFY); + + ret = strscpy(ext[ext_cnt], name, F2FS_EXTENSION_LEN); + if (ret < 0) + return ret; + F2FS_CTX_INFO(ctx).compress_ext_cnt++; + ctx->spec_mask |= F2FS_SPEC_compress_extension; + break; + case Opt_nocompress_extension: + name = param->string; + noext = F2FS_CTX_INFO(ctx).noextensions; + noext_cnt = F2FS_CTX_INFO(ctx).nocompress_ext_cnt; + + if (strlen(name) >= F2FS_EXTENSION_LEN || + noext_cnt >= COMPRESS_EXT_NUM) { + f2fs_err(NULL, "invalid extension length/number"); + return -EINVAL; + } + + if (is_compress_extension_exist(&ctx->info, name, false)) break; - default: - f2fs_msg(sb, KERN_ERR, - "Unrecognized mount option \"%s\" or missing value", - p); + + ret = strscpy(noext[noext_cnt], name, F2FS_EXTENSION_LEN); + if (ret < 0) + return ret; + F2FS_CTX_INFO(ctx).nocompress_ext_cnt++; + ctx->spec_mask |= 
F2FS_SPEC_nocompress_extension; + break; + case Opt_compress_chksum: + F2FS_CTX_INFO(ctx).compress_chksum = true; + ctx->spec_mask |= F2FS_SPEC_compress_chksum; + break; + case Opt_compress_mode: + F2FS_CTX_INFO(ctx).compress_mode = result.uint_32; + ctx->spec_mask |= F2FS_SPEC_compress_mode; + break; + case Opt_compress_cache: + ctx_set_opt(ctx, F2FS_MOUNT_COMPRESS_CACHE); + break; +#else + case Opt_compress_algorithm: + case Opt_compress_log_size: + case Opt_compress_extension: + case Opt_nocompress_extension: + case Opt_compress_chksum: + case Opt_compress_mode: + case Opt_compress_cache: + f2fs_info(NULL, "compression options not supported"); + break; +#endif + case Opt_atgc: + ctx_set_opt(ctx, F2FS_MOUNT_ATGC); + break; + case Opt_gc_merge: + if (result.negated) + ctx_clear_opt(ctx, F2FS_MOUNT_GC_MERGE); + else + ctx_set_opt(ctx, F2FS_MOUNT_GC_MERGE); + break; + case Opt_discard_unit: + F2FS_CTX_INFO(ctx).discard_unit = result.uint_32; + ctx->spec_mask |= F2FS_SPEC_discard_unit; + break; + case Opt_memory_mode: + F2FS_CTX_INFO(ctx).memory_mode = result.uint_32; + ctx->spec_mask |= F2FS_SPEC_memory_mode; + break; + case Opt_age_extent_cache: + ctx_set_opt(ctx, F2FS_MOUNT_AGE_EXTENT_CACHE); + break; + case Opt_errors: + F2FS_CTX_INFO(ctx).errors = result.uint_32; + ctx->spec_mask |= F2FS_SPEC_errors; + break; + case Opt_nat_bits: + ctx_set_opt(ctx, F2FS_MOUNT_NAT_BITS); + break; + case Opt_lookup_mode: + F2FS_CTX_INFO(ctx).lookup_mode = result.uint_32; + ctx->spec_mask |= F2FS_SPEC_lookup_mode; + break; + } + return 0; +} + +/* + * Check quota settings consistency. 
+ */ +static int f2fs_check_quota_consistency(struct fs_context *fc, + struct super_block *sb) +{ + struct f2fs_sb_info *sbi = F2FS_SB(sb); + #ifdef CONFIG_QUOTA + struct f2fs_fs_context *ctx = fc->fs_private; + bool quota_feature = f2fs_sb_has_quota_ino(sbi); + bool quota_turnon = sb_any_quota_loaded(sb); + char *old_qname, *new_qname; + bool usr_qf_name, grp_qf_name, prj_qf_name, usrquota, grpquota, prjquota; + int i; + + /* + * We do the test below only for project quotas. 'usrquota' and + * 'grpquota' mount options are allowed even without quota feature + * to support legacy quotas in quota files. + */ + if (ctx_test_opt(ctx, F2FS_MOUNT_PRJQUOTA) && + !f2fs_sb_has_project_quota(sbi)) { + f2fs_err(sbi, "Project quota feature not enabled. Cannot enable project quota enforcement."); + return -EINVAL; + } + + if (ctx->qname_mask) { + for (i = 0; i < MAXQUOTAS; i++) { + if (!(ctx->qname_mask & (1 << i))) + continue; + + old_qname = F2FS_OPTION(sbi).s_qf_names[i]; + new_qname = F2FS_CTX_INFO(ctx).s_qf_names[i]; + if (quota_turnon && + !!old_qname != !!new_qname) + goto err_jquota_change; + + if (old_qname) { + if (!new_qname) { + f2fs_info(sbi, "remove qf_name %s", + old_qname); + continue; + } else if (strcmp(old_qname, new_qname) == 0) { + ctx->qname_mask &= ~(1 << i); + continue; + } + goto err_jquota_specified; + } + + if (quota_feature) { + f2fs_info(sbi, "QUOTA feature is enabled, so ignore qf_name"); + ctx->qname_mask &= ~(1 << i); + kfree(F2FS_CTX_INFO(ctx).s_qf_names[i]); + F2FS_CTX_INFO(ctx).s_qf_names[i] = NULL; + } + } + } + + /* Make sure we don't mix old and new quota format */ + usr_qf_name = F2FS_OPTION(sbi).s_qf_names[USRQUOTA] || + F2FS_CTX_INFO(ctx).s_qf_names[USRQUOTA]; + grp_qf_name = F2FS_OPTION(sbi).s_qf_names[GRPQUOTA] || + F2FS_CTX_INFO(ctx).s_qf_names[GRPQUOTA]; + prj_qf_name = F2FS_OPTION(sbi).s_qf_names[PRJQUOTA] || + F2FS_CTX_INFO(ctx).s_qf_names[PRJQUOTA]; + usrquota = test_opt(sbi, USRQUOTA) || + ctx_test_opt(ctx, F2FS_MOUNT_USRQUOTA); 
+ grpquota = test_opt(sbi, GRPQUOTA) || + ctx_test_opt(ctx, F2FS_MOUNT_GRPQUOTA); + prjquota = test_opt(sbi, PRJQUOTA) || + ctx_test_opt(ctx, F2FS_MOUNT_PRJQUOTA); + + if (usr_qf_name) { + ctx_clear_opt(ctx, F2FS_MOUNT_USRQUOTA); + usrquota = false; + } + if (grp_qf_name) { + ctx_clear_opt(ctx, F2FS_MOUNT_GRPQUOTA); + grpquota = false; + } + if (prj_qf_name) { + ctx_clear_opt(ctx, F2FS_MOUNT_PRJQUOTA); + prjquota = false; + } + if (usr_qf_name || grp_qf_name || prj_qf_name) { + if (grpquota || usrquota || prjquota) { + f2fs_err(sbi, "old and new quota format mixing"); + return -EINVAL; + } + if (!(ctx->spec_mask & F2FS_SPEC_jqfmt || + F2FS_OPTION(sbi).s_jquota_fmt)) { + f2fs_err(sbi, "journaled quota format not specified"); + return -EINVAL; + } + } + return 0; + +err_jquota_change: + f2fs_err(sbi, "Cannot change journaled quota options when quota turned on"); + return -EINVAL; +err_jquota_specified: + f2fs_err(sbi, "%s quota file already specified", + QTYPE2NAME(i)); + return -EINVAL; + +#else + if (f2fs_readonly(sbi->sb)) + return 0; + if (f2fs_sb_has_quota_ino(sbi)) { + f2fs_info(sbi, "Filesystem with quota feature cannot be mounted RDWR without CONFIG_QUOTA"); + return -EINVAL; + } + if (f2fs_sb_has_project_quota(sbi)) { + f2fs_err(sbi, "Filesystem with project quota feature cannot be mounted RDWR without CONFIG_QUOTA"); + return -EINVAL; + } + + return 0; +#endif +} + +static int f2fs_check_test_dummy_encryption(struct fs_context *fc, + struct super_block *sb) +{ + struct f2fs_fs_context *ctx = fc->fs_private; + struct f2fs_sb_info *sbi = F2FS_SB(sb); + + if (!fscrypt_is_dummy_policy_set(&F2FS_CTX_INFO(ctx).dummy_enc_policy)) + return 0; + + if (!f2fs_sb_has_encrypt(sbi)) { + f2fs_err(sbi, "Encrypt feature is off"); + return -EINVAL; + } + + /* + * This mount option is just for testing, and it's not worthwhile to + * implement the extra complexity (e.g. RCU protection) that would be + * needed to allow it to be set or changed during remount. 
We do allow + * it to be specified during remount, but only if there is no change. + */ + if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) { + if (fscrypt_dummy_policies_equal(&F2FS_OPTION(sbi).dummy_enc_policy, + &F2FS_CTX_INFO(ctx).dummy_enc_policy)) + return 0; + f2fs_warn(sbi, "Can't set or change test_dummy_encryption on remount"); + return -EINVAL; + } + return 0; +} + +static inline bool test_compression_spec(unsigned int mask) +{ + return mask & (F2FS_SPEC_compress_algorithm + | F2FS_SPEC_compress_log_size + | F2FS_SPEC_compress_extension + | F2FS_SPEC_nocompress_extension + | F2FS_SPEC_compress_chksum + | F2FS_SPEC_compress_mode); +} + +static inline void clear_compression_spec(struct f2fs_fs_context *ctx) +{ + ctx->spec_mask &= ~(F2FS_SPEC_compress_algorithm + | F2FS_SPEC_compress_log_size + | F2FS_SPEC_compress_extension + | F2FS_SPEC_nocompress_extension + | F2FS_SPEC_compress_chksum + | F2FS_SPEC_compress_mode); +} + +static int f2fs_check_compression(struct fs_context *fc, + struct super_block *sb) +{ +#ifdef CONFIG_F2FS_FS_COMPRESSION + struct f2fs_fs_context *ctx = fc->fs_private; + struct f2fs_sb_info *sbi = F2FS_SB(sb); + int i, cnt; + + if (!f2fs_sb_has_compression(sbi)) { + if (test_compression_spec(ctx->spec_mask) || + ctx_test_opt(ctx, F2FS_MOUNT_COMPRESS_CACHE)) + f2fs_info(sbi, "Image doesn't support compression"); + clear_compression_spec(ctx); + ctx->opt_mask &= ~BIT(F2FS_MOUNT_COMPRESS_CACHE); + return 0; + } + if (ctx->spec_mask & F2FS_SPEC_compress_extension) { + cnt = F2FS_CTX_INFO(ctx).compress_ext_cnt; + for (i = 0; i < F2FS_CTX_INFO(ctx).compress_ext_cnt; i++) { + if (is_compress_extension_exist(&F2FS_OPTION(sbi), + F2FS_CTX_INFO(ctx).extensions[i], true)) { + F2FS_CTX_INFO(ctx).extensions[i][0] = '\0'; + cnt--; + } + } + if (F2FS_OPTION(sbi).compress_ext_cnt + cnt > COMPRESS_EXT_NUM) { + f2fs_err(sbi, "invalid extension length/number"); return -EINVAL; } } + if (ctx->spec_mask & F2FS_SPEC_nocompress_extension) { + cnt = 
F2FS_CTX_INFO(ctx).nocompress_ext_cnt; + for (i = 0; i < F2FS_CTX_INFO(ctx).nocompress_ext_cnt; i++) { + if (is_compress_extension_exist(&F2FS_OPTION(sbi), + F2FS_CTX_INFO(ctx).noextensions[i], false)) { + F2FS_CTX_INFO(ctx).noextensions[i][0] = '\0'; + cnt--; + } + } + if (F2FS_OPTION(sbi).nocompress_ext_cnt + cnt > COMPRESS_EXT_NUM) { + f2fs_err(sbi, "invalid noextension length/number"); + return -EINVAL; + } + } + + if (f2fs_test_compress_extension(F2FS_CTX_INFO(ctx).noextensions, + F2FS_CTX_INFO(ctx).nocompress_ext_cnt, + F2FS_CTX_INFO(ctx).extensions, + F2FS_CTX_INFO(ctx).compress_ext_cnt)) { + f2fs_err(sbi, "new noextensions conflicts with new extensions"); + return -EINVAL; + } + if (f2fs_test_compress_extension(F2FS_CTX_INFO(ctx).noextensions, + F2FS_CTX_INFO(ctx).nocompress_ext_cnt, + F2FS_OPTION(sbi).extensions, + F2FS_OPTION(sbi).compress_ext_cnt)) { + f2fs_err(sbi, "new noextensions conflicts with old extensions"); + return -EINVAL; + } + if (f2fs_test_compress_extension(F2FS_OPTION(sbi).noextensions, + F2FS_OPTION(sbi).nocompress_ext_cnt, + F2FS_CTX_INFO(ctx).extensions, + F2FS_CTX_INFO(ctx).compress_ext_cnt)) { + f2fs_err(sbi, "new extensions conflicts with old noextensions"); + return -EINVAL; + } +#endif + return 0; +} + +static int f2fs_check_opt_consistency(struct fs_context *fc, + struct super_block *sb) +{ + struct f2fs_fs_context *ctx = fc->fs_private; + struct f2fs_sb_info *sbi = F2FS_SB(sb); + int err; + + if (ctx_test_opt(ctx, F2FS_MOUNT_NORECOVERY) && !f2fs_readonly(sb)) + return -EINVAL; + + if (f2fs_hw_should_discard(sbi) && + (ctx->opt_mask & BIT(F2FS_MOUNT_DISCARD)) && + !ctx_test_opt(ctx, F2FS_MOUNT_DISCARD)) { + f2fs_warn(sbi, "discard is required for zoned block devices"); + return -EINVAL; + } + + if (!f2fs_hw_support_discard(sbi) && + (ctx->opt_mask & BIT(F2FS_MOUNT_DISCARD)) && + ctx_test_opt(ctx, F2FS_MOUNT_DISCARD)) { + f2fs_warn(sbi, "device does not support discard"); + ctx_clear_opt(ctx, F2FS_MOUNT_DISCARD); + ctx->opt_mask 
&= ~BIT(F2FS_MOUNT_DISCARD); + } + + if (f2fs_sb_has_device_alias(sbi) && + (ctx->opt_mask & BIT(F2FS_MOUNT_READ_EXTENT_CACHE)) && + !ctx_test_opt(ctx, F2FS_MOUNT_READ_EXTENT_CACHE)) { + f2fs_err(sbi, "device aliasing requires extent cache"); + return -EINVAL; + } + + if (test_opt(sbi, RESERVE_ROOT) && + (ctx->opt_mask & BIT(F2FS_MOUNT_RESERVE_ROOT)) && + ctx_test_opt(ctx, F2FS_MOUNT_RESERVE_ROOT)) { + f2fs_info(sbi, "Preserve previous reserve_root=%u", + F2FS_OPTION(sbi).root_reserved_blocks); + ctx_clear_opt(ctx, F2FS_MOUNT_RESERVE_ROOT); + ctx->opt_mask &= ~BIT(F2FS_MOUNT_RESERVE_ROOT); + } + if (test_opt(sbi, RESERVE_NODE) && + (ctx->opt_mask & BIT(F2FS_MOUNT_RESERVE_NODE)) && + ctx_test_opt(ctx, F2FS_MOUNT_RESERVE_NODE)) { + f2fs_info(sbi, "Preserve previous reserve_node=%u", + F2FS_OPTION(sbi).root_reserved_nodes); + ctx_clear_opt(ctx, F2FS_MOUNT_RESERVE_NODE); + ctx->opt_mask &= ~BIT(F2FS_MOUNT_RESERVE_NODE); + } + + err = f2fs_check_test_dummy_encryption(fc, sb); + if (err) + return err; + + err = f2fs_check_compression(fc, sb); + if (err) + return err; + + err = f2fs_check_quota_consistency(fc, sb); + if (err) + return err; + + if (!IS_ENABLED(CONFIG_UNICODE) && f2fs_sb_has_casefold(sbi)) { + f2fs_err(sbi, + "Filesystem with casefold feature cannot be mounted without CONFIG_UNICODE"); + return -EINVAL; + } + + /* + * The BLKZONED feature indicates that the drive was formatted with + * zone alignment optimization. This is optional for host-aware + * devices, but mandatory for host-managed zoned block devices. 
+ */ + if (f2fs_sb_has_blkzoned(sbi)) { + if (F2FS_CTX_INFO(ctx).bggc_mode == BGGC_MODE_OFF) { + f2fs_warn(sbi, "zoned devices need bggc"); + return -EINVAL; + } +#ifdef CONFIG_BLK_DEV_ZONED + if ((ctx->spec_mask & F2FS_SPEC_discard_unit) && + F2FS_CTX_INFO(ctx).discard_unit != DISCARD_UNIT_SECTION) { + f2fs_info(sbi, "Zoned block device doesn't need small discard, set discard_unit=section by default"); + F2FS_CTX_INFO(ctx).discard_unit = DISCARD_UNIT_SECTION; + } + + if ((ctx->spec_mask & F2FS_SPEC_mode) && + F2FS_CTX_INFO(ctx).fs_mode != FS_MODE_LFS) { + f2fs_info(sbi, "Only lfs mode is allowed with zoned block device feature"); + return -EINVAL; + } +#else + f2fs_err(sbi, "Zoned block device support is not enabled"); + return -EINVAL; +#endif + } + + if (ctx_test_opt(ctx, F2FS_MOUNT_INLINE_XATTR_SIZE)) { + if (!f2fs_sb_has_extra_attr(sbi) || + !f2fs_sb_has_flexible_inline_xattr(sbi)) { + f2fs_err(sbi, "extra_attr or flexible_inline_xattr feature is off"); + return -EINVAL; + } + if (!ctx_test_opt(ctx, F2FS_MOUNT_INLINE_XATTR) && !test_opt(sbi, INLINE_XATTR)) { + f2fs_err(sbi, "inline_xattr_size option should be set with inline_xattr option"); + return -EINVAL; + } + } + + if (ctx_test_opt(ctx, F2FS_MOUNT_ATGC) && + F2FS_CTX_INFO(ctx).fs_mode == FS_MODE_LFS) { + f2fs_err(sbi, "LFS is not compatible with ATGC"); + return -EINVAL; + } + + if (f2fs_is_readonly(sbi) && ctx_test_opt(ctx, F2FS_MOUNT_FLUSH_MERGE)) { + f2fs_err(sbi, "FLUSH_MERGE not compatible with readonly mode"); + return -EINVAL; + } + + if (f2fs_sb_has_readonly(sbi) && !f2fs_readonly(sbi->sb)) { + f2fs_err(sbi, "Allow to mount readonly mode only"); + return -EROFS; + } + return 0; +} + +static void f2fs_apply_quota_options(struct fs_context *fc, + struct super_block *sb) +{ +#ifdef CONFIG_QUOTA + struct f2fs_fs_context *ctx = fc->fs_private; + struct f2fs_sb_info *sbi = F2FS_SB(sb); + bool quota_feature = f2fs_sb_has_quota_ino(sbi); + char *qname; + int i; + + if (quota_feature) + return; + + for (i 
= 0; i < MAXQUOTAS; i++) { + if (!(ctx->qname_mask & (1 << i))) + continue; + + qname = F2FS_CTX_INFO(ctx).s_qf_names[i]; + if (qname) { + qname = kstrdup(F2FS_CTX_INFO(ctx).s_qf_names[i], + GFP_KERNEL | __GFP_NOFAIL); + set_opt(sbi, QUOTA); + } + F2FS_OPTION(sbi).s_qf_names[i] = qname; + } + + if (ctx->spec_mask & F2FS_SPEC_jqfmt) + F2FS_OPTION(sbi).s_jquota_fmt = F2FS_CTX_INFO(ctx).s_jquota_fmt; + + if (quota_feature && F2FS_OPTION(sbi).s_jquota_fmt) { + f2fs_info(sbi, "QUOTA feature is enabled, so ignore jquota_fmt"); + F2FS_OPTION(sbi).s_jquota_fmt = 0; + } +#endif +} + +static void f2fs_apply_test_dummy_encryption(struct fs_context *fc, + struct super_block *sb) +{ + struct f2fs_fs_context *ctx = fc->fs_private; + struct f2fs_sb_info *sbi = F2FS_SB(sb); + + if (!fscrypt_is_dummy_policy_set(&F2FS_CTX_INFO(ctx).dummy_enc_policy) || + /* if already set, it was already verified to be the same */ + fscrypt_is_dummy_policy_set(&F2FS_OPTION(sbi).dummy_enc_policy)) + return; + swap(F2FS_OPTION(sbi).dummy_enc_policy, F2FS_CTX_INFO(ctx).dummy_enc_policy); + f2fs_warn(sbi, "Test dummy encryption mode enabled"); +} + +static void f2fs_apply_compression(struct fs_context *fc, + struct super_block *sb) +{ +#ifdef CONFIG_F2FS_FS_COMPRESSION + struct f2fs_fs_context *ctx = fc->fs_private; + struct f2fs_sb_info *sbi = F2FS_SB(sb); + unsigned char (*ctx_ext)[F2FS_EXTENSION_LEN]; + unsigned char (*sbi_ext)[F2FS_EXTENSION_LEN]; + int ctx_cnt, sbi_cnt, i; + + if (ctx->spec_mask & F2FS_SPEC_compress_level) + F2FS_OPTION(sbi).compress_level = + F2FS_CTX_INFO(ctx).compress_level; + if (ctx->spec_mask & F2FS_SPEC_compress_algorithm) + F2FS_OPTION(sbi).compress_algorithm = + F2FS_CTX_INFO(ctx).compress_algorithm; + if (ctx->spec_mask & F2FS_SPEC_compress_log_size) + F2FS_OPTION(sbi).compress_log_size = + F2FS_CTX_INFO(ctx).compress_log_size; + if (ctx->spec_mask & F2FS_SPEC_compress_chksum) + F2FS_OPTION(sbi).compress_chksum = + F2FS_CTX_INFO(ctx).compress_chksum; + if (ctx->spec_mask 
& F2FS_SPEC_compress_mode) + F2FS_OPTION(sbi).compress_mode = + F2FS_CTX_INFO(ctx).compress_mode; + if (ctx->spec_mask & F2FS_SPEC_compress_extension) { + ctx_ext = F2FS_CTX_INFO(ctx).extensions; + ctx_cnt = F2FS_CTX_INFO(ctx).compress_ext_cnt; + sbi_ext = F2FS_OPTION(sbi).extensions; + sbi_cnt = F2FS_OPTION(sbi).compress_ext_cnt; + for (i = 0; i < ctx_cnt; i++) { + if (strlen(ctx_ext[i]) == 0) + continue; + strscpy(sbi_ext[sbi_cnt], ctx_ext[i]); + sbi_cnt++; + } + F2FS_OPTION(sbi).compress_ext_cnt = sbi_cnt; + } + if (ctx->spec_mask & F2FS_SPEC_nocompress_extension) { + ctx_ext = F2FS_CTX_INFO(ctx).noextensions; + ctx_cnt = F2FS_CTX_INFO(ctx).nocompress_ext_cnt; + sbi_ext = F2FS_OPTION(sbi).noextensions; + sbi_cnt = F2FS_OPTION(sbi).nocompress_ext_cnt; + for (i = 0; i < ctx_cnt; i++) { + if (strlen(ctx_ext[i]) == 0) + continue; + strscpy(sbi_ext[sbi_cnt], ctx_ext[i]); + sbi_cnt++; + } + F2FS_OPTION(sbi).nocompress_ext_cnt = sbi_cnt; + } +#endif +} + +static void f2fs_apply_options(struct fs_context *fc, struct super_block *sb) +{ + struct f2fs_fs_context *ctx = fc->fs_private; + struct f2fs_sb_info *sbi = F2FS_SB(sb); + + F2FS_OPTION(sbi).opt &= ~ctx->opt_mask; + F2FS_OPTION(sbi).opt |= F2FS_CTX_INFO(ctx).opt; + + if (ctx->spec_mask & F2FS_SPEC_background_gc) + F2FS_OPTION(sbi).bggc_mode = F2FS_CTX_INFO(ctx).bggc_mode; + if (ctx->spec_mask & F2FS_SPEC_inline_xattr_size) + F2FS_OPTION(sbi).inline_xattr_size = + F2FS_CTX_INFO(ctx).inline_xattr_size; + if (ctx->spec_mask & F2FS_SPEC_active_logs) + F2FS_OPTION(sbi).active_logs = F2FS_CTX_INFO(ctx).active_logs; + if (ctx->spec_mask & F2FS_SPEC_reserve_root) + F2FS_OPTION(sbi).root_reserved_blocks = + F2FS_CTX_INFO(ctx).root_reserved_blocks; + if (ctx->spec_mask & F2FS_SPEC_reserve_node) + F2FS_OPTION(sbi).root_reserved_nodes = + F2FS_CTX_INFO(ctx).root_reserved_nodes; + if (ctx->spec_mask & F2FS_SPEC_resgid) + F2FS_OPTION(sbi).s_resgid = F2FS_CTX_INFO(ctx).s_resgid; + if (ctx->spec_mask & F2FS_SPEC_resuid) + 
F2FS_OPTION(sbi).s_resuid = F2FS_CTX_INFO(ctx).s_resuid; + if (ctx->spec_mask & F2FS_SPEC_mode) + F2FS_OPTION(sbi).fs_mode = F2FS_CTX_INFO(ctx).fs_mode; +#ifdef CONFIG_F2FS_FAULT_INJECTION + if (ctx->spec_mask & F2FS_SPEC_fault_injection) + (void)f2fs_build_fault_attr(sbi, + F2FS_CTX_INFO(ctx).fault_info.inject_rate, 0, FAULT_RATE); + if (ctx->spec_mask & F2FS_SPEC_fault_type) + (void)f2fs_build_fault_attr(sbi, 0, + F2FS_CTX_INFO(ctx).fault_info.inject_type, FAULT_TYPE); +#endif + if (ctx->spec_mask & F2FS_SPEC_alloc_mode) + F2FS_OPTION(sbi).alloc_mode = F2FS_CTX_INFO(ctx).alloc_mode; + if (ctx->spec_mask & F2FS_SPEC_fsync_mode) + F2FS_OPTION(sbi).fsync_mode = F2FS_CTX_INFO(ctx).fsync_mode; + if (ctx->spec_mask & F2FS_SPEC_checkpoint_disable_cap) + F2FS_OPTION(sbi).unusable_cap = F2FS_CTX_INFO(ctx).unusable_cap; + if (ctx->spec_mask & F2FS_SPEC_checkpoint_disable_cap_perc) + F2FS_OPTION(sbi).unusable_cap_perc = + F2FS_CTX_INFO(ctx).unusable_cap_perc; + if (ctx->spec_mask & F2FS_SPEC_discard_unit) + F2FS_OPTION(sbi).discard_unit = F2FS_CTX_INFO(ctx).discard_unit; + if (ctx->spec_mask & F2FS_SPEC_memory_mode) + F2FS_OPTION(sbi).memory_mode = F2FS_CTX_INFO(ctx).memory_mode; + if (ctx->spec_mask & F2FS_SPEC_errors) + F2FS_OPTION(sbi).errors = F2FS_CTX_INFO(ctx).errors; + if (ctx->spec_mask & F2FS_SPEC_lookup_mode) + F2FS_OPTION(sbi).lookup_mode = F2FS_CTX_INFO(ctx).lookup_mode; + + f2fs_apply_compression(fc, sb); + f2fs_apply_test_dummy_encryption(fc, sb); + f2fs_apply_quota_options(fc, sb); +} + +static int f2fs_sanity_check_options(struct f2fs_sb_info *sbi, bool remount) +{ + if (f2fs_sb_has_device_alias(sbi) && + !test_opt(sbi, READ_EXTENT_CACHE)) { + f2fs_err(sbi, "device aliasing requires extent cache"); + return -EINVAL; + } + + if (!remount) + return 0; + +#ifdef CONFIG_BLK_DEV_ZONED + if (f2fs_sb_has_blkzoned(sbi) && + sbi->max_open_zones < F2FS_OPTION(sbi).active_logs) { + f2fs_err(sbi, + "zoned: max open zones %u is too small, need at least %u open zones", + 
sbi->max_open_zones, F2FS_OPTION(sbi).active_logs); + return -EINVAL; + } +#endif + if (f2fs_lfs_mode(sbi) && !IS_F2FS_IPU_DISABLE(sbi)) { + f2fs_warn(sbi, "LFS is not compatible with IPU"); + return -EINVAL; + } return 0; } @@ -165,26 +1746,52 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) { struct f2fs_inode_info *fi; - fi = kmem_cache_alloc(f2fs_inode_cachep, GFP_NOFS | __GFP_ZERO); + if (time_to_inject(F2FS_SB(sb), FAULT_SLAB_ALLOC)) + return NULL; + + fi = alloc_inode_sb(sb, f2fs_inode_cachep, GFP_F2FS_ZERO); if (!fi) return NULL; init_once((void *) fi); /* Initialize f2fs-specific inode info */ - fi->vfs_inode.i_version = 1; - atomic_set(&fi->dirty_dents, 0); - fi->i_current_depth = 1; - fi->i_advise = 0; - rwlock_init(&fi->ext.ext_lock); + atomic_set(&fi->dirty_pages, 0); + atomic_set(&fi->i_compr_blocks, 0); + atomic_set(&fi->open_count, 0); + atomic_set(&fi->writeback, 0); + init_f2fs_rwsem(&fi->i_sem); + spin_lock_init(&fi->i_size_lock); + INIT_LIST_HEAD(&fi->dirty_list); + INIT_LIST_HEAD(&fi->gdirty_list); + INIT_LIST_HEAD(&fi->gdonate_list); + init_f2fs_rwsem(&fi->i_gc_rwsem[READ]); + init_f2fs_rwsem(&fi->i_gc_rwsem[WRITE]); + init_f2fs_rwsem(&fi->i_xattr_sem); - set_inode_flag(fi, FI_NEW_INODE); + /* Will be used by directory only */ + fi->i_dir_level = F2FS_SB(sb)->dir_level; return &fi->vfs_inode; } static int f2fs_drop_inode(struct inode *inode) { + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + int ret; + + /* + * during filesystem shutdown, if checkpoint is disabled, + * drop useless meta/node dirty pages. + */ + if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) { + if (inode->i_ino == F2FS_NODE_INO(sbi) || + inode->i_ino == F2FS_META_INO(sbi)) { + trace_f2fs_drop_inode(inode, 1); + return 1; + } + } + /* * This is to avoid a deadlock condition like below. 
* writeback_single_inode(inode) @@ -192,9 +1799,84 @@ static int f2fs_drop_inode(struct inode *inode) * - f2fs_gc -> iput -> evict * - inode_wait_for_writeback(inode) */ - if (!inode_unhashed(inode) && inode->i_state & I_SYNC) + if ((!inode_unhashed(inode) && inode_state_read(inode) & I_SYNC)) { + if (!inode->i_nlink && !is_bad_inode(inode)) { + /* to avoid evict_inode call simultaneously */ + __iget(inode); + spin_unlock(&inode->i_lock); + + /* should remain fi->extent_tree for writepage */ + f2fs_destroy_extent_node(inode); + + sb_start_intwrite(inode->i_sb); + f2fs_i_size_write(inode, 0); + + f2fs_submit_merged_write_cond(F2FS_I_SB(inode), + inode, NULL, 0, DATA); + truncate_inode_pages_final(inode->i_mapping); + + if (F2FS_HAS_BLOCKS(inode)) + f2fs_truncate(inode); + + sb_end_intwrite(inode->i_sb); + + spin_lock(&inode->i_lock); + atomic_dec(&inode->i_count); + } + trace_f2fs_drop_inode(inode, 0); return 0; - return generic_drop_inode(inode); + } + ret = inode_generic_drop(inode); + if (!ret) + ret = fscrypt_drop_inode(inode); + trace_f2fs_drop_inode(inode, ret); + return ret; +} + +int f2fs_inode_dirtied(struct inode *inode, bool sync) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + int ret = 0; + + spin_lock(&sbi->inode_lock[DIRTY_META]); + if (is_inode_flag_set(inode, FI_DIRTY_INODE)) { + ret = 1; + } else { + set_inode_flag(inode, FI_DIRTY_INODE); + stat_inc_dirty_inode(sbi, DIRTY_META); + } + if (sync && list_empty(&F2FS_I(inode)->gdirty_list)) { + list_add_tail(&F2FS_I(inode)->gdirty_list, + &sbi->inode_list[DIRTY_META]); + inc_page_count(sbi, F2FS_DIRTY_IMETA); + } + spin_unlock(&sbi->inode_lock[DIRTY_META]); + + /* if atomic write is not committed, set inode w/ atomic dirty */ + if (!ret && f2fs_is_atomic_file(inode) && + !is_inode_flag_set(inode, FI_ATOMIC_COMMITTED)) + set_inode_flag(inode, FI_ATOMIC_DIRTIED); + + return ret; +} + +void f2fs_inode_synced(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + + 
spin_lock(&sbi->inode_lock[DIRTY_META]); + if (!is_inode_flag_set(inode, FI_DIRTY_INODE)) { + spin_unlock(&sbi->inode_lock[DIRTY_META]); + return; + } + if (!list_empty(&F2FS_I(inode)->gdirty_list)) { + list_del_init(&F2FS_I(inode)->gdirty_list); + dec_page_count(sbi, F2FS_DIRTY_IMETA); + } + clear_inode_flag(inode, FI_DIRTY_INODE); + clear_inode_flag(inode, FI_AUTO_RECOVER); + stat_dec_dirty_inode(F2FS_I_SB(inode), DIRTY_META); + spin_unlock(&sbi->inode_lock[DIRTY_META]); } /* @@ -204,127 +1886,475 @@ static int f2fs_drop_inode(struct inode *inode) */ static void f2fs_dirty_inode(struct inode *inode, int flags) { - set_inode_flag(F2FS_I(inode), FI_DIRTY_INODE); - return; + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + + if (inode->i_ino == F2FS_NODE_INO(sbi) || + inode->i_ino == F2FS_META_INO(sbi)) + return; + + if (is_inode_flag_set(inode, FI_AUTO_RECOVER)) + clear_inode_flag(inode, FI_AUTO_RECOVER); + + f2fs_inode_dirtied(inode, false); } -static void f2fs_i_callback(struct rcu_head *head) +static void f2fs_free_inode(struct inode *inode) { - struct inode *inode = container_of(head, struct inode, i_rcu); + fscrypt_free_inode(inode); kmem_cache_free(f2fs_inode_cachep, F2FS_I(inode)); } -static void f2fs_destroy_inode(struct inode *inode) +static void destroy_percpu_info(struct f2fs_sb_info *sbi) +{ + percpu_counter_destroy(&sbi->total_valid_inode_count); + percpu_counter_destroy(&sbi->rf_node_block_count); + percpu_counter_destroy(&sbi->alloc_valid_block_count); +} + +static void destroy_device_list(struct f2fs_sb_info *sbi) { - call_rcu(&inode->i_rcu, f2fs_i_callback); + int i; + + for (i = 0; i < sbi->s_ndevs; i++) { + if (i > 0) + bdev_fput(FDEV(i).bdev_file); +#ifdef CONFIG_BLK_DEV_ZONED + kvfree(FDEV(i).blkz_seq); +#endif + } + kvfree(sbi->devs); } static void f2fs_put_super(struct super_block *sb) { struct f2fs_sb_info *sbi = F2FS_SB(sb); + int i; + int err = 0; + bool done; - f2fs_destroy_stats(sbi); - stop_gc_thread(sbi); + /* unregister procfs/sysfs 
entries in advance to avoid race case */ + f2fs_unregister_sysfs(sbi); + + f2fs_quota_off_umount(sb); + + /* prevent remaining shrinker jobs */ + mutex_lock(&sbi->umount_mutex); + + /* + * flush all issued checkpoints and stop checkpoint issue thread. + * after then, all checkpoints should be done by each process context. + */ + f2fs_stop_ckpt_thread(sbi); + + /* + * We don't need to do checkpoint when superblock is clean. + * But, the previous checkpoint was not done by umount, it needs to do + * clean checkpoint again. + */ + if ((is_sbi_flag_set(sbi, SBI_IS_DIRTY) || + !is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG))) { + struct cp_control cpc = { + .reason = CP_UMOUNT, + }; + stat_inc_cp_call_count(sbi, TOTAL_CALL); + err = f2fs_write_checkpoint(sbi, &cpc); + } + + /* be sure to wait for any on-going discard commands */ + done = f2fs_issue_discard_timeout(sbi); + if (f2fs_realtime_discard_enable(sbi) && !sbi->discard_blks && done) { + struct cp_control cpc = { + .reason = CP_UMOUNT | CP_TRIMMED, + }; + stat_inc_cp_call_count(sbi, TOTAL_CALL); + err = f2fs_write_checkpoint(sbi, &cpc); + } + + /* + * normally superblock is clean, so we need to release this. + * In addition, EIO will skip do checkpoint, we need this as well. 
+ */ + f2fs_release_ino_entry(sbi, true); + + f2fs_leave_shrinker(sbi); + mutex_unlock(&sbi->umount_mutex); + + /* our cp_error case, we can wait for any writeback page */ + f2fs_flush_merged_writes(sbi); + + f2fs_wait_on_all_pages(sbi, F2FS_WB_CP_DATA); + + if (err || f2fs_cp_error(sbi)) { + truncate_inode_pages_final(NODE_MAPPING(sbi)); + truncate_inode_pages_final(META_MAPPING(sbi)); + } + + f2fs_bug_on(sbi, sbi->fsync_node_num); - write_checkpoint(sbi, true); + f2fs_destroy_compress_inode(sbi); iput(sbi->node_inode); + sbi->node_inode = NULL; + iput(sbi->meta_inode); + sbi->meta_inode = NULL; + + /* Should check the page counts after dropping all node/meta pages */ + for (i = 0; i < NR_COUNT_TYPE; i++) { + if (!get_pages(sbi, i)) + continue; + f2fs_err(sbi, "detect filesystem reference count leak during " + "umount, type: %d, count: %lld", i, get_pages(sbi, i)); + f2fs_bug_on(sbi, 1); + } + + /* + * iput() can update stat information, if f2fs_write_checkpoint() + * above failed with error. 
+ */ + f2fs_destroy_stats(sbi); /* destroy f2fs internal modules */ - destroy_node_manager(sbi); - destroy_segment_manager(sbi); + f2fs_destroy_node_manager(sbi); + f2fs_destroy_segment_manager(sbi); - kfree(sbi->ckpt); + /* flush s_error_work before sbi destroy */ + flush_work(&sbi->s_error_work); - sb->s_fs_info = NULL; - brelse(sbi->raw_super_buf); - kfree(sbi); + f2fs_destroy_post_read_wq(sbi); + + kvfree(sbi->ckpt); + + kfree(sbi->raw_super); + + f2fs_destroy_page_array_cache(sbi); +#ifdef CONFIG_QUOTA + for (i = 0; i < MAXQUOTAS; i++) + kfree(F2FS_OPTION(sbi).s_qf_names[i]); +#endif + fscrypt_free_dummy_policy(&F2FS_OPTION(sbi).dummy_enc_policy); + destroy_percpu_info(sbi); + f2fs_destroy_iostat(sbi); + for (i = 0; i < NR_PAGE_TYPE; i++) + kfree(sbi->write_io[i]); +#if IS_ENABLED(CONFIG_UNICODE) + utf8_unload(sb->s_encoding); +#endif } int f2fs_sync_fs(struct super_block *sb, int sync) { struct f2fs_sb_info *sbi = F2FS_SB(sb); + int err = 0; + + if (unlikely(f2fs_cp_error(sbi))) + return 0; + if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) + return 0; trace_f2fs_sync_fs(sb, sync); - if (!sbi->s_dirty && !get_pages(sbi, F2FS_DIRTY_NODES)) - return 0; + if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) + return -EAGAIN; if (sync) { - mutex_lock(&sbi->gc_mutex); - write_checkpoint(sbi, false); - mutex_unlock(&sbi->gc_mutex); - } else { - f2fs_balance_fs(sbi); + stat_inc_cp_call_count(sbi, TOTAL_CALL); + err = f2fs_issue_checkpoint(sbi); } - return 0; + return err; } static int f2fs_freeze(struct super_block *sb) { - int err; + struct f2fs_sb_info *sbi = F2FS_SB(sb); if (f2fs_readonly(sb)) return 0; - err = f2fs_sync_fs(sb, 1); - return err; + /* IO error happened before */ + if (unlikely(f2fs_cp_error(sbi))) + return -EIO; + + /* must be clean, since sync_filesystem() was already called */ + if (is_sbi_flag_set(sbi, SBI_IS_DIRTY)) + return -EINVAL; + + sbi->umount_lock_holder = current; + + /* Let's flush checkpoints and stop the thread. 
*/ + f2fs_flush_ckpt_thread(sbi); + + sbi->umount_lock_holder = NULL; + + /* to avoid deadlock on f2fs_evict_inode->SB_FREEZE_FS */ + set_sbi_flag(sbi, SBI_IS_FREEZING); + return 0; } static int f2fs_unfreeze(struct super_block *sb) { + struct f2fs_sb_info *sbi = F2FS_SB(sb); + + /* + * It will update discard_max_bytes of mounted lvm device to zero + * after creating snapshot on this lvm device, let's drop all + * remained discards. + * We don't need to disable real-time discard because discard_max_bytes + * will recover after removal of snapshot. + */ + if (test_opt(sbi, DISCARD) && !f2fs_hw_support_discard(sbi)) + f2fs_issue_discard_timeout(sbi); + + clear_sbi_flag(F2FS_SB(sb), SBI_IS_FREEZING); + return 0; +} + +#ifdef CONFIG_QUOTA +static int f2fs_statfs_project(struct super_block *sb, + kprojid_t projid, struct kstatfs *buf) +{ + struct kqid qid; + struct dquot *dquot; + u64 limit; + u64 curblock; + + qid = make_kqid_projid(projid); + dquot = dqget(sb, qid); + if (IS_ERR(dquot)) + return PTR_ERR(dquot); + spin_lock(&dquot->dq_dqb_lock); + + limit = min_not_zero(dquot->dq_dqb.dqb_bsoftlimit, + dquot->dq_dqb.dqb_bhardlimit); + limit >>= sb->s_blocksize_bits; + + if (limit) { + uint64_t remaining = 0; + + curblock = (dquot->dq_dqb.dqb_curspace + + dquot->dq_dqb.dqb_rsvspace) >> sb->s_blocksize_bits; + if (limit > curblock) + remaining = limit - curblock; + + buf->f_blocks = min(buf->f_blocks, limit); + buf->f_bfree = min(buf->f_bfree, remaining); + buf->f_bavail = min(buf->f_bavail, remaining); + } + + limit = min_not_zero(dquot->dq_dqb.dqb_isoftlimit, + dquot->dq_dqb.dqb_ihardlimit); + + if (limit) { + uint64_t remaining = 0; + + if (limit > dquot->dq_dqb.dqb_curinodes) + remaining = limit - dquot->dq_dqb.dqb_curinodes; + + buf->f_files = min(buf->f_files, limit); + buf->f_ffree = min(buf->f_ffree, remaining); + } + + spin_unlock(&dquot->dq_dqb_lock); + dqput(dquot); return 0; } +#endif static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct 
super_block *sb = dentry->d_sb; struct f2fs_sb_info *sbi = F2FS_SB(sb); u64 id = huge_encode_dev(sb->s_bdev->bd_dev); - block_t total_count, user_block_count, start_count, ovp_count; + block_t total_count, user_block_count, start_count; + u64 avail_node_count; + unsigned int total_valid_node_count; total_count = le64_to_cpu(sbi->raw_super->block_count); - user_block_count = sbi->user_block_count; start_count = le32_to_cpu(sbi->raw_super->segment0_blkaddr); - ovp_count = SM_I(sbi)->ovp_segments << sbi->log_blocks_per_seg; buf->f_type = F2FS_SUPER_MAGIC; buf->f_bsize = sbi->blocksize; buf->f_blocks = total_count - start_count; - buf->f_bfree = buf->f_blocks - valid_user_blocks(sbi) - ovp_count; - buf->f_bavail = user_block_count - valid_user_blocks(sbi); - buf->f_files = sbi->total_node_count; - buf->f_ffree = sbi->total_node_count - valid_inode_count(sbi); + spin_lock(&sbi->stat_lock); + if (sbi->carve_out) + buf->f_blocks -= sbi->current_reserved_blocks; + user_block_count = sbi->user_block_count; + total_valid_node_count = valid_node_count(sbi); + avail_node_count = sbi->total_node_count - F2FS_RESERVED_NODE_NUM; + buf->f_bfree = user_block_count - valid_user_blocks(sbi) - + sbi->current_reserved_blocks; + + if (unlikely(buf->f_bfree <= sbi->unusable_block_count)) + buf->f_bfree = 0; + else + buf->f_bfree -= sbi->unusable_block_count; + spin_unlock(&sbi->stat_lock); + + if (buf->f_bfree > F2FS_OPTION(sbi).root_reserved_blocks) + buf->f_bavail = buf->f_bfree - + F2FS_OPTION(sbi).root_reserved_blocks; + else + buf->f_bavail = 0; + + if (avail_node_count > user_block_count) { + buf->f_files = user_block_count; + buf->f_ffree = buf->f_bavail; + } else { + buf->f_files = avail_node_count; + buf->f_ffree = min(avail_node_count - total_valid_node_count, + buf->f_bavail); + } buf->f_namelen = F2FS_NAME_LEN; - buf->f_fsid.val[0] = (u32)id; - buf->f_fsid.val[1] = (u32)(id >> 32); + buf->f_fsid = u64_to_fsid(id); +#ifdef CONFIG_QUOTA + if (is_inode_flag_set(d_inode(dentry), 
FI_PROJ_INHERIT) && + sb_has_quota_limits_enabled(sb, PRJQUOTA)) { + f2fs_statfs_project(sb, F2FS_I(d_inode(dentry))->i_projid, buf); + } +#endif return 0; } +static inline void f2fs_show_quota_options(struct seq_file *seq, + struct super_block *sb) +{ +#ifdef CONFIG_QUOTA + struct f2fs_sb_info *sbi = F2FS_SB(sb); + + if (F2FS_OPTION(sbi).s_jquota_fmt) { + char *fmtname = ""; + + switch (F2FS_OPTION(sbi).s_jquota_fmt) { + case QFMT_VFS_OLD: + fmtname = "vfsold"; + break; + case QFMT_VFS_V0: + fmtname = "vfsv0"; + break; + case QFMT_VFS_V1: + fmtname = "vfsv1"; + break; + } + seq_printf(seq, ",jqfmt=%s", fmtname); + } + + if (F2FS_OPTION(sbi).s_qf_names[USRQUOTA]) + seq_show_option(seq, "usrjquota", + F2FS_OPTION(sbi).s_qf_names[USRQUOTA]); + + if (F2FS_OPTION(sbi).s_qf_names[GRPQUOTA]) + seq_show_option(seq, "grpjquota", + F2FS_OPTION(sbi).s_qf_names[GRPQUOTA]); + + if (F2FS_OPTION(sbi).s_qf_names[PRJQUOTA]) + seq_show_option(seq, "prjjquota", + F2FS_OPTION(sbi).s_qf_names[PRJQUOTA]); +#endif +} + +#ifdef CONFIG_F2FS_FS_COMPRESSION +static inline void f2fs_show_compress_options(struct seq_file *seq, + struct super_block *sb) +{ + struct f2fs_sb_info *sbi = F2FS_SB(sb); + char *algtype = ""; + int i; + + if (!f2fs_sb_has_compression(sbi)) + return; + + switch (F2FS_OPTION(sbi).compress_algorithm) { + case COMPRESS_LZO: + algtype = "lzo"; + break; + case COMPRESS_LZ4: + algtype = "lz4"; + break; + case COMPRESS_ZSTD: + algtype = "zstd"; + break; + case COMPRESS_LZORLE: + algtype = "lzo-rle"; + break; + } + seq_printf(seq, ",compress_algorithm=%s", algtype); + + if (F2FS_OPTION(sbi).compress_level) + seq_printf(seq, ":%d", F2FS_OPTION(sbi).compress_level); + + seq_printf(seq, ",compress_log_size=%u", + F2FS_OPTION(sbi).compress_log_size); + + for (i = 0; i < F2FS_OPTION(sbi).compress_ext_cnt; i++) { + seq_printf(seq, ",compress_extension=%s", + F2FS_OPTION(sbi).extensions[i]); + } + + for (i = 0; i < F2FS_OPTION(sbi).nocompress_ext_cnt; i++) { + seq_printf(seq, 
",nocompress_extension=%s", + F2FS_OPTION(sbi).noextensions[i]); + } + + if (F2FS_OPTION(sbi).compress_chksum) + seq_puts(seq, ",compress_chksum"); + + if (F2FS_OPTION(sbi).compress_mode == COMPR_MODE_FS) + seq_printf(seq, ",compress_mode=%s", "fs"); + else if (F2FS_OPTION(sbi).compress_mode == COMPR_MODE_USER) + seq_printf(seq, ",compress_mode=%s", "user"); + + if (test_opt(sbi, COMPRESS_CACHE)) + seq_puts(seq, ",compress_cache"); +} +#endif + static int f2fs_show_options(struct seq_file *seq, struct dentry *root) { struct f2fs_sb_info *sbi = F2FS_SB(root->d_sb); - if (!(root->d_sb->s_flags & MS_RDONLY) && test_opt(sbi, BG_GC)) + if (F2FS_OPTION(sbi).bggc_mode == BGGC_MODE_SYNC) + seq_printf(seq, ",background_gc=%s", "sync"); + else if (F2FS_OPTION(sbi).bggc_mode == BGGC_MODE_ON) seq_printf(seq, ",background_gc=%s", "on"); - else + else if (F2FS_OPTION(sbi).bggc_mode == BGGC_MODE_OFF) seq_printf(seq, ",background_gc=%s", "off"); + + if (test_opt(sbi, GC_MERGE)) + seq_puts(seq, ",gc_merge"); + else + seq_puts(seq, ",nogc_merge"); + if (test_opt(sbi, DISABLE_ROLL_FORWARD)) seq_puts(seq, ",disable_roll_forward"); - if (test_opt(sbi, DISCARD)) + if (test_opt(sbi, NORECOVERY)) + seq_puts(seq, ",norecovery"); + if (test_opt(sbi, DISCARD)) { seq_puts(seq, ",discard"); - if (test_opt(sbi, NOHEAP)) - seq_puts(seq, ",no_heap_alloc"); + if (F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_BLOCK) + seq_printf(seq, ",discard_unit=%s", "block"); + else if (F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_SEGMENT) + seq_printf(seq, ",discard_unit=%s", "segment"); + else if (F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_SECTION) + seq_printf(seq, ",discard_unit=%s", "section"); + } else { + seq_puts(seq, ",nodiscard"); + } #ifdef CONFIG_F2FS_FS_XATTR if (test_opt(sbi, XATTR_USER)) seq_puts(seq, ",user_xattr"); else seq_puts(seq, ",nouser_xattr"); + if (test_opt(sbi, INLINE_XATTR)) + seq_puts(seq, ",inline_xattr"); + else + seq_puts(seq, ",noinline_xattr"); + if (test_opt(sbi, 
INLINE_XATTR_SIZE)) + seq_printf(seq, ",inline_xattr_size=%u", + F2FS_OPTION(sbi).inline_xattr_size); #endif #ifdef CONFIG_F2FS_FS_POSIX_ACL if (test_opt(sbi, POSIX_ACL)) @@ -334,87 +2364,1355 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) #endif if (test_opt(sbi, DISABLE_EXT_IDENTIFY)) seq_puts(seq, ",disable_ext_identify"); + if (test_opt(sbi, INLINE_DATA)) + seq_puts(seq, ",inline_data"); + else + seq_puts(seq, ",noinline_data"); + if (test_opt(sbi, INLINE_DENTRY)) + seq_puts(seq, ",inline_dentry"); + else + seq_puts(seq, ",noinline_dentry"); + if (test_opt(sbi, FLUSH_MERGE)) + seq_puts(seq, ",flush_merge"); + else + seq_puts(seq, ",noflush_merge"); + if (test_opt(sbi, NOBARRIER)) + seq_puts(seq, ",nobarrier"); + else + seq_puts(seq, ",barrier"); + if (test_opt(sbi, FASTBOOT)) + seq_puts(seq, ",fastboot"); + if (test_opt(sbi, READ_EXTENT_CACHE)) + seq_puts(seq, ",extent_cache"); + else + seq_puts(seq, ",noextent_cache"); + if (test_opt(sbi, AGE_EXTENT_CACHE)) + seq_puts(seq, ",age_extent_cache"); + if (test_opt(sbi, DATA_FLUSH)) + seq_puts(seq, ",data_flush"); + + seq_puts(seq, ",mode="); + if (F2FS_OPTION(sbi).fs_mode == FS_MODE_ADAPTIVE) + seq_puts(seq, "adaptive"); + else if (F2FS_OPTION(sbi).fs_mode == FS_MODE_LFS) + seq_puts(seq, "lfs"); + else if (F2FS_OPTION(sbi).fs_mode == FS_MODE_FRAGMENT_SEG) + seq_puts(seq, "fragment:segment"); + else if (F2FS_OPTION(sbi).fs_mode == FS_MODE_FRAGMENT_BLK) + seq_puts(seq, "fragment:block"); + seq_printf(seq, ",active_logs=%u", F2FS_OPTION(sbi).active_logs); + if (test_opt(sbi, RESERVE_ROOT) || test_opt(sbi, RESERVE_NODE)) + seq_printf(seq, ",reserve_root=%u,reserve_node=%u,resuid=%u," + "resgid=%u", + F2FS_OPTION(sbi).root_reserved_blocks, + F2FS_OPTION(sbi).root_reserved_nodes, + from_kuid_munged(&init_user_ns, + F2FS_OPTION(sbi).s_resuid), + from_kgid_munged(&init_user_ns, + F2FS_OPTION(sbi).s_resgid)); +#ifdef CONFIG_F2FS_FAULT_INJECTION + if (test_opt(sbi, FAULT_INJECTION)) { + 
seq_printf(seq, ",fault_injection=%u", + F2FS_OPTION(sbi).fault_info.inject_rate); + seq_printf(seq, ",fault_type=%u", + F2FS_OPTION(sbi).fault_info.inject_type); + } +#endif +#ifdef CONFIG_QUOTA + if (test_opt(sbi, QUOTA)) + seq_puts(seq, ",quota"); + if (test_opt(sbi, USRQUOTA)) + seq_puts(seq, ",usrquota"); + if (test_opt(sbi, GRPQUOTA)) + seq_puts(seq, ",grpquota"); + if (test_opt(sbi, PRJQUOTA)) + seq_puts(seq, ",prjquota"); +#endif + f2fs_show_quota_options(seq, sbi->sb); + + fscrypt_show_test_dummy_encryption(seq, ',', sbi->sb); + + if (sbi->sb->s_flags & SB_INLINECRYPT) + seq_puts(seq, ",inlinecrypt"); + + if (F2FS_OPTION(sbi).alloc_mode == ALLOC_MODE_DEFAULT) + seq_printf(seq, ",alloc_mode=%s", "default"); + else if (F2FS_OPTION(sbi).alloc_mode == ALLOC_MODE_REUSE) + seq_printf(seq, ",alloc_mode=%s", "reuse"); + + if (test_opt(sbi, DISABLE_CHECKPOINT)) + seq_printf(seq, ",checkpoint=disable:%u", + F2FS_OPTION(sbi).unusable_cap); + if (test_opt(sbi, MERGE_CHECKPOINT)) + seq_puts(seq, ",checkpoint_merge"); + else + seq_puts(seq, ",nocheckpoint_merge"); + if (F2FS_OPTION(sbi).fsync_mode == FSYNC_MODE_POSIX) + seq_printf(seq, ",fsync_mode=%s", "posix"); + else if (F2FS_OPTION(sbi).fsync_mode == FSYNC_MODE_STRICT) + seq_printf(seq, ",fsync_mode=%s", "strict"); + else if (F2FS_OPTION(sbi).fsync_mode == FSYNC_MODE_NOBARRIER) + seq_printf(seq, ",fsync_mode=%s", "nobarrier"); + +#ifdef CONFIG_F2FS_FS_COMPRESSION + f2fs_show_compress_options(seq, sbi->sb); +#endif + + if (test_opt(sbi, ATGC)) + seq_puts(seq, ",atgc"); + + if (F2FS_OPTION(sbi).memory_mode == MEMORY_MODE_NORMAL) + seq_printf(seq, ",memory=%s", "normal"); + else if (F2FS_OPTION(sbi).memory_mode == MEMORY_MODE_LOW) + seq_printf(seq, ",memory=%s", "low"); - seq_printf(seq, ",active_logs=%u", sbi->active_logs); + if (F2FS_OPTION(sbi).errors == MOUNT_ERRORS_READONLY) + seq_printf(seq, ",errors=%s", "remount-ro"); + else if (F2FS_OPTION(sbi).errors == MOUNT_ERRORS_CONTINUE) + seq_printf(seq, ",errors=%s", 
"continue"); + else if (F2FS_OPTION(sbi).errors == MOUNT_ERRORS_PANIC) + seq_printf(seq, ",errors=%s", "panic"); + + if (test_opt(sbi, NAT_BITS)) + seq_puts(seq, ",nat_bits"); + + if (F2FS_OPTION(sbi).lookup_mode == LOOKUP_PERF) + seq_show_option(seq, "lookup_mode", "perf"); + else if (F2FS_OPTION(sbi).lookup_mode == LOOKUP_COMPAT) + seq_show_option(seq, "lookup_mode", "compat"); + else if (F2FS_OPTION(sbi).lookup_mode == LOOKUP_AUTO) + seq_show_option(seq, "lookup_mode", "auto"); return 0; } -static int f2fs_remount(struct super_block *sb, int *flags, char *data) +static void default_options(struct f2fs_sb_info *sbi, bool remount) +{ + /* init some FS parameters */ + if (!remount) { + set_opt(sbi, READ_EXTENT_CACHE); + clear_opt(sbi, DISABLE_CHECKPOINT); + + if (f2fs_hw_support_discard(sbi) || f2fs_hw_should_discard(sbi)) + set_opt(sbi, DISCARD); + + if (f2fs_sb_has_blkzoned(sbi)) + F2FS_OPTION(sbi).discard_unit = DISCARD_UNIT_SECTION; + else + F2FS_OPTION(sbi).discard_unit = DISCARD_UNIT_BLOCK; + } + + if (f2fs_sb_has_readonly(sbi)) + F2FS_OPTION(sbi).active_logs = NR_CURSEG_RO_TYPE; + else + F2FS_OPTION(sbi).active_logs = NR_CURSEG_PERSIST_TYPE; + + F2FS_OPTION(sbi).inline_xattr_size = DEFAULT_INLINE_XATTR_ADDRS; + if (le32_to_cpu(F2FS_RAW_SUPER(sbi)->segment_count_main) <= + SMALL_VOLUME_SEGMENTS) + F2FS_OPTION(sbi).alloc_mode = ALLOC_MODE_REUSE; + else + F2FS_OPTION(sbi).alloc_mode = ALLOC_MODE_DEFAULT; + F2FS_OPTION(sbi).fsync_mode = FSYNC_MODE_POSIX; + F2FS_OPTION(sbi).s_resuid = make_kuid(&init_user_ns, F2FS_DEF_RESUID); + F2FS_OPTION(sbi).s_resgid = make_kgid(&init_user_ns, F2FS_DEF_RESGID); + if (f2fs_sb_has_compression(sbi)) { + F2FS_OPTION(sbi).compress_algorithm = COMPRESS_LZ4; + F2FS_OPTION(sbi).compress_log_size = MIN_COMPRESS_LOG_SIZE; + F2FS_OPTION(sbi).compress_ext_cnt = 0; + F2FS_OPTION(sbi).compress_mode = COMPR_MODE_FS; + } + F2FS_OPTION(sbi).bggc_mode = BGGC_MODE_ON; + F2FS_OPTION(sbi).memory_mode = MEMORY_MODE_NORMAL; + 
F2FS_OPTION(sbi).errors = MOUNT_ERRORS_CONTINUE; + + set_opt(sbi, INLINE_XATTR); + set_opt(sbi, INLINE_DATA); + set_opt(sbi, INLINE_DENTRY); + set_opt(sbi, MERGE_CHECKPOINT); + set_opt(sbi, LAZYTIME); + F2FS_OPTION(sbi).unusable_cap = 0; + if (!f2fs_is_readonly(sbi)) + set_opt(sbi, FLUSH_MERGE); + if (f2fs_sb_has_blkzoned(sbi)) + F2FS_OPTION(sbi).fs_mode = FS_MODE_LFS; + else + F2FS_OPTION(sbi).fs_mode = FS_MODE_ADAPTIVE; + +#ifdef CONFIG_F2FS_FS_XATTR + set_opt(sbi, XATTR_USER); +#endif +#ifdef CONFIG_F2FS_FS_POSIX_ACL + set_opt(sbi, POSIX_ACL); +#endif + + f2fs_build_fault_attr(sbi, 0, 0, FAULT_ALL); + + F2FS_OPTION(sbi).lookup_mode = LOOKUP_PERF; +} + +#ifdef CONFIG_QUOTA +static int f2fs_enable_quotas(struct super_block *sb); +#endif + +static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi) +{ + unsigned int s_flags = sbi->sb->s_flags; + struct cp_control cpc; + unsigned int gc_mode = sbi->gc_mode; + int err = 0; + int ret; + block_t unusable; + + if (s_flags & SB_RDONLY) { + f2fs_err(sbi, "checkpoint=disable on readonly fs"); + return -EINVAL; + } + sbi->sb->s_flags |= SB_ACTIVE; + + /* check if we need more GC first */ + unusable = f2fs_get_unusable_blocks(sbi); + if (!f2fs_disable_cp_again(sbi, unusable)) + goto skip_gc; + + f2fs_update_time(sbi, DISABLE_TIME); + + sbi->gc_mode = GC_URGENT_HIGH; + + while (!f2fs_time_over(sbi, DISABLE_TIME)) { + struct f2fs_gc_control gc_control = { + .victim_segno = NULL_SEGNO, + .init_gc_type = FG_GC, + .should_migrate_blocks = false, + .err_gc_skipped = true, + .no_bg_gc = true, + .nr_free_secs = 1 }; + + f2fs_down_write(&sbi->gc_lock); + stat_inc_gc_call_count(sbi, FOREGROUND); + err = f2fs_gc(sbi, &gc_control); + if (err == -ENODATA) { + err = 0; + break; + } + if (err && err != -EAGAIN) + break; + } + + ret = sync_filesystem(sbi->sb); + if (ret || err) { + err = ret ? 
ret : err; + goto restore_flag; + } + + unusable = f2fs_get_unusable_blocks(sbi); + if (f2fs_disable_cp_again(sbi, unusable)) { + err = -EAGAIN; + goto restore_flag; + } + +skip_gc: + f2fs_down_write(&sbi->gc_lock); + cpc.reason = CP_PAUSE; + set_sbi_flag(sbi, SBI_CP_DISABLED); + stat_inc_cp_call_count(sbi, TOTAL_CALL); + err = f2fs_write_checkpoint(sbi, &cpc); + if (err) + goto out_unlock; + + spin_lock(&sbi->stat_lock); + sbi->unusable_block_count = unusable; + spin_unlock(&sbi->stat_lock); + +out_unlock: + f2fs_up_write(&sbi->gc_lock); +restore_flag: + sbi->gc_mode = gc_mode; + sbi->sb->s_flags = s_flags; /* Restore SB_RDONLY status */ + f2fs_info(sbi, "f2fs_disable_checkpoint() finish, err:%d", err); + return err; +} + +static int f2fs_enable_checkpoint(struct f2fs_sb_info *sbi) +{ + unsigned int nr_pages = get_pages(sbi, F2FS_DIRTY_DATA) / 16; + long long start, writeback, lock, sync_inode, end; + int ret; + + f2fs_info(sbi, "%s start, meta: %lld, node: %lld, data: %lld", + __func__, + get_pages(sbi, F2FS_DIRTY_META), + get_pages(sbi, F2FS_DIRTY_NODES), + get_pages(sbi, F2FS_DIRTY_DATA)); + + f2fs_update_time(sbi, ENABLE_TIME); + + start = ktime_get(); + + /* we should flush all the data to keep data consistency */ + while (get_pages(sbi, F2FS_DIRTY_DATA)) { + writeback_inodes_sb_nr(sbi->sb, nr_pages, WB_REASON_SYNC); + f2fs_io_schedule_timeout(DEFAULT_SCHEDULE_TIMEOUT); + + if (f2fs_time_over(sbi, ENABLE_TIME)) + break; + } + writeback = ktime_get(); + + f2fs_down_write(&sbi->cp_enable_rwsem); + + lock = ktime_get(); + + if (get_pages(sbi, F2FS_DIRTY_DATA)) + sync_inodes_sb(sbi->sb); + + if (unlikely(get_pages(sbi, F2FS_DIRTY_DATA))) + f2fs_warn(sbi, "%s: has some unwritten data: %lld", + __func__, get_pages(sbi, F2FS_DIRTY_DATA)); + + sync_inode = ktime_get(); + + f2fs_down_write(&sbi->gc_lock); + f2fs_dirty_to_prefree(sbi); + + clear_sbi_flag(sbi, SBI_CP_DISABLED); + set_sbi_flag(sbi, SBI_IS_DIRTY); + f2fs_up_write(&sbi->gc_lock); + + f2fs_info(sbi, "%s 
sync_fs, meta: %lld, imeta: %lld, node: %lld, dents: %lld, qdata: %lld", + __func__, + get_pages(sbi, F2FS_DIRTY_META), + get_pages(sbi, F2FS_DIRTY_IMETA), + get_pages(sbi, F2FS_DIRTY_NODES), + get_pages(sbi, F2FS_DIRTY_DENTS), + get_pages(sbi, F2FS_DIRTY_QDATA)); + ret = f2fs_sync_fs(sbi->sb, 1); + if (ret) + f2fs_err(sbi, "%s sync_fs failed, ret: %d", __func__, ret); + + /* Let's ensure there's no pending checkpoint anymore */ + f2fs_flush_ckpt_thread(sbi); + + f2fs_up_write(&sbi->cp_enable_rwsem); + + end = ktime_get(); + + f2fs_info(sbi, "%s end, writeback:%llu, " + "lock:%llu, sync_inode:%llu, sync_fs:%llu", + __func__, + ktime_ms_delta(writeback, start), + ktime_ms_delta(lock, writeback), + ktime_ms_delta(sync_inode, lock), + ktime_ms_delta(end, sync_inode)); + return ret; +} + +static int __f2fs_remount(struct fs_context *fc, struct super_block *sb) { struct f2fs_sb_info *sbi = F2FS_SB(sb); struct f2fs_mount_info org_mount_opt; - int err, active_logs; + unsigned long old_sb_flags; + unsigned int flags = fc->sb_flags; + int err; + bool need_restart_gc = false, need_stop_gc = false; + bool need_restart_flush = false, need_stop_flush = false; + bool need_restart_discard = false, need_stop_discard = false; + bool need_enable_checkpoint = false, need_disable_checkpoint = false; + bool no_read_extent_cache = !test_opt(sbi, READ_EXTENT_CACHE); + bool no_age_extent_cache = !test_opt(sbi, AGE_EXTENT_CACHE); + bool enable_checkpoint = !test_opt(sbi, DISABLE_CHECKPOINT); + bool no_atgc = !test_opt(sbi, ATGC); + bool no_discard = !test_opt(sbi, DISCARD); + bool no_compress_cache = !test_opt(sbi, COMPRESS_CACHE); + bool block_unit_discard = f2fs_block_unit_discard(sbi); + bool no_nat_bits = !test_opt(sbi, NAT_BITS); +#ifdef CONFIG_QUOTA + int i, j; +#endif /* * Save the old mount options in case we * need to restore them. 
*/ org_mount_opt = sbi->mount_opt; - active_logs = sbi->active_logs; + old_sb_flags = sb->s_flags; + + sbi->umount_lock_holder = current; + +#ifdef CONFIG_QUOTA + org_mount_opt.s_jquota_fmt = F2FS_OPTION(sbi).s_jquota_fmt; + for (i = 0; i < MAXQUOTAS; i++) { + if (F2FS_OPTION(sbi).s_qf_names[i]) { + org_mount_opt.s_qf_names[i] = + kstrdup(F2FS_OPTION(sbi).s_qf_names[i], + GFP_KERNEL); + if (!org_mount_opt.s_qf_names[i]) { + for (j = 0; j < i; j++) + kfree(org_mount_opt.s_qf_names[j]); + return -ENOMEM; + } + } else { + org_mount_opt.s_qf_names[i] = NULL; + } + } +#endif - /* parse mount options */ - err = parse_options(sb, data); + /* recover superblocks we couldn't write due to previous RO mount */ + if (!(flags & SB_RDONLY) && is_sbi_flag_set(sbi, SBI_NEED_SB_WRITE)) { + err = f2fs_commit_super(sbi, false); + f2fs_info(sbi, "Try to recover all the superblocks, ret: %d", + err); + if (!err) + clear_sbi_flag(sbi, SBI_NEED_SB_WRITE); + } + + default_options(sbi, true); + + err = f2fs_check_opt_consistency(fc, sb); + if (err) + goto restore_opts; + + f2fs_apply_options(fc, sb); + + err = f2fs_sanity_check_options(sbi, true); if (err) goto restore_opts; + /* flush outstanding errors before changing fs state */ + flush_work(&sbi->s_error_work); + /* * Previous and new state of filesystem is RO, - * so no point in checking GC conditions. + * so skip checking GC and FLUSH_MERGE conditions. 
*/ - if ((sb->s_flags & MS_RDONLY) && (*flags & MS_RDONLY)) + if (f2fs_readonly(sb) && (flags & SB_RDONLY)) goto skip; + if (f2fs_dev_is_readonly(sbi) && !(flags & SB_RDONLY)) { + err = -EROFS; + goto restore_opts; + } + +#ifdef CONFIG_QUOTA + if (!f2fs_readonly(sb) && (flags & SB_RDONLY)) { + err = dquot_suspend(sb, -1); + if (err < 0) + goto restore_opts; + } else if (f2fs_readonly(sb) && !(flags & SB_RDONLY)) { + /* dquot_resume needs RW */ + sb->s_flags &= ~SB_RDONLY; + if (sb_any_quota_suspended(sb)) { + dquot_resume(sb, -1); + } else if (f2fs_sb_has_quota_ino(sbi)) { + err = f2fs_enable_quotas(sb); + if (err) + goto restore_opts; + } + } +#endif + /* disallow enable atgc dynamically */ + if (no_atgc == !!test_opt(sbi, ATGC)) { + err = -EINVAL; + f2fs_warn(sbi, "switch atgc option is not allowed"); + goto restore_opts; + } + + /* disallow enable/disable extent_cache dynamically */ + if (no_read_extent_cache == !!test_opt(sbi, READ_EXTENT_CACHE)) { + err = -EINVAL; + f2fs_warn(sbi, "switch extent_cache option is not allowed"); + goto restore_opts; + } + /* disallow enable/disable age extent_cache dynamically */ + if (no_age_extent_cache == !!test_opt(sbi, AGE_EXTENT_CACHE)) { + err = -EINVAL; + f2fs_warn(sbi, "switch age_extent_cache option is not allowed"); + goto restore_opts; + } + + if (no_compress_cache == !!test_opt(sbi, COMPRESS_CACHE)) { + err = -EINVAL; + f2fs_warn(sbi, "switch compress_cache option is not allowed"); + goto restore_opts; + } + + if (block_unit_discard != f2fs_block_unit_discard(sbi)) { + err = -EINVAL; + f2fs_warn(sbi, "switch discard_unit option is not allowed"); + goto restore_opts; + } + + if (no_nat_bits == !!test_opt(sbi, NAT_BITS)) { + err = -EINVAL; + f2fs_warn(sbi, "switch nat_bits option is not allowed"); + goto restore_opts; + } + + if ((flags & SB_RDONLY) && test_opt(sbi, DISABLE_CHECKPOINT)) { + err = -EINVAL; + f2fs_warn(sbi, "disabling checkpoint not compatible with read-only"); + goto restore_opts; + } + /* * We stop the 
GC thread if FS is mounted as RO * or if background_gc = off is passed in mount * option. Also sync the filesystem. */ - if ((*flags & MS_RDONLY) || !test_opt(sbi, BG_GC)) { + if ((flags & SB_RDONLY) || + (F2FS_OPTION(sbi).bggc_mode == BGGC_MODE_OFF && + !test_opt(sbi, GC_MERGE))) { if (sbi->gc_thread) { - stop_gc_thread(sbi); - f2fs_sync_fs(sb, 1); + f2fs_stop_gc_thread(sbi); + need_restart_gc = true; } - } else if (test_opt(sbi, BG_GC) && !sbi->gc_thread) { - err = start_gc_thread(sbi); + } else if (!sbi->gc_thread) { + err = f2fs_start_gc_thread(sbi); if (err) goto restore_opts; + need_stop_gc = true; + } + + if (flags & SB_RDONLY) { + sync_inodes_sb(sb); + + set_sbi_flag(sbi, SBI_IS_DIRTY); + set_sbi_flag(sbi, SBI_IS_CLOSE); + f2fs_sync_fs(sb, 1); + clear_sbi_flag(sbi, SBI_IS_CLOSE); + } + + /* + * We stop issue flush thread if FS is mounted as RO + * or if flush_merge is not passed in mount option. + */ + if ((flags & SB_RDONLY) || !test_opt(sbi, FLUSH_MERGE)) { + clear_opt(sbi, FLUSH_MERGE); + f2fs_destroy_flush_cmd_control(sbi, false); + need_restart_flush = true; + } else { + err = f2fs_create_flush_cmd_control(sbi); + if (err) + goto restore_gc; + need_stop_flush = true; + } + + if (no_discard == !!test_opt(sbi, DISCARD)) { + if (test_opt(sbi, DISCARD)) { + err = f2fs_start_discard_thread(sbi); + if (err) + goto restore_flush; + need_stop_discard = true; + } else { + f2fs_stop_discard_thread(sbi); + f2fs_issue_discard_timeout(sbi); + need_restart_discard = true; + } + } + + adjust_unusable_cap_perc(sbi); + if (enable_checkpoint == !!test_opt(sbi, DISABLE_CHECKPOINT)) { + if (test_opt(sbi, DISABLE_CHECKPOINT)) { + err = f2fs_disable_checkpoint(sbi); + if (err) + goto restore_discard; + need_enable_checkpoint = true; + } else { + err = f2fs_enable_checkpoint(sbi); + if (err) + goto restore_discard; + need_disable_checkpoint = true; + } + } + + /* + * Place this routine at the end, since a new checkpoint would be + * triggered while remount and we need to 
take care of it before + * returning from remount. + */ + if ((flags & SB_RDONLY) || test_opt(sbi, DISABLE_CHECKPOINT) || + !test_opt(sbi, MERGE_CHECKPOINT)) { + f2fs_stop_ckpt_thread(sbi); + } else { + /* Flush if the previous checkpoint, if exists. */ + f2fs_flush_ckpt_thread(sbi); + + err = f2fs_start_ckpt_thread(sbi); + if (err) { + f2fs_err(sbi, + "Failed to start F2FS issue_checkpoint_thread (%d)", + err); + goto restore_checkpoint; + } } + skip: +#ifdef CONFIG_QUOTA + /* Release old quota file names */ + for (i = 0; i < MAXQUOTAS; i++) + kfree(org_mount_opt.s_qf_names[i]); +#endif /* Update the POSIXACL Flag */ - sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | - (test_opt(sbi, POSIX_ACL) ? MS_POSIXACL : 0); - return 0; + sb->s_flags = (sb->s_flags & ~SB_POSIXACL) | + (test_opt(sbi, POSIX_ACL) ? SB_POSIXACL : 0); + limit_reserve_root(sbi); + fc->sb_flags = (flags & ~SB_LAZYTIME) | (sb->s_flags & SB_LAZYTIME); + + sbi->umount_lock_holder = NULL; + return 0; +restore_checkpoint: + if (need_enable_checkpoint) { + if (f2fs_enable_checkpoint(sbi)) + f2fs_warn(sbi, "checkpoint has not been enabled"); + } else if (need_disable_checkpoint) { + if (f2fs_disable_checkpoint(sbi)) + f2fs_warn(sbi, "checkpoint has not been disabled"); + } +restore_discard: + if (need_restart_discard) { + if (f2fs_start_discard_thread(sbi)) + f2fs_warn(sbi, "discard has been stopped"); + } else if (need_stop_discard) { + f2fs_stop_discard_thread(sbi); + } +restore_flush: + if (need_restart_flush) { + if (f2fs_create_flush_cmd_control(sbi)) + f2fs_warn(sbi, "background flush thread has stopped"); + } else if (need_stop_flush) { + clear_opt(sbi, FLUSH_MERGE); + f2fs_destroy_flush_cmd_control(sbi, false); + } +restore_gc: + if (need_restart_gc) { + if (f2fs_start_gc_thread(sbi)) + f2fs_warn(sbi, "background gc thread has stopped"); + } else if (need_stop_gc) { + f2fs_stop_gc_thread(sbi); + } restore_opts: +#ifdef CONFIG_QUOTA + F2FS_OPTION(sbi).s_jquota_fmt = org_mount_opt.s_jquota_fmt; + for (i 
= 0; i < MAXQUOTAS; i++) { + kfree(F2FS_OPTION(sbi).s_qf_names[i]); + F2FS_OPTION(sbi).s_qf_names[i] = org_mount_opt.s_qf_names[i]; + } +#endif sbi->mount_opt = org_mount_opt; - sbi->active_logs = active_logs; + sb->s_flags = old_sb_flags; + + sbi->umount_lock_holder = NULL; + return err; +} + +static void f2fs_shutdown(struct super_block *sb) +{ + f2fs_do_shutdown(F2FS_SB(sb), F2FS_GOING_DOWN_NOSYNC, false, false); +} + +#ifdef CONFIG_QUOTA +static bool f2fs_need_recovery(struct f2fs_sb_info *sbi) +{ + /* need to recovery orphan */ + if (is_set_ckpt_flags(sbi, CP_ORPHAN_PRESENT_FLAG)) + return true; + /* need to recovery data */ + if (test_opt(sbi, DISABLE_ROLL_FORWARD)) + return false; + if (test_opt(sbi, NORECOVERY)) + return false; + return !is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG); +} + +static bool f2fs_recover_quota_begin(struct f2fs_sb_info *sbi) +{ + bool readonly = f2fs_readonly(sbi->sb); + + if (!f2fs_need_recovery(sbi)) + return false; + + /* it doesn't need to check f2fs_sb_has_readonly() */ + if (f2fs_hw_is_readonly(sbi)) + return false; + + if (readonly) { + sbi->sb->s_flags &= ~SB_RDONLY; + set_sbi_flag(sbi, SBI_IS_WRITABLE); + } + + /* + * Turn on quotas which were not enabled for read-only mounts if + * filesystem has quota feature, so that they are updated correctly. 
+ */ + return f2fs_enable_quota_files(sbi, readonly); +} + +static void f2fs_recover_quota_end(struct f2fs_sb_info *sbi, + bool quota_enabled) +{ + if (quota_enabled) + f2fs_quota_off_umount(sbi->sb); + + if (is_sbi_flag_set(sbi, SBI_IS_WRITABLE)) { + clear_sbi_flag(sbi, SBI_IS_WRITABLE); + sbi->sb->s_flags |= SB_RDONLY; + } +} + +/* Read data from quotafile */ +static ssize_t f2fs_quota_read(struct super_block *sb, int type, char *data, + size_t len, loff_t off) +{ + struct inode *inode = sb_dqopt(sb)->files[type]; + struct address_space *mapping = inode->i_mapping; + int tocopy; + size_t toread; + loff_t i_size = i_size_read(inode); + + if (off > i_size) + return 0; + + if (off + len > i_size) + len = i_size - off; + toread = len; + while (toread > 0) { + struct folio *folio; + size_t offset; + +repeat: + folio = mapping_read_folio_gfp(mapping, off >> PAGE_SHIFT, + GFP_NOFS); + if (IS_ERR(folio)) { + if (PTR_ERR(folio) == -ENOMEM) { + memalloc_retry_wait(GFP_NOFS); + goto repeat; + } + set_sbi_flag(F2FS_SB(sb), SBI_QUOTA_NEED_REPAIR); + return PTR_ERR(folio); + } + offset = offset_in_folio(folio, off); + tocopy = min(folio_size(folio) - offset, toread); + + folio_lock(folio); + + if (unlikely(folio->mapping != mapping)) { + f2fs_folio_put(folio, true); + goto repeat; + } + + /* + * should never happen, just leave f2fs_bug_on() here to catch + * any potential bug. 
+ */ + f2fs_bug_on(F2FS_SB(sb), !folio_test_uptodate(folio)); + + memcpy_from_folio(data, folio, offset, tocopy); + f2fs_folio_put(folio, true); + + toread -= tocopy; + data += tocopy; + off += tocopy; + } + return len; +} + +/* Write to quotafile */ +static ssize_t f2fs_quota_write(struct super_block *sb, int type, + const char *data, size_t len, loff_t off) +{ + struct inode *inode = sb_dqopt(sb)->files[type]; + struct address_space *mapping = inode->i_mapping; + const struct address_space_operations *a_ops = mapping->a_ops; + int offset = off & (sb->s_blocksize - 1); + size_t towrite = len; + struct folio *folio; + void *fsdata = NULL; + int err = 0; + int tocopy; + + while (towrite > 0) { + tocopy = min_t(unsigned long, sb->s_blocksize - offset, + towrite); +retry: + err = a_ops->write_begin(NULL, mapping, off, tocopy, + &folio, &fsdata); + if (unlikely(err)) { + if (err == -ENOMEM) { + memalloc_retry_wait(GFP_NOFS); + goto retry; + } + set_sbi_flag(F2FS_SB(sb), SBI_QUOTA_NEED_REPAIR); + break; + } + + memcpy_to_folio(folio, offset_in_folio(folio, off), data, tocopy); + + a_ops->write_end(NULL, mapping, off, tocopy, tocopy, + folio, fsdata); + offset = 0; + towrite -= tocopy; + off += tocopy; + data += tocopy; + cond_resched(); + } + + if (len == towrite) + return err; + inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); + f2fs_mark_inode_dirty_sync(inode, false); + return len - towrite; +} + +int f2fs_dquot_initialize(struct inode *inode) +{ + if (time_to_inject(F2FS_I_SB(inode), FAULT_DQUOT_INIT)) + return -ESRCH; + + return dquot_initialize(inode); +} + +static struct dquot __rcu **f2fs_get_dquots(struct inode *inode) +{ + return F2FS_I(inode)->i_dquot; +} + +static qsize_t *f2fs_get_reserved_space(struct inode *inode) +{ + return &F2FS_I(inode)->i_reserved_quota; +} + +static int f2fs_quota_on_mount(struct f2fs_sb_info *sbi, int type) +{ + if (is_set_ckpt_flags(sbi, CP_QUOTA_NEED_FSCK_FLAG)) { + f2fs_err(sbi, "quota sysfile may be corrupted, skip 
loading it"); + return 0; + } + + return dquot_quota_on_mount(sbi->sb, F2FS_OPTION(sbi).s_qf_names[type], + F2FS_OPTION(sbi).s_jquota_fmt, type); +} + +int f2fs_enable_quota_files(struct f2fs_sb_info *sbi, bool rdonly) +{ + int enabled = 0; + int i, err; + + if (f2fs_sb_has_quota_ino(sbi) && rdonly) { + err = f2fs_enable_quotas(sbi->sb); + if (err) { + f2fs_err(sbi, "Cannot turn on quota_ino: %d", err); + return 0; + } + return 1; + } + + for (i = 0; i < MAXQUOTAS; i++) { + if (F2FS_OPTION(sbi).s_qf_names[i]) { + err = f2fs_quota_on_mount(sbi, i); + if (!err) { + enabled = 1; + continue; + } + f2fs_err(sbi, "Cannot turn on quotas: %d on %d", + err, i); + } + } + return enabled; +} + +static int f2fs_quota_enable(struct super_block *sb, int type, int format_id, + unsigned int flags) +{ + struct inode *qf_inode; + unsigned long qf_inum; + unsigned long qf_flag = F2FS_QUOTA_DEFAULT_FL; + int err; + + BUG_ON(!f2fs_sb_has_quota_ino(F2FS_SB(sb))); + + qf_inum = f2fs_qf_ino(sb, type); + if (!qf_inum) + return -EPERM; + + qf_inode = f2fs_iget(sb, qf_inum); + if (IS_ERR(qf_inode)) { + f2fs_err(F2FS_SB(sb), "Bad quota inode %u:%lu", type, qf_inum); + return PTR_ERR(qf_inode); + } + + /* Don't account quota for quota files to avoid recursion */ + inode_lock(qf_inode); + qf_inode->i_flags |= S_NOQUOTA; + + if ((F2FS_I(qf_inode)->i_flags & qf_flag) != qf_flag) { + F2FS_I(qf_inode)->i_flags |= qf_flag; + f2fs_set_inode_flags(qf_inode); + } + inode_unlock(qf_inode); + + err = dquot_load_quota_inode(qf_inode, type, format_id, flags); + iput(qf_inode); + return err; +} + +static int f2fs_enable_quotas(struct super_block *sb) +{ + struct f2fs_sb_info *sbi = F2FS_SB(sb); + int type, err = 0; + unsigned long qf_inum; + bool quota_mopt[MAXQUOTAS] = { + test_opt(sbi, USRQUOTA), + test_opt(sbi, GRPQUOTA), + test_opt(sbi, PRJQUOTA), + }; + + if (is_set_ckpt_flags(F2FS_SB(sb), CP_QUOTA_NEED_FSCK_FLAG)) { + f2fs_err(sbi, "quota file may be corrupted, skip loading it"); + return 0; + } + + 
sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE; + + for (type = 0; type < MAXQUOTAS; type++) { + qf_inum = f2fs_qf_ino(sb, type); + if (qf_inum) { + err = f2fs_quota_enable(sb, type, QFMT_VFS_V1, + DQUOT_USAGE_ENABLED | + (quota_mopt[type] ? DQUOT_LIMITS_ENABLED : 0)); + if (err) { + f2fs_err(sbi, "Failed to enable quota tracking (type=%d, err=%d). Please run fsck to fix.", + type, err); + for (type--; type >= 0; type--) + dquot_quota_off(sb, type); + set_sbi_flag(F2FS_SB(sb), + SBI_QUOTA_NEED_REPAIR); + return err; + } + } + } + return 0; +} + +static int f2fs_quota_sync_file(struct f2fs_sb_info *sbi, int type) +{ + struct quota_info *dqopt = sb_dqopt(sbi->sb); + struct address_space *mapping = dqopt->files[type]->i_mapping; + int ret = 0; + + ret = dquot_writeback_dquots(sbi->sb, type); + if (ret) + goto out; + + ret = filemap_fdatawrite(mapping); + if (ret) + goto out; + + /* if we are using journalled quota */ + if (is_journalled_quota(sbi)) + goto out; + + ret = filemap_fdatawait(mapping); + + truncate_inode_pages(&dqopt->files[type]->i_data, 0); +out: + if (ret) + set_sbi_flag(sbi, SBI_QUOTA_NEED_REPAIR); + return ret; +} + +int f2fs_do_quota_sync(struct super_block *sb, int type) +{ + struct f2fs_sb_info *sbi = F2FS_SB(sb); + struct quota_info *dqopt = sb_dqopt(sb); + int cnt; + int ret = 0; + + /* + * Now when everything is written we can discard the pagecache so + * that userspace sees the changes. 
+ */ + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + + if (type != -1 && cnt != type) + continue; + + if (!sb_has_quota_active(sb, cnt)) + continue; + + if (!f2fs_sb_has_quota_ino(sbi)) + inode_lock(dqopt->files[cnt]); + + /* + * do_quotactl + * f2fs_quota_sync + * f2fs_down_read(quota_sem) + * dquot_writeback_dquots() + * f2fs_dquot_commit + * block_operation + * f2fs_down_read(quota_sem) + */ + f2fs_lock_op(sbi); + f2fs_down_read(&sbi->quota_sem); + + ret = f2fs_quota_sync_file(sbi, cnt); + + f2fs_up_read(&sbi->quota_sem); + f2fs_unlock_op(sbi); + + if (!f2fs_sb_has_quota_ino(sbi)) + inode_unlock(dqopt->files[cnt]); + + if (ret) + break; + } + return ret; +} + +static int f2fs_quota_sync(struct super_block *sb, int type) +{ + int ret; + + F2FS_SB(sb)->umount_lock_holder = current; + ret = f2fs_do_quota_sync(sb, type); + F2FS_SB(sb)->umount_lock_holder = NULL; + return ret; +} + +static int f2fs_quota_on(struct super_block *sb, int type, int format_id, + const struct path *path) +{ + struct inode *inode; + int err = 0; + + /* if quota sysfile exists, deny enabling quota with specific file */ + if (f2fs_sb_has_quota_ino(F2FS_SB(sb))) { + f2fs_err(F2FS_SB(sb), "quota sysfile already exists"); + return -EBUSY; + } + + if (path->dentry->d_sb != sb) + return -EXDEV; + + F2FS_SB(sb)->umount_lock_holder = current; + + err = f2fs_do_quota_sync(sb, type); + if (err) + goto out; + + inode = d_inode(path->dentry); + + err = filemap_fdatawrite(inode->i_mapping); + if (err) + goto out; + + err = filemap_fdatawait(inode->i_mapping); + if (err) + goto out; + + err = dquot_quota_on(sb, type, format_id, path); + if (err) + goto out; + + inode_lock(inode); + F2FS_I(inode)->i_flags |= F2FS_QUOTA_DEFAULT_FL; + f2fs_set_inode_flags(inode); + inode_unlock(inode); + f2fs_mark_inode_dirty_sync(inode, false); +out: + F2FS_SB(sb)->umount_lock_holder = NULL; + return err; +} + +static int __f2fs_quota_off(struct super_block *sb, int type) +{ + struct inode *inode = sb_dqopt(sb)->files[type]; + 
int err; + + if (!inode || !igrab(inode)) + return dquot_quota_off(sb, type); + + err = f2fs_do_quota_sync(sb, type); + if (err) + goto out_put; + + err = dquot_quota_off(sb, type); + if (err || f2fs_sb_has_quota_ino(F2FS_SB(sb))) + goto out_put; + + inode_lock(inode); + F2FS_I(inode)->i_flags &= ~F2FS_QUOTA_DEFAULT_FL; + f2fs_set_inode_flags(inode); + inode_unlock(inode); + f2fs_mark_inode_dirty_sync(inode, false); +out_put: + iput(inode); + return err; +} + +static int f2fs_quota_off(struct super_block *sb, int type) +{ + struct f2fs_sb_info *sbi = F2FS_SB(sb); + int err; + + F2FS_SB(sb)->umount_lock_holder = current; + + err = __f2fs_quota_off(sb, type); + + /* + * quotactl can shutdown journalled quota, result in inconsistence + * between quota record and fs data by following updates, tag the + * flag to let fsck be aware of it. + */ + if (is_journalled_quota(sbi)) + set_sbi_flag(sbi, SBI_QUOTA_NEED_REPAIR); + + F2FS_SB(sb)->umount_lock_holder = NULL; + return err; } -static struct super_operations f2fs_sops = { +void f2fs_quota_off_umount(struct super_block *sb) +{ + int type; + int err; + + for (type = 0; type < MAXQUOTAS; type++) { + err = __f2fs_quota_off(sb, type); + if (err) { + int ret = dquot_quota_off(sb, type); + + f2fs_err(F2FS_SB(sb), "Fail to turn off disk quota (type: %d, err: %d, ret:%d), Please run fsck to fix it.", + type, err, ret); + set_sbi_flag(F2FS_SB(sb), SBI_QUOTA_NEED_REPAIR); + } + } + /* + * In case of checkpoint=disable, we must flush quota blocks. + * This can cause NULL exception for node_inode in end_io, since + * put_super already dropped it. 
+ */ + sync_filesystem(sb); +} + +static void f2fs_truncate_quota_inode_pages(struct super_block *sb) +{ + struct quota_info *dqopt = sb_dqopt(sb); + int type; + + for (type = 0; type < MAXQUOTAS; type++) { + if (!dqopt->files[type]) + continue; + f2fs_inode_synced(dqopt->files[type]); + } +} + +static int f2fs_dquot_commit(struct dquot *dquot) +{ + struct f2fs_sb_info *sbi = F2FS_SB(dquot->dq_sb); + int ret; + + f2fs_down_read_nested(&sbi->quota_sem, SINGLE_DEPTH_NESTING); + ret = dquot_commit(dquot); + if (ret < 0) + set_sbi_flag(sbi, SBI_QUOTA_NEED_REPAIR); + f2fs_up_read(&sbi->quota_sem); + return ret; +} + +static int f2fs_dquot_acquire(struct dquot *dquot) +{ + struct f2fs_sb_info *sbi = F2FS_SB(dquot->dq_sb); + int ret; + + f2fs_down_read(&sbi->quota_sem); + ret = dquot_acquire(dquot); + if (ret < 0) + set_sbi_flag(sbi, SBI_QUOTA_NEED_REPAIR); + f2fs_up_read(&sbi->quota_sem); + return ret; +} + +static int f2fs_dquot_release(struct dquot *dquot) +{ + struct f2fs_sb_info *sbi = F2FS_SB(dquot->dq_sb); + int ret = dquot_release(dquot); + + if (ret < 0) + set_sbi_flag(sbi, SBI_QUOTA_NEED_REPAIR); + return ret; +} + +static int f2fs_dquot_mark_dquot_dirty(struct dquot *dquot) +{ + struct super_block *sb = dquot->dq_sb; + struct f2fs_sb_info *sbi = F2FS_SB(sb); + int ret = dquot_mark_dquot_dirty(dquot); + + /* if we are using journalled quota */ + if (is_journalled_quota(sbi)) + set_sbi_flag(sbi, SBI_QUOTA_NEED_FLUSH); + + return ret; +} + +static int f2fs_dquot_commit_info(struct super_block *sb, int type) +{ + struct f2fs_sb_info *sbi = F2FS_SB(sb); + int ret = dquot_commit_info(sb, type); + + if (ret < 0) + set_sbi_flag(sbi, SBI_QUOTA_NEED_REPAIR); + return ret; +} + +static int f2fs_get_projid(struct inode *inode, kprojid_t *projid) +{ + *projid = F2FS_I(inode)->i_projid; + return 0; +} + +static const struct dquot_operations f2fs_quota_operations = { + .get_reserved_space = f2fs_get_reserved_space, + .write_dquot = f2fs_dquot_commit, + .acquire_dquot = 
f2fs_dquot_acquire, + .release_dquot = f2fs_dquot_release, + .mark_dirty = f2fs_dquot_mark_dquot_dirty, + .write_info = f2fs_dquot_commit_info, + .alloc_dquot = dquot_alloc, + .destroy_dquot = dquot_destroy, + .get_projid = f2fs_get_projid, + .get_next_id = dquot_get_next_id, +}; + +static const struct quotactl_ops f2fs_quotactl_ops = { + .quota_on = f2fs_quota_on, + .quota_off = f2fs_quota_off, + .quota_sync = f2fs_quota_sync, + .get_state = dquot_get_state, + .set_info = dquot_set_dqinfo, + .get_dqblk = dquot_get_dqblk, + .set_dqblk = dquot_set_dqblk, + .get_nextdqblk = dquot_get_next_dqblk, +}; +#else +int f2fs_dquot_initialize(struct inode *inode) +{ + return 0; +} + +int f2fs_do_quota_sync(struct super_block *sb, int type) +{ + return 0; +} + +void f2fs_quota_off_umount(struct super_block *sb) +{ +} +#endif + +static const struct super_operations f2fs_sops = { .alloc_inode = f2fs_alloc_inode, + .free_inode = f2fs_free_inode, .drop_inode = f2fs_drop_inode, - .destroy_inode = f2fs_destroy_inode, .write_inode = f2fs_write_inode, .dirty_inode = f2fs_dirty_inode, .show_options = f2fs_show_options, +#ifdef CONFIG_QUOTA + .quota_read = f2fs_quota_read, + .quota_write = f2fs_quota_write, + .get_dquots = f2fs_get_dquots, +#endif .evict_inode = f2fs_evict_inode, .put_super = f2fs_put_super, .sync_fs = f2fs_sync_fs, .freeze_fs = f2fs_freeze, .unfreeze_fs = f2fs_unfreeze, .statfs = f2fs_statfs, - .remount_fs = f2fs_remount, + .shutdown = f2fs_shutdown, }; +#ifdef CONFIG_FS_ENCRYPTION +static int f2fs_get_context(struct inode *inode, void *ctx, size_t len) +{ + return f2fs_getxattr(inode, F2FS_XATTR_INDEX_ENCRYPTION, + F2FS_XATTR_NAME_ENCRYPTION_CONTEXT, + ctx, len, NULL); +} + +static int f2fs_set_context(struct inode *inode, const void *ctx, size_t len, + void *fs_data) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + + /* + * Encrypting the root directory is not allowed because fsck + * expects lost+found directory to exist and remain unencrypted + * if LOST_FOUND 
feature is enabled. + * + */ + if (f2fs_sb_has_lost_found(sbi) && + inode->i_ino == F2FS_ROOT_INO(sbi)) + return -EPERM; + + return f2fs_setxattr(inode, F2FS_XATTR_INDEX_ENCRYPTION, + F2FS_XATTR_NAME_ENCRYPTION_CONTEXT, + ctx, len, fs_data, XATTR_CREATE); +} + +static const union fscrypt_policy *f2fs_get_dummy_policy(struct super_block *sb) +{ + return F2FS_OPTION(F2FS_SB(sb)).dummy_enc_policy.policy; +} + +static bool f2fs_has_stable_inodes(struct super_block *sb) +{ + return true; +} + +static struct block_device **f2fs_get_devices(struct super_block *sb, + unsigned int *num_devs) +{ + struct f2fs_sb_info *sbi = F2FS_SB(sb); + struct block_device **devs; + int i; + + if (!f2fs_is_multi_device(sbi)) + return NULL; + + devs = kmalloc_array(sbi->s_ndevs, sizeof(*devs), GFP_KERNEL); + if (!devs) + return ERR_PTR(-ENOMEM); + + for (i = 0; i < sbi->s_ndevs; i++) + devs[i] = FDEV(i).bdev; + *num_devs = sbi->s_ndevs; + return devs; +} + +static const struct fscrypt_operations f2fs_cryptops = { + .inode_info_offs = (int)offsetof(struct f2fs_inode_info, i_crypt_info) - + (int)offsetof(struct f2fs_inode_info, vfs_inode), + .needs_bounce_pages = 1, + .has_32bit_inodes = 1, + .supports_subblock_data_units = 1, + .legacy_key_prefix = "f2fs:", + .get_context = f2fs_get_context, + .set_context = f2fs_set_context, + .get_dummy_policy = f2fs_get_dummy_policy, + .empty_dir = f2fs_empty_dir, + .has_stable_inodes = f2fs_has_stable_inodes, + .get_devices = f2fs_get_devices, +}; +#endif /* CONFIG_FS_ENCRYPTION */ + static struct inode *f2fs_nfs_get_inode(struct super_block *sb, u64 ino, u32 generation) { struct f2fs_sb_info *sbi = F2FS_SB(sb); struct inode *inode; - if (ino < F2FS_ROOT_INO(sbi)) + if (f2fs_check_nid_range(sbi, ino)) return ERR_PTR(-ESTALE); /* @@ -425,7 +3723,7 @@ static struct inode *f2fs_nfs_get_inode(struct super_block *sb, inode = f2fs_iget(sb, ino); if (IS_ERR(inode)) return ERR_CAST(inode); - if (generation && inode->i_generation != generation) { + if 
(unlikely(generation && inode->i_generation != generation)) { /* we didn't find the right inode.. */ iput(inode); return ERR_PTR(-ESTALE); @@ -448,15 +3746,28 @@ static struct dentry *f2fs_fh_to_parent(struct super_block *sb, struct fid *fid, } static const struct export_operations f2fs_export_ops = { + .encode_fh = generic_encode_ino32_fh, .fh_to_dentry = f2fs_fh_to_dentry, .fh_to_parent = f2fs_fh_to_parent, .get_parent = f2fs_get_parent, }; -static loff_t max_file_size(unsigned bits) +loff_t max_file_blocks(struct inode *inode) { - loff_t result = ADDRS_PER_INODE; - loff_t leaf_count = ADDRS_PER_BLOCK; + loff_t result = 0; + loff_t leaf_count; + + /* + * note: previously, result is equal to (DEF_ADDRS_PER_INODE - + * DEFAULT_INLINE_XATTR_ADDRS), but now f2fs try to reserve more + * space in inode.i_addr, it will be more safe to reassign + * result as zero. + */ + + if (inode && f2fs_compressed_file(inode)) + leaf_count = ADDRS_PER_BLOCK(inode); + else + leaf_count = DEF_ADDRS_PER_BLOCK; /* two direct node blocks */ result += (leaf_count * 2); @@ -469,70 +3780,489 @@ static loff_t max_file_size(unsigned bits) leaf_count *= NIDS_PER_BLOCK; result += leaf_count; - result <<= bits; + /* + * For compatibility with FSCRYPT_POLICY_FLAG_IV_INO_LBLK_{64,32} with + * a 4K crypto data unit, we must restrict the max filesize to what can + * fit within U32_MAX + 1 data units. 
+ */ + + result = umin(result, F2FS_BYTES_TO_BLK(((loff_t)U32_MAX + 1) * 4096)); + return result; } -static int sanity_check_raw_super(struct super_block *sb, - struct f2fs_super_block *raw_super) +static int __f2fs_commit_super(struct f2fs_sb_info *sbi, struct folio *folio, + pgoff_t index, bool update) { - unsigned int blocksize; + struct bio *bio; + /* it's rare case, we can do fua all the time */ + blk_opf_t opf = REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH | REQ_FUA; + int ret; - if (F2FS_SUPER_MAGIC != le32_to_cpu(raw_super->magic)) { - f2fs_msg(sb, KERN_INFO, - "Magic Mismatch, valid(0x%x) - read(0x%x)", - F2FS_SUPER_MAGIC, le32_to_cpu(raw_super->magic)); - return 1; + folio_lock(folio); + folio_wait_writeback(folio); + if (update) + memcpy(F2FS_SUPER_BLOCK(folio, index), F2FS_RAW_SUPER(sbi), + sizeof(struct f2fs_super_block)); + folio_mark_dirty(folio); + folio_clear_dirty_for_io(folio); + folio_start_writeback(folio); + folio_unlock(folio); + + bio = bio_alloc(sbi->sb->s_bdev, 1, opf, GFP_NOFS); + + /* it doesn't need to set crypto context for superblock update */ + bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(folio->index); + + if (!bio_add_folio(bio, folio, folio_size(folio), 0)) + f2fs_bug_on(sbi, 1); + + ret = submit_bio_wait(bio); + bio_put(bio); + folio_end_writeback(folio); + + return ret; +} + +static inline bool sanity_check_area_boundary(struct f2fs_sb_info *sbi, + struct folio *folio, pgoff_t index) +{ + struct f2fs_super_block *raw_super = F2FS_SUPER_BLOCK(folio, index); + struct super_block *sb = sbi->sb; + u32 segment0_blkaddr = le32_to_cpu(raw_super->segment0_blkaddr); + u32 cp_blkaddr = le32_to_cpu(raw_super->cp_blkaddr); + u32 sit_blkaddr = le32_to_cpu(raw_super->sit_blkaddr); + u32 nat_blkaddr = le32_to_cpu(raw_super->nat_blkaddr); + u32 ssa_blkaddr = le32_to_cpu(raw_super->ssa_blkaddr); + u32 main_blkaddr = le32_to_cpu(raw_super->main_blkaddr); + u32 segment_count_ckpt = le32_to_cpu(raw_super->segment_count_ckpt); + u32 segment_count_sit = 
le32_to_cpu(raw_super->segment_count_sit); + u32 segment_count_nat = le32_to_cpu(raw_super->segment_count_nat); + u32 segment_count_ssa = le32_to_cpu(raw_super->segment_count_ssa); + u32 segment_count_main = le32_to_cpu(raw_super->segment_count_main); + u32 segment_count = le32_to_cpu(raw_super->segment_count); + u32 log_blocks_per_seg = le32_to_cpu(raw_super->log_blocks_per_seg); + u64 main_end_blkaddr = main_blkaddr + + ((u64)segment_count_main << log_blocks_per_seg); + u64 seg_end_blkaddr = segment0_blkaddr + + ((u64)segment_count << log_blocks_per_seg); + + if (segment0_blkaddr != cp_blkaddr) { + f2fs_info(sbi, "Mismatch start address, segment0(%u) cp_blkaddr(%u)", + segment0_blkaddr, cp_blkaddr); + return true; } - /* Currently, support only 4KB page cache size */ - if (F2FS_BLKSIZE != PAGE_CACHE_SIZE) { - f2fs_msg(sb, KERN_INFO, - "Invalid page_cache_size (%lu), supports only 4KB\n", - PAGE_CACHE_SIZE); - return 1; + if (cp_blkaddr + (segment_count_ckpt << log_blocks_per_seg) != + sit_blkaddr) { + f2fs_info(sbi, "Wrong CP boundary, start(%u) end(%u) blocks(%u)", + cp_blkaddr, sit_blkaddr, + segment_count_ckpt << log_blocks_per_seg); + return true; } - /* Currently, support only 4KB block size */ - blocksize = 1 << le32_to_cpu(raw_super->log_blocksize); - if (blocksize != F2FS_BLKSIZE) { - f2fs_msg(sb, KERN_INFO, - "Invalid blocksize (%u), supports only 4KB\n", - blocksize); - return 1; + if (sit_blkaddr + (segment_count_sit << log_blocks_per_seg) != + nat_blkaddr) { + f2fs_info(sbi, "Wrong SIT boundary, start(%u) end(%u) blocks(%u)", + sit_blkaddr, nat_blkaddr, + segment_count_sit << log_blocks_per_seg); + return true; } - if (le32_to_cpu(raw_super->log_sectorsize) != - F2FS_LOG_SECTOR_SIZE) { - f2fs_msg(sb, KERN_INFO, "Invalid log sectorsize"); - return 1; + if (nat_blkaddr + (segment_count_nat << log_blocks_per_seg) != + ssa_blkaddr) { + f2fs_info(sbi, "Wrong NAT boundary, start(%u) end(%u) blocks(%u)", + nat_blkaddr, ssa_blkaddr, + segment_count_nat << 
log_blocks_per_seg); + return true; } - if (le32_to_cpu(raw_super->log_sectors_per_block) != - F2FS_LOG_SECTORS_PER_BLOCK) { - f2fs_msg(sb, KERN_INFO, "Invalid log sectors per block"); - return 1; + + if (ssa_blkaddr + (segment_count_ssa << log_blocks_per_seg) != + main_blkaddr) { + f2fs_info(sbi, "Wrong SSA boundary, start(%u) end(%u) blocks(%u)", + ssa_blkaddr, main_blkaddr, + segment_count_ssa << log_blocks_per_seg); + return true; + } + + if (main_end_blkaddr > seg_end_blkaddr) { + f2fs_info(sbi, "Wrong MAIN_AREA boundary, start(%u) end(%llu) block(%u)", + main_blkaddr, seg_end_blkaddr, + segment_count_main << log_blocks_per_seg); + return true; + } else if (main_end_blkaddr < seg_end_blkaddr) { + int err = 0; + char *res; + + /* fix in-memory information all the time */ + raw_super->segment_count = cpu_to_le32((main_end_blkaddr - + segment0_blkaddr) >> log_blocks_per_seg); + + if (f2fs_readonly(sb) || f2fs_hw_is_readonly(sbi)) { + set_sbi_flag(sbi, SBI_NEED_SB_WRITE); + res = "internally"; + } else { + err = __f2fs_commit_super(sbi, folio, index, false); + res = err ? 
"failed" : "done"; + } + f2fs_info(sbi, "Fix alignment : %s, start(%u) end(%llu) block(%u)", + res, main_blkaddr, seg_end_blkaddr, + segment_count_main << log_blocks_per_seg); + if (err) + return true; + } + return false; +} + +static int sanity_check_raw_super(struct f2fs_sb_info *sbi, + struct folio *folio, pgoff_t index) +{ + block_t segment_count, segs_per_sec, secs_per_zone, segment_count_main; + block_t total_sections, blocks_per_seg; + struct f2fs_super_block *raw_super = F2FS_SUPER_BLOCK(folio, index); + size_t crc_offset = 0; + __u32 crc = 0; + + if (le32_to_cpu(raw_super->magic) != F2FS_SUPER_MAGIC) { + f2fs_info(sbi, "Magic Mismatch, valid(0x%x) - read(0x%x)", + F2FS_SUPER_MAGIC, le32_to_cpu(raw_super->magic)); + return -EINVAL; + } + + /* Check checksum_offset and crc in superblock */ + if (__F2FS_HAS_FEATURE(raw_super, F2FS_FEATURE_SB_CHKSUM)) { + crc_offset = le32_to_cpu(raw_super->checksum_offset); + if (crc_offset != + offsetof(struct f2fs_super_block, crc)) { + f2fs_info(sbi, "Invalid SB checksum offset: %zu", + crc_offset); + return -EFSCORRUPTED; + } + crc = le32_to_cpu(raw_super->crc); + if (crc != f2fs_crc32(raw_super, crc_offset)) { + f2fs_info(sbi, "Invalid SB checksum value: %u", crc); + return -EFSCORRUPTED; + } + } + + /* only support block_size equals to PAGE_SIZE */ + if (le32_to_cpu(raw_super->log_blocksize) != F2FS_BLKSIZE_BITS) { + f2fs_info(sbi, "Invalid log_blocksize (%u), supports only %u", + le32_to_cpu(raw_super->log_blocksize), + F2FS_BLKSIZE_BITS); + return -EFSCORRUPTED; + } + + /* check log blocks per segment */ + if (le32_to_cpu(raw_super->log_blocks_per_seg) != 9) { + f2fs_info(sbi, "Invalid log blocks per segment (%u)", + le32_to_cpu(raw_super->log_blocks_per_seg)); + return -EFSCORRUPTED; + } + + /* Currently, support 512/1024/2048/4096/16K bytes sector size */ + if (le32_to_cpu(raw_super->log_sectorsize) > + F2FS_MAX_LOG_SECTOR_SIZE || + le32_to_cpu(raw_super->log_sectorsize) < + F2FS_MIN_LOG_SECTOR_SIZE) { + 
f2fs_info(sbi, "Invalid log sectorsize (%u)", + le32_to_cpu(raw_super->log_sectorsize)); + return -EFSCORRUPTED; + } + if (le32_to_cpu(raw_super->log_sectors_per_block) + + le32_to_cpu(raw_super->log_sectorsize) != + F2FS_MAX_LOG_SECTOR_SIZE) { + f2fs_info(sbi, "Invalid log sectors per block(%u) log sectorsize(%u)", + le32_to_cpu(raw_super->log_sectors_per_block), + le32_to_cpu(raw_super->log_sectorsize)); + return -EFSCORRUPTED; + } + + segment_count = le32_to_cpu(raw_super->segment_count); + segment_count_main = le32_to_cpu(raw_super->segment_count_main); + segs_per_sec = le32_to_cpu(raw_super->segs_per_sec); + secs_per_zone = le32_to_cpu(raw_super->secs_per_zone); + total_sections = le32_to_cpu(raw_super->section_count); + + /* blocks_per_seg should be 512, given the above check */ + blocks_per_seg = BIT(le32_to_cpu(raw_super->log_blocks_per_seg)); + + if (segment_count > F2FS_MAX_SEGMENT || + segment_count < F2FS_MIN_SEGMENTS) { + f2fs_info(sbi, "Invalid segment count (%u)", segment_count); + return -EFSCORRUPTED; + } + + if (total_sections > segment_count_main || total_sections < 1 || + segs_per_sec > segment_count || !segs_per_sec) { + f2fs_info(sbi, "Invalid segment/section count (%u, %u x %u)", + segment_count, total_sections, segs_per_sec); + return -EFSCORRUPTED; + } + + if (segment_count_main != total_sections * segs_per_sec) { + f2fs_info(sbi, "Invalid segment/section count (%u != %u * %u)", + segment_count_main, total_sections, segs_per_sec); + return -EFSCORRUPTED; + } + + if ((segment_count / segs_per_sec) < total_sections) { + f2fs_info(sbi, "Small segment_count (%u < %u * %u)", + segment_count, segs_per_sec, total_sections); + return -EFSCORRUPTED; + } + + if (segment_count > (le64_to_cpu(raw_super->block_count) >> 9)) { + f2fs_info(sbi, "Wrong segment_count / block_count (%u > %llu)", + segment_count, le64_to_cpu(raw_super->block_count)); + return -EFSCORRUPTED; + } + + if (RDEV(0).path[0]) { + block_t dev_seg_count = 
le32_to_cpu(RDEV(0).total_segments); + int i = 1; + + while (i < MAX_DEVICES && RDEV(i).path[0]) { + dev_seg_count += le32_to_cpu(RDEV(i).total_segments); + i++; + } + if (segment_count != dev_seg_count) { + f2fs_info(sbi, "Segment count (%u) mismatch with total segments from devices (%u)", + segment_count, dev_seg_count); + return -EFSCORRUPTED; + } + } else { + if (__F2FS_HAS_FEATURE(raw_super, F2FS_FEATURE_BLKZONED) && + !bdev_is_zoned(sbi->sb->s_bdev)) { + f2fs_info(sbi, "Zoned block device path is missing"); + return -EFSCORRUPTED; + } + } + + if (secs_per_zone > total_sections || !secs_per_zone) { + f2fs_info(sbi, "Wrong secs_per_zone / total_sections (%u, %u)", + secs_per_zone, total_sections); + return -EFSCORRUPTED; + } + if (le32_to_cpu(raw_super->extension_count) > F2FS_MAX_EXTENSION || + raw_super->hot_ext_count > F2FS_MAX_EXTENSION || + (le32_to_cpu(raw_super->extension_count) + + raw_super->hot_ext_count) > F2FS_MAX_EXTENSION) { + f2fs_info(sbi, "Corrupted extension count (%u + %u > %u)", + le32_to_cpu(raw_super->extension_count), + raw_super->hot_ext_count, + F2FS_MAX_EXTENSION); + return -EFSCORRUPTED; + } + + if (le32_to_cpu(raw_super->cp_payload) >= + (blocks_per_seg - F2FS_CP_PACKS - + NR_CURSEG_PERSIST_TYPE)) { + f2fs_info(sbi, "Insane cp_payload (%u >= %u)", + le32_to_cpu(raw_super->cp_payload), + blocks_per_seg - F2FS_CP_PACKS - + NR_CURSEG_PERSIST_TYPE); + return -EFSCORRUPTED; + } + + /* check reserved ino info */ + if (le32_to_cpu(raw_super->node_ino) != 1 || + le32_to_cpu(raw_super->meta_ino) != 2 || + le32_to_cpu(raw_super->root_ino) != 3) { + f2fs_info(sbi, "Invalid Fs Meta Ino: node(%u) meta(%u) root(%u)", + le32_to_cpu(raw_super->node_ino), + le32_to_cpu(raw_super->meta_ino), + le32_to_cpu(raw_super->root_ino)); + return -EFSCORRUPTED; + } + + /* check CP/SIT/NAT/SSA/MAIN_AREA area boundary */ + if (sanity_check_area_boundary(sbi, folio, index)) + return -EFSCORRUPTED; + + /* + * Check for legacy summary layout on 16KB+ block devices. 
+ * Modern f2fs-tools packs multiple 4KB summary areas into one block, + * whereas legacy versions used one block per summary, leading + * to a much larger SSA. + */ + if (SUMS_PER_BLOCK > 1 && + !(__F2FS_HAS_FEATURE(raw_super, F2FS_FEATURE_PACKED_SSA))) { + f2fs_info(sbi, "Error: Device formatted with a legacy version. " + "Please reformat with a tool supporting the packed ssa " + "feature for block sizes larger than 4kb."); + return -EOPNOTSUPP; } + return 0; } -static int sanity_check_ckpt(struct f2fs_sb_info *sbi) +int f2fs_sanity_check_ckpt(struct f2fs_sb_info *sbi) { unsigned int total, fsmeta; struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); + unsigned int ovp_segments, reserved_segments; + unsigned int main_segs, blocks_per_seg; + unsigned int sit_segs, nat_segs; + unsigned int sit_bitmap_size, nat_bitmap_size; + unsigned int log_blocks_per_seg; + unsigned int segment_count_main; + unsigned int cp_pack_start_sum, cp_payload; + block_t user_block_count, valid_user_blocks; + block_t avail_node_count, valid_node_count; + unsigned int nat_blocks, nat_bits_bytes, nat_bits_blocks; + unsigned int sit_blk_cnt; + int i, j; total = le32_to_cpu(raw_super->segment_count); fsmeta = le32_to_cpu(raw_super->segment_count_ckpt); - fsmeta += le32_to_cpu(raw_super->segment_count_sit); - fsmeta += le32_to_cpu(raw_super->segment_count_nat); + sit_segs = le32_to_cpu(raw_super->segment_count_sit); + fsmeta += sit_segs; + nat_segs = le32_to_cpu(raw_super->segment_count_nat); + fsmeta += nat_segs; fsmeta += le32_to_cpu(ckpt->rsvd_segment_count); fsmeta += le32_to_cpu(raw_super->segment_count_ssa); - if (fsmeta >= total) + if (unlikely(fsmeta >= total)) + return 1; + + ovp_segments = le32_to_cpu(ckpt->overprov_segment_count); + reserved_segments = le32_to_cpu(ckpt->rsvd_segment_count); + + if (!f2fs_sb_has_readonly(sbi) && + unlikely(fsmeta < F2FS_MIN_META_SEGMENTS || + ovp_segments == 0 || reserved_segments == 0)) { + 
f2fs_err(sbi, "Wrong layout: check mkfs.f2fs version"); + return 1; + } + user_block_count = le64_to_cpu(ckpt->user_block_count); + segment_count_main = le32_to_cpu(raw_super->segment_count_main) + + (f2fs_sb_has_readonly(sbi) ? 1 : 0); + log_blocks_per_seg = le32_to_cpu(raw_super->log_blocks_per_seg); + if (!user_block_count || user_block_count >= + segment_count_main << log_blocks_per_seg) { + f2fs_err(sbi, "Wrong user_block_count: %u", + user_block_count); + return 1; + } + + valid_user_blocks = le64_to_cpu(ckpt->valid_block_count); + if (valid_user_blocks > user_block_count) { + f2fs_err(sbi, "Wrong valid_user_blocks: %u, user_block_count: %u", + valid_user_blocks, user_block_count); + return 1; + } + + valid_node_count = le32_to_cpu(ckpt->valid_node_count); + avail_node_count = sbi->total_node_count - F2FS_RESERVED_NODE_NUM; + if (valid_node_count > avail_node_count) { + f2fs_err(sbi, "Wrong valid_node_count: %u, avail_node_count: %u", + valid_node_count, avail_node_count); + return 1; + } + + main_segs = le32_to_cpu(raw_super->segment_count_main); + blocks_per_seg = BLKS_PER_SEG(sbi); + + for (i = 0; i < NR_CURSEG_NODE_TYPE; i++) { + if (le32_to_cpu(ckpt->cur_node_segno[i]) >= main_segs || + le16_to_cpu(ckpt->cur_node_blkoff[i]) >= blocks_per_seg) + return 1; + + if (f2fs_sb_has_readonly(sbi)) + goto check_data; + + for (j = i + 1; j < NR_CURSEG_NODE_TYPE; j++) { + if (le32_to_cpu(ckpt->cur_node_segno[i]) == + le32_to_cpu(ckpt->cur_node_segno[j])) { + f2fs_err(sbi, "Node segment (%u, %u) has the same segno: %u", + i, j, + le32_to_cpu(ckpt->cur_node_segno[i])); + return 1; + } + } + } +check_data: + for (i = 0; i < NR_CURSEG_DATA_TYPE; i++) { + if (le32_to_cpu(ckpt->cur_data_segno[i]) >= main_segs || + le16_to_cpu(ckpt->cur_data_blkoff[i]) >= blocks_per_seg) + return 1; + + if (f2fs_sb_has_readonly(sbi)) + goto skip_cross; + + for (j = i + 1; j < NR_CURSEG_DATA_TYPE; j++) { + if (le32_to_cpu(ckpt->cur_data_segno[i]) == + le32_to_cpu(ckpt->cur_data_segno[j])) { 
+ f2fs_err(sbi, "Data segment (%u, %u) has the same segno: %u", + i, j, + le32_to_cpu(ckpt->cur_data_segno[i])); + return 1; + } + } + } + for (i = 0; i < NR_CURSEG_NODE_TYPE; i++) { + for (j = 0; j < NR_CURSEG_DATA_TYPE; j++) { + if (le32_to_cpu(ckpt->cur_node_segno[i]) == + le32_to_cpu(ckpt->cur_data_segno[j])) { + f2fs_err(sbi, "Node segment (%u) and Data segment (%u) has the same segno: %u", + i, j, + le32_to_cpu(ckpt->cur_node_segno[i])); + return 1; + } + } + } +skip_cross: + sit_bitmap_size = le32_to_cpu(ckpt->sit_ver_bitmap_bytesize); + nat_bitmap_size = le32_to_cpu(ckpt->nat_ver_bitmap_bytesize); + + if (sit_bitmap_size != ((sit_segs / 2) << log_blocks_per_seg) / 8 || + nat_bitmap_size != ((nat_segs / 2) << log_blocks_per_seg) / 8) { + f2fs_err(sbi, "Wrong bitmap size: sit: %u, nat:%u", + sit_bitmap_size, nat_bitmap_size); + return 1; + } + + sit_blk_cnt = DIV_ROUND_UP(main_segs, SIT_ENTRY_PER_BLOCK); + if (sit_bitmap_size * 8 < sit_blk_cnt) { + f2fs_err(sbi, "Wrong bitmap size: sit: %u, sit_blk_cnt:%u", + sit_bitmap_size, sit_blk_cnt); + return 1; + } + + cp_pack_start_sum = __start_sum_addr(sbi); + cp_payload = __cp_payload(sbi); + if (cp_pack_start_sum < cp_payload + 1 || + cp_pack_start_sum > blocks_per_seg - 1 - + NR_CURSEG_PERSIST_TYPE) { + f2fs_err(sbi, "Wrong cp_pack_start_sum: %u", + cp_pack_start_sum); return 1; + } + + if (__is_set_ckpt_flags(ckpt, CP_LARGE_NAT_BITMAP_FLAG) && + le32_to_cpu(ckpt->checksum_offset) != CP_MIN_CHKSUM_OFFSET) { + f2fs_warn(sbi, "using deprecated layout of large_nat_bitmap, " + "please run fsck v1.13.0 or higher to repair, chksum_offset: %u, " + "fixed with patch: \"f2fs-tools: relocate chksum_offset for large_nat_bitmap feature\"", + le32_to_cpu(ckpt->checksum_offset)); + return 1; + } + + nat_blocks = nat_segs << log_blocks_per_seg; + nat_bits_bytes = nat_blocks / BITS_PER_BYTE; + nat_bits_blocks = F2FS_BLK_ALIGN((nat_bits_bytes << 1) + 8); + if (__is_set_ckpt_flags(ckpt, CP_NAT_BITS_FLAG) && + (cp_payload + 
F2FS_CP_PACKS + + NR_CURSEG_PERSIST_TYPE + nat_bits_blocks >= blocks_per_seg)) { + f2fs_warn(sbi, "Insane cp_payload: %u, nat_bits_blocks: %u)", + cp_payload, nat_bits_blocks); + return 1; + } - if (is_set_ckpt_flags(ckpt, CP_ERROR_FLAG)) { - f2fs_msg(sbi->sb, KERN_ERR, "A bug case: need to run fsck"); + if (unlikely(f2fs_cp_error(sbi))) { + f2fs_err(sbi, "A bug case: need to run fsck"); return 1; } return 0; @@ -546,280 +4276,1219 @@ static void init_sb_info(struct f2fs_sb_info *sbi) sbi->log_sectors_per_block = le32_to_cpu(raw_super->log_sectors_per_block); sbi->log_blocksize = le32_to_cpu(raw_super->log_blocksize); - sbi->blocksize = 1 << sbi->log_blocksize; + sbi->blocksize = BIT(sbi->log_blocksize); sbi->log_blocks_per_seg = le32_to_cpu(raw_super->log_blocks_per_seg); - sbi->blocks_per_seg = 1 << sbi->log_blocks_per_seg; + sbi->blocks_per_seg = BIT(sbi->log_blocks_per_seg); sbi->segs_per_sec = le32_to_cpu(raw_super->segs_per_sec); sbi->secs_per_zone = le32_to_cpu(raw_super->secs_per_zone); sbi->total_sections = le32_to_cpu(raw_super->section_count); - sbi->total_node_count = - (le32_to_cpu(raw_super->segment_count_nat) / 2) - * sbi->blocks_per_seg * NAT_ENTRY_PER_BLOCK; - sbi->root_ino_num = le32_to_cpu(raw_super->root_ino); - sbi->node_ino_num = le32_to_cpu(raw_super->node_ino); - sbi->meta_ino_num = le32_to_cpu(raw_super->meta_ino); + sbi->total_node_count = SEGS_TO_BLKS(sbi, + ((le32_to_cpu(raw_super->segment_count_nat) / 2) * + NAT_ENTRY_PER_BLOCK)); + sbi->allocate_section_hint = le32_to_cpu(raw_super->section_count); + sbi->allocate_section_policy = ALLOCATE_FORWARD_NOHINT; + F2FS_ROOT_INO(sbi) = le32_to_cpu(raw_super->root_ino); + F2FS_NODE_INO(sbi) = le32_to_cpu(raw_super->node_ino); + F2FS_META_INO(sbi) = le32_to_cpu(raw_super->meta_ino); sbi->cur_victim_sec = NULL_SECNO; + sbi->gc_mode = GC_NORMAL; + sbi->next_victim_seg[BG_GC] = NULL_SEGNO; + sbi->next_victim_seg[FG_GC] = NULL_SEGNO; + sbi->max_victim_search = DEF_MAX_VICTIM_SEARCH; + 
sbi->migration_granularity = SEGS_PER_SEC(sbi); + sbi->migration_window_granularity = f2fs_sb_has_blkzoned(sbi) ? + DEF_MIGRATION_WINDOW_GRANULARITY_ZONED : SEGS_PER_SEC(sbi); + sbi->seq_file_ra_mul = MIN_RA_MUL; + sbi->max_fragment_chunk = DEF_FRAGMENT_SIZE; + sbi->max_fragment_hole = DEF_FRAGMENT_SIZE; + spin_lock_init(&sbi->gc_remaining_trials_lock); + atomic64_set(&sbi->current_atomic_write, 0); + + sbi->dir_level = DEF_DIR_LEVEL; + sbi->interval_time[CP_TIME] = DEF_CP_INTERVAL; + sbi->interval_time[REQ_TIME] = DEF_IDLE_INTERVAL; + sbi->interval_time[DISCARD_TIME] = DEF_IDLE_INTERVAL; + sbi->interval_time[GC_TIME] = DEF_IDLE_INTERVAL; + sbi->interval_time[DISABLE_TIME] = DEF_DISABLE_INTERVAL; + sbi->interval_time[ENABLE_TIME] = DEF_ENABLE_INTERVAL; + sbi->interval_time[UMOUNT_DISCARD_TIMEOUT] = + DEF_UMOUNT_DISCARD_TIMEOUT; + clear_sbi_flag(sbi, SBI_NEED_FSCK); for (i = 0; i < NR_COUNT_TYPE; i++) atomic_set(&sbi->nr_pages[i], 0); + + for (i = 0; i < META; i++) + atomic_set(&sbi->wb_sync_req[i], 0); + + INIT_LIST_HEAD(&sbi->s_list); + mutex_init(&sbi->umount_mutex); + init_f2fs_rwsem(&sbi->io_order_lock); + spin_lock_init(&sbi->cp_lock); + + sbi->dirty_device = 0; + spin_lock_init(&sbi->dev_lock); + + init_f2fs_rwsem(&sbi->sb_lock); + init_f2fs_rwsem(&sbi->pin_sem); } -static int validate_superblock(struct super_block *sb, - struct f2fs_super_block **raw_super, - struct buffer_head **raw_super_buf, sector_t block) +static int init_percpu_info(struct f2fs_sb_info *sbi) { - const char *super = (block == 0 ? 
"first" : "second"); + int err; - /* read f2fs raw super block */ - *raw_super_buf = sb_bread(sb, block); - if (!*raw_super_buf) { - f2fs_msg(sb, KERN_ERR, "unable to read %s superblock", - super); - return -EIO; + err = percpu_counter_init(&sbi->alloc_valid_block_count, 0, GFP_KERNEL); + if (err) + return err; + + err = percpu_counter_init(&sbi->rf_node_block_count, 0, GFP_KERNEL); + if (err) + goto err_valid_block; + + err = percpu_counter_init(&sbi->total_valid_inode_count, 0, + GFP_KERNEL); + if (err) + goto err_node_block; + return 0; + +err_node_block: + percpu_counter_destroy(&sbi->rf_node_block_count); +err_valid_block: + percpu_counter_destroy(&sbi->alloc_valid_block_count); + return err; +} + +#ifdef CONFIG_BLK_DEV_ZONED + +struct f2fs_report_zones_args { + struct f2fs_sb_info *sbi; + struct f2fs_dev_info *dev; +}; + +static int f2fs_report_zone_cb(struct blk_zone *zone, unsigned int idx, + void *data) +{ + struct f2fs_report_zones_args *rz_args = data; + block_t unusable_blocks = (zone->len - zone->capacity) >> + F2FS_LOG_SECTORS_PER_BLOCK; + + if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) + return 0; + + set_bit(idx, rz_args->dev->blkz_seq); + if (!rz_args->sbi->unusable_blocks_per_sec) { + rz_args->sbi->unusable_blocks_per_sec = unusable_blocks; + return 0; + } + if (rz_args->sbi->unusable_blocks_per_sec != unusable_blocks) { + f2fs_err(rz_args->sbi, "F2FS supports single zone capacity\n"); + return -EINVAL; } + return 0; +} - *raw_super = (struct f2fs_super_block *) - ((char *)(*raw_super_buf)->b_data + F2FS_SUPER_OFFSET); +static int init_blkz_info(struct f2fs_sb_info *sbi, int devi) +{ + struct block_device *bdev = FDEV(devi).bdev; + sector_t nr_sectors = bdev_nr_sectors(bdev); + struct f2fs_report_zones_args rep_zone_arg; + u64 zone_sectors; + unsigned int max_open_zones; + int ret; - /* sanity checking of raw super */ - if (!sanity_check_raw_super(sb, *raw_super)) + if (!f2fs_sb_has_blkzoned(sbi)) return 0; - f2fs_msg(sb, KERN_ERR, "Can't find a 
valid F2FS filesystem " - "in %s superblock", super); - return -EINVAL; + if (bdev_is_zoned(FDEV(devi).bdev)) { + max_open_zones = bdev_max_open_zones(bdev); + if (max_open_zones && (max_open_zones < sbi->max_open_zones)) + sbi->max_open_zones = max_open_zones; + if (sbi->max_open_zones < F2FS_OPTION(sbi).active_logs) { + f2fs_err(sbi, + "zoned: max open zones %u is too small, need at least %u open zones", + sbi->max_open_zones, F2FS_OPTION(sbi).active_logs); + return -EINVAL; + } + } + + zone_sectors = bdev_zone_sectors(bdev); + if (sbi->blocks_per_blkz && sbi->blocks_per_blkz != + SECTOR_TO_BLOCK(zone_sectors)) + return -EINVAL; + sbi->blocks_per_blkz = SECTOR_TO_BLOCK(zone_sectors); + FDEV(devi).nr_blkz = div_u64(SECTOR_TO_BLOCK(nr_sectors), + sbi->blocks_per_blkz); + if (nr_sectors & (zone_sectors - 1)) + FDEV(devi).nr_blkz++; + + FDEV(devi).blkz_seq = f2fs_kvzalloc(sbi, + BITS_TO_LONGS(FDEV(devi).nr_blkz) + * sizeof(unsigned long), + GFP_KERNEL); + if (!FDEV(devi).blkz_seq) + return -ENOMEM; + + rep_zone_arg.sbi = sbi; + rep_zone_arg.dev = &FDEV(devi); + + ret = blkdev_report_zones(bdev, 0, BLK_ALL_ZONES, f2fs_report_zone_cb, + &rep_zone_arg); + if (ret < 0) + return ret; + return 0; +} +#endif + +/* + * Read f2fs raw super block. + * Because we have two copies of super block, so read both of them + * to get the first valid one. If any one of them is broken, we pass + * them recovery flag back to the caller. 
+ */ +static int read_raw_super_block(struct f2fs_sb_info *sbi, + struct f2fs_super_block **raw_super, + int *valid_super_block, int *recovery) +{ + struct super_block *sb = sbi->sb; + int block; + struct folio *folio; + struct f2fs_super_block *super; + int err = 0; + + super = kzalloc(sizeof(struct f2fs_super_block), GFP_KERNEL); + if (!super) + return -ENOMEM; + + for (block = 0; block < 2; block++) { + folio = read_mapping_folio(sb->s_bdev->bd_mapping, block, NULL); + if (IS_ERR(folio)) { + f2fs_err(sbi, "Unable to read %dth superblock", + block + 1); + err = PTR_ERR(folio); + *recovery = 1; + continue; + } + + /* sanity checking of raw super */ + err = sanity_check_raw_super(sbi, folio, block); + if (err) { + f2fs_err(sbi, "Can't find valid F2FS filesystem in %dth superblock", + block + 1); + folio_put(folio); + *recovery = 1; + continue; + } + + if (!*raw_super) { + memcpy(super, F2FS_SUPER_BLOCK(folio, block), + sizeof(*super)); + *valid_super_block = block; + *raw_super = super; + } + folio_put(folio); + } + + /* No valid superblock */ + if (!*raw_super) + kfree(super); + else + err = 0; + + return err; +} + +int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover) +{ + struct folio *folio; + pgoff_t index; + __u32 crc = 0; + int err; + + if ((recover && f2fs_readonly(sbi->sb)) || + f2fs_hw_is_readonly(sbi)) { + set_sbi_flag(sbi, SBI_NEED_SB_WRITE); + return -EROFS; + } + + /* we should update superblock crc here */ + if (!recover && f2fs_sb_has_sb_chksum(sbi)) { + crc = f2fs_crc32(F2FS_RAW_SUPER(sbi), + offsetof(struct f2fs_super_block, crc)); + F2FS_RAW_SUPER(sbi)->crc = cpu_to_le32(crc); + } + + /* write back-up superblock first */ + index = sbi->valid_super_block ? 
0 : 1; + folio = read_mapping_folio(sbi->sb->s_bdev->bd_mapping, index, NULL); + if (IS_ERR(folio)) + return PTR_ERR(folio); + err = __f2fs_commit_super(sbi, folio, index, true); + folio_put(folio); + + /* if we are in recovery path, skip writing valid superblock */ + if (recover || err) + return err; + + /* write current valid superblock */ + index = sbi->valid_super_block; + folio = read_mapping_folio(sbi->sb->s_bdev->bd_mapping, index, NULL); + if (IS_ERR(folio)) + return PTR_ERR(folio); + err = __f2fs_commit_super(sbi, folio, index, true); + folio_put(folio); + return err; +} + +static void save_stop_reason(struct f2fs_sb_info *sbi, unsigned char reason) +{ + unsigned long flags; + + spin_lock_irqsave(&sbi->error_lock, flags); + if (sbi->stop_reason[reason] < GENMASK(BITS_PER_BYTE - 1, 0)) + sbi->stop_reason[reason]++; + spin_unlock_irqrestore(&sbi->error_lock, flags); +} + +static void f2fs_record_stop_reason(struct f2fs_sb_info *sbi) +{ + struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); + unsigned long flags; + int err; + + f2fs_down_write(&sbi->sb_lock); + + spin_lock_irqsave(&sbi->error_lock, flags); + if (sbi->error_dirty) { + memcpy(F2FS_RAW_SUPER(sbi)->s_errors, sbi->errors, + MAX_F2FS_ERRORS); + sbi->error_dirty = false; + } + memcpy(raw_super->s_stop_reason, sbi->stop_reason, MAX_STOP_REASON); + spin_unlock_irqrestore(&sbi->error_lock, flags); + + err = f2fs_commit_super(sbi, false); + + f2fs_up_write(&sbi->sb_lock); + if (err) + f2fs_err_ratelimited(sbi, + "f2fs_commit_super fails to record stop_reason, err:%d", + err); +} + +void f2fs_save_errors(struct f2fs_sb_info *sbi, unsigned char flag) +{ + unsigned long flags; + + spin_lock_irqsave(&sbi->error_lock, flags); + if (!test_bit(flag, (unsigned long *)sbi->errors)) { + set_bit(flag, (unsigned long *)sbi->errors); + sbi->error_dirty = true; + } + spin_unlock_irqrestore(&sbi->error_lock, flags); +} + +void f2fs_handle_error(struct f2fs_sb_info *sbi, unsigned char error) +{ + 
f2fs_save_errors(sbi, error); + + if (!sbi->error_dirty) + return; + if (!test_bit(error, (unsigned long *)sbi->errors)) + return; + schedule_work(&sbi->s_error_work); +} + +static bool system_going_down(void) +{ + return system_state == SYSTEM_HALT || system_state == SYSTEM_POWER_OFF + || system_state == SYSTEM_RESTART; +} + +void f2fs_handle_critical_error(struct f2fs_sb_info *sbi, unsigned char reason) +{ + struct super_block *sb = sbi->sb; + bool shutdown = reason == STOP_CP_REASON_SHUTDOWN; + bool continue_fs = !shutdown && + F2FS_OPTION(sbi).errors == MOUNT_ERRORS_CONTINUE; + + set_ckpt_flags(sbi, CP_ERROR_FLAG); + + if (!f2fs_hw_is_readonly(sbi)) { + save_stop_reason(sbi, reason); + + /* + * always create an asynchronous task to record stop_reason + * in order to avoid potential deadlock when running into + * f2fs_record_stop_reason() synchronously. + */ + schedule_work(&sbi->s_error_work); + } + + /* + * We force ERRORS_RO behavior when system is rebooting. Otherwise we + * could panic during 'reboot -f' as the underlying device got already + * disabled. + */ + if (F2FS_OPTION(sbi).errors == MOUNT_ERRORS_PANIC && + !shutdown && !system_going_down() && + !is_sbi_flag_set(sbi, SBI_IS_SHUTDOWN)) + panic("F2FS-fs (device %s): panic forced after error\n", + sb->s_id); + + if (shutdown) + set_sbi_flag(sbi, SBI_IS_SHUTDOWN); + else + dump_stack(); + + /* + * Continue filesystem operators if errors=continue. Should not set + * RO by shutdown, since RO bypasses thaw_super which can hang the + * system. 
+ */ + if (continue_fs || f2fs_readonly(sb) || shutdown) { + f2fs_warn(sbi, "Stopped filesystem due to reason: %d", reason); + return; + } + + f2fs_warn(sbi, "Remounting filesystem read-only"); + + /* + * We have already set CP_ERROR_FLAG flag to stop all updates + * to filesystem, so it doesn't need to set SB_RDONLY flag here + * because the flag should be set covered w/ sb->s_umount semaphore + * via remount procedure, otherwise, it will confuse code like + * freeze_super() which will lead to deadlocks and other problems. + */ +} + +static void f2fs_record_error_work(struct work_struct *work) +{ + struct f2fs_sb_info *sbi = container_of(work, + struct f2fs_sb_info, s_error_work); + + f2fs_record_stop_reason(sbi); } -static int f2fs_fill_super(struct super_block *sb, void *data, int silent) +static inline unsigned int get_first_seq_zone_segno(struct f2fs_sb_info *sbi) { +#ifdef CONFIG_BLK_DEV_ZONED + unsigned int zoneno, total_zones; + int devi; + + if (!f2fs_sb_has_blkzoned(sbi)) + return NULL_SEGNO; + + for (devi = 0; devi < sbi->s_ndevs; devi++) { + if (!bdev_is_zoned(FDEV(devi).bdev)) + continue; + + total_zones = GET_ZONE_FROM_SEG(sbi, FDEV(devi).total_segments); + + for (zoneno = 0; zoneno < total_zones; zoneno++) { + unsigned int segs, blks; + + if (!f2fs_zone_is_seq(sbi, devi, zoneno)) + continue; + + segs = GET_SEG_FROM_SEC(sbi, + zoneno * sbi->secs_per_zone); + blks = SEGS_TO_BLKS(sbi, segs); + return GET_SEGNO(sbi, FDEV(devi).start_blk + blks); + } + } +#endif + return NULL_SEGNO; +} + +static int f2fs_scan_devices(struct f2fs_sb_info *sbi) +{ + struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); + unsigned int max_devices = MAX_DEVICES; + unsigned int logical_blksize; + blk_mode_t mode = sb_open_mode(sbi->sb->s_flags); + int i; + + /* Initialize single device information */ + if (!RDEV(0).path[0]) { + if (!bdev_is_zoned(sbi->sb->s_bdev)) + return 0; + max_devices = 1; + } + + /* + * Initialize multiple devices information, or single + * zoned 
block device information. + */ + sbi->devs = f2fs_kzalloc(sbi, + array_size(max_devices, + sizeof(struct f2fs_dev_info)), + GFP_KERNEL); + if (!sbi->devs) + return -ENOMEM; + + logical_blksize = bdev_logical_block_size(sbi->sb->s_bdev); + sbi->aligned_blksize = true; + sbi->bggc_io_aware = AWARE_ALL_IO; +#ifdef CONFIG_BLK_DEV_ZONED + sbi->max_open_zones = UINT_MAX; + sbi->blkzone_alloc_policy = BLKZONE_ALLOC_PRIOR_SEQ; + sbi->bggc_io_aware = AWARE_READ_IO; +#endif + + for (i = 0; i < max_devices; i++) { + if (max_devices == 1) { + FDEV(i).total_segments = + le32_to_cpu(raw_super->segment_count_main); + FDEV(i).start_blk = 0; + FDEV(i).end_blk = FDEV(i).total_segments * + BLKS_PER_SEG(sbi); + } + + if (i == 0) + FDEV(0).bdev_file = sbi->sb->s_bdev_file; + else if (!RDEV(i).path[0]) + break; + + if (max_devices > 1) { + /* Multi-device mount */ + memcpy(FDEV(i).path, RDEV(i).path, MAX_PATH_LEN); + FDEV(i).total_segments = + le32_to_cpu(RDEV(i).total_segments); + if (i == 0) { + FDEV(i).start_blk = 0; + FDEV(i).end_blk = FDEV(i).start_blk + + SEGS_TO_BLKS(sbi, + FDEV(i).total_segments) - 1 + + le32_to_cpu(raw_super->segment0_blkaddr); + sbi->allocate_section_hint = FDEV(i).total_segments / + SEGS_PER_SEC(sbi); + } else { + FDEV(i).start_blk = FDEV(i - 1).end_blk + 1; + FDEV(i).end_blk = FDEV(i).start_blk + + SEGS_TO_BLKS(sbi, + FDEV(i).total_segments) - 1; + FDEV(i).bdev_file = bdev_file_open_by_path( + FDEV(i).path, mode, sbi->sb, NULL); + } + } + if (IS_ERR(FDEV(i).bdev_file)) + return PTR_ERR(FDEV(i).bdev_file); + + FDEV(i).bdev = file_bdev(FDEV(i).bdev_file); + /* to release errored devices */ + sbi->s_ndevs = i + 1; + + if (logical_blksize != bdev_logical_block_size(FDEV(i).bdev)) + sbi->aligned_blksize = false; + +#ifdef CONFIG_BLK_DEV_ZONED + if (bdev_is_zoned(FDEV(i).bdev)) { + if (!f2fs_sb_has_blkzoned(sbi)) { + f2fs_err(sbi, "Zoned block device feature not enabled"); + return -EINVAL; + } + if (init_blkz_info(sbi, i)) { + f2fs_err(sbi, "Failed to initialize 
F2FS blkzone information"); + return -EINVAL; + } + if (max_devices == 1) + break; + f2fs_info(sbi, "Mount Device [%2d]: %20s, %8u, %8x - %8x (zone: Host-managed)", + i, FDEV(i).path, + FDEV(i).total_segments, + FDEV(i).start_blk, FDEV(i).end_blk); + continue; + } +#endif + f2fs_info(sbi, "Mount Device [%2d]: %20s, %8u, %8x - %8x", + i, FDEV(i).path, + FDEV(i).total_segments, + FDEV(i).start_blk, FDEV(i).end_blk); + } + return 0; +} + +static int f2fs_setup_casefold(struct f2fs_sb_info *sbi) +{ +#if IS_ENABLED(CONFIG_UNICODE) + if (f2fs_sb_has_casefold(sbi) && !sbi->sb->s_encoding) { + const struct f2fs_sb_encodings *encoding_info; + struct unicode_map *encoding; + __u16 encoding_flags; + + encoding_info = f2fs_sb_read_encoding(sbi->raw_super); + if (!encoding_info) { + f2fs_err(sbi, + "Encoding requested by superblock is unknown"); + return -EINVAL; + } + + encoding_flags = le16_to_cpu(sbi->raw_super->s_encoding_flags); + encoding = utf8_load(encoding_info->version); + if (IS_ERR(encoding)) { + f2fs_err(sbi, + "can't mount with superblock charset: %s-%u.%u.%u " + "not supported by the kernel. 
flags: 0x%x.", + encoding_info->name, + unicode_major(encoding_info->version), + unicode_minor(encoding_info->version), + unicode_rev(encoding_info->version), + encoding_flags); + return PTR_ERR(encoding); + } + f2fs_info(sbi, "Using encoding defined by superblock: " + "%s-%u.%u.%u with flags 0x%hx", encoding_info->name, + unicode_major(encoding_info->version), + unicode_minor(encoding_info->version), + unicode_rev(encoding_info->version), + encoding_flags); + + sbi->sb->s_encoding = encoding; + sbi->sb->s_encoding_flags = encoding_flags; + } +#else + if (f2fs_sb_has_casefold(sbi)) { + f2fs_err(sbi, "Filesystem with casefold feature cannot be mounted without CONFIG_UNICODE"); + return -EINVAL; + } +#endif + return 0; +} + +static void f2fs_tuning_parameters(struct f2fs_sb_info *sbi) +{ + /* adjust parameters according to the volume size */ + if (MAIN_SEGS(sbi) <= SMALL_VOLUME_SEGMENTS) { + if (f2fs_block_unit_discard(sbi)) + SM_I(sbi)->dcc_info->discard_granularity = + MIN_DISCARD_GRANULARITY; + if (!f2fs_lfs_mode(sbi)) + SM_I(sbi)->ipu_policy = BIT(F2FS_IPU_FORCE) | + BIT(F2FS_IPU_HONOR_OPU_WRITE); + } + + sbi->readdir_ra = true; +} + +static int f2fs_fill_super(struct super_block *sb, struct fs_context *fc) +{ + struct f2fs_fs_context *ctx = fc->fs_private; struct f2fs_sb_info *sbi; struct f2fs_super_block *raw_super; - struct buffer_head *raw_super_buf; struct inode *root; - long err = -EINVAL; - int i; + int err; + bool skip_recovery = false, need_fsck = false; + int recovery, i, valid_super_block; + struct curseg_info *seg_i; + int retry_cnt = 1; +#ifdef CONFIG_QUOTA + bool quota_enabled = false; +#endif + +try_onemore: + err = -EINVAL; + raw_super = NULL; + valid_super_block = -1; + recovery = 0; /* allocate memory for f2fs-specific super block info */ sbi = kzalloc(sizeof(struct f2fs_sb_info), GFP_KERNEL); if (!sbi) return -ENOMEM; + sbi->sb = sb; + + /* initialize locks within allocated memory */ + init_f2fs_rwsem(&sbi->gc_lock); + 
mutex_init(&sbi->writepages); + init_f2fs_rwsem(&sbi->cp_global_sem); + init_f2fs_rwsem(&sbi->node_write); + init_f2fs_rwsem(&sbi->node_change); + spin_lock_init(&sbi->stat_lock); + init_f2fs_rwsem(&sbi->cp_rwsem); + init_f2fs_rwsem(&sbi->cp_enable_rwsem); + init_f2fs_rwsem(&sbi->quota_sem); + init_waitqueue_head(&sbi->cp_wait); + spin_lock_init(&sbi->error_lock); + + for (i = 0; i < NR_INODE_TYPE; i++) { + INIT_LIST_HEAD(&sbi->inode_list[i]); + spin_lock_init(&sbi->inode_lock[i]); + } + mutex_init(&sbi->flush_lock); + /* set a block size */ - if (!sb_set_blocksize(sb, F2FS_BLKSIZE)) { - f2fs_msg(sb, KERN_ERR, "unable to set blocksize"); + if (unlikely(!sb_set_blocksize(sb, F2FS_BLKSIZE))) { + f2fs_err(sbi, "unable to set blocksize"); goto free_sbi; } - err = validate_superblock(sb, &raw_super, &raw_super_buf, 0); - if (err) { - brelse(raw_super_buf); - /* check secondary superblock when primary failed */ - err = validate_superblock(sb, &raw_super, &raw_super_buf, 1); - if (err) - goto free_sb_buf; - } + err = read_raw_super_block(sbi, &raw_super, &valid_super_block, + &recovery); + if (err) + goto free_sbi; + sb->s_fs_info = sbi; - /* init some FS parameters */ - sbi->active_logs = NR_CURSEG_TYPE; + sbi->raw_super = raw_super; - set_opt(sbi, BG_GC); + INIT_WORK(&sbi->s_error_work, f2fs_record_error_work); + memcpy(sbi->errors, raw_super->s_errors, MAX_F2FS_ERRORS); + memcpy(sbi->stop_reason, raw_super->s_stop_reason, MAX_STOP_REASON); -#ifdef CONFIG_F2FS_FS_XATTR - set_opt(sbi, XATTR_USER); -#endif -#ifdef CONFIG_F2FS_FS_POSIX_ACL - set_opt(sbi, POSIX_ACL); -#endif - /* parse mount options */ - err = parse_options(sb, (char *)data); + /* precompute checksum seed for metadata */ + if (f2fs_sb_has_inode_chksum(sbi)) + sbi->s_chksum_seed = f2fs_chksum(~0, raw_super->uuid, + sizeof(raw_super->uuid)); + + default_options(sbi, false); + + err = f2fs_check_opt_consistency(fc, sb); if (err) goto free_sb_buf; - sb->s_maxbytes = 
max_file_size(le32_to_cpu(raw_super->log_blocksize)); + f2fs_apply_options(fc, sb); + + err = f2fs_sanity_check_options(sbi, false); + if (err) + goto free_options; + + sb->s_maxbytes = max_file_blocks(NULL) << + le32_to_cpu(raw_super->log_blocksize); sb->s_max_links = F2FS_LINK_MAX; - get_random_bytes(&sbi->s_next_generation, sizeof(u32)); + + err = f2fs_setup_casefold(sbi); + if (err) + goto free_options; + +#ifdef CONFIG_QUOTA + sb->dq_op = &f2fs_quota_operations; + sb->s_qcop = &f2fs_quotactl_ops; + sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP | QTYPE_MASK_PRJ; + + if (f2fs_sb_has_quota_ino(sbi)) { + for (i = 0; i < MAXQUOTAS; i++) { + if (f2fs_qf_ino(sbi->sb, i)) + sbi->nquota_files++; + } + } +#endif sb->s_op = &f2fs_sops; +#ifdef CONFIG_FS_ENCRYPTION + sb->s_cop = &f2fs_cryptops; +#endif +#ifdef CONFIG_FS_VERITY + sb->s_vop = &f2fs_verityops; +#endif sb->s_xattr = f2fs_xattr_handlers; sb->s_export_op = &f2fs_export_ops; sb->s_magic = F2FS_SUPER_MAGIC; sb->s_time_gran = 1; - sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | - (test_opt(sbi, POSIX_ACL) ? MS_POSIXACL : 0); - memcpy(sb->s_uuid, raw_super->uuid, sizeof(raw_super->uuid)); + sb->s_flags = (sb->s_flags & ~SB_POSIXACL) | + (test_opt(sbi, POSIX_ACL) ? 
SB_POSIXACL : 0); + if (test_opt(sbi, INLINECRYPT)) + sb->s_flags |= SB_INLINECRYPT; + + if (test_opt(sbi, LAZYTIME)) + sb->s_flags |= SB_LAZYTIME; + else + sb->s_flags &= ~SB_LAZYTIME; + + super_set_uuid(sb, (void *) raw_super->uuid, sizeof(raw_super->uuid)); + super_set_sysfs_name_bdev(sb); + sb->s_iflags |= SB_I_CGROUPWB; /* init f2fs-specific super block info */ - sbi->sb = sb; - sbi->raw_super = raw_super; - sbi->raw_super_buf = raw_super_buf; - mutex_init(&sbi->gc_mutex); - mutex_init(&sbi->writepages); - mutex_init(&sbi->cp_mutex); - for (i = 0; i < NR_GLOBAL_LOCKS; i++) - mutex_init(&sbi->fs_lock[i]); - mutex_init(&sbi->node_write); - sbi->por_doing = 0; - spin_lock_init(&sbi->stat_lock); - init_rwsem(&sbi->bio_sem); + sbi->valid_super_block = valid_super_block; + + /* disallow all the data/node/meta page writes */ + set_sbi_flag(sbi, SBI_POR_DOING); + + err = f2fs_init_write_merge_io(sbi); + if (err) + goto free_bio_info; + init_sb_info(sbi); + err = f2fs_init_iostat(sbi); + if (err) + goto free_bio_info; + + err = init_percpu_info(sbi); + if (err) + goto free_iostat; + + err = f2fs_init_page_array_cache(sbi); + if (err) + goto free_percpu; + /* get an inode for meta space */ sbi->meta_inode = f2fs_iget(sb, F2FS_META_INO(sbi)); if (IS_ERR(sbi->meta_inode)) { - f2fs_msg(sb, KERN_ERR, "Failed to read F2FS meta data inode"); + f2fs_err(sbi, "Failed to read F2FS meta data inode"); err = PTR_ERR(sbi->meta_inode); - goto free_sb_buf; + goto free_page_array_cache; } - err = get_valid_checkpoint(sbi); + err = f2fs_get_valid_checkpoint(sbi); if (err) { - f2fs_msg(sb, KERN_ERR, "Failed to get valid F2FS checkpoint"); + f2fs_err(sbi, "Failed to get valid F2FS checkpoint"); goto free_meta_inode; } - /* sanity checking of checkpoint */ - err = -EINVAL; - if (sanity_check_ckpt(sbi)) { - f2fs_msg(sb, KERN_ERR, "Invalid F2FS checkpoint"); - goto free_cp; + if (__is_set_ckpt_flags(F2FS_CKPT(sbi), CP_QUOTA_NEED_FSCK_FLAG)) + set_sbi_flag(sbi, SBI_QUOTA_NEED_REPAIR); + if 
(__is_set_ckpt_flags(F2FS_CKPT(sbi), CP_DISABLED_QUICK_FLAG)) { + set_sbi_flag(sbi, SBI_CP_DISABLED_QUICK); + sbi->interval_time[DISABLE_TIME] = DEF_DISABLE_QUICK_INTERVAL; + } + + if (__is_set_ckpt_flags(F2FS_CKPT(sbi), CP_FSCK_FLAG)) + set_sbi_flag(sbi, SBI_NEED_FSCK); + + /* Initialize device list */ + err = f2fs_scan_devices(sbi); + if (err) { + f2fs_err(sbi, "Failed to find devices"); + goto free_devices; + } + + err = f2fs_init_post_read_wq(sbi); + if (err) { + f2fs_err(sbi, "Failed to initialize post read workqueue"); + goto free_devices; } sbi->total_valid_node_count = le32_to_cpu(sbi->ckpt->valid_node_count); - sbi->total_valid_inode_count = - le32_to_cpu(sbi->ckpt->valid_inode_count); + percpu_counter_set(&sbi->total_valid_inode_count, + le32_to_cpu(sbi->ckpt->valid_inode_count)); sbi->user_block_count = le64_to_cpu(sbi->ckpt->user_block_count); sbi->total_valid_block_count = le64_to_cpu(sbi->ckpt->valid_block_count); sbi->last_valid_block_count = sbi->total_valid_block_count; - sbi->alloc_valid_block_count = 0; - INIT_LIST_HEAD(&sbi->dir_inode_list); - spin_lock_init(&sbi->dir_inode_lock); + sbi->reserved_blocks = 0; + sbi->current_reserved_blocks = 0; + limit_reserve_root(sbi); + adjust_unusable_cap_perc(sbi); + + f2fs_init_extent_cache_info(sbi); - init_orphan_info(sbi); + f2fs_init_ino_entry_info(sbi); + + f2fs_init_fsync_node_info(sbi); + + /* setup checkpoint request control and start checkpoint issue thread */ + f2fs_init_ckpt_req_control(sbi); + if (!f2fs_readonly(sb) && !test_opt(sbi, DISABLE_CHECKPOINT) && + test_opt(sbi, MERGE_CHECKPOINT)) { + err = f2fs_start_ckpt_thread(sbi); + if (err) { + f2fs_err(sbi, + "Failed to start F2FS issue_checkpoint_thread (%d)", + err); + goto stop_ckpt_thread; + } + } /* setup f2fs internal modules */ - err = build_segment_manager(sbi); + err = f2fs_build_segment_manager(sbi); if (err) { - f2fs_msg(sb, KERN_ERR, - "Failed to initialize F2FS segment manager"); + f2fs_err(sbi, "Failed to initialize F2FS segment 
manager (%d)", + err); goto free_sm; } - err = build_node_manager(sbi); + err = f2fs_build_node_manager(sbi); if (err) { - f2fs_msg(sb, KERN_ERR, - "Failed to initialize F2FS node manager"); + f2fs_err(sbi, "Failed to initialize F2FS node manager (%d)", + err); goto free_nm; } - build_gc_manager(sbi); + /* For write statistics */ + sbi->sectors_written_start = f2fs_get_sectors_written(sbi); + + /* get segno of first zoned block device */ + sbi->first_seq_zone_segno = get_first_seq_zone_segno(sbi); + + sbi->reserved_pin_section = f2fs_sb_has_blkzoned(sbi) ? + ZONED_PIN_SEC_REQUIRED_COUNT : + GET_SEC_FROM_SEG(sbi, overprovision_segments(sbi)); + + /* Read accumulated write IO statistics if exists */ + seg_i = CURSEG_I(sbi, CURSEG_HOT_NODE); + if (__exist_node_summaries(sbi)) + sbi->kbytes_written = + le64_to_cpu(seg_i->journal->info.kbytes_written); + + f2fs_build_gc_manager(sbi); + + err = f2fs_build_stats(sbi); + if (err) + goto free_nm; /* get an inode for node space */ sbi->node_inode = f2fs_iget(sb, F2FS_NODE_INO(sbi)); if (IS_ERR(sbi->node_inode)) { - f2fs_msg(sb, KERN_ERR, "Failed to read node inode"); + f2fs_err(sbi, "Failed to read node inode"); err = PTR_ERR(sbi->node_inode); - goto free_nm; + goto free_stats; } - /* if there are nt orphan nodes free them */ - err = -EINVAL; - if (recover_orphan_inodes(sbi)) - goto free_node_inode; - /* read root inode and dentry */ root = f2fs_iget(sb, F2FS_ROOT_INO(sbi)); if (IS_ERR(root)) { - f2fs_msg(sb, KERN_ERR, "Failed to read root inode"); + f2fs_err(sbi, "Failed to read root inode"); err = PTR_ERR(root); goto free_node_inode; } - if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) - goto free_root_inode; + if (!S_ISDIR(root->i_mode) || !root->i_blocks || + !root->i_size || !root->i_nlink) { + iput(root); + err = -EINVAL; + goto free_node_inode; + } + generic_set_sb_d_ops(sb); sb->s_root = d_make_root(root); /* allocate root dentry */ if (!sb->s_root) { err = -ENOMEM; + goto free_node_inode; + } + + err 
= f2fs_init_compress_inode(sbi); + if (err) goto free_root_inode; + + err = f2fs_register_sysfs(sbi); + if (err) + goto free_compress_inode; + + sbi->umount_lock_holder = current; +#ifdef CONFIG_QUOTA + /* Enable quota usage during mount */ + if (f2fs_sb_has_quota_ino(sbi) && !f2fs_readonly(sb)) { + err = f2fs_enable_quotas(sb); + if (err) + f2fs_err(sbi, "Cannot turn on quotas: error %d", err); + } + + quota_enabled = f2fs_recover_quota_begin(sbi); +#endif + /* if there are any orphan inodes, free them */ + err = f2fs_recover_orphan_inodes(sbi); + if (err) + goto free_meta; + + if (unlikely(is_set_ckpt_flags(sbi, CP_DISABLED_FLAG))) { + skip_recovery = true; + goto reset_checkpoint; } /* recover fsynced data */ - if (!test_opt(sbi, DISABLE_ROLL_FORWARD)) { - err = recover_fsync_data(sbi); - if (err) - f2fs_msg(sb, KERN_ERR, - "Cannot recover all fsync data errno=%ld", err); + if (!test_opt(sbi, DISABLE_ROLL_FORWARD) && + !test_opt(sbi, NORECOVERY)) { + /* + * mount should be failed, when device has readonly mode, and + * previous checkpoint was not done by clean system shutdown. 
+ */ + if (f2fs_hw_is_readonly(sbi)) { + if (!is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG)) { + err = f2fs_recover_fsync_data(sbi, true); + if (err > 0) { + err = -EROFS; + f2fs_err(sbi, "Need to recover fsync data, but " + "write access unavailable, please try " + "mount w/ disable_roll_forward or norecovery"); + } + if (err < 0) + goto free_meta; + } + f2fs_info(sbi, "write access unavailable, skipping recovery"); + goto reset_checkpoint; + } + + if (need_fsck) + set_sbi_flag(sbi, SBI_NEED_FSCK); + + if (skip_recovery) + goto reset_checkpoint; + + err = f2fs_recover_fsync_data(sbi, false); + if (err < 0) { + if (err != -ENOMEM) + skip_recovery = true; + need_fsck = true; + f2fs_err(sbi, "Cannot recover all fsync data errno=%d", + err); + goto free_meta; + } + } else { + err = f2fs_recover_fsync_data(sbi, true); + if (err > 0) { + if (!f2fs_readonly(sb)) { + f2fs_err(sbi, "Need to recover fsync data"); + err = -EINVAL; + goto free_meta; + } else { + f2fs_info(sbi, "drop all fsynced data"); + err = 0; + } + } } +reset_checkpoint: +#ifdef CONFIG_QUOTA + f2fs_recover_quota_end(sbi, quota_enabled); +#endif + /* + * If the f2fs is not readonly and fsync data recovery succeeds, + * write pointer consistency of cursegs and other zones are already + * checked and fixed during recovery. However, if recovery fails, + * write pointers are left untouched, and retry-mount should check + * them here. + */ + if (skip_recovery) + err = f2fs_check_and_fix_write_pointer(sbi); + if (err) + goto free_meta; + + /* f2fs_recover_fsync_data() cleared this already */ + clear_sbi_flag(sbi, SBI_POR_DOING); + + err = f2fs_init_inmem_curseg(sbi); + if (err) + goto sync_free_meta; + + if (test_opt(sbi, DISABLE_CHECKPOINT)) + err = f2fs_disable_checkpoint(sbi); + else if (is_set_ckpt_flags(sbi, CP_DISABLED_FLAG)) + err = f2fs_enable_checkpoint(sbi); + if (err) + goto sync_free_meta; + /* * If filesystem is not mounted as read-only then * do start the gc_thread. 
*/ - if (!(sb->s_flags & MS_RDONLY)) { + if ((F2FS_OPTION(sbi).bggc_mode != BGGC_MODE_OFF || + test_opt(sbi, GC_MERGE)) && !f2fs_readonly(sb)) { /* After POR, we can run background GC thread.*/ - err = start_gc_thread(sbi); + err = f2fs_start_gc_thread(sbi); if (err) - goto fail; + goto sync_free_meta; } - err = f2fs_build_stats(sbi); - if (err) - goto fail; - - if (test_opt(sbi, DISCARD)) { - struct request_queue *q = bdev_get_queue(sb->s_bdev); - if (!blk_queue_discard(q)) - f2fs_msg(sb, KERN_WARNING, - "mounting with \"discard\" option, but " - "the device does not support discard"); + /* recover broken superblock */ + if (recovery) { + err = f2fs_commit_super(sbi, true); + f2fs_info(sbi, "Try to recover %dth superblock, ret: %d", + sbi->valid_super_block ? 1 : 2, err); } + f2fs_join_shrinker(sbi); + + f2fs_tuning_parameters(sbi); + + f2fs_notice(sbi, "Mounted with checkpoint version = %llx", + cur_cp_version(F2FS_CKPT(sbi))); + f2fs_update_time(sbi, CP_TIME); + f2fs_update_time(sbi, REQ_TIME); + clear_sbi_flag(sbi, SBI_CP_DISABLED_QUICK); + + sbi->umount_lock_holder = NULL; return 0; -fail: - stop_gc_thread(sbi); + +sync_free_meta: + /* safe to flush all the data */ + sync_filesystem(sbi->sb); + retry_cnt = 0; + +free_meta: +#ifdef CONFIG_QUOTA + f2fs_truncate_quota_inode_pages(sb); + if (f2fs_sb_has_quota_ino(sbi) && !f2fs_readonly(sb)) + f2fs_quota_off_umount(sbi->sb); +#endif + /* + * Some dirty meta pages can be produced by f2fs_recover_orphan_inodes() + * failed by EIO. Then, iput(node_inode) can trigger balance_fs_bg() + * followed by f2fs_write_checkpoint() through f2fs_write_node_pages(), which + * falls into an infinite loop in f2fs_sync_meta_pages(). 
+ */ + truncate_inode_pages_final(META_MAPPING(sbi)); + /* evict some inodes being cached by GC */ + evict_inodes(sb); + f2fs_unregister_sysfs(sbi); +free_compress_inode: + f2fs_destroy_compress_inode(sbi); free_root_inode: dput(sb->s_root); sb->s_root = NULL; free_node_inode: + f2fs_release_ino_entry(sbi, true); + truncate_inode_pages_final(NODE_MAPPING(sbi)); iput(sbi->node_inode); + sbi->node_inode = NULL; +free_stats: + f2fs_destroy_stats(sbi); free_nm: - destroy_node_manager(sbi); + /* stop discard thread before destroying node manager */ + f2fs_stop_discard_thread(sbi); + f2fs_destroy_node_manager(sbi); free_sm: - destroy_segment_manager(sbi); -free_cp: - kfree(sbi->ckpt); + f2fs_destroy_segment_manager(sbi); +stop_ckpt_thread: + f2fs_stop_ckpt_thread(sbi); + /* flush s_error_work before sbi destroy */ + flush_work(&sbi->s_error_work); + f2fs_destroy_post_read_wq(sbi); +free_devices: + destroy_device_list(sbi); + kvfree(sbi->ckpt); free_meta_inode: make_bad_inode(sbi->meta_inode); iput(sbi->meta_inode); + sbi->meta_inode = NULL; +free_page_array_cache: + f2fs_destroy_page_array_cache(sbi); +free_percpu: + destroy_percpu_info(sbi); +free_iostat: + f2fs_destroy_iostat(sbi); +free_bio_info: + for (i = 0; i < NR_PAGE_TYPE; i++) + kfree(sbi->write_io[i]); + +#if IS_ENABLED(CONFIG_UNICODE) + utf8_unload(sb->s_encoding); + sb->s_encoding = NULL; +#endif +free_options: +#ifdef CONFIG_QUOTA + for (i = 0; i < MAXQUOTAS; i++) + kfree(F2FS_OPTION(sbi).s_qf_names[i]); +#endif + /* no need to free dummy_enc_policy, we just keep it in ctx when failed */ + swap(F2FS_CTX_INFO(ctx).dummy_enc_policy, F2FS_OPTION(sbi).dummy_enc_policy); free_sb_buf: - brelse(raw_super_buf); + kfree(raw_super); free_sbi: kfree(sbi); + sb->s_fs_info = NULL; + + /* give only one another chance */ + if (retry_cnt > 0 && skip_recovery) { + retry_cnt--; + shrink_dcache_sb(sb); + goto try_onemore; + } return err; } -static struct dentry *f2fs_mount(struct file_system_type *fs_type, int flags, - const 
char *dev_name, void *data) +static int f2fs_get_tree(struct fs_context *fc) +{ + return get_tree_bdev(fc, f2fs_fill_super); +} + +static int f2fs_reconfigure(struct fs_context *fc) +{ + struct super_block *sb = fc->root->d_sb; + + return __f2fs_remount(fc, sb); +} + +static void f2fs_fc_free(struct fs_context *fc) { - return mount_bdev(fs_type, flags, dev_name, data, f2fs_fill_super); + struct f2fs_fs_context *ctx = fc->fs_private; + + if (!ctx) + return; + +#ifdef CONFIG_QUOTA + f2fs_unnote_qf_name_all(fc); +#endif + fscrypt_free_dummy_policy(&F2FS_CTX_INFO(ctx).dummy_enc_policy); + kfree(ctx); +} + +static const struct fs_context_operations f2fs_context_ops = { + .parse_param = f2fs_parse_param, + .get_tree = f2fs_get_tree, + .reconfigure = f2fs_reconfigure, + .free = f2fs_fc_free, +}; + +static void kill_f2fs_super(struct super_block *sb) +{ + struct f2fs_sb_info *sbi = F2FS_SB(sb); + + if (sb->s_root) { + sbi->umount_lock_holder = current; + + set_sbi_flag(sbi, SBI_IS_CLOSE); + f2fs_stop_gc_thread(sbi); + f2fs_stop_discard_thread(sbi); + +#ifdef CONFIG_F2FS_FS_COMPRESSION + /* + * latter evict_inode() can bypass checking and invalidating + * compress inode cache. + */ + if (test_opt(sbi, COMPRESS_CACHE)) + truncate_inode_pages_final(COMPRESS_MAPPING(sbi)); +#endif + + if (is_sbi_flag_set(sbi, SBI_IS_DIRTY) || + !is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG)) { + struct cp_control cpc = { + .reason = CP_UMOUNT, + }; + stat_inc_cp_call_count(sbi, TOTAL_CALL); + f2fs_write_checkpoint(sbi, &cpc); + } + + if (is_sbi_flag_set(sbi, SBI_IS_RECOVERED) && f2fs_readonly(sb)) + sb->s_flags &= ~SB_RDONLY; + } + kill_block_super(sb); + /* Release block devices last, after fscrypt_destroy_keyring(). 
*/ + if (sbi) { + destroy_device_list(sbi); + kfree(sbi); + sb->s_fs_info = NULL; + } +} + +static int f2fs_init_fs_context(struct fs_context *fc) +{ + struct f2fs_fs_context *ctx; + + ctx = kzalloc(sizeof(struct f2fs_fs_context), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + fc->fs_private = ctx; + fc->ops = &f2fs_context_ops; + + return 0; } static struct file_system_type f2fs_fs_type = { .owner = THIS_MODULE, .name = "f2fs", - .mount = f2fs_mount, - .kill_sb = kill_block_super, - .fs_flags = FS_REQUIRES_DEV, + .init_fs_context = f2fs_init_fs_context, + .kill_sb = kill_f2fs_super, + .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP, }; MODULE_ALIAS_FS("f2fs"); static int __init init_inodecache(void) { - f2fs_inode_cachep = f2fs_kmem_cache_create("f2fs_inode_cache", - sizeof(struct f2fs_inode_info), NULL); - if (f2fs_inode_cachep == NULL) - return -ENOMEM; - return 0; + f2fs_inode_cachep = kmem_cache_create("f2fs_inode_cache", + sizeof(struct f2fs_inode_info), 0, + SLAB_RECLAIM_ACCOUNT|SLAB_ACCOUNT, NULL); + return f2fs_inode_cachep ? 
0 : -ENOMEM; } static void destroy_inodecache(void) @@ -839,30 +5508,118 @@ static int __init init_f2fs_fs(void) err = init_inodecache(); if (err) goto fail; - err = create_node_manager_caches(); + err = f2fs_create_node_manager_caches(); if (err) - goto fail; - err = create_gc_caches(); + goto free_inodecache; + err = f2fs_create_segment_manager_caches(); if (err) - goto fail; - err = create_checkpoint_caches(); + goto free_node_manager_caches; + err = f2fs_create_checkpoint_caches(); if (err) - goto fail; - err = register_filesystem(&f2fs_fs_type); + goto free_segment_manager_caches; + err = f2fs_create_recovery_cache(); if (err) - goto fail; + goto free_checkpoint_caches; + err = f2fs_create_extent_cache(); + if (err) + goto free_recovery_cache; + err = f2fs_create_garbage_collection_cache(); + if (err) + goto free_extent_cache; + err = f2fs_init_sysfs(); + if (err) + goto free_garbage_collection_cache; + err = f2fs_init_shrinker(); + if (err) + goto free_sysfs; f2fs_create_root_stats(); + err = f2fs_init_post_read_processing(); + if (err) + goto free_root_stats; + err = f2fs_init_iostat_processing(); + if (err) + goto free_post_read; + err = f2fs_init_bio_entry_cache(); + if (err) + goto free_iostat; + err = f2fs_init_bioset(); + if (err) + goto free_bio_entry_cache; + err = f2fs_init_compress_mempool(); + if (err) + goto free_bioset; + err = f2fs_init_compress_cache(); + if (err) + goto free_compress_mempool; + err = f2fs_create_casefold_cache(); + if (err) + goto free_compress_cache; + err = f2fs_init_xattr_cache(); + if (err) + goto free_casefold_cache; + err = register_filesystem(&f2fs_fs_type); + if (err) + goto free_xattr_cache; + return 0; +free_xattr_cache: + f2fs_destroy_xattr_cache(); +free_casefold_cache: + f2fs_destroy_casefold_cache(); +free_compress_cache: + f2fs_destroy_compress_cache(); +free_compress_mempool: + f2fs_destroy_compress_mempool(); +free_bioset: + f2fs_destroy_bioset(); +free_bio_entry_cache: + f2fs_destroy_bio_entry_cache(); 
+free_iostat: + f2fs_destroy_iostat_processing(); +free_post_read: + f2fs_destroy_post_read_processing(); +free_root_stats: + f2fs_destroy_root_stats(); + f2fs_exit_shrinker(); +free_sysfs: + f2fs_exit_sysfs(); +free_garbage_collection_cache: + f2fs_destroy_garbage_collection_cache(); +free_extent_cache: + f2fs_destroy_extent_cache(); +free_recovery_cache: + f2fs_destroy_recovery_cache(); +free_checkpoint_caches: + f2fs_destroy_checkpoint_caches(); +free_segment_manager_caches: + f2fs_destroy_segment_manager_caches(); +free_node_manager_caches: + f2fs_destroy_node_manager_caches(); +free_inodecache: + destroy_inodecache(); fail: return err; } static void __exit exit_f2fs_fs(void) { - f2fs_destroy_root_stats(); unregister_filesystem(&f2fs_fs_type); - destroy_checkpoint_caches(); - destroy_gc_caches(); - destroy_node_manager_caches(); + f2fs_destroy_xattr_cache(); + f2fs_destroy_casefold_cache(); + f2fs_destroy_compress_cache(); + f2fs_destroy_compress_mempool(); + f2fs_destroy_bioset(); + f2fs_destroy_bio_entry_cache(); + f2fs_destroy_iostat_processing(); + f2fs_destroy_post_read_processing(); + f2fs_destroy_root_stats(); + f2fs_exit_shrinker(); + f2fs_exit_sysfs(); + f2fs_destroy_garbage_collection_cache(); + f2fs_destroy_extent_cache(); + f2fs_destroy_recovery_cache(); + f2fs_destroy_checkpoint_caches(); + f2fs_destroy_segment_manager_caches(); + f2fs_destroy_node_manager_caches(); destroy_inodecache(); } diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c new file mode 100644 index 000000000000..c42f4f979d13 --- /dev/null +++ b/fs/f2fs/sysfs.c @@ -0,0 +1,2004 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * f2fs sysfs interface + * + * Copyright (c) 2012 Samsung Electronics Co., Ltd. 
+ * http://www.samsung.com/ + * Copyright (c) 2017 Chao Yu <chao@kernel.org> + */ +#include <linux/compiler.h> +#include <linux/proc_fs.h> +#include <linux/f2fs_fs.h> +#include <linux/seq_file.h> +#include <linux/unicode.h> +#include <linux/ioprio.h> +#include <linux/sysfs.h> + +#include "f2fs.h" +#include "segment.h" +#include "gc.h" +#include "iostat.h" +#include <trace/events/f2fs.h> + +static struct proc_dir_entry *f2fs_proc_root; + +/* Sysfs support for f2fs */ +enum { + GC_THREAD, /* struct f2fs_gc_thread */ + SM_INFO, /* struct f2fs_sm_info */ + DCC_INFO, /* struct discard_cmd_control */ + NM_INFO, /* struct f2fs_nm_info */ + F2FS_SBI, /* struct f2fs_sb_info */ +#ifdef CONFIG_F2FS_STAT_FS + STAT_INFO, /* struct f2fs_stat_info */ +#endif +#ifdef CONFIG_F2FS_FAULT_INJECTION + FAULT_INFO_RATE, /* struct f2fs_fault_info */ + FAULT_INFO_TYPE, /* struct f2fs_fault_info */ +#endif + RESERVED_BLOCKS, /* struct f2fs_sb_info */ + CPRC_INFO, /* struct ckpt_req_control */ + ATGC_INFO, /* struct atgc_management */ +}; + +static const char *gc_mode_names[MAX_GC_MODE] = { + "GC_NORMAL", + "GC_IDLE_CB", + "GC_IDLE_GREEDY", + "GC_IDLE_AT", + "GC_URGENT_HIGH", + "GC_URGENT_LOW", + "GC_URGENT_MID" +}; + +struct f2fs_attr { + struct attribute attr; + ssize_t (*show)(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf); + ssize_t (*store)(struct f2fs_attr *a, struct f2fs_sb_info *sbi, + const char *buf, size_t len); + int struct_type; + int offset; + int id; +}; + +struct f2fs_base_attr { + struct attribute attr; + ssize_t (*show)(struct f2fs_base_attr *a, char *buf); + ssize_t (*store)(struct f2fs_base_attr *a, const char *buf, size_t len); +}; + +static ssize_t f2fs_sbi_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf); + +static unsigned char *__struct_ptr(struct f2fs_sb_info *sbi, int struct_type) +{ + if (struct_type == GC_THREAD) + return (unsigned char *)sbi->gc_thread; + else if (struct_type == SM_INFO) + return (unsigned char *)SM_I(sbi); + else if 
(struct_type == DCC_INFO) + return (unsigned char *)SM_I(sbi)->dcc_info; + else if (struct_type == NM_INFO) + return (unsigned char *)NM_I(sbi); + else if (struct_type == F2FS_SBI || struct_type == RESERVED_BLOCKS) + return (unsigned char *)sbi; +#ifdef CONFIG_F2FS_FAULT_INJECTION + else if (struct_type == FAULT_INFO_RATE || + struct_type == FAULT_INFO_TYPE) + return (unsigned char *)&F2FS_OPTION(sbi).fault_info; +#endif +#ifdef CONFIG_F2FS_STAT_FS + else if (struct_type == STAT_INFO) + return (unsigned char *)F2FS_STAT(sbi); +#endif + else if (struct_type == CPRC_INFO) + return (unsigned char *)&sbi->cprc_info; + else if (struct_type == ATGC_INFO) + return (unsigned char *)&sbi->am; + return NULL; +} + +static ssize_t dirty_segments_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + return sysfs_emit(buf, "%llu\n", + (unsigned long long)(dirty_segments(sbi))); +} + +static ssize_t free_segments_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + return sysfs_emit(buf, "%llu\n", + (unsigned long long)(free_segments(sbi))); +} + +static ssize_t ovp_segments_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + return sysfs_emit(buf, "%llu\n", + (unsigned long long)(overprovision_segments(sbi))); +} + +static ssize_t lifetime_write_kbytes_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + return sysfs_emit(buf, "%llu\n", + (unsigned long long)(sbi->kbytes_written + + ((f2fs_get_sectors_written(sbi) - + sbi->sectors_written_start) >> 1))); +} + +static ssize_t sb_status_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + return sysfs_emit(buf, "%lx\n", sbi->s_flag); +} + +static ssize_t cp_status_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + return sysfs_emit(buf, "%x\n", le32_to_cpu(F2FS_CKPT(sbi)->ckpt_flags)); +} + +static ssize_t pending_discard_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + if (!SM_I(sbi)->dcc_info) + return 
-EINVAL; + return sysfs_emit(buf, "%llu\n", (unsigned long long)atomic_read( + &SM_I(sbi)->dcc_info->discard_cmd_cnt)); +} + +static ssize_t issued_discard_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + if (!SM_I(sbi)->dcc_info) + return -EINVAL; + return sysfs_emit(buf, "%llu\n", (unsigned long long)atomic_read( + &SM_I(sbi)->dcc_info->issued_discard)); +} + +static ssize_t queued_discard_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + if (!SM_I(sbi)->dcc_info) + return -EINVAL; + return sysfs_emit(buf, "%llu\n", (unsigned long long)atomic_read( + &SM_I(sbi)->dcc_info->queued_discard)); +} + +static ssize_t undiscard_blks_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + if (!SM_I(sbi)->dcc_info) + return -EINVAL; + return sysfs_emit(buf, "%u\n", + SM_I(sbi)->dcc_info->undiscard_blks); +} + +static ssize_t atgc_enabled_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + return sysfs_emit(buf, "%d\n", sbi->am.atgc_enabled ? 1 : 0); +} + +static ssize_t gc_mode_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + return sysfs_emit(buf, "%s\n", gc_mode_names[sbi->gc_mode]); +} + +static ssize_t features_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + int len = 0; + + if (f2fs_sb_has_encrypt(sbi)) + len += sysfs_emit_at(buf, len, "%s", + "encryption"); + if (f2fs_sb_has_blkzoned(sbi)) + len += sysfs_emit_at(buf, len, "%s%s", + len ? ", " : "", "blkzoned"); + if (f2fs_sb_has_extra_attr(sbi)) + len += sysfs_emit_at(buf, len, "%s%s", + len ? ", " : "", "extra_attr"); + if (f2fs_sb_has_project_quota(sbi)) + len += sysfs_emit_at(buf, len, "%s%s", + len ? ", " : "", "projquota"); + if (f2fs_sb_has_inode_chksum(sbi)) + len += sysfs_emit_at(buf, len, "%s%s", + len ? ", " : "", "inode_checksum"); + if (f2fs_sb_has_flexible_inline_xattr(sbi)) + len += sysfs_emit_at(buf, len, "%s%s", + len ? 
", " : "", "flexible_inline_xattr"); + if (f2fs_sb_has_quota_ino(sbi)) + len += sysfs_emit_at(buf, len, "%s%s", + len ? ", " : "", "quota_ino"); + if (f2fs_sb_has_inode_crtime(sbi)) + len += sysfs_emit_at(buf, len, "%s%s", + len ? ", " : "", "inode_crtime"); + if (f2fs_sb_has_lost_found(sbi)) + len += sysfs_emit_at(buf, len, "%s%s", + len ? ", " : "", "lost_found"); + if (f2fs_sb_has_verity(sbi)) + len += sysfs_emit_at(buf, len, "%s%s", + len ? ", " : "", "verity"); + if (f2fs_sb_has_sb_chksum(sbi)) + len += sysfs_emit_at(buf, len, "%s%s", + len ? ", " : "", "sb_checksum"); + if (f2fs_sb_has_casefold(sbi)) + len += sysfs_emit_at(buf, len, "%s%s", + len ? ", " : "", "casefold"); + if (f2fs_sb_has_readonly(sbi)) + len += sysfs_emit_at(buf, len, "%s%s", + len ? ", " : "", "readonly"); + if (f2fs_sb_has_compression(sbi)) + len += sysfs_emit_at(buf, len, "%s%s", + len ? ", " : "", "compression"); + if (f2fs_sb_has_packed_ssa(sbi)) + len += sysfs_emit_at(buf, len, "%s%s", + len ? ", " : "", "packed_ssa"); + len += sysfs_emit_at(buf, len, "%s%s", + len ? 
", " : "", "pin_file"); + len += sysfs_emit_at(buf, len, "\n"); + return len; +} + +static ssize_t current_reserved_blocks_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + return sysfs_emit(buf, "%u\n", sbi->current_reserved_blocks); +} + +static ssize_t unusable_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + block_t unusable; + + if (test_opt(sbi, DISABLE_CHECKPOINT)) + unusable = sbi->unusable_block_count; + else + unusable = f2fs_get_unusable_blocks(sbi); + return sysfs_emit(buf, "%llu\n", (unsigned long long)unusable); +} + +static ssize_t encoding_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ +#if IS_ENABLED(CONFIG_UNICODE) + struct super_block *sb = sbi->sb; + + if (f2fs_sb_has_casefold(sbi)) + return sysfs_emit(buf, "UTF-8 (%d.%d.%d)\n", + (sb->s_encoding->version >> 16) & 0xff, + (sb->s_encoding->version >> 8) & 0xff, + sb->s_encoding->version & 0xff); +#endif + return sysfs_emit(buf, "(none)\n"); +} + +static ssize_t encoding_flags_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + return sysfs_emit(buf, "%x\n", + le16_to_cpu(F2FS_RAW_SUPER(sbi)->s_encoding_flags)); +} + +static ssize_t effective_lookup_mode_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + switch (F2FS_OPTION(sbi).lookup_mode) { + case LOOKUP_PERF: + return sysfs_emit(buf, "perf\n"); + case LOOKUP_COMPAT: + return sysfs_emit(buf, "compat\n"); + case LOOKUP_AUTO: + if (sb_no_casefold_compat_fallback(sbi->sb)) + return sysfs_emit(buf, "auto:perf\n"); + return sysfs_emit(buf, "auto:compat\n"); + } + return 0; +} + +static ssize_t mounted_time_sec_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + return sysfs_emit(buf, "%llu\n", SIT_I(sbi)->mounted_time); +} + +#ifdef CONFIG_F2FS_STAT_FS +static ssize_t moved_blocks_foreground_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + struct f2fs_stat_info *si = F2FS_STAT(sbi); + + return sysfs_emit(buf, 
"%llu\n", + (unsigned long long)(si->tot_blks - + (si->bg_data_blks + si->bg_node_blks))); +} + +static ssize_t moved_blocks_background_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + struct f2fs_stat_info *si = F2FS_STAT(sbi); + + return sysfs_emit(buf, "%llu\n", + (unsigned long long)(si->bg_data_blks + si->bg_node_blks)); +} + +static ssize_t avg_vblocks_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + struct f2fs_stat_info *si = F2FS_STAT(sbi); + + si->dirty_count = dirty_segments(sbi); + f2fs_update_sit_info(sbi); + return sysfs_emit(buf, "%llu\n", (unsigned long long)(si->avg_vblocks)); +} +#endif + +static ssize_t main_blkaddr_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + return sysfs_emit(buf, "%llu\n", + (unsigned long long)MAIN_BLKADDR(sbi)); +} + +static ssize_t f2fs_sbi_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + unsigned char *ptr = NULL; + unsigned int *ui; + + ptr = __struct_ptr(sbi, a->struct_type); + if (!ptr) + return -EINVAL; + + if (!strcmp(a->attr.name, "extension_list")) { + __u8 (*extlist)[F2FS_EXTENSION_LEN] = + sbi->raw_super->extension_list; + int cold_count = le32_to_cpu(sbi->raw_super->extension_count); + int hot_count = sbi->raw_super->hot_ext_count; + int len = 0, i; + + len += sysfs_emit_at(buf, len, "cold file extension:\n"); + for (i = 0; i < cold_count; i++) + len += sysfs_emit_at(buf, len, "%s\n", extlist[i]); + + len += sysfs_emit_at(buf, len, "hot file extension:\n"); + for (i = cold_count; i < cold_count + hot_count; i++) + len += sysfs_emit_at(buf, len, "%s\n", extlist[i]); + + return len; + } + + if (!strcmp(a->attr.name, "ckpt_thread_ioprio")) { + struct ckpt_req_control *cprc = &sbi->cprc_info; + int class = IOPRIO_PRIO_CLASS(cprc->ckpt_thread_ioprio); + int level = IOPRIO_PRIO_LEVEL(cprc->ckpt_thread_ioprio); + + if (class != IOPRIO_CLASS_RT && class != IOPRIO_CLASS_BE) + return -EINVAL; + + return sysfs_emit(buf, "%s,%d\n", + 
class == IOPRIO_CLASS_RT ? "rt" : "be", level); + } + +#ifdef CONFIG_F2FS_FS_COMPRESSION + if (!strcmp(a->attr.name, "compr_written_block")) + return sysfs_emit(buf, "%llu\n", sbi->compr_written_block); + + if (!strcmp(a->attr.name, "compr_saved_block")) + return sysfs_emit(buf, "%llu\n", sbi->compr_saved_block); + + if (!strcmp(a->attr.name, "compr_new_inode")) + return sysfs_emit(buf, "%u\n", sbi->compr_new_inode); +#endif + + if (!strcmp(a->attr.name, "gc_segment_mode")) + return sysfs_emit(buf, "%u\n", sbi->gc_segment_mode); + + if (!strcmp(a->attr.name, "gc_reclaimed_segments")) { + return sysfs_emit(buf, "%u\n", + sbi->gc_reclaimed_segs[sbi->gc_segment_mode]); + } + + if (!strcmp(a->attr.name, "current_atomic_write")) { + s64 current_write = atomic64_read(&sbi->current_atomic_write); + + return sysfs_emit(buf, "%lld\n", current_write); + } + + if (!strcmp(a->attr.name, "peak_atomic_write")) + return sysfs_emit(buf, "%lld\n", sbi->peak_atomic_write); + + if (!strcmp(a->attr.name, "committed_atomic_block")) + return sysfs_emit(buf, "%llu\n", sbi->committed_atomic_block); + + if (!strcmp(a->attr.name, "revoked_atomic_block")) + return sysfs_emit(buf, "%llu\n", sbi->revoked_atomic_block); + +#ifdef CONFIG_F2FS_STAT_FS + if (!strcmp(a->attr.name, "cp_foreground_calls")) + return sysfs_emit(buf, "%d\n", + atomic_read(&sbi->cp_call_count[TOTAL_CALL]) - + atomic_read(&sbi->cp_call_count[BACKGROUND])); + if (!strcmp(a->attr.name, "cp_background_calls")) + return sysfs_emit(buf, "%d\n", + atomic_read(&sbi->cp_call_count[BACKGROUND])); +#endif + + ui = (unsigned int *)(ptr + a->offset); + + return sysfs_emit(buf, "%u\n", *ui); +} + +static ssize_t __sbi_store(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, + const char *buf, size_t count) +{ + unsigned char *ptr; + unsigned long t; + unsigned int *ui; + ssize_t ret; + + ptr = __struct_ptr(sbi, a->struct_type); + if (!ptr) + return -EINVAL; + + if (!strcmp(a->attr.name, "extension_list")) { + const char *name = 
strim((char *)buf); + bool set = true, hot; + + if (!strncmp(name, "[h]", 3)) + hot = true; + else if (!strncmp(name, "[c]", 3)) + hot = false; + else + return -EINVAL; + + name += 3; + + if (*name == '!') { + name++; + set = false; + } + + if (!strlen(name) || strlen(name) >= F2FS_EXTENSION_LEN) + return -EINVAL; + + f2fs_down_write(&sbi->sb_lock); + + ret = f2fs_update_extension_list(sbi, name, hot, set); + if (ret) + goto out; + + ret = f2fs_commit_super(sbi, false); + if (ret) + f2fs_update_extension_list(sbi, name, hot, !set); +out: + f2fs_up_write(&sbi->sb_lock); + return ret ? ret : count; + } + + if (!strcmp(a->attr.name, "ckpt_thread_ioprio")) { + const char *name = strim((char *)buf); + struct ckpt_req_control *cprc = &sbi->cprc_info; + int class; + long level; + int ret; + + if (!strncmp(name, "rt,", 3)) + class = IOPRIO_CLASS_RT; + else if (!strncmp(name, "be,", 3)) + class = IOPRIO_CLASS_BE; + else + return -EINVAL; + + name += 3; + ret = kstrtol(name, 10, &level); + if (ret) + return ret; + if (level >= IOPRIO_NR_LEVELS || level < 0) + return -EINVAL; + + cprc->ckpt_thread_ioprio = IOPRIO_PRIO_VALUE(class, level); + if (test_opt(sbi, MERGE_CHECKPOINT)) { + ret = set_task_ioprio(cprc->f2fs_issue_ckpt, + cprc->ckpt_thread_ioprio); + if (ret) + return ret; + } + + return count; + } + + ui = (unsigned int *)(ptr + a->offset); + + ret = kstrtoul(skip_spaces(buf), 0, &t); + if (ret < 0) + return ret; +#ifdef CONFIG_F2FS_FAULT_INJECTION + if (a->struct_type == FAULT_INFO_TYPE) { + if (f2fs_build_fault_attr(sbi, 0, t, FAULT_TYPE)) + return -EINVAL; + return count; + } + if (a->struct_type == FAULT_INFO_RATE) { + if (f2fs_build_fault_attr(sbi, t, 0, FAULT_RATE)) + return -EINVAL; + return count; + } +#endif + if (a->struct_type == RESERVED_BLOCKS) { + spin_lock(&sbi->stat_lock); + if (t > (unsigned long)(sbi->user_block_count - + F2FS_OPTION(sbi).root_reserved_blocks)) { + spin_unlock(&sbi->stat_lock); + return -EINVAL; + } + *ui = t; + 
sbi->current_reserved_blocks = min(sbi->reserved_blocks, + sbi->user_block_count - valid_user_blocks(sbi)); + spin_unlock(&sbi->stat_lock); + return count; + } + + if (!strcmp(a->attr.name, "discard_io_aware_gran")) { + if (t > MAX_PLIST_NUM) + return -EINVAL; + if (!f2fs_block_unit_discard(sbi)) + return -EINVAL; + if (t == *ui) + return count; + *ui = t; + return count; + } + + if (!strcmp(a->attr.name, "discard_granularity")) { + if (t == 0 || t > MAX_PLIST_NUM) + return -EINVAL; + if (!f2fs_block_unit_discard(sbi)) + return -EINVAL; + if (t == *ui) + return count; + *ui = t; + return count; + } + + if (!strcmp(a->attr.name, "max_ordered_discard")) { + if (t == 0 || t > MAX_PLIST_NUM) + return -EINVAL; + if (!f2fs_block_unit_discard(sbi)) + return -EINVAL; + *ui = t; + return count; + } + + if (!strcmp(a->attr.name, "discard_urgent_util")) { + if (t > 100) + return -EINVAL; + *ui = t; + return count; + } + + if (!strcmp(a->attr.name, "discard_io_aware")) { + if (t >= DPOLICY_IO_AWARE_MAX) + return -EINVAL; + *ui = t; + return count; + } + + if (!strcmp(a->attr.name, "migration_granularity")) { + if (t == 0 || t > SEGS_PER_SEC(sbi)) + return -EINVAL; + } + + if (!strcmp(a->attr.name, "migration_window_granularity")) { + if (t == 0 || t > SEGS_PER_SEC(sbi)) + return -EINVAL; + } + + if (!strcmp(a->attr.name, "gc_urgent")) { + if (t == 0) { + sbi->gc_mode = GC_NORMAL; + } else if (t == 1) { + sbi->gc_mode = GC_URGENT_HIGH; + if (sbi->gc_thread) { + sbi->gc_thread->gc_wake = true; + wake_up_interruptible_all( + &sbi->gc_thread->gc_wait_queue_head); + wake_up_discard_thread(sbi, true); + } + } else if (t == 2) { + sbi->gc_mode = GC_URGENT_LOW; + } else if (t == 3) { + sbi->gc_mode = GC_URGENT_MID; + if (sbi->gc_thread) { + sbi->gc_thread->gc_wake = true; + wake_up_interruptible_all( + &sbi->gc_thread->gc_wait_queue_head); + } + } else { + return -EINVAL; + } + return count; + } + if (!strcmp(a->attr.name, "gc_idle")) { + if (t == GC_IDLE_CB) { + sbi->gc_mode = 
GC_IDLE_CB; + } else if (t == GC_IDLE_GREEDY) { + sbi->gc_mode = GC_IDLE_GREEDY; + } else if (t == GC_IDLE_AT) { + if (!sbi->am.atgc_enabled) + return -EINVAL; + sbi->gc_mode = GC_IDLE_AT; + } else { + sbi->gc_mode = GC_NORMAL; + } + return count; + } + + if (!strcmp(a->attr.name, "gc_remaining_trials")) { + spin_lock(&sbi->gc_remaining_trials_lock); + sbi->gc_remaining_trials = t; + spin_unlock(&sbi->gc_remaining_trials_lock); + + return count; + } + + if (!strcmp(a->attr.name, "gc_no_zoned_gc_percent")) { + if (t > 100) + return -EINVAL; + *ui = (unsigned int)t; + return count; + } + + if (!strcmp(a->attr.name, "gc_boost_zoned_gc_percent")) { + if (t > 100) + return -EINVAL; + *ui = (unsigned int)t; + return count; + } + + if (!strcmp(a->attr.name, "gc_valid_thresh_ratio")) { + if (t > 100) + return -EINVAL; + *ui = (unsigned int)t; + return count; + } + +#ifdef CONFIG_F2FS_IOSTAT + if (!strcmp(a->attr.name, "iostat_enable")) { + sbi->iostat_enable = !!t; + if (!sbi->iostat_enable) + f2fs_reset_iostat(sbi); + return count; + } + + if (!strcmp(a->attr.name, "iostat_period_ms")) { + if (t < MIN_IOSTAT_PERIOD_MS || t > MAX_IOSTAT_PERIOD_MS) + return -EINVAL; + spin_lock_irq(&sbi->iostat_lock); + sbi->iostat_period_ms = (unsigned int)t; + spin_unlock_irq(&sbi->iostat_lock); + return count; + } +#endif + +#ifdef CONFIG_BLK_DEV_ZONED + if (!strcmp(a->attr.name, "blkzone_alloc_policy")) { + if (t < BLKZONE_ALLOC_PRIOR_SEQ || t > BLKZONE_ALLOC_PRIOR_CONV) + return -EINVAL; + sbi->blkzone_alloc_policy = t; + return count; + } +#endif + +#ifdef CONFIG_F2FS_FS_COMPRESSION + if (!strcmp(a->attr.name, "compr_written_block") || + !strcmp(a->attr.name, "compr_saved_block")) { + if (t != 0) + return -EINVAL; + sbi->compr_written_block = 0; + sbi->compr_saved_block = 0; + return count; + } + + if (!strcmp(a->attr.name, "compr_new_inode")) { + if (t != 0) + return -EINVAL; + sbi->compr_new_inode = 0; + return count; + } + + if (!strcmp(a->attr.name, "compress_percent")) { + if (t 
== 0 || t > 100) + return -EINVAL; + *ui = t; + return count; + } + + if (!strcmp(a->attr.name, "compress_watermark")) { + if (t == 0 || t > 100) + return -EINVAL; + *ui = t; + return count; + } +#endif + + if (!strcmp(a->attr.name, "atgc_candidate_ratio")) { + if (t > 100) + return -EINVAL; + sbi->am.candidate_ratio = t; + return count; + } + + if (!strcmp(a->attr.name, "atgc_age_weight")) { + if (t > 100) + return -EINVAL; + sbi->am.age_weight = t; + return count; + } + + if (!strcmp(a->attr.name, "gc_segment_mode")) { + if (t < MAX_GC_MODE) + sbi->gc_segment_mode = t; + else + return -EINVAL; + return count; + } + + if (!strcmp(a->attr.name, "gc_pin_file_threshold")) { + if (t > MAX_GC_FAILED_PINNED_FILES) + return -EINVAL; + sbi->gc_pin_file_threshold = t; + return count; + } + + if (!strcmp(a->attr.name, "gc_reclaimed_segments")) { + if (t != 0) + return -EINVAL; + sbi->gc_reclaimed_segs[sbi->gc_segment_mode] = 0; + return count; + } + + if (!strcmp(a->attr.name, "seq_file_ra_mul")) { + if (t >= MIN_RA_MUL && t <= MAX_RA_MUL) + sbi->seq_file_ra_mul = t; + else + return -EINVAL; + return count; + } + + if (!strcmp(a->attr.name, "max_fragment_chunk")) { + if (t >= MIN_FRAGMENT_SIZE && t <= MAX_FRAGMENT_SIZE) + sbi->max_fragment_chunk = t; + else + return -EINVAL; + return count; + } + + if (!strcmp(a->attr.name, "max_fragment_hole")) { + if (t >= MIN_FRAGMENT_SIZE && t <= MAX_FRAGMENT_SIZE) + sbi->max_fragment_hole = t; + else + return -EINVAL; + return count; + } + + if (!strcmp(a->attr.name, "peak_atomic_write")) { + if (t != 0) + return -EINVAL; + sbi->peak_atomic_write = 0; + return count; + } + + if (!strcmp(a->attr.name, "committed_atomic_block")) { + if (t != 0) + return -EINVAL; + sbi->committed_atomic_block = 0; + return count; + } + + if (!strcmp(a->attr.name, "revoked_atomic_block")) { + if (t != 0) + return -EINVAL; + sbi->revoked_atomic_block = 0; + return count; + } + + if (!strcmp(a->attr.name, "readdir_ra")) { + sbi->readdir_ra = !!t; + return 
count; + } + + if (!strcmp(a->attr.name, "hot_data_age_threshold")) { + if (t == 0 || t >= sbi->warm_data_age_threshold) + return -EINVAL; + if (t == *ui) + return count; + *ui = (unsigned int)t; + return count; + } + + if (!strcmp(a->attr.name, "warm_data_age_threshold")) { + if (t <= sbi->hot_data_age_threshold) + return -EINVAL; + if (t == *ui) + return count; + *ui = (unsigned int)t; + return count; + } + + if (!strcmp(a->attr.name, "last_age_weight")) { + if (t > 100) + return -EINVAL; + if (t == *ui) + return count; + *ui = (unsigned int)t; + return count; + } + + if (!strcmp(a->attr.name, "max_read_extent_count")) { + if (t > UINT_MAX) + return -EINVAL; + *ui = (unsigned int)t; + return count; + } + + if (!strcmp(a->attr.name, "ipu_policy")) { + if (t >= BIT(F2FS_IPU_MAX)) + return -EINVAL; + /* allow F2FS_IPU_NOCACHE only for IPU in the pinned file */ + if (f2fs_lfs_mode(sbi) && (t & ~BIT(F2FS_IPU_NOCACHE))) + return -EINVAL; + SM_I(sbi)->ipu_policy = (unsigned int)t; + return count; + } + + if (!strcmp(a->attr.name, "dir_level")) { + if (t > MAX_DIR_HASH_DEPTH) + return -EINVAL; + sbi->dir_level = t; + return count; + } + + if (!strcmp(a->attr.name, "reserved_pin_section")) { + if (t > GET_SEC_FROM_SEG(sbi, overprovision_segments(sbi))) + return -EINVAL; + *ui = (unsigned int)t; + return count; + } + + if (!strcmp(a->attr.name, "gc_boost_gc_multiple")) { + if (t < 1 || t > SEGS_PER_SEC(sbi)) + return -EINVAL; + sbi->gc_thread->boost_gc_multiple = (unsigned int)t; + return count; + } + + if (!strcmp(a->attr.name, "gc_boost_gc_greedy")) { + if (t > GC_GREEDY) + return -EINVAL; + sbi->gc_thread->boost_gc_greedy = (unsigned int)t; + return count; + } + + if (!strcmp(a->attr.name, "bggc_io_aware")) { + if (t < AWARE_ALL_IO || t > AWARE_NONE) + return -EINVAL; + sbi->bggc_io_aware = t; + return count; + } + + if (!strcmp(a->attr.name, "allocate_section_hint")) { + if (t < 0 || t > MAIN_SECS(sbi)) + return -EINVAL; + sbi->allocate_section_hint = t; + return 
count; + } + + if (!strcmp(a->attr.name, "allocate_section_policy")) { + if (t < ALLOCATE_FORWARD_NOHINT || t > ALLOCATE_FORWARD_FROM_HINT) + return -EINVAL; + sbi->allocate_section_policy = t; + return count; + } + + *ui = (unsigned int)t; + + return count; +} + +static ssize_t f2fs_sbi_store(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, + const char *buf, size_t count) +{ + ssize_t ret; + bool gc_entry = (!strcmp(a->attr.name, "gc_urgent") || + a->struct_type == GC_THREAD); + + if (gc_entry) { + if (!down_read_trylock(&sbi->sb->s_umount)) + return -EAGAIN; + } + ret = __sbi_store(a, sbi, buf, count); + if (gc_entry) + up_read(&sbi->sb->s_umount); + + return ret; +} + +static ssize_t f2fs_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info, + s_kobj); + struct f2fs_attr *a = container_of(attr, struct f2fs_attr, attr); + + return a->show ? a->show(a, sbi, buf) : 0; +} + +static ssize_t f2fs_attr_store(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t len) +{ + struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info, + s_kobj); + struct f2fs_attr *a = container_of(attr, struct f2fs_attr, attr); + + return a->store ? a->store(a, sbi, buf, len) : 0; +} + +static void f2fs_sb_release(struct kobject *kobj) +{ + struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info, + s_kobj); + complete(&sbi->s_kobj_unregister); +} + +static ssize_t f2fs_base_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct f2fs_base_attr *a = container_of(attr, + struct f2fs_base_attr, attr); + + return a->show ? a->show(a, buf) : 0; +} + +static ssize_t f2fs_base_attr_store(struct kobject *kobj, + struct attribute *attr, + const char *buf, size_t len) +{ + struct f2fs_base_attr *a = container_of(attr, + struct f2fs_base_attr, attr); + + return a->store ? 
a->store(a, buf, len) : 0; +} + +/* + * Note that there are three feature list entries: + * 1) /sys/fs/f2fs/features + * : shows runtime features supported by in-kernel f2fs along with Kconfig. + * - ref. F2FS_FEATURE_RO_ATTR() + * + * 2) /sys/fs/f2fs/$s_id/features <deprecated> + * : shows on-disk features enabled by mkfs.f2fs, used for old kernels. This + * won't add new feature anymore, and thus, users should check entries in 3) + * instead of this 2). + * + * 3) /sys/fs/f2fs/$s_id/feature_list + * : shows on-disk features enabled by mkfs.f2fs per instance, which follows + * sysfs entry rule where each entry should expose single value. + * This list covers old feature list provided by 2) and beyond. Therefore, + * please add new on-disk feature in this list only. + * - ref. F2FS_SB_FEATURE_RO_ATTR() + */ +static ssize_t f2fs_feature_show(struct f2fs_base_attr *a, char *buf) +{ + return sysfs_emit(buf, "supported\n"); +} + +#define F2FS_FEATURE_RO_ATTR(_name) \ +static struct f2fs_base_attr f2fs_base_attr_##_name = { \ + .attr = {.name = __stringify(_name), .mode = 0444 }, \ + .show = f2fs_feature_show, \ +} + +static ssize_t f2fs_tune_show(struct f2fs_base_attr *a, char *buf) +{ + unsigned int res = 0; + + if (!strcmp(a->attr.name, "reclaim_caches_kb")) + res = f2fs_donate_files(); + + return sysfs_emit(buf, "%u\n", res); +} + +static ssize_t f2fs_tune_store(struct f2fs_base_attr *a, + const char *buf, size_t count) +{ + unsigned long t; + int ret; + + ret = kstrtoul(skip_spaces(buf), 0, &t); + if (ret) + return ret; + + if (!strcmp(a->attr.name, "reclaim_caches_kb")) + f2fs_reclaim_caches(t); + + return count; +} + +#define F2FS_TUNE_RW_ATTR(_name) \ +static struct f2fs_base_attr f2fs_base_attr_##_name = { \ + .attr = {.name = __stringify(_name), .mode = 0644 }, \ + .show = f2fs_tune_show, \ + .store = f2fs_tune_store, \ +} + +static ssize_t f2fs_sb_feature_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + if (F2FS_HAS_FEATURE(sbi, a->id)) + 
return sysfs_emit(buf, "supported\n"); + return sysfs_emit(buf, "unsupported\n"); +} + +#define F2FS_SB_FEATURE_RO_ATTR(_name, _feat) \ +static struct f2fs_attr f2fs_attr_sb_##_name = { \ + .attr = {.name = __stringify(_name), .mode = 0444 }, \ + .show = f2fs_sb_feature_show, \ + .id = F2FS_FEATURE_##_feat, \ +} + +#define F2FS_ATTR_OFFSET(_struct_type, _name, _mode, _show, _store, _offset) \ +static struct f2fs_attr f2fs_attr_##_name = { \ + .attr = {.name = __stringify(_name), .mode = _mode }, \ + .show = _show, \ + .store = _store, \ + .struct_type = _struct_type, \ + .offset = _offset \ +} + +#define F2FS_RO_ATTR(struct_type, struct_name, name, elname) \ + F2FS_ATTR_OFFSET(struct_type, name, 0444, \ + f2fs_sbi_show, NULL, \ + offsetof(struct struct_name, elname)) + +#define F2FS_RW_ATTR(struct_type, struct_name, name, elname) \ + F2FS_ATTR_OFFSET(struct_type, name, 0644, \ + f2fs_sbi_show, f2fs_sbi_store, \ + offsetof(struct struct_name, elname)) + +#define F2FS_GENERAL_RO_ATTR(name) \ +static struct f2fs_attr f2fs_attr_##name = __ATTR(name, 0444, name##_show, NULL) + +#ifdef CONFIG_F2FS_STAT_FS +#define STAT_INFO_RO_ATTR(name, elname) \ + F2FS_RO_ATTR(STAT_INFO, f2fs_stat_info, name, elname) +#endif + +#define GC_THREAD_RW_ATTR(name, elname) \ + F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, name, elname) + +#define SM_INFO_RW_ATTR(name, elname) \ + F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, name, elname) + +#define SM_INFO_GENERAL_RW_ATTR(elname) \ + SM_INFO_RW_ATTR(elname, elname) + +#define DCC_INFO_RW_ATTR(name, elname) \ + F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, name, elname) + +#define DCC_INFO_GENERAL_RW_ATTR(elname) \ + DCC_INFO_RW_ATTR(elname, elname) + +#define NM_INFO_RW_ATTR(name, elname) \ + F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, name, elname) + +#define NM_INFO_GENERAL_RW_ATTR(elname) \ + NM_INFO_RW_ATTR(elname, elname) + +#define F2FS_SBI_RW_ATTR(name, elname) \ + F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, name, elname) + +#define F2FS_SBI_GENERAL_RW_ATTR(elname) 
\ + F2FS_SBI_RW_ATTR(elname, elname) + +#define F2FS_SBI_GENERAL_RO_ATTR(elname) \ + F2FS_RO_ATTR(F2FS_SBI, f2fs_sb_info, elname, elname) + +#ifdef CONFIG_F2FS_FAULT_INJECTION +#define FAULT_INFO_GENERAL_RW_ATTR(type, elname) \ + F2FS_RW_ATTR(type, f2fs_fault_info, elname, elname) +#endif + +#define RESERVED_BLOCKS_GENERAL_RW_ATTR(elname) \ + F2FS_RW_ATTR(RESERVED_BLOCKS, f2fs_sb_info, elname, elname) + +#define CPRC_INFO_GENERAL_RW_ATTR(elname) \ + F2FS_RW_ATTR(CPRC_INFO, ckpt_req_control, elname, elname) + +#define ATGC_INFO_RW_ATTR(name, elname) \ + F2FS_RW_ATTR(ATGC_INFO, atgc_management, name, elname) + +/* GC_THREAD ATTR */ +GC_THREAD_RW_ATTR(gc_urgent_sleep_time, urgent_sleep_time); +GC_THREAD_RW_ATTR(gc_min_sleep_time, min_sleep_time); +GC_THREAD_RW_ATTR(gc_max_sleep_time, max_sleep_time); +GC_THREAD_RW_ATTR(gc_no_gc_sleep_time, no_gc_sleep_time); +GC_THREAD_RW_ATTR(gc_no_zoned_gc_percent, no_zoned_gc_percent); +GC_THREAD_RW_ATTR(gc_boost_zoned_gc_percent, boost_zoned_gc_percent); +GC_THREAD_RW_ATTR(gc_valid_thresh_ratio, valid_thresh_ratio); +GC_THREAD_RW_ATTR(gc_boost_gc_multiple, boost_gc_multiple); +GC_THREAD_RW_ATTR(gc_boost_gc_greedy, boost_gc_greedy); + +/* SM_INFO ATTR */ +SM_INFO_RW_ATTR(reclaim_segments, rec_prefree_segments); +SM_INFO_GENERAL_RW_ATTR(ipu_policy); +SM_INFO_GENERAL_RW_ATTR(min_ipu_util); +SM_INFO_GENERAL_RW_ATTR(min_fsync_blocks); +SM_INFO_GENERAL_RW_ATTR(min_seq_blocks); +SM_INFO_GENERAL_RW_ATTR(min_hot_blocks); +SM_INFO_GENERAL_RW_ATTR(min_ssr_sections); +SM_INFO_GENERAL_RW_ATTR(reserved_segments); + +/* DCC_INFO ATTR */ +DCC_INFO_RW_ATTR(max_small_discards, max_discards); +DCC_INFO_GENERAL_RW_ATTR(max_discard_request); +DCC_INFO_GENERAL_RW_ATTR(min_discard_issue_time); +DCC_INFO_GENERAL_RW_ATTR(mid_discard_issue_time); +DCC_INFO_GENERAL_RW_ATTR(max_discard_issue_time); +DCC_INFO_GENERAL_RW_ATTR(discard_io_aware_gran); +DCC_INFO_GENERAL_RW_ATTR(discard_urgent_util); +DCC_INFO_GENERAL_RW_ATTR(discard_granularity); 
+DCC_INFO_GENERAL_RW_ATTR(max_ordered_discard); +DCC_INFO_GENERAL_RW_ATTR(discard_io_aware); + +/* NM_INFO ATTR */ +NM_INFO_RW_ATTR(max_roll_forward_node_blocks, max_rf_node_blocks); +NM_INFO_GENERAL_RW_ATTR(ram_thresh); +NM_INFO_GENERAL_RW_ATTR(ra_nid_pages); +NM_INFO_GENERAL_RW_ATTR(dirty_nats_ratio); + +/* F2FS_SBI ATTR */ +F2FS_RW_ATTR(F2FS_SBI, f2fs_super_block, extension_list, extension_list); +F2FS_SBI_RW_ATTR(gc_idle, gc_mode); +F2FS_SBI_RW_ATTR(gc_urgent, gc_mode); +F2FS_SBI_RW_ATTR(cp_interval, interval_time[CP_TIME]); +F2FS_SBI_RW_ATTR(idle_interval, interval_time[REQ_TIME]); +F2FS_SBI_RW_ATTR(discard_idle_interval, interval_time[DISCARD_TIME]); +F2FS_SBI_RW_ATTR(gc_idle_interval, interval_time[GC_TIME]); +F2FS_SBI_RW_ATTR(umount_discard_timeout, interval_time[UMOUNT_DISCARD_TIMEOUT]); +F2FS_SBI_RW_ATTR(gc_pin_file_thresh, gc_pin_file_threshold); +F2FS_SBI_RW_ATTR(gc_reclaimed_segments, gc_reclaimed_segs); +F2FS_SBI_GENERAL_RW_ATTR(max_victim_search); +F2FS_SBI_GENERAL_RW_ATTR(migration_granularity); +F2FS_SBI_GENERAL_RW_ATTR(migration_window_granularity); +F2FS_SBI_GENERAL_RW_ATTR(dir_level); +F2FS_SBI_GENERAL_RW_ATTR(allocate_section_hint); +F2FS_SBI_GENERAL_RW_ATTR(allocate_section_policy); +#ifdef CONFIG_F2FS_IOSTAT +F2FS_SBI_GENERAL_RW_ATTR(iostat_enable); +F2FS_SBI_GENERAL_RW_ATTR(iostat_period_ms); +#endif +F2FS_SBI_GENERAL_RW_ATTR(readdir_ra); +F2FS_SBI_GENERAL_RW_ATTR(max_io_bytes); +F2FS_SBI_GENERAL_RW_ATTR(data_io_flag); +F2FS_SBI_GENERAL_RW_ATTR(node_io_flag); +F2FS_SBI_GENERAL_RW_ATTR(gc_remaining_trials); +F2FS_SBI_GENERAL_RW_ATTR(seq_file_ra_mul); +F2FS_SBI_GENERAL_RW_ATTR(gc_segment_mode); +F2FS_SBI_GENERAL_RW_ATTR(max_fragment_chunk); +F2FS_SBI_GENERAL_RW_ATTR(max_fragment_hole); +#ifdef CONFIG_F2FS_FS_COMPRESSION +F2FS_SBI_GENERAL_RW_ATTR(compr_written_block); +F2FS_SBI_GENERAL_RW_ATTR(compr_saved_block); +F2FS_SBI_GENERAL_RW_ATTR(compr_new_inode); +F2FS_SBI_GENERAL_RW_ATTR(compress_percent); 
+F2FS_SBI_GENERAL_RW_ATTR(compress_watermark); +#endif +/* atomic write */ +F2FS_SBI_GENERAL_RO_ATTR(current_atomic_write); +F2FS_SBI_GENERAL_RW_ATTR(peak_atomic_write); +F2FS_SBI_GENERAL_RW_ATTR(committed_atomic_block); +F2FS_SBI_GENERAL_RW_ATTR(revoked_atomic_block); +/* block age extent cache */ +F2FS_SBI_GENERAL_RW_ATTR(hot_data_age_threshold); +F2FS_SBI_GENERAL_RW_ATTR(warm_data_age_threshold); +F2FS_SBI_GENERAL_RW_ATTR(last_age_weight); +/* read extent cache */ +F2FS_SBI_GENERAL_RW_ATTR(max_read_extent_count); +#ifdef CONFIG_BLK_DEV_ZONED +F2FS_SBI_GENERAL_RO_ATTR(unusable_blocks_per_sec); +F2FS_SBI_GENERAL_RO_ATTR(max_open_zones); +F2FS_SBI_GENERAL_RW_ATTR(blkzone_alloc_policy); +#endif +F2FS_SBI_GENERAL_RW_ATTR(carve_out); +F2FS_SBI_GENERAL_RW_ATTR(reserved_pin_section); +F2FS_SBI_GENERAL_RW_ATTR(bggc_io_aware); + +/* STAT_INFO ATTR */ +#ifdef CONFIG_F2FS_STAT_FS +STAT_INFO_RO_ATTR(cp_foreground_calls, cp_call_count[FOREGROUND]); +STAT_INFO_RO_ATTR(cp_background_calls, cp_call_count[BACKGROUND]); +STAT_INFO_RO_ATTR(gc_foreground_calls, gc_call_count[FOREGROUND]); +STAT_INFO_RO_ATTR(gc_background_calls, gc_call_count[BACKGROUND]); +#endif + +/* FAULT_INFO ATTR */ +#ifdef CONFIG_F2FS_FAULT_INJECTION +FAULT_INFO_GENERAL_RW_ATTR(FAULT_INFO_RATE, inject_rate); +FAULT_INFO_GENERAL_RW_ATTR(FAULT_INFO_TYPE, inject_type); +#endif + +/* RESERVED_BLOCKS ATTR */ +RESERVED_BLOCKS_GENERAL_RW_ATTR(reserved_blocks); + +/* CPRC_INFO ATTR */ +CPRC_INFO_GENERAL_RW_ATTR(ckpt_thread_ioprio); + +/* ATGC_INFO ATTR */ +ATGC_INFO_RW_ATTR(atgc_candidate_ratio, candidate_ratio); +ATGC_INFO_RW_ATTR(atgc_candidate_count, max_candidate_count); +ATGC_INFO_RW_ATTR(atgc_age_weight, age_weight); +ATGC_INFO_RW_ATTR(atgc_age_threshold, age_threshold); + +F2FS_GENERAL_RO_ATTR(dirty_segments); +F2FS_GENERAL_RO_ATTR(free_segments); +F2FS_GENERAL_RO_ATTR(ovp_segments); +F2FS_GENERAL_RO_ATTR(lifetime_write_kbytes); +F2FS_GENERAL_RO_ATTR(features); +F2FS_GENERAL_RO_ATTR(current_reserved_blocks); 
+F2FS_GENERAL_RO_ATTR(unusable); +F2FS_GENERAL_RO_ATTR(encoding); +F2FS_GENERAL_RO_ATTR(encoding_flags); +F2FS_GENERAL_RO_ATTR(effective_lookup_mode); +F2FS_GENERAL_RO_ATTR(mounted_time_sec); +F2FS_GENERAL_RO_ATTR(main_blkaddr); +F2FS_GENERAL_RO_ATTR(pending_discard); +F2FS_GENERAL_RO_ATTR(atgc_enabled); +F2FS_GENERAL_RO_ATTR(gc_mode); +#ifdef CONFIG_F2FS_STAT_FS +F2FS_GENERAL_RO_ATTR(moved_blocks_background); +F2FS_GENERAL_RO_ATTR(moved_blocks_foreground); +F2FS_GENERAL_RO_ATTR(avg_vblocks); +#endif + +#ifdef CONFIG_FS_ENCRYPTION +F2FS_FEATURE_RO_ATTR(encryption); +F2FS_FEATURE_RO_ATTR(test_dummy_encryption_v2); +#if IS_ENABLED(CONFIG_UNICODE) +F2FS_FEATURE_RO_ATTR(encrypted_casefold); +#endif +#endif /* CONFIG_FS_ENCRYPTION */ +#ifdef CONFIG_BLK_DEV_ZONED +F2FS_FEATURE_RO_ATTR(block_zoned); +#endif +F2FS_FEATURE_RO_ATTR(atomic_write); +F2FS_FEATURE_RO_ATTR(extra_attr); +F2FS_FEATURE_RO_ATTR(project_quota); +F2FS_FEATURE_RO_ATTR(inode_checksum); +F2FS_FEATURE_RO_ATTR(flexible_inline_xattr); +F2FS_FEATURE_RO_ATTR(quota_ino); +F2FS_FEATURE_RO_ATTR(inode_crtime); +F2FS_FEATURE_RO_ATTR(lost_found); +#ifdef CONFIG_FS_VERITY +F2FS_FEATURE_RO_ATTR(verity); +#endif +F2FS_FEATURE_RO_ATTR(sb_checksum); +#if IS_ENABLED(CONFIG_UNICODE) +F2FS_FEATURE_RO_ATTR(casefold); +#endif +F2FS_FEATURE_RO_ATTR(readonly); +#ifdef CONFIG_F2FS_FS_COMPRESSION +F2FS_FEATURE_RO_ATTR(compression); +#endif +F2FS_FEATURE_RO_ATTR(pin_file); +#ifdef CONFIG_UNICODE +F2FS_FEATURE_RO_ATTR(linear_lookup); +#endif +F2FS_FEATURE_RO_ATTR(packed_ssa); + +#define ATTR_LIST(name) (&f2fs_attr_##name.attr) +static struct attribute *f2fs_attrs[] = { + ATTR_LIST(gc_urgent_sleep_time), + ATTR_LIST(gc_min_sleep_time), + ATTR_LIST(gc_max_sleep_time), + ATTR_LIST(gc_no_gc_sleep_time), + ATTR_LIST(gc_no_zoned_gc_percent), + ATTR_LIST(gc_boost_zoned_gc_percent), + ATTR_LIST(gc_valid_thresh_ratio), + ATTR_LIST(gc_boost_gc_multiple), + ATTR_LIST(gc_boost_gc_greedy), + ATTR_LIST(gc_idle), + ATTR_LIST(gc_urgent), + 
ATTR_LIST(reclaim_segments), + ATTR_LIST(main_blkaddr), + ATTR_LIST(max_small_discards), + ATTR_LIST(max_discard_request), + ATTR_LIST(min_discard_issue_time), + ATTR_LIST(mid_discard_issue_time), + ATTR_LIST(max_discard_issue_time), + ATTR_LIST(discard_io_aware_gran), + ATTR_LIST(discard_urgent_util), + ATTR_LIST(discard_granularity), + ATTR_LIST(max_ordered_discard), + ATTR_LIST(discard_io_aware), + ATTR_LIST(pending_discard), + ATTR_LIST(gc_mode), + ATTR_LIST(ipu_policy), + ATTR_LIST(min_ipu_util), + ATTR_LIST(min_fsync_blocks), + ATTR_LIST(min_seq_blocks), + ATTR_LIST(min_hot_blocks), + ATTR_LIST(min_ssr_sections), + ATTR_LIST(reserved_segments), + ATTR_LIST(max_victim_search), + ATTR_LIST(migration_granularity), + ATTR_LIST(migration_window_granularity), + ATTR_LIST(dir_level), + ATTR_LIST(ram_thresh), + ATTR_LIST(ra_nid_pages), + ATTR_LIST(dirty_nats_ratio), + ATTR_LIST(max_roll_forward_node_blocks), + ATTR_LIST(cp_interval), + ATTR_LIST(idle_interval), + ATTR_LIST(discard_idle_interval), + ATTR_LIST(gc_idle_interval), + ATTR_LIST(umount_discard_timeout), + ATTR_LIST(bggc_io_aware), +#ifdef CONFIG_F2FS_IOSTAT + ATTR_LIST(iostat_enable), + ATTR_LIST(iostat_period_ms), +#endif + ATTR_LIST(readdir_ra), + ATTR_LIST(max_io_bytes), + ATTR_LIST(gc_pin_file_thresh), + ATTR_LIST(extension_list), +#ifdef CONFIG_F2FS_FAULT_INJECTION + ATTR_LIST(inject_rate), + ATTR_LIST(inject_type), +#endif + ATTR_LIST(data_io_flag), + ATTR_LIST(node_io_flag), + ATTR_LIST(gc_remaining_trials), + ATTR_LIST(ckpt_thread_ioprio), + ATTR_LIST(dirty_segments), + ATTR_LIST(free_segments), + ATTR_LIST(ovp_segments), + ATTR_LIST(unusable), + ATTR_LIST(lifetime_write_kbytes), + ATTR_LIST(features), + ATTR_LIST(reserved_blocks), + ATTR_LIST(current_reserved_blocks), + ATTR_LIST(encoding), + ATTR_LIST(encoding_flags), + ATTR_LIST(effective_lookup_mode), + ATTR_LIST(mounted_time_sec), +#ifdef CONFIG_F2FS_STAT_FS + ATTR_LIST(cp_foreground_calls), + ATTR_LIST(cp_background_calls), + 
ATTR_LIST(gc_foreground_calls), + ATTR_LIST(gc_background_calls), + ATTR_LIST(moved_blocks_foreground), + ATTR_LIST(moved_blocks_background), + ATTR_LIST(avg_vblocks), +#endif +#ifdef CONFIG_BLK_DEV_ZONED + ATTR_LIST(unusable_blocks_per_sec), + ATTR_LIST(max_open_zones), + ATTR_LIST(blkzone_alloc_policy), +#endif +#ifdef CONFIG_F2FS_FS_COMPRESSION + ATTR_LIST(compr_written_block), + ATTR_LIST(compr_saved_block), + ATTR_LIST(compr_new_inode), + ATTR_LIST(compress_percent), + ATTR_LIST(compress_watermark), +#endif + /* For ATGC */ + ATTR_LIST(atgc_candidate_ratio), + ATTR_LIST(atgc_candidate_count), + ATTR_LIST(atgc_age_weight), + ATTR_LIST(atgc_age_threshold), + ATTR_LIST(atgc_enabled), + ATTR_LIST(seq_file_ra_mul), + ATTR_LIST(gc_segment_mode), + ATTR_LIST(gc_reclaimed_segments), + ATTR_LIST(max_fragment_chunk), + ATTR_LIST(max_fragment_hole), + ATTR_LIST(current_atomic_write), + ATTR_LIST(peak_atomic_write), + ATTR_LIST(committed_atomic_block), + ATTR_LIST(revoked_atomic_block), + ATTR_LIST(hot_data_age_threshold), + ATTR_LIST(warm_data_age_threshold), + ATTR_LIST(last_age_weight), + ATTR_LIST(max_read_extent_count), + ATTR_LIST(carve_out), + ATTR_LIST(reserved_pin_section), + ATTR_LIST(allocate_section_hint), + ATTR_LIST(allocate_section_policy), + NULL, +}; +ATTRIBUTE_GROUPS(f2fs); + +#define BASE_ATTR_LIST(name) (&f2fs_base_attr_##name.attr) +static struct attribute *f2fs_feat_attrs[] = { +#ifdef CONFIG_FS_ENCRYPTION + BASE_ATTR_LIST(encryption), + BASE_ATTR_LIST(test_dummy_encryption_v2), +#if IS_ENABLED(CONFIG_UNICODE) + BASE_ATTR_LIST(encrypted_casefold), +#endif +#endif /* CONFIG_FS_ENCRYPTION */ +#ifdef CONFIG_BLK_DEV_ZONED + BASE_ATTR_LIST(block_zoned), +#endif + BASE_ATTR_LIST(atomic_write), + BASE_ATTR_LIST(extra_attr), + BASE_ATTR_LIST(project_quota), + BASE_ATTR_LIST(inode_checksum), + BASE_ATTR_LIST(flexible_inline_xattr), + BASE_ATTR_LIST(quota_ino), + BASE_ATTR_LIST(inode_crtime), + BASE_ATTR_LIST(lost_found), +#ifdef CONFIG_FS_VERITY + 
BASE_ATTR_LIST(verity), +#endif + BASE_ATTR_LIST(sb_checksum), +#if IS_ENABLED(CONFIG_UNICODE) + BASE_ATTR_LIST(casefold), +#endif + BASE_ATTR_LIST(readonly), +#ifdef CONFIG_F2FS_FS_COMPRESSION + BASE_ATTR_LIST(compression), +#endif + BASE_ATTR_LIST(pin_file), +#ifdef CONFIG_UNICODE + BASE_ATTR_LIST(linear_lookup), +#endif + BASE_ATTR_LIST(packed_ssa), + NULL, +}; +ATTRIBUTE_GROUPS(f2fs_feat); + +F2FS_GENERAL_RO_ATTR(sb_status); +F2FS_GENERAL_RO_ATTR(cp_status); +F2FS_GENERAL_RO_ATTR(issued_discard); +F2FS_GENERAL_RO_ATTR(queued_discard); +F2FS_GENERAL_RO_ATTR(undiscard_blks); + +static struct attribute *f2fs_stat_attrs[] = { + ATTR_LIST(sb_status), + ATTR_LIST(cp_status), + ATTR_LIST(issued_discard), + ATTR_LIST(queued_discard), + ATTR_LIST(undiscard_blks), + NULL, +}; +ATTRIBUTE_GROUPS(f2fs_stat); + +F2FS_SB_FEATURE_RO_ATTR(encryption, ENCRYPT); +F2FS_SB_FEATURE_RO_ATTR(block_zoned, BLKZONED); +F2FS_SB_FEATURE_RO_ATTR(extra_attr, EXTRA_ATTR); +F2FS_SB_FEATURE_RO_ATTR(project_quota, PRJQUOTA); +F2FS_SB_FEATURE_RO_ATTR(inode_checksum, INODE_CHKSUM); +F2FS_SB_FEATURE_RO_ATTR(flexible_inline_xattr, FLEXIBLE_INLINE_XATTR); +F2FS_SB_FEATURE_RO_ATTR(quota_ino, QUOTA_INO); +F2FS_SB_FEATURE_RO_ATTR(inode_crtime, INODE_CRTIME); +F2FS_SB_FEATURE_RO_ATTR(lost_found, LOST_FOUND); +F2FS_SB_FEATURE_RO_ATTR(verity, VERITY); +F2FS_SB_FEATURE_RO_ATTR(sb_checksum, SB_CHKSUM); +F2FS_SB_FEATURE_RO_ATTR(casefold, CASEFOLD); +F2FS_SB_FEATURE_RO_ATTR(compression, COMPRESSION); +F2FS_SB_FEATURE_RO_ATTR(readonly, RO); +F2FS_SB_FEATURE_RO_ATTR(device_alias, DEVICE_ALIAS); +F2FS_SB_FEATURE_RO_ATTR(packed_ssa, PACKED_SSA); + +static struct attribute *f2fs_sb_feat_attrs[] = { + ATTR_LIST(sb_encryption), + ATTR_LIST(sb_block_zoned), + ATTR_LIST(sb_extra_attr), + ATTR_LIST(sb_project_quota), + ATTR_LIST(sb_inode_checksum), + ATTR_LIST(sb_flexible_inline_xattr), + ATTR_LIST(sb_quota_ino), + ATTR_LIST(sb_inode_crtime), + ATTR_LIST(sb_lost_found), + ATTR_LIST(sb_verity), + 
ATTR_LIST(sb_sb_checksum), + ATTR_LIST(sb_casefold), + ATTR_LIST(sb_compression), + ATTR_LIST(sb_readonly), + ATTR_LIST(sb_device_alias), + ATTR_LIST(sb_packed_ssa), + NULL, +}; +ATTRIBUTE_GROUPS(f2fs_sb_feat); + +F2FS_TUNE_RW_ATTR(reclaim_caches_kb); + +static struct attribute *f2fs_tune_attrs[] = { + BASE_ATTR_LIST(reclaim_caches_kb), + NULL, +}; +ATTRIBUTE_GROUPS(f2fs_tune); + +static const struct sysfs_ops f2fs_attr_ops = { + .show = f2fs_attr_show, + .store = f2fs_attr_store, +}; + +static const struct kobj_type f2fs_sb_ktype = { + .default_groups = f2fs_groups, + .sysfs_ops = &f2fs_attr_ops, + .release = f2fs_sb_release, +}; + +static const struct kobj_type f2fs_ktype = { + .sysfs_ops = &f2fs_attr_ops, +}; + +static struct kset f2fs_kset = { + .kobj = {.ktype = &f2fs_ktype}, +}; + +static const struct sysfs_ops f2fs_feat_attr_ops = { + .show = f2fs_base_attr_show, + .store = f2fs_base_attr_store, +}; + +static const struct kobj_type f2fs_feat_ktype = { + .default_groups = f2fs_feat_groups, + .sysfs_ops = &f2fs_feat_attr_ops, +}; + +static struct kobject f2fs_feat = { + .kset = &f2fs_kset, +}; + +static const struct sysfs_ops f2fs_tune_attr_ops = { + .show = f2fs_base_attr_show, + .store = f2fs_base_attr_store, +}; + +static const struct kobj_type f2fs_tune_ktype = { + .default_groups = f2fs_tune_groups, + .sysfs_ops = &f2fs_tune_attr_ops, +}; + +static struct kobject f2fs_tune = { + .kset = &f2fs_kset, +}; + +static ssize_t f2fs_stat_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info, + s_stat_kobj); + struct f2fs_attr *a = container_of(attr, struct f2fs_attr, attr); + + return a->show ? 
a->show(a, sbi, buf) : 0; +} + +static ssize_t f2fs_stat_attr_store(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t len) +{ + struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info, + s_stat_kobj); + struct f2fs_attr *a = container_of(attr, struct f2fs_attr, attr); + + return a->store ? a->store(a, sbi, buf, len) : 0; +} + +static void f2fs_stat_kobj_release(struct kobject *kobj) +{ + struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info, + s_stat_kobj); + complete(&sbi->s_stat_kobj_unregister); +} + +static const struct sysfs_ops f2fs_stat_attr_ops = { + .show = f2fs_stat_attr_show, + .store = f2fs_stat_attr_store, +}; + +static const struct kobj_type f2fs_stat_ktype = { + .default_groups = f2fs_stat_groups, + .sysfs_ops = &f2fs_stat_attr_ops, + .release = f2fs_stat_kobj_release, +}; + +static ssize_t f2fs_sb_feat_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info, + s_feature_list_kobj); + struct f2fs_attr *a = container_of(attr, struct f2fs_attr, attr); + + return a->show ? 
a->show(a, sbi, buf) : 0; +} + +static void f2fs_feature_list_kobj_release(struct kobject *kobj) +{ + struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info, + s_feature_list_kobj); + complete(&sbi->s_feature_list_kobj_unregister); +} + +static const struct sysfs_ops f2fs_feature_list_attr_ops = { + .show = f2fs_sb_feat_attr_show, +}; + +static const struct kobj_type f2fs_feature_list_ktype = { + .default_groups = f2fs_sb_feat_groups, + .sysfs_ops = &f2fs_feature_list_attr_ops, + .release = f2fs_feature_list_kobj_release, +}; + +static int __maybe_unused segment_info_seq_show(struct seq_file *seq, + void *offset) +{ + struct super_block *sb = seq->private; + struct f2fs_sb_info *sbi = F2FS_SB(sb); + unsigned int total_segs = + le32_to_cpu(sbi->raw_super->segment_count_main); + int i; + + seq_puts(seq, "format: segment_type|valid_blocks\n" + "segment_type(0:HD, 1:WD, 2:CD, 3:HN, 4:WN, 5:CN)\n"); + + for (i = 0; i < total_segs; i++) { + struct seg_entry *se = get_seg_entry(sbi, i); + + if ((i % 10) == 0) + seq_printf(seq, "%-10d", i); + seq_printf(seq, "%d|%-3u", se->type, se->valid_blocks); + if ((i % 10) == 9 || i == (total_segs - 1)) + seq_putc(seq, '\n'); + else + seq_putc(seq, ' '); + } + + return 0; +} + +static int __maybe_unused segment_bits_seq_show(struct seq_file *seq, + void *offset) +{ + struct super_block *sb = seq->private; + struct f2fs_sb_info *sbi = F2FS_SB(sb); + unsigned int total_segs = + le32_to_cpu(sbi->raw_super->segment_count_main); + int i, j; + + seq_puts(seq, "format: segment_type|valid_blocks|bitmaps|mtime\n" + "segment_type(0:HD, 1:WD, 2:CD, 3:HN, 4:WN, 5:CN)\n"); + + for (i = 0; i < total_segs; i++) { + struct seg_entry *se = get_seg_entry(sbi, i); + + seq_printf(seq, "%-10d", i); + seq_printf(seq, "%d|%-3u|", se->type, se->valid_blocks); + for (j = 0; j < SIT_VBLOCK_MAP_SIZE; j++) + seq_printf(seq, " %.2x", se->cur_valid_map[j]); + seq_printf(seq, "| %llx", se->mtime); + seq_putc(seq, '\n'); + } + return 0; +} + +static int 
__maybe_unused victim_bits_seq_show(struct seq_file *seq, + void *offset) +{ + struct super_block *sb = seq->private; + struct f2fs_sb_info *sbi = F2FS_SB(sb); + struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); + int i; + + seq_puts(seq, "format: victim_secmap bitmaps\n"); + + for (i = 0; i < MAIN_SECS(sbi); i++) { + if ((i % 10) == 0) + seq_printf(seq, "%-10d", i); + seq_printf(seq, "%d", test_bit(i, dirty_i->victim_secmap) ? 1 : 0); + if ((i % 10) == 9 || i == (MAIN_SECS(sbi) - 1)) + seq_putc(seq, '\n'); + else + seq_putc(seq, ' '); + } + return 0; +} + +static int __maybe_unused discard_plist_seq_show(struct seq_file *seq, + void *offset) +{ + struct super_block *sb = seq->private; + struct f2fs_sb_info *sbi = F2FS_SB(sb); + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + int i, count; + + seq_puts(seq, "Discard pend list(Show diacrd_cmd count on each entry, .:not exist):\n"); + if (!f2fs_realtime_discard_enable(sbi)) + return 0; + + if (dcc) { + mutex_lock(&dcc->cmd_lock); + for (i = 0; i < MAX_PLIST_NUM; i++) { + struct list_head *pend_list; + struct discard_cmd *dc, *tmp; + + if (i % 8 == 0) + seq_printf(seq, " %-3d", i); + count = 0; + pend_list = &dcc->pend_list[i]; + list_for_each_entry_safe(dc, tmp, pend_list, list) + count++; + if (count) + seq_printf(seq, " %7d", count); + else + seq_puts(seq, " ."); + if (i % 8 == 7) + seq_putc(seq, '\n'); + } + seq_putc(seq, '\n'); + mutex_unlock(&dcc->cmd_lock); + } + + return 0; +} + +static int __maybe_unused disk_map_seq_show(struct seq_file *seq, + void *offset) +{ + struct super_block *sb = seq->private; + struct f2fs_sb_info *sbi = F2FS_SB(sb); + int i; + + seq_printf(seq, "Address Layout : %5luB Block address (# of Segments)\n", + F2FS_BLKSIZE); + seq_printf(seq, " SB : %12s\n", "0/1024B"); + seq_printf(seq, " seg0_blkaddr : 0x%010x\n", SEG0_BLKADDR(sbi)); + seq_printf(seq, " Checkpoint : 0x%010x (%10d)\n", + le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_blkaddr), 2); + seq_printf(seq, " SIT : 0x%010x 
(%10d)\n", + SIT_I(sbi)->sit_base_addr, + le32_to_cpu(F2FS_RAW_SUPER(sbi)->segment_count_sit)); + seq_printf(seq, " NAT : 0x%010x (%10d)\n", + NM_I(sbi)->nat_blkaddr, + le32_to_cpu(F2FS_RAW_SUPER(sbi)->segment_count_nat)); + seq_printf(seq, " SSA : 0x%010x (%10d)\n", + SM_I(sbi)->ssa_blkaddr, + le32_to_cpu(F2FS_RAW_SUPER(sbi)->segment_count_ssa)); + seq_printf(seq, " Main : 0x%010x (%10d)\n", + SM_I(sbi)->main_blkaddr, + le32_to_cpu(F2FS_RAW_SUPER(sbi)->segment_count_main)); + seq_printf(seq, " Block size : %12lu KB\n", F2FS_BLKSIZE >> 10); + seq_printf(seq, " Segment size : %12d MB\n", + (BLKS_PER_SEG(sbi) << (F2FS_BLKSIZE_BITS - 10)) >> 10); + seq_printf(seq, " Segs/Sections : %12d\n", + SEGS_PER_SEC(sbi)); + seq_printf(seq, " Section size : %12d MB\n", + (BLKS_PER_SEC(sbi) << (F2FS_BLKSIZE_BITS - 10)) >> 10); + seq_printf(seq, " # of Sections : %12d\n", + le32_to_cpu(F2FS_RAW_SUPER(sbi)->section_count)); + + if (!f2fs_is_multi_device(sbi)) + return 0; + + seq_puts(seq, "\nDisk Map for multi devices:\n"); + for (i = 0; i < sbi->s_ndevs; i++) + seq_printf(seq, "Disk:%2d (zoned=%d): 0x%010x - 0x%010x on %s\n", + i, bdev_is_zoned(FDEV(i).bdev), + FDEV(i).start_blk, FDEV(i).end_blk, + FDEV(i).path); + return 0; +} + +static int __maybe_unused donation_list_seq_show(struct seq_file *seq, + void *offset) +{ + struct super_block *sb = seq->private; + struct f2fs_sb_info *sbi = F2FS_SB(sb); + struct inode *inode; + struct f2fs_inode_info *fi; + struct dentry *dentry; + char *buf, *path; + int i; + + buf = f2fs_getname(sbi); + if (!buf) + return 0; + + seq_printf(seq, "Donation List\n"); + seq_printf(seq, " # of files : %u\n", sbi->donate_files); + seq_printf(seq, " %-50s %10s %20s %20s %22s\n", + "File path", "Status", "Donation offset (kb)", + "Donation size (kb)", "File cached size (kb)"); + seq_printf(seq, "---\n"); + + for (i = 0; i < sbi->donate_files; i++) { + spin_lock(&sbi->inode_lock[DONATE_INODE]); + if (list_empty(&sbi->inode_list[DONATE_INODE])) { + 
spin_unlock(&sbi->inode_lock[DONATE_INODE]); + break; + } + fi = list_first_entry(&sbi->inode_list[DONATE_INODE], + struct f2fs_inode_info, gdonate_list); + list_move_tail(&fi->gdonate_list, &sbi->inode_list[DONATE_INODE]); + inode = igrab(&fi->vfs_inode); + spin_unlock(&sbi->inode_lock[DONATE_INODE]); + + if (!inode) + continue; + + inode_lock_shared(inode); + + dentry = d_find_alias(inode); + if (!dentry) { + path = NULL; + } else { + path = dentry_path_raw(dentry, buf, PATH_MAX); + if (IS_ERR(path)) + goto next; + } + seq_printf(seq, " %-50s %10s %20llu %20llu %22llu\n", + path ? path : "<unlinked>", + is_inode_flag_set(inode, FI_DONATE_FINISHED) ? + "Evicted" : "Donated", + (loff_t)fi->donate_start << (PAGE_SHIFT - 10), + (loff_t)(fi->donate_end + 1) << (PAGE_SHIFT - 10), + (loff_t)inode->i_mapping->nrpages << (PAGE_SHIFT - 10)); +next: + dput(dentry); + inode_unlock_shared(inode); + iput(inode); + } + f2fs_putname(buf); + return 0; +} + +#ifdef CONFIG_F2FS_FAULT_INJECTION +static int __maybe_unused inject_stats_seq_show(struct seq_file *seq, + void *offset) +{ + struct super_block *sb = seq->private; + struct f2fs_sb_info *sbi = F2FS_SB(sb); + struct f2fs_fault_info *ffi = &F2FS_OPTION(sbi).fault_info; + int i; + + seq_puts(seq, "fault_type injected_count\n"); + + for (i = 0; i < FAULT_MAX; i++) + seq_printf(seq, "%-24s%-10u\n", f2fs_fault_name[i], + ffi->inject_count[i]); + return 0; +} +#endif + +int __init f2fs_init_sysfs(void) +{ + int ret; + + kobject_set_name(&f2fs_kset.kobj, "f2fs"); + f2fs_kset.kobj.parent = fs_kobj; + ret = kset_register(&f2fs_kset); + if (ret) + return ret; + + ret = kobject_init_and_add(&f2fs_feat, &f2fs_feat_ktype, + NULL, "features"); + if (ret) + goto put_kobject; + + ret = kobject_init_and_add(&f2fs_tune, &f2fs_tune_ktype, + NULL, "tuning"); + if (ret) + goto put_kobject; + + f2fs_proc_root = proc_mkdir("fs/f2fs", NULL); + if (!f2fs_proc_root) { + ret = -ENOMEM; + goto put_kobject; + } + + return 0; + +put_kobject: + 
kobject_put(&f2fs_tune); + kobject_put(&f2fs_feat); + kset_unregister(&f2fs_kset); + return ret; +} + +void f2fs_exit_sysfs(void) +{ + kobject_put(&f2fs_tune); + kobject_put(&f2fs_feat); + kset_unregister(&f2fs_kset); + remove_proc_entry("fs/f2fs", NULL); + f2fs_proc_root = NULL; +} + +int f2fs_register_sysfs(struct f2fs_sb_info *sbi) +{ + struct super_block *sb = sbi->sb; + int err; + + sbi->s_kobj.kset = &f2fs_kset; + init_completion(&sbi->s_kobj_unregister); + err = kobject_init_and_add(&sbi->s_kobj, &f2fs_sb_ktype, NULL, + "%s", sb->s_id); + if (err) + goto put_sb_kobj; + + sbi->s_stat_kobj.kset = &f2fs_kset; + init_completion(&sbi->s_stat_kobj_unregister); + err = kobject_init_and_add(&sbi->s_stat_kobj, &f2fs_stat_ktype, + &sbi->s_kobj, "stat"); + if (err) + goto put_stat_kobj; + + sbi->s_feature_list_kobj.kset = &f2fs_kset; + init_completion(&sbi->s_feature_list_kobj_unregister); + err = kobject_init_and_add(&sbi->s_feature_list_kobj, + &f2fs_feature_list_ktype, + &sbi->s_kobj, "feature_list"); + if (err) + goto put_feature_list_kobj; + + sbi->s_proc = proc_mkdir(sb->s_id, f2fs_proc_root); + if (!sbi->s_proc) { + err = -ENOMEM; + goto put_feature_list_kobj; + } + + proc_create_single_data("segment_info", 0444, sbi->s_proc, + segment_info_seq_show, sb); + proc_create_single_data("segment_bits", 0444, sbi->s_proc, + segment_bits_seq_show, sb); +#ifdef CONFIG_F2FS_IOSTAT + proc_create_single_data("iostat_info", 0444, sbi->s_proc, + iostat_info_seq_show, sb); +#endif + proc_create_single_data("victim_bits", 0444, sbi->s_proc, + victim_bits_seq_show, sb); + proc_create_single_data("discard_plist_info", 0444, sbi->s_proc, + discard_plist_seq_show, sb); + proc_create_single_data("disk_map", 0444, sbi->s_proc, + disk_map_seq_show, sb); + proc_create_single_data("donation_list", 0444, sbi->s_proc, + donation_list_seq_show, sb); +#ifdef CONFIG_F2FS_FAULT_INJECTION + proc_create_single_data("inject_stats", 0444, sbi->s_proc, + inject_stats_seq_show, sb); +#endif + 
return 0; +put_feature_list_kobj: + kobject_put(&sbi->s_feature_list_kobj); + wait_for_completion(&sbi->s_feature_list_kobj_unregister); +put_stat_kobj: + kobject_put(&sbi->s_stat_kobj); + wait_for_completion(&sbi->s_stat_kobj_unregister); +put_sb_kobj: + kobject_put(&sbi->s_kobj); + wait_for_completion(&sbi->s_kobj_unregister); + return err; +} + +void f2fs_unregister_sysfs(struct f2fs_sb_info *sbi) +{ + remove_proc_subtree(sbi->sb->s_id, f2fs_proc_root); + + kobject_put(&sbi->s_stat_kobj); + wait_for_completion(&sbi->s_stat_kobj_unregister); + kobject_put(&sbi->s_feature_list_kobj); + wait_for_completion(&sbi->s_feature_list_kobj_unregister); + + kobject_put(&sbi->s_kobj); + wait_for_completion(&sbi->s_kobj_unregister); +} diff --git a/fs/f2fs/verity.c b/fs/f2fs/verity.c new file mode 100644 index 000000000000..05b935b55216 --- /dev/null +++ b/fs/f2fs/verity.c @@ -0,0 +1,297 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * fs/f2fs/verity.c: fs-verity support for f2fs + * + * Copyright 2019 Google LLC + */ + +/* + * Implementation of fsverity_operations for f2fs. + * + * Like ext4, f2fs stores the verity metadata (Merkle tree and + * fsverity_descriptor) past the end of the file, starting at the first 64K + * boundary beyond i_size. This approach works because (a) verity files are + * readonly, and (b) pages fully beyond i_size aren't visible to userspace but + * can be read/written internally by f2fs with only some relatively small + * changes to f2fs. Extended attributes cannot be used because (a) f2fs limits + * the total size of an inode's xattr entries to 4096 bytes, which wouldn't be + * enough for even a single Merkle tree block, and (b) f2fs encryption doesn't + * encrypt xattrs, yet the verity metadata *must* be encrypted when the file is + * because it contains hashes of the plaintext data. 
+ * + * Using a 64K boundary rather than a 4K one keeps things ready for + * architectures with 64K pages, and it doesn't necessarily waste space on-disk + * since there can be a hole between i_size and the start of the Merkle tree. + */ + +#include <linux/f2fs_fs.h> + +#include "f2fs.h" +#include "xattr.h" + +#define F2FS_VERIFY_VER (1) + +static inline loff_t f2fs_verity_metadata_pos(const struct inode *inode) +{ + return round_up(inode->i_size, 65536); +} + +/* + * Read some verity metadata from the inode. __vfs_read() can't be used because + * we need to read beyond i_size. + */ +static int pagecache_read(struct inode *inode, void *buf, size_t count, + loff_t pos) +{ + while (count) { + size_t n = min_t(size_t, count, + PAGE_SIZE - offset_in_page(pos)); + struct page *page; + + page = read_mapping_page(inode->i_mapping, pos >> PAGE_SHIFT, + NULL); + if (IS_ERR(page)) + return PTR_ERR(page); + + memcpy_from_page(buf, page, offset_in_page(pos), n); + + put_page(page); + + buf += n; + pos += n; + count -= n; + } + return 0; +} + +/* + * Write some verity metadata to the inode for FS_IOC_ENABLE_VERITY. + * kernel_write() can't be used because the file descriptor is readonly. 
+ */ +static int pagecache_write(struct inode *inode, const void *buf, size_t count, + loff_t pos) +{ + struct address_space *mapping = inode->i_mapping; + const struct address_space_operations *aops = mapping->a_ops; + + if (pos + count > F2FS_BLK_TO_BYTES(max_file_blocks(inode))) + return -EFBIG; + + while (count) { + size_t n = min_t(size_t, count, + PAGE_SIZE - offset_in_page(pos)); + struct folio *folio; + void *fsdata = NULL; + int res; + + res = aops->write_begin(NULL, mapping, pos, n, &folio, &fsdata); + if (res) + return res; + + memcpy_to_folio(folio, offset_in_folio(folio, pos), buf, n); + + res = aops->write_end(NULL, mapping, pos, n, n, folio, fsdata); + if (res < 0) + return res; + if (res != n) + return -EIO; + + buf += n; + pos += n; + count -= n; + } + return 0; +} + +/* + * Format of f2fs verity xattr. This points to the location of the verity + * descriptor within the file data rather than containing it directly because + * the verity descriptor *must* be encrypted when f2fs encryption is used. But, + * f2fs encryption does not encrypt xattrs. + */ +struct fsverity_descriptor_location { + __le32 version; + __le32 size; + __le64 pos; +}; + +static int f2fs_begin_enable_verity(struct file *filp) +{ + struct inode *inode = file_inode(filp); + int err; + + if (f2fs_verity_in_progress(inode)) + return -EBUSY; + + if (f2fs_is_atomic_file(inode)) + return -EOPNOTSUPP; + + /* + * Since the file was opened readonly, we have to initialize the quotas + * here and not rely on ->open() doing it. This must be done before + * evicting the inline data. 
+ */ + err = f2fs_dquot_initialize(inode); + if (err) + return err; + + err = f2fs_convert_inline_inode(inode); + if (err) + return err; + + set_inode_flag(inode, FI_VERITY_IN_PROGRESS); + return 0; +} + +static int f2fs_end_enable_verity(struct file *filp, const void *desc, + size_t desc_size, u64 merkle_tree_size) +{ + struct inode *inode = file_inode(filp); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + u64 desc_pos = f2fs_verity_metadata_pos(inode) + merkle_tree_size; + struct fsverity_descriptor_location dloc = { + .version = cpu_to_le32(F2FS_VERIFY_VER), + .size = cpu_to_le32(desc_size), + .pos = cpu_to_le64(desc_pos), + }; + int err = 0, err2 = 0; + + /* + * If an error already occurred (which fs/verity/ signals by passing + * desc == NULL), then only clean-up is needed. + */ + if (desc == NULL) + goto cleanup; + + /* Append the verity descriptor. */ + err = pagecache_write(inode, desc, desc_size, desc_pos); + if (err) + goto cleanup; + + /* + * Write all pages (both data and verity metadata). Note that this must + * happen before clearing FI_VERITY_IN_PROGRESS; otherwise pages beyond + * i_size won't be written properly. For crash consistency, this also + * must happen before the verity inode flag gets persisted. + */ + err = filemap_write_and_wait(inode->i_mapping); + if (err) + goto cleanup; + + /* Set the verity xattr. */ + err = f2fs_setxattr(inode, F2FS_XATTR_INDEX_VERITY, + F2FS_XATTR_NAME_VERITY, &dloc, sizeof(dloc), + NULL, XATTR_CREATE); + if (err) + goto cleanup; + + /* Finally, set the verity inode flag. */ + file_set_verity(inode); + f2fs_set_inode_flags(inode); + f2fs_mark_inode_dirty_sync(inode, true); + + clear_inode_flag(inode, FI_VERITY_IN_PROGRESS); + return 0; + +cleanup: + /* + * Verity failed to be enabled, so clean up by truncating any verity + * metadata that was written beyond i_size (both from cache and from + * disk) and clearing FI_VERITY_IN_PROGRESS. 
+ * + * Taking i_gc_rwsem[WRITE] is needed to stop f2fs garbage collection + * from re-instantiating cached pages we are truncating (since unlike + * normal file accesses, garbage collection isn't limited by i_size). + */ + f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + truncate_inode_pages(inode->i_mapping, inode->i_size); + err2 = f2fs_truncate(inode); + if (err2) { + f2fs_err(sbi, "Truncating verity metadata failed (errno=%d)", + err2); + set_sbi_flag(sbi, SBI_NEED_FSCK); + } + f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + clear_inode_flag(inode, FI_VERITY_IN_PROGRESS); + return err ?: err2; +} + +static int f2fs_get_verity_descriptor(struct inode *inode, void *buf, + size_t buf_size) +{ + struct fsverity_descriptor_location dloc; + int res; + u32 size; + u64 pos; + + /* Get the descriptor location */ + res = f2fs_getxattr(inode, F2FS_XATTR_INDEX_VERITY, + F2FS_XATTR_NAME_VERITY, &dloc, sizeof(dloc), NULL); + if (res < 0 && res != -ERANGE) + return res; + if (res != sizeof(dloc) || dloc.version != cpu_to_le32(F2FS_VERIFY_VER)) { + f2fs_warn(F2FS_I_SB(inode), "unknown verity xattr format"); + return -EINVAL; + } + size = le32_to_cpu(dloc.size); + pos = le64_to_cpu(dloc.pos); + + /* Get the descriptor */ + if (pos + size < pos || + pos + size > F2FS_BLK_TO_BYTES(max_file_blocks(inode)) || + pos < f2fs_verity_metadata_pos(inode) || size > INT_MAX) { + f2fs_warn(F2FS_I_SB(inode), "invalid verity xattr"); + f2fs_handle_error(F2FS_I_SB(inode), + ERROR_CORRUPTED_VERITY_XATTR); + return -EFSCORRUPTED; + } + if (buf_size) { + if (size > buf_size) + return -ERANGE; + res = pagecache_read(inode, buf, size, pos); + if (res) + return res; + } + return size; +} + +static struct page *f2fs_read_merkle_tree_page(struct inode *inode, + pgoff_t index, + unsigned long num_ra_pages) +{ + struct folio *folio; + + index += f2fs_verity_metadata_pos(inode) >> PAGE_SHIFT; + + folio = f2fs_filemap_get_folio(inode->i_mapping, index, FGP_ACCESSED, 0); + if (IS_ERR(folio) || 
!folio_test_uptodate(folio)) { + DEFINE_READAHEAD(ractl, NULL, NULL, inode->i_mapping, index); + + if (!IS_ERR(folio)) + folio_put(folio); + else if (num_ra_pages > 1) + page_cache_ra_unbounded(&ractl, num_ra_pages, 0); + folio = read_mapping_folio(inode->i_mapping, index, NULL); + if (IS_ERR(folio)) + return ERR_CAST(folio); + } + return folio_file_page(folio, index); +} + +static int f2fs_write_merkle_tree_block(struct inode *inode, const void *buf, + u64 pos, unsigned int size) +{ + pos += f2fs_verity_metadata_pos(inode); + + return pagecache_write(inode, buf, size, pos); +} + +const struct fsverity_operations f2fs_verityops = { + .inode_info_offs = (int)offsetof(struct f2fs_inode_info, i_verity_info) - + (int)offsetof(struct f2fs_inode_info, vfs_inode), + .begin_enable_verity = f2fs_begin_enable_verity, + .end_enable_verity = f2fs_end_enable_verity, + .get_verity_descriptor = f2fs_get_verity_descriptor, + .read_merkle_tree_page = f2fs_read_merkle_tree_page, + .write_merkle_tree_block = f2fs_write_merkle_tree_block, +}; diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 3ab07ecd86ca..b4e5c406632f 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * fs/f2fs/xattr.c * @@ -13,149 +14,130 @@ * suggestion of Luka Renko <luka.renko@hermes.si>. * xattr consolidation Copyright (c) 2004 James Morris <jmorris@redhat.com>, * Red Hat Inc. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. 
*/ #include <linux/rwsem.h> #include <linux/f2fs_fs.h> #include <linux/security.h> +#include <linux/posix_acl_xattr.h> #include "f2fs.h" #include "xattr.h" +#include "segment.h" -static size_t f2fs_xattr_generic_list(struct dentry *dentry, char *list, - size_t list_size, const char *name, size_t name_len, int type) +static struct kmem_cache *inline_xattr_slab; +static void *xattr_alloc(struct f2fs_sb_info *sbi, int size, bool *is_inline) { - struct f2fs_sb_info *sbi = F2FS_SB(dentry->d_sb); - int total_len, prefix_len = 0; - const char *prefix = NULL; - - switch (type) { - case F2FS_XATTR_INDEX_USER: - if (!test_opt(sbi, XATTR_USER)) - return -EOPNOTSUPP; - prefix = XATTR_USER_PREFIX; - prefix_len = XATTR_USER_PREFIX_LEN; - break; - case F2FS_XATTR_INDEX_TRUSTED: - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - prefix = XATTR_TRUSTED_PREFIX; - prefix_len = XATTR_TRUSTED_PREFIX_LEN; - break; - case F2FS_XATTR_INDEX_SECURITY: - prefix = XATTR_SECURITY_PREFIX; - prefix_len = XATTR_SECURITY_PREFIX_LEN; - break; - default: - return -EINVAL; + if (likely(size == DEFAULT_XATTR_SLAB_SIZE)) { + *is_inline = true; + return f2fs_kmem_cache_alloc(inline_xattr_slab, + GFP_F2FS_ZERO, false, sbi); } + *is_inline = false; + return f2fs_kzalloc(sbi, size, GFP_NOFS); +} - total_len = prefix_len + name_len + 1; - if (list && total_len <= list_size) { - memcpy(list, prefix, prefix_len); - memcpy(list + prefix_len, name, name_len); - list[prefix_len + name_len] = '\0'; - } - return total_len; +static void xattr_free(struct f2fs_sb_info *sbi, void *xattr_addr, + bool is_inline) +{ + if (is_inline) + kmem_cache_free(inline_xattr_slab, xattr_addr); + else + kfree(xattr_addr); } -static int f2fs_xattr_generic_get(struct dentry *dentry, const char *name, - void *buffer, size_t size, int type) +static int f2fs_xattr_generic_get(const struct xattr_handler *handler, + struct dentry *unused, struct inode *inode, + const char *name, void *buffer, size_t size) { - struct f2fs_sb_info *sbi = 
F2FS_SB(dentry->d_sb); + struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); - switch (type) { + switch (handler->flags) { case F2FS_XATTR_INDEX_USER: if (!test_opt(sbi, XATTR_USER)) return -EOPNOTSUPP; break; case F2FS_XATTR_INDEX_TRUSTED: - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - break; case F2FS_XATTR_INDEX_SECURITY: break; default: return -EINVAL; } - if (strcmp(name, "") == 0) - return -EINVAL; - return f2fs_getxattr(dentry->d_inode, type, name, buffer, size); + return f2fs_getxattr(inode, handler->flags, name, + buffer, size, NULL); } -static int f2fs_xattr_generic_set(struct dentry *dentry, const char *name, - const void *value, size_t size, int flags, int type) +static int f2fs_xattr_generic_set(const struct xattr_handler *handler, + struct mnt_idmap *idmap, + struct dentry *unused, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) { - struct f2fs_sb_info *sbi = F2FS_SB(dentry->d_sb); + struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); - switch (type) { + switch (handler->flags) { case F2FS_XATTR_INDEX_USER: if (!test_opt(sbi, XATTR_USER)) return -EOPNOTSUPP; break; case F2FS_XATTR_INDEX_TRUSTED: - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - break; case F2FS_XATTR_INDEX_SECURITY: break; default: return -EINVAL; } - if (strcmp(name, "") == 0) - return -EINVAL; - - return f2fs_setxattr(dentry->d_inode, type, name, value, size, NULL); + return f2fs_setxattr(inode, handler->flags, name, + value, size, NULL, flags); } -static size_t f2fs_xattr_advise_list(struct dentry *dentry, char *list, - size_t list_size, const char *name, size_t name_len, int type) +static bool f2fs_xattr_user_list(struct dentry *dentry) { - const char *xname = F2FS_SYSTEM_ADVISE_PREFIX; - size_t size; - - if (type != F2FS_XATTR_INDEX_ADVISE) - return 0; + struct f2fs_sb_info *sbi = F2FS_SB(dentry->d_sb); - size = strlen(xname) + 1; - if (list && size <= list_size) - memcpy(list, xname, size); - return size; + return test_opt(sbi, XATTR_USER); } 
-static int f2fs_xattr_advise_get(struct dentry *dentry, const char *name, - void *buffer, size_t size, int type) +static bool f2fs_xattr_trusted_list(struct dentry *dentry) { - struct inode *inode = dentry->d_inode; - - if (strcmp(name, "") != 0) - return -EINVAL; + return capable(CAP_SYS_ADMIN); +} - *((char *)buffer) = F2FS_I(inode)->i_advise; +static int f2fs_xattr_advise_get(const struct xattr_handler *handler, + struct dentry *unused, struct inode *inode, + const char *name, void *buffer, size_t size) +{ + if (buffer) + *((char *)buffer) = F2FS_I(inode)->i_advise; return sizeof(char); } -static int f2fs_xattr_advise_set(struct dentry *dentry, const char *name, - const void *value, size_t size, int flags, int type) +static int f2fs_xattr_advise_set(const struct xattr_handler *handler, + struct mnt_idmap *idmap, + struct dentry *unused, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) { - struct inode *inode = dentry->d_inode; + unsigned char old_advise = F2FS_I(inode)->i_advise; + unsigned char new_advise; - if (strcmp(name, "") != 0) - return -EINVAL; - if (!inode_owner_or_capable(inode)) + if (!inode_owner_or_capable(&nop_mnt_idmap, inode)) return -EPERM; if (value == NULL) return -EINVAL; - F2FS_I(inode)->i_advise |= *(char *)value; + new_advise = *(char *)value; + if (new_advise & ~FADVISE_MODIFIABLE_BITS) + return -EINVAL; + + new_advise = new_advise & FADVISE_MODIFIABLE_BITS; + new_advise |= old_advise & ~FADVISE_MODIFIABLE_BITS; + + F2FS_I(inode)->i_advise = new_advise; + f2fs_mark_inode_dirty_sync(inode, true); return 0; } #ifdef CONFIG_F2FS_FS_SECURITY static int f2fs_initxattrs(struct inode *inode, const struct xattr *xattr_array, - void *page) + void *folio) { const struct xattr *xattr; int err = 0; @@ -163,7 +145,7 @@ static int f2fs_initxattrs(struct inode *inode, const struct xattr *xattr_array, for (xattr = xattr_array; xattr->name != NULL; xattr++) { err = f2fs_setxattr(inode, F2FS_XATTR_INDEX_SECURITY, 
xattr->name, xattr->value, - xattr->value_len, (struct page *)page); + xattr->value_len, folio, 0); if (err < 0) break; } @@ -171,17 +153,17 @@ static int f2fs_initxattrs(struct inode *inode, const struct xattr *xattr_array, } int f2fs_init_security(struct inode *inode, struct inode *dir, - const struct qstr *qstr, struct page *ipage) + const struct qstr *qstr, struct folio *ifolio) { return security_inode_init_security(inode, dir, qstr, - &f2fs_initxattrs, ipage); + f2fs_initxattrs, ifolio); } #endif const struct xattr_handler f2fs_xattr_user_handler = { .prefix = XATTR_USER_PREFIX, .flags = F2FS_XATTR_INDEX_USER, - .list = f2fs_xattr_generic_list, + .list = f2fs_xattr_user_list, .get = f2fs_xattr_generic_get, .set = f2fs_xattr_generic_set, }; @@ -189,32 +171,30 @@ const struct xattr_handler f2fs_xattr_user_handler = { const struct xattr_handler f2fs_xattr_trusted_handler = { .prefix = XATTR_TRUSTED_PREFIX, .flags = F2FS_XATTR_INDEX_TRUSTED, - .list = f2fs_xattr_generic_list, + .list = f2fs_xattr_trusted_list, .get = f2fs_xattr_generic_get, .set = f2fs_xattr_generic_set, }; const struct xattr_handler f2fs_xattr_advise_handler = { - .prefix = F2FS_SYSTEM_ADVISE_PREFIX, + .name = F2FS_SYSTEM_ADVISE_NAME, .flags = F2FS_XATTR_INDEX_ADVISE, - .list = f2fs_xattr_advise_list, - .get = f2fs_xattr_advise_get, - .set = f2fs_xattr_advise_set, + .get = f2fs_xattr_advise_get, + .set = f2fs_xattr_advise_set, }; const struct xattr_handler f2fs_xattr_security_handler = { .prefix = XATTR_SECURITY_PREFIX, .flags = F2FS_XATTR_INDEX_SECURITY, - .list = f2fs_xattr_generic_list, .get = f2fs_xattr_generic_get, .set = f2fs_xattr_generic_set, }; -static const struct xattr_handler *f2fs_xattr_handler_map[] = { +static const struct xattr_handler * const f2fs_xattr_handler_map[] = { [F2FS_XATTR_INDEX_USER] = &f2fs_xattr_user_handler, #ifdef CONFIG_F2FS_FS_POSIX_ACL - [F2FS_XATTR_INDEX_POSIX_ACL_ACCESS] = &f2fs_xattr_acl_access_handler, - [F2FS_XATTR_INDEX_POSIX_ACL_DEFAULT] = 
&f2fs_xattr_acl_default_handler, + [F2FS_XATTR_INDEX_POSIX_ACL_ACCESS] = &nop_posix_acl_access, + [F2FS_XATTR_INDEX_POSIX_ACL_DEFAULT] = &nop_posix_acl_default, #endif [F2FS_XATTR_INDEX_TRUSTED] = &f2fs_xattr_trusted_handler, #ifdef CONFIG_F2FS_FS_SECURITY @@ -223,12 +203,8 @@ static const struct xattr_handler *f2fs_xattr_handler_map[] = { [F2FS_XATTR_INDEX_ADVISE] = &f2fs_xattr_advise_handler, }; -const struct xattr_handler *f2fs_xattr_handlers[] = { +const struct xattr_handler * const f2fs_xattr_handlers[] = { &f2fs_xattr_user_handler, -#ifdef CONFIG_F2FS_FS_POSIX_ACL - &f2fs_xattr_acl_access_handler, - &f2fs_xattr_acl_default_handler, -#endif &f2fs_xattr_trusted_handler, #ifdef CONFIG_F2FS_FS_SECURITY &f2fs_xattr_security_handler, @@ -237,224 +213,526 @@ const struct xattr_handler *f2fs_xattr_handlers[] = { NULL, }; -static inline const struct xattr_handler *f2fs_xattr_handler(int name_index) +static inline const char *f2fs_xattr_prefix(int index, + struct dentry *dentry) { const struct xattr_handler *handler = NULL; - if (name_index > 0 && name_index < ARRAY_SIZE(f2fs_xattr_handler_map)) - handler = f2fs_xattr_handler_map[name_index]; - return handler; + if (index > 0 && index < ARRAY_SIZE(f2fs_xattr_handler_map)) + handler = f2fs_xattr_handler_map[index]; + + if (!xattr_handler_can_list(handler, dentry)) + return NULL; + + return xattr_prefix(handler); } -int f2fs_getxattr(struct inode *inode, int name_index, const char *name, - void *buffer, size_t buffer_size) +static struct f2fs_xattr_entry *__find_xattr(void *base_addr, + void *last_base_addr, void **last_addr, + int index, size_t len, const char *name) { - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); - struct f2fs_inode_info *fi = F2FS_I(inode); struct f2fs_xattr_entry *entry; - struct page *page; - void *base_addr; - int error = 0, found = 0; - size_t value_len, name_len; - - if (name == NULL) - return -EINVAL; - name_len = strlen(name); - - if (!fi->i_xattr_nid) - return -ENODATA; - - page = 
get_node_page(sbi, fi->i_xattr_nid); - if (IS_ERR(page)) - return PTR_ERR(page); - base_addr = page_address(page); list_for_each_xattr(entry, base_addr) { - if (entry->e_name_index != name_index) + if ((void *)(entry) + sizeof(__u32) > last_base_addr || + (void *)XATTR_NEXT_ENTRY(entry) > last_base_addr) { + if (last_addr) + *last_addr = entry; + return NULL; + } + + if (entry->e_name_index != index) continue; - if (entry->e_name_len != name_len) + if (entry->e_name_len != len) continue; - if (!memcmp(entry->e_name, name, name_len)) { - found = 1; + if (!memcmp(entry->e_name, name, len)) break; + } + return entry; +} + +static struct f2fs_xattr_entry *__find_inline_xattr(struct inode *inode, + void *base_addr, void **last_addr, int index, + size_t len, const char *name) +{ + struct f2fs_xattr_entry *entry; + unsigned int inline_size = inline_xattr_size(inode); + void *max_addr = base_addr + inline_size; + + entry = __find_xattr(base_addr, max_addr, last_addr, index, len, name); + if (!entry) + return NULL; + + /* inline xattr header or entry across max inline xattr size */ + if (IS_XATTR_LAST_ENTRY(entry) && + (void *)entry + sizeof(__u32) > max_addr) { + *last_addr = entry; + return NULL; + } + return entry; +} + +static int read_inline_xattr(struct inode *inode, struct folio *ifolio, + void *txattr_addr) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + unsigned int inline_size = inline_xattr_size(inode); + struct folio *folio = NULL; + void *inline_addr; + + if (ifolio) { + inline_addr = inline_xattr_addr(inode, ifolio); + } else { + folio = f2fs_get_inode_folio(sbi, inode->i_ino); + if (IS_ERR(folio)) + return PTR_ERR(folio); + + inline_addr = inline_xattr_addr(inode, folio); + } + memcpy(txattr_addr, inline_addr, inline_size); + f2fs_folio_put(folio, true); + + return 0; +} + +static int read_xattr_block(struct inode *inode, void *txattr_addr) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + nid_t xnid = F2FS_I(inode)->i_xattr_nid; + unsigned int 
inline_size = inline_xattr_size(inode); + struct folio *xfolio; + void *xattr_addr; + + /* The inode already has an extended attribute block. */ + xfolio = f2fs_get_xnode_folio(sbi, xnid); + if (IS_ERR(xfolio)) + return PTR_ERR(xfolio); + + xattr_addr = folio_address(xfolio); + memcpy(txattr_addr + inline_size, xattr_addr, VALID_XATTR_BLOCK_SIZE); + f2fs_folio_put(xfolio, true); + + return 0; +} + +static int lookup_all_xattrs(struct inode *inode, struct folio *ifolio, + unsigned int index, unsigned int len, + const char *name, struct f2fs_xattr_entry **xe, + void **base_addr, int *base_size, + bool *is_inline) +{ + void *cur_addr, *txattr_addr, *last_txattr_addr; + void *last_addr = NULL; + nid_t xnid = F2FS_I(inode)->i_xattr_nid; + unsigned int inline_size = inline_xattr_size(inode); + int err; + + if (!xnid && !inline_size) + return -ENODATA; + + *base_size = XATTR_SIZE(inode) + XATTR_PADDING_SIZE; + txattr_addr = xattr_alloc(F2FS_I_SB(inode), *base_size, is_inline); + if (!txattr_addr) + return -ENOMEM; + + last_txattr_addr = (void *)txattr_addr + XATTR_SIZE(inode); + + /* read from inline xattr */ + if (inline_size) { + err = read_inline_xattr(inode, ifolio, txattr_addr); + if (err) + goto out; + + *xe = __find_inline_xattr(inode, txattr_addr, &last_addr, + index, len, name); + if (*xe) { + *base_size = inline_size; + goto check; } } - if (!found) { - error = -ENODATA; - goto cleanup; + + /* read from xattr node block */ + if (xnid) { + err = read_xattr_block(inode, txattr_addr); + if (err) + goto out; + } + + if (last_addr) + cur_addr = XATTR_HDR(last_addr) - 1; + else + cur_addr = txattr_addr; + + *xe = __find_xattr(cur_addr, last_txattr_addr, NULL, index, len, name); + if (!*xe) { + f2fs_err(F2FS_I_SB(inode), "lookup inode (%lu) has corrupted xattr", + inode->i_ino); + set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_FSCK); + err = -ENODATA; + f2fs_handle_error(F2FS_I_SB(inode), + ERROR_CORRUPTED_XATTR); + goto out; + } +check: + if (IS_XATTR_LAST_ENTRY(*xe)) { + 
err = -ENODATA; + goto out; + } + + *base_addr = txattr_addr; + return 0; +out: + xattr_free(F2FS_I_SB(inode), txattr_addr, *is_inline); + return err; +} + +static int read_all_xattrs(struct inode *inode, struct folio *ifolio, + void **base_addr) +{ + struct f2fs_xattr_header *header; + nid_t xnid = F2FS_I(inode)->i_xattr_nid; + unsigned int size = VALID_XATTR_BLOCK_SIZE; + unsigned int inline_size = inline_xattr_size(inode); + void *txattr_addr; + int err; + + txattr_addr = f2fs_kzalloc(F2FS_I_SB(inode), + inline_size + size + XATTR_PADDING_SIZE, GFP_NOFS); + if (!txattr_addr) + return -ENOMEM; + + /* read from inline xattr */ + if (inline_size) { + err = read_inline_xattr(inode, ifolio, txattr_addr); + if (err) + goto fail; + } + + /* read from xattr node block */ + if (xnid) { + err = read_xattr_block(inode, txattr_addr); + if (err) + goto fail; + } + + header = XATTR_HDR(txattr_addr); + + /* never been allocated xattrs */ + if (le32_to_cpu(header->h_magic) != F2FS_XATTR_MAGIC) { + header->h_magic = cpu_to_le32(F2FS_XATTR_MAGIC); + header->h_refcount = cpu_to_le32(1); } + *base_addr = txattr_addr; + return 0; +fail: + kfree(txattr_addr); + return err; +} - value_len = le16_to_cpu(entry->e_value_size); +static inline int write_all_xattrs(struct inode *inode, __u32 hsize, + void *txattr_addr, struct folio *ifolio) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + size_t inline_size = inline_xattr_size(inode); + struct folio *in_folio = NULL; + void *xattr_addr; + void *inline_addr = NULL; + struct folio *xfolio; + nid_t new_nid = 0; + int err = 0; + + if (hsize > inline_size && !F2FS_I(inode)->i_xattr_nid) + if (!f2fs_alloc_nid(sbi, &new_nid)) + return -ENOSPC; + + /* write to inline xattr */ + if (inline_size) { + if (ifolio) { + inline_addr = inline_xattr_addr(inode, ifolio); + } else { + in_folio = f2fs_get_inode_folio(sbi, inode->i_ino); + if (IS_ERR(in_folio)) { + f2fs_alloc_nid_failed(sbi, new_nid); + return PTR_ERR(in_folio); + } + inline_addr = 
inline_xattr_addr(inode, in_folio); + } - if (buffer && value_len > buffer_size) { + f2fs_folio_wait_writeback(ifolio ? ifolio : in_folio, + NODE, true, true); + /* no need to use xattr node block */ + if (hsize <= inline_size) { + err = f2fs_truncate_xattr_node(inode); + f2fs_alloc_nid_failed(sbi, new_nid); + if (err) { + f2fs_folio_put(in_folio, true); + return err; + } + memcpy(inline_addr, txattr_addr, inline_size); + folio_mark_dirty(ifolio ? ifolio : in_folio); + goto in_page_out; + } + } + + /* write to xattr node block */ + if (F2FS_I(inode)->i_xattr_nid) { + xfolio = f2fs_get_xnode_folio(sbi, F2FS_I(inode)->i_xattr_nid); + if (IS_ERR(xfolio)) { + err = PTR_ERR(xfolio); + f2fs_alloc_nid_failed(sbi, new_nid); + goto in_page_out; + } + f2fs_bug_on(sbi, new_nid); + f2fs_folio_wait_writeback(xfolio, NODE, true, true); + } else { + struct dnode_of_data dn; + + set_new_dnode(&dn, inode, NULL, NULL, new_nid); + xfolio = f2fs_new_node_folio(&dn, XATTR_NODE_OFFSET); + if (IS_ERR(xfolio)) { + err = PTR_ERR(xfolio); + f2fs_alloc_nid_failed(sbi, new_nid); + goto in_page_out; + } + f2fs_alloc_nid_done(sbi, new_nid); + } + xattr_addr = folio_address(xfolio); + + if (inline_size) + memcpy(inline_addr, txattr_addr, inline_size); + memcpy(xattr_addr, txattr_addr + inline_size, VALID_XATTR_BLOCK_SIZE); + + if (inline_size) + folio_mark_dirty(ifolio ? 
ifolio : in_folio); + folio_mark_dirty(xfolio); + + f2fs_folio_put(xfolio, true); +in_page_out: + f2fs_folio_put(in_folio, true); + return err; +} + +int f2fs_getxattr(struct inode *inode, int index, const char *name, + void *buffer, size_t buffer_size, struct folio *ifolio) +{ + struct f2fs_xattr_entry *entry = NULL; + int error; + unsigned int size, len; + void *base_addr = NULL; + int base_size; + bool is_inline; + + if (name == NULL) + return -EINVAL; + + len = strlen(name); + if (len > F2FS_NAME_LEN) + return -ERANGE; + + if (!ifolio) + f2fs_down_read(&F2FS_I(inode)->i_xattr_sem); + error = lookup_all_xattrs(inode, ifolio, index, len, name, + &entry, &base_addr, &base_size, &is_inline); + if (!ifolio) + f2fs_up_read(&F2FS_I(inode)->i_xattr_sem); + if (error) + return error; + + size = le16_to_cpu(entry->e_value_size); + + if (buffer && size > buffer_size) { error = -ERANGE; - goto cleanup; + goto out; } if (buffer) { char *pval = entry->e_name + entry->e_name_len; - memcpy(buffer, pval, value_len); - } - error = value_len; -cleanup: - f2fs_put_page(page, 1); + if (base_size - (pval - (char *)base_addr) < size) { + error = -ERANGE; + goto out; + } + memcpy(buffer, pval, size); + } + error = size; +out: + xattr_free(F2FS_I_SB(inode), base_addr, is_inline); return error; } ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) { - struct inode *inode = dentry->d_inode; - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); - struct f2fs_inode_info *fi = F2FS_I(inode); + struct inode *inode = d_inode(dentry); struct f2fs_xattr_entry *entry; - struct page *page; - void *base_addr; - int error = 0; + void *base_addr, *last_base_addr; + int error; size_t rest = buffer_size; - if (!fi->i_xattr_nid) - return 0; + f2fs_down_read(&F2FS_I(inode)->i_xattr_sem); + error = read_all_xattrs(inode, NULL, &base_addr); + f2fs_up_read(&F2FS_I(inode)->i_xattr_sem); + if (error) + return error; - page = get_node_page(sbi, fi->i_xattr_nid); - if (IS_ERR(page)) - 
return PTR_ERR(page); - base_addr = page_address(page); + last_base_addr = (void *)base_addr + XATTR_SIZE(inode); list_for_each_xattr(entry, base_addr) { - const struct xattr_handler *handler = - f2fs_xattr_handler(entry->e_name_index); + const char *prefix; + size_t prefix_len; size_t size; - if (!handler) - continue; + prefix = f2fs_xattr_prefix(entry->e_name_index, dentry); - size = handler->list(dentry, buffer, rest, entry->e_name, - entry->e_name_len, handler->flags); - if (buffer && size > rest) { - error = -ERANGE; - goto cleanup; + if ((void *)(entry) + sizeof(__u32) > last_base_addr || + (void *)XATTR_NEXT_ENTRY(entry) > last_base_addr) { + f2fs_err(F2FS_I_SB(inode), "list inode (%lu) has corrupted xattr", + inode->i_ino); + set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_FSCK); + f2fs_handle_error(F2FS_I_SB(inode), + ERROR_CORRUPTED_XATTR); + break; } - if (buffer) - buffer += size; + if (!prefix) + continue; + + prefix_len = strlen(prefix); + size = prefix_len + entry->e_name_len + 1; + if (buffer) { + if (size > rest) { + error = -ERANGE; + goto cleanup; + } + memcpy(buffer, prefix, prefix_len); + buffer += prefix_len; + memcpy(buffer, entry->e_name, entry->e_name_len); + buffer += entry->e_name_len; + *buffer++ = 0; + } rest -= size; } error = buffer_size - rest; cleanup: - f2fs_put_page(page, 1); + kfree(base_addr); return error; } -int f2fs_setxattr(struct inode *inode, int name_index, const char *name, - const void *value, size_t value_len, struct page *ipage) +static bool f2fs_xattr_value_same(struct f2fs_xattr_entry *entry, + const void *value, size_t size) { - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); - struct f2fs_inode_info *fi = F2FS_I(inode); - struct f2fs_xattr_header *header = NULL; + void *pval = entry->e_name + entry->e_name_len; + + return (le16_to_cpu(entry->e_value_size) == size) && + !memcmp(pval, value, size); +} + +static int __f2fs_setxattr(struct inode *inode, int index, + const char *name, const void *value, size_t size, + struct 
folio *ifolio, int flags) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_xattr_entry *here, *last; - struct page *page; - void *base_addr; - int error, found, free, newsize; - size_t name_len; - char *pval; - int ilock; + void *base_addr, *last_base_addr; + int found, newsize; + size_t len; + __u32 new_hsize; + int error; if (name == NULL) return -EINVAL; if (value == NULL) - value_len = 0; + size = 0; - name_len = strlen(name); + len = strlen(name); - if (name_len > F2FS_NAME_LEN || value_len > MAX_VALUE_LEN) + if (len > F2FS_NAME_LEN) return -ERANGE; - f2fs_balance_fs(sbi); - - ilock = mutex_lock_op(sbi); - - if (!fi->i_xattr_nid) { - /* Allocate new attribute block */ - struct dnode_of_data dn; + if (size > MAX_VALUE_LEN(inode)) + return -E2BIG; +retry: + error = read_all_xattrs(inode, ifolio, &base_addr); + if (error) + return error; - if (!alloc_nid(sbi, &fi->i_xattr_nid)) { - error = -ENOSPC; - goto exit; - } - set_new_dnode(&dn, inode, NULL, NULL, fi->i_xattr_nid); - mark_inode_dirty(inode); - - page = new_node_page(&dn, XATTR_NODE_OFFSET, ipage); - if (IS_ERR(page)) { - alloc_nid_failed(sbi, fi->i_xattr_nid); - fi->i_xattr_nid = 0; - error = PTR_ERR(page); - goto exit; - } + last_base_addr = (void *)base_addr + XATTR_SIZE(inode); - alloc_nid_done(sbi, fi->i_xattr_nid); - base_addr = page_address(page); - header = XATTR_HDR(base_addr); - header->h_magic = cpu_to_le32(F2FS_XATTR_MAGIC); - header->h_refcount = cpu_to_le32(1); - } else { - /* The inode already has an extended attribute block. */ - page = get_node_page(sbi, fi->i_xattr_nid); - if (IS_ERR(page)) { - error = PTR_ERR(page); - goto exit; + /* find entry with wanted name. 
*/ + here = __find_xattr(base_addr, last_base_addr, NULL, index, len, name); + if (!here) { + if (!F2FS_I(inode)->i_xattr_nid) { + error = f2fs_recover_xattr_data(inode, NULL); + f2fs_notice(F2FS_I_SB(inode), + "recover xattr in inode (%lu), error(%d)", + inode->i_ino, error); + if (!error) { + kfree(base_addr); + goto retry; + } } - - base_addr = page_address(page); - header = XATTR_HDR(base_addr); + f2fs_err(F2FS_I_SB(inode), "set inode (%lu) has corrupted xattr", + inode->i_ino); + set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_FSCK); + error = -EFSCORRUPTED; + f2fs_handle_error(F2FS_I_SB(inode), + ERROR_CORRUPTED_XATTR); + goto exit; } - if (le32_to_cpu(header->h_magic) != F2FS_XATTR_MAGIC) { - error = -EIO; - goto cleanup; - } + found = IS_XATTR_LAST_ENTRY(here) ? 0 : 1; - /* find entry with wanted name. */ - found = 0; - list_for_each_xattr(here, base_addr) { - if (here->e_name_index != name_index) - continue; - if (here->e_name_len != name_len) - continue; - if (!memcmp(here->e_name, name, name_len)) { - found = 1; - break; + if (found) { + if ((flags & XATTR_CREATE)) { + error = -EEXIST; + goto exit; } + + if (value && f2fs_xattr_value_same(here, value, size)) + goto same; + } else if ((flags & XATTR_REPLACE)) { + error = -ENODATA; + goto exit; } last = here; - - while (!IS_XATTR_LAST_ENTRY(last)) + while (!IS_XATTR_LAST_ENTRY(last)) { + if ((void *)(last) + sizeof(__u32) > last_base_addr || + (void *)XATTR_NEXT_ENTRY(last) > last_base_addr) { + f2fs_err(F2FS_I_SB(inode), "inode (%lu) has invalid last xattr entry, entry_size: %zu", + inode->i_ino, ENTRY_SIZE(last)); + set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_FSCK); + error = -EFSCORRUPTED; + f2fs_handle_error(F2FS_I_SB(inode), + ERROR_CORRUPTED_XATTR); + goto exit; + } last = XATTR_NEXT_ENTRY(last); + } - newsize = XATTR_ALIGN(sizeof(struct f2fs_xattr_entry) + - name_len + value_len); + newsize = XATTR_ALIGN(sizeof(struct f2fs_xattr_entry) + len + size); /* 1. 
Check space */ if (value) { - /* If value is NULL, it is remove operation. - * In case of update operation, we caculate free. + int free; + /* + * If value is NULL, it is remove operation. + * In case of update operation, we calculate free. */ - free = MIN_OFFSET - ((char *)last - (char *)header); + free = MIN_OFFSET(inode) - ((char *)last - (char *)base_addr); if (found) - free = free - ENTRY_SIZE(here); + free = free + ENTRY_SIZE(here); - if (free < newsize) { - error = -ENOSPC; - goto cleanup; + if (unlikely(free < newsize)) { + error = -E2BIG; + goto exit; } } /* 2. Remove old entry */ if (found) { - /* If entry is found, remove old entry. + /* + * If entry is found, remove old entry. * If not found, remove operation is not needed. */ struct f2fs_xattr_entry *next = XATTR_NEXT_ENTRY(here); @@ -465,37 +743,102 @@ int f2fs_setxattr(struct inode *inode, int name_index, const char *name, memset(last, 0, oldsize); } + new_hsize = (char *)last - (char *)base_addr; + /* 3. Write new entry */ if (value) { - /* Before we come here, old entry is removed. - * We just write new entry. */ - memset(last, 0, newsize); - last->e_name_index = name_index; - last->e_name_len = name_len; - memcpy(last->e_name, name, name_len); - pval = last->e_name + name_len; - memcpy(pval, value, value_len); - last->e_value_size = cpu_to_le16(value_len); + char *pval; + /* + * Before we come here, old entry is removed. + * We just write new entry. + */ + last->e_name_index = index; + last->e_name_len = len; + memcpy(last->e_name, name, len); + pval = last->e_name + len; + memcpy(pval, value, size); + last->e_value_size = cpu_to_le16(size); + new_hsize += newsize; + /* + * Explicitly add the null terminator. The unused xattr space + * is supposed to always be zeroed, which would make this + * unnecessary, but don't depend on that. 
+ */ + *(u32 *)((u8 *)last + newsize) = 0; } - set_page_dirty(page); - f2fs_put_page(page, 1); - - if (is_inode_flag_set(fi, FI_ACL_MODE)) { - inode->i_mode = fi->i_acl_mode; - inode->i_ctime = CURRENT_TIME; - clear_inode_flag(fi, FI_ACL_MODE); - } - if (ipage) - update_inode(inode, ipage); + error = write_all_xattrs(inode, new_hsize, base_addr, ifolio); + if (error) + goto exit; + + if (index == F2FS_XATTR_INDEX_ENCRYPTION && + !strcmp(name, F2FS_XATTR_NAME_ENCRYPTION_CONTEXT)) + f2fs_set_encrypted_inode(inode); + + if (!S_ISDIR(inode->i_mode)) + goto same; + /* + * In restrict mode, fsync() always try to trigger checkpoint for all + * metadata consistency, in other mode, it triggers checkpoint when + * parent's xattr metadata was updated. + */ + if (F2FS_OPTION(sbi).fsync_mode == FSYNC_MODE_STRICT) + set_sbi_flag(sbi, SBI_NEED_CP); else - update_inode_page(inode); - mutex_unlock_op(sbi, ilock); + f2fs_add_ino_entry(sbi, inode->i_ino, XATTR_DIR_INO); +same: + if (is_inode_flag_set(inode, FI_ACL_MODE)) { + inode->i_mode = F2FS_I(inode)->i_acl_mode; + clear_inode_flag(inode, FI_ACL_MODE); + } - return 0; -cleanup: - f2fs_put_page(page, 1); + inode_set_ctime_current(inode); + f2fs_mark_inode_dirty_sync(inode, true); exit: - mutex_unlock_op(sbi, ilock); + kfree(base_addr); return error; } + +int f2fs_setxattr(struct inode *inode, int index, const char *name, + const void *value, size_t size, + struct folio *ifolio, int flags) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + int err; + + if (unlikely(f2fs_cp_error(sbi))) + return -EIO; + if (!f2fs_is_checkpoint_ready(sbi)) + return -ENOSPC; + + err = f2fs_dquot_initialize(inode); + if (err) + return err; + + /* this case is only from f2fs_init_inode_metadata */ + if (ifolio) + return __f2fs_setxattr(inode, index, name, value, + size, ifolio, flags); + f2fs_balance_fs(sbi, true); + + f2fs_lock_op(sbi); + f2fs_down_write(&F2FS_I(inode)->i_xattr_sem); + err = __f2fs_setxattr(inode, index, name, value, size, NULL, 
flags); + f2fs_up_write(&F2FS_I(inode)->i_xattr_sem); + f2fs_unlock_op(sbi); + + f2fs_update_time(sbi, REQ_TIME); + return err; +} + +int __init f2fs_init_xattr_cache(void) +{ + inline_xattr_slab = f2fs_kmem_cache_create("f2fs_xattr_entry", + DEFAULT_XATTR_SLAB_SIZE); + return inline_xattr_slab ? 0 : -ENOMEM; +} + +void f2fs_destroy_xattr_cache(void) +{ + kmem_cache_destroy(inline_xattr_slab); +}
\ No newline at end of file diff --git a/fs/f2fs/xattr.h b/fs/f2fs/xattr.h index 3c0817bef25d..bce3d93e4755 100644 --- a/fs/f2fs/xattr.h +++ b/fs/f2fs/xattr.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * fs/f2fs/xattr.h * @@ -9,10 +10,6 @@ * On-disk format of extended attributes for the ext2 filesystem. * * (C) 2001 Andreas Gruenbacher, <a.gruenbacher@computer.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. */ #ifndef __F2FS_XATTR_H__ #define __F2FS_XATTR_H__ @@ -27,7 +24,7 @@ #define F2FS_XATTR_REFCOUNT_MAX 1024 /* Name indexes */ -#define F2FS_SYSTEM_ADVISE_PREFIX "system.advise" +#define F2FS_SYSTEM_ADVISE_NAME "system.advise" #define F2FS_XATTR_INDEX_USER 1 #define F2FS_XATTR_INDEX_POSIX_ACL_ACCESS 2 #define F2FS_XATTR_INDEX_POSIX_ACL_DEFAULT 3 @@ -35,6 +32,12 @@ #define F2FS_XATTR_INDEX_LUSTRE 5 #define F2FS_XATTR_INDEX_SECURITY 6 #define F2FS_XATTR_INDEX_ADVISE 7 +/* Should be same as EXT4_XATTR_INDEX_ENCRYPTION */ +#define F2FS_XATTR_INDEX_ENCRYPTION 9 +#define F2FS_XATTR_INDEX_VERITY 11 + +#define F2FS_XATTR_NAME_ENCRYPTION_CONTEXT "c" +#define F2FS_XATTR_NAME_VERITY "v" struct f2fs_xattr_header { __le32 h_magic; /* magic number for identification */ @@ -46,18 +49,18 @@ struct f2fs_xattr_entry { __u8 e_name_index; __u8 e_name_len; __le16 e_value_size; /* size of attribute value */ - char e_name[0]; /* attribute name */ + char e_name[]; /* attribute name */ }; #define XATTR_HDR(ptr) ((struct f2fs_xattr_header *)(ptr)) #define XATTR_ENTRY(ptr) ((struct f2fs_xattr_entry *)(ptr)) -#define XATTR_FIRST_ENTRY(ptr) (XATTR_ENTRY(XATTR_HDR(ptr)+1)) +#define XATTR_FIRST_ENTRY(ptr) (XATTR_ENTRY(XATTR_HDR(ptr) + 1)) #define XATTR_ROUND (3) -#define XATTR_ALIGN(size) ((size + XATTR_ROUND) & ~XATTR_ROUND) +#define XATTR_ALIGN(size) (((size) + XATTR_ROUND) & ~XATTR_ROUND) #define ENTRY_SIZE(entry) 
(XATTR_ALIGN(sizeof(struct f2fs_xattr_entry) + \ - entry->e_name_len + le16_to_cpu(entry->e_value_size))) + (entry)->e_name_len + le16_to_cpu((entry)->e_value_size))) #define XATTR_NEXT_ENTRY(entry) ((struct f2fs_xattr_entry *)((char *)(entry) +\ ENTRY_SIZE(entry))) @@ -68,18 +71,30 @@ struct f2fs_xattr_entry { for (entry = XATTR_FIRST_ENTRY(addr);\ !IS_XATTR_LAST_ENTRY(entry);\ entry = XATTR_NEXT_ENTRY(entry)) - - -#define MIN_OFFSET XATTR_ALIGN(PAGE_SIZE - \ - sizeof(struct node_footer) - \ - sizeof(__u32)) - -#define MAX_VALUE_LEN (MIN_OFFSET - sizeof(struct f2fs_xattr_header) - \ - sizeof(struct f2fs_xattr_entry)) +#define VALID_XATTR_BLOCK_SIZE (PAGE_SIZE - sizeof(struct node_footer)) +#define XATTR_PADDING_SIZE (sizeof(__u32)) +#define XATTR_SIZE(i) ((F2FS_I(i)->i_xattr_nid ? \ + VALID_XATTR_BLOCK_SIZE : 0) + \ + (inline_xattr_size(i))) +#define MIN_OFFSET(i) XATTR_ALIGN(inline_xattr_size(i) + \ + VALID_XATTR_BLOCK_SIZE) + +#define MAX_VALUE_LEN(i) (MIN_OFFSET(i) - \ + sizeof(struct f2fs_xattr_header) - \ + sizeof(struct f2fs_xattr_entry)) + +#define MIN_INLINE_XATTR_SIZE (sizeof(struct f2fs_xattr_header) / sizeof(__le32)) +#define MAX_INLINE_XATTR_SIZE \ + (DEF_ADDRS_PER_INODE - \ + F2FS_TOTAL_EXTRA_ATTR_SIZE / sizeof(__le32) - \ + DEF_INLINE_RESERVED_SIZE - \ + MIN_INLINE_DENTRY_SIZE / sizeof(__le32)) +#define DEFAULT_XATTR_SLAB_SIZE (DEFAULT_INLINE_XATTR_ADDRS * \ + sizeof(__le32) + XATTR_PADDING_SIZE) /* * On-disk structure of f2fs_xattr - * We use only 1 block for xattr. + * We use inline xattrs space + 1 block for xattr. 
* * +--------------------+ * | f2fs_xattr_header | @@ -109,43 +124,44 @@ struct f2fs_xattr_entry { #ifdef CONFIG_F2FS_FS_XATTR extern const struct xattr_handler f2fs_xattr_user_handler; extern const struct xattr_handler f2fs_xattr_trusted_handler; -extern const struct xattr_handler f2fs_xattr_acl_access_handler; -extern const struct xattr_handler f2fs_xattr_acl_default_handler; extern const struct xattr_handler f2fs_xattr_advise_handler; extern const struct xattr_handler f2fs_xattr_security_handler; -extern const struct xattr_handler *f2fs_xattr_handlers[]; +extern const struct xattr_handler * const f2fs_xattr_handlers[]; -extern int f2fs_setxattr(struct inode *, int, const char *, - const void *, size_t, struct page *); -extern int f2fs_getxattr(struct inode *, int, const char *, void *, size_t); -extern ssize_t f2fs_listxattr(struct dentry *, char *, size_t); +int f2fs_setxattr(struct inode *, int, const char *, const void *, + size_t, struct folio *, int); +int f2fs_getxattr(struct inode *, int, const char *, void *, + size_t, struct folio *); +ssize_t f2fs_listxattr(struct dentry *, char *, size_t); +int __init f2fs_init_xattr_cache(void); +void f2fs_destroy_xattr_cache(void); #else #define f2fs_xattr_handlers NULL -static inline int f2fs_setxattr(struct inode *inode, int name_index, - const char *name, const void *value, size_t value_len) -{ - return -EOPNOTSUPP; -} -static inline int f2fs_getxattr(struct inode *inode, int name_index, - const char *name, void *buffer, size_t buffer_size) +#define f2fs_listxattr NULL +static inline int f2fs_setxattr(struct inode *inode, int index, + const char *name, const void *value, size_t size, + struct folio *folio, int flags) { return -EOPNOTSUPP; } -static inline ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, - size_t buffer_size) +static inline int f2fs_getxattr(struct inode *inode, int index, + const char *name, void *buffer, + size_t buffer_size, struct folio *dfolio) { return -EOPNOTSUPP; } +static 
inline int __init f2fs_init_xattr_cache(void) { return 0; } +static inline void f2fs_destroy_xattr_cache(void) { } #endif #ifdef CONFIG_F2FS_FS_SECURITY -extern int f2fs_init_security(struct inode *, struct inode *, - const struct qstr *, struct page *); +int f2fs_init_security(struct inode *, struct inode *, + const struct qstr *, struct folio *); #else static inline int f2fs_init_security(struct inode *inode, struct inode *dir, - const struct qstr *qstr, struct page *ipage) + const struct qstr *qstr, struct folio *ifolio) { return 0; } |
