Diffstat (limited to 'fs')
-rw-r--r--  fs/9p/v9fs.c | 2
-rw-r--r--  fs/9p/vfs_addr.c | 2
-rw-r--r--  fs/9p/vfs_dir.c | 7
-rw-r--r--  fs/9p/vfs_file.c | 7
-rw-r--r--  fs/9p/vfs_inode.c | 3
-rw-r--r--  fs/9p/vfs_inode_dotl.c | 7
-rw-r--r--  fs/ceph/file.c | 8
-rw-r--r--  fs/cifs/cifsproto.h | 21
-rw-r--r--  fs/cifs/cifssmb.c | 43
-rw-r--r--  fs/cifs/connect.c | 31
-rw-r--r--  fs/cifs/file.c | 11
-rw-r--r--  fs/cifs/misc.c | 111
-rw-r--r--  fs/cifs/smb2inode.c | 21
-rw-r--r--  fs/cifs/smb2ops.c | 23
-rw-r--r--  fs/cifs/smb2pdu.c | 82
-rw-r--r--  fs/cifs/smbdirect.c | 2
-rw-r--r--  fs/cramfs/inode.c | 2
-rw-r--r--  fs/exfat/dir.c | 90
-rw-r--r--  fs/exfat/exfat_fs.h | 4
-rw-r--r--  fs/exfat/exfat_raw.h | 21
-rw-r--r--  fs/exfat/fatent.c | 32
-rw-r--r--  fs/exfat/file.c | 3
-rw-r--r--  fs/exfat/inode.c | 6
-rw-r--r--  fs/exfat/namei.c | 2
-rw-r--r--  fs/exfat/super.c | 3
-rw-r--r--  fs/ext4/ext4.h | 1
-rw-r--r--  fs/ext4/extents.c | 2
-rw-r--r--  fs/ext4/fast_commit.c | 44
-rw-r--r--  fs/ext4/file.c | 34
-rw-r--r--  fs/ext4/inode.c | 20
-rw-r--r--  fs/ext4/ioctl.c | 3
-rw-r--r--  fs/ext4/namei.c | 11
-rw-r--r--  fs/ext4/super.c | 56
-rw-r--r--  fs/ext4/xattr.c | 172
-rw-r--r--  fs/f2fs/checkpoint.c | 37
-rw-r--r--  fs/f2fs/compress.c | 24
-rw-r--r--  fs/f2fs/data.c | 624
-rw-r--r--  fs/f2fs/debug.c | 64
-rw-r--r--  fs/f2fs/dir.c | 4
-rw-r--r--  fs/f2fs/extent_cache.c | 60
-rw-r--r--  fs/f2fs/f2fs.h | 128
-rw-r--r--  fs/f2fs/file.c | 173
-rw-r--r--  fs/f2fs/gc.c | 22
-rw-r--r--  fs/f2fs/gc.h | 2
-rw-r--r--  fs/f2fs/inline.c | 14
-rw-r--r--  fs/f2fs/inode.c | 78
-rw-r--r--  fs/f2fs/iostat.c | 186
-rw-r--r--  fs/f2fs/iostat.h | 19
-rw-r--r--  fs/f2fs/namei.c | 5
-rw-r--r--  fs/f2fs/node.c | 9
-rw-r--r--  fs/f2fs/segment.c | 225
-rw-r--r--  fs/f2fs/segment.h | 41
-rw-r--r--  fs/f2fs/super.c | 63
-rw-r--r--  fs/f2fs/sysfs.c | 49
-rw-r--r--  fs/f2fs/verity.c | 2
-rw-r--r--  fs/hfsplus/super.c | 4
-rw-r--r--  fs/hostfs/hostfs_kern.c | 15
-rw-r--r--  fs/jbd2/transaction.c | 50
-rw-r--r--  fs/jffs2/compr.c | 50
-rw-r--r--  fs/jffs2/compr.h | 26
-rw-r--r--  fs/jffs2/file.c | 15
-rw-r--r--  fs/jffs2/fs.c | 2
-rw-r--r--  fs/jfs/jfs_dmap.c | 3
-rw-r--r--  fs/netfs/iterator.c | 2
-rw-r--r--  fs/ocfs2/move_extents.c | 34
-rw-r--r--  fs/open.c | 38
-rw-r--r--  fs/proc/array.c | 7
-rw-r--r--  fs/ubifs/budget.c | 9
-rw-r--r--  fs/ubifs/dir.c | 18
-rw-r--r--  fs/ubifs/file.c | 31
-rw-r--r--  fs/ubifs/io.c | 6
-rw-r--r--  fs/ubifs/journal.c | 8
-rw-r--r--  fs/ubifs/super.c | 17
-rw-r--r--  fs/ubifs/sysfs.c | 6
-rw-r--r--  fs/ubifs/tnc.c | 24
-rw-r--r--  fs/ubifs/ubifs.h | 5
-rw-r--r--  fs/udf/inode.c | 14
-rw-r--r--  fs/xfs/libxfs/xfs_ag.c | 93
-rw-r--r--  fs/xfs/libxfs/xfs_ag.h | 111
-rw-r--r--  fs/xfs/libxfs/xfs_ag_resv.c | 2
-rw-r--r--  fs/xfs/libxfs/xfs_alloc.c | 683
-rw-r--r--  fs/xfs/libxfs/xfs_alloc.h | 61
-rw-r--r--  fs/xfs/libxfs/xfs_alloc_btree.c | 2
-rw-r--r--  fs/xfs/libxfs/xfs_bmap.c | 664
-rw-r--r--  fs/xfs/libxfs/xfs_bmap.h | 7
-rw-r--r--  fs/xfs/libxfs/xfs_bmap_btree.c | 64
-rw-r--r--  fs/xfs/libxfs/xfs_btree.c | 2
-rw-r--r--  fs/xfs/libxfs/xfs_ialloc.c | 242
-rw-r--r--  fs/xfs/libxfs/xfs_ialloc.h | 5
-rw-r--r--  fs/xfs/libxfs/xfs_ialloc_btree.c | 47
-rw-r--r--  fs/xfs/libxfs/xfs_ialloc_btree.h | 20
-rw-r--r--  fs/xfs/libxfs/xfs_refcount_btree.c | 10
-rw-r--r--  fs/xfs/libxfs/xfs_rmap_btree.c | 2
-rw-r--r--  fs/xfs/libxfs/xfs_sb.c | 3
-rw-r--r--  fs/xfs/scrub/agheader_repair.c | 35
-rw-r--r--  fs/xfs/scrub/bmap.c | 2
-rw-r--r--  fs/xfs/scrub/common.c | 21
-rw-r--r--  fs/xfs/scrub/fscounters.c | 13
-rw-r--r--  fs/xfs/scrub/repair.c | 7
-rw-r--r--  fs/xfs/xfs_bmap_util.c | 2
-rw-r--r--  fs/xfs/xfs_discard.c | 50
-rw-r--r--  fs/xfs/xfs_filestream.c | 455
-rw-r--r--  fs/xfs/xfs_filestream.h | 6
-rw-r--r--  fs/xfs/xfs_fsmap.c | 5
-rw-r--r--  fs/xfs/xfs_icache.c | 8
-rw-r--r--  fs/xfs/xfs_inode.c | 2
-rw-r--r--  fs/xfs/xfs_iwalk.c | 10
-rw-r--r--  fs/xfs/xfs_mount.h | 3
-rw-r--r--  fs/xfs/xfs_reflink.c | 4
-rw-r--r--  fs/xfs/xfs_super.c | 47
-rw-r--r--  fs/xfs/xfs_trace.h | 81
-rw-r--r--  fs/xfs/xfs_trans.c | 8
-rw-r--r--  fs/xfs/xfs_trans.h | 2
113 files changed, 3276 insertions(+), 2603 deletions(-)
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index 3a9c4517265f..61a51b90600d 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -468,7 +468,7 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
#ifdef CONFIG_9P_FSCACHE
/* register the session for caching */
- if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
+ if (v9ses->cache == CACHE_FSCACHE) {
rc = v9fs_cache_session_get_cookie(v9ses, dev_name);
if (rc < 0)
goto err_clnt;
diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c
index 97599edbc300..6f46d7e4c750 100644
--- a/fs/9p/vfs_addr.c
+++ b/fs/9p/vfs_addr.c
@@ -279,8 +279,6 @@ static int v9fs_write_begin(struct file *filp, struct address_space *mapping,
p9_debug(P9_DEBUG_VFS, "filp %p, mapping %p\n", filp, mapping);
- BUG_ON(!v9inode->writeback_fid);
-
/* Prefetch area to be written into the cache if we're caching this
* file. We need to do this before we get a lock on the page in case
* there's more than one writer competing for the same cache block.
diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c
index 59b0e8948f78..3d74b04fe0de 100644
--- a/fs/9p/vfs_dir.c
+++ b/fs/9p/vfs_dir.c
@@ -197,7 +197,7 @@ static int v9fs_dir_readdir_dotl(struct file *file, struct dir_context *ctx)
/**
- * v9fs_dir_release - close a directory
+ * v9fs_dir_release - called on a close of a file or directory
* @inode: inode of the directory
* @filp: file pointer to a directory
*
@@ -209,6 +209,7 @@ int v9fs_dir_release(struct inode *inode, struct file *filp)
struct p9_fid *fid;
__le32 version;
loff_t i_size;
+ int retval = 0;
fid = filp->private_data;
p9_debug(P9_DEBUG_VFS, "inode: %p filp: %p fid: %d\n",
@@ -217,7 +218,7 @@ int v9fs_dir_release(struct inode *inode, struct file *filp)
spin_lock(&inode->i_lock);
hlist_del(&fid->ilist);
spin_unlock(&inode->i_lock);
- p9_fid_put(fid);
+ retval = p9_fid_put(fid);
}
if ((filp->f_mode & FMODE_WRITE)) {
@@ -228,7 +229,7 @@ int v9fs_dir_release(struct inode *inode, struct file *filp)
} else {
fscache_unuse_cookie(v9fs_inode_cookie(v9inode), NULL, NULL);
}
- return 0;
+ return retval;
}
const struct file_operations v9fs_dir_operations = {
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index b6ba22975781..44c15eb2b908 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -74,8 +74,7 @@ int v9fs_file_open(struct inode *inode, struct file *file)
}
mutex_lock(&v9inode->v_mutex);
- if ((v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) &&
- !v9inode->writeback_fid &&
+ if ((v9ses->cache) && !v9inode->writeback_fid &&
((file->f_flags & O_ACCMODE) != O_RDONLY)) {
/*
* clone a fid and add it to writeback_fid
@@ -93,9 +92,11 @@ int v9fs_file_open(struct inode *inode, struct file *file)
v9inode->writeback_fid = (void *) writeback_fid;
}
mutex_unlock(&v9inode->v_mutex);
- if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE)
+#ifdef CONFIG_9P_FSCACHE
+ if (v9ses->cache == CACHE_FSCACHE)
fscache_use_cookie(v9fs_inode_cookie(v9inode),
file->f_mode & FMODE_WRITE);
+#endif
v9fs_open_fid_add(inode, &fid);
return 0;
out_error:
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 4344e7a7865f..1d523bec0a94 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -843,8 +843,7 @@ v9fs_vfs_atomic_open(struct inode *dir, struct dentry *dentry,
inode = d_inode(dentry);
v9inode = V9FS_I(inode);
mutex_lock(&v9inode->v_mutex);
- if ((v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) &&
- !v9inode->writeback_fid &&
+ if ((v9ses->cache) && !v9inode->writeback_fid &&
((flags & O_ACCMODE) != O_RDONLY)) {
/*
* clone a fid and add it to writeback_fid
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index 3bed3eb3a0e2..331ed60d8fcb 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -316,8 +316,7 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry,
v9inode = V9FS_I(inode);
mutex_lock(&v9inode->v_mutex);
- if ((v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) &&
- !v9inode->writeback_fid &&
+ if ((v9ses->cache) && !v9inode->writeback_fid &&
((flags & O_ACCMODE) != O_RDONLY)) {
/*
* clone a fid and add it to writeback_fid
@@ -340,9 +339,11 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry,
if (err)
goto out;
file->private_data = ofid;
- if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE)
+#ifdef CONFIG_9P_FSCACHE
+ if (v9ses->cache == CACHE_FSCACHE)
fscache_use_cookie(v9fs_inode_cookie(v9inode),
file->f_mode & FMODE_WRITE);
+#endif
v9fs_open_fid_add(inode, &ofid);
file->f_mode |= FMODE_CREATED;
out:
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 5dcc62e678c4..f4d8bf7dec88 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -2098,6 +2098,9 @@ static long ceph_fallocate(struct file *file, int mode,
loff_t endoff = 0;
loff_t size;
+ dout("%s %p %llx.%llx mode %x, offset %llu length %llu\n", __func__,
+ inode, ceph_vinop(inode), mode, offset, length);
+
if (mode != (FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
return -EOPNOTSUPP;
@@ -2132,6 +2135,10 @@ static long ceph_fallocate(struct file *file, int mode,
if (ret < 0)
goto unlock;
+ ret = file_modified(file);
+ if (ret)
+ goto put_caps;
+
filemap_invalidate_lock(inode->i_mapping);
ceph_fscache_invalidate(inode, false);
ceph_zero_pagecache_range(inode, offset, length);
@@ -2147,6 +2154,7 @@ static long ceph_fallocate(struct file *file, int mode,
}
filemap_invalidate_unlock(inode->i_mapping);
+put_caps:
ceph_put_cap_refs(ci, got);
unlock:
inode_unlock(inode);
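
Annotation: the two ceph_fallocate() hunks above add a file_modified() call (which strips setuid/setgid bits and updates timestamps) after the capability references are taken but before the page cache is touched, plus a put_caps label so a failure still drops those references. A minimal sketch of the resulting unwind order, with the surrounding checks elided:

        ret = file_modified(file);      /* may fail, e.g. while updating times */
        if (ret)
                goto put_caps;          /* new label: still drop the cap refs */

        filemap_invalidate_lock(inode->i_mapping);
        /* ... invalidate fscache, zero the punched range, truncate ... */
        filemap_invalidate_unlock(inode->i_mapping);

put_caps:
        ceph_put_cap_refs(ci, got);
unlock:
        inode_unlock(inode);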
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index b7a36ebd0f2f..e2eff66eefab 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -667,11 +667,21 @@ static inline int get_dfs_path(const unsigned int xid, struct cifs_ses *ses,
int match_target_ip(struct TCP_Server_Info *server,
const char *share, size_t share_len,
bool *result);
-
-int cifs_dfs_query_info_nonascii_quirk(const unsigned int xid,
- struct cifs_tcon *tcon,
- struct cifs_sb_info *cifs_sb,
- const char *dfs_link_path);
+int cifs_inval_name_dfs_link_error(const unsigned int xid,
+ struct cifs_tcon *tcon,
+ struct cifs_sb_info *cifs_sb,
+ const char *full_path,
+ bool *islink);
+#else
+static inline int cifs_inval_name_dfs_link_error(const unsigned int xid,
+ struct cifs_tcon *tcon,
+ struct cifs_sb_info *cifs_sb,
+ const char *full_path,
+ bool *islink)
+{
+ *islink = false;
+ return 0;
+}
#endif
static inline int cifs_create_options(struct cifs_sb_info *cifs_sb, int options)
@@ -684,5 +694,6 @@ static inline int cifs_create_options(struct cifs_sb_info *cifs_sb, int options)
struct super_block *cifs_get_tcon_super(struct cifs_tcon *tcon);
void cifs_put_tcon_super(struct super_block *sb);
+int cifs_wait_for_server_reconnect(struct TCP_Server_Info *server, bool retry);
#endif /* _CIFSPROTO_H */
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index a24e4ddf8043..a43c78396dd8 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -72,7 +72,6 @@ cifs_reconnect_tcon(struct cifs_tcon *tcon, int smb_command)
struct cifs_ses *ses;
struct TCP_Server_Info *server;
struct nls_table *nls_codepage;
- int retries;
/*
* SMBs NegProt, SessSetup, uLogoff do not have tcon yet so check for
@@ -102,45 +101,9 @@ cifs_reconnect_tcon(struct cifs_tcon *tcon, int smb_command)
}
spin_unlock(&tcon->tc_lock);
- retries = server->nr_targets;
-
- /*
- * Give demultiplex thread up to 10 seconds to each target available for
- * reconnect -- should be greater than cifs socket timeout which is 7
- * seconds.
- */
- while (server->tcpStatus == CifsNeedReconnect) {
- rc = wait_event_interruptible_timeout(server->response_q,
- (server->tcpStatus != CifsNeedReconnect),
- 10 * HZ);
- if (rc < 0) {
- cifs_dbg(FYI, "%s: aborting reconnect due to a received signal by the process\n",
- __func__);
- return -ERESTARTSYS;
- }
-
- /* are we still trying to reconnect? */
- spin_lock(&server->srv_lock);
- if (server->tcpStatus != CifsNeedReconnect) {
- spin_unlock(&server->srv_lock);
- break;
- }
- spin_unlock(&server->srv_lock);
-
- if (retries && --retries)
- continue;
-
- /*
- * on "soft" mounts we wait once. Hard mounts keep
- * retrying until process is killed or server comes
- * back on-line
- */
- if (!tcon->retry) {
- cifs_dbg(FYI, "gave up waiting on reconnect in smb_init\n");
- return -EHOSTDOWN;
- }
- retries = server->nr_targets;
- }
+ rc = cifs_wait_for_server_reconnect(server, tcon->retry);
+ if (rc)
+ return rc;
spin_lock(&ses->chan_lock);
if (!cifs_chan_needs_reconnect(ses, server) && !tcon->need_reconnect) {
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index ec020d860be3..5233f14f0636 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -1294,7 +1294,8 @@ cifs_match_ipaddr(struct sockaddr *srcaddr, struct sockaddr *rhs)
case AF_INET6: {
struct sockaddr_in6 *saddr6 = (struct sockaddr_in6 *)srcaddr;
struct sockaddr_in6 *vaddr6 = (struct sockaddr_in6 *)rhs;
- return ipv6_addr_equal(&saddr6->sin6_addr, &vaddr6->sin6_addr);
+ return (ipv6_addr_equal(&saddr6->sin6_addr, &vaddr6->sin6_addr)
+ && saddr6->sin6_scope_id == vaddr6->sin6_scope_id);
}
default:
WARN_ON(1);
@@ -1343,32 +1344,8 @@ match_port(struct TCP_Server_Info *server, struct sockaddr *addr)
static bool match_server_address(struct TCP_Server_Info *server, struct sockaddr *addr)
{
- switch (addr->sa_family) {
- case AF_INET: {
- struct sockaddr_in *addr4 = (struct sockaddr_in *)addr;
- struct sockaddr_in *srv_addr4 =
- (struct sockaddr_in *)&server->dstaddr;
-
- if (addr4->sin_addr.s_addr != srv_addr4->sin_addr.s_addr)
- return false;
- break;
- }
- case AF_INET6: {
- struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *)addr;
- struct sockaddr_in6 *srv_addr6 =
- (struct sockaddr_in6 *)&server->dstaddr;
-
- if (!ipv6_addr_equal(&addr6->sin6_addr,
- &srv_addr6->sin6_addr))
- return false;
- if (addr6->sin6_scope_id != srv_addr6->sin6_scope_id)
- return false;
- break;
- }
- default:
- WARN_ON(1);
- return false; /* don't expect to be here */
- }
+ if (!cifs_match_ipaddr(addr, (struct sockaddr *)&server->dstaddr))
+ return false;
return true;
}
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index ebfcaae8c437..4d4a2d82636d 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -52,6 +52,8 @@ static void cifs_undirty_folios(struct inode *inode, loff_t start, unsigned int
end = (start + len - 1) / PAGE_SIZE;
xas_for_each_marked(&xas, folio, end, PAGECACHE_TAG_DIRTY) {
+ if (xas_retry(&xas, folio))
+ continue;
xas_pause(&xas);
rcu_read_unlock();
folio_lock(folio);
@@ -81,6 +83,8 @@ void cifs_pages_written_back(struct inode *inode, loff_t start, unsigned int len
end = (start + len - 1) / PAGE_SIZE;
xas_for_each(&xas, folio, end) {
+ if (xas_retry(&xas, folio))
+ continue;
if (!folio_test_writeback(folio)) {
WARN_ONCE(1, "bad %x @%llx page %lx %lx\n",
len, start, folio_index(folio), end);
@@ -112,6 +116,8 @@ void cifs_pages_write_failed(struct inode *inode, loff_t start, unsigned int len
end = (start + len - 1) / PAGE_SIZE;
xas_for_each(&xas, folio, end) {
+ if (xas_retry(&xas, folio))
+ continue;
if (!folio_test_writeback(folio)) {
WARN_ONCE(1, "bad %x @%llx page %lx %lx\n",
len, start, folio_index(folio), end);
@@ -2839,6 +2845,7 @@ err_xid:
free_xid(xid);
if (rc == 0) {
wbc->nr_to_write = count;
+ rc = len;
} else if (is_retryable_error(rc)) {
cifs_pages_write_redirty(inode, start, len);
} else {
@@ -3605,7 +3612,7 @@ static ssize_t __cifs_writev(
ctx->nr_pinned_pages = rc;
ctx->bv = (void *)ctx->iter.bvec;
- ctx->bv_need_unpin = iov_iter_extract_will_pin(&ctx->iter);
+ ctx->bv_need_unpin = iov_iter_extract_will_pin(from);
} else if ((iov_iter_is_bvec(from) || iov_iter_is_kvec(from)) &&
!is_sync_kiocb(iocb)) {
/*
@@ -4141,7 +4148,7 @@ static ssize_t __cifs_readv(
ctx->nr_pinned_pages = rc;
ctx->bv = (void *)ctx->iter.bvec;
- ctx->bv_need_unpin = iov_iter_extract_will_pin(&ctx->iter);
+ ctx->bv_need_unpin = iov_iter_extract_will_pin(to);
ctx->should_dirty = true;
} else if ((iov_iter_is_bvec(to) || iov_iter_is_kvec(to)) &&
!is_sync_kiocb(iocb)) {
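
Annotation: the three xas_retry() hunks in fs/cifs/file.c above all apply the same XArray walking idiom — an entry observed during an RCU-protected iteration may be a transient retry marker rather than a folio, and must be skipped before it is dereferenced. A minimal sketch of the idiom, independent of the cifs specifics (mapping, start and end stand in for the values computed in those functions):

        XA_STATE(xas, &mapping->i_pages, start);
        struct folio *folio;

        rcu_read_lock();
        xas_for_each(&xas, folio, end) {
                if (xas_retry(&xas, folio))
                        continue;       /* raced with a store; redo this step */
                /* folio is now safe to inspect under RCU */
        }
        rcu_read_unlock();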
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index 2905734eb289..a0d286ee723d 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -21,6 +21,7 @@
#include "cifsfs.h"
#ifdef CONFIG_CIFS_DFS_UPCALL
#include "dns_resolve.h"
+#include "dfs_cache.h"
#endif
#include "fs_context.h"
#include "cached_dir.h"
@@ -1198,4 +1199,114 @@ int cifs_update_super_prepath(struct cifs_sb_info *cifs_sb, char *prefix)
cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_USE_PREFIX_PATH;
return 0;
}
+
+/*
+ * Handle weird Windows SMB server behaviour. It responds with
+ * STATUS_OBJECT_NAME_INVALID code to SMB2 QUERY_INFO request for
+ * "\<server>\<dfsname>\<linkpath>" DFS reference, where <dfsname> contains
+ * non-ASCII unicode symbols.
+ */
+int cifs_inval_name_dfs_link_error(const unsigned int xid,
+ struct cifs_tcon *tcon,
+ struct cifs_sb_info *cifs_sb,
+ const char *full_path,
+ bool *islink)
+{
+ struct cifs_ses *ses = tcon->ses;
+ size_t len;
+ char *path;
+ char *ref_path;
+
+ *islink = false;
+
+ /*
+ * Fast path - skip check when @full_path doesn't have a prefix path to
+ * look up or tcon is not DFS.
+ */
+ if (strlen(full_path) < 2 || !cifs_sb ||
+ (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_DFS) ||
+ !is_tcon_dfs(tcon) || !ses->server->origin_fullpath)
+ return 0;
+
+ /*
+ * Slow path - tcon is DFS and @full_path has prefix path, so attempt
+ * to get a referral to figure out whether it is a DFS link.
+ */
+ len = strnlen(tcon->tree_name, MAX_TREE_SIZE + 1) + strlen(full_path) + 1;
+ path = kmalloc(len, GFP_KERNEL);
+ if (!path)
+ return -ENOMEM;
+
+ scnprintf(path, len, "%s%s", tcon->tree_name, full_path);
+ ref_path = dfs_cache_canonical_path(path + 1, cifs_sb->local_nls,
+ cifs_remap(cifs_sb));
+ kfree(path);
+
+ if (IS_ERR(ref_path)) {
+ if (PTR_ERR(ref_path) != -EINVAL)
+ return PTR_ERR(ref_path);
+ } else {
+ struct dfs_info3_param *refs = NULL;
+ int num_refs = 0;
+
+ /*
+ * XXX: we are not using dfs_cache_find() here because we might
+ * end up filling all the DFS cache and thus potentially
+ * removing cached DFS targets that the client would eventually
+ * need during failover.
+ */
+ if (ses->server->ops->get_dfs_refer &&
+ !ses->server->ops->get_dfs_refer(xid, ses, ref_path, &refs,
+ &num_refs, cifs_sb->local_nls,
+ cifs_remap(cifs_sb)))
+ *islink = refs[0].server_type == DFS_TYPE_LINK;
+ free_dfs_info_array(refs, num_refs);
+ kfree(ref_path);
+ }
+ return 0;
+}
#endif
+
+int cifs_wait_for_server_reconnect(struct TCP_Server_Info *server, bool retry)
+{
+ int timeout = 10;
+ int rc;
+
+ spin_lock(&server->srv_lock);
+ if (server->tcpStatus != CifsNeedReconnect) {
+ spin_unlock(&server->srv_lock);
+ return 0;
+ }
+ timeout *= server->nr_targets;
+ spin_unlock(&server->srv_lock);
+
+ /*
+ * Give demultiplex thread up to 10 seconds to each target available for
+ * reconnect -- should be greater than cifs socket timeout which is 7
+ * seconds.
+ *
+ * On "soft" mounts we wait once. Hard mounts keep retrying until
+ * process is killed or server comes back on-line.
+ */
+ do {
+ rc = wait_event_interruptible_timeout(server->response_q,
+ (server->tcpStatus != CifsNeedReconnect),
+ timeout * HZ);
+ if (rc < 0) {
+ cifs_dbg(FYI, "%s: aborting reconnect due to received signal\n",
+ __func__);
+ return -ERESTARTSYS;
+ }
+
+ /* are we still trying to reconnect? */
+ spin_lock(&server->srv_lock);
+ if (server->tcpStatus != CifsNeedReconnect) {
+ spin_unlock(&server->srv_lock);
+ return 0;
+ }
+ spin_unlock(&server->srv_lock);
+ } while (retry);
+
+ cifs_dbg(FYI, "%s: gave up waiting on reconnect\n", __func__);
+ return -EHOSTDOWN;
+}
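
Annotation: cifs_wait_for_server_reconnect() above lifts the wait loop that cifssmb.c and smb2pdu.c previously duplicated into one shared helper. A hedged sketch of the caller side, matching the replacements in both reconnect paths — protocol-specific early exits (such as the SMB2 TREE_DISCONNECT/CANCEL/CLOSE/OPLOCK_BREAK cases) stay in the caller, and the helper only handles the waiting:

        rc = cifs_wait_for_server_reconnect(server, tcon->retry);
        if (rc)
                return rc;      /* -ERESTARTSYS on a signal, -EHOSTDOWN once
                                 * a soft mount gives up waiting */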
diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c
index 37b4cd59245d..9b956294e864 100644
--- a/fs/cifs/smb2inode.c
+++ b/fs/cifs/smb2inode.c
@@ -527,12 +527,13 @@ int smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon,
struct cifs_sb_info *cifs_sb, const char *full_path,
struct cifs_open_info_data *data, bool *adjust_tz, bool *reparse)
{
- int rc;
__u32 create_options = 0;
struct cifsFileInfo *cfile;
struct cached_fid *cfid = NULL;
struct kvec err_iov[3] = {};
int err_buftype[3] = {};
+ bool islink;
+ int rc, rc2;
*adjust_tz = false;
*reparse = false;
@@ -580,15 +581,15 @@ int smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon,
SMB2_OP_QUERY_INFO, cfile, NULL, NULL,
NULL, NULL);
goto out;
- } else if (rc != -EREMOTE && IS_ENABLED(CONFIG_CIFS_DFS_UPCALL) &&
- hdr->Status == STATUS_OBJECT_NAME_INVALID) {
- /*
- * Handle weird Windows SMB server behaviour. It responds with
- * STATUS_OBJECT_NAME_INVALID code to SMB2 QUERY_INFO request
- * for "\<server>\<dfsname>\<linkpath>" DFS reference,
- * where <dfsname> contains non-ASCII unicode symbols.
- */
- rc = -EREMOTE;
+ } else if (rc != -EREMOTE && hdr->Status == STATUS_OBJECT_NAME_INVALID) {
+ rc2 = cifs_inval_name_dfs_link_error(xid, tcon, cifs_sb,
+ full_path, &islink);
+ if (rc2) {
+ rc = rc2;
+ goto out;
+ }
+ if (islink)
+ rc = -EREMOTE;
}
if (rc == -EREMOTE && IS_ENABLED(CONFIG_CIFS_DFS_UPCALL) && cifs_sb &&
(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_DFS))
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index f79b075f2992..6dfb865ee9d7 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -796,7 +796,6 @@ static int
smb2_is_path_accessible(const unsigned int xid, struct cifs_tcon *tcon,
struct cifs_sb_info *cifs_sb, const char *full_path)
{
- int rc;
__le16 *utf16_path;
__u8 oplock = SMB2_OPLOCK_LEVEL_NONE;
int err_buftype = CIFS_NO_BUFFER;
@@ -804,6 +803,8 @@ smb2_is_path_accessible(const unsigned int xid, struct cifs_tcon *tcon,
struct kvec err_iov = {};
struct cifs_fid fid;
struct cached_fid *cfid;
+ bool islink;
+ int rc, rc2;
rc = open_cached_dir(xid, tcon, full_path, cifs_sb, true, &cfid);
if (!rc) {
@@ -833,15 +834,17 @@ smb2_is_path_accessible(const unsigned int xid, struct cifs_tcon *tcon,
if (unlikely(!hdr || err_buftype == CIFS_NO_BUFFER))
goto out;
- /*
- * Handle weird Windows SMB server behaviour. It responds with
- * STATUS_OBJECT_NAME_INVALID code to SMB2 QUERY_INFO request
- * for "\<server>\<dfsname>\<linkpath>" DFS reference,
- * where <dfsname> contains non-ASCII unicode symbols.
- */
- if (rc != -EREMOTE && IS_ENABLED(CONFIG_CIFS_DFS_UPCALL) &&
- hdr->Status == STATUS_OBJECT_NAME_INVALID)
- rc = -EREMOTE;
+
+ if (rc != -EREMOTE && hdr->Status == STATUS_OBJECT_NAME_INVALID) {
+ rc2 = cifs_inval_name_dfs_link_error(xid, tcon, cifs_sb,
+ full_path, &islink);
+ if (rc2) {
+ rc = rc2;
+ goto out;
+ }
+ if (islink)
+ rc = -EREMOTE;
+ }
if (rc == -EREMOTE && IS_ENABLED(CONFIG_CIFS_DFS_UPCALL) && cifs_sb &&
(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_DFS))
rc = -EOPNOTSUPP;
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index ca9d7110ddcb..0e53265e1462 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -139,66 +139,6 @@ out:
return;
}
-static int wait_for_server_reconnect(struct TCP_Server_Info *server,
- __le16 smb2_command, bool retry)
-{
- int timeout = 10;
- int rc;
-
- spin_lock(&server->srv_lock);
- if (server->tcpStatus != CifsNeedReconnect) {
- spin_unlock(&server->srv_lock);
- return 0;
- }
- timeout *= server->nr_targets;
- spin_unlock(&server->srv_lock);
-
- /*
- * Return to caller for TREE_DISCONNECT and LOGOFF and CLOSE
- * here since they are implicitly done when session drops.
- */
- switch (smb2_command) {
- /*
- * BB Should we keep oplock break and add flush to exceptions?
- */
- case SMB2_TREE_DISCONNECT:
- case SMB2_CANCEL:
- case SMB2_CLOSE:
- case SMB2_OPLOCK_BREAK:
- return -EAGAIN;
- }
-
- /*
- * Give demultiplex thread up to 10 seconds to each target available for
- * reconnect -- should be greater than cifs socket timeout which is 7
- * seconds.
- *
- * On "soft" mounts we wait once. Hard mounts keep retrying until
- * process is killed or server comes back on-line.
- */
- do {
- rc = wait_event_interruptible_timeout(server->response_q,
- (server->tcpStatus != CifsNeedReconnect),
- timeout * HZ);
- if (rc < 0) {
- cifs_dbg(FYI, "%s: aborting reconnect due to received signal\n",
- __func__);
- return -ERESTARTSYS;
- }
-
- /* are we still trying to reconnect? */
- spin_lock(&server->srv_lock);
- if (server->tcpStatus != CifsNeedReconnect) {
- spin_unlock(&server->srv_lock);
- return 0;
- }
- spin_unlock(&server->srv_lock);
- } while (retry);
-
- cifs_dbg(FYI, "%s: gave up waiting on reconnect\n", __func__);
- return -EHOSTDOWN;
-}
-
static int
smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon,
struct TCP_Server_Info *server)
@@ -243,7 +183,27 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon,
(!tcon->ses->server) || !server)
return -EIO;
- rc = wait_for_server_reconnect(server, smb2_command, tcon->retry);
+ spin_lock(&server->srv_lock);
+ if (server->tcpStatus == CifsNeedReconnect) {
+ /*
+ * Return to caller for TREE_DISCONNECT and LOGOFF and CLOSE
+ * here since they are implicitly done when session drops.
+ */
+ switch (smb2_command) {
+ /*
+ * BB Should we keep oplock break and add flush to exceptions?
+ */
+ case SMB2_TREE_DISCONNECT:
+ case SMB2_CANCEL:
+ case SMB2_CLOSE:
+ case SMB2_OPLOCK_BREAK:
+ spin_unlock(&server->srv_lock);
+ return -EAGAIN;
+ }
+ }
+ spin_unlock(&server->srv_lock);
+
+ rc = cifs_wait_for_server_reconnect(server, tcon->retry);
if (rc)
return rc;
diff --git a/fs/cifs/smbdirect.c b/fs/cifs/smbdirect.c
index 55b6e319a61d..0362ebd4fa0f 100644
--- a/fs/cifs/smbdirect.c
+++ b/fs/cifs/smbdirect.c
@@ -837,7 +837,7 @@ static int smbd_post_send_iter(struct smbd_connection *info,
int data_length;
struct smbd_request *request;
struct smbd_data_transfer *packet;
- int new_credits;
+ int new_credits = 0;
wait_credit:
/* Wait for send credits. A SMBD packet needs one credit */
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index e3d168911dbe..006ef68d7ff6 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -183,7 +183,7 @@ static void *cramfs_blkdev_read(struct super_block *sb, unsigned int offset,
unsigned int len)
{
struct address_space *mapping = sb->s_bdev->bd_inode->i_mapping;
- struct file_ra_state ra;
+ struct file_ra_state ra = {};
struct page *pages[BLKS_PER_BUF];
unsigned i, blocknr, buffer;
unsigned long devsize;
diff --git a/fs/exfat/dir.c b/fs/exfat/dir.c
index 1dfa67f307f1..957574180a5e 100644
--- a/fs/exfat/dir.c
+++ b/fs/exfat/dir.c
@@ -29,14 +29,15 @@ static int exfat_extract_uni_name(struct exfat_dentry *ep,
}
-static void exfat_get_uniname_from_ext_entry(struct super_block *sb,
+static int exfat_get_uniname_from_ext_entry(struct super_block *sb,
struct exfat_chain *p_dir, int entry, unsigned short *uniname)
{
- int i;
+ int i, err;
struct exfat_entry_set_cache es;
- if (exfat_get_dentry_set(&es, sb, p_dir, entry, ES_ALL_ENTRIES))
- return;
+ err = exfat_get_dentry_set(&es, sb, p_dir, entry, ES_ALL_ENTRIES);
+ if (err)
+ return err;
/*
* First entry : file entry
@@ -56,12 +57,13 @@ static void exfat_get_uniname_from_ext_entry(struct super_block *sb,
}
exfat_put_dentry_set(&es, false);
+ return 0;
}
/* read a directory entry from the opened directory */
static int exfat_readdir(struct inode *inode, loff_t *cpos, struct exfat_dir_entry *dir_entry)
{
- int i, dentries_per_clu, num_ext;
+ int i, dentries_per_clu, num_ext, err;
unsigned int type, clu_offset, max_dentries;
struct exfat_chain dir, clu;
struct exfat_uni_name uni_name;
@@ -100,7 +102,7 @@ static int exfat_readdir(struct inode *inode, loff_t *cpos, struct exfat_dir_ent
clu.dir = ei->hint_bmap.clu;
}
- while (clu_offset > 0) {
+ while (clu_offset > 0 && clu.dir != EXFAT_EOF_CLUSTER) {
if (exfat_get_next_cluster(sb, &(clu.dir)))
return -EIO;
@@ -146,8 +148,12 @@ static int exfat_readdir(struct inode *inode, loff_t *cpos, struct exfat_dir_ent
0);
*uni_name.name = 0x0;
- exfat_get_uniname_from_ext_entry(sb, &clu, i,
+ err = exfat_get_uniname_from_ext_entry(sb, &clu, i,
uni_name.name);
+ if (err) {
+ brelse(bh);
+ continue;
+ }
exfat_utf16_to_nls(sb, &uni_name,
dir_entry->namebuf.lfn,
dir_entry->namebuf.lfnbuf_len);
@@ -234,10 +240,7 @@ static int exfat_iterate(struct file *file, struct dir_context *ctx)
fake_offset = 1;
}
- if (cpos & (DENTRY_SIZE - 1)) {
- err = -ENOENT;
- goto unlock;
- }
+ cpos = round_up(cpos, DENTRY_SIZE);
/* name buffer should be allocated before use */
err = exfat_alloc_namebuf(nb);
@@ -378,6 +381,12 @@ unsigned int exfat_get_entry_type(struct exfat_dentry *ep)
return TYPE_ACL;
return TYPE_CRITICAL_SEC;
}
+
+ if (ep->type == EXFAT_VENDOR_EXT)
+ return TYPE_VENDOR_EXT;
+ if (ep->type == EXFAT_VENDOR_ALLOC)
+ return TYPE_VENDOR_ALLOC;
+
return TYPE_BENIGN_SEC;
}
@@ -521,6 +530,25 @@ release_fbh:
return ret;
}
+static void exfat_free_benign_secondary_clusters(struct inode *inode,
+ struct exfat_dentry *ep)
+{
+ struct super_block *sb = inode->i_sb;
+ struct exfat_chain dir;
+ unsigned int start_clu =
+ le32_to_cpu(ep->dentry.generic_secondary.start_clu);
+ u64 size = le64_to_cpu(ep->dentry.generic_secondary.size);
+ unsigned char flags = ep->dentry.generic_secondary.flags;
+
+ if (!(flags & ALLOC_POSSIBLE) || !start_clu || !size)
+ return;
+
+ exfat_chain_set(&dir, start_clu,
+ EXFAT_B_TO_CLU_ROUND_UP(size, EXFAT_SB(sb)),
+ flags);
+ exfat_free_cluster(inode, &dir);
+}
+
int exfat_init_ext_entry(struct inode *inode, struct exfat_chain *p_dir,
int entry, int num_entries, struct exfat_uni_name *p_uniname)
{
@@ -553,6 +581,9 @@ int exfat_init_ext_entry(struct inode *inode, struct exfat_chain *p_dir,
if (!ep)
return -EIO;
+ if (exfat_get_entry_type(ep) & TYPE_BENIGN_SEC)
+ exfat_free_benign_secondary_clusters(inode, ep);
+
exfat_init_name_entry(ep, uniname);
exfat_update_bh(bh, sync);
brelse(bh);
@@ -576,6 +607,9 @@ int exfat_remove_entries(struct inode *inode, struct exfat_chain *p_dir,
if (!ep)
return -EIO;
+ if (exfat_get_entry_type(ep) & TYPE_BENIGN_SEC)
+ exfat_free_benign_secondary_clusters(inode, ep);
+
exfat_set_entry_type(ep, TYPE_DELETED);
exfat_update_bh(bh, IS_DIRSYNC(inode));
brelse(bh);
@@ -744,6 +778,7 @@ enum exfat_validate_dentry_mode {
ES_MODE_GET_STRM_ENTRY,
ES_MODE_GET_NAME_ENTRY,
ES_MODE_GET_CRITICAL_SEC_ENTRY,
+ ES_MODE_GET_BENIGN_SEC_ENTRY,
};
static bool exfat_validate_entry(unsigned int type,
@@ -757,36 +792,33 @@ static bool exfat_validate_entry(unsigned int type,
if (type != TYPE_FILE && type != TYPE_DIR)
return false;
*mode = ES_MODE_GET_FILE_ENTRY;
- return true;
+ break;
case ES_MODE_GET_FILE_ENTRY:
if (type != TYPE_STREAM)
return false;
*mode = ES_MODE_GET_STRM_ENTRY;
- return true;
+ break;
case ES_MODE_GET_STRM_ENTRY:
if (type != TYPE_EXTEND)
return false;
*mode = ES_MODE_GET_NAME_ENTRY;
- return true;
+ break;
case ES_MODE_GET_NAME_ENTRY:
- if (type == TYPE_STREAM)
- return false;
- if (type != TYPE_EXTEND) {
- if (!(type & TYPE_CRITICAL_SEC))
- return false;
- *mode = ES_MODE_GET_CRITICAL_SEC_ENTRY;
- }
- return true;
- case ES_MODE_GET_CRITICAL_SEC_ENTRY:
- if (type == TYPE_EXTEND || type == TYPE_STREAM)
+ if (type & TYPE_BENIGN_SEC)
+ *mode = ES_MODE_GET_BENIGN_SEC_ENTRY;
+ else if (type != TYPE_EXTEND)
return false;
- if ((type & TYPE_CRITICAL_SEC) != TYPE_CRITICAL_SEC)
+ break;
+ case ES_MODE_GET_BENIGN_SEC_ENTRY:
+ /* Assume unrecognized benign secondary entry */
+ if (!(type & TYPE_BENIGN_SEC))
return false;
- return true;
+ break;
default:
- WARN_ON_ONCE(1);
return false;
}
+
+ return true;
}
struct exfat_dentry *exfat_get_dentry_cached(
@@ -1167,10 +1199,8 @@ int exfat_count_ext_entries(struct super_block *sb, struct exfat_chain *p_dir,
type = exfat_get_entry_type(ext_ep);
brelse(bh);
- if (type == TYPE_EXTEND || type == TYPE_STREAM)
+ if (type & TYPE_CRITICAL_SEC || type & TYPE_BENIGN_SEC)
count++;
- else
- break;
}
return count;
}
diff --git a/fs/exfat/exfat_fs.h b/fs/exfat/exfat_fs.h
index 1bf16abe3c84..729ada9e26e8 100644
--- a/fs/exfat/exfat_fs.h
+++ b/fs/exfat/exfat_fs.h
@@ -50,7 +50,7 @@ enum {
#define ES_IDX_LAST_FILENAME(name_len) \
(ES_IDX_FIRST_FILENAME + EXFAT_FILENAME_ENTRY_NUM(name_len) - 1)
-#define DIR_DELETED 0xFFFF0321
+#define DIR_DELETED 0xFFFFFFF7
/* type values */
#define TYPE_UNUSED 0x0000
@@ -71,6 +71,8 @@ enum {
#define TYPE_PADDING 0x0402
#define TYPE_ACLTAB 0x0403
#define TYPE_BENIGN_SEC 0x0800
+#define TYPE_VENDOR_EXT 0x0801
+#define TYPE_VENDOR_ALLOC 0x0802
#define MAX_CHARSET_SIZE 6 /* max size of multi-byte character */
#define MAX_NAME_LENGTH 255 /* max len of file name excluding NULL */
diff --git a/fs/exfat/exfat_raw.h b/fs/exfat/exfat_raw.h
index 7f39b1c6469c..0ece2e43cf49 100644
--- a/fs/exfat/exfat_raw.h
+++ b/fs/exfat/exfat_raw.h
@@ -27,6 +27,7 @@
((sbi)->num_clusters - EXFAT_RESERVED_CLUSTERS)
/* AllocationPossible and NoFatChain field in GeneralSecondaryFlags Field */
+#define ALLOC_POSSIBLE 0x01
#define ALLOC_FAT_CHAIN 0x01
#define ALLOC_NO_FAT_CHAIN 0x03
@@ -50,6 +51,8 @@
#define EXFAT_STREAM 0xC0 /* stream entry */
#define EXFAT_NAME 0xC1 /* file name entry */
#define EXFAT_ACL 0xC2 /* stream entry */
+#define EXFAT_VENDOR_EXT 0xE0 /* vendor extension entry */
+#define EXFAT_VENDOR_ALLOC 0xE1 /* vendor allocation entry */
#define IS_EXFAT_CRITICAL_PRI(x) (x < 0xA0)
#define IS_EXFAT_BENIGN_PRI(x) (x < 0xC0)
@@ -155,6 +158,24 @@ struct exfat_dentry {
__le32 start_clu;
__le64 size;
} __packed upcase; /* up-case table directory entry */
+ struct {
+ __u8 flags;
+ __u8 vendor_guid[16];
+ __u8 vendor_defined[14];
+ } __packed vendor_ext; /* vendor extension directory entry */
+ struct {
+ __u8 flags;
+ __u8 vendor_guid[16];
+ __u8 vendor_defined[2];
+ __le32 start_clu;
+ __le64 size;
+ } __packed vendor_alloc; /* vendor allocation directory entry */
+ struct {
+ __u8 flags;
+ __u8 custom_defined[18];
+ __le32 start_clu;
+ __le64 size;
+ } __packed generic_secondary; /* generic secondary directory entry */
} __packed dentry;
} __packed;
diff --git a/fs/exfat/fatent.c b/fs/exfat/fatent.c
index 41ae4cce1f42..56b870d9cc0d 100644
--- a/fs/exfat/fatent.c
+++ b/fs/exfat/fatent.c
@@ -307,7 +307,7 @@ int exfat_alloc_cluster(struct inode *inode, unsigned int num_alloc,
struct exfat_chain *p_chain, bool sync_bmap)
{
int ret = -ENOSPC;
- unsigned int num_clusters = 0, total_cnt;
+ unsigned int total_cnt;
unsigned int hint_clu, new_clu, last_clu = EXFAT_EOF_CLUSTER;
struct super_block *sb = inode->i_sb;
struct exfat_sb_info *sbi = EXFAT_SB(sb);
@@ -344,17 +344,11 @@ int exfat_alloc_cluster(struct inode *inode, unsigned int num_alloc,
/* check cluster validation */
if (!is_valid_cluster(sbi, hint_clu)) {
- exfat_err(sb, "hint_cluster is invalid (%u)",
- hint_clu);
+ if (hint_clu != sbi->num_clusters)
+ exfat_err(sb, "hint_cluster is invalid (%u), rewind to the first cluster",
+ hint_clu);
hint_clu = EXFAT_FIRST_CLUSTER;
- if (p_chain->flags == ALLOC_NO_FAT_CHAIN) {
- if (exfat_chain_cont_cluster(sb, p_chain->dir,
- num_clusters)) {
- ret = -EIO;
- goto unlock;
- }
- p_chain->flags = ALLOC_FAT_CHAIN;
- }
+ p_chain->flags = ALLOC_FAT_CHAIN;
}
p_chain->dir = EXFAT_EOF_CLUSTER;
@@ -364,7 +358,7 @@ int exfat_alloc_cluster(struct inode *inode, unsigned int num_alloc,
if (new_clu != hint_clu &&
p_chain->flags == ALLOC_NO_FAT_CHAIN) {
if (exfat_chain_cont_cluster(sb, p_chain->dir,
- num_clusters)) {
+ p_chain->size)) {
ret = -EIO;
goto free_cluster;
}
@@ -377,8 +371,6 @@ int exfat_alloc_cluster(struct inode *inode, unsigned int num_alloc,
goto free_cluster;
}
- num_clusters++;
-
/* update FAT table */
if (p_chain->flags == ALLOC_FAT_CHAIN) {
if (exfat_ent_set(sb, new_clu, EXFAT_EOF_CLUSTER)) {
@@ -395,13 +387,14 @@ int exfat_alloc_cluster(struct inode *inode, unsigned int num_alloc,
goto free_cluster;
}
}
+ p_chain->size++;
+
last_clu = new_clu;
- if (--num_alloc == 0) {
+ if (p_chain->size == num_alloc) {
sbi->clu_srch_ptr = hint_clu;
- sbi->used_clusters += num_clusters;
+ sbi->used_clusters += num_alloc;
- p_chain->size += num_clusters;
mutex_unlock(&sbi->bitmap_lock);
return 0;
}
@@ -412,7 +405,7 @@ int exfat_alloc_cluster(struct inode *inode, unsigned int num_alloc,
if (p_chain->flags == ALLOC_NO_FAT_CHAIN) {
if (exfat_chain_cont_cluster(sb, p_chain->dir,
- num_clusters)) {
+ p_chain->size)) {
ret = -EIO;
goto free_cluster;
}
@@ -421,8 +414,7 @@ int exfat_alloc_cluster(struct inode *inode, unsigned int num_alloc,
}
}
free_cluster:
- if (num_clusters)
- __exfat_free_cluster(inode, p_chain);
+ __exfat_free_cluster(inode, p_chain);
unlock:
mutex_unlock(&sbi->bitmap_lock);
return ret;
diff --git a/fs/exfat/file.c b/fs/exfat/file.c
index 1fdb0a64b91d..e99183a74611 100644
--- a/fs/exfat/file.c
+++ b/fs/exfat/file.c
@@ -209,8 +209,7 @@ void exfat_truncate(struct inode *inode)
if (err)
goto write_size;
- inode->i_blocks = round_up(i_size_read(inode), sbi->cluster_size) >>
- inode->i_blkbits;
+ inode->i_blocks = round_up(i_size_read(inode), sbi->cluster_size) >> 9;
write_size:
aligned_size = i_size_read(inode);
if (aligned_size & (blocksize - 1)) {
diff --git a/fs/exfat/inode.c b/fs/exfat/inode.c
index 5b644cb057fa..481dd338f2b8 100644
--- a/fs/exfat/inode.c
+++ b/fs/exfat/inode.c
@@ -220,8 +220,7 @@ static int exfat_map_cluster(struct inode *inode, unsigned int clu_offset,
num_clusters += num_to_be_allocated;
*clu = new_clu.dir;
- inode->i_blocks +=
- num_to_be_allocated << sbi->sect_per_clus_bits;
+ inode->i_blocks += EXFAT_CLU_TO_B(num_to_be_allocated, sbi) >> 9;
/*
* Move *clu pointer along FAT chains (hole care) because the
@@ -576,8 +575,7 @@ static int exfat_fill_inode(struct inode *inode, struct exfat_dir_entry *info)
exfat_save_attr(inode, info->attr);
- inode->i_blocks = round_up(i_size_read(inode), sbi->cluster_size) >>
- inode->i_blkbits;
+ inode->i_blocks = round_up(i_size_read(inode), sbi->cluster_size) >> 9;
inode->i_mtime = info->mtime;
inode->i_ctime = info->mtime;
ei->i_crtime = info->crtime;
diff --git a/fs/exfat/namei.c b/fs/exfat/namei.c
index 02aab4c3a5f7..e0ff9d156f6f 100644
--- a/fs/exfat/namei.c
+++ b/fs/exfat/namei.c
@@ -396,7 +396,7 @@ static int exfat_find_empty_entry(struct inode *inode,
ei->i_size_ondisk += sbi->cluster_size;
ei->i_size_aligned += sbi->cluster_size;
ei->flags = p_dir->flags;
- inode->i_blocks += 1 << sbi->sect_per_clus_bits;
+ inode->i_blocks += sbi->cluster_size >> 9;
}
return dentry;
diff --git a/fs/exfat/super.c b/fs/exfat/super.c
index 35f0305cd493..8c32460e031e 100644
--- a/fs/exfat/super.c
+++ b/fs/exfat/super.c
@@ -373,8 +373,7 @@ static int exfat_read_root(struct inode *inode)
inode->i_op = &exfat_dir_inode_operations;
inode->i_fop = &exfat_dir_operations;
- inode->i_blocks = round_up(i_size_read(inode), sbi->cluster_size) >>
- inode->i_blkbits;
+ inode->i_blocks = round_up(i_size_read(inode), sbi->cluster_size) >> 9;
ei->i_pos = ((loff_t)sbi->root_dir << 32) | 0xffffffff;
ei->i_size_aligned = i_size_read(inode);
ei->i_size_ondisk = i_size_read(inode);
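
Annotation: the repeated ">> 9" conversions above (file.c, inode.c, namei.c, super.c) all follow from inode->i_blocks being counted in 512-byte units regardless of the filesystem's block or cluster size, so shifting by inode->i_blkbits or sect_per_clus_bits produced wrong counts whenever the block size was not 512 bytes. A sketch of the invariant, using a hypothetical helper name for illustration only:

        /* i_blocks is always in 512-byte units (SECTOR_SHIFT == 9). */
        static inline blkcnt_t exfat_bytes_to_iblocks(loff_t bytes)
        {
                return bytes >> 9;
        }

        inode->i_blocks = exfat_bytes_to_iblocks(
                        round_up(i_size_read(inode), sbi->cluster_size));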
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 43e26e6f6e42..4eeb02d456a9 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1529,6 +1529,7 @@ struct ext4_sb_info {
unsigned int s_mount_opt2;
unsigned long s_mount_flags;
unsigned int s_def_mount_opt;
+ unsigned int s_def_mount_opt2;
ext4_fsblk_t s_sb_block;
atomic64_t s_resv_clusters;
kuid_t s_resuid;
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 9de1c9d1a13d..3559ea6b0781 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -3251,7 +3251,7 @@ static int ext4_split_extent_at(handle_t *handle,
ext4_ext_mark_unwritten(ex2);
err = ext4_ext_insert_extent(handle, inode, ppath, &newex, flags);
- if (err != -ENOSPC && err != -EDQUOT)
+ if (err != -ENOSPC && err != -EDQUOT && err != -ENOMEM)
goto out;
if (EXT4_EXT_MAY_ZEROOUT & split_flag) {
diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c
index 4594b62f147b..b06de728b3b6 100644
--- a/fs/ext4/fast_commit.c
+++ b/fs/ext4/fast_commit.c
@@ -1332,8 +1332,14 @@ struct dentry_info_args {
char *dname;
};
+/* Same as struct ext4_fc_tl, but uses native endianness fields */
+struct ext4_fc_tl_mem {
+ u16 fc_tag;
+ u16 fc_len;
+};
+
static inline void tl_to_darg(struct dentry_info_args *darg,
- struct ext4_fc_tl *tl, u8 *val)
+ struct ext4_fc_tl_mem *tl, u8 *val)
{
struct ext4_fc_dentry_info fcd;
@@ -1345,16 +1351,18 @@ static inline void tl_to_darg(struct dentry_info_args *darg,
darg->dname_len = tl->fc_len - sizeof(struct ext4_fc_dentry_info);
}
-static inline void ext4_fc_get_tl(struct ext4_fc_tl *tl, u8 *val)
+static inline void ext4_fc_get_tl(struct ext4_fc_tl_mem *tl, u8 *val)
{
- memcpy(tl, val, EXT4_FC_TAG_BASE_LEN);
- tl->fc_len = le16_to_cpu(tl->fc_len);
- tl->fc_tag = le16_to_cpu(tl->fc_tag);
+ struct ext4_fc_tl tl_disk;
+
+ memcpy(&tl_disk, val, EXT4_FC_TAG_BASE_LEN);
+ tl->fc_len = le16_to_cpu(tl_disk.fc_len);
+ tl->fc_tag = le16_to_cpu(tl_disk.fc_tag);
}
/* Unlink replay function */
-static int ext4_fc_replay_unlink(struct super_block *sb, struct ext4_fc_tl *tl,
- u8 *val)
+static int ext4_fc_replay_unlink(struct super_block *sb,
+ struct ext4_fc_tl_mem *tl, u8 *val)
{
struct inode *inode, *old_parent;
struct qstr entry;
@@ -1451,8 +1459,8 @@ out:
}
/* Link replay function */
-static int ext4_fc_replay_link(struct super_block *sb, struct ext4_fc_tl *tl,
- u8 *val)
+static int ext4_fc_replay_link(struct super_block *sb,
+ struct ext4_fc_tl_mem *tl, u8 *val)
{
struct inode *inode;
struct dentry_info_args darg;
@@ -1506,8 +1514,8 @@ static int ext4_fc_record_modified_inode(struct super_block *sb, int ino)
/*
* Inode replay function
*/
-static int ext4_fc_replay_inode(struct super_block *sb, struct ext4_fc_tl *tl,
- u8 *val)
+static int ext4_fc_replay_inode(struct super_block *sb,
+ struct ext4_fc_tl_mem *tl, u8 *val)
{
struct ext4_fc_inode fc_inode;
struct ext4_inode *raw_inode;
@@ -1609,8 +1617,8 @@ out:
* inode for which we are trying to create a dentry here, should already have
* been replayed before we start here.
*/
-static int ext4_fc_replay_create(struct super_block *sb, struct ext4_fc_tl *tl,
- u8 *val)
+static int ext4_fc_replay_create(struct super_block *sb,
+ struct ext4_fc_tl_mem *tl, u8 *val)
{
int ret = 0;
struct inode *inode = NULL;
@@ -1708,7 +1716,7 @@ int ext4_fc_record_regions(struct super_block *sb, int ino,
/* Replay add range tag */
static int ext4_fc_replay_add_range(struct super_block *sb,
- struct ext4_fc_tl *tl, u8 *val)
+ struct ext4_fc_tl_mem *tl, u8 *val)
{
struct ext4_fc_add_range fc_add_ex;
struct ext4_extent newex, *ex;
@@ -1828,8 +1836,8 @@ out:
/* Replay DEL_RANGE tag */
static int
-ext4_fc_replay_del_range(struct super_block *sb, struct ext4_fc_tl *tl,
- u8 *val)
+ext4_fc_replay_del_range(struct super_block *sb,
+ struct ext4_fc_tl_mem *tl, u8 *val)
{
struct inode *inode;
struct ext4_fc_del_range lrange;
@@ -2025,7 +2033,7 @@ static int ext4_fc_replay_scan(journal_t *journal,
struct ext4_fc_replay_state *state;
int ret = JBD2_FC_REPLAY_CONTINUE;
struct ext4_fc_add_range ext;
- struct ext4_fc_tl tl;
+ struct ext4_fc_tl_mem tl;
struct ext4_fc_tail tail;
__u8 *start, *end, *cur, *val;
struct ext4_fc_head head;
@@ -2144,7 +2152,7 @@ static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh,
{
struct super_block *sb = journal->j_private;
struct ext4_sb_info *sbi = EXT4_SB(sb);
- struct ext4_fc_tl tl;
+ struct ext4_fc_tl_mem tl;
__u8 *start, *end, *cur, *val;
int ret = JBD2_FC_REPLAY_CONTINUE;
struct ext4_fc_replay_state *state = &sbi->s_fc_replay_state;
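
Annotation: ext4_fc_tl_mem exists because the old code decoded the on-disk, little-endian struct ext4_fc_tl in place, leaving __le16-typed fields holding native-endian values. A minimal sketch of how the scan/replay loops consume the new helper (names from the hunks above, loop bounds simplified):

        struct ext4_fc_tl_mem tl;
        u8 *cur, *val;

        for (cur = start; cur <= end - EXT4_FC_TAG_BASE_LEN;
             cur = cur + EXT4_FC_TAG_BASE_LEN + tl.fc_len) {
                ext4_fc_get_tl(&tl, cur);       /* one le16_to_cpu per field */
                val = cur + EXT4_FC_TAG_BASE_LEN;

                switch (tl.fc_tag) {            /* plain u16 from here on */
                case EXT4_FC_TAG_ADD_RANGE:
                        /* e.g. ext4_fc_replay_add_range(sb, &tl, val); */
                        break;
                }
        }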
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 6bdf61a62c79..0b8b4499e5ca 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -202,8 +202,9 @@ ext4_extending_io(struct inode *inode, loff_t offset, size_t len)
return false;
}
-/* Is IO overwriting allocated and initialized blocks? */
-static bool ext4_overwrite_io(struct inode *inode, loff_t pos, loff_t len)
+/* Is IO overwriting allocated or initialized blocks? */
+static bool ext4_overwrite_io(struct inode *inode,
+ loff_t pos, loff_t len, bool *unwritten)
{
struct ext4_map_blocks map;
unsigned int blkbits = inode->i_blkbits;
@@ -217,12 +218,15 @@ static bool ext4_overwrite_io(struct inode *inode, loff_t pos, loff_t len)
blklen = map.m_len;
err = ext4_map_blocks(NULL, inode, &map, 0);
+ if (err != blklen)
+ return false;
/*
* 'err==len' means that all of the blocks have been preallocated,
- * regardless of whether they have been initialized or not. To exclude
- * unwritten extents, we need to check m_flags.
+ * regardless of whether they have been initialized or not. We need to
+ * check m_flags to distinguish the unwritten extents.
*/
- return err == blklen && (map.m_flags & EXT4_MAP_MAPPED);
+ *unwritten = !(map.m_flags & EXT4_MAP_MAPPED);
+ return true;
}
static ssize_t ext4_generic_write_checks(struct kiocb *iocb,
@@ -431,11 +435,16 @@ static const struct iomap_dio_ops ext4_dio_write_ops = {
* - For extending writes case we don't take the shared lock, since it requires
* updating inode i_disksize and/or orphan handling with exclusive lock.
*
- * - shared locking will only be true mostly with overwrites. Otherwise we will
- * switch to exclusive i_rwsem lock.
+ * - shared locking will only be true mostly with overwrites, including
+ * initialized blocks and unwritten blocks. For overwrite unwritten blocks
+ * we protect splitting extents by i_data_sem in ext4_inode_info, so we can
+ * also release exclusive i_rwsem lock.
+ *
+ * - Otherwise we will switch to exclusive i_rwsem lock.
*/
static ssize_t ext4_dio_write_checks(struct kiocb *iocb, struct iov_iter *from,
- bool *ilock_shared, bool *extend)
+ bool *ilock_shared, bool *extend,
+ bool *unwritten)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file_inode(file);
@@ -459,7 +468,7 @@ restart:
* in file_modified().
*/
if (*ilock_shared && (!IS_NOSEC(inode) || *extend ||
- !ext4_overwrite_io(inode, offset, count))) {
+ !ext4_overwrite_io(inode, offset, count, unwritten))) {
if (iocb->ki_flags & IOCB_NOWAIT) {
ret = -EAGAIN;
goto out;
@@ -491,7 +500,7 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
loff_t offset = iocb->ki_pos;
size_t count = iov_iter_count(from);
const struct iomap_ops *iomap_ops = &ext4_iomap_ops;
- bool extend = false, unaligned_io = false;
+ bool extend = false, unaligned_io = false, unwritten = false;
bool ilock_shared = true;
/*
@@ -534,7 +543,8 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
return ext4_buffered_write_iter(iocb, from);
}
- ret = ext4_dio_write_checks(iocb, from, &ilock_shared, &extend);
+ ret = ext4_dio_write_checks(iocb, from,
+ &ilock_shared, &extend, &unwritten);
if (ret <= 0)
return ret;
@@ -582,7 +592,7 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
ext4_journal_stop(handle);
}
- if (ilock_shared)
+ if (ilock_shared && !unwritten)
iomap_ops = &ext4_iomap_overwrite_ops;
ret = iomap_dio_rw(iocb, from, iomap_ops, &ext4_dio_write_ops,
(unaligned_io || extend) ? IOMAP_DIO_FORCE_WAIT : 0,
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 40579ef513b7..d251d705c276 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -4872,13 +4872,6 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
goto bad_inode;
raw_inode = ext4_raw_inode(&iloc);
- if ((ino == EXT4_ROOT_INO) && (raw_inode->i_links_count == 0)) {
- ext4_error_inode(inode, function, line, 0,
- "iget: root inode unallocated");
- ret = -EFSCORRUPTED;
- goto bad_inode;
- }
-
if ((flags & EXT4_IGET_HANDLE) &&
(raw_inode->i_links_count == 0) && (raw_inode->i_mode == 0)) {
ret = -ESTALE;
@@ -4951,11 +4944,16 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
* NeilBrown 1999oct15
*/
if (inode->i_nlink == 0) {
- if ((inode->i_mode == 0 ||
+ if ((inode->i_mode == 0 || flags & EXT4_IGET_SPECIAL ||
!(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) &&
ino != EXT4_BOOT_LOADER_INO) {
- /* this inode is deleted */
- ret = -ESTALE;
+ /* this inode is deleted or unallocated */
+ if (flags & EXT4_IGET_SPECIAL) {
+ ext4_error_inode(inode, function, line, 0,
+ "iget: special inode unallocated");
+ ret = -EFSCORRUPTED;
+ } else
+ ret = -ESTALE;
goto bad_inode;
}
/* The only unlinked inodes we let through here have
@@ -5788,7 +5786,7 @@ static int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb);
int gdpblocks;
int idxblocks;
- int ret = 0;
+ int ret;
/*
* How many index blocks need to touch to map @lblocks logical blocks
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index b0dc7212694e..12435d61f09e 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -155,9 +155,6 @@ static int ext4_update_backup_sb(struct super_block *sb,
set_buffer_uptodate(bh);
unlock_buffer(bh);
- if (err)
- goto out_bh;
-
if (handle) {
err = ext4_handle_dirty_metadata(handle, NULL, bh);
if (err)
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index d10a508d95cd..94608b7df7e8 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -3872,9 +3872,16 @@ static int ext4_rename(struct mnt_idmap *idmap, struct inode *old_dir,
if (new.dir != old.dir && EXT4_DIR_LINK_MAX(new.dir))
goto end_rename;
}
+ /*
+ * We need to protect against old.inode directory getting
+ * converted from inline directory format into a normal one.
+ */
+ inode_lock_nested(old.inode, I_MUTEX_NONDIR2);
retval = ext4_rename_dir_prepare(handle, &old);
- if (retval)
+ if (retval) {
+ inode_unlock(old.inode);
goto end_rename;
+ }
}
/*
* If we're renaming a file within an inline_data dir and adding or
@@ -4006,6 +4013,8 @@ end_rename:
} else {
ext4_journal_stop(handle);
}
+ if (old.dir_bh)
+ inode_unlock(old.inode);
release_bh:
brelse(old.dir_bh);
brelse(old.bh);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index faae05493471..88f7b8a88c76 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -2146,7 +2146,7 @@ static int ext4_parse_param(struct fs_context *fc, struct fs_parameter *param)
return 0;
case Opt_commit:
if (result.uint_32 == 0)
- ctx->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE;
+ result.uint_32 = JBD2_DEFAULT_MAX_COMMIT_AGE;
else if (result.uint_32 > INT_MAX / HZ) {
ext4_msg(NULL, KERN_ERR,
"Invalid commit interval %d, "
@@ -2883,7 +2883,7 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb,
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct ext4_super_block *es = sbi->s_es;
- int def_errors, def_mount_opt = sbi->s_def_mount_opt;
+ int def_errors;
const struct mount_opts *m;
char sep = nodefs ? '\n' : ',';
@@ -2895,15 +2895,28 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb,
for (m = ext4_mount_opts; m->token != Opt_err; m++) {
int want_set = m->flags & MOPT_SET;
+ int opt_2 = m->flags & MOPT_2;
+ unsigned int mount_opt, def_mount_opt;
+
if (((m->flags & (MOPT_SET|MOPT_CLEAR)) == 0) ||
m->flags & MOPT_SKIP)
continue;
- if (!nodefs && !(m->mount_opt & (sbi->s_mount_opt ^ def_mount_opt)))
- continue; /* skip if same as the default */
+
+ if (opt_2) {
+ mount_opt = sbi->s_mount_opt2;
+ def_mount_opt = sbi->s_def_mount_opt2;
+ } else {
+ mount_opt = sbi->s_mount_opt;
+ def_mount_opt = sbi->s_def_mount_opt;
+ }
+ /* skip if same as the default */
+ if (!nodefs && !(m->mount_opt & (mount_opt ^ def_mount_opt)))
+ continue;
+ /* select Opt_noFoo vs Opt_Foo */
if ((want_set &&
- (sbi->s_mount_opt & m->mount_opt) != m->mount_opt) ||
- (!want_set && (sbi->s_mount_opt & m->mount_opt)))
- continue; /* select Opt_noFoo vs Opt_Foo */
+ (mount_opt & m->mount_opt) != m->mount_opt) ||
+ (!want_set && (mount_opt & m->mount_opt)))
+ continue;
SEQ_OPTS_PRINT("%s", token2str(m->token));
}
@@ -2931,7 +2944,7 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb,
if (nodefs || sbi->s_stripe)
SEQ_OPTS_PRINT("stripe=%lu", sbi->s_stripe);
if (nodefs || EXT4_MOUNT_DATA_FLAGS &
- (sbi->s_mount_opt ^ def_mount_opt)) {
+ (sbi->s_mount_opt ^ sbi->s_def_mount_opt)) {
if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
SEQ_OPTS_PUTS("data=journal");
else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA)
@@ -4727,7 +4740,6 @@ static int ext4_group_desc_init(struct super_block *sb,
struct ext4_sb_info *sbi = EXT4_SB(sb);
unsigned int db_count;
ext4_fsblk_t block;
- int ret;
int i;
db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) /
@@ -4767,8 +4779,7 @@ static int ext4_group_desc_init(struct super_block *sb,
ext4_msg(sb, KERN_ERR,
"can't read group descriptor %d", i);
sbi->s_gdb_count = i;
- ret = PTR_ERR(bh);
- goto out;
+ return PTR_ERR(bh);
}
rcu_read_lock();
rcu_dereference(sbi->s_group_desc)[i] = bh;
@@ -4777,13 +4788,10 @@ static int ext4_group_desc_init(struct super_block *sb,
sbi->s_gdb_count = db_count;
if (!ext4_check_descriptors(sb, logical_sb_block, first_not_zeroed)) {
ext4_msg(sb, KERN_ERR, "group descriptors corrupted!");
- ret = -EFSCORRUPTED;
- goto out;
+ return -EFSCORRUPTED;
}
+
return 0;
-out:
- ext4_group_desc_free(sbi);
- return ret;
}
static int ext4_load_and_init_journal(struct super_block *sb,
@@ -5075,6 +5083,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
goto failed_mount;
sbi->s_def_mount_opt = sbi->s_mount_opt;
+ sbi->s_def_mount_opt2 = sbi->s_mount_opt2;
err = ext4_check_opt_consistency(fc, sb);
if (err < 0)
@@ -5209,14 +5218,14 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
if (ext4_geometry_check(sb, es))
goto failed_mount;
- err = ext4_group_desc_init(sb, es, logical_sb_block, &first_not_zeroed);
- if (err)
- goto failed_mount;
-
timer_setup(&sbi->s_err_report, print_daily_error_info, 0);
spin_lock_init(&sbi->s_error_lock);
INIT_WORK(&sbi->s_error_work, flush_stashed_error_work);
+ err = ext4_group_desc_init(sb, es, logical_sb_block, &first_not_zeroed);
+ if (err)
+ goto failed_mount3;
+
/* Register extent status tree shrinker */
if (ext4_es_register_shrinker(sbi))
goto failed_mount3;
@@ -5937,8 +5946,11 @@ static int ext4_load_journal(struct super_block *sb,
if (!really_read_only && journal_devnum &&
journal_devnum != le32_to_cpu(es->s_journal_dev)) {
es->s_journal_dev = cpu_to_le32(journal_devnum);
-
- /* Make sure we flush the recovery flag to disk. */
+ ext4_commit_super(sb);
+ }
+ if (!really_read_only && journal_inum &&
+ journal_inum != le32_to_cpu(es->s_journal_inum)) {
+ es->s_journal_inum = cpu_to_le32(journal_inum);
ext4_commit_super(sb);
}
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index a2f04a3808db..62f2ec599218 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -184,27 +184,73 @@ ext4_xattr_handler(int name_index)
}
static int
-ext4_xattr_check_entries(struct ext4_xattr_entry *entry, void *end,
- void *value_start)
+check_xattrs(struct inode *inode, struct buffer_head *bh,
+ struct ext4_xattr_entry *entry, void *end, void *value_start,
+ const char *function, unsigned int line)
{
struct ext4_xattr_entry *e = entry;
+ int err = -EFSCORRUPTED;
+ char *err_str;
+
+ if (bh) {
+ if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) ||
+ BHDR(bh)->h_blocks != cpu_to_le32(1)) {
+ err_str = "invalid header";
+ goto errout;
+ }
+ if (buffer_verified(bh))
+ return 0;
+ if (!ext4_xattr_block_csum_verify(inode, bh)) {
+ err = -EFSBADCRC;
+ err_str = "invalid checksum";
+ goto errout;
+ }
+ } else {
+ struct ext4_xattr_ibody_header *header = value_start;
+
+ header -= 1;
+ if (end - (void *)header < sizeof(*header) + sizeof(u32)) {
+ err_str = "in-inode xattr block too small";
+ goto errout;
+ }
+ if (header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) {
+ err_str = "bad magic number in in-inode xattr";
+ goto errout;
+ }
+ }
/* Find the end of the names list */
while (!IS_LAST_ENTRY(e)) {
struct ext4_xattr_entry *next = EXT4_XATTR_NEXT(e);
- if ((void *)next >= end)
- return -EFSCORRUPTED;
- if (strnlen(e->e_name, e->e_name_len) != e->e_name_len)
- return -EFSCORRUPTED;
+ if ((void *)next >= end) {
+ err_str = "e_name out of bounds";
+ goto errout;
+ }
+ if (strnlen(e->e_name, e->e_name_len) != e->e_name_len) {
+ err_str = "bad e_name length";
+ goto errout;
+ }
e = next;
}
/* Check the values */
while (!IS_LAST_ENTRY(entry)) {
u32 size = le32_to_cpu(entry->e_value_size);
+ unsigned long ea_ino = le32_to_cpu(entry->e_value_inum);
- if (size > EXT4_XATTR_SIZE_MAX)
- return -EFSCORRUPTED;
+ if (!ext4_has_feature_ea_inode(inode->i_sb) && ea_ino) {
+ err_str = "ea_inode specified without ea_inode feature enabled";
+ goto errout;
+ }
+ if (ea_ino && ((ea_ino == EXT4_ROOT_INO) ||
+ !ext4_valid_inum(inode->i_sb, ea_ino))) {
+ err_str = "invalid ea_ino";
+ goto errout;
+ }
+ if (size > EXT4_XATTR_SIZE_MAX) {
+ err_str = "e_value size too large";
+ goto errout;
+ }
if (size != 0 && entry->e_value_inum == 0) {
u16 offs = le16_to_cpu(entry->e_value_offs);
@@ -216,66 +262,54 @@ ext4_xattr_check_entries(struct ext4_xattr_entry *entry, void *end,
* the padded and unpadded sizes, since the size may
* overflow to 0 when adding padding.
*/
- if (offs > end - value_start)
- return -EFSCORRUPTED;
+ if (offs > end - value_start) {
+ err_str = "e_value out of bounds";
+ goto errout;
+ }
value = value_start + offs;
if (value < (void *)e + sizeof(u32) ||
size > end - value ||
- EXT4_XATTR_SIZE(size) > end - value)
- return -EFSCORRUPTED;
+ EXT4_XATTR_SIZE(size) > end - value) {
+ err_str = "overlapping e_value ";
+ goto errout;
+ }
}
entry = EXT4_XATTR_NEXT(entry);
}
-
+ if (bh)
+ set_buffer_verified(bh);
return 0;
+
+errout:
+ if (bh)
+ __ext4_error_inode(inode, function, line, 0, -err,
+ "corrupted xattr block %llu: %s",
+ (unsigned long long) bh->b_blocknr,
+ err_str);
+ else
+ __ext4_error_inode(inode, function, line, 0, -err,
+ "corrupted in-inode xattr: %s", err_str);
+ return err;
}
static inline int
__ext4_xattr_check_block(struct inode *inode, struct buffer_head *bh,
const char *function, unsigned int line)
{
- int error = -EFSCORRUPTED;
-
- if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) ||
- BHDR(bh)->h_blocks != cpu_to_le32(1))
- goto errout;
- if (buffer_verified(bh))
- return 0;
-
- error = -EFSBADCRC;
- if (!ext4_xattr_block_csum_verify(inode, bh))
- goto errout;
- error = ext4_xattr_check_entries(BFIRST(bh), bh->b_data + bh->b_size,
- bh->b_data);
-errout:
- if (error)
- __ext4_error_inode(inode, function, line, 0, -error,
- "corrupted xattr block %llu",
- (unsigned long long) bh->b_blocknr);
- else
- set_buffer_verified(bh);
- return error;
+ return check_xattrs(inode, bh, BFIRST(bh), bh->b_data + bh->b_size,
+ bh->b_data, function, line);
}
#define ext4_xattr_check_block(inode, bh) \
__ext4_xattr_check_block((inode), (bh), __func__, __LINE__)
-static int
+static inline int
__xattr_check_inode(struct inode *inode, struct ext4_xattr_ibody_header *header,
void *end, const char *function, unsigned int line)
{
- int error = -EFSCORRUPTED;
-
- if (end - (void *)header < sizeof(*header) + sizeof(u32) ||
- (header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)))
- goto errout;
- error = ext4_xattr_check_entries(IFIRST(header), end, IFIRST(header));
-errout:
- if (error)
- __ext4_error_inode(inode, function, line, 0, -error,
- "corrupted in-inode xattr");
- return error;
+ return check_xattrs(inode, NULL, IFIRST(header), end, IFIRST(header),
+ function, line);
}
#define xattr_check_inode(inode, header, end) \
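check_xattrs() above merges the block and in-inode validation paths behind one goto-based reporter, so every rejection names the corruption it saw. A minimal userspace sketch of the err_str/goto pattern (EUCLEAN stands in for -EFSCORRUPTED; the magic value matches EXT4_XATTR_MAGIC, the rest is illustrative):

#include <stdio.h>
#include <errno.h>

static int validate_header(unsigned int magic, unsigned int blocks)
{
        int err = -EUCLEAN;     /* stand-in for -EFSCORRUPTED */
        const char *err_str;

        if (magic != 0xEA020000) {      /* EXT4_XATTR_MAGIC */
                err_str = "invalid header";
                goto errout;
        }
        if (blocks != 1) {
                err_str = "invalid block count";
                goto errout;
        }
        return 0;
errout:
        /* one reporting site, many causes */
        fprintf(stderr, "corrupted xattr block: %s\n", err_str);
        return err;
}

int main(void)
{
        return validate_header(0xEA020000, 2) ? 1 : 0;
}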
@@ -388,6 +422,17 @@ static int ext4_xattr_inode_iget(struct inode *parent, unsigned long ea_ino,
struct inode *inode;
int err;
+ /*
+ * We have to check for this corruption early as otherwise
+ * iget_locked() could wait indefinitely for the state of our
+ * parent inode.
+ */
+ if (parent->i_ino == ea_ino) {
+ ext4_error(parent->i_sb,
+ "Parent and EA inode have the same ino %lu", ea_ino);
+ return -EFSCORRUPTED;
+ }
+
inode = ext4_iget(parent->i_sb, ea_ino, EXT4_IGET_NORMAL);
if (IS_ERR(inode)) {
err = PTR_ERR(inode);
@@ -1438,6 +1483,13 @@ static struct inode *ext4_xattr_inode_create(handle_t *handle,
uid_t owner[2] = { i_uid_read(inode), i_gid_read(inode) };
int err;
+ if (inode->i_sb->s_root == NULL) {
+ ext4_warning(inode->i_sb,
+ "refuse to create EA inode when umounting");
+ WARN_ON(1);
+ return ERR_PTR(-EINVAL);
+ }
+
/*
* Let the next inode be the goal, so we try and allocate the EA inode
* in the same group, or nearby one.
@@ -2567,9 +2619,8 @@ static int ext4_xattr_move_to_block(handle_t *handle, struct inode *inode,
is = kzalloc(sizeof(struct ext4_xattr_ibody_find), GFP_NOFS);
bs = kzalloc(sizeof(struct ext4_xattr_block_find), GFP_NOFS);
- buffer = kvmalloc(value_size, GFP_NOFS);
b_entry_name = kmalloc(entry->e_name_len + 1, GFP_NOFS);
- if (!is || !bs || !buffer || !b_entry_name) {
+ if (!is || !bs || !b_entry_name) {
error = -ENOMEM;
goto out;
}
@@ -2581,12 +2632,18 @@ static int ext4_xattr_move_to_block(handle_t *handle, struct inode *inode,
/* Save the entry name and the entry value */
if (entry->e_value_inum) {
+ buffer = kvmalloc(value_size, GFP_NOFS);
+ if (!buffer) {
+ error = -ENOMEM;
+ goto out;
+ }
+
error = ext4_xattr_inode_get(inode, entry, buffer, value_size);
if (error)
goto out;
} else {
size_t value_offs = le16_to_cpu(entry->e_value_offs);
- memcpy(buffer, (void *)IFIRST(header) + value_offs, value_size);
+ buffer = (void *)IFIRST(header) + value_offs;
}
memcpy(b_entry_name, entry->e_name, entry->e_name_len);
@@ -2601,25 +2658,26 @@ static int ext4_xattr_move_to_block(handle_t *handle, struct inode *inode,
if (error)
goto out;
- /* Remove the chosen entry from the inode */
- error = ext4_xattr_ibody_set(handle, inode, &i, is);
- if (error)
- goto out;
-
i.value = buffer;
i.value_len = value_size;
error = ext4_xattr_block_find(inode, &i, bs);
if (error)
goto out;
- /* Add entry which was removed from the inode into the block */
+ /* Move ea entry from the inode into the block */
error = ext4_xattr_block_set(handle, inode, &i, bs);
if (error)
goto out;
- error = 0;
+
+ /* Remove the chosen entry from the inode */
+ i.value = NULL;
+ i.value_len = 0;
+ error = ext4_xattr_ibody_set(handle, inode, &i, is);
+
out:
kfree(b_entry_name);
- kvfree(buffer);
+ if (entry->e_value_inum && buffer)
+ kvfree(buffer);
if (is)
brelse(is->iloc.bh);
if (bs)
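After this reordering, ext4_xattr_move_to_block() only heap-allocates `buffer` for values held in an EA inode; for in-inode values it aliases the inode body directly, which is why the cleanup frees the buffer only in the e_value_inum case. A standalone sketch of that conditional-ownership rule, with plain malloc/free standing in for kvmalloc/kvfree:

#include <stdlib.h>

/* only the copy made for an external (EA-inode) value is owned and freed */
static void demo(const char *inline_value, size_t len, int value_in_ea_inode)
{
        char *buffer;

        if (value_in_ea_inode) {
                buffer = malloc(len);           /* kvmalloc() in the kernel */
                if (!buffer)
                        return;
                /* ext4_xattr_inode_get() would fill it here */
        } else {
                buffer = (char *)inline_value;  /* alias into the inode body */
        }

        /* ... hand buffer to the block-set path as i.value ... */

        if (value_in_ea_inode)
                free(buffer);                   /* kvfree() only what we own */
}

int main(void)
{
        demo("inline", 6, 0);
        return 0;
}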
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index 5a5515d83a1b..c3e058e0a018 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -70,7 +70,7 @@ static struct page *__get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index,
.old_blkaddr = index,
.new_blkaddr = index,
.encrypted_page = NULL,
- .is_por = !is_meta,
+ .is_por = !is_meta ? 1 : 0,
};
int err;
@@ -171,10 +171,8 @@ static bool __is_bitmap_valid(struct f2fs_sb_info *sbi, block_t blkaddr,
bool f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi,
block_t blkaddr, int type)
{
- if (time_to_inject(sbi, FAULT_BLKADDR)) {
- f2fs_show_injection_info(sbi, FAULT_BLKADDR);
+ if (time_to_inject(sbi, FAULT_BLKADDR))
return false;
- }
switch (type) {
case META_NAT:
@@ -239,8 +237,8 @@ int f2fs_ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages,
.op = REQ_OP_READ,
.op_flags = sync ? (REQ_META | REQ_PRIO) : REQ_RAHEAD,
.encrypted_page = NULL,
- .in_list = false,
- .is_por = (type == META_POR),
+ .in_list = 0,
+ .is_por = (type == META_POR) ? 1 : 0,
};
struct blk_plug plug;
int err;
@@ -625,7 +623,6 @@ int f2fs_acquire_orphan_inode(struct f2fs_sb_info *sbi)
if (time_to_inject(sbi, FAULT_ORPHAN)) {
spin_unlock(&im->ino_lock);
- f2fs_show_injection_info(sbi, FAULT_ORPHAN);
return -ENOSPC;
}
@@ -798,7 +795,7 @@ static void write_orphan_inodes(struct f2fs_sb_info *sbi, block_t start_blk)
*/
head = &im->ino_list;
- /* loop for each orphan inode entry and write them in Jornal block */
+ /* loop for each orphan inode entry and write them in journal block */
list_for_each_entry(orphan, head, list) {
if (!page) {
page = f2fs_grab_meta_page(sbi, start_blk++);
@@ -1128,7 +1125,7 @@ retry:
} else {
/*
* We should submit bio, since it exists several
- * wribacking dentry pages in the freeing inode.
+ * writebacking dentry pages in the freeing inode.
*/
f2fs_submit_merged_write(sbi, DATA);
cond_resched();
@@ -1476,20 +1473,18 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
ckpt->elapsed_time = cpu_to_le64(get_mtime(sbi, true));
ckpt->free_segment_count = cpu_to_le32(free_segments(sbi));
for (i = 0; i < NR_CURSEG_NODE_TYPE; i++) {
- ckpt->cur_node_segno[i] =
- cpu_to_le32(curseg_segno(sbi, i + CURSEG_HOT_NODE));
- ckpt->cur_node_blkoff[i] =
- cpu_to_le16(curseg_blkoff(sbi, i + CURSEG_HOT_NODE));
- ckpt->alloc_type[i + CURSEG_HOT_NODE] =
- curseg_alloc_type(sbi, i + CURSEG_HOT_NODE);
+ struct curseg_info *curseg = CURSEG_I(sbi, i + CURSEG_HOT_NODE);
+
+ ckpt->cur_node_segno[i] = cpu_to_le32(curseg->segno);
+ ckpt->cur_node_blkoff[i] = cpu_to_le16(curseg->next_blkoff);
+ ckpt->alloc_type[i + CURSEG_HOT_NODE] = curseg->alloc_type;
}
for (i = 0; i < NR_CURSEG_DATA_TYPE; i++) {
- ckpt->cur_data_segno[i] =
- cpu_to_le32(curseg_segno(sbi, i + CURSEG_HOT_DATA));
- ckpt->cur_data_blkoff[i] =
- cpu_to_le16(curseg_blkoff(sbi, i + CURSEG_HOT_DATA));
- ckpt->alloc_type[i + CURSEG_HOT_DATA] =
- curseg_alloc_type(sbi, i + CURSEG_HOT_DATA);
+ struct curseg_info *curseg = CURSEG_I(sbi, i + CURSEG_HOT_DATA);
+
+ ckpt->cur_data_segno[i] = cpu_to_le32(curseg->segno);
+ ckpt->cur_data_blkoff[i] = cpu_to_le16(curseg->next_blkoff);
+ ckpt->alloc_type[i + CURSEG_HOT_DATA] = curseg->alloc_type;
}
/* 2 cp + n data seg summary + orphan inode blocks */
diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c
index 2532f369cb10..b40dec3d7f79 100644
--- a/fs/f2fs/compress.c
+++ b/fs/f2fs/compress.c
@@ -241,7 +241,7 @@ static int lz4_init_compress_ctx(struct compress_ctx *cc)
unsigned int size = LZ4_MEM_COMPRESS;
#ifdef CONFIG_F2FS_FS_LZ4HC
- if (F2FS_I(cc->inode)->i_compress_flag >> COMPRESS_LEVEL_OFFSET)
+ if (F2FS_I(cc->inode)->i_compress_level)
size = LZ4HC_MEM_COMPRESS;
#endif
@@ -267,8 +267,7 @@ static void lz4_destroy_compress_ctx(struct compress_ctx *cc)
#ifdef CONFIG_F2FS_FS_LZ4HC
static int lz4hc_compress_pages(struct compress_ctx *cc)
{
- unsigned char level = F2FS_I(cc->inode)->i_compress_flag >>
- COMPRESS_LEVEL_OFFSET;
+ unsigned char level = F2FS_I(cc->inode)->i_compress_level;
int len;
if (level)
@@ -340,8 +339,7 @@ static int zstd_init_compress_ctx(struct compress_ctx *cc)
zstd_cstream *stream;
void *workspace;
unsigned int workspace_size;
- unsigned char level = F2FS_I(cc->inode)->i_compress_flag >>
- COMPRESS_LEVEL_OFFSET;
+ unsigned char level = F2FS_I(cc->inode)->i_compress_level;
if (!level)
level = F2FS_ZSTD_DEFAULT_CLEVEL;
@@ -564,7 +562,7 @@ module_param(num_compress_pages, uint, 0444);
MODULE_PARM_DESC(num_compress_pages,
"Number of intermediate compress pages to preallocate");
-int f2fs_init_compress_mempool(void)
+int __init f2fs_init_compress_mempool(void)
{
compress_page_pool = mempool_create_page_pool(num_compress_pages, 0);
return compress_page_pool ? 0 : -ENOMEM;
@@ -690,9 +688,7 @@ static int f2fs_compress_pages(struct compress_ctx *cc)
vm_unmap_ram(cc->cbuf, cc->nr_cpages);
vm_unmap_ram(cc->rbuf, cc->cluster_size);
- for (i = 0; i < cc->nr_cpages; i++) {
- if (i < new_nr_cpages)
- continue;
+ for (i = new_nr_cpages; i < cc->nr_cpages; i++) {
f2fs_compress_free_page(cc->cpages[i]);
cc->cpages[i] = NULL;
}
@@ -1070,7 +1066,7 @@ retry:
if (ret)
goto out;
if (bio)
- f2fs_submit_bio(sbi, bio, DATA);
+ f2fs_submit_read_bio(sbi, bio, DATA);
ret = f2fs_init_compress_ctx(cc);
if (ret)
@@ -1215,10 +1211,11 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc,
.page = NULL,
.encrypted_page = NULL,
.compressed_page = NULL,
- .submitted = false,
+ .submitted = 0,
.io_type = io_type,
.io_wbc = wbc,
- .encrypted = fscrypt_inode_uses_fs_layer_crypto(cc->inode),
+ .encrypted = fscrypt_inode_uses_fs_layer_crypto(cc->inode) ?
+ 1 : 0,
};
struct dnode_of_data dn;
struct node_info ni;
@@ -1228,7 +1225,7 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc,
loff_t psize;
int i, err;
- /* we should bypass data pages to proceed the kworkder jobs */
+ /* we should bypass data pages to let the kworker jobs proceed */
if (unlikely(f2fs_cp_error(sbi))) {
mapping_set_error(cc->rpages[0]->mapping, -EIO);
goto out_free;
@@ -1813,6 +1810,7 @@ unsigned int f2fs_cluster_blocks_are_contiguous(struct dnode_of_data *dn)
const struct address_space_operations f2fs_compress_aops = {
.release_folio = f2fs_release_folio,
.invalidate_folio = f2fs_invalidate_folio,
+ .migrate_folio = filemap_migrate_folio,
};
struct address_space *COMPRESS_MAPPING(struct f2fs_sb_info *sbi)
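The LZ4HC and zstd setup paths now read the compression level from the dedicated i_compress_level byte rather than shifting it out of i_compress_flag. A standalone model of the old packing versus the new field (COMPRESS_LEVEL_OFFSET assumed to be 8, matching the old layout):

#include <stdio.h>

#define COMPRESS_LEVEL_OFFSET 8         /* assumed width of the flag bits */

int main(void)
{
        unsigned short old_flag = (9 << COMPRESS_LEVEL_OFFSET) | 0x1;

        /* old: level packed into the upper bits of i_compress_flag */
        unsigned char level_old = old_flag >> COMPRESS_LEVEL_OFFSET;

        /* new: i_compress_level carries it directly */
        unsigned char level_new = 9;

        printf("old=%u new=%u\n", level_old, level_new);
        return 0;
}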
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 41addc605350..06b552a0aba2 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -292,13 +292,11 @@ static void f2fs_read_end_io(struct bio *bio)
struct bio_post_read_ctx *ctx;
bool intask = in_task();
- iostat_update_and_unbind_ctx(bio, 0);
+ iostat_update_and_unbind_ctx(bio);
ctx = bio->bi_private;
- if (time_to_inject(sbi, FAULT_READ_IO)) {
- f2fs_show_injection_info(sbi, FAULT_READ_IO);
+ if (time_to_inject(sbi, FAULT_READ_IO))
bio->bi_status = BLK_STS_IOERR;
- }
if (bio->bi_status) {
f2fs_finish_read_bio(bio, intask);
@@ -332,13 +330,11 @@ static void f2fs_write_end_io(struct bio *bio)
struct bio_vec *bvec;
struct bvec_iter_all iter_all;
- iostat_update_and_unbind_ctx(bio, 1);
+ iostat_update_and_unbind_ctx(bio);
sbi = bio->bi_private;
- if (time_to_inject(sbi, FAULT_WRITE_IO)) {
- f2fs_show_injection_info(sbi, FAULT_WRITE_IO);
+ if (time_to_inject(sbi, FAULT_WRITE_IO))
bio->bi_status = BLK_STS_IOERR;
- }
bio_for_each_segment_all(bvec, bio, iter_all) {
struct page *page = bvec->bv_page;
@@ -507,65 +503,66 @@ static bool f2fs_crypt_mergeable_bio(struct bio *bio, const struct inode *inode,
return fscrypt_mergeable_bio(bio, inode, next_idx);
}
-static inline void __submit_bio(struct f2fs_sb_info *sbi,
- struct bio *bio, enum page_type type)
+void f2fs_submit_read_bio(struct f2fs_sb_info *sbi, struct bio *bio,
+ enum page_type type)
{
- if (!is_read_io(bio_op(bio))) {
- unsigned int start;
+ WARN_ON_ONCE(!is_read_io(bio_op(bio)));
+ trace_f2fs_submit_read_bio(sbi->sb, type, bio);
- if (type != DATA && type != NODE)
- goto submit_io;
+ iostat_update_submit_ctx(bio, type);
+ submit_bio(bio);
+}
- if (f2fs_lfs_mode(sbi) && current->plug)
- blk_finish_plug(current->plug);
+static void f2fs_align_write_bio(struct f2fs_sb_info *sbi, struct bio *bio)
+{
+ unsigned int start =
+ (bio->bi_iter.bi_size >> F2FS_BLKSIZE_BITS) % F2FS_IO_SIZE(sbi);
- if (!F2FS_IO_ALIGNED(sbi))
- goto submit_io;
+ if (start == 0)
+ return;
- start = bio->bi_iter.bi_size >> F2FS_BLKSIZE_BITS;
- start %= F2FS_IO_SIZE(sbi);
+ /* fill dummy pages */
+ for (; start < F2FS_IO_SIZE(sbi); start++) {
+ struct page *page =
+ mempool_alloc(sbi->write_io_dummy,
+ GFP_NOIO | __GFP_NOFAIL);
+ f2fs_bug_on(sbi, !page);
- if (start == 0)
- goto submit_io;
+ lock_page(page);
- /* fill dummy pages */
- for (; start < F2FS_IO_SIZE(sbi); start++) {
- struct page *page =
- mempool_alloc(sbi->write_io_dummy,
- GFP_NOIO | __GFP_NOFAIL);
- f2fs_bug_on(sbi, !page);
+ zero_user_segment(page, 0, PAGE_SIZE);
+ set_page_private_dummy(page);
- lock_page(page);
+ if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE)
+ f2fs_bug_on(sbi, 1);
+ }
+}
- zero_user_segment(page, 0, PAGE_SIZE);
- set_page_private_dummy(page);
+static void f2fs_submit_write_bio(struct f2fs_sb_info *sbi, struct bio *bio,
+ enum page_type type)
+{
+ WARN_ON_ONCE(is_read_io(bio_op(bio)));
+
+ if (type == DATA || type == NODE) {
+ if (f2fs_lfs_mode(sbi) && current->plug)
+ blk_finish_plug(current->plug);
- if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE)
- f2fs_bug_on(sbi, 1);
+ if (F2FS_IO_ALIGNED(sbi)) {
+ f2fs_align_write_bio(sbi, bio);
+ /*
+ * In the NODE case, we lose next block address chain.
+ * So, we need to do checkpoint in f2fs_sync_file.
+ */
+ if (type == NODE)
+ set_sbi_flag(sbi, SBI_NEED_CP);
}
- /*
- * In the NODE case, we lose next block address chain. So, we
- * need to do checkpoint in f2fs_sync_file.
- */
- if (type == NODE)
- set_sbi_flag(sbi, SBI_NEED_CP);
}
-submit_io:
- if (is_read_io(bio_op(bio)))
- trace_f2fs_submit_read_bio(sbi->sb, type, bio);
- else
- trace_f2fs_submit_write_bio(sbi->sb, type, bio);
+ trace_f2fs_submit_write_bio(sbi->sb, type, bio);
iostat_update_submit_ctx(bio, type);
submit_bio(bio);
}
-void f2fs_submit_bio(struct f2fs_sb_info *sbi,
- struct bio *bio, enum page_type type)
-{
- __submit_bio(sbi, bio, type);
-}
-
static void __submit_merged_bio(struct f2fs_bio_info *io)
{
struct f2fs_io_info *fio = &io->fio;
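f2fs_align_write_bio(), split out above, pads a write bio with dummy pages until its block count is a multiple of F2FS_IO_SIZE(sbi). The padding arithmetic in standalone form, with an assumed 8-block I/O unit:

#include <stdio.h>

int main(void)
{
        unsigned int io_size = 8;       /* F2FS_IO_SIZE(sbi), assumption */
        unsigned int blocks = 13;       /* blocks already in the bio */
        unsigned int start = blocks % io_size;

        /* 13 % 8 == 5, so 3 dummy pages round the bio up to 16 blocks */
        if (start)
                printf("append %u dummy pages\n", io_size - start);
        return 0;
}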
@@ -573,12 +570,13 @@ static void __submit_merged_bio(struct f2fs_bio_info *io)
if (!io->bio)
return;
- if (is_read_io(fio->op))
+ if (is_read_io(fio->op)) {
trace_f2fs_prepare_read_bio(io->sbi->sb, fio->type, io->bio);
- else
+ f2fs_submit_read_bio(io->sbi, io->bio, fio->type);
+ } else {
trace_f2fs_prepare_write_bio(io->sbi->sb, fio->type, io->bio);
-
- __submit_bio(io->sbi, io->bio, fio->type);
+ f2fs_submit_write_bio(io->sbi, io->bio, fio->type);
+ }
io->bio = NULL;
}
@@ -655,6 +653,9 @@ static void __f2fs_submit_merged_write(struct f2fs_sb_info *sbi,
f2fs_down_write(&io->io_rwsem);
+ if (!io->bio)
+ goto unlock_out;
+
/* change META to META_FLUSH in the checkpoint procedure */
if (type >= META_FLUSH) {
io->fio.type = META_FLUSH;
@@ -663,6 +664,7 @@ static void __f2fs_submit_merged_write(struct f2fs_sb_info *sbi,
io->bio->bi_opf |= REQ_PREFLUSH | REQ_FUA;
}
__submit_merged_bio(io);
+unlock_out:
f2fs_up_write(&io->io_rwsem);
}
@@ -741,12 +743,15 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio)
}
if (fio->io_wbc && !is_read_io(fio->op))
- wbc_account_cgroup_owner(fio->io_wbc, page, PAGE_SIZE);
+ wbc_account_cgroup_owner(fio->io_wbc, fio->page, PAGE_SIZE);
inc_page_count(fio->sbi, is_read_io(fio->op) ?
__read_io_type(page) : WB_DATA_TYPE(fio->page));
- __submit_bio(fio->sbi, bio, fio->type);
+ if (is_read_io(bio_op(bio)))
+ f2fs_submit_read_bio(fio->sbi, bio, fio->type);
+ else
+ f2fs_submit_write_bio(fio->sbi, bio, fio->type);
return 0;
}
@@ -848,7 +853,7 @@ static int add_ipu_page(struct f2fs_io_info *fio, struct bio **bio,
/* page can't be merged into bio; submit the bio */
del_bio_entry(be);
- __submit_bio(sbi, *bio, DATA);
+ f2fs_submit_write_bio(sbi, *bio, DATA);
break;
}
f2fs_up_write(&io->bio_list_lock);
@@ -911,7 +916,7 @@ void f2fs_submit_merged_ipu_write(struct f2fs_sb_info *sbi,
}
if (found)
- __submit_bio(sbi, target, DATA);
+ f2fs_submit_write_bio(sbi, target, DATA);
if (bio && *bio) {
bio_put(*bio);
*bio = NULL;
@@ -948,7 +953,7 @@ alloc_new:
}
if (fio->io_wbc)
- wbc_account_cgroup_owner(fio->io_wbc, page, PAGE_SIZE);
+ wbc_account_cgroup_owner(fio->io_wbc, fio->page, PAGE_SIZE);
inc_page_count(fio->sbi, WB_DATA_TYPE(page));
@@ -991,7 +996,7 @@ next:
bio_page = fio->page;
/* set submitted = true as a return value */
- fio->submitted = true;
+ fio->submitted = 1;
inc_page_count(sbi, WB_DATA_TYPE(bio_page));
@@ -1007,7 +1012,7 @@ alloc_new:
(fio->type == DATA || fio->type == NODE) &&
fio->new_blkaddr & F2FS_IO_SIZE_MASK(sbi)) {
dec_page_count(sbi, WB_DATA_TYPE(bio_page));
- fio->retry = true;
+ fio->retry = 1;
goto skip;
}
io->bio = __bio_alloc(fio, BIO_MAX_VECS);
@@ -1022,7 +1027,7 @@ alloc_new:
}
if (fio->io_wbc)
- wbc_account_cgroup_owner(fio->io_wbc, bio_page, PAGE_SIZE);
+ wbc_account_cgroup_owner(fio->io_wbc, fio->page, PAGE_SIZE);
io->last_block_in_bio = fio->new_blkaddr;
@@ -1107,7 +1112,7 @@ static int f2fs_submit_page_read(struct inode *inode, struct page *page,
}
inc_page_count(sbi, F2FS_RD_DATA);
f2fs_update_iostat(sbi, NULL, FS_DATA_READ_IO, F2FS_BLKSIZE);
- __submit_bio(sbi, bio, DATA);
+ f2fs_submit_read_bio(sbi, bio, DATA);
return 0;
}
@@ -1207,19 +1212,6 @@ int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index)
return err;
}
-int f2fs_get_block(struct dnode_of_data *dn, pgoff_t index)
-{
- struct extent_info ei = {0, };
- struct inode *inode = dn->inode;
-
- if (f2fs_lookup_read_extent_cache(inode, index, &ei)) {
- dn->data_blkaddr = ei.blk + index - ei.fofs;
- return 0;
- }
-
- return f2fs_reserve_block(dn, index);
-}
-
struct page *f2fs_get_read_data_page(struct inode *inode, pgoff_t index,
blk_opf_t op_flags, bool for_write,
pgoff_t *next_pgofs)
@@ -1227,15 +1219,14 @@ struct page *f2fs_get_read_data_page(struct inode *inode, pgoff_t index,
struct address_space *mapping = inode->i_mapping;
struct dnode_of_data dn;
struct page *page;
- struct extent_info ei = {0, };
int err;
page = f2fs_grab_cache_page(mapping, index, for_write);
if (!page)
return ERR_PTR(-ENOMEM);
- if (f2fs_lookup_read_extent_cache(inode, index, &ei)) {
- dn.data_blkaddr = ei.blk + index - ei.fofs;
+ if (f2fs_lookup_read_extent_cache_block(inode, index,
+ &dn.data_blkaddr)) {
if (!f2fs_is_valid_blkaddr(F2FS_I_SB(inode), dn.data_blkaddr,
DATA_GENERIC_ENHANCE_READ)) {
err = -EFSCORRUPTED;
@@ -1432,13 +1423,12 @@ static int __allocate_data_block(struct dnode_of_data *dn, int seg_type)
return err;
dn->data_blkaddr = f2fs_data_blkaddr(dn);
- if (dn->data_blkaddr != NULL_ADDR)
- goto alloc;
-
- if (unlikely((err = inc_valid_block_count(sbi, dn->inode, &count))))
- return err;
+ if (dn->data_blkaddr == NULL_ADDR) {
+ err = inc_valid_block_count(sbi, dn->inode, &count);
+ if (unlikely(err))
+ return err;
+ }
-alloc:
set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version);
old_blkaddr = dn->data_blkaddr;
f2fs_allocate_data_block(sbi, NULL, old_blkaddr, &dn->data_blkaddr,
@@ -1452,19 +1442,91 @@ alloc:
return 0;
}
-void f2fs_do_map_lock(struct f2fs_sb_info *sbi, int flag, bool lock)
+static void f2fs_map_lock(struct f2fs_sb_info *sbi, int flag)
{
- if (flag == F2FS_GET_BLOCK_PRE_AIO) {
- if (lock)
- f2fs_down_read(&sbi->node_change);
- else
- f2fs_up_read(&sbi->node_change);
+ if (flag == F2FS_GET_BLOCK_PRE_AIO)
+ f2fs_down_read(&sbi->node_change);
+ else
+ f2fs_lock_op(sbi);
+}
+
+static void f2fs_map_unlock(struct f2fs_sb_info *sbi, int flag)
+{
+ if (flag == F2FS_GET_BLOCK_PRE_AIO)
+ f2fs_up_read(&sbi->node_change);
+ else
+ f2fs_unlock_op(sbi);
+}
+
+int f2fs_get_block_locked(struct dnode_of_data *dn, pgoff_t index)
+{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
+ int err = 0;
+
+ f2fs_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO);
+ if (!f2fs_lookup_read_extent_cache_block(dn->inode, index,
+ &dn->data_blkaddr))
+ err = f2fs_reserve_block(dn, index);
+ f2fs_map_unlock(sbi, F2FS_GET_BLOCK_PRE_AIO);
+
+ return err;
+}
+
+static int f2fs_map_no_dnode(struct inode *inode,
+ struct f2fs_map_blocks *map, struct dnode_of_data *dn,
+ pgoff_t pgoff)
+{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+
+ /*
+ * There is one exceptional case where read_node_page() may return
+ * -ENOENT because the filesystem has been shut down or hit a
+ * checkpoint error; return -EIO in that case.
+ */
+ if (map->m_may_create &&
+ (is_sbi_flag_set(sbi, SBI_IS_SHUTDOWN) || f2fs_cp_error(sbi)))
+ return -EIO;
+
+ if (map->m_next_pgofs)
+ *map->m_next_pgofs = f2fs_get_next_page_offset(dn, pgoff);
+ if (map->m_next_extent)
+ *map->m_next_extent = f2fs_get_next_page_offset(dn, pgoff);
+ return 0;
+}
+
+static bool f2fs_map_blocks_cached(struct inode *inode,
+ struct f2fs_map_blocks *map, int flag)
+{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ unsigned int maxblocks = map->m_len;
+ pgoff_t pgoff = (pgoff_t)map->m_lblk;
+ struct extent_info ei = {};
+
+ if (!f2fs_lookup_read_extent_cache(inode, pgoff, &ei))
+ return false;
+
+ map->m_pblk = ei.blk + pgoff - ei.fofs;
+ map->m_len = min((pgoff_t)maxblocks, ei.fofs + ei.len - pgoff);
+ map->m_flags = F2FS_MAP_MAPPED;
+ if (map->m_next_extent)
+ *map->m_next_extent = pgoff + map->m_len;
+
+ /* for hardware encryption, but to avoid potential issues in the future */
+ if (flag == F2FS_GET_BLOCK_DIO)
+ f2fs_wait_on_block_writeback_range(inode,
+ map->m_pblk, map->m_len);
+
+ if (f2fs_allow_multi_device_dio(sbi, flag)) {
+ int bidx = f2fs_target_device_index(sbi, map->m_pblk);
+ struct f2fs_dev_info *dev = &sbi->devs[bidx];
+
+ map->m_bdev = dev->bdev;
+ map->m_pblk -= dev->start_blk;
+ map->m_len = min(map->m_len, dev->end_blk + 1 - map->m_pblk);
} else {
- if (lock)
- f2fs_lock_op(sbi);
- else
- f2fs_unlock_op(sbi);
+ map->m_bdev = inode->i_sb->s_bdev;
}
+ return true;
}
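f2fs_map_blocks_cached() is only consulted when m_may_create is false, which pairs with dropping the separate `create` argument from f2fs_map_blocks() later in this patch: creation intent now travels only in the map structure. A userspace mock of the new call shape (all types and values simplified, names hypothetical):

#include <stdio.h>
#include <stdbool.h>

struct f2fs_map_blocks_mock {
        unsigned long long m_lblk, m_len, m_pblk;
        bool m_may_create;      /* replaces the old 'create' int parameter */
};

enum { F2FS_GET_BLOCK_DEFAULT };

static int f2fs_map_blocks_mock(struct f2fs_map_blocks_mock *map, int flag)
{
        (void)flag;
        if (!map->m_may_create)         /* read path: cache fast path applies */
                map->m_pblk = 0x1000 + map->m_lblk;     /* pretend cache hit */
        return 0;
}

int main(void)
{
        struct f2fs_map_blocks_mock map = {
                .m_lblk = 8, .m_len = 4, .m_may_create = false,
        };

        f2fs_map_blocks_mock(&map, F2FS_GET_BLOCK_DEFAULT);
        printf("pblk=%llu\n", map.m_pblk);
        return 0;
}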
/*
@@ -1472,8 +1534,7 @@ void f2fs_do_map_lock(struct f2fs_sb_info *sbi, int flag, bool lock)
* maps continuous logical blocks to physical blocks, and return such
* info via f2fs_map_blocks structure.
*/
-int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map,
- int create, int flag)
+int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, int flag)
{
unsigned int maxblocks = map->m_len;
struct dnode_of_data dn;
@@ -1483,14 +1544,17 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map,
int err = 0, ofs = 1;
unsigned int ofs_in_node, last_ofs_in_node;
blkcnt_t prealloc;
- struct extent_info ei = {0, };
block_t blkaddr;
unsigned int start_pgofs;
int bidx = 0;
+ bool is_hole;
if (!maxblocks)
return 0;
+ if (!map->m_may_create && f2fs_map_blocks_cached(inode, map, flag))
+ goto out;
+
map->m_bdev = inode->i_sb->s_bdev;
map->m_multidev_dio =
f2fs_allow_multi_device_dio(F2FS_I_SB(inode), flag);
@@ -1502,42 +1566,9 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map,
pgofs = (pgoff_t)map->m_lblk;
end = pgofs + maxblocks;
- if (!create && f2fs_lookup_read_extent_cache(inode, pgofs, &ei)) {
- if (f2fs_lfs_mode(sbi) && flag == F2FS_GET_BLOCK_DIO &&
- map->m_may_create)
- goto next_dnode;
-
- map->m_pblk = ei.blk + pgofs - ei.fofs;
- map->m_len = min((pgoff_t)maxblocks, ei.fofs + ei.len - pgofs);
- map->m_flags = F2FS_MAP_MAPPED;
- if (map->m_next_extent)
- *map->m_next_extent = pgofs + map->m_len;
-
- /* for hardware encryption, but to avoid potential issue in future */
- if (flag == F2FS_GET_BLOCK_DIO)
- f2fs_wait_on_block_writeback_range(inode,
- map->m_pblk, map->m_len);
-
- if (map->m_multidev_dio) {
- block_t blk_addr = map->m_pblk;
-
- bidx = f2fs_target_device_index(sbi, map->m_pblk);
-
- map->m_bdev = FDEV(bidx).bdev;
- map->m_pblk -= FDEV(bidx).start_blk;
- map->m_len = min(map->m_len,
- FDEV(bidx).end_blk + 1 - map->m_pblk);
-
- if (map->m_may_create)
- f2fs_update_device_state(sbi, inode->i_ino,
- blk_addr, map->m_len);
- }
- goto out;
- }
-
next_dnode:
if (map->m_may_create)
- f2fs_do_map_lock(sbi, flag, true);
+ f2fs_map_lock(sbi, flag);
/* When reading holes, we need its node page */
set_new_dnode(&dn, inode, NULL, NULL, 0);
@@ -1545,29 +1576,8 @@ next_dnode:
if (err) {
if (flag == F2FS_GET_BLOCK_BMAP)
map->m_pblk = 0;
-
- if (err == -ENOENT) {
- /*
- * There is one exceptional case that read_node_page()
- * may return -ENOENT due to filesystem has been
- * shutdown or cp_error, so force to convert error
- * number to EIO for such case.
- */
- if (map->m_may_create &&
- (is_sbi_flag_set(sbi, SBI_IS_SHUTDOWN) ||
- f2fs_cp_error(sbi))) {
- err = -EIO;
- goto unlock_out;
- }
-
- err = 0;
- if (map->m_next_pgofs)
- *map->m_next_pgofs =
- f2fs_get_next_page_offset(&dn, pgofs);
- if (map->m_next_extent)
- *map->m_next_extent =
- f2fs_get_next_page_offset(&dn, pgofs);
- }
+ if (err == -ENOENT)
+ err = f2fs_map_no_dnode(inode, map, &dn, pgofs);
goto unlock_out;
}
@@ -1578,78 +1588,76 @@ next_dnode:
next_block:
blkaddr = f2fs_data_blkaddr(&dn);
-
- if (__is_valid_data_blkaddr(blkaddr) &&
- !f2fs_is_valid_blkaddr(sbi, blkaddr, DATA_GENERIC_ENHANCE)) {
+ is_hole = !__is_valid_data_blkaddr(blkaddr);
+ if (!is_hole &&
+ !f2fs_is_valid_blkaddr(sbi, blkaddr, DATA_GENERIC_ENHANCE)) {
err = -EFSCORRUPTED;
f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR);
goto sync_out;
}
- if (__is_valid_data_blkaddr(blkaddr)) {
- /* use out-place-update for driect IO under LFS mode */
- if (f2fs_lfs_mode(sbi) && flag == F2FS_GET_BLOCK_DIO &&
- map->m_may_create) {
+ /* use out-of-place update for direct IO under LFS mode */
+ if (map->m_may_create &&
+ (is_hole || (f2fs_lfs_mode(sbi) && flag == F2FS_GET_BLOCK_DIO))) {
+ if (unlikely(f2fs_cp_error(sbi))) {
+ err = -EIO;
+ goto sync_out;
+ }
+
+ switch (flag) {
+ case F2FS_GET_BLOCK_PRE_AIO:
+ if (blkaddr == NULL_ADDR) {
+ prealloc++;
+ last_ofs_in_node = dn.ofs_in_node;
+ }
+ break;
+ case F2FS_GET_BLOCK_PRE_DIO:
+ case F2FS_GET_BLOCK_DIO:
err = __allocate_data_block(&dn, map->m_seg_type);
if (err)
goto sync_out;
- blkaddr = dn.data_blkaddr;
+ if (flag == F2FS_GET_BLOCK_PRE_DIO)
+ file_need_truncate(inode);
set_inode_flag(inode, FI_APPEND_WRITE);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ err = -EIO;
+ goto sync_out;
}
- } else {
- if (create) {
- if (unlikely(f2fs_cp_error(sbi))) {
- err = -EIO;
- goto sync_out;
- }
- if (flag == F2FS_GET_BLOCK_PRE_AIO) {
- if (blkaddr == NULL_ADDR) {
- prealloc++;
- last_ofs_in_node = dn.ofs_in_node;
- }
- } else {
- WARN_ON(flag != F2FS_GET_BLOCK_PRE_DIO &&
- flag != F2FS_GET_BLOCK_DIO);
- err = __allocate_data_block(&dn,
- map->m_seg_type);
- if (!err) {
- if (flag == F2FS_GET_BLOCK_PRE_DIO)
- file_need_truncate(inode);
- set_inode_flag(inode, FI_APPEND_WRITE);
- }
- }
- if (err)
- goto sync_out;
+
+ blkaddr = dn.data_blkaddr;
+ if (is_hole)
map->m_flags |= F2FS_MAP_NEW;
- blkaddr = dn.data_blkaddr;
- } else {
- if (f2fs_compressed_file(inode) &&
- f2fs_sanity_check_cluster(&dn) &&
- (flag != F2FS_GET_BLOCK_FIEMAP ||
- IS_ENABLED(CONFIG_F2FS_CHECK_FS))) {
- err = -EFSCORRUPTED;
- f2fs_handle_error(sbi,
- ERROR_CORRUPTED_CLUSTER);
- goto sync_out;
- }
- if (flag == F2FS_GET_BLOCK_BMAP) {
- map->m_pblk = 0;
- goto sync_out;
- }
- if (flag == F2FS_GET_BLOCK_PRECACHE)
- goto sync_out;
- if (flag == F2FS_GET_BLOCK_FIEMAP &&
- blkaddr == NULL_ADDR) {
- if (map->m_next_pgofs)
- *map->m_next_pgofs = pgofs + 1;
- goto sync_out;
- }
- if (flag != F2FS_GET_BLOCK_FIEMAP) {
- /* for defragment case */
+ } else if (is_hole) {
+ if (f2fs_compressed_file(inode) &&
+ f2fs_sanity_check_cluster(&dn) &&
+ (flag != F2FS_GET_BLOCK_FIEMAP ||
+ IS_ENABLED(CONFIG_F2FS_CHECK_FS))) {
+ err = -EFSCORRUPTED;
+ f2fs_handle_error(sbi,
+ ERROR_CORRUPTED_CLUSTER);
+ goto sync_out;
+ }
+
+ switch (flag) {
+ case F2FS_GET_BLOCK_PRECACHE:
+ goto sync_out;
+ case F2FS_GET_BLOCK_BMAP:
+ map->m_pblk = 0;
+ goto sync_out;
+ case F2FS_GET_BLOCK_FIEMAP:
+ if (blkaddr == NULL_ADDR) {
if (map->m_next_pgofs)
*map->m_next_pgofs = pgofs + 1;
goto sync_out;
}
+ break;
+ default:
+ /* for defragment case */
+ if (map->m_next_pgofs)
+ *map->m_next_pgofs = pgofs + 1;
+ goto sync_out;
}
}
@@ -1660,9 +1668,9 @@ next_block:
bidx = f2fs_target_device_index(sbi, blkaddr);
if (map->m_len == 0) {
- /* preallocated unwritten block should be mapped for fiemap. */
+ /* reserved delalloc block should be mapped for fiemap. */
if (blkaddr == NEW_ADDR)
- map->m_flags |= F2FS_MAP_UNWRITTEN;
+ map->m_flags |= F2FS_MAP_DELALLOC;
map->m_flags |= F2FS_MAP_MAPPED;
map->m_pblk = blkaddr;
@@ -1721,7 +1729,7 @@ skip:
f2fs_put_dnode(&dn);
if (map->m_may_create) {
- f2fs_do_map_lock(sbi, flag, false);
+ f2fs_map_unlock(sbi, flag);
f2fs_balance_fs(sbi, dn.node_changed);
}
goto next_dnode;
@@ -1767,11 +1775,11 @@ sync_out:
f2fs_put_dnode(&dn);
unlock_out:
if (map->m_may_create) {
- f2fs_do_map_lock(sbi, flag, false);
+ f2fs_map_unlock(sbi, flag);
f2fs_balance_fs(sbi, dn.node_changed);
}
out:
- trace_f2fs_map_blocks(inode, map, create, flag, err);
+ trace_f2fs_map_blocks(inode, map, flag, err);
return err;
}
@@ -1793,7 +1801,7 @@ bool f2fs_overwrite_io(struct inode *inode, loff_t pos, size_t len)
while (map.m_lblk < last_lblk) {
map.m_len = last_lblk - map.m_lblk;
- err = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_DEFAULT);
+ err = f2fs_map_blocks(inode, &map, F2FS_GET_BLOCK_DEFAULT);
if (err || map.m_len == 0)
return false;
map.m_lblk += map.m_len;
@@ -1967,7 +1975,7 @@ next:
map.m_len = cluster_size - count_in_cluster;
}
- ret = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_FIEMAP);
+ ret = f2fs_map_blocks(inode, &map, F2FS_GET_BLOCK_FIEMAP);
if (ret)
goto out;
@@ -1984,7 +1992,7 @@ next:
compr_appended = false;
/* In a case of compressed cluster, append this to the last extent */
- if (compr_cluster && ((map.m_flags & F2FS_MAP_UNWRITTEN) ||
+ if (compr_cluster && ((map.m_flags & F2FS_MAP_DELALLOC) ||
!(map.m_flags & F2FS_MAP_FLAGS))) {
compr_appended = true;
goto skip_fill;
@@ -2030,7 +2038,7 @@ skip_fill:
compr_cluster = false;
size += blks_to_bytes(inode, 1);
}
- } else if (map.m_flags & F2FS_MAP_UNWRITTEN) {
+ } else if (map.m_flags & F2FS_MAP_DELALLOC) {
flags = FIEMAP_EXTENT_UNWRITTEN;
}
@@ -2099,7 +2107,7 @@ static int f2fs_read_single_page(struct inode *inode, struct page *page,
map->m_lblk = block_in_file;
map->m_len = last_block - block_in_file;
- ret = f2fs_map_blocks(inode, map, 0, F2FS_GET_BLOCK_DEFAULT);
+ ret = f2fs_map_blocks(inode, map, F2FS_GET_BLOCK_DEFAULT);
if (ret)
goto out;
got_it:
@@ -2136,7 +2144,7 @@ zero_out:
*last_block_in_bio, block_nr) ||
!f2fs_crypt_mergeable_bio(bio, inode, page->index, NULL))) {
submit_and_realloc:
- __submit_bio(F2FS_I_SB(inode), bio, DATA);
+ f2fs_submit_read_bio(F2FS_I_SB(inode), bio, DATA);
bio = NULL;
}
if (bio == NULL) {
@@ -2283,7 +2291,7 @@ skip_reading_dnode:
*last_block_in_bio, blkaddr) ||
!f2fs_crypt_mergeable_bio(bio, inode, page->index, NULL))) {
submit_and_realloc:
- __submit_bio(sbi, bio, DATA);
+ f2fs_submit_read_bio(sbi, bio, DATA);
bio = NULL;
}
@@ -2377,7 +2385,7 @@ static int f2fs_mpage_readpages(struct inode *inode,
#ifdef CONFIG_F2FS_FS_COMPRESSION
if (f2fs_compressed_file(inode)) {
- /* there are remained comressed pages, submit them */
+ /* there are remained compressed pages, submit them */
if (!f2fs_cluster_can_merge_page(&cc, page->index)) {
ret = f2fs_read_multi_pages(&cc, &bio,
max_nr_pages,
@@ -2444,7 +2452,7 @@ next_page:
#endif
}
if (bio)
- __submit_bio(F2FS_I_SB(inode), bio, DATA);
+ f2fs_submit_read_bio(F2FS_I_SB(inode), bio, DATA);
return ret;
}
@@ -2530,34 +2538,29 @@ static inline bool check_inplace_update_policy(struct inode *inode,
struct f2fs_io_info *fio)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
- unsigned int policy = SM_I(sbi)->ipu_policy;
- if (policy & (0x1 << F2FS_IPU_HONOR_OPU_WRITE) &&
- is_inode_flag_set(inode, FI_OPU_WRITE))
+ if (IS_F2FS_IPU_HONOR_OPU_WRITE(sbi) &&
+ is_inode_flag_set(inode, FI_OPU_WRITE))
return false;
- if (policy & (0x1 << F2FS_IPU_FORCE))
+ if (IS_F2FS_IPU_FORCE(sbi))
return true;
- if (policy & (0x1 << F2FS_IPU_SSR) && f2fs_need_SSR(sbi))
+ if (IS_F2FS_IPU_SSR(sbi) && f2fs_need_SSR(sbi))
return true;
- if (policy & (0x1 << F2FS_IPU_UTIL) &&
- utilization(sbi) > SM_I(sbi)->min_ipu_util)
+ if (IS_F2FS_IPU_UTIL(sbi) && utilization(sbi) > SM_I(sbi)->min_ipu_util)
return true;
- if (policy & (0x1 << F2FS_IPU_SSR_UTIL) && f2fs_need_SSR(sbi) &&
- utilization(sbi) > SM_I(sbi)->min_ipu_util)
+ if (IS_F2FS_IPU_SSR_UTIL(sbi) && f2fs_need_SSR(sbi) &&
+ utilization(sbi) > SM_I(sbi)->min_ipu_util)
return true;
/*
* IPU for rewrite async pages
*/
- if (policy & (0x1 << F2FS_IPU_ASYNC) &&
- fio && fio->op == REQ_OP_WRITE &&
- !(fio->op_flags & REQ_SYNC) &&
- !IS_ENCRYPTED(inode))
+ if (IS_F2FS_IPU_ASYNC(sbi) && fio && fio->op == REQ_OP_WRITE &&
+ !(fio->op_flags & REQ_SYNC) && !IS_ENCRYPTED(inode))
return true;
/* this is only set during fdatasync */
- if (policy & (0x1 << F2FS_IPU_FSYNC) &&
- is_inode_flag_set(inode, FI_NEED_IPU))
+ if (IS_F2FS_IPU_FSYNC(sbi) && is_inode_flag_set(inode, FI_NEED_IPU))
return true;
if (unlikely(fio && is_sbi_flag_set(sbi, SBI_CP_DISABLED) &&
@@ -2635,7 +2638,6 @@ int f2fs_do_write_data_page(struct f2fs_io_info *fio)
struct page *page = fio->page;
struct inode *inode = page->mapping->host;
struct dnode_of_data dn;
- struct extent_info ei = {0, };
struct node_info ni;
bool ipu_force = false;
int err = 0;
@@ -2647,9 +2649,8 @@ int f2fs_do_write_data_page(struct f2fs_io_info *fio)
set_new_dnode(&dn, inode, NULL, NULL, 0);
if (need_inplace_update(fio) &&
- f2fs_lookup_read_extent_cache(inode, page->index, &ei)) {
- fio->old_blkaddr = ei.blk + page->index - ei.fofs;
-
+ f2fs_lookup_read_extent_cache_block(inode, page->index,
+ &fio->old_blkaddr)) {
if (!f2fs_is_valid_blkaddr(fio->sbi, fio->old_blkaddr,
DATA_GENERIC_ENHANCE)) {
f2fs_handle_error(fio->sbi,
@@ -2699,7 +2700,6 @@ got_it:
goto out_writepage;
set_page_writeback(page);
- ClearPageError(page);
f2fs_put_dnode(&dn);
if (fio->need_lock == LOCK_REQ)
f2fs_unlock_op(fio->sbi);
@@ -2735,7 +2735,6 @@ got_it:
goto out_writepage;
set_page_writeback(page);
- ClearPageError(page);
if (fio->compr_blocks && fio->old_blkaddr == COMPRESS_ADDR)
f2fs_i_compr_blocks_update(inode, fio->compr_blocks - 1, false);
@@ -2780,10 +2779,10 @@ int f2fs_write_single_data_page(struct page *page, int *submitted,
.old_blkaddr = NULL_ADDR,
.page = page,
.encrypted_page = NULL,
- .submitted = false,
+ .submitted = 0,
.compr_blocks = compr_blocks,
.need_lock = LOCK_RETRY,
- .post_read = f2fs_post_read_required(inode),
+ .post_read = f2fs_post_read_required(inode) ? 1 : 0,
.io_type = io_type,
.io_wbc = wbc,
.bio = bio,
@@ -2792,7 +2791,7 @@ int f2fs_write_single_data_page(struct page *page, int *submitted,
trace_f2fs_writepage(page, DATA);
- /* we should bypass data pages to proceed the kworkder jobs */
+ /* we should bypass data pages to let the kworker jobs proceed */
if (unlikely(f2fs_cp_error(sbi))) {
mapping_set_error(page->mapping, -EIO);
/*
@@ -2904,14 +2903,14 @@ out:
}
if (submitted)
- *submitted = fio.submitted ? 1 : 0;
+ *submitted = fio.submitted;
return 0;
redirty_out:
redirty_page_for_writepage(wbc, page);
/*
- * pageout() in MM traslates EAGAIN, so calls handle_write_error()
+ * pageout() in MM translates EAGAIN, so calls handle_write_error()
* -> mapping_set_error() -> set_bit(AS_EIO, ...).
* file_write_and_wait_range() will see EIO error, which is critical
* to return value of fsync() followed by atomic_write failure to user.
@@ -2945,7 +2944,7 @@ out:
}
/*
- * This function was copied from write_cche_pages from mm/page-writeback.c.
+ * This function was copied from write_cache_pages in mm/page-writeback.c.
 * The major change is that it writes cold data pages separately from
 * warm/hot data pages.
*/
@@ -3354,9 +3353,8 @@ static int prepare_write_begin(struct f2fs_sb_info *sbi,
struct dnode_of_data dn;
struct page *ipage;
bool locked = false;
- struct extent_info ei = {0, };
+ int flag = F2FS_GET_BLOCK_PRE_AIO;
int err = 0;
- int flag;
/*
* If a whole page is being written and we already preallocated all the
@@ -3366,14 +3364,13 @@ static int prepare_write_begin(struct f2fs_sb_info *sbi,
return 0;
/* f2fs_lock_op avoids race between write CP and convert_inline_page */
- if (f2fs_has_inline_data(inode) && pos + len > MAX_INLINE_DATA(inode))
- flag = F2FS_GET_BLOCK_DEFAULT;
- else
- flag = F2FS_GET_BLOCK_PRE_AIO;
-
- if (f2fs_has_inline_data(inode) ||
- (pos & PAGE_MASK) >= i_size_read(inode)) {
- f2fs_do_map_lock(sbi, flag, true);
+ if (f2fs_has_inline_data(inode)) {
+ if (pos + len > MAX_INLINE_DATA(inode))
+ flag = F2FS_GET_BLOCK_DEFAULT;
+ f2fs_map_lock(sbi, flag);
+ locked = true;
+ } else if ((pos & PAGE_MASK) >= i_size_read(inode)) {
+ f2fs_map_lock(sbi, flag);
locked = true;
}
@@ -3393,40 +3390,40 @@ restart:
set_inode_flag(inode, FI_DATA_EXIST);
if (inode->i_nlink)
set_page_private_inline(ipage);
- } else {
- err = f2fs_convert_inline_page(&dn, page);
- if (err)
- goto out;
- if (dn.data_blkaddr == NULL_ADDR)
- err = f2fs_get_block(&dn, index);
- }
- } else if (locked) {
- err = f2fs_get_block(&dn, index);
- } else {
- if (f2fs_lookup_read_extent_cache(inode, index, &ei)) {
- dn.data_blkaddr = ei.blk + index - ei.fofs;
- } else {
- /* hole case */
- err = f2fs_get_dnode_of_data(&dn, index, LOOKUP_NODE);
- if (err || dn.data_blkaddr == NULL_ADDR) {
- f2fs_put_dnode(&dn);
- f2fs_do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO,
- true);
- WARN_ON(flag != F2FS_GET_BLOCK_PRE_AIO);
- locked = true;
- goto restart;
- }
+ goto out;
}
+ err = f2fs_convert_inline_page(&dn, page);
+ if (err || dn.data_blkaddr != NULL_ADDR)
+ goto out;
}
- /* convert_inline_page can make node_changed */
- *blk_addr = dn.data_blkaddr;
- *node_changed = dn.node_changed;
+ if (!f2fs_lookup_read_extent_cache_block(inode, index,
+ &dn.data_blkaddr)) {
+ if (locked) {
+ err = f2fs_reserve_block(&dn, index);
+ goto out;
+ }
+
+ /* hole case */
+ err = f2fs_get_dnode_of_data(&dn, index, LOOKUP_NODE);
+ if (!err && dn.data_blkaddr != NULL_ADDR)
+ goto out;
+ f2fs_put_dnode(&dn);
+ f2fs_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO);
+ WARN_ON(flag != F2FS_GET_BLOCK_PRE_AIO);
+ locked = true;
+ goto restart;
+ }
out:
+ if (!err) {
+ /* convert_inline_page can make node_changed */
+ *blk_addr = dn.data_blkaddr;
+ *node_changed = dn.node_changed;
+ }
f2fs_put_dnode(&dn);
unlock_out:
if (locked)
- f2fs_do_map_lock(sbi, flag, false);
+ f2fs_map_unlock(sbi, flag);
return err;
}
@@ -3435,7 +3432,6 @@ static int __find_data_block(struct inode *inode, pgoff_t index,
{
struct dnode_of_data dn;
struct page *ipage;
- struct extent_info ei = {0, };
int err = 0;
ipage = f2fs_get_node_page(F2FS_I_SB(inode), inode->i_ino);
@@ -3444,9 +3440,8 @@ static int __find_data_block(struct inode *inode, pgoff_t index,
set_new_dnode(&dn, inode, ipage, ipage, 0);
- if (f2fs_lookup_read_extent_cache(inode, index, &ei)) {
- dn.data_blkaddr = ei.blk + index - ei.fofs;
- } else {
+ if (!f2fs_lookup_read_extent_cache_block(inode, index,
+ &dn.data_blkaddr)) {
/* hole case */
err = f2fs_get_dnode_of_data(&dn, index, LOOKUP_NODE);
if (err) {
@@ -3467,7 +3462,7 @@ static int __reserve_data_block(struct inode *inode, pgoff_t index,
struct page *ipage;
int err = 0;
- f2fs_do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, true);
+ f2fs_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO);
ipage = f2fs_get_node_page(sbi, inode->i_ino);
if (IS_ERR(ipage)) {
@@ -3476,14 +3471,16 @@ static int __reserve_data_block(struct inode *inode, pgoff_t index,
}
set_new_dnode(&dn, inode, ipage, ipage, 0);
- err = f2fs_get_block(&dn, index);
+ if (!f2fs_lookup_read_extent_cache_block(dn.inode, index,
+ &dn.data_blkaddr))
+ err = f2fs_reserve_block(&dn, index);
*blk_addr = dn.data_blkaddr;
*node_changed = dn.node_changed;
f2fs_put_dnode(&dn);
unlock_out:
- f2fs_do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, false);
+ f2fs_map_unlock(sbi, F2FS_GET_BLOCK_PRE_AIO);
return err;
}
@@ -3729,6 +3726,7 @@ void f2fs_invalidate_folio(struct folio *folio, size_t offset, size_t length)
}
}
+ clear_page_private_reference(&folio->page);
clear_page_private_gcing(&folio->page);
if (test_opt(sbi, COMPRESS_CACHE) &&
@@ -3754,6 +3752,7 @@ bool f2fs_release_folio(struct folio *folio, gfp_t wait)
clear_page_private_data(&folio->page);
}
+ clear_page_private_reference(&folio->page);
clear_page_private_gcing(&folio->page);
folio_detach_private(folio);
@@ -3835,7 +3834,7 @@ static sector_t f2fs_bmap(struct address_space *mapping, sector_t block)
map.m_next_pgofs = NULL;
map.m_seg_type = NO_CHECK_TYPE;
- if (!f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_BMAP))
+ if (!f2fs_map_blocks(inode, &map, F2FS_GET_BLOCK_BMAP))
blknr = map.m_pblk;
}
out:
@@ -3943,7 +3942,7 @@ retry:
map.m_seg_type = NO_CHECK_TYPE;
map.m_may_create = false;
- ret = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_FIEMAP);
+ ret = f2fs_map_blocks(inode, &map, F2FS_GET_BLOCK_FIEMAP);
if (ret)
goto out;
@@ -4168,8 +4167,7 @@ static int f2fs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
if (flags & IOMAP_WRITE)
map.m_may_create = true;
- err = f2fs_map_blocks(inode, &map, flags & IOMAP_WRITE,
- F2FS_GET_BLOCK_DIO);
+ err = f2fs_map_blocks(inode, &map, F2FS_GET_BLOCK_DIO);
if (err)
return err;
@@ -4182,20 +4180,24 @@ static int f2fs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
*/
map.m_len = fscrypt_limit_io_blocks(inode, map.m_lblk, map.m_len);
- if (map.m_flags & (F2FS_MAP_MAPPED | F2FS_MAP_UNWRITTEN)) {
- iomap->length = blks_to_bytes(inode, map.m_len);
- if (map.m_flags & F2FS_MAP_MAPPED) {
- iomap->type = IOMAP_MAPPED;
- iomap->flags |= IOMAP_F_MERGED;
- } else {
- iomap->type = IOMAP_UNWRITTEN;
- }
- if (WARN_ON_ONCE(!__is_valid_data_blkaddr(map.m_pblk)))
- return -EINVAL;
+ /*
+ * We should never see delalloc or compressed extents here based on
+ * prior flushing and checks.
+ */
+ if (WARN_ON_ONCE(map.m_pblk == NEW_ADDR))
+ return -EINVAL;
+ if (WARN_ON_ONCE(map.m_pblk == COMPRESS_ADDR))
+ return -EINVAL;
+ if (map.m_pblk != NULL_ADDR) {
+ iomap->length = blks_to_bytes(inode, map.m_len);
+ iomap->type = IOMAP_MAPPED;
+ iomap->flags |= IOMAP_F_MERGED;
iomap->bdev = map.m_bdev;
iomap->addr = blks_to_bytes(inode, map.m_pblk);
} else {
+ if (flags & IOMAP_WRITE)
+ return -ENOTBLK;
iomap->length = blks_to_bytes(inode, next_pgofs) -
iomap->offset;
iomap->type = IOMAP_HOLE;
diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c
index 32af4f0c5735..30a77936e3c5 100644
--- a/fs/f2fs/debug.c
+++ b/fs/f2fs/debug.c
@@ -354,6 +354,17 @@ static char *s_flag[] = {
[SBI_IS_FREEZING] = " freezefs",
};
+static const char *ipu_mode_names[F2FS_IPU_MAX] = {
+ [F2FS_IPU_FORCE] = "FORCE",
+ [F2FS_IPU_SSR] = "SSR",
+ [F2FS_IPU_UTIL] = "UTIL",
+ [F2FS_IPU_SSR_UTIL] = "SSR_UTIL",
+ [F2FS_IPU_FSYNC] = "FSYNC",
+ [F2FS_IPU_ASYNC] = "ASYNC",
+ [F2FS_IPU_NOCACHE] = "NOCACHE",
+ [F2FS_IPU_HONOR_OPU_WRITE] = "HONOR_OPU_WRITE",
+};
+
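stat_show() below walks SM_I(sbi)->ipu_policy with for_each_set_bit() and prints one name per set policy bit. The same loop modeled in userspace with plain bit tests (names taken from the table above):

#include <stdio.h>

static const char *ipu_mode_names[] = {
        "FORCE", "SSR", "UTIL", "SSR_UTIL",
        "FSYNC", "ASYNC", "NOCACHE", "HONOR_OPU_WRITE",
};

int main(void)
{
        unsigned long policy = (1UL << 1) | (1UL << 2); /* SSR | UTIL */
        int j;

        for (j = 0; j < 8; j++)
                if (policy & (1UL << j))
                        printf(" %s", ipu_mode_names[j]);
        printf("\n");
        return 0;
}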
static int stat_show(struct seq_file *s, void *v)
{
struct f2fs_stat_info *si;
@@ -362,16 +373,18 @@ static int stat_show(struct seq_file *s, void *v)
raw_spin_lock_irqsave(&f2fs_stat_lock, flags);
list_for_each_entry(si, &f2fs_stat_list, stat_list) {
- update_general_status(si->sbi);
+ struct f2fs_sb_info *sbi = si->sbi;
+
+ update_general_status(sbi);
seq_printf(s, "\n=====[ partition info(%pg). #%d, %s, CP: %s]=====\n",
- si->sbi->sb->s_bdev, i++,
- f2fs_readonly(si->sbi->sb) ? "RO" : "RW",
- is_set_ckpt_flags(si->sbi, CP_DISABLED_FLAG) ?
- "Disabled" : (f2fs_cp_error(si->sbi) ? "Error" : "Good"));
- if (si->sbi->s_flag) {
+ sbi->sb->s_bdev, i++,
+ f2fs_readonly(sbi->sb) ? "RO" : "RW",
+ is_set_ckpt_flags(sbi, CP_DISABLED_FLAG) ?
+ "Disabled" : (f2fs_cp_error(sbi) ? "Error" : "Good"));
+ if (sbi->s_flag) {
seq_puts(s, "[SBI:");
- for_each_set_bit(j, &si->sbi->s_flag, 32)
+ for_each_set_bit(j, &sbi->s_flag, 32)
seq_puts(s, s_flag[j]);
seq_puts(s, "]\n");
}
@@ -383,8 +396,21 @@ static int stat_show(struct seq_file *s, void *v)
si->overp_segs, si->rsvd_segs);
seq_printf(s, "Current Time Sec: %llu / Mounted Time Sec: %llu\n\n",
ktime_get_boottime_seconds(),
- SIT_I(si->sbi)->mounted_time);
- if (test_opt(si->sbi, DISCARD))
+ SIT_I(sbi)->mounted_time);
+
+ seq_puts(s, "Policy:\n");
+ seq_puts(s, " - IPU: [");
+ if (IS_F2FS_IPU_DISABLE(sbi)) {
+ seq_puts(s, " DISABLE");
+ } else {
+ unsigned long policy = SM_I(sbi)->ipu_policy;
+
+ for_each_set_bit(j, &policy, F2FS_IPU_MAX)
+ seq_printf(s, " %s", ipu_mode_names[j]);
+ }
+ seq_puts(s, " ]\n\n");
+
+ if (test_opt(sbi, DISCARD))
seq_printf(s, "Utilization: %u%% (%u valid blocks, %u discard blocks)\n",
si->utilization, si->valid_count, si->discard_blks);
else
@@ -491,15 +517,15 @@ static int stat_show(struct seq_file *s, void *v)
seq_printf(s, " - node segments : %d (%d)\n",
si->node_segs, si->bg_node_segs);
seq_puts(s, " - Reclaimed segs :\n");
- seq_printf(s, " - Normal : %d\n", si->sbi->gc_reclaimed_segs[GC_NORMAL]);
- seq_printf(s, " - Idle CB : %d\n", si->sbi->gc_reclaimed_segs[GC_IDLE_CB]);
+ seq_printf(s, " - Normal : %d\n", sbi->gc_reclaimed_segs[GC_NORMAL]);
+ seq_printf(s, " - Idle CB : %d\n", sbi->gc_reclaimed_segs[GC_IDLE_CB]);
seq_printf(s, " - Idle Greedy : %d\n",
- si->sbi->gc_reclaimed_segs[GC_IDLE_GREEDY]);
- seq_printf(s, " - Idle AT : %d\n", si->sbi->gc_reclaimed_segs[GC_IDLE_AT]);
+ sbi->gc_reclaimed_segs[GC_IDLE_GREEDY]);
+ seq_printf(s, " - Idle AT : %d\n", sbi->gc_reclaimed_segs[GC_IDLE_AT]);
seq_printf(s, " - Urgent High : %d\n",
- si->sbi->gc_reclaimed_segs[GC_URGENT_HIGH]);
- seq_printf(s, " - Urgent Mid : %d\n", si->sbi->gc_reclaimed_segs[GC_URGENT_MID]);
- seq_printf(s, " - Urgent Low : %d\n", si->sbi->gc_reclaimed_segs[GC_URGENT_LOW]);
+ sbi->gc_reclaimed_segs[GC_URGENT_HIGH]);
+ seq_printf(s, " - Urgent Mid : %d\n", sbi->gc_reclaimed_segs[GC_URGENT_MID]);
+ seq_printf(s, " - Urgent Low : %d\n", sbi->gc_reclaimed_segs[GC_URGENT_LOW]);
seq_printf(s, "Try to move %d blocks (BG: %d)\n", si->tot_blks,
si->bg_data_blks + si->bg_node_blks);
seq_printf(s, " - data blocks : %d (%d)\n", si->data_blks,
@@ -565,7 +591,7 @@ static int stat_show(struct seq_file *s, void *v)
si->ndirty_imeta);
seq_printf(s, " - fsync mark: %4lld\n",
percpu_counter_sum_positive(
- &si->sbi->rf_node_block_count));
+ &sbi->rf_node_block_count));
seq_printf(s, " - NATs: %9d/%9d\n - SITs: %9d/%9d\n",
si->dirty_nats, si->nats, si->dirty_sits, si->sits);
seq_printf(s, " - free_nids: %9d/%9d\n - alloc_nids: %9d\n",
@@ -592,12 +618,12 @@ static int stat_show(struct seq_file *s, void *v)
si->block_count[LFS], si->segment_count[LFS]);
/* segment usage info */
- f2fs_update_sit_info(si->sbi);
+ f2fs_update_sit_info(sbi);
seq_printf(s, "\nBDF: %u, avg. vblocks: %u\n",
si->bimodal, si->avg_vblocks);
/* memory footprint */
- update_mem_info(si->sbi);
+ update_mem_info(sbi);
seq_printf(s, "\nMemory: %llu KB\n",
(si->base_mem + si->cache_mem + si->page_mem) >> 10);
seq_printf(s, " - static: %llu KB\n",
diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
index 8e025157f35c..9ccdbe120425 100644
--- a/fs/f2fs/dir.c
+++ b/fs/f2fs/dir.c
@@ -732,10 +732,8 @@ int f2fs_add_regular_entry(struct inode *dir, const struct f2fs_filename *fname,
}
start:
- if (time_to_inject(F2FS_I_SB(dir), FAULT_DIR_DEPTH)) {
- f2fs_show_injection_info(F2FS_I_SB(dir), FAULT_DIR_DEPTH);
+ if (time_to_inject(F2FS_I_SB(dir), FAULT_DIR_DEPTH))
return -ENOSPC;
- }
if (unlikely(current_depth == MAX_DIR_HASH_DEPTH))
return -ENOSPC;
diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c
index 342af24b2f8c..28b12553f2b3 100644
--- a/fs/f2fs/extent_cache.c
+++ b/fs/f2fs/extent_cache.c
@@ -19,6 +19,31 @@
#include "node.h"
#include <trace/events/f2fs.h>
+bool sanity_check_extent_cache(struct inode *inode)
+{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ struct f2fs_inode_info *fi = F2FS_I(inode);
+ struct extent_info *ei;
+
+ if (!fi->extent_tree[EX_READ])
+ return true;
+
+ ei = &fi->extent_tree[EX_READ]->largest;
+
+ if (ei->len &&
+ (!f2fs_is_valid_blkaddr(sbi, ei->blk,
+ DATA_GENERIC_ENHANCE) ||
+ !f2fs_is_valid_blkaddr(sbi, ei->blk + ei->len - 1,
+ DATA_GENERIC_ENHANCE))) {
+ set_sbi_flag(sbi, SBI_NEED_FSCK);
+ f2fs_warn(sbi, "%s: inode (ino=%lx) extent info [%u, %u, %u] is incorrect, run fsck to fix",
+ __func__, inode->i_ino,
+ ei->blk, ei->fofs, ei->len);
+ return false;
+ }
+ return true;
+}
+
static void __set_extent_info(struct extent_info *ei,
unsigned int fofs, unsigned int len,
block_t blk, bool keep_clen,
@@ -233,7 +258,7 @@ struct rb_node **f2fs_lookup_rb_tree_for_insert(struct f2fs_sb_info *sbi,
* @prev_ex: extent before ofs
* @next_ex: extent after ofs
* @insert_p: insert point for new extent at ofs
- * in order to simpfy the insertion after.
+ * in order to simplify the insertion afterwards.
* tree must stay unchanged between lookup and insertion.
*/
struct rb_entry *f2fs_lookup_rb_tree_ret(struct rb_root_cached *root,
@@ -718,7 +743,7 @@ static void __update_extent_tree_range(struct inode *inode,
if (!en)
en = next_en;
- /* 2. invlidate all extent nodes in range [fofs, fofs + len - 1] */
+ /* 2. invalidate all extent nodes in range [fofs, fofs + len - 1] */
while (en && en->ei.fofs < end) {
unsigned int org_end;
int parts = 0; /* # of parts current extent split into */
@@ -871,14 +896,23 @@ unlock_out:
}
#endif
-static unsigned long long __calculate_block_age(unsigned long long new,
+static unsigned long long __calculate_block_age(struct f2fs_sb_info *sbi,
+ unsigned long long new,
unsigned long long old)
{
- unsigned long long diff;
+ unsigned int rem_old, rem_new;
+ unsigned long long res;
+ unsigned int weight = sbi->last_age_weight;
+
+ res = div_u64_rem(new, 100, &rem_new) * (100 - weight)
+ + div_u64_rem(old, 100, &rem_old) * weight;
- diff = (new >= old) ? new - (new - old) : new + (old - new);
+ if (rem_new)
+ res += rem_new * (100 - weight) / 100;
+ if (rem_old)
+ res += rem_old * weight / 100;
- return div_u64(diff * LAST_AGE_WEIGHT, 100);
+ return res;
}
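The rewritten __calculate_block_age() computes a genuine weighted average, roughly age = new * (100 - weight) / 100 + old * weight / 100, splitting each term with div_u64_rem() so no partial product overflows 64 bits; the old expression reduced to `old` in both branches and so ignored the new sample entirely. A userspace model, with / and % standing in for div_u64_rem():

#include <stdio.h>
#include <stdint.h>

static uint64_t calc_age(uint64_t new, uint64_t old, unsigned int weight)
{
        /* quotient parts: safe against overflow for large inputs */
        uint64_t res = (new / 100) * (100 - weight) + (old / 100) * weight;

        /* remainder parts: add back the sub-100 contributions */
        if (new % 100)
                res += (new % 100) * (100 - weight) / 100;
        if (old % 100)
                res += (old % 100) * weight / 100;
        return res;
}

int main(void)
{
        /* weight 30 leans 70% toward the new sample: 0.7*1000 + 0.3*400 = 820 */
        printf("%llu\n", (unsigned long long)calc_age(1000, 400, 30));
        return 0;
}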
/* This returns a new age and allocated blocks in ei */
@@ -910,7 +944,7 @@ static int __get_new_block_age(struct inode *inode, struct extent_info *ei,
cur_age = ULLONG_MAX - tei.last_blocks + cur_blocks;
if (tei.age)
- ei->age = __calculate_block_age(cur_age, tei.age);
+ ei->age = __calculate_block_age(sbi, cur_age, tei.age);
else
ei->age = cur_age;
ei->last_blocks = cur_blocks;
@@ -1047,6 +1081,17 @@ bool f2fs_lookup_read_extent_cache(struct inode *inode, pgoff_t pgofs,
return __lookup_extent_tree(inode, pgofs, ei, EX_READ);
}
+bool f2fs_lookup_read_extent_cache_block(struct inode *inode, pgoff_t index,
+ block_t *blkaddr)
+{
+ struct extent_info ei = {};
+
+ if (!f2fs_lookup_read_extent_cache(inode, index, &ei))
+ return false;
+ *blkaddr = ei.blk + index - ei.fofs;
+ return true;
+}
+
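f2fs_lookup_read_extent_cache_block() folds the recurring two-step pattern — hit the read extent cache, then offset into the extent with ei.blk + index - ei.fofs — into a single call, which is what lets so many call sites in this patch shrink. A userspace model of the helper, with the types and the cache lookup mocked:

#include <stdio.h>
#include <stdbool.h>

struct extent_info_mock { unsigned int fofs, len, blk; };

static bool lookup_cache(unsigned int index, struct extent_info_mock *ei)
{
        /* pretend the cache holds one extent: file blocks [10,18) at 500 */
        *ei = (struct extent_info_mock){ .fofs = 10, .len = 8, .blk = 500 };
        return index >= ei->fofs && index < ei->fofs + ei->len;
}

static bool lookup_cache_block(unsigned int index, unsigned int *blkaddr)
{
        struct extent_info_mock ei;

        if (!lookup_cache(index, &ei))
                return false;
        *blkaddr = ei.blk + index - ei.fofs;    /* offset into the extent */
        return true;
}

int main(void)
{
        unsigned int blkaddr;

        if (lookup_cache_block(12, &blkaddr))
                printf("blkaddr=%u\n", blkaddr);        /* 500 + 12 - 10 = 502 */
        return 0;
}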
void f2fs_update_read_extent_cache(struct dnode_of_data *dn)
{
return __update_extent_cache(dn, EX_READ);
@@ -1226,6 +1271,7 @@ void f2fs_init_extent_cache_info(struct f2fs_sb_info *sbi)
atomic64_set(&sbi->allocated_data_blocks, 0);
sbi->hot_data_age_threshold = DEF_HOT_DATA_AGE_THRESHOLD;
sbi->warm_data_age_threshold = DEF_WARM_DATA_AGE_THRESHOLD;
+ sbi->last_age_weight = LAST_AGE_WEIGHT;
}
int __init f2fs_create_extent_cache(void)
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 9a3ffa39ad30..b0ab2062038a 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -402,7 +402,6 @@ struct discard_cmd_control {
struct list_head wait_list; /* store on-flushing entries */
struct list_head fstrim_list; /* in-flight discard from fstrim */
wait_queue_head_t discard_wait_queue; /* waiting queue for wake-up */
- unsigned int discard_wake; /* to wake up discard thread */
struct mutex cmd_lock;
unsigned int nr_discards; /* # of discards in the list */
unsigned int max_discards; /* max. discards to be issued */
@@ -410,6 +409,7 @@ struct discard_cmd_control {
unsigned int min_discard_issue_time; /* min. interval between discard issue */
unsigned int mid_discard_issue_time; /* mid. interval between discard issue */
unsigned int max_discard_issue_time; /* max. interval between discard issue */
+ unsigned int discard_io_aware_gran; /* minimum granularity below which discard is not I/O-aware */
unsigned int discard_urgent_util; /* utilization which issue discard proactively */
unsigned int discard_granularity; /* discard granularity */
unsigned int max_ordered_discard; /* maximum discard granularity issued by lba order */
@@ -420,6 +420,7 @@ struct discard_cmd_control {
atomic_t discard_cmd_cnt; /* # of cached cmd count */
struct rb_root_cached root; /* root of discard rb-tree */
bool rbtree_check; /* config for consistence check */
+ bool discard_wake; /* to wake up discard thread */
};
/* for the list of fsync inodes, used only during recovery */
@@ -692,15 +693,13 @@ struct extent_tree_info {
};
/*
- * This structure is taken from ext4_map_blocks.
- *
- * Note that, however, f2fs uses NEW and MAPPED flags for f2fs_map_blocks().
+ * State of block returned by f2fs_map_blocks.
*/
-#define F2FS_MAP_NEW (1 << BH_New)
-#define F2FS_MAP_MAPPED (1 << BH_Mapped)
-#define F2FS_MAP_UNWRITTEN (1 << BH_Unwritten)
+#define F2FS_MAP_NEW (1U << 0)
+#define F2FS_MAP_MAPPED (1U << 1)
+#define F2FS_MAP_DELALLOC (1U << 2)
#define F2FS_MAP_FLAGS (F2FS_MAP_NEW | F2FS_MAP_MAPPED |\
- F2FS_MAP_UNWRITTEN)
+ F2FS_MAP_DELALLOC)
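The map flags stop aliasing buffer_head bit numbers (BH_New, BH_Mapped, BH_Unwritten) and become plain bits 0-2, with F2FS_MAP_UNWRITTEN renamed to F2FS_MAP_DELALLOC. A standalone check of the new values and the combined mask:

#include <stdio.h>

#define F2FS_MAP_NEW            (1U << 0)
#define F2FS_MAP_MAPPED         (1U << 1)
#define F2FS_MAP_DELALLOC       (1U << 2)
#define F2FS_MAP_FLAGS          (F2FS_MAP_NEW | F2FS_MAP_MAPPED | \
                                 F2FS_MAP_DELALLOC)

int main(void)
{
        unsigned int m_flags = F2FS_MAP_MAPPED | F2FS_MAP_DELALLOC;

        printf("mapped=%d delalloc=%d mask=0x%x\n",
               !!(m_flags & F2FS_MAP_MAPPED),
               !!(m_flags & F2FS_MAP_DELALLOC), F2FS_MAP_FLAGS);
        return 0;
}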
struct f2fs_map_blocks {
struct block_device *m_bdev; /* for multi-device dio */
@@ -870,7 +869,7 @@ struct f2fs_inode_info {
unsigned char i_compress_algorithm; /* algorithm type */
unsigned char i_log_cluster_size; /* log of cluster size */
unsigned char i_compress_level; /* compress level (lz4hc,zstd) */
- unsigned short i_compress_flag; /* compress flag */
+ unsigned char i_compress_flag; /* compress flag */
unsigned int i_cluster_size; /* cluster size */
unsigned int atomic_write_cnt;
@@ -1193,7 +1192,8 @@ enum iostat_type {
FS_META_READ_IO, /* meta read IOs */
/* other */
- FS_DISCARD, /* discard */
+ FS_DISCARD_IO, /* discard */
+ FS_FLUSH_IO, /* flush */
NR_IO_TYPE,
};
@@ -1210,19 +1210,19 @@ struct f2fs_io_info {
struct page *encrypted_page; /* encrypted page */
struct page *compressed_page; /* compressed page */
struct list_head list; /* serialize IOs */
- bool submitted; /* indicate IO submission */
- int need_lock; /* indicate we need to lock cp_rwsem */
- bool in_list; /* indicate fio is in io_list */
- bool is_por; /* indicate IO is from recovery or not */
- bool retry; /* need to reallocate block address */
- int compr_blocks; /* # of compressed block addresses */
- bool encrypted; /* indicate file is encrypted */
- bool post_read; /* require post read */
+ unsigned int compr_blocks; /* # of compressed block addresses */
+ unsigned int need_lock:8; /* indicate we need to lock cp_rwsem */
+ unsigned int version:8; /* version of the node */
+ unsigned int submitted:1; /* indicate IO submission */
+ unsigned int in_list:1; /* indicate fio is in io_list */
+ unsigned int is_por:1; /* indicate IO is from recovery or not */
+ unsigned int retry:1; /* need to reallocate block address */
+ unsigned int encrypted:1; /* indicate file is encrypted */
+ unsigned int post_read:1; /* require post read */
enum iostat_type io_type; /* io type */
struct writeback_control *io_wbc; /* writeback control */
struct bio **bio; /* bio for ipu */
sector_t *last_block; /* last block number in bio */
- unsigned char version; /* version of the node */
};
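Repacking the six bools plus the 8-bit need_lock and version into one unsigned int of bitfields shrinks f2fs_io_info. A toy size comparison — bitfield layout is compiler-dependent, so the exact numbers are illustrative:

#include <stdio.h>
#include <stdbool.h>

struct loose {
        bool submitted, in_list, is_por, retry, encrypted, post_read;
        int need_lock;
        unsigned char version;
};

struct packed {
        unsigned int need_lock:8;
        unsigned int version:8;
        unsigned int submitted:1, in_list:1, is_por:1;
        unsigned int retry:1, encrypted:1, post_read:1;
};

int main(void)
{
        /* typically 16 vs 4 bytes on a 64-bit ABI */
        printf("loose=%zu packed=%zu\n",
               sizeof(struct loose), sizeof(struct packed));
        return 0;
}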
struct bio_entry {
@@ -1384,8 +1384,6 @@ enum {
MEMORY_MODE_LOW, /* memory mode for low memory devices */
};
-
-
static inline int f2fs_test_bit(unsigned int nr, char *addr);
static inline void f2fs_set_bit(unsigned int nr, char *addr);
static inline void f2fs_clear_bit(unsigned int nr, char *addr);
@@ -1396,19 +1394,17 @@ static inline void f2fs_clear_bit(unsigned int nr, char *addr);
* Layout A: lowest bit should be 1
* | bit0 = 1 | bit1 | bit2 | ... | bit MAX | private data .... |
* bit 0 PAGE_PRIVATE_NOT_POINTER
- * bit 1 PAGE_PRIVATE_ATOMIC_WRITE
- * bit 2 PAGE_PRIVATE_DUMMY_WRITE
- * bit 3 PAGE_PRIVATE_ONGOING_MIGRATION
- * bit 4 PAGE_PRIVATE_INLINE_INODE
- * bit 5 PAGE_PRIVATE_REF_RESOURCE
- * bit 6- f2fs private data
+ * bit 1 PAGE_PRIVATE_DUMMY_WRITE
+ * bit 2 PAGE_PRIVATE_ONGOING_MIGRATION
+ * bit 3 PAGE_PRIVATE_INLINE_INODE
+ * bit 4 PAGE_PRIVATE_REF_RESOURCE
+ * bit 5- f2fs private data
*
* Layout B: lowest bit should be 0
* page.private is a wrapped pointer.
*/
enum {
PAGE_PRIVATE_NOT_POINTER, /* private contains non-pointer data */
- PAGE_PRIVATE_ATOMIC_WRITE, /* data page from atomic write path */
PAGE_PRIVATE_DUMMY_WRITE, /* data page for padding aligned IO */
PAGE_PRIVATE_ONGOING_MIGRATION, /* data page which is on-going migrating */
PAGE_PRIVATE_INLINE_INODE, /* inode page contains inline data */
@@ -1450,22 +1446,18 @@ static inline void clear_page_private_##name(struct page *page) \
}
PAGE_PRIVATE_GET_FUNC(nonpointer, NOT_POINTER);
-PAGE_PRIVATE_GET_FUNC(reference, REF_RESOURCE);
PAGE_PRIVATE_GET_FUNC(inline, INLINE_INODE);
PAGE_PRIVATE_GET_FUNC(gcing, ONGOING_MIGRATION);
-PAGE_PRIVATE_GET_FUNC(atomic, ATOMIC_WRITE);
PAGE_PRIVATE_GET_FUNC(dummy, DUMMY_WRITE);
PAGE_PRIVATE_SET_FUNC(reference, REF_RESOURCE);
PAGE_PRIVATE_SET_FUNC(inline, INLINE_INODE);
PAGE_PRIVATE_SET_FUNC(gcing, ONGOING_MIGRATION);
-PAGE_PRIVATE_SET_FUNC(atomic, ATOMIC_WRITE);
PAGE_PRIVATE_SET_FUNC(dummy, DUMMY_WRITE);
PAGE_PRIVATE_CLEAR_FUNC(reference, REF_RESOURCE);
PAGE_PRIVATE_CLEAR_FUNC(inline, INLINE_INODE);
PAGE_PRIVATE_CLEAR_FUNC(gcing, ONGOING_MIGRATION);
-PAGE_PRIVATE_CLEAR_FUNC(atomic, ATOMIC_WRITE);
PAGE_PRIVATE_CLEAR_FUNC(dummy, DUMMY_WRITE);
static inline unsigned long get_page_private_data(struct page *page)
@@ -1679,6 +1671,7 @@ struct f2fs_sb_info {
/* The threshold used for hot and warm data separation */
unsigned int hot_data_age_threshold;
unsigned int warm_data_age_threshold;
+ unsigned int last_age_weight;
/* basic filesystem units */
unsigned int log_sectors_per_block; /* log2 sectors per block */
@@ -1864,8 +1857,9 @@ struct f2fs_sb_info {
#ifdef CONFIG_F2FS_IOSTAT
/* For app/fs IO statistics */
spinlock_t iostat_lock;
- unsigned long long rw_iostat[NR_IO_TYPE];
- unsigned long long prev_rw_iostat[NR_IO_TYPE];
+ unsigned long long iostat_count[NR_IO_TYPE];
+ unsigned long long iostat_bytes[NR_IO_TYPE];
+ unsigned long long prev_iostat_bytes[NR_IO_TYPE];
bool iostat_enable;
unsigned long iostat_next_period;
unsigned int iostat_period_ms;
@@ -1877,12 +1871,10 @@ struct f2fs_sb_info {
};
#ifdef CONFIG_F2FS_FAULT_INJECTION
-#define f2fs_show_injection_info(sbi, type) \
- printk_ratelimited("%sF2FS-fs (%s) : inject %s in %s of %pS\n", \
- KERN_INFO, sbi->sb->s_id, \
- f2fs_fault_name[type], \
- __func__, __builtin_return_address(0))
-static inline bool time_to_inject(struct f2fs_sb_info *sbi, int type)
+#define time_to_inject(sbi, type) __time_to_inject(sbi, type, __func__, \
+ __builtin_return_address(0))
+static inline bool __time_to_inject(struct f2fs_sb_info *sbi, int type,
+ const char *func, const char *parent_func)
{
struct f2fs_fault_info *ffi = &F2FS_OPTION(sbi).fault_info;
@@ -1895,12 +1887,14 @@ static inline bool time_to_inject(struct f2fs_sb_info *sbi, int type)
atomic_inc(&ffi->inject_ops);
if (atomic_read(&ffi->inject_ops) >= ffi->inject_rate) {
atomic_set(&ffi->inject_ops, 0);
+ printk_ratelimited("%sF2FS-fs (%s) : inject %s in %s of %pS\n",
+ KERN_INFO, sbi->sb->s_id, f2fs_fault_name[type],
+ func, parent_func);
return true;
}
return false;
}
#else
-#define f2fs_show_injection_info(sbi, type) do { } while (0)
static inline bool time_to_inject(struct f2fs_sb_info *sbi, int type)
{
return false;
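
The refactor folds f2fs_show_injection_info() into the injection check itself: because time_to_inject() is now a macro, __func__ and __builtin_return_address(0) are evaluated at the call site, so every caller below can drop its duplicated logging block. A userspace sketch of the capture trick (simplified; the sbi argument, rate limiting, and inject_rate accounting are omitted):

#include <stdbool.h>
#include <stdio.h>

#define time_to_inject(type) \
        __time_to_inject(type, __func__, __builtin_return_address(0))

static bool __time_to_inject(int type, const char *func, void *parent)
{
        /* the kernel version rate-limits and checks inject_rate first */
        printf("inject %d in %s of %p\n", type, func, parent);
        return true;
}

static int demo_caller(void)
{
        if (time_to_inject(1))   /* logs "in demo_caller of ..." */
                return -1;
        return 0;
}

int main(void)
{
        return demo_caller() ? 1 : 0;
}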
@@ -2233,10 +2227,8 @@ static inline void f2fs_lock_op(struct f2fs_sb_info *sbi)
static inline int f2fs_trylock_op(struct f2fs_sb_info *sbi)
{
- if (time_to_inject(sbi, FAULT_LOCK_OP)) {
- f2fs_show_injection_info(sbi, FAULT_LOCK_OP);
+ if (time_to_inject(sbi, FAULT_LOCK_OP))
return 0;
- }
return f2fs_down_read_trylock(&sbi->cp_rwsem);
}
@@ -2324,7 +2316,6 @@ static inline int inc_valid_block_count(struct f2fs_sb_info *sbi,
return ret;
if (time_to_inject(sbi, FAULT_BLOCK)) {
- f2fs_show_injection_info(sbi, FAULT_BLOCK);
release = *count;
goto release_quota;
}
@@ -2604,10 +2595,8 @@ static inline int inc_valid_node_count(struct f2fs_sb_info *sbi,
return err;
}
- if (time_to_inject(sbi, FAULT_BLOCK)) {
- f2fs_show_injection_info(sbi, FAULT_BLOCK);
+ if (time_to_inject(sbi, FAULT_BLOCK))
goto enospc;
- }
spin_lock(&sbi->stat_lock);
@@ -2731,11 +2720,8 @@ static inline struct page *f2fs_grab_cache_page(struct address_space *mapping,
if (page)
return page;
- if (time_to_inject(F2FS_M_SB(mapping), FAULT_PAGE_ALLOC)) {
- f2fs_show_injection_info(F2FS_M_SB(mapping),
- FAULT_PAGE_ALLOC);
+ if (time_to_inject(F2FS_M_SB(mapping), FAULT_PAGE_ALLOC))
return NULL;
- }
}
if (!for_write)
@@ -2752,10 +2738,8 @@ static inline struct page *f2fs_pagecache_get_page(
struct address_space *mapping, pgoff_t index,
int fgp_flags, gfp_t gfp_mask)
{
- if (time_to_inject(F2FS_M_SB(mapping), FAULT_PAGE_GET)) {
- f2fs_show_injection_info(F2FS_M_SB(mapping), FAULT_PAGE_GET);
+ if (time_to_inject(F2FS_M_SB(mapping), FAULT_PAGE_GET))
return NULL;
- }
return pagecache_get_page(mapping, index, fgp_flags, gfp_mask);
}
@@ -2805,10 +2789,8 @@ static inline void *f2fs_kmem_cache_alloc(struct kmem_cache *cachep,
if (nofail)
return f2fs_kmem_cache_alloc_nofail(cachep, flags);
- if (time_to_inject(sbi, FAULT_SLAB_ALLOC)) {
- f2fs_show_injection_info(sbi, FAULT_SLAB_ALLOC);
+ if (time_to_inject(sbi, FAULT_SLAB_ALLOC))
return NULL;
- }
return kmem_cache_alloc(cachep, flags);
}
@@ -3382,10 +3364,8 @@ static inline bool is_dot_dotdot(const u8 *name, size_t len)
static inline void *f2fs_kmalloc(struct f2fs_sb_info *sbi,
size_t size, gfp_t flags)
{
- if (time_to_inject(sbi, FAULT_KMALLOC)) {
- f2fs_show_injection_info(sbi, FAULT_KMALLOC);
+ if (time_to_inject(sbi, FAULT_KMALLOC))
return NULL;
- }
return kmalloc(size, flags);
}
@@ -3399,10 +3379,8 @@ static inline void *f2fs_kzalloc(struct f2fs_sb_info *sbi,
static inline void *f2fs_kvmalloc(struct f2fs_sb_info *sbi,
size_t size, gfp_t flags)
{
- if (time_to_inject(sbi, FAULT_KVMALLOC)) {
- f2fs_show_injection_info(sbi, FAULT_KVMALLOC);
+ if (time_to_inject(sbi, FAULT_KVMALLOC))
return NULL;
- }
return kvmalloc(size, flags);
}
@@ -3788,8 +3766,8 @@ int __init f2fs_init_bioset(void);
void f2fs_destroy_bioset(void);
int f2fs_init_bio_entry_cache(void);
void f2fs_destroy_bio_entry_cache(void);
-void f2fs_submit_bio(struct f2fs_sb_info *sbi,
- struct bio *bio, enum page_type type);
+void f2fs_submit_read_bio(struct f2fs_sb_info *sbi, struct bio *bio,
+ enum page_type type);
int f2fs_init_write_merge_io(struct f2fs_sb_info *sbi);
void f2fs_submit_merged_write(struct f2fs_sb_info *sbi, enum page_type type);
void f2fs_submit_merged_write_cond(struct f2fs_sb_info *sbi,
@@ -3808,7 +3786,7 @@ void f2fs_set_data_blkaddr(struct dnode_of_data *dn);
void f2fs_update_data_blkaddr(struct dnode_of_data *dn, block_t blkaddr);
int f2fs_reserve_new_blocks(struct dnode_of_data *dn, blkcnt_t count);
int f2fs_reserve_new_block(struct dnode_of_data *dn);
-int f2fs_get_block(struct dnode_of_data *dn, pgoff_t index);
+int f2fs_get_block_locked(struct dnode_of_data *dn, pgoff_t index);
int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index);
struct page *f2fs_get_read_data_page(struct inode *inode, pgoff_t index,
blk_opf_t op_flags, bool for_write, pgoff_t *next_pgofs);
@@ -3819,9 +3797,7 @@ struct page *f2fs_get_lock_data_page(struct inode *inode, pgoff_t index,
struct page *f2fs_get_new_data_page(struct inode *inode,
struct page *ipage, pgoff_t index, bool new_i_size);
int f2fs_do_write_data_page(struct f2fs_io_info *fio);
-void f2fs_do_map_lock(struct f2fs_sb_info *sbi, int flag, bool lock);
-int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map,
- int create, int flag);
+int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, int flag);
int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
u64 start, u64 len);
int f2fs_encrypt_one_page(struct f2fs_io_info *fio);
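
With the create argument gone from f2fs_map_blocks(), the caller's intent lives entirely in the flag, which is what every updated call site below passes. A standalone sketch of the pattern (hypothetical enum names echoing the F2FS_GET_BLOCK_* flags; whether a flag implies allocation is an assumption for illustration):

#include <stdio.h>

enum map_flag {
        MAP_DEFAULT,             /* lookup only, cf. F2FS_GET_BLOCK_DEFAULT */
        MAP_PRE_AIO,             /* preallocate, cf. F2FS_GET_BLOCK_PRE_AIO */
        MAP_PRE_DIO,             /* preallocate for DIO */
};

static int map_blocks(enum map_flag flag)
{
        int create = (flag != MAP_DEFAULT);  /* derived, no longer passed */

        printf("flag=%d create=%d\n", flag, create);
        return 0;
}

int main(void)
{
        map_blocks(MAP_DEFAULT);             /* was map_blocks(..., 0, flag) */
        map_blocks(MAP_PRE_AIO);             /* was map_blocks(..., 1, flag) */
        return 0;
}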
@@ -4161,6 +4137,7 @@ void f2fs_leave_shrinker(struct f2fs_sb_info *sbi);
/*
* extent_cache.c
*/
+bool sanity_check_extent_cache(struct inode *inode);
struct rb_entry *f2fs_lookup_rb_tree(struct rb_root_cached *root,
struct rb_entry *cached_re, unsigned int ofs);
struct rb_node **f2fs_lookup_rb_tree_ext(struct f2fs_sb_info *sbi,
@@ -4190,6 +4167,8 @@ void f2fs_destroy_extent_cache(void);
void f2fs_init_read_extent_tree(struct inode *inode, struct page *ipage);
bool f2fs_lookup_read_extent_cache(struct inode *inode, pgoff_t pgofs,
struct extent_info *ei);
+bool f2fs_lookup_read_extent_cache_block(struct inode *inode, pgoff_t index,
+ block_t *blkaddr);
void f2fs_update_read_extent_cache(struct dnode_of_data *dn);
void f2fs_update_read_extent_cache_range(struct dnode_of_data *dn,
pgoff_t fofs, block_t blkaddr, unsigned int len);
@@ -4259,7 +4238,7 @@ bool f2fs_compress_write_end(struct inode *inode, void *fsdata,
int f2fs_truncate_partial_cluster(struct inode *inode, u64 from, bool lock);
void f2fs_compress_write_end_io(struct bio *bio, struct page *page);
bool f2fs_is_compress_backend_ready(struct inode *inode);
-int f2fs_init_compress_mempool(void);
+int __init f2fs_init_compress_mempool(void);
void f2fs_destroy_compress_mempool(void);
void f2fs_decompress_cluster(struct decompress_io_ctx *dic, bool in_task);
void f2fs_end_read_compressed_page(struct page *page, bool failed,
@@ -4328,7 +4307,7 @@ static inline struct page *f2fs_compress_control_page(struct page *page)
WARN_ON_ONCE(1);
return ERR_PTR(-EINVAL);
}
-static inline int f2fs_init_compress_mempool(void) { return 0; }
+static inline int __init f2fs_init_compress_mempool(void) { return 0; }
static inline void f2fs_destroy_compress_mempool(void) { }
static inline void f2fs_decompress_cluster(struct decompress_io_ctx *dic,
bool in_task) { }
@@ -4381,9 +4360,8 @@ static inline int set_compress_context(struct inode *inode)
if ((F2FS_I(inode)->i_compress_algorithm == COMPRESS_LZ4 ||
F2FS_I(inode)->i_compress_algorithm == COMPRESS_ZSTD) &&
F2FS_OPTION(sbi).compress_level)
- F2FS_I(inode)->i_compress_flag |=
- F2FS_OPTION(sbi).compress_level <<
- COMPRESS_LEVEL_OFFSET;
+ F2FS_I(inode)->i_compress_level =
+ F2FS_OPTION(sbi).compress_level;
F2FS_I(inode)->i_flags |= F2FS_COMPR_FL;
set_inode_flag(inode, FI_COMPRESSED_FILE);
stat_inc_compr_inode(inode);
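
i_compress_flag narrows to a byte (see the f2fs_inode_info hunk earlier) because the level now lives in its own i_compress_level field; the inode.c hunks later in this patch pack the pair back into the on-disk le16. A userspace sketch of that round trip (COMPRESS_LEVEL_OFFSET assumed to be 8, matching the shift used below):

#include <assert.h>

#define COMPRESS_LEVEL_OFFSET 8         /* assumed, as in f2fs.h */

struct in_core {
        unsigned char flag;             /* i_compress_flag, now a byte */
        unsigned char level;            /* i_compress_level */
};

static unsigned short pack(struct in_core c)
{
        return c.flag | (c.level << COMPRESS_LEVEL_OFFSET);
}

static struct in_core unpack(unsigned short disk)
{
        struct in_core c = {
                .flag  = disk & ((1U << COMPRESS_LEVEL_OFFSET) - 1),
                .level = disk >> COMPRESS_LEVEL_OFFSET,
        };
        return c;
}

int main(void)
{
        struct in_core c = { .flag = 0x1, .level = 9 }; /* e.g. zstd level 9 */
        struct in_core r = unpack(pack(c));

        assert(r.flag == c.flag && r.level == c.level);
        return 0;
}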
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index b90617639743..15dabeac4690 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -113,10 +113,8 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf)
if (need_alloc) {
/* block allocation */
- f2fs_do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, true);
set_new_dnode(&dn, inode, NULL, NULL, 0);
- err = f2fs_get_block(&dn, page->index);
- f2fs_do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, false);
+ err = f2fs_get_block_locked(&dn, page->index);
}
#ifdef CONFIG_F2FS_FS_COMPRESSION
@@ -305,7 +303,7 @@ static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end,
* for OPU case, during fsync(), node can be persisted before
* data when lower device doesn't support write barrier, resulting
* in data corruption after SPO.
- * So for strict fsync mode, force to use atomic write sematics
+ * So for strict fsync mode, force the use of atomic write semantics
* to keep write order between data/node and last node to
* avoid potential data corruption.
*/
@@ -619,7 +617,7 @@ void f2fs_truncate_data_blocks_range(struct dnode_of_data *dn, int count)
fofs = f2fs_start_bidx_of_node(ofs_of_node(dn->node_page),
dn->inode) + ofs;
f2fs_update_read_extent_cache_range(dn, fofs, 0, len);
- f2fs_update_age_extent_cache_range(dn, fofs, nr_free);
+ f2fs_update_age_extent_cache_range(dn, fofs, len);
dec_valid_block_count(sbi, dn->inode, nr_free);
}
dn->ofs_in_node = ofs;
@@ -784,10 +782,8 @@ int f2fs_truncate(struct inode *inode)
trace_f2fs_truncate(inode);
- if (time_to_inject(F2FS_I_SB(inode), FAULT_TRUNCATE)) {
- f2fs_show_injection_info(F2FS_I_SB(inode), FAULT_TRUNCATE);
+ if (time_to_inject(F2FS_I_SB(inode), FAULT_TRUNCATE))
return -EIO;
- }
err = f2fs_dquot_initialize(inode);
if (err)
@@ -1112,7 +1108,7 @@ int f2fs_truncate_hole(struct inode *inode, pgoff_t pg_start, pgoff_t pg_end)
return 0;
}
-static int punch_hole(struct inode *inode, loff_t offset, loff_t len)
+static int f2fs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
{
pgoff_t pg_start, pg_end;
loff_t off_start, off_end;
@@ -1498,6 +1494,7 @@ static int f2fs_do_zero_range(struct dnode_of_data *dn, pgoff_t start,
}
f2fs_update_read_extent_cache_range(dn, start, 0, index - start);
+ f2fs_update_age_extent_cache_range(dn, start, index - start);
return ret;
}
@@ -1684,7 +1681,7 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len)
return ret;
}
-static int expand_inode_data(struct inode *inode, loff_t offset,
+static int f2fs_expand_inode_data(struct inode *inode, loff_t offset,
loff_t len, int mode)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
@@ -1697,7 +1694,7 @@ static int expand_inode_data(struct inode *inode, loff_t offset,
.err_gc_skipped = true,
.nr_free_secs = 0 };
pgoff_t pg_start, pg_end;
- loff_t new_size = i_size_read(inode);
+ loff_t new_size;
loff_t off_end;
block_t expanded = 0;
int err;
@@ -1745,7 +1742,7 @@ next_alloc:
f2fs_unlock_op(sbi);
map.m_seg_type = CURSEG_COLD_DATA_PINNED;
- err = f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_DIO);
+ err = f2fs_map_blocks(inode, &map, F2FS_GET_BLOCK_PRE_DIO);
file_dont_truncate(inode);
f2fs_up_write(&sbi->pin_sem);
@@ -1758,7 +1755,7 @@ next_alloc:
map.m_len = expanded;
} else {
- err = f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_AIO);
+ err = f2fs_map_blocks(inode, &map, F2FS_GET_BLOCK_PRE_AIO);
expanded = map.m_len;
}
out_err:
@@ -1809,7 +1806,7 @@ static long f2fs_fallocate(struct file *file, int mode,
return -EOPNOTSUPP;
/*
- * Pinned file should not support partial trucation since the block
+ * Pinned file should not support partial truncation since the block
* can be used by applications.
*/
if ((f2fs_compressed_file(inode) || f2fs_is_pinned_file(inode)) &&
@@ -1832,7 +1829,7 @@ static long f2fs_fallocate(struct file *file, int mode,
if (offset >= inode->i_size)
goto out;
- ret = punch_hole(inode, offset, len);
+ ret = f2fs_punch_hole(inode, offset, len);
} else if (mode & FALLOC_FL_COLLAPSE_RANGE) {
ret = f2fs_collapse_range(inode, offset, len);
} else if (mode & FALLOC_FL_ZERO_RANGE) {
@@ -1840,7 +1837,7 @@ static long f2fs_fallocate(struct file *file, int mode,
} else if (mode & FALLOC_FL_INSERT_RANGE) {
ret = f2fs_insert_range(inode, offset, len);
} else {
- ret = expand_inode_data(inode, offset, len, mode);
+ ret = f2fs_expand_inode_data(inode, offset, len, mode);
}
if (!ret) {
@@ -1859,14 +1856,17 @@ out:
static int f2fs_release_file(struct inode *inode, struct file *filp)
{
/*
- * f2fs_relase_file is called at every close calls. So we should
+ * f2fs_release_file is called on every close call. So we should
* not drop any in-memory pages on a close issued by another process.
*/
if (!(filp->f_mode & FMODE_WRITE) ||
atomic_read(&inode->i_writecount) != 1)
return 0;
+ inode_lock(inode);
f2fs_abort_atomic_write(inode, true);
+ inode_unlock(inode);
+
return 0;
}
@@ -1880,8 +1880,13 @@ static int f2fs_file_flush(struct file *file, fl_owner_t id)
* until all the writers close its file. Since this should be done
* before dropping file lock, it needs to be done in ->flush.
*/
- if (F2FS_I(inode)->atomic_write_task == current)
+ if (F2FS_I(inode)->atomic_write_task == current &&
+ (current->flags & PF_EXITING)) {
+ inode_lock(inode);
f2fs_abort_atomic_write(inode, true);
+ inode_unlock(inode);
+ }
+
return 0;
}
@@ -2087,19 +2092,28 @@ static int f2fs_ioc_start_atomic_write(struct file *filp, bool truncate)
goto out;
}
- /* Create a COW inode for atomic write */
- pinode = f2fs_iget(inode->i_sb, fi->i_pino);
- if (IS_ERR(pinode)) {
- f2fs_up_write(&fi->i_gc_rwsem[WRITE]);
- ret = PTR_ERR(pinode);
- goto out;
- }
+ /* Check if the inode already has a COW inode */
+ if (fi->cow_inode == NULL) {
+ /* Create a COW inode for atomic write */
+ pinode = f2fs_iget(inode->i_sb, fi->i_pino);
+ if (IS_ERR(pinode)) {
+ f2fs_up_write(&fi->i_gc_rwsem[WRITE]);
+ ret = PTR_ERR(pinode);
+ goto out;
+ }
- ret = f2fs_get_tmpfile(idmap, pinode, &fi->cow_inode);
- iput(pinode);
- if (ret) {
- f2fs_up_write(&fi->i_gc_rwsem[WRITE]);
- goto out;
+ ret = f2fs_get_tmpfile(idmap, pinode, &fi->cow_inode);
+ iput(pinode);
+ if (ret) {
+ f2fs_up_write(&fi->i_gc_rwsem[WRITE]);
+ goto out;
+ }
+
+ set_inode_flag(fi->cow_inode, FI_COW_FILE);
+ clear_inode_flag(fi->cow_inode, FI_INLINE_DATA);
+ } else {
+ /* Reuse the already created COW inode */
+ f2fs_do_truncate_blocks(fi->cow_inode, 0, true);
}
f2fs_write_inode(inode, NULL);
@@ -2107,8 +2121,6 @@ static int f2fs_ioc_start_atomic_write(struct file *filp, bool truncate)
stat_inc_atomic_inode(inode);
set_inode_flag(inode, FI_ATOMIC_FILE);
- set_inode_flag(fi->cow_inode, FI_COW_FILE);
- clear_inode_flag(fi->cow_inode, FI_INLINE_DATA);
isize = i_size_read(inode);
fi->original_i_size = isize;
@@ -2338,6 +2350,7 @@ static int f2fs_ioc_get_encryption_pwsalt(struct file *filp, unsigned long arg)
{
struct inode *inode = file_inode(filp);
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ u8 encrypt_pw_salt[16];
int err;
if (!f2fs_sb_has_encrypt(sbi))
@@ -2362,12 +2375,14 @@ static int f2fs_ioc_get_encryption_pwsalt(struct file *filp, unsigned long arg)
goto out_err;
}
got_it:
- if (copy_to_user((__u8 __user *)arg, sbi->raw_super->encrypt_pw_salt,
- 16))
- err = -EFAULT;
+ memcpy(encrypt_pw_salt, sbi->raw_super->encrypt_pw_salt, 16);
out_err:
f2fs_up_write(&sbi->sb_lock);
mnt_drop_write_file(filp);
+
+ if (!err && copy_to_user((__u8 __user *)arg, encrypt_pw_salt, 16))
+ err = -EFAULT;
+
return err;
}
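
The pwsalt change moves copy_to_user() out from under sb_lock by snapshotting the salt into a stack buffer first; the copy can fault and sleep, so doing uaccess while holding the lock risks long stalls under that lock. A userspace sketch of the snapshot-then-copy pattern (a pthread mutex stands in for sb_lock, memcpy for copy_to_user):

#include <pthread.h>
#include <string.h>

static pthread_mutex_t sb_lock = PTHREAD_MUTEX_INITIALIZER;
static unsigned char super_salt[16];

static int get_salt(unsigned char *user_buf)
{
        unsigned char salt[16];

        pthread_mutex_lock(&sb_lock);
        memcpy(salt, super_salt, sizeof(salt));  /* snapshot under the lock */
        pthread_mutex_unlock(&sb_lock);

        /* the potentially-faulting copy happens lock-free */
        memcpy(user_buf, salt, sizeof(salt));    /* stands in for copy_to_user */
        return 0;
}

int main(void)
{
        unsigned char buf[16];

        return get_salt(buf);
}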
@@ -2524,7 +2539,7 @@ static int f2fs_ioc_gc_range(struct file *filp, unsigned long arg)
return __f2fs_ioc_gc_range(filp, &range);
}
-static int f2fs_ioc_write_checkpoint(struct file *filp, unsigned long arg)
+static int f2fs_ioc_write_checkpoint(struct file *filp)
{
struct inode *inode = file_inode(filp);
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
@@ -2606,7 +2621,7 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi,
*/
while (map.m_lblk < pg_end) {
map.m_len = pg_end - map.m_lblk;
- err = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_DEFAULT);
+ err = f2fs_map_blocks(inode, &map, F2FS_GET_BLOCK_DEFAULT);
if (err)
goto out;
@@ -2653,7 +2668,7 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi,
do_map:
map.m_len = pg_end - map.m_lblk;
- err = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_DEFAULT);
+ err = f2fs_map_blocks(inode, &map, F2FS_GET_BLOCK_DEFAULT);
if (err)
goto clear_out;
@@ -3227,7 +3242,7 @@ int f2fs_precache_extents(struct inode *inode)
map.m_len = end - map.m_lblk;
f2fs_down_write(&fi->i_gc_rwsem[WRITE]);
- err = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_PRECACHE);
+ err = f2fs_map_blocks(inode, &map, F2FS_GET_BLOCK_PRECACHE);
f2fs_up_write(&fi->i_gc_rwsem[WRITE]);
if (err)
return err;
@@ -3238,7 +3253,7 @@ int f2fs_precache_extents(struct inode *inode)
return 0;
}
-static int f2fs_ioc_precache_extents(struct file *filp, unsigned long arg)
+static int f2fs_ioc_precache_extents(struct file *filp)
{
return f2fs_precache_extents(file_inode(filp));
}
@@ -3942,7 +3957,7 @@ static int f2fs_ioc_set_compress_option(struct file *filp, unsigned long arg)
goto out;
}
- if (inode->i_size != 0) {
+ if (F2FS_HAS_BLOCKS(inode)) {
ret = -EFBIG;
goto out;
}
@@ -3995,7 +4010,7 @@ static int redirty_blocks(struct inode *inode, pgoff_t page_idx, int len)
return ret;
}
-static int f2fs_ioc_decompress_file(struct file *filp, unsigned long arg)
+static int f2fs_ioc_decompress_file(struct file *filp)
{
struct inode *inode = file_inode(filp);
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
@@ -4068,7 +4083,7 @@ out:
return ret;
}
-static int f2fs_ioc_compress_file(struct file *filp, unsigned long arg)
+static int f2fs_ioc_compress_file(struct file *filp)
{
struct inode *inode = file_inode(filp);
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
@@ -4184,7 +4199,7 @@ static long __f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
case F2FS_IOC_GARBAGE_COLLECT_RANGE:
return f2fs_ioc_gc_range(filp, arg);
case F2FS_IOC_WRITE_CHECKPOINT:
- return f2fs_ioc_write_checkpoint(filp, arg);
+ return f2fs_ioc_write_checkpoint(filp);
case F2FS_IOC_DEFRAGMENT:
return f2fs_ioc_defragment(filp, arg);
case F2FS_IOC_MOVE_RANGE:
@@ -4198,7 +4213,7 @@ static long __f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
case F2FS_IOC_SET_PIN_FILE:
return f2fs_ioc_set_pin_file(filp, arg);
case F2FS_IOC_PRECACHE_EXTENTS:
- return f2fs_ioc_precache_extents(filp, arg);
+ return f2fs_ioc_precache_extents(filp);
case F2FS_IOC_RESIZE_FS:
return f2fs_ioc_resize_fs(filp, arg);
case FS_IOC_ENABLE_VERITY:
@@ -4224,9 +4239,9 @@ static long __f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
case F2FS_IOC_SET_COMPRESS_OPTION:
return f2fs_ioc_set_compress_option(filp, arg);
case F2FS_IOC_DECOMPRESS_FILE:
- return f2fs_ioc_decompress_file(filp, arg);
+ return f2fs_ioc_decompress_file(filp);
case F2FS_IOC_COMPRESS_FILE:
- return f2fs_ioc_compress_file(filp, arg);
+ return f2fs_ioc_compress_file(filp);
default:
return -ENOTTY;
}
@@ -4341,6 +4356,27 @@ out:
return ret;
}
+static void f2fs_trace_rw_file_path(struct kiocb *iocb, size_t count, int rw)
+{
+ struct inode *inode = file_inode(iocb->ki_filp);
+ char *buf, *path;
+
+ buf = f2fs_kmalloc(F2FS_I_SB(inode), PATH_MAX, GFP_KERNEL);
+ if (!buf)
+ return;
+ path = dentry_path_raw(file_dentry(iocb->ki_filp), buf, PATH_MAX);
+ if (IS_ERR(path))
+ goto free_buf;
+ if (rw == WRITE)
+ trace_f2fs_datawrite_start(inode, iocb->ki_pos, count,
+ current->pid, path, current->comm);
+ else
+ trace_f2fs_dataread_start(inode, iocb->ki_pos, count,
+ current->pid, path, current->comm);
+free_buf:
+ kfree(buf);
+}
+
static ssize_t f2fs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
struct inode *inode = file_inode(iocb->ki_filp);
@@ -4350,24 +4386,9 @@ static ssize_t f2fs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
if (!f2fs_is_compress_backend_ready(inode))
return -EOPNOTSUPP;
- if (trace_f2fs_dataread_start_enabled()) {
- char *p = f2fs_kmalloc(F2FS_I_SB(inode), PATH_MAX, GFP_KERNEL);
- char *path;
-
- if (!p)
- goto skip_read_trace;
+ if (trace_f2fs_dataread_start_enabled())
+ f2fs_trace_rw_file_path(iocb, iov_iter_count(to), READ);
- path = dentry_path_raw(file_dentry(iocb->ki_filp), p, PATH_MAX);
- if (IS_ERR(path)) {
- kfree(p);
- goto skip_read_trace;
- }
-
- trace_f2fs_dataread_start(inode, pos, iov_iter_count(to),
- current->pid, path, current->comm);
- kfree(p);
- }
-skip_read_trace:
if (f2fs_should_use_dio(inode, iocb, to)) {
ret = f2fs_dio_read_iter(iocb, to);
} else {
@@ -4466,7 +4487,7 @@ static int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *iter,
flag = F2FS_GET_BLOCK_PRE_AIO;
}
- ret = f2fs_map_blocks(inode, &map, 1, flag);
+ ret = f2fs_map_blocks(inode, &map, flag);
/* -ENOSPC|-EDQUOT are fine to report the number of allocated blocks. */
if (ret < 0 && !((ret == -ENOSPC || ret == -EDQUOT) && map.m_len > 0))
return ret;
@@ -4673,24 +4694,9 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
if (preallocated < 0) {
ret = preallocated;
} else {
- if (trace_f2fs_datawrite_start_enabled()) {
- char *p = f2fs_kmalloc(F2FS_I_SB(inode),
- PATH_MAX, GFP_KERNEL);
- char *path;
-
- if (!p)
- goto skip_write_trace;
- path = dentry_path_raw(file_dentry(iocb->ki_filp),
- p, PATH_MAX);
- if (IS_ERR(path)) {
- kfree(p);
- goto skip_write_trace;
- }
- trace_f2fs_datawrite_start(inode, orig_pos, orig_count,
- current->pid, path, current->comm);
- kfree(p);
- }
-skip_write_trace:
+ if (trace_f2fs_datawrite_start_enabled())
+ f2fs_trace_rw_file_path(iocb, orig_count, WRITE);
+
/* Do the actual write. */
ret = dio ?
f2fs_dio_write_iter(iocb, from, &may_need_sync) :
@@ -4823,6 +4829,7 @@ long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
case F2FS_IOC32_MOVE_RANGE:
return f2fs_compat_ioc_move_range(file, arg);
case F2FS_IOC_START_ATOMIC_WRITE:
+ case F2FS_IOC_START_ATOMIC_REPLACE:
case F2FS_IOC_COMMIT_ATOMIC_WRITE:
case F2FS_IOC_START_VOLATILE_WRITE:
case F2FS_IOC_RELEASE_VOLATILE_WRITE:
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index 6e2cae3d2e71..0a9dfa459860 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -57,7 +57,7 @@ static int gc_thread_func(void *data)
/* give it a try one time */
if (gc_th->gc_wake)
- gc_th->gc_wake = 0;
+ gc_th->gc_wake = false;
if (try_to_freeze()) {
stat_other_skip_bggc_count(sbi);
@@ -72,11 +72,9 @@ static int gc_thread_func(void *data)
continue;
}
- if (time_to_inject(sbi, FAULT_CHECKPOINT)) {
- f2fs_show_injection_info(sbi, FAULT_CHECKPOINT);
+ if (time_to_inject(sbi, FAULT_CHECKPOINT))
f2fs_stop_checkpoint(sbi, false,
STOP_CP_REASON_FAULT_INJECT);
- }
if (!sb_start_write_trylock(sbi->sb)) {
stat_other_skip_bggc_count(sbi);
@@ -185,7 +183,7 @@ int f2fs_start_gc_thread(struct f2fs_sb_info *sbi)
gc_th->max_sleep_time = DEF_GC_THREAD_MAX_SLEEP_TIME;
gc_th->no_gc_sleep_time = DEF_GC_THREAD_NOGC_SLEEP_TIME;
- gc_th->gc_wake = 0;
+ gc_th->gc_wake = false;
sbi->gc_thread = gc_th;
init_waitqueue_head(&sbi->gc_thread->gc_wait_queue_head);
@@ -1150,7 +1148,6 @@ static int ra_data_block(struct inode *inode, pgoff_t index)
struct address_space *mapping = inode->i_mapping;
struct dnode_of_data dn;
struct page *page;
- struct extent_info ei = {0, };
struct f2fs_io_info fio = {
.sbi = sbi,
.ino = inode->i_ino,
@@ -1159,8 +1156,8 @@ static int ra_data_block(struct inode *inode, pgoff_t index)
.op = REQ_OP_READ,
.op_flags = 0,
.encrypted_page = NULL,
- .in_list = false,
- .retry = false,
+ .in_list = 0,
+ .retry = 0,
};
int err;
@@ -1168,8 +1165,8 @@ static int ra_data_block(struct inode *inode, pgoff_t index)
if (!page)
return -ENOMEM;
- if (f2fs_lookup_read_extent_cache(inode, index, &ei)) {
- dn.data_blkaddr = ei.blk + index - ei.fofs;
+ if (f2fs_lookup_read_extent_cache_block(inode, index,
+ &dn.data_blkaddr)) {
if (unlikely(!f2fs_is_valid_blkaddr(sbi, dn.data_blkaddr,
DATA_GENERIC_ENHANCE_READ))) {
err = -EFSCORRUPTED;
@@ -1248,8 +1245,8 @@ static int move_data_block(struct inode *inode, block_t bidx,
.op = REQ_OP_READ,
.op_flags = 0,
.encrypted_page = NULL,
- .in_list = false,
- .retry = false,
+ .in_list = 0,
+ .retry = 0,
};
struct dnode_of_data dn;
struct f2fs_summary sum;
@@ -1365,7 +1362,6 @@ static int move_data_block(struct inode *inode, block_t bidx,
dec_page_count(fio.sbi, F2FS_DIRTY_META);
set_page_writeback(fio.encrypted_page);
- ClearPageError(page);
fio.op = REQ_OP_WRITE;
fio.op_flags = REQ_SYNC;
diff --git a/fs/f2fs/gc.h b/fs/f2fs/gc.h
index 19b956c2d697..15bd1d680f67 100644
--- a/fs/f2fs/gc.h
+++ b/fs/f2fs/gc.h
@@ -41,7 +41,7 @@ struct f2fs_gc_kthread {
unsigned int no_gc_sleep_time;
/* for changing gc mode */
- unsigned int gc_wake;
+ bool gc_wake;
/* for GC_MERGE mount option */
wait_queue_head_t fggc_wq; /*
diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c
index 21a495234ffd..72269e7efd26 100644
--- a/fs/f2fs/inline.c
+++ b/fs/f2fs/inline.c
@@ -174,7 +174,6 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page)
/* write data page to try to make data consistent */
set_page_writeback(page);
- ClearPageError(page);
fio.old_blkaddr = dn->data_blkaddr;
set_inode_flag(dn->inode, FI_HOT_DATA);
f2fs_outplace_write_data(dn, &fio);
@@ -422,18 +421,17 @@ static int f2fs_move_inline_dirents(struct inode *dir, struct page *ipage,
dentry_blk = page_address(page);
+ /*
+ * Start by zeroing the full block, to ensure that all unused space is
+ * zeroed and no uninitialized memory is leaked to disk.
+ */
+ memset(dentry_blk, 0, F2FS_BLKSIZE);
+
make_dentry_ptr_inline(dir, &src, inline_dentry);
make_dentry_ptr_block(dir, &dst, dentry_blk);
/* copy data from inline dentry block to new dentry block */
memcpy(dst.bitmap, src.bitmap, src.nr_bitmap);
- memset(dst.bitmap + src.nr_bitmap, 0, dst.nr_bitmap - src.nr_bitmap);
- /*
- * we do not need to zero out remainder part of dentry and filename
- * field, since we have used bitmap for marking the usage status of
- * them, besides, we can also ignore copying/zeroing reserved space
- * of dentry block, because them haven't been used so far.
- */
memcpy(dst.dentry, src.dentry, SIZE_OF_DIR_ENTRY * src.max);
memcpy(dst.filename, src.filename, src.max * F2FS_SLOT_LEN);
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index ff6cf66ed46b..7d2e2c0dba65 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -262,22 +262,6 @@ static bool sanity_check_inode(struct inode *inode, struct page *node_page)
return false;
}
- if (fi->extent_tree[EX_READ]) {
- struct extent_info *ei = &fi->extent_tree[EX_READ]->largest;
-
- if (ei->len &&
- (!f2fs_is_valid_blkaddr(sbi, ei->blk,
- DATA_GENERIC_ENHANCE) ||
- !f2fs_is_valid_blkaddr(sbi, ei->blk + ei->len - 1,
- DATA_GENERIC_ENHANCE))) {
- set_sbi_flag(sbi, SBI_NEED_FSCK);
- f2fs_warn(sbi, "%s: inode (ino=%lx) extent info [%u, %u, %u] is incorrect, run fsck to fix",
- __func__, inode->i_ino,
- ei->blk, ei->fofs, ei->len);
- return false;
- }
- }
-
if (f2fs_sanity_check_inline_data(inode)) {
set_sbi_flag(sbi, SBI_NEED_FSCK);
f2fs_warn(sbi, "%s: inode (ino=%lx, mode=%u) should not have inline_data, run fsck to fix",
@@ -413,12 +397,6 @@ static int do_read_inode(struct inode *inode)
fi->i_inline_xattr_size = 0;
}
- if (!sanity_check_inode(inode, node_page)) {
- f2fs_put_page(node_page, 1);
- f2fs_handle_error(sbi, ERROR_CORRUPTED_INODE);
- return -EFSCORRUPTED;
- }
-
/* check data exist */
if (f2fs_has_inline_data(inode) && !f2fs_exist_data(inode))
__recover_inline_status(inode, node_page);
@@ -466,11 +444,17 @@ static int do_read_inode(struct inode *inode)
(fi->i_flags & F2FS_COMPR_FL)) {
if (F2FS_FITS_IN_INODE(ri, fi->i_extra_isize,
i_log_cluster_size)) {
+ unsigned short compress_flag;
+
atomic_set(&fi->i_compr_blocks,
le64_to_cpu(ri->i_compr_blocks));
fi->i_compress_algorithm = ri->i_compress_algorithm;
fi->i_log_cluster_size = ri->i_log_cluster_size;
- fi->i_compress_flag = le16_to_cpu(ri->i_compress_flag);
+ compress_flag = le16_to_cpu(ri->i_compress_flag);
+ fi->i_compress_level = compress_flag >>
+ COMPRESS_LEVEL_OFFSET;
+ fi->i_compress_flag = compress_flag &
+ (BIT(COMPRESS_LEVEL_OFFSET) - 1);
fi->i_cluster_size = 1 << fi->i_log_cluster_size;
set_inode_flag(inode, FI_COMPRESSED_FILE);
}
@@ -482,6 +466,18 @@ static int do_read_inode(struct inode *inode)
f2fs_init_read_extent_tree(inode, node_page);
f2fs_init_age_extent_tree(inode);
+ if (!sanity_check_inode(inode, node_page)) {
+ f2fs_put_page(node_page, 1);
+ f2fs_handle_error(sbi, ERROR_CORRUPTED_INODE);
+ return -EFSCORRUPTED;
+ }
+
+ if (!sanity_check_extent_cache(inode)) {
+ f2fs_put_page(node_page, 1);
+ f2fs_handle_error(sbi, ERROR_CORRUPTED_INODE);
+ return -EFSCORRUPTED;
+ }
+
f2fs_put_page(node_page, 1);
stat_inc_inline_xattr(inode);
@@ -686,13 +682,17 @@ void f2fs_update_inode(struct inode *inode, struct page *node_page)
if (f2fs_sb_has_compression(F2FS_I_SB(inode)) &&
F2FS_FITS_IN_INODE(ri, F2FS_I(inode)->i_extra_isize,
i_log_cluster_size)) {
+ unsigned short compress_flag;
+
ri->i_compr_blocks =
cpu_to_le64(atomic_read(
&F2FS_I(inode)->i_compr_blocks));
ri->i_compress_algorithm =
F2FS_I(inode)->i_compress_algorithm;
- ri->i_compress_flag =
- cpu_to_le16(F2FS_I(inode)->i_compress_flag);
+ compress_flag = F2FS_I(inode)->i_compress_flag |
+ F2FS_I(inode)->i_compress_level <<
+ COMPRESS_LEVEL_OFFSET;
+ ri->i_compress_flag = cpu_to_le16(compress_flag);
ri->i_log_cluster_size =
F2FS_I(inode)->i_log_cluster_size;
}
@@ -714,18 +714,19 @@ void f2fs_update_inode_page(struct inode *inode)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct page *node_page;
+ int count = 0;
retry:
node_page = f2fs_get_node_page(sbi, inode->i_ino);
if (IS_ERR(node_page)) {
int err = PTR_ERR(node_page);
- if (err == -ENOMEM) {
- cond_resched();
+ /* The node block was truncated. */
+ if (err == -ENOENT)
+ return;
+
+ if (err == -ENOMEM || ++count <= DEFAULT_RETRY_IO_COUNT)
goto retry;
- } else if (err != -ENOENT) {
- f2fs_stop_checkpoint(sbi, false,
- STOP_CP_REASON_UPDATE_INODE);
- }
+ f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_UPDATE_INODE);
return;
}
f2fs_update_inode(inode, node_page);
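
f2fs_update_inode_page() now bounds its retries: -ENOENT means the node block was truncated and is silently ignored, -ENOMEM still retries indefinitely, and any other error gets a limited number of attempts before the checkpoint is stopped. A userspace sketch of that retry shape (errno values inlined; DEFAULT_RETRY_IO_COUNT of 8 is an assumption):

#include <stdio.h>

#define DEFAULT_RETRY_IO_COUNT 8        /* assumed, as elsewhere in f2fs */
#define EIO 5
#define ENOMEM 12

static int get_node_page(int attempt)
{
        return attempt < 3 ? -EIO : 0;  /* pretend it succeeds on try 4 */
}

int main(void)
{
        int count = 0, err;
retry:
        err = get_node_page(count);
        if (err) {
                if (err == -ENOMEM || ++count <= DEFAULT_RETRY_IO_COUNT)
                        goto retry;
                puts("stop checkpoint");        /* retries exhausted */
                return 1;
        }
        printf("updated inode after %d retries\n", count);
        return 0;
}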
@@ -766,11 +767,18 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc)
void f2fs_evict_inode(struct inode *inode)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
- nid_t xnid = F2FS_I(inode)->i_xattr_nid;
+ struct f2fs_inode_info *fi = F2FS_I(inode);
+ nid_t xnid = fi->i_xattr_nid;
int err = 0;
f2fs_abort_atomic_write(inode, true);
+ if (fi->cow_inode) {
+ clear_inode_flag(fi->cow_inode, FI_COW_FILE);
+ iput(fi->cow_inode);
+ fi->cow_inode = NULL;
+ }
+
trace_f2fs_evict_inode(inode);
truncate_inode_pages_final(&inode->i_data);
@@ -809,10 +817,8 @@ retry:
if (F2FS_HAS_BLOCKS(inode))
err = f2fs_truncate(inode);
- if (time_to_inject(sbi, FAULT_EVICT_INODE)) {
- f2fs_show_injection_info(sbi, FAULT_EVICT_INODE);
+ if (time_to_inject(sbi, FAULT_EVICT_INODE))
err = -EIO;
- }
if (!err) {
f2fs_lock_op(sbi);
@@ -857,7 +863,7 @@ no_delete:
stat_dec_inline_inode(inode);
stat_dec_compr_inode(inode);
stat_sub_compr_blocks(inode,
- atomic_read(&F2FS_I(inode)->i_compr_blocks));
+ atomic_read(&fi->i_compr_blocks));
if (likely(!f2fs_cp_error(sbi) &&
!is_sbi_flag_set(sbi, SBI_CP_DISABLED)))
diff --git a/fs/f2fs/iostat.c b/fs/f2fs/iostat.c
index 3166a8939ed4..3d5bfb1ad585 100644
--- a/fs/f2fs/iostat.c
+++ b/fs/f2fs/iostat.c
@@ -14,91 +14,79 @@
#include "iostat.h"
#include <trace/events/f2fs.h>
-#define NUM_PREALLOC_IOSTAT_CTXS 128
static struct kmem_cache *bio_iostat_ctx_cache;
static mempool_t *bio_iostat_ctx_pool;
+static inline unsigned long long iostat_get_avg_bytes(struct f2fs_sb_info *sbi,
+ enum iostat_type type)
+{
+ return sbi->iostat_count[type] ? div64_u64(sbi->iostat_bytes[type],
+ sbi->iostat_count[type]) : 0;
+}
+
+#define IOSTAT_INFO_SHOW(name, type) \
+ seq_printf(seq, "%-23s %-16llu %-16llu %-16llu\n", \
+ name":", sbi->iostat_bytes[type], \
+ sbi->iostat_count[type], \
+ iostat_get_avg_bytes(sbi, type))
+
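
Each stat line now prints total bytes, a completion count, and their quotient. A minimal sketch of the zero-guarded average and the resulting column layout (plain 64-bit division standing in for div64_u64; the sample numbers are made up):

#include <stdio.h>

static unsigned long long avg_bytes(unsigned long long bytes,
                                    unsigned long long count)
{
        return count ? bytes / count : 0;       /* div64_u64 in the kernel */
}

int main(void)
{
        printf("%-23s %-16llu %-16llu %-16llu\n", "fs data:",
               3ULL * 4096, 3ULL, avg_bytes(3ULL * 4096, 3));
        printf("%-23s %-16llu %-16llu %-16llu\n", "fs flush:",
               0ULL, 0ULL, avg_bytes(0, 0));    /* no IOs: avg stays 0 */
        return 0;
}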
int __maybe_unused iostat_info_seq_show(struct seq_file *seq, void *offset)
{
struct super_block *sb = seq->private;
struct f2fs_sb_info *sbi = F2FS_SB(sb);
- time64_t now = ktime_get_real_seconds();
if (!sbi->iostat_enable)
return 0;
- seq_printf(seq, "time: %-16llu\n", now);
+ seq_printf(seq, "time: %-16llu\n", ktime_get_real_seconds());
+ seq_printf(seq, "\t\t\t%-16s %-16s %-16s\n",
+ "io_bytes", "count", "avg_bytes");
/* print app write IOs */
seq_puts(seq, "[WRITE]\n");
- seq_printf(seq, "app buffered data: %-16llu\n",
- sbi->rw_iostat[APP_BUFFERED_IO]);
- seq_printf(seq, "app direct data: %-16llu\n",
- sbi->rw_iostat[APP_DIRECT_IO]);
- seq_printf(seq, "app mapped data: %-16llu\n",
- sbi->rw_iostat[APP_MAPPED_IO]);
- seq_printf(seq, "app buffered cdata: %-16llu\n",
- sbi->rw_iostat[APP_BUFFERED_CDATA_IO]);
- seq_printf(seq, "app mapped cdata: %-16llu\n",
- sbi->rw_iostat[APP_MAPPED_CDATA_IO]);
+ IOSTAT_INFO_SHOW("app buffered data", APP_BUFFERED_IO);
+ IOSTAT_INFO_SHOW("app direct data", APP_DIRECT_IO);
+ IOSTAT_INFO_SHOW("app mapped data", APP_MAPPED_IO);
+ IOSTAT_INFO_SHOW("app buffered cdata", APP_BUFFERED_CDATA_IO);
+ IOSTAT_INFO_SHOW("app mapped cdata", APP_MAPPED_CDATA_IO);
/* print fs write IOs */
- seq_printf(seq, "fs data: %-16llu\n",
- sbi->rw_iostat[FS_DATA_IO]);
- seq_printf(seq, "fs cdata: %-16llu\n",
- sbi->rw_iostat[FS_CDATA_IO]);
- seq_printf(seq, "fs node: %-16llu\n",
- sbi->rw_iostat[FS_NODE_IO]);
- seq_printf(seq, "fs meta: %-16llu\n",
- sbi->rw_iostat[FS_META_IO]);
- seq_printf(seq, "fs gc data: %-16llu\n",
- sbi->rw_iostat[FS_GC_DATA_IO]);
- seq_printf(seq, "fs gc node: %-16llu\n",
- sbi->rw_iostat[FS_GC_NODE_IO]);
- seq_printf(seq, "fs cp data: %-16llu\n",
- sbi->rw_iostat[FS_CP_DATA_IO]);
- seq_printf(seq, "fs cp node: %-16llu\n",
- sbi->rw_iostat[FS_CP_NODE_IO]);
- seq_printf(seq, "fs cp meta: %-16llu\n",
- sbi->rw_iostat[FS_CP_META_IO]);
+ IOSTAT_INFO_SHOW("fs data", FS_DATA_IO);
+ IOSTAT_INFO_SHOW("fs cdata", FS_CDATA_IO);
+ IOSTAT_INFO_SHOW("fs node", FS_NODE_IO);
+ IOSTAT_INFO_SHOW("fs meta", FS_META_IO);
+ IOSTAT_INFO_SHOW("fs gc data", FS_GC_DATA_IO);
+ IOSTAT_INFO_SHOW("fs gc node", FS_GC_NODE_IO);
+ IOSTAT_INFO_SHOW("fs cp data", FS_CP_DATA_IO);
+ IOSTAT_INFO_SHOW("fs cp node", FS_CP_NODE_IO);
+ IOSTAT_INFO_SHOW("fs cp meta", FS_CP_META_IO);
/* print app read IOs */
seq_puts(seq, "[READ]\n");
- seq_printf(seq, "app buffered data: %-16llu\n",
- sbi->rw_iostat[APP_BUFFERED_READ_IO]);
- seq_printf(seq, "app direct data: %-16llu\n",
- sbi->rw_iostat[APP_DIRECT_READ_IO]);
- seq_printf(seq, "app mapped data: %-16llu\n",
- sbi->rw_iostat[APP_MAPPED_READ_IO]);
- seq_printf(seq, "app buffered cdata: %-16llu\n",
- sbi->rw_iostat[APP_BUFFERED_CDATA_READ_IO]);
- seq_printf(seq, "app mapped cdata: %-16llu\n",
- sbi->rw_iostat[APP_MAPPED_CDATA_READ_IO]);
+ IOSTAT_INFO_SHOW("app buffered data", APP_BUFFERED_READ_IO);
+ IOSTAT_INFO_SHOW("app direct data", APP_DIRECT_READ_IO);
+ IOSTAT_INFO_SHOW("app mapped data", APP_MAPPED_READ_IO);
+ IOSTAT_INFO_SHOW("app buffered cdata", APP_BUFFERED_CDATA_READ_IO);
+ IOSTAT_INFO_SHOW("app mapped cdata", APP_MAPPED_CDATA_READ_IO);
/* print fs read IOs */
- seq_printf(seq, "fs data: %-16llu\n",
- sbi->rw_iostat[FS_DATA_READ_IO]);
- seq_printf(seq, "fs gc data: %-16llu\n",
- sbi->rw_iostat[FS_GDATA_READ_IO]);
- seq_printf(seq, "fs cdata: %-16llu\n",
- sbi->rw_iostat[FS_CDATA_READ_IO]);
- seq_printf(seq, "fs node: %-16llu\n",
- sbi->rw_iostat[FS_NODE_READ_IO]);
- seq_printf(seq, "fs meta: %-16llu\n",
- sbi->rw_iostat[FS_META_READ_IO]);
+ IOSTAT_INFO_SHOW("fs data", FS_DATA_READ_IO);
+ IOSTAT_INFO_SHOW("fs gc data", FS_GDATA_READ_IO);
+ IOSTAT_INFO_SHOW("fs cdata", FS_CDATA_READ_IO);
+ IOSTAT_INFO_SHOW("fs node", FS_NODE_READ_IO);
+ IOSTAT_INFO_SHOW("fs meta", FS_META_READ_IO);
/* print other IOs */
seq_puts(seq, "[OTHER]\n");
- seq_printf(seq, "fs discard: %-16llu\n",
- sbi->rw_iostat[FS_DISCARD]);
+ IOSTAT_INFO_SHOW("fs discard", FS_DISCARD_IO);
+ IOSTAT_INFO_SHOW("fs flush", FS_FLUSH_IO);
return 0;
}
static inline void __record_iostat_latency(struct f2fs_sb_info *sbi)
{
- int io, idx = 0;
- unsigned int cnt;
+ int io, idx;
struct f2fs_iostat_latency iostat_lat[MAX_IO_TYPE][NR_PAGE_TYPE];
struct iostat_lat_info *io_lat = sbi->iostat_io_lat;
unsigned long flags;
@@ -106,12 +94,11 @@ static inline void __record_iostat_latency(struct f2fs_sb_info *sbi)
spin_lock_irqsave(&sbi->iostat_lat_lock, flags);
for (idx = 0; idx < MAX_IO_TYPE; idx++) {
for (io = 0; io < NR_PAGE_TYPE; io++) {
- cnt = io_lat->bio_cnt[idx][io];
iostat_lat[idx][io].peak_lat =
jiffies_to_msecs(io_lat->peak_lat[idx][io]);
- iostat_lat[idx][io].cnt = cnt;
- iostat_lat[idx][io].avg_lat = cnt ?
- jiffies_to_msecs(io_lat->sum_lat[idx][io]) / cnt : 0;
+ iostat_lat[idx][io].cnt = io_lat->bio_cnt[idx][io];
+ iostat_lat[idx][io].avg_lat = iostat_lat[idx][io].cnt ?
+ jiffies_to_msecs(io_lat->sum_lat[idx][io]) / iostat_lat[idx][io].cnt : 0;
io_lat->sum_lat[idx][io] = 0;
io_lat->peak_lat[idx][io] = 0;
io_lat->bio_cnt[idx][io] = 0;
@@ -141,9 +128,9 @@ static inline void f2fs_record_iostat(struct f2fs_sb_info *sbi)
msecs_to_jiffies(sbi->iostat_period_ms);
for (i = 0; i < NR_IO_TYPE; i++) {
- iostat_diff[i] = sbi->rw_iostat[i] -
- sbi->prev_rw_iostat[i];
- sbi->prev_rw_iostat[i] = sbi->rw_iostat[i];
+ iostat_diff[i] = sbi->iostat_bytes[i] -
+ sbi->prev_iostat_bytes[i];
+ sbi->prev_iostat_bytes[i] = sbi->iostat_bytes[i];
}
spin_unlock_irqrestore(&sbi->iostat_lock, flags);
@@ -159,8 +146,9 @@ void f2fs_reset_iostat(struct f2fs_sb_info *sbi)
spin_lock_irq(&sbi->iostat_lock);
for (i = 0; i < NR_IO_TYPE; i++) {
- sbi->rw_iostat[i] = 0;
- sbi->prev_rw_iostat[i] = 0;
+ sbi->iostat_count[i] = 0;
+ sbi->iostat_bytes[i] = 0;
+ sbi->prev_iostat_bytes[i] = 0;
}
spin_unlock_irq(&sbi->iostat_lock);
@@ -169,6 +157,13 @@ void f2fs_reset_iostat(struct f2fs_sb_info *sbi)
spin_unlock_irq(&sbi->iostat_lat_lock);
}
+static inline void __f2fs_update_iostat(struct f2fs_sb_info *sbi,
+ enum iostat_type type, unsigned long long io_bytes)
+{
+ sbi->iostat_bytes[type] += io_bytes;
+ sbi->iostat_count[type]++;
+}
+
void f2fs_update_iostat(struct f2fs_sb_info *sbi, struct inode *inode,
enum iostat_type type, unsigned long long io_bytes)
{
@@ -178,33 +173,33 @@ void f2fs_update_iostat(struct f2fs_sb_info *sbi, struct inode *inode,
return;
spin_lock_irqsave(&sbi->iostat_lock, flags);
- sbi->rw_iostat[type] += io_bytes;
+ __f2fs_update_iostat(sbi, type, io_bytes);
if (type == APP_BUFFERED_IO || type == APP_DIRECT_IO)
- sbi->rw_iostat[APP_WRITE_IO] += io_bytes;
+ __f2fs_update_iostat(sbi, APP_WRITE_IO, io_bytes);
if (type == APP_BUFFERED_READ_IO || type == APP_DIRECT_READ_IO)
- sbi->rw_iostat[APP_READ_IO] += io_bytes;
+ __f2fs_update_iostat(sbi, APP_READ_IO, io_bytes);
#ifdef CONFIG_F2FS_FS_COMPRESSION
if (inode && f2fs_compressed_file(inode)) {
if (type == APP_BUFFERED_IO)
- sbi->rw_iostat[APP_BUFFERED_CDATA_IO] += io_bytes;
+ __f2fs_update_iostat(sbi, APP_BUFFERED_CDATA_IO, io_bytes);
if (type == APP_BUFFERED_READ_IO)
- sbi->rw_iostat[APP_BUFFERED_CDATA_READ_IO] += io_bytes;
+ __f2fs_update_iostat(sbi, APP_BUFFERED_CDATA_READ_IO, io_bytes);
if (type == APP_MAPPED_READ_IO)
- sbi->rw_iostat[APP_MAPPED_CDATA_READ_IO] += io_bytes;
+ __f2fs_update_iostat(sbi, APP_MAPPED_CDATA_READ_IO, io_bytes);
if (type == APP_MAPPED_IO)
- sbi->rw_iostat[APP_MAPPED_CDATA_IO] += io_bytes;
+ __f2fs_update_iostat(sbi, APP_MAPPED_CDATA_IO, io_bytes);
if (type == FS_DATA_READ_IO)
- sbi->rw_iostat[FS_CDATA_READ_IO] += io_bytes;
+ __f2fs_update_iostat(sbi, FS_CDATA_READ_IO, io_bytes);
if (type == FS_DATA_IO)
- sbi->rw_iostat[FS_CDATA_IO] += io_bytes;
+ __f2fs_update_iostat(sbi, FS_CDATA_IO, io_bytes);
}
#endif
@@ -214,49 +209,48 @@ void f2fs_update_iostat(struct f2fs_sb_info *sbi, struct inode *inode,
}
static inline void __update_iostat_latency(struct bio_iostat_ctx *iostat_ctx,
- int rw, bool is_sync)
+ enum iostat_lat_type lat_type)
{
unsigned long ts_diff;
- unsigned int iotype = iostat_ctx->type;
+ unsigned int page_type = iostat_ctx->type;
struct f2fs_sb_info *sbi = iostat_ctx->sbi;
struct iostat_lat_info *io_lat = sbi->iostat_io_lat;
- int idx;
unsigned long flags;
if (!sbi->iostat_enable)
return;
ts_diff = jiffies - iostat_ctx->submit_ts;
- if (iotype >= META_FLUSH)
- iotype = META;
-
- if (rw == 0) {
- idx = READ_IO;
- } else {
- if (is_sync)
- idx = WRITE_SYNC_IO;
- else
- idx = WRITE_ASYNC_IO;
+ if (page_type == META_FLUSH) {
+ page_type = META;
+ } else if (page_type >= NR_PAGE_TYPE) {
+ f2fs_warn(sbi, "%s: %d over NR_PAGE_TYPE", __func__, page_type);
+ return;
}
spin_lock_irqsave(&sbi->iostat_lat_lock, flags);
- io_lat->sum_lat[idx][iotype] += ts_diff;
- io_lat->bio_cnt[idx][iotype]++;
- if (ts_diff > io_lat->peak_lat[idx][iotype])
- io_lat->peak_lat[idx][iotype] = ts_diff;
+ io_lat->sum_lat[lat_type][page_type] += ts_diff;
+ io_lat->bio_cnt[lat_type][page_type]++;
+ if (ts_diff > io_lat->peak_lat[lat_type][page_type])
+ io_lat->peak_lat[lat_type][page_type] = ts_diff;
spin_unlock_irqrestore(&sbi->iostat_lat_lock, flags);
}
-void iostat_update_and_unbind_ctx(struct bio *bio, int rw)
+void iostat_update_and_unbind_ctx(struct bio *bio)
{
struct bio_iostat_ctx *iostat_ctx = bio->bi_private;
- bool is_sync = bio->bi_opf & REQ_SYNC;
+ enum iostat_lat_type lat_type;
- if (rw == 0)
- bio->bi_private = iostat_ctx->post_read_ctx;
- else
+ if (op_is_write(bio_op(bio))) {
+ lat_type = bio->bi_opf & REQ_SYNC ?
+ WRITE_SYNC_IO : WRITE_ASYNC_IO;
bio->bi_private = iostat_ctx->sbi;
- __update_iostat_latency(iostat_ctx, rw, is_sync);
+ } else {
+ lat_type = READ_IO;
+ bio->bi_private = iostat_ctx->post_read_ctx;
+ }
+
+ __update_iostat_latency(iostat_ctx, lat_type);
mempool_free(iostat_ctx, bio_iostat_ctx_pool);
}
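
The latency bucket is now derived from the bio itself rather than a caller-supplied rw int, so the classification can never disagree with what was actually submitted. A tiny sketch of the mapping (booleans standing in for op_is_write(bio_op(bio)) and the REQ_SYNC test):

#include <assert.h>
#include <stdbool.h>

enum lat_type { READ_IO, WRITE_SYNC_IO, WRITE_ASYNC_IO };

static enum lat_type classify(bool is_write, bool is_sync)
{
        if (!is_write)
                return READ_IO;                 /* reads ignore REQ_SYNC */
        return is_sync ? WRITE_SYNC_IO : WRITE_ASYNC_IO;
}

int main(void)
{
        assert(classify(false, true) == READ_IO);
        assert(classify(true, true) == WRITE_SYNC_IO);
        assert(classify(true, false) == WRITE_ASYNC_IO);
        return 0;
}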
diff --git a/fs/f2fs/iostat.h b/fs/f2fs/iostat.h
index 2c048307b6e0..eb99d05cf272 100644
--- a/fs/f2fs/iostat.h
+++ b/fs/f2fs/iostat.h
@@ -8,20 +8,21 @@
struct bio_post_read_ctx;
+enum iostat_lat_type {
+ READ_IO = 0,
+ WRITE_SYNC_IO,
+ WRITE_ASYNC_IO,
+ MAX_IO_TYPE,
+};
+
#ifdef CONFIG_F2FS_IOSTAT
+#define NUM_PREALLOC_IOSTAT_CTXS 128
#define DEFAULT_IOSTAT_PERIOD_MS 3000
#define MIN_IOSTAT_PERIOD_MS 100
/* maximum period of iostat tracing is 1 day */
#define MAX_IOSTAT_PERIOD_MS 8640000
-enum {
- READ_IO,
- WRITE_SYNC_IO,
- WRITE_ASYNC_IO,
- MAX_IO_TYPE,
-};
-
struct iostat_lat_info {
unsigned long sum_lat[MAX_IO_TYPE][NR_PAGE_TYPE]; /* sum of io latencies */
unsigned long peak_lat[MAX_IO_TYPE][NR_PAGE_TYPE]; /* peak io latency */
@@ -57,7 +58,7 @@ static inline struct bio_post_read_ctx *get_post_read_ctx(struct bio *bio)
return iostat_ctx->post_read_ctx;
}
-extern void iostat_update_and_unbind_ctx(struct bio *bio, int rw);
+extern void iostat_update_and_unbind_ctx(struct bio *bio);
extern void iostat_alloc_and_bind_ctx(struct f2fs_sb_info *sbi,
struct bio *bio, struct bio_post_read_ctx *ctx);
extern int f2fs_init_iostat_processing(void);
@@ -67,7 +68,7 @@ extern void f2fs_destroy_iostat(struct f2fs_sb_info *sbi);
#else
static inline void f2fs_update_iostat(struct f2fs_sb_info *sbi, struct inode *inode,
enum iostat_type type, unsigned long long io_bytes) {}
-static inline void iostat_update_and_unbind_ctx(struct bio *bio, int rw) {}
+static inline void iostat_update_and_unbind_ctx(struct bio *bio) {}
static inline void iostat_alloc_and_bind_ctx(struct f2fs_sb_info *sbi,
struct bio *bio, struct bio_post_read_ctx *ctx) {}
static inline void iostat_update_submit_ctx(struct bio *bio,
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index d8e01bbbf27f..11fc4c8036a9 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -926,9 +926,6 @@ static int f2fs_tmpfile(struct mnt_idmap *idmap, struct inode *dir,
static int f2fs_create_whiteout(struct mnt_idmap *idmap,
struct inode *dir, struct inode **whiteout)
{
- if (unlikely(f2fs_cp_error(F2FS_I_SB(dir))))
- return -EIO;
-
return __f2fs_tmpfile(idmap, dir, NULL,
S_IFCHR | WHITEOUT_MODE, true, whiteout);
}
@@ -966,7 +963,7 @@ static int f2fs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
/*
* If new_inode is null, the below renaming flow will
- * add a link in old_dir which can conver inline_dir.
+ * add a link in old_dir which can convert inline_dir.
* After that, if we failed to get the entry due to other
* reasons like ENOMEM, we had to remove the new entry.
* Instead of adding such an error handling routine, let's
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index cf997356d9f9..bd1dad523796 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -1587,7 +1587,7 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted,
.op_flags = wbc_to_write_flags(wbc),
.page = page,
.encrypted_page = NULL,
- .submitted = false,
+ .submitted = 0,
.io_type = io_type,
.io_wbc = wbc,
};
@@ -1651,7 +1651,6 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted,
}
set_page_writeback(page);
- ClearPageError(page);
fio.old_blkaddr = ni.blk_addr;
f2fs_do_write_node_page(nid, &fio);
@@ -2083,8 +2082,6 @@ int f2fs_wait_on_node_pages_writeback(struct f2fs_sb_info *sbi,
spin_unlock_irqrestore(&sbi->fsync_node_lock, flags);
f2fs_wait_on_page_writeback(page, NODE, true, false);
- if (TestClearPageError(page))
- ret = -EIO;
put_page(page);
@@ -2548,10 +2545,8 @@ bool f2fs_alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid)
struct f2fs_nm_info *nm_i = NM_I(sbi);
struct free_nid *i = NULL;
retry:
- if (time_to_inject(sbi, FAULT_ALLOC_NID)) {
- f2fs_show_injection_info(sbi, FAULT_ALLOC_NID);
+ if (time_to_inject(sbi, FAULT_ALLOC_NID))
return false;
- }
spin_lock(&nm_i->nid_list_lock);
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index ae3c4e5474ef..227e25836173 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -192,18 +192,18 @@ void f2fs_abort_atomic_write(struct inode *inode, bool clean)
if (!f2fs_is_atomic_file(inode))
return;
- clear_inode_flag(fi->cow_inode, FI_COW_FILE);
- iput(fi->cow_inode);
- fi->cow_inode = NULL;
release_atomic_write_cnt(inode);
clear_inode_flag(inode, FI_ATOMIC_COMMITTED);
clear_inode_flag(inode, FI_ATOMIC_REPLACE);
clear_inode_flag(inode, FI_ATOMIC_FILE);
stat_dec_atomic_inode(inode);
+ F2FS_I(inode)->atomic_write_task = NULL;
+
if (clean) {
truncate_inode_pages_final(inode->i_mapping);
f2fs_i_size_write(inode, fi->original_i_size);
+ fi->original_i_size = 0;
}
}
@@ -255,6 +255,9 @@ retry:
}
f2fs_put_dnode(&dn);
+
+ trace_f2fs_replace_atomic_write_block(inode, F2FS_I(inode)->cow_inode,
+ index, *old_addr, new_addr, recover);
return 0;
}
@@ -262,19 +265,24 @@ static void __complete_revoke_list(struct inode *inode, struct list_head *head,
bool revoke)
{
struct revoke_entry *cur, *tmp;
+ pgoff_t start_index = 0;
bool truncate = is_inode_flag_set(inode, FI_ATOMIC_REPLACE);
list_for_each_entry_safe(cur, tmp, head, list) {
- if (revoke)
+ if (revoke) {
__replace_atomic_write_block(inode, cur->index,
cur->old_addr, NULL, true);
+ } else if (truncate) {
+ f2fs_truncate_hole(inode, start_index, cur->index);
+ start_index = cur->index + 1;
+ }
list_del(&cur->list);
kmem_cache_free(revoke_entry_slab, cur);
}
if (!revoke && truncate)
- f2fs_do_truncate_blocks(inode, 0, false);
+ f2fs_do_truncate_blocks(inode, start_index * PAGE_SIZE, false);
}
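
For FI_ATOMIC_REPLACE, a committed-but-not-revoked list no longer triggers one truncate of the whole file from offset 0: holes are punched only between the committed indexes, and a single final truncate handles the tail. A userspace sketch of that interval walk (printf standing in for f2fs_truncate_hole() and f2fs_do_truncate_blocks(); the index values are made up):

#include <stdio.h>

static void truncate_hole(long start, long end)  /* [start, end) */
{
        if (start < end)
                printf("punch hole [%ld, %ld)\n", start, end);
}

int main(void)
{
        long committed[] = { 2, 5, 6 };          /* indexes the commit kept */
        long start = 0;
        unsigned long i;

        for (i = 0; i < sizeof(committed) / sizeof(committed[0]); i++) {
                truncate_hole(start, committed[i]);
                start = committed[i] + 1;
        }
        printf("truncate from index %ld to EOF\n", start);
        return 0;
}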
static int __f2fs_commit_atomic_write(struct inode *inode)
@@ -384,10 +392,8 @@ int f2fs_commit_atomic_write(struct inode *inode)
*/
void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need)
{
- if (time_to_inject(sbi, FAULT_CHECKPOINT)) {
- f2fs_show_injection_info(sbi, FAULT_CHECKPOINT);
+ if (time_to_inject(sbi, FAULT_CHECKPOINT))
f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_FAULT_INJECT);
- }
/* balance_fs_bg is able to be pending */
if (need && excess_cached_nats(sbi))
@@ -508,6 +514,8 @@ static int __submit_flush_wait(struct f2fs_sb_info *sbi,
trace_f2fs_issue_flush(bdev, test_opt(sbi, NOBARRIER),
test_opt(sbi, FLUSH_MERGE), ret);
+ if (!ret)
+ f2fs_update_iostat(sbi, NULL, FS_FLUSH_IO, 0);
return ret;
}
@@ -1059,7 +1067,7 @@ static void __init_discard_policy(struct f2fs_sb_info *sbi,
dpolicy->granularity = granularity;
dpolicy->max_requests = dcc->max_discard_request;
- dpolicy->io_aware_gran = MAX_PLIST_NUM;
+ dpolicy->io_aware_gran = dcc->discard_io_aware_gran;
dpolicy->timeout = false;
if (discard_type == DPOLICY_BG) {
@@ -1095,9 +1103,8 @@ static void __update_discard_tree_range(struct f2fs_sb_info *sbi,
block_t start, block_t len);
/* this function is copied from blkdev_issue_discard from block/blk-lib.c */
static int __submit_discard_cmd(struct f2fs_sb_info *sbi,
- struct discard_policy *dpolicy,
- struct discard_cmd *dc,
- unsigned int *issued)
+ struct discard_policy *dpolicy,
+ struct discard_cmd *dc, int *issued)
{
struct block_device *bdev = dc->bdev;
unsigned int max_discard_blocks =
@@ -1141,7 +1148,6 @@ static int __submit_discard_cmd(struct f2fs_sb_info *sbi,
dc->len += len;
if (time_to_inject(sbi, FAULT_DISCARD)) {
- f2fs_show_injection_info(sbi, FAULT_DISCARD);
err = -EIO;
} else {
err = __blkdev_issue_discard(bdev,
@@ -1186,7 +1192,7 @@ static int __submit_discard_cmd(struct f2fs_sb_info *sbi,
atomic_inc(&dcc->issued_discard);
- f2fs_update_iostat(sbi, NULL, FS_DISCARD, len * F2FS_BLKSIZE);
+ f2fs_update_iostat(sbi, NULL, FS_DISCARD_IO, len * F2FS_BLKSIZE);
lstart += len;
start += len;
@@ -1378,8 +1384,8 @@ static void __queue_discard_cmd(struct f2fs_sb_info *sbi,
mutex_unlock(&SM_I(sbi)->dcc_info->cmd_lock);
}
-static unsigned int __issue_discard_cmd_orderly(struct f2fs_sb_info *sbi,
- struct discard_policy *dpolicy)
+static void __issue_discard_cmd_orderly(struct f2fs_sb_info *sbi,
+ struct discard_policy *dpolicy, int *issued)
{
struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
struct discard_cmd *prev_dc = NULL, *next_dc = NULL;
@@ -1387,7 +1393,6 @@ static unsigned int __issue_discard_cmd_orderly(struct f2fs_sb_info *sbi,
struct discard_cmd *dc;
struct blk_plug plug;
unsigned int pos = dcc->next_pos;
- unsigned int issued = 0;
bool io_interrupted = false;
mutex_lock(&dcc->cmd_lock);
@@ -1414,9 +1419,9 @@ static unsigned int __issue_discard_cmd_orderly(struct f2fs_sb_info *sbi,
}
dcc->next_pos = dc->lstart + dc->len;
- err = __submit_discard_cmd(sbi, dpolicy, dc, &issued);
+ err = __submit_discard_cmd(sbi, dpolicy, dc, issued);
- if (issued >= dpolicy->max_requests)
+ if (*issued >= dpolicy->max_requests)
break;
next:
node = rb_next(&dc->rb_node);
@@ -1432,10 +1437,8 @@ next:
mutex_unlock(&dcc->cmd_lock);
- if (!issued && io_interrupted)
- issued = -1;
-
- return issued;
+ if (!(*issued) && io_interrupted)
+ *issued = -1;
}
static unsigned int __wait_all_discard_cmd(struct f2fs_sb_info *sbi,
struct discard_policy *dpolicy);
@@ -1463,8 +1466,10 @@ retry:
if (i + 1 < dpolicy->granularity)
break;
- if (i + 1 < dcc->max_ordered_discard && dpolicy->ordered)
- return __issue_discard_cmd_orderly(sbi, dpolicy);
+ if (i + 1 < dcc->max_ordered_discard && dpolicy->ordered) {
+ __issue_discard_cmd_orderly(sbi, dpolicy, &issued);
+ return issued;
+ }
pend_list = &dcc->pend_list[i];
@@ -1609,9 +1614,9 @@ static unsigned int __wait_all_discard_cmd(struct f2fs_sb_info *sbi,
return __wait_discard_cmd_range(sbi, dpolicy, 0, UINT_MAX);
/* wait all */
- __init_discard_policy(sbi, &dp, DPOLICY_FSTRIM, 1);
+ __init_discard_policy(sbi, &dp, DPOLICY_FSTRIM, MIN_DISCARD_GRANULARITY);
discard_blks = __wait_discard_cmd_range(sbi, &dp, 0, UINT_MAX);
- __init_discard_policy(sbi, &dp, DPOLICY_UMOUNT, 1);
+ __init_discard_policy(sbi, &dp, DPOLICY_UMOUNT, MIN_DISCARD_GRANULARITY);
discard_blks += __wait_discard_cmd_range(sbi, &dp, 0, UINT_MAX);
return discard_blks;
@@ -1653,7 +1658,14 @@ void f2fs_stop_discard_thread(struct f2fs_sb_info *sbi)
}
}
-/* This comes from f2fs_put_super */
+/**
+ * f2fs_issue_discard_timeout() - Issue all discard cmd within UMOUNT_DISCARD_TIMEOUT
+ * @sbi: the f2fs_sb_info data for discard cmd to issue
+ *
+ * When UMOUNT_DISCARD_TIMEOUT is exceeded, all remaining discard commands will be dropped
+ *
+ * Return true if all discard commands were issued or none needed to be issued; otherwise return false.
+ */
bool f2fs_issue_discard_timeout(struct f2fs_sb_info *sbi)
{
struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
@@ -1661,7 +1673,7 @@ bool f2fs_issue_discard_timeout(struct f2fs_sb_info *sbi)
bool dropped;
if (!atomic_read(&dcc->discard_cmd_cnt))
- return false;
+ return true;
__init_discard_policy(sbi, &dpolicy, DPOLICY_UMOUNT,
dcc->discard_granularity);
@@ -1672,7 +1684,7 @@ bool f2fs_issue_discard_timeout(struct f2fs_sb_info *sbi)
__wait_all_discard_cmd(sbi, NULL);
f2fs_bug_on(sbi, atomic_read(&dcc->discard_cmd_cnt));
- return dropped;
+ return !dropped;
}
static int issue_discard_thread(void *data)
@@ -1694,13 +1706,14 @@ static int issue_discard_thread(void *data)
if (sbi->gc_mode == GC_URGENT_HIGH ||
!f2fs_available_free_memory(sbi, DISCARD_CACHE))
- __init_discard_policy(sbi, &dpolicy, DPOLICY_FORCE, 1);
+ __init_discard_policy(sbi, &dpolicy, DPOLICY_FORCE,
+ MIN_DISCARD_GRANULARITY);
else
__init_discard_policy(sbi, &dpolicy, DPOLICY_BG,
dcc->discard_granularity);
if (dcc->discard_wake)
- dcc->discard_wake = 0;
+ dcc->discard_wake = false;
/* clean up pending candidates before going to sleep */
if (atomic_read(&dcc->queued_discard))
@@ -2065,6 +2078,7 @@ static int create_discard_cmd_control(struct f2fs_sb_info *sbi)
if (!dcc)
return -ENOMEM;
+ dcc->discard_io_aware_gran = MAX_PLIST_NUM;
dcc->discard_granularity = DEFAULT_DISCARD_GRANULARITY;
dcc->max_ordered_discard = DEFAULT_MAX_ORDERED_DISCARD_GRANULARITY;
if (F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_SEGMENT)
@@ -2327,17 +2341,13 @@ bool f2fs_is_checkpointed_data(struct f2fs_sb_info *sbi, block_t blkaddr)
return is_cp;
}
-/*
- * This function should be resided under the curseg_mutex lock
- */
-static void __add_sum_entry(struct f2fs_sb_info *sbi, int type,
- struct f2fs_summary *sum)
+static unsigned short f2fs_curseg_valid_blocks(struct f2fs_sb_info *sbi, int type)
{
struct curseg_info *curseg = CURSEG_I(sbi, type);
- void *addr = curseg->sum_blk;
- addr += curseg->next_blkoff * sizeof(struct f2fs_summary);
- memcpy(addr, sum, sizeof(struct f2fs_summary));
+ if (sbi->ckpt->alloc_type[type] == SSR)
+ return sbi->blocks_per_seg;
+ return curseg->next_blkoff;
}
/*
@@ -2349,15 +2359,11 @@ int f2fs_npages_for_summary_flush(struct f2fs_sb_info *sbi, bool for_ra)
int i, sum_in_page;
for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) {
- if (sbi->ckpt->alloc_type[i] == SSR)
- valid_sum_count += sbi->blocks_per_seg;
- else {
- if (for_ra)
- valid_sum_count += le16_to_cpu(
- F2FS_CKPT(sbi)->cur_data_blkoff[i]);
- else
- valid_sum_count += curseg_blkoff(sbi, i);
- }
+ if (sbi->ckpt->alloc_type[i] != SSR && for_ra)
+ valid_sum_count +=
+ le16_to_cpu(F2FS_CKPT(sbi)->cur_data_blkoff[i]);
+ else
+ valid_sum_count += f2fs_curseg_valid_blocks(sbi, i);
}
sum_in_page = (PAGE_SIZE - 2 * SUM_JOURNAL_SIZE -
@@ -2628,30 +2634,10 @@ static int __next_free_blkoff(struct f2fs_sb_info *sbi,
return __find_rev_next_zero_bit(target_map, sbi->blocks_per_seg, start);
}
-/*
- * If a segment is written by LFS manner, next block offset is just obtained
- * by increasing the current block offset. However, if a segment is written by
- * SSR manner, next block offset obtained by calling __next_free_blkoff
- */
-static void __refresh_next_blkoff(struct f2fs_sb_info *sbi,
- struct curseg_info *seg)
+static int f2fs_find_next_ssr_block(struct f2fs_sb_info *sbi,
+ struct curseg_info *seg)
{
- if (seg->alloc_type == SSR) {
- seg->next_blkoff =
- __next_free_blkoff(sbi, seg->segno,
- seg->next_blkoff + 1);
- } else {
- seg->next_blkoff++;
- if (F2FS_OPTION(sbi).fs_mode == FS_MODE_FRAGMENT_BLK) {
- /* To allocate block chunks in different sizes, use random number */
- if (--seg->fragment_remained_chunk <= 0) {
- seg->fragment_remained_chunk =
- get_random_u32_inclusive(1, sbi->max_fragment_chunk);
- seg->next_blkoff +=
- get_random_u32_inclusive(1, sbi->max_fragment_hole);
- }
- }
- }
+ return __next_free_blkoff(sbi, seg->segno, seg->next_blkoff + 1);
}
bool f2fs_segment_has_free_slot(struct f2fs_sb_info *sbi, int segno)
@@ -2909,33 +2895,23 @@ static void __allocate_new_segment(struct f2fs_sb_info *sbi, int type,
struct curseg_info *curseg = CURSEG_I(sbi, type);
unsigned int old_segno;
- if (!curseg->inited)
- goto alloc;
-
- if (force || curseg->next_blkoff ||
- get_valid_blocks(sbi, curseg->segno, new_sec))
- goto alloc;
-
- if (!get_ckpt_valid_blocks(sbi, curseg->segno, new_sec))
+ if (!force && curseg->inited &&
+ !curseg->next_blkoff &&
+ !get_valid_blocks(sbi, curseg->segno, new_sec) &&
+ !get_ckpt_valid_blocks(sbi, curseg->segno, new_sec))
return;
-alloc:
+
old_segno = curseg->segno;
new_curseg(sbi, type, true);
stat_inc_seg_type(sbi, curseg);
locate_dirty_segment(sbi, old_segno);
}
-static void __allocate_new_section(struct f2fs_sb_info *sbi,
- int type, bool force)
-{
- __allocate_new_segment(sbi, type, true, force);
-}
-
void f2fs_allocate_new_section(struct f2fs_sb_info *sbi, int type, bool force)
{
f2fs_down_read(&SM_I(sbi)->curseg_lock);
down_write(&SIT_I(sbi)->sentry_lock);
- __allocate_new_section(sbi, type, force);
+ __allocate_new_segment(sbi, type, true, force);
up_write(&SIT_I(sbi)->sentry_lock);
f2fs_up_read(&SM_I(sbi)->curseg_lock);
}
@@ -3113,13 +3089,6 @@ out:
return err;
}
-static bool __has_curseg_space(struct f2fs_sb_info *sbi,
- struct curseg_info *curseg)
-{
- return curseg->next_blkoff < f2fs_usable_blks_in_seg(sbi,
- curseg->segno);
-}
-
int f2fs_rw_hint_to_seg_type(enum rw_hint hint)
{
switch (hint) {
@@ -3238,6 +3207,19 @@ static int __get_segment_type(struct f2fs_io_info *fio)
return type;
}
+static void f2fs_randomize_chunk(struct f2fs_sb_info *sbi,
+ struct curseg_info *seg)
+{
+ /* To allocate block chunks of different sizes, use a random number */
+ if (--seg->fragment_remained_chunk > 0)
+ return;
+
+ seg->fragment_remained_chunk =
+ get_random_u32_inclusive(1, sbi->max_fragment_chunk);
+ seg->next_blkoff +=
+ get_random_u32_inclusive(1, sbi->max_fragment_hole);
+}
+
void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
block_t old_blkaddr, block_t *new_blkaddr,
struct f2fs_summary *sum, int type,
@@ -3248,6 +3230,7 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
unsigned long long old_mtime;
bool from_gc = (type == CURSEG_ALL_DATA_ATGC);
struct seg_entry *se = NULL;
+ bool segment_full = false;
f2fs_down_read(&SM_I(sbi)->curseg_lock);
@@ -3266,15 +3249,16 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
f2fs_wait_discard_bio(sbi, *new_blkaddr);
- /*
- * __add_sum_entry should be resided under the curseg_mutex
- * because, this function updates a summary entry in the
- * current summary block.
- */
- __add_sum_entry(sbi, type, sum);
-
- __refresh_next_blkoff(sbi, curseg);
-
+ curseg->sum_blk->entries[curseg->next_blkoff] = *sum;
+ if (curseg->alloc_type == SSR) {
+ curseg->next_blkoff = f2fs_find_next_ssr_block(sbi, curseg);
+ } else {
+ curseg->next_blkoff++;
+ if (F2FS_OPTION(sbi).fs_mode == FS_MODE_FRAGMENT_BLK)
+ f2fs_randomize_chunk(sbi, curseg);
+ }
+ if (curseg->next_blkoff >= f2fs_usable_blks_in_seg(sbi, curseg->segno))
+ segment_full = true;
stat_inc_block_count(sbi, curseg);
if (from_gc) {
@@ -3293,10 +3277,11 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
if (GET_SEGNO(sbi, old_blkaddr) != NULL_SEGNO)
update_sit_entry(sbi, old_blkaddr, -1);
- if (!__has_curseg_space(sbi, curseg)) {
- /*
- * Flush out current segment and replace it with new segment.
- */
+ /*
+ * If the current segment is full, flush it out and replace it with a
+ * new segment.
+ */
+ if (segment_full) {
if (from_gc) {
get_atssr_segment(sbi, type, se->type,
AT_SSR, se->mtime);
@@ -3331,10 +3316,10 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
struct f2fs_bio_info *io;
if (F2FS_IO_ALIGNED(sbi))
- fio->retry = false;
+ fio->retry = 0;
INIT_LIST_HEAD(&fio->list);
- fio->in_list = true;
+ fio->in_list = 1;
io = sbi->write_io[fio->type] + fio->temp;
spin_lock(&io->io_lock);
list_add_tail(&fio->list, &io->io_list);
@@ -3415,14 +3400,13 @@ void f2fs_do_write_meta_page(struct f2fs_sb_info *sbi, struct page *page,
.new_blkaddr = page->index,
.page = page,
.encrypted_page = NULL,
- .in_list = false,
+ .in_list = 0,
};
if (unlikely(page->index >= MAIN_BLKADDR(sbi)))
fio.op_flags &= ~REQ_META;
set_page_writeback(page);
- ClearPageError(page);
f2fs_submit_page_write(&fio);
stat_inc_meta_count(sbi, page->index);
@@ -3487,7 +3471,7 @@ int f2fs_inplace_write_data(struct f2fs_io_info *fio)
stat_inc_inplace_blocks(fio->sbi);
- if (fio->bio && !(SM_I(sbi)->ipu_policy & (1 << F2FS_IPU_NOCACHE)))
+ if (fio->bio && !IS_F2FS_IPU_NOCACHE(sbi))
err = f2fs_merge_page_bio(fio);
else
err = f2fs_submit_page_bio(fio);
@@ -3576,7 +3560,7 @@ void f2fs_do_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
}
curseg->next_blkoff = GET_BLKOFF_FROM_SEG0(sbi, new_blkaddr);
- __add_sum_entry(sbi, type, sum);
+ curseg->sum_blk->entries[curseg->next_blkoff] = *sum;
if (!recover_curseg || recover_newaddr) {
if (!from_gc)
@@ -3634,7 +3618,7 @@ void f2fs_wait_on_page_writeback(struct page *page,
/* submit cached LFS IO */
f2fs_submit_merged_write_cond(sbi, NULL, page, 0, type);
- /* sbumit cached IPU IO */
+ /* submit cached IPU IO */
f2fs_submit_merged_ipu_write(sbi, NULL, page);
if (ordered) {
wait_on_page_writeback(page);
@@ -3885,15 +3869,8 @@ static void write_compacted_summaries(struct f2fs_sb_info *sbi, block_t blkaddr)
/* Step 3: write summary entries */
for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) {
- unsigned short blkoff;
-
seg_i = CURSEG_I(sbi, i);
- if (sbi->ckpt->alloc_type[i] == SSR)
- blkoff = sbi->blocks_per_seg;
- else
- blkoff = curseg_blkoff(sbi, i);
-
- for (j = 0; j < blkoff; j++) {
+ for (j = 0; j < f2fs_curseg_valid_blocks(sbi, i); j++) {
if (!page) {
page = f2fs_grab_meta_page(sbi, blkaddr++);
kaddr = (unsigned char *)page_address(page);
@@ -5126,7 +5103,7 @@ int f2fs_build_segment_manager(struct f2fs_sb_info *sbi)
sm_info->rec_prefree_segments = DEF_MAX_RECLAIM_PREFREE_SEGMENTS;
if (!f2fs_lfs_mode(sbi))
- sm_info->ipu_policy = 1 << F2FS_IPU_FSYNC;
+ sm_info->ipu_policy = BIT(F2FS_IPU_FSYNC);
sm_info->min_ipu_util = DEF_MIN_IPU_UTIL;
sm_info->min_fsync_blocks = DEF_MIN_FSYNC_BLOCKS;
sm_info->min_seq_blocks = sbi->blocks_per_seg;
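To see what the f2fs_randomize_chunk() helper factored out above actually produces, here is a minimal user-space sketch of the FS_MODE_FRAGMENT_BLK layout walk. MAX_FRAGMENT_CHUNK, MAX_FRAGMENT_HOLE and rand_inclusive() are stand-ins invented for this demo, not f2fs symbols; only the chunk/hole update mirrors the patch:

#include <stdio.h>
#include <stdlib.h>

/* made-up stand-ins for sbi->max_fragment_chunk / sbi->max_fragment_hole */
#define MAX_FRAGMENT_CHUNK 4
#define MAX_FRAGMENT_HOLE  4

static unsigned int rand_inclusive(unsigned int lo, unsigned int hi)
{
	return lo + rand() % (hi - lo + 1);
}

int main(void)
{
	unsigned int next_blkoff = 0;
	int fragment_remained_chunk = rand_inclusive(1, MAX_FRAGMENT_CHUNK);

	/* lay out 16 blocks the way FS_MODE_FRAGMENT_BLK would */
	for (int i = 0; i < 16; i++) {
		printf("block %2d -> offset %u\n", i, next_blkoff);
		next_blkoff++;
		/* same shape as f2fs_randomize_chunk() above */
		if (--fragment_remained_chunk <= 0) {
			fragment_remained_chunk =
				rand_inclusive(1, MAX_FRAGMENT_CHUNK);
			next_blkoff += rand_inclusive(1, MAX_FRAGMENT_HOLE);
		}
	}
	return 0;
}

Each run scatters small runs of consecutive offsets separated by random holes, which is exactly the fragmentation the test mode is meant to create.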
diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
index 3ad1b7b6fa94..efdb7fc3b797 100644
--- a/fs/f2fs/segment.h
+++ b/fs/f2fs/segment.h
@@ -670,6 +670,9 @@ static inline int utilization(struct f2fs_sb_info *sbi)
#define SMALL_VOLUME_SEGMENTS (16 * 512) /* 16GB */
+#define F2FS_IPU_DISABLE 0
+
+/* Modifications to this enum should be kept in sync with the ipu_mode_names array */
enum {
F2FS_IPU_FORCE,
F2FS_IPU_SSR,
@@ -679,8 +682,29 @@ enum {
F2FS_IPU_ASYNC,
F2FS_IPU_NOCACHE,
F2FS_IPU_HONOR_OPU_WRITE,
+ F2FS_IPU_MAX,
};
+static inline bool IS_F2FS_IPU_DISABLE(struct f2fs_sb_info *sbi)
+{
+ return SM_I(sbi)->ipu_policy == F2FS_IPU_DISABLE;
+}
+
+#define F2FS_IPU_POLICY(name) \
+static inline bool IS_##name(struct f2fs_sb_info *sbi) \
+{ \
+ return SM_I(sbi)->ipu_policy & BIT(name); \
+}
+
+F2FS_IPU_POLICY(F2FS_IPU_FORCE);
+F2FS_IPU_POLICY(F2FS_IPU_SSR);
+F2FS_IPU_POLICY(F2FS_IPU_UTIL);
+F2FS_IPU_POLICY(F2FS_IPU_SSR_UTIL);
+F2FS_IPU_POLICY(F2FS_IPU_FSYNC);
+F2FS_IPU_POLICY(F2FS_IPU_ASYNC);
+F2FS_IPU_POLICY(F2FS_IPU_NOCACHE);
+F2FS_IPU_POLICY(F2FS_IPU_HONOR_OPU_WRITE);
+
static inline unsigned int curseg_segno(struct f2fs_sb_info *sbi,
int type)
{
@@ -695,15 +719,10 @@ static inline unsigned char curseg_alloc_type(struct f2fs_sb_info *sbi,
return curseg->alloc_type;
}
-static inline unsigned short curseg_blkoff(struct f2fs_sb_info *sbi, int type)
-{
- struct curseg_info *curseg = CURSEG_I(sbi, type);
- return curseg->next_blkoff;
-}
-
-static inline void check_seg_range(struct f2fs_sb_info *sbi, unsigned int segno)
+static inline bool valid_main_segno(struct f2fs_sb_info *sbi,
+ unsigned int segno)
{
- f2fs_bug_on(sbi, segno > TOTAL_SEGS(sbi) - 1);
+ return segno <= (MAIN_SEGS(sbi) - 1);
}
static inline void verify_fio_blkaddr(struct f2fs_io_info *fio)
@@ -758,7 +777,7 @@ static inline int check_block_count(struct f2fs_sb_info *sbi,
/* check segment usage, and check boundary of a given segment number */
if (unlikely(GET_SIT_VBLOCKS(raw_sit) > usable_blks_per_seg
- || segno > TOTAL_SEGS(sbi) - 1)) {
+ || !valid_main_segno(sbi, segno))) {
f2fs_err(sbi, "Wrong valid blocks %d or segno %u",
GET_SIT_VBLOCKS(raw_sit), segno);
set_sbi_flag(sbi, SBI_NEED_FSCK);
@@ -775,7 +794,7 @@ static inline pgoff_t current_sit_addr(struct f2fs_sb_info *sbi,
unsigned int offset = SIT_BLOCK_OFFSET(start);
block_t blk_addr = sit_i->sit_base_addr + offset;
- check_seg_range(sbi, start);
+ f2fs_bug_on(sbi, !valid_main_segno(sbi, start));
#ifdef CONFIG_F2FS_CHECK_FS
if (f2fs_test_bit(offset, sit_i->sit_bitmap) !=
@@ -924,6 +943,6 @@ static inline void wake_up_discard_thread(struct f2fs_sb_info *sbi, bool force)
if (!wakeup || !is_idle(sbi, DISCARD_TIME))
return;
wake_up:
- dcc->discard_wake = 1;
+ dcc->discard_wake = true;
wake_up_interruptible_all(&dcc->discard_wait_queue);
}
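As a quick illustration of the bit-per-policy scheme that the F2FS_IPU_POLICY() macro stamps out above, here is a stand-alone sketch; the enum values mirror the patch, while the main() harness and its sample policy value are invented for demonstration:

#include <stdbool.h>
#include <stdio.h>

#define BIT(nr) (1u << (nr))

enum {
	F2FS_IPU_FORCE,
	F2FS_IPU_SSR,
	F2FS_IPU_UTIL,
	F2FS_IPU_SSR_UTIL,
	F2FS_IPU_FSYNC,
	F2FS_IPU_ASYNC,
	F2FS_IPU_NOCACHE,
	F2FS_IPU_HONOR_OPU_WRITE,
	F2FS_IPU_MAX,
};

int main(void)
{
	/* same shape as SM_I(sbi)->ipu_policy after the patch */
	unsigned int ipu_policy = BIT(F2FS_IPU_FORCE) |
				  BIT(F2FS_IPU_HONOR_OPU_WRITE);

	/* what IS_F2FS_IPU_FORCE() expands to via F2FS_IPU_POLICY() */
	bool force = ipu_policy & BIT(F2FS_IPU_FORCE);
	/* F2FS_IPU_DISABLE (0) means no policy bit is set at all */
	bool disabled = ipu_policy == 0;
	/* the bound check the sysfs ipu_policy store now applies */
	bool valid = ipu_policy < BIT(F2FS_IPU_MAX);

	printf("force=%d disabled=%d valid=%d\n", force, disabled, valid);
	return 0;
}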
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 64d3556d61a5..fbaaabbcd6de 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -1288,19 +1288,18 @@ default_check:
* zone alignment optimization. This is optional for host-aware
* devices, but mandatory for host-managed zoned block devices.
*/
-#ifndef CONFIG_BLK_DEV_ZONED
- if (f2fs_sb_has_blkzoned(sbi)) {
- f2fs_err(sbi, "Zoned block device support is not enabled");
- return -EINVAL;
- }
-#endif
if (f2fs_sb_has_blkzoned(sbi)) {
+#ifdef CONFIG_BLK_DEV_ZONED
if (F2FS_OPTION(sbi).discard_unit !=
DISCARD_UNIT_SECTION) {
f2fs_info(sbi, "Zoned block device doesn't need small discard, set discard_unit=section by default");
F2FS_OPTION(sbi).discard_unit =
DISCARD_UNIT_SECTION;
}
+#else
+ f2fs_err(sbi, "Zoned block device support is not enabled");
+ return -EINVAL;
+#endif
}
#ifdef CONFIG_F2FS_FS_COMPRESSION
@@ -1341,12 +1340,12 @@ default_check:
}
if (test_opt(sbi, DISABLE_CHECKPOINT) && f2fs_lfs_mode(sbi)) {
- f2fs_err(sbi, "LFS not compatible with checkpoint=disable");
+ f2fs_err(sbi, "LFS is not compatible with checkpoint=disable");
return -EINVAL;
}
if (test_opt(sbi, ATGC) && f2fs_lfs_mode(sbi)) {
- f2fs_err(sbi, "LFS not compatible with ATGC");
+ f2fs_err(sbi, "LFS is not compatible with ATGC");
return -EINVAL;
}
@@ -1366,10 +1365,8 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb)
{
struct f2fs_inode_info *fi;
- if (time_to_inject(F2FS_SB(sb), FAULT_SLAB_ALLOC)) {
- f2fs_show_injection_info(F2FS_SB(sb), FAULT_SLAB_ALLOC);
+ if (time_to_inject(F2FS_SB(sb), FAULT_SLAB_ALLOC))
return NULL;
- }
fi = alloc_inode_sb(sb, f2fs_inode_cachep, GFP_F2FS_ZERO);
if (!fi)
@@ -1424,8 +1421,6 @@ static int f2fs_drop_inode(struct inode *inode)
atomic_inc(&inode->i_count);
spin_unlock(&inode->i_lock);
- f2fs_abort_atomic_write(inode, true);
-
/* should remain fi->extent_tree for writepage */
f2fs_destroy_extent_node(inode);
@@ -1543,7 +1538,7 @@ static void f2fs_put_super(struct super_block *sb)
{
struct f2fs_sb_info *sbi = F2FS_SB(sb);
int i;
- bool dropped;
+ bool done;
/* unregister procfs/sysfs entries in advance to avoid race case */
f2fs_unregister_sysfs(sbi);
@@ -1573,9 +1568,8 @@ static void f2fs_put_super(struct super_block *sb)
}
/* be sure to wait for any on-going discard commands */
- dropped = f2fs_issue_discard_timeout(sbi);
-
- if (f2fs_realtime_discard_enable(sbi) && !sbi->discard_blks && !dropped) {
+ done = f2fs_issue_discard_timeout(sbi);
+ if (f2fs_realtime_discard_enable(sbi) && !sbi->discard_blks && done) {
struct cp_control cpc = {
.reason = CP_UMOUNT | CP_TRIMMED,
};
@@ -1900,15 +1894,24 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root)
if (test_opt(sbi, GC_MERGE))
seq_puts(seq, ",gc_merge");
+ else
+ seq_puts(seq, ",nogc_merge");
if (test_opt(sbi, DISABLE_ROLL_FORWARD))
seq_puts(seq, ",disable_roll_forward");
if (test_opt(sbi, NORECOVERY))
seq_puts(seq, ",norecovery");
- if (test_opt(sbi, DISCARD))
+ if (test_opt(sbi, DISCARD)) {
seq_puts(seq, ",discard");
- else
+ if (F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_BLOCK)
+ seq_printf(seq, ",discard_unit=%s", "block");
+ else if (F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_SEGMENT)
+ seq_printf(seq, ",discard_unit=%s", "segment");
+ else if (F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_SECTION)
+ seq_printf(seq, ",discard_unit=%s", "section");
+ } else {
seq_puts(seq, ",nodiscard");
+ }
if (test_opt(sbi, NOHEAP))
seq_puts(seq, ",no_heap");
else
@@ -2032,13 +2035,6 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root)
if (test_opt(sbi, ATGC))
seq_puts(seq, ",atgc");
- if (F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_BLOCK)
- seq_printf(seq, ",discard_unit=%s", "block");
- else if (F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_SEGMENT)
- seq_printf(seq, ",discard_unit=%s", "segment");
- else if (F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_SECTION)
- seq_printf(seq, ",discard_unit=%s", "section");
-
if (F2FS_OPTION(sbi).memory_mode == MEMORY_MODE_NORMAL)
seq_printf(seq, ",memory=%s", "normal");
else if (F2FS_OPTION(sbi).memory_mode == MEMORY_MODE_LOW)
@@ -2300,6 +2296,12 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
}
}
#endif
+ if (f2fs_lfs_mode(sbi) && !IS_F2FS_IPU_DISABLE(sbi)) {
+ err = -EINVAL;
+ f2fs_warn(sbi, "LFS is not compatible with IPU");
+ goto restore_opts;
+ }
+
/* disallow enable atgc dynamically */
if (no_atgc == !!test_opt(sbi, ATGC)) {
err = -EINVAL;
@@ -2589,10 +2591,8 @@ retry:
int f2fs_dquot_initialize(struct inode *inode)
{
- if (time_to_inject(F2FS_I_SB(inode), FAULT_DQUOT_INIT)) {
- f2fs_show_injection_info(F2FS_I_SB(inode), FAULT_DQUOT_INIT);
+ if (time_to_inject(F2FS_I_SB(inode), FAULT_DQUOT_INIT))
return -ESRCH;
- }
return dquot_initialize(inode);
}
@@ -4083,8 +4083,9 @@ static void f2fs_tuning_parameters(struct f2fs_sb_info *sbi)
if (f2fs_block_unit_discard(sbi))
SM_I(sbi)->dcc_info->discard_granularity =
MIN_DISCARD_GRANULARITY;
- SM_I(sbi)->ipu_policy = 1 << F2FS_IPU_FORCE |
- 1 << F2FS_IPU_HONOR_OPU_WRITE;
+ if (!f2fs_lfs_mode(sbi))
+ SM_I(sbi)->ipu_policy = BIT(F2FS_IPU_FORCE) |
+ BIT(F2FS_IPU_HONOR_OPU_WRITE);
}
sbi->readdir_ra = true;
diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c
index 83a366f3ee80..0b19163c90d4 100644
--- a/fs/f2fs/sysfs.c
+++ b/fs/f2fs/sysfs.c
@@ -473,6 +473,17 @@ out:
return count;
}
+ if (!strcmp(a->attr.name, "discard_io_aware_gran")) {
+ if (t > MAX_PLIST_NUM)
+ return -EINVAL;
+ if (!f2fs_block_unit_discard(sbi))
+ return -EINVAL;
+ if (t == *ui)
+ return count;
+ *ui = t;
+ return count;
+ }
+
if (!strcmp(a->attr.name, "discard_granularity")) {
if (t == 0 || t > MAX_PLIST_NUM)
return -EINVAL;
@@ -511,7 +522,7 @@ out:
} else if (t == 1) {
sbi->gc_mode = GC_URGENT_HIGH;
if (sbi->gc_thread) {
- sbi->gc_thread->gc_wake = 1;
+ sbi->gc_thread->gc_wake = true;
wake_up_interruptible_all(
&sbi->gc_thread->gc_wait_queue_head);
wake_up_discard_thread(sbi, true);
@@ -521,7 +532,7 @@ out:
} else if (t == 3) {
sbi->gc_mode = GC_URGENT_MID;
if (sbi->gc_thread) {
- sbi->gc_thread->gc_wake = 1;
+ sbi->gc_thread->gc_wake = true;
wake_up_interruptible_all(
&sbi->gc_thread->gc_wait_queue_head);
}
@@ -678,7 +689,16 @@ out:
}
if (!strcmp(a->attr.name, "warm_data_age_threshold")) {
- if (t == 0 || t <= sbi->hot_data_age_threshold)
+ if (t <= sbi->hot_data_age_threshold)
+ return -EINVAL;
+ if (t == *ui)
+ return count;
+ *ui = (unsigned int)t;
+ return count;
+ }
+
+ if (!strcmp(a->attr.name, "last_age_weight")) {
+ if (t > 100)
return -EINVAL;
if (t == *ui)
return count;
@@ -686,6 +706,15 @@ out:
return count;
}
+ if (!strcmp(a->attr.name, "ipu_policy")) {
+ if (t >= BIT(F2FS_IPU_MAX))
+ return -EINVAL;
+ if (t && f2fs_lfs_mode(sbi))
+ return -EINVAL;
+ SM_I(sbi)->ipu_policy = (unsigned int)t;
+ return count;
+ }
+
*ui = (unsigned int)t;
return count;
@@ -825,6 +854,7 @@ F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, max_discard_request, max_discard_req
F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, min_discard_issue_time, min_discard_issue_time);
F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, mid_discard_issue_time, mid_discard_issue_time);
F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, max_discard_issue_time, max_discard_issue_time);
+F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, discard_io_aware_gran, discard_io_aware_gran);
F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, discard_urgent_util, discard_urgent_util);
F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, discard_granularity, discard_granularity);
F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, max_ordered_discard, max_ordered_discard);
@@ -944,6 +974,7 @@ F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, revoked_atomic_block, revoked_atomic_block)
/* For block age extent cache */
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, hot_data_age_threshold, hot_data_age_threshold);
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, warm_data_age_threshold, warm_data_age_threshold);
+F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, last_age_weight, last_age_weight);
#define ATTR_LIST(name) (&f2fs_attr_##name.attr)
static struct attribute *f2fs_attrs[] = {
@@ -960,6 +991,7 @@ static struct attribute *f2fs_attrs[] = {
ATTR_LIST(min_discard_issue_time),
ATTR_LIST(mid_discard_issue_time),
ATTR_LIST(max_discard_issue_time),
+ ATTR_LIST(discard_io_aware_gran),
ATTR_LIST(discard_urgent_util),
ATTR_LIST(discard_granularity),
ATTR_LIST(max_ordered_discard),
@@ -1042,6 +1074,7 @@ static struct attribute *f2fs_attrs[] = {
ATTR_LIST(revoked_atomic_block),
ATTR_LIST(hot_data_age_threshold),
ATTR_LIST(warm_data_age_threshold),
+ ATTR_LIST(last_age_weight),
NULL,
};
ATTRIBUTE_GROUPS(f2fs);
@@ -1129,13 +1162,13 @@ static const struct sysfs_ops f2fs_attr_ops = {
.store = f2fs_attr_store,
};
-static struct kobj_type f2fs_sb_ktype = {
+static const struct kobj_type f2fs_sb_ktype = {
.default_groups = f2fs_groups,
.sysfs_ops = &f2fs_attr_ops,
.release = f2fs_sb_release,
};
-static struct kobj_type f2fs_ktype = {
+static const struct kobj_type f2fs_ktype = {
.sysfs_ops = &f2fs_attr_ops,
};
@@ -1143,7 +1176,7 @@ static struct kset f2fs_kset = {
.kobj = {.ktype = &f2fs_ktype},
};
-static struct kobj_type f2fs_feat_ktype = {
+static const struct kobj_type f2fs_feat_ktype = {
.default_groups = f2fs_feat_groups,
.sysfs_ops = &f2fs_attr_ops,
};
@@ -1184,7 +1217,7 @@ static const struct sysfs_ops f2fs_stat_attr_ops = {
.store = f2fs_stat_attr_store,
};
-static struct kobj_type f2fs_stat_ktype = {
+static const struct kobj_type f2fs_stat_ktype = {
.default_groups = f2fs_stat_groups,
.sysfs_ops = &f2fs_stat_attr_ops,
.release = f2fs_stat_kobj_release,
@@ -1211,7 +1244,7 @@ static const struct sysfs_ops f2fs_feature_list_attr_ops = {
.show = f2fs_sb_feat_attr_show,
};
-static struct kobj_type f2fs_feature_list_ktype = {
+static const struct kobj_type f2fs_feature_list_ktype = {
.default_groups = f2fs_sb_feat_groups,
.sysfs_ops = &f2fs_feature_list_attr_ops,
.release = f2fs_feature_list_kobj_release,
diff --git a/fs/f2fs/verity.c b/fs/f2fs/verity.c
index f320ed8172ec..4fc95f353a7a 100644
--- a/fs/f2fs/verity.c
+++ b/fs/f2fs/verity.c
@@ -81,7 +81,7 @@ static int pagecache_write(struct inode *inode, const void *buf, size_t count,
size_t n = min_t(size_t, count,
PAGE_SIZE - offset_in_page(pos));
struct page *page;
- void *fsdata;
+ void *fsdata = NULL;
int res;
res = aops->write_begin(NULL, mapping, pos, n, &page, &fsdata);
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 122ed89ebf9f..1986b4f18a90 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -295,11 +295,11 @@ static void hfsplus_put_super(struct super_block *sb)
hfsplus_sync_fs(sb, 1);
}
+ iput(sbi->alloc_file);
+ iput(sbi->hidden_dir);
hfs_btree_close(sbi->attr_tree);
hfs_btree_close(sbi->cat_tree);
hfs_btree_close(sbi->ext_tree);
- iput(sbi->alloc_file);
- iput(sbi->hidden_dir);
kfree(sbi->s_vhdr_buf);
kfree(sbi->s_backup_vhdr_buf);
unload_nls(sbi->nls);
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index c18bb50c31b6..28b4f15c19eb 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -412,7 +412,7 @@ static int hostfs_writepage(struct page *page, struct writeback_control *wbc)
if (page->index >= end_index)
count = inode->i_size & (PAGE_SIZE-1);
- buffer = kmap(page);
+ buffer = kmap_local_page(page);
err = write_file(HOSTFS_I(inode)->fd, &base, buffer, count);
if (err != count) {
@@ -428,9 +428,9 @@ static int hostfs_writepage(struct page *page, struct writeback_control *wbc)
err = 0;
out:
- kunmap(page);
-
+ kunmap_local(buffer);
unlock_page(page);
+
return err;
}
@@ -441,7 +441,7 @@ static int hostfs_read_folio(struct file *file, struct folio *folio)
loff_t start = page_offset(page);
int bytes_read, ret = 0;
- buffer = kmap(page);
+ buffer = kmap_local_page(page);
bytes_read = read_file(FILE_HOSTFS_I(file)->fd, &start, buffer,
PAGE_SIZE);
if (bytes_read < 0) {
@@ -458,8 +458,9 @@ static int hostfs_read_folio(struct file *file, struct folio *folio)
out:
flush_dcache_page(page);
- kunmap(page);
+ kunmap_local(buffer);
unlock_page(page);
+
return ret;
}
@@ -484,9 +485,9 @@ static int hostfs_write_end(struct file *file, struct address_space *mapping,
unsigned from = pos & (PAGE_SIZE - 1);
int err;
- buffer = kmap(page);
+ buffer = kmap_local_page(page);
err = write_file(FILE_HOSTFS_I(file)->fd, &pos, buffer + from, copied);
- kunmap(page);
+ kunmap_local(buffer);
if (!PageUptodate(page) && err == PAGE_SIZE)
SetPageUptodate(page);
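The hostfs hunks above all follow the same kmap() to kmap_local_page() conversion. Below is a minimal kernel-style sketch of the pattern, not buildable on its own; hostfs_fill_page() is a hypothetical helper invented for illustration, while kmap_local_page(), kunmap_local() and flush_dcache_page() are the real APIs:

#include <linux/highmem.h>
#include <linux/string.h>

/* hypothetical helper, not part of the patch */
static void hostfs_fill_page(struct page *page, const void *src, size_t len)
{
	/*
	 * kmap_local_page() creates a cheap, CPU-local mapping; such
	 * mappings nest and must be torn down in reverse order, which
	 * is why every hunk above unmaps the buffer it took before
	 * unlocking the page.
	 */
	void *buffer = kmap_local_page(page);

	memcpy(buffer, src, len);
	flush_dcache_page(page);
	kunmap_local(buffer);
}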
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 6a404ac1c178..15de1385012e 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -1010,36 +1010,28 @@ repeat:
* ie. locked but not dirty) or tune2fs (which may actually have
* the buffer dirtied, ugh.) */
- if (buffer_dirty(bh)) {
+ if (buffer_dirty(bh) && jh->b_transaction) {
+ warn_dirty_buffer(bh);
/*
- * First question: is this buffer already part of the current
- * transaction or the existing committing transaction?
- */
- if (jh->b_transaction) {
- J_ASSERT_JH(jh,
- jh->b_transaction == transaction ||
- jh->b_transaction ==
- journal->j_committing_transaction);
- if (jh->b_next_transaction)
- J_ASSERT_JH(jh, jh->b_next_transaction ==
- transaction);
- warn_dirty_buffer(bh);
- }
- /*
- * In any case we need to clean the dirty flag and we must
- * do it under the buffer lock to be sure we don't race
- * with running write-out.
+ * We need to clean the dirty flag and we must do it under the
+ * buffer lock to be sure we don't race with running write-out.
*/
JBUFFER_TRACE(jh, "Journalling dirty buffer");
clear_buffer_dirty(bh);
+ /*
+ * The buffer is going to be added to BJ_Reserved list now and
+ * nothing guarantees jbd2_journal_dirty_metadata() will be
+ * ever called for it. So we need to set jbddirty bit here to
+ * make sure the buffer is dirtied and written out when the
+ * journaling machinery is done with it.
+ */
set_buffer_jbddirty(bh);
}
- unlock_buffer(bh);
-
error = -EROFS;
if (is_handle_aborted(handle)) {
spin_unlock(&jh->b_state_lock);
+ unlock_buffer(bh);
goto out;
}
error = 0;
@@ -1049,8 +1041,10 @@ repeat:
* b_next_transaction points to it
*/
if (jh->b_transaction == transaction ||
- jh->b_next_transaction == transaction)
+ jh->b_next_transaction == transaction) {
+ unlock_buffer(bh);
goto done;
+ }
/*
* this is the first time this transaction is touching this buffer,
@@ -1074,10 +1068,24 @@ repeat:
*/
smp_wmb();
spin_lock(&journal->j_list_lock);
+ if (test_clear_buffer_dirty(bh)) {
+ /*
+ * Clear the buffer dirty flag and assign jh->b_transaction
+ * under journal->j_list_lock to prevent the bh from being
+ * removed from the checkpoint list while the buffer is in
+ * an intermediate state (not dirty and jh->b_transaction
+ * is NULL).
+ */
+ JBUFFER_TRACE(jh, "Journalling dirty buffer");
+ set_buffer_jbddirty(bh);
+ }
__jbd2_journal_file_buffer(jh, transaction, BJ_Reserved);
spin_unlock(&journal->j_list_lock);
+ unlock_buffer(bh);
goto done;
}
+ unlock_buffer(bh);
+
/*
* If there is already a copy-out version of this buffer, then we don't
* need to make another one
diff --git a/fs/jffs2/compr.c b/fs/jffs2/compr.c
index 4849a4c9a0e2..764f19dec3f0 100644
--- a/fs/jffs2/compr.c
+++ b/fs/jffs2/compr.c
@@ -364,20 +364,25 @@ void jffs2_free_comprbuf(unsigned char *comprbuf, unsigned char *orig)
int __init jffs2_compressors_init(void)
{
+ int ret = 0;
/* Registering compressors */
-#ifdef CONFIG_JFFS2_ZLIB
- jffs2_zlib_init();
-#endif
-#ifdef CONFIG_JFFS2_RTIME
- jffs2_rtime_init();
-#endif
-#ifdef CONFIG_JFFS2_RUBIN
- jffs2_rubinmips_init();
- jffs2_dynrubin_init();
-#endif
-#ifdef CONFIG_JFFS2_LZO
- jffs2_lzo_init();
-#endif
+ ret = jffs2_zlib_init();
+ if (ret)
+ goto exit;
+ ret = jffs2_rtime_init();
+ if (ret)
+ goto exit_zlib;
+ ret = jffs2_rubinmips_init();
+ if (ret)
+ goto exit_rtime;
+ ret = jffs2_dynrubin_init();
+ if (ret)
+ goto exit_rubinmips;
+ ret = jffs2_lzo_init();
+ if (ret)
+ goto exit_dynrubin;
+
/* Setting default compression mode */
#ifdef CONFIG_JFFS2_CMODE_NONE
jffs2_compression_mode = JFFS2_COMPR_MODE_NONE;
@@ -396,23 +401,26 @@ int __init jffs2_compressors_init(void)
#endif
#endif
return 0;
+
+exit_dynrubin:
+ jffs2_dynrubin_exit();
+exit_rubinmips:
+ jffs2_rubinmips_exit();
+exit_rtime:
+ jffs2_rtime_exit();
+exit_zlib:
+ jffs2_zlib_exit();
+exit:
+ return ret;
}
int jffs2_compressors_exit(void)
{
/* Unregistering compressors */
-#ifdef CONFIG_JFFS2_LZO
jffs2_lzo_exit();
-#endif
-#ifdef CONFIG_JFFS2_RUBIN
jffs2_dynrubin_exit();
jffs2_rubinmips_exit();
-#endif
-#ifdef CONFIG_JFFS2_RTIME
jffs2_rtime_exit();
-#endif
-#ifdef CONFIG_JFFS2_ZLIB
jffs2_zlib_exit();
-#endif
return 0;
}
diff --git a/fs/jffs2/compr.h b/fs/jffs2/compr.h
index 5e91d578f4ed..3716b6b7924c 100644
--- a/fs/jffs2/compr.h
+++ b/fs/jffs2/compr.h
@@ -88,18 +88,32 @@ int jffs2_rubinmips_init(void);
void jffs2_rubinmips_exit(void);
int jffs2_dynrubin_init(void);
void jffs2_dynrubin_exit(void);
+#else
+static inline int jffs2_rubinmips_init(void) { return 0; }
+static inline void jffs2_rubinmips_exit(void) {}
+static inline int jffs2_dynrubin_init(void) { return 0; }
+static inline void jffs2_dynrubin_exit(void) {}
#endif
#ifdef CONFIG_JFFS2_RTIME
-int jffs2_rtime_init(void);
-void jffs2_rtime_exit(void);
+extern int jffs2_rtime_init(void);
+extern void jffs2_rtime_exit(void);
+#else
+static inline int jffs2_rtime_init(void) { return 0; }
+static inline void jffs2_rtime_exit(void) {}
#endif
#ifdef CONFIG_JFFS2_ZLIB
-int jffs2_zlib_init(void);
-void jffs2_zlib_exit(void);
+extern int jffs2_zlib_init(void);
+extern void jffs2_zlib_exit(void);
+#else
+static inline int jffs2_zlib_init(void) { return 0; }
+static inline void jffs2_zlib_exit(void) {}
#endif
#ifdef CONFIG_JFFS2_LZO
-int jffs2_lzo_init(void);
-void jffs2_lzo_exit(void);
+extern int jffs2_lzo_init(void);
+extern void jffs2_lzo_exit(void);
+#else
+static inline int jffs2_lzo_init(void) { return 0; }
+static inline void jffs2_lzo_exit(void) {}
#endif
#endif /* __JFFS2_COMPR_H__ */
diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c
index 3cf71befa475..96b0275ce957 100644
--- a/fs/jffs2/file.c
+++ b/fs/jffs2/file.c
@@ -137,19 +137,18 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping,
struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode);
struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb);
pgoff_t index = pos >> PAGE_SHIFT;
- uint32_t pageofs = index << PAGE_SHIFT;
int ret = 0;
jffs2_dbg(1, "%s()\n", __func__);
- if (pageofs > inode->i_size) {
- /* Make new hole frag from old EOF to new page */
+ if (pos > inode->i_size) {
+ /* Make new hole frag from old EOF to new position */
struct jffs2_raw_inode ri;
struct jffs2_full_dnode *fn;
uint32_t alloc_len;
- jffs2_dbg(1, "Writing new hole frag 0x%x-0x%x between current EOF and new page\n",
- (unsigned int)inode->i_size, pageofs);
+ jffs2_dbg(1, "Writing new hole frag 0x%x-0x%x between current EOF and new position\n",
+ (unsigned int)inode->i_size, (uint32_t)pos);
ret = jffs2_reserve_space(c, sizeof(ri), &alloc_len,
ALLOC_NORMAL, JFFS2_SUMMARY_INODE_SIZE);
@@ -169,10 +168,10 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping,
ri.mode = cpu_to_jemode(inode->i_mode);
ri.uid = cpu_to_je16(i_uid_read(inode));
ri.gid = cpu_to_je16(i_gid_read(inode));
- ri.isize = cpu_to_je32(max((uint32_t)inode->i_size, pageofs));
+ ri.isize = cpu_to_je32((uint32_t)pos);
ri.atime = ri.ctime = ri.mtime = cpu_to_je32(JFFS2_NOW());
ri.offset = cpu_to_je32(inode->i_size);
- ri.dsize = cpu_to_je32(pageofs - inode->i_size);
+ ri.dsize = cpu_to_je32((uint32_t)pos - inode->i_size);
ri.csize = cpu_to_je32(0);
ri.compr = JFFS2_COMPR_ZERO;
ri.node_crc = cpu_to_je32(crc32(0, &ri, sizeof(ri)-8));
@@ -202,7 +201,7 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping,
goto out_err;
}
jffs2_complete_reservation(c);
- inode->i_size = pageofs;
+ inode->i_size = pos;
mutex_unlock(&f->sem);
}
diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index 09174898efd0..038516bee1ab 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -403,7 +403,7 @@ int jffs2_do_remount_fs(struct super_block *sb, struct fs_context *fc)
/* We stop if it was running, then restart if it needs to.
This also catches the case where it was stopped and this
is just a remount to restart it.
- Flush the writebuffer, if neccecary, else we loose it */
+ Flush the writebuffer, if necessary, else we lose it */
if (!sb_rdonly(sb)) {
jffs2_stop_garbage_collect_thread(c);
mutex_lock(&c->alloc_sem);
diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c
index 765838578a72..a3eb1e826947 100644
--- a/fs/jfs/jfs_dmap.c
+++ b/fs/jfs/jfs_dmap.c
@@ -193,7 +193,8 @@ int dbMount(struct inode *ipbmap)
bmp->db_agwidth = le32_to_cpu(dbmp_le->dn_agwidth);
bmp->db_agstart = le32_to_cpu(dbmp_le->dn_agstart);
bmp->db_agl2size = le32_to_cpu(dbmp_le->dn_agl2size);
- if (bmp->db_agl2size > L2MAXL2SIZE - L2MAXAG) {
+ if (bmp->db_agl2size > L2MAXL2SIZE - L2MAXAG ||
+ bmp->db_agl2size < 0) {
err = -EINVAL;
goto err_release_metapage;
}
diff --git a/fs/netfs/iterator.c b/fs/netfs/iterator.c
index f00d43b8ac0a..e9a45dea748a 100644
--- a/fs/netfs/iterator.c
+++ b/fs/netfs/iterator.c
@@ -134,7 +134,7 @@ static ssize_t netfs_extract_user_to_sg(struct iov_iter *iter,
npages = DIV_ROUND_UP(off + len, PAGE_SIZE);
sg_max -= npages;
- for (; npages < 0; npages--) {
+ for (; npages > 0; npages--) {
struct page *page = *pages;
size_t seg = min_t(size_t, PAGE_SIZE - off, len);
diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c
index 192cad0662d8..b1e32ec4a9d4 100644
--- a/fs/ocfs2/move_extents.c
+++ b/fs/ocfs2/move_extents.c
@@ -105,14 +105,6 @@ static int __ocfs2_move_extent(handle_t *handle,
*/
replace_rec.e_flags = ext_flags & ~OCFS2_EXT_REFCOUNTED;
- ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode),
- context->et.et_root_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
- if (ret) {
- mlog_errno(ret);
- goto out;
- }
-
ret = ocfs2_split_extent(handle, &context->et, path, index,
&replace_rec, context->meta_ac,
&context->dealloc);
@@ -121,8 +113,6 @@ static int __ocfs2_move_extent(handle_t *handle,
goto out;
}
- ocfs2_journal_dirty(handle, context->et.et_root_bh);
-
context->new_phys_cpos = new_p_cpos;
/*
@@ -444,7 +434,7 @@ static int ocfs2_find_victim_alloc_group(struct inode *inode,
bg = (struct ocfs2_group_desc *)gd_bh->b_data;
if (vict_blkno < (le64_to_cpu(bg->bg_blkno) +
- le16_to_cpu(bg->bg_bits))) {
+ (le16_to_cpu(bg->bg_bits) << bits_per_unit))) {
*ret_bh = gd_bh;
*vict_bit = (vict_blkno - blkno) >>
@@ -559,6 +549,7 @@ static void ocfs2_probe_alloc_group(struct inode *inode, struct buffer_head *bh,
last_free_bits++;
if (last_free_bits == move_len) {
+ i -= move_len;
*goal_bit = i;
*phys_cpos = base_cpos + i;
break;
@@ -1030,18 +1021,19 @@ int ocfs2_ioctl_move_extents(struct file *filp, void __user *argp)
context->range = &range;
+ /*
+ * OK, the default threshold for the defragmentation
+ * is 1M, since our maximum clustersize was also 1M.
+ * Any thoughts?
+ */
+ if (!range.me_threshold)
+ range.me_threshold = 1024 * 1024;
+
+ if (range.me_threshold > i_size_read(inode))
+ range.me_threshold = i_size_read(inode);
+
if (range.me_flags & OCFS2_MOVE_EXT_FL_AUTO_DEFRAG) {
context->auto_defrag = 1;
- /*
- * ok, the default theshold for the defragmentation
- * is 1M, since our maximum clustersize was 1M also.
- * any thought?
- */
- if (!range.me_threshold)
- range.me_threshold = 1024 * 1024;
-
- if (range.me_threshold > i_size_read(inode))
- range.me_threshold = i_size_read(inode);
if (range.me_flags & OCFS2_MOVE_EXT_FL_PART_DEFRAG)
context->partial = 1;
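The one-line bound fix in ocfs2_find_victim_alloc_group() above is easy to miss: bg_bits counts allocation units, not blocks, so the old comparison made every group's block range look far too small. A tiny sketch, with all numbers made up for illustration, shows the difference:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t bg_blkno = 4096;	/* first block of the group */
	uint16_t bg_bits = 32256;	/* allocation units in the group */
	int bits_per_unit = 3;		/* 2^3 = 8 blocks per unit */
	uint64_t vict_blkno = 40000;	/* victim block being searched for */

	/* old bound: treats a unit count as a block count, range too small */
	printf("old bound hit: %d\n",
	       vict_blkno < bg_blkno + bg_bits);
	/* fixed bound: scale units to blocks before comparing */
	printf("new bound hit: %d\n",
	       vict_blkno < bg_blkno + ((uint64_t)bg_bits << bits_per_unit));
	return 0;
}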
diff --git a/fs/open.c b/fs/open.c
index 8038cf652583..4401a73d4032 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -368,7 +368,37 @@ COMPAT_SYSCALL_DEFINE6(fallocate, int, fd, int, mode, compat_arg_u64_dual(offset
* access() needs to use the real uid/gid, not the effective uid/gid.
* We do this by temporarily clearing all FS-related capabilities and
* switching the fsuid/fsgid around to the real ones.
+ *
+ * Creating new credentials is expensive, so we try to skip doing it,
+ * which we can if the result would match what we already got.
*/
+static bool access_need_override_creds(int flags)
+{
+ const struct cred *cred;
+
+ if (flags & AT_EACCESS)
+ return false;
+
+ cred = current_cred();
+ if (!uid_eq(cred->fsuid, cred->uid) ||
+ !gid_eq(cred->fsgid, cred->gid))
+ return true;
+
+ if (!issecure(SECURE_NO_SETUID_FIXUP)) {
+ kuid_t root_uid = make_kuid(cred->user_ns, 0);
+ if (!uid_eq(cred->uid, root_uid)) {
+ if (!cap_isclear(cred->cap_effective))
+ return true;
+ } else {
+ if (!cap_isidentical(cred->cap_effective,
+ cred->cap_permitted))
+ return true;
+ }
+ }
+
+ return false;
+}
+
static const struct cred *access_override_creds(void)
{
const struct cred *old_cred;
@@ -378,6 +408,12 @@ static const struct cred *access_override_creds(void)
if (!override_cred)
return NULL;
+ /*
+ * XXX access_need_override_creds performs checks in hopes of skipping
+ * this work. Make sure it stays in sync if making any changes in this
+ * routine.
+ */
+
override_cred->fsuid = override_cred->uid;
override_cred->fsgid = override_cred->gid;
@@ -437,7 +473,7 @@ static long do_faccessat(int dfd, const char __user *filename, int mode, int fla
if (flags & AT_EMPTY_PATH)
lookup_flags |= LOOKUP_EMPTY;
- if (!(flags & AT_EACCESS)) {
+ if (access_need_override_creds(flags)) {
old_cred = access_override_creds();
if (!old_cred)
return -ENOMEM;
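The fast path added above only matters for access()-style checks against the real ids. A small user-space sketch of the two faccessat(2) modes; the path is arbitrary and chosen only because it is normally unreadable to unprivileged users:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	/*
	 * Without AT_EACCESS the check uses the real uid/gid, which is
	 * the path access_need_override_creds() now guards; with
	 * AT_EACCESS the effective ids are used and no credential
	 * override is ever needed.
	 */
	int real_ids = faccessat(AT_FDCWD, "/etc/shadow", R_OK, 0);
	int eff_ids  = faccessat(AT_FDCWD, "/etc/shadow", R_OK, AT_EACCESS);

	printf("real-id check: %d, effective-id check: %d\n",
	       real_ids, eff_ids);
	return 0;
}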
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 49283b8103c7..9b0315d34c58 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -300,13 +300,8 @@ static inline void task_sig(struct seq_file *m, struct task_struct *p)
static void render_cap_t(struct seq_file *m, const char *header,
kernel_cap_t *a)
{
- unsigned __capi;
-
seq_puts(m, header);
- CAP_FOR_EACH_U32(__capi) {
- seq_put_hex_ll(m, NULL,
- a->cap[CAP_LAST_U32 - __capi], 8);
- }
+ seq_put_hex_ll(m, NULL, a->val, 16);
seq_putc(m, '\n');
}
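Since kernel_cap_t is now a single u64 value, the one-shot dump prints exactly what the old per-u32 loop did. A quick user-space sketch with a made-up but typical-looking capability mask:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t val = 0x00000000a80425fbULL;	/* invented CapEff value */

	/* old loop: two 8-digit u32 halves, high word first */
	printf("CapEff:\t%08x%08x\n",
	       (uint32_t)(val >> 32), (uint32_t)val);
	/* new code: one 16-digit dump of the single u64 */
	printf("CapEff:\t%016llx\n", (unsigned long long)val);
	return 0;
}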
diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c
index e8b9b756f0ac..d76eb7b39f56 100644
--- a/fs/ubifs/budget.c
+++ b/fs/ubifs/budget.c
@@ -209,11 +209,10 @@ long long ubifs_calc_available(const struct ubifs_info *c, int min_idx_lebs)
subtract_lebs += 1;
/*
- * The GC journal head LEB is not really accessible. And since
- * different write types go to different heads, we may count only on
- * one head's space.
+ * Since different write types go to different heads, we should
+ * reserve one LEB for each head.
*/
- subtract_lebs += c->jhead_cnt - 1;
+ subtract_lebs += c->jhead_cnt;
/* We also reserve one LEB for deletions, which bypass budgeting */
subtract_lebs += 1;
@@ -400,7 +399,7 @@ static int calc_dd_growth(const struct ubifs_info *c,
dd_growth = req->dirtied_page ? c->bi.page_budget : 0;
if (req->dirtied_ino)
- dd_growth += c->bi.inode_budget << (req->dirtied_ino - 1);
+ dd_growth += c->bi.inode_budget * req->dirtied_ino;
if (req->mod_dent)
dd_growth += c->bi.dent_budget;
dd_growth += req->dirtied_ino_d;
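The budget.c change above replaces an exponential shift with a linear multiply; with a made-up per-inode budget the overcharge is obvious:

#include <stdio.h>

int main(void)
{
	long long inode_budget = 160;	/* invented per-inode budget */

	for (unsigned int n = 1; n <= 4; n++)
		printf("dirtied_ino=%u  old=%lld  new=%lld\n", n,
		       inode_budget << (n - 1),	/* exponential overcharge */
		       inode_budget * n);	/* linear: one budget each */
	return 0;
}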
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index 1e92c1730c16..1505539f6fe9 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -1151,7 +1151,6 @@ static int ubifs_symlink(struct mnt_idmap *idmap, struct inode *dir,
int err, sz_change, len = strlen(symname);
struct fscrypt_str disk_link;
struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1,
- .new_ino_d = ALIGN(len, 8),
.dirtied_ino = 1 };
struct fscrypt_name nm;
@@ -1167,6 +1166,7 @@ static int ubifs_symlink(struct mnt_idmap *idmap, struct inode *dir,
* Budget request settings: new inode, new direntry and changing parent
* directory inode.
*/
+ req.new_ino_d = ALIGN(disk_link.len - 1, 8);
err = ubifs_budget_space(c, &req);
if (err)
return err;
@@ -1324,6 +1324,8 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry,
if (unlink) {
ubifs_assert(c, inode_is_locked(new_inode));
+ /* Budget for old inode's data when its nlink > 1. */
+ req.dirtied_ino_d = ALIGN(ubifs_inode(new_inode)->data_len, 8);
err = ubifs_purge_xattrs(new_inode);
if (err)
return err;
@@ -1566,6 +1568,15 @@ static int ubifs_xrename(struct inode *old_dir, struct dentry *old_dentry,
ubifs_assert(c, fst_inode && snd_inode);
+ /*
+ * Budget request settings: changing two direntries, changing the two
+ * parent directory inodes.
+ */
+
+ dbg_gen("dent '%pd' ino %lu in dir ino %lu exchange dent '%pd' ino %lu in dir ino %lu",
+ old_dentry, fst_inode->i_ino, old_dir->i_ino,
+ new_dentry, snd_inode->i_ino, new_dir->i_ino);
+
err = fscrypt_setup_filename(old_dir, &old_dentry->d_name, 0, &fst_nm);
if (err)
return err;
@@ -1576,6 +1587,10 @@ static int ubifs_xrename(struct inode *old_dir, struct dentry *old_dentry,
return err;
}
+ err = ubifs_budget_space(c, &req);
+ if (err)
+ goto out;
+
lock_4_inodes(old_dir, new_dir, NULL, NULL);
time = current_time(old_dir);
@@ -1601,6 +1616,7 @@ static int ubifs_xrename(struct inode *old_dir, struct dentry *old_dentry,
unlock_4_inodes(old_dir, new_dir, NULL, NULL);
ubifs_release_budget(c, &req);
+out:
fscrypt_free_filename(&fst_nm);
fscrypt_free_filename(&snd_nm);
return err;
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 8cb5d76b301c..979ab1d9d0c3 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -1032,7 +1032,7 @@ static int ubifs_writepage(struct page *page, struct writeback_control *wbc)
if (page->index >= synced_i_size >> PAGE_SHIFT) {
err = inode->i_sb->s_op->write_inode(inode, NULL);
if (err)
- goto out_unlock;
+ goto out_redirty;
/*
* The inode has been written, but the write-buffer has
* not been synchronized, so in case of an unclean
@@ -1060,11 +1060,17 @@ static int ubifs_writepage(struct page *page, struct writeback_control *wbc)
if (i_size > synced_i_size) {
err = inode->i_sb->s_op->write_inode(inode, NULL);
if (err)
- goto out_unlock;
+ goto out_redirty;
}
return do_writepage(page, len);
-
+out_redirty:
+ /*
+ * redirty_page_for_writepage() won't call ubifs_dirty_inode() because
+ * it passes the I_DIRTY_PAGES flag when calling __mark_inode_dirty(), so
+ * there is no need to budget space for the dirty inode.
+ */
+ redirty_page_for_writepage(wbc, page);
out_unlock:
unlock_page(page);
return err;
@@ -1466,14 +1472,23 @@ static bool ubifs_release_folio(struct folio *folio, gfp_t unused_gfp_flags)
struct inode *inode = folio->mapping->host;
struct ubifs_info *c = inode->i_sb->s_fs_info;
- /*
- * An attempt to release a dirty page without budgeting for it - should
- * not happen.
- */
if (folio_test_writeback(folio))
return false;
+
+ /*
+ * Page is private but not dirty, weird? There is one condition
+ * that makes this happen: ubifs_writepage skipped the page
+ * because its index is beyond i_size (for example, the file was
+ * truncated by another process A), and then the page was
+ * invalidated by an fadvise64 syscall before being truncated by
+ * process A.
+ */
ubifs_assert(c, folio_test_private(folio));
- ubifs_assert(c, 0);
+ if (folio_test_checked(folio))
+ release_new_page_budget(c);
+ else
+ release_existing_page_budget(c);
+
+ atomic_long_dec(&c->dirty_pg_cnt);
folio_detach_private(folio);
folio_clear_checked(folio);
return true;
diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c
index 1607a3c76681..01d8eb170382 100644
--- a/fs/ubifs/io.c
+++ b/fs/ubifs/io.c
@@ -488,7 +488,7 @@ void ubifs_prep_grp_node(struct ubifs_info *c, void *node, int len, int last)
}
/**
- * wbuf_timer_callback - write-buffer timer callback function.
+ * wbuf_timer_callback_nolock - write-buffer timer callback function.
* @timer: timer data (write-buffer descriptor)
*
* This function is called when the write-buffer timer expires.
@@ -505,7 +505,7 @@ static enum hrtimer_restart wbuf_timer_callback_nolock(struct hrtimer *timer)
}
/**
- * new_wbuf_timer - start new write-buffer timer.
+ * new_wbuf_timer_nolock - start new write-buffer timer.
* @c: UBIFS file-system description object
* @wbuf: write-buffer descriptor
*/
@@ -531,7 +531,7 @@ static void new_wbuf_timer_nolock(struct ubifs_info *c, struct ubifs_wbuf *wbuf)
}
/**
- * cancel_wbuf_timer - cancel write-buffer timer.
+ * cancel_wbuf_timer_nolock - cancel write-buffer timer.
* @wbuf: write-buffer descriptor
*/
static void cancel_wbuf_timer_nolock(struct ubifs_wbuf *wbuf)
diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c
index d02509920baf..dc52ac0f4a34 100644
--- a/fs/ubifs/journal.c
+++ b/fs/ubifs/journal.c
@@ -1201,9 +1201,13 @@ out_free:
* ubifs_jnl_rename - rename a directory entry.
* @c: UBIFS file-system description object
* @old_dir: parent inode of directory entry to rename
- * @old_dentry: directory entry to rename
+ * @old_inode: directory entry's inode to rename
+ * @old_nm: name of the old directory entry to rename
* @new_dir: parent inode of directory entry to rename
- * @new_dentry: new directory entry (or directory entry to replace)
+ * @new_inode: new directory entry's inode (or directory entry's inode to
+ * replace)
+ * @new_nm: new name of the new directory entry
+ * @whiteout: whiteout inode
* @sync: non-zero if the write-buffer has to be synchronized
*
* This function implements the re-name operation which may involve writing up
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index d0c9a09988bc..32cb14759796 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -833,7 +833,7 @@ static int alloc_wbufs(struct ubifs_info *c)
INIT_LIST_HEAD(&c->jheads[i].buds_list);
err = ubifs_wbuf_init(c, &c->jheads[i].wbuf);
if (err)
- return err;
+ goto out_wbuf;
c->jheads[i].wbuf.sync_callback = &bud_wbuf_callback;
c->jheads[i].wbuf.jhead = i;
@@ -841,7 +841,7 @@ static int alloc_wbufs(struct ubifs_info *c)
c->jheads[i].log_hash = ubifs_hash_get_desc(c);
if (IS_ERR(c->jheads[i].log_hash)) {
err = PTR_ERR(c->jheads[i].log_hash);
- goto out;
+ goto out_log_hash;
}
}
@@ -854,9 +854,18 @@ static int alloc_wbufs(struct ubifs_info *c)
return 0;
-out:
- while (i--)
+out_log_hash:
+ kfree(c->jheads[i].wbuf.buf);
+ kfree(c->jheads[i].wbuf.inodes);
+
+out_wbuf:
+ while (i--) {
+ kfree(c->jheads[i].wbuf.buf);
+ kfree(c->jheads[i].wbuf.inodes);
kfree(c->jheads[i].log_hash);
+ }
+ kfree(c->jheads);
+ c->jheads = NULL;
return err;
}
diff --git a/fs/ubifs/sysfs.c b/fs/ubifs/sysfs.c
index 06ad8fa1fcfb..1c958148bb87 100644
--- a/fs/ubifs/sysfs.c
+++ b/fs/ubifs/sysfs.c
@@ -74,13 +74,13 @@ static const struct sysfs_ops ubifs_attr_ops = {
.show = ubifs_attr_show,
};
-static struct kobj_type ubifs_sb_ktype = {
+static const struct kobj_type ubifs_sb_ktype = {
.default_groups = ubifs_groups,
.sysfs_ops = &ubifs_attr_ops,
.release = ubifs_sb_release,
};
-static struct kobj_type ubifs_ktype = {
+static const struct kobj_type ubifs_ktype = {
.sysfs_ops = &ubifs_attr_ops,
};
@@ -144,6 +144,8 @@ int __init ubifs_sysfs_init(void)
kobject_set_name(&ubifs_kset.kobj, "ubifs");
ubifs_kset.kobj.parent = fs_kobj;
ret = kset_register(&ubifs_kset);
+ if (ret)
+ kset_put(&ubifs_kset);
return ret;
}
diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c
index 488f3da7a6c6..2469f72eeaab 100644
--- a/fs/ubifs/tnc.c
+++ b/fs/ubifs/tnc.c
@@ -267,11 +267,18 @@ static struct ubifs_znode *dirty_cow_znode(struct ubifs_info *c,
if (zbr->len) {
err = insert_old_idx(c, zbr->lnum, zbr->offs);
if (unlikely(err))
- return ERR_PTR(err);
+ /*
+ * Obsolete znodes will be freed by tnc_destroy_cnext()
+ * or free_obsolete_znodes(); copied-up znodes should
+ * be added back to the tnc and freed by
+ * ubifs_destroy_tnc_subtree().
+ */
+ goto out;
err = add_idx_dirt(c, zbr->lnum, zbr->len);
} else
err = 0;
+out:
zbr->znode = zn;
zbr->lnum = 0;
zbr->offs = 0;
@@ -3053,6 +3060,21 @@ static void tnc_destroy_cnext(struct ubifs_info *c)
cnext = cnext->cnext;
if (ubifs_zn_obsolete(znode))
kfree(znode);
+ else if (!ubifs_zn_cow(znode)) {
+ /*
+ * Don't forget to update the clean znode count after a
+ * failed commit, because ubifs checks this count while
+ * closing the tnc. A non-obsolete znode could be
+ * re-dirtied during the commit, so the dirty flag is
+ * not trustworthy. The 'COW_ZNODE' flag is set on each
+ * dirty znode before the commit and cleared once the
+ * znode becomes clean, so clean znodes can be counted
+ * from this flag.
+ */
+ atomic_long_inc(&c->clean_zn_cnt);
+ atomic_long_inc(&ubifs_clean_zn_cnt);
+ }
} while (cnext && cnext != c->cnext);
}
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index 9063b73536f8..4c36044140e7 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -1623,8 +1623,13 @@ static inline int ubifs_check_hmac(const struct ubifs_info *c,
return crypto_memneq(expected, got, c->hmac_desc_len);
}
+#ifdef CONFIG_UBIFS_FS_AUTHENTICATION
void ubifs_bad_hash(const struct ubifs_info *c, const void *node,
const u8 *hash, int lnum, int offs);
+#else
+static inline void ubifs_bad_hash(const struct ubifs_info *c, const void *node,
+ const u8 *hash, int lnum, int offs) {}
+#endif
int __ubifs_node_check_hash(const struct ubifs_info *c, const void *buf,
const u8 *expected);
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index f7a9607c2b95..2210e5eb1ea0 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -193,7 +193,7 @@ static int udf_adinicb_writepage(struct folio *folio,
struct udf_inode_info *iinfo = UDF_I(inode);
BUG_ON(!PageLocked(page));
- memcpy_to_page(page, 0, iinfo->i_data + iinfo->i_lenEAttr,
+ memcpy_from_page(iinfo->i_data + iinfo->i_lenEAttr, page, 0,
i_size_read(inode));
unlock_page(page);
mark_inode_dirty(inode);
@@ -241,6 +241,15 @@ static int udf_read_folio(struct file *file, struct folio *folio)
static void udf_readahead(struct readahead_control *rac)
{
+ struct udf_inode_info *iinfo = UDF_I(rac->mapping->host);
+
+ /*
+ * No readahead is needed for in-ICB files, and udf_get_block() would get
+ * confused by such a file anyway.
+ */
+ if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB)
+ return;
+
mpage_readahead(rac, udf_get_block);
}
@@ -407,6 +416,9 @@ static int udf_map_block(struct inode *inode, struct udf_map_rq *map)
int err;
struct udf_inode_info *iinfo = UDF_I(inode);
+ if (WARN_ON_ONCE(iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB))
+ return -EFSCORRUPTED;
+
map->oflags = 0;
if (!(map->iflags & UDF_MAP_CREATE)) {
struct kernel_lb_addr eloc;
diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c
index bb0c700afe3c..86696a1c6891 100644
--- a/fs/xfs/libxfs/xfs_ag.c
+++ b/fs/xfs/libxfs/xfs_ag.c
@@ -44,16 +44,15 @@ xfs_perag_get(
xfs_agnumber_t agno)
{
struct xfs_perag *pag;
- int ref = 0;
rcu_read_lock();
pag = radix_tree_lookup(&mp->m_perag_tree, agno);
if (pag) {
+ trace_xfs_perag_get(pag, _RET_IP_);
ASSERT(atomic_read(&pag->pag_ref) >= 0);
- ref = atomic_inc_return(&pag->pag_ref);
+ atomic_inc(&pag->pag_ref);
}
rcu_read_unlock();
- trace_xfs_perag_get(mp, agno, ref, _RET_IP_);
return pag;
}
@@ -68,7 +67,6 @@ xfs_perag_get_tag(
{
struct xfs_perag *pag;
int found;
- int ref;
rcu_read_lock();
found = radix_tree_gang_lookup_tag(&mp->m_perag_tree,
@@ -77,9 +75,9 @@ xfs_perag_get_tag(
rcu_read_unlock();
return NULL;
}
- ref = atomic_inc_return(&pag->pag_ref);
+ trace_xfs_perag_get_tag(pag, _RET_IP_);
+ atomic_inc(&pag->pag_ref);
rcu_read_unlock();
- trace_xfs_perag_get_tag(mp, pag->pag_agno, ref, _RET_IP_);
return pag;
}
@@ -87,11 +85,68 @@ void
xfs_perag_put(
struct xfs_perag *pag)
{
- int ref;
-
+ trace_xfs_perag_put(pag, _RET_IP_);
ASSERT(atomic_read(&pag->pag_ref) > 0);
- ref = atomic_dec_return(&pag->pag_ref);
- trace_xfs_perag_put(pag->pag_mount, pag->pag_agno, ref, _RET_IP_);
+ atomic_dec(&pag->pag_ref);
+}
+
+/*
+ * Active references for perag structures. This is for short-term access to
+ * the per-AG structures for walking trees or accessing state. If an AG is being
+ * shrunk or is offline, then this will fail to find that AG and return NULL
+ * instead.
+ */
+struct xfs_perag *
+xfs_perag_grab(
+ struct xfs_mount *mp,
+ xfs_agnumber_t agno)
+{
+ struct xfs_perag *pag;
+
+ rcu_read_lock();
+ pag = radix_tree_lookup(&mp->m_perag_tree, agno);
+ if (pag) {
+ trace_xfs_perag_grab(pag, _RET_IP_);
+ if (!atomic_inc_not_zero(&pag->pag_active_ref))
+ pag = NULL;
+ }
+ rcu_read_unlock();
+ return pag;
+}
+
+/*
+ * Search from @first to find the next perag with the given tag set.
+ */
+struct xfs_perag *
+xfs_perag_grab_tag(
+ struct xfs_mount *mp,
+ xfs_agnumber_t first,
+ int tag)
+{
+ struct xfs_perag *pag;
+ int found;
+
+ rcu_read_lock();
+ found = radix_tree_gang_lookup_tag(&mp->m_perag_tree,
+ (void **)&pag, first, 1, tag);
+ if (found <= 0) {
+ rcu_read_unlock();
+ return NULL;
+ }
+ trace_xfs_perag_grab_tag(pag, _RET_IP_);
+ if (!atomic_inc_not_zero(&pag->pag_active_ref))
+ pag = NULL;
+ rcu_read_unlock();
+ return pag;
+}
+
+void
+xfs_perag_rele(
+ struct xfs_perag *pag)
+{
+ trace_xfs_perag_rele(pag, _RET_IP_);
+ if (atomic_dec_and_test(&pag->pag_active_ref))
+ wake_up(&pag->pag_active_wq);
}
/*
@@ -196,6 +251,10 @@ xfs_free_perag(
cancel_delayed_work_sync(&pag->pag_blockgc_work);
xfs_buf_hash_destroy(pag);
+ /* drop the mount's active reference */
+ xfs_perag_rele(pag);
+ XFS_IS_CORRUPT(pag->pag_mount,
+ atomic_read(&pag->pag_active_ref) != 0);
call_rcu(&pag->rcu_head, __xfs_free_perag);
}
}
@@ -314,6 +373,7 @@ xfs_initialize_perag(
INIT_DELAYED_WORK(&pag->pag_blockgc_work, xfs_blockgc_worker);
INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC);
init_waitqueue_head(&pag->pagb_wait);
+ init_waitqueue_head(&pag->pag_active_wq);
pag->pagb_count = 0;
pag->pagb_tree = RB_ROOT;
#endif /* __KERNEL__ */
@@ -322,6 +382,9 @@ xfs_initialize_perag(
if (error)
goto out_remove_pag;
+ /* Active ref owned by mount indicates AG is online. */
+ atomic_set(&pag->pag_active_ref, 1);
+
/* first new pag is fully initialized */
if (first_initialised == NULLAGNUMBER)
first_initialised = index;
@@ -824,7 +887,7 @@ xfs_ag_shrink_space(
struct xfs_alloc_arg args = {
.tp = *tpp,
.mp = mp,
- .type = XFS_ALLOCTYPE_THIS_BNO,
+ .pag = pag,
.minlen = delta,
.maxlen = delta,
.oinfo = XFS_RMAP_OINFO_SKIP_UPDATE,
@@ -856,14 +919,11 @@ xfs_ag_shrink_space(
if (delta >= aglen)
return -EINVAL;
- args.fsbno = XFS_AGB_TO_FSB(mp, pag->pag_agno, aglen - delta);
-
/*
* Make sure that the last inode cluster cannot overlap with the new
* end of the AG, even if it's sparse.
*/
- error = xfs_ialloc_check_shrink(*tpp, pag->pag_agno, agibp,
- aglen - delta);
+ error = xfs_ialloc_check_shrink(pag, *tpp, agibp, aglen - delta);
if (error)
return error;
@@ -876,7 +936,8 @@ xfs_ag_shrink_space(
return error;
/* internal log shouldn't also show up in the free space btrees */
- error = xfs_alloc_vextent(&args);
+ error = xfs_alloc_vextent_exact_bno(&args,
+ XFS_AGB_TO_FSB(mp, pag->pag_agno, aglen - delta));
if (!error && args.agbno == NULLAGBLOCK)
error = -ENOSPC;
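A hedged sketch of the lifetime rule the active-reference comment above describes: take an active reference for the duration of a walk and always pair it with xfs_perag_rele(). The walker function is invented for illustration and assumes the usual xfs headers; xfs_perag_grab() and xfs_perag_rele() are the helpers added by the patch:

#include "xfs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_ag.h"

/* hypothetical walker, not part of the patch */
static int
xfs_demo_walk_ag(
	struct xfs_mount	*mp,
	xfs_agnumber_t		agno)
{
	struct xfs_perag	*pag;

	/* returns NULL if the AG is offline or being shrunk */
	pag = xfs_perag_grab(mp, agno);
	if (!pag)
		return -ENOENT;

	/* ... walk btrees or read per-AG state under the active ref ... */

	/* dropping the last active reference wakes pag_active_wq waiters */
	xfs_perag_rele(pag);
	return 0;
}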
diff --git a/fs/xfs/libxfs/xfs_ag.h b/fs/xfs/libxfs/xfs_ag.h
index 191b22b9a35b..5e18536dfdce 100644
--- a/fs/xfs/libxfs/xfs_ag.h
+++ b/fs/xfs/libxfs/xfs_ag.h
@@ -32,14 +32,12 @@ struct xfs_ag_resv {
struct xfs_perag {
struct xfs_mount *pag_mount; /* owner filesystem */
xfs_agnumber_t pag_agno; /* AG this structure belongs to */
- atomic_t pag_ref; /* perag reference count */
- char pagf_init; /* this agf's entry is initialized */
- char pagi_init; /* this agi's entry is initialized */
- char pagf_metadata; /* the agf is preferred to be metadata */
- char pagi_inodeok; /* The agi is ok for inodes */
+ atomic_t pag_ref; /* passive reference count */
+ atomic_t pag_active_ref; /* active reference count */
+ wait_queue_head_t pag_active_wq;/* woken when active_ref falls to zero */
+ unsigned long pag_opstate;
uint8_t pagf_levels[XFS_BTNUM_AGF];
/* # of levels in bno & cnt btree */
- bool pagf_agflreset; /* agfl requires reset before use */
uint32_t pagf_flcount; /* count of blocks in freelist */
xfs_extlen_t pagf_freeblks; /* total free blocks */
xfs_extlen_t pagf_longest; /* longest free space */
@@ -106,16 +104,44 @@ struct xfs_perag {
#endif /* __KERNEL__ */
};
+/*
+ * Per-AG operational state. These are atomic flag bits.
+ */
+#define XFS_AGSTATE_AGF_INIT 0
+#define XFS_AGSTATE_AGI_INIT 1
+#define XFS_AGSTATE_PREFERS_METADATA 2
+#define XFS_AGSTATE_ALLOWS_INODES 3
+#define XFS_AGSTATE_AGFL_NEEDS_RESET 4
+
+#define __XFS_AG_OPSTATE(name, NAME) \
+static inline bool xfs_perag_ ## name (struct xfs_perag *pag) \
+{ \
+ return test_bit(XFS_AGSTATE_ ## NAME, &pag->pag_opstate); \
+}
+
+__XFS_AG_OPSTATE(initialised_agf, AGF_INIT)
+__XFS_AG_OPSTATE(initialised_agi, AGI_INIT)
+__XFS_AG_OPSTATE(prefers_metadata, PREFERS_METADATA)
+__XFS_AG_OPSTATE(allows_inodes, ALLOWS_INODES)
+__XFS_AG_OPSTATE(agfl_needs_reset, AGFL_NEEDS_RESET)
+
int xfs_initialize_perag(struct xfs_mount *mp, xfs_agnumber_t agcount,
xfs_rfsblock_t dcount, xfs_agnumber_t *maxagi);
int xfs_initialize_perag_data(struct xfs_mount *mp, xfs_agnumber_t agno);
void xfs_free_perag(struct xfs_mount *mp);
+/* Passive AG references */
struct xfs_perag *xfs_perag_get(struct xfs_mount *mp, xfs_agnumber_t agno);
struct xfs_perag *xfs_perag_get_tag(struct xfs_mount *mp, xfs_agnumber_t agno,
unsigned int tag);
void xfs_perag_put(struct xfs_perag *pag);
+/* Active AG references */
+struct xfs_perag *xfs_perag_grab(struct xfs_mount *, xfs_agnumber_t);
+struct xfs_perag *xfs_perag_grab_tag(struct xfs_mount *, xfs_agnumber_t,
+ int tag);
+void xfs_perag_rele(struct xfs_perag *pag);
+
/*
* Per-ag geometry information and validation
*/
@@ -193,31 +219,86 @@ xfs_perag_next(
struct xfs_mount *mp = pag->pag_mount;
*agno = pag->pag_agno + 1;
- xfs_perag_put(pag);
- if (*agno > end_agno)
- return NULL;
- return xfs_perag_get(mp, *agno);
+ xfs_perag_rele(pag);
+ while (*agno <= end_agno) {
+ pag = xfs_perag_grab(mp, *agno);
+ if (pag)
+ return pag;
+ (*agno)++;
+ }
+ return NULL;
}
#define for_each_perag_range(mp, agno, end_agno, pag) \
- for ((pag) = xfs_perag_get((mp), (agno)); \
+ for ((pag) = xfs_perag_grab((mp), (agno)); \
(pag) != NULL; \
(pag) = xfs_perag_next((pag), &(agno), (end_agno)))
#define for_each_perag_from(mp, agno, pag) \
for_each_perag_range((mp), (agno), (mp)->m_sb.sb_agcount - 1, (pag))
-
#define for_each_perag(mp, agno, pag) \
(agno) = 0; \
for_each_perag_from((mp), (agno), (pag))
#define for_each_perag_tag(mp, agno, pag, tag) \
- for ((agno) = 0, (pag) = xfs_perag_get_tag((mp), 0, (tag)); \
+ for ((agno) = 0, (pag) = xfs_perag_grab_tag((mp), 0, (tag)); \
(pag) != NULL; \
(agno) = (pag)->pag_agno + 1, \
- xfs_perag_put(pag), \
- (pag) = xfs_perag_get_tag((mp), (agno), (tag)))
+ xfs_perag_rele(pag), \
+ (pag) = xfs_perag_grab_tag((mp), (agno), (tag)))
+
+static inline struct xfs_perag *
+xfs_perag_next_wrap(
+ struct xfs_perag *pag,
+ xfs_agnumber_t *agno,
+ xfs_agnumber_t stop_agno,
+ xfs_agnumber_t restart_agno,
+ xfs_agnumber_t wrap_agno)
+{
+ struct xfs_mount *mp = pag->pag_mount;
+
+ *agno = pag->pag_agno + 1;
+ xfs_perag_rele(pag);
+ while (*agno != stop_agno) {
+ if (*agno >= wrap_agno) {
+ if (restart_agno >= stop_agno)
+ break;
+ *agno = restart_agno;
+ }
+
+ pag = xfs_perag_grab(mp, *agno);
+ if (pag)
+ return pag;
+ (*agno)++;
+ }
+ return NULL;
+}
+
+/*
+ * Iterate all AGs from start_agno through wrap_agno, then restart_agno through
+ * (start_agno - 1).
+ */
+#define for_each_perag_wrap_range(mp, start_agno, restart_agno, wrap_agno, agno, pag) \
+ for ((agno) = (start_agno), (pag) = xfs_perag_grab((mp), (agno)); \
+ (pag) != NULL; \
+ (pag) = xfs_perag_next_wrap((pag), &(agno), (start_agno), \
+ (restart_agno), (wrap_agno)))
+/*
+ * Iterate all AGs from start_agno through wrap_agno, then 0 through
+ * (start_agno - 1).
+ */
+#define for_each_perag_wrap_at(mp, start_agno, wrap_agno, agno, pag) \
+ for_each_perag_wrap_range((mp), (start_agno), 0, (wrap_agno), (agno), (pag))
+
+/*
+ * Iterate all AGs from start_agno through to the end of the filesystem, then 0
+ * through (start_agno - 1).
+ */
+#define for_each_perag_wrap(mp, start_agno, agno, pag) \
+ for_each_perag_wrap_at((mp), (start_agno), (mp)->m_sb.sb_agcount, \
+ (agno), (pag))
+
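A standalone userspace sketch of the visit order these wrap iterators produce (agcount, start and restart values are assumed for illustration, and the sketch assumes restart < start):

#include <stdio.h>

int main(void)
{
	unsigned int agcount = 8;	/* assumed AG count */
	unsigned int start = 5;		/* first AG to visit */
	unsigned int restart = 0;	/* AG to wrap back to */
	unsigned int agno = start;

	do {
		printf("%u ", agno);	/* prints: 5 6 7 0 1 2 3 4 */
		if (++agno >= agcount)
			agno = restart;
	} while (agno != start);
	printf("\n");
	return 0;
}

With a non-zero restart_agno, as for_each_perag_wrap_range() allows, the wrapped portion starts at that AG instead of 0; this is how the minimum_agno constraint is enforced later in xfs_alloc_vextent_iterate_ags().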
struct aghdr_init_data {
/* per ag data */
diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c
index 5af123d13a63..7fd1fea95552 100644
--- a/fs/xfs/libxfs/xfs_ag_resv.c
+++ b/fs/xfs/libxfs/xfs_ag_resv.c
@@ -264,7 +264,7 @@ xfs_ag_resv_init(
if (error)
goto out;
- error = xfs_finobt_calc_reserves(mp, tp, pag, &ask, &used);
+ error = xfs_finobt_calc_reserves(pag, tp, &ask, &used);
if (error)
goto out;
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index f8ff81c3de76..6a037173d20d 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -36,10 +36,6 @@ struct workqueue_struct *xfs_alloc_wq;
#define XFSA_FIXUP_BNO_OK 1
#define XFSA_FIXUP_CNT_OK 2
-STATIC int xfs_alloc_ag_vextent_exact(xfs_alloc_arg_t *);
-STATIC int xfs_alloc_ag_vextent_near(xfs_alloc_arg_t *);
-STATIC int xfs_alloc_ag_vextent_size(xfs_alloc_arg_t *);
-
/*
* Size of the AGFL. For CRC-enabled filesystems we steal a couple of slots in
* the beginning of the block for a proper header with the location information
@@ -772,8 +768,6 @@ xfs_alloc_cur_setup(
int error;
int i;
- ASSERT(args->alignment == 1 || args->type != XFS_ALLOCTYPE_THIS_BNO);
-
acur->cur_len = args->maxlen;
acur->rec_bno = 0;
acur->rec_len = 0;
@@ -887,7 +881,6 @@ xfs_alloc_cur_check(
* We have an aligned record that satisfies minlen and beats or matches
* the candidate extent size. Compare locality for near allocation mode.
*/
- ASSERT(args->type == XFS_ALLOCTYPE_NEAR_BNO);
diff = xfs_alloc_compute_diff(args->agbno, args->len,
args->alignment, args->datatype,
bnoa, lena, &bnew);
@@ -1133,78 +1126,6 @@ error:
}
/*
- * Allocate a variable extent in the allocation group agno.
- * Type and bno are used to determine where in the allocation group the
- * extent will start.
- * Extent's length (returned in *len) will be between minlen and maxlen,
- * and of the form k * prod + mod unless there's nothing that large.
- * Return the starting a.g. block, or NULLAGBLOCK if we can't do it.
- */
-STATIC int /* error */
-xfs_alloc_ag_vextent(
- xfs_alloc_arg_t *args) /* argument structure for allocation */
-{
- int error=0;
-
- ASSERT(args->minlen > 0);
- ASSERT(args->maxlen > 0);
- ASSERT(args->minlen <= args->maxlen);
- ASSERT(args->mod < args->prod);
- ASSERT(args->alignment > 0);
-
- /*
- * Branch to correct routine based on the type.
- */
- args->wasfromfl = 0;
- switch (args->type) {
- case XFS_ALLOCTYPE_THIS_AG:
- error = xfs_alloc_ag_vextent_size(args);
- break;
- case XFS_ALLOCTYPE_NEAR_BNO:
- error = xfs_alloc_ag_vextent_near(args);
- break;
- case XFS_ALLOCTYPE_THIS_BNO:
- error = xfs_alloc_ag_vextent_exact(args);
- break;
- default:
- ASSERT(0);
- /* NOTREACHED */
- }
-
- if (error || args->agbno == NULLAGBLOCK)
- return error;
-
- ASSERT(args->len >= args->minlen);
- ASSERT(args->len <= args->maxlen);
- ASSERT(!args->wasfromfl || args->resv != XFS_AG_RESV_AGFL);
- ASSERT(args->agbno % args->alignment == 0);
-
- /* if not file data, insert new block into the reverse map btree */
- if (!xfs_rmap_should_skip_owner_update(&args->oinfo)) {
- error = xfs_rmap_alloc(args->tp, args->agbp, args->pag,
- args->agbno, args->len, &args->oinfo);
- if (error)
- return error;
- }
-
- if (!args->wasfromfl) {
- error = xfs_alloc_update_counters(args->tp, args->agbp,
- -((long)(args->len)));
- if (error)
- return error;
-
- ASSERT(!xfs_extent_busy_search(args->mp, args->pag,
- args->agbno, args->len));
- }
-
- xfs_ag_resv_alloc_extent(args->pag, args->resv, args);
-
- XFS_STATS_INC(args->mp, xs_allocx);
- XFS_STATS_ADD(args->mp, xs_allocb, args->len);
- return error;
-}
-
-/*
* Allocate a variable extent at exactly agno/bno.
* Extent's length (returned in *len) will be between minlen and maxlen,
* and of the form k * prod + mod unless there's nothing that large.
@@ -1389,7 +1310,6 @@ xfs_alloc_ag_vextent_locality(
bool fbinc;
ASSERT(acur->len == 0);
- ASSERT(args->type == XFS_ALLOCTYPE_NEAR_BNO);
*stat = 0;
@@ -2435,7 +2355,7 @@ xfs_agfl_reset(
struct xfs_mount *mp = tp->t_mountp;
struct xfs_agf *agf = agbp->b_addr;
- ASSERT(pag->pagf_agflreset);
+ ASSERT(xfs_perag_agfl_needs_reset(pag));
trace_xfs_agfl_reset(mp, agf, 0, _RET_IP_);
xfs_warn(mp,
@@ -2450,7 +2370,7 @@ xfs_agfl_reset(
XFS_AGF_FLCOUNT);
pag->pagf_flcount = 0;
- pag->pagf_agflreset = false;
+ clear_bit(XFS_AGSTATE_AGFL_NEEDS_RESET, &pag->pag_opstate);
}
/*
@@ -2605,7 +2525,7 @@ xfs_alloc_fix_freelist(
/* deferred ops (AGFL block frees) require permanent transactions */
ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
- if (!pag->pagf_init) {
+ if (!xfs_perag_initialised_agf(pag)) {
error = xfs_alloc_read_agf(pag, tp, flags, &agbp);
if (error) {
/* Couldn't lock the AGF so skip this AG. */
@@ -2620,7 +2540,8 @@ xfs_alloc_fix_freelist(
* somewhere else if we are not being asked to try harder at this
* point
*/
- if (pag->pagf_metadata && (args->datatype & XFS_ALLOC_USERDATA) &&
+ if (xfs_perag_prefers_metadata(pag) &&
+ (args->datatype & XFS_ALLOC_USERDATA) &&
(flags & XFS_ALLOC_FLAG_TRYLOCK)) {
ASSERT(!(flags & XFS_ALLOC_FLAG_FREEING));
goto out_agbp_relse;
@@ -2646,7 +2567,7 @@ xfs_alloc_fix_freelist(
}
/* reset a padding mismatched agfl before final free space check */
- if (pag->pagf_agflreset)
+ if (xfs_perag_agfl_needs_reset(pag))
xfs_agfl_reset(tp, agbp, pag);
/* If there isn't enough total space or single-extent, reject it. */
@@ -2707,7 +2628,6 @@ xfs_alloc_fix_freelist(
targs.agbp = agbp;
targs.agno = args->agno;
targs.alignment = targs.minlen = targs.prod = 1;
- targs.type = XFS_ALLOCTYPE_THIS_AG;
targs.pag = pag;
error = xfs_alloc_read_agfl(pag, tp, &agflbp);
if (error)
@@ -2720,7 +2640,7 @@ xfs_alloc_fix_freelist(
targs.resv = XFS_AG_RESV_AGFL;
/* Allocate as many blocks as possible at once. */
- error = xfs_alloc_ag_vextent(&targs);
+ error = xfs_alloc_ag_vextent_size(&targs);
if (error)
goto out_agflbp_relse;
@@ -2734,6 +2654,18 @@ xfs_alloc_fix_freelist(
break;
goto out_agflbp_relse;
}
+
+ if (!xfs_rmap_should_skip_owner_update(&targs.oinfo)) {
+ error = xfs_rmap_alloc(tp, agbp, pag,
+ targs.agbno, targs.len, &targs.oinfo);
+ if (error)
+ goto out_agflbp_relse;
+ }
+ error = xfs_alloc_update_counters(tp, agbp,
+ -((long)(targs.len)));
+ if (error)
+ goto out_agflbp_relse;
+
/*
* Put each allocated block on the list.
*/
@@ -2803,7 +2735,7 @@ xfs_alloc_get_freelist(
if (be32_to_cpu(agf->agf_flfirst) == xfs_agfl_size(mp))
agf->agf_flfirst = 0;
- ASSERT(!pag->pagf_agflreset);
+ ASSERT(!xfs_perag_agfl_needs_reset(pag));
be32_add_cpu(&agf->agf_flcount, -1);
pag->pagf_flcount--;
@@ -2892,7 +2824,7 @@ xfs_alloc_put_freelist(
if (be32_to_cpu(agf->agf_fllast) == xfs_agfl_size(mp))
agf->agf_fllast = 0;
- ASSERT(!pag->pagf_agflreset);
+ ASSERT(!xfs_perag_agfl_needs_reset(pag));
be32_add_cpu(&agf->agf_flcount, 1);
pag->pagf_flcount++;
@@ -3099,7 +3031,7 @@ xfs_alloc_read_agf(
return error;
agf = agfbp->b_addr;
- if (!pag->pagf_init) {
+ if (!xfs_perag_initialised_agf(pag)) {
pag->pagf_freeblks = be32_to_cpu(agf->agf_freeblks);
pag->pagf_btreeblks = be32_to_cpu(agf->agf_btreeblks);
pag->pagf_flcount = be32_to_cpu(agf->agf_flcount);
@@ -3111,8 +3043,8 @@ xfs_alloc_read_agf(
pag->pagf_levels[XFS_BTNUM_RMAPi] =
be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAPi]);
pag->pagf_refcount_level = be32_to_cpu(agf->agf_refcount_level);
- pag->pagf_init = 1;
- pag->pagf_agflreset = xfs_agfl_needs_reset(pag->pag_mount, agf);
+ if (xfs_agfl_needs_reset(pag->pag_mount, agf))
+ set_bit(XFS_AGSTATE_AGFL_NEEDS_RESET, &pag->pag_opstate);
/*
* Update the in-core allocbt counter. Filter out the rmapbt
@@ -3127,6 +3059,8 @@ xfs_alloc_read_agf(
if (allocbt_blks > 0)
atomic64_add(allocbt_blks,
&pag->pag_mount->m_allocbt_blks);
+
+ set_bit(XFS_AGSTATE_AGF_INIT, &pag->pag_opstate);
}
#ifdef DEBUG
else if (!xfs_is_shutdown(pag->pag_mount)) {
@@ -3148,26 +3082,25 @@ xfs_alloc_read_agf(
}
/*
- * Allocate an extent (variable-size).
- * Depending on the allocation type, we either look in a single allocation
- * group or loop over the allocation groups to find the result.
+ * Pre-process allocation arguments to set initial state that we don't require
+ * callers to set up correctly, as well as bounds-check the allocation args
+ * that are set up.
*/
-int /* error */
-xfs_alloc_vextent(
- struct xfs_alloc_arg *args) /* allocation argument structure */
+static int
+xfs_alloc_vextent_check_args(
+ struct xfs_alloc_arg *args,
+ xfs_fsblock_t target,
+ xfs_agnumber_t *minimum_agno)
{
- xfs_agblock_t agsize; /* allocation group size */
- int error;
- int flags; /* XFS_ALLOC_FLAG_... locking flags */
- struct xfs_mount *mp; /* mount structure pointer */
- xfs_agnumber_t sagno; /* starting allocation group number */
- xfs_alloctype_t type; /* input allocation type */
- int bump_rotor = 0;
- xfs_agnumber_t rotorstep = xfs_rotorstep; /* inode32 agf stepper */
-
- mp = args->mp;
- type = args->otype = args->type;
- args->agbno = NULLAGBLOCK;
+ struct xfs_mount *mp = args->mp;
+ xfs_agblock_t agsize;
+
+ args->fsbno = NULLFSBLOCK;
+
+ *minimum_agno = 0;
+ if (args->tp->t_highest_agno != NULLAGNUMBER)
+ *minimum_agno = args->tp->t_highest_agno;
+
/*
* Just fix this up, for the case where the last a.g. is shorter
* (or there's only one a.g.) and the caller couldn't easily figure
@@ -3178,168 +3111,414 @@ xfs_alloc_vextent(
args->maxlen = agsize;
if (args->alignment == 0)
args->alignment = 1;
- ASSERT(XFS_FSB_TO_AGNO(mp, args->fsbno) < mp->m_sb.sb_agcount);
- ASSERT(XFS_FSB_TO_AGBNO(mp, args->fsbno) < agsize);
+
+ ASSERT(args->minlen > 0);
+ ASSERT(args->maxlen > 0);
+ ASSERT(args->alignment > 0);
+ ASSERT(args->resv != XFS_AG_RESV_AGFL);
+
+ ASSERT(XFS_FSB_TO_AGNO(mp, target) < mp->m_sb.sb_agcount);
+ ASSERT(XFS_FSB_TO_AGBNO(mp, target) < agsize);
ASSERT(args->minlen <= args->maxlen);
ASSERT(args->minlen <= agsize);
ASSERT(args->mod < args->prod);
- if (XFS_FSB_TO_AGNO(mp, args->fsbno) >= mp->m_sb.sb_agcount ||
- XFS_FSB_TO_AGBNO(mp, args->fsbno) >= agsize ||
+
+ if (XFS_FSB_TO_AGNO(mp, target) >= mp->m_sb.sb_agcount ||
+ XFS_FSB_TO_AGBNO(mp, target) >= agsize ||
args->minlen > args->maxlen || args->minlen > agsize ||
args->mod >= args->prod) {
- args->fsbno = NULLFSBLOCK;
trace_xfs_alloc_vextent_badargs(args);
+ return -ENOSPC;
+ }
+
+ if (args->agno != NULLAGNUMBER && *minimum_agno > args->agno) {
+ trace_xfs_alloc_vextent_skip_deadlock(args);
+ return -ENOSPC;
+ }
+ return 0;
+}
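The minimum_agno checks above encode the AGF lock-ordering rule: within one transaction, AGFs may only be locked in ascending AG order. Restated as a standalone predicate (hypothetical function name, mirroring the logic above):

#include <stdbool.h>

#define NULLAGNUMBER	((unsigned int)-1)

/* would allocating in target_agno violate AGF lock ordering? */
static bool xfs_alloc_would_deadlock(unsigned int t_highest_agno,
				     unsigned int target_agno)
{
	unsigned int minimum_agno = 0;

	if (t_highest_agno != NULLAGNUMBER)
		minimum_agno = t_highest_agno;

	return target_agno != NULLAGNUMBER && minimum_agno > target_agno;
}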
+
+/*
+ * Prepare an AG for allocation. If the AG is not prepared to accept the
+ * allocation, return failure.
+ *
+ * XXX(dgc): The complexity of "need_pag" will go away as all caller paths are
+ * modified to hold their own perag references.
+ */
+static int
+xfs_alloc_vextent_prepare_ag(
+ struct xfs_alloc_arg *args)
+{
+ bool need_pag = !args->pag;
+ int error;
+
+ if (need_pag)
+ args->pag = xfs_perag_get(args->mp, args->agno);
+
+ args->agbp = NULL;
+ error = xfs_alloc_fix_freelist(args, 0);
+ if (error) {
+ trace_xfs_alloc_vextent_nofix(args);
+ if (need_pag)
+ xfs_perag_put(args->pag);
+ args->agbno = NULLAGBLOCK;
+ return error;
+ }
+ if (!args->agbp) {
+ /* cannot allocate in this AG at all */
+ trace_xfs_alloc_vextent_noagbp(args);
+ args->agbno = NULLAGBLOCK;
return 0;
}
+ args->wasfromfl = 0;
+ return 0;
+}
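Note the three-way result convention established above and relied on by every caller that follows: a negative return is a hard error, zero with args->agbp set means the AGF is locked and the AG is ready, and zero with args->agbp == NULL means this AG cannot satisfy the allocation. A sketch of the caller-side idiom (loop context assumed):

	error = xfs_alloc_vextent_prepare_ag(args);
	if (error)
		break;			/* hard failure, abort the walk */
	if (!args->agbp)
		continue;		/* AG unusable, move to the next AG */
	/* AGF locked and freelist fixed - attempt the allocation here */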
- switch (type) {
- case XFS_ALLOCTYPE_THIS_AG:
- case XFS_ALLOCTYPE_NEAR_BNO:
- case XFS_ALLOCTYPE_THIS_BNO:
- /*
- * These three force us into a single a.g.
- */
- args->agno = XFS_FSB_TO_AGNO(mp, args->fsbno);
- args->pag = xfs_perag_get(mp, args->agno);
- error = xfs_alloc_fix_freelist(args, 0);
- if (error) {
- trace_xfs_alloc_vextent_nofix(args);
- goto error0;
- }
- if (!args->agbp) {
- trace_xfs_alloc_vextent_noagbp(args);
+/*
+ * Post-process allocation results to account for the allocation if it succeeded
+ * and set the allocated block number correctly for the caller.
+ *
+ * XXX: we should really be returning ENOSPC for ENOSPC, not
+ * hiding it behind a "successful" NULLFSBLOCK allocation.
+ */
+static int
+xfs_alloc_vextent_finish(
+ struct xfs_alloc_arg *args,
+ xfs_agnumber_t minimum_agno,
+ int alloc_error,
+ bool drop_perag)
+{
+ struct xfs_mount *mp = args->mp;
+ int error = 0;
+
+ /*
+ * We can end up here with a locked AGF. If we failed, the caller is
+ * likely going to try to allocate again with different parameters, and
+ * that can widen the AGs that are searched for free space. If we have
+ * to do BMBT block allocation, we have to do a new allocation.
+ *
+ * Hence leaving this function with the AGF locked opens up potential
+ * ABBA AGF deadlocks because a future allocation attempt in this
+ * transaction may attempt to lock a lower number AGF.
+ *
+ * We can't release the AGF until the transaction is committed, so at
+ * this point we must update the "first allocation" tracker to point at
+ * this AG if the tracker is empty or points to a lower AG. This allows
+ * the next allocation attempt to be modified appropriately to avoid
+ * deadlocks.
+ */
+ if (args->agbp &&
+ (args->tp->t_highest_agno == NULLAGNUMBER ||
+ args->agno > minimum_agno))
+ args->tp->t_highest_agno = args->agno;
+
+ /*
+ * If the allocation failed with an error or we had an ENOSPC result,
+ * preserve the returned error whilst also marking the allocation result
+ * as "no extent allocated". This ensures that callers that fail to
+ * capture the error will still treat it as a failed allocation.
+ */
+ if (alloc_error || args->agbno == NULLAGBLOCK) {
+ args->fsbno = NULLFSBLOCK;
+ error = alloc_error;
+ goto out_drop_perag;
+ }
+
+ args->fsbno = XFS_AGB_TO_FSB(mp, args->agno, args->agbno);
+
+ ASSERT(args->len >= args->minlen);
+ ASSERT(args->len <= args->maxlen);
+ ASSERT(args->agbno % args->alignment == 0);
+ XFS_AG_CHECK_DADDR(mp, XFS_FSB_TO_DADDR(mp, args->fsbno), args->len);
+
+ /* if not file data, insert new block into the reverse map btree */
+ if (!xfs_rmap_should_skip_owner_update(&args->oinfo)) {
+ error = xfs_rmap_alloc(args->tp, args->agbp, args->pag,
+ args->agbno, args->len, &args->oinfo);
+ if (error)
+ goto out_drop_perag;
+ }
+
+ if (!args->wasfromfl) {
+ error = xfs_alloc_update_counters(args->tp, args->agbp,
+ -((long)(args->len)));
+ if (error)
+ goto out_drop_perag;
+
+ ASSERT(!xfs_extent_busy_search(mp, args->pag, args->agbno,
+ args->len));
+ }
+
+ xfs_ag_resv_alloc_extent(args->pag, args->resv, args);
+
+ XFS_STATS_INC(mp, xs_allocx);
+ XFS_STATS_ADD(mp, xs_allocb, args->len);
+
+out_drop_perag:
+ if (drop_perag && args->pag) {
+ xfs_perag_rele(args->pag);
+ args->pag = NULL;
+ }
+ return error;
+}
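To make the ABBA hazard described in the comment above concrete (AG numbers are hypothetical):

/*
 *   T1                           T2
 *   locks AGF 5                  locks AGF 2
 *   tries AGF 2 (waits on T2)    tries AGF 5 (waits on T1) => deadlock
 *
 * Recording t_highest_agno after each allocation forces every later
 * attempt in this transaction to stay at or above the highest AGF
 * already locked, so T1's descending lock order can never be issued.
 */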
+
+/*
+ * Allocate within a single AG only. This uses a best-fit length algorithm so if
+ * you need an exact sized allocation without locality constraints, this is the
+ * fastest way to do it.
+ *
+ * Caller is expected to hold a perag reference in args->pag.
+ */
+int
+xfs_alloc_vextent_this_ag(
+ struct xfs_alloc_arg *args,
+ xfs_agnumber_t agno)
+{
+ struct xfs_mount *mp = args->mp;
+ xfs_agnumber_t minimum_agno;
+ int error;
+
+ args->agno = agno;
+ args->agbno = 0;
+ error = xfs_alloc_vextent_check_args(args, XFS_AGB_TO_FSB(mp, agno, 0),
+ &minimum_agno);
+ if (error) {
+ if (error == -ENOSPC)
+ return 0;
+ return error;
+ }
+
+ error = xfs_alloc_vextent_prepare_ag(args);
+ if (!error && args->agbp)
+ error = xfs_alloc_ag_vextent_size(args);
+
+ return xfs_alloc_vextent_finish(args, minimum_agno, error, false);
+}
+
+/*
+ * Iterate all AGs trying to allocate an extent starting from @start_ag.
+ *
+ * If @target_agbno is non-zero, the allocation attempt in @start_agno has
+ * locality information. If we fail to allocate in that AG, then we revert to
+ * anywhere-in-AG for all the other AGs we attempt to allocate in, as there is
+ * no locality optimisation possible for those allocations.
+ *
+ * On return, args->pag may be left referenced if we finish before the "all
+ * failed" return point. The allocation finish still needs the perag, and
+ * so the caller will release it once they've finished the allocation.
+ *
+ * When we wrap the AG iteration at the end of the filesystem, we have to be
+ * careful not to wrap into AGs below ones we already have locked in the
+ * transaction if we are doing a blocking iteration. That would result in
+ * out-of-order locking of AGFs and hence can cause deadlocks.
+ */
+static int
+xfs_alloc_vextent_iterate_ags(
+ struct xfs_alloc_arg *args,
+ xfs_agnumber_t minimum_agno,
+ xfs_agnumber_t start_agno,
+ xfs_agblock_t target_agbno,
+ uint32_t flags)
+{
+ struct xfs_mount *mp = args->mp;
+ xfs_agnumber_t agno;
+ int error = 0;
+
+restart:
+ for_each_perag_wrap_range(mp, start_agno, minimum_agno,
+ mp->m_sb.sb_agcount, agno, args->pag) {
+ args->agno = agno;
+ error = xfs_alloc_vextent_prepare_ag(args);
+ if (error)
break;
+ if (!args->agbp) {
+ trace_xfs_alloc_vextent_loopfailed(args);
+ continue;
}
- args->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno);
- if ((error = xfs_alloc_ag_vextent(args)))
- goto error0;
- break;
- case XFS_ALLOCTYPE_START_BNO:
- /*
- * Try near allocation first, then anywhere-in-ag after
- * the first a.g. fails.
- */
- if ((args->datatype & XFS_ALLOC_INITIAL_USER_DATA) &&
- xfs_is_inode32(mp)) {
- args->fsbno = XFS_AGB_TO_FSB(mp,
- ((mp->m_agfrotor / rotorstep) %
- mp->m_sb.sb_agcount), 0);
- bump_rotor = 1;
- }
- args->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno);
- args->type = XFS_ALLOCTYPE_NEAR_BNO;
- fallthrough;
- case XFS_ALLOCTYPE_FIRST_AG:
+
/*
- * Rotate through the allocation groups looking for a winner.
+ * Allocation is supposed to succeed now, so break out of the
+ * loop regardless of whether we succeed or not.
*/
- if (type == XFS_ALLOCTYPE_FIRST_AG) {
- /*
- * Start with allocation group given by bno.
- */
- args->agno = XFS_FSB_TO_AGNO(mp, args->fsbno);
- args->type = XFS_ALLOCTYPE_THIS_AG;
- sagno = 0;
- flags = 0;
+ if (args->agno == start_agno && target_agbno) {
+ args->agbno = target_agbno;
+ error = xfs_alloc_ag_vextent_near(args);
} else {
- /*
- * Start with the given allocation group.
- */
- args->agno = sagno = XFS_FSB_TO_AGNO(mp, args->fsbno);
- flags = XFS_ALLOC_FLAG_TRYLOCK;
- }
- /*
- * Loop over allocation groups twice; first time with
- * trylock set, second time without.
- */
- for (;;) {
- args->pag = xfs_perag_get(mp, args->agno);
- error = xfs_alloc_fix_freelist(args, flags);
- if (error) {
- trace_xfs_alloc_vextent_nofix(args);
- goto error0;
- }
- /*
- * If we get a buffer back then the allocation will fly.
- */
- if (args->agbp) {
- if ((error = xfs_alloc_ag_vextent(args)))
- goto error0;
- break;
- }
-
- trace_xfs_alloc_vextent_loopfailed(args);
-
- /*
- * Didn't work, figure out the next iteration.
- */
- if (args->agno == sagno &&
- type == XFS_ALLOCTYPE_START_BNO)
- args->type = XFS_ALLOCTYPE_THIS_AG;
- /*
- * For the first allocation, we can try any AG to get
- * space. However, if we already have allocated a
- * block, we don't want to try AGs whose number is below
- * sagno. Otherwise, we may end up with out-of-order
- * locking of AGF, which might cause deadlock.
- */
- if (++(args->agno) == mp->m_sb.sb_agcount) {
- if (args->tp->t_firstblock != NULLFSBLOCK)
- args->agno = sagno;
- else
- args->agno = 0;
- }
- /*
- * Reached the starting a.g., must either be done
- * or switch to non-trylock mode.
- */
- if (args->agno == sagno) {
- if (flags == 0) {
- args->agbno = NULLAGBLOCK;
- trace_xfs_alloc_vextent_allfailed(args);
- break;
- }
-
- flags = 0;
- if (type == XFS_ALLOCTYPE_START_BNO) {
- args->agbno = XFS_FSB_TO_AGBNO(mp,
- args->fsbno);
- args->type = XFS_ALLOCTYPE_NEAR_BNO;
- }
- }
- xfs_perag_put(args->pag);
- }
- if (bump_rotor) {
- if (args->agno == sagno)
- mp->m_agfrotor = (mp->m_agfrotor + 1) %
- (mp->m_sb.sb_agcount * rotorstep);
- else
- mp->m_agfrotor = (args->agno * rotorstep + 1) %
- (mp->m_sb.sb_agcount * rotorstep);
+ args->agbno = 0;
+ error = xfs_alloc_ag_vextent_size(args);
}
break;
- default:
- ASSERT(0);
- /* NOTREACHED */
}
- if (args->agbno == NULLAGBLOCK)
- args->fsbno = NULLFSBLOCK;
- else {
- args->fsbno = XFS_AGB_TO_FSB(mp, args->agno, args->agbno);
-#ifdef DEBUG
- ASSERT(args->len >= args->minlen);
- ASSERT(args->len <= args->maxlen);
- ASSERT(args->agbno % args->alignment == 0);
- XFS_AG_CHECK_DADDR(mp, XFS_FSB_TO_DADDR(mp, args->fsbno),
- args->len);
-#endif
+ if (error) {
+ xfs_perag_rele(args->pag);
+ args->pag = NULL;
+ return error;
+ }
+ if (args->agbp)
+ return 0;
+ /*
+ * We didn't find an AG we can allocate from. If we were given
+ * constraining flags by the caller, drop them and retry the allocation
+ * without any constraints being set.
+ */
+ if (flags) {
+ flags = 0;
+ goto restart;
}
- xfs_perag_put(args->pag);
+
+ ASSERT(args->pag == NULL);
+ trace_xfs_alloc_vextent_allfailed(args);
return 0;
-error0:
- xfs_perag_put(args->pag);
- return error;
+}
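The restart/flags dance above implements a common two-pass pattern: a first sweep with XFS_ALLOC_FLAG_TRYLOCK that skips any AG whose AGF is contended, then, only if every AG was skipped or unsuitable, a second blocking sweep with no flags. A minimal sketch of the same shape (illustrative, not the kernel code):

#include <errno.h>

int try_alloc_in_ag(unsigned int agno, unsigned int flags);	/* hypothetical */

int alloc_two_pass(unsigned int agcount, unsigned int flags)
{
	unsigned int agno;

restart:
	for (agno = 0; agno < agcount; agno++)
		if (try_alloc_in_ag(agno, flags) == 0)
			return 0;		/* allocation succeeded */
	if (flags) {
		flags = 0;			/* second pass: block on AGF locks */
		goto restart;
	}
	return -ENOSPC;				/* both passes failed */
}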
+
+/*
+ * Iterate the AGs from the start AG to the end of the filesystem, trying
+ * to allocate blocks. It starts with a near allocation attempt in the initial
+ * AG, then falls back to anywhere-in-ag after the first AG fails. It will wrap
+ * back to zero if allowed by previous allocations in this transaction,
+ * otherwise will wrap back to the start AG and run a second blocking pass to
+ * the end of the filesystem.
+ */
+int
+xfs_alloc_vextent_start_ag(
+ struct xfs_alloc_arg *args,
+ xfs_fsblock_t target)
+{
+ struct xfs_mount *mp = args->mp;
+ xfs_agnumber_t minimum_agno;
+ xfs_agnumber_t start_agno;
+ xfs_agnumber_t rotorstep = xfs_rotorstep;
+ bool bump_rotor = false;
+ int error;
+
+ args->agno = NULLAGNUMBER;
+ args->agbno = NULLAGBLOCK;
+ error = xfs_alloc_vextent_check_args(args, target, &minimum_agno);
+ if (error) {
+ if (error == -ENOSPC)
+ return 0;
+ return error;
+ }
+
+ if ((args->datatype & XFS_ALLOC_INITIAL_USER_DATA) &&
+ xfs_is_inode32(mp)) {
+ target = XFS_AGB_TO_FSB(mp,
+ ((mp->m_agfrotor / rotorstep) %
+ mp->m_sb.sb_agcount), 0);
+ bump_rotor = 1;
+ }
+
+ start_agno = max(minimum_agno, XFS_FSB_TO_AGNO(mp, target));
+ error = xfs_alloc_vextent_iterate_ags(args, minimum_agno, start_agno,
+ XFS_FSB_TO_AGBNO(mp, target), XFS_ALLOC_FLAG_TRYLOCK);
+
+ if (bump_rotor) {
+ if (args->agno == start_agno)
+ mp->m_agfrotor = (mp->m_agfrotor + 1) %
+ (mp->m_sb.sb_agcount * rotorstep);
+ else
+ mp->m_agfrotor = (args->agno * rotorstep + 1) %
+ (mp->m_sb.sb_agcount * rotorstep);
+ }
+
+ return xfs_alloc_vextent_finish(args, minimum_agno, error, true);
+}
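The inode32 rotor arithmetic above, worked through with assumed values (rotorstep = 1, the xfs_rotorstep default, and a 4-AG filesystem):

/*
 * m_agfrotor = 6:  start AG = (6 / 1) % 4 = 2
 * success in AG 2 (the start AG):  m_agfrotor = (6 + 1) % 4 = 3,
 *                                  so the next caller starts at AG 3
 * success in AG 3 instead:         m_agfrotor = (3 * 1 + 1) % 4 = 0,
 *                                  so the next caller starts at AG 0
 *
 * i.e. successive inode32 data allocations are rotored evenly across
 * the AGs rather than piling up in one.
 */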
+
+/*
+ * Iterate from the AG indicated by @target through to the end of the
+ * filesystem attempting blocking allocation. This does not wrap or try a
+ * second pass, so it will not consider AGs lower than the one indicated by
+ * the target.
+ */
+int
+xfs_alloc_vextent_first_ag(
+ struct xfs_alloc_arg *args,
+ xfs_fsblock_t target)
+{
+ struct xfs_mount *mp = args->mp;
+ xfs_agnumber_t minimum_agno;
+ xfs_agnumber_t start_agno;
+ int error;
+
+ args->agno = NULLAGNUMBER;
+ args->agbno = NULLAGBLOCK;
+ error = xfs_alloc_vextent_check_args(args, target, &minimum_agno);
+ if (error) {
+ if (error == -ENOSPC)
+ return 0;
+ return error;
+ }
+
+ start_agno = max(minimum_agno, XFS_FSB_TO_AGNO(mp, target));
+ error = xfs_alloc_vextent_iterate_ags(args, minimum_agno, start_agno,
+ XFS_FSB_TO_AGBNO(mp, target), 0);
+ return xfs_alloc_vextent_finish(args, minimum_agno, error, true);
+}
+
+/*
+ * Allocate at the exact block target or fail. Caller is expected to hold a
+ * perag reference in args->pag.
+ */
+int
+xfs_alloc_vextent_exact_bno(
+ struct xfs_alloc_arg *args,
+ xfs_fsblock_t target)
+{
+ struct xfs_mount *mp = args->mp;
+ xfs_agnumber_t minimum_agno;
+ int error;
+
+ args->agno = XFS_FSB_TO_AGNO(mp, target);
+ args->agbno = XFS_FSB_TO_AGBNO(mp, target);
+ error = xfs_alloc_vextent_check_args(args, target, &minimum_agno);
+ if (error) {
+ if (error == -ENOSPC)
+ return 0;
+ return error;
+ }
+
+ error = xfs_alloc_vextent_prepare_ag(args);
+ if (!error && args->agbp)
+ error = xfs_alloc_ag_vextent_exact(args);
+
+ return xfs_alloc_vextent_finish(args, minimum_agno, error, false);
+}
+
+/*
+ * Allocate an extent as close to the target as possible. If there are not
+ * viable candidates in the AG, then fail the allocation.
+ *
+ * Caller may or may not have a per-ag reference in args->pag.
+ */
+int
+xfs_alloc_vextent_near_bno(
+ struct xfs_alloc_arg *args,
+ xfs_fsblock_t target)
+{
+ struct xfs_mount *mp = args->mp;
+ xfs_agnumber_t minimum_agno;
+ bool needs_perag = args->pag == NULL;
+ int error;
+
+ args->agno = XFS_FSB_TO_AGNO(mp, target);
+ args->agbno = XFS_FSB_TO_AGBNO(mp, target);
+ error = xfs_alloc_vextent_check_args(args, target, &minimum_agno);
+ if (error) {
+ if (error == -ENOSPC)
+ return 0;
+ return error;
+ }
+
+ if (needs_perag)
+ args->pag = xfs_perag_grab(mp, args->agno);
+
+ error = xfs_alloc_vextent_prepare_ag(args);
+ if (!error && args->agbp)
+ error = xfs_alloc_ag_vextent_near(args);
+
+ return xfs_alloc_vextent_finish(args, minimum_agno, error, needs_perag);
}
/* Ensure that the freelist is at full capacity. */
diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h
index 2c3f762dfb58..2b246d74c189 100644
--- a/fs/xfs/libxfs/xfs_alloc.h
+++ b/fs/xfs/libxfs/xfs_alloc.h
@@ -17,25 +17,6 @@ extern struct workqueue_struct *xfs_alloc_wq;
unsigned int xfs_agfl_size(struct xfs_mount *mp);
/*
- * Freespace allocation types. Argument to xfs_alloc_[v]extent.
- */
-#define XFS_ALLOCTYPE_FIRST_AG 0x02 /* ... start at ag 0 */
-#define XFS_ALLOCTYPE_THIS_AG 0x08 /* anywhere in this a.g. */
-#define XFS_ALLOCTYPE_START_BNO 0x10 /* near this block else anywhere */
-#define XFS_ALLOCTYPE_NEAR_BNO 0x20 /* in this a.g. and near this block */
-#define XFS_ALLOCTYPE_THIS_BNO 0x40 /* at exactly this block */
-
-/* this should become an enum again when the tracing code is fixed */
-typedef unsigned int xfs_alloctype_t;
-
-#define XFS_ALLOC_TYPES \
- { XFS_ALLOCTYPE_FIRST_AG, "FIRST_AG" }, \
- { XFS_ALLOCTYPE_THIS_AG, "THIS_AG" }, \
- { XFS_ALLOCTYPE_START_BNO, "START_BNO" }, \
- { XFS_ALLOCTYPE_NEAR_BNO, "NEAR_BNO" }, \
- { XFS_ALLOCTYPE_THIS_BNO, "THIS_BNO" }
-
-/*
* Flags for xfs_alloc_fix_freelist.
*/
#define XFS_ALLOC_FLAG_TRYLOCK 0x00000001 /* use trylock for buffer locking */
@@ -68,8 +49,6 @@ typedef struct xfs_alloc_arg {
xfs_agblock_t min_agbno; /* set an agbno range for NEAR allocs */
xfs_agblock_t max_agbno; /* ... */
xfs_extlen_t len; /* output: actual size of extent */
- xfs_alloctype_t type; /* allocation type XFS_ALLOCTYPE_... */
- xfs_alloctype_t otype; /* original allocation type */
int datatype; /* mask defining data type treatment */
char wasdel; /* set if allocation was prev delayed */
char wasfromfl; /* set if allocation is from freelist */
@@ -118,11 +97,43 @@ xfs_alloc_log_agf(
uint32_t fields);/* mask of fields to be logged (XFS_AGF_...) */
/*
- * Allocate an extent (variable-size).
+ * Allocate an extent anywhere in the specific AG given. If there is no
+ * space matching the requirements in that AG, then the allocation will fail.
*/
-int /* error */
-xfs_alloc_vextent(
- xfs_alloc_arg_t *args); /* allocation argument structure */
+int xfs_alloc_vextent_this_ag(struct xfs_alloc_arg *args, xfs_agnumber_t agno);
+
+/*
+ * Allocate an extent as close to the target as possible. If there are not
+ * viable candidates in the AG, then fail the allocation.
+ */
+int xfs_alloc_vextent_near_bno(struct xfs_alloc_arg *args,
+ xfs_fsblock_t target);
+
+/*
+ * Allocate an extent exactly at the target given. If this is not possible
+ * then the allocation fails.
+ */
+int xfs_alloc_vextent_exact_bno(struct xfs_alloc_arg *args,
+ xfs_fsblock_t target);
+
+/*
+ * Best effort full filesystem allocation scan.
+ *
+ * Locality aware allocation will be attempted in the initial AG, but on failure
+ * non-localised attempts will be made. The AGs are constrained by previous
+ * allocations in the current transaction. Two passes will be made - the first
+ * non-blocking, the second blocking.
+ */
+int xfs_alloc_vextent_start_ag(struct xfs_alloc_arg *args,
+ xfs_fsblock_t target);
+
+/*
+ * Iterate from the AG indicated by @target through to the end of the
+ * filesystem attempting blocking allocation. This is for use in last
+ * resort allocation attempts when everything else has failed.
+ */
+int xfs_alloc_vextent_first_ag(struct xfs_alloc_arg *args,
+ xfs_fsblock_t target);
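A hedged usage sketch of the new interface (not taken from the patch; lengths and AG coordinates are illustrative): callers fill the common fields of struct xfs_alloc_arg once and select policy via the entry point, where the old code selected it via args.type:

	struct xfs_alloc_arg args = {
		.tp     = tp,
		.mp     = mp,
		.oinfo  = XFS_RMAP_OINFO_SKIP_UPDATE,
		.minlen = 1,
		.maxlen = len,			/* illustrative */
		.prod   = 1,
		.resv   = XFS_AG_RESV_NONE,
	};
	int error;

	/* locality-based policy chosen by the entry point: */
	error = xfs_alloc_vextent_near_bno(&args,
			XFS_AGB_TO_FSB(mp, agno, agbno));
	if (!error && args.fsbno == NULLFSBLOCK)
		error = -ENOSPC;		/* nothing near the target */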
/*
* Free an extent.
diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c
index 549a3cba0234..0f29c7b1b39f 100644
--- a/fs/xfs/libxfs/xfs_alloc_btree.c
+++ b/fs/xfs/libxfs/xfs_alloc_btree.c
@@ -315,7 +315,7 @@ xfs_allocbt_verify(
level = be16_to_cpu(block->bb_level);
if (bp->b_ops->magic[0] == cpu_to_be32(XFS_ABTC_MAGIC))
btnum = XFS_BTNUM_CNTi;
- if (pag && pag->pagf_init) {
+ if (pag && xfs_perag_initialised_agf(pag)) {
if (level >= pag->pagf_levels[btnum])
return __this_address;
} else if (level >= mp->m_alloc_maxlevels)
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index c8c65387136c..34de6e6898c4 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -645,34 +645,23 @@ xfs_bmap_extents_to_btree(
args.tp = tp;
args.mp = mp;
xfs_rmap_ino_bmbt_owner(&args.oinfo, ip->i_ino, whichfork);
- if (tp->t_firstblock == NULLFSBLOCK) {
- args.type = XFS_ALLOCTYPE_START_BNO;
- args.fsbno = XFS_INO_TO_FSB(mp, ip->i_ino);
- } else if (tp->t_flags & XFS_TRANS_LOWMODE) {
- args.type = XFS_ALLOCTYPE_START_BNO;
- args.fsbno = tp->t_firstblock;
- } else {
- args.type = XFS_ALLOCTYPE_NEAR_BNO;
- args.fsbno = tp->t_firstblock;
- }
+
args.minlen = args.maxlen = args.prod = 1;
args.wasdel = wasdel;
*logflagsp = 0;
- error = xfs_alloc_vextent(&args);
+ error = xfs_alloc_vextent_start_ag(&args,
+ XFS_INO_TO_FSB(mp, ip->i_ino));
if (error)
goto out_root_realloc;
+ /*
+ * Allocation can't fail, the space was reserved.
+ */
if (WARN_ON_ONCE(args.fsbno == NULLFSBLOCK)) {
error = -ENOSPC;
goto out_root_realloc;
}
- /*
- * Allocation can't fail, the space was reserved.
- */
- ASSERT(tp->t_firstblock == NULLFSBLOCK ||
- args.agno >= XFS_FSB_TO_AGNO(mp, tp->t_firstblock));
- tp->t_firstblock = args.fsbno;
cur->bc_ino.allocated++;
ip->i_nblocks++;
xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, 1L);
@@ -799,28 +788,24 @@ xfs_bmap_local_to_extents(
memset(&args, 0, sizeof(args));
args.tp = tp;
args.mp = ip->i_mount;
+ args.total = total;
+ args.minlen = args.maxlen = args.prod = 1;
xfs_rmap_ino_owner(&args.oinfo, ip->i_ino, whichfork, 0);
+
/*
* Allocate a block. We know we need only one, since the
* file currently fits in an inode.
*/
- if (tp->t_firstblock == NULLFSBLOCK) {
- args.fsbno = XFS_INO_TO_FSB(args.mp, ip->i_ino);
- args.type = XFS_ALLOCTYPE_START_BNO;
- } else {
- args.fsbno = tp->t_firstblock;
- args.type = XFS_ALLOCTYPE_NEAR_BNO;
- }
args.total = total;
args.minlen = args.maxlen = args.prod = 1;
- error = xfs_alloc_vextent(&args);
+ error = xfs_alloc_vextent_start_ag(&args,
+ XFS_INO_TO_FSB(args.mp, ip->i_ino));
if (error)
goto done;
/* Can't fail, the space was reserved. */
ASSERT(args.fsbno != NULLFSBLOCK);
ASSERT(args.len == 1);
- tp->t_firstblock = args.fsbno;
error = xfs_trans_get_buf(tp, args.mp->m_ddev_targp,
XFS_FSB_TO_DADDR(args.mp, args.fsbno),
args.mp->m_bsize, 0, &bp);
@@ -854,8 +839,7 @@ xfs_bmap_local_to_extents(
ifp->if_nextents = 1;
ip->i_nblocks = 1;
- xfs_trans_mod_dquot_byino(tp, ip,
- XFS_TRANS_DQ_BCOUNT, 1L);
+ xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, 1L);
flags |= xfs_ilog_fext(whichfork);
done:
@@ -3025,9 +3009,7 @@ xfs_bmap_adjacent(
struct xfs_bmalloca *ap) /* bmap alloc argument struct */
{
xfs_fsblock_t adjust; /* adjustment to block numbers */
- xfs_agnumber_t fb_agno; /* ag number of ap->firstblock */
xfs_mount_t *mp; /* mount point structure */
- int nullfb; /* true if ap->firstblock isn't set */
int rt; /* true if inode is realtime */
#define ISVALID(x,y) \
@@ -3038,11 +3020,8 @@ xfs_bmap_adjacent(
XFS_FSB_TO_AGBNO(mp, x) < mp->m_sb.sb_agblocks)
mp = ap->ip->i_mount;
- nullfb = ap->tp->t_firstblock == NULLFSBLOCK;
rt = XFS_IS_REALTIME_INODE(ap->ip) &&
(ap->datatype & XFS_ALLOC_USERDATA);
- fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp,
- ap->tp->t_firstblock);
/*
* If allocating at eof, and there's a previous real block,
* try to use its last block as our starting point.
@@ -3101,13 +3080,6 @@ xfs_bmap_adjacent(
prevbno += adjust;
else
prevdiff += adjust;
- /*
- * If the firstblock forbids it, can't use it,
- * must use default.
- */
- if (!rt && !nullfb &&
- XFS_FSB_TO_AGNO(mp, prevbno) != fb_agno)
- prevbno = NULLFSBLOCK;
}
/*
* No previous block or can't follow it, just default.
@@ -3143,13 +3115,6 @@ xfs_bmap_adjacent(
gotdiff += adjust - ap->length;
} else
gotdiff += adjust;
- /*
- * If the firstblock forbids it, can't use it,
- * must use default.
- */
- if (!rt && !nullfb &&
- XFS_FSB_TO_AGNO(mp, gotbno) != fb_agno)
- gotbno = NULLFSBLOCK;
}
/*
* No next block, just default.
@@ -3170,147 +3135,91 @@ xfs_bmap_adjacent(
#undef ISVALID
}
-static int
+int
xfs_bmap_longest_free_extent(
+ struct xfs_perag *pag,
struct xfs_trans *tp,
- xfs_agnumber_t ag,
- xfs_extlen_t *blen,
- int *notinit)
+ xfs_extlen_t *blen)
{
- struct xfs_mount *mp = tp->t_mountp;
- struct xfs_perag *pag;
xfs_extlen_t longest;
int error = 0;
- pag = xfs_perag_get(mp, ag);
- if (!pag->pagf_init) {
+ if (!xfs_perag_initialised_agf(pag)) {
error = xfs_alloc_read_agf(pag, tp, XFS_ALLOC_FLAG_TRYLOCK,
NULL);
- if (error) {
- /* Couldn't lock the AGF, so skip this AG. */
- if (error == -EAGAIN) {
- *notinit = 1;
- error = 0;
- }
- goto out;
- }
+ if (error)
+ return error;
}
longest = xfs_alloc_longest_free_extent(pag,
- xfs_alloc_min_freelist(mp, pag),
+ xfs_alloc_min_freelist(pag->pag_mount, pag),
xfs_ag_resv_needed(pag, XFS_AG_RESV_NONE));
if (*blen < longest)
*blen = longest;
-out:
- xfs_perag_put(pag);
- return error;
+ return 0;
}
-static void
+static xfs_extlen_t
xfs_bmap_select_minlen(
struct xfs_bmalloca *ap,
struct xfs_alloc_arg *args,
- xfs_extlen_t *blen,
- int notinit)
+ xfs_extlen_t blen)
{
- if (notinit || *blen < ap->minlen) {
- /*
- * Since we did a BUF_TRYLOCK above, it is possible that
- * there is space for this request.
- */
- args->minlen = ap->minlen;
- } else if (*blen < args->maxlen) {
- /*
- * If the best seen length is less than the request length,
- * use the best as the minimum.
- */
- args->minlen = *blen;
- } else {
- /*
- * Otherwise we've seen an extent as big as maxlen, use that
- * as the minimum.
- */
- args->minlen = args->maxlen;
- }
-}
-
-STATIC int
-xfs_bmap_btalloc_nullfb(
- struct xfs_bmalloca *ap,
- struct xfs_alloc_arg *args,
- xfs_extlen_t *blen)
-{
- struct xfs_mount *mp = ap->ip->i_mount;
- xfs_agnumber_t ag, startag;
- int notinit = 0;
- int error;
- args->type = XFS_ALLOCTYPE_START_BNO;
- args->total = ap->total;
-
- startag = ag = XFS_FSB_TO_AGNO(mp, args->fsbno);
- if (startag == NULLAGNUMBER)
- startag = ag = 0;
-
- while (*blen < args->maxlen) {
- error = xfs_bmap_longest_free_extent(args->tp, ag, blen,
- &notinit);
- if (error)
- return error;
-
- if (++ag == mp->m_sb.sb_agcount)
- ag = 0;
- if (ag == startag)
- break;
- }
+ /*
+ * Since we used XFS_ALLOC_FLAG_TRYLOCK in _longest_free_extent(), it is
+ * possible that there is enough contiguous free space for this request.
+ */
+ if (blen < ap->minlen)
+ return ap->minlen;
- xfs_bmap_select_minlen(ap, args, blen, notinit);
- return 0;
+ /*
+ * If the best seen length is less than the request length,
+ * use the best as the minimum, otherwise we've got the maxlen we
+ * were asked for.
+ */
+ if (blen < args->maxlen)
+ return blen;
+ return args->maxlen;
}
-STATIC int
-xfs_bmap_btalloc_filestreams(
+static int
+xfs_bmap_btalloc_select_lengths(
struct xfs_bmalloca *ap,
struct xfs_alloc_arg *args,
xfs_extlen_t *blen)
{
- struct xfs_mount *mp = ap->ip->i_mount;
- xfs_agnumber_t ag;
- int notinit = 0;
- int error;
-
- args->type = XFS_ALLOCTYPE_NEAR_BNO;
- args->total = ap->total;
-
- ag = XFS_FSB_TO_AGNO(mp, args->fsbno);
- if (ag == NULLAGNUMBER)
- ag = 0;
-
- error = xfs_bmap_longest_free_extent(args->tp, ag, blen, &notinit);
- if (error)
- return error;
+ struct xfs_mount *mp = args->mp;
+ struct xfs_perag *pag;
+ xfs_agnumber_t agno, startag;
+ int error = 0;
- if (*blen < args->maxlen) {
- error = xfs_filestream_new_ag(ap, &ag);
- if (error)
- return error;
+ if (ap->tp->t_flags & XFS_TRANS_LOWMODE) {
+ args->total = ap->minlen;
+ args->minlen = ap->minlen;
+ return 0;
+ }
- error = xfs_bmap_longest_free_extent(args->tp, ag, blen,
- &notinit);
- if (error)
- return error;
+ args->total = ap->total;
+ startag = XFS_FSB_TO_AGNO(mp, ap->blkno);
+ if (startag == NULLAGNUMBER)
+ startag = 0;
+ *blen = 0;
+ for_each_perag_wrap(mp, startag, agno, pag) {
+ error = xfs_bmap_longest_free_extent(pag, args->tp, blen);
+ if (error && error != -EAGAIN)
+ break;
+ error = 0;
+ if (*blen >= args->maxlen)
+ break;
}
+ if (pag)
+ xfs_perag_rele(pag);
- xfs_bmap_select_minlen(ap, args, blen, notinit);
-
- /*
- * Set the failure fallback case to look in the selected AG as stream
- * may have moved.
- */
- ap->blkno = args->fsbno = XFS_AGB_TO_FSB(mp, ag, 0);
- return 0;
+ args->minlen = xfs_bmap_select_minlen(ap, args, *blen);
+ return error;
}
/* Update all inode and quota accounting for the allocation we just did. */
@@ -3413,21 +3322,7 @@ xfs_bmap_process_allocated_extent(
xfs_fileoff_t orig_offset,
xfs_extlen_t orig_length)
{
- int nullfb;
-
- nullfb = ap->tp->t_firstblock == NULLFSBLOCK;
-
- /*
- * check the allocation happened at the same or higher AG than
- * the first block that was allocated.
- */
- ASSERT(nullfb ||
- XFS_FSB_TO_AGNO(args->mp, ap->tp->t_firstblock) <=
- XFS_FSB_TO_AGNO(args->mp, args->fsbno));
-
ap->blkno = args->fsbno;
- if (nullfb)
- ap->tp->t_firstblock = args->fsbno;
ap->length = args->len;
/*
* If the extent size hint is active, we tried to round the
@@ -3474,23 +3369,17 @@ xfs_bmap_exact_minlen_extent_alloc(
xfs_bmap_compute_alignments(ap, &args);
- if (ap->tp->t_firstblock == NULLFSBLOCK) {
- /*
- * Unlike the longest extent available in an AG, we don't track
- * the length of an AG's shortest extent.
- * XFS_ERRTAG_BMAP_ALLOC_MINLEN_EXTENT is a debug only knob and
- * hence we can afford to start traversing from the 0th AG since
- * we need not be concerned about a drop in performance in
- * "debug only" code paths.
- */
- ap->blkno = XFS_AGB_TO_FSB(mp, 0, 0);
- } else {
- ap->blkno = ap->tp->t_firstblock;
- }
+ /*
+ * Unlike the longest extent available in an AG, we don't track
+ * the length of an AG's shortest extent.
+ * XFS_ERRTAG_BMAP_ALLOC_MINLEN_EXTENT is a debug only knob and
+ * hence we can afford to start traversing from the 0th AG since
+ * we need not be concerned about a drop in performance in
+ * "debug only" code paths.
+ */
+ ap->blkno = XFS_AGB_TO_FSB(mp, 0, 0);
- args.fsbno = ap->blkno;
args.oinfo = XFS_RMAP_OINFO_SKIP_UPDATE;
- args.type = XFS_ALLOCTYPE_FIRST_AG;
args.minlen = args.maxlen = ap->minlen;
args.total = ap->total;
@@ -3502,7 +3391,7 @@ xfs_bmap_exact_minlen_extent_alloc(
args.resv = XFS_AG_RESV_NONE;
args.datatype = ap->datatype;
- error = xfs_alloc_vextent(&args);
+ error = xfs_alloc_vextent_first_ag(&args, ap->blkno);
if (error)
return error;
@@ -3522,193 +3411,270 @@ xfs_bmap_exact_minlen_extent_alloc(
#endif
-STATIC int
-xfs_bmap_btalloc(
- struct xfs_bmalloca *ap)
+/*
+ * If we are not low on available data blocks and we are allocating at
+ * EOF, optimise allocation for contiguous file extension and/or stripe
+ * alignment of the new extent.
+ *
+ * NOTE: ap->aeof is only set if the allocation length is >= the
+ * stripe unit and the allocation offset is at the end of file.
+ */
+static int
+xfs_bmap_btalloc_at_eof(
+ struct xfs_bmalloca *ap,
+ struct xfs_alloc_arg *args,
+ xfs_extlen_t blen,
+ int stripe_align,
+ bool ag_only)
{
- struct xfs_mount *mp = ap->ip->i_mount;
- struct xfs_alloc_arg args = { .tp = ap->tp, .mp = mp };
- xfs_alloctype_t atype = 0;
- xfs_agnumber_t fb_agno; /* ag number of ap->firstblock */
- xfs_agnumber_t ag;
- xfs_fileoff_t orig_offset;
- xfs_extlen_t orig_length;
- xfs_extlen_t blen;
- xfs_extlen_t nextminlen = 0;
- int nullfb; /* true if ap->firstblock isn't set */
- int isaligned;
- int tryagain;
+ struct xfs_mount *mp = args->mp;
+ struct xfs_perag *caller_pag = args->pag;
int error;
- int stripe_align;
-
- ASSERT(ap->length);
- orig_offset = ap->offset;
- orig_length = ap->length;
-
- stripe_align = xfs_bmap_compute_alignments(ap, &args);
-
- nullfb = ap->tp->t_firstblock == NULLFSBLOCK;
- fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp,
- ap->tp->t_firstblock);
- if (nullfb) {
- if ((ap->datatype & XFS_ALLOC_USERDATA) &&
- xfs_inode_is_filestream(ap->ip)) {
- ag = xfs_filestream_lookup_ag(ap->ip);
- ag = (ag != NULLAGNUMBER) ? ag : 0;
- ap->blkno = XFS_AGB_TO_FSB(mp, ag, 0);
- } else {
- ap->blkno = XFS_INO_TO_FSB(mp, ap->ip->i_ino);
- }
- } else
- ap->blkno = ap->tp->t_firstblock;
- xfs_bmap_adjacent(ap);
-
- /*
- * If allowed, use ap->blkno; otherwise must use firstblock since
- * it's in the right allocation group.
- */
- if (nullfb || XFS_FSB_TO_AGNO(mp, ap->blkno) == fb_agno)
- ;
- else
- ap->blkno = ap->tp->t_firstblock;
/*
- * Normal allocation, done through xfs_alloc_vextent.
+ * If there are already extents in the file, try an exact EOF block
+ * allocation to extend the file as a contiguous extent. If that fails,
+ * or it's the first allocation in a file, just try for a stripe aligned
+ * allocation.
*/
- tryagain = isaligned = 0;
- args.fsbno = ap->blkno;
- args.oinfo = XFS_RMAP_OINFO_SKIP_UPDATE;
+ if (ap->offset) {
+ xfs_extlen_t nextminlen = 0;
- /* Trim the allocation back to the maximum an AG can fit. */
- args.maxlen = min(ap->length, mp->m_ag_max_usable);
- blen = 0;
- if (nullfb) {
/*
- * Search for an allocation group with a single extent large
- * enough for the request. If one isn't found, then adjust
- * the minimum allocation size to the largest space found.
+ * Compute the minlen+alignment for the next case. Set slop so
+ * that the value of minlen+alignment+slop doesn't go up between
+ * the calls.
*/
- if ((ap->datatype & XFS_ALLOC_USERDATA) &&
- xfs_inode_is_filestream(ap->ip))
- error = xfs_bmap_btalloc_filestreams(ap, &args, &blen);
+ args->alignment = 1;
+ if (blen > stripe_align && blen <= args->maxlen)
+ nextminlen = blen - stripe_align;
+ else
+ nextminlen = args->minlen;
+ if (nextminlen + stripe_align > args->minlen + 1)
+ args->minalignslop = nextminlen + stripe_align -
+ args->minlen - 1;
else
- error = xfs_bmap_btalloc_nullfb(ap, &args, &blen);
+ args->minalignslop = 0;
+
+ if (!caller_pag)
+ args->pag = xfs_perag_get(mp, XFS_FSB_TO_AGNO(mp, ap->blkno));
+ error = xfs_alloc_vextent_exact_bno(args, ap->blkno);
+ if (!caller_pag)
+ xfs_perag_put(args->pag);
if (error)
return error;
- } else if (ap->tp->t_flags & XFS_TRANS_LOWMODE) {
- if (xfs_inode_is_filestream(ap->ip))
- args.type = XFS_ALLOCTYPE_FIRST_AG;
- else
- args.type = XFS_ALLOCTYPE_START_BNO;
- args.total = args.minlen = ap->minlen;
+
+ if (args->fsbno != NULLFSBLOCK)
+ return 0;
+ /*
+ * Exact allocation failed. Reset to try an aligned allocation
+ * according to the original allocation specification.
+ */
+ args->pag = NULL;
+ args->alignment = stripe_align;
+ args->minlen = nextminlen;
+ args->minalignslop = 0;
} else {
- args.type = XFS_ALLOCTYPE_NEAR_BNO;
- args.total = ap->total;
- args.minlen = ap->minlen;
+ /*
+ * Adjust minlen to try and preserve alignment if we
+ * can't guarantee an aligned maxlen extent.
+ */
+ args->alignment = stripe_align;
+ if (blen > args->alignment &&
+ blen <= args->maxlen + args->alignment)
+ args->minlen = blen - args->alignment;
+ args->minalignslop = 0;
}
- /*
- * If we are not low on available data blocks, and the underlying
- * logical volume manager is a stripe, and the file offset is zero then
- * try to allocate data blocks on stripe unit boundary. NOTE: ap->aeof
- * is only set if the allocation length is >= the stripe unit and the
- * allocation offset is at the end of file.
- */
- if (!(ap->tp->t_flags & XFS_TRANS_LOWMODE) && ap->aeof) {
- if (!ap->offset) {
- args.alignment = stripe_align;
- atype = args.type;
- isaligned = 1;
- /*
- * Adjust minlen to try and preserve alignment if we
- * can't guarantee an aligned maxlen extent.
- */
- if (blen > args.alignment &&
- blen <= args.maxlen + args.alignment)
- args.minlen = blen - args.alignment;
- args.minalignslop = 0;
- } else {
- /*
- * First try an exact bno allocation.
- * If it fails then do a near or start bno
- * allocation with alignment turned on.
- */
- atype = args.type;
- tryagain = 1;
- args.type = XFS_ALLOCTYPE_THIS_BNO;
- args.alignment = 1;
- /*
- * Compute the minlen+alignment for the
- * next case. Set slop so that the value
- * of minlen+alignment+slop doesn't go up
- * between the calls.
- */
- if (blen > stripe_align && blen <= args.maxlen)
- nextminlen = blen - stripe_align;
- else
- nextminlen = args.minlen;
- if (nextminlen + stripe_align > args.minlen + 1)
- args.minalignslop =
- nextminlen + stripe_align -
- args.minlen - 1;
- else
- args.minalignslop = 0;
- }
+ if (ag_only) {
+ error = xfs_alloc_vextent_near_bno(args, ap->blkno);
} else {
- args.alignment = 1;
- args.minalignslop = 0;
+ args->pag = NULL;
+ error = xfs_alloc_vextent_start_ag(args, ap->blkno);
+ ASSERT(args->pag == NULL);
+ args->pag = caller_pag;
}
- args.minleft = ap->minleft;
- args.wasdel = ap->wasdel;
- args.resv = XFS_AG_RESV_NONE;
- args.datatype = ap->datatype;
-
- error = xfs_alloc_vextent(&args);
if (error)
return error;
- if (tryagain && args.fsbno == NULLFSBLOCK) {
- /*
- * Exact allocation failed. Now try with alignment
- * turned on.
- */
- args.type = atype;
- args.fsbno = ap->blkno;
- args.alignment = stripe_align;
- args.minlen = nextminlen;
- args.minalignslop = 0;
- isaligned = 1;
- if ((error = xfs_alloc_vextent(&args)))
- return error;
- }
- if (isaligned && args.fsbno == NULLFSBLOCK) {
- /*
- * allocation failed, so turn off alignment and
- * try again.
- */
- args.type = atype;
- args.fsbno = ap->blkno;
- args.alignment = 0;
- if ((error = xfs_alloc_vextent(&args)))
+ if (args->fsbno != NULLFSBLOCK)
+ return 0;
+
+ /*
+ * Allocation failed, so return the allocation args to their
+ * original non-aligned state so the caller can proceed on allocation
+ * failure as if this function was never called.
+ */
+ args->fsbno = ap->blkno;
+ args->alignment = 1;
+ return 0;
+}
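Working the minlen/minalignslop computation in the EOF path above through with assumed numbers (stripe_align = 8, blen = 20, args->maxlen = 32, original args->minlen = 4):

/*
 * nextminlen   = blen - stripe_align = 20 - 8 = 12
 * minalignslop = nextminlen + stripe_align - args->minlen - 1
 *              = 12 + 8 - 4 - 1 = 15
 *
 * First (exact) attempt:    minlen + alignment + slop = 4 + 1 + 15 = 20
 * Second (aligned) attempt: minlen + alignment + slop = 12 + 8 + 0 = 20
 *
 * so minlen+alignment+slop does not go up between the two calls, which
 * is exactly the invariant the comment in the code asks for.
 */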
+
+/*
+ * We have failed multiple allocation attempts so now are in a low space
+ * allocation situation. Try a locality first full filesystem minimum length
+ * allocation whilst still maintaining necessary total block reservation
+ * requirements.
+ *
+ * If that fails, we are now critically low on space, so perform a last resort
+ * allocation attempt: no reserve, no locality, blocking, minimum length, full
+ * filesystem free space scan. We also indicate to future allocations in this
+ * transaction that we are critically low on space so they don't waste time on
+ * allocation modes that are unlikely to succeed.
+ */
+int
+xfs_bmap_btalloc_low_space(
+ struct xfs_bmalloca *ap,
+ struct xfs_alloc_arg *args)
+{
+ int error;
+
+ if (args->minlen > ap->minlen) {
+ args->minlen = ap->minlen;
+ error = xfs_alloc_vextent_start_ag(args, ap->blkno);
+ if (error || args->fsbno != NULLFSBLOCK)
return error;
}
- if (args.fsbno == NULLFSBLOCK && nullfb &&
- args.minlen > ap->minlen) {
- args.minlen = ap->minlen;
- args.type = XFS_ALLOCTYPE_START_BNO;
- args.fsbno = ap->blkno;
- if ((error = xfs_alloc_vextent(&args)))
- return error;
+
+ /* Last ditch attempt before failure is declared. */
+ args->total = ap->minlen;
+ error = xfs_alloc_vextent_first_ag(args, 0);
+ if (error)
+ return error;
+ ap->tp->t_flags |= XFS_TRANS_LOWMODE;
+ return 0;
+}
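Summarising the low-space ladder above as a sketch (the step descriptions are editorial, not kernel identifiers):

/*
 * 1. Retry with minlen dropped to ap->minlen: locality-first scan of
 *    the whole filesystem via xfs_alloc_vextent_start_ag().
 * 2. Last resort: total dropped to ap->minlen, blocking scan from AG 0
 *    via xfs_alloc_vextent_first_ag().
 * 3. Once step 2 has run without a hard error, set XFS_TRANS_LOWMODE so
 *    later allocations in this transaction skip the optimistic
 *    strategies entirely.
 */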
+
+static int
+xfs_bmap_btalloc_filestreams(
+ struct xfs_bmalloca *ap,
+ struct xfs_alloc_arg *args,
+ int stripe_align)
+{
+ xfs_extlen_t blen = 0;
+ int error = 0;
+
+ error = xfs_filestream_select_ag(ap, args, &blen);
+ if (error)
+ return error;
+ ASSERT(args->pag);
+
+ /*
+ * If we are in low space mode, then optimal allocation will fail so
+ * prepare for minimal allocation and jump to the low space algorithm
+ * immediately.
+ */
+ if (ap->tp->t_flags & XFS_TRANS_LOWMODE) {
+ args->minlen = ap->minlen;
+ ASSERT(args->fsbno == NULLFSBLOCK);
+ goto out_low_space;
}
- if (args.fsbno == NULLFSBLOCK && nullfb) {
- args.fsbno = 0;
- args.type = XFS_ALLOCTYPE_FIRST_AG;
- args.total = ap->minlen;
- if ((error = xfs_alloc_vextent(&args)))
+
+ args->minlen = xfs_bmap_select_minlen(ap, args, blen);
+ if (ap->aeof)
+ error = xfs_bmap_btalloc_at_eof(ap, args, blen, stripe_align,
+ true);
+
+ if (!error && args->fsbno == NULLFSBLOCK)
+ error = xfs_alloc_vextent_near_bno(args, ap->blkno);
+
+out_low_space:
+ /*
+ * We are now done with the perag reference for the filestreams
+ * association provided by xfs_filestream_select_ag(). Release it now as
+ * we've either succeeded, had a fatal error or we are out of space and
+ * need to do a full filesystem scan for free space, which will take its
+ * own references.
+ */
+ xfs_perag_rele(args->pag);
+ args->pag = NULL;
+ if (error || args->fsbno != NULLFSBLOCK)
+ return error;
+
+ return xfs_bmap_btalloc_low_space(ap, args);
+}
+
+static int
+xfs_bmap_btalloc_best_length(
+ struct xfs_bmalloca *ap,
+ struct xfs_alloc_arg *args,
+ int stripe_align)
+{
+ xfs_extlen_t blen = 0;
+ int error;
+
+ ap->blkno = XFS_INO_TO_FSB(args->mp, ap->ip->i_ino);
+ xfs_bmap_adjacent(ap);
+
+ /*
+ * Search for an allocation group with a single extent large enough for
+ * the request. If one isn't found, then adjust the minimum allocation
+ * size to the largest space found.
+ */
+ error = xfs_bmap_btalloc_select_lengths(ap, args, &blen);
+ if (error)
+ return error;
+
+ /*
+ * Don't attempt optimal EOF allocation if previous allocations barely
+ * succeeded due to being near ENOSPC. It is highly unlikely we'll get
+ * optimal or even aligned allocations in this case, so don't waste time
+ * trying.
+ */
+ if (ap->aeof && !(ap->tp->t_flags & XFS_TRANS_LOWMODE)) {
+ error = xfs_bmap_btalloc_at_eof(ap, args, blen, stripe_align,
+ false);
+ if (error || args->fsbno != NULLFSBLOCK)
return error;
- ap->tp->t_flags |= XFS_TRANS_LOWMODE;
}
+ error = xfs_alloc_vextent_start_ag(args, ap->blkno);
+ if (error || args->fsbno != NULLFSBLOCK)
+ return error;
+
+ return xfs_bmap_btalloc_low_space(ap, args);
+}
+
+static int
+xfs_bmap_btalloc(
+ struct xfs_bmalloca *ap)
+{
+ struct xfs_mount *mp = ap->ip->i_mount;
+ struct xfs_alloc_arg args = {
+ .tp = ap->tp,
+ .mp = mp,
+ .fsbno = NULLFSBLOCK,
+ .oinfo = XFS_RMAP_OINFO_SKIP_UPDATE,
+ .minleft = ap->minleft,
+ .wasdel = ap->wasdel,
+ .resv = XFS_AG_RESV_NONE,
+ .datatype = ap->datatype,
+ .alignment = 1,
+ .minalignslop = 0,
+ };
+ xfs_fileoff_t orig_offset;
+ xfs_extlen_t orig_length;
+ int error;
+ int stripe_align;
+
+ ASSERT(ap->length);
+ orig_offset = ap->offset;
+ orig_length = ap->length;
+
+ stripe_align = xfs_bmap_compute_alignments(ap, &args);
+
+ /* Trim the allocation back to the maximum an AG can fit. */
+ args.maxlen = min(ap->length, mp->m_ag_max_usable);
+
+ if ((ap->datatype & XFS_ALLOC_USERDATA) &&
+ xfs_inode_is_filestream(ap->ip))
+ error = xfs_bmap_btalloc_filestreams(ap, &args, stripe_align);
+ else
+ error = xfs_bmap_btalloc_best_length(ap, &args, stripe_align);
+ if (error)
+ return error;
+
if (args.fsbno != NULLFSBLOCK) {
xfs_bmap_process_allocated_extent(ap, &args, orig_offset,
orig_length);
@@ -4256,7 +4222,7 @@ xfs_bmapi_convert_unwritten(
return 0;
}
-static inline xfs_extlen_t
+xfs_extlen_t
xfs_bmapi_minleft(
struct xfs_trans *tp,
struct xfs_inode *ip,
@@ -4264,7 +4230,7 @@ xfs_bmapi_minleft(
{
struct xfs_ifork *ifp = xfs_ifork_ptr(ip, fork);
- if (tp && tp->t_firstblock != NULLFSBLOCK)
+ if (tp && tp->t_highest_agno != NULLAGNUMBER)
return 0;
if (ifp->if_format != XFS_DINODE_FMT_BTREE)
return 1;
@@ -6151,7 +6117,7 @@ xfs_bmap_finish_one(
struct xfs_bmbt_irec *bmap = &bi->bi_bmap;
int error = 0;
- ASSERT(tp->t_firstblock == NULLFSBLOCK);
+ ASSERT(tp->t_highest_agno == NULLAGNUMBER);
trace_xfs_bmap_deferred(tp->t_mountp,
XFS_FSB_TO_AGNO(tp->t_mountp, bmap->br_startblock),
diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
index 01c2df35c3e3..dd08361ca5a6 100644
--- a/fs/xfs/libxfs/xfs_bmap.h
+++ b/fs/xfs/libxfs/xfs_bmap.h
@@ -12,6 +12,7 @@ struct xfs_ifork;
struct xfs_inode;
struct xfs_mount;
struct xfs_trans;
+struct xfs_alloc_arg;
/*
* Argument structure for xfs_bmap_alloc.
@@ -168,6 +169,8 @@ static inline bool xfs_bmap_is_written_extent(struct xfs_bmbt_irec *irec)
#define xfs_valid_startblock(ip, startblock) \
((startblock) != 0 || XFS_IS_REALTIME_INODE(ip))
+int xfs_bmap_longest_free_extent(struct xfs_perag *pag,
+ struct xfs_trans *tp, xfs_extlen_t *blen);
void xfs_trim_extent(struct xfs_bmbt_irec *irec, xfs_fileoff_t bno,
xfs_filblks_t len);
unsigned int xfs_bmap_compute_attr_offset(struct xfs_mount *mp);
@@ -220,6 +223,10 @@ int xfs_bmap_add_extent_unwritten_real(struct xfs_trans *tp,
struct xfs_inode *ip, int whichfork,
struct xfs_iext_cursor *icur, struct xfs_btree_cur **curp,
struct xfs_bmbt_irec *new, int *logflagsp);
+xfs_extlen_t xfs_bmapi_minleft(struct xfs_trans *tp, struct xfs_inode *ip,
+ int fork);
+int xfs_bmap_btalloc_low_space(struct xfs_bmalloca *ap,
+ struct xfs_alloc_arg *args);
enum xfs_bmap_intent_type {
XFS_BMAP_MAP = 1,
diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c
index cfa052d40105..b8ad95050c9b 100644
--- a/fs/xfs/libxfs/xfs_bmap_btree.c
+++ b/fs/xfs/libxfs/xfs_bmap_btree.c
@@ -21,6 +21,7 @@
#include "xfs_quota.h"
#include "xfs_trace.h"
#include "xfs_rmap.h"
+#include "xfs_ag.h"
static struct kmem_cache *xfs_bmbt_cur_cache;
@@ -184,11 +185,11 @@ xfs_bmbt_update_cursor(
struct xfs_btree_cur *src,
struct xfs_btree_cur *dst)
{
- ASSERT((dst->bc_tp->t_firstblock != NULLFSBLOCK) ||
+ ASSERT((dst->bc_tp->t_highest_agno != NULLAGNUMBER) ||
(dst->bc_ino.ip->i_diflags & XFS_DIFLAG_REALTIME));
dst->bc_ino.allocated += src->bc_ino.allocated;
- dst->bc_tp->t_firstblock = src->bc_tp->t_firstblock;
+ dst->bc_tp->t_highest_agno = src->bc_tp->t_highest_agno;
src->bc_ino.allocated = 0;
}
@@ -200,46 +201,32 @@ xfs_bmbt_alloc_block(
union xfs_btree_ptr *new,
int *stat)
{
- xfs_alloc_arg_t args; /* block allocation args */
- int error; /* error return value */
+ struct xfs_alloc_arg args;
+ int error;
memset(&args, 0, sizeof(args));
args.tp = cur->bc_tp;
args.mp = cur->bc_mp;
- args.fsbno = cur->bc_tp->t_firstblock;
xfs_rmap_ino_bmbt_owner(&args.oinfo, cur->bc_ino.ip->i_ino,
cur->bc_ino.whichfork);
-
- if (args.fsbno == NULLFSBLOCK) {
- args.fsbno = be64_to_cpu(start->l);
- args.type = XFS_ALLOCTYPE_START_BNO;
- /*
- * Make sure there is sufficient room left in the AG to
- * complete a full tree split for an extent insert. If
- * we are converting the middle part of an extent then
- * we may need space for two tree splits.
- *
- * We are relying on the caller to make the correct block
- * reservation for this operation to succeed. If the
- * reservation amount is insufficient then we may fail a
- * block allocation here and corrupt the filesystem.
- */
- args.minleft = args.tp->t_blk_res;
- } else if (cur->bc_tp->t_flags & XFS_TRANS_LOWMODE) {
- args.type = XFS_ALLOCTYPE_START_BNO;
- } else {
- args.type = XFS_ALLOCTYPE_NEAR_BNO;
- }
-
args.minlen = args.maxlen = args.prod = 1;
args.wasdel = cur->bc_ino.flags & XFS_BTCUR_BMBT_WASDEL;
- if (!args.wasdel && args.tp->t_blk_res == 0) {
- error = -ENOSPC;
- goto error0;
- }
- error = xfs_alloc_vextent(&args);
+ if (!args.wasdel && args.tp->t_blk_res == 0)
+ return -ENOSPC;
+
+ /*
+ * If we are coming here from something like unwritten extent
+ * conversion, there has been no data extent allocation already done, so
+ * we have to ensure that we attempt to locate the entire set of bmbt
+ * allocations in the same AG, as xfs_bmapi_write() would have reserved.
+ */
+ if (cur->bc_tp->t_highest_agno == NULLAGNUMBER)
+ args.minleft = xfs_bmapi_minleft(cur->bc_tp, cur->bc_ino.ip,
+ cur->bc_ino.whichfork);
+
+ error = xfs_alloc_vextent_start_ag(&args, be64_to_cpu(start->l));
if (error)
- goto error0;
+ return error;
if (args.fsbno == NULLFSBLOCK && args.minleft) {
/*
@@ -247,11 +234,10 @@ xfs_bmbt_alloc_block(
* a full btree split. Try again and if
* successful activate the lowspace algorithm.
*/
- args.fsbno = 0;
- args.type = XFS_ALLOCTYPE_FIRST_AG;
- error = xfs_alloc_vextent(&args);
+ args.minleft = 0;
+ error = xfs_alloc_vextent_start_ag(&args, 0);
if (error)
- goto error0;
+ return error;
cur->bc_tp->t_flags |= XFS_TRANS_LOWMODE;
}
if (WARN_ON_ONCE(args.fsbno == NULLFSBLOCK)) {
@@ -260,7 +246,6 @@ xfs_bmbt_alloc_block(
}
ASSERT(args.len == 1);
- cur->bc_tp->t_firstblock = args.fsbno;
cur->bc_ino.allocated++;
cur->bc_ino.ip->i_nblocks++;
xfs_trans_log_inode(args.tp, cur->bc_ino.ip, XFS_ILOG_CORE);
@@ -271,9 +256,6 @@ xfs_bmbt_alloc_block(
*stat = 1;
return 0;
-
- error0:
- return error;
}
STATIC int
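The hunk above collapses the old three-way allocation type selection into a single call with a minleft-based fallback. A minimal sketch of that retry pattern, with the XFS specifics stripped out (alloc_args, alloc() and NO_BLOCK are hypothetical stand-ins for struct xfs_alloc_arg, xfs_alloc_vextent_start_ag() and NULLFSBLOCK):

#define NO_BLOCK	(~0ULL)		/* stand-in for NULLFSBLOCK */

struct alloc_args {			/* stand-in for struct xfs_alloc_arg */
	unsigned int		minleft;	/* blocks to leave free in the AG */
	unsigned long long	fsbno;		/* result block, NO_BLOCK if none */
};

int alloc(struct alloc_args *args);	/* stand-in for xfs_alloc_vextent_start_ag() */

static int alloc_with_minleft_retry(struct alloc_args *args,
		unsigned int minleft, bool *lowmode)
{
	int error;

	args->minleft = minleft;	/* keep room for a full tree split */
	error = alloc(args);
	if (error)
		return error;
	if (args->fsbno == NO_BLOCK && args->minleft) {
		/*
		 * The AG had space, but not enough to honour the split
		 * reservation. Retry without it and flag low-space mode
		 * so later allocations in this transaction are careful.
		 */
		args->minleft = 0;
		error = alloc(args);
		if (error)
			return error;
		*lowmode = true;
	}
	return args->fsbno == NO_BLOCK ? -ENOSPC : 0;
}

The key design point is that the first attempt reserves minleft blocks for a future tree split; only when that reservation makes the allocation impossible is it given up, with the transaction then marked as being in low-space mode.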
diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index da8c769887fd..c4649cc624e1 100644
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -2943,7 +2943,7 @@ xfs_btree_split(
DECLARE_COMPLETION_ONSTACK(done);
if (cur->bc_btnum != XFS_BTNUM_BMAP ||
- cur->bc_tp->t_firstblock == NULLFSBLOCK)
+ cur->bc_tp->t_highest_agno == NULLAGNUMBER)
return __xfs_btree_split(cur, level, ptrp, key, curp, stat);
args.cur = cur;
diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
index 5118dedf9267..7ee292aecbeb 100644
--- a/fs/xfs/libxfs/xfs_ialloc.c
+++ b/fs/xfs/libxfs/xfs_ialloc.c
@@ -169,10 +169,9 @@ xfs_inobt_insert_rec(
*/
STATIC int
xfs_inobt_insert(
- struct xfs_mount *mp,
+ struct xfs_perag *pag,
struct xfs_trans *tp,
struct xfs_buf *agbp,
- struct xfs_perag *pag,
xfs_agino_t newino,
xfs_agino_t newlen,
xfs_btnum_t btnum)
@@ -182,7 +181,7 @@ xfs_inobt_insert(
int i;
int error;
- cur = xfs_inobt_init_cursor(mp, tp, agbp, pag, btnum);
+ cur = xfs_inobt_init_cursor(pag, tp, agbp, btnum);
for (thisino = newino;
thisino < newino + newlen;
@@ -514,20 +513,20 @@ __xfs_inobt_rec_merge(
*/
STATIC int
xfs_inobt_insert_sprec(
- struct xfs_mount *mp,
+ struct xfs_perag *pag,
struct xfs_trans *tp,
struct xfs_buf *agbp,
- struct xfs_perag *pag,
int btnum,
struct xfs_inobt_rec_incore *nrec, /* in/out: new/merged rec. */
bool merge) /* merge or replace */
{
+ struct xfs_mount *mp = pag->pag_mount;
struct xfs_btree_cur *cur;
int error;
int i;
struct xfs_inobt_rec_incore rec;
- cur = xfs_inobt_init_cursor(mp, tp, agbp, pag, btnum);
+ cur = xfs_inobt_init_cursor(pag, tp, agbp, btnum);
/* the new record is pre-aligned so we know where to look */
error = xfs_inobt_lookup(cur, nrec->ir_startino, XFS_LOOKUP_EQ, &i);
@@ -609,9 +608,9 @@ error:
*/
STATIC int
xfs_ialloc_ag_alloc(
+ struct xfs_perag *pag,
struct xfs_trans *tp,
- struct xfs_buf *agbp,
- struct xfs_perag *pag)
+ struct xfs_buf *agbp)
{
struct xfs_agi *agi;
struct xfs_alloc_arg args;
@@ -631,6 +630,7 @@ xfs_ialloc_ag_alloc(
args.mp = tp->t_mountp;
args.fsbno = NULLFSBLOCK;
args.oinfo = XFS_RMAP_OINFO_INODES;
+ args.pag = pag;
#ifdef DEBUG
/* randomly do sparse inode allocations */
@@ -662,8 +662,6 @@ xfs_ialloc_ag_alloc(
goto sparse_alloc;
if (likely(newino != NULLAGINO &&
(args.agbno < be32_to_cpu(agi->agi_length)))) {
- args.fsbno = XFS_AGB_TO_FSB(args.mp, pag->pag_agno, args.agbno);
- args.type = XFS_ALLOCTYPE_THIS_BNO;
args.prod = 1;
/*
@@ -684,7 +682,10 @@ xfs_ialloc_ag_alloc(
/* Allow space for the inode btree to split. */
args.minleft = igeo->inobt_maxlevels;
- if ((error = xfs_alloc_vextent(&args)))
+ error = xfs_alloc_vextent_exact_bno(&args,
+ XFS_AGB_TO_FSB(args.mp, pag->pag_agno,
+ args.agbno));
+ if (error)
return error;
/*
@@ -717,22 +718,17 @@ xfs_ialloc_ag_alloc(
} else
args.alignment = igeo->cluster_align;
/*
- * Need to figure out where to allocate the inode blocks.
- * Ideally they should be spaced out through the a.g.
- * For now, just allocate blocks up front.
- */
- args.agbno = be32_to_cpu(agi->agi_root);
- args.fsbno = XFS_AGB_TO_FSB(args.mp, pag->pag_agno, args.agbno);
- /*
* Allocate a fixed-size extent of inodes.
*/
- args.type = XFS_ALLOCTYPE_NEAR_BNO;
args.prod = 1;
/*
* Allow space for the inode btree to split.
*/
args.minleft = igeo->inobt_maxlevels;
- if ((error = xfs_alloc_vextent(&args)))
+ error = xfs_alloc_vextent_near_bno(&args,
+ XFS_AGB_TO_FSB(args.mp, pag->pag_agno,
+ be32_to_cpu(agi->agi_root)));
+ if (error)
return error;
}
@@ -741,11 +737,11 @@ xfs_ialloc_ag_alloc(
* alignment.
*/
if (isaligned && args.fsbno == NULLFSBLOCK) {
- args.type = XFS_ALLOCTYPE_NEAR_BNO;
- args.agbno = be32_to_cpu(agi->agi_root);
- args.fsbno = XFS_AGB_TO_FSB(args.mp, pag->pag_agno, args.agbno);
args.alignment = igeo->cluster_align;
- if ((error = xfs_alloc_vextent(&args)))
+ error = xfs_alloc_vextent_near_bno(&args,
+ XFS_AGB_TO_FSB(args.mp, pag->pag_agno,
+ be32_to_cpu(agi->agi_root)));
+ if (error)
return error;
}
@@ -757,9 +753,6 @@ xfs_ialloc_ag_alloc(
igeo->ialloc_min_blks < igeo->ialloc_blks &&
args.fsbno == NULLFSBLOCK) {
sparse_alloc:
- args.type = XFS_ALLOCTYPE_NEAR_BNO;
- args.agbno = be32_to_cpu(agi->agi_root);
- args.fsbno = XFS_AGB_TO_FSB(args.mp, pag->pag_agno, args.agbno);
args.alignment = args.mp->m_sb.sb_spino_align;
args.prod = 1;
@@ -781,7 +774,9 @@ sparse_alloc:
args.mp->m_sb.sb_inoalignmt) -
igeo->ialloc_blks;
- error = xfs_alloc_vextent(&args);
+ error = xfs_alloc_vextent_near_bno(&args,
+ XFS_AGB_TO_FSB(args.mp, pag->pag_agno,
+ be32_to_cpu(agi->agi_root)));
if (error)
return error;
@@ -831,7 +826,7 @@ sparse_alloc:
* if necessary. If a merge does occur, rec is updated to the
* merged record.
*/
- error = xfs_inobt_insert_sprec(args.mp, tp, agbp, pag,
+ error = xfs_inobt_insert_sprec(pag, tp, agbp,
XFS_BTNUM_INO, &rec, true);
if (error == -EFSCORRUPTED) {
xfs_alert(args.mp,
@@ -856,20 +851,20 @@ sparse_alloc:
* existing record with this one.
*/
if (xfs_has_finobt(args.mp)) {
- error = xfs_inobt_insert_sprec(args.mp, tp, agbp, pag,
+ error = xfs_inobt_insert_sprec(pag, tp, agbp,
XFS_BTNUM_FINO, &rec, false);
if (error)
return error;
}
} else {
/* full chunk - insert new records to both btrees */
- error = xfs_inobt_insert(args.mp, tp, agbp, pag, newino, newlen,
+ error = xfs_inobt_insert(pag, tp, agbp, newino, newlen,
XFS_BTNUM_INO);
if (error)
return error;
if (xfs_has_finobt(args.mp)) {
- error = xfs_inobt_insert(args.mp, tp, agbp, pag, newino,
+ error = xfs_inobt_insert(pag, tp, agbp, newino,
newlen, XFS_BTNUM_FINO);
if (error)
return error;
@@ -981,9 +976,9 @@ xfs_inobt_first_free_inode(
*/
STATIC int
xfs_dialloc_ag_inobt(
+ struct xfs_perag *pag,
struct xfs_trans *tp,
struct xfs_buf *agbp,
- struct xfs_perag *pag,
xfs_ino_t parent,
xfs_ino_t *inop)
{
@@ -999,12 +994,12 @@ xfs_dialloc_ag_inobt(
int i, j;
int searchdistance = 10;
- ASSERT(pag->pagi_init);
- ASSERT(pag->pagi_inodeok);
+ ASSERT(xfs_perag_initialised_agi(pag));
+ ASSERT(xfs_perag_allows_inodes(pag));
ASSERT(pag->pagi_freecount > 0);
restart_pagno:
- cur = xfs_inobt_init_cursor(mp, tp, agbp, pag, XFS_BTNUM_INO);
+ cur = xfs_inobt_init_cursor(pag, tp, agbp, XFS_BTNUM_INO);
/*
* If pagino is 0 (this is the root inode allocation) use newino.
* This must work because we've just allocated some.
@@ -1429,9 +1424,9 @@ xfs_dialloc_ag_update_inobt(
*/
static int
xfs_dialloc_ag(
+ struct xfs_perag *pag,
struct xfs_trans *tp,
struct xfs_buf *agbp,
- struct xfs_perag *pag,
xfs_ino_t parent,
xfs_ino_t *inop)
{
@@ -1448,7 +1443,7 @@ xfs_dialloc_ag(
int i;
if (!xfs_has_finobt(mp))
- return xfs_dialloc_ag_inobt(tp, agbp, pag, parent, inop);
+ return xfs_dialloc_ag_inobt(pag, tp, agbp, parent, inop);
/*
* If pagino is 0 (this is the root inode allocation) use newino.
@@ -1457,7 +1452,7 @@ xfs_dialloc_ag(
if (!pagino)
pagino = be32_to_cpu(agi->agi_newino);
- cur = xfs_inobt_init_cursor(mp, tp, agbp, pag, XFS_BTNUM_FINO);
+ cur = xfs_inobt_init_cursor(pag, tp, agbp, XFS_BTNUM_FINO);
error = xfs_check_agi_freecount(cur);
if (error)
@@ -1500,7 +1495,7 @@ xfs_dialloc_ag(
* the original freecount. If all is well, make the equivalent update to
* the inobt using the finobt record and offset information.
*/
- icur = xfs_inobt_init_cursor(mp, tp, agbp, pag, XFS_BTNUM_INO);
+ icur = xfs_inobt_init_cursor(pag, tp, agbp, XFS_BTNUM_INO);
error = xfs_check_agi_freecount(icur);
if (error)
@@ -1577,25 +1572,10 @@ xfs_dialloc_roll(
return error;
}
-static xfs_agnumber_t
-xfs_ialloc_next_ag(
- xfs_mount_t *mp)
-{
- xfs_agnumber_t agno;
-
- spin_lock(&mp->m_agirotor_lock);
- agno = mp->m_agirotor;
- if (++mp->m_agirotor >= mp->m_maxagi)
- mp->m_agirotor = 0;
- spin_unlock(&mp->m_agirotor_lock);
-
- return agno;
-}
-
static bool
xfs_dialloc_good_ag(
- struct xfs_trans *tp,
struct xfs_perag *pag,
+ struct xfs_trans *tp,
umode_t mode,
int flags,
bool ok_alloc)
@@ -1606,10 +1586,12 @@ xfs_dialloc_good_ag(
int needspace;
int error;
- if (!pag->pagi_inodeok)
+ if (!pag)
+ return false;
+ if (!xfs_perag_allows_inodes(pag))
return false;
- if (!pag->pagi_init) {
+ if (!xfs_perag_initialised_agi(pag)) {
error = xfs_ialloc_read_agi(pag, tp, NULL);
if (error)
return false;
@@ -1620,7 +1602,7 @@ xfs_dialloc_good_ag(
if (!ok_alloc)
return false;
- if (!pag->pagf_init) {
+ if (!xfs_perag_initialised_agf(pag)) {
error = xfs_alloc_read_agf(pag, tp, flags, NULL);
if (error)
return false;
@@ -1665,8 +1647,8 @@ xfs_dialloc_good_ag(
static int
xfs_dialloc_try_ag(
- struct xfs_trans **tpp,
struct xfs_perag *pag,
+ struct xfs_trans **tpp,
xfs_ino_t parent,
xfs_ino_t *new_ino,
bool ok_alloc)
@@ -1689,7 +1671,7 @@ xfs_dialloc_try_ag(
goto out_release;
}
- error = xfs_ialloc_ag_alloc(*tpp, agbp, pag);
+ error = xfs_ialloc_ag_alloc(pag, *tpp, agbp);
if (error < 0)
goto out_release;
@@ -1705,7 +1687,7 @@ xfs_dialloc_try_ag(
}
/* Allocate an inode in the found AG */
- error = xfs_dialloc_ag(*tpp, agbp, pag, parent, &ino);
+ error = xfs_dialloc_ag(pag, *tpp, agbp, parent, &ino);
if (!error)
*new_ino = ino;
return error;
@@ -1737,8 +1719,9 @@ xfs_dialloc(
struct xfs_perag *pag;
struct xfs_ino_geometry *igeo = M_IGEO(mp);
bool ok_alloc = true;
+ bool low_space = false;
int flags;
- xfs_ino_t ino;
+ xfs_ino_t ino = NULLFSINO;
/*
* Directories, symlinks, and regular files frequently allocate at least
@@ -1746,7 +1729,8 @@ xfs_dialloc(
* an AG has enough space for file creation.
*/
if (S_ISDIR(mode))
- start_agno = xfs_ialloc_next_ag(mp);
+ start_agno = (atomic_inc_return(&mp->m_agirotor) - 1) %
+ mp->m_maxagi;
else {
start_agno = XFS_INO_TO_AGNO(mp, parent);
if (start_agno >= mp->m_maxagi)
@@ -1768,41 +1752,55 @@ xfs_dialloc(
}
/*
+ * If we are near ENOSPC, we want to prefer allocating from AGs that
+ * have free inodes in them rather than using up free space on new
+ * inode chunks. Hence we turn off chunk allocation for the first
+ * non-blocking pass through the AGs so that we consume free inodes we
+ * can allocate immediately, and only allow chunk allocation on the
+ * second pass if we fail to find an AG with free inodes in it.
+ */
+ if (percpu_counter_read_positive(&mp->m_fdblocks) <
+ mp->m_low_space[XFS_LOWSP_1_PCNT]) {
+ ok_alloc = false;
+ low_space = true;
+ }
+
+ /*
* Loop until we find an allocation group that either has free inodes
* or in which we can allocate some inodes. Iterate through the
* allocation groups upward, wrapping at the end.
*/
- agno = start_agno;
flags = XFS_ALLOC_FLAG_TRYLOCK;
- for (;;) {
- pag = xfs_perag_get(mp, agno);
- if (xfs_dialloc_good_ag(*tpp, pag, mode, flags, ok_alloc)) {
- error = xfs_dialloc_try_ag(tpp, pag, parent,
+retry:
+ for_each_perag_wrap_at(mp, start_agno, mp->m_maxagi, agno, pag) {
+ if (xfs_dialloc_good_ag(pag, *tpp, mode, flags, ok_alloc)) {
+ error = xfs_dialloc_try_ag(pag, tpp, parent,
&ino, ok_alloc);
if (error != -EAGAIN)
break;
+ error = 0;
}
if (xfs_is_shutdown(mp)) {
error = -EFSCORRUPTED;
break;
}
- if (++agno == mp->m_maxagi)
- agno = 0;
- if (agno == start_agno) {
- if (!flags) {
- error = -ENOSPC;
- break;
- }
+ }
+ if (pag)
+ xfs_perag_rele(pag);
+ if (error)
+ return error;
+ if (ino == NULLFSINO) {
+ if (flags) {
flags = 0;
+ if (low_space)
+ ok_alloc = true;
+ goto retry;
}
- xfs_perag_put(pag);
+ return -ENOSPC;
}
-
- if (!error)
- *new_ino = ino;
- xfs_perag_put(pag);
- return error;
+ *new_ino = ino;
+ return 0;
}
/*
@@ -1885,14 +1883,14 @@ next:
STATIC int
xfs_difree_inobt(
- struct xfs_mount *mp,
+ struct xfs_perag *pag,
struct xfs_trans *tp,
struct xfs_buf *agbp,
- struct xfs_perag *pag,
xfs_agino_t agino,
struct xfs_icluster *xic,
struct xfs_inobt_rec_incore *orec)
{
+ struct xfs_mount *mp = pag->pag_mount;
struct xfs_agi *agi = agbp->b_addr;
struct xfs_btree_cur *cur;
struct xfs_inobt_rec_incore rec;
@@ -1907,7 +1905,7 @@ xfs_difree_inobt(
/*
* Initialize the cursor.
*/
- cur = xfs_inobt_init_cursor(mp, tp, agbp, pag, XFS_BTNUM_INO);
+ cur = xfs_inobt_init_cursor(pag, tp, agbp, XFS_BTNUM_INO);
error = xfs_check_agi_freecount(cur);
if (error)
@@ -2019,20 +2017,20 @@ error0:
*/
STATIC int
xfs_difree_finobt(
- struct xfs_mount *mp,
+ struct xfs_perag *pag,
struct xfs_trans *tp,
struct xfs_buf *agbp,
- struct xfs_perag *pag,
xfs_agino_t agino,
struct xfs_inobt_rec_incore *ibtrec) /* inobt record */
{
+ struct xfs_mount *mp = pag->pag_mount;
struct xfs_btree_cur *cur;
struct xfs_inobt_rec_incore rec;
int offset = agino - ibtrec->ir_startino;
int error;
int i;
- cur = xfs_inobt_init_cursor(mp, tp, agbp, pag, XFS_BTNUM_FINO);
+ cur = xfs_inobt_init_cursor(pag, tp, agbp, XFS_BTNUM_FINO);
error = xfs_inobt_lookup(cur, ibtrec->ir_startino, XFS_LOOKUP_EQ, &i);
if (error)
@@ -2179,7 +2177,7 @@ xfs_difree(
/*
* Fix up the inode allocation btree.
*/
- error = xfs_difree_inobt(mp, tp, agbp, pag, agino, xic, &rec);
+ error = xfs_difree_inobt(pag, tp, agbp, agino, xic, &rec);
if (error)
goto error0;
@@ -2187,7 +2185,7 @@ xfs_difree(
* Fix up the free inode btree.
*/
if (xfs_has_finobt(mp)) {
- error = xfs_difree_finobt(mp, tp, agbp, pag, agino, &rec);
+ error = xfs_difree_finobt(pag, tp, agbp, agino, &rec);
if (error)
goto error0;
}
@@ -2200,15 +2198,15 @@ error0:
STATIC int
xfs_imap_lookup(
- struct xfs_mount *mp,
- struct xfs_trans *tp,
struct xfs_perag *pag,
+ struct xfs_trans *tp,
xfs_agino_t agino,
xfs_agblock_t agbno,
xfs_agblock_t *chunk_agbno,
xfs_agblock_t *offset_agbno,
int flags)
{
+ struct xfs_mount *mp = pag->pag_mount;
struct xfs_inobt_rec_incore rec;
struct xfs_btree_cur *cur;
struct xfs_buf *agbp;
@@ -2229,7 +2227,7 @@ xfs_imap_lookup(
* we have a record, we need to ensure it contains the inode number
* we are looking up.
*/
- cur = xfs_inobt_init_cursor(mp, tp, agbp, pag, XFS_BTNUM_INO);
+ cur = xfs_inobt_init_cursor(pag, tp, agbp, XFS_BTNUM_INO);
error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i);
if (!error) {
if (i)
@@ -2263,12 +2261,13 @@ xfs_imap_lookup(
*/
int
xfs_imap(
- struct xfs_mount *mp, /* file system mount structure */
- struct xfs_trans *tp, /* transaction pointer */
+ struct xfs_perag *pag,
+ struct xfs_trans *tp,
xfs_ino_t ino, /* inode to locate */
struct xfs_imap *imap, /* location map structure */
uint flags) /* flags for inode btree lookup */
{
+ struct xfs_mount *mp = pag->pag_mount;
xfs_agblock_t agbno; /* block number of inode in the alloc group */
xfs_agino_t agino; /* inode number within alloc group */
xfs_agblock_t chunk_agbno; /* first block in inode chunk */
@@ -2276,17 +2275,15 @@ xfs_imap(
int error; /* error code */
int offset; /* index of inode in its buffer */
xfs_agblock_t offset_agbno; /* blks from chunk start to inode */
- struct xfs_perag *pag;
ASSERT(ino != NULLFSINO);
/*
* Split up the inode number into its parts.
*/
- pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ino));
agino = XFS_INO_TO_AGINO(mp, ino);
agbno = XFS_AGINO_TO_AGBNO(mp, agino);
- if (!pag || agbno >= mp->m_sb.sb_agblocks ||
+ if (agbno >= mp->m_sb.sb_agblocks ||
ino != XFS_AGINO_TO_INO(mp, pag->pag_agno, agino)) {
error = -EINVAL;
#ifdef DEBUG
@@ -2295,20 +2292,14 @@ xfs_imap(
* as they can be invalid without implying corruption.
*/
if (flags & XFS_IGET_UNTRUSTED)
- goto out_drop;
- if (!pag) {
- xfs_alert(mp,
- "%s: agno (%d) >= mp->m_sb.sb_agcount (%d)",
- __func__, XFS_INO_TO_AGNO(mp, ino),
- mp->m_sb.sb_agcount);
- }
+ return error;
if (agbno >= mp->m_sb.sb_agblocks) {
xfs_alert(mp,
"%s: agbno (0x%llx) >= mp->m_sb.sb_agblocks (0x%lx)",
__func__, (unsigned long long)agbno,
(unsigned long)mp->m_sb.sb_agblocks);
}
- if (pag && ino != XFS_AGINO_TO_INO(mp, pag->pag_agno, agino)) {
+ if (ino != XFS_AGINO_TO_INO(mp, pag->pag_agno, agino)) {
xfs_alert(mp,
"%s: ino (0x%llx) != XFS_AGINO_TO_INO() (0x%llx)",
__func__, ino,
@@ -2316,7 +2307,7 @@ xfs_imap(
}
xfs_stack_trace();
#endif /* DEBUG */
- goto out_drop;
+ return error;
}
/*
@@ -2327,10 +2318,10 @@ xfs_imap(
* in all cases where an untrusted inode number is passed.
*/
if (flags & XFS_IGET_UNTRUSTED) {
- error = xfs_imap_lookup(mp, tp, pag, agino, agbno,
+ error = xfs_imap_lookup(pag, tp, agino, agbno,
&chunk_agbno, &offset_agbno, flags);
if (error)
- goto out_drop;
+ return error;
goto out_map;
}
@@ -2346,8 +2337,7 @@ xfs_imap(
imap->im_len = XFS_FSB_TO_BB(mp, 1);
imap->im_boffset = (unsigned short)(offset <<
mp->m_sb.sb_inodelog);
- error = 0;
- goto out_drop;
+ return 0;
}
/*
@@ -2359,10 +2349,10 @@ xfs_imap(
offset_agbno = agbno & M_IGEO(mp)->inoalign_mask;
chunk_agbno = agbno - offset_agbno;
} else {
- error = xfs_imap_lookup(mp, tp, pag, agino, agbno,
+ error = xfs_imap_lookup(pag, tp, agino, agbno,
&chunk_agbno, &offset_agbno, flags);
if (error)
- goto out_drop;
+ return error;
}
out_map:
@@ -2390,14 +2380,9 @@ out_map:
__func__, (unsigned long long) imap->im_blkno,
(unsigned long long) imap->im_len,
XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks));
- error = -EINVAL;
- goto out_drop;
+ return -EINVAL;
}
- error = 0;
-out_drop:
- if (pag)
- xfs_perag_put(pag);
- return error;
+ return 0;
}
/*
@@ -2613,10 +2598,10 @@ xfs_ialloc_read_agi(
return error;
agi = agibp->b_addr;
- if (!pag->pagi_init) {
+ if (!xfs_perag_initialised_agi(pag)) {
pag->pagi_freecount = be32_to_cpu(agi->agi_freecount);
pag->pagi_count = be32_to_cpu(agi->agi_count);
- pag->pagi_init = 1;
+ set_bit(XFS_AGSTATE_AGI_INIT, &pag->pag_opstate);
}
/*
@@ -2924,26 +2909,24 @@ xfs_ialloc_calc_rootino(
*/
int
xfs_ialloc_check_shrink(
+ struct xfs_perag *pag,
struct xfs_trans *tp,
- xfs_agnumber_t agno,
struct xfs_buf *agibp,
xfs_agblock_t new_length)
{
struct xfs_inobt_rec_incore rec;
struct xfs_btree_cur *cur;
- struct xfs_mount *mp = tp->t_mountp;
- struct xfs_perag *pag;
- xfs_agino_t agino = XFS_AGB_TO_AGINO(mp, new_length);
+ xfs_agino_t agino;
int has;
int error;
- if (!xfs_has_sparseinodes(mp))
+ if (!xfs_has_sparseinodes(pag->pag_mount))
return 0;
- pag = xfs_perag_get(mp, agno);
- cur = xfs_inobt_init_cursor(mp, tp, agibp, pag, XFS_BTNUM_INO);
+ cur = xfs_inobt_init_cursor(pag, tp, agibp, XFS_BTNUM_INO);
/* Look up the inobt record that would correspond to the new EOFS. */
+ agino = XFS_AGB_TO_AGINO(pag->pag_mount, new_length);
error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &has);
if (error || !has)
goto out;
@@ -2964,6 +2947,5 @@ xfs_ialloc_check_shrink(
}
out:
xfs_btree_del_cursor(cur, error);
- xfs_perag_put(pag);
return error;
}
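Stepping back from the hunks above: xfs_dialloc() now expresses its AG walk with for_each_perag_wrap_at() and a retry label rather than manual wrap-around bookkeeping. A sketch of the resulting two-pass structure (good_ag(), try_ag() and struct mount are hypothetical stand-ins for xfs_dialloc_good_ag(), xfs_dialloc_try_ag() and struct xfs_mount; this is the control flow only, not the kernel code):

/*
 * Pass 1 is non-blocking and, near ENOSPC, will not allocate new inode
 * chunks; pass 2 relaxes both restrictions.
 */
static int pick_inode_ag(struct mount *mp, unsigned int start_agno,
		bool near_enospc, unsigned long long *ino)
{
	bool trylock = true;
	bool ok_alloc = !near_enospc;	/* prefer existing free inodes */
	unsigned int agno;
	int error;

retry:
	/* One full wrap of the AGs, starting at start_agno. */
	agno = start_agno;
	do {
		if (good_ag(mp, agno, trylock, ok_alloc)) {
			error = try_ag(mp, agno, ok_alloc, ino);
			if (error != -EAGAIN)
				return error;	/* success or hard error */
		}
		if (++agno == mp->maxagi)
			agno = 0;
	} while (agno != start_agno);

	if (trylock) {
		/* Pass 2: block on AG locks, allow new inode chunks. */
		trylock = false;
		ok_alloc = true;
		goto retry;
	}
	return -ENOSPC;
}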
diff --git a/fs/xfs/libxfs/xfs_ialloc.h b/fs/xfs/libxfs/xfs_ialloc.h
index 9bbbca6ac4ed..ab8c30b4ec22 100644
--- a/fs/xfs/libxfs/xfs_ialloc.h
+++ b/fs/xfs/libxfs/xfs_ialloc.h
@@ -12,6 +12,7 @@ struct xfs_imap;
struct xfs_mount;
struct xfs_trans;
struct xfs_btree_cur;
+struct xfs_perag;
/* Move inodes in clusters of this size */
#define XFS_INODE_BIG_CLUSTER_SIZE 8192
@@ -47,7 +48,7 @@ int xfs_difree(struct xfs_trans *tp, struct xfs_perag *pag,
*/
int
xfs_imap(
- struct xfs_mount *mp, /* file system mount structure */
+ struct xfs_perag *pag,
struct xfs_trans *tp, /* transaction pointer */
xfs_ino_t ino, /* inode to locate */
struct xfs_imap *imap, /* location map structure */
@@ -106,7 +107,7 @@ int xfs_ialloc_cluster_alignment(struct xfs_mount *mp);
void xfs_ialloc_setup_geometry(struct xfs_mount *mp);
xfs_ino_t xfs_ialloc_calc_rootino(struct xfs_mount *mp, int sunit);
-int xfs_ialloc_check_shrink(struct xfs_trans *tp, xfs_agnumber_t agno,
+int xfs_ialloc_check_shrink(struct xfs_perag *pag, struct xfs_trans *tp,
struct xfs_buf *agibp, xfs_agblock_t new_length);
#endif /* __XFS_IALLOC_H__ */
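Since xfs_imap() now takes a perag instead of a mount, the reference lifecycle moves to the caller. A hedged sketch of the new calling convention for an untrusted inode number (imap_untrusted() is a hypothetical wrapper, not part of the patch; it mirrors the xchk_get_inode() hunk later in this series):

static int imap_untrusted(struct xfs_mount *mp, struct xfs_trans *tp,
		xfs_ino_t ino, struct xfs_imap *imap)
{
	struct xfs_perag *pag;
	int error;

	/* xfs_perag_get() returns NULL for an out-of-range AG number. */
	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ino));
	if (!pag)
		return -EINVAL;
	error = xfs_imap(pag, tp, ino, imap, XFS_IGET_UNTRUSTED);
	xfs_perag_put(pag);
	return error;
}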
diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c
index 8c83e265770c..9b28211d5a4c 100644
--- a/fs/xfs/libxfs/xfs_ialloc_btree.c
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.c
@@ -36,8 +36,8 @@ STATIC struct xfs_btree_cur *
xfs_inobt_dup_cursor(
struct xfs_btree_cur *cur)
{
- return xfs_inobt_init_cursor(cur->bc_mp, cur->bc_tp,
- cur->bc_ag.agbp, cur->bc_ag.pag, cur->bc_btnum);
+ return xfs_inobt_init_cursor(cur->bc_ag.pag, cur->bc_tp,
+ cur->bc_ag.agbp, cur->bc_btnum);
}
STATIC void
@@ -103,15 +103,15 @@ __xfs_inobt_alloc_block(
memset(&args, 0, sizeof(args));
args.tp = cur->bc_tp;
args.mp = cur->bc_mp;
+ args.pag = cur->bc_ag.pag;
args.oinfo = XFS_RMAP_OINFO_INOBT;
- args.fsbno = XFS_AGB_TO_FSB(args.mp, cur->bc_ag.pag->pag_agno, sbno);
args.minlen = 1;
args.maxlen = 1;
args.prod = 1;
- args.type = XFS_ALLOCTYPE_NEAR_BNO;
args.resv = resv;
- error = xfs_alloc_vextent(&args);
+ error = xfs_alloc_vextent_near_bno(&args,
+ XFS_AGB_TO_FSB(args.mp, args.pag->pag_agno, sbno));
if (error)
return error;
@@ -291,8 +291,8 @@ xfs_inobt_verify(
* Similarly, during log recovery we will have a perag structure
* attached, but the agi information will not yet have been initialised
* from the on disk AGI. We don't currently use any of this information,
- * but beware of the landmine (i.e. need to check pag->pagi_init) if we
- * ever do.
+ * but beware of the landmine (i.e. need to check
+ * xfs_perag_initialised_agi(pag)) if we ever do.
*/
if (xfs_has_crc(mp)) {
fa = xfs_btree_sblock_v5hdr_verify(bp);
@@ -427,11 +427,11 @@ static const struct xfs_btree_ops xfs_finobt_ops = {
*/
static struct xfs_btree_cur *
xfs_inobt_init_common(
- struct xfs_mount *mp, /* file system mount point */
- struct xfs_trans *tp, /* transaction pointer */
struct xfs_perag *pag,
+ struct xfs_trans *tp, /* transaction pointer */
xfs_btnum_t btnum) /* ialloc or free ino btree */
{
+ struct xfs_mount *mp = pag->pag_mount;
struct xfs_btree_cur *cur;
cur = xfs_btree_alloc_cursor(mp, tp, btnum,
@@ -456,16 +456,15 @@ xfs_inobt_init_common(
/* Create an inode btree cursor. */
struct xfs_btree_cur *
xfs_inobt_init_cursor(
- struct xfs_mount *mp,
+ struct xfs_perag *pag,
struct xfs_trans *tp,
struct xfs_buf *agbp,
- struct xfs_perag *pag,
xfs_btnum_t btnum)
{
struct xfs_btree_cur *cur;
struct xfs_agi *agi = agbp->b_addr;
- cur = xfs_inobt_init_common(mp, tp, pag, btnum);
+ cur = xfs_inobt_init_common(pag, tp, btnum);
if (btnum == XFS_BTNUM_INO)
cur->bc_nlevels = be32_to_cpu(agi->agi_level);
else
@@ -477,14 +476,13 @@ xfs_inobt_init_cursor(
/* Create an inode btree cursor with a fake root for staging. */
struct xfs_btree_cur *
xfs_inobt_stage_cursor(
- struct xfs_mount *mp,
- struct xbtree_afakeroot *afake,
struct xfs_perag *pag,
+ struct xbtree_afakeroot *afake,
xfs_btnum_t btnum)
{
struct xfs_btree_cur *cur;
- cur = xfs_inobt_init_common(mp, NULL, pag, btnum);
+ cur = xfs_inobt_init_common(pag, NULL, btnum);
xfs_btree_stage_afakeroot(cur, afake);
return cur;
}
@@ -708,9 +706,8 @@ xfs_inobt_max_size(
/* Read AGI and create inobt cursor. */
int
xfs_inobt_cur(
- struct xfs_mount *mp,
- struct xfs_trans *tp,
struct xfs_perag *pag,
+ struct xfs_trans *tp,
xfs_btnum_t which,
struct xfs_btree_cur **curpp,
struct xfs_buf **agi_bpp)
@@ -725,16 +722,15 @@ xfs_inobt_cur(
if (error)
return error;
- cur = xfs_inobt_init_cursor(mp, tp, *agi_bpp, pag, which);
+ cur = xfs_inobt_init_cursor(pag, tp, *agi_bpp, which);
*curpp = cur;
return 0;
}
static int
xfs_inobt_count_blocks(
- struct xfs_mount *mp,
- struct xfs_trans *tp,
struct xfs_perag *pag,
+ struct xfs_trans *tp,
xfs_btnum_t btnum,
xfs_extlen_t *tree_blocks)
{
@@ -742,7 +738,7 @@ xfs_inobt_count_blocks(
struct xfs_btree_cur *cur = NULL;
int error;
- error = xfs_inobt_cur(mp, tp, pag, btnum, &cur, &agbp);
+ error = xfs_inobt_cur(pag, tp, btnum, &cur, &agbp);
if (error)
return error;
@@ -779,22 +775,21 @@ xfs_finobt_read_blocks(
*/
int
xfs_finobt_calc_reserves(
- struct xfs_mount *mp,
- struct xfs_trans *tp,
struct xfs_perag *pag,
+ struct xfs_trans *tp,
xfs_extlen_t *ask,
xfs_extlen_t *used)
{
xfs_extlen_t tree_len = 0;
int error;
- if (!xfs_has_finobt(mp))
+ if (!xfs_has_finobt(pag->pag_mount))
return 0;
- if (xfs_has_inobtcounts(mp))
+ if (xfs_has_inobtcounts(pag->pag_mount))
error = xfs_finobt_read_blocks(pag, tp, &tree_len);
else
- error = xfs_inobt_count_blocks(mp, tp, pag, XFS_BTNUM_FINO,
+ error = xfs_inobt_count_blocks(pag, tp, XFS_BTNUM_FINO,
&tree_len);
if (error)
return error;
diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.h b/fs/xfs/libxfs/xfs_ialloc_btree.h
index 26451cb76b98..e859a6e05230 100644
--- a/fs/xfs/libxfs/xfs_ialloc_btree.h
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.h
@@ -46,12 +46,10 @@ struct xfs_perag;
(maxrecs) * sizeof(xfs_inobt_key_t) + \
((index) - 1) * sizeof(xfs_inobt_ptr_t)))
-extern struct xfs_btree_cur *xfs_inobt_init_cursor(struct xfs_mount *mp,
- struct xfs_trans *tp, struct xfs_buf *agbp,
- struct xfs_perag *pag, xfs_btnum_t btnum);
-struct xfs_btree_cur *xfs_inobt_stage_cursor(struct xfs_mount *mp,
- struct xbtree_afakeroot *afake, struct xfs_perag *pag,
- xfs_btnum_t btnum);
+extern struct xfs_btree_cur *xfs_inobt_init_cursor(struct xfs_perag *pag,
+ struct xfs_trans *tp, struct xfs_buf *agbp, xfs_btnum_t btnum);
+struct xfs_btree_cur *xfs_inobt_stage_cursor(struct xfs_perag *pag,
+ struct xbtree_afakeroot *afake, xfs_btnum_t btnum);
extern int xfs_inobt_maxrecs(struct xfs_mount *, int, int);
/* ir_holemask to inode allocation bitmap conversion */
@@ -64,13 +62,13 @@ int xfs_inobt_rec_check_count(struct xfs_mount *,
#define xfs_inobt_rec_check_count(mp, rec) 0
#endif /* DEBUG */
-int xfs_finobt_calc_reserves(struct xfs_mount *mp, struct xfs_trans *tp,
- struct xfs_perag *pag, xfs_extlen_t *ask, xfs_extlen_t *used);
+int xfs_finobt_calc_reserves(struct xfs_perag *perag, struct xfs_trans *tp,
+ xfs_extlen_t *ask, xfs_extlen_t *used);
extern xfs_extlen_t xfs_iallocbt_calc_size(struct xfs_mount *mp,
unsigned long long len);
-int xfs_inobt_cur(struct xfs_mount *mp, struct xfs_trans *tp,
- struct xfs_perag *pag, xfs_btnum_t btnum,
- struct xfs_btree_cur **curpp, struct xfs_buf **agi_bpp);
+int xfs_inobt_cur(struct xfs_perag *pag, struct xfs_trans *tp,
+ xfs_btnum_t btnum, struct xfs_btree_cur **curpp,
+ struct xfs_buf **agi_bpp);
void xfs_inobt_commit_staged_btree(struct xfs_btree_cur *cur,
struct xfs_trans *tp, struct xfs_buf *agbp);
diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c
index e1f789866683..f3b860970b26 100644
--- a/fs/xfs/libxfs/xfs_refcount_btree.c
+++ b/fs/xfs/libxfs/xfs_refcount_btree.c
@@ -67,14 +67,14 @@ xfs_refcountbt_alloc_block(
memset(&args, 0, sizeof(args));
args.tp = cur->bc_tp;
args.mp = cur->bc_mp;
- args.type = XFS_ALLOCTYPE_NEAR_BNO;
- args.fsbno = XFS_AGB_TO_FSB(cur->bc_mp, cur->bc_ag.pag->pag_agno,
- xfs_refc_block(args.mp));
+ args.pag = cur->bc_ag.pag;
args.oinfo = XFS_RMAP_OINFO_REFC;
args.minlen = args.maxlen = args.prod = 1;
args.resv = XFS_AG_RESV_METADATA;
- error = xfs_alloc_vextent(&args);
+ error = xfs_alloc_vextent_near_bno(&args,
+ XFS_AGB_TO_FSB(args.mp, args.pag->pag_agno,
+ xfs_refc_block(args.mp)));
if (error)
goto out_error;
trace_xfs_refcountbt_alloc_block(cur->bc_mp, cur->bc_ag.pag->pag_agno,
@@ -227,7 +227,7 @@ xfs_refcountbt_verify(
return fa;
level = be16_to_cpu(block->bb_level);
- if (pag && pag->pagf_init) {
+ if (pag && xfs_perag_initialised_agf(pag)) {
if (level >= pag->pagf_refcount_level)
return __this_address;
} else if (level >= mp->m_refc_maxlevels)
diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c
index 7f83f62e51e0..d3285684bb5e 100644
--- a/fs/xfs/libxfs/xfs_rmap_btree.c
+++ b/fs/xfs/libxfs/xfs_rmap_btree.c
@@ -313,7 +313,7 @@ xfs_rmapbt_verify(
return fa;
level = be16_to_cpu(block->bb_level);
- if (pag && pag->pagf_init) {
+ if (pag && xfs_perag_initialised_agf(pag)) {
if (level >= pag->pagf_levels[XFS_BTNUM_RMAPi])
return __this_address;
} else if (level >= mp->m_rmap_maxlevels)
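The pagf_init/pagi_init byte fields used by these verifier hunks are replaced by opstate bits tested and set atomically. A sketch of the accessors this patch assumes (the real definitions live in fs/xfs/libxfs/xfs_ag.h and may be macro-generated; this is just the shape):

/*
 * Sketch of the opstate-bit accessors implied by the hunks above.
 */
static inline bool xfs_perag_initialised_agf(struct xfs_perag *pag)
{
	return test_bit(XFS_AGSTATE_AGF_INIT, &pag->pag_opstate);
}

static inline bool xfs_perag_initialised_agi(struct xfs_perag *pag)
{
	return test_bit(XFS_AGSTATE_AGI_INIT, &pag->pag_opstate);
}

/* Setting and clearing is done directly, exactly as in the hunks:
 *	set_bit(XFS_AGSTATE_AGF_INIT, &pag->pag_opstate);
 *	clear_bit(XFS_AGSTATE_AGF_INIT, &pag->pag_opstate);
 */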
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index 1eeecf2eb2a7..99cc03a298e2 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -909,7 +909,8 @@ xfs_sb_mount_common(
struct xfs_mount *mp,
struct xfs_sb *sbp)
{
- mp->m_agfrotor = mp->m_agirotor = 0;
+ mp->m_agfrotor = 0;
+ atomic_set(&mp->m_agirotor, 0);
mp->m_maxagi = mp->m_sb.sb_agcount;
mp->m_blkbit_log = sbp->sb_blocklog + XFS_NBBYLOG;
mp->m_blkbb_log = sbp->sb_blocklog - BBSHIFT;
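Turning m_agirotor into an atomic_t removes m_agirotor_lock entirely; the rotor advance in xfs_dialloc() becomes a single fetch-and-increment. Both forms side by side, taken from the removed and added lines of this patch and shown here only for comparison:

/* Old: rotor protected by a spinlock, wrapped in place. */
spin_lock(&mp->m_agirotor_lock);
agno = mp->m_agirotor;
if (++mp->m_agirotor >= mp->m_maxagi)
	mp->m_agirotor = 0;
spin_unlock(&mp->m_agirotor_lock);

/* New: lock-free; wrap only the value we hand out. */
agno = (atomic_inc_return(&mp->m_agirotor) - 1) % mp->m_maxagi;

Note the atomic counter itself is no longer kept below m_maxagi; only the derived agno is reduced modulo m_maxagi, which is all a round-robin starting hint needs.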
diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c
index d75d82151eeb..c37e6d72760b 100644
--- a/fs/xfs/scrub/agheader_repair.c
+++ b/fs/xfs/scrub/agheader_repair.c
@@ -191,14 +191,15 @@ xrep_agf_init_header(
struct xfs_agf *old_agf)
{
struct xfs_mount *mp = sc->mp;
+ struct xfs_perag *pag = sc->sa.pag;
struct xfs_agf *agf = agf_bp->b_addr;
memcpy(old_agf, agf, sizeof(*old_agf));
memset(agf, 0, BBTOB(agf_bp->b_length));
agf->agf_magicnum = cpu_to_be32(XFS_AGF_MAGIC);
agf->agf_versionnum = cpu_to_be32(XFS_AGF_VERSION);
- agf->agf_seqno = cpu_to_be32(sc->sa.pag->pag_agno);
- agf->agf_length = cpu_to_be32(sc->sa.pag->block_count);
+ agf->agf_seqno = cpu_to_be32(pag->pag_agno);
+ agf->agf_length = cpu_to_be32(pag->block_count);
agf->agf_flfirst = old_agf->agf_flfirst;
agf->agf_fllast = old_agf->agf_fllast;
agf->agf_flcount = old_agf->agf_flcount;
@@ -206,8 +207,8 @@ xrep_agf_init_header(
uuid_copy(&agf->agf_uuid, &mp->m_sb.sb_meta_uuid);
/* Mark the incore AGF data stale until we're done fixing things. */
- ASSERT(sc->sa.pag->pagf_init);
- sc->sa.pag->pagf_init = 0;
+ ASSERT(xfs_perag_initialised_agf(pag));
+ clear_bit(XFS_AGSTATE_AGF_INIT, &pag->pag_opstate);
}
/* Set btree root information in an AGF. */
@@ -333,7 +334,7 @@ xrep_agf_commit_new(
pag->pagf_levels[XFS_BTNUM_RMAPi] =
be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAPi]);
pag->pagf_refcount_level = be32_to_cpu(agf->agf_refcount_level);
- pag->pagf_init = 1;
+ set_bit(XFS_AGSTATE_AGF_INIT, &pag->pag_opstate);
return 0;
}
@@ -434,7 +435,7 @@ xrep_agf(
out_revert:
/* Mark the incore AGF state stale and revert the AGF. */
- sc->sa.pag->pagf_init = 0;
+ clear_bit(XFS_AGSTATE_AGF_INIT, &sc->sa.pag->pag_opstate);
memcpy(agf, &old_agf, sizeof(old_agf));
return error;
}
@@ -618,7 +619,7 @@ xrep_agfl_update_agf(
xfs_force_summary_recalc(sc->mp);
/* Update the AGF counters. */
- if (sc->sa.pag->pagf_init)
+ if (xfs_perag_initialised_agf(sc->sa.pag))
sc->sa.pag->pagf_flcount = flcount;
agf->agf_flfirst = cpu_to_be32(0);
agf->agf_flcount = cpu_to_be32(flcount);
@@ -822,14 +823,15 @@ xrep_agi_init_header(
struct xfs_agi *old_agi)
{
struct xfs_agi *agi = agi_bp->b_addr;
+ struct xfs_perag *pag = sc->sa.pag;
struct xfs_mount *mp = sc->mp;
memcpy(old_agi, agi, sizeof(*old_agi));
memset(agi, 0, BBTOB(agi_bp->b_length));
agi->agi_magicnum = cpu_to_be32(XFS_AGI_MAGIC);
agi->agi_versionnum = cpu_to_be32(XFS_AGI_VERSION);
- agi->agi_seqno = cpu_to_be32(sc->sa.pag->pag_agno);
- agi->agi_length = cpu_to_be32(sc->sa.pag->block_count);
+ agi->agi_seqno = cpu_to_be32(pag->pag_agno);
+ agi->agi_length = cpu_to_be32(pag->block_count);
agi->agi_newino = cpu_to_be32(NULLAGINO);
agi->agi_dirino = cpu_to_be32(NULLAGINO);
if (xfs_has_crc(mp))
@@ -840,8 +842,8 @@ xrep_agi_init_header(
sizeof(agi->agi_unlinked));
/* Mark the incore AGF data stale until we're done fixing things. */
- ASSERT(sc->sa.pag->pagi_init);
- sc->sa.pag->pagi_init = 0;
+ ASSERT(xfs_perag_initialised_agi(pag));
+ clear_bit(XFS_AGSTATE_AGI_INIT, &pag->pag_opstate);
}
/* Set btree root information in an AGI. */
@@ -873,8 +875,7 @@ xrep_agi_calc_from_btrees(
xfs_agino_t freecount;
int error;
- cur = xfs_inobt_init_cursor(mp, sc->tp, agi_bp,
- sc->sa.pag, XFS_BTNUM_INO);
+ cur = xfs_inobt_init_cursor(sc->sa.pag, sc->tp, agi_bp, XFS_BTNUM_INO);
error = xfs_ialloc_count_inodes(cur, &count, &freecount);
if (error)
goto err;
@@ -894,8 +895,8 @@ xrep_agi_calc_from_btrees(
if (xfs_has_finobt(mp) && xfs_has_inobtcounts(mp)) {
xfs_agblock_t blocks;
- cur = xfs_inobt_init_cursor(mp, sc->tp, agi_bp,
- sc->sa.pag, XFS_BTNUM_FINO);
+ cur = xfs_inobt_init_cursor(sc->sa.pag, sc->tp, agi_bp,
+ XFS_BTNUM_FINO);
error = xfs_btree_count_blocks(cur, &blocks);
if (error)
goto err;
@@ -929,7 +930,7 @@ xrep_agi_commit_new(
pag = sc->sa.pag;
pag->pagi_count = be32_to_cpu(agi->agi_count);
pag->pagi_freecount = be32_to_cpu(agi->agi_freecount);
- pag->pagi_init = 1;
+ set_bit(XFS_AGSTATE_AGI_INIT, &pag->pag_opstate);
return 0;
}
@@ -994,7 +995,7 @@ xrep_agi(
out_revert:
/* Mark the incore AGI state stale and revert the AGI. */
- sc->sa.pag->pagi_init = 0;
+ clear_bit(XFS_AGSTATE_AGI_INIT, &sc->sa.pag->pag_opstate);
memcpy(agi, &old_agi, sizeof(old_agi));
return error;
}
diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c
index d50d0eab196a..dbbc7037074c 100644
--- a/fs/xfs/scrub/bmap.c
+++ b/fs/xfs/scrub/bmap.c
@@ -662,7 +662,7 @@ xchk_bmap_check_rmaps(
error = xchk_bmap_check_ag_rmaps(sc, whichfork, pag);
if (error ||
(sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) {
- xfs_perag_put(pag);
+ xfs_perag_rele(pag);
return error;
}
}
diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c
index 613260b04a3d..848a8e32e56f 100644
--- a/fs/xfs/scrub/common.c
+++ b/fs/xfs/scrub/common.c
@@ -478,15 +478,15 @@ xchk_ag_btcur_init(
/* Set up a inobt cursor for cross-referencing. */
if (sa->agi_bp &&
xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_INO)) {
- sa->ino_cur = xfs_inobt_init_cursor(mp, sc->tp, sa->agi_bp,
- sa->pag, XFS_BTNUM_INO);
+ sa->ino_cur = xfs_inobt_init_cursor(sa->pag, sc->tp, sa->agi_bp,
+ XFS_BTNUM_INO);
}
/* Set up a finobt cursor for cross-referencing. */
if (sa->agi_bp && xfs_has_finobt(mp) &&
xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_FINO)) {
- sa->fino_cur = xfs_inobt_init_cursor(mp, sc->tp, sa->agi_bp,
- sa->pag, XFS_BTNUM_FINO);
+ sa->fino_cur = xfs_inobt_init_cursor(sa->pag, sc->tp, sa->agi_bp,
+ XFS_BTNUM_FINO);
}
/* Set up a rmapbt cursor for cross-referencing. */
@@ -636,6 +636,7 @@ xchk_get_inode(
{
struct xfs_imap imap;
struct xfs_mount *mp = sc->mp;
+ struct xfs_perag *pag;
struct xfs_inode *ip_in = XFS_I(file_inode(sc->file));
struct xfs_inode *ip = NULL;
int error;
@@ -671,10 +672,14 @@ xchk_get_inode(
* Otherwise, we really couldn't find it so tell userspace
* that it no longer exists.
*/
- error = xfs_imap(sc->mp, sc->tp, sc->sm->sm_ino, &imap,
- XFS_IGET_UNTRUSTED | XFS_IGET_DONTCACHE);
- if (error)
- return -ENOENT;
+ pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, sc->sm->sm_ino));
+ if (pag) {
+ error = xfs_imap(pag, sc->tp, sc->sm->sm_ino, &imap,
+ XFS_IGET_UNTRUSTED | XFS_IGET_DONTCACHE);
+ xfs_perag_put(pag);
+ if (error)
+ return -ENOENT;
+ }
error = -EFSCORRUPTED;
fallthrough;
default:
diff --git a/fs/xfs/scrub/fscounters.c b/fs/xfs/scrub/fscounters.c
index 4777e7b89fdc..f0c7f41897b9 100644
--- a/fs/xfs/scrub/fscounters.c
+++ b/fs/xfs/scrub/fscounters.c
@@ -86,7 +86,8 @@ xchk_fscount_warmup(
for_each_perag(mp, agno, pag) {
if (xchk_should_terminate(sc, &error))
break;
- if (pag->pagi_init && pag->pagf_init)
+ if (xfs_perag_initialised_agi(pag) &&
+ xfs_perag_initialised_agf(pag))
continue;
/* Lock both AG headers. */
@@ -101,7 +102,8 @@ xchk_fscount_warmup(
* These are supposed to be initialized by the header read
* function.
*/
- if (!pag->pagi_init || !pag->pagf_init) {
+ if (!xfs_perag_initialised_agi(pag) ||
+ !xfs_perag_initialised_agf(pag)) {
error = -EFSCORRUPTED;
break;
}
@@ -117,7 +119,7 @@ xchk_fscount_warmup(
if (agi_bp)
xfs_buf_relse(agi_bp);
if (pag)
- xfs_perag_put(pag);
+ xfs_perag_rele(pag);
return error;
}
@@ -220,7 +222,8 @@ retry:
break;
/* This somehow got unset since the warmup? */
- if (!pag->pagi_init || !pag->pagf_init) {
+ if (!xfs_perag_initialised_agi(pag) ||
+ !xfs_perag_initialised_agf(pag)) {
error = -EFSCORRUPTED;
break;
}
@@ -249,7 +252,7 @@ retry:
}
if (pag)
- xfs_perag_put(pag);
+ xfs_perag_rele(pag);
if (error) {
xchk_set_incomplete(sc);
return error;
diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
index 4b92f9253ccd..1b71174ec0d6 100644
--- a/fs/xfs/scrub/repair.c
+++ b/fs/xfs/scrub/repair.c
@@ -206,7 +206,7 @@ xrep_calc_ag_resblks(
return 0;
pag = xfs_perag_get(mp, sm->sm_agno);
- if (pag->pagi_init) {
+ if (xfs_perag_initialised_agi(pag)) {
/* Use in-core icount if possible. */
icount = pag->pagi_count;
} else {
@@ -326,15 +326,14 @@ xrep_alloc_ag_block(
args.tp = sc->tp;
args.mp = sc->mp;
+ args.pag = sc->sa.pag;
args.oinfo = *oinfo;
- args.fsbno = XFS_AGB_TO_FSB(args.mp, sc->sa.pag->pag_agno, 0);
args.minlen = 1;
args.maxlen = 1;
args.prod = 1;
- args.type = XFS_ALLOCTYPE_THIS_AG;
args.resv = resv;
- error = xfs_alloc_vextent(&args);
+ error = xfs_alloc_vextent_this_ag(&args, sc->sa.pag->pag_agno);
if (error)
return error;
if (args.fsbno == NULLFSBLOCK)
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 867645b74d88..a09dd2606479 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -1410,7 +1410,7 @@ xfs_swap_extent_rmap(
/* Unmap the old blocks in the source file. */
while (tirec.br_blockcount) {
- ASSERT(tp->t_firstblock == NULLFSBLOCK);
+ ASSERT(tp->t_highest_agno == NULLAGNUMBER);
trace_xfs_swap_extent_rmap_remap_piece(tip, &tirec);
/* Read extent from the source file */
diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c
index bfc829c07f03..afc4c78b9eed 100644
--- a/fs/xfs/xfs_discard.c
+++ b/fs/xfs/xfs_discard.c
@@ -21,23 +21,20 @@
STATIC int
xfs_trim_extents(
- struct xfs_mount *mp,
- xfs_agnumber_t agno,
+ struct xfs_perag *pag,
xfs_daddr_t start,
xfs_daddr_t end,
xfs_daddr_t minlen,
uint64_t *blocks_trimmed)
{
+ struct xfs_mount *mp = pag->pag_mount;
struct block_device *bdev = mp->m_ddev_targp->bt_bdev;
struct xfs_btree_cur *cur;
struct xfs_buf *agbp;
struct xfs_agf *agf;
- struct xfs_perag *pag;
int error;
int i;
- pag = xfs_perag_get(mp, agno);
-
/*
* Force out the log. This means any transactions that might have freed
* space before we take the AGF buffer lock are now on disk, and the
@@ -47,7 +44,7 @@ xfs_trim_extents(
error = xfs_alloc_read_agf(pag, NULL, 0, &agbp);
if (error)
- goto out_put_perag;
+ return error;
agf = agbp->b_addr;
cur = xfs_allocbt_init_cursor(mp, NULL, agbp, pag, XFS_BTNUM_CNT);
@@ -71,10 +68,10 @@ xfs_trim_extents(
error = xfs_alloc_get_rec(cur, &fbno, &flen, &i);
if (error)
- goto out_del_cursor;
+ break;
if (XFS_IS_CORRUPT(mp, i != 1)) {
error = -EFSCORRUPTED;
- goto out_del_cursor;
+ break;
}
ASSERT(flen <= be32_to_cpu(agf->agf_longest));
@@ -83,15 +80,15 @@ xfs_trim_extents(
* the format the range/len variables are supplied in by
* userspace.
*/
- dbno = XFS_AGB_TO_DADDR(mp, agno, fbno);
+ dbno = XFS_AGB_TO_DADDR(mp, pag->pag_agno, fbno);
dlen = XFS_FSB_TO_BB(mp, flen);
/*
* Too small? Give up.
*/
if (dlen < minlen) {
- trace_xfs_discard_toosmall(mp, agno, fbno, flen);
- goto out_del_cursor;
+ trace_xfs_discard_toosmall(mp, pag->pag_agno, fbno, flen);
+ break;
}
/*
@@ -100,7 +97,7 @@ xfs_trim_extents(
* down partially overlapping ranges for now.
*/
if (dbno + dlen < start || dbno > end) {
- trace_xfs_discard_exclude(mp, agno, fbno, flen);
+ trace_xfs_discard_exclude(mp, pag->pag_agno, fbno, flen);
goto next_extent;
}
@@ -109,32 +106,30 @@ xfs_trim_extents(
* discard and try again the next time.
*/
if (xfs_extent_busy_search(mp, pag, fbno, flen)) {
- trace_xfs_discard_busy(mp, agno, fbno, flen);
+ trace_xfs_discard_busy(mp, pag->pag_agno, fbno, flen);
goto next_extent;
}
- trace_xfs_discard_extent(mp, agno, fbno, flen);
+ trace_xfs_discard_extent(mp, pag->pag_agno, fbno, flen);
error = blkdev_issue_discard(bdev, dbno, dlen, GFP_NOFS);
if (error)
- goto out_del_cursor;
+ break;
*blocks_trimmed += flen;
next_extent:
error = xfs_btree_decrement(cur, 0, &i);
if (error)
- goto out_del_cursor;
+ break;
if (fatal_signal_pending(current)) {
error = -ERESTARTSYS;
- goto out_del_cursor;
+ break;
}
}
out_del_cursor:
xfs_btree_del_cursor(cur, error);
xfs_buf_relse(agbp);
-out_put_perag:
- xfs_perag_put(pag);
return error;
}
@@ -152,11 +147,12 @@ xfs_ioc_trim(
struct xfs_mount *mp,
struct fstrim_range __user *urange)
{
+ struct xfs_perag *pag;
unsigned int granularity =
bdev_discard_granularity(mp->m_ddev_targp->bt_bdev);
struct fstrim_range range;
xfs_daddr_t start, end, minlen;
- xfs_agnumber_t start_agno, end_agno, agno;
+ xfs_agnumber_t agno;
uint64_t blocks_trimmed = 0;
int error, last_error = 0;
@@ -193,18 +189,18 @@ xfs_ioc_trim(
end = start + BTOBBT(range.len) - 1;
if (end > XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks) - 1)
- end = XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)- 1;
-
- start_agno = xfs_daddr_to_agno(mp, start);
- end_agno = xfs_daddr_to_agno(mp, end);
+ end = XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks) - 1;
- for (agno = start_agno; agno <= end_agno; agno++) {
- error = xfs_trim_extents(mp, agno, start, end, minlen,
+ agno = xfs_daddr_to_agno(mp, start);
+ for_each_perag_range(mp, agno, xfs_daddr_to_agno(mp, end), pag) {
+ error = xfs_trim_extents(pag, start, end, minlen,
&blocks_trimmed);
if (error) {
last_error = error;
- if (error == -ERESTARTSYS)
+ if (error == -ERESTARTSYS) {
+ xfs_perag_rele(pag);
break;
+ }
}
}
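One consequence of switching xfs_ioc_trim() to for_each_perag_range() is visible in the -ERESTARTSYS case above: the iterator releases the perag reference when it advances, so an early break must release the current reference by hand. The general pattern, as a sketch (do_work() is a hypothetical per-AG worker):

struct xfs_perag	*pag;
xfs_agnumber_t		agno;
int			error = 0;

agno = xfs_daddr_to_agno(mp, start);
for_each_perag_range(mp, agno, xfs_daddr_to_agno(mp, end), pag) {
	error = do_work(pag);		/* hypothetical per-AG worker */
	if (error == -ERESTARTSYS) {
		/* Iterator still holds this pag: drop it ourselves. */
		xfs_perag_rele(pag);
		break;
	}
}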
diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c
index 34b21a29c39b..22c13933c8f8 100644
--- a/fs/xfs/xfs_filestream.c
+++ b/fs/xfs/xfs_filestream.c
@@ -12,6 +12,7 @@
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_bmap.h"
+#include "xfs_bmap_util.h"
#include "xfs_alloc.h"
#include "xfs_mru_cache.h"
#include "xfs_trace.h"
@@ -22,7 +23,7 @@
struct xfs_fstrm_item {
struct xfs_mru_cache_elem mru;
- xfs_agnumber_t ag; /* AG in use for this directory */
+ struct xfs_perag *pag; /* AG in use for this directory */
};
enum xfs_fstrm_alloc {
@@ -30,117 +31,68 @@ enum xfs_fstrm_alloc {
XFS_PICK_LOWSPACE = 2,
};
-/*
- * Allocation group filestream associations are tracked with per-ag atomic
- * counters. These counters allow xfs_filestream_pick_ag() to tell whether a
- * particular AG already has active filestreams associated with it.
- */
-int
-xfs_filestream_peek_ag(
- xfs_mount_t *mp,
- xfs_agnumber_t agno)
-{
- struct xfs_perag *pag;
- int ret;
-
- pag = xfs_perag_get(mp, agno);
- ret = atomic_read(&pag->pagf_fstrms);
- xfs_perag_put(pag);
- return ret;
-}
-
-static int
-xfs_filestream_get_ag(
- xfs_mount_t *mp,
- xfs_agnumber_t agno)
-{
- struct xfs_perag *pag;
- int ret;
-
- pag = xfs_perag_get(mp, agno);
- ret = atomic_inc_return(&pag->pagf_fstrms);
- xfs_perag_put(pag);
- return ret;
-}
-
-static void
-xfs_filestream_put_ag(
- xfs_mount_t *mp,
- xfs_agnumber_t agno)
-{
- struct xfs_perag *pag;
-
- pag = xfs_perag_get(mp, agno);
- atomic_dec(&pag->pagf_fstrms);
- xfs_perag_put(pag);
-}
-
static void
xfs_fstrm_free_func(
void *data,
struct xfs_mru_cache_elem *mru)
{
- struct xfs_mount *mp = data;
struct xfs_fstrm_item *item =
container_of(mru, struct xfs_fstrm_item, mru);
+ struct xfs_perag *pag = item->pag;
- xfs_filestream_put_ag(mp, item->ag);
- trace_xfs_filestream_free(mp, mru->key, item->ag);
+ trace_xfs_filestream_free(pag, mru->key);
+ atomic_dec(&pag->pagf_fstrms);
+ xfs_perag_rele(pag);
kmem_free(item);
}
/*
- * Scan the AGs starting at startag looking for an AG that isn't in use and has
- * at least minlen blocks free.
+ * Scan the AGs starting at start_agno looking for an AG that isn't in use and
+ * has at least minlen blocks free. If no AG is found to match the allocation
+ * requirements, pick the AG with the most free space in it.
*/
static int
xfs_filestream_pick_ag(
- struct xfs_inode *ip,
- xfs_agnumber_t startag,
- xfs_agnumber_t *agp,
+ struct xfs_alloc_arg *args,
+ xfs_ino_t pino,
+ xfs_agnumber_t start_agno,
int flags,
- xfs_extlen_t minlen)
+ xfs_extlen_t *longest)
{
- struct xfs_mount *mp = ip->i_mount;
- struct xfs_fstrm_item *item;
+ struct xfs_mount *mp = args->mp;
struct xfs_perag *pag;
- xfs_extlen_t longest, free = 0, minfree, maxfree = 0;
- xfs_agnumber_t ag, max_ag = NULLAGNUMBER;
- int err, trylock, nscan;
-
- ASSERT(S_ISDIR(VFS_I(ip)->i_mode));
+ struct xfs_perag *max_pag = NULL;
+ xfs_extlen_t minlen = *longest;
+ xfs_extlen_t free = 0, minfree, maxfree = 0;
+ xfs_agnumber_t agno;
+ bool first_pass = true;
+ int err;
/* 2% of an AG's blocks must be free for it to be chosen. */
minfree = mp->m_sb.sb_agblocks / 50;
- ag = startag;
- *agp = NULLAGNUMBER;
-
- /* For the first pass, don't sleep trying to init the per-AG. */
- trylock = XFS_ALLOC_FLAG_TRYLOCK;
-
- for (nscan = 0; 1; nscan++) {
- trace_xfs_filestream_scan(mp, ip->i_ino, ag);
-
- pag = xfs_perag_get(mp, ag);
-
- if (!pag->pagf_init) {
- err = xfs_alloc_read_agf(pag, NULL, trylock, NULL);
- if (err) {
- if (err != -EAGAIN) {
- xfs_perag_put(pag);
- return err;
- }
- /* Couldn't lock the AGF, skip this AG. */
- goto next_ag;
- }
+restart:
+ for_each_perag_wrap(mp, start_agno, agno, pag) {
+ trace_xfs_filestream_scan(pag, pino);
+ *longest = 0;
+ err = xfs_bmap_longest_free_extent(pag, NULL, longest);
+ if (err) {
+ xfs_perag_rele(pag);
+ if (err != -EAGAIN)
+ break;
+ /* Couldn't lock the AGF, skip this AG. */
+ err = 0;
+ continue;
}
/* Keep track of the AG with the most free blocks. */
if (pag->pagf_freeblks > maxfree) {
maxfree = pag->pagf_freeblks;
- max_ag = ag;
+ if (max_pag)
+ xfs_perag_rele(max_pag);
+ atomic_inc(&pag->pag_active_ref);
+ max_pag = pag;
}
/*
@@ -149,93 +101,73 @@ xfs_filestream_pick_ag(
* loop, and it guards against two filestreams being established
* in the same AG as each other.
*/
- if (xfs_filestream_get_ag(mp, ag) > 1) {
- xfs_filestream_put_ag(mp, ag);
- goto next_ag;
- }
-
- longest = xfs_alloc_longest_free_extent(pag,
- xfs_alloc_min_freelist(mp, pag),
- xfs_ag_resv_needed(pag, XFS_AG_RESV_NONE));
- if (((minlen && longest >= minlen) ||
- (!minlen && pag->pagf_freeblks >= minfree)) &&
- (!pag->pagf_metadata || !(flags & XFS_PICK_USERDATA) ||
- (flags & XFS_PICK_LOWSPACE))) {
-
- /* Break out, retaining the reference on the AG. */
- free = pag->pagf_freeblks;
- xfs_perag_put(pag);
- *agp = ag;
- break;
+ if (atomic_inc_return(&pag->pagf_fstrms) <= 1) {
+ if (((minlen && *longest >= minlen) ||
+ (!minlen && pag->pagf_freeblks >= minfree)) &&
+ (!xfs_perag_prefers_metadata(pag) ||
+ !(flags & XFS_PICK_USERDATA) ||
+ (flags & XFS_PICK_LOWSPACE))) {
+ /* Break out, retaining the reference on the AG. */
+ free = pag->pagf_freeblks;
+ break;
+ }
}
/* Drop the reference on this AG, it's not usable. */
- xfs_filestream_put_ag(mp, ag);
-next_ag:
- xfs_perag_put(pag);
- /* Move to the next AG, wrapping to AG 0 if necessary. */
- if (++ag >= mp->m_sb.sb_agcount)
- ag = 0;
-
- /* If a full pass of the AGs hasn't been done yet, continue. */
- if (ag != startag)
- continue;
+ atomic_dec(&pag->pagf_fstrms);
+ }
- /* Allow sleeping in xfs_alloc_read_agf() on the 2nd pass. */
- if (trylock != 0) {
- trylock = 0;
- continue;
+ if (err) {
+ xfs_perag_rele(pag);
+ if (max_pag)
+ xfs_perag_rele(max_pag);
+ return err;
+ }
+
+ if (!pag) {
+ /*
+ * Allow a second pass to give xfs_bmap_longest_free_extent()
+ * another attempt at locking AGFs that it might have skipped
+ * over before we fail.
+ */
+ if (first_pass) {
+ first_pass = false;
+ goto restart;
}
- /* Finally, if lowspace wasn't set, set it for the 3rd pass. */
+ /*
+ * We must be low on data space, so run a final lowspace
+ * optimised selection pass if we haven't already.
+ */
if (!(flags & XFS_PICK_LOWSPACE)) {
flags |= XFS_PICK_LOWSPACE;
- continue;
+ goto restart;
}
/*
- * Take the AG with the most free space, regardless of whether
- * it's already in use by another filestream.
+ * No unassociated AGs are available, so select the AG with the
+ * most free space, regardless of whether it's already in use by
+ * another filestream. If none suit, just use whatever AG we can
+ * grab.
*/
- if (max_ag != NULLAGNUMBER) {
- xfs_filestream_get_ag(mp, max_ag);
+ if (!max_pag) {
+ for_each_perag_wrap(args->mp, 0, start_agno, args->pag)
+ break;
+ atomic_inc(&args->pag->pagf_fstrms);
+ *longest = 0;
+ } else {
+ pag = max_pag;
free = maxfree;
- *agp = max_ag;
- break;
+ atomic_inc(&pag->pagf_fstrms);
}
-
- /* take AG 0 if none matched */
- trace_xfs_filestream_pick(ip, *agp, free, nscan);
- *agp = 0;
- return 0;
- }
-
- trace_xfs_filestream_pick(ip, *agp, free, nscan);
-
- if (*agp == NULLAGNUMBER)
- return 0;
-
- err = -ENOMEM;
- item = kmem_alloc(sizeof(*item), KM_MAYFAIL);
- if (!item)
- goto out_put_ag;
-
- item->ag = *agp;
-
- err = xfs_mru_cache_insert(mp->m_filestream, ip->i_ino, &item->mru);
- if (err) {
- if (err == -EEXIST)
- err = 0;
- goto out_free_item;
+ } else if (max_pag) {
+ xfs_perag_rele(max_pag);
}
+ trace_xfs_filestream_pick(pag, pino, free);
+ args->pag = pag;
return 0;
-out_free_item:
- kmem_free(item);
-out_put_ag:
- xfs_filestream_put_ag(mp, *agp);
- return err;
}
static struct xfs_inode *
@@ -263,104 +195,187 @@ out:
}
/*
- * Find the right allocation group for a file, either by finding an
- * existing file stream or creating a new one.
+ * Look up the mru cache for an existing association. If one exists and we can
+ * use it, return with an active perag reference indicating that the allocation
+ * will proceed with that association.
*
- * Returns NULLAGNUMBER in case of an error.
+ * If we have no association, or we cannot use the current one and have to
+ * destroy it, return with longest = 0 to tell the caller to create a new
+ * association.
*/
-xfs_agnumber_t
-xfs_filestream_lookup_ag(
- struct xfs_inode *ip)
+static int
+xfs_filestream_lookup_association(
+ struct xfs_bmalloca *ap,
+ struct xfs_alloc_arg *args,
+ xfs_ino_t pino,
+ xfs_extlen_t *longest)
{
- struct xfs_mount *mp = ip->i_mount;
- struct xfs_inode *pip = NULL;
- xfs_agnumber_t startag, ag = NULLAGNUMBER;
+ struct xfs_mount *mp = args->mp;
+ struct xfs_perag *pag;
struct xfs_mru_cache_elem *mru;
+ int error = 0;
- ASSERT(S_ISREG(VFS_I(ip)->i_mode));
-
- pip = xfs_filestream_get_parent(ip);
- if (!pip)
- return NULLAGNUMBER;
+ *longest = 0;
+ mru = xfs_mru_cache_lookup(mp->m_filestream, pino);
+ if (!mru)
+ return 0;
+ /*
+ * Grab the pag and take an extra active reference for the caller while
+ * the mru lookup still holds the item and it cannot go away. This means
+ * we'll pin the perag with
+ * the reference we get here even if the filestreams association is torn
+ * down immediately after we mark the lookup as done.
+ */
+ pag = container_of(mru, struct xfs_fstrm_item, mru)->pag;
+ atomic_inc(&pag->pag_active_ref);
+ xfs_mru_cache_done(mp->m_filestream);
- mru = xfs_mru_cache_lookup(mp->m_filestream, pip->i_ino);
- if (mru) {
- ag = container_of(mru, struct xfs_fstrm_item, mru)->ag;
- xfs_mru_cache_done(mp->m_filestream);
+ trace_xfs_filestream_lookup(pag, ap->ip->i_ino);
- trace_xfs_filestream_lookup(mp, ip->i_ino, ag);
- goto out;
- }
+ ap->blkno = XFS_AGB_TO_FSB(args->mp, pag->pag_agno, 0);
+ xfs_bmap_adjacent(ap);
/*
- * Set the starting AG using the rotor for inode32, otherwise
- * use the directory inode's AG.
+ * If there is very little free space before we start a filestreams
+ * allocation, we're almost guaranteed to fail to find a large enough
+ * contiguous free extent, so just use the cached AG.
*/
- if (xfs_is_inode32(mp)) {
- xfs_agnumber_t rotorstep = xfs_rotorstep;
- startag = (mp->m_agfrotor / rotorstep) % mp->m_sb.sb_agcount;
- mp->m_agfrotor = (mp->m_agfrotor + 1) %
- (mp->m_sb.sb_agcount * rotorstep);
- } else
- startag = XFS_INO_TO_AGNO(mp, pip->i_ino);
+ if (ap->tp->t_flags & XFS_TRANS_LOWMODE) {
+ *longest = 1;
+ goto out_done;
+ }
- if (xfs_filestream_pick_ag(pip, startag, &ag, 0, 0))
- ag = NULLAGNUMBER;
-out:
- xfs_irele(pip);
- return ag;
+ error = xfs_bmap_longest_free_extent(pag, args->tp, longest);
+ if (error == -EAGAIN)
+ error = 0;
+ if (error || *longest < args->maxlen) {
+ /* We aren't going to use this perag */
+ *longest = 0;
+ xfs_perag_rele(pag);
+ return error;
+ }
+
+out_done:
+ args->pag = pag;
+ return 0;
}
-/*
- * Pick a new allocation group for the current file and its file stream.
- *
- * This is called when the allocator can't find a suitable extent in the
- * current AG, and we have to move the stream into a new AG with more space.
- */
-int
-xfs_filestream_new_ag(
+static int
+xfs_filestream_create_association(
struct xfs_bmalloca *ap,
- xfs_agnumber_t *agp)
+ struct xfs_alloc_arg *args,
+ xfs_ino_t pino,
+ xfs_extlen_t *longest)
{
- struct xfs_inode *ip = ap->ip, *pip;
- struct xfs_mount *mp = ip->i_mount;
- xfs_extlen_t minlen = ap->length;
- xfs_agnumber_t startag = 0;
- int flags = 0;
- int err = 0;
+ struct xfs_mount *mp = args->mp;
struct xfs_mru_cache_elem *mru;
+ struct xfs_fstrm_item *item;
+ xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, pino);
+ int flags = 0;
+ int error;
- *agp = NULLAGNUMBER;
-
- pip = xfs_filestream_get_parent(ip);
- if (!pip)
- goto exit;
-
- mru = xfs_mru_cache_remove(mp->m_filestream, pip->i_ino);
+ /* Changing parent AG association now, so remove the existing one. */
+ mru = xfs_mru_cache_remove(mp->m_filestream, pino);
if (mru) {
struct xfs_fstrm_item *item =
container_of(mru, struct xfs_fstrm_item, mru);
- startag = (item->ag + 1) % mp->m_sb.sb_agcount;
+
+ agno = (item->pag->pag_agno + 1) % mp->m_sb.sb_agcount;
+ xfs_fstrm_free_func(mp, mru);
+ } else if (xfs_is_inode32(mp)) {
+ xfs_agnumber_t rotorstep = xfs_rotorstep;
+
+ agno = (mp->m_agfrotor / rotorstep) % mp->m_sb.sb_agcount;
+ mp->m_agfrotor = (mp->m_agfrotor + 1) %
+ (mp->m_sb.sb_agcount * rotorstep);
}
+ ap->blkno = XFS_AGB_TO_FSB(args->mp, agno, 0);
+ xfs_bmap_adjacent(ap);
+
if (ap->datatype & XFS_ALLOC_USERDATA)
flags |= XFS_PICK_USERDATA;
if (ap->tp->t_flags & XFS_TRANS_LOWMODE)
flags |= XFS_PICK_LOWSPACE;
- err = xfs_filestream_pick_ag(pip, startag, agp, flags, minlen);
+ *longest = ap->length;
+ error = xfs_filestream_pick_ag(args, pino, agno, flags, longest);
+ if (error)
+ return error;
/*
- * Only free the item here so we skip over the old AG earlier.
+ * We are going to use this perag now, so create an association for it.
+ * xfs_filestream_pick_ag() has already bumped the perag fstrms counter
+ * for us, so all we need to do here is take another active reference to
+ * the perag for the cached association.
+ *
+ * If we fail to store the association, we need to drop the fstrms
+ * counter as well as drop the perag reference we take here for the
+ * item. We do not need to return an error for this failure - as long as
+ * we return a referenced AG, the allocation can still go ahead just
+ * fine.
*/
- if (mru)
- xfs_fstrm_free_func(mp, mru);
+ item = kmem_alloc(sizeof(*item), KM_MAYFAIL);
+ if (!item)
+ goto out_put_fstrms;
+
+ atomic_inc(&args->pag->pag_active_ref);
+ item->pag = args->pag;
+ error = xfs_mru_cache_insert(mp->m_filestream, pino, &item->mru);
+ if (error)
+ goto out_free_item;
+ return 0;
+
+out_free_item:
+ xfs_perag_rele(item->pag);
+ kmem_free(item);
+out_put_fstrms:
+ atomic_dec(&args->pag->pagf_fstrms);
+ return 0;
+}
+
+/*
+ * Search for an allocation group with a single extent large enough for
+ * the request. First we look for an existing association and use that if it
+ * is found. Otherwise, we create a new association by selecting an AG that fits
+ * the allocation criteria.
+ *
+ * We return with a referenced perag in args->pag to indicate which AG we are
+ * allocating into or an error with no references held.
+ */
+int
+xfs_filestream_select_ag(
+ struct xfs_bmalloca *ap,
+ struct xfs_alloc_arg *args,
+ xfs_extlen_t *longest)
+{
+ struct xfs_mount *mp = args->mp;
+ struct xfs_inode *pip;
+ xfs_ino_t ino = 0;
+ int error = 0;
+
+ *longest = 0;
+ args->total = ap->total;
+ pip = xfs_filestream_get_parent(ap->ip);
+ if (pip) {
+ ino = pip->i_ino;
+ error = xfs_filestream_lookup_association(ap, args, ino,
+ longest);
+ xfs_irele(pip);
+ if (error)
+ return error;
+ if (*longest >= args->maxlen)
+ goto out_select;
+ if (ap->tp->t_flags & XFS_TRANS_LOWMODE)
+ goto out_select;
+ }
+
+ error = xfs_filestream_create_association(ap, args, ino, longest);
+ if (error)
+ return error;
- xfs_irele(pip);
-exit:
- if (*agp == NULLAGNUMBER)
- *agp = 0;
- return err;
+out_select:
+ ap->blkno = XFS_AGB_TO_FSB(mp, args->pag->pag_agno, 0);
+ return 0;
}
void
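The filestreams rewrite keys the whole association lifecycle off perag references: the MRU item holds one active reference plus a pagf_fstrms count, and the allocating context holds its own. A condensed sketch of the setup and teardown halves, mirroring xfs_filestream_create_association() and xfs_fstrm_free_func() above (fields simplified):

/* Setup: xfs_filestream_pick_ag() already bumped pagf_fstrms. */
atomic_inc(&pag->pag_active_ref);	/* MRU item's own reference */
item->pag = pag;
if (xfs_mru_cache_insert(mp->m_filestream, ino, &item->mru)) {
	/* Insert failed: undo both counts; allocation still proceeds. */
	xfs_perag_rele(item->pag);
	kmem_free(item);
	atomic_dec(&pag->pagf_fstrms);
}

/* Teardown (MRU reclaim): drop the stream count, then the reference. */
atomic_dec(&pag->pagf_fstrms);
xfs_perag_rele(pag);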
diff --git a/fs/xfs/xfs_filestream.h b/fs/xfs/xfs_filestream.h
index 403226ebb80b..84149ed0e340 100644
--- a/fs/xfs/xfs_filestream.h
+++ b/fs/xfs/xfs_filestream.h
@@ -9,13 +9,13 @@
struct xfs_mount;
struct xfs_inode;
struct xfs_bmalloca;
+struct xfs_alloc_arg;
int xfs_filestream_mount(struct xfs_mount *mp);
void xfs_filestream_unmount(struct xfs_mount *mp);
void xfs_filestream_deassociate(struct xfs_inode *ip);
-xfs_agnumber_t xfs_filestream_lookup_ag(struct xfs_inode *ip);
-int xfs_filestream_new_ag(struct xfs_bmalloca *ap, xfs_agnumber_t *agp);
-int xfs_filestream_peek_ag(struct xfs_mount *mp, xfs_agnumber_t agno);
+int xfs_filestream_select_ag(struct xfs_bmalloca *ap,
+ struct xfs_alloc_arg *args, xfs_extlen_t *blen);
static inline int
xfs_inode_is_filestream(
diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c
index 88a88506ffff..59e7d1a14b67 100644
--- a/fs/xfs/xfs_fsmap.c
+++ b/fs/xfs/xfs_fsmap.c
@@ -688,11 +688,11 @@ __xfs_getfsmap_datadev(
info->agf_bp = NULL;
}
if (info->pag) {
- xfs_perag_put(info->pag);
+ xfs_perag_rele(info->pag);
info->pag = NULL;
} else if (pag) {
/* loop termination case */
- xfs_perag_put(pag);
+ xfs_perag_rele(pag);
}
return error;
@@ -761,6 +761,7 @@ xfs_getfsmap_datadev_bnobt(
{
struct xfs_alloc_rec_incore akeys[2];
+ memset(akeys, 0, sizeof(akeys));
info->missing_owner = XFS_FMR_OWN_UNKNOWN;
return __xfs_getfsmap_datadev(tp, keys, info,
xfs_getfsmap_datadev_bnobt_query, &akeys[0]);
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index ddeaccc04aec..c9a7e270a428 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -255,7 +255,7 @@ xfs_perag_set_inode_tag(
break;
}
- trace_xfs_perag_set_inode_tag(mp, pag->pag_agno, tag, _RET_IP_);
+ trace_xfs_perag_set_inode_tag(pag, _RET_IP_);
}
/* Clear a tag on both the AG incore inode tree and the AG radix tree. */
@@ -289,7 +289,7 @@ xfs_perag_clear_inode_tag(
radix_tree_tag_clear(&mp->m_perag_tree, pag->pag_agno, tag);
spin_unlock(&mp->m_perag_lock);
- trace_xfs_perag_clear_inode_tag(mp, pag->pag_agno, tag, _RET_IP_);
+ trace_xfs_perag_clear_inode_tag(pag, _RET_IP_);
}
/*
@@ -586,7 +586,7 @@ xfs_iget_cache_miss(
if (!ip)
return -ENOMEM;
- error = xfs_imap(mp, tp, ip->i_ino, &ip->i_imap, flags);
+ error = xfs_imap(pag, tp, ip->i_ino, &ip->i_imap, flags);
if (error)
goto out_destroy;
@@ -1767,7 +1767,7 @@ xfs_icwalk(
if (error) {
last_error = error;
if (error == -EFSCORRUPTED) {
- xfs_perag_put(pag);
+ xfs_perag_rele(pag);
break;
}
}
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 7f1d715faab5..5808abab786c 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1367,7 +1367,7 @@ xfs_itruncate_extents_flags(
unmap_len = XFS_MAX_FILEOFF - first_unmap_block + 1;
while (unmap_len > 0) {
- ASSERT(tp->t_firstblock == NULLFSBLOCK);
+ ASSERT(tp->t_highest_agno == NULLAGNUMBER);
error = __xfs_bunmapi(tp, ip, first_unmap_block, &unmap_len,
flags, XFS_ITRUNC_MAX_EXTENTS);
if (error)
diff --git a/fs/xfs/xfs_iwalk.c b/fs/xfs/xfs_iwalk.c
index 7558486f4937..21be93bf006d 100644
--- a/fs/xfs/xfs_iwalk.c
+++ b/fs/xfs/xfs_iwalk.c
@@ -275,7 +275,7 @@ xfs_iwalk_ag_start(
/* Set up a fresh cursor and empty the inobt cache. */
iwag->nr_recs = 0;
- error = xfs_inobt_cur(mp, tp, pag, XFS_BTNUM_INO, curpp, agi_bpp);
+ error = xfs_inobt_cur(pag, tp, XFS_BTNUM_INO, curpp, agi_bpp);
if (error)
return error;
@@ -390,7 +390,7 @@ xfs_iwalk_run_callbacks(
}
/* ...and recreate the cursor just past where we left off. */
- error = xfs_inobt_cur(mp, iwag->tp, iwag->pag, XFS_BTNUM_INO, curpp,
+ error = xfs_inobt_cur(iwag->pag, iwag->tp, XFS_BTNUM_INO, curpp,
agi_bpp);
if (error)
return error;
@@ -591,7 +591,7 @@ xfs_iwalk(
}
if (iwag.pag)
- xfs_perag_put(pag);
+ xfs_perag_rele(pag);
xfs_iwalk_free(&iwag);
return error;
}
@@ -683,7 +683,7 @@ xfs_iwalk_threaded(
break;
}
if (pag)
- xfs_perag_put(pag);
+ xfs_perag_rele(pag);
if (polled)
xfs_pwork_poll(&pctl);
return xfs_pwork_destroy(&pctl);
@@ -776,7 +776,7 @@ xfs_inobt_walk(
}
if (iwag.pag)
- xfs_perag_put(pag);
+ xfs_perag_rele(pag);
xfs_iwalk_free(&iwag);
return error;
}
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 8aca2cc173ac..f3269c0626f0 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -210,8 +210,7 @@ typedef struct xfs_mount {
struct xfs_error_cfg m_error_cfg[XFS_ERR_CLASS_MAX][XFS_ERR_ERRNO_MAX];
struct xstats m_stats; /* per-fs stats */
xfs_agnumber_t m_agfrotor; /* last ag where space found */
- xfs_agnumber_t m_agirotor; /* last ag dir inode alloced */
- spinlock_t m_agirotor_lock;/* .. and lock protecting it */
+ atomic_t m_agirotor; /* last ag dir inode alloced */
/* Memory shrinker to throttle and reprioritize inodegc */
struct shrinker m_inodegc_shrinker;
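With m_agirotor now an atomic_t, the rotor advances locklessly. A minimal sketch of the fetch-and-increment, assuming the rotor still wraps at m_maxagi as the old spinlocked code did (the helper name is illustrative, not from this patch):

	static inline xfs_agnumber_t
	xfs_agirotor_next(
		struct xfs_mount	*mp)
	{
		/* Old rotor value, advanced without m_agirotor_lock. */
		return (atomic_inc_return(&mp->m_agirotor) - 1) % mp->m_maxagi;
	}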
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index 5535778a98f9..f5dc46ce9803 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -610,7 +610,7 @@ xfs_reflink_cancel_cow_blocks(
if (error)
break;
} else if (del.br_state == XFS_EXT_UNWRITTEN || cancel_real) {
- ASSERT((*tpp)->t_firstblock == NULLFSBLOCK);
+ ASSERT((*tpp)->t_highest_agno == NULLAGNUMBER);
/* Free the CoW orphan record. */
xfs_refcount_free_cow_extent(*tpp, del.br_startblock,
@@ -927,7 +927,7 @@ xfs_reflink_recover_cow(
for_each_perag(mp, agno, pag) {
error = xfs_refcount_recover_cow_leftovers(mp, pag);
if (error) {
- xfs_perag_put(pag);
+ xfs_perag_rele(pag);
break;
}
}
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 0c4b73e9b29d..2479b5cbd75e 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -247,6 +247,32 @@ xfs_fs_show_options(
return 0;
}
+static bool
+xfs_set_inode_alloc_perag(
+ struct xfs_perag *pag,
+ xfs_ino_t ino,
+ xfs_agnumber_t max_metadata)
+{
+ if (!xfs_is_inode32(pag->pag_mount)) {
+ set_bit(XFS_AGSTATE_ALLOWS_INODES, &pag->pag_opstate);
+ clear_bit(XFS_AGSTATE_PREFERS_METADATA, &pag->pag_opstate);
+ return false;
+ }
+
+ if (ino > XFS_MAXINUMBER_32) {
+ clear_bit(XFS_AGSTATE_ALLOWS_INODES, &pag->pag_opstate);
+ clear_bit(XFS_AGSTATE_PREFERS_METADATA, &pag->pag_opstate);
+ return false;
+ }
+
+ set_bit(XFS_AGSTATE_ALLOWS_INODES, &pag->pag_opstate);
+ if (pag->pag_agno < max_metadata)
+ set_bit(XFS_AGSTATE_PREFERS_METADATA, &pag->pag_opstate);
+ else
+ clear_bit(XFS_AGSTATE_PREFERS_METADATA, &pag->pag_opstate);
+ return true;
+}
+
/*
* Set parameters for inode allocation heuristics, taking into account
* filesystem size and inode32/inode64 mount options; i.e. specifically
@@ -310,24 +336,8 @@ xfs_set_inode_alloc(
ino = XFS_AGINO_TO_INO(mp, index, agino);
pag = xfs_perag_get(mp, index);
-
- if (xfs_is_inode32(mp)) {
- if (ino > XFS_MAXINUMBER_32) {
- pag->pagi_inodeok = 0;
- pag->pagf_metadata = 0;
- } else {
- pag->pagi_inodeok = 1;
- maxagi++;
- if (index < max_metadata)
- pag->pagf_metadata = 1;
- else
- pag->pagf_metadata = 0;
- }
- } else {
- pag->pagi_inodeok = 1;
- pag->pagf_metadata = 0;
- }
-
+ if (xfs_set_inode_alloc_perag(pag, ino, max_metadata))
+ maxagi++;
xfs_perag_put(pag);
}
@@ -1922,7 +1932,6 @@ static int xfs_init_fs_context(
return -ENOMEM;
spin_lock_init(&mp->m_sb_lock);
- spin_lock_init(&mp->m_agirotor_lock);
INIT_RADIX_TREE(&mp->m_perag_tree, GFP_ATOMIC);
spin_lock_init(&mp->m_perag_lock);
mutex_init(&mp->m_growlock);
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 6b0e9ae7c513..7dc0fd6a6504 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -74,6 +74,7 @@ struct xfs_inobt_rec_incore;
union xfs_btree_ptr;
struct xfs_dqtrx;
struct xfs_icwalk;
+struct xfs_perag;
#define XFS_ATTR_FILTER_FLAGS \
{ XFS_ATTR_ROOT, "ROOT" }, \
@@ -159,36 +160,40 @@ TRACE_EVENT(xlog_intent_recovery_failed,
);
DECLARE_EVENT_CLASS(xfs_perag_class,
- TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, int refcount,
- unsigned long caller_ip),
- TP_ARGS(mp, agno, refcount, caller_ip),
+ TP_PROTO(struct xfs_perag *pag, unsigned long caller_ip),
+ TP_ARGS(pag, caller_ip),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_agnumber_t, agno)
__field(int, refcount)
+ __field(int, active_refcount)
__field(unsigned long, caller_ip)
),
TP_fast_assign(
- __entry->dev = mp->m_super->s_dev;
- __entry->agno = agno;
- __entry->refcount = refcount;
+ __entry->dev = pag->pag_mount->m_super->s_dev;
+ __entry->agno = pag->pag_agno;
+ __entry->refcount = atomic_read(&pag->pag_ref);
+ __entry->active_refcount = atomic_read(&pag->pag_active_ref);
__entry->caller_ip = caller_ip;
),
- TP_printk("dev %d:%d agno 0x%x refcount %d caller %pS",
+ TP_printk("dev %d:%d agno 0x%x passive refs %d active refs %d caller %pS",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->agno,
__entry->refcount,
+ __entry->active_refcount,
(char *)__entry->caller_ip)
);
#define DEFINE_PERAG_REF_EVENT(name) \
DEFINE_EVENT(xfs_perag_class, name, \
- TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, int refcount, \
- unsigned long caller_ip), \
- TP_ARGS(mp, agno, refcount, caller_ip))
+ TP_PROTO(struct xfs_perag *pag, unsigned long caller_ip), \
+ TP_ARGS(pag, caller_ip))
DEFINE_PERAG_REF_EVENT(xfs_perag_get);
DEFINE_PERAG_REF_EVENT(xfs_perag_get_tag);
DEFINE_PERAG_REF_EVENT(xfs_perag_put);
+DEFINE_PERAG_REF_EVENT(xfs_perag_grab);
+DEFINE_PERAG_REF_EVENT(xfs_perag_grab_tag);
+DEFINE_PERAG_REF_EVENT(xfs_perag_rele);
DEFINE_PERAG_REF_EVENT(xfs_perag_set_inode_tag);
DEFINE_PERAG_REF_EVENT(xfs_perag_clear_inode_tag);
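The new grab/rele events trace the active reference count alongside the passive one. A hedged sketch of the grab side, consistent with the xfs_ag.c changes elsewhere in this patch (the exact body may differ): a zero pag_active_ref means the AG is being torn down, so the lookup fails rather than resurrecting it.

	struct xfs_perag *
	xfs_perag_grab(
		struct xfs_mount	*mp,
		xfs_agnumber_t		agno)
	{
		struct xfs_perag	*pag;

		rcu_read_lock();
		pag = radix_tree_lookup(&mp->m_perag_tree, agno);
		if (pag) {
			trace_xfs_perag_grab(pag, _RET_IP_);
			/* Refuse to grab an AG whose active count hit zero. */
			if (!atomic_inc_not_zero(&pag->pag_active_ref))
				pag = NULL;
		}
		rcu_read_unlock();
		return pag;
	}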
@@ -634,8 +639,8 @@ DEFINE_BUF_ITEM_EVENT(xfs_trans_bhold_release);
DEFINE_BUF_ITEM_EVENT(xfs_trans_binval);
DECLARE_EVENT_CLASS(xfs_filestream_class,
- TP_PROTO(struct xfs_mount *mp, xfs_ino_t ino, xfs_agnumber_t agno),
- TP_ARGS(mp, ino, agno),
+ TP_PROTO(struct xfs_perag *pag, xfs_ino_t ino),
+ TP_ARGS(pag, ino),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_ino_t, ino)
@@ -643,10 +648,10 @@ DECLARE_EVENT_CLASS(xfs_filestream_class,
__field(int, streams)
),
TP_fast_assign(
- __entry->dev = mp->m_super->s_dev;
+ __entry->dev = pag->pag_mount->m_super->s_dev;
__entry->ino = ino;
- __entry->agno = agno;
- __entry->streams = xfs_filestream_peek_ag(mp, agno);
+ __entry->agno = pag->pag_agno;
+ __entry->streams = atomic_read(&pag->pagf_fstrms);
),
TP_printk("dev %d:%d ino 0x%llx agno 0x%x streams %d",
MAJOR(__entry->dev), MINOR(__entry->dev),
@@ -656,39 +661,40 @@ DECLARE_EVENT_CLASS(xfs_filestream_class,
)
#define DEFINE_FILESTREAM_EVENT(name) \
DEFINE_EVENT(xfs_filestream_class, name, \
- TP_PROTO(struct xfs_mount *mp, xfs_ino_t ino, xfs_agnumber_t agno), \
- TP_ARGS(mp, ino, agno))
+ TP_PROTO(struct xfs_perag *pag, xfs_ino_t ino), \
+ TP_ARGS(pag, ino))
DEFINE_FILESTREAM_EVENT(xfs_filestream_free);
DEFINE_FILESTREAM_EVENT(xfs_filestream_lookup);
DEFINE_FILESTREAM_EVENT(xfs_filestream_scan);
TRACE_EVENT(xfs_filestream_pick,
- TP_PROTO(struct xfs_inode *ip, xfs_agnumber_t agno,
- xfs_extlen_t free, int nscan),
- TP_ARGS(ip, agno, free, nscan),
+ TP_PROTO(struct xfs_perag *pag, xfs_ino_t ino, xfs_extlen_t free),
+ TP_ARGS(pag, ino, free),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_ino_t, ino)
__field(xfs_agnumber_t, agno)
__field(int, streams)
__field(xfs_extlen_t, free)
- __field(int, nscan)
),
TP_fast_assign(
- __entry->dev = VFS_I(ip)->i_sb->s_dev;
- __entry->ino = ip->i_ino;
- __entry->agno = agno;
- __entry->streams = xfs_filestream_peek_ag(ip->i_mount, agno);
+ __entry->ino = ino;
+ if (pag) {
+ __entry->dev = pag->pag_mount->m_super->s_dev;
+ __entry->agno = pag->pag_agno;
+ __entry->streams = atomic_read(&pag->pagf_fstrms);
+ } else {
+ __entry->dev = 0;
+ __entry->agno = NULLAGNUMBER;
+ __entry->streams = 0;
+ }
__entry->free = free;
- __entry->nscan = nscan;
),
- TP_printk("dev %d:%d ino 0x%llx agno 0x%x streams %d free %d nscan %d",
+ TP_printk("dev %d:%d ino 0x%llx agno 0x%x streams %d free %d",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
__entry->agno,
__entry->streams,
- __entry->free,
- __entry->nscan)
+ __entry->free)
);
DECLARE_EVENT_CLASS(xfs_lock_class,
@@ -1795,13 +1801,11 @@ DECLARE_EVENT_CLASS(xfs_alloc_class,
__field(xfs_extlen_t, alignment)
__field(xfs_extlen_t, minalignslop)
__field(xfs_extlen_t, len)
- __field(short, type)
- __field(short, otype)
__field(char, wasdel)
__field(char, wasfromfl)
__field(int, resv)
__field(int, datatype)
- __field(xfs_fsblock_t, firstblock)
+ __field(xfs_agnumber_t, highest_agno)
),
TP_fast_assign(
__entry->dev = args->mp->m_super->s_dev;
@@ -1816,18 +1820,16 @@ DECLARE_EVENT_CLASS(xfs_alloc_class,
__entry->alignment = args->alignment;
__entry->minalignslop = args->minalignslop;
__entry->len = args->len;
- __entry->type = args->type;
- __entry->otype = args->otype;
__entry->wasdel = args->wasdel;
__entry->wasfromfl = args->wasfromfl;
__entry->resv = args->resv;
__entry->datatype = args->datatype;
- __entry->firstblock = args->tp->t_firstblock;
+ __entry->highest_agno = args->tp->t_highest_agno;
),
TP_printk("dev %d:%d agno 0x%x agbno 0x%x minlen %u maxlen %u mod %u "
"prod %u minleft %u total %u alignment %u minalignslop %u "
- "len %u type %s otype %s wasdel %d wasfromfl %d resv %d "
- "datatype 0x%x firstblock 0x%llx",
+ "len %u wasdel %d wasfromfl %d resv %d "
+ "datatype 0x%x highest_agno 0x%x",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->agno,
__entry->agbno,
@@ -1840,13 +1842,11 @@ DECLARE_EVENT_CLASS(xfs_alloc_class,
__entry->alignment,
__entry->minalignslop,
__entry->len,
- __print_symbolic(__entry->type, XFS_ALLOC_TYPES),
- __print_symbolic(__entry->otype, XFS_ALLOC_TYPES),
__entry->wasdel,
__entry->wasfromfl,
__entry->resv,
__entry->datatype,
- (unsigned long long)__entry->firstblock)
+ __entry->highest_agno)
)
#define DEFINE_ALLOC_EVENT(name) \
@@ -1877,6 +1877,7 @@ DEFINE_ALLOC_EVENT(xfs_alloc_small_notenough);
DEFINE_ALLOC_EVENT(xfs_alloc_small_done);
DEFINE_ALLOC_EVENT(xfs_alloc_small_error);
DEFINE_ALLOC_EVENT(xfs_alloc_vextent_badargs);
+DEFINE_ALLOC_EVENT(xfs_alloc_vextent_skip_deadlock);
DEFINE_ALLOC_EVENT(xfs_alloc_vextent_nofix);
DEFINE_ALLOC_EVENT(xfs_alloc_vextent_noagbp);
DEFINE_ALLOC_EVENT(xfs_alloc_vextent_loopfailed);
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 7bd16fbff534..8afc0c080861 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -102,7 +102,7 @@ xfs_trans_dup(
INIT_LIST_HEAD(&ntp->t_items);
INIT_LIST_HEAD(&ntp->t_busy);
INIT_LIST_HEAD(&ntp->t_dfops);
- ntp->t_firstblock = NULLFSBLOCK;
+ ntp->t_highest_agno = NULLAGNUMBER;
ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
ASSERT(tp->t_ticket != NULL);
@@ -278,7 +278,7 @@ retry:
INIT_LIST_HEAD(&tp->t_items);
INIT_LIST_HEAD(&tp->t_busy);
INIT_LIST_HEAD(&tp->t_dfops);
- tp->t_firstblock = NULLFSBLOCK;
+ tp->t_highest_agno = NULLAGNUMBER;
error = xfs_trans_reserve(tp, resp, blocks, rtextents);
if (error == -ENOSPC && want_retry) {
@@ -1078,10 +1078,10 @@ xfs_trans_cancel(
/*
* It's never valid to cancel a transaction with deferred ops attached,
* because the transaction is effectively dirty. Complain about this
- * loudly before freeing the in-memory defer items.
+ * loudly before freeing the in-memory defer items and shutting down the
+ * filesystem.
*/
if (!list_empty(&tp->t_dfops)) {
- ASSERT(xfs_is_shutdown(mp) || list_empty(&tp->t_dfops));
ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
dirty = true;
xfs_defer_cancel(tp);
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 55819785941c..6e3646d524ce 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -132,7 +132,7 @@ typedef struct xfs_trans {
unsigned int t_rtx_res; /* # of rt extents resvd */
unsigned int t_rtx_res_used; /* # of resvd rt extents used */
unsigned int t_flags; /* misc flags */
- xfs_fsblock_t t_firstblock; /* first block allocated */
+ xfs_agnumber_t t_highest_agno; /* highest AGF locked */
struct xlog_ticket *t_ticket; /* log mgr ticket */
struct xfs_mount *t_mountp; /* ptr to fs mount struct */
struct xfs_dquot_acct *t_dqinfo; /* acctg info for dquots */
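t_highest_agno records the highest AGF the transaction has locked, replacing t_firstblock as the mechanism for enforcing AGF lock ordering: once AG N's AGF is locked, only AGFs with equal or greater AG numbers may follow, otherwise two transactions could deadlock ABBA on AGF buffers. A minimal sketch of the invariant (the helper name is illustrative, not from this patch):

	static inline bool
	xfs_trans_agf_lock_ok(
		struct xfs_trans	*tp,
		xfs_agnumber_t		agno)
	{
		/* First AGF lock of the transaction is always fine. */
		if (tp->t_highest_agno == NULLAGNUMBER)
			return true;
		/* Equal or higher AGs preserve ascending lock order. */
		return agno >= tp->t_highest_agno;
	}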