-rw-r--r--  fs/nfs/inode.c          | 112
-rw-r--r--  include/linux/nfs_fs.h  |  47
2 files changed, 144 insertions(+), 15 deletions(-)
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 5c8027e3c961..eb8af1e404d9 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -208,11 +208,12 @@ void nfs_set_cache_invalid(struct inode *inode, unsigned long flags)
nfsi->cache_validity |= flags;
- if (inode->i_mapping->nrpages == 0)
- nfsi->cache_validity &= ~(NFS_INO_INVALID_DATA |
- NFS_INO_DATA_INVAL_DEFER);
- else if (nfsi->cache_validity & NFS_INO_INVALID_DATA)
- nfsi->cache_validity &= ~NFS_INO_DATA_INVAL_DEFER;
+ if (inode->i_mapping->nrpages == 0) {
+ nfsi->cache_validity &= ~NFS_INO_INVALID_DATA;
+ nfs_ooo_clear(nfsi);
+ } else if (nfsi->cache_validity & NFS_INO_INVALID_DATA) {
+ nfs_ooo_clear(nfsi);
+ }
trace_nfs_set_cache_invalid(inode, 0);
}
EXPORT_SYMBOL_GPL(nfs_set_cache_invalid);
@@ -677,9 +678,10 @@ static int nfs_vmtruncate(struct inode * inode, loff_t offset)
trace_nfs_size_truncate(inode, offset);
i_size_write(inode, offset);
/* Optimisation */
- if (offset == 0)
- NFS_I(inode)->cache_validity &= ~(NFS_INO_INVALID_DATA |
- NFS_INO_DATA_INVAL_DEFER);
+ if (offset == 0) {
+ NFS_I(inode)->cache_validity &= ~NFS_INO_INVALID_DATA;
+ nfs_ooo_clear(NFS_I(inode));
+ }
NFS_I(inode)->cache_validity &= ~NFS_INO_INVALID_SIZE;
spin_unlock(&inode->i_lock);
@@ -1109,7 +1111,7 @@ void nfs_inode_attach_open_context(struct nfs_open_context *ctx)
spin_lock(&inode->i_lock);
if (list_empty(&nfsi->open_files) &&
- (nfsi->cache_validity & NFS_INO_DATA_INVAL_DEFER))
+ nfs_ooo_test(nfsi))
nfs_set_cache_invalid(inode, NFS_INO_INVALID_DATA |
NFS_INO_REVAL_FORCED);
list_add_tail_rcu(&ctx->list, &nfsi->open_files);
@@ -1353,8 +1355,8 @@ int nfs_clear_invalid_mapping(struct address_space *mapping)
set_bit(NFS_INO_INVALIDATING, bitlock);
smp_wmb();
- nfsi->cache_validity &=
- ~(NFS_INO_INVALID_DATA | NFS_INO_DATA_INVAL_DEFER);
+ nfsi->cache_validity &= ~NFS_INO_INVALID_DATA;
+ nfs_ooo_clear(nfsi);
spin_unlock(&inode->i_lock);
trace_nfs_invalidate_mapping_enter(inode);
ret = nfs_invalidate_mapping(inode, mapping);
@@ -1816,6 +1818,66 @@ static int nfs_inode_finish_partial_attr_update(const struct nfs_fattr *fattr,
return 0;
}
+static void nfs_ooo_merge(struct nfs_inode *nfsi,
+ u64 start, u64 end)
+{
+ int i, cnt;
+
+ if (nfsi->cache_validity & NFS_INO_DATA_INVAL_DEFER)
+ /* No point merging anything */
+ return;
+
+ if (!nfsi->ooo) {
+ nfsi->ooo = kmalloc(sizeof(*nfsi->ooo), GFP_ATOMIC);
+ if (!nfsi->ooo) {
+ nfsi->cache_validity |= NFS_INO_DATA_INVAL_DEFER;
+ return;
+ }
+ nfsi->ooo->cnt = 0;
+ }
+
+ /* add this range, merging if possible */
+ cnt = nfsi->ooo->cnt;
+ for (i = 0; i < cnt; i++) {
+ if (end == nfsi->ooo->gap[i].start)
+ end = nfsi->ooo->gap[i].end;
+ else if (start == nfsi->ooo->gap[i].end)
+ start = nfsi->ooo->gap[i].start;
+ else
+ continue;
+ /* Remove 'i' from table and loop to insert the new range */
+ cnt -= 1;
+ nfsi->ooo->gap[i] = nfsi->ooo->gap[cnt];
+ i = -1;
+ }
+ if (start != end) {
+ if (cnt >= ARRAY_SIZE(nfsi->ooo->gap)) {
+ nfsi->cache_validity |= NFS_INO_DATA_INVAL_DEFER;
+ kfree(nfsi->ooo);
+ nfsi->ooo = NULL;
+ return;
+ }
+ nfsi->ooo->gap[cnt].start = start;
+ nfsi->ooo->gap[cnt].end = end;
+ cnt += 1;
+ }
+ nfsi->ooo->cnt = cnt;
+}
+
+static void nfs_ooo_record(struct nfs_inode *nfsi,
+ struct nfs_fattr *fattr)
+{
+ /* This reply was out-of-order, so record the pre/post
+ * change attribute pair, possibly cancelling gaps created
+ * when iversion was jumped forward.
+ */
+ if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) &&
+ (fattr->valid & NFS_ATTR_FATTR_PRECHANGE))
+ nfs_ooo_merge(nfsi,
+ fattr->change_attr,
+ fattr->pre_change_attr);
+}
+
static int nfs_refresh_inode_locked(struct inode *inode,
struct nfs_fattr *fattr)
{
@@ -1826,8 +1888,12 @@ static int nfs_refresh_inode_locked(struct inode *inode,
if (attr_cmp > 0 || nfs_inode_finish_partial_attr_update(fattr, inode))
ret = nfs_update_inode(inode, fattr);
- else if (attr_cmp == 0)
- ret = nfs_check_inode_attributes(inode, fattr);
+ else {
+ nfs_ooo_record(NFS_I(inode), fattr);
+
+ if (attr_cmp == 0)
+ ret = nfs_check_inode_attributes(inode, fattr);
+ }
trace_nfs_refresh_inode_exit(inode, ret);
return ret;
@@ -1918,6 +1984,8 @@ int nfs_post_op_update_inode_force_wcc_locked(struct inode *inode, struct nfs_fa
if (attr_cmp < 0)
return 0;
if ((fattr->valid & NFS_ATTR_FATTR) == 0 || !attr_cmp) {
+ /* Record the pre/post change info before clearing PRECHANGE */
+ nfs_ooo_record(NFS_I(inode), fattr);
fattr->valid &= ~(NFS_ATTR_FATTR_PRECHANGE
| NFS_ATTR_FATTR_PRESIZE
| NFS_ATTR_FATTR_PREMTIME
@@ -2072,6 +2140,15 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
/* More cache consistency checks */
if (fattr->valid & NFS_ATTR_FATTR_CHANGE) {
+ if (!have_writers && nfsi->ooo && nfsi->ooo->cnt == 1 &&
+ nfsi->ooo->gap[0].end == inode_peek_iversion_raw(inode)) {
+ /* There is one remaining gap that hasn't been
+ * merged into iversion - do that now.
+ */
+ inode_set_iversion_raw(inode, nfsi->ooo->gap[0].start);
+ kfree(nfsi->ooo);
+ nfsi->ooo = NULL;
+ }
if (!inode_eq_iversion_raw(inode, fattr->change_attr)) {
/* Could it be a race with writeback? */
if (!(have_writers || have_delegation)) {
@@ -2093,8 +2170,11 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
dprintk("NFS: change_attr change on server for file %s/%ld\n",
inode->i_sb->s_id,
inode->i_ino);
- } else if (!have_delegation)
- nfsi->cache_validity |= NFS_INO_DATA_INVAL_DEFER;
+ } else if (!have_delegation) {
+ nfs_ooo_record(nfsi, fattr);
+ nfs_ooo_merge(nfsi, inode_peek_iversion_raw(inode),
+ fattr->change_attr);
+ }
inode_set_iversion_raw(inode, fattr->change_attr);
}
} else {
@@ -2248,6 +2328,7 @@ struct inode *nfs_alloc_inode(struct super_block *sb)
return NULL;
nfsi->flags = 0UL;
nfsi->cache_validity = 0UL;
+ nfsi->ooo = NULL;
#if IS_ENABLED(CONFIG_NFS_V4)
nfsi->nfs4_acl = NULL;
#endif /* CONFIG_NFS_V4 */
@@ -2262,6 +2343,7 @@ EXPORT_SYMBOL_GPL(nfs_alloc_inode);
void nfs_free_inode(struct inode *inode)
{
+ kfree(NFS_I(inode)->ooo);
kmem_cache_free(nfs_inode_cachep, NFS_I(inode));
}
EXPORT_SYMBOL_GPL(nfs_free_inode);
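
The nfs_ooo_record()/nfs_ooo_merge() pair added above is easiest to follow with a
concrete trace. Below is a minimal, self-contained user-space sketch of the same
gap-merging idea (the names OOO_MAX, ooo_table and ooo_merge, and the fixed table
size, are illustrative only, not the kernel API): a forward pair is recorded when
the cached change attribute is jumped past changes not yet seen, a reversed pair
is recorded for each out-of-order reply, and abutting pairs cancel.

/* Illustrative user-space sketch of the nfs_ooo_merge() bookkeeping;
 * names and the fixed table size are assumptions for the example only. */
#include <stdio.h>

#define OOO_MAX 16

struct ooo_table {
	int cnt;
	struct {
		unsigned long long start, end;
	} gap[OOO_MAX];
};

/* Merge the range start:end into the table, cancelling against any
 * existing entry it abuts.  Returns -1 if the table would overflow
 * (the kernel then falls back to NFS_INO_DATA_INVAL_DEFER). */
static int ooo_merge(struct ooo_table *t, unsigned long long start,
		     unsigned long long end)
{
	int i, cnt = t->cnt;

	for (i = 0; i < cnt; i++) {
		if (end == t->gap[i].start)
			end = t->gap[i].end;
		else if (start == t->gap[i].end)
			start = t->gap[i].start;
		else
			continue;
		/* Drop entry i and rescan with the widened range. */
		cnt -= 1;
		t->gap[i] = t->gap[cnt];
		i = -1;
	}
	if (start != end) {
		if (cnt >= OOO_MAX)
			return -1;
		t->gap[cnt].start = start;
		t->gap[cnt].end = end;
		cnt += 1;
	}
	t->cnt = cnt;
	return 0;
}

int main(void)
{
	struct ooo_table t = { .cnt = 0 };

	/* The reply for the 5->6 change arrives while the cached change
	 * attribute is still 4: its pre/post pair is recorded reversed... */
	ooo_merge(&t, 6, 5);
	/* ...and the jump of the cached value from 4 to 6 is recorded
	 * forwards.  The two partly cancel, leaving the unseen span 4:5. */
	ooo_merge(&t, 4, 6);
	printf("gaps after out-of-order reply: %d\n", t.cnt); /* 1 */

	/* The delayed 4->5 reply arrives; its reversed pair 5:4 cancels
	 * the remaining gap, so the cache can be trusted again. */
	ooo_merge(&t, 5, 4);
	printf("gaps after the late reply:     %d\n", t.cnt); /* 0 */
	return 0;
}

With the calls in main() above, the out-of-order 5->6 reply first leaves the single
gap 4:5 in the table; the late 4->5 reply then cancels it, so nothing remains to
force an invalidation at the next open.
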
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 12bb868f9a18..279262057a92 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -195,6 +195,39 @@ struct nfs_inode {
/* Open contexts for shared mmap writes */
struct list_head open_files;
+ /* Keep track of out-of-order replies.
+ * The ooo array contains start/end pairs of
+ * numbers from the changeid sequence, recorded when
+ * the inode's iversion is jumped forward past
+ * changes that have not yet been seen in a reply.
+ * It also contains end/start pairs (i.e. in reverse
+ * order) for sections of the changeid sequence that
+ * have been seen in replies from the server.
+ * Normally these should match: when both
+ * A:B and B:A are found in ooo, they are both removed.
+ * And if a reply with A:B causes an iversion update
+ * of A:B, then neither is added.
+ * When a reply has pre_change that doesn't match
+ * iversion, then the changeid pair and any consequent
+ * change in iversion ARE added. Later replies
+ * might fill in the gaps, or possibly a gap is caused
+ * by a change from another client.
+ * When a file or directory is opened, if the ooo table
+ * is not empty, then we assume the gaps were due to
+ * another client and we invalidate the cached data.
+ *
+ * We can only track a limited number of concurrent gaps.
+ * Currently that limit is 16.
+ * We allocate the table on demand. If there is insufficient
+ * memory, then we probably cannot cache the file anyway
+ * so there is no loss.
+ */
+ struct {
+ int cnt;
+ struct {
+ u64 start, end;
+ } gap[16];
+ } *ooo;
+
#if IS_ENABLED(CONFIG_NFS_V4)
struct nfs4_cached_acl *nfs4_acl;
/* NFSv4 state */
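
The comment above describes the policy; the enforcement is split between the
helpers added further down (nfs_ooo_test()/nfs_ooo_clear()) and the open path in
fs/nfs/inode.c. As a rough, self-contained user-space sketch of that policy only
(every name here - gap_table, tracked_file, gaps_outstanding, gaps_clear,
must_invalidate_on_open - is hypothetical): any gap still outstanding at open
time, or the fact that gap tracking had to be abandoned, means the cached data
must be dropped and the bookkeeping reset.

/* Hypothetical user-space model of the open-time decision; this is not
 * the kernel API, only the policy described in the comment above. */
#include <stdbool.h>
#include <stdlib.h>

struct gap_table {
	int cnt;
	struct { unsigned long long start, end; } gap[16];
};

struct tracked_file {
	bool defer_invalidate;	/* gap tracking overflowed or failed */
	struct gap_table *ooo;	/* allocated on demand, NULL when idle */
};

/* Is the page cache suspect?  (cf. nfs_ooo_test()) */
static bool gaps_outstanding(const struct tracked_file *f)
{
	return f->defer_invalidate || (f->ooo && f->ooo->cnt > 0);
}

/* Forget the history once the cache has been invalidated or emptied
 * (cf. nfs_ooo_clear()). */
static void gaps_clear(struct tracked_file *f)
{
	f->defer_invalidate = false;
	free(f->ooo);
	f->ooo = NULL;
}

/* On the first open: unexplained gaps are assumed to come from another
 * client, so the cached data is invalidated and tracking starts afresh. */
static bool must_invalidate_on_open(struct tracked_file *f)
{
	if (!gaps_outstanding(f))
		return false;
	gaps_clear(f);
	return true;
}

int main(void)
{
	struct tracked_file f = { .defer_invalidate = false, .ooo = NULL };

	/* No history recorded yet: opening the file keeps the cache. */
	return must_invalidate_on_open(&f) ? 1 : 0;
}
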
@@ -612,6 +645,20 @@ nfs_fileid_to_ino_t(u64 fileid)
return ino;
}
+static inline void nfs_ooo_clear(struct nfs_inode *nfsi)
+{
+ nfsi->cache_validity &= ~NFS_INO_DATA_INVAL_DEFER;
+ kfree(nfsi->ooo);
+ nfsi->ooo = NULL;
+}
+
+static inline bool nfs_ooo_test(struct nfs_inode *nfsi)
+{
+ return (nfsi->cache_validity & NFS_INO_DATA_INVAL_DEFER) ||
+ (nfsi->ooo && nfsi->ooo->cnt > 0);
+
+}
+
#define NFS_JUKEBOX_RETRY_TIME (5 * HZ)
/* We need to block new opens while a file is being unlinked.