summary | refs | log | tree | commit | diff
path: root/fs/nfsd/filecache.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/nfsd/filecache.c')
-rw-r--r-- fs/nfsd/filecache.c 1459
1 files changed, 898 insertions, 561 deletions
diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
index 82198d747c4c..93798575b807 100644
--- a/fs/nfsd/filecache.c
+++ b/fs/nfsd/filecache.c
@@ -1,17 +1,45 @@
+// SPDX-License-Identifier: GPL-2.0
/*
- * Open file cache.
+ * The NFSD open file cache.
*
* (c) 2015 - Jeff Layton <jeff.layton@primarydata.com>
+ *
+ * An nfsd_file object is a per-file collection of open state that binds
+ * together:
+ * - a struct file *
+ * - a user credential
+ * - a network namespace
+ * - a read-ahead context
+ * - monitoring for writeback errors
+ *
+ * nfsd_file objects are reference-counted. Consumers acquire a new
+ * object via the nfsd_file_acquire API. They manage their interest in
+ * the acquired object, and hence the object's reference count, via
+ * nfsd_file_get and nfsd_file_put. There are two varieties of nfsd_file
+ * object:
+ *
+ * * non-garbage-collected: When a consumer wants to precisely control
+ * the lifetime of a file's open state, it acquires a non-garbage-
+ * collected nfsd_file. The final nfsd_file_put releases the open
+ * state immediately.
+ *
+ * * garbage-collected: When a consumer does not control the lifetime
+ * of open state, it acquires a garbage-collected nfsd_file. The
+ * final nfsd_file_put allows the open state to linger for a period
+ * during which it may be re-used.
*/
#include <linux/hash.h>
#include <linux/slab.h>
#include <linux/file.h>
+#include <linux/pagemap.h>
#include <linux/sched.h>
#include <linux/list_lru.h>
#include <linux/fsnotify_backend.h>
#include <linux/fsnotify.h>
#include <linux/seq_file.h>
+#include <linux/rhashtable.h>
+#include <linux/nfslocalio.h>
#include "vfs.h"
#include "nfsd.h"
@@ -20,63 +48,73 @@
#include "filecache.h"
#include "trace.h"
-#define NFSDDBG_FACILITY NFSDDBG_FH
-
-/* FIXME: dynamically size this for the machine somehow? */
-#define NFSD_FILE_HASH_BITS 12
-#define NFSD_FILE_HASH_SIZE (1 << NFSD_FILE_HASH_BITS)
#define NFSD_LAUNDRETTE_DELAY (2 * HZ)
-#define NFSD_FILE_SHUTDOWN (1)
-#define NFSD_FILE_LRU_THRESHOLD (4096UL)
-#define NFSD_FILE_LRU_LIMIT (NFSD_FILE_LRU_THRESHOLD << 2)
+#define NFSD_FILE_CACHE_UP (0)
/* We only care about NFSD_MAY_READ/WRITE for this cache */
-#define NFSD_FILE_MAY_MASK (NFSD_MAY_READ|NFSD_MAY_WRITE)
-
-struct nfsd_fcache_bucket {
- struct hlist_head nfb_head;
- spinlock_t nfb_lock;
- unsigned int nfb_count;
- unsigned int nfb_maxcount;
-};
+#define NFSD_FILE_MAY_MASK (NFSD_MAY_READ|NFSD_MAY_WRITE|NFSD_MAY_LOCALIO)
static DEFINE_PER_CPU(unsigned long, nfsd_file_cache_hits);
+static DEFINE_PER_CPU(unsigned long, nfsd_file_acquisitions);
+static DEFINE_PER_CPU(unsigned long, nfsd_file_allocations);
+static DEFINE_PER_CPU(unsigned long, nfsd_file_releases);
+static DEFINE_PER_CPU(unsigned long, nfsd_file_total_age);
+static DEFINE_PER_CPU(unsigned long, nfsd_file_evictions);
struct nfsd_fcache_disposal {
- struct list_head list;
- struct work_struct work;
- struct net *net;
spinlock_t lock;
struct list_head freeme;
- struct rcu_head rcu;
};
-static struct workqueue_struct *nfsd_filecache_wq __read_mostly;
-
static struct kmem_cache *nfsd_file_slab;
static struct kmem_cache *nfsd_file_mark_slab;
-static struct nfsd_fcache_bucket *nfsd_file_hashtbl;
static struct list_lru nfsd_file_lru;
-static long nfsd_file_lru_flags;
+static unsigned long nfsd_file_flags;
static struct fsnotify_group *nfsd_file_fsnotify_group;
-static atomic_long_t nfsd_filecache_count;
static struct delayed_work nfsd_filecache_laundrette;
-static DEFINE_SPINLOCK(laundrette_lock);
-static LIST_HEAD(laundrettes);
+static struct rhltable nfsd_file_rhltable
+ ____cacheline_aligned_in_smp;
+
+static bool
+nfsd_match_cred(const struct cred *c1, const struct cred *c2)
+{
+ int i;
+
+ if (!uid_eq(c1->fsuid, c2->fsuid))
+ return false;
+ if (!gid_eq(c1->fsgid, c2->fsgid))
+ return false;
+ if (c1->group_info == NULL || c2->group_info == NULL)
+ return c1->group_info == c2->group_info;
+ if (c1->group_info->ngroups != c2->group_info->ngroups)
+ return false;
+ for (i = 0; i < c1->group_info->ngroups; i++) {
+ if (!gid_eq(c1->group_info->gid[i], c2->group_info->gid[i]))
+ return false;
+ }
+ return true;
+}
-static void nfsd_file_gc(void);
+static const struct rhashtable_params nfsd_file_rhash_params = {
+ .key_len = sizeof_field(struct nfsd_file, nf_inode),
+ .key_offset = offsetof(struct nfsd_file, nf_inode),
+ .head_offset = offsetof(struct nfsd_file, nf_rlist),
+
+ /*
+ * Start with a single page hash table to reduce resizing churn
+ * on light workloads.
+ */
+ .min_size = 256,
+ .automatic_shrinking = true,
+};
static void
nfsd_file_schedule_laundrette(void)
{
- long count = atomic_long_read(&nfsd_filecache_count);
-
- if (count == 0 || test_bit(NFSD_FILE_SHUTDOWN, &nfsd_file_lru_flags))
- return;
-
- queue_delayed_work(system_wq, &nfsd_filecache_laundrette,
- NFSD_LAUNDRETTE_DELAY);
+ if (test_bit(NFSD_FILE_CACHE_UP, &nfsd_file_flags))
+ queue_delayed_work(system_dfl_wq, &nfsd_filecache_laundrette,
+ NFSD_LAUNDRETTE_DELAY);
}
static void
@@ -115,22 +153,21 @@ nfsd_file_mark_put(struct nfsd_file_mark *nfm)
}
static struct nfsd_file_mark *
-nfsd_file_mark_find_or_create(struct nfsd_file *nf)
+nfsd_file_mark_find_or_create(struct inode *inode)
{
int err;
struct fsnotify_mark *mark;
struct nfsd_file_mark *nfm = NULL, *new;
- struct inode *inode = nf->nf_inode;
do {
- mutex_lock(&nfsd_file_fsnotify_group->mark_mutex);
- mark = fsnotify_find_mark(&inode->i_fsnotify_marks,
- nfsd_file_fsnotify_group);
+ fsnotify_group_lock(nfsd_file_fsnotify_group);
+ mark = fsnotify_find_inode_mark(inode,
+ nfsd_file_fsnotify_group);
if (mark) {
nfm = nfsd_file_mark_get(container_of(mark,
struct nfsd_file_mark,
nfm_mark));
- mutex_unlock(&nfsd_file_fsnotify_group->mark_mutex);
+ fsnotify_group_unlock(nfsd_file_fsnotify_group);
if (nfm) {
fsnotify_put_mark(mark);
break;
@@ -138,8 +175,9 @@ nfsd_file_mark_find_or_create(struct nfsd_file *nf)
/* Avoid soft lockup race with nfsd_file_mark_put() */
fsnotify_destroy_mark(mark, nfsd_file_fsnotify_group);
fsnotify_put_mark(mark);
- } else
- mutex_unlock(&nfsd_file_fsnotify_group->mark_mutex);
+ } else {
+ fsnotify_group_unlock(nfsd_file_fsnotify_group);
+ }
/* allocate a new nfm */
new = kmem_cache_alloc(nfsd_file_mark_slab, GFP_KERNEL);
@@ -170,320 +208,389 @@ nfsd_file_mark_find_or_create(struct nfsd_file *nf)
}
static struct nfsd_file *
-nfsd_file_alloc(struct inode *inode, unsigned int may, unsigned int hashval,
- struct net *net)
+nfsd_file_alloc(struct net *net, struct inode *inode, unsigned char need,
+ bool want_gc)
{
struct nfsd_file *nf;
nf = kmem_cache_alloc(nfsd_file_slab, GFP_KERNEL);
- if (nf) {
- INIT_HLIST_NODE(&nf->nf_node);
- INIT_LIST_HEAD(&nf->nf_lru);
- nf->nf_file = NULL;
- nf->nf_cred = get_current_cred();
- nf->nf_net = net;
- nf->nf_flags = 0;
- nf->nf_inode = inode;
- nf->nf_hashval = hashval;
- refcount_set(&nf->nf_ref, 1);
- nf->nf_may = may & NFSD_FILE_MAY_MASK;
- if (may & NFSD_MAY_NOT_BREAK_LEASE) {
- if (may & NFSD_MAY_WRITE)
- __set_bit(NFSD_FILE_BREAK_WRITE, &nf->nf_flags);
- if (may & NFSD_MAY_READ)
- __set_bit(NFSD_FILE_BREAK_READ, &nf->nf_flags);
- }
- nf->nf_mark = NULL;
- init_rwsem(&nf->nf_rwsem);
- trace_nfsd_file_alloc(nf);
- }
- return nf;
-}
-
-static bool
-nfsd_file_free(struct nfsd_file *nf)
-{
- bool flush = false;
-
- trace_nfsd_file_put_final(nf);
- if (nf->nf_mark)
- nfsd_file_mark_put(nf->nf_mark);
- if (nf->nf_file) {
- get_file(nf->nf_file);
- filp_close(nf->nf_file, NULL);
- fput(nf->nf_file);
- flush = true;
- }
- call_rcu(&nf->nf_rcu, nfsd_file_slab_free);
- return flush;
-}
-
-static bool
-nfsd_file_check_writeback(struct nfsd_file *nf)
-{
- struct file *file = nf->nf_file;
- struct address_space *mapping;
+ if (unlikely(!nf))
+ return NULL;
- if (!file || !(file->f_mode & FMODE_WRITE))
- return false;
- mapping = file->f_mapping;
- return mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) ||
- mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK);
+ this_cpu_inc(nfsd_file_allocations);
+ INIT_LIST_HEAD(&nf->nf_lru);
+ INIT_LIST_HEAD(&nf->nf_gc);
+ nf->nf_birthtime = ktime_get();
+ nf->nf_file = NULL;
+ nf->nf_cred = get_current_cred();
+ nf->nf_net = net;
+ nf->nf_flags = want_gc ?
+ BIT(NFSD_FILE_HASHED) | BIT(NFSD_FILE_PENDING) | BIT(NFSD_FILE_GC) :
+ BIT(NFSD_FILE_HASHED) | BIT(NFSD_FILE_PENDING);
+ nf->nf_inode = inode;
+ refcount_set(&nf->nf_ref, 1);
+ nf->nf_may = need;
+ nf->nf_mark = NULL;
+ nf->nf_dio_mem_align = 0;
+ nf->nf_dio_offset_align = 0;
+ nf->nf_dio_read_offset_align = 0;
+ return nf;
}
-static int
+/**
+ * nfsd_file_check_write_error - check for writeback errors on a file
+ * @nf: nfsd_file to check for writeback errors
+ *
+ * Check whether a nfsd_file has an unseen error. Reset the write
+ * verifier if so.
+ */
+static void
nfsd_file_check_write_error(struct nfsd_file *nf)
{
struct file *file = nf->nf_file;
- if (!file || !(file->f_mode & FMODE_WRITE))
- return 0;
- return filemap_check_wb_err(file->f_mapping, READ_ONCE(file->f_wb_err));
+ if ((file->f_mode & FMODE_WRITE) &&
+ filemap_check_wb_err(file->f_mapping, READ_ONCE(file->f_wb_err)))
+ nfsd_reset_write_verifier(net_generic(nf->nf_net, nfsd_net_id));
}
static void
-nfsd_file_do_unhash(struct nfsd_file *nf)
+nfsd_file_hash_remove(struct nfsd_file *nf)
{
- lockdep_assert_held(&nfsd_file_hashtbl[nf->nf_hashval].nfb_lock);
-
trace_nfsd_file_unhash(nf);
-
- if (nfsd_file_check_write_error(nf))
- nfsd_reset_boot_verifier(net_generic(nf->nf_net, nfsd_net_id));
- --nfsd_file_hashtbl[nf->nf_hashval].nfb_count;
- hlist_del_rcu(&nf->nf_node);
- atomic_long_dec(&nfsd_filecache_count);
+ rhltable_remove(&nfsd_file_rhltable, &nf->nf_rlist,
+ nfsd_file_rhash_params);
}
static bool
nfsd_file_unhash(struct nfsd_file *nf)
{
if (test_and_clear_bit(NFSD_FILE_HASHED, &nf->nf_flags)) {
- nfsd_file_do_unhash(nf);
- if (!list_empty(&nf->nf_lru))
- list_lru_del(&nfsd_file_lru, &nf->nf_lru);
+ nfsd_file_hash_remove(nf);
return true;
}
return false;
}
-/*
- * Return true if the file was unhashed.
- */
-static bool
-nfsd_file_unhash_and_release_locked(struct nfsd_file *nf, struct list_head *dispose)
+static void
+nfsd_file_free(struct nfsd_file *nf)
{
- lockdep_assert_held(&nfsd_file_hashtbl[nf->nf_hashval].nfb_lock);
+ s64 age = ktime_to_ms(ktime_sub(ktime_get(), nf->nf_birthtime));
- trace_nfsd_file_unhash_and_release_locked(nf);
- if (!nfsd_file_unhash(nf))
- return false;
- /* keep final reference for nfsd_file_lru_dispose */
- if (refcount_dec_not_one(&nf->nf_ref))
- return true;
+ trace_nfsd_file_free(nf);
- list_add(&nf->nf_lru, dispose);
- return true;
+ this_cpu_inc(nfsd_file_releases);
+ this_cpu_add(nfsd_file_total_age, age);
+
+ nfsd_file_unhash(nf);
+ if (nf->nf_mark)
+ nfsd_file_mark_put(nf->nf_mark);
+ if (nf->nf_file) {
+ nfsd_file_check_write_error(nf);
+ nfsd_filp_close(nf->nf_file);
+ }
+
+ /*
+ * If this item is still linked via nf_lru, that's a bug.
+ * WARN and leak it to preserve system stability.
+ */
+ if (WARN_ON_ONCE(!list_empty(&nf->nf_lru)))
+ return;
+
+ call_rcu(&nf->nf_rcu, nfsd_file_slab_free);
}
-static void
-nfsd_file_put_noref(struct nfsd_file *nf)
+static bool
+nfsd_file_check_writeback(struct nfsd_file *nf)
{
- trace_nfsd_file_put(nf);
+ struct file *file = nf->nf_file;
+ struct address_space *mapping;
- if (refcount_dec_and_test(&nf->nf_ref)) {
- WARN_ON(test_bit(NFSD_FILE_HASHED, &nf->nf_flags));
- nfsd_file_free(nf);
- }
+ /* File not open for write? */
+ if (!(file->f_mode & FMODE_WRITE))
+ return false;
+
+ /*
+ * Some filesystems (e.g. NFS) flush all dirty data on close.
+ * On others, there is no need to wait for writeback.
+ */
+ if (!(file_inode(file)->i_sb->s_export_op->flags & EXPORT_OP_FLUSH_ON_CLOSE))
+ return false;
+
+ mapping = file->f_mapping;
+ return mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) ||
+ mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK);
}
-void
-nfsd_file_put(struct nfsd_file *nf)
+static void nfsd_file_lru_add(struct nfsd_file *nf)
{
- bool is_hashed;
+ refcount_inc(&nf->nf_ref);
+ if (list_lru_add_obj(&nfsd_file_lru, &nf->nf_lru))
+ trace_nfsd_file_lru_add(nf);
+ else
+ WARN_ON(1);
+ nfsd_file_schedule_laundrette();
+}
- set_bit(NFSD_FILE_REFERENCED, &nf->nf_flags);
- if (refcount_read(&nf->nf_ref) > 2 || !nf->nf_file) {
- nfsd_file_put_noref(nf);
- return;
+static bool nfsd_file_lru_remove(struct nfsd_file *nf)
+{
+ if (list_lru_del_obj(&nfsd_file_lru, &nf->nf_lru)) {
+ trace_nfsd_file_lru_del(nf);
+ return true;
}
-
- filemap_flush(nf->nf_file->f_mapping);
- is_hashed = test_bit(NFSD_FILE_HASHED, &nf->nf_flags) != 0;
- nfsd_file_put_noref(nf);
- if (is_hashed)
- nfsd_file_schedule_laundrette();
- if (atomic_long_read(&nfsd_filecache_count) >= NFSD_FILE_LRU_LIMIT)
- nfsd_file_gc();
+ return false;
}
struct nfsd_file *
nfsd_file_get(struct nfsd_file *nf)
{
- if (likely(refcount_inc_not_zero(&nf->nf_ref)))
+ if (nf && refcount_inc_not_zero(&nf->nf_ref))
return nf;
return NULL;
}
-static void
-nfsd_file_dispose_list(struct list_head *dispose)
+/**
+ * nfsd_file_put - put the reference to a nfsd_file
+ * @nf: nfsd_file of which to put the reference
+ *
+ * Put a reference to a nfsd_file. In the non-GC case, we just put the
+ * reference immediately. In the GC case, if the reference would be
+ * the last one, then put it on the LRU instead to be cleaned up later.
+ */
+void
+nfsd_file_put(struct nfsd_file *nf)
{
- struct nfsd_file *nf;
+ might_sleep();
+ trace_nfsd_file_put(nf);
- while(!list_empty(dispose)) {
- nf = list_first_entry(dispose, struct nfsd_file, nf_lru);
- list_del(&nf->nf_lru);
- nfsd_file_put_noref(nf);
+ if (test_bit(NFSD_FILE_GC, &nf->nf_flags) &&
+ test_bit(NFSD_FILE_HASHED, &nf->nf_flags)) {
+ set_bit(NFSD_FILE_REFERENCED, &nf->nf_flags);
+ set_bit(NFSD_FILE_RECENT, &nf->nf_flags);
}
+
+ if (refcount_dec_and_test(&nf->nf_ref))
+ nfsd_file_free(nf);
}
-static void
-nfsd_file_dispose_list_sync(struct list_head *dispose)
+/**
+ * nfsd_file_put_local - put nfsd_file reference and arm nfsd_net_put in caller
+ * @pnf: nfsd_file of which to put the reference
+ *
+ * First save the associated net to return to caller, then put
+ * the reference of the nfsd_file.
+ */
+struct net *
+nfsd_file_put_local(struct nfsd_file __rcu **pnf)
{
- bool flush = false;
struct nfsd_file *nf;
+ struct net *net = NULL;
- while(!list_empty(dispose)) {
- nf = list_first_entry(dispose, struct nfsd_file, nf_lru);
- list_del(&nf->nf_lru);
- if (!refcount_dec_and_test(&nf->nf_ref))
- continue;
- if (nfsd_file_free(nf))
- flush = true;
+ nf = unrcu_pointer(xchg(pnf, NULL));
+ if (nf) {
+ net = nf->nf_net;
+ nfsd_file_put(nf);
}
- if (flush)
- flush_delayed_fput();
+ return net;
}
-static void
-nfsd_file_list_remove_disposal(struct list_head *dst,
- struct nfsd_fcache_disposal *l)
+/**
+ * nfsd_file_file - get the backing file of an nfsd_file
+ * @nf: nfsd_file of which to access the backing file.
+ *
+ * Return backing file for @nf.
+ */
+struct file *
+nfsd_file_file(struct nfsd_file *nf)
{
- spin_lock(&l->lock);
- list_splice_init(&l->freeme, dst);
- spin_unlock(&l->lock);
+ return nf->nf_file;
}
static void
-nfsd_file_list_add_disposal(struct list_head *files, struct net *net)
+nfsd_file_dispose_list(struct list_head *dispose)
{
- struct nfsd_fcache_disposal *l;
+ struct nfsd_file *nf;
- rcu_read_lock();
- list_for_each_entry_rcu(l, &laundrettes, list) {
- if (l->net == net) {
- spin_lock(&l->lock);
- list_splice_tail_init(files, &l->freeme);
- spin_unlock(&l->lock);
- queue_work(nfsd_filecache_wq, &l->work);
- break;
- }
+ while (!list_empty(dispose)) {
+ nf = list_first_entry(dispose, struct nfsd_file, nf_gc);
+ list_del_init(&nf->nf_gc);
+ nfsd_file_free(nf);
}
- rcu_read_unlock();
}
+/**
+ * nfsd_file_dispose_list_delayed - move list of dead files to net's freeme list
+ * @dispose: list of nfsd_files to be disposed
+ *
+ * Transfers each file to the "freeme" list for its nfsd_net, to eventually
+ * be disposed of by the per-net garbage collector.
+ */
static void
-nfsd_file_list_add_pernet(struct list_head *dst, struct list_head *src,
- struct net *net)
+nfsd_file_dispose_list_delayed(struct list_head *dispose)
{
- struct nfsd_file *nf, *tmp;
+ while(!list_empty(dispose)) {
+ struct nfsd_file *nf = list_first_entry(dispose,
+ struct nfsd_file, nf_gc);
+ struct nfsd_net *nn = net_generic(nf->nf_net, nfsd_net_id);
+ struct nfsd_fcache_disposal *l = nn->fcache_disposal;
+ struct svc_serv *serv;
- list_for_each_entry_safe(nf, tmp, src, nf_lru) {
- if (nf->nf_net == net)
- list_move_tail(&nf->nf_lru, dst);
+ spin_lock(&l->lock);
+ list_move_tail(&nf->nf_gc, &l->freeme);
+ spin_unlock(&l->lock);
+
+ /*
+ * The filecache laundrette is shut down after the
+ * nn->nfsd_serv pointer is cleared, but before the
+ * svc_serv is freed.
+ */
+ serv = nn->nfsd_serv;
+ if (serv)
+ svc_wake_up(serv);
}
}
-static void
-nfsd_file_dispose_list_delayed(struct list_head *dispose)
+/**
+ * nfsd_file_net_dispose - deal with nfsd_files waiting to be disposed.
+ * @nn: nfsd_net in which to find files to be disposed.
+ *
+ * When files held open for nfsv3 are removed from the filecache, whether
+ * due to memory pressure or garbage collection, they are queued to
+ * a per-net-ns queue. This function completes the disposal, either
+ * directly or by waking another nfsd thread to help with the work.
+ */
+void nfsd_file_net_dispose(struct nfsd_net *nn)
{
- LIST_HEAD(list);
- struct nfsd_file *nf;
-
- while(!list_empty(dispose)) {
- nf = list_first_entry(dispose, struct nfsd_file, nf_lru);
- nfsd_file_list_add_pernet(&list, dispose, nf->nf_net);
- nfsd_file_list_add_disposal(&list, nf->nf_net);
+ struct nfsd_fcache_disposal *l = nn->fcache_disposal;
+
+ if (!list_empty(&l->freeme)) {
+ LIST_HEAD(dispose);
+ int i;
+
+ spin_lock(&l->lock);
+ for (i = 0; i < 8 && !list_empty(&l->freeme); i++)
+ list_move(l->freeme.next, &dispose);
+ spin_unlock(&l->lock);
+ if (!list_empty(&l->freeme))
+ /* Wake up another thread to share the work
+ * *before* doing any actual disposing.
+ */
+ svc_wake_up(nn->nfsd_serv);
+ nfsd_file_dispose_list(&dispose);
}
}
-/*
- * Note this can deadlock with nfsd_file_cache_purge.
+/**
+ * nfsd_file_lru_cb - Examine an entry on the LRU list
+ * @item: LRU entry to examine
+ * @lru: controlling LRU
+ * @arg: dispose list
+ *
+ * Return values:
+ * %LRU_REMOVED: @item was removed from the LRU
+ * %LRU_ROTATE: @item is to be moved to the LRU tail
+ * %LRU_SKIP: @item cannot be evicted
*/
static enum lru_status
nfsd_file_lru_cb(struct list_head *item, struct list_lru_one *lru,
- spinlock_t *lock, void *arg)
- __releases(lock)
- __acquires(lock)
+ void *arg)
{
struct list_head *head = arg;
struct nfsd_file *nf = list_entry(item, struct nfsd_file, nf_lru);
- /*
- * Do a lockless refcount check. The hashtable holds one reference, so
- * we look to see if anything else has a reference, or if any have
- * been put since the shrinker last ran. Those don't get unhashed and
- * released.
- *
- * Note that in the put path, we set the flag and then decrement the
- * counter. Here we check the counter and then test and clear the flag.
- * That order is deliberate to ensure that we can do this locklessly.
- */
- if (refcount_read(&nf->nf_ref) > 1)
- goto out_skip;
+ /* We should only be dealing with GC entries here */
+ WARN_ON_ONCE(!test_bit(NFSD_FILE_GC, &nf->nf_flags));
/*
* Don't throw out files that are still undergoing I/O or
* that have uncleared errors pending.
*/
- if (nfsd_file_check_writeback(nf))
- goto out_skip;
+ if (nfsd_file_check_writeback(nf)) {
+ trace_nfsd_file_gc_writeback(nf);
+ return LRU_SKIP;
+ }
- if (test_and_clear_bit(NFSD_FILE_REFERENCED, &nf->nf_flags))
- goto out_skip;
+ /* If it was recently added to the list, skip it */
+ if (test_and_clear_bit(NFSD_FILE_REFERENCED, &nf->nf_flags)) {
+ trace_nfsd_file_gc_referenced(nf);
+ return LRU_ROTATE;
+ }
- if (!test_and_clear_bit(NFSD_FILE_HASHED, &nf->nf_flags))
- goto out_skip;
+ /*
+ * Put the reference held on behalf of the LRU if it is the last
+ * reference, else rotate.
+ */
+ if (!refcount_dec_if_one(&nf->nf_ref)) {
+ trace_nfsd_file_gc_in_use(nf);
+ return LRU_ROTATE;
+ }
- list_lru_isolate_move(lru, &nf->nf_lru, head);
+ /* Refcount went to zero. Unhash it and queue it to the dispose list */
+ nfsd_file_unhash(nf);
+ list_lru_isolate(lru, &nf->nf_lru);
+ list_add(&nf->nf_gc, head);
+ this_cpu_inc(nfsd_file_evictions);
+ trace_nfsd_file_gc_disposed(nf);
return LRU_REMOVED;
-out_skip:
- return LRU_SKIP;
}
-static unsigned long
-nfsd_file_lru_walk_list(struct shrink_control *sc)
+static enum lru_status
+nfsd_file_gc_cb(struct list_head *item, struct list_lru_one *lru,
+ void *arg)
{
- LIST_HEAD(head);
- struct nfsd_file *nf;
- unsigned long ret;
+ struct nfsd_file *nf = list_entry(item, struct nfsd_file, nf_lru);
- if (sc)
- ret = list_lru_shrink_walk(&nfsd_file_lru, sc,
- nfsd_file_lru_cb, &head);
- else
- ret = list_lru_walk(&nfsd_file_lru,
- nfsd_file_lru_cb,
- &head, LONG_MAX);
- list_for_each_entry(nf, &head, nf_lru) {
- spin_lock(&nfsd_file_hashtbl[nf->nf_hashval].nfb_lock);
- nfsd_file_do_unhash(nf);
- spin_unlock(&nfsd_file_hashtbl[nf->nf_hashval].nfb_lock);
+ if (test_and_clear_bit(NFSD_FILE_RECENT, &nf->nf_flags)) {
+ /*
+ * "REFERENCED" really means "should be at the end of the
+ * LRU". As we are putting it there we can clear the flag.
+ */
+ clear_bit(NFSD_FILE_REFERENCED, &nf->nf_flags);
+ trace_nfsd_file_gc_aged(nf);
+ return LRU_ROTATE;
}
- nfsd_file_dispose_list_delayed(&head);
- return ret;
+ return nfsd_file_lru_cb(item, lru, arg);
}
+/* If the shrinker runs between calls to list_lru_walk_node() in
+ * nfsd_file_gc(), the "remaining" count will be wrong. This could
+ * result in premature freeing of some files. This may not matter much
+ * but is easy to fix with this spinlock which temporarily disables
+ * the shrinker.
+ */
+static DEFINE_SPINLOCK(nfsd_gc_lock);
static void
nfsd_file_gc(void)
{
- nfsd_file_lru_walk_list(NULL);
+ unsigned long ret = 0;
+ LIST_HEAD(dispose);
+ int nid;
+
+ spin_lock(&nfsd_gc_lock);
+ for_each_node_state(nid, N_NORMAL_MEMORY) {
+ unsigned long remaining = list_lru_count_node(&nfsd_file_lru, nid);
+
+ while (remaining > 0) {
+ unsigned long nr = min(remaining, NFSD_FILE_GC_BATCH);
+
+ remaining -= nr;
+ ret += list_lru_walk_node(&nfsd_file_lru, nid, nfsd_file_gc_cb,
+ &dispose, &nr);
+ if (nr)
+ /* walk aborted early */
+ remaining = 0;
+ }
+ }
+ spin_unlock(&nfsd_gc_lock);
+ trace_nfsd_file_gc_removed(ret, list_lru_count(&nfsd_file_lru));
+ nfsd_file_dispose_list_delayed(&dispose);
}
static void
nfsd_file_gc_worker(struct work_struct *work)
{
- nfsd_file_gc();
+ if (list_lru_count(&nfsd_file_lru))
+ nfsd_file_gc();
nfsd_file_schedule_laundrette();
}
@@ -496,100 +603,136 @@ nfsd_file_lru_count(struct shrinker *s, struct shrink_control *sc)
static unsigned long
nfsd_file_lru_scan(struct shrinker *s, struct shrink_control *sc)
{
- return nfsd_file_lru_walk_list(sc);
+ LIST_HEAD(dispose);
+ unsigned long ret;
+
+ if (!spin_trylock(&nfsd_gc_lock))
+ return SHRINK_STOP;
+
+ ret = list_lru_shrink_walk(&nfsd_file_lru, sc,
+ nfsd_file_lru_cb, &dispose);
+ spin_unlock(&nfsd_gc_lock);
+ trace_nfsd_file_shrinker_removed(ret, list_lru_count(&nfsd_file_lru));
+ nfsd_file_dispose_list_delayed(&dispose);
+ return ret;
}
-static struct shrinker nfsd_file_shrinker = {
- .scan_objects = nfsd_file_lru_scan,
- .count_objects = nfsd_file_lru_count,
- .seeks = 1,
-};
+static struct shrinker *nfsd_file_shrinker;
+/**
+ * nfsd_file_cond_queue - conditionally unhash and queue a nfsd_file
+ * @nf: nfsd_file to attempt to queue
+ * @dispose: private list to queue successfully-put objects
+ *
+ * Unhash an nfsd_file, try to get a reference to it, and then put that
+ * reference. If it's the last reference, queue it to the dispose list.
+ */
static void
-__nfsd_file_close_inode(struct inode *inode, unsigned int hashval,
- struct list_head *dispose)
+nfsd_file_cond_queue(struct nfsd_file *nf, struct list_head *dispose)
+ __must_hold(RCU)
{
- struct nfsd_file *nf;
- struct hlist_node *tmp;
+ int decrement = 1;
+
+ /* If we raced with someone else unhashing, ignore it */
+ if (!nfsd_file_unhash(nf))
+ return;
+
+ /* If we can't get a reference, ignore it */
+ if (!nfsd_file_get(nf))
+ return;
- spin_lock(&nfsd_file_hashtbl[hashval].nfb_lock);
- hlist_for_each_entry_safe(nf, tmp, &nfsd_file_hashtbl[hashval].nfb_head, nf_node) {
- if (inode == nf->nf_inode)
- nfsd_file_unhash_and_release_locked(nf, dispose);
+ /* Extra decrement if we remove from the LRU */
+ if (nfsd_file_lru_remove(nf))
+ ++decrement;
+
+ /* If refcount goes to 0, then put on the dispose list */
+ if (refcount_sub_and_test(decrement, &nf->nf_ref)) {
+ list_add(&nf->nf_gc, dispose);
+ trace_nfsd_file_closing(nf);
}
- spin_unlock(&nfsd_file_hashtbl[hashval].nfb_lock);
}
/**
- * nfsd_file_close_inode_sync - attempt to forcibly close a nfsd_file
- * @inode: inode of the file to attempt to remove
+ * nfsd_file_queue_for_close - try to close out any open nfsd_files for an inode
+ * @inode: inode on which to close out nfsd_files
+ * @dispose: list on which to gather nfsd_files to close out
*
- * Walk the whole hash bucket, looking for any files that correspond to "inode".
- * If any do, then unhash them and put the hashtable reference to them and
- * destroy any that had their last reference put. Also ensure that any of the
- * fputs also have their final __fput done as well.
+ * An nfsd_file represents a struct file being held open on behalf of nfsd.
+ * An open file however can block other activity (such as leases), or cause
+ * undesirable behavior (e.g. spurious silly-renames when reexporting NFS).
+ *
+ * This function is intended to find open nfsd_files when this sort of
+ * conflicting access occurs and then attempt to close those files out.
+ *
+ * Populates the dispose list with entries that have already had their
+ * refcounts go to zero. The actual free of an nfsd_file can be expensive,
+ * so we leave it up to the caller whether it wants to wait or not.
*/
-void
-nfsd_file_close_inode_sync(struct inode *inode)
+static void
+nfsd_file_queue_for_close(struct inode *inode, struct list_head *dispose)
{
- unsigned int hashval = (unsigned int)hash_long(inode->i_ino,
- NFSD_FILE_HASH_BITS);
- LIST_HEAD(dispose);
+ struct rhlist_head *tmp, *list;
+ struct nfsd_file *nf;
- __nfsd_file_close_inode(inode, hashval, &dispose);
- trace_nfsd_file_close_inode_sync(inode, hashval, !list_empty(&dispose));
- nfsd_file_dispose_list_sync(&dispose);
+ rcu_read_lock();
+ list = rhltable_lookup(&nfsd_file_rhltable, &inode,
+ nfsd_file_rhash_params);
+ rhl_for_each_entry_rcu(nf, tmp, list, nf_rlist) {
+ if (!test_bit(NFSD_FILE_GC, &nf->nf_flags))
+ continue;
+ nfsd_file_cond_queue(nf, dispose);
+ }
+ rcu_read_unlock();
}
/**
- * nfsd_file_close_inode_sync - attempt to forcibly close a nfsd_file
+ * nfsd_file_close_inode - attempt a delayed close of a nfsd_file
* @inode: inode of the file to attempt to remove
*
- * Walk the whole hash bucket, looking for any files that correspond to "inode".
- * If any do, then unhash them and put the hashtable reference to them and
- * destroy any that had their last reference put.
+ * Close out any open nfsd_files that can be reaped for @inode. The
+ * actual freeing is deferred to the dispose_list_delayed infrastructure.
+ *
+ * This is used by the fsnotify callbacks and setlease notifier.
*/
static void
nfsd_file_close_inode(struct inode *inode)
{
- unsigned int hashval = (unsigned int)hash_long(inode->i_ino,
- NFSD_FILE_HASH_BITS);
LIST_HEAD(dispose);
- __nfsd_file_close_inode(inode, hashval, &dispose);
- trace_nfsd_file_close_inode(inode, hashval, !list_empty(&dispose));
+ nfsd_file_queue_for_close(inode, &dispose);
nfsd_file_dispose_list_delayed(&dispose);
}
/**
- * nfsd_file_delayed_close - close unused nfsd_files
- * @work: dummy
+ * nfsd_file_close_inode_sync - attempt to forcibly close a nfsd_file
+ * @inode: inode of the file to attempt to remove
*
- * Walk the LRU list and close any entries that have not been used since
- * the last scan.
+ * Close out any open nfsd_files that can be reaped for @inode. The
+ * nfsd_files are closed out synchronously.
*
- * Note this can deadlock with nfsd_file_cache_purge.
+ * This is called from nfsd_rename and nfsd_unlink to avoid silly-renames
+ * when reexporting NFS.
*/
-static void
-nfsd_file_delayed_close(struct work_struct *work)
+void
+nfsd_file_close_inode_sync(struct inode *inode)
{
- LIST_HEAD(head);
- struct nfsd_fcache_disposal *l = container_of(work,
- struct nfsd_fcache_disposal, work);
+ LIST_HEAD(dispose);
+
+ trace_nfsd_file_close(inode);
- nfsd_file_list_remove_disposal(&head, l);
- nfsd_file_dispose_list(&head);
+ nfsd_file_queue_for_close(inode, &dispose);
+ nfsd_file_dispose_list(&dispose);
}
static int
nfsd_file_lease_notifier_call(struct notifier_block *nb, unsigned long arg,
void *data)
{
- struct file_lock *fl = data;
+ struct file_lease *fl = data;
/* Only close files for F_SETLEASE leases */
- if (fl->fl_flags & FL_LEASE)
- nfsd_file_close_inode_sync(file_inode(fl->fl_file));
+ if (fl->c.flc_flags & FL_LEASE)
+ nfsd_file_close_inode(file_inode(fl->c.flc_file));
return 0;
}
@@ -598,12 +741,13 @@ static struct notifier_block nfsd_file_lease_notifier = {
};
static int
-nfsd_file_fsnotify_handle_event(struct fsnotify_group *group,
- struct inode *inode,
- u32 mask, const void *data, int data_type,
- const struct qstr *file_name, u32 cookie,
- struct fsnotify_iter_info *iter_info)
+nfsd_file_fsnotify_handle_event(struct fsnotify_mark *mark, u32 mask,
+ struct inode *inode, struct inode *dir,
+ const struct qstr *name, u32 cookie)
{
+ if (WARN_ON_ONCE(!inode))
+ return 0;
+
trace_nfsd_file_fsnotify_handle_event(inode, mask);
/* Should be no marks on non-regular files */
@@ -624,85 +768,80 @@ nfsd_file_fsnotify_handle_event(struct fsnotify_group *group,
static const struct fsnotify_ops nfsd_file_fsnotify_ops = {
- .handle_event = nfsd_file_fsnotify_handle_event,
+ .handle_inode_event = nfsd_file_fsnotify_handle_event,
.free_mark = nfsd_file_mark_free,
};
int
nfsd_file_cache_init(void)
{
- int ret = -ENOMEM;
- unsigned int i;
-
- clear_bit(NFSD_FILE_SHUTDOWN, &nfsd_file_lru_flags);
+ int ret;
- if (nfsd_file_hashtbl)
+ lockdep_assert_held(&nfsd_mutex);
+ if (test_and_set_bit(NFSD_FILE_CACHE_UP, &nfsd_file_flags) == 1)
return 0;
- nfsd_filecache_wq = alloc_workqueue("nfsd_filecache", 0, 0);
- if (!nfsd_filecache_wq)
+ ret = rhltable_init(&nfsd_file_rhltable, &nfsd_file_rhash_params);
+ if (ret)
goto out;
- nfsd_file_hashtbl = kcalloc(NFSD_FILE_HASH_SIZE,
- sizeof(*nfsd_file_hashtbl), GFP_KERNEL);
- if (!nfsd_file_hashtbl) {
- pr_err("nfsd: unable to allocate nfsd_file_hashtbl\n");
- goto out_err;
- }
-
- nfsd_file_slab = kmem_cache_create("nfsd_file",
- sizeof(struct nfsd_file), 0, 0, NULL);
+ ret = -ENOMEM;
+ nfsd_file_slab = KMEM_CACHE(nfsd_file, 0);
if (!nfsd_file_slab) {
pr_err("nfsd: unable to create nfsd_file_slab\n");
goto out_err;
}
- nfsd_file_mark_slab = kmem_cache_create("nfsd_file_mark",
- sizeof(struct nfsd_file_mark), 0, 0, NULL);
+ nfsd_file_mark_slab = KMEM_CACHE(nfsd_file_mark, 0);
if (!nfsd_file_mark_slab) {
pr_err("nfsd: unable to create nfsd_file_mark_slab\n");
goto out_err;
}
-
ret = list_lru_init(&nfsd_file_lru);
if (ret) {
pr_err("nfsd: failed to init nfsd_file_lru: %d\n", ret);
goto out_err;
}
- ret = register_shrinker(&nfsd_file_shrinker);
- if (ret) {
- pr_err("nfsd: failed to register nfsd_file_shrinker: %d\n", ret);
+ nfsd_file_shrinker = shrinker_alloc(0, "nfsd-filecache");
+ if (!nfsd_file_shrinker) {
+ ret = -ENOMEM;
+ pr_err("nfsd: failed to allocate nfsd_file_shrinker\n");
goto out_lru;
}
+ nfsd_file_shrinker->count_objects = nfsd_file_lru_count;
+ nfsd_file_shrinker->scan_objects = nfsd_file_lru_scan;
+ nfsd_file_shrinker->seeks = 1;
+
+ shrinker_register(nfsd_file_shrinker);
+
ret = lease_register_notifier(&nfsd_file_lease_notifier);
if (ret) {
pr_err("nfsd: unable to register lease notifier: %d\n", ret);
goto out_shrinker;
}
- nfsd_file_fsnotify_group = fsnotify_alloc_group(&nfsd_file_fsnotify_ops);
+ nfsd_file_fsnotify_group = fsnotify_alloc_group(&nfsd_file_fsnotify_ops,
+ 0);
if (IS_ERR(nfsd_file_fsnotify_group)) {
pr_err("nfsd: unable to create fsnotify group: %ld\n",
PTR_ERR(nfsd_file_fsnotify_group));
+ ret = PTR_ERR(nfsd_file_fsnotify_group);
nfsd_file_fsnotify_group = NULL;
goto out_notifier;
}
- for (i = 0; i < NFSD_FILE_HASH_SIZE; i++) {
- INIT_HLIST_HEAD(&nfsd_file_hashtbl[i].nfb_head);
- spin_lock_init(&nfsd_file_hashtbl[i].nfb_lock);
- }
-
INIT_DELAYED_WORK(&nfsd_filecache_laundrette, nfsd_file_gc_worker);
out:
+ if (ret)
+ clear_bit(NFSD_FILE_CACHE_UP, &nfsd_file_flags);
return ret;
out_notifier:
lease_unregister_notifier(&nfsd_file_lease_notifier);
out_shrinker:
- unregister_shrinker(&nfsd_file_shrinker);
+ shrinker_free(nfsd_file_shrinker);
out_lru:
list_lru_destroy(&nfsd_file_lru);
out_err:
@@ -710,58 +849,59 @@ out_err:
nfsd_file_slab = NULL;
kmem_cache_destroy(nfsd_file_mark_slab);
nfsd_file_mark_slab = NULL;
- kfree(nfsd_file_hashtbl);
- nfsd_file_hashtbl = NULL;
- destroy_workqueue(nfsd_filecache_wq);
- nfsd_filecache_wq = NULL;
+ rhltable_destroy(&nfsd_file_rhltable);
goto out;
}
-/*
- * Note this can deadlock with nfsd_file_lru_cb.
+/**
+ * __nfsd_file_cache_purge: clean out the cache for shutdown
+ * @net: net-namespace to shut down the cache (may be NULL)
+ *
+ * Walk the nfsd_file cache and close out any that match @net. If @net is NULL,
+ * then close out everything. Called when an nfsd instance is being shut down,
+ * and when the exports table is flushed.
*/
-void
-nfsd_file_cache_purge(struct net *net)
+static void
+__nfsd_file_cache_purge(struct net *net)
{
- unsigned int i;
- struct nfsd_file *nf;
- struct hlist_node *next;
+ struct rhashtable_iter iter;
+ struct nfsd_file *nf;
LIST_HEAD(dispose);
- bool del;
- if (!nfsd_file_hashtbl)
- return;
-
- for (i = 0; i < NFSD_FILE_HASH_SIZE; i++) {
- struct nfsd_fcache_bucket *nfb = &nfsd_file_hashtbl[i];
+#if IS_ENABLED(CONFIG_NFS_LOCALIO)
+ if (net) {
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+ nfs_localio_invalidate_clients(&nn->local_clients,
+ &nn->local_clients_lock);
+ }
+#endif
- spin_lock(&nfb->nfb_lock);
- hlist_for_each_entry_safe(nf, next, &nfb->nfb_head, nf_node) {
- if (net && nf->nf_net != net)
- continue;
- del = nfsd_file_unhash_and_release_locked(nf, &dispose);
+ rhltable_walk_enter(&nfsd_file_rhltable, &iter);
+ do {
+ rhashtable_walk_start(&iter);
- /*
- * Deadlock detected! Something marked this entry as
- * unhased, but hasn't removed it from the hash list.
- */
- WARN_ON_ONCE(!del);
+ nf = rhashtable_walk_next(&iter);
+ while (!IS_ERR_OR_NULL(nf)) {
+ if (!net || nf->nf_net == net)
+ nfsd_file_cond_queue(nf, &dispose);
+ nf = rhashtable_walk_next(&iter);
}
- spin_unlock(&nfb->nfb_lock);
- nfsd_file_dispose_list(&dispose);
- }
+
+ rhashtable_walk_stop(&iter);
+ } while (nf == ERR_PTR(-EAGAIN));
+ rhashtable_walk_exit(&iter);
+
+ nfsd_file_dispose_list(&dispose);
}
static struct nfsd_fcache_disposal *
-nfsd_alloc_fcache_disposal(struct net *net)
+nfsd_alloc_fcache_disposal(void)
{
struct nfsd_fcache_disposal *l;
l = kmalloc(sizeof(*l), GFP_KERNEL);
if (!l)
return NULL;
- INIT_WORK(&l->work, nfsd_file_delayed_close);
- l->net = net;
spin_lock_init(&l->lock);
INIT_LIST_HEAD(&l->freeme);
return l;
@@ -770,61 +910,39 @@ nfsd_alloc_fcache_disposal(struct net *net)
static void
nfsd_free_fcache_disposal(struct nfsd_fcache_disposal *l)
{
- rcu_assign_pointer(l->net, NULL);
- cancel_work_sync(&l->work);
nfsd_file_dispose_list(&l->freeme);
- kfree_rcu(l, rcu);
-}
-
-static void
-nfsd_add_fcache_disposal(struct nfsd_fcache_disposal *l)
-{
- spin_lock(&laundrette_lock);
- list_add_tail_rcu(&l->list, &laundrettes);
- spin_unlock(&laundrette_lock);
+ kfree(l);
}
static void
-nfsd_del_fcache_disposal(struct nfsd_fcache_disposal *l)
-{
- spin_lock(&laundrette_lock);
- list_del_rcu(&l->list);
- spin_unlock(&laundrette_lock);
-}
-
-static int
-nfsd_alloc_fcache_disposal_net(struct net *net)
+nfsd_free_fcache_disposal_net(struct net *net)
{
- struct nfsd_fcache_disposal *l;
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+ struct nfsd_fcache_disposal *l = nn->fcache_disposal;
- l = nfsd_alloc_fcache_disposal(net);
- if (!l)
- return -ENOMEM;
- nfsd_add_fcache_disposal(l);
- return 0;
+ nfsd_free_fcache_disposal(l);
}
-static void
-nfsd_free_fcache_disposal_net(struct net *net)
+int
+nfsd_file_cache_start_net(struct net *net)
{
- struct nfsd_fcache_disposal *l;
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
- rcu_read_lock();
- list_for_each_entry_rcu(l, &laundrettes, list) {
- if (l->net != net)
- continue;
- nfsd_del_fcache_disposal(l);
- rcu_read_unlock();
- nfsd_free_fcache_disposal(l);
- return;
- }
- rcu_read_unlock();
+ nn->fcache_disposal = nfsd_alloc_fcache_disposal();
+ return nn->fcache_disposal ? 0 : -ENOMEM;
}
-int
-nfsd_file_cache_start_net(struct net *net)
+/**
+ * nfsd_file_cache_purge - Remove all cache items associated with @net
+ * @net: target net namespace
+ *
+ */
+void
+nfsd_file_cache_purge(struct net *net)
{
- return nfsd_alloc_fcache_disposal_net(net);
+ lockdep_assert_held(&nfsd_mutex);
+ if (test_bit(NFSD_FILE_CACHE_UP, &nfsd_file_flags) == 1)
+ __nfsd_file_cache_purge(net);
}
void
@@ -837,16 +955,20 @@ nfsd_file_cache_shutdown_net(struct net *net)
void
nfsd_file_cache_shutdown(void)
{
- set_bit(NFSD_FILE_SHUTDOWN, &nfsd_file_lru_flags);
+ int i;
+
+ lockdep_assert_held(&nfsd_mutex);
+ if (test_and_clear_bit(NFSD_FILE_CACHE_UP, &nfsd_file_flags) == 0)
+ return;
lease_unregister_notifier(&nfsd_file_lease_notifier);
- unregister_shrinker(&nfsd_file_shrinker);
+ shrinker_free(nfsd_file_shrinker);
/*
* make sure all callers of nfsd_file_lru_cb are done before
* calling nfsd_file_cache_purge
*/
cancel_delayed_work_sync(&nfsd_filecache_laundrette);
- nfsd_file_cache_purge(NULL);
+ __nfsd_file_cache_purge(NULL);
list_lru_destroy(&nfsd_file_lru);
rcu_barrier();
fsnotify_put_group(nfsd_file_fsnotify_group);
@@ -856,238 +978,453 @@ nfsd_file_cache_shutdown(void)
fsnotify_wait_marks_destroyed();
kmem_cache_destroy(nfsd_file_mark_slab);
nfsd_file_mark_slab = NULL;
- kfree(nfsd_file_hashtbl);
- nfsd_file_hashtbl = NULL;
- destroy_workqueue(nfsd_filecache_wq);
- nfsd_filecache_wq = NULL;
-}
-
-static bool
-nfsd_match_cred(const struct cred *c1, const struct cred *c2)
-{
- int i;
-
- if (!uid_eq(c1->fsuid, c2->fsuid))
- return false;
- if (!gid_eq(c1->fsgid, c2->fsgid))
- return false;
- if (c1->group_info == NULL || c2->group_info == NULL)
- return c1->group_info == c2->group_info;
- if (c1->group_info->ngroups != c2->group_info->ngroups)
- return false;
- for (i = 0; i < c1->group_info->ngroups; i++) {
- if (!gid_eq(c1->group_info->gid[i], c2->group_info->gid[i]))
- return false;
+ rhltable_destroy(&nfsd_file_rhltable);
+
+ for_each_possible_cpu(i) {
+ per_cpu(nfsd_file_cache_hits, i) = 0;
+ per_cpu(nfsd_file_acquisitions, i) = 0;
+ per_cpu(nfsd_file_allocations, i) = 0;
+ per_cpu(nfsd_file_releases, i) = 0;
+ per_cpu(nfsd_file_total_age, i) = 0;
+ per_cpu(nfsd_file_evictions, i) = 0;
}
- return true;
}
static struct nfsd_file *
-nfsd_file_find_locked(struct inode *inode, unsigned int may_flags,
- unsigned int hashval, struct net *net)
+nfsd_file_lookup_locked(const struct net *net, const struct cred *cred,
+ struct inode *inode, unsigned char need,
+ bool want_gc)
{
+ struct rhlist_head *tmp, *list;
struct nfsd_file *nf;
- unsigned char need = may_flags & NFSD_FILE_MAY_MASK;
- hlist_for_each_entry_rcu(nf, &nfsd_file_hashtbl[hashval].nfb_head,
- nf_node, lockdep_is_held(&nfsd_file_hashtbl[hashval].nfb_lock)) {
- if ((need & nf->nf_may) != need)
- continue;
- if (nf->nf_inode != inode)
+ list = rhltable_lookup(&nfsd_file_rhltable, &inode,
+ nfsd_file_rhash_params);
+ rhl_for_each_entry_rcu(nf, tmp, list, nf_rlist) {
+ if (nf->nf_may != need)
continue;
if (nf->nf_net != net)
continue;
- if (!nfsd_match_cred(nf->nf_cred, current_cred()))
+ if (!nfsd_match_cred(nf->nf_cred, cred))
+ continue;
+ if (test_bit(NFSD_FILE_GC, &nf->nf_flags) != want_gc)
+ continue;
+ if (test_bit(NFSD_FILE_HASHED, &nf->nf_flags) == 0)
+ continue;
+
+ if (!nfsd_file_get(nf))
continue;
- if (nfsd_file_get(nf) != NULL)
- return nf;
+ return nf;
}
return NULL;
}
/**
- * nfsd_file_is_cached - are there any cached open files for this fh?
- * @inode: inode of the file to check
+ * nfsd_file_is_cached - are there any cached open files for this inode?
+ * @inode: inode to check
*
- * Scan the hashtable for open files that match this fh. Returns true if there
- * are any, and false if not.
+ * The lookup matches inodes in all net namespaces and is atomic wrt
+ * nfsd_file_acquire().
+ *
+ * Return values:
+ * %true: filecache contains at least one file matching this inode
+ * %false: filecache contains no files matching this inode
*/
bool
nfsd_file_is_cached(struct inode *inode)
{
- bool ret = false;
- struct nfsd_file *nf;
- unsigned int hashval;
-
- hashval = (unsigned int)hash_long(inode->i_ino, NFSD_FILE_HASH_BITS);
+ struct rhlist_head *tmp, *list;
+ struct nfsd_file *nf;
+ bool ret = false;
rcu_read_lock();
- hlist_for_each_entry_rcu(nf, &nfsd_file_hashtbl[hashval].nfb_head,
- nf_node) {
- if (inode == nf->nf_inode) {
+ list = rhltable_lookup(&nfsd_file_rhltable, &inode,
+ nfsd_file_rhash_params);
+ rhl_for_each_entry_rcu(nf, tmp, list, nf_rlist)
+ if (test_bit(NFSD_FILE_GC, &nf->nf_flags)) {
ret = true;
break;
}
- }
rcu_read_unlock();
- trace_nfsd_file_is_cached(inode, hashval, (int)ret);
+
+ trace_nfsd_file_is_cached(inode, (int)ret);
return ret;
}
-__be32
-nfsd_file_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
- unsigned int may_flags, struct nfsd_file **pnf)
+static __be32
+nfsd_file_get_dio_attrs(const struct svc_fh *fhp, struct nfsd_file *nf)
{
- __be32 status;
- struct net *net = SVC_NET(rqstp);
- struct nfsd_file *nf, *new;
- struct inode *inode;
- unsigned int hashval;
- bool retry = true;
+ struct inode *inode = file_inode(nf->nf_file);
+ struct kstat stat;
+ __be32 status;
- /* FIXME: skip this if fh_dentry is already set? */
- status = fh_verify(rqstp, fhp, S_IFREG,
- may_flags|NFSD_MAY_OWNER_OVERRIDE);
+ /* Currently only need to get DIO alignment info for regular files */
+ if (!S_ISREG(inode->i_mode))
+ return nfs_ok;
+
+ status = fh_getattr(fhp, &stat);
if (status != nfs_ok)
return status;
- inode = d_inode(fhp->fh_dentry);
- hashval = (unsigned int)hash_long(inode->i_ino, NFSD_FILE_HASH_BITS);
+ trace_nfsd_file_get_dio_attrs(inode, &stat);
+
+ if (stat.result_mask & STATX_DIOALIGN) {
+ nf->nf_dio_mem_align = stat.dio_mem_align;
+ nf->nf_dio_offset_align = stat.dio_offset_align;
+ }
+ if (stat.result_mask & STATX_DIO_READ_ALIGN)
+ nf->nf_dio_read_offset_align = stat.dio_read_offset_align;
+ else
+ nf->nf_dio_read_offset_align = nf->nf_dio_offset_align;
+
+ return nfs_ok;
+}
+
+static __be32
+nfsd_file_do_acquire(struct svc_rqst *rqstp, struct net *net,
+ struct svc_cred *cred,
+ struct auth_domain *client,
+ struct svc_fh *fhp,
+ unsigned int may_flags, struct file *file,
+ umode_t type, bool want_gc, struct nfsd_file **pnf)
+{
+ unsigned char need = may_flags & NFSD_FILE_MAY_MASK;
+ struct nfsd_file *new, *nf;
+ bool stale_retry = true;
+ bool open_retry = true;
+ struct inode *inode;
+ __be32 status;
+ int ret;
+
retry:
+ if (rqstp)
+ status = fh_verify(rqstp, fhp, type,
+ may_flags|NFSD_MAY_OWNER_OVERRIDE);
+ else
+ status = fh_verify_local(net, cred, client, fhp, type,
+ may_flags|NFSD_MAY_OWNER_OVERRIDE);
+
+ if (status != nfs_ok)
+ return status;
+ inode = d_inode(fhp->fh_dentry);
+
rcu_read_lock();
- nf = nfsd_file_find_locked(inode, may_flags, hashval, net);
+ nf = nfsd_file_lookup_locked(net, current_cred(), inode, need, want_gc);
rcu_read_unlock();
+
if (nf)
goto wait_for_construction;
- new = nfsd_file_alloc(inode, may_flags, hashval, net);
+ new = nfsd_file_alloc(net, inode, need, want_gc);
if (!new) {
- trace_nfsd_file_acquire(rqstp, hashval, inode, may_flags,
- NULL, nfserr_jukebox);
- return nfserr_jukebox;
+ status = nfserr_jukebox;
+ goto out;
}
- spin_lock(&nfsd_file_hashtbl[hashval].nfb_lock);
- nf = nfsd_file_find_locked(inode, may_flags, hashval, net);
- if (nf == NULL)
+ rcu_read_lock();
+ spin_lock(&inode->i_lock);
+ nf = nfsd_file_lookup_locked(net, current_cred(), inode, need, want_gc);
+ if (unlikely(nf)) {
+ spin_unlock(&inode->i_lock);
+ rcu_read_unlock();
+ nfsd_file_free(new);
+ goto wait_for_construction;
+ }
+ nf = new;
+ ret = rhltable_insert(&nfsd_file_rhltable, &nf->nf_rlist,
+ nfsd_file_rhash_params);
+ spin_unlock(&inode->i_lock);
+ rcu_read_unlock();
+ if (likely(ret == 0))
goto open_file;
- spin_unlock(&nfsd_file_hashtbl[hashval].nfb_lock);
- nfsd_file_slab_free(&new->nf_rcu);
+
+ trace_nfsd_file_insert_err(rqstp, inode, may_flags, ret);
+ status = nfserr_jukebox;
+ goto construction_err;
wait_for_construction:
wait_on_bit(&nf->nf_flags, NFSD_FILE_PENDING, TASK_UNINTERRUPTIBLE);
/* Did construction of this file fail? */
if (!test_bit(NFSD_FILE_HASHED, &nf->nf_flags)) {
- if (!retry) {
+ trace_nfsd_file_cons_err(rqstp, inode, may_flags, nf);
+ if (!open_retry) {
status = nfserr_jukebox;
- goto out;
+ goto construction_err;
}
- retry = false;
- nfsd_file_put_noref(nf);
+ nfsd_file_put(nf);
+ open_retry = false;
+ fh_put(fhp);
goto retry;
}
-
this_cpu_inc(nfsd_file_cache_hits);
- if (!(may_flags & NFSD_MAY_NOT_BREAK_LEASE)) {
- bool write = (may_flags & NFSD_MAY_WRITE);
-
- if (test_bit(NFSD_FILE_BREAK_READ, &nf->nf_flags) ||
- (test_bit(NFSD_FILE_BREAK_WRITE, &nf->nf_flags) && write)) {
- status = nfserrno(nfsd_open_break_lease(
- file_inode(nf->nf_file), may_flags));
- if (status == nfs_ok) {
- clear_bit(NFSD_FILE_BREAK_READ, &nf->nf_flags);
- if (write)
- clear_bit(NFSD_FILE_BREAK_WRITE,
- &nf->nf_flags);
- }
- }
+ status = nfserrno(nfsd_open_break_lease(file_inode(nf->nf_file), may_flags));
+ if (status != nfs_ok) {
+ nfsd_file_put(nf);
+ nf = NULL;
}
+
out:
if (status == nfs_ok) {
+ this_cpu_inc(nfsd_file_acquisitions);
+ nfsd_file_check_write_error(nf);
*pnf = nf;
- } else {
- nfsd_file_put(nf);
- nf = NULL;
}
-
- trace_nfsd_file_acquire(rqstp, hashval, inode, may_flags, nf, status);
+ trace_nfsd_file_acquire(rqstp, inode, may_flags, nf, status);
return status;
-open_file:
- nf = new;
- /* Take reference for the hashtable */
- refcount_inc(&nf->nf_ref);
- __set_bit(NFSD_FILE_HASHED, &nf->nf_flags);
- __set_bit(NFSD_FILE_PENDING, &nf->nf_flags);
- list_lru_add(&nfsd_file_lru, &nf->nf_lru);
- hlist_add_head_rcu(&nf->nf_node, &nfsd_file_hashtbl[hashval].nfb_head);
- ++nfsd_file_hashtbl[hashval].nfb_count;
- nfsd_file_hashtbl[hashval].nfb_maxcount = max(nfsd_file_hashtbl[hashval].nfb_maxcount,
- nfsd_file_hashtbl[hashval].nfb_count);
- spin_unlock(&nfsd_file_hashtbl[hashval].nfb_lock);
- if (atomic_long_inc_return(&nfsd_filecache_count) >= NFSD_FILE_LRU_THRESHOLD)
- nfsd_file_gc();
- nf->nf_mark = nfsd_file_mark_find_or_create(nf);
- if (nf->nf_mark)
- status = nfsd_open_verified(rqstp, fhp, S_IFREG,
- may_flags, &nf->nf_file);
- else
+open_file:
+ trace_nfsd_file_alloc(nf);
+
+ if (type == S_IFREG)
+ nf->nf_mark = nfsd_file_mark_find_or_create(inode);
+
+ if (type != S_IFREG || nf->nf_mark) {
+ if (file) {
+ get_file(file);
+ nf->nf_file = file;
+ status = nfs_ok;
+ trace_nfsd_file_opened(nf, status);
+ } else {
+ ret = nfsd_open_verified(fhp, type, may_flags, &nf->nf_file);
+ if (ret == -EOPENSTALE && stale_retry) {
+ stale_retry = false;
+ nfsd_file_unhash(nf);
+ clear_and_wake_up_bit(NFSD_FILE_PENDING,
+ &nf->nf_flags);
+ if (refcount_dec_and_test(&nf->nf_ref))
+ nfsd_file_free(nf);
+ nf = NULL;
+ fh_put(fhp);
+ goto retry;
+ }
+ status = nfserrno(ret);
+ trace_nfsd_file_open(nf, status);
+ if (status == nfs_ok)
+ status = nfsd_file_get_dio_attrs(fhp, nf);
+ }
+ } else
status = nfserr_jukebox;
/*
* If construction failed, or we raced with a call to unlink()
* then unhash.
*/
- if (status != nfs_ok || inode->i_nlink == 0) {
- bool do_free;
- spin_lock(&nfsd_file_hashtbl[hashval].nfb_lock);
- do_free = nfsd_file_unhash(nf);
- spin_unlock(&nfsd_file_hashtbl[hashval].nfb_lock);
- if (do_free)
- nfsd_file_put_noref(nf);
- }
- clear_bit_unlock(NFSD_FILE_PENDING, &nf->nf_flags);
- smp_mb__after_atomic();
- wake_up_bit(&nf->nf_flags, NFSD_FILE_PENDING);
+ if (status != nfs_ok || inode->i_nlink == 0)
+ nfsd_file_unhash(nf);
+ else if (want_gc)
+ nfsd_file_lru_add(nf);
+
+ clear_and_wake_up_bit(NFSD_FILE_PENDING, &nf->nf_flags);
+ if (status == nfs_ok)
+ goto out;
+
+construction_err:
+ if (refcount_dec_and_test(&nf->nf_ref))
+ nfsd_file_free(nf);
+ nf = NULL;
goto out;
}
+/**
+ * nfsd_file_acquire_gc - Get a struct nfsd_file with an open file
+ * @rqstp: the RPC transaction being executed
+ * @fhp: the NFS filehandle of the file to be opened
+ * @may_flags: NFSD_MAY_ settings for the file
+ * @pnf: OUT: new or found "struct nfsd_file" object
+ *
+ * The nfsd_file object returned by this API is reference-counted
+ * and garbage-collected. The object is retained for a few
+ * seconds after the final nfsd_file_put() in case the caller
+ * wants to re-use it.
+ *
+ * Return values:
+ * %nfs_ok - @pnf points to an nfsd_file with its reference
+ * count boosted.
+ *
+ * On error, an nfsstat value in network byte order is returned.
+ */
+__be32
+nfsd_file_acquire_gc(struct svc_rqst *rqstp, struct svc_fh *fhp,
+ unsigned int may_flags, struct nfsd_file **pnf)
+{
+ return nfsd_file_do_acquire(rqstp, SVC_NET(rqstp), NULL, NULL,
+ fhp, may_flags, NULL, S_IFREG, true, pnf);
+}
+
+/**
+ * nfsd_file_acquire - Get a struct nfsd_file with an open file
+ * @rqstp: the RPC transaction being executed
+ * @fhp: the NFS filehandle of the file to be opened
+ * @may_flags: NFSD_MAY_ settings for the file
+ * @pnf: OUT: new or found "struct nfsd_file" object
+ *
+ * The nfsd_file object returned by this API is reference-counted
+ * but not garbage-collected. The object is unhashed after the
+ * final nfsd_file_put().
+ *
+ * Return values:
+ * %nfs_ok - @pnf points to an nfsd_file with its reference
+ * count boosted.
+ *
+ * On error, an nfsstat value in network byte order is returned.
+ */
+__be32
+nfsd_file_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
+ unsigned int may_flags, struct nfsd_file **pnf)
+{
+ return nfsd_file_do_acquire(rqstp, SVC_NET(rqstp), NULL, NULL,
+ fhp, may_flags, NULL, S_IFREG, false, pnf);
+}
+
+/**
+ * nfsd_file_acquire_local - Get a struct nfsd_file with an open file for localio
+ * @net: The network namespace in which to perform a lookup
+ * @cred: the user credential with which to validate access
+ * @client: the auth_domain for LOCALIO lookup
+ * @fhp: the NFS filehandle of the file to be opened
+ * @may_flags: NFSD_MAY_ settings for the file
+ * @pnf: OUT: new or found "struct nfsd_file" object
+ *
+ * This file lookup interface provides access to a file given the
+ * filehandle and credential. No connection-based authorisation
+ * is performed and in that way it is quite different to other
+ * file access mediated by nfsd. It allows a kernel module such as the NFS
+ * client to reach across network and filesystem namespaces to access
+ * a file. The security implications of this should be carefully
+ * considered before use.
+ *
+ * The nfsd_file object returned by this API is reference-counted
+ * but not garbage-collected. The object is unhashed after the
+ * final nfsd_file_put().
+ *
+ * Return values:
+ * %nfs_ok - @pnf points to an nfsd_file with its reference
+ * count boosted.
+ *
+ * On error, an nfsstat value in network byte order is returned.
+ */
+__be32
+nfsd_file_acquire_local(struct net *net, struct svc_cred *cred,
+ struct auth_domain *client, struct svc_fh *fhp,
+ unsigned int may_flags, struct nfsd_file **pnf)
+{
+ /*
+ * Save creds before calling nfsd_file_do_acquire() (which calls
+ * nfsd_setuser). Important because caller (LOCALIO) is from
+ * client context.
+ */
+ const struct cred *save_cred = get_current_cred();
+ __be32 beres;
+
+ beres = nfsd_file_do_acquire(NULL, net, cred, client, fhp, may_flags,
+ NULL, S_IFREG, false, pnf);
+ put_cred(revert_creds(save_cred));
+ return beres;
+}
+
+/**
+ * nfsd_file_acquire_opened - Get a struct nfsd_file using existing open file
+ * @rqstp: the RPC transaction being executed
+ * @fhp: the NFS filehandle of the file just created
+ * @may_flags: NFSD_MAY_ settings for the file
+ * @file: cached, already-open file (may be NULL)
+ * @pnf: OUT: new or found "struct nfsd_file" object
+ *
+ * Acquire a nfsd_file object that is not GC'ed. If one doesn't already exist,
+ * and @file is non-NULL, use it to instantiate a new nfsd_file instead of
+ * opening a new one.
+ *
+ * Return values:
+ * %nfs_ok - @pnf points to an nfsd_file with its reference
+ * count boosted.
+ *
+ * On error, an nfsstat value in network byte order is returned.
+ */
+__be32
+nfsd_file_acquire_opened(struct svc_rqst *rqstp, struct svc_fh *fhp,
+ unsigned int may_flags, struct file *file,
+ struct nfsd_file **pnf)
+{
+ return nfsd_file_do_acquire(rqstp, SVC_NET(rqstp), NULL, NULL,
+ fhp, may_flags, file, S_IFREG, false, pnf);
+}
+
+/**
+ * nfsd_file_acquire_dir - Get a struct nfsd_file with an open directory
+ * @rqstp: the RPC transaction being executed
+ * @fhp: the NFS filehandle of the file to be opened
+ * @pnf: OUT: new or found "struct nfsd_file" object
+ *
+ * The nfsd_file object returned by this API is reference-counted
+ * but not garbage-collected. The object is unhashed after the
+ * final nfsd_file_put(). This opens directories only, and only
+ * in O_RDONLY mode.
+ *
+ * Return values:
+ * %nfs_ok - @pnf points to an nfsd_file with its reference
+ * count boosted.
+ *
+ * On error, an nfsstat value in network byte order is returned.
+ */
+__be32
+nfsd_file_acquire_dir(struct svc_rqst *rqstp, struct svc_fh *fhp,
+ struct nfsd_file **pnf)
+{
+ return nfsd_file_do_acquire(rqstp, SVC_NET(rqstp), NULL, NULL, fhp,
+ NFSD_MAY_READ|NFSD_MAY_64BIT_COOKIE,
+ NULL, S_IFDIR, false, pnf);
+}
+
/*
* Note that fields may be added, removed or reordered in the future. Programs
* scraping this file for info should test the labels to ensure they're
* getting the correct field.
*/
-static int nfsd_file_cache_stats_show(struct seq_file *m, void *v)
+int nfsd_file_cache_stats_show(struct seq_file *m, void *v)
{
- unsigned int i, count = 0, longest = 0;
- unsigned long hits = 0;
+ unsigned long allocations = 0, releases = 0, evictions = 0;
+ unsigned long hits = 0, acquisitions = 0;
+ unsigned int i, count = 0, buckets = 0;
+ unsigned long lru = 0, total_age = 0;
- /*
- * No need for spinlocks here since we're not terribly interested in
- * accuracy. We do take the nfsd_mutex simply to ensure that we
- * don't end up racing with server shutdown
- */
+ /* Serialize with server shutdown */
mutex_lock(&nfsd_mutex);
- if (nfsd_file_hashtbl) {
- for (i = 0; i < NFSD_FILE_HASH_SIZE; i++) {
- count += nfsd_file_hashtbl[i].nfb_count;
- longest = max(longest, nfsd_file_hashtbl[i].nfb_count);
- }
+ if (test_bit(NFSD_FILE_CACHE_UP, &nfsd_file_flags) == 1) {
+ struct bucket_table *tbl;
+ struct rhashtable *ht;
+
+ lru = list_lru_count(&nfsd_file_lru);
+
+ rcu_read_lock();
+ ht = &nfsd_file_rhltable.ht;
+ count = atomic_read(&ht->nelems);
+ tbl = rht_dereference_rcu(ht->tbl, ht);
+ buckets = tbl->size;
+ rcu_read_unlock();
}
mutex_unlock(&nfsd_mutex);
- for_each_possible_cpu(i)
+ for_each_possible_cpu(i) {
hits += per_cpu(nfsd_file_cache_hits, i);
+ acquisitions += per_cpu(nfsd_file_acquisitions, i);
+ allocations += per_cpu(nfsd_file_allocations, i);
+ releases += per_cpu(nfsd_file_releases, i);
+ total_age += per_cpu(nfsd_file_total_age, i);
+ evictions += per_cpu(nfsd_file_evictions, i);
+ }
- seq_printf(m, "total entries: %u\n", count);
- seq_printf(m, "longest chain: %u\n", longest);
+ seq_printf(m, "total inodes: %u\n", count);
+ seq_printf(m, "hash buckets: %u\n", buckets);
+ seq_printf(m, "lru entries: %lu\n", lru);
seq_printf(m, "cache hits: %lu\n", hits);
+ seq_printf(m, "acquisitions: %lu\n", acquisitions);
+ seq_printf(m, "allocations: %lu\n", allocations);
+ seq_printf(m, "releases: %lu\n", releases);
+ seq_printf(m, "evictions: %lu\n", evictions);
+ if (releases)
+ seq_printf(m, "mean age (ms): %ld\n", total_age / releases);
+ else
+ seq_printf(m, "mean age (ms): -\n");
return 0;
}
-
-int nfsd_file_cache_stats_open(struct inode *inode, struct file *file)
-{
- return single_open(file, nfsd_file_cache_stats_show, NULL);
-}