From 182d919b84902eece162c63ed3d476c8016b4197 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 19 Feb 2015 23:47:31 +0000 Subject: FS-Cache: Count culled objects and objects rejected due to lack of space Count the number of objects that get culled by the cache backend and the number of objects that the cache backend declines to instantiate due to lack of space in the cache. These numbers are made available through /proc/fs/fscache/stats Signed-off-by: David Howells Reviewed-by: Steve Dickson Acked-by: Jeff Layton --- Documentation/filesystems/caching/backend-api.txt | 23 +++++++++++ Documentation/filesystems/caching/fscache.txt | 4 ++ fs/cachefiles/internal.h | 1 - fs/cachefiles/namei.c | 33 ++++++++++------ fs/fscache/internal.h | 5 +++ fs/fscache/object.c | 47 +++++++++++++++++++++++ fs/fscache/stats.c | 10 +++++ include/linux/fscache-cache.h | 12 ++++++ 8 files changed, 122 insertions(+), 13 deletions(-) diff --git a/Documentation/filesystems/caching/backend-api.txt b/Documentation/filesystems/caching/backend-api.txt index 277d1e810670..c0bd5677271b 100644 --- a/Documentation/filesystems/caching/backend-api.txt +++ b/Documentation/filesystems/caching/backend-api.txt @@ -676,6 +676,29 @@ FS-Cache provides some utilities that a cache backend may make use of: as possible. + (*) Indicate that a stale object was found and discarded: + + void fscache_object_retrying_stale(struct fscache_object *object); + + This is called to indicate that the lookup procedure found an object in + the cache that the netfs decided was stale. The object has been + discarded from the cache and the lookup will be performed again. + + + (*) Indicate that the caching backend killed an object: + + void fscache_object_mark_killed(struct fscache_object *object, + enum fscache_why_object_killed why); + + This is called to indicate that the cache backend preemptively killed an + object. The why parameter should be set to indicate the reason: + + FSCACHE_OBJECT_IS_STALE - the object was stale and needs discarding. + FSCACHE_OBJECT_NO_SPACE - there was insufficient cache space + FSCACHE_OBJECT_WAS_RETIRED - the object was retired when relinquished. + FSCACHE_OBJECT_WAS_CULLED - the object was culled to make space. + + (*) Get and release references on a retrieval record: void fscache_get_retrieval(struct fscache_retrieval *op); diff --git a/Documentation/filesystems/caching/fscache.txt b/Documentation/filesystems/caching/fscache.txt index 770267af5b3e..66fa7fbccfa4 100644 --- a/Documentation/filesystems/caching/fscache.txt +++ b/Documentation/filesystems/caching/fscache.txt @@ -303,6 +303,10 @@ proc files. wrp=N Number of in-progress write_page() cache ops ucp=N Number of in-progress uncache_page() cache ops dsp=N Number of in-progress dissociate_pages() cache ops + CacheEv nsp=N Number of object lookups/creations rejected due to lack of space + stl=N Number of stale objects deleted + rtr=N Number of objects retired when relinquished + cul=N Number of objects culled (*) /proc/fs/fscache/histogram diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h index 8c52472d2efa..aecd0859eacb 100644 --- a/fs/cachefiles/internal.h +++ b/fs/cachefiles/internal.h @@ -43,7 +43,6 @@ struct cachefiles_object { loff_t i_size; /* object size */ unsigned long flags; #define CACHEFILES_OBJECT_ACTIVE 0 /* T if marked active */ -#define CACHEFILES_OBJECT_BURIED 1 /* T if preemptively buried */ atomic_t usage; /* object usage count */ uint8_t type; /* object type */ uint8_t new; /* T if object new */ diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index 7f8e83f9d74e..1d60195fc518 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -97,7 +97,8 @@ static noinline void cachefiles_printk_object(struct cachefiles_object *object, * call vfs_unlink(), vfs_rmdir() or vfs_rename() */ static void cachefiles_mark_object_buried(struct cachefiles_cache *cache, - struct dentry *dentry) + struct dentry *dentry, + enum fscache_why_object_killed why) { struct cachefiles_object *object; struct rb_node *p; @@ -132,8 +133,9 @@ found_dentry: pr_err("\n"); pr_err("Error: Can't preemptively bury live object\n"); cachefiles_printk_object(object, NULL); - } else if (test_and_set_bit(CACHEFILES_OBJECT_BURIED, &object->flags)) { - pr_err("Error: Object already preemptively buried\n"); + } else { + if (why != FSCACHE_OBJECT_IS_STALE) + fscache_object_mark_killed(&object->fscache, why); } write_unlock(&cache->active_lock); @@ -265,7 +267,8 @@ requeue: static int cachefiles_bury_object(struct cachefiles_cache *cache, struct dentry *dir, struct dentry *rep, - bool preemptive) + bool preemptive, + enum fscache_why_object_killed why) { struct dentry *grave, *trap; struct path path, path_to_graveyard; @@ -289,7 +292,7 @@ static int cachefiles_bury_object(struct cachefiles_cache *cache, ret = vfs_unlink(dir->d_inode, rep, NULL); if (preemptive) - cachefiles_mark_object_buried(cache, rep); + cachefiles_mark_object_buried(cache, rep, why); } mutex_unlock(&dir->d_inode->i_mutex); @@ -394,7 +397,7 @@ try_again: "Rename failed with error %d", ret); if (preemptive) - cachefiles_mark_object_buried(cache, rep); + cachefiles_mark_object_buried(cache, rep, why); } unlock_rename(cache->graveyard, dir); @@ -422,7 +425,7 @@ int cachefiles_delete_object(struct cachefiles_cache *cache, mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT); - if (test_bit(CACHEFILES_OBJECT_BURIED, &object->flags)) { + if (test_bit(FSCACHE_OBJECT_KILLED_BY_CACHE, &object->fscache.flags)) { /* object allocation for the same key preemptively deleted this * object's file so that it could create its own file */ _debug("object preemptively buried"); @@ -433,7 +436,8 @@ int cachefiles_delete_object(struct cachefiles_cache *cache, * may have been renamed */ if (dir == object->dentry->d_parent) { ret = cachefiles_bury_object(cache, dir, - object->dentry, false); + object->dentry, false, + FSCACHE_OBJECT_WAS_RETIRED); } else { /* it got moved, presumably by cachefilesd culling it, * so it's no longer in the key path and we can ignore @@ -522,7 +526,7 @@ lookup_again: if (!next->d_inode) { ret = cachefiles_has_space(cache, 1, 0); if (ret < 0) - goto create_error; + goto no_space_error; path.dentry = dir; ret = security_path_mkdir(&path, next, 0); @@ -551,7 +555,7 @@ lookup_again: if (!next->d_inode) { ret = cachefiles_has_space(cache, 1, 0); if (ret < 0) - goto create_error; + goto no_space_error; path.dentry = dir; ret = security_path_mknod(&path, next, S_IFREG, 0); @@ -602,7 +606,8 @@ lookup_again: * mutex) */ object->dentry = NULL; - ret = cachefiles_bury_object(cache, dir, next, true); + ret = cachefiles_bury_object(cache, dir, next, true, + FSCACHE_OBJECT_IS_STALE); dput(next); next = NULL; @@ -610,6 +615,7 @@ lookup_again: goto delete_error; _debug("redo lookup"); + fscache_object_retrying_stale(&object->fscache); goto lookup_again; } } @@ -662,6 +668,8 @@ lookup_again: _leave(" = 0 [%lu]", object->dentry->d_inode->i_ino); return 0; +no_space_error: + fscache_object_mark_killed(&object->fscache, FSCACHE_OBJECT_NO_SPACE); create_error: _debug("create error %d", ret); if (ret == -EIO) @@ -927,7 +935,8 @@ int cachefiles_cull(struct cachefiles_cache *cache, struct dentry *dir, /* actually remove the victim (drops the dir mutex) */ _debug("bury"); - ret = cachefiles_bury_object(cache, dir, victim, false); + ret = cachefiles_bury_object(cache, dir, victim, false, + FSCACHE_OBJECT_WAS_CULLED); if (ret < 0) goto error; diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h index 7872a62ef30c..3063a58b7d3d 100644 --- a/fs/fscache/internal.h +++ b/fs/fscache/internal.h @@ -271,6 +271,11 @@ extern atomic_t fscache_n_cop_write_page; extern atomic_t fscache_n_cop_uncache_page; extern atomic_t fscache_n_cop_dissociate_pages; +extern atomic_t fscache_n_cache_no_space_reject; +extern atomic_t fscache_n_cache_stale_objects; +extern atomic_t fscache_n_cache_retired_objects; +extern atomic_t fscache_n_cache_culled_objects; + static inline void fscache_stat(atomic_t *stat) { atomic_inc(stat); diff --git a/fs/fscache/object.c b/fs/fscache/object.c index da032daf0e0d..12bb468bf0ae 100644 --- a/fs/fscache/object.c +++ b/fs/fscache/object.c @@ -1016,3 +1016,50 @@ static const struct fscache_state *fscache_update_object(struct fscache_object * _leave(""); return transit_to(WAIT_FOR_CMD); } + +/** + * fscache_object_retrying_stale - Note retrying stale object + * @object: The object that will be retried + * + * Note that an object lookup found an on-disk object that was adjudged to be + * stale and has been deleted. The lookup will be retried. + */ +void fscache_object_retrying_stale(struct fscache_object *object) +{ + fscache_stat(&fscache_n_cache_no_space_reject); +} +EXPORT_SYMBOL(fscache_object_retrying_stale); + +/** + * fscache_object_mark_killed - Note that an object was killed + * @object: The object that was culled + * @why: The reason the object was killed. + * + * Note that an object was killed. Returns true if the object was + * already marked killed, false if it wasn't. + */ +void fscache_object_mark_killed(struct fscache_object *object, + enum fscache_why_object_killed why) +{ + if (test_and_set_bit(FSCACHE_OBJECT_KILLED_BY_CACHE, &object->flags)) { + pr_err("Error: Object already killed by cache [%s]\n", + object->cache->identifier); + return; + } + + switch (why) { + case FSCACHE_OBJECT_NO_SPACE: + fscache_stat(&fscache_n_cache_no_space_reject); + break; + case FSCACHE_OBJECT_IS_STALE: + fscache_stat(&fscache_n_cache_stale_objects); + break; + case FSCACHE_OBJECT_WAS_RETIRED: + fscache_stat(&fscache_n_cache_retired_objects); + break; + case FSCACHE_OBJECT_WAS_CULLED: + fscache_stat(&fscache_n_cache_culled_objects); + break; + } +} +EXPORT_SYMBOL(fscache_object_mark_killed); diff --git a/fs/fscache/stats.c b/fs/fscache/stats.c index 40d13c70ef51..3a722e8f2307 100644 --- a/fs/fscache/stats.c +++ b/fs/fscache/stats.c @@ -130,6 +130,11 @@ atomic_t fscache_n_cop_write_page; atomic_t fscache_n_cop_uncache_page; atomic_t fscache_n_cop_dissociate_pages; +atomic_t fscache_n_cache_no_space_reject; +atomic_t fscache_n_cache_stale_objects; +atomic_t fscache_n_cache_retired_objects; +atomic_t fscache_n_cache_culled_objects; + /* * display the general statistics */ @@ -271,6 +276,11 @@ static int fscache_stats_show(struct seq_file *m, void *v) atomic_read(&fscache_n_cop_write_page), atomic_read(&fscache_n_cop_uncache_page), atomic_read(&fscache_n_cop_dissociate_pages)); + seq_printf(m, "CacheEv: nsp=%d stl=%d rtr=%d cul=%d\n", + atomic_read(&fscache_n_cache_no_space_reject), + atomic_read(&fscache_n_cache_stale_objects), + atomic_read(&fscache_n_cache_retired_objects), + atomic_read(&fscache_n_cache_culled_objects)); return 0; } diff --git a/include/linux/fscache-cache.h b/include/linux/fscache-cache.h index 771484993ca7..c9dafdaf3347 100644 --- a/include/linux/fscache-cache.h +++ b/include/linux/fscache-cache.h @@ -371,6 +371,7 @@ struct fscache_object { #define FSCACHE_OBJECT_IS_LOOKED_UP 4 /* T if object has been looked up */ #define FSCACHE_OBJECT_IS_AVAILABLE 5 /* T if object has become active */ #define FSCACHE_OBJECT_RETIRED 6 /* T if object was retired on relinquishment */ +#define FSCACHE_OBJECT_KILLED_BY_CACHE 7 /* T if object was killed by the cache */ struct list_head cache_link; /* link in cache->object_list */ struct hlist_node cookie_link; /* link in cookie->backing_objects */ @@ -551,4 +552,15 @@ extern enum fscache_checkaux fscache_check_aux(struct fscache_object *object, const void *data, uint16_t datalen); +extern void fscache_object_retrying_stale(struct fscache_object *object); + +enum fscache_why_object_killed { + FSCACHE_OBJECT_IS_STALE, + FSCACHE_OBJECT_NO_SPACE, + FSCACHE_OBJECT_WAS_RETIRED, + FSCACHE_OBJECT_WAS_CULLED, +}; +extern void fscache_object_mark_killed(struct fscache_object *object, + enum fscache_why_object_killed why); + #endif /* _LINUX_FSCACHE_CACHE_H */ -- cgit From 3c3059841a9b99fa8dd8d878607deceec1e363e9 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 24 Feb 2015 10:39:28 +0000 Subject: FS-Cache: Move fscache_report_unexpected_submission() to make it more available Move fscache_report_unexpected_submission() up within operation.c so that it can be called from fscache_submit_exclusive_op() too. Signed-off-by: David Howells Reviewed-by: Steve Dickson Acked-by: Jeff Layton --- fs/fscache/operation.c | 74 +++++++++++++++++++++++++------------------------- 1 file changed, 37 insertions(+), 37 deletions(-) diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c index e7b87a0e5185..68ead8482617 100644 --- a/fs/fscache/operation.c +++ b/fs/fscache/operation.c @@ -75,6 +75,43 @@ static void fscache_run_op(struct fscache_object *object, fscache_stat(&fscache_n_op_run); } +/* + * report an unexpected submission + */ +static void fscache_report_unexpected_submission(struct fscache_object *object, + struct fscache_operation *op, + const struct fscache_state *ostate) +{ + static bool once_only; + struct fscache_operation *p; + unsigned n; + + if (once_only) + return; + once_only = true; + + kdebug("unexpected submission OP%x [OBJ%x %s]", + op->debug_id, object->debug_id, object->state->name); + kdebug("objstate=%s [%s]", object->state->name, ostate->name); + kdebug("objflags=%lx", object->flags); + kdebug("objevent=%lx [%lx]", object->events, object->event_mask); + kdebug("ops=%u inp=%u exc=%u", + object->n_ops, object->n_in_progress, object->n_exclusive); + + if (!list_empty(&object->pending_ops)) { + n = 0; + list_for_each_entry(p, &object->pending_ops, pend_link) { + ASSERTCMP(p->object, ==, object); + kdebug("%p %p", op->processor, op->release); + n++; + } + + kdebug("n=%u", n); + } + + dump_stack(); +} + /* * submit an exclusive operation for an object * - other ops are excluded from running simultaneously with this one @@ -138,43 +175,6 @@ int fscache_submit_exclusive_op(struct fscache_object *object, return ret; } -/* - * report an unexpected submission - */ -static void fscache_report_unexpected_submission(struct fscache_object *object, - struct fscache_operation *op, - const struct fscache_state *ostate) -{ - static bool once_only; - struct fscache_operation *p; - unsigned n; - - if (once_only) - return; - once_only = true; - - kdebug("unexpected submission OP%x [OBJ%x %s]", - op->debug_id, object->debug_id, object->state->name); - kdebug("objstate=%s [%s]", object->state->name, ostate->name); - kdebug("objflags=%lx", object->flags); - kdebug("objevent=%lx [%lx]", object->events, object->event_mask); - kdebug("ops=%u inp=%u exc=%u", - object->n_ops, object->n_in_progress, object->n_exclusive); - - if (!list_empty(&object->pending_ops)) { - n = 0; - list_for_each_entry(p, &object->pending_ops, pend_link) { - ASSERTCMP(p->object, ==, object); - kdebug("%p %p", op->processor, op->release); - n++; - } - - kdebug("n=%u", n); - } - - dump_stack(); -} - /* * submit an operation for an object * - objects may be submitted only in the following states: -- cgit From 30ceec6284129662efc3a1e7675b2bd857a046fe Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 24 Feb 2015 10:05:27 +0000 Subject: FS-Cache: When submitting an op, cancel it if the target object is dying When submitting an operation, prefer to cancel the operation immediately rather than queuing it for later processing if the object is marked as dying (ie. the object state machine has reached the KILL_OBJECT state). Whilst we're at it, change the series of related test_bit() calls into a READ_ONCE() and bitwise-AND operators to reduce the number of load instructions (test_bit() has a volatile address). Signed-off-by: David Howells Reviewed-by: Steve Dickson Acked-by: Jeff Layton --- fs/fscache/operation.c | 47 +++++++++++++++++++++++++++---------------- include/linux/fscache-cache.h | 9 +++++++-- 2 files changed, 37 insertions(+), 19 deletions(-) diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c index 68ead8482617..dec6defe3be3 100644 --- a/fs/fscache/operation.c +++ b/fs/fscache/operation.c @@ -120,6 +120,8 @@ static void fscache_report_unexpected_submission(struct fscache_object *object, int fscache_submit_exclusive_op(struct fscache_object *object, struct fscache_operation *op) { + const struct fscache_state *ostate; + unsigned long flags; int ret; _enter("{OBJ%x OP%x},", object->debug_id, op->debug_id); @@ -132,8 +134,19 @@ int fscache_submit_exclusive_op(struct fscache_object *object, ASSERTCMP(object->n_ops, >=, object->n_exclusive); ASSERT(list_empty(&op->pend_link)); + ostate = object->state; + smp_rmb(); + op->state = FSCACHE_OP_ST_PENDING; - if (fscache_object_is_active(object)) { + flags = READ_ONCE(object->flags); + if (unlikely(!(flags & BIT(FSCACHE_OBJECT_IS_LIVE)))) { + fscache_stat(&fscache_n_op_rejected); + op->state = FSCACHE_OP_ST_CANCELLED; + ret = -ENOBUFS; + } else if (unlikely(fscache_cache_is_broken(object))) { + op->state = FSCACHE_OP_ST_CANCELLED; + ret = -EIO; + } else if (flags & BIT(FSCACHE_OBJECT_IS_AVAILABLE)) { op->object = object; object->n_ops++; object->n_exclusive++; /* reads and writes must wait */ @@ -155,7 +168,7 @@ int fscache_submit_exclusive_op(struct fscache_object *object, /* need to issue a new write op after this */ clear_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags); ret = 0; - } else if (test_bit(FSCACHE_OBJECT_IS_LOOKED_UP, &object->flags)) { + } else if (flags & BIT(FSCACHE_OBJECT_IS_LOOKED_UP)) { op->object = object; object->n_ops++; object->n_exclusive++; /* reads and writes must wait */ @@ -164,11 +177,9 @@ int fscache_submit_exclusive_op(struct fscache_object *object, fscache_stat(&fscache_n_op_pend); ret = 0; } else { - /* If we're in any other state, there must have been an I/O - * error of some nature. - */ - ASSERT(test_bit(FSCACHE_IOERROR, &object->cache->flags)); - ret = -EIO; + fscache_report_unexpected_submission(object, op, ostate); + op->state = FSCACHE_OP_ST_CANCELLED; + ret = -ENOBUFS; } spin_unlock(&object->lock); @@ -187,6 +198,7 @@ int fscache_submit_op(struct fscache_object *object, struct fscache_operation *op) { const struct fscache_state *ostate; + unsigned long flags; int ret; _enter("{OBJ%x OP%x},{%u}", @@ -204,7 +216,15 @@ int fscache_submit_op(struct fscache_object *object, smp_rmb(); op->state = FSCACHE_OP_ST_PENDING; - if (fscache_object_is_active(object)) { + flags = READ_ONCE(object->flags); + if (unlikely(!(flags & BIT(FSCACHE_OBJECT_IS_LIVE)))) { + fscache_stat(&fscache_n_op_rejected); + op->state = FSCACHE_OP_ST_CANCELLED; + ret = -ENOBUFS; + } else if (unlikely(fscache_cache_is_broken(object))) { + op->state = FSCACHE_OP_ST_CANCELLED; + ret = -EIO; + } else if (flags & BIT(FSCACHE_OBJECT_IS_AVAILABLE)) { op->object = object; object->n_ops++; @@ -222,25 +242,18 @@ int fscache_submit_op(struct fscache_object *object, fscache_run_op(object, op); } ret = 0; - } else if (test_bit(FSCACHE_OBJECT_IS_LOOKED_UP, &object->flags)) { + } else if (flags & BIT(FSCACHE_OBJECT_IS_LOOKED_UP)) { op->object = object; object->n_ops++; atomic_inc(&op->usage); list_add_tail(&op->pend_link, &object->pending_ops); fscache_stat(&fscache_n_op_pend); ret = 0; - } else if (fscache_object_is_dying(object)) { - fscache_stat(&fscache_n_op_rejected); - op->state = FSCACHE_OP_ST_CANCELLED; - ret = -ENOBUFS; - } else if (!test_bit(FSCACHE_IOERROR, &object->cache->flags)) { + } else { fscache_report_unexpected_submission(object, op, ostate); ASSERT(!fscache_object_is_active(object)); op->state = FSCACHE_OP_ST_CANCELLED; ret = -ENOBUFS; - } else { - op->state = FSCACHE_OP_ST_CANCELLED; - ret = -ENOBUFS; } spin_unlock(&object->lock); diff --git a/include/linux/fscache-cache.h b/include/linux/fscache-cache.h index c9dafdaf3347..2e83a141e465 100644 --- a/include/linux/fscache-cache.h +++ b/include/linux/fscache-cache.h @@ -411,17 +411,22 @@ static inline bool fscache_object_is_available(struct fscache_object *object) return test_bit(FSCACHE_OBJECT_IS_AVAILABLE, &object->flags); } +static inline bool fscache_cache_is_broken(struct fscache_object *object) +{ + return test_bit(FSCACHE_IOERROR, &object->cache->flags); +} + static inline bool fscache_object_is_active(struct fscache_object *object) { return fscache_object_is_available(object) && fscache_object_is_live(object) && - !test_bit(FSCACHE_IOERROR, &object->cache->flags); + !fscache_cache_is_broken(object); } static inline bool fscache_object_is_dead(struct fscache_object *object) { return fscache_object_is_dying(object) && - test_bit(FSCACHE_IOERROR, &object->cache->flags); + fscache_cache_is_broken(object); } /** -- cgit From 6515d1dbf424c5c3b94d44e9c7f581026e7fc0d3 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 25 Feb 2015 11:53:57 +0000 Subject: FS-Cache: Handle a new operation submitted against a killed object Reject new operations that are being submitted against an object if that object has failed its lookup or creation states or has been killed by the cache backend for some other reason, such as having been culled. Signed-off-by: David Howells Reviewed-by: Steve Dickson Acked-by: Jeff Layton --- fs/fscache/object.c | 2 ++ fs/fscache/operation.c | 6 ++++++ 2 files changed, 8 insertions(+) diff --git a/fs/fscache/object.c b/fs/fscache/object.c index 12bb468bf0ae..9b79fc9a1464 100644 --- a/fs/fscache/object.c +++ b/fs/fscache/object.c @@ -610,6 +610,8 @@ static const struct fscache_state *fscache_lookup_failure(struct fscache_object object->cache->ops->lookup_complete(object); fscache_stat_d(&fscache_n_cop_lookup_complete); + set_bit(FSCACHE_OBJECT_KILLED_BY_CACHE, &object->flags); + cookie = object->cookie; set_bit(FSCACHE_COOKIE_UNAVAILABLE, &cookie->flags); if (test_and_clear_bit(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags)) diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c index dec6defe3be3..18658fffbba1 100644 --- a/fs/fscache/operation.c +++ b/fs/fscache/operation.c @@ -176,6 +176,9 @@ int fscache_submit_exclusive_op(struct fscache_object *object, list_add_tail(&op->pend_link, &object->pending_ops); fscache_stat(&fscache_n_op_pend); ret = 0; + } else if (flags & BIT(FSCACHE_OBJECT_KILLED_BY_CACHE)) { + op->state = FSCACHE_OP_ST_CANCELLED; + ret = -ENOBUFS; } else { fscache_report_unexpected_submission(object, op, ostate); op->state = FSCACHE_OP_ST_CANCELLED; @@ -249,6 +252,9 @@ int fscache_submit_op(struct fscache_object *object, list_add_tail(&op->pend_link, &object->pending_ops); fscache_stat(&fscache_n_op_pend); ret = 0; + } else if (flags & BIT(FSCACHE_OBJECT_KILLED_BY_CACHE)) { + op->state = FSCACHE_OP_ST_CANCELLED; + ret = -ENOBUFS; } else { fscache_report_unexpected_submission(object, op, ostate); ASSERT(!fscache_object_is_active(object)); -- cgit From f09b443d0e09f37121c55d7f83056f6ebff6ab4f Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 24 Feb 2015 10:05:28 +0000 Subject: FS-Cache: Synchronise object death state change vs operation submission When an object is being marked as no longer live, do this under the object spinlock to prevent a race with operation submission targeted on that object. The problem occurs due to the following pair of intertwined sequences when the cache tries to create an object that would take it over the hard available space limit: NETFS INTERFACE =============== (A) The netfs calls fscache_acquire_cookie(). object creation is deferred to the object state machine and the netfs is allowed to continue. OBJECT STATE MACHINE KTHREAD ============================ (1) The object is looked up on disk by fscache_look_up_object() calling cachefiles_walk_to_object(). The latter finds that the object is not yet represented on disk and calls fscache_object_lookup_negative(). (2) fscache_object_lookup_negative() sets FSCACHE_COOKIE_NO_DATA_YET and clears FSCACHE_COOKIE_LOOKING_UP, thus allowing the netfs to start queuing read operations. (B) The netfs calls fscache_read_or_alloc_pages(). This calls fscache_wait_for_deferred_lookup() which sees FSCACHE_COOKIE_LOOKING_UP become clear, allowing the read to begin. (C) A read operation is set up and passed to fscache_submit_op() to deal with. (3) cachefiles_walk_to_object() calls cachefiles_has_space(), which fails (or one of the file operations to create stuff fails). cachefiles returns an error to fscache. (4) fscache_look_up_object() transits to the LOOKUP_FAILURE state, (5) fscache_lookup_failure() sets FSCACHE_OBJECT_LOOKED_UP and FSCACHE_COOKIE_UNAVAILABLE and clears FSCACHE_COOKIE_LOOKING_UP then transits to the KILL_OBJECT state. (6) fscache_kill_object() clears FSCACHE_OBJECT_IS_LIVE in an attempt to reject any further requests from the netfs. (7) object->n_ops is examined and found to be 0. fscache_kill_object() transits to the DROP_OBJECT state. (D) fscache_submit_op() locks the object spinlock, sees if it can dispatch the op immediately by calling fscache_object_is_active() - which fails since FSCACHE_OBJECT_IS_AVAILABLE has not yet been set. (E) fscache_submit_op() then tests FSCACHE_OBJECT_LOOKED_UP - which is set. It then queues the object and increments object->n_ops. (8) fscache_drop_object() releases the object and eventually fscache_put_object() calls cachefiles_put_object() which suffers an assertion failure here: ASSERTCMP(object->fscache.n_ops, ==, 0); Locking the object spinlock in step (6) around the clearance of FSCACHE_OBJECT_IS_LIVE ensures that the the decision trees in fscache_submit_op() and fscache_submit_exclusive_op() don't see the IS_LIVE flag being cleared mid-decision: either the op is queued before step (7) - in which case fscache_kill_object() will see n_ops>0 and will deal with the op - or the op will be rejected. This, combined with rejecting op submission if the target object is dying, fix the problem. The problem shows up as the following oops: CacheFiles: Assertion failed CacheFiles: 1 == 0 is false ------------[ cut here ]------------ kernel BUG at ../fs/cachefiles/interface.c:339! ... RIP: 0010:[] [] cachefiles_put_object+0x2a4/0x301 [cachefiles] ... Call Trace: [] fscache_put_object+0x18/0x21 [fscache] [] fscache_object_work_func+0x3ba/0x3c9 [fscache] [] process_one_work+0x226/0x441 [] worker_thread+0x273/0x36b [] ? rescuer_thread+0x2e1/0x2e1 [] kthread+0x10e/0x116 [] ? kthread_create_on_node+0x1bb/0x1bb [] ret_from_fork+0x7c/0xb0 [] ? kthread_create_on_node+0x1bb/0x1bb Signed-off-by: David Howells Reviewed-by: Steve Dickson Acked-by: Jeff Layton --- fs/fscache/object.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/fs/fscache/object.c b/fs/fscache/object.c index 9b79fc9a1464..40049f7505f0 100644 --- a/fs/fscache/object.c +++ b/fs/fscache/object.c @@ -327,6 +327,17 @@ void fscache_object_init(struct fscache_object *object, } EXPORT_SYMBOL(fscache_object_init); +/* + * Mark the object as no longer being live, making sure that we synchronise + * against op submission. + */ +static inline void fscache_mark_object_dead(struct fscache_object *object) +{ + spin_lock(&object->lock); + clear_bit(FSCACHE_OBJECT_IS_LIVE, &object->flags); + spin_unlock(&object->lock); +} + /* * Abort object initialisation before we start it. */ @@ -631,7 +642,7 @@ static const struct fscache_state *fscache_kill_object(struct fscache_object *ob _enter("{OBJ%x,%d,%d},%d", object->debug_id, object->n_ops, object->n_children, event); - clear_bit(FSCACHE_OBJECT_IS_LIVE, &object->flags); + fscache_mark_object_dead(object); object->oob_event_mask = 0; if (list_empty(&object->dependents) && @@ -976,13 +987,13 @@ static const struct fscache_state *_fscache_invalidate_object(struct fscache_obj return transit_to(UPDATE_OBJECT); nomem: - clear_bit(FSCACHE_OBJECT_IS_LIVE, &object->flags); + fscache_mark_object_dead(object); fscache_unuse_cookie(object); _leave(" [ENOMEM]"); return transit_to(KILL_OBJECT); submit_op_failed: - clear_bit(FSCACHE_OBJECT_IS_LIVE, &object->flags); + fscache_mark_object_dead(object); spin_unlock(&cookie->lock); fscache_unuse_cookie(object); kfree(op); -- cgit From 87021526300f1a292dd966e141e183630ac95317 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 24 Feb 2015 10:52:51 +0000 Subject: FS-Cache: fscache_object_is_dead() has wrong logic, kill it fscache_object_is_dead() returns true only if the object is marked dead and the cache got an I/O error. This should be a logical OR instead. Since two of the callers got split up into handling for separate subcases, expand the other callers and kill the function. This is probably the right thing to do anyway since one of the subcases isn't about the object at all, but rather about the cache. Signed-off-by: David Howells Reviewed-by: Steve Dickson Acked-by: Jeff Layton --- fs/fscache/cookie.c | 3 ++- fs/fscache/page.c | 6 ++++-- include/linux/fscache-cache.h | 6 ------ 3 files changed, 6 insertions(+), 9 deletions(-) diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c index 89acec742e0b..8de22164f5fb 100644 --- a/fs/fscache/cookie.c +++ b/fs/fscache/cookie.c @@ -327,7 +327,8 @@ static int fscache_alloc_object(struct fscache_cache *cache, object_already_extant: ret = -ENOBUFS; - if (fscache_object_is_dead(object)) { + if (fscache_object_is_dying(object) || + fscache_cache_is_broken(object)) { spin_unlock(&cookie->lock); goto error; } diff --git a/fs/fscache/page.c b/fs/fscache/page.c index de33b3fccca6..d0805e31361c 100644 --- a/fs/fscache/page.c +++ b/fs/fscache/page.c @@ -377,11 +377,13 @@ check_if_dead: _leave(" = -ENOBUFS [cancelled]"); return -ENOBUFS; } - if (unlikely(fscache_object_is_dead(object))) { - pr_err("%s() = -ENOBUFS [obj dead %d]\n", __func__, op->state); + if (unlikely(fscache_object_is_dying(object) || + fscache_cache_is_broken(object))) { + enum fscache_operation_state state = op->state; fscache_cancel_op(op, do_cancel); if (stat_object_dead) fscache_stat(stat_object_dead); + _leave(" = -ENOBUFS [obj dead %d]", state); return -ENOBUFS; } return 0; diff --git a/include/linux/fscache-cache.h b/include/linux/fscache-cache.h index 2e83a141e465..ca3d550da11e 100644 --- a/include/linux/fscache-cache.h +++ b/include/linux/fscache-cache.h @@ -423,12 +423,6 @@ static inline bool fscache_object_is_active(struct fscache_object *object) !fscache_cache_is_broken(object); } -static inline bool fscache_object_is_dead(struct fscache_object *object) -{ - return fscache_object_is_dying(object) && - fscache_cache_is_broken(object); -} - /** * fscache_object_destroyed - Note destruction of an object in a cache * @cache: The cache from which the object came -- cgit From 418b7eb9e1011bc11220a03ad0045885d04698d2 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 24 Feb 2015 10:05:28 +0000 Subject: FS-Cache: Permit fscache_cancel_op() to cancel in-progress operations too Currently, fscache_cancel_op() only cancels pending operations - attempts to cancel in-progress operations are ignored. This leads to a problem in fscache_wait_for_operation_activation() whereby the wait is terminated, but the object has been killed. The check at the end of the function now triggers because it's no longer contingent on the cache having produced an I/O error since the commit that fixed the logic error in fscache_object_is_dead(). The result of the check is that it tries to cancel the operation - but since the object may not be pending by this point, the cancellation request may be ignored - with the result that the the object is just put by the caller and fscache_put_operation has an assertion failure because the operation isn't in either the COMPLETE or the CANCELLED states. To fix this, we permit in-progress ops to be cancelled under some circumstances. The bug results in an oops that looks something like this: FS-Cache: fscache_wait_for_operation_activation() = -ENOBUFS [obj dead 3] FS-Cache: FS-Cache: Assertion failed FS-Cache: 3 == 5 is false ------------[ cut here ]------------ kernel BUG at ../fs/fscache/operation.c:432! ... RIP: 0010:[] fscache_put_operation+0xf2/0x2cd Call Trace: [] __fscache_read_or_alloc_pages+0x2ec/0x3b3 [] __nfs_readpages_from_fscache+0x59/0xbf [nfs] [] nfs_readpages+0x10c/0x185 [nfs] [] ? alloc_pages_current+0x119/0x13e [] ? __page_cache_alloc+0xfb/0x10a [] __do_page_cache_readahead+0x188/0x22c [] ondemand_readahead+0x29e/0x2af [] page_cache_sync_readahead+0x38/0x3a [] generic_file_read_iter+0x1a2/0x55a [] ? nfs_revalidate_mapping+0xd6/0x288 [nfs] [] nfs_file_read+0x49/0x70 [nfs] [] new_sync_read+0x78/0x9c [] __vfs_read+0x13/0x38 [] vfs_read+0x95/0x121 [] SyS_read+0x4c/0x8a [] system_call_fastpath+0x12/0x17 Signed-off-by: David Howells Reviewed-by: Steve Dickson Acked-by: Jeff Layton --- fs/fscache/internal.h | 3 ++- fs/fscache/operation.c | 20 +++++++++++++++++--- fs/fscache/page.c | 4 ++-- 3 files changed, 21 insertions(+), 6 deletions(-) diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h index 3063a58b7d3d..87c4544ec912 100644 --- a/fs/fscache/internal.h +++ b/fs/fscache/internal.h @@ -125,7 +125,8 @@ extern int fscache_submit_exclusive_op(struct fscache_object *, extern int fscache_submit_op(struct fscache_object *, struct fscache_operation *); extern int fscache_cancel_op(struct fscache_operation *, - void (*)(struct fscache_operation *)); + void (*)(struct fscache_operation *), + bool); extern void fscache_cancel_all_ops(struct fscache_object *); extern void fscache_abort_object(struct fscache_object *); extern void fscache_start_operations(struct fscache_object *); diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c index 18658fffbba1..67594a8d791a 100644 --- a/fs/fscache/operation.c +++ b/fs/fscache/operation.c @@ -312,9 +312,11 @@ void fscache_start_operations(struct fscache_object *object) * cancel an operation that's pending on an object */ int fscache_cancel_op(struct fscache_operation *op, - void (*do_cancel)(struct fscache_operation *)) + void (*do_cancel)(struct fscache_operation *), + bool cancel_in_progress_op) { struct fscache_object *object = op->object; + bool put = false; int ret; _enter("OBJ%x OP%x}", op->object->debug_id, op->debug_id); @@ -328,8 +330,19 @@ int fscache_cancel_op(struct fscache_operation *op, ret = -EBUSY; if (op->state == FSCACHE_OP_ST_PENDING) { ASSERT(!list_empty(&op->pend_link)); - fscache_stat(&fscache_n_op_cancelled); list_del_init(&op->pend_link); + put = true; + fscache_stat(&fscache_n_op_cancelled); + if (do_cancel) + do_cancel(op); + op->state = FSCACHE_OP_ST_CANCELLED; + if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags)) + object->n_exclusive--; + if (test_and_clear_bit(FSCACHE_OP_WAITING, &op->flags)) + wake_up_bit(&op->flags, FSCACHE_OP_WAITING); + ret = 0; + } else if (op->state == FSCACHE_OP_ST_IN_PROGRESS && cancel_in_progress_op) { + fscache_stat(&fscache_n_op_cancelled); if (do_cancel) do_cancel(op); op->state = FSCACHE_OP_ST_CANCELLED; @@ -337,10 +350,11 @@ int fscache_cancel_op(struct fscache_operation *op, object->n_exclusive--; if (test_and_clear_bit(FSCACHE_OP_WAITING, &op->flags)) wake_up_bit(&op->flags, FSCACHE_OP_WAITING); - fscache_put_operation(op); ret = 0; } + if (put) + fscache_put_operation(op); spin_unlock(&object->lock); _leave(" = %d", ret); return ret; diff --git a/fs/fscache/page.c b/fs/fscache/page.c index d0805e31361c..433cae927eca 100644 --- a/fs/fscache/page.c +++ b/fs/fscache/page.c @@ -359,7 +359,7 @@ int fscache_wait_for_operation_activation(struct fscache_object *object, fscache_stat(stat_op_waits); if (wait_on_bit(&op->flags, FSCACHE_OP_WAITING, TASK_INTERRUPTIBLE) != 0) { - ret = fscache_cancel_op(op, do_cancel); + ret = fscache_cancel_op(op, do_cancel, false); if (ret == 0) return -ERESTARTSYS; @@ -380,7 +380,7 @@ check_if_dead: if (unlikely(fscache_object_is_dying(object) || fscache_cache_is_broken(object))) { enum fscache_operation_state state = op->state; - fscache_cancel_op(op, do_cancel); + fscache_cancel_op(op, do_cancel, true); if (stat_object_dead) fscache_stat(stat_object_dead); _leave(" = -ENOBUFS [obj dead %d]", state); -- cgit From 1339ec98e32b4bc8efb6fbb71c006a465130aaba Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 25 Feb 2015 13:26:39 +0000 Subject: FS-Cache: Out of line fscache_operation_init() Out of line fscache_operation_init() so that it can access internal FS-Cache features, such as stats, in a later commit. Signed-off-by: David Howells Reviewed-by: Steve Dickson Acked-by: Jeff Layton --- fs/fscache/operation.c | 22 ++++++++++++++++++++++ include/linux/fscache-cache.h | 24 +++--------------------- 2 files changed, 25 insertions(+), 21 deletions(-) diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c index 67594a8d791a..61a6e78b85fa 100644 --- a/fs/fscache/operation.c +++ b/fs/fscache/operation.c @@ -20,6 +20,28 @@ atomic_t fscache_op_debug_id; EXPORT_SYMBOL(fscache_op_debug_id); +/** + * fscache_operation_init - Do basic initialisation of an operation + * @op: The operation to initialise + * @release: The release function to assign + * + * Do basic initialisation of an operation. The caller must still set flags, + * object and processor if needed. + */ +void fscache_operation_init(struct fscache_operation *op, + fscache_operation_processor_t processor, + fscache_operation_release_t release) +{ + INIT_WORK(&op->work, fscache_op_work_func); + atomic_set(&op->usage, 1); + op->state = FSCACHE_OP_ST_INITIALISED; + op->debug_id = atomic_inc_return(&fscache_op_debug_id); + op->processor = processor; + op->release = release; + INIT_LIST_HEAD(&op->pend_link); +} +EXPORT_SYMBOL(fscache_operation_init); + /** * fscache_enqueue_operation - Enqueue an operation for processing * @op: The operation to enqueue diff --git a/include/linux/fscache-cache.h b/include/linux/fscache-cache.h index ca3d550da11e..0e26d49972e3 100644 --- a/include/linux/fscache-cache.h +++ b/include/linux/fscache-cache.h @@ -119,27 +119,9 @@ extern void fscache_op_work_func(struct work_struct *work); extern void fscache_enqueue_operation(struct fscache_operation *); extern void fscache_op_complete(struct fscache_operation *, bool); extern void fscache_put_operation(struct fscache_operation *); - -/** - * fscache_operation_init - Do basic initialisation of an operation - * @op: The operation to initialise - * @release: The release function to assign - * - * Do basic initialisation of an operation. The caller must still set flags, - * object and processor if needed. - */ -static inline void fscache_operation_init(struct fscache_operation *op, - fscache_operation_processor_t processor, - fscache_operation_release_t release) -{ - INIT_WORK(&op->work, fscache_op_work_func); - atomic_set(&op->usage, 1); - op->state = FSCACHE_OP_ST_INITIALISED; - op->debug_id = atomic_inc_return(&fscache_op_debug_id); - op->processor = processor; - op->release = release; - INIT_LIST_HEAD(&op->pend_link); -} +extern void fscache_operation_init(struct fscache_operation *, + fscache_operation_processor_t, + fscache_operation_release_t); /* * data read operation -- cgit From 03cdd0e4b9a98ae995b81cd8f58e992ec3f44ae2 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 25 Feb 2015 13:21:15 +0000 Subject: FS-Cache: Count the number of initialised operations Count and display through /proc/fs/fscache/stats the number of initialised operations. Signed-off-by: David Howells Reviewed-by: Steve Dickson Acked-by: Jeff Layton --- Documentation/filesystems/caching/fscache.txt | 3 ++- fs/fscache/internal.h | 1 + fs/fscache/operation.c | 1 + fs/fscache/stats.c | 4 +++- 4 files changed, 7 insertions(+), 2 deletions(-) diff --git a/Documentation/filesystems/caching/fscache.txt b/Documentation/filesystems/caching/fscache.txt index 66fa7fbccfa4..50f0a5757f48 100644 --- a/Documentation/filesystems/caching/fscache.txt +++ b/Documentation/filesystems/caching/fscache.txt @@ -284,8 +284,9 @@ proc files. enq=N Number of times async ops queued for processing can=N Number of async ops cancelled rej=N Number of async ops rejected due to object lookup/create failure + ini=N Number of async ops initialised dfr=N Number of async ops queued for deferred release - rel=N Number of async ops released + rel=N Number of async ops released (should equal ini=N when idle) gc=N Number of deferred-release async ops garbage collected CacheOp alo=N Number of in-progress alloc_object() cache ops luo=N Number of in-progress lookup_object() cache ops diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h index 87c4544ec912..a63225116db6 100644 --- a/fs/fscache/internal.h +++ b/fs/fscache/internal.h @@ -165,6 +165,7 @@ extern atomic_t fscache_n_op_pend; extern atomic_t fscache_n_op_run; extern atomic_t fscache_n_op_enqueue; extern atomic_t fscache_n_op_deferred_release; +extern atomic_t fscache_n_op_initialised; extern atomic_t fscache_n_op_release; extern atomic_t fscache_n_op_gc; extern atomic_t fscache_n_op_cancelled; diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c index 61a6e78b85fa..9761df4fc2ab 100644 --- a/fs/fscache/operation.c +++ b/fs/fscache/operation.c @@ -39,6 +39,7 @@ void fscache_operation_init(struct fscache_operation *op, op->processor = processor; op->release = release; INIT_LIST_HEAD(&op->pend_link); + fscache_stat(&fscache_n_op_initialised); } EXPORT_SYMBOL(fscache_operation_init); diff --git a/fs/fscache/stats.c b/fs/fscache/stats.c index 3a722e8f2307..7cfa0aacdf6d 100644 --- a/fs/fscache/stats.c +++ b/fs/fscache/stats.c @@ -23,6 +23,7 @@ atomic_t fscache_n_op_run; atomic_t fscache_n_op_enqueue; atomic_t fscache_n_op_requeue; atomic_t fscache_n_op_deferred_release; +atomic_t fscache_n_op_initialised; atomic_t fscache_n_op_release; atomic_t fscache_n_op_gc; atomic_t fscache_n_op_cancelled; @@ -251,7 +252,8 @@ static int fscache_stats_show(struct seq_file *m, void *v) atomic_read(&fscache_n_op_enqueue), atomic_read(&fscache_n_op_cancelled), atomic_read(&fscache_n_op_rejected)); - seq_printf(m, "Ops : dfr=%u rel=%u gc=%u\n", + seq_printf(m, "Ops : ini=%u dfr=%u rel=%u gc=%u\n", + atomic_read(&fscache_n_op_initialised), atomic_read(&fscache_n_op_deferred_release), atomic_read(&fscache_n_op_release), atomic_read(&fscache_n_op_gc)); -- cgit From 73c04a47bf79770fbe7f3cf515f5831fccab88ee Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 25 Feb 2015 14:07:25 +0000 Subject: FS-Cache: Fix cancellation of in-progress operation Cancellation of an in-progress operation needs to update the relevant counters and start any operations that are pending waiting on this one. Signed-off-by: David Howells Reviewed-by: Steve Dickson Acked-by: Jeff Layton --- fs/fscache/operation.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c index 9761df4fc2ab..b6bf5f399d70 100644 --- a/fs/fscache/operation.c +++ b/fs/fscache/operation.c @@ -365,6 +365,13 @@ int fscache_cancel_op(struct fscache_operation *op, wake_up_bit(&op->flags, FSCACHE_OP_WAITING); ret = 0; } else if (op->state == FSCACHE_OP_ST_IN_PROGRESS && cancel_in_progress_op) { + ASSERTCMP(object->n_in_progress, >, 0); + if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags)) + object->n_exclusive--; + object->n_in_progress--; + if (object->n_in_progress == 0) + fscache_start_operations(object); + fscache_stat(&fscache_n_op_cancelled); if (do_cancel) do_cancel(op); -- cgit From a39caadf06879017cb9a8c5c5cb4fc4ccb213275 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 25 Feb 2015 14:22:40 +0000 Subject: FS-Cache: Put an aborted initialised op so that it is accounted correctly Call fscache_put_operation() or a wrapper on any op that has gone through fscache_operation_init() so that the accounting shown in /proc is done correctly, specifically fscache_n_op_release. fscache_put_operation() therefore now allows an op in the INITIALISED state as well as in the CANCELLED and COMPLETE states. Note that this means that an operation can get put that doesn't have its ->object pointer filled in, so anything that depends on the object needs to be conditional in fscache_put_operation(). Signed-off-by: David Howells Reviewed-by: Steve Dickson Acked-by: Jeff Layton --- fs/fscache/operation.c | 54 ++++++++++++++++++++++++++------------------------ fs/fscache/page.c | 14 ++++++------- 2 files changed, 35 insertions(+), 33 deletions(-) diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c index b6bf5f399d70..c76c09730768 100644 --- a/fs/fscache/operation.c +++ b/fs/fscache/operation.c @@ -472,7 +472,8 @@ void fscache_put_operation(struct fscache_operation *op) return; _debug("PUT OP"); - ASSERTIFCMP(op->state != FSCACHE_OP_ST_COMPLETE, + ASSERTIFCMP(op->state != FSCACHE_OP_ST_INITIALISED && + op->state != FSCACHE_OP_ST_COMPLETE, op->state, ==, FSCACHE_OP_ST_CANCELLED); op->state = FSCACHE_OP_ST_DEAD; @@ -484,35 +485,36 @@ void fscache_put_operation(struct fscache_operation *op) } object = op->object; + if (likely(object)) { + if (test_bit(FSCACHE_OP_DEC_READ_CNT, &op->flags)) + atomic_dec(&object->n_reads); + if (test_bit(FSCACHE_OP_UNUSE_COOKIE, &op->flags)) + fscache_unuse_cookie(object); + + /* now... we may get called with the object spinlock held, so we + * complete the cleanup here only if we can immediately acquire the + * lock, and defer it otherwise */ + if (!spin_trylock(&object->lock)) { + _debug("defer put"); + fscache_stat(&fscache_n_op_deferred_release); + + cache = object->cache; + spin_lock(&cache->op_gc_list_lock); + list_add_tail(&op->pend_link, &cache->op_gc_list); + spin_unlock(&cache->op_gc_list_lock); + schedule_work(&cache->op_gc); + _leave(" [defer]"); + return; + } - if (test_bit(FSCACHE_OP_DEC_READ_CNT, &op->flags)) - atomic_dec(&object->n_reads); - if (test_bit(FSCACHE_OP_UNUSE_COOKIE, &op->flags)) - fscache_unuse_cookie(object); - - /* now... we may get called with the object spinlock held, so we - * complete the cleanup here only if we can immediately acquire the - * lock, and defer it otherwise */ - if (!spin_trylock(&object->lock)) { - _debug("defer put"); - fscache_stat(&fscache_n_op_deferred_release); + ASSERTCMP(object->n_ops, >, 0); + object->n_ops--; + if (object->n_ops == 0) + fscache_raise_event(object, FSCACHE_OBJECT_EV_CLEARED); - cache = object->cache; - spin_lock(&cache->op_gc_list_lock); - list_add_tail(&op->pend_link, &cache->op_gc_list); - spin_unlock(&cache->op_gc_list_lock); - schedule_work(&cache->op_gc); - _leave(" [defer]"); - return; + spin_unlock(&object->lock); } - ASSERTCMP(object->n_ops, >, 0); - object->n_ops--; - if (object->n_ops == 0) - fscache_raise_event(object, FSCACHE_OBJECT_EV_CLEARED); - - spin_unlock(&object->lock); - kfree(op); _leave(" [done]"); } diff --git a/fs/fscache/page.c b/fs/fscache/page.c index 433cae927eca..7db9752252ef 100644 --- a/fs/fscache/page.c +++ b/fs/fscache/page.c @@ -239,7 +239,7 @@ nobufs_dec: wake_cookie = __fscache_unuse_cookie(cookie); nobufs: spin_unlock(&cookie->lock); - kfree(op); + fscache_put_operation(op); if (wake_cookie) __fscache_wake_unused_cookie(cookie); fscache_stat(&fscache_n_attr_changed_nobufs); @@ -505,7 +505,7 @@ nobufs_unlock: spin_unlock(&cookie->lock); if (wake_cookie) __fscache_wake_unused_cookie(cookie); - kfree(op); + fscache_put_retrieval(op); nobufs: fscache_stat(&fscache_n_retrievals_nobufs); _leave(" = -ENOBUFS"); @@ -634,7 +634,7 @@ nobufs_unlock_dec: wake_cookie = __fscache_unuse_cookie(cookie); nobufs_unlock: spin_unlock(&cookie->lock); - kfree(op); + fscache_put_retrieval(op); if (wake_cookie) __fscache_wake_unused_cookie(cookie); nobufs: @@ -728,7 +728,7 @@ nobufs_unlock_dec: wake_cookie = __fscache_unuse_cookie(cookie); nobufs_unlock: spin_unlock(&cookie->lock); - kfree(op); + fscache_put_retrieval(op); if (wake_cookie) __fscache_wake_unused_cookie(cookie); nobufs: @@ -1018,7 +1018,7 @@ already_pending: spin_unlock(&object->lock); spin_unlock(&cookie->lock); radix_tree_preload_end(); - kfree(op); + fscache_put_operation(&op->op); fscache_stat(&fscache_n_stores_ok); _leave(" = 0"); return 0; @@ -1038,7 +1038,7 @@ nobufs_unlock_obj: nobufs: spin_unlock(&cookie->lock); radix_tree_preload_end(); - kfree(op); + fscache_put_operation(&op->op); if (wake_cookie) __fscache_wake_unused_cookie(cookie); fscache_stat(&fscache_n_stores_nobufs); @@ -1046,7 +1046,7 @@ nobufs: return -ENOBUFS; nomem_free: - kfree(op); + fscache_put_operation(&op->op); nomem: fscache_stat(&fscache_n_stores_oom); _leave(" = -ENOMEM"); -- cgit From d3b97ca4a99e4e6c78f5a21c968eadf5c8ba9971 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 24 Feb 2015 10:05:29 +0000 Subject: FS-Cache: The operation cancellation method needs calling in more places Any time an incomplete operation is cancelled, the operation cancellation function needs to be called to clean up. This is currently being passed directly to some of the functions that might want to call it, but not all. Instead, pass the cancellation method pointer to the fscache_operation_init() and have that cache it in the operation struct. Further, plug in a dummy cancellation handler if the caller declines to set one as this allows us to call the function unconditionally (the extra overhead isn't worth bothering about as we don't expect to be calling this typically). The cancellation method must thence be called everywhere the CANCELLED state is set. Note that we call it *before* setting the CANCELLED state such that the method can use the old state value to guide its operation. fscache_do_cancel_retrieval() needs moving higher up in the sources so that the init function can use it now. Without this, the following oops may be seen: FS-Cache: Assertion failed FS-Cache: 3 == 0 is false ------------[ cut here ]------------ kernel BUG at ../fs/fscache/page.c:261! ... RIP: 0010:[] fscache_release_retrieval_op+0x77/0x100 [] fscache_put_operation+0x114/0x2da [] __fscache_read_or_alloc_pages+0x358/0x3b3 [] __nfs_readpages_from_fscache+0x59/0xbf [nfs] [] nfs_readpages+0x10c/0x185 [nfs] [] ? alloc_pages_current+0x119/0x13e [] ? __page_cache_alloc+0xfb/0x10a [] __do_page_cache_readahead+0x188/0x22c [] ondemand_readahead+0x29e/0x2af [] page_cache_sync_readahead+0x38/0x3a [] generic_file_read_iter+0x1a2/0x55a [] ? nfs_revalidate_mapping+0xd6/0x288 [nfs] [] nfs_file_read+0x49/0x70 [nfs] [] new_sync_read+0x78/0x9c [] __vfs_read+0x13/0x38 [] vfs_read+0x95/0x121 [] SyS_read+0x4c/0x8a [] system_call_fastpath+0x12/0x17 The assertion is showing that the remaining number of pages (n_pages) is not 0 when the operation is being released. Signed-off-by: David Howells Reviewed-by: Steve Dickson Acked-by: Jeff Layton --- fs/fscache/cookie.c | 5 ++--- fs/fscache/internal.h | 7 ++----- fs/fscache/object.c | 3 ++- fs/fscache/operation.c | 31 ++++++++++++++++++++++------- fs/fscache/page.c | 46 +++++++++++++++++++++---------------------- include/linux/fscache-cache.h | 5 +++++ 6 files changed, 57 insertions(+), 40 deletions(-) diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c index 8de22164f5fb..d403c69bee08 100644 --- a/fs/fscache/cookie.c +++ b/fs/fscache/cookie.c @@ -672,7 +672,7 @@ int __fscache_check_consistency(struct fscache_cookie *cookie) if (!op) return -ENOMEM; - fscache_operation_init(op, NULL, NULL); + fscache_operation_init(op, NULL, NULL, NULL); op->flags = FSCACHE_OP_MYTHREAD | (1 << FSCACHE_OP_WAITING) | (1 << FSCACHE_OP_UNUSE_COOKIE); @@ -696,8 +696,7 @@ int __fscache_check_consistency(struct fscache_cookie *cookie) /* the work queue now carries its own ref on the object */ spin_unlock(&cookie->lock); - ret = fscache_wait_for_operation_activation(object, op, - NULL, NULL, NULL); + ret = fscache_wait_for_operation_activation(object, op, NULL, NULL); if (ret == 0) { /* ask the cache to honour the operation */ ret = object->cache->ops->check_consistency(op); diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h index a63225116db6..97ec45110957 100644 --- a/fs/fscache/internal.h +++ b/fs/fscache/internal.h @@ -124,9 +124,7 @@ extern int fscache_submit_exclusive_op(struct fscache_object *, struct fscache_operation *); extern int fscache_submit_op(struct fscache_object *, struct fscache_operation *); -extern int fscache_cancel_op(struct fscache_operation *, - void (*)(struct fscache_operation *), - bool); +extern int fscache_cancel_op(struct fscache_operation *, bool); extern void fscache_cancel_all_ops(struct fscache_object *); extern void fscache_abort_object(struct fscache_object *); extern void fscache_start_operations(struct fscache_object *); @@ -139,8 +137,7 @@ extern int fscache_wait_for_deferred_lookup(struct fscache_cookie *); extern int fscache_wait_for_operation_activation(struct fscache_object *, struct fscache_operation *, atomic_t *, - atomic_t *, - void (*)(struct fscache_operation *)); + atomic_t *); extern void fscache_invalidate_writes(struct fscache_cookie *); /* diff --git a/fs/fscache/object.c b/fs/fscache/object.c index 40049f7505f0..9e792e30f4db 100644 --- a/fs/fscache/object.c +++ b/fs/fscache/object.c @@ -961,7 +961,8 @@ static const struct fscache_state *_fscache_invalidate_object(struct fscache_obj if (!op) goto nomem; - fscache_operation_init(op, object->cache->ops->invalidate_object, NULL); + fscache_operation_init(op, object->cache->ops->invalidate_object, + NULL, NULL); op->flags = FSCACHE_OP_ASYNC | (1 << FSCACHE_OP_EXCLUSIVE) | (1 << FSCACHE_OP_UNUSE_COOKIE); diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c index c76c09730768..57d4abb68656 100644 --- a/fs/fscache/operation.c +++ b/fs/fscache/operation.c @@ -20,6 +20,10 @@ atomic_t fscache_op_debug_id; EXPORT_SYMBOL(fscache_op_debug_id); +static void fscache_operation_dummy_cancel(struct fscache_operation *op) +{ +} + /** * fscache_operation_init - Do basic initialisation of an operation * @op: The operation to initialise @@ -30,6 +34,7 @@ EXPORT_SYMBOL(fscache_op_debug_id); */ void fscache_operation_init(struct fscache_operation *op, fscache_operation_processor_t processor, + fscache_operation_cancel_t cancel, fscache_operation_release_t release) { INIT_WORK(&op->work, fscache_op_work_func); @@ -37,6 +42,7 @@ void fscache_operation_init(struct fscache_operation *op, op->state = FSCACHE_OP_ST_INITIALISED; op->debug_id = atomic_inc_return(&fscache_op_debug_id); op->processor = processor; + op->cancel = cancel ?: fscache_operation_dummy_cancel; op->release = release; INIT_LIST_HEAD(&op->pend_link); fscache_stat(&fscache_n_op_initialised); @@ -164,9 +170,11 @@ int fscache_submit_exclusive_op(struct fscache_object *object, flags = READ_ONCE(object->flags); if (unlikely(!(flags & BIT(FSCACHE_OBJECT_IS_LIVE)))) { fscache_stat(&fscache_n_op_rejected); + op->cancel(op); op->state = FSCACHE_OP_ST_CANCELLED; ret = -ENOBUFS; } else if (unlikely(fscache_cache_is_broken(object))) { + op->cancel(op); op->state = FSCACHE_OP_ST_CANCELLED; ret = -EIO; } else if (flags & BIT(FSCACHE_OBJECT_IS_AVAILABLE)) { @@ -200,10 +208,12 @@ int fscache_submit_exclusive_op(struct fscache_object *object, fscache_stat(&fscache_n_op_pend); ret = 0; } else if (flags & BIT(FSCACHE_OBJECT_KILLED_BY_CACHE)) { + op->cancel(op); op->state = FSCACHE_OP_ST_CANCELLED; ret = -ENOBUFS; } else { fscache_report_unexpected_submission(object, op, ostate); + op->cancel(op); op->state = FSCACHE_OP_ST_CANCELLED; ret = -ENOBUFS; } @@ -245,9 +255,11 @@ int fscache_submit_op(struct fscache_object *object, flags = READ_ONCE(object->flags); if (unlikely(!(flags & BIT(FSCACHE_OBJECT_IS_LIVE)))) { fscache_stat(&fscache_n_op_rejected); + op->cancel(op); op->state = FSCACHE_OP_ST_CANCELLED; ret = -ENOBUFS; } else if (unlikely(fscache_cache_is_broken(object))) { + op->cancel(op); op->state = FSCACHE_OP_ST_CANCELLED; ret = -EIO; } else if (flags & BIT(FSCACHE_OBJECT_IS_AVAILABLE)) { @@ -276,11 +288,13 @@ int fscache_submit_op(struct fscache_object *object, fscache_stat(&fscache_n_op_pend); ret = 0; } else if (flags & BIT(FSCACHE_OBJECT_KILLED_BY_CACHE)) { + op->cancel(op); op->state = FSCACHE_OP_ST_CANCELLED; ret = -ENOBUFS; } else { fscache_report_unexpected_submission(object, op, ostate); ASSERT(!fscache_object_is_active(object)); + op->cancel(op); op->state = FSCACHE_OP_ST_CANCELLED; ret = -ENOBUFS; } @@ -335,7 +349,6 @@ void fscache_start_operations(struct fscache_object *object) * cancel an operation that's pending on an object */ int fscache_cancel_op(struct fscache_operation *op, - void (*do_cancel)(struct fscache_operation *), bool cancel_in_progress_op) { struct fscache_object *object = op->object; @@ -355,9 +368,9 @@ int fscache_cancel_op(struct fscache_operation *op, ASSERT(!list_empty(&op->pend_link)); list_del_init(&op->pend_link); put = true; + fscache_stat(&fscache_n_op_cancelled); - if (do_cancel) - do_cancel(op); + op->cancel(op); op->state = FSCACHE_OP_ST_CANCELLED; if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags)) object->n_exclusive--; @@ -373,8 +386,7 @@ int fscache_cancel_op(struct fscache_operation *op, fscache_start_operations(object); fscache_stat(&fscache_n_op_cancelled); - if (do_cancel) - do_cancel(op); + op->cancel(op); op->state = FSCACHE_OP_ST_CANCELLED; if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags)) object->n_exclusive--; @@ -408,6 +420,7 @@ void fscache_cancel_all_ops(struct fscache_object *object) list_del_init(&op->pend_link); ASSERTCMP(op->state, ==, FSCACHE_OP_ST_PENDING); + op->cancel(op); op->state = FSCACHE_OP_ST_CANCELLED; if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags)) @@ -440,8 +453,12 @@ void fscache_op_complete(struct fscache_operation *op, bool cancelled) spin_lock(&object->lock); - op->state = cancelled ? - FSCACHE_OP_ST_CANCELLED : FSCACHE_OP_ST_COMPLETE; + if (!cancelled) { + op->state = FSCACHE_OP_ST_COMPLETE; + } else { + op->cancel(op); + op->state = FSCACHE_OP_ST_CANCELLED; + } if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags)) object->n_exclusive--; diff --git a/fs/fscache/page.c b/fs/fscache/page.c index 7db9752252ef..d1b23a67c031 100644 --- a/fs/fscache/page.c +++ b/fs/fscache/page.c @@ -213,7 +213,7 @@ int __fscache_attr_changed(struct fscache_cookie *cookie) return -ENOMEM; } - fscache_operation_init(op, fscache_attr_changed_op, NULL); + fscache_operation_init(op, fscache_attr_changed_op, NULL, NULL); op->flags = FSCACHE_OP_ASYNC | (1 << FSCACHE_OP_EXCLUSIVE) | (1 << FSCACHE_OP_UNUSE_COOKIE); @@ -248,6 +248,17 @@ nobufs: } EXPORT_SYMBOL(__fscache_attr_changed); +/* + * Handle cancellation of a pending retrieval op + */ +static void fscache_do_cancel_retrieval(struct fscache_operation *_op) +{ + struct fscache_retrieval *op = + container_of(_op, struct fscache_retrieval, op); + + atomic_set(&op->n_pages, 0); +} + /* * release a retrieval op reference */ @@ -285,7 +296,9 @@ static struct fscache_retrieval *fscache_alloc_retrieval( return NULL; } - fscache_operation_init(&op->op, NULL, fscache_release_retrieval_op); + fscache_operation_init(&op->op, NULL, + fscache_do_cancel_retrieval, + fscache_release_retrieval_op); op->op.flags = FSCACHE_OP_MYTHREAD | (1UL << FSCACHE_OP_WAITING) | (1UL << FSCACHE_OP_UNUSE_COOKIE); @@ -329,25 +342,13 @@ int fscache_wait_for_deferred_lookup(struct fscache_cookie *cookie) return 0; } -/* - * Handle cancellation of a pending retrieval op - */ -static void fscache_do_cancel_retrieval(struct fscache_operation *_op) -{ - struct fscache_retrieval *op = - container_of(_op, struct fscache_retrieval, op); - - atomic_set(&op->n_pages, 0); -} - /* * wait for an object to become active (or dead) */ int fscache_wait_for_operation_activation(struct fscache_object *object, struct fscache_operation *op, atomic_t *stat_op_waits, - atomic_t *stat_object_dead, - void (*do_cancel)(struct fscache_operation *)) + atomic_t *stat_object_dead) { int ret; @@ -359,7 +360,7 @@ int fscache_wait_for_operation_activation(struct fscache_object *object, fscache_stat(stat_op_waits); if (wait_on_bit(&op->flags, FSCACHE_OP_WAITING, TASK_INTERRUPTIBLE) != 0) { - ret = fscache_cancel_op(op, do_cancel, false); + ret = fscache_cancel_op(op, false); if (ret == 0) return -ERESTARTSYS; @@ -380,7 +381,7 @@ check_if_dead: if (unlikely(fscache_object_is_dying(object) || fscache_cache_is_broken(object))) { enum fscache_operation_state state = op->state; - fscache_cancel_op(op, do_cancel, true); + fscache_cancel_op(op, true); if (stat_object_dead) fscache_stat(stat_object_dead); _leave(" = -ENOBUFS [obj dead %d]", state); @@ -464,8 +465,7 @@ int __fscache_read_or_alloc_page(struct fscache_cookie *cookie, ret = fscache_wait_for_operation_activation( object, &op->op, __fscache_stat(&fscache_n_retrieval_op_waits), - __fscache_stat(&fscache_n_retrievals_object_dead), - fscache_do_cancel_retrieval); + __fscache_stat(&fscache_n_retrievals_object_dead)); if (ret < 0) goto error; @@ -595,8 +595,7 @@ int __fscache_read_or_alloc_pages(struct fscache_cookie *cookie, ret = fscache_wait_for_operation_activation( object, &op->op, __fscache_stat(&fscache_n_retrieval_op_waits), - __fscache_stat(&fscache_n_retrievals_object_dead), - fscache_do_cancel_retrieval); + __fscache_stat(&fscache_n_retrievals_object_dead)); if (ret < 0) goto error; @@ -702,8 +701,7 @@ int __fscache_alloc_page(struct fscache_cookie *cookie, ret = fscache_wait_for_operation_activation( object, &op->op, __fscache_stat(&fscache_n_alloc_op_waits), - __fscache_stat(&fscache_n_allocs_object_dead), - fscache_do_cancel_retrieval); + __fscache_stat(&fscache_n_allocs_object_dead)); if (ret < 0) goto error; @@ -946,7 +944,7 @@ int __fscache_write_page(struct fscache_cookie *cookie, if (!op) goto nomem; - fscache_operation_init(&op->op, fscache_write_op, + fscache_operation_init(&op->op, fscache_write_op, NULL, fscache_release_write_op); op->op.flags = FSCACHE_OP_ASYNC | (1 << FSCACHE_OP_WAITING) | diff --git a/include/linux/fscache-cache.h b/include/linux/fscache-cache.h index 0e26d49972e3..69c6eb10d858 100644 --- a/include/linux/fscache-cache.h +++ b/include/linux/fscache-cache.h @@ -74,6 +74,7 @@ extern wait_queue_head_t fscache_cache_cleared_wq; */ typedef void (*fscache_operation_release_t)(struct fscache_operation *op); typedef void (*fscache_operation_processor_t)(struct fscache_operation *op); +typedef void (*fscache_operation_cancel_t)(struct fscache_operation *op); enum fscache_operation_state { FSCACHE_OP_ST_BLANK, /* Op is not yet submitted */ @@ -109,6 +110,9 @@ struct fscache_operation { * the op in a non-pool thread */ fscache_operation_processor_t processor; + /* Operation cancellation cleanup (optional) */ + fscache_operation_cancel_t cancel; + /* operation releaser */ fscache_operation_release_t release; }; @@ -121,6 +125,7 @@ extern void fscache_op_complete(struct fscache_operation *, bool); extern void fscache_put_operation(struct fscache_operation *); extern void fscache_operation_init(struct fscache_operation *, fscache_operation_processor_t, + fscache_operation_cancel_t, fscache_operation_release_t); /* -- cgit From 4a47132ff472a0c2c5441baeb50cf97f2580bc43 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 24 Feb 2015 10:05:29 +0000 Subject: FS-Cache: Retain the netfs context in the retrieval op earlier Now that the retrieval operation may be disposed of by fscache_put_operation() before we actually set the context, the retrieval-specific cleanup operation can produce a NULL-pointer dereference when it tries to unconditionally clean up the netfs context. Given that it is expected that we'll get at least as far as the place where we currently set the context pointer and it is unlikely we'll go through the error handling paths prior to that point, retain the context right from the point that the retrieval op is allocated. Concomitant to this, we need to retain the cookie pointer in the retrieval op also so that we can call the netfs to release its context in the release method. In addition, we might now get into fscache_release_retrieval_op() with the op only initialised. To this end, set the operation to DEAD only after the release method has been called and skip the n_pages test upon cleanup if the op is still in the INITIALISED state. Without these changes, the following oops might be seen: BUG: unable to handle kernel NULL pointer dereference at 00000000000000b8 ... RIP: 0010:[] fscache_release_retrieval_op+0xae/0x100 ... Call Trace: [] fscache_put_operation+0x117/0x2e0 [] __fscache_read_or_alloc_pages+0x351/0x3ac [] __nfs_readpages_from_fscache+0x59/0xbf [nfs] [] nfs_readpages+0x10c/0x185 [nfs] [] ? alloc_pages_current+0x119/0x13e [] ? __page_cache_alloc+0xfb/0x10a [] __do_page_cache_readahead+0x188/0x22c [] ondemand_readahead+0x29e/0x2af [] page_cache_sync_readahead+0x38/0x3a [] generic_file_read_iter+0x1a2/0x55a [] ? nfs_revalidate_mapping+0xd6/0x288 [nfs] [] nfs_file_read+0x49/0x70 [nfs] [] new_sync_read+0x78/0x9c [] __vfs_read+0x13/0x38 [] vfs_read+0x95/0x121 [] SyS_read+0x4c/0x8a [] system_call_fastpath+0x12/0x17 Signed-off-by: David Howells Reviewed-by: Steve Dickson Acked-by: Jeff Layton --- fs/fscache/operation.c | 2 +- fs/fscache/page.c | 20 ++++++++++---------- include/linux/fscache-cache.h | 1 + 3 files changed, 12 insertions(+), 11 deletions(-) diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c index 57d4abb68656..de67745e1cd7 100644 --- a/fs/fscache/operation.c +++ b/fs/fscache/operation.c @@ -492,7 +492,6 @@ void fscache_put_operation(struct fscache_operation *op) ASSERTIFCMP(op->state != FSCACHE_OP_ST_INITIALISED && op->state != FSCACHE_OP_ST_COMPLETE, op->state, ==, FSCACHE_OP_ST_CANCELLED); - op->state = FSCACHE_OP_ST_DEAD; fscache_stat(&fscache_n_op_release); @@ -500,6 +499,7 @@ void fscache_put_operation(struct fscache_operation *op) op->release(op); op->release = NULL; } + op->state = FSCACHE_OP_ST_DEAD; object = op->object; if (likely(object)) { diff --git a/fs/fscache/page.c b/fs/fscache/page.c index d1b23a67c031..483bbc613bf0 100644 --- a/fs/fscache/page.c +++ b/fs/fscache/page.c @@ -269,11 +269,12 @@ static void fscache_release_retrieval_op(struct fscache_operation *_op) _enter("{OP%x}", op->op.debug_id); - ASSERTCMP(atomic_read(&op->n_pages), ==, 0); + ASSERTIFCMP(op->op.state != FSCACHE_OP_ST_INITIALISED, + atomic_read(&op->n_pages), ==, 0); fscache_hist(fscache_retrieval_histogram, op->start_time); if (op->context) - fscache_put_context(op->op.object->cookie, op->context); + fscache_put_context(op->cookie, op->context); _leave(""); } @@ -302,11 +303,18 @@ static struct fscache_retrieval *fscache_alloc_retrieval( op->op.flags = FSCACHE_OP_MYTHREAD | (1UL << FSCACHE_OP_WAITING) | (1UL << FSCACHE_OP_UNUSE_COOKIE); + op->cookie = cookie; op->mapping = mapping; op->end_io_func = end_io_func; op->context = context; op->start_time = jiffies; INIT_LIST_HEAD(&op->to_do); + + /* Pin the netfs read context in case we need to do the actual netfs + * read because we've encountered a cache read failure. + */ + if (context) + fscache_get_context(op->cookie, context); return op; } @@ -456,10 +464,6 @@ int __fscache_read_or_alloc_page(struct fscache_cookie *cookie, fscache_stat(&fscache_n_retrieval_ops); - /* pin the netfs read context in case we need to do the actual netfs - * read because we've encountered a cache read failure */ - fscache_get_context(object->cookie, op->context); - /* we wait for the operation to become active, and then process it * *here*, in this thread, and not in the thread pool */ ret = fscache_wait_for_operation_activation( @@ -586,10 +590,6 @@ int __fscache_read_or_alloc_pages(struct fscache_cookie *cookie, fscache_stat(&fscache_n_retrieval_ops); - /* pin the netfs read context in case we need to do the actual netfs - * read because we've encountered a cache read failure */ - fscache_get_context(object->cookie, op->context); - /* we wait for the operation to become active, and then process it * *here*, in this thread, and not in the thread pool */ ret = fscache_wait_for_operation_activation( diff --git a/include/linux/fscache-cache.h b/include/linux/fscache-cache.h index 69c6eb10d858..604e1526cd00 100644 --- a/include/linux/fscache-cache.h +++ b/include/linux/fscache-cache.h @@ -133,6 +133,7 @@ extern void fscache_operation_init(struct fscache_operation *, */ struct fscache_retrieval { struct fscache_operation op; + struct fscache_cookie *cookie; /* The netfs cookie */ struct address_space *mapping; /* netfs pages */ fscache_rw_complete_t end_io_func; /* function to call on I/O completion */ void *context; /* netfs read context (pinned) */ -- cgit From 3f4a9494104cbadfa05cb2da9ca04b205712360f Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 6 Jun 2015 09:15:55 -0400 Subject: ncpfs: successful rename() should invalidate caches for parents Signed-off-by: Al Viro --- fs/ncpfs/dir.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c index 80021c709af9..93575e91a7aa 100644 --- a/fs/ncpfs/dir.c +++ b/fs/ncpfs/dir.c @@ -1145,6 +1145,8 @@ static int ncp_rename(struct inode *old_dir, struct dentry *old_dentry, case 0x00: ncp_dbg(1, "renamed %pd -> %pd\n", old_dentry, new_dentry); + ncp_d_prune(old_dentry); + ncp_d_prune(new_dentry); break; case 0x9E: error = -ENAMETOOLONG; -- cgit From 13b987ea275840d74d9df9a44326632fab1894da Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 10 Jun 2015 10:09:32 +1000 Subject: fs/ufs: revert "ufs: fix deadlocks introduced by sb mutex merge" This reverts commit 9ef7db7f38d0 ("ufs: fix deadlocks introduced by sb mutex merge") That patch tried to solve commit 0244756edc4b98c ("ufs: sb mutex merge + mutex_destroy") which is itself partially reverted due to multiple deadlocks. Signed-off-by: Fabian Frederick Suggested-by: Jan Kara Cc: Ian Campbell Cc: Evgeniy Dushistov Cc: Alexey Khoroshilov Cc: Roger Pau Monne Cc: Ian Jackson Cc: Al Viro Cc: Signed-off-by: Andrew Morton --- fs/ufs/inode.c | 5 ++++- fs/ufs/namei.c | 14 ++++++++------ 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index be7d42c7d938..2d93ab07da8a 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -902,6 +902,9 @@ void ufs_evict_inode(struct inode * inode) invalidate_inode_buffers(inode); clear_inode(inode); - if (want_delete) + if (want_delete) { + lock_ufs(inode->i_sb); ufs_free_inode(inode); + unlock_ufs(inode->i_sb); + } } diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c index e491a93a7e9a..1f5223c9e1e2 100644 --- a/fs/ufs/namei.c +++ b/fs/ufs/namei.c @@ -128,12 +128,12 @@ static int ufs_symlink (struct inode * dir, struct dentry * dentry, if (l > sb->s_blocksize) goto out_notlocked; + lock_ufs(dir->i_sb); inode = ufs_new_inode(dir, S_IFLNK | S_IRWXUGO); err = PTR_ERR(inode); if (IS_ERR(inode)) - goto out_notlocked; + goto out; - lock_ufs(dir->i_sb); if (l > UFS_SB(sb)->s_uspi->s_maxsymlinklen) { /* slow symlink */ inode->i_op = &ufs_symlink_inode_operations; @@ -184,9 +184,13 @@ static int ufs_mkdir(struct inode * dir, struct dentry * dentry, umode_t mode) struct inode * inode; int err; + lock_ufs(dir->i_sb); + inode_inc_link_count(dir); + inode = ufs_new_inode(dir, S_IFDIR|mode); + err = PTR_ERR(inode); if (IS_ERR(inode)) - return PTR_ERR(inode); + goto out_dir; inode->i_op = &ufs_dir_inode_operations; inode->i_fop = &ufs_dir_operations; @@ -194,9 +198,6 @@ static int ufs_mkdir(struct inode * dir, struct dentry * dentry, umode_t mode) inode_inc_link_count(inode); - lock_ufs(dir->i_sb); - inode_inc_link_count(dir); - err = ufs_make_empty(inode, dir); if (err) goto out_fail; @@ -215,6 +216,7 @@ out_fail: inode_dec_link_count(inode); unlock_new_inode(inode); iput (inode); +out_dir: inode_dec_link_count(dir); unlock_ufs(dir->i_sb); goto out; -- cgit From cdd9eefdf905e92e7fc6cc393314efe68dc6ff66 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 10 Jun 2015 10:09:32 +1000 Subject: fs/ufs: restore s_lock mutex Commit 0244756edc4b98c ("ufs: sb mutex merge + mutex_destroy") generated deadlocks in read/write mode on mkdir. This patch partially reverts it keeping fixes by Andrew Morton and mutex_destroy() [AV: fixed a missing bit in ufs_remount()] Signed-off-by: Fabian Frederick Reported-by: Ian Campbell Suggested-by: Jan Kara Cc: Ian Campbell Cc: Evgeniy Dushistov Cc: Alexey Khoroshilov Cc: Roger Pau Monne Cc: Ian Jackson Cc: Al Viro Cc: Signed-off-by: Andrew Morton Signed-off-by: Al Viro --- fs/ufs/balloc.c | 34 +++++++++++++++++----------------- fs/ufs/ialloc.c | 16 ++++++++-------- fs/ufs/super.c | 10 ++++++++++ fs/ufs/ufs.h | 1 + 4 files changed, 36 insertions(+), 25 deletions(-) diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c index 2c1036080d52..a7106eda5024 100644 --- a/fs/ufs/balloc.c +++ b/fs/ufs/balloc.c @@ -51,8 +51,8 @@ void ufs_free_fragments(struct inode *inode, u64 fragment, unsigned count) if (ufs_fragnum(fragment) + count > uspi->s_fpg) ufs_error (sb, "ufs_free_fragments", "internal error"); - - lock_ufs(sb); + + mutex_lock(&UFS_SB(sb)->s_lock); cgno = ufs_dtog(uspi, fragment); bit = ufs_dtogd(uspi, fragment); @@ -115,13 +115,13 @@ void ufs_free_fragments(struct inode *inode, u64 fragment, unsigned count) if (sb->s_flags & MS_SYNCHRONOUS) ubh_sync_block(UCPI_UBH(ucpi)); ufs_mark_sb_dirty(sb); - - unlock_ufs(sb); + + mutex_unlock(&UFS_SB(sb)->s_lock); UFSD("EXIT\n"); return; failed: - unlock_ufs(sb); + mutex_unlock(&UFS_SB(sb)->s_lock); UFSD("EXIT (FAILED)\n"); return; } @@ -151,7 +151,7 @@ void ufs_free_blocks(struct inode *inode, u64 fragment, unsigned count) goto failed; } - lock_ufs(sb); + mutex_lock(&UFS_SB(sb)->s_lock); do_more: overflow = 0; @@ -211,12 +211,12 @@ do_more: } ufs_mark_sb_dirty(sb); - unlock_ufs(sb); + mutex_unlock(&UFS_SB(sb)->s_lock); UFSD("EXIT\n"); return; failed_unlock: - unlock_ufs(sb); + mutex_unlock(&UFS_SB(sb)->s_lock); failed: UFSD("EXIT (FAILED)\n"); return; @@ -357,7 +357,7 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment, usb1 = ubh_get_usb_first(uspi); *err = -ENOSPC; - lock_ufs(sb); + mutex_lock(&UFS_SB(sb)->s_lock); tmp = ufs_data_ptr_to_cpu(sb, p); if (count + ufs_fragnum(fragment) > uspi->s_fpb) { @@ -378,19 +378,19 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment, "fragment %llu, tmp %llu\n", (unsigned long long)fragment, (unsigned long long)tmp); - unlock_ufs(sb); + mutex_unlock(&UFS_SB(sb)->s_lock); return INVBLOCK; } if (fragment < UFS_I(inode)->i_lastfrag) { UFSD("EXIT (ALREADY ALLOCATED)\n"); - unlock_ufs(sb); + mutex_unlock(&UFS_SB(sb)->s_lock); return 0; } } else { if (tmp) { UFSD("EXIT (ALREADY ALLOCATED)\n"); - unlock_ufs(sb); + mutex_unlock(&UFS_SB(sb)->s_lock); return 0; } } @@ -399,7 +399,7 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment, * There is not enough space for user on the device */ if (!capable(CAP_SYS_RESOURCE) && ufs_freespace(uspi, UFS_MINFREE) <= 0) { - unlock_ufs(sb); + mutex_unlock(&UFS_SB(sb)->s_lock); UFSD("EXIT (FAILED)\n"); return 0; } @@ -424,7 +424,7 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment, ufs_clear_frags(inode, result + oldcount, newcount - oldcount, locked_page != NULL); } - unlock_ufs(sb); + mutex_unlock(&UFS_SB(sb)->s_lock); UFSD("EXIT, result %llu\n", (unsigned long long)result); return result; } @@ -439,7 +439,7 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment, fragment + count); ufs_clear_frags(inode, result + oldcount, newcount - oldcount, locked_page != NULL); - unlock_ufs(sb); + mutex_unlock(&UFS_SB(sb)->s_lock); UFSD("EXIT, result %llu\n", (unsigned long long)result); return result; } @@ -477,7 +477,7 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment, *err = 0; UFS_I(inode)->i_lastfrag = max(UFS_I(inode)->i_lastfrag, fragment + count); - unlock_ufs(sb); + mutex_unlock(&UFS_SB(sb)->s_lock); if (newcount < request) ufs_free_fragments (inode, result + newcount, request - newcount); ufs_free_fragments (inode, tmp, oldcount); @@ -485,7 +485,7 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment, return result; } - unlock_ufs(sb); + mutex_unlock(&UFS_SB(sb)->s_lock); UFSD("EXIT (FAILED)\n"); return 0; } diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c index 7caa01652888..fd0203ce1f7f 100644 --- a/fs/ufs/ialloc.c +++ b/fs/ufs/ialloc.c @@ -69,11 +69,11 @@ void ufs_free_inode (struct inode * inode) ino = inode->i_ino; - lock_ufs(sb); + mutex_lock(&UFS_SB(sb)->s_lock); if (!((ino > 1) && (ino < (uspi->s_ncg * uspi->s_ipg )))) { ufs_warning(sb, "ufs_free_inode", "reserved inode or nonexistent inode %u\n", ino); - unlock_ufs(sb); + mutex_unlock(&UFS_SB(sb)->s_lock); return; } @@ -81,7 +81,7 @@ void ufs_free_inode (struct inode * inode) bit = ufs_inotocgoff (ino); ucpi = ufs_load_cylinder (sb, cg); if (!ucpi) { - unlock_ufs(sb); + mutex_unlock(&UFS_SB(sb)->s_lock); return; } ucg = ubh_get_ucg(UCPI_UBH(ucpi)); @@ -115,7 +115,7 @@ void ufs_free_inode (struct inode * inode) ubh_sync_block(UCPI_UBH(ucpi)); ufs_mark_sb_dirty(sb); - unlock_ufs(sb); + mutex_unlock(&UFS_SB(sb)->s_lock); UFSD("EXIT\n"); } @@ -193,7 +193,7 @@ struct inode *ufs_new_inode(struct inode *dir, umode_t mode) sbi = UFS_SB(sb); uspi = sbi->s_uspi; - lock_ufs(sb); + mutex_lock(&sbi->s_lock); /* * Try to place the inode in its parent directory @@ -331,21 +331,21 @@ cg_found: sync_dirty_buffer(bh); brelse(bh); } - unlock_ufs(sb); + mutex_unlock(&sbi->s_lock); UFSD("allocating inode %lu\n", inode->i_ino); UFSD("EXIT\n"); return inode; fail_remove_inode: - unlock_ufs(sb); + mutex_unlock(&sbi->s_lock); clear_nlink(inode); unlock_new_inode(inode); iput(inode); UFSD("EXIT (FAILED): err %d\n", err); return ERR_PTR(err); failed: - unlock_ufs(sb); + mutex_unlock(&sbi->s_lock); make_bad_inode(inode); iput (inode); UFSD("EXIT (FAILED): err %d\n", err); diff --git a/fs/ufs/super.c b/fs/ufs/super.c index b3bc3e7ae79d..afe9955654c8 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -694,6 +694,7 @@ static int ufs_sync_fs(struct super_block *sb, int wait) unsigned flags; lock_ufs(sb); + mutex_lock(&UFS_SB(sb)->s_lock); UFSD("ENTER\n"); @@ -711,6 +712,7 @@ static int ufs_sync_fs(struct super_block *sb, int wait) ufs_put_cstotal(sb); UFSD("EXIT\n"); + mutex_unlock(&UFS_SB(sb)->s_lock); unlock_ufs(sb); return 0; @@ -1277,6 +1279,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data) sync_filesystem(sb); lock_ufs(sb); + mutex_lock(&UFS_SB(sb)->s_lock); uspi = UFS_SB(sb)->s_uspi; flags = UFS_SB(sb)->s_flags; usb1 = ubh_get_usb_first(uspi); @@ -1290,6 +1293,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data) new_mount_opt = 0; ufs_set_opt (new_mount_opt, ONERROR_LOCK); if (!ufs_parse_options (data, &new_mount_opt)) { + mutex_unlock(&UFS_SB(sb)->s_lock); unlock_ufs(sb); return -EINVAL; } @@ -1297,12 +1301,14 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data) new_mount_opt |= ufstype; } else if ((new_mount_opt & UFS_MOUNT_UFSTYPE) != ufstype) { pr_err("ufstype can't be changed during remount\n"); + mutex_unlock(&UFS_SB(sb)->s_lock); unlock_ufs(sb); return -EINVAL; } if ((*mount_flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) { UFS_SB(sb)->s_mount_opt = new_mount_opt; + mutex_unlock(&UFS_SB(sb)->s_lock); unlock_ufs(sb); return 0; } @@ -1326,6 +1332,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data) */ #ifndef CONFIG_UFS_FS_WRITE pr_err("ufs was compiled with read-only support, can't be mounted as read-write\n"); + mutex_unlock(&UFS_SB(sb)->s_lock); unlock_ufs(sb); return -EINVAL; #else @@ -1335,11 +1342,13 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data) ufstype != UFS_MOUNT_UFSTYPE_SUNx86 && ufstype != UFS_MOUNT_UFSTYPE_UFS2) { pr_err("this ufstype is read-only supported\n"); + mutex_unlock(&UFS_SB(sb)->s_lock); unlock_ufs(sb); return -EINVAL; } if (!ufs_read_cylinder_structures(sb)) { pr_err("failed during remounting\n"); + mutex_unlock(&UFS_SB(sb)->s_lock); unlock_ufs(sb); return -EPERM; } @@ -1347,6 +1356,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data) #endif } UFS_SB(sb)->s_mount_opt = new_mount_opt; + mutex_unlock(&UFS_SB(sb)->s_lock); unlock_ufs(sb); return 0; } diff --git a/fs/ufs/ufs.h b/fs/ufs/ufs.h index 2a07396d5f9e..cf6368d42d4a 100644 --- a/fs/ufs/ufs.h +++ b/fs/ufs/ufs.h @@ -30,6 +30,7 @@ struct ufs_sb_info { int work_queued; /* non-zero if the delayed work is queued */ struct delayed_work sync_work; /* FS sync delayed work */ spinlock_t work_lock; /* protects sync_work and work_queued */ + struct mutex s_lock; }; struct ufs_inode_info { -- cgit From 12ecbb4b1d765a5076920999298d9625439dbe58 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 1 Jun 2015 14:52:04 +0200 Subject: ufs: Fix warning from unlock_new_inode() Commit e4502c63f56aeca88 (ufs: deal with nfsd/iget races) introduced unlock_new_inode() call into ufs_add_nondir(). However that function gets called also from ufs_link() which hands it already initialized inode and thus unlock_new_inode() complains. The problem is harmless but annoying. Fix the problem by opencoding necessary stuff in ufs_link() Fixes: e4502c63f56aeca887ced37f24e0def1ef11cec8 Signed-off-by: Jan Kara Signed-off-by: Al Viro --- fs/ufs/namei.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c index 1f5223c9e1e2..2346b83fa12b 100644 --- a/fs/ufs/namei.c +++ b/fs/ufs/namei.c @@ -174,7 +174,12 @@ static int ufs_link (struct dentry * old_dentry, struct inode * dir, inode_inc_link_count(inode); ihold(inode); - error = ufs_add_nondir(dentry, inode); + error = ufs_add_link(dentry, inode); + if (error) { + inode_dec_link_count(inode); + iput(inode); + } else + d_instantiate(dentry, inode); unlock_ufs(dir->i_sb); return error; } -- cgit From 514d748f69c97a51a2645eb198ac5c6218f22ff9 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 2 Jun 2015 11:26:34 +0200 Subject: ufs: Fix possible deadlock when looking up directories Commit e4502c63f56aeca88 (ufs: deal with nfsd/iget races) made ufs create inodes with I_NEW flag set. However ufs_mkdir() never cleared this flag. Thus if someone ever tried to lookup the directory by inode number, he would deadlock waiting for I_NEW to be cleared. Luckily this mostly happens only if the filesystem is exported over NFS since otherwise we have the inode attached to dentry and don't look it up by inode number. In rare cases dentry can get freed without inode being freed and then we'd hit the deadlock even without NFS export. Fix the problem by clearing I_NEW before instantiating new directory inode. Fixes: e4502c63f56aeca887ced37f24e0def1ef11cec8 Reported-by: Fabian Frederick Signed-off-by: Jan Kara Signed-off-by: Al Viro --- fs/ufs/namei.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c index 2346b83fa12b..60ee32249b72 100644 --- a/fs/ufs/namei.c +++ b/fs/ufs/namei.c @@ -212,6 +212,7 @@ static int ufs_mkdir(struct inode * dir, struct dentry * dentry, umode_t mode) goto out_fail; unlock_ufs(dir->i_sb); + unlock_new_inode(inode); d_instantiate(dentry, inode); out: return err; -- cgit From a50e4a02ad6957ef6f77cccaa8ef6a337f1856af Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 16 Jun 2015 01:50:43 -0400 Subject: ufs: don't bother with lock_ufs()/unlock_ufs() for directory access We are already serialized by ->i_mutex and operations on different directories are independent. These calls are just rudiments of blind BKL conversion and they should've been removed back then. Signed-off-by: Al Viro --- fs/ufs/namei.c | 54 ++++++++++++++---------------------------------------- 1 file changed, 14 insertions(+), 40 deletions(-) diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c index 60ee32249b72..3429079c11e2 100644 --- a/fs/ufs/namei.c +++ b/fs/ufs/namei.c @@ -56,11 +56,9 @@ static struct dentry *ufs_lookup(struct inode * dir, struct dentry *dentry, unsi if (dentry->d_name.len > UFS_MAXNAMLEN) return ERR_PTR(-ENAMETOOLONG); - lock_ufs(dir->i_sb); ino = ufs_inode_by_name(dir, &dentry->d_name); if (ino) inode = ufs_iget(dir->i_sb, ino); - unlock_ufs(dir->i_sb); return d_splice_alias(inode, dentry); } @@ -76,24 +74,16 @@ static int ufs_create (struct inode * dir, struct dentry * dentry, umode_t mode, bool excl) { struct inode *inode; - int err; - - UFSD("BEGIN\n"); inode = ufs_new_inode(dir, mode); - err = PTR_ERR(inode); + if (IS_ERR(inode)) + return PTR_ERR(inode); - if (!IS_ERR(inode)) { - inode->i_op = &ufs_file_inode_operations; - inode->i_fop = &ufs_file_operations; - inode->i_mapping->a_ops = &ufs_aops; - mark_inode_dirty(inode); - lock_ufs(dir->i_sb); - err = ufs_add_nondir(dentry, inode); - unlock_ufs(dir->i_sb); - } - UFSD("END: err=%d\n", err); - return err; + inode->i_op = &ufs_file_inode_operations; + inode->i_fop = &ufs_file_operations; + inode->i_mapping->a_ops = &ufs_aops; + mark_inode_dirty(inode); + return ufs_add_nondir(dentry, inode); } static int ufs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) @@ -110,9 +100,7 @@ static int ufs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev init_special_inode(inode, mode, rdev); ufs_set_inode_dev(inode->i_sb, UFS_I(inode), rdev); mark_inode_dirty(inode); - lock_ufs(dir->i_sb); err = ufs_add_nondir(dentry, inode); - unlock_ufs(dir->i_sb); } return err; } @@ -121,18 +109,17 @@ static int ufs_symlink (struct inode * dir, struct dentry * dentry, const char * symname) { struct super_block * sb = dir->i_sb; - int err = -ENAMETOOLONG; + int err; unsigned l = strlen(symname)+1; struct inode * inode; if (l > sb->s_blocksize) - goto out_notlocked; + return -ENAMETOOLONG; - lock_ufs(dir->i_sb); inode = ufs_new_inode(dir, S_IFLNK | S_IRWXUGO); err = PTR_ERR(inode); if (IS_ERR(inode)) - goto out; + return err; if (l > UFS_SB(sb)->s_uspi->s_maxsymlinklen) { /* slow symlink */ @@ -149,17 +136,13 @@ static int ufs_symlink (struct inode * dir, struct dentry * dentry, } mark_inode_dirty(inode); - err = ufs_add_nondir(dentry, inode); -out: - unlock_ufs(dir->i_sb); -out_notlocked: - return err; + return ufs_add_nondir(dentry, inode); out_fail: inode_dec_link_count(inode); unlock_new_inode(inode); iput(inode); - goto out; + return err; } static int ufs_link (struct dentry * old_dentry, struct inode * dir, @@ -168,8 +151,6 @@ static int ufs_link (struct dentry * old_dentry, struct inode * dir, struct inode *inode = d_inode(old_dentry); int error; - lock_ufs(dir->i_sb); - inode->i_ctime = CURRENT_TIME_SEC; inode_inc_link_count(inode); ihold(inode); @@ -180,7 +161,6 @@ static int ufs_link (struct dentry * old_dentry, struct inode * dir, iput(inode); } else d_instantiate(dentry, inode); - unlock_ufs(dir->i_sb); return error; } @@ -189,7 +169,6 @@ static int ufs_mkdir(struct inode * dir, struct dentry * dentry, umode_t mode) struct inode * inode; int err; - lock_ufs(dir->i_sb); inode_inc_link_count(dir); inode = ufs_new_inode(dir, S_IFDIR|mode); @@ -210,12 +189,10 @@ static int ufs_mkdir(struct inode * dir, struct dentry * dentry, umode_t mode) err = ufs_add_link(dentry, inode); if (err) goto out_fail; - unlock_ufs(dir->i_sb); unlock_new_inode(inode); d_instantiate(dentry, inode); -out: - return err; + return 0; out_fail: inode_dec_link_count(inode); @@ -224,8 +201,7 @@ out_fail: iput (inode); out_dir: inode_dec_link_count(dir); - unlock_ufs(dir->i_sb); - goto out; + return err; } static int ufs_unlink(struct inode *dir, struct dentry *dentry) @@ -255,7 +231,6 @@ static int ufs_rmdir (struct inode * dir, struct dentry *dentry) struct inode * inode = d_inode(dentry); int err= -ENOTEMPTY; - lock_ufs(dir->i_sb); if (ufs_empty_dir (inode)) { err = ufs_unlink(dir, dentry); if (!err) { @@ -264,7 +239,6 @@ static int ufs_rmdir (struct inode * dir, struct dentry *dentry) inode_dec_link_count(dir); } } - unlock_ufs(dir->i_sb); return err; } -- cgit From 70d45cdb664390a5573c05f1eecc8432dcc6581b Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 16 Jun 2015 01:56:23 -0400 Subject: ufs: don't touch mtime/ctime of directory being moved See "ext2: Do not update mtime of a moved directory" (and followup in "ext2: fix unbalanced kmap()/kunmap()") for background; this is UFS equivalent - the same problem exists here. Signed-off-by: Al Viro --- fs/ufs/dir.c | 6 ++++-- fs/ufs/namei.c | 9 +++++++-- fs/ufs/ufs.h | 2 +- 3 files changed, 12 insertions(+), 5 deletions(-) diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c index 1bfe8cabff0f..862d284d438e 100644 --- a/fs/ufs/dir.c +++ b/fs/ufs/dir.c @@ -87,7 +87,8 @@ ino_t ufs_inode_by_name(struct inode *dir, const struct qstr *qstr) /* Releases the page */ void ufs_set_link(struct inode *dir, struct ufs_dir_entry *de, - struct page *page, struct inode *inode) + struct page *page, struct inode *inode, + bool update_times) { loff_t pos = page_offset(page) + (char *) de - (char *) page_address(page); @@ -103,7 +104,8 @@ void ufs_set_link(struct inode *dir, struct ufs_dir_entry *de, err = ufs_commit_chunk(page, pos, len); ufs_put_page(page); - dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC; + if (update_times) + dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC; mark_inode_dirty(dir); } diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c index 3429079c11e2..d3fdfa22add5 100644 --- a/fs/ufs/namei.c +++ b/fs/ufs/namei.c @@ -276,7 +276,7 @@ static int ufs_rename(struct inode *old_dir, struct dentry *old_dentry, new_de = ufs_find_entry(new_dir, &new_dentry->d_name, &new_page); if (!new_de) goto out_dir; - ufs_set_link(new_dir, new_de, new_page, old_inode); + ufs_set_link(new_dir, new_de, new_page, old_inode, 1); new_inode->i_ctime = CURRENT_TIME_SEC; if (dir_de) drop_nlink(new_inode); @@ -299,7 +299,12 @@ static int ufs_rename(struct inode *old_dir, struct dentry *old_dentry, mark_inode_dirty(old_inode); if (dir_de) { - ufs_set_link(old_inode, dir_de, dir_page, new_dir); + if (old_dir != new_dir) + ufs_set_link(old_inode, dir_de, dir_page, new_dir, 0); + else { + kunmap(dir_page); + page_cache_release(dir_page); + } inode_dec_link_count(old_dir); } return 0; diff --git a/fs/ufs/ufs.h b/fs/ufs/ufs.h index cf6368d42d4a..2e31ea2e35a3 100644 --- a/fs/ufs/ufs.h +++ b/fs/ufs/ufs.h @@ -106,7 +106,7 @@ extern int ufs_delete_entry(struct inode *, struct ufs_dir_entry *, struct page extern int ufs_empty_dir (struct inode *); extern struct ufs_dir_entry *ufs_dotdot(struct inode *, struct page **); extern void ufs_set_link(struct inode *dir, struct ufs_dir_entry *de, - struct page *page, struct inode *inode); + struct page *page, struct inode *inode, bool update_times); /* file.c */ extern const struct inode_operations ufs_file_inode_operations; -- cgit From e4f95517f18271b1da36cfc5d700e46844396d6e Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 17 Jun 2015 18:15:45 +0200 Subject: fs/ufs: restore s_lock mutex_init() Add last missing line in commit "cdd9eefdf905" ("fs/ufs: restore s_lock mutex") Signed-off-by: Fabian Frederick Signed-off-by: Al Viro --- fs/ufs/super.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/ufs/super.c b/fs/ufs/super.c index afe9955654c8..dc33f9416340 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -801,6 +801,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent) UFSD("flag %u\n", (int)(sb->s_flags & MS_RDONLY)); mutex_init(&sbi->mutex); + mutex_init(&sbi->s_lock); spin_lock_init(&sbi->work_lock); INIT_DELAYED_WORK(&sbi->sync_work, delayed_sync_fs); /* -- cgit From f25801ee4680ef1db21e15c112e6e5fe3ffe8da5 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 18 Jun 2015 14:32:23 +0100 Subject: overlay: Call ovl_drop_write() earlier in ovl_dentry_open() Call ovl_drop_write() earlier in ovl_dentry_open() before we call vfs_open() as we've done the copy up for which we needed the freeze-write lock by that point. Signed-off-by: David Howells Signed-off-by: Al Viro --- fs/overlayfs/inode.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index 308379b2d0b2..21079d1ca2aa 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -343,31 +343,25 @@ static int ovl_dentry_open(struct dentry *dentry, struct file *file, int err; struct path realpath; enum ovl_path_type type; - bool want_write = false; type = ovl_path_real(dentry, &realpath); if (ovl_open_need_copy_up(file->f_flags, type, realpath.dentry)) { - want_write = true; err = ovl_want_write(dentry); if (err) - goto out; + return err; if (file->f_flags & O_TRUNC) err = ovl_copy_up_last(dentry, NULL, true); else err = ovl_copy_up(dentry); + ovl_drop_write(dentry); if (err) - goto out_drop_write; + return err; ovl_path_upper(dentry, &realpath); } - err = vfs_open(&realpath, file, cred); -out_drop_write: - if (want_write) - ovl_drop_write(dentry); -out: - return err; + return vfs_open(&realpath, file, cred); } static const struct inode_operations ovl_file_inode_operations = { -- cgit From 4bacc9c9234c7c8eec44f5ed4e960d9f96fa0f01 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 18 Jun 2015 14:32:31 +0100 Subject: overlayfs: Make f_path always point to the overlay and f_inode to the underlay Make file->f_path always point to the overlay dentry so that the path in /proc/pid/fd is correct and to ensure that label-based LSMs have access to the overlay as well as the underlay (path-based LSMs probably don't need it). Using my union testsuite to set things up, before the patch I see: [root@andromeda union-testsuite]# bash 5 /a/foo107 [root@andromeda union-testsuite]# stat /mnt/a/foo107 ... Device: 23h/35d Inode: 13381 Links: 1 ... [root@andromeda union-testsuite]# stat -L /proc/$$/fd/5 ... Device: 23h/35d Inode: 13381 Links: 1 ... After the patch: [root@andromeda union-testsuite]# bash 5 /mnt/a/foo107 [root@andromeda union-testsuite]# stat /mnt/a/foo107 ... Device: 23h/35d Inode: 40346 Links: 1 ... [root@andromeda union-testsuite]# stat -L /proc/$$/fd/5 ... Device: 23h/35d Inode: 40346 Links: 1 ... Note the change in where /proc/$$/fd/5 points to in the ls command. It was pointing to /a/foo107 (which doesn't exist) and now points to /mnt/a/foo107 (which is correct). The inode accessed, however, is the lower layer. The union layer is on device 25h/37d and the upper layer on 24h/36d. Signed-off-by: David Howells Signed-off-by: Al Viro --- fs/dcache.c | 5 ++++- fs/internal.h | 1 + fs/open.c | 49 +++++++++++++++++++++++++----------------------- fs/overlayfs/inode.c | 14 ++++++-------- fs/overlayfs/overlayfs.h | 1 + fs/overlayfs/super.c | 1 + include/linux/dcache.h | 2 ++ include/linux/fs.h | 2 -- 8 files changed, 41 insertions(+), 34 deletions(-) diff --git a/fs/dcache.c b/fs/dcache.c index 37b5afdaf698..c4ce35110704 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1673,7 +1673,8 @@ void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op) DCACHE_OP_COMPARE | DCACHE_OP_REVALIDATE | DCACHE_OP_WEAK_REVALIDATE | - DCACHE_OP_DELETE )); + DCACHE_OP_DELETE | + DCACHE_OP_SELECT_INODE)); dentry->d_op = op; if (!op) return; @@ -1689,6 +1690,8 @@ void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op) dentry->d_flags |= DCACHE_OP_DELETE; if (op->d_prune) dentry->d_flags |= DCACHE_OP_PRUNE; + if (op->d_select_inode) + dentry->d_flags |= DCACHE_OP_SELECT_INODE; } EXPORT_SYMBOL(d_set_d_op); diff --git a/fs/internal.h b/fs/internal.h index 01dce1d1476b..4d5af583ab03 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -107,6 +107,7 @@ extern struct file *do_file_open_root(struct dentry *, struct vfsmount *, extern long do_handle_open(int mountdirfd, struct file_handle __user *ufh, int open_flag); extern int open_check_o_direct(struct file *f); +extern int vfs_open(const struct path *, struct file *, const struct cred *); /* * inode.c diff --git a/fs/open.c b/fs/open.c index e0250bdcc440..b1c5823b7f11 100644 --- a/fs/open.c +++ b/fs/open.c @@ -678,18 +678,18 @@ int open_check_o_direct(struct file *f) } static int do_dentry_open(struct file *f, + struct inode *inode, int (*open)(struct inode *, struct file *), const struct cred *cred) { static const struct file_operations empty_fops = {}; - struct inode *inode; int error; f->f_mode = OPEN_FMODE(f->f_flags) | FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE; path_get(&f->f_path); - inode = f->f_inode = f->f_path.dentry->d_inode; + f->f_inode = inode; f->f_mapping = inode->i_mapping; if (unlikely(f->f_flags & O_PATH)) { @@ -793,7 +793,8 @@ int finish_open(struct file *file, struct dentry *dentry, BUG_ON(*opened & FILE_OPENED); /* once it's opened, it's opened */ file->f_path.dentry = dentry; - error = do_dentry_open(file, open, current_cred()); + error = do_dentry_open(file, d_backing_inode(dentry), open, + current_cred()); if (!error) *opened |= FILE_OPENED; @@ -822,6 +823,28 @@ int finish_no_open(struct file *file, struct dentry *dentry) } EXPORT_SYMBOL(finish_no_open); +/** + * vfs_open - open the file at the given path + * @path: path to open + * @file: newly allocated file with f_flag initialized + * @cred: credentials to use + */ +int vfs_open(const struct path *path, struct file *file, + const struct cred *cred) +{ + struct dentry *dentry = path->dentry; + struct inode *inode = dentry->d_inode; + + file->f_path = *path; + if (dentry->d_flags & DCACHE_OP_SELECT_INODE) { + inode = dentry->d_op->d_select_inode(dentry, file->f_flags); + if (IS_ERR(inode)) + return PTR_ERR(inode); + } + + return do_dentry_open(file, inode, NULL, cred); +} + struct file *dentry_open(const struct path *path, int flags, const struct cred *cred) { @@ -853,26 +876,6 @@ struct file *dentry_open(const struct path *path, int flags, } EXPORT_SYMBOL(dentry_open); -/** - * vfs_open - open the file at the given path - * @path: path to open - * @filp: newly allocated file with f_flag initialized - * @cred: credentials to use - */ -int vfs_open(const struct path *path, struct file *filp, - const struct cred *cred) -{ - struct inode *inode = path->dentry->d_inode; - - if (inode->i_op->dentry_open) - return inode->i_op->dentry_open(path->dentry, filp, cred); - else { - filp->f_path = *path; - return do_dentry_open(filp, NULL, cred); - } -} -EXPORT_SYMBOL(vfs_open); - static inline int build_open_flags(int flags, umode_t mode, struct open_flags *op) { int lookup_flags = 0; diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index 21079d1ca2aa..f140e3dbfb7b 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -337,31 +337,30 @@ static bool ovl_open_need_copy_up(int flags, enum ovl_path_type type, return true; } -static int ovl_dentry_open(struct dentry *dentry, struct file *file, - const struct cred *cred) +struct inode *ovl_d_select_inode(struct dentry *dentry, unsigned file_flags) { int err; struct path realpath; enum ovl_path_type type; type = ovl_path_real(dentry, &realpath); - if (ovl_open_need_copy_up(file->f_flags, type, realpath.dentry)) { + if (ovl_open_need_copy_up(file_flags, type, realpath.dentry)) { err = ovl_want_write(dentry); if (err) - return err; + return ERR_PTR(err); - if (file->f_flags & O_TRUNC) + if (file_flags & O_TRUNC) err = ovl_copy_up_last(dentry, NULL, true); else err = ovl_copy_up(dentry); ovl_drop_write(dentry); if (err) - return err; + return ERR_PTR(err); ovl_path_upper(dentry, &realpath); } - return vfs_open(&realpath, file, cred); + return d_backing_inode(realpath.dentry); } static const struct inode_operations ovl_file_inode_operations = { @@ -372,7 +371,6 @@ static const struct inode_operations ovl_file_inode_operations = { .getxattr = ovl_getxattr, .listxattr = ovl_listxattr, .removexattr = ovl_removexattr, - .dentry_open = ovl_dentry_open, }; static const struct inode_operations ovl_symlink_inode_operations = { diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index 17ac5afc9ffb..ea5a40b06e3a 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -173,6 +173,7 @@ ssize_t ovl_getxattr(struct dentry *dentry, const char *name, void *value, size_t size); ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size); int ovl_removexattr(struct dentry *dentry, const char *name); +struct inode *ovl_d_select_inode(struct dentry *dentry, unsigned file_flags); struct inode *ovl_new_inode(struct super_block *sb, umode_t mode, struct ovl_entry *oe); diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index 5f0d1993e6e3..84c5e27fbfd9 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -275,6 +275,7 @@ static void ovl_dentry_release(struct dentry *dentry) static const struct dentry_operations ovl_dentry_operations = { .d_release = ovl_dentry_release, + .d_select_inode = ovl_d_select_inode, }; static struct ovl_entry *ovl_alloc_entry(unsigned int numlower) diff --git a/include/linux/dcache.h b/include/linux/dcache.h index df334cbacc6d..167ec0934049 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -160,6 +160,7 @@ struct dentry_operations { char *(*d_dname)(struct dentry *, char *, int); struct vfsmount *(*d_automount)(struct path *); int (*d_manage)(struct dentry *, bool); + struct inode *(*d_select_inode)(struct dentry *, unsigned); } ____cacheline_aligned; /* @@ -225,6 +226,7 @@ struct dentry_operations { #define DCACHE_MAY_FREE 0x00800000 #define DCACHE_FALLTHRU 0x01000000 /* Fall through to lower layer */ +#define DCACHE_OP_SELECT_INODE 0x02000000 /* Unioned entry: dcache op selects inode */ extern seqlock_t rename_lock; diff --git a/include/linux/fs.h b/include/linux/fs.h index b577e801b4af..2bd77e10e8e5 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1641,7 +1641,6 @@ struct inode_operations { int (*set_acl)(struct inode *, struct posix_acl *, int); /* WARNING: probably going away soon, do not use! */ - int (*dentry_open)(struct dentry *, struct file *, const struct cred *); } ____cacheline_aligned; ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector, @@ -2194,7 +2193,6 @@ extern struct file *file_open_name(struct filename *, int, umode_t); extern struct file *filp_open(const char *, int, umode_t); extern struct file *file_open_root(struct dentry *, struct vfsmount *, const char *, int); -extern int vfs_open(const struct path *, struct file *, const struct cred *); extern struct file * dentry_open(const struct path *, int, const struct cred *); extern int filp_close(struct file *, fl_owner_t id); -- cgit From 9bf39ab2adafd7cf8740859cb49e7b7952813a5d Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 19 Jun 2015 10:29:13 +0200 Subject: vfs: add file_path() helper Turn d_path(&file->f_path, ...); into file_path(file, ...); Signed-off-by: Miklos Szeredi Signed-off-by: Al Viro --- arch/arc/kernel/troubleshoot.c | 10 +++------- arch/blackfin/kernel/trace.c | 2 +- arch/tile/kernel/stack.c | 2 +- arch/tile/mm/elf.c | 2 +- drivers/block/loop.c | 2 +- drivers/md/bitmap.c | 2 +- drivers/md/md.c | 2 +- drivers/usb/gadget/function/f_mass_storage.c | 2 +- drivers/usb/gadget/function/storage_common.c | 2 +- fs/binfmt_elf.c | 4 ++-- fs/coredump.c | 2 +- fs/ext4/super.c | 2 +- fs/open.c | 6 ++++++ include/linux/fs.h | 2 ++ kernel/events/core.c | 2 +- mm/memory.c | 2 +- 16 files changed, 25 insertions(+), 21 deletions(-) diff --git a/arch/arc/kernel/troubleshoot.c b/arch/arc/kernel/troubleshoot.c index e00a01879025..9f80c5adcb68 100644 --- a/arch/arc/kernel/troubleshoot.c +++ b/arch/arc/kernel/troubleshoot.c @@ -67,15 +67,12 @@ static void print_task_path_n_nm(struct task_struct *tsk, char *buf) mmput(mm); if (exe_file) { - path = exe_file->f_path; - path_get(&exe_file->f_path); + path_nm = file_path(exe_file, buf, 255); fput(exe_file); - path_nm = d_path(&path, buf, 255); - path_put(&path); } done: - pr_info("Path: %s\n", path_nm); + pr_info("Path: %s\n", !IS_ERR(path_nm) ? path_nm : "?"); } static void show_faulting_vma(unsigned long address, char *buf) @@ -99,8 +96,7 @@ static void show_faulting_vma(unsigned long address, char *buf) if (vma && (vma->vm_start <= address)) { struct file *file = vma->vm_file; if (file) { - struct path *path = &file->f_path; - nm = d_path(path, buf, PAGE_SIZE - 1); + nm = file_path(file, buf, PAGE_SIZE - 1); inode = file_inode(vma->vm_file); dev = inode->i_sb->s_dev; ino = inode->i_ino; diff --git a/arch/blackfin/kernel/trace.c b/arch/blackfin/kernel/trace.c index c36efa0c7163..719dd796c12c 100644 --- a/arch/blackfin/kernel/trace.c +++ b/arch/blackfin/kernel/trace.c @@ -136,7 +136,7 @@ void decode_address(char *buf, unsigned long address) struct file *file = vma->vm_file; if (file) { - char *d_name = d_path(&file->f_path, _tmpbuf, + char *d_name = file_path(file, _tmpbuf, sizeof(_tmpbuf)); if (!IS_ERR(d_name)) name = d_name; diff --git a/arch/tile/kernel/stack.c b/arch/tile/kernel/stack.c index c42dce50acd8..8d62cf12c2c0 100644 --- a/arch/tile/kernel/stack.c +++ b/arch/tile/kernel/stack.c @@ -334,7 +334,7 @@ static void describe_addr(struct KBacktraceIterator *kbt, } if (vma->vm_file) { - p = d_path(&vma->vm_file->f_path, buf, bufsize); + p = file_path(vma->vm_file, buf, bufsize); if (IS_ERR(p)) p = "?"; name = kbasename(p); diff --git a/arch/tile/mm/elf.c b/arch/tile/mm/elf.c index f7ddae3725a4..6225cc998db1 100644 --- a/arch/tile/mm/elf.c +++ b/arch/tile/mm/elf.c @@ -56,7 +56,7 @@ static int notify_exec(struct mm_struct *mm) if (exe_file == NULL) goto done_free; - path = d_path(&exe_file->f_path, buf, PAGE_SIZE); + path = file_path(exe_file, buf, PAGE_SIZE); if (IS_ERR(path)) goto done_put; diff --git a/drivers/block/loop.c b/drivers/block/loop.c index d7173cb1ea76..0d8ad59413cd 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -568,7 +568,7 @@ static ssize_t loop_attr_backing_file_show(struct loop_device *lo, char *buf) spin_lock_irq(&lo->lo_lock); if (lo->lo_backing_file) - p = d_path(&lo->lo_backing_file->f_path, buf, PAGE_SIZE - 1); + p = file_path(lo->lo_backing_file, buf, PAGE_SIZE - 1); spin_unlock_irq(&lo->lo_lock); if (IS_ERR_OR_NULL(p)) diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index 2bc56e2a3526..dda33d648354 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -834,7 +834,7 @@ static void bitmap_file_kick(struct bitmap *bitmap) if (bitmap->storage.file) { path = kmalloc(PAGE_SIZE, GFP_KERNEL); if (path) - ptr = d_path(&bitmap->storage.file->f_path, + ptr = file_path(bitmap->storage.file, path, PAGE_SIZE); printk(KERN_ALERT diff --git a/drivers/md/md.c b/drivers/md/md.c index 593a02476c78..e67f3ac137bf 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -5741,7 +5741,7 @@ static int get_bitmap_file(struct mddev *mddev, void __user * arg) /* bitmap disabled, zero the first byte and copy out */ if (!mddev->bitmap_info.file) file->pathname[0] = '\0'; - else if ((ptr = d_path(&mddev->bitmap_info.file->f_path, + else if ((ptr = file_path(mddev->bitmap_info.file, file->pathname, sizeof(file->pathname))), IS_ERR(ptr)) err = PTR_ERR(ptr); diff --git a/drivers/usb/gadget/function/f_mass_storage.c b/drivers/usb/gadget/function/f_mass_storage.c index 3cc109f3c9c8..d2259c663996 100644 --- a/drivers/usb/gadget/function/f_mass_storage.c +++ b/drivers/usb/gadget/function/f_mass_storage.c @@ -2936,7 +2936,7 @@ int fsg_common_create_lun(struct fsg_common *common, struct fsg_lun_config *cfg, if (fsg_lun_is_open(lun)) { p = "(error)"; if (pathbuf) { - p = d_path(&lun->filp->f_path, pathbuf, PATH_MAX); + p = file_path(lun->filp, pathbuf, PATH_MAX); if (IS_ERR(p)) p = "(error)"; } diff --git a/drivers/usb/gadget/function/storage_common.c b/drivers/usb/gadget/function/storage_common.c index 648f9e489b39..d62683017cf3 100644 --- a/drivers/usb/gadget/function/storage_common.c +++ b/drivers/usb/gadget/function/storage_common.c @@ -341,7 +341,7 @@ ssize_t fsg_show_file(struct fsg_lun *curlun, struct rw_semaphore *filesem, down_read(filesem); if (fsg_lun_is_open(curlun)) { /* Get the complete pathname */ - p = d_path(&curlun->filp->f_path, buf, PAGE_SIZE - 1); + p = file_path(curlun->filp, buf, PAGE_SIZE - 1); if (IS_ERR(p)) rc = PTR_ERR(p); else { diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 241ef68d2893..5046b6247103 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1530,7 +1530,7 @@ static int fill_files_note(struct memelfnote *note) file = vma->vm_file; if (!file) continue; - filename = d_path(&file->f_path, name_curpos, remaining); + filename = file_path(file, name_curpos, remaining); if (IS_ERR(filename)) { if (PTR_ERR(filename) == -ENAMETOOLONG) { vfree(data); @@ -1540,7 +1540,7 @@ static int fill_files_note(struct memelfnote *note) continue; } - /* d_path() fills at the end, move name down */ + /* file_path() fills at the end, move name down */ /* n = strlen(filename) + 1: */ n = (name_curpos + remaining) - filename; remaining = filename - name_curpos; diff --git a/fs/coredump.c b/fs/coredump.c index bbbe139ab280..5b771b36cc6e 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -138,7 +138,7 @@ static int cn_print_exe_file(struct core_name *cn) goto put_exe_file; } - path = d_path(&exe_file->f_path, pathbuf, PATH_MAX); + path = file_path(exe_file, pathbuf, PATH_MAX); if (IS_ERR(path)) { ret = PTR_ERR(path); goto free_buf; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index f06d0589ddba..0ae853d2e1f1 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -449,7 +449,7 @@ void __ext4_error_file(struct file *file, const char *function, es = EXT4_SB(inode->i_sb)->s_es; es->s_last_error_ino = cpu_to_le32(inode->i_ino); if (ext4_error_ratelimit(inode->i_sb)) { - path = d_path(&(file->f_path), pathname, sizeof(pathname)); + path = file_path(file, pathname, sizeof(pathname)); if (IS_ERR(path)) path = "(unknown)"; va_start(args, fmt); diff --git a/fs/open.c b/fs/open.c index b1c5823b7f11..1dbc79358d59 100644 --- a/fs/open.c +++ b/fs/open.c @@ -823,6 +823,12 @@ int finish_no_open(struct file *file, struct dentry *dentry) } EXPORT_SYMBOL(finish_no_open); +char *file_path(struct file *filp, char *buf, int buflen) +{ + return d_path(&filp->f_path, buf, buflen); +} +EXPORT_SYMBOL(file_path); + /** * vfs_open - open the file at the given path * @path: path to open diff --git a/include/linux/fs.h b/include/linux/fs.h index 2bd77e10e8e5..2c135ad741a9 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2500,6 +2500,8 @@ extern struct file * open_exec(const char *); extern int is_subdir(struct dentry *, struct dentry *); extern int path_is_under(struct path *, struct path *); +extern char *file_path(struct file *, char *, int); + #include /* needed for stackable file system support */ diff --git a/kernel/events/core.c b/kernel/events/core.c index 81aa3a4ece9f..5c964e845483 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5791,7 +5791,7 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) * need to add enough zero bytes after the string to handle * the 64bit alignment we do later. */ - name = d_path(&file->f_path, buf, PATH_MAX - sizeof(u64)); + name = file_path(file, buf, PATH_MAX - sizeof(u64)); if (IS_ERR(name)) { name = "//toolong"; goto cpy_name; diff --git a/mm/memory.c b/mm/memory.c index 22e037e3364e..28c10da1efbc 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3724,7 +3724,7 @@ void print_vma_addr(char *prefix, unsigned long ip) if (buf) { char *p; - p = d_path(&f->f_path, buf, PAGE_SIZE); + p = file_path(f, buf, PAGE_SIZE); if (IS_ERR(p)) p = "?"; printk("%s%s[%lx+%lx]", prefix, kbasename(p), -- cgit From 2726d56620ce71f40dd583d51391b86e1ab8cc57 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 19 Jun 2015 10:30:28 +0200 Subject: vfs: add seq_file_path() helper Turn seq_path(..., &file->f_path, ...); into seq_file_path(..., file, ...); Signed-off-by: Miklos Szeredi Signed-off-by: Al Viro --- drivers/md/bitmap.c | 2 +- fs/proc/nommu.c | 2 +- fs/proc/task_mmu.c | 4 ++-- fs/proc/task_nommu.c | 2 +- fs/seq_file.c | 14 ++++++++++++++ include/linux/seq_file.h | 1 + mm/swapfile.c | 2 +- 7 files changed, 21 insertions(+), 6 deletions(-) diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index dda33d648354..3813fdfee4be 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -1922,7 +1922,7 @@ void bitmap_status(struct seq_file *seq, struct bitmap *bitmap) chunk_kb ? "KB" : "B"); if (bitmap->storage.file) { seq_printf(seq, ", file: "); - seq_path(seq, &bitmap->storage.file->f_path, " \t\n"); + seq_file_path(seq, bitmap->storage.file, " \t\n"); } seq_printf(seq, "\n"); diff --git a/fs/proc/nommu.c b/fs/proc/nommu.c index d4a35746cab9..f8595e8b5cd0 100644 --- a/fs/proc/nommu.c +++ b/fs/proc/nommu.c @@ -64,7 +64,7 @@ static int nommu_region_show(struct seq_file *m, struct vm_region *region) if (file) { seq_pad(m, ' '); - seq_path(m, &file->f_path, ""); + seq_file_path(m, file, ""); } seq_putc(m, '\n'); diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 6dee68d013ff..ca1e091881d4 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -310,7 +310,7 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid) */ if (file) { seq_pad(m, ' '); - seq_path(m, &file->f_path, "\n"); + seq_file_path(m, file, "\n"); goto done; } @@ -1509,7 +1509,7 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid) if (file) { seq_puts(m, " file="); - seq_path(m, &file->f_path, "\n\t= "); + seq_file_path(m, file, "\n\t= "); } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) { seq_puts(m, " heap"); } else { diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c index 599ec2e20104..e0d64c92e4f6 100644 --- a/fs/proc/task_nommu.c +++ b/fs/proc/task_nommu.c @@ -180,7 +180,7 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma, if (file) { seq_pad(m, ' '); - seq_path(m, &file->f_path, ""); + seq_file_path(m, file, ""); } else if (mm) { pid_t tid = pid_of_stack(priv, vma, is_pid); diff --git a/fs/seq_file.c b/fs/seq_file.c index 555f82155be8..d8a0545ad7ea 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -487,6 +487,20 @@ int seq_path(struct seq_file *m, const struct path *path, const char *esc) } EXPORT_SYMBOL(seq_path); +/** + * seq_file_path - seq_file interface to print a pathname of a file + * @m: the seq_file handle + * @file: the struct file to print + * @esc: set of characters to escape in the output + * + * return the absolute path to the file. + */ +int seq_file_path(struct seq_file *m, struct file *file, const char *esc) +{ + return seq_path(m, &file->f_path, esc); +} +EXPORT_SYMBOL(seq_file_path); + /* * Same as seq_path, but relative to supplied root. */ diff --git a/include/linux/seq_file.h b/include/linux/seq_file.h index afbb1fd77c77..912a7c482649 100644 --- a/include/linux/seq_file.h +++ b/include/linux/seq_file.h @@ -123,6 +123,7 @@ __printf(2, 3) int seq_printf(struct seq_file *, const char *, ...); __printf(2, 0) int seq_vprintf(struct seq_file *, const char *, va_list args); int seq_path(struct seq_file *, const struct path *, const char *); +int seq_file_path(struct seq_file *, struct file *, const char *); int seq_dentry(struct seq_file *, struct dentry *, const char *); int seq_path_root(struct seq_file *m, const struct path *path, const struct path *root, const char *esc); diff --git a/mm/swapfile.c b/mm/swapfile.c index a7e72103f23b..41e4581af7c5 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -2032,7 +2032,7 @@ static int swap_show(struct seq_file *swap, void *v) } file = si->swap_file; - len = seq_path(swap, &file->f_path, " \t\n\\"); + len = seq_file_path(swap, file, " \t\n\\"); seq_printf(swap, "%*s%s\t%u\t%u\t%d\n", len < 40 ? 40 - len : 1, " ", S_ISBLK(file_inode(file)->i_mode) ? -- cgit From 6b6dabc8dcefaf9997ce037c70b32d570ced9d3e Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 21 Jun 2015 01:37:24 -0400 Subject: nilfs2_direct_IO(): remove dead code Signed-off-by: Al Viro --- fs/nilfs2/inode.c | 22 ++-------------------- 1 file changed, 2 insertions(+), 20 deletions(-) diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index 258d9fe2521a..4a73d6dffabf 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -307,31 +307,13 @@ static int nilfs_write_end(struct file *file, struct address_space *mapping, static ssize_t nilfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset) { - struct file *file = iocb->ki_filp; - struct address_space *mapping = file->f_mapping; - struct inode *inode = file->f_mapping->host; - size_t count = iov_iter_count(iter); - ssize_t size; + struct inode *inode = file_inode(iocb->ki_filp); if (iov_iter_rw(iter) == WRITE) return 0; /* Needs synchronization with the cleaner */ - size = blockdev_direct_IO(iocb, inode, iter, offset, nilfs_get_block); - - /* - * In case of error extending write may have instantiated a few - * blocks outside i_size. Trim these off again. - */ - if (unlikely(iov_iter_rw(iter) == WRITE && size < 0)) { - loff_t isize = i_size_read(inode); - loff_t end = offset + count; - - if (end > isize) - nilfs_write_failed(mapping, end); - } - - return size; + return blockdev_direct_IO(iocb, inode, iter, offset, nilfs_get_block); } const struct address_space_operations nilfs_aops = { -- cgit From c0c3a718e3ab2430a52a60d614b109e5e48e83e2 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 19 Jun 2015 09:00:55 +1000 Subject: fs/posix_acl.c: make posix_acl_create() safer and cleaner If posix_acl_create() returns an error code then "*acl" and "*default_acl" can be uninitialized or point to freed memory. This is a dangerous thing to do. For example, it causes a problem in ocfs2_reflink(): fs/ocfs2/refcounttree.c:4327 ocfs2_reflink() error: potentially using uninitialized 'default_acl'. I've re-written this so we set the pointers to NULL at the start. I've added a temporary "clone" variable to hold the value of "*acl" until end. Setting them to NULL means means we don't need the "no_acl" label. We may as well remove the "apply_umask" stuff forward and remove that label as well. Signed-off-by: Dan Carpenter Cc: Alexander Viro Cc: Joel Becker Cc: Mark Fasheh Signed-off-by: Andrew Morton --- fs/posix_acl.c | 46 ++++++++++++++++++++-------------------------- 1 file changed, 20 insertions(+), 26 deletions(-) diff --git a/fs/posix_acl.c b/fs/posix_acl.c index 84bb65b83570..4fb17ded7d47 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -547,51 +547,45 @@ posix_acl_create(struct inode *dir, umode_t *mode, struct posix_acl **default_acl, struct posix_acl **acl) { struct posix_acl *p; + struct posix_acl *clone; int ret; + *acl = NULL; + *default_acl = NULL; + if (S_ISLNK(*mode) || !IS_POSIXACL(dir)) - goto no_acl; + return 0; p = get_acl(dir, ACL_TYPE_DEFAULT); - if (IS_ERR(p)) { - if (p == ERR_PTR(-EOPNOTSUPP)) - goto apply_umask; - return PTR_ERR(p); + if (!p || p == ERR_PTR(-EOPNOTSUPP)) { + *mode &= ~current_umask(); + return 0; } + if (IS_ERR(p)) + return PTR_ERR(p); - if (!p) - goto apply_umask; - - *acl = posix_acl_clone(p, GFP_NOFS); - if (!*acl) + clone = posix_acl_clone(p, GFP_NOFS); + if (!clone) goto no_mem; - ret = posix_acl_create_masq(*acl, mode); + ret = posix_acl_create_masq(clone, mode); if (ret < 0) goto no_mem_clone; - if (ret == 0) { - posix_acl_release(*acl); - *acl = NULL; - } + if (ret == 0) + posix_acl_release(clone); + else + *acl = clone; - if (!S_ISDIR(*mode)) { + if (!S_ISDIR(*mode)) posix_acl_release(p); - *default_acl = NULL; - } else { + else *default_acl = p; - } - return 0; -apply_umask: - *mode &= ~current_umask(); -no_acl: - *default_acl = NULL; - *acl = NULL; return 0; no_mem_clone: - posix_acl_release(*acl); + posix_acl_release(clone); no_mem: posix_acl_release(p); return -ENOMEM; -- cgit From 2426f3910069ed47c0cc58559a6d088af7920201 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 21 May 2015 16:05:52 +0200 Subject: fs: Fix S_NOSEC handling file_remove_suid() could mistakenly set S_NOSEC inode bit when root was modifying the file. As a result following writes to the file by ordinary user would avoid clearing suid or sgid bits. Fix the bug by checking actual mode bits before setting S_NOSEC. CC: stable@vger.kernel.org Signed-off-by: Jan Kara Signed-off-by: Al Viro --- fs/inode.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/inode.c b/fs/inode.c index e8d62688ed91..07f4cb5eab4b 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1706,8 +1706,8 @@ int file_remove_suid(struct file *file) error = security_inode_killpriv(dentry); if (!error && killsuid) error = __remove_suid(dentry, killsuid); - if (!error && (inode->i_sb->s_flags & MS_NOSEC)) - inode->i_flags |= S_NOSEC; + if (!error) + inode_has_no_xattr(inode); return error; } -- cgit From 5fa8e0a1c6a762857ae67d1628c58b9a02362003 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 21 May 2015 16:05:53 +0200 Subject: fs: Rename file_remove_suid() to file_remove_privs() file_remove_suid() is a misnomer since it removes also file capabilities stored in xattrs and sets S_NOSEC flag. Also should_remove_suid() tells something else than whether file_remove_suid() call is necessary which leads to bugs. Signed-off-by: Jan Kara Signed-off-by: Al Viro --- fs/btrfs/file.c | 2 +- fs/ceph/file.c | 2 +- fs/fuse/file.c | 2 +- fs/inode.c | 13 ++++++++----- fs/ntfs/file.c | 2 +- fs/xfs/xfs_file.c | 2 +- include/linux/fs.h | 2 +- mm/filemap.c | 2 +- 8 files changed, 15 insertions(+), 12 deletions(-) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index b072e17479aa..86f97282779a 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1748,7 +1748,7 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, } current->backing_dev_info = inode_to_bdi(inode); - err = file_remove_suid(file); + err = file_remove_privs(file); if (err) { mutex_unlock(&inode->i_mutex); goto out; diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 3b6b522b4b31..e55fe32c6224 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -959,7 +959,7 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from) pos = iocb->ki_pos; count = iov_iter_count(from); - err = file_remove_suid(file); + err = file_remove_privs(file); if (err) goto out; diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 5ef05b5c4cff..1344647965dc 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1169,7 +1169,7 @@ static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from) if (err <= 0) goto out; - err = file_remove_suid(file); + err = file_remove_privs(file); if (err) goto out; diff --git a/fs/inode.c b/fs/inode.c index 07f4cb5eab4b..849210c155dc 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1685,7 +1685,11 @@ static int __remove_suid(struct dentry *dentry, int kill) return notify_change(dentry, &newattrs, NULL); } -int file_remove_suid(struct file *file) +/* + * Remove special file priviledges (suid, capabilities) when file is written + * to or truncated. + */ +int file_remove_privs(struct file *file) { struct dentry *dentry = file->f_path.dentry; struct inode *inode = d_inode(dentry); @@ -1711,7 +1715,7 @@ int file_remove_suid(struct file *file) return error; } -EXPORT_SYMBOL(file_remove_suid); +EXPORT_SYMBOL(file_remove_privs); /** * file_update_time - update mtime and ctime time @@ -1966,9 +1970,8 @@ EXPORT_SYMBOL(inode_dio_wait); * inode is being instantiated). The reason for the cmpxchg() loop * --- which wouldn't be necessary if all code paths which modify * i_flags actually followed this rule, is that there is at least one - * code path which doesn't today --- for example, - * __generic_file_aio_write() calls file_remove_suid() without holding - * i_mutex --- so we use cmpxchg() out of an abundance of caution. + * code path which doesn't today so we use cmpxchg() out of an abundance + * of caution. * * In the long run, i_mutex is overkill, and we should probably look * at using the i_lock spinlock to protect i_flags, and then make sure diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index 7bb487e663b4..182bb93aa79c 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c @@ -382,7 +382,7 @@ static ssize_t ntfs_prepare_file_for_write(struct kiocb *iocb, base_ni = ni; if (NInoAttr(ni)) base_ni = ni->ext.base_ntfs_ino; - err = file_remove_suid(file); + err = file_remove_privs(file); if (unlikely(err)) goto out; /* diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 8121e75352ee..f3e4fbb59985 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -623,7 +623,7 @@ restart: * setgid bits if the process is not being run by root. This keeps * people from modifying setuid and setgid binaries. */ - return file_remove_suid(file); + return file_remove_privs(file); } /* diff --git a/include/linux/fs.h b/include/linux/fs.h index 2c135ad741a9..641e68d850cf 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2553,7 +2553,7 @@ extern struct inode *new_inode_pseudo(struct super_block *sb); extern struct inode *new_inode(struct super_block *sb); extern void free_inode_nonrcu(struct inode *inode); extern int should_remove_suid(struct dentry *); -extern int file_remove_suid(struct file *); +extern int file_remove_privs(struct file *); extern void __insert_inode_hash(struct inode *, unsigned long hashval); static inline void insert_inode_hash(struct inode *inode) diff --git a/mm/filemap.c b/mm/filemap.c index 6bf5e42d560a..f851e36802d5 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2536,7 +2536,7 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) /* We can write back this queue in page reclaim */ current->backing_dev_info = inode_to_bdi(inode); - err = file_remove_suid(file); + err = file_remove_privs(file); if (err) goto out; -- cgit From dbfae0cdcd87602737101d4417811f4323156b54 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 21 May 2015 16:05:54 +0200 Subject: fs: Provide function telling whether file_remove_privs() will do anything Provide function telling whether file_remove_privs() will do anything. Currently we only have should_remove_suid() and that does something slightly different. Signed-off-by: Jan Kara Signed-off-by: Al Viro --- fs/inode.c | 44 ++++++++++++++++++++++++++++++++------------ include/linux/fs.h | 1 + 2 files changed, 33 insertions(+), 12 deletions(-) diff --git a/fs/inode.c b/fs/inode.c index 849210c155dc..8c2dd74455c9 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1673,7 +1673,32 @@ int should_remove_suid(struct dentry *dentry) } EXPORT_SYMBOL(should_remove_suid); -static int __remove_suid(struct dentry *dentry, int kill) +/* + * Return mask of changes for notify_change() that need to be done as a + * response to write or truncate. Return 0 if nothing has to be changed. + * Negative value on error (change should be denied). + */ +int file_needs_remove_privs(struct file *file) +{ + struct dentry *dentry = file->f_path.dentry; + struct inode *inode = d_inode(dentry); + int mask = 0; + int ret; + + if (IS_NOSEC(inode)) + return 0; + + mask = should_remove_suid(dentry); + ret = security_inode_need_killpriv(dentry); + if (ret < 0) + return ret; + if (ret) + mask |= ATTR_KILL_PRIV; + return mask; +} +EXPORT_SYMBOL(file_needs_remove_privs); + +static int __remove_privs(struct dentry *dentry, int kill) { struct iattr newattrs; @@ -1693,23 +1718,18 @@ int file_remove_privs(struct file *file) { struct dentry *dentry = file->f_path.dentry; struct inode *inode = d_inode(dentry); - int killsuid; - int killpriv; + int kill; int error = 0; /* Fast path for nothing security related */ if (IS_NOSEC(inode)) return 0; - killsuid = should_remove_suid(dentry); - killpriv = security_inode_need_killpriv(dentry); - - if (killpriv < 0) - return killpriv; - if (killpriv) - error = security_inode_killpriv(dentry); - if (!error && killsuid) - error = __remove_suid(dentry, killsuid); + kill = file_needs_remove_privs(file); + if (kill < 0) + return kill; + if (kill) + error = __remove_privs(dentry, kill); if (!error) inode_has_no_xattr(inode); diff --git a/include/linux/fs.h b/include/linux/fs.h index 641e68d850cf..ee60e8ab210f 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2554,6 +2554,7 @@ extern struct inode *new_inode(struct super_block *sb); extern void free_inode_nonrcu(struct inode *inode); extern int should_remove_suid(struct dentry *); extern int file_remove_privs(struct file *); +extern int file_needs_remove_privs(struct file *file); extern void __insert_inode_hash(struct inode *, unsigned long hashval); static inline void insert_inode_hash(struct inode *inode) -- cgit From 45f147a1bc97c743c6101a8d2741c69a51f583e4 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 21 May 2015 16:05:55 +0200 Subject: fs: Call security_ops->inode_killpriv on truncate Comment in include/linux/security.h says that ->inode_killpriv() should be called when setuid bit is being removed and that similar security labels (in fact this applies only to file capabilities) should be removed at this time as well. However we don't call ->inode_killpriv() when we remove suid bit on truncate. We fix the problem by calling ->inode_need_killpriv() and subsequently ->inode_killpriv() on truncate the same way as we do it on file write. After this patch there's only one user of should_remove_suid() - ocfs2 - and indeed it's buggy because it doesn't call ->inode_killpriv() on write. However fixing it is difficult because of special locking constraints. Signed-off-by: Jan Kara Signed-off-by: Al Viro --- fs/inode.c | 5 ++--- fs/open.c | 6 ++++-- include/linux/fs.h | 6 +++++- 3 files changed, 11 insertions(+), 6 deletions(-) diff --git a/fs/inode.c b/fs/inode.c index 8c2dd74455c9..0401d2c6d087 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1678,9 +1678,8 @@ EXPORT_SYMBOL(should_remove_suid); * response to write or truncate. Return 0 if nothing has to be changed. * Negative value on error (change should be denied). */ -int file_needs_remove_privs(struct file *file) +int dentry_needs_remove_privs(struct dentry *dentry) { - struct dentry *dentry = file->f_path.dentry; struct inode *inode = d_inode(dentry); int mask = 0; int ret; @@ -1696,7 +1695,7 @@ int file_needs_remove_privs(struct file *file) mask |= ATTR_KILL_PRIV; return mask; } -EXPORT_SYMBOL(file_needs_remove_privs); +EXPORT_SYMBOL(dentry_needs_remove_privs); static int __remove_privs(struct dentry *dentry, int kill) { diff --git a/fs/open.c b/fs/open.c index 1dbc79358d59..e33dab287fa0 100644 --- a/fs/open.c +++ b/fs/open.c @@ -51,8 +51,10 @@ int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs, newattrs.ia_valid |= ATTR_FILE; } - /* Remove suid/sgid on truncate too */ - ret = should_remove_suid(dentry); + /* Remove suid, sgid, and file capabilities on truncate too */ + ret = dentry_needs_remove_privs(dentry); + if (ret < 0) + return ret; if (ret) newattrs.ia_valid |= ret | ATTR_FORCE; diff --git a/include/linux/fs.h b/include/linux/fs.h index ee60e8ab210f..1e658b11c265 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2554,7 +2554,11 @@ extern struct inode *new_inode(struct super_block *sb); extern void free_inode_nonrcu(struct inode *inode); extern int should_remove_suid(struct dentry *); extern int file_remove_privs(struct file *); -extern int file_needs_remove_privs(struct file *file); +extern int dentry_needs_remove_privs(struct dentry *dentry); +static inline int file_needs_remove_privs(struct file *file) +{ + return dentry_needs_remove_privs(file->f_path.dentry); +} extern void __insert_inode_hash(struct inode *, unsigned long hashval); static inline void insert_inode_hash(struct inode *inode) -- cgit From a6de82cab123beaf9406024943caa0242f0618b0 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 21 May 2015 16:05:56 +0200 Subject: xfs: Correctly lock inode when removing suid and file capabilities Currently XFS calls file_remove_privs() without holding i_mutex. This is wrong because that function can end up messing with file permissions and file capabilities stored in xattrs for which we need i_mutex held. Fix the problem by grabbing iolock exclusively when we will need to change anything in permissions / xattrs. Reviewed-by: Dave Chinner Signed-off-by: Jan Kara Signed-off-by: Al Viro --- fs/xfs/xfs_file.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index f3e4fbb59985..71c2c712e609 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -563,6 +563,13 @@ restart: if (error) return error; + /* For changing security info in file_remove_privs() we need i_mutex */ + if (*iolock == XFS_IOLOCK_SHARED && !IS_NOSEC(inode)) { + xfs_rw_iunlock(ip, *iolock); + *iolock = XFS_IOLOCK_EXCL; + xfs_rw_ilock(ip, *iolock); + goto restart; + } /* * If the offset is beyond the size of the file, we need to zero any * blocks that fall between the existing EOF and the start of this @@ -623,7 +630,9 @@ restart: * setgid bits if the process is not being run by root. This keeps * people from modifying setuid and setgid binaries. */ - return file_remove_privs(file); + if (!IS_NOSEC(inode)) + return file_remove_privs(file); + return 0; } /* -- cgit From db6172c41194576ba2a27e64fa2a5576d11d6eb9 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Thu, 19 Mar 2015 12:28:04 +0100 Subject: fs: cleanup slight list_entry abuse list_entry is just a wrapper for container_of, but it is arguably wrong (and slightly confusing) to use it when the pointed-to struct member is not a struct list_head. Use container_of directly instead. Signed-off-by: Rasmus Villemoes Signed-off-by: Al Viro --- fs/affs/affs.h | 2 +- fs/befs/befs.h | 2 +- fs/coda/coda_linux.h | 2 +- fs/hfs/hfs_fs.h | 2 +- fs/hfsplus/hfsplus_fs.h | 2 +- fs/hpfs/hpfs_fn.h | 2 +- fs/jffs2/os-linux.h | 2 +- fs/jfs/jfs_incore.h | 2 +- fs/minix/minix.h | 2 +- fs/ntfs/inode.h | 2 +- fs/squashfs/squashfs_fs_i.h | 2 +- fs/sysv/sysv.h | 2 +- fs/udf/udf_i.h | 2 +- 13 files changed, 13 insertions(+), 13 deletions(-) diff --git a/fs/affs/affs.h b/fs/affs/affs.h index cffe8370fb44..c69a87eaf57d 100644 --- a/fs/affs/affs.h +++ b/fs/affs/affs.h @@ -64,7 +64,7 @@ struct affs_inode_info { /* short cut to get to the affs specific inode data */ static inline struct affs_inode_info *AFFS_I(struct inode *inode) { - return list_entry(inode, struct affs_inode_info, vfs_inode); + return container_of(inode, struct affs_inode_info, vfs_inode); } /* diff --git a/fs/befs/befs.h b/fs/befs/befs.h index 1fead8d56a98..35d19e8731e3 100644 --- a/fs/befs/befs.h +++ b/fs/befs/befs.h @@ -112,7 +112,7 @@ BEFS_SB(const struct super_block *super) static inline struct befs_inode_info * BEFS_I(const struct inode *inode) { - return list_entry(inode, struct befs_inode_info, vfs_inode); + return container_of(inode, struct befs_inode_info, vfs_inode); } static inline befs_blocknr_t diff --git a/fs/coda/coda_linux.h b/fs/coda/coda_linux.h index d6f7a76a1f5b..f829fe963f5b 100644 --- a/fs/coda/coda_linux.h +++ b/fs/coda/coda_linux.h @@ -79,7 +79,7 @@ void coda_sysctl_clean(void); static inline struct coda_inode_info *ITOC(struct inode *inode) { - return list_entry(inode, struct coda_inode_info, vfs_inode); + return container_of(inode, struct coda_inode_info, vfs_inode); } static __inline__ struct CodaFid *coda_i2f(struct inode *inode) diff --git a/fs/hfs/hfs_fs.h b/fs/hfs/hfs_fs.h index 95d255219b1e..1f1c7dcbcc2f 100644 --- a/fs/hfs/hfs_fs.h +++ b/fs/hfs/hfs_fs.h @@ -252,7 +252,7 @@ extern void hfs_mark_mdb_dirty(struct super_block *sb); #define __hfs_u_to_mtime(sec) cpu_to_be32(sec + 2082844800U - sys_tz.tz_minuteswest * 60) #define __hfs_m_to_utime(sec) (be32_to_cpu(sec) - 2082844800U + sys_tz.tz_minuteswest * 60) -#define HFS_I(inode) (list_entry(inode, struct hfs_inode_info, vfs_inode)) +#define HFS_I(inode) (container_of(inode, struct hfs_inode_info, vfs_inode)) #define HFS_SB(sb) ((struct hfs_sb_info *)(sb)->s_fs_info) #define hfs_m_to_utime(time) (struct timespec){ .tv_sec = __hfs_m_to_utime(time) } diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h index b0441d65fa54..f91a1faf819e 100644 --- a/fs/hfsplus/hfsplus_fs.h +++ b/fs/hfsplus/hfsplus_fs.h @@ -263,7 +263,7 @@ struct hfsplus_inode_info { static inline struct hfsplus_inode_info *HFSPLUS_I(struct inode *inode) { - return list_entry(inode, struct hfsplus_inode_info, vfs_inode); + return container_of(inode, struct hfsplus_inode_info, vfs_inode); } /* diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h index b63b75fa00e7..bb04b58d1d69 100644 --- a/fs/hpfs/hpfs_fn.h +++ b/fs/hpfs/hpfs_fn.h @@ -304,7 +304,7 @@ extern const struct address_space_operations hpfs_symlink_aops; static inline struct hpfs_inode_info *hpfs_i(struct inode *inode) { - return list_entry(inode, struct hpfs_inode_info, vfs_inode); + return container_of(inode, struct hpfs_inode_info, vfs_inode); } static inline struct hpfs_sb_info *hpfs_sb(struct super_block *sb) diff --git a/fs/jffs2/os-linux.h b/fs/jffs2/os-linux.h index d200a9b8fd5e..824e61ede465 100644 --- a/fs/jffs2/os-linux.h +++ b/fs/jffs2/os-linux.h @@ -19,7 +19,7 @@ struct kstatfs; struct kvec; -#define JFFS2_INODE_INFO(i) (list_entry(i, struct jffs2_inode_info, vfs_inode)) +#define JFFS2_INODE_INFO(i) (container_of(i, struct jffs2_inode_info, vfs_inode)) #define OFNI_EDONI_2SFFJ(f) (&(f)->vfs_inode) #define JFFS2_SB_INFO(sb) (sb->s_fs_info) #define OFNI_BS_2SFFJ(c) ((struct super_block *)c->os_priv) diff --git a/fs/jfs/jfs_incore.h b/fs/jfs/jfs_incore.h index fa7e795bd8ae..1f26d1910409 100644 --- a/fs/jfs/jfs_incore.h +++ b/fs/jfs/jfs_incore.h @@ -206,7 +206,7 @@ struct jfs_sb_info { static inline struct jfs_inode_info *JFS_IP(struct inode *inode) { - return list_entry(inode, struct jfs_inode_info, vfs_inode); + return container_of(inode, struct jfs_inode_info, vfs_inode); } static inline int jfs_dirtable_inline(struct inode *inode) diff --git a/fs/minix/minix.h b/fs/minix/minix.h index 1ebd11854622..01ad81dcacc5 100644 --- a/fs/minix/minix.h +++ b/fs/minix/minix.h @@ -84,7 +84,7 @@ static inline struct minix_sb_info *minix_sb(struct super_block *sb) static inline struct minix_inode_info *minix_i(struct inode *inode) { - return list_entry(inode, struct minix_inode_info, vfs_inode); + return container_of(inode, struct minix_inode_info, vfs_inode); } static inline unsigned minix_blocks_needed(unsigned bits, unsigned blocksize) diff --git a/fs/ntfs/inode.h b/fs/ntfs/inode.h index 76b6cfb579d7..b3c3469de6cb 100644 --- a/fs/ntfs/inode.h +++ b/fs/ntfs/inode.h @@ -239,7 +239,7 @@ typedef struct { */ static inline ntfs_inode *NTFS_I(struct inode *inode) { - return (ntfs_inode *)list_entry(inode, big_ntfs_inode, vfs_inode); + return (ntfs_inode *)container_of(inode, big_ntfs_inode, vfs_inode); } static inline struct inode *VFS_I(ntfs_inode *ni) diff --git a/fs/squashfs/squashfs_fs_i.h b/fs/squashfs/squashfs_fs_i.h index 73588e7700ed..d09fcd6fb85d 100644 --- a/fs/squashfs/squashfs_fs_i.h +++ b/fs/squashfs/squashfs_fs_i.h @@ -49,6 +49,6 @@ struct squashfs_inode_info { static inline struct squashfs_inode_info *squashfs_i(struct inode *inode) { - return list_entry(inode, struct squashfs_inode_info, vfs_inode); + return container_of(inode, struct squashfs_inode_info, vfs_inode); } #endif diff --git a/fs/sysv/sysv.h b/fs/sysv/sysv.h index 2c13525131cd..6c212288adcb 100644 --- a/fs/sysv/sysv.h +++ b/fs/sysv/sysv.h @@ -73,7 +73,7 @@ struct sysv_inode_info { static inline struct sysv_inode_info *SYSV_I(struct inode *inode) { - return list_entry(inode, struct sysv_inode_info, vfs_inode); + return container_of(inode, struct sysv_inode_info, vfs_inode); } static inline struct sysv_sb_info *SYSV_SB(struct super_block *sb) diff --git a/fs/udf/udf_i.h b/fs/udf/udf_i.h index b5cd8ed2aa12..b1b9a63d8cf3 100644 --- a/fs/udf/udf_i.h +++ b/fs/udf/udf_i.h @@ -56,7 +56,7 @@ struct udf_inode_info { static inline struct udf_inode_info *UDF_I(struct inode *inode) { - return list_entry(inode, struct udf_inode_info, vfs_inode); + return container_of(inode, struct udf_inode_info, vfs_inode); } #endif /* _UDF_I_H) */ -- cgit From e5e6e97fe0f63b374e44a22f5a5c2d151c7fa8c5 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 4 Jun 2015 21:49:23 -0400 Subject: remove the pointless include of lglock.h Signed-off-by: Al Viro --- fs/file_table.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/file_table.c b/fs/file_table.c index 294174dcc226..7f9d407c7595 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -20,7 +20,6 @@ #include #include #include -#include #include #include #include -- cgit From b57c2cb9ea1a02c2ae08e16de8c20cc13ffbf85a Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Sun, 24 May 2015 17:19:41 +0200 Subject: pagemap.h: move dir_pages() over there That function was declared in a lot of filesystems to calculate directory pages. Signed-off-by: Fabian Frederick Signed-off-by: Al Viro --- fs/exofs/dir.c | 6 ------ fs/ext2/dir.c | 5 ----- fs/freevxfs/vxfs_lookup.c | 7 ------- fs/minix/dir.c | 5 ----- fs/nilfs2/dir.c | 5 ----- fs/qnx6/dir.c | 5 ----- fs/sysv/dir.c | 5 ----- include/linux/pagemap.h | 6 ++++++ 8 files changed, 6 insertions(+), 38 deletions(-) diff --git a/fs/exofs/dir.c b/fs/exofs/dir.c index 4deb0b05b011..e5bb2abf77f9 100644 --- a/fs/exofs/dir.c +++ b/fs/exofs/dir.c @@ -44,12 +44,6 @@ static inline void exofs_put_page(struct page *page) page_cache_release(page); } -/* Accesses dir's inode->i_size must be called under inode lock */ -static inline unsigned long dir_pages(struct inode *inode) -{ - return (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; -} - static unsigned exofs_last_byte(struct inode *inode, unsigned long page_nr) { loff_t last_byte = inode->i_size; diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c index 796b491e6978..0c6638b40f21 100644 --- a/fs/ext2/dir.c +++ b/fs/ext2/dir.c @@ -70,11 +70,6 @@ static inline void ext2_put_page(struct page *page) page_cache_release(page); } -static inline unsigned long dir_pages(struct inode *inode) -{ - return (inode->i_size+PAGE_CACHE_SIZE-1)>>PAGE_CACHE_SHIFT; -} - /* * Return the offset into page `page_nr' of the last valid * byte in that page, plus one. diff --git a/fs/freevxfs/vxfs_lookup.c b/fs/freevxfs/vxfs_lookup.c index 99c7f0a37af4..484b32d3234a 100644 --- a/fs/freevxfs/vxfs_lookup.c +++ b/fs/freevxfs/vxfs_lookup.c @@ -61,13 +61,6 @@ const struct file_operations vxfs_dir_operations = { .iterate = vxfs_readdir, }; - -static inline u_long -dir_pages(struct inode *inode) -{ - return (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; -} - static inline u_long dir_blocks(struct inode *ip) { diff --git a/fs/minix/dir.c b/fs/minix/dir.c index 118e4e7bc935..d19ac258105a 100644 --- a/fs/minix/dir.c +++ b/fs/minix/dir.c @@ -45,11 +45,6 @@ minix_last_byte(struct inode *inode, unsigned long page_nr) return last_byte; } -static inline unsigned long dir_pages(struct inode *inode) -{ - return (inode->i_size+PAGE_CACHE_SIZE-1)>>PAGE_CACHE_SHIFT; -} - static int dir_commit_chunk(struct page *page, loff_t pos, unsigned len) { struct address_space *mapping = page->mapping; diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c index 0ee0bed3649b..6b8b92b19cec 100644 --- a/fs/nilfs2/dir.c +++ b/fs/nilfs2/dir.c @@ -61,11 +61,6 @@ static inline void nilfs_put_page(struct page *page) page_cache_release(page); } -static inline unsigned long dir_pages(struct inode *inode) -{ - return (inode->i_size+PAGE_CACHE_SIZE-1)>>PAGE_CACHE_SHIFT; -} - /* * Return the offset into page `page_nr' of the last valid * byte in that page, plus one. diff --git a/fs/qnx6/dir.c b/fs/qnx6/dir.c index 8d64bb5366bf..e1f37278cf97 100644 --- a/fs/qnx6/dir.c +++ b/fs/qnx6/dir.c @@ -32,11 +32,6 @@ static struct page *qnx6_get_page(struct inode *dir, unsigned long n) return page; } -static inline unsigned long dir_pages(struct inode *inode) -{ - return (inode->i_size+PAGE_CACHE_SIZE-1)>>PAGE_CACHE_SHIFT; -} - static unsigned last_entry(struct inode *inode, unsigned long page_nr) { unsigned long last_byte = inode->i_size; diff --git a/fs/sysv/dir.c b/fs/sysv/dir.c index 8f3555f00c54..63c1bcb224ee 100644 --- a/fs/sysv/dir.c +++ b/fs/sysv/dir.c @@ -33,11 +33,6 @@ static inline void dir_put_page(struct page *page) page_cache_release(page); } -static inline unsigned long dir_pages(struct inode *inode) -{ - return (inode->i_size+PAGE_CACHE_SIZE-1)>>PAGE_CACHE_SHIFT; -} - static int dir_commit_chunk(struct page *page, loff_t pos, unsigned len) { struct address_space *mapping = page->mapping; diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 4b3736f7065c..808942d31062 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -670,4 +670,10 @@ static inline int add_to_page_cache(struct page *page, return error; } +static inline unsigned long dir_pages(struct inode *inode) +{ + return (unsigned long)(inode->i_size + PAGE_CACHE_SIZE - 1) >> + PAGE_CACHE_SHIFT; +} + #endif /* _LINUX_PAGEMAP_H */ -- cgit From 5d754ced150e392c7b5dbf73ed2feddb60cd579a Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Sun, 24 May 2015 17:19:43 +0200 Subject: ufs: use dir_pages instead of ufs_dir_pages() dir_pages was declared in a lot of filesystems. Use newly dir_pages() from pagemap.h Signed-off-by: Fabian Frederick Signed-off-by: Al Viro --- fs/ufs/dir.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c index 862d284d438e..74f2e80288bf 100644 --- a/fs/ufs/dir.c +++ b/fs/ufs/dir.c @@ -65,11 +65,6 @@ static inline void ufs_put_page(struct page *page) page_cache_release(page); } -static inline unsigned long ufs_dir_pages(struct inode *inode) -{ - return (inode->i_size+PAGE_CACHE_SIZE-1)>>PAGE_CACHE_SHIFT; -} - ino_t ufs_inode_by_name(struct inode *dir, const struct qstr *qstr) { ino_t res = 0; @@ -258,7 +253,7 @@ struct ufs_dir_entry *ufs_find_entry(struct inode *dir, const struct qstr *qstr, int namelen = qstr->len; unsigned reclen = UFS_DIR_REC_LEN(namelen); unsigned long start, n; - unsigned long npages = ufs_dir_pages(dir); + unsigned long npages = dir_pages(dir); struct page *page = NULL; struct ufs_inode_info *ui = UFS_I(dir); struct ufs_dir_entry *de; @@ -322,7 +317,7 @@ int ufs_add_link(struct dentry *dentry, struct inode *inode) unsigned short rec_len, name_len; struct page *page = NULL; struct ufs_dir_entry *de; - unsigned long npages = ufs_dir_pages(dir); + unsigned long npages = dir_pages(dir); unsigned long n; char *kaddr; loff_t pos; @@ -439,7 +434,7 @@ ufs_readdir(struct file *file, struct dir_context *ctx) struct super_block *sb = inode->i_sb; unsigned int offset = pos & ~PAGE_CACHE_MASK; unsigned long n = pos >> PAGE_CACHE_SHIFT; - unsigned long npages = ufs_dir_pages(inode); + unsigned long npages = dir_pages(inode); unsigned chunk_mask = ~(UFS_SB(sb)->s_uspi->s_dirblksize - 1); int need_revalidate = file->f_version != inode->i_version; unsigned flags = UFS_SB(sb)->s_flags; @@ -610,7 +605,7 @@ int ufs_empty_dir(struct inode * inode) { struct super_block *sb = inode->i_sb; struct page *page = NULL; - unsigned long i, npages = ufs_dir_pages(inode); + unsigned long i, npages = dir_pages(inode); for (i = 0; i < npages; i++) { char *kaddr; -- cgit From dc3f4198eac14e52a98dfc79cd84b45e280f59cd Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 18 May 2015 10:10:34 -0400 Subject: make simple_positive() public Signed-off-by: Al Viro --- arch/powerpc/platforms/cell/spufs/inode.c | 2 +- arch/s390/hypfs/inode.c | 7 +------ drivers/block/drbd/drbd_debugfs.c | 10 +--------- drivers/infiniband/hw/ipath/ipath_fs.c | 2 +- drivers/infiniband/hw/qib/qib_fs.c | 2 +- fs/autofs4/autofs_i.h | 5 ----- fs/ceph/dir.c | 2 +- fs/configfs/inode.c | 2 +- fs/debugfs/inode.c | 11 +++-------- fs/libfs.c | 5 ----- fs/nfs/dir.c | 2 +- fs/tracefs/inode.c | 11 +++-------- include/linux/dcache.h | 5 +++++ security/inode.c | 19 ++++++------------- 14 files changed, 25 insertions(+), 60 deletions(-) diff --git a/arch/powerpc/platforms/cell/spufs/inode.c b/arch/powerpc/platforms/cell/spufs/inode.c index 1ba6307be4db..11634fa7ab3c 100644 --- a/arch/powerpc/platforms/cell/spufs/inode.c +++ b/arch/powerpc/platforms/cell/spufs/inode.c @@ -166,7 +166,7 @@ static void spufs_prune_dir(struct dentry *dir) mutex_lock(&d_inode(dir)->i_mutex); list_for_each_entry_safe(dentry, tmp, &dir->d_subdirs, d_child) { spin_lock(&dentry->d_lock); - if (!(d_unhashed(dentry)) && d_really_is_positive(dentry)) { + if (simple_positive(dentry)) { dget_dlock(dentry); __d_drop(dentry); spin_unlock(&dentry->d_lock); diff --git a/arch/s390/hypfs/inode.c b/arch/s390/hypfs/inode.c index d3f896a35b98..8ffad5437232 100644 --- a/arch/s390/hypfs/inode.c +++ b/arch/s390/hypfs/inode.c @@ -62,18 +62,13 @@ static void hypfs_add_dentry(struct dentry *dentry) hypfs_last_dentry = dentry; } -static inline int hypfs_positive(struct dentry *dentry) -{ - return d_really_is_positive(dentry) && !d_unhashed(dentry); -} - static void hypfs_remove(struct dentry *dentry) { struct dentry *parent; parent = dentry->d_parent; mutex_lock(&d_inode(parent)->i_mutex); - if (hypfs_positive(dentry)) { + if (simple_positive(dentry)) { if (d_is_dir(dentry)) simple_rmdir(d_inode(parent), dentry); else diff --git a/drivers/block/drbd/drbd_debugfs.c b/drivers/block/drbd/drbd_debugfs.c index a6ee3d750c30..6b88a35fb048 100644 --- a/drivers/block/drbd/drbd_debugfs.c +++ b/drivers/block/drbd/drbd_debugfs.c @@ -419,14 +419,6 @@ static int in_flight_summary_show(struct seq_file *m, void *pos) return 0; } -/* simple_positive(file->f_path.dentry) respectively debugfs_positive(), - * but neither is "reachable" from here. - * So we have our own inline version of it above. :-( */ -static inline int debugfs_positive(struct dentry *dentry) -{ - return d_really_is_positive(dentry) && !d_unhashed(dentry); -} - /* make sure at *open* time that the respective object won't go away. */ static int drbd_single_open(struct file *file, int (*show)(struct seq_file *, void *), void *data, struct kref *kref, @@ -444,7 +436,7 @@ static int drbd_single_open(struct file *file, int (*show)(struct seq_file *, vo /* serialize with d_delete() */ mutex_lock(&d_inode(parent)->i_mutex); /* Make sure the object is still alive */ - if (debugfs_positive(file->f_path.dentry) + if (simple_positive(file->f_path.dentry) && kref_get_unless_zero(kref)) ret = 0; mutex_unlock(&d_inode(parent)->i_mutex); diff --git a/drivers/infiniband/hw/ipath/ipath_fs.c b/drivers/infiniband/hw/ipath/ipath_fs.c index 1ca8e32a9592..25422a3a7238 100644 --- a/drivers/infiniband/hw/ipath/ipath_fs.c +++ b/drivers/infiniband/hw/ipath/ipath_fs.c @@ -277,7 +277,7 @@ static int remove_file(struct dentry *parent, char *name) } spin_lock(&tmp->d_lock); - if (!d_unhashed(tmp) && d_really_is_positive(tmp)) { + if (simple_positive(tmp)) { dget_dlock(tmp); __d_drop(tmp); spin_unlock(&tmp->d_lock); diff --git a/drivers/infiniband/hw/qib/qib_fs.c b/drivers/infiniband/hw/qib/qib_fs.c index bdd5d3857203..13ef22bd9459 100644 --- a/drivers/infiniband/hw/qib/qib_fs.c +++ b/drivers/infiniband/hw/qib/qib_fs.c @@ -455,7 +455,7 @@ static int remove_file(struct dentry *parent, char *name) } spin_lock(&tmp->d_lock); - if (!d_unhashed(tmp) && d_really_is_positive(tmp)) { + if (simple_positive(tmp)) { __d_drop(tmp); spin_unlock(&tmp->d_lock); simple_unlink(d_inode(parent), tmp); diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h index 5b700ef1e59d..c37149b929be 100644 --- a/fs/autofs4/autofs_i.h +++ b/fs/autofs4/autofs_i.h @@ -238,11 +238,6 @@ static inline u64 autofs4_get_ino(struct autofs_sb_info *sbi) return d_inode(sbi->sb->s_root)->i_ino; } -static inline int simple_positive(struct dentry *dentry) -{ - return d_really_is_positive(dentry) && !d_unhashed(dentry); -} - static inline void __autofs4_add_expiring(struct dentry *dentry) { struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 4248307fea90..edbb8da02a6a 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -161,7 +161,7 @@ more: } spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); if (di->lease_shared_gen == shared_gen && - !d_unhashed(dentry) && d_really_is_positive(dentry) && + simple_positive(dentry) && ceph_snap(d_inode(dentry)) != CEPH_SNAPDIR && ceph_ino(d_inode(dentry)) != CEPH_INO_CEPH && fpos_cmp(ctx->pos, di->offset) <= 0) diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c index 8d89f5fd0331..eae87575e681 100644 --- a/fs/configfs/inode.c +++ b/fs/configfs/inode.c @@ -236,7 +236,7 @@ void configfs_drop_dentry(struct configfs_dirent * sd, struct dentry * parent) if (dentry) { spin_lock(&dentry->d_lock); - if (!d_unhashed(dentry) && d_really_is_positive(dentry)) { + if (simple_positive(dentry)) { dget_dlock(dentry); __d_drop(dentry); spin_unlock(&dentry->d_lock); diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index 7eaec88ea970..ef86ad6bdc3e 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -44,11 +44,6 @@ static struct inode *debugfs_get_inode(struct super_block *sb) return inode; } -static inline int debugfs_positive(struct dentry *dentry) -{ - return d_really_is_positive(dentry) && !d_unhashed(dentry); -} - struct debugfs_mount_opts { kuid_t uid; kgid_t gid; @@ -522,7 +517,7 @@ static int __debugfs_remove(struct dentry *dentry, struct dentry *parent) { int ret = 0; - if (debugfs_positive(dentry)) { + if (simple_positive(dentry)) { dget(dentry); if (d_is_dir(dentry)) ret = simple_rmdir(d_inode(parent), dentry); @@ -602,7 +597,7 @@ void debugfs_remove_recursive(struct dentry *dentry) */ spin_lock(&parent->d_lock); list_for_each_entry(child, &parent->d_subdirs, d_child) { - if (!debugfs_positive(child)) + if (!simple_positive(child)) continue; /* perhaps simple_empty(child) makes more sense */ @@ -623,7 +618,7 @@ void debugfs_remove_recursive(struct dentry *dentry) * from d_subdirs. When releasing the parent->d_lock we can * no longer trust that the next pointer is valid. * Restart the loop. We'll skip this one with the - * debugfs_positive() check. + * simple_positive() check. */ goto loop; } diff --git a/fs/libfs.c b/fs/libfs.c index 65e1feca8b98..4d9e6c118fe1 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -20,11 +20,6 @@ #include "internal.h" -static inline int simple_positive(struct dentry *dentry) -{ - return d_really_is_positive(dentry) && !d_unhashed(dentry); -} - int simple_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) { diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index b2c8b31b2be7..b9108f4254a7 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1771,7 +1771,7 @@ EXPORT_SYMBOL_GPL(nfs_mkdir); static void nfs_dentry_handle_enoent(struct dentry *dentry) { - if (d_really_is_positive(dentry) && !d_unhashed(dentry)) + if (simple_positive(dentry)) d_delete(dentry); } diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c index d92bdf3b079a..6e8a1400d662 100644 --- a/fs/tracefs/inode.c +++ b/fs/tracefs/inode.c @@ -496,16 +496,11 @@ struct dentry *tracefs_create_instance_dir(const char *name, struct dentry *pare return dentry; } -static inline int tracefs_positive(struct dentry *dentry) -{ - return dentry->d_inode && !d_unhashed(dentry); -} - static int __tracefs_remove(struct dentry *dentry, struct dentry *parent) { int ret = 0; - if (tracefs_positive(dentry)) { + if (simple_positive(dentry)) { if (dentry->d_inode) { dget(dentry); switch (dentry->d_inode->i_mode & S_IFMT) { @@ -582,7 +577,7 @@ void tracefs_remove_recursive(struct dentry *dentry) */ spin_lock(&parent->d_lock); list_for_each_entry(child, &parent->d_subdirs, d_child) { - if (!tracefs_positive(child)) + if (!simple_positive(child)) continue; /* perhaps simple_empty(child) makes more sense */ @@ -603,7 +598,7 @@ void tracefs_remove_recursive(struct dentry *dentry) * from d_subdirs. When releasing the parent->d_lock we can * no longer trust that the next pointer is valid. * Restart the loop. We'll skip this one with the - * tracefs_positive() check. + * simple_positive() check. */ goto loop; } diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 167ec0934049..d2d50249b7b2 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -507,6 +507,11 @@ static inline bool d_really_is_positive(const struct dentry *dentry) return dentry->d_inode != NULL; } +static inline int simple_positive(struct dentry *dentry) +{ + return d_really_is_positive(dentry) && !d_unhashed(dentry); +} + extern void d_set_fallthru(struct dentry *dentry); static inline bool d_is_fallthru(const struct dentry *dentry) diff --git a/security/inode.c b/security/inode.c index 91503b79c5f8..6df0d8dae1e0 100644 --- a/security/inode.c +++ b/security/inode.c @@ -25,11 +25,6 @@ static struct vfsmount *mount; static int mount_count; -static inline int positive(struct dentry *dentry) -{ - return d_really_is_positive(dentry) && !d_unhashed(dentry); -} - static int fill_super(struct super_block *sb, void *data, int silent) { static struct tree_descr files[] = {{""}}; @@ -201,14 +196,12 @@ void securityfs_remove(struct dentry *dentry) return; mutex_lock(&d_inode(parent)->i_mutex); - if (positive(dentry)) { - if (d_really_is_positive(dentry)) { - if (d_is_dir(dentry)) - simple_rmdir(d_inode(parent), dentry); - else - simple_unlink(d_inode(parent), dentry); - dput(dentry); - } + if (simple_positive(dentry)) { + if (d_is_dir(dentry)) + simple_rmdir(d_inode(parent), dentry); + else + simple_unlink(d_inode(parent), dentry); + dput(dentry); } mutex_unlock(&d_inode(parent)->i_mutex); simple_release_fs(&mount, &mount_count); -- cgit From 06d7137e5c566e1e8a4acd4a30e7e12170a57b58 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 29 Jun 2015 12:07:04 -0400 Subject: namei: make set_root_rcu() return void The only caller that cares about its return value can just as easily pick it from nd->root_seq itself. We used to just calculate it and return to caller, but these days we are storing it in nd->root_seq in all cases. Signed-off-by: Al Viro --- fs/namei.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index 2dad0eaf91d3..ae4e4c18b2ac 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -792,7 +792,7 @@ static void set_root(struct nameidata *nd) get_fs_root(current->fs, &nd->root); } -static unsigned set_root_rcu(struct nameidata *nd) +static void set_root_rcu(struct nameidata *nd) { struct fs_struct *fs = current->fs; unsigned seq; @@ -802,7 +802,6 @@ static unsigned set_root_rcu(struct nameidata *nd) nd->root = fs->root; nd->root_seq = __read_seqcount_begin(&nd->root.dentry->d_seq); } while (read_seqcount_retry(&fs->seq, seq)); - return nd->root_seq; } static void path_put_conditional(struct path *path, struct nameidata *nd) @@ -1998,7 +1997,8 @@ static const char *path_init(struct nameidata *nd, unsigned flags) if (*s == '/') { if (flags & LOOKUP_RCU) { rcu_read_lock(); - nd->seq = set_root_rcu(nd); + set_root_rcu(nd); + nd->seq = nd->root_seq; } else { set_root(nd); path_get(&nd->root); -- cgit From 2adc376c551943a07170cbe70f43e6d6065f8906 Mon Sep 17 00:00:00 2001 From: Carlos Maiolino Date: Thu, 25 Jun 2015 12:25:58 -0300 Subject: vfs: avoid creation of inode number 0 in get_next_ino currently, get_next_ino() is able to create inodes with inode number = 0. This have a bad impact in the filesystems relying in this function to generate inode numbers. While there is no problem at all in having inodes with number 0, userspace tools which handle file management tasks can have problems handling these files, like for example, the impossiblity of users to delete these files, since glibc will ignore them. So, I believe the best way is kernel to avoid creating them. This problem has been raised previously, but the old thread didn't have any other update for a year+, and I've seen too many users hitting the same issue regarding the impossibility to delete files while using filesystems relying on this function. So, I'm starting the thread again, with the same patch that I believe is enough to address this problem. Signed-off-by: Carlos Maiolino Signed-off-by: Al Viro --- fs/inode.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/fs/inode.c b/fs/inode.c index 0401d2c6d087..648e71ce6ec2 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -840,7 +840,11 @@ unsigned int get_next_ino(void) } #endif - *p = ++res; + res++; + /* get_next_ino should not provide a 0 inode number */ + if (unlikely(!res)) + res++; + *p = res; put_cpu_var(last_ino); return res; } -- cgit From 1af95de6f0119d5bde02d3a811a9f3a3661e954e Mon Sep 17 00:00:00 2001 From: Wang YanQing Date: Tue, 23 Jun 2015 18:54:45 +0800 Subject: fs:super:get_anon_bdev: fix race condition could cause dev exceed its upper limitation Execution of get_anon_bdev concurrently and preemptive kernel all could bring race condition, it isn't enough to check dev against its upper limitation with equality operator only. This patch fix it. Signed-off-by: Wang YanQing Signed-off-by: Al Viro --- fs/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/super.c b/fs/super.c index 928c20f47af9..b61372354f2b 100644 --- a/fs/super.c +++ b/fs/super.c @@ -842,7 +842,7 @@ int get_anon_bdev(dev_t *p) else if (error) return -EAGAIN; - if (dev == (1 << MINORBITS)) { + if (dev >= (1 << MINORBITS)) { spin_lock(&unnamed_dev_lock); ida_remove(&unnamed_dev_ida, dev); if (unnamed_dev_start > dev) -- cgit From 8a81252b774b53e628a8a0fe18e2b8fc236d92cc Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 30 Jun 2015 15:54:08 +0200 Subject: fs/file.c: don't acquire files->file_lock in fd_install() Mateusz Guzik reported : Currently obtaining a new file descriptor results in locking fdtable twice - once in order to reserve a slot and second time to fill it. Holding the spinlock in __fd_install() is needed in case a resize is done, or to prevent a resize. Mateusz provided an RFC patch and a micro benchmark : http://people.redhat.com/~mguzik/pipebench.c A resize is an unlikely operation in a process lifetime, as table size is at least doubled at every resize. We can use RCU instead of the spinlock. __fd_install() must wait if a resize is in progress. The resize must block new __fd_install() callers from starting, and wait that ongoing install are finished (synchronize_sched()) resize should be attempted by a single thread to not waste resources. rcu_sched variant is used, as __fd_install() and expand_fdtable() run from process context. It gives us a ~30% speedup using pipebench on a dual Intel(R) Xeon(R) CPU E5-2696 v2 @ 2.50GHz Signed-off-by: Eric Dumazet Reported-by: Mateusz Guzik Acked-by: Mateusz Guzik Tested-by: Mateusz Guzik Signed-off-by: Al Viro --- Documentation/filesystems/porting | 4 +++ fs/file.c | 67 ++++++++++++++++++++++++++++----------- include/linux/fdtable.h | 3 ++ 3 files changed, 55 insertions(+), 19 deletions(-) diff --git a/Documentation/filesystems/porting b/Documentation/filesystems/porting index 3eae250254d5..ec5456113072 100644 --- a/Documentation/filesystems/porting +++ b/Documentation/filesystems/porting @@ -500,3 +500,7 @@ in your dentry operations instead. dentry, it does not get nameidata at all and it gets called only when cookie is non-NULL. Note that link body isn't available anymore, so if you need it, store it as cookie. +-- +[mandatory] + __fd_install() & fd_install() can now sleep. Callers should not + hold a spinlock or other resources that do not allow a schedule. diff --git a/fs/file.c b/fs/file.c index 93c5f89c248b..3d2eb4c542a4 100644 --- a/fs/file.c +++ b/fs/file.c @@ -147,6 +147,13 @@ static int expand_fdtable(struct files_struct *files, int nr) spin_unlock(&files->file_lock); new_fdt = alloc_fdtable(nr); + + /* make sure all __fd_install() have seen resize_in_progress + * or have finished their rcu_read_lock_sched() section. + */ + if (atomic_read(&files->count) > 1) + synchronize_sched(); + spin_lock(&files->file_lock); if (!new_fdt) return -ENOMEM; @@ -158,21 +165,14 @@ static int expand_fdtable(struct files_struct *files, int nr) __free_fdtable(new_fdt); return -EMFILE; } - /* - * Check again since another task may have expanded the fd table while - * we dropped the lock - */ cur_fdt = files_fdtable(files); - if (nr >= cur_fdt->max_fds) { - /* Continue as planned */ - copy_fdtable(new_fdt, cur_fdt); - rcu_assign_pointer(files->fdt, new_fdt); - if (cur_fdt != &files->fdtab) - call_rcu(&cur_fdt->rcu, free_fdtable_rcu); - } else { - /* Somebody else expanded, so undo our attempt */ - __free_fdtable(new_fdt); - } + BUG_ON(nr < cur_fdt->max_fds); + copy_fdtable(new_fdt, cur_fdt); + rcu_assign_pointer(files->fdt, new_fdt); + if (cur_fdt != &files->fdtab) + call_rcu(&cur_fdt->rcu, free_fdtable_rcu); + /* coupled with smp_rmb() in __fd_install() */ + smp_wmb(); return 1; } @@ -185,21 +185,38 @@ static int expand_fdtable(struct files_struct *files, int nr) * The files->file_lock should be held on entry, and will be held on exit. */ static int expand_files(struct files_struct *files, int nr) + __releases(files->file_lock) + __acquires(files->file_lock) { struct fdtable *fdt; + int expanded = 0; +repeat: fdt = files_fdtable(files); /* Do we need to expand? */ if (nr < fdt->max_fds) - return 0; + return expanded; /* Can we expand? */ if (nr >= sysctl_nr_open) return -EMFILE; + if (unlikely(files->resize_in_progress)) { + spin_unlock(&files->file_lock); + expanded = 1; + wait_event(files->resize_wait, !files->resize_in_progress); + spin_lock(&files->file_lock); + goto repeat; + } + /* All good, so we try */ - return expand_fdtable(files, nr); + files->resize_in_progress = true; + expanded = expand_fdtable(files, nr); + files->resize_in_progress = false; + + wake_up_all(&files->resize_wait); + return expanded; } static inline void __set_close_on_exec(int fd, struct fdtable *fdt) @@ -256,6 +273,8 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) atomic_set(&newf->count, 1); spin_lock_init(&newf->file_lock); + newf->resize_in_progress = false; + init_waitqueue_head(&newf->resize_wait); newf->next_fd = 0; new_fdt = &newf->fdtab; new_fdt->max_fds = NR_OPEN_DEFAULT; @@ -553,11 +572,21 @@ void __fd_install(struct files_struct *files, unsigned int fd, struct file *file) { struct fdtable *fdt; - spin_lock(&files->file_lock); - fdt = files_fdtable(files); + + might_sleep(); + rcu_read_lock_sched(); + + while (unlikely(files->resize_in_progress)) { + rcu_read_unlock_sched(); + wait_event(files->resize_wait, !files->resize_in_progress); + rcu_read_lock_sched(); + } + /* coupled with smp_wmb() in expand_fdtable() */ + smp_rmb(); + fdt = rcu_dereference_sched(files->fdt); BUG_ON(fdt->fd[fd] != NULL); rcu_assign_pointer(fdt->fd[fd], file); - spin_unlock(&files->file_lock); + rcu_read_unlock_sched(); } void fd_install(unsigned int fd, struct file *file) diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h index 230f87bdf5ad..fbb88740634a 100644 --- a/include/linux/fdtable.h +++ b/include/linux/fdtable.h @@ -47,6 +47,9 @@ struct files_struct { * read mostly part */ atomic_t count; + bool resize_in_progress; + wait_queue_head_t resize_wait; + struct fdtable __rcu *fdt; struct fdtable fdtab; /* -- cgit From 5ba97d2832f87943c43bb69cb1ef86dbc59df5bc Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 29 Jun 2015 17:10:30 +0200 Subject: fs/file.c: __fget() and dup2() atomicity rules __fget() does lockless fetch of pointer from the descriptor table, attempts to grab a reference and treats "it was already zero" as "it's already gone from the table, we just hadn't seen the store, let's fail". Unfortunately, that breaks the atomicity of dup2() - __fget() might see the old pointer, notice that it's been already dropped and treat that as "it's closed". What we should be getting is either the old file or new one, depending whether we come before or after dup2(). Dmitry had following test failing sometimes : int fd; void *Thread(void *x) { char buf; int n = read(fd, &buf, 1); if (n != 1) exit(printf("read failed: n=%d errno=%d\n", n, errno)); return 0; } int main() { fd = open("/dev/urandom", O_RDONLY); int fd2 = open("/dev/urandom", O_RDONLY); if (fd == -1 || fd2 == -1) exit(printf("open failed\n")); pthread_t th; pthread_create(&th, 0, Thread, 0); if (dup2(fd2, fd) == -1) exit(printf("dup2 failed\n")); pthread_join(th, 0); if (close(fd) == -1) exit(printf("close failed\n")); if (close(fd2) == -1) exit(printf("close failed\n")); printf("DONE\n"); return 0; } Signed-off-by: Eric Dumazet Reported-by: Dmitry Vyukov Signed-off-by: Al Viro --- fs/file.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/fs/file.c b/fs/file.c index 3d2eb4c542a4..6c672ad329e9 100644 --- a/fs/file.c +++ b/fs/file.c @@ -664,11 +664,17 @@ static struct file *__fget(unsigned int fd, fmode_t mask) struct file *file; rcu_read_lock(); +loop: file = fcheck_files(files, fd); if (file) { - /* File object ref couldn't be taken */ - if ((file->f_mode & mask) || !get_file_rcu(file)) + /* File object ref couldn't be taken. + * dup2() atomicity guarantee is the reason + * we loop to catch the new file (or NULL pointer) + */ + if (file->f_mode & mask) file = NULL; + else if (!get_file_rcu(file)) + goto loop; } rcu_read_unlock(); -- cgit From 44f4c054cae646a9296da05cdfe7d6e786f73d46 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 3 Jul 2015 10:40:38 -0400 Subject: dax: Add block size note to documentation For block devices which are small enough, mkfs will default to creating a filesystem with block sizes smaller than page size. Signed-off-by: Matthew Wilcox Signed-off-by: Al Viro --- Documentation/filesystems/dax.txt | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/Documentation/filesystems/dax.txt b/Documentation/filesystems/dax.txt index baf41118660d..7af2851d667c 100644 --- a/Documentation/filesystems/dax.txt +++ b/Documentation/filesystems/dax.txt @@ -18,8 +18,10 @@ Usage ----- If you have a block device which supports DAX, you can make a filesystem -on it as usual. When mounting it, use the -o dax option manually -or add 'dax' to the options in /etc/fstab. +on it as usual. The DAX code currently only supports files with a block +size equal to your kernel's PAGE_SIZE, so you may need to specify a block +size when creating the filesystem. When mounting it, use the "-o dax" +option on the command line or add 'dax' to the options in /etc/fstab. Implementation Tips for Block Driver Writers -- cgit From 872eb127e3a6cddcfca1410bb808d9b9bc773dc1 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 3 Jul 2015 10:40:39 -0400 Subject: dax: Use copy_from_iter_nocache When userspace does a write, there's no need for the written data to pollute the CPU cache. This matches the original XIP code. Signed-off-by: Matthew Wilcox Signed-off-by: Al Viro --- fs/dax.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/dax.c b/fs/dax.c index 6f65f00e58ec..159f796afa85 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -155,7 +155,7 @@ static ssize_t dax_io(struct inode *inode, struct iov_iter *iter, } if (iov_iter_rw(iter) == WRITE) - len = copy_from_iter(addr, max - pos, iter); + len = copy_from_iter_nocache(addr, max - pos, iter); else if (!hole) len = copy_to_iter(addr, max - pos, iter); else -- cgit From bbab37ddc20bae4709bca8745c128c4f46fe63c5 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 3 Jul 2015 10:40:42 -0400 Subject: block: Add support for DAX reads/writes to block devices If a block device supports the ->direct_access methods, bypass the normal DIO path and use DAX to go straight to memcpy() instead of allocating a DIO and a BIO. Includes support for the DIO_SKIP_DIO_COUNT flag in DAX, as is done in do_blockdev_direct_IO(). Signed-off-by: Matthew Wilcox Signed-off-by: Al Viro --- fs/block_dev.c | 4 ++++ fs/dax.c | 6 ++++-- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/fs/block_dev.c b/fs/block_dev.c index c7e4163ede87..5dde6dff4940 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -151,6 +151,9 @@ blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset) struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; + if (IS_DAX(inode)) + return dax_do_io(iocb, inode, iter, offset, blkdev_get_block, + NULL, DIO_SKIP_DIO_COUNT); return __blockdev_direct_IO(iocb, inode, I_BDEV(inode), iter, offset, blkdev_get_block, NULL, NULL, DIO_SKIP_DIO_COUNT); @@ -1173,6 +1176,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) bdev->bd_disk = disk; bdev->bd_queue = disk->queue; bdev->bd_contains = bdev; + bdev->bd_inode->i_flags = disk->fops->direct_access ? S_DAX : 0; if (!partno) { ret = -ENXIO; bdev->bd_part = disk_get_part(disk, partno); diff --git a/fs/dax.c b/fs/dax.c index 159f796afa85..37a0c4826c1a 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -209,7 +209,8 @@ ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode, } /* Protects against truncate */ - inode_dio_begin(inode); + if (!(flags & DIO_SKIP_DIO_COUNT)) + inode_dio_begin(inode); retval = dax_io(inode, iter, pos, end, get_block, &bh); @@ -219,7 +220,8 @@ ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode, if ((retval > 0) && end_io) end_io(iocb, pos, retval, bh.b_private); - inode_dio_end(inode); + if (!(flags & DIO_SKIP_DIO_COUNT)) + inode_dio_end(inode); out: return retval; } -- cgit From 43c3dd08da890e458f670b4fc0630513fb405620 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 3 Jul 2015 10:40:43 -0400 Subject: dax: bdev_direct_access() may sleep The brd driver is the only in-tree driver that may sleep currently. After some discussion on linux-fsdevel, we decided that any driver may choose to sleep in its ->direct_access method. To ensure that all callers of bdev_direct_access() are prepared for this, add a call to might_sleep(). Signed-off-by: Matthew Wilcox Signed-off-by: Al Viro --- fs/block_dev.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/block_dev.c b/fs/block_dev.c index 5dde6dff4940..12b22ddb22ef 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -445,6 +445,12 @@ long bdev_direct_access(struct block_device *bdev, sector_t sector, long avail; const struct block_device_operations *ops = bdev->bd_disk->fops; + /* + * The device driver is allowed to sleep, in order to make the + * memory directly accessible. + */ + might_sleep(); + if (size < 0) return size; if (!ops->direct_access) -- cgit From a84b69cb6e0a41e86bc593904faa6def3b957343 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 4 Jul 2015 16:04:19 -0400 Subject: 9p: forgetting to cancel request on interrupted zero-copy RPC If we'd already sent a request and decide to abort it, we *must* issue TFLUSH properly and not just blindly reuse the tag, or we'll get seriously screwed when response eventually arrives and we confuse it for response to later request that had reused the same tag. Cc: stable@vger.kernel.org # v3.2 and later Signed-off-by: Al Viro --- net/9p/client.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/9p/client.c b/net/9p/client.c index 6f4c4c88db84..28f36e4556f9 100644 --- a/net/9p/client.c +++ b/net/9p/client.c @@ -843,7 +843,8 @@ static struct p9_req_t *p9_client_zc_rpc(struct p9_client *c, int8_t type, if (err < 0) { if (err == -EIO) c->status = Disconnected; - goto reterr; + if (err != -ERESTARTSYS) + goto reterr; } if (req->status == REQ_STATUS_ERROR) { p9_debug(P9_DEBUG_ERROR, "req_status error %d\n", req->t_err); -- cgit From 67e808fbb0404a12d9b9830a44bbb48d447d8bc9 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 4 Jul 2015 16:11:05 -0400 Subject: p9_client_write(): avoid double p9_free_req() Braino in "9p: switch p9_client_write() to passing it struct iov_iter *"; if response is impossible to parse and we discard the request, get the out of the loop right there. Cc: stable@vger.kernel.org Signed-off-by: Al Viro --- net/9p/client.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/9p/client.c b/net/9p/client.c index 28f36e4556f9..81925b923318 100644 --- a/net/9p/client.c +++ b/net/9p/client.c @@ -1648,6 +1648,7 @@ p9_client_write(struct p9_fid *fid, u64 offset, struct iov_iter *from, int *err) if (*err) { trace_9p_protocol_dump(clnt, req->rc); p9_free_req(clnt, req); + break; } p9_debug(P9_DEBUG_9P, "<<< RWRITE count %d\n", count); -- cgit From 0f1db7dee200127da4c07928189748918c312031 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 4 Jul 2015 16:17:39 -0400 Subject: 9p: cope with bogus responses from server in p9_client_{read,write} if server claims to have written/read more than we'd told it to, warn and cap the claimed byte count to avoid advancing more than we are ready to. --- net/9p/client.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/net/9p/client.c b/net/9p/client.c index 81925b923318..498454b3c06c 100644 --- a/net/9p/client.c +++ b/net/9p/client.c @@ -1583,6 +1583,10 @@ p9_client_read(struct p9_fid *fid, u64 offset, struct iov_iter *to, int *err) p9_free_req(clnt, req); break; } + if (rsize < count) { + pr_err("bogus RREAD count (%d > %d)\n", count, rsize); + count = rsize; + } p9_debug(P9_DEBUG_9P, "<<< RREAD count %d\n", count); if (!count) { @@ -1650,6 +1654,10 @@ p9_client_write(struct p9_fid *fid, u64 offset, struct iov_iter *from, int *err) p9_free_req(clnt, req); break; } + if (rsize < count) { + pr_err("bogus RWRITE count (%d > %d)\n", count, rsize); + count = rsize; + } p9_debug(P9_DEBUG_9P, "<<< RWRITE count %d\n", count); -- cgit