diff options
Diffstat (limited to 'fs/nfs/pnfs.c')
| -rw-r--r-- | fs/nfs/pnfs.c | 1003 |
1 files changed, 725 insertions, 278 deletions
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 53726da5c010..f157d43d1312 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -61,6 +61,7 @@ static void pnfs_free_returned_lsegs(struct pnfs_layout_hdr *lo, u32 seq); static bool pnfs_lseg_dec_and_remove_zero(struct pnfs_layout_segment *lseg, struct list_head *tmp_list); +static int pnfs_layout_return_on_reboot(struct pnfs_layout_hdr *lo); /* Return the registered pnfs layout driver module matching given id */ static struct pnfs_layoutdriver_type * @@ -92,6 +93,17 @@ find_pnfs_driver(u32 id) return local; } +const struct pnfs_layoutdriver_type *pnfs_find_layoutdriver(u32 id) +{ + return find_pnfs_driver(id); +} + +void pnfs_put_layoutdriver(const struct pnfs_layoutdriver_type *ld) +{ + if (ld) + module_put(ld->owner); +} + void unset_pnfs_layoutdriver(struct nfs_server *nfss) { @@ -268,11 +280,11 @@ pnfs_free_layout_hdr(struct pnfs_layout_hdr *lo) struct nfs_server *server = NFS_SERVER(lo->plh_inode); struct pnfs_layoutdriver_type *ld = server->pnfs_curr_ld; - if (!list_empty(&lo->plh_layouts)) { + if (test_and_clear_bit(NFS_LAYOUT_HASHED, &lo->plh_flags)) { struct nfs_client *clp = server->nfs_client; spin_lock(&clp->cl_lock); - list_del_init(&lo->plh_layouts); + list_del_rcu(&lo->plh_layouts); spin_unlock(&clp->cl_lock); } put_cred(lo->plh_lc_cred); @@ -304,11 +316,39 @@ pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo) if (!list_empty(&lo->plh_segs)) WARN_ONCE(1, "NFS: BUG unfreed layout segments.\n"); pnfs_detach_layout_hdr(lo); + /* Notify pnfs_destroy_layout_final() that we're done */ + if (inode_state_read(inode) & (I_FREEING | I_CLEAR)) + wake_up_var_locked(lo, &inode->i_lock); spin_unlock(&inode->i_lock); pnfs_free_layout_hdr(lo); } } +static struct inode * +pnfs_grab_inode_layout_hdr(struct pnfs_layout_hdr *lo) +{ + struct inode *inode = igrab(lo->plh_inode); + if (inode) + return inode; + set_bit(NFS_LAYOUT_INODE_FREEING, &lo->plh_flags); + return NULL; +} + +/* + * Compare 2 layout stateid sequence ids, to see which is newer, + * taking into account wraparound issues. + */ +static bool pnfs_seqid_is_newer(u32 s1, u32 s2) +{ + return (s32)(s1 - s2) > 0; +} + +static void pnfs_barrier_update(struct pnfs_layout_hdr *lo, u32 newseq) +{ + if (pnfs_seqid_is_newer(newseq, lo->plh_barrier) || !lo->plh_barrier) + lo->plh_barrier = newseq; +} + static void pnfs_set_plh_return_info(struct pnfs_layout_hdr *lo, enum pnfs_iomode iomode, u32 seq) @@ -317,10 +357,15 @@ pnfs_set_plh_return_info(struct pnfs_layout_hdr *lo, enum pnfs_iomode iomode, iomode = IOMODE_ANY; lo->plh_return_iomode = iomode; set_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags); - if (seq != 0) { - WARN_ON_ONCE(lo->plh_return_seq != 0 && lo->plh_return_seq != seq); + /* + * We must set lo->plh_return_seq to avoid livelocks with + * pnfs_layout_need_return() + */ + if (seq == 0) + seq = be32_to_cpu(lo->plh_stateid.seqid); + if (!lo->plh_return_seq || pnfs_seqid_is_newer(seq, lo->plh_return_seq)) lo->plh_return_seq = seq; - } + pnfs_barrier_update(lo, seq); } static void @@ -359,9 +404,10 @@ pnfs_clear_lseg_state(struct pnfs_layout_segment *lseg, } /* - * Update the seqid of a layout stateid + * Update the seqid of a layout stateid after receiving + * NFS4ERR_OLD_STATEID */ -bool nfs4_layoutreturn_refresh_stateid(nfs4_stateid *dst, +bool nfs4_layout_refresh_old_stateid(nfs4_stateid *dst, struct pnfs_layout_range *dst_range, struct inode *inode) { @@ -377,7 +423,15 @@ bool nfs4_layoutreturn_refresh_stateid(nfs4_stateid *dst, spin_lock(&inode->i_lock); lo = NFS_I(inode)->layout; - if (lo && nfs4_stateid_match_other(dst, &lo->plh_stateid)) { + if (lo && pnfs_layout_is_valid(lo) && + nfs4_stateid_match_other(dst, &lo->plh_stateid)) { + /* Is our call using the most recent seqid? If so, bump it */ + if (!nfs4_stateid_is_newer(&lo->plh_stateid, dst)) { + nfs4_stateid_seqid_inc(dst); + ret = true; + goto out; + } + /* Try to update the seqid to the most recent */ err = pnfs_mark_matching_lsegs_return(lo, &head, &range, 0); if (err != -EBUSY) { dst->seqid = lo->plh_stateid.seqid; @@ -385,6 +439,7 @@ bool nfs4_layoutreturn_refresh_stateid(nfs4_stateid *dst, ret = true; } } +out: spin_unlock(&inode->i_lock); pnfs_free_lseg_list(&head); return ret; @@ -413,12 +468,25 @@ pnfs_mark_layout_stateid_invalid(struct pnfs_layout_hdr *lo, pnfs_clear_lseg_state(lseg, lseg_list); pnfs_clear_layoutreturn_info(lo); pnfs_free_returned_lsegs(lo, lseg_list, &range, 0); + set_bit(NFS_LAYOUT_DRAIN, &lo->plh_flags); if (test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags) && !test_and_set_bit(NFS_LAYOUT_RETURN_LOCK, &lo->plh_flags)) pnfs_clear_layoutreturn_waitbit(lo); return !list_empty(&lo->plh_segs); } +static int pnfs_mark_layout_stateid_return(struct pnfs_layout_hdr *lo, + struct list_head *lseg_list, + enum pnfs_iomode iomode, u32 seq) +{ + struct pnfs_layout_range range = { + .iomode = iomode, + .length = NFS4_MAX_UINT64, + }; + + return pnfs_mark_matching_lsegs_return(lo, lseg_list, &range, seq); +} + static int pnfs_iomode_to_fail_bit(u32 iomode) { @@ -454,7 +522,7 @@ pnfs_layout_io_set_failed(struct pnfs_layout_hdr *lo, u32 iomode) spin_lock(&inode->i_lock); pnfs_layout_set_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode)); - pnfs_mark_matching_lsegs_invalid(lo, &head, &range, 0); + pnfs_mark_matching_lsegs_return(lo, &head, &range, 0); spin_unlock(&inode->i_lock); pnfs_free_lseg_list(&head); dprintk("%s Setting layout IOMODE_%s fail bit\n", __func__, @@ -486,6 +554,7 @@ pnfs_init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg, { INIT_LIST_HEAD(&lseg->pls_list); INIT_LIST_HEAD(&lseg->pls_lc_list); + INIT_LIST_HEAD(&lseg->pls_commits); refcount_set(&lseg->pls_refcount, 1); set_bit(NFS_LSEG_VALID, &lseg->pls_flags); lseg->pls_layout = lo; @@ -550,10 +619,6 @@ pnfs_put_lseg(struct pnfs_layout_segment *lseg) inode = lo->plh_inode; if (refcount_dec_and_lock(&lseg->pls_refcount, &inode->i_lock)) { - if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags)) { - spin_unlock(&inode->i_lock); - return; - } pnfs_get_layout_hdr(lo); pnfs_layout_remove_lseg(lo, lseg); if (pnfs_cache_lseg_for_layoutreturn(lo, lseg)) @@ -613,15 +678,6 @@ static int mark_lseg_invalid(struct pnfs_layout_segment *lseg, return rv; } -/* - * Compare 2 layout stateid sequence ids, to see which is newer, - * taking into account wraparound issues. - */ -static bool pnfs_seqid_is_newer(u32 s1, u32 s2) -{ - return (s32)(s1 - s2) > 0; -} - static bool pnfs_should_free_range(const struct pnfs_layout_range *lseg_range, const struct pnfs_layout_range *recall_range) @@ -665,6 +721,7 @@ pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo, u32 seq) { struct pnfs_layout_segment *lseg, *next; + struct nfs_server *server = NFS_SERVER(lo->plh_inode); int remaining = 0; dprintk("%s:Begin lo %p\n", __func__, lo); @@ -677,13 +734,23 @@ pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo, "offset %llu length %llu\n", __func__, lseg, lseg->pls_range.iomode, lseg->pls_seq, lseg->pls_range.offset, lseg->pls_range.length); - if (!mark_lseg_invalid(lseg, tmp_list)) - remaining++; + if (mark_lseg_invalid(lseg, tmp_list)) + continue; + remaining++; + pnfs_lseg_cancel_io(server, lseg); } dprintk("%s:Return %i\n", __func__, remaining); return remaining; } +static void pnfs_reset_return_info(struct pnfs_layout_hdr *lo) +{ + struct pnfs_layout_segment *lseg; + + list_for_each_entry(lseg, &lo->plh_return_segs, pls_list) + pnfs_set_plh_return_info(lo, lseg->pls_range.iomode, 0); +} + static void pnfs_free_returned_lsegs(struct pnfs_layout_hdr *lo, struct list_head *free_me, @@ -713,8 +780,7 @@ pnfs_free_lseg_list(struct list_head *free_me) } } -void -pnfs_destroy_layout(struct nfs_inode *nfsi) +static struct pnfs_layout_hdr *__pnfs_destroy_layout(struct nfs_inode *nfsi) { struct pnfs_layout_hdr *lo; LIST_HEAD(tmp_list); @@ -732,9 +798,28 @@ pnfs_destroy_layout(struct nfs_inode *nfsi) pnfs_put_layout_hdr(lo); } else spin_unlock(&nfsi->vfs_inode.i_lock); + return lo; +} + +void pnfs_destroy_layout(struct nfs_inode *nfsi) +{ + __pnfs_destroy_layout(nfsi); } EXPORT_SYMBOL_GPL(pnfs_destroy_layout); +void pnfs_destroy_layout_final(struct nfs_inode *nfsi) +{ + struct pnfs_layout_hdr *lo = __pnfs_destroy_layout(nfsi); + struct inode *inode = &nfsi->vfs_inode; + + if (lo) { + spin_lock(&inode->i_lock); + wait_var_event_spinlock(lo, nfsi->layout != lo, + &inode->i_lock); + spin_unlock(&inode->i_lock); + } +} + static bool pnfs_layout_add_bulk_destroy_list(struct inode *inode, struct list_head *layout_list) @@ -758,22 +843,33 @@ static int pnfs_layout_bulk_destroy_byserver_locked(struct nfs_client *clp, struct nfs_server *server, struct list_head *layout_list) + __must_hold(&clp->cl_lock) + __must_hold(RCU) { struct pnfs_layout_hdr *lo, *next; struct inode *inode; list_for_each_entry_safe(lo, next, &server->layouts, plh_layouts) { - if (test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags)) - continue; - inode = igrab(lo->plh_inode); - if (inode == NULL) + if (test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags) || + test_bit(NFS_LAYOUT_INODE_FREEING, &lo->plh_flags) || + !list_empty(&lo->plh_bulk_destroy)) continue; - list_del_init(&lo->plh_layouts); - if (pnfs_layout_add_bulk_destroy_list(inode, layout_list)) - continue; - rcu_read_unlock(); - spin_unlock(&clp->cl_lock); - iput(inode); + /* If the sb is being destroyed, just bail */ + if (!nfs_sb_active(server->super)) + break; + inode = pnfs_grab_inode_layout_hdr(lo); + if (inode != NULL) { + if (pnfs_layout_add_bulk_destroy_list(inode, + layout_list)) + continue; + rcu_read_unlock(); + spin_unlock(&clp->cl_lock); + iput(inode); + } else { + rcu_read_unlock(); + spin_unlock(&clp->cl_lock); + } + nfs_sb_deactive(server->super); spin_lock(&clp->cl_lock); rcu_read_lock(); return -EAGAIN; @@ -783,7 +879,7 @@ pnfs_layout_bulk_destroy_byserver_locked(struct nfs_client *clp, static int pnfs_layout_free_bulk_destroy_list(struct list_head *layout_list, - bool is_bulk_recall) + enum pnfs_layout_destroy_mode mode) { struct pnfs_layout_hdr *lo; struct inode *inode; @@ -801,8 +897,11 @@ pnfs_layout_free_bulk_destroy_list(struct list_head *layout_list, spin_lock(&inode->i_lock); list_del_init(&lo->plh_bulk_destroy); - if (pnfs_mark_layout_stateid_invalid(lo, &lseg_list)) { - if (is_bulk_recall) + if (mode == PNFS_LAYOUT_FILE_BULK_RETURN) { + pnfs_mark_layout_stateid_return(lo, &lseg_list, + IOMODE_ANY, 0); + } else if (pnfs_mark_layout_stateid_invalid(lo, &lseg_list)) { + if (mode == PNFS_LAYOUT_BULK_RETURN) set_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags); ret = -EAGAIN; } @@ -811,15 +910,13 @@ pnfs_layout_free_bulk_destroy_list(struct list_head *layout_list, /* Free all lsegs that are attached to commit buckets */ nfs_commit_inode(inode, 0); pnfs_put_layout_hdr(lo); - iput(inode); + nfs_iput_and_deactive(inode); } return ret; } -int -pnfs_destroy_layouts_byfsid(struct nfs_client *clp, - struct nfs_fsid *fsid, - bool is_recall) +int pnfs_layout_destroy_byfsid(struct nfs_client *clp, struct nfs_fsid *fsid, + enum pnfs_layout_destroy_mode mode) { struct nfs_server *server; LIST_HEAD(layout_list); @@ -838,37 +935,44 @@ restart: rcu_read_unlock(); spin_unlock(&clp->cl_lock); - if (list_empty(&layout_list)) - return 0; - return pnfs_layout_free_bulk_destroy_list(&layout_list, is_recall); + return pnfs_layout_free_bulk_destroy_list(&layout_list, mode); } -int -pnfs_destroy_layouts_byclid(struct nfs_client *clp, - bool is_recall) +static void pnfs_layout_build_destroy_list_byclient(struct nfs_client *clp, + struct list_head *list) { struct nfs_server *server; - LIST_HEAD(layout_list); spin_lock(&clp->cl_lock); rcu_read_lock(); restart: list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { - if (pnfs_layout_bulk_destroy_byserver_locked(clp, - server, - &layout_list) != 0) + if (pnfs_layout_bulk_destroy_byserver_locked(clp, server, + list) != 0) goto restart; } rcu_read_unlock(); spin_unlock(&clp->cl_lock); +} - if (list_empty(&layout_list)) - return 0; - return pnfs_layout_free_bulk_destroy_list(&layout_list, is_recall); +static int pnfs_layout_do_destroy_byclid(struct nfs_client *clp, + struct list_head *list, + enum pnfs_layout_destroy_mode mode) +{ + pnfs_layout_build_destroy_list_byclient(clp, list); + return pnfs_layout_free_bulk_destroy_list(list, mode); +} + +int pnfs_layout_destroy_byclid(struct nfs_client *clp, + enum pnfs_layout_destroy_mode mode) +{ + LIST_HEAD(layout_list); + + return pnfs_layout_do_destroy_byclid(clp, &layout_list, mode); } /* - * Called by the state manger to remove all layouts established under an + * Called by the state manager to remove all layouts established under an * expired lease. */ void @@ -877,40 +981,112 @@ pnfs_destroy_all_layouts(struct nfs_client *clp) nfs4_deviceid_mark_client_invalid(clp); nfs4_deviceid_purge_client(clp); - pnfs_destroy_layouts_byclid(clp, false); + pnfs_layout_destroy_byclid(clp, PNFS_LAYOUT_INVALIDATE); +} + +static void pnfs_layout_build_recover_list_byclient(struct nfs_client *clp, + struct list_head *list) +{ + struct nfs_server *server; + + spin_lock(&clp->cl_lock); + rcu_read_lock(); +restart: + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { + if (!(server->caps & NFS_CAP_REBOOT_LAYOUTRETURN)) + continue; + if (pnfs_layout_bulk_destroy_byserver_locked(clp, server, + list) != 0) + goto restart; + } + rcu_read_unlock(); + spin_unlock(&clp->cl_lock); +} + +static int pnfs_layout_bulk_list_reboot(struct list_head *list) +{ + struct pnfs_layout_hdr *lo; + struct nfs_server *server; + int ret; + + list_for_each_entry(lo, list, plh_bulk_destroy) { + server = NFS_SERVER(lo->plh_inode); + ret = pnfs_layout_return_on_reboot(lo); + switch (ret) { + case 0: + continue; + case -NFS4ERR_BAD_STATEID: + server->caps &= ~NFS_CAP_REBOOT_LAYOUTRETURN; + break; + case -NFS4ERR_NO_GRACE: + break; + default: + goto err; + } + break; + } + return 0; +err: + return ret; +} + +int pnfs_layout_handle_reboot(struct nfs_client *clp) +{ + LIST_HEAD(list); + int ret = 0, ret2; + + pnfs_layout_build_recover_list_byclient(clp, &list); + if (!list_empty(&list)) + ret = pnfs_layout_bulk_list_reboot(&list); + ret2 = pnfs_layout_do_destroy_byclid(clp, &list, + PNFS_LAYOUT_INVALIDATE); + if (!ret) + ret = ret2; + return (ret == 0) ? 0 : -EAGAIN; +} + +static void +pnfs_set_layout_cred(struct pnfs_layout_hdr *lo, const struct cred *cred) +{ + const struct cred *old; + + if (cred && cred_fscmp(lo->plh_lc_cred, cred) != 0) { + old = xchg(&lo->plh_lc_cred, get_cred(cred)); + put_cred(old); + } } /* update lo->plh_stateid with new if is more recent */ void pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new, - bool update_barrier) + const struct cred *cred, bool update_barrier) { - u32 oldseq, newseq, new_barrier = 0; - - oldseq = be32_to_cpu(lo->plh_stateid.seqid); - newseq = be32_to_cpu(new->seqid); + u32 oldseq = be32_to_cpu(lo->plh_stateid.seqid); + u32 newseq = be32_to_cpu(new->seqid); if (!pnfs_layout_is_valid(lo)) { + pnfs_set_layout_cred(lo, cred); nfs4_stateid_copy(&lo->plh_stateid, new); lo->plh_barrier = newseq; pnfs_clear_layoutreturn_info(lo); clear_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags); return; } - if (pnfs_seqid_is_newer(newseq, oldseq)) { + + if (pnfs_seqid_is_newer(newseq, oldseq)) nfs4_stateid_copy(&lo->plh_stateid, new); - /* - * Because of wraparound, we want to keep the barrier - * "close" to the current seqids. - */ - new_barrier = newseq - atomic_read(&lo->plh_outstanding); - } - if (update_barrier) - new_barrier = be32_to_cpu(new->seqid); - else if (new_barrier == 0) + + if (update_barrier) { + pnfs_barrier_update(lo, newseq); return; - if (pnfs_seqid_is_newer(new_barrier, lo->plh_barrier)) - lo->plh_barrier = new_barrier; + } + /* + * Because of wraparound, we want to keep the barrier + * "close" to the current seqids. We really only want to + * get here from a layoutget call. + */ + if (atomic_read(&lo->plh_outstanding) == 1) + pnfs_barrier_update(lo, be32_to_cpu(lo->plh_stateid.seqid)); } static bool @@ -919,7 +1095,7 @@ pnfs_layout_stateid_blocked(const struct pnfs_layout_hdr *lo, { u32 seqid = be32_to_cpu(stateid->seqid); - return !pnfs_seqid_is_newer(seqid, lo->plh_barrier); + return lo->plh_barrier && pnfs_seqid_is_newer(lo->plh_barrier, seqid); } /* lget is set to 1 if called from inside send_layoutget call chain */ @@ -1038,7 +1214,7 @@ pnfs_alloc_init_layoutget_args(struct inode *ino, lgp->args.ctx = get_nfs_open_context(ctx); nfs4_stateid_copy(&lgp->args.stateid, stateid); lgp->gfp_flags = gfp_flags; - lgp->cred = get_cred(ctx->cred); + lgp->cred = ctx->cred; return lgp; } @@ -1047,9 +1223,7 @@ void pnfs_layoutget_free(struct nfs4_layoutget *lgp) size_t max_pages = lgp->args.layout.pglen / PAGE_SIZE; nfs4_free_pages(lgp->args.layout.pages, max_pages); - if (lgp->args.inode) - pnfs_put_layout_hdr(NFS_I(lgp->args.inode)->layout); - put_cred(lgp->cred); + pnfs_put_layout_hdr(lgp->lo); put_nfs_open_context(lgp->args.ctx); kfree(lgp); } @@ -1069,6 +1243,33 @@ static void pnfs_clear_layoutcommit(struct inode *inode, } } +static void +pnfs_layoutreturn_retry_later_locked(struct pnfs_layout_hdr *lo, + const nfs4_stateid *arg_stateid, + const struct pnfs_layout_range *range, + struct list_head *freeme) +{ + if (pnfs_layout_is_valid(lo) && + nfs4_stateid_match_other(&lo->plh_stateid, arg_stateid)) + pnfs_reset_return_info(lo); + else + pnfs_mark_layout_stateid_invalid(lo, freeme); + pnfs_clear_layoutreturn_waitbit(lo); +} + +void pnfs_layoutreturn_retry_later(struct pnfs_layout_hdr *lo, + const nfs4_stateid *arg_stateid, + const struct pnfs_layout_range *range) +{ + struct inode *inode = lo->plh_inode; + LIST_HEAD(freeme); + + spin_lock(&inode->i_lock); + pnfs_layoutreturn_retry_later_locked(lo, arg_stateid, range, &freeme); + spin_unlock(&inode->i_lock); + pnfs_free_lseg_list(&freeme); +} + void pnfs_layoutreturn_free_lsegs(struct pnfs_layout_hdr *lo, const nfs4_stateid *arg_stateid, const struct pnfs_layout_range *range, @@ -1078,15 +1279,15 @@ void pnfs_layoutreturn_free_lsegs(struct pnfs_layout_hdr *lo, LIST_HEAD(freeme); spin_lock(&inode->i_lock); - if (!pnfs_layout_is_valid(lo) || !arg_stateid || - !nfs4_stateid_match_other(&lo->plh_stateid, arg_stateid)) + if (!nfs4_stateid_match_other(&lo->plh_stateid, arg_stateid)) goto out_unlock; - if (stateid) { + if (stateid && pnfs_layout_is_valid(lo)) { u32 seq = be32_to_cpu(arg_stateid->seqid); pnfs_mark_matching_lsegs_invalid(lo, &freeme, range, seq); pnfs_free_returned_lsegs(lo, &freeme, range, seq); - pnfs_set_layout_stateid(lo, stateid, true); + pnfs_set_layout_stateid(lo, stateid, NULL, true); + pnfs_reset_return_info(lo); } else pnfs_mark_layout_stateid_invalid(lo, &freeme); out_unlock: @@ -1099,30 +1300,27 @@ out_unlock: static bool pnfs_prepare_layoutreturn(struct pnfs_layout_hdr *lo, nfs4_stateid *stateid, + const struct cred **cred, enum pnfs_iomode *iomode) { /* Serialise LAYOUTGET/LAYOUTRETURN */ - if (atomic_read(&lo->plh_outstanding) != 0) + if (atomic_read(&lo->plh_outstanding) != 0 && lo->plh_return_seq == 0) return false; if (test_and_set_bit(NFS_LAYOUT_RETURN_LOCK, &lo->plh_flags)) return false; set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags); pnfs_get_layout_hdr(lo); + nfs4_stateid_copy(stateid, &lo->plh_stateid); + *cred = get_cred(lo->plh_lc_cred); if (test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags)) { - if (stateid != NULL) { - nfs4_stateid_copy(stateid, &lo->plh_stateid); - if (lo->plh_return_seq != 0) - stateid->seqid = cpu_to_be32(lo->plh_return_seq); - } + if (lo->plh_return_seq != 0) + stateid->seqid = cpu_to_be32(lo->plh_return_seq); if (iomode != NULL) *iomode = lo->plh_return_iomode; pnfs_clear_layoutreturn_info(lo); - return true; - } - if (stateid != NULL) - nfs4_stateid_copy(stateid, &lo->plh_stateid); - if (iomode != NULL) + } else if (iomode != NULL) *iomode = IOMODE_ANY; + pnfs_barrier_update(lo, be32_to_cpu(stateid->seqid)); return true; } @@ -1144,20 +1342,26 @@ pnfs_init_layoutreturn_args(struct nfs4_layoutreturn_args *args, } static int -pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo, const nfs4_stateid *stateid, - enum pnfs_iomode iomode, bool sync) +pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo, + const nfs4_stateid *stateid, + const struct cred **pcred, + enum pnfs_iomode iomode, + unsigned int flags) { struct inode *ino = lo->plh_inode; struct pnfs_layoutdriver_type *ld = NFS_SERVER(ino)->pnfs_curr_ld; struct nfs4_layoutreturn *lrp; + const struct cred *cred = *pcred; int status = 0; - lrp = kzalloc(sizeof(*lrp), GFP_NOFS); + *pcred = NULL; + lrp = kzalloc(sizeof(*lrp), nfs_io_gfp_mask()); if (unlikely(lrp == NULL)) { status = -ENOMEM; spin_lock(&ino->i_lock); pnfs_clear_layoutreturn_waitbit(lo); spin_unlock(&ino->i_lock); + put_cred(cred); pnfs_put_layout_hdr(lo); goto out; } @@ -1165,11 +1369,11 @@ pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo, const nfs4_stateid *stateid, pnfs_init_layoutreturn_args(&lrp->args, lo, stateid, iomode); lrp->args.ld_private = &lrp->ld_private; lrp->clp = NFS_SERVER(ino)->nfs_client; - lrp->cred = lo->plh_lc_cred; + lrp->cred = cred; if (ld->prepare_layoutreturn) ld->prepare_layoutreturn(&lrp->args); - status = nfs4_proc_layoutreturn(lrp, sync); + status = nfs4_proc_layoutreturn(lrp, flags); out: dprintk("<-- %s status: %d\n", __func__, status); return status; @@ -1179,27 +1383,11 @@ out: static bool pnfs_layout_need_return(struct pnfs_layout_hdr *lo) { - struct pnfs_layout_segment *s; - enum pnfs_iomode iomode; - u32 seq; - if (!test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags)) return false; - - seq = lo->plh_return_seq; - iomode = lo->plh_return_iomode; - - /* Defer layoutreturn until all recalled lsegs are done */ - list_for_each_entry(s, &lo->plh_segs, pls_list) { - if (seq && pnfs_seqid_is_newer(s->pls_seq, seq)) - continue; - if (iomode != IOMODE_ANY && s->pls_range.iomode != iomode) - continue; - if (test_bit(NFS_LSEG_LAYOUTRETURN, &s->pls_flags)) - return false; - } - - return true; + return pnfs_mark_layout_stateid_return(lo, &lo->plh_return_segs, + lo->plh_return_iomode, + lo->plh_return_seq) != EBUSY; } static void pnfs_layoutreturn_before_put_layout_hdr(struct pnfs_layout_hdr *lo) @@ -1210,15 +1398,17 @@ static void pnfs_layoutreturn_before_put_layout_hdr(struct pnfs_layout_hdr *lo) return; spin_lock(&inode->i_lock); if (pnfs_layout_need_return(lo)) { + const struct cred *cred; nfs4_stateid stateid; enum pnfs_iomode iomode; bool send; - send = pnfs_prepare_layoutreturn(lo, &stateid, &iomode); + send = pnfs_prepare_layoutreturn(lo, &stateid, &cred, &iomode); spin_unlock(&inode->i_lock); if (send) { /* Send an async layoutreturn so we dont deadlock */ - pnfs_send_layoutreturn(lo, &stateid, iomode, false); + pnfs_send_layoutreturn(lo, &stateid, &cred, iomode, + PNFS_FL_LAYOUTRETURN_ASYNC); } } else spin_unlock(&inode->i_lock); @@ -1237,7 +1427,13 @@ _pnfs_return_layout(struct inode *ino) { struct pnfs_layout_hdr *lo = NULL; struct nfs_inode *nfsi = NFS_I(ino); + struct pnfs_layout_range range = { + .iomode = IOMODE_ANY, + .offset = 0, + .length = NFS4_MAX_UINT64, + }; LIST_HEAD(tmp_list); + const struct cred *cred; nfs4_stateid stateid; int status = 0; bool send, valid_layout; @@ -1263,29 +1459,26 @@ _pnfs_return_layout(struct inode *ino) } valid_layout = pnfs_layout_is_valid(lo); pnfs_clear_layoutcommit(ino, &tmp_list); - pnfs_mark_matching_lsegs_invalid(lo, &tmp_list, NULL, 0); + pnfs_mark_matching_lsegs_return(lo, &tmp_list, &range, 0); - if (NFS_SERVER(ino)->pnfs_curr_ld->return_range) { - struct pnfs_layout_range range = { - .iomode = IOMODE_ANY, - .offset = 0, - .length = NFS4_MAX_UINT64, - }; + if (NFS_SERVER(ino)->pnfs_curr_ld->return_range) NFS_SERVER(ino)->pnfs_curr_ld->return_range(lo, &range); - } /* Don't send a LAYOUTRETURN if list was initially empty */ if (!test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags) || !valid_layout) { spin_unlock(&ino->i_lock); dprintk("NFS: %s no layout segments to return\n", __func__); - goto out_put_layout_hdr; + goto out_wait_layoutreturn; } - send = pnfs_prepare_layoutreturn(lo, &stateid, NULL); + send = pnfs_prepare_layoutreturn(lo, &stateid, &cred, NULL); spin_unlock(&ino->i_lock); if (send) - status = pnfs_send_layoutreturn(lo, &stateid, IOMODE_ANY, true); + status = pnfs_send_layoutreturn(lo, &stateid, &cred, IOMODE_ANY, + 0); +out_wait_layoutreturn: + wait_on_bit(&lo->plh_flags, NFS_LAYOUT_RETURN, TASK_UNINTERRUPTIBLE); out_put_layout_hdr: pnfs_free_lseg_list(&tmp_list); pnfs_put_layout_hdr(lo); @@ -1321,6 +1514,24 @@ pnfs_commit_and_return_layout(struct inode *inode) return ret; } +static int pnfs_layout_return_on_reboot(struct pnfs_layout_hdr *lo) +{ + struct inode *inode = lo->plh_inode; + const struct cred *cred; + + spin_lock(&inode->i_lock); + if (!pnfs_layout_is_valid(lo)) { + spin_unlock(&inode->i_lock); + return 0; + } + cred = get_cred(lo->plh_lc_cred); + pnfs_get_layout_hdr(lo); + spin_unlock(&inode->i_lock); + + return pnfs_send_layoutreturn(lo, &zero_stateid, &cred, IOMODE_ANY, + PNFS_FL_LAYOUTRETURN_PRIVILEGED); +} + bool pnfs_roc(struct inode *ino, struct nfs4_layoutreturn_args *args, struct nfs4_layoutreturn_res *res, @@ -1331,6 +1542,7 @@ bool pnfs_roc(struct inode *ino, struct nfs4_state *state; struct pnfs_layout_hdr *lo; struct pnfs_layout_segment *lseg, *next; + const struct cred *lc_cred; nfs4_stateid stateid; enum pnfs_iomode iomode = 0; bool layoutreturn = false, roc = false; @@ -1400,15 +1612,16 @@ retry: * 2. we don't send layoutreturn */ /* lo ref dropped in pnfs_roc_release() */ - layoutreturn = pnfs_prepare_layoutreturn(lo, &stateid, &iomode); + layoutreturn = pnfs_prepare_layoutreturn(lo, &stateid, &lc_cred, &iomode); /* If the creds don't match, we can't compound the layoutreturn */ - if (!layoutreturn || cred != lo->plh_lc_cred) + if (!layoutreturn || cred_fscmp(cred, lc_cred) != 0) goto out_noroc; roc = layoutreturn; pnfs_init_layoutreturn_args(args, lo, &stateid, iomode); res->lrs_present = 0; layoutreturn = false; + put_cred(lc_cred); out_noroc: spin_unlock(&ino->i_lock); @@ -1422,31 +1635,99 @@ out_noroc: return true; } if (layoutreturn) - pnfs_send_layoutreturn(lo, &stateid, iomode, true); + pnfs_send_layoutreturn(lo, &stateid, &lc_cred, iomode, 0); pnfs_put_layout_hdr(lo); return false; } +int pnfs_roc_done(struct rpc_task *task, struct nfs4_layoutreturn_args **argpp, + struct nfs4_layoutreturn_res **respp, int *ret) +{ + struct nfs4_layoutreturn_args *arg = *argpp; + int retval = -EAGAIN; + + if (!arg) + return 0; + /* Handle Layoutreturn errors */ + switch (*ret) { + case 0: + retval = 0; + break; + case -NFS4ERR_NOMATCHING_LAYOUT: + /* Was there an RPC level error? If not, retry */ + if (task->tk_rpc_status == 0) + break; + /* + * Is there a fatal network level error? + * If so release the layout, but flag the error. + */ + if ((task->tk_rpc_status == -ENETDOWN || + task->tk_rpc_status == -ENETUNREACH) && + task->tk_flags & RPC_TASK_NETUNREACH_FATAL) { + *ret = 0; + (*respp)->lrs_present = 0; + retval = -EIO; + break; + } + /* If the call was not sent, let caller handle it */ + if (!RPC_WAS_SENT(task)) + return 0; + /* + * Otherwise, assume the call succeeded and + * that we need to release the layout + */ + *ret = 0; + (*respp)->lrs_present = 0; + retval = 0; + break; + case -NFS4ERR_DELAY: + /* Let the caller handle the retry */ + *ret = -NFS4ERR_NOMATCHING_LAYOUT; + return 0; + case -NFS4ERR_OLD_STATEID: + if (!nfs4_layout_refresh_old_stateid(&arg->stateid, + &arg->range, arg->inode)) + break; + *ret = -NFS4ERR_NOMATCHING_LAYOUT; + return -EAGAIN; + } + *argpp = NULL; + *respp = NULL; + return retval; +} + void pnfs_roc_release(struct nfs4_layoutreturn_args *args, - struct nfs4_layoutreturn_res *res, - int ret) + struct nfs4_layoutreturn_res *res, int ret) { struct pnfs_layout_hdr *lo = args->layout; - const nfs4_stateid *arg_stateid = NULL; + struct inode *inode = args->inode; const nfs4_stateid *res_stateid = NULL; struct nfs4_xdr_opaque_data *ld_private = args->ld_private; + LIST_HEAD(freeme); - if (ret == 0) { - arg_stateid = &args->stateid; + switch (ret) { + case -NFS4ERR_BADSESSION: + case -NFS4ERR_DEADSESSION: + case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: + case -NFS4ERR_NOMATCHING_LAYOUT: + spin_lock(&inode->i_lock); + pnfs_layoutreturn_retry_later_locked(lo, &args->stateid, + &args->range, &freeme); + spin_unlock(&inode->i_lock); + pnfs_free_lseg_list(&freeme); + break; + case 0: if (res->lrs_present) res_stateid = &res->stateid; + fallthrough; + default: + pnfs_layoutreturn_free_lsegs(lo, &args->stateid, &args->range, + res_stateid); } - pnfs_layoutreturn_free_lsegs(lo, arg_stateid, &args->range, - res_stateid); + trace_nfs4_layoutreturn_on_close(args->inode, &args->stateid, ret); if (ld_private && ld_private->ops && ld_private->ops->free) ld_private->ops->free(ld_private); pnfs_put_layout_hdr(lo); - trace_nfs4_layoutreturn_on_close(args->inode, 0); } bool pnfs_wait_on_layoutreturn(struct inode *ino, struct rpc_task *task) @@ -1759,7 +2040,7 @@ static int pnfs_prepare_to_retry_layoutget(struct pnfs_layout_hdr *lo) pnfs_layoutcommit_inode(lo->plh_inode, false); return wait_on_bit_action(&lo->plh_flags, NFS_LAYOUT_RETURN, nfs_wait_bit_killable, - TASK_KILLABLE); + TASK_KILLABLE|TASK_FREEZABLE_UNSAFE); } static void nfs_layoutget_begin(struct pnfs_layout_hdr *lo) @@ -1769,8 +2050,16 @@ static void nfs_layoutget_begin(struct pnfs_layout_hdr *lo) static void nfs_layoutget_end(struct pnfs_layout_hdr *lo) { - if (atomic_dec_and_test(&lo->plh_outstanding)) - wake_up_var(&lo->plh_outstanding); + if (atomic_dec_and_test(&lo->plh_outstanding) && + test_and_clear_bit(NFS_LAYOUT_DRAIN, &lo->plh_flags)) { + smp_mb__after_atomic(); + wake_up_bit(&lo->plh_flags, NFS_LAYOUT_DRAIN); + } +} + +static bool pnfs_is_first_layoutget(struct pnfs_layout_hdr *lo) +{ + return test_bit(NFS_LAYOUT_FIRST_LAYOUTGET, &lo->plh_flags); } static void pnfs_clear_first_layoutget(struct pnfs_layout_hdr *lo) @@ -1785,15 +2074,14 @@ static void pnfs_clear_first_layoutget(struct pnfs_layout_hdr *lo) static void _add_to_server_list(struct pnfs_layout_hdr *lo, struct nfs_server *server) { - if (list_empty(&lo->plh_layouts)) { + if (!test_and_set_bit(NFS_LAYOUT_HASHED, &lo->plh_flags)) { struct nfs_client *clp = server->nfs_client; /* The lo must be on the clp list if there is any * chance of a CB_LAYOUTRECALL(FILE) coming in. */ spin_lock(&clp->cl_lock); - if (list_empty(&lo->plh_layouts)) - list_add_tail(&lo->plh_layouts, &server->layouts); + list_add_tail_rcu(&lo->plh_layouts, &server->layouts); spin_unlock(&clp->cl_lock); } } @@ -1823,7 +2111,9 @@ pnfs_update_layout(struct inode *ino, struct pnfs_layout_segment *lseg = NULL; struct nfs4_layoutget *lgp; nfs4_stateid stateid; - long timeout = 0; + struct nfs4_exception exception = { + .inode = ino, + }; unsigned long giveup = jiffies + (clp->cl_lease_time << 1); bool first; @@ -1840,6 +2130,14 @@ pnfs_update_layout(struct inode *ino, } lookup_again: + if (!nfs4_valid_open_stateid(ctx->state)) { + trace_pnfs_update_layout(ino, pos, count, + iomode, lo, lseg, + PNFS_UPDATE_LAYOUT_INVALID_OPEN); + lseg = ERR_PTR(-EIO); + goto out; + } + lseg = ERR_PTR(nfs4_client_recover_expired_lease(clp)); if (IS_ERR(lseg)) goto out; @@ -1848,6 +2146,7 @@ lookup_again: lo = pnfs_find_alloc_layout(ino, ctx, gfp_flags); if (lo == NULL) { spin_unlock(&ino->i_lock); + lseg = ERR_PTR(-ENOMEM); trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg, PNFS_UPDATE_LAYOUT_NOMEM); goto out; @@ -1872,27 +2171,42 @@ lookup_again: * If the layout segment list is empty, but there are outstanding * layoutget calls, then they might be subject to a layoutrecall. */ - if (list_empty(&lo->plh_segs) && + if (test_bit(NFS_LAYOUT_DRAIN, &lo->plh_flags) && atomic_read(&lo->plh_outstanding) != 0) { spin_unlock(&ino->i_lock); - lseg = ERR_PTR(wait_var_event_killable(&lo->plh_outstanding, - atomic_read(&lo->plh_outstanding))); - if (IS_ERR(lseg) || !list_empty(&lo->plh_segs)) + lseg = ERR_PTR(wait_on_bit(&lo->plh_flags, NFS_LAYOUT_DRAIN, + TASK_KILLABLE)); + if (IS_ERR(lseg)) goto out_put_layout_hdr; pnfs_put_layout_hdr(lo); goto lookup_again; } - lseg = pnfs_find_lseg(lo, &arg, strict_iomode); - if (lseg) { + /* + * Because we free lsegs when sending LAYOUTRETURN, we need to wait + * for LAYOUTRETURN. + */ + if (test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags)) { + spin_unlock(&ino->i_lock); + dprintk("%s wait for layoutreturn\n", __func__); + lseg = ERR_PTR(pnfs_prepare_to_retry_layoutget(lo)); + if (!IS_ERR(lseg)) { + pnfs_put_layout_hdr(lo); + dprintk("%s retrying\n", __func__); + trace_pnfs_update_layout(ino, pos, count, iomode, lo, + lseg, + PNFS_UPDATE_LAYOUT_RETRY); + goto lookup_again; + } trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg, - PNFS_UPDATE_LAYOUT_FOUND_CACHED); - goto out_unlock; + PNFS_UPDATE_LAYOUT_RETURN); + goto out_put_layout_hdr; } - if (!nfs4_valid_open_stateid(ctx->state)) { + lseg = pnfs_find_lseg(lo, &arg, strict_iomode); + if (lseg) { trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg, - PNFS_UPDATE_LAYOUT_INVALID_OPEN); + PNFS_UPDATE_LAYOUT_FOUND_CACHED); goto out_unlock; } @@ -1902,6 +2216,7 @@ lookup_again: * stateid. */ if (test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags)) { + int status; /* * The first layoutget for the file. Need to serialize per @@ -1920,39 +2235,24 @@ lookup_again: goto lookup_again; } + spin_unlock(&ino->i_lock); first = true; - if (nfs4_select_rw_stateid(ctx->state, + status = nfs4_select_rw_stateid(ctx->state, iomode == IOMODE_RW ? FMODE_WRITE : FMODE_READ, - NULL, &stateid, NULL) != 0) { + NULL, &stateid, NULL); + if (status != 0) { + lseg = ERR_PTR(status); trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg, PNFS_UPDATE_LAYOUT_INVALID_OPEN); - goto out_unlock; - } - } else { - nfs4_stateid_copy(&stateid, &lo->plh_stateid); - } - - /* - * Because we free lsegs before sending LAYOUTRETURN, we need to wait - * for LAYOUTRETURN even if first is true. - */ - if (test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags)) { - spin_unlock(&ino->i_lock); - dprintk("%s wait for layoutreturn\n", __func__); - lseg = ERR_PTR(pnfs_prepare_to_retry_layoutget(lo)); - if (!IS_ERR(lseg)) { - if (first) - pnfs_clear_first_layoutget(lo); + nfs4_schedule_stateid_recovery(server, ctx->state); + pnfs_clear_first_layoutget(lo); pnfs_put_layout_hdr(lo); - dprintk("%s retrying\n", __func__); - trace_pnfs_update_layout(ino, pos, count, iomode, lo, - lseg, PNFS_UPDATE_LAYOUT_RETRY); goto lookup_again; } - trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg, - PNFS_UPDATE_LAYOUT_RETURN); - goto out_put_layout_hdr; + spin_lock(&ino->i_lock); + } else { + nfs4_stateid_copy(&stateid, &lo->plh_stateid); } if (pnfs_layoutgets_blocked(lo)) { @@ -1975,13 +2275,17 @@ lookup_again: lgp = pnfs_alloc_init_layoutget_args(ino, ctx, &stateid, &arg, gfp_flags); if (!lgp) { + lseg = ERR_PTR(-ENOMEM); trace_pnfs_update_layout(ino, pos, count, iomode, lo, NULL, PNFS_UPDATE_LAYOUT_NOMEM); nfs_layoutget_end(lo); goto out_put_layout_hdr; } - lseg = nfs4_proc_layoutget(lgp, &timeout); + lgp->lo = lo; + pnfs_get_layout_hdr(lo); + + lseg = nfs4_proc_layoutget(lgp, &exception); trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg, PNFS_UPDATE_LAYOUT_SEND_LAYOUTGET); nfs_layoutget_end(lo); @@ -1994,6 +2298,12 @@ lookup_again: case -ERECALLCONFLICT: case -EAGAIN: break; + case -ENODATA: + /* The server returned NFS4ERR_LAYOUTUNAVAILABLE */ + pnfs_layout_set_fail_bit( + lo, pnfs_iomode_to_fail_bit(iomode)); + lseg = NULL; + goto out_put_layout_hdr; default: if (!nfs_error_is_fatal(PTR_ERR(lseg))) { pnfs_layout_clear_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode)); @@ -2002,6 +2312,8 @@ lookup_again: goto out_put_layout_hdr; } if (lseg) { + if (!exception.retry) + goto out_put_layout_hdr; if (first) pnfs_clear_first_layoutget(lo); trace_pnfs_update_layout(ino, pos, count, @@ -2016,6 +2328,8 @@ lookup_again: out_put_layout_hdr: if (first) pnfs_clear_first_layoutget(lo); + trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg, + PNFS_UPDATE_LAYOUT_EXIT); pnfs_put_layout_hdr(lo); out: dprintk("%s: inode %s/%llu pNFS layout segment %s for " @@ -2059,7 +2373,7 @@ _pnfs_grab_empty_layout(struct inode *ino, struct nfs_open_context *ctx) struct pnfs_layout_hdr *lo; spin_lock(&ino->i_lock); - lo = pnfs_find_alloc_layout(ino, ctx, GFP_KERNEL); + lo = pnfs_find_alloc_layout(ino, ctx, nfs_io_gfp_mask()); if (!lo) goto out_unlock; if (!test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags)) @@ -2081,8 +2395,6 @@ out_unlock: return NULL; } -extern const nfs4_stateid current_stateid; - static void _lgopen_prepare_attached(struct nfs4_opendata *data, struct nfs_open_context *ctx) { @@ -2104,13 +2416,15 @@ static void _lgopen_prepare_attached(struct nfs4_opendata *data, lo = _pnfs_grab_empty_layout(ino, ctx); if (!lo) return; - lgp = pnfs_alloc_init_layoutget_args(ino, ctx, ¤t_stateid, - &rng, GFP_KERNEL); + lgp = pnfs_alloc_init_layoutget_args(ino, ctx, ¤t_stateid, &rng, + nfs_io_gfp_mask()); if (!lgp) { pnfs_clear_first_layoutget(lo); + nfs_layoutget_end(lo); pnfs_put_layout_hdr(lo); return; } + lgp->lo = lo; data->lgp = lgp; data->o_arg.lg_args = &lgp->args; data->o_res.lg_res = &lgp->res; @@ -2119,6 +2433,7 @@ static void _lgopen_prepare_attached(struct nfs4_opendata *data, static void _lgopen_prepare_floating(struct nfs4_opendata *data, struct nfs_open_context *ctx) { + struct inode *ino = data->dentry->d_inode; struct pnfs_layout_range rng = { .iomode = (data->o_arg.fmode & FMODE_WRITE) ? IOMODE_RW: IOMODE_READ, @@ -2127,8 +2442,8 @@ static void _lgopen_prepare_floating(struct nfs4_opendata *data, }; struct nfs4_layoutget *lgp; - lgp = pnfs_alloc_init_layoutget_args(NULL, ctx, ¤t_stateid, - &rng, GFP_KERNEL); + lgp = pnfs_alloc_init_layoutget_args(ino, ctx, ¤t_stateid, &rng, + nfs_io_gfp_mask()); if (!lgp) return; data->lgp = lgp; @@ -2147,6 +2462,8 @@ void pnfs_lgopen_prepare(struct nfs4_opendata *data, /* Could check on max_ops, but currently hardcoded high enough */ if (!nfs_server_capable(data->dir->d_inode, NFS_CAP_LGOPEN)) return; + if (data->lgp) + return; if (data->state) _lgopen_prepare_attached(data, ctx); else @@ -2186,13 +2503,13 @@ void pnfs_parse_lgopen(struct inode *ino, struct nfs4_layoutget *lgp, } return; } - if (!lgp->args.inode) { + if (!lgp->lo) { lo = _pnfs_grab_empty_layout(ino, ctx); if (!lo) return; - lgp->args.inode = ino; + lgp->lo = lo; } else - lo = NFS_I(lgp->args.inode)->layout; + lo = lgp->lo; lseg = pnfs_layout_process(lgp); if (!IS_ERR(lseg)) { @@ -2205,11 +2522,9 @@ void pnfs_parse_lgopen(struct inode *ino, struct nfs4_layoutget *lgp, void nfs4_lgopen_release(struct nfs4_layoutget *lgp) { if (lgp != NULL) { - struct inode *inode = lgp->args.inode; - if (inode) { - struct pnfs_layout_hdr *lo = NFS_I(inode)->layout; - pnfs_clear_first_layoutget(lo); - nfs_layoutget_end(lo); + if (lgp->lo) { + pnfs_clear_first_layoutget(lgp->lo); + nfs_layoutget_end(lgp->lo); } pnfs_layoutget_free(lgp); } @@ -2218,7 +2533,7 @@ void nfs4_lgopen_release(struct nfs4_layoutget *lgp) struct pnfs_layout_segment * pnfs_layout_process(struct nfs4_layoutget *lgp) { - struct pnfs_layout_hdr *lo = NFS_I(lgp->args.inode)->layout; + struct pnfs_layout_hdr *lo = lgp->lo; struct nfs4_layoutget_res *res = &lgp->res; struct pnfs_layout_segment *lseg; struct inode *ino = lo->plh_inode; @@ -2246,23 +2561,33 @@ pnfs_layout_process(struct nfs4_layoutget *lgp) goto out_forget; } - if (!pnfs_layout_is_valid(lo)) { - /* We have a completely new layout */ - pnfs_set_layout_stateid(lo, &res->stateid, true); - } else if (nfs4_stateid_match_other(&lo->plh_stateid, &res->stateid)) { + if (test_bit(NFS_LAYOUT_DRAIN, &lo->plh_flags) && + !pnfs_is_first_layoutget(lo)) + goto out_forget; + + if (nfs4_stateid_match_other(&lo->plh_stateid, &res->stateid)) { /* existing state ID, make sure the sequence number matches. */ if (pnfs_layout_stateid_blocked(lo, &res->stateid)) { + if (!pnfs_layout_is_valid(lo)) + lo->plh_barrier = 0; dprintk("%s forget reply due to sequence\n", __func__); goto out_forget; } - pnfs_set_layout_stateid(lo, &res->stateid, false); - } else { + pnfs_set_layout_stateid(lo, &res->stateid, lgp->cred, false); + } else if (pnfs_layout_is_valid(lo)) { /* * We got an entirely new state ID. Mark all segments for the * inode invalid, and retry the layoutget */ - pnfs_mark_layout_stateid_invalid(lo, &free_me); + struct pnfs_layout_range range = { + .iomode = IOMODE_ANY, + .length = NFS4_MAX_UINT64, + }; + pnfs_mark_matching_lsegs_return(lo, &free_me, &range, 0); goto out_forget; + } else { + /* We have a completely new layout */ + pnfs_set_layout_stateid(lo, &res->stateid, lgp->cred, true); } pnfs_get_lseg(lseg); @@ -2283,16 +2608,6 @@ out_forget: return ERR_PTR(-EAGAIN); } -static int -mark_lseg_invalid_or_return(struct pnfs_layout_segment *lseg, - struct list_head *tmp_list) -{ - if (!mark_lseg_invalid(lseg, tmp_list)) - return 0; - pnfs_cache_lseg_for_layoutreturn(lseg->pls_layout, lseg); - return 1; -} - /** * pnfs_mark_matching_lsegs_return - Free or return matching layout segments * @lo: pointer to layout header @@ -2316,12 +2631,16 @@ pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo, u32 seq) { struct pnfs_layout_segment *lseg, *next; + struct nfs_server *server = NFS_SERVER(lo->plh_inode); int remaining = 0; dprintk("%s:Begin lo %p\n", __func__, lo); assert_spin_locked(&lo->plh_inode->i_lock); + if (test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags)) + tmp_list = &lo->plh_return_segs; + list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list) if (pnfs_match_lseg_recall(lseg, return_range, seq)) { dprintk("%s: marking lseg %p iomode %d " @@ -2329,10 +2648,13 @@ pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo, lseg, lseg->pls_range.iomode, lseg->pls_range.offset, lseg->pls_range.length); - if (mark_lseg_invalid_or_return(lseg, tmp_list)) + if (test_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags)) + tmp_list = &lo->plh_return_segs; + if (mark_lseg_invalid(lseg, tmp_list)) continue; remaining++; set_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags); + pnfs_lseg_cancel_io(server, lseg); } if (remaining) { @@ -2348,87 +2670,208 @@ pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo, return -ENOENT; } -void pnfs_error_mark_layout_for_return(struct inode *inode, - struct pnfs_layout_segment *lseg) +static void +pnfs_mark_layout_for_return(struct inode *inode, + const struct pnfs_layout_range *range) { - struct pnfs_layout_hdr *lo = NFS_I(inode)->layout; - struct pnfs_layout_range range = { - .iomode = lseg->pls_range.iomode, - .offset = 0, - .length = NFS4_MAX_UINT64, - }; + struct pnfs_layout_hdr *lo; bool return_now = false; spin_lock(&inode->i_lock); + lo = NFS_I(inode)->layout; if (!pnfs_layout_is_valid(lo)) { spin_unlock(&inode->i_lock); return; } - pnfs_set_plh_return_info(lo, range.iomode, 0); + pnfs_set_plh_return_info(lo, range->iomode, 0); /* * mark all matching lsegs so that we are sure to have no live * segments at hand when sending layoutreturn. See pnfs_put_lseg() * for how it works. */ - if (pnfs_mark_matching_lsegs_return(lo, &lo->plh_return_segs, &range, 0) != -EBUSY) { + if (pnfs_mark_matching_lsegs_return(lo, &lo->plh_return_segs, range, 0) != -EBUSY) { + const struct cred *cred; nfs4_stateid stateid; enum pnfs_iomode iomode; - return_now = pnfs_prepare_layoutreturn(lo, &stateid, &iomode); + return_now = pnfs_prepare_layoutreturn(lo, &stateid, &cred, &iomode); spin_unlock(&inode->i_lock); if (return_now) - pnfs_send_layoutreturn(lo, &stateid, iomode, false); + pnfs_send_layoutreturn(lo, &stateid, &cred, iomode, + PNFS_FL_LAYOUTRETURN_ASYNC); } else { spin_unlock(&inode->i_lock); nfs_commit_inode(inode, 0); } } + +void pnfs_error_mark_layout_for_return(struct inode *inode, + struct pnfs_layout_segment *lseg) +{ + struct pnfs_layout_range range = { + .iomode = lseg->pls_range.iomode, + .offset = 0, + .length = NFS4_MAX_UINT64, + }; + + pnfs_mark_layout_for_return(inode, &range); +} EXPORT_SYMBOL_GPL(pnfs_error_mark_layout_for_return); +static bool +pnfs_layout_can_be_returned(struct pnfs_layout_hdr *lo) +{ + return pnfs_layout_is_valid(lo) && + !test_bit(NFS_LAYOUT_INODE_FREEING, &lo->plh_flags) && + !test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags); +} + +static struct pnfs_layout_segment * +pnfs_find_first_lseg(struct pnfs_layout_hdr *lo, + const struct pnfs_layout_range *range, + enum pnfs_iomode iomode) +{ + struct pnfs_layout_segment *lseg; + + list_for_each_entry(lseg, &lo->plh_segs, pls_list) { + if (!test_bit(NFS_LSEG_VALID, &lseg->pls_flags)) + continue; + if (test_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags)) + continue; + if (lseg->pls_range.iomode != iomode && iomode != IOMODE_ANY) + continue; + if (pnfs_lseg_range_intersecting(&lseg->pls_range, range)) + return lseg; + } + return NULL; +} + +/* Find open file states whose mode matches that of the range */ +static bool +pnfs_should_return_unused_layout(struct pnfs_layout_hdr *lo, + const struct pnfs_layout_range *range) +{ + struct list_head *head; + struct nfs_open_context *ctx; + fmode_t mode = 0; + + if (!pnfs_layout_can_be_returned(lo) || + !pnfs_find_first_lseg(lo, range, range->iomode)) + return false; + + head = &NFS_I(lo->plh_inode)->open_files; + list_for_each_entry_rcu(ctx, head, list) { + if (ctx->state) + mode |= ctx->state->state & (FMODE_READ|FMODE_WRITE); + } + + switch (range->iomode) { + default: + break; + case IOMODE_READ: + mode &= ~FMODE_WRITE; + break; + case IOMODE_RW: + if (pnfs_find_first_lseg(lo, range, IOMODE_READ)) + mode &= ~FMODE_READ; + } + return mode == 0; +} + +static int pnfs_layout_return_unused_byserver(struct nfs_server *server, + void *data) +{ + const struct pnfs_layout_range *range = data; + const struct cred *cred; + struct pnfs_layout_hdr *lo; + struct inode *inode; + nfs4_stateid stateid; + enum pnfs_iomode iomode; + +restart: + rcu_read_lock(); + list_for_each_entry_rcu(lo, &server->layouts, plh_layouts) { + inode = lo->plh_inode; + if (!inode || !pnfs_layout_can_be_returned(lo) || + test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags)) + continue; + spin_lock(&inode->i_lock); + if (!lo->plh_inode || + !pnfs_should_return_unused_layout(lo, range)) { + spin_unlock(&inode->i_lock); + continue; + } + pnfs_get_layout_hdr(lo); + pnfs_set_plh_return_info(lo, range->iomode, 0); + if (pnfs_mark_matching_lsegs_return(lo, &lo->plh_return_segs, + range, 0) != 0 || + !pnfs_prepare_layoutreturn(lo, &stateid, &cred, &iomode)) { + spin_unlock(&inode->i_lock); + rcu_read_unlock(); + pnfs_put_layout_hdr(lo); + cond_resched(); + goto restart; + } + spin_unlock(&inode->i_lock); + rcu_read_unlock(); + pnfs_send_layoutreturn(lo, &stateid, &cred, iomode, + PNFS_FL_LAYOUTRETURN_ASYNC); + pnfs_put_layout_hdr(lo); + cond_resched(); + goto restart; + } + rcu_read_unlock(); + return 0; +} + void -pnfs_generic_pg_check_layout(struct nfs_pageio_descriptor *pgio) +pnfs_layout_return_unused_byclid(struct nfs_client *clp, + enum pnfs_iomode iomode) +{ + struct pnfs_layout_range range = { + .iomode = iomode, + .offset = 0, + .length = NFS4_MAX_UINT64, + }; + + nfs_client_for_each_server(clp, pnfs_layout_return_unused_byserver, + &range); +} + +/* Check if we have we have a valid layout but if there isn't an intersection + * between the request and the pgio->pg_lseg, put this pgio->pg_lseg away. + */ +void +pnfs_generic_pg_check_layout(struct nfs_pageio_descriptor *pgio, + struct nfs_page *req) { if (pgio->pg_lseg == NULL || - test_bit(NFS_LSEG_VALID, &pgio->pg_lseg->pls_flags)) + (test_bit(NFS_LSEG_VALID, &pgio->pg_lseg->pls_flags) && + pnfs_lseg_request_intersecting(pgio->pg_lseg, req))) return; pnfs_put_lseg(pgio->pg_lseg); pgio->pg_lseg = NULL; } EXPORT_SYMBOL_GPL(pnfs_generic_pg_check_layout); -/* - * Check for any intersection between the request and the pgio->pg_lseg, - * and if none, put this pgio->pg_lseg away. - */ -static void -pnfs_generic_pg_check_range(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) -{ - if (pgio->pg_lseg && !pnfs_lseg_request_intersecting(pgio->pg_lseg, req)) { - pnfs_put_lseg(pgio->pg_lseg); - pgio->pg_lseg = NULL; - } -} - void pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) { - u64 rd_size = req->wb_bytes; + u64 rd_size; - pnfs_generic_pg_check_layout(pgio); - pnfs_generic_pg_check_range(pgio, req); + pnfs_generic_pg_check_layout(pgio, req); if (pgio->pg_lseg == NULL) { if (pgio->pg_dreq == NULL) rd_size = i_size_read(pgio->pg_inode) - req_offset(req); else - rd_size = nfs_dreq_bytes_left(pgio->pg_dreq); - - pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, - req->wb_context, - req_offset(req), - rd_size, - IOMODE_READ, - false, - GFP_KERNEL); + rd_size = nfs_dreq_bytes_left(pgio->pg_dreq, + req_offset(req)); + + pgio->pg_lseg = + pnfs_update_layout(pgio->pg_inode, nfs_req_openctx(req), + req_offset(req), rd_size, + IOMODE_READ, false, + nfs_io_gfp_mask()); if (IS_ERR(pgio->pg_lseg)) { pgio->pg_error = PTR_ERR(pgio->pg_lseg); pgio->pg_lseg = NULL; @@ -2446,16 +2889,12 @@ void pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *req, u64 wb_size) { - pnfs_generic_pg_check_layout(pgio); - pnfs_generic_pg_check_range(pgio, req); + pnfs_generic_pg_check_layout(pgio, req); if (pgio->pg_lseg == NULL) { - pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, - req->wb_context, - req_offset(req), - wb_size, - IOMODE_RW, - false, - GFP_NOFS); + pgio->pg_lseg = + pnfs_update_layout(pgio->pg_inode, nfs_req_openctx(req), + req_offset(req), wb_size, IOMODE_RW, + false, nfs_io_gfp_mask()); if (IS_ERR(pgio->pg_lseg)) { pgio->pg_error = PTR_ERR(pgio->pg_lseg); pgio->pg_lseg = NULL; @@ -2531,7 +2970,6 @@ int pnfs_write_done_resend_to_mds(struct nfs_pgio_header *hdr) /* Resend all requests through the MDS */ nfs_pageio_init_write(&pgio, hdr->inode, FLUSH_STABLE, true, hdr->completion_ops); - set_bit(NFS_CONTEXT_RESEND_WRITES, &hdr->args.context->flags); return nfs_pageio_resend(&pgio, hdr); } EXPORT_SYMBOL_GPL(pnfs_write_done_resend_to_mds); @@ -2612,6 +3050,7 @@ pnfs_do_write(struct nfs_pageio_descriptor *desc, switch (trypnfs) { case PNFS_NOT_ATTEMPTED: pnfs_write_through_mds(desc, hdr); + break; case PNFS_ATTEMPTED: break; case PNFS_TRY_AGAIN: @@ -2727,7 +3166,8 @@ pnfs_try_to_read_data(struct nfs_pgio_header *hdr, } /* Resend all requests through pnfs. */ -void pnfs_read_resend_pnfs(struct nfs_pgio_header *hdr) +void pnfs_read_resend_pnfs(struct nfs_pgio_header *hdr, + unsigned int mirror_idx) { struct nfs_pageio_descriptor pgio; @@ -2738,6 +3178,7 @@ void pnfs_read_resend_pnfs(struct nfs_pgio_header *hdr) nfs_pageio_init_read(&pgio, hdr->inode, false, hdr->completion_ops); + pgio.pg_mirror_idx = mirror_idx; hdr->task.tk_status = nfs_pageio_resend(&pgio, hdr); } } @@ -2754,6 +3195,7 @@ pnfs_do_read(struct nfs_pageio_descriptor *desc, struct nfs_pgio_header *hdr) switch (trypnfs) { case PNFS_NOT_ATTEMPTED: pnfs_read_through_mds(desc, hdr); + break; case PNFS_ATTEMPTED: break; case PNFS_TRY_AGAIN: @@ -2890,6 +3332,7 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync) struct nfs_inode *nfsi = NFS_I(inode); loff_t end_pos; int status; + bool mark_as_dirty = false; if (!pnfs_layoutcommit_outstanding(inode)) return 0; @@ -2903,14 +3346,14 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync) status = wait_on_bit_lock_action(&nfsi->flags, NFS_INO_LAYOUTCOMMITTING, nfs_wait_bit_killable, - TASK_KILLABLE); + TASK_KILLABLE|TASK_FREEZABLE_UNSAFE); if (status) goto out; } status = -ENOMEM; /* Note kzalloc ensures data->res.seq_res.sr_slot == NULL */ - data = kzalloc(sizeof(*data), GFP_NOFS); + data = kzalloc(sizeof(*data), nfs_io_gfp_mask()); if (!data) goto clear_layoutcommitting; @@ -2925,10 +3368,10 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync) end_pos = nfsi->layout->plh_lwb; nfs4_stateid_copy(&data->args.stateid, &nfsi->layout->plh_stateid); + data->cred = get_cred(nfsi->layout->plh_lc_cred); spin_unlock(&inode->i_lock); data->args.inode = inode; - data->cred = get_cred(nfsi->layout->plh_lc_cred); nfs_fattr_init(&data->fattr); data->args.bitmask = NFS_SERVER(inode)->cache_consistency_bitmask; data->res.fattr = &data->fattr; @@ -2941,19 +3384,23 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync) if (ld->prepare_layoutcommit) { status = ld->prepare_layoutcommit(&data->args); if (status) { - put_cred(data->cred); + if (status != -ENOSPC) + put_cred(data->cred); spin_lock(&inode->i_lock); set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags); if (end_pos > nfsi->layout->plh_lwb) nfsi->layout->plh_lwb = end_pos; - goto out_unlock; + if (status != -ENOSPC) + goto out_unlock; + spin_unlock(&inode->i_lock); + mark_as_dirty = true; } } status = nfs4_proc_layoutcommit(data, sync); out: - if (status) + if (status || mark_as_dirty) mark_inode_dirty_sync(inode); dprintk("<-- %s status %d\n", __func__, status); return status; @@ -2977,7 +3424,7 @@ struct nfs4_threshold *pnfs_mdsthreshold_alloc(void) { struct nfs4_threshold *thp; - thp = kzalloc(sizeof(*thp), GFP_NOFS); + thp = kzalloc(sizeof(*thp), nfs_io_gfp_mask()); if (!thp) { dprintk("%s mdsthreshold allocation failed\n", __func__); return NULL; |
