diff options
Diffstat (limited to 'fs/nfs')
59 files changed, 3251 insertions, 1449 deletions
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index 47189476b553..0e4c67373e4f 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -149,8 +149,8 @@ do_add_page_to_bio(struct bio *bio, int npg, enum req_op op, sector_t isect, /* limit length to what the device mapping allows */ end = disk_addr + *len; - if (end >= map->start + map->len) - *len = map->start + map->len - disk_addr; + if (end >= map->disk_offset + map->len) + *len = map->disk_offset + map->len - disk_addr; retry: if (!bio) { @@ -676,7 +676,7 @@ bl_alloc_lseg(struct pnfs_layout_hdr *lo, struct nfs4_layoutget_res *lgr, struct pnfs_layout_segment *lseg; struct xdr_buf buf; struct xdr_stream xdr; - struct page *scratch; + struct folio *scratch; int status, i; uint32_t count; __be32 *p; @@ -689,13 +689,13 @@ bl_alloc_lseg(struct pnfs_layout_hdr *lo, struct nfs4_layoutget_res *lgr, return ERR_PTR(-ENOMEM); status = -ENOMEM; - scratch = alloc_page(gfp_mask); + scratch = folio_alloc(gfp_mask, 0); if (!scratch) goto out; xdr_init_decode_pages(&xdr, &buf, lgr->layoutp->pages, lgr->layoutp->len); - xdr_set_scratch_page(&xdr, scratch); + xdr_set_scratch_folio(&xdr, scratch); status = -EIO; p = xdr_inline_decode(&xdr, 4); @@ -744,7 +744,7 @@ process_extents: } out_free_scratch: - __free_page(scratch); + folio_put(scratch); out: dprintk("%s returns %d\n", __func__, status); switch (status) { diff --git a/fs/nfs/blocklayout/dev.c b/fs/nfs/blocklayout/dev.c index cab8809f0e0f..ab76120705e2 100644 --- a/fs/nfs/blocklayout/dev.c +++ b/fs/nfs/blocklayout/dev.c @@ -257,10 +257,11 @@ static bool bl_map_stripe(struct pnfs_block_dev *dev, u64 offset, struct pnfs_block_dev *child; u64 chunk; u32 chunk_idx; + u64 disk_chunk; u64 disk_offset; chunk = div_u64(offset, dev->chunk_size); - div_u64_rem(chunk, dev->nr_children, &chunk_idx); + disk_chunk = div_u64_rem(chunk, dev->nr_children, &chunk_idx); if (chunk_idx >= dev->nr_children) { dprintk("%s: invalid chunk idx %d (%lld/%lld)\n", @@ -273,7 +274,7 @@ static bool bl_map_stripe(struct pnfs_block_dev *dev, u64 offset, offset = chunk * dev->chunk_size; /* disk offset of the stripe */ - disk_offset = div_u64(offset, dev->nr_children); + disk_offset = disk_chunk * dev->chunk_size; child = &dev->children[chunk_idx]; child->map(child, disk_offset, map); @@ -540,16 +541,16 @@ bl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, struct pnfs_block_dev *top; struct xdr_stream xdr; struct xdr_buf buf; - struct page *scratch; + struct folio *scratch; int nr_volumes, ret, i; __be32 *p; - scratch = alloc_page(gfp_mask); + scratch = folio_alloc(gfp_mask, 0); if (!scratch) goto out; xdr_init_decode_pages(&xdr, &buf, pdev->pages, pdev->pglen); - xdr_set_scratch_page(&xdr, scratch); + xdr_set_scratch_folio(&xdr, scratch); p = xdr_inline_decode(&xdr, sizeof(__be32)); if (!p) @@ -581,7 +582,7 @@ bl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, out_free_volumes: kfree(volumes); out_free_scratch: - __free_page(scratch); + folio_put(scratch); out: return node; } diff --git a/fs/nfs/blocklayout/extent_tree.c b/fs/nfs/blocklayout/extent_tree.c index 8f7cff7a4293..315949a7e92d 100644 --- a/fs/nfs/blocklayout/extent_tree.c +++ b/fs/nfs/blocklayout/extent_tree.c @@ -6,6 +6,7 @@ #include <linux/vmalloc.h> #include "blocklayout.h" +#include "../nfs4trace.h" #define NFSDBG_FACILITY NFSDBG_PNFS_LD @@ -520,10 +521,71 @@ static __be32 *encode_scsi_range(struct pnfs_block_extent *be, __be32 *p) return xdr_encode_hyper(p, be->be_length << SECTOR_SHIFT); } -static int ext_tree_encode_commit(struct pnfs_block_layout *bl, __be32 *p, +/** + * ext_tree_try_encode_commit - try to encode all extents into the buffer + * @bl: pointer to the layout + * @p: pointer to the output buffer + * @buffer_size: size of the output buffer + * @count: output pointer to the number of encoded extents + * @lastbyte: output pointer to the last written byte + * + * Return values: + * %0: Success, all required extents encoded, outputs are valid + * %-ENOSPC: Buffer too small, nothing encoded, outputs are invalid + */ +static int +ext_tree_try_encode_commit(struct pnfs_block_layout *bl, __be32 *p, size_t buffer_size, size_t *count, __u64 *lastbyte) { struct pnfs_block_extent *be; + + spin_lock(&bl->bl_ext_lock); + for (be = ext_tree_first(&bl->bl_ext_rw); be; be = ext_tree_next(be)) { + if (be->be_state != PNFS_BLOCK_INVALID_DATA || + be->be_tag != EXTENT_WRITTEN) + continue; + + (*count)++; + if (ext_tree_layoutupdate_size(bl, *count) > buffer_size) { + spin_unlock(&bl->bl_ext_lock); + return -ENOSPC; + } + } + for (be = ext_tree_first(&bl->bl_ext_rw); be; be = ext_tree_next(be)) { + if (be->be_state != PNFS_BLOCK_INVALID_DATA || + be->be_tag != EXTENT_WRITTEN) + continue; + + if (bl->bl_scsi_layout) + p = encode_scsi_range(be, p); + else + p = encode_block_extent(be, p); + be->be_tag = EXTENT_COMMITTING; + } + *lastbyte = (bl->bl_lwb != 0) ? bl->bl_lwb - 1 : U64_MAX; + bl->bl_lwb = 0; + spin_unlock(&bl->bl_ext_lock); + + return 0; +} + +/** + * ext_tree_encode_commit - encode as much as possible extents into the buffer + * @bl: pointer to the layout + * @p: pointer to the output buffer + * @buffer_size: size of the output buffer + * @count: output pointer to the number of encoded extents + * @lastbyte: output pointer to the last written byte + * + * Return values: + * %0: Success, all required extents encoded, outputs are valid + * %-ENOSPC: Buffer too small, some extents are encoded, outputs are valid + */ +static int +ext_tree_encode_commit(struct pnfs_block_layout *bl, __be32 *p, + size_t buffer_size, size_t *count, __u64 *lastbyte) +{ + struct pnfs_block_extent *be, *be_prev; int ret = 0; spin_lock(&bl->bl_ext_lock); @@ -534,9 +596,9 @@ static int ext_tree_encode_commit(struct pnfs_block_layout *bl, __be32 *p, (*count)++; if (ext_tree_layoutupdate_size(bl, *count) > buffer_size) { - /* keep counting.. */ + (*count)--; ret = -ENOSPC; - continue; + break; } if (bl->bl_scsi_layout) @@ -544,14 +606,30 @@ static int ext_tree_encode_commit(struct pnfs_block_layout *bl, __be32 *p, else p = encode_block_extent(be, p); be->be_tag = EXTENT_COMMITTING; + be_prev = be; + } + if (!ret) { + *lastbyte = (bl->bl_lwb != 0) ? bl->bl_lwb - 1 : U64_MAX; + bl->bl_lwb = 0; + } else { + *lastbyte = be_prev->be_f_offset + be_prev->be_length; + *lastbyte <<= SECTOR_SHIFT; + *lastbyte -= 1; } - *lastbyte = bl->bl_lwb - 1; - bl->bl_lwb = 0; spin_unlock(&bl->bl_ext_lock); return ret; } +/** + * ext_tree_prepare_commit - encode extents that need to be committed + * @arg: layout commit data + * + * Return values: + * %0: Success, all required extents are encoded + * %-ENOSPC: Some extents are encoded, but not all, due to RPC size limit + * %-ENOMEM: Out of memory, extents not encoded + */ int ext_tree_prepare_commit(struct nfs4_layoutcommit_args *arg) { @@ -560,20 +638,18 @@ ext_tree_prepare_commit(struct nfs4_layoutcommit_args *arg) __be32 *start_p; int ret; - dprintk("%s enter\n", __func__); - arg->layoutupdate_page = alloc_page(GFP_NOFS); if (!arg->layoutupdate_page) return -ENOMEM; start_p = page_address(arg->layoutupdate_page); arg->layoutupdate_pages = &arg->layoutupdate_page; -retry: - ret = ext_tree_encode_commit(bl, start_p + 1, buffer_size, &count, &arg->lastbytewritten); + ret = ext_tree_try_encode_commit(bl, start_p + 1, buffer_size, + &count, &arg->lastbytewritten); if (unlikely(ret)) { ext_tree_free_commitdata(arg, buffer_size); - buffer_size = ext_tree_layoutupdate_size(bl, count); + buffer_size = NFS_SERVER(arg->inode)->wsize; count = 0; arg->layoutupdate_pages = @@ -588,7 +664,8 @@ retry: return -ENOMEM; } - goto retry; + ret = ext_tree_encode_commit(bl, start_p + 1, buffer_size, + &count, &arg->lastbytewritten); } *start_p = cpu_to_be32(count); @@ -607,8 +684,9 @@ retry: } } - dprintk("%s found %zu ranges\n", __func__, count); - return 0; + trace_bl_ext_tree_prepare_commit(ret, count, + arg->lastbytewritten, !!ret); + return ret; } void diff --git a/fs/nfs/blocklayout/rpc_pipefs.c b/fs/nfs/blocklayout/rpc_pipefs.c index d8d50a88de04..d526f5ba7887 100644 --- a/fs/nfs/blocklayout/rpc_pipefs.c +++ b/fs/nfs/blocklayout/rpc_pipefs.c @@ -141,24 +141,18 @@ static const struct rpc_pipe_ops bl_upcall_ops = { .destroy_msg = bl_pipe_destroy_msg, }; -static struct dentry *nfs4blocklayout_register_sb(struct super_block *sb, +static int nfs4blocklayout_register_sb(struct super_block *sb, struct rpc_pipe *pipe) { - struct dentry *dir, *dentry; + struct dentry *dir; + int err; dir = rpc_d_lookup_sb(sb, NFS_PIPE_DIRNAME); if (dir == NULL) - return ERR_PTR(-ENOENT); - dentry = rpc_mkpipe_dentry(dir, "blocklayout", NULL, pipe); + return -ENOENT; + err = rpc_mkpipe_dentry(dir, "blocklayout", NULL, pipe); dput(dir); - return dentry; -} - -static void nfs4blocklayout_unregister_sb(struct super_block *sb, - struct rpc_pipe *pipe) -{ - if (pipe->dentry) - rpc_unlink(pipe->dentry); + return err; } static int rpc_pipefs_event(struct notifier_block *nb, unsigned long event, @@ -167,7 +161,6 @@ static int rpc_pipefs_event(struct notifier_block *nb, unsigned long event, struct super_block *sb = ptr; struct net *net = sb->s_fs_info; struct nfs_net *nn = net_generic(net, nfs_net_id); - struct dentry *dentry; int ret = 0; if (!try_module_get(THIS_MODULE)) @@ -180,16 +173,10 @@ static int rpc_pipefs_event(struct notifier_block *nb, unsigned long event, switch (event) { case RPC_PIPEFS_MOUNT: - dentry = nfs4blocklayout_register_sb(sb, nn->bl_device_pipe); - if (IS_ERR(dentry)) { - ret = PTR_ERR(dentry); - break; - } - nn->bl_device_pipe->dentry = dentry; + ret = nfs4blocklayout_register_sb(sb, nn->bl_device_pipe); break; case RPC_PIPEFS_UMOUNT: - if (nn->bl_device_pipe->dentry) - nfs4blocklayout_unregister_sb(sb, nn->bl_device_pipe); + rpc_unlink(nn->bl_device_pipe); break; default: ret = -ENOTSUPP; @@ -203,18 +190,17 @@ static struct notifier_block nfs4blocklayout_block = { .notifier_call = rpc_pipefs_event, }; -static struct dentry *nfs4blocklayout_register_net(struct net *net, - struct rpc_pipe *pipe) +static int nfs4blocklayout_register_net(struct net *net, struct rpc_pipe *pipe) { struct super_block *pipefs_sb; - struct dentry *dentry; + int ret; pipefs_sb = rpc_get_sb_net(net); if (!pipefs_sb) - return NULL; - dentry = nfs4blocklayout_register_sb(pipefs_sb, pipe); + return 0; + ret = nfs4blocklayout_register_sb(pipefs_sb, pipe); rpc_put_sb_net(net); - return dentry; + return ret; } static void nfs4blocklayout_unregister_net(struct net *net, @@ -224,7 +210,7 @@ static void nfs4blocklayout_unregister_net(struct net *net, pipefs_sb = rpc_get_sb_net(net); if (pipefs_sb) { - nfs4blocklayout_unregister_sb(pipefs_sb, pipe); + rpc_unlink(pipe); rpc_put_sb_net(net); } } @@ -232,20 +218,17 @@ static void nfs4blocklayout_unregister_net(struct net *net, static int nfs4blocklayout_net_init(struct net *net) { struct nfs_net *nn = net_generic(net, nfs_net_id); - struct dentry *dentry; + int err; mutex_init(&nn->bl_mutex); init_waitqueue_head(&nn->bl_wq); nn->bl_device_pipe = rpc_mkpipe_data(&bl_upcall_ops, 0); if (IS_ERR(nn->bl_device_pipe)) return PTR_ERR(nn->bl_device_pipe); - dentry = nfs4blocklayout_register_net(net, nn->bl_device_pipe); - if (IS_ERR(dentry)) { + err = nfs4blocklayout_register_net(net, nn->bl_device_pipe); + if (unlikely(err)) rpc_destroy_pipe_data(nn->bl_device_pipe); - return PTR_ERR(dentry); - } - nn->bl_device_pipe->dentry = dentry; - return 0; + return err; } static void nfs4blocklayout_net_exit(struct net *net) diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index 86bdc7d23fb9..fabda0f6ec1a 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -136,7 +136,7 @@ static void nfs_callback_down_net(u32 minorversion, struct svc_serv *serv, struc return; dprintk("NFS: destroy per-net callback data; net=%x\n", net->ns.inum); - svc_xprt_destroy_all(serv, net); + svc_xprt_destroy_all(serv, net, false); } static int nfs_callback_up_net(int minorversion, struct svc_serv *serv, @@ -153,7 +153,7 @@ static int nfs_callback_up_net(int minorversion, struct svc_serv *serv, ret = svc_bind(serv, net); if (ret < 0) { printk(KERN_WARNING "NFS: bind callback service failed\n"); - goto err_bind; + goto err; } ret = 0; @@ -166,13 +166,11 @@ static int nfs_callback_up_net(int minorversion, struct svc_serv *serv, if (ret < 0) { printk(KERN_ERR "NFS: callback service start failed\n"); - goto err_socks; + goto err; } return 0; -err_socks: - svc_rpcb_cleanup(serv, net); -err_bind: +err: nn->cb_users[minorversion]--; dprintk("NFS: Couldn't create callback socket: err = %d; " "net = %x\n", ret, net->ns.inum); @@ -260,7 +258,7 @@ err_start: /* * Kill the callback thread if it's no longer being used. */ -void nfs_callback_down(int minorversion, struct net *net) +void nfs_callback_down(int minorversion, struct net *net, struct rpc_xprt *xprt) { struct nfs_callback_data *cb_info = &nfs_callback_info[minorversion]; struct svc_serv *serv; @@ -272,7 +270,7 @@ void nfs_callback_down(int minorversion, struct net *net) if (cb_info->users == 0) { svc_set_num_threads(serv, NULL, 0); dprintk("nfs_callback_down: service destroyed\n"); - svc_destroy(&cb_info->serv); + xprt_svc_destroy_nullify_bc(xprt, &cb_info->serv); } mutex_unlock(&nfs_callback_mutex); } diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h index 154a6ed1299f..8809f93d82c0 100644 --- a/fs/nfs/callback.h +++ b/fs/nfs/callback.h @@ -188,7 +188,8 @@ extern __be32 nfs4_callback_recall(void *argp, void *resp, struct cb_process_state *cps); #if IS_ENABLED(CONFIG_NFS_V4) extern int nfs_callback_up(u32 minorversion, struct rpc_xprt *xprt); -extern void nfs_callback_down(int minorversion, struct net *net); +extern void nfs_callback_down(int minorversion, struct net *net, + struct rpc_xprt *xprt); #endif /* CONFIG_NFS_V4 */ /* * nfs41: Callbacks are expected to not cause substantial latency, diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 02c916a55020..2aaea9c98c2c 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -180,7 +180,7 @@ struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_init) clp->cl_proto = cl_init->proto; clp->cl_nconnect = cl_init->nconnect; clp->cl_max_connect = cl_init->max_connect ? cl_init->max_connect : 1; - clp->cl_net = get_net(cl_init->net); + clp->cl_net = get_net_track(cl_init->net, &clp->cl_ns_tracker, GFP_KERNEL); #if IS_ENABLED(CONFIG_NFS_LOCALIO) seqlock_init(&clp->cl_boot_lock); @@ -250,7 +250,7 @@ void nfs_free_client(struct nfs_client *clp) if (!IS_ERR(clp->cl_rpcclient)) rpc_shutdown_client(clp->cl_rpcclient); - put_net(clp->cl_net); + put_net_track(clp->cl_net, &clp->cl_ns_tracker); put_nfs_version(clp->cl_nfs_mod); kfree(clp->cl_hostname); kfree(clp->cl_acceptor); @@ -338,6 +338,14 @@ again: /* Match the xprt security policy */ if (clp->cl_xprtsec.policy != data->xprtsec.policy) continue; + if (clp->cl_xprtsec.policy == RPC_XPRTSEC_TLS_X509) { + if (clp->cl_xprtsec.cert_serial != + data->xprtsec.cert_serial) + continue; + if (clp->cl_xprtsec.privkey_serial != + data->xprtsec.privkey_serial) + continue; + } refcount_inc(&clp->cl_count); return clp; @@ -439,7 +447,7 @@ struct nfs_client *nfs_get_client(const struct nfs_client_initdata *cl_init) spin_unlock(&nn->nfs_client_lock); new = rpc_ops->init_client(new, cl_init); if (!IS_ERR(new)) - nfs_local_probe(new); + nfs_local_probe_async(new); return new; } @@ -682,6 +690,44 @@ struct nfs_client *nfs_init_client(struct nfs_client *clp, } EXPORT_SYMBOL_GPL(nfs_init_client); +static void nfs4_server_set_init_caps(struct nfs_server *server) +{ +#if IS_ENABLED(CONFIG_NFS_V4) + /* Set the basic capabilities */ + server->caps = server->nfs_client->cl_mvops->init_caps; + if (server->flags & NFS_MOUNT_NORDIRPLUS) + server->caps &= ~NFS_CAP_READDIRPLUS; + if (server->nfs_client->cl_proto == XPRT_TRANSPORT_RDMA) + server->caps &= ~NFS_CAP_READ_PLUS; + + /* + * Don't use NFS uid/gid mapping if we're using AUTH_SYS or lower + * authentication. + */ + if (nfs4_disable_idmapping && + server->client->cl_auth->au_flavor == RPC_AUTH_UNIX) + server->caps |= NFS_CAP_UIDGID_NOMAP; +#endif +} + +void nfs_server_set_init_caps(struct nfs_server *server) +{ + switch (server->nfs_client->rpc_ops->version) { + case 2: + server->caps = NFS_CAP_HARDLINKS | NFS_CAP_SYMLINKS; + break; + case 3: + server->caps = NFS_CAP_HARDLINKS | NFS_CAP_SYMLINKS; + if (!(server->flags & NFS_MOUNT_NORDIRPLUS)) + server->caps |= NFS_CAP_READDIRPLUS; + break; + default: + nfs4_server_set_init_caps(server); + break; + } +} +EXPORT_SYMBOL_GPL(nfs_server_set_init_caps); + /* * Create a version 2 or 3 client */ @@ -726,7 +772,6 @@ static int nfs_init_server(struct nfs_server *server, /* Initialise the client representation from the mount data */ server->flags = ctx->flags; server->options = ctx->options; - server->caps |= NFS_CAP_HARDLINKS | NFS_CAP_SYMLINKS; switch (clp->rpc_ops->version) { case 2: @@ -739,10 +784,18 @@ static int nfs_init_server(struct nfs_server *server, server->fattr_valid = NFS_ATTR_FATTR_V4; } - if (ctx->rsize) + if (ctx->bsize) { + server->bsize = ctx->bsize; + server->automount_inherit |= NFS_AUTOMOUNT_INHERIT_BSIZE; + } + if (ctx->rsize) { server->rsize = nfs_io_size(ctx->rsize, clp->cl_proto); - if (ctx->wsize) + server->automount_inherit |= NFS_AUTOMOUNT_INHERIT_RSIZE; + } + if (ctx->wsize) { server->wsize = nfs_io_size(ctx->wsize, clp->cl_proto); + server->automount_inherit |= NFS_AUTOMOUNT_INHERIT_WSIZE; + } server->acregmin = ctx->acregmin * HZ; server->acregmax = ctx->acregmax * HZ; @@ -762,6 +815,8 @@ static int nfs_init_server(struct nfs_server *server, if (error < 0) goto error; + nfs_server_set_init_caps(server); + /* Preserve the values of mount_server-related mount options */ if (ctx->mount_server.addrlen) { memcpy(&server->mountd_address, &ctx->mount_server.address, @@ -814,7 +869,6 @@ static void nfs_server_set_fsinfo(struct nfs_server *server, server->wsize = max_rpc_payload; if (server->wsize > NFS_MAX_FILE_IO_SIZE) server->wsize = NFS_MAX_FILE_IO_SIZE; - server->wpages = (server->wsize + PAGE_SIZE - 1) >> PAGE_SHIFT; server->wtmult = nfs_block_bits(fsinfo->wtmult, NULL); @@ -831,7 +885,6 @@ static void nfs_server_set_fsinfo(struct nfs_server *server, server->maxfilesize = fsinfo->maxfilesize; - server->time_delta = fsinfo->time_delta; server->change_attr_type = fsinfo->change_attr_type; server->clone_blksize = fsinfo->clone_blksize; @@ -851,6 +904,8 @@ static void nfs_server_set_fsinfo(struct nfs_server *server, if (fsinfo->xattr_support) server->caps |= NFS_CAP_XATTR; + else + server->caps &= ~NFS_CAP_XATTR; #endif } @@ -930,13 +985,17 @@ EXPORT_SYMBOL_GPL(nfs_probe_server); void nfs_server_copy_userdata(struct nfs_server *target, struct nfs_server *source) { target->flags = source->flags; - target->rsize = source->rsize; - target->wsize = source->wsize; + target->automount_inherit = source->automount_inherit; + if (source->automount_inherit & NFS_AUTOMOUNT_INHERIT_BSIZE) + target->bsize = source->bsize; + if (source->automount_inherit & NFS_AUTOMOUNT_INHERIT_RSIZE) + target->rsize = source->rsize; + if (source->automount_inherit & NFS_AUTOMOUNT_INHERIT_WSIZE) + target->wsize = source->wsize; target->acregmin = source->acregmin; target->acregmax = source->acregmax; target->acdirmin = source->acdirmin; target->acdirmax = source->acdirmax; - target->caps = source->caps; target->options = source->options; target->auth_info = source->auth_info; target->port = source->port; @@ -1007,6 +1066,7 @@ struct nfs_server *nfs_alloc_server(void) INIT_LIST_HEAD(&server->ss_src_copies); atomic_set(&server->active, 0); + atomic_long_set(&server->nr_active_delegations, 0); server->io_stats = nfs_alloc_iostats(); if (!server->io_stats) { @@ -1105,6 +1165,8 @@ struct nfs_server *nfs_create_server(struct fs_context *fc) if (server->namelen == 0 || server->namelen > NFS2_MAXNAMLEN) server->namelen = NFS2_MAXNAMLEN; } + /* Linux 'subtree_check' borkenness mandates this setting */ + server->fh_expire_type = NFS_FH_VOL_RENAME; if (!(fattr->valid & NFS_ATTR_FATTR)) { error = ctx->nfs_mod->rpc_ops->getattr(server, ctx->mntfh, @@ -1168,6 +1230,8 @@ struct nfs_server *nfs_clone_server(struct nfs_server *source, if (error < 0) goto out_free_server; + nfs_server_set_init_caps(server); + /* probe the filesystem info for this server filesystem */ error = nfs_probe_server(server, fh); if (error < 0) @@ -1200,6 +1264,10 @@ void nfs_clients_init(struct net *net) #if IS_ENABLED(CONFIG_NFS_V4) idr_init(&nn->cb_ident_idr); #endif +#if IS_ENABLED(CONFIG_NFS_V4_1) + INIT_LIST_HEAD(&nn->nfs4_data_server_cache); + spin_lock_init(&nn->nfs4_data_server_lock); +#endif spin_lock_init(&nn->nfs_client_lock); nn->boot_time = ktime_get_real(); memset(&nn->rpcstats, 0, sizeof(nn->rpcstats)); @@ -1216,6 +1284,9 @@ void nfs_clients_exit(struct net *net) nfs_cleanup_cb_ident_idr(net); WARN_ON_ONCE(!list_empty(&nn->nfs_client_list)); WARN_ON_ONCE(!list_empty(&nn->nfs_volume_list)); +#if IS_ENABLED(CONFIG_NFS_V4_1) + WARN_ON_ONCE(!list_empty(&nn->nfs4_data_server_cache)); +#endif } #ifdef CONFIG_PROC_FS diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index 8bdbc4dca89c..2248e3ad089a 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -27,8 +27,20 @@ #define NFS_DEFAULT_DELEGATION_WATERMARK (5000U) -static atomic_long_t nfs_active_delegations; static unsigned nfs_delegation_watermark = NFS_DEFAULT_DELEGATION_WATERMARK; +module_param_named(delegation_watermark, nfs_delegation_watermark, uint, 0644); + +bool directory_delegations = true; +module_param(directory_delegations, bool, 0644); +MODULE_PARM_DESC(directory_delegations, + "Enable the use of directory delegations, defaults to on."); + +static struct hlist_head *nfs_delegation_hash(struct nfs_server *server, + const struct nfs_fh *fhandle) +{ + return server->delegation_hash_table + + (nfs_fhandle_hash(fhandle) & server->delegation_hash_mask); +} static void __nfs_free_delegation(struct nfs_delegation *delegation) { @@ -37,11 +49,12 @@ static void __nfs_free_delegation(struct nfs_delegation *delegation) kfree_rcu(delegation, rcu); } -static void nfs_mark_delegation_revoked(struct nfs_delegation *delegation) +static void nfs_mark_delegation_revoked(struct nfs_server *server, + struct nfs_delegation *delegation) { if (!test_and_set_bit(NFS_DELEGATION_REVOKED, &delegation->flags)) { delegation->stateid.type = NFS4_INVALID_STATEID_TYPE; - atomic_long_dec(&nfs_active_delegations); + atomic_long_dec(&server->nr_active_delegations); if (!test_bit(NFS_DELEGATION_RETURNING, &delegation->flags)) nfs_clear_verifier_delegated(delegation->inode); } @@ -59,9 +72,10 @@ static void nfs_put_delegation(struct nfs_delegation *delegation) __nfs_free_delegation(delegation); } -static void nfs_free_delegation(struct nfs_delegation *delegation) +static void nfs_free_delegation(struct nfs_server *server, + struct nfs_delegation *delegation) { - nfs_mark_delegation_revoked(delegation); + nfs_mark_delegation_revoked(server, delegation); nfs_put_delegation(delegation); } @@ -134,6 +148,8 @@ static int nfs4_do_check_delegation(struct inode *inode, fmode_t type, */ int nfs4_have_delegation(struct inode *inode, fmode_t type, int flags) { + if (S_ISDIR(inode->i_mode) && !directory_delegations) + nfs_inode_evict_delegation(inode); return nfs4_do_check_delegation(inode, type, flags, true); } @@ -237,34 +253,34 @@ void nfs_inode_reclaim_delegation(struct inode *inode, const struct cred *cred, rcu_read_lock(); delegation = rcu_dereference(NFS_I(inode)->delegation); - if (delegation != NULL) { - spin_lock(&delegation->lock); - nfs4_stateid_copy(&delegation->stateid, stateid); - delegation->type = type; - delegation->pagemod_limit = pagemod_limit; - oldcred = delegation->cred; - delegation->cred = get_cred(cred); - switch (deleg_type) { - case NFS4_OPEN_DELEGATE_READ_ATTRS_DELEG: - case NFS4_OPEN_DELEGATE_WRITE_ATTRS_DELEG: - set_bit(NFS_DELEGATION_DELEGTIME, &delegation->flags); - break; - default: - clear_bit(NFS_DELEGATION_DELEGTIME, &delegation->flags); - } - clear_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags); - if (test_and_clear_bit(NFS_DELEGATION_REVOKED, - &delegation->flags)) - atomic_long_inc(&nfs_active_delegations); - spin_unlock(&delegation->lock); - rcu_read_unlock(); - put_cred(oldcred); - trace_nfs4_reclaim_delegation(inode, type); - } else { + if (!delegation) { rcu_read_unlock(); nfs_inode_set_delegation(inode, cred, type, stateid, pagemod_limit, deleg_type); + return; } + + spin_lock(&delegation->lock); + nfs4_stateid_copy(&delegation->stateid, stateid); + delegation->type = type; + delegation->pagemod_limit = pagemod_limit; + oldcred = delegation->cred; + delegation->cred = get_cred(cred); + switch (deleg_type) { + case NFS4_OPEN_DELEGATE_READ_ATTRS_DELEG: + case NFS4_OPEN_DELEGATE_WRITE_ATTRS_DELEG: + set_bit(NFS_DELEGATION_DELEGTIME, &delegation->flags); + break; + default: + clear_bit(NFS_DELEGATION_DELEGTIME, &delegation->flags); + } + clear_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags); + if (test_and_clear_bit(NFS_DELEGATION_REVOKED, &delegation->flags)) + atomic_long_inc(&NFS_SERVER(inode)->nr_active_delegations); + spin_unlock(&delegation->lock); + rcu_read_unlock(); + put_cred(oldcred); + trace_nfs4_reclaim_delegation(inode, type); } static int nfs_do_return_delegation(struct inode *inode, @@ -355,6 +371,8 @@ nfs_detach_delegation_locked(struct nfs_inode *nfsi, rcu_dereference_protected(nfsi->delegation, lockdep_is_held(&clp->cl_lock)); + trace_nfs4_detach_delegation(&nfsi->vfs_inode, delegation->type); + if (deleg_cur == NULL || delegation != deleg_cur) return NULL; @@ -363,10 +381,12 @@ nfs_detach_delegation_locked(struct nfs_inode *nfsi, spin_unlock(&delegation->lock); return NULL; } + hlist_del_init_rcu(&delegation->hash); list_del_rcu(&delegation->super_list); delegation->inode = NULL; rcu_assign_pointer(nfsi->delegation, NULL); spin_unlock(&delegation->lock); + clear_bit(NFS_INO_REQ_DIR_DELEG, &nfsi->flags); return delegation; } @@ -410,7 +430,8 @@ nfs_update_delegation_cred(struct nfs_delegation *delegation, } static void -nfs_update_inplace_delegation(struct nfs_delegation *delegation, +nfs_update_inplace_delegation(struct nfs_server *server, + struct nfs_delegation *delegation, const struct nfs_delegation *update) { if (nfs4_stateid_is_newer(&update->stateid, &delegation->stateid)) { @@ -423,7 +444,7 @@ nfs_update_inplace_delegation(struct nfs_delegation *delegation, nfs_update_delegation_cred(delegation, update->cred); /* smp_mb__before_atomic() is implicit due to xchg() */ clear_bit(NFS_DELEGATION_REVOKED, &delegation->flags); - atomic_long_inc(&nfs_active_delegations); + atomic_long_inc(&server->nr_active_delegations); } } } @@ -478,7 +499,7 @@ int nfs_inode_set_delegation(struct inode *inode, const struct cred *cred, if (nfs4_stateid_match_other(&old_delegation->stateid, &delegation->stateid)) { spin_lock(&old_delegation->lock); - nfs_update_inplace_delegation(old_delegation, + nfs_update_inplace_delegation(server, old_delegation, delegation); spin_unlock(&old_delegation->lock); goto out; @@ -524,10 +545,12 @@ add_new: spin_unlock(&inode->i_lock); list_add_tail_rcu(&delegation->super_list, &server->delegations); + hlist_add_head_rcu(&delegation->hash, + nfs_delegation_hash(server, &NFS_I(inode)->fh)); rcu_assign_pointer(nfsi->delegation, delegation); delegation = NULL; - atomic_long_inc(&nfs_active_delegations); + atomic_long_inc(&server->nr_active_delegations); trace_nfs4_set_delegation(inode, type); @@ -541,7 +564,7 @@ out: __nfs_free_delegation(delegation); if (freeme != NULL) { nfs_do_return_delegation(inode, freeme, 0); - nfs_free_delegation(freeme); + nfs_free_delegation(server, freeme); } return status; } @@ -592,6 +615,8 @@ static bool nfs_delegation_need_return(struct nfs_delegation *delegation) { bool ret = false; + trace_nfs_delegation_need_return(delegation); + if (test_and_clear_bit(NFS_DELEGATION_RETURN, &delegation->flags)) ret = true; if (test_bit(NFS_DELEGATION_RETURNING, &delegation->flags) || @@ -751,7 +776,7 @@ void nfs_inode_evict_delegation(struct inode *inode) set_bit(NFS_DELEGATION_RETURNING, &delegation->flags); set_bit(NFS_DELEGATION_INODE_FREEING, &delegation->flags); nfs_do_return_delegation(inode, delegation, 1); - nfs_free_delegation(delegation); + nfs_free_delegation(NFS_SERVER(inode), delegation); } } @@ -837,7 +862,8 @@ void nfs4_inode_return_delegation_on_close(struct inode *inode) if (!delegation) goto out; if (test_bit(NFS_DELEGATION_RETURN_IF_CLOSED, &delegation->flags) || - atomic_long_read(&nfs_active_delegations) >= nfs_delegation_watermark) { + atomic_long_read(&NFS_SERVER(inode)->nr_active_delegations) >= + nfs_delegation_watermark) { spin_lock(&delegation->lock); if (delegation->inode && list_empty(&NFS_I(inode)->open_files) && @@ -1013,7 +1039,7 @@ static void nfs_revoke_delegation(struct inode *inode, } spin_unlock(&delegation->lock); } - nfs_mark_delegation_revoked(delegation); + nfs_mark_delegation_revoked(NFS_SERVER(inode), delegation); ret = true; out: rcu_read_unlock(); @@ -1021,13 +1047,6 @@ out: nfs_inode_find_state_and_recover(inode, stateid); } -void nfs_remove_bad_delegation(struct inode *inode, - const nfs4_stateid *stateid) -{ - nfs_revoke_delegation(inode, stateid); -} -EXPORT_SYMBOL_GPL(nfs_remove_bad_delegation); - void nfs_delegation_mark_returned(struct inode *inode, const nfs4_stateid *stateid) { @@ -1052,7 +1071,7 @@ void nfs_delegation_mark_returned(struct inode *inode, delegation->stateid.seqid = stateid->seqid; } - nfs_mark_delegation_revoked(delegation); + nfs_mark_delegation_revoked(NFS_SERVER(inode), delegation); clear_bit(NFS_DELEGATION_RETURNING, &delegation->flags); spin_unlock(&delegation->lock); if (nfs_detach_delegation(NFS_I(inode), delegation, NFS_SERVER(inode))) @@ -1070,6 +1089,24 @@ out_rcu_unlock: } /** + * nfs_remove_bad_delegation - handle delegations that are unusable + * @inode: inode to process + * @stateid: the delegation's stateid + * + * If the server ACK-ed our FREE_STATEID then clean + * up the delegation, else mark and keep the revoked state. + */ +void nfs_remove_bad_delegation(struct inode *inode, + const nfs4_stateid *stateid) +{ + if (stateid && stateid->type == NFS4_FREED_STATEID_TYPE) + nfs_delegation_mark_returned(inode, stateid); + else + nfs_revoke_delegation(inode, stateid); +} +EXPORT_SYMBOL_GPL(nfs_remove_bad_delegation); + +/** * nfs_expire_unused_delegation_types * @clp: client to process * @flags: delegation types to expire @@ -1147,11 +1184,12 @@ static struct inode * nfs_delegation_find_inode_server(struct nfs_server *server, const struct nfs_fh *fhandle) { + struct hlist_head *head = nfs_delegation_hash(server, fhandle); struct nfs_delegation *delegation; struct super_block *freeme = NULL; struct inode *res = NULL; - list_for_each_entry_rcu(delegation, &server->delegations, super_list) { + hlist_for_each_entry_rcu(delegation, head, hash) { spin_lock(&delegation->lock); if (delegation->inode != NULL && !test_bit(NFS_DELEGATION_REVOKED, &delegation->flags) && @@ -1254,7 +1292,7 @@ restart: if (delegation != NULL) { if (nfs_detach_delegation(NFS_I(inode), delegation, server) != NULL) - nfs_free_delegation(delegation); + nfs_free_delegation(server, delegation); /* Match nfs_start_delegation_return_locked */ nfs_put_delegation(delegation); } @@ -1559,4 +1597,17 @@ out: return ret; } -module_param_named(delegation_watermark, nfs_delegation_watermark, uint, 0644); +int nfs4_delegation_hash_alloc(struct nfs_server *server) +{ + int delegation_buckets, i; + + delegation_buckets = roundup_pow_of_two(nfs_delegation_watermark / 16); + server->delegation_hash_mask = delegation_buckets - 1; + server->delegation_hash_table = kmalloc_array(delegation_buckets, + sizeof(*server->delegation_hash_table), GFP_KERNEL); + if (!server->delegation_hash_table) + return -ENOMEM; + for (i = 0; i < delegation_buckets; i++) + INIT_HLIST_HEAD(&server->delegation_hash_table[i]); + return 0; +} diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h index 8ff5ab9c5c25..46d866adb5c2 100644 --- a/fs/nfs/delegation.h +++ b/fs/nfs/delegation.h @@ -14,6 +14,7 @@ * NFSv4 delegation */ struct nfs_delegation { + struct hlist_node hash; struct list_head super_list; const struct cred *cred; struct inode *inode; @@ -123,4 +124,19 @@ static inline int nfs_have_delegated_mtime(struct inode *inode) NFS_DELEGATION_FLAG_TIME); } +extern bool directory_delegations; + +static inline void nfs_request_directory_delegation(struct inode *inode) +{ + if (S_ISDIR(inode->i_mode)) + set_bit(NFS_INO_REQ_DIR_DELEG, &NFS_I(inode)->flags); +} + +static inline bool nfs_have_directory_delegation(struct inode *inode) +{ + return S_ISDIR(inode->i_mode) && nfs_have_delegated_attributes(inode); +} + +int nfs4_delegation_hash_alloc(struct nfs_server *server); + #endif diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index bd23fc736b39..23a78a742b61 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -789,16 +789,17 @@ again: goto out; } + nfs_set_verifier(dentry, dir_verifier); inode = nfs_fhget(dentry->d_sb, entry->fh, entry->fattr); alias = d_splice_alias(inode, dentry); d_lookup_done(dentry); if (alias) { if (IS_ERR(alias)) goto out; + nfs_set_verifier(alias, dir_verifier); dput(dentry); dentry = alias; } - nfs_set_verifier(dentry, dir_verifier); trace_nfs_readdir_lookup(d_inode(parent), dentry, 0); out: dput(dentry); @@ -829,17 +830,17 @@ static int nfs_readdir_folio_filler(struct nfs_readdir_descriptor *desc, struct address_space *mapping = desc->file->f_mapping; struct folio *new, *folio = *arrays; struct xdr_stream stream; - struct page *scratch; + struct folio *scratch; struct xdr_buf buf; u64 cookie; int status; - scratch = alloc_page(GFP_KERNEL); + scratch = folio_alloc(GFP_KERNEL, 0); if (scratch == NULL) return -ENOMEM; xdr_init_decode_pages(&stream, &buf, xdr_pages, buflen); - xdr_set_scratch_page(&stream, scratch); + xdr_set_scratch_folio(&stream, scratch); do { status = nfs_readdir_entry_decode(desc, entry, &stream); @@ -891,7 +892,7 @@ static int nfs_readdir_folio_filler(struct nfs_readdir_descriptor *desc, if (folio != *arrays) nfs_readdir_folio_unlock_and_put(folio); - put_page(scratch); + folio_put(scratch); return status; } @@ -1514,6 +1515,15 @@ static int nfs_check_verifier(struct inode *dir, struct dentry *dentry, return 0; if (!nfs_dentry_verify_change(dir, dentry)) return 0; + + /* + * If we have a directory delegation then we don't need to revalidate + * the directory. The delegation will either get recalled or we will + * receive a notification when it changes. + */ + if (nfs_have_directory_delegation(dir)) + return 0; + /* Revalidate nfsi->cache_change_attribute before we declare a match */ if (nfs_mapping_need_revalidate_inode(dir)) { if (rcu_walk) @@ -1828,9 +1838,7 @@ static void block_revalidate(struct dentry *dentry) static void unblock_revalidate(struct dentry *dentry) { - /* store_release ensures wait_var_event() sees the update */ - smp_store_release(&dentry->d_fsdata, NULL); - wake_up_var(&dentry->d_fsdata); + store_release_wake_up(&dentry->d_fsdata, NULL); } /* @@ -1896,13 +1904,15 @@ static int nfs_dentry_delete(const struct dentry *dentry) } /* Ensure that we revalidate inode->i_nlink */ -static void nfs_drop_nlink(struct inode *inode) +static void nfs_drop_nlink(struct inode *inode, unsigned long gencount) { + struct nfs_inode *nfsi = NFS_I(inode); + spin_lock(&inode->i_lock); /* drop the inode if we're reasonably sure this is the last link */ - if (inode->i_nlink > 0) + if (inode->i_nlink > 0 && gencount == nfsi->attr_gencount) drop_nlink(inode); - NFS_I(inode)->attr_gencount = nfs_inc_attr_generation_counter(); + nfsi->attr_gencount = nfs_inc_attr_generation_counter(); nfs_set_cache_invalid( inode, NFS_INO_INVALID_CHANGE | NFS_INO_INVALID_CTIME | NFS_INO_INVALID_NLINK); @@ -1916,8 +1926,9 @@ static void nfs_drop_nlink(struct inode *inode) static void nfs_dentry_iput(struct dentry *dentry, struct inode *inode) { if (dentry->d_flags & DCACHE_NFSFS_RENAMED) { + unsigned long gencount = READ_ONCE(NFS_I(inode)->attr_gencount); nfs_complete_unlink(dentry, inode); - nfs_drop_nlink(inode); + nfs_drop_nlink(inode, gencount); } iput(inode); } @@ -1993,13 +2004,14 @@ struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, unsigned in nfs_lookup_advise_force_readdirplus(dir, flags); no_entry: + nfs_set_verifier(dentry, dir_verifier); res = d_splice_alias(inode, dentry); if (res != NULL) { if (IS_ERR(res)) goto out; + nfs_set_verifier(res, dir_verifier); dentry = res; } - nfs_set_verifier(dentry, dir_verifier); out: trace_nfs_lookup_exit(dir, dentry, flags, PTR_ERR_OR_ZERO(res)); nfs_free_fattr(fattr); @@ -2141,12 +2153,12 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry, d_drop(dentry); switch (err) { case -ENOENT: - d_splice_alias(NULL, dentry); if (nfs_server_capable(dir, NFS_CAP_CASE_INSENSITIVE)) dir_verifier = inode_peek_iversion_raw(dir); else dir_verifier = nfs_save_change_attribute(dir); nfs_set_verifier(dentry, dir_verifier); + d_splice_alias(NULL, dentry); break; case -EISDIR: case -ENOTDIR: @@ -2200,13 +2212,18 @@ no_open: else dput(dentry); } - if (IS_ERR(res)) - return PTR_ERR(res); return finish_no_open(file, res); } EXPORT_SYMBOL_GPL(nfs_atomic_open); static int +nfs_lookup_revalidate_delegated_parent(struct inode *dir, struct dentry *dentry, + struct inode *inode) +{ + return nfs_lookup_revalidate_done(dir, dentry, inode, 1); +} + +static int nfs4_lookup_revalidate(struct inode *dir, const struct qstr *name, struct dentry *dentry, unsigned int flags) { @@ -2233,6 +2250,9 @@ nfs4_lookup_revalidate(struct inode *dir, const struct qstr *name, if (nfs_verifier_is_delegated(dentry)) return nfs_lookup_revalidate_delegated(dir, dentry, inode); + if (nfs_have_directory_delegation(dir)) + return nfs_lookup_revalidate_delegated_parent(dir, dentry, inode); + /* NFS only supports OPEN on regular files */ if (!S_ISREG(inode->i_mode)) goto full_reval; @@ -2262,7 +2282,7 @@ int nfs_atomic_open_v23(struct inode *dir, struct dentry *dentry, struct file *file, unsigned int open_flags, umode_t mode) { - + struct dentry *res = NULL; /* Same as look+open from lookup_open(), but with different O_TRUNC * handling. */ @@ -2272,26 +2292,21 @@ int nfs_atomic_open_v23(struct inode *dir, struct dentry *dentry, return -ENAMETOOLONG; if (open_flags & O_CREAT) { - file->f_mode |= FMODE_CREATED; error = nfs_do_create(dir, dentry, mode, open_flags); - if (error) + if (!error) { + file->f_mode |= FMODE_CREATED; + return finish_open(file, dentry, NULL); + } else if (error != -EEXIST || open_flags & O_EXCL) return error; - return finish_open(file, dentry, NULL); - } else if (d_in_lookup(dentry)) { + } + if (d_in_lookup(dentry)) { /* The only flags nfs_lookup considers are * LOOKUP_EXCL and LOOKUP_RENAME_TARGET, and * we want those to be zero so the lookup isn't skipped. */ - struct dentry *res = nfs_lookup(dir, dentry, 0); - - d_lookup_done(dentry); - if (unlikely(res)) { - if (IS_ERR(res)) - return PTR_ERR(res); - return finish_no_open(file, res); - } + res = nfs_lookup(dir, dentry, 0); } - return finish_no_open(file, NULL); + return finish_no_open(file, res); } EXPORT_SYMBOL_GPL(nfs_atomic_open_v23); @@ -2516,9 +2531,11 @@ static int nfs_safe_remove(struct dentry *dentry) trace_nfs_remove_enter(dir, dentry); if (inode != NULL) { + unsigned long gencount = READ_ONCE(NFS_I(inode)->attr_gencount); + error = NFS_PROTO(dir)->remove(dir, dentry); if (error == 0) - nfs_drop_nlink(inode); + nfs_drop_nlink(inode, gencount); } else error = NFS_PROTO(dir)->remove(dir, dentry); if (error == -ENOENT) @@ -2676,6 +2693,18 @@ nfs_unblock_rename(struct rpc_task *task, struct nfs_renamedata *data) unblock_revalidate(new_dentry); } +static bool nfs_rename_is_unsafe_cross_dir(struct dentry *old_dentry, + struct dentry *new_dentry) +{ + struct nfs_server *server = NFS_SB(old_dentry->d_sb); + + if (old_dentry->d_parent != new_dentry->d_parent) + return false; + if (server->fh_expire_type & NFS_FH_RENAME_UNSAFE) + return !(server->fh_expire_type & NFS_FH_NOEXPIRE_WITH_OPEN); + return true; +} + /* * RENAME * FIXME: Some nfsds, like the Linux user space nfsd, may generate a @@ -2706,6 +2735,7 @@ int nfs_rename(struct mnt_idmap *idmap, struct inode *old_dir, { struct inode *old_inode = d_inode(old_dentry); struct inode *new_inode = d_inode(new_dentry); + unsigned long new_gencount = 0; struct dentry *dentry = NULL; struct rpc_task *task; bool must_unblock = false; @@ -2758,12 +2788,14 @@ int nfs_rename(struct mnt_idmap *idmap, struct inode *old_dir, } else { block_revalidate(new_dentry); must_unblock = true; + new_gencount = NFS_I(new_inode)->attr_gencount; spin_unlock(&new_dentry->d_lock); } } - if (S_ISREG(old_inode->i_mode)) + if (S_ISREG(old_inode->i_mode) && + nfs_rename_is_unsafe_cross_dir(old_dentry, new_dentry)) nfs_sync_inode(old_inode); task = nfs_async_rename(old_dir, new_dir, old_dentry, new_dentry, must_unblock ? nfs_unblock_rename : NULL); @@ -2796,7 +2828,7 @@ out: new_dir, new_dentry, error); if (!error) { if (new_inode != NULL) - nfs_drop_nlink(new_inode); + nfs_drop_nlink(new_inode, new_gencount); /* * The d_move() should be here instead of in an async RPC completion * handler because we need the proper locks to move the dentry. If diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index f32f8d7c9122..48d89716193a 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -757,7 +757,6 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr) { struct nfs_direct_req *dreq = hdr->dreq; struct nfs_commit_info cinfo; - struct nfs_page *req = nfs_list_entry(hdr->pages.next); struct inode *inode = dreq->inode; int flags = NFS_ODIRECT_DONE; @@ -786,6 +785,7 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr) spin_unlock(&inode->i_lock); while (!list_empty(&hdr->pages)) { + struct nfs_page *req; req = nfs_list_entry(hdr->pages.next); nfs_list_remove_request(req); diff --git a/fs/nfs/export.c b/fs/nfs/export.c index e9c233b6fd20..a10dd5f9d078 100644 --- a/fs/nfs/export.c +++ b/fs/nfs/export.c @@ -66,14 +66,21 @@ nfs_fh_to_dentry(struct super_block *sb, struct fid *fid, { struct nfs_fattr *fattr = NULL; struct nfs_fh *server_fh = nfs_exp_embedfh(fid->raw); - size_t fh_size = offsetof(struct nfs_fh, data) + server_fh->size; + size_t fh_size = offsetof(struct nfs_fh, data); const struct nfs_rpc_ops *rpc_ops; struct dentry *dentry; struct inode *inode; - int len = EMBED_FH_OFF + XDR_QUADLEN(fh_size); + int len = EMBED_FH_OFF; u32 *p = fid->raw; int ret; + /* Initial check of bounds */ + if (fh_len < len + XDR_QUADLEN(fh_size) || + fh_len > XDR_QUADLEN(NFS_MAXFHSIZE)) + return NULL; + /* Calculate embedded filehandle size */ + fh_size += server_fh->size; + len += XDR_QUADLEN(fh_size); /* NULL translates to ESTALE */ if (fh_len < len || fh_type != len) return NULL; diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 033feeab8c34..d020aab40c64 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -28,6 +28,7 @@ #include <linux/mm.h> #include <linux/pagemap.h> #include <linux/gfp.h> +#include <linux/rmap.h> #include <linux/swap.h> #include <linux/compaction.h> @@ -160,6 +161,8 @@ nfs_file_read(struct kiocb *iocb, struct iov_iter *to) struct inode *inode = file_inode(iocb->ki_filp); ssize_t result; + trace_nfs_file_read(iocb, to); + if (iocb->ki_flags & IOCB_DIRECT) return nfs_file_direct_read(iocb, to, false); @@ -207,24 +210,25 @@ nfs_file_splice_read(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe EXPORT_SYMBOL_GPL(nfs_file_splice_read); int -nfs_file_mmap(struct file *file, struct vm_area_struct *vma) +nfs_file_mmap_prepare(struct vm_area_desc *desc) { + struct file *file = desc->file; struct inode *inode = file_inode(file); int status; dprintk("NFS: mmap(%pD2)\n", file); - /* Note: generic_file_mmap() returns ENOSYS on nommu systems + /* Note: generic_file_mmap_prepare() returns ENOSYS on nommu systems * so we call that before revalidating the mapping */ - status = generic_file_mmap(file, vma); + status = generic_file_mmap_prepare(desc); if (!status) { - vma->vm_ops = &nfs_file_vm_ops; + desc->vm_ops = &nfs_file_vm_ops; status = nfs_revalidate_mapping(inode, file->f_mapping); } return status; } -EXPORT_SYMBOL_GPL(nfs_file_mmap); +EXPORT_SYMBOL_GPL(nfs_file_mmap_prepare); /* * Flush any dirty pages for this process, and check for write errors. @@ -279,6 +283,37 @@ nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync) } EXPORT_SYMBOL_GPL(nfs_file_fsync); +void nfs_truncate_last_folio(struct address_space *mapping, loff_t from, + loff_t to) +{ + struct folio *folio; + + if (from >= to) + return; + + folio = filemap_lock_folio(mapping, from >> PAGE_SHIFT); + if (IS_ERR(folio)) + return; + + if (folio_mkclean(folio)) + folio_mark_dirty(folio); + + if (folio_test_uptodate(folio)) { + loff_t fpos = folio_pos(folio); + size_t offset = from - fpos; + size_t end = folio_size(folio); + + if (to - fpos < end) + end = to - fpos; + folio_zero_segment(folio, offset, end); + trace_nfs_size_truncate_folio(mapping->host, to); + } + + folio_unlock(folio); + folio_put(folio); +} +EXPORT_SYMBOL_GPL(nfs_truncate_last_folio); + /* * Decide whether a read/modify/write cycle may be more efficient * then a modify/write/read cycle when writing to a page in the @@ -328,6 +363,8 @@ static bool nfs_want_read_modify_write(struct file *file, struct folio *folio, if (pnfs_ld_read_whole_page(file_inode(file))) return true; + if (folio_test_dropbehind(folio)) + return false; /* Open for reading too? */ if (file->f_mode & FMODE_READ) return true; @@ -342,24 +379,28 @@ static bool nfs_want_read_modify_write(struct file *file, struct folio *folio, * If the writer ends up delaying the write, the writer needs to * increment the page use counts until he is done with the page. */ -static int nfs_write_begin(struct file *file, struct address_space *mapping, +static int nfs_write_begin(const struct kiocb *iocb, + struct address_space *mapping, loff_t pos, unsigned len, struct folio **foliop, void **fsdata) { - fgf_t fgp = FGP_WRITEBEGIN; struct folio *folio; + struct file *file = iocb->ki_filp; int once_thru = 0; int ret; + trace_nfs_write_begin(file_inode(file), pos, len); + dfprintk(PAGECACHE, "NFS: write_begin(%pD2(%lu), %u@%lld)\n", file, mapping->host->i_ino, len, (long long) pos); + nfs_truncate_last_folio(mapping, i_size_read(mapping->host), pos); - fgp |= fgf_set_order(len); start: - folio = __filemap_get_folio(mapping, pos >> PAGE_SHIFT, fgp, - mapping_gfp_mask(mapping)); - if (IS_ERR(folio)) - return PTR_ERR(folio); + folio = write_begin_get_folio(iocb, mapping, pos >> PAGE_SHIFT, len); + if (IS_ERR(folio)) { + ret = PTR_ERR(folio); + goto out; + } *foliop = folio; ret = nfs_flush_incompatible(file, folio); @@ -369,22 +410,28 @@ start: } else if (!once_thru && nfs_want_read_modify_write(file, folio, pos, len)) { once_thru = 1; + folio_clear_dropbehind(folio); ret = nfs_read_folio(file, folio); folio_put(folio); if (!ret) goto start; } +out: + trace_nfs_write_begin_done(file_inode(file), pos, len, ret); return ret; } -static int nfs_write_end(struct file *file, struct address_space *mapping, +static int nfs_write_end(const struct kiocb *iocb, + struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct folio *folio, void *fsdata) { + struct file *file = iocb->ki_filp; struct nfs_open_context *ctx = nfs_file_open_context(file); unsigned offset = offset_in_folio(folio, pos); int status; + trace_nfs_write_end(file_inode(file), pos, len); dfprintk(PAGECACHE, "NFS: write_end(%pD2(%lu), %u@%lld)\n", file, mapping->host->i_ino, len, (long long) pos); @@ -413,13 +460,16 @@ static int nfs_write_end(struct file *file, struct address_space *mapping, folio_unlock(folio); folio_put(folio); - if (status < 0) + if (status < 0) { + trace_nfs_write_end_done(file_inode(file), pos, len, status); return status; + } NFS_I(mapping->host)->write_io += copied; if (nfs_ctx_key_to_expire(ctx, mapping->host)) nfs_wb_all(mapping->host); + trace_nfs_write_end_done(file_inode(file), pos, len, copied); return copied; } @@ -437,10 +487,11 @@ static void nfs_invalidate_folio(struct folio *folio, size_t offset, dfprintk(PAGECACHE, "NFS: invalidate_folio(%lu, %zu, %zu)\n", folio->index, offset, length); - if (offset != 0 || length < folio_size(folio)) - return; /* Cancel any unstarted writes on this page */ - nfs_wb_folio_cancel(inode, folio); + if (offset != 0 || length < folio_size(folio)) + nfs_wb_folio(inode, folio); + else + nfs_wb_folio_cancel(inode, folio); folio_wait_private_2(folio); /* [DEPRECATED] */ trace_nfs_invalidate_folio(inode, folio_pos(folio) + offset, length); } @@ -651,6 +702,8 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from) errseq_t since; int error; + trace_nfs_file_write(iocb, from); + result = nfs_key_timeout_notify(file, inode); if (result) return result; @@ -899,7 +952,7 @@ const struct file_operations nfs_file_operations = { .llseek = nfs_file_llseek, .read_iter = nfs_file_read, .write_iter = nfs_file_write, - .mmap = nfs_file_mmap, + .mmap_prepare = nfs_file_mmap_prepare, .open = nfs_file_open, .flush = nfs_file_flush, .release = nfs_file_release, @@ -910,5 +963,6 @@ const struct file_operations nfs_file_operations = { .splice_write = iter_file_splice_write, .check_flags = nfs_check_flags, .setlease = simple_nosetlease, + .fop_flags = FOP_DONTCACHE, }; EXPORT_SYMBOL_GPL(nfs_file_operations); diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c index d39a1f58e18d..5c4551117c58 100644 --- a/fs/nfs/filelayout/filelayout.c +++ b/fs/nfs/filelayout/filelayout.c @@ -646,19 +646,19 @@ filelayout_decode_layout(struct pnfs_layout_hdr *flo, { struct xdr_stream stream; struct xdr_buf buf; - struct page *scratch; + struct folio *scratch; __be32 *p; uint32_t nfl_util; int i; dprintk("%s: set_layout_map Begin\n", __func__); - scratch = alloc_page(gfp_flags); + scratch = folio_alloc(gfp_flags, 0); if (!scratch) return -ENOMEM; xdr_init_decode_pages(&stream, &buf, lgr->layoutp->pages, lgr->layoutp->len); - xdr_set_scratch_page(&stream, scratch); + xdr_set_scratch_folio(&stream, scratch); /* 20 = ufl_util (4), first_stripe_index (4), pattern_offset (8), * num_fh (4) */ @@ -724,11 +724,11 @@ filelayout_decode_layout(struct pnfs_layout_hdr *flo, fl->fh_array[i]->size); } - __free_page(scratch); + folio_put(scratch); return 0; out_err: - __free_page(scratch); + folio_put(scratch); return -EIO; } diff --git a/fs/nfs/filelayout/filelayoutdev.c b/fs/nfs/filelayout/filelayoutdev.c index 4fa304fa5bc4..df79aeb68db4 100644 --- a/fs/nfs/filelayout/filelayoutdev.c +++ b/fs/nfs/filelayout/filelayoutdev.c @@ -73,17 +73,18 @@ nfs4_fl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, struct nfs4_file_layout_dsaddr *dsaddr = NULL; struct xdr_stream stream; struct xdr_buf buf; - struct page *scratch; + struct folio *scratch; struct list_head dsaddrs; struct nfs4_pnfs_ds_addr *da; + struct net *net = server->nfs_client->cl_net; /* set up xdr stream */ - scratch = alloc_page(gfp_flags); + scratch = folio_alloc(gfp_flags, 0); if (!scratch) goto out_err; xdr_init_decode_pages(&stream, &buf, pdev->pages, pdev->pglen); - xdr_set_scratch_page(&stream, scratch); + xdr_set_scratch_folio(&stream, scratch); /* Get the stripe count (number of stripe index) */ p = xdr_inline_decode(&stream, 4); @@ -159,8 +160,7 @@ nfs4_fl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, mp_count = be32_to_cpup(p); /* multipath count */ for (j = 0; j < mp_count; j++) { - da = nfs4_decode_mp_ds_addr(server->nfs_client->cl_net, - &stream, gfp_flags); + da = nfs4_decode_mp_ds_addr(net, &stream, gfp_flags); if (da) list_add_tail(&da->da_node, &dsaddrs); } @@ -170,7 +170,7 @@ nfs4_fl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, goto out_err_free_deviceid; } - dsaddr->ds_list[i] = nfs4_pnfs_ds_add(&dsaddrs, gfp_flags); + dsaddr->ds_list[i] = nfs4_pnfs_ds_add(net, &dsaddrs, gfp_flags); if (!dsaddr->ds_list[i]) goto out_err_drain_dsaddrs; trace_fl_getdevinfo(server, &pdev->dev_id, dsaddr->ds_list[i]->ds_remotestr); @@ -186,7 +186,7 @@ nfs4_fl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, } } - __free_page(scratch); + folio_put(scratch); return dsaddr; out_err_drain_dsaddrs: @@ -204,7 +204,7 @@ out_err_free_deviceid: out_err_free_stripe_indices: kfree(stripe_indices); out_err_free_scratch: - __free_page(scratch); + folio_put(scratch); out_err: dprintk("%s ERROR: returning NULL\n", __func__); return NULL; diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index 61ad269c825f..9056f05a67dc 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -47,7 +47,7 @@ ff_layout_mirror_prepare_stats(struct pnfs_layout_hdr *lo, int dev_limit, enum nfs4_ff_op_type type); static void ff_layout_encode_ff_layoutupdate(struct xdr_stream *xdr, const struct nfs42_layoutstat_devinfo *devinfo, - struct nfs4_ff_layout_mirror *mirror); + struct nfs4_ff_layout_ds_stripe *dss_info); static struct pnfs_layout_hdr * ff_layout_alloc_layout_hdr(struct inode *inode, gfp_t gfp_flags) @@ -164,31 +164,32 @@ decode_name(struct xdr_stream *xdr, u32 *id) } static struct nfsd_file * -ff_local_open_fh(struct pnfs_layout_segment *lseg, u32 ds_idx, +ff_local_open_fh(struct pnfs_layout_segment *lseg, u32 ds_idx, u32 dss_id, struct nfs_client *clp, const struct cred *cred, struct nfs_fh *fh, fmode_t mode) { #if IS_ENABLED(CONFIG_NFS_LOCALIO) struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, ds_idx); - return nfs_local_open_fh(clp, cred, fh, &mirror->nfl, mode); + return nfs_local_open_fh(clp, cred, fh, &mirror->dss[dss_id].nfl, mode); #else return NULL; #endif } -static bool ff_mirror_match_fh(const struct nfs4_ff_layout_mirror *m1, - const struct nfs4_ff_layout_mirror *m2) +static bool ff_dss_match_fh(const struct nfs4_ff_layout_ds_stripe *dss1, + const struct nfs4_ff_layout_ds_stripe *dss2) { int i, j; - if (m1->fh_versions_cnt != m2->fh_versions_cnt) + if (dss1->fh_versions_cnt != dss2->fh_versions_cnt) return false; - for (i = 0; i < m1->fh_versions_cnt; i++) { + + for (i = 0; i < dss1->fh_versions_cnt; i++) { bool found_fh = false; - for (j = 0; j < m2->fh_versions_cnt; j++) { - if (nfs_compare_fh(&m1->fh_versions[i], - &m2->fh_versions[j]) == 0) { + for (j = 0; j < dss2->fh_versions_cnt; j++) { + if (nfs_compare_fh(&dss1->fh_versions[i], + &dss2->fh_versions[j]) == 0) { found_fh = true; break; } @@ -199,6 +200,38 @@ static bool ff_mirror_match_fh(const struct nfs4_ff_layout_mirror *m1, return true; } +static bool ff_mirror_match_fh(const struct nfs4_ff_layout_mirror *m1, + const struct nfs4_ff_layout_mirror *m2) +{ + u32 dss_id; + + if (m1->dss_count != m2->dss_count) + return false; + + for (dss_id = 0; dss_id < m1->dss_count; dss_id++) + if (!ff_dss_match_fh(&m1->dss[dss_id], &m2->dss[dss_id])) + return false; + + return true; +} + +static bool ff_mirror_match_devid(const struct nfs4_ff_layout_mirror *m1, + const struct nfs4_ff_layout_mirror *m2) +{ + u32 dss_id; + + if (m1->dss_count != m2->dss_count) + return false; + + for (dss_id = 0; dss_id < m1->dss_count; dss_id++) + if (memcmp(&m1->dss[dss_id].devid, + &m2->dss[dss_id].devid, + sizeof(m1->dss[dss_id].devid)) != 0) + return false; + + return true; +} + static struct nfs4_ff_layout_mirror * ff_layout_add_mirror(struct pnfs_layout_hdr *lo, struct nfs4_ff_layout_mirror *mirror) @@ -209,7 +242,7 @@ ff_layout_add_mirror(struct pnfs_layout_hdr *lo, spin_lock(&inode->i_lock); list_for_each_entry(pos, &ff_layout->mirrors, mirrors) { - if (memcmp(&mirror->devid, &pos->devid, sizeof(pos->devid)) != 0) + if (!ff_mirror_match_devid(mirror, pos)) continue; if (!ff_mirror_match_fh(mirror, pos)) continue; @@ -237,32 +270,52 @@ ff_layout_remove_mirror(struct nfs4_ff_layout_mirror *mirror) mirror->layout = NULL; } -static struct nfs4_ff_layout_mirror *ff_layout_alloc_mirror(gfp_t gfp_flags) +static struct nfs4_ff_layout_mirror *ff_layout_alloc_mirror(u32 dss_count, + gfp_t gfp_flags) { struct nfs4_ff_layout_mirror *mirror; mirror = kzalloc(sizeof(*mirror), gfp_flags); - if (mirror != NULL) { - spin_lock_init(&mirror->lock); - refcount_set(&mirror->ref, 1); - INIT_LIST_HEAD(&mirror->mirrors); - nfs_localio_file_init(&mirror->nfl); + if (mirror == NULL) + return NULL; + + spin_lock_init(&mirror->lock); + refcount_set(&mirror->ref, 1); + INIT_LIST_HEAD(&mirror->mirrors); + + mirror->dss_count = dss_count; + mirror->dss = + kcalloc(dss_count, sizeof(struct nfs4_ff_layout_ds_stripe), + gfp_flags); + if (mirror->dss == NULL) { + kfree(mirror); + return NULL; } + + for (u32 dss_id = 0; dss_id < mirror->dss_count; dss_id++) + nfs_localio_file_init(&mirror->dss[dss_id].nfl); + return mirror; } static void ff_layout_free_mirror(struct nfs4_ff_layout_mirror *mirror) { - const struct cred *cred; + const struct cred *cred; + u32 dss_id; ff_layout_remove_mirror(mirror); - kfree(mirror->fh_versions); - nfs_close_local_fh(&mirror->nfl); - cred = rcu_access_pointer(mirror->ro_cred); - put_cred(cred); - cred = rcu_access_pointer(mirror->rw_cred); - put_cred(cred); - nfs4_ff_layout_put_deviceid(mirror->mirror_ds); + + for (dss_id = 0; dss_id < mirror->dss_count; dss_id++) { + kfree(mirror->dss[dss_id].fh_versions); + cred = rcu_access_pointer(mirror->dss[dss_id].ro_cred); + put_cred(cred); + cred = rcu_access_pointer(mirror->dss[dss_id].rw_cred); + put_cred(cred); + nfs_close_local_fh(&mirror->dss[dss_id].nfl); + nfs4_ff_layout_put_deviceid(mirror->dss[dss_id].mirror_ds); + } + + kfree(mirror->dss); kfree(mirror); } @@ -293,7 +346,7 @@ ff_lseg_match_mirrors(struct pnfs_layout_segment *l1, struct pnfs_layout_segment *l2) { const struct nfs4_ff_layout_segment *fl1 = FF_LAYOUT_LSEG(l1); - const struct nfs4_ff_layout_segment *fl2 = FF_LAYOUT_LSEG(l1); + const struct nfs4_ff_layout_segment *fl2 = FF_LAYOUT_LSEG(l2); u32 i; if (fl1->mirror_array_cnt != fl2->mirror_array_cnt) @@ -366,14 +419,24 @@ ff_layout_add_lseg(struct pnfs_layout_hdr *lo, free_me); } +static u32 ff_mirror_efficiency_sum(const struct nfs4_ff_layout_mirror *mirror) +{ + u32 dss_id, sum = 0; + + for (dss_id = 0; dss_id < mirror->dss_count; dss_id++) + sum += mirror->dss[dss_id].efficiency; + + return sum; +} + static void ff_layout_sort_mirrors(struct nfs4_ff_layout_segment *fls) { int i, j; for (i = 0; i < fls->mirror_array_cnt - 1; i++) { for (j = i + 1; j < fls->mirror_array_cnt; j++) - if (fls->mirror_array[i]->efficiency < - fls->mirror_array[j]->efficiency) + if (ff_mirror_efficiency_sum(fls->mirror_array[i]) < + ff_mirror_efficiency_sum(fls->mirror_array[j])) swap(fls->mirror_array[i], fls->mirror_array[j]); } @@ -388,20 +451,21 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh, struct nfs4_ff_layout_segment *fls = NULL; struct xdr_stream stream; struct xdr_buf buf; - struct page *scratch; + struct folio *scratch; u64 stripe_unit; u32 mirror_array_cnt; __be32 *p; int i, rc; + struct nfs4_ff_layout_ds_stripe *dss_info; dprintk("--> %s\n", __func__); - scratch = alloc_page(gfp_flags); + scratch = folio_alloc(gfp_flags, 0); if (!scratch) return ERR_PTR(-ENOMEM); xdr_init_decode_pages(&stream, &buf, lgr->layoutp->pages, lgr->layoutp->len); - xdr_set_scratch_page(&stream, scratch); + xdr_set_scratch_folio(&stream, scratch); /* stripe unit and mirror_array_cnt */ rc = -EIO; @@ -427,116 +491,134 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh, fls->mirror_array_cnt = mirror_array_cnt; fls->stripe_unit = stripe_unit; + u32 dss_count = 0; for (i = 0; i < fls->mirror_array_cnt; i++) { struct nfs4_ff_layout_mirror *mirror; struct cred *kcred; const struct cred __rcu *cred; kuid_t uid; kgid_t gid; - u32 ds_count, fh_count, id; - int j; + u32 fh_count, id; + int j, dss_id; rc = -EIO; p = xdr_inline_decode(&stream, 4); if (!p) goto out_err_free; - ds_count = be32_to_cpup(p); - /* FIXME: allow for striping? */ - if (ds_count != 1) + // Ensure all mirrors have same stripe count. + if (dss_count == 0) + dss_count = be32_to_cpup(p); + else if (dss_count != be32_to_cpup(p)) goto out_err_free; - fls->mirror_array[i] = ff_layout_alloc_mirror(gfp_flags); + if (dss_count > NFS4_FLEXFILE_LAYOUT_MAX_STRIPE_CNT || + dss_count == 0) + goto out_err_free; + + if (dss_count > 1 && stripe_unit == 0) + goto out_err_free; + + fls->mirror_array[i] = ff_layout_alloc_mirror(dss_count, gfp_flags); if (fls->mirror_array[i] == NULL) { rc = -ENOMEM; goto out_err_free; } - fls->mirror_array[i]->ds_count = ds_count; + for (dss_id = 0; dss_id < dss_count; dss_id++) { + dss_info = &fls->mirror_array[i]->dss[dss_id]; + dss_info->mirror = fls->mirror_array[i]; - /* deviceid */ - rc = decode_deviceid(&stream, &fls->mirror_array[i]->devid); - if (rc) - goto out_err_free; + /* deviceid */ + rc = decode_deviceid(&stream, &dss_info->devid); + if (rc) + goto out_err_free; - /* efficiency */ - rc = -EIO; - p = xdr_inline_decode(&stream, 4); - if (!p) - goto out_err_free; - fls->mirror_array[i]->efficiency = be32_to_cpup(p); + /* efficiency */ + rc = -EIO; + p = xdr_inline_decode(&stream, 4); + if (!p) + goto out_err_free; + dss_info->efficiency = be32_to_cpup(p); - /* stateid */ - rc = decode_pnfs_stateid(&stream, &fls->mirror_array[i]->stateid); - if (rc) - goto out_err_free; + /* stateid */ + rc = decode_pnfs_stateid(&stream, &dss_info->stateid); + if (rc) + goto out_err_free; - /* fh */ - rc = -EIO; - p = xdr_inline_decode(&stream, 4); - if (!p) - goto out_err_free; - fh_count = be32_to_cpup(p); + /* fh */ + rc = -EIO; + p = xdr_inline_decode(&stream, 4); + if (!p) + goto out_err_free; + fh_count = be32_to_cpup(p); - fls->mirror_array[i]->fh_versions = - kcalloc(fh_count, sizeof(struct nfs_fh), - gfp_flags); - if (fls->mirror_array[i]->fh_versions == NULL) { - rc = -ENOMEM; - goto out_err_free; - } + dss_info->fh_versions = + kcalloc(fh_count, sizeof(struct nfs_fh), + gfp_flags); + if (dss_info->fh_versions == NULL) { + rc = -ENOMEM; + goto out_err_free; + } + + for (j = 0; j < fh_count; j++) { + rc = decode_nfs_fh(&stream, + &dss_info->fh_versions[j]); + if (rc) + goto out_err_free; + } + + dss_info->fh_versions_cnt = fh_count; - for (j = 0; j < fh_count; j++) { - rc = decode_nfs_fh(&stream, - &fls->mirror_array[i]->fh_versions[j]); + /* user */ + rc = decode_name(&stream, &id); if (rc) goto out_err_free; - } - fls->mirror_array[i]->fh_versions_cnt = fh_count; + uid = make_kuid(&init_user_ns, id); - /* user */ - rc = decode_name(&stream, &id); - if (rc) - goto out_err_free; + /* group */ + rc = decode_name(&stream, &id); + if (rc) + goto out_err_free; - uid = make_kuid(&init_user_ns, id); + gid = make_kgid(&init_user_ns, id); - /* group */ - rc = decode_name(&stream, &id); - if (rc) - goto out_err_free; + if (gfp_flags & __GFP_FS) + kcred = prepare_kernel_cred(&init_task); + else { + unsigned int nofs_flags = memalloc_nofs_save(); - gid = make_kgid(&init_user_ns, id); + kcred = prepare_kernel_cred(&init_task); + memalloc_nofs_restore(nofs_flags); + } + rc = -ENOMEM; + if (!kcred) + goto out_err_free; + kcred->fsuid = uid; + kcred->fsgid = gid; + cred = RCU_INITIALIZER(kcred); - if (gfp_flags & __GFP_FS) - kcred = prepare_kernel_cred(&init_task); - else { - unsigned int nofs_flags = memalloc_nofs_save(); - kcred = prepare_kernel_cred(&init_task); - memalloc_nofs_restore(nofs_flags); + if (lgr->range.iomode == IOMODE_READ) + rcu_assign_pointer(dss_info->ro_cred, cred); + else + rcu_assign_pointer(dss_info->rw_cred, cred); } - rc = -ENOMEM; - if (!kcred) - goto out_err_free; - kcred->fsuid = uid; - kcred->fsgid = gid; - cred = RCU_INITIALIZER(kcred); - - if (lgr->range.iomode == IOMODE_READ) - rcu_assign_pointer(fls->mirror_array[i]->ro_cred, cred); - else - rcu_assign_pointer(fls->mirror_array[i]->rw_cred, cred); mirror = ff_layout_add_mirror(lh, fls->mirror_array[i]); if (mirror != fls->mirror_array[i]) { - /* swap cred ptrs so free_mirror will clean up old */ - if (lgr->range.iomode == IOMODE_READ) { - cred = xchg(&mirror->ro_cred, cred); - rcu_assign_pointer(fls->mirror_array[i]->ro_cred, cred); - } else { - cred = xchg(&mirror->rw_cred, cred); - rcu_assign_pointer(fls->mirror_array[i]->rw_cred, cred); + for (dss_id = 0; dss_id < dss_count; dss_id++) { + dss_info = &fls->mirror_array[i]->dss[dss_id]; + /* swap cred ptrs so free_mirror will clean up old */ + if (lgr->range.iomode == IOMODE_READ) { + cred = xchg(&mirror->dss[dss_id].ro_cred, + dss_info->ro_cred); + rcu_assign_pointer(dss_info->ro_cred, cred); + } else { + cred = xchg(&mirror->dss[dss_id].rw_cred, + dss_info->rw_cred); + rcu_assign_pointer(dss_info->rw_cred, cred); + } } ff_layout_free_mirror(fls->mirror_array[i]); fls->mirror_array[i] = mirror; @@ -564,7 +646,7 @@ out_sort_mirrors: ret = &fls->generic_hdr; dprintk("<-- %s (success)\n", __func__); out_free_page: - __free_page(scratch); + folio_put(scratch); return ret; out_err_free: _ff_layout_free_lseg(fls); @@ -593,6 +675,26 @@ ff_layout_free_lseg(struct pnfs_layout_segment *lseg) _ff_layout_free_lseg(fls); } +static u32 calc_commit_idx(struct pnfs_layout_segment *lseg, + u32 mirror_idx, u32 dss_id) +{ + struct nfs4_ff_layout_segment *flseg = FF_LAYOUT_LSEG(lseg); + + return (mirror_idx * flseg->mirror_array[0]->dss_count) + dss_id; +} + +static u32 calc_mirror_idx_from_commit(struct pnfs_layout_segment *lseg, + u32 commit_index) +{ + return commit_index / FF_LAYOUT_LSEG(lseg)->mirror_array[0]->dss_count; +} + +static u32 calc_dss_id_from_commit(struct pnfs_layout_segment *lseg, + u32 commit_index) +{ + return commit_index % FF_LAYOUT_LSEG(lseg)->mirror_array[0]->dss_count; +} + static void nfs4_ff_start_busy_timer(struct nfs4_ff_busy_timer *timer, ktime_t now) { @@ -617,6 +719,7 @@ nfs4_ff_end_busy_timer(struct nfs4_ff_busy_timer *timer, ktime_t now) static bool nfs4_ff_layoutstat_start_io(struct nfs4_ff_layout_mirror *mirror, + u32 dss_id, struct nfs4_ff_layoutstat *layoutstat, ktime_t now) { @@ -624,8 +727,8 @@ nfs4_ff_layoutstat_start_io(struct nfs4_ff_layout_mirror *mirror, struct nfs4_flexfile_layout *ffl = FF_LAYOUT_FROM_HDR(mirror->layout); nfs4_ff_start_busy_timer(&layoutstat->busy_timer, now); - if (!mirror->start_time) - mirror->start_time = now; + if (!mirror->dss[dss_id].start_time) + mirror->dss[dss_id].start_time = now; if (mirror->report_interval != 0) report_interval = (s64)mirror->report_interval * 1000LL; else if (layoutstats_timer != 0) @@ -675,13 +778,16 @@ nfs4_ff_layout_stat_io_update_completed(struct nfs4_ff_layoutstat *layoutstat, static void nfs4_ff_layout_stat_io_start_read(struct inode *inode, struct nfs4_ff_layout_mirror *mirror, + u32 dss_id, __u64 requested, ktime_t now) { bool report; spin_lock(&mirror->lock); - report = nfs4_ff_layoutstat_start_io(mirror, &mirror->read_stat, now); - nfs4_ff_layout_stat_io_update_requested(&mirror->read_stat, requested); + report = nfs4_ff_layoutstat_start_io( + mirror, dss_id, &mirror->dss[dss_id].read_stat, now); + nfs4_ff_layout_stat_io_update_requested( + &mirror->dss[dss_id].read_stat, requested); set_bit(NFS4_FF_MIRROR_STAT_AVAIL, &mirror->flags); spin_unlock(&mirror->lock); @@ -692,11 +798,12 @@ nfs4_ff_layout_stat_io_start_read(struct inode *inode, static void nfs4_ff_layout_stat_io_end_read(struct rpc_task *task, struct nfs4_ff_layout_mirror *mirror, + u32 dss_id, __u64 requested, __u64 completed) { spin_lock(&mirror->lock); - nfs4_ff_layout_stat_io_update_completed(&mirror->read_stat, + nfs4_ff_layout_stat_io_update_completed(&mirror->dss[dss_id].read_stat, requested, completed, ktime_get(), task->tk_start); set_bit(NFS4_FF_MIRROR_STAT_AVAIL, &mirror->flags); @@ -706,13 +813,20 @@ nfs4_ff_layout_stat_io_end_read(struct rpc_task *task, static void nfs4_ff_layout_stat_io_start_write(struct inode *inode, struct nfs4_ff_layout_mirror *mirror, + u32 dss_id, __u64 requested, ktime_t now) { bool report; spin_lock(&mirror->lock); - report = nfs4_ff_layoutstat_start_io(mirror , &mirror->write_stat, now); - nfs4_ff_layout_stat_io_update_requested(&mirror->write_stat, requested); + report = nfs4_ff_layoutstat_start_io( + mirror, + dss_id, + &mirror->dss[dss_id].write_stat, + now); + nfs4_ff_layout_stat_io_update_requested( + &mirror->dss[dss_id].write_stat, + requested); set_bit(NFS4_FF_MIRROR_STAT_AVAIL, &mirror->flags); spin_unlock(&mirror->lock); @@ -723,6 +837,7 @@ nfs4_ff_layout_stat_io_start_write(struct inode *inode, static void nfs4_ff_layout_stat_io_end_write(struct rpc_task *task, struct nfs4_ff_layout_mirror *mirror, + u32 dss_id, __u64 requested, __u64 completed, enum nfs3_stable_how committed) @@ -731,25 +846,25 @@ nfs4_ff_layout_stat_io_end_write(struct rpc_task *task, requested = completed = 0; spin_lock(&mirror->lock); - nfs4_ff_layout_stat_io_update_completed(&mirror->write_stat, + nfs4_ff_layout_stat_io_update_completed(&mirror->dss[dss_id].write_stat, requested, completed, ktime_get(), task->tk_start); set_bit(NFS4_FF_MIRROR_STAT_AVAIL, &mirror->flags); spin_unlock(&mirror->lock); } static void -ff_layout_mark_ds_unreachable(struct pnfs_layout_segment *lseg, u32 idx) +ff_layout_mark_ds_unreachable(struct pnfs_layout_segment *lseg, u32 idx, u32 dss_id) { - struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx); + struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx, dss_id); if (devid) nfs4_mark_deviceid_unavailable(devid); } static void -ff_layout_mark_ds_reachable(struct pnfs_layout_segment *lseg, u32 idx) +ff_layout_mark_ds_reachable(struct pnfs_layout_segment *lseg, u32 idx, u32 dss_id) { - struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx); + struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx, dss_id); if (devid) nfs4_mark_deviceid_available(devid); @@ -758,69 +873,87 @@ ff_layout_mark_ds_reachable(struct pnfs_layout_segment *lseg, u32 idx) static struct nfs4_pnfs_ds * ff_layout_choose_ds_for_read(struct pnfs_layout_segment *lseg, u32 start_idx, u32 *best_idx, + u32 offset, u32 *dss_id, bool check_device) { struct nfs4_ff_layout_segment *fls = FF_LAYOUT_LSEG(lseg); struct nfs4_ff_layout_mirror *mirror; - struct nfs4_pnfs_ds *ds; + struct nfs4_pnfs_ds *ds = ERR_PTR(-EAGAIN); u32 idx; /* mirrors are initially sorted by efficiency */ for (idx = start_idx; idx < fls->mirror_array_cnt; idx++) { mirror = FF_LAYOUT_COMP(lseg, idx); - ds = nfs4_ff_layout_prepare_ds(lseg, mirror, false); - if (!ds) + *dss_id = nfs4_ff_layout_calc_dss_id( + fls->stripe_unit, + fls->mirror_array[idx]->dss_count, + offset); + ds = nfs4_ff_layout_prepare_ds(lseg, mirror, *dss_id, false); + if (IS_ERR(ds)) continue; if (check_device && - nfs4_test_deviceid_unavailable(&mirror->mirror_ds->id_node)) + nfs4_test_deviceid_unavailable(&mirror->dss[*dss_id].mirror_ds->id_node)) { + // reinitialize the error state in case if this is the last iteration + ds = ERR_PTR(-EINVAL); continue; + } *best_idx = idx; - return ds; + break; } - return NULL; + return ds; } static struct nfs4_pnfs_ds * ff_layout_choose_any_ds_for_read(struct pnfs_layout_segment *lseg, - u32 start_idx, u32 *best_idx) + u32 start_idx, u32 *best_idx, + u32 offset, u32 *dss_id) { - return ff_layout_choose_ds_for_read(lseg, start_idx, best_idx, false); + return ff_layout_choose_ds_for_read(lseg, start_idx, best_idx, + offset, dss_id, false); } static struct nfs4_pnfs_ds * ff_layout_choose_valid_ds_for_read(struct pnfs_layout_segment *lseg, - u32 start_idx, u32 *best_idx) + u32 start_idx, u32 *best_idx, + u32 offset, u32 *dss_id) { - return ff_layout_choose_ds_for_read(lseg, start_idx, best_idx, true); + return ff_layout_choose_ds_for_read(lseg, start_idx, best_idx, + offset, dss_id, true); } static struct nfs4_pnfs_ds * ff_layout_choose_best_ds_for_read(struct pnfs_layout_segment *lseg, - u32 start_idx, u32 *best_idx) + u32 start_idx, u32 *best_idx, + u32 offset, u32 *dss_id) { struct nfs4_pnfs_ds *ds; - ds = ff_layout_choose_valid_ds_for_read(lseg, start_idx, best_idx); - if (ds) + ds = ff_layout_choose_valid_ds_for_read(lseg, start_idx, best_idx, + offset, dss_id); + if (!IS_ERR(ds)) return ds; - return ff_layout_choose_any_ds_for_read(lseg, start_idx, best_idx); + return ff_layout_choose_any_ds_for_read(lseg, start_idx, best_idx, + offset, dss_id); } static struct nfs4_pnfs_ds * ff_layout_get_ds_for_read(struct nfs_pageio_descriptor *pgio, - u32 *best_idx) + u32 *best_idx, + u32 offset, + u32 *dss_id) { struct pnfs_layout_segment *lseg = pgio->pg_lseg; struct nfs4_pnfs_ds *ds; ds = ff_layout_choose_best_ds_for_read(lseg, pgio->pg_mirror_idx, - best_idx); - if (ds || !pgio->pg_mirror_idx) + best_idx, offset, dss_id); + if (!IS_ERR(ds) || !pgio->pg_mirror_idx) return ds; - return ff_layout_choose_best_ds_for_read(lseg, 0, best_idx); + return ff_layout_choose_best_ds_for_read(lseg, 0, best_idx, + offset, dss_id); } static void @@ -839,6 +972,56 @@ ff_layout_pg_get_read(struct nfs_pageio_descriptor *pgio, } } +static bool +ff_layout_lseg_is_striped(const struct nfs4_ff_layout_segment *fls) +{ + return fls->mirror_array[0]->dss_count > 1; +} + +/* + * ff_layout_pg_test(). Called by nfs_can_coalesce_requests() + * + * Return 0 if @req cannot be coalesced into @pgio, otherwise return the number + * of bytes (maximum @req->wb_bytes) that can be coalesced. + */ +static size_t +ff_layout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, + struct nfs_page *req) +{ + unsigned int size; + u64 p_stripe, r_stripe; + u32 stripe_offset; + u64 segment_offset = pgio->pg_lseg->pls_range.offset; + u32 stripe_unit = FF_LAYOUT_LSEG(pgio->pg_lseg)->stripe_unit; + + /* calls nfs_generic_pg_test */ + size = pnfs_generic_pg_test(pgio, prev, req); + if (!size) + return 0; + else if (!ff_layout_lseg_is_striped(FF_LAYOUT_LSEG(pgio->pg_lseg))) + return size; + + /* see if req and prev are in the same stripe */ + if (prev) { + p_stripe = (u64)req_offset(prev) - segment_offset; + r_stripe = (u64)req_offset(req) - segment_offset; + do_div(p_stripe, stripe_unit); + do_div(r_stripe, stripe_unit); + + if (p_stripe != r_stripe) + return 0; + } + + /* calculate remaining bytes in the current stripe */ + div_u64_rem((u64)req_offset(req) - segment_offset, + stripe_unit, + &stripe_offset); + WARN_ON_ONCE(stripe_offset > stripe_unit); + if (stripe_offset >= stripe_unit) + return 0; + return min(stripe_unit - (unsigned int)stripe_offset, size); +} + static void ff_layout_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) @@ -846,7 +1029,7 @@ ff_layout_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_pgio_mirror *pgm; struct nfs4_ff_layout_mirror *mirror; struct nfs4_pnfs_ds *ds; - u32 ds_idx; + u32 ds_idx, dss_id; if (NFS_SERVER(pgio->pg_inode)->flags & (NFS_MOUNT_SOFT|NFS_MOUNT_SOFTERR)) @@ -867,8 +1050,9 @@ retry: /* Reset wb_nio, since getting layout segment was successful */ req->wb_nio = 0; - ds = ff_layout_get_ds_for_read(pgio, &ds_idx); - if (!ds) { + ds = ff_layout_get_ds_for_read(pgio, &ds_idx, + req_offset(req), &dss_id); + if (IS_ERR(ds)) { if (!ff_layout_no_fallback_to_mds(pgio->pg_lseg)) goto out_mds; pnfs_generic_pg_cleanup(pgio); @@ -879,7 +1063,7 @@ retry: mirror = FF_LAYOUT_COMP(pgio->pg_lseg, ds_idx); pgm = &pgio->pg_mirrors[0]; - pgm->pg_bsize = mirror->mirror_ds->ds_versions[0].rsize; + pgm->pg_bsize = mirror->dss[dss_id].mirror_ds->ds_versions[0].rsize; pgio->pg_mirror_idx = ds_idx; return; @@ -916,7 +1100,7 @@ ff_layout_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs4_ff_layout_mirror *mirror; struct nfs_pgio_mirror *pgm; struct nfs4_pnfs_ds *ds; - u32 i; + u32 i, dss_id; retry: pnfs_generic_pg_check_layout(pgio, req); @@ -941,8 +1125,13 @@ retry: for (i = 0; i < pgio->pg_mirror_count; i++) { mirror = FF_LAYOUT_COMP(pgio->pg_lseg, i); - ds = nfs4_ff_layout_prepare_ds(pgio->pg_lseg, mirror, true); - if (!ds) { + dss_id = nfs4_ff_layout_calc_dss_id( + FF_LAYOUT_LSEG(pgio->pg_lseg)->stripe_unit, + mirror->dss_count, + req_offset(req)); + ds = nfs4_ff_layout_prepare_ds(pgio->pg_lseg, mirror, + dss_id, true); + if (IS_ERR(ds)) { if (!ff_layout_no_fallback_to_mds(pgio->pg_lseg)) goto out_mds; pnfs_generic_pg_cleanup(pgio); @@ -951,7 +1140,7 @@ retry: goto retry; } pgm = &pgio->pg_mirrors[i]; - pgm->pg_bsize = mirror->mirror_ds->ds_versions[0].wsize; + pgm->pg_bsize = mirror->dss[dss_id].mirror_ds->ds_versions[0].wsize; } if (NFS_SERVER(pgio->pg_inode)->flags & @@ -1017,14 +1206,14 @@ ff_layout_pg_get_mirror_write(struct nfs_pageio_descriptor *desc, u32 idx) static const struct nfs_pageio_ops ff_layout_pg_read_ops = { .pg_init = ff_layout_pg_init_read, - .pg_test = pnfs_generic_pg_test, + .pg_test = ff_layout_pg_test, .pg_doio = pnfs_generic_pg_readpages, .pg_cleanup = pnfs_generic_pg_cleanup, }; static const struct nfs_pageio_ops ff_layout_pg_write_ops = { .pg_init = ff_layout_pg_init_write, - .pg_test = pnfs_generic_pg_test, + .pg_test = ff_layout_pg_test, .pg_doio = pnfs_generic_pg_writepages, .pg_get_mirror_count = ff_layout_pg_get_mirror_count_write, .pg_cleanup = pnfs_generic_pg_cleanup, @@ -1072,11 +1261,15 @@ static void ff_layout_resend_pnfs_read(struct nfs_pgio_header *hdr) { u32 idx = hdr->pgio_mirror_idx + 1; u32 new_idx = 0; + u32 dss_id = 0; + struct nfs4_pnfs_ds *ds; - if (ff_layout_choose_any_ds_for_read(hdr->lseg, idx, &new_idx)) - ff_layout_send_layouterror(hdr->lseg); - else + ds = ff_layout_choose_any_ds_for_read(hdr->lseg, idx, &new_idx, + hdr->args.offset, &dss_id); + if (IS_ERR(ds)) pnfs_error_mark_layout_for_return(hdr->inode, hdr->lseg); + else + ff_layout_send_layouterror(hdr->lseg); pnfs_read_resend_pnfs(hdr, new_idx); } @@ -1105,42 +1298,53 @@ static void ff_layout_reset_read(struct nfs_pgio_header *hdr) } static int ff_layout_async_handle_error_v4(struct rpc_task *task, + u32 op_status, struct nfs4_state *state, struct nfs_client *clp, struct pnfs_layout_segment *lseg, - u32 idx) + u32 idx, u32 dss_id) { struct pnfs_layout_hdr *lo = lseg->pls_layout; struct inode *inode = lo->plh_inode; - struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx); + struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx, dss_id); struct nfs4_slot_table *tbl = &clp->cl_session->fc_slot_table; - switch (task->tk_status) { - case -NFS4ERR_BADSESSION: - case -NFS4ERR_BADSLOT: - case -NFS4ERR_BAD_HIGH_SLOT: - case -NFS4ERR_DEADSESSION: - case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: - case -NFS4ERR_SEQ_FALSE_RETRY: - case -NFS4ERR_SEQ_MISORDERED: + switch (op_status) { + case NFS4_OK: + case NFS4ERR_NXIO: + break; + case NFSERR_PERM: + if (!task->tk_xprt) + break; + xprt_force_disconnect(task->tk_xprt); + goto out_retry; + case NFS4ERR_BADSESSION: + case NFS4ERR_BADSLOT: + case NFS4ERR_BAD_HIGH_SLOT: + case NFS4ERR_DEADSESSION: + case NFS4ERR_CONN_NOT_BOUND_TO_SESSION: + case NFS4ERR_SEQ_FALSE_RETRY: + case NFS4ERR_SEQ_MISORDERED: dprintk("%s ERROR %d, Reset session. Exchangeid " "flags 0x%x\n", __func__, task->tk_status, clp->cl_exchange_flags); nfs4_schedule_session_recovery(clp->cl_session, task->tk_status); - break; - case -NFS4ERR_DELAY: - case -NFS4ERR_GRACE: + goto out_retry; + case NFS4ERR_DELAY: + nfs_inc_stats(lseg->pls_layout->plh_inode, NFSIOS_DELAY); + fallthrough; + case NFS4ERR_GRACE: rpc_delay(task, FF_LAYOUT_POLL_RETRY_MAX); - break; - case -NFS4ERR_RETRY_UNCACHED_REP: - break; + goto out_retry; + case NFS4ERR_RETRY_UNCACHED_REP: + goto out_retry; /* Invalidate Layout errors */ - case -NFS4ERR_PNFS_NO_LAYOUT: - case -ESTALE: /* mapped NFS4ERR_STALE */ - case -EBADHANDLE: /* mapped NFS4ERR_BADHANDLE */ - case -EISDIR: /* mapped NFS4ERR_ISDIR */ - case -NFS4ERR_FHEXPIRED: - case -NFS4ERR_WRONG_TYPE: + case NFS4ERR_PNFS_NO_LAYOUT: + case NFS4ERR_STALE: + case NFS4ERR_BADHANDLE: + case NFS4ERR_ISDIR: + case NFS4ERR_FHEXPIRED: + case NFS4ERR_WRONG_TYPE: dprintk("%s Invalid layout error %d\n", __func__, task->tk_status); /* @@ -1153,6 +1357,11 @@ static int ff_layout_async_handle_error_v4(struct rpc_task *task, pnfs_destroy_layout(NFS_I(inode)); rpc_wake_up(&tbl->slot_tbl_waitq); goto reset; + default: + break; + } + + switch (task->tk_status) { /* RPC connection errors */ case -ENETDOWN: case -ENETUNREACH: @@ -1172,26 +1381,55 @@ static int ff_layout_async_handle_error_v4(struct rpc_task *task, nfs4_delete_deviceid(devid->ld, devid->nfs_client, &devid->deviceid); rpc_wake_up(&tbl->slot_tbl_waitq); - fallthrough; + break; default: - if (ff_layout_avoid_mds_available_ds(lseg)) - return -NFS4ERR_RESET_TO_PNFS; -reset: - dprintk("%s Retry through MDS. Error %d\n", __func__, - task->tk_status); - return -NFS4ERR_RESET_TO_MDS; + break; } + + if (ff_layout_avoid_mds_available_ds(lseg)) + return -NFS4ERR_RESET_TO_PNFS; +reset: + dprintk("%s Retry through MDS. Error %d\n", __func__, + task->tk_status); + return -NFS4ERR_RESET_TO_MDS; + +out_retry: task->tk_status = 0; return -EAGAIN; } /* Retry all errors through either pNFS or MDS except for -EJUKEBOX */ static int ff_layout_async_handle_error_v3(struct rpc_task *task, + u32 op_status, struct nfs_client *clp, struct pnfs_layout_segment *lseg, - u32 idx) + u32 idx, u32 dss_id) { - struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx); + struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx, dss_id); + + switch (op_status) { + case NFS_OK: + case NFSERR_NXIO: + break; + case NFSERR_PERM: + if (!task->tk_xprt) + break; + xprt_force_disconnect(task->tk_xprt); + goto out_retry; + case NFSERR_ACCES: + case NFSERR_BADHANDLE: + case NFSERR_FBIG: + case NFSERR_IO: + case NFSERR_NOSPC: + case NFSERR_ROFS: + case NFSERR_STALE: + goto out_reset_to_pnfs; + case NFSERR_JUKEBOX: + nfs_inc_stats(lseg->pls_layout->plh_inode, NFSIOS_DELAY); + goto out_retry; + default: + break; + } switch (task->tk_status) { /* File access problems. Don't mark the device as unavailable */ @@ -1216,6 +1454,7 @@ static int ff_layout_async_handle_error_v3(struct rpc_task *task, nfs4_delete_deviceid(devid->ld, devid->nfs_client, &devid->deviceid); } +out_reset_to_pnfs: /* FIXME: Need to prevent infinite looping here. */ return -NFS4ERR_RESET_TO_PNFS; out_retry: @@ -1226,15 +1465,16 @@ out_retry: } static int ff_layout_async_handle_error(struct rpc_task *task, + u32 op_status, struct nfs4_state *state, struct nfs_client *clp, struct pnfs_layout_segment *lseg, - u32 idx) + u32 idx, u32 dss_id) { int vers = clp->cl_nfs_mod->rpc_vers->number; if (task->tk_status >= 0) { - ff_layout_mark_ds_reachable(lseg, idx); + ff_layout_mark_ds_reachable(lseg, idx, dss_id); return 0; } @@ -1244,10 +1484,11 @@ static int ff_layout_async_handle_error(struct rpc_task *task, switch (vers) { case 3: - return ff_layout_async_handle_error_v3(task, clp, lseg, idx); + return ff_layout_async_handle_error_v3(task, op_status, clp, + lseg, idx, dss_id); case 4: - return ff_layout_async_handle_error_v4(task, state, clp, - lseg, idx); + return ff_layout_async_handle_error_v4(task, op_status, state, + clp, lseg, idx, dss_id); default: /* should never happen */ WARN_ON_ONCE(1); @@ -1256,7 +1497,7 @@ static int ff_layout_async_handle_error(struct rpc_task *task, } static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg, - u32 idx, u64 offset, u64 length, + u32 idx, u32 dss_id, u64 offset, u64 length, u32 *op_status, int opnum, int error) { struct nfs4_ff_layout_mirror *mirror; @@ -1294,15 +1535,16 @@ static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg, mirror = FF_LAYOUT_COMP(lseg, idx); err = ff_layout_track_ds_error(FF_LAYOUT_FROM_HDR(lseg->pls_layout), - mirror, offset, length, status, opnum, + mirror, dss_id, offset, length, status, opnum, nfs_io_gfp_mask()); switch (status) { case NFS4ERR_DELAY: case NFS4ERR_GRACE: + case NFS4ERR_PERM: break; case NFS4ERR_NXIO: - ff_layout_mark_ds_unreachable(lseg, idx); + ff_layout_mark_ds_unreachable(lseg, idx, dss_id); /* * Don't return the layout if this is a read and we still * have layouts to try @@ -1322,19 +1564,27 @@ static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg, static int ff_layout_read_done_cb(struct rpc_task *task, struct nfs_pgio_header *hdr) { + struct nfs4_ff_layout_segment *flseg = FF_LAYOUT_LSEG(hdr->lseg); + u32 dss_id = nfs4_ff_layout_calc_dss_id( + flseg->stripe_unit, + flseg->mirror_array[hdr->pgio_mirror_idx]->dss_count, + hdr->args.offset); int err; if (task->tk_status < 0) { - ff_layout_io_track_ds_error(hdr->lseg, hdr->pgio_mirror_idx, + ff_layout_io_track_ds_error(hdr->lseg, + hdr->pgio_mirror_idx, dss_id, hdr->args.offset, hdr->args.count, &hdr->res.op_status, OP_READ, task->tk_status); - trace_ff_layout_read_error(hdr); + trace_ff_layout_read_error(hdr, task->tk_status); } - err = ff_layout_async_handle_error(task, hdr->args.context->state, + err = ff_layout_async_handle_error(task, hdr->res.op_status, + hdr->args.context->state, hdr->ds_clp, hdr->lseg, - hdr->pgio_mirror_idx); + hdr->pgio_mirror_idx, + dss_id); trace_nfs4_pnfs_read(hdr, err); clear_bit(NFS_IOHDR_RESEND_PNFS, &hdr->flags); @@ -1390,23 +1640,47 @@ ff_layout_set_layoutcommit(struct inode *inode, static void ff_layout_read_record_layoutstats_start(struct rpc_task *task, struct nfs_pgio_header *hdr) { + struct nfs4_ff_layout_mirror *mirror; + u32 dss_id; + if (test_and_set_bit(NFS_IOHDR_STAT, &hdr->flags)) return; - nfs4_ff_layout_stat_io_start_read(hdr->inode, - FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx), - hdr->args.count, - task->tk_start); + + mirror = FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx); + dss_id = nfs4_ff_layout_calc_dss_id( + FF_LAYOUT_LSEG(hdr->lseg)->stripe_unit, + mirror->dss_count, + hdr->args.offset); + + nfs4_ff_layout_stat_io_start_read( + hdr->inode, + mirror, + dss_id, + hdr->args.count, + task->tk_start); } static void ff_layout_read_record_layoutstats_done(struct rpc_task *task, struct nfs_pgio_header *hdr) { + struct nfs4_ff_layout_mirror *mirror; + u32 dss_id; + if (!test_and_clear_bit(NFS_IOHDR_STAT, &hdr->flags)) return; - nfs4_ff_layout_stat_io_end_read(task, - FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx), - hdr->args.count, - hdr->res.count); + + mirror = FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx); + dss_id = nfs4_ff_layout_calc_dss_id( + FF_LAYOUT_LSEG(hdr->lseg)->stripe_unit, + mirror->dss_count, + hdr->args.offset); + + nfs4_ff_layout_stat_io_end_read( + task, + mirror, + dss_id, + hdr->args.count, + hdr->res.count); set_bit(NFS_LSEG_LAYOUTRETURN, &hdr->lseg->pls_flags); } @@ -1494,20 +1768,28 @@ static void ff_layout_read_release(void *data) static int ff_layout_write_done_cb(struct rpc_task *task, struct nfs_pgio_header *hdr) { + struct nfs4_ff_layout_segment *flseg = FF_LAYOUT_LSEG(hdr->lseg); + u32 dss_id = nfs4_ff_layout_calc_dss_id( + flseg->stripe_unit, + flseg->mirror_array[hdr->pgio_mirror_idx]->dss_count, + hdr->args.offset); loff_t end_offs = 0; int err; if (task->tk_status < 0) { - ff_layout_io_track_ds_error(hdr->lseg, hdr->pgio_mirror_idx, + ff_layout_io_track_ds_error(hdr->lseg, + hdr->pgio_mirror_idx, dss_id, hdr->args.offset, hdr->args.count, &hdr->res.op_status, OP_WRITE, task->tk_status); - trace_ff_layout_write_error(hdr); + trace_ff_layout_write_error(hdr, task->tk_status); } - err = ff_layout_async_handle_error(task, hdr->args.context->state, + err = ff_layout_async_handle_error(task, hdr->res.op_status, + hdr->args.context->state, hdr->ds_clp, hdr->lseg, - hdr->pgio_mirror_idx); + hdr->pgio_mirror_idx, + dss_id); trace_nfs4_pnfs_write(hdr, err); clear_bit(NFS_IOHDR_RESEND_PNFS, &hdr->flags); @@ -1545,17 +1827,20 @@ static int ff_layout_commit_done_cb(struct rpc_task *task, struct nfs_commit_data *data) { int err; + u32 idx = calc_mirror_idx_from_commit(data->lseg, data->ds_commit_index); + u32 dss_id = calc_dss_id_from_commit(data->lseg, data->ds_commit_index); if (task->tk_status < 0) { - ff_layout_io_track_ds_error(data->lseg, data->ds_commit_index, + ff_layout_io_track_ds_error(data->lseg, idx, dss_id, data->args.offset, data->args.count, &data->res.op_status, OP_COMMIT, task->tk_status); - trace_ff_layout_commit_error(data); + trace_ff_layout_commit_error(data, task->tk_status); } - err = ff_layout_async_handle_error(task, NULL, data->ds_clp, - data->lseg, data->ds_commit_index); + err = ff_layout_async_handle_error(task, data->res.op_status, + NULL, data->ds_clp, data->lseg, idx, + dss_id); trace_nfs4_pnfs_commit_ds(data, err); switch (err) { @@ -1574,30 +1859,54 @@ static int ff_layout_commit_done_cb(struct rpc_task *task, } ff_layout_set_layoutcommit(data->inode, data->lseg, data->lwb); - return 0; } static void ff_layout_write_record_layoutstats_start(struct rpc_task *task, struct nfs_pgio_header *hdr) { + struct nfs4_ff_layout_mirror *mirror; + u32 dss_id; + if (test_and_set_bit(NFS_IOHDR_STAT, &hdr->flags)) return; - nfs4_ff_layout_stat_io_start_write(hdr->inode, - FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx), - hdr->args.count, - task->tk_start); + + mirror = FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx); + dss_id = nfs4_ff_layout_calc_dss_id( + FF_LAYOUT_LSEG(hdr->lseg)->stripe_unit, + mirror->dss_count, + hdr->args.offset); + + nfs4_ff_layout_stat_io_start_write( + hdr->inode, + mirror, + dss_id, + hdr->args.count, + task->tk_start); } static void ff_layout_write_record_layoutstats_done(struct rpc_task *task, struct nfs_pgio_header *hdr) { + struct nfs4_ff_layout_mirror *mirror; + u32 dss_id; + if (!test_and_clear_bit(NFS_IOHDR_STAT, &hdr->flags)) return; - nfs4_ff_layout_stat_io_end_write(task, - FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx), - hdr->args.count, hdr->res.count, - hdr->res.verf->committed); + + mirror = FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx); + dss_id = nfs4_ff_layout_calc_dss_id( + FF_LAYOUT_LSEG(hdr->lseg)->stripe_unit, + mirror->dss_count, + hdr->args.offset); + + nfs4_ff_layout_stat_io_end_write( + task, + mirror, + dss_id, + hdr->args.count, + hdr->res.count, + hdr->res.verf->committed); set_bit(NFS_LSEG_LAYOUTRETURN, &hdr->lseg->pls_flags); } @@ -1680,10 +1989,16 @@ static void ff_layout_write_release(void *data) static void ff_layout_commit_record_layoutstats_start(struct rpc_task *task, struct nfs_commit_data *cdata) { + u32 idx, dss_id; + if (test_and_set_bit(NFS_IOHDR_STAT, &cdata->flags)) return; + + idx = calc_mirror_idx_from_commit(cdata->lseg, cdata->ds_commit_index); + dss_id = calc_dss_id_from_commit(cdata->lseg, cdata->ds_commit_index); nfs4_ff_layout_stat_io_start_write(cdata->inode, - FF_LAYOUT_COMP(cdata->lseg, cdata->ds_commit_index), + FF_LAYOUT_COMP(cdata->lseg, idx), + dss_id, 0, task->tk_start); } @@ -1692,6 +2007,7 @@ static void ff_layout_commit_record_layoutstats_done(struct rpc_task *task, { struct nfs_page *req; __u64 count = 0; + u32 idx, dss_id; if (!test_and_clear_bit(NFS_IOHDR_STAT, &cdata->flags)) return; @@ -1700,8 +2016,12 @@ static void ff_layout_commit_record_layoutstats_done(struct rpc_task *task, list_for_each_entry(req, &cdata->pages, wb_list) count += req->wb_bytes; } + + idx = calc_mirror_idx_from_commit(cdata->lseg, cdata->ds_commit_index); + dss_id = calc_dss_id_from_commit(cdata->lseg, cdata->ds_commit_index); nfs4_ff_layout_stat_io_end_write(task, - FF_LAYOUT_COMP(cdata->lseg, cdata->ds_commit_index), + FF_LAYOUT_COMP(cdata->lseg, idx), + dss_id, count, count, NFS_FILE_SYNC); set_bit(NFS_LSEG_LAYOUTRETURN, &cdata->lseg->pls_flags); } @@ -1815,26 +2135,34 @@ ff_layout_read_pagelist(struct nfs_pgio_header *hdr) u32 idx = hdr->pgio_mirror_idx; int vers; struct nfs_fh *fh; + u32 dss_id; + bool ds_fatal_error = false; dprintk("--> %s ino %lu pgbase %u req %zu@%llu\n", __func__, hdr->inode->i_ino, hdr->args.pgbase, (size_t)hdr->args.count, offset); mirror = FF_LAYOUT_COMP(lseg, idx); - ds = nfs4_ff_layout_prepare_ds(lseg, mirror, false); - if (!ds) + dss_id = nfs4_ff_layout_calc_dss_id( + FF_LAYOUT_LSEG(lseg)->stripe_unit, + mirror->dss_count, + offset); + ds = nfs4_ff_layout_prepare_ds(lseg, mirror, dss_id, false); + if (IS_ERR(ds)) { + ds_fatal_error = nfs_error_is_fatal(PTR_ERR(ds)); goto out_failed; + } ds_clnt = nfs4_ff_find_or_create_ds_client(mirror, ds->ds_clp, - hdr->inode); + hdr->inode, dss_id); if (IS_ERR(ds_clnt)) goto out_failed; - ds_cred = ff_layout_get_ds_cred(mirror, &lseg->pls_range, hdr->cred); + ds_cred = ff_layout_get_ds_cred(mirror, &lseg->pls_range, hdr->cred, dss_id); if (!ds_cred) goto out_failed; - vers = nfs4_ff_layout_ds_version(mirror); + vers = nfs4_ff_layout_ds_version(mirror, dss_id); dprintk("%s USE DS: %s cl_count %d vers %d\n", __func__, ds->ds_remotestr, refcount_read(&ds->ds_clp->cl_count), vers); @@ -1842,11 +2170,11 @@ ff_layout_read_pagelist(struct nfs_pgio_header *hdr) hdr->pgio_done_cb = ff_layout_read_done_cb; refcount_inc(&ds->ds_clp->cl_count); hdr->ds_clp = ds->ds_clp; - fh = nfs4_ff_layout_select_ds_fh(mirror); + fh = nfs4_ff_layout_select_ds_fh(mirror, dss_id); if (fh) hdr->args.fh = fh; - nfs4_ff_layout_select_ds_stateid(mirror, &hdr->args.stateid); + nfs4_ff_layout_select_ds_stateid(mirror, dss_id, &hdr->args.stateid); /* * Note that if we ever decide to split across DSes, @@ -1856,7 +2184,8 @@ ff_layout_read_pagelist(struct nfs_pgio_header *hdr) hdr->mds_offset = offset; /* Start IO accounting for local read */ - localio = ff_local_open_fh(lseg, idx, ds->ds_clp, ds_cred, fh, FMODE_READ); + localio = ff_local_open_fh(lseg, idx, dss_id, ds->ds_clp, ds_cred, fh, + FMODE_READ); if (localio) { hdr->task.tk_start = ktime_get(); ff_layout_read_record_layoutstats_start(&hdr->task, hdr); @@ -1871,7 +2200,7 @@ ff_layout_read_pagelist(struct nfs_pgio_header *hdr) return PNFS_ATTEMPTED; out_failed: - if (ff_layout_avoid_mds_available_ds(lseg)) + if (ff_layout_avoid_mds_available_ds(lseg) && !ds_fatal_error) return PNFS_TRY_AGAIN; trace_pnfs_mds_fallback_read_pagelist(hdr->inode, hdr->args.offset, hdr->args.count, @@ -1893,22 +2222,30 @@ ff_layout_write_pagelist(struct nfs_pgio_header *hdr, int sync) int vers; struct nfs_fh *fh; u32 idx = hdr->pgio_mirror_idx; + u32 dss_id; + bool ds_fatal_error = false; mirror = FF_LAYOUT_COMP(lseg, idx); - ds = nfs4_ff_layout_prepare_ds(lseg, mirror, true); - if (!ds) + dss_id = nfs4_ff_layout_calc_dss_id( + FF_LAYOUT_LSEG(lseg)->stripe_unit, + mirror->dss_count, + offset); + ds = nfs4_ff_layout_prepare_ds(lseg, mirror, dss_id, true); + if (IS_ERR(ds)) { + ds_fatal_error = nfs_error_is_fatal(PTR_ERR(ds)); goto out_failed; + } ds_clnt = nfs4_ff_find_or_create_ds_client(mirror, ds->ds_clp, - hdr->inode); + hdr->inode, dss_id); if (IS_ERR(ds_clnt)) goto out_failed; - ds_cred = ff_layout_get_ds_cred(mirror, &lseg->pls_range, hdr->cred); + ds_cred = ff_layout_get_ds_cred(mirror, &lseg->pls_range, hdr->cred, dss_id); if (!ds_cred) goto out_failed; - vers = nfs4_ff_layout_ds_version(mirror); + vers = nfs4_ff_layout_ds_version(mirror, dss_id); dprintk("%s ino %lu sync %d req %zu@%llu DS: %s cl_count %d vers %d\n", __func__, hdr->inode->i_ino, sync, (size_t) hdr->args.count, @@ -1918,12 +2255,12 @@ ff_layout_write_pagelist(struct nfs_pgio_header *hdr, int sync) hdr->pgio_done_cb = ff_layout_write_done_cb; refcount_inc(&ds->ds_clp->cl_count); hdr->ds_clp = ds->ds_clp; - hdr->ds_commit_idx = idx; - fh = nfs4_ff_layout_select_ds_fh(mirror); + hdr->ds_commit_idx = calc_commit_idx(lseg, idx, dss_id); + fh = nfs4_ff_layout_select_ds_fh(mirror, dss_id); if (fh) hdr->args.fh = fh; - nfs4_ff_layout_select_ds_stateid(mirror, &hdr->args.stateid); + nfs4_ff_layout_select_ds_stateid(mirror, dss_id, &hdr->args.stateid); /* * Note that if we ever decide to split across DSes, @@ -1932,7 +2269,7 @@ ff_layout_write_pagelist(struct nfs_pgio_header *hdr, int sync) hdr->args.offset = offset; /* Start IO accounting for local write */ - localio = ff_local_open_fh(lseg, idx, ds->ds_clp, ds_cred, fh, + localio = ff_local_open_fh(lseg, idx, dss_id, ds->ds_clp, ds_cred, fh, FMODE_READ|FMODE_WRITE); if (localio) { hdr->task.tk_start = ktime_get(); @@ -1948,7 +2285,7 @@ ff_layout_write_pagelist(struct nfs_pgio_header *hdr, int sync) return PNFS_ATTEMPTED; out_failed: - if (ff_layout_avoid_mds_available_ds(lseg)) + if (ff_layout_avoid_mds_available_ds(lseg) && !ds_fatal_error) return PNFS_TRY_AGAIN; trace_pnfs_mds_fallback_write_pagelist(hdr->inode, hdr->args.offset, hdr->args.count, @@ -1956,20 +2293,15 @@ out_failed: return PNFS_NOT_ATTEMPTED; } -static u32 calc_ds_index_from_commit(struct pnfs_layout_segment *lseg, u32 i) -{ - return i; -} - static struct nfs_fh * -select_ds_fh_from_commit(struct pnfs_layout_segment *lseg, u32 i) +select_ds_fh_from_commit(struct pnfs_layout_segment *lseg, u32 i, u32 dss_id) { struct nfs4_ff_layout_segment *flseg = FF_LAYOUT_LSEG(lseg); /* FIXME: Assume that there is only one NFS version available * for the DS. */ - return &flseg->mirror_array[i]->fh_versions[0]; + return &flseg->mirror_array[i]->dss[dss_id].fh_versions[0]; } static int ff_layout_initiate_commit(struct nfs_commit_data *data, int how) @@ -1980,7 +2312,7 @@ static int ff_layout_initiate_commit(struct nfs_commit_data *data, int how) struct nfsd_file *localio; struct nfs4_ff_layout_mirror *mirror; const struct cred *ds_cred; - u32 idx; + u32 idx, dss_id; int vers, ret; struct nfs_fh *fh; @@ -1988,22 +2320,23 @@ static int ff_layout_initiate_commit(struct nfs_commit_data *data, int how) test_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags))) goto out_err; - idx = calc_ds_index_from_commit(lseg, data->ds_commit_index); + idx = calc_mirror_idx_from_commit(lseg, data->ds_commit_index); mirror = FF_LAYOUT_COMP(lseg, idx); - ds = nfs4_ff_layout_prepare_ds(lseg, mirror, true); - if (!ds) + dss_id = calc_dss_id_from_commit(lseg, data->ds_commit_index); + ds = nfs4_ff_layout_prepare_ds(lseg, mirror, dss_id, true); + if (IS_ERR(ds)) goto out_err; ds_clnt = nfs4_ff_find_or_create_ds_client(mirror, ds->ds_clp, - data->inode); + data->inode, dss_id); if (IS_ERR(ds_clnt)) goto out_err; - ds_cred = ff_layout_get_ds_cred(mirror, &lseg->pls_range, data->cred); + ds_cred = ff_layout_get_ds_cred(mirror, &lseg->pls_range, data->cred, dss_id); if (!ds_cred) goto out_err; - vers = nfs4_ff_layout_ds_version(mirror); + vers = nfs4_ff_layout_ds_version(mirror, dss_id); dprintk("%s ino %lu, how %d cl_count %d vers %d\n", __func__, data->inode->i_ino, how, refcount_read(&ds->ds_clp->cl_count), @@ -2012,12 +2345,12 @@ static int ff_layout_initiate_commit(struct nfs_commit_data *data, int how) data->cred = ds_cred; refcount_inc(&ds->ds_clp->cl_count); data->ds_clp = ds->ds_clp; - fh = select_ds_fh_from_commit(lseg, data->ds_commit_index); + fh = select_ds_fh_from_commit(lseg, idx, dss_id); if (fh) data->args.fh = fh; /* Start IO accounting for local commit */ - localio = ff_local_open_fh(lseg, idx, ds->ds_clp, ds_cred, fh, + localio = ff_local_open_fh(lseg, idx, dss_id, ds->ds_clp, ds_cred, fh, FMODE_READ|FMODE_WRITE); if (localio) { data->task.tk_start = ktime_get(); @@ -2081,25 +2414,28 @@ static void ff_layout_cancel_io(struct pnfs_layout_segment *lseg) struct nfs4_pnfs_ds *ds; struct nfs_client *ds_clp; struct rpc_clnt *clnt; - u32 idx; + u32 idx, dss_id; for (idx = 0; idx < flseg->mirror_array_cnt; idx++) { mirror = flseg->mirror_array[idx]; - mirror_ds = mirror->mirror_ds; - if (IS_ERR_OR_NULL(mirror_ds)) - continue; - ds = mirror->mirror_ds->ds; - if (!ds) - continue; - ds_clp = ds->ds_clp; - if (!ds_clp) - continue; - clnt = ds_clp->cl_rpcclient; - if (!clnt) - continue; - if (!rpc_cancel_tasks(clnt, -EAGAIN, ff_layout_match_io, lseg)) - continue; - rpc_clnt_disconnect(clnt); + for (dss_id = 0; dss_id < mirror->dss_count; dss_id++) { + mirror_ds = mirror->dss[dss_id].mirror_ds; + if (IS_ERR_OR_NULL(mirror_ds)) + continue; + ds = mirror->dss[dss_id].mirror_ds->ds; + if (!ds) + continue; + ds_clp = ds->ds_clp; + if (!ds_clp) + continue; + clnt = ds_clp->cl_rpcclient; + if (!clnt) + continue; + if (!rpc_cancel_tasks(clnt, -EAGAIN, + ff_layout_match_io, lseg)) + continue; + rpc_clnt_disconnect(clnt); + } } } @@ -2121,8 +2457,9 @@ ff_layout_setup_ds_info(struct pnfs_ds_commit_info *fl_cinfo, struct nfs4_ff_layout_segment *flseg = FF_LAYOUT_LSEG(lseg); struct inode *inode = lseg->pls_layout->plh_inode; struct pnfs_commit_array *array, *new; + u32 size = flseg->mirror_array_cnt * flseg->mirror_array[0]->dss_count; - new = pnfs_alloc_commit_array(flseg->mirror_array_cnt, + new = pnfs_alloc_commit_array(size, nfs_io_gfp_mask()); if (new) { spin_lock(&inode->i_lock); @@ -2486,11 +2823,11 @@ ff_layout_encode_io_latency(struct xdr_stream *xdr, static void ff_layout_encode_ff_layoutupdate(struct xdr_stream *xdr, const struct nfs42_layoutstat_devinfo *devinfo, - struct nfs4_ff_layout_mirror *mirror) + struct nfs4_ff_layout_ds_stripe *dss_info) { struct nfs4_pnfs_ds_addr *da; - struct nfs4_pnfs_ds *ds = mirror->mirror_ds->ds; - struct nfs_fh *fh = &mirror->fh_versions[0]; + struct nfs4_pnfs_ds *ds = dss_info->mirror_ds->ds; + struct nfs_fh *fh = &dss_info->fh_versions[0]; __be32 *p; da = list_first_entry(&ds->ds_addrs, struct nfs4_pnfs_ds_addr, da_node); @@ -2502,13 +2839,17 @@ ff_layout_encode_ff_layoutupdate(struct xdr_stream *xdr, p = xdr_reserve_space(xdr, 4 + fh->size); xdr_encode_opaque(p, fh->data, fh->size); /* ff_io_latency4 read */ - spin_lock(&mirror->lock); - ff_layout_encode_io_latency(xdr, &mirror->read_stat.io_stat); + spin_lock(&dss_info->mirror->lock); + ff_layout_encode_io_latency(xdr, + &dss_info->read_stat.io_stat); /* ff_io_latency4 write */ - ff_layout_encode_io_latency(xdr, &mirror->write_stat.io_stat); - spin_unlock(&mirror->lock); + ff_layout_encode_io_latency(xdr, + &dss_info->write_stat.io_stat); + spin_unlock(&dss_info->mirror->lock); /* nfstime4 */ - ff_layout_encode_nfstime(xdr, ktime_sub(ktime_get(), mirror->start_time)); + ff_layout_encode_nfstime(xdr, + ktime_sub(ktime_get(), + dss_info->start_time)); /* bool */ p = xdr_reserve_space(xdr, 4); *p = cpu_to_be32(false); @@ -2532,7 +2873,8 @@ ff_layout_encode_layoutstats(struct xdr_stream *xdr, const void *args, static void ff_layout_free_layoutstats(struct nfs4_xdr_opaque_data *opaque) { - struct nfs4_ff_layout_mirror *mirror = opaque->data; + struct nfs4_ff_layout_ds_stripe *dss_info = opaque->data; + struct nfs4_ff_layout_mirror *mirror = dss_info->mirror; ff_layout_put_mirror(mirror); } @@ -2549,37 +2891,47 @@ ff_layout_mirror_prepare_stats(struct pnfs_layout_hdr *lo, { struct nfs4_flexfile_layout *ff_layout = FF_LAYOUT_FROM_HDR(lo); struct nfs4_ff_layout_mirror *mirror; + struct nfs4_ff_layout_ds_stripe *dss_info; struct nfs4_deviceid_node *dev; - int i = 0; + int i = 0, dss_id; list_for_each_entry(mirror, &ff_layout->mirrors, mirrors) { - if (i >= dev_limit) - break; - if (IS_ERR_OR_NULL(mirror->mirror_ds)) - continue; - if (!test_and_clear_bit(NFS4_FF_MIRROR_STAT_AVAIL, - &mirror->flags) && - type != NFS4_FF_OP_LAYOUTRETURN) - continue; - /* mirror refcount put in cleanup_layoutstats */ - if (!refcount_inc_not_zero(&mirror->ref)) - continue; - dev = &mirror->mirror_ds->id_node; - memcpy(&devinfo->dev_id, &dev->deviceid, NFS4_DEVICEID4_SIZE); - devinfo->offset = 0; - devinfo->length = NFS4_MAX_UINT64; - spin_lock(&mirror->lock); - devinfo->read_count = mirror->read_stat.io_stat.ops_completed; - devinfo->read_bytes = mirror->read_stat.io_stat.bytes_completed; - devinfo->write_count = mirror->write_stat.io_stat.ops_completed; - devinfo->write_bytes = mirror->write_stat.io_stat.bytes_completed; - spin_unlock(&mirror->lock); - devinfo->layout_type = LAYOUT_FLEX_FILES; - devinfo->ld_private.ops = &layoutstat_ops; - devinfo->ld_private.data = mirror; - - devinfo++; - i++; + for (dss_id = 0; dss_id < mirror->dss_count; ++dss_id) { + dss_info = &mirror->dss[dss_id]; + if (i >= dev_limit) + break; + if (IS_ERR_OR_NULL(dss_info->mirror_ds)) + continue; + if (!test_and_clear_bit(NFS4_FF_MIRROR_STAT_AVAIL, + &mirror->flags) && + type != NFS4_FF_OP_LAYOUTRETURN) + continue; + /* mirror refcount put in cleanup_layoutstats */ + if (!refcount_inc_not_zero(&mirror->ref)) + continue; + dev = &dss_info->mirror_ds->id_node; + memcpy(&devinfo->dev_id, + &dev->deviceid, + NFS4_DEVICEID4_SIZE); + devinfo->offset = 0; + devinfo->length = NFS4_MAX_UINT64; + spin_lock(&mirror->lock); + devinfo->read_count = + dss_info->read_stat.io_stat.ops_completed; + devinfo->read_bytes = + dss_info->read_stat.io_stat.bytes_completed; + devinfo->write_count = + dss_info->write_stat.io_stat.ops_completed; + devinfo->write_bytes = + dss_info->write_stat.io_stat.bytes_completed; + spin_unlock(&mirror->lock); + devinfo->layout_type = LAYOUT_FLEX_FILES; + devinfo->ld_private.ops = &layoutstat_ops; + devinfo->ld_private.data = &mirror->dss[dss_id]; + + devinfo++; + i++; + } } return i; } diff --git a/fs/nfs/flexfilelayout/flexfilelayout.h b/fs/nfs/flexfilelayout/flexfilelayout.h index 095df09017a5..17a008c8e97c 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.h +++ b/fs/nfs/flexfilelayout/flexfilelayout.h @@ -21,6 +21,8 @@ * due to network error etc. */ #define NFS4_FLEXFILE_LAYOUT_MAX_MIRROR_CNT 4096 +#define NFS4_FLEXFILE_LAYOUT_MAX_STRIPE_CNT 4096 + /* LAYOUTSTATS report interval in ms */ #define FF_LAYOUTSTATS_REPORT_INTERVAL (60000L) #define FF_LAYOUTSTATS_MAXDEV 4 @@ -71,12 +73,12 @@ struct nfs4_ff_layoutstat { struct nfs4_ff_busy_timer busy_timer; }; -struct nfs4_ff_layout_mirror { - struct pnfs_layout_hdr *layout; - struct list_head mirrors; - u32 ds_count; - u32 efficiency; +struct nfs4_ff_layout_mirror; + +struct nfs4_ff_layout_ds_stripe { + struct nfs4_ff_layout_mirror *mirror; struct nfs4_deviceid devid; + u32 efficiency; struct nfs4_ff_layout_ds *mirror_ds; u32 fh_versions_cnt; struct nfs_fh *fh_versions; @@ -84,12 +86,19 @@ struct nfs4_ff_layout_mirror { const struct cred __rcu *ro_cred; const struct cred __rcu *rw_cred; struct nfs_file_localio nfl; - refcount_t ref; - spinlock_t lock; - unsigned long flags; struct nfs4_ff_layoutstat read_stat; struct nfs4_ff_layoutstat write_stat; ktime_t start_time; +}; + +struct nfs4_ff_layout_mirror { + struct pnfs_layout_hdr *layout; + struct list_head mirrors; + u32 dss_count; + struct nfs4_ff_layout_ds_stripe *dss; + refcount_t ref; + spinlock_t lock; + unsigned long flags; u32 report_interval; }; @@ -150,12 +159,12 @@ FF_LAYOUT_COMP(struct pnfs_layout_segment *lseg, u32 idx) } static inline struct nfs4_deviceid_node * -FF_LAYOUT_DEVID_NODE(struct pnfs_layout_segment *lseg, u32 idx) +FF_LAYOUT_DEVID_NODE(struct pnfs_layout_segment *lseg, u32 idx, u32 dss_id) { struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, idx); if (mirror != NULL) { - struct nfs4_ff_layout_ds *mirror_ds = mirror->mirror_ds; + struct nfs4_ff_layout_ds *mirror_ds = mirror->dss[dss_id].mirror_ds; if (!IS_ERR_OR_NULL(mirror_ds)) return &mirror_ds->id_node; @@ -182,9 +191,22 @@ ff_layout_no_read_on_rw(struct pnfs_layout_segment *lseg) } static inline int -nfs4_ff_layout_ds_version(const struct nfs4_ff_layout_mirror *mirror) +nfs4_ff_layout_ds_version(const struct nfs4_ff_layout_mirror *mirror, u32 dss_id) +{ + return mirror->dss[dss_id].mirror_ds->ds_versions[0].version; +} + +static inline u32 +nfs4_ff_layout_calc_dss_id(const u64 stripe_unit, const u32 dss_count, const loff_t offset) { - return mirror->mirror_ds->ds_versions[0].version; + u64 tmp = offset; + + if (dss_count == 1 || stripe_unit == 0) + return 0; + + do_div(tmp, stripe_unit); + + return do_div(tmp, dss_count); } struct nfs4_ff_layout_ds * @@ -193,9 +215,9 @@ nfs4_ff_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, void nfs4_ff_layout_put_deviceid(struct nfs4_ff_layout_ds *mirror_ds); void nfs4_ff_layout_free_deviceid(struct nfs4_ff_layout_ds *mirror_ds); int ff_layout_track_ds_error(struct nfs4_flexfile_layout *flo, - struct nfs4_ff_layout_mirror *mirror, u64 offset, - u64 length, int status, enum nfs_opnum4 opnum, - gfp_t gfp_flags); + struct nfs4_ff_layout_mirror *mirror, + u32 dss_id, u64 offset, u64 length, int status, + enum nfs_opnum4 opnum, gfp_t gfp_flags); void ff_layout_send_layouterror(struct pnfs_layout_segment *lseg); int ff_layout_encode_ds_ioerr(struct xdr_stream *xdr, const struct list_head *head); void ff_layout_free_ds_ioerr(struct list_head *head); @@ -204,23 +226,27 @@ unsigned int ff_layout_fetch_ds_ioerr(struct pnfs_layout_hdr *lo, struct list_head *head, unsigned int maxnum); struct nfs_fh * -nfs4_ff_layout_select_ds_fh(struct nfs4_ff_layout_mirror *mirror); +nfs4_ff_layout_select_ds_fh(struct nfs4_ff_layout_mirror *mirror, u32 dss_id); void nfs4_ff_layout_select_ds_stateid(const struct nfs4_ff_layout_mirror *mirror, - nfs4_stateid *stateid); + u32 dss_id, + nfs4_stateid *stateid); struct nfs4_pnfs_ds * nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, struct nfs4_ff_layout_mirror *mirror, + u32 dss_id, bool fail_return); struct rpc_clnt * nfs4_ff_find_or_create_ds_client(struct nfs4_ff_layout_mirror *mirror, struct nfs_client *ds_clp, - struct inode *inode); + struct inode *inode, + u32 dss_id); const struct cred *ff_layout_get_ds_cred(struct nfs4_ff_layout_mirror *mirror, const struct pnfs_layout_range *range, - const struct cred *mdscred); + const struct cred *mdscred, + u32 dss_id); bool ff_layout_avoid_mds_available_ds(struct pnfs_layout_segment *lseg); bool ff_layout_avoid_read_on_rw(struct pnfs_layout_segment *lseg); diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c index e58bedfb1dcc..c55ea8fa3bfa 100644 --- a/fs/nfs/flexfilelayout/flexfilelayoutdev.c +++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c @@ -44,18 +44,19 @@ nfs4_ff_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, { struct xdr_stream stream; struct xdr_buf buf; - struct page *scratch; + struct folio *scratch; struct list_head dsaddrs; struct nfs4_pnfs_ds_addr *da; struct nfs4_ff_layout_ds *new_ds = NULL; struct nfs4_ff_ds_version *ds_versions = NULL; + struct net *net = server->nfs_client->cl_net; u32 mp_count; u32 version_count; __be32 *p; int i, ret = -ENOMEM; /* set up xdr stream */ - scratch = alloc_page(gfp_flags); + scratch = folio_alloc(gfp_flags, 0); if (!scratch) goto out_err; @@ -69,7 +70,7 @@ nfs4_ff_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, INIT_LIST_HEAD(&dsaddrs); xdr_init_decode_pages(&stream, &buf, pdev->pages, pdev->pglen); - xdr_set_scratch_page(&stream, scratch); + xdr_set_scratch_folio(&stream, scratch); /* multipath count */ p = xdr_inline_decode(&stream, 4); @@ -80,8 +81,7 @@ nfs4_ff_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, for (i = 0; i < mp_count; i++) { /* multipath ds */ - da = nfs4_decode_mp_ds_addr(server->nfs_client->cl_net, - &stream, gfp_flags); + da = nfs4_decode_mp_ds_addr(net, &stream, gfp_flags); if (da) list_add_tail(&da->da_node, &dsaddrs); } @@ -149,7 +149,7 @@ nfs4_ff_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, new_ds->ds_versions = ds_versions; new_ds->ds_versions_cnt = version_count; - new_ds->ds = nfs4_pnfs_ds_add(&dsaddrs, gfp_flags); + new_ds->ds = nfs4_pnfs_ds_add(net, &dsaddrs, gfp_flags); if (!new_ds->ds) goto out_err_drain_dsaddrs; @@ -163,7 +163,7 @@ nfs4_ff_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, kfree(da); } - __free_page(scratch); + folio_put(scratch); return new_ds; out_err_drain_dsaddrs: @@ -177,7 +177,7 @@ out_err_drain_dsaddrs: kfree(ds_versions); out_scratch: - __free_page(scratch); + folio_put(scratch); out_err: kfree(new_ds); @@ -250,16 +250,16 @@ ff_layout_add_ds_error_locked(struct nfs4_flexfile_layout *flo, } int ff_layout_track_ds_error(struct nfs4_flexfile_layout *flo, - struct nfs4_ff_layout_mirror *mirror, u64 offset, - u64 length, int status, enum nfs_opnum4 opnum, - gfp_t gfp_flags) + struct nfs4_ff_layout_mirror *mirror, + u32 dss_id, u64 offset, u64 length, int status, + enum nfs_opnum4 opnum, gfp_t gfp_flags) { struct nfs4_ff_layout_ds_err *dserr; if (status == 0) return 0; - if (IS_ERR_OR_NULL(mirror->mirror_ds)) + if (IS_ERR_OR_NULL(mirror->dss[dss_id].mirror_ds)) return -EINVAL; dserr = kmalloc(sizeof(*dserr), gfp_flags); @@ -271,8 +271,8 @@ int ff_layout_track_ds_error(struct nfs4_flexfile_layout *flo, dserr->length = length; dserr->status = status; dserr->opnum = opnum; - nfs4_stateid_copy(&dserr->stateid, &mirror->stateid); - memcpy(&dserr->deviceid, &mirror->mirror_ds->id_node.deviceid, + nfs4_stateid_copy(&dserr->stateid, &mirror->dss[dss_id].stateid); + memcpy(&dserr->deviceid, &mirror->dss[dss_id].mirror_ds->id_node.deviceid, NFS4_DEVICEID4_SIZE); spin_lock(&flo->generic_hdr.plh_inode->i_lock); @@ -282,14 +282,14 @@ int ff_layout_track_ds_error(struct nfs4_flexfile_layout *flo, } static const struct cred * -ff_layout_get_mirror_cred(struct nfs4_ff_layout_mirror *mirror, u32 iomode) +ff_layout_get_mirror_cred(struct nfs4_ff_layout_mirror *mirror, u32 iomode, u32 dss_id) { const struct cred *cred, __rcu **pcred; if (iomode == IOMODE_READ) - pcred = &mirror->ro_cred; + pcred = &mirror->dss[dss_id].ro_cred; else - pcred = &mirror->rw_cred; + pcred = &mirror->dss[dss_id].rw_cred; rcu_read_lock(); do { @@ -304,43 +304,45 @@ ff_layout_get_mirror_cred(struct nfs4_ff_layout_mirror *mirror, u32 iomode) } struct nfs_fh * -nfs4_ff_layout_select_ds_fh(struct nfs4_ff_layout_mirror *mirror) +nfs4_ff_layout_select_ds_fh(struct nfs4_ff_layout_mirror *mirror, u32 dss_id) { /* FIXME: For now assume there is only 1 version available for the DS */ - return &mirror->fh_versions[0]; + return &mirror->dss[dss_id].fh_versions[0]; } void nfs4_ff_layout_select_ds_stateid(const struct nfs4_ff_layout_mirror *mirror, - nfs4_stateid *stateid) + u32 dss_id, + nfs4_stateid *stateid) { - if (nfs4_ff_layout_ds_version(mirror) == 4) - nfs4_stateid_copy(stateid, &mirror->stateid); + if (nfs4_ff_layout_ds_version(mirror, dss_id) == 4) + nfs4_stateid_copy(stateid, &mirror->dss[dss_id].stateid); } static bool ff_layout_init_mirror_ds(struct pnfs_layout_hdr *lo, - struct nfs4_ff_layout_mirror *mirror) + struct nfs4_ff_layout_mirror *mirror, + u32 dss_id) { if (mirror == NULL) goto outerr; - if (mirror->mirror_ds == NULL) { + if (mirror->dss[dss_id].mirror_ds == NULL) { struct nfs4_deviceid_node *node; struct nfs4_ff_layout_ds *mirror_ds = ERR_PTR(-ENODEV); node = nfs4_find_get_deviceid(NFS_SERVER(lo->plh_inode), - &mirror->devid, lo->plh_lc_cred, + &mirror->dss[dss_id].devid, lo->plh_lc_cred, GFP_KERNEL); if (node) mirror_ds = FF_LAYOUT_MIRROR_DS(node); /* check for race with another call to this function */ - if (cmpxchg(&mirror->mirror_ds, NULL, mirror_ds) && + if (cmpxchg(&mirror->dss[dss_id].mirror_ds, NULL, mirror_ds) && mirror_ds != ERR_PTR(-ENODEV)) nfs4_put_deviceid_node(node); } - if (IS_ERR(mirror->mirror_ds)) + if (IS_ERR(mirror->dss[dss_id].mirror_ds)) goto outerr; return true; @@ -352,6 +354,7 @@ outerr: * nfs4_ff_layout_prepare_ds - prepare a DS connection for an RPC call * @lseg: the layout segment we're operating on * @mirror: layout mirror describing the DS to use + * @dss_id: DS stripe id to select stripe to use * @fail_return: return layout on connect failure? * * Try to prepare a DS connection to accept an RPC call. This involves @@ -368,18 +371,19 @@ outerr: struct nfs4_pnfs_ds * nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, struct nfs4_ff_layout_mirror *mirror, + u32 dss_id, bool fail_return) { - struct nfs4_pnfs_ds *ds = NULL; + struct nfs4_pnfs_ds *ds; struct inode *ino = lseg->pls_layout->plh_inode; struct nfs_server *s = NFS_SERVER(ino); unsigned int max_payload; - int status; + int status = -EAGAIN; - if (!ff_layout_init_mirror_ds(lseg->pls_layout, mirror)) + if (!ff_layout_init_mirror_ds(lseg->pls_layout, mirror, dss_id)) goto noconnect; - ds = mirror->mirror_ds->ds; + ds = mirror->dss[dss_id].mirror_ds->ds; if (READ_ONCE(ds->ds_clp)) goto out; /* matching smp_wmb() in _nfs4_pnfs_v3/4_ds_connect */ @@ -388,10 +392,10 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, /* FIXME: For now we assume the server sent only one version of NFS * to use for the DS. */ - status = nfs4_pnfs_ds_connect(s, ds, &mirror->mirror_ds->id_node, + status = nfs4_pnfs_ds_connect(s, ds, &mirror->dss[dss_id].mirror_ds->id_node, dataserver_timeo, dataserver_retrans, - mirror->mirror_ds->ds_versions[0].version, - mirror->mirror_ds->ds_versions[0].minor_version); + mirror->dss[dss_id].mirror_ds->ds_versions[0].version, + mirror->dss[dss_id].mirror_ds->ds_versions[0].minor_version); /* connect success, check rsize/wsize limit */ if (!status) { @@ -400,25 +404,25 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, * keep ds_clp even if DS is local, so that if local IO cannot * proceed somehow, we can fall back to NFS whenever we want. */ - nfs_local_probe(ds->ds_clp); + nfs_local_probe_async(ds->ds_clp); max_payload = nfs_block_size(rpc_max_payload(ds->ds_clp->cl_rpcclient), NULL); - if (mirror->mirror_ds->ds_versions[0].rsize > max_payload) - mirror->mirror_ds->ds_versions[0].rsize = max_payload; - if (mirror->mirror_ds->ds_versions[0].wsize > max_payload) - mirror->mirror_ds->ds_versions[0].wsize = max_payload; + if (mirror->dss[dss_id].mirror_ds->ds_versions[0].rsize > max_payload) + mirror->dss[dss_id].mirror_ds->ds_versions[0].rsize = max_payload; + if (mirror->dss[dss_id].mirror_ds->ds_versions[0].wsize > max_payload) + mirror->dss[dss_id].mirror_ds->ds_versions[0].wsize = max_payload; goto out; } noconnect: ff_layout_track_ds_error(FF_LAYOUT_FROM_HDR(lseg->pls_layout), - mirror, lseg->pls_range.offset, + mirror, dss_id, lseg->pls_range.offset, lseg->pls_range.length, NFS4ERR_NXIO, OP_ILLEGAL, GFP_NOIO); ff_layout_send_layouterror(lseg); if (fail_return || !ff_layout_has_available_ds(lseg)) pnfs_error_mark_layout_for_return(ino, lseg); - ds = NULL; + ds = ERR_PTR(status); out: return ds; } @@ -426,12 +430,13 @@ out: const struct cred * ff_layout_get_ds_cred(struct nfs4_ff_layout_mirror *mirror, const struct pnfs_layout_range *range, - const struct cred *mdscred) + const struct cred *mdscred, + u32 dss_id) { const struct cred *cred; - if (mirror && !mirror->mirror_ds->ds_versions[0].tightly_coupled) { - cred = ff_layout_get_mirror_cred(mirror, range->iomode); + if (mirror && !mirror->dss[dss_id].mirror_ds->ds_versions[0].tightly_coupled) { + cred = ff_layout_get_mirror_cred(mirror, range->iomode, dss_id); if (!cred) cred = get_cred(mdscred); } else { @@ -445,15 +450,17 @@ ff_layout_get_ds_cred(struct nfs4_ff_layout_mirror *mirror, * @mirror: pointer to the mirror * @ds_clp: nfs_client for the DS * @inode: pointer to inode + * @dss_id: DS stripe id * * Find or create a DS rpc client with th MDS server rpc client auth flavor * in the nfs_client cl_ds_clients list. */ struct rpc_clnt * nfs4_ff_find_or_create_ds_client(struct nfs4_ff_layout_mirror *mirror, - struct nfs_client *ds_clp, struct inode *inode) + struct nfs_client *ds_clp, struct inode *inode, + u32 dss_id) { - switch (mirror->mirror_ds->ds_versions[0].version) { + switch (mirror->dss[dss_id].mirror_ds->ds_versions[0].version) { case 3: /* For NFSv3 DS, flavor is set when creating DS connections */ return ds_clp->cl_rpcclient; @@ -559,16 +566,18 @@ static bool ff_read_layout_has_available_ds(struct pnfs_layout_segment *lseg) { struct nfs4_ff_layout_mirror *mirror; struct nfs4_deviceid_node *devid; - u32 idx; + u32 idx, dss_id; for (idx = 0; idx < FF_LAYOUT_MIRROR_COUNT(lseg); idx++) { mirror = FF_LAYOUT_COMP(lseg, idx); - if (mirror) { - if (!mirror->mirror_ds) + if (!mirror) + continue; + for (dss_id = 0; dss_id < mirror->dss_count; dss_id++) { + if (!mirror->dss[dss_id].mirror_ds) return true; - if (IS_ERR(mirror->mirror_ds)) + if (IS_ERR(mirror->dss[dss_id].mirror_ds)) continue; - devid = &mirror->mirror_ds->id_node; + devid = &mirror->dss[dss_id].mirror_ds->id_node; if (!nfs4_test_deviceid_unavailable(devid)) return true; } @@ -581,17 +590,21 @@ static bool ff_rw_layout_has_available_ds(struct pnfs_layout_segment *lseg) { struct nfs4_ff_layout_mirror *mirror; struct nfs4_deviceid_node *devid; - u32 idx; + u32 idx, dss_id; for (idx = 0; idx < FF_LAYOUT_MIRROR_COUNT(lseg); idx++) { mirror = FF_LAYOUT_COMP(lseg, idx); - if (!mirror || IS_ERR(mirror->mirror_ds)) - return false; - if (!mirror->mirror_ds) - continue; - devid = &mirror->mirror_ds->id_node; - if (nfs4_test_deviceid_unavailable(devid)) + if (!mirror) return false; + for (dss_id = 0; dss_id < mirror->dss_count; dss_id++) { + if (IS_ERR(mirror->dss[dss_id].mirror_ds)) + return false; + if (!mirror->dss[dss_id].mirror_ds) + continue; + devid = &mirror->dss[dss_id].mirror_ds->id_node; + if (nfs4_test_deviceid_unavailable(devid)) + return false; + } } return FF_LAYOUT_MIRROR_COUNT(lseg) != 0; diff --git a/fs/nfs/fs_context.c b/fs/nfs/fs_context.c index 13f71ca8c974..b4679b7161b0 100644 --- a/fs/nfs/fs_context.c +++ b/fs/nfs/fs_context.c @@ -96,6 +96,8 @@ enum nfs_param { Opt_wsize, Opt_write, Opt_xprtsec, + Opt_cert_serial, + Opt_privkey_serial, }; enum { @@ -221,6 +223,8 @@ static const struct fs_parameter_spec nfs_fs_parameters[] = { fsparam_enum ("write", Opt_write, nfs_param_enums_write), fsparam_u32 ("wsize", Opt_wsize), fsparam_string("xprtsec", Opt_xprtsec), + fsparam_s32("cert_serial", Opt_cert_serial), + fsparam_s32("privkey_serial", Opt_privkey_serial), {} }; @@ -551,6 +555,32 @@ static int nfs_parse_version_string(struct fs_context *fc, return 0; } +#ifdef CONFIG_KEYS +static int nfs_tls_key_verify(key_serial_t key_id) +{ + struct key *key = key_lookup(key_id); + int error = 0; + + if (IS_ERR(key)) { + pr_err("key id %08x not found\n", key_id); + return PTR_ERR(key); + } + if (test_bit(KEY_FLAG_REVOKED, &key->flags) || + test_bit(KEY_FLAG_INVALIDATED, &key->flags)) { + pr_err("key id %08x revoked\n", key_id); + error = -EKEYREVOKED; + } + + key_put(key); + return error; +} +#else +static inline int nfs_tls_key_verify(key_serial_t key_id) +{ + return -ENOENT; +} +#endif /* CONFIG_KEYS */ + /* * Parse a single mount parameter. */ @@ -807,6 +837,18 @@ static int nfs_fs_context_parse_param(struct fs_context *fc, if (ret < 0) return ret; break; + case Opt_cert_serial: + ret = nfs_tls_key_verify(result.int_32); + if (ret < 0) + return ret; + ctx->xprtsec.cert_serial = result.int_32; + break; + case Opt_privkey_serial: + ret = nfs_tls_key_verify(result.int_32); + if (ret < 0) + return ret; + ctx->xprtsec.privkey_serial = result.int_32; + break; case Opt_proto: if (!param->string) @@ -1227,8 +1269,7 @@ static int nfs23_parse_monolithic(struct fs_context *fc, int ret; data->context[NFS_MAX_CONTEXT_LEN] = '\0'; - ret = vfs_parse_fs_string(fc, "context", - data->context, strlen(data->context)); + ret = vfs_parse_fs_string(fc, "context", data->context); if (ret < 0) return ret; #else diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c index e278a1ad1ca3..8b0785178731 100644 --- a/fs/nfs/fscache.c +++ b/fs/nfs/fscache.c @@ -367,6 +367,7 @@ void nfs_netfs_read_completion(struct nfs_pgio_header *hdr) sreq = netfs->sreq; if (test_bit(NFS_IOHDR_EOF, &hdr->flags) && + sreq->rreq->origin != NETFS_UNBUFFERED_READ && sreq->rreq->origin != NETFS_DIO_READ) __set_bit(NETFS_SREQ_CLEAR_TAIL, &sreq->flags); diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 119e447758b9..84049f3cd340 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -108,7 +108,7 @@ u64 nfs_compat_user_ino64(u64 fileid) int nfs_drop_inode(struct inode *inode) { - return NFS_STALE(inode) || generic_drop_inode(inode); + return NFS_STALE(inode) || inode_generic_drop(inode); } EXPORT_SYMBOL_GPL(nfs_drop_inode); @@ -197,6 +197,7 @@ void nfs_set_cache_invalid(struct inode *inode, unsigned long flags) if (!(flags & NFS_INO_REVAL_FORCED)) flags &= ~(NFS_INO_INVALID_MODE | NFS_INO_INVALID_OTHER | + NFS_INO_INVALID_BTIME | NFS_INO_INVALID_XATTR); flags &= ~(NFS_INO_INVALID_CHANGE | NFS_INO_INVALID_SIZE); } @@ -474,7 +475,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) goto out_no_inode; } - if (inode->i_state & I_NEW) { + if (inode_state_read_once(inode) & I_NEW) { struct nfs_inode *nfsi = NFS_I(inode); unsigned long now = jiffies; @@ -522,6 +523,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) inode_set_atime(inode, 0, 0); inode_set_mtime(inode, 0, 0); inode_set_ctime(inode, 0, 0); + memset(&nfsi->btime, 0, sizeof(nfsi->btime)); inode_set_iversion_raw(inode, 0); inode->i_size = 0; clear_nlink(inode); @@ -545,6 +547,10 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) inode_set_ctime_to_ts(inode, fattr->ctime); else if (fattr_supported & NFS_ATTR_FATTR_CTIME) nfs_set_cache_invalid(inode, NFS_INO_INVALID_CTIME); + if (fattr->valid & NFS_ATTR_FATTR_BTIME) + nfsi->btime = fattr->btime; + else if (fattr_supported & NFS_ATTR_FATTR_BTIME) + nfs_set_cache_invalid(inode, NFS_INO_INVALID_BTIME); if (fattr->valid & NFS_ATTR_FATTR_CHANGE) inode_set_iversion_raw(inode, fattr->change_attr); else @@ -557,6 +563,8 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) set_nlink(inode, fattr->nlink); else if (fattr_supported & NFS_ATTR_FATTR_NLINK) nfs_set_cache_invalid(inode, NFS_INO_INVALID_NLINK); + else + set_nlink(inode, 1); if (fattr->valid & NFS_ATTR_FATTR_OWNER) inode->i_uid = fattr->uid; else if (fattr_supported & NFS_ATTR_FATTR_OWNER) @@ -600,7 +608,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) inode->i_sb->s_id, (unsigned long long)NFS_FILEID(inode), nfs_display_fhandle_hash(fh), - atomic_read(&inode->i_count)); + icount_read(inode)); out: return inode; @@ -633,6 +641,34 @@ nfs_fattr_fixup_delegated(struct inode *inode, struct nfs_fattr *fattr) } } +static void nfs_set_timestamps_to_ts(struct inode *inode, struct iattr *attr) +{ + unsigned int cache_flags = 0; + + if (attr->ia_valid & ATTR_MTIME_SET) { + struct timespec64 ctime = inode_get_ctime(inode); + struct timespec64 mtime = inode_get_mtime(inode); + struct timespec64 now; + int updated = 0; + + now = inode_set_ctime_current(inode); + if (!timespec64_equal(&now, &ctime)) + updated |= S_CTIME; + + inode_set_mtime_to_ts(inode, attr->ia_mtime); + if (!timespec64_equal(&now, &mtime)) + updated |= S_MTIME; + + inode_maybe_inc_iversion(inode, updated); + cache_flags |= NFS_INO_INVALID_CTIME | NFS_INO_INVALID_MTIME; + } + if (attr->ia_valid & ATTR_ATIME_SET) { + inode_set_atime_to_ts(inode, attr->ia_atime); + cache_flags |= NFS_INO_INVALID_ATIME; + } + NFS_I(inode)->cache_validity &= ~cache_flags; +} + static void nfs_update_timestamps(struct inode *inode, unsigned int ia_valid) { enum file_time_flags time_flags = 0; @@ -680,7 +716,10 @@ nfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, { struct inode *inode = d_inode(dentry); struct nfs_fattr *fattr; + loff_t oldsize = i_size_read(inode); int error = 0; + kuid_t task_uid = current_fsuid(); + kuid_t owner_uid = inode->i_uid; nfs_inc_stats(inode, NFSIOS_VFSSETATTR); @@ -695,20 +734,37 @@ nfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, if (error) return error; - if (attr->ia_size == i_size_read(inode)) + if (attr->ia_size == oldsize) attr->ia_valid &= ~ATTR_SIZE; } if (nfs_have_delegated_mtime(inode) && attr->ia_valid & ATTR_MTIME) { spin_lock(&inode->i_lock); - nfs_update_timestamps(inode, attr->ia_valid); + if (attr->ia_valid & ATTR_MTIME_SET) { + if (uid_eq(task_uid, owner_uid)) { + nfs_set_timestamps_to_ts(inode, attr); + attr->ia_valid &= ~(ATTR_MTIME|ATTR_MTIME_SET| + ATTR_ATIME|ATTR_ATIME_SET); + } + } else { + nfs_update_timestamps(inode, attr->ia_valid); + attr->ia_valid &= ~(ATTR_MTIME|ATTR_ATIME); + } spin_unlock(&inode->i_lock); - attr->ia_valid &= ~(ATTR_MTIME | ATTR_ATIME); } else if (nfs_have_delegated_atime(inode) && attr->ia_valid & ATTR_ATIME && !(attr->ia_valid & ATTR_MTIME)) { - nfs_update_delegated_atime(inode); - attr->ia_valid &= ~ATTR_ATIME; + if (attr->ia_valid & ATTR_ATIME_SET) { + if (uid_eq(task_uid, owner_uid)) { + spin_lock(&inode->i_lock); + nfs_set_timestamps_to_ts(inode, attr); + spin_unlock(&inode->i_lock); + attr->ia_valid &= ~(ATTR_ATIME|ATTR_ATIME_SET); + } + } else { + nfs_update_delegated_atime(inode); + attr->ia_valid &= ~ATTR_ATIME; + } } /* Optimization: if the end result is no change, don't RPC */ @@ -718,8 +774,10 @@ nfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, trace_nfs_setattr_enter(inode); /* Write all dirty data */ - if (S_ISREG(inode->i_mode)) + if (S_ISREG(inode->i_mode)) { + nfs_file_block_o_direct(NFS_I(inode)); nfs_sync_inode(inode); + } fattr = nfs_alloc_fattr_with_label(NFS_SERVER(inode)); if (fattr == NULL) { @@ -728,8 +786,12 @@ nfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, } error = NFS_PROTO(inode)->setattr(dentry, fattr, attr); - if (error == 0) + if (error == 0) { + if (attr->ia_valid & ATTR_SIZE) + nfs_truncate_last_folio(inode->i_mapping, oldsize, + attr->ia_size); error = nfs_refresh_inode(inode, fattr); + } nfs_free_fattr(fattr); out: trace_nfs_setattr_exit(inode, error); @@ -888,6 +950,7 @@ static void nfs_readdirplus_parent_cache_hit(struct dentry *dentry) static u32 nfs_get_valid_attrmask(struct inode *inode) { + u64 fattr_valid = NFS_SERVER(inode)->fattr_valid; unsigned long cache_validity = READ_ONCE(NFS_I(inode)->cache_validity); u32 reply_mask = STATX_INO | STATX_TYPE; @@ -907,6 +970,9 @@ static u32 nfs_get_valid_attrmask(struct inode *inode) reply_mask |= STATX_UID | STATX_GID; if (!(cache_validity & NFS_INO_INVALID_BLOCKS)) reply_mask |= STATX_BLOCKS; + if (!(cache_validity & NFS_INO_INVALID_BTIME) && + (fattr_valid & NFS_ATTR_FATTR_BTIME)) + reply_mask |= STATX_BTIME; if (!(cache_validity & NFS_INO_INVALID_CHANGE)) reply_mask |= STATX_CHANGE_COOKIE; return reply_mask; @@ -917,6 +983,7 @@ int nfs_getattr(struct mnt_idmap *idmap, const struct path *path, { struct inode *inode = d_inode(path->dentry); struct nfs_server *server = NFS_SERVER(inode); + u64 fattr_valid = server->fattr_valid; unsigned long cache_validity; int err = 0; bool force_sync = query_flags & AT_STATX_FORCE_SYNC; @@ -927,9 +994,12 @@ int nfs_getattr(struct mnt_idmap *idmap, const struct path *path, request_mask &= STATX_TYPE | STATX_MODE | STATX_NLINK | STATX_UID | STATX_GID | STATX_ATIME | STATX_MTIME | STATX_CTIME | - STATX_INO | STATX_SIZE | STATX_BLOCKS | + STATX_INO | STATX_SIZE | STATX_BLOCKS | STATX_BTIME | STATX_CHANGE_COOKIE; + if (!(fattr_valid & NFS_ATTR_FATTR_BTIME)) + request_mask &= ~STATX_BTIME; + if ((query_flags & AT_STATX_DONT_SYNC) && !force_sync) { if (readdirplus_enabled) nfs_readdirplus_parent_cache_hit(path->dentry); @@ -961,7 +1031,7 @@ int nfs_getattr(struct mnt_idmap *idmap, const struct path *path, /* Is the user requesting attributes that might need revalidation? */ if (!(request_mask & (STATX_MODE|STATX_NLINK|STATX_ATIME|STATX_CTIME| STATX_MTIME|STATX_UID|STATX_GID| - STATX_SIZE|STATX_BLOCKS| + STATX_SIZE|STATX_BLOCKS|STATX_BTIME| STATX_CHANGE_COOKIE))) goto out_no_revalidate; @@ -985,6 +1055,8 @@ int nfs_getattr(struct mnt_idmap *idmap, const struct path *path, do_update |= cache_validity & NFS_INO_INVALID_OTHER; if (request_mask & STATX_BLOCKS) do_update |= cache_validity & NFS_INO_INVALID_BLOCKS; + if (request_mask & STATX_BTIME) + do_update |= cache_validity & NFS_INO_INVALID_BTIME; if (do_update) { if (readdirplus_enabled) @@ -1006,6 +1078,22 @@ out_no_revalidate: stat->attributes |= STATX_ATTR_CHANGE_MONOTONIC; if (S_ISDIR(inode->i_mode)) stat->blksize = NFS_SERVER(inode)->dtsize; + stat->btime = NFS_I(inode)->btime; + + /* Special handling for STATX_DIOALIGN and STATX_DIO_READ_ALIGN + * - NFS doesn't have DIO alignment constraints, avoid getting + * these DIO attrs from remote and just respond with most + * accommodating limits (so client will issue supported DIO). + * - this is unintuitive, but the most coarse-grained + * dio_offset_align is the most accommodating. + */ + if ((request_mask & (STATX_DIOALIGN | STATX_DIO_READ_ALIGN)) && + S_ISREG(inode->i_mode)) { + stat->result_mask |= STATX_DIOALIGN | STATX_DIO_READ_ALIGN; + stat->dio_mem_align = 4; /* 4-byte alignment */ + stat->dio_offset_align = PAGE_SIZE; + stat->dio_read_offset_align = stat->dio_offset_align; + } out: trace_nfs_getattr_exit(inode, err); return err; @@ -1301,6 +1389,9 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) status = pnfs_sync_inode(inode, false); if (status) goto out; + } else if (nfs_have_directory_delegation(inode)) { + status = 0; + goto out; } status = -ENOMEM; @@ -1900,7 +1991,7 @@ static int nfs_inode_finish_partial_attr_update(const struct nfs_fattr *fattr, NFS_INO_INVALID_ATIME | NFS_INO_INVALID_CTIME | NFS_INO_INVALID_MTIME | NFS_INO_INVALID_SIZE | NFS_INO_INVALID_BLOCKS | NFS_INO_INVALID_OTHER | - NFS_INO_INVALID_NLINK; + NFS_INO_INVALID_NLINK | NFS_INO_INVALID_BTIME; unsigned long cache_validity = NFS_I(inode)->cache_validity; enum nfs4_change_attr_type ctype = NFS_SERVER(inode)->change_attr_type; @@ -2166,10 +2257,10 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) bool attr_changed = false; bool have_delegation; - dfprintk(VFS, "NFS: %s(%s/%lu fh_crc=0x%08x ct=%d info=0x%x)\n", + dfprintk(VFS, "NFS: %s(%s/%lu fh_crc=0x%08x ct=%d info=0x%llx)\n", __func__, inode->i_sb->s_id, inode->i_ino, nfs_display_fhandle_hash(NFS_FH(inode)), - atomic_read(&inode->i_count), fattr->valid); + icount_read(inode), fattr->valid); if (!(fattr->valid & NFS_ATTR_FATTR_FILEID)) { /* Only a mounted-on-fileid? Just exit */ @@ -2261,7 +2352,8 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) | NFS_INO_INVALID_BLOCKS | NFS_INO_INVALID_NLINK | NFS_INO_INVALID_MODE - | NFS_INO_INVALID_OTHER; + | NFS_INO_INVALID_OTHER + | NFS_INO_INVALID_BTIME; if (S_ISDIR(inode->i_mode)) nfs_force_lookup_revalidate(inode); attr_changed = true; @@ -2295,6 +2387,12 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) nfsi->cache_validity |= save_cache_validity & NFS_INO_INVALID_CTIME; + if (fattr->valid & NFS_ATTR_FATTR_BTIME) + nfsi->btime = fattr->btime; + else if (fattr_supported & NFS_ATTR_FATTR_BTIME) + nfsi->cache_validity |= + save_cache_validity & NFS_INO_INVALID_BTIME; + /* Check if our cached file size is stale */ if (fattr->valid & NFS_ATTR_FATTR_SIZE) { new_isize = nfs_size_to_loff_t(fattr->size); @@ -2546,15 +2644,26 @@ EXPORT_SYMBOL_GPL(nfs_net_id); static int nfs_net_init(struct net *net) { struct nfs_net *nn = net_generic(net, nfs_net_id); + int err; nfs_clients_init(net); if (!rpc_proc_register(net, &nn->rpcstats)) { - nfs_clients_exit(net); - return -ENOMEM; + err = -ENOMEM; + goto err_proc_rpc; } - return nfs_fs_proc_net_init(net); + err = nfs_fs_proc_net_init(net); + if (err) + goto err_proc_nfs; + + return 0; + +err_proc_nfs: + rpc_proc_unregister(net, "nfs"); +err_proc_rpc: + nfs_clients_exit(net); + return err; } static void nfs_net_exit(struct net *net) @@ -2571,6 +2680,35 @@ static struct pernet_operations nfs_net_ops = { .size = sizeof(struct nfs_net), }; +#ifdef CONFIG_KEYS +static struct key *nfs_keyring; + +static int __init nfs_init_keyring(void) +{ + nfs_keyring = keyring_alloc(".nfs", + GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, + current_cred(), + (KEY_POS_ALL & ~KEY_POS_SETATTR) | + (KEY_USR_ALL & ~KEY_USR_SETATTR), + KEY_ALLOC_NOT_IN_QUOTA, NULL, NULL); + return PTR_ERR_OR_ZERO(nfs_keyring); +} + +static void nfs_exit_keyring(void) +{ + key_put(nfs_keyring); +} +#else +static inline int nfs_init_keyring(void) +{ + return 0; +} + +static inline void nfs_exit_keyring(void) +{ +} +#endif /* CONFIG_KEYS */ + /* * Initialize NFS */ @@ -2578,6 +2716,10 @@ static int __init init_nfs_fs(void) { int err; + err = nfs_init_keyring(); + if (err) + return err; + err = nfs_sysfs_init(); if (err < 0) goto out10; @@ -2638,6 +2780,7 @@ out7: out9: nfs_sysfs_exit(); out10: + nfs_exit_keyring(); return err; } @@ -2653,6 +2796,7 @@ static void __exit exit_nfs_fs(void) nfs_fs_proc_exit(); nfsiod_stop(); nfs_sysfs_exit(); + nfs_exit_keyring(); } /* Not quite true; I just maintain it */ diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 6655e5f32ec6..2e596244799f 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -13,7 +13,7 @@ #include <linux/nfslocalio.h> #include <linux/wait_bit.h> -#define NFS_SB_MASK (SB_NOSUID|SB_NODEV|SB_NOEXEC|SB_SYNCHRONOUS) +#define NFS_SB_MASK (SB_RDONLY|SB_NOSUID|SB_NODEV|SB_NOEXEC|SB_SYNCHRONOUS) extern const struct export_operations nfs_export_ops; @@ -152,7 +152,6 @@ struct nfs_fs_context { struct super_block *sb; struct dentry *dentry; struct nfs_fattr *fattr; - unsigned int inherited_bsize; } clone_data; }; @@ -207,7 +206,6 @@ struct nfs_mount_request { }; extern int nfs_mount(struct nfs_mount_request *info, int timeo, int retrans); -extern void nfs_umount(const struct nfs_mount_request *info); /* client.c */ extern const struct rpc_program nfs_program; @@ -232,7 +230,7 @@ extern struct nfs_client * nfs4_find_client_sessionid(struct net *, const struct sockaddr *, struct nfs4_sessionid *, u32); extern struct nfs_server *nfs_create_server(struct fs_context *); -extern void nfs4_server_set_init_caps(struct nfs_server *); +extern void nfs_server_set_init_caps(struct nfs_server *); extern struct nfs_server *nfs4_create_server(struct fs_context *); extern struct nfs_server *nfs4_create_referral_server(struct fs_context *); extern int nfs4_update_server(struct nfs_server *server, const char *hostname, @@ -432,12 +430,14 @@ loff_t nfs_file_llseek(struct file *, loff_t, int); ssize_t nfs_file_read(struct kiocb *, struct iov_iter *); ssize_t nfs_file_splice_read(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags); -int nfs_file_mmap(struct file *, struct vm_area_struct *); +int nfs_file_mmap_prepare(struct vm_area_desc *); ssize_t nfs_file_write(struct kiocb *, struct iov_iter *); int nfs_file_release(struct inode *, struct file *); int nfs_lock(struct file *, int, struct file_lock *); int nfs_flock(struct file *, int, struct file_lock *); int nfs_check_flags(int); +void nfs_truncate_last_folio(struct address_space *mapping, loff_t from, + loff_t to); /* inode.c */ extern struct workqueue_struct *nfsiod_workqueue; @@ -455,7 +455,16 @@ extern int nfs_wait_bit_killable(struct wait_bit_key *key, int mode); #if IS_ENABLED(CONFIG_NFS_LOCALIO) /* localio.c */ -extern void nfs_local_probe(struct nfs_client *); +struct nfs_local_dio { + u32 mem_align; + u32 offset_align; + loff_t middle_offset; + loff_t end_offset; + ssize_t start_len; /* Length for misaligned first extent */ + ssize_t middle_len; /* Length for DIO-aligned middle extent */ + ssize_t end_len; /* Length for misaligned last extent */ +}; + extern void nfs_local_probe_async(struct nfs_client *); extern void nfs_local_probe_async_work(struct work_struct *); extern struct nfsd_file *nfs_local_open_fh(struct nfs_client *, @@ -532,6 +541,16 @@ static inline bool nfs_file_io_is_buffered(struct nfs_inode *nfsi) return test_bit(NFS_INO_ODIRECT, &nfsi->flags) == 0; } +/* Must be called with exclusively locked inode->i_rwsem */ +static inline void nfs_file_block_o_direct(struct nfs_inode *nfsi) +{ + if (test_bit(NFS_INO_ODIRECT, &nfsi->flags)) { + clear_bit(NFS_INO_ODIRECT, &nfsi->flags); + inode_dio_wait(&nfsi->vfs_inode); + } +} + + /* namespace.c */ #define NFS_PATH_CANONICAL 1 extern char *nfs_path(char **p, struct dentry *dentry, @@ -672,9 +691,12 @@ nfs_write_match_verf(const struct nfs_writeverf *verf, static inline gfp_t nfs_io_gfp_mask(void) { - if (current->flags & PF_WQ_WORKER) - return GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN; - return GFP_KERNEL; + gfp_t ret = current_gfp_context(GFP_KERNEL); + + /* For workers __GFP_NORETRY only with __GFP_IO or __GFP_FS */ + if ((current->flags & PF_WQ_WORKER) && ret == GFP_KERNEL) + ret |= __GFP_NORETRY | __GFP_NOWARN; + return ret; } /* diff --git a/fs/nfs/io.c b/fs/nfs/io.c index 3388faf2acb9..d275b0a250bf 100644 --- a/fs/nfs/io.c +++ b/fs/nfs/io.c @@ -14,15 +14,6 @@ #include "internal.h" -/* Call with exclusively locked inode->i_rwsem */ -static void nfs_block_o_direct(struct nfs_inode *nfsi, struct inode *inode) -{ - if (test_bit(NFS_INO_ODIRECT, &nfsi->flags)) { - clear_bit(NFS_INO_ODIRECT, &nfsi->flags); - inode_dio_wait(inode); - } -} - /** * nfs_start_io_read - declare the file is being used for buffered reads * @inode: file inode @@ -57,7 +48,7 @@ nfs_start_io_read(struct inode *inode) err = down_write_killable(&inode->i_rwsem); if (err) return err; - nfs_block_o_direct(nfsi, inode); + nfs_file_block_o_direct(nfsi); downgrade_write(&inode->i_rwsem); return 0; @@ -90,7 +81,7 @@ nfs_start_io_write(struct inode *inode) err = down_write_killable(&inode->i_rwsem); if (!err) - nfs_block_o_direct(NFS_I(inode), inode); + nfs_file_block_o_direct(NFS_I(inode)); return err; } diff --git a/fs/nfs/localio.c b/fs/nfs/localio.c index 5c21caeae075..a113bfdacfd6 100644 --- a/fs/nfs/localio.c +++ b/fs/nfs/localio.c @@ -30,6 +30,8 @@ #define NFSDBG_FACILITY NFSDBG_VFS +#define NFSLOCAL_MAX_IOS 3 + struct nfs_local_kiocb { struct kiocb kiocb; struct bio_vec *bvec; @@ -37,6 +39,13 @@ struct nfs_local_kiocb { struct work_struct work; void (*aio_complete_work)(struct work_struct *); struct nfsd_file *localio; + /* Begin mostly DIO-specific members */ + size_t end_len; + short int end_iter_index; + atomic_t n_iters; + struct iov_iter iters[NFSLOCAL_MAX_IOS]; + bool iter_is_dio_aligned[NFSLOCAL_MAX_IOS]; + /* End mostly DIO-specific members */ }; struct nfs_local_fsync_ctx { @@ -49,11 +58,6 @@ struct nfs_local_fsync_ctx { static bool localio_enabled __read_mostly = true; module_param(localio_enabled, bool, 0644); -static bool localio_O_DIRECT_semantics __read_mostly = false; -module_param(localio_O_DIRECT_semantics, bool, 0644); -MODULE_PARM_DESC(localio_O_DIRECT_semantics, - "LOCALIO will use O_DIRECT semantics to filesystem."); - static inline bool nfs_client_is_local(const struct nfs_client *clp) { return !!rcu_access_pointer(clp->cl_uuid.net); @@ -171,7 +175,7 @@ static bool nfs_server_uuid_is_local(struct nfs_client *clp) * - called after alloc_client and init_client (so cl_rpcclient exists) * - this function is idempotent, it can be called for old or new clients */ -void nfs_local_probe(struct nfs_client *clp) +static void nfs_local_probe(struct nfs_client *clp) { /* Disallow localio if disabled via sysfs or AUTH_SYS isn't used */ if (!localio_enabled || @@ -180,10 +184,8 @@ void nfs_local_probe(struct nfs_client *clp) return; } - if (nfs_client_is_local(clp)) { - /* If already enabled, disable and re-enable */ - nfs_localio_disable_client(clp); - } + if (nfs_client_is_local(clp)) + return; if (!nfs_uuid_begin(&clp->cl_uuid)) return; @@ -191,14 +193,16 @@ void nfs_local_probe(struct nfs_client *clp) nfs_localio_enable_client(clp); nfs_uuid_end(&clp->cl_uuid); } -EXPORT_SYMBOL_GPL(nfs_local_probe); void nfs_local_probe_async_work(struct work_struct *work) { struct nfs_client *clp = container_of(work, struct nfs_client, cl_local_probe_work); + if (!refcount_inc_not_zero(&clp->cl_count)) + return; nfs_local_probe(clp); + nfs_put_client(clp); } void nfs_local_probe_async(struct nfs_client *clp) @@ -207,14 +211,16 @@ void nfs_local_probe_async(struct nfs_client *clp) } EXPORT_SYMBOL_GPL(nfs_local_probe_async); -static inline struct nfsd_file *nfs_local_file_get(struct nfsd_file *nf) +static inline void nfs_local_file_put(struct nfsd_file *localio) { - return nfs_to->nfsd_file_get(nf); -} + /* nfs_to_nfsd_file_put_local() expects an __rcu pointer + * but we have a __kernel pointer. It is always safe + * to cast a __kernel pointer to an __rcu pointer + * because the cast only weakens what is known about the pointer. + */ + struct nfsd_file __rcu *nf = (struct nfsd_file __rcu*) localio; -static inline void nfs_local_file_put(struct nfsd_file *nf) -{ - nfs_to->nfsd_file_put(nf); + nfs_to_nfsd_file_put_local(&nf); } /* @@ -226,23 +232,26 @@ static inline void nfs_local_file_put(struct nfsd_file *nf) static struct nfsd_file * __nfs_local_open_fh(struct nfs_client *clp, const struct cred *cred, struct nfs_fh *fh, struct nfs_file_localio *nfl, + struct nfsd_file __rcu **pnf, const fmode_t mode) { + int status = 0; struct nfsd_file *localio; localio = nfs_open_local_fh(&clp->cl_uuid, clp->cl_rpcclient, - cred, fh, nfl, mode); + cred, fh, nfl, pnf, mode); if (IS_ERR(localio)) { - int status = PTR_ERR(localio); - trace_nfs_local_open_fh(fh, mode, status); + status = PTR_ERR(localio); switch (status) { case -ENOMEM: case -ENXIO: case -ENOENT: - /* Revalidate localio, will disable if unsupported */ + /* Revalidate localio */ + nfs_localio_disable_client(clp); nfs_local_probe(clp); } } + trace_nfs_local_open_fh(fh, mode, status); return localio; } @@ -258,7 +267,7 @@ nfs_local_open_fh(struct nfs_client *clp, const struct cred *cred, struct nfs_fh *fh, struct nfs_file_localio *nfl, const fmode_t mode) { - struct nfsd_file *nf, *new, __rcu **pnf; + struct nfsd_file *nf, __rcu **pnf; if (!nfs_server_is_local(clp)) return NULL; @@ -270,50 +279,13 @@ nfs_local_open_fh(struct nfs_client *clp, const struct cred *cred, else pnf = &nfl->ro_file; - new = NULL; - rcu_read_lock(); - nf = rcu_dereference(*pnf); - if (!nf) { - rcu_read_unlock(); - new = __nfs_local_open_fh(clp, cred, fh, nfl, mode); - if (IS_ERR(new)) - return NULL; - /* try to swap in the pointer */ - spin_lock(&clp->cl_uuid.lock); - nf = rcu_dereference_protected(*pnf, 1); - if (!nf) { - nf = new; - new = NULL; - rcu_assign_pointer(*pnf, nf); - } - spin_unlock(&clp->cl_uuid.lock); - rcu_read_lock(); - } - nf = nfs_local_file_get(nf); - rcu_read_unlock(); - if (new) - nfs_to_nfsd_file_put_local(new); + nf = __nfs_local_open_fh(clp, cred, fh, nfl, pnf, mode); + if (IS_ERR(nf)) + return NULL; return nf; } EXPORT_SYMBOL_GPL(nfs_local_open_fh); -static struct bio_vec * -nfs_bvec_alloc_and_import_pagevec(struct page **pagevec, - unsigned int npages, gfp_t flags) -{ - struct bio_vec *bvec, *p; - - bvec = kmalloc_array(npages, sizeof(*bvec), flags); - if (bvec != NULL) { - for (p = bvec; npages > 0; p++, pagevec++, npages--) { - p->bv_page = *pagevec; - p->bv_len = PAGE_SIZE; - p->bv_offset = 0; - } - } - return bvec; -} - static void nfs_local_iocb_free(struct nfs_local_kiocb *iocb) { @@ -327,40 +299,198 @@ nfs_local_iocb_alloc(struct nfs_pgio_header *hdr, { struct nfs_local_kiocb *iocb; - iocb = kmalloc(sizeof(*iocb), flags); + iocb = kzalloc(sizeof(*iocb), flags); if (iocb == NULL) return NULL; - iocb->bvec = nfs_bvec_alloc_and_import_pagevec(hdr->page_array.pagevec, - hdr->page_array.npages, flags); + + iocb->bvec = kmalloc_array(hdr->page_array.npages, + sizeof(struct bio_vec), flags); if (iocb->bvec == NULL) { kfree(iocb); return NULL; } - if (localio_O_DIRECT_semantics && - test_bit(NFS_IOHDR_ODIRECT, &hdr->flags)) { - iocb->kiocb.ki_filp = file; - iocb->kiocb.ki_flags = IOCB_DIRECT; - } else - init_sync_kiocb(&iocb->kiocb, file); + init_sync_kiocb(&iocb->kiocb, file); - iocb->kiocb.ki_pos = hdr->args.offset; iocb->hdr = hdr; + iocb->kiocb.ki_pos = hdr->args.offset; iocb->kiocb.ki_flags &= ~IOCB_APPEND; + iocb->kiocb.ki_complete = NULL; iocb->aio_complete_work = NULL; + iocb->end_iter_index = -1; + return iocb; } +static bool +nfs_is_local_dio_possible(struct nfs_local_kiocb *iocb, int rw, + size_t len, struct nfs_local_dio *local_dio) +{ + struct nfs_pgio_header *hdr = iocb->hdr; + loff_t offset = hdr->args.offset; + u32 nf_dio_mem_align, nf_dio_offset_align, nf_dio_read_offset_align; + loff_t start_end, orig_end, middle_end; + + nfs_to->nfsd_file_dio_alignment(iocb->localio, &nf_dio_mem_align, + &nf_dio_offset_align, &nf_dio_read_offset_align); + if (rw == ITER_DEST) + nf_dio_offset_align = nf_dio_read_offset_align; + + if (unlikely(!nf_dio_mem_align || !nf_dio_offset_align)) + return false; + if (unlikely(len < nf_dio_offset_align)) + return false; + + local_dio->mem_align = nf_dio_mem_align; + local_dio->offset_align = nf_dio_offset_align; + + start_end = round_up(offset, nf_dio_offset_align); + orig_end = offset + len; + middle_end = round_down(orig_end, nf_dio_offset_align); + + local_dio->middle_offset = start_end; + local_dio->end_offset = middle_end; + + local_dio->start_len = start_end - offset; + local_dio->middle_len = middle_end - start_end; + local_dio->end_len = orig_end - middle_end; + + if (rw == ITER_DEST) + trace_nfs_local_dio_read(hdr->inode, offset, len, local_dio); + else + trace_nfs_local_dio_write(hdr->inode, offset, len, local_dio); + return true; +} + +static bool nfs_iov_iter_aligned_bvec(const struct iov_iter *i, + unsigned int addr_mask, unsigned int len_mask) +{ + const struct bio_vec *bvec = i->bvec; + size_t skip = i->iov_offset; + size_t size = i->count; + + if (size & len_mask) + return false; + do { + size_t len = bvec->bv_len; + + if (len > size) + len = size; + if ((unsigned long)(bvec->bv_offset + skip) & addr_mask) + return false; + bvec++; + size -= len; + skip = 0; + } while (size); + + return true; +} + static void -nfs_local_iter_init(struct iov_iter *i, struct nfs_local_kiocb *iocb, int dir) +nfs_local_iter_setup(struct iov_iter *iter, int rw, struct bio_vec *bvec, + unsigned int nvecs, unsigned long total, + size_t start, size_t len) +{ + iov_iter_bvec(iter, rw, bvec, nvecs, total); + if (start) + iov_iter_advance(iter, start); + iov_iter_truncate(iter, len); +} + +/* + * Setup as many as 3 iov_iter based on extents described by @local_dio. + * Returns the number of iov_iter that were setup. + */ +static int +nfs_local_iters_setup_dio(struct nfs_local_kiocb *iocb, int rw, + unsigned int nvecs, unsigned long total, + struct nfs_local_dio *local_dio) +{ + int n_iters = 0; + struct iov_iter *iters = iocb->iters; + + /* Setup misaligned start? */ + if (local_dio->start_len) { + nfs_local_iter_setup(&iters[n_iters], rw, iocb->bvec, + nvecs, total, 0, local_dio->start_len); + ++n_iters; + } + + /* + * Setup DIO-aligned middle, if there is no misaligned end (below) + * then AIO completion is used, see nfs_local_call_{read,write} + */ + nfs_local_iter_setup(&iters[n_iters], rw, iocb->bvec, nvecs, + total, local_dio->start_len, local_dio->middle_len); + + iocb->iter_is_dio_aligned[n_iters] = + nfs_iov_iter_aligned_bvec(&iters[n_iters], + local_dio->mem_align-1, local_dio->offset_align-1); + + if (unlikely(!iocb->iter_is_dio_aligned[n_iters])) { + trace_nfs_local_dio_misaligned(iocb->hdr->inode, + local_dio->start_len, local_dio->middle_len, local_dio); + return 0; /* no DIO-aligned IO possible */ + } + iocb->end_iter_index = n_iters; + ++n_iters; + + /* Setup misaligned end? */ + if (local_dio->end_len) { + nfs_local_iter_setup(&iters[n_iters], rw, iocb->bvec, + nvecs, total, local_dio->start_len + + local_dio->middle_len, local_dio->end_len); + iocb->end_iter_index = n_iters; + ++n_iters; + } + + atomic_set(&iocb->n_iters, n_iters); + return n_iters; +} + +static noinline_for_stack void +nfs_local_iters_init(struct nfs_local_kiocb *iocb, int rw) { struct nfs_pgio_header *hdr = iocb->hdr; + struct page **pagevec = hdr->page_array.pagevec; + unsigned long v, total; + unsigned int base; + size_t len; + + v = 0; + total = hdr->args.count; + base = hdr->args.pgbase; + while (total && v < hdr->page_array.npages) { + len = min_t(size_t, total, PAGE_SIZE - base); + bvec_set_page(&iocb->bvec[v], *pagevec, len, base); + total -= len; + ++pagevec; + ++v; + base = 0; + } + len = hdr->args.count - total; - iov_iter_bvec(i, dir, iocb->bvec, hdr->page_array.npages, - hdr->args.count + hdr->args.pgbase); - if (hdr->args.pgbase != 0) - iov_iter_advance(i, hdr->args.pgbase); + /* + * For each iocb, iocb->n_iters is always at least 1 and we always + * end io after first nfs_local_pgio_done call unless misaligned DIO. + */ + atomic_set(&iocb->n_iters, 1); + + if (test_bit(NFS_IOHDR_ODIRECT, &hdr->flags)) { + struct nfs_local_dio local_dio; + + if (nfs_is_local_dio_possible(iocb, rw, len, &local_dio) && + nfs_local_iters_setup_dio(iocb, rw, v, len, &local_dio) != 0) { + /* Ensure DIO WRITE's IO on stable storage upon completion */ + if (rw == ITER_SOURCE) + iocb->kiocb.ki_flags |= IOCB_DSYNC|IOCB_SYNC; + return; /* is DIO-aligned */ + } + } + + /* Use buffered IO */ + iov_iter_bvec(&iocb->iters[0], rw, iocb->bvec, v, len); } static void @@ -380,17 +510,34 @@ nfs_local_pgio_init(struct nfs_pgio_header *hdr, hdr->task.tk_start = ktime_get(); } -static void -nfs_local_pgio_done(struct nfs_pgio_header *hdr, long status) +static bool +nfs_local_pgio_done(struct nfs_local_kiocb *iocb, long status, bool force) { + struct nfs_pgio_header *hdr = iocb->hdr; + + /* Must handle partial completions */ if (status >= 0) { - hdr->res.count = status; - hdr->res.op_status = NFS4_OK; - hdr->task.tk_status = 0; + hdr->res.count += status; + /* @hdr was initialized to 0 (zeroed during allocation) */ + if (hdr->task.tk_status == 0) + hdr->res.op_status = NFS4_OK; } else { hdr->res.op_status = nfs_localio_errno_to_nfs4_stat(status); hdr->task.tk_status = status; } + + if (force) + return true; + + BUG_ON(atomic_read(&iocb->n_iters) <= 0); + return atomic_dec_and_test(&iocb->n_iters); +} + +static void +nfs_local_iocb_release(struct nfs_local_kiocb *iocb) +{ + nfs_local_file_put(iocb->localio); + nfs_local_iocb_free(iocb); } static void @@ -398,8 +545,7 @@ nfs_local_pgio_release(struct nfs_local_kiocb *iocb) { struct nfs_pgio_header *hdr = iocb->hdr; - nfs_local_file_put(iocb->localio); - nfs_local_iocb_free(iocb); + nfs_local_iocb_release(iocb); nfs_local_hdr_release(hdr, hdr->task.tk_ops); } @@ -415,13 +561,16 @@ static inline void nfs_local_pgio_aio_complete(struct nfs_local_kiocb *iocb) queue_work(nfsiod_workqueue, &iocb->work); } -static void -nfs_local_read_done(struct nfs_local_kiocb *iocb, long status) +static void nfs_local_read_done(struct nfs_local_kiocb *iocb) { struct nfs_pgio_header *hdr = iocb->hdr; struct file *filp = iocb->kiocb.ki_filp; + long status = hdr->task.tk_status; - nfs_local_pgio_done(hdr, status); + if ((iocb->kiocb.ki_flags & IOCB_DIRECT) && status == -EINVAL) { + /* Underlying FS will return -EINVAL if misaligned DIO is attempted. */ + pr_info_ratelimited("nfs: Unexpected direct I/O read alignment failure\n"); + } /* * Must clear replen otherwise NFSv3 data corruption will occur @@ -429,20 +578,27 @@ nfs_local_read_done(struct nfs_local_kiocb *iocb, long status) */ hdr->res.replen = 0; - if (hdr->res.count != hdr->args.count || - hdr->args.offset + hdr->res.count >= i_size_read(file_inode(filp))) + /* nfs_readpage_result() handles short read */ + + if (hdr->args.offset + hdr->res.count >= i_size_read(file_inode(filp))) hdr->res.eof = true; dprintk("%s: read %ld bytes eof %d.\n", __func__, status > 0 ? status : 0, hdr->res.eof); } +static inline void nfs_local_read_iocb_done(struct nfs_local_kiocb *iocb) +{ + nfs_local_read_done(iocb); + nfs_local_pgio_release(iocb); +} + static void nfs_local_read_aio_complete_work(struct work_struct *work) { struct nfs_local_kiocb *iocb = container_of(work, struct nfs_local_kiocb, work); - nfs_local_pgio_release(iocb); + nfs_local_read_iocb_done(iocb); } static void nfs_local_read_aio_complete(struct kiocb *kiocb, long ret) @@ -450,7 +606,10 @@ static void nfs_local_read_aio_complete(struct kiocb *kiocb, long ret) struct nfs_local_kiocb *iocb = container_of(kiocb, struct nfs_local_kiocb, kiocb); - nfs_local_read_done(iocb, ret); + /* AIO completion of DIO read should always be last to complete */ + if (unlikely(!nfs_local_pgio_done(iocb, ret, false))) + return; + nfs_local_pgio_aio_complete(iocb); /* Calls nfs_local_read_aio_complete_work */ } @@ -459,51 +618,48 @@ static void nfs_local_call_read(struct work_struct *work) struct nfs_local_kiocb *iocb = container_of(work, struct nfs_local_kiocb, work); struct file *filp = iocb->kiocb.ki_filp; - const struct cred *save_cred; - struct iov_iter iter; + bool force_done = false; ssize_t status; - - save_cred = override_creds(filp->f_cred); - - nfs_local_iter_init(&iter, iocb, READ); - - status = filp->f_op->read_iter(&iocb->kiocb, &iter); - if (status != -EIOCBQUEUED) { - nfs_local_read_done(iocb, status); - nfs_local_pgio_release(iocb); + int n_iters; + + n_iters = atomic_read(&iocb->n_iters); + for (int i = 0; i < n_iters ; i++) { + if (iocb->iter_is_dio_aligned[i]) { + iocb->kiocb.ki_flags |= IOCB_DIRECT; + /* Only use AIO completion if DIO-aligned segment is last */ + if (i == iocb->end_iter_index) { + iocb->kiocb.ki_complete = nfs_local_read_aio_complete; + iocb->aio_complete_work = nfs_local_read_aio_complete_work; + } + } else + iocb->kiocb.ki_flags &= ~IOCB_DIRECT; + + scoped_with_creds(filp->f_cred) + status = filp->f_op->read_iter(&iocb->kiocb, &iocb->iters[i]); + + if (status != -EIOCBQUEUED) { + if (unlikely(status >= 0 && status < iocb->iters[i].count)) + force_done = true; /* Partial read */ + if (nfs_local_pgio_done(iocb, status, force_done)) { + nfs_local_read_iocb_done(iocb); + break; + } + } } - - revert_creds(save_cred); } static int -nfs_do_local_read(struct nfs_pgio_header *hdr, - struct nfsd_file *localio, +nfs_local_do_read(struct nfs_local_kiocb *iocb, const struct rpc_call_ops *call_ops) { - struct nfs_local_kiocb *iocb; - struct file *file = nfs_to->nfsd_file_file(localio); - - /* Don't support filesystems without read_iter */ - if (!file->f_op->read_iter) - return -EAGAIN; + struct nfs_pgio_header *hdr = iocb->hdr; dprintk("%s: vfs_read count=%u pos=%llu\n", __func__, hdr->args.count, hdr->args.offset); - iocb = nfs_local_iocb_alloc(hdr, file, GFP_KERNEL); - if (iocb == NULL) - return -ENOMEM; - iocb->localio = localio; - nfs_local_pgio_init(hdr, call_ops); hdr->res.eof = false; - if (iocb->kiocb.ki_flags & IOCB_DIRECT) { - iocb->kiocb.ki_complete = nfs_local_read_aio_complete; - iocb->aio_complete_work = nfs_local_read_aio_complete_work; - } - INIT_WORK(&iocb->work, nfs_local_call_read); queue_work(nfslocaliod_workqueue, &iocb->work); @@ -515,14 +671,13 @@ nfs_copy_boot_verifier(struct nfs_write_verifier *verifier, struct inode *inode) { struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; u32 *verf = (u32 *)verifier->data; - int seq = 0; + unsigned int seq; do { - read_seqbegin_or_lock(&clp->cl_boot_lock, &seq); + seq = read_seqbegin(&clp->cl_boot_lock); verf[0] = (u32)clp->cl_nfssvc_boot.tv_sec; verf[1] = (u32)clp->cl_nfssvc_boot.tv_nsec; - } while (need_seqretry(&clp->cl_boot_lock, seq)); - done_seqretry(&clp->cl_boot_lock, seq); + } while (read_seqretry(&clp->cl_boot_lock, seq)); } static void @@ -545,7 +700,7 @@ nfs_set_local_verifier(struct inode *inode, } /* Factored out from fs/nfsd/vfs.h:fh_getattr() */ -static int __vfs_getattr(struct path *p, struct kstat *stat, int version) +static int __vfs_getattr(const struct path *p, struct kstat *stat, int version) { u32 request_mask = STATX_BASIC_STATS; @@ -605,15 +760,20 @@ static void nfs_local_vfs_getattr(struct nfs_local_kiocb *iocb) fattr->du.nfs3.used = stat.blocks << 9; } -static void -nfs_local_write_done(struct nfs_local_kiocb *iocb, long status) +static void nfs_local_write_done(struct nfs_local_kiocb *iocb) { struct nfs_pgio_header *hdr = iocb->hdr; - struct inode *inode = hdr->inode; + long status = hdr->task.tk_status; dprintk("%s: wrote %ld bytes.\n", __func__, status > 0 ? status : 0); + if ((iocb->kiocb.ki_flags & IOCB_DIRECT) && status == -EINVAL) { + /* Underlying FS will return -EINVAL if misaligned DIO is attempted. */ + pr_info_ratelimited("nfs: Unexpected direct I/O write alignment failure\n"); + } + /* Handle short writes as if they are ENOSPC */ + status = hdr->res.count; if (status > 0 && status < hdr->args.count) { hdr->mds_offset += status; hdr->args.offset += status; @@ -621,11 +781,18 @@ nfs_local_write_done(struct nfs_local_kiocb *iocb, long status) hdr->args.count -= status; nfs_set_pgio_error(hdr, -ENOSPC, hdr->args.offset); status = -ENOSPC; + /* record -ENOSPC in terms of nfs_local_pgio_done */ + (void) nfs_local_pgio_done(iocb, status, true); } - if (status < 0) - nfs_reset_boot_verifier(inode); + if (hdr->task.tk_status < 0) + nfs_reset_boot_verifier(hdr->inode); +} - nfs_local_pgio_done(hdr, status); +static inline void nfs_local_write_iocb_done(struct nfs_local_kiocb *iocb) +{ + nfs_local_write_done(iocb); + nfs_local_vfs_getattr(iocb); + nfs_local_pgio_release(iocb); } static void nfs_local_write_aio_complete_work(struct work_struct *work) @@ -633,8 +800,7 @@ static void nfs_local_write_aio_complete_work(struct work_struct *work) struct nfs_local_kiocb *iocb = container_of(work, struct nfs_local_kiocb, work); - nfs_local_vfs_getattr(iocb); - nfs_local_pgio_release(iocb); + nfs_local_write_iocb_done(iocb); } static void nfs_local_write_aio_complete(struct kiocb *kiocb, long ret) @@ -642,7 +808,10 @@ static void nfs_local_write_aio_complete(struct kiocb *kiocb, long ret) struct nfs_local_kiocb *iocb = container_of(kiocb, struct nfs_local_kiocb, kiocb); - nfs_local_write_done(iocb, ret); + /* AIO completion of DIO write should always be last to complete */ + if (unlikely(!nfs_local_pgio_done(iocb, ret, false))) + return; + nfs_local_pgio_aio_complete(iocb); /* Calls nfs_local_write_aio_complete_work */ } @@ -652,49 +821,52 @@ static void nfs_local_call_write(struct work_struct *work) container_of(work, struct nfs_local_kiocb, work); struct file *filp = iocb->kiocb.ki_filp; unsigned long old_flags = current->flags; - const struct cred *save_cred; - struct iov_iter iter; + bool force_done = false; ssize_t status; + int n_iters; current->flags |= PF_LOCAL_THROTTLE | PF_MEMALLOC_NOIO; - save_cred = override_creds(filp->f_cred); - - nfs_local_iter_init(&iter, iocb, WRITE); file_start_write(filp); - status = filp->f_op->write_iter(&iocb->kiocb, &iter); - file_end_write(filp); - if (status != -EIOCBQUEUED) { - nfs_local_write_done(iocb, status); - nfs_local_vfs_getattr(iocb); - nfs_local_pgio_release(iocb); + n_iters = atomic_read(&iocb->n_iters); + for (int i = 0; i < n_iters ; i++) { + if (iocb->iter_is_dio_aligned[i]) { + iocb->kiocb.ki_flags |= IOCB_DIRECT; + /* Only use AIO completion if DIO-aligned segment is last */ + if (i == iocb->end_iter_index) { + iocb->kiocb.ki_complete = nfs_local_write_aio_complete; + iocb->aio_complete_work = nfs_local_write_aio_complete_work; + } + } else + iocb->kiocb.ki_flags &= ~IOCB_DIRECT; + + scoped_with_creds(filp->f_cred) + status = filp->f_op->write_iter(&iocb->kiocb, &iocb->iters[i]); + + if (status != -EIOCBQUEUED) { + if (unlikely(status >= 0 && status < iocb->iters[i].count)) + force_done = true; /* Partial write */ + if (nfs_local_pgio_done(iocb, status, force_done)) { + nfs_local_write_iocb_done(iocb); + break; + } + } } + file_end_write(filp); - revert_creds(save_cred); current->flags = old_flags; } static int -nfs_do_local_write(struct nfs_pgio_header *hdr, - struct nfsd_file *localio, +nfs_local_do_write(struct nfs_local_kiocb *iocb, const struct rpc_call_ops *call_ops) { - struct nfs_local_kiocb *iocb; - struct file *file = nfs_to->nfsd_file_file(localio); - - /* Don't support filesystems without write_iter */ - if (!file->f_op->write_iter) - return -EAGAIN; + struct nfs_pgio_header *hdr = iocb->hdr; dprintk("%s: vfs_write count=%u pos=%llu %s\n", __func__, hdr->args.count, hdr->args.offset, (hdr->args.stable == NFS_UNSTABLE) ? "unstable" : "stable"); - iocb = nfs_local_iocb_alloc(hdr, file, GFP_NOIO); - if (iocb == NULL) - return -ENOMEM; - iocb->localio = localio; - switch (hdr->args.stable) { default: break; @@ -709,43 +881,74 @@ nfs_do_local_write(struct nfs_pgio_header *hdr, nfs_set_local_verifier(hdr->inode, hdr->res.verf, hdr->args.stable); - if (iocb->kiocb.ki_flags & IOCB_DIRECT) { - iocb->kiocb.ki_complete = nfs_local_write_aio_complete; - iocb->aio_complete_work = nfs_local_write_aio_complete_work; - } - INIT_WORK(&iocb->work, nfs_local_call_write); queue_work(nfslocaliod_workqueue, &iocb->work); return 0; } +static struct nfs_local_kiocb * +nfs_local_iocb_init(struct nfs_pgio_header *hdr, struct nfsd_file *localio) +{ + struct file *file = nfs_to->nfsd_file_file(localio); + struct nfs_local_kiocb *iocb; + gfp_t gfp_mask; + int rw; + + if (hdr->rw_mode & FMODE_READ) { + if (!file->f_op->read_iter) + return ERR_PTR(-EOPNOTSUPP); + gfp_mask = GFP_KERNEL; + rw = ITER_DEST; + } else { + if (!file->f_op->write_iter) + return ERR_PTR(-EOPNOTSUPP); + gfp_mask = GFP_NOIO; + rw = ITER_SOURCE; + } + + iocb = nfs_local_iocb_alloc(hdr, file, gfp_mask); + if (iocb == NULL) + return ERR_PTR(-ENOMEM); + iocb->hdr = hdr; + iocb->localio = localio; + + nfs_local_iters_init(iocb, rw); + + return iocb; +} + int nfs_local_doio(struct nfs_client *clp, struct nfsd_file *localio, struct nfs_pgio_header *hdr, const struct rpc_call_ops *call_ops) { + struct nfs_local_kiocb *iocb; int status = 0; if (!hdr->args.count) return 0; + iocb = nfs_local_iocb_init(hdr, localio); + if (IS_ERR(iocb)) + return PTR_ERR(iocb); + switch (hdr->rw_mode) { case FMODE_READ: - status = nfs_do_local_read(hdr, localio, call_ops); + status = nfs_local_do_read(iocb, call_ops); break; case FMODE_WRITE: - status = nfs_do_local_write(hdr, localio, call_ops); + status = nfs_local_do_write(iocb, call_ops); break; default: dprintk("%s: invalid mode: %d\n", __func__, hdr->rw_mode); - status = -EINVAL; + status = -EOPNOTSUPP; } if (status != 0) { if (status == -EAGAIN) nfs_localio_disable_client(clp); - nfs_local_file_put(localio); + nfs_local_iocb_release(iocb); hdr->task.tk_status = status; nfs_local_hdr_release(hdr, call_ops); } diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c index 57c9dd700b58..db8dfb920394 100644 --- a/fs/nfs/mount_clnt.c +++ b/fs/nfs/mount_clnt.c @@ -223,74 +223,6 @@ out_mnt_err: goto out; } -/** - * nfs_umount - Notify a server that we have unmounted this export - * @info: pointer to umount request arguments - * - * MOUNTPROC_UMNT is advisory, so we set a short timeout, and always - * use UDP. - */ -void nfs_umount(const struct nfs_mount_request *info) -{ - static const struct rpc_timeout nfs_umnt_timeout = { - .to_initval = 1 * HZ, - .to_maxval = 3 * HZ, - .to_retries = 2, - }; - struct rpc_create_args args = { - .net = info->net, - .protocol = IPPROTO_UDP, - .address = (struct sockaddr *)info->sap, - .addrsize = info->salen, - .timeout = &nfs_umnt_timeout, - .servername = info->hostname, - .program = &mnt_program, - .version = info->version, - .authflavor = RPC_AUTH_UNIX, - .flags = RPC_CLNT_CREATE_NOPING, - .cred = current_cred(), - }; - struct rpc_message msg = { - .rpc_argp = info->dirpath, - }; - struct rpc_clnt *clnt; - int status; - - if (strlen(info->dirpath) > MNTPATHLEN) - return; - - if (info->noresvport) - args.flags |= RPC_CLNT_CREATE_NONPRIVPORT; - - clnt = rpc_create(&args); - if (IS_ERR(clnt)) - goto out_clnt_err; - - dprintk("NFS: sending UMNT request for %s:%s\n", - (info->hostname ? info->hostname : "server"), info->dirpath); - - if (info->version == NFS_MNT3_VERSION) - msg.rpc_proc = &clnt->cl_procinfo[MOUNTPROC3_UMNT]; - else - msg.rpc_proc = &clnt->cl_procinfo[MOUNTPROC_UMNT]; - - status = rpc_call_sync(clnt, &msg, 0); - rpc_shutdown_client(clnt); - - if (unlikely(status < 0)) - goto out_call_err; - - return; - -out_clnt_err: - dprintk("NFS: failed to create UMNT RPC client, status=%ld\n", - PTR_ERR(clnt)); - return; - -out_call_err: - dprintk("NFS: UMNT request failed, status=%d\n", status); -} - /* * XDR encode/decode functions for MOUNT */ diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c index 973aed9cc5fe..af9be0c5f516 100644 --- a/fs/nfs/namespace.c +++ b/fs/nfs/namespace.c @@ -149,6 +149,7 @@ struct vfsmount *nfs_d_automount(struct path *path) struct vfsmount *mnt = ERR_PTR(-ENOMEM); struct nfs_server *server = NFS_SB(path->dentry->d_sb); struct nfs_client *client = server->nfs_client; + unsigned long s_flags = path->dentry->d_sb->s_flags; int timeout = READ_ONCE(nfs_mountpoint_expiry_timeout); int ret; @@ -169,11 +170,21 @@ struct vfsmount *nfs_d_automount(struct path *path) if (!ctx->clone_data.fattr) goto out_fc; + if (fc->cred != server->cred) { + put_cred(fc->cred); + fc->cred = get_cred(server->cred); + } + if (fc->net_ns != client->cl_net) { put_net(fc->net_ns); fc->net_ns = get_net(client->cl_net); } + /* Inherit the flags covered by NFS_SB_MASK */ + fc->sb_flags_mask |= NFS_SB_MASK; + fc->sb_flags &= ~NFS_SB_MASK; + fc->sb_flags |= s_flags & NFS_SB_MASK; + /* for submounts we want the same server; referrals will reassign */ memcpy(&ctx->nfs_server._address, &client->cl_addr, client->cl_addrlen); ctx->nfs_server.addrlen = client->cl_addrlen; @@ -184,6 +195,10 @@ struct vfsmount *nfs_d_automount(struct path *path) ctx->nfs_mod = client->cl_nfs_mod; get_nfs_version(ctx->nfs_mod); + /* Inherit block sizes if they were specified as mount parameters */ + if (server->automount_inherit & NFS_AUTOMOUNT_INHERIT_BSIZE) + ctx->bsize = server->bsize; + ret = client->rpc_ops->submount(fc, server); if (ret < 0) { mnt = ERR_PTR(ret); @@ -195,7 +210,6 @@ struct vfsmount *nfs_d_automount(struct path *path) if (IS_ERR(mnt)) goto out_fc; - mntget(mnt); /* prevent immediate expiration */ if (timeout <= 0) goto out_fc; @@ -284,14 +298,14 @@ int nfs_do_submount(struct fs_context *fc) return -ENOMEM; ctx->internal = true; - ctx->clone_data.inherited_bsize = ctx->clone_data.sb->s_blocksize_bits; p = nfs_devname(dentry, buffer, 4096); if (IS_ERR(p)) { nfs_errorf(fc, "NFS: Couldn't determine submount pathname"); ret = PTR_ERR(p); } else { - ret = vfs_parse_fs_string(fc, "source", p, buffer + 4096 - p); + ret = vfs_parse_fs_qstr(fc, "source", + &QSTR_LEN(p, buffer + 4096 - p)); if (!ret) ret = vfs_get_tree(fc); } @@ -336,7 +350,7 @@ static int param_set_nfs_timeout(const char *val, const struct kernel_param *kp) num *= HZ; *((int *)kp->arg) = num; if (!list_empty(&nfs_automount_list)) - mod_delayed_work(system_wq, &nfs_automount_task, num); + mod_delayed_work(system_percpu_wq, &nfs_automount_task, num); } else { *((int *)kp->arg) = -1*HZ; cancel_delayed_work(&nfs_automount_task); diff --git a/fs/nfs/netns.h b/fs/nfs/netns.h index a68b21603ea9..6ba3ea39e928 100644 --- a/fs/nfs/netns.h +++ b/fs/nfs/netns.h @@ -31,7 +31,11 @@ struct nfs_net { unsigned short nfs_callback_tcpport; unsigned short nfs_callback_tcpport6; int cb_users[NFS4_MAX_MINOR_VERSION + 1]; -#endif +#endif /* CONFIG_NFS_V4 */ +#if IS_ENABLED(CONFIG_NFS_V4_1) + struct list_head nfs4_data_server_cache; + spinlock_t nfs4_data_server_lock; +#endif /* CONFIG_NFS_V4_1 */ struct nfs_netns_client *nfs_client; spinlock_t nfs_client_lock; ktime_t boot_time; diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c index 6e75c6c2d234..9eff09158518 100644 --- a/fs/nfs/nfs2xdr.c +++ b/fs/nfs/nfs2xdr.c @@ -23,8 +23,8 @@ #include <linux/nfs2.h> #include <linux/nfs_fs.h> #include <linux/nfs_common.h> -#include "nfstrace.h" #include "internal.h" +#include "nfstrace.h" #define NFSDBG_FACILITY NFSDBG_XDR diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c index 18d8f6529f61..a126eb31f62f 100644 --- a/fs/nfs/nfs3acl.c +++ b/fs/nfs/nfs3acl.c @@ -104,7 +104,7 @@ struct posix_acl *nfs3_get_acl(struct inode *inode, int type, bool rcu) switch (status) { case 0: - status = nfs_refresh_inode(inode, res.fattr); + nfs_refresh_inode(inode, res.fattr); break; case -EPFNOSUPPORT: case -EPROTONOSUPPORT: diff --git a/fs/nfs/nfs3client.c b/fs/nfs/nfs3client.c index 0d7310c1ee0c..5d97c1d38bb6 100644 --- a/fs/nfs/nfs3client.c +++ b/fs/nfs/nfs3client.c @@ -2,6 +2,7 @@ #include <linux/nfs_fs.h> #include <linux/nfs_mount.h> #include <linux/sunrpc/addr.h> +#include <net/handshake.h> #include "internal.h" #include "nfs3_fs.h" #include "netns.h" @@ -98,7 +99,11 @@ struct nfs_client *nfs3_set_ds_client(struct nfs_server *mds_srv, .net = mds_clp->cl_net, .timeparms = &ds_timeout, .cred = mds_srv->cred, - .xprtsec = mds_clp->cl_xprtsec, + .xprtsec = { + .policy = RPC_XPRTSEC_NONE, + .cert_serial = TLS_NO_CERT, + .privkey_serial = TLS_NO_PRIVKEY, + }, .connect_timeout = connect_timeout, .reconnect_timeout = connect_timeout, }; @@ -111,9 +116,14 @@ struct nfs_client *nfs3_set_ds_client(struct nfs_server *mds_srv, cl_init.hostname = buf; switch (ds_proto) { + case XPRT_TRANSPORT_TCP_TLS: + if (mds_clp->cl_xprtsec.policy != RPC_XPRTSEC_NONE) + cl_init.xprtsec = mds_clp->cl_xprtsec; + else + ds_proto = XPRT_TRANSPORT_TCP; + fallthrough; case XPRT_TRANSPORT_RDMA: case XPRT_TRANSPORT_TCP: - case XPRT_TRANSPORT_TCP_TLS: if (mds_clp->cl_nconnect > 1) cl_init.nconnect = mds_clp->cl_nconnect; } diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index a4cb67573aa7..1181f9cc6dbd 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -483,7 +483,8 @@ nfs3_proc_unlink_done(struct rpc_task *task, struct inode *dir) static void nfs3_proc_rename_setup(struct rpc_message *msg, struct dentry *old_dentry, - struct dentry *new_dentry) + struct dentry *new_dentry, + struct inode *same_parent) { msg->rpc_proc = &nfs3_procedures[NFS3PROC_RENAME]; } diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c index 4ae01c10b7e2..e17d72908412 100644 --- a/fs/nfs/nfs3xdr.c +++ b/fs/nfs/nfs3xdr.c @@ -23,8 +23,8 @@ #include <linux/nfsacl.h> #include <linux/nfs_common.h> -#include "nfstrace.h" #include "internal.h" +#include "nfstrace.h" #define NFSDBG_FACILITY NFSDBG_XDR diff --git a/fs/nfs/nfs42.h b/fs/nfs/nfs42.h index 0282d93c8bcc..aafd15a4afce 100644 --- a/fs/nfs/nfs42.h +++ b/fs/nfs/nfs42.h @@ -21,6 +21,7 @@ int nfs42_proc_allocate(struct file *, loff_t, loff_t); ssize_t nfs42_proc_copy(struct file *, loff_t, struct file *, loff_t, size_t, struct nl4_server *, nfs4_stateid *, bool); int nfs42_proc_deallocate(struct file *, loff_t, loff_t); +int nfs42_proc_zero_range(struct file *, loff_t, loff_t); loff_t nfs42_proc_llseek(struct file *, loff_t, int); int nfs42_proc_layoutstats_generic(struct nfs_server *, struct nfs42_layoutstat_data *); diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index 5cf52ece96ac..d537fb0c230e 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -114,6 +114,7 @@ static int nfs42_proc_fallocate(struct rpc_message *msg, struct file *filep, exception.inode = inode; exception.state = lock->open_context->state; + nfs_file_block_o_direct(NFS_I(inode)); err = nfs_sync_inode(inode); if (err) goto out; @@ -137,6 +138,7 @@ int nfs42_proc_allocate(struct file *filep, loff_t offset, loff_t len) .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_ALLOCATE], }; struct inode *inode = file_inode(filep); + loff_t oldsize = i_size_read(inode); int err; if (!nfs_server_capable(inode, NFS_CAP_ALLOCATE)) @@ -145,8 +147,13 @@ int nfs42_proc_allocate(struct file *filep, loff_t offset, loff_t len) inode_lock(inode); err = nfs42_proc_fallocate(&msg, filep, offset, len); - if (err == -EOPNOTSUPP) - NFS_SERVER(inode)->caps &= ~NFS_CAP_ALLOCATE; + + if (err == 0) + nfs_truncate_last_folio(inode->i_mapping, oldsize, + offset + len); + else if (err == -EOPNOTSUPP) + NFS_SERVER(inode)->caps &= ~(NFS_CAP_ALLOCATE | + NFS_CAP_ZERO_RANGE); inode_unlock(inode); return err; @@ -169,7 +176,34 @@ int nfs42_proc_deallocate(struct file *filep, loff_t offset, loff_t len) if (err == 0) truncate_pagecache_range(inode, offset, (offset + len) -1); if (err == -EOPNOTSUPP) - NFS_SERVER(inode)->caps &= ~NFS_CAP_DEALLOCATE; + NFS_SERVER(inode)->caps &= ~(NFS_CAP_DEALLOCATE | + NFS_CAP_ZERO_RANGE); + + inode_unlock(inode); + return err; +} + +int nfs42_proc_zero_range(struct file *filep, loff_t offset, loff_t len) +{ + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_ZERO_RANGE], + }; + struct inode *inode = file_inode(filep); + loff_t oldsize = i_size_read(inode); + int err; + + if (!nfs_server_capable(inode, NFS_CAP_ZERO_RANGE)) + return -EOPNOTSUPP; + + inode_lock(inode); + + err = nfs42_proc_fallocate(&msg, filep, offset, len); + if (err == 0) { + nfs_truncate_last_folio(inode->i_mapping, oldsize, + offset + len); + truncate_pagecache_range(inode, offset, (offset + len) -1); + } else if (err == -EOPNOTSUPP) + NFS_SERVER(inode)->caps &= ~NFS_CAP_ZERO_RANGE; inode_unlock(inode); return err; @@ -329,22 +363,27 @@ out: /** * nfs42_copy_dest_done - perform inode cache updates after clone/copy offload - * @inode: pointer to destination inode + * @file: pointer to destination file * @pos: destination offset * @len: copy length + * @oldsize: length of the file prior to clone/copy * * Punch a hole in the inode page cache, so that the NFS client will * know to retrieve new data. * Update the file size if necessary, and then mark the inode as having * invalid cached values for change attribute, ctime, mtime and space used. */ -static void nfs42_copy_dest_done(struct inode *inode, loff_t pos, loff_t len) +static void nfs42_copy_dest_done(struct file *file, loff_t pos, loff_t len, + loff_t oldsize) { + struct inode *inode = file_inode(file); + struct address_space *mapping = file->f_mapping; loff_t newsize = pos + len; loff_t end = newsize - 1; - WARN_ON_ONCE(invalidate_inode_pages2_range(inode->i_mapping, - pos >> PAGE_SHIFT, end >> PAGE_SHIFT)); + nfs_truncate_last_folio(mapping, oldsize, pos); + WARN_ON_ONCE(invalidate_inode_pages2_range(mapping, pos >> PAGE_SHIFT, + end >> PAGE_SHIFT)); spin_lock(&inode->i_lock); if (newsize > i_size_read(inode)) @@ -377,6 +416,7 @@ static ssize_t _nfs42_proc_copy(struct file *src, struct nfs_server *src_server = NFS_SERVER(src_inode); loff_t pos_src = args->src_pos; loff_t pos_dst = args->dst_pos; + loff_t oldsize_dst = i_size_read(dst_inode); size_t count = args->count; ssize_t status; @@ -405,6 +445,7 @@ static ssize_t _nfs42_proc_copy(struct file *src, return status; } + nfs_file_block_o_direct(NFS_I(dst_inode)); status = nfs_sync_inode(dst_inode); if (status) return status; @@ -450,7 +491,7 @@ static ssize_t _nfs42_proc_copy(struct file *src, goto out; } - nfs42_copy_dest_done(dst_inode, pos_dst, res->write_res.count); + nfs42_copy_dest_done(dst, pos_dst, res->write_res.count, oldsize_dst); nfs_invalidate_atime(src_inode); status = res->write_res.count; out: @@ -1217,6 +1258,7 @@ static int _nfs42_proc_clone(struct rpc_message *msg, struct file *src_f, struct nfs42_clone_res res = { .server = server, }; + loff_t oldsize_dst = i_size_read(dst_inode); int status; msg->rpc_argp = &args; @@ -1251,7 +1293,7 @@ static int _nfs42_proc_clone(struct rpc_message *msg, struct file *src_f, /* a zero-length count means clone to EOF in src */ if (count == 0 && res.dst_fattr->valid & NFS_ATTR_FATTR_SIZE) count = nfs_size_to_loff_t(res.dst_fattr->size) - dst_offset; - nfs42_copy_dest_done(dst_inode, dst_offset, count); + nfs42_copy_dest_done(dst_f, dst_offset, count, oldsize_dst); status = nfs_post_op_update_inode(dst_inode, res.dst_fattr); } @@ -1472,7 +1514,7 @@ static ssize_t _nfs42_proc_listxattrs(struct inode *inode, void *buf, ret = -ENOMEM; - res.scratch = alloc_page(GFP_KERNEL); + res.scratch = folio_alloc(GFP_KERNEL, 0); if (!res.scratch) goto out; @@ -1510,7 +1552,7 @@ out_free_pages: } kfree(pages); out_free_scratch: - __free_page(res.scratch); + folio_put(res.scratch); out: return ret; diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c index b1b663468249..e10d83ba835e 100644 --- a/fs/nfs/nfs42xdr.c +++ b/fs/nfs/nfs42xdr.c @@ -174,6 +174,18 @@ decode_putfh_maxsz + \ decode_deallocate_maxsz + \ decode_getattr_maxsz) +#define NFS4_enc_zero_range_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + encode_deallocate_maxsz + \ + encode_allocate_maxsz + \ + encode_getattr_maxsz) +#define NFS4_dec_zero_range_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_deallocate_maxsz + \ + decode_allocate_maxsz + \ + decode_getattr_maxsz) #define NFS4_enc_read_plus_sz (compound_encode_hdr_maxsz + \ encode_sequence_maxsz + \ encode_putfh_maxsz + \ @@ -649,6 +661,27 @@ static void nfs4_xdr_enc_deallocate(struct rpc_rqst *req, } /* + * Encode ZERO_RANGE request + */ +static void nfs4_xdr_enc_zero_range(struct rpc_rqst *req, + struct xdr_stream *xdr, + const void *data) +{ + const struct nfs42_falloc_args *args = data; + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->falloc_fh, &hdr); + encode_deallocate(xdr, args, &hdr); + encode_allocate(xdr, args, &hdr); + encode_getfattr(xdr, args->falloc_bitmask, &hdr); + encode_nops(&hdr); +} + +/* * Encode READ_PLUS request */ static void nfs4_xdr_enc_read_plus(struct rpc_rqst *req, @@ -1511,6 +1544,37 @@ out: } /* + * Decode ZERO_RANGE request + */ +static int nfs4_xdr_dec_zero_range(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + void *data) +{ + struct nfs42_falloc_res *res = data; + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_deallocate(xdr, res); + if (status) + goto out; + status = decode_allocate(xdr, res); + if (status) + goto out; + decode_getfattr(xdr, res->falloc_fattr, res->falloc_server); +out: + return status; +} + +/* * Decode READ_PLUS request */ static int nfs4_xdr_dec_read_plus(struct rpc_rqst *rqstp, @@ -1717,7 +1781,7 @@ static int nfs4_xdr_dec_listxattrs(struct rpc_rqst *rqstp, struct compound_hdr hdr; int status; - xdr_set_scratch_page(xdr, res->scratch); + xdr_set_scratch_folio(xdr, res->scratch); status = decode_compound_hdr(xdr, &hdr); if (status) diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 7d383d29a995..c34c89af9c7d 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -63,12 +63,11 @@ struct nfs4_minor_version_ops { bool (*match_stateid)(const nfs4_stateid *, const nfs4_stateid *); int (*find_root_sec)(struct nfs_server *, struct nfs_fh *, - struct nfs_fsinfo *); + struct nfs_fattr *); void (*free_lock_state)(struct nfs_server *, struct nfs4_lock_state *); int (*test_and_free_expired)(struct nfs_server *, - const nfs4_stateid *, - const struct cred *); + nfs4_stateid *, const struct cred *); struct nfs_seqid * (*alloc_seqid)(struct nfs_seqid_counter *, gfp_t); void (*session_trunk)(struct rpc_clnt *clnt, @@ -297,7 +296,8 @@ extern int nfs4_call_sync(struct rpc_clnt *, struct nfs_server *, extern void nfs4_init_sequence(struct nfs4_sequence_args *, struct nfs4_sequence_res *, int, int); extern int nfs4_proc_setclientid(struct nfs_client *, u32, unsigned short, const struct cred *, struct nfs4_setclientid_res *); extern int nfs4_proc_setclientid_confirm(struct nfs_client *, struct nfs4_setclientid_res *arg, const struct cred *); -extern int nfs4_proc_get_rootfh(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *, bool); +extern int nfs4_proc_get_rootfh(struct nfs_server *, struct nfs_fh *, + struct nfs_fattr *, bool); extern int nfs4_proc_bind_conn_to_session(struct nfs_client *, const struct cred *cred); extern int nfs4_proc_exchange_id(struct nfs_client *clp, const struct cred *cred); extern int nfs4_destroy_clientid(struct nfs_client *clp); diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index 162c85a83a14..96bccefbe2cb 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -11,6 +11,7 @@ #include <linux/sunrpc/xprt.h> #include <linux/sunrpc/bc_xprt.h> #include <linux/sunrpc/rpc_pipe_fs.h> +#include <net/handshake.h> #include "internal.h" #include "callback.h" #include "delegation.h" @@ -222,6 +223,7 @@ struct nfs_client *nfs4_alloc_client(const struct nfs_client_initdata *cl_init) clp->cl_state = 1 << NFS4CLNT_LEASE_EXPIRED; clp->cl_mvops = nfs_v4_minor_ops[cl_init->minorversion]; clp->cl_mig_gen = 1; + clp->cl_last_renewal = jiffies; #if IS_ENABLED(CONFIG_NFS_V4_1) init_waitqueue_head(&clp->cl_lock_waitq); #endif @@ -279,8 +281,13 @@ error: */ static void nfs4_destroy_callback(struct nfs_client *clp) { - if (__test_and_clear_bit(NFS_CS_CALLBACK, &clp->cl_res_state)) - nfs_callback_down(clp->cl_mvops->minor_version, clp->cl_net); + if (__test_and_clear_bit(NFS_CS_CALLBACK, &clp->cl_res_state)) { + struct rpc_xprt *xprt; + + xprt = rcu_dereference_raw(clp->cl_rpcclient->cl_xprt); + nfs_callback_down(clp->cl_mvops->minor_version, clp->cl_net, + xprt); + } } static void nfs4_shutdown_client(struct nfs_client *clp) @@ -802,6 +809,7 @@ static void nfs4_destroy_server(struct nfs_server *server) unset_pnfs_layoutdriver(server); nfs4_purge_state_owners(server, &freeme); nfs4_free_state_owners(&freeme); + kfree(server->delegation_hash_table); } /* @@ -895,55 +903,40 @@ nfs4_find_client_sessionid(struct net *net, const struct sockaddr *addr, * Set up an NFS4 client */ static int nfs4_set_client(struct nfs_server *server, - const char *hostname, - const struct sockaddr_storage *addr, - const size_t addrlen, - const char *ip_addr, - int proto, const struct rpc_timeout *timeparms, - u32 minorversion, unsigned int nconnect, - unsigned int max_connect, - struct net *net, - struct xprtsec_parms *xprtsec) + struct nfs_client_initdata *cl_init) { - struct nfs_client_initdata cl_init = { - .hostname = hostname, - .addr = addr, - .addrlen = addrlen, - .ip_addr = ip_addr, - .nfs_mod = &nfs_v4, - .proto = proto, - .minorversion = minorversion, - .net = net, - .timeparms = timeparms, - .cred = server->cred, - .xprtsec = *xprtsec, - }; struct nfs_client *clp; - if (minorversion == 0) - __set_bit(NFS_CS_REUSEPORT, &cl_init.init_flags); - else - cl_init.max_connect = max_connect; - switch (proto) { + cl_init->nfs_mod = &nfs_v4; + cl_init->cred = server->cred; + + if (cl_init->minorversion == 0) { + __set_bit(NFS_CS_REUSEPORT, &cl_init->init_flags); + cl_init->max_connect = 0; + } + + switch (cl_init->proto) { case XPRT_TRANSPORT_RDMA: case XPRT_TRANSPORT_TCP: case XPRT_TRANSPORT_TCP_TLS: - cl_init.nconnect = nconnect; + break; + default: + cl_init->nconnect = 0; } if (server->flags & NFS_MOUNT_NORESVPORT) - __set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags); + __set_bit(NFS_CS_NORESVPORT, &cl_init->init_flags); if (server->options & NFS_OPTION_MIGRATION) - __set_bit(NFS_CS_MIGRATION, &cl_init.init_flags); + __set_bit(NFS_CS_MIGRATION, &cl_init->init_flags); if (test_bit(NFS_MIG_TSM_POSSIBLE, &server->mig_status)) - __set_bit(NFS_CS_TSM_POSSIBLE, &cl_init.init_flags); - server->port = rpc_get_port((struct sockaddr *)addr); + __set_bit(NFS_CS_TSM_POSSIBLE, &cl_init->init_flags); + server->port = rpc_get_port((struct sockaddr *)cl_init->addr); if (server->flags & NFS_MOUNT_NETUNREACH_FATAL) - __set_bit(NFS_CS_NETUNREACH_FATAL, &cl_init.init_flags); + __set_bit(NFS_CS_NETUNREACH_FATAL, &cl_init->init_flags); /* Allocate or find a client reference we can use */ - clp = nfs_get_client(&cl_init); + clp = nfs_get_client(cl_init); if (IS_ERR(clp)) return PTR_ERR(clp); @@ -996,7 +989,11 @@ struct nfs_client *nfs4_set_ds_client(struct nfs_server *mds_srv, .net = mds_clp->cl_net, .timeparms = &ds_timeout, .cred = mds_srv->cred, - .xprtsec = mds_srv->nfs_client->cl_xprtsec, + .xprtsec = { + .policy = RPC_XPRTSEC_NONE, + .cert_serial = TLS_NO_CERT, + .privkey_serial = TLS_NO_PRIVKEY, + }, }; char buf[INET6_ADDRSTRLEN + 1]; @@ -1005,9 +1002,14 @@ struct nfs_client *nfs4_set_ds_client(struct nfs_server *mds_srv, cl_init.hostname = buf; switch (ds_proto) { + case XPRT_TRANSPORT_TCP_TLS: + if (mds_srv->nfs_client->cl_xprtsec.policy != RPC_XPRTSEC_NONE) + cl_init.xprtsec = mds_srv->nfs_client->cl_xprtsec; + else + ds_proto = XPRT_TRANSPORT_TCP; + fallthrough; case XPRT_TRANSPORT_RDMA: case XPRT_TRANSPORT_TCP: - case XPRT_TRANSPORT_TCP_TLS: if (mds_clp->cl_nconnect > 1) { cl_init.nconnect = mds_clp->cl_nconnect; cl_init.max_connect = NFS_MAX_TRANSPORTS; @@ -1088,29 +1090,15 @@ static void nfs4_session_limit_xasize(struct nfs_server *server) #endif } -void nfs4_server_set_init_caps(struct nfs_server *server) -{ - /* Set the basic capabilities */ - server->caps |= server->nfs_client->cl_mvops->init_caps; - if (server->flags & NFS_MOUNT_NORDIRPLUS) - server->caps &= ~NFS_CAP_READDIRPLUS; - if (server->nfs_client->cl_proto == XPRT_TRANSPORT_RDMA) - server->caps &= ~NFS_CAP_READ_PLUS; - - /* - * Don't use NFS uid/gid mapping if we're using AUTH_SYS or lower - * authentication. - */ - if (nfs4_disable_idmapping && - server->client->cl_auth->au_flavor == RPC_AUTH_UNIX) - server->caps |= NFS_CAP_UIDGID_NOMAP; -} - static int nfs4_server_common_setup(struct nfs_server *server, struct nfs_fh *mntfh, bool auth_probe) { int error; + error = nfs4_delegation_hash_alloc(server); + if (error) + return error; + /* data servers support only a subset of NFSv4.1 */ if (is_ds_only_client(server->nfs_client)) return -EPROTONOSUPPORT; @@ -1118,14 +1106,14 @@ static int nfs4_server_common_setup(struct nfs_server *server, /* We must ensure the session is initialised first */ error = nfs4_init_session(server->nfs_client); if (error < 0) - goto out; + return error; - nfs4_server_set_init_caps(server); + nfs_server_set_init_caps(server); /* Probe the root fh to retrieve its FSID and filehandle */ error = nfs4_get_rootfh(server, mntfh, auth_probe); if (error < 0) - goto out; + return error; dprintk("Server FSID: %llx:%llx\n", (unsigned long long) server->fsid.major, @@ -1134,7 +1122,7 @@ static int nfs4_server_common_setup(struct nfs_server *server, error = nfs_probe_server(server, mntfh); if (error < 0) - goto out; + return error; nfs4_session_limit_rwsize(server); nfs4_session_limit_xasize(server); @@ -1145,8 +1133,7 @@ static int nfs4_server_common_setup(struct nfs_server *server, nfs_server_insert_lists(server); server->mount_time = jiffies; server->destroy = nfs4_destroy_server; -out: - return error; + return 0; } /* @@ -1156,6 +1143,19 @@ static int nfs4_init_server(struct nfs_server *server, struct fs_context *fc) { struct nfs_fs_context *ctx = nfs_fc2context(fc); struct rpc_timeout timeparms; + struct nfs_client_initdata cl_init = { + .hostname = ctx->nfs_server.hostname, + .addr = &ctx->nfs_server._address, + .addrlen = ctx->nfs_server.addrlen, + .ip_addr = ctx->client_address, + .proto = ctx->nfs_server.protocol, + .minorversion = ctx->minorversion, + .net = fc->net_ns, + .timeparms = &timeparms, + .xprtsec = ctx->xprtsec, + .nconnect = ctx->nfs_server.nconnect, + .max_connect = ctx->nfs_server.max_connect, + }; int error; nfs_init_timeout_values(&timeparms, ctx->nfs_server.protocol, @@ -1175,25 +1175,24 @@ static int nfs4_init_server(struct nfs_server *server, struct fs_context *fc) ctx->selected_flavor = RPC_AUTH_UNIX; /* Get a client record */ - error = nfs4_set_client(server, - ctx->nfs_server.hostname, - &ctx->nfs_server._address, - ctx->nfs_server.addrlen, - ctx->client_address, - ctx->nfs_server.protocol, - &timeparms, - ctx->minorversion, - ctx->nfs_server.nconnect, - ctx->nfs_server.max_connect, - fc->net_ns, - &ctx->xprtsec); + error = nfs4_set_client(server, &cl_init); if (error < 0) return error; - if (ctx->rsize) - server->rsize = nfs_io_size(ctx->rsize, server->nfs_client->cl_proto); - if (ctx->wsize) - server->wsize = nfs_io_size(ctx->wsize, server->nfs_client->cl_proto); + if (ctx->bsize) { + server->bsize = ctx->bsize; + server->automount_inherit |= NFS_AUTOMOUNT_INHERIT_BSIZE; + } + if (ctx->rsize) { + server->rsize = + nfs_io_size(ctx->rsize, server->nfs_client->cl_proto); + server->automount_inherit |= NFS_AUTOMOUNT_INHERIT_RSIZE; + } + if (ctx->wsize) { + server->wsize = + nfs_io_size(ctx->wsize, server->nfs_client->cl_proto); + server->automount_inherit |= NFS_AUTOMOUNT_INHERIT_WSIZE; + } server->acregmin = ctx->acregmin * HZ; server->acregmax = ctx->acregmax * HZ; @@ -1246,18 +1245,28 @@ error: struct nfs_server *nfs4_create_referral_server(struct fs_context *fc) { struct nfs_fs_context *ctx = nfs_fc2context(fc); - struct nfs_client *parent_client; - struct nfs_server *server, *parent_server; - int proto, error; + struct nfs_server *parent_server = NFS_SB(ctx->clone_data.sb); + struct nfs_client *parent_client = parent_server->nfs_client; + struct nfs_client_initdata cl_init = { + .hostname = ctx->nfs_server.hostname, + .addr = &ctx->nfs_server._address, + .addrlen = ctx->nfs_server.addrlen, + .ip_addr = parent_client->cl_ipaddr, + .minorversion = parent_client->cl_mvops->minor_version, + .net = parent_client->cl_net, + .timeparms = parent_server->client->cl_timeout, + .xprtsec = parent_client->cl_xprtsec, + .nconnect = parent_client->cl_nconnect, + .max_connect = parent_client->cl_max_connect, + }; + struct nfs_server *server; bool auth_probe; + int error; server = nfs_alloc_server(); if (!server) return ERR_PTR(-ENOMEM); - parent_server = NFS_SB(ctx->clone_data.sb); - parent_client = parent_server->nfs_client; - server->cred = get_cred(parent_server->cred); /* Initialise the client representation from the parent server */ @@ -1266,38 +1275,17 @@ struct nfs_server *nfs4_create_referral_server(struct fs_context *fc) /* Get a client representation */ #if IS_ENABLED(CONFIG_SUNRPC_XPRT_RDMA) rpc_set_port(&ctx->nfs_server.address, NFS_RDMA_PORT); - error = nfs4_set_client(server, - ctx->nfs_server.hostname, - &ctx->nfs_server._address, - ctx->nfs_server.addrlen, - parent_client->cl_ipaddr, - XPRT_TRANSPORT_RDMA, - parent_server->client->cl_timeout, - parent_client->cl_mvops->minor_version, - parent_client->cl_nconnect, - parent_client->cl_max_connect, - parent_client->cl_net, - &parent_client->cl_xprtsec); + cl_init.proto = XPRT_TRANSPORT_RDMA; + error = nfs4_set_client(server, &cl_init); if (!error) goto init_server; #endif /* IS_ENABLED(CONFIG_SUNRPC_XPRT_RDMA) */ - proto = XPRT_TRANSPORT_TCP; + cl_init.proto = XPRT_TRANSPORT_TCP; if (parent_client->cl_xprtsec.policy != RPC_XPRTSEC_NONE) - proto = XPRT_TRANSPORT_TCP_TLS; + cl_init.proto = XPRT_TRANSPORT_TCP_TLS; rpc_set_port(&ctx->nfs_server.address, NFS_PORT); - error = nfs4_set_client(server, - ctx->nfs_server.hostname, - &ctx->nfs_server._address, - ctx->nfs_server.addrlen, - parent_client->cl_ipaddr, - proto, - parent_server->client->cl_timeout, - parent_client->cl_mvops->minor_version, - parent_client->cl_nconnect, - parent_client->cl_max_connect, - parent_client->cl_net, - &parent_client->cl_xprtsec); + error = nfs4_set_client(server, &cl_init); if (error < 0) goto error; @@ -1353,6 +1341,19 @@ int nfs4_update_server(struct nfs_server *server, const char *hostname, char buf[INET6_ADDRSTRLEN + 1]; struct sockaddr_storage address; struct sockaddr *localaddr = (struct sockaddr *)&address; + struct nfs_client_initdata cl_init = { + .hostname = hostname, + .addr = sap, + .addrlen = salen, + .ip_addr = buf, + .proto = clp->cl_proto, + .minorversion = clp->cl_minorversion, + .net = net, + .timeparms = clnt->cl_timeout, + .xprtsec = clp->cl_xprtsec, + .nconnect = clp->cl_nconnect, + .max_connect = clp->cl_max_connect, + }; int error; error = rpc_switch_client_transport(clnt, &xargs, clnt->cl_timeout); @@ -1368,11 +1369,7 @@ int nfs4_update_server(struct nfs_server *server, const char *hostname, nfs_server_remove_lists(server); set_bit(NFS_MIG_TSM_POSSIBLE, &server->mig_status); - error = nfs4_set_client(server, hostname, sap, salen, buf, - clp->cl_proto, clnt->cl_timeout, - clp->cl_minorversion, - clp->cl_nconnect, clp->cl_max_connect, - net, &clp->cl_xprtsec); + error = nfs4_set_client(server, &cl_init); clear_bit(NFS_MIG_TSM_POSSIBLE, &server->mig_status); if (error != 0) { nfs_server_insert_lists(server); diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index 1cd9652f3c28..7317f26892c5 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -225,8 +225,14 @@ static long nfs42_fallocate(struct file *filep, int mode, loff_t offset, loff_t if (!S_ISREG(inode->i_mode)) return -EOPNOTSUPP; - if ((mode != 0) && (mode != (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE))) + switch (mode) { + case 0: + case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE: + case FALLOC_FL_ZERO_RANGE: + break; + default: return -EOPNOTSUPP; + } ret = inode_newsize_ok(inode, offset + len); if (ret < 0) @@ -234,6 +240,8 @@ static long nfs42_fallocate(struct file *filep, int mode, loff_t offset, loff_t if (mode & FALLOC_FL_PUNCH_HOLE) return nfs42_proc_deallocate(filep, offset, len); + else if (mode & FALLOC_FL_ZERO_RANGE) + return nfs42_proc_zero_range(filep, offset ,len); return nfs42_proc_allocate(filep, offset, len); } @@ -245,7 +253,6 @@ static loff_t nfs42_remap_file_range(struct file *src_file, loff_t src_off, struct nfs_server *server = NFS_SERVER(dst_inode); struct inode *src_inode = file_inode(src_file); unsigned int bs = server->clone_blksize; - bool same_inode = false; int ret; /* NFS does not support deduplication. */ @@ -267,25 +274,15 @@ static loff_t nfs42_remap_file_range(struct file *src_file, loff_t src_off, goto out; } - if (src_inode == dst_inode) - same_inode = true; - /* XXX: do we lock at all? what if server needs CB_RECALL_LAYOUT? */ - if (same_inode) { - inode_lock(src_inode); - } else if (dst_inode < src_inode) { - inode_lock_nested(dst_inode, I_MUTEX_PARENT); - inode_lock_nested(src_inode, I_MUTEX_CHILD); - } else { - inode_lock_nested(src_inode, I_MUTEX_PARENT); - inode_lock_nested(dst_inode, I_MUTEX_CHILD); - } - + lock_two_nondirectories(src_inode, dst_inode); /* flush all pending writes on both src and dst so that server * has the latest data */ + nfs_file_block_o_direct(NFS_I(src_inode)); ret = nfs_sync_inode(src_inode); if (ret) goto out_unlock; + nfs_file_block_o_direct(NFS_I(dst_inode)); ret = nfs_sync_inode(dst_inode); if (ret) goto out_unlock; @@ -298,15 +295,7 @@ static loff_t nfs42_remap_file_range(struct file *src_file, loff_t src_off, truncate_inode_pages_range(&dst_inode->i_data, dst_off, dst_off + count - 1); out_unlock: - if (same_inode) { - inode_unlock(src_inode); - } else if (dst_inode < src_inode) { - inode_unlock(src_inode); - inode_unlock(dst_inode); - } else { - inode_unlock(dst_inode); - inode_unlock(src_inode); - } + unlock_two_nondirectories(src_inode, dst_inode); out: return ret < 0 ? ret : count; } @@ -442,13 +431,15 @@ void nfs42_ssc_unregister_ops(void) static int nfs4_setlease(struct file *file, int arg, struct file_lease **lease, void **priv) { + if (!S_ISREG(file_inode(file)->i_mode)) + return -EINVAL; return nfs4_proc_setlease(file, arg, lease, priv); } const struct file_operations nfs4_file_operations = { .read_iter = nfs_file_read, .write_iter = nfs_file_write, - .mmap = nfs_file_mmap, + .mmap_prepare = nfs_file_mmap_prepare, .open = nfs4_file_open, .flush = nfs4_file_flush, .release = nfs_file_release, @@ -467,4 +458,5 @@ const struct file_operations nfs4_file_operations = { #else .llseek = nfs_file_llseek, #endif + .fop_flags = FOP_DONTCACHE, }; diff --git a/fs/nfs/nfs4getroot.c b/fs/nfs/nfs4getroot.c index 1a69479a3a59..e67ea345de69 100644 --- a/fs/nfs/nfs4getroot.c +++ b/fs/nfs/nfs4getroot.c @@ -12,30 +12,28 @@ int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh, bool auth_probe) { - struct nfs_fsinfo fsinfo; + struct nfs_fattr *fattr = nfs_alloc_fattr(); int ret = -ENOMEM; - fsinfo.fattr = nfs_alloc_fattr(); - if (fsinfo.fattr == NULL) + if (fattr == NULL) goto out; /* Start by getting the root filehandle from the server */ - ret = nfs4_proc_get_rootfh(server, mntfh, &fsinfo, auth_probe); + ret = nfs4_proc_get_rootfh(server, mntfh, fattr, auth_probe); if (ret < 0) { dprintk("nfs4_get_rootfh: getroot error = %d\n", -ret); goto out; } - if (!(fsinfo.fattr->valid & NFS_ATTR_FATTR_TYPE) - || !S_ISDIR(fsinfo.fattr->mode)) { + if (!(fattr->valid & NFS_ATTR_FATTR_TYPE) || !S_ISDIR(fattr->mode)) { printk(KERN_ERR "nfs4_get_rootfh:" " getroot encountered non-directory\n"); ret = -ENOTDIR; goto out; } - memcpy(&server->fsid, &fsinfo.fattr->fsid, sizeof(server->fsid)); + memcpy(&server->fsid, &fattr->fsid, sizeof(server->fsid)); out: - nfs_free_fattr(fsinfo.fattr); + nfs_free_fattr(fattr); return ret; } diff --git a/fs/nfs/nfs4idmap.c b/fs/nfs/nfs4idmap.c index 25a7c771cfd8..9e1c48c5c0b8 100644 --- a/fs/nfs/nfs4idmap.c +++ b/fs/nfs/nfs4idmap.c @@ -306,15 +306,12 @@ static ssize_t nfs_idmap_get_key(const char *name, size_t namelen, const char *type, void *data, size_t data_size, struct idmap *idmap) { - const struct cred *saved_cred; struct key *rkey; const struct user_key_payload *payload; ssize_t ret; - saved_cred = override_creds(id_resolver_cache); - rkey = nfs_idmap_request_key(name, namelen, type, idmap); - revert_creds(saved_cred); - + scoped_with_creds(id_resolver_cache) + rkey = nfs_idmap_request_key(name, namelen, type, idmap); if (IS_ERR(rkey)) { ret = PTR_ERR(rkey); goto out; @@ -424,26 +421,16 @@ static void nfs_idmap_pipe_destroy(struct dentry *dir, struct rpc_pipe_dir_object *pdo) { struct idmap *idmap = pdo->pdo_data; - struct rpc_pipe *pipe = idmap->idmap_pipe; - if (pipe->dentry) { - rpc_unlink(pipe->dentry); - pipe->dentry = NULL; - } + rpc_unlink(idmap->idmap_pipe); } static int nfs_idmap_pipe_create(struct dentry *dir, struct rpc_pipe_dir_object *pdo) { struct idmap *idmap = pdo->pdo_data; - struct rpc_pipe *pipe = idmap->idmap_pipe; - struct dentry *dentry; - dentry = rpc_mkpipe_dentry(dir, "idmap", idmap, pipe); - if (IS_ERR(dentry)) - return PTR_ERR(dentry); - pipe->dentry = dentry; - return 0; + return rpc_mkpipe_dentry(dir, "idmap", idmap, idmap->idmap_pipe); } static const struct rpc_pipe_dir_object_ops nfs_idmap_pipe_dir_object_ops = { diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 970f28dbf253..ec1ce593dea2 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -105,7 +105,7 @@ static struct rpc_task *_nfs41_proc_sequence(struct nfs_client *clp, bool is_privileged); static int nfs41_test_stateid(struct nfs_server *, const nfs4_stateid *, const struct cred *); -static int nfs41_free_stateid(struct nfs_server *, const nfs4_stateid *, +static int nfs41_free_stateid(struct nfs_server *, nfs4_stateid *, const struct cred *, bool); #endif @@ -222,6 +222,7 @@ const u32 nfs4_fattr_bitmap[3] = { | FATTR4_WORD1_RAWDEV | FATTR4_WORD1_SPACE_USED | FATTR4_WORD1_TIME_ACCESS + | FATTR4_WORD1_TIME_CREATE | FATTR4_WORD1_TIME_METADATA | FATTR4_WORD1_TIME_MODIFY | FATTR4_WORD1_MOUNTED_ON_FILEID, @@ -243,6 +244,7 @@ static const u32 nfs4_pnfs_open_bitmap[3] = { | FATTR4_WORD1_RAWDEV | FATTR4_WORD1_SPACE_USED | FATTR4_WORD1_TIME_ACCESS + | FATTR4_WORD1_TIME_CREATE | FATTR4_WORD1_TIME_METADATA | FATTR4_WORD1_TIME_MODIFY, FATTR4_WORD2_MDSTHRESHOLD @@ -323,16 +325,19 @@ static void nfs4_bitmap_copy_adjust(__u32 *dst, const __u32 *src, if (!(cache_validity & NFS_INO_INVALID_OTHER)) dst[1] &= ~(FATTR4_WORD1_OWNER | FATTR4_WORD1_OWNER_GROUP); + if (!(cache_validity & NFS_INO_INVALID_BTIME)) + dst[1] &= ~FATTR4_WORD1_TIME_CREATE; + if (nfs_have_delegated_mtime(inode)) { if (!(cache_validity & NFS_INO_INVALID_ATIME)) - dst[1] &= ~FATTR4_WORD1_TIME_ACCESS; + dst[1] &= ~(FATTR4_WORD1_TIME_ACCESS|FATTR4_WORD1_TIME_ACCESS_SET); if (!(cache_validity & NFS_INO_INVALID_MTIME)) - dst[1] &= ~FATTR4_WORD1_TIME_MODIFY; + dst[1] &= ~(FATTR4_WORD1_TIME_MODIFY|FATTR4_WORD1_TIME_MODIFY_SET); if (!(cache_validity & NFS_INO_INVALID_CTIME)) - dst[1] &= ~FATTR4_WORD1_TIME_METADATA; + dst[1] &= ~(FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY_SET); } else if (nfs_have_delegated_atime(inode)) { if (!(cache_validity & NFS_INO_INVALID_ATIME)) - dst[1] &= ~FATTR4_WORD1_TIME_ACCESS; + dst[1] &= ~(FATTR4_WORD1_TIME_ACCESS|FATTR4_WORD1_TIME_ACCESS_SET); } } @@ -386,7 +391,9 @@ static void nfs4_setup_readdir(u64 cookie, __be32 *verifier, struct dentry *dent *p++ = htonl(attrs); /* bitmap */ *p++ = htonl(12); /* attribute buffer length */ *p++ = htonl(NF4DIR); + spin_lock(&dentry->d_lock); p = xdr_encode_hyper(p, NFS_FILEID(d_inode(dentry->d_parent))); + spin_unlock(&dentry->d_lock); readdir->pgbase = (char *)p - (char *)start; readdir->count -= readdir->pgbase; @@ -671,6 +678,15 @@ nfs4_async_handle_exception(struct rpc_task *task, struct nfs_server *server, struct nfs_client *clp = server->nfs_client; int ret; + if ((task->tk_rpc_status == -ENETDOWN || + task->tk_rpc_status == -ENETUNREACH) && + task->tk_flags & RPC_TASK_NETUNREACH_FATAL) { + exception->delay = 0; + exception->recovering = 0; + exception->retry = 0; + return -EIO; + } + ret = nfs4_do_handle_exception(server, errorcode, exception); if (exception->delay) { int ret2 = nfs4_exception_should_retrans(server, exception); @@ -1298,7 +1314,8 @@ nfs4_update_changeattr_locked(struct inode *inode, NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL | NFS_INO_INVALID_SIZE | NFS_INO_INVALID_OTHER | NFS_INO_INVALID_BLOCKS | NFS_INO_INVALID_NLINK | - NFS_INO_INVALID_MODE | NFS_INO_INVALID_XATTR; + NFS_INO_INVALID_MODE | NFS_INO_INVALID_BTIME | + NFS_INO_INVALID_XATTR; nfsi->attrtimeo = NFS_MINATTRTIMEO(inode); } nfsi->attrtimeo_timestamp = jiffies; @@ -1763,8 +1780,17 @@ static void nfs_set_open_stateid_locked(struct nfs4_state *state, if (nfs_stateid_is_sequential(state, stateid)) break; - if (status) - break; + if (status) { + if (nfs4_stateid_match_other(stateid, &state->open_stateid) && + !nfs4_stateid_is_newer(stateid, &state->open_stateid)) { + trace_nfs4_open_stateid_update_skip(state->inode, + stateid, status); + return; + } else { + break; + } + } + /* Rely on seqids for serialisation with NFSv4.0 */ if (!nfs4_has_session(NFS_SERVER(state->inode)->nfs_client)) break; @@ -2894,16 +2920,14 @@ static int nfs40_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *st } static int nfs40_test_and_free_expired_stateid(struct nfs_server *server, - const nfs4_stateid *stateid, - const struct cred *cred) + nfs4_stateid *stateid, const struct cred *cred) { return -NFS4ERR_BAD_STATEID; } #if defined(CONFIG_NFS_V4_1) static int nfs41_test_and_free_expired_stateid(struct nfs_server *server, - const nfs4_stateid *stateid, - const struct cred *cred) + nfs4_stateid *stateid, const struct cred *cred) { int status; @@ -2912,6 +2936,7 @@ static int nfs41_test_and_free_expired_stateid(struct nfs_server *server, break; case NFS4_INVALID_STATEID_TYPE: case NFS4_SPECIAL_STATEID_TYPE: + case NFS4_FREED_STATEID_TYPE: return -NFS4ERR_BAD_STATEID; case NFS4_REVOKED_STATEID_TYPE: goto out_free; @@ -3158,18 +3183,6 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata, if (opendata->o_res.rflags & NFS4_OPEN_RESULT_PRESERVE_UNLINKED) set_bit(NFS_INO_PRESERVE_UNLINKED, &NFS_I(state->inode)->flags); - dentry = opendata->dentry; - if (d_really_is_negative(dentry)) { - struct dentry *alias; - d_drop(dentry); - alias = d_splice_alias(igrab(state->inode), dentry); - /* d_splice_alias() can't fail here - it's a non-directory */ - if (alias) { - dput(ctx->dentry); - ctx->dentry = dentry = alias; - } - } - switch(opendata->o_arg.claim) { default: break; @@ -3180,7 +3193,20 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata, break; if (opendata->o_res.delegation.type != 0) dir_verifier = nfs_save_change_attribute(dir); - nfs_set_verifier(dentry, dir_verifier); + } + + dentry = opendata->dentry; + nfs_set_verifier(dentry, dir_verifier); + if (d_really_is_negative(dentry)) { + struct dentry *alias; + d_drop(dentry); + alias = d_splice_alias(igrab(state->inode), dentry); + /* d_splice_alias() can't fail here - it's a non-directory */ + if (alias) { + dput(ctx->dentry); + nfs_set_verifier(alias, dir_verifier); + ctx->dentry = dentry = alias; + } } /* Parse layoutget results before we check for access */ @@ -3620,6 +3646,7 @@ struct nfs4_closedata { } lr; struct nfs_fattr fattr; unsigned long timestamp; + unsigned short retrans; }; static void nfs4_free_closedata(void *data) @@ -3648,6 +3675,7 @@ static void nfs4_close_done(struct rpc_task *task, void *data) .state = state, .inode = calldata->inode, .stateid = &calldata->arg.stateid, + .retrans = calldata->retrans, }; if (!nfs4_sequence_done(task, &calldata->res.seq_res)) @@ -3695,6 +3723,7 @@ static void nfs4_close_done(struct rpc_task *task, void *data) default: task->tk_status = nfs4_async_handle_exception(task, server, task->tk_status, &exception); + calldata->retrans = exception.retrans; if (exception.retry) goto out_restart; } @@ -3967,8 +3996,9 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f FATTR4_WORD0_CASE_INSENSITIVE | FATTR4_WORD0_CASE_PRESERVING; if (minorversion) - bitmask[2] = FATTR4_WORD2_SUPPATTR_EXCLCREAT | - FATTR4_WORD2_OPEN_ARGUMENTS; + bitmask[2] = FATTR4_WORD2_SUPPATTR_EXCLCREAT; + if (minorversion > 1) + bitmask[2] |= FATTR4_WORD2_OPEN_ARGUMENTS; status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0); if (status == 0) { @@ -3998,8 +4028,10 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f res.attr_bitmask[2]; } memcpy(server->attr_bitmask, res.attr_bitmask, sizeof(server->attr_bitmask)); - server->caps &= ~(NFS_CAP_ACLS | NFS_CAP_HARDLINKS | - NFS_CAP_SYMLINKS| NFS_CAP_SECURITY_LABEL); + server->caps &= + ~(NFS_CAP_ACLS | NFS_CAP_HARDLINKS | NFS_CAP_SYMLINKS | + NFS_CAP_SECURITY_LABEL | NFS_CAP_FS_LOCATIONS | + NFS_CAP_OPEN_XOR | NFS_CAP_DELEGTIME); server->fattr_valid = NFS_ATTR_FATTR_V4; if (res.attr_bitmask[0] & FATTR4_WORD0_ACL && res.acl_bitmask & ACL4_SUPPORT_ALLOW_ACL) @@ -4038,6 +4070,10 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f server->fattr_valid &= ~NFS_ATTR_FATTR_CTIME; if (!(res.attr_bitmask[1] & FATTR4_WORD1_TIME_MODIFY)) server->fattr_valid &= ~NFS_ATTR_FATTR_MTIME; + if (!(res.attr_bitmask[1] & FATTR4_WORD1_TIME_MODIFY)) + server->fattr_valid &= ~NFS_ATTR_FATTR_MTIME; + if (!(res.attr_bitmask[1] & FATTR4_WORD1_TIME_CREATE)) + server->fattr_valid &= ~NFS_ATTR_FATTR_BTIME; memcpy(server->attr_bitmask_nl, res.attr_bitmask, sizeof(server->attr_bitmask)); server->attr_bitmask_nl[2] &= ~FATTR4_WORD2_SECURITY_LABEL; @@ -4073,7 +4109,6 @@ int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle) }; int err; - nfs4_server_set_init_caps(server); do { err = nfs4_handle_exception(server, _nfs4_server_capabilities(server, fhandle), @@ -4221,15 +4256,18 @@ out: } static int _nfs4_lookup_root(struct nfs_server *server, struct nfs_fh *fhandle, - struct nfs_fsinfo *info) + struct nfs_fattr *fattr) { - u32 bitmask[3]; + u32 bitmask[3] = { + [0] = FATTR4_WORD0_TYPE | FATTR4_WORD0_CHANGE | + FATTR4_WORD0_SIZE | FATTR4_WORD0_FSID, + }; struct nfs4_lookup_root_arg args = { .bitmask = bitmask, }; struct nfs4_lookup_res res = { .server = server, - .fattr = info->fattr, + .fattr = fattr, .fh = fhandle, }; struct rpc_message msg = { @@ -4238,27 +4276,20 @@ static int _nfs4_lookup_root(struct nfs_server *server, struct nfs_fh *fhandle, .rpc_resp = &res, }; - bitmask[0] = nfs4_fattr_bitmap[0]; - bitmask[1] = nfs4_fattr_bitmap[1]; - /* - * Process the label in the upcoming getfattr - */ - bitmask[2] = nfs4_fattr_bitmap[2] & ~FATTR4_WORD2_SECURITY_LABEL; - - nfs_fattr_init(info->fattr); + nfs_fattr_init(fattr); return nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0); } static int nfs4_lookup_root(struct nfs_server *server, struct nfs_fh *fhandle, - struct nfs_fsinfo *info) + struct nfs_fattr *fattr) { struct nfs4_exception exception = { .interruptible = true, }; int err; do { - err = _nfs4_lookup_root(server, fhandle, info); - trace_nfs4_lookup_root(server, fhandle, info->fattr, err); + err = _nfs4_lookup_root(server, fhandle, fattr); + trace_nfs4_lookup_root(server, fhandle, fattr, err); switch (err) { case 0: case -NFS4ERR_WRONGSEC: @@ -4271,8 +4302,9 @@ out: return err; } -static int nfs4_lookup_root_sec(struct nfs_server *server, struct nfs_fh *fhandle, - struct nfs_fsinfo *info, rpc_authflavor_t flavor) +static int nfs4_lookup_root_sec(struct nfs_server *server, + struct nfs_fh *fhandle, struct nfs_fattr *fattr, + rpc_authflavor_t flavor) { struct rpc_auth_create_args auth_args = { .pseudoflavor = flavor, @@ -4282,7 +4314,7 @@ static int nfs4_lookup_root_sec(struct nfs_server *server, struct nfs_fh *fhandl auth = rpcauth_create(&auth_args, server->client); if (IS_ERR(auth)) return -EACCES; - return nfs4_lookup_root(server, fhandle, info); + return nfs4_lookup_root(server, fhandle, fattr); } /* @@ -4295,7 +4327,7 @@ static int nfs4_lookup_root_sec(struct nfs_server *server, struct nfs_fh *fhandl * negative errno value. */ static int nfs4_find_root_sec(struct nfs_server *server, struct nfs_fh *fhandle, - struct nfs_fsinfo *info) + struct nfs_fattr *fattr) { /* Per 3530bis 15.33.5 */ static const rpc_authflavor_t flav_array[] = { @@ -4311,8 +4343,9 @@ static int nfs4_find_root_sec(struct nfs_server *server, struct nfs_fh *fhandle, if (server->auth_info.flavor_len > 0) { /* try each flavor specified by user */ for (i = 0; i < server->auth_info.flavor_len; i++) { - status = nfs4_lookup_root_sec(server, fhandle, info, - server->auth_info.flavors[i]); + status = nfs4_lookup_root_sec( + server, fhandle, fattr, + server->auth_info.flavors[i]); if (status == -NFS4ERR_WRONGSEC || status == -EACCES) continue; break; @@ -4320,7 +4353,7 @@ static int nfs4_find_root_sec(struct nfs_server *server, struct nfs_fh *fhandle, } else { /* no flavors specified by user, try default list */ for (i = 0; i < ARRAY_SIZE(flav_array); i++) { - status = nfs4_lookup_root_sec(server, fhandle, info, + status = nfs4_lookup_root_sec(server, fhandle, fattr, flav_array[i]); if (status == -NFS4ERR_WRONGSEC || status == -EACCES) continue; @@ -4344,28 +4377,22 @@ static int nfs4_find_root_sec(struct nfs_server *server, struct nfs_fh *fhandle, * nfs4_proc_get_rootfh - get file handle for server's pseudoroot * @server: initialized nfs_server handle * @fhandle: we fill in the pseudo-fs root file handle - * @info: we fill in an FSINFO struct + * @fattr: we fill in a bare bones struct fattr * @auth_probe: probe the auth flavours * * Returns zero on success, or a negative errno. */ int nfs4_proc_get_rootfh(struct nfs_server *server, struct nfs_fh *fhandle, - struct nfs_fsinfo *info, - bool auth_probe) + struct nfs_fattr *fattr, bool auth_probe) { int status = 0; if (!auth_probe) - status = nfs4_lookup_root(server, fhandle, info); + status = nfs4_lookup_root(server, fhandle, fattr); if (auth_probe || status == NFS4ERR_WRONGSEC) - status = server->nfs_client->cl_mvops->find_root_sec(server, - fhandle, info); - - if (status == 0) - status = nfs4_server_capabilities(server, fhandle); - if (status == 0) - status = nfs4_do_fsinfo(server, fhandle, info); + status = server->nfs_client->cl_mvops->find_root_sec( + server, fhandle, fattr); return nfs4_map_errors(status); } @@ -4443,6 +4470,30 @@ out: return status; } +#if IS_ENABLED(CONFIG_NFS_V4_1) +static bool should_request_dir_deleg(struct inode *inode) +{ + if (!directory_delegations) + return false; + if (!inode) + return false; + if (!S_ISDIR(inode->i_mode)) + return false; + if (!nfs_server_capable(inode, NFS_CAP_DIR_DELEG)) + return false; + if (!test_and_clear_bit(NFS_INO_REQ_DIR_DELEG, &(NFS_I(inode)->flags))) + return false; + if (nfs4_have_delegation(inode, FMODE_READ, 0)) + return false; + return true; +} +#else +static bool should_request_dir_deleg(struct inode *inode) +{ + return false; +} +#endif /* CONFIG_NFS_V4_1 */ + static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr, struct inode *inode) { @@ -4460,7 +4511,9 @@ static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, .rpc_argp = &args, .rpc_resp = &res, }; + struct nfs4_gdd_res gdd_res; unsigned short task_flags = 0; + int status; if (nfs4_has_session(server->nfs_client)) task_flags = RPC_TASK_MOVEABLE; @@ -4469,11 +4522,31 @@ static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, if (inode && (server->flags & NFS_MOUNT_SOFTREVAL)) task_flags |= RPC_TASK_TIMEOUT; + args.get_dir_deleg = should_request_dir_deleg(inode); + if (args.get_dir_deleg) + res.gdd_res = &gdd_res; + nfs4_bitmap_copy_adjust(bitmask, nfs4_bitmask(server, fattr->label), inode, 0); nfs_fattr_init(fattr); nfs4_init_sequence(&args.seq_args, &res.seq_res, 0, 0); - return nfs4_do_call_sync(server->client, server, &msg, - &args.seq_args, &res.seq_res, task_flags); + + status = nfs4_do_call_sync(server->client, server, &msg, + &args.seq_args, &res.seq_res, task_flags); + if (args.get_dir_deleg) { + switch (status) { + case 0: + if (gdd_res.status != GDD4_OK) + break; + status = nfs_inode_set_delegation( + inode, current_cred(), FMODE_READ, + &gdd_res.deleg, 0, NFS4_OPEN_DELEGATE_READ); + break; + case -ENOTSUPP: + case -EOPNOTSUPP: + server->caps &= ~NFS_CAP_DIR_DELEG; + } + } + return status; } int nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, @@ -4486,8 +4559,14 @@ int nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, do { err = _nfs4_proc_getattr(server, fhandle, fattr, inode); trace_nfs4_getattr(server, fhandle, fattr, err); - err = nfs4_handle_exception(server, err, - &exception); + switch (err) { + default: + err = nfs4_handle_exception(server, err, &exception); + break; + case -ENOTSUPP: + case -EOPNOTSUPP: + exception.retry = true; + } } while (exception.retry); return err; } @@ -4698,16 +4777,19 @@ static int _nfs4_proc_lookupp(struct inode *inode, }; unsigned short task_flags = 0; - if (NFS_SERVER(inode)->flags & NFS_MOUNT_SOFTREVAL) + if (server->flags & NFS_MOUNT_SOFTREVAL) task_flags |= RPC_TASK_TIMEOUT; + if (server->caps & NFS_CAP_MOVEABLE) + task_flags |= RPC_TASK_MOVEABLE; args.bitmask = nfs4_bitmask(server, fattr->label); nfs_fattr_init(fattr); + nfs4_init_sequence(&args.seq_args, &res.seq_res, 0, 0); dprintk("NFS call lookupp ino=0x%lx\n", inode->i_ino); - status = nfs4_call_sync(clnt, server, &msg, &args.seq_args, - &res.seq_res, task_flags); + status = nfs4_do_call_sync(clnt, server, &msg, &args.seq_args, + &res.seq_res, task_flags); dprintk("NFS reply lookupp: %d\n", status); return status; } @@ -4748,6 +4830,7 @@ static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry int status = 0; if (!nfs4_have_delegation(inode, FMODE_READ, 0)) { + nfs_request_directory_delegation(inode); res.fattr = nfs_alloc_fattr(); if (res.fattr == NULL) return -ENOMEM; @@ -4855,6 +4938,8 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, ilabel = nfs4_label_init_security(dir, dentry, sattr, &l); + nfs_request_directory_delegation(dir); + if (!(server->attr_bitmask[2] & FATTR4_WORD2_MODE_UMASK)) sattr->ia_mode &= ~current_umask(); state = nfs4_do_open(dir, ctx, flags, sattr, ilabel, NULL); @@ -4951,6 +5036,7 @@ static void nfs4_proc_unlink_setup(struct rpc_message *msg, nfs4_init_sequence(&args->seq_args, &res->seq_res, 1, 0); nfs_fattr_init(res->dir_attr); + nfs_request_directory_delegation(d_inode(dentry->d_parent)); if (inode) { nfs4_inode_return_delegation(inode); @@ -4985,7 +5071,8 @@ static int nfs4_proc_unlink_done(struct rpc_task *task, struct inode *dir) static void nfs4_proc_rename_setup(struct rpc_message *msg, struct dentry *old_dentry, - struct dentry *new_dentry) + struct dentry *new_dentry, + struct inode *same_parent) { struct nfs_renameargs *arg = msg->rpc_argp; struct nfs_renameres *res = msg->rpc_resp; @@ -4996,6 +5083,8 @@ static void nfs4_proc_rename_setup(struct rpc_message *msg, nfs4_inode_make_writeable(old_inode); if (new_inode) nfs4_inode_return_delegation(new_inode); + if (same_parent) + nfs_request_directory_delegation(same_parent); msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RENAME]; res->server = NFS_SB(old_dentry->d_sb); nfs4_init_sequence(&arg->seq_args, &res->seq_res, 1, 0); @@ -5155,13 +5244,15 @@ static int nfs4_do_create(struct inode *dir, struct dentry *dentry, struct nfs4_ } static struct dentry *nfs4_do_mkdir(struct inode *dir, struct dentry *dentry, - struct nfs4_createdata *data) + struct nfs4_createdata *data, int *statusp) { - int status = nfs4_call_sync(NFS_SERVER(dir)->client, NFS_SERVER(dir), &data->msg, + struct dentry *ret; + + *statusp = nfs4_call_sync(NFS_SERVER(dir)->client, NFS_SERVER(dir), &data->msg, &data->arg.seq_args, &data->res.seq_res, 1); - if (status) - return ERR_PTR(status); + if (*statusp) + return NULL; spin_lock(&dir->i_lock); /* Creating a directory bumps nlink in the parent */ @@ -5170,7 +5261,11 @@ static struct dentry *nfs4_do_mkdir(struct inode *dir, struct dentry *dentry, data->res.fattr->time_start, NFS_INO_INVALID_DATA); spin_unlock(&dir->i_lock); - return nfs_add_or_obtain(dentry, data->res.fh, data->res.fattr); + ret = nfs_add_or_obtain(dentry, data->res.fh, data->res.fattr); + if (!IS_ERR(ret)) + return ret; + *statusp = PTR_ERR(ret); + return NULL; } static void nfs4_free_createdata(struct nfs4_createdata *data) @@ -5231,17 +5326,18 @@ static int nfs4_proc_symlink(struct inode *dir, struct dentry *dentry, static struct dentry *_nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr, - struct nfs4_label *label) + struct nfs4_label *label, int *statusp) { struct nfs4_createdata *data; - struct dentry *ret = ERR_PTR(-ENOMEM); + struct dentry *ret = NULL; + *statusp = -ENOMEM; data = nfs4_alloc_createdata(dir, &dentry->d_name, sattr, NF4DIR); if (data == NULL) goto out; data->arg.label = label; - ret = nfs4_do_mkdir(dir, dentry, data); + ret = nfs4_do_mkdir(dir, dentry, data, statusp); nfs4_free_createdata(data); out: @@ -5264,11 +5360,12 @@ static struct dentry *nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry, if (!(server->attr_bitmask[2] & FATTR4_WORD2_MODE_UMASK)) sattr->ia_mode &= ~current_umask(); do { - alias = _nfs4_proc_mkdir(dir, dentry, sattr, label); - err = PTR_ERR_OR_ZERO(alias); + alias = _nfs4_proc_mkdir(dir, dentry, sattr, label, &err); trace_nfs4_mkdir(dir, &dentry->d_name, err); - err = nfs4_handle_exception(NFS_SERVER(dir), err, - &exception); + if (err) + alias = ERR_PTR(nfs4_handle_exception(NFS_SERVER(dir), + err, + &exception)); } while (exception.retry); nfs4_label_release_security(label); @@ -5571,9 +5668,11 @@ static int nfs4_read_done_cb(struct rpc_task *task, struct nfs_pgio_header *hdr) .inode = hdr->inode, .state = hdr->args.context->state, .stateid = &hdr->args.stateid, + .retrans = hdr->retrans, }; task->tk_status = nfs4_async_handle_exception(task, server, task->tk_status, &exception); + hdr->retrans = exception.retrans; if (exception.retry) { rpc_restart_call_prepare(task); return -EAGAIN; @@ -5687,10 +5786,12 @@ static int nfs4_write_done_cb(struct rpc_task *task, .inode = hdr->inode, .state = hdr->args.context->state, .stateid = &hdr->args.stateid, + .retrans = hdr->retrans, }; task->tk_status = nfs4_async_handle_exception(task, NFS_SERVER(inode), task->tk_status, &exception); + hdr->retrans = exception.retrans; if (exception.retry) { rpc_restart_call_prepare(task); return -EAGAIN; @@ -5764,6 +5865,8 @@ void nfs4_bitmask_set(__u32 bitmask[], const __u32 src[], bitmask[1] |= FATTR4_WORD1_TIME_MODIFY; if (cache_validity & NFS_INO_INVALID_BLOCKS) bitmask[1] |= FATTR4_WORD1_SPACE_USED; + if (cache_validity & NFS_INO_INVALID_BTIME) + bitmask[1] |= FATTR4_WORD1_TIME_CREATE; if (cache_validity & NFS_INO_INVALID_SIZE) bitmask[0] |= FATTR4_WORD0_SIZE; @@ -6138,7 +6241,7 @@ static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf, } /* for decoding across pages */ - res.acl_scratch = alloc_page(GFP_KERNEL); + res.acl_scratch = folio_alloc(GFP_KERNEL, 0); if (!res.acl_scratch) goto out_free; @@ -6174,7 +6277,7 @@ out_free: while (--i >= 0) __free_page(pages[i]); if (res.acl_scratch) - __free_page(res.acl_scratch); + folio_put(res.acl_scratch); kfree(pages); return ret; } @@ -6202,6 +6305,8 @@ static ssize_t nfs4_proc_get_acl(struct inode *inode, void *buf, size_t buflen, struct nfs_server *server = NFS_SERVER(inode); int ret; + if (unlikely(NFS_FH(inode)->size == 0)) + return -ENODATA; if (!nfs4_server_supports_acls(server, type)) return -EOPNOTSUPP; ret = nfs_revalidate_inode(inode, NFS_INO_INVALID_CHANGE); @@ -6276,6 +6381,9 @@ static int nfs4_proc_set_acl(struct inode *inode, const void *buf, { struct nfs4_exception exception = { }; int err; + + if (unlikely(NFS_FH(inode)->size == 0)) + return -ENODATA; do { err = __nfs4_proc_set_acl(inode, buf, buflen, type); trace_nfs4_set_acl(inode, err); @@ -6697,6 +6805,7 @@ struct nfs4_delegreturndata { struct nfs_fh fh; nfs4_stateid stateid; unsigned long timestamp; + unsigned short retrans; struct { struct nfs4_layoutreturn_args arg; struct nfs4_layoutreturn_res res; @@ -6717,6 +6826,7 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata) .inode = data->inode, .stateid = &data->stateid, .task_is_privileged = data->args.seq_args.sa_privileged, + .retrans = data->retrans, }; if (!nfs4_sequence_done(task, &data->res.seq_res)) @@ -6788,6 +6898,7 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata) task->tk_status = nfs4_async_handle_exception(task, data->res.server, task->tk_status, &exception); + data->retrans = exception.retrans; if (exception.retry) goto out_restart; } @@ -7064,6 +7175,7 @@ struct nfs4_unlockdata { struct file_lock fl; struct nfs_server *server; unsigned long timestamp; + unsigned short retrans; }; static struct nfs4_unlockdata *nfs4_alloc_unlockdata(struct file_lock *fl, @@ -7074,10 +7186,18 @@ static struct nfs4_unlockdata *nfs4_alloc_unlockdata(struct file_lock *fl, struct nfs4_unlockdata *p; struct nfs4_state *state = lsp->ls_state; struct inode *inode = state->inode; + struct nfs_lock_context *l_ctx; p = kzalloc(sizeof(*p), GFP_KERNEL); if (p == NULL) return NULL; + l_ctx = nfs_get_lock_context(ctx); + if (!IS_ERR(l_ctx)) { + p->l_ctx = l_ctx; + } else { + kfree(p); + return NULL; + } p->arg.fh = NFS_FH(inode); p->arg.fl = &p->fl; p->arg.seqid = seqid; @@ -7085,7 +7205,6 @@ static struct nfs4_unlockdata *nfs4_alloc_unlockdata(struct file_lock *fl, p->lsp = lsp; /* Ensure we don't close file until we're done freeing locks! */ p->ctx = get_nfs_open_context(ctx); - p->l_ctx = nfs_get_lock_context(ctx); locks_init_lock(&p->fl); locks_copy_lock(&p->fl, fl); p->server = NFS_SERVER(inode); @@ -7111,6 +7230,7 @@ static void nfs4_locku_done(struct rpc_task *task, void *data) struct nfs4_exception exception = { .inode = calldata->lsp->ls_state->inode, .stateid = &calldata->arg.stateid, + .retrans = calldata->retrans, }; if (!nfs4_sequence_done(task, &calldata->res.seq_res)) @@ -7144,6 +7264,7 @@ static void nfs4_locku_done(struct rpc_task *task, void *data) task->tk_status = nfs4_async_handle_exception(task, calldata->server, task->tk_status, &exception); + calldata->retrans = exception.retrans; if (exception.retry) rpc_restart_call_prepare(task); } @@ -7838,10 +7959,10 @@ int nfs4_lock_delegation_recall(struct file_lock *fl, struct nfs4_state *state, return err; do { err = _nfs4_do_setlk(state, F_SETLK, fl, NFS_LOCK_NEW); - if (err != -NFS4ERR_DELAY) + if (err != -NFS4ERR_DELAY && err != -NFS4ERR_GRACE) break; ssleep(1); - } while (err == -NFS4ERR_DELAY); + } while (err == -NFS4ERR_DELAY || err == -NFSERR_GRACE); return nfs4_handle_delegation_recall_error(server, state, stateid, fl, err); } @@ -9408,7 +9529,7 @@ static int nfs4_verify_back_channel_attrs(struct nfs41_create_session_args *args goto out; if (rcvd->max_rqst_sz > sent->max_rqst_sz) return -EINVAL; - if (rcvd->max_resp_sz < sent->max_resp_sz) + if (rcvd->max_resp_sz > sent->max_resp_sz) return -EINVAL; if (rcvd->max_resp_sz_cached > sent->max_resp_sz_cached) return -EINVAL; @@ -10310,10 +10431,10 @@ nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data, bool sync) * Use the state managment nfs_client cl_rpcclient, which uses krb5i (if * possible) as per RFC3530bis and RFC5661 Security Considerations sections */ -static int -_nfs41_proc_secinfo_no_name(struct nfs_server *server, struct nfs_fh *fhandle, - struct nfs_fsinfo *info, - struct nfs4_secinfo_flavors *flavors, bool use_integrity) +static int _nfs41_proc_secinfo_no_name(struct nfs_server *server, + struct nfs_fh *fhandle, + struct nfs4_secinfo_flavors *flavors, + bool use_integrity) { struct nfs41_secinfo_no_name_args args = { .style = SECINFO_STYLE_CURRENT_FH, @@ -10357,9 +10478,9 @@ _nfs41_proc_secinfo_no_name(struct nfs_server *server, struct nfs_fh *fhandle, return status; } -static int -nfs41_proc_secinfo_no_name(struct nfs_server *server, struct nfs_fh *fhandle, - struct nfs_fsinfo *info, struct nfs4_secinfo_flavors *flavors) +static int nfs41_proc_secinfo_no_name(struct nfs_server *server, + struct nfs_fh *fhandle, + struct nfs4_secinfo_flavors *flavors) { struct nfs4_exception exception = { .interruptible = true, @@ -10371,7 +10492,7 @@ nfs41_proc_secinfo_no_name(struct nfs_server *server, struct nfs_fh *fhandle, /* try to use integrity protection with machine cred */ if (_nfs4_is_integrity_protected(server->nfs_client)) - err = _nfs41_proc_secinfo_no_name(server, fhandle, info, + err = _nfs41_proc_secinfo_no_name(server, fhandle, flavors, true); /* @@ -10381,7 +10502,7 @@ nfs41_proc_secinfo_no_name(struct nfs_server *server, struct nfs_fh *fhandle, * the current filesystem's rpc_client and the user cred. */ if (err == -NFS4ERR_WRONGSEC) - err = _nfs41_proc_secinfo_no_name(server, fhandle, info, + err = _nfs41_proc_secinfo_no_name(server, fhandle, flavors, false); switch (err) { @@ -10397,9 +10518,8 @@ out: return err; } -static int -nfs41_find_root_sec(struct nfs_server *server, struct nfs_fh *fhandle, - struct nfs_fsinfo *info) +static int nfs41_find_root_sec(struct nfs_server *server, + struct nfs_fh *fhandle, struct nfs_fattr *fattr) { int err; struct page *page; @@ -10415,14 +10535,14 @@ nfs41_find_root_sec(struct nfs_server *server, struct nfs_fh *fhandle, } flavors = page_address(page); - err = nfs41_proc_secinfo_no_name(server, fhandle, info, flavors); + err = nfs41_proc_secinfo_no_name(server, fhandle, flavors); /* * Fall back on "guess and check" method if * the server doesn't support SECINFO_NO_NAME */ if (err == -NFS4ERR_WRONGSEC || err == -ENOTSUPP) { - err = nfs4_find_root_sec(server, fhandle, info); + err = nfs4_find_root_sec(server, fhandle, fattr); goto out_freepage; } if (err) @@ -10447,8 +10567,8 @@ nfs41_find_root_sec(struct nfs_server *server, struct nfs_fh *fhandle, flavor = RPC_AUTH_MAXFLAVOR; if (flavor != RPC_AUTH_MAXFLAVOR) { - err = nfs4_lookup_root_sec(server, fhandle, - info, flavor); + err = nfs4_lookup_root_sec(server, fhandle, fattr, + flavor); if (!err) break; } @@ -10595,7 +10715,7 @@ static const struct rpc_call_ops nfs41_free_stateid_ops = { * Note: this function is always asynchronous. */ static int nfs41_free_stateid(struct nfs_server *server, - const nfs4_stateid *stateid, + nfs4_stateid *stateid, const struct cred *cred, bool privileged) { @@ -10635,6 +10755,7 @@ static int nfs41_free_stateid(struct nfs_server *server, if (IS_ERR(task)) return PTR_ERR(task); rpc_put_task(task); + stateid->type = NFS4_FREED_STATEID_TYPE; return 0; } @@ -10650,6 +10771,8 @@ nfs41_free_lock_state(struct nfs_server *server, struct nfs4_lock_state *lsp) static bool nfs41_match_stateid(const nfs4_stateid *s1, const nfs4_stateid *s2) { + trace_nfs41_match_stateid(s1, s2); + if (s1->type != s2->type) return false; @@ -10667,6 +10790,8 @@ static bool nfs41_match_stateid(const nfs4_stateid *s1, static bool nfs4_match_stateid(const nfs4_stateid *s1, const nfs4_stateid *s2) { + trace_nfs4_match_stateid(s1, s2); + return nfs4_stateid_match(s1, s2); } @@ -10766,6 +10891,7 @@ static const struct nfs4_minor_version_ops nfs_v4_1_minor_ops = { .minor_version = 1, .init_caps = NFS_CAP_READDIRPLUS | NFS_CAP_ATOMIC_OPEN + | NFS_CAP_DIR_DELEG | NFS_CAP_POSIX_LOCK | NFS_CAP_STATEID_NFSV41 | NFS_CAP_ATOMIC_OPEN_V1 @@ -10792,6 +10918,7 @@ static const struct nfs4_minor_version_ops nfs_v4_2_minor_ops = { .minor_version = 2, .init_caps = NFS_CAP_READDIRPLUS | NFS_CAP_ATOMIC_OPEN + | NFS_CAP_DIR_DELEG | NFS_CAP_POSIX_LOCK | NFS_CAP_STATEID_NFSV41 | NFS_CAP_ATOMIC_OPEN_V1 @@ -10801,6 +10928,7 @@ static const struct nfs4_minor_version_ops nfs_v4_2_minor_ops = { | NFS_CAP_OFFLOAD_CANCEL | NFS_CAP_COPY_NOTIFY | NFS_CAP_DEALLOCATE + | NFS_CAP_ZERO_RANGE | NFS_CAP_SEEK | NFS_CAP_LAYOUTSTATS | NFS_CAP_CLONE @@ -10836,7 +10964,7 @@ const struct nfs4_minor_version_ops *nfs_v4_minor_ops[] = { static ssize_t nfs4_listxattr(struct dentry *dentry, char *list, size_t size) { - ssize_t error, error2, error3; + ssize_t error, error2, error3, error4 = 0; size_t left = size; error = generic_listxattr(dentry, list, left); @@ -10859,8 +10987,18 @@ static ssize_t nfs4_listxattr(struct dentry *dentry, char *list, size_t size) error3 = nfs4_listxattr_nfs4_user(d_inode(dentry), list, left); if (error3 < 0) return error3; + if (list) { + list += error3; + left -= error3; + } - error += error2 + error3; + if (!nfs_server_capable(d_inode(dentry), NFS_CAP_SECURITY_LABEL)) { + error4 = security_inode_listsecurity(d_inode(dentry), list, left); + if (error4 < 0) + return error4; + } + + error += error2 + error3 + error4; if (size && error > size) return -ERANGE; return error; @@ -10912,6 +11050,26 @@ static const struct inode_operations nfs4_file_inode_operations = { .listxattr = nfs4_listxattr, }; +static struct nfs_server *nfs4_clone_server(struct nfs_server *source, + struct nfs_fh *fh, struct nfs_fattr *fattr, + rpc_authflavor_t flavor) +{ + struct nfs_server *server; + int error; + + server = nfs_clone_server(source, fh, fattr, flavor); + if (IS_ERR(server)) + return server; + + error = nfs4_delegation_hash_alloc(server); + if (error) { + nfs_free_server(server); + return ERR_PTR(error); + } + + return server; +} + const struct nfs_rpc_ops nfs_v4_clientops = { .version = 4, /* protocol version */ .dentry_ops = &nfs4_dentry_operations, @@ -10964,7 +11122,7 @@ const struct nfs_rpc_ops nfs_v4_clientops = { .init_client = nfs4_init_client, .free_client = nfs4_free_client, .create_server = nfs4_create_server, - .clone_server = nfs_clone_server, + .clone_server = nfs4_clone_server, .discover_trunking = nfs4_discover_trunking, .enable_swap = nfs4_enable_swap, .disable_swap = nfs4_disable_swap, diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c index db3811af0796..18ae614e5a6c 100644 --- a/fs/nfs/nfs4renewd.c +++ b/fs/nfs/nfs4renewd.c @@ -122,7 +122,7 @@ nfs4_schedule_state_renewal(struct nfs_client *clp) timeout = 5 * HZ; dprintk("%s: requeueing work. Lease period = %ld\n", __func__, (timeout + HZ - 1) / HZ); - mod_delayed_work(system_wq, &clp->cl_renewd, timeout); + mod_delayed_work(system_percpu_wq, &clp->cl_renewd, timeout); set_bit(NFS_CS_RENEWD, &clp->cl_res_state); spin_unlock(&clp->cl_lock); } diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 7612e977e80b..01179f7de322 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -2744,6 +2744,9 @@ out_error: case -ENETUNREACH: nfs_mark_client_ready(clp, -EIO); break; + case -EINVAL: + nfs_mark_client_ready(clp, status); + break; default: ssleep(1); break; diff --git a/fs/nfs/nfs4super.c b/fs/nfs/nfs4super.c index b29a26923ce0..5ec9c83f1ef0 100644 --- a/fs/nfs/nfs4super.c +++ b/fs/nfs/nfs4super.c @@ -149,21 +149,9 @@ static int do_nfs4_mount(struct nfs_server *server, struct fs_context *root_fc; struct vfsmount *root_mnt; struct dentry *dentry; - size_t len; + char *source; int ret; - struct fs_parameter param = { - .key = "source", - .type = fs_value_is_string, - .dirfd = -1, - }; - - struct fs_parameter param_fsc = { - .key = "fsc", - .type = fs_value_is_string, - .dirfd = -1, - }; - if (IS_ERR(server)) return PTR_ERR(server); @@ -181,15 +169,7 @@ static int do_nfs4_mount(struct nfs_server *server, root_ctx->server = server; if (ctx->fscache_uniq) { - len = strlen(ctx->fscache_uniq); - param_fsc.size = len; - param_fsc.string = kmemdup_nul(ctx->fscache_uniq, len, GFP_KERNEL); - if (param_fsc.string == NULL) { - put_fs_context(root_fc); - return -ENOMEM; - } - ret = vfs_parse_fs_param(root_fc, ¶m_fsc); - kfree(param_fsc.string); + ret = vfs_parse_fs_string(root_fc, "fsc", ctx->fscache_uniq); if (ret < 0) { put_fs_context(root_fc); return ret; @@ -197,20 +177,18 @@ static int do_nfs4_mount(struct nfs_server *server, } /* We leave export_path unset as it's not used to find the root. */ - len = strlen(hostname) + 5; - param.string = kmalloc(len, GFP_KERNEL); - if (param.string == NULL) { - put_fs_context(root_fc); - return -ENOMEM; - } - /* Does hostname needs to be enclosed in brackets? */ if (strchr(hostname, ':')) - param.size = snprintf(param.string, len, "[%s]:/", hostname); + source = kasprintf(GFP_KERNEL, "[%s]:/", hostname); else - param.size = snprintf(param.string, len, "%s:/", hostname); - ret = vfs_parse_fs_param(root_fc, ¶m); - kfree(param.string); + source = kasprintf(GFP_KERNEL, "%s:/", hostname); + + if (!source) { + put_fs_context(root_fc); + return -ENOMEM; + } + ret = vfs_parse_fs_string(root_fc, "source", source); + kfree(source); if (ret < 0) { put_fs_context(root_fc); return ret; diff --git a/fs/nfs/nfs4trace.c b/fs/nfs/nfs4trace.c index 389941ccc9c9..987c92d6364b 100644 --- a/fs/nfs/nfs4trace.c +++ b/fs/nfs/nfs4trace.c @@ -26,11 +26,13 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(pnfs_mds_fallback_read_done); EXPORT_TRACEPOINT_SYMBOL_GPL(pnfs_mds_fallback_write_done); EXPORT_TRACEPOINT_SYMBOL_GPL(pnfs_mds_fallback_read_pagelist); EXPORT_TRACEPOINT_SYMBOL_GPL(pnfs_mds_fallback_write_pagelist); +EXPORT_TRACEPOINT_SYMBOL_GPL(pnfs_ds_connect); EXPORT_TRACEPOINT_SYMBOL_GPL(ff_layout_read_error); EXPORT_TRACEPOINT_SYMBOL_GPL(ff_layout_write_error); EXPORT_TRACEPOINT_SYMBOL_GPL(ff_layout_commit_error); +EXPORT_TRACEPOINT_SYMBOL_GPL(bl_ext_tree_prepare_commit); EXPORT_TRACEPOINT_SYMBOL_GPL(bl_pr_key_reg); EXPORT_TRACEPOINT_SYMBOL_GPL(bl_pr_key_reg_err); EXPORT_TRACEPOINT_SYMBOL_GPL(bl_pr_key_unreg); diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h index bc67fe6801b1..6285128e631a 100644 --- a/fs/nfs/nfs4trace.h +++ b/fs/nfs/nfs4trace.h @@ -14,6 +14,8 @@ #include <trace/misc/fs.h> #include <trace/misc/nfs.h> +#include "delegation.h" + #define show_nfs_fattr_flags(valid) \ __print_flags((unsigned long)valid, "|", \ { NFS_ATTR_FATTR_TYPE, "TYPE" }, \ @@ -30,7 +32,8 @@ { NFS_ATTR_FATTR_CTIME, "CTIME" }, \ { NFS_ATTR_FATTR_CHANGE, "CHANGE" }, \ { NFS_ATTR_FATTR_OWNER_NAME, "OWNER_NAME" }, \ - { NFS_ATTR_FATTR_GROUP_NAME, "GROUP_NAME" }) + { NFS_ATTR_FATTR_GROUP_NAME, "GROUP_NAME" }, \ + { NFS_ATTR_FATTR_BTIME, "BTIME" }) DECLARE_EVENT_CLASS(nfs4_clientid_event, TP_PROTO( @@ -273,6 +276,32 @@ TRACE_EVENT(nfs4_cb_offload, show_nfs_stable_how(__entry->cb_how) ) ); + +TRACE_EVENT(pnfs_ds_connect, + TP_PROTO( + char *ds_remotestr, + int status + ), + + TP_ARGS(ds_remotestr, status), + + TP_STRUCT__entry( + __string(ds_ips, ds_remotestr) + __field(int, status) + ), + + TP_fast_assign( + __assign_str(ds_ips); + __entry->status = status; + ), + + TP_printk( + "ds_ips=%s, status=%d", + __get_str(ds_ips), + __entry->status + ) +); + #endif /* CONFIG_NFS_V4_1 */ TRACE_EVENT(nfs4_setup_sequence, @@ -956,6 +985,52 @@ DECLARE_EVENT_CLASS(nfs4_set_delegation_event, TP_ARGS(inode, fmode)) DEFINE_NFS4_SET_DELEGATION_EVENT(nfs4_set_delegation); DEFINE_NFS4_SET_DELEGATION_EVENT(nfs4_reclaim_delegation); +DEFINE_NFS4_SET_DELEGATION_EVENT(nfs4_detach_delegation); + +#define show_delegation_flags(flags) \ + __print_flags(flags, "|", \ + { BIT(NFS_DELEGATION_NEED_RECLAIM), "NEED_RECLAIM" }, \ + { BIT(NFS_DELEGATION_RETURN), "RETURN" }, \ + { BIT(NFS_DELEGATION_RETURN_IF_CLOSED), "RETURN_IF_CLOSED" }, \ + { BIT(NFS_DELEGATION_REFERENCED), "REFERENCED" }, \ + { BIT(NFS_DELEGATION_RETURNING), "RETURNING" }, \ + { BIT(NFS_DELEGATION_REVOKED), "REVOKED" }, \ + { BIT(NFS_DELEGATION_TEST_EXPIRED), "TEST_EXPIRED" }, \ + { BIT(NFS_DELEGATION_INODE_FREEING), "INODE_FREEING" }, \ + { BIT(NFS_DELEGATION_RETURN_DELAYED), "RETURN_DELAYED" }) + +DECLARE_EVENT_CLASS(nfs4_delegation_event, + TP_PROTO( + const struct nfs_delegation *delegation + ), + + TP_ARGS(delegation), + + TP_STRUCT__entry( + __field(u32, fhandle) + __field(unsigned int, fmode) + __field(unsigned long, flags) + ), + + TP_fast_assign( + __entry->fhandle = nfs_fhandle_hash(NFS_FH(delegation->inode)); + __entry->fmode = delegation->type; + __entry->flags = delegation->flags; + ), + + TP_printk( + "fhandle=0x%08x fmode=%s flags=%s", + __entry->fhandle, show_fs_fmode_flags(__entry->fmode), + show_delegation_flags(__entry->flags) + ) +); +#define DEFINE_NFS4_DELEGATION_EVENT(name) \ + DEFINE_EVENT(nfs4_delegation_event, name, \ + TP_PROTO( \ + const struct nfs_delegation *delegation \ + ), \ + TP_ARGS(delegation)) +DEFINE_NFS4_DELEGATION_EVENT(nfs_delegation_need_return); TRACE_EVENT(nfs4_delegreturn_exit, TP_PROTO( @@ -1278,6 +1353,7 @@ DEFINE_NFS4_INODE_STATEID_EVENT(nfs4_setattr); DEFINE_NFS4_INODE_STATEID_EVENT(nfs4_delegreturn); DEFINE_NFS4_INODE_STATEID_EVENT(nfs4_open_stateid_update); DEFINE_NFS4_INODE_STATEID_EVENT(nfs4_open_stateid_update_wait); +DEFINE_NFS4_INODE_STATEID_EVENT(nfs4_open_stateid_update_skip); DEFINE_NFS4_INODE_STATEID_EVENT(nfs4_close_stateid_update_wait); DECLARE_EVENT_CLASS(nfs4_getattr_event, @@ -1449,6 +1525,63 @@ DECLARE_EVENT_CLASS(nfs4_inode_stateid_callback_event, DEFINE_NFS4_INODE_STATEID_CALLBACK_EVENT(nfs4_cb_recall); DEFINE_NFS4_INODE_STATEID_CALLBACK_EVENT(nfs4_cb_layoutrecall_file); +#define show_stateid_type(type) \ + __print_symbolic(type, \ + { NFS4_INVALID_STATEID_TYPE, "INVALID" }, \ + { NFS4_SPECIAL_STATEID_TYPE, "SPECIAL" }, \ + { NFS4_OPEN_STATEID_TYPE, "OPEN" }, \ + { NFS4_LOCK_STATEID_TYPE, "LOCK" }, \ + { NFS4_DELEGATION_STATEID_TYPE, "DELEGATION" }, \ + { NFS4_LAYOUT_STATEID_TYPE, "LAYOUT" }, \ + { NFS4_PNFS_DS_STATEID_TYPE, "PNFS_DS" }, \ + { NFS4_REVOKED_STATEID_TYPE, "REVOKED" }, \ + { NFS4_FREED_STATEID_TYPE, "FREED" }) + +DECLARE_EVENT_CLASS(nfs4_match_stateid_event, + TP_PROTO( + const nfs4_stateid *s1, + const nfs4_stateid *s2 + ), + + TP_ARGS(s1, s2), + + TP_STRUCT__entry( + __field(int, s1_seq) + __field(int, s2_seq) + __field(u32, s1_hash) + __field(u32, s2_hash) + __field(int, s1_type) + __field(int, s2_type) + ), + + TP_fast_assign( + __entry->s1_seq = s1->seqid; + __entry->s1_hash = nfs_stateid_hash(s1); + __entry->s1_type = s1->type; + __entry->s2_seq = s2->seqid; + __entry->s2_hash = nfs_stateid_hash(s2); + __entry->s2_type = s2->type; + ), + + TP_printk( + "s1=%s:%x:%u s2=%s:%x:%u", + show_stateid_type(__entry->s1_type), + __entry->s1_hash, __entry->s1_seq, + show_stateid_type(__entry->s2_type), + __entry->s2_hash, __entry->s2_seq + ) +); + +#define DEFINE_NFS4_MATCH_STATEID_EVENT(name) \ + DEFINE_EVENT(nfs4_match_stateid_event, name, \ + TP_PROTO( \ + const nfs4_stateid *s1, \ + const nfs4_stateid *s2 \ + ), \ + TP_ARGS(s1, s2)) +DEFINE_NFS4_MATCH_STATEID_EVENT(nfs41_match_stateid); +DEFINE_NFS4_MATCH_STATEID_EVENT(nfs4_match_stateid); + DECLARE_EVENT_CLASS(nfs4_idmap_event, TP_PROTO( const char *name, @@ -2051,13 +2184,15 @@ TRACE_EVENT(fl_getdevinfo, DECLARE_EVENT_CLASS(nfs4_flexfiles_io_event, TP_PROTO( - const struct nfs_pgio_header *hdr + const struct nfs_pgio_header *hdr, + int error ), - TP_ARGS(hdr), + TP_ARGS(hdr, error), TP_STRUCT__entry( __field(unsigned long, error) + __field(unsigned long, nfs_error) __field(dev_t, dev) __field(u32, fhandle) __field(u64, fileid) @@ -2073,7 +2208,8 @@ DECLARE_EVENT_CLASS(nfs4_flexfiles_io_event, TP_fast_assign( const struct inode *inode = hdr->inode; - __entry->error = hdr->res.op_status; + __entry->error = -error; + __entry->nfs_error = hdr->res.op_status; __entry->fhandle = nfs_fhandle_hash(hdr->args.fh); __entry->fileid = NFS_FILEID(inode); __entry->dev = inode->i_sb->s_dev; @@ -2088,7 +2224,8 @@ DECLARE_EVENT_CLASS(nfs4_flexfiles_io_event, TP_printk( "error=%ld (%s) fileid=%02x:%02x:%llu fhandle=0x%08x " - "offset=%llu count=%u stateid=%d:0x%08x dstaddr=%s", + "offset=%llu count=%u stateid=%d:0x%08x dstaddr=%s " + "nfs_error=%lu (%s)", -__entry->error, show_nfs4_status(__entry->error), MAJOR(__entry->dev), MINOR(__entry->dev), @@ -2096,28 +2233,32 @@ DECLARE_EVENT_CLASS(nfs4_flexfiles_io_event, __entry->fhandle, __entry->offset, __entry->count, __entry->stateid_seq, __entry->stateid_hash, - __get_str(dstaddr) + __get_str(dstaddr), __entry->nfs_error, + show_nfs4_status(__entry->nfs_error) ) ); #define DEFINE_NFS4_FLEXFILES_IO_EVENT(name) \ DEFINE_EVENT(nfs4_flexfiles_io_event, name, \ TP_PROTO( \ - const struct nfs_pgio_header *hdr \ + const struct nfs_pgio_header *hdr, \ + int error \ ), \ - TP_ARGS(hdr)) + TP_ARGS(hdr, error)) DEFINE_NFS4_FLEXFILES_IO_EVENT(ff_layout_read_error); DEFINE_NFS4_FLEXFILES_IO_EVENT(ff_layout_write_error); TRACE_EVENT(ff_layout_commit_error, TP_PROTO( - const struct nfs_commit_data *data + const struct nfs_commit_data *data, + int error ), - TP_ARGS(data), + TP_ARGS(data, error), TP_STRUCT__entry( __field(unsigned long, error) + __field(unsigned long, nfs_error) __field(dev_t, dev) __field(u32, fhandle) __field(u64, fileid) @@ -2131,7 +2272,8 @@ TRACE_EVENT(ff_layout_commit_error, TP_fast_assign( const struct inode *inode = data->inode; - __entry->error = data->res.op_status; + __entry->error = -error; + __entry->nfs_error = data->res.op_status; __entry->fhandle = nfs_fhandle_hash(data->args.fh); __entry->fileid = NFS_FILEID(inode); __entry->dev = inode->i_sb->s_dev; @@ -2142,14 +2284,49 @@ TRACE_EVENT(ff_layout_commit_error, TP_printk( "error=%ld (%s) fileid=%02x:%02x:%llu fhandle=0x%08x " - "offset=%llu count=%u dstaddr=%s", + "offset=%llu count=%u dstaddr=%s nfs_error=%lu (%s)", -__entry->error, show_nfs4_status(__entry->error), MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->fileid, __entry->fhandle, __entry->offset, __entry->count, - __get_str(dstaddr) + __get_str(dstaddr), __entry->nfs_error, + show_nfs4_status(__entry->nfs_error) + ) +); + +TRACE_EVENT(bl_ext_tree_prepare_commit, + TP_PROTO( + int ret, + size_t count, + u64 lwb, + bool not_all_ranges + ), + + TP_ARGS(ret, count, lwb, not_all_ranges), + + TP_STRUCT__entry( + __field(int, ret) + __field(size_t, count) + __field(u64, lwb) + __field(bool, not_all_ranges) + ), + + TP_fast_assign( + __entry->ret = ret; + __entry->count = count; + __entry->lwb = lwb; + __entry->not_all_ranges = not_all_ranges; + ), + + TP_printk( + "ret=%d, found %zu ranges, lwb=%llu%s", + __entry->ret, + __entry->count, + __entry->lwb, + __entry->not_all_ranges ? ", not all ranges encoded" : + "" ) ); diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 55bef5fbfa47..b6fe30577fab 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -393,6 +393,20 @@ static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req, XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + 5) #define encode_reclaim_complete_maxsz (op_encode_hdr_maxsz + 4) #define decode_reclaim_complete_maxsz (op_decode_hdr_maxsz + 4) +#define encode_get_dir_deleg_maxsz (op_encode_hdr_maxsz + \ + 4 /* gdda_signal_deleg_avail */ + \ + 8 /* gdda_notification_types */ + \ + nfstime4_maxsz /* gdda_child_attr_delay */ + \ + nfstime4_maxsz /* gdda_dir_attr_delay */ + \ + nfs4_fattr_bitmap_maxsz /* gdda_child_attributes */ + \ + nfs4_fattr_bitmap_maxsz /* gdda_dir_attributes */) +#define decode_get_dir_deleg_maxsz (op_decode_hdr_maxsz + \ + 4 /* gddrnf_status */ + \ + encode_verifier_maxsz /* gddr_cookieverf */ + \ + encode_stateid_maxsz /* gddr_stateid */ + \ + 8 /* gddr_notification */ + \ + nfs4_fattr_maxsz /* gddr_child_attributes */ + \ + nfs4_fattr_maxsz /* gddr_dir_attributes */) #define encode_getdeviceinfo_maxsz (op_encode_hdr_maxsz + \ XDR_QUADLEN(NFS4_DEVICEID4_SIZE) + \ 1 /* layout type */ + \ @@ -444,6 +458,8 @@ static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req, #else /* CONFIG_NFS_V4_1 */ #define encode_sequence_maxsz 0 #define decode_sequence_maxsz 0 +#define encode_get_dir_deleg_maxsz 0 +#define decode_get_dir_deleg_maxsz 0 #define encode_layoutreturn_maxsz 0 #define decode_layoutreturn_maxsz 0 #define encode_layoutget_maxsz 0 @@ -631,11 +647,13 @@ static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req, #define NFS4_enc_getattr_sz (compound_encode_hdr_maxsz + \ encode_sequence_maxsz + \ encode_putfh_maxsz + \ + encode_get_dir_deleg_maxsz + \ encode_getattr_maxsz + \ encode_renew_maxsz) #define NFS4_dec_getattr_sz (compound_decode_hdr_maxsz + \ decode_sequence_maxsz + \ decode_putfh_maxsz + \ + decode_get_dir_deleg_maxsz + \ decode_getattr_maxsz + \ decode_renew_maxsz) #define NFS4_enc_lookup_sz (compound_encode_hdr_maxsz + \ @@ -1623,6 +1641,7 @@ static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg | FATTR4_WORD1_RAWDEV | FATTR4_WORD1_SPACE_USED | FATTR4_WORD1_TIME_ACCESS + | FATTR4_WORD1_TIME_CREATE | FATTR4_WORD1_TIME_METADATA | FATTR4_WORD1_TIME_MODIFY; attrs[2] |= FATTR4_WORD2_SECURITY_LABEL; @@ -2007,6 +2026,33 @@ static void encode_sequence(struct xdr_stream *xdr, #ifdef CONFIG_NFS_V4_1 static void +encode_get_dir_delegation(struct xdr_stream *xdr, struct compound_hdr *hdr) +{ + struct timespec64 ts = { 0, 0 }; + u32 notifications[1] = { 0 }; + u32 attributes[1] = { 0 }; + __be32 *p; + + encode_op_hdr(xdr, OP_GET_DIR_DELEGATION, decode_get_dir_deleg_maxsz, hdr); + + /* We don't handle CB_RECALLABLE_OBJ_AVAIL yet. */ + xdr_stream_encode_bool(xdr, false); + + xdr_encode_bitmap4(xdr, notifications, ARRAY_SIZE(notifications)); + + /* Request no delay on attribute updates */ + p = reserve_space(xdr, 12 + 12); + p = xdr_encode_nfstime4(p, &ts); + xdr_encode_nfstime4(p, &ts); + + /* Requested child attributes */ + xdr_encode_bitmap4(xdr, attributes, ARRAY_SIZE(attributes)); + + /* Requested dir attributes */ + xdr_encode_bitmap4(xdr, attributes, ARRAY_SIZE(attributes)); +} + +static void encode_getdeviceinfo(struct xdr_stream *xdr, const struct nfs4_getdeviceinfo_args *args, struct compound_hdr *hdr) @@ -2142,6 +2188,11 @@ static void encode_free_stateid(struct xdr_stream *xdr, } #else static inline void +encode_get_dir_delegation(struct xdr_stream *xdr, struct compound_hdr *hdr) +{ +} + +static inline void encode_layoutreturn(struct xdr_stream *xdr, const struct nfs4_layoutreturn_args *args, struct compound_hdr *hdr) @@ -2355,6 +2406,8 @@ static void nfs4_xdr_enc_getattr(struct rpc_rqst *req, struct xdr_stream *xdr, encode_compound_hdr(xdr, req, &hdr); encode_sequence(xdr, &args->seq_args, &hdr); encode_putfh(xdr, args->fh, &hdr); + if (args->get_dir_deleg) + encode_get_dir_delegation(xdr, &hdr); encode_getfattr(xdr, args->bitmask, &hdr); encode_nops(&hdr); } @@ -4207,6 +4260,24 @@ static int decode_attr_time_access(struct xdr_stream *xdr, uint32_t *bitmap, str return status; } +static int decode_attr_time_create(struct xdr_stream *xdr, uint32_t *bitmap, struct timespec64 *time) +{ + int status = 0; + + time->tv_sec = 0; + time->tv_nsec = 0; + if (unlikely(bitmap[1] & (FATTR4_WORD1_TIME_CREATE - 1U))) + return -EIO; + if (likely(bitmap[1] & FATTR4_WORD1_TIME_CREATE)) { + status = decode_attr_time(xdr, time); + if (status == 0) + status = NFS_ATTR_FATTR_BTIME; + bitmap[1] &= ~FATTR4_WORD1_TIME_CREATE; + } + dprintk("%s: btime=%lld\n", __func__, time->tv_sec); + return status; +} + static int decode_attr_time_metadata(struct xdr_stream *xdr, uint32_t *bitmap, struct timespec64 *time) { int status = 0; @@ -4781,6 +4852,11 @@ static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap, goto xdr_error; fattr->valid |= status; + status = decode_attr_time_create(xdr, bitmap, &fattr->btime); + if (status < 0) + goto xdr_error; + fattr->valid |= status; + status = decode_attr_time_metadata(xdr, bitmap, &fattr->ctime); if (status < 0) goto xdr_error; @@ -4906,7 +4982,7 @@ static int decode_attr_pnfstype(struct xdr_stream *xdr, uint32_t *bitmap, } /* - * The prefered block size for layout directed io + * The preferred block size for layout directed io */ static int decode_attr_layout_blksize(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) @@ -5970,6 +6046,49 @@ static int decode_layout_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid) return decode_stateid(xdr, stateid); } +static int decode_get_dir_delegation(struct xdr_stream *xdr, + struct nfs4_getattr_res *res) +{ + struct nfs4_gdd_res *gdd_res = res->gdd_res; + nfs4_verifier cookieverf; + u32 bitmap[1]; + int status; + + status = decode_op_hdr(xdr, OP_GET_DIR_DELEGATION); + if (status) + return status; + + if (xdr_stream_decode_u32(xdr, &gdd_res->status)) + return -EIO; + + if (gdd_res->status == GDD4_UNAVAIL) + return xdr_inline_decode(xdr, 4) ? 0 : -EIO; + + status = decode_verifier(xdr, &cookieverf); + if (status) + return status; + + status = decode_delegation_stateid(xdr, &gdd_res->deleg); + if (status) + return status; + + /* Decode supported notification types. */ + status = decode_bitmap4(xdr, bitmap, ARRAY_SIZE(bitmap)); + if (status < 0) + return status; + + /* Decode supported child attributes. */ + status = decode_bitmap4(xdr, bitmap, ARRAY_SIZE(bitmap)); + if (status < 0) + return status; + + /* Decode supported attributes. */ + status = decode_bitmap4(xdr, bitmap, ARRAY_SIZE(bitmap)); + if (status < 0) + return status; + return 0; +} + static int decode_getdeviceinfo(struct xdr_stream *xdr, struct nfs4_getdeviceinfo_res *res) { @@ -6184,6 +6303,12 @@ static int decode_free_stateid(struct xdr_stream *xdr, return res->status; } #else +static int decode_get_dir_delegation(struct xdr_stream *xdr, + struct nfs4_getattr_res *res) +{ + return 0; +} + static inline int decode_layoutreturn(struct xdr_stream *xdr, struct nfs4_layoutreturn_res *res) @@ -6501,6 +6626,11 @@ static int nfs4_xdr_dec_getattr(struct rpc_rqst *rqstp, struct xdr_stream *xdr, status = decode_putfh(xdr); if (status) goto out; + if (res->gdd_res) { + status = decode_get_dir_delegation(xdr, res); + if (status) + goto out; + } status = decode_getfattr(xdr, res->fattr, res->server); out: return status; @@ -6561,7 +6691,7 @@ nfs4_xdr_dec_getacl(struct rpc_rqst *rqstp, struct xdr_stream *xdr, int status; if (res->acl_scratch != NULL) - xdr_set_scratch_page(xdr, res->acl_scratch); + xdr_set_scratch_folio(xdr, res->acl_scratch); status = decode_compound_hdr(xdr, &hdr); if (status) goto out; @@ -7711,6 +7841,7 @@ const struct rpc_procinfo nfs4_procedures[] = { PROC42(LISTXATTRS, enc_listxattrs, dec_listxattrs), PROC42(REMOVEXATTR, enc_removexattr, dec_removexattr), PROC42(READ_PLUS, enc_read_plus, dec_read_plus), + PROC42(ZERO_RANGE, enc_zero_range, dec_zero_range), }; static unsigned int nfs_version4_counts[ARRAY_SIZE(nfs4_procedures)]; diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h index 7a058bd8c566..6ce55e8e6b67 100644 --- a/fs/nfs/nfstrace.h +++ b/fs/nfs/nfstrace.h @@ -32,7 +32,8 @@ { NFS_INO_INVALID_BLOCKS, "INVALID_BLOCKS" }, \ { NFS_INO_INVALID_XATTR, "INVALID_XATTR" }, \ { NFS_INO_INVALID_NLINK, "INVALID_NLINK" }, \ - { NFS_INO_INVALID_MODE, "INVALID_MODE" }) + { NFS_INO_INVALID_MODE, "INVALID_MODE" }, \ + { NFS_INO_INVALID_BTIME, "INVALID_BTIME" }) #define nfs_show_nfsi_flags(v) \ __print_flags(v, "|", \ @@ -44,6 +45,23 @@ { BIT(NFS_INO_LAYOUTSTATS), "LAYOUTSTATS" }, \ { BIT(NFS_INO_ODIRECT), "ODIRECT" }) +#define nfs_show_wb_flags(v) \ + __print_flags(v, "|", \ + { BIT(PG_BUSY), "BUSY" }, \ + { BIT(PG_MAPPED), "MAPPED" }, \ + { BIT(PG_FOLIO), "FOLIO" }, \ + { BIT(PG_CLEAN), "CLEAN" }, \ + { BIT(PG_COMMIT_TO_DS), "COMMIT_TO_DS" }, \ + { BIT(PG_INODE_REF), "INODE_REF" }, \ + { BIT(PG_HEADLOCK), "HEADLOCK" }, \ + { BIT(PG_TEARDOWN), "TEARDOWN" }, \ + { BIT(PG_UNLOCKPAGE), "UNLOCKPAGE" }, \ + { BIT(PG_UPTODATE), "UPTODATE" }, \ + { BIT(PG_WB_END), "WB_END" }, \ + { BIT(PG_REMOVE), "REMOVE" }, \ + { BIT(PG_CONTENDED1), "CONTENDED1" }, \ + { BIT(PG_CONTENDED2), "CONTENDED2" }) + DECLARE_EVENT_CLASS(nfs_inode_event, TP_PROTO( const struct inode *inode @@ -56,6 +74,7 @@ DECLARE_EVENT_CLASS(nfs_inode_event, __field(u32, fhandle) __field(u64, fileid) __field(u64, version) + __field(unsigned long, cache_validity) ), TP_fast_assign( @@ -64,14 +83,17 @@ DECLARE_EVENT_CLASS(nfs_inode_event, __entry->fileid = nfsi->fileid; __entry->fhandle = nfs_fhandle_hash(&nfsi->fh); __entry->version = inode_peek_iversion_raw(inode); + __entry->cache_validity = nfsi->cache_validity; ), TP_printk( - "fileid=%02x:%02x:%llu fhandle=0x%08x version=%llu ", + "fileid=%02x:%02x:%llu fhandle=0x%08x version=%llu cache_validity=0x%lx (%s)", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->fileid, __entry->fhandle, - (unsigned long long)__entry->version + (unsigned long long)__entry->version, + __entry->cache_validity, + nfs_show_cache_validity(__entry->cache_validity) ) ); @@ -267,6 +289,7 @@ DECLARE_EVENT_CLASS(nfs_update_size_class, TP_ARGS(inode, new_size)) DEFINE_NFS_UPDATE_SIZE_EVENT(truncate); +DEFINE_NFS_UPDATE_SIZE_EVENT(truncate_folio); DEFINE_NFS_UPDATE_SIZE_EVENT(wcc); DEFINE_NFS_UPDATE_SIZE_EVENT(update); DEFINE_NFS_UPDATE_SIZE_EVENT(grow); @@ -961,7 +984,7 @@ DECLARE_EVENT_CLASS(nfs_folio_event, __entry->fileid = nfsi->fileid; __entry->fhandle = nfs_fhandle_hash(&nfsi->fh); __entry->version = inode_peek_iversion_raw(inode); - __entry->offset = offset, + __entry->offset = offset; __entry->count = count; ), @@ -1011,8 +1034,8 @@ DECLARE_EVENT_CLASS(nfs_folio_event_done, __entry->fileid = nfsi->fileid; __entry->fhandle = nfs_fhandle_hash(&nfsi->fh); __entry->version = inode_peek_iversion_raw(inode); - __entry->offset = offset, - __entry->count = count, + __entry->offset = offset; + __entry->count = count; __entry->ret = ret; ), @@ -1045,6 +1068,73 @@ DEFINE_NFS_FOLIO_EVENT_DONE(nfs_writeback_folio_done); DEFINE_NFS_FOLIO_EVENT(nfs_invalidate_folio); DEFINE_NFS_FOLIO_EVENT_DONE(nfs_launder_folio_done); +DEFINE_NFS_FOLIO_EVENT(nfs_try_to_update_request); +DEFINE_NFS_FOLIO_EVENT_DONE(nfs_try_to_update_request_done); + +DEFINE_NFS_FOLIO_EVENT(nfs_update_folio); +DEFINE_NFS_FOLIO_EVENT_DONE(nfs_update_folio_done); + +DEFINE_NFS_FOLIO_EVENT(nfs_write_begin); +DEFINE_NFS_FOLIO_EVENT_DONE(nfs_write_begin_done); + +DEFINE_NFS_FOLIO_EVENT(nfs_write_end); +DEFINE_NFS_FOLIO_EVENT_DONE(nfs_write_end_done); + +DEFINE_NFS_FOLIO_EVENT(nfs_writepages); +DEFINE_NFS_FOLIO_EVENT_DONE(nfs_writepages_done); + +DECLARE_EVENT_CLASS(nfs_kiocb_event, + TP_PROTO( + const struct kiocb *iocb, + const struct iov_iter *iter + ), + + TP_ARGS(iocb, iter), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(u32, fhandle) + __field(u64, fileid) + __field(u64, version) + __field(loff_t, offset) + __field(size_t, count) + __field(int, flags) + ), + + TP_fast_assign( + const struct inode *inode = file_inode(iocb->ki_filp); + const struct nfs_inode *nfsi = NFS_I(inode); + + __entry->dev = inode->i_sb->s_dev; + __entry->fileid = nfsi->fileid; + __entry->fhandle = nfs_fhandle_hash(&nfsi->fh); + __entry->version = inode_peek_iversion_raw(inode); + __entry->offset = iocb->ki_pos; + __entry->count = iov_iter_count(iter); + __entry->flags = iocb->ki_flags; + ), + + TP_printk( + "fileid=%02x:%02x:%llu fhandle=0x%08x version=%llu offset=%lld count=%zu ki_flags=%s", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->fileid, + __entry->fhandle, __entry->version, + __entry->offset, __entry->count, + __print_flags(__entry->flags, "|", TRACE_IOCB_STRINGS) + ) +); + +#define DEFINE_NFS_KIOCB_EVENT(name) \ + DEFINE_EVENT(nfs_kiocb_event, name, \ + TP_PROTO( \ + const struct kiocb *iocb, \ + const struct iov_iter *iter \ + ), \ + TP_ARGS(iocb, iter)) + +DEFINE_NFS_KIOCB_EVENT(nfs_file_read); +DEFINE_NFS_KIOCB_EVENT(nfs_file_write); + TRACE_EVENT(nfs_aop_readahead, TP_PROTO( const struct inode *inode, @@ -1392,6 +1482,55 @@ TRACE_EVENT(nfs_writeback_done, ) ); +DECLARE_EVENT_CLASS(nfs_page_class, + TP_PROTO( + const struct nfs_page *req + ), + + TP_ARGS(req), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(u32, fhandle) + __field(u64, fileid) + __field(const struct nfs_page *__private, req) + __field(loff_t, offset) + __field(unsigned int, count) + __field(unsigned long, flags) + ), + + TP_fast_assign( + const struct inode *inode = folio_inode(req->wb_folio); + const struct nfs_inode *nfsi = NFS_I(inode); + + __entry->dev = inode->i_sb->s_dev; + __entry->fileid = nfsi->fileid; + __entry->fhandle = nfs_fhandle_hash(&nfsi->fh); + __entry->req = req; + __entry->offset = req_offset(req); + __entry->count = req->wb_bytes; + __entry->flags = req->wb_flags; + ), + + TP_printk( + "fileid=%02x:%02x:%llu fhandle=0x%08x req=%p offset=%lld count=%u flags=%s", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->fileid, __entry->fhandle, + __entry->req, __entry->offset, __entry->count, + nfs_show_wb_flags(__entry->flags) + ) +); + +#define DEFINE_NFS_PAGE_EVENT(name) \ + DEFINE_EVENT(nfs_page_class, name, \ + TP_PROTO( \ + const struct nfs_page *req \ + ), \ + TP_ARGS(req)) + +DEFINE_NFS_PAGE_EVENT(nfs_writepage_setup); +DEFINE_NFS_PAGE_EVENT(nfs_do_writepage); + DECLARE_EVENT_CLASS(nfs_page_error_class, TP_PROTO( const struct inode *inode, @@ -1593,6 +1732,76 @@ DEFINE_NFS_DIRECT_REQ_EVENT(nfs_direct_write_completion); DEFINE_NFS_DIRECT_REQ_EVENT(nfs_direct_write_schedule_iovec); DEFINE_NFS_DIRECT_REQ_EVENT(nfs_direct_write_reschedule_io); +#if IS_ENABLED(CONFIG_NFS_LOCALIO) + +DECLARE_EVENT_CLASS(nfs_local_dio_class, + TP_PROTO( + const struct inode *inode, + loff_t offset, + ssize_t count, + const struct nfs_local_dio *local_dio + ), + TP_ARGS(inode, offset, count, local_dio), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(u64, fileid) + __field(u32, fhandle) + __field(loff_t, offset) + __field(ssize_t, count) + __field(u32, mem_align) + __field(u32, offset_align) + __field(loff_t, start) + __field(ssize_t, start_len) + __field(loff_t, middle) + __field(ssize_t, middle_len) + __field(loff_t, end) + __field(ssize_t, end_len) + ), + TP_fast_assign( + const struct nfs_inode *nfsi = NFS_I(inode); + const struct nfs_fh *fh = &nfsi->fh; + + __entry->dev = inode->i_sb->s_dev; + __entry->fileid = nfsi->fileid; + __entry->fhandle = nfs_fhandle_hash(fh); + __entry->offset = offset; + __entry->count = count; + __entry->mem_align = local_dio->mem_align; + __entry->offset_align = local_dio->offset_align; + __entry->start = offset; + __entry->start_len = local_dio->start_len; + __entry->middle = local_dio->middle_offset; + __entry->middle_len = local_dio->middle_len; + __entry->end = local_dio->end_offset; + __entry->end_len = local_dio->end_len; + ), + TP_printk("fileid=%02x:%02x:%llu fhandle=0x%08x " + "offset=%lld count=%zd " + "mem_align=%u offset_align=%u " + "start=%llu+%zd middle=%llu+%zd end=%llu+%zd", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->fileid, + __entry->fhandle, __entry->offset, __entry->count, + __entry->mem_align, __entry->offset_align, + __entry->start, __entry->start_len, + __entry->middle, __entry->middle_len, + __entry->end, __entry->end_len) +) + +#define DEFINE_NFS_LOCAL_DIO_EVENT(name) \ +DEFINE_EVENT(nfs_local_dio_class, nfs_local_dio_##name, \ + TP_PROTO(const struct inode *inode, \ + loff_t offset, \ + ssize_t count, \ + const struct nfs_local_dio *local_dio),\ + TP_ARGS(inode, offset, count, local_dio)) + +DEFINE_NFS_LOCAL_DIO_EVENT(read); +DEFINE_NFS_LOCAL_DIO_EVENT(write); +DEFINE_NFS_LOCAL_DIO_EVENT(misaligned); + +#endif /* CONFIG_NFS_LOCALIO */ + TRACE_EVENT(nfs_fh_to_dentry, TP_PROTO( const struct super_block *sb, @@ -1707,10 +1916,10 @@ TRACE_EVENT(nfs_local_open_fh, ), TP_printk( - "error=%d fhandle=0x%08x mode=%s", - __entry->error, + "fhandle=0x%08x mode=%s result=%d", __entry->fhandle, - show_fs_fmode_flags(__entry->fmode) + show_fs_fmode_flags(__entry->fmode), + __entry->error ) ); diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 11968dcb7243..6e69ce43a13f 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -253,13 +253,14 @@ nfs_page_group_unlock(struct nfs_page *req) nfs_page_clear_headlock(req); } -/* - * nfs_page_group_sync_on_bit_locked +/** + * nfs_page_group_sync_on_bit_locked - Test if all requests have @bit set + * @req: request in page group + * @bit: PG_* bit that is used to sync page group * * must be called with page group lock held */ -static bool -nfs_page_group_sync_on_bit_locked(struct nfs_page *req, unsigned int bit) +bool nfs_page_group_sync_on_bit_locked(struct nfs_page *req, unsigned int bit) { struct nfs_page *head = req->wb_head; struct nfs_page *tmp; diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 5f582713bf05..b72d7cc36766 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -306,7 +306,6 @@ void pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo) { struct inode *inode; - unsigned long i_state; if (!lo) return; @@ -317,12 +316,11 @@ pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo) if (!list_empty(&lo->plh_segs)) WARN_ONCE(1, "NFS: BUG unfreed layout segments.\n"); pnfs_detach_layout_hdr(lo); - i_state = inode->i_state; + /* Notify pnfs_destroy_layout_final() that we're done */ + if (inode_state_read(inode) & (I_FREEING | I_CLEAR)) + wake_up_var_locked(lo, &inode->i_lock); spin_unlock(&inode->i_lock); pnfs_free_layout_hdr(lo); - /* Notify pnfs_destroy_layout_final() that we're done */ - if (i_state & (I_FREEING | I_CLEAR)) - wake_up_var(lo); } } @@ -466,6 +464,7 @@ pnfs_mark_layout_stateid_invalid(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg, *next; set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags); + clear_bit(NFS_INO_LAYOUTCOMMIT, &NFS_I(lo->plh_inode)->flags); list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list) pnfs_clear_lseg_state(lseg, lseg_list); pnfs_clear_layoutreturn_info(lo); @@ -745,6 +744,14 @@ pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo, return remaining; } +static void pnfs_reset_return_info(struct pnfs_layout_hdr *lo) +{ + struct pnfs_layout_segment *lseg; + + list_for_each_entry(lseg, &lo->plh_return_segs, pls_list) + pnfs_set_plh_return_info(lo, lseg->pls_range.iomode, 0); +} + static void pnfs_free_returned_lsegs(struct pnfs_layout_hdr *lo, struct list_head *free_me, @@ -801,23 +808,17 @@ void pnfs_destroy_layout(struct nfs_inode *nfsi) } EXPORT_SYMBOL_GPL(pnfs_destroy_layout); -static bool pnfs_layout_removed(struct nfs_inode *nfsi, - struct pnfs_layout_hdr *lo) -{ - bool ret; - - spin_lock(&nfsi->vfs_inode.i_lock); - ret = nfsi->layout != lo; - spin_unlock(&nfsi->vfs_inode.i_lock); - return ret; -} - void pnfs_destroy_layout_final(struct nfs_inode *nfsi) { struct pnfs_layout_hdr *lo = __pnfs_destroy_layout(nfsi); + struct inode *inode = &nfsi->vfs_inode; - if (lo) - wait_var_event(lo, pnfs_layout_removed(nfsi, lo)); + if (lo) { + spin_lock(&inode->i_lock); + wait_var_event_spinlock(lo, nfsi->layout != lo, + &inode->i_lock); + spin_unlock(&inode->i_lock); + } } static bool @@ -1246,21 +1247,15 @@ static void pnfs_clear_layoutcommit(struct inode *inode, static void pnfs_layoutreturn_retry_later_locked(struct pnfs_layout_hdr *lo, const nfs4_stateid *arg_stateid, - const struct pnfs_layout_range *range) + const struct pnfs_layout_range *range, + struct list_head *freeme) { - const struct pnfs_layout_segment *lseg; - u32 seq = be32_to_cpu(arg_stateid->seqid); - if (pnfs_layout_is_valid(lo) && - nfs4_stateid_match_other(&lo->plh_stateid, arg_stateid)) { - list_for_each_entry(lseg, &lo->plh_return_segs, pls_list) { - if (pnfs_seqid_is_newer(lseg->pls_seq, seq) || - !pnfs_should_free_range(&lseg->pls_range, range)) - continue; - pnfs_set_plh_return_info(lo, range->iomode, seq); - break; - } - } + nfs4_stateid_match_other(&lo->plh_stateid, arg_stateid)) + pnfs_reset_return_info(lo); + else + pnfs_mark_layout_stateid_invalid(lo, freeme); + pnfs_clear_layoutreturn_waitbit(lo); } void pnfs_layoutreturn_retry_later(struct pnfs_layout_hdr *lo, @@ -1268,11 +1263,12 @@ void pnfs_layoutreturn_retry_later(struct pnfs_layout_hdr *lo, const struct pnfs_layout_range *range) { struct inode *inode = lo->plh_inode; + LIST_HEAD(freeme); spin_lock(&inode->i_lock); - pnfs_layoutreturn_retry_later_locked(lo, arg_stateid, range); - pnfs_clear_layoutreturn_waitbit(lo); + pnfs_layoutreturn_retry_later_locked(lo, arg_stateid, range, &freeme); spin_unlock(&inode->i_lock); + pnfs_free_lseg_list(&freeme); } void pnfs_layoutreturn_free_lsegs(struct pnfs_layout_hdr *lo, @@ -1292,6 +1288,7 @@ void pnfs_layoutreturn_free_lsegs(struct pnfs_layout_hdr *lo, pnfs_mark_matching_lsegs_invalid(lo, &freeme, range, seq); pnfs_free_returned_lsegs(lo, &freeme, range, seq); pnfs_set_layout_stateid(lo, stateid, NULL, true); + pnfs_reset_return_info(lo); } else pnfs_mark_layout_stateid_invalid(lo, &freeme); out_unlock: @@ -1661,6 +1658,18 @@ int pnfs_roc_done(struct rpc_task *task, struct nfs4_layoutreturn_args **argpp, /* Was there an RPC level error? If not, retry */ if (task->tk_rpc_status == 0) break; + /* + * Is there a fatal network level error? + * If so release the layout, but flag the error. + */ + if ((task->tk_rpc_status == -ENETDOWN || + task->tk_rpc_status == -ENETUNREACH) && + task->tk_flags & RPC_TASK_NETUNREACH_FATAL) { + *ret = 0; + (*respp)->lrs_present = 0; + retval = -EIO; + break; + } /* If the call was not sent, let caller handle it */ if (!RPC_WAS_SENT(task)) return 0; @@ -1695,6 +1704,7 @@ void pnfs_roc_release(struct nfs4_layoutreturn_args *args, struct inode *inode = args->inode; const nfs4_stateid *res_stateid = NULL; struct nfs4_xdr_opaque_data *ld_private = args->ld_private; + LIST_HEAD(freeme); switch (ret) { case -NFS4ERR_BADSESSION: @@ -1703,9 +1713,9 @@ void pnfs_roc_release(struct nfs4_layoutreturn_args *args, case -NFS4ERR_NOMATCHING_LAYOUT: spin_lock(&inode->i_lock); pnfs_layoutreturn_retry_later_locked(lo, &args->stateid, - &args->range); - pnfs_clear_layoutreturn_waitbit(lo); + &args->range, &freeme); spin_unlock(&inode->i_lock); + pnfs_free_lseg_list(&freeme); break; case 0: if (res->lrs_present) @@ -2042,8 +2052,10 @@ static void nfs_layoutget_begin(struct pnfs_layout_hdr *lo) static void nfs_layoutget_end(struct pnfs_layout_hdr *lo) { if (atomic_dec_and_test(&lo->plh_outstanding) && - test_and_clear_bit(NFS_LAYOUT_DRAIN, &lo->plh_flags)) + test_and_clear_bit(NFS_LAYOUT_DRAIN, &lo->plh_flags)) { + smp_mb__after_atomic(); wake_up_bit(&lo->plh_flags, NFS_LAYOUT_DRAIN); + } } static bool pnfs_is_first_layoutget(struct pnfs_layout_hdr *lo) @@ -3321,6 +3333,7 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync) struct nfs_inode *nfsi = NFS_I(inode); loff_t end_pos; int status; + bool mark_as_dirty = false; if (!pnfs_layoutcommit_outstanding(inode)) return 0; @@ -3372,19 +3385,23 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync) if (ld->prepare_layoutcommit) { status = ld->prepare_layoutcommit(&data->args); if (status) { - put_cred(data->cred); + if (status != -ENOSPC) + put_cred(data->cred); spin_lock(&inode->i_lock); set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags); if (end_pos > nfsi->layout->plh_lwb) nfsi->layout->plh_lwb = end_pos; - goto out_unlock; + if (status != -ENOSPC) + goto out_unlock; + spin_unlock(&inode->i_lock); + mark_as_dirty = true; } } status = nfs4_proc_layoutcommit(data, sync); out: - if (status) + if (status || mark_as_dirty) mark_inode_dirty_sync(inode); dprintk("<-- %s status %d\n", __func__, status); return status; diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 30d2613e912b..91ff877185c8 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -60,6 +60,7 @@ struct nfs4_pnfs_ds { struct list_head ds_node; /* nfs4_pnfs_dev_hlist dev_dslist */ char *ds_remotestr; /* comma sep list of addrs */ struct list_head ds_addrs; + const struct net *ds_net; struct nfs_client *ds_clp; refcount_t ds_count; unsigned long ds_state; @@ -415,7 +416,8 @@ int pnfs_generic_commit_pagelist(struct inode *inode, int pnfs_generic_scan_commit_lists(struct nfs_commit_info *cinfo, int max); void pnfs_generic_write_commit_done(struct rpc_task *task, void *data); void nfs4_pnfs_ds_put(struct nfs4_pnfs_ds *ds); -struct nfs4_pnfs_ds *nfs4_pnfs_ds_add(struct list_head *dsaddrs, +struct nfs4_pnfs_ds *nfs4_pnfs_ds_add(const struct net *net, + struct list_head *dsaddrs, gfp_t gfp_flags); void nfs4_pnfs_v3_ds_connect_unload(void); int nfs4_pnfs_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds, diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c index dbef837e871a..9976cc16b689 100644 --- a/fs/nfs/pnfs_nfs.c +++ b/fs/nfs/pnfs_nfs.c @@ -16,6 +16,8 @@ #include "nfs4session.h" #include "internal.h" #include "pnfs.h" +#include "netns.h" +#include "nfs4trace.h" #define NFSDBG_FACILITY NFSDBG_PNFS @@ -504,14 +506,14 @@ EXPORT_SYMBOL_GPL(pnfs_generic_commit_pagelist); /* * Data server cache * - * Data servers can be mapped to different device ids. - * nfs4_pnfs_ds reference counting + * Data servers can be mapped to different device ids, but should + * never be shared between net namespaces. + * + * nfs4_pnfs_ds reference counting: * - set to 1 on allocation * - incremented when a device id maps a data server already in the cache. * - decremented when deviceid is removed from the cache. */ -static DEFINE_SPINLOCK(nfs4_ds_cache_lock); -static LIST_HEAD(nfs4_data_server_cache); /* Debug routines */ static void @@ -604,11 +606,11 @@ _same_data_server_addrs_locked(const struct list_head *dsaddrs1, * Lookup DS by addresses. nfs4_ds_cache_lock is held */ static struct nfs4_pnfs_ds * -_data_server_lookup_locked(const struct list_head *dsaddrs) +_data_server_lookup_locked(const struct nfs_net *nn, const struct list_head *dsaddrs) { struct nfs4_pnfs_ds *ds; - list_for_each_entry(ds, &nfs4_data_server_cache, ds_node) + list_for_each_entry(ds, &nn->nfs4_data_server_cache, ds_node) if (_same_data_server_addrs_locked(&ds->ds_addrs, dsaddrs)) return ds; return NULL; @@ -653,10 +655,11 @@ static void destroy_ds(struct nfs4_pnfs_ds *ds) void nfs4_pnfs_ds_put(struct nfs4_pnfs_ds *ds) { - if (refcount_dec_and_lock(&ds->ds_count, - &nfs4_ds_cache_lock)) { + struct nfs_net *nn = net_generic(ds->ds_net, nfs_net_id); + + if (refcount_dec_and_lock(&ds->ds_count, &nn->nfs4_data_server_lock)) { list_del_init(&ds->ds_node); - spin_unlock(&nfs4_ds_cache_lock); + spin_unlock(&nn->nfs4_data_server_lock); destroy_ds(ds); } } @@ -716,8 +719,9 @@ out_err: * uncached and return cached struct nfs4_pnfs_ds. */ struct nfs4_pnfs_ds * -nfs4_pnfs_ds_add(struct list_head *dsaddrs, gfp_t gfp_flags) +nfs4_pnfs_ds_add(const struct net *net, struct list_head *dsaddrs, gfp_t gfp_flags) { + struct nfs_net *nn = net_generic(net, nfs_net_id); struct nfs4_pnfs_ds *tmp_ds, *ds = NULL; char *remotestr; @@ -733,16 +737,17 @@ nfs4_pnfs_ds_add(struct list_head *dsaddrs, gfp_t gfp_flags) /* this is only used for debugging, so it's ok if its NULL */ remotestr = nfs4_pnfs_remotestr(dsaddrs, gfp_flags); - spin_lock(&nfs4_ds_cache_lock); - tmp_ds = _data_server_lookup_locked(dsaddrs); + spin_lock(&nn->nfs4_data_server_lock); + tmp_ds = _data_server_lookup_locked(nn, dsaddrs); if (tmp_ds == NULL) { INIT_LIST_HEAD(&ds->ds_addrs); list_splice_init(dsaddrs, &ds->ds_addrs); ds->ds_remotestr = remotestr; refcount_set(&ds->ds_count, 1); INIT_LIST_HEAD(&ds->ds_node); + ds->ds_net = net; ds->ds_clp = NULL; - list_add(&ds->ds_node, &nfs4_data_server_cache); + list_add(&ds->ds_node, &nn->nfs4_data_server_cache); dprintk("%s add new data server %s\n", __func__, ds->ds_remotestr); } else { @@ -754,7 +759,7 @@ nfs4_pnfs_ds_add(struct list_head *dsaddrs, gfp_t gfp_flags) refcount_read(&tmp_ds->ds_count)); ds = tmp_ds; } - spin_unlock(&nfs4_ds_cache_lock); + spin_unlock(&nn->nfs4_data_server_lock); out: return ds; } @@ -804,8 +809,11 @@ static int _nfs4_pnfs_v3_ds_connect(struct nfs_server *mds_srv, unsigned int retrans) { struct nfs_client *clp = ERR_PTR(-EIO); + struct nfs_client *mds_clp = mds_srv->nfs_client; + enum xprtsec_policies xprtsec_policy = mds_clp->cl_xprtsec.policy; struct nfs4_pnfs_ds_addr *da; unsigned long connect_timeout = timeo * (retrans + 1) * HZ / 10; + int ds_proto; int status = 0; dprintk("--> %s DS %s\n", __func__, ds->ds_remotestr); @@ -826,21 +834,31 @@ static int _nfs4_pnfs_v3_ds_connect(struct nfs_server *mds_srv, .servername = clp->cl_hostname, .connect_timeout = connect_timeout, .reconnect_timeout = connect_timeout, + .xprtsec = clp->cl_xprtsec, }; - if (da->da_transport != clp->cl_proto) + if (xprt_args.ident == XPRT_TRANSPORT_TCP && + clp->cl_proto == XPRT_TRANSPORT_TCP_TLS) + xprt_args.ident = XPRT_TRANSPORT_TCP_TLS; + + if (xprt_args.ident != clp->cl_proto) continue; - if (da->da_addr.ss_family != clp->cl_addr.ss_family) + if (xprt_args.dstaddr->sa_family != + clp->cl_addr.ss_family) continue; /* Add this address as an alias */ rpc_clnt_add_xprt(clp->cl_rpcclient, &xprt_args, - rpc_clnt_test_and_add_xprt, NULL); + rpc_clnt_test_and_add_xprt, NULL); continue; } - clp = get_v3_ds_connect(mds_srv, - &da->da_addr, - da->da_addrlen, da->da_transport, - timeo, retrans); + + ds_proto = da->da_transport; + if (ds_proto == XPRT_TRANSPORT_TCP && + xprtsec_policy != RPC_XPRTSEC_NONE) + ds_proto = XPRT_TRANSPORT_TCP_TLS; + + clp = get_v3_ds_connect(mds_srv, &da->da_addr, da->da_addrlen, + ds_proto, timeo, retrans); if (IS_ERR(clp)) continue; clp->cl_rpcclient->cl_softerr = 0; @@ -866,7 +884,10 @@ static int _nfs4_pnfs_v4_ds_connect(struct nfs_server *mds_srv, u32 minor_version) { struct nfs_client *clp = ERR_PTR(-EIO); + struct nfs_client *mds_clp = mds_srv->nfs_client; + enum xprtsec_policies xprtsec_policy = mds_clp->cl_xprtsec.policy; struct nfs4_pnfs_ds_addr *da; + int ds_proto; int status = 0; dprintk("--> %s DS %s\n", __func__, ds->ds_remotestr); @@ -894,12 +915,8 @@ static int _nfs4_pnfs_v4_ds_connect(struct nfs_server *mds_srv, .data = &xprtdata, }; - if (da->da_transport != clp->cl_proto && - clp->cl_proto != XPRT_TRANSPORT_TCP_TLS) - continue; - if (da->da_transport == XPRT_TRANSPORT_TCP && - mds_srv->nfs_client->cl_proto == - XPRT_TRANSPORT_TCP_TLS) { + if (xprt_args.ident == XPRT_TRANSPORT_TCP && + clp->cl_proto == XPRT_TRANSPORT_TCP_TLS) { struct sockaddr *addr = (struct sockaddr *)&da->da_addr; struct sockaddr_in *sin = @@ -930,7 +947,10 @@ static int _nfs4_pnfs_v4_ds_connect(struct nfs_server *mds_srv, xprt_args.ident = XPRT_TRANSPORT_TCP_TLS; xprt_args.servername = servername; } - if (da->da_addr.ss_family != clp->cl_addr.ss_family) + if (xprt_args.ident != clp->cl_proto) + continue; + if (xprt_args.dstaddr->sa_family != + clp->cl_addr.ss_family) continue; /** @@ -944,15 +964,14 @@ static int _nfs4_pnfs_v4_ds_connect(struct nfs_server *mds_srv, if (xprtdata.cred) put_cred(xprtdata.cred); } else { - if (da->da_transport == XPRT_TRANSPORT_TCP && - mds_srv->nfs_client->cl_proto == - XPRT_TRANSPORT_TCP_TLS) - da->da_transport = XPRT_TRANSPORT_TCP_TLS; - clp = nfs4_set_ds_client(mds_srv, - &da->da_addr, - da->da_addrlen, - da->da_transport, timeo, - retrans, minor_version); + ds_proto = da->da_transport; + if (ds_proto == XPRT_TRANSPORT_TCP && + xprtsec_policy != RPC_XPRTSEC_NONE) + ds_proto = XPRT_TRANSPORT_TCP_TLS; + + clp = nfs4_set_ds_client(mds_srv, &da->da_addr, + da->da_addrlen, ds_proto, + timeo, retrans, minor_version); if (IS_ERR(clp)) continue; @@ -963,7 +982,6 @@ static int _nfs4_pnfs_v4_ds_connect(struct nfs_server *mds_srv, clp = ERR_PTR(-EIO); continue; } - } } @@ -994,8 +1012,10 @@ int nfs4_pnfs_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds, err = nfs4_wait_ds_connect(ds); if (err || ds->ds_clp) goto out; - if (nfs4_test_deviceid_unavailable(devid)) - return -ENODEV; + if (nfs4_test_deviceid_unavailable(devid)) { + err = -ENODEV; + goto out; + } } while (test_and_set_bit(NFS4DS_CONNECTING, &ds->ds_state) != 0); if (ds->ds_clp) @@ -1025,11 +1045,12 @@ out: if (!ds->ds_clp || !nfs_client_init_is_complete(ds->ds_clp)) { WARN_ON_ONCE(ds->ds_clp || !nfs4_test_deviceid_unavailable(devid)); - return -EINVAL; - } - err = nfs_client_init_status(ds->ds_clp); + err = -EINVAL; + } else + err = nfs_client_init_status(ds->ds_clp); } + trace_pnfs_ds_connect(ds->ds_remotestr, err); return err; } EXPORT_SYMBOL_GPL(nfs4_pnfs_ds_connect); diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c index 63e71310b9f6..39df80e4ae6f 100644 --- a/fs/nfs/proc.c +++ b/fs/nfs/proc.c @@ -353,7 +353,8 @@ static int nfs_proc_unlink_done(struct rpc_task *task, struct inode *dir) static void nfs_proc_rename_setup(struct rpc_message *msg, struct dentry *old_dentry, - struct dentry *new_dentry) + struct dentry *new_dentry, + struct inode *same_parent) { msg->rpc_proc = &nfs_procedures[NFSPROC_RENAME]; } diff --git a/fs/nfs/read.c b/fs/nfs/read.c index 81bd1b9aba17..3c1fa320b3f1 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -56,7 +56,8 @@ static int nfs_return_empty_folio(struct folio *folio) { folio_zero_segment(folio, 0, folio_size(folio)); folio_mark_uptodate(folio); - folio_unlock(folio); + if (nfs_netfs_folio_unlock(folio)) + folio_unlock(folio); return 0; } diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 9eea9e62afc9..57d372db03b9 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -1091,8 +1091,9 @@ static void nfs_fill_super(struct super_block *sb, struct nfs_fs_context *ctx) sb->s_blocksize = 0; sb->s_xattr = server->nfs_client->cl_nfs_mod->xattr; sb->s_op = server->nfs_client->cl_nfs_mod->sops; - if (ctx->bsize) - sb->s_blocksize = nfs_block_size(ctx->bsize, &sb->s_blocksize_bits); + if (server->bsize) + sb->s_blocksize = + nfs_block_size(server->bsize, &sb->s_blocksize_bits); switch (server->nfs_client->rpc_ops->version) { case 2: @@ -1173,7 +1174,7 @@ static int nfs_set_super(struct super_block *s, struct fs_context *fc) struct nfs_server *server = fc->s_fs_info; int ret; - s->s_d_op = server->nfs_client->rpc_ops->dentry_ops; + set_default_d_op(s, server->nfs_client->rpc_ops->dentry_ops); ret = set_anon_super(s, server); if (ret == 0) server->s_dev = s->s_dev; @@ -1315,10 +1316,6 @@ int nfs_get_tree_common(struct fs_context *fc) if (server->flags & NFS_MOUNT_NOAC) fc->sb_flags |= SB_SYNCHRONOUS; - if (ctx->clone_data.sb) - if (ctx->clone_data.sb->s_flags & SB_SYNCHRONOUS) - fc->sb_flags |= SB_SYNCHRONOUS; - /* Get a superblock - note that we may end up sharing one that already exists */ fc->s_fs_info = server; s = sget_fc(fc, compare_super, nfs_set_super); @@ -1342,13 +1339,8 @@ int nfs_get_tree_common(struct fs_context *fc) } if (!s->s_root) { - unsigned bsize = ctx->clone_data.inherited_bsize; /* initial superblock/root creation */ nfs_fill_super(s, ctx); - if (bsize) { - s->s_blocksize_bits = bsize; - s->s_blocksize = 1U << bsize; - } error = nfs_get_cache_cookie(s, ctx); if (error < 0) goto error_splat_super; diff --git a/fs/nfs/symlink.c b/fs/nfs/symlink.c index 1c62a5a9f51d..58146e935402 100644 --- a/fs/nfs/symlink.c +++ b/fs/nfs/symlink.c @@ -40,31 +40,31 @@ static const char *nfs_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *done) { - struct page *page; + struct folio *folio; void *err; if (!dentry) { err = ERR_PTR(nfs_revalidate_mapping_rcu(inode)); if (err) return err; - page = find_get_page(inode->i_mapping, 0); - if (!page) + folio = filemap_get_folio(inode->i_mapping, 0); + if (IS_ERR(folio)) return ERR_PTR(-ECHILD); - if (!PageUptodate(page)) { - put_page(page); + if (!folio_test_uptodate(folio)) { + folio_put(folio); return ERR_PTR(-ECHILD); } } else { err = ERR_PTR(nfs_revalidate_mapping(inode, inode->i_mapping)); if (err) return err; - page = read_cache_page(&inode->i_data, 0, nfs_symlink_filler, + folio = read_cache_folio(&inode->i_data, 0, nfs_symlink_filler, NULL); - if (IS_ERR(page)) - return ERR_CAST(page); + if (IS_ERR(folio)) + return ERR_CAST(folio); } - set_delayed_call(done, page_put_link, page); - return page_address(page); + set_delayed_call(done, page_put_link, folio); + return folio_address(folio); } /* diff --git a/fs/nfs/sysfs.c b/fs/nfs/sysfs.c index 37cb2b776435..ea6e6168092b 100644 --- a/fs/nfs/sysfs.c +++ b/fs/nfs/sysfs.c @@ -189,6 +189,7 @@ static struct nfs_netns_client *nfs_netns_client_alloc(struct kobject *parent, return p; kobject_put(&p->kobject); + kobject_put(&p->nfs_net_kobj); } return NULL; } @@ -387,6 +388,33 @@ static inline void nfs_sysfs_add_nfsv41_server(struct nfs_server *server) } #endif /* CONFIG_NFS_V4_1 */ +#if IS_ENABLED(CONFIG_NFS_LOCALIO) + +static ssize_t +localio_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct nfs_server *server = container_of(kobj, struct nfs_server, kobj); + bool localio = nfs_server_is_local(server->nfs_client); + return sysfs_emit(buf, "%d\n", localio); +} + +static struct kobj_attribute nfs_sysfs_attr_localio = __ATTR_RO(localio); + +static void nfs_sysfs_add_nfs_localio_server(struct nfs_server *server) +{ + int ret = sysfs_create_file_ns(&server->kobj, &nfs_sysfs_attr_localio.attr, + nfs_netns_server_namespace(&server->kobj)); + if (ret < 0) + pr_warn("NFS: sysfs_create_file_ns for server-%d failed (%d)\n", + server->s_sysfs_id, ret); +} +#else +static inline void nfs_sysfs_add_nfs_localio_server(struct nfs_server *server) +{ +} +#endif /* IS_ENABLED(CONFIG_NFS_LOCALIO) */ + void nfs_sysfs_add_server(struct nfs_server *server) { int ret; @@ -405,6 +433,7 @@ void nfs_sysfs_add_server(struct nfs_server *server) server->s_sysfs_id, ret); nfs_sysfs_add_nfsv41_server(server); + nfs_sysfs_add_nfs_localio_server(server); } EXPORT_SYMBOL_GPL(nfs_sysfs_add_server); diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c index bf77399696a7..4db818c0f9dd 100644 --- a/fs/nfs/unlink.c +++ b/fs/nfs/unlink.c @@ -390,7 +390,8 @@ nfs_async_rename(struct inode *old_dir, struct inode *new_dir, nfs_sb_active(old_dir->i_sb); - NFS_PROTO(data->old_dir)->rename_setup(&msg, old_dentry, new_dentry); + NFS_PROTO(data->old_dir)->rename_setup(&msg, old_dentry, new_dentry, + old_dir == new_dir ? old_dir : NULL); return rpc_run_task(&task_setup_data); } @@ -464,18 +465,17 @@ nfs_sillyrename(struct inode *dir, struct dentry *dentry) sdentry = NULL; do { - int slen; dput(sdentry); sillycounter++; - slen = scnprintf(silly, sizeof(silly), - SILLYNAME_PREFIX "%0*llx%0*x", - SILLYNAME_FILEID_LEN, fileid, - SILLYNAME_COUNTER_LEN, sillycounter); + scnprintf(silly, sizeof(silly), + SILLYNAME_PREFIX "%0*llx%0*x", + SILLYNAME_FILEID_LEN, fileid, + SILLYNAME_COUNTER_LEN, sillycounter); dfprintk(VFS, "NFS: trying to rename %pd to %s\n", dentry, silly); - sdentry = lookup_one_len(silly, dentry->d_parent, slen); + sdentry = lookup_noperm(&QSTR(silly), dentry->d_parent); /* * N.B. Better to return EBUSY here ... it could be * dangerous to delete the file while it's in use. diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 23df8b214474..336c510f3750 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -153,20 +153,10 @@ nfs_page_set_inode_ref(struct nfs_page *req, struct inode *inode) } } -static int -nfs_cancel_remove_inode(struct nfs_page *req, struct inode *inode) +static void nfs_cancel_remove_inode(struct nfs_page *req, struct inode *inode) { - int ret; - - if (!test_bit(PG_REMOVE, &req->wb_flags)) - return 0; - ret = nfs_page_group_lock(req); - if (ret) - return ret; if (test_and_clear_bit(PG_REMOVE, &req->wb_flags)) nfs_page_set_inode_ref(req, inode); - nfs_page_group_unlock(req); - return 0; } /** @@ -247,59 +237,17 @@ static void nfs_mapping_set_error(struct folio *folio, int error) } /* - * nfs_page_group_search_locked - * @head - head request of page group - * @page_offset - offset into page - * - * Search page group with head @head to find a request that contains the - * page offset @page_offset. - * - * Returns a pointer to the first matching nfs request, or NULL if no - * match is found. - * - * Must be called with the page group lock held - */ -static struct nfs_page * -nfs_page_group_search_locked(struct nfs_page *head, unsigned int page_offset) -{ - struct nfs_page *req; - - req = head; - do { - if (page_offset >= req->wb_pgbase && - page_offset < (req->wb_pgbase + req->wb_bytes)) - return req; - - req = req->wb_this_page; - } while (req != head); - - return NULL; -} - -/* - * nfs_page_group_covers_page - * @head - head request of page group + * nfs_page_covers_folio + * @req: struct nfs_page * - * Return true if the page group with head @head covers the whole page, - * returns false otherwise + * Return true if the request covers the whole folio. + * Note that the caller should ensure all subrequests have been joined */ static bool nfs_page_group_covers_page(struct nfs_page *req) { unsigned int len = nfs_folio_length(nfs_page_to_folio(req)); - struct nfs_page *tmp; - unsigned int pos = 0; - - nfs_page_group_lock(req); - - for (;;) { - tmp = nfs_page_group_search_locked(req->wb_head, pos); - if (!tmp) - break; - pos = tmp->wb_pgbase + tmp->wb_bytes; - } - nfs_page_group_unlock(req); - return pos >= len; + return req->wb_pgbase == 0 && req->wb_bytes == len; } /* We can set the PG_uptodate flag if we see that a write request @@ -348,7 +296,7 @@ static void nfs_folio_end_writeback(struct folio *folio) { struct nfs_server *nfss = NFS_SERVER(folio->mapping->host); - folio_end_writeback(folio); + folio_end_writeback_no_dropbehind(folio); if (atomic_long_dec_return(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH) { nfss->write_congested = 0; @@ -585,19 +533,18 @@ retry: } } + ret = nfs_page_group_lock(head); + if (ret < 0) + goto out_unlock; + /* Ensure that nobody removed the request before we locked it */ if (head != folio->private) { + nfs_page_group_unlock(head); nfs_unlock_and_release_request(head); goto retry; } - ret = nfs_cancel_remove_inode(head, inode); - if (ret < 0) - goto out_unlock; - - ret = nfs_page_group_lock(head); - if (ret < 0) - goto out_unlock; + nfs_cancel_remove_inode(head, inode); /* lock each request in the page group */ for (subreq = head->wb_this_page; @@ -632,20 +579,21 @@ static void nfs_write_error(struct nfs_page *req, int error) * Find an associated nfs write request, and prepare to flush it out * May return an error if the user signalled nfs_wait_on_request(). */ -static int nfs_page_async_flush(struct folio *folio, - struct writeback_control *wbc, - struct nfs_pageio_descriptor *pgio) +static int nfs_do_writepage(struct folio *folio, struct writeback_control *wbc, + struct nfs_pageio_descriptor *pgio) { struct nfs_page *req; - int ret = 0; + int ret; + + nfs_pageio_cond_complete(pgio, folio->index); req = nfs_lock_and_join_requests(folio); if (!req) - goto out; - ret = PTR_ERR(req); + return 0; if (IS_ERR(req)) - goto out; + return PTR_ERR(req); + trace_nfs_do_writepage(req); nfs_folio_set_writeback(folio); WARN_ON_ONCE(test_bit(PG_CLEAN, &req->wb_flags)); @@ -654,7 +602,6 @@ static int nfs_page_async_flush(struct folio *folio, if (nfs_error_is_fatal_on_server(ret)) goto out_launder; - ret = 0; if (!nfs_pageio_add_request(pgio, req)) { ret = pgio->pg_error; /* @@ -662,28 +609,20 @@ static int nfs_page_async_flush(struct folio *folio, */ if (nfs_error_is_fatal_on_server(ret)) goto out_launder; - if (wbc->sync_mode == WB_SYNC_NONE) - ret = AOP_WRITEPAGE_ACTIVATE; folio_redirty_for_writepage(wbc, folio); nfs_redirty_request(req); pgio->pg_error = 0; - } else - nfs_add_stats(folio->mapping->host, - NFSIOS_WRITEPAGES, 1); -out: - return ret; + return ret; + } + + nfs_add_stats(folio->mapping->host, NFSIOS_WRITEPAGES, 1); + return 0; + out_launder: nfs_write_error(req, ret); return 0; } -static int nfs_do_writepage(struct folio *folio, struct writeback_control *wbc, - struct nfs_pageio_descriptor *pgio) -{ - nfs_pageio_cond_complete(pgio, folio->index); - return nfs_page_async_flush(folio, wbc, pgio); -} - /* * Write an mmapped page to the server. */ @@ -703,17 +642,6 @@ static int nfs_writepage_locked(struct folio *folio, return err; } -static int nfs_writepages_callback(struct folio *folio, - struct writeback_control *wbc, void *data) -{ - int ret; - - ret = nfs_do_writepage(folio, wbc, data); - if (ret != AOP_WRITEPAGE_ACTIVATE) - folio_unlock(folio); - return ret; -} - static void nfs_io_completion_commit(void *inode) { nfs_commit_inode(inode, 0); @@ -729,18 +657,20 @@ int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc) int priority = 0; int err; + trace_nfs_writepages(inode, wbc->range_start, wbc->range_end - wbc->range_start); + /* Wait with writeback until write congestion eases */ if (wbc->sync_mode == WB_SYNC_NONE && nfss->write_congested) { err = wait_event_killable(nfss->write_congestion_wait, nfss->write_congested == 0); if (err) - return err; + goto out_err; } nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGES); if (!(mntflags & NFS_MOUNT_WRITE_EAGER) || wbc->for_kupdate || - wbc->for_background || wbc->for_sync || wbc->for_reclaim) { + wbc->for_background || wbc->for_sync) { ioc = nfs_io_completion_alloc(GFP_KERNEL); if (ioc) nfs_io_completion_init(ioc, nfs_io_completion_commit, @@ -749,11 +679,15 @@ int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc) } do { + struct folio *folio = NULL; + nfs_pageio_init_write(&pgio, inode, priority, false, &nfs_async_write_completion_ops); pgio.pg_io_completion = ioc; - err = write_cache_pages(mapping, wbc, nfs_writepages_callback, - &pgio); + while ((folio = writeback_iter(mapping, wbc, folio, &err))) { + err = nfs_do_writepage(folio, wbc, &pgio); + folio_unlock(folio); + } pgio.pg_error = 0; nfs_pageio_complete(&pgio); if (err == -EAGAIN && mntflags & NFS_MOUNT_SOFTERR) @@ -761,10 +695,10 @@ int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc) } while (err < 0 && !nfs_error_is_fatal(err)); nfs_io_completion_put(ioc); - if (err < 0) - goto out_err; - return 0; + if (err > 0) + err = 0; out_err: + trace_nfs_writepages_done(inode, wbc->range_start, wbc->range_end - wbc->range_start, err); return err; } @@ -802,7 +736,8 @@ static void nfs_inode_remove_request(struct nfs_page *req) { struct nfs_inode *nfsi = NFS_I(nfs_page_to_inode(req)); - if (nfs_page_group_sync_on_bit(req, PG_REMOVE)) { + nfs_page_group_lock(req); + if (nfs_page_group_sync_on_bit_locked(req, PG_REMOVE)) { struct folio *folio = nfs_page_to_folio(req->wb_head); struct address_space *mapping = folio->mapping; @@ -813,7 +748,10 @@ static void nfs_inode_remove_request(struct nfs_page *req) clear_bit(PG_MAPPED, &req->wb_head->wb_flags); } spin_unlock(&mapping->i_private_lock); + + folio_end_dropbehind(folio); } + nfs_page_group_unlock(req); if (test_and_clear_bit(PG_INODE_REF, &req->wb_flags)) { atomic_long_dec(&nfsi->nrequests); @@ -993,7 +931,7 @@ static void nfs_write_completion(struct nfs_pgio_header *hdr) req->wb_nio = 0; memcpy(&req->wb_verf, &hdr->verf.verifier, sizeof(req->wb_verf)); nfs_mark_request_commit(req, hdr->lseg, &cinfo, - hdr->pgio_mirror_idx); + hdr->ds_commit_idx); goto next; } remove_req: @@ -1084,11 +1022,12 @@ static struct nfs_page *nfs_try_to_update_request(struct folio *folio, unsigned int end; int error; + trace_nfs_try_to_update_request(folio_inode(folio), offset, bytes); end = offset + bytes; req = nfs_lock_and_join_requests(folio); if (IS_ERR_OR_NULL(req)) - return req; + goto out; rqend = req->wb_offset + req->wb_bytes; /* @@ -1110,6 +1049,9 @@ static struct nfs_page *nfs_try_to_update_request(struct folio *folio, else req->wb_bytes = rqend - req->wb_offset; req->wb_nio = 0; +out: + trace_nfs_try_to_update_request_done(folio_inode(folio), offset, bytes, + PTR_ERR_OR_ZERO(req)); return req; out_flushme: /* @@ -1120,6 +1062,7 @@ out_flushme: nfs_mark_request_dirty(req); nfs_unlock_and_release_request(req); error = nfs_wb_folio(folio->mapping->host, folio); + trace_nfs_try_to_update_request_done(folio_inode(folio), offset, bytes, error); return (error < 0) ? ERR_PTR(error) : NULL; } @@ -1157,6 +1100,7 @@ static int nfs_writepage_setup(struct nfs_open_context *ctx, req = nfs_setup_write_request(ctx, folio, offset, count); if (IS_ERR(req)) return PTR_ERR(req); + trace_nfs_writepage_setup(req); /* Update file length */ nfs_grow_file(folio, offset, count); nfs_mark_uptodate(req); @@ -1357,6 +1301,8 @@ int nfs_update_folio(struct file *file, struct folio *folio, nfs_inc_stats(inode, NFSIOS_VFSUPDATEPAGE); + trace_nfs_update_folio(inode, offset, count); + dprintk("NFS: nfs_update_folio(%pD2 %d@%lld)\n", file, count, (long long)(folio_pos(folio) + offset)); @@ -1376,6 +1322,7 @@ int nfs_update_folio(struct file *file, struct folio *folio, if (status < 0) nfs_set_pageerror(mapping); out: + trace_nfs_update_folio_done(inode, offset, count, status); dprintk("NFS: nfs_update_folio returns %d (isize %lld)\n", status, (long long)i_size_read(inode)); return status; @@ -1588,7 +1535,8 @@ static int nfs_writeback_done(struct rpc_task *task, /* Deal with the suid/sgid bit corner case */ if (nfs_should_remove_suid(inode)) { spin_lock(&inode->i_lock); - nfs_set_cache_invalid(inode, NFS_INO_INVALID_MODE); + nfs_set_cache_invalid(inode, NFS_INO_INVALID_MODE + | NFS_INO_REVAL_FORCED); spin_unlock(&inode->i_lock); } return 0; @@ -1873,7 +1821,7 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data) nfs_mapping_set_error(folio, status); nfs_inode_remove_request(req); } - dprintk_cont(", error = %d\n", status); + dprintk(", error = %d\n", status); goto next; } @@ -1883,11 +1831,11 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data) /* We have a match */ if (folio) nfs_inode_remove_request(req); - dprintk_cont(" OK\n"); + dprintk(" OK\n"); goto next; } /* We have a mismatch. Write the page again */ - dprintk_cont(" mismatch\n"); + dprintk(" mismatch\n"); nfs_mark_request_dirty(req); atomic_long_inc(&NFS_I(data->inode)->redirtied_pages); next: @@ -2070,6 +2018,7 @@ int nfs_wb_folio_cancel(struct inode *inode, struct folio *folio) * release it */ nfs_inode_remove_request(req); nfs_unlock_and_release_request(req); + folio_cancel_dirty(folio); } return ret; @@ -2129,8 +2078,12 @@ int nfs_migrate_folio(struct address_space *mapping, struct folio *dst, * that we can safely release the inode reference while holding * the folio lock. */ - if (folio_test_private(src)) - return -EBUSY; + if (folio_test_private(src)) { + if (mode == MIGRATE_SYNC) + nfs_wb_folio(src->mapping->host, src); + if (folio_test_private(src)) + return -EBUSY; + } if (folio_test_private_2(src)) { /* [DEPRECATED] */ if (mode == MIGRATE_ASYNC) |
