diff options
Diffstat (limited to 'fs/nfs')
78 files changed, 7237 insertions, 3190 deletions
diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig index b6fc169be1b1..07932ce9246c 100644 --- a/fs/nfs/Kconfig +++ b/fs/nfs/Kconfig @@ -2,8 +2,10 @@ config NFS_FS tristate "NFS client support" depends on INET && FILE_LOCKING && MULTIUSER + select CRC32 select LOCKD select SUNRPC + select NFS_COMMON select NFS_ACL_SUPPORT if NFS_V3_ACL help Choose Y here if you want to access files residing on other @@ -33,12 +35,12 @@ config NFS_FS config NFS_V2 tristate "NFS client support for NFS version 2" depends on NFS_FS - default y + default n help This option enables support for version 2 of the NFS protocol (RFC 1094) in the kernel's NFS client. - If unsure, say Y. + If unsure, say N. config NFS_V3 tristate "NFS client support for NFS version 3" @@ -125,7 +127,7 @@ config PNFS_BLOCK config PNFS_FLEXFILE_LAYOUT tristate - depends on NFS_V4_1 && NFS_V3 + depends on NFS_V4_1 default NFS_V4 config NFS_V4_1_IMPLEMENTATION_ID_DOMAIN @@ -169,8 +171,9 @@ config ROOT_NFS config NFS_FSCACHE bool "Provide NFS client caching support" - depends on NFS_FS=m && FSCACHE || NFS_FS=y && FSCACHE=y + depends on NFS_FS select NETFS_SUPPORT + select FSCACHE help Say Y here if you want NFS data to be cached locally on disc through the general filesystem cache manager @@ -194,7 +197,6 @@ config NFS_USE_KERNEL_DNS config NFS_DEBUG bool depends on NFS_FS && SUNRPC_DEBUG - select CRC32 default y config NFS_DISABLE_UDP_SUPPORT @@ -209,8 +211,6 @@ config NFS_DISABLE_UDP_SUPPORT config NFS_V4_2_READ_PLUS bool "NFS: Enable support for the NFSv4.2 READ_PLUS operation" depends on NFS_V4_2 - default n + default y help - This is intended for developers only. The READ_PLUS operation has - been shown to have issues under specific conditions and should not - be used in production. + Choose Y here to enable use of the NFS v4.2 READ_PLUS operation. diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile index 5f6db37f461e..9fb2f2cac87e 100644 --- a/fs/nfs/Makefile +++ b/fs/nfs/Makefile @@ -13,6 +13,7 @@ nfs-y := client.o dir.o file.o getroot.o inode.o super.o \ nfs-$(CONFIG_ROOT_NFS) += nfsroot.o nfs-$(CONFIG_SYSCTL) += sysctl.o nfs-$(CONFIG_NFS_FSCACHE) += fscache.o +nfs-$(CONFIG_NFS_LOCALIO) += localio.o obj-$(CONFIG_NFS_V2) += nfsv2.o nfsv2-y := nfs2super.o proc.o nfs2xdr.o diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index 943aeea1eb16..0e4c67373e4f 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -149,8 +149,8 @@ do_add_page_to_bio(struct bio *bio, int npg, enum req_op op, sector_t isect, /* limit length to what the device mapping allows */ end = disk_addr + *len; - if (end >= map->start + map->len) - *len = map->start + map->len - disk_addr; + if (end >= map->disk_offset + map->len) + *len = map->disk_offset + map->len - disk_addr; retry: if (!bio) { @@ -564,23 +564,45 @@ bl_find_get_deviceid(struct nfs_server *server, gfp_t gfp_mask) { struct nfs4_deviceid_node *node; - unsigned long start, end; + int err = -ENODEV; retry: node = nfs4_find_get_deviceid(server, id, cred, gfp_mask); if (!node) return ERR_PTR(-ENODEV); - if (test_bit(NFS_DEVICEID_UNAVAILABLE, &node->flags) == 0) - return node; + /* + * Devices that are marked unavailable are left in the cache with a + * timeout to avoid sending GETDEVINFO after every LAYOUTGET, or + * constantly attempting to register the device. Once marked as + * unavailable they must be deleted and never reused. + */ + if (test_bit(NFS_DEVICEID_UNAVAILABLE, &node->flags)) { + unsigned long end = jiffies; + unsigned long start = end - PNFS_DEVICE_RETRY_TIMEOUT; + + if (!time_in_range(node->timestamp_unavailable, start, end)) { + /* Uncork subsequent GETDEVINFO operations for this device */ + nfs4_delete_deviceid(node->ld, node->nfs_client, id); + goto retry; + } + goto out_put; + } - end = jiffies; - start = end - PNFS_DEVICE_RETRY_TIMEOUT; - if (!time_in_range(node->timestamp_unavailable, start, end)) { - nfs4_delete_deviceid(node->ld, node->nfs_client, id); - goto retry; + if (!bl_register_dev(container_of(node, struct pnfs_block_dev, node))) { + /* + * If we cannot register, treat this device as transient: + * Make a negative cache entry for the device + */ + nfs4_mark_deviceid_unavailable(node); + goto out_put; } - return ERR_PTR(-ENODEV); + + return node; + +out_put: + nfs4_put_deviceid_node(node); + return ERR_PTR(err); } static int @@ -654,7 +676,7 @@ bl_alloc_lseg(struct pnfs_layout_hdr *lo, struct nfs4_layoutget_res *lgr, struct pnfs_layout_segment *lseg; struct xdr_buf buf; struct xdr_stream xdr; - struct page *scratch; + struct folio *scratch; int status, i; uint32_t count; __be32 *p; @@ -667,13 +689,13 @@ bl_alloc_lseg(struct pnfs_layout_hdr *lo, struct nfs4_layoutget_res *lgr, return ERR_PTR(-ENOMEM); status = -ENOMEM; - scratch = alloc_page(gfp_mask); + scratch = folio_alloc(gfp_mask, 0); if (!scratch) goto out; xdr_init_decode_pages(&xdr, &buf, lgr->layoutp->pages, lgr->layoutp->len); - xdr_set_scratch_page(&xdr, scratch); + xdr_set_scratch_folio(&xdr, scratch); status = -EIO; p = xdr_inline_decode(&xdr, 4); @@ -722,7 +744,7 @@ process_extents: } out_free_scratch: - __free_page(scratch); + folio_put(scratch); out: dprintk("%s returns %d\n", __func__, status); switch (status) { @@ -893,10 +915,9 @@ bl_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) } if (pgio->pg_dreq == NULL) - wb_size = pnfs_num_cont_bytes(pgio->pg_inode, - req->wb_index); + wb_size = pnfs_num_cont_bytes(pgio->pg_inode, req->wb_index); else - wb_size = nfs_dreq_bytes_left(pgio->pg_dreq); + wb_size = nfs_dreq_bytes_left(pgio->pg_dreq, req_offset(req)); pnfs_generic_pg_init_write(pgio, req, wb_size); diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h index 716bc75e9ed2..6da40ca19570 100644 --- a/fs/nfs/blocklayout/blocklayout.h +++ b/fs/nfs/blocklayout/blocklayout.h @@ -104,20 +104,26 @@ struct pnfs_block_dev { u64 start; u64 len; + enum pnfs_block_volume_type type; u32 nr_children; struct pnfs_block_dev *children; u64 chunk_size; - struct block_device *bdev; + struct file *bdev_file; u64 disk_offset; + unsigned long flags; u64 pr_key; - bool pr_registered; bool (*map)(struct pnfs_block_dev *dev, u64 offset, struct pnfs_block_dev_map *map); }; +/* pnfs_block_dev flag bits */ +enum { + PNFS_BDEV_REGISTERED = 0, +}; + /* sector_t fields are all in 512-byte sectors */ struct pnfs_block_extent { union { @@ -172,6 +178,7 @@ struct bl_msg_hdr { #define BL_DEVICE_REQUEST_ERR 0x2 /* User level process fails */ /* dev.c */ +bool bl_register_dev(struct pnfs_block_dev *d); struct nfs4_deviceid_node *bl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, gfp_t gfp_mask); void bl_free_deviceid_node(struct nfs4_deviceid_node *d); diff --git a/fs/nfs/blocklayout/dev.c b/fs/nfs/blocklayout/dev.c index 70f5563a8e81..ab76120705e2 100644 --- a/fs/nfs/blocklayout/dev.c +++ b/fs/nfs/blocklayout/dev.c @@ -10,12 +10,81 @@ #include <linux/pr.h> #include "blocklayout.h" +#include "../nfs4trace.h" #define NFSDBG_FACILITY NFSDBG_PNFS_LD +static void bl_unregister_scsi(struct pnfs_block_dev *dev) +{ + struct block_device *bdev = file_bdev(dev->bdev_file); + const struct pr_ops *ops = bdev->bd_disk->fops->pr_ops; + int status; + + status = ops->pr_register(bdev, dev->pr_key, 0, false); + if (status) + trace_bl_pr_key_unreg_err(bdev, dev->pr_key, status); + else + trace_bl_pr_key_unreg(bdev, dev->pr_key); +} + +static bool bl_register_scsi(struct pnfs_block_dev *dev) +{ + struct block_device *bdev = file_bdev(dev->bdev_file); + const struct pr_ops *ops = bdev->bd_disk->fops->pr_ops; + int status; + + if (test_and_set_bit(PNFS_BDEV_REGISTERED, &dev->flags)) + return true; + + status = ops->pr_register(bdev, 0, dev->pr_key, true); + if (status) { + trace_bl_pr_key_reg_err(bdev, dev->pr_key, status); + return false; + } + trace_bl_pr_key_reg(bdev, dev->pr_key); + return true; +} + +static void bl_unregister_dev(struct pnfs_block_dev *dev) +{ + u32 i; + + if (dev->nr_children) { + for (i = 0; i < dev->nr_children; i++) + bl_unregister_dev(&dev->children[i]); + return; + } + + if (dev->type == PNFS_BLOCK_VOLUME_SCSI && + test_and_clear_bit(PNFS_BDEV_REGISTERED, &dev->flags)) + bl_unregister_scsi(dev); +} + +bool bl_register_dev(struct pnfs_block_dev *dev) +{ + u32 i; + + if (dev->nr_children) { + for (i = 0; i < dev->nr_children; i++) { + if (!bl_register_dev(&dev->children[i])) { + while (i > 0) + bl_unregister_dev(&dev->children[--i]); + return false; + } + } + return true; + } + + if (dev->type == PNFS_BLOCK_VOLUME_SCSI) + return bl_register_scsi(dev); + return true; +} + static void bl_free_device(struct pnfs_block_dev *dev) { + bl_unregister_dev(dev); + if (dev->nr_children) { int i; @@ -23,19 +92,8 @@ bl_free_device(struct pnfs_block_dev *dev) bl_free_device(&dev->children[i]); kfree(dev->children); } else { - if (dev->pr_registered) { - const struct pr_ops *ops = - dev->bdev->bd_disk->fops->pr_ops; - int error; - - error = ops->pr_register(dev->bdev, dev->pr_key, 0, - false); - if (error) - pr_err("failed to unregister PR key.\n"); - } - - if (dev->bdev) - blkdev_put(dev->bdev, NULL); + if (dev->bdev_file) + fput(dev->bdev_file); } } @@ -169,7 +227,7 @@ static bool bl_map_simple(struct pnfs_block_dev *dev, u64 offset, map->start = dev->start; map->len = dev->len; map->disk_offset = dev->disk_offset; - map->bdev = dev->bdev; + map->bdev = file_bdev(dev->bdev_file); return true; } @@ -199,10 +257,11 @@ static bool bl_map_stripe(struct pnfs_block_dev *dev, u64 offset, struct pnfs_block_dev *child; u64 chunk; u32 chunk_idx; + u64 disk_chunk; u64 disk_offset; chunk = div_u64(offset, dev->chunk_size); - div_u64_rem(chunk, dev->nr_children, &chunk_idx); + disk_chunk = div_u64_rem(chunk, dev->nr_children, &chunk_idx); if (chunk_idx >= dev->nr_children) { dprintk("%s: invalid chunk idx %d (%lld/%lld)\n", @@ -215,7 +274,7 @@ static bool bl_map_stripe(struct pnfs_block_dev *dev, u64 offset, offset = chunk * dev->chunk_size; /* disk offset of the stripe */ - disk_offset = div_u64(offset, dev->nr_children); + disk_offset = disk_chunk * dev->chunk_size; child = &dev->children[chunk_idx]; child->map(child, disk_offset, map); @@ -236,28 +295,26 @@ bl_parse_simple(struct nfs_server *server, struct pnfs_block_dev *d, struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask) { struct pnfs_block_volume *v = &volumes[idx]; - struct block_device *bdev; + struct file *bdev_file; dev_t dev; dev = bl_resolve_deviceid(server, v, gfp_mask); if (!dev) return -EIO; - bdev = blkdev_get_by_dev(dev, BLK_OPEN_READ | BLK_OPEN_WRITE, NULL, - NULL); - if (IS_ERR(bdev)) { + bdev_file = bdev_file_open_by_dev(dev, BLK_OPEN_READ | BLK_OPEN_WRITE, + NULL, NULL); + if (IS_ERR(bdev_file)) { printk(KERN_WARNING "pNFS: failed to open device %d:%d (%ld)\n", - MAJOR(dev), MINOR(dev), PTR_ERR(bdev)); - return PTR_ERR(bdev); + MAJOR(dev), MINOR(dev), PTR_ERR(bdev_file)); + return PTR_ERR(bdev_file); } - d->bdev = bdev; - - - d->len = bdev_nr_bytes(d->bdev); + d->bdev_file = bdev_file; + d->len = bdev_nr_bytes(file_bdev(bdev_file)); d->map = bl_map_simple; printk(KERN_INFO "pNFS: using block device %s\n", - d->bdev->bd_disk->disk_name); + file_bdev(bdev_file)->bd_disk->disk_name); return 0; } @@ -302,10 +359,10 @@ bl_validate_designator(struct pnfs_block_volume *v) } } -static struct block_device * +static struct file * bl_open_path(struct pnfs_block_volume *v, const char *prefix) { - struct block_device *bdev; + struct file *bdev_file; const char *devname; devname = kasprintf(GFP_KERNEL, "/dev/disk/by-id/%s%*phN", @@ -313,15 +370,15 @@ bl_open_path(struct pnfs_block_volume *v, const char *prefix) if (!devname) return ERR_PTR(-ENOMEM); - bdev = blkdev_get_by_path(devname, BLK_OPEN_READ | BLK_OPEN_WRITE, NULL, - NULL); - if (IS_ERR(bdev)) { - pr_warn("pNFS: failed to open device %s (%ld)\n", - devname, PTR_ERR(bdev)); + bdev_file = bdev_file_open_by_path(devname, BLK_OPEN_READ | BLK_OPEN_WRITE, + NULL, NULL); + if (IS_ERR(bdev_file)) { + dprintk("failed to open device %s (%ld)\n", + devname, PTR_ERR(bdev_file)); } kfree(devname); - return bdev; + return bdev_file; } static int @@ -331,6 +388,7 @@ bl_parse_scsi(struct nfs_server *server, struct pnfs_block_dev *d, struct pnfs_block_volume *v = &volumes[idx]; struct block_device *bdev; const struct pr_ops *ops; + struct file *bdev_file; int error; if (!bl_validate_designator(v)) @@ -342,40 +400,38 @@ bl_parse_scsi(struct nfs_server *server, struct pnfs_block_dev *d, * On other distributions like Debian, the default SCSI by-id path will * point to the dm-multipath device if one exists. */ - bdev = bl_open_path(v, "dm-uuid-mpath-0x"); - if (IS_ERR(bdev)) - bdev = bl_open_path(v, "wwn-0x"); - if (IS_ERR(bdev)) - return PTR_ERR(bdev); - d->bdev = bdev; - - d->len = bdev_nr_bytes(d->bdev); + bdev_file = bl_open_path(v, "dm-uuid-mpath-0x"); + if (IS_ERR(bdev_file)) + bdev_file = bl_open_path(v, "wwn-0x"); + if (IS_ERR(bdev_file)) + bdev_file = bl_open_path(v, "nvme-eui."); + if (IS_ERR(bdev_file)) { + pr_warn("pNFS: no device found for volume %*phN\n", + v->scsi.designator_len, v->scsi.designator); + return PTR_ERR(bdev_file); + } + d->bdev_file = bdev_file; + bdev = file_bdev(bdev_file); + + d->len = bdev_nr_bytes(bdev); d->map = bl_map_simple; d->pr_key = v->scsi.pr_key; - pr_info("pNFS: using block device %s (reservation key 0x%llx)\n", - d->bdev->bd_disk->disk_name, d->pr_key); + if (d->len == 0) + return -ENODEV; - ops = d->bdev->bd_disk->fops->pr_ops; + ops = bdev->bd_disk->fops->pr_ops; if (!ops) { pr_err("pNFS: block device %s does not support reservations.", - d->bdev->bd_disk->disk_name); + bdev->bd_disk->disk_name); error = -EINVAL; goto out_blkdev_put; } - error = ops->pr_register(d->bdev, 0, d->pr_key, true); - if (error) { - pr_err("pNFS: failed to register key for block device %s.", - d->bdev->bd_disk->disk_name); - goto out_blkdev_put; - } - - d->pr_registered = true; return 0; out_blkdev_put: - blkdev_put(d->bdev, NULL); + fput(d->bdev_file); return error; } @@ -404,7 +460,7 @@ bl_parse_concat(struct nfs_server *server, struct pnfs_block_dev *d, int ret, i; d->children = kcalloc(v->concat.volumes_count, - sizeof(struct pnfs_block_dev), GFP_KERNEL); + sizeof(struct pnfs_block_dev), gfp_mask); if (!d->children) return -ENOMEM; @@ -433,7 +489,7 @@ bl_parse_stripe(struct nfs_server *server, struct pnfs_block_dev *d, int ret, i; d->children = kcalloc(v->stripe.volumes_count, - sizeof(struct pnfs_block_dev), GFP_KERNEL); + sizeof(struct pnfs_block_dev), gfp_mask); if (!d->children) return -ENOMEM; @@ -457,7 +513,9 @@ static int bl_parse_deviceid(struct nfs_server *server, struct pnfs_block_dev *d, struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask) { - switch (volumes[idx].type) { + d->type = volumes[idx].type; + + switch (d->type) { case PNFS_BLOCK_VOLUME_SIMPLE: return bl_parse_simple(server, d, volumes, idx, gfp_mask); case PNFS_BLOCK_VOLUME_SLICE: @@ -469,7 +527,7 @@ bl_parse_deviceid(struct nfs_server *server, struct pnfs_block_dev *d, case PNFS_BLOCK_VOLUME_SCSI: return bl_parse_scsi(server, d, volumes, idx, gfp_mask); default: - dprintk("unsupported volume type: %d\n", volumes[idx].type); + dprintk("unsupported volume type: %d\n", d->type); return -EIO; } } @@ -483,16 +541,16 @@ bl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, struct pnfs_block_dev *top; struct xdr_stream xdr; struct xdr_buf buf; - struct page *scratch; + struct folio *scratch; int nr_volumes, ret, i; __be32 *p; - scratch = alloc_page(gfp_mask); + scratch = folio_alloc(gfp_mask, 0); if (!scratch) goto out; xdr_init_decode_pages(&xdr, &buf, pdev->pages, pdev->pglen); - xdr_set_scratch_page(&xdr, scratch); + xdr_set_scratch_folio(&xdr, scratch); p = xdr_inline_decode(&xdr, sizeof(__be32)); if (!p) @@ -524,7 +582,7 @@ bl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, out_free_volumes: kfree(volumes); out_free_scratch: - __free_page(scratch); + folio_put(scratch); out: return node; } diff --git a/fs/nfs/blocklayout/extent_tree.c b/fs/nfs/blocklayout/extent_tree.c index 8f7cff7a4293..315949a7e92d 100644 --- a/fs/nfs/blocklayout/extent_tree.c +++ b/fs/nfs/blocklayout/extent_tree.c @@ -6,6 +6,7 @@ #include <linux/vmalloc.h> #include "blocklayout.h" +#include "../nfs4trace.h" #define NFSDBG_FACILITY NFSDBG_PNFS_LD @@ -520,10 +521,71 @@ static __be32 *encode_scsi_range(struct pnfs_block_extent *be, __be32 *p) return xdr_encode_hyper(p, be->be_length << SECTOR_SHIFT); } -static int ext_tree_encode_commit(struct pnfs_block_layout *bl, __be32 *p, +/** + * ext_tree_try_encode_commit - try to encode all extents into the buffer + * @bl: pointer to the layout + * @p: pointer to the output buffer + * @buffer_size: size of the output buffer + * @count: output pointer to the number of encoded extents + * @lastbyte: output pointer to the last written byte + * + * Return values: + * %0: Success, all required extents encoded, outputs are valid + * %-ENOSPC: Buffer too small, nothing encoded, outputs are invalid + */ +static int +ext_tree_try_encode_commit(struct pnfs_block_layout *bl, __be32 *p, size_t buffer_size, size_t *count, __u64 *lastbyte) { struct pnfs_block_extent *be; + + spin_lock(&bl->bl_ext_lock); + for (be = ext_tree_first(&bl->bl_ext_rw); be; be = ext_tree_next(be)) { + if (be->be_state != PNFS_BLOCK_INVALID_DATA || + be->be_tag != EXTENT_WRITTEN) + continue; + + (*count)++; + if (ext_tree_layoutupdate_size(bl, *count) > buffer_size) { + spin_unlock(&bl->bl_ext_lock); + return -ENOSPC; + } + } + for (be = ext_tree_first(&bl->bl_ext_rw); be; be = ext_tree_next(be)) { + if (be->be_state != PNFS_BLOCK_INVALID_DATA || + be->be_tag != EXTENT_WRITTEN) + continue; + + if (bl->bl_scsi_layout) + p = encode_scsi_range(be, p); + else + p = encode_block_extent(be, p); + be->be_tag = EXTENT_COMMITTING; + } + *lastbyte = (bl->bl_lwb != 0) ? bl->bl_lwb - 1 : U64_MAX; + bl->bl_lwb = 0; + spin_unlock(&bl->bl_ext_lock); + + return 0; +} + +/** + * ext_tree_encode_commit - encode as much as possible extents into the buffer + * @bl: pointer to the layout + * @p: pointer to the output buffer + * @buffer_size: size of the output buffer + * @count: output pointer to the number of encoded extents + * @lastbyte: output pointer to the last written byte + * + * Return values: + * %0: Success, all required extents encoded, outputs are valid + * %-ENOSPC: Buffer too small, some extents are encoded, outputs are valid + */ +static int +ext_tree_encode_commit(struct pnfs_block_layout *bl, __be32 *p, + size_t buffer_size, size_t *count, __u64 *lastbyte) +{ + struct pnfs_block_extent *be, *be_prev; int ret = 0; spin_lock(&bl->bl_ext_lock); @@ -534,9 +596,9 @@ static int ext_tree_encode_commit(struct pnfs_block_layout *bl, __be32 *p, (*count)++; if (ext_tree_layoutupdate_size(bl, *count) > buffer_size) { - /* keep counting.. */ + (*count)--; ret = -ENOSPC; - continue; + break; } if (bl->bl_scsi_layout) @@ -544,14 +606,30 @@ static int ext_tree_encode_commit(struct pnfs_block_layout *bl, __be32 *p, else p = encode_block_extent(be, p); be->be_tag = EXTENT_COMMITTING; + be_prev = be; + } + if (!ret) { + *lastbyte = (bl->bl_lwb != 0) ? bl->bl_lwb - 1 : U64_MAX; + bl->bl_lwb = 0; + } else { + *lastbyte = be_prev->be_f_offset + be_prev->be_length; + *lastbyte <<= SECTOR_SHIFT; + *lastbyte -= 1; } - *lastbyte = bl->bl_lwb - 1; - bl->bl_lwb = 0; spin_unlock(&bl->bl_ext_lock); return ret; } +/** + * ext_tree_prepare_commit - encode extents that need to be committed + * @arg: layout commit data + * + * Return values: + * %0: Success, all required extents are encoded + * %-ENOSPC: Some extents are encoded, but not all, due to RPC size limit + * %-ENOMEM: Out of memory, extents not encoded + */ int ext_tree_prepare_commit(struct nfs4_layoutcommit_args *arg) { @@ -560,20 +638,18 @@ ext_tree_prepare_commit(struct nfs4_layoutcommit_args *arg) __be32 *start_p; int ret; - dprintk("%s enter\n", __func__); - arg->layoutupdate_page = alloc_page(GFP_NOFS); if (!arg->layoutupdate_page) return -ENOMEM; start_p = page_address(arg->layoutupdate_page); arg->layoutupdate_pages = &arg->layoutupdate_page; -retry: - ret = ext_tree_encode_commit(bl, start_p + 1, buffer_size, &count, &arg->lastbytewritten); + ret = ext_tree_try_encode_commit(bl, start_p + 1, buffer_size, + &count, &arg->lastbytewritten); if (unlikely(ret)) { ext_tree_free_commitdata(arg, buffer_size); - buffer_size = ext_tree_layoutupdate_size(bl, count); + buffer_size = NFS_SERVER(arg->inode)->wsize; count = 0; arg->layoutupdate_pages = @@ -588,7 +664,8 @@ retry: return -ENOMEM; } - goto retry; + ret = ext_tree_encode_commit(bl, start_p + 1, buffer_size, + &count, &arg->lastbytewritten); } *start_p = cpu_to_be32(count); @@ -607,8 +684,9 @@ retry: } } - dprintk("%s found %zu ranges\n", __func__, count); - return 0; + trace_bl_ext_tree_prepare_commit(ret, count, + arg->lastbytewritten, !!ret); + return ret; } void diff --git a/fs/nfs/blocklayout/rpc_pipefs.c b/fs/nfs/blocklayout/rpc_pipefs.c index 6c977288cc28..d526f5ba7887 100644 --- a/fs/nfs/blocklayout/rpc_pipefs.c +++ b/fs/nfs/blocklayout/rpc_pipefs.c @@ -75,7 +75,7 @@ bl_resolve_deviceid(struct nfs_server *server, struct pnfs_block_volume *b, msg->len = sizeof(*bl_msg) + b->simple.len; msg->data = kzalloc(msg->len, gfp_mask); if (!msg->data) - goto out_free_data; + goto out_unlock; bl_msg = msg->data; bl_msg->type = BL_DEVICE_MOUNT; @@ -141,24 +141,18 @@ static const struct rpc_pipe_ops bl_upcall_ops = { .destroy_msg = bl_pipe_destroy_msg, }; -static struct dentry *nfs4blocklayout_register_sb(struct super_block *sb, +static int nfs4blocklayout_register_sb(struct super_block *sb, struct rpc_pipe *pipe) { - struct dentry *dir, *dentry; + struct dentry *dir; + int err; dir = rpc_d_lookup_sb(sb, NFS_PIPE_DIRNAME); if (dir == NULL) - return ERR_PTR(-ENOENT); - dentry = rpc_mkpipe_dentry(dir, "blocklayout", NULL, pipe); + return -ENOENT; + err = rpc_mkpipe_dentry(dir, "blocklayout", NULL, pipe); dput(dir); - return dentry; -} - -static void nfs4blocklayout_unregister_sb(struct super_block *sb, - struct rpc_pipe *pipe) -{ - if (pipe->dentry) - rpc_unlink(pipe->dentry); + return err; } static int rpc_pipefs_event(struct notifier_block *nb, unsigned long event, @@ -167,7 +161,6 @@ static int rpc_pipefs_event(struct notifier_block *nb, unsigned long event, struct super_block *sb = ptr; struct net *net = sb->s_fs_info; struct nfs_net *nn = net_generic(net, nfs_net_id); - struct dentry *dentry; int ret = 0; if (!try_module_get(THIS_MODULE)) @@ -180,16 +173,10 @@ static int rpc_pipefs_event(struct notifier_block *nb, unsigned long event, switch (event) { case RPC_PIPEFS_MOUNT: - dentry = nfs4blocklayout_register_sb(sb, nn->bl_device_pipe); - if (IS_ERR(dentry)) { - ret = PTR_ERR(dentry); - break; - } - nn->bl_device_pipe->dentry = dentry; + ret = nfs4blocklayout_register_sb(sb, nn->bl_device_pipe); break; case RPC_PIPEFS_UMOUNT: - if (nn->bl_device_pipe->dentry) - nfs4blocklayout_unregister_sb(sb, nn->bl_device_pipe); + rpc_unlink(nn->bl_device_pipe); break; default: ret = -ENOTSUPP; @@ -203,18 +190,17 @@ static struct notifier_block nfs4blocklayout_block = { .notifier_call = rpc_pipefs_event, }; -static struct dentry *nfs4blocklayout_register_net(struct net *net, - struct rpc_pipe *pipe) +static int nfs4blocklayout_register_net(struct net *net, struct rpc_pipe *pipe) { struct super_block *pipefs_sb; - struct dentry *dentry; + int ret; pipefs_sb = rpc_get_sb_net(net); if (!pipefs_sb) - return NULL; - dentry = nfs4blocklayout_register_sb(pipefs_sb, pipe); + return 0; + ret = nfs4blocklayout_register_sb(pipefs_sb, pipe); rpc_put_sb_net(net); - return dentry; + return ret; } static void nfs4blocklayout_unregister_net(struct net *net, @@ -224,7 +210,7 @@ static void nfs4blocklayout_unregister_net(struct net *net, pipefs_sb = rpc_get_sb_net(net); if (pipefs_sb) { - nfs4blocklayout_unregister_sb(pipefs_sb, pipe); + rpc_unlink(pipe); rpc_put_sb_net(net); } } @@ -232,20 +218,17 @@ static void nfs4blocklayout_unregister_net(struct net *net, static int nfs4blocklayout_net_init(struct net *net) { struct nfs_net *nn = net_generic(net, nfs_net_id); - struct dentry *dentry; + int err; mutex_init(&nn->bl_mutex); init_waitqueue_head(&nn->bl_wq); nn->bl_device_pipe = rpc_mkpipe_data(&bl_upcall_ops, 0); if (IS_ERR(nn->bl_device_pipe)) return PTR_ERR(nn->bl_device_pipe); - dentry = nfs4blocklayout_register_net(net, nn->bl_device_pipe); - if (IS_ERR(dentry)) { + err = nfs4blocklayout_register_net(net, nn->bl_device_pipe); + if (unlikely(err)) rpc_destroy_pipe_data(nn->bl_device_pipe); - return PTR_ERR(dentry); - } - nn->bl_device_pipe->dentry = dentry; - return 0; + return err; } static void nfs4blocklayout_net_exit(struct net *net) diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index 456af7d230cf..c8b837006bb2 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -74,72 +74,20 @@ out_err: static int nfs4_callback_svc(void *vrqstp) { - int err; struct svc_rqst *rqstp = vrqstp; - set_freezable(); - - while (!kthread_freezable_should_stop(NULL)) { - - if (signal_pending(current)) - flush_signals(current); - /* - * Listen for a request on the socket - */ - err = svc_recv(rqstp, MAX_SCHEDULE_TIMEOUT); - if (err == -EAGAIN || err == -EINTR) - continue; - svc_process(rqstp); - } - - svc_exit_thread(rqstp); - return 0; -} - -#if defined(CONFIG_NFS_V4_1) -/* - * The callback service for NFSv4.1 callbacks - */ -static int -nfs41_callback_svc(void *vrqstp) -{ - struct svc_rqst *rqstp = vrqstp; - struct svc_serv *serv = rqstp->rq_server; - struct rpc_rqst *req; - int error; - DEFINE_WAIT(wq); + svc_thread_init_status(rqstp, 0); set_freezable(); - while (!kthread_freezable_should_stop(NULL)) { - - if (signal_pending(current)) - flush_signals(current); - - prepare_to_wait(&serv->sv_cb_waitq, &wq, TASK_INTERRUPTIBLE); - spin_lock_bh(&serv->sv_cb_lock); - if (!list_empty(&serv->sv_cb_list)) { - req = list_first_entry(&serv->sv_cb_list, - struct rpc_rqst, rq_bc_list); - list_del(&req->rq_bc_list); - spin_unlock_bh(&serv->sv_cb_lock); - finish_wait(&serv->sv_cb_waitq, &wq); - dprintk("Invoking bc_svc_process()\n"); - error = bc_svc_process(serv, req, rqstp); - dprintk("bc_svc_process() returned w/ error code= %d\n", - error); - } else { - spin_unlock_bh(&serv->sv_cb_lock); - if (!kthread_should_stop()) - schedule(); - finish_wait(&serv->sv_cb_waitq, &wq); - } - } + while (!svc_thread_should_stop(rqstp)) + svc_recv(rqstp); svc_exit_thread(rqstp); return 0; } +#if defined(CONFIG_NFS_V4_1) static inline void nfs_callback_bc_serv(u32 minorversion, struct rpc_xprt *xprt, struct svc_serv *serv) { @@ -188,7 +136,7 @@ static void nfs_callback_down_net(u32 minorversion, struct svc_serv *serv, struc return; dprintk("NFS: destroy per-net callback data; net=%x\n", net->ns.inum); - svc_xprt_destroy_all(serv, net); + svc_xprt_destroy_all(serv, net, false); } static int nfs_callback_up_net(int minorversion, struct svc_serv *serv, @@ -205,7 +153,7 @@ static int nfs_callback_up_net(int minorversion, struct svc_serv *serv, ret = svc_bind(serv, net); if (ret < 0) { printk(KERN_WARNING "NFS: bind callback service failed\n"); - goto err_bind; + goto err; } ret = 0; @@ -218,13 +166,11 @@ static int nfs_callback_up_net(int minorversion, struct svc_serv *serv, if (ret < 0) { printk(KERN_ERR "NFS: callback service start failed\n"); - goto err_socks; + goto err; } return 0; -err_socks: - svc_rpcb_cleanup(serv, net); -err_bind: +err: nn->cb_users[minorversion]--; dprintk("NFS: Couldn't create callback socket: err = %d; " "net = %x\n", ret, net->ns.inum); @@ -241,7 +187,7 @@ static struct svc_serv *nfs_callback_create_svc(int minorversion) * Check whether we're already up and running. */ if (cb_info->serv) - return svc_get(cb_info->serv); + return cb_info->serv; /* * Sanity check: if there's no task, @@ -252,10 +198,7 @@ static struct svc_serv *nfs_callback_create_svc(int minorversion) cb_info->users); threadfn = nfs4_callback_svc; -#if defined(CONFIG_NFS_V4_1) - if (minorversion) - threadfn = nfs41_callback_svc; -#else +#if !defined(CONFIG_NFS_V4_1) if (minorversion) return ERR_PTR(-ENOTSUPP); #endif @@ -266,10 +209,6 @@ static struct svc_serv *nfs_callback_create_svc(int minorversion) return ERR_PTR(-ENOMEM); } cb_info->serv = serv; - /* As there is only one thread we need to over-ride the - * default maximum of 80 connections - */ - serv->sv_maxconn = 1024; dprintk("nfs_callback_create_svc: service created\n"); return serv; } @@ -302,9 +241,10 @@ int nfs_callback_up(u32 minorversion, struct rpc_xprt *xprt) cb_info->users++; err_net: - if (!cb_info->users) - cb_info->serv = NULL; - svc_put(serv); + if (!cb_info->users) { + svc_set_num_threads(cb_info->serv, NULL, 0); + svc_destroy(&cb_info->serv); + } err_create: mutex_unlock(&nfs_callback_mutex); return ret; @@ -328,11 +268,9 @@ void nfs_callback_down(int minorversion, struct net *net) nfs_callback_down_net(minorversion, serv, net); cb_info->users--; if (cb_info->users == 0) { - svc_get(serv); svc_set_num_threads(serv, NULL, 0); - svc_put(serv); dprintk("nfs_callback_down: service destroyed\n"); - cb_info->serv = NULL; + svc_destroy(&cb_info->serv); } mutex_unlock(&nfs_callback_mutex); } @@ -387,7 +325,7 @@ check_gss_callback_principal(struct nfs_client *clp, struct svc_rqst *rqstp) * All other checking done after NFS decoding where the nfs_client can be * found in nfs4_callback_compound */ -static int nfs_callback_authenticate(struct svc_rqst *rqstp) +static enum svc_auth_status nfs_callback_authenticate(struct svc_rqst *rqstp) { rqstp->rq_auth_stat = rpc_autherr_badcred; @@ -414,15 +352,12 @@ static const struct svc_version *nfs4_callback_version[] = { [4] = &nfs4_callback_version4, }; -static struct svc_stat nfs4_callback_stats; - static struct svc_program nfs4_callback_program = { .pg_prog = NFS4_CALLBACK, /* RPC service number */ .pg_nvers = ARRAY_SIZE(nfs4_callback_version), /* Number of entries */ .pg_vers = nfs4_callback_version, /* version table */ .pg_name = "NFSv4 callback", /* service name */ .pg_class = "nfs", /* authentication class */ - .pg_stats = &nfs4_callback_stats, .pg_authenticate = nfs_callback_authenticate, .pg_init_request = svc_generic_init_request, .pg_rpcbind_set = svc_generic_rpcbind_set, diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h index ccd4f245cae2..154a6ed1299f 100644 --- a/fs/nfs/callback.h +++ b/fs/nfs/callback.h @@ -19,32 +19,14 @@ enum nfs4_callback_procnum { CB_COMPOUND = 1, }; -enum nfs4_callback_opnum { - OP_CB_GETATTR = 3, - OP_CB_RECALL = 4, -/* Callback operations new to NFSv4.1 */ - OP_CB_LAYOUTRECALL = 5, - OP_CB_NOTIFY = 6, - OP_CB_PUSH_DELEG = 7, - OP_CB_RECALL_ANY = 8, - OP_CB_RECALLABLE_OBJ_AVAIL = 9, - OP_CB_RECALL_SLOT = 10, - OP_CB_SEQUENCE = 11, - OP_CB_WANTS_CANCELLED = 12, - OP_CB_NOTIFY_LOCK = 13, - OP_CB_NOTIFY_DEVICEID = 14, -/* Callback operations new to NFSv4.2 */ - OP_CB_OFFLOAD = 15, - OP_CB_ILLEGAL = 10044, -}; - struct nfs4_slot; struct cb_process_state { - __be32 drc_status; struct nfs_client *clp; struct nfs4_slot *slot; - u32 minorversion; struct net *net; + u32 minorversion; + __be32 drc_status; + unsigned int referring_calls; }; struct cb_compound_hdr_arg { @@ -64,14 +46,15 @@ struct cb_compound_hdr_res { struct cb_getattrargs { struct nfs_fh fh; - uint32_t bitmap[2]; + uint32_t bitmap[3]; }; struct cb_getattrres { __be32 status; - uint32_t bitmap[2]; + uint32_t bitmap[3]; uint64_t size; uint64_t change_attr; + struct timespec64 atime; struct timespec64 ctime; struct timespec64 mtime; }; diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index c1eda73254e1..8397c43358bd 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -37,7 +37,7 @@ __be32 nfs4_callback_getattr(void *argp, void *resp, if (!cps->clp) /* Always set for v4.0. Set in cb_sequence for v4.1 */ goto out; - res->bitmap[0] = res->bitmap[1] = 0; + memset(res->bitmap, 0, sizeof(res->bitmap)); res->status = htonl(NFS4ERR_BADHANDLE); dprintk_rcu("NFS: GETATTR callback request from %s\n", @@ -59,12 +59,16 @@ __be32 nfs4_callback_getattr(void *argp, void *resp, res->change_attr = delegation->change_attr; if (nfs_have_writebacks(inode)) res->change_attr++; - res->ctime = inode->i_ctime; - res->mtime = inode->i_mtime; - res->bitmap[0] = (FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE) & - args->bitmap[0]; - res->bitmap[1] = (FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY) & - args->bitmap[1]; + res->atime = inode_get_atime(inode); + res->ctime = inode_get_ctime(inode); + res->mtime = inode_get_mtime(inode); + res->bitmap[0] = (FATTR4_WORD0_CHANGE | FATTR4_WORD0_SIZE) & + args->bitmap[0]; + res->bitmap[1] = (FATTR4_WORD1_TIME_ACCESS | + FATTR4_WORD1_TIME_METADATA | + FATTR4_WORD1_TIME_MODIFY) & args->bitmap[1]; + res->bitmap[2] = (FATTR4_WORD2_TIME_DELEG_ACCESS | + FATTR4_WORD2_TIME_DELEG_MODIFY) & args->bitmap[2]; res->status = 0; out_iput: rcu_read_unlock(); @@ -207,7 +211,8 @@ static struct inode *nfs_layout_find_inode(struct nfs_client *clp, * Enforce RFC5661 section 12.5.5.2.1. (Layout Recall and Return Sequencing) */ static u32 pnfs_check_callback_stateid(struct pnfs_layout_hdr *lo, - const nfs4_stateid *new) + const nfs4_stateid *new, + struct cb_process_state *cps) { u32 oldseq, newseq; @@ -221,28 +226,29 @@ static u32 pnfs_check_callback_stateid(struct pnfs_layout_hdr *lo, newseq = be32_to_cpu(new->seqid); /* Are we already in a layout recall situation? */ - if (test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags) && - lo->plh_return_seq != 0) { - if (newseq < lo->plh_return_seq) - return NFS4ERR_OLD_STATEID; - if (newseq > lo->plh_return_seq) - return NFS4ERR_DELAY; - goto out; - } + if (test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags)) + return NFS4ERR_DELAY; - /* Check that the stateid matches what we think it should be. */ + /* + * Check that the stateid matches what we think it should be. + * Note that if the server sent us a list of referring calls, + * and we know that those have completed, then we trust the + * stateid argument is correct. + */ oldseq = be32_to_cpu(lo->plh_stateid.seqid); - if (newseq > oldseq + 1) + if (newseq > oldseq + 1 && !cps->referring_calls) return NFS4ERR_DELAY; + /* Crazy server! */ if (newseq <= oldseq) return NFS4ERR_OLD_STATEID; -out: + return NFS_OK; } static u32 initiate_file_draining(struct nfs_client *clp, - struct cb_layoutrecallargs *args) + struct cb_layoutrecallargs *args, + struct cb_process_state *cps) { struct inode *ino; struct pnfs_layout_hdr *lo; @@ -266,7 +272,7 @@ static u32 initiate_file_draining(struct nfs_client *clp, goto out; } pnfs_get_layout_hdr(lo); - rv = pnfs_check_callback_stateid(lo, &args->cbl_stateid); + rv = pnfs_check_callback_stateid(lo, &args->cbl_stateid, cps); if (rv != NFS_OK) goto unlock; @@ -317,19 +323,21 @@ static u32 initiate_bulk_draining(struct nfs_client *clp, int stat; if (args->cbl_recall_type == RETURN_FSID) - stat = pnfs_destroy_layouts_byfsid(clp, &args->cbl_fsid, true); + stat = pnfs_layout_destroy_byfsid(clp, &args->cbl_fsid, + PNFS_LAYOUT_BULK_RETURN); else - stat = pnfs_destroy_layouts_byclid(clp, true); + stat = pnfs_layout_destroy_byclid(clp, PNFS_LAYOUT_BULK_RETURN); if (stat != 0) return NFS4ERR_DELAY; return NFS4ERR_NOMATCHING_LAYOUT; } static u32 do_callback_layoutrecall(struct nfs_client *clp, - struct cb_layoutrecallargs *args) + struct cb_layoutrecallargs *args, + struct cb_process_state *cps) { if (args->cbl_recall_type == RETURN_FILE) - return initiate_file_draining(clp, args); + return initiate_file_draining(clp, args, cps); return initiate_bulk_draining(clp, args); } @@ -340,11 +348,12 @@ __be32 nfs4_callback_layoutrecall(void *argp, void *resp, u32 res = NFS4ERR_OP_NOT_IN_SESSION; if (cps->clp) - res = do_callback_layoutrecall(cps->clp, args); + res = do_callback_layoutrecall(cps->clp, args, cps); return cpu_to_be32(res); } -static void pnfs_recall_all_layouts(struct nfs_client *clp) +static void pnfs_recall_all_layouts(struct nfs_client *clp, + struct cb_process_state *cps) { struct cb_layoutrecallargs args; @@ -352,7 +361,7 @@ static void pnfs_recall_all_layouts(struct nfs_client *clp) memset(&args, 0, sizeof(args)); args.cbl_recall_type = RETURN_ALL; /* FIXME we ignore errors, what should we do? */ - do_callback_layoutrecall(clp, &args); + do_callback_layoutrecall(clp, &args, cps); } __be32 nfs4_callback_devicenotify(void *argp, void *resp, @@ -450,6 +459,7 @@ static int referring_call_exists(struct nfs_client *clp, __acquires(lock) { int status = 0; + int found = 0; int i, j; struct nfs4_session *session; struct nfs4_slot_table *tbl; @@ -478,11 +488,12 @@ static int referring_call_exists(struct nfs_client *clp, spin_lock(lock); if (status) goto out; + found++; } } out: - return status; + return status < 0 ? status : found; } __be32 nfs4_callback_sequence(void *argp, void *resp, @@ -493,6 +504,7 @@ __be32 nfs4_callback_sequence(void *argp, void *resp, struct nfs4_slot_table *tbl; struct nfs4_slot *slot; struct nfs_client *clp; + int ret; int i; __be32 status = htonl(NFS4ERR_BADSESSION); @@ -552,11 +564,13 @@ __be32 nfs4_callback_sequence(void *argp, void *resp, * related callback was received before the response to the original * call. */ - if (referring_call_exists(clp, args->csa_nrclists, args->csa_rclists, - &tbl->slot_tbl_lock) < 0) { + ret = referring_call_exists(clp, args->csa_nrclists, args->csa_rclists, + &tbl->slot_tbl_lock); + if (ret < 0) { status = htonl(NFS4ERR_DELAY); goto out_unlock; } + cps->referring_calls = ret; /* * RFC5661 20.9.3 @@ -617,7 +631,7 @@ __be32 nfs4_callback_recallany(void *argp, void *resp, nfs_expire_unused_delegation_types(cps->clp, flags); if (args->craa_type_mask & BIT(RCA4_TYPE_MASK_FILE_LAYOUT)) - pnfs_recall_all_layouts(cps->clp); + pnfs_recall_all_layouts(cps->clp, cps); if (args->craa_type_mask & BIT(PNFS_FF_RCA4_TYPE_MASK_READ)) { set_bit(NFS4CLNT_RECALL_ANY_LAYOUT_READ, &cps->clp->cl_state); @@ -704,7 +718,7 @@ __be32 nfs4_callback_offload(void *data, void *dummy, copy = kzalloc(sizeof(struct nfs4_copy_state), GFP_KERNEL); if (!copy) - return htonl(NFS4ERR_SERVERFAULT); + return cpu_to_be32(NFS4ERR_DELAY); spin_lock(&cps->clp->cl_lock); rcu_read_lock(); diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c index 321af81c456e..4254ba3ee7c5 100644 --- a/fs/nfs/callback_xdr.c +++ b/fs/nfs/callback_xdr.c @@ -25,8 +25,9 @@ #define CB_OP_GETATTR_BITMAP_MAXSZ (4 * 4) // bitmap length, 3 bitmaps #define CB_OP_GETATTR_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ + \ CB_OP_GETATTR_BITMAP_MAXSZ + \ - /* change, size, ctime, mtime */\ - (2 + 2 + 3 + 3) * 4) + /* change, size, atime, ctime, + * mtime, deleg_atime, deleg_mtime */\ + (2 + 2 + 3 + 3 + 3 + 3 + 3) * 4) #define CB_OP_RECALL_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ) #if defined(CONFIG_NFS_V4_1) @@ -117,7 +118,9 @@ static __be32 decode_bitmap(struct xdr_stream *xdr, uint32_t *bitmap) if (likely(attrlen > 0)) bitmap[0] = ntohl(*p++); if (attrlen > 1) - bitmap[1] = ntohl(*p); + bitmap[1] = ntohl(*p++); + if (attrlen > 2) + bitmap[2] = ntohl(*p); return 0; } @@ -372,6 +375,8 @@ static __be32 decode_rc_list(struct xdr_stream *xdr, rc_list->rcl_nrefcalls = ntohl(*p++); if (rc_list->rcl_nrefcalls) { + if (unlikely(rc_list->rcl_nrefcalls > xdr->buf->len)) + goto out; p = xdr_inline_decode(xdr, rc_list->rcl_nrefcalls * 2 * sizeof(uint32_t)); if (unlikely(p == NULL)) @@ -445,7 +450,7 @@ static __be32 decode_recallany_args(struct svc_rqst *rqstp, void *argp) { struct cb_recallanyargs *args = argp; - uint32_t bitmap[2]; + uint32_t bitmap[3]; __be32 *p, status; p = xdr_inline_decode(xdr, 4); @@ -635,6 +640,13 @@ static __be32 encode_attr_time(struct xdr_stream *xdr, const struct timespec64 * return 0; } +static __be32 encode_attr_atime(struct xdr_stream *xdr, const uint32_t *bitmap, const struct timespec64 *time) +{ + if (!(bitmap[1] & FATTR4_WORD1_TIME_ACCESS)) + return 0; + return encode_attr_time(xdr,time); +} + static __be32 encode_attr_ctime(struct xdr_stream *xdr, const uint32_t *bitmap, const struct timespec64 *time) { if (!(bitmap[1] & FATTR4_WORD1_TIME_METADATA)) @@ -649,6 +661,24 @@ static __be32 encode_attr_mtime(struct xdr_stream *xdr, const uint32_t *bitmap, return encode_attr_time(xdr,time); } +static __be32 encode_attr_delegatime(struct xdr_stream *xdr, + const uint32_t *bitmap, + const struct timespec64 *time) +{ + if (!(bitmap[2] & FATTR4_WORD2_TIME_DELEG_ACCESS)) + return 0; + return encode_attr_time(xdr,time); +} + +static __be32 encode_attr_delegmtime(struct xdr_stream *xdr, + const uint32_t *bitmap, + const struct timespec64 *time) +{ + if (!(bitmap[2] & FATTR4_WORD2_TIME_DELEG_MODIFY)) + return 0; + return encode_attr_time(xdr,time); +} + static __be32 encode_compound_hdr_res(struct xdr_stream *xdr, struct cb_compound_hdr_res *hdr) { __be32 status; @@ -699,10 +729,19 @@ static __be32 encode_getattr_res(struct svc_rqst *rqstp, struct xdr_stream *xdr, status = encode_attr_size(xdr, res->bitmap, res->size); if (unlikely(status != 0)) goto out; + status = encode_attr_atime(xdr, res->bitmap, &res->atime); + if (unlikely(status != 0)) + goto out; status = encode_attr_ctime(xdr, res->bitmap, &res->ctime); if (unlikely(status != 0)) goto out; status = encode_attr_mtime(xdr, res->bitmap, &res->mtime); + if (unlikely(status != 0)) + goto out; + status = encode_attr_delegatime(xdr, res->bitmap, &res->atime); + if (unlikely(status != 0)) + goto out; + status = encode_attr_delegmtime(xdr, res->bitmap, &res->mtime); *savep = htonl((unsigned int)((char *)xdr->p - (char *)(savep+1))); out: return status; @@ -945,6 +984,7 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp) nfs_put_client(cps.clp); goto out_invalidcred; } + svc_xprt_set_valid(rqstp->rq_xprt); } cps.minorversion = hdr_arg.minorversion; @@ -967,6 +1007,11 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp) nops--; } + if (svc_is_backchannel(rqstp) && cps.clp) { + rqstp->bc_to_initval = cps.clp->cl_rpcclient->cl_timeout->to_initval; + rqstp->bc_to_retries = cps.clp->cl_rpcclient->cl_timeout->to_retries; + } + *hdr_res.status = status; *hdr_res.nops = htonl(nops); nfs4_cb_free_slot(&cps); diff --git a/fs/nfs/client.c b/fs/nfs/client.c index e4c5f193ed5e..54699299d5b1 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -38,7 +38,7 @@ #include <linux/sunrpc/bc_xprt.h> #include <linux/nsproxy.h> #include <linux/pid_namespace.h> - +#include <linux/nfslocalio.h> #include "nfs4_fs.h" #include "callback.h" @@ -55,9 +55,13 @@ #define NFSDBG_FACILITY NFSDBG_CLIENT static DECLARE_WAIT_QUEUE_HEAD(nfs_client_active_wq); -static DEFINE_SPINLOCK(nfs_version_lock); -static DEFINE_MUTEX(nfs_version_mutex); -static LIST_HEAD(nfs_versions); +static DEFINE_RWLOCK(nfs_version_lock); + +static struct nfs_subversion *nfs_version_mods[5] = { + [2] = NULL, + [3] = NULL, + [4] = NULL, +}; /* * RPC cruft for NFS @@ -73,46 +77,41 @@ const struct rpc_program nfs_program = { .number = NFS_PROGRAM, .nrvers = ARRAY_SIZE(nfs_version), .version = nfs_version, - .stats = &nfs_rpcstat, .pipe_dir_name = NFS_PIPE_DIRNAME, }; -struct rpc_stat nfs_rpcstat = { - .program = &nfs_program -}; - -static struct nfs_subversion *find_nfs_version(unsigned int version) +static struct nfs_subversion *__find_nfs_version(unsigned int version) { struct nfs_subversion *nfs; - spin_lock(&nfs_version_lock); - list_for_each_entry(nfs, &nfs_versions, list) { - if (nfs->rpc_ops->version == version) { - spin_unlock(&nfs_version_lock); - return nfs; - } - } - - spin_unlock(&nfs_version_lock); - return ERR_PTR(-EPROTONOSUPPORT); + read_lock(&nfs_version_lock); + nfs = nfs_version_mods[version]; + read_unlock(&nfs_version_lock); + return nfs; } -struct nfs_subversion *get_nfs_version(unsigned int version) +struct nfs_subversion *find_nfs_version(unsigned int version) { - struct nfs_subversion *nfs = find_nfs_version(version); + struct nfs_subversion *nfs = __find_nfs_version(version); - if (IS_ERR(nfs)) { - mutex_lock(&nfs_version_mutex); - request_module("nfsv%d", version); - nfs = find_nfs_version(version); - mutex_unlock(&nfs_version_mutex); - } + if (!nfs && request_module("nfsv%d", version) == 0) + nfs = __find_nfs_version(version); + + if (!nfs) + return ERR_PTR(-EPROTONOSUPPORT); - if (!IS_ERR(nfs) && !try_module_get(nfs->owner)) + if (!get_nfs_version(nfs)) return ERR_PTR(-EAGAIN); + return nfs; } +int get_nfs_version(struct nfs_subversion *nfs) +{ + return try_module_get(nfs->owner); +} +EXPORT_SYMBOL_GPL(get_nfs_version); + void put_nfs_version(struct nfs_subversion *nfs) { module_put(nfs->owner); @@ -120,23 +119,23 @@ void put_nfs_version(struct nfs_subversion *nfs) void register_nfs_version(struct nfs_subversion *nfs) { - spin_lock(&nfs_version_lock); + write_lock(&nfs_version_lock); - list_add(&nfs->list, &nfs_versions); + nfs_version_mods[nfs->rpc_ops->version] = nfs; nfs_version[nfs->rpc_ops->version] = nfs->rpc_vers; - spin_unlock(&nfs_version_lock); + write_unlock(&nfs_version_lock); } EXPORT_SYMBOL_GPL(register_nfs_version); void unregister_nfs_version(struct nfs_subversion *nfs) { - spin_lock(&nfs_version_lock); + write_lock(&nfs_version_lock); nfs_version[nfs->rpc_ops->version] = NULL; - list_del(&nfs->list); + nfs_version_mods[nfs->rpc_ops->version] = NULL; - spin_unlock(&nfs_version_lock); + write_unlock(&nfs_version_lock); } EXPORT_SYMBOL_GPL(unregister_nfs_version); @@ -156,7 +155,7 @@ struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_init) clp->cl_minorversion = cl_init->minorversion; clp->cl_nfs_mod = cl_init->nfs_mod; - if (!try_module_get(clp->cl_nfs_mod->owner)) + if (!get_nfs_version(clp->cl_nfs_mod)) goto error_dealloc; clp->rpc_ops = clp->cl_nfs_mod->rpc_ops; @@ -181,7 +180,14 @@ struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_init) clp->cl_proto = cl_init->proto; clp->cl_nconnect = cl_init->nconnect; clp->cl_max_connect = cl_init->max_connect ? cl_init->max_connect : 1; - clp->cl_net = get_net(cl_init->net); + clp->cl_net = get_net_track(cl_init->net, &clp->cl_ns_tracker, GFP_KERNEL); + +#if IS_ENABLED(CONFIG_NFS_LOCALIO) + seqlock_init(&clp->cl_boot_lock); + ktime_get_real_ts64(&clp->cl_nfssvc_boot); + nfs_uuid_init(&clp->cl_uuid); + INIT_WORK(&clp->cl_local_probe_work, nfs_local_probe_async_work); +#endif /* CONFIG_NFS_LOCALIO */ clp->cl_principal = "*"; clp->cl_xprtsec = cl_init->xprtsec; @@ -238,15 +244,17 @@ static void pnfs_init_server(struct nfs_server *server) */ void nfs_free_client(struct nfs_client *clp) { + nfs_localio_disable_client(clp); + /* -EIO all pending I/O */ if (!IS_ERR(clp->cl_rpcclient)) rpc_shutdown_client(clp->cl_rpcclient); - put_net(clp->cl_net); + put_net_track(clp->cl_net, &clp->cl_ns_tracker); put_nfs_version(clp->cl_nfs_mod); kfree(clp->cl_hostname); kfree(clp->cl_acceptor); - kfree(clp); + kfree_rcu(clp, rcu); } EXPORT_SYMBOL_GPL(nfs_free_client); @@ -330,6 +338,14 @@ again: /* Match the xprt security policy */ if (clp->cl_xprtsec.policy != data->xprtsec.policy) continue; + if (clp->cl_xprtsec.policy == RPC_XPRTSEC_TLS_X509) { + if (clp->cl_xprtsec.cert_serial != + data->xprtsec.cert_serial) + continue; + if (clp->cl_xprtsec.privkey_serial != + data->xprtsec.privkey_serial) + continue; + } refcount_inc(&clp->cl_count); return clp; @@ -429,7 +445,10 @@ struct nfs_client *nfs_get_client(const struct nfs_client_initdata *cl_init) list_add_tail(&new->cl_share_link, &nn->nfs_client_list); spin_unlock(&nn->nfs_client_lock); - return rpc_ops->init_client(new, cl_init); + new = rpc_ops->init_client(new, cl_init); + if (!IS_ERR(new)) + nfs_local_probe_async(new); + return new; } spin_unlock(&nn->nfs_client_lock); @@ -502,6 +521,7 @@ int nfs_create_rpc_client(struct nfs_client *clp, const struct nfs_client_initdata *cl_init, rpc_authflavor_t flavor) { + struct nfs_net *nn = net_generic(clp->cl_net, nfs_net_id); struct rpc_clnt *clnt = NULL; struct rpc_create_args args = { .net = clp->cl_net, @@ -513,10 +533,13 @@ int nfs_create_rpc_client(struct nfs_client *clp, .servername = clp->cl_hostname, .nodename = cl_init->nodename, .program = &nfs_program, + .stats = &nn->rpcstats, .version = clp->rpc_ops->version, .authflavor = flavor, .cred = cl_init->cred, .xprtsec = cl_init->xprtsec, + .connect_timeout = cl_init->connect_timeout, + .reconnect_timeout = cl_init->reconnect_timeout, }; if (test_bit(NFS_CS_DISCRTRY, &clp->cl_flags)) @@ -531,6 +554,8 @@ int nfs_create_rpc_client(struct nfs_client *clp, args.flags |= RPC_CLNT_CREATE_NOPING; if (test_bit(NFS_CS_REUSEPORT, &clp->cl_flags)) args.flags |= RPC_CLNT_CREATE_REUSEPORT; + if (test_bit(NFS_CS_NETUNREACH_FATAL, &clp->cl_flags)) + args.flags |= RPC_CLNT_CREATE_NETUNREACH_FATAL; if (!IS_ERR(clp->cl_rpcclient)) return 0; @@ -665,6 +690,44 @@ struct nfs_client *nfs_init_client(struct nfs_client *clp, } EXPORT_SYMBOL_GPL(nfs_init_client); +static void nfs4_server_set_init_caps(struct nfs_server *server) +{ +#if IS_ENABLED(CONFIG_NFS_V4) + /* Set the basic capabilities */ + server->caps = server->nfs_client->cl_mvops->init_caps; + if (server->flags & NFS_MOUNT_NORDIRPLUS) + server->caps &= ~NFS_CAP_READDIRPLUS; + if (server->nfs_client->cl_proto == XPRT_TRANSPORT_RDMA) + server->caps &= ~NFS_CAP_READ_PLUS; + + /* + * Don't use NFS uid/gid mapping if we're using AUTH_SYS or lower + * authentication. + */ + if (nfs4_disable_idmapping && + server->client->cl_auth->au_flavor == RPC_AUTH_UNIX) + server->caps |= NFS_CAP_UIDGID_NOMAP; +#endif +} + +void nfs_server_set_init_caps(struct nfs_server *server) +{ + switch (server->nfs_client->rpc_ops->version) { + case 2: + server->caps = NFS_CAP_HARDLINKS | NFS_CAP_SYMLINKS; + break; + case 3: + server->caps = NFS_CAP_HARDLINKS | NFS_CAP_SYMLINKS; + if (!(server->flags & NFS_MOUNT_NORDIRPLUS)) + server->caps |= NFS_CAP_READDIRPLUS; + break; + default: + nfs4_server_set_init_caps(server); + break; + } +} +EXPORT_SYMBOL_GPL(nfs_server_set_init_caps); + /* * Create a version 2 or 3 client */ @@ -694,6 +757,9 @@ static int nfs_init_server(struct nfs_server *server, if (ctx->flags & NFS_MOUNT_NORESVPORT) set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags); + if (ctx->flags & NFS_MOUNT_NETUNREACH_FATAL) + __set_bit(NFS_CS_NETUNREACH_FATAL, &cl_init.init_flags); + /* Allocate or find a client reference we can use */ clp = nfs_get_client(&cl_init); if (IS_ERR(clp)) @@ -706,7 +772,6 @@ static int nfs_init_server(struct nfs_server *server, /* Initialise the client representation from the mount data */ server->flags = ctx->flags; server->options = ctx->options; - server->caps |= NFS_CAP_HARDLINKS | NFS_CAP_SYMLINKS; switch (clp->rpc_ops->version) { case 2: @@ -742,6 +807,8 @@ static int nfs_init_server(struct nfs_server *server, if (error < 0) goto error; + nfs_server_set_init_caps(server); + /* Preserve the values of mount_server-related mount options */ if (ctx->mount_server.addrlen) { memcpy(&server->mountd_address, &ctx->mount_server.address, @@ -794,7 +861,6 @@ static void nfs_server_set_fsinfo(struct nfs_server *server, server->wsize = max_rpc_payload; if (server->wsize > NFS_MAX_FILE_IO_SIZE) server->wsize = NFS_MAX_FILE_IO_SIZE; - server->wpages = (server->wsize + PAGE_SIZE - 1) >> PAGE_SHIFT; server->wtmult = nfs_block_bits(fsinfo->wtmult, NULL); @@ -811,7 +877,6 @@ static void nfs_server_set_fsinfo(struct nfs_server *server, server->maxfilesize = fsinfo->maxfilesize; - server->time_delta = fsinfo->time_delta; server->change_attr_type = fsinfo->change_attr_type; server->clone_blksize = fsinfo->clone_blksize; @@ -831,6 +896,8 @@ static void nfs_server_set_fsinfo(struct nfs_server *server, if (fsinfo->xattr_support) server->caps |= NFS_CAP_XATTR; + else + server->caps &= ~NFS_CAP_XATTR; #endif } @@ -916,7 +983,6 @@ void nfs_server_copy_userdata(struct nfs_server *target, struct nfs_server *sour target->acregmax = source->acregmax; target->acdirmin = source->acdirmin; target->acdirmax = source->acdirmax; - target->caps = source->caps; target->options = source->options; target->auth_info = source->auth_info; target->port = source->port; @@ -984,8 +1050,10 @@ struct nfs_server *nfs_alloc_server(void) INIT_LIST_HEAD(&server->layouts); INIT_LIST_HEAD(&server->state_owners_lru); INIT_LIST_HEAD(&server->ss_copies); + INIT_LIST_HEAD(&server->ss_src_copies); atomic_set(&server->active, 0); + atomic_long_set(&server->nr_active_delegations, 0); server->io_stats = nfs_alloc_iostats(); if (!server->io_stats) { @@ -995,8 +1063,11 @@ struct nfs_server *nfs_alloc_server(void) server->change_attr_type = NFS4_CHANGE_TYPE_IS_UNDEFINED; - ida_init(&server->openowner_id); - ida_init(&server->lockowner_id); + init_waitqueue_head(&server->write_congestion_wait); + atomic_long_set(&server->writeback, 0); + + atomic64_set(&server->owner_ctr, 0); + pnfs_init_server(server); rpc_init_wait_queue(&server->uoc_rpcwaitq, "NFS UOC"); @@ -1004,6 +1075,14 @@ struct nfs_server *nfs_alloc_server(void) } EXPORT_SYMBOL_GPL(nfs_alloc_server); +static void delayed_free(struct rcu_head *p) +{ + struct nfs_server *server = container_of(p, struct nfs_server, rcu); + + nfs_free_iostats(server->io_stats); + kfree(server); +} + /* * Free up a server record */ @@ -1027,12 +1106,9 @@ void nfs_free_server(struct nfs_server *server) } ida_free(&s_sysfs_ids, server->s_sysfs_id); - ida_destroy(&server->lockowner_id); - ida_destroy(&server->openowner_id); - nfs_free_iostats(server->io_stats); put_cred(server->cred); - kfree(server); nfs_release_automount_timer(); + call_rcu(&server->rcu, delayed_free); } EXPORT_SYMBOL_GPL(nfs_free_server); @@ -1076,6 +1152,8 @@ struct nfs_server *nfs_create_server(struct fs_context *fc) if (server->namelen == 0 || server->namelen > NFS2_MAXNAMLEN) server->namelen = NFS2_MAXNAMLEN; } + /* Linux 'subtree_check' borkenness mandates this setting */ + server->fh_expire_type = NFS_FH_VOL_RENAME; if (!(fattr->valid & NFS_ATTR_FATTR)) { error = ctx->nfs_mod->rpc_ops->getattr(server, ctx->mntfh, @@ -1139,6 +1217,8 @@ struct nfs_server *nfs_clone_server(struct nfs_server *source, if (error < 0) goto out_free_server; + nfs_server_set_init_caps(server); + /* probe the filesystem info for this server filesystem */ error = nfs_probe_server(server, fh); if (error < 0) @@ -1171,8 +1251,14 @@ void nfs_clients_init(struct net *net) #if IS_ENABLED(CONFIG_NFS_V4) idr_init(&nn->cb_ident_idr); #endif +#if IS_ENABLED(CONFIG_NFS_V4_1) + INIT_LIST_HEAD(&nn->nfs4_data_server_cache); + spin_lock_init(&nn->nfs4_data_server_lock); +#endif spin_lock_init(&nn->nfs_client_lock); nn->boot_time = ktime_get_real(); + memset(&nn->rpcstats, 0, sizeof(nn->rpcstats)); + nn->rpcstats.program = &nfs_program; nfs_netns_sysfs_setup(nn, net); } @@ -1185,6 +1271,9 @@ void nfs_clients_exit(struct net *net) nfs_cleanup_cb_ident_idr(net); WARN_ON_ONCE(!list_empty(&nn->nfs_client_list)); WARN_ON_ONCE(!list_empty(&nn->nfs_volume_list)); +#if IS_ENABLED(CONFIG_NFS_V4_1) + WARN_ON_ONCE(!list_empty(&nn->nfs4_data_server_cache)); +#endif } #ifdef CONFIG_PROC_FS diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index cf7365581031..9d3a5f29f17f 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -27,8 +27,15 @@ #define NFS_DEFAULT_DELEGATION_WATERMARK (5000U) -static atomic_long_t nfs_active_delegations; static unsigned nfs_delegation_watermark = NFS_DEFAULT_DELEGATION_WATERMARK; +module_param_named(delegation_watermark, nfs_delegation_watermark, uint, 0644); + +static struct hlist_head *nfs_delegation_hash(struct nfs_server *server, + const struct nfs_fh *fhandle) +{ + return server->delegation_hash_table + + (nfs_fhandle_hash(fhandle) & server->delegation_hash_mask); +} static void __nfs_free_delegation(struct nfs_delegation *delegation) { @@ -37,11 +44,12 @@ static void __nfs_free_delegation(struct nfs_delegation *delegation) kfree_rcu(delegation, rcu); } -static void nfs_mark_delegation_revoked(struct nfs_delegation *delegation) +static void nfs_mark_delegation_revoked(struct nfs_server *server, + struct nfs_delegation *delegation) { if (!test_and_set_bit(NFS_DELEGATION_REVOKED, &delegation->flags)) { delegation->stateid.type = NFS4_INVALID_STATEID_TYPE; - atomic_long_dec(&nfs_active_delegations); + atomic_long_dec(&server->nr_active_delegations); if (!test_bit(NFS_DELEGATION_RETURNING, &delegation->flags)) nfs_clear_verifier_delegated(delegation->inode); } @@ -59,9 +67,10 @@ static void nfs_put_delegation(struct nfs_delegation *delegation) __nfs_free_delegation(delegation); } -static void nfs_free_delegation(struct nfs_delegation *delegation) +static void nfs_free_delegation(struct nfs_server *server, + struct nfs_delegation *delegation) { - nfs_mark_delegation_revoked(delegation); + nfs_mark_delegation_revoked(server, delegation); nfs_put_delegation(delegation); } @@ -79,14 +88,14 @@ static void nfs_mark_return_delegation(struct nfs_server *server, struct nfs_delegation *delegation) { set_bit(NFS_DELEGATION_RETURN, &delegation->flags); + set_bit(NFS4SERV_DELEGRETURN, &server->delegation_flags); set_bit(NFS4CLNT_DELEGRETURN, &server->nfs_client->cl_state); } -static bool -nfs4_is_valid_delegation(const struct nfs_delegation *delegation, - fmode_t flags) +static bool nfs4_is_valid_delegation(const struct nfs_delegation *delegation, + fmode_t type) { - if (delegation != NULL && (delegation->type & flags) == flags && + if (delegation != NULL && (delegation->type & type) == type && !test_bit(NFS_DELEGATION_REVOKED, &delegation->flags) && !test_bit(NFS_DELEGATION_RETURNING, &delegation->flags)) return true; @@ -103,19 +112,22 @@ struct nfs_delegation *nfs4_get_valid_delegation(const struct inode *inode) return NULL; } -static int -nfs4_do_check_delegation(struct inode *inode, fmode_t flags, bool mark) +static int nfs4_do_check_delegation(struct inode *inode, fmode_t type, + int flags, bool mark) { struct nfs_delegation *delegation; int ret = 0; - flags &= FMODE_READ|FMODE_WRITE; + type &= FMODE_READ|FMODE_WRITE; rcu_read_lock(); delegation = rcu_dereference(NFS_I(inode)->delegation); - if (nfs4_is_valid_delegation(delegation, flags)) { + if (nfs4_is_valid_delegation(delegation, type)) { if (mark) nfs_mark_delegation_referenced(delegation); ret = 1; + if ((flags & NFS_DELEGATION_FLAG_TIME) && + !test_bit(NFS_DELEGATION_DELEGTIME, &delegation->flags)) + ret = 0; } rcu_read_unlock(); return ret; @@ -124,22 +136,23 @@ nfs4_do_check_delegation(struct inode *inode, fmode_t flags, bool mark) * nfs4_have_delegation - check if inode has a delegation, mark it * NFS_DELEGATION_REFERENCED if there is one. * @inode: inode to check - * @flags: delegation types to check for + * @type: delegation types to check for + * @flags: various modifiers * * Returns one if inode has the indicated delegation, otherwise zero. */ -int nfs4_have_delegation(struct inode *inode, fmode_t flags) +int nfs4_have_delegation(struct inode *inode, fmode_t type, int flags) { - return nfs4_do_check_delegation(inode, flags, true); + return nfs4_do_check_delegation(inode, type, flags, true); } /* * nfs4_check_delegation - check if inode has a delegation, do not mark * NFS_DELEGATION_REFERENCED if it has one. */ -int nfs4_check_delegation(struct inode *inode, fmode_t flags) +int nfs4_check_delegation(struct inode *inode, fmode_t type) { - return nfs4_do_check_delegation(inode, flags, false); + return nfs4_do_check_delegation(inode, type, 0, false); } static int nfs_delegation_claim_locks(struct nfs4_state *state, const nfs4_stateid *stateid) @@ -156,8 +169,8 @@ static int nfs_delegation_claim_locks(struct nfs4_state *state, const nfs4_state list = &flctx->flc_posix; spin_lock(&flctx->flc_lock); restart: - list_for_each_entry(fl, list, fl_list) { - if (nfs_file_open_context(fl->fl_file)->state != state) + for_each_file_lock(fl, list) { + if (nfs_file_open_context(fl->c.flc_file)->state != state) continue; spin_unlock(&flctx->flc_lock); status = nfs4_lock_delegation_recall(fl, state, stateid); @@ -181,7 +194,6 @@ static int nfs_delegation_claim_opens(struct inode *inode, struct nfs_open_context *ctx; struct nfs4_state_owner *sp; struct nfs4_state *state; - unsigned int seq; int err; again: @@ -202,12 +214,9 @@ again: sp = state->owner; /* Block nfs4_proc_unlck */ mutex_lock(&sp->so_delegreturn_mutex); - seq = raw_seqcount_begin(&sp->so_reclaim_seqcount); err = nfs4_open_delegation_recall(ctx, state, stateid); if (!err) err = nfs_delegation_claim_locks(state, stateid); - if (!err && read_seqcount_retry(&sp->so_reclaim_seqcount, seq)) - err = -EAGAIN; mutex_unlock(&sp->so_delegreturn_mutex); put_nfs_open_context(ctx); if (err != 0) @@ -225,40 +234,51 @@ again: * @type: delegation type * @stateid: delegation stateid * @pagemod_limit: write delegation "space_limit" + * @deleg_type: raw delegation type * */ void nfs_inode_reclaim_delegation(struct inode *inode, const struct cred *cred, fmode_t type, const nfs4_stateid *stateid, - unsigned long pagemod_limit) + unsigned long pagemod_limit, u32 deleg_type) { struct nfs_delegation *delegation; const struct cred *oldcred = NULL; rcu_read_lock(); delegation = rcu_dereference(NFS_I(inode)->delegation); - if (delegation != NULL) { - spin_lock(&delegation->lock); - nfs4_stateid_copy(&delegation->stateid, stateid); - delegation->type = type; - delegation->pagemod_limit = pagemod_limit; - oldcred = delegation->cred; - delegation->cred = get_cred(cred); - clear_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags); - if (test_and_clear_bit(NFS_DELEGATION_REVOKED, - &delegation->flags)) - atomic_long_inc(&nfs_active_delegations); - spin_unlock(&delegation->lock); - rcu_read_unlock(); - put_cred(oldcred); - trace_nfs4_reclaim_delegation(inode, type); - } else { + if (!delegation) { rcu_read_unlock(); nfs_inode_set_delegation(inode, cred, type, stateid, - pagemod_limit); + pagemod_limit, deleg_type); + return; } + + spin_lock(&delegation->lock); + nfs4_stateid_copy(&delegation->stateid, stateid); + delegation->type = type; + delegation->pagemod_limit = pagemod_limit; + oldcred = delegation->cred; + delegation->cred = get_cred(cred); + switch (deleg_type) { + case NFS4_OPEN_DELEGATE_READ_ATTRS_DELEG: + case NFS4_OPEN_DELEGATE_WRITE_ATTRS_DELEG: + set_bit(NFS_DELEGATION_DELEGTIME, &delegation->flags); + break; + default: + clear_bit(NFS_DELEGATION_DELEGTIME, &delegation->flags); + } + clear_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags); + if (test_and_clear_bit(NFS_DELEGATION_REVOKED, &delegation->flags)) + atomic_long_inc(&NFS_SERVER(inode)->nr_active_delegations); + spin_unlock(&delegation->lock); + rcu_read_unlock(); + put_cred(oldcred); + trace_nfs4_reclaim_delegation(inode, type); } -static int nfs_do_return_delegation(struct inode *inode, struct nfs_delegation *delegation, int issync) +static int nfs_do_return_delegation(struct inode *inode, + struct nfs_delegation *delegation, + int issync) { const struct cred *cred; int res = 0; @@ -267,9 +287,8 @@ static int nfs_do_return_delegation(struct inode *inode, struct nfs_delegation * spin_lock(&delegation->lock); cred = get_cred(delegation->cred); spin_unlock(&delegation->lock); - res = nfs4_proc_delegreturn(inode, cred, - &delegation->stateid, - issync); + res = nfs4_proc_delegreturn(inode, cred, &delegation->stateid, + delegation, issync); put_cred(cred); } return res; @@ -297,7 +316,8 @@ nfs_start_delegation_return_locked(struct nfs_inode *nfsi) if (delegation == NULL) goto out; spin_lock(&delegation->lock); - if (!test_and_set_bit(NFS_DELEGATION_RETURNING, &delegation->flags)) { + if (delegation->inode && + !test_and_set_bit(NFS_DELEGATION_RETURNING, &delegation->flags)) { clear_bit(NFS_DELEGATION_RETURN_DELAYED, &delegation->flags); /* Refcount matched in nfs_end_delegation_return() */ ret = nfs_get_delegation(delegation); @@ -321,14 +341,16 @@ nfs_start_delegation_return(struct nfs_inode *nfsi) } static void nfs_abort_delegation_return(struct nfs_delegation *delegation, - struct nfs_client *clp, int err) + struct nfs_server *server, int err) { - spin_lock(&delegation->lock); clear_bit(NFS_DELEGATION_RETURNING, &delegation->flags); if (err == -EAGAIN) { set_bit(NFS_DELEGATION_RETURN_DELAYED, &delegation->flags); - set_bit(NFS4CLNT_DELEGRETURN_DELAYED, &clp->cl_state); + set_bit(NFS4SERV_DELEGRETURN_DELAYED, + &server->delegation_flags); + set_bit(NFS4CLNT_DELEGRETURN_DELAYED, + &server->nfs_client->cl_state); } spin_unlock(&delegation->lock); } @@ -342,6 +364,8 @@ nfs_detach_delegation_locked(struct nfs_inode *nfsi, rcu_dereference_protected(nfsi->delegation, lockdep_is_held(&clp->cl_lock)); + trace_nfs4_detach_delegation(&nfsi->vfs_inode, delegation->type); + if (deleg_cur == NULL || delegation != deleg_cur) return NULL; @@ -350,6 +374,7 @@ nfs_detach_delegation_locked(struct nfs_inode *nfsi, spin_unlock(&delegation->lock); return NULL; } + hlist_del_init_rcu(&delegation->hash); list_del_rcu(&delegation->super_list); delegation->inode = NULL; rcu_assign_pointer(nfsi->delegation, NULL); @@ -397,7 +422,8 @@ nfs_update_delegation_cred(struct nfs_delegation *delegation, } static void -nfs_update_inplace_delegation(struct nfs_delegation *delegation, +nfs_update_inplace_delegation(struct nfs_server *server, + struct nfs_delegation *delegation, const struct nfs_delegation *update) { if (nfs4_stateid_is_newer(&update->stateid, &delegation->stateid)) { @@ -410,7 +436,7 @@ nfs_update_inplace_delegation(struct nfs_delegation *delegation, nfs_update_delegation_cred(delegation, update->cred); /* smp_mb__before_atomic() is implicit due to xchg() */ clear_bit(NFS_DELEGATION_REVOKED, &delegation->flags); - atomic_long_inc(&nfs_active_delegations); + atomic_long_inc(&server->nr_active_delegations); } } } @@ -422,13 +448,13 @@ nfs_update_inplace_delegation(struct nfs_delegation *delegation, * @type: delegation type * @stateid: delegation stateid * @pagemod_limit: write delegation "space_limit" + * @deleg_type: raw delegation type * * Returns zero on success, or a negative errno value. */ int nfs_inode_set_delegation(struct inode *inode, const struct cred *cred, - fmode_t type, - const nfs4_stateid *stateid, - unsigned long pagemod_limit) + fmode_t type, const nfs4_stateid *stateid, + unsigned long pagemod_limit, u32 deleg_type) { struct nfs_server *server = NFS_SERVER(inode); struct nfs_client *clp = server->nfs_client; @@ -448,6 +474,12 @@ int nfs_inode_set_delegation(struct inode *inode, const struct cred *cred, delegation->cred = get_cred(cred); delegation->inode = inode; delegation->flags = 1<<NFS_DELEGATION_REFERENCED; + switch (deleg_type) { + case NFS4_OPEN_DELEGATE_READ_ATTRS_DELEG: + case NFS4_OPEN_DELEGATE_WRITE_ATTRS_DELEG: + delegation->flags |= BIT(NFS_DELEGATION_DELEGTIME); + } + delegation->test_gen = 0; spin_lock_init(&delegation->lock); spin_lock(&clp->cl_lock); @@ -459,7 +491,7 @@ int nfs_inode_set_delegation(struct inode *inode, const struct cred *cred, if (nfs4_stateid_match_other(&old_delegation->stateid, &delegation->stateid)) { spin_lock(&old_delegation->lock); - nfs_update_inplace_delegation(old_delegation, + nfs_update_inplace_delegation(server, old_delegation, delegation); spin_unlock(&old_delegation->lock); goto out; @@ -505,19 +537,26 @@ add_new: spin_unlock(&inode->i_lock); list_add_tail_rcu(&delegation->super_list, &server->delegations); + hlist_add_head_rcu(&delegation->hash, + nfs_delegation_hash(server, &NFS_I(inode)->fh)); rcu_assign_pointer(nfsi->delegation, delegation); delegation = NULL; - atomic_long_inc(&nfs_active_delegations); + atomic_long_inc(&server->nr_active_delegations); trace_nfs4_set_delegation(inode, type); + + /* If we hold writebacks and have delegated mtime then update */ + if (deleg_type == NFS4_OPEN_DELEGATE_WRITE_ATTRS_DELEG && + nfs_have_writebacks(inode)) + nfs_update_delegated_mtime(inode); out: spin_unlock(&clp->cl_lock); if (delegation != NULL) __nfs_free_delegation(delegation); if (freeme != NULL) { nfs_do_return_delegation(inode, freeme, 0); - nfs_free_delegation(freeme); + nfs_free_delegation(server, freeme); } return status; } @@ -527,7 +566,7 @@ out: */ static int nfs_end_delegation_return(struct inode *inode, struct nfs_delegation *delegation, int issync) { - struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; + struct nfs_server *server = NFS_SERVER(inode); unsigned int mode = O_WRONLY | O_RDWR; int err = 0; @@ -549,11 +588,11 @@ static int nfs_end_delegation_return(struct inode *inode, struct nfs_delegation /* * Guard against state recovery */ - err = nfs4_wait_clnt_recover(clp); + err = nfs4_wait_clnt_recover(server->nfs_client); } if (err) { - nfs_abort_delegation_return(delegation, clp, err); + nfs_abort_delegation_return(delegation, server, err); goto out; } @@ -568,19 +607,10 @@ static bool nfs_delegation_need_return(struct nfs_delegation *delegation) { bool ret = false; + trace_nfs_delegation_need_return(delegation); + if (test_and_clear_bit(NFS_DELEGATION_RETURN, &delegation->flags)) ret = true; - else if (test_bit(NFS_DELEGATION_RETURN_IF_CLOSED, &delegation->flags)) { - struct inode *inode; - - spin_lock(&delegation->lock); - inode = delegation->inode; - if (inode && list_empty(&NFS_I(inode)->open_files)) - ret = true; - spin_unlock(&delegation->lock); - } - if (ret) - clear_bit(NFS_DELEGATION_RETURN_IF_CLOSED, &delegation->flags); if (test_bit(NFS_DELEGATION_RETURNING, &delegation->flags) || test_bit(NFS_DELEGATION_RETURN_DELAYED, &delegation->flags) || test_bit(NFS_DELEGATION_REVOKED, &delegation->flags)) @@ -599,6 +629,9 @@ static int nfs_server_return_marked_delegations(struct nfs_server *server, struct nfs_delegation *place_holder_deleg = NULL; int err = 0; + if (!test_and_clear_bit(NFS4SERV_DELEGRETURN, + &server->delegation_flags)) + return 0; restart: /* * To avoid quadratic looping we hold a reference @@ -627,6 +660,9 @@ restart: prev = delegation; continue; } + inode = nfs_delegation_grab_inode(delegation); + if (inode == NULL) + continue; if (prev) { struct inode *tmp = nfs_delegation_grab_inode(prev); @@ -637,12 +673,6 @@ restart: } } - inode = nfs_delegation_grab_inode(delegation); - if (inode == NULL) { - rcu_read_unlock(); - iput(to_put); - goto restart; - } delegation = nfs_start_delegation_return_locked(NFS_I(inode)); rcu_read_unlock(); @@ -653,6 +683,7 @@ restart: cond_resched(); if (!err) goto restart; + set_bit(NFS4SERV_DELEGRETURN, &server->delegation_flags); set_bit(NFS4CLNT_DELEGRETURN, &server->nfs_client->cl_state); goto out; } @@ -667,6 +698,9 @@ static bool nfs_server_clear_delayed_delegations(struct nfs_server *server) struct nfs_delegation *d; bool ret = false; + if (!test_and_clear_bit(NFS4SERV_DELEGRETURN_DELAYED, + &server->delegation_flags)) + goto out; list_for_each_entry_rcu (d, &server->delegations, super_list) { if (!test_bit(NFS_DELEGATION_RETURN_DELAYED, &d->flags)) continue; @@ -674,6 +708,7 @@ static bool nfs_server_clear_delayed_delegations(struct nfs_server *server) clear_bit(NFS_DELEGATION_RETURN_DELAYED, &d->flags); ret = true; } +out: return ret; } @@ -733,7 +768,7 @@ void nfs_inode_evict_delegation(struct inode *inode) set_bit(NFS_DELEGATION_RETURNING, &delegation->flags); set_bit(NFS_DELEGATION_INODE_FREEING, &delegation->flags); nfs_do_return_delegation(inode, delegation, 1); - nfs_free_delegation(delegation); + nfs_free_delegation(NFS_SERVER(inode), delegation); } } @@ -764,6 +799,43 @@ int nfs4_inode_return_delegation(struct inode *inode) } /** + * nfs4_inode_set_return_delegation_on_close - asynchronously return a delegation + * @inode: inode to process + * + * This routine is called to request that the delegation be returned as soon + * as the file is closed. If the file is already closed, the delegation is + * immediately returned. + */ +void nfs4_inode_set_return_delegation_on_close(struct inode *inode) +{ + struct nfs_delegation *delegation; + struct nfs_delegation *ret = NULL; + + if (!inode) + return; + rcu_read_lock(); + delegation = nfs4_get_valid_delegation(inode); + if (!delegation) + goto out; + spin_lock(&delegation->lock); + if (!delegation->inode) + goto out_unlock; + if (list_empty(&NFS_I(inode)->open_files) && + !test_and_set_bit(NFS_DELEGATION_RETURNING, &delegation->flags)) { + /* Refcount matched in nfs_end_delegation_return() */ + ret = nfs_get_delegation(delegation); + } else + set_bit(NFS_DELEGATION_RETURN_IF_CLOSED, &delegation->flags); +out_unlock: + spin_unlock(&delegation->lock); + if (ret) + nfs_clear_verifier_delegated(inode); +out: + rcu_read_unlock(); + nfs_end_delegation_return(inode, ret, 0); +} + +/** * nfs4_inode_return_delegation_on_close - asynchronously return a delegation * @inode: inode to process * @@ -782,7 +854,8 @@ void nfs4_inode_return_delegation_on_close(struct inode *inode) if (!delegation) goto out; if (test_bit(NFS_DELEGATION_RETURN_IF_CLOSED, &delegation->flags) || - atomic_long_read(&nfs_active_delegations) >= nfs_delegation_watermark) { + atomic_long_read(&NFS_SERVER(inode)->nr_active_delegations) >= + nfs_delegation_watermark) { spin_lock(&delegation->lock); if (delegation->inode && list_empty(&NFS_I(inode)->open_files) && @@ -824,11 +897,25 @@ int nfs4_inode_make_writeable(struct inode *inode) return nfs4_inode_return_delegation(inode); } -static void nfs_mark_return_if_closed_delegation(struct nfs_server *server, - struct nfs_delegation *delegation) +static void +nfs_mark_return_if_closed_delegation(struct nfs_server *server, + struct nfs_delegation *delegation) { - set_bit(NFS_DELEGATION_RETURN_IF_CLOSED, &delegation->flags); - set_bit(NFS4CLNT_DELEGRETURN, &server->nfs_client->cl_state); + struct inode *inode; + + if (test_bit(NFS_DELEGATION_RETURN, &delegation->flags) || + test_bit(NFS_DELEGATION_RETURN_IF_CLOSED, &delegation->flags)) + return; + spin_lock(&delegation->lock); + inode = delegation->inode; + if (!inode) + goto out; + if (list_empty(&NFS_I(inode)->open_files)) + nfs_mark_return_delegation(server, delegation); + else + set_bit(NFS_DELEGATION_RETURN_IF_CLOSED, &delegation->flags); +out: + spin_unlock(&delegation->lock); } static bool nfs_server_mark_return_all_delegations(struct nfs_server *server) @@ -944,7 +1031,7 @@ static void nfs_revoke_delegation(struct inode *inode, } spin_unlock(&delegation->lock); } - nfs_mark_delegation_revoked(delegation); + nfs_mark_delegation_revoked(NFS_SERVER(inode), delegation); ret = true; out: rcu_read_unlock(); @@ -952,13 +1039,6 @@ out: nfs_inode_find_state_and_recover(inode, stateid); } -void nfs_remove_bad_delegation(struct inode *inode, - const nfs4_stateid *stateid) -{ - nfs_revoke_delegation(inode, stateid); -} -EXPORT_SYMBOL_GPL(nfs_remove_bad_delegation); - void nfs_delegation_mark_returned(struct inode *inode, const nfs4_stateid *stateid) { @@ -983,7 +1063,12 @@ void nfs_delegation_mark_returned(struct inode *inode, delegation->stateid.seqid = stateid->seqid; } - nfs_mark_delegation_revoked(delegation); + nfs_mark_delegation_revoked(NFS_SERVER(inode), delegation); + clear_bit(NFS_DELEGATION_RETURNING, &delegation->flags); + spin_unlock(&delegation->lock); + if (nfs_detach_delegation(NFS_I(inode), delegation, NFS_SERVER(inode))) + nfs_put_delegation(delegation); + goto out_rcu_unlock; out_clear_returning: clear_bit(NFS_DELEGATION_RETURNING, &delegation->flags); @@ -996,6 +1081,24 @@ out_rcu_unlock: } /** + * nfs_remove_bad_delegation - handle delegations that are unusable + * @inode: inode to process + * @stateid: the delegation's stateid + * + * If the server ACK-ed our FREE_STATEID then clean + * up the delegation, else mark and keep the revoked state. + */ +void nfs_remove_bad_delegation(struct inode *inode, + const nfs4_stateid *stateid) +{ + if (stateid && stateid->type == NFS4_FREED_STATEID_TYPE) + nfs_delegation_mark_returned(inode, stateid); + else + nfs_revoke_delegation(inode, stateid); +} +EXPORT_SYMBOL_GPL(nfs_remove_bad_delegation); + +/** * nfs_expire_unused_delegation_types * @clp: client to process * @flags: delegation types to expire @@ -1073,11 +1176,12 @@ static struct inode * nfs_delegation_find_inode_server(struct nfs_server *server, const struct nfs_fh *fhandle) { + struct hlist_head *head = nfs_delegation_hash(server, fhandle); struct nfs_delegation *delegation; struct super_block *freeme = NULL; struct inode *res = NULL; - list_for_each_entry_rcu(delegation, &server->delegations, super_list) { + hlist_for_each_entry_rcu(delegation, head, hash) { spin_lock(&delegation->lock); if (delegation->inode != NULL && !test_bit(NFS_DELEGATION_REVOKED, &delegation->flags) && @@ -1164,7 +1268,6 @@ static int nfs_server_reap_unclaimed_delegations(struct nfs_server *server, struct inode *inode; restart: rcu_read_lock(); -restart_locked: list_for_each_entry_rcu(delegation, &server->delegations, super_list) { if (test_bit(NFS_DELEGATION_INODE_FREEING, &delegation->flags) || @@ -1175,13 +1278,13 @@ restart_locked: continue; inode = nfs_delegation_grab_inode(delegation); if (inode == NULL) - goto restart_locked; + continue; delegation = nfs_start_delegation_return_locked(NFS_I(inode)); rcu_read_unlock(); if (delegation != NULL) { if (nfs_detach_delegation(NFS_I(inode), delegation, server) != NULL) - nfs_free_delegation(delegation); + nfs_free_delegation(server, delegation); /* Match nfs_start_delegation_return_locked */ nfs_put_delegation(delegation); } @@ -1218,6 +1321,7 @@ static void nfs_mark_test_expired_delegation(struct nfs_server *server, return; clear_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags); set_bit(NFS_DELEGATION_TEST_EXPIRED, &delegation->flags); + set_bit(NFS4SERV_DELEGATION_EXPIRED, &server->delegation_flags); set_bit(NFS4CLNT_DELEGATION_EXPIRED, &server->nfs_client->cl_state); } @@ -1294,24 +1398,30 @@ static int nfs_server_reap_expired_delegations(struct nfs_server *server, struct inode *inode; const struct cred *cred; nfs4_stateid stateid; + unsigned long gen = ++server->delegation_gen; + + if (!test_and_clear_bit(NFS4SERV_DELEGATION_EXPIRED, + &server->delegation_flags)) + return 0; restart: rcu_read_lock(); -restart_locked: list_for_each_entry_rcu(delegation, &server->delegations, super_list) { if (test_bit(NFS_DELEGATION_INODE_FREEING, &delegation->flags) || test_bit(NFS_DELEGATION_RETURNING, &delegation->flags) || test_bit(NFS_DELEGATION_TEST_EXPIRED, - &delegation->flags) == 0) + &delegation->flags) == 0 || + delegation->test_gen == gen) continue; inode = nfs_delegation_grab_inode(delegation); if (inode == NULL) - goto restart_locked; + continue; spin_lock(&delegation->lock); cred = get_cred_rcu(delegation->cred); nfs4_stateid_copy(&stateid, &delegation->stateid); spin_unlock(&delegation->lock); + delegation->test_gen = gen; clear_bit(NFS_DELEGATION_TEST_EXPIRED, &delegation->flags); rcu_read_unlock(); nfs_delegation_test_free_expired(inode, &stateid, cred); @@ -1322,6 +1432,9 @@ restart_locked: goto restart; } nfs_inode_mark_test_expired_delegation(server,inode); + set_bit(NFS4SERV_DELEGATION_EXPIRED, &server->delegation_flags); + set_bit(NFS4CLNT_DELEGATION_EXPIRED, + &server->nfs_client->cl_state); iput(inode); return -EAGAIN; } @@ -1476,4 +1589,17 @@ out: return ret; } -module_param_named(delegation_watermark, nfs_delegation_watermark, uint, 0644); +int nfs4_delegation_hash_alloc(struct nfs_server *server) +{ + int delegation_buckets, i; + + delegation_buckets = roundup_pow_of_two(nfs_delegation_watermark / 16); + server->delegation_hash_mask = delegation_buckets - 1; + server->delegation_hash_table = kmalloc_array(delegation_buckets, + sizeof(*server->delegation_hash_table), GFP_KERNEL); + if (!server->delegation_hash_table) + return -ENOMEM; + for (i = 0; i < delegation_buckets; i++) + INIT_HLIST_HEAD(&server->delegation_hash_table[i]); + return 0; +} diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h index 1c378992b7c0..08ec2e9c68a4 100644 --- a/fs/nfs/delegation.h +++ b/fs/nfs/delegation.h @@ -14,6 +14,7 @@ * NFSv4 delegation */ struct nfs_delegation { + struct hlist_node hash; struct list_head super_list; const struct cred *cred; struct inode *inode; @@ -21,6 +22,7 @@ struct nfs_delegation { fmode_t type; unsigned long pagemod_limit; __u64 change_attr; + unsigned long test_gen; unsigned long flags; refcount_t refcount; spinlock_t lock; @@ -37,14 +39,18 @@ enum { NFS_DELEGATION_TEST_EXPIRED, NFS_DELEGATION_INODE_FREEING, NFS_DELEGATION_RETURN_DELAYED, + NFS_DELEGATION_DELEGTIME, }; int nfs_inode_set_delegation(struct inode *inode, const struct cred *cred, - fmode_t type, const nfs4_stateid *stateid, unsigned long pagemod_limit); + fmode_t type, const nfs4_stateid *stateid, + unsigned long pagemod_limit, u32 deleg_type); void nfs_inode_reclaim_delegation(struct inode *inode, const struct cred *cred, - fmode_t type, const nfs4_stateid *stateid, unsigned long pagemod_limit); + fmode_t type, const nfs4_stateid *stateid, + unsigned long pagemod_limit, u32 deleg_type); int nfs4_inode_return_delegation(struct inode *inode); void nfs4_inode_return_delegation_on_close(struct inode *inode); +void nfs4_inode_set_return_delegation_on_close(struct inode *inode); int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid); void nfs_inode_evict_delegation(struct inode *inode); @@ -66,7 +72,9 @@ void nfs_test_expired_all_delegations(struct nfs_client *clp); void nfs_reap_expired_delegations(struct nfs_client *clp); /* NFSv4 delegation-related procedures */ -int nfs4_proc_delegreturn(struct inode *inode, const struct cred *cred, const nfs4_stateid *stateid, int issync); +int nfs4_proc_delegreturn(struct inode *inode, const struct cred *cred, + const nfs4_stateid *stateid, + struct nfs_delegation *delegation, int issync); int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state *state, const nfs4_stateid *stateid); int nfs4_lock_delegation_recall(struct file_lock *fl, struct nfs4_state *state, const nfs4_stateid *stateid); bool nfs4_copy_delegation_stateid(struct inode *inode, fmode_t flags, nfs4_stateid *dst, const struct cred **cred); @@ -74,8 +82,8 @@ bool nfs4_refresh_delegation_stateid(nfs4_stateid *dst, struct inode *inode); struct nfs_delegation *nfs4_get_valid_delegation(const struct inode *inode); void nfs_mark_delegation_referenced(struct nfs_delegation *delegation); -int nfs4_have_delegation(struct inode *inode, fmode_t flags); -int nfs4_check_delegation(struct inode *inode, fmode_t flags); +int nfs4_have_delegation(struct inode *inode, fmode_t type, int flags); +int nfs4_check_delegation(struct inode *inode, fmode_t type); bool nfs4_delegation_flush_on_close(const struct inode *inode); void nfs_inode_find_delegation_state_and_recover(struct inode *inode, const nfs4_stateid *stateid); @@ -83,9 +91,39 @@ int nfs4_inode_make_writeable(struct inode *inode); #endif +#define NFS_DELEGATION_FLAG_TIME BIT(1) + +void nfs_update_delegated_atime(struct inode *inode); +void nfs_update_delegated_mtime(struct inode *inode); +void nfs_update_delegated_mtime_locked(struct inode *inode); + +static inline int nfs_have_read_or_write_delegation(struct inode *inode) +{ + return NFS_PROTO(inode)->have_delegation(inode, FMODE_READ, 0); +} + +static inline int nfs_have_write_delegation(struct inode *inode) +{ + return NFS_PROTO(inode)->have_delegation(inode, FMODE_WRITE, 0); +} + static inline int nfs_have_delegated_attributes(struct inode *inode) { - return NFS_PROTO(inode)->have_delegation(inode, FMODE_READ); + return NFS_PROTO(inode)->have_delegation(inode, FMODE_READ, 0); } +static inline int nfs_have_delegated_atime(struct inode *inode) +{ + return NFS_PROTO(inode)->have_delegation(inode, FMODE_READ, + NFS_DELEGATION_FLAG_TIME); +} + +static inline int nfs_have_delegated_mtime(struct inode *inode) +{ + return NFS_PROTO(inode)->have_delegation(inode, FMODE_WRITE, + NFS_DELEGATION_FLAG_TIME); +} + +int nfs4_delegation_hash_alloc(struct nfs_server *server); + #endif diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 8f3112e71a6a..ea9f6ca8f30f 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -56,6 +56,8 @@ static int nfs_readdir(struct file *, struct dir_context *); static int nfs_fsync_dir(struct file *, loff_t, loff_t, int); static loff_t nfs_llseek_dir(struct file *, loff_t, int); static void nfs_readdir_clear_array(struct folio *); +static int nfs_do_create(struct inode *dir, struct dentry *dentry, + umode_t mode, int open_flags); const struct file_operations nfs_dir_operations = { .llseek = nfs_llseek_dir, @@ -149,7 +151,7 @@ struct nfs_cache_array { unsigned char folio_full : 1, folio_is_eof : 1, cookies_are_ordered : 1; - struct nfs_cache_array_entry array[]; + struct nfs_cache_array_entry array[] __counted_by(size); }; struct nfs_readdir_descriptor { @@ -326,7 +328,8 @@ static int nfs_readdir_folio_array_append(struct folio *folio, goto out; } - cache_entry = &array->array[array->size]; + array->size++; + cache_entry = &array->array[array->size - 1]; cache_entry->cookie = array->last_cookie; cache_entry->ino = entry->ino; cache_entry->d_type = entry->d_type; @@ -335,7 +338,6 @@ static int nfs_readdir_folio_array_append(struct folio *folio, array->last_cookie = entry->cookie; if (array->last_cookie <= cache_entry->cookie) array->cookies_are_ordered = 0; - array->size++; if (entry->eof != 0) nfs_readdir_array_set_eof(array); out: @@ -664,6 +666,8 @@ static bool nfs_use_readdirplus(struct inode *dir, struct dir_context *ctx, { if (!nfs_server_capable(dir, NFS_CAP_READDIRPLUS)) return false; + if (NFS_SERVER(dir)->flags & NFS_MOUNT_FORCE_RDIRPLUS) + return true; if (ctx->pos == 0 || cache_hits + cache_misses > NFS_READDIR_CACHE_USAGE_THRESHOLD) return true; @@ -825,17 +829,17 @@ static int nfs_readdir_folio_filler(struct nfs_readdir_descriptor *desc, struct address_space *mapping = desc->file->f_mapping; struct folio *new, *folio = *arrays; struct xdr_stream stream; - struct page *scratch; + struct folio *scratch; struct xdr_buf buf; u64 cookie; int status; - scratch = alloc_page(GFP_KERNEL); + scratch = folio_alloc(GFP_KERNEL, 0); if (scratch == NULL) return -ENOMEM; xdr_init_decode_pages(&stream, &buf, xdr_pages, buflen); - xdr_set_scratch_page(&stream, scratch); + xdr_set_scratch_folio(&stream, scratch); do { status = nfs_readdir_entry_decode(desc, entry, &stream); @@ -887,7 +891,7 @@ static int nfs_readdir_folio_filler(struct nfs_readdir_descriptor *desc, if (folio != *arrays) nfs_readdir_folio_unlock_and_put(folio); - put_page(scratch); + folio_put(scratch); return status; } @@ -1089,6 +1093,17 @@ static void nfs_do_filldir(struct nfs_readdir_descriptor *desc, for (i = desc->cache_entry_index; i < array->size; i++) { struct nfs_cache_array_entry *ent; + /* + * nfs_readdir_handle_cache_misses return force clear at + * (cache_misses > NFS_READDIR_CACHE_MISS_THRESHOLD) for + * readdir heuristic, NFS_READDIR_CACHE_MISS_THRESHOLD + 1 + * entries need be emitted here. + */ + if (first_emit && i > NFS_READDIR_CACHE_MISS_THRESHOLD + 2) { + desc->eob = true; + break; + } + ent = &array->array[i]; if (!dir_emit(desc->ctx, ent->name, ent->name_len, nfs_compat_user_ino64(ent->ino), ent->d_type)) { @@ -1107,10 +1122,6 @@ static void nfs_do_filldir(struct nfs_readdir_descriptor *desc, desc->ctx->pos = desc->dir_cookie; else desc->ctx->pos++; - if (first_emit && i > NFS_READDIR_CACHE_MISS_THRESHOLD + 1) { - desc->eob = true; - break; - } } if (array->folio_is_eof) desc->eof = !desc->eob; @@ -1424,11 +1435,11 @@ static bool nfs_verifier_is_delegated(struct dentry *dentry) static void nfs_set_verifier_locked(struct dentry *dentry, unsigned long verf) { struct inode *inode = d_inode(dentry); - struct inode *dir = d_inode(dentry->d_parent); + struct inode *dir = d_inode_rcu(dentry->d_parent); - if (!nfs_verify_change_attribute(dir, verf)) + if (!dir || !nfs_verify_change_attribute(dir, verf)) return; - if (inode && NFS_PROTO(inode)->have_delegation(inode, FMODE_READ)) + if (inode && NFS_PROTO(inode)->have_delegation(inode, FMODE_READ, 0)) nfs_set_verifier_delegated(&verf); dentry->d_time = verf; } @@ -1523,7 +1534,8 @@ static int nfs_is_exclusive_create(struct inode *dir, unsigned int flags) { if (NFS_PROTO(dir)->version == 2) return 0; - return flags & LOOKUP_EXCL; + return (flags & (LOOKUP_CREATE | LOOKUP_EXCL)) == + (LOOKUP_CREATE | LOOKUP_EXCL); } /* @@ -1618,7 +1630,16 @@ nfs_lookup_revalidate_done(struct inode *dir, struct dentry *dentry, switch (error) { case 1: break; - case 0: + case -ETIMEDOUT: + if (inode && (IS_ROOT(dentry) || + NFS_SERVER(inode)->flags & NFS_MOUNT_SOFTREVAL)) + error = 1; + break; + case -ESTALE: + case -ENOENT: + error = 0; + fallthrough; + default: /* * We can't d_drop the root of a disconnected tree: * its d_hash is on the s_anon list and d_drop() would hide @@ -1654,7 +1675,7 @@ nfs_lookup_revalidate_delegated(struct inode *dir, struct dentry *dentry, return nfs_lookup_revalidate_done(dir, dentry, inode, 1); } -static int nfs_lookup_revalidate_dentry(struct inode *dir, +static int nfs_lookup_revalidate_dentry(struct inode *dir, const struct qstr *name, struct dentry *dentry, struct inode *inode, unsigned int flags) { @@ -1672,19 +1693,9 @@ static int nfs_lookup_revalidate_dentry(struct inode *dir, goto out; dir_verifier = nfs_save_change_attribute(dir); - ret = NFS_PROTO(dir)->lookup(dir, dentry, fhandle, fattr); - if (ret < 0) { - switch (ret) { - case -ESTALE: - case -ENOENT: - ret = 0; - break; - case -ETIMEDOUT: - if (NFS_SERVER(inode)->flags & NFS_MOUNT_SOFTREVAL) - ret = 1; - } + ret = NFS_PROTO(dir)->lookup(dir, dentry, name, fhandle, fattr); + if (ret < 0) goto out; - } /* Request help from readdirplus */ nfs_lookup_advise_force_readdirplus(dir, flags); @@ -1724,11 +1735,11 @@ out: * cached dentry and do a new lookup. */ static int -nfs_do_lookup_revalidate(struct inode *dir, struct dentry *dentry, - unsigned int flags) +nfs_do_lookup_revalidate(struct inode *dir, const struct qstr *name, + struct dentry *dentry, unsigned int flags) { struct inode *inode; - int error; + int error = 0; nfs_inc_stats(dir, NFSIOS_DENTRYREVALIDATE); inode = d_inode(dentry); @@ -1767,47 +1778,57 @@ nfs_do_lookup_revalidate(struct inode *dir, struct dentry *dentry, if (NFS_STALE(inode)) goto out_bad; - return nfs_lookup_revalidate_dentry(dir, dentry, inode, flags); + return nfs_lookup_revalidate_dentry(dir, name, dentry, inode, flags); out_valid: return nfs_lookup_revalidate_done(dir, dentry, inode, 1); out_bad: if (flags & LOOKUP_RCU) return -ECHILD; - return nfs_lookup_revalidate_done(dir, dentry, inode, 0); + return nfs_lookup_revalidate_done(dir, dentry, inode, error); } static int -__nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags, - int (*reval)(struct inode *, struct dentry *, unsigned int)) +__nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags) { - struct dentry *parent; - struct inode *dir; - int ret; - if (flags & LOOKUP_RCU) { if (dentry->d_fsdata == NFS_FSDATA_BLOCKED) return -ECHILD; - parent = READ_ONCE(dentry->d_parent); - dir = d_inode_rcu(parent); - if (!dir) - return -ECHILD; - ret = reval(dir, dentry, flags); - if (parent != READ_ONCE(dentry->d_parent)) - return -ECHILD; } else { - /* Wait for unlink to complete */ + /* Wait for unlink to complete - see unblock_revalidate() */ wait_var_event(&dentry->d_fsdata, - dentry->d_fsdata != NFS_FSDATA_BLOCKED); - parent = dget_parent(dentry); - ret = reval(d_inode(parent), dentry, flags); - dput(parent); + smp_load_acquire(&dentry->d_fsdata) + != NFS_FSDATA_BLOCKED); } - return ret; + return 0; } -static int nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags) +static int nfs_lookup_revalidate(struct inode *dir, const struct qstr *name, + struct dentry *dentry, unsigned int flags) { - return __nfs_lookup_revalidate(dentry, flags, nfs_do_lookup_revalidate); + if (__nfs_lookup_revalidate(dentry, flags)) + return -ECHILD; + return nfs_do_lookup_revalidate(dir, name, dentry, flags); +} + +static void block_revalidate(struct dentry *dentry) +{ + /* old devname - just in case */ + kfree(dentry->d_fsdata); + + /* Any new reference that could lead to an open + * will take ->d_lock in lookup_open() -> d_lookup(). + * Holding this lock ensures we cannot race with + * __nfs_lookup_revalidate() and removes and need + * for further barriers. + */ + lockdep_assert_held(&dentry->d_lock); + + dentry->d_fsdata = NFS_FSDATA_BLOCKED; +} + +static void unblock_revalidate(struct dentry *dentry) +{ + store_release_wake_up(&dentry->d_fsdata, NULL); } /* @@ -1950,7 +1971,8 @@ struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, unsigned in dir_verifier = nfs_save_change_attribute(dir); trace_nfs_lookup_enter(dir, dentry, flags); - error = NFS_PROTO(dir)->lookup(dir, dentry, fhandle, fattr); + error = NFS_PROTO(dir)->lookup(dir, dentry, &dentry->d_name, + fhandle, fattr); if (error == -ENOENT) { if (nfs_server_capable(dir, NFS_CAP_CASE_INSENSITIVE)) dir_verifier = inode_peek_iversion_raw(dir); @@ -1993,7 +2015,8 @@ void nfs_d_prune_case_insensitive_aliases(struct inode *inode) EXPORT_SYMBOL_GPL(nfs_d_prune_case_insensitive_aliases); #if IS_ENABLED(CONFIG_NFS_V4) -static int nfs4_lookup_revalidate(struct dentry *, unsigned int); +static int nfs4_lookup_revalidate(struct inode *, const struct qstr *, + struct dentry *, unsigned int); const struct dentry_operations nfs4_dentry_operations = { .d_revalidate = nfs4_lookup_revalidate, @@ -2175,18 +2198,21 @@ no_open: else dput(dentry); } - if (IS_ERR(res)) - return PTR_ERR(res); return finish_no_open(file, res); } EXPORT_SYMBOL_GPL(nfs_atomic_open); static int -nfs4_do_lookup_revalidate(struct inode *dir, struct dentry *dentry, - unsigned int flags) +nfs4_lookup_revalidate(struct inode *dir, const struct qstr *name, + struct dentry *dentry, unsigned int flags) { struct inode *inode; + if (__nfs_lookup_revalidate(dentry, flags)) + return -ECHILD; + + trace_nfs_lookup_revalidate_enter(dir, dentry, flags); + if (!(flags & LOOKUP_OPEN) || (flags & LOOKUP_DIRECTORY)) goto full_reval; if (d_mountpoint(dentry)) @@ -2220,19 +2246,46 @@ nfs4_do_lookup_revalidate(struct inode *dir, struct dentry *dentry, reval_dentry: if (flags & LOOKUP_RCU) return -ECHILD; - return nfs_lookup_revalidate_dentry(dir, dentry, inode, flags); + return nfs_lookup_revalidate_dentry(dir, name, dentry, inode, flags); full_reval: - return nfs_do_lookup_revalidate(dir, dentry, flags); + return nfs_do_lookup_revalidate(dir, name, dentry, flags); } -static int nfs4_lookup_revalidate(struct dentry *dentry, unsigned int flags) +#endif /* CONFIG_NFSV4 */ + +int nfs_atomic_open_v23(struct inode *dir, struct dentry *dentry, + struct file *file, unsigned int open_flags, + umode_t mode) { - return __nfs_lookup_revalidate(dentry, flags, - nfs4_do_lookup_revalidate); -} + struct dentry *res = NULL; + /* Same as look+open from lookup_open(), but with different O_TRUNC + * handling. + */ + int error = 0; -#endif /* CONFIG_NFSV4 */ + if (dentry->d_name.len > NFS_SERVER(dir)->namelen) + return -ENAMETOOLONG; + + if (open_flags & O_CREAT) { + error = nfs_do_create(dir, dentry, mode, open_flags); + if (!error) { + file->f_mode |= FMODE_CREATED; + return finish_open(file, dentry, NULL); + } else if (error != -EEXIST || open_flags & O_EXCL) + return error; + } + if (d_in_lookup(dentry)) { + /* The only flags nfs_lookup considers are + * LOOKUP_EXCL and LOOKUP_RENAME_TARGET, and + * we want those to be zero so the lookup isn't skipped. + */ + res = nfs_lookup(dir, dentry, 0); + } + return finish_no_open(file, res); + +} +EXPORT_SYMBOL_GPL(nfs_atomic_open_v23); struct dentry * nfs_add_or_obtain(struct dentry *dentry, struct nfs_fh *fhandle, @@ -2247,7 +2300,8 @@ nfs_add_or_obtain(struct dentry *dentry, struct nfs_fh *fhandle, d_drop(dentry); if (fhandle->size == 0) { - error = NFS_PROTO(dir)->lookup(dir, dentry, fhandle, fattr); + error = NFS_PROTO(dir)->lookup(dir, dentry, &dentry->d_name, + fhandle, fattr); if (error) goto out_error; } @@ -2294,18 +2348,23 @@ EXPORT_SYMBOL_GPL(nfs_instantiate); * that the operation succeeded on the server, but an error in the * reply path made it appear to have failed. */ -int nfs_create(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *dentry, umode_t mode, bool excl) +static int nfs_do_create(struct inode *dir, struct dentry *dentry, + umode_t mode, int open_flags) { struct iattr attr; - int open_flags = excl ? O_CREAT | O_EXCL : O_CREAT; int error; + open_flags |= O_CREAT; + dfprintk(VFS, "NFS: create(%s/%lu), %pd\n", dir->i_sb->s_id, dir->i_ino, dentry); attr.ia_mode = mode; attr.ia_valid = ATTR_MODE; + if (open_flags & O_TRUNC) { + attr.ia_size = 0; + attr.ia_valid |= ATTR_SIZE; + } trace_nfs_create_enter(dir, dentry, open_flags); error = NFS_PROTO(dir)->create(dir, dentry, &attr, open_flags); @@ -2317,6 +2376,12 @@ out_err: d_drop(dentry); return error; } + +int nfs_create(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode, bool excl) +{ + return nfs_do_create(dir, dentry, mode, excl ? O_EXCL : 0); +} EXPORT_SYMBOL_GPL(nfs_create); /* @@ -2350,11 +2415,11 @@ EXPORT_SYMBOL_GPL(nfs_mknod); /* * See comments for nfs_proc_create regarding failed operations. */ -int nfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *dentry, umode_t mode) +struct dentry *nfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode) { struct iattr attr; - int error; + struct dentry *ret; dfprintk(VFS, "NFS: mkdir(%s/%lu), %pd\n", dir->i_sb->s_id, dir->i_ino, dentry); @@ -2363,14 +2428,9 @@ int nfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, attr.ia_mode = mode | S_IFDIR; trace_nfs_mkdir_enter(dir, dentry); - error = NFS_PROTO(dir)->mkdir(dir, dentry, &attr); - trace_nfs_mkdir_exit(dir, dentry, error); - if (error != 0) - goto out_err; - return 0; -out_err: - d_drop(dentry); - return error; + ret = NFS_PROTO(dir)->mkdir(dir, dentry, &attr); + trace_nfs_mkdir_exit(dir, dentry, PTR_ERR_OR_ZERO(ret)); + return ret; } EXPORT_SYMBOL_GPL(nfs_mkdir); @@ -2492,15 +2552,12 @@ int nfs_unlink(struct inode *dir, struct dentry *dentry) spin_unlock(&dentry->d_lock); goto out; } - /* old devname */ - kfree(dentry->d_fsdata); - dentry->d_fsdata = NFS_FSDATA_BLOCKED; + block_revalidate(dentry); spin_unlock(&dentry->d_lock); error = nfs_safe_remove(dentry); nfs_dentry_remove_handle_error(dir, dentry, error); - dentry->d_fsdata = NULL; - wake_up_var(&dentry->d_fsdata); + unblock_revalidate(dentry); out: trace_nfs_unlink_exit(dir, dentry, error); return error; @@ -2525,7 +2582,7 @@ EXPORT_SYMBOL_GPL(nfs_unlink); int nfs_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *symname) { - struct page *page; + struct folio *folio; char *kaddr; struct iattr attr; unsigned int pathlen = strlen(symname); @@ -2540,24 +2597,24 @@ int nfs_symlink(struct mnt_idmap *idmap, struct inode *dir, attr.ia_mode = S_IFLNK | S_IRWXUGO; attr.ia_valid = ATTR_MODE; - page = alloc_page(GFP_USER); - if (!page) + folio = folio_alloc(GFP_USER, 0); + if (!folio) return -ENOMEM; - kaddr = page_address(page); + kaddr = folio_address(folio); memcpy(kaddr, symname, pathlen); if (pathlen < PAGE_SIZE) memset(kaddr + pathlen, 0, PAGE_SIZE - pathlen); trace_nfs_symlink_enter(dir, dentry); - error = NFS_PROTO(dir)->symlink(dir, dentry, page, pathlen, &attr); + error = NFS_PROTO(dir)->symlink(dir, dentry, folio, pathlen, &attr); trace_nfs_symlink_exit(dir, dentry, error); if (error != 0) { dfprintk(VFS, "NFS: symlink(%s/%lu, %pd, %s) error %d\n", dir->i_sb->s_id, dir->i_ino, dentry, symname, error); d_drop(dentry); - __free_page(page); + folio_put(folio); return error; } @@ -2567,18 +2624,13 @@ int nfs_symlink(struct mnt_idmap *idmap, struct inode *dir, * No big deal if we can't add this page to the page cache here. * READLINK will get the missing page from the server if needed. */ - if (!add_to_page_cache_lru(page, d_inode(dentry)->i_mapping, 0, - GFP_KERNEL)) { - SetPageUptodate(page); - unlock_page(page); - /* - * add_to_page_cache_lru() grabs an extra page refcount. - * Drop it here to avoid leaking this page later. - */ - put_page(page); - } else - __free_page(page); + if (filemap_add_folio(d_inode(dentry)->i_mapping, folio, 0, + GFP_KERNEL) == 0) { + folio_mark_uptodate(folio); + folio_unlock(folio); + } + folio_put(folio); return 0; } EXPORT_SYMBOL_GPL(nfs_symlink); @@ -2612,8 +2664,19 @@ nfs_unblock_rename(struct rpc_task *task, struct nfs_renamedata *data) { struct dentry *new_dentry = data->new_dentry; - new_dentry->d_fsdata = NULL; - wake_up_var(&new_dentry->d_fsdata); + unblock_revalidate(new_dentry); +} + +static bool nfs_rename_is_unsafe_cross_dir(struct dentry *old_dentry, + struct dentry *new_dentry) +{ + struct nfs_server *server = NFS_SB(old_dentry->d_sb); + + if (old_dentry->d_parent != new_dentry->d_parent) + return false; + if (server->fh_expire_type & NFS_FH_RENAME_UNSAFE) + return !(server->fh_expire_type & NFS_FH_NOEXPIRE_WITH_OPEN); + return true; } /* @@ -2675,11 +2738,6 @@ int nfs_rename(struct mnt_idmap *idmap, struct inode *old_dir, if (WARN_ON(new_dentry->d_flags & DCACHE_NFSFS_RENAMED) || WARN_ON(new_dentry->d_fsdata == NFS_FSDATA_BLOCKED)) goto out; - if (new_dentry->d_fsdata) { - /* old devname */ - kfree(new_dentry->d_fsdata); - new_dentry->d_fsdata = NULL; - } spin_lock(&new_dentry->d_lock); if (d_count(new_dentry) > 2) { @@ -2701,18 +2759,21 @@ int nfs_rename(struct mnt_idmap *idmap, struct inode *old_dir, new_dentry = dentry; new_inode = NULL; } else { - new_dentry->d_fsdata = NFS_FSDATA_BLOCKED; + block_revalidate(new_dentry); must_unblock = true; spin_unlock(&new_dentry->d_lock); } } - if (S_ISREG(old_inode->i_mode)) + if (S_ISREG(old_inode->i_mode) && + nfs_rename_is_unsafe_cross_dir(old_dentry, new_dentry)) nfs_sync_inode(old_inode); task = nfs_async_rename(old_dir, new_dir, old_dentry, new_dentry, must_unblock ? nfs_unblock_rename : NULL); if (IS_ERR(task)) { + if (must_unblock) + unblock_revalidate(new_dentry); error = PTR_ERR(task); goto out; } @@ -2961,7 +3022,7 @@ static u64 nfs_access_login_time(const struct task_struct *task, rcu_read_lock(); for (;;) { parent = rcu_dereference(task->real_parent); - pcred = rcu_dereference(parent->cred); + pcred = __task_cred(parent); if (parent == task || cred_fscmp(pcred, cred) != 0) break; task = parent; diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 9a18c5a69ace..48d89716193a 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -56,6 +56,7 @@ #include <linux/uaccess.h> #include <linux/atomic.h> +#include "delegation.h" #include "internal.h" #include "iostat.h" #include "pnfs.h" @@ -93,12 +94,10 @@ nfs_direct_handle_truncated(struct nfs_direct_req *dreq, dreq->max_count = dreq_len; if (dreq->count > dreq_len) dreq->count = dreq_len; - - if (test_bit(NFS_IOHDR_ERROR, &hdr->flags)) - dreq->error = hdr->error; - else /* Clear outstanding error if this is EOF */ - dreq->error = 0; } + + if (test_bit(NFS_IOHDR_ERROR, &hdr->flags) && !dreq->error) + dreq->error = hdr->error; } static void @@ -120,6 +119,32 @@ nfs_direct_count_bytes(struct nfs_direct_req *dreq, dreq->count = dreq_len; } +static void nfs_direct_truncate_request(struct nfs_direct_req *dreq, + struct nfs_page *req) +{ + loff_t offs = req_offset(req); + size_t req_start = (size_t)(offs - dreq->io_start); + + if (req_start < dreq->max_count) + dreq->max_count = req_start; + if (req_start < dreq->count) + dreq->count = req_start; +} + +static void nfs_direct_file_adjust_size_locked(struct inode *inode, + loff_t offset, size_t count) +{ + loff_t newsize = offset + (loff_t)count; + loff_t oldsize = i_size_read(inode); + + if (newsize > oldsize) { + i_size_write(inode, newsize); + NFS_I(inode)->cache_validity &= ~NFS_INO_INVALID_SIZE; + trace_nfs_size_grow(inode, newsize); + nfs_inc_stats(inode, NFSIOS_EXTENDWRITE); + } +} + /** * nfs_swap_rw - NFS address space operation for swap I/O * @iocb: target I/O control block @@ -131,8 +156,6 @@ int nfs_swap_rw(struct kiocb *iocb, struct iov_iter *iter) { ssize_t ret; - VM_BUG_ON(iov_iter_count(iter) != PAGE_SIZE); - if (iov_iter_rw(iter) == READ) ret = nfs_file_direct_read(iocb, iter, true); else @@ -195,9 +218,10 @@ static void nfs_direct_req_release(struct nfs_direct_req *dreq) kref_put(&dreq->kref, nfs_direct_req_free); } -ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq) +ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq, loff_t offset) { - return dreq->bytes_left; + loff_t start = offset - dreq->io_start; + return dreq->max_count - start; } EXPORT_SYMBOL_GPL(nfs_dreq_bytes_left); @@ -263,6 +287,8 @@ static void nfs_direct_read_completion(struct nfs_pgio_header *hdr) nfs_direct_count_bytes(dreq, hdr); spin_unlock(&dreq->lock); + nfs_update_delegated_atime(dreq->inode); + while (!list_empty(&hdr->pages)) { struct nfs_page *req = nfs_list_entry(hdr->pages.next); struct page *page = req->wb_page; @@ -294,6 +320,7 @@ static void nfs_read_sync_pgio_error(struct list_head *head, int error) static void nfs_direct_pgio_init(struct nfs_pgio_header *hdr) { get_dreq(hdr->dreq); + set_bit(NFS_IOHDR_ODIRECT, &hdr->flags); } static const struct nfs_pgio_completion_ops nfs_direct_read_completion_ops = { @@ -358,7 +385,6 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq, bytes -= req_len; requested_bytes += req_len; pos += req_len; - dreq->bytes_left -= req_len; } nfs_direct_release_pages(pagevec, npages); kvfree(pagevec); @@ -430,7 +456,7 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter, goto out; dreq->inode = inode; - dreq->bytes_left = dreq->max_count = count; + dreq->max_count = count; dreq->io_start = iocb->ki_pos; dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp)); l_ctx = nfs_get_lock_context(dreq->ctx); @@ -446,8 +472,16 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter, if (user_backed_iter(iter)) dreq->flags = NFS_ODIRECT_SHOULD_DIRTY; - if (!swap) - nfs_start_io_direct(inode); + if (!swap) { + result = nfs_start_io_direct(inode); + if (result) { + /* release the reference that would usually be + * consumed by nfs_direct_read_schedule_iovec() + */ + nfs_direct_req_release(dreq); + goto out_release; + } + } NFS_I(inode)->read_io += count; requested = nfs_direct_read_schedule_iovec(dreq, iter, iocb->ki_pos); @@ -472,21 +506,47 @@ out: return result; } -static void -nfs_direct_join_group(struct list_head *list, struct inode *inode) +static void nfs_direct_add_page_head(struct list_head *list, + struct nfs_page *req) +{ + struct nfs_page *head = req->wb_head; + + if (!list_empty(&head->wb_list) || !nfs_lock_request(head)) + return; + if (!list_empty(&head->wb_list)) { + nfs_unlock_request(head); + return; + } + list_add(&head->wb_list, list); + kref_get(&head->wb_kref); + kref_get(&head->wb_kref); +} + +static void nfs_direct_join_group(struct list_head *list, + struct nfs_commit_info *cinfo, + struct inode *inode) { - struct nfs_page *req, *next; + struct nfs_page *req, *subreq; list_for_each_entry(req, list, wb_list) { - if (req->wb_head != req || req->wb_this_page == req) + if (req->wb_head != req) { + nfs_direct_add_page_head(&req->wb_list, req); continue; - for (next = req->wb_this_page; - next != req->wb_head; - next = next->wb_this_page) { - nfs_list_remove_request(next); - nfs_release_request(next); } - nfs_join_page_group(req, inode); + subreq = req->wb_this_page; + if (subreq == req) + continue; + do { + /* + * Remove subrequests from this list before freeing + * them in the call to nfs_join_page_group(). + */ + if (!list_empty(&subreq->wb_list)) { + nfs_list_remove_request(subreq); + nfs_release_request(subreq); + } + } while ((subreq = subreq->wb_this_page) != req); + nfs_join_page_group(req, cinfo, inode); } } @@ -504,20 +564,15 @@ nfs_direct_write_scan_commit_list(struct inode *inode, static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq) { struct nfs_pageio_descriptor desc; - struct nfs_page *req, *tmp; + struct nfs_page *req; LIST_HEAD(reqs); struct nfs_commit_info cinfo; - LIST_HEAD(failed); nfs_init_cinfo_from_dreq(&cinfo, dreq); nfs_direct_write_scan_commit_list(dreq->inode, &reqs, &cinfo); - nfs_direct_join_group(&reqs, dreq->inode); + nfs_direct_join_group(&reqs, &cinfo, dreq->inode); - dreq->count = 0; - dreq->max_count = 0; - list_for_each_entry(req, &reqs, wb_list) - dreq->max_count += req->wb_bytes; nfs_clear_pnfs_ds_commit_verifiers(&dreq->ds_cinfo); get_dreq(dreq); @@ -525,27 +580,40 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq) &nfs_direct_write_completion_ops); desc.pg_dreq = dreq; - list_for_each_entry_safe(req, tmp, &reqs, wb_list) { + while (!list_empty(&reqs)) { + req = nfs_list_entry(reqs.next); /* Bump the transmission count */ req->wb_nio++; if (!nfs_pageio_add_request(&desc, req)) { - nfs_list_move_request(req, &failed); - spin_lock(&cinfo.inode->i_lock); - dreq->flags = 0; - if (desc.pg_error < 0) + spin_lock(&dreq->lock); + if (dreq->error < 0) { + desc.pg_error = dreq->error; + } else if (desc.pg_error != -EAGAIN) { + dreq->flags = 0; + if (!desc.pg_error) + desc.pg_error = -EIO; dreq->error = desc.pg_error; - else - dreq->error = -EIO; - spin_unlock(&cinfo.inode->i_lock); + } else + dreq->flags = NFS_ODIRECT_RESCHED_WRITES; + spin_unlock(&dreq->lock); + break; } nfs_release_request(req); } nfs_pageio_complete(&desc); - while (!list_empty(&failed)) { - req = nfs_list_entry(failed.next); + while (!list_empty(&reqs)) { + req = nfs_list_entry(reqs.next); nfs_list_remove_request(req); nfs_unlock_and_release_request(req); + if (desc.pg_error == -EAGAIN) { + nfs_mark_request_commit(req, NULL, &cinfo, 0); + } else { + spin_lock(&dreq->lock); + nfs_direct_truncate_request(dreq, req); + spin_unlock(&dreq->lock); + nfs_release_request(req); + } } if (put_dreq(dreq)) @@ -562,30 +630,38 @@ static void nfs_direct_commit_complete(struct nfs_commit_data *data) trace_nfs_direct_commit_complete(dreq); + spin_lock(&dreq->lock); if (status < 0) { /* Errors in commit are fatal */ dreq->error = status; - dreq->max_count = 0; - dreq->count = 0; dreq->flags = NFS_ODIRECT_DONE; } else { status = dreq->error; } + spin_unlock(&dreq->lock); nfs_init_cinfo_from_dreq(&cinfo, dreq); while (!list_empty(&data->pages)) { req = nfs_list_entry(data->pages.next); nfs_list_remove_request(req); - if (status >= 0 && !nfs_write_match_verf(verf, req)) { - dreq->flags = NFS_ODIRECT_RESCHED_WRITES; + if (status < 0) { + spin_lock(&dreq->lock); + nfs_direct_truncate_request(dreq, req); + spin_unlock(&dreq->lock); + nfs_release_request(req); + } else if (!nfs_write_match_verf(verf, req)) { + spin_lock(&dreq->lock); + if (dreq->flags == 0) + dreq->flags = NFS_ODIRECT_RESCHED_WRITES; + spin_unlock(&dreq->lock); /* * Despite the reboot, the write was successful, * so reset wb_nio. */ req->wb_nio = 0; nfs_mark_request_commit(req, NULL, &cinfo, 0); - } else /* Error or match */ + } else nfs_release_request(req); nfs_unlock_and_release_request(req); } @@ -620,10 +696,17 @@ static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq) LIST_HEAD(mds_list); nfs_init_cinfo_from_dreq(&cinfo, dreq); + nfs_commit_begin(cinfo.mds); nfs_scan_commit(dreq->inode, &mds_list, &cinfo); res = nfs_generic_commit_list(dreq->inode, &mds_list, 0, &cinfo); - if (res < 0) /* res == -ENOMEM */ - nfs_direct_write_reschedule(dreq); + if (res < 0) { /* res == -ENOMEM */ + spin_lock(&dreq->lock); + if (dreq->flags == 0) + dreq->flags = NFS_ODIRECT_RESCHED_WRITES; + spin_unlock(&dreq->lock); + } + if (nfs_commit_end(cinfo.mds)) + nfs_direct_write_complete(dreq); } static void nfs_direct_write_clear_reqs(struct nfs_direct_req *dreq) @@ -638,6 +721,7 @@ static void nfs_direct_write_clear_reqs(struct nfs_direct_req *dreq) while (!list_empty(&reqs)) { req = nfs_list_entry(reqs.next); nfs_list_remove_request(req); + nfs_direct_truncate_request(dreq, req); nfs_release_request(req); nfs_unlock_and_release_request(req); } @@ -673,7 +757,7 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr) { struct nfs_direct_req *dreq = hdr->dreq; struct nfs_commit_info cinfo; - struct nfs_page *req = nfs_list_entry(hdr->pages.next); + struct inode *inode = dreq->inode; int flags = NFS_ODIRECT_DONE; trace_nfs_direct_write_completion(dreq); @@ -687,14 +771,21 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr) } nfs_direct_count_bytes(dreq, hdr); - if (test_bit(NFS_IOHDR_UNSTABLE_WRITES, &hdr->flags)) { + if (test_bit(NFS_IOHDR_UNSTABLE_WRITES, &hdr->flags) && + !test_bit(NFS_IOHDR_ERROR, &hdr->flags)) { if (!dreq->flags) dreq->flags = NFS_ODIRECT_DO_COMMIT; flags = dreq->flags; } spin_unlock(&dreq->lock); + spin_lock(&inode->i_lock); + nfs_direct_file_adjust_size_locked(inode, dreq->io_start, dreq->count); + nfs_update_delegated_mtime_locked(dreq->inode); + spin_unlock(&inode->i_lock); + while (!list_empty(&hdr->pages)) { + struct nfs_page *req; req = nfs_list_entry(hdr->pages.next); nfs_list_remove_request(req); @@ -731,18 +822,23 @@ static void nfs_write_sync_pgio_error(struct list_head *head, int error) static void nfs_direct_write_reschedule_io(struct nfs_pgio_header *hdr) { struct nfs_direct_req *dreq = hdr->dreq; + struct nfs_page *req; + struct nfs_commit_info cinfo; trace_nfs_direct_write_reschedule_io(dreq); + nfs_init_cinfo_from_dreq(&cinfo, dreq); spin_lock(&dreq->lock); - if (dreq->error == 0) { + if (dreq->error == 0) dreq->flags = NFS_ODIRECT_RESCHED_WRITES; - /* fake unstable write to let common nfs resend pages */ - hdr->verf.committed = NFS_UNSTABLE; - hdr->good_bytes = hdr->args.offset + hdr->args.count - - hdr->io_start; - } + set_bit(NFS_IOHDR_REDO, &hdr->flags); spin_unlock(&dreq->lock); + while (!list_empty(&hdr->pages)) { + req = nfs_list_entry(hdr->pages.next); + nfs_list_remove_request(req); + nfs_unlock_request(req); + nfs_mark_request_commit(req, NULL, &cinfo, 0); + } } static const struct nfs_pgio_completion_ops nfs_direct_write_completion_ops = { @@ -770,9 +866,11 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, { struct nfs_pageio_descriptor desc; struct inode *inode = dreq->inode; + struct nfs_commit_info cinfo; ssize_t result = 0; size_t requested_bytes = 0; size_t wsize = max_t(size_t, NFS_SERVER(inode)->wsize, PAGE_SIZE); + bool defer = false; trace_nfs_direct_write_schedule_iovec(dreq); @@ -813,17 +911,36 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, break; } + pgbase = 0; + bytes -= req_len; + requested_bytes += req_len; + pos += req_len; + + if (defer) { + nfs_mark_request_commit(req, NULL, &cinfo, 0); + continue; + } + nfs_lock_request(req); - if (!nfs_pageio_add_request(&desc, req)) { + if (nfs_pageio_add_request(&desc, req)) + continue; + + /* Exit on hard errors */ + if (desc.pg_error < 0 && desc.pg_error != -EAGAIN) { result = desc.pg_error; nfs_unlock_and_release_request(req); break; } - pgbase = 0; - bytes -= req_len; - requested_bytes += req_len; - pos += req_len; - dreq->bytes_left -= req_len; + + /* If the error is soft, defer remaining requests */ + nfs_init_cinfo_from_dreq(&cinfo, dreq); + spin_lock(&dreq->lock); + dreq->flags = NFS_ODIRECT_RESCHED_WRITES; + spin_unlock(&dreq->lock); + nfs_unlock_request(req); + nfs_mark_request_commit(req, NULL, &cinfo, 0); + desc.pg_error = 0; + defer = true; } nfs_direct_release_pages(pagevec, npages); kvfree(pagevec); @@ -904,7 +1021,7 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter, goto out; dreq->inode = inode; - dreq->bytes_left = dreq->max_count = count; + dreq->max_count = count; dreq->io_start = pos; dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp)); l_ctx = nfs_get_lock_context(dreq->ctx); @@ -922,7 +1039,14 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter, requested = nfs_direct_write_schedule_iovec(dreq, iter, pos, FLUSH_STABLE); } else { - nfs_start_io_direct(inode); + result = nfs_start_io_direct(inode); + if (result) { + /* release the reference that would usually be + * consumed by nfs_direct_write_schedule_iovec() + */ + nfs_direct_req_release(dreq); + goto out_release; + } requested = nfs_direct_write_schedule_iovec(dreq, iter, pos, FLUSH_COND_STABLE); @@ -962,8 +1086,7 @@ int __init nfs_init_directcache(void) { nfs_direct_cachep = kmem_cache_create("nfs_direct_cache", sizeof(struct nfs_direct_req), - 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), + 0, SLAB_RECLAIM_ACCOUNT, NULL); if (nfs_direct_cachep == NULL) return -ENOMEM; diff --git a/fs/nfs/dns_resolve.c b/fs/nfs/dns_resolve.c index 6603b5cee029..714975e5c0db 100644 --- a/fs/nfs/dns_resolve.c +++ b/fs/nfs/dns_resolve.c @@ -7,14 +7,16 @@ * Resolves DNS hostnames into valid ip addresses */ -#ifdef CONFIG_NFS_USE_KERNEL_DNS - #include <linux/module.h> #include <linux/sunrpc/clnt.h> #include <linux/sunrpc/addr.h> -#include <linux/dns_resolver.h> + #include "dns_resolve.h" +#ifdef CONFIG_NFS_USE_KERNEL_DNS + +#include <linux/dns_resolver.h> + ssize_t nfs_dns_resolve_name(struct net *net, char *name, size_t namelen, struct sockaddr_storage *ss, size_t salen) { @@ -35,7 +37,6 @@ ssize_t nfs_dns_resolve_name(struct net *net, char *name, size_t namelen, #else -#include <linux/module.h> #include <linux/hash.h> #include <linux/string.h> #include <linux/kmod.h> @@ -43,15 +44,12 @@ ssize_t nfs_dns_resolve_name(struct net *net, char *name, size_t namelen, #include <linux/socket.h> #include <linux/seq_file.h> #include <linux/inet.h> -#include <linux/sunrpc/clnt.h> -#include <linux/sunrpc/addr.h> #include <linux/sunrpc/cache.h> #include <linux/sunrpc/svcauth.h> #include <linux/sunrpc/rpc_pipe_fs.h> #include <linux/nfs_fs.h> #include "nfs4_fs.h" -#include "dns_resolve.h" #include "cache_lib.h" #include "netns.h" diff --git a/fs/nfs/export.c b/fs/nfs/export.c index be686b8e0c54..a10dd5f9d078 100644 --- a/fs/nfs/export.c +++ b/fs/nfs/export.c @@ -66,14 +66,21 @@ nfs_fh_to_dentry(struct super_block *sb, struct fid *fid, { struct nfs_fattr *fattr = NULL; struct nfs_fh *server_fh = nfs_exp_embedfh(fid->raw); - size_t fh_size = offsetof(struct nfs_fh, data) + server_fh->size; + size_t fh_size = offsetof(struct nfs_fh, data); const struct nfs_rpc_ops *rpc_ops; struct dentry *dentry; struct inode *inode; - int len = EMBED_FH_OFF + XDR_QUADLEN(fh_size); + int len = EMBED_FH_OFF; u32 *p = fid->raw; int ret; + /* Initial check of bounds */ + if (fh_len < len + XDR_QUADLEN(fh_size) || + fh_len > XDR_QUADLEN(NFS_MAXFHSIZE)) + return NULL; + /* Calculate embedded filehandle size */ + fh_size += server_fh->size; + len += XDR_QUADLEN(fh_size); /* NULL translates to ESTALE */ if (fh_len < len || fh_type != len) return NULL; @@ -154,5 +161,6 @@ const struct export_operations nfs_export_ops = { EXPORT_OP_CLOSE_BEFORE_UNLINK | EXPORT_OP_REMOTE_FS | EXPORT_OP_NOATOMIC_ATTR | - EXPORT_OP_FLUSH_ON_CLOSE, + EXPORT_OP_FLUSH_ON_CLOSE | + EXPORT_OP_NOLOCKS, }; diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 79b1b3fcd3fc..d020aab40c64 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -28,7 +28,9 @@ #include <linux/mm.h> #include <linux/pagemap.h> #include <linux/gfp.h> +#include <linux/rmap.h> #include <linux/swap.h> +#include <linux/compaction.h> #include <linux/uaccess.h> #include <linux/filelock.h> @@ -159,6 +161,8 @@ nfs_file_read(struct kiocb *iocb, struct iov_iter *to) struct inode *inode = file_inode(iocb->ki_filp); ssize_t result; + trace_nfs_file_read(iocb, to); + if (iocb->ki_flags & IOCB_DIRECT) return nfs_file_direct_read(iocb, to, false); @@ -166,7 +170,10 @@ nfs_file_read(struct kiocb *iocb, struct iov_iter *to) iocb->ki_filp, iov_iter_count(to), (unsigned long) iocb->ki_pos); - nfs_start_io_read(inode); + result = nfs_start_io_read(inode); + if (result) + return result; + result = nfs_revalidate_mapping(inode, iocb->ki_filp->f_mapping); if (!result) { result = generic_file_read_iter(iocb, to); @@ -187,7 +194,10 @@ nfs_file_splice_read(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe dprintk("NFS: splice_read(%pD2, %zu@%llu)\n", in, len, *ppos); - nfs_start_io_read(inode); + result = nfs_start_io_read(inode); + if (result) + return result; + result = nfs_revalidate_mapping(inode, in->f_mapping); if (!result) { result = filemap_splice_read(in, ppos, pipe, len, flags); @@ -200,24 +210,25 @@ nfs_file_splice_read(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe EXPORT_SYMBOL_GPL(nfs_file_splice_read); int -nfs_file_mmap(struct file * file, struct vm_area_struct * vma) +nfs_file_mmap_prepare(struct vm_area_desc *desc) { + struct file *file = desc->file; struct inode *inode = file_inode(file); int status; dprintk("NFS: mmap(%pD2)\n", file); - /* Note: generic_file_mmap() returns ENOSYS on nommu systems + /* Note: generic_file_mmap_prepare() returns ENOSYS on nommu systems * so we call that before revalidating the mapping */ - status = generic_file_mmap(file, vma); + status = generic_file_mmap_prepare(desc); if (!status) { - vma->vm_ops = &nfs_file_vm_ops; + desc->vm_ops = &nfs_file_vm_ops; status = nfs_revalidate_mapping(inode, file->f_mapping); } return status; } -EXPORT_SYMBOL_GPL(nfs_file_mmap); +EXPORT_SYMBOL_GPL(nfs_file_mmap_prepare); /* * Flush any dirty pages for this process, and check for write errors. @@ -272,6 +283,37 @@ nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync) } EXPORT_SYMBOL_GPL(nfs_file_fsync); +void nfs_truncate_last_folio(struct address_space *mapping, loff_t from, + loff_t to) +{ + struct folio *folio; + + if (from >= to) + return; + + folio = filemap_lock_folio(mapping, from >> PAGE_SHIFT); + if (IS_ERR(folio)) + return; + + if (folio_mkclean(folio)) + folio_mark_dirty(folio); + + if (folio_test_uptodate(folio)) { + loff_t fpos = folio_pos(folio); + size_t offset = from - fpos; + size_t end = folio_size(folio); + + if (to - fpos < end) + end = to - fpos; + folio_zero_segment(folio, offset, end); + trace_nfs_size_truncate_folio(mapping->host, to); + } + + folio_unlock(folio); + folio_put(folio); +} +EXPORT_SYMBOL_GPL(nfs_truncate_last_folio); + /* * Decide whether a read/modify/write cycle may be more efficient * then a modify/write/read cycle when writing to a page in the @@ -321,6 +363,8 @@ static bool nfs_want_read_modify_write(struct file *file, struct folio *folio, if (pnfs_ld_read_whole_page(file_inode(file))) return true; + if (folio_test_dropbehind(folio)) + return false; /* Open for reading too? */ if (file->f_mode & FMODE_READ) return true; @@ -335,23 +379,29 @@ static bool nfs_want_read_modify_write(struct file *file, struct folio *folio, * If the writer ends up delaying the write, the writer needs to * increment the page use counts until he is done with the page. */ -static int nfs_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, struct page **pagep, +static int nfs_write_begin(const struct kiocb *iocb, + struct address_space *mapping, + loff_t pos, unsigned len, struct folio **foliop, void **fsdata) { struct folio *folio; + struct file *file = iocb->ki_filp; int once_thru = 0; int ret; + trace_nfs_write_begin(file_inode(file), pos, len); + dfprintk(PAGECACHE, "NFS: write_begin(%pD2(%lu), %u@%lld)\n", file, mapping->host->i_ino, len, (long long) pos); + nfs_truncate_last_folio(mapping, i_size_read(mapping->host), pos); start: - folio = __filemap_get_folio(mapping, pos >> PAGE_SHIFT, FGP_WRITEBEGIN, - mapping_gfp_mask(mapping)); - if (IS_ERR(folio)) - return PTR_ERR(folio); - *pagep = &folio->page; + folio = write_begin_get_folio(iocb, mapping, pos >> PAGE_SHIFT, len); + if (IS_ERR(folio)) { + ret = PTR_ERR(folio); + goto out; + } + *foliop = folio; ret = nfs_flush_incompatible(file, folio); if (ret) { @@ -360,23 +410,28 @@ start: } else if (!once_thru && nfs_want_read_modify_write(file, folio, pos, len)) { once_thru = 1; + folio_clear_dropbehind(folio); ret = nfs_read_folio(file, folio); folio_put(folio); if (!ret) goto start; } +out: + trace_nfs_write_begin_done(file_inode(file), pos, len, ret); return ret; } -static int nfs_write_end(struct file *file, struct address_space *mapping, +static int nfs_write_end(const struct kiocb *iocb, + struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) + struct folio *folio, void *fsdata) { + struct file *file = iocb->ki_filp; struct nfs_open_context *ctx = nfs_file_open_context(file); - struct folio *folio = page_folio(page); unsigned offset = offset_in_folio(folio, pos); int status; + trace_nfs_write_end(file_inode(file), pos, len); dfprintk(PAGECACHE, "NFS: write_end(%pD2(%lu), %u@%lld)\n", file, mapping->host->i_ino, len, (long long) pos); @@ -405,13 +460,16 @@ static int nfs_write_end(struct file *file, struct address_space *mapping, folio_unlock(folio); folio_put(folio); - if (status < 0) + if (status < 0) { + trace_nfs_write_end_done(file_inode(file), pos, len, status); return status; + } NFS_I(mapping->host)->write_io += copied; if (nfs_ctx_key_to_expire(ctx, mapping->host)) nfs_wb_all(mapping->host); + trace_nfs_write_end_done(file_inode(file), pos, len, copied); return copied; } @@ -425,16 +483,17 @@ static int nfs_write_end(struct file *file, struct address_space *mapping, static void nfs_invalidate_folio(struct folio *folio, size_t offset, size_t length) { - struct inode *inode = folio_file_mapping(folio)->host; + struct inode *inode = folio->mapping->host; dfprintk(PAGECACHE, "NFS: invalidate_folio(%lu, %zu, %zu)\n", folio->index, offset, length); - if (offset != 0 || length < folio_size(folio)) - return; /* Cancel any unstarted writes on this page */ - nfs_wb_folio_cancel(inode, folio); - folio_wait_fscache(folio); - trace_nfs_invalidate_folio(inode, folio); + if (offset != 0 || length < folio_size(folio)) + nfs_wb_folio(inode, folio); + else + nfs_wb_folio_cancel(inode, folio); + folio_wait_private_2(folio); /* [DEPRECATED] */ + trace_nfs_invalidate_folio(inode, folio_pos(folio) + offset, length); } /* @@ -450,9 +509,9 @@ static bool nfs_release_folio(struct folio *folio, gfp_t gfp) /* If the private flag is set, then the folio is not freeable */ if (folio_test_private(folio)) { if ((current_gfp_context(gfp) & GFP_KERNEL) != GFP_KERNEL || - current_is_kswapd()) + current_is_kswapd() || current_is_kcompactd()) return false; - if (nfs_wb_folio(folio_file_mapping(folio)->host, folio) < 0) + if (nfs_wb_folio(folio->mapping->host, folio) < 0) return false; } return nfs_fscache_release_folio(folio, gfp); @@ -500,9 +559,10 @@ static int nfs_launder_folio(struct folio *folio) dfprintk(PAGECACHE, "NFS: launder_folio(%ld, %llu)\n", inode->i_ino, folio_pos(folio)); - folio_wait_fscache(folio); + folio_wait_private_2(folio); /* [DEPRECATED] */ ret = nfs_wb_folio(inode, folio); - trace_nfs_launder_folio_done(inode, folio, ret); + trace_nfs_launder_folio_done(inode, folio_pos(folio), + folio_size(folio), ret); return ret; } @@ -558,7 +618,6 @@ const struct address_space_operations nfs_file_aops = { .read_folio = nfs_read_folio, .readahead = nfs_readahead, .dirty_folio = filemap_dirty_folio, - .writepage = nfs_writepage, .writepages = nfs_writepages, .write_begin = nfs_write_begin, .write_end = nfs_write_end, @@ -567,7 +626,7 @@ const struct address_space_operations nfs_file_aops = { .migrate_folio = nfs_migrate_folio, .launder_folio = nfs_launder_folio, .is_dirty_writeback = nfs_check_dirty_writeback, - .error_remove_page = generic_error_remove_page, + .error_remove_folio = generic_error_remove_folio, .swap_activate = nfs_swap_activate, .swap_deactivate = nfs_swap_deactivate, .swap_rw = nfs_swap_rw, @@ -589,13 +648,13 @@ static vm_fault_t nfs_vm_page_mkwrite(struct vm_fault *vmf) dfprintk(PAGECACHE, "NFS: vm_page_mkwrite(%pD2(%lu), offset %lld)\n", filp, filp->f_mapping->host->i_ino, - (long long)folio_file_pos(folio)); + (long long)folio_pos(folio)); sb_start_pagefault(inode->i_sb); /* make sure the cache has finished storing the page */ - if (folio_test_fscache(folio) && - folio_wait_fscache_killable(folio) < 0) { + if (folio_test_private_2(folio) && /* [DEPRECATED] */ + folio_wait_private_2_killable(folio) < 0) { ret = VM_FAULT_RETRY; goto out; } @@ -605,7 +664,7 @@ static vm_fault_t nfs_vm_page_mkwrite(struct vm_fault *vmf) TASK_KILLABLE|TASK_FREEZABLE_UNSAFE); folio_lock(folio); - mapping = folio_file_mapping(folio); + mapping = folio->mapping; if (mapping != inode->i_mapping) goto out_unlock; @@ -643,6 +702,8 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from) errseq_t since; int error; + trace_nfs_file_write(iocb, from); + result = nfs_key_timeout_notify(file, inode); if (result) return result; @@ -667,7 +728,9 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from) nfs_clear_invalid_mapping(file->f_mapping); since = filemap_sample_wb_err(file->f_mapping); - nfs_start_io_write(inode); + error = nfs_start_io_write(inode); + if (error) + return error; result = generic_write_checks(iocb, from); if (result > 0) result = generic_perform_write(iocb, from); @@ -721,17 +784,17 @@ do_getlk(struct file *filp, int cmd, struct file_lock *fl, int is_local) { struct inode *inode = filp->f_mapping->host; int status = 0; - unsigned int saved_type = fl->fl_type; + unsigned int saved_type = fl->c.flc_type; /* Try local locking first */ posix_test_lock(filp, fl); - if (fl->fl_type != F_UNLCK) { + if (fl->c.flc_type != F_UNLCK) { /* found a conflict */ goto out; } - fl->fl_type = saved_type; + fl->c.flc_type = saved_type; - if (NFS_PROTO(inode)->have_delegation(inode, FMODE_READ)) + if (nfs_have_read_or_write_delegation(inode)) goto out_noconflict; if (is_local) @@ -741,7 +804,7 @@ do_getlk(struct file *filp, int cmd, struct file_lock *fl, int is_local) out: return status; out_noconflict: - fl->fl_type = F_UNLCK; + fl->c.flc_type = F_UNLCK; goto out; } @@ -766,7 +829,7 @@ do_unlk(struct file *filp, int cmd, struct file_lock *fl, int is_local) * If we're signalled while cleaning up locks on process exit, we * still need to complete the unlock. */ - if (status < 0 && !(fl->fl_flags & FL_CLOSE)) + if (status < 0 && !(fl->c.flc_flags & FL_CLOSE)) return status; } @@ -814,7 +877,7 @@ do_setlk(struct file *filp, int cmd, struct file_lock *fl, int is_local) * This makes locking act as a cache coherency point. */ nfs_sync_mapping(filp->f_mapping); - if (!NFS_PROTO(inode)->have_delegation(inode, FMODE_READ)) { + if (!nfs_have_read_or_write_delegation(inode)) { nfs_zap_caches(inode); if (mapping_mapped(filp->f_mapping)) nfs_revalidate_mapping(inode, filp->f_mapping); @@ -833,12 +896,12 @@ int nfs_lock(struct file *filp, int cmd, struct file_lock *fl) int is_local = 0; dprintk("NFS: lock(%pD2, t=%x, fl=%x, r=%lld:%lld)\n", - filp, fl->fl_type, fl->fl_flags, + filp, fl->c.flc_type, fl->c.flc_flags, (long long)fl->fl_start, (long long)fl->fl_end); nfs_inc_stats(inode, NFSIOS_VFSLOCK); - if (fl->fl_flags & FL_RECLAIM) + if (fl->c.flc_flags & FL_RECLAIM) return -ENOGRACE; if (NFS_SERVER(inode)->flags & NFS_MOUNT_LOCAL_FCNTL) @@ -852,7 +915,7 @@ int nfs_lock(struct file *filp, int cmd, struct file_lock *fl) if (IS_GETLK(cmd)) ret = do_getlk(filp, cmd, fl, is_local); - else if (fl->fl_type == F_UNLCK) + else if (lock_is_unlock(fl)) ret = do_unlk(filp, cmd, fl, is_local); else ret = do_setlk(filp, cmd, fl, is_local); @@ -870,16 +933,16 @@ int nfs_flock(struct file *filp, int cmd, struct file_lock *fl) int is_local = 0; dprintk("NFS: flock(%pD2, t=%x, fl=%x)\n", - filp, fl->fl_type, fl->fl_flags); + filp, fl->c.flc_type, fl->c.flc_flags); - if (!(fl->fl_flags & FL_FLOCK)) + if (!(fl->c.flc_flags & FL_FLOCK)) return -ENOLCK; if (NFS_SERVER(inode)->flags & NFS_MOUNT_LOCAL_FLOCK) is_local = 1; /* We're simulating flock() locks using posix locks on the server */ - if (fl->fl_type == F_UNLCK) + if (lock_is_unlock(fl)) return do_unlk(filp, cmd, fl, is_local); return do_setlk(filp, cmd, fl, is_local); } @@ -889,7 +952,7 @@ const struct file_operations nfs_file_operations = { .llseek = nfs_file_llseek, .read_iter = nfs_file_read, .write_iter = nfs_file_write, - .mmap = nfs_file_mmap, + .mmap_prepare = nfs_file_mmap_prepare, .open = nfs_file_open, .flush = nfs_file_flush, .release = nfs_file_release, @@ -900,5 +963,6 @@ const struct file_operations nfs_file_operations = { .splice_write = iter_file_splice_write, .check_flags = nfs_check_flags, .setlease = simple_nosetlease, + .fop_flags = FOP_DONTCACHE, }; EXPORT_SYMBOL_GPL(nfs_file_operations); diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c index ce8f8934bca5..5c4551117c58 100644 --- a/fs/nfs/filelayout/filelayout.c +++ b/fs/nfs/filelayout/filelayout.c @@ -488,7 +488,7 @@ filelayout_read_pagelist(struct nfs_pgio_header *hdr) /* Perform an asynchronous read to ds */ nfs_initiate_pgio(ds_clnt, hdr, hdr->cred, NFS_PROTO(hdr->inode), &filelayout_read_call_ops, - 0, RPC_TASK_SOFTCONN); + 0, RPC_TASK_SOFTCONN, NULL); return PNFS_ATTEMPTED; } @@ -530,7 +530,7 @@ filelayout_write_pagelist(struct nfs_pgio_header *hdr, int sync) /* Perform an asynchronous write */ nfs_initiate_pgio(ds_clnt, hdr, hdr->cred, NFS_PROTO(hdr->inode), &filelayout_write_call_ops, - sync, RPC_TASK_SOFTCONN); + sync, RPC_TASK_SOFTCONN, NULL); return PNFS_ATTEMPTED; } @@ -605,14 +605,6 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo, dprintk("--> %s\n", __func__); - /* FIXME: remove this check when layout segment support is added */ - if (lgr->range.offset != 0 || - lgr->range.length != NFS4_MAX_UINT64) { - dprintk("%s Only whole file layouts supported. Use MDS i/o\n", - __func__); - goto out; - } - if (fl->pattern_offset > lgr->range.offset) { dprintk("%s pattern_offset %lld too large\n", __func__, fl->pattern_offset); @@ -654,19 +646,19 @@ filelayout_decode_layout(struct pnfs_layout_hdr *flo, { struct xdr_stream stream; struct xdr_buf buf; - struct page *scratch; + struct folio *scratch; __be32 *p; uint32_t nfl_util; int i; dprintk("%s: set_layout_map Begin\n", __func__); - scratch = alloc_page(gfp_flags); + scratch = folio_alloc(gfp_flags, 0); if (!scratch) return -ENOMEM; xdr_init_decode_pages(&stream, &buf, lgr->layoutp->pages, lgr->layoutp->len); - xdr_set_scratch_page(&stream, scratch); + xdr_set_scratch_folio(&stream, scratch); /* 20 = ufl_util (4), first_stripe_index (4), pattern_offset (8), * num_fh (4) */ @@ -732,11 +724,11 @@ filelayout_decode_layout(struct pnfs_layout_hdr *flo, fl->fh_array[i]->size); } - __free_page(scratch); + folio_put(scratch); return 0; out_err: - __free_page(scratch); + folio_put(scratch); return -EIO; } @@ -875,15 +867,15 @@ static void filelayout_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) { - pnfs_generic_pg_check_layout(pgio); + pnfs_generic_pg_check_layout(pgio, req); if (!pgio->pg_lseg) { pgio->pg_lseg = fl_pnfs_update_layout(pgio->pg_inode, nfs_req_openctx(req), - 0, - NFS4_MAX_UINT64, + req_offset(req), + req->wb_bytes, IOMODE_READ, false, - GFP_KERNEL); + nfs_io_gfp_mask()); if (IS_ERR(pgio->pg_lseg)) { pgio->pg_error = PTR_ERR(pgio->pg_lseg); pgio->pg_lseg = NULL; @@ -899,15 +891,15 @@ static void filelayout_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) { - pnfs_generic_pg_check_layout(pgio); + pnfs_generic_pg_check_layout(pgio, req); if (!pgio->pg_lseg) { pgio->pg_lseg = fl_pnfs_update_layout(pgio->pg_inode, nfs_req_openctx(req), - 0, - NFS4_MAX_UINT64, + req_offset(req), + req->wb_bytes, IOMODE_RW, false, - GFP_NOFS); + nfs_io_gfp_mask()); if (IS_ERR(pgio->pg_lseg)) { pgio->pg_error = PTR_ERR(pgio->pg_lseg); pgio->pg_lseg = NULL; @@ -1019,7 +1011,7 @@ static int filelayout_initiate_commit(struct nfs_commit_data *data, int how) data->args.fh = fh; return nfs_initiate_commit(ds_clnt, data, NFS_PROTO(data->inode), &filelayout_commit_call_ops, how, - RPC_TASK_SOFTCONN); + RPC_TASK_SOFTCONN, NULL); out_err: pnfs_generic_prepare_to_resend_writes(data); pnfs_generic_commit_release(data); @@ -1118,7 +1110,6 @@ static const struct pnfs_commit_ops filelayout_commit_ops = { .clear_request_commit = pnfs_generic_clear_request_commit, .scan_commit_lists = pnfs_generic_scan_commit_lists, .recover_commit_reqs = pnfs_generic_recover_commit_reqs, - .search_commit_reqs = pnfs_generic_search_commit_reqs, .commit_pagelist = filelayout_commit_pagelist, }; diff --git a/fs/nfs/filelayout/filelayout.h b/fs/nfs/filelayout/filelayout.h index aed0748fd6ec..c7bb5da93307 100644 --- a/fs/nfs/filelayout/filelayout.h +++ b/fs/nfs/filelayout/filelayout.h @@ -51,7 +51,7 @@ struct nfs4_file_layout_dsaddr { u32 stripe_count; u8 *stripe_indices; u32 ds_num; - struct nfs4_pnfs_ds *ds_list[]; + struct nfs4_pnfs_ds *ds_list[] __counted_by(ds_num); }; struct nfs4_filelayout_segment { diff --git a/fs/nfs/filelayout/filelayoutdev.c b/fs/nfs/filelayout/filelayoutdev.c index acf4b88889dc..df79aeb68db4 100644 --- a/fs/nfs/filelayout/filelayoutdev.c +++ b/fs/nfs/filelayout/filelayoutdev.c @@ -35,6 +35,7 @@ #include "../internal.h" #include "../nfs4session.h" #include "filelayout.h" +#include "../nfs4trace.h" #define NFSDBG_FACILITY NFSDBG_PNFS_LD @@ -72,17 +73,18 @@ nfs4_fl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, struct nfs4_file_layout_dsaddr *dsaddr = NULL; struct xdr_stream stream; struct xdr_buf buf; - struct page *scratch; + struct folio *scratch; struct list_head dsaddrs; struct nfs4_pnfs_ds_addr *da; + struct net *net = server->nfs_client->cl_net; /* set up xdr stream */ - scratch = alloc_page(gfp_flags); + scratch = folio_alloc(gfp_flags, 0); if (!scratch) goto out_err; xdr_init_decode_pages(&stream, &buf, pdev->pages, pdev->pglen); - xdr_set_scratch_page(&stream, scratch); + xdr_set_scratch_folio(&stream, scratch); /* Get the stripe count (number of stripe index) */ p = xdr_inline_decode(&stream, 4); @@ -158,8 +160,7 @@ nfs4_fl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, mp_count = be32_to_cpup(p); /* multipath count */ for (j = 0; j < mp_count; j++) { - da = nfs4_decode_mp_ds_addr(server->nfs_client->cl_net, - &stream, gfp_flags); + da = nfs4_decode_mp_ds_addr(net, &stream, gfp_flags); if (da) list_add_tail(&da->da_node, &dsaddrs); } @@ -169,9 +170,10 @@ nfs4_fl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, goto out_err_free_deviceid; } - dsaddr->ds_list[i] = nfs4_pnfs_ds_add(&dsaddrs, gfp_flags); + dsaddr->ds_list[i] = nfs4_pnfs_ds_add(net, &dsaddrs, gfp_flags); if (!dsaddr->ds_list[i]) goto out_err_drain_dsaddrs; + trace_fl_getdevinfo(server, &pdev->dev_id, dsaddr->ds_list[i]->ds_remotestr); /* If DS was already in cache, free ds addrs */ while (!list_empty(&dsaddrs)) { @@ -184,7 +186,7 @@ nfs4_fl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, } } - __free_page(scratch); + folio_put(scratch); return dsaddr; out_err_drain_dsaddrs: @@ -202,7 +204,7 @@ out_err_free_deviceid: out_err_free_stripe_indices: kfree(stripe_indices); out_err_free_scratch: - __free_page(scratch); + folio_put(scratch); out_err: dprintk("%s ERROR: returning NULL\n", __func__); return NULL; diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index 7deb3cd76abe..9056f05a67dc 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -11,6 +11,7 @@ #include <linux/nfs_mount.h> #include <linux/nfs_page.h> #include <linux/module.h> +#include <linux/file.h> #include <linux/sched/mm.h> #include <linux/sunrpc/metrics.h> @@ -46,7 +47,7 @@ ff_layout_mirror_prepare_stats(struct pnfs_layout_hdr *lo, int dev_limit, enum nfs4_ff_op_type type); static void ff_layout_encode_ff_layoutupdate(struct xdr_stream *xdr, const struct nfs42_layoutstat_devinfo *devinfo, - struct nfs4_ff_layout_mirror *mirror); + struct nfs4_ff_layout_ds_stripe *dss_info); static struct pnfs_layout_hdr * ff_layout_alloc_layout_hdr(struct inode *inode, gfp_t gfp_flags) @@ -162,18 +163,33 @@ decode_name(struct xdr_stream *xdr, u32 *id) return 0; } -static bool ff_mirror_match_fh(const struct nfs4_ff_layout_mirror *m1, - const struct nfs4_ff_layout_mirror *m2) +static struct nfsd_file * +ff_local_open_fh(struct pnfs_layout_segment *lseg, u32 ds_idx, u32 dss_id, + struct nfs_client *clp, const struct cred *cred, + struct nfs_fh *fh, fmode_t mode) +{ +#if IS_ENABLED(CONFIG_NFS_LOCALIO) + struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, ds_idx); + + return nfs_local_open_fh(clp, cred, fh, &mirror->dss[dss_id].nfl, mode); +#else + return NULL; +#endif +} + +static bool ff_dss_match_fh(const struct nfs4_ff_layout_ds_stripe *dss1, + const struct nfs4_ff_layout_ds_stripe *dss2) { int i, j; - if (m1->fh_versions_cnt != m2->fh_versions_cnt) + if (dss1->fh_versions_cnt != dss2->fh_versions_cnt) return false; - for (i = 0; i < m1->fh_versions_cnt; i++) { + + for (i = 0; i < dss1->fh_versions_cnt; i++) { bool found_fh = false; - for (j = 0; j < m2->fh_versions_cnt; j++) { - if (nfs_compare_fh(&m1->fh_versions[i], - &m2->fh_versions[j]) == 0) { + for (j = 0; j < dss2->fh_versions_cnt; j++) { + if (nfs_compare_fh(&dss1->fh_versions[i], + &dss2->fh_versions[j]) == 0) { found_fh = true; break; } @@ -184,6 +200,38 @@ static bool ff_mirror_match_fh(const struct nfs4_ff_layout_mirror *m1, return true; } +static bool ff_mirror_match_fh(const struct nfs4_ff_layout_mirror *m1, + const struct nfs4_ff_layout_mirror *m2) +{ + u32 dss_id; + + if (m1->dss_count != m2->dss_count) + return false; + + for (dss_id = 0; dss_id < m1->dss_count; dss_id++) + if (!ff_dss_match_fh(&m1->dss[dss_id], &m2->dss[dss_id])) + return false; + + return true; +} + +static bool ff_mirror_match_devid(const struct nfs4_ff_layout_mirror *m1, + const struct nfs4_ff_layout_mirror *m2) +{ + u32 dss_id; + + if (m1->dss_count != m2->dss_count) + return false; + + for (dss_id = 0; dss_id < m1->dss_count; dss_id++) + if (memcmp(&m1->dss[dss_id].devid, + &m2->dss[dss_id].devid, + sizeof(m1->dss[dss_id].devid)) != 0) + return false; + + return true; +} + static struct nfs4_ff_layout_mirror * ff_layout_add_mirror(struct pnfs_layout_hdr *lo, struct nfs4_ff_layout_mirror *mirror) @@ -194,7 +242,7 @@ ff_layout_add_mirror(struct pnfs_layout_hdr *lo, spin_lock(&inode->i_lock); list_for_each_entry(pos, &ff_layout->mirrors, mirrors) { - if (memcmp(&mirror->devid, &pos->devid, sizeof(pos->devid)) != 0) + if (!ff_mirror_match_devid(mirror, pos)) continue; if (!ff_mirror_match_fh(mirror, pos)) continue; @@ -222,30 +270,52 @@ ff_layout_remove_mirror(struct nfs4_ff_layout_mirror *mirror) mirror->layout = NULL; } -static struct nfs4_ff_layout_mirror *ff_layout_alloc_mirror(gfp_t gfp_flags) +static struct nfs4_ff_layout_mirror *ff_layout_alloc_mirror(u32 dss_count, + gfp_t gfp_flags) { struct nfs4_ff_layout_mirror *mirror; mirror = kzalloc(sizeof(*mirror), gfp_flags); - if (mirror != NULL) { - spin_lock_init(&mirror->lock); - refcount_set(&mirror->ref, 1); - INIT_LIST_HEAD(&mirror->mirrors); + if (mirror == NULL) + return NULL; + + spin_lock_init(&mirror->lock); + refcount_set(&mirror->ref, 1); + INIT_LIST_HEAD(&mirror->mirrors); + + mirror->dss_count = dss_count; + mirror->dss = + kcalloc(dss_count, sizeof(struct nfs4_ff_layout_ds_stripe), + gfp_flags); + if (mirror->dss == NULL) { + kfree(mirror); + return NULL; } + + for (u32 dss_id = 0; dss_id < mirror->dss_count; dss_id++) + nfs_localio_file_init(&mirror->dss[dss_id].nfl); + return mirror; } static void ff_layout_free_mirror(struct nfs4_ff_layout_mirror *mirror) { const struct cred *cred; + u32 dss_id; ff_layout_remove_mirror(mirror); - kfree(mirror->fh_versions); - cred = rcu_access_pointer(mirror->ro_cred); - put_cred(cred); - cred = rcu_access_pointer(mirror->rw_cred); - put_cred(cred); - nfs4_ff_layout_put_deviceid(mirror->mirror_ds); + + for (dss_id = 0; dss_id < mirror->dss_count; dss_id++) { + kfree(mirror->dss[dss_id].fh_versions); + cred = rcu_access_pointer(mirror->dss[dss_id].ro_cred); + put_cred(cred); + cred = rcu_access_pointer(mirror->dss[dss_id].rw_cred); + put_cred(cred); + nfs_close_local_fh(&mirror->dss[dss_id].nfl); + nfs4_ff_layout_put_deviceid(mirror->dss[dss_id].mirror_ds); + } + + kfree(mirror->dss); kfree(mirror); } @@ -276,7 +346,7 @@ ff_lseg_match_mirrors(struct pnfs_layout_segment *l1, struct pnfs_layout_segment *l2) { const struct nfs4_ff_layout_segment *fl1 = FF_LAYOUT_LSEG(l1); - const struct nfs4_ff_layout_segment *fl2 = FF_LAYOUT_LSEG(l1); + const struct nfs4_ff_layout_segment *fl2 = FF_LAYOUT_LSEG(l2); u32 i; if (fl1->mirror_array_cnt != fl2->mirror_array_cnt) @@ -349,14 +419,24 @@ ff_layout_add_lseg(struct pnfs_layout_hdr *lo, free_me); } +static u32 ff_mirror_efficiency_sum(const struct nfs4_ff_layout_mirror *mirror) +{ + u32 dss_id, sum = 0; + + for (dss_id = 0; dss_id < mirror->dss_count; dss_id++) + sum += mirror->dss[dss_id].efficiency; + + return sum; +} + static void ff_layout_sort_mirrors(struct nfs4_ff_layout_segment *fls) { int i, j; for (i = 0; i < fls->mirror_array_cnt - 1; i++) { for (j = i + 1; j < fls->mirror_array_cnt; j++) - if (fls->mirror_array[i]->efficiency < - fls->mirror_array[j]->efficiency) + if (ff_mirror_efficiency_sum(fls->mirror_array[i]) < + ff_mirror_efficiency_sum(fls->mirror_array[j])) swap(fls->mirror_array[i], fls->mirror_array[j]); } @@ -371,20 +451,21 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh, struct nfs4_ff_layout_segment *fls = NULL; struct xdr_stream stream; struct xdr_buf buf; - struct page *scratch; + struct folio *scratch; u64 stripe_unit; u32 mirror_array_cnt; __be32 *p; int i, rc; + struct nfs4_ff_layout_ds_stripe *dss_info; dprintk("--> %s\n", __func__); - scratch = alloc_page(gfp_flags); + scratch = folio_alloc(gfp_flags, 0); if (!scratch) return ERR_PTR(-ENOMEM); xdr_init_decode_pages(&stream, &buf, lgr->layoutp->pages, lgr->layoutp->len); - xdr_set_scratch_page(&stream, scratch); + xdr_set_scratch_folio(&stream, scratch); /* stripe unit and mirror_array_cnt */ rc = -EIO; @@ -410,116 +491,134 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh, fls->mirror_array_cnt = mirror_array_cnt; fls->stripe_unit = stripe_unit; + u32 dss_count = 0; for (i = 0; i < fls->mirror_array_cnt; i++) { struct nfs4_ff_layout_mirror *mirror; struct cred *kcred; const struct cred __rcu *cred; kuid_t uid; kgid_t gid; - u32 ds_count, fh_count, id; - int j; + u32 fh_count, id; + int j, dss_id; rc = -EIO; p = xdr_inline_decode(&stream, 4); if (!p) goto out_err_free; - ds_count = be32_to_cpup(p); - /* FIXME: allow for striping? */ - if (ds_count != 1) + // Ensure all mirrors have same stripe count. + if (dss_count == 0) + dss_count = be32_to_cpup(p); + else if (dss_count != be32_to_cpup(p)) + goto out_err_free; + + if (dss_count > NFS4_FLEXFILE_LAYOUT_MAX_STRIPE_CNT || + dss_count == 0) goto out_err_free; - fls->mirror_array[i] = ff_layout_alloc_mirror(gfp_flags); + if (dss_count > 1 && stripe_unit == 0) + goto out_err_free; + + fls->mirror_array[i] = ff_layout_alloc_mirror(dss_count, gfp_flags); if (fls->mirror_array[i] == NULL) { rc = -ENOMEM; goto out_err_free; } - fls->mirror_array[i]->ds_count = ds_count; + for (dss_id = 0; dss_id < dss_count; dss_id++) { + dss_info = &fls->mirror_array[i]->dss[dss_id]; + dss_info->mirror = fls->mirror_array[i]; - /* deviceid */ - rc = decode_deviceid(&stream, &fls->mirror_array[i]->devid); - if (rc) - goto out_err_free; + /* deviceid */ + rc = decode_deviceid(&stream, &dss_info->devid); + if (rc) + goto out_err_free; - /* efficiency */ - rc = -EIO; - p = xdr_inline_decode(&stream, 4); - if (!p) - goto out_err_free; - fls->mirror_array[i]->efficiency = be32_to_cpup(p); + /* efficiency */ + rc = -EIO; + p = xdr_inline_decode(&stream, 4); + if (!p) + goto out_err_free; + dss_info->efficiency = be32_to_cpup(p); - /* stateid */ - rc = decode_pnfs_stateid(&stream, &fls->mirror_array[i]->stateid); - if (rc) - goto out_err_free; + /* stateid */ + rc = decode_pnfs_stateid(&stream, &dss_info->stateid); + if (rc) + goto out_err_free; - /* fh */ - rc = -EIO; - p = xdr_inline_decode(&stream, 4); - if (!p) - goto out_err_free; - fh_count = be32_to_cpup(p); + /* fh */ + rc = -EIO; + p = xdr_inline_decode(&stream, 4); + if (!p) + goto out_err_free; + fh_count = be32_to_cpup(p); - fls->mirror_array[i]->fh_versions = - kcalloc(fh_count, sizeof(struct nfs_fh), - gfp_flags); - if (fls->mirror_array[i]->fh_versions == NULL) { - rc = -ENOMEM; - goto out_err_free; - } + dss_info->fh_versions = + kcalloc(fh_count, sizeof(struct nfs_fh), + gfp_flags); + if (dss_info->fh_versions == NULL) { + rc = -ENOMEM; + goto out_err_free; + } - for (j = 0; j < fh_count; j++) { - rc = decode_nfs_fh(&stream, - &fls->mirror_array[i]->fh_versions[j]); + for (j = 0; j < fh_count; j++) { + rc = decode_nfs_fh(&stream, + &dss_info->fh_versions[j]); + if (rc) + goto out_err_free; + } + + dss_info->fh_versions_cnt = fh_count; + + /* user */ + rc = decode_name(&stream, &id); if (rc) goto out_err_free; - } - fls->mirror_array[i]->fh_versions_cnt = fh_count; + uid = make_kuid(&init_user_ns, id); - /* user */ - rc = decode_name(&stream, &id); - if (rc) - goto out_err_free; + /* group */ + rc = decode_name(&stream, &id); + if (rc) + goto out_err_free; - uid = make_kuid(&init_user_ns, id); + gid = make_kgid(&init_user_ns, id); - /* group */ - rc = decode_name(&stream, &id); - if (rc) - goto out_err_free; + if (gfp_flags & __GFP_FS) + kcred = prepare_kernel_cred(&init_task); + else { + unsigned int nofs_flags = memalloc_nofs_save(); - gid = make_kgid(&init_user_ns, id); - - if (gfp_flags & __GFP_FS) - kcred = prepare_kernel_cred(&init_task); - else { - unsigned int nofs_flags = memalloc_nofs_save(); - kcred = prepare_kernel_cred(&init_task); - memalloc_nofs_restore(nofs_flags); + kcred = prepare_kernel_cred(&init_task); + memalloc_nofs_restore(nofs_flags); + } + rc = -ENOMEM; + if (!kcred) + goto out_err_free; + kcred->fsuid = uid; + kcred->fsgid = gid; + cred = RCU_INITIALIZER(kcred); + + if (lgr->range.iomode == IOMODE_READ) + rcu_assign_pointer(dss_info->ro_cred, cred); + else + rcu_assign_pointer(dss_info->rw_cred, cred); } - rc = -ENOMEM; - if (!kcred) - goto out_err_free; - kcred->fsuid = uid; - kcred->fsgid = gid; - cred = RCU_INITIALIZER(kcred); - - if (lgr->range.iomode == IOMODE_READ) - rcu_assign_pointer(fls->mirror_array[i]->ro_cred, cred); - else - rcu_assign_pointer(fls->mirror_array[i]->rw_cred, cred); mirror = ff_layout_add_mirror(lh, fls->mirror_array[i]); if (mirror != fls->mirror_array[i]) { - /* swap cred ptrs so free_mirror will clean up old */ - if (lgr->range.iomode == IOMODE_READ) { - cred = xchg(&mirror->ro_cred, cred); - rcu_assign_pointer(fls->mirror_array[i]->ro_cred, cred); - } else { - cred = xchg(&mirror->rw_cred, cred); - rcu_assign_pointer(fls->mirror_array[i]->rw_cred, cred); + for (dss_id = 0; dss_id < dss_count; dss_id++) { + dss_info = &fls->mirror_array[i]->dss[dss_id]; + /* swap cred ptrs so free_mirror will clean up old */ + if (lgr->range.iomode == IOMODE_READ) { + cred = xchg(&mirror->dss[dss_id].ro_cred, + dss_info->ro_cred); + rcu_assign_pointer(dss_info->ro_cred, cred); + } else { + cred = xchg(&mirror->dss[dss_id].rw_cred, + dss_info->rw_cred); + rcu_assign_pointer(dss_info->rw_cred, cred); + } } ff_layout_free_mirror(fls->mirror_array[i]); fls->mirror_array[i] = mirror; @@ -547,7 +646,7 @@ out_sort_mirrors: ret = &fls->generic_hdr; dprintk("<-- %s (success)\n", __func__); out_free_page: - __free_page(scratch); + folio_put(scratch); return ret; out_err_free: _ff_layout_free_lseg(fls); @@ -576,6 +675,26 @@ ff_layout_free_lseg(struct pnfs_layout_segment *lseg) _ff_layout_free_lseg(fls); } +static u32 calc_commit_idx(struct pnfs_layout_segment *lseg, + u32 mirror_idx, u32 dss_id) +{ + struct nfs4_ff_layout_segment *flseg = FF_LAYOUT_LSEG(lseg); + + return (mirror_idx * flseg->mirror_array[0]->dss_count) + dss_id; +} + +static u32 calc_mirror_idx_from_commit(struct pnfs_layout_segment *lseg, + u32 commit_index) +{ + return commit_index / FF_LAYOUT_LSEG(lseg)->mirror_array[0]->dss_count; +} + +static u32 calc_dss_id_from_commit(struct pnfs_layout_segment *lseg, + u32 commit_index) +{ + return commit_index % FF_LAYOUT_LSEG(lseg)->mirror_array[0]->dss_count; +} + static void nfs4_ff_start_busy_timer(struct nfs4_ff_busy_timer *timer, ktime_t now) { @@ -600,6 +719,7 @@ nfs4_ff_end_busy_timer(struct nfs4_ff_busy_timer *timer, ktime_t now) static bool nfs4_ff_layoutstat_start_io(struct nfs4_ff_layout_mirror *mirror, + u32 dss_id, struct nfs4_ff_layoutstat *layoutstat, ktime_t now) { @@ -607,8 +727,8 @@ nfs4_ff_layoutstat_start_io(struct nfs4_ff_layout_mirror *mirror, struct nfs4_flexfile_layout *ffl = FF_LAYOUT_FROM_HDR(mirror->layout); nfs4_ff_start_busy_timer(&layoutstat->busy_timer, now); - if (!mirror->start_time) - mirror->start_time = now; + if (!mirror->dss[dss_id].start_time) + mirror->dss[dss_id].start_time = now; if (mirror->report_interval != 0) report_interval = (s64)mirror->report_interval * 1000LL; else if (layoutstats_timer != 0) @@ -658,13 +778,16 @@ nfs4_ff_layout_stat_io_update_completed(struct nfs4_ff_layoutstat *layoutstat, static void nfs4_ff_layout_stat_io_start_read(struct inode *inode, struct nfs4_ff_layout_mirror *mirror, + u32 dss_id, __u64 requested, ktime_t now) { bool report; spin_lock(&mirror->lock); - report = nfs4_ff_layoutstat_start_io(mirror, &mirror->read_stat, now); - nfs4_ff_layout_stat_io_update_requested(&mirror->read_stat, requested); + report = nfs4_ff_layoutstat_start_io( + mirror, dss_id, &mirror->dss[dss_id].read_stat, now); + nfs4_ff_layout_stat_io_update_requested( + &mirror->dss[dss_id].read_stat, requested); set_bit(NFS4_FF_MIRROR_STAT_AVAIL, &mirror->flags); spin_unlock(&mirror->lock); @@ -675,11 +798,12 @@ nfs4_ff_layout_stat_io_start_read(struct inode *inode, static void nfs4_ff_layout_stat_io_end_read(struct rpc_task *task, struct nfs4_ff_layout_mirror *mirror, + u32 dss_id, __u64 requested, __u64 completed) { spin_lock(&mirror->lock); - nfs4_ff_layout_stat_io_update_completed(&mirror->read_stat, + nfs4_ff_layout_stat_io_update_completed(&mirror->dss[dss_id].read_stat, requested, completed, ktime_get(), task->tk_start); set_bit(NFS4_FF_MIRROR_STAT_AVAIL, &mirror->flags); @@ -689,13 +813,20 @@ nfs4_ff_layout_stat_io_end_read(struct rpc_task *task, static void nfs4_ff_layout_stat_io_start_write(struct inode *inode, struct nfs4_ff_layout_mirror *mirror, + u32 dss_id, __u64 requested, ktime_t now) { bool report; spin_lock(&mirror->lock); - report = nfs4_ff_layoutstat_start_io(mirror , &mirror->write_stat, now); - nfs4_ff_layout_stat_io_update_requested(&mirror->write_stat, requested); + report = nfs4_ff_layoutstat_start_io( + mirror, + dss_id, + &mirror->dss[dss_id].write_stat, + now); + nfs4_ff_layout_stat_io_update_requested( + &mirror->dss[dss_id].write_stat, + requested); set_bit(NFS4_FF_MIRROR_STAT_AVAIL, &mirror->flags); spin_unlock(&mirror->lock); @@ -706,6 +837,7 @@ nfs4_ff_layout_stat_io_start_write(struct inode *inode, static void nfs4_ff_layout_stat_io_end_write(struct rpc_task *task, struct nfs4_ff_layout_mirror *mirror, + u32 dss_id, __u64 requested, __u64 completed, enum nfs3_stable_how committed) @@ -714,25 +846,25 @@ nfs4_ff_layout_stat_io_end_write(struct rpc_task *task, requested = completed = 0; spin_lock(&mirror->lock); - nfs4_ff_layout_stat_io_update_completed(&mirror->write_stat, + nfs4_ff_layout_stat_io_update_completed(&mirror->dss[dss_id].write_stat, requested, completed, ktime_get(), task->tk_start); set_bit(NFS4_FF_MIRROR_STAT_AVAIL, &mirror->flags); spin_unlock(&mirror->lock); } static void -ff_layout_mark_ds_unreachable(struct pnfs_layout_segment *lseg, u32 idx) +ff_layout_mark_ds_unreachable(struct pnfs_layout_segment *lseg, u32 idx, u32 dss_id) { - struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx); + struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx, dss_id); if (devid) nfs4_mark_deviceid_unavailable(devid); } static void -ff_layout_mark_ds_reachable(struct pnfs_layout_segment *lseg, u32 idx) +ff_layout_mark_ds_reachable(struct pnfs_layout_segment *lseg, u32 idx, u32 dss_id) { - struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx); + struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx, dss_id); if (devid) nfs4_mark_deviceid_available(devid); @@ -741,69 +873,87 @@ ff_layout_mark_ds_reachable(struct pnfs_layout_segment *lseg, u32 idx) static struct nfs4_pnfs_ds * ff_layout_choose_ds_for_read(struct pnfs_layout_segment *lseg, u32 start_idx, u32 *best_idx, + u32 offset, u32 *dss_id, bool check_device) { struct nfs4_ff_layout_segment *fls = FF_LAYOUT_LSEG(lseg); struct nfs4_ff_layout_mirror *mirror; - struct nfs4_pnfs_ds *ds; + struct nfs4_pnfs_ds *ds = ERR_PTR(-EAGAIN); u32 idx; /* mirrors are initially sorted by efficiency */ for (idx = start_idx; idx < fls->mirror_array_cnt; idx++) { mirror = FF_LAYOUT_COMP(lseg, idx); - ds = nfs4_ff_layout_prepare_ds(lseg, mirror, false); - if (!ds) + *dss_id = nfs4_ff_layout_calc_dss_id( + fls->stripe_unit, + fls->mirror_array[idx]->dss_count, + offset); + ds = nfs4_ff_layout_prepare_ds(lseg, mirror, *dss_id, false); + if (IS_ERR(ds)) continue; if (check_device && - nfs4_test_deviceid_unavailable(&mirror->mirror_ds->id_node)) + nfs4_test_deviceid_unavailable(&mirror->dss[*dss_id].mirror_ds->id_node)) { + // reinitialize the error state in case if this is the last iteration + ds = ERR_PTR(-EINVAL); continue; + } *best_idx = idx; - return ds; + break; } - return NULL; + return ds; } static struct nfs4_pnfs_ds * ff_layout_choose_any_ds_for_read(struct pnfs_layout_segment *lseg, - u32 start_idx, u32 *best_idx) + u32 start_idx, u32 *best_idx, + u32 offset, u32 *dss_id) { - return ff_layout_choose_ds_for_read(lseg, start_idx, best_idx, false); + return ff_layout_choose_ds_for_read(lseg, start_idx, best_idx, + offset, dss_id, false); } static struct nfs4_pnfs_ds * ff_layout_choose_valid_ds_for_read(struct pnfs_layout_segment *lseg, - u32 start_idx, u32 *best_idx) + u32 start_idx, u32 *best_idx, + u32 offset, u32 *dss_id) { - return ff_layout_choose_ds_for_read(lseg, start_idx, best_idx, true); + return ff_layout_choose_ds_for_read(lseg, start_idx, best_idx, + offset, dss_id, true); } static struct nfs4_pnfs_ds * ff_layout_choose_best_ds_for_read(struct pnfs_layout_segment *lseg, - u32 start_idx, u32 *best_idx) + u32 start_idx, u32 *best_idx, + u32 offset, u32 *dss_id) { struct nfs4_pnfs_ds *ds; - ds = ff_layout_choose_valid_ds_for_read(lseg, start_idx, best_idx); - if (ds) + ds = ff_layout_choose_valid_ds_for_read(lseg, start_idx, best_idx, + offset, dss_id); + if (!IS_ERR(ds)) return ds; - return ff_layout_choose_any_ds_for_read(lseg, start_idx, best_idx); + return ff_layout_choose_any_ds_for_read(lseg, start_idx, best_idx, + offset, dss_id); } static struct nfs4_pnfs_ds * ff_layout_get_ds_for_read(struct nfs_pageio_descriptor *pgio, - u32 *best_idx) + u32 *best_idx, + u32 offset, + u32 *dss_id) { struct pnfs_layout_segment *lseg = pgio->pg_lseg; struct nfs4_pnfs_ds *ds; ds = ff_layout_choose_best_ds_for_read(lseg, pgio->pg_mirror_idx, - best_idx); - if (ds || !pgio->pg_mirror_idx) + best_idx, offset, dss_id); + if (!IS_ERR(ds) || !pgio->pg_mirror_idx) return ds; - return ff_layout_choose_best_ds_for_read(lseg, 0, best_idx); + return ff_layout_choose_best_ds_for_read(lseg, 0, best_idx, + offset, dss_id); } static void @@ -822,12 +972,54 @@ ff_layout_pg_get_read(struct nfs_pageio_descriptor *pgio, } } -static void -ff_layout_pg_check_layout(struct nfs_pageio_descriptor *pgio, - struct nfs_page *req) +static bool +ff_layout_lseg_is_striped(const struct nfs4_ff_layout_segment *fls) { - pnfs_generic_pg_check_layout(pgio); - pnfs_generic_pg_check_range(pgio, req); + return fls->mirror_array[0]->dss_count > 1; +} + +/* + * ff_layout_pg_test(). Called by nfs_can_coalesce_requests() + * + * Return 0 if @req cannot be coalesced into @pgio, otherwise return the number + * of bytes (maximum @req->wb_bytes) that can be coalesced. + */ +static size_t +ff_layout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, + struct nfs_page *req) +{ + unsigned int size; + u64 p_stripe, r_stripe; + u32 stripe_offset; + u64 segment_offset = pgio->pg_lseg->pls_range.offset; + u32 stripe_unit = FF_LAYOUT_LSEG(pgio->pg_lseg)->stripe_unit; + + /* calls nfs_generic_pg_test */ + size = pnfs_generic_pg_test(pgio, prev, req); + if (!size) + return 0; + else if (!ff_layout_lseg_is_striped(FF_LAYOUT_LSEG(pgio->pg_lseg))) + return size; + + /* see if req and prev are in the same stripe */ + if (prev) { + p_stripe = (u64)req_offset(prev) - segment_offset; + r_stripe = (u64)req_offset(req) - segment_offset; + do_div(p_stripe, stripe_unit); + do_div(r_stripe, stripe_unit); + + if (p_stripe != r_stripe) + return 0; + } + + /* calculate remaining bytes in the current stripe */ + div_u64_rem((u64)req_offset(req) - segment_offset, + stripe_unit, + &stripe_offset); + WARN_ON_ONCE(stripe_offset > stripe_unit); + if (stripe_offset >= stripe_unit) + return 0; + return min(stripe_unit - (unsigned int)stripe_offset, size); } static void @@ -837,10 +1029,13 @@ ff_layout_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_pgio_mirror *pgm; struct nfs4_ff_layout_mirror *mirror; struct nfs4_pnfs_ds *ds; - u32 ds_idx; + u32 ds_idx, dss_id; + if (NFS_SERVER(pgio->pg_inode)->flags & + (NFS_MOUNT_SOFT|NFS_MOUNT_SOFTERR)) + pgio->pg_maxretrans = io_maxretrans; retry: - ff_layout_pg_check_layout(pgio, req); + pnfs_generic_pg_check_layout(pgio, req); /* Use full layout for now */ if (!pgio->pg_lseg) { ff_layout_pg_get_read(pgio, req, false); @@ -852,9 +1047,12 @@ retry: if (!pgio->pg_lseg) goto out_nolseg; } + /* Reset wb_nio, since getting layout segment was successful */ + req->wb_nio = 0; - ds = ff_layout_get_ds_for_read(pgio, &ds_idx); - if (!ds) { + ds = ff_layout_get_ds_for_read(pgio, &ds_idx, + req_offset(req), &dss_id); + if (IS_ERR(ds)) { if (!ff_layout_no_fallback_to_mds(pgio->pg_lseg)) goto out_mds; pnfs_generic_pg_cleanup(pgio); @@ -865,17 +1063,27 @@ retry: mirror = FF_LAYOUT_COMP(pgio->pg_lseg, ds_idx); pgm = &pgio->pg_mirrors[0]; - pgm->pg_bsize = mirror->mirror_ds->ds_versions[0].rsize; + pgm->pg_bsize = mirror->dss[dss_id].mirror_ds->ds_versions[0].rsize; pgio->pg_mirror_idx = ds_idx; - - if (NFS_SERVER(pgio->pg_inode)->flags & - (NFS_MOUNT_SOFT|NFS_MOUNT_SOFTERR)) - pgio->pg_maxretrans = io_maxretrans; return; out_nolseg: - if (pgio->pg_error < 0) - return; + if (pgio->pg_error < 0) { + if (pgio->pg_error != -EAGAIN) + return; + /* Retry getting layout segment if lower layer returned -EAGAIN */ + if (pgio->pg_maxretrans && req->wb_nio++ > pgio->pg_maxretrans) { + if (NFS_SERVER(pgio->pg_inode)->flags & NFS_MOUNT_SOFTERR) + pgio->pg_error = -ETIMEDOUT; + else + pgio->pg_error = -EIO; + return; + } + pgio->pg_error = 0; + /* Sleep for 1 second before retrying */ + ssleep(1); + goto retry; + } out_mds: trace_pnfs_mds_fallback_pg_init_read(pgio->pg_inode, 0, NFS4_MAX_UINT64, IOMODE_READ, @@ -892,10 +1100,10 @@ ff_layout_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs4_ff_layout_mirror *mirror; struct nfs_pgio_mirror *pgm; struct nfs4_pnfs_ds *ds; - u32 i; + u32 i, dss_id; retry: - ff_layout_pg_check_layout(pgio, req); + pnfs_generic_pg_check_layout(pgio, req); if (!pgio->pg_lseg) { pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, nfs_req_openctx(req), @@ -917,8 +1125,13 @@ retry: for (i = 0; i < pgio->pg_mirror_count; i++) { mirror = FF_LAYOUT_COMP(pgio->pg_lseg, i); - ds = nfs4_ff_layout_prepare_ds(pgio->pg_lseg, mirror, true); - if (!ds) { + dss_id = nfs4_ff_layout_calc_dss_id( + FF_LAYOUT_LSEG(pgio->pg_lseg)->stripe_unit, + mirror->dss_count, + req_offset(req)); + ds = nfs4_ff_layout_prepare_ds(pgio->pg_lseg, mirror, + dss_id, true); + if (IS_ERR(ds)) { if (!ff_layout_no_fallback_to_mds(pgio->pg_lseg)) goto out_mds; pnfs_generic_pg_cleanup(pgio); @@ -927,7 +1140,7 @@ retry: goto retry; } pgm = &pgio->pg_mirrors[i]; - pgm->pg_bsize = mirror->mirror_ds->ds_versions[0].wsize; + pgm->pg_bsize = mirror->dss[dss_id].mirror_ds->ds_versions[0].wsize; } if (NFS_SERVER(pgio->pg_inode)->flags & @@ -993,14 +1206,14 @@ ff_layout_pg_get_mirror_write(struct nfs_pageio_descriptor *desc, u32 idx) static const struct nfs_pageio_ops ff_layout_pg_read_ops = { .pg_init = ff_layout_pg_init_read, - .pg_test = pnfs_generic_pg_test, + .pg_test = ff_layout_pg_test, .pg_doio = pnfs_generic_pg_readpages, .pg_cleanup = pnfs_generic_pg_cleanup, }; static const struct nfs_pageio_ops ff_layout_pg_write_ops = { .pg_init = ff_layout_pg_init_write, - .pg_test = pnfs_generic_pg_test, + .pg_test = ff_layout_pg_test, .pg_doio = pnfs_generic_pg_writepages, .pg_get_mirror_count = ff_layout_pg_get_mirror_count_write, .pg_cleanup = pnfs_generic_pg_cleanup, @@ -1048,11 +1261,15 @@ static void ff_layout_resend_pnfs_read(struct nfs_pgio_header *hdr) { u32 idx = hdr->pgio_mirror_idx + 1; u32 new_idx = 0; + u32 dss_id = 0; + struct nfs4_pnfs_ds *ds; - if (ff_layout_choose_any_ds_for_read(hdr->lseg, idx, &new_idx)) - ff_layout_send_layouterror(hdr->lseg); - else + ds = ff_layout_choose_any_ds_for_read(hdr->lseg, idx, &new_idx, + hdr->args.offset, &dss_id); + if (IS_ERR(ds)) pnfs_error_mark_layout_for_return(hdr->inode, hdr->lseg); + else + ff_layout_send_layouterror(hdr->lseg); pnfs_read_resend_pnfs(hdr, new_idx); } @@ -1081,42 +1298,53 @@ static void ff_layout_reset_read(struct nfs_pgio_header *hdr) } static int ff_layout_async_handle_error_v4(struct rpc_task *task, + u32 op_status, struct nfs4_state *state, struct nfs_client *clp, struct pnfs_layout_segment *lseg, - u32 idx) + u32 idx, u32 dss_id) { struct pnfs_layout_hdr *lo = lseg->pls_layout; struct inode *inode = lo->plh_inode; - struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx); + struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx, dss_id); struct nfs4_slot_table *tbl = &clp->cl_session->fc_slot_table; - switch (task->tk_status) { - case -NFS4ERR_BADSESSION: - case -NFS4ERR_BADSLOT: - case -NFS4ERR_BAD_HIGH_SLOT: - case -NFS4ERR_DEADSESSION: - case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: - case -NFS4ERR_SEQ_FALSE_RETRY: - case -NFS4ERR_SEQ_MISORDERED: + switch (op_status) { + case NFS4_OK: + case NFS4ERR_NXIO: + break; + case NFSERR_PERM: + if (!task->tk_xprt) + break; + xprt_force_disconnect(task->tk_xprt); + goto out_retry; + case NFS4ERR_BADSESSION: + case NFS4ERR_BADSLOT: + case NFS4ERR_BAD_HIGH_SLOT: + case NFS4ERR_DEADSESSION: + case NFS4ERR_CONN_NOT_BOUND_TO_SESSION: + case NFS4ERR_SEQ_FALSE_RETRY: + case NFS4ERR_SEQ_MISORDERED: dprintk("%s ERROR %d, Reset session. Exchangeid " "flags 0x%x\n", __func__, task->tk_status, clp->cl_exchange_flags); nfs4_schedule_session_recovery(clp->cl_session, task->tk_status); - break; - case -NFS4ERR_DELAY: - case -NFS4ERR_GRACE: + goto out_retry; + case NFS4ERR_DELAY: + nfs_inc_stats(lseg->pls_layout->plh_inode, NFSIOS_DELAY); + fallthrough; + case NFS4ERR_GRACE: rpc_delay(task, FF_LAYOUT_POLL_RETRY_MAX); - break; - case -NFS4ERR_RETRY_UNCACHED_REP: - break; + goto out_retry; + case NFS4ERR_RETRY_UNCACHED_REP: + goto out_retry; /* Invalidate Layout errors */ - case -NFS4ERR_PNFS_NO_LAYOUT: - case -ESTALE: /* mapped NFS4ERR_STALE */ - case -EBADHANDLE: /* mapped NFS4ERR_BADHANDLE */ - case -EISDIR: /* mapped NFS4ERR_ISDIR */ - case -NFS4ERR_FHEXPIRED: - case -NFS4ERR_WRONG_TYPE: + case NFS4ERR_PNFS_NO_LAYOUT: + case NFS4ERR_STALE: + case NFS4ERR_BADHANDLE: + case NFS4ERR_ISDIR: + case NFS4ERR_FHEXPIRED: + case NFS4ERR_WRONG_TYPE: dprintk("%s Invalid layout error %d\n", __func__, task->tk_status); /* @@ -1129,11 +1357,20 @@ static int ff_layout_async_handle_error_v4(struct rpc_task *task, pnfs_destroy_layout(NFS_I(inode)); rpc_wake_up(&tbl->slot_tbl_waitq); goto reset; + default: + break; + } + + switch (task->tk_status) { /* RPC connection errors */ + case -ENETDOWN: + case -ENETUNREACH: + if (test_bit(NFS_CS_NETUNREACH_FATAL, &clp->cl_flags)) + return -NFS4ERR_FATAL_IOERROR; + fallthrough; case -ECONNREFUSED: case -EHOSTDOWN: case -EHOSTUNREACH: - case -ENETUNREACH: case -EIO: case -ETIMEDOUT: case -EPIPE: @@ -1144,25 +1381,55 @@ static int ff_layout_async_handle_error_v4(struct rpc_task *task, nfs4_delete_deviceid(devid->ld, devid->nfs_client, &devid->deviceid); rpc_wake_up(&tbl->slot_tbl_waitq); - fallthrough; + break; default: - if (ff_layout_avoid_mds_available_ds(lseg)) - return -NFS4ERR_RESET_TO_PNFS; -reset: - dprintk("%s Retry through MDS. Error %d\n", __func__, - task->tk_status); - return -NFS4ERR_RESET_TO_MDS; + break; } + + if (ff_layout_avoid_mds_available_ds(lseg)) + return -NFS4ERR_RESET_TO_PNFS; +reset: + dprintk("%s Retry through MDS. Error %d\n", __func__, + task->tk_status); + return -NFS4ERR_RESET_TO_MDS; + +out_retry: task->tk_status = 0; return -EAGAIN; } /* Retry all errors through either pNFS or MDS except for -EJUKEBOX */ static int ff_layout_async_handle_error_v3(struct rpc_task *task, + u32 op_status, + struct nfs_client *clp, struct pnfs_layout_segment *lseg, - u32 idx) + u32 idx, u32 dss_id) { - struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx); + struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx, dss_id); + + switch (op_status) { + case NFS_OK: + case NFSERR_NXIO: + break; + case NFSERR_PERM: + if (!task->tk_xprt) + break; + xprt_force_disconnect(task->tk_xprt); + goto out_retry; + case NFSERR_ACCES: + case NFSERR_BADHANDLE: + case NFSERR_FBIG: + case NFSERR_IO: + case NFSERR_NOSPC: + case NFSERR_ROFS: + case NFSERR_STALE: + goto out_reset_to_pnfs; + case NFSERR_JUKEBOX: + nfs_inc_stats(lseg->pls_layout->plh_inode, NFSIOS_DELAY); + goto out_retry; + default: + break; + } switch (task->tk_status) { /* File access problems. Don't mark the device as unavailable */ @@ -1176,12 +1443,18 @@ static int ff_layout_async_handle_error_v3(struct rpc_task *task, case -EJUKEBOX: nfs_inc_stats(lseg->pls_layout->plh_inode, NFSIOS_DELAY); goto out_retry; + case -ENETDOWN: + case -ENETUNREACH: + if (test_bit(NFS_CS_NETUNREACH_FATAL, &clp->cl_flags)) + return -NFS4ERR_FATAL_IOERROR; + fallthrough; default: dprintk("%s DS connection error %d\n", __func__, task->tk_status); nfs4_delete_deviceid(devid->ld, devid->nfs_client, &devid->deviceid); } +out_reset_to_pnfs: /* FIXME: Need to prevent infinite looping here. */ return -NFS4ERR_RESET_TO_PNFS; out_retry: @@ -1192,15 +1465,16 @@ out_retry: } static int ff_layout_async_handle_error(struct rpc_task *task, + u32 op_status, struct nfs4_state *state, struct nfs_client *clp, struct pnfs_layout_segment *lseg, - u32 idx) + u32 idx, u32 dss_id) { int vers = clp->cl_nfs_mod->rpc_vers->number; if (task->tk_status >= 0) { - ff_layout_mark_ds_reachable(lseg, idx); + ff_layout_mark_ds_reachable(lseg, idx, dss_id); return 0; } @@ -1210,10 +1484,11 @@ static int ff_layout_async_handle_error(struct rpc_task *task, switch (vers) { case 3: - return ff_layout_async_handle_error_v3(task, lseg, idx); + return ff_layout_async_handle_error_v3(task, op_status, clp, + lseg, idx, dss_id); case 4: - return ff_layout_async_handle_error_v4(task, state, clp, - lseg, idx); + return ff_layout_async_handle_error_v4(task, op_status, state, + clp, lseg, idx, dss_id); default: /* should never happen */ WARN_ON_ONCE(1); @@ -1222,7 +1497,7 @@ static int ff_layout_async_handle_error(struct rpc_task *task, } static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg, - u32 idx, u64 offset, u64 length, + u32 idx, u32 dss_id, u64 offset, u64 length, u32 *op_status, int opnum, int error) { struct nfs4_ff_layout_mirror *mirror; @@ -1235,10 +1510,12 @@ static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg, case -EPFNOSUPPORT: case -EPROTONOSUPPORT: case -EOPNOTSUPP: + case -EINVAL: case -ECONNREFUSED: case -ECONNRESET: case -EHOSTDOWN: case -EHOSTUNREACH: + case -ENETDOWN: case -ENETUNREACH: case -EADDRINUSE: case -ENOBUFS: @@ -1258,15 +1535,16 @@ static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg, mirror = FF_LAYOUT_COMP(lseg, idx); err = ff_layout_track_ds_error(FF_LAYOUT_FROM_HDR(lseg->pls_layout), - mirror, offset, length, status, opnum, + mirror, dss_id, offset, length, status, opnum, nfs_io_gfp_mask()); switch (status) { case NFS4ERR_DELAY: case NFS4ERR_GRACE: + case NFS4ERR_PERM: break; case NFS4ERR_NXIO: - ff_layout_mark_ds_unreachable(lseg, idx); + ff_layout_mark_ds_unreachable(lseg, idx, dss_id); /* * Don't return the layout if this is a read and we still * have layouts to try @@ -1286,19 +1564,27 @@ static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg, static int ff_layout_read_done_cb(struct rpc_task *task, struct nfs_pgio_header *hdr) { + struct nfs4_ff_layout_segment *flseg = FF_LAYOUT_LSEG(hdr->lseg); + u32 dss_id = nfs4_ff_layout_calc_dss_id( + flseg->stripe_unit, + flseg->mirror_array[hdr->pgio_mirror_idx]->dss_count, + hdr->args.offset); int err; if (task->tk_status < 0) { - ff_layout_io_track_ds_error(hdr->lseg, hdr->pgio_mirror_idx, + ff_layout_io_track_ds_error(hdr->lseg, + hdr->pgio_mirror_idx, dss_id, hdr->args.offset, hdr->args.count, &hdr->res.op_status, OP_READ, task->tk_status); - trace_ff_layout_read_error(hdr); + trace_ff_layout_read_error(hdr, task->tk_status); } - err = ff_layout_async_handle_error(task, hdr->args.context->state, + err = ff_layout_async_handle_error(task, hdr->res.op_status, + hdr->args.context->state, hdr->ds_clp, hdr->lseg, - hdr->pgio_mirror_idx); + hdr->pgio_mirror_idx, + dss_id); trace_nfs4_pnfs_read(hdr, err); clear_bit(NFS_IOHDR_RESEND_PNFS, &hdr->flags); @@ -1312,6 +1598,9 @@ static int ff_layout_read_done_cb(struct rpc_task *task, return task->tk_status; case -EAGAIN: goto out_eagain; + case -NFS4ERR_FATAL_IOERROR: + task->tk_status = -EIO; + return 0; } return 0; @@ -1351,23 +1640,47 @@ ff_layout_set_layoutcommit(struct inode *inode, static void ff_layout_read_record_layoutstats_start(struct rpc_task *task, struct nfs_pgio_header *hdr) { + struct nfs4_ff_layout_mirror *mirror; + u32 dss_id; + if (test_and_set_bit(NFS_IOHDR_STAT, &hdr->flags)) return; - nfs4_ff_layout_stat_io_start_read(hdr->inode, - FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx), - hdr->args.count, - task->tk_start); + + mirror = FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx); + dss_id = nfs4_ff_layout_calc_dss_id( + FF_LAYOUT_LSEG(hdr->lseg)->stripe_unit, + mirror->dss_count, + hdr->args.offset); + + nfs4_ff_layout_stat_io_start_read( + hdr->inode, + mirror, + dss_id, + hdr->args.count, + task->tk_start); } static void ff_layout_read_record_layoutstats_done(struct rpc_task *task, struct nfs_pgio_header *hdr) { + struct nfs4_ff_layout_mirror *mirror; + u32 dss_id; + if (!test_and_clear_bit(NFS_IOHDR_STAT, &hdr->flags)) return; - nfs4_ff_layout_stat_io_end_read(task, - FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx), - hdr->args.count, - hdr->res.count); + + mirror = FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx); + dss_id = nfs4_ff_layout_calc_dss_id( + FF_LAYOUT_LSEG(hdr->lseg)->stripe_unit, + mirror->dss_count, + hdr->args.offset); + + nfs4_ff_layout_stat_io_end_read( + task, + mirror, + dss_id, + hdr->args.count, + hdr->res.count); set_bit(NFS_LSEG_LAYOUTRETURN, &hdr->lseg->pls_flags); } @@ -1455,20 +1768,28 @@ static void ff_layout_read_release(void *data) static int ff_layout_write_done_cb(struct rpc_task *task, struct nfs_pgio_header *hdr) { + struct nfs4_ff_layout_segment *flseg = FF_LAYOUT_LSEG(hdr->lseg); + u32 dss_id = nfs4_ff_layout_calc_dss_id( + flseg->stripe_unit, + flseg->mirror_array[hdr->pgio_mirror_idx]->dss_count, + hdr->args.offset); loff_t end_offs = 0; int err; if (task->tk_status < 0) { - ff_layout_io_track_ds_error(hdr->lseg, hdr->pgio_mirror_idx, + ff_layout_io_track_ds_error(hdr->lseg, + hdr->pgio_mirror_idx, dss_id, hdr->args.offset, hdr->args.count, &hdr->res.op_status, OP_WRITE, task->tk_status); - trace_ff_layout_write_error(hdr); + trace_ff_layout_write_error(hdr, task->tk_status); } - err = ff_layout_async_handle_error(task, hdr->args.context->state, + err = ff_layout_async_handle_error(task, hdr->res.op_status, + hdr->args.context->state, hdr->ds_clp, hdr->lseg, - hdr->pgio_mirror_idx); + hdr->pgio_mirror_idx, + dss_id); trace_nfs4_pnfs_write(hdr, err); clear_bit(NFS_IOHDR_RESEND_PNFS, &hdr->flags); @@ -1482,6 +1803,9 @@ static int ff_layout_write_done_cb(struct rpc_task *task, return task->tk_status; case -EAGAIN: return -EAGAIN; + case -NFS4ERR_FATAL_IOERROR: + task->tk_status = -EIO; + return 0; } if (hdr->res.verf->committed == NFS_FILE_SYNC || @@ -1503,17 +1827,20 @@ static int ff_layout_commit_done_cb(struct rpc_task *task, struct nfs_commit_data *data) { int err; + u32 idx = calc_mirror_idx_from_commit(data->lseg, data->ds_commit_index); + u32 dss_id = calc_dss_id_from_commit(data->lseg, data->ds_commit_index); if (task->tk_status < 0) { - ff_layout_io_track_ds_error(data->lseg, data->ds_commit_index, + ff_layout_io_track_ds_error(data->lseg, idx, dss_id, data->args.offset, data->args.count, &data->res.op_status, OP_COMMIT, task->tk_status); - trace_ff_layout_commit_error(data); + trace_ff_layout_commit_error(data, task->tk_status); } - err = ff_layout_async_handle_error(task, NULL, data->ds_clp, - data->lseg, data->ds_commit_index); + err = ff_layout_async_handle_error(task, data->res.op_status, + NULL, data->ds_clp, data->lseg, idx, + dss_id); trace_nfs4_pnfs_commit_ds(data, err); switch (err) { @@ -1526,33 +1853,60 @@ static int ff_layout_commit_done_cb(struct rpc_task *task, case -EAGAIN: rpc_restart_call_prepare(task); return -EAGAIN; + case -NFS4ERR_FATAL_IOERROR: + task->tk_status = -EIO; + return 0; } ff_layout_set_layoutcommit(data->inode, data->lseg, data->lwb); - return 0; } static void ff_layout_write_record_layoutstats_start(struct rpc_task *task, struct nfs_pgio_header *hdr) { + struct nfs4_ff_layout_mirror *mirror; + u32 dss_id; + if (test_and_set_bit(NFS_IOHDR_STAT, &hdr->flags)) return; - nfs4_ff_layout_stat_io_start_write(hdr->inode, - FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx), - hdr->args.count, - task->tk_start); + + mirror = FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx); + dss_id = nfs4_ff_layout_calc_dss_id( + FF_LAYOUT_LSEG(hdr->lseg)->stripe_unit, + mirror->dss_count, + hdr->args.offset); + + nfs4_ff_layout_stat_io_start_write( + hdr->inode, + mirror, + dss_id, + hdr->args.count, + task->tk_start); } static void ff_layout_write_record_layoutstats_done(struct rpc_task *task, struct nfs_pgio_header *hdr) { + struct nfs4_ff_layout_mirror *mirror; + u32 dss_id; + if (!test_and_clear_bit(NFS_IOHDR_STAT, &hdr->flags)) return; - nfs4_ff_layout_stat_io_end_write(task, - FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx), - hdr->args.count, hdr->res.count, - hdr->res.verf->committed); + + mirror = FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx); + dss_id = nfs4_ff_layout_calc_dss_id( + FF_LAYOUT_LSEG(hdr->lseg)->stripe_unit, + mirror->dss_count, + hdr->args.offset); + + nfs4_ff_layout_stat_io_end_write( + task, + mirror, + dss_id, + hdr->args.count, + hdr->res.count, + hdr->res.verf->committed); set_bit(NFS_LSEG_LAYOUTRETURN, &hdr->lseg->pls_flags); } @@ -1635,10 +1989,16 @@ static void ff_layout_write_release(void *data) static void ff_layout_commit_record_layoutstats_start(struct rpc_task *task, struct nfs_commit_data *cdata) { + u32 idx, dss_id; + if (test_and_set_bit(NFS_IOHDR_STAT, &cdata->flags)) return; + + idx = calc_mirror_idx_from_commit(cdata->lseg, cdata->ds_commit_index); + dss_id = calc_dss_id_from_commit(cdata->lseg, cdata->ds_commit_index); nfs4_ff_layout_stat_io_start_write(cdata->inode, - FF_LAYOUT_COMP(cdata->lseg, cdata->ds_commit_index), + FF_LAYOUT_COMP(cdata->lseg, idx), + dss_id, 0, task->tk_start); } @@ -1647,6 +2007,7 @@ static void ff_layout_commit_record_layoutstats_done(struct rpc_task *task, { struct nfs_page *req; __u64 count = 0; + u32 idx, dss_id; if (!test_and_clear_bit(NFS_IOHDR_STAT, &cdata->flags)) return; @@ -1655,8 +2016,12 @@ static void ff_layout_commit_record_layoutstats_done(struct rpc_task *task, list_for_each_entry(req, &cdata->pages, wb_list) count += req->wb_bytes; } + + idx = calc_mirror_idx_from_commit(cdata->lseg, cdata->ds_commit_index); + dss_id = calc_dss_id_from_commit(cdata->lseg, cdata->ds_commit_index); nfs4_ff_layout_stat_io_end_write(task, - FF_LAYOUT_COMP(cdata->lseg, cdata->ds_commit_index), + FF_LAYOUT_COMP(cdata->lseg, idx), + dss_id, count, count, NFS_FILE_SYNC); set_bit(NFS_LSEG_LAYOUTRETURN, &cdata->lseg->pls_flags); } @@ -1763,32 +2128,41 @@ ff_layout_read_pagelist(struct nfs_pgio_header *hdr) struct pnfs_layout_segment *lseg = hdr->lseg; struct nfs4_pnfs_ds *ds; struct rpc_clnt *ds_clnt; + struct nfsd_file *localio; struct nfs4_ff_layout_mirror *mirror; const struct cred *ds_cred; loff_t offset = hdr->args.offset; u32 idx = hdr->pgio_mirror_idx; int vers; struct nfs_fh *fh; + u32 dss_id; + bool ds_fatal_error = false; dprintk("--> %s ino %lu pgbase %u req %zu@%llu\n", __func__, hdr->inode->i_ino, hdr->args.pgbase, (size_t)hdr->args.count, offset); mirror = FF_LAYOUT_COMP(lseg, idx); - ds = nfs4_ff_layout_prepare_ds(lseg, mirror, false); - if (!ds) + dss_id = nfs4_ff_layout_calc_dss_id( + FF_LAYOUT_LSEG(lseg)->stripe_unit, + mirror->dss_count, + offset); + ds = nfs4_ff_layout_prepare_ds(lseg, mirror, dss_id, false); + if (IS_ERR(ds)) { + ds_fatal_error = nfs_error_is_fatal(PTR_ERR(ds)); goto out_failed; + } ds_clnt = nfs4_ff_find_or_create_ds_client(mirror, ds->ds_clp, - hdr->inode); + hdr->inode, dss_id); if (IS_ERR(ds_clnt)) goto out_failed; - ds_cred = ff_layout_get_ds_cred(mirror, &lseg->pls_range, hdr->cred); + ds_cred = ff_layout_get_ds_cred(mirror, &lseg->pls_range, hdr->cred, dss_id); if (!ds_cred) goto out_failed; - vers = nfs4_ff_layout_ds_version(mirror); + vers = nfs4_ff_layout_ds_version(mirror, dss_id); dprintk("%s USE DS: %s cl_count %d vers %d\n", __func__, ds->ds_remotestr, refcount_read(&ds->ds_clp->cl_count), vers); @@ -1796,11 +2170,11 @@ ff_layout_read_pagelist(struct nfs_pgio_header *hdr) hdr->pgio_done_cb = ff_layout_read_done_cb; refcount_inc(&ds->ds_clp->cl_count); hdr->ds_clp = ds->ds_clp; - fh = nfs4_ff_layout_select_ds_fh(mirror); + fh = nfs4_ff_layout_select_ds_fh(mirror, dss_id); if (fh) hdr->args.fh = fh; - nfs4_ff_layout_select_ds_stateid(mirror, &hdr->args.stateid); + nfs4_ff_layout_select_ds_stateid(mirror, dss_id, &hdr->args.stateid); /* * Note that if we ever decide to split across DSes, @@ -1809,16 +2183,24 @@ ff_layout_read_pagelist(struct nfs_pgio_header *hdr) hdr->args.offset = offset; hdr->mds_offset = offset; + /* Start IO accounting for local read */ + localio = ff_local_open_fh(lseg, idx, dss_id, ds->ds_clp, ds_cred, fh, + FMODE_READ); + if (localio) { + hdr->task.tk_start = ktime_get(); + ff_layout_read_record_layoutstats_start(&hdr->task, hdr); + } + /* Perform an asynchronous read to ds */ nfs_initiate_pgio(ds_clnt, hdr, ds_cred, ds->ds_clp->rpc_ops, vers == 3 ? &ff_layout_read_call_ops_v3 : &ff_layout_read_call_ops_v4, - 0, RPC_TASK_SOFTCONN); + 0, RPC_TASK_SOFTCONN, localio); put_cred(ds_cred); return PNFS_ATTEMPTED; out_failed: - if (ff_layout_avoid_mds_available_ds(lseg)) + if (ff_layout_avoid_mds_available_ds(lseg) && !ds_fatal_error) return PNFS_TRY_AGAIN; trace_pnfs_mds_fallback_read_pagelist(hdr->inode, hdr->args.offset, hdr->args.count, @@ -1833,28 +2215,37 @@ ff_layout_write_pagelist(struct nfs_pgio_header *hdr, int sync) struct pnfs_layout_segment *lseg = hdr->lseg; struct nfs4_pnfs_ds *ds; struct rpc_clnt *ds_clnt; + struct nfsd_file *localio; struct nfs4_ff_layout_mirror *mirror; const struct cred *ds_cred; loff_t offset = hdr->args.offset; int vers; struct nfs_fh *fh; u32 idx = hdr->pgio_mirror_idx; + u32 dss_id; + bool ds_fatal_error = false; mirror = FF_LAYOUT_COMP(lseg, idx); - ds = nfs4_ff_layout_prepare_ds(lseg, mirror, true); - if (!ds) + dss_id = nfs4_ff_layout_calc_dss_id( + FF_LAYOUT_LSEG(lseg)->stripe_unit, + mirror->dss_count, + offset); + ds = nfs4_ff_layout_prepare_ds(lseg, mirror, dss_id, true); + if (IS_ERR(ds)) { + ds_fatal_error = nfs_error_is_fatal(PTR_ERR(ds)); goto out_failed; + } ds_clnt = nfs4_ff_find_or_create_ds_client(mirror, ds->ds_clp, - hdr->inode); + hdr->inode, dss_id); if (IS_ERR(ds_clnt)) goto out_failed; - ds_cred = ff_layout_get_ds_cred(mirror, &lseg->pls_range, hdr->cred); + ds_cred = ff_layout_get_ds_cred(mirror, &lseg->pls_range, hdr->cred, dss_id); if (!ds_cred) goto out_failed; - vers = nfs4_ff_layout_ds_version(mirror); + vers = nfs4_ff_layout_ds_version(mirror, dss_id); dprintk("%s ino %lu sync %d req %zu@%llu DS: %s cl_count %d vers %d\n", __func__, hdr->inode->i_ino, sync, (size_t) hdr->args.count, @@ -1864,12 +2255,12 @@ ff_layout_write_pagelist(struct nfs_pgio_header *hdr, int sync) hdr->pgio_done_cb = ff_layout_write_done_cb; refcount_inc(&ds->ds_clp->cl_count); hdr->ds_clp = ds->ds_clp; - hdr->ds_commit_idx = idx; - fh = nfs4_ff_layout_select_ds_fh(mirror); + hdr->ds_commit_idx = calc_commit_idx(lseg, idx, dss_id); + fh = nfs4_ff_layout_select_ds_fh(mirror, dss_id); if (fh) hdr->args.fh = fh; - nfs4_ff_layout_select_ds_stateid(mirror, &hdr->args.stateid); + nfs4_ff_layout_select_ds_stateid(mirror, dss_id, &hdr->args.stateid); /* * Note that if we ever decide to split across DSes, @@ -1877,16 +2268,24 @@ ff_layout_write_pagelist(struct nfs_pgio_header *hdr, int sync) */ hdr->args.offset = offset; + /* Start IO accounting for local write */ + localio = ff_local_open_fh(lseg, idx, dss_id, ds->ds_clp, ds_cred, fh, + FMODE_READ|FMODE_WRITE); + if (localio) { + hdr->task.tk_start = ktime_get(); + ff_layout_write_record_layoutstats_start(&hdr->task, hdr); + } + /* Perform an asynchronous write */ nfs_initiate_pgio(ds_clnt, hdr, ds_cred, ds->ds_clp->rpc_ops, vers == 3 ? &ff_layout_write_call_ops_v3 : &ff_layout_write_call_ops_v4, - sync, RPC_TASK_SOFTCONN); + sync, RPC_TASK_SOFTCONN, localio); put_cred(ds_cred); return PNFS_ATTEMPTED; out_failed: - if (ff_layout_avoid_mds_available_ds(lseg)) + if (ff_layout_avoid_mds_available_ds(lseg) && !ds_fatal_error) return PNFS_TRY_AGAIN; trace_pnfs_mds_fallback_write_pagelist(hdr->inode, hdr->args.offset, hdr->args.count, @@ -1894,20 +2293,15 @@ out_failed: return PNFS_NOT_ATTEMPTED; } -static u32 calc_ds_index_from_commit(struct pnfs_layout_segment *lseg, u32 i) -{ - return i; -} - static struct nfs_fh * -select_ds_fh_from_commit(struct pnfs_layout_segment *lseg, u32 i) +select_ds_fh_from_commit(struct pnfs_layout_segment *lseg, u32 i, u32 dss_id) { struct nfs4_ff_layout_segment *flseg = FF_LAYOUT_LSEG(lseg); /* FIXME: Assume that there is only one NFS version available * for the DS. */ - return &flseg->mirror_array[i]->fh_versions[0]; + return &flseg->mirror_array[i]->dss[dss_id].fh_versions[0]; } static int ff_layout_initiate_commit(struct nfs_commit_data *data, int how) @@ -1915,9 +2309,10 @@ static int ff_layout_initiate_commit(struct nfs_commit_data *data, int how) struct pnfs_layout_segment *lseg = data->lseg; struct nfs4_pnfs_ds *ds; struct rpc_clnt *ds_clnt; + struct nfsd_file *localio; struct nfs4_ff_layout_mirror *mirror; const struct cred *ds_cred; - u32 idx; + u32 idx, dss_id; int vers, ret; struct nfs_fh *fh; @@ -1925,22 +2320,23 @@ static int ff_layout_initiate_commit(struct nfs_commit_data *data, int how) test_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags))) goto out_err; - idx = calc_ds_index_from_commit(lseg, data->ds_commit_index); + idx = calc_mirror_idx_from_commit(lseg, data->ds_commit_index); mirror = FF_LAYOUT_COMP(lseg, idx); - ds = nfs4_ff_layout_prepare_ds(lseg, mirror, true); - if (!ds) + dss_id = calc_dss_id_from_commit(lseg, data->ds_commit_index); + ds = nfs4_ff_layout_prepare_ds(lseg, mirror, dss_id, true); + if (IS_ERR(ds)) goto out_err; ds_clnt = nfs4_ff_find_or_create_ds_client(mirror, ds->ds_clp, - data->inode); + data->inode, dss_id); if (IS_ERR(ds_clnt)) goto out_err; - ds_cred = ff_layout_get_ds_cred(mirror, &lseg->pls_range, data->cred); + ds_cred = ff_layout_get_ds_cred(mirror, &lseg->pls_range, data->cred, dss_id); if (!ds_cred) goto out_err; - vers = nfs4_ff_layout_ds_version(mirror); + vers = nfs4_ff_layout_ds_version(mirror, dss_id); dprintk("%s ino %lu, how %d cl_count %d vers %d\n", __func__, data->inode->i_ino, how, refcount_read(&ds->ds_clp->cl_count), @@ -1949,14 +2345,22 @@ static int ff_layout_initiate_commit(struct nfs_commit_data *data, int how) data->cred = ds_cred; refcount_inc(&ds->ds_clp->cl_count); data->ds_clp = ds->ds_clp; - fh = select_ds_fh_from_commit(lseg, data->ds_commit_index); + fh = select_ds_fh_from_commit(lseg, idx, dss_id); if (fh) data->args.fh = fh; + /* Start IO accounting for local commit */ + localio = ff_local_open_fh(lseg, idx, dss_id, ds->ds_clp, ds_cred, fh, + FMODE_READ|FMODE_WRITE); + if (localio) { + data->task.tk_start = ktime_get(); + ff_layout_commit_record_layoutstats_start(&data->task, data); + } + ret = nfs_initiate_commit(ds_clnt, data, ds->ds_clp->rpc_ops, vers == 3 ? &ff_layout_commit_call_ops_v3 : &ff_layout_commit_call_ops_v4, - how, RPC_TASK_SOFTCONN); + how, RPC_TASK_SOFTCONN, localio); put_cred(ds_cred); return ret; out_err: @@ -2010,25 +2414,28 @@ static void ff_layout_cancel_io(struct pnfs_layout_segment *lseg) struct nfs4_pnfs_ds *ds; struct nfs_client *ds_clp; struct rpc_clnt *clnt; - u32 idx; + u32 idx, dss_id; for (idx = 0; idx < flseg->mirror_array_cnt; idx++) { mirror = flseg->mirror_array[idx]; - mirror_ds = mirror->mirror_ds; - if (!mirror_ds) - continue; - ds = mirror->mirror_ds->ds; - if (!ds) - continue; - ds_clp = ds->ds_clp; - if (!ds_clp) - continue; - clnt = ds_clp->cl_rpcclient; - if (!clnt) - continue; - if (!rpc_cancel_tasks(clnt, -EAGAIN, ff_layout_match_io, lseg)) - continue; - rpc_clnt_disconnect(clnt); + for (dss_id = 0; dss_id < mirror->dss_count; dss_id++) { + mirror_ds = mirror->dss[dss_id].mirror_ds; + if (IS_ERR_OR_NULL(mirror_ds)) + continue; + ds = mirror->dss[dss_id].mirror_ds->ds; + if (!ds) + continue; + ds_clp = ds->ds_clp; + if (!ds_clp) + continue; + clnt = ds_clp->cl_rpcclient; + if (!clnt) + continue; + if (!rpc_cancel_tasks(clnt, -EAGAIN, + ff_layout_match_io, lseg)) + continue; + rpc_clnt_disconnect(clnt); + } } } @@ -2050,8 +2457,9 @@ ff_layout_setup_ds_info(struct pnfs_ds_commit_info *fl_cinfo, struct nfs4_ff_layout_segment *flseg = FF_LAYOUT_LSEG(lseg); struct inode *inode = lseg->pls_layout->plh_inode; struct pnfs_commit_array *array, *new; + u32 size = flseg->mirror_array_cnt * flseg->mirror_array[0]->dss_count; - new = pnfs_alloc_commit_array(flseg->mirror_array_cnt, + new = pnfs_alloc_commit_array(size, nfs_io_gfp_mask()); if (new) { spin_lock(&inode->i_lock); @@ -2094,12 +2502,6 @@ static int ff_layout_encode_ioerr(struct xdr_stream *xdr, } static void -encode_opaque_fixed(struct xdr_stream *xdr, const void *buf, size_t len) -{ - WARN_ON_ONCE(xdr_stream_encode_opaque_fixed(xdr, buf, len) < 0); -} - -static void ff_layout_encode_ff_iostat_head(struct xdr_stream *xdr, const nfs4_stateid *stateid, const struct nfs42_layoutstat_devinfo *devinfo) @@ -2421,11 +2823,11 @@ ff_layout_encode_io_latency(struct xdr_stream *xdr, static void ff_layout_encode_ff_layoutupdate(struct xdr_stream *xdr, const struct nfs42_layoutstat_devinfo *devinfo, - struct nfs4_ff_layout_mirror *mirror) + struct nfs4_ff_layout_ds_stripe *dss_info) { struct nfs4_pnfs_ds_addr *da; - struct nfs4_pnfs_ds *ds = mirror->mirror_ds->ds; - struct nfs_fh *fh = &mirror->fh_versions[0]; + struct nfs4_pnfs_ds *ds = dss_info->mirror_ds->ds; + struct nfs_fh *fh = &dss_info->fh_versions[0]; __be32 *p; da = list_first_entry(&ds->ds_addrs, struct nfs4_pnfs_ds_addr, da_node); @@ -2437,13 +2839,17 @@ ff_layout_encode_ff_layoutupdate(struct xdr_stream *xdr, p = xdr_reserve_space(xdr, 4 + fh->size); xdr_encode_opaque(p, fh->data, fh->size); /* ff_io_latency4 read */ - spin_lock(&mirror->lock); - ff_layout_encode_io_latency(xdr, &mirror->read_stat.io_stat); + spin_lock(&dss_info->mirror->lock); + ff_layout_encode_io_latency(xdr, + &dss_info->read_stat.io_stat); /* ff_io_latency4 write */ - ff_layout_encode_io_latency(xdr, &mirror->write_stat.io_stat); - spin_unlock(&mirror->lock); + ff_layout_encode_io_latency(xdr, + &dss_info->write_stat.io_stat); + spin_unlock(&dss_info->mirror->lock); /* nfstime4 */ - ff_layout_encode_nfstime(xdr, ktime_sub(ktime_get(), mirror->start_time)); + ff_layout_encode_nfstime(xdr, + ktime_sub(ktime_get(), + dss_info->start_time)); /* bool */ p = xdr_reserve_space(xdr, 4); *p = cpu_to_be32(false); @@ -2467,7 +2873,8 @@ ff_layout_encode_layoutstats(struct xdr_stream *xdr, const void *args, static void ff_layout_free_layoutstats(struct nfs4_xdr_opaque_data *opaque) { - struct nfs4_ff_layout_mirror *mirror = opaque->data; + struct nfs4_ff_layout_ds_stripe *dss_info = opaque->data; + struct nfs4_ff_layout_mirror *mirror = dss_info->mirror; ff_layout_put_mirror(mirror); } @@ -2484,44 +2891,54 @@ ff_layout_mirror_prepare_stats(struct pnfs_layout_hdr *lo, { struct nfs4_flexfile_layout *ff_layout = FF_LAYOUT_FROM_HDR(lo); struct nfs4_ff_layout_mirror *mirror; + struct nfs4_ff_layout_ds_stripe *dss_info; struct nfs4_deviceid_node *dev; - int i = 0; + int i = 0, dss_id; list_for_each_entry(mirror, &ff_layout->mirrors, mirrors) { - if (i >= dev_limit) - break; - if (IS_ERR_OR_NULL(mirror->mirror_ds)) - continue; - if (!test_and_clear_bit(NFS4_FF_MIRROR_STAT_AVAIL, - &mirror->flags) && - type != NFS4_FF_OP_LAYOUTRETURN) - continue; - /* mirror refcount put in cleanup_layoutstats */ - if (!refcount_inc_not_zero(&mirror->ref)) - continue; - dev = &mirror->mirror_ds->id_node; - memcpy(&devinfo->dev_id, &dev->deviceid, NFS4_DEVICEID4_SIZE); - devinfo->offset = 0; - devinfo->length = NFS4_MAX_UINT64; - spin_lock(&mirror->lock); - devinfo->read_count = mirror->read_stat.io_stat.ops_completed; - devinfo->read_bytes = mirror->read_stat.io_stat.bytes_completed; - devinfo->write_count = mirror->write_stat.io_stat.ops_completed; - devinfo->write_bytes = mirror->write_stat.io_stat.bytes_completed; - spin_unlock(&mirror->lock); - devinfo->layout_type = LAYOUT_FLEX_FILES; - devinfo->ld_private.ops = &layoutstat_ops; - devinfo->ld_private.data = mirror; - - devinfo++; - i++; + for (dss_id = 0; dss_id < mirror->dss_count; ++dss_id) { + dss_info = &mirror->dss[dss_id]; + if (i >= dev_limit) + break; + if (IS_ERR_OR_NULL(dss_info->mirror_ds)) + continue; + if (!test_and_clear_bit(NFS4_FF_MIRROR_STAT_AVAIL, + &mirror->flags) && + type != NFS4_FF_OP_LAYOUTRETURN) + continue; + /* mirror refcount put in cleanup_layoutstats */ + if (!refcount_inc_not_zero(&mirror->ref)) + continue; + dev = &dss_info->mirror_ds->id_node; + memcpy(&devinfo->dev_id, + &dev->deviceid, + NFS4_DEVICEID4_SIZE); + devinfo->offset = 0; + devinfo->length = NFS4_MAX_UINT64; + spin_lock(&mirror->lock); + devinfo->read_count = + dss_info->read_stat.io_stat.ops_completed; + devinfo->read_bytes = + dss_info->read_stat.io_stat.bytes_completed; + devinfo->write_count = + dss_info->write_stat.io_stat.ops_completed; + devinfo->write_bytes = + dss_info->write_stat.io_stat.bytes_completed; + spin_unlock(&mirror->lock); + devinfo->layout_type = LAYOUT_FLEX_FILES; + devinfo->ld_private.ops = &layoutstat_ops; + devinfo->ld_private.data = &mirror->dss[dss_id]; + + devinfo++; + i++; + } } return i; } -static int -ff_layout_prepare_layoutstats(struct nfs42_layoutstat_args *args) +static int ff_layout_prepare_layoutstats(struct nfs42_layoutstat_args *args) { + struct pnfs_layout_hdr *lo; struct nfs4_flexfile_layout *ff_layout; const int dev_count = PNFS_LAYOUTSTATS_MAXDEV; @@ -2532,11 +2949,14 @@ ff_layout_prepare_layoutstats(struct nfs42_layoutstat_args *args) return -ENOMEM; spin_lock(&args->inode->i_lock); - ff_layout = FF_LAYOUT_FROM_HDR(NFS_I(args->inode)->layout); - args->num_dev = ff_layout_mirror_prepare_stats(&ff_layout->generic_hdr, - &args->devinfo[0], - dev_count, - NFS4_FF_OP_LAYOUTSTATS); + lo = NFS_I(args->inode)->layout; + if (lo && pnfs_layout_is_valid(lo)) { + ff_layout = FF_LAYOUT_FROM_HDR(lo); + args->num_dev = ff_layout_mirror_prepare_stats( + &ff_layout->generic_hdr, &args->devinfo[0], dev_count, + NFS4_FF_OP_LAYOUTSTATS); + } else + args->num_dev = 0; spin_unlock(&args->inode->i_lock); if (!args->num_dev) { kfree(args->devinfo); @@ -2552,7 +2972,7 @@ ff_layout_set_layoutdriver(struct nfs_server *server, const struct nfs_fh *dummy) { #if IS_ENABLED(CONFIG_NFS_V4_2) - server->caps |= NFS_CAP_LAYOUTSTATS; + server->caps |= NFS_CAP_LAYOUTSTATS | NFS_CAP_REBOOT_LAYOUTRETURN; #endif return 0; } diff --git a/fs/nfs/flexfilelayout/flexfilelayout.h b/fs/nfs/flexfilelayout/flexfilelayout.h index 354a031c69b1..17a008c8e97c 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.h +++ b/fs/nfs/flexfilelayout/flexfilelayout.h @@ -21,6 +21,8 @@ * due to network error etc. */ #define NFS4_FLEXFILE_LAYOUT_MAX_MIRROR_CNT 4096 +#define NFS4_FLEXFILE_LAYOUT_MAX_STRIPE_CNT 4096 + /* LAYOUTSTATS report interval in ms */ #define FF_LAYOUTSTATS_REPORT_INTERVAL (60000L) #define FF_LAYOUTSTATS_MAXDEV 4 @@ -71,24 +73,32 @@ struct nfs4_ff_layoutstat { struct nfs4_ff_busy_timer busy_timer; }; -struct nfs4_ff_layout_mirror { - struct pnfs_layout_hdr *layout; - struct list_head mirrors; - u32 ds_count; - u32 efficiency; +struct nfs4_ff_layout_mirror; + +struct nfs4_ff_layout_ds_stripe { + struct nfs4_ff_layout_mirror *mirror; struct nfs4_deviceid devid; + u32 efficiency; struct nfs4_ff_layout_ds *mirror_ds; u32 fh_versions_cnt; struct nfs_fh *fh_versions; nfs4_stateid stateid; const struct cred __rcu *ro_cred; const struct cred __rcu *rw_cred; - refcount_t ref; - spinlock_t lock; - unsigned long flags; + struct nfs_file_localio nfl; struct nfs4_ff_layoutstat read_stat; struct nfs4_ff_layoutstat write_stat; ktime_t start_time; +}; + +struct nfs4_ff_layout_mirror { + struct pnfs_layout_hdr *layout; + struct list_head mirrors; + u32 dss_count; + struct nfs4_ff_layout_ds_stripe *dss; + refcount_t ref; + spinlock_t lock; + unsigned long flags; u32 report_interval; }; @@ -99,7 +109,7 @@ struct nfs4_ff_layout_segment { u64 stripe_unit; u32 flags; u32 mirror_array_cnt; - struct nfs4_ff_layout_mirror *mirror_array[]; + struct nfs4_ff_layout_mirror *mirror_array[] __counted_by(mirror_array_cnt); }; struct nfs4_flexfile_layout { @@ -149,12 +159,12 @@ FF_LAYOUT_COMP(struct pnfs_layout_segment *lseg, u32 idx) } static inline struct nfs4_deviceid_node * -FF_LAYOUT_DEVID_NODE(struct pnfs_layout_segment *lseg, u32 idx) +FF_LAYOUT_DEVID_NODE(struct pnfs_layout_segment *lseg, u32 idx, u32 dss_id) { struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, idx); if (mirror != NULL) { - struct nfs4_ff_layout_ds *mirror_ds = mirror->mirror_ds; + struct nfs4_ff_layout_ds *mirror_ds = mirror->dss[dss_id].mirror_ds; if (!IS_ERR_OR_NULL(mirror_ds)) return &mirror_ds->id_node; @@ -181,9 +191,22 @@ ff_layout_no_read_on_rw(struct pnfs_layout_segment *lseg) } static inline int -nfs4_ff_layout_ds_version(const struct nfs4_ff_layout_mirror *mirror) +nfs4_ff_layout_ds_version(const struct nfs4_ff_layout_mirror *mirror, u32 dss_id) +{ + return mirror->dss[dss_id].mirror_ds->ds_versions[0].version; +} + +static inline u32 +nfs4_ff_layout_calc_dss_id(const u64 stripe_unit, const u32 dss_count, const loff_t offset) { - return mirror->mirror_ds->ds_versions[0].version; + u64 tmp = offset; + + if (dss_count == 1 || stripe_unit == 0) + return 0; + + do_div(tmp, stripe_unit); + + return do_div(tmp, dss_count); } struct nfs4_ff_layout_ds * @@ -192,9 +215,9 @@ nfs4_ff_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, void nfs4_ff_layout_put_deviceid(struct nfs4_ff_layout_ds *mirror_ds); void nfs4_ff_layout_free_deviceid(struct nfs4_ff_layout_ds *mirror_ds); int ff_layout_track_ds_error(struct nfs4_flexfile_layout *flo, - struct nfs4_ff_layout_mirror *mirror, u64 offset, - u64 length, int status, enum nfs_opnum4 opnum, - gfp_t gfp_flags); + struct nfs4_ff_layout_mirror *mirror, + u32 dss_id, u64 offset, u64 length, int status, + enum nfs_opnum4 opnum, gfp_t gfp_flags); void ff_layout_send_layouterror(struct pnfs_layout_segment *lseg); int ff_layout_encode_ds_ioerr(struct xdr_stream *xdr, const struct list_head *head); void ff_layout_free_ds_ioerr(struct list_head *head); @@ -203,23 +226,27 @@ unsigned int ff_layout_fetch_ds_ioerr(struct pnfs_layout_hdr *lo, struct list_head *head, unsigned int maxnum); struct nfs_fh * -nfs4_ff_layout_select_ds_fh(struct nfs4_ff_layout_mirror *mirror); +nfs4_ff_layout_select_ds_fh(struct nfs4_ff_layout_mirror *mirror, u32 dss_id); void nfs4_ff_layout_select_ds_stateid(const struct nfs4_ff_layout_mirror *mirror, - nfs4_stateid *stateid); + u32 dss_id, + nfs4_stateid *stateid); struct nfs4_pnfs_ds * nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, struct nfs4_ff_layout_mirror *mirror, + u32 dss_id, bool fail_return); struct rpc_clnt * nfs4_ff_find_or_create_ds_client(struct nfs4_ff_layout_mirror *mirror, struct nfs_client *ds_clp, - struct inode *inode); + struct inode *inode, + u32 dss_id); const struct cred *ff_layout_get_ds_cred(struct nfs4_ff_layout_mirror *mirror, const struct pnfs_layout_range *range, - const struct cred *mdscred); + const struct cred *mdscred, + u32 dss_id); bool ff_layout_avoid_mds_available_ds(struct pnfs_layout_segment *lseg); bool ff_layout_avoid_read_on_rw(struct pnfs_layout_segment *lseg); diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c index e028f5a0ef5f..c55ea8fa3bfa 100644 --- a/fs/nfs/flexfilelayout/flexfilelayoutdev.c +++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c @@ -44,18 +44,19 @@ nfs4_ff_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, { struct xdr_stream stream; struct xdr_buf buf; - struct page *scratch; + struct folio *scratch; struct list_head dsaddrs; struct nfs4_pnfs_ds_addr *da; struct nfs4_ff_layout_ds *new_ds = NULL; struct nfs4_ff_ds_version *ds_versions = NULL; + struct net *net = server->nfs_client->cl_net; u32 mp_count; u32 version_count; __be32 *p; int i, ret = -ENOMEM; /* set up xdr stream */ - scratch = alloc_page(gfp_flags); + scratch = folio_alloc(gfp_flags, 0); if (!scratch) goto out_err; @@ -69,7 +70,7 @@ nfs4_ff_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, INIT_LIST_HEAD(&dsaddrs); xdr_init_decode_pages(&stream, &buf, pdev->pages, pdev->pglen); - xdr_set_scratch_page(&stream, scratch); + xdr_set_scratch_folio(&stream, scratch); /* multipath count */ p = xdr_inline_decode(&stream, 4); @@ -80,8 +81,7 @@ nfs4_ff_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, for (i = 0; i < mp_count; i++) { /* multipath ds */ - da = nfs4_decode_mp_ds_addr(server->nfs_client->cl_net, - &stream, gfp_flags); + da = nfs4_decode_mp_ds_addr(net, &stream, gfp_flags); if (da) list_add_tail(&da->da_node, &dsaddrs); } @@ -149,7 +149,7 @@ nfs4_ff_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, new_ds->ds_versions = ds_versions; new_ds->ds_versions_cnt = version_count; - new_ds->ds = nfs4_pnfs_ds_add(&dsaddrs, gfp_flags); + new_ds->ds = nfs4_pnfs_ds_add(net, &dsaddrs, gfp_flags); if (!new_ds->ds) goto out_err_drain_dsaddrs; @@ -163,7 +163,7 @@ nfs4_ff_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, kfree(da); } - __free_page(scratch); + folio_put(scratch); return new_ds; out_err_drain_dsaddrs: @@ -177,7 +177,7 @@ out_err_drain_dsaddrs: kfree(ds_versions); out_scratch: - __free_page(scratch); + folio_put(scratch); out_err: kfree(new_ds); @@ -250,16 +250,16 @@ ff_layout_add_ds_error_locked(struct nfs4_flexfile_layout *flo, } int ff_layout_track_ds_error(struct nfs4_flexfile_layout *flo, - struct nfs4_ff_layout_mirror *mirror, u64 offset, - u64 length, int status, enum nfs_opnum4 opnum, - gfp_t gfp_flags) + struct nfs4_ff_layout_mirror *mirror, + u32 dss_id, u64 offset, u64 length, int status, + enum nfs_opnum4 opnum, gfp_t gfp_flags) { struct nfs4_ff_layout_ds_err *dserr; if (status == 0) return 0; - if (IS_ERR_OR_NULL(mirror->mirror_ds)) + if (IS_ERR_OR_NULL(mirror->dss[dss_id].mirror_ds)) return -EINVAL; dserr = kmalloc(sizeof(*dserr), gfp_flags); @@ -271,8 +271,8 @@ int ff_layout_track_ds_error(struct nfs4_flexfile_layout *flo, dserr->length = length; dserr->status = status; dserr->opnum = opnum; - nfs4_stateid_copy(&dserr->stateid, &mirror->stateid); - memcpy(&dserr->deviceid, &mirror->mirror_ds->id_node.deviceid, + nfs4_stateid_copy(&dserr->stateid, &mirror->dss[dss_id].stateid); + memcpy(&dserr->deviceid, &mirror->dss[dss_id].mirror_ds->id_node.deviceid, NFS4_DEVICEID4_SIZE); spin_lock(&flo->generic_hdr.plh_inode->i_lock); @@ -282,14 +282,14 @@ int ff_layout_track_ds_error(struct nfs4_flexfile_layout *flo, } static const struct cred * -ff_layout_get_mirror_cred(struct nfs4_ff_layout_mirror *mirror, u32 iomode) +ff_layout_get_mirror_cred(struct nfs4_ff_layout_mirror *mirror, u32 iomode, u32 dss_id) { const struct cred *cred, __rcu **pcred; if (iomode == IOMODE_READ) - pcred = &mirror->ro_cred; + pcred = &mirror->dss[dss_id].ro_cred; else - pcred = &mirror->rw_cred; + pcred = &mirror->dss[dss_id].rw_cred; rcu_read_lock(); do { @@ -304,43 +304,45 @@ ff_layout_get_mirror_cred(struct nfs4_ff_layout_mirror *mirror, u32 iomode) } struct nfs_fh * -nfs4_ff_layout_select_ds_fh(struct nfs4_ff_layout_mirror *mirror) +nfs4_ff_layout_select_ds_fh(struct nfs4_ff_layout_mirror *mirror, u32 dss_id) { /* FIXME: For now assume there is only 1 version available for the DS */ - return &mirror->fh_versions[0]; + return &mirror->dss[dss_id].fh_versions[0]; } void nfs4_ff_layout_select_ds_stateid(const struct nfs4_ff_layout_mirror *mirror, - nfs4_stateid *stateid) + u32 dss_id, + nfs4_stateid *stateid) { - if (nfs4_ff_layout_ds_version(mirror) == 4) - nfs4_stateid_copy(stateid, &mirror->stateid); + if (nfs4_ff_layout_ds_version(mirror, dss_id) == 4) + nfs4_stateid_copy(stateid, &mirror->dss[dss_id].stateid); } static bool ff_layout_init_mirror_ds(struct pnfs_layout_hdr *lo, - struct nfs4_ff_layout_mirror *mirror) + struct nfs4_ff_layout_mirror *mirror, + u32 dss_id) { if (mirror == NULL) goto outerr; - if (mirror->mirror_ds == NULL) { + if (mirror->dss[dss_id].mirror_ds == NULL) { struct nfs4_deviceid_node *node; struct nfs4_ff_layout_ds *mirror_ds = ERR_PTR(-ENODEV); node = nfs4_find_get_deviceid(NFS_SERVER(lo->plh_inode), - &mirror->devid, lo->plh_lc_cred, + &mirror->dss[dss_id].devid, lo->plh_lc_cred, GFP_KERNEL); if (node) mirror_ds = FF_LAYOUT_MIRROR_DS(node); /* check for race with another call to this function */ - if (cmpxchg(&mirror->mirror_ds, NULL, mirror_ds) && + if (cmpxchg(&mirror->dss[dss_id].mirror_ds, NULL, mirror_ds) && mirror_ds != ERR_PTR(-ENODEV)) nfs4_put_deviceid_node(node); } - if (IS_ERR(mirror->mirror_ds)) + if (IS_ERR(mirror->dss[dss_id].mirror_ds)) goto outerr; return true; @@ -352,6 +354,7 @@ outerr: * nfs4_ff_layout_prepare_ds - prepare a DS connection for an RPC call * @lseg: the layout segment we're operating on * @mirror: layout mirror describing the DS to use + * @dss_id: DS stripe id to select stripe to use * @fail_return: return layout on connect failure? * * Try to prepare a DS connection to accept an RPC call. This involves @@ -368,18 +371,19 @@ outerr: struct nfs4_pnfs_ds * nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, struct nfs4_ff_layout_mirror *mirror, + u32 dss_id, bool fail_return) { - struct nfs4_pnfs_ds *ds = NULL; + struct nfs4_pnfs_ds *ds; struct inode *ino = lseg->pls_layout->plh_inode; struct nfs_server *s = NFS_SERVER(ino); unsigned int max_payload; - int status; + int status = -EAGAIN; - if (!ff_layout_init_mirror_ds(lseg->pls_layout, mirror)) + if (!ff_layout_init_mirror_ds(lseg->pls_layout, mirror, dss_id)) goto noconnect; - ds = mirror->mirror_ds->ds; + ds = mirror->dss[dss_id].mirror_ds->ds; if (READ_ONCE(ds->ds_clp)) goto out; /* matching smp_wmb() in _nfs4_pnfs_v3/4_ds_connect */ @@ -388,31 +392,37 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, /* FIXME: For now we assume the server sent only one version of NFS * to use for the DS. */ - status = nfs4_pnfs_ds_connect(s, ds, &mirror->mirror_ds->id_node, + status = nfs4_pnfs_ds_connect(s, ds, &mirror->dss[dss_id].mirror_ds->id_node, dataserver_timeo, dataserver_retrans, - mirror->mirror_ds->ds_versions[0].version, - mirror->mirror_ds->ds_versions[0].minor_version); + mirror->dss[dss_id].mirror_ds->ds_versions[0].version, + mirror->dss[dss_id].mirror_ds->ds_versions[0].minor_version); /* connect success, check rsize/wsize limit */ if (!status) { + /* + * ds_clp is put in destroy_ds(). + * keep ds_clp even if DS is local, so that if local IO cannot + * proceed somehow, we can fall back to NFS whenever we want. + */ + nfs_local_probe_async(ds->ds_clp); max_payload = nfs_block_size(rpc_max_payload(ds->ds_clp->cl_rpcclient), NULL); - if (mirror->mirror_ds->ds_versions[0].rsize > max_payload) - mirror->mirror_ds->ds_versions[0].rsize = max_payload; - if (mirror->mirror_ds->ds_versions[0].wsize > max_payload) - mirror->mirror_ds->ds_versions[0].wsize = max_payload; + if (mirror->dss[dss_id].mirror_ds->ds_versions[0].rsize > max_payload) + mirror->dss[dss_id].mirror_ds->ds_versions[0].rsize = max_payload; + if (mirror->dss[dss_id].mirror_ds->ds_versions[0].wsize > max_payload) + mirror->dss[dss_id].mirror_ds->ds_versions[0].wsize = max_payload; goto out; } noconnect: ff_layout_track_ds_error(FF_LAYOUT_FROM_HDR(lseg->pls_layout), - mirror, lseg->pls_range.offset, + mirror, dss_id, lseg->pls_range.offset, lseg->pls_range.length, NFS4ERR_NXIO, OP_ILLEGAL, GFP_NOIO); ff_layout_send_layouterror(lseg); if (fail_return || !ff_layout_has_available_ds(lseg)) pnfs_error_mark_layout_for_return(ino, lseg); - ds = NULL; + ds = ERR_PTR(status); out: return ds; } @@ -420,12 +430,13 @@ out: const struct cred * ff_layout_get_ds_cred(struct nfs4_ff_layout_mirror *mirror, const struct pnfs_layout_range *range, - const struct cred *mdscred) + const struct cred *mdscred, + u32 dss_id) { const struct cred *cred; - if (mirror && !mirror->mirror_ds->ds_versions[0].tightly_coupled) { - cred = ff_layout_get_mirror_cred(mirror, range->iomode); + if (mirror && !mirror->dss[dss_id].mirror_ds->ds_versions[0].tightly_coupled) { + cred = ff_layout_get_mirror_cred(mirror, range->iomode, dss_id); if (!cred) cred = get_cred(mdscred); } else { @@ -439,15 +450,17 @@ ff_layout_get_ds_cred(struct nfs4_ff_layout_mirror *mirror, * @mirror: pointer to the mirror * @ds_clp: nfs_client for the DS * @inode: pointer to inode + * @dss_id: DS stripe id * * Find or create a DS rpc client with th MDS server rpc client auth flavor * in the nfs_client cl_ds_clients list. */ struct rpc_clnt * nfs4_ff_find_or_create_ds_client(struct nfs4_ff_layout_mirror *mirror, - struct nfs_client *ds_clp, struct inode *inode) + struct nfs_client *ds_clp, struct inode *inode, + u32 dss_id) { - switch (mirror->mirror_ds->ds_versions[0].version) { + switch (mirror->dss[dss_id].mirror_ds->ds_versions[0].version) { case 3: /* For NFSv3 DS, flavor is set when creating DS connections */ return ds_clp->cl_rpcclient; @@ -553,16 +566,18 @@ static bool ff_read_layout_has_available_ds(struct pnfs_layout_segment *lseg) { struct nfs4_ff_layout_mirror *mirror; struct nfs4_deviceid_node *devid; - u32 idx; + u32 idx, dss_id; for (idx = 0; idx < FF_LAYOUT_MIRROR_COUNT(lseg); idx++) { mirror = FF_LAYOUT_COMP(lseg, idx); - if (mirror) { - if (!mirror->mirror_ds) + if (!mirror) + continue; + for (dss_id = 0; dss_id < mirror->dss_count; dss_id++) { + if (!mirror->dss[dss_id].mirror_ds) return true; - if (IS_ERR(mirror->mirror_ds)) + if (IS_ERR(mirror->dss[dss_id].mirror_ds)) continue; - devid = &mirror->mirror_ds->id_node; + devid = &mirror->dss[dss_id].mirror_ds->id_node; if (!nfs4_test_deviceid_unavailable(devid)) return true; } @@ -575,17 +590,21 @@ static bool ff_rw_layout_has_available_ds(struct pnfs_layout_segment *lseg) { struct nfs4_ff_layout_mirror *mirror; struct nfs4_deviceid_node *devid; - u32 idx; + u32 idx, dss_id; for (idx = 0; idx < FF_LAYOUT_MIRROR_COUNT(lseg); idx++) { mirror = FF_LAYOUT_COMP(lseg, idx); - if (!mirror || IS_ERR(mirror->mirror_ds)) - return false; - if (!mirror->mirror_ds) - continue; - devid = &mirror->mirror_ds->id_node; - if (nfs4_test_deviceid_unavailable(devid)) + if (!mirror) return false; + for (dss_id = 0; dss_id < mirror->dss_count; dss_id++) { + if (IS_ERR(mirror->dss[dss_id].mirror_ds)) + return false; + if (!mirror->dss[dss_id].mirror_ds) + continue; + devid = &mirror->dss[dss_id].mirror_ds->id_node; + if (nfs4_test_deviceid_unavailable(devid)) + return false; + } } return FF_LAYOUT_MIRROR_COUNT(lseg) != 0; diff --git a/fs/nfs/fs_context.c b/fs/nfs/fs_context.c index 853e8d609bb3..b4679b7161b0 100644 --- a/fs/nfs/fs_context.c +++ b/fs/nfs/fs_context.c @@ -49,6 +49,8 @@ enum nfs_param { Opt_bsize, Opt_clientaddr, Opt_cto, + Opt_alignwrite, + Opt_fatal_neterrors, Opt_fg, Opt_fscache, Opt_fscache_flag, @@ -71,6 +73,8 @@ enum nfs_param { Opt_posix, Opt_proto, Opt_rdirplus, + Opt_rdirplus_none, + Opt_rdirplus_force, Opt_rdma, Opt_resvport, Opt_retrans, @@ -92,6 +96,22 @@ enum nfs_param { Opt_wsize, Opt_write, Opt_xprtsec, + Opt_cert_serial, + Opt_privkey_serial, +}; + +enum { + Opt_fatal_neterrors_default, + Opt_fatal_neterrors_enetunreach, + Opt_fatal_neterrors_none, +}; + +static const struct constant_table nfs_param_enums_fatal_neterrors[] = { + { "default", Opt_fatal_neterrors_default }, + { "ENETDOWN:ENETUNREACH", Opt_fatal_neterrors_enetunreach }, + { "ENETUNREACH:ENETDOWN", Opt_fatal_neterrors_enetunreach }, + { "none", Opt_fatal_neterrors_none }, + {} }; enum { @@ -149,6 +169,9 @@ static const struct fs_parameter_spec nfs_fs_parameters[] = { fsparam_u32 ("bsize", Opt_bsize), fsparam_string("clientaddr", Opt_clientaddr), fsparam_flag_no("cto", Opt_cto), + fsparam_flag_no("alignwrite", Opt_alignwrite), + fsparam_enum("fatal_neterrors", Opt_fatal_neterrors, + nfs_param_enums_fatal_neterrors), fsparam_flag ("fg", Opt_fg), fsparam_flag_no("fsc", Opt_fscache_flag), fsparam_string("fsc", Opt_fscache), @@ -172,7 +195,8 @@ static const struct fs_parameter_spec nfs_fs_parameters[] = { fsparam_u32 ("port", Opt_port), fsparam_flag_no("posix", Opt_posix), fsparam_string("proto", Opt_proto), - fsparam_flag_no("rdirplus", Opt_rdirplus), + fsparam_flag_no("rdirplus", Opt_rdirplus), // rdirplus|nordirplus + fsparam_string("rdirplus", Opt_rdirplus), // rdirplus=... fsparam_flag ("rdma", Opt_rdma), fsparam_flag_no("resvport", Opt_resvport), fsparam_u32 ("retrans", Opt_retrans), @@ -199,6 +223,8 @@ static const struct fs_parameter_spec nfs_fs_parameters[] = { fsparam_enum ("write", Opt_write, nfs_param_enums_write), fsparam_u32 ("wsize", Opt_wsize), fsparam_string("xprtsec", Opt_xprtsec), + fsparam_s32("cert_serial", Opt_cert_serial), + fsparam_s32("privkey_serial", Opt_privkey_serial), {} }; @@ -286,6 +312,12 @@ static const struct constant_table nfs_xprtsec_policies[] = { {} }; +static const struct constant_table nfs_rdirplus_tokens[] = { + { "none", Opt_rdirplus_none }, + { "force", Opt_rdirplus_force }, + {} +}; + /* * Sanity-check a server address provided by the mount command. * @@ -523,6 +555,32 @@ static int nfs_parse_version_string(struct fs_context *fc, return 0; } +#ifdef CONFIG_KEYS +static int nfs_tls_key_verify(key_serial_t key_id) +{ + struct key *key = key_lookup(key_id); + int error = 0; + + if (IS_ERR(key)) { + pr_err("key id %08x not found\n", key_id); + return PTR_ERR(key); + } + if (test_bit(KEY_FLAG_REVOKED, &key->flags) || + test_bit(KEY_FLAG_INVALIDATED, &key->flags)) { + pr_err("key id %08x revoked\n", key_id); + error = -EKEYREVOKED; + } + + key_put(key); + return error; +} +#else +static inline int nfs_tls_key_verify(key_serial_t key_id) +{ + return -ENOENT; +} +#endif /* CONFIG_KEYS */ + /* * Parse a single mount parameter. */ @@ -592,6 +650,12 @@ static int nfs_fs_context_parse_param(struct fs_context *fc, else ctx->flags |= NFS_MOUNT_TRUNK_DISCOVERY; break; + case Opt_alignwrite: + if (result.negated) + ctx->flags |= NFS_MOUNT_NO_ALIGNWRITE; + else + ctx->flags &= ~NFS_MOUNT_NO_ALIGNWRITE; + break; case Opt_ac: if (result.negated) ctx->flags |= NFS_MOUNT_NOAC; @@ -600,9 +664,11 @@ static int nfs_fs_context_parse_param(struct fs_context *fc, break; case Opt_lock: if (result.negated) { + ctx->lock_status = NFS_LOCK_NOLOCK; ctx->flags |= NFS_MOUNT_NONLM; ctx->flags |= (NFS_MOUNT_LOCAL_FLOCK | NFS_MOUNT_LOCAL_FCNTL); } else { + ctx->lock_status = NFS_LOCK_LOCK; ctx->flags &= ~NFS_MOUNT_NONLM; ctx->flags &= ~(NFS_MOUNT_LOCAL_FLOCK | NFS_MOUNT_LOCAL_FCNTL); } @@ -626,10 +692,25 @@ static int nfs_fs_context_parse_param(struct fs_context *fc, ctx->flags &= ~NFS_MOUNT_NOACL; break; case Opt_rdirplus: - if (result.negated) + if (result.negated) { + ctx->flags &= ~NFS_MOUNT_FORCE_RDIRPLUS; ctx->flags |= NFS_MOUNT_NORDIRPLUS; - else - ctx->flags &= ~NFS_MOUNT_NORDIRPLUS; + } else if (!param->string) { + ctx->flags &= ~(NFS_MOUNT_NORDIRPLUS | NFS_MOUNT_FORCE_RDIRPLUS); + } else { + switch (lookup_constant(nfs_rdirplus_tokens, param->string, -1)) { + case Opt_rdirplus_none: + ctx->flags &= ~NFS_MOUNT_FORCE_RDIRPLUS; + ctx->flags |= NFS_MOUNT_NORDIRPLUS; + break; + case Opt_rdirplus_force: + ctx->flags &= ~NFS_MOUNT_NORDIRPLUS; + ctx->flags |= NFS_MOUNT_FORCE_RDIRPLUS; + break; + default: + goto out_invalid_value; + } + } break; case Opt_sharecache: if (result.negated) @@ -652,6 +733,7 @@ static int nfs_fs_context_parse_param(struct fs_context *fc, ctx->fscache_uniq = NULL; break; case Opt_fscache: + trace_nfs_mount_assign(param->key, param->string); ctx->options |= NFS_OPTION_FSCACHE; kfree(ctx->fscache_uniq); ctx->fscache_uniq = param->string; @@ -755,6 +837,18 @@ static int nfs_fs_context_parse_param(struct fs_context *fc, if (ret < 0) return ret; break; + case Opt_cert_serial: + ret = nfs_tls_key_verify(result.int_32); + if (ret < 0) + return ret; + ctx->xprtsec.cert_serial = result.int_32; + break; + case Opt_privkey_serial: + ret = nfs_tls_key_verify(result.int_32); + if (ret < 0) + return ret; + ctx->xprtsec.privkey_serial = result.int_32; + break; case Opt_proto: if (!param->string) @@ -861,6 +955,25 @@ static int nfs_fs_context_parse_param(struct fs_context *fc, goto out_of_bounds; ctx->nfs_server.max_connect = result.uint_32; break; + case Opt_fatal_neterrors: + trace_nfs_mount_assign(param->key, param->string); + switch (result.uint_32) { + case Opt_fatal_neterrors_default: + if (fc->net_ns != &init_net) + ctx->flags |= NFS_MOUNT_NETUNREACH_FATAL; + else + ctx->flags &= ~NFS_MOUNT_NETUNREACH_FATAL; + break; + case Opt_fatal_neterrors_enetunreach: + ctx->flags |= NFS_MOUNT_NETUNREACH_FATAL; + break; + case Opt_fatal_neterrors_none: + ctx->flags &= ~NFS_MOUNT_NETUNREACH_FATAL; + break; + default: + goto out_invalid_value; + } + break; case Opt_lookupcache: trace_nfs_mount_assign(param->key, param->string); switch (result.uint_32) { @@ -1111,9 +1224,12 @@ static int nfs23_parse_monolithic(struct fs_context *fc, ctx->acdirmax = data->acdirmax; ctx->need_mount = false; - memcpy(sap, &data->addr, sizeof(data->addr)); - ctx->nfs_server.addrlen = sizeof(data->addr); - ctx->nfs_server.port = ntohs(data->addr.sin_port); + if (!is_remount_fc(fc)) { + memcpy(sap, &data->addr, sizeof(data->addr)); + ctx->nfs_server.addrlen = sizeof(data->addr); + ctx->nfs_server.port = ntohs(data->addr.sin_port); + } + if (sap->ss_family != AF_INET || !nfs_verify_server_address(sap)) goto out_no_address; @@ -1153,8 +1269,7 @@ static int nfs23_parse_monolithic(struct fs_context *fc, int ret; data->context[NFS_MAX_CONTEXT_LEN] = '\0'; - ret = vfs_parse_fs_string(fc, "context", - data->context, strlen(data->context)); + ret = vfs_parse_fs_string(fc, "context", data->context); if (ret < 0) return ret; #else @@ -1453,7 +1568,7 @@ static int nfs_fs_context_validate(struct fs_context *fc) /* Load the NFS protocol module if we haven't done so yet */ if (!ctx->nfs_mod) { - nfs_mod = get_nfs_version(ctx->version); + nfs_mod = find_nfs_version(ctx->version); if (IS_ERR(nfs_mod)) { ret = PTR_ERR(nfs_mod); goto out_version_unavailable; @@ -1527,7 +1642,7 @@ static int nfs_fs_context_dup(struct fs_context *fc, struct fs_context *src_fc) } nfs_copy_fh(ctx->mntfh, src->mntfh); - __module_get(ctx->nfs_mod->owner); + get_nfs_version(ctx->nfs_mod); ctx->client_address = NULL; ctx->mount_server.hostname = NULL; ctx->nfs_server.export_path = NULL; @@ -1619,7 +1734,7 @@ static int nfs_init_fs_context(struct fs_context *fc) } ctx->nfs_mod = nfss->nfs_client->cl_nfs_mod; - __module_get(ctx->nfs_mod->owner); + get_nfs_version(ctx->nfs_mod); } else { /* defaults */ ctx->timeo = NFS_UNSPEC_TIMEO; @@ -1637,6 +1752,9 @@ static int nfs_init_fs_context(struct fs_context *fc) ctx->xprtsec.cert_serial = TLS_NO_CERT; ctx->xprtsec.privkey_serial = TLS_NO_PRIVKEY; + if (fc->net_ns != &init_net) + ctx->flags |= NFS_MOUNT_NETUNREACH_FATAL; + fc->s_iflags |= SB_I_STABLE_WRITES; } fc->fs_private = ctx; diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c index 8c35d88a84b1..8b0785178731 100644 --- a/fs/nfs/fscache.c +++ b/fs/nfs/fscache.c @@ -180,6 +180,9 @@ void nfs_fscache_init_inode(struct inode *inode) &auxdata, /* aux_data */ sizeof(auxdata), i_size_read(inode)); + + if (netfs_inode(inode)->cache) + mapping_set_release_always(inode->i_mapping); } /* @@ -260,21 +263,25 @@ int nfs_netfs_readahead(struct readahead_control *ractl) static atomic_t nfs_netfs_debug_id; static int nfs_netfs_init_request(struct netfs_io_request *rreq, struct file *file) { + if (!file) { + if (WARN_ON_ONCE(rreq->origin != NETFS_PGPRIV2_COPY_TO_CACHE)) + return -EIO; + return 0; + } + rreq->netfs_priv = get_nfs_open_context(nfs_file_open_context(file)); rreq->debug_id = atomic_inc_return(&nfs_netfs_debug_id); + /* [DEPRECATED] Use PG_private_2 to mark folio being written to the cache. */ + __set_bit(NETFS_RREQ_USE_PGPRIV2, &rreq->flags); + rreq->io_streams[0].sreq_max_len = NFS_SB(rreq->inode->i_sb)->rsize; return 0; } static void nfs_netfs_free_request(struct netfs_io_request *rreq) { - put_nfs_open_context(rreq->netfs_priv); -} - -static inline int nfs_netfs_begin_cache_operation(struct netfs_io_request *rreq) -{ - return fscache_begin_read_operation(&rreq->cache_resources, - netfs_i_cookie(netfs_inode(rreq->inode))); + if (rreq->netfs_priv) + put_nfs_open_context(rreq->netfs_priv); } static struct nfs_netfs_io_data *nfs_netfs_alloc(struct netfs_io_subrequest *sreq) @@ -289,14 +296,6 @@ static struct nfs_netfs_io_data *nfs_netfs_alloc(struct netfs_io_subrequest *sre return netfs; } -static bool nfs_netfs_clamp_length(struct netfs_io_subrequest *sreq) -{ - size_t rsize = NFS_SB(sreq->rreq->inode->i_sb)->rsize; - - sreq->len = min(sreq->len, rsize); - return true; -} - static void nfs_netfs_issue_read(struct netfs_io_subrequest *sreq) { struct nfs_netfs_io_data *netfs; @@ -304,34 +303,32 @@ static void nfs_netfs_issue_read(struct netfs_io_subrequest *sreq) struct inode *inode = sreq->rreq->inode; struct nfs_open_context *ctx = sreq->rreq->netfs_priv; struct page *page; + unsigned long idx; + pgoff_t start, last; int err; - pgoff_t start = (sreq->start + sreq->transferred) >> PAGE_SHIFT; - pgoff_t last = ((sreq->start + sreq->len - - sreq->transferred - 1) >> PAGE_SHIFT); - XA_STATE(xas, &sreq->rreq->mapping->i_pages, start); + + start = (sreq->start + sreq->transferred) >> PAGE_SHIFT; + last = ((sreq->start + sreq->len - sreq->transferred - 1) >> PAGE_SHIFT); nfs_pageio_init_read(&pgio, inode, false, &nfs_async_read_completion_ops); netfs = nfs_netfs_alloc(sreq); - if (!netfs) - return netfs_subreq_terminated(sreq, -ENOMEM, false); + if (!netfs) { + sreq->error = -ENOMEM; + return netfs_read_subreq_terminated(sreq); + } pgio.pg_netfs = netfs; /* used in completion */ - xas_lock(&xas); - xas_for_each(&xas, page, last) { + xa_for_each_range(&sreq->rreq->mapping->i_pages, idx, page, start, last) { /* nfs_read_add_folio() may schedule() due to pNFS layout and other RPCs */ - xas_pause(&xas); - xas_unlock(&xas); err = nfs_read_add_folio(&pgio, ctx, page_folio(page)); if (err < 0) { netfs->error = err; goto out; } - xas_lock(&xas); } - xas_unlock(&xas); out: nfs_pageio_complete_read(&pgio); nfs_netfs_put(netfs); @@ -349,7 +346,7 @@ void nfs_netfs_initiate_read(struct nfs_pgio_header *hdr) int nfs_netfs_folio_unlock(struct folio *folio) { - struct inode *inode = folio_file_mapping(folio)->host; + struct inode *inode = folio->mapping->host; /* * If fscache is enabled, netfs will unlock pages. @@ -369,7 +366,9 @@ void nfs_netfs_read_completion(struct nfs_pgio_header *hdr) return; sreq = netfs->sreq; - if (test_bit(NFS_IOHDR_EOF, &hdr->flags)) + if (test_bit(NFS_IOHDR_EOF, &hdr->flags) && + sreq->rreq->origin != NETFS_UNBUFFERED_READ && + sreq->rreq->origin != NETFS_DIO_READ) __set_bit(NETFS_SREQ_CLEAR_TAIL, &sreq->flags); if (hdr->error) @@ -384,7 +383,5 @@ void nfs_netfs_read_completion(struct nfs_pgio_header *hdr) const struct netfs_request_ops nfs_netfs_ops = { .init_request = nfs_netfs_init_request, .free_request = nfs_netfs_free_request, - .begin_cache_operation = nfs_netfs_begin_cache_operation, .issue_read = nfs_netfs_issue_read, - .clamp_length = nfs_netfs_clamp_length }; diff --git a/fs/nfs/fscache.h b/fs/nfs/fscache.h index e1706e736c64..9d86868f4998 100644 --- a/fs/nfs/fscache.h +++ b/fs/nfs/fscache.h @@ -60,8 +60,6 @@ static inline void nfs_netfs_get(struct nfs_netfs_io_data *netfs) static inline void nfs_netfs_put(struct nfs_netfs_io_data *netfs) { - ssize_t final_len; - /* Only the last RPC completion should call netfs_subreq_terminated() */ if (!refcount_dec_and_test(&netfs->refcount)) return; @@ -74,13 +72,15 @@ static inline void nfs_netfs_put(struct nfs_netfs_io_data *netfs) * Correct the final length here to be no larger than the netfs subrequest * length, and thus avoid netfs's "Subreq overread" warning message. */ - final_len = min_t(s64, netfs->sreq->len, atomic64_read(&netfs->transferred)); - netfs_subreq_terminated(netfs->sreq, netfs->error ?: final_len, false); + netfs->sreq->transferred = min_t(s64, netfs->sreq->len, + atomic64_read(&netfs->transferred)); + netfs->sreq->error = netfs->error; + netfs_read_subreq_terminated(netfs->sreq); kfree(netfs); } static inline void nfs_netfs_inode_init(struct nfs_inode *nfsi) { - netfs_inode_init(&nfsi->netfs, &nfs_netfs_ops); + netfs_inode_init(&nfsi->netfs, &nfs_netfs_ops, false); } extern void nfs_netfs_initiate_read(struct nfs_pgio_header *hdr); extern void nfs_netfs_read_completion(struct nfs_pgio_header *hdr); @@ -101,10 +101,10 @@ extern int nfs_netfs_read_folio(struct file *file, struct folio *folio); static inline bool nfs_fscache_release_folio(struct folio *folio, gfp_t gfp) { - if (folio_test_fscache(folio)) { + if (folio_test_private_2(folio)) { /* [DEPRECATED] */ if (current_is_kswapd() || !(gfp & __GFP_FS)) return false; - folio_wait_fscache(folio); + folio_wait_private_2(folio); } fscache_note_page_release(netfs_i_cookie(netfs_inode(folio->mapping->host))); return true; @@ -114,10 +114,10 @@ static inline void nfs_fscache_update_auxdata(struct nfs_fscache_inode_auxdata * struct inode *inode) { memset(auxdata, 0, sizeof(*auxdata)); - auxdata->mtime_sec = inode->i_mtime.tv_sec; - auxdata->mtime_nsec = inode->i_mtime.tv_nsec; - auxdata->ctime_sec = inode->i_ctime.tv_sec; - auxdata->ctime_nsec = inode->i_ctime.tv_nsec; + auxdata->mtime_sec = inode_get_mtime(inode).tv_sec; + auxdata->mtime_nsec = inode_get_mtime(inode).tv_nsec; + auxdata->ctime_sec = inode_get_ctime(inode).tv_sec; + auxdata->ctime_nsec = inode_get_ctime(inode).tv_nsec; if (NFS_SERVER(inode)->nfs_client->rpc_ops->version == 4) auxdata->change_attr = inode_peek_iversion_raw(inode); diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c index 11ff2b2e060f..f13d25d95b85 100644 --- a/fs/nfs/getroot.c +++ b/fs/nfs/getroot.c @@ -62,7 +62,7 @@ static int nfs_superblock_set_dummy_root(struct super_block *sb, struct inode *i } /* - * get an NFS2/NFS3 root dentry from the root filehandle + * get a root dentry from the root filehandle */ int nfs_get_root(struct super_block *s, struct fs_context *fc) { diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 8172dd4135a1..f76fe406937a 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -74,6 +74,8 @@ nfs_fattr_to_ino_t(struct nfs_fattr *fattr) int nfs_wait_bit_killable(struct wait_bit_key *key, int mode) { + if (unlikely(nfs_current_task_exiting())) + return -EINTR; schedule(); if (signal_pending_state(mode, current)) return -ERESTARTSYS; @@ -106,7 +108,7 @@ u64 nfs_compat_user_ino64(u64 fileid) int nfs_drop_inode(struct inode *inode) { - return NFS_STALE(inode) || generic_drop_inode(inode); + return NFS_STALE(inode) || inode_generic_drop(inode); } EXPORT_SYMBOL_GPL(nfs_drop_inode); @@ -190,12 +192,12 @@ static bool nfs_has_xattr_cache(const struct nfs_inode *nfsi) void nfs_set_cache_invalid(struct inode *inode, unsigned long flags) { struct nfs_inode *nfsi = NFS_I(inode); - bool have_delegation = NFS_PROTO(inode)->have_delegation(inode, FMODE_READ); - if (have_delegation) { + if (nfs_have_delegated_attributes(inode)) { if (!(flags & NFS_INO_REVAL_FORCED)) flags &= ~(NFS_INO_INVALID_MODE | NFS_INO_INVALID_OTHER | + NFS_INO_INVALID_BTIME | NFS_INO_INVALID_XATTR); flags &= ~(NFS_INO_INVALID_CHANGE | NFS_INO_INVALID_SIZE); } @@ -206,12 +208,15 @@ void nfs_set_cache_invalid(struct inode *inode, unsigned long flags) nfs_fscache_invalidate(inode, 0); flags &= ~NFS_INO_REVAL_FORCED; - nfsi->cache_validity |= flags; + flags |= nfsi->cache_validity; + if (inode->i_mapping->nrpages == 0) + flags &= ~NFS_INO_INVALID_DATA; - if (inode->i_mapping->nrpages == 0) { - nfsi->cache_validity &= ~NFS_INO_INVALID_DATA; - nfs_ooo_clear(nfsi); - } else if (nfsi->cache_validity & NFS_INO_INVALID_DATA) { + /* pairs with nfs_clear_invalid_mapping()'s smp_load_acquire() */ + smp_store_release(&nfsi->cache_validity, flags); + + if (inode->i_mapping->nrpages == 0 || + nfsi->cache_validity & NFS_INO_INVALID_DATA) { nfs_ooo_clear(nfsi); } trace_nfs_set_cache_invalid(inode, 0); @@ -276,6 +281,8 @@ EXPORT_SYMBOL_GPL(nfs_zap_acl_cache); void nfs_invalidate_atime(struct inode *inode) { + if (nfs_have_delegated_atime(inode)) + return; spin_lock(&inode->i_lock); nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATIME); spin_unlock(&inode->i_lock); @@ -468,7 +475,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) goto out_no_inode; } - if (inode->i_state & I_NEW) { + if (inode_state_read_once(inode) & I_NEW) { struct nfs_inode *nfsi = NFS_I(inode); unsigned long now = jiffies; @@ -491,6 +498,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) inode->i_fop = NFS_SB(sb)->nfs_client->rpc_ops->file_ops; inode->i_data.a_ops = &nfs_file_aops; nfs_inode_init_regular(nfsi); + mapping_set_large_folios(inode->i_mapping); } else if (S_ISDIR(inode->i_mode)) { inode->i_op = NFS_SB(sb)->nfs_client->rpc_ops->dir_inode_ops; inode->i_fop = &nfs_dir_operations; @@ -512,9 +520,10 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) } else init_special_inode(inode, inode->i_mode, fattr->rdev); - memset(&inode->i_atime, 0, sizeof(inode->i_atime)); - memset(&inode->i_mtime, 0, sizeof(inode->i_mtime)); - memset(&inode->i_ctime, 0, sizeof(inode->i_ctime)); + inode_set_atime(inode, 0, 0); + inode_set_mtime(inode, 0, 0); + inode_set_ctime(inode, 0, 0); + memset(&nfsi->btime, 0, sizeof(nfsi->btime)); inode_set_iversion_raw(inode, 0); inode->i_size = 0; clear_nlink(inode); @@ -527,17 +536,21 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) nfsi->read_cache_jiffies = fattr->time_start; nfsi->attr_gencount = fattr->gencount; if (fattr->valid & NFS_ATTR_FATTR_ATIME) - inode->i_atime = fattr->atime; + inode_set_atime_to_ts(inode, fattr->atime); else if (fattr_supported & NFS_ATTR_FATTR_ATIME) nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATIME); if (fattr->valid & NFS_ATTR_FATTR_MTIME) - inode->i_mtime = fattr->mtime; + inode_set_mtime_to_ts(inode, fattr->mtime); else if (fattr_supported & NFS_ATTR_FATTR_MTIME) nfs_set_cache_invalid(inode, NFS_INO_INVALID_MTIME); if (fattr->valid & NFS_ATTR_FATTR_CTIME) - inode->i_ctime = fattr->ctime; + inode_set_ctime_to_ts(inode, fattr->ctime); else if (fattr_supported & NFS_ATTR_FATTR_CTIME) nfs_set_cache_invalid(inode, NFS_INO_INVALID_CTIME); + if (fattr->valid & NFS_ATTR_FATTR_BTIME) + nfsi->btime = fattr->btime; + else if (fattr_supported & NFS_ATTR_FATTR_BTIME) + nfs_set_cache_invalid(inode, NFS_INO_INVALID_BTIME); if (fattr->valid & NFS_ATTR_FATTR_CHANGE) inode_set_iversion_raw(inode, fattr->change_attr); else @@ -550,6 +563,8 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) set_nlink(inode, fattr->nlink); else if (fattr_supported & NFS_ATTR_FATTR_NLINK) nfs_set_cache_invalid(inode, NFS_INO_INVALID_NLINK); + else + set_nlink(inode, 1); if (fattr->valid & NFS_ATTR_FATTR_OWNER) inode->i_uid = fattr->uid; else if (fattr_supported & NFS_ATTR_FATTR_OWNER) @@ -593,7 +608,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) inode->i_sb->s_id, (unsigned long long)NFS_FILEID(inode), nfs_display_fhandle_hash(fh), - atomic_read(&inode->i_count)); + icount_read(inode)); out: return inode; @@ -604,6 +619,95 @@ out_no_inode: } EXPORT_SYMBOL_GPL(nfs_fhget); +static void +nfs_fattr_fixup_delegated(struct inode *inode, struct nfs_fattr *fattr) +{ + unsigned long cache_validity = NFS_I(inode)->cache_validity; + + if (nfs_have_delegated_mtime(inode)) { + if (!(cache_validity & NFS_INO_INVALID_CTIME)) + fattr->valid &= ~(NFS_ATTR_FATTR_PRECTIME | + NFS_ATTR_FATTR_CTIME); + + if (!(cache_validity & NFS_INO_INVALID_MTIME)) + fattr->valid &= ~(NFS_ATTR_FATTR_PREMTIME | + NFS_ATTR_FATTR_MTIME); + + if (!(cache_validity & NFS_INO_INVALID_ATIME)) + fattr->valid &= ~NFS_ATTR_FATTR_ATIME; + } else if (nfs_have_delegated_atime(inode)) { + if (!(cache_validity & NFS_INO_INVALID_ATIME)) + fattr->valid &= ~NFS_ATTR_FATTR_ATIME; + } +} + +static void nfs_set_timestamps_to_ts(struct inode *inode, struct iattr *attr) +{ + unsigned int cache_flags = 0; + + if (attr->ia_valid & ATTR_MTIME_SET) { + struct timespec64 ctime = inode_get_ctime(inode); + struct timespec64 mtime = inode_get_mtime(inode); + struct timespec64 now; + int updated = 0; + + now = inode_set_ctime_current(inode); + if (!timespec64_equal(&now, &ctime)) + updated |= S_CTIME; + + inode_set_mtime_to_ts(inode, attr->ia_mtime); + if (!timespec64_equal(&now, &mtime)) + updated |= S_MTIME; + + inode_maybe_inc_iversion(inode, updated); + cache_flags |= NFS_INO_INVALID_CTIME | NFS_INO_INVALID_MTIME; + } + if (attr->ia_valid & ATTR_ATIME_SET) { + inode_set_atime_to_ts(inode, attr->ia_atime); + cache_flags |= NFS_INO_INVALID_ATIME; + } + NFS_I(inode)->cache_validity &= ~cache_flags; +} + +static void nfs_update_timestamps(struct inode *inode, unsigned int ia_valid) +{ + enum file_time_flags time_flags = 0; + unsigned int cache_flags = 0; + + if (ia_valid & ATTR_MTIME) { + time_flags |= S_MTIME | S_CTIME; + cache_flags |= NFS_INO_INVALID_CTIME | NFS_INO_INVALID_MTIME; + } + if (ia_valid & ATTR_ATIME) { + time_flags |= S_ATIME; + cache_flags |= NFS_INO_INVALID_ATIME; + } + inode_update_timestamps(inode, time_flags); + NFS_I(inode)->cache_validity &= ~cache_flags; +} + +void nfs_update_delegated_atime(struct inode *inode) +{ + spin_lock(&inode->i_lock); + if (nfs_have_delegated_atime(inode)) + nfs_update_timestamps(inode, ATTR_ATIME); + spin_unlock(&inode->i_lock); +} + +void nfs_update_delegated_mtime_locked(struct inode *inode) +{ + if (nfs_have_delegated_mtime(inode)) + nfs_update_timestamps(inode, ATTR_MTIME); +} + +void nfs_update_delegated_mtime(struct inode *inode) +{ + spin_lock(&inode->i_lock); + nfs_update_delegated_mtime_locked(inode); + spin_unlock(&inode->i_lock); +} +EXPORT_SYMBOL_GPL(nfs_update_delegated_mtime); + #define NFS_VALID_ATTRS (ATTR_MODE|ATTR_UID|ATTR_GID|ATTR_SIZE|ATTR_ATIME|ATTR_ATIME_SET|ATTR_MTIME|ATTR_MTIME_SET|ATTR_FILE|ATTR_OPEN) int @@ -612,7 +716,10 @@ nfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, { struct inode *inode = d_inode(dentry); struct nfs_fattr *fattr; + loff_t oldsize = i_size_read(inode); int error = 0; + kuid_t task_uid = current_fsuid(); + kuid_t owner_uid = inode->i_uid; nfs_inc_stats(inode, NFSIOS_VFSSETATTR); @@ -627,10 +734,39 @@ nfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, if (error) return error; - if (attr->ia_size == i_size_read(inode)) + if (attr->ia_size == oldsize) attr->ia_valid &= ~ATTR_SIZE; } + if (nfs_have_delegated_mtime(inode) && attr->ia_valid & ATTR_MTIME) { + spin_lock(&inode->i_lock); + if (attr->ia_valid & ATTR_MTIME_SET) { + if (uid_eq(task_uid, owner_uid)) { + nfs_set_timestamps_to_ts(inode, attr); + attr->ia_valid &= ~(ATTR_MTIME|ATTR_MTIME_SET| + ATTR_ATIME|ATTR_ATIME_SET); + } + } else { + nfs_update_timestamps(inode, attr->ia_valid); + attr->ia_valid &= ~(ATTR_MTIME|ATTR_ATIME); + } + spin_unlock(&inode->i_lock); + } else if (nfs_have_delegated_atime(inode) && + attr->ia_valid & ATTR_ATIME && + !(attr->ia_valid & ATTR_MTIME)) { + if (attr->ia_valid & ATTR_ATIME_SET) { + if (uid_eq(task_uid, owner_uid)) { + spin_lock(&inode->i_lock); + nfs_set_timestamps_to_ts(inode, attr); + spin_unlock(&inode->i_lock); + attr->ia_valid &= ~(ATTR_ATIME|ATTR_ATIME_SET); + } + } else { + nfs_update_delegated_atime(inode); + attr->ia_valid &= ~ATTR_ATIME; + } + } + /* Optimization: if the end result is no change, don't RPC */ if (((attr->ia_valid & NFS_VALID_ATTRS) & ~(ATTR_FILE|ATTR_OPEN)) == 0) return 0; @@ -638,8 +774,10 @@ nfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, trace_nfs_setattr_enter(inode); /* Write all dirty data */ - if (S_ISREG(inode->i_mode)) + if (S_ISREG(inode->i_mode)) { + nfs_file_block_o_direct(NFS_I(inode)); nfs_sync_inode(inode); + } fattr = nfs_alloc_fattr_with_label(NFS_SERVER(inode)); if (fattr == NULL) { @@ -648,8 +786,12 @@ nfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, } error = NFS_PROTO(inode)->setattr(dentry, fattr, attr); - if (error == 0) + if (error == 0) { + if (attr->ia_valid & ATTR_SIZE) + nfs_truncate_last_folio(inode->i_mapping, oldsize, + attr->ia_size); error = nfs_refresh_inode(inode, fattr); + } nfs_free_fattr(fattr); out: trace_nfs_setattr_exit(inode, error); @@ -686,6 +828,7 @@ static int nfs_vmtruncate(struct inode * inode, loff_t offset) spin_unlock(&inode->i_lock); truncate_pagecache(inode, offset); + nfs_update_delegated_mtime_locked(inode); spin_lock(&inode->i_lock); out: return err; @@ -709,8 +852,9 @@ void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr, spin_lock(&inode->i_lock); NFS_I(inode)->attr_gencount = fattr->gencount; if ((attr->ia_valid & ATTR_SIZE) != 0) { - nfs_set_cache_invalid(inode, NFS_INO_INVALID_MTIME | - NFS_INO_INVALID_BLOCKS); + if (!nfs_have_delegated_mtime(inode)) + nfs_set_cache_invalid(inode, NFS_INO_INVALID_MTIME); + nfs_set_cache_invalid(inode, NFS_INO_INVALID_BLOCKS); nfs_inc_stats(inode, NFSIOS_SETATTRTRUNC); nfs_vmtruncate(inode, attr->ia_size); } @@ -731,7 +875,7 @@ void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr, if ((attr->ia_valid & ATTR_GID) != 0) inode->i_gid = attr->ia_gid; if (fattr->valid & NFS_ATTR_FATTR_CTIME) - inode->i_ctime = fattr->ctime; + inode_set_ctime_to_ts(inode, fattr->ctime); else nfs_set_cache_invalid(inode, NFS_INO_INVALID_CHANGE | NFS_INO_INVALID_CTIME); @@ -742,14 +886,14 @@ void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr, NFS_I(inode)->cache_validity &= ~(NFS_INO_INVALID_ATIME | NFS_INO_INVALID_CTIME); if (fattr->valid & NFS_ATTR_FATTR_ATIME) - inode->i_atime = fattr->atime; + inode_set_atime_to_ts(inode, fattr->atime); else if (attr->ia_valid & ATTR_ATIME_SET) - inode->i_atime = attr->ia_atime; + inode_set_atime_to_ts(inode, attr->ia_atime); else nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATIME); if (fattr->valid & NFS_ATTR_FATTR_CTIME) - inode->i_ctime = fattr->ctime; + inode_set_ctime_to_ts(inode, fattr->ctime); else nfs_set_cache_invalid(inode, NFS_INO_INVALID_CHANGE | NFS_INO_INVALID_CTIME); @@ -758,14 +902,14 @@ void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr, NFS_I(inode)->cache_validity &= ~(NFS_INO_INVALID_MTIME | NFS_INO_INVALID_CTIME); if (fattr->valid & NFS_ATTR_FATTR_MTIME) - inode->i_mtime = fattr->mtime; + inode_set_mtime_to_ts(inode, fattr->mtime); else if (attr->ia_valid & ATTR_MTIME_SET) - inode->i_mtime = attr->ia_mtime; + inode_set_mtime_to_ts(inode, attr->ia_mtime); else nfs_set_cache_invalid(inode, NFS_INO_INVALID_MTIME); if (fattr->valid & NFS_ATTR_FATTR_CTIME) - inode->i_ctime = fattr->ctime; + inode_set_ctime_to_ts(inode, fattr->ctime); else nfs_set_cache_invalid(inode, NFS_INO_INVALID_CHANGE | NFS_INO_INVALID_CTIME); @@ -806,6 +950,7 @@ static void nfs_readdirplus_parent_cache_hit(struct dentry *dentry) static u32 nfs_get_valid_attrmask(struct inode *inode) { + u64 fattr_valid = NFS_SERVER(inode)->fattr_valid; unsigned long cache_validity = READ_ONCE(NFS_I(inode)->cache_validity); u32 reply_mask = STATX_INO | STATX_TYPE; @@ -825,6 +970,9 @@ static u32 nfs_get_valid_attrmask(struct inode *inode) reply_mask |= STATX_UID | STATX_GID; if (!(cache_validity & NFS_INO_INVALID_BLOCKS)) reply_mask |= STATX_BLOCKS; + if (!(cache_validity & NFS_INO_INVALID_BTIME) && + (fattr_valid & NFS_ATTR_FATTR_BTIME)) + reply_mask |= STATX_BTIME; if (!(cache_validity & NFS_INO_INVALID_CHANGE)) reply_mask |= STATX_CHANGE_COOKIE; return reply_mask; @@ -835,6 +983,7 @@ int nfs_getattr(struct mnt_idmap *idmap, const struct path *path, { struct inode *inode = d_inode(path->dentry); struct nfs_server *server = NFS_SERVER(inode); + u64 fattr_valid = server->fattr_valid; unsigned long cache_validity; int err = 0; bool force_sync = query_flags & AT_STATX_FORCE_SYNC; @@ -845,9 +994,12 @@ int nfs_getattr(struct mnt_idmap *idmap, const struct path *path, request_mask &= STATX_TYPE | STATX_MODE | STATX_NLINK | STATX_UID | STATX_GID | STATX_ATIME | STATX_MTIME | STATX_CTIME | - STATX_INO | STATX_SIZE | STATX_BLOCKS | + STATX_INO | STATX_SIZE | STATX_BLOCKS | STATX_BTIME | STATX_CHANGE_COOKIE; + if (!(fattr_valid & NFS_ATTR_FATTR_BTIME)) + request_mask &= ~STATX_BTIME; + if ((query_flags & AT_STATX_DONT_SYNC) && !force_sync) { if (readdirplus_enabled) nfs_readdirplus_parent_cache_hit(path->dentry); @@ -856,8 +1008,12 @@ int nfs_getattr(struct mnt_idmap *idmap, const struct path *path, /* Flush out writes to the server in order to update c/mtime/version. */ if ((request_mask & (STATX_CTIME | STATX_MTIME | STATX_CHANGE_COOKIE)) && - S_ISREG(inode->i_mode)) - filemap_write_and_wait(inode->i_mapping); + S_ISREG(inode->i_mode)) { + if (nfs_have_delegated_mtime(inode)) + filemap_fdatawrite(inode->i_mapping); + else + filemap_write_and_wait(inode->i_mapping); + } /* * We may force a getattr if the user cares about atime. @@ -875,7 +1031,7 @@ int nfs_getattr(struct mnt_idmap *idmap, const struct path *path, /* Is the user requesting attributes that might need revalidation? */ if (!(request_mask & (STATX_MODE|STATX_NLINK|STATX_ATIME|STATX_CTIME| STATX_MTIME|STATX_UID|STATX_GID| - STATX_SIZE|STATX_BLOCKS| + STATX_SIZE|STATX_BLOCKS|STATX_BTIME| STATX_CHANGE_COOKIE))) goto out_no_revalidate; @@ -899,6 +1055,8 @@ int nfs_getattr(struct mnt_idmap *idmap, const struct path *path, do_update |= cache_validity & NFS_INO_INVALID_OTHER; if (request_mask & STATX_BLOCKS) do_update |= cache_validity & NFS_INO_INVALID_BLOCKS; + if (request_mask & STATX_BTIME) + do_update |= cache_validity & NFS_INO_INVALID_BTIME; if (do_update) { if (readdirplus_enabled) @@ -912,7 +1070,7 @@ out_no_revalidate: /* Only return attributes that were revalidated. */ stat->result_mask = nfs_get_valid_attrmask(inode) | request_mask; - generic_fillattr(&nop_mnt_idmap, inode, stat); + generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); stat->ino = nfs_compat_user_ino64(NFS_FILEID(inode)); stat->change_cookie = inode_peek_iversion_raw(inode); stat->attributes_mask |= STATX_ATTR_CHANGE_MONOTONIC; @@ -920,6 +1078,22 @@ out_no_revalidate: stat->attributes |= STATX_ATTR_CHANGE_MONOTONIC; if (S_ISDIR(inode->i_mode)) stat->blksize = NFS_SERVER(inode)->dtsize; + stat->btime = NFS_I(inode)->btime; + + /* Special handling for STATX_DIOALIGN and STATX_DIO_READ_ALIGN + * - NFS doesn't have DIO alignment constraints, avoid getting + * these DIO attrs from remote and just respond with most + * accommodating limits (so client will issue supported DIO). + * - this is unintuitive, but the most coarse-grained + * dio_offset_align is the most accommodating. + */ + if ((request_mask & (STATX_DIOALIGN | STATX_DIO_READ_ALIGN)) && + S_ISREG(inode->i_mode)) { + stat->result_mask |= STATX_DIOALIGN | STATX_DIO_READ_ALIGN; + stat->dio_mem_align = 4; /* 4-byte alignment */ + stat->dio_offset_align = PAGE_SIZE; + stat->dio_read_offset_align = stat->dio_offset_align; + } out: trace_nfs_getattr_exit(inode, err); return err; @@ -1012,7 +1186,7 @@ void nfs_close_context(struct nfs_open_context *ctx, int is_sync) if (!is_sync) return; inode = d_inode(ctx->dentry); - if (NFS_PROTO(inode)->have_delegation(inode, FMODE_READ)) + if (nfs_have_read_or_write_delegation(inode)) return; nfsi = NFS_I(inode); if (inode->i_mapping->nrpages == 0) @@ -1053,6 +1227,8 @@ struct nfs_open_context *alloc_nfs_open_context(struct dentry *dentry, ctx->lock_context.open_context = ctx; INIT_LIST_HEAD(&ctx->list); ctx->mdsthreshold = NULL; + nfs_localio_file_init(&ctx->nfl); + return ctx; } EXPORT_SYMBOL_GPL(alloc_nfs_open_context); @@ -1084,6 +1260,7 @@ static void __put_nfs_open_context(struct nfs_open_context *ctx, int is_sync) nfs_sb_deactive(sb); put_rpccred(rcu_dereference_protected(ctx->ll_cred, 1)); kfree(ctx->mdsthreshold); + nfs_close_local_fh(&ctx->nfl); kfree_rcu(ctx, rcu_head); } @@ -1340,6 +1517,13 @@ int nfs_clear_invalid_mapping(struct address_space *mapping) TASK_KILLABLE|TASK_FREEZABLE_UNSAFE); if (ret) goto out; + smp_rmb(); /* pairs with smp_wmb() below */ + if (test_bit(NFS_INO_INVALIDATING, bitlock)) + continue; + /* pairs with nfs_set_cache_invalid()'s smp_store_release() */ + if (!(smp_load_acquire(&nfsi->cache_validity) & NFS_INO_INVALID_DATA)) + goto out; + /* Slow-path that double-checks with spinlock held */ spin_lock(&inode->i_lock); if (test_bit(NFS_INO_INVALIDATING, bitlock)) { spin_unlock(&inode->i_lock); @@ -1444,18 +1628,18 @@ static void nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr) nfs_set_cache_invalid(inode, NFS_INO_INVALID_XATTR); } /* If we have atomic WCC data, we may update some attributes */ - ts = inode->i_ctime; + ts = inode_get_ctime(inode); if ((fattr->valid & NFS_ATTR_FATTR_PRECTIME) && (fattr->valid & NFS_ATTR_FATTR_CTIME) && timespec64_equal(&ts, &fattr->pre_ctime)) { - inode->i_ctime = fattr->ctime; + inode_set_ctime_to_ts(inode, fattr->ctime); } - ts = inode->i_mtime; + ts = inode_get_mtime(inode); if ((fattr->valid & NFS_ATTR_FATTR_PREMTIME) && (fattr->valid & NFS_ATTR_FATTR_MTIME) && timespec64_equal(&ts, &fattr->pre_mtime)) { - inode->i_mtime = fattr->mtime; + inode_set_mtime_to_ts(inode, fattr->mtime); } if ((fattr->valid & NFS_ATTR_FATTR_PRESIZE) && (fattr->valid & NFS_ATTR_FATTR_SIZE) @@ -1482,7 +1666,7 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat unsigned long invalid = 0; struct timespec64 ts; - if (NFS_PROTO(inode)->have_delegation(inode, FMODE_READ)) + if (nfs_have_delegated_attributes(inode)) return 0; if (!(fattr->valid & NFS_ATTR_FATTR_FILEID)) { @@ -1506,11 +1690,11 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 && !inode_eq_iversion_raw(inode, fattr->change_attr)) invalid |= NFS_INO_INVALID_CHANGE; - ts = inode->i_mtime; + ts = inode_get_mtime(inode); if ((fattr->valid & NFS_ATTR_FATTR_MTIME) && !timespec64_equal(&ts, &fattr->mtime)) invalid |= NFS_INO_INVALID_MTIME; - ts = inode->i_ctime; + ts = inode_get_ctime(inode); if ((fattr->valid & NFS_ATTR_FATTR_CTIME) && !timespec64_equal(&ts, &fattr->ctime)) invalid |= NFS_INO_INVALID_CTIME; @@ -1534,7 +1718,7 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat if ((fattr->valid & NFS_ATTR_FATTR_NLINK) && inode->i_nlink != fattr->nlink) invalid |= NFS_INO_INVALID_NLINK; - ts = inode->i_atime; + ts = inode_get_atime(inode); if ((fattr->valid & NFS_ATTR_FATTR_ATIME) && !timespec64_equal(&ts, &fattr->atime)) invalid |= NFS_INO_INVALID_ATIME; @@ -1565,6 +1749,7 @@ void nfs_fattr_init(struct nfs_fattr *fattr) fattr->gencount = nfs_inc_attr_generation_counter(); fattr->owner_name = NULL; fattr->group_name = NULL; + fattr->mdsthreshold = NULL; } EXPORT_SYMBOL_GPL(nfs_fattr_init); @@ -1803,7 +1988,7 @@ static int nfs_inode_finish_partial_attr_update(const struct nfs_fattr *fattr, NFS_INO_INVALID_ATIME | NFS_INO_INVALID_CTIME | NFS_INO_INVALID_MTIME | NFS_INO_INVALID_SIZE | NFS_INO_INVALID_BLOCKS | NFS_INO_INVALID_OTHER | - NFS_INO_INVALID_NLINK; + NFS_INO_INVALID_NLINK | NFS_INO_INVALID_BTIME; unsigned long cache_validity = NFS_I(inode)->cache_validity; enum nfs4_change_attr_type ctype = NFS_SERVER(inode)->change_attr_type; @@ -1997,12 +2182,12 @@ int nfs_post_op_update_inode_force_wcc_locked(struct inode *inode, struct nfs_fa } if ((fattr->valid & NFS_ATTR_FATTR_CTIME) != 0 && (fattr->valid & NFS_ATTR_FATTR_PRECTIME) == 0) { - fattr->pre_ctime = inode->i_ctime; + fattr->pre_ctime = inode_get_ctime(inode); fattr->valid |= NFS_ATTR_FATTR_PRECTIME; } if ((fattr->valid & NFS_ATTR_FATTR_MTIME) != 0 && (fattr->valid & NFS_ATTR_FATTR_PREMTIME) == 0) { - fattr->pre_mtime = inode->i_mtime; + fattr->pre_mtime = inode_get_mtime(inode); fattr->valid |= NFS_ATTR_FATTR_PREMTIME; } if ((fattr->valid & NFS_ATTR_FATTR_SIZE) != 0 && @@ -2069,10 +2254,10 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) bool attr_changed = false; bool have_delegation; - dfprintk(VFS, "NFS: %s(%s/%lu fh_crc=0x%08x ct=%d info=0x%x)\n", + dfprintk(VFS, "NFS: %s(%s/%lu fh_crc=0x%08x ct=%d info=0x%llx)\n", __func__, inode->i_sb->s_id, inode->i_ino, nfs_display_fhandle_hash(NFS_FH(inode)), - atomic_read(&inode->i_count), fattr->valid); + icount_read(inode), fattr->valid); if (!(fattr->valid & NFS_ATTR_FATTR_FILEID)) { /* Only a mounted-on-fileid? Just exit */ @@ -2118,6 +2303,9 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) */ nfsi->read_cache_jiffies = fattr->time_start; + /* Fix up any delegated attributes in the struct nfs_fattr */ + nfs_fattr_fixup_delegated(inode, fattr); + save_cache_validity = nfsi->cache_validity; nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ATIME @@ -2161,7 +2349,8 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) | NFS_INO_INVALID_BLOCKS | NFS_INO_INVALID_NLINK | NFS_INO_INVALID_MODE - | NFS_INO_INVALID_OTHER; + | NFS_INO_INVALID_OTHER + | NFS_INO_INVALID_BTIME; if (S_ISDIR(inode->i_mode)) nfs_force_lookup_revalidate(inode); attr_changed = true; @@ -2184,17 +2373,23 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) } if (fattr->valid & NFS_ATTR_FATTR_MTIME) - inode->i_mtime = fattr->mtime; + inode_set_mtime_to_ts(inode, fattr->mtime); else if (fattr_supported & NFS_ATTR_FATTR_MTIME) nfsi->cache_validity |= save_cache_validity & NFS_INO_INVALID_MTIME; if (fattr->valid & NFS_ATTR_FATTR_CTIME) - inode->i_ctime = fattr->ctime; + inode_set_ctime_to_ts(inode, fattr->ctime); else if (fattr_supported & NFS_ATTR_FATTR_CTIME) nfsi->cache_validity |= save_cache_validity & NFS_INO_INVALID_CTIME; + if (fattr->valid & NFS_ATTR_FATTR_BTIME) + nfsi->btime = fattr->btime; + else if (fattr_supported & NFS_ATTR_FATTR_BTIME) + nfsi->cache_validity |= + save_cache_validity & NFS_INO_INVALID_BTIME; + /* Check if our cached file size is stale */ if (fattr->valid & NFS_ATTR_FATTR_SIZE) { new_isize = nfs_size_to_loff_t(fattr->size); @@ -2220,7 +2415,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) save_cache_validity & NFS_INO_INVALID_SIZE; if (fattr->valid & NFS_ATTR_FATTR_ATIME) - inode->i_atime = fattr->atime; + inode_set_atime_to_ts(inode, fattr->atime); else if (fattr_supported & NFS_ATTR_FATTR_ATIME) nfsi->cache_validity |= save_cache_validity & NFS_INO_INVALID_ATIME; @@ -2372,7 +2567,7 @@ static int __init nfs_init_inodecache(void) nfs_inode_cachep = kmem_cache_create("nfs_inode_cache", sizeof(struct nfs_inode), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD|SLAB_ACCOUNT), + SLAB_ACCOUNT), init_once); if (nfs_inode_cachep == NULL) return -ENOMEM; @@ -2390,35 +2585,54 @@ static void nfs_destroy_inodecache(void) kmem_cache_destroy(nfs_inode_cachep); } +struct workqueue_struct *nfslocaliod_workqueue; struct workqueue_struct *nfsiod_workqueue; EXPORT_SYMBOL_GPL(nfsiod_workqueue); /* - * start up the nfsiod workqueue + * Destroy the nfsiod workqueues */ -static int nfsiod_start(void) +static void nfsiod_stop(void) { struct workqueue_struct *wq; - dprintk("RPC: creating workqueue nfsiod\n"); - wq = alloc_workqueue("nfsiod", WQ_MEM_RECLAIM | WQ_UNBOUND, 0); - if (wq == NULL) - return -ENOMEM; - nfsiod_workqueue = wq; - return 0; + + wq = nfsiod_workqueue; + if (wq != NULL) { + nfsiod_workqueue = NULL; + destroy_workqueue(wq); + } +#if IS_ENABLED(CONFIG_NFS_LOCALIO) + wq = nfslocaliod_workqueue; + if (wq != NULL) { + nfslocaliod_workqueue = NULL; + destroy_workqueue(wq); + } +#endif /* CONFIG_NFS_LOCALIO */ } /* - * Destroy the nfsiod workqueue + * Start the nfsiod workqueues */ -static void nfsiod_stop(void) +static int nfsiod_start(void) { - struct workqueue_struct *wq; - - wq = nfsiod_workqueue; - if (wq == NULL) - return; - nfsiod_workqueue = NULL; - destroy_workqueue(wq); + dprintk("RPC: creating workqueue nfsiod\n"); + nfsiod_workqueue = alloc_workqueue("nfsiod", WQ_MEM_RECLAIM | WQ_UNBOUND, 0); + if (nfsiod_workqueue == NULL) + return -ENOMEM; +#if IS_ENABLED(CONFIG_NFS_LOCALIO) + /* + * localio writes need to use a normal (non-memreclaim) workqueue. + * When we start getting low on space, XFS goes and calls flush_work() on + * a non-memreclaim work queue, which causes a priority inversion problem. + */ + dprintk("RPC: creating workqueue nfslocaliod\n"); + nfslocaliod_workqueue = alloc_workqueue("nfslocaliod", WQ_UNBOUND, 0); + if (unlikely(nfslocaliod_workqueue == NULL)) { + nfsiod_stop(); + return -ENOMEM; + } +#endif /* CONFIG_NFS_LOCALIO */ + return 0; } unsigned int nfs_net_id; @@ -2426,12 +2640,32 @@ EXPORT_SYMBOL_GPL(nfs_net_id); static int nfs_net_init(struct net *net) { + struct nfs_net *nn = net_generic(net, nfs_net_id); + int err; + nfs_clients_init(net); - return nfs_fs_proc_net_init(net); + + if (!rpc_proc_register(net, &nn->rpcstats)) { + err = -ENOMEM; + goto err_proc_rpc; + } + + err = nfs_fs_proc_net_init(net); + if (err) + goto err_proc_nfs; + + return 0; + +err_proc_nfs: + rpc_proc_unregister(net, "nfs"); +err_proc_rpc: + nfs_clients_exit(net); + return err; } static void nfs_net_exit(struct net *net) { + rpc_proc_unregister(net, "nfs"); nfs_fs_proc_net_exit(net); nfs_clients_exit(net); } @@ -2443,6 +2677,35 @@ static struct pernet_operations nfs_net_ops = { .size = sizeof(struct nfs_net), }; +#ifdef CONFIG_KEYS +static struct key *nfs_keyring; + +static int __init nfs_init_keyring(void) +{ + nfs_keyring = keyring_alloc(".nfs", + GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, + current_cred(), + (KEY_POS_ALL & ~KEY_POS_SETATTR) | + (KEY_USR_ALL & ~KEY_USR_SETATTR), + KEY_ALLOC_NOT_IN_QUOTA, NULL, NULL); + return PTR_ERR_OR_ZERO(nfs_keyring); +} + +static void nfs_exit_keyring(void) +{ + key_put(nfs_keyring); +} +#else +static inline int nfs_init_keyring(void) +{ + return 0; +} + +static inline void nfs_exit_keyring(void) +{ +} +#endif /* CONFIG_KEYS */ + /* * Initialize NFS */ @@ -2450,6 +2713,10 @@ static int __init init_nfs_fs(void) { int err; + err = nfs_init_keyring(); + if (err) + return err; + err = nfs_sysfs_init(); if (err < 0) goto out10; @@ -2486,15 +2753,12 @@ static int __init init_nfs_fs(void) if (err) goto out1; - rpc_proc_register(&init_net, &nfs_rpcstat); - err = register_nfs_fs(); if (err) goto out0; return 0; out0: - rpc_proc_unregister(&init_net, "nfs"); nfs_destroy_directcache(); out1: nfs_destroy_writepagecache(); @@ -2513,6 +2777,7 @@ out7: out9: nfs_sysfs_exit(); out10: + nfs_exit_keyring(); return err; } @@ -2524,15 +2789,16 @@ static void __exit exit_nfs_fs(void) nfs_destroy_inodecache(); nfs_destroy_nfspagecache(); unregister_pernet_subsys(&nfs_net_ops); - rpc_proc_unregister(&init_net, "nfs"); unregister_nfs_fs(); nfs_fs_proc_exit(); nfsiod_stop(); nfs_sysfs_exit(); + nfs_exit_keyring(); } /* Not quite true; I just maintain it */ MODULE_AUTHOR("Olaf Kirch <okir@monad.swb.de>"); +MODULE_DESCRIPTION("NFS client support"); MODULE_LICENSE("GPL"); module_param(enable_ino64, bool, 0644); diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 913c09806c7f..2ecd38e1d17a 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -6,12 +6,14 @@ #include "nfs4_fs.h" #include <linux/fs_context.h> #include <linux/security.h> +#include <linux/compiler_attributes.h> #include <linux/crc32.h> #include <linux/sunrpc/addr.h> #include <linux/nfs_page.h> +#include <linux/nfslocalio.h> #include <linux/wait_bit.h> -#define NFS_SB_MASK (SB_RDONLY|SB_NOSUID|SB_NODEV|SB_NOEXEC|SB_SYNCHRONOUS) +#define NFS_SB_MASK (SB_NOSUID|SB_NODEV|SB_NOEXEC|SB_SYNCHRONOUS) extern const struct export_operations nfs_export_ops; @@ -82,6 +84,8 @@ struct nfs_client_initdata { const struct rpc_timeout *timeparms; const struct cred *cred; struct xprtsec_parms xprtsec; + unsigned long connect_timeout; + unsigned long reconnect_timeout; }; /* @@ -110,6 +114,7 @@ struct nfs_fs_context { unsigned short protofamily; unsigned short mountfamily; bool has_sec_mnt_opts; + int lock_status; struct { union { @@ -151,6 +156,12 @@ struct nfs_fs_context { } clone_data; }; +enum nfs_lock_status { + NFS_LOCK_NOT_SET = 0, + NFS_LOCK_LOCK = 1, + NFS_LOCK_NOLOCK = 2, +}; + #define nfs_errorf(fc, fmt, ...) ((fc)->log.log ? \ errorf(fc, fmt, ## __VA_ARGS__) : \ ({ dprintk(fmt "\n", ## __VA_ARGS__); })) @@ -196,7 +207,6 @@ struct nfs_mount_request { }; extern int nfs_mount(struct nfs_mount_request *info, int timeo, int retrans); -extern void nfs_umount(const struct nfs_mount_request *info); /* client.c */ extern const struct rpc_program nfs_program; @@ -221,7 +231,7 @@ extern struct nfs_client * nfs4_find_client_sessionid(struct net *, const struct sockaddr *, struct nfs4_sessionid *, u32); extern struct nfs_server *nfs_create_server(struct fs_context *); -extern void nfs4_server_set_init_caps(struct nfs_server *); +extern void nfs_server_set_init_caps(struct nfs_server *); extern struct nfs_server *nfs4_create_server(struct fs_context *); extern struct nfs_server *nfs4_create_referral_server(struct fs_context *); extern int nfs4_update_server(struct nfs_server *server, const char *hostname, @@ -299,7 +309,8 @@ void nfs_pgio_header_free(struct nfs_pgio_header *); int nfs_generic_pgio(struct nfs_pageio_descriptor *, struct nfs_pgio_header *); int nfs_initiate_pgio(struct rpc_clnt *clnt, struct nfs_pgio_header *hdr, const struct cred *cred, const struct nfs_rpc_ops *rpc_ops, - const struct rpc_call_ops *call_ops, int how, int flags); + const struct rpc_call_ops *call_ops, int how, int flags, + struct nfsd_file *localio); void nfs_free_request(struct nfs_page *req); struct nfs_pgio_mirror * nfs_pgio_current_mirror(struct nfs_pageio_descriptor *desc); @@ -388,8 +399,8 @@ struct dentry *nfs_lookup(struct inode *, struct dentry *, unsigned int); void nfs_d_prune_case_insensitive_aliases(struct inode *inode); int nfs_create(struct mnt_idmap *, struct inode *, struct dentry *, umode_t, bool); -int nfs_mkdir(struct mnt_idmap *, struct inode *, struct dentry *, - umode_t); +struct dentry *nfs_mkdir(struct mnt_idmap *, struct inode *, struct dentry *, + umode_t); int nfs_rmdir(struct inode *, struct dentry *); int nfs_unlink(struct inode *, struct dentry *); int nfs_symlink(struct mnt_idmap *, struct inode *, struct dentry *, @@ -420,15 +431,18 @@ loff_t nfs_file_llseek(struct file *, loff_t, int); ssize_t nfs_file_read(struct kiocb *, struct iov_iter *); ssize_t nfs_file_splice_read(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags); -int nfs_file_mmap(struct file *, struct vm_area_struct *); +int nfs_file_mmap_prepare(struct vm_area_desc *); ssize_t nfs_file_write(struct kiocb *, struct iov_iter *); int nfs_file_release(struct inode *, struct file *); int nfs_lock(struct file *, int, struct file_lock *); int nfs_flock(struct file *, int, struct file_lock *); int nfs_check_flags(int); +void nfs_truncate_last_folio(struct address_space *mapping, loff_t from, + loff_t to); /* inode.c */ extern struct workqueue_struct *nfsiod_workqueue; +extern struct workqueue_struct *nfslocaliod_workqueue; extern struct inode *nfs_alloc_inode(struct super_block *sb); extern void nfs_free_inode(struct inode *); extern int nfs_write_inode(struct inode *, struct writeback_control *); @@ -440,6 +454,63 @@ extern void nfs_set_cache_invalid(struct inode *inode, unsigned long flags); extern bool nfs_check_cache_invalid(struct inode *, unsigned long); extern int nfs_wait_bit_killable(struct wait_bit_key *key, int mode); +#if IS_ENABLED(CONFIG_NFS_LOCALIO) +/* localio.c */ +struct nfs_local_dio { + u32 mem_align; + u32 offset_align; + loff_t middle_offset; + loff_t end_offset; + ssize_t start_len; /* Length for misaligned first extent */ + ssize_t middle_len; /* Length for DIO-aligned middle extent */ + ssize_t end_len; /* Length for misaligned last extent */ +}; + +extern void nfs_local_probe_async(struct nfs_client *); +extern void nfs_local_probe_async_work(struct work_struct *); +extern struct nfsd_file *nfs_local_open_fh(struct nfs_client *, + const struct cred *, + struct nfs_fh *, + struct nfs_file_localio *, + const fmode_t); +extern int nfs_local_doio(struct nfs_client *, + struct nfsd_file *, + struct nfs_pgio_header *, + const struct rpc_call_ops *); +extern int nfs_local_commit(struct nfsd_file *, + struct nfs_commit_data *, + const struct rpc_call_ops *, int); +extern bool nfs_server_is_local(const struct nfs_client *clp); + +#else /* CONFIG_NFS_LOCALIO */ +static inline void nfs_local_probe(struct nfs_client *clp) {} +static inline void nfs_local_probe_async(struct nfs_client *clp) {} +static inline struct nfsd_file * +nfs_local_open_fh(struct nfs_client *clp, const struct cred *cred, + struct nfs_fh *fh, struct nfs_file_localio *nfl, + const fmode_t mode) +{ + return NULL; +} +static inline int nfs_local_doio(struct nfs_client *clp, + struct nfsd_file *localio, + struct nfs_pgio_header *hdr, + const struct rpc_call_ops *call_ops) +{ + return -EINVAL; +} +static inline int nfs_local_commit(struct nfsd_file *localio, + struct nfs_commit_data *data, + const struct rpc_call_ops *call_ops, int how) +{ + return -EINVAL; +} +static inline bool nfs_server_is_local(const struct nfs_client *clp) +{ + return false; +} +#endif /* CONFIG_NFS_LOCALIO */ + /* super.c */ extern const struct super_operations nfs_sops; bool nfs_auth_info_match(const struct nfs_auth_info *, rpc_authflavor_t); @@ -447,8 +518,6 @@ int nfs_try_get_tree(struct fs_context *); int nfs_get_tree_common(struct fs_context *); void nfs_kill_super(struct super_block *); -extern struct rpc_stat nfs_rpcstat; - extern int __init register_nfs_fs(void); extern void __exit unregister_nfs_fs(void); extern bool nfs_sb_active(struct super_block *sb); @@ -461,11 +530,11 @@ extern const struct netfs_request_ops nfs_netfs_ops; #endif /* io.c */ -extern void nfs_start_io_read(struct inode *inode); +extern __must_check int nfs_start_io_read(struct inode *inode); extern void nfs_end_io_read(struct inode *inode); -extern void nfs_start_io_write(struct inode *inode); +extern __must_check int nfs_start_io_write(struct inode *inode); extern void nfs_end_io_write(struct inode *inode); -extern void nfs_start_io_direct(struct inode *inode); +extern __must_check int nfs_start_io_direct(struct inode *inode); extern void nfs_end_io_direct(struct inode *inode); static inline bool nfs_file_io_is_buffered(struct nfs_inode *nfsi) @@ -473,6 +542,16 @@ static inline bool nfs_file_io_is_buffered(struct nfs_inode *nfsi) return test_bit(NFS_INO_ODIRECT, &nfsi->flags) == 0; } +/* Must be called with exclusively locked inode->i_rwsem */ +static inline void nfs_file_block_o_direct(struct nfs_inode *nfsi) +{ + if (test_bit(NFS_INO_ODIRECT, &nfsi->flags)) { + clear_bit(NFS_INO_ODIRECT, &nfsi->flags); + inode_dio_wait(&nfsi->vfs_inode); + } +} + + /* namespace.c */ #define NFS_PATH_CANONICAL 1 extern char *nfs_path(char **p, struct dentry *dentry, @@ -493,11 +572,11 @@ extern const struct nfs_pgio_completion_ops nfs_async_read_completion_ops; extern void nfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, struct inode *inode, bool force_mds, const struct nfs_pgio_completion_ops *compl_ops); +extern bool nfs_read_alloc_scratch(struct nfs_pgio_header *hdr, size_t size); extern int nfs_read_add_folio(struct nfs_pageio_descriptor *pgio, struct nfs_open_context *ctx, struct folio *folio); extern void nfs_pageio_complete_read(struct nfs_pageio_descriptor *pgio); -extern void nfs_read_prepare(struct rpc_task *task, void *calldata); extern void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio); /* super.c */ @@ -520,7 +599,8 @@ extern int nfs_initiate_commit(struct rpc_clnt *clnt, struct nfs_commit_data *data, const struct nfs_rpc_ops *nfs_ops, const struct rpc_call_ops *call_ops, - int how, int flags); + int how, int flags, + struct nfsd_file *localio); extern void nfs_init_commit(struct nfs_commit_data *data, struct list_head *head, struct pnfs_layout_segment *lseg, @@ -612,9 +692,12 @@ nfs_write_match_verf(const struct nfs_writeverf *verf, static inline gfp_t nfs_io_gfp_mask(void) { - if (current->flags & PF_WQ_WORKER) - return GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN; - return GFP_KERNEL; + gfp_t ret = current_gfp_context(GFP_KERNEL); + + /* For workers __GFP_NORETRY only with __GFP_IO or __GFP_FS */ + if ((current->flags & PF_WQ_WORKER) && ret == GFP_KERNEL) + ret |= __GFP_NORETRY | __GFP_NOWARN; + return ret; } /* @@ -652,7 +735,7 @@ extern int nfs_sillyrename(struct inode *dir, struct dentry *dentry); /* direct.c */ void nfs_init_cinfo_from_dreq(struct nfs_commit_info *cinfo, struct nfs_direct_req *dreq); -extern ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq); +extern ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq, loff_t offset); /* nfs4proc.c */ extern struct nfs_client *nfs4_init_client(struct nfs_client *clp, @@ -709,9 +792,9 @@ unsigned long nfs_block_bits(unsigned long bsize, unsigned char *nrbitsp) if ((bsize & (bsize - 1)) || nrbitsp) { unsigned char nrbits; - for (nrbits = 31; nrbits && !(bsize & (1 << nrbits)); nrbits--) + for (nrbits = 31; nrbits && !(bsize & (1UL << nrbits)); nrbits--) ; - bsize = 1 << nrbits; + bsize = 1UL << nrbits; if (nrbitsp) *nrbitsp = nrbits; } @@ -777,7 +860,7 @@ static inline void nfs_folio_mark_unstable(struct folio *folio, struct nfs_commit_info *cinfo) { if (folio && !cinfo->dreq) { - struct inode *inode = folio_file_mapping(folio)->host; + struct inode *inode = folio->mapping->host; long nr = folio_nr_pages(folio); /* This page is really still in write-back - just that the @@ -792,31 +875,12 @@ static inline void nfs_folio_mark_unstable(struct folio *folio, /* * Determine the number of bytes of data the page contains */ -static inline -unsigned int nfs_page_length(struct page *page) -{ - loff_t i_size = i_size_read(page_file_mapping(page)->host); - - if (i_size > 0) { - pgoff_t index = page_index(page); - pgoff_t end_index = (i_size - 1) >> PAGE_SHIFT; - if (index < end_index) - return PAGE_SIZE; - if (index == end_index) - return ((i_size - 1) & ~PAGE_MASK) + 1; - } - return 0; -} - -/* - * Determine the number of bytes of data the page contains - */ static inline size_t nfs_folio_length(struct folio *folio) { - loff_t i_size = i_size_read(folio_file_mapping(folio)->host); + loff_t i_size = i_size_read(folio->mapping->host); if (i_size > 0) { - pgoff_t index = folio_index(folio) >> folio_order(folio); + pgoff_t index = folio->index >> folio_order(folio); pgoff_t end_index = (i_size - 1) >> folio_shift(folio); if (index < end_index) return folio_size(folio); @@ -858,18 +922,16 @@ u64 nfs_timespec_to_change_attr(const struct timespec64 *ts) return ((u64)ts->tv_sec << 30) + ts->tv_nsec; } -#ifdef CONFIG_CRC32 static inline u32 nfs_stateid_hash(const nfs4_stateid *stateid) { return ~crc32_le(0xFFFFFFFF, &stateid->other[0], NFS4_STATEID_OTHER_SIZE); } -#else -static inline u32 nfs_stateid_hash(nfs4_stateid *stateid) + +static inline bool nfs_current_task_exiting(void) { - return 0; + return (current->flags & PF_EXITING) != 0; } -#endif static inline bool nfs_error_is_fatal(int err) { @@ -933,7 +995,6 @@ struct nfs_direct_req { loff_t io_start; /* Start offset for I/O */ ssize_t count, /* bytes actually processed */ max_count, /* max expected count */ - bytes_left, /* bytes left to be sent */ error; /* any reported error */ struct completion completion; /* wait for i/o completion */ diff --git a/fs/nfs/io.c b/fs/nfs/io.c index b5551ed8f648..d275b0a250bf 100644 --- a/fs/nfs/io.c +++ b/fs/nfs/io.c @@ -14,15 +14,6 @@ #include "internal.h" -/* Call with exclusively locked inode->i_rwsem */ -static void nfs_block_o_direct(struct nfs_inode *nfsi, struct inode *inode) -{ - if (test_bit(NFS_INO_ODIRECT, &nfsi->flags)) { - clear_bit(NFS_INO_ODIRECT, &nfsi->flags); - inode_dio_wait(inode); - } -} - /** * nfs_start_io_read - declare the file is being used for buffered reads * @inode: file inode @@ -39,19 +30,28 @@ static void nfs_block_o_direct(struct nfs_inode *nfsi, struct inode *inode) * Note that buffered writes and truncates both take a write lock on * inode->i_rwsem, meaning that those are serialised w.r.t. the reads. */ -void +int nfs_start_io_read(struct inode *inode) { struct nfs_inode *nfsi = NFS_I(inode); + int err; + /* Be an optimist! */ - down_read(&inode->i_rwsem); + err = down_read_killable(&inode->i_rwsem); + if (err) + return err; if (test_bit(NFS_INO_ODIRECT, &nfsi->flags) == 0) - return; + return 0; up_read(&inode->i_rwsem); + /* Slow path.... */ - down_write(&inode->i_rwsem); - nfs_block_o_direct(nfsi, inode); + err = down_write_killable(&inode->i_rwsem); + if (err) + return err; + nfs_file_block_o_direct(nfsi); downgrade_write(&inode->i_rwsem); + + return 0; } /** @@ -74,11 +74,15 @@ nfs_end_io_read(struct inode *inode) * Declare that a buffered read operation is about to start, and ensure * that we block all direct I/O. */ -void +int nfs_start_io_write(struct inode *inode) { - down_write(&inode->i_rwsem); - nfs_block_o_direct(NFS_I(inode), inode); + int err; + + err = down_write_killable(&inode->i_rwsem); + if (!err) + nfs_file_block_o_direct(NFS_I(inode)); + return err; } /** @@ -119,19 +123,28 @@ static void nfs_block_buffered(struct nfs_inode *nfsi, struct inode *inode) * Note that buffered writes and truncates both take a write lock on * inode->i_rwsem, meaning that those are serialised w.r.t. O_DIRECT. */ -void +int nfs_start_io_direct(struct inode *inode) { struct nfs_inode *nfsi = NFS_I(inode); + int err; + /* Be an optimist! */ - down_read(&inode->i_rwsem); + err = down_read_killable(&inode->i_rwsem); + if (err) + return err; if (test_bit(NFS_INO_ODIRECT, &nfsi->flags) != 0) - return; + return 0; up_read(&inode->i_rwsem); + /* Slow path.... */ - down_write(&inode->i_rwsem); + err = down_write_killable(&inode->i_rwsem); + if (err) + return err; nfs_block_buffered(nfsi, inode); downgrade_write(&inode->i_rwsem); + + return 0; } /** diff --git a/fs/nfs/iostat.h b/fs/nfs/iostat.h index 5aa776b5a3e7..49862c95b224 100644 --- a/fs/nfs/iostat.h +++ b/fs/nfs/iostat.h @@ -46,10 +46,11 @@ static inline void nfs_add_stats(const struct inode *inode, nfs_add_server_stats(NFS_SERVER(inode), stat, addend); } -static inline struct nfs_iostats __percpu *nfs_alloc_iostats(void) -{ - return alloc_percpu(struct nfs_iostats); -} +/* + * This specialized allocator has to be a macro for its allocations to be + * accounted separately (to have a separate alloc_tag). + */ +#define nfs_alloc_iostats() alloc_percpu(struct nfs_iostats) static inline void nfs_free_iostats(struct nfs_iostats __percpu *stats) { diff --git a/fs/nfs/localio.c b/fs/nfs/localio.c new file mode 100644 index 000000000000..f33bfa7b58e6 --- /dev/null +++ b/fs/nfs/localio.c @@ -0,0 +1,1072 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * NFS client support for local clients to bypass network stack + * + * Copyright (C) 2014 Weston Andros Adamson <dros@primarydata.com> + * Copyright (C) 2019 Trond Myklebust <trond.myklebust@hammerspace.com> + * Copyright (C) 2024 Mike Snitzer <snitzer@hammerspace.com> + * Copyright (C) 2024 NeilBrown <neilb@suse.de> + */ + +#include <linux/module.h> +#include <linux/errno.h> +#include <linux/vfs.h> +#include <linux/file.h> +#include <linux/inet.h> +#include <linux/sunrpc/addr.h> +#include <linux/inetdevice.h> +#include <net/addrconf.h> +#include <linux/nfs_common.h> +#include <linux/nfslocalio.h> +#include <linux/bvec.h> + +#include <linux/nfs.h> +#include <linux/nfs_fs.h> +#include <linux/nfs_xdr.h> + +#include "internal.h" +#include "pnfs.h" +#include "nfstrace.h" + +#define NFSDBG_FACILITY NFSDBG_VFS + +#define NFSLOCAL_MAX_IOS 3 + +struct nfs_local_kiocb { + struct kiocb kiocb; + struct bio_vec *bvec; + struct nfs_pgio_header *hdr; + struct work_struct work; + void (*aio_complete_work)(struct work_struct *); + struct nfsd_file *localio; + /* Begin mostly DIO-specific members */ + size_t end_len; + short int end_iter_index; + atomic_t n_iters; + bool iter_is_dio_aligned[NFSLOCAL_MAX_IOS]; + struct iov_iter iters[NFSLOCAL_MAX_IOS] ____cacheline_aligned; + /* End mostly DIO-specific members */ +}; + +struct nfs_local_fsync_ctx { + struct nfsd_file *localio; + struct nfs_commit_data *data; + struct work_struct work; + struct completion *done; +}; + +static bool localio_enabled __read_mostly = true; +module_param(localio_enabled, bool, 0644); + +static inline bool nfs_client_is_local(const struct nfs_client *clp) +{ + return !!rcu_access_pointer(clp->cl_uuid.net); +} + +bool nfs_server_is_local(const struct nfs_client *clp) +{ + return nfs_client_is_local(clp) && localio_enabled; +} +EXPORT_SYMBOL_GPL(nfs_server_is_local); + +/* + * UUID_IS_LOCAL XDR functions + */ + +static void localio_xdr_enc_uuidargs(struct rpc_rqst *req, + struct xdr_stream *xdr, + const void *data) +{ + const u8 *uuid = data; + + encode_opaque_fixed(xdr, uuid, UUID_SIZE); +} + +static int localio_xdr_dec_uuidres(struct rpc_rqst *req, + struct xdr_stream *xdr, + void *result) +{ + /* void return */ + return 0; +} + +static const struct rpc_procinfo nfs_localio_procedures[] = { + [LOCALIOPROC_UUID_IS_LOCAL] = { + .p_proc = LOCALIOPROC_UUID_IS_LOCAL, + .p_encode = localio_xdr_enc_uuidargs, + .p_decode = localio_xdr_dec_uuidres, + .p_arglen = XDR_QUADLEN(UUID_SIZE), + .p_replen = 0, + .p_statidx = LOCALIOPROC_UUID_IS_LOCAL, + .p_name = "UUID_IS_LOCAL", + }, +}; + +static unsigned int nfs_localio_counts[ARRAY_SIZE(nfs_localio_procedures)]; +static const struct rpc_version nfslocalio_version1 = { + .number = 1, + .nrprocs = ARRAY_SIZE(nfs_localio_procedures), + .procs = nfs_localio_procedures, + .counts = nfs_localio_counts, +}; + +static const struct rpc_version *nfslocalio_version[] = { + [1] = &nfslocalio_version1, +}; + +extern const struct rpc_program nfslocalio_program; +static struct rpc_stat nfslocalio_rpcstat = { &nfslocalio_program }; + +const struct rpc_program nfslocalio_program = { + .name = "nfslocalio", + .number = NFS_LOCALIO_PROGRAM, + .nrvers = ARRAY_SIZE(nfslocalio_version), + .version = nfslocalio_version, + .stats = &nfslocalio_rpcstat, +}; + +/* + * nfs_init_localioclient - Initialise an NFS localio client connection + */ +static struct rpc_clnt *nfs_init_localioclient(struct nfs_client *clp) +{ + struct rpc_clnt *rpcclient_localio; + + rpcclient_localio = rpc_bind_new_program(clp->cl_rpcclient, + &nfslocalio_program, 1); + + dprintk_rcu("%s: server (%s) %s NFS LOCALIO.\n", + __func__, rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR), + (IS_ERR(rpcclient_localio) ? "does not support" : "supports")); + + return rpcclient_localio; +} + +static bool nfs_server_uuid_is_local(struct nfs_client *clp) +{ + u8 uuid[UUID_SIZE]; + struct rpc_message msg = { + .rpc_argp = &uuid, + }; + struct rpc_clnt *rpcclient_localio; + int status; + + rpcclient_localio = nfs_init_localioclient(clp); + if (IS_ERR(rpcclient_localio)) + return false; + + export_uuid(uuid, &clp->cl_uuid.uuid); + + msg.rpc_proc = &nfs_localio_procedures[LOCALIOPROC_UUID_IS_LOCAL]; + status = rpc_call_sync(rpcclient_localio, &msg, 0); + dprintk("%s: NFS reply UUID_IS_LOCAL: status=%d\n", + __func__, status); + rpc_shutdown_client(rpcclient_localio); + + /* Server is only local if it initialized required struct members */ + if (status || !rcu_access_pointer(clp->cl_uuid.net) || !clp->cl_uuid.dom) + return false; + + return true; +} + +/* + * nfs_local_probe - probe local i/o support for an nfs_server and nfs_client + * - called after alloc_client and init_client (so cl_rpcclient exists) + * - this function is idempotent, it can be called for old or new clients + */ +static void nfs_local_probe(struct nfs_client *clp) +{ + /* Disallow localio if disabled via sysfs or AUTH_SYS isn't used */ + if (!localio_enabled || + clp->cl_rpcclient->cl_auth->au_flavor != RPC_AUTH_UNIX) { + nfs_localio_disable_client(clp); + return; + } + + if (nfs_client_is_local(clp)) + return; + + if (!nfs_uuid_begin(&clp->cl_uuid)) + return; + if (nfs_server_uuid_is_local(clp)) + nfs_localio_enable_client(clp); + nfs_uuid_end(&clp->cl_uuid); +} + +void nfs_local_probe_async_work(struct work_struct *work) +{ + struct nfs_client *clp = + container_of(work, struct nfs_client, cl_local_probe_work); + + if (!refcount_inc_not_zero(&clp->cl_count)) + return; + nfs_local_probe(clp); + nfs_put_client(clp); +} + +void nfs_local_probe_async(struct nfs_client *clp) +{ + queue_work(nfsiod_workqueue, &clp->cl_local_probe_work); +} +EXPORT_SYMBOL_GPL(nfs_local_probe_async); + +static inline void nfs_local_file_put(struct nfsd_file *localio) +{ + /* nfs_to_nfsd_file_put_local() expects an __rcu pointer + * but we have a __kernel pointer. It is always safe + * to cast a __kernel pointer to an __rcu pointer + * because the cast only weakens what is known about the pointer. + */ + struct nfsd_file __rcu *nf = (struct nfsd_file __rcu*) localio; + + nfs_to_nfsd_file_put_local(&nf); +} + +/* + * __nfs_local_open_fh - open a local filehandle in terms of nfsd_file. + * + * Returns a pointer to a struct nfsd_file or ERR_PTR. + * Caller must release returned nfsd_file with nfs_to_nfsd_file_put_local(). + */ +static struct nfsd_file * +__nfs_local_open_fh(struct nfs_client *clp, const struct cred *cred, + struct nfs_fh *fh, struct nfs_file_localio *nfl, + struct nfsd_file __rcu **pnf, + const fmode_t mode) +{ + int status = 0; + struct nfsd_file *localio; + + localio = nfs_open_local_fh(&clp->cl_uuid, clp->cl_rpcclient, + cred, fh, nfl, pnf, mode); + if (IS_ERR(localio)) { + status = PTR_ERR(localio); + switch (status) { + case -ENOMEM: + case -ENXIO: + case -ENOENT: + /* Revalidate localio */ + nfs_localio_disable_client(clp); + nfs_local_probe(clp); + } + } + trace_nfs_local_open_fh(fh, mode, status); + return localio; +} + +/* + * nfs_local_open_fh - open a local filehandle in terms of nfsd_file. + * First checking if the open nfsd_file is already cached, otherwise + * must __nfs_local_open_fh and insert the nfsd_file in nfs_file_localio. + * + * Returns a pointer to a struct nfsd_file or NULL. + */ +struct nfsd_file * +nfs_local_open_fh(struct nfs_client *clp, const struct cred *cred, + struct nfs_fh *fh, struct nfs_file_localio *nfl, + const fmode_t mode) +{ + struct nfsd_file *nf, __rcu **pnf; + + if (!nfs_server_is_local(clp)) + return NULL; + if (mode & ~(FMODE_READ | FMODE_WRITE)) + return NULL; + + if (mode & FMODE_WRITE) + pnf = &nfl->rw_file; + else + pnf = &nfl->ro_file; + + nf = __nfs_local_open_fh(clp, cred, fh, nfl, pnf, mode); + if (IS_ERR(nf)) + return NULL; + return nf; +} +EXPORT_SYMBOL_GPL(nfs_local_open_fh); + +static void +nfs_local_iocb_free(struct nfs_local_kiocb *iocb) +{ + kfree(iocb->bvec); + kfree(iocb); +} + +static struct nfs_local_kiocb * +nfs_local_iocb_alloc(struct nfs_pgio_header *hdr, + struct file *file, gfp_t flags) +{ + struct nfs_local_kiocb *iocb; + + iocb = kzalloc(sizeof(*iocb), flags); + if (iocb == NULL) + return NULL; + + iocb->bvec = kmalloc_array(hdr->page_array.npages, + sizeof(struct bio_vec), flags); + if (iocb->bvec == NULL) { + kfree(iocb); + return NULL; + } + + init_sync_kiocb(&iocb->kiocb, file); + + iocb->hdr = hdr; + iocb->kiocb.ki_pos = hdr->args.offset; + iocb->kiocb.ki_flags &= ~IOCB_APPEND; + iocb->kiocb.ki_complete = NULL; + iocb->aio_complete_work = NULL; + + iocb->end_iter_index = -1; + + return iocb; +} + +static bool +nfs_is_local_dio_possible(struct nfs_local_kiocb *iocb, int rw, + size_t len, struct nfs_local_dio *local_dio) +{ + struct nfs_pgio_header *hdr = iocb->hdr; + loff_t offset = hdr->args.offset; + u32 nf_dio_mem_align, nf_dio_offset_align, nf_dio_read_offset_align; + loff_t start_end, orig_end, middle_end; + + nfs_to->nfsd_file_dio_alignment(iocb->localio, &nf_dio_mem_align, + &nf_dio_offset_align, &nf_dio_read_offset_align); + if (rw == ITER_DEST) + nf_dio_offset_align = nf_dio_read_offset_align; + + if (unlikely(!nf_dio_mem_align || !nf_dio_offset_align)) + return false; + if (unlikely(nf_dio_offset_align > PAGE_SIZE)) + return false; + if (unlikely(len < nf_dio_offset_align)) + return false; + + local_dio->mem_align = nf_dio_mem_align; + local_dio->offset_align = nf_dio_offset_align; + + start_end = round_up(offset, nf_dio_offset_align); + orig_end = offset + len; + middle_end = round_down(orig_end, nf_dio_offset_align); + + local_dio->middle_offset = start_end; + local_dio->end_offset = middle_end; + + local_dio->start_len = start_end - offset; + local_dio->middle_len = middle_end - start_end; + local_dio->end_len = orig_end - middle_end; + + if (rw == ITER_DEST) + trace_nfs_local_dio_read(hdr->inode, offset, len, local_dio); + else + trace_nfs_local_dio_write(hdr->inode, offset, len, local_dio); + return true; +} + +static bool nfs_iov_iter_aligned_bvec(const struct iov_iter *i, + unsigned int addr_mask, unsigned int len_mask) +{ + const struct bio_vec *bvec = i->bvec; + size_t skip = i->iov_offset; + size_t size = i->count; + + if (size & len_mask) + return false; + do { + size_t len = bvec->bv_len; + + if (len > size) + len = size; + if ((unsigned long)(bvec->bv_offset + skip) & addr_mask) + return false; + bvec++; + size -= len; + skip = 0; + } while (size); + + return true; +} + +static void +nfs_local_iter_setup(struct iov_iter *iter, int rw, struct bio_vec *bvec, + unsigned int nvecs, unsigned long total, + size_t start, size_t len) +{ + iov_iter_bvec(iter, rw, bvec, nvecs, total); + if (start) + iov_iter_advance(iter, start); + iov_iter_truncate(iter, len); +} + +/* + * Setup as many as 3 iov_iter based on extents described by @local_dio. + * Returns the number of iov_iter that were setup. + */ +static int +nfs_local_iters_setup_dio(struct nfs_local_kiocb *iocb, int rw, + unsigned int nvecs, unsigned long total, + struct nfs_local_dio *local_dio) +{ + int n_iters = 0; + struct iov_iter *iters = iocb->iters; + + /* Setup misaligned start? */ + if (local_dio->start_len) { + nfs_local_iter_setup(&iters[n_iters], rw, iocb->bvec, + nvecs, total, 0, local_dio->start_len); + ++n_iters; + } + + /* + * Setup DIO-aligned middle, if there is no misaligned end (below) + * then AIO completion is used, see nfs_local_call_{read,write} + */ + nfs_local_iter_setup(&iters[n_iters], rw, iocb->bvec, nvecs, + total, local_dio->start_len, local_dio->middle_len); + + iocb->iter_is_dio_aligned[n_iters] = + nfs_iov_iter_aligned_bvec(&iters[n_iters], + local_dio->mem_align-1, local_dio->offset_align-1); + + if (unlikely(!iocb->iter_is_dio_aligned[n_iters])) { + trace_nfs_local_dio_misaligned(iocb->hdr->inode, + local_dio->start_len, local_dio->middle_len, local_dio); + return 0; /* no DIO-aligned IO possible */ + } + iocb->end_iter_index = n_iters; + ++n_iters; + + /* Setup misaligned end? */ + if (local_dio->end_len) { + nfs_local_iter_setup(&iters[n_iters], rw, iocb->bvec, + nvecs, total, local_dio->start_len + + local_dio->middle_len, local_dio->end_len); + iocb->end_iter_index = n_iters; + ++n_iters; + } + + atomic_set(&iocb->n_iters, n_iters); + return n_iters; +} + +static noinline_for_stack void +nfs_local_iters_init(struct nfs_local_kiocb *iocb, int rw) +{ + struct nfs_pgio_header *hdr = iocb->hdr; + struct page **pagevec = hdr->page_array.pagevec; + unsigned long v, total; + unsigned int base; + size_t len; + + v = 0; + total = hdr->args.count; + base = hdr->args.pgbase; + while (total && v < hdr->page_array.npages) { + len = min_t(size_t, total, PAGE_SIZE - base); + bvec_set_page(&iocb->bvec[v], *pagevec, len, base); + total -= len; + ++pagevec; + ++v; + base = 0; + } + len = hdr->args.count - total; + + /* + * For each iocb, iocb->n_iters is always at least 1 and we always + * end io after first nfs_local_pgio_done call unless misaligned DIO. + */ + atomic_set(&iocb->n_iters, 1); + + if (test_bit(NFS_IOHDR_ODIRECT, &hdr->flags)) { + struct nfs_local_dio local_dio; + + if (nfs_is_local_dio_possible(iocb, rw, len, &local_dio) && + nfs_local_iters_setup_dio(iocb, rw, v, len, &local_dio) != 0) { + /* Ensure DIO WRITE's IO on stable storage upon completion */ + if (rw == ITER_SOURCE) + iocb->kiocb.ki_flags |= IOCB_DSYNC|IOCB_SYNC; + return; /* is DIO-aligned */ + } + } + + /* Use buffered IO */ + iov_iter_bvec(&iocb->iters[0], rw, iocb->bvec, v, len); +} + +static void +nfs_local_hdr_release(struct nfs_pgio_header *hdr, + const struct rpc_call_ops *call_ops) +{ + call_ops->rpc_call_done(&hdr->task, hdr); + call_ops->rpc_release(hdr); +} + +static void +nfs_local_pgio_init(struct nfs_pgio_header *hdr, + const struct rpc_call_ops *call_ops) +{ + hdr->task.tk_ops = call_ops; + if (!hdr->task.tk_start) + hdr->task.tk_start = ktime_get(); +} + +static bool +nfs_local_pgio_done(struct nfs_local_kiocb *iocb, long status, bool force) +{ + struct nfs_pgio_header *hdr = iocb->hdr; + + /* Must handle partial completions */ + if (status >= 0) { + hdr->res.count += status; + /* @hdr was initialized to 0 (zeroed during allocation) */ + if (hdr->task.tk_status == 0) + hdr->res.op_status = NFS4_OK; + } else { + hdr->res.op_status = nfs_localio_errno_to_nfs4_stat(status); + hdr->task.tk_status = status; + } + + if (force) + return true; + + BUG_ON(atomic_read(&iocb->n_iters) <= 0); + return atomic_dec_and_test(&iocb->n_iters); +} + +static void +nfs_local_iocb_release(struct nfs_local_kiocb *iocb) +{ + nfs_local_file_put(iocb->localio); + nfs_local_iocb_free(iocb); +} + +static void +nfs_local_pgio_release(struct nfs_local_kiocb *iocb) +{ + struct nfs_pgio_header *hdr = iocb->hdr; + + nfs_local_iocb_release(iocb); + nfs_local_hdr_release(hdr, hdr->task.tk_ops); +} + +/* + * Complete the I/O from iocb->kiocb.ki_complete() + * + * Note that this function can be called from a bottom half context, + * hence we need to queue the rpc_call_done() etc to a workqueue + */ +static inline void nfs_local_pgio_aio_complete(struct nfs_local_kiocb *iocb) +{ + INIT_WORK(&iocb->work, iocb->aio_complete_work); + queue_work(nfsiod_workqueue, &iocb->work); +} + +static void nfs_local_read_done(struct nfs_local_kiocb *iocb) +{ + struct nfs_pgio_header *hdr = iocb->hdr; + struct file *filp = iocb->kiocb.ki_filp; + long status = hdr->task.tk_status; + + if ((iocb->kiocb.ki_flags & IOCB_DIRECT) && status == -EINVAL) { + /* Underlying FS will return -EINVAL if misaligned DIO is attempted. */ + pr_info_ratelimited("nfs: Unexpected direct I/O read alignment failure\n"); + } + + /* + * Must clear replen otherwise NFSv3 data corruption will occur + * if/when switching from LOCALIO back to using normal RPC. + */ + hdr->res.replen = 0; + + /* nfs_readpage_result() handles short read */ + + if (hdr->args.offset + hdr->res.count >= i_size_read(file_inode(filp))) + hdr->res.eof = true; + + dprintk("%s: read %ld bytes eof %d.\n", __func__, + status > 0 ? status : 0, hdr->res.eof); +} + +static inline void nfs_local_read_iocb_done(struct nfs_local_kiocb *iocb) +{ + nfs_local_read_done(iocb); + nfs_local_pgio_release(iocb); +} + +static void nfs_local_read_aio_complete_work(struct work_struct *work) +{ + struct nfs_local_kiocb *iocb = + container_of(work, struct nfs_local_kiocb, work); + + nfs_local_read_iocb_done(iocb); +} + +static void nfs_local_read_aio_complete(struct kiocb *kiocb, long ret) +{ + struct nfs_local_kiocb *iocb = + container_of(kiocb, struct nfs_local_kiocb, kiocb); + + /* AIO completion of DIO read should always be last to complete */ + if (unlikely(!nfs_local_pgio_done(iocb, ret, false))) + return; + + nfs_local_pgio_aio_complete(iocb); /* Calls nfs_local_read_aio_complete_work */ +} + +static void nfs_local_call_read(struct work_struct *work) +{ + struct nfs_local_kiocb *iocb = + container_of(work, struct nfs_local_kiocb, work); + struct file *filp = iocb->kiocb.ki_filp; + bool force_done = false; + ssize_t status; + int n_iters; + + n_iters = atomic_read(&iocb->n_iters); + for (int i = 0; i < n_iters ; i++) { + if (iocb->iter_is_dio_aligned[i]) { + iocb->kiocb.ki_flags |= IOCB_DIRECT; + /* Only use AIO completion if DIO-aligned segment is last */ + if (i == iocb->end_iter_index) { + iocb->kiocb.ki_complete = nfs_local_read_aio_complete; + iocb->aio_complete_work = nfs_local_read_aio_complete_work; + } + } else + iocb->kiocb.ki_flags &= ~IOCB_DIRECT; + + scoped_with_creds(filp->f_cred) + status = filp->f_op->read_iter(&iocb->kiocb, &iocb->iters[i]); + + if (status != -EIOCBQUEUED) { + if (unlikely(status >= 0 && status < iocb->iters[i].count)) + force_done = true; /* Partial read */ + if (nfs_local_pgio_done(iocb, status, force_done)) { + nfs_local_read_iocb_done(iocb); + break; + } + } + } +} + +static int +nfs_local_do_read(struct nfs_local_kiocb *iocb, + const struct rpc_call_ops *call_ops) +{ + struct nfs_pgio_header *hdr = iocb->hdr; + + dprintk("%s: vfs_read count=%u pos=%llu\n", + __func__, hdr->args.count, hdr->args.offset); + + nfs_local_pgio_init(hdr, call_ops); + hdr->res.eof = false; + + INIT_WORK(&iocb->work, nfs_local_call_read); + queue_work(nfslocaliod_workqueue, &iocb->work); + + return 0; +} + +static void +nfs_copy_boot_verifier(struct nfs_write_verifier *verifier, struct inode *inode) +{ + struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; + u32 *verf = (u32 *)verifier->data; + unsigned int seq; + + do { + seq = read_seqbegin(&clp->cl_boot_lock); + verf[0] = (u32)clp->cl_nfssvc_boot.tv_sec; + verf[1] = (u32)clp->cl_nfssvc_boot.tv_nsec; + } while (read_seqretry(&clp->cl_boot_lock, seq)); +} + +static void +nfs_reset_boot_verifier(struct inode *inode) +{ + struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; + + write_seqlock(&clp->cl_boot_lock); + ktime_get_real_ts64(&clp->cl_nfssvc_boot); + write_sequnlock(&clp->cl_boot_lock); +} + +static void +nfs_set_local_verifier(struct inode *inode, + struct nfs_writeverf *verf, + enum nfs3_stable_how how) +{ + nfs_copy_boot_verifier(&verf->verifier, inode); + verf->committed = how; +} + +/* Factored out from fs/nfsd/vfs.h:fh_getattr() */ +static int __vfs_getattr(const struct path *p, struct kstat *stat, int version) +{ + u32 request_mask = STATX_BASIC_STATS; + + if (version == 4) + request_mask |= (STATX_BTIME | STATX_CHANGE_COOKIE); + return vfs_getattr(p, stat, request_mask, AT_STATX_SYNC_AS_STAT); +} + +/* Copied from fs/nfsd/nfsfh.c:nfsd4_change_attribute() */ +static u64 __nfsd4_change_attribute(const struct kstat *stat, + const struct inode *inode) +{ + u64 chattr; + + if (stat->result_mask & STATX_CHANGE_COOKIE) { + chattr = stat->change_cookie; + if (S_ISREG(inode->i_mode) && + !(stat->attributes & STATX_ATTR_CHANGE_MONOTONIC)) { + chattr += (u64)stat->ctime.tv_sec << 30; + chattr += stat->ctime.tv_nsec; + } + } else { + chattr = time_to_chattr(&stat->ctime); + } + return chattr; +} + +static void nfs_local_vfs_getattr(struct nfs_local_kiocb *iocb) +{ + struct kstat stat; + struct file *filp = iocb->kiocb.ki_filp; + struct nfs_pgio_header *hdr = iocb->hdr; + struct nfs_fattr *fattr = hdr->res.fattr; + int version = NFS_PROTO(hdr->inode)->version; + + if (unlikely(!fattr) || __vfs_getattr(&filp->f_path, &stat, version)) + return; + + fattr->valid = (NFS_ATTR_FATTR_FILEID | + NFS_ATTR_FATTR_CHANGE | + NFS_ATTR_FATTR_SIZE | + NFS_ATTR_FATTR_ATIME | + NFS_ATTR_FATTR_MTIME | + NFS_ATTR_FATTR_CTIME | + NFS_ATTR_FATTR_SPACE_USED); + + fattr->fileid = stat.ino; + fattr->size = stat.size; + fattr->atime = stat.atime; + fattr->mtime = stat.mtime; + fattr->ctime = stat.ctime; + if (version == 4) { + fattr->change_attr = + __nfsd4_change_attribute(&stat, file_inode(filp)); + } else + fattr->change_attr = nfs_timespec_to_change_attr(&fattr->ctime); + fattr->du.nfs3.used = stat.blocks << 9; +} + +static void nfs_local_write_done(struct nfs_local_kiocb *iocb) +{ + struct nfs_pgio_header *hdr = iocb->hdr; + long status = hdr->task.tk_status; + + dprintk("%s: wrote %ld bytes.\n", __func__, status > 0 ? status : 0); + + if ((iocb->kiocb.ki_flags & IOCB_DIRECT) && status == -EINVAL) { + /* Underlying FS will return -EINVAL if misaligned DIO is attempted. */ + pr_info_ratelimited("nfs: Unexpected direct I/O write alignment failure\n"); + } + + /* Handle short writes as if they are ENOSPC */ + status = hdr->res.count; + if (status > 0 && status < hdr->args.count) { + hdr->mds_offset += status; + hdr->args.offset += status; + hdr->args.pgbase += status; + hdr->args.count -= status; + nfs_set_pgio_error(hdr, -ENOSPC, hdr->args.offset); + status = -ENOSPC; + /* record -ENOSPC in terms of nfs_local_pgio_done */ + (void) nfs_local_pgio_done(iocb, status, true); + } + if (hdr->task.tk_status < 0) + nfs_reset_boot_verifier(hdr->inode); +} + +static inline void nfs_local_write_iocb_done(struct nfs_local_kiocb *iocb) +{ + nfs_local_write_done(iocb); + nfs_local_vfs_getattr(iocb); + nfs_local_pgio_release(iocb); +} + +static void nfs_local_write_aio_complete_work(struct work_struct *work) +{ + struct nfs_local_kiocb *iocb = + container_of(work, struct nfs_local_kiocb, work); + + nfs_local_write_iocb_done(iocb); +} + +static void nfs_local_write_aio_complete(struct kiocb *kiocb, long ret) +{ + struct nfs_local_kiocb *iocb = + container_of(kiocb, struct nfs_local_kiocb, kiocb); + + /* AIO completion of DIO write should always be last to complete */ + if (unlikely(!nfs_local_pgio_done(iocb, ret, false))) + return; + + nfs_local_pgio_aio_complete(iocb); /* Calls nfs_local_write_aio_complete_work */ +} + +static void nfs_local_call_write(struct work_struct *work) +{ + struct nfs_local_kiocb *iocb = + container_of(work, struct nfs_local_kiocb, work); + struct file *filp = iocb->kiocb.ki_filp; + unsigned long old_flags = current->flags; + bool force_done = false; + ssize_t status; + int n_iters; + + current->flags |= PF_LOCAL_THROTTLE | PF_MEMALLOC_NOIO; + + file_start_write(filp); + n_iters = atomic_read(&iocb->n_iters); + for (int i = 0; i < n_iters ; i++) { + if (iocb->iter_is_dio_aligned[i]) { + iocb->kiocb.ki_flags |= IOCB_DIRECT; + /* Only use AIO completion if DIO-aligned segment is last */ + if (i == iocb->end_iter_index) { + iocb->kiocb.ki_complete = nfs_local_write_aio_complete; + iocb->aio_complete_work = nfs_local_write_aio_complete_work; + } + } else + iocb->kiocb.ki_flags &= ~IOCB_DIRECT; + + scoped_with_creds(filp->f_cred) + status = filp->f_op->write_iter(&iocb->kiocb, &iocb->iters[i]); + + if (status != -EIOCBQUEUED) { + if (unlikely(status >= 0 && status < iocb->iters[i].count)) + force_done = true; /* Partial write */ + if (nfs_local_pgio_done(iocb, status, force_done)) { + nfs_local_write_iocb_done(iocb); + break; + } + } + } + file_end_write(filp); + + current->flags = old_flags; +} + +static int +nfs_local_do_write(struct nfs_local_kiocb *iocb, + const struct rpc_call_ops *call_ops) +{ + struct nfs_pgio_header *hdr = iocb->hdr; + + dprintk("%s: vfs_write count=%u pos=%llu %s\n", + __func__, hdr->args.count, hdr->args.offset, + (hdr->args.stable == NFS_UNSTABLE) ? "unstable" : "stable"); + + switch (hdr->args.stable) { + default: + break; + case NFS_DATA_SYNC: + iocb->kiocb.ki_flags |= IOCB_DSYNC; + break; + case NFS_FILE_SYNC: + iocb->kiocb.ki_flags |= IOCB_DSYNC|IOCB_SYNC; + } + + nfs_local_pgio_init(hdr, call_ops); + + nfs_set_local_verifier(hdr->inode, hdr->res.verf, hdr->args.stable); + + INIT_WORK(&iocb->work, nfs_local_call_write); + queue_work(nfslocaliod_workqueue, &iocb->work); + + return 0; +} + +static struct nfs_local_kiocb * +nfs_local_iocb_init(struct nfs_pgio_header *hdr, struct nfsd_file *localio) +{ + struct file *file = nfs_to->nfsd_file_file(localio); + struct nfs_local_kiocb *iocb; + gfp_t gfp_mask; + int rw; + + if (hdr->rw_mode & FMODE_READ) { + if (!file->f_op->read_iter) + return ERR_PTR(-EOPNOTSUPP); + gfp_mask = GFP_KERNEL; + rw = ITER_DEST; + } else { + if (!file->f_op->write_iter) + return ERR_PTR(-EOPNOTSUPP); + gfp_mask = GFP_NOIO; + rw = ITER_SOURCE; + } + + iocb = nfs_local_iocb_alloc(hdr, file, gfp_mask); + if (iocb == NULL) + return ERR_PTR(-ENOMEM); + iocb->hdr = hdr; + iocb->localio = localio; + + nfs_local_iters_init(iocb, rw); + + return iocb; +} + +int nfs_local_doio(struct nfs_client *clp, struct nfsd_file *localio, + struct nfs_pgio_header *hdr, + const struct rpc_call_ops *call_ops) +{ + struct nfs_local_kiocb *iocb; + int status = 0; + + if (!hdr->args.count) + return 0; + + iocb = nfs_local_iocb_init(hdr, localio); + if (IS_ERR(iocb)) + return PTR_ERR(iocb); + + switch (hdr->rw_mode) { + case FMODE_READ: + status = nfs_local_do_read(iocb, call_ops); + break; + case FMODE_WRITE: + status = nfs_local_do_write(iocb, call_ops); + break; + default: + dprintk("%s: invalid mode: %d\n", __func__, + hdr->rw_mode); + status = -EOPNOTSUPP; + } + + if (status != 0) { + if (status == -EAGAIN) + nfs_localio_disable_client(clp); + nfs_local_iocb_release(iocb); + hdr->task.tk_status = status; + nfs_local_hdr_release(hdr, call_ops); + } + return status; +} + +static void +nfs_local_init_commit(struct nfs_commit_data *data, + const struct rpc_call_ops *call_ops) +{ + data->task.tk_ops = call_ops; +} + +static int +nfs_local_run_commit(struct file *filp, struct nfs_commit_data *data) +{ + loff_t start = data->args.offset; + loff_t end = LLONG_MAX; + + if (data->args.count > 0) { + end = start + data->args.count - 1; + if (end < start) + end = LLONG_MAX; + } + + dprintk("%s: commit %llu - %llu\n", __func__, start, end); + return vfs_fsync_range(filp, start, end, 0); +} + +static void +nfs_local_commit_done(struct nfs_commit_data *data, int status) +{ + if (status >= 0) { + nfs_set_local_verifier(data->inode, + data->res.verf, + NFS_FILE_SYNC); + data->res.op_status = NFS4_OK; + data->task.tk_status = 0; + } else { + nfs_reset_boot_verifier(data->inode); + data->res.op_status = nfs_localio_errno_to_nfs4_stat(status); + data->task.tk_status = status; + } +} + +static void +nfs_local_release_commit_data(struct nfsd_file *localio, + struct nfs_commit_data *data, + const struct rpc_call_ops *call_ops) +{ + nfs_local_file_put(localio); + call_ops->rpc_call_done(&data->task, data); + call_ops->rpc_release(data); +} + +static void +nfs_local_fsync_ctx_free(struct nfs_local_fsync_ctx *ctx) +{ + nfs_local_release_commit_data(ctx->localio, ctx->data, + ctx->data->task.tk_ops); + kfree(ctx); +} + +static void +nfs_local_fsync_work(struct work_struct *work) +{ + struct nfs_local_fsync_ctx *ctx; + int status; + + ctx = container_of(work, struct nfs_local_fsync_ctx, work); + + status = nfs_local_run_commit(nfs_to->nfsd_file_file(ctx->localio), + ctx->data); + nfs_local_commit_done(ctx->data, status); + if (ctx->done != NULL) + complete(ctx->done); + nfs_local_fsync_ctx_free(ctx); +} + +static struct nfs_local_fsync_ctx * +nfs_local_fsync_ctx_alloc(struct nfs_commit_data *data, + struct nfsd_file *localio, gfp_t flags) +{ + struct nfs_local_fsync_ctx *ctx = kmalloc(sizeof(*ctx), flags); + + if (ctx != NULL) { + ctx->localio = localio; + ctx->data = data; + INIT_WORK(&ctx->work, nfs_local_fsync_work); + ctx->done = NULL; + } + return ctx; +} + +int nfs_local_commit(struct nfsd_file *localio, + struct nfs_commit_data *data, + const struct rpc_call_ops *call_ops, int how) +{ + struct nfs_local_fsync_ctx *ctx; + + ctx = nfs_local_fsync_ctx_alloc(data, localio, GFP_KERNEL); + if (!ctx) { + nfs_local_commit_done(data, -ENOMEM); + nfs_local_release_commit_data(localio, data, call_ops); + return -ENOMEM; + } + + nfs_local_init_commit(data, call_ops); + + if (how & FLUSH_SYNC) { + DECLARE_COMPLETION_ONSTACK(done); + ctx->done = &done; + queue_work(nfsiod_workqueue, &ctx->work); + wait_for_completion(&done); + } else + queue_work(nfsiod_workqueue, &ctx->work); + + return 0; +} diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c index 68e76b626371..db8dfb920394 100644 --- a/fs/nfs/mount_clnt.c +++ b/fs/nfs/mount_clnt.c @@ -128,11 +128,6 @@ struct mountres { rpc_authflavor_t *auth_flavors; }; -struct mnt_fhstatus { - u32 status; - struct nfs_fh *fh; -}; - /** * nfs_mount - Obtain an NFS file handle for the given host and path * @info: pointer to mount request arguments @@ -228,74 +223,6 @@ out_mnt_err: goto out; } -/** - * nfs_umount - Notify a server that we have unmounted this export - * @info: pointer to umount request arguments - * - * MOUNTPROC_UMNT is advisory, so we set a short timeout, and always - * use UDP. - */ -void nfs_umount(const struct nfs_mount_request *info) -{ - static const struct rpc_timeout nfs_umnt_timeout = { - .to_initval = 1 * HZ, - .to_maxval = 3 * HZ, - .to_retries = 2, - }; - struct rpc_create_args args = { - .net = info->net, - .protocol = IPPROTO_UDP, - .address = (struct sockaddr *)info->sap, - .addrsize = info->salen, - .timeout = &nfs_umnt_timeout, - .servername = info->hostname, - .program = &mnt_program, - .version = info->version, - .authflavor = RPC_AUTH_UNIX, - .flags = RPC_CLNT_CREATE_NOPING, - .cred = current_cred(), - }; - struct rpc_message msg = { - .rpc_argp = info->dirpath, - }; - struct rpc_clnt *clnt; - int status; - - if (strlen(info->dirpath) > MNTPATHLEN) - return; - - if (info->noresvport) - args.flags |= RPC_CLNT_CREATE_NONPRIVPORT; - - clnt = rpc_create(&args); - if (IS_ERR(clnt)) - goto out_clnt_err; - - dprintk("NFS: sending UMNT request for %s:%s\n", - (info->hostname ? info->hostname : "server"), info->dirpath); - - if (info->version == NFS_MNT3_VERSION) - msg.rpc_proc = &clnt->cl_procinfo[MOUNTPROC3_UMNT]; - else - msg.rpc_proc = &clnt->cl_procinfo[MOUNTPROC_UMNT]; - - status = rpc_call_sync(clnt, &msg, 0); - rpc_shutdown_client(clnt); - - if (unlikely(status < 0)) - goto out_call_err; - - return; - -out_clnt_err: - dprintk("NFS: failed to create UMNT RPC client, status=%ld\n", - PTR_ERR(clnt)); - return; - -out_call_err: - dprintk("NFS: UMNT request failed, status=%d\n", status); -} - /* * XDR encode/decode functions for MOUNT */ diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c index 19d51ebf842c..5a4d193da1a9 100644 --- a/fs/nfs/namespace.c +++ b/fs/nfs/namespace.c @@ -182,7 +182,7 @@ struct vfsmount *nfs_d_automount(struct path *path) ctx->version = client->rpc_ops->version; ctx->minorversion = client->cl_minorversion; ctx->nfs_mod = client->cl_nfs_mod; - __module_get(ctx->nfs_mod->owner); + get_nfs_version(ctx->nfs_mod); ret = client->rpc_ops->submount(fc, server); if (ret < 0) { @@ -195,7 +195,6 @@ struct vfsmount *nfs_d_automount(struct path *path) if (IS_ERR(mnt)) goto out_fc; - mntget(mnt); /* prevent immediate expiration */ if (timeout <= 0) goto out_fc; @@ -215,7 +214,8 @@ nfs_namespace_getattr(struct mnt_idmap *idmap, if (NFS_FH(d_inode(path->dentry))->size != 0) return nfs_getattr(idmap, path, stat, request_mask, query_flags); - generic_fillattr(&nop_mnt_idmap, d_inode(path->dentry), stat); + generic_fillattr(&nop_mnt_idmap, request_mask, d_inode(path->dentry), + stat); return 0; } @@ -290,7 +290,8 @@ int nfs_do_submount(struct fs_context *fc) nfs_errorf(fc, "NFS: Couldn't determine submount pathname"); ret = PTR_ERR(p); } else { - ret = vfs_parse_fs_string(fc, "source", p, buffer + 4096 - p); + ret = vfs_parse_fs_qstr(fc, "source", + &QSTR_LEN(p, buffer + 4096 - p)); if (!ret) ret = vfs_get_tree(fc); } @@ -307,7 +308,7 @@ int nfs_submount(struct fs_context *fc, struct nfs_server *server) int err; /* Look it up again to get its attributes */ - err = server->nfs_client->rpc_ops->lookup(d_inode(parent), dentry, + err = server->nfs_client->rpc_ops->lookup(d_inode(parent), dentry, &dentry->d_name, ctx->mntfh, ctx->clone_data.fattr); dput(parent); if (err != 0) @@ -335,7 +336,7 @@ static int param_set_nfs_timeout(const char *val, const struct kernel_param *kp) num *= HZ; *((int *)kp->arg) = num; if (!list_empty(&nfs_automount_list)) - mod_delayed_work(system_wq, &nfs_automount_task, num); + mod_delayed_work(system_percpu_wq, &nfs_automount_task, num); } else { *((int *)kp->arg) = -1*HZ; cancel_delayed_work(&nfs_automount_task); diff --git a/fs/nfs/netns.h b/fs/nfs/netns.h index c8374f74dce1..6ba3ea39e928 100644 --- a/fs/nfs/netns.h +++ b/fs/nfs/netns.h @@ -9,6 +9,7 @@ #include <linux/nfs4.h> #include <net/net_namespace.h> #include <net/netns/generic.h> +#include <linux/sunrpc/stats.h> struct bl_dev_msg { int32_t status; @@ -30,10 +31,15 @@ struct nfs_net { unsigned short nfs_callback_tcpport; unsigned short nfs_callback_tcpport6; int cb_users[NFS4_MAX_MINOR_VERSION + 1]; -#endif +#endif /* CONFIG_NFS_V4 */ +#if IS_ENABLED(CONFIG_NFS_V4_1) + struct list_head nfs4_data_server_cache; + spinlock_t nfs4_data_server_lock; +#endif /* CONFIG_NFS_V4_1 */ struct nfs_netns_client *nfs_client; spinlock_t nfs_client_lock; ktime_t boot_time; + struct rpc_stat rpcstats; #ifdef CONFIG_PROC_FS struct proc_dir_entry *proc_nfsfs; #endif diff --git a/fs/nfs/nfs.h b/fs/nfs/nfs.h index 5ba00610aede..8a5f51be013a 100644 --- a/fs/nfs/nfs.h +++ b/fs/nfs/nfs.h @@ -18,11 +18,11 @@ struct nfs_subversion { const struct rpc_version *rpc_vers; /* NFS version information */ const struct nfs_rpc_ops *rpc_ops; /* NFS operations */ const struct super_operations *sops; /* NFS Super operations */ - const struct xattr_handler **xattr; /* NFS xattr handlers */ - struct list_head list; /* List of NFS versions */ + const struct xattr_handler * const *xattr; /* NFS xattr handlers */ }; -struct nfs_subversion *get_nfs_version(unsigned int); +struct nfs_subversion *find_nfs_version(unsigned int); +int get_nfs_version(struct nfs_subversion *); void put_nfs_version(struct nfs_subversion *); void register_nfs_version(struct nfs_subversion *); void unregister_nfs_version(struct nfs_subversion *); diff --git a/fs/nfs/nfs2super.c b/fs/nfs/nfs2super.c index 467f21ee6a35..b1badc70bd71 100644 --- a/fs/nfs/nfs2super.c +++ b/fs/nfs/nfs2super.c @@ -26,6 +26,7 @@ static void __exit exit_nfs_v2(void) unregister_nfs_version(&nfs_v2); } +MODULE_DESCRIPTION("NFSv2 client support"); MODULE_LICENSE("GPL"); module_init(init_nfs_v2); diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c index 05c3b4b2b3dd..9eff09158518 100644 --- a/fs/nfs/nfs2xdr.c +++ b/fs/nfs/nfs2xdr.c @@ -22,14 +22,12 @@ #include <linux/nfs.h> #include <linux/nfs2.h> #include <linux/nfs_fs.h> -#include "nfstrace.h" +#include <linux/nfs_common.h> #include "internal.h" +#include "nfstrace.h" #define NFSDBG_FACILITY NFSDBG_XDR -/* Mapping from NFS error code to "errno" error code. */ -#define errno_NFSERR_IO EIO - /* * Declare the space requirements for NFS arguments and replies as * number of 32bit-words @@ -64,8 +62,6 @@ #define NFS_readdirres_sz (1+NFS_pagepad_sz) #define NFS_statfsres_sz (1+NFS_info_sz) -static int nfs_stat_to_errno(enum nfs_stat); - /* * Encode/decode NFSv2 basic data types * @@ -949,7 +945,7 @@ int nfs2_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, error = decode_filename_inline(xdr, &entry->name, &entry->len); if (unlikely(error)) - return -EAGAIN; + return error == -ENAMETOOLONG ? -ENAMETOOLONG : -EAGAIN; /* * The type (size and byte order) of nfscookie isn't defined in @@ -1054,70 +1050,6 @@ out_default: return nfs_stat_to_errno(status); } - -/* - * We need to translate between nfs status return values and - * the local errno values which may not be the same. - */ -static const struct { - int stat; - int errno; -} nfs_errtbl[] = { - { NFS_OK, 0 }, - { NFSERR_PERM, -EPERM }, - { NFSERR_NOENT, -ENOENT }, - { NFSERR_IO, -errno_NFSERR_IO}, - { NFSERR_NXIO, -ENXIO }, -/* { NFSERR_EAGAIN, -EAGAIN }, */ - { NFSERR_ACCES, -EACCES }, - { NFSERR_EXIST, -EEXIST }, - { NFSERR_XDEV, -EXDEV }, - { NFSERR_NODEV, -ENODEV }, - { NFSERR_NOTDIR, -ENOTDIR }, - { NFSERR_ISDIR, -EISDIR }, - { NFSERR_INVAL, -EINVAL }, - { NFSERR_FBIG, -EFBIG }, - { NFSERR_NOSPC, -ENOSPC }, - { NFSERR_ROFS, -EROFS }, - { NFSERR_MLINK, -EMLINK }, - { NFSERR_NAMETOOLONG, -ENAMETOOLONG }, - { NFSERR_NOTEMPTY, -ENOTEMPTY }, - { NFSERR_DQUOT, -EDQUOT }, - { NFSERR_STALE, -ESTALE }, - { NFSERR_REMOTE, -EREMOTE }, -#ifdef EWFLUSH - { NFSERR_WFLUSH, -EWFLUSH }, -#endif - { NFSERR_BADHANDLE, -EBADHANDLE }, - { NFSERR_NOT_SYNC, -ENOTSYNC }, - { NFSERR_BAD_COOKIE, -EBADCOOKIE }, - { NFSERR_NOTSUPP, -ENOTSUPP }, - { NFSERR_TOOSMALL, -ETOOSMALL }, - { NFSERR_SERVERFAULT, -EREMOTEIO }, - { NFSERR_BADTYPE, -EBADTYPE }, - { NFSERR_JUKEBOX, -EJUKEBOX }, - { -1, -EIO } -}; - -/** - * nfs_stat_to_errno - convert an NFS status code to a local errno - * @status: NFS status code to convert - * - * Returns a local errno value, or -EIO if the NFS status code is - * not recognized. This function is used jointly by NFSv2 and NFSv3. - */ -static int nfs_stat_to_errno(enum nfs_stat status) -{ - int i; - - for (i = 0; nfs_errtbl[i].stat != -1; i++) { - if (nfs_errtbl[i].stat == (int)status) - return nfs_errtbl[i].errno; - } - dprintk("NFS: Unrecognized nfs status value: %u\n", status); - return nfs_errtbl[i].errno; -} - #define PROC(proc, argtype, restype, timer) \ [NFSPROC_##proc] = { \ .p_proc = NFSPROC_##proc, \ diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c index 18d8f6529f61..a126eb31f62f 100644 --- a/fs/nfs/nfs3acl.c +++ b/fs/nfs/nfs3acl.c @@ -104,7 +104,7 @@ struct posix_acl *nfs3_get_acl(struct inode *inode, int type, bool rcu) switch (status) { case 0: - status = nfs_refresh_inode(inode, res.fattr); + nfs_refresh_inode(inode, res.fattr); break; case -EPFNOSUPPORT: case -EPROTONOSUPPORT: diff --git a/fs/nfs/nfs3client.c b/fs/nfs/nfs3client.c index eff3802c5e03..5d97c1d38bb6 100644 --- a/fs/nfs/nfs3client.c +++ b/fs/nfs/nfs3client.c @@ -2,6 +2,7 @@ #include <linux/nfs_fs.h> #include <linux/nfs_mount.h> #include <linux/sunrpc/addr.h> +#include <net/handshake.h> #include "internal.h" #include "nfs3_fs.h" #include "netns.h" @@ -86,6 +87,7 @@ struct nfs_client *nfs3_set_ds_client(struct nfs_server *mds_srv, int ds_proto, unsigned int ds_timeo, unsigned int ds_retrans) { struct rpc_timeout ds_timeout; + unsigned long connect_timeout = ds_timeo * (ds_retrans + 1) * HZ / 10; struct nfs_client *mds_clp = mds_srv->nfs_client; struct nfs_client_initdata cl_init = { .addr = ds_addr, @@ -97,7 +99,13 @@ struct nfs_client *nfs3_set_ds_client(struct nfs_server *mds_srv, .net = mds_clp->cl_net, .timeparms = &ds_timeout, .cred = mds_srv->cred, - .xprtsec = mds_clp->cl_xprtsec, + .xprtsec = { + .policy = RPC_XPRTSEC_NONE, + .cert_serial = TLS_NO_CERT, + .privkey_serial = TLS_NO_PRIVKEY, + }, + .connect_timeout = connect_timeout, + .reconnect_timeout = connect_timeout, }; struct nfs_client *clp; char buf[INET6_ADDRSTRLEN + 1]; @@ -108,14 +116,22 @@ struct nfs_client *nfs3_set_ds_client(struct nfs_server *mds_srv, cl_init.hostname = buf; switch (ds_proto) { - case XPRT_TRANSPORT_TCP: case XPRT_TRANSPORT_TCP_TLS: + if (mds_clp->cl_xprtsec.policy != RPC_XPRTSEC_NONE) + cl_init.xprtsec = mds_clp->cl_xprtsec; + else + ds_proto = XPRT_TRANSPORT_TCP; + fallthrough; + case XPRT_TRANSPORT_RDMA: + case XPRT_TRANSPORT_TCP: if (mds_clp->cl_nconnect > 1) cl_init.nconnect = mds_clp->cl_nconnect; } if (mds_srv->flags & NFS_MOUNT_NORESVPORT) __set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags); + if (test_bit(NFS_CS_NETUNREACH_FATAL, &mds_clp->cl_flags)) + __set_bit(NFS_CS_NETUNREACH_FATAL, &cl_init.init_flags); __set_bit(NFS_CS_DS, &cl_init.init_flags); diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index 4bf208a0a8e9..a4cb67573aa7 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -39,7 +39,7 @@ nfs3_rpc_wrapper(struct rpc_clnt *clnt, struct rpc_message *msg, int flags) __set_current_state(TASK_KILLABLE|TASK_FREEZABLE_UNSAFE); schedule_timeout(NFS_JUKEBOX_RETRY_TIME); res = -ERESTARTSYS; - } while (!fatal_signal_pending(current)); + } while (!fatal_signal_pending(current) && !nfs_current_task_exiting()); return res; } @@ -192,7 +192,7 @@ __nfs3_proc_lookup(struct inode *dir, const char *name, size_t len, } static int -nfs3_proc_lookup(struct inode *dir, struct dentry *dentry, +nfs3_proc_lookup(struct inode *dir, struct dentry *dentry, const struct qstr *name, struct nfs_fh *fhandle, struct nfs_fattr *fattr) { unsigned short task_flags = 0; @@ -202,8 +202,7 @@ nfs3_proc_lookup(struct inode *dir, struct dentry *dentry, task_flags |= RPC_TASK_TIMEOUT; dprintk("NFS call lookup %pd2\n", dentry); - return __nfs3_proc_lookup(dir, dentry->d_name.name, - dentry->d_name.len, fhandle, fattr, + return __nfs3_proc_lookup(dir, name->name, name->len, fhandle, fattr, task_flags); } @@ -543,9 +542,10 @@ out: } static int -nfs3_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page, +nfs3_proc_symlink(struct inode *dir, struct dentry *dentry, struct folio *folio, unsigned int len, struct iattr *sattr) { + struct page *page = &folio->page; struct nfs3_createdata *data; struct dentry *d_alias; int status = -ENOMEM; @@ -578,13 +578,13 @@ out: return status; } -static int +static struct dentry * nfs3_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr) { struct posix_acl *default_acl, *acl; struct nfs3_createdata *data; - struct dentry *d_alias; - int status = -ENOMEM; + struct dentry *ret = ERR_PTR(-ENOMEM); + int status; dprintk("NFS call mkdir %pd\n", dentry); @@ -592,8 +592,9 @@ nfs3_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr) if (data == NULL) goto out; - status = posix_acl_create(dir, &sattr->ia_mode, &default_acl, &acl); - if (status) + ret = ERR_PTR(posix_acl_create(dir, &sattr->ia_mode, + &default_acl, &acl)); + if (IS_ERR(ret)) goto out; data->msg.rpc_proc = &nfs3_procedures[NFS3PROC_MKDIR]; @@ -602,25 +603,27 @@ nfs3_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr) data->arg.mkdir.len = dentry->d_name.len; data->arg.mkdir.sattr = sattr; - d_alias = nfs3_do_create(dir, dentry, data); - status = PTR_ERR_OR_ZERO(d_alias); + ret = nfs3_do_create(dir, dentry, data); - if (status != 0) + if (IS_ERR(ret)) goto out_release_acls; - if (d_alias) - dentry = d_alias; + if (ret) + dentry = ret; status = nfs3_proc_setacls(d_inode(dentry), acl, default_acl); + if (status) { + dput(ret); + ret = ERR_PTR(status); + } - dput(d_alias); out_release_acls: posix_acl_release(acl); posix_acl_release(default_acl); out: nfs3_free_createdata(data); - dprintk("NFS reply mkdir: %d\n", status); - return status; + dprintk("NFS reply mkdir: %d\n", PTR_ERR_OR_ZERO(ret)); + return ret; } static int @@ -843,6 +846,41 @@ nfs3_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle, return status; } +#if IS_ENABLED(CONFIG_NFS_LOCALIO) + +static unsigned nfs3_localio_probe_throttle __read_mostly = 0; +module_param(nfs3_localio_probe_throttle, uint, 0644); +MODULE_PARM_DESC(nfs3_localio_probe_throttle, + "Probe for NFSv3 LOCALIO every N IO requests. Must be power-of-2, defaults to 0 (probing disabled)."); + +static void nfs3_localio_probe(struct nfs_server *server) +{ + struct nfs_client *clp = server->nfs_client; + + /* Throttled to reduce nfs_local_probe_async() frequency */ + if (!nfs3_localio_probe_throttle || nfs_server_is_local(clp)) + return; + + /* + * Try (re)enabling LOCALIO if isn't enabled -- admin deems + * it worthwhile to periodically check if LOCALIO possible by + * setting the 'nfs3_localio_probe_throttle' module parameter. + * + * This is useful if LOCALIO was previously enabled, but was + * disabled due to server restart, and IO has successfully + * completed in terms of normal RPC. + */ + if ((clp->cl_uuid.nfs3_localio_probe_count++ & + (nfs3_localio_probe_throttle - 1)) == 0) { + if (!nfs_server_is_local(clp)) + nfs_local_probe_async(clp); + } +} + +#else +static void nfs3_localio_probe(struct nfs_server *server) {} +#endif + static int nfs3_read_done(struct rpc_task *task, struct nfs_pgio_header *hdr) { struct inode *inode = hdr->inode; @@ -854,8 +892,11 @@ static int nfs3_read_done(struct rpc_task *task, struct nfs_pgio_header *hdr) if (nfs3_async_handle_jukebox(task, inode)) return -EAGAIN; - if (task->tk_status >= 0 && !server->read_hdrsize) - cmpxchg(&server->read_hdrsize, 0, hdr->res.replen); + if (task->tk_status >= 0) { + if (!server->read_hdrsize) + cmpxchg(&server->read_hdrsize, 0, hdr->res.replen); + nfs3_localio_probe(server); + } nfs_invalidate_atime(inode); nfs_refresh_inode(inode, &hdr->fattr); @@ -885,8 +926,10 @@ static int nfs3_write_done(struct rpc_task *task, struct nfs_pgio_header *hdr) if (nfs3_async_handle_jukebox(task, inode)) return -EAGAIN; - if (task->tk_status >= 0) + if (task->tk_status >= 0) { nfs_writeback_update_inode(hdr); + nfs3_localio_probe(NFS_SERVER(inode)); + } return 0; } @@ -962,7 +1005,7 @@ nfs3_proc_lock(struct file *filp, int cmd, struct file_lock *fl) struct nfs_open_context *ctx = nfs_file_open_context(filp); int status; - if (fl->fl_flags & FL_CLOSE) { + if (fl->c.flc_flags & FL_CLOSE) { l_ctx = nfs_get_lock_context(ctx); if (IS_ERR(l_ctx)) l_ctx = NULL; @@ -978,13 +1021,21 @@ nfs3_proc_lock(struct file *filp, int cmd, struct file_lock *fl) return status; } -static int nfs3_have_delegation(struct inode *inode, fmode_t flags) +static int nfs3_have_delegation(struct inode *inode, fmode_t type, int flags) +{ + return 0; +} + +static int nfs3_return_delegation(struct inode *inode) { + if (S_ISREG(inode->i_mode)) + nfs_wb_all(inode); return 0; } static const struct inode_operations nfs3_dir_inode_operations = { .create = nfs_create, + .atomic_open = nfs_atomic_open_v23, .lookup = nfs_lookup, .link = nfs_link, .unlink = nfs_unlink, @@ -1060,6 +1111,7 @@ const struct nfs_rpc_ops nfs_v3_clientops = { .clear_acl_cache = forget_all_cached_acls, .close_context = nfs_close_context, .have_delegation = nfs3_have_delegation, + .return_delegation = nfs3_return_delegation, .alloc_client = nfs_alloc_client, .init_client = nfs_init_client, .free_client = nfs_free_client, diff --git a/fs/nfs/nfs3super.c b/fs/nfs/nfs3super.c index 8a9be9e47f76..20a80478449e 100644 --- a/fs/nfs/nfs3super.c +++ b/fs/nfs/nfs3super.c @@ -27,6 +27,7 @@ static void __exit exit_nfs_v3(void) unregister_nfs_version(&nfs_v3); } +MODULE_DESCRIPTION("NFSv3 client support"); MODULE_LICENSE("GPL"); module_init(init_nfs_v3); diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c index 3b0b650c9c5a..e17d72908412 100644 --- a/fs/nfs/nfs3xdr.c +++ b/fs/nfs/nfs3xdr.c @@ -21,14 +21,13 @@ #include <linux/nfs3.h> #include <linux/nfs_fs.h> #include <linux/nfsacl.h> -#include "nfstrace.h" +#include <linux/nfs_common.h> + #include "internal.h" +#include "nfstrace.h" #define NFSDBG_FACILITY NFSDBG_XDR -/* Mapping from NFS error code to "errno" error code. */ -#define errno_NFSERR_IO EIO - /* * Declare the space requirements for NFS arguments and replies as * number of 32bit-words @@ -91,8 +90,6 @@ NFS3_pagepad_sz) #define ACL3_setaclres_sz (1+NFS3_post_op_attr_sz) -static int nfs3_stat_to_errno(enum nfs_stat); - /* * Map file type to S_IFMT bits */ @@ -1406,7 +1403,7 @@ static int nfs3_xdr_dec_getattr3res(struct rpc_rqst *req, out: return error; out_default: - return nfs3_stat_to_errno(status); + return nfs_stat_to_errno(status); } /* @@ -1445,7 +1442,7 @@ static int nfs3_xdr_dec_setattr3res(struct rpc_rqst *req, out: return error; out_status: - return nfs3_stat_to_errno(status); + return nfs_stat_to_errno(status); } /* @@ -1495,7 +1492,7 @@ out_default: error = decode_post_op_attr(xdr, result->dir_attr, userns); if (unlikely(error)) goto out; - return nfs3_stat_to_errno(status); + return nfs_stat_to_errno(status); } /* @@ -1537,7 +1534,7 @@ static int nfs3_xdr_dec_access3res(struct rpc_rqst *req, out: return error; out_default: - return nfs3_stat_to_errno(status); + return nfs_stat_to_errno(status); } /* @@ -1578,7 +1575,7 @@ static int nfs3_xdr_dec_readlink3res(struct rpc_rqst *req, out: return error; out_default: - return nfs3_stat_to_errno(status); + return nfs_stat_to_errno(status); } /* @@ -1658,7 +1655,7 @@ static int nfs3_xdr_dec_read3res(struct rpc_rqst *req, struct xdr_stream *xdr, out: return error; out_status: - return nfs3_stat_to_errno(status); + return nfs_stat_to_errno(status); } /* @@ -1728,7 +1725,7 @@ static int nfs3_xdr_dec_write3res(struct rpc_rqst *req, struct xdr_stream *xdr, out: return error; out_status: - return nfs3_stat_to_errno(status); + return nfs_stat_to_errno(status); } /* @@ -1795,7 +1792,7 @@ out_default: error = decode_wcc_data(xdr, result->dir_attr, userns); if (unlikely(error)) goto out; - return nfs3_stat_to_errno(status); + return nfs_stat_to_errno(status); } /* @@ -1835,7 +1832,7 @@ static int nfs3_xdr_dec_remove3res(struct rpc_rqst *req, out: return error; out_status: - return nfs3_stat_to_errno(status); + return nfs_stat_to_errno(status); } /* @@ -1881,7 +1878,7 @@ static int nfs3_xdr_dec_rename3res(struct rpc_rqst *req, out: return error; out_status: - return nfs3_stat_to_errno(status); + return nfs_stat_to_errno(status); } /* @@ -1926,7 +1923,7 @@ static int nfs3_xdr_dec_link3res(struct rpc_rqst *req, struct xdr_stream *xdr, out: return error; out_status: - return nfs3_stat_to_errno(status); + return nfs_stat_to_errno(status); } /** @@ -1991,7 +1988,7 @@ int nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, error = decode_inline_filename3(xdr, &entry->name, &entry->len); if (unlikely(error)) - return -EAGAIN; + return error == -ENAMETOOLONG ? -ENAMETOOLONG : -EAGAIN; error = decode_cookie3(xdr, &new_cookie); if (unlikely(error)) @@ -2101,7 +2098,7 @@ out_default: error = decode_post_op_attr(xdr, result->dir_attr, rpc_rqst_userns(req)); if (unlikely(error)) goto out; - return nfs3_stat_to_errno(status); + return nfs_stat_to_errno(status); } /* @@ -2167,7 +2164,7 @@ static int nfs3_xdr_dec_fsstat3res(struct rpc_rqst *req, out: return error; out_status: - return nfs3_stat_to_errno(status); + return nfs_stat_to_errno(status); } /* @@ -2243,7 +2240,7 @@ static int nfs3_xdr_dec_fsinfo3res(struct rpc_rqst *req, out: return error; out_status: - return nfs3_stat_to_errno(status); + return nfs_stat_to_errno(status); } /* @@ -2304,7 +2301,7 @@ static int nfs3_xdr_dec_pathconf3res(struct rpc_rqst *req, out: return error; out_status: - return nfs3_stat_to_errno(status); + return nfs_stat_to_errno(status); } /* @@ -2350,7 +2347,7 @@ static int nfs3_xdr_dec_commit3res(struct rpc_rqst *req, out: return error; out_status: - return nfs3_stat_to_errno(status); + return nfs_stat_to_errno(status); } #ifdef CONFIG_NFS_V3_ACL @@ -2416,7 +2413,7 @@ static int nfs3_xdr_dec_getacl3res(struct rpc_rqst *req, out: return error; out_default: - return nfs3_stat_to_errno(status); + return nfs_stat_to_errno(status); } static int nfs3_xdr_dec_setacl3res(struct rpc_rqst *req, @@ -2435,76 +2432,11 @@ static int nfs3_xdr_dec_setacl3res(struct rpc_rqst *req, out: return error; out_default: - return nfs3_stat_to_errno(status); + return nfs_stat_to_errno(status); } #endif /* CONFIG_NFS_V3_ACL */ - -/* - * We need to translate between nfs status return values and - * the local errno values which may not be the same. - */ -static const struct { - int stat; - int errno; -} nfs_errtbl[] = { - { NFS_OK, 0 }, - { NFSERR_PERM, -EPERM }, - { NFSERR_NOENT, -ENOENT }, - { NFSERR_IO, -errno_NFSERR_IO}, - { NFSERR_NXIO, -ENXIO }, -/* { NFSERR_EAGAIN, -EAGAIN }, */ - { NFSERR_ACCES, -EACCES }, - { NFSERR_EXIST, -EEXIST }, - { NFSERR_XDEV, -EXDEV }, - { NFSERR_NODEV, -ENODEV }, - { NFSERR_NOTDIR, -ENOTDIR }, - { NFSERR_ISDIR, -EISDIR }, - { NFSERR_INVAL, -EINVAL }, - { NFSERR_FBIG, -EFBIG }, - { NFSERR_NOSPC, -ENOSPC }, - { NFSERR_ROFS, -EROFS }, - { NFSERR_MLINK, -EMLINK }, - { NFSERR_NAMETOOLONG, -ENAMETOOLONG }, - { NFSERR_NOTEMPTY, -ENOTEMPTY }, - { NFSERR_DQUOT, -EDQUOT }, - { NFSERR_STALE, -ESTALE }, - { NFSERR_REMOTE, -EREMOTE }, -#ifdef EWFLUSH - { NFSERR_WFLUSH, -EWFLUSH }, -#endif - { NFSERR_BADHANDLE, -EBADHANDLE }, - { NFSERR_NOT_SYNC, -ENOTSYNC }, - { NFSERR_BAD_COOKIE, -EBADCOOKIE }, - { NFSERR_NOTSUPP, -ENOTSUPP }, - { NFSERR_TOOSMALL, -ETOOSMALL }, - { NFSERR_SERVERFAULT, -EREMOTEIO }, - { NFSERR_BADTYPE, -EBADTYPE }, - { NFSERR_JUKEBOX, -EJUKEBOX }, - { -1, -EIO } -}; - -/** - * nfs3_stat_to_errno - convert an NFS status code to a local errno - * @status: NFS status code to convert - * - * Returns a local errno value, or -EIO if the NFS status code is - * not recognized. This function is used jointly by NFSv2 and NFSv3. - */ -static int nfs3_stat_to_errno(enum nfs_stat status) -{ - int i; - - for (i = 0; nfs_errtbl[i].stat != -1; i++) { - if (nfs_errtbl[i].stat == (int)status) - return nfs_errtbl[i].errno; - } - dprintk("NFS: Unrecognized nfs status value: %u\n", status); - return nfs_errtbl[i].errno; -} - - #define PROC(proc, argtype, restype, timer) \ [NFS3PROC_##proc] = { \ .p_proc = NFS3PROC_##proc, \ diff --git a/fs/nfs/nfs42.h b/fs/nfs/nfs42.h index 0fe5aacbcfdf..aafd15a4afce 100644 --- a/fs/nfs/nfs42.h +++ b/fs/nfs/nfs42.h @@ -13,6 +13,7 @@ * more? Need to consider not to pre-alloc too much for a compound. */ #define PNFS_LAYOUTSTATS_MAXDEV (4) +#define READ_PLUS_SCRATCH_SIZE (16) /* nfs4.2proc.c */ #ifdef CONFIG_NFS_V4_2 @@ -20,6 +21,7 @@ int nfs42_proc_allocate(struct file *, loff_t, loff_t); ssize_t nfs42_proc_copy(struct file *, loff_t, struct file *, loff_t, size_t, struct nl4_server *, nfs4_stateid *, bool); int nfs42_proc_deallocate(struct file *, loff_t, loff_t); +int nfs42_proc_zero_range(struct file *, loff_t, loff_t); loff_t nfs42_proc_llseek(struct file *, loff_t, int); int nfs42_proc_layoutstats_generic(struct nfs_server *, struct nfs42_layoutstat_data *); @@ -54,11 +56,14 @@ int nfs42_proc_removexattr(struct inode *inode, const char *name); * They would be 7 bytes long in the eventual buffer ("user.x\0"), and * 8 bytes long XDR-encoded. * - * Include the trailing eof word as well. + * Include the trailing eof word as well and make the result a multiple + * of 4 bytes. */ static inline u32 nfs42_listxattr_xdrsize(u32 buflen) { - return ((buflen / (XATTR_USER_PREFIX_LEN + 2)) * 8) + 4; + u32 size = 8 * buflen / (XATTR_USER_PREFIX_LEN + 2) + 4; + + return (size + 3) & ~3; } #endif /* CONFIG_NFS_V4_2 */ #endif /* __LINUX_FS_NFS_NFS4_2_H */ diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index 63802d195556..d537fb0c230e 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -21,6 +21,8 @@ #define NFSDBG_FACILITY NFSDBG_PROC static int nfs42_do_offload_cancel_async(struct file *dst, nfs4_stateid *std); +static int nfs42_proc_offload_status(struct file *file, nfs4_stateid *stateid, + u64 *copied); static void nfs42_set_netaddr(struct file *filep, struct nfs42_netaddr *naddr) { @@ -81,7 +83,8 @@ static int _nfs42_proc_fallocate(struct rpc_message *msg, struct file *filep, if (status == 0) { if (nfs_should_remove_suid(inode)) { spin_lock(&inode->i_lock); - nfs_set_cache_invalid(inode, NFS_INO_INVALID_MODE); + nfs_set_cache_invalid(inode, + NFS_INO_REVAL_FORCED | NFS_INO_INVALID_MODE); spin_unlock(&inode->i_lock); } status = nfs_post_op_update_inode_force_wcc(inode, @@ -111,6 +114,7 @@ static int nfs42_proc_fallocate(struct rpc_message *msg, struct file *filep, exception.inode = inode; exception.state = lock->open_context->state; + nfs_file_block_o_direct(NFS_I(inode)); err = nfs_sync_inode(inode); if (err) goto out; @@ -134,6 +138,7 @@ int nfs42_proc_allocate(struct file *filep, loff_t offset, loff_t len) .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_ALLOCATE], }; struct inode *inode = file_inode(filep); + loff_t oldsize = i_size_read(inode); int err; if (!nfs_server_capable(inode, NFS_CAP_ALLOCATE)) @@ -142,8 +147,13 @@ int nfs42_proc_allocate(struct file *filep, loff_t offset, loff_t len) inode_lock(inode); err = nfs42_proc_fallocate(&msg, filep, offset, len); - if (err == -EOPNOTSUPP) - NFS_SERVER(inode)->caps &= ~NFS_CAP_ALLOCATE; + + if (err == 0) + nfs_truncate_last_folio(inode->i_mapping, oldsize, + offset + len); + else if (err == -EOPNOTSUPP) + NFS_SERVER(inode)->caps &= ~(NFS_CAP_ALLOCATE | + NFS_CAP_ZERO_RANGE); inode_unlock(inode); return err; @@ -166,12 +176,53 @@ int nfs42_proc_deallocate(struct file *filep, loff_t offset, loff_t len) if (err == 0) truncate_pagecache_range(inode, offset, (offset + len) -1); if (err == -EOPNOTSUPP) - NFS_SERVER(inode)->caps &= ~NFS_CAP_DEALLOCATE; + NFS_SERVER(inode)->caps &= ~(NFS_CAP_DEALLOCATE | + NFS_CAP_ZERO_RANGE); inode_unlock(inode); return err; } +int nfs42_proc_zero_range(struct file *filep, loff_t offset, loff_t len) +{ + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_ZERO_RANGE], + }; + struct inode *inode = file_inode(filep); + loff_t oldsize = i_size_read(inode); + int err; + + if (!nfs_server_capable(inode, NFS_CAP_ZERO_RANGE)) + return -EOPNOTSUPP; + + inode_lock(inode); + + err = nfs42_proc_fallocate(&msg, filep, offset, len); + if (err == 0) { + nfs_truncate_last_folio(inode->i_mapping, oldsize, + offset + len); + truncate_pagecache_range(inode, offset, (offset + len) -1); + } else if (err == -EOPNOTSUPP) + NFS_SERVER(inode)->caps &= ~NFS_CAP_ZERO_RANGE; + + inode_unlock(inode); + return err; +} + +static void nfs4_copy_dequeue_callback(struct nfs_server *dst_server, + struct nfs_server *src_server, + struct nfs4_copy_state *copy) +{ + spin_lock(&dst_server->nfs_client->cl_lock); + list_del_init(©->copies); + spin_unlock(&dst_server->nfs_client->cl_lock); + if (dst_server != src_server) { + spin_lock(&src_server->nfs_client->cl_lock); + list_del_init(©->src_copies); + spin_unlock(&src_server->nfs_client->cl_lock); + } +} + static int handle_async_copy(struct nfs42_copy_res *res, struct nfs_server *dst_server, struct nfs_server *src_server, @@ -181,9 +232,12 @@ static int handle_async_copy(struct nfs42_copy_res *res, bool *restart) { struct nfs4_copy_state *copy, *tmp_copy = NULL, *iter; - int status = NFS4_OK; struct nfs_open_context *dst_ctx = nfs_file_open_context(dst); struct nfs_open_context *src_ctx = nfs_file_open_context(src); + struct nfs_client *clp = dst_server->nfs_client; + unsigned long timeout = 3 * HZ; + int status = NFS4_OK; + u64 copied; copy = kzalloc(sizeof(struct nfs4_copy_state), GFP_KERNEL); if (!copy) @@ -217,19 +271,16 @@ static int handle_async_copy(struct nfs42_copy_res *res, if (dst_server != src_server) { spin_lock(&src_server->nfs_client->cl_lock); - list_add_tail(©->src_copies, &src_server->ss_copies); + list_add_tail(©->src_copies, &src_server->ss_src_copies); spin_unlock(&src_server->nfs_client->cl_lock); } - status = wait_for_completion_interruptible(©->completion); - spin_lock(&dst_server->nfs_client->cl_lock); - list_del_init(©->copies); - spin_unlock(&dst_server->nfs_client->cl_lock); - if (dst_server != src_server) { - spin_lock(&src_server->nfs_client->cl_lock); - list_del_init(©->src_copies); - spin_unlock(&src_server->nfs_client->cl_lock); - } +wait: + status = wait_for_completion_interruptible_timeout(©->completion, + timeout); + if (!status) + goto timeout; + nfs4_copy_dequeue_callback(dst_server, src_server, copy); if (status == -ERESTARTSYS) { goto out_cancel; } else if (copy->flags || copy->error == NFS4ERR_PARTNER_NO_AUTH) { @@ -239,6 +290,7 @@ static int handle_async_copy(struct nfs42_copy_res *res, } out: res->write_res.count = copy->count; + /* Copy out the updated write verifier provided by CB_OFFLOAD. */ memcpy(&res->write_res.verifier, ©->verf, sizeof(copy->verf)); status = -copy->error; @@ -250,6 +302,39 @@ out_cancel: if (!nfs42_files_from_same_server(src, dst)) nfs42_do_offload_cancel_async(src, src_stateid); goto out_free; +timeout: + timeout <<= 1; + if (timeout > (clp->cl_lease_time >> 1)) + timeout = clp->cl_lease_time >> 1; + status = nfs42_proc_offload_status(dst, ©->stateid, &copied); + if (status == -EINPROGRESS) + goto wait; + nfs4_copy_dequeue_callback(dst_server, src_server, copy); + switch (status) { + case 0: + /* The server recognized the copy stateid, so it hasn't + * rebooted. Don't overwrite the verifier returned in the + * COPY result. */ + res->write_res.count = copied; + goto out_free; + case -EREMOTEIO: + /* COPY operation failed on the server. */ + status = -EOPNOTSUPP; + res->write_res.count = copied; + goto out_free; + case -EBADF: + /* Server did not recognize the copy stateid. It has + * probably restarted and lost the plot. */ + res->write_res.count = 0; + status = -EOPNOTSUPP; + break; + case -EOPNOTSUPP: + /* RFC 7862 REQUIREs server to support OFFLOAD_STATUS when + * it has signed up for an async COPY, so server is not + * spec-compliant. */ + res->write_res.count = 0; + } + goto out_free; } static int process_copy_commit(struct file *dst, loff_t pos_dst, @@ -278,22 +363,27 @@ out: /** * nfs42_copy_dest_done - perform inode cache updates after clone/copy offload - * @inode: pointer to destination inode + * @file: pointer to destination file * @pos: destination offset * @len: copy length + * @oldsize: length of the file prior to clone/copy * * Punch a hole in the inode page cache, so that the NFS client will * know to retrieve new data. * Update the file size if necessary, and then mark the inode as having * invalid cached values for change attribute, ctime, mtime and space used. */ -static void nfs42_copy_dest_done(struct inode *inode, loff_t pos, loff_t len) +static void nfs42_copy_dest_done(struct file *file, loff_t pos, loff_t len, + loff_t oldsize) { + struct inode *inode = file_inode(file); + struct address_space *mapping = file->f_mapping; loff_t newsize = pos + len; loff_t end = newsize - 1; - WARN_ON_ONCE(invalidate_inode_pages2_range(inode->i_mapping, - pos >> PAGE_SHIFT, end >> PAGE_SHIFT)); + nfs_truncate_last_folio(mapping, oldsize, pos); + WARN_ON_ONCE(invalidate_inode_pages2_range(mapping, pos >> PAGE_SHIFT, + end >> PAGE_SHIFT)); spin_lock(&inode->i_lock); if (newsize > i_size_read(inode)) @@ -326,6 +416,7 @@ static ssize_t _nfs42_proc_copy(struct file *src, struct nfs_server *src_server = NFS_SERVER(src_inode); loff_t pos_src = args->src_pos; loff_t pos_dst = args->dst_pos; + loff_t oldsize_dst = i_size_read(dst_inode); size_t count = args->count; ssize_t status; @@ -354,6 +445,7 @@ static ssize_t _nfs42_proc_copy(struct file *src, return status; } + nfs_file_block_o_direct(NFS_I(dst_inode)); status = nfs_sync_inode(dst_inode); if (status) return status; @@ -399,7 +491,7 @@ static ssize_t _nfs42_proc_copy(struct file *src, goto out; } - nfs42_copy_dest_done(dst_inode, pos_dst, res->write_res.count); + nfs42_copy_dest_done(dst, pos_dst, res->write_res.count, oldsize_dst); nfs_invalidate_atime(src_inode); status = res->write_res.count; out: @@ -471,8 +563,9 @@ ssize_t nfs42_proc_copy(struct file *src, loff_t pos_src, continue; } break; - } else if (err == -NFS4ERR_OFFLOAD_NO_REQS && !args.sync) { - args.sync = true; + } else if (err == -NFS4ERR_OFFLOAD_NO_REQS && + args.sync != res.synchronous) { + args.sync = res.synchronous; dst_exception.retry = 1; continue; } else if ((err == -ESTALE || @@ -496,15 +589,15 @@ out_put_src_lock: return err; } -struct nfs42_offloadcancel_data { +struct nfs42_offload_data { struct nfs_server *seq_server; struct nfs42_offload_status_args args; struct nfs42_offload_status_res res; }; -static void nfs42_offload_cancel_prepare(struct rpc_task *task, void *calldata) +static void nfs42_offload_prepare(struct rpc_task *task, void *calldata) { - struct nfs42_offloadcancel_data *data = calldata; + struct nfs42_offload_data *data = calldata; nfs4_setup_sequence(data->seq_server->nfs_client, &data->args.osa_seq_args, @@ -513,7 +606,7 @@ static void nfs42_offload_cancel_prepare(struct rpc_task *task, void *calldata) static void nfs42_offload_cancel_done(struct rpc_task *task, void *calldata) { - struct nfs42_offloadcancel_data *data = calldata; + struct nfs42_offload_data *data = calldata; trace_nfs4_offload_cancel(&data->args, task->tk_status); nfs41_sequence_done(task, &data->res.osr_seq_res); @@ -523,22 +616,22 @@ static void nfs42_offload_cancel_done(struct rpc_task *task, void *calldata) rpc_restart_call_prepare(task); } -static void nfs42_free_offloadcancel_data(void *data) +static void nfs42_offload_release(void *data) { kfree(data); } static const struct rpc_call_ops nfs42_offload_cancel_ops = { - .rpc_call_prepare = nfs42_offload_cancel_prepare, + .rpc_call_prepare = nfs42_offload_prepare, .rpc_call_done = nfs42_offload_cancel_done, - .rpc_release = nfs42_free_offloadcancel_data, + .rpc_release = nfs42_offload_release, }; static int nfs42_do_offload_cancel_async(struct file *dst, nfs4_stateid *stateid) { struct nfs_server *dst_server = NFS_SERVER(file_inode(dst)); - struct nfs42_offloadcancel_data *data = NULL; + struct nfs42_offload_data *data = NULL; struct nfs_open_context *ctx = nfs_file_open_context(dst); struct rpc_task *task; struct rpc_message msg = { @@ -550,14 +643,14 @@ static int nfs42_do_offload_cancel_async(struct file *dst, .rpc_message = &msg, .callback_ops = &nfs42_offload_cancel_ops, .workqueue = nfsiod_workqueue, - .flags = RPC_TASK_ASYNC, + .flags = RPC_TASK_ASYNC | RPC_TASK_MOVEABLE, }; int status; if (!(dst_server->caps & NFS_CAP_OFFLOAD_CANCEL)) return -EOPNOTSUPP; - data = kzalloc(sizeof(struct nfs42_offloadcancel_data), GFP_KERNEL); + data = kzalloc(sizeof(struct nfs42_offload_data), GFP_KERNEL); if (data == NULL) return -ENOMEM; @@ -580,6 +673,108 @@ static int nfs42_do_offload_cancel_async(struct file *dst, return status; } +static int +_nfs42_proc_offload_status(struct nfs_server *server, struct file *file, + struct nfs42_offload_data *data) +{ + struct nfs_open_context *ctx = nfs_file_open_context(file); + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OFFLOAD_STATUS], + .rpc_argp = &data->args, + .rpc_resp = &data->res, + .rpc_cred = ctx->cred, + }; + int status; + + status = nfs4_call_sync(server->client, server, &msg, + &data->args.osa_seq_args, + &data->res.osr_seq_res, 1); + trace_nfs4_offload_status(&data->args, status); + switch (status) { + case 0: + break; + + case -NFS4ERR_ADMIN_REVOKED: + case -NFS4ERR_BAD_STATEID: + case -NFS4ERR_OLD_STATEID: + /* + * Server does not recognize the COPY stateid. CB_OFFLOAD + * could have purged it, or server might have rebooted. + * Since COPY stateids don't have an associated inode, + * avoid triggering state recovery. + */ + status = -EBADF; + break; + case -NFS4ERR_NOTSUPP: + case -ENOTSUPP: + case -EOPNOTSUPP: + server->caps &= ~NFS_CAP_OFFLOAD_STATUS; + status = -EOPNOTSUPP; + break; + } + + return status; +} + +/** + * nfs42_proc_offload_status - Poll completion status of an async copy operation + * @dst: handle of file being copied into + * @stateid: copy stateid (from async COPY result) + * @copied: OUT: number of bytes copied so far + * + * Return values: + * %0: Server returned an NFS4_OK completion status + * %-EINPROGRESS: Server returned no completion status + * %-EREMOTEIO: Server returned an error completion status + * %-EBADF: Server did not recognize the copy stateid + * %-EOPNOTSUPP: Server does not support OFFLOAD_STATUS + * %-ERESTARTSYS: Wait interrupted by signal + * + * Other negative errnos indicate the client could not complete the + * request. + */ +static int +nfs42_proc_offload_status(struct file *dst, nfs4_stateid *stateid, u64 *copied) +{ + struct inode *inode = file_inode(dst); + struct nfs_server *server = NFS_SERVER(inode); + struct nfs4_exception exception = { + .inode = inode, + }; + struct nfs42_offload_data *data; + int status; + + if (!(server->caps & NFS_CAP_OFFLOAD_STATUS)) + return -EOPNOTSUPP; + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + return -ENOMEM; + data->seq_server = server; + data->args.osa_src_fh = NFS_FH(inode); + memcpy(&data->args.osa_stateid, stateid, + sizeof(data->args.osa_stateid)); + exception.stateid = &data->args.osa_stateid; + do { + status = _nfs42_proc_offload_status(server, dst, data); + if (status == -EOPNOTSUPP) + goto out; + status = nfs4_handle_exception(server, status, &exception); + } while (exception.retry); + if (status) + goto out; + + *copied = data->res.osr_count; + if (!data->res.complete_count) + status = -EINPROGRESS; + else if (data->res.osr_complete != NFS_OK) + status = -EREMOTEIO; + +out: + kfree(data); + return status; +} + static int _nfs42_proc_copy_notify(struct file *src, struct file *dst, struct nfs42_copy_notify_args *args, struct nfs42_copy_notify_res *res) @@ -859,7 +1054,7 @@ int nfs42_proc_layoutstats_generic(struct nfs_server *server, .rpc_message = &msg, .callback_ops = &nfs42_layoutstat_ops, .callback_data = data, - .flags = RPC_TASK_ASYNC, + .flags = RPC_TASK_ASYNC | RPC_TASK_MOVEABLE, }; struct rpc_task *task; @@ -1014,7 +1209,7 @@ int nfs42_proc_layouterror(struct pnfs_layout_segment *lseg, struct rpc_task_setup task_setup = { .rpc_message = &msg, .callback_ops = &nfs42_layouterror_ops, - .flags = RPC_TASK_ASYNC, + .flags = RPC_TASK_ASYNC | RPC_TASK_MOVEABLE, }; unsigned int i; @@ -1063,6 +1258,7 @@ static int _nfs42_proc_clone(struct rpc_message *msg, struct file *src_f, struct nfs42_clone_res res = { .server = server, }; + loff_t oldsize_dst = i_size_read(dst_inode); int status; msg->rpc_argp = &args; @@ -1097,7 +1293,7 @@ static int _nfs42_proc_clone(struct rpc_message *msg, struct file *src_f, /* a zero-length count means clone to EOF in src */ if (count == 0 && res.dst_fattr->valid & NFS_ATTR_FATTR_SIZE) count = nfs_size_to_loff_t(res.dst_fattr->size) - dst_offset; - nfs42_copy_dest_done(dst_inode, dst_offset, count); + nfs42_copy_dest_done(dst_f, dst_offset, count, oldsize_dst); status = nfs_post_op_update_inode(dst_inode, res.dst_fattr); } @@ -1318,7 +1514,7 @@ static ssize_t _nfs42_proc_listxattrs(struct inode *inode, void *buf, ret = -ENOMEM; - res.scratch = alloc_page(GFP_KERNEL); + res.scratch = folio_alloc(GFP_KERNEL, 0); if (!res.scratch) goto out; @@ -1356,7 +1552,7 @@ out_free_pages: } kfree(pages); out_free_scratch: - __free_page(res.scratch); + folio_put(res.scratch); out: return ret; @@ -1377,7 +1573,6 @@ ssize_t nfs42_proc_getxattr(struct inode *inode, const char *name, for (i = 0; i < np; i++) { pages[i] = alloc_page(GFP_KERNEL); if (!pages[i]) { - np = i + 1; err = -ENOMEM; goto out; } @@ -1401,8 +1596,8 @@ ssize_t nfs42_proc_getxattr(struct inode *inode, const char *name, } while (exception.retry); out: - while (--np >= 0) - __free_page(pages[np]); + while (--i >= 0) + __free_page(pages[i]); kfree(pages); return err; diff --git a/fs/nfs/nfs42xattr.c b/fs/nfs/nfs42xattr.c index 911f634ba3da..37d79400e5f4 100644 --- a/fs/nfs/nfs42xattr.c +++ b/fs/nfs/nfs42xattr.c @@ -132,7 +132,7 @@ nfs4_xattr_entry_lru_add(struct nfs4_xattr_entry *entry) lru = (entry->flags & NFS4_XATTR_ENTRY_EXTVAL) ? &nfs4_xattr_large_entry_lru : &nfs4_xattr_entry_lru; - return list_lru_add(lru, &entry->lru); + return list_lru_add_obj(lru, &entry->lru); } static bool @@ -143,7 +143,7 @@ nfs4_xattr_entry_lru_del(struct nfs4_xattr_entry *entry) lru = (entry->flags & NFS4_XATTR_ENTRY_EXTVAL) ? &nfs4_xattr_large_entry_lru : &nfs4_xattr_entry_lru; - return list_lru_del(lru, &entry->lru); + return list_lru_del_obj(lru, &entry->lru); } /* @@ -349,7 +349,7 @@ nfs4_xattr_cache_unlink(struct inode *inode) oldcache = nfsi->xattr_cache; if (oldcache != NULL) { - list_lru_del(&nfs4_xattr_cache_lru, &oldcache->lru); + list_lru_del_obj(&nfs4_xattr_cache_lru, &oldcache->lru); oldcache->inode = NULL; } nfsi->xattr_cache = NULL; @@ -474,7 +474,7 @@ nfs4_xattr_get_cache(struct inode *inode, int add) kref_get(&cache->ref); nfsi->xattr_cache = cache; cache->inode = inode; - list_lru_add(&nfs4_xattr_cache_lru, &cache->lru); + list_lru_add_obj(&nfs4_xattr_cache_lru, &cache->lru); } spin_unlock(&inode->i_lock); @@ -796,32 +796,13 @@ static unsigned long nfs4_xattr_cache_scan(struct shrinker *shrink, static unsigned long nfs4_xattr_entry_scan(struct shrinker *shrink, struct shrink_control *sc); -static struct shrinker nfs4_xattr_cache_shrinker = { - .count_objects = nfs4_xattr_cache_count, - .scan_objects = nfs4_xattr_cache_scan, - .seeks = DEFAULT_SEEKS, - .flags = SHRINKER_MEMCG_AWARE, -}; - -static struct shrinker nfs4_xattr_entry_shrinker = { - .count_objects = nfs4_xattr_entry_count, - .scan_objects = nfs4_xattr_entry_scan, - .seeks = DEFAULT_SEEKS, - .batch = 512, - .flags = SHRINKER_MEMCG_AWARE, -}; - -static struct shrinker nfs4_xattr_large_entry_shrinker = { - .count_objects = nfs4_xattr_entry_count, - .scan_objects = nfs4_xattr_entry_scan, - .seeks = 1, - .batch = 512, - .flags = SHRINKER_MEMCG_AWARE, -}; +static struct shrinker *nfs4_xattr_cache_shrinker; +static struct shrinker *nfs4_xattr_entry_shrinker; +static struct shrinker *nfs4_xattr_large_entry_shrinker; static enum lru_status cache_lru_isolate(struct list_head *item, - struct list_lru_one *lru, spinlock_t *lru_lock, void *arg) + struct list_lru_one *lru, void *arg) { struct list_head *dispose = arg; struct inode *inode; @@ -886,7 +867,7 @@ nfs4_xattr_cache_count(struct shrinker *shrink, struct shrink_control *sc) static enum lru_status entry_lru_isolate(struct list_head *item, - struct list_lru_one *lru, spinlock_t *lru_lock, void *arg) + struct list_lru_one *lru, void *arg) { struct list_head *dispose = arg; struct nfs4_xattr_bucket *bucket; @@ -943,7 +924,7 @@ nfs4_xattr_entry_scan(struct shrinker *shrink, struct shrink_control *sc) struct nfs4_xattr_entry *entry; struct list_lru *lru; - lru = (shrink == &nfs4_xattr_large_entry_shrinker) ? + lru = (shrink == nfs4_xattr_large_entry_shrinker) ? &nfs4_xattr_large_entry_lru : &nfs4_xattr_entry_lru; freed = list_lru_shrink_walk(lru, sc, entry_lru_isolate, &dispose); @@ -971,7 +952,7 @@ nfs4_xattr_entry_count(struct shrinker *shrink, struct shrink_control *sc) unsigned long count; struct list_lru *lru; - lru = (shrink == &nfs4_xattr_large_entry_shrinker) ? + lru = (shrink == nfs4_xattr_large_entry_shrinker) ? &nfs4_xattr_large_entry_lru : &nfs4_xattr_entry_lru; count = list_lru_shrink_count(lru, sc); @@ -991,18 +972,34 @@ static void nfs4_xattr_cache_init_once(void *p) INIT_LIST_HEAD(&cache->dispose); } -static int nfs4_xattr_shrinker_init(struct shrinker *shrinker, - struct list_lru *lru, const char *name) +typedef unsigned long (*count_objects_cb)(struct shrinker *s, + struct shrink_control *sc); +typedef unsigned long (*scan_objects_cb)(struct shrinker *s, + struct shrink_control *sc); + +static int __init nfs4_xattr_shrinker_init(struct shrinker **shrinker, + struct list_lru *lru, const char *name, + count_objects_cb count, + scan_objects_cb scan, long batch, int seeks) { - int ret = 0; + int ret; - ret = register_shrinker(shrinker, name); - if (ret) + *shrinker = shrinker_alloc(SHRINKER_MEMCG_AWARE, name); + if (!*shrinker) + return -ENOMEM; + + ret = list_lru_init_memcg(lru, *shrinker); + if (ret) { + shrinker_free(*shrinker); return ret; + } - ret = list_lru_init_memcg(lru, shrinker); - if (ret) - unregister_shrinker(shrinker); + (*shrinker)->count_objects = count; + (*shrinker)->scan_objects = scan; + (*shrinker)->batch = batch; + (*shrinker)->seeks = seeks; + + shrinker_register(*shrinker); return ret; } @@ -1010,7 +1007,7 @@ static int nfs4_xattr_shrinker_init(struct shrinker *shrinker, static void nfs4_xattr_shrinker_destroy(struct shrinker *shrinker, struct list_lru *lru) { - unregister_shrinker(shrinker); + shrinker_free(shrinker); list_lru_destroy(lru); } @@ -1020,33 +1017,37 @@ int __init nfs4_xattr_cache_init(void) nfs4_xattr_cache_cachep = kmem_cache_create("nfs4_xattr_cache_cache", sizeof(struct nfs4_xattr_cache), 0, - (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD), + (SLAB_RECLAIM_ACCOUNT), nfs4_xattr_cache_init_once); if (nfs4_xattr_cache_cachep == NULL) return -ENOMEM; ret = nfs4_xattr_shrinker_init(&nfs4_xattr_cache_shrinker, - &nfs4_xattr_cache_lru, - "nfs-xattr_cache"); + &nfs4_xattr_cache_lru, "nfs-xattr_cache", + nfs4_xattr_cache_count, + nfs4_xattr_cache_scan, 0, DEFAULT_SEEKS); if (ret) goto out1; ret = nfs4_xattr_shrinker_init(&nfs4_xattr_entry_shrinker, - &nfs4_xattr_entry_lru, - "nfs-xattr_entry"); + &nfs4_xattr_entry_lru, "nfs-xattr_entry", + nfs4_xattr_entry_count, + nfs4_xattr_entry_scan, 512, DEFAULT_SEEKS); if (ret) goto out2; ret = nfs4_xattr_shrinker_init(&nfs4_xattr_large_entry_shrinker, &nfs4_xattr_large_entry_lru, - "nfs-xattr_large_entry"); + "nfs-xattr_large_entry", + nfs4_xattr_entry_count, + nfs4_xattr_entry_scan, 512, 1); if (!ret) return 0; - nfs4_xattr_shrinker_destroy(&nfs4_xattr_entry_shrinker, + nfs4_xattr_shrinker_destroy(nfs4_xattr_entry_shrinker, &nfs4_xattr_entry_lru); out2: - nfs4_xattr_shrinker_destroy(&nfs4_xattr_cache_shrinker, + nfs4_xattr_shrinker_destroy(nfs4_xattr_cache_shrinker, &nfs4_xattr_cache_lru); out1: kmem_cache_destroy(nfs4_xattr_cache_cachep); @@ -1056,11 +1057,11 @@ out1: void nfs4_xattr_cache_exit(void) { - nfs4_xattr_shrinker_destroy(&nfs4_xattr_large_entry_shrinker, + nfs4_xattr_shrinker_destroy(nfs4_xattr_large_entry_shrinker, &nfs4_xattr_large_entry_lru); - nfs4_xattr_shrinker_destroy(&nfs4_xattr_entry_shrinker, + nfs4_xattr_shrinker_destroy(nfs4_xattr_entry_shrinker, &nfs4_xattr_entry_lru); - nfs4_xattr_shrinker_destroy(&nfs4_xattr_cache_shrinker, + nfs4_xattr_shrinker_destroy(nfs4_xattr_cache_shrinker, &nfs4_xattr_cache_lru); kmem_cache_destroy(nfs4_xattr_cache_cachep); } diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c index 95234208dc9e..e10d83ba835e 100644 --- a/fs/nfs/nfs42xdr.c +++ b/fs/nfs/nfs42xdr.c @@ -35,6 +35,11 @@ #define encode_offload_cancel_maxsz (op_encode_hdr_maxsz + \ XDR_QUADLEN(NFS4_STATEID_SIZE)) #define decode_offload_cancel_maxsz (op_decode_hdr_maxsz) +#define encode_offload_status_maxsz (op_encode_hdr_maxsz + \ + XDR_QUADLEN(NFS4_STATEID_SIZE)) +#define decode_offload_status_maxsz (op_decode_hdr_maxsz + \ + 2 /* osr_count */ + \ + 2 /* osr_complete */) #define encode_copy_notify_maxsz (op_encode_hdr_maxsz + \ XDR_QUADLEN(NFS4_STATEID_SIZE) + \ 1 + /* nl4_type */ \ @@ -54,10 +59,16 @@ (1 /* data_content4 */ + \ 2 /* data_info4.di_offset */ + \ 1 /* data_info4.di_length */) +#define NFS42_READ_PLUS_HOLE_SEGMENT_SIZE \ + (1 /* data_content4 */ + \ + 2 /* data_info4.di_offset */ + \ + 2 /* data_info4.di_length */) +#define READ_PLUS_SEGMENT_SIZE_DIFF (NFS42_READ_PLUS_HOLE_SEGMENT_SIZE - \ + NFS42_READ_PLUS_DATA_SEGMENT_SIZE) #define decode_read_plus_maxsz (op_decode_hdr_maxsz + \ 1 /* rpr_eof */ + \ 1 /* rpr_contents count */ + \ - NFS42_READ_PLUS_DATA_SEGMENT_SIZE) + NFS42_READ_PLUS_HOLE_SEGMENT_SIZE) #define encode_seek_maxsz (op_encode_hdr_maxsz + \ encode_stateid_maxsz + \ 2 /* offset */ + \ @@ -137,10 +148,20 @@ decode_sequence_maxsz + \ decode_putfh_maxsz + \ decode_offload_cancel_maxsz) +#define NFS4_enc_offload_status_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + encode_offload_status_maxsz) +#define NFS4_dec_offload_status_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_offload_status_maxsz) #define NFS4_enc_copy_notify_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ encode_putfh_maxsz + \ encode_copy_notify_maxsz) #define NFS4_dec_copy_notify_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ decode_putfh_maxsz + \ decode_copy_notify_maxsz) #define NFS4_enc_deallocate_sz (compound_encode_hdr_maxsz + \ @@ -153,6 +174,18 @@ decode_putfh_maxsz + \ decode_deallocate_maxsz + \ decode_getattr_maxsz) +#define NFS4_enc_zero_range_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + encode_deallocate_maxsz + \ + encode_allocate_maxsz + \ + encode_getattr_maxsz) +#define NFS4_dec_zero_range_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_deallocate_maxsz + \ + decode_allocate_maxsz + \ + decode_getattr_maxsz) #define NFS4_enc_read_plus_sz (compound_encode_hdr_maxsz + \ encode_sequence_maxsz + \ encode_putfh_maxsz + \ @@ -337,6 +370,14 @@ static void encode_offload_cancel(struct xdr_stream *xdr, encode_nfs4_stateid(xdr, &args->osa_stateid); } +static void encode_offload_status(struct xdr_stream *xdr, + const struct nfs42_offload_status_args *args, + struct compound_hdr *hdr) +{ + encode_op_hdr(xdr, OP_OFFLOAD_STATUS, decode_offload_status_maxsz, hdr); + encode_nfs4_stateid(xdr, &args->osa_stateid); +} + static void encode_copy_notify(struct xdr_stream *xdr, const struct nfs42_copy_notify_args *args, struct compound_hdr *hdr) @@ -543,7 +584,7 @@ static void nfs4_xdr_enc_copy(struct rpc_rqst *req, } /* - * Encode OFFLOAD_CANEL request + * Encode OFFLOAD_CANCEL request */ static void nfs4_xdr_enc_offload_cancel(struct rpc_rqst *req, struct xdr_stream *xdr, @@ -562,6 +603,25 @@ static void nfs4_xdr_enc_offload_cancel(struct rpc_rqst *req, } /* + * Encode OFFLOAD_STATUS request + */ +static void nfs4_xdr_enc_offload_status(struct rpc_rqst *req, + struct xdr_stream *xdr, + const void *data) +{ + const struct nfs42_offload_status_args *args = data; + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->osa_seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->osa_seq_args, &hdr); + encode_putfh(xdr, args->osa_src_fh, &hdr); + encode_offload_status(xdr, args, &hdr); + encode_nops(&hdr); +} + +/* * Encode COPY_NOTIFY request */ static void nfs4_xdr_enc_copy_notify(struct rpc_rqst *req, @@ -601,6 +661,27 @@ static void nfs4_xdr_enc_deallocate(struct rpc_rqst *req, } /* + * Encode ZERO_RANGE request + */ +static void nfs4_xdr_enc_zero_range(struct rpc_rqst *req, + struct xdr_stream *xdr, + const void *data) +{ + const struct nfs42_falloc_args *args = data; + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->falloc_fh, &hdr); + encode_deallocate(xdr, args, &hdr); + encode_allocate(xdr, args, &hdr); + encode_getfattr(xdr, args->falloc_bitmask, &hdr); + encode_nops(&hdr); +} + +/* * Encode READ_PLUS request */ static void nfs4_xdr_enc_read_plus(struct rpc_rqst *req, @@ -617,8 +698,8 @@ static void nfs4_xdr_enc_read_plus(struct rpc_rqst *req, encode_putfh(xdr, args->fh, &hdr); encode_read_plus(xdr, args, &hdr); - rpc_prepare_reply_pages(req, args->pages, args->pgbase, - args->count, hdr.replen); + rpc_prepare_reply_pages(req, args->pages, args->pgbase, args->count, + hdr.replen - READ_PLUS_SEGMENT_SIZE_DIFF); encode_nops(&hdr); } @@ -913,6 +994,26 @@ static int decode_offload_cancel(struct xdr_stream *xdr, return decode_op_hdr(xdr, OP_OFFLOAD_CANCEL); } +static int decode_offload_status(struct xdr_stream *xdr, + struct nfs42_offload_status_res *res) +{ + ssize_t result; + int status; + + status = decode_op_hdr(xdr, OP_OFFLOAD_STATUS); + if (status) + return status; + /* osr_count */ + if (xdr_stream_decode_u64(xdr, &res->osr_count) < 0) + return -EIO; + /* osr_complete<1> */ + result = xdr_stream_decode_uint32_array(xdr, &res->osr_complete, 1); + if (result < 0) + return -EIO; + res->complete_count = result; + return 0; +} + static int decode_copy_notify(struct xdr_stream *xdr, struct nfs42_copy_notify_res *res) { @@ -1056,13 +1157,12 @@ static int decode_read_plus(struct xdr_stream *xdr, struct nfs_pgio_res *res) res->eof = be32_to_cpup(p++); segments = be32_to_cpup(p++); if (segments == 0) - return status; + return 0; segs = kmalloc_array(segments, sizeof(*segs), GFP_KERNEL); if (!segs) return -ENOMEM; - status = -EIO; for (i = 0; i < segments; i++) { status = decode_read_plus_segment(xdr, &segs[i]); if (status < 0) @@ -1364,6 +1464,32 @@ out: } /* + * Decode OFFLOAD_STATUS response + */ +static int nfs4_xdr_dec_offload_status(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + void *data) +{ + struct nfs42_offload_status_res *res = data; + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->osr_seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_offload_status(xdr, res); + +out: + return status; +} + +/* * Decode COPY_NOTIFY response */ static int nfs4_xdr_dec_copy_notify(struct rpc_rqst *rqstp, @@ -1418,6 +1544,37 @@ out: } /* + * Decode ZERO_RANGE request + */ +static int nfs4_xdr_dec_zero_range(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + void *data) +{ + struct nfs42_falloc_res *res = data; + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_deallocate(xdr, res); + if (status) + goto out; + status = decode_allocate(xdr, res); + if (status) + goto out; + decode_getfattr(xdr, res->falloc_fattr, res->falloc_server); +out: + return status; +} + +/* * Decode READ_PLUS request */ static int nfs4_xdr_dec_read_plus(struct rpc_rqst *rqstp, @@ -1428,7 +1585,7 @@ static int nfs4_xdr_dec_read_plus(struct rpc_rqst *rqstp, struct compound_hdr hdr; int status; - xdr_set_scratch_buffer(xdr, res->scratch, sizeof(res->scratch)); + xdr_set_scratch_buffer(xdr, res->scratch, READ_PLUS_SCRATCH_SIZE); status = decode_compound_hdr(xdr, &hdr); if (status) @@ -1624,7 +1781,7 @@ static int nfs4_xdr_dec_listxattrs(struct rpc_rqst *rqstp, struct compound_hdr hdr; int status; - xdr_set_scratch_page(xdr, res->scratch); + xdr_set_scratch_folio(xdr, res->scratch); status = decode_compound_hdr(xdr, &hdr); if (status) diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 4c9f8bd866ab..c34c89af9c7d 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -63,11 +63,11 @@ struct nfs4_minor_version_ops { bool (*match_stateid)(const nfs4_stateid *, const nfs4_stateid *); int (*find_root_sec)(struct nfs_server *, struct nfs_fh *, - struct nfs_fsinfo *); + struct nfs_fattr *); void (*free_lock_state)(struct nfs_server *, struct nfs4_lock_state *); int (*test_and_free_expired)(struct nfs_server *, - nfs4_stateid *, const struct cred *); + nfs4_stateid *, const struct cred *); struct nfs_seqid * (*alloc_seqid)(struct nfs_seqid_counter *, gfp_t); void (*session_trunk)(struct rpc_clnt *clnt, @@ -82,7 +82,7 @@ struct nfs4_minor_version_ops { #define NFS_SEQID_CONFIRMED 1 struct nfs_seqid_counter { ktime_t create_time; - int owner_id; + u64 owner_id; int flags; u32 counter; spinlock_t lock; /* Protects the list */ @@ -120,7 +120,6 @@ struct nfs4_state_owner { unsigned long so_flags; struct list_head so_states; struct nfs_seqid_counter so_seqid; - seqcount_spinlock_t so_reclaim_seqcount; struct mutex so_delegreturn_mutex; }; @@ -209,6 +208,7 @@ struct nfs4_exception { struct inode *inode; nfs4_stateid *stateid; long timeout; + unsigned short retrans; unsigned char task_is_privileged : 1; unsigned char delay : 1, recovering : 1, @@ -296,7 +296,8 @@ extern int nfs4_call_sync(struct rpc_clnt *, struct nfs_server *, extern void nfs4_init_sequence(struct nfs4_sequence_args *, struct nfs4_sequence_res *, int, int); extern int nfs4_proc_setclientid(struct nfs_client *, u32, unsigned short, const struct cred *, struct nfs4_setclientid_res *); extern int nfs4_proc_setclientid_confirm(struct nfs_client *, struct nfs4_setclientid_res *arg, const struct cred *); -extern int nfs4_proc_get_rootfh(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *, bool); +extern int nfs4_proc_get_rootfh(struct nfs_server *, struct nfs_fh *, + struct nfs_fattr *, bool); extern int nfs4_proc_bind_conn_to_session(struct nfs_client *, const struct cred *cred); extern int nfs4_proc_exchange_id(struct nfs_client *clp, const struct cred *cred); extern int nfs4_destroy_clientid(struct nfs_client *clp); @@ -315,7 +316,7 @@ extern struct rpc_clnt *nfs4_proc_lookup_mountpoint(struct inode *, struct nfs_fh *, struct nfs_fattr *); extern int nfs4_proc_secinfo(struct inode *, const struct qstr *, struct nfs4_secinfo_flavors *); -extern const struct xattr_handler *nfs4_xattr_handlers[]; +extern const struct xattr_handler * const nfs4_xattr_handlers[]; extern int nfs4_set_rw_stateid(nfs4_stateid *stateid, const struct nfs_open_context *ctx, const struct nfs_lock_context *l_ctx, @@ -328,8 +329,8 @@ extern int update_open_stateid(struct nfs4_state *state, const nfs4_stateid *open_stateid, const nfs4_stateid *deleg_stateid, fmode_t fmode); -extern int nfs4_proc_setlease(struct file *file, long arg, - struct file_lock **lease, void **priv); +extern int nfs4_proc_setlease(struct file *file, int arg, + struct file_lease **lease, void **priv); extern int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo); extern void nfs4_update_changeattr(struct inode *dir, @@ -546,6 +547,7 @@ extern unsigned short max_session_slots; extern unsigned short max_session_cb_slots; extern unsigned short send_implementation_id; extern bool recover_lost_locks; +extern short nfs_delay_retrans; #define NFS4_CLIENT_ID_UNIQ_LEN (64) extern char nfs4_client_id_uniquifier[NFS4_CLIENT_ID_UNIQ_LEN]; diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index d9114a754db7..3a4baed993c9 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -11,6 +11,7 @@ #include <linux/sunrpc/xprt.h> #include <linux/sunrpc/bc_xprt.h> #include <linux/sunrpc/rpc_pipe_fs.h> +#include <net/handshake.h> #include "internal.h" #include "callback.h" #include "delegation.h" @@ -222,6 +223,7 @@ struct nfs_client *nfs4_alloc_client(const struct nfs_client_initdata *cl_init) clp->cl_state = 1 << NFS4CLNT_LEASE_EXPIRED; clp->cl_mvops = nfs_v4_minor_ops[cl_init->minorversion]; clp->cl_mig_gen = 1; + clp->cl_last_renewal = jiffies; #if IS_ENABLED(CONFIG_NFS_V4_1) init_waitqueue_head(&clp->cl_lock_waitq); #endif @@ -231,7 +233,10 @@ struct nfs_client *nfs4_alloc_client(const struct nfs_client_initdata *cl_init) __set_bit(NFS_CS_INFINITE_SLOTS, &clp->cl_flags); __set_bit(NFS_CS_DISCRTRY, &clp->cl_flags); __set_bit(NFS_CS_NO_RETRANS_TIMEOUT, &clp->cl_flags); - + if (test_bit(NFS_CS_PNFS, &cl_init->init_flags)) + __set_bit(NFS_CS_PNFS, &clp->cl_flags); + if (test_bit(NFS_CS_NETUNREACH_FATAL, &cl_init->init_flags)) + __set_bit(NFS_CS_NETUNREACH_FATAL, &clp->cl_flags); /* * Set up the connection to the server before we add add to the * global list. @@ -415,6 +420,8 @@ static void nfs4_add_trunk(struct nfs_client *clp, struct nfs_client *old) .net = old->cl_net, .servername = old->cl_hostname, }; + int max_connect = test_bit(NFS_CS_PNFS, &clp->cl_flags) ? + clp->cl_max_connect : old->cl_max_connect; if (clp->cl_proto != old->cl_proto) return; @@ -428,7 +435,7 @@ static void nfs4_add_trunk(struct nfs_client *clp, struct nfs_client *old) xprt_args.addrlen = clp_salen; rpc_clnt_add_xprt(old->cl_rpcclient, &xprt_args, - rpc_clnt_test_and_add_xprt, NULL); + rpc_clnt_test_and_add_xprt, &max_connect); } /** @@ -797,6 +804,7 @@ static void nfs4_destroy_server(struct nfs_server *server) unset_pnfs_layoutdriver(server); nfs4_purge_state_owners(server, &freeme); nfs4_free_state_owners(&freeme); + kfree(server->delegation_hash_table); } /* @@ -890,51 +898,40 @@ nfs4_find_client_sessionid(struct net *net, const struct sockaddr *addr, * Set up an NFS4 client */ static int nfs4_set_client(struct nfs_server *server, - const char *hostname, - const struct sockaddr_storage *addr, - const size_t addrlen, - const char *ip_addr, - int proto, const struct rpc_timeout *timeparms, - u32 minorversion, unsigned int nconnect, - unsigned int max_connect, - struct net *net, - struct xprtsec_parms *xprtsec) + struct nfs_client_initdata *cl_init) { - struct nfs_client_initdata cl_init = { - .hostname = hostname, - .addr = addr, - .addrlen = addrlen, - .ip_addr = ip_addr, - .nfs_mod = &nfs_v4, - .proto = proto, - .minorversion = minorversion, - .net = net, - .timeparms = timeparms, - .cred = server->cred, - .xprtsec = *xprtsec, - }; struct nfs_client *clp; - if (minorversion == 0) - __set_bit(NFS_CS_REUSEPORT, &cl_init.init_flags); - else - cl_init.max_connect = max_connect; - switch (proto) { + cl_init->nfs_mod = &nfs_v4; + cl_init->cred = server->cred; + + if (cl_init->minorversion == 0) { + __set_bit(NFS_CS_REUSEPORT, &cl_init->init_flags); + cl_init->max_connect = 0; + } + + switch (cl_init->proto) { + case XPRT_TRANSPORT_RDMA: case XPRT_TRANSPORT_TCP: case XPRT_TRANSPORT_TCP_TLS: - cl_init.nconnect = nconnect; + break; + default: + cl_init->nconnect = 0; } if (server->flags & NFS_MOUNT_NORESVPORT) - __set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags); + __set_bit(NFS_CS_NORESVPORT, &cl_init->init_flags); if (server->options & NFS_OPTION_MIGRATION) - __set_bit(NFS_CS_MIGRATION, &cl_init.init_flags); + __set_bit(NFS_CS_MIGRATION, &cl_init->init_flags); if (test_bit(NFS_MIG_TSM_POSSIBLE, &server->mig_status)) - __set_bit(NFS_CS_TSM_POSSIBLE, &cl_init.init_flags); - server->port = rpc_get_port((struct sockaddr *)addr); + __set_bit(NFS_CS_TSM_POSSIBLE, &cl_init->init_flags); + server->port = rpc_get_port((struct sockaddr *)cl_init->addr); + + if (server->flags & NFS_MOUNT_NETUNREACH_FATAL) + __set_bit(NFS_CS_NETUNREACH_FATAL, &cl_init->init_flags); /* Allocate or find a client reference we can use */ - clp = nfs_get_client(&cl_init); + clp = nfs_get_client(cl_init); if (IS_ERR(clp)) return PTR_ERR(clp); @@ -987,7 +984,11 @@ struct nfs_client *nfs4_set_ds_client(struct nfs_server *mds_srv, .net = mds_clp->cl_net, .timeparms = &ds_timeout, .cred = mds_srv->cred, - .xprtsec = mds_srv->nfs_client->cl_xprtsec, + .xprtsec = { + .policy = RPC_XPRTSEC_NONE, + .cert_serial = TLS_NO_CERT, + .privkey_serial = TLS_NO_PRIVKEY, + }, }; char buf[INET6_ADDRSTRLEN + 1]; @@ -996,8 +997,14 @@ struct nfs_client *nfs4_set_ds_client(struct nfs_server *mds_srv, cl_init.hostname = buf; switch (ds_proto) { - case XPRT_TRANSPORT_TCP: case XPRT_TRANSPORT_TCP_TLS: + if (mds_srv->nfs_client->cl_xprtsec.policy != RPC_XPRTSEC_NONE) + cl_init.xprtsec = mds_srv->nfs_client->cl_xprtsec; + else + ds_proto = XPRT_TRANSPORT_TCP; + fallthrough; + case XPRT_TRANSPORT_RDMA: + case XPRT_TRANSPORT_TCP: if (mds_clp->cl_nconnect > 1) { cl_init.nconnect = mds_clp->cl_nconnect; cl_init.max_connect = NFS_MAX_TRANSPORTS; @@ -1006,7 +1013,11 @@ struct nfs_client *nfs4_set_ds_client(struct nfs_server *mds_srv, if (mds_srv->flags & NFS_MOUNT_NORESVPORT) __set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags); + if (test_bit(NFS_CS_NETUNREACH_FATAL, &mds_clp->cl_flags)) + __set_bit(NFS_CS_NETUNREACH_FATAL, &cl_init.init_flags); + __set_bit(NFS_CS_PNFS, &cl_init.init_flags); + cl_init.max_connect = NFS_MAX_TRANSPORTS; /* * Set an authflavor equual to the MDS value. Use the MDS nfs_client * cl_ipaddr so as to use the same EXCHANGE_ID co_ownerid as the MDS @@ -1074,29 +1085,15 @@ static void nfs4_session_limit_xasize(struct nfs_server *server) #endif } -void nfs4_server_set_init_caps(struct nfs_server *server) -{ - /* Set the basic capabilities */ - server->caps |= server->nfs_client->cl_mvops->init_caps; - if (server->flags & NFS_MOUNT_NORDIRPLUS) - server->caps &= ~NFS_CAP_READDIRPLUS; - if (server->nfs_client->cl_proto == XPRT_TRANSPORT_RDMA) - server->caps &= ~NFS_CAP_READ_PLUS; - - /* - * Don't use NFS uid/gid mapping if we're using AUTH_SYS or lower - * authentication. - */ - if (nfs4_disable_idmapping && - server->client->cl_auth->au_flavor == RPC_AUTH_UNIX) - server->caps |= NFS_CAP_UIDGID_NOMAP; -} - static int nfs4_server_common_setup(struct nfs_server *server, struct nfs_fh *mntfh, bool auth_probe) { int error; + error = nfs4_delegation_hash_alloc(server); + if (error) + return error; + /* data servers support only a subset of NFSv4.1 */ if (is_ds_only_client(server->nfs_client)) return -EPROTONOSUPPORT; @@ -1104,14 +1101,14 @@ static int nfs4_server_common_setup(struct nfs_server *server, /* We must ensure the session is initialised first */ error = nfs4_init_session(server->nfs_client); if (error < 0) - goto out; + return error; - nfs4_server_set_init_caps(server); + nfs_server_set_init_caps(server); /* Probe the root fh to retrieve its FSID and filehandle */ error = nfs4_get_rootfh(server, mntfh, auth_probe); if (error < 0) - goto out; + return error; dprintk("Server FSID: %llx:%llx\n", (unsigned long long) server->fsid.major, @@ -1120,7 +1117,7 @@ static int nfs4_server_common_setup(struct nfs_server *server, error = nfs_probe_server(server, mntfh); if (error < 0) - goto out; + return error; nfs4_session_limit_rwsize(server); nfs4_session_limit_xasize(server); @@ -1131,8 +1128,7 @@ static int nfs4_server_common_setup(struct nfs_server *server, nfs_server_insert_lists(server); server->mount_time = jiffies; server->destroy = nfs4_destroy_server; -out: - return error; + return 0; } /* @@ -1142,6 +1138,19 @@ static int nfs4_init_server(struct nfs_server *server, struct fs_context *fc) { struct nfs_fs_context *ctx = nfs_fc2context(fc); struct rpc_timeout timeparms; + struct nfs_client_initdata cl_init = { + .hostname = ctx->nfs_server.hostname, + .addr = &ctx->nfs_server._address, + .addrlen = ctx->nfs_server.addrlen, + .ip_addr = ctx->client_address, + .proto = ctx->nfs_server.protocol, + .minorversion = ctx->minorversion, + .net = fc->net_ns, + .timeparms = &timeparms, + .xprtsec = ctx->xprtsec, + .nconnect = ctx->nfs_server.nconnect, + .max_connect = ctx->nfs_server.max_connect, + }; int error; nfs_init_timeout_values(&timeparms, ctx->nfs_server.protocol, @@ -1161,18 +1170,7 @@ static int nfs4_init_server(struct nfs_server *server, struct fs_context *fc) ctx->selected_flavor = RPC_AUTH_UNIX; /* Get a client record */ - error = nfs4_set_client(server, - ctx->nfs_server.hostname, - &ctx->nfs_server._address, - ctx->nfs_server.addrlen, - ctx->client_address, - ctx->nfs_server.protocol, - &timeparms, - ctx->minorversion, - ctx->nfs_server.nconnect, - ctx->nfs_server.max_connect, - fc->net_ns, - &ctx->xprtsec); + error = nfs4_set_client(server, &cl_init); if (error < 0) return error; @@ -1232,18 +1230,28 @@ error: struct nfs_server *nfs4_create_referral_server(struct fs_context *fc) { struct nfs_fs_context *ctx = nfs_fc2context(fc); - struct nfs_client *parent_client; - struct nfs_server *server, *parent_server; - int proto, error; + struct nfs_server *parent_server = NFS_SB(ctx->clone_data.sb); + struct nfs_client *parent_client = parent_server->nfs_client; + struct nfs_client_initdata cl_init = { + .hostname = ctx->nfs_server.hostname, + .addr = &ctx->nfs_server._address, + .addrlen = ctx->nfs_server.addrlen, + .ip_addr = parent_client->cl_ipaddr, + .minorversion = parent_client->cl_mvops->minor_version, + .net = parent_client->cl_net, + .timeparms = parent_server->client->cl_timeout, + .xprtsec = parent_client->cl_xprtsec, + .nconnect = parent_client->cl_nconnect, + .max_connect = parent_client->cl_max_connect, + }; + struct nfs_server *server; bool auth_probe; + int error; server = nfs_alloc_server(); if (!server) return ERR_PTR(-ENOMEM); - parent_server = NFS_SB(ctx->clone_data.sb); - parent_client = parent_server->nfs_client; - server->cred = get_cred(parent_server->cred); /* Initialise the client representation from the parent server */ @@ -1252,38 +1260,17 @@ struct nfs_server *nfs4_create_referral_server(struct fs_context *fc) /* Get a client representation */ #if IS_ENABLED(CONFIG_SUNRPC_XPRT_RDMA) rpc_set_port(&ctx->nfs_server.address, NFS_RDMA_PORT); - error = nfs4_set_client(server, - ctx->nfs_server.hostname, - &ctx->nfs_server._address, - ctx->nfs_server.addrlen, - parent_client->cl_ipaddr, - XPRT_TRANSPORT_RDMA, - parent_server->client->cl_timeout, - parent_client->cl_mvops->minor_version, - parent_client->cl_nconnect, - parent_client->cl_max_connect, - parent_client->cl_net, - &parent_client->cl_xprtsec); + cl_init.proto = XPRT_TRANSPORT_RDMA; + error = nfs4_set_client(server, &cl_init); if (!error) goto init_server; #endif /* IS_ENABLED(CONFIG_SUNRPC_XPRT_RDMA) */ - proto = XPRT_TRANSPORT_TCP; + cl_init.proto = XPRT_TRANSPORT_TCP; if (parent_client->cl_xprtsec.policy != RPC_XPRTSEC_NONE) - proto = XPRT_TRANSPORT_TCP_TLS; + cl_init.proto = XPRT_TRANSPORT_TCP_TLS; rpc_set_port(&ctx->nfs_server.address, NFS_PORT); - error = nfs4_set_client(server, - ctx->nfs_server.hostname, - &ctx->nfs_server._address, - ctx->nfs_server.addrlen, - parent_client->cl_ipaddr, - proto, - parent_server->client->cl_timeout, - parent_client->cl_mvops->minor_version, - parent_client->cl_nconnect, - parent_client->cl_max_connect, - parent_client->cl_net, - &parent_client->cl_xprtsec); + error = nfs4_set_client(server, &cl_init); if (error < 0) goto error; @@ -1339,6 +1326,19 @@ int nfs4_update_server(struct nfs_server *server, const char *hostname, char buf[INET6_ADDRSTRLEN + 1]; struct sockaddr_storage address; struct sockaddr *localaddr = (struct sockaddr *)&address; + struct nfs_client_initdata cl_init = { + .hostname = hostname, + .addr = sap, + .addrlen = salen, + .ip_addr = buf, + .proto = clp->cl_proto, + .minorversion = clp->cl_minorversion, + .net = net, + .timeparms = clnt->cl_timeout, + .xprtsec = clp->cl_xprtsec, + .nconnect = clp->cl_nconnect, + .max_connect = clp->cl_max_connect, + }; int error; error = rpc_switch_client_transport(clnt, &xargs, clnt->cl_timeout); @@ -1354,11 +1354,7 @@ int nfs4_update_server(struct nfs_server *server, const char *hostname, nfs_server_remove_lists(server); set_bit(NFS_MIG_TSM_POSSIBLE, &server->mig_status); - error = nfs4_set_client(server, hostname, sap, salen, buf, - clp->cl_proto, clnt->cl_timeout, - clp->cl_minorversion, - clp->cl_nconnect, clp->cl_max_connect, - net, &clp->cl_xprtsec); + error = nfs4_set_client(server, &cl_init); clear_bit(NFS_MIG_TSM_POSSIBLE, &server->mig_status); if (error != 0) { nfs_server_insert_lists(server); diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index 4aeadd6e1a6d..7317f26892c5 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -10,6 +10,7 @@ #include <linux/mount.h> #include <linux/nfs_fs.h> #include <linux/nfs_ssc.h> +#include <linux/splice.h> #include "delegation.h" #include "internal.h" #include "iostat.h" @@ -195,8 +196,8 @@ static ssize_t nfs4_copy_file_range(struct file *file_in, loff_t pos_in, ret = __nfs4_copy_file_range(file_in, pos_in, file_out, pos_out, count, flags); if (ret == -EOPNOTSUPP || ret == -EXDEV) - ret = generic_copy_file_range(file_in, pos_in, file_out, - pos_out, count, flags); + ret = splice_copy_file_range(file_in, pos_in, file_out, + pos_out, count); return ret; } @@ -224,8 +225,14 @@ static long nfs42_fallocate(struct file *filep, int mode, loff_t offset, loff_t if (!S_ISREG(inode->i_mode)) return -EOPNOTSUPP; - if ((mode != 0) && (mode != (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE))) + switch (mode) { + case 0: + case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE: + case FALLOC_FL_ZERO_RANGE: + break; + default: return -EOPNOTSUPP; + } ret = inode_newsize_ok(inode, offset + len); if (ret < 0) @@ -233,6 +240,8 @@ static long nfs42_fallocate(struct file *filep, int mode, loff_t offset, loff_t if (mode & FALLOC_FL_PUNCH_HOLE) return nfs42_proc_deallocate(filep, offset, len); + else if (mode & FALLOC_FL_ZERO_RANGE) + return nfs42_proc_zero_range(filep, offset ,len); return nfs42_proc_allocate(filep, offset, len); } @@ -244,7 +253,6 @@ static loff_t nfs42_remap_file_range(struct file *src_file, loff_t src_off, struct nfs_server *server = NFS_SERVER(dst_inode); struct inode *src_inode = file_inode(src_file); unsigned int bs = server->clone_blksize; - bool same_inode = false; int ret; /* NFS does not support deduplication. */ @@ -266,25 +274,15 @@ static loff_t nfs42_remap_file_range(struct file *src_file, loff_t src_off, goto out; } - if (src_inode == dst_inode) - same_inode = true; - /* XXX: do we lock at all? what if server needs CB_RECALL_LAYOUT? */ - if (same_inode) { - inode_lock(src_inode); - } else if (dst_inode < src_inode) { - inode_lock_nested(dst_inode, I_MUTEX_PARENT); - inode_lock_nested(src_inode, I_MUTEX_CHILD); - } else { - inode_lock_nested(src_inode, I_MUTEX_PARENT); - inode_lock_nested(dst_inode, I_MUTEX_CHILD); - } - + lock_two_nondirectories(src_inode, dst_inode); /* flush all pending writes on both src and dst so that server * has the latest data */ + nfs_file_block_o_direct(NFS_I(src_inode)); ret = nfs_sync_inode(src_inode); if (ret) goto out_unlock; + nfs_file_block_o_direct(NFS_I(dst_inode)); ret = nfs_sync_inode(dst_inode); if (ret) goto out_unlock; @@ -297,15 +295,7 @@ static loff_t nfs42_remap_file_range(struct file *src_file, loff_t src_off, truncate_inode_pages_range(&dst_inode->i_data, dst_off, dst_off + count - 1); out_unlock: - if (same_inode) { - inode_unlock(src_inode); - } else if (dst_inode < src_inode) { - inode_unlock(src_inode); - inode_unlock(dst_inode); - } else { - inode_unlock(dst_inode); - inode_unlock(src_inode); - } + unlock_two_nondirectories(src_inode, dst_inode); out: return ret < 0 ? ret : count; } @@ -438,16 +428,18 @@ void nfs42_ssc_unregister_ops(void) } #endif /* CONFIG_NFS_V4_2 */ -static int nfs4_setlease(struct file *file, long arg, struct file_lock **lease, +static int nfs4_setlease(struct file *file, int arg, struct file_lease **lease, void **priv) { + if (!S_ISREG(file_inode(file)->i_mode)) + return -EINVAL; return nfs4_proc_setlease(file, arg, lease, priv); } const struct file_operations nfs4_file_operations = { .read_iter = nfs_file_read, .write_iter = nfs_file_write, - .mmap = nfs_file_mmap, + .mmap_prepare = nfs_file_mmap_prepare, .open = nfs4_file_open, .flush = nfs4_file_flush, .release = nfs_file_release, @@ -466,4 +458,5 @@ const struct file_operations nfs4_file_operations = { #else .llseek = nfs_file_llseek, #endif + .fop_flags = FOP_DONTCACHE, }; diff --git a/fs/nfs/nfs4getroot.c b/fs/nfs/nfs4getroot.c index 1a69479a3a59..e67ea345de69 100644 --- a/fs/nfs/nfs4getroot.c +++ b/fs/nfs/nfs4getroot.c @@ -12,30 +12,28 @@ int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh, bool auth_probe) { - struct nfs_fsinfo fsinfo; + struct nfs_fattr *fattr = nfs_alloc_fattr(); int ret = -ENOMEM; - fsinfo.fattr = nfs_alloc_fattr(); - if (fsinfo.fattr == NULL) + if (fattr == NULL) goto out; /* Start by getting the root filehandle from the server */ - ret = nfs4_proc_get_rootfh(server, mntfh, &fsinfo, auth_probe); + ret = nfs4_proc_get_rootfh(server, mntfh, fattr, auth_probe); if (ret < 0) { dprintk("nfs4_get_rootfh: getroot error = %d\n", -ret); goto out; } - if (!(fsinfo.fattr->valid & NFS_ATTR_FATTR_TYPE) - || !S_ISDIR(fsinfo.fattr->mode)) { + if (!(fattr->valid & NFS_ATTR_FATTR_TYPE) || !S_ISDIR(fattr->mode)) { printk(KERN_ERR "nfs4_get_rootfh:" " getroot encountered non-directory\n"); ret = -ENOTDIR; goto out; } - memcpy(&server->fsid, &fsinfo.fattr->fsid, sizeof(server->fsid)); + memcpy(&server->fsid, &fattr->fsid, sizeof(server->fsid)); out: - nfs_free_fattr(fsinfo.fattr); + nfs_free_fattr(fattr); return ret; } diff --git a/fs/nfs/nfs4idmap.c b/fs/nfs/nfs4idmap.c index 25a7c771cfd8..9e1c48c5c0b8 100644 --- a/fs/nfs/nfs4idmap.c +++ b/fs/nfs/nfs4idmap.c @@ -306,15 +306,12 @@ static ssize_t nfs_idmap_get_key(const char *name, size_t namelen, const char *type, void *data, size_t data_size, struct idmap *idmap) { - const struct cred *saved_cred; struct key *rkey; const struct user_key_payload *payload; ssize_t ret; - saved_cred = override_creds(id_resolver_cache); - rkey = nfs_idmap_request_key(name, namelen, type, idmap); - revert_creds(saved_cred); - + scoped_with_creds(id_resolver_cache) + rkey = nfs_idmap_request_key(name, namelen, type, idmap); if (IS_ERR(rkey)) { ret = PTR_ERR(rkey); goto out; @@ -424,26 +421,16 @@ static void nfs_idmap_pipe_destroy(struct dentry *dir, struct rpc_pipe_dir_object *pdo) { struct idmap *idmap = pdo->pdo_data; - struct rpc_pipe *pipe = idmap->idmap_pipe; - if (pipe->dentry) { - rpc_unlink(pipe->dentry); - pipe->dentry = NULL; - } + rpc_unlink(idmap->idmap_pipe); } static int nfs_idmap_pipe_create(struct dentry *dir, struct rpc_pipe_dir_object *pdo) { struct idmap *idmap = pdo->pdo_data; - struct rpc_pipe *pipe = idmap->idmap_pipe; - struct dentry *dentry; - dentry = rpc_mkpipe_dentry(dir, "idmap", idmap, pipe); - if (IS_ERR(dentry)) - return PTR_ERR(dentry); - pipe->dentry = dentry; - return 0; + return rpc_mkpipe_dentry(dir, "idmap", idmap, idmap->idmap_pipe); } static const struct rpc_pipe_dir_object_ops nfs_idmap_pipe_dir_object_ops = { diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index e1a886b58354..93c6ce04332b 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -103,10 +103,10 @@ static struct rpc_task *_nfs41_proc_sequence(struct nfs_client *clp, const struct cred *cred, struct nfs4_slot *slot, bool is_privileged); -static int nfs41_test_stateid(struct nfs_server *, nfs4_stateid *, - const struct cred *); -static int nfs41_free_stateid(struct nfs_server *, const nfs4_stateid *, - const struct cred *, bool); +static int nfs41_test_stateid(struct nfs_server *, const nfs4_stateid *, + const struct cred *); +static int nfs41_free_stateid(struct nfs_server *, nfs4_stateid *, + const struct cred *, bool); #endif #ifdef CONFIG_NFS_V4_SECURITY_LABEL @@ -114,6 +114,7 @@ static inline struct nfs4_label * nfs4_label_init_security(struct inode *dir, struct dentry *dentry, struct iattr *sattr, struct nfs4_label *label) { + struct lsm_context shim; int err; if (label == NULL) @@ -128,18 +129,26 @@ nfs4_label_init_security(struct inode *dir, struct dentry *dentry, label->label = NULL; err = security_dentry_init_security(dentry, sattr->ia_mode, - &dentry->d_name, NULL, - (void **)&label->label, &label->len); - if (err == 0) - return label; + &dentry->d_name, NULL, &shim); + if (err) + return NULL; - return NULL; + label->lsmid = shim.id; + label->label = shim.context; + label->len = shim.len; + return label; } static inline void nfs4_label_release_security(struct nfs4_label *label) { - if (label) - security_release_secctx(label->label, label->len); + struct lsm_context shim; + + if (label) { + shim.context = label->label; + shim.len = label->len; + shim.id = label->lsmid; + security_release_secctx(&shim); + } } static inline u32 *nfs4_bitmask(struct nfs_server *server, struct nfs4_label *label) { @@ -170,6 +179,7 @@ static int nfs4_map_errors(int err) case -NFS4ERR_RESOURCE: case -NFS4ERR_LAYOUTTRYLATER: case -NFS4ERR_RECALLCONFLICT: + case -NFS4ERR_RETURNCONFLICT: return -EREMOTEIO; case -NFS4ERR_WRONGSEC: case -NFS4ERR_WRONG_CRED: @@ -185,6 +195,9 @@ static int nfs4_map_errors(int err) return -EBUSY; case -NFS4ERR_NOT_SAME: return -ENOTSYNC; + case -ENETDOWN: + case -ENETUNREACH: + break; default: dprintk("%s could not handle NFSv4 error %d\n", __func__, -err); @@ -209,6 +222,7 @@ const u32 nfs4_fattr_bitmap[3] = { | FATTR4_WORD1_RAWDEV | FATTR4_WORD1_SPACE_USED | FATTR4_WORD1_TIME_ACCESS + | FATTR4_WORD1_TIME_CREATE | FATTR4_WORD1_TIME_METADATA | FATTR4_WORD1_TIME_MODIFY | FATTR4_WORD1_MOUNTED_ON_FILEID, @@ -230,6 +244,7 @@ static const u32 nfs4_pnfs_open_bitmap[3] = { | FATTR4_WORD1_RAWDEV | FATTR4_WORD1_SPACE_USED | FATTR4_WORD1_TIME_ACCESS + | FATTR4_WORD1_TIME_CREATE | FATTR4_WORD1_TIME_METADATA | FATTR4_WORD1_TIME_MODIFY, FATTR4_WORD2_MDSTHRESHOLD @@ -292,7 +307,7 @@ static void nfs4_bitmap_copy_adjust(__u32 *dst, const __u32 *src, unsigned long cache_validity; memcpy(dst, src, NFS4_BITMASK_SZ*sizeof(*dst)); - if (!inode || !nfs4_have_delegation(inode, FMODE_READ)) + if (!inode || !nfs_have_read_or_write_delegation(inode)) return; cache_validity = READ_ONCE(NFS_I(inode)->cache_validity) | flags; @@ -309,6 +324,21 @@ static void nfs4_bitmap_copy_adjust(__u32 *dst, const __u32 *src, dst[1] &= ~FATTR4_WORD1_MODE; if (!(cache_validity & NFS_INO_INVALID_OTHER)) dst[1] &= ~(FATTR4_WORD1_OWNER | FATTR4_WORD1_OWNER_GROUP); + + if (!(cache_validity & NFS_INO_INVALID_BTIME)) + dst[1] &= ~FATTR4_WORD1_TIME_CREATE; + + if (nfs_have_delegated_mtime(inode)) { + if (!(cache_validity & NFS_INO_INVALID_ATIME)) + dst[1] &= ~(FATTR4_WORD1_TIME_ACCESS|FATTR4_WORD1_TIME_ACCESS_SET); + if (!(cache_validity & NFS_INO_INVALID_MTIME)) + dst[1] &= ~(FATTR4_WORD1_TIME_MODIFY|FATTR4_WORD1_TIME_MODIFY_SET); + if (!(cache_validity & NFS_INO_INVALID_CTIME)) + dst[1] &= ~(FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY_SET); + } else if (nfs_have_delegated_atime(inode)) { + if (!(cache_validity & NFS_INO_INVALID_ATIME)) + dst[1] &= ~(FATTR4_WORD1_TIME_ACCESS|FATTR4_WORD1_TIME_ACCESS_SET); + } } static void nfs4_setup_readdir(u64 cookie, __be32 *verifier, struct dentry *dentry, @@ -361,7 +391,9 @@ static void nfs4_setup_readdir(u64 cookie, __be32 *verifier, struct dentry *dent *p++ = htonl(attrs); /* bitmap */ *p++ = htonl(12); /* attribute buffer length */ *p++ = htonl(NF4DIR); + spin_lock(&dentry->d_lock); p = xdr_encode_hyper(p, NFS_FILEID(d_inode(dentry->d_parent))); + spin_unlock(&dentry->d_lock); readdir->pgbase = (char *)p - (char *)start; readdir->count -= readdir->pgbase; @@ -421,6 +453,8 @@ static int nfs4_delay_killable(long *timeout) { might_sleep(); + if (unlikely(nfs_current_task_exiting())) + return -EINTR; __set_current_state(TASK_KILLABLE|TASK_FREEZABLE_UNSAFE); schedule_timeout(nfs4_update_delay(timeout)); if (!__fatal_signal_pending(current)) @@ -432,6 +466,8 @@ static int nfs4_delay_interruptible(long *timeout) { might_sleep(); + if (unlikely(nfs_current_task_exiting())) + return -EINTR; __set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE_UNSAFE); schedule_timeout(nfs4_update_delay(timeout)); if (!signal_pending(current)) @@ -558,6 +594,7 @@ static int nfs4_do_handle_exception(struct nfs_server *server, case -NFS4ERR_GRACE: case -NFS4ERR_LAYOUTTRYLATER: case -NFS4ERR_RECALLCONFLICT: + case -NFS4ERR_RETURNCONFLICT: exception->delay = 1; return 0; @@ -585,6 +622,21 @@ wait_on_recovery: return 0; } +/* + * Track the number of NFS4ERR_DELAY related retransmissions and return + * EAGAIN if the 'softerr' mount option is set, and we've exceeded the limit + * set by 'nfs_delay_retrans'. + */ +static int nfs4_exception_should_retrans(const struct nfs_server *server, + struct nfs4_exception *exception) +{ + if (server->flags & NFS_MOUNT_SOFTERR && nfs_delay_retrans >= 0) { + if (exception->retrans++ >= (unsigned short)nfs_delay_retrans) + return -EAGAIN; + } + return 0; +} + /* This is the error handling routine for processes that are allowed * to sleep. */ @@ -595,6 +647,11 @@ int nfs4_handle_exception(struct nfs_server *server, int errorcode, struct nfs4_ ret = nfs4_do_handle_exception(server, errorcode, exception); if (exception->delay) { + int ret2 = nfs4_exception_should_retrans(server, exception); + if (ret2 < 0) { + exception->retry = 0; + return ret2; + } ret = nfs4_delay(&exception->timeout, exception->interruptible); goto out_retry; @@ -621,8 +678,22 @@ nfs4_async_handle_exception(struct rpc_task *task, struct nfs_server *server, struct nfs_client *clp = server->nfs_client; int ret; + if ((task->tk_rpc_status == -ENETDOWN || + task->tk_rpc_status == -ENETUNREACH) && + task->tk_flags & RPC_TASK_NETUNREACH_FATAL) { + exception->delay = 0; + exception->recovering = 0; + exception->retry = 0; + return -EIO; + } + ret = nfs4_do_handle_exception(server, errorcode, exception); if (exception->delay) { + int ret2 = nfs4_exception_should_retrans(server, exception); + if (ret2 < 0) { + exception->retry = 0; + return ret2; + } rpc_delay(task, nfs4_update_delay(&exception->timeout)); goto out_retry; } @@ -1218,7 +1289,8 @@ nfs4_update_changeattr_locked(struct inode *inode, struct nfs_inode *nfsi = NFS_I(inode); u64 change_attr = inode_peek_iversion_raw(inode); - cache_validity |= NFS_INO_INVALID_CTIME | NFS_INO_INVALID_MTIME; + if (!nfs_have_delegated_mtime(inode)) + cache_validity |= NFS_INO_INVALID_CTIME | NFS_INO_INVALID_MTIME; if (S_ISDIR(inode->i_mode)) cache_validity |= NFS_INO_INVALID_DATA; @@ -1237,12 +1309,13 @@ nfs4_update_changeattr_locked(struct inode *inode, if (S_ISDIR(inode->i_mode)) nfs_force_lookup_revalidate(inode); - if (!NFS_PROTO(inode)->have_delegation(inode, FMODE_READ)) + if (!nfs_have_delegated_attributes(inode)) cache_validity |= NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL | NFS_INO_INVALID_SIZE | NFS_INO_INVALID_OTHER | NFS_INO_INVALID_BLOCKS | NFS_INO_INVALID_NLINK | - NFS_INO_INVALID_MODE | NFS_INO_INVALID_XATTR; + NFS_INO_INVALID_MODE | NFS_INO_INVALID_BTIME | + NFS_INO_INVALID_XATTR; nfsi->attrtimeo = NFS_MINATTRTIMEO(inode); } nfsi->attrtimeo_timestamp = jiffies; @@ -1293,8 +1366,7 @@ static fmode_t _nfs4_ctx_to_openmode(const struct nfs_open_context *ctx) } static u32 -nfs4_map_atomic_open_share(struct nfs_server *server, - fmode_t fmode, int openflags) +nfs4_fmode_to_share_access(fmode_t fmode) { u32 res = 0; @@ -1308,11 +1380,27 @@ nfs4_map_atomic_open_share(struct nfs_server *server, case FMODE_READ|FMODE_WRITE: res = NFS4_SHARE_ACCESS_BOTH; } + return res; +} + +static u32 +nfs4_map_atomic_open_share(struct nfs_server *server, + fmode_t fmode, int openflags) +{ + u32 res = nfs4_fmode_to_share_access(fmode); + if (!(server->caps & NFS_CAP_ATOMIC_OPEN_V1)) goto out; /* Want no delegation if we're using O_DIRECT */ - if (openflags & O_DIRECT) + if (openflags & O_DIRECT) { res |= NFS4_SHARE_WANT_NO_DELEG; + goto out; + } + /* res |= NFS4_SHARE_WANT_NO_PREFERENCE; */ + if (server->caps & NFS_CAP_DELEGTIME) + res |= NFS4_SHARE_WANT_DELEG_TIMESTAMPS; + if (server->caps & NFS_CAP_OPEN_XOR) + res |= NFS4_SHARE_WANT_OPEN_XOR_DELEGATION; out: return res; } @@ -1710,7 +1798,8 @@ static void nfs_set_open_stateid_locked(struct nfs4_state *state, rcu_read_unlock(); trace_nfs4_open_stateid_update_wait(state->inode, stateid, 0); - if (!fatal_signal_pending(current)) { + if (!fatal_signal_pending(current) && + !nfs_current_task_exiting()) { if (schedule_timeout(5*HZ) == 0) status = -EAGAIN; else @@ -1927,44 +2016,41 @@ out_return_state: } static void -nfs4_opendata_check_deleg(struct nfs4_opendata *data, struct nfs4_state *state) -{ - struct nfs_client *clp = NFS_SERVER(state->inode)->nfs_client; - struct nfs_delegation *delegation; - int delegation_flags = 0; - - rcu_read_lock(); - delegation = rcu_dereference(NFS_I(state->inode)->delegation); - if (delegation) - delegation_flags = delegation->flags; - rcu_read_unlock(); - switch (data->o_arg.claim) { - default: +nfs4_process_delegation(struct inode *inode, const struct cred *cred, + enum open_claim_type4 claim, + const struct nfs4_open_delegation *delegation) +{ + switch (delegation->open_delegation_type) { + case NFS4_OPEN_DELEGATE_READ: + case NFS4_OPEN_DELEGATE_WRITE: + case NFS4_OPEN_DELEGATE_READ_ATTRS_DELEG: + case NFS4_OPEN_DELEGATE_WRITE_ATTRS_DELEG: break; + default: + return; + } + switch (claim) { case NFS4_OPEN_CLAIM_DELEGATE_CUR: case NFS4_OPEN_CLAIM_DELEG_CUR_FH: pr_err_ratelimited("NFS: Broken NFSv4 server %s is " "returning a delegation for " "OPEN(CLAIM_DELEGATE_CUR)\n", - clp->cl_hostname); - return; + NFS_SERVER(inode)->nfs_client->cl_hostname); + break; + case NFS4_OPEN_CLAIM_PREVIOUS: + nfs_inode_reclaim_delegation(inode, cred, delegation->type, + &delegation->stateid, + delegation->pagemod_limit, + delegation->open_delegation_type); + break; + default: + nfs_inode_set_delegation(inode, cred, delegation->type, + &delegation->stateid, + delegation->pagemod_limit, + delegation->open_delegation_type); } - if ((delegation_flags & 1UL<<NFS_DELEGATION_NEED_RECLAIM) == 0) - nfs_inode_set_delegation(state->inode, - data->owner->so_cred, - data->o_res.delegation_type, - &data->o_res.delegation, - data->o_res.pagemod_limit); - else - nfs_inode_reclaim_delegation(state->inode, - data->owner->so_cred, - data->o_res.delegation_type, - &data->o_res.delegation, - data->o_res.pagemod_limit); - - if (data->o_res.do_recall) - nfs_async_inode_return_delegation(state->inode, - &data->o_res.delegation); + if (delegation->do_recall) + nfs_async_inode_return_delegation(inode, &delegation->stateid); } /* @@ -1988,11 +2074,16 @@ _nfs4_opendata_reclaim_to_nfs4_state(struct nfs4_opendata *data) if (ret) return ERR_PTR(ret); - if (data->o_res.delegation_type != 0) - nfs4_opendata_check_deleg(data, state); + nfs4_process_delegation(state->inode, + data->owner->so_cred, + data->o_arg.claim, + &data->o_res.delegation); - if (!update_open_stateid(state, &data->o_res.stateid, - NULL, data->o_arg.fmode)) + if (!(data->o_res.rflags & NFS4_OPEN_RESULT_NO_OPEN_STATEID)) { + if (!update_open_stateid(state, &data->o_res.stateid, + NULL, data->o_arg.fmode)) + return ERR_PTR(-EAGAIN); + } else if (!update_open_stateid(state, NULL, NULL, data->o_arg.fmode)) return ERR_PTR(-EAGAIN); refcount_inc(&state->count); @@ -2056,10 +2147,18 @@ _nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data) if (IS_ERR(state)) goto out; - if (data->o_res.delegation_type != 0) - nfs4_opendata_check_deleg(data, state); - if (!update_open_stateid(state, &data->o_res.stateid, - NULL, data->o_arg.fmode)) { + nfs4_process_delegation(state->inode, + data->owner->so_cred, + data->o_arg.claim, + &data->o_res.delegation); + + if (!(data->o_res.rflags & NFS4_OPEN_RESULT_NO_OPEN_STATEID)) { + if (!update_open_stateid(state, &data->o_res.stateid, + NULL, data->o_arg.fmode)) { + nfs4_put_open_state(state); + state = ERR_PTR(-EAGAIN); + } + } else if (!update_open_stateid(state, NULL, NULL, data->o_arg.fmode)) { nfs4_put_open_state(state); state = ERR_PTR(-EAGAIN); } @@ -2195,7 +2294,7 @@ static int _nfs4_do_open_reclaim(struct nfs_open_context *ctx, struct nfs4_state { struct nfs_delegation *delegation; struct nfs4_opendata *opendata; - fmode_t delegation_type = 0; + u32 delegation_type = NFS4_OPEN_DELEGATE_NONE; int status; opendata = nfs4_open_recoverdata_alloc(ctx, state, @@ -2204,8 +2303,20 @@ static int _nfs4_do_open_reclaim(struct nfs_open_context *ctx, struct nfs4_state return PTR_ERR(opendata); rcu_read_lock(); delegation = rcu_dereference(NFS_I(state->inode)->delegation); - if (delegation != NULL && test_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags) != 0) - delegation_type = delegation->type; + if (delegation != NULL && test_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags) != 0) { + switch(delegation->type) { + case FMODE_READ: + delegation_type = NFS4_OPEN_DELEGATE_READ; + if (test_bit(NFS_DELEGATION_DELEGTIME, &delegation->flags)) + delegation_type = NFS4_OPEN_DELEGATE_READ_ATTRS_DELEG; + break; + case FMODE_WRITE: + case FMODE_READ|FMODE_WRITE: + delegation_type = NFS4_OPEN_DELEGATE_WRITE; + if (test_bit(NFS_DELEGATION_DELEGTIME, &delegation->flags)) + delegation_type = NFS4_OPEN_DELEGATE_WRITE_ATTRS_DELEG; + } + } rcu_read_unlock(); opendata->o_arg.u.delegation_type = delegation_type; status = nfs4_open_recover(opendata, state); @@ -2526,12 +2637,14 @@ static void nfs4_open_release(void *calldata) struct nfs4_opendata *data = calldata; struct nfs4_state *state = NULL; + /* In case of error, no cleanup! */ + if (data->rpc_status != 0 || !data->rpc_done) { + nfs_release_seqid(data->o_arg.seqid); + goto out_free; + } /* If this request hasn't been cancelled, do nothing */ if (!data->cancelled) goto out_free; - /* In case of error, no cleanup! */ - if (data->rpc_status != 0 || !data->rpc_done) - goto out_free; /* In case we need an open_confirm, no cleanup! */ if (data->o_res.rflags & NFS4_OPEN_RESULT_CONFIRM) goto out_free; @@ -2703,8 +2816,12 @@ static int _nfs4_proc_open(struct nfs4_opendata *data, return status; } if (!(o_res->f_attr->valid & NFS_ATTR_FATTR)) { + struct nfs_fh *fh = &o_res->fh; + nfs4_sequence_free_slot(&o_res->seq_res); - nfs4_proc_getattr(server, &o_res->fh, o_res->f_attr, NULL); + if (o_arg->claim == NFS4_OPEN_CLAIM_FH) + fh = NFS_FH(d_inode(data->dentry)); + nfs4_proc_getattr(server, fh, o_res->f_attr, NULL); } return 0; } @@ -2794,16 +2911,14 @@ static int nfs40_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *st } static int nfs40_test_and_free_expired_stateid(struct nfs_server *server, - nfs4_stateid *stateid, - const struct cred *cred) + nfs4_stateid *stateid, const struct cred *cred) { return -NFS4ERR_BAD_STATEID; } #if defined(CONFIG_NFS_V4_1) static int nfs41_test_and_free_expired_stateid(struct nfs_server *server, - nfs4_stateid *stateid, - const struct cred *cred) + nfs4_stateid *stateid, const struct cred *cred) { int status; @@ -2812,6 +2927,7 @@ static int nfs41_test_and_free_expired_stateid(struct nfs_server *server, break; case NFS4_INVALID_STATEID_TYPE: case NFS4_SPECIAL_STATEID_TYPE: + case NFS4_FREED_STATEID_TYPE: return -NFS4ERR_BAD_STATEID; case NFS4_REVOKED_STATEID_TYPE: goto out_free; @@ -3038,10 +3154,8 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata, fmode_t acc_mode = _nfs4_ctx_to_accessmode(ctx); struct inode *dir = d_inode(opendata->dir); unsigned long dir_verifier; - unsigned int seq; int ret; - seq = raw_seqcount_begin(&sp->so_reclaim_seqcount); dir_verifier = nfs_save_change_attribute(dir); ret = _nfs4_proc_open(opendata, ctx); @@ -3064,9 +3178,7 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata, if (d_really_is_negative(dentry)) { struct dentry *alias; d_drop(dentry); - alias = d_exact_alias(dentry, state->inode); - if (!alias) - alias = d_splice_alias(igrab(state->inode), dentry); + alias = d_splice_alias(igrab(state->inode), dentry); /* d_splice_alias() can't fail here - it's a non-directory */ if (alias) { dput(ctx->dentry); @@ -3082,7 +3194,7 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata, case NFS4_OPEN_CLAIM_DELEGATE_PREV: if (!opendata->rpc_done) break; - if (opendata->o_res.delegation_type != 0) + if (opendata->o_res.delegation.type != 0) dir_verifier = nfs_save_change_attribute(dir); nfs_set_verifier(dentry, dir_verifier); } @@ -3094,11 +3206,8 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata, if (ret != 0) goto out; - if (d_inode(dentry) == state->inode) { + if (d_inode(dentry) == state->inode) nfs_inode_attach_open_context(ctx); - if (read_seqcount_retry(&sp->so_reclaim_seqcount, seq)) - nfs4_schedule_stateid_recovery(server, state); - } out: if (!opendata->cancelled) { @@ -3368,13 +3477,18 @@ static int nfs4_do_setattr(struct inode *inode, const struct cred *cred, .inode = inode, .stateid = &arg.stateid, }; - unsigned long adjust_flags = NFS_INO_INVALID_CHANGE; + unsigned long adjust_flags = NFS_INO_INVALID_CHANGE | + NFS_INO_INVALID_CTIME; int err; if (sattr->ia_valid & (ATTR_MODE | ATTR_KILL_SUID | ATTR_KILL_SGID)) adjust_flags |= NFS_INO_INVALID_MODE; if (sattr->ia_valid & (ATTR_UID | ATTR_GID)) adjust_flags |= NFS_INO_INVALID_OTHER; + if (sattr->ia_valid & ATTR_ATIME) + adjust_flags |= NFS_INO_INVALID_ATIME; + if (sattr->ia_valid & ATTR_MTIME) + adjust_flags |= NFS_INO_INVALID_MTIME; do { nfs4_bitmap_copy_adjust(bitmask, nfs4_bitmask(server, fattr->label), @@ -3486,7 +3600,7 @@ static bool nfs4_refresh_open_old_stateid(nfs4_stateid *dst, write_sequnlock(&state->seqlock); trace_nfs4_close_stateid_update_wait(state->inode, dst, 0); - if (fatal_signal_pending(current)) + if (fatal_signal_pending(current) || nfs_current_task_exiting()) status = -EINTR; else if (schedule_timeout(5*HZ) != 0) @@ -3522,6 +3636,7 @@ struct nfs4_closedata { } lr; struct nfs_fattr fattr; unsigned long timestamp; + unsigned short retrans; }; static void nfs4_free_closedata(void *data) @@ -3550,6 +3665,7 @@ static void nfs4_close_done(struct rpc_task *task, void *data) .state = state, .inode = calldata->inode, .stateid = &calldata->arg.stateid, + .retrans = calldata->retrans, }; if (!nfs4_sequence_done(task, &calldata->res.seq_res)) @@ -3597,6 +3713,7 @@ static void nfs4_close_done(struct rpc_task *task, void *data) default: task->tk_status = nfs4_async_handle_exception(task, server, task->tk_status, &exception); + calldata->retrans = exception.retrans; if (exception.retry) goto out_restart; } @@ -3674,7 +3791,7 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data) if (calldata->arg.fmode == 0 || calldata->arg.fmode == FMODE_READ) { /* Close-to-open cache consistency revalidation */ - if (!nfs4_have_delegation(inode, FMODE_READ)) { + if (!nfs4_have_delegation(inode, FMODE_READ, 0)) { nfs4_bitmask_set(calldata->arg.bitmask_store, server->cache_consistency_bitmask, inode, 0); @@ -3684,8 +3801,7 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data) } calldata->arg.share_access = - nfs4_map_atomic_open_share(NFS_SERVER(inode), - calldata->arg.fmode, 0); + nfs4_fmode_to_share_access(calldata->arg.fmode); if (calldata->res.fattr == NULL) calldata->arg.bitmask = NULL; @@ -3816,8 +3932,11 @@ nfs4_atomic_open(struct inode *dir, struct nfs_open_context *ctx, static void nfs4_close_context(struct nfs_open_context *ctx, int is_sync) { + struct dentry *dentry = ctx->dentry; if (ctx->state == NULL) return; + if (dentry->d_flags & DCACHE_NFSFS_RENAMED) + nfs4_inode_set_return_delegation_on_close(d_inode(dentry)); if (is_sync) nfs4_close_sync(ctx->state, _nfs4_ctx_to_openmode(ctx)); else @@ -3826,11 +3945,26 @@ static void nfs4_close_context(struct nfs_open_context *ctx, int is_sync) #define FATTR4_WORD1_NFS40_MASK (2*FATTR4_WORD1_MOUNTED_ON_FILEID - 1UL) #define FATTR4_WORD2_NFS41_MASK (2*FATTR4_WORD2_SUPPATTR_EXCLCREAT - 1UL) -#define FATTR4_WORD2_NFS42_MASK (2*FATTR4_WORD2_XATTR_SUPPORT - 1UL) +#define FATTR4_WORD2_NFS42_MASK (2*FATTR4_WORD2_OPEN_ARGUMENTS - 1UL) + +#define FATTR4_WORD2_NFS42_TIME_DELEG_MASK \ + (FATTR4_WORD2_TIME_DELEG_MODIFY|FATTR4_WORD2_TIME_DELEG_ACCESS) +static bool nfs4_server_delegtime_capable(struct nfs4_server_caps_res *res) +{ + u32 share_access_want = res->open_caps.oa_share_access_want[0]; + u32 attr_bitmask = res->attr_bitmask[2]; + + return (share_access_want & NFS4_SHARE_WANT_DELEG_TIMESTAMPS) && + ((attr_bitmask & FATTR4_WORD2_NFS42_TIME_DELEG_MASK) == + FATTR4_WORD2_NFS42_TIME_DELEG_MASK); +} static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle) { - u32 bitmask[3] = {}, minorversion = server->nfs_client->cl_minorversion; + u32 minorversion = server->nfs_client->cl_minorversion; + u32 bitmask[3] = { + [0] = FATTR4_WORD0_SUPPORTED_ATTRS, + }; struct nfs4_server_caps_arg args = { .fhandle = fhandle, .bitmask = bitmask, @@ -3853,9 +3987,19 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f FATTR4_WORD0_CASE_PRESERVING; if (minorversion) bitmask[2] = FATTR4_WORD2_SUPPATTR_EXCLCREAT; + if (minorversion > 1) + bitmask[2] |= FATTR4_WORD2_OPEN_ARGUMENTS; status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0); if (status == 0) { + bitmask[0] = (FATTR4_WORD0_SUPPORTED_ATTRS | + FATTR4_WORD0_FH_EXPIRE_TYPE | + FATTR4_WORD0_LINK_SUPPORT | + FATTR4_WORD0_SYMLINK_SUPPORT | + FATTR4_WORD0_ACLSUPPORT | + FATTR4_WORD0_CASE_INSENSITIVE | + FATTR4_WORD0_CASE_PRESERVING) & + res.attr_bitmask[0]; /* Sanity check the server answers */ switch (minorversion) { case 0: @@ -3864,13 +4008,20 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f break; case 1: res.attr_bitmask[2] &= FATTR4_WORD2_NFS41_MASK; + bitmask[2] = FATTR4_WORD2_SUPPATTR_EXCLCREAT & + res.attr_bitmask[2]; break; case 2: res.attr_bitmask[2] &= FATTR4_WORD2_NFS42_MASK; + bitmask[2] = (FATTR4_WORD2_SUPPATTR_EXCLCREAT | + FATTR4_WORD2_OPEN_ARGUMENTS) & + res.attr_bitmask[2]; } memcpy(server->attr_bitmask, res.attr_bitmask, sizeof(server->attr_bitmask)); - server->caps &= ~(NFS_CAP_ACLS | NFS_CAP_HARDLINKS | - NFS_CAP_SYMLINKS| NFS_CAP_SECURITY_LABEL); + server->caps &= + ~(NFS_CAP_ACLS | NFS_CAP_HARDLINKS | NFS_CAP_SYMLINKS | + NFS_CAP_SECURITY_LABEL | NFS_CAP_FS_LOCATIONS | + NFS_CAP_OPEN_XOR | NFS_CAP_DELEGTIME); server->fattr_valid = NFS_ATTR_FATTR_V4; if (res.attr_bitmask[0] & FATTR4_WORD0_ACL && res.acl_bitmask & ACL4_SUPPORT_ALLOW_ACL) @@ -3909,10 +4060,20 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f server->fattr_valid &= ~NFS_ATTR_FATTR_CTIME; if (!(res.attr_bitmask[1] & FATTR4_WORD1_TIME_MODIFY)) server->fattr_valid &= ~NFS_ATTR_FATTR_MTIME; + if (!(res.attr_bitmask[1] & FATTR4_WORD1_TIME_MODIFY)) + server->fattr_valid &= ~NFS_ATTR_FATTR_MTIME; + if (!(res.attr_bitmask[1] & FATTR4_WORD1_TIME_CREATE)) + server->fattr_valid &= ~NFS_ATTR_FATTR_BTIME; memcpy(server->attr_bitmask_nl, res.attr_bitmask, sizeof(server->attr_bitmask)); server->attr_bitmask_nl[2] &= ~FATTR4_WORD2_SECURITY_LABEL; + if (res.open_caps.oa_share_access_want[0] & + NFS4_SHARE_WANT_OPEN_XOR_DELEGATION) + server->caps |= NFS_CAP_OPEN_XOR; + if (nfs4_server_delegtime_capable(&res)) + server->caps |= NFS_CAP_DELEGTIME; + memcpy(server->cache_consistency_bitmask, res.attr_bitmask, sizeof(server->cache_consistency_bitmask)); server->cache_consistency_bitmask[0] &= FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE; server->cache_consistency_bitmask[1] &= FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY; @@ -3938,7 +4099,6 @@ int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle) }; int err; - nfs4_server_set_init_caps(server); do { err = nfs4_handle_exception(server, _nfs4_server_capabilities(server, fhandle), @@ -3997,6 +4157,23 @@ static void test_fs_location_for_trunking(struct nfs4_fs_location *location, } } +static bool _is_same_nfs4_pathname(struct nfs4_pathname *path1, + struct nfs4_pathname *path2) +{ + int i; + + if (path1->ncomponents != path2->ncomponents) + return false; + for (i = 0; i < path1->ncomponents; i++) { + if (path1->components[i].len != path2->components[i].len) + return false; + if (memcmp(path1->components[i].data, path2->components[i].data, + path1->components[i].len)) + return false; + } + return true; +} + static int _nfs4_discover_trunking(struct nfs_server *server, struct nfs_fh *fhandle) { @@ -4030,9 +4207,13 @@ static int _nfs4_discover_trunking(struct nfs_server *server, if (status) goto out_free_3; - for (i = 0; i < locations->nlocations; i++) + for (i = 0; i < locations->nlocations; i++) { + if (!_is_same_nfs4_pathname(&locations->fs_path, + &locations->locations[i].rootpath)) + continue; test_fs_location_for_trunking(&locations->locations[i], clp, server); + } out_free_3: kfree(locations->fattr); out_free_2: @@ -4065,15 +4246,18 @@ out: } static int _nfs4_lookup_root(struct nfs_server *server, struct nfs_fh *fhandle, - struct nfs_fsinfo *info) + struct nfs_fattr *fattr) { - u32 bitmask[3]; + u32 bitmask[3] = { + [0] = FATTR4_WORD0_TYPE | FATTR4_WORD0_CHANGE | + FATTR4_WORD0_SIZE | FATTR4_WORD0_FSID, + }; struct nfs4_lookup_root_arg args = { .bitmask = bitmask, }; struct nfs4_lookup_res res = { .server = server, - .fattr = info->fattr, + .fattr = fattr, .fh = fhandle, }; struct rpc_message msg = { @@ -4082,27 +4266,20 @@ static int _nfs4_lookup_root(struct nfs_server *server, struct nfs_fh *fhandle, .rpc_resp = &res, }; - bitmask[0] = nfs4_fattr_bitmap[0]; - bitmask[1] = nfs4_fattr_bitmap[1]; - /* - * Process the label in the upcoming getfattr - */ - bitmask[2] = nfs4_fattr_bitmap[2] & ~FATTR4_WORD2_SECURITY_LABEL; - - nfs_fattr_init(info->fattr); + nfs_fattr_init(fattr); return nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0); } static int nfs4_lookup_root(struct nfs_server *server, struct nfs_fh *fhandle, - struct nfs_fsinfo *info) + struct nfs_fattr *fattr) { struct nfs4_exception exception = { .interruptible = true, }; int err; do { - err = _nfs4_lookup_root(server, fhandle, info); - trace_nfs4_lookup_root(server, fhandle, info->fattr, err); + err = _nfs4_lookup_root(server, fhandle, fattr); + trace_nfs4_lookup_root(server, fhandle, fattr, err); switch (err) { case 0: case -NFS4ERR_WRONGSEC: @@ -4115,8 +4292,9 @@ out: return err; } -static int nfs4_lookup_root_sec(struct nfs_server *server, struct nfs_fh *fhandle, - struct nfs_fsinfo *info, rpc_authflavor_t flavor) +static int nfs4_lookup_root_sec(struct nfs_server *server, + struct nfs_fh *fhandle, struct nfs_fattr *fattr, + rpc_authflavor_t flavor) { struct rpc_auth_create_args auth_args = { .pseudoflavor = flavor, @@ -4126,7 +4304,7 @@ static int nfs4_lookup_root_sec(struct nfs_server *server, struct nfs_fh *fhandl auth = rpcauth_create(&auth_args, server->client); if (IS_ERR(auth)) return -EACCES; - return nfs4_lookup_root(server, fhandle, info); + return nfs4_lookup_root(server, fhandle, fattr); } /* @@ -4139,7 +4317,7 @@ static int nfs4_lookup_root_sec(struct nfs_server *server, struct nfs_fh *fhandl * negative errno value. */ static int nfs4_find_root_sec(struct nfs_server *server, struct nfs_fh *fhandle, - struct nfs_fsinfo *info) + struct nfs_fattr *fattr) { /* Per 3530bis 15.33.5 */ static const rpc_authflavor_t flav_array[] = { @@ -4155,8 +4333,9 @@ static int nfs4_find_root_sec(struct nfs_server *server, struct nfs_fh *fhandle, if (server->auth_info.flavor_len > 0) { /* try each flavor specified by user */ for (i = 0; i < server->auth_info.flavor_len; i++) { - status = nfs4_lookup_root_sec(server, fhandle, info, - server->auth_info.flavors[i]); + status = nfs4_lookup_root_sec( + server, fhandle, fattr, + server->auth_info.flavors[i]); if (status == -NFS4ERR_WRONGSEC || status == -EACCES) continue; break; @@ -4164,7 +4343,7 @@ static int nfs4_find_root_sec(struct nfs_server *server, struct nfs_fh *fhandle, } else { /* no flavors specified by user, try default list */ for (i = 0; i < ARRAY_SIZE(flav_array); i++) { - status = nfs4_lookup_root_sec(server, fhandle, info, + status = nfs4_lookup_root_sec(server, fhandle, fattr, flav_array[i]); if (status == -NFS4ERR_WRONGSEC || status == -EACCES) continue; @@ -4188,28 +4367,22 @@ static int nfs4_find_root_sec(struct nfs_server *server, struct nfs_fh *fhandle, * nfs4_proc_get_rootfh - get file handle for server's pseudoroot * @server: initialized nfs_server handle * @fhandle: we fill in the pseudo-fs root file handle - * @info: we fill in an FSINFO struct + * @fattr: we fill in a bare bones struct fattr * @auth_probe: probe the auth flavours * * Returns zero on success, or a negative errno. */ int nfs4_proc_get_rootfh(struct nfs_server *server, struct nfs_fh *fhandle, - struct nfs_fsinfo *info, - bool auth_probe) + struct nfs_fattr *fattr, bool auth_probe) { int status = 0; if (!auth_probe) - status = nfs4_lookup_root(server, fhandle, info); + status = nfs4_lookup_root(server, fhandle, fattr); if (auth_probe || status == NFS4ERR_WRONGSEC) - status = server->nfs_client->cl_mvops->find_root_sec(server, - fhandle, info); - - if (status == 0) - status = nfs4_server_capabilities(server, fhandle); - if (status == 0) - status = nfs4_do_fsinfo(server, fhandle, info); + status = server->nfs_client->cl_mvops->find_root_sec( + server, fhandle, fattr); return nfs4_map_errors(status); } @@ -4398,15 +4571,15 @@ nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr, } static int _nfs4_proc_lookup(struct rpc_clnt *clnt, struct inode *dir, - struct dentry *dentry, struct nfs_fh *fhandle, - struct nfs_fattr *fattr) + struct dentry *dentry, const struct qstr *name, + struct nfs_fh *fhandle, struct nfs_fattr *fattr) { struct nfs_server *server = NFS_SERVER(dir); int status; struct nfs4_lookup_arg args = { .bitmask = server->attr_bitmask, .dir_fh = NFS_FH(dir), - .name = &dentry->d_name, + .name = name, }; struct nfs4_lookup_res res = { .server = server, @@ -4448,17 +4621,16 @@ static void nfs_fixup_secinfo_attributes(struct nfs_fattr *fattr) } static int nfs4_proc_lookup_common(struct rpc_clnt **clnt, struct inode *dir, - struct dentry *dentry, struct nfs_fh *fhandle, - struct nfs_fattr *fattr) + struct dentry *dentry, const struct qstr *name, + struct nfs_fh *fhandle, struct nfs_fattr *fattr) { struct nfs4_exception exception = { .interruptible = true, }; struct rpc_clnt *client = *clnt; - const struct qstr *name = &dentry->d_name; int err; do { - err = _nfs4_proc_lookup(client, dir, dentry, fhandle, fattr); + err = _nfs4_proc_lookup(client, dir, dentry, name, fhandle, fattr); trace_nfs4_lookup(dir, name, err); switch (err) { case -NFS4ERR_BADNAME: @@ -4493,13 +4665,13 @@ out: return err; } -static int nfs4_proc_lookup(struct inode *dir, struct dentry *dentry, +static int nfs4_proc_lookup(struct inode *dir, struct dentry *dentry, const struct qstr *name, struct nfs_fh *fhandle, struct nfs_fattr *fattr) { int status; struct rpc_clnt *client = NFS_CLIENT(dir); - status = nfs4_proc_lookup_common(&client, dir, dentry, fhandle, fattr); + status = nfs4_proc_lookup_common(&client, dir, dentry, name, fhandle, fattr); if (client != NFS_CLIENT(dir)) { rpc_shutdown_client(client); nfs_fixup_secinfo_attributes(fattr); @@ -4514,7 +4686,8 @@ nfs4_proc_lookup_mountpoint(struct inode *dir, struct dentry *dentry, struct rpc_clnt *client = NFS_CLIENT(dir); int status; - status = nfs4_proc_lookup_common(&client, dir, dentry, fhandle, fattr); + status = nfs4_proc_lookup_common(&client, dir, dentry, &dentry->d_name, + fhandle, fattr); if (status < 0) return ERR_PTR(status); return (client == NFS_CLIENT(dir)) ? rpc_clone_client(client) : client; @@ -4542,16 +4715,19 @@ static int _nfs4_proc_lookupp(struct inode *inode, }; unsigned short task_flags = 0; - if (NFS_SERVER(inode)->flags & NFS_MOUNT_SOFTREVAL) + if (server->flags & NFS_MOUNT_SOFTREVAL) task_flags |= RPC_TASK_TIMEOUT; + if (server->caps & NFS_CAP_MOVEABLE) + task_flags |= RPC_TASK_MOVEABLE; args.bitmask = nfs4_bitmask(server, fattr->label); nfs_fattr_init(fattr); + nfs4_init_sequence(&args.seq_args, &res.seq_res, 0, 0); dprintk("NFS call lookupp ino=0x%lx\n", inode->i_ino); - status = nfs4_call_sync(clnt, server, &msg, &args.seq_args, - &res.seq_res, task_flags); + status = nfs4_do_call_sync(clnt, server, &msg, &args.seq_args, + &res.seq_res, task_flags); dprintk("NFS reply lookupp: %d\n", status); return status; } @@ -4591,7 +4767,7 @@ static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry }; int status = 0; - if (!nfs4_have_delegation(inode, FMODE_READ)) { + if (!nfs4_have_delegation(inode, FMODE_READ, 0)) { res.fattr = nfs_alloc_fattr(); if (res.fattr == NULL) return -ENOMEM; @@ -4909,8 +5085,9 @@ static int _nfs4_proc_link(struct inode *inode, struct inode *dir, const struct goto out; nfs4_inode_make_writeable(inode); - nfs4_bitmap_copy_adjust(bitmask, nfs4_bitmask(server, res.fattr->label), inode, - NFS_INO_INVALID_CHANGE); + nfs4_bitmap_copy_adjust(bitmask, nfs4_bitmask(server, res.fattr->label), + inode, + NFS_INO_INVALID_CHANGE | NFS_INO_INVALID_CTIME); status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1); if (!status) { nfs4_update_changeattr(dir, &res.cinfo, res.fattr->time_start, @@ -4988,9 +5165,6 @@ static int nfs4_do_create(struct inode *dir, struct dentry *dentry, struct nfs4_ &data->arg.seq_args, &data->res.seq_res, 1); if (status == 0) { spin_lock(&dir->i_lock); - /* Creating a directory bumps nlink in the parent */ - if (data->arg.ftype == NF4DIR) - nfs4_inc_nlink_locked(dir); nfs4_update_changeattr_locked(dir, &data->res.dir_cinfo, data->res.fattr->time_start, NFS_INO_INVALID_DATA); @@ -5000,6 +5174,31 @@ static int nfs4_do_create(struct inode *dir, struct dentry *dentry, struct nfs4_ return status; } +static struct dentry *nfs4_do_mkdir(struct inode *dir, struct dentry *dentry, + struct nfs4_createdata *data, int *statusp) +{ + struct dentry *ret; + + *statusp = nfs4_call_sync(NFS_SERVER(dir)->client, NFS_SERVER(dir), &data->msg, + &data->arg.seq_args, &data->res.seq_res, 1); + + if (*statusp) + return NULL; + + spin_lock(&dir->i_lock); + /* Creating a directory bumps nlink in the parent */ + nfs4_inc_nlink_locked(dir); + nfs4_update_changeattr_locked(dir, &data->res.dir_cinfo, + data->res.fattr->time_start, + NFS_INO_INVALID_DATA); + spin_unlock(&dir->i_lock); + ret = nfs_add_or_obtain(dentry, data->res.fh, data->res.fattr); + if (!IS_ERR(ret)) + return ret; + *statusp = PTR_ERR(ret); + return NULL; +} + static void nfs4_free_createdata(struct nfs4_createdata *data) { nfs4_label_free(data->fattr.label); @@ -5007,9 +5206,10 @@ static void nfs4_free_createdata(struct nfs4_createdata *data) } static int _nfs4_proc_symlink(struct inode *dir, struct dentry *dentry, - struct page *page, unsigned int len, struct iattr *sattr, + struct folio *folio, unsigned int len, struct iattr *sattr, struct nfs4_label *label) { + struct page *page = &folio->page; struct nfs4_createdata *data; int status = -ENAMETOOLONG; @@ -5034,7 +5234,7 @@ out: } static int nfs4_proc_symlink(struct inode *dir, struct dentry *dentry, - struct page *page, unsigned int len, struct iattr *sattr) + struct folio *folio, unsigned int len, struct iattr *sattr) { struct nfs4_exception exception = { .interruptible = true, @@ -5045,7 +5245,7 @@ static int nfs4_proc_symlink(struct inode *dir, struct dentry *dentry, label = nfs4_label_init_security(dir, dentry, sattr, &l); do { - err = _nfs4_proc_symlink(dir, dentry, page, len, sattr, label); + err = _nfs4_proc_symlink(dir, dentry, folio, len, sattr, label); trace_nfs4_symlink(dir, &dentry->d_name, err); err = nfs4_handle_exception(NFS_SERVER(dir), err, &exception); @@ -5055,32 +5255,35 @@ static int nfs4_proc_symlink(struct inode *dir, struct dentry *dentry, return err; } -static int _nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry, - struct iattr *sattr, struct nfs4_label *label) +static struct dentry *_nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry, + struct iattr *sattr, + struct nfs4_label *label, int *statusp) { struct nfs4_createdata *data; - int status = -ENOMEM; + struct dentry *ret = NULL; + *statusp = -ENOMEM; data = nfs4_alloc_createdata(dir, &dentry->d_name, sattr, NF4DIR); if (data == NULL) goto out; data->arg.label = label; - status = nfs4_do_create(dir, dentry, data); + ret = nfs4_do_mkdir(dir, dentry, data, statusp); nfs4_free_createdata(data); out: - return status; + return ret; } -static int nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry, - struct iattr *sattr) +static struct dentry *nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry, + struct iattr *sattr) { struct nfs_server *server = NFS_SERVER(dir); struct nfs4_exception exception = { .interruptible = true, }; struct nfs4_label l, *label; + struct dentry *alias; int err; label = nfs4_label_init_security(dir, dentry, sattr, &l); @@ -5088,14 +5291,16 @@ static int nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry, if (!(server->attr_bitmask[2] & FATTR4_WORD2_MODE_UMASK)) sattr->ia_mode &= ~current_umask(); do { - err = _nfs4_proc_mkdir(dir, dentry, sattr, label); + alias = _nfs4_proc_mkdir(dir, dentry, sattr, label, &err); trace_nfs4_mkdir(dir, &dentry->d_name, err); - err = nfs4_handle_exception(NFS_SERVER(dir), err, - &exception); + if (err) + alias = ERR_PTR(nfs4_handle_exception(NFS_SERVER(dir), + err, + &exception)); } while (exception.retry); nfs4_label_release_security(label); - return err; + return alias; } static int _nfs4_proc_readdir(struct nfs_readdir_arg *nr_arg, @@ -5394,9 +5599,11 @@ static int nfs4_read_done_cb(struct rpc_task *task, struct nfs_pgio_header *hdr) .inode = hdr->inode, .state = hdr->args.context->state, .stateid = &hdr->args.stateid, + .retrans = hdr->retrans, }; task->tk_status = nfs4_async_handle_exception(task, server, task->tk_status, &exception); + hdr->retrans = exception.retrans; if (exception.retry) { rpc_restart_call_prepare(task); return -EAGAIN; @@ -5429,7 +5636,7 @@ static bool nfs4_read_plus_not_supported(struct rpc_task *task, struct rpc_message *msg = &task->tk_msg; if (msg->rpc_proc == &nfs4_procedures[NFSPROC4_CLNT_READ_PLUS] && - server->caps & NFS_CAP_READ_PLUS && task->tk_status == -ENOTSUPP) { + task->tk_status == -ENOTSUPP) { server->caps &= ~NFS_CAP_READ_PLUS; msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ]; rpc_restart_call_prepare(task); @@ -5438,18 +5645,8 @@ static bool nfs4_read_plus_not_supported(struct rpc_task *task, return false; } -static inline void nfs4_read_plus_scratch_free(struct nfs_pgio_header *hdr) -{ - if (hdr->res.scratch) { - kfree(hdr->res.scratch); - hdr->res.scratch = NULL; - } -} - static int nfs4_read_done(struct rpc_task *task, struct nfs_pgio_header *hdr) { - nfs4_read_plus_scratch_free(hdr); - if (!nfs4_sequence_done(task, &hdr->res.seq_res)) return -EAGAIN; if (nfs4_read_stateid_changed(task, &hdr->args)) @@ -5469,8 +5666,7 @@ static bool nfs42_read_plus_support(struct nfs_pgio_header *hdr, /* Note: We don't use READ_PLUS with pNFS yet */ if (nfs_server_capable(hdr->inode, NFS_CAP_READ_PLUS) && !hdr->ds_clp) { msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ_PLUS]; - hdr->res.scratch = kmalloc(32, GFP_KERNEL); - return hdr->res.scratch != NULL; + return nfs_read_alloc_scratch(hdr, READ_PLUS_SCRATCH_SIZE); } return false; } @@ -5521,10 +5717,12 @@ static int nfs4_write_done_cb(struct rpc_task *task, .inode = hdr->inode, .state = hdr->args.context->state, .stateid = &hdr->args.stateid, + .retrans = hdr->retrans, }; task->tk_status = nfs4_async_handle_exception(task, NFS_SERVER(inode), task->tk_status, &exception); + hdr->retrans = exception.retrans; if (exception.retry) { rpc_restart_call_prepare(task); return -EAGAIN; @@ -5570,7 +5768,7 @@ bool nfs4_write_need_cache_consistency_data(struct nfs_pgio_header *hdr) /* Otherwise, request attributes if and only if we don't hold * a delegation */ - return nfs4_have_delegation(hdr->inode, FMODE_READ) == 0; + return nfs4_have_delegation(hdr->inode, FMODE_READ, 0) == 0; } void nfs4_bitmask_set(__u32 bitmask[], const __u32 src[], @@ -5598,6 +5796,8 @@ void nfs4_bitmask_set(__u32 bitmask[], const __u32 src[], bitmask[1] |= FATTR4_WORD1_TIME_MODIFY; if (cache_validity & NFS_INO_INVALID_BLOCKS) bitmask[1] |= FATTR4_WORD1_SPACE_USED; + if (cache_validity & NFS_INO_INVALID_BTIME) + bitmask[1] |= FATTR4_WORD1_TIME_CREATE; if (cache_validity & NFS_INO_INVALID_SIZE) bitmask[0] |= FATTR4_WORD0_SIZE; @@ -5629,7 +5829,7 @@ static void nfs4_proc_write_setup(struct nfs_pgio_header *hdr, msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_WRITE]; nfs4_init_sequence(&hdr->args.seq_args, &hdr->res.seq_res, 0, 0); - nfs4_state_protect_write(server->nfs_client, clnt, msg, hdr); + nfs4_state_protect_write(hdr->ds_clp ? hdr->ds_clp : server->nfs_client, clnt, msg, hdr); } static void nfs4_proc_commit_rpc_prepare(struct rpc_task *task, struct nfs_commit_data *data) @@ -5670,7 +5870,8 @@ static void nfs4_proc_commit_setup(struct nfs_commit_data *data, struct rpc_mess data->res.server = server; msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_COMMIT]; nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 1, 0); - nfs4_state_protect(server->nfs_client, NFS_SP4_MACH_CRED_COMMIT, clnt, msg); + nfs4_state_protect(data->ds_clp ? data->ds_clp : server->nfs_client, + NFS_SP4_MACH_CRED_COMMIT, clnt, msg); } static int _nfs4_proc_commit(struct file *dst, struct nfs_commitargs *args, @@ -5971,7 +6172,7 @@ static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf, } /* for decoding across pages */ - res.acl_scratch = alloc_page(GFP_KERNEL); + res.acl_scratch = folio_alloc(GFP_KERNEL, 0); if (!res.acl_scratch) goto out_free; @@ -6004,11 +6205,10 @@ static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf, out_ok: ret = res.acl_len; out_free: - for (i = 0; i < npages; i++) - if (pages[i]) - __free_page(pages[i]); + while (--i >= 0) + __free_page(pages[i]); if (res.acl_scratch) - __free_page(res.acl_scratch); + folio_put(res.acl_scratch); kfree(pages); return ret; } @@ -6036,6 +6236,8 @@ static ssize_t nfs4_proc_get_acl(struct inode *inode, void *buf, size_t buflen, struct nfs_server *server = NFS_SERVER(inode); int ret; + if (unlikely(NFS_FH(inode)->size == 0)) + return -ENODATA; if (!nfs4_server_supports_acls(server, type)) return -EOPNOTSUPP; ret = nfs_revalidate_inode(inode, NFS_INO_INVALID_CHANGE); @@ -6110,6 +6312,9 @@ static int nfs4_proc_set_acl(struct inode *inode, const void *buf, { struct nfs4_exception exception = { }; int err; + + if (unlikely(NFS_FH(inode)->size == 0)) + return -ENODATA; do { err = __nfs4_proc_set_acl(inode, buf, buflen, type); trace_nfs4_set_acl(inode, err); @@ -6132,7 +6337,7 @@ static int _nfs4_get_security_label(struct inode *inode, void *buf, size_t buflen) { struct nfs_server *server = NFS_SERVER(inode); - struct nfs4_label label = {0, 0, buflen, buf}; + struct nfs4_label label = {0, 0, 0, buflen, buf}; u32 bitmask[3] = { 0, 0, FATTR4_WORD2_SECURITY_LABEL }; struct nfs_fattr fattr = { @@ -6237,7 +6442,7 @@ static int nfs4_do_set_security_label(struct inode *inode, static int nfs4_set_security_label(struct inode *inode, const void *buf, size_t buflen) { - struct nfs4_label ilabel = {0, 0, buflen, (char *)buf }; + struct nfs4_label ilabel = {0, 0, 0, buflen, (char *)buf }; struct nfs_fattr *fattr; int status; @@ -6252,6 +6457,7 @@ nfs4_set_security_label(struct inode *inode, const void *buf, size_t buflen) if (status == 0) nfs_setsecurity(inode, fattr); + nfs_free_fattr(fattr); return status; } #endif /* CONFIG_NFS_V4_SECURITY_LABEL */ @@ -6530,6 +6736,7 @@ struct nfs4_delegreturndata { struct nfs_fh fh; nfs4_stateid stateid; unsigned long timestamp; + unsigned short retrans; struct { struct nfs4_layoutreturn_args arg; struct nfs4_layoutreturn_res res; @@ -6537,6 +6744,7 @@ struct nfs4_delegreturndata { u32 roc_barrier; bool roc; } lr; + struct nfs4_delegattr sattr; struct nfs_fattr fattr; int rpc_status; struct inode *inode; @@ -6549,6 +6757,7 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata) .inode = data->inode, .stateid = &data->stateid, .task_is_privileged = data->args.seq_args.sa_privileged, + .retrans = data->retrans, }; if (!nfs4_sequence_done(task, &data->res.seq_res)) @@ -6561,6 +6770,30 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata) &data->res.lr_ret) == -EAGAIN) goto out_restart; + if (data->args.sattr_args && task->tk_status != 0) { + switch(data->res.sattr_ret) { + case 0: + data->args.sattr_args = NULL; + data->res.sattr_res = false; + break; + case -NFS4ERR_ADMIN_REVOKED: + case -NFS4ERR_DELEG_REVOKED: + case -NFS4ERR_EXPIRED: + case -NFS4ERR_BAD_STATEID: + /* Let the main handler below do stateid recovery */ + break; + case -NFS4ERR_OLD_STATEID: + if (nfs4_refresh_delegation_stateid(&data->stateid, + data->inode)) + goto out_restart; + fallthrough; + default: + data->args.sattr_args = NULL; + data->res.sattr_res = false; + goto out_restart; + } + } + switch (task->tk_status) { case 0: renew_lease(data->res.server, data->timestamp); @@ -6596,6 +6829,7 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata) task->tk_status = nfs4_async_handle_exception(task, data->res.server, task->tk_status, &exception); + data->retrans = exception.retrans; if (exception.retry) goto out_restart; } @@ -6654,7 +6888,10 @@ static const struct rpc_call_ops nfs4_delegreturn_ops = { .rpc_release = nfs4_delegreturn_release, }; -static int _nfs4_proc_delegreturn(struct inode *inode, const struct cred *cred, const nfs4_stateid *stateid, int issync) +static int _nfs4_proc_delegreturn(struct inode *inode, const struct cred *cred, + const nfs4_stateid *stateid, + struct nfs_delegation *delegation, + int issync) { struct nfs4_delegreturndata *data; struct nfs_server *server = NFS_SERVER(inode); @@ -6706,12 +6943,27 @@ static int _nfs4_proc_delegreturn(struct inode *inode, const struct cred *cred, } } + if (delegation && + test_bit(NFS_DELEGATION_DELEGTIME, &delegation->flags)) { + if (delegation->type & FMODE_READ) { + data->sattr.atime = inode_get_atime(inode); + data->sattr.atime_set = true; + } + if (delegation->type & FMODE_WRITE) { + data->sattr.mtime = inode_get_mtime(inode); + data->sattr.mtime_set = true; + } + data->args.sattr_args = &data->sattr; + data->res.sattr_res = true; + } + if (!data->inode) nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 1, 1); else nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 1, 0); + task_setup_data.callback_data = data; msg.rpc_argp = &data->args; msg.rpc_resp = &data->res; @@ -6729,13 +6981,16 @@ out: return status; } -int nfs4_proc_delegreturn(struct inode *inode, const struct cred *cred, const nfs4_stateid *stateid, int issync) +int nfs4_proc_delegreturn(struct inode *inode, const struct cred *cred, + const nfs4_stateid *stateid, + struct nfs_delegation *delegation, int issync) { struct nfs_server *server = NFS_SERVER(inode); struct nfs4_exception exception = { }; int err; do { - err = _nfs4_proc_delegreturn(inode, cred, stateid, issync); + err = _nfs4_proc_delegreturn(inode, cred, stateid, + delegation, issync); trace_nfs4_delegreturn(inode, stateid, err); switch (err) { case -NFS4ERR_STALE_STATEID: @@ -6779,7 +7034,7 @@ static int _nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1); switch (status) { case 0: - request->fl_type = F_UNLCK; + request->c.flc_type = F_UNLCK; break; case -NFS4ERR_DENIED: status = 0; @@ -6851,6 +7106,7 @@ struct nfs4_unlockdata { struct file_lock fl; struct nfs_server *server; unsigned long timestamp; + unsigned short retrans; }; static struct nfs4_unlockdata *nfs4_alloc_unlockdata(struct file_lock *fl, @@ -6861,10 +7117,18 @@ static struct nfs4_unlockdata *nfs4_alloc_unlockdata(struct file_lock *fl, struct nfs4_unlockdata *p; struct nfs4_state *state = lsp->ls_state; struct inode *inode = state->inode; + struct nfs_lock_context *l_ctx; p = kzalloc(sizeof(*p), GFP_KERNEL); if (p == NULL) return NULL; + l_ctx = nfs_get_lock_context(ctx); + if (!IS_ERR(l_ctx)) { + p->l_ctx = l_ctx; + } else { + kfree(p); + return NULL; + } p->arg.fh = NFS_FH(inode); p->arg.fl = &p->fl; p->arg.seqid = seqid; @@ -6872,7 +7136,6 @@ static struct nfs4_unlockdata *nfs4_alloc_unlockdata(struct file_lock *fl, p->lsp = lsp; /* Ensure we don't close file until we're done freeing locks! */ p->ctx = get_nfs_open_context(ctx); - p->l_ctx = nfs_get_lock_context(ctx); locks_init_lock(&p->fl); locks_copy_lock(&p->fl, fl); p->server = NFS_SERVER(inode); @@ -6898,6 +7161,7 @@ static void nfs4_locku_done(struct rpc_task *task, void *data) struct nfs4_exception exception = { .inode = calldata->lsp->ls_state->inode, .stateid = &calldata->arg.stateid, + .retrans = calldata->retrans, }; if (!nfs4_sequence_done(task, &calldata->res.seq_res)) @@ -6931,6 +7195,7 @@ static void nfs4_locku_done(struct rpc_task *task, void *data) task->tk_status = nfs4_async_handle_exception(task, calldata->server, task->tk_status, &exception); + calldata->retrans = exception.retrans; if (exception.retry) rpc_restart_call_prepare(task); } @@ -6997,8 +7262,8 @@ static struct rpc_task *nfs4_do_unlck(struct file_lock *fl, /* Ensure this is an unlock - when canceling a lock, the * canceled lock is passed in, and it won't be an unlock. */ - fl->fl_type = F_UNLCK; - if (fl->fl_flags & FL_CLOSE) + fl->c.flc_type = F_UNLCK; + if (fl->c.flc_flags & FL_CLOSE) set_bit(NFS_CONTEXT_UNLOCK, &ctx->flags); data = nfs4_alloc_unlockdata(fl, ctx, lsp, seqid); @@ -7024,11 +7289,11 @@ static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock * struct rpc_task *task; struct nfs_seqid *(*alloc_seqid)(struct nfs_seqid_counter *, gfp_t); int status = 0; - unsigned char fl_flags = request->fl_flags; + unsigned char saved_flags = request->c.flc_flags; status = nfs4_set_lock_state(state, request); /* Unlock _before_ we do the RPC call */ - request->fl_flags |= FL_EXISTS; + request->c.flc_flags |= FL_EXISTS; /* Exclude nfs_delegation_claim_locks() */ mutex_lock(&sp->so_delegreturn_mutex); /* Exclude nfs4_reclaim_open_stateid() - note nesting! */ @@ -7052,14 +7317,16 @@ static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock * status = -ENOMEM; if (IS_ERR(seqid)) goto out; - task = nfs4_do_unlck(request, nfs_file_open_context(request->fl_file), lsp, seqid); + task = nfs4_do_unlck(request, + nfs_file_open_context(request->c.flc_file), + lsp, seqid); status = PTR_ERR(task); if (IS_ERR(task)) goto out; status = rpc_wait_for_completion_task(task); rpc_put_task(task); out: - request->fl_flags = fl_flags; + request->c.flc_flags = saved_flags; trace_nfs4_unlock(request, state, F_SETLK, status); return status; } @@ -7170,7 +7437,7 @@ static void nfs4_lock_done(struct rpc_task *task, void *calldata) renew_lease(NFS_SERVER(d_inode(data->ctx->dentry)), data->timestamp); if (data->arg.new_lock && !data->cancelled) { - data->fl.fl_flags &= ~(FL_SLEEP | FL_ACCESS); + data->fl.c.flc_flags &= ~(FL_SLEEP | FL_ACCESS); if (locks_lock_inode_wait(lsp->ls_state->inode, &data->fl) < 0) goto out_restart; } @@ -7181,8 +7448,15 @@ static void nfs4_lock_done(struct rpc_task *task, void *calldata) } else if (!nfs4_update_lock_stateid(lsp, &data->res.stateid)) goto out_restart; break; - case -NFS4ERR_BAD_STATEID: case -NFS4ERR_OLD_STATEID: + if (data->arg.new_lock_owner != 0 && + nfs4_refresh_open_old_stateid(&data->arg.open_stateid, + lsp->ls_state)) + goto out_restart; + if (nfs4_refresh_lock_old_stateid(&data->arg.lock_stateid, lsp)) + goto out_restart; + fallthrough; + case -NFS4ERR_BAD_STATEID: case -NFS4ERR_STALE_STATEID: case -NFS4ERR_EXPIRED: if (data->arg.new_lock_owner != 0) { @@ -7264,7 +7538,8 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *f if (nfs_server_capable(state->inode, NFS_CAP_MOVEABLE)) task_setup_data.flags |= RPC_TASK_MOVEABLE; - data = nfs4_alloc_lockdata(fl, nfs_file_open_context(fl->fl_file), + data = nfs4_alloc_lockdata(fl, + nfs_file_open_context(fl->c.flc_file), fl->fl_u.nfs4_fl.owner, GFP_KERNEL); if (data == NULL) return -ENOMEM; @@ -7370,10 +7645,10 @@ static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock { struct nfs_inode *nfsi = NFS_I(state->inode); struct nfs4_state_owner *sp = state->owner; - unsigned char fl_flags = request->fl_flags; + unsigned char flags = request->c.flc_flags; int status; - request->fl_flags |= FL_ACCESS; + request->c.flc_flags |= FL_ACCESS; status = locks_lock_inode_wait(state->inode, request); if (status < 0) goto out; @@ -7382,7 +7657,7 @@ static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock if (test_bit(NFS_DELEGATED_STATE, &state->flags)) { /* Yes: cache locks! */ /* ...but avoid races with delegation recall... */ - request->fl_flags = fl_flags & ~FL_SLEEP; + request->c.flc_flags = flags & ~FL_SLEEP; status = locks_lock_inode_wait(state->inode, request); up_read(&nfsi->rwsem); mutex_unlock(&sp->so_delegreturn_mutex); @@ -7392,7 +7667,7 @@ static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock mutex_unlock(&sp->so_delegreturn_mutex); status = _nfs4_do_setlk(state, cmd, request, NFS_LOCK_NEW); out: - request->fl_flags = fl_flags; + request->c.flc_flags = flags; return status; } @@ -7534,7 +7809,7 @@ nfs4_proc_lock(struct file *filp, int cmd, struct file_lock *request) if (!(IS_SETLK(cmd) || IS_SETLKW(cmd))) return -EINVAL; - if (request->fl_type == F_UNLCK) { + if (lock_is_unlock(request)) { if (state != NULL) return nfs4_proc_unlck(state, cmd, request); return 0; @@ -7543,7 +7818,7 @@ nfs4_proc_lock(struct file *filp, int cmd, struct file_lock *request) if (state == NULL) return -ENOLCK; - if ((request->fl_flags & FL_POSIX) && + if ((request->c.flc_flags & FL_POSIX) && !test_bit(NFS_STATE_POSIX_LOCKS, &state->flags)) return -ENOLCK; @@ -7551,7 +7826,7 @@ nfs4_proc_lock(struct file *filp, int cmd, struct file_lock *request) * Don't rely on the VFS having checked the file open mode, * since it won't do this for flock() locks. */ - switch (request->fl_type) { + switch (request->c.flc_type) { case F_RDLCK: if (!(filp->f_mode & FMODE_READ)) return -EBADF; @@ -7573,7 +7848,7 @@ static int nfs4_delete_lease(struct file *file, void **priv) return generic_setlease(file, F_UNLCK, NULL, priv); } -static int nfs4_add_lease(struct file *file, long arg, struct file_lock **lease, +static int nfs4_add_lease(struct file *file, int arg, struct file_lease **lease, void **priv) { struct inode *inode = file_inode(file); @@ -7581,17 +7856,17 @@ static int nfs4_add_lease(struct file *file, long arg, struct file_lock **lease, int ret; /* No delegation, no lease */ - if (!nfs4_have_delegation(inode, type)) + if (!nfs4_have_delegation(inode, type, 0)) return -EAGAIN; ret = generic_setlease(file, arg, lease, priv); - if (ret || nfs4_have_delegation(inode, type)) + if (ret || nfs4_have_delegation(inode, type, 0)) return ret; /* We raced with a delegation return */ nfs4_delete_lease(file, priv); return -EAGAIN; } -int nfs4_proc_setlease(struct file *file, long arg, struct file_lock **lease, +int nfs4_proc_setlease(struct file *file, int arg, struct file_lease **lease, void **priv) { switch (arg) { @@ -7615,10 +7890,10 @@ int nfs4_lock_delegation_recall(struct file_lock *fl, struct nfs4_state *state, return err; do { err = _nfs4_do_setlk(state, F_SETLK, fl, NFS_LOCK_NEW); - if (err != -NFS4ERR_DELAY) + if (err != -NFS4ERR_DELAY && err != -NFS4ERR_GRACE) break; ssleep(1); - } while (err == -NFS4ERR_DELAY); + } while (err == -NFS4ERR_DELAY || err == -NFSERR_GRACE); return nfs4_handle_delegation_recall_error(server, state, stateid, fl, err); } @@ -8792,6 +9067,8 @@ nfs4_run_exchange_id(struct nfs_client *clp, const struct cred *cred, #ifdef CONFIG_NFS_V4_1_MIGRATION calldata->args.flags |= EXCHGID4_FLAG_SUPP_MOVED_MIGR; #endif + if (test_bit(NFS_CS_PNFS, &clp->cl_flags)) + calldata->args.flags |= EXCHGID4_FLAG_USE_PNFS_DS; msg.rpc_argp = &calldata->args; msg.rpc_resp = &calldata->res; task_setup_data.callback_data = calldata; @@ -8933,23 +9210,30 @@ void nfs4_test_session_trunk(struct rpc_clnt *clnt, struct rpc_xprt *xprt, sp4_how = (adata->clp->cl_sp4_flags == 0 ? SP4_NONE : SP4_MACH_CRED); +try_again: /* Test connection for session trunking. Async exchange_id call */ task = nfs4_run_exchange_id(adata->clp, adata->cred, sp4_how, xprt); if (IS_ERR(task)) return; status = task->tk_status; - if (status == 0) + if (status == 0) { status = nfs4_detect_session_trunking(adata->clp, task->tk_msg.rpc_resp, xprt); - + trace_nfs4_trunked_exchange_id(adata->clp, + xprt->address_strings[RPC_DISPLAY_ADDR], status); + } if (status == 0) rpc_clnt_xprt_switch_add_xprt(clnt, xprt); - else if (rpc_clnt_xprt_switch_has_addr(clnt, + else if (status != -NFS4ERR_DELAY && rpc_clnt_xprt_switch_has_addr(clnt, (struct sockaddr *)&xprt->addr)) rpc_clnt_xprt_switch_remove_xprt(clnt, xprt); rpc_put_task(task); + if (status == -NFS4ERR_DELAY) { + ssleep(1); + goto try_again; + } } EXPORT_SYMBOL_GPL(nfs4_test_session_trunk); @@ -9176,7 +9460,7 @@ static int nfs4_verify_back_channel_attrs(struct nfs41_create_session_args *args goto out; if (rcvd->max_rqst_sz > sent->max_rqst_sz) return -EINVAL; - if (rcvd->max_resp_sz < sent->max_resp_sz) + if (rcvd->max_resp_sz > sent->max_resp_sz) return -EINVAL; if (rcvd->max_resp_sz_cached > sent->max_resp_sz_cached) return -EINVAL; @@ -9370,7 +9654,7 @@ static void nfs41_sequence_call_done(struct rpc_task *task, void *data) return; trace_nfs4_sequence(clp, task->tk_status); - if (task->tk_status < 0 && !task->tk_client->cl_shutdown) { + if (task->tk_status < 0 && clp->cl_cons_state >= 0) { dprintk("%s ERROR %d\n", __func__, task->tk_status); if (refcount_read(&clp->cl_count) == 1) return; @@ -9620,6 +9904,9 @@ nfs4_layoutget_handle_exception(struct rpc_task *task, nfs4_sequence_free_slot(&lgp->res.seq_res); + exception->state = NULL; + exception->stateid = NULL; + switch (nfs4err) { case 0: goto out; @@ -9655,6 +9942,7 @@ nfs4_layoutget_handle_exception(struct rpc_task *task, status = -EBUSY; break; case -NFS4ERR_RECALLCONFLICT: + case -NFS4ERR_RETURNCONFLICT: status = -ERECALLCONFLICT; break; case -NFS4ERR_DELEG_REVOKED: @@ -9715,7 +10003,8 @@ static const struct rpc_call_ops nfs4_layoutget_call_ops = { }; struct pnfs_layout_segment * -nfs4_proc_layoutget(struct nfs4_layoutget *lgp, long *timeout) +nfs4_proc_layoutget(struct nfs4_layoutget *lgp, + struct nfs4_exception *exception) { struct inode *inode = lgp->args.inode; struct nfs_server *server = NFS_SERVER(inode); @@ -9735,13 +10024,10 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, long *timeout) RPC_TASK_MOVEABLE, }; struct pnfs_layout_segment *lseg = NULL; - struct nfs4_exception exception = { - .inode = inode, - .timeout = *timeout, - }; int status = 0; nfs4_init_sequence(&lgp->args.seq_args, &lgp->res.seq_res, 0, 0); + exception->retry = 0; task = rpc_run_task(&task_setup_data); if (IS_ERR(task)) @@ -9752,11 +10038,12 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, long *timeout) goto out; if (task->tk_status < 0) { - status = nfs4_layoutget_handle_exception(task, lgp, &exception); - *timeout = exception.timeout; + exception->retry = 1; + status = nfs4_layoutget_handle_exception(task, lgp, exception); } else if (lgp->res.layoutp->len == 0) { + exception->retry = 1; status = -EAGAIN; - *timeout = nfs4_update_delay(&exception.timeout); + nfs4_update_delay(&exception->timeout); } else lseg = pnfs_layout_process(lgp); out: @@ -9794,6 +10081,11 @@ static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata) if (!nfs41_sequence_process(task, &lrp->res.seq_res)) return; + if (task->tk_rpc_status == -ETIMEDOUT) { + lrp->rpc_status = -EAGAIN; + lrp->res.lrs_present = 0; + return; + } /* * Was there an RPC level error? Assume the call succeeded, * and that we need to release the layout @@ -9813,13 +10105,25 @@ static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata) fallthrough; default: task->tk_status = 0; + lrp->res.lrs_present = 0; fallthrough; case 0: break; + case -NFS4ERR_BADSESSION: + case -NFS4ERR_DEADSESSION: + case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: + nfs4_schedule_session_recovery(server->nfs_client->cl_session, + task->tk_status); + lrp->res.lrs_present = 0; + lrp->rpc_status = -EAGAIN; + task->tk_status = 0; + break; case -NFS4ERR_DELAY: - if (nfs4_async_handle_error(task, server, NULL, NULL) != -EAGAIN) - break; - goto out_restart; + if (nfs4_async_handle_error(task, server, NULL, NULL) == + -EAGAIN) + goto out_restart; + lrp->res.lrs_present = 0; + break; } return; out_restart: @@ -9833,8 +10137,13 @@ static void nfs4_layoutreturn_release(void *calldata) struct nfs4_layoutreturn *lrp = calldata; struct pnfs_layout_hdr *lo = lrp->args.layout; - pnfs_layoutreturn_free_lsegs(lo, &lrp->args.stateid, &lrp->args.range, + if (lrp->rpc_status == 0 || !lrp->inode) + pnfs_layoutreturn_free_lsegs( + lo, &lrp->args.stateid, &lrp->args.range, lrp->res.lrs_present ? &lrp->res.stateid : NULL); + else + pnfs_layoutreturn_retry_later(lo, &lrp->args.stateid, + &lrp->args.range); nfs4_sequence_free_slot(&lrp->res.seq_res); if (lrp->ld_private.ops && lrp->ld_private.ops->free) lrp->ld_private.ops->free(&lrp->ld_private); @@ -9850,7 +10159,7 @@ static const struct rpc_call_ops nfs4_layoutreturn_call_ops = { .rpc_release = nfs4_layoutreturn_release, }; -int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool sync) +int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, unsigned int flags) { struct rpc_task *task; struct rpc_message msg = { @@ -9873,7 +10182,7 @@ int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool sync) &task_setup_data.rpc_client, &msg); lrp->inode = nfs_igrab_and_active(lrp->args.inode); - if (!sync) { + if (flags & PNFS_FL_LAYOUTRETURN_ASYNC) { if (!lrp->inode) { nfs4_layoutreturn_release(lrp); return -EAGAIN; @@ -9881,6 +10190,8 @@ int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool sync) task_setup_data.flags |= RPC_TASK_ASYNC; } if (!lrp->inode) + flags |= PNFS_FL_LAYOUTRETURN_PRIVILEGED; + if (flags & PNFS_FL_LAYOUTRETURN_PRIVILEGED) nfs4_init_sequence(&lrp->args.seq_args, &lrp->res.seq_res, 1, 1); else @@ -9889,7 +10200,7 @@ int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool sync) task = rpc_run_task(&task_setup_data); if (IS_ERR(task)) return PTR_ERR(task); - if (sync) + if (!(flags & PNFS_FL_LAYOUTRETURN_ASYNC)) status = task->tk_status; trace_nfs4_layoutreturn(lrp->args.inode, &lrp->args.stateid, status); dprintk("<-- %s status=%d\n", __func__, status); @@ -10051,10 +10362,10 @@ nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data, bool sync) * Use the state managment nfs_client cl_rpcclient, which uses krb5i (if * possible) as per RFC3530bis and RFC5661 Security Considerations sections */ -static int -_nfs41_proc_secinfo_no_name(struct nfs_server *server, struct nfs_fh *fhandle, - struct nfs_fsinfo *info, - struct nfs4_secinfo_flavors *flavors, bool use_integrity) +static int _nfs41_proc_secinfo_no_name(struct nfs_server *server, + struct nfs_fh *fhandle, + struct nfs4_secinfo_flavors *flavors, + bool use_integrity) { struct nfs41_secinfo_no_name_args args = { .style = SECINFO_STYLE_CURRENT_FH, @@ -10098,9 +10409,9 @@ _nfs41_proc_secinfo_no_name(struct nfs_server *server, struct nfs_fh *fhandle, return status; } -static int -nfs41_proc_secinfo_no_name(struct nfs_server *server, struct nfs_fh *fhandle, - struct nfs_fsinfo *info, struct nfs4_secinfo_flavors *flavors) +static int nfs41_proc_secinfo_no_name(struct nfs_server *server, + struct nfs_fh *fhandle, + struct nfs4_secinfo_flavors *flavors) { struct nfs4_exception exception = { .interruptible = true, @@ -10112,7 +10423,7 @@ nfs41_proc_secinfo_no_name(struct nfs_server *server, struct nfs_fh *fhandle, /* try to use integrity protection with machine cred */ if (_nfs4_is_integrity_protected(server->nfs_client)) - err = _nfs41_proc_secinfo_no_name(server, fhandle, info, + err = _nfs41_proc_secinfo_no_name(server, fhandle, flavors, true); /* @@ -10122,7 +10433,7 @@ nfs41_proc_secinfo_no_name(struct nfs_server *server, struct nfs_fh *fhandle, * the current filesystem's rpc_client and the user cred. */ if (err == -NFS4ERR_WRONGSEC) - err = _nfs41_proc_secinfo_no_name(server, fhandle, info, + err = _nfs41_proc_secinfo_no_name(server, fhandle, flavors, false); switch (err) { @@ -10138,9 +10449,8 @@ out: return err; } -static int -nfs41_find_root_sec(struct nfs_server *server, struct nfs_fh *fhandle, - struct nfs_fsinfo *info) +static int nfs41_find_root_sec(struct nfs_server *server, + struct nfs_fh *fhandle, struct nfs_fattr *fattr) { int err; struct page *page; @@ -10156,14 +10466,14 @@ nfs41_find_root_sec(struct nfs_server *server, struct nfs_fh *fhandle, } flavors = page_address(page); - err = nfs41_proc_secinfo_no_name(server, fhandle, info, flavors); + err = nfs41_proc_secinfo_no_name(server, fhandle, flavors); /* * Fall back on "guess and check" method if * the server doesn't support SECINFO_NO_NAME */ if (err == -NFS4ERR_WRONGSEC || err == -ENOTSUPP) { - err = nfs4_find_root_sec(server, fhandle, info); + err = nfs4_find_root_sec(server, fhandle, fattr); goto out_freepage; } if (err) @@ -10188,8 +10498,8 @@ nfs41_find_root_sec(struct nfs_server *server, struct nfs_fh *fhandle, flavor = RPC_AUTH_MAXFLAVOR; if (flavor != RPC_AUTH_MAXFLAVOR) { - err = nfs4_lookup_root_sec(server, fhandle, - info, flavor); + err = nfs4_lookup_root_sec(server, fhandle, fattr, + flavor); if (!err) break; } @@ -10207,12 +10517,12 @@ out: } static int _nfs41_test_stateid(struct nfs_server *server, - nfs4_stateid *stateid, - const struct cred *cred) + const nfs4_stateid *stateid, + const struct cred *cred) { int status; struct nfs41_test_stateid_args args = { - .stateid = stateid, + .stateid = *stateid, }; struct nfs41_test_stateid_res res; struct rpc_message msg = { @@ -10268,8 +10578,8 @@ static void nfs4_handle_delay_or_session_error(struct nfs_server *server, * failed or the state ID is not currently valid. */ static int nfs41_test_stateid(struct nfs_server *server, - nfs4_stateid *stateid, - const struct cred *cred) + const nfs4_stateid *stateid, + const struct cred *cred) { struct nfs4_exception exception = { .interruptible = true, @@ -10336,7 +10646,7 @@ static const struct rpc_call_ops nfs41_free_stateid_ops = { * Note: this function is always asynchronous. */ static int nfs41_free_stateid(struct nfs_server *server, - const nfs4_stateid *stateid, + nfs4_stateid *stateid, const struct cred *cred, bool privileged) { @@ -10376,6 +10686,7 @@ static int nfs41_free_stateid(struct nfs_server *server, if (IS_ERR(task)) return PTR_ERR(task); rpc_put_task(task); + stateid->type = NFS4_FREED_STATEID_TYPE; return 0; } @@ -10391,6 +10702,8 @@ nfs41_free_lock_state(struct nfs_server *server, struct nfs4_lock_state *lsp) static bool nfs41_match_stateid(const nfs4_stateid *s1, const nfs4_stateid *s2) { + trace_nfs41_match_stateid(s1, s2); + if (s1->type != s2->type) return false; @@ -10408,6 +10721,8 @@ static bool nfs41_match_stateid(const nfs4_stateid *s1, static bool nfs4_match_stateid(const nfs4_stateid *s1, const nfs4_stateid *s2) { + trace_nfs4_match_stateid(s1, s2); + return nfs4_stateid_match(s1, s2); } @@ -10542,12 +10857,14 @@ static const struct nfs4_minor_version_ops nfs_v4_2_minor_ops = { | NFS_CAP_OFFLOAD_CANCEL | NFS_CAP_COPY_NOTIFY | NFS_CAP_DEALLOCATE + | NFS_CAP_ZERO_RANGE | NFS_CAP_SEEK | NFS_CAP_LAYOUTSTATS | NFS_CAP_CLONE | NFS_CAP_LAYOUTERROR | NFS_CAP_READ_PLUS - | NFS_CAP_MOVEABLE, + | NFS_CAP_MOVEABLE + | NFS_CAP_OFFLOAD_STATUS, .init_client = nfs41_init_client, .shutdown_client = nfs41_shutdown_client, .match_stateid = nfs41_match_stateid, @@ -10576,30 +10893,44 @@ const struct nfs4_minor_version_ops *nfs_v4_minor_ops[] = { static ssize_t nfs4_listxattr(struct dentry *dentry, char *list, size_t size) { - ssize_t error, error2, error3; + ssize_t error, error2, error3, error4 = 0; + size_t left = size; - error = generic_listxattr(dentry, list, size); + error = generic_listxattr(dentry, list, left); if (error < 0) return error; if (list) { list += error; - size -= error; + left -= error; } - error2 = nfs4_listxattr_nfs4_label(d_inode(dentry), list, size); + error2 = nfs4_listxattr_nfs4_label(d_inode(dentry), list, left); if (error2 < 0) return error2; if (list) { list += error2; - size -= error2; + left -= error2; } - error3 = nfs4_listxattr_nfs4_user(d_inode(dentry), list, size); + error3 = nfs4_listxattr_nfs4_user(d_inode(dentry), list, left); if (error3 < 0) return error3; + if (list) { + list += error3; + left -= error3; + } - return error + error2 + error3; + if (!nfs_server_capable(d_inode(dentry), NFS_CAP_SECURITY_LABEL)) { + error4 = security_inode_listsecurity(d_inode(dentry), list, left); + if (error4 < 0) + return error4; + } + + error += error2 + error3 + error4; + if (size && error > size) + return -ERANGE; + return error; } static void nfs4_enable_swap(struct inode *inode) @@ -10619,7 +10950,9 @@ static void nfs4_disable_swap(struct inode *inode) */ struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; - nfs4_schedule_state_manager(clp); + set_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state); + clear_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state); + wake_up_var(&clp->cl_state); } static const struct inode_operations nfs4_dir_inode_operations = { @@ -10646,6 +10979,26 @@ static const struct inode_operations nfs4_file_inode_operations = { .listxattr = nfs4_listxattr, }; +static struct nfs_server *nfs4_clone_server(struct nfs_server *source, + struct nfs_fh *fh, struct nfs_fattr *fattr, + rpc_authflavor_t flavor) +{ + struct nfs_server *server; + int error; + + server = nfs_clone_server(source, fh, fattr, flavor); + if (IS_ERR(server)) + return server; + + error = nfs4_delegation_hash_alloc(server); + if (error) { + nfs_free_server(server); + return ERR_PTR(error); + } + + return server; +} + const struct nfs_rpc_ops nfs_v4_clientops = { .version = 4, /* protocol version */ .dentry_ops = &nfs4_dentry_operations, @@ -10693,11 +11046,12 @@ const struct nfs_rpc_ops nfs_v4_clientops = { .close_context = nfs4_close_context, .open_context = nfs4_atomic_open, .have_delegation = nfs4_have_delegation, + .return_delegation = nfs4_inode_return_delegation, .alloc_client = nfs4_alloc_client, .init_client = nfs4_init_client, .free_client = nfs4_free_client, .create_server = nfs4_create_server, - .clone_server = nfs_clone_server, + .clone_server = nfs4_clone_server, .discover_trunking = nfs4_discover_trunking, .enable_swap = nfs4_enable_swap, .disable_swap = nfs4_disable_swap, @@ -10734,7 +11088,7 @@ static const struct xattr_handler nfs4_xattr_nfs4_user_handler = { }; #endif -const struct xattr_handler *nfs4_xattr_handlers[] = { +const struct xattr_handler * const nfs4_xattr_handlers[] = { &nfs4_xattr_nfs4_acl_handler, #if defined(CONFIG_NFS_V4_1) &nfs4_xattr_nfs4_dacl_handler, diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c index db3811af0796..18ae614e5a6c 100644 --- a/fs/nfs/nfs4renewd.c +++ b/fs/nfs/nfs4renewd.c @@ -122,7 +122,7 @@ nfs4_schedule_state_renewal(struct nfs_client *clp) timeout = 5 * HZ; dprintk("%s: requeueing work. Lease period = %ld\n", __func__, (timeout + HZ - 1) / HZ); - mod_delayed_work(system_wq, &clp->cl_renewd, timeout); + mod_delayed_work(system_percpu_wq, &clp->cl_renewd, timeout); set_bit(NFS_CS_RENEWD, &clp->cl_res_state); spin_unlock(&clp->cl_lock); } diff --git a/fs/nfs/nfs4session.h b/fs/nfs/nfs4session.h index 351616c61df5..f9c291e2165c 100644 --- a/fs/nfs/nfs4session.h +++ b/fs/nfs/nfs4session.h @@ -148,16 +148,12 @@ static inline void nfs4_copy_sessionid(struct nfs4_sessionid *dst, memcpy(dst->data, src->data, NFS4_MAX_SESSIONID_LEN); } -#ifdef CONFIG_CRC32 /* * nfs_session_id_hash - calculate the crc32 hash for the session id * @session - pointer to session */ #define nfs_session_id_hash(sess_id) \ (~crc32_le(0xFFFFFFFF, &(sess_id)->data[0], sizeof((sess_id)->data))) -#else -#define nfs_session_id_hash(session) (0) -#endif #else /* defined(CONFIG_NFS_V4_1) */ static inline int nfs4_init_session(struct nfs_client *clp) diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index e079987af4a3..01179f7de322 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -501,11 +501,7 @@ nfs4_alloc_state_owner(struct nfs_server *server, sp = kzalloc(sizeof(*sp), gfp_flags); if (!sp) return NULL; - sp->so_seqid.owner_id = ida_alloc(&server->openowner_id, gfp_flags); - if (sp->so_seqid.owner_id < 0) { - kfree(sp); - return NULL; - } + sp->so_seqid.owner_id = atomic64_inc_return(&server->owner_ctr); sp->so_server = server; sp->so_cred = get_cred(cred); spin_lock_init(&sp->so_lock); @@ -513,7 +509,6 @@ nfs4_alloc_state_owner(struct nfs_server *server, nfs4_init_seqid_counter(&sp->so_seqid); atomic_set(&sp->so_count, 1); INIT_LIST_HEAD(&sp->so_lru); - seqcount_spinlock_init(&sp->so_reclaim_seqcount, &sp->so_lock); mutex_init(&sp->so_delegreturn_mutex); return sp; } @@ -537,7 +532,6 @@ static void nfs4_free_state_owner(struct nfs4_state_owner *sp) { nfs4_destroy_seqid_counter(&sp->so_seqid); put_cred(sp->so_cred); - ida_free(&sp->so_server->openowner_id, sp->so_seqid.owner_id); kfree(sp); } @@ -847,15 +841,15 @@ void nfs4_close_sync(struct nfs4_state *state, fmode_t fmode) */ static struct nfs4_lock_state * __nfs4_find_lock_state(struct nfs4_state *state, - fl_owner_t fl_owner, fl_owner_t fl_owner2) + fl_owner_t owner, fl_owner_t owner2) { struct nfs4_lock_state *pos, *ret = NULL; list_for_each_entry(pos, &state->lock_states, ls_locks) { - if (pos->ls_owner == fl_owner) { + if (pos->ls_owner == owner) { ret = pos; break; } - if (pos->ls_owner == fl_owner2) + if (pos->ls_owner == owner2) ret = pos; } if (ret) @@ -868,7 +862,7 @@ __nfs4_find_lock_state(struct nfs4_state *state, * exists, return an uninitialized one. * */ -static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, fl_owner_t fl_owner) +static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, fl_owner_t owner) { struct nfs4_lock_state *lsp; struct nfs_server *server = state->owner->so_server; @@ -879,20 +873,14 @@ static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, f nfs4_init_seqid_counter(&lsp->ls_seqid); refcount_set(&lsp->ls_count, 1); lsp->ls_state = state; - lsp->ls_owner = fl_owner; - lsp->ls_seqid.owner_id = ida_alloc(&server->lockowner_id, GFP_KERNEL_ACCOUNT); - if (lsp->ls_seqid.owner_id < 0) - goto out_free; + lsp->ls_owner = owner; + lsp->ls_seqid.owner_id = atomic64_inc_return(&server->owner_ctr); INIT_LIST_HEAD(&lsp->ls_locks); return lsp; -out_free: - kfree(lsp); - return NULL; } void nfs4_free_lock_state(struct nfs_server *server, struct nfs4_lock_state *lsp) { - ida_free(&server->lockowner_id, lsp->ls_seqid.owner_id); nfs4_destroy_seqid_counter(&lsp->ls_seqid); kfree(lsp); } @@ -980,7 +968,7 @@ int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl) if (fl->fl_ops != NULL) return 0; - lsp = nfs4_get_lock_state(state, fl->fl_owner); + lsp = nfs4_get_lock_state(state, fl->c.flc_owner); if (lsp == NULL) return -ENOMEM; fl->fl_u.nfs4_fl.owner = lsp; @@ -993,7 +981,7 @@ static int nfs4_copy_lock_stateid(nfs4_stateid *dst, const struct nfs_lock_context *l_ctx) { struct nfs4_lock_state *lsp; - fl_owner_t fl_owner, fl_flock_owner; + fl_owner_t owner, fl_flock_owner; int ret = -ENOENT; if (l_ctx == NULL) @@ -1002,11 +990,11 @@ static int nfs4_copy_lock_stateid(nfs4_stateid *dst, if (test_bit(LK_STATE_IN_USE, &state->flags) == 0) goto out; - fl_owner = l_ctx->lockowner; + owner = l_ctx->lockowner; fl_flock_owner = l_ctx->open_context->flock_owner; spin_lock(&state->state_lock); - lsp = __nfs4_find_lock_state(state, fl_owner, fl_flock_owner); + lsp = __nfs4_find_lock_state(state, owner, fl_flock_owner); if (lsp && test_bit(NFS_LOCK_LOST, &lsp->ls_flags)) ret = -EIO; else if (lsp != NULL && test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags) != 0) { @@ -1095,14 +1083,12 @@ void nfs_release_seqid(struct nfs_seqid *seqid) return; sequence = seqid->sequence; spin_lock(&sequence->lock); - list_del_init(&seqid->list); - if (!list_empty(&sequence->list)) { - struct nfs_seqid *next; - - next = list_first_entry(&sequence->list, - struct nfs_seqid, list); + if (list_is_first(&seqid->list, &sequence->list) && + !list_is_singular(&sequence->list)) { + struct nfs_seqid *next = list_next_entry(seqid, list); rpc_wake_up_queued_task(&sequence->wait, next->task); } + list_del_init(&seqid->list); spin_unlock(&sequence->lock); } @@ -1209,16 +1195,26 @@ void nfs4_schedule_state_manager(struct nfs_client *clp) { struct task_struct *task; char buf[INET6_ADDRSTRLEN + sizeof("-manager") + 1]; + struct rpc_clnt *clnt = clp->cl_rpcclient; + bool swapon = false; - if (clp->cl_rpcclient->cl_shutdown) + if (clp->cl_cons_state < 0) return; set_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state); - if (test_and_set_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state) != 0) { - wake_up_var(&clp->cl_state); - return; + + if (atomic_read(&clnt->cl_swapper)) { + swapon = !test_and_set_bit(NFS4CLNT_MANAGER_AVAILABLE, + &clp->cl_state); + if (!swapon) { + wake_up_var(&clp->cl_state); + return; + } } - set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state); + + if (test_and_set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) != 0) + return; + __module_get(THIS_MODULE); refcount_inc(&clp->cl_count); @@ -1235,8 +1231,9 @@ void nfs4_schedule_state_manager(struct nfs_client *clp) __func__, PTR_ERR(task)); if (!nfs_client_init_is_complete(clp)) nfs_mark_client_ready(clp, PTR_ERR(task)); + if (swapon) + clear_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state); nfs4_clear_state_manager_bit(clp); - clear_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state); nfs_put_client(clp); module_put(THIS_MODULE); } @@ -1406,7 +1403,7 @@ int nfs4_schedule_stateid_recovery(const struct nfs_server *server, struct nfs4_ dprintk("%s: scheduling stateid recovery for server %s\n", __func__, clp->cl_hostname); nfs4_schedule_state_manager(clp); - return 0; + return clp->cl_cons_state < 0 ? clp->cl_cons_state : 0; } EXPORT_SYMBOL_GPL(nfs4_schedule_stateid_recovery); @@ -1518,8 +1515,8 @@ static int nfs4_reclaim_locks(struct nfs4_state *state, const struct nfs4_state_ down_write(&nfsi->rwsem); spin_lock(&flctx->flc_lock); restart: - list_for_each_entry(fl, list, fl_list) { - if (nfs_file_open_context(fl->fl_file)->state != state) + for_each_file_lock(fl, list) { + if (nfs_file_open_context(fl->c.flc_file)->state != state) continue; spin_unlock(&flctx->flc_lock); status = ops->recover_lock(state, fl); @@ -1586,7 +1583,7 @@ static void nfs42_complete_copies(struct nfs4_state_owner *sp, struct nfs4_state complete(©->completion); } } - list_for_each_entry(copy, &sp->so_server->ss_copies, src_copies) { + list_for_each_entry(copy, &sp->so_server->ss_src_copies, src_copies) { if ((test_bit(NFS_CLNT_SRC_SSC_COPY_STATE, &state->flags) && !nfs4_stateid_match_other(&state->stateid, ©->parent_src_state->stateid))) @@ -1656,7 +1653,6 @@ static int nfs4_reclaim_open_state(struct nfs4_state_owner *sp, * server that doesn't support a grace period. */ spin_lock(&sp->so_lock); - raw_write_seqcount_begin(&sp->so_reclaim_seqcount); restart: list_for_each_entry(state, &sp->so_states, open_states) { if (!test_and_clear_bit(ops->state_flag_bit, &state->flags)) @@ -1724,7 +1720,6 @@ restart: spin_lock(&sp->so_lock); goto restart; } - raw_write_seqcount_end(&sp->so_reclaim_seqcount); spin_unlock(&sp->so_lock); #ifdef CONFIG_NFS_V4_2 if (found_ssc_copy_state) @@ -1734,7 +1729,6 @@ restart: out_err: nfs4_put_open_state(state); spin_lock(&sp->so_lock); - raw_write_seqcount_end(&sp->so_reclaim_seqcount); spin_unlock(&sp->so_lock); return status; } @@ -1856,6 +1850,7 @@ static void nfs4_state_end_reclaim_reboot(struct nfs_client *clp) if (!nfs4_state_clear_reclaim_reboot(clp)) return; + pnfs_destroy_all_layouts(clp); ops = clp->cl_mvops->reboot_recovery_ops; cred = nfs4_get_clid_cred(clp); err = nfs4_reclaim_complete(clp, ops, cred); @@ -1917,9 +1912,12 @@ static int nfs4_do_reclaim(struct nfs_client *clp, const struct nfs4_state_recov struct nfs_server *server; struct rb_node *pos; LIST_HEAD(freeme); - int status = 0; int lost_locks = 0; + int status; + status = nfs4_begin_drain_session(clp); + if (status < 0) + return status; restart: rcu_read_lock(); list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { @@ -1946,6 +1944,7 @@ restart: set_bit(ops->owner_flag_bit, &sp->so_flags); nfs4_put_state_owner(sp); status = nfs4_recovery_handle_error(clp, status); + nfs4_free_state_owners(&freeme); return (status != 0) ? status : -EAGAIN; } @@ -1956,6 +1955,7 @@ restart: } rcu_read_unlock(); nfs4_free_state_owners(&freeme); + nfs_local_probe_async(clp); if (lost_locks) pr_warn("NFS: %s: lost %d locks\n", clp->cl_hostname, lost_locks); @@ -2012,6 +2012,12 @@ static int nfs4_handle_reclaim_lease_error(struct nfs_client *clp, int status) nfs_mark_client_ready(clp, -EPERM); clear_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state); return -EPERM; + case -ETIMEDOUT: + if (clp->cl_cons_state == NFS_CS_SESSION_INITING) { + nfs_mark_client_ready(clp, -EIO); + return -EIO; + } + fallthrough; case -EACCES: case -NFS4ERR_DELAY: case -EAGAIN: @@ -2058,7 +2064,6 @@ static int nfs4_establish_lease(struct nfs_client *clp) put_cred(cred); if (status != 0) return status; - pnfs_destroy_all_layouts(clp); return 0; } @@ -2106,6 +2111,7 @@ static int nfs4_try_migration(struct nfs_server *server, const struct cred *cred { struct nfs_client *clp = server->nfs_client; struct nfs4_fs_locations *locations = NULL; + struct nfs_fattr *fattr; struct inode *inode; struct page *page; int status, result; @@ -2115,19 +2121,16 @@ static int nfs4_try_migration(struct nfs_server *server, const struct cred *cred (unsigned long long)server->fsid.minor, clp->cl_hostname); - result = 0; page = alloc_page(GFP_KERNEL); locations = kmalloc(sizeof(struct nfs4_fs_locations), GFP_KERNEL); - if (page == NULL || locations == NULL) { - dprintk("<-- %s: no memory\n", __func__); - goto out; - } - locations->fattr = nfs_alloc_fattr(); - if (locations->fattr == NULL) { + fattr = nfs_alloc_fattr(); + if (page == NULL || locations == NULL || fattr == NULL) { dprintk("<-- %s: no memory\n", __func__); + result = 0; goto out; } + locations->fattr = fattr; inode = d_inode(server->super->s_root); result = nfs4_proc_get_locations(server, NFS_FH(inode), locations, page, cred); @@ -2672,6 +2675,8 @@ static void nfs4_state_manager(struct nfs_client *clp) section = "reclaim reboot"; status = nfs4_do_reclaim(clp, clp->cl_mvops->reboot_recovery_ops); + if (status == 0) + status = pnfs_layout_handle_reboot(clp); if (status == -EAGAIN) continue; if (status < 0) @@ -2683,6 +2688,9 @@ static void nfs4_state_manager(struct nfs_client *clp) /* Detect expired delegations... */ if (test_and_clear_bit(NFS4CLNT_DELEGATION_EXPIRED, &clp->cl_state)) { section = "detect expired delegations"; + status = nfs4_begin_drain_session(clp); + if (status < 0) + goto out_error; nfs_reap_expired_delegations(clp); continue; } @@ -2703,6 +2711,13 @@ static void nfs4_state_manager(struct nfs_client *clp) nfs4_end_drain_session(clp); nfs4_clear_state_manager_bit(clp); + if (test_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state) && + !test_and_set_bit(NFS4CLNT_MANAGER_RUNNING, + &clp->cl_state)) { + memflags = memalloc_nofs_save(); + continue; + } + if (!test_and_set_bit(NFS4CLNT_RECALL_RUNNING, &clp->cl_state)) { if (test_and_clear_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state)) { nfs_client_return_marked_delegations(clp); @@ -2724,7 +2739,18 @@ out_error: pr_warn_ratelimited("NFS: state manager%s%s failed on NFSv4 server %s" " with error %d\n", section_sep, section, clp->cl_hostname, -status); - ssleep(1); + switch (status) { + case -ENETDOWN: + case -ENETUNREACH: + nfs_mark_client_ready(clp, -EIO); + break; + case -EINVAL: + nfs_mark_client_ready(clp, status); + break; + default: + ssleep(1); + break; + } out_drain: memalloc_nofs_restore(memflags); nfs4_end_drain_session(clp); @@ -2741,22 +2767,25 @@ static int nfs4_run_state_manager(void *ptr) allow_signal(SIGKILL); again: - set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state); nfs4_state_manager(clp); - if (atomic_read(&cl->cl_swapper)) { + + if (test_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state) && + !test_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state)) { wait_var_event_interruptible(&clp->cl_state, test_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state)); - if (atomic_read(&cl->cl_swapper) && - test_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state)) + if (!atomic_read(&cl->cl_swapper)) + clear_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state); + if (refcount_read(&clp->cl_count) > 1 && !signalled() && + !test_and_set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state)) goto again; /* Either no longer a swapper, or were signalled */ + clear_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state); } - clear_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state); if (refcount_read(&clp->cl_count) > 1 && !signalled() && test_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state) && - !test_and_set_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state)) + !test_and_set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state)) goto again; nfs_put_client(clp); diff --git a/fs/nfs/nfs4super.c b/fs/nfs/nfs4super.c index d09bcfd7db89..5ec9c83f1ef0 100644 --- a/fs/nfs/nfs4super.c +++ b/fs/nfs/nfs4super.c @@ -145,18 +145,13 @@ static int do_nfs4_mount(struct nfs_server *server, const char *export_path) { struct nfs_fs_context *root_ctx; + struct nfs_fs_context *ctx; struct fs_context *root_fc; struct vfsmount *root_mnt; struct dentry *dentry; - size_t len; + char *source; int ret; - struct fs_parameter param = { - .key = "source", - .type = fs_value_is_string, - .dirfd = -1, - }; - if (IS_ERR(server)) return PTR_ERR(server); @@ -168,25 +163,32 @@ static int do_nfs4_mount(struct nfs_server *server, kfree(root_fc->source); root_fc->source = NULL; + ctx = nfs_fc2context(fc); root_ctx = nfs_fc2context(root_fc); root_ctx->internal = true; root_ctx->server = server; - /* We leave export_path unset as it's not used to find the root. */ - len = strlen(hostname) + 5; - param.string = kmalloc(len, GFP_KERNEL); - if (param.string == NULL) { - put_fs_context(root_fc); - return -ENOMEM; + if (ctx->fscache_uniq) { + ret = vfs_parse_fs_string(root_fc, "fsc", ctx->fscache_uniq); + if (ret < 0) { + put_fs_context(root_fc); + return ret; + } } + /* We leave export_path unset as it's not used to find the root. */ /* Does hostname needs to be enclosed in brackets? */ if (strchr(hostname, ':')) - param.size = snprintf(param.string, len, "[%s]:/", hostname); + source = kasprintf(GFP_KERNEL, "[%s]:/", hostname); else - param.size = snprintf(param.string, len, "%s:/", hostname); - ret = vfs_parse_fs_param(root_fc, ¶m); - kfree(param.string); + source = kasprintf(GFP_KERNEL, "%s:/", hostname); + + if (!source) { + put_fs_context(root_fc); + return -ENOMEM; + } + ret = vfs_parse_fs_string(root_fc, "source", source); + kfree(source); if (ret < 0) { put_fs_context(root_fc); return ret; @@ -308,6 +310,7 @@ static void __exit exit_nfs_v4(void) nfs_dns_resolver_destroy(); } +MODULE_DESCRIPTION("NFSv4 client support"); MODULE_LICENSE("GPL"); module_init(init_nfs_v4); diff --git a/fs/nfs/nfs4sysctl.c b/fs/nfs/nfs4sysctl.c index e776200e9a11..d1a92d8f8ba4 100644 --- a/fs/nfs/nfs4sysctl.c +++ b/fs/nfs/nfs4sysctl.c @@ -17,7 +17,7 @@ static const int nfs_set_port_min; static const int nfs_set_port_max = 65535; static struct ctl_table_header *nfs4_callback_sysctl_table; -static struct ctl_table nfs4_cb_sysctls[] = { +static const struct ctl_table nfs4_cb_sysctls[] = { { .procname = "nfs_callback_tcpport", .data = &nfs_callback_set_tcpport, @@ -34,7 +34,6 @@ static struct ctl_table nfs4_cb_sysctls[] = { .mode = 0644, .proc_handler = proc_dointvec, }, - { } }; int nfs4_register_sysctl(void) diff --git a/fs/nfs/nfs4trace.c b/fs/nfs/nfs4trace.c index d9ac556bebcf..987c92d6364b 100644 --- a/fs/nfs/nfs4trace.c +++ b/fs/nfs/nfs4trace.c @@ -2,6 +2,8 @@ /* * Copyright (c) 2013 Trond Myklebust <Trond.Myklebust@netapp.com> */ +#include <uapi/linux/pr.h> +#include <linux/blkdev.h> #include <linux/nfs_fs.h> #include "nfs4_fs.h" #include "internal.h" @@ -24,8 +26,17 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(pnfs_mds_fallback_read_done); EXPORT_TRACEPOINT_SYMBOL_GPL(pnfs_mds_fallback_write_done); EXPORT_TRACEPOINT_SYMBOL_GPL(pnfs_mds_fallback_read_pagelist); EXPORT_TRACEPOINT_SYMBOL_GPL(pnfs_mds_fallback_write_pagelist); +EXPORT_TRACEPOINT_SYMBOL_GPL(pnfs_ds_connect); EXPORT_TRACEPOINT_SYMBOL_GPL(ff_layout_read_error); EXPORT_TRACEPOINT_SYMBOL_GPL(ff_layout_write_error); EXPORT_TRACEPOINT_SYMBOL_GPL(ff_layout_commit_error); + +EXPORT_TRACEPOINT_SYMBOL_GPL(bl_ext_tree_prepare_commit); +EXPORT_TRACEPOINT_SYMBOL_GPL(bl_pr_key_reg); +EXPORT_TRACEPOINT_SYMBOL_GPL(bl_pr_key_reg_err); +EXPORT_TRACEPOINT_SYMBOL_GPL(bl_pr_key_unreg); +EXPORT_TRACEPOINT_SYMBOL_GPL(bl_pr_key_unreg_err); + +EXPORT_TRACEPOINT_SYMBOL_GPL(fl_getdevinfo); #endif diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h index d27919d7241d..9776d220cec3 100644 --- a/fs/nfs/nfs4trace.h +++ b/fs/nfs/nfs4trace.h @@ -14,6 +14,8 @@ #include <trace/misc/fs.h> #include <trace/misc/nfs.h> +#include "delegation.h" + #define show_nfs_fattr_flags(valid) \ __print_flags((unsigned long)valid, "|", \ { NFS_ATTR_FATTR_TYPE, "TYPE" }, \ @@ -30,7 +32,8 @@ { NFS_ATTR_FATTR_CTIME, "CTIME" }, \ { NFS_ATTR_FATTR_CHANGE, "CHANGE" }, \ { NFS_ATTR_FATTR_OWNER_NAME, "OWNER_NAME" }, \ - { NFS_ATTR_FATTR_GROUP_NAME, "GROUP_NAME" }) + { NFS_ATTR_FATTR_GROUP_NAME, "GROUP_NAME" }, \ + { NFS_ATTR_FATTR_BTIME, "BTIME" }) DECLARE_EVENT_CLASS(nfs4_clientid_event, TP_PROTO( @@ -47,7 +50,7 @@ DECLARE_EVENT_CLASS(nfs4_clientid_event, TP_fast_assign( __entry->error = error < 0 ? -error : 0; - __assign_str(dstaddr, clp->cl_hostname); + __assign_str(dstaddr); ), TP_printk( @@ -77,6 +80,36 @@ DEFINE_NFS4_CLIENTID_EVENT(nfs4_bind_conn_to_session); DEFINE_NFS4_CLIENTID_EVENT(nfs4_sequence); DEFINE_NFS4_CLIENTID_EVENT(nfs4_reclaim_complete); +TRACE_EVENT(nfs4_trunked_exchange_id, + TP_PROTO( + const struct nfs_client *clp, + const char *addr, + int error + ), + + TP_ARGS(clp, addr, error), + + TP_STRUCT__entry( + __string(main_addr, clp->cl_hostname) + __string(trunk_addr, addr) + __field(unsigned long, error) + ), + + TP_fast_assign( + __entry->error = error < 0 ? -error : 0; + __assign_str(main_addr); + __assign_str(trunk_addr); + ), + + TP_printk( + "error=%ld (%s) main_addr=%s trunk_addr=%s", + -__entry->error, + show_nfs4_status(__entry->error), + __get_str(main_addr), + __get_str(trunk_addr) + ) +); + TRACE_EVENT(nfs4_sequence_done, TP_PROTO( const struct nfs4_session *session, @@ -243,6 +276,32 @@ TRACE_EVENT(nfs4_cb_offload, show_nfs_stable_how(__entry->cb_how) ) ); + +TRACE_EVENT(pnfs_ds_connect, + TP_PROTO( + char *ds_remotestr, + int status + ), + + TP_ARGS(ds_remotestr, status), + + TP_STRUCT__entry( + __string(ds_ips, ds_remotestr) + __field(int, status) + ), + + TP_fast_assign( + __assign_str(ds_ips); + __entry->status = status; + ), + + TP_printk( + "ds_ips=%s, status=%d", + __get_str(ds_ips), + __entry->status + ) +); + #endif /* CONFIG_NFS_V4_1 */ TRACE_EVENT(nfs4_setup_sequence, @@ -335,7 +394,7 @@ TRACE_EVENT(nfs4_state_mgr, TP_fast_assign( __entry->state = clp->cl_state; - __assign_str(hostname, clp->cl_hostname); + __assign_str(hostname); ), TP_printk( @@ -363,8 +422,8 @@ TRACE_EVENT(nfs4_state_mgr_failed, TP_fast_assign( __entry->error = status < 0 ? -status : 0; __entry->state = clp->cl_state; - __assign_str(hostname, clp->cl_hostname); - __assign_str(section, section); + __assign_str(hostname); + __assign_str(section); ), TP_printk( @@ -548,7 +607,7 @@ DECLARE_EVENT_CLASS(nfs4_open_event, __entry->fhandle = 0; } __entry->dir = NFS_FILEID(d_inode(ctx->dentry->d_parent)); - __assign_str(name, ctx->dentry->d_name.name); + __assign_str(name); ), TP_printk( @@ -699,7 +758,7 @@ DECLARE_EVENT_CLASS(nfs4_lock_event, __entry->error = error < 0 ? -error : 0; __entry->cmd = cmd; - __entry->type = request->fl_type; + __entry->type = request->c.flc_type; __entry->start = request->fl_start; __entry->end = request->fl_end; __entry->dev = inode->i_sb->s_dev; @@ -771,7 +830,7 @@ TRACE_EVENT(nfs4_set_lock, __entry->error = error < 0 ? -error : 0; __entry->cmd = cmd; - __entry->type = request->fl_type; + __entry->type = request->c.flc_type; __entry->start = request->fl_start; __entry->end = request->fl_end; __entry->dev = inode->i_sb->s_dev; @@ -926,6 +985,52 @@ DECLARE_EVENT_CLASS(nfs4_set_delegation_event, TP_ARGS(inode, fmode)) DEFINE_NFS4_SET_DELEGATION_EVENT(nfs4_set_delegation); DEFINE_NFS4_SET_DELEGATION_EVENT(nfs4_reclaim_delegation); +DEFINE_NFS4_SET_DELEGATION_EVENT(nfs4_detach_delegation); + +#define show_delegation_flags(flags) \ + __print_flags(flags, "|", \ + { BIT(NFS_DELEGATION_NEED_RECLAIM), "NEED_RECLAIM" }, \ + { BIT(NFS_DELEGATION_RETURN), "RETURN" }, \ + { BIT(NFS_DELEGATION_RETURN_IF_CLOSED), "RETURN_IF_CLOSED" }, \ + { BIT(NFS_DELEGATION_REFERENCED), "REFERENCED" }, \ + { BIT(NFS_DELEGATION_RETURNING), "RETURNING" }, \ + { BIT(NFS_DELEGATION_REVOKED), "REVOKED" }, \ + { BIT(NFS_DELEGATION_TEST_EXPIRED), "TEST_EXPIRED" }, \ + { BIT(NFS_DELEGATION_INODE_FREEING), "INODE_FREEING" }, \ + { BIT(NFS_DELEGATION_RETURN_DELAYED), "RETURN_DELAYED" }) + +DECLARE_EVENT_CLASS(nfs4_delegation_event, + TP_PROTO( + const struct nfs_delegation *delegation + ), + + TP_ARGS(delegation), + + TP_STRUCT__entry( + __field(u32, fhandle) + __field(unsigned int, fmode) + __field(unsigned long, flags) + ), + + TP_fast_assign( + __entry->fhandle = nfs_fhandle_hash(NFS_FH(delegation->inode)); + __entry->fmode = delegation->type; + __entry->flags = delegation->flags; + ), + + TP_printk( + "fhandle=0x%08x fmode=%s flags=%s", + __entry->fhandle, show_fs_fmode_flags(__entry->fmode), + show_delegation_flags(__entry->flags) + ) +); +#define DEFINE_NFS4_DELEGATION_EVENT(name) \ + DEFINE_EVENT(nfs4_delegation_event, name, \ + TP_PROTO( \ + const struct nfs_delegation *delegation \ + ), \ + TP_ARGS(delegation)) +DEFINE_NFS4_DELEGATION_EVENT(nfs_delegation_need_return); TRACE_EVENT(nfs4_delegreturn_exit, TP_PROTO( @@ -1042,7 +1147,7 @@ DECLARE_EVENT_CLASS(nfs4_lookup_event, __entry->dev = dir->i_sb->s_dev; __entry->dir = NFS_FILEID(dir); __entry->error = -error; - __assign_str(name, name->name); + __assign_str(name); ), TP_printk( @@ -1126,8 +1231,8 @@ TRACE_EVENT(nfs4_rename, __entry->olddir = NFS_FILEID(olddir); __entry->newdir = NFS_FILEID(newdir); __entry->error = error < 0 ? -error : 0; - __assign_str(oldname, oldname->name); - __assign_str(newname, newname->name); + __assign_str(oldname); + __assign_str(newname); ), TP_printk( @@ -1329,7 +1434,7 @@ DECLARE_EVENT_CLASS(nfs4_inode_callback_event, __entry->fileid = 0; __entry->dev = 0; } - __assign_str(dstaddr, clp ? clp->cl_hostname : "unknown"); + __assign_str(dstaddr); ), TP_printk( @@ -1386,7 +1491,7 @@ DECLARE_EVENT_CLASS(nfs4_inode_stateid_callback_event, __entry->fileid = 0; __entry->dev = 0; } - __assign_str(dstaddr, clp ? clp->cl_hostname : "unknown"); + __assign_str(dstaddr); __entry->stateid_seq = be32_to_cpu(stateid->seqid); __entry->stateid_hash = @@ -1419,6 +1524,63 @@ DECLARE_EVENT_CLASS(nfs4_inode_stateid_callback_event, DEFINE_NFS4_INODE_STATEID_CALLBACK_EVENT(nfs4_cb_recall); DEFINE_NFS4_INODE_STATEID_CALLBACK_EVENT(nfs4_cb_layoutrecall_file); +#define show_stateid_type(type) \ + __print_symbolic(type, \ + { NFS4_INVALID_STATEID_TYPE, "INVALID" }, \ + { NFS4_SPECIAL_STATEID_TYPE, "SPECIAL" }, \ + { NFS4_OPEN_STATEID_TYPE, "OPEN" }, \ + { NFS4_LOCK_STATEID_TYPE, "LOCK" }, \ + { NFS4_DELEGATION_STATEID_TYPE, "DELEGATION" }, \ + { NFS4_LAYOUT_STATEID_TYPE, "LAYOUT" }, \ + { NFS4_PNFS_DS_STATEID_TYPE, "PNFS_DS" }, \ + { NFS4_REVOKED_STATEID_TYPE, "REVOKED" }, \ + { NFS4_FREED_STATEID_TYPE, "FREED" }) + +DECLARE_EVENT_CLASS(nfs4_match_stateid_event, + TP_PROTO( + const nfs4_stateid *s1, + const nfs4_stateid *s2 + ), + + TP_ARGS(s1, s2), + + TP_STRUCT__entry( + __field(int, s1_seq) + __field(int, s2_seq) + __field(u32, s1_hash) + __field(u32, s2_hash) + __field(int, s1_type) + __field(int, s2_type) + ), + + TP_fast_assign( + __entry->s1_seq = s1->seqid; + __entry->s1_hash = nfs_stateid_hash(s1); + __entry->s1_type = s1->type; + __entry->s2_seq = s2->seqid; + __entry->s2_hash = nfs_stateid_hash(s2); + __entry->s2_type = s2->type; + ), + + TP_printk( + "s1=%s:%x:%u s2=%s:%x:%u", + show_stateid_type(__entry->s1_type), + __entry->s1_hash, __entry->s1_seq, + show_stateid_type(__entry->s2_type), + __entry->s2_hash, __entry->s2_seq + ) +); + +#define DEFINE_NFS4_MATCH_STATEID_EVENT(name) \ + DEFINE_EVENT(nfs4_match_stateid_event, name, \ + TP_PROTO( \ + const nfs4_stateid *s1, \ + const nfs4_stateid *s2 \ + ), \ + TP_ARGS(s1, s2)) +DEFINE_NFS4_MATCH_STATEID_EVENT(nfs41_match_stateid); +DEFINE_NFS4_MATCH_STATEID_EVENT(nfs4_match_stateid); + DECLARE_EVENT_CLASS(nfs4_idmap_event, TP_PROTO( const char *name, @@ -1930,7 +2092,7 @@ DECLARE_EVENT_CLASS(nfs4_deviceid_event, ), TP_fast_assign( - __assign_str(dstaddr, clp->cl_hostname); + __assign_str(dstaddr); memcpy(__entry->deviceid, deviceid->data, NFS4_DEVICEID4_SIZE); ), @@ -1968,7 +2130,7 @@ DECLARE_EVENT_CLASS(nfs4_deviceid_status, TP_fast_assign( __entry->dev = server->s_dev; __entry->status = status; - __assign_str(dstaddr, server->nfs_client->cl_hostname); + __assign_str(dstaddr); memcpy(__entry->deviceid, deviceid->data, NFS4_DEVICEID4_SIZE); ), @@ -1991,15 +2153,45 @@ DECLARE_EVENT_CLASS(nfs4_deviceid_status, DEFINE_PNFS_DEVICEID_STATUS(nfs4_getdeviceinfo); DEFINE_PNFS_DEVICEID_STATUS(nfs4_find_deviceid); +TRACE_EVENT(fl_getdevinfo, + TP_PROTO( + const struct nfs_server *server, + const struct nfs4_deviceid *deviceid, + char *ds_remotestr + ), + TP_ARGS(server, deviceid, ds_remotestr), + + TP_STRUCT__entry( + __string(mds_addr, server->nfs_client->cl_hostname) + __array(unsigned char, deviceid, NFS4_DEVICEID4_SIZE) + __string(ds_ips, ds_remotestr) + ), + + TP_fast_assign( + __assign_str(mds_addr); + __assign_str(ds_ips); + memcpy(__entry->deviceid, deviceid->data, + NFS4_DEVICEID4_SIZE); + ), + TP_printk( + "deviceid=%s, mds_addr=%s, ds_ips=%s", + __print_hex(__entry->deviceid, NFS4_DEVICEID4_SIZE), + __get_str(mds_addr), + __get_str(ds_ips) + ) +); + DECLARE_EVENT_CLASS(nfs4_flexfiles_io_event, TP_PROTO( - const struct nfs_pgio_header *hdr + const struct nfs_pgio_header *hdr, + int error ), - TP_ARGS(hdr), + TP_ARGS(hdr, error), TP_STRUCT__entry( __field(unsigned long, error) + __field(unsigned long, nfs_error) __field(dev_t, dev) __field(u32, fhandle) __field(u64, fileid) @@ -2015,7 +2207,8 @@ DECLARE_EVENT_CLASS(nfs4_flexfiles_io_event, TP_fast_assign( const struct inode *inode = hdr->inode; - __entry->error = hdr->res.op_status; + __entry->error = -error; + __entry->nfs_error = hdr->res.op_status; __entry->fhandle = nfs_fhandle_hash(hdr->args.fh); __entry->fileid = NFS_FILEID(inode); __entry->dev = inode->i_sb->s_dev; @@ -2025,14 +2218,13 @@ DECLARE_EVENT_CLASS(nfs4_flexfiles_io_event, be32_to_cpu(hdr->args.stateid.seqid); __entry->stateid_hash = nfs_stateid_hash(&hdr->args.stateid); - __assign_str(dstaddr, hdr->ds_clp ? - rpc_peeraddr2str(hdr->ds_clp->cl_rpcclient, - RPC_DISPLAY_ADDR) : "unknown"); + __assign_str(dstaddr); ), TP_printk( "error=%ld (%s) fileid=%02x:%02x:%llu fhandle=0x%08x " - "offset=%llu count=%u stateid=%d:0x%08x dstaddr=%s", + "offset=%llu count=%u stateid=%d:0x%08x dstaddr=%s " + "nfs_error=%lu (%s)", -__entry->error, show_nfs4_status(__entry->error), MAJOR(__entry->dev), MINOR(__entry->dev), @@ -2040,28 +2232,32 @@ DECLARE_EVENT_CLASS(nfs4_flexfiles_io_event, __entry->fhandle, __entry->offset, __entry->count, __entry->stateid_seq, __entry->stateid_hash, - __get_str(dstaddr) + __get_str(dstaddr), __entry->nfs_error, + show_nfs4_status(__entry->nfs_error) ) ); #define DEFINE_NFS4_FLEXFILES_IO_EVENT(name) \ DEFINE_EVENT(nfs4_flexfiles_io_event, name, \ TP_PROTO( \ - const struct nfs_pgio_header *hdr \ + const struct nfs_pgio_header *hdr, \ + int error \ ), \ - TP_ARGS(hdr)) + TP_ARGS(hdr, error)) DEFINE_NFS4_FLEXFILES_IO_EVENT(ff_layout_read_error); DEFINE_NFS4_FLEXFILES_IO_EVENT(ff_layout_write_error); TRACE_EVENT(ff_layout_commit_error, TP_PROTO( - const struct nfs_commit_data *data + const struct nfs_commit_data *data, + int error ), - TP_ARGS(data), + TP_ARGS(data, error), TP_STRUCT__entry( __field(unsigned long, error) + __field(unsigned long, nfs_error) __field(dev_t, dev) __field(u32, fhandle) __field(u64, fileid) @@ -2075,30 +2271,152 @@ TRACE_EVENT(ff_layout_commit_error, TP_fast_assign( const struct inode *inode = data->inode; - __entry->error = data->res.op_status; + __entry->error = -error; + __entry->nfs_error = data->res.op_status; __entry->fhandle = nfs_fhandle_hash(data->args.fh); __entry->fileid = NFS_FILEID(inode); __entry->dev = inode->i_sb->s_dev; __entry->offset = data->args.offset; __entry->count = data->args.count; - __assign_str(dstaddr, data->ds_clp ? - rpc_peeraddr2str(data->ds_clp->cl_rpcclient, - RPC_DISPLAY_ADDR) : "unknown"); + __assign_str(dstaddr); ), TP_printk( "error=%ld (%s) fileid=%02x:%02x:%llu fhandle=0x%08x " - "offset=%llu count=%u dstaddr=%s", + "offset=%llu count=%u dstaddr=%s nfs_error=%lu (%s)", -__entry->error, show_nfs4_status(__entry->error), MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->fileid, __entry->fhandle, __entry->offset, __entry->count, - __get_str(dstaddr) + __get_str(dstaddr), __entry->nfs_error, + show_nfs4_status(__entry->nfs_error) + ) +); + +TRACE_EVENT(bl_ext_tree_prepare_commit, + TP_PROTO( + int ret, + size_t count, + u64 lwb, + bool not_all_ranges + ), + + TP_ARGS(ret, count, lwb, not_all_ranges), + + TP_STRUCT__entry( + __field(int, ret) + __field(size_t, count) + __field(u64, lwb) + __field(bool, not_all_ranges) + ), + + TP_fast_assign( + __entry->ret = ret; + __entry->count = count; + __entry->lwb = lwb; + __entry->not_all_ranges = not_all_ranges; + ), + + TP_printk( + "ret=%d, found %zu ranges, lwb=%llu%s", + __entry->ret, + __entry->count, + __entry->lwb, + __entry->not_all_ranges ? ", not all ranges encoded" : + "" ) ); +DECLARE_EVENT_CLASS(pnfs_bl_pr_key_class, + TP_PROTO( + const struct block_device *bdev, + u64 key + ), + TP_ARGS(bdev, key), + TP_STRUCT__entry( + __field(u64, key) + __field(dev_t, dev) + __string(device, bdev->bd_disk->disk_name) + ), + TP_fast_assign( + __entry->key = key; + __entry->dev = bdev->bd_dev; + __assign_str(device); + ), + TP_printk("dev=%d,%d (%s) key=0x%016llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __get_str(device), __entry->key + ) +); + +#define DEFINE_NFS4_BLOCK_PRKEY_EVENT(name) \ + DEFINE_EVENT(pnfs_bl_pr_key_class, name, \ + TP_PROTO( \ + const struct block_device *bdev, \ + u64 key \ + ), \ + TP_ARGS(bdev, key)) +DEFINE_NFS4_BLOCK_PRKEY_EVENT(bl_pr_key_reg); +DEFINE_NFS4_BLOCK_PRKEY_EVENT(bl_pr_key_unreg); + +/* + * From uapi/linux/pr.h + */ +TRACE_DEFINE_ENUM(PR_STS_SUCCESS); +TRACE_DEFINE_ENUM(PR_STS_IOERR); +TRACE_DEFINE_ENUM(PR_STS_RESERVATION_CONFLICT); +TRACE_DEFINE_ENUM(PR_STS_RETRY_PATH_FAILURE); +TRACE_DEFINE_ENUM(PR_STS_PATH_FAST_FAILED); +TRACE_DEFINE_ENUM(PR_STS_PATH_FAILED); + +#define show_pr_status(x) \ + __print_symbolic(x, \ + { PR_STS_SUCCESS, "SUCCESS" }, \ + { PR_STS_IOERR, "IOERR" }, \ + { PR_STS_RESERVATION_CONFLICT, "RESERVATION_CONFLICT" }, \ + { PR_STS_RETRY_PATH_FAILURE, "RETRY_PATH_FAILURE" }, \ + { PR_STS_PATH_FAST_FAILED, "PATH_FAST_FAILED" }, \ + { PR_STS_PATH_FAILED, "PATH_FAILED" }) + +DECLARE_EVENT_CLASS(pnfs_bl_pr_key_err_class, + TP_PROTO( + const struct block_device *bdev, + u64 key, + int status + ), + TP_ARGS(bdev, key, status), + TP_STRUCT__entry( + __field(u64, key) + __field(dev_t, dev) + __field(unsigned long, status) + __string(device, bdev->bd_disk->disk_name) + ), + TP_fast_assign( + __entry->key = key; + __entry->dev = bdev->bd_dev; + __entry->status = status; + __assign_str(device); + ), + TP_printk("dev=%d,%d (%s) key=0x%016llx status=%s", + MAJOR(__entry->dev), MINOR(__entry->dev), + __get_str(device), __entry->key, + show_pr_status(__entry->status) + ) +); + +#define DEFINE_NFS4_BLOCK_PRKEY_ERR_EVENT(name) \ + DEFINE_EVENT(pnfs_bl_pr_key_err_class, name, \ + TP_PROTO( \ + const struct block_device *bdev, \ + u64 key, \ + int status \ + ), \ + TP_ARGS(bdev, key, status)) +DEFINE_NFS4_BLOCK_PRKEY_ERR_EVENT(bl_pr_key_reg_err); +DEFINE_NFS4_BLOCK_PRKEY_ERR_EVENT(bl_pr_key_unreg_err); + #ifdef CONFIG_NFS_V4_2 TRACE_DEFINE_ENUM(NFS4_CONTENT_DATA); TRACE_DEFINE_ENUM(NFS4_CONTENT_HOLE); @@ -2466,7 +2784,7 @@ TRACE_EVENT(nfs4_copy_notify, ) ); -TRACE_EVENT(nfs4_offload_cancel, +DECLARE_EVENT_CLASS(nfs4_offload_class, TP_PROTO( const struct nfs42_offload_status_args *args, int error @@ -2498,6 +2816,15 @@ TRACE_EVENT(nfs4_offload_cancel, __entry->stateid_seq, __entry->stateid_hash ) ); +#define DEFINE_NFS4_OFFLOAD_EVENT(name) \ + DEFINE_EVENT(nfs4_offload_class, name, \ + TP_PROTO( \ + const struct nfs42_offload_status_args *args, \ + int error \ + ), \ + TP_ARGS(args, error)) +DEFINE_NFS4_OFFLOAD_EVENT(nfs4_offload_cancel); +DEFINE_NFS4_OFFLOAD_EVENT(nfs4_offload_status); DECLARE_EVENT_CLASS(nfs4_xattr_event, TP_PROTO( @@ -2521,7 +2848,7 @@ DECLARE_EVENT_CLASS(nfs4_xattr_event, __entry->dev = inode->i_sb->s_dev; __entry->fileid = NFS_FILEID(inode); __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode)); - __assign_str(name, name); + __assign_str(name); ), TP_printk( diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index deec76cf5afe..1d0e6c10f921 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -52,6 +52,7 @@ #include <linux/nfs.h> #include <linux/nfs4.h> #include <linux/nfs_fs.h> +#include <linux/nfs_common.h> #include "nfs4_fs.h" #include "nfs4trace.h" @@ -63,11 +64,7 @@ #define NFSDBG_FACILITY NFSDBG_XDR -/* Mapping from NFS error code to "errno" error code. */ -#define errno_NFSERR_IO EIO - struct compound_hdr; -static int nfs4_stat_to_errno(int); static void encode_layoutget(struct xdr_stream *xdr, const struct nfs4_layoutget_args *args, struct compound_hdr *hdr); @@ -85,9 +82,8 @@ static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req, * we currently use size 2 (u64) out of (NFS4_OPAQUE_LIMIT >> 2) */ #define pagepad_maxsz (1) -#define open_owner_id_maxsz (1 + 2 + 1 + 1 + 2) -#define lock_owner_id_maxsz (1 + 1 + 4) -#define decode_lockowner_maxsz (1 + XDR_QUADLEN(IDMAP_NAMESZ)) +#define open_owner_id_maxsz (2 + 1 + 2 + 2) +#define lock_owner_id_maxsz (2 + 1 + 2) #define compound_encode_hdr_maxsz (3 + (NFS4_MAXTAGLEN >> 2)) #define compound_decode_hdr_maxsz (3 + (NFS4_MAXTAGLEN >> 2)) #define op_encode_hdr_maxsz (1) @@ -188,7 +184,7 @@ static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req, #define encode_claim_null_maxsz (1 + nfs4_name_maxsz) #define encode_open_maxsz (op_encode_hdr_maxsz + \ 2 + encode_share_access_maxsz + 2 + \ - open_owner_id_maxsz + \ + 1 + open_owner_id_maxsz + \ encode_opentype_maxsz + \ encode_claim_null_maxsz) #define decode_space_limit_maxsz (3) @@ -224,6 +220,11 @@ static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req, encode_attrs_maxsz) #define decode_setattr_maxsz (op_decode_hdr_maxsz + \ nfs4_fattr_bitmap_maxsz) +#define encode_delegattr_maxsz (op_encode_hdr_maxsz + \ + encode_stateid_maxsz + \ + nfs4_fattr_bitmap_maxsz + \ + 2*nfstime4_maxsz) +#define decode_delegattr_maxsz (decode_setattr_maxsz) #define encode_read_maxsz (op_encode_hdr_maxsz + \ encode_stateid_maxsz + 3) #define decode_read_maxsz (op_decode_hdr_maxsz + 2 + pagepad_maxsz) @@ -253,13 +254,14 @@ static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req, #define encode_link_maxsz (op_encode_hdr_maxsz + \ nfs4_name_maxsz) #define decode_link_maxsz (op_decode_hdr_maxsz + decode_change_info_maxsz) -#define encode_lockowner_maxsz (7) +#define encode_lockowner_maxsz (2 + 1 + lock_owner_id_maxsz) + #define encode_lock_maxsz (op_encode_hdr_maxsz + \ 7 + \ 1 + encode_stateid_maxsz + 1 + \ encode_lockowner_maxsz) #define decode_lock_denied_maxsz \ - (8 + decode_lockowner_maxsz) + (2 + 2 + 1 + 2 + 1 + lock_owner_id_maxsz) #define decode_lock_maxsz (op_decode_hdr_maxsz + \ decode_lock_denied_maxsz) #define encode_lockt_maxsz (op_encode_hdr_maxsz + 5 + \ @@ -615,7 +617,7 @@ static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req, encode_lockowner_maxsz) #define NFS4_dec_release_lockowner_sz \ (compound_decode_hdr_maxsz + \ - decode_lockowner_maxsz) + decode_release_lockowner_maxsz) #define NFS4_enc_access_sz (compound_encode_hdr_maxsz + \ encode_sequence_maxsz + \ encode_putfh_maxsz + \ @@ -758,12 +760,14 @@ static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req, encode_sequence_maxsz + \ encode_putfh_maxsz + \ encode_layoutreturn_maxsz + \ + encode_delegattr_maxsz + \ encode_delegreturn_maxsz + \ encode_getattr_maxsz) #define NFS4_dec_delegreturn_sz (compound_decode_hdr_maxsz + \ decode_sequence_maxsz + \ decode_putfh_maxsz + \ decode_layoutreturn_maxsz + \ + decode_delegattr_maxsz + \ decode_delegreturn_maxsz + \ decode_getattr_maxsz) #define NFS4_enc_getacl_sz (compound_encode_hdr_maxsz + \ @@ -968,11 +972,6 @@ static __be32 *reserve_space(struct xdr_stream *xdr, size_t nbytes) return p; } -static void encode_opaque_fixed(struct xdr_stream *xdr, const void *buf, size_t len) -{ - WARN_ON_ONCE(xdr_stream_encode_opaque_fixed(xdr, buf, len) < 0); -} - static void encode_string(struct xdr_stream *xdr, unsigned int len, const char *str) { WARN_ON_ONCE(xdr_stream_encode_opaque(xdr, str, len) < 0); @@ -1060,9 +1059,10 @@ static void encode_nops(struct compound_hdr *hdr) *hdr->nops_p = htonl(hdr->nops); } -static void encode_nfs4_stateid(struct xdr_stream *xdr, const nfs4_stateid *stateid) +static void encode_nfs4_stateid(struct xdr_stream *xdr, + const nfs4_stateid *stateid) { - encode_opaque_fixed(xdr, stateid, NFS4_STATEID_SIZE); + encode_opaque_fixed(xdr, stateid->data, NFS4_STATEID_SIZE); } static void encode_nfs4_verifier(struct xdr_stream *xdr, const nfs4_verifier *verf) @@ -1305,7 +1305,7 @@ static void encode_link(struct xdr_stream *xdr, const struct qstr *name, struct static inline int nfs4_lock_type(struct file_lock *fl, int block) { - if (fl->fl_type == F_RDLCK) + if (lock_is_read(fl)) return block ? NFS4_READW_LT : NFS4_READ_LT; return block ? NFS4_WRITEW_LT : NFS4_WRITE_LT; } @@ -1412,16 +1412,16 @@ static inline void encode_openhdr(struct xdr_stream *xdr, const struct nfs_opena __be32 *p; /* * opcode 4, seqid 4, share_access 4, share_deny 4, clientid 8, ownerlen 4, - * owner 4 = 32 + * owner 28 */ encode_nfs4_seqid(xdr, arg->seqid); encode_share_access(xdr, arg->share_access); - p = reserve_space(xdr, 36); + p = reserve_space(xdr, 40); p = xdr_encode_hyper(p, arg->clientid); - *p++ = cpu_to_be32(24); + *p++ = cpu_to_be32(28); p = xdr_encode_opaque_fixed(p, "open id:", 8); *p++ = cpu_to_be32(arg->server->s_dev); - *p++ = cpu_to_be32(arg->id.uniquifier); + p = xdr_encode_hyper(p, arg->id.uniquifier); xdr_encode_hyper(p, arg->id.create_time); } @@ -1468,20 +1468,18 @@ static void encode_opentype(struct xdr_stream *xdr, const struct nfs_openargs *a } } -static inline void encode_delegation_type(struct xdr_stream *xdr, fmode_t delegation_type) +static inline void encode_delegation_type(struct xdr_stream *xdr, u32 delegation_type) { __be32 *p; p = reserve_space(xdr, 4); switch (delegation_type) { - case 0: - *p = cpu_to_be32(NFS4_OPEN_DELEGATE_NONE); - break; - case FMODE_READ: - *p = cpu_to_be32(NFS4_OPEN_DELEGATE_READ); - break; - case FMODE_WRITE|FMODE_READ: - *p = cpu_to_be32(NFS4_OPEN_DELEGATE_WRITE); + case NFS4_OPEN_DELEGATE_NONE: + case NFS4_OPEN_DELEGATE_READ: + case NFS4_OPEN_DELEGATE_WRITE: + case NFS4_OPEN_DELEGATE_READ_ATTRS_DELEG: + case NFS4_OPEN_DELEGATE_WRITE_ATTRS_DELEG: + *p = cpu_to_be32(delegation_type); break; default: BUG(); @@ -1497,7 +1495,7 @@ static inline void encode_claim_null(struct xdr_stream *xdr, const struct qstr * encode_string(xdr, name->len, name->name); } -static inline void encode_claim_previous(struct xdr_stream *xdr, fmode_t type) +static inline void encode_claim_previous(struct xdr_stream *xdr, u32 type) { __be32 *p; @@ -1602,7 +1600,8 @@ static void encode_read(struct xdr_stream *xdr, const struct nfs_pgio_args *args static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg *readdir, struct rpc_rqst *req, struct compound_hdr *hdr) { uint32_t attrs[3] = { - FATTR4_WORD0_RDATTR_ERROR, + FATTR4_WORD0_TYPE + | FATTR4_WORD0_RDATTR_ERROR, FATTR4_WORD1_MOUNTED_ON_FILEID, }; uint32_t dircount = readdir->count; @@ -1612,12 +1611,21 @@ static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg unsigned int i; if (readdir->plus) { - attrs[0] |= FATTR4_WORD0_TYPE|FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE| - FATTR4_WORD0_FSID|FATTR4_WORD0_FILEHANDLE|FATTR4_WORD0_FILEID; - attrs[1] |= FATTR4_WORD1_MODE|FATTR4_WORD1_NUMLINKS|FATTR4_WORD1_OWNER| - FATTR4_WORD1_OWNER_GROUP|FATTR4_WORD1_RAWDEV| - FATTR4_WORD1_SPACE_USED|FATTR4_WORD1_TIME_ACCESS| - FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY; + attrs[0] |= FATTR4_WORD0_CHANGE + | FATTR4_WORD0_SIZE + | FATTR4_WORD0_FSID + | FATTR4_WORD0_FILEHANDLE + | FATTR4_WORD0_FILEID; + attrs[1] |= FATTR4_WORD1_MODE + | FATTR4_WORD1_NUMLINKS + | FATTR4_WORD1_OWNER + | FATTR4_WORD1_OWNER_GROUP + | FATTR4_WORD1_RAWDEV + | FATTR4_WORD1_SPACE_USED + | FATTR4_WORD1_TIME_ACCESS + | FATTR4_WORD1_TIME_CREATE + | FATTR4_WORD1_TIME_METADATA + | FATTR4_WORD1_TIME_MODIFY; attrs[2] |= FATTR4_WORD2_SECURITY_LABEL; } /* Use mounted_on_fileid only if the server supports it */ @@ -1726,6 +1734,33 @@ static void encode_setattr(struct xdr_stream *xdr, const struct nfs_setattrargs server->attr_bitmask); } +static void encode_delegattr(struct xdr_stream *xdr, + const nfs4_stateid *stateid, + const struct nfs4_delegattr *attr, + struct compound_hdr *hdr) +{ + uint32_t bitmap[3] = { 0 }; + uint32_t len = 0; + __be32 *p; + + encode_op_hdr(xdr, OP_SETATTR, encode_delegattr_maxsz, hdr); + encode_nfs4_stateid(xdr, stateid); + if (attr->atime_set) { + bitmap[2] |= FATTR4_WORD2_TIME_DELEG_ACCESS; + len += (nfstime4_maxsz << 2); + } + if (attr->mtime_set) { + bitmap[2] |= FATTR4_WORD2_TIME_DELEG_MODIFY; + len += (nfstime4_maxsz << 2); + } + xdr_encode_bitmap4(xdr, bitmap, ARRAY_SIZE(bitmap)); + xdr_stream_encode_opaque_inline(xdr, (void **)&p, len); + if (bitmap[2] & FATTR4_WORD2_TIME_DELEG_ACCESS) + p = xdr_encode_nfstime4(p, &attr->atime); + if (bitmap[2] & FATTR4_WORD2_TIME_DELEG_MODIFY) + p = xdr_encode_nfstime4(p, &attr->mtime); +} + static void encode_setclientid(struct xdr_stream *xdr, const struct nfs4_setclientid *setclientid, struct compound_hdr *hdr) { __be32 *p; @@ -2096,7 +2131,7 @@ static void encode_test_stateid(struct xdr_stream *xdr, { encode_op_hdr(xdr, OP_TEST_STATEID, decode_test_stateid_maxsz, hdr); encode_uint32(xdr, 1); - encode_nfs4_stateid(xdr, args->stateid); + encode_nfs4_stateid(xdr, &args->stateid); } static void encode_free_stateid(struct xdr_stream *xdr, @@ -2803,6 +2838,8 @@ static void nfs4_xdr_enc_delegreturn(struct rpc_rqst *req, encode_putfh(xdr, args->fhandle, &hdr); if (args->lr_args) encode_layoutreturn(xdr, args->lr_args, &hdr); + if (args->sattr_args) + encode_delegattr(xdr, args->stateid, args->sattr_args, &hdr); if (args->bitmask) encode_getfattr(xdr, args->bitmask, &hdr); encode_delegreturn(xdr, args->stateid, &hdr); @@ -3403,7 +3440,7 @@ static int decode_attr_link_support(struct xdr_stream *xdr, uint32_t *bitmap, ui *res = be32_to_cpup(p); bitmap[0] &= ~FATTR4_WORD0_LINK_SUPPORT; } - dprintk("%s: link support=%s\n", __func__, *res == 0 ? "false" : "true"); + dprintk("%s: link support=%s\n", __func__, str_false_true(*res == 0)); return 0; } @@ -3421,7 +3458,7 @@ static int decode_attr_symlink_support(struct xdr_stream *xdr, uint32_t *bitmap, *res = be32_to_cpup(p); bitmap[0] &= ~FATTR4_WORD0_SYMLINK_SUPPORT; } - dprintk("%s: symlink support=%s\n", __func__, *res == 0 ? "false" : "true"); + dprintk("%s: symlink support=%s\n", __func__, str_false_true(*res == 0)); return 0; } @@ -3563,7 +3600,7 @@ static int decode_attr_case_insensitive(struct xdr_stream *xdr, uint32_t *bitmap *res = be32_to_cpup(p); bitmap[0] &= ~FATTR4_WORD0_CASE_INSENSITIVE; } - dprintk("%s: case_insensitive=%s\n", __func__, *res == 0 ? "false" : "true"); + dprintk("%s: case_insensitive=%s\n", __func__, str_false_true(*res == 0)); return 0; } @@ -3581,7 +3618,7 @@ static int decode_attr_case_preserving(struct xdr_stream *xdr, uint32_t *bitmap, *res = be32_to_cpup(p); bitmap[0] &= ~FATTR4_WORD0_CASE_PRESERVING; } - dprintk("%s: case_preserving=%s\n", __func__, *res == 0 ? "false" : "true"); + dprintk("%s: case_preserving=%s\n", __func__, str_false_true(*res == 0)); return 0; } @@ -4171,6 +4208,24 @@ static int decode_attr_time_access(struct xdr_stream *xdr, uint32_t *bitmap, str return status; } +static int decode_attr_time_create(struct xdr_stream *xdr, uint32_t *bitmap, struct timespec64 *time) +{ + int status = 0; + + time->tv_sec = 0; + time->tv_nsec = 0; + if (unlikely(bitmap[1] & (FATTR4_WORD1_TIME_CREATE - 1U))) + return -EIO; + if (likely(bitmap[1] & FATTR4_WORD1_TIME_CREATE)) { + status = decode_attr_time(xdr, time); + if (status == 0) + status = NFS_ATTR_FATTR_BTIME; + bitmap[1] &= ~FATTR4_WORD1_TIME_CREATE; + } + dprintk("%s: btime=%lld\n", __func__, time->tv_sec); + return status; +} + static int decode_attr_time_metadata(struct xdr_stream *xdr, uint32_t *bitmap, struct timespec64 *time) { int status = 0; @@ -4289,8 +4344,29 @@ static int decode_attr_xattrsupport(struct xdr_stream *xdr, uint32_t *bitmap, *res = be32_to_cpup(p); bitmap[2] &= ~FATTR4_WORD2_XATTR_SUPPORT; } - dprintk("%s: XATTR support=%s\n", __func__, - *res == 0 ? "false" : "true"); + dprintk("%s: XATTR support=%s\n", __func__, str_false_true(*res == 0)); + return 0; +} + +static int decode_attr_open_arguments(struct xdr_stream *xdr, uint32_t *bitmap, + struct nfs4_open_caps *res) +{ + memset(res, 0, sizeof(*res)); + if (unlikely(bitmap[2] & (FATTR4_WORD2_OPEN_ARGUMENTS - 1U))) + return -EIO; + if (likely(bitmap[2] & FATTR4_WORD2_OPEN_ARGUMENTS)) { + if (decode_bitmap4(xdr, res->oa_share_access, ARRAY_SIZE(res->oa_share_access)) < 0) + return -EIO; + if (decode_bitmap4(xdr, res->oa_share_deny, ARRAY_SIZE(res->oa_share_deny)) < 0) + return -EIO; + if (decode_bitmap4(xdr, res->oa_share_access_want, ARRAY_SIZE(res->oa_share_access_want)) < 0) + return -EIO; + if (decode_bitmap4(xdr, res->oa_open_claim, ARRAY_SIZE(res->oa_open_claim)) < 0) + return -EIO; + if (decode_bitmap4(xdr, res->oa_createmode, ARRAY_SIZE(res->oa_createmode)) < 0) + return -EIO; + bitmap[2] &= ~FATTR4_WORD2_OPEN_ARGUMENTS; + } return 0; } @@ -4343,14 +4419,6 @@ static int decode_access(struct xdr_stream *xdr, u32 *supported, u32 *access) return 0; } -static int decode_opaque_fixed(struct xdr_stream *xdr, void *buf, size_t len) -{ - ssize_t ret = xdr_stream_decode_opaque_fixed(xdr, buf, len); - if (unlikely(ret < 0)) - return -EIO; - return 0; -} - static int decode_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid) { return decode_opaque_fixed(xdr, stateid, NFS4_STATEID_SIZE); @@ -4468,6 +4536,8 @@ static int decode_server_caps(struct xdr_stream *xdr, struct nfs4_server_caps_re if ((status = decode_attr_exclcreat_supported(xdr, bitmap, res->exclcreat_bitmask)) != 0) goto xdr_error; + if ((status = decode_attr_open_arguments(xdr, bitmap, &res->open_caps)) != 0) + goto xdr_error; status = verify_attr_len(xdr, savep, attrlen); xdr_error: dprintk("%s: xdr returned %d!\n", __func__, -status); @@ -4730,6 +4800,11 @@ static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap, goto xdr_error; fattr->valid |= status; + status = decode_attr_time_create(xdr, bitmap, &fattr->btime); + if (status < 0) + goto xdr_error; + fattr->valid |= status; + status = decode_attr_time_metadata(xdr, bitmap, &fattr->ctime); if (status < 0) goto xdr_error; @@ -4855,7 +4930,7 @@ static int decode_attr_pnfstype(struct xdr_stream *xdr, uint32_t *bitmap, } /* - * The prefered block size for layout directed io + * The preferred block size for layout directed io */ static int decode_attr_layout_blksize(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) @@ -5026,7 +5101,7 @@ static int decode_link(struct xdr_stream *xdr, struct nfs4_change_info *cinfo) /* * We create the owner, so we know a proper owner.id length is 4. */ -static int decode_lock_denied (struct xdr_stream *xdr, struct file_lock *fl) +static int decode_lock_denied(struct xdr_stream *xdr, struct file_lock *fl) { uint64_t offset, length, clientid; __be32 *p; @@ -5043,10 +5118,10 @@ static int decode_lock_denied (struct xdr_stream *xdr, struct file_lock *fl) fl->fl_end = fl->fl_start + (loff_t)length - 1; if (length == ~(uint64_t)0) fl->fl_end = OFFSET_MAX; - fl->fl_type = F_WRLCK; + fl->c.flc_type = F_WRLCK; if (type & 1) - fl->fl_type = F_RDLCK; - fl->fl_pid = 0; + fl->c.flc_type = F_RDLCK; + fl->c.flc_pid = 0; } p = xdr_decode_hyper(p, &clientid); /* read 8 bytes */ namelen = be32_to_cpup(p); /* read 4 bytes */ /* have read all 32 bytes now */ @@ -5139,13 +5214,12 @@ static int decode_space_limit(struct xdr_stream *xdr, } static int decode_rw_delegation(struct xdr_stream *xdr, - uint32_t delegation_type, - struct nfs_openres *res) + struct nfs4_open_delegation *res) { __be32 *p; int status; - status = decode_delegation_stateid(xdr, &res->delegation); + status = decode_delegation_stateid(xdr, &res->stateid); if (unlikely(status)) return status; p = xdr_inline_decode(xdr, 4); @@ -5153,52 +5227,57 @@ static int decode_rw_delegation(struct xdr_stream *xdr, return -EIO; res->do_recall = be32_to_cpup(p); - switch (delegation_type) { + switch (res->open_delegation_type) { case NFS4_OPEN_DELEGATE_READ: - res->delegation_type = FMODE_READ; + case NFS4_OPEN_DELEGATE_READ_ATTRS_DELEG: + res->type = FMODE_READ; break; case NFS4_OPEN_DELEGATE_WRITE: - res->delegation_type = FMODE_WRITE|FMODE_READ; + case NFS4_OPEN_DELEGATE_WRITE_ATTRS_DELEG: + res->type = FMODE_WRITE|FMODE_READ; if (decode_space_limit(xdr, &res->pagemod_limit) < 0) return -EIO; } return decode_ace(xdr, NULL); } -static int decode_no_delegation(struct xdr_stream *xdr, struct nfs_openres *res) +static int decode_no_delegation(struct xdr_stream *xdr, + struct nfs4_open_delegation *res) { __be32 *p; - uint32_t why_no_delegation; p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) return -EIO; - why_no_delegation = be32_to_cpup(p); - switch (why_no_delegation) { + res->why_no_delegation = be32_to_cpup(p); + switch (res->why_no_delegation) { case WND4_CONTENTION: case WND4_RESOURCE: - xdr_inline_decode(xdr, 4); - /* Ignore for now */ + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + return -EIO; + res->will_notify = be32_to_cpup(p); } return 0; } -static int decode_delegation(struct xdr_stream *xdr, struct nfs_openres *res) +static int decode_delegation(struct xdr_stream *xdr, + struct nfs4_open_delegation *res) { __be32 *p; - uint32_t delegation_type; p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) return -EIO; - delegation_type = be32_to_cpup(p); - res->delegation_type = 0; - switch (delegation_type) { + res->open_delegation_type = be32_to_cpup(p); + switch (res->open_delegation_type) { case NFS4_OPEN_DELEGATE_NONE: return 0; case NFS4_OPEN_DELEGATE_READ: case NFS4_OPEN_DELEGATE_WRITE: - return decode_rw_delegation(xdr, delegation_type, res); + case NFS4_OPEN_DELEGATE_READ_ATTRS_DELEG: + case NFS4_OPEN_DELEGATE_WRITE_ATTRS_DELEG: + return decode_rw_delegation(xdr, res); case NFS4_OPEN_DELEGATE_NONE_EXT: return decode_no_delegation(xdr, res); } @@ -5239,7 +5318,7 @@ static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res) for (; i < NFS4_BITMAP_SIZE; i++) res->attrset[i] = 0; - return decode_delegation(xdr, res); + return decode_delegation(xdr, &res->delegation); xdr_error: dprintk("%s: Bitmap too large! Length = %u\n", __func__, bmlen); return -EIO; @@ -5471,6 +5550,11 @@ static int decode_setattr(struct xdr_stream *xdr) return -EIO; } +static int decode_delegattr(struct xdr_stream *xdr) +{ + return decode_setattr(xdr); +} + static int decode_setclientid(struct xdr_stream *xdr, struct nfs4_setclientid_res *res) { __be32 *p; @@ -6501,7 +6585,7 @@ nfs4_xdr_dec_getacl(struct rpc_rqst *rqstp, struct xdr_stream *xdr, int status; if (res->acl_scratch != NULL) - xdr_set_scratch_page(xdr, res->acl_scratch); + xdr_set_scratch_folio(xdr, res->acl_scratch); status = decode_compound_hdr(xdr, &hdr); if (status) goto out; @@ -7043,6 +7127,12 @@ static int nfs4_xdr_dec_delegreturn(struct rpc_rqst *rqstp, if (status) goto out; } + if (res->sattr_res) { + status = decode_delegattr(xdr); + res->sattr_ret = status; + if (status) + goto out; + } if (res->fattr) { status = decode_getfattr(xdr, res->fattr, res->server); if (status != 0) @@ -7538,72 +7628,6 @@ int nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, return 0; } -/* - * We need to translate between nfs status return values and - * the local errno values which may not be the same. - */ -static struct { - int stat; - int errno; -} nfs_errtbl[] = { - { NFS4_OK, 0 }, - { NFS4ERR_PERM, -EPERM }, - { NFS4ERR_NOENT, -ENOENT }, - { NFS4ERR_IO, -errno_NFSERR_IO}, - { NFS4ERR_NXIO, -ENXIO }, - { NFS4ERR_ACCESS, -EACCES }, - { NFS4ERR_EXIST, -EEXIST }, - { NFS4ERR_XDEV, -EXDEV }, - { NFS4ERR_NOTDIR, -ENOTDIR }, - { NFS4ERR_ISDIR, -EISDIR }, - { NFS4ERR_INVAL, -EINVAL }, - { NFS4ERR_FBIG, -EFBIG }, - { NFS4ERR_NOSPC, -ENOSPC }, - { NFS4ERR_ROFS, -EROFS }, - { NFS4ERR_MLINK, -EMLINK }, - { NFS4ERR_NAMETOOLONG, -ENAMETOOLONG }, - { NFS4ERR_NOTEMPTY, -ENOTEMPTY }, - { NFS4ERR_DQUOT, -EDQUOT }, - { NFS4ERR_STALE, -ESTALE }, - { NFS4ERR_BADHANDLE, -EBADHANDLE }, - { NFS4ERR_BAD_COOKIE, -EBADCOOKIE }, - { NFS4ERR_NOTSUPP, -ENOTSUPP }, - { NFS4ERR_TOOSMALL, -ETOOSMALL }, - { NFS4ERR_SERVERFAULT, -EREMOTEIO }, - { NFS4ERR_BADTYPE, -EBADTYPE }, - { NFS4ERR_LOCKED, -EAGAIN }, - { NFS4ERR_SYMLINK, -ELOOP }, - { NFS4ERR_OP_ILLEGAL, -EOPNOTSUPP }, - { NFS4ERR_DEADLOCK, -EDEADLK }, - { NFS4ERR_NOXATTR, -ENODATA }, - { NFS4ERR_XATTR2BIG, -E2BIG }, - { -1, -EIO } -}; - -/* - * Convert an NFS error code to a local one. - * This one is used jointly by NFSv2 and NFSv3. - */ -static int -nfs4_stat_to_errno(int stat) -{ - int i; - for (i = 0; nfs_errtbl[i].stat != -1; i++) { - if (nfs_errtbl[i].stat == stat) - return nfs_errtbl[i].errno; - } - if (stat <= 10000 || stat > 10100) { - /* The server is looney tunes. */ - return -EREMOTEIO; - } - /* If we cannot translate the error, the recovery routines should - * handle it. - * Note: remaining NFSv4 error codes have values > 10000, so should - * not conflict with native Linux error codes. - */ - return -stat; -} - #ifdef CONFIG_NFS_V4_2 #include "nfs42xdr.c" #endif /* CONFIG_NFS_V4_2 */ @@ -7702,6 +7726,7 @@ const struct rpc_procinfo nfs4_procedures[] = { PROC42(CLONE, enc_clone, dec_clone), PROC42(COPY, enc_copy, dec_copy), PROC42(OFFLOAD_CANCEL, enc_offload_cancel, dec_offload_cancel), + PROC42(OFFLOAD_STATUS, enc_offload_status, dec_offload_status), PROC42(COPY_NOTIFY, enc_copy_notify, dec_copy_notify), PROC(LOOKUPP, enc_lookupp, dec_lookupp), PROC42(LAYOUTERROR, enc_layouterror, dec_layouterror), @@ -7710,6 +7735,7 @@ const struct rpc_procinfo nfs4_procedures[] = { PROC42(LISTXATTRS, enc_listxattrs, dec_listxattrs), PROC42(REMOVEXATTR, enc_removexattr, dec_removexattr), PROC42(READ_PLUS, enc_read_plus, dec_read_plus), + PROC42(ZERO_RANGE, enc_zero_range, dec_zero_range), }; static unsigned int nfs_version4_counts[ARRAY_SIZE(nfs4_procedures)]; diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c index 7600100ba26f..432612d22437 100644 --- a/fs/nfs/nfsroot.c +++ b/fs/nfs/nfsroot.c @@ -175,10 +175,10 @@ static int __init root_nfs_cat(char *dest, const char *src, size_t len = strlen(dest); if (len && dest[len - 1] != ',') - if (strlcat(dest, ",", destlen) > destlen) + if (strlcat(dest, ",", destlen) >= destlen) return -1; - if (strlcat(dest, src, destlen) > destlen) + if (strlcat(dest, src, destlen) >= destlen) return -1; return 0; } diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h index 4e90ca531176..6ce55e8e6b67 100644 --- a/fs/nfs/nfstrace.h +++ b/fs/nfs/nfstrace.h @@ -32,7 +32,8 @@ { NFS_INO_INVALID_BLOCKS, "INVALID_BLOCKS" }, \ { NFS_INO_INVALID_XATTR, "INVALID_XATTR" }, \ { NFS_INO_INVALID_NLINK, "INVALID_NLINK" }, \ - { NFS_INO_INVALID_MODE, "INVALID_MODE" }) + { NFS_INO_INVALID_MODE, "INVALID_MODE" }, \ + { NFS_INO_INVALID_BTIME, "INVALID_BTIME" }) #define nfs_show_nfsi_flags(v) \ __print_flags(v, "|", \ @@ -44,6 +45,23 @@ { BIT(NFS_INO_LAYOUTSTATS), "LAYOUTSTATS" }, \ { BIT(NFS_INO_ODIRECT), "ODIRECT" }) +#define nfs_show_wb_flags(v) \ + __print_flags(v, "|", \ + { BIT(PG_BUSY), "BUSY" }, \ + { BIT(PG_MAPPED), "MAPPED" }, \ + { BIT(PG_FOLIO), "FOLIO" }, \ + { BIT(PG_CLEAN), "CLEAN" }, \ + { BIT(PG_COMMIT_TO_DS), "COMMIT_TO_DS" }, \ + { BIT(PG_INODE_REF), "INODE_REF" }, \ + { BIT(PG_HEADLOCK), "HEADLOCK" }, \ + { BIT(PG_TEARDOWN), "TEARDOWN" }, \ + { BIT(PG_UNLOCKPAGE), "UNLOCKPAGE" }, \ + { BIT(PG_UPTODATE), "UPTODATE" }, \ + { BIT(PG_WB_END), "WB_END" }, \ + { BIT(PG_REMOVE), "REMOVE" }, \ + { BIT(PG_CONTENDED1), "CONTENDED1" }, \ + { BIT(PG_CONTENDED2), "CONTENDED2" }) + DECLARE_EVENT_CLASS(nfs_inode_event, TP_PROTO( const struct inode *inode @@ -56,6 +74,7 @@ DECLARE_EVENT_CLASS(nfs_inode_event, __field(u32, fhandle) __field(u64, fileid) __field(u64, version) + __field(unsigned long, cache_validity) ), TP_fast_assign( @@ -64,14 +83,17 @@ DECLARE_EVENT_CLASS(nfs_inode_event, __entry->fileid = nfsi->fileid; __entry->fhandle = nfs_fhandle_hash(&nfsi->fh); __entry->version = inode_peek_iversion_raw(inode); + __entry->cache_validity = nfsi->cache_validity; ), TP_printk( - "fileid=%02x:%02x:%llu fhandle=0x%08x version=%llu ", + "fileid=%02x:%02x:%llu fhandle=0x%08x version=%llu cache_validity=0x%lx (%s)", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->fileid, __entry->fhandle, - (unsigned long long)__entry->version + (unsigned long long)__entry->version, + __entry->cache_validity, + nfs_show_cache_validity(__entry->cache_validity) ) ); @@ -267,6 +289,7 @@ DECLARE_EVENT_CLASS(nfs_update_size_class, TP_ARGS(inode, new_size)) DEFINE_NFS_UPDATE_SIZE_EVENT(truncate); +DEFINE_NFS_UPDATE_SIZE_EVENT(truncate_folio); DEFINE_NFS_UPDATE_SIZE_EVENT(wcc); DEFINE_NFS_UPDATE_SIZE_EVENT(update); DEFINE_NFS_UPDATE_SIZE_EVENT(grow); @@ -400,6 +423,7 @@ DECLARE_EVENT_CLASS(nfs_lookup_event, __field(unsigned long, flags) __field(dev_t, dev) __field(u64, dir) + __field(u64, fileid) __string(name, dentry->d_name.name) ), @@ -407,16 +431,18 @@ DECLARE_EVENT_CLASS(nfs_lookup_event, __entry->dev = dir->i_sb->s_dev; __entry->dir = NFS_FILEID(dir); __entry->flags = flags; - __assign_str(name, dentry->d_name.name); + __entry->fileid = d_is_negative(dentry) ? 0 : NFS_FILEID(d_inode(dentry)); + __assign_str(name); ), TP_printk( - "flags=0x%lx (%s) name=%02x:%02x:%llu/%s", + "flags=0x%lx (%s) name=%02x:%02x:%llu/%s fileid=%llu", __entry->flags, show_fs_lookup_flags(__entry->flags), MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->dir, - __get_str(name) + __get_str(name), + __entry->fileid ) ); @@ -444,6 +470,7 @@ DECLARE_EVENT_CLASS(nfs_lookup_event_done, __field(unsigned long, flags) __field(dev_t, dev) __field(u64, dir) + __field(u64, fileid) __string(name, dentry->d_name.name) ), @@ -452,17 +479,19 @@ DECLARE_EVENT_CLASS(nfs_lookup_event_done, __entry->dir = NFS_FILEID(dir); __entry->error = error < 0 ? -error : 0; __entry->flags = flags; - __assign_str(name, dentry->d_name.name); + __entry->fileid = d_is_negative(dentry) ? 0 : NFS_FILEID(d_inode(dentry)); + __assign_str(name); ), TP_printk( - "error=%ld (%s) flags=0x%lx (%s) name=%02x:%02x:%llu/%s", + "error=%ld (%s) flags=0x%lx (%s) name=%02x:%02x:%llu/%s fileid=%llu", -__entry->error, show_nfs_status(__entry->error), __entry->flags, show_fs_lookup_flags(__entry->flags), MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->dir, - __get_str(name) + __get_str(name), + __entry->fileid ) ); @@ -506,7 +535,7 @@ TRACE_EVENT(nfs_atomic_open_enter, __entry->dir = NFS_FILEID(dir); __entry->flags = flags; __entry->fmode = (__force unsigned long)ctx->mode; - __assign_str(name, ctx->dentry->d_name.name); + __assign_str(name); ), TP_printk( @@ -545,7 +574,7 @@ TRACE_EVENT(nfs_atomic_open_exit, __entry->dir = NFS_FILEID(dir); __entry->flags = flags; __entry->fmode = (__force unsigned long)ctx->mode; - __assign_str(name, ctx->dentry->d_name.name); + __assign_str(name); ), TP_printk( @@ -581,7 +610,7 @@ TRACE_EVENT(nfs_create_enter, __entry->dev = dir->i_sb->s_dev; __entry->dir = NFS_FILEID(dir); __entry->flags = flags; - __assign_str(name, dentry->d_name.name); + __assign_str(name); ), TP_printk( @@ -617,7 +646,7 @@ TRACE_EVENT(nfs_create_exit, __entry->dev = dir->i_sb->s_dev; __entry->dir = NFS_FILEID(dir); __entry->flags = flags; - __assign_str(name, dentry->d_name.name); + __assign_str(name); ), TP_printk( @@ -648,7 +677,7 @@ DECLARE_EVENT_CLASS(nfs_directory_event, TP_fast_assign( __entry->dev = dir->i_sb->s_dev; __entry->dir = NFS_FILEID(dir); - __assign_str(name, dentry->d_name.name); + __assign_str(name); ), TP_printk( @@ -687,7 +716,7 @@ DECLARE_EVENT_CLASS(nfs_directory_event_done, __entry->dev = dir->i_sb->s_dev; __entry->dir = NFS_FILEID(dir); __entry->error = error < 0 ? -error : 0; - __assign_str(name, dentry->d_name.name); + __assign_str(name); ), TP_printk( @@ -741,7 +770,7 @@ TRACE_EVENT(nfs_link_enter, __entry->dev = inode->i_sb->s_dev; __entry->fileid = NFS_FILEID(inode); __entry->dir = NFS_FILEID(dir); - __assign_str(name, dentry->d_name.name); + __assign_str(name); ), TP_printk( @@ -777,7 +806,7 @@ TRACE_EVENT(nfs_link_exit, __entry->fileid = NFS_FILEID(inode); __entry->dir = NFS_FILEID(dir); __entry->error = error < 0 ? -error : 0; - __assign_str(name, dentry->d_name.name); + __assign_str(name); ), TP_printk( @@ -813,8 +842,8 @@ DECLARE_EVENT_CLASS(nfs_rename_event, __entry->dev = old_dir->i_sb->s_dev; __entry->old_dir = NFS_FILEID(old_dir); __entry->new_dir = NFS_FILEID(new_dir); - __assign_str(old_name, old_dentry->d_name.name); - __assign_str(new_name, new_dentry->d_name.name); + __assign_str(old_name); + __assign_str(new_name); ), TP_printk( @@ -862,8 +891,8 @@ DECLARE_EVENT_CLASS(nfs_rename_event_done, __entry->error = -error; __entry->old_dir = NFS_FILEID(old_dir); __entry->new_dir = NFS_FILEID(new_dir); - __assign_str(old_name, old_dentry->d_name.name); - __assign_str(new_name, new_dentry->d_name.name); + __assign_str(old_name); + __assign_str(new_name); ), TP_printk( @@ -893,7 +922,7 @@ DECLARE_EVENT_CLASS(nfs_rename_event_done, DEFINE_NFS_RENAME_EVENT(nfs_rename_enter); DEFINE_NFS_RENAME_EVENT_DONE(nfs_rename_exit); -DEFINE_NFS_RENAME_EVENT_DONE(nfs_sillyrename_rename); +DEFINE_NFS_RENAME_EVENT_DONE(nfs_async_rename_done); TRACE_EVENT(nfs_sillyrename_unlink, TP_PROTO( @@ -933,10 +962,11 @@ TRACE_EVENT(nfs_sillyrename_unlink, DECLARE_EVENT_CLASS(nfs_folio_event, TP_PROTO( const struct inode *inode, - struct folio *folio + loff_t offset, + size_t count ), - TP_ARGS(inode, folio), + TP_ARGS(inode, offset, count), TP_STRUCT__entry( __field(dev_t, dev) @@ -944,7 +974,7 @@ DECLARE_EVENT_CLASS(nfs_folio_event, __field(u64, fileid) __field(u64, version) __field(loff_t, offset) - __field(u32, count) + __field(size_t, count) ), TP_fast_assign( @@ -954,13 +984,13 @@ DECLARE_EVENT_CLASS(nfs_folio_event, __entry->fileid = nfsi->fileid; __entry->fhandle = nfs_fhandle_hash(&nfsi->fh); __entry->version = inode_peek_iversion_raw(inode); - __entry->offset = folio_file_pos(folio); - __entry->count = nfs_folio_length(folio); + __entry->offset = offset; + __entry->count = count; ), TP_printk( "fileid=%02x:%02x:%llu fhandle=0x%08x version=%llu " - "offset=%lld count=%u", + "offset=%lld count=%zu", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->fileid, __entry->fhandle, __entry->version, @@ -972,18 +1002,20 @@ DECLARE_EVENT_CLASS(nfs_folio_event, DEFINE_EVENT(nfs_folio_event, name, \ TP_PROTO( \ const struct inode *inode, \ - struct folio *folio \ + loff_t offset, \ + size_t count \ ), \ - TP_ARGS(inode, folio)) + TP_ARGS(inode, offset, count)) DECLARE_EVENT_CLASS(nfs_folio_event_done, TP_PROTO( const struct inode *inode, - struct folio *folio, + loff_t offset, + size_t count, int ret ), - TP_ARGS(inode, folio, ret), + TP_ARGS(inode, offset, count, ret), TP_STRUCT__entry( __field(dev_t, dev) @@ -992,7 +1024,7 @@ DECLARE_EVENT_CLASS(nfs_folio_event_done, __field(u64, fileid) __field(u64, version) __field(loff_t, offset) - __field(u32, count) + __field(size_t, count) ), TP_fast_assign( @@ -1002,14 +1034,14 @@ DECLARE_EVENT_CLASS(nfs_folio_event_done, __entry->fileid = nfsi->fileid; __entry->fhandle = nfs_fhandle_hash(&nfsi->fh); __entry->version = inode_peek_iversion_raw(inode); - __entry->offset = folio_file_pos(folio); - __entry->count = nfs_folio_length(folio); + __entry->offset = offset; + __entry->count = count; __entry->ret = ret; ), TP_printk( "fileid=%02x:%02x:%llu fhandle=0x%08x version=%llu " - "offset=%lld count=%u ret=%d", + "offset=%lld count=%zu ret=%d", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->fileid, __entry->fhandle, __entry->version, @@ -1021,10 +1053,11 @@ DECLARE_EVENT_CLASS(nfs_folio_event_done, DEFINE_EVENT(nfs_folio_event_done, name, \ TP_PROTO( \ const struct inode *inode, \ - struct folio *folio, \ + loff_t offset, \ + size_t count, \ int ret \ ), \ - TP_ARGS(inode, folio, ret)) + TP_ARGS(inode, offset, count, ret)) DEFINE_NFS_FOLIO_EVENT(nfs_aop_readpage); DEFINE_NFS_FOLIO_EVENT_DONE(nfs_aop_readpage_done); @@ -1035,6 +1068,73 @@ DEFINE_NFS_FOLIO_EVENT_DONE(nfs_writeback_folio_done); DEFINE_NFS_FOLIO_EVENT(nfs_invalidate_folio); DEFINE_NFS_FOLIO_EVENT_DONE(nfs_launder_folio_done); +DEFINE_NFS_FOLIO_EVENT(nfs_try_to_update_request); +DEFINE_NFS_FOLIO_EVENT_DONE(nfs_try_to_update_request_done); + +DEFINE_NFS_FOLIO_EVENT(nfs_update_folio); +DEFINE_NFS_FOLIO_EVENT_DONE(nfs_update_folio_done); + +DEFINE_NFS_FOLIO_EVENT(nfs_write_begin); +DEFINE_NFS_FOLIO_EVENT_DONE(nfs_write_begin_done); + +DEFINE_NFS_FOLIO_EVENT(nfs_write_end); +DEFINE_NFS_FOLIO_EVENT_DONE(nfs_write_end_done); + +DEFINE_NFS_FOLIO_EVENT(nfs_writepages); +DEFINE_NFS_FOLIO_EVENT_DONE(nfs_writepages_done); + +DECLARE_EVENT_CLASS(nfs_kiocb_event, + TP_PROTO( + const struct kiocb *iocb, + const struct iov_iter *iter + ), + + TP_ARGS(iocb, iter), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(u32, fhandle) + __field(u64, fileid) + __field(u64, version) + __field(loff_t, offset) + __field(size_t, count) + __field(int, flags) + ), + + TP_fast_assign( + const struct inode *inode = file_inode(iocb->ki_filp); + const struct nfs_inode *nfsi = NFS_I(inode); + + __entry->dev = inode->i_sb->s_dev; + __entry->fileid = nfsi->fileid; + __entry->fhandle = nfs_fhandle_hash(&nfsi->fh); + __entry->version = inode_peek_iversion_raw(inode); + __entry->offset = iocb->ki_pos; + __entry->count = iov_iter_count(iter); + __entry->flags = iocb->ki_flags; + ), + + TP_printk( + "fileid=%02x:%02x:%llu fhandle=0x%08x version=%llu offset=%lld count=%zu ki_flags=%s", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->fileid, + __entry->fhandle, __entry->version, + __entry->offset, __entry->count, + __print_flags(__entry->flags, "|", TRACE_IOCB_STRINGS) + ) +); + +#define DEFINE_NFS_KIOCB_EVENT(name) \ + DEFINE_EVENT(nfs_kiocb_event, name, \ + TP_PROTO( \ + const struct kiocb *iocb, \ + const struct iov_iter *iter \ + ), \ + TP_ARGS(iocb, iter)) + +DEFINE_NFS_KIOCB_EVENT(nfs_file_read); +DEFINE_NFS_KIOCB_EVENT(nfs_file_write); + TRACE_EVENT(nfs_aop_readahead, TP_PROTO( const struct inode *inode, @@ -1382,6 +1482,55 @@ TRACE_EVENT(nfs_writeback_done, ) ); +DECLARE_EVENT_CLASS(nfs_page_class, + TP_PROTO( + const struct nfs_page *req + ), + + TP_ARGS(req), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(u32, fhandle) + __field(u64, fileid) + __field(const struct nfs_page *__private, req) + __field(loff_t, offset) + __field(unsigned int, count) + __field(unsigned long, flags) + ), + + TP_fast_assign( + const struct inode *inode = folio_inode(req->wb_folio); + const struct nfs_inode *nfsi = NFS_I(inode); + + __entry->dev = inode->i_sb->s_dev; + __entry->fileid = nfsi->fileid; + __entry->fhandle = nfs_fhandle_hash(&nfsi->fh); + __entry->req = req; + __entry->offset = req_offset(req); + __entry->count = req->wb_bytes; + __entry->flags = req->wb_flags; + ), + + TP_printk( + "fileid=%02x:%02x:%llu fhandle=0x%08x req=%p offset=%lld count=%u flags=%s", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->fileid, __entry->fhandle, + __entry->req, __entry->offset, __entry->count, + nfs_show_wb_flags(__entry->flags) + ) +); + +#define DEFINE_NFS_PAGE_EVENT(name) \ + DEFINE_EVENT(nfs_page_class, name, \ + TP_PROTO( \ + const struct nfs_page *req \ + ), \ + TP_ARGS(req)) + +DEFINE_NFS_PAGE_EVENT(nfs_writepage_setup); +DEFINE_NFS_PAGE_EVENT(nfs_do_writepage); + DECLARE_EVENT_CLASS(nfs_page_error_class, TP_PROTO( const struct inode *inode, @@ -1539,7 +1688,6 @@ DECLARE_EVENT_CLASS(nfs_direct_req_class, __field(u32, fhandle) __field(loff_t, offset) __field(ssize_t, count) - __field(ssize_t, bytes_left) __field(ssize_t, error) __field(int, flags) ), @@ -1554,19 +1702,18 @@ DECLARE_EVENT_CLASS(nfs_direct_req_class, __entry->fhandle = nfs_fhandle_hash(fh); __entry->offset = dreq->io_start; __entry->count = dreq->count; - __entry->bytes_left = dreq->bytes_left; __entry->error = dreq->error; __entry->flags = dreq->flags; ), TP_printk( "error=%zd fileid=%02x:%02x:%llu fhandle=0x%08x " - "offset=%lld count=%zd bytes_left=%zd flags=%s", + "offset=%lld count=%zd flags=%s", __entry->error, MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->fileid, __entry->fhandle, __entry->offset, - __entry->count, __entry->bytes_left, + __entry->count, nfs_show_direct_req_flags(__entry->flags) ) ); @@ -1585,6 +1732,76 @@ DEFINE_NFS_DIRECT_REQ_EVENT(nfs_direct_write_completion); DEFINE_NFS_DIRECT_REQ_EVENT(nfs_direct_write_schedule_iovec); DEFINE_NFS_DIRECT_REQ_EVENT(nfs_direct_write_reschedule_io); +#if IS_ENABLED(CONFIG_NFS_LOCALIO) + +DECLARE_EVENT_CLASS(nfs_local_dio_class, + TP_PROTO( + const struct inode *inode, + loff_t offset, + ssize_t count, + const struct nfs_local_dio *local_dio + ), + TP_ARGS(inode, offset, count, local_dio), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(u64, fileid) + __field(u32, fhandle) + __field(loff_t, offset) + __field(ssize_t, count) + __field(u32, mem_align) + __field(u32, offset_align) + __field(loff_t, start) + __field(ssize_t, start_len) + __field(loff_t, middle) + __field(ssize_t, middle_len) + __field(loff_t, end) + __field(ssize_t, end_len) + ), + TP_fast_assign( + const struct nfs_inode *nfsi = NFS_I(inode); + const struct nfs_fh *fh = &nfsi->fh; + + __entry->dev = inode->i_sb->s_dev; + __entry->fileid = nfsi->fileid; + __entry->fhandle = nfs_fhandle_hash(fh); + __entry->offset = offset; + __entry->count = count; + __entry->mem_align = local_dio->mem_align; + __entry->offset_align = local_dio->offset_align; + __entry->start = offset; + __entry->start_len = local_dio->start_len; + __entry->middle = local_dio->middle_offset; + __entry->middle_len = local_dio->middle_len; + __entry->end = local_dio->end_offset; + __entry->end_len = local_dio->end_len; + ), + TP_printk("fileid=%02x:%02x:%llu fhandle=0x%08x " + "offset=%lld count=%zd " + "mem_align=%u offset_align=%u " + "start=%llu+%zd middle=%llu+%zd end=%llu+%zd", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->fileid, + __entry->fhandle, __entry->offset, __entry->count, + __entry->mem_align, __entry->offset_align, + __entry->start, __entry->start_len, + __entry->middle, __entry->middle_len, + __entry->end, __entry->end_len) +) + +#define DEFINE_NFS_LOCAL_DIO_EVENT(name) \ +DEFINE_EVENT(nfs_local_dio_class, nfs_local_dio_##name, \ + TP_PROTO(const struct inode *inode, \ + loff_t offset, \ + ssize_t count, \ + const struct nfs_local_dio *local_dio),\ + TP_ARGS(inode, offset, count, local_dio)) + +DEFINE_NFS_LOCAL_DIO_EVENT(read); +DEFINE_NFS_LOCAL_DIO_EVENT(write); +DEFINE_NFS_LOCAL_DIO_EVENT(misaligned); + +#endif /* CONFIG_NFS_LOCALIO */ + TRACE_EVENT(nfs_fh_to_dentry, TP_PROTO( const struct super_block *sb, @@ -1632,8 +1849,8 @@ TRACE_EVENT(nfs_mount_assign, ), TP_fast_assign( - __assign_str(option, option); - __assign_str(value, value); + __assign_str(option); + __assign_str(value); ), TP_printk("option %s=%s", @@ -1653,7 +1870,7 @@ TRACE_EVENT(nfs_mount_option, ), TP_fast_assign( - __assign_str(option, param->key); + __assign_str(option); ), TP_printk("option %s", __get_str(option)) @@ -1671,12 +1888,41 @@ TRACE_EVENT(nfs_mount_path, ), TP_fast_assign( - __assign_str(path, path); + __assign_str(path); ), TP_printk("path='%s'", __get_str(path)) ); +TRACE_EVENT(nfs_local_open_fh, + TP_PROTO( + const struct nfs_fh *fh, + fmode_t fmode, + int error + ), + + TP_ARGS(fh, fmode, error), + + TP_STRUCT__entry( + __field(int, error) + __field(u32, fhandle) + __field(unsigned int, fmode) + ), + + TP_fast_assign( + __entry->error = error; + __entry->fhandle = nfs_fhandle_hash(fh); + __entry->fmode = (__force unsigned int)fmode; + ), + + TP_printk( + "fhandle=0x%08x mode=%s result=%d", + __entry->fhandle, + show_fs_fmode_flags(__entry->fmode), + __entry->error + ) +); + DECLARE_EVENT_CLASS(nfs_xdr_event, TP_PROTO( const struct xdr_stream *xdr, @@ -1706,9 +1952,8 @@ DECLARE_EVENT_CLASS(nfs_xdr_event, __entry->xid = be32_to_cpu(rqstp->rq_xid); __entry->version = task->tk_client->cl_vers; __entry->error = error; - __assign_str(program, - task->tk_client->cl_program->name); - __assign_str(procedure, task->tk_msg.rpc_proc->p_name); + __assign_str(program); + __assign_str(procedure); ), TP_printk(SUNRPC_TRACE_TASK_SPECIFIER diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 6efb5068c116..6e69ce43a13f 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -188,102 +188,6 @@ nfs_async_iocounter_wait(struct rpc_task *task, struct nfs_lock_context *l_ctx) EXPORT_SYMBOL_GPL(nfs_async_iocounter_wait); /* - * nfs_page_lock_head_request - page lock the head of the page group - * @req: any member of the page group - */ -struct nfs_page * -nfs_page_group_lock_head(struct nfs_page *req) -{ - struct nfs_page *head = req->wb_head; - - while (!nfs_lock_request(head)) { - int ret = nfs_wait_on_request(head); - if (ret < 0) - return ERR_PTR(ret); - } - if (head != req) - kref_get(&head->wb_kref); - return head; -} - -/* - * nfs_unroll_locks - unlock all newly locked reqs and wait on @req - * @head: head request of page group, must be holding head lock - * @req: request that couldn't lock and needs to wait on the req bit lock - * - * This is a helper function for nfs_lock_and_join_requests - * returns 0 on success, < 0 on error. - */ -static void -nfs_unroll_locks(struct nfs_page *head, struct nfs_page *req) -{ - struct nfs_page *tmp; - - /* relinquish all the locks successfully grabbed this run */ - for (tmp = head->wb_this_page ; tmp != req; tmp = tmp->wb_this_page) { - if (!kref_read(&tmp->wb_kref)) - continue; - nfs_unlock_and_release_request(tmp); - } -} - -/* - * nfs_page_group_lock_subreq - try to lock a subrequest - * @head: head request of page group - * @subreq: request to lock - * - * This is a helper function for nfs_lock_and_join_requests which - * must be called with the head request and page group both locked. - * On error, it returns with the page group unlocked. - */ -static int -nfs_page_group_lock_subreq(struct nfs_page *head, struct nfs_page *subreq) -{ - int ret; - - if (!kref_get_unless_zero(&subreq->wb_kref)) - return 0; - while (!nfs_lock_request(subreq)) { - nfs_page_group_unlock(head); - ret = nfs_wait_on_request(subreq); - if (!ret) - ret = nfs_page_group_lock(head); - if (ret < 0) { - nfs_unroll_locks(head, subreq); - nfs_release_request(subreq); - return ret; - } - } - return 0; -} - -/* - * nfs_page_group_lock_subrequests - try to lock the subrequests - * @head: head request of page group - * - * This is a helper function for nfs_lock_and_join_requests which - * must be called with the head request locked. - */ -int nfs_page_group_lock_subrequests(struct nfs_page *head) -{ - struct nfs_page *subreq; - int ret; - - ret = nfs_page_group_lock(head); - if (ret < 0) - return ret; - /* lock each request in the page group */ - for (subreq = head->wb_this_page; subreq != head; - subreq = subreq->wb_this_page) { - ret = nfs_page_group_lock_subreq(head, subreq); - if (ret < 0) - return ret; - } - nfs_page_group_unlock(head); - return 0; -} - -/* * nfs_page_set_headlock - set the request PG_HEADLOCK * @req: request that is to be locked * @@ -349,13 +253,14 @@ nfs_page_group_unlock(struct nfs_page *req) nfs_page_clear_headlock(req); } -/* - * nfs_page_group_sync_on_bit_locked +/** + * nfs_page_group_sync_on_bit_locked - Test if all requests have @bit set + * @req: request in page group + * @bit: PG_* bit that is used to sync page group * * must be called with page group lock held */ -static bool -nfs_page_group_sync_on_bit_locked(struct nfs_page *req, unsigned int bit) +bool nfs_page_group_sync_on_bit_locked(struct nfs_page *req, unsigned int bit) { struct nfs_page *head = req->wb_head; struct nfs_page *tmp; @@ -569,7 +474,7 @@ struct nfs_page *nfs_page_create_from_folio(struct nfs_open_context *ctx, if (IS_ERR(l_ctx)) return ERR_CAST(l_ctx); - ret = nfs_page_create(l_ctx, offset, folio_index(folio), offset, count); + ret = nfs_page_create(l_ctx, offset, folio->index, offset, count); if (!IS_ERR(ret)) { nfs_page_assign_folio(ret, folio); nfs_page_group_init(ret, NULL); @@ -694,25 +599,6 @@ void nfs_release_request(struct nfs_page *req) } EXPORT_SYMBOL_GPL(nfs_release_request); -/** - * nfs_wait_on_request - Wait for a request to complete. - * @req: request to wait upon. - * - * Interruptible by fatal signals only. - * The user is responsible for holding a count on the request. - */ -int -nfs_wait_on_request(struct nfs_page *req) -{ - if (!test_bit(PG_BUSY, &req->wb_flags)) - return 0; - set_bit(PG_CONTENDED2, &req->wb_flags); - smp_mb__after_atomic(); - return wait_on_bit_io(&req->wb_flags, PG_BUSY, - TASK_UNINTERRUPTIBLE); -} -EXPORT_SYMBOL_GPL(nfs_wait_on_request); - /* * nfs_generic_pg_test - determine if requests can be coalesced * @desc: pointer to descriptor @@ -846,7 +732,8 @@ static void nfs_pgio_prepare(struct rpc_task *task, void *calldata) int nfs_initiate_pgio(struct rpc_clnt *clnt, struct nfs_pgio_header *hdr, const struct cred *cred, const struct nfs_rpc_ops *rpc_ops, - const struct rpc_call_ops *call_ops, int how, int flags) + const struct rpc_call_ops *call_ops, int how, int flags, + struct nfsd_file *localio) { struct rpc_task *task; struct rpc_message msg = { @@ -876,6 +763,10 @@ int nfs_initiate_pgio(struct rpc_clnt *clnt, struct nfs_pgio_header *hdr, hdr->args.count, (unsigned long long)hdr->args.offset); + if (localio) + return nfs_local_doio(NFS_SERVER(hdr->inode)->nfs_client, + localio, hdr, call_ops); + task = rpc_run_task(&task_setup_data); if (IS_ERR(task)) return PTR_ERR(task); @@ -1068,6 +959,13 @@ static int nfs_generic_pg_pgios(struct nfs_pageio_descriptor *desc) nfs_pgheader_init(desc, hdr, nfs_pgio_header_free); ret = nfs_generic_pgio(desc, hdr); if (ret == 0) { + struct nfs_client *clp = NFS_SERVER(hdr->inode)->nfs_client; + + struct nfsd_file *localio = + nfs_local_open_fh(clp, hdr->cred, hdr->args.fh, + &hdr->args.context->nfl, + hdr->args.context->mode); + if (NFS_SERVER(hdr->inode)->nfs_client->cl_minorversion) task_flags = RPC_TASK_MOVEABLE; ret = nfs_initiate_pgio(NFS_CLIENT(hdr->inode), @@ -1076,7 +974,8 @@ static int nfs_generic_pg_pgios(struct nfs_pageio_descriptor *desc) NFS_PROTO(hdr->inode), desc->pg_rpc_callops, desc->pg_ioflags, - RPC_TASK_CRED_NOREF | task_flags); + RPC_TASK_CRED_NOREF | task_flags, + localio); } return ret; } @@ -1545,6 +1444,11 @@ void nfs_pageio_cond_complete(struct nfs_pageio_descriptor *desc, pgoff_t index) continue; } else if (index == prev->wb_index + 1) continue; + /* + * We will submit more requests after these. Indicate + * this to the underlying layers. + */ + desc->pg_moreio = 1; nfs_pageio_complete(desc); break; } diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 306cba0b9e69..f157d43d1312 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -61,6 +61,7 @@ static void pnfs_free_returned_lsegs(struct pnfs_layout_hdr *lo, u32 seq); static bool pnfs_lseg_dec_and_remove_zero(struct pnfs_layout_segment *lseg, struct list_head *tmp_list); +static int pnfs_layout_return_on_reboot(struct pnfs_layout_hdr *lo); /* Return the registered pnfs layout driver module matching given id */ static struct pnfs_layoutdriver_type * @@ -305,7 +306,6 @@ void pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo) { struct inode *inode; - unsigned long i_state; if (!lo) return; @@ -316,12 +316,11 @@ pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo) if (!list_empty(&lo->plh_segs)) WARN_ONCE(1, "NFS: BUG unfreed layout segments.\n"); pnfs_detach_layout_hdr(lo); - i_state = inode->i_state; + /* Notify pnfs_destroy_layout_final() that we're done */ + if (inode_state_read(inode) & (I_FREEING | I_CLEAR)) + wake_up_var_locked(lo, &inode->i_lock); spin_unlock(&inode->i_lock); pnfs_free_layout_hdr(lo); - /* Notify pnfs_destroy_layout_final() that we're done */ - if (i_state & (I_FREEING | I_CLEAR)) - wake_up_var(lo); } } @@ -476,6 +475,18 @@ pnfs_mark_layout_stateid_invalid(struct pnfs_layout_hdr *lo, return !list_empty(&lo->plh_segs); } +static int pnfs_mark_layout_stateid_return(struct pnfs_layout_hdr *lo, + struct list_head *lseg_list, + enum pnfs_iomode iomode, u32 seq) +{ + struct pnfs_layout_range range = { + .iomode = iomode, + .length = NFS4_MAX_UINT64, + }; + + return pnfs_mark_matching_lsegs_return(lo, lseg_list, &range, seq); +} + static int pnfs_iomode_to_fail_bit(u32 iomode) { @@ -732,6 +743,14 @@ pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo, return remaining; } +static void pnfs_reset_return_info(struct pnfs_layout_hdr *lo) +{ + struct pnfs_layout_segment *lseg; + + list_for_each_entry(lseg, &lo->plh_return_segs, pls_list) + pnfs_set_plh_return_info(lo, lseg->pls_range.iomode, 0); +} + static void pnfs_free_returned_lsegs(struct pnfs_layout_hdr *lo, struct list_head *free_me, @@ -788,23 +807,17 @@ void pnfs_destroy_layout(struct nfs_inode *nfsi) } EXPORT_SYMBOL_GPL(pnfs_destroy_layout); -static bool pnfs_layout_removed(struct nfs_inode *nfsi, - struct pnfs_layout_hdr *lo) -{ - bool ret; - - spin_lock(&nfsi->vfs_inode.i_lock); - ret = nfsi->layout != lo; - spin_unlock(&nfsi->vfs_inode.i_lock); - return ret; -} - void pnfs_destroy_layout_final(struct nfs_inode *nfsi) { struct pnfs_layout_hdr *lo = __pnfs_destroy_layout(nfsi); + struct inode *inode = &nfsi->vfs_inode; - if (lo) - wait_var_event(lo, pnfs_layout_removed(nfsi, lo)); + if (lo) { + spin_lock(&inode->i_lock); + wait_var_event_spinlock(lo, nfsi->layout != lo, + &inode->i_lock); + spin_unlock(&inode->i_lock); + } } static bool @@ -846,8 +859,6 @@ pnfs_layout_bulk_destroy_byserver_locked(struct nfs_client *clp, break; inode = pnfs_grab_inode_layout_hdr(lo); if (inode != NULL) { - if (test_and_clear_bit(NFS_LAYOUT_HASHED, &lo->plh_flags)) - list_del_rcu(&lo->plh_layouts); if (pnfs_layout_add_bulk_destroy_list(inode, layout_list)) continue; @@ -868,7 +879,7 @@ pnfs_layout_bulk_destroy_byserver_locked(struct nfs_client *clp, static int pnfs_layout_free_bulk_destroy_list(struct list_head *layout_list, - bool is_bulk_recall) + enum pnfs_layout_destroy_mode mode) { struct pnfs_layout_hdr *lo; struct inode *inode; @@ -886,8 +897,11 @@ pnfs_layout_free_bulk_destroy_list(struct list_head *layout_list, spin_lock(&inode->i_lock); list_del_init(&lo->plh_bulk_destroy); - if (pnfs_mark_layout_stateid_invalid(lo, &lseg_list)) { - if (is_bulk_recall) + if (mode == PNFS_LAYOUT_FILE_BULK_RETURN) { + pnfs_mark_layout_stateid_return(lo, &lseg_list, + IOMODE_ANY, 0); + } else if (pnfs_mark_layout_stateid_invalid(lo, &lseg_list)) { + if (mode == PNFS_LAYOUT_BULK_RETURN) set_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags); ret = -EAGAIN; } @@ -901,10 +915,8 @@ pnfs_layout_free_bulk_destroy_list(struct list_head *layout_list, return ret; } -int -pnfs_destroy_layouts_byfsid(struct nfs_client *clp, - struct nfs_fsid *fsid, - bool is_recall) +int pnfs_layout_destroy_byfsid(struct nfs_client *clp, struct nfs_fsid *fsid, + enum pnfs_layout_destroy_mode mode) { struct nfs_server *server; LIST_HEAD(layout_list); @@ -923,33 +935,40 @@ restart: rcu_read_unlock(); spin_unlock(&clp->cl_lock); - if (list_empty(&layout_list)) - return 0; - return pnfs_layout_free_bulk_destroy_list(&layout_list, is_recall); + return pnfs_layout_free_bulk_destroy_list(&layout_list, mode); } -int -pnfs_destroy_layouts_byclid(struct nfs_client *clp, - bool is_recall) +static void pnfs_layout_build_destroy_list_byclient(struct nfs_client *clp, + struct list_head *list) { struct nfs_server *server; - LIST_HEAD(layout_list); spin_lock(&clp->cl_lock); rcu_read_lock(); restart: list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { - if (pnfs_layout_bulk_destroy_byserver_locked(clp, - server, - &layout_list) != 0) + if (pnfs_layout_bulk_destroy_byserver_locked(clp, server, + list) != 0) goto restart; } rcu_read_unlock(); spin_unlock(&clp->cl_lock); +} - if (list_empty(&layout_list)) - return 0; - return pnfs_layout_free_bulk_destroy_list(&layout_list, is_recall); +static int pnfs_layout_do_destroy_byclid(struct nfs_client *clp, + struct list_head *list, + enum pnfs_layout_destroy_mode mode) +{ + pnfs_layout_build_destroy_list_byclient(clp, list); + return pnfs_layout_free_bulk_destroy_list(list, mode); +} + +int pnfs_layout_destroy_byclid(struct nfs_client *clp, + enum pnfs_layout_destroy_mode mode) +{ + LIST_HEAD(layout_list); + + return pnfs_layout_do_destroy_byclid(clp, &layout_list, mode); } /* @@ -962,7 +981,68 @@ pnfs_destroy_all_layouts(struct nfs_client *clp) nfs4_deviceid_mark_client_invalid(clp); nfs4_deviceid_purge_client(clp); - pnfs_destroy_layouts_byclid(clp, false); + pnfs_layout_destroy_byclid(clp, PNFS_LAYOUT_INVALIDATE); +} + +static void pnfs_layout_build_recover_list_byclient(struct nfs_client *clp, + struct list_head *list) +{ + struct nfs_server *server; + + spin_lock(&clp->cl_lock); + rcu_read_lock(); +restart: + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { + if (!(server->caps & NFS_CAP_REBOOT_LAYOUTRETURN)) + continue; + if (pnfs_layout_bulk_destroy_byserver_locked(clp, server, + list) != 0) + goto restart; + } + rcu_read_unlock(); + spin_unlock(&clp->cl_lock); +} + +static int pnfs_layout_bulk_list_reboot(struct list_head *list) +{ + struct pnfs_layout_hdr *lo; + struct nfs_server *server; + int ret; + + list_for_each_entry(lo, list, plh_bulk_destroy) { + server = NFS_SERVER(lo->plh_inode); + ret = pnfs_layout_return_on_reboot(lo); + switch (ret) { + case 0: + continue; + case -NFS4ERR_BAD_STATEID: + server->caps &= ~NFS_CAP_REBOOT_LAYOUTRETURN; + break; + case -NFS4ERR_NO_GRACE: + break; + default: + goto err; + } + break; + } + return 0; +err: + return ret; +} + +int pnfs_layout_handle_reboot(struct nfs_client *clp) +{ + LIST_HEAD(list); + int ret = 0, ret2; + + pnfs_layout_build_recover_list_byclient(clp, &list); + if (!list_empty(&list)) + ret = pnfs_layout_bulk_list_reboot(&list); + ret2 = pnfs_layout_do_destroy_byclid(clp, &list, + PNFS_LAYOUT_INVALIDATE); + if (!ret) + ret = ret2; + return (ret == 0) ? 0 : -EAGAIN; } static void @@ -1163,6 +1243,33 @@ static void pnfs_clear_layoutcommit(struct inode *inode, } } +static void +pnfs_layoutreturn_retry_later_locked(struct pnfs_layout_hdr *lo, + const nfs4_stateid *arg_stateid, + const struct pnfs_layout_range *range, + struct list_head *freeme) +{ + if (pnfs_layout_is_valid(lo) && + nfs4_stateid_match_other(&lo->plh_stateid, arg_stateid)) + pnfs_reset_return_info(lo); + else + pnfs_mark_layout_stateid_invalid(lo, freeme); + pnfs_clear_layoutreturn_waitbit(lo); +} + +void pnfs_layoutreturn_retry_later(struct pnfs_layout_hdr *lo, + const nfs4_stateid *arg_stateid, + const struct pnfs_layout_range *range) +{ + struct inode *inode = lo->plh_inode; + LIST_HEAD(freeme); + + spin_lock(&inode->i_lock); + pnfs_layoutreturn_retry_later_locked(lo, arg_stateid, range, &freeme); + spin_unlock(&inode->i_lock); + pnfs_free_lseg_list(&freeme); +} + void pnfs_layoutreturn_free_lsegs(struct pnfs_layout_hdr *lo, const nfs4_stateid *arg_stateid, const struct pnfs_layout_range *range, @@ -1172,15 +1279,15 @@ void pnfs_layoutreturn_free_lsegs(struct pnfs_layout_hdr *lo, LIST_HEAD(freeme); spin_lock(&inode->i_lock); - if (!pnfs_layout_is_valid(lo) || - !nfs4_stateid_match_other(&lo->plh_stateid, arg_stateid)) + if (!nfs4_stateid_match_other(&lo->plh_stateid, arg_stateid)) goto out_unlock; - if (stateid) { + if (stateid && pnfs_layout_is_valid(lo)) { u32 seq = be32_to_cpu(arg_stateid->seqid); pnfs_mark_matching_lsegs_invalid(lo, &freeme, range, seq); pnfs_free_returned_lsegs(lo, &freeme, range, seq); pnfs_set_layout_stateid(lo, stateid, NULL, true); + pnfs_reset_return_info(lo); } else pnfs_mark_layout_stateid_invalid(lo, &freeme); out_unlock: @@ -1197,7 +1304,7 @@ pnfs_prepare_layoutreturn(struct pnfs_layout_hdr *lo, enum pnfs_iomode *iomode) { /* Serialise LAYOUTGET/LAYOUTRETURN */ - if (atomic_read(&lo->plh_outstanding) != 0) + if (atomic_read(&lo->plh_outstanding) != 0 && lo->plh_return_seq == 0) return false; if (test_and_set_bit(NFS_LAYOUT_RETURN_LOCK, &lo->plh_flags)) return false; @@ -1239,7 +1346,7 @@ pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo, const nfs4_stateid *stateid, const struct cred **pcred, enum pnfs_iomode iomode, - bool sync) + unsigned int flags) { struct inode *ino = lo->plh_inode; struct pnfs_layoutdriver_type *ld = NFS_SERVER(ino)->pnfs_curr_ld; @@ -1266,33 +1373,21 @@ pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo, if (ld->prepare_layoutreturn) ld->prepare_layoutreturn(&lrp->args); - status = nfs4_proc_layoutreturn(lrp, sync); + status = nfs4_proc_layoutreturn(lrp, flags); out: dprintk("<-- %s status: %d\n", __func__, status); return status; } -static bool -pnfs_layout_segments_returnable(struct pnfs_layout_hdr *lo, - enum pnfs_iomode iomode, - u32 seq) -{ - struct pnfs_layout_range recall_range = { - .length = NFS4_MAX_UINT64, - .iomode = iomode, - }; - return pnfs_mark_matching_lsegs_return(lo, &lo->plh_return_segs, - &recall_range, seq) != -EBUSY; -} - /* Return true if layoutreturn is needed */ static bool pnfs_layout_need_return(struct pnfs_layout_hdr *lo) { if (!test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags)) return false; - return pnfs_layout_segments_returnable(lo, lo->plh_return_iomode, - lo->plh_return_seq); + return pnfs_mark_layout_stateid_return(lo, &lo->plh_return_segs, + lo->plh_return_iomode, + lo->plh_return_seq) != EBUSY; } static void pnfs_layoutreturn_before_put_layout_hdr(struct pnfs_layout_hdr *lo) @@ -1312,7 +1407,8 @@ static void pnfs_layoutreturn_before_put_layout_hdr(struct pnfs_layout_hdr *lo) spin_unlock(&inode->i_lock); if (send) { /* Send an async layoutreturn so we dont deadlock */ - pnfs_send_layoutreturn(lo, &stateid, &cred, iomode, false); + pnfs_send_layoutreturn(lo, &stateid, &cred, iomode, + PNFS_FL_LAYOUTRETURN_ASYNC); } } else spin_unlock(&inode->i_lock); @@ -1379,7 +1475,8 @@ _pnfs_return_layout(struct inode *ino) send = pnfs_prepare_layoutreturn(lo, &stateid, &cred, NULL); spin_unlock(&ino->i_lock); if (send) - status = pnfs_send_layoutreturn(lo, &stateid, &cred, IOMODE_ANY, true); + status = pnfs_send_layoutreturn(lo, &stateid, &cred, IOMODE_ANY, + 0); out_wait_layoutreturn: wait_on_bit(&lo->plh_flags, NFS_LAYOUT_RETURN, TASK_UNINTERRUPTIBLE); out_put_layout_hdr: @@ -1417,6 +1514,24 @@ pnfs_commit_and_return_layout(struct inode *inode) return ret; } +static int pnfs_layout_return_on_reboot(struct pnfs_layout_hdr *lo) +{ + struct inode *inode = lo->plh_inode; + const struct cred *cred; + + spin_lock(&inode->i_lock); + if (!pnfs_layout_is_valid(lo)) { + spin_unlock(&inode->i_lock); + return 0; + } + cred = get_cred(lo->plh_lc_cred); + pnfs_get_layout_hdr(lo); + spin_unlock(&inode->i_lock); + + return pnfs_send_layoutreturn(lo, &zero_stateid, &cred, IOMODE_ANY, + PNFS_FL_LAYOUTRETURN_PRIVILEGED); +} + bool pnfs_roc(struct inode *ino, struct nfs4_layoutreturn_args *args, struct nfs4_layoutreturn_res *res, @@ -1520,7 +1635,7 @@ out_noroc: return true; } if (layoutreturn) - pnfs_send_layoutreturn(lo, &stateid, &lc_cred, iomode, true); + pnfs_send_layoutreturn(lo, &stateid, &lc_cred, iomode, 0); pnfs_put_layout_hdr(lo); return false; } @@ -1542,6 +1657,18 @@ int pnfs_roc_done(struct rpc_task *task, struct nfs4_layoutreturn_args **argpp, /* Was there an RPC level error? If not, retry */ if (task->tk_rpc_status == 0) break; + /* + * Is there a fatal network level error? + * If so release the layout, but flag the error. + */ + if ((task->tk_rpc_status == -ENETDOWN || + task->tk_rpc_status == -ENETUNREACH) && + task->tk_flags & RPC_TASK_NETUNREACH_FATAL) { + *ret = 0; + (*respp)->lrs_present = 0; + retval = -EIO; + break; + } /* If the call was not sent, let caller handle it */ if (!RPC_WAS_SENT(task)) return 0; @@ -1570,22 +1697,24 @@ int pnfs_roc_done(struct rpc_task *task, struct nfs4_layoutreturn_args **argpp, } void pnfs_roc_release(struct nfs4_layoutreturn_args *args, - struct nfs4_layoutreturn_res *res, - int ret) + struct nfs4_layoutreturn_res *res, int ret) { struct pnfs_layout_hdr *lo = args->layout; struct inode *inode = args->inode; const nfs4_stateid *res_stateid = NULL; struct nfs4_xdr_opaque_data *ld_private = args->ld_private; + LIST_HEAD(freeme); switch (ret) { + case -NFS4ERR_BADSESSION: + case -NFS4ERR_DEADSESSION: + case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: case -NFS4ERR_NOMATCHING_LAYOUT: spin_lock(&inode->i_lock); - if (pnfs_layout_is_valid(lo) && - nfs4_stateid_match_other(&args->stateid, &lo->plh_stateid)) - pnfs_set_plh_return_info(lo, args->range.iomode, 0); - pnfs_clear_layoutreturn_waitbit(lo); + pnfs_layoutreturn_retry_later_locked(lo, &args->stateid, + &args->range, &freeme); spin_unlock(&inode->i_lock); + pnfs_free_lseg_list(&freeme); break; case 0: if (res->lrs_present) @@ -1922,8 +2051,10 @@ static void nfs_layoutget_begin(struct pnfs_layout_hdr *lo) static void nfs_layoutget_end(struct pnfs_layout_hdr *lo) { if (atomic_dec_and_test(&lo->plh_outstanding) && - test_and_clear_bit(NFS_LAYOUT_DRAIN, &lo->plh_flags)) + test_and_clear_bit(NFS_LAYOUT_DRAIN, &lo->plh_flags)) { + smp_mb__after_atomic(); wake_up_bit(&lo->plh_flags, NFS_LAYOUT_DRAIN); + } } static bool pnfs_is_first_layoutget(struct pnfs_layout_hdr *lo) @@ -1980,7 +2111,9 @@ pnfs_update_layout(struct inode *ino, struct pnfs_layout_segment *lseg = NULL; struct nfs4_layoutget *lgp; nfs4_stateid stateid; - long timeout = 0; + struct nfs4_exception exception = { + .inode = ino, + }; unsigned long giveup = jiffies + (clp->cl_lease_time << 1); bool first; @@ -1997,6 +2130,14 @@ pnfs_update_layout(struct inode *ino, } lookup_again: + if (!nfs4_valid_open_stateid(ctx->state)) { + trace_pnfs_update_layout(ino, pos, count, + iomode, lo, lseg, + PNFS_UPDATE_LAYOUT_INVALID_OPEN); + lseg = ERR_PTR(-EIO); + goto out; + } + lseg = ERR_PTR(nfs4_client_recover_expired_lease(clp)); if (IS_ERR(lseg)) goto out; @@ -2144,7 +2285,7 @@ lookup_again: lgp->lo = lo; pnfs_get_layout_hdr(lo); - lseg = nfs4_proc_layoutget(lgp, &timeout); + lseg = nfs4_proc_layoutget(lgp, &exception); trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg, PNFS_UPDATE_LAYOUT_SEND_LAYOUTGET); nfs_layoutget_end(lo); @@ -2171,6 +2312,8 @@ lookup_again: goto out_put_layout_hdr; } if (lseg) { + if (!exception.retry) + goto out_put_layout_hdr; if (first) pnfs_clear_first_layoutget(lo); trace_pnfs_update_layout(ino, pos, count, @@ -2554,7 +2697,8 @@ pnfs_mark_layout_for_return(struct inode *inode, return_now = pnfs_prepare_layoutreturn(lo, &stateid, &cred, &iomode); spin_unlock(&inode->i_lock); if (return_now) - pnfs_send_layoutreturn(lo, &stateid, &cred, iomode, false); + pnfs_send_layoutreturn(lo, &stateid, &cred, iomode, + PNFS_FL_LAYOUTRETURN_ASYNC); } else { spin_unlock(&inode->i_lock); nfs_commit_inode(inode, 0); @@ -2634,31 +2778,45 @@ pnfs_should_return_unused_layout(struct pnfs_layout_hdr *lo, return mode == 0; } -static int -pnfs_layout_return_unused_byserver(struct nfs_server *server, void *data) +static int pnfs_layout_return_unused_byserver(struct nfs_server *server, + void *data) { const struct pnfs_layout_range *range = data; + const struct cred *cred; struct pnfs_layout_hdr *lo; struct inode *inode; + nfs4_stateid stateid; + enum pnfs_iomode iomode; + restart: rcu_read_lock(); list_for_each_entry_rcu(lo, &server->layouts, plh_layouts) { - if (!pnfs_layout_can_be_returned(lo) || + inode = lo->plh_inode; + if (!inode || !pnfs_layout_can_be_returned(lo) || test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags)) continue; - inode = lo->plh_inode; spin_lock(&inode->i_lock); - if (!pnfs_should_return_unused_layout(lo, range)) { + if (!lo->plh_inode || + !pnfs_should_return_unused_layout(lo, range)) { spin_unlock(&inode->i_lock); continue; } + pnfs_get_layout_hdr(lo); + pnfs_set_plh_return_info(lo, range->iomode, 0); + if (pnfs_mark_matching_lsegs_return(lo, &lo->plh_return_segs, + range, 0) != 0 || + !pnfs_prepare_layoutreturn(lo, &stateid, &cred, &iomode)) { + spin_unlock(&inode->i_lock); + rcu_read_unlock(); + pnfs_put_layout_hdr(lo); + cond_resched(); + goto restart; + } spin_unlock(&inode->i_lock); - inode = pnfs_grab_inode_layout_hdr(lo); - if (!inode) - continue; rcu_read_unlock(); - pnfs_mark_layout_for_return(inode, range); - iput(inode); + pnfs_send_layoutreturn(lo, &stateid, &cred, iomode, + PNFS_FL_LAYOUTRETURN_ASYNC); + pnfs_put_layout_hdr(lo); cond_resched(); goto restart; } @@ -2680,43 +2838,34 @@ pnfs_layout_return_unused_byclid(struct nfs_client *clp, &range); } +/* Check if we have we have a valid layout but if there isn't an intersection + * between the request and the pgio->pg_lseg, put this pgio->pg_lseg away. + */ void -pnfs_generic_pg_check_layout(struct nfs_pageio_descriptor *pgio) +pnfs_generic_pg_check_layout(struct nfs_pageio_descriptor *pgio, + struct nfs_page *req) { if (pgio->pg_lseg == NULL || - test_bit(NFS_LSEG_VALID, &pgio->pg_lseg->pls_flags)) + (test_bit(NFS_LSEG_VALID, &pgio->pg_lseg->pls_flags) && + pnfs_lseg_request_intersecting(pgio->pg_lseg, req))) return; pnfs_put_lseg(pgio->pg_lseg); pgio->pg_lseg = NULL; } EXPORT_SYMBOL_GPL(pnfs_generic_pg_check_layout); -/* - * Check for any intersection between the request and the pgio->pg_lseg, - * and if none, put this pgio->pg_lseg away. - */ -void -pnfs_generic_pg_check_range(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) -{ - if (pgio->pg_lseg && !pnfs_lseg_request_intersecting(pgio->pg_lseg, req)) { - pnfs_put_lseg(pgio->pg_lseg); - pgio->pg_lseg = NULL; - } -} -EXPORT_SYMBOL_GPL(pnfs_generic_pg_check_range); - void pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) { u64 rd_size; - pnfs_generic_pg_check_layout(pgio); - pnfs_generic_pg_check_range(pgio, req); + pnfs_generic_pg_check_layout(pgio, req); if (pgio->pg_lseg == NULL) { if (pgio->pg_dreq == NULL) rd_size = i_size_read(pgio->pg_inode) - req_offset(req); else - rd_size = nfs_dreq_bytes_left(pgio->pg_dreq); + rd_size = nfs_dreq_bytes_left(pgio->pg_dreq, + req_offset(req)); pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, nfs_req_openctx(req), @@ -2740,8 +2889,7 @@ void pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *req, u64 wb_size) { - pnfs_generic_pg_check_layout(pgio); - pnfs_generic_pg_check_range(pgio, req); + pnfs_generic_pg_check_layout(pgio, req); if (pgio->pg_lseg == NULL) { pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, nfs_req_openctx(req), @@ -3184,6 +3332,7 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync) struct nfs_inode *nfsi = NFS_I(inode); loff_t end_pos; int status; + bool mark_as_dirty = false; if (!pnfs_layoutcommit_outstanding(inode)) return 0; @@ -3235,19 +3384,23 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync) if (ld->prepare_layoutcommit) { status = ld->prepare_layoutcommit(&data->args); if (status) { - put_cred(data->cred); + if (status != -ENOSPC) + put_cred(data->cred); spin_lock(&inode->i_lock); set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags); if (end_pos > nfsi->layout->plh_lwb) nfsi->layout->plh_lwb = end_pos; - goto out_unlock; + if (status != -ENOSPC) + goto out_unlock; + spin_unlock(&inode->i_lock); + mark_as_dirty = true; } } status = nfs4_proc_layoutcommit(data, sync); out: - if (status) + if (status || mark_as_dirty) mark_inode_dirty_sync(inode); dprintk("<-- %s status %d\n", __func__, status); return status; diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index d886c8226d8f..91ff877185c8 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -35,6 +35,7 @@ #include <linux/nfs_page.h> #include <linux/workqueue.h> +struct nfs4_exception; struct nfs4_opendata; enum { @@ -59,6 +60,7 @@ struct nfs4_pnfs_ds { struct list_head ds_node; /* nfs4_pnfs_dev_hlist dev_dslist */ char *ds_remotestr; /* comma sep list of addrs */ struct list_head ds_addrs; + const struct net *ds_net; struct nfs_client *ds_clp; refcount_t ds_count; unsigned long ds_state; @@ -117,6 +119,12 @@ enum layoutdriver_policy_flags { PNFS_LAYOUTGET_ON_OPEN = 1 << 3, }; +enum pnfs_layout_destroy_mode { + PNFS_LAYOUT_INVALIDATE = 0, + PNFS_LAYOUT_BULK_RETURN, + PNFS_LAYOUT_FILE_BULK_RETURN, +}; + struct nfs4_deviceid_node; /* Per-layout driver specific registration structure */ @@ -126,7 +134,6 @@ struct pnfs_layoutdriver_type { const char *name; struct module *owner; unsigned flags; - unsigned max_deviceinfo_size; unsigned max_layoutget_response; int (*set_layoutdriver) (struct nfs_server *, const struct nfs_fh *); @@ -192,8 +199,6 @@ struct pnfs_commit_ops { int max); void (*recover_commit_reqs) (struct list_head *list, struct nfs_commit_info *cinfo); - struct nfs_page * (*search_commit_reqs)(struct nfs_commit_info *cinfo, - struct folio *folio); }; struct pnfs_layout_hdr { @@ -241,12 +246,18 @@ extern const struct pnfs_layoutdriver_type *pnfs_find_layoutdriver(u32 id); extern void pnfs_put_layoutdriver(const struct pnfs_layoutdriver_type *ld); /* nfs4proc.c */ +#define PNFS_FL_LAYOUTRETURN_ASYNC (1U << 0) +#define PNFS_FL_LAYOUTRETURN_PRIVILEGED (1U << 1) + extern size_t max_response_pages(struct nfs_server *server); extern int nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *dev, const struct cred *cred); -extern struct pnfs_layout_segment* nfs4_proc_layoutget(struct nfs4_layoutget *lgp, long *timeout); -extern int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool sync); +extern struct pnfs_layout_segment * +nfs4_proc_layoutget(struct nfs4_layoutget *lgp, + struct nfs4_exception *exception); +extern int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, + unsigned int flags); /* pnfs.c */ void pnfs_get_layout_hdr(struct pnfs_layout_hdr *lo); @@ -254,8 +265,7 @@ void pnfs_put_lseg(struct pnfs_layout_segment *lseg); void set_pnfs_layoutdriver(struct nfs_server *, const struct nfs_fh *, struct nfs_fsinfo *); void unset_pnfs_layoutdriver(struct nfs_server *); -void pnfs_generic_pg_check_layout(struct nfs_pageio_descriptor *pgio); -void pnfs_generic_pg_check_range(struct nfs_pageio_descriptor *pgio, struct nfs_page *req); +void pnfs_generic_pg_check_layout(struct nfs_pageio_descriptor *pgio, struct nfs_page *req); void pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *, struct nfs_page *); int pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc); void pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio, @@ -271,11 +281,10 @@ void pnfs_free_lseg_list(struct list_head *tmp_list); void pnfs_destroy_layout(struct nfs_inode *); void pnfs_destroy_layout_final(struct nfs_inode *); void pnfs_destroy_all_layouts(struct nfs_client *); -int pnfs_destroy_layouts_byfsid(struct nfs_client *clp, - struct nfs_fsid *fsid, - bool is_recall); -int pnfs_destroy_layouts_byclid(struct nfs_client *clp, - bool is_recall); +int pnfs_layout_destroy_byfsid(struct nfs_client *clp, struct nfs_fsid *fsid, + enum pnfs_layout_destroy_mode mode); +int pnfs_layout_destroy_byclid(struct nfs_client *clp, + enum pnfs_layout_destroy_mode mode); bool nfs4_layout_refresh_old_stateid(nfs4_stateid *dst, struct pnfs_layout_range *dst_range, struct inode *inode); @@ -321,6 +330,9 @@ struct pnfs_layout_segment *pnfs_update_layout(struct inode *ino, enum pnfs_iomode iomode, bool strict_iomode, gfp_t gfp_flags); +void pnfs_layoutreturn_retry_later(struct pnfs_layout_hdr *lo, + const nfs4_stateid *arg_stateid, + const struct pnfs_layout_range *range); void pnfs_layoutreturn_free_lsegs(struct pnfs_layout_hdr *lo, const nfs4_stateid *arg_stateid, const struct pnfs_layout_range *range, @@ -342,6 +354,7 @@ void pnfs_error_mark_layout_for_return(struct inode *inode, struct pnfs_layout_segment *lseg); void pnfs_layout_return_unused_byclid(struct nfs_client *clp, enum pnfs_iomode iomode); +int pnfs_layout_handle_reboot(struct nfs_client *clp); /* nfs4_deviceid_flags */ enum { @@ -394,8 +407,6 @@ void pnfs_generic_prepare_to_resend_writes(struct nfs_commit_data *data); void pnfs_generic_rw_release(void *data); void pnfs_generic_recover_commit_reqs(struct list_head *dst, struct nfs_commit_info *cinfo); -struct nfs_page *pnfs_generic_search_commit_reqs(struct nfs_commit_info *cinfo, - struct folio *folio); int pnfs_generic_commit_pagelist(struct inode *inode, struct list_head *mds_pages, int how, @@ -405,7 +416,8 @@ int pnfs_generic_commit_pagelist(struct inode *inode, int pnfs_generic_scan_commit_lists(struct nfs_commit_info *cinfo, int max); void pnfs_generic_write_commit_done(struct rpc_task *task, void *data); void nfs4_pnfs_ds_put(struct nfs4_pnfs_ds *ds); -struct nfs4_pnfs_ds *nfs4_pnfs_ds_add(struct list_head *dsaddrs, +struct nfs4_pnfs_ds *nfs4_pnfs_ds_add(const struct net *net, + struct list_head *dsaddrs, gfp_t gfp_flags); void nfs4_pnfs_v3_ds_connect_unload(void); int nfs4_pnfs_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds, @@ -555,17 +567,6 @@ pnfs_recover_commit_reqs(struct list_head *head, struct nfs_commit_info *cinfo) fl_cinfo->ops->recover_commit_reqs(head, cinfo); } -static inline struct nfs_page * -pnfs_search_commit_reqs(struct inode *inode, struct nfs_commit_info *cinfo, - struct folio *folio) -{ - struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds; - - if (!fl_cinfo->ops || !fl_cinfo->ops->search_commit_reqs) - return NULL; - return fl_cinfo->ops->search_commit_reqs(cinfo, folio); -} - /* Should the pNFS client commit and return the layout upon a setattr */ static inline bool pnfs_ld_layoutret_on_setattr(struct inode *inode) @@ -723,6 +724,11 @@ static inline void pnfs_destroy_layout_final(struct nfs_inode *nfsi) { } +static inline int pnfs_layout_handle_reboot(struct nfs_client *clp) +{ + return 0; +} + static inline struct pnfs_layout_segment * pnfs_get_lseg(struct pnfs_layout_segment *lseg) { @@ -862,13 +868,6 @@ pnfs_recover_commit_reqs(struct list_head *head, struct nfs_commit_info *cinfo) { } -static inline struct nfs_page * -pnfs_search_commit_reqs(struct inode *inode, struct nfs_commit_info *cinfo, - struct folio *folio) -{ - return NULL; -} - static inline int pnfs_layoutcommit_inode(struct inode *inode, bool sync) { return 0; diff --git a/fs/nfs/pnfs_dev.c b/fs/nfs/pnfs_dev.c index ddbbf4fcda86..bf0f2d67e96c 100644 --- a/fs/nfs/pnfs_dev.c +++ b/fs/nfs/pnfs_dev.c @@ -110,9 +110,6 @@ nfs4_get_device_info(struct nfs_server *server, * GETDEVICEINFO's maxcount */ max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz; - if (server->pnfs_curr_ld->max_deviceinfo_size && - server->pnfs_curr_ld->max_deviceinfo_size < max_resp_sz) - max_resp_sz = server->pnfs_curr_ld->max_deviceinfo_size; max_pages = nfs_page_array_len(0, max_resp_sz); dprintk("%s: server %p max_resp_sz %u max_pages %d\n", __func__, server, max_resp_sz, max_pages); @@ -154,7 +151,7 @@ nfs4_get_device_info(struct nfs_server *server, set_bit(NFS_DEVICEID_NOCACHE, &d->flags); out_free_pages: - for (i = 0; i < max_pages; i++) + while (--i >= 0) __free_page(pages[i]); kfree(pages); out_free_pdev: diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c index a0112ad4937a..9976cc16b689 100644 --- a/fs/nfs/pnfs_nfs.c +++ b/fs/nfs/pnfs_nfs.c @@ -16,6 +16,8 @@ #include "nfs4session.h" #include "internal.h" #include "pnfs.h" +#include "netns.h" +#include "nfs4trace.h" #define NFSDBG_FACILITY NFSDBG_PNFS @@ -351,53 +353,6 @@ void pnfs_generic_recover_commit_reqs(struct list_head *dst, } EXPORT_SYMBOL_GPL(pnfs_generic_recover_commit_reqs); -static struct nfs_page * -pnfs_bucket_search_commit_reqs(struct pnfs_commit_bucket *buckets, - unsigned int nbuckets, struct folio *folio) -{ - struct nfs_page *req; - struct pnfs_commit_bucket *b; - unsigned int i; - - /* Linearly search the commit lists for each bucket until a matching - * request is found */ - for (i = 0, b = buckets; i < nbuckets; i++, b++) { - list_for_each_entry(req, &b->written, wb_list) { - if (nfs_page_to_folio(req) == folio) - return req->wb_head; - } - list_for_each_entry(req, &b->committing, wb_list) { - if (nfs_page_to_folio(req) == folio) - return req->wb_head; - } - } - return NULL; -} - -/* pnfs_generic_search_commit_reqs - Search lists in @cinfo for the head request - * for @folio - * @cinfo - commit info for current inode - * @folio - page to search for matching head request - * - * Return: the head request if one is found, otherwise %NULL. - */ -struct nfs_page *pnfs_generic_search_commit_reqs(struct nfs_commit_info *cinfo, - struct folio *folio) -{ - struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds; - struct pnfs_commit_array *array; - struct nfs_page *req; - - list_for_each_entry(array, &fl_cinfo->commits, cinfo_list) { - req = pnfs_bucket_search_commit_reqs(array->buckets, - array->nbuckets, folio); - if (req) - return req; - } - return NULL; -} -EXPORT_SYMBOL_GPL(pnfs_generic_search_commit_reqs); - static struct pnfs_layout_segment * pnfs_bucket_get_committing(struct list_head *head, struct pnfs_commit_bucket *bucket, @@ -537,7 +492,7 @@ pnfs_generic_commit_pagelist(struct inode *inode, struct list_head *mds_pages, nfs_initiate_commit(NFS_CLIENT(inode), data, NFS_PROTO(data->inode), data->mds_ops, how, - RPC_TASK_CRED_NOREF); + RPC_TASK_CRED_NOREF, NULL); } else { nfs_init_commit(data, NULL, data->lseg, cinfo); initiate_commit(data, how); @@ -551,14 +506,14 @@ EXPORT_SYMBOL_GPL(pnfs_generic_commit_pagelist); /* * Data server cache * - * Data servers can be mapped to different device ids. - * nfs4_pnfs_ds reference counting + * Data servers can be mapped to different device ids, but should + * never be shared between net namespaces. + * + * nfs4_pnfs_ds reference counting: * - set to 1 on allocation * - incremented when a device id maps a data server already in the cache. * - decremented when deviceid is removed from the cache. */ -static DEFINE_SPINLOCK(nfs4_ds_cache_lock); -static LIST_HEAD(nfs4_data_server_cache); /* Debug routines */ static void @@ -651,11 +606,11 @@ _same_data_server_addrs_locked(const struct list_head *dsaddrs1, * Lookup DS by addresses. nfs4_ds_cache_lock is held */ static struct nfs4_pnfs_ds * -_data_server_lookup_locked(const struct list_head *dsaddrs) +_data_server_lookup_locked(const struct nfs_net *nn, const struct list_head *dsaddrs) { struct nfs4_pnfs_ds *ds; - list_for_each_entry(ds, &nfs4_data_server_cache, ds_node) + list_for_each_entry(ds, &nn->nfs4_data_server_cache, ds_node) if (_same_data_server_addrs_locked(&ds->ds_addrs, dsaddrs)) return ds; return NULL; @@ -700,10 +655,11 @@ static void destroy_ds(struct nfs4_pnfs_ds *ds) void nfs4_pnfs_ds_put(struct nfs4_pnfs_ds *ds) { - if (refcount_dec_and_lock(&ds->ds_count, - &nfs4_ds_cache_lock)) { + struct nfs_net *nn = net_generic(ds->ds_net, nfs_net_id); + + if (refcount_dec_and_lock(&ds->ds_count, &nn->nfs4_data_server_lock)) { list_del_init(&ds->ds_node); - spin_unlock(&nfs4_ds_cache_lock); + spin_unlock(&nn->nfs4_data_server_lock); destroy_ds(ds); } } @@ -763,8 +719,9 @@ out_err: * uncached and return cached struct nfs4_pnfs_ds. */ struct nfs4_pnfs_ds * -nfs4_pnfs_ds_add(struct list_head *dsaddrs, gfp_t gfp_flags) +nfs4_pnfs_ds_add(const struct net *net, struct list_head *dsaddrs, gfp_t gfp_flags) { + struct nfs_net *nn = net_generic(net, nfs_net_id); struct nfs4_pnfs_ds *tmp_ds, *ds = NULL; char *remotestr; @@ -780,16 +737,17 @@ nfs4_pnfs_ds_add(struct list_head *dsaddrs, gfp_t gfp_flags) /* this is only used for debugging, so it's ok if its NULL */ remotestr = nfs4_pnfs_remotestr(dsaddrs, gfp_flags); - spin_lock(&nfs4_ds_cache_lock); - tmp_ds = _data_server_lookup_locked(dsaddrs); + spin_lock(&nn->nfs4_data_server_lock); + tmp_ds = _data_server_lookup_locked(nn, dsaddrs); if (tmp_ds == NULL) { INIT_LIST_HEAD(&ds->ds_addrs); list_splice_init(dsaddrs, &ds->ds_addrs); ds->ds_remotestr = remotestr; refcount_set(&ds->ds_count, 1); INIT_LIST_HEAD(&ds->ds_node); + ds->ds_net = net; ds->ds_clp = NULL; - list_add(&ds->ds_node, &nfs4_data_server_cache); + list_add(&ds->ds_node, &nn->nfs4_data_server_cache); dprintk("%s add new data server %s\n", __func__, ds->ds_remotestr); } else { @@ -801,7 +759,7 @@ nfs4_pnfs_ds_add(struct list_head *dsaddrs, gfp_t gfp_flags) refcount_read(&tmp_ds->ds_count)); ds = tmp_ds; } - spin_unlock(&nfs4_ds_cache_lock); + spin_unlock(&nn->nfs4_data_server_lock); out: return ds; } @@ -851,7 +809,11 @@ static int _nfs4_pnfs_v3_ds_connect(struct nfs_server *mds_srv, unsigned int retrans) { struct nfs_client *clp = ERR_PTR(-EIO); + struct nfs_client *mds_clp = mds_srv->nfs_client; + enum xprtsec_policies xprtsec_policy = mds_clp->cl_xprtsec.policy; struct nfs4_pnfs_ds_addr *da; + unsigned long connect_timeout = timeo * (retrans + 1) * HZ / 10; + int ds_proto; int status = 0; dprintk("--> %s DS %s\n", __func__, ds->ds_remotestr); @@ -870,21 +832,33 @@ static int _nfs4_pnfs_v3_ds_connect(struct nfs_server *mds_srv, .dstaddr = (struct sockaddr *)&da->da_addr, .addrlen = da->da_addrlen, .servername = clp->cl_hostname, + .connect_timeout = connect_timeout, + .reconnect_timeout = connect_timeout, + .xprtsec = clp->cl_xprtsec, }; - if (da->da_transport != clp->cl_proto) + if (xprt_args.ident == XPRT_TRANSPORT_TCP && + clp->cl_proto == XPRT_TRANSPORT_TCP_TLS) + xprt_args.ident = XPRT_TRANSPORT_TCP_TLS; + + if (xprt_args.ident != clp->cl_proto) continue; - if (da->da_addr.ss_family != clp->cl_addr.ss_family) + if (xprt_args.dstaddr->sa_family != + clp->cl_addr.ss_family) continue; /* Add this address as an alias */ rpc_clnt_add_xprt(clp->cl_rpcclient, &xprt_args, - rpc_clnt_test_and_add_xprt, NULL); + rpc_clnt_test_and_add_xprt, NULL); continue; } - clp = get_v3_ds_connect(mds_srv, - &da->da_addr, - da->da_addrlen, da->da_transport, - timeo, retrans); + + ds_proto = da->da_transport; + if (ds_proto == XPRT_TRANSPORT_TCP && + xprtsec_policy != RPC_XPRTSEC_NONE) + ds_proto = XPRT_TRANSPORT_TCP_TLS; + + clp = get_v3_ds_connect(mds_srv, &da->da_addr, da->da_addrlen, + ds_proto, timeo, retrans); if (IS_ERR(clp)) continue; clp->cl_rpcclient->cl_softerr = 0; @@ -910,12 +884,17 @@ static int _nfs4_pnfs_v4_ds_connect(struct nfs_server *mds_srv, u32 minor_version) { struct nfs_client *clp = ERR_PTR(-EIO); + struct nfs_client *mds_clp = mds_srv->nfs_client; + enum xprtsec_policies xprtsec_policy = mds_clp->cl_xprtsec.policy; struct nfs4_pnfs_ds_addr *da; + int ds_proto; int status = 0; dprintk("--> %s DS %s\n", __func__, ds->ds_remotestr); list_for_each_entry(da, &ds->ds_addrs, da_node) { + char servername[48]; + dprintk("%s: DS %s: trying address %s\n", __func__, ds->ds_remotestr, da->da_remotestr); @@ -926,6 +905,7 @@ static int _nfs4_pnfs_v4_ds_connect(struct nfs_server *mds_srv, .dstaddr = (struct sockaddr *)&da->da_addr, .addrlen = da->da_addrlen, .servername = clp->cl_hostname, + .xprtsec = clp->cl_xprtsec, }; struct nfs4_add_xprt_data xprtdata = { .clp = clp, @@ -935,26 +915,63 @@ static int _nfs4_pnfs_v4_ds_connect(struct nfs_server *mds_srv, .data = &xprtdata, }; - if (da->da_transport != clp->cl_proto) + if (xprt_args.ident == XPRT_TRANSPORT_TCP && + clp->cl_proto == XPRT_TRANSPORT_TCP_TLS) { + struct sockaddr *addr = + (struct sockaddr *)&da->da_addr; + struct sockaddr_in *sin = + (struct sockaddr_in *)&da->da_addr; + struct sockaddr_in6 *sin6 = + (struct sockaddr_in6 *)&da->da_addr; + + /* for NFS with TLS we need to supply a correct + * servername of the trunked transport, not the + * servername of the main transport stored in + * clp->cl_hostname. And set the protocol to + * indicate to use TLS + */ + servername[0] = '\0'; + switch(addr->sa_family) { + case AF_INET: + snprintf(servername, sizeof(servername), + "%pI4", &sin->sin_addr.s_addr); + break; + case AF_INET6: + snprintf(servername, sizeof(servername), + "%pI6", &sin6->sin6_addr); + break; + default: + /* do not consider this address */ + continue; + } + xprt_args.ident = XPRT_TRANSPORT_TCP_TLS; + xprt_args.servername = servername; + } + if (xprt_args.ident != clp->cl_proto) continue; - if (da->da_addr.ss_family != clp->cl_addr.ss_family) + if (xprt_args.dstaddr->sa_family != + clp->cl_addr.ss_family) continue; + /** * Test this address for session trunking and * add as an alias */ - xprtdata.cred = nfs4_get_clid_cred(clp), + xprtdata.cred = nfs4_get_clid_cred(clp); rpc_clnt_add_xprt(clp->cl_rpcclient, &xprt_args, rpc_clnt_setup_test_and_add_xprt, &rpcdata); if (xprtdata.cred) put_cred(xprtdata.cred); } else { - clp = nfs4_set_ds_client(mds_srv, - &da->da_addr, - da->da_addrlen, - da->da_transport, timeo, - retrans, minor_version); + ds_proto = da->da_transport; + if (ds_proto == XPRT_TRANSPORT_TCP && + xprtsec_policy != RPC_XPRTSEC_NONE) + ds_proto = XPRT_TRANSPORT_TCP_TLS; + + clp = nfs4_set_ds_client(mds_srv, &da->da_addr, + da->da_addrlen, ds_proto, + timeo, retrans, minor_version); if (IS_ERR(clp)) continue; @@ -965,7 +982,6 @@ static int _nfs4_pnfs_v4_ds_connect(struct nfs_server *mds_srv, clp = ERR_PTR(-EIO); continue; } - } } @@ -996,8 +1012,10 @@ int nfs4_pnfs_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds, err = nfs4_wait_ds_connect(ds); if (err || ds->ds_clp) goto out; - if (nfs4_test_deviceid_unavailable(devid)) - return -ENODEV; + if (nfs4_test_deviceid_unavailable(devid)) { + err = -ENODEV; + goto out; + } } while (test_and_set_bit(NFS4DS_CONNECTING, &ds->ds_state) != 0); if (ds->ds_clp) @@ -1027,11 +1045,12 @@ out: if (!ds->ds_clp || !nfs_client_init_is_complete(ds->ds_clp)) { WARN_ON_ONCE(ds->ds_clp || !nfs4_test_deviceid_unavailable(devid)); - return -EINVAL; - } - err = nfs_client_init_status(ds->ds_clp); + err = -EINVAL; + } else + err = nfs_client_init_status(ds->ds_clp); } + trace_pnfs_ds_connect(ds->ds_remotestr, err); return err; } EXPORT_SYMBOL_GPL(nfs4_pnfs_ds_connect); diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c index e3570c656b0f..63e71310b9f6 100644 --- a/fs/nfs/proc.c +++ b/fs/nfs/proc.c @@ -153,13 +153,13 @@ nfs_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr, } static int -nfs_proc_lookup(struct inode *dir, struct dentry *dentry, +nfs_proc_lookup(struct inode *dir, struct dentry *dentry, const struct qstr *name, struct nfs_fh *fhandle, struct nfs_fattr *fattr) { struct nfs_diropargs arg = { .fh = NFS_FH(dir), - .name = dentry->d_name.name, - .len = dentry->d_name.len + .name = name->name, + .len = name->len }; struct nfs_diropok res = { .fh = fhandle, @@ -396,9 +396,10 @@ nfs_proc_link(struct inode *inode, struct inode *dir, const struct qstr *name) } static int -nfs_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page, +nfs_proc_symlink(struct inode *dir, struct dentry *dentry, struct folio *folio, unsigned int len, struct iattr *sattr) { + struct page *page = &folio->page; struct nfs_fh *fh; struct nfs_fattr *fattr; struct nfs_symlinkargs arg = { @@ -445,13 +446,14 @@ out: return status; } -static int +static struct dentry * nfs_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr) { struct nfs_createdata *data; struct rpc_message msg = { .rpc_proc = &nfs_procedures[NFSPROC_MKDIR], }; + struct dentry *alias = NULL; int status = -ENOMEM; dprintk("NFS call mkdir %pd\n", dentry); @@ -463,12 +465,15 @@ nfs_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr) status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); nfs_mark_for_revalidate(dir); - if (status == 0) - status = nfs_instantiate(dentry, data->res.fh, data->res.fattr); + if (status == 0) { + alias = nfs_add_or_obtain(dentry, data->res.fh, data->res.fattr); + status = PTR_ERR_OR_ZERO(alias); + } else + alias = ERR_PTR(status); nfs_free_createdata(data); out: dprintk("NFS reply mkdir: %d\n", status); - return status; + return alias; } static int @@ -686,14 +691,22 @@ out_einval: return -EINVAL; } -static int nfs_have_delegation(struct inode *inode, fmode_t flags) +static int nfs_have_delegation(struct inode *inode, fmode_t type, int flags) +{ + return 0; +} + +static int nfs_return_delegation(struct inode *inode) { + if (S_ISREG(inode->i_mode)) + nfs_wb_all(inode); return 0; } static const struct inode_operations nfs_dir_inode_operations = { .create = nfs_create, .lookup = nfs_lookup, + .atomic_open = nfs_atomic_open_v23, .link = nfs_link, .unlink = nfs_unlink, .symlink = nfs_symlink, @@ -755,6 +768,7 @@ const struct nfs_rpc_ops nfs_v2_clientops = { .lock_check_bounds = nfs_lock_check_bounds, .close_context = nfs_close_context, .have_delegation = nfs_have_delegation, + .return_delegation = nfs_return_delegation, .alloc_client = nfs_alloc_client, .init_client = nfs_init_client, .free_client = nfs_free_client, diff --git a/fs/nfs/read.c b/fs/nfs/read.c index f71eeee67e20..3c1fa320b3f1 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -28,6 +28,7 @@ #include "fscache.h" #include "pnfs.h" #include "nfstrace.h" +#include "delegation.h" #define NFSDBG_FACILITY NFSDBG_PAGECACHE @@ -47,6 +48,7 @@ static struct nfs_pgio_header *nfs_readhdr_alloc(void) static void nfs_readhdr_free(struct nfs_pgio_header *rhdr) { + kfree(rhdr->res.scratch); kmem_cache_free(nfs_rdata_cachep, rhdr); } @@ -54,7 +56,8 @@ static int nfs_return_empty_folio(struct folio *folio) { folio_zero_segment(folio, 0, folio_size(folio)); folio_mark_uptodate(folio); - folio_unlock(folio); + if (nfs_netfs_folio_unlock(folio)) + folio_unlock(folio); return 0; } @@ -108,12 +111,18 @@ void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio) } EXPORT_SYMBOL_GPL(nfs_pageio_reset_read_mds); +bool nfs_read_alloc_scratch(struct nfs_pgio_header *hdr, size_t size) +{ + WARN_ON(hdr->res.scratch != NULL); + hdr->res.scratch = kmalloc(size, GFP_KERNEL); + return hdr->res.scratch != NULL; +} +EXPORT_SYMBOL_GPL(nfs_read_alloc_scratch); + static void nfs_readpage_release(struct nfs_page *req, int error) { struct folio *folio = nfs_page_to_folio(req); - if (nfs_error_is_fatal_on_server(error) && error != -ETIMEDOUT) - folio_set_error(folio); if (nfs_page_group_sync_on_bit(req, PG_UNLOCKPAGE)) if (nfs_netfs_folio_unlock(folio)) folio_unlock(folio); @@ -278,7 +287,7 @@ int nfs_read_add_folio(struct nfs_pageio_descriptor *pgio, struct nfs_open_context *ctx, struct folio *folio) { - struct inode *inode = folio_file_mapping(folio)->host; + struct inode *inode = folio->mapping->host; struct nfs_server *server = NFS_SERVER(inode); size_t fsize = folio_size(folio); unsigned int rsize = server->rsize; @@ -295,6 +304,8 @@ int nfs_read_add_folio(struct nfs_pageio_descriptor *pgio, new = nfs_page_create_from_folio(ctx, folio, 0, aligned_len); if (IS_ERR(new)) { error = PTR_ERR(new); + if (nfs_netfs_folio_unlock(folio)) + folio_unlock(folio); goto out; } @@ -312,21 +323,57 @@ out: } /* - * Read a page over NFS. - * We read the page synchronously in the following case: - * - The error flag is set for this page. This happens only when a - * previous async read operation failed. + * Actually read a folio over the wire. */ -int nfs_read_folio(struct file *file, struct folio *folio) +static int nfs_do_read_folio(struct file *file, struct folio *folio) { struct inode *inode = file_inode(file); struct nfs_pageio_descriptor pgio; struct nfs_open_context *ctx; int ret; - trace_nfs_aop_readpage(inode, folio); + ctx = get_nfs_open_context(nfs_file_open_context(file)); + + xchg(&ctx->error, 0); + nfs_pageio_init_read(&pgio, inode, false, + &nfs_async_read_completion_ops); + + ret = nfs_read_add_folio(&pgio, ctx, folio); + if (ret) + goto out_put; + + nfs_pageio_complete_read(&pgio); + nfs_update_delegated_atime(inode); + if (pgio.pg_error < 0) { + ret = pgio.pg_error; + goto out_put; + } + + ret = folio_wait_locked_killable(folio); + if (!folio_test_uptodate(folio) && !ret) + ret = xchg(&ctx->error, 0); + +out_put: + put_nfs_open_context(ctx); + return ret; +} + +/* + * Synchronously read a folio. + * + * This is not heavily used as most users to try an asynchronous + * large read through ->readahead first. + */ +int nfs_read_folio(struct file *file, struct folio *folio) +{ + struct inode *inode = file_inode(file); + loff_t pos = folio_pos(folio); + size_t len = folio_size(folio); + int ret; + + trace_nfs_aop_readpage(inode, pos, len); nfs_inc_stats(inode, NFSIOS_VFSREADPAGE); - task_io_account_read(folio_size(folio)); + task_io_account_read(len); /* * Try to flush any pending writes to the file.. @@ -346,30 +393,10 @@ int nfs_read_folio(struct file *file, struct folio *folio) goto out_unlock; ret = nfs_netfs_read_folio(file, folio); - if (!ret) - goto out; - - ctx = get_nfs_open_context(nfs_file_open_context(file)); - - xchg(&ctx->error, 0); - nfs_pageio_init_read(&pgio, inode, false, - &nfs_async_read_completion_ops); - - ret = nfs_read_add_folio(&pgio, ctx, folio); if (ret) - goto out_put; - - nfs_pageio_complete_read(&pgio); - ret = pgio.pg_error < 0 ? pgio.pg_error : 0; - if (!ret) { - ret = folio_wait_locked_killable(folio); - if (!folio_test_uptodate(folio) && !ret) - ret = xchg(&ctx->error, 0); - } -out_put: - put_nfs_open_context(ctx); + ret = nfs_do_read_folio(file, folio); out: - trace_nfs_aop_readpage_done(inode, folio, ret); + trace_nfs_aop_readpage_done(inode, pos, len, ret); return ret; out_unlock: folio_unlock(folio); @@ -416,6 +443,7 @@ void nfs_readahead(struct readahead_control *ractl) } nfs_pageio_complete_read(&pgio); + nfs_update_delegated_atime(inode); put_nfs_open_context(ctx); out: diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 2284f749d892..72dee6f3050e 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -47,6 +47,7 @@ #include <linux/vfs.h> #include <linux/inet.h> #include <linux/in6.h> +#include <linux/sched.h> #include <linux/slab.h> #include <net/ipv6.h> #include <linux/netdevice.h> @@ -72,6 +73,7 @@ #include "nfs.h" #include "netns.h" #include "sysfs.h" +#include "nfs4idmap.h" #define NFSDBG_FACILITY NFSDBG_VFS @@ -129,11 +131,7 @@ static void nfs_ssc_unregister_ops(void) } #endif /* CONFIG_NFS_V4_2 */ -static struct shrinker acl_shrinker = { - .count_objects = nfs_access_cache_count, - .scan_objects = nfs_access_cache_scan, - .seeks = DEFAULT_SEEKS, -}; +static struct shrinker *acl_shrinker; /* * Register the NFS filesystems @@ -153,9 +151,18 @@ int __init register_nfs_fs(void) ret = nfs_register_sysctl(); if (ret < 0) goto error_2; - ret = register_shrinker(&acl_shrinker, "nfs-acl"); - if (ret < 0) + + acl_shrinker = shrinker_alloc(0, "nfs-acl"); + if (!acl_shrinker) { + ret = -ENOMEM; goto error_3; + } + + acl_shrinker->count_objects = nfs_access_cache_count; + acl_shrinker->scan_objects = nfs_access_cache_scan; + + shrinker_register(acl_shrinker); + #ifdef CONFIG_NFS_V4_2 nfs_ssc_register_ops(); #endif @@ -175,7 +182,7 @@ error_0: */ void __exit unregister_nfs_fs(void) { - unregister_shrinker(&acl_shrinker); + shrinker_free(acl_shrinker); nfs_unregister_sysctl(); unregister_nfs4_fs(); #ifdef CONFIG_NFS_V4_2 @@ -223,6 +230,7 @@ static int __nfs_list_for_each_server(struct list_head *head, ret = fn(server, data); if (ret) goto out; + cond_resched(); rcu_read_lock(); } rcu_read_unlock(); @@ -446,8 +454,12 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss, { NFS_MOUNT_NONLM, ",nolock", "" }, { NFS_MOUNT_NOACL, ",noacl", "" }, { NFS_MOUNT_NORDIRPLUS, ",nordirplus", "" }, + { NFS_MOUNT_FORCE_RDIRPLUS, ",rdirplus=force", "" }, { NFS_MOUNT_UNSHARED, ",nosharecache", "" }, { NFS_MOUNT_NORESVPORT, ",noresvport", "" }, + { NFS_MOUNT_NETUNREACH_FATAL, + ",fatal_neterrors=ENETDOWN:ENETUNREACH", + ",fatal_neterrors=none" }, { 0, NULL, NULL } }; const struct proc_nfs_info *nfs_infop; @@ -511,8 +523,16 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss, else nfs_show_nfsv4_options(m, nfss, showdefaults); - if (nfss->options & NFS_OPTION_FSCACHE) + if (nfss->options & NFS_OPTION_FSCACHE) { +#ifdef CONFIG_NFS_FSCACHE + if (nfss->fscache_uniq) + seq_printf(m, ",fsc=%s", nfss->fscache_uniq); + else + seq_puts(m, ",fsc"); +#else seq_puts(m, ",fsc"); +#endif + } if (nfss->options & NFS_OPTION_MIGRATION) seq_puts(m, ",migration"); @@ -536,6 +556,9 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss, else seq_puts(m, ",local_lock=posix"); + if (nfss->flags & NFS_MOUNT_NO_ALIGNWRITE) + seq_puts(m, ",noalignwrite"); + if (nfss->flags & NFS_MOUNT_WRITE_EAGER) { if (nfss->flags & NFS_MOUNT_WRITE_WAIT) seq_puts(m, ",write=wait"); @@ -867,7 +890,15 @@ static int nfs_request_mount(struct fs_context *fc, * Now ask the mount server to map our export path * to a file handle. */ - status = nfs_mount(&request, ctx->timeo, ctx->retrans); + if ((request.protocol == XPRT_TRANSPORT_UDP) == + !(ctx->flags & NFS_MOUNT_TCP)) + /* + * NFS protocol and mount protocol are both UDP or neither UDP + * so timeouts are compatible. Use NFS timeouts for MOUNT + */ + status = nfs_mount(&request, ctx->timeo, ctx->retrans); + else + status = nfs_mount(&request, NFS_UNSPEC_TIMEO, NFS_UNSPEC_RETRANS); if (status != 0) { dfprintk(MOUNT, "NFS: unable to mount server %s, error %d\n", request.hostname, status); @@ -888,6 +919,16 @@ static struct nfs_server *nfs_try_mount_request(struct fs_context *fc) rpc_authflavor_t authlist[NFS_MAX_SECFLAVORS]; unsigned int authlist_len = ARRAY_SIZE(authlist); + /* make sure 'nolock'/'lock' override the 'local_lock' mount option */ + if (ctx->lock_status) { + if (ctx->lock_status == NFS_LOCK_NOLOCK) { + ctx->flags |= NFS_MOUNT_NONLM; + ctx->flags |= (NFS_MOUNT_LOCAL_FLOCK | NFS_MOUNT_LOCAL_FCNTL); + } else { + ctx->flags &= ~NFS_MOUNT_NONLM; + ctx->flags &= ~(NFS_MOUNT_LOCAL_FLOCK | NFS_MOUNT_LOCAL_FCNTL); + } + } status = nfs_request_mount(fc, ctx->mntfh, authlist, &authlist_len); if (status) return ERR_PTR(status); @@ -1011,6 +1052,16 @@ int nfs_reconfigure(struct fs_context *fc) sync_filesystem(sb); /* + * The SB_RDONLY flag has been removed from the superblock during + * mounts to prevent interference between different filesystems. + * Similarly, it is also necessary to ignore the SB_RDONLY flag + * during reconfiguration; otherwise, it may also result in the + * creation of redundant superblocks when mounting a directory with + * different rw and ro flags multiple times. + */ + fc->sb_flags_mask &= ~SB_RDONLY; + + /* * Userspace mount programs that send binary options generally send * them populated with default values. We have no way to know which * ones were explicitly specified. Fall back to legacy behavior and @@ -1071,7 +1122,7 @@ static void nfs_fill_super(struct super_block *sb, struct nfs_fs_context *ctx) sb->s_export_op = &nfs_export_ops; break; case 4: - sb->s_flags |= SB_POSIXACL; + sb->s_iflags |= SB_I_NOUMASK; sb->s_time_gran = 1; sb->s_time_min = S64_MIN; sb->s_time_max = S64_MAX; @@ -1132,7 +1183,7 @@ static int nfs_set_super(struct super_block *s, struct fs_context *fc) struct nfs_server *server = fc->s_fs_info; int ret; - s->s_d_op = server->nfs_client->rpc_ops->dentry_ops; + set_default_d_op(s, server->nfs_client->rpc_ops->dentry_ops); ret = set_anon_super(s, server); if (ret == 0) server->s_dev = s->s_dev; @@ -1267,8 +1318,17 @@ int nfs_get_tree_common(struct fs_context *fc) if (IS_ERR(server)) return PTR_ERR(server); + /* + * When NFS_MOUNT_UNSHARED is not set, NFS forces the sharing of a + * superblock among each filesystem that mounts sub-directories + * belonging to a single exported root path. + * To prevent interference between different filesystems, the + * SB_RDONLY flag should be removed from the superblock. + */ if (server->flags & NFS_MOUNT_UNSHARED) compare_super = NULL; + else + fc->sb_flags &= ~SB_RDONLY; /* -o noac implies -o sync */ if (server->flags & NFS_MOUNT_NOAC) @@ -1339,15 +1399,13 @@ error_splat_super: void nfs_kill_super(struct super_block *s) { struct nfs_server *server = NFS_SB(s); - dev_t dev = s->s_dev; nfs_sysfs_move_sb_to_server(server); - generic_shutdown_super(s); + kill_anon_super(s); nfs_fscache_release_super_cookie(s); nfs_free_server(server); - free_anon_bdev(dev); } EXPORT_SYMBOL_GPL(nfs_kill_super); @@ -1368,6 +1426,7 @@ unsigned short max_session_cb_slots = NFS4_DEF_CB_SLOT_TABLE_SIZE; unsigned short send_implementation_id = 1; char nfs4_client_id_uniquifier[NFS4_CLIENT_ID_UNIQ_LEN] = ""; bool recover_lost_locks = false; +short nfs_delay_retrans = -1; EXPORT_SYMBOL_GPL(nfs_callback_nr_threads); EXPORT_SYMBOL_GPL(nfs_callback_set_tcpport); @@ -1378,6 +1437,7 @@ EXPORT_SYMBOL_GPL(max_session_cb_slots); EXPORT_SYMBOL_GPL(send_implementation_id); EXPORT_SYMBOL_GPL(nfs4_client_id_uniquifier); EXPORT_SYMBOL_GPL(recover_lost_locks); +EXPORT_SYMBOL_GPL(nfs_delay_retrans); #define NFS_CALLBACK_MAXPORTNR (65535U) @@ -1426,5 +1486,9 @@ MODULE_PARM_DESC(recover_lost_locks, "If the server reports that a lock might be lost, " "try to recover it risking data corruption."); - +module_param_named(delay_retrans, nfs_delay_retrans, short, 0644); +MODULE_PARM_DESC(delay_retrans, + "Unless negative, specifies the number of times the NFSv4 " + "client retries a request before returning an EAGAIN error, " + "after a reply of NFS4ERR_DELAY from the server."); #endif /* CONFIG_NFS_V4 */ diff --git a/fs/nfs/symlink.c b/fs/nfs/symlink.c index 0e27a2e4e68b..58146e935402 100644 --- a/fs/nfs/symlink.c +++ b/fs/nfs/symlink.c @@ -32,47 +32,39 @@ static int nfs_symlink_filler(struct file *file, struct folio *folio) int error; error = NFS_PROTO(inode)->readlink(inode, &folio->page, 0, PAGE_SIZE); - if (error < 0) - goto error; - folio_mark_uptodate(folio); - folio_unlock(folio); - return 0; - -error: - folio_set_error(folio); - folio_unlock(folio); - return -EIO; + folio_end_read(folio, error == 0); + return error; } static const char *nfs_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *done) { - struct page *page; + struct folio *folio; void *err; if (!dentry) { err = ERR_PTR(nfs_revalidate_mapping_rcu(inode)); if (err) return err; - page = find_get_page(inode->i_mapping, 0); - if (!page) + folio = filemap_get_folio(inode->i_mapping, 0); + if (IS_ERR(folio)) return ERR_PTR(-ECHILD); - if (!PageUptodate(page)) { - put_page(page); + if (!folio_test_uptodate(folio)) { + folio_put(folio); return ERR_PTR(-ECHILD); } } else { err = ERR_PTR(nfs_revalidate_mapping(inode, inode->i_mapping)); if (err) return err; - page = read_cache_page(&inode->i_data, 0, nfs_symlink_filler, + folio = read_cache_folio(&inode->i_data, 0, nfs_symlink_filler, NULL); - if (IS_ERR(page)) - return ERR_CAST(page); + if (IS_ERR(folio)) + return ERR_CAST(folio); } - set_delayed_call(done, page_put_link, page); - return page_address(page); + set_delayed_call(done, page_put_link, folio); + return folio_address(folio); } /* diff --git a/fs/nfs/sysctl.c b/fs/nfs/sysctl.c index f39e2089bc4c..f579df0e8d67 100644 --- a/fs/nfs/sysctl.c +++ b/fs/nfs/sysctl.c @@ -14,7 +14,7 @@ static struct ctl_table_header *nfs_callback_sysctl_table; -static struct ctl_table nfs_cb_sysctls[] = { +static const struct ctl_table nfs_cb_sysctls[] = { { .procname = "nfs_mountpoint_timeout", .data = &nfs_mountpoint_expiry_timeout, @@ -29,7 +29,6 @@ static struct ctl_table nfs_cb_sysctls[] = { .mode = 0644, .proc_handler = proc_dointvec, }, - { } }; int nfs_register_sysctl(void) diff --git a/fs/nfs/sysfs.c b/fs/nfs/sysfs.c index acda8f033d30..ea6e6168092b 100644 --- a/fs/nfs/sysfs.c +++ b/fs/nfs/sysfs.c @@ -14,6 +14,7 @@ #include <linux/rcupdate.h> #include <linux/lockd/lockd.h> +#include "internal.h" #include "nfs4_fs.h" #include "netns.h" #include "sysfs.h" @@ -188,6 +189,7 @@ static struct nfs_netns_client *nfs_netns_client_alloc(struct kobject *parent, return p; kobject_put(&p->kobject); + kobject_put(&p->nfs_net_kobj); } return NULL; } @@ -228,6 +230,25 @@ static void shutdown_client(struct rpc_clnt *clnt) rpc_cancel_tasks(clnt, -EIO, shutdown_match_client, NULL); } +/* + * Shut down the nfs_client only once all the superblocks + * have been shut down. + */ +static void shutdown_nfs_client(struct nfs_client *clp) +{ + struct nfs_server *server; + rcu_read_lock(); + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { + if (!(server->flags & NFS_MOUNT_SHUTDOWN)) { + rcu_read_unlock(); + return; + } + } + rcu_read_unlock(); + nfs_mark_client_ready(clp, -EIO); + shutdown_client(clp->cl_rpcclient); +} + static ssize_t shutdown_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) @@ -259,7 +280,6 @@ shutdown_store(struct kobject *kobj, struct kobj_attribute *attr, server->flags |= NFS_MOUNT_SHUTDOWN; shutdown_client(server->client); - shutdown_client(server->nfs_client->cl_rpcclient); if (!IS_ERR(server->client_acl)) shutdown_client(server->client_acl); @@ -267,11 +287,44 @@ shutdown_store(struct kobject *kobj, struct kobj_attribute *attr, if (server->nlm_host) shutdown_client(server->nlm_host->h_rpcclnt); out: + shutdown_nfs_client(server->nfs_client); return count; } static struct kobj_attribute nfs_sysfs_attr_shutdown = __ATTR_RW(shutdown); +#if IS_ENABLED(CONFIG_NFS_V4_1) +static ssize_t +implid_domain_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct nfs_server *server = container_of(kobj, struct nfs_server, kobj); + struct nfs41_impl_id *impl_id = server->nfs_client->cl_implid; + + if (!impl_id || strlen(impl_id->domain) == 0) + return 0; //sysfs_emit(buf, ""); + return sysfs_emit(buf, "%s\n", impl_id->domain); +} + +static struct kobj_attribute nfs_sysfs_attr_implid_domain = __ATTR_RO(implid_domain); + + +static ssize_t +implid_name_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct nfs_server *server = container_of(kobj, struct nfs_server, kobj); + struct nfs41_impl_id *impl_id = server->nfs_client->cl_implid; + + if (!impl_id || strlen(impl_id->name) == 0) + return 0; //sysfs_emit(buf, ""); + return sysfs_emit(buf, "%s\n", impl_id->name); +} + +static struct kobj_attribute nfs_sysfs_attr_implid_name = __ATTR_RO(implid_name); + +#endif /* IS_ENABLED(CONFIG_NFS_V4_1) */ + #define RPC_CLIENT_NAME_SIZE 64 void nfs_sysfs_link_rpc_client(struct nfs_server *server, @@ -280,9 +333,9 @@ void nfs_sysfs_link_rpc_client(struct nfs_server *server, char name[RPC_CLIENT_NAME_SIZE]; int ret; - strcpy(name, clnt->cl_program->name); - strcat(name, uniq ? uniq : ""); - strcat(name, "_client"); + strscpy(name, clnt->cl_program->name, sizeof(name)); + strncat(name, uniq ? uniq : "", sizeof(name) - strlen(name) - 1); + strncat(name, "_client", sizeof(name) - strlen(name) - 1); ret = sysfs_create_link_nowarn(&server->kobj, &clnt->cl_sysfs->kobject, name); @@ -309,6 +362,59 @@ static struct kobj_type nfs_sb_ktype = { .child_ns_type = nfs_netns_object_child_ns_type, }; +#if IS_ENABLED(CONFIG_NFS_V4_1) +static void nfs_sysfs_add_nfsv41_server(struct nfs_server *server) +{ + int ret; + + if (!server->nfs_client->cl_implid) + return; + + ret = sysfs_create_file_ns(&server->kobj, &nfs_sysfs_attr_implid_domain.attr, + nfs_netns_server_namespace(&server->kobj)); + if (ret < 0) + pr_warn("NFS: sysfs_create_file_ns for server-%d failed (%d)\n", + server->s_sysfs_id, ret); + + ret = sysfs_create_file_ns(&server->kobj, &nfs_sysfs_attr_implid_name.attr, + nfs_netns_server_namespace(&server->kobj)); + if (ret < 0) + pr_warn("NFS: sysfs_create_file_ns for server-%d failed (%d)\n", + server->s_sysfs_id, ret); +} +#else /* CONFIG_NFS_V4_1 */ +static inline void nfs_sysfs_add_nfsv41_server(struct nfs_server *server) +{ +} +#endif /* CONFIG_NFS_V4_1 */ + +#if IS_ENABLED(CONFIG_NFS_LOCALIO) + +static ssize_t +localio_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct nfs_server *server = container_of(kobj, struct nfs_server, kobj); + bool localio = nfs_server_is_local(server->nfs_client); + return sysfs_emit(buf, "%d\n", localio); +} + +static struct kobj_attribute nfs_sysfs_attr_localio = __ATTR_RO(localio); + +static void nfs_sysfs_add_nfs_localio_server(struct nfs_server *server) +{ + int ret = sysfs_create_file_ns(&server->kobj, &nfs_sysfs_attr_localio.attr, + nfs_netns_server_namespace(&server->kobj)); + if (ret < 0) + pr_warn("NFS: sysfs_create_file_ns for server-%d failed (%d)\n", + server->s_sysfs_id, ret); +} +#else +static inline void nfs_sysfs_add_nfs_localio_server(struct nfs_server *server) +{ +} +#endif /* IS_ENABLED(CONFIG_NFS_LOCALIO) */ + void nfs_sysfs_add_server(struct nfs_server *server) { int ret; @@ -325,6 +431,9 @@ void nfs_sysfs_add_server(struct nfs_server *server) if (ret < 0) pr_warn("NFS: sysfs_create_file_ns for server-%d failed (%d)\n", server->s_sysfs_id, ret); + + nfs_sysfs_add_nfsv41_server(server); + nfs_sysfs_add_nfs_localio_server(server); } EXPORT_SYMBOL_GPL(nfs_sysfs_add_server); @@ -345,8 +454,10 @@ void nfs_sysfs_move_sb_to_server(struct nfs_server *server) int ret = -ENOMEM; s = kasprintf(GFP_KERNEL, "server-%d", server->s_sysfs_id); - if (s) + if (s) { ret = kobject_rename(&server->kobj, s); + kfree(s); + } if (ret < 0) pr_warn("NFS: rename sysfs %s failed (%d)\n", server->kobj.name, ret); diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c index 150a953a8be9..b55467911648 100644 --- a/fs/nfs/unlink.c +++ b/fs/nfs/unlink.c @@ -232,6 +232,8 @@ nfs_complete_unlink(struct dentry *dentry, struct inode *inode) dentry->d_fsdata = NULL; spin_unlock(&dentry->d_lock); + NFS_PROTO(inode)->return_delegation(inode); + if (NFS_STALE(inode) || !nfs_call_unlink(dentry, inode, data)) nfs_free_unlinkdata(data); } @@ -267,7 +269,7 @@ static void nfs_async_rename_done(struct rpc_task *task, void *calldata) struct inode *new_dir = data->new_dir; struct dentry *old_dentry = data->old_dentry; - trace_nfs_sillyrename_rename(old_dir, old_dentry, + trace_nfs_async_rename_done(old_dir, old_dentry, new_dir, data->new_dentry, task->tk_status); if (!NFS_PROTO(old_dir)->rename_done(task, old_dir, new_dir)) { rpc_restart_call_prepare(task); @@ -462,18 +464,17 @@ nfs_sillyrename(struct inode *dir, struct dentry *dentry) sdentry = NULL; do { - int slen; dput(sdentry); sillycounter++; - slen = scnprintf(silly, sizeof(silly), - SILLYNAME_PREFIX "%0*llx%0*x", - SILLYNAME_FILEID_LEN, fileid, - SILLYNAME_COUNTER_LEN, sillycounter); + scnprintf(silly, sizeof(silly), + SILLYNAME_PREFIX "%0*llx%0*x", + SILLYNAME_FILEID_LEN, fileid, + SILLYNAME_COUNTER_LEN, sillycounter); dfprintk(VFS, "NFS: trying to rename %pd to %s\n", dentry, silly); - sdentry = lookup_one_len(silly, dentry->d_parent, slen); + sdentry = lookup_noperm(&QSTR(silly), dentry->d_parent); /* * N.B. Better to return EBUSY here ... it could be * dangerous to delete the file while it's in use. diff --git a/fs/nfs/write.c b/fs/nfs/write.c index f4cca8f00c0c..336c510f3750 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -59,12 +59,10 @@ static const struct nfs_pgio_completion_ops nfs_async_write_completion_ops; static const struct nfs_commit_completion_ops nfs_commit_completion_ops; static const struct nfs_rw_ops nfs_rw_write_ops; static void nfs_inode_remove_request(struct nfs_page *req); -static void nfs_clear_request_commit(struct nfs_page *req); +static void nfs_clear_request_commit(struct nfs_commit_info *cinfo, + struct nfs_page *req); static void nfs_init_cinfo_from_inode(struct nfs_commit_info *cinfo, struct inode *inode); -static struct nfs_page * -nfs_page_search_commits_for_head_request_locked(struct nfs_inode *nfsi, - struct folio *folio); static struct kmem_cache *nfs_wdata_cachep; static mempool_t *nfs_wdata_mempool; @@ -155,132 +153,51 @@ nfs_page_set_inode_ref(struct nfs_page *req, struct inode *inode) } } -static int -nfs_cancel_remove_inode(struct nfs_page *req, struct inode *inode) +static void nfs_cancel_remove_inode(struct nfs_page *req, struct inode *inode) { - int ret; - - if (!test_bit(PG_REMOVE, &req->wb_flags)) - return 0; - ret = nfs_page_group_lock(req); - if (ret) - return ret; if (test_and_clear_bit(PG_REMOVE, &req->wb_flags)) nfs_page_set_inode_ref(req, inode); - nfs_page_group_unlock(req); - return 0; -} - -static struct nfs_page *nfs_folio_private_request(struct folio *folio) -{ - return folio_get_private(folio); } /** - * nfs_folio_find_private_request - find head request associated with a folio + * nfs_folio_find_head_request - find head request associated with a folio * @folio: pointer to folio * * must be called while holding the inode lock. * * returns matching head request with reference held, or NULL if not found. */ -static struct nfs_page *nfs_folio_find_private_request(struct folio *folio) +static struct nfs_page *nfs_folio_find_head_request(struct folio *folio) { - struct address_space *mapping = folio_file_mapping(folio); + struct address_space *mapping = folio->mapping; struct nfs_page *req; if (!folio_test_private(folio)) return NULL; - spin_lock(&mapping->private_lock); - req = nfs_folio_private_request(folio); + spin_lock(&mapping->i_private_lock); + req = folio->private; if (req) { WARN_ON_ONCE(req->wb_head != req); kref_get(&req->wb_kref); } - spin_unlock(&mapping->private_lock); - return req; -} - -static struct nfs_page *nfs_folio_find_swap_request(struct folio *folio) -{ - struct inode *inode = folio_file_mapping(folio)->host; - struct nfs_inode *nfsi = NFS_I(inode); - struct nfs_page *req = NULL; - if (!folio_test_swapcache(folio)) - return NULL; - mutex_lock(&nfsi->commit_mutex); - if (folio_test_swapcache(folio)) { - req = nfs_page_search_commits_for_head_request_locked(nfsi, - folio); - if (req) { - WARN_ON_ONCE(req->wb_head != req); - kref_get(&req->wb_kref); - } - } - mutex_unlock(&nfsi->commit_mutex); - return req; -} - -/** - * nfs_folio_find_head_request - find head request associated with a folio - * @folio: pointer to folio - * - * returns matching head request with reference held, or NULL if not found. - */ -static struct nfs_page *nfs_folio_find_head_request(struct folio *folio) -{ - struct nfs_page *req; - - req = nfs_folio_find_private_request(folio); - if (!req) - req = nfs_folio_find_swap_request(folio); + spin_unlock(&mapping->i_private_lock); return req; } -static struct nfs_page *nfs_folio_find_and_lock_request(struct folio *folio) -{ - struct inode *inode = folio_file_mapping(folio)->host; - struct nfs_page *req, *head; - int ret; - - for (;;) { - req = nfs_folio_find_head_request(folio); - if (!req) - return req; - head = nfs_page_group_lock_head(req); - if (head != req) - nfs_release_request(req); - if (IS_ERR(head)) - return head; - ret = nfs_cancel_remove_inode(head, inode); - if (ret < 0) { - nfs_unlock_and_release_request(head); - return ERR_PTR(ret); - } - /* Ensure that nobody removed the request before we locked it */ - if (head == nfs_folio_private_request(folio)) - break; - if (folio_test_swapcache(folio)) - break; - nfs_unlock_and_release_request(head); - } - return head; -} - /* Adjust the file length if we're writing beyond the end */ static void nfs_grow_file(struct folio *folio, unsigned int offset, unsigned int count) { - struct inode *inode = folio_file_mapping(folio)->host; + struct inode *inode = folio->mapping->host; loff_t end, i_size; pgoff_t end_index; spin_lock(&inode->i_lock); i_size = i_size_read(inode); end_index = ((i_size - 1) >> folio_shift(folio)) << folio_order(folio); - if (i_size > 0 && folio_index(folio) < end_index) + if (i_size > 0 && folio->index < end_index) goto out; - end = folio_file_pos(folio) + (loff_t)offset + (loff_t)count; + end = folio_pos(folio) + (loff_t)offset + (loff_t)count; if (i_size >= end) goto out; trace_nfs_size_grow(inode, end); @@ -288,6 +205,8 @@ static void nfs_grow_file(struct folio *folio, unsigned int offset, NFS_I(inode)->cache_validity &= ~NFS_INO_INVALID_SIZE; nfs_inc_stats(inode, NFSIOS_EXTENDWRITE); out: + /* Atomically update timestamps if they are delegated to us. */ + nfs_update_delegated_mtime_locked(inode); spin_unlock(&inode->i_lock); nfs_fscache_invalidate(inode, 0); } @@ -308,9 +227,8 @@ static void nfs_set_pageerror(struct address_space *mapping) static void nfs_mapping_set_error(struct folio *folio, int error) { - struct address_space *mapping = folio_file_mapping(folio); + struct address_space *mapping = folio->mapping; - folio_set_error(folio); filemap_set_wb_err(mapping, error); if (mapping->host) errseq_set(&mapping->host->i_sb->s_wb_err, @@ -319,59 +237,17 @@ static void nfs_mapping_set_error(struct folio *folio, int error) } /* - * nfs_page_group_search_locked - * @head - head request of page group - * @page_offset - offset into page - * - * Search page group with head @head to find a request that contains the - * page offset @page_offset. + * nfs_page_covers_folio + * @req: struct nfs_page * - * Returns a pointer to the first matching nfs request, or NULL if no - * match is found. - * - * Must be called with the page group lock held - */ -static struct nfs_page * -nfs_page_group_search_locked(struct nfs_page *head, unsigned int page_offset) -{ - struct nfs_page *req; - - req = head; - do { - if (page_offset >= req->wb_pgbase && - page_offset < (req->wb_pgbase + req->wb_bytes)) - return req; - - req = req->wb_this_page; - } while (req != head); - - return NULL; -} - -/* - * nfs_page_group_covers_page - * @head - head request of page group - * - * Return true if the page group with head @head covers the whole page, - * returns false otherwise + * Return true if the request covers the whole folio. + * Note that the caller should ensure all subrequests have been joined */ static bool nfs_page_group_covers_page(struct nfs_page *req) { unsigned int len = nfs_folio_length(nfs_page_to_folio(req)); - struct nfs_page *tmp; - unsigned int pos = 0; - nfs_page_group_lock(req); - - for (;;) { - tmp = nfs_page_group_search_locked(req->wb_head, pos); - if (!tmp) - break; - pos = tmp->wb_pgbase + tmp->wb_bytes; - } - - nfs_page_group_unlock(req); - return pos >= len; + return req->wb_pgbase == 0 && req->wb_bytes == len; } /* We can set the PG_uptodate flag if we see that a write request @@ -409,7 +285,7 @@ int nfs_congestion_kb; static void nfs_folio_set_writeback(struct folio *folio) { - struct nfs_server *nfss = NFS_SERVER(folio_file_mapping(folio)->host); + struct nfs_server *nfss = NFS_SERVER(folio->mapping->host); folio_start_writeback(folio); if (atomic_long_inc_return(&nfss->writeback) > NFS_CONGESTION_ON_THRESH) @@ -418,12 +294,14 @@ static void nfs_folio_set_writeback(struct folio *folio) static void nfs_folio_end_writeback(struct folio *folio) { - struct nfs_server *nfss = NFS_SERVER(folio_file_mapping(folio)->host); + struct nfs_server *nfss = NFS_SERVER(folio->mapping->host); - folio_end_writeback(folio); + folio_end_writeback_no_dropbehind(folio); if (atomic_long_dec_return(&nfss->writeback) < - NFS_CONGESTION_OFF_THRESH) + NFS_CONGESTION_OFF_THRESH) { nfss->write_congested = 0; + wake_up_all(&nfss->write_congestion_wait); + } } static void nfs_page_end_writeback(struct nfs_page *req) @@ -502,8 +380,8 @@ nfs_destroy_unlinked_subrequests(struct nfs_page *destroy_list, * the (former) group. All subrequests are removed from any write or commit * lists, unlinked from the group and destroyed. */ -void -nfs_join_page_group(struct nfs_page *head, struct inode *inode) +void nfs_join_page_group(struct nfs_page *head, struct nfs_commit_info *cinfo, + struct inode *inode) { struct nfs_page *subreq; struct nfs_page *destroy_list = NULL; @@ -533,7 +411,7 @@ nfs_join_page_group(struct nfs_page *head, struct inode *inode) * Commit list removal accounting is done after locks are dropped */ subreq = head; do { - nfs_clear_request_commit(subreq); + nfs_clear_request_commit(cinfo, subreq); subreq = subreq->wb_this_page; } while (subreq != head); @@ -547,6 +425,74 @@ nfs_join_page_group(struct nfs_page *head, struct inode *inode) nfs_destroy_unlinked_subrequests(destroy_list, head, inode); } +/** + * nfs_wait_on_request - Wait for a request to complete. + * @req: request to wait upon. + * + * Interruptible by fatal signals only. + * The user is responsible for holding a count on the request. + */ +static int nfs_wait_on_request(struct nfs_page *req) +{ + if (!test_bit(PG_BUSY, &req->wb_flags)) + return 0; + set_bit(PG_CONTENDED2, &req->wb_flags); + smp_mb__after_atomic(); + return wait_on_bit_io(&req->wb_flags, PG_BUSY, + TASK_UNINTERRUPTIBLE); +} + +/* + * nfs_unroll_locks - unlock all newly locked reqs and wait on @req + * @head: head request of page group, must be holding head lock + * @req: request that couldn't lock and needs to wait on the req bit lock + * + * This is a helper function for nfs_lock_and_join_requests + * returns 0 on success, < 0 on error. + */ +static void +nfs_unroll_locks(struct nfs_page *head, struct nfs_page *req) +{ + struct nfs_page *tmp; + + /* relinquish all the locks successfully grabbed this run */ + for (tmp = head->wb_this_page ; tmp != req; tmp = tmp->wb_this_page) { + if (!kref_read(&tmp->wb_kref)) + continue; + nfs_unlock_and_release_request(tmp); + } +} + +/* + * nfs_page_group_lock_subreq - try to lock a subrequest + * @head: head request of page group + * @subreq: request to lock + * + * This is a helper function for nfs_lock_and_join_requests which + * must be called with the head request and page group both locked. + * On error, it returns with the page group unlocked. + */ +static int +nfs_page_group_lock_subreq(struct nfs_page *head, struct nfs_page *subreq) +{ + int ret; + + if (!kref_get_unless_zero(&subreq->wb_kref)) + return 0; + while (!nfs_lock_request(subreq)) { + nfs_page_group_unlock(head); + ret = nfs_wait_on_request(subreq); + if (!ret) + ret = nfs_page_group_lock(head); + if (ret < 0) { + nfs_unroll_locks(head, subreq); + nfs_release_request(subreq); + return ret; + } + } + return 0; +} + /* * nfs_lock_and_join_requests - join all subreqs to the head req * @folio: the folio used to lookup the "page group" of nfs_page structures @@ -564,8 +510,9 @@ nfs_join_page_group(struct nfs_page *head, struct inode *inode) */ static struct nfs_page *nfs_lock_and_join_requests(struct folio *folio) { - struct inode *inode = folio_file_mapping(folio)->host; - struct nfs_page *head; + struct inode *inode = folio->mapping->host; + struct nfs_page *head, *subreq; + struct nfs_commit_info cinfo; int ret; /* @@ -573,20 +520,50 @@ static struct nfs_page *nfs_lock_and_join_requests(struct folio *folio) * reference to the whole page group - the group will not be destroyed * until the head reference is released. */ - head = nfs_folio_find_and_lock_request(folio); - if (IS_ERR_OR_NULL(head)) - return head; +retry: + head = nfs_folio_find_head_request(folio); + if (!head) + return NULL; - /* lock each request in the page group */ - ret = nfs_page_group_lock_subrequests(head); - if (ret < 0) { + while (!nfs_lock_request(head)) { + ret = nfs_wait_on_request(head); + if (ret < 0) { + nfs_release_request(head); + return ERR_PTR(ret); + } + } + + ret = nfs_page_group_lock(head); + if (ret < 0) + goto out_unlock; + + /* Ensure that nobody removed the request before we locked it */ + if (head != folio->private) { + nfs_page_group_unlock(head); nfs_unlock_and_release_request(head); - return ERR_PTR(ret); + goto retry; } - nfs_join_page_group(head, inode); + nfs_cancel_remove_inode(head, inode); + /* lock each request in the page group */ + for (subreq = head->wb_this_page; + subreq != head; + subreq = subreq->wb_this_page) { + ret = nfs_page_group_lock_subreq(head, subreq); + if (ret < 0) + goto out_unlock; + } + + nfs_page_group_unlock(head); + + nfs_init_cinfo_from_inode(&cinfo, inode); + nfs_join_page_group(head, &cinfo, inode); return head; + +out_unlock: + nfs_unlock_and_release_request(head); + return ERR_PTR(ret); } static void nfs_write_error(struct nfs_page *req, int error) @@ -602,20 +579,21 @@ static void nfs_write_error(struct nfs_page *req, int error) * Find an associated nfs write request, and prepare to flush it out * May return an error if the user signalled nfs_wait_on_request(). */ -static int nfs_page_async_flush(struct folio *folio, - struct writeback_control *wbc, - struct nfs_pageio_descriptor *pgio) +static int nfs_do_writepage(struct folio *folio, struct writeback_control *wbc, + struct nfs_pageio_descriptor *pgio) { struct nfs_page *req; - int ret = 0; + int ret; + + nfs_pageio_cond_complete(pgio, folio->index); req = nfs_lock_and_join_requests(folio); if (!req) - goto out; - ret = PTR_ERR(req); + return 0; if (IS_ERR(req)) - goto out; + return PTR_ERR(req); + trace_nfs_do_writepage(req); nfs_folio_set_writeback(folio); WARN_ON_ONCE(test_bit(PG_CLEAN, &req->wb_flags)); @@ -624,7 +602,6 @@ static int nfs_page_async_flush(struct folio *folio, if (nfs_error_is_fatal_on_server(ret)) goto out_launder; - ret = 0; if (!nfs_pageio_add_request(pgio, req)) { ret = pgio->pg_error; /* @@ -632,28 +609,20 @@ static int nfs_page_async_flush(struct folio *folio, */ if (nfs_error_is_fatal_on_server(ret)) goto out_launder; - if (wbc->sync_mode == WB_SYNC_NONE) - ret = AOP_WRITEPAGE_ACTIVATE; folio_redirty_for_writepage(wbc, folio); nfs_redirty_request(req); pgio->pg_error = 0; - } else - nfs_add_stats(folio_file_mapping(folio)->host, - NFSIOS_WRITEPAGES, 1); -out: - return ret; + return ret; + } + + nfs_add_stats(folio->mapping->host, NFSIOS_WRITEPAGES, 1); + return 0; + out_launder: nfs_write_error(req, ret); return 0; } -static int nfs_do_writepage(struct folio *folio, struct writeback_control *wbc, - struct nfs_pageio_descriptor *pgio) -{ - nfs_pageio_cond_complete(pgio, folio_index(folio)); - return nfs_page_async_flush(folio, wbc, pgio); -} - /* * Write an mmapped page to the server. */ @@ -661,13 +630,9 @@ static int nfs_writepage_locked(struct folio *folio, struct writeback_control *wbc) { struct nfs_pageio_descriptor pgio; - struct inode *inode = folio_file_mapping(folio)->host; + struct inode *inode = folio->mapping->host; int err; - if (wbc->sync_mode == WB_SYNC_NONE && - NFS_SERVER(inode)->write_congested) - return AOP_WRITEPAGE_ACTIVATE; - nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGE); nfs_pageio_init_write(&pgio, inode, 0, false, &nfs_async_write_completion_ops); @@ -677,28 +642,6 @@ static int nfs_writepage_locked(struct folio *folio, return err; } -int nfs_writepage(struct page *page, struct writeback_control *wbc) -{ - struct folio *folio = page_folio(page); - int ret; - - ret = nfs_writepage_locked(folio, wbc); - if (ret != AOP_WRITEPAGE_ACTIVATE) - unlock_page(page); - return ret; -} - -static int nfs_writepages_callback(struct folio *folio, - struct writeback_control *wbc, void *data) -{ - int ret; - - ret = nfs_do_writepage(folio, wbc, data); - if (ret != AOP_WRITEPAGE_ACTIVATE) - folio_unlock(folio); - return ret; -} - static void nfs_io_completion_commit(void *inode) { nfs_commit_inode(inode, 0); @@ -710,17 +653,24 @@ int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc) struct nfs_pageio_descriptor pgio; struct nfs_io_completion *ioc = NULL; unsigned int mntflags = NFS_SERVER(inode)->flags; + struct nfs_server *nfss = NFS_SERVER(inode); int priority = 0; int err; - if (wbc->sync_mode == WB_SYNC_NONE && - NFS_SERVER(inode)->write_congested) - return 0; + trace_nfs_writepages(inode, wbc->range_start, wbc->range_end - wbc->range_start); + + /* Wait with writeback until write congestion eases */ + if (wbc->sync_mode == WB_SYNC_NONE && nfss->write_congested) { + err = wait_event_killable(nfss->write_congestion_wait, + nfss->write_congested == 0); + if (err) + goto out_err; + } nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGES); if (!(mntflags & NFS_MOUNT_WRITE_EAGER) || wbc->for_kupdate || - wbc->for_background || wbc->for_sync || wbc->for_reclaim) { + wbc->for_background || wbc->for_sync) { ioc = nfs_io_completion_alloc(GFP_KERNEL); if (ioc) nfs_io_completion_init(ioc, nfs_io_completion_commit, @@ -729,20 +679,26 @@ int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc) } do { + struct folio *folio = NULL; + nfs_pageio_init_write(&pgio, inode, priority, false, &nfs_async_write_completion_ops); pgio.pg_io_completion = ioc; - err = write_cache_pages(mapping, wbc, nfs_writepages_callback, - &pgio); + while ((folio = writeback_iter(mapping, wbc, folio, &err))) { + err = nfs_do_writepage(folio, wbc, &pgio); + folio_unlock(folio); + } pgio.pg_error = 0; nfs_pageio_complete(&pgio); + if (err == -EAGAIN && mntflags & NFS_MOUNT_SOFTERR) + break; } while (err < 0 && !nfs_error_is_fatal(err)); nfs_io_completion_put(ioc); - if (err < 0) - goto out_err; - return 0; + if (err > 0) + err = 0; out_err: + trace_nfs_writepages_done(inode, wbc->range_start, wbc->range_end - wbc->range_start, err); return err; } @@ -752,25 +708,18 @@ out_err: static void nfs_inode_add_request(struct nfs_page *req) { struct folio *folio = nfs_page_to_folio(req); - struct address_space *mapping = folio_file_mapping(folio); + struct address_space *mapping = folio->mapping; struct nfs_inode *nfsi = NFS_I(mapping->host); WARN_ON_ONCE(req->wb_this_page != req); /* Lock the request! */ nfs_lock_request(req); - - /* - * Swap-space should not get truncated. Hence no need to plug the race - * with invalidate/truncate. - */ - spin_lock(&mapping->private_lock); - if (likely(!folio_test_swapcache(folio))) { - set_bit(PG_MAPPED, &req->wb_flags); - folio_set_private(folio); - folio->private = req; - } - spin_unlock(&mapping->private_lock); + spin_lock(&mapping->i_private_lock); + set_bit(PG_MAPPED, &req->wb_flags); + folio_set_private(folio); + folio->private = req; + spin_unlock(&mapping->i_private_lock); atomic_long_inc(&nfsi->nrequests); /* this a head request for a page group - mark it as having an * extra reference so sub groups can follow suit. @@ -785,22 +734,28 @@ static void nfs_inode_add_request(struct nfs_page *req) */ static void nfs_inode_remove_request(struct nfs_page *req) { - if (nfs_page_group_sync_on_bit(req, PG_REMOVE)) { + struct nfs_inode *nfsi = NFS_I(nfs_page_to_inode(req)); + + nfs_page_group_lock(req); + if (nfs_page_group_sync_on_bit_locked(req, PG_REMOVE)) { struct folio *folio = nfs_page_to_folio(req->wb_head); - struct address_space *mapping = folio_file_mapping(folio); + struct address_space *mapping = folio->mapping; - spin_lock(&mapping->private_lock); - if (likely(folio && !folio_test_swapcache(folio))) { + spin_lock(&mapping->i_private_lock); + if (likely(folio)) { folio->private = NULL; folio_clear_private(folio); clear_bit(PG_MAPPED, &req->wb_head->wb_flags); } - spin_unlock(&mapping->private_lock); + spin_unlock(&mapping->i_private_lock); + + folio_end_dropbehind(folio); } + nfs_page_group_unlock(req); if (test_and_clear_bit(PG_INODE_REF, &req->wb_flags)) { + atomic_long_dec(&nfsi->nrequests); nfs_release_request(req); - atomic_long_dec(&NFS_I(nfs_page_to_inode(req))->nrequests); } } @@ -811,38 +766,6 @@ static void nfs_mark_request_dirty(struct nfs_page *req) filemap_dirty_folio(folio_mapping(folio), folio); } -/* - * nfs_page_search_commits_for_head_request_locked - * - * Search through commit lists on @inode for the head request for @folio. - * Must be called while holding the inode (which is cinfo) lock. - * - * Returns the head request if found, or NULL if not found. - */ -static struct nfs_page * -nfs_page_search_commits_for_head_request_locked(struct nfs_inode *nfsi, - struct folio *folio) -{ - struct nfs_page *freq, *t; - struct nfs_commit_info cinfo; - struct inode *inode = &nfsi->vfs_inode; - - nfs_init_cinfo_from_inode(&cinfo, inode); - - /* search through pnfs commit lists */ - freq = pnfs_search_commit_reqs(inode, &cinfo, folio); - if (freq) - return freq->wb_head; - - /* Linearly search the commit list for the correct request */ - list_for_each_entry_safe(freq, t, &cinfo.mds->list, wb_list) { - if (nfs_page_to_folio(freq) == folio) - return freq->wb_head; - } - - return NULL; -} - /** * nfs_request_add_commit_list_locked - add request to a commit list * @req: pointer to a struct nfs_page @@ -949,24 +872,22 @@ static void nfs_folio_clear_commit(struct folio *folio) long nr = folio_nr_pages(folio); node_stat_mod_folio(folio, NR_WRITEBACK, -nr); - wb_stat_mod(&inode_to_bdi(folio_file_mapping(folio)->host)->wb, + wb_stat_mod(&inode_to_bdi(folio->mapping->host)->wb, WB_WRITEBACK, -nr); } } /* Called holding the request lock on @req */ -static void -nfs_clear_request_commit(struct nfs_page *req) +static void nfs_clear_request_commit(struct nfs_commit_info *cinfo, + struct nfs_page *req) { if (test_bit(PG_CLEAN, &req->wb_flags)) { struct nfs_open_context *ctx = nfs_req_openctx(req); struct inode *inode = d_inode(ctx->dentry); - struct nfs_commit_info cinfo; - nfs_init_cinfo_from_inode(&cinfo, inode); mutex_lock(&NFS_I(inode)->commit_mutex); - if (!pnfs_clear_request_commit(req, &cinfo)) { - nfs_request_remove_commit_list(req, &cinfo); + if (!pnfs_clear_request_commit(req, cinfo)) { + nfs_request_remove_commit_list(req, cinfo); } mutex_unlock(&NFS_I(inode)->commit_mutex); nfs_folio_clear_commit(nfs_page_to_folio(req)); @@ -1010,7 +931,7 @@ static void nfs_write_completion(struct nfs_pgio_header *hdr) req->wb_nio = 0; memcpy(&req->wb_verf, &hdr->verf.verifier, sizeof(req->wb_verf)); nfs_mark_request_commit(req, hdr->lseg, &cinfo, - hdr->pgio_mirror_idx); + hdr->ds_commit_idx); goto next; } remove_req: @@ -1101,11 +1022,12 @@ static struct nfs_page *nfs_try_to_update_request(struct folio *folio, unsigned int end; int error; + trace_nfs_try_to_update_request(folio_inode(folio), offset, bytes); end = offset + bytes; req = nfs_lock_and_join_requests(folio); if (IS_ERR_OR_NULL(req)) - return req; + goto out; rqend = req->wb_offset + req->wb_bytes; /* @@ -1127,6 +1049,9 @@ static struct nfs_page *nfs_try_to_update_request(struct folio *folio, else req->wb_bytes = rqend - req->wb_offset; req->wb_nio = 0; +out: + trace_nfs_try_to_update_request_done(folio_inode(folio), offset, bytes, + PTR_ERR_OR_ZERO(req)); return req; out_flushme: /* @@ -1136,7 +1061,8 @@ out_flushme: */ nfs_mark_request_dirty(req); nfs_unlock_and_release_request(req); - error = nfs_wb_folio(folio_file_mapping(folio)->host, folio); + error = nfs_wb_folio(folio->mapping->host, folio); + trace_nfs_try_to_update_request_done(folio_inode(folio), offset, bytes, error); return (error < 0) ? ERR_PTR(error) : NULL; } @@ -1174,6 +1100,7 @@ static int nfs_writepage_setup(struct nfs_open_context *ctx, req = nfs_setup_write_request(ctx, folio, offset, count); if (IS_ERR(req)) return PTR_ERR(req); + trace_nfs_writepage_setup(req); /* Update file length */ nfs_grow_file(folio, offset, count); nfs_mark_uptodate(req); @@ -1212,7 +1139,7 @@ int nfs_flush_incompatible(struct file *file, struct folio *folio) nfs_release_request(req); if (!do_flush) return 0; - status = nfs_wb_folio(folio_file_mapping(folio)->host, folio); + status = nfs_wb_folio(folio->mapping->host, folio); } while (status == 0); return status; } @@ -1286,7 +1213,7 @@ out: */ static bool nfs_folio_write_uptodate(struct folio *folio, unsigned int pagelen) { - struct inode *inode = folio_file_mapping(folio)->host; + struct inode *inode = folio->mapping->host; struct nfs_inode *nfsi = NFS_I(inode); if (nfs_have_delegated_attributes(inode)) @@ -1307,7 +1234,7 @@ static bool is_whole_file_wrlock(struct file_lock *fl) { return fl->fl_start == 0 && fl->fl_end == OFFSET_MAX && - fl->fl_type == F_WRLCK; + lock_is_write(fl); } /* If we know the page is up to date, and we're not using byte range locks (or @@ -1325,12 +1252,15 @@ static int nfs_can_extend_write(struct file *file, struct folio *folio, struct file_lock_context *flctx = locks_inode_context(inode); struct file_lock *fl; int ret; + unsigned int mntflags = NFS_SERVER(inode)->flags; + if (mntflags & NFS_MOUNT_NO_ALIGNWRITE) + return 0; if (file->f_flags & O_DSYNC) return 0; if (!nfs_folio_write_uptodate(folio, pagelen)) return 0; - if (NFS_PROTO(inode)->have_delegation(inode, FMODE_WRITE)) + if (nfs_have_write_delegation(inode)) return 1; if (!flctx || (list_empty_careful(&flctx->flc_flock) && list_empty_careful(&flctx->flc_posix))) @@ -1341,13 +1271,13 @@ static int nfs_can_extend_write(struct file *file, struct folio *folio, spin_lock(&flctx->flc_lock); if (!list_empty(&flctx->flc_posix)) { fl = list_first_entry(&flctx->flc_posix, struct file_lock, - fl_list); + c.flc_list); if (is_whole_file_wrlock(fl)) ret = 1; } else if (!list_empty(&flctx->flc_flock)) { fl = list_first_entry(&flctx->flc_flock, struct file_lock, - fl_list); - if (fl->fl_type == F_WRLCK) + c.flc_list); + if (lock_is_write(fl)) ret = 1; } spin_unlock(&flctx->flc_lock); @@ -1364,28 +1294,35 @@ int nfs_update_folio(struct file *file, struct folio *folio, unsigned int offset, unsigned int count) { struct nfs_open_context *ctx = nfs_file_open_context(file); - struct address_space *mapping = folio_file_mapping(folio); + struct address_space *mapping = folio->mapping; struct inode *inode = mapping->host; unsigned int pagelen = nfs_folio_length(folio); int status = 0; nfs_inc_stats(inode, NFSIOS_VFSUPDATEPAGE); + trace_nfs_update_folio(inode, offset, count); + dprintk("NFS: nfs_update_folio(%pD2 %d@%lld)\n", file, count, - (long long)(folio_file_pos(folio) + offset)); + (long long)(folio_pos(folio) + offset)); if (!count) goto out; if (nfs_can_extend_write(file, folio, pagelen)) { - count = max(count + offset, pagelen); - offset = 0; + unsigned int end = count + offset; + + offset = round_down(offset, PAGE_SIZE); + if (end < pagelen) + end = min(round_up(end, PAGE_SIZE), pagelen); + count = end - offset; } status = nfs_writepage_setup(ctx, folio, offset, count); if (status < 0) nfs_set_pageerror(mapping); out: + trace_nfs_update_folio_done(inode, offset, count, status); dprintk("NFS: nfs_update_folio returns %d (isize %lld)\n", status, (long long)i_size_read(inode)); return status; @@ -1524,6 +1461,13 @@ void nfs_writeback_update_inode(struct nfs_pgio_header *hdr) struct nfs_fattr *fattr = &hdr->fattr; struct inode *inode = hdr->inode; + if (nfs_have_delegated_mtime(inode)) { + spin_lock(&inode->i_lock); + nfs_set_cache_invalid(inode, NFS_INO_INVALID_BLOCKS); + spin_unlock(&inode->i_lock); + return; + } + spin_lock(&inode->i_lock); nfs_writeback_check_extend(hdr, fattr); nfs_post_op_update_inode_force_wcc_locked(inode, fattr); @@ -1591,7 +1535,8 @@ static int nfs_writeback_done(struct rpc_task *task, /* Deal with the suid/sgid bit corner case */ if (nfs_should_remove_suid(inode)) { spin_lock(&inode->i_lock); - nfs_set_cache_invalid(inode, NFS_INO_INVALID_MODE); + nfs_set_cache_invalid(inode, NFS_INO_INVALID_MODE + | NFS_INO_REVAL_FORCED); spin_unlock(&inode->i_lock); } return 0; @@ -1656,7 +1601,7 @@ static int wait_on_commit(struct nfs_mds_commit_info *cinfo) !atomic_read(&cinfo->rpcs_out)); } -static void nfs_commit_begin(struct nfs_mds_commit_info *cinfo) +void nfs_commit_begin(struct nfs_mds_commit_info *cinfo) { atomic_inc(&cinfo->rpcs_out); } @@ -1680,7 +1625,8 @@ EXPORT_SYMBOL_GPL(nfs_commitdata_release); int nfs_initiate_commit(struct rpc_clnt *clnt, struct nfs_commit_data *data, const struct nfs_rpc_ops *nfs_ops, const struct rpc_call_ops *call_ops, - int how, int flags) + int how, int flags, + struct nfsd_file *localio) { struct rpc_task *task; int priority = flush_task_priority(how); @@ -1709,6 +1655,9 @@ int nfs_initiate_commit(struct rpc_clnt *clnt, struct nfs_commit_data *data, dprintk("NFS: initiated commit call\n"); + if (localio) + return nfs_local_commit(localio, data, call_ops, how); + task = rpc_run_task(&task_setup_data); if (IS_ERR(task)) return PTR_ERR(task); @@ -1808,6 +1757,7 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how, struct nfs_commit_info *cinfo) { struct nfs_commit_data *data; + struct nfsd_file *localio; unsigned short task_flags = 0; /* another commit raced with us */ @@ -1824,9 +1774,13 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how, nfs_init_commit(data, head, NULL, cinfo); if (NFS_SERVER(inode)->nfs_client->cl_minorversion) task_flags = RPC_TASK_MOVEABLE; + + localio = nfs_local_open_fh(NFS_SERVER(inode)->nfs_client, data->cred, + data->args.fh, &data->context->nfl, + data->context->mode); return nfs_initiate_commit(NFS_CLIENT(inode), data, NFS_PROTO(inode), data->mds_ops, how, - RPC_TASK_CRED_NOREF | task_flags); + RPC_TASK_CRED_NOREF | task_flags, localio); } /* @@ -1847,7 +1801,6 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data) struct nfs_page *req; int status = data->task.tk_status; struct nfs_commit_info cinfo; - struct nfs_server *nfss; struct folio *folio; while (!list_empty(&data->pages)) { @@ -1868,7 +1821,7 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data) nfs_mapping_set_error(folio, status); nfs_inode_remove_request(req); } - dprintk_cont(", error = %d\n", status); + dprintk(", error = %d\n", status); goto next; } @@ -1878,11 +1831,11 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data) /* We have a match */ if (folio) nfs_inode_remove_request(req); - dprintk_cont(" OK\n"); + dprintk(" OK\n"); goto next; } /* We have a mismatch. Write the page again */ - dprintk_cont(" mismatch\n"); + dprintk(" mismatch\n"); nfs_mark_request_dirty(req); atomic_long_inc(&NFS_I(data->inode)->redirtied_pages); next: @@ -1890,9 +1843,6 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data) /* Latency breaker */ cond_resched(); } - nfss = NFS_SERVER(data->inode); - if (atomic_long_read(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH) - nfss->write_congested = 0; nfs_init_cinfo(&cinfo, data->inode, data->dreq); nfs_commit_end(cinfo.mds); @@ -2068,6 +2018,7 @@ int nfs_wb_folio_cancel(struct inode *inode, struct folio *folio) * release it */ nfs_inode_remove_request(req); nfs_unlock_and_release_request(req); + folio_cancel_dirty(folio); } return ret; @@ -2083,17 +2034,17 @@ int nfs_wb_folio_cancel(struct inode *inode, struct folio *folio) */ int nfs_wb_folio(struct inode *inode, struct folio *folio) { - loff_t range_start = folio_file_pos(folio); - loff_t range_end = range_start + (loff_t)folio_size(folio) - 1; + loff_t range_start = folio_pos(folio); + size_t len = folio_size(folio); struct writeback_control wbc = { .sync_mode = WB_SYNC_ALL, .nr_to_write = 0, .range_start = range_start, - .range_end = range_end, + .range_end = range_start + len - 1, }; int ret; - trace_nfs_writeback_folio(inode, folio); + trace_nfs_writeback_folio(inode, range_start, len); for (;;) { folio_wait_writeback(folio); @@ -2111,7 +2062,7 @@ int nfs_wb_folio(struct inode *inode, struct folio *folio) goto out_error; } out_error: - trace_nfs_writeback_folio_done(inode, folio, ret); + trace_nfs_writeback_folio_done(inode, range_start, len, ret); return ret; } @@ -2127,13 +2078,17 @@ int nfs_migrate_folio(struct address_space *mapping, struct folio *dst, * that we can safely release the inode reference while holding * the folio lock. */ - if (folio_test_private(src)) - return -EBUSY; + if (folio_test_private(src)) { + if (mode == MIGRATE_SYNC) + nfs_wb_folio(src->mapping->host, src); + if (folio_test_private(src)) + return -EBUSY; + } - if (folio_test_fscache(src)) { + if (folio_test_private_2(src)) { /* [DEPRECATED] */ if (mode == MIGRATE_ASYNC) return -EBUSY; - folio_wait_fscache(src); + folio_wait_private_2(src); } return migrate_folio(mapping, dst, src, mode); |
