diff options
Diffstat (limited to 'fs/nfs')
90 files changed, 28520 insertions, 13835 deletions
diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig index 69d02cf8cf37..07932ce9246c 100644 --- a/fs/nfs/Kconfig +++ b/fs/nfs/Kconfig @@ -1,8 +1,11 @@ +# SPDX-License-Identifier: GPL-2.0-only config NFS_FS tristate "NFS client support" depends on INET && FILE_LOCKING && MULTIUSER + select CRC32 select LOCKD select SUNRPC + select NFS_COMMON select NFS_ACL_SUPPORT if NFS_V3_ACL help Choose Y here if you want to access files residing on other @@ -32,12 +35,12 @@ config NFS_FS config NFS_V2 tristate "NFS client support for NFS version 2" depends on NFS_FS - default y + default n help This option enables support for version 2 of the NFS protocol (RFC 1094) in the kernel's NFS client. - If unsure, say Y. + If unsure, say N. config NFS_V3 tristate "NFS client support for NFS version 3" @@ -74,7 +77,6 @@ config NFS_V3_ACL config NFS_V4 tristate "NFS client support for NFS version 4" depends on NFS_FS - select SUNRPC_GSS select KEYS help This option enables support for version 4 of the NFS protocol @@ -89,7 +91,7 @@ config NFS_V4 config NFS_SWAP bool "Provide swap over NFS support" default n - depends on NFS_FS + depends on NFS_FS && SWAP select SUNRPC_SWAP help This option enables swapon to work on files located on NFS mounts. @@ -125,8 +127,8 @@ config PNFS_BLOCK config PNFS_FLEXFILE_LAYOUT tristate - depends on NFS_V4_1 && NFS_V3 - default m + depends on NFS_V4_1 + default NFS_V4 config NFS_V4_1_IMPLEMENTATION_ID_DOMAIN string "NFSv4.1 Implementation ID Domain" @@ -163,13 +165,15 @@ config ROOT_NFS If you want your system to mount its root file system via NFS, choose Y here. This is common practice for managing systems without local permanent storage. For details, read - <file:Documentation/filesystems/nfs/nfsroot.txt>. + <file:Documentation/admin-guide/nfs/nfsroot.rst>. Most people say N here. config NFS_FSCACHE bool "Provide NFS client caching support" - depends on NFS_FS=m && FSCACHE || NFS_FS=y && FSCACHE=y + depends on NFS_FS + select NETFS_SUPPORT + select FSCACHE help Say Y here if you want NFS data to be cached locally on disc through the general filesystem cache manager @@ -193,5 +197,20 @@ config NFS_USE_KERNEL_DNS config NFS_DEBUG bool depends on NFS_FS && SUNRPC_DEBUG - select CRC32 default y + +config NFS_DISABLE_UDP_SUPPORT + bool "NFS: Disable NFS UDP protocol support" + depends on NFS_FS + default y + help + Choose Y here to disable the use of NFS over UDP. NFS over UDP + on modern networks (1Gb+) can lead to data corruption caused by + fragmentation during high loads. + +config NFS_V4_2_READ_PLUS + bool "NFS: Enable support for the NFSv4.2 READ_PLUS operation" + depends on NFS_V4_2 + default y + help + Choose Y here to enable use of the NFS v4.2 READ_PLUS operation. diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile index 1fb118902d57..9fb2f2cac87e 100644 --- a/fs/nfs/Makefile +++ b/fs/nfs/Makefile @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0 # # Makefile for the Linux nfs filesystem routines. # @@ -7,10 +8,12 @@ obj-$(CONFIG_NFS_FS) += nfs.o CFLAGS_nfstrace.o += -I$(src) nfs-y := client.o dir.o file.o getroot.o inode.o super.o \ io.o direct.o pagelist.o read.o symlink.o unlink.o \ - write.o namespace.o mount_clnt.o nfstrace.o export.o + write.o namespace.o mount_clnt.o nfstrace.o \ + export.o sysfs.o fs_context.o nfs-$(CONFIG_ROOT_NFS) += nfsroot.o nfs-$(CONFIG_SYSCTL) += sysctl.o -nfs-$(CONFIG_NFS_FSCACHE) += fscache.o fscache-index.o +nfs-$(CONFIG_NFS_FSCACHE) += fscache.o +nfs-$(CONFIG_NFS_LOCALIO) += localio.o obj-$(CONFIG_NFS_V2) += nfsv2.o nfsv2-y := nfs2super.o proc.o nfs2xdr.o @@ -28,7 +31,7 @@ nfsv4-y := nfs4proc.o nfs4xdr.o nfs4state.o nfs4renewd.o nfs4super.o nfs4file.o nfsv4-$(CONFIG_NFS_USE_LEGACY_DNS) += cache_lib.o nfsv4-$(CONFIG_SYSCTL) += nfs4sysctl.o nfsv4-$(CONFIG_NFS_V4_1) += pnfs.o pnfs_dev.o pnfs_nfs.o -nfsv4-$(CONFIG_NFS_V4_2) += nfs42proc.o +nfsv4-$(CONFIG_NFS_V4_2) += nfs42proc.o nfs42xattr.o obj-$(CONFIG_PNFS_FILE_LAYOUT) += filelayout/ obj-$(CONFIG_PNFS_BLOCK) += blocklayout/ diff --git a/fs/nfs/blocklayout/Makefile b/fs/nfs/blocklayout/Makefile index 3ca14c36d08b..7668a1bfb5fa 100644 --- a/fs/nfs/blocklayout/Makefile +++ b/fs/nfs/blocklayout/Makefile @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only # # Makefile for the pNFS block layout driver kernel module # diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index d8863a804b15..0e4c67373e4f 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -115,30 +115,13 @@ bl_submit_bio(struct bio *bio) return NULL; } -static struct bio * -bl_alloc_init_bio(int npg, struct block_device *bdev, sector_t disk_sector, - bio_end_io_t end_io, struct parallel_io *par) +static bool offset_in_map(u64 offset, struct pnfs_block_dev_map *map) { - struct bio *bio; - - npg = min(npg, BIO_MAX_PAGES); - bio = bio_alloc(GFP_NOIO, npg); - if (!bio && (current->flags & PF_MEMALLOC)) { - while (!bio && (npg /= 2)) - bio = bio_alloc(GFP_NOIO, npg); - } - - if (bio) { - bio->bi_iter.bi_sector = disk_sector; - bio->bi_bdev = bdev; - bio->bi_end_io = end_io; - bio->bi_private = par; - } - return bio; + return offset >= map->start && offset < map->start + map->len; } static struct bio * -do_add_page_to_bio(struct bio *bio, int npg, int rw, sector_t isect, +do_add_page_to_bio(struct bio *bio, int npg, enum req_op op, sector_t isect, struct page *page, struct pnfs_block_dev_map *map, struct pnfs_block_extent *be, bio_end_io_t end_io, struct parallel_io *par, unsigned int offset, int *len) @@ -148,7 +131,7 @@ do_add_page_to_bio(struct bio *bio, int npg, int rw, sector_t isect, u64 disk_addr, end; dprintk("%s: npg %d rw %d isect %llu offset %u len %d\n", __func__, - npg, rw, (unsigned long long)isect, offset, *len); + npg, (__force u32)op, (unsigned long long)isect, offset, *len); /* translate to device offset */ isect += be->be_v_offset; @@ -156,8 +139,8 @@ do_add_page_to_bio(struct bio *bio, int npg, int rw, sector_t isect, /* translate to physical disk offset */ disk_addr = (u64)isect << SECTOR_SHIFT; - if (disk_addr < map->start || disk_addr >= map->start + map->len) { - if (!dev->map(dev, disk_addr, map)) + if (!offset_in_map(disk_addr, map)) { + if (!dev->map(dev, disk_addr, map) || !offset_in_map(disk_addr, map)) return ERR_PTR(-EIO); bio = bl_submit_bio(bio); } @@ -166,16 +149,15 @@ do_add_page_to_bio(struct bio *bio, int npg, int rw, sector_t isect, /* limit length to what the device mapping allows */ end = disk_addr + *len; - if (end >= map->start + map->len) - *len = map->start + map->len - disk_addr; + if (end >= map->disk_offset + map->len) + *len = map->disk_offset + map->len - disk_addr; retry: if (!bio) { - bio = bl_alloc_init_bio(npg, map->bdev, - disk_addr >> SECTOR_SHIFT, end_io, par); - if (!bio) - return ERR_PTR(-ENOMEM); - bio_set_op_attrs(bio, rw, 0); + bio = bio_alloc(map->bdev, bio_max_segs(npg), op, GFP_NOIO); + bio->bi_iter.bi_sector = disk_addr >> SECTOR_SHIFT; + bio->bi_end_io = end_io; + bio->bi_private = par; } if (bio_add_page(bio, page, *len, offset) < *len) { bio = bl_submit_bio(bio); @@ -184,6 +166,29 @@ retry: return bio; } +static void bl_mark_devices_unavailable(struct nfs_pgio_header *header, bool rw) +{ + struct pnfs_block_layout *bl = BLK_LSEG2EXT(header->lseg); + size_t bytes_left = header->args.count; + sector_t isect, extent_length = 0; + struct pnfs_block_extent be; + + isect = header->args.offset >> SECTOR_SHIFT; + bytes_left += header->args.offset - (isect << SECTOR_SHIFT); + + while (bytes_left > 0) { + if (!ext_tree_lookup(bl, isect, &be, rw)) + return; + extent_length = be.be_length - (isect - be.be_f_offset); + nfs4_mark_deviceid_unavailable(be.be_device); + isect += extent_length; + if (bytes_left > extent_length << SECTOR_SHIFT) + bytes_left -= extent_length << SECTOR_SHIFT; + else + bytes_left = 0; + } +} + static void bl_end_io_read(struct bio *bio) { struct parallel_io *par = bio->bi_private; @@ -194,6 +199,7 @@ static void bl_end_io_read(struct bio *bio) if (!header->pnfs_error) header->pnfs_error = -EIO; pnfs_set_lo_fail(header->lseg); + bl_mark_devices_unavailable(header, false); } bio_put(bio); @@ -285,7 +291,7 @@ bl_read_pagelist(struct nfs_pgio_header *header) } else { bio = do_add_page_to_bio(bio, header->page_array.npages - i, - READ, + REQ_OP_READ, isect, pages[i], &map, &be, bl_end_io_read, par, pg_offset, &pg_len); @@ -323,6 +329,7 @@ static void bl_end_io_write(struct bio *bio) if (!header->pnfs_error) header->pnfs_error = -EIO; pnfs_set_lo_fail(header->lseg); + bl_mark_devices_unavailable(header, true); } bio_put(bio); put_parallel(par); @@ -413,9 +420,8 @@ bl_write_pagelist(struct nfs_pgio_header *header, int sync) pg_len = PAGE_SIZE; bio = do_add_page_to_bio(bio, header->page_array.npages - i, - WRITE, isect, pages[i], &map, &be, - bl_end_io_write, par, - 0, &pg_len); + REQ_OP_WRITE, isect, pages[i], &map, + &be, bl_end_io_write, par, 0, &pg_len); if (IS_ERR(bio)) { header->pnfs_error = PTR_ERR(bio); bio = NULL; @@ -446,7 +452,7 @@ static void bl_free_layout_hdr(struct pnfs_layout_hdr *lo) err = ext_tree_remove(bl, true, 0, LLONG_MAX); WARN_ON(err); - kfree(bl); + kfree_rcu(bl, bl_layout.plh_rcu); } static struct pnfs_layout_hdr *__bl_alloc_layout_hdr(struct inode *inode, @@ -552,6 +558,53 @@ static int decode_sector_number(__be32 **rp, sector_t *sp) return 0; } +static struct nfs4_deviceid_node * +bl_find_get_deviceid(struct nfs_server *server, + const struct nfs4_deviceid *id, const struct cred *cred, + gfp_t gfp_mask) +{ + struct nfs4_deviceid_node *node; + int err = -ENODEV; + +retry: + node = nfs4_find_get_deviceid(server, id, cred, gfp_mask); + if (!node) + return ERR_PTR(-ENODEV); + + /* + * Devices that are marked unavailable are left in the cache with a + * timeout to avoid sending GETDEVINFO after every LAYOUTGET, or + * constantly attempting to register the device. Once marked as + * unavailable they must be deleted and never reused. + */ + if (test_bit(NFS_DEVICEID_UNAVAILABLE, &node->flags)) { + unsigned long end = jiffies; + unsigned long start = end - PNFS_DEVICE_RETRY_TIMEOUT; + + if (!time_in_range(node->timestamp_unavailable, start, end)) { + /* Uncork subsequent GETDEVINFO operations for this device */ + nfs4_delete_deviceid(node->ld, node->nfs_client, id); + goto retry; + } + goto out_put; + } + + if (!bl_register_dev(container_of(node, struct pnfs_block_dev, node))) { + /* + * If we cannot register, treat this device as transient: + * Make a negative cache entry for the device + */ + nfs4_mark_deviceid_unavailable(node); + goto out_put; + } + + return node; + +out_put: + nfs4_put_deviceid_node(node); + return ERR_PTR(err); +} + static int bl_alloc_extent(struct xdr_stream *xdr, struct pnfs_layout_hdr *lo, struct layout_verification *lv, struct list_head *extents, @@ -573,16 +626,18 @@ bl_alloc_extent(struct xdr_stream *xdr, struct pnfs_layout_hdr *lo, memcpy(&id, p, NFS4_DEVICEID4_SIZE); p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE); - error = -EIO; - be->be_device = nfs4_find_get_deviceid(NFS_SERVER(lo->plh_inode), &id, + be->be_device = bl_find_get_deviceid(NFS_SERVER(lo->plh_inode), &id, lo->plh_lc_cred, gfp_mask); - if (!be->be_device) + if (IS_ERR(be->be_device)) { + error = PTR_ERR(be->be_device); goto out_free_be; + } /* * The next three values are read in as bytes, but stored in the * extent structure in 512-byte granularity. */ + error = -EIO; if (decode_sector_number(&p, &be->be_f_offset) < 0) goto out_put_deviceid; if (decode_sector_number(&p, &be->be_length) < 0) @@ -621,7 +676,7 @@ bl_alloc_lseg(struct pnfs_layout_hdr *lo, struct nfs4_layoutget_res *lgr, struct pnfs_layout_segment *lseg; struct xdr_buf buf; struct xdr_stream xdr; - struct page *scratch; + struct folio *scratch; int status, i; uint32_t count; __be32 *p; @@ -634,13 +689,13 @@ bl_alloc_lseg(struct pnfs_layout_hdr *lo, struct nfs4_layoutget_res *lgr, return ERR_PTR(-ENOMEM); status = -ENOMEM; - scratch = alloc_page(gfp_mask); + scratch = folio_alloc(gfp_mask, 0); if (!scratch) goto out; xdr_init_decode_pages(&xdr, &buf, lgr->layoutp->pages, lgr->layoutp->len); - xdr_set_scratch_buffer(&xdr, page_address(scratch), PAGE_SIZE); + xdr_set_scratch_folio(&xdr, scratch); status = -EIO; p = xdr_inline_decode(&xdr, 4); @@ -689,14 +744,20 @@ process_extents: } out_free_scratch: - __free_page(scratch); + folio_put(scratch); out: dprintk("%s returns %d\n", __func__, status); - if (status) { + switch (status) { + case -ENODEV: + /* Our extent block devices are unavailable */ + set_bit(NFS_LSEG_UNAVAILABLE, &lseg->pls_flags); + fallthrough; + case 0: + return lseg; + default: kfree(lseg); return ERR_PTR(status); } - return lseg; } static void @@ -798,6 +859,13 @@ bl_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) } pnfs_generic_pg_init_read(pgio, req); + + if (pgio->pg_lseg && + test_bit(NFS_LSEG_UNAVAILABLE, &pgio->pg_lseg->pls_flags)) { + pnfs_error_mark_layout_for_return(pgio->pg_inode, pgio->pg_lseg); + pnfs_set_lo_fail(pgio->pg_lseg); + nfs_pageio_reset_read_mds(pgio); + } } /* @@ -826,7 +894,7 @@ static u64 pnfs_num_cont_bytes(struct inode *inode, pgoff_t idx) end = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); if (end != inode->i_mapping->nrpages) { rcu_read_lock(); - end = page_cache_next_hole(mapping, idx + 1, ULONG_MAX); + end = page_cache_next_miss(mapping, idx + 1, ULONG_MAX); rcu_read_unlock(); } @@ -847,12 +915,19 @@ bl_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) } if (pgio->pg_dreq == NULL) - wb_size = pnfs_num_cont_bytes(pgio->pg_inode, - req->wb_index); + wb_size = pnfs_num_cont_bytes(pgio->pg_inode, req->wb_index); else - wb_size = nfs_dreq_bytes_left(pgio->pg_dreq); + wb_size = nfs_dreq_bytes_left(pgio->pg_dreq, req_offset(req)); pnfs_generic_pg_init_write(pgio, req, wb_size); + + if (pgio->pg_lseg && + test_bit(NFS_LSEG_UNAVAILABLE, &pgio->pg_lseg->pls_flags)) { + + pnfs_error_mark_layout_for_return(pgio->pg_inode, pgio->pg_lseg); + pnfs_set_lo_fail(pgio->pg_lseg); + nfs_pageio_reset_write_mds(pgio); + } } /* @@ -887,6 +962,7 @@ static struct pnfs_layoutdriver_type blocklayout_type = { .name = "LAYOUT_BLOCK_VOLUME", .owner = THIS_MODULE, .flags = PNFS_LAYOUTRET_ON_SETATTR | + PNFS_LAYOUTRET_ON_ERROR | PNFS_READ_WHOLE_PAGE, .read_pagelist = bl_read_pagelist, .write_pagelist = bl_write_pagelist, @@ -910,6 +986,7 @@ static struct pnfs_layoutdriver_type scsilayout_type = { .name = "LAYOUT_SCSI", .owner = THIS_MODULE, .flags = PNFS_LAYOUTRET_ON_SETATTR | + PNFS_LAYOUTRET_ON_ERROR | PNFS_READ_WHOLE_PAGE, .read_pagelist = bl_read_pagelist, .write_pagelist = bl_write_pagelist, @@ -967,6 +1044,7 @@ static void __exit nfs4blocklayout_exit(void) } MODULE_ALIAS("nfs-layouttype4-3"); +MODULE_ALIAS("nfs-layouttype4-5"); module_init(nfs4blocklayout_init); module_exit(nfs4blocklayout_exit); diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h index efc007f00742..6da40ca19570 100644 --- a/fs/nfs/blocklayout/blocklayout.h +++ b/fs/nfs/blocklayout/blocklayout.h @@ -92,10 +92,9 @@ struct pnfs_block_volume { }; struct pnfs_block_dev_map { - sector_t start; - sector_t len; - - sector_t disk_offset; + u64 start; + u64 len; + u64 disk_offset; struct block_device *bdev; }; @@ -105,20 +104,26 @@ struct pnfs_block_dev { u64 start; u64 len; + enum pnfs_block_volume_type type; u32 nr_children; struct pnfs_block_dev *children; u64 chunk_size; - struct block_device *bdev; + struct file *bdev_file; u64 disk_offset; + unsigned long flags; u64 pr_key; - bool pr_registered; bool (*map)(struct pnfs_block_dev *dev, u64 offset, struct pnfs_block_dev_map *map); }; +/* pnfs_block_dev flag bits */ +enum { + PNFS_BDEV_REGISTERED = 0, +}; + /* sector_t fields are all in 512-byte sectors */ struct pnfs_block_extent { union { @@ -173,6 +178,7 @@ struct bl_msg_hdr { #define BL_DEVICE_REQUEST_ERR 0x2 /* User level process fails */ /* dev.c */ +bool bl_register_dev(struct pnfs_block_dev *d); struct nfs4_deviceid_node *bl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, gfp_t gfp_mask); void bl_free_deviceid_node(struct nfs4_deviceid_node *d); diff --git a/fs/nfs/blocklayout/dev.c b/fs/nfs/blocklayout/dev.c index a69ef4e9c24c..ab76120705e2 100644 --- a/fs/nfs/blocklayout/dev.c +++ b/fs/nfs/blocklayout/dev.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2014-2016 Christoph Hellwig. */ @@ -9,12 +10,81 @@ #include <linux/pr.h> #include "blocklayout.h" +#include "../nfs4trace.h" #define NFSDBG_FACILITY NFSDBG_PNFS_LD +static void bl_unregister_scsi(struct pnfs_block_dev *dev) +{ + struct block_device *bdev = file_bdev(dev->bdev_file); + const struct pr_ops *ops = bdev->bd_disk->fops->pr_ops; + int status; + + status = ops->pr_register(bdev, dev->pr_key, 0, false); + if (status) + trace_bl_pr_key_unreg_err(bdev, dev->pr_key, status); + else + trace_bl_pr_key_unreg(bdev, dev->pr_key); +} + +static bool bl_register_scsi(struct pnfs_block_dev *dev) +{ + struct block_device *bdev = file_bdev(dev->bdev_file); + const struct pr_ops *ops = bdev->bd_disk->fops->pr_ops; + int status; + + if (test_and_set_bit(PNFS_BDEV_REGISTERED, &dev->flags)) + return true; + + status = ops->pr_register(bdev, 0, dev->pr_key, true); + if (status) { + trace_bl_pr_key_reg_err(bdev, dev->pr_key, status); + return false; + } + trace_bl_pr_key_reg(bdev, dev->pr_key); + return true; +} + +static void bl_unregister_dev(struct pnfs_block_dev *dev) +{ + u32 i; + + if (dev->nr_children) { + for (i = 0; i < dev->nr_children; i++) + bl_unregister_dev(&dev->children[i]); + return; + } + + if (dev->type == PNFS_BLOCK_VOLUME_SCSI && + test_and_clear_bit(PNFS_BDEV_REGISTERED, &dev->flags)) + bl_unregister_scsi(dev); +} + +bool bl_register_dev(struct pnfs_block_dev *dev) +{ + u32 i; + + if (dev->nr_children) { + for (i = 0; i < dev->nr_children; i++) { + if (!bl_register_dev(&dev->children[i])) { + while (i > 0) + bl_unregister_dev(&dev->children[--i]); + return false; + } + } + return true; + } + + if (dev->type == PNFS_BLOCK_VOLUME_SCSI) + return bl_register_scsi(dev); + return true; +} + static void bl_free_device(struct pnfs_block_dev *dev) { + bl_unregister_dev(dev); + if (dev->nr_children) { int i; @@ -22,19 +92,8 @@ bl_free_device(struct pnfs_block_dev *dev) bl_free_device(&dev->children[i]); kfree(dev->children); } else { - if (dev->pr_registered) { - const struct pr_ops *ops = - dev->bdev->bd_disk->fops->pr_ops; - int error; - - error = ops->pr_register(dev->bdev, dev->pr_key, 0, - false); - if (error) - pr_err("failed to unregister PR key.\n"); - } - - if (dev->bdev) - blkdev_put(dev->bdev, FMODE_READ | FMODE_WRITE); + if (dev->bdev_file) + fput(dev->bdev_file); } } @@ -168,7 +227,7 @@ static bool bl_map_simple(struct pnfs_block_dev *dev, u64 offset, map->start = dev->start; map->len = dev->len; map->disk_offset = dev->disk_offset; - map->bdev = dev->bdev; + map->bdev = file_bdev(dev->bdev_file); return true; } @@ -198,12 +257,13 @@ static bool bl_map_stripe(struct pnfs_block_dev *dev, u64 offset, struct pnfs_block_dev *child; u64 chunk; u32 chunk_idx; + u64 disk_chunk; u64 disk_offset; chunk = div_u64(offset, dev->chunk_size); - div_u64_rem(chunk, dev->nr_children, &chunk_idx); + disk_chunk = div_u64_rem(chunk, dev->nr_children, &chunk_idx); - if (chunk_idx > dev->nr_children) { + if (chunk_idx >= dev->nr_children) { dprintk("%s: invalid chunk idx %d (%lld/%lld)\n", __func__, chunk_idx, offset, dev->chunk_size); /* error, should not happen */ @@ -214,7 +274,7 @@ static bool bl_map_stripe(struct pnfs_block_dev *dev, u64 offset, offset = chunk * dev->chunk_size; /* disk offset of the stripe */ - disk_offset = div_u64(offset, dev->nr_children); + disk_offset = disk_chunk * dev->chunk_size; child = &dev->children[chunk_idx]; child->map(child, disk_offset, map); @@ -235,27 +295,26 @@ bl_parse_simple(struct nfs_server *server, struct pnfs_block_dev *d, struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask) { struct pnfs_block_volume *v = &volumes[idx]; - struct block_device *bdev; + struct file *bdev_file; dev_t dev; dev = bl_resolve_deviceid(server, v, gfp_mask); if (!dev) return -EIO; - bdev = blkdev_get_by_dev(dev, FMODE_READ | FMODE_WRITE, NULL); - if (IS_ERR(bdev)) { + bdev_file = bdev_file_open_by_dev(dev, BLK_OPEN_READ | BLK_OPEN_WRITE, + NULL, NULL); + if (IS_ERR(bdev_file)) { printk(KERN_WARNING "pNFS: failed to open device %d:%d (%ld)\n", - MAJOR(dev), MINOR(dev), PTR_ERR(bdev)); - return PTR_ERR(bdev); + MAJOR(dev), MINOR(dev), PTR_ERR(bdev_file)); + return PTR_ERR(bdev_file); } - d->bdev = bdev; - - - d->len = i_size_read(d->bdev->bd_inode); + d->bdev_file = bdev_file; + d->len = bdev_nr_bytes(file_bdev(bdev_file)); d->map = bl_map_simple; printk(KERN_INFO "pNFS: using block device %s\n", - d->bdev->bd_disk->disk_name); + file_bdev(bdev_file)->bd_disk->disk_name); return 0; } @@ -300,51 +359,26 @@ bl_validate_designator(struct pnfs_block_volume *v) } } -/* - * Try to open the udev path for the WWN. At least on Debian the udev - * by-id path will always point to the dm-multipath device if one exists. - */ -static struct block_device * -bl_open_udev_path(struct pnfs_block_volume *v) +static struct file * +bl_open_path(struct pnfs_block_volume *v, const char *prefix) { - struct block_device *bdev; + struct file *bdev_file; const char *devname; - devname = kasprintf(GFP_KERNEL, "/dev/disk/by-id/wwn-0x%*phN", - v->scsi.designator_len, v->scsi.designator); + devname = kasprintf(GFP_KERNEL, "/dev/disk/by-id/%s%*phN", + prefix, v->scsi.designator_len, v->scsi.designator); if (!devname) return ERR_PTR(-ENOMEM); - bdev = blkdev_get_by_path(devname, FMODE_READ | FMODE_WRITE, NULL); - if (IS_ERR(bdev)) { - pr_warn("pNFS: failed to open device %s (%ld)\n", - devname, PTR_ERR(bdev)); + bdev_file = bdev_file_open_by_path(devname, BLK_OPEN_READ | BLK_OPEN_WRITE, + NULL, NULL); + if (IS_ERR(bdev_file)) { + dprintk("failed to open device %s (%ld)\n", + devname, PTR_ERR(bdev_file)); } kfree(devname); - return bdev; -} - -/* - * Try to open the RH/Fedora specific dm-mpath udev path for this WWN, as the - * wwn- links will only point to the first discovered SCSI device there. - */ -static struct block_device * -bl_open_dm_mpath_udev_path(struct pnfs_block_volume *v) -{ - struct block_device *bdev; - const char *devname; - - devname = kasprintf(GFP_KERNEL, - "/dev/disk/by-id/dm-uuid-mpath-%d%*phN", - v->scsi.designator_type, - v->scsi.designator_len, v->scsi.designator); - if (!devname) - return ERR_PTR(-ENOMEM); - - bdev = blkdev_get_by_path(devname, FMODE_READ | FMODE_WRITE, NULL); - kfree(devname); - return bdev; + return bdev_file; } static int @@ -354,45 +388,50 @@ bl_parse_scsi(struct nfs_server *server, struct pnfs_block_dev *d, struct pnfs_block_volume *v = &volumes[idx]; struct block_device *bdev; const struct pr_ops *ops; + struct file *bdev_file; int error; if (!bl_validate_designator(v)) return -EINVAL; - bdev = bl_open_dm_mpath_udev_path(v); - if (IS_ERR(bdev)) - bdev = bl_open_udev_path(v); - if (IS_ERR(bdev)) - return PTR_ERR(bdev); - d->bdev = bdev; + /* + * Try to open the RH/Fedora specific dm-mpath udev path first, as the + * wwn- links will only point to the first discovered SCSI device there. + * On other distributions like Debian, the default SCSI by-id path will + * point to the dm-multipath device if one exists. + */ + bdev_file = bl_open_path(v, "dm-uuid-mpath-0x"); + if (IS_ERR(bdev_file)) + bdev_file = bl_open_path(v, "wwn-0x"); + if (IS_ERR(bdev_file)) + bdev_file = bl_open_path(v, "nvme-eui."); + if (IS_ERR(bdev_file)) { + pr_warn("pNFS: no device found for volume %*phN\n", + v->scsi.designator_len, v->scsi.designator); + return PTR_ERR(bdev_file); + } + d->bdev_file = bdev_file; + bdev = file_bdev(bdev_file); - d->len = i_size_read(d->bdev->bd_inode); + d->len = bdev_nr_bytes(bdev); d->map = bl_map_simple; d->pr_key = v->scsi.pr_key; - pr_info("pNFS: using block device %s (reservation key 0x%llx)\n", - d->bdev->bd_disk->disk_name, d->pr_key); + if (d->len == 0) + return -ENODEV; - ops = d->bdev->bd_disk->fops->pr_ops; + ops = bdev->bd_disk->fops->pr_ops; if (!ops) { pr_err("pNFS: block device %s does not support reservations.", - d->bdev->bd_disk->disk_name); + bdev->bd_disk->disk_name); error = -EINVAL; goto out_blkdev_put; } - error = ops->pr_register(d->bdev, 0, d->pr_key, true); - if (error) { - pr_err("pNFS: failed to register key for block device %s.", - d->bdev->bd_disk->disk_name); - goto out_blkdev_put; - } - - d->pr_registered = true; return 0; out_blkdev_put: - blkdev_put(d->bdev, FMODE_READ | FMODE_WRITE); + fput(d->bdev_file); return error; } @@ -421,7 +460,7 @@ bl_parse_concat(struct nfs_server *server, struct pnfs_block_dev *d, int ret, i; d->children = kcalloc(v->concat.volumes_count, - sizeof(struct pnfs_block_dev), GFP_KERNEL); + sizeof(struct pnfs_block_dev), gfp_mask); if (!d->children) return -ENOMEM; @@ -450,7 +489,7 @@ bl_parse_stripe(struct nfs_server *server, struct pnfs_block_dev *d, int ret, i; d->children = kcalloc(v->stripe.volumes_count, - sizeof(struct pnfs_block_dev), GFP_KERNEL); + sizeof(struct pnfs_block_dev), gfp_mask); if (!d->children) return -ENOMEM; @@ -474,7 +513,9 @@ static int bl_parse_deviceid(struct nfs_server *server, struct pnfs_block_dev *d, struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask) { - switch (volumes[idx].type) { + d->type = volumes[idx].type; + + switch (d->type) { case PNFS_BLOCK_VOLUME_SIMPLE: return bl_parse_simple(server, d, volumes, idx, gfp_mask); case PNFS_BLOCK_VOLUME_SLICE: @@ -486,7 +527,7 @@ bl_parse_deviceid(struct nfs_server *server, struct pnfs_block_dev *d, case PNFS_BLOCK_VOLUME_SCSI: return bl_parse_scsi(server, d, volumes, idx, gfp_mask); default: - dprintk("unsupported volume type: %d\n", volumes[idx].type); + dprintk("unsupported volume type: %d\n", d->type); return -EIO; } } @@ -500,16 +541,16 @@ bl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, struct pnfs_block_dev *top; struct xdr_stream xdr; struct xdr_buf buf; - struct page *scratch; + struct folio *scratch; int nr_volumes, ret, i; __be32 *p; - scratch = alloc_page(gfp_mask); + scratch = folio_alloc(gfp_mask, 0); if (!scratch) goto out; xdr_init_decode_pages(&xdr, &buf, pdev->pages, pdev->pglen); - xdr_set_scratch_buffer(&xdr, page_address(scratch), PAGE_SIZE); + xdr_set_scratch_folio(&xdr, scratch); p = xdr_inline_decode(&xdr, sizeof(__be32)); if (!p) @@ -532,19 +573,16 @@ bl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, goto out_free_volumes; ret = bl_parse_deviceid(server, top, volumes, nr_volumes - 1, gfp_mask); - if (ret) { - bl_free_device(top); - kfree(top); - goto out_free_volumes; - } node = &top->node; nfs4_init_deviceid_node(node, server, &pdev->dev_id); + if (ret) + nfs4_mark_deviceid_unavailable(node); out_free_volumes: kfree(volumes); out_free_scratch: - __free_page(scratch); + folio_put(scratch); out: return node; } diff --git a/fs/nfs/blocklayout/extent_tree.c b/fs/nfs/blocklayout/extent_tree.c index c85fbfd2d0d9..315949a7e92d 100644 --- a/fs/nfs/blocklayout/extent_tree.c +++ b/fs/nfs/blocklayout/extent_tree.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2014-2016 Christoph Hellwig. */ @@ -5,6 +6,7 @@ #include <linux/vmalloc.h> #include "blocklayout.h" +#include "../nfs4trace.h" #define NFSDBG_FACILITY NFSDBG_PNFS_LD @@ -519,10 +521,71 @@ static __be32 *encode_scsi_range(struct pnfs_block_extent *be, __be32 *p) return xdr_encode_hyper(p, be->be_length << SECTOR_SHIFT); } -static int ext_tree_encode_commit(struct pnfs_block_layout *bl, __be32 *p, +/** + * ext_tree_try_encode_commit - try to encode all extents into the buffer + * @bl: pointer to the layout + * @p: pointer to the output buffer + * @buffer_size: size of the output buffer + * @count: output pointer to the number of encoded extents + * @lastbyte: output pointer to the last written byte + * + * Return values: + * %0: Success, all required extents encoded, outputs are valid + * %-ENOSPC: Buffer too small, nothing encoded, outputs are invalid + */ +static int +ext_tree_try_encode_commit(struct pnfs_block_layout *bl, __be32 *p, size_t buffer_size, size_t *count, __u64 *lastbyte) { struct pnfs_block_extent *be; + + spin_lock(&bl->bl_ext_lock); + for (be = ext_tree_first(&bl->bl_ext_rw); be; be = ext_tree_next(be)) { + if (be->be_state != PNFS_BLOCK_INVALID_DATA || + be->be_tag != EXTENT_WRITTEN) + continue; + + (*count)++; + if (ext_tree_layoutupdate_size(bl, *count) > buffer_size) { + spin_unlock(&bl->bl_ext_lock); + return -ENOSPC; + } + } + for (be = ext_tree_first(&bl->bl_ext_rw); be; be = ext_tree_next(be)) { + if (be->be_state != PNFS_BLOCK_INVALID_DATA || + be->be_tag != EXTENT_WRITTEN) + continue; + + if (bl->bl_scsi_layout) + p = encode_scsi_range(be, p); + else + p = encode_block_extent(be, p); + be->be_tag = EXTENT_COMMITTING; + } + *lastbyte = (bl->bl_lwb != 0) ? bl->bl_lwb - 1 : U64_MAX; + bl->bl_lwb = 0; + spin_unlock(&bl->bl_ext_lock); + + return 0; +} + +/** + * ext_tree_encode_commit - encode as much as possible extents into the buffer + * @bl: pointer to the layout + * @p: pointer to the output buffer + * @buffer_size: size of the output buffer + * @count: output pointer to the number of encoded extents + * @lastbyte: output pointer to the last written byte + * + * Return values: + * %0: Success, all required extents encoded, outputs are valid + * %-ENOSPC: Buffer too small, some extents are encoded, outputs are valid + */ +static int +ext_tree_encode_commit(struct pnfs_block_layout *bl, __be32 *p, + size_t buffer_size, size_t *count, __u64 *lastbyte) +{ + struct pnfs_block_extent *be, *be_prev; int ret = 0; spin_lock(&bl->bl_ext_lock); @@ -533,9 +596,9 @@ static int ext_tree_encode_commit(struct pnfs_block_layout *bl, __be32 *p, (*count)++; if (ext_tree_layoutupdate_size(bl, *count) > buffer_size) { - /* keep counting.. */ + (*count)--; ret = -ENOSPC; - continue; + break; } if (bl->bl_scsi_layout) @@ -543,14 +606,30 @@ static int ext_tree_encode_commit(struct pnfs_block_layout *bl, __be32 *p, else p = encode_block_extent(be, p); be->be_tag = EXTENT_COMMITTING; + be_prev = be; + } + if (!ret) { + *lastbyte = (bl->bl_lwb != 0) ? bl->bl_lwb - 1 : U64_MAX; + bl->bl_lwb = 0; + } else { + *lastbyte = be_prev->be_f_offset + be_prev->be_length; + *lastbyte <<= SECTOR_SHIFT; + *lastbyte -= 1; } - *lastbyte = bl->bl_lwb - 1; - bl->bl_lwb = 0; spin_unlock(&bl->bl_ext_lock); return ret; } +/** + * ext_tree_prepare_commit - encode extents that need to be committed + * @arg: layout commit data + * + * Return values: + * %0: Success, all required extents are encoded + * %-ENOSPC: Some extents are encoded, but not all, due to RPC size limit + * %-ENOMEM: Out of memory, extents not encoded + */ int ext_tree_prepare_commit(struct nfs4_layoutcommit_args *arg) { @@ -559,20 +638,18 @@ ext_tree_prepare_commit(struct nfs4_layoutcommit_args *arg) __be32 *start_p; int ret; - dprintk("%s enter\n", __func__); - arg->layoutupdate_page = alloc_page(GFP_NOFS); if (!arg->layoutupdate_page) return -ENOMEM; start_p = page_address(arg->layoutupdate_page); arg->layoutupdate_pages = &arg->layoutupdate_page; -retry: - ret = ext_tree_encode_commit(bl, start_p + 1, buffer_size, &count, &arg->lastbytewritten); + ret = ext_tree_try_encode_commit(bl, start_p + 1, buffer_size, + &count, &arg->lastbytewritten); if (unlikely(ret)) { ext_tree_free_commitdata(arg, buffer_size); - buffer_size = ext_tree_layoutupdate_size(bl, count); + buffer_size = NFS_SERVER(arg->inode)->wsize; count = 0; arg->layoutupdate_pages = @@ -581,13 +658,14 @@ retry: if (!arg->layoutupdate_pages) return -ENOMEM; - start_p = __vmalloc(buffer_size, GFP_NOFS, PAGE_KERNEL); + start_p = __vmalloc(buffer_size, GFP_NOFS); if (!start_p) { kfree(arg->layoutupdate_pages); return -ENOMEM; } - goto retry; + ret = ext_tree_encode_commit(bl, start_p + 1, buffer_size, + &count, &arg->lastbytewritten); } *start_p = cpu_to_be32(count); @@ -606,8 +684,9 @@ retry: } } - dprintk("%s found %zu ranges\n", __func__, count); - return 0; + trace_bl_ext_tree_prepare_commit(ret, count, + arg->lastbytewritten, !!ret); + return ret; } void diff --git a/fs/nfs/blocklayout/rpc_pipefs.c b/fs/nfs/blocklayout/rpc_pipefs.c index 9fb067a6f7e0..d526f5ba7887 100644 --- a/fs/nfs/blocklayout/rpc_pipefs.c +++ b/fs/nfs/blocklayout/rpc_pipefs.c @@ -27,7 +27,6 @@ */ #include <linux/module.h> -#include <linux/genhd.h> #include <linux/blkdev.h> #include "blocklayout.h" @@ -76,10 +75,10 @@ bl_resolve_deviceid(struct nfs_server *server, struct pnfs_block_volume *b, msg->len = sizeof(*bl_msg) + b->simple.len; msg->data = kzalloc(msg->len, gfp_mask); if (!msg->data) - goto out_free_data; + goto out_unlock; bl_msg = msg->data; - bl_msg->type = BL_DEVICE_MOUNT, + bl_msg->type = BL_DEVICE_MOUNT; bl_msg->totallen = b->simple.len; nfs4_encode_simple(msg->data + sizeof(*bl_msg), b); @@ -142,24 +141,18 @@ static const struct rpc_pipe_ops bl_upcall_ops = { .destroy_msg = bl_pipe_destroy_msg, }; -static struct dentry *nfs4blocklayout_register_sb(struct super_block *sb, +static int nfs4blocklayout_register_sb(struct super_block *sb, struct rpc_pipe *pipe) { - struct dentry *dir, *dentry; + struct dentry *dir; + int err; dir = rpc_d_lookup_sb(sb, NFS_PIPE_DIRNAME); if (dir == NULL) - return ERR_PTR(-ENOENT); - dentry = rpc_mkpipe_dentry(dir, "blocklayout", NULL, pipe); + return -ENOENT; + err = rpc_mkpipe_dentry(dir, "blocklayout", NULL, pipe); dput(dir); - return dentry; -} - -static void nfs4blocklayout_unregister_sb(struct super_block *sb, - struct rpc_pipe *pipe) -{ - if (pipe->dentry) - rpc_unlink(pipe->dentry); + return err; } static int rpc_pipefs_event(struct notifier_block *nb, unsigned long event, @@ -168,7 +161,6 @@ static int rpc_pipefs_event(struct notifier_block *nb, unsigned long event, struct super_block *sb = ptr; struct net *net = sb->s_fs_info; struct nfs_net *nn = net_generic(net, nfs_net_id); - struct dentry *dentry; int ret = 0; if (!try_module_get(THIS_MODULE)) @@ -181,16 +173,10 @@ static int rpc_pipefs_event(struct notifier_block *nb, unsigned long event, switch (event) { case RPC_PIPEFS_MOUNT: - dentry = nfs4blocklayout_register_sb(sb, nn->bl_device_pipe); - if (IS_ERR(dentry)) { - ret = PTR_ERR(dentry); - break; - } - nn->bl_device_pipe->dentry = dentry; + ret = nfs4blocklayout_register_sb(sb, nn->bl_device_pipe); break; case RPC_PIPEFS_UMOUNT: - if (nn->bl_device_pipe->dentry) - nfs4blocklayout_unregister_sb(sb, nn->bl_device_pipe); + rpc_unlink(nn->bl_device_pipe); break; default: ret = -ENOTSUPP; @@ -204,18 +190,17 @@ static struct notifier_block nfs4blocklayout_block = { .notifier_call = rpc_pipefs_event, }; -static struct dentry *nfs4blocklayout_register_net(struct net *net, - struct rpc_pipe *pipe) +static int nfs4blocklayout_register_net(struct net *net, struct rpc_pipe *pipe) { struct super_block *pipefs_sb; - struct dentry *dentry; + int ret; pipefs_sb = rpc_get_sb_net(net); if (!pipefs_sb) - return NULL; - dentry = nfs4blocklayout_register_sb(pipefs_sb, pipe); + return 0; + ret = nfs4blocklayout_register_sb(pipefs_sb, pipe); rpc_put_sb_net(net); - return dentry; + return ret; } static void nfs4blocklayout_unregister_net(struct net *net, @@ -225,7 +210,7 @@ static void nfs4blocklayout_unregister_net(struct net *net, pipefs_sb = rpc_get_sb_net(net); if (pipefs_sb) { - nfs4blocklayout_unregister_sb(pipefs_sb, pipe); + rpc_unlink(pipe); rpc_put_sb_net(net); } } @@ -233,20 +218,17 @@ static void nfs4blocklayout_unregister_net(struct net *net, static int nfs4blocklayout_net_init(struct net *net) { struct nfs_net *nn = net_generic(net, nfs_net_id); - struct dentry *dentry; + int err; mutex_init(&nn->bl_mutex); init_waitqueue_head(&nn->bl_wq); nn->bl_device_pipe = rpc_mkpipe_data(&bl_upcall_ops, 0); if (IS_ERR(nn->bl_device_pipe)) return PTR_ERR(nn->bl_device_pipe); - dentry = nfs4blocklayout_register_net(net, nn->bl_device_pipe); - if (IS_ERR(dentry)) { + err = nfs4blocklayout_register_net(net, nn->bl_device_pipe); + if (unlikely(err)) rpc_destroy_pipe_data(nn->bl_device_pipe); - return PTR_ERR(dentry); - } - nn->bl_device_pipe->dentry = dentry; - return 0; + return err; } static void nfs4blocklayout_net_exit(struct net *net) diff --git a/fs/nfs/cache_lib.c b/fs/nfs/cache_lib.c index 2ae676f93e6b..ef6729568432 100644 --- a/fs/nfs/cache_lib.c +++ b/fs/nfs/cache_lib.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/nfs/cache_lib.c * @@ -66,7 +67,7 @@ out: */ void nfs_cache_defer_req_put(struct nfs_cache_defer_req *dreq) { - if (atomic_dec_and_test(&dreq->count)) + if (refcount_dec_and_test(&dreq->count)) kfree(dreq); } @@ -86,7 +87,7 @@ static struct cache_deferred_req *nfs_dns_cache_defer(struct cache_req *req) dreq = container_of(req, struct nfs_cache_defer_req, req); dreq->deferred_req.revisit = nfs_dns_cache_revisit; - atomic_inc(&dreq->count); + refcount_inc(&dreq->count); return &dreq->deferred_req; } @@ -98,7 +99,7 @@ struct nfs_cache_defer_req *nfs_cache_defer_req_alloc(void) dreq = kzalloc(sizeof(*dreq), GFP_KERNEL); if (dreq) { init_completion(&dreq->completion); - atomic_set(&dreq->count, 1); + refcount_set(&dreq->count, 1); dreq->req.defer = nfs_dns_cache_defer; } return dreq; diff --git a/fs/nfs/cache_lib.h b/fs/nfs/cache_lib.h index 4116d2c3f52f..220ee409abc4 100644 --- a/fs/nfs/cache_lib.h +++ b/fs/nfs/cache_lib.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Helper routines for the NFS client caches * @@ -15,7 +16,7 @@ struct nfs_cache_defer_req { struct cache_req req; struct cache_deferred_req deferred_req; struct completion completion; - atomic_t count; + refcount_t count; }; extern int nfs_cache_upcall(struct cache_detail *cd, char *entry_name); diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index 34323877ec13..c8b837006bb2 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/nfs/callback.c * @@ -16,7 +17,6 @@ #include <linux/errno.h> #include <linux/mutex.h> #include <linux/freezer.h> -#include <linux/kthread.h> #include <linux/sunrpc/svcauth_gss.h> #include <linux/sunrpc/bc_xprt.h> @@ -40,23 +40,26 @@ static struct svc_program nfs4_callback_program; static int nfs4_callback_up_net(struct svc_serv *serv, struct net *net) { + const struct cred *cred = current_cred(); int ret; struct nfs_net *nn = net_generic(net, nfs_net_id); - ret = svc_create_xprt(serv, "tcp", net, PF_INET, - nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS); + ret = svc_xprt_create(serv, "tcp", net, PF_INET, + nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS, + cred); if (ret <= 0) goto out_err; nn->nfs_callback_tcpport = ret; - dprintk("NFS: Callback listener port = %u (af %u, net %p)\n", - nn->nfs_callback_tcpport, PF_INET, net); + dprintk("NFS: Callback listener port = %u (af %u, net %x)\n", + nn->nfs_callback_tcpport, PF_INET, net->ns.inum); - ret = svc_create_xprt(serv, "tcp", net, PF_INET6, - nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS); + ret = svc_xprt_create(serv, "tcp", net, PF_INET6, + nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS, + cred); if (ret > 0) { nn->nfs_callback_tcpport6 = ret; - dprintk("NFS: Callback listener port = %u (af %u, net %p)\n", - nn->nfs_callback_tcpport6, PF_INET6, net); + dprintk("NFS: Callback listener port = %u (af %u, net %x)\n", + nn->nfs_callback_tcpport6, PF_INET6, net->ns.inum); } else if (ret != -EAFNOSUPPORT) goto out_err; return 0; @@ -71,72 +74,20 @@ out_err: static int nfs4_callback_svc(void *vrqstp) { - int err; struct svc_rqst *rqstp = vrqstp; + svc_thread_init_status(rqstp, 0); + set_freezable(); - while (!kthread_freezable_should_stop(NULL)) { + while (!svc_thread_should_stop(rqstp)) + svc_recv(rqstp); - if (signal_pending(current)) - flush_signals(current); - /* - * Listen for a request on the socket - */ - err = svc_recv(rqstp, MAX_SCHEDULE_TIMEOUT); - if (err == -EAGAIN || err == -EINTR) - continue; - svc_process(rqstp); - } svc_exit_thread(rqstp); - module_put_and_exit(0); return 0; } #if defined(CONFIG_NFS_V4_1) -/* - * The callback service for NFSv4.1 callbacks - */ -static int -nfs41_callback_svc(void *vrqstp) -{ - struct svc_rqst *rqstp = vrqstp; - struct svc_serv *serv = rqstp->rq_server; - struct rpc_rqst *req; - int error; - DEFINE_WAIT(wq); - - set_freezable(); - - while (!kthread_freezable_should_stop(NULL)) { - - if (signal_pending(current)) - flush_signals(current); - - prepare_to_wait(&serv->sv_cb_waitq, &wq, TASK_INTERRUPTIBLE); - spin_lock_bh(&serv->sv_cb_lock); - if (!list_empty(&serv->sv_cb_list)) { - req = list_first_entry(&serv->sv_cb_list, - struct rpc_rqst, rq_bc_list); - list_del(&req->rq_bc_list); - spin_unlock_bh(&serv->sv_cb_lock); - finish_wait(&serv->sv_cb_waitq, &wq); - dprintk("Invoking bc_svc_process()\n"); - error = bc_svc_process(serv, req, rqstp); - dprintk("bc_svc_process() returned w/ error code= %d\n", - error); - } else { - spin_unlock_bh(&serv->sv_cb_lock); - if (!kthread_should_stop()) - schedule(); - finish_wait(&serv->sv_cb_waitq, &wq); - } - } - svc_exit_thread(rqstp); - module_put_and_exit(0); - return 0; -} - static inline void nfs_callback_bc_serv(u32 minorversion, struct rpc_xprt *xprt, struct svc_serv *serv) { @@ -165,12 +116,12 @@ static int nfs_callback_start_svc(int minorversion, struct rpc_xprt *xprt, if (nrservs < NFS4_MIN_NR_CALLBACK_THREADS) nrservs = NFS4_MIN_NR_CALLBACK_THREADS; - if (serv->sv_nrthreads-1 == nrservs) + if (serv->sv_nrthreads == nrservs) return 0; - ret = serv->sv_ops->svo_setup(serv, NULL, nrservs); + ret = svc_set_num_threads(serv, NULL, nrservs); if (ret) { - serv->sv_ops->svo_setup(serv, NULL, 0); + svc_set_num_threads(serv, NULL, 0); return ret; } dprintk("nfs_callback_up: service started\n"); @@ -184,8 +135,8 @@ static void nfs_callback_down_net(u32 minorversion, struct svc_serv *serv, struc if (--nn->cb_users[minorversion]) return; - dprintk("NFS: destroy per-net callback data; net=%p\n", net); - svc_shutdown_net(serv, net); + dprintk("NFS: destroy per-net callback data; net=%x\n", net->ns.inum); + svc_xprt_destroy_all(serv, net, false); } static int nfs_callback_up_net(int minorversion, struct svc_serv *serv, @@ -197,88 +148,46 @@ static int nfs_callback_up_net(int minorversion, struct svc_serv *serv, if (nn->cb_users[minorversion]++) return 0; - dprintk("NFS: create per-net callback data; net=%p\n", net); + dprintk("NFS: create per-net callback data; net=%x\n", net->ns.inum); ret = svc_bind(serv, net); if (ret < 0) { printk(KERN_WARNING "NFS: bind callback service failed\n"); - goto err_bind; + goto err; } - ret = -EPROTONOSUPPORT; + ret = 0; if (!IS_ENABLED(CONFIG_NFS_V4_1) || minorversion == 0) ret = nfs4_callback_up_net(serv, net); - else if (xprt->ops->bc_up) - ret = xprt->ops->bc_up(serv, net); + else if (xprt->ops->bc_setup) + set_bc_enabled(serv); + else + ret = -EPROTONOSUPPORT; if (ret < 0) { printk(KERN_ERR "NFS: callback service start failed\n"); - goto err_socks; + goto err; } return 0; -err_socks: - svc_rpcb_cleanup(serv, net); -err_bind: +err: nn->cb_users[minorversion]--; dprintk("NFS: Couldn't create callback socket: err = %d; " - "net = %p\n", ret, net); + "net = %x\n", ret, net->ns.inum); return ret; } -static struct svc_serv_ops nfs40_cb_sv_ops = { - .svo_function = nfs4_callback_svc, - .svo_enqueue_xprt = svc_xprt_do_enqueue, - .svo_setup = svc_set_num_threads_sync, - .svo_module = THIS_MODULE, -}; -#if defined(CONFIG_NFS_V4_1) -static struct svc_serv_ops nfs41_cb_sv_ops = { - .svo_function = nfs41_callback_svc, - .svo_enqueue_xprt = svc_xprt_do_enqueue, - .svo_setup = svc_set_num_threads_sync, - .svo_module = THIS_MODULE, -}; - -static struct svc_serv_ops *nfs4_cb_sv_ops[] = { - [0] = &nfs40_cb_sv_ops, - [1] = &nfs41_cb_sv_ops, -}; -#else -static struct svc_serv_ops *nfs4_cb_sv_ops[] = { - [0] = &nfs40_cb_sv_ops, - [1] = NULL, -}; -#endif - static struct svc_serv *nfs_callback_create_svc(int minorversion) { struct nfs_callback_data *cb_info = &nfs_callback_info[minorversion]; + int (*threadfn)(void *data); struct svc_serv *serv; - struct svc_serv_ops *sv_ops; /* * Check whether we're already up and running. */ - if (cb_info->serv) { - /* - * Note: increase service usage, because later in case of error - * svc_destroy() will be called. - */ - svc_get(cb_info->serv); + if (cb_info->serv) return cb_info->serv; - } - - switch (minorversion) { - case 0: - sv_ops = nfs4_cb_sv_ops[0]; - break; - default: - sv_ops = nfs4_cb_sv_ops[1]; - } - - if (sv_ops == NULL) - return ERR_PTR(-ENOTSUPP); /* * Sanity check: if there's no task, @@ -288,16 +197,18 @@ static struct svc_serv *nfs_callback_create_svc(int minorversion) printk(KERN_WARNING "nfs_callback_create_svc: no kthread, %d users??\n", cb_info->users); - serv = svc_create_pooled(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE, sv_ops); + threadfn = nfs4_callback_svc; +#if !defined(CONFIG_NFS_V4_1) + if (minorversion) + return ERR_PTR(-ENOTSUPP); +#endif + serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE, + threadfn); if (!serv) { printk(KERN_ERR "nfs_callback_create_svc: create service failed\n"); return ERR_PTR(-ENOMEM); } cb_info->serv = serv; - /* As there is only one thread we need to over-ride the - * default maximum of 80 connections - */ - serv->sv_maxconn = 1024; dprintk("nfs_callback_create_svc: service created\n"); return serv; } @@ -329,16 +240,11 @@ int nfs_callback_up(u32 minorversion, struct rpc_xprt *xprt) goto err_start; cb_info->users++; - /* - * svc_create creates the svc_serv with sv_nrthreads == 1, and then - * svc_prepare_thread increments that. So we need to call svc_destroy - * on both success and failure so that the refcount is 1 when the - * thread exits. - */ err_net: - if (!cb_info->users) - cb_info->serv = NULL; - svc_destroy(serv); + if (!cb_info->users) { + svc_set_num_threads(cb_info->serv, NULL, 0); + svc_destroy(&cb_info->serv); + } err_create: mutex_unlock(&nfs_callback_mutex); return ret; @@ -362,11 +268,9 @@ void nfs_callback_down(int minorversion, struct net *net) nfs_callback_down_net(minorversion, serv, net); cb_info->users--; if (cb_info->users == 0) { - svc_get(serv); - serv->sv_ops->svo_setup(serv, NULL, 0); - svc_destroy(serv); + svc_set_num_threads(serv, NULL, 0); dprintk("nfs_callback_down: service destroyed\n"); - cb_info->serv = NULL; + svc_destroy(&cb_info->serv); } mutex_unlock(&nfs_callback_mutex); } @@ -421,8 +325,10 @@ check_gss_callback_principal(struct nfs_client *clp, struct svc_rqst *rqstp) * All other checking done after NFS decoding where the nfs_client can be * found in nfs4_callback_compound */ -static int nfs_callback_authenticate(struct svc_rqst *rqstp) +static enum svc_auth_status nfs_callback_authenticate(struct svc_rqst *rqstp) { + rqstp->rq_auth_stat = rpc_autherr_badcred; + switch (rqstp->rq_authop->flavour) { case RPC_AUTH_NULL: if (rqstp->rq_proc != CB_NULL) @@ -433,6 +339,8 @@ static int nfs_callback_authenticate(struct svc_rqst *rqstp) if (svc_is_backchannel(rqstp)) return SVC_DENIED; } + + rqstp->rq_auth_stat = rpc_auth_ok; return SVC_OK; } @@ -444,14 +352,13 @@ static const struct svc_version *nfs4_callback_version[] = { [4] = &nfs4_callback_version4, }; -static struct svc_stat nfs4_callback_stats; - static struct svc_program nfs4_callback_program = { .pg_prog = NFS4_CALLBACK, /* RPC service number */ .pg_nvers = ARRAY_SIZE(nfs4_callback_version), /* Number of entries */ .pg_vers = nfs4_callback_version, /* version table */ .pg_name = "NFSv4 callback", /* service name */ .pg_class = "nfs", /* authentication class */ - .pg_stats = &nfs4_callback_stats, .pg_authenticate = nfs_callback_authenticate, + .pg_init_request = svc_generic_init_request, + .pg_rpcbind_set = svc_generic_rpcbind_set, }; diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h index 3dc54d7cb19c..154a6ed1299f 100644 --- a/fs/nfs/callback.h +++ b/fs/nfs/callback.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * linux/fs/nfs/callback.h * @@ -18,32 +19,14 @@ enum nfs4_callback_procnum { CB_COMPOUND = 1, }; -enum nfs4_callback_opnum { - OP_CB_GETATTR = 3, - OP_CB_RECALL = 4, -/* Callback operations new to NFSv4.1 */ - OP_CB_LAYOUTRECALL = 5, - OP_CB_NOTIFY = 6, - OP_CB_PUSH_DELEG = 7, - OP_CB_RECALL_ANY = 8, - OP_CB_RECALLABLE_OBJ_AVAIL = 9, - OP_CB_RECALL_SLOT = 10, - OP_CB_SEQUENCE = 11, - OP_CB_WANTS_CANCELLED = 12, - OP_CB_NOTIFY_LOCK = 13, - OP_CB_NOTIFY_DEVICEID = 14, -/* Callback operations new to NFSv4.2 */ - OP_CB_OFFLOAD = 15, - OP_CB_ILLEGAL = 10044, -}; - struct nfs4_slot; struct cb_process_state { - __be32 drc_status; struct nfs_client *clp; struct nfs4_slot *slot; - u32 minorversion; struct net *net; + u32 minorversion; + __be32 drc_status; + unsigned int referring_calls; }; struct cb_compound_hdr_arg { @@ -63,16 +46,17 @@ struct cb_compound_hdr_res { struct cb_getattrargs { struct nfs_fh fh; - uint32_t bitmap[2]; + uint32_t bitmap[3]; }; struct cb_getattrres { __be32 status; - uint32_t bitmap[2]; + uint32_t bitmap[3]; uint64_t size; uint64_t change_attr; - struct timespec ctime; - struct timespec mtime; + struct timespec64 atime; + struct timespec64 ctime; + struct timespec64 mtime; }; struct cb_recallargs { @@ -126,7 +110,9 @@ extern __be32 nfs4_callback_sequence(void *argp, void *resp, #define RCA4_TYPE_MASK_OBJ_LAYOUT_MAX 9 #define RCA4_TYPE_MASK_OTHER_LAYOUT_MIN 12 #define RCA4_TYPE_MASK_OTHER_LAYOUT_MAX 15 -#define RCA4_TYPE_MASK_ALL 0xf31f +#define PNFS_FF_RCA4_TYPE_MASK_READ 16 +#define PNFS_FF_RCA4_TYPE_MASK_RW 17 +#define RCA4_TYPE_MASK_ALL 0x3f31f struct cb_recallanyargs { uint32_t craa_objs_to_keep; @@ -167,7 +153,7 @@ struct cb_devicenotifyitem { }; struct cb_devicenotifyargs { - int ndevs; + uint32_t ndevs; struct cb_devicenotifyitem *devs; }; @@ -183,6 +169,18 @@ struct cb_notify_lock_args { extern __be32 nfs4_callback_notify_lock(void *argp, void *resp, struct cb_process_state *cps); #endif /* CONFIG_NFS_V4_1 */ +#ifdef CONFIG_NFS_V4_2 +struct cb_offloadargs { + struct nfs_fh coa_fh; + nfs4_stateid coa_stateid; + uint32_t error; + uint64_t wr_count; + struct nfs_writeverf wr_writeverf; +}; + +extern __be32 nfs4_callback_offload(void *args, void *dummy, + struct cb_process_state *cps); +#endif /* CONFIG_NFS_V4_2 */ extern int check_gss_callback_principal(struct nfs_client *, struct svc_rqst *); extern __be32 nfs4_callback_getattr(void *argp, void *resp, struct cb_process_state *cps); diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index 5427cdf04c5a..8397c43358bd 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/nfs/callback_proc.c * @@ -5,10 +6,15 @@ * * NFSv4 callback procedures */ + +#include <linux/errno.h> +#include <linux/math.h> #include <linux/nfs4.h> #include <linux/nfs_fs.h> #include <linux/slab.h> #include <linux/rcupdate.h> +#include <linux/types.h> + #include "nfs4_fs.h" #include "callback.h" #include "delegation.h" @@ -25,45 +31,49 @@ __be32 nfs4_callback_getattr(void *argp, void *resp, struct cb_getattrargs *args = argp; struct cb_getattrres *res = resp; struct nfs_delegation *delegation; - struct nfs_inode *nfsi; struct inode *inode; res->status = htonl(NFS4ERR_OP_NOT_IN_SESSION); if (!cps->clp) /* Always set for v4.0. Set in cb_sequence for v4.1 */ goto out; - res->bitmap[0] = res->bitmap[1] = 0; + memset(res->bitmap, 0, sizeof(res->bitmap)); res->status = htonl(NFS4ERR_BADHANDLE); dprintk_rcu("NFS: GETATTR callback request from %s\n", rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR)); inode = nfs_delegation_find_inode(cps->clp, &args->fh); - if (inode == NULL) { + if (IS_ERR(inode)) { + if (inode == ERR_PTR(-EAGAIN)) + res->status = htonl(NFS4ERR_DELAY); trace_nfs4_cb_getattr(cps->clp, &args->fh, NULL, -ntohl(res->status)); goto out; } - nfsi = NFS_I(inode); rcu_read_lock(); - delegation = rcu_dereference(nfsi->delegation); + delegation = nfs4_get_valid_delegation(inode); if (delegation == NULL || (delegation->type & FMODE_WRITE) == 0) goto out_iput; res->size = i_size_read(inode); res->change_attr = delegation->change_attr; - if (nfsi->nrequests != 0) + if (nfs_have_writebacks(inode)) res->change_attr++; - res->ctime = inode->i_ctime; - res->mtime = inode->i_mtime; - res->bitmap[0] = (FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE) & - args->bitmap[0]; - res->bitmap[1] = (FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY) & - args->bitmap[1]; + res->atime = inode_get_atime(inode); + res->ctime = inode_get_ctime(inode); + res->mtime = inode_get_mtime(inode); + res->bitmap[0] = (FATTR4_WORD0_CHANGE | FATTR4_WORD0_SIZE) & + args->bitmap[0]; + res->bitmap[1] = (FATTR4_WORD1_TIME_ACCESS | + FATTR4_WORD1_TIME_METADATA | + FATTR4_WORD1_TIME_MODIFY) & args->bitmap[1]; + res->bitmap[2] = (FATTR4_WORD2_TIME_DELEG_ACCESS | + FATTR4_WORD2_TIME_DELEG_MODIFY) & args->bitmap[2]; res->status = 0; out_iput: rcu_read_unlock(); trace_nfs4_cb_getattr(cps->clp, &args->fh, inode, -ntohl(res->status)); - iput(inode); + nfs_iput_and_deactive(inode); out: dprintk("%s: exit with status = %d\n", __func__, ntohl(res->status)); return res->status; @@ -85,7 +95,9 @@ __be32 nfs4_callback_recall(void *argp, void *resp, res = htonl(NFS4ERR_BADHANDLE); inode = nfs_delegation_find_inode(cps->clp, &args->fh); - if (inode == NULL) { + if (IS_ERR(inode)) { + if (inode == ERR_PTR(-EAGAIN)) + res = htonl(NFS4ERR_DELAY); trace_nfs4_cb_recall(cps->clp, &args->fh, NULL, &args->stateid, -ntohl(res)); goto out; @@ -103,7 +115,7 @@ __be32 nfs4_callback_recall(void *argp, void *resp, } trace_nfs4_cb_recall(cps->clp, &args->fh, inode, &args->stateid, -ntohl(res)); - iput(inode); + nfs_iput_and_deactive(inode); out: dprintk("%s: exit with status = %d\n", __func__, ntohl(res)); return res; @@ -118,33 +130,32 @@ out: */ static struct inode *nfs_layout_find_inode_by_stateid(struct nfs_client *clp, const nfs4_stateid *stateid) + __must_hold(RCU) { struct nfs_server *server; struct inode *inode; struct pnfs_layout_hdr *lo; -restart: + rcu_read_lock(); list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { - list_for_each_entry(lo, &server->layouts, plh_layouts) { - if (stateid != NULL && - !nfs4_stateid_match_other(stateid, &lo->plh_stateid)) + list_for_each_entry_rcu(lo, &server->layouts, plh_layouts) { + if (!pnfs_layout_is_valid(lo)) continue; - inode = igrab(lo->plh_inode); - if (!inode) + if (!nfs4_stateid_match_other(stateid, &lo->plh_stateid)) continue; - if (!nfs_sb_active(inode->i_sb)) { - rcu_read_unlock(); - spin_unlock(&clp->cl_lock); - iput(inode); - spin_lock(&clp->cl_lock); - rcu_read_lock(); - goto restart; - } - return inode; + if (nfs_sb_active(server->super)) + inode = igrab(lo->plh_inode); + else + inode = ERR_PTR(-EAGAIN); + rcu_read_unlock(); + if (inode) + return inode; + nfs_sb_deactive(server->super); + return ERR_PTR(-EAGAIN); } } - - return NULL; + rcu_read_unlock(); + return ERR_PTR(-ENOENT); } /* @@ -161,30 +172,27 @@ static struct inode *nfs_layout_find_inode_by_fh(struct nfs_client *clp, struct inode *inode; struct pnfs_layout_hdr *lo; -restart: + rcu_read_lock(); list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { - list_for_each_entry(lo, &server->layouts, plh_layouts) { + list_for_each_entry_rcu(lo, &server->layouts, plh_layouts) { nfsi = NFS_I(lo->plh_inode); if (nfs_compare_fh(fh, &nfsi->fh)) continue; if (nfsi->layout != lo) continue; - inode = igrab(lo->plh_inode); - if (!inode) - continue; - if (!nfs_sb_active(inode->i_sb)) { - rcu_read_unlock(); - spin_unlock(&clp->cl_lock); - iput(inode); - spin_lock(&clp->cl_lock); - rcu_read_lock(); - goto restart; - } - return inode; + if (nfs_sb_active(server->super)) + inode = igrab(lo->plh_inode); + else + inode = ERR_PTR(-EAGAIN); + rcu_read_unlock(); + if (inode) + return inode; + nfs_sb_deactive(server->super); + return ERR_PTR(-EAGAIN); } } - - return NULL; + rcu_read_unlock(); + return ERR_PTR(-ENOENT); } static struct inode *nfs_layout_find_inode(struct nfs_client *clp, @@ -193,14 +201,9 @@ static struct inode *nfs_layout_find_inode(struct nfs_client *clp, { struct inode *inode; - spin_lock(&clp->cl_lock); - rcu_read_lock(); inode = nfs_layout_find_inode_by_stateid(clp, stateid); - if (!inode) + if (inode == ERR_PTR(-ENOENT)) inode = nfs_layout_find_inode_by_fh(clp, fh); - rcu_read_unlock(); - spin_unlock(&clp->cl_lock); - return inode; } @@ -208,13 +211,14 @@ static struct inode *nfs_layout_find_inode(struct nfs_client *clp, * Enforce RFC5661 section 12.5.5.2.1. (Layout Recall and Return Sequencing) */ static u32 pnfs_check_callback_stateid(struct pnfs_layout_hdr *lo, - const nfs4_stateid *new) + const nfs4_stateid *new, + struct cb_process_state *cps) { u32 oldseq, newseq; - /* Is the stateid still not initialised? */ + /* Is the stateid not initialised? */ if (!pnfs_layout_is_valid(lo)) - return NFS4ERR_DELAY; + return NFS4ERR_NOMATCHING_LAYOUT; /* Mismatched stateid? */ if (!nfs4_stateid_match_other(&lo->plh_stateid, new)) @@ -222,28 +226,29 @@ static u32 pnfs_check_callback_stateid(struct pnfs_layout_hdr *lo, newseq = be32_to_cpu(new->seqid); /* Are we already in a layout recall situation? */ - if (test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags) && - lo->plh_return_seq != 0) { - if (newseq < lo->plh_return_seq) - return NFS4ERR_OLD_STATEID; - if (newseq > lo->plh_return_seq) - return NFS4ERR_DELAY; - goto out; - } + if (test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags)) + return NFS4ERR_DELAY; - /* Check that the stateid matches what we think it should be. */ + /* + * Check that the stateid matches what we think it should be. + * Note that if the server sent us a list of referring calls, + * and we know that those have completed, then we trust the + * stateid argument is correct. + */ oldseq = be32_to_cpu(lo->plh_stateid.seqid); - if (newseq > oldseq + 1) + if (newseq > oldseq + 1 && !cps->referring_calls) return NFS4ERR_DELAY; + /* Crazy server! */ if (newseq <= oldseq) return NFS4ERR_OLD_STATEID; -out: + return NFS_OK; } static u32 initiate_file_draining(struct nfs_client *clp, - struct cb_layoutrecallargs *args) + struct cb_layoutrecallargs *args, + struct cb_process_state *cps) { struct inode *ino; struct pnfs_layout_hdr *lo; @@ -251,8 +256,11 @@ static u32 initiate_file_draining(struct nfs_client *clp, LIST_HEAD(free_me_list); ino = nfs_layout_find_inode(clp, &args->cbl_fh, &args->cbl_stateid); - if (!ino) - goto out; + if (IS_ERR(ino)) { + if (ino == ERR_PTR(-EAGAIN)) + rv = NFS4ERR_DELAY; + goto out_noput; + } pnfs_layoutcommit_inode(ino, false); @@ -264,10 +272,9 @@ static u32 initiate_file_draining(struct nfs_client *clp, goto out; } pnfs_get_layout_hdr(lo); - rv = pnfs_check_callback_stateid(lo, &args->cbl_stateid); + rv = pnfs_check_callback_stateid(lo, &args->cbl_stateid, cps); if (rv != NFS_OK) goto unlock; - pnfs_set_layout_stateid(lo, &args->cbl_stateid, true); /* * Enforce RFC5661 Section 12.5.5.2.1.5 (Bulk Recall and Return) @@ -277,19 +284,24 @@ static u32 initiate_file_draining(struct nfs_client *clp, goto unlock; } - if (pnfs_mark_matching_lsegs_return(lo, &free_me_list, + pnfs_set_layout_stateid(lo, &args->cbl_stateid, NULL, true); + switch (pnfs_mark_matching_lsegs_return(lo, &free_me_list, &args->cbl_range, be32_to_cpu(args->cbl_stateid.seqid))) { + case 0: + case -EBUSY: + /* There are layout segments that need to be returned */ rv = NFS4_OK; - goto unlock; - } - - /* Embrace your forgetfulness! */ - rv = NFS4ERR_NOMATCHING_LAYOUT; + break; + case -ENOENT: + set_bit(NFS_LAYOUT_DRAIN, &lo->plh_flags); + /* Embrace your forgetfulness! */ + rv = NFS4ERR_NOMATCHING_LAYOUT; - if (NFS_SERVER(ino)->pnfs_curr_ld->return_range) { - NFS_SERVER(ino)->pnfs_curr_ld->return_range(lo, - &args->cbl_range); + if (NFS_SERVER(ino)->pnfs_curr_ld->return_range) { + NFS_SERVER(ino)->pnfs_curr_ld->return_range(lo, + &args->cbl_range); + } } unlock: spin_unlock(&ino->i_lock); @@ -298,9 +310,10 @@ unlock: nfs_commit_inode(ino, 0); pnfs_put_layout_hdr(lo); out: + nfs_iput_and_deactive(ino); +out_noput: trace_nfs4_cb_layoutrecall_file(clp, &args->cbl_fh, ino, &args->cbl_stateid, -rv); - nfs_iput_and_deactive(ino); return rv; } @@ -310,19 +323,21 @@ static u32 initiate_bulk_draining(struct nfs_client *clp, int stat; if (args->cbl_recall_type == RETURN_FSID) - stat = pnfs_destroy_layouts_byfsid(clp, &args->cbl_fsid, true); + stat = pnfs_layout_destroy_byfsid(clp, &args->cbl_fsid, + PNFS_LAYOUT_BULK_RETURN); else - stat = pnfs_destroy_layouts_byclid(clp, true); + stat = pnfs_layout_destroy_byclid(clp, PNFS_LAYOUT_BULK_RETURN); if (stat != 0) return NFS4ERR_DELAY; return NFS4ERR_NOMATCHING_LAYOUT; } static u32 do_callback_layoutrecall(struct nfs_client *clp, - struct cb_layoutrecallargs *args) + struct cb_layoutrecallargs *args, + struct cb_process_state *cps) { if (args->cbl_recall_type == RETURN_FILE) - return initiate_file_draining(clp, args); + return initiate_file_draining(clp, args, cps); return initiate_bulk_draining(clp, args); } @@ -333,11 +348,12 @@ __be32 nfs4_callback_layoutrecall(void *argp, void *resp, u32 res = NFS4ERR_OP_NOT_IN_SESSION; if (cps->clp) - res = do_callback_layoutrecall(cps->clp, args); + res = do_callback_layoutrecall(cps->clp, args, cps); return cpu_to_be32(res); } -static void pnfs_recall_all_layouts(struct nfs_client *clp) +static void pnfs_recall_all_layouts(struct nfs_client *clp, + struct cb_process_state *cps) { struct cb_layoutrecallargs args; @@ -345,19 +361,18 @@ static void pnfs_recall_all_layouts(struct nfs_client *clp) memset(&args, 0, sizeof(args)); args.cbl_recall_type = RETURN_ALL; /* FIXME we ignore errors, what should we do? */ - do_callback_layoutrecall(clp, &args); + do_callback_layoutrecall(clp, &args, cps); } __be32 nfs4_callback_devicenotify(void *argp, void *resp, struct cb_process_state *cps) { struct cb_devicenotifyargs *args = argp; - int i; + const struct pnfs_layoutdriver_type *ld = NULL; + uint32_t i; __be32 res = 0; - struct nfs_client *clp = cps->clp; - struct nfs_server *server = NULL; - if (!clp) { + if (!cps->clp) { res = cpu_to_be32(NFS4ERR_OP_NOT_IN_SESSION); goto out; } @@ -365,23 +380,15 @@ __be32 nfs4_callback_devicenotify(void *argp, void *resp, for (i = 0; i < args->ndevs; i++) { struct cb_devicenotifyitem *dev = &args->devs[i]; - if (!server || - server->pnfs_curr_ld->id != dev->cbd_layout_type) { - rcu_read_lock(); - list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) - if (server->pnfs_curr_ld && - server->pnfs_curr_ld->id == dev->cbd_layout_type) { - rcu_read_unlock(); - goto found; - } - rcu_read_unlock(); - continue; + if (!ld || ld->id != dev->cbd_layout_type) { + pnfs_put_layoutdriver(ld); + ld = pnfs_find_layoutdriver(dev->cbd_layout_type); + if (!ld) + continue; } - - found: - nfs4_delete_deviceid(server->pnfs_curr_ld, clp, &dev->cbd_dev_id); + nfs4_delete_deviceid(ld, cps->clp, &dev->cbd_dev_id); } - + pnfs_put_layoutdriver(ld); out: kfree(args->devs); return res; @@ -404,30 +411,39 @@ static __be32 validate_seqid(const struct nfs4_slot_table *tbl, const struct nfs4_slot *slot, const struct cb_sequenceargs * args) { + __be32 ret; + + ret = cpu_to_be32(NFS4ERR_BADSLOT); if (args->csa_slotid > tbl->server_highest_slotid) - return htonl(NFS4ERR_BADSLOT); + goto out_err; /* Replay */ if (args->csa_sequenceid == slot->seq_nr) { + ret = cpu_to_be32(NFS4ERR_DELAY); if (nfs4_test_locked_slot(tbl, slot->slot_nr)) - return htonl(NFS4ERR_DELAY); + goto out_err; + /* Signal process_op to set this error on next op */ + ret = cpu_to_be32(NFS4ERR_RETRY_UNCACHED_REP); if (args->csa_cachethis == 0) - return htonl(NFS4ERR_RETRY_UNCACHED_REP); + goto out_err; /* Liar! We never allowed you to set csa_cachethis != 0 */ - return htonl(NFS4ERR_SEQ_FALSE_RETRY); + ret = cpu_to_be32(NFS4ERR_SEQ_FALSE_RETRY); + goto out_err; } - /* Wraparound */ - if (unlikely(slot->seq_nr == 0xFFFFFFFFU)) { - if (args->csa_sequenceid == 1) - return htonl(NFS4_OK); - } else if (likely(args->csa_sequenceid == slot->seq_nr + 1)) - return htonl(NFS4_OK); - + /* Note: wraparound relies on seq_nr being of type u32 */ /* Misordered request */ - return htonl(NFS4ERR_SEQ_MISORDERED); + ret = cpu_to_be32(NFS4ERR_SEQ_MISORDERED); + if (args->csa_sequenceid != slot->seq_nr + 1) + goto out_err; + + return cpu_to_be32(NFS4_OK); + +out_err: + trace_nfs4_cb_seqid_err(args, ret); + return ret; } /* @@ -435,11 +451,15 @@ validate_seqid(const struct nfs4_slot_table *tbl, const struct nfs4_slot *slot, * a match. If the slot is in use and the sequence numbers match, the * client is still waiting for a response to the original request. */ -static bool referring_call_exists(struct nfs_client *clp, +static int referring_call_exists(struct nfs_client *clp, uint32_t nrclists, - struct referring_call_list *rclists) + struct referring_call_list *rclists, + spinlock_t *lock) + __releases(lock) + __acquires(lock) { - bool status = 0; + int status = 0; + int found = 0; int i, j; struct nfs4_session *session; struct nfs4_slot_table *tbl; @@ -462,15 +482,18 @@ static bool referring_call_exists(struct nfs_client *clp, for (j = 0; j < rclist->rcl_nrefcalls; j++) { ref = &rclist->rcl_refcalls[j]; + spin_unlock(lock); status = nfs4_slot_wait_on_seqid(tbl, ref->rc_slotid, ref->rc_sequenceid, HZ >> 1) < 0; + spin_lock(lock); if (status) goto out; + found++; } } out: - return status; + return status < 0 ? status : found; } __be32 nfs4_callback_sequence(void *argp, void *resp, @@ -481,6 +504,7 @@ __be32 nfs4_callback_sequence(void *argp, void *resp, struct nfs4_slot_table *tbl; struct nfs4_slot *slot; struct nfs_client *clp; + int ret; int i; __be32 status = htonl(NFS4ERR_BADSESSION); @@ -540,10 +564,13 @@ __be32 nfs4_callback_sequence(void *argp, void *resp, * related callback was received before the response to the original * call. */ - if (referring_call_exists(clp, args->csa_nrclists, args->csa_rclists)) { + ret = referring_call_exists(clp, args->csa_nrclists, args->csa_rclists, + &tbl->slot_tbl_lock); + if (ret < 0) { status = htonl(NFS4ERR_DELAY); goto out_unlock; } + cps->referring_calls = ret; /* * RFC5661 20.9.3 @@ -571,7 +598,7 @@ out: } static bool -validate_bitmap_values(unsigned long mask) +validate_bitmap_values(unsigned int mask) { return (mask & ~RCA4_TYPE_MASK_ALL) == 0; } @@ -582,6 +609,7 @@ __be32 nfs4_callback_recallany(void *argp, void *resp, struct cb_recallanyargs *args = argp; __be32 status; fmode_t flags = 0; + bool schedule_manager = false; status = cpu_to_be32(NFS4ERR_OP_NOT_IN_SESSION); if (!cps->clp) /* set in cb_sequence */ @@ -595,17 +623,27 @@ __be32 nfs4_callback_recallany(void *argp, void *resp, goto out; status = cpu_to_be32(NFS4_OK); - if (test_bit(RCA4_TYPE_MASK_RDATA_DLG, (const unsigned long *) - &args->craa_type_mask)) + if (args->craa_type_mask & BIT(RCA4_TYPE_MASK_RDATA_DLG)) flags = FMODE_READ; - if (test_bit(RCA4_TYPE_MASK_WDATA_DLG, (const unsigned long *) - &args->craa_type_mask)) + if (args->craa_type_mask & BIT(RCA4_TYPE_MASK_WDATA_DLG)) flags |= FMODE_WRITE; - if (test_bit(RCA4_TYPE_MASK_FILE_LAYOUT, (const unsigned long *) - &args->craa_type_mask)) - pnfs_recall_all_layouts(cps->clp); if (flags) nfs_expire_unused_delegation_types(cps->clp, flags); + + if (args->craa_type_mask & BIT(RCA4_TYPE_MASK_FILE_LAYOUT)) + pnfs_recall_all_layouts(cps->clp, cps); + + if (args->craa_type_mask & BIT(PNFS_FF_RCA4_TYPE_MASK_READ)) { + set_bit(NFS4CLNT_RECALL_ANY_LAYOUT_READ, &cps->clp->cl_state); + schedule_manager = true; + } + if (args->craa_type_mask & BIT(PNFS_FF_RCA4_TYPE_MASK_RW)) { + set_bit(NFS4CLNT_RECALL_ANY_LAYOUT_RW, &cps->clp->cl_state); + schedule_manager = true; + } + if (schedule_manager) + nfs4_schedule_state_manager(cps->clp); + out: dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); return status; @@ -656,3 +694,60 @@ __be32 nfs4_callback_notify_lock(void *argp, void *resp, return htonl(NFS4_OK); } #endif /* CONFIG_NFS_V4_1 */ +#ifdef CONFIG_NFS_V4_2 +static void nfs4_copy_cb_args(struct nfs4_copy_state *cp_state, + struct cb_offloadargs *args) +{ + cp_state->count = args->wr_count; + cp_state->error = args->error; + if (!args->error) { + cp_state->verf.committed = args->wr_writeverf.committed; + memcpy(&cp_state->verf.verifier.data[0], + &args->wr_writeverf.verifier.data[0], + NFS4_VERIFIER_SIZE); + } +} + +__be32 nfs4_callback_offload(void *data, void *dummy, + struct cb_process_state *cps) +{ + struct cb_offloadargs *args = data; + struct nfs_server *server; + struct nfs4_copy_state *copy, *tmp_copy; + bool found = false; + + copy = kzalloc(sizeof(struct nfs4_copy_state), GFP_KERNEL); + if (!copy) + return cpu_to_be32(NFS4ERR_DELAY); + + spin_lock(&cps->clp->cl_lock); + rcu_read_lock(); + list_for_each_entry_rcu(server, &cps->clp->cl_superblocks, + client_link) { + list_for_each_entry(tmp_copy, &server->ss_copies, copies) { + if (memcmp(args->coa_stateid.other, + tmp_copy->stateid.other, + sizeof(args->coa_stateid.other))) + continue; + nfs4_copy_cb_args(tmp_copy, args); + complete(&tmp_copy->completion); + found = true; + goto out; + } + } +out: + rcu_read_unlock(); + if (!found) { + memcpy(©->stateid, &args->coa_stateid, NFS4_STATEID_SIZE); + nfs4_copy_cb_args(copy, args); + list_add_tail(©->copies, &cps->clp->pending_cb_stateids); + } else + kfree(copy); + spin_unlock(&cps->clp->cl_lock); + + trace_nfs4_cb_offload(&args->coa_fh, &args->coa_stateid, + args->wr_count, args->error, + args->wr_writeverf.committed); + return 0; +} +#endif /* CONFIG_NFS_V4_2 */ diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c index 681dd642f119..4254ba3ee7c5 100644 --- a/fs/nfs/callback_xdr.c +++ b/fs/nfs/callback_xdr.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/nfs/callback_xdr.c * @@ -17,14 +18,16 @@ #include "callback.h" #include "internal.h" #include "nfs4session.h" +#include "nfs4trace.h" #define CB_OP_TAGLEN_MAXSZ (512) #define CB_OP_HDR_RES_MAXSZ (2 * 4) // opcode, status #define CB_OP_GETATTR_BITMAP_MAXSZ (4 * 4) // bitmap length, 3 bitmaps #define CB_OP_GETATTR_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ + \ CB_OP_GETATTR_BITMAP_MAXSZ + \ - /* change, size, ctime, mtime */\ - (2 + 2 + 3 + 3) * 4) + /* change, size, atime, ctime, + * mtime, deleg_atime, deleg_mtime */\ + (2 + 2 + 3 + 3 + 3 + 3 + 3) * 4) #define CB_OP_RECALL_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ) #if defined(CONFIG_NFS_V4_1) @@ -37,6 +40,9 @@ #define CB_OP_RECALLSLOT_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ) #define CB_OP_NOTIFY_LOCK_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ) #endif /* CONFIG_NFS_V4_1 */ +#ifdef CONFIG_NFS_V4_2 +#define CB_OP_OFFLOAD_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ) +#endif /* CONFIG_NFS_V4_2 */ #define NFSDBG_FACILITY NFSDBG_CALLBACK @@ -58,24 +64,13 @@ static __be32 nfs4_callback_null(struct svc_rqst *rqstp) return htonl(NFS4_OK); } -static int nfs4_decode_void(struct svc_rqst *rqstp, __be32 *p) -{ - return xdr_argsize_check(rqstp, p); -} - -static int nfs4_encode_void(struct svc_rqst *rqstp, __be32 *p) -{ - return xdr_ressize_check(rqstp, p); -} - -static __be32 *read_buf(struct xdr_stream *xdr, size_t nbytes) +/* + * svc_process_common() looks for an XDR encoder to know when + * not to drop a Reply. + */ +static bool nfs4_encode_void(struct svc_rqst *rqstp, struct xdr_stream *xdr) { - __be32 *p; - - p = xdr_inline_decode(xdr, nbytes); - if (unlikely(p == NULL)) - printk(KERN_WARNING "NFS: NFSv4 callback reply buffer overflowed!\n"); - return p; + return true; } static __be32 decode_string(struct xdr_stream *xdr, unsigned int *len, @@ -94,13 +89,13 @@ static __be32 decode_fh(struct xdr_stream *xdr, struct nfs_fh *fh) { __be32 *p; - p = read_buf(xdr, 4); + p = xdr_inline_decode(xdr, 4); if (unlikely(p == NULL)) return htonl(NFS4ERR_RESOURCE); fh->size = ntohl(*p); if (fh->size > NFS4_FHSIZE) return htonl(NFS4ERR_BADHANDLE); - p = read_buf(xdr, fh->size); + p = xdr_inline_decode(xdr, fh->size); if (unlikely(p == NULL)) return htonl(NFS4ERR_RESOURCE); memcpy(&fh->data[0], p, fh->size); @@ -113,17 +108,19 @@ static __be32 decode_bitmap(struct xdr_stream *xdr, uint32_t *bitmap) __be32 *p; unsigned int attrlen; - p = read_buf(xdr, 4); + p = xdr_inline_decode(xdr, 4); if (unlikely(p == NULL)) return htonl(NFS4ERR_RESOURCE); attrlen = ntohl(*p); - p = read_buf(xdr, attrlen << 2); + p = xdr_inline_decode(xdr, attrlen << 2); if (unlikely(p == NULL)) return htonl(NFS4ERR_RESOURCE); if (likely(attrlen > 0)) bitmap[0] = ntohl(*p++); if (attrlen > 1) - bitmap[1] = ntohl(*p); + bitmap[1] = ntohl(*p++); + if (attrlen > 2) + bitmap[2] = ntohl(*p); return 0; } @@ -131,7 +128,7 @@ static __be32 decode_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid) { __be32 *p; - p = read_buf(xdr, NFS4_STATEID_SIZE); + p = xdr_inline_decode(xdr, NFS4_STATEID_SIZE); if (unlikely(p == NULL)) return htonl(NFS4ERR_RESOURCE); memcpy(stateid->data, p, NFS4_STATEID_SIZE); @@ -152,7 +149,7 @@ static __be32 decode_compound_hdr_arg(struct xdr_stream *xdr, struct cb_compound status = decode_string(xdr, &hdr->taglen, &hdr->tag, CB_OP_TAGLEN_MAXSZ); if (unlikely(status != 0)) return status; - p = read_buf(xdr, 12); + p = xdr_inline_decode(xdr, 12); if (unlikely(p == NULL)) return htonl(NFS4ERR_RESOURCE); hdr->minorversion = ntohl(*p++); @@ -172,7 +169,7 @@ static __be32 decode_compound_hdr_arg(struct xdr_stream *xdr, struct cb_compound static __be32 decode_op_hdr(struct xdr_stream *xdr, unsigned int *op) { __be32 *p; - p = read_buf(xdr, 4); + p = xdr_inline_decode(xdr, 4); if (unlikely(p == NULL)) return htonl(NFS4ERR_RESOURCE_HDR); *op = ntohl(*p); @@ -201,7 +198,7 @@ static __be32 decode_recall_args(struct svc_rqst *rqstp, status = decode_delegation_stateid(xdr, &args->stateid); if (unlikely(status != 0)) return status; - p = read_buf(xdr, 4); + p = xdr_inline_decode(xdr, 4); if (unlikely(p == NULL)) return htonl(NFS4ERR_RESOURCE); args->truncate = ntohl(*p); @@ -223,7 +220,7 @@ static __be32 decode_layoutrecall_args(struct svc_rqst *rqstp, __be32 status = 0; uint32_t iomode; - p = read_buf(xdr, 4 * sizeof(uint32_t)); + p = xdr_inline_decode(xdr, 4 * sizeof(uint32_t)); if (unlikely(p == NULL)) return htonl(NFS4ERR_BADXDR); @@ -241,14 +238,14 @@ static __be32 decode_layoutrecall_args(struct svc_rqst *rqstp, if (unlikely(status != 0)) return status; - p = read_buf(xdr, 2 * sizeof(uint64_t)); + p = xdr_inline_decode(xdr, 2 * sizeof(uint64_t)); if (unlikely(p == NULL)) return htonl(NFS4ERR_BADXDR); p = xdr_decode_hyper(p, &args->cbl_range.offset); p = xdr_decode_hyper(p, &args->cbl_range.length); return decode_layout_stateid(xdr, &args->cbl_stateid); } else if (args->cbl_recall_type == RETURN_FSID) { - p = read_buf(xdr, 2 * sizeof(uint64_t)); + p = xdr_inline_decode(xdr, 2 * sizeof(uint64_t)); if (unlikely(p == NULL)) return htonl(NFS4ERR_BADXDR); p = xdr_decode_hyper(p, &args->cbl_fsid.major); @@ -264,25 +261,19 @@ __be32 decode_devicenotify_args(struct svc_rqst *rqstp, void *argp) { struct cb_devicenotifyargs *args = argp; + uint32_t tmp, n, i; __be32 *p; __be32 status = 0; - u32 tmp; - int n, i; - args->ndevs = 0; /* Num of device notifications */ - p = read_buf(xdr, sizeof(uint32_t)); + p = xdr_inline_decode(xdr, sizeof(uint32_t)); if (unlikely(p == NULL)) { status = htonl(NFS4ERR_BADXDR); goto out; } n = ntohl(*p++); - if (n <= 0) + if (n == 0) goto out; - if (n > ULONG_MAX / sizeof(*args->devs)) { - status = htonl(NFS4ERR_BADXDR); - goto out; - } args->devs = kmalloc_array(n, sizeof(*args->devs), GFP_KERNEL); if (!args->devs) { @@ -294,7 +285,8 @@ __be32 decode_devicenotify_args(struct svc_rqst *rqstp, for (i = 0; i < n; i++) { struct cb_devicenotifyitem *dev = &args->devs[i]; - p = read_buf(xdr, (4 * sizeof(uint32_t)) + NFS4_DEVICEID4_SIZE); + p = xdr_inline_decode(xdr, (4 * sizeof(uint32_t)) + + NFS4_DEVICEID4_SIZE); if (unlikely(p == NULL)) { status = htonl(NFS4ERR_BADXDR); goto err; @@ -325,7 +317,7 @@ __be32 decode_devicenotify_args(struct svc_rqst *rqstp, p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE); if (dev->cbd_layout_type == NOTIFY_DEVICEID4_CHANGE) { - p = read_buf(xdr, sizeof(uint32_t)); + p = xdr_inline_decode(xdr, sizeof(uint32_t)); if (unlikely(p == NULL)) { status = htonl(NFS4ERR_BADXDR); goto err; @@ -335,19 +327,21 @@ __be32 decode_devicenotify_args(struct svc_rqst *rqstp, dev->cbd_immediate = 0; } - args->ndevs++; - dprintk("%s: type %d layout 0x%x immediate %d\n", __func__, dev->cbd_notify_type, dev->cbd_layout_type, dev->cbd_immediate); } + args->ndevs = n; + dprintk("%s: ndevs %d\n", __func__, args->ndevs); + return 0; +err: + kfree(args->devs); out: + args->devs = NULL; + args->ndevs = 0; dprintk("%s: status %d ndevs %d\n", __func__, ntohl(status), args->ndevs); return status; -err: - kfree(args->devs); - goto out; } static __be32 decode_sessionid(struct xdr_stream *xdr, @@ -355,7 +349,7 @@ static __be32 decode_sessionid(struct xdr_stream *xdr, { __be32 *p; - p = read_buf(xdr, NFS4_MAX_SESSIONID_LEN); + p = xdr_inline_decode(xdr, NFS4_MAX_SESSIONID_LEN); if (unlikely(p == NULL)) return htonl(NFS4ERR_RESOURCE); @@ -375,13 +369,15 @@ static __be32 decode_rc_list(struct xdr_stream *xdr, goto out; status = htonl(NFS4ERR_RESOURCE); - p = read_buf(xdr, sizeof(uint32_t)); + p = xdr_inline_decode(xdr, sizeof(uint32_t)); if (unlikely(p == NULL)) goto out; rc_list->rcl_nrefcalls = ntohl(*p++); if (rc_list->rcl_nrefcalls) { - p = read_buf(xdr, + if (unlikely(rc_list->rcl_nrefcalls > xdr->buf->len)) + goto out; + p = xdr_inline_decode(xdr, rc_list->rcl_nrefcalls * 2 * sizeof(uint32_t)); if (unlikely(p == NULL)) goto out; @@ -414,7 +410,7 @@ static __be32 decode_cb_sequence_args(struct svc_rqst *rqstp, if (status) return status; - p = read_buf(xdr, 5 * sizeof(uint32_t)); + p = xdr_inline_decode(xdr, 5 * sizeof(uint32_t)); if (unlikely(p == NULL)) return htonl(NFS4ERR_RESOURCE); @@ -454,10 +450,10 @@ static __be32 decode_recallany_args(struct svc_rqst *rqstp, void *argp) { struct cb_recallanyargs *args = argp; - uint32_t bitmap[2]; + uint32_t bitmap[3]; __be32 *p, status; - p = read_buf(xdr, 4); + p = xdr_inline_decode(xdr, 4); if (unlikely(p == NULL)) return htonl(NFS4ERR_BADXDR); args->craa_objs_to_keep = ntohl(*p++); @@ -476,7 +472,7 @@ static __be32 decode_recallslot_args(struct svc_rqst *rqstp, struct cb_recallslotargs *args = argp; __be32 *p; - p = read_buf(xdr, 4); + p = xdr_inline_decode(xdr, 4); if (unlikely(p == NULL)) return htonl(NFS4ERR_BADXDR); args->crsa_target_highest_slotid = ntohl(*p++); @@ -488,14 +484,14 @@ static __be32 decode_lockowner(struct xdr_stream *xdr, struct cb_notify_lock_arg __be32 *p; unsigned int len; - p = read_buf(xdr, 12); + p = xdr_inline_decode(xdr, 12); if (unlikely(p == NULL)) return htonl(NFS4ERR_BADXDR); p = xdr_decode_hyper(p, &args->cbnl_owner.clientid); len = be32_to_cpu(*p); - p = read_buf(xdr, len); + p = xdr_inline_decode(xdr, len); if (unlikely(p == NULL)) return htonl(NFS4ERR_BADXDR); @@ -526,7 +522,72 @@ static __be32 decode_notify_lock_args(struct svc_rqst *rqstp, } #endif /* CONFIG_NFS_V4_1 */ +#ifdef CONFIG_NFS_V4_2 +static __be32 decode_write_response(struct xdr_stream *xdr, + struct cb_offloadargs *args) +{ + __be32 *p; + /* skip the always zero field */ + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out; + p++; + + /* decode count, stable_how, verifier */ + p = xdr_inline_decode(xdr, 8 + 4); + if (unlikely(!p)) + goto out; + p = xdr_decode_hyper(p, &args->wr_count); + args->wr_writeverf.committed = be32_to_cpup(p); + p = xdr_inline_decode(xdr, NFS4_VERIFIER_SIZE); + if (likely(p)) { + memcpy(&args->wr_writeverf.verifier.data[0], p, + NFS4_VERIFIER_SIZE); + return 0; + } +out: + return htonl(NFS4ERR_RESOURCE); +} + +static __be32 decode_offload_args(struct svc_rqst *rqstp, + struct xdr_stream *xdr, + void *data) +{ + struct cb_offloadargs *args = data; + __be32 *p; + __be32 status; + + /* decode fh */ + status = decode_fh(xdr, &args->coa_fh); + if (unlikely(status != 0)) + return status; + + /* decode stateid */ + status = decode_stateid(xdr, &args->coa_stateid); + if (unlikely(status != 0)) + return status; + + /* decode status */ + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out; + args->error = ntohl(*p++); + if (!args->error) { + status = decode_write_response(xdr, args); + if (unlikely(status != 0)) + return status; + } else { + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out; + p = xdr_decode_hyper(p, &args->wr_count); + } + return 0; +out: + return htonl(NFS4ERR_RESOURCE); +} +#endif /* CONFIG_NFS_V4_2 */ static __be32 encode_string(struct xdr_stream *xdr, unsigned int len, const char *str) { if (unlikely(xdr_stream_encode_opaque(xdr, str, len) < 0)) @@ -534,35 +595,10 @@ static __be32 encode_string(struct xdr_stream *xdr, unsigned int len, const char return 0; } -#define CB_SUPPORTED_ATTR0 (FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE) -#define CB_SUPPORTED_ATTR1 (FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY) -static __be32 encode_attr_bitmap(struct xdr_stream *xdr, const uint32_t *bitmap, __be32 **savep) +static __be32 encode_attr_bitmap(struct xdr_stream *xdr, const uint32_t *bitmap, size_t sz) { - __be32 bm[2]; - __be32 *p; - - bm[0] = htonl(bitmap[0] & CB_SUPPORTED_ATTR0); - bm[1] = htonl(bitmap[1] & CB_SUPPORTED_ATTR1); - if (bm[1] != 0) { - p = xdr_reserve_space(xdr, 16); - if (unlikely(p == NULL)) - return htonl(NFS4ERR_RESOURCE); - *p++ = htonl(2); - *p++ = bm[0]; - *p++ = bm[1]; - } else if (bm[0] != 0) { - p = xdr_reserve_space(xdr, 12); - if (unlikely(p == NULL)) - return htonl(NFS4ERR_RESOURCE); - *p++ = htonl(1); - *p++ = bm[0]; - } else { - p = xdr_reserve_space(xdr, 8); - if (unlikely(p == NULL)) - return htonl(NFS4ERR_RESOURCE); - *p++ = htonl(0); - } - *savep = p; + if (xdr_stream_encode_uint32_array(xdr, bitmap, sz) < 0) + return cpu_to_be32(NFS4ERR_RESOURCE); return 0; } @@ -592,7 +628,7 @@ static __be32 encode_attr_size(struct xdr_stream *xdr, const uint32_t *bitmap, u return 0; } -static __be32 encode_attr_time(struct xdr_stream *xdr, const struct timespec *time) +static __be32 encode_attr_time(struct xdr_stream *xdr, const struct timespec64 *time) { __be32 *p; @@ -604,20 +640,45 @@ static __be32 encode_attr_time(struct xdr_stream *xdr, const struct timespec *ti return 0; } -static __be32 encode_attr_ctime(struct xdr_stream *xdr, const uint32_t *bitmap, const struct timespec *time) +static __be32 encode_attr_atime(struct xdr_stream *xdr, const uint32_t *bitmap, const struct timespec64 *time) +{ + if (!(bitmap[1] & FATTR4_WORD1_TIME_ACCESS)) + return 0; + return encode_attr_time(xdr,time); +} + +static __be32 encode_attr_ctime(struct xdr_stream *xdr, const uint32_t *bitmap, const struct timespec64 *time) { if (!(bitmap[1] & FATTR4_WORD1_TIME_METADATA)) return 0; return encode_attr_time(xdr,time); } -static __be32 encode_attr_mtime(struct xdr_stream *xdr, const uint32_t *bitmap, const struct timespec *time) +static __be32 encode_attr_mtime(struct xdr_stream *xdr, const uint32_t *bitmap, const struct timespec64 *time) { if (!(bitmap[1] & FATTR4_WORD1_TIME_MODIFY)) return 0; return encode_attr_time(xdr,time); } +static __be32 encode_attr_delegatime(struct xdr_stream *xdr, + const uint32_t *bitmap, + const struct timespec64 *time) +{ + if (!(bitmap[2] & FATTR4_WORD2_TIME_DELEG_ACCESS)) + return 0; + return encode_attr_time(xdr,time); +} + +static __be32 encode_attr_delegmtime(struct xdr_stream *xdr, + const uint32_t *bitmap, + const struct timespec64 *time) +{ + if (!(bitmap[2] & FATTR4_WORD2_TIME_DELEG_MODIFY)) + return 0; + return encode_attr_time(xdr,time); +} + static __be32 encode_compound_hdr_res(struct xdr_stream *xdr, struct cb_compound_hdr_res *hdr) { __be32 status; @@ -655,19 +716,32 @@ static __be32 encode_getattr_res(struct svc_rqst *rqstp, struct xdr_stream *xdr, if (unlikely(status != 0)) goto out; - status = encode_attr_bitmap(xdr, res->bitmap, &savep); + status = encode_attr_bitmap(xdr, res->bitmap, ARRAY_SIZE(res->bitmap)); if (unlikely(status != 0)) goto out; + status = cpu_to_be32(NFS4ERR_RESOURCE); + savep = xdr_reserve_space(xdr, sizeof(*savep)); + if (unlikely(!savep)) + goto out; status = encode_attr_change(xdr, res->bitmap, res->change_attr); if (unlikely(status != 0)) goto out; status = encode_attr_size(xdr, res->bitmap, res->size); if (unlikely(status != 0)) goto out; + status = encode_attr_atime(xdr, res->bitmap, &res->atime); + if (unlikely(status != 0)) + goto out; status = encode_attr_ctime(xdr, res->bitmap, &res->ctime); if (unlikely(status != 0)) goto out; status = encode_attr_mtime(xdr, res->bitmap, &res->mtime); + if (unlikely(status != 0)) + goto out; + status = encode_attr_delegatime(xdr, res->bitmap, &res->atime); + if (unlikely(status != 0)) + goto out; + status = encode_attr_delegmtime(xdr, res->bitmap, &res->mtime); *savep = htonl((unsigned int)((char *)xdr->p - (char *)(savep+1))); out: return status; @@ -793,7 +867,10 @@ preprocess_nfs42_op(int nop, unsigned int op_nr, struct callback_op **op) if (status != htonl(NFS4ERR_OP_ILLEGAL)) return status; - if (op_nr == OP_CB_OFFLOAD) + if (op_nr == OP_CB_OFFLOAD) { + *op = &callback_ops[op_nr]; + return htonl(NFS_OK); + } else return htonl(NFS4ERR_NOTSUPP); return htonl(NFS4ERR_OP_ILLEGAL); } @@ -821,17 +898,16 @@ preprocess_nfs4_op(unsigned int op_nr, struct callback_op **op) } static __be32 process_op(int nop, struct svc_rqst *rqstp, - struct xdr_stream *xdr_in, void *argp, - struct xdr_stream *xdr_out, void *resp, - struct cb_process_state *cps) + struct cb_process_state *cps) { + struct xdr_stream *xdr_out = &rqstp->rq_res_stream; struct callback_op *op = &callback_ops[0]; unsigned int op_nr; __be32 status; long maxlen; __be32 res; - status = decode_op_hdr(xdr_in, &op_nr); + status = decode_op_hdr(&rqstp->rq_arg_stream, &op_nr); if (unlikely(status)) return status; @@ -861,9 +937,11 @@ static __be32 process_op(int nop, struct svc_rqst *rqstp, maxlen = xdr_out->end - xdr_out->p; if (maxlen > 0 && maxlen < PAGE_SIZE) { - status = op->decode_args(rqstp, xdr_in, argp); + status = op->decode_args(rqstp, &rqstp->rq_arg_stream, + rqstp->rq_argp); if (likely(status == 0)) - status = op->process_op(argp, resp, cps); + status = op->process_op(rqstp->rq_argp, rqstp->rq_resp, + cps); } else status = htonl(NFS4ERR_RESOURCE); @@ -872,7 +950,7 @@ encode_hdr: if (unlikely(res)) return res; if (op->encode_res != NULL && status == 0) - status = op->encode_res(rqstp, xdr_out, resp); + status = op->encode_res(rqstp, xdr_out, rqstp->rq_resp); return status; } @@ -883,40 +961,42 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp) { struct cb_compound_hdr_arg hdr_arg = { 0 }; struct cb_compound_hdr_res hdr_res = { NULL }; - struct xdr_stream xdr_in, xdr_out; - __be32 *p, status; struct cb_process_state cps = { .drc_status = 0, .clp = NULL, .net = SVC_NET(rqstp), }; unsigned int nops = 0; + __be32 status; - xdr_init_decode(&xdr_in, &rqstp->rq_arg, rqstp->rq_arg.head[0].iov_base); - - p = (__be32*)((char *)rqstp->rq_res.head[0].iov_base + rqstp->rq_res.head[0].iov_len); - xdr_init_encode(&xdr_out, &rqstp->rq_res, p); - - status = decode_compound_hdr_arg(&xdr_in, &hdr_arg); + status = decode_compound_hdr_arg(&rqstp->rq_arg_stream, &hdr_arg); if (status == htonl(NFS4ERR_RESOURCE)) return rpc_garbage_args; if (hdr_arg.minorversion == 0) { cps.clp = nfs4_find_client_ident(SVC_NET(rqstp), hdr_arg.cb_ident); - if (!cps.clp || !check_gss_callback_principal(cps.clp, rqstp)) + if (!cps.clp) { + trace_nfs_cb_no_clp(rqstp->rq_xid, hdr_arg.cb_ident); + goto out_invalidcred; + } + if (!check_gss_callback_principal(cps.clp, rqstp)) { + trace_nfs_cb_badprinc(rqstp->rq_xid, hdr_arg.cb_ident); + nfs_put_client(cps.clp); goto out_invalidcred; + } + svc_xprt_set_valid(rqstp->rq_xprt); } cps.minorversion = hdr_arg.minorversion; hdr_res.taglen = hdr_arg.taglen; hdr_res.tag = hdr_arg.tag; - if (encode_compound_hdr_res(&xdr_out, &hdr_res) != 0) + if (encode_compound_hdr_res(&rqstp->rq_res_stream, &hdr_res) != 0) { + if (cps.clp) + nfs_put_client(cps.clp); return rpc_system_err; - + } while (status == 0 && nops != hdr_arg.nops) { - status = process_op(nops, rqstp, &xdr_in, - rqstp->rq_argp, &xdr_out, rqstp->rq_resp, - &cps); + status = process_op(nops, rqstp, &cps); nops++; } @@ -927,6 +1007,11 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp) nops--; } + if (svc_is_backchannel(rqstp) && cps.clp) { + rqstp->bc_to_initval = cps.clp->cl_rpcclient->cl_timeout->to_initval; + rqstp->bc_to_retries = cps.clp->cl_rpcclient->cl_timeout->to_retries; + } + *hdr_res.status = status; *hdr_res.nops = htonl(nops); nfs4_cb_free_slot(&cps); @@ -935,7 +1020,17 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp) out_invalidcred: pr_warn_ratelimited("NFS: NFSv4 callback contains invalid cred\n"); - return rpc_autherr_badcred; + rqstp->rq_auth_stat = rpc_autherr_badcred; + return rpc_success; +} + +static int +nfs_callback_dispatch(struct svc_rqst *rqstp) +{ + const struct svc_procedure *procp = rqstp->rq_procinfo; + + *rqstp->rq_accept_statp = procp->pc_func(rqstp); + return 1; } /* @@ -989,6 +1084,13 @@ static struct callback_op callback_ops[] = { .res_maxsize = CB_OP_NOTIFY_LOCK_RES_MAXSZ, }, #endif /* CONFIG_NFS_V4_1 */ +#ifdef CONFIG_NFS_V4_2 + [OP_CB_OFFLOAD] = { + .process_op = nfs4_callback_offload, + .decode_args = decode_offload_args, + .res_maxsize = CB_OP_OFFLOAD_RES_MAXSZ, + }, +#endif /* CONFIG_NFS_V4_2 */ }; /* @@ -997,39 +1099,43 @@ static struct callback_op callback_ops[] = { static const struct svc_procedure nfs4_callback_procedures1[] = { [CB_NULL] = { .pc_func = nfs4_callback_null, - .pc_decode = nfs4_decode_void, .pc_encode = nfs4_encode_void, .pc_xdrressize = 1, + .pc_name = "NULL", }, [CB_COMPOUND] = { .pc_func = nfs4_callback_compound, .pc_encode = nfs4_encode_void, .pc_argsize = 256, + .pc_argzero = 256, .pc_ressize = 256, .pc_xdrressize = NFS4_CALLBACK_BUFSIZE, + .pc_name = "COMPOUND", } }; -static unsigned int nfs4_callback_count1[ARRAY_SIZE(nfs4_callback_procedures1)]; +static DEFINE_PER_CPU_ALIGNED(unsigned long, + nfs4_callback_count1[ARRAY_SIZE(nfs4_callback_procedures1)]); const struct svc_version nfs4_callback_version1 = { .vs_vers = 1, .vs_nproc = ARRAY_SIZE(nfs4_callback_procedures1), .vs_proc = nfs4_callback_procedures1, .vs_count = nfs4_callback_count1, .vs_xdrsize = NFS4_CALLBACK_XDRSIZE, - .vs_dispatch = NULL, + .vs_dispatch = nfs_callback_dispatch, .vs_hidden = true, .vs_need_cong_ctrl = true, }; -static unsigned int nfs4_callback_count4[ARRAY_SIZE(nfs4_callback_procedures1)]; +static DEFINE_PER_CPU_ALIGNED(unsigned long, + nfs4_callback_count4[ARRAY_SIZE(nfs4_callback_procedures1)]); const struct svc_version nfs4_callback_version4 = { .vs_vers = 4, .vs_nproc = ARRAY_SIZE(nfs4_callback_procedures1), .vs_proc = nfs4_callback_procedures1, .vs_count = nfs4_callback_count4, .vs_xdrsize = NFS4_CALLBACK_XDRSIZE, - .vs_dispatch = NULL, + .vs_dispatch = nfs_callback_dispatch, .vs_hidden = true, .vs_need_cong_ctrl = true, }; diff --git a/fs/nfs/client.c b/fs/nfs/client.c index efebe6cf4378..54699299d5b1 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -1,12 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* client.c: NFS client sharing and management code * * Copyright (C) 2006 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. */ @@ -42,7 +38,7 @@ #include <linux/sunrpc/bc_xprt.h> #include <linux/nsproxy.h> #include <linux/pid_namespace.h> - +#include <linux/nfslocalio.h> #include "nfs4_fs.h" #include "callback.h" @@ -53,13 +49,19 @@ #include "pnfs.h" #include "nfs.h" #include "netns.h" +#include "sysfs.h" +#include "nfs42.h" #define NFSDBG_FACILITY NFSDBG_CLIENT static DECLARE_WAIT_QUEUE_HEAD(nfs_client_active_wq); -static DEFINE_SPINLOCK(nfs_version_lock); -static DEFINE_MUTEX(nfs_version_mutex); -static LIST_HEAD(nfs_versions); +static DEFINE_RWLOCK(nfs_version_lock); + +static struct nfs_subversion *nfs_version_mods[5] = { + [2] = NULL, + [3] = NULL, + [4] = NULL, +}; /* * RPC cruft for NFS @@ -75,46 +77,41 @@ const struct rpc_program nfs_program = { .number = NFS_PROGRAM, .nrvers = ARRAY_SIZE(nfs_version), .version = nfs_version, - .stats = &nfs_rpcstat, .pipe_dir_name = NFS_PIPE_DIRNAME, }; -struct rpc_stat nfs_rpcstat = { - .program = &nfs_program -}; - -static struct nfs_subversion *find_nfs_version(unsigned int version) +static struct nfs_subversion *__find_nfs_version(unsigned int version) { struct nfs_subversion *nfs; - spin_lock(&nfs_version_lock); - list_for_each_entry(nfs, &nfs_versions, list) { - if (nfs->rpc_ops->version == version) { - spin_unlock(&nfs_version_lock); - return nfs; - } - } - - spin_unlock(&nfs_version_lock); - return ERR_PTR(-EPROTONOSUPPORT); + read_lock(&nfs_version_lock); + nfs = nfs_version_mods[version]; + read_unlock(&nfs_version_lock); + return nfs; } -struct nfs_subversion *get_nfs_version(unsigned int version) +struct nfs_subversion *find_nfs_version(unsigned int version) { - struct nfs_subversion *nfs = find_nfs_version(version); + struct nfs_subversion *nfs = __find_nfs_version(version); - if (IS_ERR(nfs)) { - mutex_lock(&nfs_version_mutex); - request_module("nfsv%d", version); - nfs = find_nfs_version(version); - mutex_unlock(&nfs_version_mutex); - } + if (!nfs && request_module("nfsv%d", version) == 0) + nfs = __find_nfs_version(version); - if (!IS_ERR(nfs) && !try_module_get(nfs->owner)) + if (!nfs) + return ERR_PTR(-EPROTONOSUPPORT); + + if (!get_nfs_version(nfs)) return ERR_PTR(-EAGAIN); + return nfs; } +int get_nfs_version(struct nfs_subversion *nfs) +{ + return try_module_get(nfs->owner); +} +EXPORT_SYMBOL_GPL(get_nfs_version); + void put_nfs_version(struct nfs_subversion *nfs) { module_put(nfs->owner); @@ -122,23 +119,23 @@ void put_nfs_version(struct nfs_subversion *nfs) void register_nfs_version(struct nfs_subversion *nfs) { - spin_lock(&nfs_version_lock); + write_lock(&nfs_version_lock); - list_add(&nfs->list, &nfs_versions); + nfs_version_mods[nfs->rpc_ops->version] = nfs; nfs_version[nfs->rpc_ops->version] = nfs->rpc_vers; - spin_unlock(&nfs_version_lock); + write_unlock(&nfs_version_lock); } EXPORT_SYMBOL_GPL(register_nfs_version); void unregister_nfs_version(struct nfs_subversion *nfs) { - spin_lock(&nfs_version_lock); + write_lock(&nfs_version_lock); nfs_version[nfs->rpc_ops->version] = NULL; - list_del(&nfs->list); + nfs_version_mods[nfs->rpc_ops->version] = NULL; - spin_unlock(&nfs_version_lock); + write_unlock(&nfs_version_lock); } EXPORT_SYMBOL_GPL(unregister_nfs_version); @@ -151,19 +148,19 @@ EXPORT_SYMBOL_GPL(unregister_nfs_version); struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_init) { struct nfs_client *clp; - struct rpc_cred *cred; int err = -ENOMEM; if ((clp = kzalloc(sizeof(*clp), GFP_KERNEL)) == NULL) goto error_0; + clp->cl_minorversion = cl_init->minorversion; clp->cl_nfs_mod = cl_init->nfs_mod; - if (!try_module_get(clp->cl_nfs_mod->owner)) + if (!get_nfs_version(clp->cl_nfs_mod)) goto error_dealloc; clp->rpc_ops = clp->cl_nfs_mod->rpc_ops; - atomic_set(&clp->cl_count, 1); + refcount_set(&clp->cl_count, 1); clp->cl_cons_state = NFS_CS_INITING; memcpy(&clp->cl_addr, cl_init->addr, cl_init->addrlen); @@ -179,14 +176,21 @@ struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_init) INIT_LIST_HEAD(&clp->cl_superblocks); clp->cl_rpcclient = ERR_PTR(-EINVAL); + clp->cl_flags = cl_init->init_flags; clp->cl_proto = cl_init->proto; - clp->cl_net = get_net(cl_init->net); - - cred = rpc_lookup_machine_cred("*"); - if (!IS_ERR(cred)) - clp->cl_machine_cred = cred; - nfs_fscache_get_client_cookie(clp); - + clp->cl_nconnect = cl_init->nconnect; + clp->cl_max_connect = cl_init->max_connect ? cl_init->max_connect : 1; + clp->cl_net = get_net_track(cl_init->net, &clp->cl_ns_tracker, GFP_KERNEL); + +#if IS_ENABLED(CONFIG_NFS_LOCALIO) + seqlock_init(&clp->cl_boot_lock); + ktime_get_real_ts64(&clp->cl_nfssvc_boot); + nfs_uuid_init(&clp->cl_uuid); + INIT_WORK(&clp->cl_local_probe_work, nfs_local_probe_async_work); +#endif /* CONFIG_NFS_LOCALIO */ + + clp->cl_principal = "*"; + clp->cl_xprtsec = cl_init->xprtsec; return clp; error_cleanup: @@ -199,7 +203,7 @@ error_0: EXPORT_SYMBOL_GPL(nfs_alloc_client); #if IS_ENABLED(CONFIG_NFS_V4) -void nfs_cleanup_cb_ident_idr(struct net *net) +static void nfs_cleanup_cb_ident_idr(struct net *net) { struct nfs_net *nn = net_generic(net, nfs_net_id); @@ -218,11 +222,10 @@ static void nfs_cb_idr_remove_locked(struct nfs_client *clp) static void pnfs_init_server(struct nfs_server *server) { rpc_init_wait_queue(&server->roc_rpcwaitq, "pNFS ROC"); - rpc_init_wait_queue(&server->uoc_rpcwaitq, "NFS UOC"); } #else -void nfs_cleanup_cb_ident_idr(struct net *net) +static void nfs_cleanup_cb_ident_idr(struct net *net) { } @@ -241,20 +244,17 @@ static void pnfs_init_server(struct nfs_server *server) */ void nfs_free_client(struct nfs_client *clp) { - nfs_fscache_release_client_cookie(clp); + nfs_localio_disable_client(clp); /* -EIO all pending I/O */ if (!IS_ERR(clp->cl_rpcclient)) rpc_shutdown_client(clp->cl_rpcclient); - if (clp->cl_machine_cred != NULL) - put_rpccred(clp->cl_machine_cred); - - put_net(clp->cl_net); + put_net_track(clp->cl_net, &clp->cl_ns_tracker); put_nfs_version(clp->cl_nfs_mod); kfree(clp->cl_hostname); kfree(clp->cl_acceptor); - kfree(clp); + kfree_rcu(clp, rcu); } EXPORT_SYMBOL_GPL(nfs_free_client); @@ -270,7 +270,7 @@ void nfs_put_client(struct nfs_client *clp) nn = net_generic(clp->cl_net, nfs_net_id); - if (atomic_dec_and_lock(&clp->cl_count, &nn->nfs_client_lock)) { + if (refcount_dec_and_lock(&clp->cl_count, &nn->nfs_client_lock)) { list_del(&clp->cl_share_link); nfs_cb_idr_remove_locked(clp); spin_unlock(&nn->nfs_client_lock); @@ -289,15 +289,29 @@ EXPORT_SYMBOL_GPL(nfs_put_client); static struct nfs_client *nfs_match_client(const struct nfs_client_initdata *data) { struct nfs_client *clp; - const struct sockaddr *sap = data->addr; + const struct sockaddr *sap = (struct sockaddr *)data->addr; struct nfs_net *nn = net_generic(data->net, nfs_net_id); + int error; +again: list_for_each_entry(clp, &nn->nfs_client_list, cl_share_link) { const struct sockaddr *clap = (struct sockaddr *)&clp->cl_addr; /* Don't match clients that failed to initialise properly */ if (clp->cl_cons_state < 0) continue; + /* If a client is still initializing then we need to wait */ + if (clp->cl_cons_state > NFS_CS_READY) { + refcount_inc(&clp->cl_count); + spin_unlock(&nn->nfs_client_lock); + error = nfs_wait_client_init_complete(clp); + nfs_put_client(clp); + spin_lock(&nn->nfs_client_lock); + if (error < 0) + return ERR_PTR(error); + goto again; + } + /* Different NFS versions cannot share the same nfs_client */ if (clp->rpc_ops != data->nfs_mod->rpc_ops) continue; @@ -307,6 +321,12 @@ static struct nfs_client *nfs_match_client(const struct nfs_client_initdata *dat /* Match nfsv4 minorversion */ if (clp->cl_minorversion != data->minorversion) continue; + + /* Match request for a dedicated DS */ + if (test_bit(NFS_CS_DS, &data->init_flags) != + test_bit(NFS_CS_DS, &clp->cl_flags)) + continue; + /* Match the full socket address */ if (!rpc_cmp_addr_port(sap, clap)) /* Match all xprt_switch full socket addresses */ @@ -315,7 +335,19 @@ static struct nfs_client *nfs_match_client(const struct nfs_client_initdata *dat sap)) continue; - atomic_inc(&clp->cl_count); + /* Match the xprt security policy */ + if (clp->cl_xprtsec.policy != data->xprtsec.policy) + continue; + if (clp->cl_xprtsec.policy == RPC_XPRTSEC_TLS_X509) { + if (clp->cl_xprtsec.cert_serial != + data->xprtsec.cert_serial) + continue; + if (clp->cl_xprtsec.privkey_serial != + data->xprtsec.privkey_serial) + continue; + } + + refcount_inc(&clp->cl_count); return clp; } return NULL; @@ -393,7 +425,7 @@ struct nfs_client *nfs_get_client(const struct nfs_client_initdata *cl_init) if (cl_init->hostname == NULL) { WARN_ON(1); - return NULL; + return ERR_PTR(-EINVAL); } /* see if the client already exists */ @@ -405,14 +437,18 @@ struct nfs_client *nfs_get_client(const struct nfs_client_initdata *cl_init) spin_unlock(&nn->nfs_client_lock); if (new) new->rpc_ops->free_client(new); + if (IS_ERR(clp)) + return clp; return nfs_found_client(cl_init, clp); } if (new) { list_add_tail(&new->cl_share_link, &nn->nfs_client_list); spin_unlock(&nn->nfs_client_lock); - new->cl_flags = cl_init->init_flags; - return rpc_ops->init_client(new, cl_init); + new = rpc_ops->init_client(new, cl_init); + if (!IS_ERR(new)) + nfs_local_probe_async(new); + return new; } spin_unlock(&nn->nfs_client_lock); @@ -446,10 +482,11 @@ void nfs_init_timeout_values(struct rpc_timeout *to, int proto, switch (proto) { case XPRT_TRANSPORT_TCP: + case XPRT_TRANSPORT_TCP_TLS: case XPRT_TRANSPORT_RDMA: if (retrans == NFS_UNSPEC_RETRANS) to->to_retries = NFS_DEF_TCP_RETRANS; - if (timeo == NFS_UNSPEC_TIMEO || to->to_retries == 0) + if (timeo == NFS_UNSPEC_TIMEO || to->to_initval == 0) to->to_initval = NFS_DEF_TCP_TIMEO * HZ / 10; if (to->to_initval > NFS_MAX_TCP_TIMEOUT) to->to_initval = NFS_MAX_TCP_TIMEOUT; @@ -484,18 +521,25 @@ int nfs_create_rpc_client(struct nfs_client *clp, const struct nfs_client_initdata *cl_init, rpc_authflavor_t flavor) { + struct nfs_net *nn = net_generic(clp->cl_net, nfs_net_id); struct rpc_clnt *clnt = NULL; struct rpc_create_args args = { .net = clp->cl_net, .protocol = clp->cl_proto, + .nconnect = clp->cl_nconnect, .address = (struct sockaddr *)&clp->cl_addr, .addrsize = clp->cl_addrlen, .timeout = cl_init->timeparms, .servername = clp->cl_hostname, .nodename = cl_init->nodename, .program = &nfs_program, + .stats = &nn->rpcstats, .version = clp->rpc_ops->version, .authflavor = flavor, + .cred = cl_init->cred, + .xprtsec = cl_init->xprtsec, + .connect_timeout = cl_init->connect_timeout, + .reconnect_timeout = cl_init->reconnect_timeout, }; if (test_bit(NFS_CS_DISCRTRY, &clp->cl_flags)) @@ -506,6 +550,12 @@ int nfs_create_rpc_client(struct nfs_client *clp, args.flags |= RPC_CLNT_CREATE_NONPRIVPORT; if (test_bit(NFS_CS_INFINITE_SLOTS, &clp->cl_flags)) args.flags |= RPC_CLNT_CREATE_INFINITE_SLOTS; + if (test_bit(NFS_CS_NOPING, &clp->cl_flags)) + args.flags |= RPC_CLNT_CREATE_NOPING; + if (test_bit(NFS_CS_REUSEPORT, &clp->cl_flags)) + args.flags |= RPC_CLNT_CREATE_REUSEPORT; + if (test_bit(NFS_CS_NETUNREACH_FATAL, &clp->cl_flags)) + args.flags |= RPC_CLNT_CREATE_NETUNREACH_FATAL; if (!IS_ERR(clp->cl_rpcclient)) return 0; @@ -517,7 +567,9 @@ int nfs_create_rpc_client(struct nfs_client *clp, return PTR_ERR(clnt); } + clnt->cl_principal = clp->cl_principal; clp->cl_rpcclient = clnt; + clnt->cl_max_connect = clp->cl_max_connect; return 0; } EXPORT_SYMBOL_GPL(nfs_create_rpc_client); @@ -547,6 +599,7 @@ static int nfs_start_lockd(struct nfs_server *server) 1 : 0, .net = clp->cl_net, .nlmclnt_ops = clp->cl_nfs_mod->rpc_ops->nlmclnt_ops, + .cred = server->cred, }; if (nlm_init.nfs_version > 3) @@ -559,8 +612,10 @@ static int nfs_start_lockd(struct nfs_server *server) default: nlm_init.protocol = IPPROTO_TCP; break; +#ifndef CONFIG_NFS_DISABLE_UDP_SUPPORT case XPRT_TRANSPORT_UDP: nlm_init.protocol = IPPROTO_UDP; +#endif } host = nlmclnt_init(&nlm_init); @@ -569,6 +624,7 @@ static int nfs_start_lockd(struct nfs_server *server) server->nlm_host = host; server->destroy = nfs_destroy_server; + nfs_sysfs_link_rpc_client(server, nlmclnt_rpc_clnt(host), NULL); return 0; } @@ -593,9 +649,12 @@ int nfs_init_server_rpcclient(struct nfs_server *server, sizeof(server->client->cl_timeout_default)); server->client->cl_timeout = &server->client->cl_timeout_default; server->client->cl_softrtry = 0; + if (server->flags & NFS_MOUNT_SOFTERR) + server->client->cl_softerr = 1; if (server->flags & NFS_MOUNT_SOFT) server->client->cl_softrtry = 1; + nfs_sysfs_link_rpc_client(server, server->client, NULL); return 0; } EXPORT_SYMBOL_GPL(nfs_init_server_rpcclient); @@ -631,79 +690,136 @@ struct nfs_client *nfs_init_client(struct nfs_client *clp, } EXPORT_SYMBOL_GPL(nfs_init_client); +static void nfs4_server_set_init_caps(struct nfs_server *server) +{ +#if IS_ENABLED(CONFIG_NFS_V4) + /* Set the basic capabilities */ + server->caps = server->nfs_client->cl_mvops->init_caps; + if (server->flags & NFS_MOUNT_NORDIRPLUS) + server->caps &= ~NFS_CAP_READDIRPLUS; + if (server->nfs_client->cl_proto == XPRT_TRANSPORT_RDMA) + server->caps &= ~NFS_CAP_READ_PLUS; + + /* + * Don't use NFS uid/gid mapping if we're using AUTH_SYS or lower + * authentication. + */ + if (nfs4_disable_idmapping && + server->client->cl_auth->au_flavor == RPC_AUTH_UNIX) + server->caps |= NFS_CAP_UIDGID_NOMAP; +#endif +} + +void nfs_server_set_init_caps(struct nfs_server *server) +{ + switch (server->nfs_client->rpc_ops->version) { + case 2: + server->caps = NFS_CAP_HARDLINKS | NFS_CAP_SYMLINKS; + break; + case 3: + server->caps = NFS_CAP_HARDLINKS | NFS_CAP_SYMLINKS; + if (!(server->flags & NFS_MOUNT_NORDIRPLUS)) + server->caps |= NFS_CAP_READDIRPLUS; + break; + default: + nfs4_server_set_init_caps(server); + break; + } +} +EXPORT_SYMBOL_GPL(nfs_server_set_init_caps); + /* * Create a version 2 or 3 client */ static int nfs_init_server(struct nfs_server *server, - const struct nfs_parsed_mount_data *data, - struct nfs_subversion *nfs_mod) + const struct fs_context *fc) { + const struct nfs_fs_context *ctx = nfs_fc2context(fc); struct rpc_timeout timeparms; struct nfs_client_initdata cl_init = { - .hostname = data->nfs_server.hostname, - .addr = (const struct sockaddr *)&data->nfs_server.address, - .addrlen = data->nfs_server.addrlen, - .nfs_mod = nfs_mod, - .proto = data->nfs_server.protocol, - .net = data->net, + .hostname = ctx->nfs_server.hostname, + .addr = &ctx->nfs_server._address, + .addrlen = ctx->nfs_server.addrlen, + .nfs_mod = ctx->nfs_mod, + .proto = ctx->nfs_server.protocol, + .net = fc->net_ns, .timeparms = &timeparms, + .cred = server->cred, + .nconnect = ctx->nfs_server.nconnect, + .init_flags = (1UL << NFS_CS_REUSEPORT), + .xprtsec = ctx->xprtsec, }; struct nfs_client *clp; int error; - nfs_init_timeout_values(&timeparms, data->nfs_server.protocol, - data->timeo, data->retrans); - if (data->flags & NFS_MOUNT_NORESVPORT) + nfs_init_timeout_values(&timeparms, ctx->nfs_server.protocol, + ctx->timeo, ctx->retrans); + if (ctx->flags & NFS_MOUNT_NORESVPORT) set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags); + if (ctx->flags & NFS_MOUNT_NETUNREACH_FATAL) + __set_bit(NFS_CS_NETUNREACH_FATAL, &cl_init.init_flags); + /* Allocate or find a client reference we can use */ clp = nfs_get_client(&cl_init); if (IS_ERR(clp)) return PTR_ERR(clp); server->nfs_client = clp; + nfs_sysfs_add_server(server); + nfs_sysfs_link_rpc_client(server, clp->cl_rpcclient, "_state"); /* Initialise the client representation from the mount data */ - server->flags = data->flags; - server->options = data->options; - server->caps |= NFS_CAP_HARDLINKS|NFS_CAP_SYMLINKS|NFS_CAP_FILEID| - NFS_CAP_MODE|NFS_CAP_NLINK|NFS_CAP_OWNER|NFS_CAP_OWNER_GROUP| - NFS_CAP_ATIME|NFS_CAP_CTIME|NFS_CAP_MTIME; - - if (data->rsize) - server->rsize = nfs_block_size(data->rsize, NULL); - if (data->wsize) - server->wsize = nfs_block_size(data->wsize, NULL); - - server->acregmin = data->acregmin * HZ; - server->acregmax = data->acregmax * HZ; - server->acdirmin = data->acdirmin * HZ; - server->acdirmax = data->acdirmax * HZ; + server->flags = ctx->flags; + server->options = ctx->options; + + switch (clp->rpc_ops->version) { + case 2: + server->fattr_valid = NFS_ATTR_FATTR_V2; + break; + case 3: + server->fattr_valid = NFS_ATTR_FATTR_V3; + break; + default: + server->fattr_valid = NFS_ATTR_FATTR_V4; + } + + if (ctx->rsize) + server->rsize = nfs_io_size(ctx->rsize, clp->cl_proto); + if (ctx->wsize) + server->wsize = nfs_io_size(ctx->wsize, clp->cl_proto); + + server->acregmin = ctx->acregmin * HZ; + server->acregmax = ctx->acregmax * HZ; + server->acdirmin = ctx->acdirmin * HZ; + server->acdirmax = ctx->acdirmax * HZ; /* Start lockd here, before we might error out */ error = nfs_start_lockd(server); if (error < 0) goto error; - server->port = data->nfs_server.port; - server->auth_info = data->auth_info; + server->port = ctx->nfs_server.port; + server->auth_info = ctx->auth_info; error = nfs_init_server_rpcclient(server, &timeparms, - data->selected_flavor); + ctx->selected_flavor); if (error < 0) goto error; + nfs_server_set_init_caps(server); + /* Preserve the values of mount_server-related mount options */ - if (data->mount_server.addrlen) { - memcpy(&server->mountd_address, &data->mount_server.address, - data->mount_server.addrlen); - server->mountd_addrlen = data->mount_server.addrlen; + if (ctx->mount_server.addrlen) { + memcpy(&server->mountd_address, &ctx->mount_server.address, + ctx->mount_server.addrlen); + server->mountd_addrlen = ctx->mount_server.addrlen; } - server->mountd_version = data->mount_server.version; - server->mountd_port = data->mount_server.port; - server->mountd_protocol = data->mount_server.protocol; + server->mountd_version = ctx->mount_server.version; + server->mountd_port = ctx->mount_server.port; + server->mountd_protocol = ctx->mount_server.protocol; - server->namelen = data->namlen; + server->namelen = ctx->namlen; return 0; error: @@ -718,20 +834,23 @@ error: static void nfs_server_set_fsinfo(struct nfs_server *server, struct nfs_fsinfo *fsinfo) { - unsigned long max_rpc_payload; + struct nfs_client *clp = server->nfs_client; + unsigned long max_rpc_payload, raw_max_rpc_payload; /* Work out a lot of parameters */ if (server->rsize == 0) - server->rsize = nfs_block_size(fsinfo->rtpref, NULL); + server->rsize = nfs_io_size(fsinfo->rtpref, clp->cl_proto); if (server->wsize == 0) - server->wsize = nfs_block_size(fsinfo->wtpref, NULL); + server->wsize = nfs_io_size(fsinfo->wtpref, clp->cl_proto); if (fsinfo->rtmax >= 512 && server->rsize > fsinfo->rtmax) - server->rsize = nfs_block_size(fsinfo->rtmax, NULL); + server->rsize = nfs_io_size(fsinfo->rtmax, clp->cl_proto); if (fsinfo->wtmax >= 512 && server->wsize > fsinfo->wtmax) - server->wsize = nfs_block_size(fsinfo->wtmax, NULL); + server->wsize = nfs_io_size(fsinfo->wtmax, clp->cl_proto); + + raw_max_rpc_payload = rpc_max_payload(server->client); + max_rpc_payload = nfs_block_size(raw_max_rpc_payload, NULL); - max_rpc_payload = nfs_block_size(rpc_max_payload(server->client), NULL); if (server->rsize > max_rpc_payload) server->rsize = max_rpc_payload; if (server->rsize > NFS_MAX_FILE_IO_SIZE) @@ -742,13 +861,12 @@ static void nfs_server_set_fsinfo(struct nfs_server *server, server->wsize = max_rpc_payload; if (server->wsize > NFS_MAX_FILE_IO_SIZE) server->wsize = NFS_MAX_FILE_IO_SIZE; - server->wpages = (server->wsize + PAGE_SIZE - 1) >> PAGE_SHIFT; server->wtmult = nfs_block_bits(fsinfo->wtmult, NULL); server->dtsize = nfs_block_size(fsinfo->dtpref, NULL); - if (server->dtsize > PAGE_SIZE * NFS_MAX_READDIR_PAGES) - server->dtsize = PAGE_SIZE * NFS_MAX_READDIR_PAGES; + if (server->dtsize > NFS_MAX_FILE_IO_SIZE) + server->dtsize = NFS_MAX_FILE_IO_SIZE; if (server->dtsize > server->rsize) server->dtsize = server->rsize; @@ -759,17 +877,34 @@ static void nfs_server_set_fsinfo(struct nfs_server *server, server->maxfilesize = fsinfo->maxfilesize; - server->time_delta = fsinfo->time_delta; + server->change_attr_type = fsinfo->change_attr_type; server->clone_blksize = fsinfo->clone_blksize; /* We're airborne Set socket buffersize */ rpc_setbufsize(server->client, server->wsize + 100, server->rsize + 100); + +#ifdef CONFIG_NFS_V4_2 + /* + * Defaults until limited by the session parameters. + */ + server->gxasize = min_t(unsigned int, raw_max_rpc_payload, + XATTR_SIZE_MAX); + server->sxasize = min_t(unsigned int, raw_max_rpc_payload, + XATTR_SIZE_MAX); + server->lxasize = min_t(unsigned int, raw_max_rpc_payload, + nfs42_listxattr_xdrsize(XATTR_LIST_MAX)); + + if (fsinfo->xattr_support) + server->caps |= NFS_CAP_XATTR; + else + server->caps &= ~NFS_CAP_XATTR; +#endif } /* * Probe filesystem information, including the FSID on v2/v3 */ -int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *mntfh, struct nfs_fattr *fattr) +static int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *mntfh, struct nfs_fattr *fattr) { struct nfs_fsinfo fsinfo; struct nfs_client *clp = server->nfs_client; @@ -801,9 +936,40 @@ int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *mntfh, struct nfs server->namelen = pathinfo.max_namelen; } + if (clp->rpc_ops->discover_trunking != NULL && + (server->caps & NFS_CAP_FS_LOCATIONS && + (server->flags & NFS_MOUNT_TRUNK_DISCOVERY))) { + error = clp->rpc_ops->discover_trunking(server, mntfh); + if (error < 0) + return error; + } + return 0; } -EXPORT_SYMBOL_GPL(nfs_probe_fsinfo); + +/* + * Grab the destination's particulars, including lease expiry time. + * + * Returns zero if probe succeeded and retrieved FSID matches the FSID + * we have cached. + */ +int nfs_probe_server(struct nfs_server *server, struct nfs_fh *mntfh) +{ + struct nfs_fattr *fattr; + int error; + + fattr = nfs_alloc_fattr(); + if (fattr == NULL) + return -ENOMEM; + + /* Sanity: the probe won't work if the destination server + * does not recognize the migrated FH. */ + error = nfs_probe_fsinfo(server, mntfh, fattr); + + nfs_free_fattr(fattr); + return error; +} +EXPORT_SYMBOL_GPL(nfs_probe_server); /* * Copy useful information when duplicating a server record @@ -817,7 +983,6 @@ void nfs_server_copy_userdata(struct nfs_server *target, struct nfs_server *sour target->acregmax = source->acregmax; target->acdirmin = source->acdirmin; target->acdirmax = source->acdirmax; - target->caps = source->caps; target->options = source->options; target->auth_info = source->auth_info; target->port = source->port; @@ -857,6 +1022,8 @@ void nfs_server_remove_lists(struct nfs_server *server) } EXPORT_SYMBOL_GPL(nfs_server_remove_lists); +static DEFINE_IDA(s_sysfs_ids); + /* * Allocate and initialise a server record */ @@ -868,6 +1035,12 @@ struct nfs_server *nfs_alloc_server(void) if (!server) return NULL; + server->s_sysfs_id = ida_alloc(&s_sysfs_ids, GFP_KERNEL); + if (server->s_sysfs_id < 0) { + kfree(server); + return NULL; + } + server->client = server->client_acl = ERR_PTR(-EINVAL); /* Zero out the NFS state stuff */ @@ -876,8 +1049,11 @@ struct nfs_server *nfs_alloc_server(void) INIT_LIST_HEAD(&server->delegations); INIT_LIST_HEAD(&server->layouts); INIT_LIST_HEAD(&server->state_owners_lru); + INIT_LIST_HEAD(&server->ss_copies); + INIT_LIST_HEAD(&server->ss_src_copies); atomic_set(&server->active, 0); + atomic_long_set(&server->nr_active_delegations, 0); server->io_stats = nfs_alloc_iostats(); if (!server->io_stats) { @@ -885,14 +1061,28 @@ struct nfs_server *nfs_alloc_server(void) return NULL; } - ida_init(&server->openowner_id); - ida_init(&server->lockowner_id); + server->change_attr_type = NFS4_CHANGE_TYPE_IS_UNDEFINED; + + init_waitqueue_head(&server->write_congestion_wait); + atomic_long_set(&server->writeback, 0); + + atomic64_set(&server->owner_ctr, 0); + pnfs_init_server(server); + rpc_init_wait_queue(&server->uoc_rpcwaitq, "NFS UOC"); return server; } EXPORT_SYMBOL_GPL(nfs_alloc_server); +static void delayed_free(struct rcu_head *p) +{ + struct nfs_server *server = container_of(p, struct nfs_server, rcu); + + nfs_free_iostats(server->io_stats); + kfree(server); +} + /* * Free up a server record */ @@ -910,11 +1100,15 @@ void nfs_free_server(struct nfs_server *server) nfs_put_client(server->nfs_client); - ida_destroy(&server->lockowner_id); - ida_destroy(&server->openowner_id); - nfs_free_iostats(server->io_stats); - kfree(server); + if (server->kobj.state_initialized) { + nfs_sysfs_remove_server(server); + kobject_put(&server->kobj); + } + ida_free(&s_sysfs_ids, server->s_sysfs_id); + + put_cred(server->cred); nfs_release_automount_timer(); + call_rcu(&server->rcu, delayed_free); } EXPORT_SYMBOL_GPL(nfs_free_server); @@ -922,9 +1116,9 @@ EXPORT_SYMBOL_GPL(nfs_free_server); * Create a version 2 or 3 volume record * - keyed on server and FSID */ -struct nfs_server *nfs_create_server(struct nfs_mount_info *mount_info, - struct nfs_subversion *nfs_mod) +struct nfs_server *nfs_create_server(struct fs_context *fc) { + struct nfs_fs_context *ctx = nfs_fc2context(fc); struct nfs_server *server; struct nfs_fattr *fattr; int error; @@ -933,32 +1127,37 @@ struct nfs_server *nfs_create_server(struct nfs_mount_info *mount_info, if (!server) return ERR_PTR(-ENOMEM); + server->cred = get_cred(fc->cred); + error = -ENOMEM; fattr = nfs_alloc_fattr(); if (fattr == NULL) goto error; /* Get a client representation */ - error = nfs_init_server(server, mount_info->parsed, nfs_mod); + error = nfs_init_server(server, fc); if (error < 0) goto error; /* Probe the root fh to retrieve its FSID */ - error = nfs_probe_fsinfo(server, mount_info->mntfh, fattr); + error = nfs_probe_fsinfo(server, ctx->mntfh, fattr); if (error < 0) goto error; if (server->nfs_client->rpc_ops->version == 3) { if (server->namelen == 0 || server->namelen > NFS3_MAXNAMLEN) server->namelen = NFS3_MAXNAMLEN; - if (!(mount_info->parsed->flags & NFS_MOUNT_NORDIRPLUS)) + if (!(ctx->flags & NFS_MOUNT_NORDIRPLUS)) server->caps |= NFS_CAP_READDIRPLUS; } else { if (server->namelen == 0 || server->namelen > NFS2_MAXNAMLEN) server->namelen = NFS2_MAXNAMLEN; } + /* Linux 'subtree_check' borkenness mandates this setting */ + server->fh_expire_type = NFS_FH_VOL_RENAME; if (!(fattr->valid & NFS_ATTR_FATTR)) { - error = nfs_mod->rpc_ops->getattr(server, mount_info->mntfh, fattr, NULL); + error = ctx->nfs_mod->rpc_ops->getattr(server, ctx->mntfh, + fattr, NULL); if (error < 0) { dprintk("nfs_create_server: getattr error = %d\n", -error); goto error; @@ -991,34 +1190,37 @@ struct nfs_server *nfs_clone_server(struct nfs_server *source, rpc_authflavor_t flavor) { struct nfs_server *server; - struct nfs_fattr *fattr_fsinfo; int error; server = nfs_alloc_server(); if (!server) return ERR_PTR(-ENOMEM); - error = -ENOMEM; - fattr_fsinfo = nfs_alloc_fattr(); - if (fattr_fsinfo == NULL) - goto out_free_server; + server->cred = get_cred(source->cred); /* Copy data from the source */ server->nfs_client = source->nfs_client; server->destroy = source->destroy; - atomic_inc(&server->nfs_client->cl_count); + refcount_inc(&server->nfs_client->cl_count); nfs_server_copy_userdata(server, source); server->fsid = fattr->fsid; + nfs_sysfs_add_server(server); + + nfs_sysfs_link_rpc_client(server, + server->nfs_client->cl_rpcclient, "_state"); + error = nfs_init_server_rpcclient(server, source->client->cl_timeout, flavor); if (error < 0) goto out_free_server; + nfs_server_set_init_caps(server); + /* probe the filesystem info for this server filesystem */ - error = nfs_probe_fsinfo(server, fh, fattr_fsinfo); + error = nfs_probe_server(server, fh); if (error < 0) goto out_free_server; @@ -1032,11 +1234,9 @@ struct nfs_server *nfs_clone_server(struct nfs_server *source, nfs_server_insert_lists(server); server->mount_time = jiffies; - nfs_free_fattr(fattr_fsinfo); return server; out_free_server: - nfs_free_fattr(fattr_fsinfo); nfs_free_server(server); return ERR_PTR(error); } @@ -1051,12 +1251,32 @@ void nfs_clients_init(struct net *net) #if IS_ENABLED(CONFIG_NFS_V4) idr_init(&nn->cb_ident_idr); #endif +#if IS_ENABLED(CONFIG_NFS_V4_1) + INIT_LIST_HEAD(&nn->nfs4_data_server_cache); + spin_lock_init(&nn->nfs4_data_server_lock); +#endif spin_lock_init(&nn->nfs_client_lock); nn->boot_time = ktime_get_real(); + memset(&nn->rpcstats, 0, sizeof(nn->rpcstats)); + nn->rpcstats.program = &nfs_program; + + nfs_netns_sysfs_setup(nn, net); +} + +void nfs_clients_exit(struct net *net) +{ + struct nfs_net *nn = net_generic(net, nfs_net_id); + + nfs_netns_sysfs_destroy(nn); + nfs_cleanup_cb_ident_idr(net); + WARN_ON_ONCE(!list_empty(&nn->nfs_client_list)); + WARN_ON_ONCE(!list_empty(&nn->nfs_volume_list)); +#if IS_ENABLED(CONFIG_NFS_V4_1) + WARN_ON_ONCE(!list_empty(&nn->nfs4_data_server_cache)); +#endif } #ifdef CONFIG_PROC_FS -static int nfs_server_list_open(struct inode *inode, struct file *file); static void *nfs_server_list_start(struct seq_file *p, loff_t *pos); static void *nfs_server_list_next(struct seq_file *p, void *v, loff_t *pos); static void nfs_server_list_stop(struct seq_file *p, void *v); @@ -1069,14 +1289,6 @@ static const struct seq_operations nfs_server_list_ops = { .show = nfs_server_list_show, }; -static const struct file_operations nfs_server_list_fops = { - .open = nfs_server_list_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release_net, -}; - -static int nfs_volume_list_open(struct inode *inode, struct file *file); static void *nfs_volume_list_start(struct seq_file *p, loff_t *pos); static void *nfs_volume_list_next(struct seq_file *p, void *v, loff_t *pos); static void nfs_volume_list_stop(struct seq_file *p, void *v); @@ -1089,23 +1301,6 @@ static const struct seq_operations nfs_volume_list_ops = { .show = nfs_volume_list_show, }; -static const struct file_operations nfs_volume_list_fops = { - .open = nfs_volume_list_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release_net, -}; - -/* - * open "/proc/fs/nfsfs/servers" which provides a summary of servers with which - * we're dealing - */ -static int nfs_server_list_open(struct inode *inode, struct file *file) -{ - return seq_open_net(inode, file, &nfs_server_list_ops, - sizeof(struct seq_net_private)); -} - /* * set up the iterator to start reading from the server list and return the first item */ @@ -1166,7 +1361,7 @@ static int nfs_server_list_show(struct seq_file *m, void *v) clp->rpc_ops->version, rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_ADDR), rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_PORT), - atomic_read(&clp->cl_count), + refcount_read(&clp->cl_count), clp->cl_hostname); rcu_read_unlock(); @@ -1174,15 +1369,6 @@ static int nfs_server_list_show(struct seq_file *m, void *v) } /* - * open "/proc/fs/nfsfs/volumes" which provides a summary of extant volumes - */ -static int nfs_volume_list_open(struct inode *inode, struct file *file) -{ - return seq_open_net(inode, file, &nfs_volume_list_ops, - sizeof(struct seq_net_private)); -} - -/* * set up the iterator to start reading from the volume list and return the first item */ static void *nfs_volume_list_start(struct seq_file *m, loff_t *_pos) @@ -1267,14 +1453,14 @@ int nfs_fs_proc_net_init(struct net *net) goto error_0; /* a file of servers with which we're dealing */ - p = proc_create("servers", S_IFREG|S_IRUGO, - nn->proc_nfsfs, &nfs_server_list_fops); + p = proc_create_net("servers", S_IFREG|S_IRUGO, nn->proc_nfsfs, + &nfs_server_list_ops, sizeof(struct seq_net_private)); if (!p) goto error_1; /* a file of volumes that we have mounted */ - p = proc_create("volumes", S_IFREG|S_IRUGO, - nn->proc_nfsfs, &nfs_volume_list_fops); + p = proc_create_net("volumes", S_IFREG|S_IRUGO, nn->proc_nfsfs, + &nfs_volume_list_ops, sizeof(struct seq_net_private)); if (!p) goto error_1; return 0; @@ -1319,6 +1505,7 @@ error_0: void nfs_fs_proc_exit(void) { remove_proc_subtree("fs/nfsfs", NULL); + ida_destroy(&s_sysfs_ids); } #endif /* CONFIG_PROC_FS */ diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index d7df5e67b0c1..9d3a5f29f17f 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * linux/fs/nfs/delegation.c * @@ -12,25 +13,67 @@ #include <linux/sched.h> #include <linux/slab.h> #include <linux/spinlock.h> +#include <linux/iversion.h> #include <linux/nfs4.h> #include <linux/nfs_fs.h> #include <linux/nfs_xdr.h> #include "nfs4_fs.h" +#include "nfs4session.h" #include "delegation.h" #include "internal.h" #include "nfs4trace.h" -static void nfs_free_delegation(struct nfs_delegation *delegation) +#define NFS_DEFAULT_DELEGATION_WATERMARK (5000U) + +static unsigned nfs_delegation_watermark = NFS_DEFAULT_DELEGATION_WATERMARK; +module_param_named(delegation_watermark, nfs_delegation_watermark, uint, 0644); + +static struct hlist_head *nfs_delegation_hash(struct nfs_server *server, + const struct nfs_fh *fhandle) { - if (delegation->cred) { - put_rpccred(delegation->cred); - delegation->cred = NULL; - } + return server->delegation_hash_table + + (nfs_fhandle_hash(fhandle) & server->delegation_hash_mask); +} + +static void __nfs_free_delegation(struct nfs_delegation *delegation) +{ + put_cred(delegation->cred); + delegation->cred = NULL; kfree_rcu(delegation, rcu); } +static void nfs_mark_delegation_revoked(struct nfs_server *server, + struct nfs_delegation *delegation) +{ + if (!test_and_set_bit(NFS_DELEGATION_REVOKED, &delegation->flags)) { + delegation->stateid.type = NFS4_INVALID_STATEID_TYPE; + atomic_long_dec(&server->nr_active_delegations); + if (!test_bit(NFS_DELEGATION_RETURNING, &delegation->flags)) + nfs_clear_verifier_delegated(delegation->inode); + } +} + +static struct nfs_delegation *nfs_get_delegation(struct nfs_delegation *delegation) +{ + refcount_inc(&delegation->refcount); + return delegation; +} + +static void nfs_put_delegation(struct nfs_delegation *delegation) +{ + if (refcount_dec_and_test(&delegation->refcount)) + __nfs_free_delegation(delegation); +} + +static void nfs_free_delegation(struct nfs_server *server, + struct nfs_delegation *delegation) +{ + nfs_mark_delegation_revoked(server, delegation); + nfs_put_delegation(delegation); +} + /** * nfs_mark_delegation_referenced - set delegation's REFERENCED flag * @delegation: delegation to process @@ -41,61 +84,82 @@ void nfs_mark_delegation_referenced(struct nfs_delegation *delegation) set_bit(NFS_DELEGATION_REFERENCED, &delegation->flags); } -static bool -nfs4_is_valid_delegation(const struct nfs_delegation *delegation, - fmode_t flags) +static void nfs_mark_return_delegation(struct nfs_server *server, + struct nfs_delegation *delegation) { - if (delegation != NULL && (delegation->type & flags) == flags && + set_bit(NFS_DELEGATION_RETURN, &delegation->flags); + set_bit(NFS4SERV_DELEGRETURN, &server->delegation_flags); + set_bit(NFS4CLNT_DELEGRETURN, &server->nfs_client->cl_state); +} + +static bool nfs4_is_valid_delegation(const struct nfs_delegation *delegation, + fmode_t type) +{ + if (delegation != NULL && (delegation->type & type) == type && !test_bit(NFS_DELEGATION_REVOKED, &delegation->flags) && !test_bit(NFS_DELEGATION_RETURNING, &delegation->flags)) return true; return false; } -static int -nfs4_do_check_delegation(struct inode *inode, fmode_t flags, bool mark) +struct nfs_delegation *nfs4_get_valid_delegation(const struct inode *inode) +{ + struct nfs_delegation *delegation; + + delegation = rcu_dereference(NFS_I(inode)->delegation); + if (nfs4_is_valid_delegation(delegation, 0)) + return delegation; + return NULL; +} + +static int nfs4_do_check_delegation(struct inode *inode, fmode_t type, + int flags, bool mark) { struct nfs_delegation *delegation; int ret = 0; - flags &= FMODE_READ|FMODE_WRITE; + type &= FMODE_READ|FMODE_WRITE; rcu_read_lock(); delegation = rcu_dereference(NFS_I(inode)->delegation); - if (nfs4_is_valid_delegation(delegation, flags)) { + if (nfs4_is_valid_delegation(delegation, type)) { if (mark) nfs_mark_delegation_referenced(delegation); ret = 1; + if ((flags & NFS_DELEGATION_FLAG_TIME) && + !test_bit(NFS_DELEGATION_DELEGTIME, &delegation->flags)) + ret = 0; } rcu_read_unlock(); return ret; } /** - * nfs_have_delegation - check if inode has a delegation, mark it + * nfs4_have_delegation - check if inode has a delegation, mark it * NFS_DELEGATION_REFERENCED if there is one. * @inode: inode to check - * @flags: delegation types to check for + * @type: delegation types to check for + * @flags: various modifiers * * Returns one if inode has the indicated delegation, otherwise zero. */ -int nfs4_have_delegation(struct inode *inode, fmode_t flags) +int nfs4_have_delegation(struct inode *inode, fmode_t type, int flags) { - return nfs4_do_check_delegation(inode, flags, true); + return nfs4_do_check_delegation(inode, type, flags, true); } /* * nfs4_check_delegation - check if inode has a delegation, do not mark * NFS_DELEGATION_REFERENCED if it has one. */ -int nfs4_check_delegation(struct inode *inode, fmode_t flags) +int nfs4_check_delegation(struct inode *inode, fmode_t type) { - return nfs4_do_check_delegation(inode, flags, false); + return nfs4_do_check_delegation(inode, type, 0, false); } -static int nfs_delegation_claim_locks(struct nfs_open_context *ctx, struct nfs4_state *state, const nfs4_stateid *stateid) +static int nfs_delegation_claim_locks(struct nfs4_state *state, const nfs4_stateid *stateid) { struct inode *inode = state->inode; struct file_lock *fl; - struct file_lock_context *flctx = inode->i_flctx; + struct file_lock_context *flctx = locks_inode_context(inode); struct list_head *list; int status = 0; @@ -105,8 +169,8 @@ static int nfs_delegation_claim_locks(struct nfs_open_context *ctx, struct nfs4_ list = &flctx->flc_posix; spin_lock(&flctx->flc_lock); restart: - list_for_each_entry(fl, list, fl_list) { - if (nfs_file_open_context(fl->fl_file) != ctx) + for_each_file_lock(fl, list) { + if (nfs_file_open_context(fl->c.flc_file)->state != state) continue; spin_unlock(&flctx->flc_lock); status = nfs4_lock_delegation_recall(fl, state, stateid); @@ -130,12 +194,11 @@ static int nfs_delegation_claim_opens(struct inode *inode, struct nfs_open_context *ctx; struct nfs4_state_owner *sp; struct nfs4_state *state; - unsigned int seq; int err; again: - spin_lock(&inode->i_lock); - list_for_each_entry(ctx, &nfsi->open_files, list) { + rcu_read_lock(); + list_for_each_entry_rcu(ctx, &nfsi->open_files, list) { state = ctx->state; if (state == NULL) continue; @@ -145,24 +208,22 @@ again: continue; if (!nfs4_stateid_match(&state->stateid, stateid)) continue; - get_nfs_open_context(ctx); - spin_unlock(&inode->i_lock); + if (!get_nfs_open_context(ctx)) + continue; + rcu_read_unlock(); sp = state->owner; /* Block nfs4_proc_unlck */ mutex_lock(&sp->so_delegreturn_mutex); - seq = raw_seqcount_begin(&sp->so_reclaim_seqcount); - err = nfs4_open_delegation_recall(ctx, state, stateid, type); + err = nfs4_open_delegation_recall(ctx, state, stateid); if (!err) - err = nfs_delegation_claim_locks(ctx, state, stateid); - if (!err && read_seqcount_retry(&sp->so_reclaim_seqcount, seq)) - err = -EAGAIN; + err = nfs_delegation_claim_locks(state, stateid); mutex_unlock(&sp->so_delegreturn_mutex); put_nfs_open_context(ctx); if (err != 0) return err; goto again; } - spin_unlock(&inode->i_lock); + rcu_read_unlock(); return 0; } @@ -170,50 +231,66 @@ again: * nfs_inode_reclaim_delegation - process a delegation reclaim request * @inode: inode to process * @cred: credential to use for request - * @res: new delegation state from server + * @type: delegation type + * @stateid: delegation stateid + * @pagemod_limit: write delegation "space_limit" + * @deleg_type: raw delegation type * */ -void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, - struct nfs_openres *res) +void nfs_inode_reclaim_delegation(struct inode *inode, const struct cred *cred, + fmode_t type, const nfs4_stateid *stateid, + unsigned long pagemod_limit, u32 deleg_type) { struct nfs_delegation *delegation; - struct rpc_cred *oldcred = NULL; + const struct cred *oldcred = NULL; rcu_read_lock(); delegation = rcu_dereference(NFS_I(inode)->delegation); - if (delegation != NULL) { - spin_lock(&delegation->lock); - if (delegation->inode != NULL) { - nfs4_stateid_copy(&delegation->stateid, &res->delegation); - delegation->type = res->delegation_type; - delegation->pagemod_limit = res->pagemod_limit; - oldcred = delegation->cred; - delegation->cred = get_rpccred(cred); - clear_bit(NFS_DELEGATION_NEED_RECLAIM, - &delegation->flags); - spin_unlock(&delegation->lock); - rcu_read_unlock(); - put_rpccred(oldcred); - trace_nfs4_reclaim_delegation(inode, res->delegation_type); - return; - } - /* We appear to have raced with a delegation return. */ - spin_unlock(&delegation->lock); + if (!delegation) { + rcu_read_unlock(); + nfs_inode_set_delegation(inode, cred, type, stateid, + pagemod_limit, deleg_type); + return; + } + + spin_lock(&delegation->lock); + nfs4_stateid_copy(&delegation->stateid, stateid); + delegation->type = type; + delegation->pagemod_limit = pagemod_limit; + oldcred = delegation->cred; + delegation->cred = get_cred(cred); + switch (deleg_type) { + case NFS4_OPEN_DELEGATE_READ_ATTRS_DELEG: + case NFS4_OPEN_DELEGATE_WRITE_ATTRS_DELEG: + set_bit(NFS_DELEGATION_DELEGTIME, &delegation->flags); + break; + default: + clear_bit(NFS_DELEGATION_DELEGTIME, &delegation->flags); } + clear_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags); + if (test_and_clear_bit(NFS_DELEGATION_REVOKED, &delegation->flags)) + atomic_long_inc(&NFS_SERVER(inode)->nr_active_delegations); + spin_unlock(&delegation->lock); rcu_read_unlock(); - nfs_inode_set_delegation(inode, cred, res); + put_cred(oldcred); + trace_nfs4_reclaim_delegation(inode, type); } -static int nfs_do_return_delegation(struct inode *inode, struct nfs_delegation *delegation, int issync) +static int nfs_do_return_delegation(struct inode *inode, + struct nfs_delegation *delegation, + int issync) { + const struct cred *cred; int res = 0; - if (!test_bit(NFS_DELEGATION_REVOKED, &delegation->flags)) - res = nfs4_proc_delegreturn(inode, - delegation->cred, - &delegation->stateid, - issync); - nfs_free_delegation(delegation); + if (!test_bit(NFS_DELEGATION_REVOKED, &delegation->flags)) { + spin_lock(&delegation->lock); + cred = get_cred(delegation->cred); + spin_unlock(&delegation->lock); + res = nfs4_proc_delegreturn(inode, cred, &delegation->stateid, + delegation, issync); + put_cred(cred); + } return res; } @@ -224,6 +301,8 @@ static struct inode *nfs_delegation_grab_inode(struct nfs_delegation *delegation spin_lock(&delegation->lock); if (delegation->inode != NULL) inode = igrab(delegation->inode); + if (!inode) + set_bit(NFS_DELEGATION_INODE_FREEING, &delegation->flags); spin_unlock(&delegation->lock); return inode; } @@ -237,9 +316,15 @@ nfs_start_delegation_return_locked(struct nfs_inode *nfsi) if (delegation == NULL) goto out; spin_lock(&delegation->lock); - if (!test_and_set_bit(NFS_DELEGATION_RETURNING, &delegation->flags)) - ret = delegation; + if (delegation->inode && + !test_and_set_bit(NFS_DELEGATION_RETURNING, &delegation->flags)) { + clear_bit(NFS_DELEGATION_RETURN_DELAYED, &delegation->flags); + /* Refcount matched in nfs_end_delegation_return() */ + ret = nfs_get_delegation(delegation); + } spin_unlock(&delegation->lock); + if (ret) + nfs_clear_verifier_delegated(&nfsi->vfs_inode); out: return ret; } @@ -255,16 +340,19 @@ nfs_start_delegation_return(struct nfs_inode *nfsi) return delegation; } -static void -nfs_abort_delegation_return(struct nfs_delegation *delegation, - struct nfs_client *clp) +static void nfs_abort_delegation_return(struct nfs_delegation *delegation, + struct nfs_server *server, int err) { - spin_lock(&delegation->lock); clear_bit(NFS_DELEGATION_RETURNING, &delegation->flags); - set_bit(NFS_DELEGATION_RETURN, &delegation->flags); + if (err == -EAGAIN) { + set_bit(NFS_DELEGATION_RETURN_DELAYED, &delegation->flags); + set_bit(NFS4SERV_DELEGRETURN_DELAYED, + &server->delegation_flags); + set_bit(NFS4CLNT_DELEGRETURN_DELAYED, + &server->nfs_client->cl_state); + } spin_unlock(&delegation->lock); - set_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state); } static struct nfs_delegation * @@ -276,11 +364,17 @@ nfs_detach_delegation_locked(struct nfs_inode *nfsi, rcu_dereference_protected(nfsi->delegation, lockdep_is_held(&clp->cl_lock)); + trace_nfs4_detach_delegation(&nfsi->vfs_inode, delegation->type); + if (deleg_cur == NULL || delegation != deleg_cur) return NULL; spin_lock(&delegation->lock); - set_bit(NFS_DELEGATION_RETURNING, &delegation->flags); + if (!delegation->inode) { + spin_unlock(&delegation->lock); + return NULL; + } + hlist_del_init_rcu(&delegation->hash); list_del_rcu(&delegation->super_list); delegation->inode = NULL; rcu_assign_pointer(nfsi->delegation, NULL); @@ -307,20 +401,43 @@ nfs_inode_detach_delegation(struct inode *inode) struct nfs_server *server = NFS_SERVER(inode); struct nfs_delegation *delegation; - delegation = nfs_start_delegation_return(nfsi); - if (delegation == NULL) - return NULL; - return nfs_detach_delegation(nfsi, delegation, server); + rcu_read_lock(); + delegation = rcu_dereference(nfsi->delegation); + if (delegation != NULL) + delegation = nfs_detach_delegation(nfsi, delegation, server); + rcu_read_unlock(); + return delegation; +} + +static void +nfs_update_delegation_cred(struct nfs_delegation *delegation, + const struct cred *cred) +{ + const struct cred *old; + + if (cred_fscmp(delegation->cred, cred) != 0) { + old = xchg(&delegation->cred, get_cred(cred)); + put_cred(old); + } } static void -nfs_update_inplace_delegation(struct nfs_delegation *delegation, +nfs_update_inplace_delegation(struct nfs_server *server, + struct nfs_delegation *delegation, const struct nfs_delegation *update) { if (nfs4_stateid_is_newer(&update->stateid, &delegation->stateid)) { delegation->stateid.seqid = update->stateid.seqid; smp_wmb(); delegation->type = update->type; + delegation->pagemod_limit = update->pagemod_limit; + if (test_bit(NFS_DELEGATION_REVOKED, &delegation->flags)) { + delegation->change_attr = update->change_attr; + nfs_update_delegation_cred(delegation, update->cred); + /* smp_mb__before_atomic() is implicit due to xchg() */ + clear_bit(NFS_DELEGATION_REVOKED, &delegation->flags); + atomic_long_inc(&server->nr_active_delegations); + } } } @@ -328,11 +445,16 @@ nfs_update_inplace_delegation(struct nfs_delegation *delegation, * nfs_inode_set_delegation - set up a delegation on an inode * @inode: inode to which delegation applies * @cred: cred to use for subsequent delegation processing - * @res: new delegation state from server + * @type: delegation type + * @stateid: delegation stateid + * @pagemod_limit: write delegation "space_limit" + * @deleg_type: raw delegation type * * Returns zero on success, or a negative errno value. */ -int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res) +int nfs_inode_set_delegation(struct inode *inode, const struct cred *cred, + fmode_t type, const nfs4_stateid *stateid, + unsigned long pagemod_limit, u32 deleg_type) { struct nfs_server *server = NFS_SERVER(inode); struct nfs_client *clp = server->nfs_client; @@ -341,29 +463,40 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct struct nfs_delegation *freeme = NULL; int status = 0; - delegation = kmalloc(sizeof(*delegation), GFP_NOFS); + delegation = kmalloc(sizeof(*delegation), GFP_KERNEL_ACCOUNT); if (delegation == NULL) return -ENOMEM; - nfs4_stateid_copy(&delegation->stateid, &res->delegation); - delegation->type = res->delegation_type; - delegation->pagemod_limit = res->pagemod_limit; - delegation->change_attr = inode->i_version; - delegation->cred = get_rpccred(cred); + nfs4_stateid_copy(&delegation->stateid, stateid); + refcount_set(&delegation->refcount, 1); + delegation->type = type; + delegation->pagemod_limit = pagemod_limit; + delegation->change_attr = inode_peek_iversion_raw(inode); + delegation->cred = get_cred(cred); delegation->inode = inode; delegation->flags = 1<<NFS_DELEGATION_REFERENCED; + switch (deleg_type) { + case NFS4_OPEN_DELEGATE_READ_ATTRS_DELEG: + case NFS4_OPEN_DELEGATE_WRITE_ATTRS_DELEG: + delegation->flags |= BIT(NFS_DELEGATION_DELEGTIME); + } + delegation->test_gen = 0; spin_lock_init(&delegation->lock); spin_lock(&clp->cl_lock); old_delegation = rcu_dereference_protected(nfsi->delegation, lockdep_is_held(&clp->cl_lock)); - if (old_delegation != NULL) { - /* Is this an update of the existing delegation? */ - if (nfs4_stateid_match_other(&old_delegation->stateid, - &delegation->stateid)) { - nfs_update_inplace_delegation(old_delegation, - delegation); - goto out; - } + if (old_delegation == NULL) + goto add_new; + /* Is this an update of the existing delegation? */ + if (nfs4_stateid_match_other(&old_delegation->stateid, + &delegation->stateid)) { + spin_lock(&old_delegation->lock); + nfs_update_inplace_delegation(server, old_delegation, + delegation); + spin_unlock(&old_delegation->lock); + goto out; + } + if (!test_bit(NFS_DELEGATION_REVOKED, &old_delegation->flags)) { /* * Deal with broken servers that hand out two * delegations for the same file. @@ -382,23 +515,49 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct if (test_and_set_bit(NFS_DELEGATION_RETURNING, &old_delegation->flags)) goto out; - freeme = nfs_detach_delegation_locked(nfsi, - old_delegation, clp); - if (freeme == NULL) - goto out; } + freeme = nfs_detach_delegation_locked(nfsi, old_delegation, clp); + if (freeme == NULL) + goto out; +add_new: + /* + * If we didn't revalidate the change attribute before setting + * the delegation, then pre-emptively ask for a full attribute + * cache revalidation. + */ + spin_lock(&inode->i_lock); + if (NFS_I(inode)->cache_validity & NFS_INO_INVALID_CHANGE) + nfs_set_cache_invalid(inode, + NFS_INO_INVALID_ATIME | NFS_INO_INVALID_CTIME | + NFS_INO_INVALID_MTIME | NFS_INO_INVALID_SIZE | + NFS_INO_INVALID_BLOCKS | NFS_INO_INVALID_NLINK | + NFS_INO_INVALID_OTHER | NFS_INO_INVALID_DATA | + NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL | + NFS_INO_INVALID_XATTR); + spin_unlock(&inode->i_lock); + list_add_tail_rcu(&delegation->super_list, &server->delegations); + hlist_add_head_rcu(&delegation->hash, + nfs_delegation_hash(server, &NFS_I(inode)->fh)); rcu_assign_pointer(nfsi->delegation, delegation); delegation = NULL; - trace_nfs4_set_delegation(inode, res->delegation_type); + atomic_long_inc(&server->nr_active_delegations); + + trace_nfs4_set_delegation(inode, type); + /* If we hold writebacks and have delegated mtime then update */ + if (deleg_type == NFS4_OPEN_DELEGATE_WRITE_ATTRS_DELEG && + nfs_have_writebacks(inode)) + nfs_update_delegated_mtime(inode); out: spin_unlock(&clp->cl_lock); if (delegation != NULL) - nfs_free_delegation(delegation); - if (freeme != NULL) + __nfs_free_delegation(delegation); + if (freeme != NULL) { nfs_do_return_delegation(inode, freeme, 0); + nfs_free_delegation(server, freeme); + } return status; } @@ -407,13 +566,19 @@ out: */ static int nfs_end_delegation_return(struct inode *inode, struct nfs_delegation *delegation, int issync) { - struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; - struct nfs_inode *nfsi = NFS_I(inode); + struct nfs_server *server = NFS_SERVER(inode); + unsigned int mode = O_WRONLY | O_RDWR; int err = 0; if (delegation == NULL) return 0; - do { + + if (!issync) + mode |= O_NONBLOCK; + /* Recall of any remaining application leases */ + err = break_lease(inode, mode); + + while (err == 0) { if (test_bit(NFS_DELEGATION_REVOKED, &delegation->flags)) break; err = nfs_delegation_claim_opens(inode, &delegation->stateid, @@ -423,18 +588,18 @@ static int nfs_end_delegation_return(struct inode *inode, struct nfs_delegation /* * Guard against state recovery */ - err = nfs4_wait_clnt_recover(clp); - } while (err == 0); + err = nfs4_wait_clnt_recover(server->nfs_client); + } if (err) { - nfs_abort_delegation_return(delegation, clp); + nfs_abort_delegation_return(delegation, server, err); goto out; } - if (!nfs_detach_delegation(nfsi, delegation, NFS_SERVER(inode))) - goto out; err = nfs_do_return_delegation(inode, delegation, issync); out: + /* Refcount matched in nfs_start_delegation_return_locked() */ + nfs_put_delegation(delegation); return err; } @@ -442,19 +607,124 @@ static bool nfs_delegation_need_return(struct nfs_delegation *delegation) { bool ret = false; - if (test_bit(NFS_DELEGATION_RETURNING, &delegation->flags)) - goto out; + trace_nfs_delegation_need_return(delegation); + if (test_and_clear_bit(NFS_DELEGATION_RETURN, &delegation->flags)) ret = true; - if (test_and_clear_bit(NFS_DELEGATION_RETURN_IF_CLOSED, &delegation->flags) && !ret) { - struct inode *inode; + if (test_bit(NFS_DELEGATION_RETURNING, &delegation->flags) || + test_bit(NFS_DELEGATION_RETURN_DELAYED, &delegation->flags) || + test_bit(NFS_DELEGATION_REVOKED, &delegation->flags)) + ret = false; - spin_lock(&delegation->lock); - inode = delegation->inode; - if (inode && list_empty(&NFS_I(inode)->open_files)) + return ret; +} + +static int nfs_server_return_marked_delegations(struct nfs_server *server, + void __always_unused *data) +{ + struct nfs_delegation *delegation; + struct nfs_delegation *prev; + struct inode *inode; + struct inode *place_holder = NULL; + struct nfs_delegation *place_holder_deleg = NULL; + int err = 0; + + if (!test_and_clear_bit(NFS4SERV_DELEGRETURN, + &server->delegation_flags)) + return 0; +restart: + /* + * To avoid quadratic looping we hold a reference + * to an inode place_holder. Each time we restart, we + * list delegation in the server from the delegations + * of that inode. + * prev is an RCU-protected pointer to a delegation which + * wasn't marked for return and might be a good choice for + * the next place_holder. + */ + prev = NULL; + delegation = NULL; + rcu_read_lock(); + if (place_holder) + delegation = rcu_dereference(NFS_I(place_holder)->delegation); + if (!delegation || delegation != place_holder_deleg) + delegation = list_entry_rcu(server->delegations.next, + struct nfs_delegation, super_list); + list_for_each_entry_from_rcu(delegation, &server->delegations, super_list) { + struct inode *to_put = NULL; + + if (test_bit(NFS_DELEGATION_INODE_FREEING, &delegation->flags)) + continue; + if (!nfs_delegation_need_return(delegation)) { + if (nfs4_is_valid_delegation(delegation, 0)) + prev = delegation; + continue; + } + inode = nfs_delegation_grab_inode(delegation); + if (inode == NULL) + continue; + + if (prev) { + struct inode *tmp = nfs_delegation_grab_inode(prev); + if (tmp) { + to_put = place_holder; + place_holder = tmp; + place_holder_deleg = prev; + } + } + + delegation = nfs_start_delegation_return_locked(NFS_I(inode)); + rcu_read_unlock(); + + iput(to_put); + + err = nfs_end_delegation_return(inode, delegation, 0); + iput(inode); + cond_resched(); + if (!err) + goto restart; + set_bit(NFS4SERV_DELEGRETURN, &server->delegation_flags); + set_bit(NFS4CLNT_DELEGRETURN, &server->nfs_client->cl_state); + goto out; + } + rcu_read_unlock(); +out: + iput(place_holder); + return err; +} + +static bool nfs_server_clear_delayed_delegations(struct nfs_server *server) +{ + struct nfs_delegation *d; + bool ret = false; + + if (!test_and_clear_bit(NFS4SERV_DELEGRETURN_DELAYED, + &server->delegation_flags)) + goto out; + list_for_each_entry_rcu (d, &server->delegations, super_list) { + if (!test_bit(NFS_DELEGATION_RETURN_DELAYED, &d->flags)) + continue; + nfs_mark_return_delegation(server, d); + clear_bit(NFS_DELEGATION_RETURN_DELAYED, &d->flags); + ret = true; + } +out: + return ret; +} + +static bool nfs_client_clear_delayed_delegations(struct nfs_client *clp) +{ + struct nfs_server *server; + bool ret = false; + + if (!test_and_clear_bit(NFS4CLNT_DELEGRETURN_DELAYED, &clp->cl_state)) + goto out; + rcu_read_lock(); + list_for_each_entry_rcu (server, &clp->cl_superblocks, client_link) { + if (nfs_server_clear_delayed_delegations(server)) ret = true; - spin_unlock(&delegation->lock); } + rcu_read_unlock(); out: return ret; } @@ -471,60 +741,39 @@ out: */ int nfs_client_return_marked_delegations(struct nfs_client *clp) { - struct nfs_delegation *delegation; - struct nfs_server *server; - struct inode *inode; - int err = 0; - -restart: - rcu_read_lock(); - list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { - list_for_each_entry_rcu(delegation, &server->delegations, - super_list) { - if (!nfs_delegation_need_return(delegation)) - continue; - if (!nfs_sb_active(server->super)) - continue; - inode = nfs_delegation_grab_inode(delegation); - if (inode == NULL) { - rcu_read_unlock(); - nfs_sb_deactive(server->super); - goto restart; - } - delegation = nfs_start_delegation_return_locked(NFS_I(inode)); - rcu_read_unlock(); - - err = nfs_end_delegation_return(inode, delegation, 0); - iput(inode); - nfs_sb_deactive(server->super); - if (!err) - goto restart; - set_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state); - return err; - } - } - rcu_read_unlock(); + int err = nfs_client_for_each_server( + clp, nfs_server_return_marked_delegations, NULL); + if (err) + return err; + /* If a return was delayed, sleep to prevent hard looping */ + if (nfs_client_clear_delayed_delegations(clp)) + ssleep(1); return 0; } /** - * nfs_inode_return_delegation_noreclaim - return delegation, don't reclaim opens + * nfs_inode_evict_delegation - return delegation, don't reclaim opens * @inode: inode to process * * Does not protect against delegation reclaims, therefore really only safe - * to be called from nfs4_clear_inode(). + * to be called from nfs4_clear_inode(). Guaranteed to always free + * the delegation structure. */ -void nfs_inode_return_delegation_noreclaim(struct inode *inode) +void nfs_inode_evict_delegation(struct inode *inode) { struct nfs_delegation *delegation; delegation = nfs_inode_detach_delegation(inode); - if (delegation != NULL) + if (delegation != NULL) { + set_bit(NFS_DELEGATION_RETURNING, &delegation->flags); + set_bit(NFS_DELEGATION_INODE_FREEING, &delegation->flags); nfs_do_return_delegation(inode, delegation, 1); + nfs_free_delegation(NFS_SERVER(inode), delegation); + } } /** - * nfs_inode_return_delegation - synchronously return a delegation + * nfs4_inode_return_delegation - synchronously return a delegation * @inode: inode to process * * This routine will always flush any dirty data to disk on the @@ -537,27 +786,136 @@ int nfs4_inode_return_delegation(struct inode *inode) { struct nfs_inode *nfsi = NFS_I(inode); struct nfs_delegation *delegation; - int err = 0; - nfs_wb_all(inode); delegation = nfs_start_delegation_return(nfsi); - if (delegation != NULL) - err = nfs_end_delegation_return(inode, delegation, 1); - return err; + if (delegation != NULL) { + /* Synchronous recall of any application leases */ + break_lease(inode, O_WRONLY | O_RDWR); + if (S_ISREG(inode->i_mode)) + nfs_wb_all(inode); + return nfs_end_delegation_return(inode, delegation, 1); + } + return 0; } -static void nfs_mark_return_if_closed_delegation(struct nfs_server *server, - struct nfs_delegation *delegation) +/** + * nfs4_inode_set_return_delegation_on_close - asynchronously return a delegation + * @inode: inode to process + * + * This routine is called to request that the delegation be returned as soon + * as the file is closed. If the file is already closed, the delegation is + * immediately returned. + */ +void nfs4_inode_set_return_delegation_on_close(struct inode *inode) { - set_bit(NFS_DELEGATION_RETURN_IF_CLOSED, &delegation->flags); - set_bit(NFS4CLNT_DELEGRETURN, &server->nfs_client->cl_state); + struct nfs_delegation *delegation; + struct nfs_delegation *ret = NULL; + + if (!inode) + return; + rcu_read_lock(); + delegation = nfs4_get_valid_delegation(inode); + if (!delegation) + goto out; + spin_lock(&delegation->lock); + if (!delegation->inode) + goto out_unlock; + if (list_empty(&NFS_I(inode)->open_files) && + !test_and_set_bit(NFS_DELEGATION_RETURNING, &delegation->flags)) { + /* Refcount matched in nfs_end_delegation_return() */ + ret = nfs_get_delegation(delegation); + } else + set_bit(NFS_DELEGATION_RETURN_IF_CLOSED, &delegation->flags); +out_unlock: + spin_unlock(&delegation->lock); + if (ret) + nfs_clear_verifier_delegated(inode); +out: + rcu_read_unlock(); + nfs_end_delegation_return(inode, ret, 0); } -static void nfs_mark_return_delegation(struct nfs_server *server, - struct nfs_delegation *delegation) +/** + * nfs4_inode_return_delegation_on_close - asynchronously return a delegation + * @inode: inode to process + * + * This routine is called on file close in order to determine if the + * inode delegation needs to be returned immediately. + */ +void nfs4_inode_return_delegation_on_close(struct inode *inode) { - set_bit(NFS_DELEGATION_RETURN, &delegation->flags); - set_bit(NFS4CLNT_DELEGRETURN, &server->nfs_client->cl_state); + struct nfs_delegation *delegation; + struct nfs_delegation *ret = NULL; + + if (!inode) + return; + rcu_read_lock(); + delegation = nfs4_get_valid_delegation(inode); + if (!delegation) + goto out; + if (test_bit(NFS_DELEGATION_RETURN_IF_CLOSED, &delegation->flags) || + atomic_long_read(&NFS_SERVER(inode)->nr_active_delegations) >= + nfs_delegation_watermark) { + spin_lock(&delegation->lock); + if (delegation->inode && + list_empty(&NFS_I(inode)->open_files) && + !test_and_set_bit(NFS_DELEGATION_RETURNING, &delegation->flags)) { + clear_bit(NFS_DELEGATION_RETURN_IF_CLOSED, &delegation->flags); + /* Refcount matched in nfs_end_delegation_return() */ + ret = nfs_get_delegation(delegation); + } + spin_unlock(&delegation->lock); + if (ret) + nfs_clear_verifier_delegated(inode); + } +out: + rcu_read_unlock(); + nfs_end_delegation_return(inode, ret, 0); +} + +/** + * nfs4_inode_make_writeable + * @inode: pointer to inode + * + * Make the inode writeable by returning the delegation if necessary + * + * Returns zero on success, or a negative errno value. + */ +int nfs4_inode_make_writeable(struct inode *inode) +{ + struct nfs_delegation *delegation; + + rcu_read_lock(); + delegation = nfs4_get_valid_delegation(inode); + if (delegation == NULL || + (nfs4_has_session(NFS_SERVER(inode)->nfs_client) && + (delegation->type & FMODE_WRITE))) { + rcu_read_unlock(); + return 0; + } + rcu_read_unlock(); + return nfs4_inode_return_delegation(inode); +} + +static void +nfs_mark_return_if_closed_delegation(struct nfs_server *server, + struct nfs_delegation *delegation) +{ + struct inode *inode; + + if (test_bit(NFS_DELEGATION_RETURN, &delegation->flags) || + test_bit(NFS_DELEGATION_RETURN_IF_CLOSED, &delegation->flags)) + return; + spin_lock(&delegation->lock); + inode = delegation->inode; + if (!inode) + goto out; + if (list_empty(&NFS_I(inode)->open_files)) + nfs_mark_return_delegation(server, delegation); + else + set_bit(NFS_DELEGATION_RETURN_IF_CLOSED, &delegation->flags); +out: + spin_unlock(&delegation->lock); } static bool nfs_server_mark_return_all_delegations(struct nfs_server *server) @@ -600,8 +958,8 @@ void nfs_expire_all_delegations(struct nfs_client *clp) } /** - * nfs_super_return_all_delegations - return delegations for one superblock - * @sb: sb to process + * nfs_server_return_all_delegations - return delegations for one superblock + * @server: pointer to nfs_server to process * */ void nfs_server_return_all_delegations(struct nfs_server *server) @@ -646,15 +1004,7 @@ static void nfs_client_mark_return_unused_delegation_types(struct nfs_client *cl rcu_read_unlock(); } -static void nfs_mark_delegation_revoked(struct nfs_server *server, - struct nfs_delegation *delegation) -{ - set_bit(NFS_DELEGATION_REVOKED, &delegation->flags); - delegation->stateid.type = NFS4_INVALID_STATEID_TYPE; - nfs_mark_return_delegation(server, delegation); -} - -static bool nfs_revoke_delegation(struct inode *inode, +static void nfs_revoke_delegation(struct inode *inode, const nfs4_stateid *stateid) { struct nfs_delegation *delegation; @@ -668,27 +1018,83 @@ static bool nfs_revoke_delegation(struct inode *inode, if (stateid == NULL) { nfs4_stateid_copy(&tmp, &delegation->stateid); stateid = &tmp; - } else if (!nfs4_stateid_match(stateid, &delegation->stateid)) - goto out; + } else { + if (!nfs4_stateid_match_other(stateid, &delegation->stateid)) + goto out; + spin_lock(&delegation->lock); + if (stateid->seqid) { + if (nfs4_stateid_is_newer(&delegation->stateid, stateid)) { + spin_unlock(&delegation->lock); + goto out; + } + delegation->stateid.seqid = stateid->seqid; + } + spin_unlock(&delegation->lock); + } nfs_mark_delegation_revoked(NFS_SERVER(inode), delegation); ret = true; out: rcu_read_unlock(); if (ret) nfs_inode_find_state_and_recover(inode, stateid); - return ret; } -void nfs_remove_bad_delegation(struct inode *inode, +void nfs_delegation_mark_returned(struct inode *inode, const nfs4_stateid *stateid) { struct nfs_delegation *delegation; - if (!nfs_revoke_delegation(inode, stateid)) + if (!inode) return; - delegation = nfs_inode_detach_delegation(inode); - if (delegation) - nfs_free_delegation(delegation); + + rcu_read_lock(); + delegation = rcu_dereference(NFS_I(inode)->delegation); + if (!delegation) + goto out_rcu_unlock; + + spin_lock(&delegation->lock); + if (!nfs4_stateid_match_other(stateid, &delegation->stateid)) + goto out_spin_unlock; + if (stateid->seqid) { + /* If delegation->stateid is newer, dont mark as returned */ + if (nfs4_stateid_is_newer(&delegation->stateid, stateid)) + goto out_clear_returning; + if (delegation->stateid.seqid != stateid->seqid) + delegation->stateid.seqid = stateid->seqid; + } + + nfs_mark_delegation_revoked(NFS_SERVER(inode), delegation); + clear_bit(NFS_DELEGATION_RETURNING, &delegation->flags); + spin_unlock(&delegation->lock); + if (nfs_detach_delegation(NFS_I(inode), delegation, NFS_SERVER(inode))) + nfs_put_delegation(delegation); + goto out_rcu_unlock; + +out_clear_returning: + clear_bit(NFS_DELEGATION_RETURNING, &delegation->flags); +out_spin_unlock: + spin_unlock(&delegation->lock); +out_rcu_unlock: + rcu_read_unlock(); + + nfs_inode_find_state_and_recover(inode, stateid); +} + +/** + * nfs_remove_bad_delegation - handle delegations that are unusable + * @inode: inode to process + * @stateid: the delegation's stateid + * + * If the server ACK-ed our FREE_STATEID then clean + * up the delegation, else mark and keep the revoked state. + */ +void nfs_remove_bad_delegation(struct inode *inode, + const nfs4_stateid *stateid) +{ + if (stateid && stateid->type == NFS4_FREED_STATEID_TYPE) + nfs_delegation_mark_returned(inode, stateid); + else + nfs_revoke_delegation(inode, stateid); } EXPORT_SYMBOL_GPL(nfs_remove_bad_delegation); @@ -747,7 +1153,7 @@ int nfs_async_inode_return_delegation(struct inode *inode, struct nfs_delegation *delegation; rcu_read_lock(); - delegation = rcu_dereference(NFS_I(inode)->delegation); + delegation = nfs4_get_valid_delegation(inode); if (delegation == NULL) goto out_enoent; if (stateid != NULL && @@ -756,6 +1162,9 @@ int nfs_async_inode_return_delegation(struct inode *inode, nfs_mark_return_delegation(server, delegation); rcu_read_unlock(); + /* If there are any application leases or delegations, recall them */ + break_lease(inode, O_WRONLY | O_RDWR | O_NONBLOCK); + nfs_delegation_run_state_manager(clp); return 0; out_enoent: @@ -767,20 +1176,33 @@ static struct inode * nfs_delegation_find_inode_server(struct nfs_server *server, const struct nfs_fh *fhandle) { + struct hlist_head *head = nfs_delegation_hash(server, fhandle); struct nfs_delegation *delegation; + struct super_block *freeme = NULL; struct inode *res = NULL; - list_for_each_entry_rcu(delegation, &server->delegations, super_list) { + hlist_for_each_entry_rcu(delegation, head, hash) { spin_lock(&delegation->lock); if (delegation->inode != NULL && + !test_bit(NFS_DELEGATION_REVOKED, &delegation->flags) && nfs_compare_fh(fhandle, &NFS_I(delegation->inode)->fh) == 0) { - res = igrab(delegation->inode); + if (nfs_sb_active(server->super)) { + freeme = server->super; + res = igrab(delegation->inode); + } + spin_unlock(&delegation->lock); + if (res != NULL) + return res; + if (freeme) { + rcu_read_unlock(); + nfs_sb_deactive(freeme); + rcu_read_lock(); + } + return ERR_PTR(-EAGAIN); } spin_unlock(&delegation->lock); - if (res != NULL) - break; } - return res; + return ERR_PTR(-ENOENT); } /** @@ -795,16 +1217,18 @@ struct inode *nfs_delegation_find_inode(struct nfs_client *clp, const struct nfs_fh *fhandle) { struct nfs_server *server; - struct inode *res = NULL; + struct inode *res; rcu_read_lock(); list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { res = nfs_delegation_find_inode_server(server, fhandle); - if (res != NULL) - break; + if (res != ERR_PTR(-ENOENT)) { + rcu_read_unlock(); + return res; + } } rcu_read_unlock(); - return res; + return ERR_PTR(-ENOENT); } static void nfs_delegation_mark_reclaim_server(struct nfs_server *server) @@ -837,50 +1261,50 @@ void nfs_delegation_mark_reclaim(struct nfs_client *clp) rcu_read_unlock(); } -/** - * nfs_delegation_reap_unclaimed - reap unclaimed delegations after reboot recovery is done - * @clp: nfs_client to process - * - */ -void nfs_delegation_reap_unclaimed(struct nfs_client *clp) +static int nfs_server_reap_unclaimed_delegations(struct nfs_server *server, + void __always_unused *data) { struct nfs_delegation *delegation; - struct nfs_server *server; struct inode *inode; - restart: rcu_read_lock(); - list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { - list_for_each_entry_rcu(delegation, &server->delegations, - super_list) { - if (test_bit(NFS_DELEGATION_RETURNING, - &delegation->flags)) - continue; - if (test_bit(NFS_DELEGATION_NEED_RECLAIM, - &delegation->flags) == 0) - continue; - if (!nfs_sb_active(server->super)) - continue; - inode = nfs_delegation_grab_inode(delegation); - if (inode == NULL) { - rcu_read_unlock(); - nfs_sb_deactive(server->super); - goto restart; - } - delegation = nfs_start_delegation_return_locked(NFS_I(inode)); - rcu_read_unlock(); - if (delegation != NULL) { - delegation = nfs_detach_delegation(NFS_I(inode), - delegation, server); - if (delegation != NULL) - nfs_free_delegation(delegation); - } - iput(inode); - nfs_sb_deactive(server->super); - goto restart; + list_for_each_entry_rcu(delegation, &server->delegations, super_list) { + if (test_bit(NFS_DELEGATION_INODE_FREEING, + &delegation->flags) || + test_bit(NFS_DELEGATION_RETURNING, + &delegation->flags) || + test_bit(NFS_DELEGATION_NEED_RECLAIM, + &delegation->flags) == 0) + continue; + inode = nfs_delegation_grab_inode(delegation); + if (inode == NULL) + continue; + delegation = nfs_start_delegation_return_locked(NFS_I(inode)); + rcu_read_unlock(); + if (delegation != NULL) { + if (nfs_detach_delegation(NFS_I(inode), delegation, + server) != NULL) + nfs_free_delegation(server, delegation); + /* Match nfs_start_delegation_return_locked */ + nfs_put_delegation(delegation); } + iput(inode); + cond_resched(); + goto restart; } rcu_read_unlock(); + return 0; +} + +/** + * nfs_delegation_reap_unclaimed - reap unclaimed delegations after reboot recovery is done + * @clp: nfs_client to process + * + */ +void nfs_delegation_reap_unclaimed(struct nfs_client *clp) +{ + nfs_client_for_each_server(clp, nfs_server_reap_unclaimed_delegations, + NULL); } static inline bool nfs4_server_rebooted(const struct nfs_client *clp) @@ -897,6 +1321,7 @@ static void nfs_mark_test_expired_delegation(struct nfs_server *server, return; clear_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags); set_bit(NFS_DELEGATION_TEST_EXPIRED, &delegation->flags); + set_bit(NFS4SERV_DELEGATION_EXPIRED, &server->delegation_flags); set_bit(NFS4CLNT_DELEGATION_EXPIRED, &server->nfs_client->cl_state); } @@ -939,64 +1364,97 @@ void nfs_mark_test_expired_all_delegations(struct nfs_client *clp) } /** - * nfs_reap_expired_delegations - reap expired delegations + * nfs_test_expired_all_delegations - test all delegations for a client * @clp: nfs_client to process * - * Iterates through all the delegations associated with this server and - * checks if they have may have been revoked. This function is usually - * expected to be called in cases where the server may have lost its - * lease. + * Helper for handling "recallable state revoked" status from server. */ -void nfs_reap_expired_delegations(struct nfs_client *clp) +void nfs_test_expired_all_delegations(struct nfs_client *clp) +{ + nfs_mark_test_expired_all_delegations(clp); + nfs4_schedule_state_manager(clp); +} + +static void +nfs_delegation_test_free_expired(struct inode *inode, + nfs4_stateid *stateid, + const struct cred *cred) +{ + struct nfs_server *server = NFS_SERVER(inode); + const struct nfs4_minor_version_ops *ops = server->nfs_client->cl_mvops; + int status; + + if (!cred) + return; + status = ops->test_and_free_expired(server, stateid, cred); + if (status == -NFS4ERR_EXPIRED || status == -NFS4ERR_BAD_STATEID) + nfs_remove_bad_delegation(inode, stateid); +} + +static int nfs_server_reap_expired_delegations(struct nfs_server *server, + void __always_unused *data) { - const struct nfs4_minor_version_ops *ops = clp->cl_mvops; struct nfs_delegation *delegation; - struct nfs_server *server; struct inode *inode; - struct rpc_cred *cred; + const struct cred *cred; nfs4_stateid stateid; + unsigned long gen = ++server->delegation_gen; + if (!test_and_clear_bit(NFS4SERV_DELEGATION_EXPIRED, + &server->delegation_flags)) + return 0; restart: rcu_read_lock(); - list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { - list_for_each_entry_rcu(delegation, &server->delegations, - super_list) { - if (test_bit(NFS_DELEGATION_RETURNING, - &delegation->flags)) - continue; - if (test_bit(NFS_DELEGATION_TEST_EXPIRED, - &delegation->flags) == 0) - continue; - if (!nfs_sb_active(server->super)) - continue; - inode = nfs_delegation_grab_inode(delegation); - if (inode == NULL) { - rcu_read_unlock(); - nfs_sb_deactive(server->super); - goto restart; - } - cred = get_rpccred_rcu(delegation->cred); - nfs4_stateid_copy(&stateid, &delegation->stateid); - clear_bit(NFS_DELEGATION_TEST_EXPIRED, &delegation->flags); - rcu_read_unlock(); - if (cred != NULL && - ops->test_and_free_expired(server, &stateid, cred) < 0) { - nfs_revoke_delegation(inode, &stateid); - nfs_inode_find_state_and_recover(inode, &stateid); - } - put_rpccred(cred); - if (nfs4_server_rebooted(clp)) { - nfs_inode_mark_test_expired_delegation(server,inode); - iput(inode); - nfs_sb_deactive(server->super); - return; - } + list_for_each_entry_rcu(delegation, &server->delegations, super_list) { + if (test_bit(NFS_DELEGATION_INODE_FREEING, + &delegation->flags) || + test_bit(NFS_DELEGATION_RETURNING, + &delegation->flags) || + test_bit(NFS_DELEGATION_TEST_EXPIRED, + &delegation->flags) == 0 || + delegation->test_gen == gen) + continue; + inode = nfs_delegation_grab_inode(delegation); + if (inode == NULL) + continue; + spin_lock(&delegation->lock); + cred = get_cred_rcu(delegation->cred); + nfs4_stateid_copy(&stateid, &delegation->stateid); + spin_unlock(&delegation->lock); + delegation->test_gen = gen; + clear_bit(NFS_DELEGATION_TEST_EXPIRED, &delegation->flags); + rcu_read_unlock(); + nfs_delegation_test_free_expired(inode, &stateid, cred); + put_cred(cred); + if (!nfs4_server_rebooted(server->nfs_client)) { iput(inode); - nfs_sb_deactive(server->super); + cond_resched(); goto restart; } + nfs_inode_mark_test_expired_delegation(server,inode); + set_bit(NFS4SERV_DELEGATION_EXPIRED, &server->delegation_flags); + set_bit(NFS4CLNT_DELEGATION_EXPIRED, + &server->nfs_client->cl_state); + iput(inode); + return -EAGAIN; } rcu_read_unlock(); + return 0; +} + +/** + * nfs_reap_expired_delegations - reap expired delegations + * @clp: nfs_client to process + * + * Iterates through all the delegations associated with this server and + * checks if they have may have been revoked. This function is usually + * expected to be called in cases where the server may have lost its + * lease. + */ +void nfs_reap_expired_delegations(struct nfs_client *clp) +{ + nfs_client_for_each_server(clp, nfs_server_reap_expired_delegations, + NULL); } void nfs_inode_find_delegation_state_and_recover(struct inode *inode, @@ -1009,7 +1467,8 @@ void nfs_inode_find_delegation_state_and_recover(struct inode *inode, rcu_read_lock(); delegation = rcu_dereference(NFS_I(inode)->delegation); if (delegation && - nfs4_stateid_match_other(&delegation->stateid, stateid)) { + nfs4_stateid_match_or_older(&delegation->stateid, stateid) && + !test_bit(NFS_DELEGATION_REVOKED, &delegation->flags)) { nfs_mark_test_expired_delegation(NFS_SERVER(inode), delegation); found = true; } @@ -1041,6 +1500,35 @@ int nfs_delegations_present(struct nfs_client *clp) } /** + * nfs4_refresh_delegation_stateid - Update delegation stateid seqid + * @dst: stateid to refresh + * @inode: inode to check + * + * Returns "true" and updates "dst->seqid" * if inode had a delegation + * that matches our delegation stateid. Otherwise "false" is returned. + */ +bool nfs4_refresh_delegation_stateid(nfs4_stateid *dst, struct inode *inode) +{ + struct nfs_delegation *delegation; + bool ret = false; + if (!inode) + goto out; + + rcu_read_lock(); + delegation = rcu_dereference(NFS_I(inode)->delegation); + if (delegation != NULL && + nfs4_stateid_match_other(dst, &delegation->stateid) && + nfs4_stateid_is_newer(&delegation->stateid, dst) && + !test_bit(NFS_DELEGATION_REVOKED, &delegation->flags)) { + dst->seqid = delegation->stateid.seqid; + ret = true; + } + rcu_read_unlock(); +out: + return ret; +} + +/** * nfs4_copy_delegation_stateid - Copy inode's state ID information * @inode: inode to check * @flags: delegation type requirement @@ -1051,22 +1539,27 @@ int nfs_delegations_present(struct nfs_client *clp) * otherwise "false" is returned. */ bool nfs4_copy_delegation_stateid(struct inode *inode, fmode_t flags, - nfs4_stateid *dst, struct rpc_cred **cred) + nfs4_stateid *dst, const struct cred **cred) { struct nfs_inode *nfsi = NFS_I(inode); struct nfs_delegation *delegation; - bool ret; + bool ret = false; flags &= FMODE_READ|FMODE_WRITE; rcu_read_lock(); delegation = rcu_dereference(nfsi->delegation); + if (!delegation) + goto out; + spin_lock(&delegation->lock); ret = nfs4_is_valid_delegation(delegation, flags); if (ret) { nfs4_stateid_copy(dst, &delegation->stateid); nfs_mark_delegation_referenced(delegation); if (cred) - *cred = get_rpccred(delegation->cred); + *cred = get_cred(delegation->cred); } + spin_unlock(&delegation->lock); +out: rcu_read_unlock(); return ret; } @@ -1089,9 +1582,24 @@ bool nfs4_delegation_flush_on_close(const struct inode *inode) delegation = rcu_dereference(nfsi->delegation); if (delegation == NULL || !(delegation->type & FMODE_WRITE)) goto out; - if (nfsi->nrequests < delegation->pagemod_limit) + if (atomic_long_read(&nfsi->nrequests) < delegation->pagemod_limit) ret = false; out: rcu_read_unlock(); return ret; } + +int nfs4_delegation_hash_alloc(struct nfs_server *server) +{ + int delegation_buckets, i; + + delegation_buckets = roundup_pow_of_two(nfs_delegation_watermark / 16); + server->delegation_hash_mask = delegation_buckets - 1; + server->delegation_hash_table = kmalloc_array(delegation_buckets, + sizeof(*server->delegation_hash_table), GFP_KERNEL); + if (!server->delegation_hash_table) + return -ENOMEM; + for (i = 0; i < delegation_buckets; i++) + INIT_HLIST_HEAD(&server->delegation_hash_table[i]); + return 0; +} diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h index e9d555796873..08ec2e9c68a4 100644 --- a/fs/nfs/delegation.h +++ b/fs/nfs/delegation.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * linux/fs/nfs/delegation.h * @@ -13,14 +14,17 @@ * NFSv4 delegation */ struct nfs_delegation { + struct hlist_node hash; struct list_head super_list; - struct rpc_cred *cred; + const struct cred *cred; struct inode *inode; nfs4_stateid stateid; fmode_t type; unsigned long pagemod_limit; __u64 change_attr; + unsigned long test_gen; unsigned long flags; + refcount_t refcount; spinlock_t lock; struct rcu_head rcu; }; @@ -33,13 +37,22 @@ enum { NFS_DELEGATION_RETURNING, NFS_DELEGATION_REVOKED, NFS_DELEGATION_TEST_EXPIRED, + NFS_DELEGATION_INODE_FREEING, + NFS_DELEGATION_RETURN_DELAYED, + NFS_DELEGATION_DELEGTIME, }; -int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res); -void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res); +int nfs_inode_set_delegation(struct inode *inode, const struct cred *cred, + fmode_t type, const nfs4_stateid *stateid, + unsigned long pagemod_limit, u32 deleg_type); +void nfs_inode_reclaim_delegation(struct inode *inode, const struct cred *cred, + fmode_t type, const nfs4_stateid *stateid, + unsigned long pagemod_limit, u32 deleg_type); int nfs4_inode_return_delegation(struct inode *inode); +void nfs4_inode_return_delegation_on_close(struct inode *inode); +void nfs4_inode_set_return_delegation_on_close(struct inode *inode); int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid); -void nfs_inode_return_delegation_noreclaim(struct inode *inode); +void nfs_inode_evict_delegation(struct inode *inode); struct inode *nfs_delegation_find_inode(struct nfs_client *clp, const struct nfs_fh *fhandle); void nfs_server_return_all_delegations(struct nfs_server *); @@ -49,32 +62,68 @@ void nfs_expire_unreferenced_delegations(struct nfs_client *clp); int nfs_client_return_marked_delegations(struct nfs_client *clp); int nfs_delegations_present(struct nfs_client *clp); void nfs_remove_bad_delegation(struct inode *inode, const nfs4_stateid *stateid); +void nfs_delegation_mark_returned(struct inode *inode, const nfs4_stateid *stateid); void nfs_delegation_mark_reclaim(struct nfs_client *clp); void nfs_delegation_reap_unclaimed(struct nfs_client *clp); void nfs_mark_test_expired_all_delegations(struct nfs_client *clp); +void nfs_test_expired_all_delegations(struct nfs_client *clp); void nfs_reap_expired_delegations(struct nfs_client *clp); /* NFSv4 delegation-related procedures */ -int nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4_stateid *stateid, int issync); -int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state *state, const nfs4_stateid *stateid, fmode_t type); +int nfs4_proc_delegreturn(struct inode *inode, const struct cred *cred, + const nfs4_stateid *stateid, + struct nfs_delegation *delegation, int issync); +int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state *state, const nfs4_stateid *stateid); int nfs4_lock_delegation_recall(struct file_lock *fl, struct nfs4_state *state, const nfs4_stateid *stateid); -bool nfs4_copy_delegation_stateid(struct inode *inode, fmode_t flags, nfs4_stateid *dst, struct rpc_cred **cred); +bool nfs4_copy_delegation_stateid(struct inode *inode, fmode_t flags, nfs4_stateid *dst, const struct cred **cred); +bool nfs4_refresh_delegation_stateid(nfs4_stateid *dst, struct inode *inode); +struct nfs_delegation *nfs4_get_valid_delegation(const struct inode *inode); void nfs_mark_delegation_referenced(struct nfs_delegation *delegation); -int nfs4_have_delegation(struct inode *inode, fmode_t flags); -int nfs4_check_delegation(struct inode *inode, fmode_t flags); +int nfs4_have_delegation(struct inode *inode, fmode_t type, int flags); +int nfs4_check_delegation(struct inode *inode, fmode_t type); bool nfs4_delegation_flush_on_close(const struct inode *inode); void nfs_inode_find_delegation_state_and_recover(struct inode *inode, const nfs4_stateid *stateid); +int nfs4_inode_make_writeable(struct inode *inode); #endif +#define NFS_DELEGATION_FLAG_TIME BIT(1) + +void nfs_update_delegated_atime(struct inode *inode); +void nfs_update_delegated_mtime(struct inode *inode); +void nfs_update_delegated_mtime_locked(struct inode *inode); + +static inline int nfs_have_read_or_write_delegation(struct inode *inode) +{ + return NFS_PROTO(inode)->have_delegation(inode, FMODE_READ, 0); +} + +static inline int nfs_have_write_delegation(struct inode *inode) +{ + return NFS_PROTO(inode)->have_delegation(inode, FMODE_WRITE, 0); +} + static inline int nfs_have_delegated_attributes(struct inode *inode) { - return NFS_PROTO(inode)->have_delegation(inode, FMODE_READ) && - !(NFS_I(inode)->cache_validity & NFS_INO_REVAL_FORCED); + return NFS_PROTO(inode)->have_delegation(inode, FMODE_READ, 0); } +static inline int nfs_have_delegated_atime(struct inode *inode) +{ + return NFS_PROTO(inode)->have_delegation(inode, FMODE_READ, + NFS_DELEGATION_FLAG_TIME); +} + +static inline int nfs_have_delegated_mtime(struct inode *inode) +{ + return NFS_PROTO(inode)->have_delegation(inode, FMODE_WRITE, + NFS_DELEGATION_FLAG_TIME); +} + +int nfs4_delegation_hash_alloc(struct nfs_server *server); + #endif diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 3522b1249019..ea9f6ca8f30f 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * linux/fs/nfs/dir.c * @@ -17,6 +18,7 @@ * 6 Jun 1999 Cache readdir lookups in the page cache. -DaveM */ +#include <linux/compat.h> #include <linux/module.h> #include <linux/time.h> #include <linux/errno.h> @@ -37,6 +39,7 @@ #include <linux/sched.h> #include <linux/kmemleak.h> #include <linux/xattr.h> +#include <linux/hash.h> #include "delegation.h" #include "iostat.h" @@ -52,34 +55,43 @@ static int nfs_closedir(struct inode *, struct file *); static int nfs_readdir(struct file *, struct dir_context *); static int nfs_fsync_dir(struct file *, loff_t, loff_t, int); static loff_t nfs_llseek_dir(struct file *, loff_t, int); -static void nfs_readdir_clear_array(struct page*); +static void nfs_readdir_clear_array(struct folio *); +static int nfs_do_create(struct inode *dir, struct dentry *dentry, + umode_t mode, int open_flags); const struct file_operations nfs_dir_operations = { .llseek = nfs_llseek_dir, .read = generic_read_dir, - .iterate = nfs_readdir, + .iterate_shared = nfs_readdir, .open = nfs_opendir, .release = nfs_closedir, .fsync = nfs_fsync_dir, }; const struct address_space_operations nfs_dir_aops = { - .freepage = nfs_readdir_clear_array, + .free_folio = nfs_readdir_clear_array, }; -static struct nfs_open_dir_context *alloc_nfs_open_dir_context(struct inode *dir, struct rpc_cred *cred) +#define NFS_INIT_DTSIZE PAGE_SIZE + +static struct nfs_open_dir_context * +alloc_nfs_open_dir_context(struct inode *dir) { struct nfs_inode *nfsi = NFS_I(dir); struct nfs_open_dir_context *ctx; - ctx = kmalloc(sizeof(*ctx), GFP_KERNEL); + + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL_ACCOUNT); if (ctx != NULL) { - ctx->duped = 0; ctx->attr_gencount = nfsi->attr_gencount; - ctx->dir_cookie = 0; - ctx->dup_cookie = 0; - ctx->cred = get_rpccred(cred); + ctx->dtsize = NFS_INIT_DTSIZE; spin_lock(&dir->i_lock); - list_add(&ctx->list, &nfsi->open_files); + if (list_empty(&nfsi->open_files) && + (nfsi->cache_validity & NFS_INO_DATA_INVAL_DEFER)) + nfs_set_cache_invalid(dir, + NFS_INO_INVALID_DATA | + NFS_INO_REVAL_FORCED); + list_add_tail_rcu(&ctx->list, &nfsi->open_files); + memcpy(ctx->verf, nfsi->cookieverf, sizeof(ctx->verf)); spin_unlock(&dir->i_lock); return ctx; } @@ -89,10 +101,9 @@ static struct nfs_open_dir_context *alloc_nfs_open_dir_context(struct inode *dir static void put_nfs_open_dir_context(struct inode *dir, struct nfs_open_dir_context *ctx) { spin_lock(&dir->i_lock); - list_del(&ctx->list); + list_del_rcu(&ctx->list); spin_unlock(&dir->i_lock); - put_rpccred(ctx->cred); - kfree(ctx); + kfree_rcu(ctx, rcu_head); } /* @@ -103,30 +114,18 @@ nfs_opendir(struct inode *inode, struct file *filp) { int res = 0; struct nfs_open_dir_context *ctx; - struct rpc_cred *cred; dfprintk(FILE, "NFS: open dir(%pD2)\n", filp); nfs_inc_stats(inode, NFSIOS_VFSOPEN); - cred = rpc_lookup_cred(); - if (IS_ERR(cred)) - return PTR_ERR(cred); - ctx = alloc_nfs_open_dir_context(inode, cred); + ctx = alloc_nfs_open_dir_context(inode); if (IS_ERR(ctx)) { res = PTR_ERR(ctx); goto out; } filp->private_data = ctx; - if (filp->f_path.dentry == filp->f_path.mnt->mnt_root) { - /* This is a mountpoint, so d_revalidate will never - * have been called, so we need to refresh the - * inode (for close-open consistency) ourselves. - */ - __nfs_revalidate_inode(NFS_SERVER(inode), inode); - } out: - put_rpccred(cred); return res; } @@ -140,48 +139,135 @@ nfs_closedir(struct inode *inode, struct file *filp) struct nfs_cache_array_entry { u64 cookie; u64 ino; - struct qstr string; + const char *name; + unsigned int name_len; unsigned char d_type; }; struct nfs_cache_array { - int size; - int eof_index; + u64 change_attr; u64 last_cookie; - struct nfs_cache_array_entry array[0]; + unsigned int size; + unsigned char folio_full : 1, + folio_is_eof : 1, + cookies_are_ordered : 1; + struct nfs_cache_array_entry array[] __counted_by(size); }; -typedef int (*decode_dirent_t)(struct xdr_stream *, struct nfs_entry *, bool); -typedef struct { +struct nfs_readdir_descriptor { struct file *file; - struct page *page; + struct folio *folio; struct dir_context *ctx; - unsigned long page_index; - u64 *dir_cookie; + pgoff_t folio_index; + pgoff_t folio_index_max; + u64 dir_cookie; u64 last_cookie; loff_t current_index; - decode_dirent_t decode; + __be32 verf[NFS_DIR_VERIFIER_SIZE]; + unsigned long dir_verifier; unsigned long timestamp; unsigned long gencount; + unsigned long attr_gencount; unsigned int cache_entry_index; + unsigned int buffer_fills; + unsigned int dtsize; + bool clear_cache; bool plus; + bool eob; bool eof; -} nfs_readdir_descriptor_t; +}; + +static void nfs_set_dtsize(struct nfs_readdir_descriptor *desc, unsigned int sz) +{ + struct nfs_server *server = NFS_SERVER(file_inode(desc->file)); + unsigned int maxsize = server->dtsize; + + if (sz > maxsize) + sz = maxsize; + if (sz < NFS_MIN_FILE_IO_SIZE) + sz = NFS_MIN_FILE_IO_SIZE; + desc->dtsize = sz; +} + +static void nfs_shrink_dtsize(struct nfs_readdir_descriptor *desc) +{ + nfs_set_dtsize(desc, desc->dtsize >> 1); +} + +static void nfs_grow_dtsize(struct nfs_readdir_descriptor *desc) +{ + nfs_set_dtsize(desc, desc->dtsize << 1); +} + +static void nfs_readdir_folio_init_array(struct folio *folio, u64 last_cookie, + u64 change_attr) +{ + struct nfs_cache_array *array; + + array = kmap_local_folio(folio, 0); + array->change_attr = change_attr; + array->last_cookie = last_cookie; + array->size = 0; + array->folio_full = 0; + array->folio_is_eof = 0; + array->cookies_are_ordered = 1; + kunmap_local(array); +} /* * we are freeing strings created by nfs_add_to_readdir_array() */ -static -void nfs_readdir_clear_array(struct page *page) +static void nfs_readdir_clear_array(struct folio *folio) { struct nfs_cache_array *array; - int i; + unsigned int i; - array = kmap_atomic(page); + array = kmap_local_folio(folio, 0); for (i = 0; i < array->size; i++) - kfree(array->array[i].string.name); - kunmap_atomic(array); + kfree(array->array[i].name); + array->size = 0; + kunmap_local(array); +} + +static void nfs_readdir_folio_reinit_array(struct folio *folio, u64 last_cookie, + u64 change_attr) +{ + nfs_readdir_clear_array(folio); + nfs_readdir_folio_init_array(folio, last_cookie, change_attr); +} + +static struct folio * +nfs_readdir_folio_array_alloc(u64 last_cookie, gfp_t gfp_flags) +{ + struct folio *folio = folio_alloc(gfp_flags, 0); + if (folio) + nfs_readdir_folio_init_array(folio, last_cookie, 0); + return folio; +} + +static void nfs_readdir_folio_array_free(struct folio *folio) +{ + if (folio) { + nfs_readdir_clear_array(folio); + folio_put(folio); + } +} + +static u64 nfs_readdir_array_index_cookie(struct nfs_cache_array *array) +{ + return array->size == 0 ? array->last_cookie : array->array[0].cookie; +} + +static void nfs_readdir_array_set_eof(struct nfs_cache_array *array) +{ + array->folio_is_eof = 1; + array->folio_full = 1; +} + +static bool nfs_readdir_array_is_full(struct nfs_cache_array *array) +{ + return array->folio_full; } /* @@ -189,53 +275,229 @@ void nfs_readdir_clear_array(struct page *page) * when called by nfs_readdir_add_to_array, the strings will be freed in * nfs_clear_readdir_array() */ -static -int nfs_readdir_make_qstr(struct qstr *string, const char *name, unsigned int len) +static const char *nfs_readdir_copy_name(const char *name, unsigned int len) { - string->len = len; - string->name = kmemdup(name, len, GFP_KERNEL); - if (string->name == NULL) - return -ENOMEM; + const char *ret = kmemdup_nul(name, len, GFP_KERNEL); + /* * Avoid a kmemleak false positive. The pointer to the name is stored * in a page cache page which kmemleak does not scan. */ - kmemleak_not_leak(string->name); - string->hash = full_name_hash(NULL, name, len); + if (ret != NULL) + kmemleak_not_leak(ret); + return ret; +} + +static size_t nfs_readdir_array_maxentries(void) +{ + return (PAGE_SIZE - sizeof(struct nfs_cache_array)) / + sizeof(struct nfs_cache_array_entry); +} + +/* + * Check that the next array entry lies entirely within the page bounds + */ +static int nfs_readdir_array_can_expand(struct nfs_cache_array *array) +{ + if (array->folio_full) + return -ENOSPC; + if (array->size == nfs_readdir_array_maxentries()) { + array->folio_full = 1; + return -ENOSPC; + } return 0; } -static -int nfs_readdir_add_to_array(struct nfs_entry *entry, struct page *page) +static int nfs_readdir_folio_array_append(struct folio *folio, + const struct nfs_entry *entry, + u64 *cookie) { - struct nfs_cache_array *array = kmap(page); + struct nfs_cache_array *array; struct nfs_cache_array_entry *cache_entry; - int ret; + const char *name; + int ret = -ENOMEM; - cache_entry = &array->array[array->size]; + name = nfs_readdir_copy_name(entry->name, entry->len); - /* Check that this entry lies within the page bounds */ - ret = -ENOSPC; - if ((char *)&cache_entry[1] - (char *)page_address(page) > PAGE_SIZE) + array = kmap_local_folio(folio, 0); + if (!name) + goto out; + ret = nfs_readdir_array_can_expand(array); + if (ret) { + kfree(name); goto out; + } - cache_entry->cookie = entry->prev_cookie; + array->size++; + cache_entry = &array->array[array->size - 1]; + cache_entry->cookie = array->last_cookie; cache_entry->ino = entry->ino; cache_entry->d_type = entry->d_type; - ret = nfs_readdir_make_qstr(&cache_entry->string, entry->name, entry->len); - if (ret) - goto out; + cache_entry->name_len = entry->len; + cache_entry->name = name; array->last_cookie = entry->cookie; - array->size++; + if (array->last_cookie <= cache_entry->cookie) + array->cookies_are_ordered = 0; if (entry->eof != 0) - array->eof_index = array->size; + nfs_readdir_array_set_eof(array); out: - kunmap(page); + *cookie = array->last_cookie; + kunmap_local(array); return ret; } +#define NFS_READDIR_COOKIE_MASK (U32_MAX >> 14) +/* + * Hash algorithm allowing content addressible access to sequences + * of directory cookies. Content is addressed by the value of the + * cookie index of the first readdir entry in a page. + * + * We select only the first 18 bits to avoid issues with excessive + * memory use for the page cache XArray. 18 bits should allow the caching + * of 262144 pages of sequences of readdir entries. Since each page holds + * 127 readdir entries for a typical 64-bit system, that works out to a + * cache of ~ 33 million entries per directory. + */ +static pgoff_t nfs_readdir_folio_cookie_hash(u64 cookie) +{ + if (cookie == 0) + return 0; + return hash_64(cookie, 18); +} + +static bool nfs_readdir_folio_validate(struct folio *folio, u64 last_cookie, + u64 change_attr) +{ + struct nfs_cache_array *array = kmap_local_folio(folio, 0); + int ret = true; + + if (array->change_attr != change_attr) + ret = false; + if (nfs_readdir_array_index_cookie(array) != last_cookie) + ret = false; + kunmap_local(array); + return ret; +} + +static void nfs_readdir_folio_unlock_and_put(struct folio *folio) +{ + folio_unlock(folio); + folio_put(folio); +} + +static void nfs_readdir_folio_init_and_validate(struct folio *folio, u64 cookie, + u64 change_attr) +{ + if (folio_test_uptodate(folio)) { + if (nfs_readdir_folio_validate(folio, cookie, change_attr)) + return; + nfs_readdir_clear_array(folio); + } + nfs_readdir_folio_init_array(folio, cookie, change_attr); + folio_mark_uptodate(folio); +} + +static struct folio *nfs_readdir_folio_get_locked(struct address_space *mapping, + u64 cookie, u64 change_attr) +{ + pgoff_t index = nfs_readdir_folio_cookie_hash(cookie); + struct folio *folio; + + folio = filemap_grab_folio(mapping, index); + if (IS_ERR(folio)) + return NULL; + nfs_readdir_folio_init_and_validate(folio, cookie, change_attr); + return folio; +} + +static u64 nfs_readdir_folio_last_cookie(struct folio *folio) +{ + struct nfs_cache_array *array; + u64 ret; + + array = kmap_local_folio(folio, 0); + ret = array->last_cookie; + kunmap_local(array); + return ret; +} + +static bool nfs_readdir_folio_needs_filling(struct folio *folio) +{ + struct nfs_cache_array *array; + bool ret; + + array = kmap_local_folio(folio, 0); + ret = !nfs_readdir_array_is_full(array); + kunmap_local(array); + return ret; +} + +static void nfs_readdir_folio_set_eof(struct folio *folio) +{ + struct nfs_cache_array *array; + + array = kmap_local_folio(folio, 0); + nfs_readdir_array_set_eof(array); + kunmap_local(array); +} + +static struct folio *nfs_readdir_folio_get_next(struct address_space *mapping, + u64 cookie, u64 change_attr) +{ + pgoff_t index = nfs_readdir_folio_cookie_hash(cookie); + struct folio *folio; + + folio = __filemap_get_folio(mapping, index, + FGP_LOCK|FGP_CREAT|FGP_NOFS|FGP_NOWAIT, + mapping_gfp_mask(mapping)); + if (IS_ERR(folio)) + return NULL; + nfs_readdir_folio_init_and_validate(folio, cookie, change_attr); + if (nfs_readdir_folio_last_cookie(folio) != cookie) + nfs_readdir_folio_reinit_array(folio, cookie, change_attr); + return folio; +} + +static inline +int is_32bit_api(void) +{ +#ifdef CONFIG_COMPAT + return in_compat_syscall(); +#else + return (BITS_PER_LONG == 32); +#endif +} + static -int nfs_readdir_search_for_pos(struct nfs_cache_array *array, nfs_readdir_descriptor_t *desc) +bool nfs_readdir_use_cookie(const struct file *filp) +{ + if ((filp->f_mode & FMODE_32BITHASH) || + (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api())) + return false; + return true; +} + +static void nfs_readdir_seek_next_array(struct nfs_cache_array *array, + struct nfs_readdir_descriptor *desc) +{ + if (array->folio_full) { + desc->last_cookie = array->last_cookie; + desc->current_index += array->size; + desc->cache_entry_index = 0; + desc->folio_index++; + } else + desc->last_cookie = nfs_readdir_array_index_cookie(array); +} + +static void nfs_readdir_rewind_search(struct nfs_readdir_descriptor *desc) +{ + desc->current_index = 0; + desc->last_cookie = 0; + desc->folio_index = 0; +} + +static int nfs_readdir_search_for_pos(struct nfs_cache_array *array, + struct nfs_readdir_descriptor *desc) { loff_t diff = desc->ctx->pos - desc->current_index; unsigned int index; @@ -243,119 +505,111 @@ int nfs_readdir_search_for_pos(struct nfs_cache_array *array, nfs_readdir_descri if (diff < 0) goto out_eof; if (diff >= array->size) { - if (array->eof_index >= 0) + if (array->folio_is_eof) goto out_eof; + nfs_readdir_seek_next_array(array, desc); return -EAGAIN; } index = (unsigned int)diff; - *desc->dir_cookie = array->array[index].cookie; + desc->dir_cookie = array->array[index].cookie; desc->cache_entry_index = index; return 0; out_eof: - desc->eof = 1; + desc->eof = true; return -EBADCOOKIE; } -static bool -nfs_readdir_inode_mapping_valid(struct nfs_inode *nfsi) +static bool nfs_readdir_array_cookie_in_range(struct nfs_cache_array *array, + u64 cookie) { - if (nfsi->cache_validity & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA)) + if (!array->cookies_are_ordered) + return true; + /* Optimisation for monotonically increasing cookies */ + if (cookie >= array->last_cookie) return false; - smp_rmb(); - return !test_bit(NFS_INO_INVALIDATING, &nfsi->flags); + if (array->size && cookie < array->array[0].cookie) + return false; + return true; } -static -int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_descriptor_t *desc) +static int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, + struct nfs_readdir_descriptor *desc) { - int i; - loff_t new_pos; + unsigned int i; int status = -EAGAIN; + if (!nfs_readdir_array_cookie_in_range(array, desc->dir_cookie)) + goto check_eof; + for (i = 0; i < array->size; i++) { - if (array->array[i].cookie == *desc->dir_cookie) { - struct nfs_inode *nfsi = NFS_I(file_inode(desc->file)); - struct nfs_open_dir_context *ctx = desc->file->private_data; - - new_pos = desc->current_index + i; - if (ctx->attr_gencount != nfsi->attr_gencount || - !nfs_readdir_inode_mapping_valid(nfsi)) { - ctx->duped = 0; - ctx->attr_gencount = nfsi->attr_gencount; - } else if (new_pos < desc->ctx->pos) { - if (ctx->duped > 0 - && ctx->dup_cookie == *desc->dir_cookie) { - if (printk_ratelimit()) { - pr_notice("NFS: directory %pD2 contains a readdir loop." - "Please contact your server vendor. " - "The file: %.*s has duplicate cookie %llu\n", - desc->file, array->array[i].string.len, - array->array[i].string.name, *desc->dir_cookie); - } - status = -ELOOP; - goto out; - } - ctx->dup_cookie = *desc->dir_cookie; - ctx->duped = -1; - } - desc->ctx->pos = new_pos; + if (array->array[i].cookie == desc->dir_cookie) { + if (nfs_readdir_use_cookie(desc->file)) + desc->ctx->pos = desc->dir_cookie; + else + desc->ctx->pos = desc->current_index + i; desc->cache_entry_index = i; return 0; } } - if (array->eof_index >= 0) { +check_eof: + if (array->folio_is_eof) { status = -EBADCOOKIE; - if (*desc->dir_cookie == array->last_cookie) - desc->eof = 1; - } -out: + if (desc->dir_cookie == array->last_cookie) + desc->eof = true; + } else + nfs_readdir_seek_next_array(array, desc); return status; } -static -int nfs_readdir_search_array(nfs_readdir_descriptor_t *desc) +static int nfs_readdir_search_array(struct nfs_readdir_descriptor *desc) { struct nfs_cache_array *array; int status; - array = kmap(desc->page); + array = kmap_local_folio(desc->folio, 0); - if (*desc->dir_cookie == 0) + if (desc->dir_cookie == 0) status = nfs_readdir_search_for_pos(array, desc); else status = nfs_readdir_search_for_cookie(array, desc); - if (status == -EAGAIN) { - desc->last_cookie = array->last_cookie; - desc->current_index += array->size; - desc->page_index++; - } - kunmap(desc->page); + kunmap_local(array); return status; } /* Fill a page with xdr information before transferring to the cache page */ -static -int nfs_readdir_xdr_filler(struct page **pages, nfs_readdir_descriptor_t *desc, - struct nfs_entry *entry, struct file *file, struct inode *inode) +static int nfs_readdir_xdr_filler(struct nfs_readdir_descriptor *desc, + __be32 *verf, u64 cookie, + struct page **pages, size_t bufsize, + __be32 *verf_res) { - struct nfs_open_dir_context *ctx = file->private_data; - struct rpc_cred *cred = ctx->cred; + struct inode *inode = file_inode(desc->file); + struct nfs_readdir_arg arg = { + .dentry = file_dentry(desc->file), + .cred = desc->file->f_cred, + .verf = verf, + .cookie = cookie, + .pages = pages, + .page_len = bufsize, + .plus = desc->plus, + }; + struct nfs_readdir_res res = { + .verf = verf_res, + }; unsigned long timestamp, gencount; int error; again: timestamp = jiffies; gencount = nfs_inc_attr_generation_counter(); - error = NFS_PROTO(inode)->readdir(file_dentry(file), cred, entry->cookie, pages, - NFS_SERVER(inode)->dtsize, desc->plus); + desc->dir_verifier = nfs_save_change_attribute(inode); + error = NFS_PROTO(inode)->readdir(&arg, &res); if (error < 0) { /* We requested READDIRPLUS, but the server doesn't grok it */ if (error == -ENOTSUPP && desc->plus) { NFS_SERVER(inode)->caps &= ~NFS_CAP_READDIRPLUS; - clear_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags); - desc->plus = false; + desc->plus = arg.plus = false; goto again; } goto error; @@ -366,12 +620,13 @@ error: return error; } -static int xdr_decode(nfs_readdir_descriptor_t *desc, +static int xdr_decode(struct nfs_readdir_descriptor *desc, struct nfs_entry *entry, struct xdr_stream *xdr) { + struct inode *inode = file_inode(desc->file); int error; - error = desc->decode(xdr, entry, desc->plus); + error = NFS_PROTO(inode)->decode_dirent(xdr, entry, desc->plus); if (error) return error; entry->fattr->time_start = desc->timestamp; @@ -403,59 +658,78 @@ int nfs_same_file(struct dentry *dentry, struct nfs_entry *entry) return 1; } -static -bool nfs_use_readdirplus(struct inode *dir, struct dir_context *ctx) +#define NFS_READDIR_CACHE_USAGE_THRESHOLD (8UL) + +static bool nfs_use_readdirplus(struct inode *dir, struct dir_context *ctx, + unsigned int cache_hits, + unsigned int cache_misses) { if (!nfs_server_capable(dir, NFS_CAP_READDIRPLUS)) return false; - if (test_and_clear_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(dir)->flags)) + if (NFS_SERVER(dir)->flags & NFS_MOUNT_FORCE_RDIRPLUS) return true; - if (ctx->pos == 0) + if (ctx->pos == 0 || + cache_hits + cache_misses > NFS_READDIR_CACHE_USAGE_THRESHOLD) return true; return false; } /* - * This function is called by the lookup and getattr code to request the + * This function is called by the getattr code to request the * use of readdirplus to accelerate any future lookups in the same * directory. */ -void nfs_advise_use_readdirplus(struct inode *dir) +void nfs_readdir_record_entry_cache_hit(struct inode *dir) { struct nfs_inode *nfsi = NFS_I(dir); + struct nfs_open_dir_context *ctx; if (nfs_server_capable(dir, NFS_CAP_READDIRPLUS) && - !list_empty(&nfsi->open_files)) - set_bit(NFS_INO_ADVISE_RDPLUS, &nfsi->flags); + S_ISDIR(dir->i_mode)) { + rcu_read_lock(); + list_for_each_entry_rcu (ctx, &nfsi->open_files, list) + atomic_inc(&ctx->cache_hits); + rcu_read_unlock(); + } } /* * This function is mainly for use by nfs_getattr(). * * If this is an 'ls -l', we want to force use of readdirplus. - * Do this by checking if there is an active file descriptor - * and calling nfs_advise_use_readdirplus, then forcing a - * cache flush. */ -void nfs_force_use_readdirplus(struct inode *dir) +void nfs_readdir_record_entry_cache_miss(struct inode *dir) { struct nfs_inode *nfsi = NFS_I(dir); + struct nfs_open_dir_context *ctx; if (nfs_server_capable(dir, NFS_CAP_READDIRPLUS) && - !list_empty(&nfsi->open_files)) { - set_bit(NFS_INO_ADVISE_RDPLUS, &nfsi->flags); - invalidate_mapping_pages(dir->i_mapping, 0, -1); + S_ISDIR(dir->i_mode)) { + rcu_read_lock(); + list_for_each_entry_rcu (ctx, &nfsi->open_files, list) + atomic_inc(&ctx->cache_misses); + rcu_read_unlock(); } } +static void nfs_lookup_advise_force_readdirplus(struct inode *dir, + unsigned int flags) +{ + if (nfs_server_capable(dir, NFS_CAP_CASE_INSENSITIVE)) + return; + if (flags & (LOOKUP_EXCL | LOOKUP_PARENT | LOOKUP_REVAL)) + return; + nfs_readdir_record_entry_cache_miss(dir); +} + static -void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry) +void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry, + unsigned long dir_verifier) { struct qstr filename = QSTR_INIT(entry->name, entry->len); DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); struct dentry *dentry; struct dentry *alias; - struct inode *dir = d_inode(parent); struct inode *inode; int status; @@ -494,12 +768,16 @@ again: if (nfs_same_file(dentry, entry)) { if (!entry->fh->size) goto out; - nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); + nfs_set_verifier(dentry, dir_verifier); status = nfs_refresh_inode(d_inode(dentry), entry->fattr); if (!status) - nfs_setsecurity(d_inode(dentry), entry->fattr, entry->label); + nfs_setsecurity(d_inode(dentry), entry->fattr); + trace_nfs_readdir_lookup_revalidate(d_inode(parent), + dentry, 0, status); goto out; } else { + trace_nfs_readdir_lookup_revalidate_failed( + d_inode(parent), dentry, 0); d_invalidate(dentry); dput(dentry); dentry = NULL; @@ -511,7 +789,7 @@ again: goto out; } - inode = nfs_fhget(dentry->d_sb, entry->fh, entry->fattr, entry->label); + inode = nfs_fhget(dentry->d_sb, entry->fh, entry->fattr); alias = d_splice_alias(inode, dentry); d_lookup_done(dentry); if (alias) { @@ -520,266 +798,337 @@ again: dput(dentry); dentry = alias; } - nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); + nfs_set_verifier(dentry, dir_verifier); + trace_nfs_readdir_lookup(d_inode(parent), dentry, 0); out: dput(dentry); } -/* Perform conversion from xdr to cache array */ -static -int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *entry, - struct page **xdr_pages, struct page *page, unsigned int buflen) +static int nfs_readdir_entry_decode(struct nfs_readdir_descriptor *desc, + struct nfs_entry *entry, + struct xdr_stream *stream) { + int ret; + + if (entry->fattr->label) + entry->fattr->label->len = NFS4_MAXLABELLEN; + ret = xdr_decode(desc, entry, stream); + if (ret || !desc->plus) + return ret; + nfs_prime_dcache(file_dentry(desc->file), entry, desc->dir_verifier); + return 0; +} + +/* Perform conversion from xdr to cache array */ +static int nfs_readdir_folio_filler(struct nfs_readdir_descriptor *desc, + struct nfs_entry *entry, + struct page **xdr_pages, unsigned int buflen, + struct folio **arrays, size_t narrays, + u64 change_attr) +{ + struct address_space *mapping = desc->file->f_mapping; + struct folio *new, *folio = *arrays; struct xdr_stream stream; + struct folio *scratch; struct xdr_buf buf; - struct page *scratch; - struct nfs_cache_array *array; - unsigned int count = 0; + u64 cookie; int status; - scratch = alloc_page(GFP_KERNEL); + scratch = folio_alloc(GFP_KERNEL, 0); if (scratch == NULL) return -ENOMEM; - if (buflen == 0) - goto out_nopages; - xdr_init_decode_pages(&stream, &buf, xdr_pages, buflen); - xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE); + xdr_set_scratch_folio(&stream, scratch); do { - status = xdr_decode(desc, entry, &stream); - if (status != 0) { - if (status == -EAGAIN) - status = 0; + status = nfs_readdir_entry_decode(desc, entry, &stream); + if (status != 0) break; - } - count++; + status = nfs_readdir_folio_array_append(folio, entry, &cookie); + if (status != -ENOSPC) + continue; - if (desc->plus) - nfs_prime_dcache(file_dentry(desc->file), entry); + if (folio->mapping != mapping) { + if (!--narrays) + break; + new = nfs_readdir_folio_array_alloc(cookie, GFP_KERNEL); + if (!new) + break; + arrays++; + *arrays = folio = new; + } else { + new = nfs_readdir_folio_get_next(mapping, cookie, + change_attr); + if (!new) + break; + if (folio != *arrays) + nfs_readdir_folio_unlock_and_put(folio); + folio = new; + } + desc->folio_index_max++; + status = nfs_readdir_folio_array_append(folio, entry, &cookie); + } while (!status && !entry->eof); - status = nfs_readdir_add_to_array(entry, page); - if (status != 0) + switch (status) { + case -EBADCOOKIE: + if (!entry->eof) break; - } while (!entry->eof); - -out_nopages: - if (count == 0 || (status == -EBADCOOKIE && entry->eof != 0)) { - array = kmap(page); - array->eof_index = array->size; + nfs_readdir_folio_set_eof(folio); + fallthrough; + case -EAGAIN: + status = 0; + break; + case -ENOSPC: status = 0; - kunmap(page); + if (!desc->plus) + break; + while (!nfs_readdir_entry_decode(desc, entry, &stream)) + ; } - put_page(scratch); + if (folio != *arrays) + nfs_readdir_folio_unlock_and_put(folio); + + folio_put(scratch); return status; } -static -void nfs_readdir_free_pages(struct page **pages, unsigned int npages) +static void nfs_readdir_free_pages(struct page **pages, size_t npages) { - unsigned int i; - for (i = 0; i < npages; i++) - put_page(pages[i]); + while (npages--) + put_page(pages[npages]); + kfree(pages); } /* - * nfs_readdir_large_page will allocate pages that must be freed with a call - * to nfs_readdir_free_pagearray + * nfs_readdir_alloc_pages() will allocate pages that must be freed with a call + * to nfs_readdir_free_pages() */ -static -int nfs_readdir_alloc_pages(struct page **pages, unsigned int npages) +static struct page **nfs_readdir_alloc_pages(size_t npages) { - unsigned int i; + struct page **pages; + size_t i; + pages = kmalloc_array(npages, sizeof(*pages), GFP_KERNEL); + if (!pages) + return NULL; for (i = 0; i < npages; i++) { struct page *page = alloc_page(GFP_KERNEL); if (page == NULL) goto out_freepages; pages[i] = page; } - return 0; + return pages; out_freepages: nfs_readdir_free_pages(pages, i); - return -ENOMEM; + return NULL; } -static -int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page, struct inode *inode) +static int nfs_readdir_xdr_to_array(struct nfs_readdir_descriptor *desc, + __be32 *verf_arg, __be32 *verf_res, + struct folio **arrays, size_t narrays) { - struct page *pages[NFS_MAX_READDIR_PAGES]; - struct nfs_entry entry; - struct file *file = desc->file; - struct nfs_cache_array *array; + u64 change_attr; + struct page **pages; + struct folio *folio = *arrays; + struct nfs_entry *entry; + size_t array_size; + struct inode *inode = file_inode(desc->file); + unsigned int dtsize = desc->dtsize; + unsigned int pglen; int status = -ENOMEM; - unsigned int array_size = ARRAY_SIZE(pages); - - entry.prev_cookie = 0; - entry.cookie = desc->last_cookie; - entry.eof = 0; - entry.fh = nfs_alloc_fhandle(); - entry.fattr = nfs_alloc_fattr(); - entry.server = NFS_SERVER(inode); - if (entry.fh == NULL || entry.fattr == NULL) - goto out; - entry.label = nfs4_label_alloc(NFS_SERVER(inode), GFP_NOWAIT); - if (IS_ERR(entry.label)) { - status = PTR_ERR(entry.label); + entry = kzalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) + return -ENOMEM; + entry->cookie = nfs_readdir_folio_last_cookie(folio); + entry->fh = nfs_alloc_fhandle(); + entry->fattr = nfs_alloc_fattr_with_label(NFS_SERVER(inode)); + entry->server = NFS_SERVER(inode); + if (entry->fh == NULL || entry->fattr == NULL) goto out; - } - array = kmap(page); - memset(array, 0, sizeof(struct nfs_cache_array)); - array->eof_index = -1; + array_size = (dtsize + PAGE_SIZE - 1) >> PAGE_SHIFT; + pages = nfs_readdir_alloc_pages(array_size); + if (!pages) + goto out; - status = nfs_readdir_alloc_pages(pages, array_size); + change_attr = inode_peek_iversion_raw(inode); + status = nfs_readdir_xdr_filler(desc, verf_arg, entry->cookie, pages, + dtsize, verf_res); if (status < 0) - goto out_release_array; - do { - unsigned int pglen; - status = nfs_readdir_xdr_filler(pages, desc, &entry, file, inode); + goto free_pages; - if (status < 0) - break; - pglen = status; - status = nfs_readdir_page_filler(desc, &entry, pages, page, pglen); - if (status < 0) { - if (status == -ENOSPC) - status = 0; - break; - } - } while (array->eof_index < 0); + pglen = status; + if (pglen != 0) + status = nfs_readdir_folio_filler(desc, entry, pages, pglen, + arrays, narrays, change_attr); + else + nfs_readdir_folio_set_eof(folio); + desc->buffer_fills++; +free_pages: nfs_readdir_free_pages(pages, array_size); -out_release_array: - kunmap(page); - nfs4_label_free(entry.label); out: - nfs_free_fattr(entry.fattr); - nfs_free_fhandle(entry.fh); + nfs_free_fattr(entry->fattr); + nfs_free_fhandle(entry->fh); + kfree(entry); return status; } -/* - * Now we cache directories properly, by converting xdr information - * to an array that can be used for lookups later. This results in - * fewer cache pages, since we can store more information on each page. - * We only need to convert from xdr once so future lookups are much simpler - */ -static -int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page* page) +static void nfs_readdir_folio_put(struct nfs_readdir_descriptor *desc) { - struct inode *inode = file_inode(desc->file); - int ret; - - ret = nfs_readdir_xdr_to_array(desc, page, inode); - if (ret < 0) - goto error; - SetPageUptodate(page); - - if (invalidate_inode_pages2_range(inode->i_mapping, page->index + 1, -1) < 0) { - /* Should never happen */ - nfs_zap_mapping(inode, inode->i_mapping); - } - unlock_page(page); - return 0; - error: - unlock_page(page); - return ret; + folio_put(desc->folio); + desc->folio = NULL; } -static -void cache_page_release(nfs_readdir_descriptor_t *desc) +static void +nfs_readdir_folio_unlock_and_put_cached(struct nfs_readdir_descriptor *desc) { - if (!desc->page->mapping) - nfs_readdir_clear_array(desc->page); - put_page(desc->page); - desc->page = NULL; + folio_unlock(desc->folio); + nfs_readdir_folio_put(desc); } -static -struct page *get_cache_page(nfs_readdir_descriptor_t *desc) +static struct folio * +nfs_readdir_folio_get_cached(struct nfs_readdir_descriptor *desc) { - return read_cache_page(desc->file->f_mapping, - desc->page_index, (filler_t *)nfs_readdir_filler, desc); + struct address_space *mapping = desc->file->f_mapping; + u64 change_attr = inode_peek_iversion_raw(mapping->host); + u64 cookie = desc->last_cookie; + struct folio *folio; + + folio = nfs_readdir_folio_get_locked(mapping, cookie, change_attr); + if (!folio) + return NULL; + if (desc->clear_cache && !nfs_readdir_folio_needs_filling(folio)) + nfs_readdir_folio_reinit_array(folio, cookie, change_attr); + return folio; } /* * Returns 0 if desc->dir_cookie was found on page desc->page_index + * and locks the page to prevent removal from the page cache. */ -static -int find_cache_page(nfs_readdir_descriptor_t *desc) +static int find_and_lock_cache_page(struct nfs_readdir_descriptor *desc) { + struct inode *inode = file_inode(desc->file); + struct nfs_inode *nfsi = NFS_I(inode); + __be32 verf[NFS_DIR_VERIFIER_SIZE]; int res; - desc->page = get_cache_page(desc); - if (IS_ERR(desc->page)) - return PTR_ERR(desc->page); - + desc->folio = nfs_readdir_folio_get_cached(desc); + if (!desc->folio) + return -ENOMEM; + if (nfs_readdir_folio_needs_filling(desc->folio)) { + /* Grow the dtsize if we had to go back for more pages */ + if (desc->folio_index == desc->folio_index_max) + nfs_grow_dtsize(desc); + desc->folio_index_max = desc->folio_index; + trace_nfs_readdir_cache_fill(desc->file, nfsi->cookieverf, + desc->last_cookie, + desc->folio->index, desc->dtsize); + res = nfs_readdir_xdr_to_array(desc, nfsi->cookieverf, verf, + &desc->folio, 1); + if (res < 0) { + nfs_readdir_folio_unlock_and_put_cached(desc); + trace_nfs_readdir_cache_fill_done(inode, res); + if (res == -EBADCOOKIE || res == -ENOTSYNC) { + invalidate_inode_pages2(desc->file->f_mapping); + nfs_readdir_rewind_search(desc); + trace_nfs_readdir_invalidate_cache_range( + inode, 0, MAX_LFS_FILESIZE); + return -EAGAIN; + } + return res; + } + /* + * Set the cookie verifier if the page cache was empty + */ + if (desc->last_cookie == 0 && + memcmp(nfsi->cookieverf, verf, sizeof(nfsi->cookieverf))) { + memcpy(nfsi->cookieverf, verf, + sizeof(nfsi->cookieverf)); + invalidate_inode_pages2_range(desc->file->f_mapping, 1, + -1); + trace_nfs_readdir_invalidate_cache_range( + inode, 1, MAX_LFS_FILESIZE); + } + desc->clear_cache = false; + } res = nfs_readdir_search_array(desc); - if (res != 0) - cache_page_release(desc); + if (res == 0) + return 0; + nfs_readdir_folio_unlock_and_put_cached(desc); return res; } /* Search for desc->dir_cookie from the beginning of the page cache */ -static inline -int readdir_search_pagecache(nfs_readdir_descriptor_t *desc) +static int readdir_search_pagecache(struct nfs_readdir_descriptor *desc) { int res; - if (desc->page_index == 0) { - desc->current_index = 0; - desc->last_cookie = 0; - } do { - res = find_cache_page(desc); + res = find_and_lock_cache_page(desc); } while (res == -EAGAIN); return res; } +#define NFS_READDIR_CACHE_MISS_THRESHOLD (16UL) + /* * Once we've found the start of the dirent within a page: fill 'er up... */ -static -int nfs_do_filldir(nfs_readdir_descriptor_t *desc) +static void nfs_do_filldir(struct nfs_readdir_descriptor *desc, + const __be32 *verf) { struct file *file = desc->file; - int i = 0; - int res = 0; - struct nfs_cache_array *array = NULL; - struct nfs_open_dir_context *ctx = file->private_data; + struct nfs_cache_array *array; + unsigned int i; + bool first_emit = !desc->dir_cookie; - array = kmap(desc->page); + array = kmap_local_folio(desc->folio, 0); for (i = desc->cache_entry_index; i < array->size; i++) { struct nfs_cache_array_entry *ent; + /* + * nfs_readdir_handle_cache_misses return force clear at + * (cache_misses > NFS_READDIR_CACHE_MISS_THRESHOLD) for + * readdir heuristic, NFS_READDIR_CACHE_MISS_THRESHOLD + 1 + * entries need be emitted here. + */ + if (first_emit && i > NFS_READDIR_CACHE_MISS_THRESHOLD + 2) { + desc->eob = true; + break; + } + ent = &array->array[i]; - if (!dir_emit(desc->ctx, ent->string.name, ent->string.len, + if (!dir_emit(desc->ctx, ent->name, ent->name_len, nfs_compat_user_ino64(ent->ino), ent->d_type)) { - desc->eof = 1; + desc->eob = true; break; } - desc->ctx->pos++; - if (i < (array->size-1)) - *desc->dir_cookie = array->array[i+1].cookie; + memcpy(desc->verf, verf, sizeof(desc->verf)); + if (i == array->size - 1) { + desc->dir_cookie = array->last_cookie; + nfs_readdir_seek_next_array(array, desc); + } else { + desc->dir_cookie = array->array[i + 1].cookie; + desc->last_cookie = array->array[0].cookie; + } + if (nfs_readdir_use_cookie(file)) + desc->ctx->pos = desc->dir_cookie; else - *desc->dir_cookie = array->last_cookie; - if (ctx->duped != 0) - ctx->duped = 1; + desc->ctx->pos++; } - if (array->eof_index >= 0) - desc->eof = 1; + if (array->folio_is_eof) + desc->eof = !desc->eob; - kunmap(desc->page); - cache_page_release(desc); - dfprintk(DIRCACHE, "NFS: nfs_do_filldir() filling ended @ cookie %Lu; returning = %d\n", - (unsigned long long)*desc->dir_cookie, res); - return res; + kunmap_local(array); + dfprintk(DIRCACHE, "NFS: nfs_do_filldir() filling ended @ cookie %llu\n", + (unsigned long long)desc->dir_cookie); } /* @@ -794,41 +1143,77 @@ int nfs_do_filldir(nfs_readdir_descriptor_t *desc) * we should already have a complete representation of the * directory in the page cache by the time we get here. */ -static inline -int uncached_readdir(nfs_readdir_descriptor_t *desc) +static int uncached_readdir(struct nfs_readdir_descriptor *desc) { - struct page *page = NULL; - int status; - struct inode *inode = file_inode(desc->file); - struct nfs_open_dir_context *ctx = desc->file->private_data; + struct folio **arrays; + size_t i, sz = 512; + __be32 verf[NFS_DIR_VERIFIER_SIZE]; + int status = -ENOMEM; - dfprintk(DIRCACHE, "NFS: uncached_readdir() searching for cookie %Lu\n", - (unsigned long long)*desc->dir_cookie); + dfprintk(DIRCACHE, "NFS: uncached_readdir() searching for cookie %llu\n", + (unsigned long long)desc->dir_cookie); - page = alloc_page(GFP_HIGHUSER); - if (!page) { - status = -ENOMEM; + arrays = kcalloc(sz, sizeof(*arrays), GFP_KERNEL); + if (!arrays) + goto out; + arrays[0] = nfs_readdir_folio_array_alloc(desc->dir_cookie, GFP_KERNEL); + if (!arrays[0]) goto out; - } - desc->page_index = 0; - desc->last_cookie = *desc->dir_cookie; - desc->page = page; - ctx->duped = 0; + desc->folio_index = 0; + desc->cache_entry_index = 0; + desc->last_cookie = desc->dir_cookie; + desc->folio_index_max = 0; - status = nfs_readdir_xdr_to_array(desc, page, inode); - if (status < 0) - goto out_release; + trace_nfs_readdir_uncached(desc->file, desc->verf, desc->last_cookie, + -1, desc->dtsize); - status = nfs_do_filldir(desc); + status = nfs_readdir_xdr_to_array(desc, desc->verf, verf, arrays, sz); + if (status < 0) { + trace_nfs_readdir_uncached_done(file_inode(desc->file), status); + goto out_free; + } + + for (i = 0; !desc->eob && i < sz && arrays[i]; i++) { + desc->folio = arrays[i]; + nfs_do_filldir(desc, verf); + } + desc->folio = NULL; - out: - dfprintk(DIRCACHE, "NFS: %s: returns %d\n", - __func__, status); + /* + * Grow the dtsize if we have to go back for more pages, + * or shrink it if we're reading too many. + */ + if (!desc->eof) { + if (!desc->eob) + nfs_grow_dtsize(desc); + else if (desc->buffer_fills == 1 && + i < (desc->folio_index_max >> 1)) + nfs_shrink_dtsize(desc); + } +out_free: + for (i = 0; i < sz && arrays[i]; i++) + nfs_readdir_folio_array_free(arrays[i]); +out: + if (!nfs_readdir_use_cookie(desc->file)) + nfs_readdir_rewind_search(desc); + desc->folio_index_max = -1; + kfree(arrays); + dfprintk(DIRCACHE, "NFS: %s: returns %d\n", __func__, status); return status; - out_release: - cache_page_release(desc); - goto out; +} + +static bool nfs_readdir_handle_cache_misses(struct inode *inode, + struct nfs_readdir_descriptor *desc, + unsigned int cache_misses, + bool force_clear) +{ + if (desc->ctx->pos == 0 || !desc->plus) + return false; + if (cache_misses <= NFS_READDIR_CACHE_MISS_THRESHOLD && !force_clear) + return false; + trace_nfs_readdir_force_readdirplus(inode); + return true; } /* The file offset position represents the dirent entry number. A @@ -839,10 +1224,12 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx) { struct dentry *dentry = file_dentry(file); struct inode *inode = d_inode(dentry); - nfs_readdir_descriptor_t my_desc, - *desc = &my_desc; + struct nfs_inode *nfsi = NFS_I(inode); struct nfs_open_dir_context *dir_ctx = file->private_data; - int res = 0; + struct nfs_readdir_descriptor *desc; + unsigned int cache_hits, cache_misses; + bool force_clear; + int res; dfprintk(FILE, "NFS: readdir(%pD2) starting at cookie %llu\n", file, (long long)ctx->pos); @@ -854,18 +1241,38 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx) * to either find the entry with the appropriate number or * revalidate the cookie. */ - memset(desc, 0, sizeof(*desc)); + nfs_revalidate_mapping(inode, file->f_mapping); + res = -ENOMEM; + desc = kzalloc(sizeof(*desc), GFP_KERNEL); + if (!desc) + goto out; desc->file = file; desc->ctx = ctx; - desc->dir_cookie = &dir_ctx->dir_cookie; - desc->decode = NFS_PROTO(inode)->decode_dirent; - desc->plus = nfs_use_readdirplus(inode, ctx); + desc->folio_index_max = -1; + + spin_lock(&file->f_lock); + desc->dir_cookie = dir_ctx->dir_cookie; + desc->folio_index = dir_ctx->page_index; + desc->last_cookie = dir_ctx->last_cookie; + desc->attr_gencount = dir_ctx->attr_gencount; + desc->eof = dir_ctx->eof; + nfs_set_dtsize(desc, dir_ctx->dtsize); + memcpy(desc->verf, dir_ctx->verf, sizeof(desc->verf)); + cache_hits = atomic_xchg(&dir_ctx->cache_hits, 0); + cache_misses = atomic_xchg(&dir_ctx->cache_misses, 0); + force_clear = dir_ctx->force_clear; + spin_unlock(&file->f_lock); + + if (desc->eof) { + res = 0; + goto out_free; + } - if (ctx->pos == 0 || nfs_attribute_cache_expired(inode)) - res = nfs_revalidate_mapping(inode, file->f_mapping); - if (res < 0) - goto out; + desc->plus = nfs_use_readdirplus(inode, ctx, cache_hits, cache_misses); + force_clear = nfs_readdir_handle_cache_misses(inode, desc, cache_misses, + force_clear); + desc->clear_cache = force_clear; do { res = readdir_search_pagecache(desc); @@ -873,18 +1280,18 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx) if (res == -EBADCOOKIE) { res = 0; /* This means either end of directory */ - if (*desc->dir_cookie && desc->eof == 0) { + if (desc->dir_cookie && !desc->eof) { /* Or that the server has 'lost' a cookie */ res = uncached_readdir(desc); if (res == 0) continue; + if (res == -EBADCOOKIE || res == -ENOTSYNC) + res = 0; } break; } if (res == -ETOOSMALL && desc->plus) { - clear_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags); nfs_zap_caches(inode); - desc->page_index = 0; desc->plus = false; desc->eof = false; continue; @@ -892,43 +1299,68 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx) if (res < 0) break; - res = nfs_do_filldir(desc); - if (res < 0) - break; - } while (!desc->eof); + nfs_do_filldir(desc, nfsi->cookieverf); + nfs_readdir_folio_unlock_and_put_cached(desc); + if (desc->folio_index == desc->folio_index_max) + desc->clear_cache = force_clear; + } while (!desc->eob && !desc->eof); + + spin_lock(&file->f_lock); + dir_ctx->dir_cookie = desc->dir_cookie; + dir_ctx->last_cookie = desc->last_cookie; + dir_ctx->attr_gencount = desc->attr_gencount; + dir_ctx->page_index = desc->folio_index; + dir_ctx->force_clear = force_clear; + dir_ctx->eof = desc->eof; + dir_ctx->dtsize = desc->dtsize; + memcpy(dir_ctx->verf, desc->verf, sizeof(dir_ctx->verf)); + spin_unlock(&file->f_lock); +out_free: + kfree(desc); + out: - if (res > 0) - res = 0; dfprintk(FILE, "NFS: readdir(%pD2) returns %d\n", file, res); return res; } static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int whence) { - struct inode *inode = file_inode(filp); struct nfs_open_dir_context *dir_ctx = filp->private_data; dfprintk(FILE, "NFS: llseek dir(%pD2, %lld, %d)\n", filp, offset, whence); - inode_lock(inode); switch (whence) { - case 1: - offset += filp->f_pos; - case 0: - if (offset >= 0) - break; - default: - offset = -EINVAL; - goto out; + default: + return -EINVAL; + case SEEK_SET: + if (offset < 0) + return -EINVAL; + spin_lock(&filp->f_lock); + break; + case SEEK_CUR: + if (offset == 0) + return filp->f_pos; + spin_lock(&filp->f_lock); + offset += filp->f_pos; + if (offset < 0) { + spin_unlock(&filp->f_lock); + return -EINVAL; + } } if (offset != filp->f_pos) { filp->f_pos = offset; - dir_ctx->dir_cookie = 0; - dir_ctx->duped = 0; + dir_ctx->page_index = 0; + if (!nfs_readdir_use_cookie(filp)) { + dir_ctx->dir_cookie = 0; + dir_ctx->last_cookie = 0; + } else { + dir_ctx->dir_cookie = offset; + dir_ctx->last_cookie = offset; + } + dir_ctx->eof = false; } -out: - inode_unlock(inode); + spin_unlock(&filp->f_lock); return offset; } @@ -939,32 +1371,134 @@ out: static int nfs_fsync_dir(struct file *filp, loff_t start, loff_t end, int datasync) { - struct inode *inode = file_inode(filp); - dfprintk(FILE, "NFS: fsync dir(%pD2) datasync %d\n", filp, datasync); - inode_lock(inode); - nfs_inc_stats(inode, NFSIOS_VFSFSYNC); - inode_unlock(inode); + nfs_inc_stats(file_inode(filp), NFSIOS_VFSFSYNC); return 0; } /** * nfs_force_lookup_revalidate - Mark the directory as having changed - * @dir - pointer to directory inode + * @dir: pointer to directory inode * * This forces the revalidation code in nfs_lookup_revalidate() to do a * full lookup on all child dentries of 'dir' whenever a change occurs * on the server that might have invalidated our dcache. * + * Note that we reserve bit '0' as a tag to let us know when a dentry + * was revalidated while holding a delegation on its inode. + * * The caller should be holding dir->i_lock */ void nfs_force_lookup_revalidate(struct inode *dir) { - NFS_I(dir)->cache_change_attribute++; + NFS_I(dir)->cache_change_attribute += 2; } EXPORT_SYMBOL_GPL(nfs_force_lookup_revalidate); +/** + * nfs_verify_change_attribute - Detects NFS remote directory changes + * @dir: pointer to parent directory inode + * @verf: previously saved change attribute + * + * Return "false" if the verifiers doesn't match the change attribute. + * This would usually indicate that the directory contents have changed on + * the server, and that any dentries need revalidating. + */ +static bool nfs_verify_change_attribute(struct inode *dir, unsigned long verf) +{ + return (verf & ~1UL) == nfs_save_change_attribute(dir); +} + +static void nfs_set_verifier_delegated(unsigned long *verf) +{ + *verf |= 1UL; +} + +#if IS_ENABLED(CONFIG_NFS_V4) +static void nfs_unset_verifier_delegated(unsigned long *verf) +{ + *verf &= ~1UL; +} +#endif /* IS_ENABLED(CONFIG_NFS_V4) */ + +static bool nfs_test_verifier_delegated(unsigned long verf) +{ + return verf & 1; +} + +static bool nfs_verifier_is_delegated(struct dentry *dentry) +{ + return nfs_test_verifier_delegated(dentry->d_time); +} + +static void nfs_set_verifier_locked(struct dentry *dentry, unsigned long verf) +{ + struct inode *inode = d_inode(dentry); + struct inode *dir = d_inode_rcu(dentry->d_parent); + + if (!dir || !nfs_verify_change_attribute(dir, verf)) + return; + if (inode && NFS_PROTO(inode)->have_delegation(inode, FMODE_READ, 0)) + nfs_set_verifier_delegated(&verf); + dentry->d_time = verf; +} + +/** + * nfs_set_verifier - save a parent directory verifier in the dentry + * @dentry: pointer to dentry + * @verf: verifier to save + * + * Saves the parent directory verifier in @dentry. If the inode has + * a delegation, we also tag the dentry as having been revalidated + * while holding a delegation so that we know we don't have to + * look it up again after a directory change. + */ +void nfs_set_verifier(struct dentry *dentry, unsigned long verf) +{ + + spin_lock(&dentry->d_lock); + nfs_set_verifier_locked(dentry, verf); + spin_unlock(&dentry->d_lock); +} +EXPORT_SYMBOL_GPL(nfs_set_verifier); + +#if IS_ENABLED(CONFIG_NFS_V4) +/** + * nfs_clear_verifier_delegated - clear the dir verifier delegation tag + * @inode: pointer to inode + * + * Iterates through the dentries in the inode alias list and clears + * the tag used to indicate that the dentry has been revalidated + * while holding a delegation. + * This function is intended for use when the delegation is being + * returned or revoked. + */ +void nfs_clear_verifier_delegated(struct inode *inode) +{ + struct dentry *alias; + + if (!inode) + return; + spin_lock(&inode->i_lock); + hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) { + spin_lock(&alias->d_lock); + nfs_unset_verifier_delegated(&alias->d_time); + spin_unlock(&alias->d_lock); + } + spin_unlock(&inode->i_lock); +} +EXPORT_SYMBOL_GPL(nfs_clear_verifier_delegated); +#endif /* IS_ENABLED(CONFIG_NFS_V4) */ + +static int nfs_dentry_verify_change(struct inode *dir, struct dentry *dentry) +{ + if (nfs_server_capable(dir, NFS_CAP_CASE_INSENSITIVE) && + d_really_is_negative(dentry)) + return dentry->d_time == inode_peek_iversion_raw(dir); + return nfs_verify_change_attribute(dir, dentry->d_time); +} + /* * A check for whether or not the parent directory has changed. * In the case it has, we assume that the dentries are untrustworthy @@ -978,7 +1512,7 @@ static int nfs_check_verifier(struct inode *dir, struct dentry *dentry, return 1; if (NFS_SERVER(dir)->flags & NFS_MOUNT_LOOKUP_CACHE_NONE) return 0; - if (!nfs_verify_change_attribute(dir, dentry->d_time)) + if (!nfs_dentry_verify_change(dir, dentry)) return 0; /* Revalidate nfsi->cache_change_attribute before we declare a match */ if (nfs_mapping_need_revalidate_inode(dir)) { @@ -987,7 +1521,7 @@ static int nfs_check_verifier(struct inode *dir, struct dentry *dentry, if (__nfs_revalidate_inode(NFS_SERVER(dir), dir) < 0) return 0; } - if (!nfs_verify_change_attribute(dir, dentry->d_time)) + if (!nfs_dentry_verify_change(dir, dentry)) return 0; return 1; } @@ -1000,7 +1534,8 @@ static int nfs_is_exclusive_create(struct inode *dir, unsigned int flags) { if (NFS_PROTO(dir)->version == 2) return 0; - return flags & LOOKUP_EXCL; + return (flags & (LOOKUP_CREATE | LOOKUP_EXCL)) == + (LOOKUP_CREATE | LOOKUP_EXCL); } /* @@ -1019,15 +1554,32 @@ int nfs_lookup_verify_inode(struct inode *inode, unsigned int flags) if (IS_AUTOMOUNT(inode)) return 0; + + if (flags & LOOKUP_OPEN) { + switch (inode->i_mode & S_IFMT) { + case S_IFREG: + /* A NFSv4 OPEN will revalidate later */ + if (server->caps & NFS_CAP_ATOMIC_OPEN) + goto out; + fallthrough; + case S_IFDIR: + if (server->flags & NFS_MOUNT_NOCTO) + break; + /* NFS close-to-open cache consistency validation */ + goto out_force; + } + } + /* VFS wants an on-the-wire revalidation */ if (flags & LOOKUP_REVAL) goto out_force; - /* This is an open(2) */ - if ((flags & LOOKUP_OPEN) && !(server->flags & NFS_MOUNT_NOCTO) && - (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))) - goto out_force; out: - return (inode->i_nlink == 0) ? -ENOENT : 0; + if (inode->i_nlink > 0 || + (inode->i_nlink == 0 && + test_bit(NFS_INO_PRESERVE_UNLINKED, &NFS_I(inode)->flags))) + return 0; + else + return -ESTALE; out_force: if (flags & LOOKUP_RCU) return -ECHILD; @@ -1037,6 +1589,13 @@ out_force: goto out; } +static void nfs_mark_dir_for_revalidate(struct inode *inode) +{ + spin_lock(&inode->i_lock); + nfs_set_cache_invalid(inode, NFS_INO_INVALID_CHANGE); + spin_unlock(&inode->i_lock); +} + /* * We judge how long we want to trust negative * dentries by looking at the parent inode mtime. @@ -1046,19 +1605,124 @@ out_force: * * If LOOKUP_RCU prevents us from performing a full check, return 1 * suggesting a reval is needed. + * + * Note that when creating a new file, or looking up a rename target, + * then it shouldn't be necessary to revalidate a negative dentry. */ static inline int nfs_neg_need_reval(struct inode *dir, struct dentry *dentry, unsigned int flags) { - /* Don't revalidate a negative dentry if we're creating a new file */ - if (flags & LOOKUP_CREATE) + if (flags & (LOOKUP_CREATE | LOOKUP_RENAME_TARGET)) return 0; if (NFS_SERVER(dir)->flags & NFS_MOUNT_LOOKUP_CACHE_NONEG) return 1; + /* Case insensitive server? Revalidate negative dentries */ + if (nfs_server_capable(dir, NFS_CAP_CASE_INSENSITIVE)) + return 1; return !nfs_check_verifier(dir, dentry, flags & LOOKUP_RCU); } +static int +nfs_lookup_revalidate_done(struct inode *dir, struct dentry *dentry, + struct inode *inode, int error) +{ + switch (error) { + case 1: + break; + case -ETIMEDOUT: + if (inode && (IS_ROOT(dentry) || + NFS_SERVER(inode)->flags & NFS_MOUNT_SOFTREVAL)) + error = 1; + break; + case -ESTALE: + case -ENOENT: + error = 0; + fallthrough; + default: + /* + * We can't d_drop the root of a disconnected tree: + * its d_hash is on the s_anon list and d_drop() would hide + * it from shrink_dcache_for_unmount(), leading to busy + * inodes on unmount and further oopses. + */ + if (inode && IS_ROOT(dentry)) + error = 1; + break; + } + trace_nfs_lookup_revalidate_exit(dir, dentry, 0, error); + return error; +} + +static int +nfs_lookup_revalidate_negative(struct inode *dir, struct dentry *dentry, + unsigned int flags) +{ + int ret = 1; + if (nfs_neg_need_reval(dir, dentry, flags)) { + if (flags & LOOKUP_RCU) + return -ECHILD; + ret = 0; + } + return nfs_lookup_revalidate_done(dir, dentry, NULL, ret); +} + +static int +nfs_lookup_revalidate_delegated(struct inode *dir, struct dentry *dentry, + struct inode *inode) +{ + nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); + return nfs_lookup_revalidate_done(dir, dentry, inode, 1); +} + +static int nfs_lookup_revalidate_dentry(struct inode *dir, const struct qstr *name, + struct dentry *dentry, + struct inode *inode, unsigned int flags) +{ + struct nfs_fh *fhandle; + struct nfs_fattr *fattr; + unsigned long dir_verifier; + int ret; + + trace_nfs_lookup_revalidate_enter(dir, dentry, flags); + + ret = -ENOMEM; + fhandle = nfs_alloc_fhandle(); + fattr = nfs_alloc_fattr_with_label(NFS_SERVER(inode)); + if (fhandle == NULL || fattr == NULL) + goto out; + + dir_verifier = nfs_save_change_attribute(dir); + ret = NFS_PROTO(dir)->lookup(dir, dentry, name, fhandle, fattr); + if (ret < 0) + goto out; + + /* Request help from readdirplus */ + nfs_lookup_advise_force_readdirplus(dir, flags); + + ret = 0; + if (nfs_compare_fh(NFS_FH(inode), fhandle)) + goto out; + if (nfs_refresh_inode(inode, fattr) < 0) + goto out; + + nfs_setsecurity(inode, fattr); + nfs_set_verifier(dentry, dir_verifier); + + ret = 1; +out: + nfs_free_fattr(fattr); + nfs_free_fhandle(fhandle); + + /* + * If the lookup failed despite the dentry change attribute being + * a match, then we should revalidate the directory cache. + */ + if (!ret && nfs_dentry_verify_change(dir, dentry)) + nfs_mark_dir_for_revalidate(dir); + return nfs_lookup_revalidate_done(dir, dentry, inode, ret); +} + /* * This is called every time the dcache has a lookup hit, * and we should check whether we can really trust that @@ -1070,60 +1734,41 @@ int nfs_neg_need_reval(struct inode *dir, struct dentry *dentry, * If the parent directory is seen to have changed, we throw out the * cached dentry and do a new lookup. */ -static int nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags) +static int +nfs_do_lookup_revalidate(struct inode *dir, const struct qstr *name, + struct dentry *dentry, unsigned int flags) { - struct inode *dir; struct inode *inode; - struct dentry *parent; - struct nfs_fh *fhandle = NULL; - struct nfs_fattr *fattr = NULL; - struct nfs4_label *label = NULL; - int error; + int error = 0; - if (flags & LOOKUP_RCU) { - parent = ACCESS_ONCE(dentry->d_parent); - dir = d_inode_rcu(parent); - if (!dir) - return -ECHILD; - } else { - parent = dget_parent(dentry); - dir = d_inode(parent); - } nfs_inc_stats(dir, NFSIOS_DENTRYREVALIDATE); inode = d_inode(dentry); - if (!inode) { - if (nfs_neg_need_reval(dir, dentry, flags)) { - if (flags & LOOKUP_RCU) - return -ECHILD; - goto out_bad; - } - goto out_valid; - } + if (!inode) + return nfs_lookup_revalidate_negative(dir, dentry, flags); if (is_bad_inode(inode)) { - if (flags & LOOKUP_RCU) - return -ECHILD; dfprintk(LOOKUPCACHE, "%s: %pd2 has dud inode\n", __func__, dentry); goto out_bad; } - if (NFS_PROTO(dir)->have_delegation(inode, FMODE_READ)) - goto out_set_verifier; + if ((flags & LOOKUP_RENAME_TARGET) && d_count(dentry) < 2 && + nfs_server_capable(dir, NFS_CAP_CASE_INSENSITIVE)) + goto out_bad; + + if (nfs_verifier_is_delegated(dentry)) + return nfs_lookup_revalidate_delegated(dir, dentry, inode); /* Force a full look up iff the parent directory has changed */ - if (!nfs_is_exclusive_create(dir, flags) && + if (!(flags & (LOOKUP_EXCL | LOOKUP_REVAL)) && nfs_check_verifier(dir, dentry, flags & LOOKUP_RCU)) { error = nfs_lookup_verify_inode(inode, flags); if (error) { - if (flags & LOOKUP_RCU) - return -ECHILD; if (error == -ESTALE) - goto out_zap_parent; - goto out_error; + nfs_mark_dir_for_revalidate(dir); + goto out_bad; } - nfs_advise_use_readdirplus(dir); goto out_valid; } @@ -1133,81 +1778,57 @@ static int nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags) if (NFS_STALE(inode)) goto out_bad; - error = -ENOMEM; - fhandle = nfs_alloc_fhandle(); - fattr = nfs_alloc_fattr(); - if (fhandle == NULL || fattr == NULL) - goto out_error; + return nfs_lookup_revalidate_dentry(dir, name, dentry, inode, flags); +out_valid: + return nfs_lookup_revalidate_done(dir, dentry, inode, 1); +out_bad: + if (flags & LOOKUP_RCU) + return -ECHILD; + return nfs_lookup_revalidate_done(dir, dentry, inode, error); +} - label = nfs4_label_alloc(NFS_SERVER(inode), GFP_NOWAIT); - if (IS_ERR(label)) - goto out_error; +static int +__nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags) +{ + if (flags & LOOKUP_RCU) { + if (dentry->d_fsdata == NFS_FSDATA_BLOCKED) + return -ECHILD; + } else { + /* Wait for unlink to complete - see unblock_revalidate() */ + wait_var_event(&dentry->d_fsdata, + smp_load_acquire(&dentry->d_fsdata) + != NFS_FSDATA_BLOCKED); + } + return 0; +} - trace_nfs_lookup_revalidate_enter(dir, dentry, flags); - error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr, label); - trace_nfs_lookup_revalidate_exit(dir, dentry, flags, error); - if (error == -ESTALE || error == -ENOENT) - goto out_bad; - if (error) - goto out_error; - if (nfs_compare_fh(NFS_FH(inode), fhandle)) - goto out_bad; - if ((error = nfs_refresh_inode(inode, fattr)) != 0) - goto out_bad; +static int nfs_lookup_revalidate(struct inode *dir, const struct qstr *name, + struct dentry *dentry, unsigned int flags) +{ + if (__nfs_lookup_revalidate(dentry, flags)) + return -ECHILD; + return nfs_do_lookup_revalidate(dir, name, dentry, flags); +} - nfs_setsecurity(inode, fattr, label); +static void block_revalidate(struct dentry *dentry) +{ + /* old devname - just in case */ + kfree(dentry->d_fsdata); - nfs_free_fattr(fattr); - nfs_free_fhandle(fhandle); - nfs4_label_free(label); + /* Any new reference that could lead to an open + * will take ->d_lock in lookup_open() -> d_lookup(). + * Holding this lock ensures we cannot race with + * __nfs_lookup_revalidate() and removes and need + * for further barriers. + */ + lockdep_assert_held(&dentry->d_lock); - /* set a readdirplus hint that we had a cache miss */ - nfs_force_use_readdirplus(dir); + dentry->d_fsdata = NFS_FSDATA_BLOCKED; +} -out_set_verifier: - nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); - out_valid: - if (flags & LOOKUP_RCU) { - if (parent != ACCESS_ONCE(dentry->d_parent)) - return -ECHILD; - } else - dput(parent); - dfprintk(LOOKUPCACHE, "NFS: %s(%pd2) is valid\n", - __func__, dentry); - return 1; -out_zap_parent: - nfs_zap_caches(dir); - out_bad: - WARN_ON(flags & LOOKUP_RCU); - nfs_free_fattr(fattr); - nfs_free_fhandle(fhandle); - nfs4_label_free(label); - nfs_mark_for_revalidate(dir); - if (inode && S_ISDIR(inode->i_mode)) { - /* Purge readdir caches. */ - nfs_zap_caches(inode); - /* - * We can't d_drop the root of a disconnected tree: - * its d_hash is on the s_anon list and d_drop() would hide - * it from shrink_dcache_for_unmount(), leading to busy - * inodes on unmount and further oopses. - */ - if (IS_ROOT(dentry)) - goto out_valid; - } - dput(parent); - dfprintk(LOOKUPCACHE, "NFS: %s(%pd2) is invalid\n", - __func__, dentry); - return 0; -out_error: - WARN_ON(flags & LOOKUP_RCU); - nfs_free_fattr(fattr); - nfs_free_fhandle(fhandle); - nfs4_label_free(label); - dput(parent); - dfprintk(LOOKUPCACHE, "NFS: %s(%pd2) lookup returned error %d\n", - __func__, dentry, error); - return error; +static void unblock_revalidate(struct dentry *dentry) +{ + store_release_wake_up(&dentry->d_fsdata, NULL); } /* @@ -1241,8 +1862,7 @@ static int nfs_weak_revalidate(struct dentry *dentry, unsigned int flags) return 0; } - if (nfs_mapping_need_revalidate_inode(inode)) - error = __nfs_revalidate_inode(NFS_SERVER(inode), inode); + error = nfs_lookup_verify_inode(inode, flags); dfprintk(LOOKUPCACHE, "NFS: %s: inode %lu is %s\n", __func__, inode->i_ino, error ? "invalid" : "valid"); return !error; @@ -1264,7 +1884,7 @@ static int nfs_dentry_delete(const struct dentry *dentry) /* Unhash it, so that ->d_iput() would be called */ return 1; } - if (!(dentry->d_sb->s_flags & MS_ACTIVE)) { + if (!(dentry->d_sb->s_flags & SB_ACTIVE)) { /* Unhash it, so that ancestors of killed async unlink * files will be cleaned up during umount */ return 1; @@ -1278,9 +1898,12 @@ static void nfs_drop_nlink(struct inode *inode) { spin_lock(&inode->i_lock); /* drop the inode if we're reasonably sure this is the last link */ - if (inode->i_nlink == 1) - clear_nlink(inode); - NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ATTR; + if (inode->i_nlink > 0) + drop_nlink(inode); + NFS_I(inode)->attr_gencount = nfs_inc_attr_generation_counter(); + nfs_set_cache_invalid( + inode, NFS_INO_INVALID_CHANGE | NFS_INO_INVALID_CTIME | + NFS_INO_INVALID_NLINK); spin_unlock(&inode->i_lock); } @@ -1290,10 +1913,6 @@ static void nfs_drop_nlink(struct inode *inode) */ static void nfs_dentry_iput(struct dentry *dentry, struct inode *inode) { - if (S_ISDIR(inode->i_mode)) - /* drop any readdir cache as it could easily be old */ - NFS_I(inode)->cache_validity |= NFS_INO_INVALID_DATA; - if (dentry->d_flags & DCACHE_NFSFS_RENAMED) { nfs_complete_unlink(dentry, inode); nfs_drop_nlink(inode); @@ -1328,7 +1947,7 @@ struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, unsigned in struct inode *inode = NULL; struct nfs_fh *fhandle = NULL; struct nfs_fattr *fattr = NULL; - struct nfs4_label *label = NULL; + unsigned long dir_verifier; int error; dfprintk(VFS, "NFS: lookup(%pd2)\n", dentry); @@ -1341,58 +1960,67 @@ struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, unsigned in * If we're doing an exclusive create, optimize away the lookup * but don't hash the dentry. */ - if (nfs_is_exclusive_create(dir, flags)) + if (nfs_is_exclusive_create(dir, flags) || flags & LOOKUP_RENAME_TARGET) return NULL; res = ERR_PTR(-ENOMEM); fhandle = nfs_alloc_fhandle(); - fattr = nfs_alloc_fattr(); + fattr = nfs_alloc_fattr_with_label(NFS_SERVER(dir)); if (fhandle == NULL || fattr == NULL) goto out; - label = nfs4_label_alloc(NFS_SERVER(dir), GFP_NOWAIT); - if (IS_ERR(label)) - goto out; - + dir_verifier = nfs_save_change_attribute(dir); trace_nfs_lookup_enter(dir, dentry, flags); - error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr, label); - if (error == -ENOENT) + error = NFS_PROTO(dir)->lookup(dir, dentry, &dentry->d_name, + fhandle, fattr); + if (error == -ENOENT) { + if (nfs_server_capable(dir, NFS_CAP_CASE_INSENSITIVE)) + dir_verifier = inode_peek_iversion_raw(dir); goto no_entry; + } if (error < 0) { res = ERR_PTR(error); - goto out_label; + goto out; } - inode = nfs_fhget(dentry->d_sb, fhandle, fattr, label); + inode = nfs_fhget(dentry->d_sb, fhandle, fattr); res = ERR_CAST(inode); if (IS_ERR(res)) - goto out_label; + goto out; /* Notify readdir to use READDIRPLUS */ - nfs_force_use_readdirplus(dir); + nfs_lookup_advise_force_readdirplus(dir, flags); no_entry: res = d_splice_alias(inode, dentry); if (res != NULL) { if (IS_ERR(res)) - goto out_label; + goto out; dentry = res; } - nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); -out_label: - trace_nfs_lookup_exit(dir, dentry, flags, error); - nfs4_label_free(label); + nfs_set_verifier(dentry, dir_verifier); out: + trace_nfs_lookup_exit(dir, dentry, flags, PTR_ERR_OR_ZERO(res)); nfs_free_fattr(fattr); nfs_free_fhandle(fhandle); return res; } EXPORT_SYMBOL_GPL(nfs_lookup); +void nfs_d_prune_case_insensitive_aliases(struct inode *inode) +{ + /* Case insensitive server? Revalidate dentries */ + if (inode && nfs_server_capable(inode, NFS_CAP_CASE_INSENSITIVE)) + d_prune_aliases(inode); +} +EXPORT_SYMBOL_GPL(nfs_d_prune_case_insensitive_aliases); + #if IS_ENABLED(CONFIG_NFS_V4) -static int nfs4_lookup_revalidate(struct dentry *, unsigned int); +static int nfs4_lookup_revalidate(struct inode *, const struct qstr *, + struct dentry *, unsigned int); const struct dentry_operations nfs4_dentry_operations = { .d_revalidate = nfs4_lookup_revalidate, + .d_weak_revalidate = nfs_weak_revalidate, .d_delete = nfs_dentry_delete, .d_iput = nfs_dentry_iput, .d_automount = nfs_d_automount, @@ -1400,16 +2028,6 @@ const struct dentry_operations nfs4_dentry_operations = { }; EXPORT_SYMBOL_GPL(nfs4_dentry_operations); -static fmode_t flags_to_mode(int flags) -{ - fmode_t res = (__force fmode_t)flags & FMODE_EXEC; - if ((flags & O_ACCMODE) != O_WRONLY) - res |= FMODE_READ; - if ((flags & O_ACCMODE) != O_RDONLY) - res |= FMODE_WRITE; - return res; -} - static struct nfs_open_context *create_nfs_open_context(struct dentry *dentry, int open_flags, struct file *filp) { return alloc_nfs_open_context(dentry, flags_to_mode(open_flags), filp); @@ -1423,25 +2041,24 @@ static int do_open(struct inode *inode, struct file *filp) static int nfs_finish_open(struct nfs_open_context *ctx, struct dentry *dentry, - struct file *file, unsigned open_flags, - int *opened) + struct file *file, unsigned open_flags) { int err; - err = finish_open(file, dentry, do_open, opened); + err = finish_open(file, dentry, do_open); if (err) goto out; - if (S_ISREG(file->f_path.dentry->d_inode->i_mode)) + if (S_ISREG(file_inode(file)->i_mode)) nfs_file_set_open_context(file, ctx); else - err = -ESTALE; + err = -EOPENSTALE; out: return err; } int nfs_atomic_open(struct inode *dir, struct dentry *dentry, struct file *file, unsigned open_flags, - umode_t mode, int *opened) + umode_t mode) { DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); struct nfs_open_context *ctx; @@ -1449,7 +2066,9 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry, struct iattr attr = { .ia_valid = ATTR_OPEN }; struct inode *inode; unsigned int lookup_flags = 0; + unsigned long dir_verifier; bool switched = false; + int created = 0; int err; /* Expect a negative dentry */ @@ -1510,7 +2129,9 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry, goto out; trace_nfs_atomic_open_enter(dir, ctx, open_flags); - inode = NFS_PROTO(dir)->open_context(dir, ctx, open_flags, &attr, opened); + inode = NFS_PROTO(dir)->open_context(dir, ctx, open_flags, &attr, &created); + if (created) + file->f_mode |= FMODE_CREATED; if (IS_ERR(inode)) { err = PTR_ERR(inode); trace_nfs_atomic_open_exit(dir, ctx, open_flags, err); @@ -1519,7 +2140,11 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry, switch (err) { case -ENOENT: d_splice_alias(NULL, dentry); - nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); + if (nfs_server_capable(dir, NFS_CAP_CASE_INSENSITIVE)) + dir_verifier = inode_peek_iversion_raw(dir); + else + dir_verifier = nfs_save_change_attribute(dir); + nfs_set_verifier(dentry, dir_verifier); break; case -EISDIR: case -ENOTDIR: @@ -1534,8 +2159,9 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry, } goto out; } + file->f_mode |= FMODE_CAN_ODIRECT; - err = nfs_finish_open(ctx, ctx->dentry, file, open_flags, opened); + err = nfs_finish_open(ctx, ctx->dentry, file, open_flags); trace_nfs_atomic_open_exit(dir, ctx, open_flags, err); put_nfs_open_context(ctx); out: @@ -1547,6 +2173,24 @@ out: no_open: res = nfs_lookup(dir, dentry, lookup_flags); + if (!res) { + inode = d_inode(dentry); + if ((lookup_flags & LOOKUP_DIRECTORY) && inode && + !(S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))) + res = ERR_PTR(-ENOTDIR); + else if (inode && S_ISREG(inode->i_mode)) + res = ERR_PTR(-EOPENSTALE); + } else if (!IS_ERR(res)) { + inode = d_inode(res); + if ((lookup_flags & LOOKUP_DIRECTORY) && inode && + !(S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))) { + dput(res); + res = ERR_PTR(-ENOTDIR); + } else if (inode && S_ISREG(inode->i_mode)) { + dput(res); + res = ERR_PTR(-EOPENSTALE); + } + } if (switched) { d_lookup_done(dentry); if (!res) @@ -1554,113 +2198,147 @@ no_open: else dput(dentry); } - if (IS_ERR(res)) - return PTR_ERR(res); return finish_no_open(file, res); } EXPORT_SYMBOL_GPL(nfs_atomic_open); -static int nfs4_lookup_revalidate(struct dentry *dentry, unsigned int flags) +static int +nfs4_lookup_revalidate(struct inode *dir, const struct qstr *name, + struct dentry *dentry, unsigned int flags) { struct inode *inode; - int ret = 0; + + if (__nfs_lookup_revalidate(dentry, flags)) + return -ECHILD; + + trace_nfs_lookup_revalidate_enter(dir, dentry, flags); if (!(flags & LOOKUP_OPEN) || (flags & LOOKUP_DIRECTORY)) - goto no_open; + goto full_reval; if (d_mountpoint(dentry)) - goto no_open; - if (NFS_SB(dentry->d_sb)->caps & NFS_CAP_ATOMIC_OPEN_V1) - goto no_open; + goto full_reval; inode = d_inode(dentry); /* We can't create new files in nfs_open_revalidate(), so we * optimize away revalidation of negative dentries. */ - if (inode == NULL) { - struct dentry *parent; - struct inode *dir; - - if (flags & LOOKUP_RCU) { - parent = ACCESS_ONCE(dentry->d_parent); - dir = d_inode_rcu(parent); - if (!dir) - return -ECHILD; - } else { - parent = dget_parent(dentry); - dir = d_inode(parent); - } - if (!nfs_neg_need_reval(dir, dentry, flags)) - ret = 1; - else if (flags & LOOKUP_RCU) - ret = -ECHILD; - if (!(flags & LOOKUP_RCU)) - dput(parent); - else if (parent != ACCESS_ONCE(dentry->d_parent)) - return -ECHILD; - goto out; - } + if (inode == NULL) + goto full_reval; + + if (nfs_verifier_is_delegated(dentry)) + return nfs_lookup_revalidate_delegated(dir, dentry, inode); /* NFS only supports OPEN on regular files */ if (!S_ISREG(inode->i_mode)) - goto no_open; + goto full_reval; + /* We cannot do exclusive creation on a positive dentry */ - if (flags & LOOKUP_EXCL) - goto no_open; + if (flags & (LOOKUP_EXCL | LOOKUP_REVAL)) + goto reval_dentry; - /* Let f_op->open() actually open (and revalidate) the file */ - ret = 1; + /* Check if the directory changed */ + if (!nfs_check_verifier(dir, dentry, flags & LOOKUP_RCU)) + goto reval_dentry; -out: - return ret; + /* Let f_op->open() actually open (and revalidate) the file */ + return 1; +reval_dentry: + if (flags & LOOKUP_RCU) + return -ECHILD; + return nfs_lookup_revalidate_dentry(dir, name, dentry, inode, flags); -no_open: - return nfs_lookup_revalidate(dentry, flags); +full_reval: + return nfs_do_lookup_revalidate(dir, name, dentry, flags); } #endif /* CONFIG_NFSV4 */ -/* - * Code common to create, mkdir, and mknod. - */ -int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fhandle, - struct nfs_fattr *fattr, - struct nfs4_label *label) +int nfs_atomic_open_v23(struct inode *dir, struct dentry *dentry, + struct file *file, unsigned int open_flags, + umode_t mode) +{ + struct dentry *res = NULL; + /* Same as look+open from lookup_open(), but with different O_TRUNC + * handling. + */ + int error = 0; + + if (dentry->d_name.len > NFS_SERVER(dir)->namelen) + return -ENAMETOOLONG; + + if (open_flags & O_CREAT) { + error = nfs_do_create(dir, dentry, mode, open_flags); + if (!error) { + file->f_mode |= FMODE_CREATED; + return finish_open(file, dentry, NULL); + } else if (error != -EEXIST || open_flags & O_EXCL) + return error; + } + if (d_in_lookup(dentry)) { + /* The only flags nfs_lookup considers are + * LOOKUP_EXCL and LOOKUP_RENAME_TARGET, and + * we want those to be zero so the lookup isn't skipped. + */ + res = nfs_lookup(dir, dentry, 0); + } + return finish_no_open(file, res); + +} +EXPORT_SYMBOL_GPL(nfs_atomic_open_v23); + +struct dentry * +nfs_add_or_obtain(struct dentry *dentry, struct nfs_fh *fhandle, + struct nfs_fattr *fattr) { struct dentry *parent = dget_parent(dentry); struct inode *dir = d_inode(parent); struct inode *inode; - int error = -EACCES; + struct dentry *d; + int error; d_drop(dentry); - /* We may have been initialized further down */ - if (d_really_is_positive(dentry)) - goto out; if (fhandle->size == 0) { - error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr, NULL); + error = NFS_PROTO(dir)->lookup(dir, dentry, &dentry->d_name, + fhandle, fattr); if (error) goto out_error; } nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); if (!(fattr->valid & NFS_ATTR_FATTR)) { struct nfs_server *server = NFS_SB(dentry->d_sb); - error = server->nfs_client->rpc_ops->getattr(server, fhandle, fattr, NULL); + error = server->nfs_client->rpc_ops->getattr(server, fhandle, + fattr, NULL); if (error < 0) goto out_error; } - inode = nfs_fhget(dentry->d_sb, fhandle, fattr, label); - error = PTR_ERR(inode); - if (IS_ERR(inode)) - goto out_error; - d_add(dentry, inode); + inode = nfs_fhget(dentry->d_sb, fhandle, fattr); + d = d_splice_alias(inode, dentry); out: dput(parent); - return 0; + return d; out_error: - nfs_mark_for_revalidate(dir); - dput(parent); - return error; + d = ERR_PTR(error); + goto out; +} +EXPORT_SYMBOL_GPL(nfs_add_or_obtain); + +/* + * Code common to create, mkdir, and mknod. + */ +int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fhandle, + struct nfs_fattr *fattr) +{ + struct dentry *d; + + d = nfs_add_or_obtain(dentry, fhandle, fattr); + if (IS_ERR(d)) + return PTR_ERR(d); + + /* Callers don't care */ + dput(d); + return 0; } EXPORT_SYMBOL_GPL(nfs_instantiate); @@ -1670,18 +2348,23 @@ EXPORT_SYMBOL_GPL(nfs_instantiate); * that the operation succeeded on the server, but an error in the * reply path made it appear to have failed. */ -int nfs_create(struct inode *dir, struct dentry *dentry, - umode_t mode, bool excl) +static int nfs_do_create(struct inode *dir, struct dentry *dentry, + umode_t mode, int open_flags) { struct iattr attr; - int open_flags = excl ? O_CREAT | O_EXCL : O_CREAT; int error; + open_flags |= O_CREAT; + dfprintk(VFS, "NFS: create(%s/%lu), %pd\n", dir->i_sb->s_id, dir->i_ino, dentry); attr.ia_mode = mode; attr.ia_valid = ATTR_MODE; + if (open_flags & O_TRUNC) { + attr.ia_size = 0; + attr.ia_valid |= ATTR_SIZE; + } trace_nfs_create_enter(dir, dentry, open_flags); error = NFS_PROTO(dir)->create(dir, dentry, &attr, open_flags); @@ -1693,13 +2376,20 @@ out_err: d_drop(dentry); return error; } + +int nfs_create(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode, bool excl) +{ + return nfs_do_create(dir, dentry, mode, excl ? O_EXCL : 0); +} EXPORT_SYMBOL_GPL(nfs_create); /* * See comments for nfs_proc_create regarding failed operations. */ int -nfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) +nfs_mknod(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode, dev_t rdev) { struct iattr attr; int status; @@ -1725,10 +2415,11 @@ EXPORT_SYMBOL_GPL(nfs_mknod); /* * See comments for nfs_proc_create regarding failed operations. */ -int nfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) +struct dentry *nfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode) { struct iattr attr; - int error; + struct dentry *ret; dfprintk(VFS, "NFS: mkdir(%s/%lu), %pd\n", dir->i_sb->s_id, dir->i_ino, dentry); @@ -1737,14 +2428,9 @@ int nfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) attr.ia_mode = mode | S_IFDIR; trace_nfs_mkdir_enter(dir, dentry); - error = NFS_PROTO(dir)->mkdir(dir, dentry, &attr); - trace_nfs_mkdir_exit(dir, dentry, error); - if (error != 0) - goto out_err; - return 0; -out_err: - d_drop(dentry); - return error; + ret = NFS_PROTO(dir)->mkdir(dir, dentry, &attr); + trace_nfs_mkdir_exit(dir, dentry, PTR_ERR_OR_ZERO(ret)); + return ret; } EXPORT_SYMBOL_GPL(nfs_mkdir); @@ -1754,6 +2440,21 @@ static void nfs_dentry_handle_enoent(struct dentry *dentry) d_delete(dentry); } +static void nfs_dentry_remove_handle_error(struct inode *dir, + struct dentry *dentry, int error) +{ + switch (error) { + case -ENOENT: + if (d_really_is_positive(dentry)) + d_delete(dentry); + nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); + break; + case 0: + nfs_d_prune_case_insensitive_aliases(d_inode(dentry)); + nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); + } +} + int nfs_rmdir(struct inode *dir, struct dentry *dentry) { int error; @@ -1776,6 +2477,7 @@ int nfs_rmdir(struct inode *dir, struct dentry *dentry) up_write(&NFS_I(d_inode(dentry))->rmdir_sem); } else error = NFS_PROTO(dir)->rmdir(dir, &dentry->d_name); + nfs_dentry_remove_handle_error(dir, dentry, error); trace_nfs_rmdir_exit(dir, dentry, error); return error; @@ -1805,12 +2507,11 @@ static int nfs_safe_remove(struct dentry *dentry) trace_nfs_remove_enter(dir, dentry); if (inode != NULL) { - NFS_PROTO(inode)->return_delegation(inode); - error = NFS_PROTO(dir)->remove(dir, &dentry->d_name); + error = NFS_PROTO(dir)->remove(dir, dentry); if (error == 0) nfs_drop_nlink(inode); } else - error = NFS_PROTO(dir)->remove(dir, &dentry->d_name); + error = NFS_PROTO(dir)->remove(dir, dentry); if (error == -ENOENT) nfs_dentry_handle_enoent(dentry); trace_nfs_remove_exit(dir, dentry, error); @@ -1826,30 +2527,37 @@ out: int nfs_unlink(struct inode *dir, struct dentry *dentry) { int error; - int need_rehash = 0; dfprintk(VFS, "NFS: unlink(%s/%lu, %pd)\n", dir->i_sb->s_id, dir->i_ino, dentry); trace_nfs_unlink_enter(dir, dentry); spin_lock(&dentry->d_lock); - if (d_count(dentry) > 1) { + if (d_count(dentry) > 1 && !test_bit(NFS_INO_PRESERVE_UNLINKED, + &NFS_I(d_inode(dentry))->flags)) { spin_unlock(&dentry->d_lock); /* Start asynchronous writeout of the inode */ write_inode_now(d_inode(dentry), 0); error = nfs_sillyrename(dir, dentry); goto out; } - if (!d_unhashed(dentry)) { - __d_drop(dentry); - need_rehash = 1; + /* We must prevent any concurrent open until the unlink + * completes. ->d_revalidate will wait for ->d_fsdata + * to clear. We set it here to ensure no lookup succeeds until + * the unlink is complete on the server. + */ + error = -ETXTBSY; + if (WARN_ON(dentry->d_flags & DCACHE_NFSFS_RENAMED) || + WARN_ON(dentry->d_fsdata == NFS_FSDATA_BLOCKED)) { + spin_unlock(&dentry->d_lock); + goto out; } + block_revalidate(dentry); + spin_unlock(&dentry->d_lock); error = nfs_safe_remove(dentry); - if (!error || error == -ENOENT) { - nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); - } else if (need_rehash) - d_rehash(dentry); + nfs_dentry_remove_handle_error(dir, dentry, error); + unblock_revalidate(dentry); out: trace_nfs_unlink_exit(dir, dentry, error); return error; @@ -1871,9 +2579,10 @@ EXPORT_SYMBOL_GPL(nfs_unlink); * now have a new file handle and can instantiate an in-core NFS inode * and move the raw page into its mapping. */ -int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname) +int nfs_symlink(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, const char *symname) { - struct page *page; + struct folio *folio; char *kaddr; struct iattr attr; unsigned int pathlen = strlen(symname); @@ -1888,43 +2597,40 @@ int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname) attr.ia_mode = S_IFLNK | S_IRWXUGO; attr.ia_valid = ATTR_MODE; - page = alloc_page(GFP_USER); - if (!page) + folio = folio_alloc(GFP_USER, 0); + if (!folio) return -ENOMEM; - kaddr = page_address(page); + kaddr = folio_address(folio); memcpy(kaddr, symname, pathlen); if (pathlen < PAGE_SIZE) memset(kaddr + pathlen, 0, PAGE_SIZE - pathlen); trace_nfs_symlink_enter(dir, dentry); - error = NFS_PROTO(dir)->symlink(dir, dentry, page, pathlen, &attr); + error = NFS_PROTO(dir)->symlink(dir, dentry, folio, pathlen, &attr); trace_nfs_symlink_exit(dir, dentry, error); if (error != 0) { dfprintk(VFS, "NFS: symlink(%s/%lu, %pd, %s) error %d\n", dir->i_sb->s_id, dir->i_ino, dentry, symname, error); d_drop(dentry); - __free_page(page); + folio_put(folio); return error; } + nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); + /* * No big deal if we can't add this page to the page cache here. * READLINK will get the missing page from the server if needed. */ - if (!add_to_page_cache_lru(page, d_inode(dentry)->i_mapping, 0, - GFP_KERNEL)) { - SetPageUptodate(page); - unlock_page(page); - /* - * add_to_page_cache_lru() grabs an extra page refcount. - * Drop it here to avoid leaking this page later. - */ - put_page(page); - } else - __free_page(page); + if (filemap_add_folio(d_inode(dentry)->i_mapping, folio, 0, + GFP_KERNEL) == 0) { + folio_mark_uptodate(folio); + folio_unlock(folio); + } + folio_put(folio); return 0; } EXPORT_SYMBOL_GPL(nfs_symlink); @@ -1939,11 +2645,12 @@ nfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) old_dentry, dentry); trace_nfs_link_enter(inode, dir, dentry); - NFS_PROTO(inode)->return_delegation(inode); - d_drop(dentry); + if (S_ISREG(inode->i_mode)) + nfs_sync_inode(inode); error = NFS_PROTO(dir)->link(inode, dir, &dentry->d_name); if (error == 0) { + nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); ihold(inode); d_add(dentry, inode); } @@ -1952,6 +2659,26 @@ nfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) } EXPORT_SYMBOL_GPL(nfs_link); +static void +nfs_unblock_rename(struct rpc_task *task, struct nfs_renamedata *data) +{ + struct dentry *new_dentry = data->new_dentry; + + unblock_revalidate(new_dentry); +} + +static bool nfs_rename_is_unsafe_cross_dir(struct dentry *old_dentry, + struct dentry *new_dentry) +{ + struct nfs_server *server = NFS_SB(old_dentry->d_sb); + + if (old_dentry->d_parent != new_dentry->d_parent) + return false; + if (server->fh_expire_type & NFS_FH_RENAME_UNSAFE) + return !(server->fh_expire_type & NFS_FH_NOEXPIRE_WITH_OPEN); + return true; +} + /* * RENAME * FIXME: Some nfsds, like the Linux user space nfsd, may generate a @@ -1976,14 +2703,15 @@ EXPORT_SYMBOL_GPL(nfs_link); * If these conditions are met, we can drop the dentries before doing * the rename. */ -int nfs_rename(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry, - unsigned int flags) +int nfs_rename(struct mnt_idmap *idmap, struct inode *old_dir, + struct dentry *old_dentry, struct inode *new_dir, + struct dentry *new_dentry, unsigned int flags) { struct inode *old_inode = d_inode(old_dentry); struct inode *new_inode = d_inode(new_dentry); - struct dentry *dentry = NULL, *rehash = NULL; + struct dentry *dentry = NULL; struct rpc_task *task; + bool must_unblock = false; int error = -EBUSY; if (flags) @@ -2001,18 +2729,22 @@ int nfs_rename(struct inode *old_dir, struct dentry *old_dentry, * the new target. */ if (new_inode && !S_ISDIR(new_inode->i_mode)) { - /* - * To prevent any new references to the target during the - * rename, we unhash the dentry in advance. + /* We must prevent any concurrent open until the unlink + * completes. ->d_revalidate will wait for ->d_fsdata + * to clear. We set it here to ensure no lookup succeeds until + * the unlink is complete on the server. */ - if (!d_unhashed(new_dentry)) { - d_drop(new_dentry); - rehash = new_dentry; - } + error = -ETXTBSY; + if (WARN_ON(new_dentry->d_flags & DCACHE_NFSFS_RENAMED) || + WARN_ON(new_dentry->d_fsdata == NFS_FSDATA_BLOCKED)) + goto out; + spin_lock(&new_dentry->d_lock); if (d_count(new_dentry) > 2) { int err; + spin_unlock(&new_dentry->d_lock); + /* copy the target dentry's name */ dentry = d_alloc(new_dentry->d_parent, &new_dentry->d_name); @@ -2025,17 +2757,23 @@ int nfs_rename(struct inode *old_dir, struct dentry *old_dentry, goto out; new_dentry = dentry; - rehash = NULL; new_inode = NULL; + } else { + block_revalidate(new_dentry); + must_unblock = true; + spin_unlock(&new_dentry->d_lock); } - } - NFS_PROTO(old_inode)->return_delegation(old_inode); - if (new_inode != NULL) - NFS_PROTO(new_inode)->return_delegation(new_inode); + } - task = nfs_async_rename(old_dir, new_dir, old_dentry, new_dentry, NULL); + if (S_ISREG(old_inode->i_mode) && + nfs_rename_is_unsafe_cross_dir(old_dentry, new_dentry)) + nfs_sync_inode(old_inode); + task = nfs_async_rename(old_dir, new_dir, old_dentry, new_dentry, + must_unblock ? nfs_unblock_rename : NULL); if (IS_ERR(task)) { + if (must_unblock) + unblock_revalidate(new_dentry); error = PTR_ERR(task); goto out; } @@ -2048,10 +2786,16 @@ int nfs_rename(struct inode *old_dir, struct dentry *old_dentry, } else error = task->tk_status; rpc_put_task(task); - nfs_mark_for_revalidate(old_inode); + /* Ensure the inode attributes are revalidated */ + if (error == 0) { + spin_lock(&old_inode->i_lock); + NFS_I(old_inode)->attr_gencount = nfs_inc_attr_generation_counter(); + nfs_set_cache_invalid(old_inode, NFS_INO_INVALID_CHANGE | + NFS_INO_INVALID_CTIME | + NFS_INO_REVAL_FORCED); + spin_unlock(&old_inode->i_lock); + } out: - if (rehash) - d_rehash(rehash); trace_nfs_rename_exit(old_dir, old_dentry, new_dir, new_dentry, error); if (!error) { @@ -2064,7 +2808,7 @@ out: * should mark the directories for revalidation. */ d_move(old_dentry, new_dentry); - nfs_set_verifier(new_dentry, + nfs_set_verifier(old_dentry, nfs_save_change_attribute(new_dir)); } else if (error == -ENOENT) nfs_dentry_handle_enoent(old_dentry); @@ -2080,13 +2824,13 @@ static DEFINE_SPINLOCK(nfs_access_lru_lock); static LIST_HEAD(nfs_access_lru_list); static atomic_long_t nfs_access_nr_entries; -static unsigned long nfs_access_max_cachesize = ULONG_MAX; +static unsigned long nfs_access_max_cachesize = 4*1024*1024; module_param(nfs_access_max_cachesize, ulong, 0644); MODULE_PARM_DESC(nfs_access_max_cachesize, "NFS access maximum total cache length"); static void nfs_access_free_entry(struct nfs_access_entry *entry) { - put_rpccred(entry->cred); + put_group_info(entry->group_info); kfree_rcu(entry, rcu_head); smp_mb__before_atomic(); atomic_long_dec(&nfs_access_nr_entries); @@ -2212,17 +2956,55 @@ void nfs_access_zap_cache(struct inode *inode) } EXPORT_SYMBOL_GPL(nfs_access_zap_cache); -static struct nfs_access_entry *nfs_access_search_rbtree(struct inode *inode, struct rpc_cred *cred) +static int access_cmp(const struct cred *a, const struct nfs_access_entry *b) +{ + struct group_info *ga, *gb; + int g; + + if (uid_lt(a->fsuid, b->fsuid)) + return -1; + if (uid_gt(a->fsuid, b->fsuid)) + return 1; + + if (gid_lt(a->fsgid, b->fsgid)) + return -1; + if (gid_gt(a->fsgid, b->fsgid)) + return 1; + + ga = a->group_info; + gb = b->group_info; + if (ga == gb) + return 0; + if (ga == NULL) + return -1; + if (gb == NULL) + return 1; + if (ga->ngroups < gb->ngroups) + return -1; + if (ga->ngroups > gb->ngroups) + return 1; + + for (g = 0; g < ga->ngroups; g++) { + if (gid_lt(ga->gid[g], gb->gid[g])) + return -1; + if (gid_gt(ga->gid[g], gb->gid[g])) + return 1; + } + return 0; +} + +static struct nfs_access_entry *nfs_access_search_rbtree(struct inode *inode, const struct cred *cred) { struct rb_node *n = NFS_I(inode)->access_cache.rb_node; - struct nfs_access_entry *entry; while (n != NULL) { - entry = rb_entry(n, struct nfs_access_entry, rb_node); + struct nfs_access_entry *entry = + rb_entry(n, struct nfs_access_entry, rb_node); + int cmp = access_cmp(cred, entry); - if (cred < entry->cred) + if (cmp < 0) n = n->rb_left; - else if (cred > entry->cred) + else if (cmp > 0) n = n->rb_right; else return entry; @@ -2230,9 +3012,30 @@ static struct nfs_access_entry *nfs_access_search_rbtree(struct inode *inode, st return NULL; } -static int nfs_access_get_cached(struct inode *inode, struct rpc_cred *cred, struct nfs_access_entry *res, bool may_block) +static u64 nfs_access_login_time(const struct task_struct *task, + const struct cred *cred) +{ + const struct task_struct *parent; + const struct cred *pcred; + u64 ret; + + rcu_read_lock(); + for (;;) { + parent = rcu_dereference(task->real_parent); + pcred = __task_cred(parent); + if (parent == task || cred_fscmp(pcred, cred) != 0) + break; + task = parent; + } + ret = task->start_time; + rcu_read_unlock(); + return ret; +} + +static int nfs_access_get_cached_locked(struct inode *inode, const struct cred *cred, u32 *mask, bool may_block) { struct nfs_inode *nfsi = NFS_I(inode); + u64 login_time = nfs_access_login_time(current, cred); struct nfs_access_entry *cache; bool retry = true; int err; @@ -2248,11 +3051,11 @@ static int nfs_access_get_cached(struct inode *inode, struct rpc_cred *cred, str /* Found an entry, is our attribute cache valid? */ if (!nfs_check_cache_invalid(inode, NFS_INO_INVALID_ACCESS)) break; + if (!retry) + break; err = -ECHILD; if (!may_block) goto out; - if (!retry) - goto out_zap; spin_unlock(&inode->i_lock); err = __nfs_revalidate_inode(NFS_SERVER(inode), inode); if (err) @@ -2260,9 +3063,10 @@ static int nfs_access_get_cached(struct inode *inode, struct rpc_cred *cred, str spin_lock(&inode->i_lock); retry = false; } - res->jiffies = cache->jiffies; - res->cred = cache->cred; - res->mask = cache->mask; + err = -ENOENT; + if ((s64)(login_time - cache->timestamp) > 0) + goto out; + *mask = cache->mask; list_move_tail(&cache->lru, &nfsi->access_cache_entry_lru); err = 0; out: @@ -2274,12 +3078,13 @@ out_zap: return -ENOENT; } -static int nfs_access_get_cached_rcu(struct inode *inode, struct rpc_cred *cred, struct nfs_access_entry *res) +static int nfs_access_get_cached_rcu(struct inode *inode, const struct cred *cred, u32 *mask) { /* Only check the most recently returned cache entry, * but do it without locking. */ struct nfs_inode *nfsi = NFS_I(inode); + u64 login_time = nfs_access_login_time(current, cred); struct nfs_access_entry *cache; int err = -ECHILD; struct list_head *lh; @@ -2287,40 +3092,58 @@ static int nfs_access_get_cached_rcu(struct inode *inode, struct rpc_cred *cred, rcu_read_lock(); if (nfsi->cache_validity & NFS_INO_INVALID_ACCESS) goto out; - lh = rcu_dereference(nfsi->access_cache_entry_lru.prev); + lh = rcu_dereference(list_tail_rcu(&nfsi->access_cache_entry_lru)); cache = list_entry(lh, struct nfs_access_entry, lru); if (lh == &nfsi->access_cache_entry_lru || - cred != cache->cred) + access_cmp(cred, cache) != 0) cache = NULL; if (cache == NULL) goto out; + if ((s64)(login_time - cache->timestamp) > 0) + goto out; if (nfs_check_cache_invalid(inode, NFS_INO_INVALID_ACCESS)) goto out; - res->jiffies = cache->jiffies; - res->cred = cache->cred; - res->mask = cache->mask; + *mask = cache->mask; err = 0; out: rcu_read_unlock(); return err; } -static void nfs_access_add_rbtree(struct inode *inode, struct nfs_access_entry *set) +int nfs_access_get_cached(struct inode *inode, const struct cred *cred, + u32 *mask, bool may_block) +{ + int status; + + status = nfs_access_get_cached_rcu(inode, cred, mask); + if (status != 0) + status = nfs_access_get_cached_locked(inode, cred, mask, + may_block); + + return status; +} +EXPORT_SYMBOL_GPL(nfs_access_get_cached); + +static void nfs_access_add_rbtree(struct inode *inode, + struct nfs_access_entry *set, + const struct cred *cred) { struct nfs_inode *nfsi = NFS_I(inode); struct rb_root *root_node = &nfsi->access_cache; struct rb_node **p = &root_node->rb_node; struct rb_node *parent = NULL; struct nfs_access_entry *entry; + int cmp; spin_lock(&inode->i_lock); while (*p != NULL) { parent = *p; entry = rb_entry(parent, struct nfs_access_entry, rb_node); + cmp = access_cmp(cred, entry); - if (set->cred < entry->cred) + if (cmp < 0) p = &parent->rb_left; - else if (set->cred > entry->cred) + else if (cmp > 0) p = &parent->rb_right; else goto found; @@ -2338,22 +3161,25 @@ found: nfs_access_free_entry(entry); } -void nfs_access_add_cache(struct inode *inode, struct nfs_access_entry *set) +void nfs_access_add_cache(struct inode *inode, struct nfs_access_entry *set, + const struct cred *cred) { struct nfs_access_entry *cache = kmalloc(sizeof(*cache), GFP_KERNEL); if (cache == NULL) return; RB_CLEAR_NODE(&cache->rb_node); - cache->jiffies = set->jiffies; - cache->cred = get_rpccred(set->cred); + cache->fsuid = cred->fsuid; + cache->fsgid = cred->fsgid; + cache->group_info = get_group_info(cred->group_info); cache->mask = set->mask; + cache->timestamp = ktime_get_ns(); /* The above field assignments must be visible * before this item appears on the lru. We cannot easily * use rcu_assign_pointer, so just force the memory barrier. */ smp_wmb(); - nfs_access_add_rbtree(inode, cache); + nfs_access_add_rbtree(inode, cache, cred); /* Update accounting */ smp_mb__before_atomic(); @@ -2372,15 +3198,15 @@ void nfs_access_add_cache(struct inode *inode, struct nfs_access_entry *set) } EXPORT_SYMBOL_GPL(nfs_access_add_cache); -#define NFS_MAY_READ (NFS4_ACCESS_READ) -#define NFS_MAY_WRITE (NFS4_ACCESS_MODIFY | \ - NFS4_ACCESS_EXTEND | \ - NFS4_ACCESS_DELETE) -#define NFS_FILE_MAY_WRITE (NFS4_ACCESS_MODIFY | \ - NFS4_ACCESS_EXTEND) +#define NFS_MAY_READ (NFS_ACCESS_READ) +#define NFS_MAY_WRITE (NFS_ACCESS_MODIFY | \ + NFS_ACCESS_EXTEND | \ + NFS_ACCESS_DELETE) +#define NFS_FILE_MAY_WRITE (NFS_ACCESS_MODIFY | \ + NFS_ACCESS_EXTEND) #define NFS_DIR_MAY_WRITE NFS_MAY_WRITE -#define NFS_MAY_LOOKUP (NFS4_ACCESS_LOOKUP) -#define NFS_MAY_EXECUTE (NFS4_ACCESS_EXECUTE) +#define NFS_MAY_LOOKUP (NFS_ACCESS_LOOKUP) +#define NFS_MAY_EXECUTE (NFS_ACCESS_EXECUTE) static int nfs_access_calc_mask(u32 access_result, umode_t umode) { @@ -2409,18 +3235,16 @@ void nfs_access_set_mask(struct nfs_access_entry *entry, u32 access_result) } EXPORT_SYMBOL_GPL(nfs_access_set_mask); -static int nfs_do_access(struct inode *inode, struct rpc_cred *cred, int mask) +static int nfs_do_access(struct inode *inode, const struct cred *cred, int mask) { struct nfs_access_entry cache; bool may_block = (mask & MAY_NOT_BLOCK) == 0; - int cache_mask; + int cache_mask = -1; int status; trace_nfs_access_enter(inode); - status = nfs_access_get_cached_rcu(inode, cred, &cache); - if (status != 0) - status = nfs_access_get_cached(inode, cred, &cache, may_block); + status = nfs_access_get_cached(inode, cred, &cache.mask, may_block); if (status == 0) goto out_cached; @@ -2428,27 +3252,32 @@ static int nfs_do_access(struct inode *inode, struct rpc_cred *cred, int mask) if (!may_block) goto out; - /* Be clever: ask server to check for all possible rights */ - cache.mask = NFS_MAY_LOOKUP | NFS_MAY_EXECUTE - | NFS_MAY_WRITE | NFS_MAY_READ; - cache.cred = cred; - cache.jiffies = jiffies; - status = NFS_PROTO(inode)->access(inode, &cache); + /* + * Determine which access bits we want to ask for... + */ + cache.mask = NFS_ACCESS_READ | NFS_ACCESS_MODIFY | NFS_ACCESS_EXTEND | + nfs_access_xattr_mask(NFS_SERVER(inode)); + if (S_ISDIR(inode->i_mode)) + cache.mask |= NFS_ACCESS_DELETE | NFS_ACCESS_LOOKUP; + else + cache.mask |= NFS_ACCESS_EXECUTE; + status = NFS_PROTO(inode)->access(inode, &cache, cred); if (status != 0) { if (status == -ESTALE) { - nfs_zap_caches(inode); if (!S_ISDIR(inode->i_mode)) - set_bit(NFS_INO_STALE, &NFS_I(inode)->flags); + nfs_set_inode_stale(inode); + else + nfs_zap_caches(inode); } goto out; } - nfs_access_add_cache(inode, &cache); + nfs_access_add_cache(inode, &cache, cred); out_cached: cache_mask = nfs_access_calc_mask(cache.mask, inode->i_mode); if ((mask & ~cache_mask & (MAY_READ | MAY_WRITE | MAY_EXEC)) != 0) status = -EACCES; out: - trace_nfs_access_exit(inode, status); + trace_nfs_access_exit(inode, mask, cache_mask, status); return status; } @@ -2469,7 +3298,7 @@ static int nfs_open_permission_mask(int openflags) return mask; } -int nfs_may_open(struct inode *inode, struct rpc_cred *cred, int openflags) +int nfs_may_open(struct inode *inode, const struct cred *cred, int openflags) { return nfs_do_access(inode, cred, nfs_open_permission_mask(openflags)); } @@ -2480,7 +3309,9 @@ static int nfs_execute_ok(struct inode *inode, int mask) struct nfs_server *server = NFS_SERVER(inode); int ret = 0; - if (nfs_check_cache_invalid(inode, NFS_INO_INVALID_ACCESS)) { + if (S_ISDIR(inode->i_mode)) + return 0; + if (nfs_check_cache_invalid(inode, NFS_INO_INVALID_MODE)) { if (mask & MAY_NOT_BLOCK) return -ECHILD; ret = __nfs_revalidate_inode(server, inode); @@ -2490,9 +3321,11 @@ static int nfs_execute_ok(struct inode *inode, int mask) return ret; } -int nfs_permission(struct inode *inode, int mask) +int nfs_permission(struct mnt_idmap *idmap, + struct inode *inode, + int mask) { - struct rpc_cred *cred; + const struct cred *cred = current_cred(); int res = 0; nfs_inc_stats(inode, NFSIOS_VFSACCESS); @@ -2524,23 +3357,7 @@ force_lookup: if (!NFS_PROTO(inode)->access) goto out_notsup; - /* Always try fast lookups first */ - rcu_read_lock(); - cred = rpc_lookup_cred_nonblock(); - if (!IS_ERR(cred)) - res = nfs_do_access(inode, cred, mask|MAY_NOT_BLOCK); - else - res = PTR_ERR(cred); - rcu_read_unlock(); - if (res == -ECHILD && !(mask & MAY_NOT_BLOCK)) { - /* Fast lookup failed, try the slow way */ - cred = rpc_lookup_cred(); - if (!IS_ERR(cred)) { - res = nfs_do_access(inode, cred, mask); - put_rpccred(cred); - } else - res = PTR_ERR(cred); - } + res = nfs_do_access(inode, cred, mask); out: if (!res && (mask & MAY_EXEC)) res = nfs_execute_ok(inode, mask); @@ -2552,16 +3369,10 @@ out_notsup: if (mask & MAY_NOT_BLOCK) return -ECHILD; - res = nfs_revalidate_inode(NFS_SERVER(inode), inode); + res = nfs_revalidate_inode(inode, NFS_INO_INVALID_MODE | + NFS_INO_INVALID_OTHER); if (res == 0) - res = generic_permission(inode, mask); + res = generic_permission(&nop_mnt_idmap, inode, mask); goto out; } EXPORT_SYMBOL_GPL(nfs_permission); - -/* - * Local variables: - * version-control: t - * kept-new-versions: 5 - * End: - */ diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 6fb9fad2d1e6..48d89716193a 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * linux/fs/nfs/direct.c * @@ -55,54 +56,17 @@ #include <linux/uaccess.h> #include <linux/atomic.h> +#include "delegation.h" #include "internal.h" #include "iostat.h" #include "pnfs.h" +#include "fscache.h" +#include "nfstrace.h" #define NFSDBG_FACILITY NFSDBG_VFS static struct kmem_cache *nfs_direct_cachep; -/* - * This represents a set of asynchronous requests that we're waiting on - */ -struct nfs_direct_mirror { - ssize_t count; -}; - -struct nfs_direct_req { - struct kref kref; /* release manager */ - - /* I/O parameters */ - struct nfs_open_context *ctx; /* file open context info */ - struct nfs_lock_context *l_ctx; /* Lock context info */ - struct kiocb * iocb; /* controlling i/o request */ - struct inode * inode; /* target file of i/o */ - - /* completion state */ - atomic_t io_count; /* i/os we're waiting for */ - spinlock_t lock; /* protect completion state */ - - struct nfs_direct_mirror mirrors[NFS_PAGEIO_DESCRIPTOR_MIRROR_MAX]; - int mirror_count; - - ssize_t count, /* bytes actually processed */ - max_count, /* max expected count */ - bytes_left, /* bytes left to be sent */ - io_start, /* start of IO */ - error; /* any reported error */ - struct completion completion; /* wait for i/o completion */ - - /* commit state */ - struct nfs_mds_commit_info mds_cinfo; /* Storage for cinfo */ - struct pnfs_ds_commit_info ds_cinfo; /* Storage for cinfo */ - struct work_struct work; - int flags; -#define NFS_ODIRECT_DO_COMMIT (1) /* an unstable reply was received */ -#define NFS_ODIRECT_RESCHED_WRITES (2) /* write verification failed */ - struct nfs_writeverf verf; /* unstable write verifier */ -}; - static const struct nfs_pgio_completion_ops nfs_direct_write_completion_ops; static const struct nfs_commit_completion_ops nfs_direct_commit_completion_ops; static void nfs_direct_write_complete(struct nfs_direct_req *dreq); @@ -119,157 +83,86 @@ static inline int put_dreq(struct nfs_direct_req *dreq) } static void -nfs_direct_good_bytes(struct nfs_direct_req *dreq, struct nfs_pgio_header *hdr) +nfs_direct_handle_truncated(struct nfs_direct_req *dreq, + const struct nfs_pgio_header *hdr, + ssize_t dreq_len) { - int i; - ssize_t count; - - WARN_ON_ONCE(dreq->count >= dreq->max_count); - - if (dreq->mirror_count == 1) { - dreq->mirrors[hdr->pgio_mirror_idx].count += hdr->good_bytes; - dreq->count += hdr->good_bytes; - } else { - /* mirrored writes */ - count = dreq->mirrors[hdr->pgio_mirror_idx].count; - if (count + dreq->io_start < hdr->io_start + hdr->good_bytes) { - count = hdr->io_start + hdr->good_bytes - dreq->io_start; - dreq->mirrors[hdr->pgio_mirror_idx].count = count; - } - /* update the dreq->count by finding the minimum agreed count from all - * mirrors */ - count = dreq->mirrors[0].count; - - for (i = 1; i < dreq->mirror_count; i++) - count = min(count, dreq->mirrors[i].count); - - dreq->count = count; + if (!(test_bit(NFS_IOHDR_ERROR, &hdr->flags) || + test_bit(NFS_IOHDR_EOF, &hdr->flags))) + return; + if (dreq->max_count >= dreq_len) { + dreq->max_count = dreq_len; + if (dreq->count > dreq_len) + dreq->count = dreq_len; } + + if (test_bit(NFS_IOHDR_ERROR, &hdr->flags) && !dreq->error) + dreq->error = hdr->error; } -/* - * nfs_direct_select_verf - select the right verifier - * @dreq - direct request possibly spanning multiple servers - * @ds_clp - nfs_client of data server or NULL if MDS / non-pnfs - * @commit_idx - commit bucket index for the DS - * - * returns the correct verifier to use given the role of the server - */ -static struct nfs_writeverf * -nfs_direct_select_verf(struct nfs_direct_req *dreq, - struct nfs_client *ds_clp, - int commit_idx) +static void +nfs_direct_count_bytes(struct nfs_direct_req *dreq, + const struct nfs_pgio_header *hdr) { - struct nfs_writeverf *verfp = &dreq->verf; - -#ifdef CONFIG_NFS_V4_1 - /* - * pNFS is in use, use the DS verf except commit_through_mds is set - * for layout segment where nbuckets is zero. - */ - if (ds_clp && dreq->ds_cinfo.nbuckets > 0) { - if (commit_idx >= 0 && commit_idx < dreq->ds_cinfo.nbuckets) - verfp = &dreq->ds_cinfo.buckets[commit_idx].direct_verf; - else - WARN_ON_ONCE(1); - } -#endif - return verfp; -} + loff_t hdr_end = hdr->io_start + hdr->good_bytes; + ssize_t dreq_len = 0; + if (hdr_end > dreq->io_start) + dreq_len = hdr_end - dreq->io_start; -/* - * nfs_direct_set_hdr_verf - set the write/commit verifier - * @dreq - direct request possibly spanning multiple servers - * @hdr - pageio header to validate against previously seen verfs - * - * Set the server's (MDS or DS) "seen" verifier - */ -static void nfs_direct_set_hdr_verf(struct nfs_direct_req *dreq, - struct nfs_pgio_header *hdr) -{ - struct nfs_writeverf *verfp; + nfs_direct_handle_truncated(dreq, hdr, dreq_len); - verfp = nfs_direct_select_verf(dreq, hdr->ds_clp, hdr->ds_commit_idx); - WARN_ON_ONCE(verfp->committed >= 0); - memcpy(verfp, &hdr->verf, sizeof(struct nfs_writeverf)); - WARN_ON_ONCE(verfp->committed < 0); -} + if (dreq_len > dreq->max_count) + dreq_len = dreq->max_count; -static int nfs_direct_cmp_verf(const struct nfs_writeverf *v1, - const struct nfs_writeverf *v2) -{ - return nfs_write_verifier_cmp(&v1->verifier, &v2->verifier); + if (dreq->count < dreq_len) + dreq->count = dreq_len; } -/* - * nfs_direct_cmp_hdr_verf - compare verifier for pgio header - * @dreq - direct request possibly spanning multiple servers - * @hdr - pageio header to validate against previously seen verf - * - * set the server's "seen" verf if not initialized. - * returns result of comparison between @hdr->verf and the "seen" - * verf of the server used by @hdr (DS or MDS) - */ -static int nfs_direct_set_or_cmp_hdr_verf(struct nfs_direct_req *dreq, - struct nfs_pgio_header *hdr) +static void nfs_direct_truncate_request(struct nfs_direct_req *dreq, + struct nfs_page *req) { - struct nfs_writeverf *verfp; + loff_t offs = req_offset(req); + size_t req_start = (size_t)(offs - dreq->io_start); - verfp = nfs_direct_select_verf(dreq, hdr->ds_clp, hdr->ds_commit_idx); - if (verfp->committed < 0) { - nfs_direct_set_hdr_verf(dreq, hdr); - return 0; - } - return nfs_direct_cmp_verf(verfp, &hdr->verf); + if (req_start < dreq->max_count) + dreq->max_count = req_start; + if (req_start < dreq->count) + dreq->count = req_start; } -/* - * nfs_direct_cmp_commit_data_verf - compare verifier for commit data - * @dreq - direct request possibly spanning multiple servers - * @data - commit data to validate against previously seen verf - * - * returns result of comparison between @data->verf and the verf of - * the server used by @data (DS or MDS) - */ -static int nfs_direct_cmp_commit_data_verf(struct nfs_direct_req *dreq, - struct nfs_commit_data *data) +static void nfs_direct_file_adjust_size_locked(struct inode *inode, + loff_t offset, size_t count) { - struct nfs_writeverf *verfp; - - verfp = nfs_direct_select_verf(dreq, data->ds_clp, - data->ds_commit_index); - - /* verifier not set so always fail */ - if (verfp->committed < 0) - return 1; - - return nfs_direct_cmp_verf(verfp, &data->verf); + loff_t newsize = offset + (loff_t)count; + loff_t oldsize = i_size_read(inode); + + if (newsize > oldsize) { + i_size_write(inode, newsize); + NFS_I(inode)->cache_validity &= ~NFS_INO_INVALID_SIZE; + trace_nfs_size_grow(inode, newsize); + nfs_inc_stats(inode, NFSIOS_EXTENDWRITE); + } } /** - * nfs_direct_IO - NFS address space operation for direct I/O + * nfs_swap_rw - NFS address space operation for swap I/O * @iocb: target I/O control block * @iter: I/O buffer * - * The presence of this routine in the address space ops vector means - * the NFS client supports direct I/O. However, for most direct IO, we - * shunt off direct read and write requests before the VFS gets them, - * so this method is only ever called for swap. + * Perform IO to the swap-file. This is much like direct IO. */ -ssize_t nfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) +int nfs_swap_rw(struct kiocb *iocb, struct iov_iter *iter) { - struct inode *inode = iocb->ki_filp->f_mapping->host; - - /* we only support swap file calling nfs_direct_IO */ - if (!IS_SWAPFILE(inode)) - return 0; - - VM_BUG_ON(iov_iter_count(iter) != PAGE_SIZE); + ssize_t ret; if (iov_iter_rw(iter) == READ) - return nfs_file_direct_read(iocb, iter); - return nfs_file_direct_write(iocb, iter); + ret = nfs_file_direct_read(iocb, iter, true); + else + ret = nfs_file_direct_write(iocb, iter, true); + if (ret < 0) + return ret; + return 0; } static void nfs_direct_release_pages(struct page **pages, unsigned int npages) @@ -289,18 +182,6 @@ void nfs_init_cinfo_from_dreq(struct nfs_commit_info *cinfo, cinfo->completion_ops = &nfs_direct_commit_completion_ops; } -static inline void nfs_direct_setup_mirroring(struct nfs_direct_req *dreq, - struct nfs_pageio_descriptor *pgio, - struct nfs_page *req) -{ - int mirror_count = 1; - - if (pgio->pg_ops->pg_get_mirror_count) - mirror_count = pgio->pg_ops->pg_get_mirror_count(pgio, req); - - dreq->mirror_count = mirror_count; -} - static inline struct nfs_direct_req *nfs_direct_req_alloc(void) { struct nfs_direct_req *dreq; @@ -313,9 +194,8 @@ static inline struct nfs_direct_req *nfs_direct_req_alloc(void) kref_get(&dreq->kref); init_completion(&dreq->completion); INIT_LIST_HEAD(&dreq->mds_cinfo.list); - dreq->verf.committed = NFS_INVALID_STABLE_HOW; /* not set yet */ + pnfs_init_ds_commit_info(&dreq->ds_cinfo); INIT_WORK(&dreq->work, nfs_direct_write_schedule_work); - dreq->mirror_count = 1; spin_lock_init(&dreq->lock); return dreq; @@ -325,7 +205,7 @@ static void nfs_direct_req_free(struct kref *kref) { struct nfs_direct_req *dreq = container_of(kref, struct nfs_direct_req, kref); - nfs_free_pnfs_ds_cinfo(&dreq->ds_cinfo); + pnfs_release_ds_info(&dreq->ds_cinfo, dreq->inode); if (dreq->l_ctx != NULL) nfs_put_lock_context(dreq->l_ctx); if (dreq->ctx != NULL) @@ -338,9 +218,10 @@ static void nfs_direct_req_release(struct nfs_direct_req *dreq) kref_put(&dreq->kref, nfs_direct_req_free); } -ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq) +ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq, loff_t offset) { - return dreq->bytes_left; + loff_t start = offset - dreq->io_start; + return dreq->max_count - start; } EXPORT_SYMBOL_GPL(nfs_dreq_bytes_left); @@ -384,7 +265,7 @@ static void nfs_direct_complete(struct nfs_direct_req *dreq) res = (long) dreq->count; WARN_ON_ONCE(dreq->count < 0); } - dreq->iocb->ki_complete(dreq->iocb, res, 0); + dreq->iocb->ki_complete(dreq->iocb, res); } complete(&dreq->completion); @@ -397,22 +278,23 @@ static void nfs_direct_read_completion(struct nfs_pgio_header *hdr) unsigned long bytes = 0; struct nfs_direct_req *dreq = hdr->dreq; - if (test_bit(NFS_IOHDR_REDO, &hdr->flags)) - goto out_put; - spin_lock(&dreq->lock); - if (test_bit(NFS_IOHDR_ERROR, &hdr->flags) && (hdr->good_bytes == 0)) - dreq->error = hdr->error; - else - nfs_direct_good_bytes(dreq, hdr); + if (test_bit(NFS_IOHDR_REDO, &hdr->flags)) { + spin_unlock(&dreq->lock); + goto out_put; + } + nfs_direct_count_bytes(dreq, hdr); spin_unlock(&dreq->lock); + nfs_update_delegated_atime(dreq->inode); + while (!list_empty(&hdr->pages)) { struct nfs_page *req = nfs_list_entry(hdr->pages.next); struct page *page = req->wb_page; - if (!PageCompound(page) && bytes < hdr->good_bytes) + if (!PageCompound(page) && bytes < hdr->good_bytes && + (dreq->flags == NFS_ODIRECT_SHOULD_DIRTY)) set_page_dirty(page); bytes += req->wb_bytes; nfs_list_remove_request(req); @@ -424,7 +306,7 @@ out_put: hdr->release(hdr); } -static void nfs_read_sync_pgio_error(struct list_head *head) +static void nfs_read_sync_pgio_error(struct list_head *head, int error) { struct nfs_page *req; @@ -438,6 +320,7 @@ static void nfs_read_sync_pgio_error(struct list_head *head) static void nfs_direct_pgio_init(struct nfs_pgio_header *hdr) { get_dreq(hdr->dreq); + set_bit(NFS_IOHDR_ODIRECT, &hdr->flags); } static const struct nfs_pgio_completion_ops nfs_direct_read_completion_ops = { @@ -476,26 +359,23 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq, size_t pgbase; unsigned npages, i; - result = iov_iter_get_pages_alloc(iter, &pagevec, + result = iov_iter_get_pages_alloc2(iter, &pagevec, rsize, &pgbase); if (result < 0) break; bytes = result; - iov_iter_advance(iter, bytes); npages = (result + pgbase + PAGE_SIZE - 1) / PAGE_SIZE; for (i = 0; i < npages; i++) { struct nfs_page *req; unsigned int req_len = min_t(size_t, bytes, PAGE_SIZE - pgbase); /* XXX do we need to do the eof zeroing found in async_filler? */ - req = nfs_create_request(dreq->ctx, pagevec[i], NULL, - pgbase, req_len); + req = nfs_page_create_from_page(dreq->ctx, pagevec[i], + pgbase, pos, req_len); if (IS_ERR(req)) { result = PTR_ERR(req); break; } - req->wb_index = pos >> PAGE_SHIFT; - req->wb_offset = pos & ~PAGE_MASK; if (!nfs_pageio_add_request(&desc, req)) { result = desc.pg_error; nfs_release_request(req); @@ -505,7 +385,6 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq, bytes -= req_len; requested_bytes += req_len; pos += req_len; - dreq->bytes_left -= req_len; } nfs_direct_release_pages(pagevec, npages); kvfree(pagevec); @@ -534,6 +413,7 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq, * nfs_file_direct_read - file direct read operation for NFS files * @iocb: target I/O control block * @iter: vector of user buffers into which to read data + * @swap: flag indicating this is swap IO, not O_DIRECT IO * * We use this function for direct reads instead of calling * generic_file_aio_read() in order to avoid gfar's check to see if @@ -549,14 +429,15 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq, * client must read the updated atime from the server back into its * cache. */ -ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter) +ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter, + bool swap) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; struct nfs_direct_req *dreq; struct nfs_lock_context *l_ctx; - ssize_t result = -EINVAL, requested; + ssize_t result, requested; size_t count = iov_iter_count(iter); nfs_add_stats(mapping->host, NFSIOS_DIRECTREADBYTES, count); @@ -575,24 +456,38 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter) goto out; dreq->inode = inode; - dreq->bytes_left = dreq->max_count = count; + dreq->max_count = count; dreq->io_start = iocb->ki_pos; dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp)); l_ctx = nfs_get_lock_context(dreq->ctx); if (IS_ERR(l_ctx)) { result = PTR_ERR(l_ctx); + nfs_direct_req_release(dreq); goto out_release; } dreq->l_ctx = l_ctx; if (!is_sync_kiocb(iocb)) dreq->iocb = iocb; - nfs_start_io_direct(inode); + if (user_backed_iter(iter)) + dreq->flags = NFS_ODIRECT_SHOULD_DIRTY; + + if (!swap) { + result = nfs_start_io_direct(inode); + if (result) { + /* release the reference that would usually be + * consumed by nfs_direct_read_schedule_iovec() + */ + nfs_direct_req_release(dreq); + goto out_release; + } + } NFS_I(inode)->read_io += count; requested = nfs_direct_read_schedule_iovec(dreq, iter, iocb->ki_pos); - nfs_end_io_direct(inode); + if (!swap) + nfs_end_io_direct(inode); if (requested > 0) { result = nfs_direct_wait(dreq); @@ -611,71 +506,114 @@ out: return result; } +static void nfs_direct_add_page_head(struct list_head *list, + struct nfs_page *req) +{ + struct nfs_page *head = req->wb_head; + + if (!list_empty(&head->wb_list) || !nfs_lock_request(head)) + return; + if (!list_empty(&head->wb_list)) { + nfs_unlock_request(head); + return; + } + list_add(&head->wb_list, list); + kref_get(&head->wb_kref); + kref_get(&head->wb_kref); +} + +static void nfs_direct_join_group(struct list_head *list, + struct nfs_commit_info *cinfo, + struct inode *inode) +{ + struct nfs_page *req, *subreq; + + list_for_each_entry(req, list, wb_list) { + if (req->wb_head != req) { + nfs_direct_add_page_head(&req->wb_list, req); + continue; + } + subreq = req->wb_this_page; + if (subreq == req) + continue; + do { + /* + * Remove subrequests from this list before freeing + * them in the call to nfs_join_page_group(). + */ + if (!list_empty(&subreq->wb_list)) { + nfs_list_remove_request(subreq); + nfs_release_request(subreq); + } + } while ((subreq = subreq->wb_this_page) != req); + nfs_join_page_group(req, cinfo, inode); + } +} + static void nfs_direct_write_scan_commit_list(struct inode *inode, struct list_head *list, struct nfs_commit_info *cinfo) { - spin_lock(&cinfo->inode->i_lock); -#ifdef CONFIG_NFS_V4_1 - if (cinfo->ds != NULL && cinfo->ds->nwritten != 0) - NFS_SERVER(inode)->pnfs_curr_ld->recover_commit_reqs(list, cinfo); -#endif + mutex_lock(&NFS_I(cinfo->inode)->commit_mutex); + pnfs_recover_commit_reqs(list, cinfo); nfs_scan_commit_list(&cinfo->mds->list, list, cinfo, 0); - spin_unlock(&cinfo->inode->i_lock); + mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex); } static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq) { struct nfs_pageio_descriptor desc; - struct nfs_page *req, *tmp; + struct nfs_page *req; LIST_HEAD(reqs); struct nfs_commit_info cinfo; - LIST_HEAD(failed); - int i; nfs_init_cinfo_from_dreq(&cinfo, dreq); nfs_direct_write_scan_commit_list(dreq->inode, &reqs, &cinfo); - dreq->count = 0; - dreq->verf.committed = NFS_INVALID_STABLE_HOW; + nfs_direct_join_group(&reqs, &cinfo, dreq->inode); + nfs_clear_pnfs_ds_commit_verifiers(&dreq->ds_cinfo); - for (i = 0; i < dreq->mirror_count; i++) - dreq->mirrors[i].count = 0; get_dreq(dreq); nfs_pageio_init_write(&desc, dreq->inode, FLUSH_STABLE, false, &nfs_direct_write_completion_ops); desc.pg_dreq = dreq; - req = nfs_list_entry(reqs.next); - nfs_direct_setup_mirroring(dreq, &desc, req); - if (desc.pg_error < 0) { - list_splice_init(&reqs, &failed); - goto out_failed; - } - - list_for_each_entry_safe(req, tmp, &reqs, wb_list) { + while (!list_empty(&reqs)) { + req = nfs_list_entry(reqs.next); + /* Bump the transmission count */ + req->wb_nio++; if (!nfs_pageio_add_request(&desc, req)) { - nfs_list_remove_request(req); - nfs_list_add_request(req, &failed); - spin_lock(&cinfo.inode->i_lock); - dreq->flags = 0; - if (desc.pg_error < 0) + spin_lock(&dreq->lock); + if (dreq->error < 0) { + desc.pg_error = dreq->error; + } else if (desc.pg_error != -EAGAIN) { + dreq->flags = 0; + if (!desc.pg_error) + desc.pg_error = -EIO; dreq->error = desc.pg_error; - else - dreq->error = -EIO; - spin_unlock(&cinfo.inode->i_lock); + } else + dreq->flags = NFS_ODIRECT_RESCHED_WRITES; + spin_unlock(&dreq->lock); + break; } nfs_release_request(req); } nfs_pageio_complete(&desc); -out_failed: - while (!list_empty(&failed)) { - req = nfs_list_entry(failed.next); + while (!list_empty(&reqs)) { + req = nfs_list_entry(reqs.next); nfs_list_remove_request(req); nfs_unlock_and_release_request(req); + if (desc.pg_error == -EAGAIN) { + nfs_mark_request_commit(req, NULL, &cinfo, 0); + } else { + spin_lock(&dreq->lock); + nfs_direct_truncate_request(dreq, req); + spin_unlock(&dreq->lock); + nfs_release_request(req); + } } if (put_dreq(dreq)) @@ -684,27 +622,51 @@ out_failed: static void nfs_direct_commit_complete(struct nfs_commit_data *data) { + const struct nfs_writeverf *verf = data->res.verf; struct nfs_direct_req *dreq = data->dreq; struct nfs_commit_info cinfo; struct nfs_page *req; int status = data->task.tk_status; + trace_nfs_direct_commit_complete(dreq); + + spin_lock(&dreq->lock); + if (status < 0) { + /* Errors in commit are fatal */ + dreq->error = status; + dreq->flags = NFS_ODIRECT_DONE; + } else { + status = dreq->error; + } + spin_unlock(&dreq->lock); + nfs_init_cinfo_from_dreq(&cinfo, dreq); - if (status < 0 || nfs_direct_cmp_commit_data_verf(dreq, data)) - dreq->flags = NFS_ODIRECT_RESCHED_WRITES; while (!list_empty(&data->pages)) { req = nfs_list_entry(data->pages.next); nfs_list_remove_request(req); - if (dreq->flags == NFS_ODIRECT_RESCHED_WRITES) { - /* Note the rewrite will go through mds */ + if (status < 0) { + spin_lock(&dreq->lock); + nfs_direct_truncate_request(dreq, req); + spin_unlock(&dreq->lock); + nfs_release_request(req); + } else if (!nfs_write_match_verf(verf, req)) { + spin_lock(&dreq->lock); + if (dreq->flags == 0) + dreq->flags = NFS_ODIRECT_RESCHED_WRITES; + spin_unlock(&dreq->lock); + /* + * Despite the reboot, the write was successful, + * so reset wb_nio. + */ + req->wb_nio = 0; nfs_mark_request_commit(req, NULL, &cinfo, 0); } else nfs_release_request(req); nfs_unlock_and_release_request(req); } - if (atomic_dec_and_test(&cinfo.mds->rpcs_out)) + if (nfs_commit_end(cinfo.mds)) nfs_direct_write_complete(dreq); } @@ -713,8 +675,11 @@ static void nfs_direct_resched_write(struct nfs_commit_info *cinfo, { struct nfs_direct_req *dreq = cinfo->dreq; + trace_nfs_direct_resched_write(dreq); + spin_lock(&dreq->lock); - dreq->flags = NFS_ODIRECT_RESCHED_WRITES; + if (dreq->flags != NFS_ODIRECT_DONE) + dreq->flags = NFS_ODIRECT_RESCHED_WRITES; spin_unlock(&dreq->lock); nfs_mark_request_commit(req, NULL, cinfo, 0); } @@ -731,10 +696,35 @@ static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq) LIST_HEAD(mds_list); nfs_init_cinfo_from_dreq(&cinfo, dreq); + nfs_commit_begin(cinfo.mds); nfs_scan_commit(dreq->inode, &mds_list, &cinfo); res = nfs_generic_commit_list(dreq->inode, &mds_list, 0, &cinfo); - if (res < 0) /* res == -ENOMEM */ - nfs_direct_write_reschedule(dreq); + if (res < 0) { /* res == -ENOMEM */ + spin_lock(&dreq->lock); + if (dreq->flags == 0) + dreq->flags = NFS_ODIRECT_RESCHED_WRITES; + spin_unlock(&dreq->lock); + } + if (nfs_commit_end(cinfo.mds)) + nfs_direct_write_complete(dreq); +} + +static void nfs_direct_write_clear_reqs(struct nfs_direct_req *dreq) +{ + struct nfs_commit_info cinfo; + struct nfs_page *req; + LIST_HEAD(reqs); + + nfs_init_cinfo_from_dreq(&cinfo, dreq); + nfs_direct_write_scan_commit_list(dreq->inode, &reqs, &cinfo); + + while (!list_empty(&reqs)) { + req = nfs_list_entry(reqs.next); + nfs_list_remove_request(req); + nfs_direct_truncate_request(dreq, req); + nfs_release_request(req); + nfs_unlock_and_release_request(req); + } } static void nfs_direct_write_schedule_work(struct work_struct *work) @@ -751,6 +741,7 @@ static void nfs_direct_write_schedule_work(struct work_struct *work) nfs_direct_write_reschedule(dreq); break; default: + nfs_direct_write_clear_reqs(dreq); nfs_zap_mapping(dreq->inode, dreq->inode->i_mapping); nfs_direct_complete(dreq); } @@ -758,54 +749,55 @@ static void nfs_direct_write_schedule_work(struct work_struct *work) static void nfs_direct_write_complete(struct nfs_direct_req *dreq) { - schedule_work(&dreq->work); /* Calls nfs_direct_write_schedule_work */ + trace_nfs_direct_write_complete(dreq); + queue_work(nfsiod_workqueue, &dreq->work); /* Calls nfs_direct_write_schedule_work */ } static void nfs_direct_write_completion(struct nfs_pgio_header *hdr) { struct nfs_direct_req *dreq = hdr->dreq; struct nfs_commit_info cinfo; - bool request_commit = false; - struct nfs_page *req = nfs_list_entry(hdr->pages.next); + struct inode *inode = dreq->inode; + int flags = NFS_ODIRECT_DONE; - if (test_bit(NFS_IOHDR_REDO, &hdr->flags)) - goto out_put; + trace_nfs_direct_write_completion(dreq); nfs_init_cinfo_from_dreq(&cinfo, dreq); spin_lock(&dreq->lock); - - if (test_bit(NFS_IOHDR_ERROR, &hdr->flags)) { - dreq->flags = 0; - dreq->error = hdr->error; + if (test_bit(NFS_IOHDR_REDO, &hdr->flags)) { + spin_unlock(&dreq->lock); + goto out_put; } - if (dreq->error == 0) { - nfs_direct_good_bytes(dreq, hdr); - if (nfs_write_need_commit(hdr)) { - if (dreq->flags == NFS_ODIRECT_RESCHED_WRITES) - request_commit = true; - else if (dreq->flags == 0) { - nfs_direct_set_hdr_verf(dreq, hdr); - request_commit = true; - dreq->flags = NFS_ODIRECT_DO_COMMIT; - } else if (dreq->flags == NFS_ODIRECT_DO_COMMIT) { - request_commit = true; - if (nfs_direct_set_or_cmp_hdr_verf(dreq, hdr)) - dreq->flags = - NFS_ODIRECT_RESCHED_WRITES; - } - } + + nfs_direct_count_bytes(dreq, hdr); + if (test_bit(NFS_IOHDR_UNSTABLE_WRITES, &hdr->flags) && + !test_bit(NFS_IOHDR_ERROR, &hdr->flags)) { + if (!dreq->flags) + dreq->flags = NFS_ODIRECT_DO_COMMIT; + flags = dreq->flags; } spin_unlock(&dreq->lock); + spin_lock(&inode->i_lock); + nfs_direct_file_adjust_size_locked(inode, dreq->io_start, dreq->count); + nfs_update_delegated_mtime_locked(dreq->inode); + spin_unlock(&inode->i_lock); + while (!list_empty(&hdr->pages)) { + struct nfs_page *req; req = nfs_list_entry(hdr->pages.next); nfs_list_remove_request(req); - if (request_commit) { + if (flags == NFS_ODIRECT_DO_COMMIT) { kref_get(&req->wb_kref); + memcpy(&req->wb_verf, &hdr->verf.verifier, + sizeof(req->wb_verf)); nfs_mark_request_commit(req, hdr->lseg, &cinfo, hdr->ds_commit_idx); + } else if (flags == NFS_ODIRECT_RESCHED_WRITES) { + kref_get(&req->wb_kref); + nfs_mark_request_commit(req, NULL, &cinfo, 0); } nfs_unlock_and_release_request(req); } @@ -816,7 +808,7 @@ out_put: hdr->release(hdr); } -static void nfs_write_sync_pgio_error(struct list_head *head) +static void nfs_write_sync_pgio_error(struct list_head *head, int error) { struct nfs_page *req; @@ -830,15 +822,23 @@ static void nfs_write_sync_pgio_error(struct list_head *head) static void nfs_direct_write_reschedule_io(struct nfs_pgio_header *hdr) { struct nfs_direct_req *dreq = hdr->dreq; + struct nfs_page *req; + struct nfs_commit_info cinfo; + + trace_nfs_direct_write_reschedule_io(dreq); + nfs_init_cinfo_from_dreq(&cinfo, dreq); spin_lock(&dreq->lock); - if (dreq->error == 0) { + if (dreq->error == 0) dreq->flags = NFS_ODIRECT_RESCHED_WRITES; - /* fake unstable write to let common nfs resend pages */ - hdr->verf.committed = NFS_UNSTABLE; - hdr->good_bytes = hdr->args.count; - } + set_bit(NFS_IOHDR_REDO, &hdr->flags); spin_unlock(&dreq->lock); + while (!list_empty(&hdr->pages)) { + req = nfs_list_entry(hdr->pages.next); + nfs_list_remove_request(req); + nfs_unlock_request(req); + nfs_mark_request_commit(req, NULL, &cinfo, 0); + } } static const struct nfs_pgio_completion_ops nfs_direct_write_completion_ops = { @@ -862,15 +862,19 @@ static const struct nfs_pgio_completion_ops nfs_direct_write_completion_ops = { */ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, struct iov_iter *iter, - loff_t pos) + loff_t pos, int ioflags) { struct nfs_pageio_descriptor desc; struct inode *inode = dreq->inode; + struct nfs_commit_info cinfo; ssize_t result = 0; size_t requested_bytes = 0; size_t wsize = max_t(size_t, NFS_SERVER(inode)->wsize, PAGE_SIZE); + bool defer = false; + + trace_nfs_direct_write_schedule_iovec(dreq); - nfs_pageio_init_write(&desc, inode, FLUSH_COND_STABLE, false, + nfs_pageio_init_write(&desc, inode, ioflags, false, &nfs_direct_write_completion_ops); desc.pg_dreq = dreq; get_dreq(dreq); @@ -883,45 +887,60 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, size_t pgbase; unsigned npages, i; - result = iov_iter_get_pages_alloc(iter, &pagevec, + result = iov_iter_get_pages_alloc2(iter, &pagevec, wsize, &pgbase); if (result < 0) break; bytes = result; - iov_iter_advance(iter, bytes); npages = (result + pgbase + PAGE_SIZE - 1) / PAGE_SIZE; for (i = 0; i < npages; i++) { struct nfs_page *req; unsigned int req_len = min_t(size_t, bytes, PAGE_SIZE - pgbase); - req = nfs_create_request(dreq->ctx, pagevec[i], NULL, - pgbase, req_len); + req = nfs_page_create_from_page(dreq->ctx, pagevec[i], + pgbase, pos, req_len); if (IS_ERR(req)) { result = PTR_ERR(req); break; } - nfs_direct_setup_mirroring(dreq, &desc, req); if (desc.pg_error < 0) { nfs_free_request(req); result = desc.pg_error; break; } + pgbase = 0; + bytes -= req_len; + requested_bytes += req_len; + pos += req_len; + + if (defer) { + nfs_mark_request_commit(req, NULL, &cinfo, 0); + continue; + } + nfs_lock_request(req); - req->wb_index = pos >> PAGE_SHIFT; - req->wb_offset = pos & ~PAGE_MASK; - if (!nfs_pageio_add_request(&desc, req)) { + if (nfs_pageio_add_request(&desc, req)) + continue; + + /* Exit on hard errors */ + if (desc.pg_error < 0 && desc.pg_error != -EAGAIN) { result = desc.pg_error; nfs_unlock_and_release_request(req); break; } - pgbase = 0; - bytes -= req_len; - requested_bytes += req_len; - pos += req_len; - dreq->bytes_left -= req_len; + + /* If the error is soft, defer remaining requests */ + nfs_init_cinfo_from_dreq(&cinfo, dreq); + spin_lock(&dreq->lock); + dreq->flags = NFS_ODIRECT_RESCHED_WRITES; + spin_unlock(&dreq->lock); + nfs_unlock_request(req); + nfs_mark_request_commit(req, NULL, &cinfo, 0); + desc.pg_error = 0; + defer = true; } nfs_direct_release_pages(pagevec, npages); kvfree(pagevec); @@ -949,6 +968,7 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, * nfs_file_direct_write - file direct write operation for NFS files * @iocb: target I/O control block * @iter: vector of user buffers from which to write data + * @swap: flag indicating this is swap IO, not O_DIRECT IO * * We use this function for direct writes instead of calling * generic_file_aio_write() in order to avoid taking the inode @@ -965,9 +985,10 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, * Note that O_APPEND is not supported for NFS direct writes, as there * is no atomic O_APPEND write facility in the NFS protocol. */ -ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter) +ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter, + bool swap) { - ssize_t result = -EINVAL, requested; + ssize_t result, requested; size_t count; struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; @@ -979,7 +1000,11 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter) dfprintk(FILE, "NFS: direct write(%pD2, %zd@%Ld)\n", file, iov_iter_count(iter), (long long) iocb->ki_pos); - result = generic_write_checks(iocb, iter); + if (swap) + /* bypass generic checks */ + result = iov_iter_count(iter); + else + result = generic_write_checks(iocb, iter); if (result <= 0) return result; count = result; @@ -996,28 +1021,43 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter) goto out; dreq->inode = inode; - dreq->bytes_left = dreq->max_count = count; + dreq->max_count = count; dreq->io_start = pos; dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp)); l_ctx = nfs_get_lock_context(dreq->ctx); if (IS_ERR(l_ctx)) { result = PTR_ERR(l_ctx); + nfs_direct_req_release(dreq); goto out_release; } dreq->l_ctx = l_ctx; if (!is_sync_kiocb(iocb)) dreq->iocb = iocb; + pnfs_init_ds_commit_info_ops(&dreq->ds_cinfo, inode); - nfs_start_io_direct(inode); + if (swap) { + requested = nfs_direct_write_schedule_iovec(dreq, iter, pos, + FLUSH_STABLE); + } else { + result = nfs_start_io_direct(inode); + if (result) { + /* release the reference that would usually be + * consumed by nfs_direct_write_schedule_iovec() + */ + nfs_direct_req_release(dreq); + goto out_release; + } - requested = nfs_direct_write_schedule_iovec(dreq, iter, pos); + requested = nfs_direct_write_schedule_iovec(dreq, iter, pos, + FLUSH_COND_STABLE); - if (mapping->nrpages) { - invalidate_inode_pages2_range(mapping, - pos >> PAGE_SHIFT, end); - } + if (mapping->nrpages) { + invalidate_inode_pages2_range(mapping, + pos >> PAGE_SHIFT, end); + } - nfs_end_io_direct(inode); + nfs_end_io_direct(inode); + } if (requested > 0) { result = nfs_direct_wait(dreq); @@ -1031,6 +1071,7 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter) } else { result = requested; } + nfs_fscache_invalidate(inode, FSCACHE_INVAL_DIO_WRITE); out_release: nfs_direct_req_release(dreq); out: @@ -1045,8 +1086,7 @@ int __init nfs_init_directcache(void) { nfs_direct_cachep = kmem_cache_create("nfs_direct_cache", sizeof(struct nfs_direct_req), - 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), + 0, SLAB_RECLAIM_ACCOUNT, NULL); if (nfs_direct_cachep == NULL) return -ENOMEM; diff --git a/fs/nfs/dns_resolve.c b/fs/nfs/dns_resolve.c index d25f10fb4926..714975e5c0db 100644 --- a/fs/nfs/dns_resolve.c +++ b/fs/nfs/dns_resolve.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/nfs/dns_resolve.c * @@ -6,22 +7,26 @@ * Resolves DNS hostnames into valid ip addresses */ -#ifdef CONFIG_NFS_USE_KERNEL_DNS - #include <linux/module.h> #include <linux/sunrpc/clnt.h> #include <linux/sunrpc/addr.h> -#include <linux/dns_resolver.h> + #include "dns_resolve.h" +#ifdef CONFIG_NFS_USE_KERNEL_DNS + +#include <linux/dns_resolver.h> + ssize_t nfs_dns_resolve_name(struct net *net, char *name, size_t namelen, - struct sockaddr *sa, size_t salen) + struct sockaddr_storage *ss, size_t salen) { + struct sockaddr *sa = (struct sockaddr *)ss; ssize_t ret; char *ip_addr = NULL; int ip_len; - ip_len = dns_query(NULL, name, namelen, NULL, &ip_addr, NULL); + ip_len = dns_query(net, NULL, name, namelen, NULL, &ip_addr, NULL, + false); if (ip_len > 0) ret = rpc_pton(net, ip_addr, ip_len, sa, salen); else @@ -32,24 +37,19 @@ ssize_t nfs_dns_resolve_name(struct net *net, char *name, size_t namelen, #else -#include <linux/module.h> #include <linux/hash.h> #include <linux/string.h> #include <linux/kmod.h> #include <linux/slab.h> -#include <linux/module.h> #include <linux/socket.h> #include <linux/seq_file.h> #include <linux/inet.h> -#include <linux/sunrpc/clnt.h> -#include <linux/sunrpc/addr.h> #include <linux/sunrpc/cache.h> #include <linux/sunrpc/svcauth.h> #include <linux/sunrpc/rpc_pipe_fs.h> #include <linux/nfs_fs.h> #include "nfs4_fs.h" -#include "dns_resolve.h" #include "cache_lib.h" #include "netns.h" @@ -64,6 +64,7 @@ struct nfs_dns_ent { struct sockaddr_storage addr; size_t addrlen; + struct rcu_head rcu_head; }; @@ -90,7 +91,7 @@ static void nfs_dns_ent_init(struct cache_head *cnew, key = container_of(ckey, struct nfs_dns_ent, h); kfree(new->hostname); - new->hostname = kstrndup(key->hostname, key->namelen, GFP_KERNEL); + new->hostname = kmemdup_nul(key->hostname, key->namelen, GFP_KERNEL); if (new->hostname) { new->namelen = key->namelen; nfs_dns_ent_update(cnew, ckey); @@ -100,15 +101,23 @@ static void nfs_dns_ent_init(struct cache_head *cnew, } } -static void nfs_dns_ent_put(struct kref *ref) +static void nfs_dns_ent_free_rcu(struct rcu_head *head) { struct nfs_dns_ent *item; - item = container_of(ref, struct nfs_dns_ent, h.ref); + item = container_of(head, struct nfs_dns_ent, rcu_head); kfree(item->hostname); kfree(item); } +static void nfs_dns_ent_put(struct kref *ref) +{ + struct nfs_dns_ent *item; + + item = container_of(ref, struct nfs_dns_ent, h.ref); + call_rcu(&item->rcu_head, nfs_dns_ent_free_rcu); +} + static struct cache_head *nfs_dns_ent_alloc(void) { struct nfs_dns_ent *item = kmalloc(sizeof(*item), GFP_KERNEL); @@ -141,12 +150,13 @@ static int nfs_dns_upcall(struct cache_detail *cd, struct cache_head *ch) { struct nfs_dns_ent *key = container_of(ch, struct nfs_dns_ent, h); - int ret; - ret = nfs_cache_upcall(cd, key->hostname); - if (ret) - ret = sunrpc_cache_pipe_upcall(cd, ch); - return ret; + if (test_and_set_bit(CACHE_PENDING, &ch->flags)) + return 0; + if (!nfs_cache_upcall(cd, key->hostname)) + return 0; + clear_bit(CACHE_PENDING, &ch->flags); + return sunrpc_cache_pipe_upcall_timeout(cd, ch); } static int nfs_dns_match(struct cache_head *ca, @@ -194,7 +204,7 @@ static struct nfs_dns_ent *nfs_dns_lookup(struct cache_detail *cd, { struct cache_head *ch; - ch = sunrpc_cache_lookup(cd, + ch = sunrpc_cache_lookup_rcu(cd, &key->h, nfs_dns_hash(key)); if (!ch) @@ -330,7 +340,7 @@ out: } ssize_t nfs_dns_resolve_name(struct net *net, char *name, - size_t namelen, struct sockaddr *sa, size_t salen) + size_t namelen, struct sockaddr_storage *ss, size_t salen) { struct nfs_dns_ent key = { .hostname = name, @@ -343,7 +353,7 @@ ssize_t nfs_dns_resolve_name(struct net *net, char *name, ret = do_cache_lookup_wait(nn->nfs_dns_resolve, &key, &item); if (ret == 0) { if (salen >= item->addrlen) { - memcpy(sa, &item->addr, item->addrlen); + memcpy(ss, &item->addr, item->addrlen); ret = item->addrlen; } else ret = -EOVERFLOW; diff --git a/fs/nfs/dns_resolve.h b/fs/nfs/dns_resolve.h index 2e4f596d2923..fe3b172c4de1 100644 --- a/fs/nfs/dns_resolve.h +++ b/fs/nfs/dns_resolve.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Resolve DNS hostnames into valid ip addresses */ @@ -31,6 +32,6 @@ extern void nfs_dns_resolver_cache_destroy(struct net *net); #endif extern ssize_t nfs_dns_resolve_name(struct net *net, char *name, - size_t namelen, struct sockaddr *sa, size_t salen); + size_t namelen, struct sockaddr_storage *sa, size_t salen); #endif diff --git a/fs/nfs/export.c b/fs/nfs/export.c index 249cb96cc5b5..a10dd5f9d078 100644 --- a/fs/nfs/export.c +++ b/fs/nfs/export.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2015, Primary Data, Inc. All rights reserved. * @@ -41,16 +42,12 @@ nfs_encode_fh(struct inode *inode, __u32 *p, int *max_len, struct inode *parent) dprintk("%s: max fh len %d inode %p parent %p", __func__, *max_len, inode, parent); - if (*max_len < len || IS_AUTOMOUNT(inode)) { + if (*max_len < len) { dprintk("%s: fh len %d too small, required %d\n", __func__, *max_len, len); *max_len = len; return FILEID_INVALID; } - if (IS_AUTOMOUNT(inode)) { - *max_len = FILEID_INVALID; - goto out; - } p[FILEID_HIGH_OFF] = NFS_FILEID(inode) >> 32; p[FILEID_LOW_OFF] = NFS_FILEID(inode); @@ -58,7 +55,6 @@ nfs_encode_fh(struct inode *inode, __u32 *p, int *max_len, struct inode *parent) p[len - 1] = 0; /* Padding */ nfs_copy_fh(clnt_fh, server_fh); *max_len = len; -out: dprintk("%s: result fh fileid %llu mode %u size %d\n", __func__, NFS_FILEID(inode), inode->i_mode, *max_len); return *max_len; @@ -68,22 +64,28 @@ static struct dentry * nfs_fh_to_dentry(struct super_block *sb, struct fid *fid, int fh_len, int fh_type) { - struct nfs4_label *label = NULL; struct nfs_fattr *fattr = NULL; struct nfs_fh *server_fh = nfs_exp_embedfh(fid->raw); - size_t fh_size = offsetof(struct nfs_fh, data) + server_fh->size; + size_t fh_size = offsetof(struct nfs_fh, data); const struct nfs_rpc_ops *rpc_ops; struct dentry *dentry; struct inode *inode; - int len = EMBED_FH_OFF + XDR_QUADLEN(fh_size); + int len = EMBED_FH_OFF; u32 *p = fid->raw; int ret; + /* Initial check of bounds */ + if (fh_len < len + XDR_QUADLEN(fh_size) || + fh_len > XDR_QUADLEN(NFS_MAXFHSIZE)) + return NULL; + /* Calculate embedded filehandle size */ + fh_size += server_fh->size; + len += XDR_QUADLEN(fh_size); /* NULL translates to ESTALE */ if (fh_len < len || fh_type != len) return NULL; - fattr = nfs_alloc_fattr(); + fattr = nfs_alloc_fattr_with_label(NFS_SB(sb)); if (fattr == NULL) { dentry = ERR_PTR(-ENOMEM); goto out; @@ -99,27 +101,19 @@ nfs_fh_to_dentry(struct super_block *sb, struct fid *fid, if (inode) goto out_found; - label = nfs4_label_alloc(NFS_SB(sb), GFP_KERNEL); - if (IS_ERR(label)) { - dentry = ERR_CAST(label); - goto out_free_fattr; - } - rpc_ops = NFS_SB(sb)->nfs_client->rpc_ops; - ret = rpc_ops->getattr(NFS_SB(sb), server_fh, fattr, label); + ret = rpc_ops->getattr(NFS_SB(sb), server_fh, fattr, NULL); if (ret) { dprintk("%s: getattr failed %d\n", __func__, ret); + trace_nfs_fh_to_dentry(sb, server_fh, fattr->fileid, ret); dentry = ERR_PTR(ret); - goto out_free_label; + goto out_free_fattr; } - inode = nfs_fhget(sb, server_fh, fattr, label); + inode = nfs_fhget(sb, server_fh, fattr); out_found: dentry = d_obtain_alias(inode); - -out_free_label: - nfs4_label_free(label); out_free_fattr: nfs_free_fattr(fattr); out: @@ -134,7 +128,6 @@ nfs_get_parent(struct dentry *dentry) struct super_block *sb = inode->i_sb; struct nfs_server *server = NFS_SB(sb); struct nfs_fattr *fattr = NULL; - struct nfs4_label *label = NULL; struct dentry *parent; struct nfs_rpc_ops const *ops = server->nfs_client->rpc_ops; struct nfs_fh fh; @@ -142,31 +135,20 @@ nfs_get_parent(struct dentry *dentry) if (!ops->lookupp) return ERR_PTR(-EACCES); - fattr = nfs_alloc_fattr(); - if (fattr == NULL) { - parent = ERR_PTR(-ENOMEM); - goto out; - } - - label = nfs4_label_alloc(server, GFP_KERNEL); - if (IS_ERR(label)) { - parent = ERR_CAST(label); - goto out_free_fattr; - } + fattr = nfs_alloc_fattr_with_label(server); + if (fattr == NULL) + return ERR_PTR(-ENOMEM); - ret = ops->lookupp(inode, &fh, fattr, label); + ret = ops->lookupp(inode, &fh, fattr); if (ret) { parent = ERR_PTR(ret); - goto out_free_label; + goto out; } - pinode = nfs_fhget(sb, &fh, fattr, label); + pinode = nfs_fhget(sb, &fh, fattr); parent = d_obtain_alias(pinode); -out_free_label: - nfs4_label_free(label); -out_free_fattr: - nfs_free_fattr(fattr); out: + nfs_free_fattr(fattr); return parent; } @@ -174,4 +156,11 @@ const struct export_operations nfs_export_ops = { .encode_fh = nfs_encode_fh, .fh_to_dentry = nfs_fh_to_dentry, .get_parent = nfs_get_parent, + .flags = EXPORT_OP_NOWCC | + EXPORT_OP_NOSUBTREECHK | + EXPORT_OP_CLOSE_BEFORE_UNLINK | + EXPORT_OP_REMOTE_FS | + EXPORT_OP_NOATOMIC_ATTR | + EXPORT_OP_FLUSH_ON_CLOSE | + EXPORT_OP_NOLOCKS, }; diff --git a/fs/nfs/file.c b/fs/nfs/file.c index af330c31f627..d020aab40c64 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * linux/fs/nfs/file.c * @@ -27,9 +28,12 @@ #include <linux/mm.h> #include <linux/pagemap.h> #include <linux/gfp.h> +#include <linux/rmap.h> #include <linux/swap.h> +#include <linux/compaction.h> #include <linux/uaccess.h> +#include <linux/filelock.h> #include "delegation.h" #include "internal.h" @@ -43,11 +47,6 @@ static const struct vm_operations_struct nfs_file_vm_ops; -/* Hack for future NFS swap support */ -#ifndef IS_SWAPFILE -# define IS_SWAPFILE(inode) (0) -#endif - int nfs_check_flags(int flags) { if ((flags & (O_APPEND | O_DIRECT)) == (O_APPEND | O_DIRECT)) @@ -73,6 +72,8 @@ nfs_file_open(struct inode *inode, struct file *filp) return res; res = nfs_open(inode, filp); + if (res == 0) + filp->f_mode |= FMODE_CAN_ODIRECT; return res; } @@ -83,14 +84,15 @@ nfs_file_release(struct inode *inode, struct file *filp) nfs_inc_stats(inode, NFSIOS_VFSRELEASE); nfs_file_clear_open_context(filp); + nfs_fscache_release_file(inode, filp); return 0; } EXPORT_SYMBOL_GPL(nfs_file_release); /** - * nfs_revalidate_size - Revalidate the file size - * @inode - pointer to inode struct - * @file - pointer to struct file + * nfs_revalidate_file_size - Revalidate the file size + * @inode: pointer to inode struct + * @filp: pointer to struct file * * Revalidates the file length. This is basically a wrapper around * nfs_revalidate_inode() that takes into account the fact that we may @@ -104,7 +106,7 @@ static int nfs_revalidate_file_size(struct inode *inode, struct file *filp) if (filp->f_flags & O_DIRECT) goto force_reval; - if (nfs_check_cache_invalid(inode, NFS_INO_REVAL_PAGECACHE)) + if (nfs_check_cache_invalid(inode, NFS_INO_INVALID_SIZE)) goto force_reval; return 0; force_reval: @@ -139,6 +141,7 @@ static int nfs_file_flush(struct file *file, fl_owner_t id) { struct inode *inode = file_inode(file); + errseq_t since; dprintk("NFS: flush(%pD2)\n", file); @@ -147,7 +150,9 @@ nfs_file_flush(struct file *file, fl_owner_t id) return 0; /* Flush writes to the server and return any errors */ - return vfs_fsync(file, 0); + since = filemap_sample_wb_err(file->f_mapping); + nfs_wb_all(inode); + return filemap_check_wb_err(file->f_mapping, since); } ssize_t @@ -156,14 +161,19 @@ nfs_file_read(struct kiocb *iocb, struct iov_iter *to) struct inode *inode = file_inode(iocb->ki_filp); ssize_t result; + trace_nfs_file_read(iocb, to); + if (iocb->ki_flags & IOCB_DIRECT) - return nfs_file_direct_read(iocb, to); + return nfs_file_direct_read(iocb, to, false); dprintk("NFS: read(%pD2, %zu@%lu)\n", iocb->ki_filp, iov_iter_count(to), (unsigned long) iocb->ki_pos); - nfs_start_io_read(inode); + result = nfs_start_io_read(inode); + if (result) + return result; + result = nfs_revalidate_mapping(inode, iocb->ki_filp->f_mapping); if (!result) { result = generic_file_read_iter(iocb, to); @@ -175,103 +185,146 @@ nfs_file_read(struct kiocb *iocb, struct iov_iter *to) } EXPORT_SYMBOL_GPL(nfs_file_read); +ssize_t +nfs_file_splice_read(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe, + size_t len, unsigned int flags) +{ + struct inode *inode = file_inode(in); + ssize_t result; + + dprintk("NFS: splice_read(%pD2, %zu@%llu)\n", in, len, *ppos); + + result = nfs_start_io_read(inode); + if (result) + return result; + + result = nfs_revalidate_mapping(inode, in->f_mapping); + if (!result) { + result = filemap_splice_read(in, ppos, pipe, len, flags); + if (result > 0) + nfs_add_stats(inode, NFSIOS_NORMALREADBYTES, result); + } + nfs_end_io_read(inode); + return result; +} +EXPORT_SYMBOL_GPL(nfs_file_splice_read); + int -nfs_file_mmap(struct file * file, struct vm_area_struct * vma) +nfs_file_mmap_prepare(struct vm_area_desc *desc) { + struct file *file = desc->file; struct inode *inode = file_inode(file); int status; dprintk("NFS: mmap(%pD2)\n", file); - /* Note: generic_file_mmap() returns ENOSYS on nommu systems + /* Note: generic_file_mmap_prepare() returns ENOSYS on nommu systems * so we call that before revalidating the mapping */ - status = generic_file_mmap(file, vma); + status = generic_file_mmap_prepare(desc); if (!status) { - vma->vm_ops = &nfs_file_vm_ops; + desc->vm_ops = &nfs_file_vm_ops; status = nfs_revalidate_mapping(inode, file->f_mapping); } return status; } -EXPORT_SYMBOL_GPL(nfs_file_mmap); +EXPORT_SYMBOL_GPL(nfs_file_mmap_prepare); /* * Flush any dirty pages for this process, and check for write errors. * The return status from this call provides a reliable indication of * whether any write errors occurred for this process. - * - * Notice that it clears the NFS_CONTEXT_ERROR_WRITE before synching to - * disk, but it retrieves and clears ctx->error after synching, despite - * the two being set at the same time in nfs_context_set_write_error(). - * This is because the former is used to notify the _next_ call to - * nfs_file_write() that a write error occurred, and hence cause it to - * fall back to doing a synchronous write. */ static int -nfs_file_fsync_commit(struct file *file, loff_t start, loff_t end, int datasync) +nfs_file_fsync_commit(struct file *file, int datasync) { - struct nfs_open_context *ctx = nfs_file_open_context(file); struct inode *inode = file_inode(file); - int have_error, do_resend, status; - int ret = 0; + int ret, ret2; dprintk("NFS: fsync file(%pD2) datasync %d\n", file, datasync); nfs_inc_stats(inode, NFSIOS_VFSFSYNC); - do_resend = test_and_clear_bit(NFS_CONTEXT_RESEND_WRITES, &ctx->flags); - have_error = test_and_clear_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags); - status = nfs_commit_inode(inode, FLUSH_SYNC); - have_error |= test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags); - if (have_error) { - ret = xchg(&ctx->error, 0); - if (ret) - goto out; - } - if (status < 0) { - ret = status; - goto out; - } - do_resend |= test_bit(NFS_CONTEXT_RESEND_WRITES, &ctx->flags); - if (do_resend) - ret = -EAGAIN; -out: + ret = nfs_commit_inode(inode, FLUSH_SYNC); + ret2 = file_check_and_advance_wb_err(file); + if (ret2 < 0) + return ret2; return ret; } int nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync) { - int ret; struct inode *inode = file_inode(file); + struct nfs_inode *nfsi = NFS_I(inode); + long save_nredirtied = atomic_long_read(&nfsi->redirtied_pages); + long nredirtied; + int ret; trace_nfs_fsync_enter(inode); - do { - ret = filemap_write_and_wait_range(inode->i_mapping, start, end); + for (;;) { + ret = file_write_and_wait_range(file, start, end); if (ret != 0) break; - ret = nfs_file_fsync_commit(file, start, end, datasync); - if (!ret) - ret = pnfs_sync_inode(inode, !!datasync); - /* - * If nfs_file_fsync_commit detected a server reboot, then - * resend all dirty pages that might have been covered by - * the NFS_CONTEXT_RESEND_WRITES flag - */ - start = 0; - end = LLONG_MAX; - } while (ret == -EAGAIN); + ret = nfs_file_fsync_commit(file, datasync); + if (ret != 0) + break; + ret = pnfs_sync_inode(inode, !!datasync); + if (ret != 0) + break; + nredirtied = atomic_long_read(&nfsi->redirtied_pages); + if (nredirtied == save_nredirtied) + break; + save_nredirtied = nredirtied; + } trace_nfs_fsync_exit(inode, ret); return ret; } EXPORT_SYMBOL_GPL(nfs_file_fsync); +void nfs_truncate_last_folio(struct address_space *mapping, loff_t from, + loff_t to) +{ + struct folio *folio; + + if (from >= to) + return; + + folio = filemap_lock_folio(mapping, from >> PAGE_SHIFT); + if (IS_ERR(folio)) + return; + + if (folio_mkclean(folio)) + folio_mark_dirty(folio); + + if (folio_test_uptodate(folio)) { + loff_t fpos = folio_pos(folio); + size_t offset = from - fpos; + size_t end = folio_size(folio); + + if (to - fpos < end) + end = to - fpos; + folio_zero_segment(folio, offset, end); + trace_nfs_size_truncate_folio(mapping->host, to); + } + + folio_unlock(folio); + folio_put(folio); +} +EXPORT_SYMBOL_GPL(nfs_truncate_last_folio); + /* * Decide whether a read/modify/write cycle may be more efficient * then a modify/write/read cycle when writing to a page in the * page cache. * + * Some pNFS layout drivers can only read/write at a certain block + * granularity like all block devices and therefore we must perform + * read/modify/write whenever a page hasn't read yet and the data + * to be written there is not aligned to a block boundary and/or + * smaller than the block size. + * * The modify/write/read cycle may occur if a page is read before * being completely filled by the writer. In this situation, the * page must be completely written to stable storage on the server @@ -287,26 +340,35 @@ EXPORT_SYMBOL_GPL(nfs_file_fsync); * and that the new data won't completely replace the old data in * that range of the file. */ -static int nfs_want_read_modify_write(struct file *file, struct page *page, - loff_t pos, unsigned len) +static bool nfs_folio_is_full_write(struct folio *folio, loff_t pos, + unsigned int len) { - unsigned int pglen = nfs_page_length(page); - unsigned int offset = pos & (PAGE_SIZE - 1); + unsigned int pglen = nfs_folio_length(folio); + unsigned int offset = offset_in_folio(folio, pos); unsigned int end = offset + len; - if (pnfs_ld_read_whole_page(file->f_mapping->host)) { - if (!PageUptodate(page)) - return 1; - return 0; - } + return !pglen || (end >= pglen && !offset); +} - if ((file->f_mode & FMODE_READ) && /* open for read? */ - !PageUptodate(page) && /* Uptodate? */ - !PagePrivate(page) && /* i/o request already? */ - pglen && /* valid bytes of file? */ - (end < pglen || offset)) /* replace all valid bytes? */ - return 1; - return 0; +static bool nfs_want_read_modify_write(struct file *file, struct folio *folio, + loff_t pos, unsigned int len) +{ + /* + * Up-to-date pages, those with ongoing or full-page write + * don't need read/modify/write + */ + if (folio_test_uptodate(folio) || folio_test_private(folio) || + nfs_folio_is_full_write(folio, pos, len)) + return false; + + if (pnfs_ld_read_whole_page(file_inode(file))) + return true; + if (folio_test_dropbehind(folio)) + return false; + /* Open for reading too? */ + if (file->f_mode & FMODE_READ) + return true; + return false; } /* @@ -317,47 +379,59 @@ static int nfs_want_read_modify_write(struct file *file, struct page *page, * If the writer ends up delaying the write, the writer needs to * increment the page use counts until he is done with the page. */ -static int nfs_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned flags, - struct page **pagep, void **fsdata) +static int nfs_write_begin(const struct kiocb *iocb, + struct address_space *mapping, + loff_t pos, unsigned len, struct folio **foliop, + void **fsdata) { - int ret; - pgoff_t index = pos >> PAGE_SHIFT; - struct page *page; + struct folio *folio; + struct file *file = iocb->ki_filp; int once_thru = 0; + int ret; + + trace_nfs_write_begin(file_inode(file), pos, len); dfprintk(PAGECACHE, "NFS: write_begin(%pD2(%lu), %u@%lld)\n", file, mapping->host->i_ino, len, (long long) pos); + nfs_truncate_last_folio(mapping, i_size_read(mapping->host), pos); start: - page = grab_cache_page_write_begin(mapping, index, flags); - if (!page) - return -ENOMEM; - *pagep = page; + folio = write_begin_get_folio(iocb, mapping, pos >> PAGE_SHIFT, len); + if (IS_ERR(folio)) { + ret = PTR_ERR(folio); + goto out; + } + *foliop = folio; - ret = nfs_flush_incompatible(file, page); + ret = nfs_flush_incompatible(file, folio); if (ret) { - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); } else if (!once_thru && - nfs_want_read_modify_write(file, page, pos, len)) { + nfs_want_read_modify_write(file, folio, pos, len)) { once_thru = 1; - ret = nfs_readpage(file, page); - put_page(page); + folio_clear_dropbehind(folio); + ret = nfs_read_folio(file, folio); + folio_put(folio); if (!ret) goto start; } +out: + trace_nfs_write_begin_done(file_inode(file), pos, len, ret); return ret; } -static int nfs_write_end(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) +static int nfs_write_end(const struct kiocb *iocb, + struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct folio *folio, void *fsdata) { - unsigned offset = pos & (PAGE_SIZE - 1); + struct file *file = iocb->ki_filp; struct nfs_open_context *ctx = nfs_file_open_context(file); + unsigned offset = offset_in_folio(folio, pos); int status; + trace_nfs_write_end(file_inode(file), pos, len); dfprintk(PAGECACHE, "NFS: write_end(%pD2(%lu), %u@%lld)\n", file, mapping->host->i_ino, len, (long long) pos); @@ -365,37 +439,37 @@ static int nfs_write_end(struct file *file, struct address_space *mapping, * Zero any uninitialised parts of the page, and then mark the page * as up to date if it turns out that we're extending the file. */ - if (!PageUptodate(page)) { - unsigned pglen = nfs_page_length(page); + if (!folio_test_uptodate(folio)) { + size_t fsize = folio_size(folio); + unsigned pglen = nfs_folio_length(folio); unsigned end = offset + copied; if (pglen == 0) { - zero_user_segments(page, 0, offset, - end, PAGE_SIZE); - SetPageUptodate(page); + folio_zero_segments(folio, 0, offset, end, fsize); + folio_mark_uptodate(folio); } else if (end >= pglen) { - zero_user_segment(page, end, PAGE_SIZE); + folio_zero_segment(folio, end, fsize); if (offset == 0) - SetPageUptodate(page); + folio_mark_uptodate(folio); } else - zero_user_segment(page, pglen, PAGE_SIZE); + folio_zero_segment(folio, pglen, fsize); } - status = nfs_updatepage(file, page, offset, copied); + status = nfs_update_folio(file, folio, offset, copied); - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); - if (status < 0) + if (status < 0) { + trace_nfs_write_end_done(file_inode(file), pos, len, status); return status; + } NFS_I(mapping->host)->write_io += copied; - if (nfs_ctx_key_to_expire(ctx, mapping->host)) { - status = nfs_wb_all(mapping->host); - if (status < 0) - return status; - } + if (nfs_ctx_key_to_expire(ctx, mapping->host)) + nfs_wb_all(mapping->host); + trace_nfs_write_end_done(file_inode(file), pos, len, copied); return copied; } @@ -406,49 +480,53 @@ static int nfs_write_end(struct file *file, struct address_space *mapping, * - Called if either PG_private or PG_fscache is set on the page * - Caller holds page lock */ -static void nfs_invalidate_page(struct page *page, unsigned int offset, - unsigned int length) +static void nfs_invalidate_folio(struct folio *folio, size_t offset, + size_t length) { - dfprintk(PAGECACHE, "NFS: invalidate_page(%p, %u, %u)\n", - page, offset, length); + struct inode *inode = folio->mapping->host; + dfprintk(PAGECACHE, "NFS: invalidate_folio(%lu, %zu, %zu)\n", + folio->index, offset, length); - if (offset != 0 || length < PAGE_SIZE) - return; /* Cancel any unstarted writes on this page */ - nfs_wb_page_cancel(page_file_mapping(page)->host, page); - - nfs_fscache_invalidate_page(page, page->mapping->host); + if (offset != 0 || length < folio_size(folio)) + nfs_wb_folio(inode, folio); + else + nfs_wb_folio_cancel(inode, folio); + folio_wait_private_2(folio); /* [DEPRECATED] */ + trace_nfs_invalidate_folio(inode, folio_pos(folio) + offset, length); } /* - * Attempt to release the private state associated with a page - * - Called if either PG_private or PG_fscache is set on the page - * - Caller holds page lock - * - Return true (may release page) or false (may not) + * Attempt to release the private state associated with a folio + * - Called if either private or fscache flags are set on the folio + * - Caller holds folio lock + * - Return true (may release folio) or false (may not) */ -static int nfs_release_page(struct page *page, gfp_t gfp) +static bool nfs_release_folio(struct folio *folio, gfp_t gfp) { - dfprintk(PAGECACHE, "NFS: release_page(%p)\n", page); - - /* If PagePrivate() is set, then the page is not freeable */ - if (PagePrivate(page)) - return 0; - return nfs_fscache_release_page(page, gfp); + dfprintk(PAGECACHE, "NFS: release_folio(%p)\n", folio); + + /* If the private flag is set, then the folio is not freeable */ + if (folio_test_private(folio)) { + if ((current_gfp_context(gfp) & GFP_KERNEL) != GFP_KERNEL || + current_is_kswapd() || current_is_kcompactd()) + return false; + if (nfs_wb_folio(folio->mapping->host, folio) < 0) + return false; + } + return nfs_fscache_release_folio(folio, gfp); } -static void nfs_check_dirty_writeback(struct page *page, +static void nfs_check_dirty_writeback(struct folio *folio, bool *dirty, bool *writeback) { struct nfs_inode *nfsi; - struct address_space *mapping = page_file_mapping(page); - - if (!mapping || PageSwapCache(page)) - return; + struct address_space *mapping = folio->mapping; /* - * Check if an unstable page is currently being committed and - * if so, have the VM treat it as if the page is under writeback - * so it will not block due to pages that will shortly be freeable. + * Check if an unstable folio is currently being committed and + * if so, have the VM treat it as if the folio is under writeback + * so it will not block due to folios that will shortly be freeable. */ nfsi = NFS_I(mapping->host); if (atomic_read(&nfsi->commit_info.rpcs_out)) { @@ -457,11 +535,11 @@ static void nfs_check_dirty_writeback(struct page *page, } /* - * If PagePrivate() is set, then the page is not freeable and as the - * inode is not being committed, it's not going to be cleaned in the - * near future so treat it as dirty + * If the private flag is set, then the folio is not freeable + * and as the inode is not being committed, it's not going to + * be cleaned in the near future so treat it as dirty */ - if (PagePrivate(page)) + if (folio_test_private(folio)) *dirty = true; } @@ -473,54 +551,85 @@ static void nfs_check_dirty_writeback(struct page *page, * - Caller holds page lock * - Return 0 if successful, -error otherwise */ -static int nfs_launder_page(struct page *page) +static int nfs_launder_folio(struct folio *folio) { - struct inode *inode = page_file_mapping(page)->host; - struct nfs_inode *nfsi = NFS_I(inode); + struct inode *inode = folio->mapping->host; + int ret; - dfprintk(PAGECACHE, "NFS: launder_page(%ld, %llu)\n", - inode->i_ino, (long long)page_offset(page)); + dfprintk(PAGECACHE, "NFS: launder_folio(%ld, %llu)\n", + inode->i_ino, folio_pos(folio)); - nfs_fscache_wait_on_page_write(nfsi, page); - return nfs_wb_page(inode, page); + folio_wait_private_2(folio); /* [DEPRECATED] */ + ret = nfs_wb_folio(inode, folio); + trace_nfs_launder_folio_done(inode, folio_pos(folio), + folio_size(folio), ret); + return ret; } static int nfs_swap_activate(struct swap_info_struct *sis, struct file *file, sector_t *span) { - struct rpc_clnt *clnt = NFS_CLIENT(file->f_mapping->host); + unsigned long blocks; + long long isize; + int ret; + struct inode *inode = file_inode(file); + struct rpc_clnt *clnt = NFS_CLIENT(inode); + struct nfs_client *cl = NFS_SERVER(inode)->nfs_client; + + spin_lock(&inode->i_lock); + blocks = inode->i_blocks; + isize = inode->i_size; + spin_unlock(&inode->i_lock); + if (blocks*512 < isize) { + pr_warn("swap activate: swapfile has holes\n"); + return -EINVAL; + } + + ret = rpc_clnt_swap_activate(clnt); + if (ret) + return ret; + ret = add_swap_extent(sis, 0, sis->max, 0); + if (ret < 0) { + rpc_clnt_swap_deactivate(clnt); + return ret; + } *span = sis->pages; - return rpc_clnt_swap_activate(clnt); + if (cl->rpc_ops->enable_swap) + cl->rpc_ops->enable_swap(inode); + + sis->flags |= SWP_FS_OPS; + return ret; } static void nfs_swap_deactivate(struct file *file) { - struct rpc_clnt *clnt = NFS_CLIENT(file->f_mapping->host); + struct inode *inode = file_inode(file); + struct rpc_clnt *clnt = NFS_CLIENT(inode); + struct nfs_client *cl = NFS_SERVER(inode)->nfs_client; rpc_clnt_swap_deactivate(clnt); + if (cl->rpc_ops->disable_swap) + cl->rpc_ops->disable_swap(file_inode(file)); } const struct address_space_operations nfs_file_aops = { - .readpage = nfs_readpage, - .readpages = nfs_readpages, - .set_page_dirty = __set_page_dirty_nobuffers, - .writepage = nfs_writepage, + .read_folio = nfs_read_folio, + .readahead = nfs_readahead, + .dirty_folio = filemap_dirty_folio, .writepages = nfs_writepages, .write_begin = nfs_write_begin, .write_end = nfs_write_end, - .invalidatepage = nfs_invalidate_page, - .releasepage = nfs_release_page, - .direct_IO = nfs_direct_IO, -#ifdef CONFIG_MIGRATION - .migratepage = nfs_migrate_page, -#endif - .launder_page = nfs_launder_page, + .invalidate_folio = nfs_invalidate_folio, + .release_folio = nfs_release_folio, + .migrate_folio = nfs_migrate_folio, + .launder_folio = nfs_launder_folio, .is_dirty_writeback = nfs_check_dirty_writeback, - .error_remove_page = generic_error_remove_page, + .error_remove_folio = generic_error_remove_folio, .swap_activate = nfs_swap_activate, .swap_deactivate = nfs_swap_deactivate, + .swap_rw = nfs_swap_rw, }; /* @@ -528,46 +637,51 @@ const struct address_space_operations nfs_file_aops = { * writable, implying that someone is about to modify the page through a * shared-writable mapping */ -static int nfs_vm_page_mkwrite(struct vm_fault *vmf) +static vm_fault_t nfs_vm_page_mkwrite(struct vm_fault *vmf) { - struct page *page = vmf->page; struct file *filp = vmf->vma->vm_file; struct inode *inode = file_inode(filp); unsigned pagelen; - int ret = VM_FAULT_NOPAGE; + vm_fault_t ret = VM_FAULT_NOPAGE; struct address_space *mapping; + struct folio *folio = page_folio(vmf->page); dfprintk(PAGECACHE, "NFS: vm_page_mkwrite(%pD2(%lu), offset %lld)\n", - filp, filp->f_mapping->host->i_ino, - (long long)page_offset(page)); + filp, filp->f_mapping->host->i_ino, + (long long)folio_pos(folio)); sb_start_pagefault(inode->i_sb); /* make sure the cache has finished storing the page */ - nfs_fscache_wait_on_page_write(NFS_I(inode), page); + if (folio_test_private_2(folio) && /* [DEPRECATED] */ + folio_wait_private_2_killable(folio) < 0) { + ret = VM_FAULT_RETRY; + goto out; + } wait_on_bit_action(&NFS_I(inode)->flags, NFS_INO_INVALIDATING, - nfs_wait_bit_killable, TASK_KILLABLE); + nfs_wait_bit_killable, + TASK_KILLABLE|TASK_FREEZABLE_UNSAFE); - lock_page(page); - mapping = page_file_mapping(page); + folio_lock(folio); + mapping = folio->mapping; if (mapping != inode->i_mapping) goto out_unlock; - wait_on_page_writeback(page); + folio_wait_writeback(folio); - pagelen = nfs_page_length(page); + pagelen = nfs_folio_length(folio); if (pagelen == 0) goto out_unlock; ret = VM_FAULT_LOCKED; - if (nfs_flush_incompatible(filp, page) == 0 && - nfs_updatepage(filp, page, 0, pagelen) == 0) + if (nfs_flush_incompatible(filp, folio) == 0 && + nfs_update_folio(filp, folio, 0, pagelen) == 0) goto out; ret = VM_FAULT_SIGBUS; out_unlock: - unlock_page(page); + folio_unlock(folio); out: sb_end_pagefault(inode->i_sb); return ret; @@ -579,30 +693,23 @@ static const struct vm_operations_struct nfs_file_vm_ops = { .page_mkwrite = nfs_vm_page_mkwrite, }; -static int nfs_need_check_write(struct file *filp, struct inode *inode) -{ - struct nfs_open_context *ctx; - - ctx = nfs_file_open_context(filp); - if (test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags) || - nfs_ctx_key_to_expire(ctx, inode)) - return 1; - return 0; -} - ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); - unsigned long written = 0; - ssize_t result; + unsigned int mntflags = NFS_SERVER(inode)->flags; + ssize_t result, written; + errseq_t since; + int error; + + trace_nfs_file_write(iocb, from); result = nfs_key_timeout_notify(file, inode); if (result) return result; if (iocb->ki_flags & IOCB_DIRECT) - return nfs_file_direct_write(iocb, from); + return nfs_file_direct_write(iocb, from, false); dprintk("NFS: write(%pD2, %zu@%Ld)\n", file, iov_iter_count(from), (long long) iocb->ki_pos); @@ -612,44 +719,63 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from) /* * O_APPEND implies that we must revalidate the file length. */ - if (iocb->ki_flags & IOCB_APPEND) { + if (iocb->ki_flags & IOCB_APPEND || iocb->ki_pos > i_size_read(inode)) { result = nfs_revalidate_file_size(inode, file); if (result) - goto out; + return result; } - if (iocb->ki_pos > i_size_read(inode)) - nfs_revalidate_mapping(inode, file->f_mapping); - nfs_start_io_write(inode); + nfs_clear_invalid_mapping(file->f_mapping); + + since = filemap_sample_wb_err(file->f_mapping); + error = nfs_start_io_write(inode); + if (error) + return error; result = generic_write_checks(iocb, from); - if (result > 0) { - current->backing_dev_info = inode_to_bdi(inode); - result = generic_perform_write(file, from, iocb->ki_pos); - current->backing_dev_info = NULL; - } + if (result > 0) + result = generic_perform_write(iocb, from); nfs_end_io_write(inode); if (result <= 0) goto out; - result = generic_write_sync(iocb, result); - if (result < 0) - goto out; written = result; - iocb->ki_pos += written; + nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, written); - /* Return error values */ - if (nfs_need_check_write(file, inode)) { - int err = vfs_fsync(file, 0); - if (err < 0) - result = err; + if (mntflags & NFS_MOUNT_WRITE_EAGER) { + result = filemap_fdatawrite_range(file->f_mapping, + iocb->ki_pos - written, + iocb->ki_pos - 1); + if (result < 0) + goto out; } - nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, written); + if (mntflags & NFS_MOUNT_WRITE_WAIT) { + filemap_fdatawait_range(file->f_mapping, + iocb->ki_pos - written, + iocb->ki_pos - 1); + } + result = generic_write_sync(iocb, written); + if (result < 0) + return result; + out: + /* Return error values */ + error = filemap_check_wb_err(file->f_mapping, since); + switch (error) { + default: + break; + case -EDQUOT: + case -EFBIG: + case -ENOSPC: + nfs_wb_all(inode); + error = file_check_and_advance_wb_err(file); + if (error < 0) + result = error; + } return result; out_swapfile: printk(KERN_INFO "NFS: attempt to write to active swap file!\n"); - return -EBUSY; + return -ETXTBSY; } EXPORT_SYMBOL_GPL(nfs_file_write); @@ -658,17 +784,17 @@ do_getlk(struct file *filp, int cmd, struct file_lock *fl, int is_local) { struct inode *inode = filp->f_mapping->host; int status = 0; - unsigned int saved_type = fl->fl_type; + unsigned int saved_type = fl->c.flc_type; /* Try local locking first */ posix_test_lock(filp, fl); - if (fl->fl_type != F_UNLCK) { + if (fl->c.flc_type != F_UNLCK) { /* found a conflict */ goto out; } - fl->fl_type = saved_type; + fl->c.flc_type = saved_type; - if (NFS_PROTO(inode)->have_delegation(inode, FMODE_READ)) + if (nfs_have_read_or_write_delegation(inode)) goto out_noconflict; if (is_local) @@ -678,7 +804,7 @@ do_getlk(struct file *filp, int cmd, struct file_lock *fl, int is_local) out: return status; out_noconflict: - fl->fl_type = F_UNLCK; + fl->c.flc_type = F_UNLCK; goto out; } @@ -693,7 +819,7 @@ do_unlk(struct file *filp, int cmd, struct file_lock *fl, int is_local) * Flush all pending writes before doing anything * with locks.. */ - vfs_fsync(filp, 0); + nfs_wb_all(inode); l_ctx = nfs_get_lock_context(nfs_file_open_context(filp)); if (!IS_ERR(l_ctx)) { @@ -703,7 +829,7 @@ do_unlk(struct file *filp, int cmd, struct file_lock *fl, int is_local) * If we're signalled while cleaning up locks on process exit, we * still need to complete the unlock. */ - if (status < 0 && !(fl->fl_flags & FL_CLOSE)) + if (status < 0 && !(fl->c.flc_flags & FL_CLOSE)) return status; } @@ -744,15 +870,18 @@ do_setlk(struct file *filp, int cmd, struct file_lock *fl, int is_local) goto out; /* - * Revalidate the cache if the server has time stamps granular - * enough to detect subsecond changes. Otherwise, clear the - * cache to prevent missing any changes. + * Invalidate cache to prevent missing any changes. If + * the file is mapped, clear the page cache as well so + * those mappings will be loaded. * * This makes locking act as a cache coherency point. */ nfs_sync_mapping(filp->f_mapping); - if (!NFS_PROTO(inode)->have_delegation(inode, FMODE_READ)) + if (!nfs_have_read_or_write_delegation(inode)) { nfs_zap_caches(inode); + if (mapping_mapped(filp->f_mapping)) + nfs_revalidate_mapping(inode, filp->f_mapping); + } out: return status; } @@ -767,14 +896,13 @@ int nfs_lock(struct file *filp, int cmd, struct file_lock *fl) int is_local = 0; dprintk("NFS: lock(%pD2, t=%x, fl=%x, r=%lld:%lld)\n", - filp, fl->fl_type, fl->fl_flags, + filp, fl->c.flc_type, fl->c.flc_flags, (long long)fl->fl_start, (long long)fl->fl_end); nfs_inc_stats(inode, NFSIOS_VFSLOCK); - /* No mandatory locks over NFS */ - if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK) - goto out_err; + if (fl->c.flc_flags & FL_RECLAIM) + return -ENOGRACE; if (NFS_SERVER(inode)->flags & NFS_MOUNT_LOCAL_FCNTL) is_local = 1; @@ -787,7 +915,7 @@ int nfs_lock(struct file *filp, int cmd, struct file_lock *fl) if (IS_GETLK(cmd)) ret = do_getlk(filp, cmd, fl, is_local); - else if (fl->fl_type == F_UNLCK) + else if (lock_is_unlock(fl)) ret = do_unlk(filp, cmd, fl, is_local); else ret = do_setlk(filp, cmd, fl, is_local); @@ -805,40 +933,17 @@ int nfs_flock(struct file *filp, int cmd, struct file_lock *fl) int is_local = 0; dprintk("NFS: flock(%pD2, t=%x, fl=%x)\n", - filp, fl->fl_type, fl->fl_flags); + filp, fl->c.flc_type, fl->c.flc_flags); - if (!(fl->fl_flags & FL_FLOCK)) + if (!(fl->c.flc_flags & FL_FLOCK)) return -ENOLCK; - /* - * The NFSv4 protocol doesn't support LOCK_MAND, which is not part of - * any standard. In principle we might be able to support LOCK_MAND - * on NFSv2/3 since NLMv3/4 support DOS share modes, but for now the - * NFS code is not set up for it. - */ - if (fl->fl_type & LOCK_MAND) - return -EINVAL; - if (NFS_SERVER(inode)->flags & NFS_MOUNT_LOCAL_FLOCK) is_local = 1; - /* - * VFS doesn't require the open mode to match a flock() lock's type. - * NFS, however, may simulate flock() locking with posix locking which - * requires the open mode to match the lock type. - */ - switch (fl->fl_type) { - case F_UNLCK: + /* We're simulating flock() locks using posix locks on the server */ + if (lock_is_unlock(fl)) return do_unlk(filp, cmd, fl, is_local); - case F_RDLCK: - if (!(filp->f_mode & FMODE_READ)) - return -EBADF; - break; - case F_WRLCK: - if (!(filp->f_mode & FMODE_WRITE)) - return -EBADF; - } - return do_setlk(filp, cmd, fl, is_local); } EXPORT_SYMBOL_GPL(nfs_flock); @@ -847,16 +952,17 @@ const struct file_operations nfs_file_operations = { .llseek = nfs_file_llseek, .read_iter = nfs_file_read, .write_iter = nfs_file_write, - .mmap = nfs_file_mmap, + .mmap_prepare = nfs_file_mmap_prepare, .open = nfs_file_open, .flush = nfs_file_flush, .release = nfs_file_release, .fsync = nfs_file_fsync, .lock = nfs_lock, .flock = nfs_flock, - .splice_read = generic_file_splice_read, + .splice_read = nfs_file_splice_read, .splice_write = iter_file_splice_write, .check_flags = nfs_check_flags, .setlease = simple_nosetlease, + .fop_flags = FOP_DONTCACHE, }; EXPORT_SYMBOL_GPL(nfs_file_operations); diff --git a/fs/nfs/filelayout/Makefile b/fs/nfs/filelayout/Makefile index 8516cdffb9e9..de056312d374 100644 --- a/fs/nfs/filelayout/Makefile +++ b/fs/nfs/filelayout/Makefile @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only # # Makefile for the pNFS Files Layout Driver kernel module # diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c index 44c638b7876c..5c4551117c58 100644 --- a/fs/nfs/filelayout/filelayout.c +++ b/fs/nfs/filelayout/filelayout.c @@ -49,6 +49,7 @@ MODULE_AUTHOR("Dean Hildebrand <dhildebz@umich.edu>"); MODULE_DESCRIPTION("The NFSv4 file layout driver"); #define FILELAYOUT_POLL_RETRY_MAX (15*HZ) +static const struct pnfs_commit_ops filelayout_commit_ops; static loff_t filelayout_get_dense_offset(struct nfs4_filelayout_segment *flseg, @@ -180,13 +181,15 @@ static int filelayout_async_handle_error(struct rpc_task *task, case -EIO: case -ETIMEDOUT: case -EPIPE: + case -EPROTO: + case -ENODEV: dprintk("%s DS connection error %d\n", __func__, task->tk_status); nfs4_mark_deviceid_unavailable(devid); pnfs_error_mark_layout_for_return(inode, lseg); pnfs_set_lo_fail(lseg); rpc_wake_up(&tbl->slot_tbl_waitq); - /* fall through */ + fallthrough; default: reset: dprintk("%s Retry through MDS. Error %d\n", __func__, @@ -292,8 +295,6 @@ static void filelayout_read_call_done(struct rpc_task *task, void *data) { struct nfs_pgio_header *hdr = data; - dprintk("--> %s task->tk_status %d\n", __func__, task->tk_status); - if (test_bit(NFS_IOHDR_REDO, &hdr->flags) && task->tk_status == 0) { nfs41_sequence_done(task, &hdr->res.seq_res); @@ -471,10 +472,10 @@ filelayout_read_pagelist(struct nfs_pgio_header *hdr) return PNFS_NOT_ATTEMPTED; dprintk("%s USE DS: %s cl_count %d\n", __func__, - ds->ds_remotestr, atomic_read(&ds->ds_clp->cl_count)); + ds->ds_remotestr, refcount_read(&ds->ds_clp->cl_count)); /* No multipath support. Use first DS */ - atomic_inc(&ds->ds_clp->cl_count); + refcount_inc(&ds->ds_clp->cl_count); hdr->ds_clp = ds->ds_clp; hdr->ds_commit_idx = idx; fh = nfs4_fl_select_ds_fh(lseg, j); @@ -487,7 +488,7 @@ filelayout_read_pagelist(struct nfs_pgio_header *hdr) /* Perform an asynchronous read to ds */ nfs_initiate_pgio(ds_clnt, hdr, hdr->cred, NFS_PROTO(hdr->inode), &filelayout_read_call_ops, - 0, RPC_TASK_SOFTCONN); + 0, RPC_TASK_SOFTCONN, NULL); return PNFS_ATTEMPTED; } @@ -515,10 +516,10 @@ filelayout_write_pagelist(struct nfs_pgio_header *hdr, int sync) dprintk("%s ino %lu sync %d req %zu@%llu DS: %s cl_count %d\n", __func__, hdr->inode->i_ino, sync, (size_t) hdr->args.count, - offset, ds->ds_remotestr, atomic_read(&ds->ds_clp->cl_count)); + offset, ds->ds_remotestr, refcount_read(&ds->ds_clp->cl_count)); hdr->pgio_done_cb = filelayout_write_done_cb; - atomic_inc(&ds->ds_clp->cl_count); + refcount_inc(&ds->ds_clp->cl_count); hdr->ds_clp = ds->ds_clp; hdr->ds_commit_idx = idx; fh = nfs4_fl_select_ds_fh(lseg, j); @@ -529,7 +530,7 @@ filelayout_write_pagelist(struct nfs_pgio_header *hdr, int sync) /* Perform an asynchronous write */ nfs_initiate_pgio(ds_clnt, hdr, hdr->cred, NFS_PROTO(hdr->inode), &filelayout_write_call_ops, - sync, RPC_TASK_SOFTCONN); + sync, RPC_TASK_SOFTCONN, NULL); return PNFS_ATTEMPTED; } @@ -604,14 +605,6 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo, dprintk("--> %s\n", __func__); - /* FIXME: remove this check when layout segment support is added */ - if (lgr->range.offset != 0 || - lgr->range.length != NFS4_MAX_UINT64) { - dprintk("%s Only whole file layouts supported. Use MDS i/o\n", - __func__); - goto out; - } - if (fl->pattern_offset > lgr->range.offset) { dprintk("%s pattern_offset %lld too large\n", __func__, fl->pattern_offset); @@ -653,19 +646,19 @@ filelayout_decode_layout(struct pnfs_layout_hdr *flo, { struct xdr_stream stream; struct xdr_buf buf; - struct page *scratch; + struct folio *scratch; __be32 *p; uint32_t nfl_util; int i; dprintk("%s: set_layout_map Begin\n", __func__); - scratch = alloc_page(gfp_flags); + scratch = folio_alloc(gfp_flags, 0); if (!scratch) return -ENOMEM; xdr_init_decode_pages(&stream, &buf, lgr->layoutp->pages, lgr->layoutp->len); - xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE); + xdr_set_scratch_folio(&stream, scratch); /* 20 = ufl_util (4), first_stripe_index (4), pattern_offset (8), * num_fh (4) */ @@ -717,7 +710,7 @@ filelayout_decode_layout(struct pnfs_layout_hdr *flo, if (unlikely(!p)) goto out_err; fl->fh_array[i]->size = be32_to_cpup(p++); - if (sizeof(struct nfs_fh) < fl->fh_array[i]->size) { + if (fl->fh_array[i]->size > NFS_MAXFHSIZE) { printk(KERN_ERR "NFS: Too big fh %d received %d\n", i, fl->fh_array[i]->size); goto out_err; @@ -731,11 +724,11 @@ filelayout_decode_layout(struct pnfs_layout_hdr *flo, fl->fh_array[i]->size); } - __free_page(scratch); + folio_put(scratch); return 0; out_err: - __free_page(scratch); + folio_put(scratch); return -EIO; } @@ -745,76 +738,22 @@ filelayout_free_lseg(struct pnfs_layout_segment *lseg) struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg); dprintk("--> %s\n", __func__); - nfs4_fl_put_deviceid(fl->dsaddr); + if (fl->dsaddr != NULL) + nfs4_fl_put_deviceid(fl->dsaddr); /* This assumes a single RW lseg */ if (lseg->pls_range.iomode == IOMODE_RW) { struct nfs4_filelayout *flo; + struct inode *inode; flo = FILELAYOUT_FROM_HDR(lseg->pls_layout); - flo->commit_info.nbuckets = 0; - kfree(flo->commit_info.buckets); - flo->commit_info.buckets = NULL; + inode = flo->generic_hdr.plh_inode; + spin_lock(&inode->i_lock); + pnfs_generic_ds_cinfo_release_lseg(&flo->commit_info, lseg); + spin_unlock(&inode->i_lock); } _filelayout_free_lseg(fl); } -static int -filelayout_alloc_commit_info(struct pnfs_layout_segment *lseg, - struct nfs_commit_info *cinfo, - gfp_t gfp_flags) -{ - struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg); - struct pnfs_commit_bucket *buckets; - int size, i; - - if (fl->commit_through_mds) - return 0; - - size = (fl->stripe_type == STRIPE_SPARSE) ? - fl->dsaddr->ds_num : fl->dsaddr->stripe_count; - - if (cinfo->ds->nbuckets >= size) { - /* This assumes there is only one IOMODE_RW lseg. What - * we really want to do is have a layout_hdr level - * dictionary of <multipath_list4, fh> keys, each - * associated with a struct list_head, populated by calls - * to filelayout_write_pagelist(). - * */ - return 0; - } - - buckets = kcalloc(size, sizeof(struct pnfs_commit_bucket), - gfp_flags); - if (!buckets) - return -ENOMEM; - for (i = 0; i < size; i++) { - INIT_LIST_HEAD(&buckets[i].written); - INIT_LIST_HEAD(&buckets[i].committing); - /* mark direct verifier as unset */ - buckets[i].direct_verf.committed = NFS_INVALID_STABLE_HOW; - } - - spin_lock(&cinfo->inode->i_lock); - if (cinfo->ds->nbuckets >= size) - goto out; - for (i = 0; i < cinfo->ds->nbuckets; i++) { - list_splice(&cinfo->ds->buckets[i].written, - &buckets[i].written); - list_splice(&cinfo->ds->buckets[i].committing, - &buckets[i].committing); - buckets[i].direct_verf.committed = - cinfo->ds->buckets[i].direct_verf.committed; - buckets[i].wlseg = cinfo->ds->buckets[i].wlseg; - buckets[i].clseg = cinfo->ds->buckets[i].clseg; - } - swap(cinfo->ds->buckets, buckets); - cinfo->ds->nbuckets = size; -out: - spin_unlock(&cinfo->inode->i_lock); - kfree(buckets); - return 0; -} - static struct pnfs_layout_segment * filelayout_alloc_lseg(struct pnfs_layout_hdr *layoutid, struct nfs4_layoutget_res *lgr, @@ -836,6 +775,12 @@ filelayout_alloc_lseg(struct pnfs_layout_hdr *layoutid, return &fl->generic_hdr; } +static bool +filelayout_lseg_is_striped(const struct nfs4_filelayout_segment *flseg) +{ + return flseg->num_fh > 1; +} + /* * filelayout_pg_test(). Called by nfs_can_coalesce_requests() * @@ -856,6 +801,8 @@ filelayout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, size = pnfs_generic_pg_test(pgio, prev, req); if (!size) return 0; + else if (!filelayout_lseg_is_striped(FILELAYOUT_LSEG(pgio->pg_lseg))) + return size; /* see if req and prev are in the same stripe */ if (prev) { @@ -894,9 +841,12 @@ fl_pnfs_update_layout(struct inode *ino, lseg = pnfs_update_layout(ino, ctx, pos, count, iomode, strict_iomode, gfp_flags); - if (!lseg) - lseg = ERR_PTR(-ENOMEM); - if (IS_ERR(lseg)) + if (IS_ERR(lseg)) { + /* Fall back to MDS on recoverable errors */ + if (!nfs_error_is_fatal_on_server(PTR_ERR(lseg))) + lseg = NULL; + goto out; + } else if (!lseg) goto out; lo = NFS_I(ino)->layout; @@ -904,8 +854,10 @@ fl_pnfs_update_layout(struct inode *ino, status = filelayout_check_deviceid(lo, fl, gfp_flags); if (status) { + pnfs_error_mark_layout_for_return(ino, lseg); + pnfs_set_lo_fail(lseg); pnfs_put_lseg(lseg); - lseg = ERR_PTR(status); + lseg = NULL; } out: return lseg; @@ -915,15 +867,15 @@ static void filelayout_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) { - pnfs_generic_pg_check_layout(pgio); + pnfs_generic_pg_check_layout(pgio, req); if (!pgio->pg_lseg) { pgio->pg_lseg = fl_pnfs_update_layout(pgio->pg_inode, - req->wb_context, - 0, - NFS4_MAX_UINT64, + nfs_req_openctx(req), + req_offset(req), + req->wb_bytes, IOMODE_READ, false, - GFP_KERNEL); + nfs_io_gfp_mask()); if (IS_ERR(pgio->pg_lseg)) { pgio->pg_error = PTR_ERR(pgio->pg_lseg); pgio->pg_lseg = NULL; @@ -939,18 +891,15 @@ static void filelayout_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) { - struct nfs_commit_info cinfo; - int status; - - pnfs_generic_pg_check_layout(pgio); + pnfs_generic_pg_check_layout(pgio, req); if (!pgio->pg_lseg) { pgio->pg_lseg = fl_pnfs_update_layout(pgio->pg_inode, - req->wb_context, - 0, - NFS4_MAX_UINT64, + nfs_req_openctx(req), + req_offset(req), + req->wb_bytes, IOMODE_RW, false, - GFP_NOFS); + nfs_io_gfp_mask()); if (IS_ERR(pgio->pg_lseg)) { pgio->pg_error = PTR_ERR(pgio->pg_lseg); pgio->pg_lseg = NULL; @@ -960,17 +909,7 @@ filelayout_pg_init_write(struct nfs_pageio_descriptor *pgio, /* If no lseg, fall back to write through mds */ if (pgio->pg_lseg == NULL) - goto out_mds; - nfs_init_cinfo(&cinfo, pgio->pg_inode, pgio->pg_dreq); - status = filelayout_alloc_commit_info(pgio->pg_lseg, &cinfo, GFP_NOFS); - if (status < 0) { - pnfs_put_lseg(pgio->pg_lseg); - pgio->pg_lseg = NULL; - goto out_mds; - } - return; -out_mds: - nfs_pageio_reset_write_mds(pgio); + nfs_pageio_reset_write_mds(pgio); } static const struct nfs_pageio_ops filelayout_pg_read_ops = { @@ -1063,52 +1002,22 @@ static int filelayout_initiate_commit(struct nfs_commit_data *data, int how) goto out_err; dprintk("%s ino %lu, how %d cl_count %d\n", __func__, - data->inode->i_ino, how, atomic_read(&ds->ds_clp->cl_count)); + data->inode->i_ino, how, refcount_read(&ds->ds_clp->cl_count)); data->commit_done_cb = filelayout_commit_done_cb; - atomic_inc(&ds->ds_clp->cl_count); + refcount_inc(&ds->ds_clp->cl_count); data->ds_clp = ds->ds_clp; fh = select_ds_fh_from_commit(lseg, data->ds_commit_index); if (fh) data->args.fh = fh; return nfs_initiate_commit(ds_clnt, data, NFS_PROTO(data->inode), &filelayout_commit_call_ops, how, - RPC_TASK_SOFTCONN); + RPC_TASK_SOFTCONN, NULL); out_err: pnfs_generic_prepare_to_resend_writes(data); pnfs_generic_commit_release(data); return -EAGAIN; } -/* filelayout_search_commit_reqs - Search lists in @cinfo for the head reqest - * for @page - * @cinfo - commit info for current inode - * @page - page to search for matching head request - * - * Returns a the head request if one is found, otherwise returns NULL. - */ -static struct nfs_page * -filelayout_search_commit_reqs(struct nfs_commit_info *cinfo, struct page *page) -{ - struct nfs_page *freq, *t; - struct pnfs_commit_bucket *b; - int i; - - /* Linearly search the commit lists for each bucket until a matching - * request is found */ - for (i = 0, b = cinfo->ds->buckets; i < cinfo->ds->nbuckets; i++, b++) { - list_for_each_entry_safe(freq, t, &b->written, wb_list) { - if (freq->wb_page == page) - return freq->wb_head; - } - list_for_each_entry_safe(freq, t, &b->committing, wb_list) { - if (freq->wb_page == page) - return freq->wb_head; - } - } - - return NULL; -} - static int filelayout_commit_pagelist(struct inode *inode, struct list_head *mds_pages, int how, struct nfs_commit_info *cinfo) @@ -1141,13 +1050,17 @@ filelayout_alloc_layout_hdr(struct inode *inode, gfp_t gfp_flags) struct nfs4_filelayout *flo; flo = kzalloc(sizeof(*flo), gfp_flags); - return flo != NULL ? &flo->generic_hdr : NULL; + if (flo == NULL) + return NULL; + pnfs_init_ds_commit_info(&flo->commit_info); + flo->commit_info.ops = &filelayout_commit_ops; + return &flo->generic_hdr; } static void filelayout_free_layout_hdr(struct pnfs_layout_hdr *lo) { - kfree(FILELAYOUT_FROM_HDR(lo)); + kfree_rcu(FILELAYOUT_FROM_HDR(lo), generic_hdr.plh_rcu); } static struct pnfs_ds_commit_info * @@ -1161,10 +1074,51 @@ filelayout_get_ds_info(struct inode *inode) return &FILELAYOUT_FROM_HDR(layout)->commit_info; } +static void +filelayout_setup_ds_info(struct pnfs_ds_commit_info *fl_cinfo, + struct pnfs_layout_segment *lseg) +{ + struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg); + struct inode *inode = lseg->pls_layout->plh_inode; + struct pnfs_commit_array *array, *new; + unsigned int size = (fl->stripe_type == STRIPE_SPARSE) ? + fl->dsaddr->ds_num : fl->dsaddr->stripe_count; + + new = pnfs_alloc_commit_array(size, nfs_io_gfp_mask()); + if (new) { + spin_lock(&inode->i_lock); + array = pnfs_add_commit_array(fl_cinfo, new, lseg); + spin_unlock(&inode->i_lock); + if (array != new) + pnfs_free_commit_array(new); + } +} + +static void +filelayout_release_ds_info(struct pnfs_ds_commit_info *fl_cinfo, + struct inode *inode) +{ + spin_lock(&inode->i_lock); + pnfs_generic_ds_cinfo_destroy(fl_cinfo); + spin_unlock(&inode->i_lock); +} + +static const struct pnfs_commit_ops filelayout_commit_ops = { + .setup_ds_info = filelayout_setup_ds_info, + .release_ds_info = filelayout_release_ds_info, + .mark_request_commit = filelayout_mark_request_commit, + .clear_request_commit = pnfs_generic_clear_request_commit, + .scan_commit_lists = pnfs_generic_scan_commit_lists, + .recover_commit_reqs = pnfs_generic_recover_commit_reqs, + .commit_pagelist = filelayout_commit_pagelist, +}; + static struct pnfs_layoutdriver_type filelayout_type = { .id = LAYOUT_NFSV4_1_FILES, .name = "LAYOUT_NFSV4_1_FILES", .owner = THIS_MODULE, + .flags = PNFS_LAYOUTGET_ON_OPEN, + .max_layoutget_response = 4096, /* 1 page or so... */ .alloc_layout_hdr = filelayout_alloc_layout_hdr, .free_layout_hdr = filelayout_free_layout_hdr, .alloc_lseg = filelayout_alloc_lseg, @@ -1172,12 +1126,6 @@ static struct pnfs_layoutdriver_type filelayout_type = { .pg_read_ops = &filelayout_pg_read_ops, .pg_write_ops = &filelayout_pg_write_ops, .get_ds_info = &filelayout_get_ds_info, - .mark_request_commit = filelayout_mark_request_commit, - .clear_request_commit = pnfs_generic_clear_request_commit, - .scan_commit_lists = pnfs_generic_scan_commit_lists, - .recover_commit_reqs = pnfs_generic_recover_commit_reqs, - .search_commit_reqs = filelayout_search_commit_reqs, - .commit_pagelist = filelayout_commit_pagelist, .read_pagelist = filelayout_read_pagelist, .write_pagelist = filelayout_write_pagelist, .alloc_deviceid_node = filelayout_alloc_deviceid_node, diff --git a/fs/nfs/filelayout/filelayout.h b/fs/nfs/filelayout/filelayout.h index 79323b5dab0c..c7bb5da93307 100644 --- a/fs/nfs/filelayout/filelayout.h +++ b/fs/nfs/filelayout/filelayout.h @@ -51,7 +51,7 @@ struct nfs4_file_layout_dsaddr { u32 stripe_count; u8 *stripe_indices; u32 ds_num; - struct nfs4_pnfs_ds *ds_list[1]; + struct nfs4_pnfs_ds *ds_list[] __counted_by(ds_num); }; struct nfs4_filelayout_segment { diff --git a/fs/nfs/filelayout/filelayoutdev.c b/fs/nfs/filelayout/filelayoutdev.c index d913e818858f..df79aeb68db4 100644 --- a/fs/nfs/filelayout/filelayoutdev.c +++ b/fs/nfs/filelayout/filelayoutdev.c @@ -35,6 +35,7 @@ #include "../internal.h" #include "../nfs4session.h" #include "filelayout.h" +#include "../nfs4trace.h" #define NFSDBG_FACILITY NFSDBG_PNFS_LD @@ -72,17 +73,18 @@ nfs4_fl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, struct nfs4_file_layout_dsaddr *dsaddr = NULL; struct xdr_stream stream; struct xdr_buf buf; - struct page *scratch; + struct folio *scratch; struct list_head dsaddrs; struct nfs4_pnfs_ds_addr *da; + struct net *net = server->nfs_client->cl_net; /* set up xdr stream */ - scratch = alloc_page(gfp_flags); + scratch = folio_alloc(gfp_flags, 0); if (!scratch) goto out_err; xdr_init_decode_pages(&stream, &buf, pdev->pages, pdev->pglen); - xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE); + xdr_set_scratch_folio(&stream, scratch); /* Get the stripe count (number of stripe index) */ p = xdr_inline_decode(&stream, 4); @@ -136,9 +138,7 @@ nfs4_fl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, goto out_err_free_stripe_indices; } - dsaddr = kzalloc(sizeof(*dsaddr) + - (sizeof(struct nfs4_pnfs_ds *) * (num - 1)), - gfp_flags); + dsaddr = kzalloc(struct_size(dsaddr, ds_list, num), gfp_flags); if (!dsaddr) goto out_err_free_stripe_indices; @@ -160,8 +160,7 @@ nfs4_fl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, mp_count = be32_to_cpup(p); /* multipath count */ for (j = 0; j < mp_count; j++) { - da = nfs4_decode_mp_ds_addr(server->nfs_client->cl_net, - &stream, gfp_flags); + da = nfs4_decode_mp_ds_addr(net, &stream, gfp_flags); if (da) list_add_tail(&da->da_node, &dsaddrs); } @@ -171,9 +170,10 @@ nfs4_fl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, goto out_err_free_deviceid; } - dsaddr->ds_list[i] = nfs4_pnfs_ds_add(&dsaddrs, gfp_flags); + dsaddr->ds_list[i] = nfs4_pnfs_ds_add(net, &dsaddrs, gfp_flags); if (!dsaddr->ds_list[i]) goto out_err_drain_dsaddrs; + trace_fl_getdevinfo(server, &pdev->dev_id, dsaddr->ds_list[i]->ds_remotestr); /* If DS was already in cache, free ds addrs */ while (!list_empty(&dsaddrs)) { @@ -186,7 +186,7 @@ nfs4_fl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, } } - __free_page(scratch); + folio_put(scratch); return dsaddr; out_err_drain_dsaddrs: @@ -204,7 +204,7 @@ out_err_free_deviceid: out_err_free_stripe_indices: kfree(stripe_indices); out_err_free_scratch: - __free_page(scratch); + folio_put(scratch); out_err: dprintk("%s ERROR: returning NULL\n", __func__); return NULL; diff --git a/fs/nfs/flexfilelayout/Makefile b/fs/nfs/flexfilelayout/Makefile index 1d2c9f6bbcd4..49f03422b6ad 100644 --- a/fs/nfs/flexfilelayout/Makefile +++ b/fs/nfs/flexfilelayout/Makefile @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only # # Makefile for the pNFS Flexfile Layout Driver kernel module # diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index b0fa83a60754..9056f05a67dc 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Module for pnfs flexfile layout driver. * @@ -7,8 +8,11 @@ */ #include <linux/nfs_fs.h> +#include <linux/nfs_mount.h> #include <linux/nfs_page.h> #include <linux/module.h> +#include <linux/file.h> +#include <linux/sched/mm.h> #include <linux/sunrpc/metrics.h> @@ -27,17 +31,23 @@ #define FF_LAYOUT_POLL_RETRY_MAX (15*HZ) #define FF_LAYOUTRETURN_MAXERR 20 +enum nfs4_ff_op_type { + NFS4_FF_OP_LAYOUTSTATS, + NFS4_FF_OP_LAYOUTRETURN, +}; -static struct group_info *ff_zero_group; +static unsigned short io_maxretrans; +static const struct pnfs_commit_ops ff_layout_commit_ops; static void ff_layout_read_record_layoutstats_done(struct rpc_task *task, struct nfs_pgio_header *hdr); -static int ff_layout_mirror_prepare_stats(struct pnfs_layout_hdr *lo, +static int +ff_layout_mirror_prepare_stats(struct pnfs_layout_hdr *lo, struct nfs42_layoutstat_devinfo *devinfo, - int dev_limit); + int dev_limit, enum nfs4_ff_op_type type); static void ff_layout_encode_ff_layoutupdate(struct xdr_stream *xdr, const struct nfs42_layoutstat_devinfo *devinfo, - struct nfs4_ff_layout_mirror *mirror); + struct nfs4_ff_layout_ds_stripe *dss_info); static struct pnfs_layout_hdr * ff_layout_alloc_layout_hdr(struct inode *inode, gfp_t gfp_flags) @@ -46,9 +56,11 @@ ff_layout_alloc_layout_hdr(struct inode *inode, gfp_t gfp_flags) ffl = kzalloc(sizeof(*ffl), gfp_flags); if (ffl) { + pnfs_init_ds_commit_info(&ffl->commit_info); INIT_LIST_HEAD(&ffl->error_list); INIT_LIST_HEAD(&ffl->mirrors); ffl->last_report_time = ktime_get(); + ffl->commit_info.ops = &ff_layout_commit_ops; return &ffl->generic_hdr; } else return NULL; @@ -57,14 +69,14 @@ ff_layout_alloc_layout_hdr(struct inode *inode, gfp_t gfp_flags) static void ff_layout_free_layout_hdr(struct pnfs_layout_hdr *lo) { + struct nfs4_flexfile_layout *ffl = FF_LAYOUT_FROM_HDR(lo); struct nfs4_ff_layout_ds_err *err, *n; - list_for_each_entry_safe(err, n, &FF_LAYOUT_FROM_HDR(lo)->error_list, - list) { + list_for_each_entry_safe(err, n, &ffl->error_list, list) { list_del(&err->list); kfree(err); } - kfree(FF_LAYOUT_FROM_HDR(lo)); + kfree_rcu(ffl, generic_hdr.plh_rcu); } static int decode_pnfs_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid) @@ -101,7 +113,7 @@ static int decode_nfs_fh(struct xdr_stream *xdr, struct nfs_fh *fh) if (unlikely(!p)) return -ENOBUFS; fh->size = be32_to_cpup(p++); - if (fh->size > sizeof(struct nfs_fh)) { + if (fh->size > NFS_MAXFHSIZE) { printk(KERN_ERR "NFS flexfiles: Too big fh received %d\n", fh->size); return -EOVERFLOW; @@ -151,18 +163,33 @@ decode_name(struct xdr_stream *xdr, u32 *id) return 0; } -static bool ff_mirror_match_fh(const struct nfs4_ff_layout_mirror *m1, - const struct nfs4_ff_layout_mirror *m2) +static struct nfsd_file * +ff_local_open_fh(struct pnfs_layout_segment *lseg, u32 ds_idx, u32 dss_id, + struct nfs_client *clp, const struct cred *cred, + struct nfs_fh *fh, fmode_t mode) +{ +#if IS_ENABLED(CONFIG_NFS_LOCALIO) + struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, ds_idx); + + return nfs_local_open_fh(clp, cred, fh, &mirror->dss[dss_id].nfl, mode); +#else + return NULL; +#endif +} + +static bool ff_dss_match_fh(const struct nfs4_ff_layout_ds_stripe *dss1, + const struct nfs4_ff_layout_ds_stripe *dss2) { int i, j; - if (m1->fh_versions_cnt != m2->fh_versions_cnt) + if (dss1->fh_versions_cnt != dss2->fh_versions_cnt) return false; - for (i = 0; i < m1->fh_versions_cnt; i++) { + + for (i = 0; i < dss1->fh_versions_cnt; i++) { bool found_fh = false; - for (j = 0; j < m2->fh_versions_cnt; j++) { - if (nfs_compare_fh(&m1->fh_versions[i], - &m2->fh_versions[j]) == 0) { + for (j = 0; j < dss2->fh_versions_cnt; j++) { + if (nfs_compare_fh(&dss1->fh_versions[i], + &dss2->fh_versions[j]) == 0) { found_fh = true; break; } @@ -173,6 +200,38 @@ static bool ff_mirror_match_fh(const struct nfs4_ff_layout_mirror *m1, return true; } +static bool ff_mirror_match_fh(const struct nfs4_ff_layout_mirror *m1, + const struct nfs4_ff_layout_mirror *m2) +{ + u32 dss_id; + + if (m1->dss_count != m2->dss_count) + return false; + + for (dss_id = 0; dss_id < m1->dss_count; dss_id++) + if (!ff_dss_match_fh(&m1->dss[dss_id], &m2->dss[dss_id])) + return false; + + return true; +} + +static bool ff_mirror_match_devid(const struct nfs4_ff_layout_mirror *m1, + const struct nfs4_ff_layout_mirror *m2) +{ + u32 dss_id; + + if (m1->dss_count != m2->dss_count) + return false; + + for (dss_id = 0; dss_id < m1->dss_count; dss_id++) + if (memcmp(&m1->dss[dss_id].devid, + &m2->dss[dss_id].devid, + sizeof(m1->dss[dss_id].devid)) != 0) + return false; + + return true; +} + static struct nfs4_ff_layout_mirror * ff_layout_add_mirror(struct pnfs_layout_hdr *lo, struct nfs4_ff_layout_mirror *mirror) @@ -183,11 +242,11 @@ ff_layout_add_mirror(struct pnfs_layout_hdr *lo, spin_lock(&inode->i_lock); list_for_each_entry(pos, &ff_layout->mirrors, mirrors) { - if (memcmp(&mirror->devid, &pos->devid, sizeof(pos->devid)) != 0) + if (!ff_mirror_match_devid(mirror, pos)) continue; if (!ff_mirror_match_fh(mirror, pos)) continue; - if (atomic_inc_not_zero(&pos->ref)) { + if (refcount_inc_not_zero(&pos->ref)) { spin_unlock(&inode->i_lock); return pos; } @@ -211,73 +270,67 @@ ff_layout_remove_mirror(struct nfs4_ff_layout_mirror *mirror) mirror->layout = NULL; } -static struct nfs4_ff_layout_mirror *ff_layout_alloc_mirror(gfp_t gfp_flags) +static struct nfs4_ff_layout_mirror *ff_layout_alloc_mirror(u32 dss_count, + gfp_t gfp_flags) { struct nfs4_ff_layout_mirror *mirror; mirror = kzalloc(sizeof(*mirror), gfp_flags); - if (mirror != NULL) { - spin_lock_init(&mirror->lock); - atomic_set(&mirror->ref, 1); - INIT_LIST_HEAD(&mirror->mirrors); + if (mirror == NULL) + return NULL; + + spin_lock_init(&mirror->lock); + refcount_set(&mirror->ref, 1); + INIT_LIST_HEAD(&mirror->mirrors); + + mirror->dss_count = dss_count; + mirror->dss = + kcalloc(dss_count, sizeof(struct nfs4_ff_layout_ds_stripe), + gfp_flags); + if (mirror->dss == NULL) { + kfree(mirror); + return NULL; } + + for (u32 dss_id = 0; dss_id < mirror->dss_count; dss_id++) + nfs_localio_file_init(&mirror->dss[dss_id].nfl); + return mirror; } static void ff_layout_free_mirror(struct nfs4_ff_layout_mirror *mirror) { - struct rpc_cred *cred; + const struct cred *cred; + u32 dss_id; ff_layout_remove_mirror(mirror); - kfree(mirror->fh_versions); - cred = rcu_access_pointer(mirror->ro_cred); - if (cred) - put_rpccred(cred); - cred = rcu_access_pointer(mirror->rw_cred); - if (cred) - put_rpccred(cred); - nfs4_ff_layout_put_deviceid(mirror->mirror_ds); + + for (dss_id = 0; dss_id < mirror->dss_count; dss_id++) { + kfree(mirror->dss[dss_id].fh_versions); + cred = rcu_access_pointer(mirror->dss[dss_id].ro_cred); + put_cred(cred); + cred = rcu_access_pointer(mirror->dss[dss_id].rw_cred); + put_cred(cred); + nfs_close_local_fh(&mirror->dss[dss_id].nfl); + nfs4_ff_layout_put_deviceid(mirror->dss[dss_id].mirror_ds); + } + + kfree(mirror->dss); kfree(mirror); } static void ff_layout_put_mirror(struct nfs4_ff_layout_mirror *mirror) { - if (mirror != NULL && atomic_dec_and_test(&mirror->ref)) + if (mirror != NULL && refcount_dec_and_test(&mirror->ref)) ff_layout_free_mirror(mirror); } static void ff_layout_free_mirror_array(struct nfs4_ff_layout_segment *fls) { - int i; - - if (fls->mirror_array) { - for (i = 0; i < fls->mirror_array_cnt; i++) { - /* normally mirror_ds is freed in - * .free_deviceid_node but we still do it here - * for .alloc_lseg error path */ - ff_layout_put_mirror(fls->mirror_array[i]); - } - kfree(fls->mirror_array); - fls->mirror_array = NULL; - } -} - -static int ff_layout_check_layout(struct nfs4_layoutget_res *lgr) -{ - int ret = 0; + u32 i; - dprintk("--> %s\n", __func__); - - /* FIXME: remove this check when layout segment support is added */ - if (lgr->range.offset != 0 || - lgr->range.length != NFS4_MAX_UINT64) { - dprintk("%s Only whole file layouts supported. Use MDS i/o\n", - __func__); - ret = -EINVAL; - } - - dprintk("--> %s returns %d\n", __func__, ret); - return ret; + for (i = 0; i < fls->mirror_array_cnt; i++) + ff_layout_put_mirror(fls->mirror_array[i]); } static void _ff_layout_free_lseg(struct nfs4_ff_layout_segment *fls) @@ -289,6 +342,23 @@ static void _ff_layout_free_lseg(struct nfs4_ff_layout_segment *fls) } static bool +ff_lseg_match_mirrors(struct pnfs_layout_segment *l1, + struct pnfs_layout_segment *l2) +{ + const struct nfs4_ff_layout_segment *fl1 = FF_LAYOUT_LSEG(l1); + const struct nfs4_ff_layout_segment *fl2 = FF_LAYOUT_LSEG(l2); + u32 i; + + if (fl1->mirror_array_cnt != fl2->mirror_array_cnt) + return false; + for (i = 0; i < fl1->mirror_array_cnt; i++) { + if (fl1->mirror_array[i] != fl2->mirror_array[i]) + return false; + } + return true; +} + +static bool ff_lseg_range_is_after(const struct pnfs_layout_range *l1, const struct pnfs_layout_range *l2) { @@ -323,6 +393,8 @@ ff_lseg_merge(struct pnfs_layout_segment *new, new->pls_range.length); if (new_end < old->pls_range.offset) return false; + if (!ff_lseg_match_mirrors(new, old)) + return false; /* Mergeable: copy info from 'old' to 'new' */ if (new_end < old_end) @@ -347,14 +419,24 @@ ff_layout_add_lseg(struct pnfs_layout_hdr *lo, free_me); } +static u32 ff_mirror_efficiency_sum(const struct nfs4_ff_layout_mirror *mirror) +{ + u32 dss_id, sum = 0; + + for (dss_id = 0; dss_id < mirror->dss_count; dss_id++) + sum += mirror->dss[dss_id].efficiency; + + return sum; +} + static void ff_layout_sort_mirrors(struct nfs4_ff_layout_segment *fls) { int i, j; for (i = 0; i < fls->mirror_array_cnt - 1; i++) { for (j = i + 1; j < fls->mirror_array_cnt; j++) - if (fls->mirror_array[i]->efficiency < - fls->mirror_array[j]->efficiency) + if (ff_mirror_efficiency_sum(fls->mirror_array[i]) < + ff_mirror_efficiency_sum(fls->mirror_array[j])) swap(fls->mirror_array[i], fls->mirror_array[j]); } @@ -369,20 +451,21 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh, struct nfs4_ff_layout_segment *fls = NULL; struct xdr_stream stream; struct xdr_buf buf; - struct page *scratch; + struct folio *scratch; u64 stripe_unit; u32 mirror_array_cnt; __be32 *p; int i, rc; + struct nfs4_ff_layout_ds_stripe *dss_info; dprintk("--> %s\n", __func__); - scratch = alloc_page(gfp_flags); + scratch = folio_alloc(gfp_flags, 0); if (!scratch) return ERR_PTR(-ENOMEM); xdr_init_decode_pages(&stream, &buf, lgr->layoutp->pages, lgr->layoutp->len); - xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE); + xdr_set_scratch_folio(&stream, scratch); /* stripe unit and mirror_array_cnt */ rc = -EIO; @@ -400,118 +483,142 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh, goto out_err_free; rc = -ENOMEM; - fls = kzalloc(sizeof(*fls), gfp_flags); + fls = kzalloc(struct_size(fls, mirror_array, mirror_array_cnt), + gfp_flags); if (!fls) goto out_err_free; fls->mirror_array_cnt = mirror_array_cnt; fls->stripe_unit = stripe_unit; - fls->mirror_array = kcalloc(fls->mirror_array_cnt, - sizeof(fls->mirror_array[0]), gfp_flags); - if (fls->mirror_array == NULL) - goto out_err_free; + u32 dss_count = 0; for (i = 0; i < fls->mirror_array_cnt; i++) { struct nfs4_ff_layout_mirror *mirror; - struct auth_cred acred = { .group_info = ff_zero_group }; - struct rpc_cred __rcu *cred; - u32 ds_count, fh_count, id; - int j; + struct cred *kcred; + const struct cred __rcu *cred; + kuid_t uid; + kgid_t gid; + u32 fh_count, id; + int j, dss_id; rc = -EIO; p = xdr_inline_decode(&stream, 4); if (!p) goto out_err_free; - ds_count = be32_to_cpup(p); - /* FIXME: allow for striping? */ - if (ds_count != 1) + // Ensure all mirrors have same stripe count. + if (dss_count == 0) + dss_count = be32_to_cpup(p); + else if (dss_count != be32_to_cpup(p)) goto out_err_free; - fls->mirror_array[i] = ff_layout_alloc_mirror(gfp_flags); + if (dss_count > NFS4_FLEXFILE_LAYOUT_MAX_STRIPE_CNT || + dss_count == 0) + goto out_err_free; + + if (dss_count > 1 && stripe_unit == 0) + goto out_err_free; + + fls->mirror_array[i] = ff_layout_alloc_mirror(dss_count, gfp_flags); if (fls->mirror_array[i] == NULL) { rc = -ENOMEM; goto out_err_free; } - fls->mirror_array[i]->ds_count = ds_count; + for (dss_id = 0; dss_id < dss_count; dss_id++) { + dss_info = &fls->mirror_array[i]->dss[dss_id]; + dss_info->mirror = fls->mirror_array[i]; - /* deviceid */ - rc = decode_deviceid(&stream, &fls->mirror_array[i]->devid); - if (rc) - goto out_err_free; + /* deviceid */ + rc = decode_deviceid(&stream, &dss_info->devid); + if (rc) + goto out_err_free; - /* efficiency */ - rc = -EIO; - p = xdr_inline_decode(&stream, 4); - if (!p) - goto out_err_free; - fls->mirror_array[i]->efficiency = be32_to_cpup(p); + /* efficiency */ + rc = -EIO; + p = xdr_inline_decode(&stream, 4); + if (!p) + goto out_err_free; + dss_info->efficiency = be32_to_cpup(p); - /* stateid */ - rc = decode_pnfs_stateid(&stream, &fls->mirror_array[i]->stateid); - if (rc) - goto out_err_free; + /* stateid */ + rc = decode_pnfs_stateid(&stream, &dss_info->stateid); + if (rc) + goto out_err_free; - /* fh */ - rc = -EIO; - p = xdr_inline_decode(&stream, 4); - if (!p) - goto out_err_free; - fh_count = be32_to_cpup(p); + /* fh */ + rc = -EIO; + p = xdr_inline_decode(&stream, 4); + if (!p) + goto out_err_free; + fh_count = be32_to_cpup(p); - fls->mirror_array[i]->fh_versions = - kzalloc(fh_count * sizeof(struct nfs_fh), - gfp_flags); - if (fls->mirror_array[i]->fh_versions == NULL) { - rc = -ENOMEM; - goto out_err_free; - } + dss_info->fh_versions = + kcalloc(fh_count, sizeof(struct nfs_fh), + gfp_flags); + if (dss_info->fh_versions == NULL) { + rc = -ENOMEM; + goto out_err_free; + } + + for (j = 0; j < fh_count; j++) { + rc = decode_nfs_fh(&stream, + &dss_info->fh_versions[j]); + if (rc) + goto out_err_free; + } + + dss_info->fh_versions_cnt = fh_count; - for (j = 0; j < fh_count; j++) { - rc = decode_nfs_fh(&stream, - &fls->mirror_array[i]->fh_versions[j]); + /* user */ + rc = decode_name(&stream, &id); if (rc) goto out_err_free; - } - - fls->mirror_array[i]->fh_versions_cnt = fh_count; - /* user */ - rc = decode_name(&stream, &id); - if (rc) - goto out_err_free; + uid = make_kuid(&init_user_ns, id); - acred.uid = make_kuid(&init_user_ns, id); + /* group */ + rc = decode_name(&stream, &id); + if (rc) + goto out_err_free; - /* group */ - rc = decode_name(&stream, &id); - if (rc) - goto out_err_free; + gid = make_kgid(&init_user_ns, id); - acred.gid = make_kgid(&init_user_ns, id); + if (gfp_flags & __GFP_FS) + kcred = prepare_kernel_cred(&init_task); + else { + unsigned int nofs_flags = memalloc_nofs_save(); - /* find the cred for it */ - rcu_assign_pointer(cred, rpc_lookup_generic_cred(&acred, 0, gfp_flags)); - if (IS_ERR(cred)) { - rc = PTR_ERR(cred); - goto out_err_free; + kcred = prepare_kernel_cred(&init_task); + memalloc_nofs_restore(nofs_flags); + } + rc = -ENOMEM; + if (!kcred) + goto out_err_free; + kcred->fsuid = uid; + kcred->fsgid = gid; + cred = RCU_INITIALIZER(kcred); + + if (lgr->range.iomode == IOMODE_READ) + rcu_assign_pointer(dss_info->ro_cred, cred); + else + rcu_assign_pointer(dss_info->rw_cred, cred); } - if (lgr->range.iomode == IOMODE_READ) - rcu_assign_pointer(fls->mirror_array[i]->ro_cred, cred); - else - rcu_assign_pointer(fls->mirror_array[i]->rw_cred, cred); - mirror = ff_layout_add_mirror(lh, fls->mirror_array[i]); if (mirror != fls->mirror_array[i]) { - /* swap cred ptrs so free_mirror will clean up old */ - if (lgr->range.iomode == IOMODE_READ) { - cred = xchg(&mirror->ro_cred, cred); - rcu_assign_pointer(fls->mirror_array[i]->ro_cred, cred); - } else { - cred = xchg(&mirror->rw_cred, cred); - rcu_assign_pointer(fls->mirror_array[i]->rw_cred, cred); + for (dss_id = 0; dss_id < dss_count; dss_id++) { + dss_info = &fls->mirror_array[i]->dss[dss_id]; + /* swap cred ptrs so free_mirror will clean up old */ + if (lgr->range.iomode == IOMODE_READ) { + cred = xchg(&mirror->dss[dss_id].ro_cred, + dss_info->ro_cred); + rcu_assign_pointer(dss_info->ro_cred, cred); + } else { + cred = xchg(&mirror->dss[dss_id].rw_cred, + dss_info->rw_cred); + rcu_assign_pointer(dss_info->rw_cred, cred); + } } ff_layout_free_mirror(fls->mirror_array[i]); fls->mirror_array[i] = mirror; @@ -519,8 +626,8 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh, dprintk("%s: iomode %s uid %u gid %u\n", __func__, lgr->range.iomode == IOMODE_READ ? "READ" : "RW", - from_kuid(&init_user_ns, acred.uid), - from_kgid(&init_user_ns, acred.gid)); + from_kuid(&init_user_ns, uid), + from_kgid(&init_user_ns, gid)); } p = xdr_inline_decode(&stream, 4); @@ -536,13 +643,10 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh, out_sort_mirrors: ff_layout_sort_mirrors(fls); - rc = ff_layout_check_layout(lgr); - if (rc) - goto out_err_free; ret = &fls->generic_hdr; dprintk("<-- %s (success)\n", __func__); out_free_page: - __free_page(scratch); + folio_put(scratch); return ret; out_err_free: _ff_layout_free_lseg(fls); @@ -551,17 +655,6 @@ out_err_free: goto out_free_page; } -static bool ff_layout_has_rw_segments(struct pnfs_layout_hdr *layout) -{ - struct pnfs_layout_segment *lseg; - - list_for_each_entry(lseg, &layout->plh_segs, pls_list) - if (lseg->pls_range.iomode == IOMODE_RW) - return true; - - return false; -} - static void ff_layout_free_lseg(struct pnfs_layout_segment *lseg) { @@ -576,21 +669,30 @@ ff_layout_free_lseg(struct pnfs_layout_segment *lseg) ffl = FF_LAYOUT_FROM_HDR(lseg->pls_layout); inode = ffl->generic_hdr.plh_inode; spin_lock(&inode->i_lock); - if (!ff_layout_has_rw_segments(lseg->pls_layout)) { - ffl->commit_info.nbuckets = 0; - kfree(ffl->commit_info.buckets); - ffl->commit_info.buckets = NULL; - } + pnfs_generic_ds_cinfo_release_lseg(&ffl->commit_info, lseg); spin_unlock(&inode->i_lock); } _ff_layout_free_lseg(fls); } -/* Return 1 until we have multiple lsegs support */ -static int -ff_layout_get_lseg_count(struct nfs4_ff_layout_segment *fls) +static u32 calc_commit_idx(struct pnfs_layout_segment *lseg, + u32 mirror_idx, u32 dss_id) { - return 1; + struct nfs4_ff_layout_segment *flseg = FF_LAYOUT_LSEG(lseg); + + return (mirror_idx * flseg->mirror_array[0]->dss_count) + dss_id; +} + +static u32 calc_mirror_idx_from_commit(struct pnfs_layout_segment *lseg, + u32 commit_index) +{ + return commit_index / FF_LAYOUT_LSEG(lseg)->mirror_array[0]->dss_count; +} + +static u32 calc_dss_id_from_commit(struct pnfs_layout_segment *lseg, + u32 commit_index) +{ + return commit_index % FF_LAYOUT_LSEG(lseg)->mirror_array[0]->dss_count; } static void @@ -617,6 +719,7 @@ nfs4_ff_end_busy_timer(struct nfs4_ff_busy_timer *timer, ktime_t now) static bool nfs4_ff_layoutstat_start_io(struct nfs4_ff_layout_mirror *mirror, + u32 dss_id, struct nfs4_ff_layoutstat *layoutstat, ktime_t now) { @@ -624,8 +727,8 @@ nfs4_ff_layoutstat_start_io(struct nfs4_ff_layout_mirror *mirror, struct nfs4_flexfile_layout *ffl = FF_LAYOUT_FROM_HDR(mirror->layout); nfs4_ff_start_busy_timer(&layoutstat->busy_timer, now); - if (!mirror->start_time) - mirror->start_time = now; + if (!mirror->dss[dss_id].start_time) + mirror->dss[dss_id].start_time = now; if (mirror->report_interval != 0) report_interval = (s64)mirror->report_interval * 1000LL; else if (layoutstats_timer != 0) @@ -675,28 +778,32 @@ nfs4_ff_layout_stat_io_update_completed(struct nfs4_ff_layoutstat *layoutstat, static void nfs4_ff_layout_stat_io_start_read(struct inode *inode, struct nfs4_ff_layout_mirror *mirror, + u32 dss_id, __u64 requested, ktime_t now) { bool report; spin_lock(&mirror->lock); - report = nfs4_ff_layoutstat_start_io(mirror, &mirror->read_stat, now); - nfs4_ff_layout_stat_io_update_requested(&mirror->read_stat, requested); + report = nfs4_ff_layoutstat_start_io( + mirror, dss_id, &mirror->dss[dss_id].read_stat, now); + nfs4_ff_layout_stat_io_update_requested( + &mirror->dss[dss_id].read_stat, requested); set_bit(NFS4_FF_MIRROR_STAT_AVAIL, &mirror->flags); spin_unlock(&mirror->lock); if (report) - pnfs_report_layoutstat(inode, GFP_KERNEL); + pnfs_report_layoutstat(inode, nfs_io_gfp_mask()); } static void nfs4_ff_layout_stat_io_end_read(struct rpc_task *task, struct nfs4_ff_layout_mirror *mirror, + u32 dss_id, __u64 requested, __u64 completed) { spin_lock(&mirror->lock); - nfs4_ff_layout_stat_io_update_completed(&mirror->read_stat, + nfs4_ff_layout_stat_io_update_completed(&mirror->dss[dss_id].read_stat, requested, completed, ktime_get(), task->tk_start); set_bit(NFS4_FF_MIRROR_STAT_AVAIL, &mirror->flags); @@ -706,23 +813,31 @@ nfs4_ff_layout_stat_io_end_read(struct rpc_task *task, static void nfs4_ff_layout_stat_io_start_write(struct inode *inode, struct nfs4_ff_layout_mirror *mirror, + u32 dss_id, __u64 requested, ktime_t now) { bool report; spin_lock(&mirror->lock); - report = nfs4_ff_layoutstat_start_io(mirror , &mirror->write_stat, now); - nfs4_ff_layout_stat_io_update_requested(&mirror->write_stat, requested); + report = nfs4_ff_layoutstat_start_io( + mirror, + dss_id, + &mirror->dss[dss_id].write_stat, + now); + nfs4_ff_layout_stat_io_update_requested( + &mirror->dss[dss_id].write_stat, + requested); set_bit(NFS4_FF_MIRROR_STAT_AVAIL, &mirror->flags); spin_unlock(&mirror->lock); if (report) - pnfs_report_layoutstat(inode, GFP_NOIO); + pnfs_report_layoutstat(inode, nfs_io_gfp_mask()); } static void nfs4_ff_layout_stat_io_end_write(struct rpc_task *task, struct nfs4_ff_layout_mirror *mirror, + u32 dss_id, __u64 requested, __u64 completed, enum nfs3_stable_how committed) @@ -731,80 +846,114 @@ nfs4_ff_layout_stat_io_end_write(struct rpc_task *task, requested = completed = 0; spin_lock(&mirror->lock); - nfs4_ff_layout_stat_io_update_completed(&mirror->write_stat, + nfs4_ff_layout_stat_io_update_completed(&mirror->dss[dss_id].write_stat, requested, completed, ktime_get(), task->tk_start); set_bit(NFS4_FF_MIRROR_STAT_AVAIL, &mirror->flags); spin_unlock(&mirror->lock); } -static int -ff_layout_alloc_commit_info(struct pnfs_layout_segment *lseg, - struct nfs_commit_info *cinfo, - gfp_t gfp_flags) +static void +ff_layout_mark_ds_unreachable(struct pnfs_layout_segment *lseg, u32 idx, u32 dss_id) +{ + struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx, dss_id); + + if (devid) + nfs4_mark_deviceid_unavailable(devid); +} + +static void +ff_layout_mark_ds_reachable(struct pnfs_layout_segment *lseg, u32 idx, u32 dss_id) +{ + struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx, dss_id); + + if (devid) + nfs4_mark_deviceid_available(devid); +} + +static struct nfs4_pnfs_ds * +ff_layout_choose_ds_for_read(struct pnfs_layout_segment *lseg, + u32 start_idx, u32 *best_idx, + u32 offset, u32 *dss_id, + bool check_device) { struct nfs4_ff_layout_segment *fls = FF_LAYOUT_LSEG(lseg); - struct pnfs_commit_bucket *buckets; - int size; - - if (cinfo->ds->nbuckets != 0) { - /* This assumes there is only one RW lseg per file. - * To support multiple lseg per file, we need to - * change struct pnfs_commit_bucket to allow dynamic - * increasing nbuckets. - */ - return 0; - } + struct nfs4_ff_layout_mirror *mirror; + struct nfs4_pnfs_ds *ds = ERR_PTR(-EAGAIN); + u32 idx; - size = ff_layout_get_lseg_count(fls) * FF_LAYOUT_MIRROR_COUNT(lseg); + /* mirrors are initially sorted by efficiency */ + for (idx = start_idx; idx < fls->mirror_array_cnt; idx++) { + mirror = FF_LAYOUT_COMP(lseg, idx); + *dss_id = nfs4_ff_layout_calc_dss_id( + fls->stripe_unit, + fls->mirror_array[idx]->dss_count, + offset); + ds = nfs4_ff_layout_prepare_ds(lseg, mirror, *dss_id, false); + if (IS_ERR(ds)) + continue; - buckets = kcalloc(size, sizeof(struct pnfs_commit_bucket), - gfp_flags); - if (!buckets) - return -ENOMEM; - else { - int i; - - spin_lock(&cinfo->inode->i_lock); - if (cinfo->ds->nbuckets != 0) - kfree(buckets); - else { - cinfo->ds->buckets = buckets; - cinfo->ds->nbuckets = size; - for (i = 0; i < size; i++) { - INIT_LIST_HEAD(&buckets[i].written); - INIT_LIST_HEAD(&buckets[i].committing); - /* mark direct verifier as unset */ - buckets[i].direct_verf.committed = - NFS_INVALID_STABLE_HOW; - } + if (check_device && + nfs4_test_deviceid_unavailable(&mirror->dss[*dss_id].mirror_ds->id_node)) { + // reinitialize the error state in case if this is the last iteration + ds = ERR_PTR(-EINVAL); + continue; } - spin_unlock(&cinfo->inode->i_lock); - return 0; + + *best_idx = idx; + break; } + + return ds; +} + +static struct nfs4_pnfs_ds * +ff_layout_choose_any_ds_for_read(struct pnfs_layout_segment *lseg, + u32 start_idx, u32 *best_idx, + u32 offset, u32 *dss_id) +{ + return ff_layout_choose_ds_for_read(lseg, start_idx, best_idx, + offset, dss_id, false); +} + +static struct nfs4_pnfs_ds * +ff_layout_choose_valid_ds_for_read(struct pnfs_layout_segment *lseg, + u32 start_idx, u32 *best_idx, + u32 offset, u32 *dss_id) +{ + return ff_layout_choose_ds_for_read(lseg, start_idx, best_idx, + offset, dss_id, true); } static struct nfs4_pnfs_ds * ff_layout_choose_best_ds_for_read(struct pnfs_layout_segment *lseg, - int start_idx, - int *best_idx) + u32 start_idx, u32 *best_idx, + u32 offset, u32 *dss_id) { - struct nfs4_ff_layout_segment *fls = FF_LAYOUT_LSEG(lseg); struct nfs4_pnfs_ds *ds; - bool fail_return = false; - int idx; - /* mirrors are sorted by efficiency */ - for (idx = start_idx; idx < fls->mirror_array_cnt; idx++) { - if (idx+1 == fls->mirror_array_cnt) - fail_return = true; - ds = nfs4_ff_layout_prepare_ds(lseg, idx, fail_return); - if (ds) { - *best_idx = idx; - return ds; - } - } + ds = ff_layout_choose_valid_ds_for_read(lseg, start_idx, best_idx, + offset, dss_id); + if (!IS_ERR(ds)) + return ds; + return ff_layout_choose_any_ds_for_read(lseg, start_idx, best_idx, + offset, dss_id); +} - return NULL; +static struct nfs4_pnfs_ds * +ff_layout_get_ds_for_read(struct nfs_pageio_descriptor *pgio, + u32 *best_idx, + u32 offset, + u32 *dss_id) +{ + struct pnfs_layout_segment *lseg = pgio->pg_lseg; + struct nfs4_pnfs_ds *ds; + + ds = ff_layout_choose_best_ds_for_read(lseg, pgio->pg_mirror_idx, + best_idx, offset, dss_id); + if (!IS_ERR(ds) || !pgio->pg_mirror_idx) + return ds; + return ff_layout_choose_best_ds_for_read(lseg, 0, best_idx, + offset, dss_id); } static void @@ -812,29 +961,65 @@ ff_layout_pg_get_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req, bool strict_iomode) { -retry_strict: pnfs_put_lseg(pgio->pg_lseg); - pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, - req->wb_context, - 0, - NFS4_MAX_UINT64, - IOMODE_READ, - strict_iomode, - GFP_KERNEL); + pgio->pg_lseg = + pnfs_update_layout(pgio->pg_inode, nfs_req_openctx(req), + req_offset(req), req->wb_bytes, IOMODE_READ, + strict_iomode, nfs_io_gfp_mask()); if (IS_ERR(pgio->pg_lseg)) { pgio->pg_error = PTR_ERR(pgio->pg_lseg); pgio->pg_lseg = NULL; } +} - /* If we don't have checking, do get a IOMODE_RW - * segment, and the server wants to avoid READs - * there, then retry! - */ - if (pgio->pg_lseg && !strict_iomode && - ff_layout_avoid_read_on_rw(pgio->pg_lseg)) { - strict_iomode = true; - goto retry_strict; +static bool +ff_layout_lseg_is_striped(const struct nfs4_ff_layout_segment *fls) +{ + return fls->mirror_array[0]->dss_count > 1; +} + +/* + * ff_layout_pg_test(). Called by nfs_can_coalesce_requests() + * + * Return 0 if @req cannot be coalesced into @pgio, otherwise return the number + * of bytes (maximum @req->wb_bytes) that can be coalesced. + */ +static size_t +ff_layout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, + struct nfs_page *req) +{ + unsigned int size; + u64 p_stripe, r_stripe; + u32 stripe_offset; + u64 segment_offset = pgio->pg_lseg->pls_range.offset; + u32 stripe_unit = FF_LAYOUT_LSEG(pgio->pg_lseg)->stripe_unit; + + /* calls nfs_generic_pg_test */ + size = pnfs_generic_pg_test(pgio, prev, req); + if (!size) + return 0; + else if (!ff_layout_lseg_is_striped(FF_LAYOUT_LSEG(pgio->pg_lseg))) + return size; + + /* see if req and prev are in the same stripe */ + if (prev) { + p_stripe = (u64)req_offset(prev) - segment_offset; + r_stripe = (u64)req_offset(req) - segment_offset; + do_div(p_stripe, stripe_unit); + do_div(r_stripe, stripe_unit); + + if (p_stripe != r_stripe) + return 0; } + + /* calculate remaining bytes in the current stripe */ + div_u64_rem((u64)req_offset(req) - segment_offset, + stripe_unit, + &stripe_offset); + WARN_ON_ONCE(stripe_offset > stripe_unit); + if (stripe_offset >= stripe_unit) + return 0; + return min(stripe_unit - (unsigned int)stripe_offset, size); } static void @@ -844,43 +1029,67 @@ ff_layout_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_pgio_mirror *pgm; struct nfs4_ff_layout_mirror *mirror; struct nfs4_pnfs_ds *ds; - int ds_idx; + u32 ds_idx, dss_id; + if (NFS_SERVER(pgio->pg_inode)->flags & + (NFS_MOUNT_SOFT|NFS_MOUNT_SOFTERR)) + pgio->pg_maxretrans = io_maxretrans; retry: - pnfs_generic_pg_check_layout(pgio); + pnfs_generic_pg_check_layout(pgio, req); /* Use full layout for now */ - if (!pgio->pg_lseg) + if (!pgio->pg_lseg) { ff_layout_pg_get_read(pgio, req, false); - else if (ff_layout_avoid_read_on_rw(pgio->pg_lseg)) + if (!pgio->pg_lseg) + goto out_nolseg; + } + if (ff_layout_avoid_read_on_rw(pgio->pg_lseg)) { ff_layout_pg_get_read(pgio, req, true); + if (!pgio->pg_lseg) + goto out_nolseg; + } + /* Reset wb_nio, since getting layout segment was successful */ + req->wb_nio = 0; - /* If no lseg, fall back to read through mds */ - if (pgio->pg_lseg == NULL) - goto out_mds; - - ds = ff_layout_choose_best_ds_for_read(pgio->pg_lseg, 0, &ds_idx); - if (!ds) { + ds = ff_layout_get_ds_for_read(pgio, &ds_idx, + req_offset(req), &dss_id); + if (IS_ERR(ds)) { if (!ff_layout_no_fallback_to_mds(pgio->pg_lseg)) goto out_mds; - pnfs_put_lseg(pgio->pg_lseg); - pgio->pg_lseg = NULL; + pnfs_generic_pg_cleanup(pgio); /* Sleep for 1 second before retrying */ ssleep(1); goto retry; } mirror = FF_LAYOUT_COMP(pgio->pg_lseg, ds_idx); - - pgio->pg_mirror_idx = ds_idx; - - /* read always uses only one mirror - idx 0 for pgio layer */ pgm = &pgio->pg_mirrors[0]; - pgm->pg_bsize = mirror->mirror_ds->ds_versions[0].rsize; + pgm->pg_bsize = mirror->dss[dss_id].mirror_ds->ds_versions[0].rsize; + pgio->pg_mirror_idx = ds_idx; return; +out_nolseg: + if (pgio->pg_error < 0) { + if (pgio->pg_error != -EAGAIN) + return; + /* Retry getting layout segment if lower layer returned -EAGAIN */ + if (pgio->pg_maxretrans && req->wb_nio++ > pgio->pg_maxretrans) { + if (NFS_SERVER(pgio->pg_inode)->flags & NFS_MOUNT_SOFTERR) + pgio->pg_error = -ETIMEDOUT; + else + pgio->pg_error = -EIO; + return; + } + pgio->pg_error = 0; + /* Sleep for 1 second before retrying */ + ssleep(1); + goto retry; + } out_mds: - pnfs_put_lseg(pgio->pg_lseg); - pgio->pg_lseg = NULL; + trace_pnfs_mds_fallback_pg_init_read(pgio->pg_inode, + 0, NFS4_MAX_UINT64, IOMODE_READ, + NFS_I(pgio->pg_inode)->layout, + pgio->pg_lseg); + pgio->pg_maxretrans = 0; nfs_pageio_reset_read_mds(pgio); } @@ -890,21 +1099,16 @@ ff_layout_pg_init_write(struct nfs_pageio_descriptor *pgio, { struct nfs4_ff_layout_mirror *mirror; struct nfs_pgio_mirror *pgm; - struct nfs_commit_info cinfo; struct nfs4_pnfs_ds *ds; - int i; - int status; + u32 i, dss_id; retry: - pnfs_generic_pg_check_layout(pgio); + pnfs_generic_pg_check_layout(pgio, req); if (!pgio->pg_lseg) { - pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, - req->wb_context, - 0, - NFS4_MAX_UINT64, - IOMODE_RW, - false, - GFP_NOFS); + pgio->pg_lseg = + pnfs_update_layout(pgio->pg_inode, nfs_req_openctx(req), + req_offset(req), req->wb_bytes, + IOMODE_RW, false, nfs_io_gfp_mask()); if (IS_ERR(pgio->pg_lseg)) { pgio->pg_error = PTR_ERR(pgio->pg_lseg); pgio->pg_lseg = NULL; @@ -915,38 +1119,46 @@ retry: if (pgio->pg_lseg == NULL) goto out_mds; - nfs_init_cinfo(&cinfo, pgio->pg_inode, pgio->pg_dreq); - status = ff_layout_alloc_commit_info(pgio->pg_lseg, &cinfo, GFP_NOFS); - if (status < 0) - goto out_mds; - /* Use a direct mapping of ds_idx to pgio mirror_idx */ - if (WARN_ON_ONCE(pgio->pg_mirror_count != - FF_LAYOUT_MIRROR_COUNT(pgio->pg_lseg))) - goto out_mds; + if (pgio->pg_mirror_count != FF_LAYOUT_MIRROR_COUNT(pgio->pg_lseg)) + goto out_eagain; for (i = 0; i < pgio->pg_mirror_count; i++) { - ds = nfs4_ff_layout_prepare_ds(pgio->pg_lseg, i, true); - if (!ds) { + mirror = FF_LAYOUT_COMP(pgio->pg_lseg, i); + dss_id = nfs4_ff_layout_calc_dss_id( + FF_LAYOUT_LSEG(pgio->pg_lseg)->stripe_unit, + mirror->dss_count, + req_offset(req)); + ds = nfs4_ff_layout_prepare_ds(pgio->pg_lseg, mirror, + dss_id, true); + if (IS_ERR(ds)) { if (!ff_layout_no_fallback_to_mds(pgio->pg_lseg)) goto out_mds; - pnfs_put_lseg(pgio->pg_lseg); - pgio->pg_lseg = NULL; + pnfs_generic_pg_cleanup(pgio); /* Sleep for 1 second before retrying */ ssleep(1); goto retry; } pgm = &pgio->pg_mirrors[i]; - mirror = FF_LAYOUT_COMP(pgio->pg_lseg, i); - pgm->pg_bsize = mirror->mirror_ds->ds_versions[0].wsize; + pgm->pg_bsize = mirror->dss[dss_id].mirror_ds->ds_versions[0].wsize; } + if (NFS_SERVER(pgio->pg_inode)->flags & + (NFS_MOUNT_SOFT|NFS_MOUNT_SOFTERR)) + pgio->pg_maxretrans = io_maxretrans; + return; +out_eagain: + pnfs_generic_pg_cleanup(pgio); + pgio->pg_error = -EAGAIN; return; - out_mds: - pnfs_put_lseg(pgio->pg_lseg); - pgio->pg_lseg = NULL; + trace_pnfs_mds_fallback_pg_init_write(pgio->pg_inode, + 0, NFS4_MAX_UINT64, IOMODE_RW, + NFS_I(pgio->pg_inode)->layout, + pgio->pg_lseg); + pgio->pg_maxretrans = 0; nfs_pageio_reset_write_mds(pgio); + pgio->pg_error = -EAGAIN; } static unsigned int @@ -954,13 +1166,10 @@ ff_layout_pg_get_mirror_count_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) { if (!pgio->pg_lseg) { - pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, - req->wb_context, - 0, - NFS4_MAX_UINT64, - IOMODE_RW, - false, - GFP_NOFS); + pgio->pg_lseg = + pnfs_update_layout(pgio->pg_inode, nfs_req_openctx(req), + req_offset(req), req->wb_bytes, + IOMODE_RW, false, nfs_io_gfp_mask()); if (IS_ERR(pgio->pg_lseg)) { pgio->pg_error = PTR_ERR(pgio->pg_lseg); pgio->pg_lseg = NULL; @@ -970,25 +1179,46 @@ ff_layout_pg_get_mirror_count_write(struct nfs_pageio_descriptor *pgio, if (pgio->pg_lseg) return FF_LAYOUT_MIRROR_COUNT(pgio->pg_lseg); + trace_pnfs_mds_fallback_pg_get_mirror_count(pgio->pg_inode, + 0, NFS4_MAX_UINT64, IOMODE_RW, + NFS_I(pgio->pg_inode)->layout, + pgio->pg_lseg); /* no lseg means that pnfs is not in use, so no mirroring here */ nfs_pageio_reset_write_mds(pgio); out: return 1; } +static u32 +ff_layout_pg_set_mirror_write(struct nfs_pageio_descriptor *desc, u32 idx) +{ + u32 old = desc->pg_mirror_idx; + + desc->pg_mirror_idx = idx; + return old; +} + +static struct nfs_pgio_mirror * +ff_layout_pg_get_mirror_write(struct nfs_pageio_descriptor *desc, u32 idx) +{ + return &desc->pg_mirrors[idx]; +} + static const struct nfs_pageio_ops ff_layout_pg_read_ops = { .pg_init = ff_layout_pg_init_read, - .pg_test = pnfs_generic_pg_test, + .pg_test = ff_layout_pg_test, .pg_doio = pnfs_generic_pg_readpages, .pg_cleanup = pnfs_generic_pg_cleanup, }; static const struct nfs_pageio_ops ff_layout_pg_write_ops = { .pg_init = ff_layout_pg_init_write, - .pg_test = pnfs_generic_pg_test, + .pg_test = ff_layout_pg_test, .pg_doio = pnfs_generic_pg_writepages, .pg_get_mirror_count = ff_layout_pg_get_mirror_count_write, .pg_cleanup = pnfs_generic_pg_cleanup, + .pg_get_mirror = ff_layout_pg_get_mirror_write, + .pg_set_mirror = ff_layout_pg_set_mirror_write, }; static void ff_layout_reset_write(struct nfs_pgio_header *hdr, bool retry_pnfs) @@ -1019,15 +1249,36 @@ static void ff_layout_reset_write(struct nfs_pgio_header *hdr, bool retry_pnfs) hdr->args.count, (unsigned long long)hdr->args.offset); + trace_pnfs_mds_fallback_write_done(hdr->inode, + hdr->args.offset, hdr->args.count, + IOMODE_RW, NFS_I(hdr->inode)->layout, + hdr->lseg); task->tk_status = pnfs_write_done_resend_to_mds(hdr); } } +static void ff_layout_resend_pnfs_read(struct nfs_pgio_header *hdr) +{ + u32 idx = hdr->pgio_mirror_idx + 1; + u32 new_idx = 0; + u32 dss_id = 0; + struct nfs4_pnfs_ds *ds; + + ds = ff_layout_choose_any_ds_for_read(hdr->lseg, idx, &new_idx, + hdr->args.offset, &dss_id); + if (IS_ERR(ds)) + pnfs_error_mark_layout_for_return(hdr->inode, hdr->lseg); + else + ff_layout_send_layouterror(hdr->lseg); + pnfs_read_resend_pnfs(hdr, new_idx); +} + static void ff_layout_reset_read(struct nfs_pgio_header *hdr) { struct rpc_task *task = &hdr->task; pnfs_layoutcommit_inode(hdr->inode, false); + pnfs_error_mark_layout_for_return(hdr->inode, hdr->lseg); if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) { dprintk("%s Reset task %5u for i/o through MDS " @@ -1038,47 +1289,62 @@ static void ff_layout_reset_read(struct nfs_pgio_header *hdr) hdr->args.count, (unsigned long long)hdr->args.offset); + trace_pnfs_mds_fallback_read_done(hdr->inode, + hdr->args.offset, hdr->args.count, + IOMODE_READ, NFS_I(hdr->inode)->layout, + hdr->lseg); task->tk_status = pnfs_read_done_resend_to_mds(hdr); } } static int ff_layout_async_handle_error_v4(struct rpc_task *task, + u32 op_status, struct nfs4_state *state, struct nfs_client *clp, struct pnfs_layout_segment *lseg, - int idx) + u32 idx, u32 dss_id) { struct pnfs_layout_hdr *lo = lseg->pls_layout; struct inode *inode = lo->plh_inode; - struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx); + struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx, dss_id); struct nfs4_slot_table *tbl = &clp->cl_session->fc_slot_table; - switch (task->tk_status) { - case -NFS4ERR_BADSESSION: - case -NFS4ERR_BADSLOT: - case -NFS4ERR_BAD_HIGH_SLOT: - case -NFS4ERR_DEADSESSION: - case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: - case -NFS4ERR_SEQ_FALSE_RETRY: - case -NFS4ERR_SEQ_MISORDERED: + switch (op_status) { + case NFS4_OK: + case NFS4ERR_NXIO: + break; + case NFSERR_PERM: + if (!task->tk_xprt) + break; + xprt_force_disconnect(task->tk_xprt); + goto out_retry; + case NFS4ERR_BADSESSION: + case NFS4ERR_BADSLOT: + case NFS4ERR_BAD_HIGH_SLOT: + case NFS4ERR_DEADSESSION: + case NFS4ERR_CONN_NOT_BOUND_TO_SESSION: + case NFS4ERR_SEQ_FALSE_RETRY: + case NFS4ERR_SEQ_MISORDERED: dprintk("%s ERROR %d, Reset session. Exchangeid " "flags 0x%x\n", __func__, task->tk_status, clp->cl_exchange_flags); nfs4_schedule_session_recovery(clp->cl_session, task->tk_status); - break; - case -NFS4ERR_DELAY: - case -NFS4ERR_GRACE: + goto out_retry; + case NFS4ERR_DELAY: + nfs_inc_stats(lseg->pls_layout->plh_inode, NFSIOS_DELAY); + fallthrough; + case NFS4ERR_GRACE: rpc_delay(task, FF_LAYOUT_POLL_RETRY_MAX); - break; - case -NFS4ERR_RETRY_UNCACHED_REP: - break; + goto out_retry; + case NFS4ERR_RETRY_UNCACHED_REP: + goto out_retry; /* Invalidate Layout errors */ - case -NFS4ERR_PNFS_NO_LAYOUT: - case -ESTALE: /* mapped NFS4ERR_STALE */ - case -EBADHANDLE: /* mapped NFS4ERR_BADHANDLE */ - case -EISDIR: /* mapped NFS4ERR_ISDIR */ - case -NFS4ERR_FHEXPIRED: - case -NFS4ERR_WRONG_TYPE: + case NFS4ERR_PNFS_NO_LAYOUT: + case NFS4ERR_STALE: + case NFS4ERR_BADHANDLE: + case NFS4ERR_ISDIR: + case NFS4ERR_FHEXPIRED: + case NFS4ERR_WRONG_TYPE: dprintk("%s Invalid layout error %d\n", __func__, task->tk_status); /* @@ -1091,38 +1357,79 @@ static int ff_layout_async_handle_error_v4(struct rpc_task *task, pnfs_destroy_layout(NFS_I(inode)); rpc_wake_up(&tbl->slot_tbl_waitq); goto reset; + default: + break; + } + + switch (task->tk_status) { /* RPC connection errors */ + case -ENETDOWN: + case -ENETUNREACH: + if (test_bit(NFS_CS_NETUNREACH_FATAL, &clp->cl_flags)) + return -NFS4ERR_FATAL_IOERROR; + fallthrough; case -ECONNREFUSED: case -EHOSTDOWN: case -EHOSTUNREACH: - case -ENETUNREACH: case -EIO: case -ETIMEDOUT: case -EPIPE: + case -EPROTO: + case -ENODEV: dprintk("%s DS connection error %d\n", __func__, task->tk_status); nfs4_delete_deviceid(devid->ld, devid->nfs_client, &devid->deviceid); rpc_wake_up(&tbl->slot_tbl_waitq); - /* fall through */ + break; default: - if (ff_layout_avoid_mds_available_ds(lseg)) - return -NFS4ERR_RESET_TO_PNFS; -reset: - dprintk("%s Retry through MDS. Error %d\n", __func__, - task->tk_status); - return -NFS4ERR_RESET_TO_MDS; + break; } + + if (ff_layout_avoid_mds_available_ds(lseg)) + return -NFS4ERR_RESET_TO_PNFS; +reset: + dprintk("%s Retry through MDS. Error %d\n", __func__, + task->tk_status); + return -NFS4ERR_RESET_TO_MDS; + +out_retry: task->tk_status = 0; return -EAGAIN; } /* Retry all errors through either pNFS or MDS except for -EJUKEBOX */ static int ff_layout_async_handle_error_v3(struct rpc_task *task, + u32 op_status, + struct nfs_client *clp, struct pnfs_layout_segment *lseg, - int idx) + u32 idx, u32 dss_id) { - struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx); + struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx, dss_id); + + switch (op_status) { + case NFS_OK: + case NFSERR_NXIO: + break; + case NFSERR_PERM: + if (!task->tk_xprt) + break; + xprt_force_disconnect(task->tk_xprt); + goto out_retry; + case NFSERR_ACCES: + case NFSERR_BADHANDLE: + case NFSERR_FBIG: + case NFSERR_IO: + case NFSERR_NOSPC: + case NFSERR_ROFS: + case NFSERR_STALE: + goto out_reset_to_pnfs; + case NFSERR_JUKEBOX: + nfs_inc_stats(lseg->pls_layout->plh_inode, NFSIOS_DELAY); + goto out_retry; + default: + break; + } switch (task->tk_status) { /* File access problems. Don't mark the device as unavailable */ @@ -1136,12 +1443,18 @@ static int ff_layout_async_handle_error_v3(struct rpc_task *task, case -EJUKEBOX: nfs_inc_stats(lseg->pls_layout->plh_inode, NFSIOS_DELAY); goto out_retry; + case -ENETDOWN: + case -ENETUNREACH: + if (test_bit(NFS_CS_NETUNREACH_FATAL, &clp->cl_flags)) + return -NFS4ERR_FATAL_IOERROR; + fallthrough; default: dprintk("%s DS connection error %d\n", __func__, task->tk_status); nfs4_delete_deviceid(devid->ld, devid->nfs_client, &devid->deviceid); } +out_reset_to_pnfs: /* FIXME: Need to prevent infinite looping here. */ return -NFS4ERR_RESET_TO_PNFS; out_retry: @@ -1152,15 +1465,18 @@ out_retry: } static int ff_layout_async_handle_error(struct rpc_task *task, + u32 op_status, struct nfs4_state *state, struct nfs_client *clp, struct pnfs_layout_segment *lseg, - int idx) + u32 idx, u32 dss_id) { int vers = clp->cl_nfs_mod->rpc_vers->number; - if (task->tk_status >= 0) + if (task->tk_status >= 0) { + ff_layout_mark_ds_reachable(lseg, idx, dss_id); return 0; + } /* Handle the case of an invalid layout segment */ if (!pnfs_is_valid_lseg(lseg)) @@ -1168,10 +1484,11 @@ static int ff_layout_async_handle_error(struct rpc_task *task, switch (vers) { case 3: - return ff_layout_async_handle_error_v3(task, lseg, idx); + return ff_layout_async_handle_error_v3(task, op_status, clp, + lseg, idx, dss_id); case 4: - return ff_layout_async_handle_error_v4(task, state, clp, - lseg, idx); + return ff_layout_async_handle_error_v4(task, op_status, state, + clp, lseg, idx, dss_id); default: /* should never happen */ WARN_ON_ONCE(1); @@ -1180,10 +1497,11 @@ static int ff_layout_async_handle_error(struct rpc_task *task, } static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg, - int idx, u64 offset, u64 length, - u32 status, int opnum, int error) + u32 idx, u32 dss_id, u64 offset, u64 length, + u32 *op_status, int opnum, int error) { struct nfs4_ff_layout_mirror *mirror; + u32 status = *op_status; int err; if (status == 0) { @@ -1192,38 +1510,53 @@ static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg, case -EPFNOSUPPORT: case -EPROTONOSUPPORT: case -EOPNOTSUPP: + case -EINVAL: case -ECONNREFUSED: case -ECONNRESET: case -EHOSTDOWN: case -EHOSTUNREACH: + case -ENETDOWN: case -ENETUNREACH: case -EADDRINUSE: case -ENOBUFS: case -EPIPE: case -EPERM: - status = NFS4ERR_NXIO; + case -EPROTO: + case -ENODEV: + *op_status = status = NFS4ERR_NXIO; break; case -EACCES: - status = NFS4ERR_ACCESS; + *op_status = status = NFS4ERR_ACCESS; break; default: return; } } + mirror = FF_LAYOUT_COMP(lseg, idx); + err = ff_layout_track_ds_error(FF_LAYOUT_FROM_HDR(lseg->pls_layout), + mirror, dss_id, offset, length, status, opnum, + nfs_io_gfp_mask()); + switch (status) { case NFS4ERR_DELAY: case NFS4ERR_GRACE: - return; - default: + case NFS4ERR_PERM: break; + case NFS4ERR_NXIO: + ff_layout_mark_ds_unreachable(lseg, idx, dss_id); + /* + * Don't return the layout if this is a read and we still + * have layouts to try + */ + if (opnum == OP_READ) + break; + fallthrough; + default: + pnfs_error_mark_layout_for_return(lseg->pls_layout->plh_inode, + lseg); } - mirror = FF_LAYOUT_COMP(lseg, idx); - err = ff_layout_track_ds_error(FF_LAYOUT_FROM_HDR(lseg->pls_layout), - mirror, offset, length, status, opnum, - GFP_NOIO); - pnfs_error_mark_layout_for_return(lseg->pls_layout->plh_inode, lseg); dprintk("%s: err %d op %d status %u\n", __func__, err, opnum, status); } @@ -1231,32 +1564,43 @@ static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg, static int ff_layout_read_done_cb(struct rpc_task *task, struct nfs_pgio_header *hdr) { + struct nfs4_ff_layout_segment *flseg = FF_LAYOUT_LSEG(hdr->lseg); + u32 dss_id = nfs4_ff_layout_calc_dss_id( + flseg->stripe_unit, + flseg->mirror_array[hdr->pgio_mirror_idx]->dss_count, + hdr->args.offset); int err; - trace_nfs4_pnfs_read(hdr, task->tk_status); - if (task->tk_status < 0) - ff_layout_io_track_ds_error(hdr->lseg, hdr->pgio_mirror_idx, + if (task->tk_status < 0) { + ff_layout_io_track_ds_error(hdr->lseg, + hdr->pgio_mirror_idx, dss_id, hdr->args.offset, hdr->args.count, - hdr->res.op_status, OP_READ, + &hdr->res.op_status, OP_READ, task->tk_status); - err = ff_layout_async_handle_error(task, hdr->args.context->state, + trace_ff_layout_read_error(hdr, task->tk_status); + } + + err = ff_layout_async_handle_error(task, hdr->res.op_status, + hdr->args.context->state, hdr->ds_clp, hdr->lseg, - hdr->pgio_mirror_idx); + hdr->pgio_mirror_idx, + dss_id); + trace_nfs4_pnfs_read(hdr, err); + clear_bit(NFS_IOHDR_RESEND_PNFS, &hdr->flags); + clear_bit(NFS_IOHDR_RESEND_MDS, &hdr->flags); switch (err) { case -NFS4ERR_RESET_TO_PNFS: - if (ff_layout_choose_best_ds_for_read(hdr->lseg, - hdr->pgio_mirror_idx + 1, - &hdr->pgio_mirror_idx)) - goto out_eagain; - ff_layout_read_record_layoutstats_done(task, hdr); - pnfs_read_resend_pnfs(hdr); + set_bit(NFS_IOHDR_RESEND_PNFS, &hdr->flags); return task->tk_status; case -NFS4ERR_RESET_TO_MDS: - ff_layout_reset_read(hdr); + set_bit(NFS_IOHDR_RESEND_MDS, &hdr->flags); return task->tk_status; case -EAGAIN: goto out_eagain; + case -NFS4ERR_FATAL_IOERROR: + task->tk_status = -EIO; + return 0; } return 0; @@ -1293,35 +1637,51 @@ ff_layout_set_layoutcommit(struct inode *inode, (unsigned long long) NFS_I(inode)->layout->plh_lwb); } -static bool -ff_layout_device_unavailable(struct pnfs_layout_segment *lseg, int idx) -{ - /* No mirroring for now */ - struct nfs4_deviceid_node *node = FF_LAYOUT_DEVID_NODE(lseg, idx); - - return ff_layout_test_devid_unavailable(node); -} - static void ff_layout_read_record_layoutstats_start(struct rpc_task *task, struct nfs_pgio_header *hdr) { + struct nfs4_ff_layout_mirror *mirror; + u32 dss_id; + if (test_and_set_bit(NFS_IOHDR_STAT, &hdr->flags)) return; - nfs4_ff_layout_stat_io_start_read(hdr->inode, - FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx), - hdr->args.count, - task->tk_start); + + mirror = FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx); + dss_id = nfs4_ff_layout_calc_dss_id( + FF_LAYOUT_LSEG(hdr->lseg)->stripe_unit, + mirror->dss_count, + hdr->args.offset); + + nfs4_ff_layout_stat_io_start_read( + hdr->inode, + mirror, + dss_id, + hdr->args.count, + task->tk_start); } static void ff_layout_read_record_layoutstats_done(struct rpc_task *task, struct nfs_pgio_header *hdr) { + struct nfs4_ff_layout_mirror *mirror; + u32 dss_id; + if (!test_and_clear_bit(NFS_IOHDR_STAT, &hdr->flags)) return; - nfs4_ff_layout_stat_io_end_read(task, - FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx), - hdr->args.count, - hdr->res.count); + + mirror = FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx); + dss_id = nfs4_ff_layout_calc_dss_id( + FF_LAYOUT_LSEG(hdr->lseg)->stripe_unit, + mirror->dss_count, + hdr->args.offset); + + nfs4_ff_layout_stat_io_end_read( + task, + mirror, + dss_id, + hdr->args.count, + hdr->res.count); + set_bit(NFS_LSEG_LAYOUTRETURN, &hdr->lseg->pls_flags); } static int ff_layout_read_prepare_common(struct rpc_task *task, @@ -1331,8 +1691,9 @@ static int ff_layout_read_prepare_common(struct rpc_task *task, rpc_exit(task, -EIO); return -EIO; } - if (ff_layout_device_unavailable(hdr->lseg, hdr->pgio_mirror_idx)) { - rpc_exit(task, -EHOSTDOWN); + + if (!pnfs_is_valid_lseg(hdr->lseg)) { + rpc_exit(task, -EAGAIN); return -EAGAIN; } @@ -1365,20 +1726,13 @@ static void ff_layout_read_prepare_v4(struct rpc_task *task, void *data) task)) return; - if (ff_layout_read_prepare_common(task, hdr)) - return; - - if (nfs4_set_rw_stateid(&hdr->args.stateid, hdr->args.context, - hdr->args.lock_context, FMODE_READ) == -EIO) - rpc_exit(task, -EIO); /* lost lock, terminate I/O */ + ff_layout_read_prepare_common(task, hdr); } static void ff_layout_read_call_done(struct rpc_task *task, void *data) { struct nfs_pgio_header *hdr = data; - dprintk("--> %s task->tk_status %d\n", __func__, task->tk_status); - if (test_bit(NFS_IOHDR_REDO, &hdr->flags) && task->tk_status == 0) { nfs4_sequence_done(task, &hdr->res.seq_res); @@ -1403,6 +1757,10 @@ static void ff_layout_read_release(void *data) struct nfs_pgio_header *hdr = data; ff_layout_read_record_layoutstats_done(&hdr->task, hdr); + if (test_bit(NFS_IOHDR_RESEND_PNFS, &hdr->flags)) + ff_layout_resend_pnfs_read(hdr); + else if (test_bit(NFS_IOHDR_RESEND_MDS, &hdr->flags)) + ff_layout_reset_read(hdr); pnfs_generic_rw_release(data); } @@ -1410,28 +1768,44 @@ static void ff_layout_read_release(void *data) static int ff_layout_write_done_cb(struct rpc_task *task, struct nfs_pgio_header *hdr) { + struct nfs4_ff_layout_segment *flseg = FF_LAYOUT_LSEG(hdr->lseg); + u32 dss_id = nfs4_ff_layout_calc_dss_id( + flseg->stripe_unit, + flseg->mirror_array[hdr->pgio_mirror_idx]->dss_count, + hdr->args.offset); loff_t end_offs = 0; int err; - trace_nfs4_pnfs_write(hdr, task->tk_status); - if (task->tk_status < 0) - ff_layout_io_track_ds_error(hdr->lseg, hdr->pgio_mirror_idx, + if (task->tk_status < 0) { + ff_layout_io_track_ds_error(hdr->lseg, + hdr->pgio_mirror_idx, dss_id, hdr->args.offset, hdr->args.count, - hdr->res.op_status, OP_WRITE, + &hdr->res.op_status, OP_WRITE, task->tk_status); - err = ff_layout_async_handle_error(task, hdr->args.context->state, + trace_ff_layout_write_error(hdr, task->tk_status); + } + + err = ff_layout_async_handle_error(task, hdr->res.op_status, + hdr->args.context->state, hdr->ds_clp, hdr->lseg, - hdr->pgio_mirror_idx); + hdr->pgio_mirror_idx, + dss_id); + trace_nfs4_pnfs_write(hdr, err); + clear_bit(NFS_IOHDR_RESEND_PNFS, &hdr->flags); + clear_bit(NFS_IOHDR_RESEND_MDS, &hdr->flags); switch (err) { case -NFS4ERR_RESET_TO_PNFS: - ff_layout_reset_write(hdr, true); + set_bit(NFS_IOHDR_RESEND_PNFS, &hdr->flags); return task->tk_status; case -NFS4ERR_RESET_TO_MDS: - ff_layout_reset_write(hdr, false); + set_bit(NFS_IOHDR_RESEND_MDS, &hdr->flags); return task->tk_status; case -EAGAIN: return -EAGAIN; + case -NFS4ERR_FATAL_IOERROR: + task->tk_status = -EIO; + return 0; } if (hdr->res.verf->committed == NFS_FILE_SYNC || @@ -1453,16 +1827,22 @@ static int ff_layout_commit_done_cb(struct rpc_task *task, struct nfs_commit_data *data) { int err; + u32 idx = calc_mirror_idx_from_commit(data->lseg, data->ds_commit_index); + u32 dss_id = calc_dss_id_from_commit(data->lseg, data->ds_commit_index); - trace_nfs4_pnfs_commit_ds(data, task->tk_status); - if (task->tk_status < 0) - ff_layout_io_track_ds_error(data->lseg, data->ds_commit_index, + if (task->tk_status < 0) { + ff_layout_io_track_ds_error(data->lseg, idx, dss_id, data->args.offset, data->args.count, - data->res.op_status, OP_COMMIT, + &data->res.op_status, OP_COMMIT, task->tk_status); - err = ff_layout_async_handle_error(task, NULL, data->ds_clp, - data->lseg, data->ds_commit_index); + trace_ff_layout_commit_error(data, task->tk_status); + } + + err = ff_layout_async_handle_error(task, data->res.op_status, + NULL, data->ds_clp, data->lseg, idx, + dss_id); + trace_nfs4_pnfs_commit_ds(data, err); switch (err) { case -NFS4ERR_RESET_TO_PNFS: pnfs_generic_prepare_to_resend_writes(data); @@ -1473,33 +1853,61 @@ static int ff_layout_commit_done_cb(struct rpc_task *task, case -EAGAIN: rpc_restart_call_prepare(task); return -EAGAIN; + case -NFS4ERR_FATAL_IOERROR: + task->tk_status = -EIO; + return 0; } ff_layout_set_layoutcommit(data->inode, data->lseg, data->lwb); - return 0; } static void ff_layout_write_record_layoutstats_start(struct rpc_task *task, struct nfs_pgio_header *hdr) { + struct nfs4_ff_layout_mirror *mirror; + u32 dss_id; + if (test_and_set_bit(NFS_IOHDR_STAT, &hdr->flags)) return; - nfs4_ff_layout_stat_io_start_write(hdr->inode, - FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx), - hdr->args.count, - task->tk_start); + + mirror = FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx); + dss_id = nfs4_ff_layout_calc_dss_id( + FF_LAYOUT_LSEG(hdr->lseg)->stripe_unit, + mirror->dss_count, + hdr->args.offset); + + nfs4_ff_layout_stat_io_start_write( + hdr->inode, + mirror, + dss_id, + hdr->args.count, + task->tk_start); } static void ff_layout_write_record_layoutstats_done(struct rpc_task *task, struct nfs_pgio_header *hdr) { + struct nfs4_ff_layout_mirror *mirror; + u32 dss_id; + if (!test_and_clear_bit(NFS_IOHDR_STAT, &hdr->flags)) return; - nfs4_ff_layout_stat_io_end_write(task, - FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx), - hdr->args.count, hdr->res.count, - hdr->res.verf->committed); + + mirror = FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx); + dss_id = nfs4_ff_layout_calc_dss_id( + FF_LAYOUT_LSEG(hdr->lseg)->stripe_unit, + mirror->dss_count, + hdr->args.offset); + + nfs4_ff_layout_stat_io_end_write( + task, + mirror, + dss_id, + hdr->args.count, + hdr->res.count, + hdr->res.verf->committed); + set_bit(NFS_LSEG_LAYOUTRETURN, &hdr->lseg->pls_flags); } static int ff_layout_write_prepare_common(struct rpc_task *task, @@ -1510,8 +1918,8 @@ static int ff_layout_write_prepare_common(struct rpc_task *task, return -EIO; } - if (ff_layout_device_unavailable(hdr->lseg, hdr->pgio_mirror_idx)) { - rpc_exit(task, -EHOSTDOWN); + if (!pnfs_is_valid_lseg(hdr->lseg)) { + rpc_exit(task, -EAGAIN); return -EAGAIN; } @@ -1539,12 +1947,7 @@ static void ff_layout_write_prepare_v4(struct rpc_task *task, void *data) task)) return; - if (ff_layout_write_prepare_common(task, hdr)) - return; - - if (nfs4_set_rw_stateid(&hdr->args.stateid, hdr->args.context, - hdr->args.lock_context, FMODE_WRITE) == -EIO) - rpc_exit(task, -EIO); /* lost lock, terminate I/O */ + ff_layout_write_prepare_common(task, hdr); } static void ff_layout_write_call_done(struct rpc_task *task, void *data) @@ -1575,16 +1978,27 @@ static void ff_layout_write_release(void *data) struct nfs_pgio_header *hdr = data; ff_layout_write_record_layoutstats_done(&hdr->task, hdr); + if (test_bit(NFS_IOHDR_RESEND_PNFS, &hdr->flags)) { + ff_layout_send_layouterror(hdr->lseg); + ff_layout_reset_write(hdr, true); + } else if (test_bit(NFS_IOHDR_RESEND_MDS, &hdr->flags)) + ff_layout_reset_write(hdr, false); pnfs_generic_rw_release(data); } static void ff_layout_commit_record_layoutstats_start(struct rpc_task *task, struct nfs_commit_data *cdata) { + u32 idx, dss_id; + if (test_and_set_bit(NFS_IOHDR_STAT, &cdata->flags)) return; + + idx = calc_mirror_idx_from_commit(cdata->lseg, cdata->ds_commit_index); + dss_id = calc_dss_id_from_commit(cdata->lseg, cdata->ds_commit_index); nfs4_ff_layout_stat_io_start_write(cdata->inode, - FF_LAYOUT_COMP(cdata->lseg, cdata->ds_commit_index), + FF_LAYOUT_COMP(cdata->lseg, idx), + dss_id, 0, task->tk_start); } @@ -1593,6 +2007,7 @@ static void ff_layout_commit_record_layoutstats_done(struct rpc_task *task, { struct nfs_page *req; __u64 count = 0; + u32 idx, dss_id; if (!test_and_clear_bit(NFS_IOHDR_STAT, &cdata->flags)) return; @@ -1601,20 +2016,33 @@ static void ff_layout_commit_record_layoutstats_done(struct rpc_task *task, list_for_each_entry(req, &cdata->pages, wb_list) count += req->wb_bytes; } + + idx = calc_mirror_idx_from_commit(cdata->lseg, cdata->ds_commit_index); + dss_id = calc_dss_id_from_commit(cdata->lseg, cdata->ds_commit_index); nfs4_ff_layout_stat_io_end_write(task, - FF_LAYOUT_COMP(cdata->lseg, cdata->ds_commit_index), + FF_LAYOUT_COMP(cdata->lseg, idx), + dss_id, count, count, NFS_FILE_SYNC); + set_bit(NFS_LSEG_LAYOUTRETURN, &cdata->lseg->pls_flags); } -static void ff_layout_commit_prepare_common(struct rpc_task *task, - struct nfs_commit_data *cdata) +static int ff_layout_commit_prepare_common(struct rpc_task *task, + struct nfs_commit_data *cdata) { + if (!pnfs_is_valid_lseg(cdata->lseg)) { + rpc_exit(task, -EAGAIN); + return -EAGAIN; + } + ff_layout_commit_record_layoutstats_start(task, cdata); + return 0; } static void ff_layout_commit_prepare_v3(struct rpc_task *task, void *data) { - ff_layout_commit_prepare_common(task, data); + if (ff_layout_commit_prepare_common(task, data)) + return; + rpc_call_start(task); } @@ -1700,40 +2128,54 @@ ff_layout_read_pagelist(struct nfs_pgio_header *hdr) struct pnfs_layout_segment *lseg = hdr->lseg; struct nfs4_pnfs_ds *ds; struct rpc_clnt *ds_clnt; - struct rpc_cred *ds_cred; + struct nfsd_file *localio; + struct nfs4_ff_layout_mirror *mirror; + const struct cred *ds_cred; loff_t offset = hdr->args.offset; u32 idx = hdr->pgio_mirror_idx; int vers; struct nfs_fh *fh; + u32 dss_id; + bool ds_fatal_error = false; dprintk("--> %s ino %lu pgbase %u req %zu@%llu\n", __func__, hdr->inode->i_ino, hdr->args.pgbase, (size_t)hdr->args.count, offset); - ds = nfs4_ff_layout_prepare_ds(lseg, idx, false); - if (!ds) + mirror = FF_LAYOUT_COMP(lseg, idx); + dss_id = nfs4_ff_layout_calc_dss_id( + FF_LAYOUT_LSEG(lseg)->stripe_unit, + mirror->dss_count, + offset); + ds = nfs4_ff_layout_prepare_ds(lseg, mirror, dss_id, false); + if (IS_ERR(ds)) { + ds_fatal_error = nfs_error_is_fatal(PTR_ERR(ds)); goto out_failed; + } - ds_clnt = nfs4_ff_find_or_create_ds_client(lseg, idx, ds->ds_clp, - hdr->inode); + ds_clnt = nfs4_ff_find_or_create_ds_client(mirror, ds->ds_clp, + hdr->inode, dss_id); if (IS_ERR(ds_clnt)) goto out_failed; - ds_cred = ff_layout_get_ds_cred(lseg, idx, hdr->cred); + ds_cred = ff_layout_get_ds_cred(mirror, &lseg->pls_range, hdr->cred, dss_id); if (!ds_cred) goto out_failed; - vers = nfs4_ff_layout_ds_version(lseg, idx); + vers = nfs4_ff_layout_ds_version(mirror, dss_id); dprintk("%s USE DS: %s cl_count %d vers %d\n", __func__, - ds->ds_remotestr, atomic_read(&ds->ds_clp->cl_count), vers); + ds->ds_remotestr, refcount_read(&ds->ds_clp->cl_count), vers); hdr->pgio_done_cb = ff_layout_read_done_cb; - atomic_inc(&ds->ds_clp->cl_count); + refcount_inc(&ds->ds_clp->cl_count); hdr->ds_clp = ds->ds_clp; - fh = nfs4_ff_layout_select_ds_fh(lseg, idx); + fh = nfs4_ff_layout_select_ds_fh(mirror, dss_id); if (fh) hdr->args.fh = fh; + + nfs4_ff_layout_select_ds_stateid(mirror, dss_id, &hdr->args.stateid); + /* * Note that if we ever decide to split across DSes, * then we may need to handle dense-like offsets. @@ -1741,17 +2183,28 @@ ff_layout_read_pagelist(struct nfs_pgio_header *hdr) hdr->args.offset = offset; hdr->mds_offset = offset; + /* Start IO accounting for local read */ + localio = ff_local_open_fh(lseg, idx, dss_id, ds->ds_clp, ds_cred, fh, + FMODE_READ); + if (localio) { + hdr->task.tk_start = ktime_get(); + ff_layout_read_record_layoutstats_start(&hdr->task, hdr); + } + /* Perform an asynchronous read to ds */ nfs_initiate_pgio(ds_clnt, hdr, ds_cred, ds->ds_clp->rpc_ops, vers == 3 ? &ff_layout_read_call_ops_v3 : &ff_layout_read_call_ops_v4, - 0, RPC_TASK_SOFTCONN); - put_rpccred(ds_cred); + 0, RPC_TASK_SOFTCONN, localio); + put_cred(ds_cred); return PNFS_ATTEMPTED; out_failed: - if (ff_layout_avoid_mds_available_ds(lseg)) + if (ff_layout_avoid_mds_available_ds(lseg) && !ds_fatal_error) return PNFS_TRY_AGAIN; + trace_pnfs_mds_fallback_read_pagelist(hdr->inode, + hdr->args.offset, hdr->args.count, + IOMODE_READ, NFS_I(hdr->inode)->layout, lseg); return PNFS_NOT_ATTEMPTED; } @@ -1762,74 +2215,93 @@ ff_layout_write_pagelist(struct nfs_pgio_header *hdr, int sync) struct pnfs_layout_segment *lseg = hdr->lseg; struct nfs4_pnfs_ds *ds; struct rpc_clnt *ds_clnt; - struct rpc_cred *ds_cred; + struct nfsd_file *localio; + struct nfs4_ff_layout_mirror *mirror; + const struct cred *ds_cred; loff_t offset = hdr->args.offset; int vers; struct nfs_fh *fh; - int idx = hdr->pgio_mirror_idx; + u32 idx = hdr->pgio_mirror_idx; + u32 dss_id; + bool ds_fatal_error = false; - ds = nfs4_ff_layout_prepare_ds(lseg, idx, true); - if (!ds) + mirror = FF_LAYOUT_COMP(lseg, idx); + dss_id = nfs4_ff_layout_calc_dss_id( + FF_LAYOUT_LSEG(lseg)->stripe_unit, + mirror->dss_count, + offset); + ds = nfs4_ff_layout_prepare_ds(lseg, mirror, dss_id, true); + if (IS_ERR(ds)) { + ds_fatal_error = nfs_error_is_fatal(PTR_ERR(ds)); goto out_failed; + } - ds_clnt = nfs4_ff_find_or_create_ds_client(lseg, idx, ds->ds_clp, - hdr->inode); + ds_clnt = nfs4_ff_find_or_create_ds_client(mirror, ds->ds_clp, + hdr->inode, dss_id); if (IS_ERR(ds_clnt)) goto out_failed; - ds_cred = ff_layout_get_ds_cred(lseg, idx, hdr->cred); + ds_cred = ff_layout_get_ds_cred(mirror, &lseg->pls_range, hdr->cred, dss_id); if (!ds_cred) goto out_failed; - vers = nfs4_ff_layout_ds_version(lseg, idx); + vers = nfs4_ff_layout_ds_version(mirror, dss_id); dprintk("%s ino %lu sync %d req %zu@%llu DS: %s cl_count %d vers %d\n", __func__, hdr->inode->i_ino, sync, (size_t) hdr->args.count, - offset, ds->ds_remotestr, atomic_read(&ds->ds_clp->cl_count), + offset, ds->ds_remotestr, refcount_read(&ds->ds_clp->cl_count), vers); hdr->pgio_done_cb = ff_layout_write_done_cb; - atomic_inc(&ds->ds_clp->cl_count); + refcount_inc(&ds->ds_clp->cl_count); hdr->ds_clp = ds->ds_clp; - hdr->ds_commit_idx = idx; - fh = nfs4_ff_layout_select_ds_fh(lseg, idx); + hdr->ds_commit_idx = calc_commit_idx(lseg, idx, dss_id); + fh = nfs4_ff_layout_select_ds_fh(mirror, dss_id); if (fh) hdr->args.fh = fh; + nfs4_ff_layout_select_ds_stateid(mirror, dss_id, &hdr->args.stateid); + /* * Note that if we ever decide to split across DSes, * then we may need to handle dense-like offsets. */ hdr->args.offset = offset; + /* Start IO accounting for local write */ + localio = ff_local_open_fh(lseg, idx, dss_id, ds->ds_clp, ds_cred, fh, + FMODE_READ|FMODE_WRITE); + if (localio) { + hdr->task.tk_start = ktime_get(); + ff_layout_write_record_layoutstats_start(&hdr->task, hdr); + } + /* Perform an asynchronous write */ nfs_initiate_pgio(ds_clnt, hdr, ds_cred, ds->ds_clp->rpc_ops, vers == 3 ? &ff_layout_write_call_ops_v3 : &ff_layout_write_call_ops_v4, - sync, RPC_TASK_SOFTCONN); - put_rpccred(ds_cred); + sync, RPC_TASK_SOFTCONN, localio); + put_cred(ds_cred); return PNFS_ATTEMPTED; out_failed: - if (ff_layout_avoid_mds_available_ds(lseg)) + if (ff_layout_avoid_mds_available_ds(lseg) && !ds_fatal_error) return PNFS_TRY_AGAIN; + trace_pnfs_mds_fallback_write_pagelist(hdr->inode, + hdr->args.offset, hdr->args.count, + IOMODE_RW, NFS_I(hdr->inode)->layout, lseg); return PNFS_NOT_ATTEMPTED; } -static u32 calc_ds_index_from_commit(struct pnfs_layout_segment *lseg, u32 i) -{ - return i; -} - static struct nfs_fh * -select_ds_fh_from_commit(struct pnfs_layout_segment *lseg, u32 i) +select_ds_fh_from_commit(struct pnfs_layout_segment *lseg, u32 i, u32 dss_id) { struct nfs4_ff_layout_segment *flseg = FF_LAYOUT_LSEG(lseg); /* FIXME: Assume that there is only one NFS version available * for the DS. */ - return &flseg->mirror_array[i]->fh_versions[0]; + return &flseg->mirror_array[i]->dss[dss_id].fh_versions[0]; } static int ff_layout_initiate_commit(struct nfs_commit_data *data, int how) @@ -1837,8 +2309,10 @@ static int ff_layout_initiate_commit(struct nfs_commit_data *data, int how) struct pnfs_layout_segment *lseg = data->lseg; struct nfs4_pnfs_ds *ds; struct rpc_clnt *ds_clnt; - struct rpc_cred *ds_cred; - u32 idx; + struct nfsd_file *localio; + struct nfs4_ff_layout_mirror *mirror; + const struct cred *ds_cred; + u32 idx, dss_id; int vers, ret; struct nfs_fh *fh; @@ -1846,38 +2320,48 @@ static int ff_layout_initiate_commit(struct nfs_commit_data *data, int how) test_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags))) goto out_err; - idx = calc_ds_index_from_commit(lseg, data->ds_commit_index); - ds = nfs4_ff_layout_prepare_ds(lseg, idx, true); - if (!ds) + idx = calc_mirror_idx_from_commit(lseg, data->ds_commit_index); + mirror = FF_LAYOUT_COMP(lseg, idx); + dss_id = calc_dss_id_from_commit(lseg, data->ds_commit_index); + ds = nfs4_ff_layout_prepare_ds(lseg, mirror, dss_id, true); + if (IS_ERR(ds)) goto out_err; - ds_clnt = nfs4_ff_find_or_create_ds_client(lseg, idx, ds->ds_clp, - data->inode); + ds_clnt = nfs4_ff_find_or_create_ds_client(mirror, ds->ds_clp, + data->inode, dss_id); if (IS_ERR(ds_clnt)) goto out_err; - ds_cred = ff_layout_get_ds_cred(lseg, idx, data->cred); + ds_cred = ff_layout_get_ds_cred(mirror, &lseg->pls_range, data->cred, dss_id); if (!ds_cred) goto out_err; - vers = nfs4_ff_layout_ds_version(lseg, idx); + vers = nfs4_ff_layout_ds_version(mirror, dss_id); dprintk("%s ino %lu, how %d cl_count %d vers %d\n", __func__, - data->inode->i_ino, how, atomic_read(&ds->ds_clp->cl_count), + data->inode->i_ino, how, refcount_read(&ds->ds_clp->cl_count), vers); data->commit_done_cb = ff_layout_commit_done_cb; data->cred = ds_cred; - atomic_inc(&ds->ds_clp->cl_count); + refcount_inc(&ds->ds_clp->cl_count); data->ds_clp = ds->ds_clp; - fh = select_ds_fh_from_commit(lseg, data->ds_commit_index); + fh = select_ds_fh_from_commit(lseg, idx, dss_id); if (fh) data->args.fh = fh; + /* Start IO accounting for local commit */ + localio = ff_local_open_fh(lseg, idx, dss_id, ds->ds_clp, ds_cred, fh, + FMODE_READ|FMODE_WRITE); + if (localio) { + data->task.tk_start = ktime_get(); + ff_layout_commit_record_layoutstats_start(&data->task, data); + } + ret = nfs_initiate_commit(ds_clnt, data, ds->ds_clp->rpc_ops, vers == 3 ? &ff_layout_commit_call_ops_v3 : &ff_layout_commit_call_ops_v4, - how, RPC_TASK_SOFTCONN); - put_rpccred(ds_cred); + how, RPC_TASK_SOFTCONN, localio); + put_cred(ds_cred); return ret; out_err: pnfs_generic_prepare_to_resend_writes(data); @@ -1893,6 +2377,68 @@ ff_layout_commit_pagelist(struct inode *inode, struct list_head *mds_pages, ff_layout_initiate_commit); } +static bool ff_layout_match_rw(const struct rpc_task *task, + const struct nfs_pgio_header *hdr, + const struct pnfs_layout_segment *lseg) +{ + return hdr->lseg == lseg; +} + +static bool ff_layout_match_commit(const struct rpc_task *task, + const struct nfs_commit_data *cdata, + const struct pnfs_layout_segment *lseg) +{ + return cdata->lseg == lseg; +} + +static bool ff_layout_match_io(const struct rpc_task *task, const void *data) +{ + const struct rpc_call_ops *ops = task->tk_ops; + + if (ops == &ff_layout_read_call_ops_v3 || + ops == &ff_layout_read_call_ops_v4 || + ops == &ff_layout_write_call_ops_v3 || + ops == &ff_layout_write_call_ops_v4) + return ff_layout_match_rw(task, task->tk_calldata, data); + if (ops == &ff_layout_commit_call_ops_v3 || + ops == &ff_layout_commit_call_ops_v4) + return ff_layout_match_commit(task, task->tk_calldata, data); + return false; +} + +static void ff_layout_cancel_io(struct pnfs_layout_segment *lseg) +{ + struct nfs4_ff_layout_segment *flseg = FF_LAYOUT_LSEG(lseg); + struct nfs4_ff_layout_mirror *mirror; + struct nfs4_ff_layout_ds *mirror_ds; + struct nfs4_pnfs_ds *ds; + struct nfs_client *ds_clp; + struct rpc_clnt *clnt; + u32 idx, dss_id; + + for (idx = 0; idx < flseg->mirror_array_cnt; idx++) { + mirror = flseg->mirror_array[idx]; + for (dss_id = 0; dss_id < mirror->dss_count; dss_id++) { + mirror_ds = mirror->dss[dss_id].mirror_ds; + if (IS_ERR_OR_NULL(mirror_ds)) + continue; + ds = mirror->dss[dss_id].mirror_ds->ds; + if (!ds) + continue; + ds_clp = ds->ds_clp; + if (!ds_clp) + continue; + clnt = ds_clp->cl_rpcclient; + if (!clnt) + continue; + if (!rpc_cancel_tasks(clnt, -EAGAIN, + ff_layout_match_io, lseg)) + continue; + rpc_clnt_disconnect(clnt); + } + } +} + static struct pnfs_ds_commit_info * ff_layout_get_ds_info(struct inode *inode) { @@ -1905,6 +2451,35 @@ ff_layout_get_ds_info(struct inode *inode) } static void +ff_layout_setup_ds_info(struct pnfs_ds_commit_info *fl_cinfo, + struct pnfs_layout_segment *lseg) +{ + struct nfs4_ff_layout_segment *flseg = FF_LAYOUT_LSEG(lseg); + struct inode *inode = lseg->pls_layout->plh_inode; + struct pnfs_commit_array *array, *new; + u32 size = flseg->mirror_array_cnt * flseg->mirror_array[0]->dss_count; + + new = pnfs_alloc_commit_array(size, + nfs_io_gfp_mask()); + if (new) { + spin_lock(&inode->i_lock); + array = pnfs_add_commit_array(fl_cinfo, new, lseg); + spin_unlock(&inode->i_lock); + if (array != new) + pnfs_free_commit_array(new); + } +} + +static void +ff_layout_release_ds_info(struct pnfs_ds_commit_info *fl_cinfo, + struct inode *inode) +{ + spin_lock(&inode->i_lock); + pnfs_generic_ds_cinfo_destroy(fl_cinfo); + spin_unlock(&inode->i_lock); +} + +static void ff_layout_free_deviceid_node(struct nfs4_deviceid_node *d) { nfs4_ff_layout_free_deviceid(container_of(d, struct nfs4_ff_layout_ds, @@ -1927,12 +2502,6 @@ static int ff_layout_encode_ioerr(struct xdr_stream *xdr, } static void -encode_opaque_fixed(struct xdr_stream *xdr, const void *buf, size_t len) -{ - WARN_ON_ONCE(xdr_stream_encode_opaque_fixed(xdr, buf, len) < 0); -} - -static void ff_layout_encode_ff_iostat_head(struct xdr_stream *xdr, const nfs4_stateid *stateid, const struct nfs42_layoutstat_devinfo *devinfo) @@ -2024,7 +2593,7 @@ ff_layout_encode_layoutreturn(struct xdr_stream *xdr, dprintk("%s: Begin\n", __func__); - xdr_init_encode(&tmp_xdr, &tmp_buf, NULL); + xdr_init_encode(&tmp_xdr, &tmp_buf, NULL, NULL); ff_layout_encode_ioerr(&tmp_xdr, args, ff_args); ff_layout_encode_iostats_array(&tmp_xdr, args, ff_args); @@ -2064,10 +2633,10 @@ ff_layout_prepare_layoutreturn(struct nfs4_layoutreturn_args *args) struct nfs4_flexfile_layoutreturn_args *ff_args; struct nfs4_flexfile_layout *ff_layout = FF_LAYOUT_FROM_HDR(args->layout); - ff_args = kmalloc(sizeof(*ff_args), GFP_KERNEL); + ff_args = kmalloc(sizeof(*ff_args), nfs_io_gfp_mask()); if (!ff_args) goto out_nomem; - ff_args->pages[0] = alloc_page(GFP_KERNEL); + ff_args->pages[0] = alloc_page(nfs_io_gfp_mask()); if (!ff_args->pages[0]) goto out_nomem_free; @@ -2077,8 +2646,9 @@ ff_layout_prepare_layoutreturn(struct nfs4_layoutreturn_args *args) FF_LAYOUTRETURN_MAXERR); spin_lock(&args->inode->i_lock); - ff_args->num_dev = ff_layout_mirror_prepare_stats(&ff_layout->generic_hdr, - &ff_args->devinfo[0], ARRAY_SIZE(ff_args->devinfo)); + ff_args->num_dev = ff_layout_mirror_prepare_stats( + &ff_layout->generic_hdr, &ff_args->devinfo[0], + ARRAY_SIZE(ff_args->devinfo), NFS4_FF_OP_LAYOUTRETURN); spin_unlock(&args->inode->i_lock); args->ld_private->ops = &layoutreturn_ops; @@ -2090,6 +2660,52 @@ out_nomem: return -ENOMEM; } +#ifdef CONFIG_NFS_V4_2 +void +ff_layout_send_layouterror(struct pnfs_layout_segment *lseg) +{ + struct pnfs_layout_hdr *lo = lseg->pls_layout; + struct nfs42_layout_error *errors; + LIST_HEAD(head); + + if (!nfs_server_capable(lo->plh_inode, NFS_CAP_LAYOUTERROR)) + return; + ff_layout_fetch_ds_ioerr(lo, &lseg->pls_range, &head, -1); + if (list_empty(&head)) + return; + + errors = kmalloc_array(NFS42_LAYOUTERROR_MAX, sizeof(*errors), + nfs_io_gfp_mask()); + if (errors != NULL) { + const struct nfs4_ff_layout_ds_err *pos; + size_t n = 0; + + list_for_each_entry(pos, &head, list) { + errors[n].offset = pos->offset; + errors[n].length = pos->length; + nfs4_stateid_copy(&errors[n].stateid, &pos->stateid); + errors[n].errors[0].dev_id = pos->deviceid; + errors[n].errors[0].status = pos->status; + errors[n].errors[0].opnum = pos->opnum; + n++; + if (!list_is_last(&pos->list, &head) && + n < NFS42_LAYOUTERROR_MAX) + continue; + if (nfs42_proc_layouterror(lseg, errors, n) < 0) + break; + n = 0; + } + kfree(errors); + } + ff_layout_free_ds_ioerr(&head); +} +#else +void +ff_layout_send_layouterror(struct pnfs_layout_segment *lseg) +{ +} +#endif + static int ff_layout_ntop4(const struct sockaddr *sap, char *buf, const size_t buflen) { @@ -2144,7 +2760,6 @@ ff_layout_encode_netaddr(struct xdr_stream *xdr, struct nfs4_pnfs_ds_addr *da) struct sockaddr *sap = (struct sockaddr *)&da->da_addr; char portbuf[RPCBIND_MAXUADDRPLEN]; char addrbuf[RPCBIND_MAXUADDRLEN]; - char *netid; unsigned short port; int len, netid_len; __be32 *p; @@ -2154,18 +2769,13 @@ ff_layout_encode_netaddr(struct xdr_stream *xdr, struct nfs4_pnfs_ds_addr *da) if (ff_layout_ntop4(sap, addrbuf, sizeof(addrbuf)) == 0) return; port = ntohs(((struct sockaddr_in *)sap)->sin_port); - netid = "tcp"; - netid_len = 3; break; case AF_INET6: if (ff_layout_ntop6_noscopeid(sap, addrbuf, sizeof(addrbuf)) == 0) return; port = ntohs(((struct sockaddr_in6 *)sap)->sin6_port); - netid = "tcp6"; - netid_len = 4; break; default: - /* we only support tcp and tcp6 */ WARN_ON_ONCE(1); return; } @@ -2173,8 +2783,9 @@ ff_layout_encode_netaddr(struct xdr_stream *xdr, struct nfs4_pnfs_ds_addr *da) snprintf(portbuf, sizeof(portbuf), ".%u.%u", port >> 8, port & 0xff); len = strlcat(addrbuf, portbuf, sizeof(addrbuf)); + netid_len = strlen(da->da_netid); p = xdr_reserve_space(xdr, 4 + netid_len); - xdr_encode_opaque(p, netid, netid_len); + xdr_encode_opaque(p, da->da_netid, netid_len); p = xdr_reserve_space(xdr, 4 + len); xdr_encode_opaque(p, addrbuf, len); @@ -2212,11 +2823,11 @@ ff_layout_encode_io_latency(struct xdr_stream *xdr, static void ff_layout_encode_ff_layoutupdate(struct xdr_stream *xdr, const struct nfs42_layoutstat_devinfo *devinfo, - struct nfs4_ff_layout_mirror *mirror) + struct nfs4_ff_layout_ds_stripe *dss_info) { struct nfs4_pnfs_ds_addr *da; - struct nfs4_pnfs_ds *ds = mirror->mirror_ds->ds; - struct nfs_fh *fh = &mirror->fh_versions[0]; + struct nfs4_pnfs_ds *ds = dss_info->mirror_ds->ds; + struct nfs_fh *fh = &dss_info->fh_versions[0]; __be32 *p; da = list_first_entry(&ds->ds_addrs, struct nfs4_pnfs_ds_addr, da_node); @@ -2228,13 +2839,17 @@ ff_layout_encode_ff_layoutupdate(struct xdr_stream *xdr, p = xdr_reserve_space(xdr, 4 + fh->size); xdr_encode_opaque(p, fh->data, fh->size); /* ff_io_latency4 read */ - spin_lock(&mirror->lock); - ff_layout_encode_io_latency(xdr, &mirror->read_stat.io_stat); + spin_lock(&dss_info->mirror->lock); + ff_layout_encode_io_latency(xdr, + &dss_info->read_stat.io_stat); /* ff_io_latency4 write */ - ff_layout_encode_io_latency(xdr, &mirror->write_stat.io_stat); - spin_unlock(&mirror->lock); + ff_layout_encode_io_latency(xdr, + &dss_info->write_stat.io_stat); + spin_unlock(&dss_info->mirror->lock); /* nfstime4 */ - ff_layout_encode_nfstime(xdr, ktime_sub(ktime_get(), mirror->start_time)); + ff_layout_encode_nfstime(xdr, + ktime_sub(ktime_get(), + dss_info->start_time)); /* bool */ p = xdr_reserve_space(xdr, 4); *p = cpu_to_be32(false); @@ -2258,7 +2873,8 @@ ff_layout_encode_layoutstats(struct xdr_stream *xdr, const void *args, static void ff_layout_free_layoutstats(struct nfs4_xdr_opaque_data *opaque) { - struct nfs4_ff_layout_mirror *mirror = opaque->data; + struct nfs4_ff_layout_ds_stripe *dss_info = opaque->data; + struct nfs4_ff_layout_mirror *mirror = dss_info->mirror; ff_layout_put_mirror(mirror); } @@ -2271,58 +2887,76 @@ static const struct nfs4_xdr_opaque_ops layoutstat_ops = { static int ff_layout_mirror_prepare_stats(struct pnfs_layout_hdr *lo, struct nfs42_layoutstat_devinfo *devinfo, - int dev_limit) + int dev_limit, enum nfs4_ff_op_type type) { struct nfs4_flexfile_layout *ff_layout = FF_LAYOUT_FROM_HDR(lo); struct nfs4_ff_layout_mirror *mirror; + struct nfs4_ff_layout_ds_stripe *dss_info; struct nfs4_deviceid_node *dev; - int i = 0; + int i = 0, dss_id; list_for_each_entry(mirror, &ff_layout->mirrors, mirrors) { - if (i >= dev_limit) - break; - if (IS_ERR_OR_NULL(mirror->mirror_ds)) - continue; - if (!test_and_clear_bit(NFS4_FF_MIRROR_STAT_AVAIL, &mirror->flags)) - continue; - /* mirror refcount put in cleanup_layoutstats */ - if (!atomic_inc_not_zero(&mirror->ref)) - continue; - dev = &mirror->mirror_ds->id_node; - memcpy(&devinfo->dev_id, &dev->deviceid, NFS4_DEVICEID4_SIZE); - devinfo->offset = 0; - devinfo->length = NFS4_MAX_UINT64; - spin_lock(&mirror->lock); - devinfo->read_count = mirror->read_stat.io_stat.ops_completed; - devinfo->read_bytes = mirror->read_stat.io_stat.bytes_completed; - devinfo->write_count = mirror->write_stat.io_stat.ops_completed; - devinfo->write_bytes = mirror->write_stat.io_stat.bytes_completed; - spin_unlock(&mirror->lock); - devinfo->layout_type = LAYOUT_FLEX_FILES; - devinfo->ld_private.ops = &layoutstat_ops; - devinfo->ld_private.data = mirror; - - devinfo++; - i++; + for (dss_id = 0; dss_id < mirror->dss_count; ++dss_id) { + dss_info = &mirror->dss[dss_id]; + if (i >= dev_limit) + break; + if (IS_ERR_OR_NULL(dss_info->mirror_ds)) + continue; + if (!test_and_clear_bit(NFS4_FF_MIRROR_STAT_AVAIL, + &mirror->flags) && + type != NFS4_FF_OP_LAYOUTRETURN) + continue; + /* mirror refcount put in cleanup_layoutstats */ + if (!refcount_inc_not_zero(&mirror->ref)) + continue; + dev = &dss_info->mirror_ds->id_node; + memcpy(&devinfo->dev_id, + &dev->deviceid, + NFS4_DEVICEID4_SIZE); + devinfo->offset = 0; + devinfo->length = NFS4_MAX_UINT64; + spin_lock(&mirror->lock); + devinfo->read_count = + dss_info->read_stat.io_stat.ops_completed; + devinfo->read_bytes = + dss_info->read_stat.io_stat.bytes_completed; + devinfo->write_count = + dss_info->write_stat.io_stat.ops_completed; + devinfo->write_bytes = + dss_info->write_stat.io_stat.bytes_completed; + spin_unlock(&mirror->lock); + devinfo->layout_type = LAYOUT_FLEX_FILES; + devinfo->ld_private.ops = &layoutstat_ops; + devinfo->ld_private.data = &mirror->dss[dss_id]; + + devinfo++; + i++; + } } return i; } -static int -ff_layout_prepare_layoutstats(struct nfs42_layoutstat_args *args) +static int ff_layout_prepare_layoutstats(struct nfs42_layoutstat_args *args) { + struct pnfs_layout_hdr *lo; struct nfs4_flexfile_layout *ff_layout; const int dev_count = PNFS_LAYOUTSTATS_MAXDEV; /* For now, send at most PNFS_LAYOUTSTATS_MAXDEV statistics */ - args->devinfo = kmalloc_array(dev_count, sizeof(*args->devinfo), GFP_NOIO); + args->devinfo = kmalloc_array(dev_count, sizeof(*args->devinfo), + nfs_io_gfp_mask()); if (!args->devinfo) return -ENOMEM; spin_lock(&args->inode->i_lock); - ff_layout = FF_LAYOUT_FROM_HDR(NFS_I(args->inode)->layout); - args->num_dev = ff_layout_mirror_prepare_stats(&ff_layout->generic_hdr, - &args->devinfo[0], dev_count); + lo = NFS_I(args->inode)->layout; + if (lo && pnfs_layout_is_valid(lo)) { + ff_layout = FF_LAYOUT_FROM_HDR(lo); + args->num_dev = ff_layout_mirror_prepare_stats( + &ff_layout->generic_hdr, &args->devinfo[0], dev_count, + NFS4_FF_OP_LAYOUTSTATS); + } else + args->num_dev = 0; spin_unlock(&args->inode->i_lock); if (!args->num_dev) { kfree(args->devinfo); @@ -2338,15 +2972,27 @@ ff_layout_set_layoutdriver(struct nfs_server *server, const struct nfs_fh *dummy) { #if IS_ENABLED(CONFIG_NFS_V4_2) - server->caps |= NFS_CAP_LAYOUTSTATS; + server->caps |= NFS_CAP_LAYOUTSTATS | NFS_CAP_REBOOT_LAYOUTRETURN; #endif return 0; } +static const struct pnfs_commit_ops ff_layout_commit_ops = { + .setup_ds_info = ff_layout_setup_ds_info, + .release_ds_info = ff_layout_release_ds_info, + .mark_request_commit = pnfs_layout_mark_request_commit, + .clear_request_commit = pnfs_generic_clear_request_commit, + .scan_commit_lists = pnfs_generic_scan_commit_lists, + .recover_commit_reqs = pnfs_generic_recover_commit_reqs, + .commit_pagelist = ff_layout_commit_pagelist, +}; + static struct pnfs_layoutdriver_type flexfilelayout_type = { .id = LAYOUT_FLEX_FILES, .name = "LAYOUT_FLEX_FILES", .owner = THIS_MODULE, + .flags = PNFS_LAYOUTGET_ON_OPEN, + .max_layoutget_response = 4096, /* 1 page or so... */ .set_layoutdriver = ff_layout_set_layoutdriver, .alloc_layout_hdr = ff_layout_alloc_layout_hdr, .free_layout_hdr = ff_layout_free_layout_hdr, @@ -2357,28 +3003,19 @@ static struct pnfs_layoutdriver_type flexfilelayout_type = { .pg_write_ops = &ff_layout_pg_write_ops, .get_ds_info = ff_layout_get_ds_info, .free_deviceid_node = ff_layout_free_deviceid_node, - .mark_request_commit = pnfs_layout_mark_request_commit, - .clear_request_commit = pnfs_generic_clear_request_commit, - .scan_commit_lists = pnfs_generic_scan_commit_lists, - .recover_commit_reqs = pnfs_generic_recover_commit_reqs, - .commit_pagelist = ff_layout_commit_pagelist, .read_pagelist = ff_layout_read_pagelist, .write_pagelist = ff_layout_write_pagelist, .alloc_deviceid_node = ff_layout_alloc_deviceid_node, .prepare_layoutreturn = ff_layout_prepare_layoutreturn, .sync = pnfs_nfs_generic_sync, .prepare_layoutstats = ff_layout_prepare_layoutstats, + .cancel_io = ff_layout_cancel_io, }; static int __init nfs4flexfilelayout_init(void) { printk(KERN_INFO "%s: NFSv4 Flexfile Layout Driver Registering...\n", __func__); - if (!ff_zero_group) { - ff_zero_group = groups_alloc(0); - if (!ff_zero_group) - return -ENOMEM; - } return pnfs_register_layoutdriver(&flexfilelayout_type); } @@ -2387,10 +3024,6 @@ static void __exit nfs4flexfilelayout_exit(void) printk(KERN_INFO "%s: NFSv4 Flexfile Layout Driver Unregistering...\n", __func__); pnfs_unregister_layoutdriver(&flexfilelayout_type); - if (ff_zero_group) { - put_group_info(ff_zero_group); - ff_zero_group = NULL; - } } MODULE_ALIAS("nfs-layouttype4-4"); @@ -2400,3 +3033,7 @@ MODULE_DESCRIPTION("The NFSv4 flexfile layout driver"); module_init(nfs4flexfilelayout_init); module_exit(nfs4flexfilelayout_exit); + +module_param(io_maxretrans, ushort, 0644); +MODULE_PARM_DESC(io_maxretrans, "The number of times the NFSv4.1 client " + "retries an I/O request before returning an error. "); diff --git a/fs/nfs/flexfilelayout/flexfilelayout.h b/fs/nfs/flexfilelayout/flexfilelayout.h index 98b34c9b0564..17a008c8e97c 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.h +++ b/fs/nfs/flexfilelayout/flexfilelayout.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * NFSv4 flexfile layout driver data structures. * @@ -13,12 +14,15 @@ #define FF_FLAGS_NO_IO_THRU_MDS 2 #define FF_FLAGS_NO_READ_IO 4 +#include <linux/refcount.h> #include "../pnfs.h" /* XXX: Let's filter out insanely large mirror count for now to avoid oom * due to network error etc. */ #define NFS4_FLEXFILE_LAYOUT_MAX_MIRROR_CNT 4096 +#define NFS4_FLEXFILE_LAYOUT_MAX_STRIPE_CNT 4096 + /* LAYOUTSTATS report interval in ms */ #define FF_LAYOUTSTATS_REPORT_INTERVAL (60000L) #define FF_LAYOUTSTATS_MAXDEV 4 @@ -69,24 +73,32 @@ struct nfs4_ff_layoutstat { struct nfs4_ff_busy_timer busy_timer; }; -struct nfs4_ff_layout_mirror { - struct pnfs_layout_hdr *layout; - struct list_head mirrors; - u32 ds_count; - u32 efficiency; +struct nfs4_ff_layout_mirror; + +struct nfs4_ff_layout_ds_stripe { + struct nfs4_ff_layout_mirror *mirror; struct nfs4_deviceid devid; + u32 efficiency; struct nfs4_ff_layout_ds *mirror_ds; u32 fh_versions_cnt; struct nfs_fh *fh_versions; nfs4_stateid stateid; - struct rpc_cred __rcu *ro_cred; - struct rpc_cred __rcu *rw_cred; - atomic_t ref; - spinlock_t lock; - unsigned long flags; + const struct cred __rcu *ro_cred; + const struct cred __rcu *rw_cred; + struct nfs_file_localio nfl; struct nfs4_ff_layoutstat read_stat; struct nfs4_ff_layoutstat write_stat; ktime_t start_time; +}; + +struct nfs4_ff_layout_mirror { + struct pnfs_layout_hdr *layout; + struct list_head mirrors; + u32 dss_count; + struct nfs4_ff_layout_ds_stripe *dss; + refcount_t ref; + spinlock_t lock; + unsigned long flags; u32 report_interval; }; @@ -97,7 +109,7 @@ struct nfs4_ff_layout_segment { u64 stripe_unit; u32 flags; u32 mirror_array_cnt; - struct nfs4_ff_layout_mirror **mirror_array; + struct nfs4_ff_layout_mirror *mirror_array[] __counted_by(mirror_array_cnt); }; struct nfs4_flexfile_layout { @@ -130,16 +142,6 @@ FF_LAYOUT_LSEG(struct pnfs_layout_segment *lseg) generic_hdr); } -static inline struct nfs4_deviceid_node * -FF_LAYOUT_DEVID_NODE(struct pnfs_layout_segment *lseg, u32 idx) -{ - if (idx >= FF_LAYOUT_LSEG(lseg)->mirror_array_cnt || - FF_LAYOUT_LSEG(lseg)->mirror_array[idx] == NULL || - FF_LAYOUT_LSEG(lseg)->mirror_array[idx]->mirror_ds == NULL) - return NULL; - return &FF_LAYOUT_LSEG(lseg)->mirror_array[idx]->mirror_ds->id_node; -} - static inline struct nfs4_ff_layout_ds * FF_LAYOUT_MIRROR_DS(struct nfs4_deviceid_node *node) { @@ -149,9 +151,25 @@ FF_LAYOUT_MIRROR_DS(struct nfs4_deviceid_node *node) static inline struct nfs4_ff_layout_mirror * FF_LAYOUT_COMP(struct pnfs_layout_segment *lseg, u32 idx) { - if (idx >= FF_LAYOUT_LSEG(lseg)->mirror_array_cnt) - return NULL; - return FF_LAYOUT_LSEG(lseg)->mirror_array[idx]; + struct nfs4_ff_layout_segment *fls = FF_LAYOUT_LSEG(lseg); + + if (idx < fls->mirror_array_cnt) + return fls->mirror_array[idx]; + return NULL; +} + +static inline struct nfs4_deviceid_node * +FF_LAYOUT_DEVID_NODE(struct pnfs_layout_segment *lseg, u32 idx, u32 dss_id) +{ + struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, idx); + + if (mirror != NULL) { + struct nfs4_ff_layout_ds *mirror_ds = mirror->dss[dss_id].mirror_ds; + + if (!IS_ERR_OR_NULL(mirror_ds)) + return &mirror_ds->id_node; + } + return NULL; } static inline u32 @@ -172,28 +190,23 @@ ff_layout_no_read_on_rw(struct pnfs_layout_segment *lseg) return FF_LAYOUT_LSEG(lseg)->flags & FF_FLAGS_NO_READ_IO; } -static inline bool -ff_layout_test_devid_unavailable(struct nfs4_deviceid_node *node) +static inline int +nfs4_ff_layout_ds_version(const struct nfs4_ff_layout_mirror *mirror, u32 dss_id) { - /* - * Flexfiles should never mark a DS unavailable, but if it does - * print a (ratelimited) warning as this can affect performance. - */ - if (nfs4_test_deviceid_unavailable(node)) { - u32 *p = (u32 *)node->deviceid.data; - - pr_warn_ratelimited("NFS: flexfiles layout referencing an " - "unavailable device [%x%x%x%x]\n", - p[0], p[1], p[2], p[3]); - return true; - } - return false; + return mirror->dss[dss_id].mirror_ds->ds_versions[0].version; } -static inline int -nfs4_ff_layout_ds_version(struct pnfs_layout_segment *lseg, u32 ds_idx) +static inline u32 +nfs4_ff_layout_calc_dss_id(const u64 stripe_unit, const u32 dss_count, const loff_t offset) { - return FF_LAYOUT_COMP(lseg, ds_idx)->mirror_ds->ds_versions[0].version; + u64 tmp = offset; + + if (dss_count == 1 || stripe_unit == 0) + return 0; + + do_div(tmp, stripe_unit); + + return do_div(tmp, dss_count); } struct nfs4_ff_layout_ds * @@ -202,9 +215,10 @@ nfs4_ff_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, void nfs4_ff_layout_put_deviceid(struct nfs4_ff_layout_ds *mirror_ds); void nfs4_ff_layout_free_deviceid(struct nfs4_ff_layout_ds *mirror_ds); int ff_layout_track_ds_error(struct nfs4_flexfile_layout *flo, - struct nfs4_ff_layout_mirror *mirror, u64 offset, - u64 length, int status, enum nfs_opnum4 opnum, - gfp_t gfp_flags); + struct nfs4_ff_layout_mirror *mirror, + u32 dss_id, u64 offset, u64 length, int status, + enum nfs_opnum4 opnum, gfp_t gfp_flags); +void ff_layout_send_layouterror(struct pnfs_layout_segment *lseg); int ff_layout_encode_ds_ioerr(struct xdr_stream *xdr, const struct list_head *head); void ff_layout_free_ds_ioerr(struct list_head *head); unsigned int ff_layout_fetch_ds_ioerr(struct pnfs_layout_hdr *lo, @@ -212,19 +226,27 @@ unsigned int ff_layout_fetch_ds_ioerr(struct pnfs_layout_hdr *lo, struct list_head *head, unsigned int maxnum); struct nfs_fh * -nfs4_ff_layout_select_ds_fh(struct pnfs_layout_segment *lseg, u32 mirror_idx); +nfs4_ff_layout_select_ds_fh(struct nfs4_ff_layout_mirror *mirror, u32 dss_id); +void +nfs4_ff_layout_select_ds_stateid(const struct nfs4_ff_layout_mirror *mirror, + u32 dss_id, + nfs4_stateid *stateid); struct nfs4_pnfs_ds * -nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx, +nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, + struct nfs4_ff_layout_mirror *mirror, + u32 dss_id, bool fail_return); struct rpc_clnt * -nfs4_ff_find_or_create_ds_client(struct pnfs_layout_segment *lseg, - u32 ds_idx, +nfs4_ff_find_or_create_ds_client(struct nfs4_ff_layout_mirror *mirror, struct nfs_client *ds_clp, - struct inode *inode); -struct rpc_cred *ff_layout_get_ds_cred(struct pnfs_layout_segment *lseg, - u32 ds_idx, struct rpc_cred *mdscred); + struct inode *inode, + u32 dss_id); +const struct cred *ff_layout_get_ds_cred(struct nfs4_ff_layout_mirror *mirror, + const struct pnfs_layout_range *range, + const struct cred *mdscred, + u32 dss_id); bool ff_layout_avoid_mds_available_ds(struct pnfs_layout_segment *lseg); bool ff_layout_avoid_read_on_rw(struct pnfs_layout_segment *lseg); diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c index 6df7a0cf5660..c55ea8fa3bfa 100644 --- a/fs/nfs/flexfilelayout/flexfilelayoutdev.c +++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Device operations for the pnfs nfs4 file layout driver. * @@ -17,7 +18,7 @@ #define NFSDBG_FACILITY NFSDBG_PNFS_LD -static unsigned int dataserver_timeo = NFS_DEF_TCP_RETRANS; +static unsigned int dataserver_timeo = NFS_DEF_TCP_TIMEO; static unsigned int dataserver_retrans; static bool ff_layout_has_available_ds(struct pnfs_layout_segment *lseg); @@ -32,6 +33,7 @@ void nfs4_ff_layout_free_deviceid(struct nfs4_ff_layout_ds *mirror_ds) { nfs4_print_deviceid(&mirror_ds->id_node.deviceid); nfs4_pnfs_ds_put(mirror_ds->ds); + kfree(mirror_ds->ds_versions); kfree_rcu(mirror_ds, id_node.rcu); } @@ -42,18 +44,19 @@ nfs4_ff_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, { struct xdr_stream stream; struct xdr_buf buf; - struct page *scratch; + struct folio *scratch; struct list_head dsaddrs; struct nfs4_pnfs_ds_addr *da; struct nfs4_ff_layout_ds *new_ds = NULL; struct nfs4_ff_ds_version *ds_versions = NULL; + struct net *net = server->nfs_client->cl_net; u32 mp_count; u32 version_count; __be32 *p; int i, ret = -ENOMEM; /* set up xdr stream */ - scratch = alloc_page(gfp_flags); + scratch = folio_alloc(gfp_flags, 0); if (!scratch) goto out_err; @@ -67,7 +70,7 @@ nfs4_ff_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, INIT_LIST_HEAD(&dsaddrs); xdr_init_decode_pages(&stream, &buf, pdev->pages, pdev->pglen); - xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE); + xdr_set_scratch_folio(&stream, scratch); /* multipath count */ p = xdr_inline_decode(&stream, 4); @@ -78,8 +81,7 @@ nfs4_ff_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, for (i = 0; i < mp_count; i++) { /* multipath ds */ - da = nfs4_decode_mp_ds_addr(server->nfs_client->cl_net, - &stream, gfp_flags); + da = nfs4_decode_mp_ds_addr(net, &stream, gfp_flags); if (da) list_add_tail(&da->da_node, &dsaddrs); } @@ -97,7 +99,8 @@ nfs4_ff_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, version_count = be32_to_cpup(p); dprintk("%s: version count %d\n", __func__, version_count); - ds_versions = kzalloc(version_count * sizeof(struct nfs4_ff_ds_version), + ds_versions = kcalloc(version_count, + sizeof(struct nfs4_ff_ds_version), gfp_flags); if (!ds_versions) goto out_scratch; @@ -110,8 +113,10 @@ nfs4_ff_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, goto out_err_drain_dsaddrs; ds_versions[i].version = be32_to_cpup(p++); ds_versions[i].minor_version = be32_to_cpup(p++); - ds_versions[i].rsize = nfs_block_size(be32_to_cpup(p++), NULL); - ds_versions[i].wsize = nfs_block_size(be32_to_cpup(p++), NULL); + ds_versions[i].rsize = nfs_io_size(be32_to_cpup(p++), + server->nfs_client->cl_proto); + ds_versions[i].wsize = nfs_io_size(be32_to_cpup(p++), + server->nfs_client->cl_proto); ds_versions[i].tightly_coupled = be32_to_cpup(p); if (ds_versions[i].rsize > NFS_MAX_FILE_IO_SIZE) @@ -144,7 +149,7 @@ nfs4_ff_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, new_ds->ds_versions = ds_versions; new_ds->ds_versions_cnt = version_count; - new_ds->ds = nfs4_pnfs_ds_add(&dsaddrs, gfp_flags); + new_ds->ds = nfs4_pnfs_ds_add(net, &dsaddrs, gfp_flags); if (!new_ds->ds) goto out_err_drain_dsaddrs; @@ -158,7 +163,7 @@ nfs4_ff_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, kfree(da); } - __free_page(scratch); + folio_put(scratch); return new_ds; out_err_drain_dsaddrs: @@ -172,7 +177,7 @@ out_err_drain_dsaddrs: kfree(ds_versions); out_scratch: - __free_page(scratch); + folio_put(scratch); out_err: kfree(new_ds); @@ -180,56 +185,6 @@ out_err: return NULL; } -static void ff_layout_mark_devid_invalid(struct pnfs_layout_segment *lseg, - struct nfs4_deviceid_node *devid) -{ - nfs4_delete_deviceid(devid->ld, devid->nfs_client, &devid->deviceid); - if (!ff_layout_has_available_ds(lseg)) - pnfs_error_mark_layout_for_return(lseg->pls_layout->plh_inode, - lseg); -} - -static bool ff_layout_mirror_valid(struct pnfs_layout_segment *lseg, - struct nfs4_ff_layout_mirror *mirror, - bool create) -{ - if (mirror == NULL || IS_ERR(mirror->mirror_ds)) - goto outerr; - if (mirror->mirror_ds == NULL) { - if (create) { - struct nfs4_deviceid_node *node; - struct pnfs_layout_hdr *lh = lseg->pls_layout; - struct nfs4_ff_layout_ds *mirror_ds = ERR_PTR(-ENODEV); - - node = nfs4_find_get_deviceid(NFS_SERVER(lh->plh_inode), - &mirror->devid, lh->plh_lc_cred, - GFP_KERNEL); - if (node) - mirror_ds = FF_LAYOUT_MIRROR_DS(node); - - /* check for race with another call to this function */ - if (cmpxchg(&mirror->mirror_ds, NULL, mirror_ds) && - mirror_ds != ERR_PTR(-ENODEV)) - nfs4_put_deviceid_node(node); - } else - goto outerr; - } - - if (IS_ERR(mirror->mirror_ds)) - goto outerr; - - if (mirror->mirror_ds->ds == NULL) { - struct nfs4_deviceid_node *devid; - devid = &mirror->mirror_ds->id_node; - ff_layout_mark_devid_invalid(lseg, devid); - return false; - } - return true; -outerr: - pnfs_error_mark_layout_for_return(lseg->pls_layout->plh_inode, lseg); - return false; -} - static void extend_ds_error(struct nfs4_ff_layout_ds_err *err, u64 offset, u64 length) { @@ -295,16 +250,16 @@ ff_layout_add_ds_error_locked(struct nfs4_flexfile_layout *flo, } int ff_layout_track_ds_error(struct nfs4_flexfile_layout *flo, - struct nfs4_ff_layout_mirror *mirror, u64 offset, - u64 length, int status, enum nfs_opnum4 opnum, - gfp_t gfp_flags) + struct nfs4_ff_layout_mirror *mirror, + u32 dss_id, u64 offset, u64 length, int status, + enum nfs_opnum4 opnum, gfp_t gfp_flags) { struct nfs4_ff_layout_ds_err *dserr; if (status == 0) return 0; - if (mirror->mirror_ds == NULL) + if (IS_ERR_OR_NULL(mirror->dss[dss_id].mirror_ds)) return -EINVAL; dserr = kmalloc(sizeof(*dserr), gfp_flags); @@ -316,26 +271,25 @@ int ff_layout_track_ds_error(struct nfs4_flexfile_layout *flo, dserr->length = length; dserr->status = status; dserr->opnum = opnum; - nfs4_stateid_copy(&dserr->stateid, &mirror->stateid); - memcpy(&dserr->deviceid, &mirror->mirror_ds->id_node.deviceid, + nfs4_stateid_copy(&dserr->stateid, &mirror->dss[dss_id].stateid); + memcpy(&dserr->deviceid, &mirror->dss[dss_id].mirror_ds->id_node.deviceid, NFS4_DEVICEID4_SIZE); spin_lock(&flo->generic_hdr.plh_inode->i_lock); ff_layout_add_ds_error_locked(flo, dserr); spin_unlock(&flo->generic_hdr.plh_inode->i_lock); - return 0; } -static struct rpc_cred * -ff_layout_get_mirror_cred(struct nfs4_ff_layout_mirror *mirror, u32 iomode) +static const struct cred * +ff_layout_get_mirror_cred(struct nfs4_ff_layout_mirror *mirror, u32 iomode, u32 dss_id) { - struct rpc_cred *cred, __rcu **pcred; + const struct cred *cred, __rcu **pcred; if (iomode == IOMODE_READ) - pcred = &mirror->ro_cred; + pcred = &mirror->dss[dss_id].ro_cred; else - pcred = &mirror->rw_cred; + pcred = &mirror->dss[dss_id].rw_cred; rcu_read_lock(); do { @@ -343,34 +297,64 @@ ff_layout_get_mirror_cred(struct nfs4_ff_layout_mirror *mirror, u32 iomode) if (!cred) break; - cred = get_rpccred_rcu(cred); + cred = get_cred_rcu(cred); } while(!cred); rcu_read_unlock(); return cred; } struct nfs_fh * -nfs4_ff_layout_select_ds_fh(struct pnfs_layout_segment *lseg, u32 mirror_idx) +nfs4_ff_layout_select_ds_fh(struct nfs4_ff_layout_mirror *mirror, u32 dss_id) +{ + /* FIXME: For now assume there is only 1 version available for the DS */ + return &mirror->dss[dss_id].fh_versions[0]; +} + +void +nfs4_ff_layout_select_ds_stateid(const struct nfs4_ff_layout_mirror *mirror, + u32 dss_id, + nfs4_stateid *stateid) { - struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, mirror_idx); - struct nfs_fh *fh = NULL; + if (nfs4_ff_layout_ds_version(mirror, dss_id) == 4) + nfs4_stateid_copy(stateid, &mirror->dss[dss_id].stateid); +} - if (!ff_layout_mirror_valid(lseg, mirror, false)) { - pr_err_ratelimited("NFS: %s: No data server for mirror offset index %d\n", - __func__, mirror_idx); - goto out; +static bool +ff_layout_init_mirror_ds(struct pnfs_layout_hdr *lo, + struct nfs4_ff_layout_mirror *mirror, + u32 dss_id) +{ + if (mirror == NULL) + goto outerr; + if (mirror->dss[dss_id].mirror_ds == NULL) { + struct nfs4_deviceid_node *node; + struct nfs4_ff_layout_ds *mirror_ds = ERR_PTR(-ENODEV); + + node = nfs4_find_get_deviceid(NFS_SERVER(lo->plh_inode), + &mirror->dss[dss_id].devid, lo->plh_lc_cred, + GFP_KERNEL); + if (node) + mirror_ds = FF_LAYOUT_MIRROR_DS(node); + + /* check for race with another call to this function */ + if (cmpxchg(&mirror->dss[dss_id].mirror_ds, NULL, mirror_ds) && + mirror_ds != ERR_PTR(-ENODEV)) + nfs4_put_deviceid_node(node); } - /* FIXME: For now assume there is only 1 version available for the DS */ - fh = &mirror->fh_versions[0]; -out: - return fh; + if (IS_ERR(mirror->dss[dss_id].mirror_ds)) + goto outerr; + + return true; +outerr: + return false; } /** * nfs4_ff_layout_prepare_ds - prepare a DS connection for an RPC call * @lseg: the layout segment we're operating on - * @ds_idx: index of the DS to use + * @mirror: layout mirror describing the DS to use + * @dss_id: DS stripe id to select stripe to use * @fail_return: return layout on connect failure? * * Try to prepare a DS connection to accept an RPC call. This involves @@ -385,92 +369,98 @@ out: * Returns a pointer to a connected DS object on success or NULL on failure. */ struct nfs4_pnfs_ds * -nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx, +nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, + struct nfs4_ff_layout_mirror *mirror, + u32 dss_id, bool fail_return) { - struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, ds_idx); - struct nfs4_pnfs_ds *ds = NULL; - struct nfs4_deviceid_node *devid; + struct nfs4_pnfs_ds *ds; struct inode *ino = lseg->pls_layout->plh_inode; struct nfs_server *s = NFS_SERVER(ino); unsigned int max_payload; - int status; - - if (!ff_layout_mirror_valid(lseg, mirror, true)) { - pr_err_ratelimited("NFS: %s: No data server for offset index %d\n", - __func__, ds_idx); - goto out; - } + int status = -EAGAIN; - devid = &mirror->mirror_ds->id_node; - if (ff_layout_test_devid_unavailable(devid)) - goto out_fail; + if (!ff_layout_init_mirror_ds(lseg->pls_layout, mirror, dss_id)) + goto noconnect; - ds = mirror->mirror_ds->ds; + ds = mirror->dss[dss_id].mirror_ds->ds; + if (READ_ONCE(ds->ds_clp)) + goto out; /* matching smp_wmb() in _nfs4_pnfs_v3/4_ds_connect */ smp_rmb(); - if (ds->ds_clp) - goto out; /* FIXME: For now we assume the server sent only one version of NFS * to use for the DS. */ - status = nfs4_pnfs_ds_connect(s, ds, devid, dataserver_timeo, - dataserver_retrans, - mirror->mirror_ds->ds_versions[0].version, - mirror->mirror_ds->ds_versions[0].minor_version); + status = nfs4_pnfs_ds_connect(s, ds, &mirror->dss[dss_id].mirror_ds->id_node, + dataserver_timeo, dataserver_retrans, + mirror->dss[dss_id].mirror_ds->ds_versions[0].version, + mirror->dss[dss_id].mirror_ds->ds_versions[0].minor_version); /* connect success, check rsize/wsize limit */ if (!status) { + /* + * ds_clp is put in destroy_ds(). + * keep ds_clp even if DS is local, so that if local IO cannot + * proceed somehow, we can fall back to NFS whenever we want. + */ + nfs_local_probe_async(ds->ds_clp); max_payload = nfs_block_size(rpc_max_payload(ds->ds_clp->cl_rpcclient), NULL); - if (mirror->mirror_ds->ds_versions[0].rsize > max_payload) - mirror->mirror_ds->ds_versions[0].rsize = max_payload; - if (mirror->mirror_ds->ds_versions[0].wsize > max_payload) - mirror->mirror_ds->ds_versions[0].wsize = max_payload; + if (mirror->dss[dss_id].mirror_ds->ds_versions[0].rsize > max_payload) + mirror->dss[dss_id].mirror_ds->ds_versions[0].rsize = max_payload; + if (mirror->dss[dss_id].mirror_ds->ds_versions[0].wsize > max_payload) + mirror->dss[dss_id].mirror_ds->ds_versions[0].wsize = max_payload; goto out; } -out_fail: +noconnect: ff_layout_track_ds_error(FF_LAYOUT_FROM_HDR(lseg->pls_layout), - mirror, lseg->pls_range.offset, + mirror, dss_id, lseg->pls_range.offset, lseg->pls_range.length, NFS4ERR_NXIO, OP_ILLEGAL, GFP_NOIO); + ff_layout_send_layouterror(lseg); if (fail_return || !ff_layout_has_available_ds(lseg)) pnfs_error_mark_layout_for_return(ino, lseg); - ds = NULL; + ds = ERR_PTR(status); out: return ds; } -struct rpc_cred * -ff_layout_get_ds_cred(struct pnfs_layout_segment *lseg, u32 ds_idx, - struct rpc_cred *mdscred) +const struct cred * +ff_layout_get_ds_cred(struct nfs4_ff_layout_mirror *mirror, + const struct pnfs_layout_range *range, + const struct cred *mdscred, + u32 dss_id) { - struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, ds_idx); - struct rpc_cred *cred; + const struct cred *cred; - if (mirror) { - cred = ff_layout_get_mirror_cred(mirror, lseg->pls_range.iomode); + if (mirror && !mirror->dss[dss_id].mirror_ds->ds_versions[0].tightly_coupled) { + cred = ff_layout_get_mirror_cred(mirror, range->iomode, dss_id); if (!cred) - cred = get_rpccred(mdscred); + cred = get_cred(mdscred); } else { - cred = get_rpccred(mdscred); + cred = get_cred(mdscred); } return cred; } /** -* Find or create a DS rpc client with th MDS server rpc client auth flavor -* in the nfs_client cl_ds_clients list. -*/ + * nfs4_ff_find_or_create_ds_client - Find or create a DS rpc client + * @mirror: pointer to the mirror + * @ds_clp: nfs_client for the DS + * @inode: pointer to inode + * @dss_id: DS stripe id + * + * Find or create a DS rpc client with th MDS server rpc client auth flavor + * in the nfs_client cl_ds_clients list. + */ struct rpc_clnt * -nfs4_ff_find_or_create_ds_client(struct pnfs_layout_segment *lseg, u32 ds_idx, - struct nfs_client *ds_clp, struct inode *inode) +nfs4_ff_find_or_create_ds_client(struct nfs4_ff_layout_mirror *mirror, + struct nfs_client *ds_clp, struct inode *inode, + u32 dss_id) { - struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, ds_idx); - - switch (mirror->mirror_ds->ds_versions[0].version) { + switch (mirror->dss[dss_id].mirror_ds->ds_versions[0].version) { case 3: /* For NFSv3 DS, flavor is set when creating DS connections */ return ds_clp->cl_rpcclient; @@ -576,17 +566,19 @@ static bool ff_read_layout_has_available_ds(struct pnfs_layout_segment *lseg) { struct nfs4_ff_layout_mirror *mirror; struct nfs4_deviceid_node *devid; - u32 idx; + u32 idx, dss_id; for (idx = 0; idx < FF_LAYOUT_MIRROR_COUNT(lseg); idx++) { mirror = FF_LAYOUT_COMP(lseg, idx); - if (mirror) { - if (!mirror->mirror_ds) + if (!mirror) + continue; + for (dss_id = 0; dss_id < mirror->dss_count; dss_id++) { + if (!mirror->dss[dss_id].mirror_ds) return true; - if (IS_ERR(mirror->mirror_ds)) + if (IS_ERR(mirror->dss[dss_id].mirror_ds)) continue; - devid = &mirror->mirror_ds->id_node; - if (!ff_layout_test_devid_unavailable(devid)) + devid = &mirror->dss[dss_id].mirror_ds->id_node; + if (!nfs4_test_deviceid_unavailable(devid)) return true; } } @@ -598,17 +590,21 @@ static bool ff_rw_layout_has_available_ds(struct pnfs_layout_segment *lseg) { struct nfs4_ff_layout_mirror *mirror; struct nfs4_deviceid_node *devid; - u32 idx; + u32 idx, dss_id; for (idx = 0; idx < FF_LAYOUT_MIRROR_COUNT(lseg); idx++) { mirror = FF_LAYOUT_COMP(lseg, idx); - if (!mirror || IS_ERR(mirror->mirror_ds)) - return false; - if (!mirror->mirror_ds) - continue; - devid = &mirror->mirror_ds->id_node; - if (ff_layout_test_devid_unavailable(devid)) + if (!mirror) return false; + for (dss_id = 0; dss_id < mirror->dss_count; dss_id++) { + if (IS_ERR(mirror->dss[dss_id].mirror_ds)) + return false; + if (!mirror->dss[dss_id].mirror_ds) + continue; + devid = &mirror->dss[dss_id].mirror_ds->id_node; + if (nfs4_test_deviceid_unavailable(devid)) + return false; + } } return FF_LAYOUT_MIRROR_COUNT(lseg) != 0; diff --git a/fs/nfs/fs_context.c b/fs/nfs/fs_context.c new file mode 100644 index 000000000000..b4679b7161b0 --- /dev/null +++ b/fs/nfs/fs_context.c @@ -0,0 +1,1788 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * linux/fs/nfs/fs_context.c + * + * Copyright (C) 1992 Rick Sladkey + * Conversion to new mount api Copyright (C) David Howells + * + * NFS mount handling. + * + * Split from fs/nfs/super.c by David Howells <dhowells@redhat.com> + */ + +#include <linux/compat.h> +#include <linux/module.h> +#include <linux/fs.h> +#include <linux/fs_context.h> +#include <linux/fs_parser.h> +#include <linux/nfs_fs.h> +#include <linux/nfs_mount.h> +#include <linux/nfs4_mount.h> + +#include <net/handshake.h> + +#include "nfs.h" +#include "internal.h" + +#include "nfstrace.h" + +#define NFSDBG_FACILITY NFSDBG_MOUNT + +#if IS_ENABLED(CONFIG_NFS_V3) +#define NFS_DEFAULT_VERSION 3 +#else +#define NFS_DEFAULT_VERSION 2 +#endif + +#define NFS_MAX_CONNECTIONS 16 + +enum nfs_param { + Opt_ac, + Opt_acdirmax, + Opt_acdirmin, + Opt_acl, + Opt_acregmax, + Opt_acregmin, + Opt_actimeo, + Opt_addr, + Opt_bg, + Opt_bsize, + Opt_clientaddr, + Opt_cto, + Opt_alignwrite, + Opt_fatal_neterrors, + Opt_fg, + Opt_fscache, + Opt_fscache_flag, + Opt_hard, + Opt_intr, + Opt_local_lock, + Opt_lock, + Opt_lookupcache, + Opt_migration, + Opt_minorversion, + Opt_mountaddr, + Opt_mounthost, + Opt_mountport, + Opt_mountproto, + Opt_mountvers, + Opt_namelen, + Opt_nconnect, + Opt_max_connect, + Opt_port, + Opt_posix, + Opt_proto, + Opt_rdirplus, + Opt_rdirplus_none, + Opt_rdirplus_force, + Opt_rdma, + Opt_resvport, + Opt_retrans, + Opt_retry, + Opt_rsize, + Opt_sec, + Opt_sharecache, + Opt_sloppy, + Opt_soft, + Opt_softerr, + Opt_softreval, + Opt_source, + Opt_tcp, + Opt_timeo, + Opt_trunkdiscovery, + Opt_udp, + Opt_v, + Opt_vers, + Opt_wsize, + Opt_write, + Opt_xprtsec, + Opt_cert_serial, + Opt_privkey_serial, +}; + +enum { + Opt_fatal_neterrors_default, + Opt_fatal_neterrors_enetunreach, + Opt_fatal_neterrors_none, +}; + +static const struct constant_table nfs_param_enums_fatal_neterrors[] = { + { "default", Opt_fatal_neterrors_default }, + { "ENETDOWN:ENETUNREACH", Opt_fatal_neterrors_enetunreach }, + { "ENETUNREACH:ENETDOWN", Opt_fatal_neterrors_enetunreach }, + { "none", Opt_fatal_neterrors_none }, + {} +}; + +enum { + Opt_local_lock_all, + Opt_local_lock_flock, + Opt_local_lock_none, + Opt_local_lock_posix, +}; + +static const struct constant_table nfs_param_enums_local_lock[] = { + { "all", Opt_local_lock_all }, + { "flock", Opt_local_lock_flock }, + { "posix", Opt_local_lock_posix }, + { "none", Opt_local_lock_none }, + {} +}; + +enum { + Opt_lookupcache_all, + Opt_lookupcache_none, + Opt_lookupcache_positive, +}; + +static const struct constant_table nfs_param_enums_lookupcache[] = { + { "all", Opt_lookupcache_all }, + { "none", Opt_lookupcache_none }, + { "pos", Opt_lookupcache_positive }, + { "positive", Opt_lookupcache_positive }, + {} +}; + +enum { + Opt_write_lazy, + Opt_write_eager, + Opt_write_wait, +}; + +static const struct constant_table nfs_param_enums_write[] = { + { "lazy", Opt_write_lazy }, + { "eager", Opt_write_eager }, + { "wait", Opt_write_wait }, + {} +}; + +static const struct fs_parameter_spec nfs_fs_parameters[] = { + fsparam_flag_no("ac", Opt_ac), + fsparam_u32 ("acdirmax", Opt_acdirmax), + fsparam_u32 ("acdirmin", Opt_acdirmin), + fsparam_flag_no("acl", Opt_acl), + fsparam_u32 ("acregmax", Opt_acregmax), + fsparam_u32 ("acregmin", Opt_acregmin), + fsparam_u32 ("actimeo", Opt_actimeo), + fsparam_string("addr", Opt_addr), + fsparam_flag ("bg", Opt_bg), + fsparam_u32 ("bsize", Opt_bsize), + fsparam_string("clientaddr", Opt_clientaddr), + fsparam_flag_no("cto", Opt_cto), + fsparam_flag_no("alignwrite", Opt_alignwrite), + fsparam_enum("fatal_neterrors", Opt_fatal_neterrors, + nfs_param_enums_fatal_neterrors), + fsparam_flag ("fg", Opt_fg), + fsparam_flag_no("fsc", Opt_fscache_flag), + fsparam_string("fsc", Opt_fscache), + fsparam_flag ("hard", Opt_hard), + __fsparam(NULL, "intr", Opt_intr, + fs_param_neg_with_no|fs_param_deprecated, NULL), + fsparam_enum ("local_lock", Opt_local_lock, nfs_param_enums_local_lock), + fsparam_flag_no("lock", Opt_lock), + fsparam_enum ("lookupcache", Opt_lookupcache, nfs_param_enums_lookupcache), + fsparam_flag_no("migration", Opt_migration), + fsparam_u32 ("minorversion", Opt_minorversion), + fsparam_string("mountaddr", Opt_mountaddr), + fsparam_string("mounthost", Opt_mounthost), + fsparam_u32 ("mountport", Opt_mountport), + fsparam_string("mountproto", Opt_mountproto), + fsparam_u32 ("mountvers", Opt_mountvers), + fsparam_u32 ("namlen", Opt_namelen), + fsparam_u32 ("nconnect", Opt_nconnect), + fsparam_u32 ("max_connect", Opt_max_connect), + fsparam_string("nfsvers", Opt_vers), + fsparam_u32 ("port", Opt_port), + fsparam_flag_no("posix", Opt_posix), + fsparam_string("proto", Opt_proto), + fsparam_flag_no("rdirplus", Opt_rdirplus), // rdirplus|nordirplus + fsparam_string("rdirplus", Opt_rdirplus), // rdirplus=... + fsparam_flag ("rdma", Opt_rdma), + fsparam_flag_no("resvport", Opt_resvport), + fsparam_u32 ("retrans", Opt_retrans), + fsparam_string("retry", Opt_retry), + fsparam_u32 ("rsize", Opt_rsize), + fsparam_string("sec", Opt_sec), + fsparam_flag_no("sharecache", Opt_sharecache), + fsparam_flag ("sloppy", Opt_sloppy), + fsparam_flag ("soft", Opt_soft), + fsparam_flag ("softerr", Opt_softerr), + fsparam_flag ("softreval", Opt_softreval), + fsparam_string("source", Opt_source), + fsparam_flag ("tcp", Opt_tcp), + fsparam_u32 ("timeo", Opt_timeo), + fsparam_flag_no("trunkdiscovery", Opt_trunkdiscovery), + fsparam_flag ("udp", Opt_udp), + fsparam_flag ("v2", Opt_v), + fsparam_flag ("v3", Opt_v), + fsparam_flag ("v4", Opt_v), + fsparam_flag ("v4.0", Opt_v), + fsparam_flag ("v4.1", Opt_v), + fsparam_flag ("v4.2", Opt_v), + fsparam_string("vers", Opt_vers), + fsparam_enum ("write", Opt_write, nfs_param_enums_write), + fsparam_u32 ("wsize", Opt_wsize), + fsparam_string("xprtsec", Opt_xprtsec), + fsparam_s32("cert_serial", Opt_cert_serial), + fsparam_s32("privkey_serial", Opt_privkey_serial), + {} +}; + +enum { + Opt_vers_2, + Opt_vers_3, + Opt_vers_4, + Opt_vers_4_0, + Opt_vers_4_1, + Opt_vers_4_2, +}; + +static const struct constant_table nfs_vers_tokens[] = { + { "2", Opt_vers_2 }, + { "3", Opt_vers_3 }, + { "4", Opt_vers_4 }, + { "4.0", Opt_vers_4_0 }, + { "4.1", Opt_vers_4_1 }, + { "4.2", Opt_vers_4_2 }, + {} +}; + +enum { + Opt_xprt_rdma, + Opt_xprt_rdma6, + Opt_xprt_tcp, + Opt_xprt_tcp6, + Opt_xprt_udp, + Opt_xprt_udp6, + nr__Opt_xprt +}; + +static const struct constant_table nfs_xprt_protocol_tokens[] = { + { "rdma", Opt_xprt_rdma }, + { "rdma6", Opt_xprt_rdma6 }, + { "tcp", Opt_xprt_tcp }, + { "tcp6", Opt_xprt_tcp6 }, + { "udp", Opt_xprt_udp }, + { "udp6", Opt_xprt_udp6 }, + {} +}; + +enum { + Opt_sec_krb5, + Opt_sec_krb5i, + Opt_sec_krb5p, + Opt_sec_lkey, + Opt_sec_lkeyi, + Opt_sec_lkeyp, + Opt_sec_none, + Opt_sec_spkm, + Opt_sec_spkmi, + Opt_sec_spkmp, + Opt_sec_sys, + nr__Opt_sec +}; + +static const struct constant_table nfs_secflavor_tokens[] = { + { "krb5", Opt_sec_krb5 }, + { "krb5i", Opt_sec_krb5i }, + { "krb5p", Opt_sec_krb5p }, + { "lkey", Opt_sec_lkey }, + { "lkeyi", Opt_sec_lkeyi }, + { "lkeyp", Opt_sec_lkeyp }, + { "none", Opt_sec_none }, + { "null", Opt_sec_none }, + { "spkm3", Opt_sec_spkm }, + { "spkm3i", Opt_sec_spkmi }, + { "spkm3p", Opt_sec_spkmp }, + { "sys", Opt_sec_sys }, + {} +}; + +enum { + Opt_xprtsec_none, + Opt_xprtsec_tls, + Opt_xprtsec_mtls, + nr__Opt_xprtsec +}; + +static const struct constant_table nfs_xprtsec_policies[] = { + { "none", Opt_xprtsec_none }, + { "tls", Opt_xprtsec_tls }, + { "mtls", Opt_xprtsec_mtls }, + {} +}; + +static const struct constant_table nfs_rdirplus_tokens[] = { + { "none", Opt_rdirplus_none }, + { "force", Opt_rdirplus_force }, + {} +}; + +/* + * Sanity-check a server address provided by the mount command. + * + * Address family must be initialized, and address must not be + * the ANY address for that family. + */ +static int nfs_verify_server_address(struct sockaddr_storage *addr) +{ + switch (addr->ss_family) { + case AF_INET: { + struct sockaddr_in *sa = (struct sockaddr_in *)addr; + return sa->sin_addr.s_addr != htonl(INADDR_ANY); + } + case AF_INET6: { + struct in6_addr *sa = &((struct sockaddr_in6 *)addr)->sin6_addr; + return !ipv6_addr_any(sa); + } + } + + return 0; +} + +#ifdef CONFIG_NFS_DISABLE_UDP_SUPPORT +static bool nfs_server_transport_udp_invalid(const struct nfs_fs_context *ctx) +{ + return true; +} +#else +static bool nfs_server_transport_udp_invalid(const struct nfs_fs_context *ctx) +{ + if (ctx->version == 4) + return true; + return false; +} +#endif + +/* + * Sanity check the NFS transport protocol. + */ +static int nfs_validate_transport_protocol(struct fs_context *fc, + struct nfs_fs_context *ctx) +{ + switch (ctx->nfs_server.protocol) { + case XPRT_TRANSPORT_UDP: + if (nfs_server_transport_udp_invalid(ctx)) + goto out_invalid_transport_udp; + break; + case XPRT_TRANSPORT_TCP: + case XPRT_TRANSPORT_RDMA: + break; + default: + ctx->nfs_server.protocol = XPRT_TRANSPORT_TCP; + } + + if (ctx->xprtsec.policy != RPC_XPRTSEC_NONE) + switch (ctx->nfs_server.protocol) { + case XPRT_TRANSPORT_TCP: + ctx->nfs_server.protocol = XPRT_TRANSPORT_TCP_TLS; + break; + default: + goto out_invalid_xprtsec_policy; + } + + return 0; +out_invalid_transport_udp: + return nfs_invalf(fc, "NFS: Unsupported transport protocol udp"); +out_invalid_xprtsec_policy: + return nfs_invalf(fc, "NFS: Transport does not support xprtsec"); +} + +/* + * For text based NFSv2/v3 mounts, the mount protocol transport default + * settings should depend upon the specified NFS transport. + */ +static void nfs_set_mount_transport_protocol(struct nfs_fs_context *ctx) +{ + if (ctx->mount_server.protocol == XPRT_TRANSPORT_UDP || + ctx->mount_server.protocol == XPRT_TRANSPORT_TCP) + return; + switch (ctx->nfs_server.protocol) { + case XPRT_TRANSPORT_UDP: + ctx->mount_server.protocol = XPRT_TRANSPORT_UDP; + break; + case XPRT_TRANSPORT_TCP: + case XPRT_TRANSPORT_RDMA: + ctx->mount_server.protocol = XPRT_TRANSPORT_TCP; + } +} + +/* + * Add 'flavor' to 'auth_info' if not already present. + * Returns true if 'flavor' ends up in the list, false otherwise + */ +static int nfs_auth_info_add(struct fs_context *fc, + struct nfs_auth_info *auth_info, + rpc_authflavor_t flavor) +{ + unsigned int i; + unsigned int max_flavor_len = ARRAY_SIZE(auth_info->flavors); + + /* make sure this flavor isn't already in the list */ + for (i = 0; i < auth_info->flavor_len; i++) { + if (flavor == auth_info->flavors[i]) + return 0; + } + + if (auth_info->flavor_len + 1 >= max_flavor_len) + return nfs_invalf(fc, "NFS: too many sec= flavors"); + + auth_info->flavors[auth_info->flavor_len++] = flavor; + return 0; +} + +/* + * Parse the value of the 'sec=' option. + */ +static int nfs_parse_security_flavors(struct fs_context *fc, + struct fs_parameter *param) +{ + struct nfs_fs_context *ctx = nfs_fc2context(fc); + rpc_authflavor_t pseudoflavor; + char *string = param->string, *p; + int ret; + + trace_nfs_mount_assign(param->key, string); + + while ((p = strsep(&string, ":")) != NULL) { + if (!*p) + continue; + switch (lookup_constant(nfs_secflavor_tokens, p, -1)) { + case Opt_sec_none: + pseudoflavor = RPC_AUTH_NULL; + break; + case Opt_sec_sys: + pseudoflavor = RPC_AUTH_UNIX; + break; + case Opt_sec_krb5: + pseudoflavor = RPC_AUTH_GSS_KRB5; + break; + case Opt_sec_krb5i: + pseudoflavor = RPC_AUTH_GSS_KRB5I; + break; + case Opt_sec_krb5p: + pseudoflavor = RPC_AUTH_GSS_KRB5P; + break; + case Opt_sec_lkey: + pseudoflavor = RPC_AUTH_GSS_LKEY; + break; + case Opt_sec_lkeyi: + pseudoflavor = RPC_AUTH_GSS_LKEYI; + break; + case Opt_sec_lkeyp: + pseudoflavor = RPC_AUTH_GSS_LKEYP; + break; + case Opt_sec_spkm: + pseudoflavor = RPC_AUTH_GSS_SPKM; + break; + case Opt_sec_spkmi: + pseudoflavor = RPC_AUTH_GSS_SPKMI; + break; + case Opt_sec_spkmp: + pseudoflavor = RPC_AUTH_GSS_SPKMP; + break; + default: + return nfs_invalf(fc, "NFS: sec=%s option not recognized", p); + } + + ret = nfs_auth_info_add(fc, &ctx->auth_info, pseudoflavor); + if (ret < 0) + return ret; + } + + return 0; +} + +static int nfs_parse_xprtsec_policy(struct fs_context *fc, + struct fs_parameter *param) +{ + struct nfs_fs_context *ctx = nfs_fc2context(fc); + + trace_nfs_mount_assign(param->key, param->string); + + switch (lookup_constant(nfs_xprtsec_policies, param->string, -1)) { + case Opt_xprtsec_none: + ctx->xprtsec.policy = RPC_XPRTSEC_NONE; + break; + case Opt_xprtsec_tls: + ctx->xprtsec.policy = RPC_XPRTSEC_TLS_ANON; + break; + case Opt_xprtsec_mtls: + ctx->xprtsec.policy = RPC_XPRTSEC_TLS_X509; + break; + default: + return nfs_invalf(fc, "NFS: Unrecognized transport security policy"); + } + return 0; +} + +static int nfs_parse_version_string(struct fs_context *fc, + const char *string) +{ + struct nfs_fs_context *ctx = nfs_fc2context(fc); + + ctx->flags &= ~NFS_MOUNT_VER3; + switch (lookup_constant(nfs_vers_tokens, string, -1)) { + case Opt_vers_2: + ctx->version = 2; + break; + case Opt_vers_3: + ctx->flags |= NFS_MOUNT_VER3; + ctx->version = 3; + break; + case Opt_vers_4: + /* Backward compatibility option. In future, + * the mount program should always supply + * a NFSv4 minor version number. + */ + ctx->version = 4; + break; + case Opt_vers_4_0: + ctx->version = 4; + ctx->minorversion = 0; + break; + case Opt_vers_4_1: + ctx->version = 4; + ctx->minorversion = 1; + break; + case Opt_vers_4_2: + ctx->version = 4; + ctx->minorversion = 2; + break; + default: + return nfs_invalf(fc, "NFS: Unsupported NFS version"); + } + return 0; +} + +#ifdef CONFIG_KEYS +static int nfs_tls_key_verify(key_serial_t key_id) +{ + struct key *key = key_lookup(key_id); + int error = 0; + + if (IS_ERR(key)) { + pr_err("key id %08x not found\n", key_id); + return PTR_ERR(key); + } + if (test_bit(KEY_FLAG_REVOKED, &key->flags) || + test_bit(KEY_FLAG_INVALIDATED, &key->flags)) { + pr_err("key id %08x revoked\n", key_id); + error = -EKEYREVOKED; + } + + key_put(key); + return error; +} +#else +static inline int nfs_tls_key_verify(key_serial_t key_id) +{ + return -ENOENT; +} +#endif /* CONFIG_KEYS */ + +/* + * Parse a single mount parameter. + */ +static int nfs_fs_context_parse_param(struct fs_context *fc, + struct fs_parameter *param) +{ + struct fs_parse_result result; + struct nfs_fs_context *ctx = nfs_fc2context(fc); + unsigned short protofamily, mountfamily; + unsigned int len; + int ret, opt; + + trace_nfs_mount_option(param); + + opt = fs_parse(fc, nfs_fs_parameters, param, &result); + if (opt < 0) + return (opt == -ENOPARAM && ctx->sloppy) ? 1 : opt; + + if (fc->security) + ctx->has_sec_mnt_opts = 1; + + switch (opt) { + case Opt_source: + if (fc->source) + return nfs_invalf(fc, "NFS: Multiple sources not supported"); + fc->source = param->string; + param->string = NULL; + break; + + /* + * boolean options: foo/nofoo + */ + case Opt_soft: + ctx->flags |= NFS_MOUNT_SOFT; + ctx->flags &= ~NFS_MOUNT_SOFTERR; + break; + case Opt_softerr: + ctx->flags |= NFS_MOUNT_SOFTERR | NFS_MOUNT_SOFTREVAL; + ctx->flags &= ~NFS_MOUNT_SOFT; + break; + case Opt_hard: + ctx->flags &= ~(NFS_MOUNT_SOFT | + NFS_MOUNT_SOFTERR | + NFS_MOUNT_SOFTREVAL); + break; + case Opt_softreval: + if (result.negated) + ctx->flags &= ~NFS_MOUNT_SOFTREVAL; + else + ctx->flags |= NFS_MOUNT_SOFTREVAL; + break; + case Opt_posix: + if (result.negated) + ctx->flags &= ~NFS_MOUNT_POSIX; + else + ctx->flags |= NFS_MOUNT_POSIX; + break; + case Opt_cto: + if (result.negated) + ctx->flags |= NFS_MOUNT_NOCTO; + else + ctx->flags &= ~NFS_MOUNT_NOCTO; + break; + case Opt_trunkdiscovery: + if (result.negated) + ctx->flags &= ~NFS_MOUNT_TRUNK_DISCOVERY; + else + ctx->flags |= NFS_MOUNT_TRUNK_DISCOVERY; + break; + case Opt_alignwrite: + if (result.negated) + ctx->flags |= NFS_MOUNT_NO_ALIGNWRITE; + else + ctx->flags &= ~NFS_MOUNT_NO_ALIGNWRITE; + break; + case Opt_ac: + if (result.negated) + ctx->flags |= NFS_MOUNT_NOAC; + else + ctx->flags &= ~NFS_MOUNT_NOAC; + break; + case Opt_lock: + if (result.negated) { + ctx->lock_status = NFS_LOCK_NOLOCK; + ctx->flags |= NFS_MOUNT_NONLM; + ctx->flags |= (NFS_MOUNT_LOCAL_FLOCK | NFS_MOUNT_LOCAL_FCNTL); + } else { + ctx->lock_status = NFS_LOCK_LOCK; + ctx->flags &= ~NFS_MOUNT_NONLM; + ctx->flags &= ~(NFS_MOUNT_LOCAL_FLOCK | NFS_MOUNT_LOCAL_FCNTL); + } + break; + case Opt_udp: + ctx->flags &= ~NFS_MOUNT_TCP; + ctx->nfs_server.protocol = XPRT_TRANSPORT_UDP; + break; + case Opt_tcp: + case Opt_rdma: + ctx->flags |= NFS_MOUNT_TCP; /* for side protocols */ + ret = xprt_find_transport_ident(param->key); + if (ret < 0) + goto out_bad_transport; + ctx->nfs_server.protocol = ret; + break; + case Opt_acl: + if (result.negated) + ctx->flags |= NFS_MOUNT_NOACL; + else + ctx->flags &= ~NFS_MOUNT_NOACL; + break; + case Opt_rdirplus: + if (result.negated) { + ctx->flags &= ~NFS_MOUNT_FORCE_RDIRPLUS; + ctx->flags |= NFS_MOUNT_NORDIRPLUS; + } else if (!param->string) { + ctx->flags &= ~(NFS_MOUNT_NORDIRPLUS | NFS_MOUNT_FORCE_RDIRPLUS); + } else { + switch (lookup_constant(nfs_rdirplus_tokens, param->string, -1)) { + case Opt_rdirplus_none: + ctx->flags &= ~NFS_MOUNT_FORCE_RDIRPLUS; + ctx->flags |= NFS_MOUNT_NORDIRPLUS; + break; + case Opt_rdirplus_force: + ctx->flags &= ~NFS_MOUNT_NORDIRPLUS; + ctx->flags |= NFS_MOUNT_FORCE_RDIRPLUS; + break; + default: + goto out_invalid_value; + } + } + break; + case Opt_sharecache: + if (result.negated) + ctx->flags |= NFS_MOUNT_UNSHARED; + else + ctx->flags &= ~NFS_MOUNT_UNSHARED; + break; + case Opt_resvport: + if (result.negated) + ctx->flags |= NFS_MOUNT_NORESVPORT; + else + ctx->flags &= ~NFS_MOUNT_NORESVPORT; + break; + case Opt_fscache_flag: + if (result.negated) + ctx->options &= ~NFS_OPTION_FSCACHE; + else + ctx->options |= NFS_OPTION_FSCACHE; + kfree(ctx->fscache_uniq); + ctx->fscache_uniq = NULL; + break; + case Opt_fscache: + trace_nfs_mount_assign(param->key, param->string); + ctx->options |= NFS_OPTION_FSCACHE; + kfree(ctx->fscache_uniq); + ctx->fscache_uniq = param->string; + param->string = NULL; + break; + case Opt_migration: + if (result.negated) + ctx->options &= ~NFS_OPTION_MIGRATION; + else + ctx->options |= NFS_OPTION_MIGRATION; + break; + + /* + * options that take numeric values + */ + case Opt_port: + if (result.uint_32 > USHRT_MAX) + goto out_of_bounds; + ctx->nfs_server.port = result.uint_32; + break; + case Opt_rsize: + ctx->rsize = result.uint_32; + break; + case Opt_wsize: + ctx->wsize = result.uint_32; + break; + case Opt_bsize: + ctx->bsize = result.uint_32; + break; + case Opt_timeo: + if (result.uint_32 < 1 || result.uint_32 > INT_MAX) + goto out_of_bounds; + ctx->timeo = result.uint_32; + break; + case Opt_retrans: + if (result.uint_32 > INT_MAX) + goto out_of_bounds; + ctx->retrans = result.uint_32; + break; + case Opt_acregmin: + ctx->acregmin = result.uint_32; + break; + case Opt_acregmax: + ctx->acregmax = result.uint_32; + break; + case Opt_acdirmin: + ctx->acdirmin = result.uint_32; + break; + case Opt_acdirmax: + ctx->acdirmax = result.uint_32; + break; + case Opt_actimeo: + ctx->acregmin = result.uint_32; + ctx->acregmax = result.uint_32; + ctx->acdirmin = result.uint_32; + ctx->acdirmax = result.uint_32; + break; + case Opt_namelen: + ctx->namlen = result.uint_32; + break; + case Opt_mountport: + if (result.uint_32 > USHRT_MAX) + goto out_of_bounds; + ctx->mount_server.port = result.uint_32; + break; + case Opt_mountvers: + if (result.uint_32 < NFS_MNT_VERSION || + result.uint_32 > NFS_MNT3_VERSION) + goto out_of_bounds; + ctx->mount_server.version = result.uint_32; + break; + case Opt_minorversion: + if (result.uint_32 > NFS4_MAX_MINOR_VERSION) + goto out_of_bounds; + ctx->minorversion = result.uint_32; + break; + + /* + * options that take text values + */ + case Opt_v: + ret = nfs_parse_version_string(fc, param->key + 1); + if (ret < 0) + return ret; + break; + case Opt_vers: + if (!param->string) + goto out_invalid_value; + trace_nfs_mount_assign(param->key, param->string); + ret = nfs_parse_version_string(fc, param->string); + if (ret < 0) + return ret; + break; + case Opt_sec: + ret = nfs_parse_security_flavors(fc, param); + if (ret < 0) + return ret; + break; + case Opt_xprtsec: + ret = nfs_parse_xprtsec_policy(fc, param); + if (ret < 0) + return ret; + break; + case Opt_cert_serial: + ret = nfs_tls_key_verify(result.int_32); + if (ret < 0) + return ret; + ctx->xprtsec.cert_serial = result.int_32; + break; + case Opt_privkey_serial: + ret = nfs_tls_key_verify(result.int_32); + if (ret < 0) + return ret; + ctx->xprtsec.privkey_serial = result.int_32; + break; + + case Opt_proto: + if (!param->string) + goto out_invalid_value; + trace_nfs_mount_assign(param->key, param->string); + protofamily = AF_INET; + switch (lookup_constant(nfs_xprt_protocol_tokens, param->string, -1)) { + case Opt_xprt_udp6: + protofamily = AF_INET6; + fallthrough; + case Opt_xprt_udp: + ctx->flags &= ~NFS_MOUNT_TCP; + ctx->nfs_server.protocol = XPRT_TRANSPORT_UDP; + break; + case Opt_xprt_tcp6: + protofamily = AF_INET6; + fallthrough; + case Opt_xprt_tcp: + ctx->flags |= NFS_MOUNT_TCP; + ctx->nfs_server.protocol = XPRT_TRANSPORT_TCP; + break; + case Opt_xprt_rdma6: + protofamily = AF_INET6; + fallthrough; + case Opt_xprt_rdma: + /* vector side protocols to TCP */ + ctx->flags |= NFS_MOUNT_TCP; + ret = xprt_find_transport_ident(param->string); + if (ret < 0) + goto out_bad_transport; + ctx->nfs_server.protocol = ret; + break; + default: + goto out_bad_transport; + } + + ctx->protofamily = protofamily; + break; + + case Opt_mountproto: + if (!param->string) + goto out_invalid_value; + trace_nfs_mount_assign(param->key, param->string); + mountfamily = AF_INET; + switch (lookup_constant(nfs_xprt_protocol_tokens, param->string, -1)) { + case Opt_xprt_udp6: + mountfamily = AF_INET6; + fallthrough; + case Opt_xprt_udp: + ctx->mount_server.protocol = XPRT_TRANSPORT_UDP; + break; + case Opt_xprt_tcp6: + mountfamily = AF_INET6; + fallthrough; + case Opt_xprt_tcp: + ctx->mount_server.protocol = XPRT_TRANSPORT_TCP; + break; + case Opt_xprt_rdma: /* not used for side protocols */ + default: + goto out_bad_transport; + } + ctx->mountfamily = mountfamily; + break; + + case Opt_addr: + trace_nfs_mount_assign(param->key, param->string); + len = rpc_pton(fc->net_ns, param->string, param->size, + &ctx->nfs_server.address, + sizeof(ctx->nfs_server._address)); + if (len == 0) + goto out_invalid_address; + ctx->nfs_server.addrlen = len; + break; + case Opt_clientaddr: + trace_nfs_mount_assign(param->key, param->string); + kfree(ctx->client_address); + ctx->client_address = param->string; + param->string = NULL; + break; + case Opt_mounthost: + trace_nfs_mount_assign(param->key, param->string); + kfree(ctx->mount_server.hostname); + ctx->mount_server.hostname = param->string; + param->string = NULL; + break; + case Opt_mountaddr: + trace_nfs_mount_assign(param->key, param->string); + len = rpc_pton(fc->net_ns, param->string, param->size, + &ctx->mount_server.address, + sizeof(ctx->mount_server._address)); + if (len == 0) + goto out_invalid_address; + ctx->mount_server.addrlen = len; + break; + case Opt_nconnect: + trace_nfs_mount_assign(param->key, param->string); + if (result.uint_32 < 1 || result.uint_32 > NFS_MAX_CONNECTIONS) + goto out_of_bounds; + ctx->nfs_server.nconnect = result.uint_32; + break; + case Opt_max_connect: + trace_nfs_mount_assign(param->key, param->string); + if (result.uint_32 < 1 || result.uint_32 > NFS_MAX_TRANSPORTS) + goto out_of_bounds; + ctx->nfs_server.max_connect = result.uint_32; + break; + case Opt_fatal_neterrors: + trace_nfs_mount_assign(param->key, param->string); + switch (result.uint_32) { + case Opt_fatal_neterrors_default: + if (fc->net_ns != &init_net) + ctx->flags |= NFS_MOUNT_NETUNREACH_FATAL; + else + ctx->flags &= ~NFS_MOUNT_NETUNREACH_FATAL; + break; + case Opt_fatal_neterrors_enetunreach: + ctx->flags |= NFS_MOUNT_NETUNREACH_FATAL; + break; + case Opt_fatal_neterrors_none: + ctx->flags &= ~NFS_MOUNT_NETUNREACH_FATAL; + break; + default: + goto out_invalid_value; + } + break; + case Opt_lookupcache: + trace_nfs_mount_assign(param->key, param->string); + switch (result.uint_32) { + case Opt_lookupcache_all: + ctx->flags &= ~(NFS_MOUNT_LOOKUP_CACHE_NONEG|NFS_MOUNT_LOOKUP_CACHE_NONE); + break; + case Opt_lookupcache_positive: + ctx->flags &= ~NFS_MOUNT_LOOKUP_CACHE_NONE; + ctx->flags |= NFS_MOUNT_LOOKUP_CACHE_NONEG; + break; + case Opt_lookupcache_none: + ctx->flags |= NFS_MOUNT_LOOKUP_CACHE_NONEG|NFS_MOUNT_LOOKUP_CACHE_NONE; + break; + default: + goto out_invalid_value; + } + break; + case Opt_local_lock: + trace_nfs_mount_assign(param->key, param->string); + switch (result.uint_32) { + case Opt_local_lock_all: + ctx->flags |= (NFS_MOUNT_LOCAL_FLOCK | + NFS_MOUNT_LOCAL_FCNTL); + break; + case Opt_local_lock_flock: + ctx->flags |= NFS_MOUNT_LOCAL_FLOCK; + break; + case Opt_local_lock_posix: + ctx->flags |= NFS_MOUNT_LOCAL_FCNTL; + break; + case Opt_local_lock_none: + ctx->flags &= ~(NFS_MOUNT_LOCAL_FLOCK | + NFS_MOUNT_LOCAL_FCNTL); + break; + default: + goto out_invalid_value; + } + break; + case Opt_write: + trace_nfs_mount_assign(param->key, param->string); + switch (result.uint_32) { + case Opt_write_lazy: + ctx->flags &= + ~(NFS_MOUNT_WRITE_EAGER | NFS_MOUNT_WRITE_WAIT); + break; + case Opt_write_eager: + ctx->flags |= NFS_MOUNT_WRITE_EAGER; + ctx->flags &= ~NFS_MOUNT_WRITE_WAIT; + break; + case Opt_write_wait: + ctx->flags |= + NFS_MOUNT_WRITE_EAGER | NFS_MOUNT_WRITE_WAIT; + break; + default: + goto out_invalid_value; + } + break; + + /* + * Special options + */ + case Opt_sloppy: + ctx->sloppy = true; + break; + } + + return 0; + +out_invalid_value: + return nfs_invalf(fc, "NFS: Bad mount option value specified"); +out_invalid_address: + return nfs_invalf(fc, "NFS: Bad IP address specified"); +out_of_bounds: + return nfs_invalf(fc, "NFS: Value for '%s' out of range", param->key); +out_bad_transport: + return nfs_invalf(fc, "NFS: Unrecognized transport protocol"); +} + +/* + * Split fc->source into "hostname:export_path". + * + * The leftmost colon demarks the split between the server's hostname + * and the export path. If the hostname starts with a left square + * bracket, then it may contain colons. + * + * Note: caller frees hostname and export path, even on error. + */ +static int nfs_parse_source(struct fs_context *fc, + size_t maxnamlen, size_t maxpathlen) +{ + struct nfs_fs_context *ctx = nfs_fc2context(fc); + const char *dev_name = fc->source; + size_t len; + const char *end; + + if (unlikely(!dev_name || !*dev_name)) + return -EINVAL; + + /* Is the host name protected with square brakcets? */ + if (*dev_name == '[') { + end = strchr(++dev_name, ']'); + if (end == NULL || end[1] != ':') + goto out_bad_devname; + + len = end - dev_name; + end++; + } else { + const char *comma; + + end = strchr(dev_name, ':'); + if (end == NULL) + goto out_bad_devname; + len = end - dev_name; + + /* kill possible hostname list: not supported */ + comma = memchr(dev_name, ',', len); + if (comma) + len = comma - dev_name; + } + + if (len > maxnamlen) + goto out_hostname; + + kfree(ctx->nfs_server.hostname); + + /* N.B. caller will free nfs_server.hostname in all cases */ + ctx->nfs_server.hostname = kmemdup_nul(dev_name, len, GFP_KERNEL); + if (!ctx->nfs_server.hostname) + goto out_nomem; + len = strlen(++end); + if (len > maxpathlen) + goto out_path; + ctx->nfs_server.export_path = kmemdup_nul(end, len, GFP_KERNEL); + if (!ctx->nfs_server.export_path) + goto out_nomem; + + trace_nfs_mount_path(ctx->nfs_server.export_path); + return 0; + +out_bad_devname: + return nfs_invalf(fc, "NFS: device name not in host:path format"); +out_nomem: + nfs_errorf(fc, "NFS: not enough memory to parse device name"); + return -ENOMEM; +out_hostname: + nfs_errorf(fc, "NFS: server hostname too long"); + return -ENAMETOOLONG; +out_path: + nfs_errorf(fc, "NFS: export pathname too long"); + return -ENAMETOOLONG; +} + +static inline bool is_remount_fc(struct fs_context *fc) +{ + return fc->root != NULL; +} + +/* + * Parse monolithic NFS2/NFS3 mount data + * - fills in the mount root filehandle + * + * For option strings, user space handles the following behaviors: + * + * + DNS: mapping server host name to IP address ("addr=" option) + * + * + failure mode: how to behave if a mount request can't be handled + * immediately ("fg/bg" option) + * + * + retry: how often to retry a mount request ("retry=" option) + * + * + breaking back: trying proto=udp after proto=tcp, v2 after v3, + * mountproto=tcp after mountproto=udp, and so on + */ +static int nfs23_parse_monolithic(struct fs_context *fc, + struct nfs_mount_data *data) +{ + struct nfs_fs_context *ctx = nfs_fc2context(fc); + struct nfs_fh *mntfh = ctx->mntfh; + struct sockaddr_storage *sap = &ctx->nfs_server._address; + int extra_flags = NFS_MOUNT_LEGACY_INTERFACE; + int ret; + + if (data == NULL) + goto out_no_data; + + ctx->version = NFS_DEFAULT_VERSION; + switch (data->version) { + case 1: + data->namlen = 0; + fallthrough; + case 2: + data->bsize = 0; + fallthrough; + case 3: + if (data->flags & NFS_MOUNT_VER3) + goto out_no_v3; + data->root.size = NFS2_FHSIZE; + memcpy(data->root.data, data->old_root.data, NFS2_FHSIZE); + /* Turn off security negotiation */ + extra_flags |= NFS_MOUNT_SECFLAVOUR; + fallthrough; + case 4: + if (data->flags & NFS_MOUNT_SECFLAVOUR) + goto out_no_sec; + fallthrough; + case 5: + memset(data->context, 0, sizeof(data->context)); + fallthrough; + case 6: + if (data->flags & NFS_MOUNT_VER3) { + if (data->root.size > NFS3_FHSIZE || data->root.size == 0) + goto out_invalid_fh; + mntfh->size = data->root.size; + ctx->version = 3; + } else { + mntfh->size = NFS2_FHSIZE; + ctx->version = 2; + } + + + memcpy(mntfh->data, data->root.data, mntfh->size); + if (mntfh->size < sizeof(mntfh->data)) + memset(mntfh->data + mntfh->size, 0, + sizeof(mntfh->data) - mntfh->size); + + /* + * for proto == XPRT_TRANSPORT_UDP, which is what uses + * to_exponential, implying shift: limit the shift value + * to BITS_PER_LONG (majortimeo is unsigned long) + */ + if (!(data->flags & NFS_MOUNT_TCP)) /* this will be UDP */ + if (data->retrans >= 64) /* shift value is too large */ + goto out_invalid_data; + + /* + * Translate to nfs_fs_context, which nfs_fill_super + * can deal with. + */ + ctx->flags = data->flags & NFS_MOUNT_FLAGMASK; + ctx->flags |= extra_flags; + ctx->rsize = data->rsize; + ctx->wsize = data->wsize; + ctx->timeo = data->timeo; + ctx->retrans = data->retrans; + ctx->acregmin = data->acregmin; + ctx->acregmax = data->acregmax; + ctx->acdirmin = data->acdirmin; + ctx->acdirmax = data->acdirmax; + ctx->need_mount = false; + + if (!is_remount_fc(fc)) { + memcpy(sap, &data->addr, sizeof(data->addr)); + ctx->nfs_server.addrlen = sizeof(data->addr); + ctx->nfs_server.port = ntohs(data->addr.sin_port); + } + + if (sap->ss_family != AF_INET || + !nfs_verify_server_address(sap)) + goto out_no_address; + + if (!(data->flags & NFS_MOUNT_TCP)) + ctx->nfs_server.protocol = XPRT_TRANSPORT_UDP; + /* N.B. caller will free nfs_server.hostname in all cases */ + ctx->nfs_server.hostname = kstrdup(data->hostname, GFP_KERNEL); + if (!ctx->nfs_server.hostname) + goto out_nomem; + + ctx->namlen = data->namlen; + ctx->bsize = data->bsize; + + if (data->flags & NFS_MOUNT_SECFLAVOUR) + ctx->selected_flavor = data->pseudoflavor; + else + ctx->selected_flavor = RPC_AUTH_UNIX; + + if (!(data->flags & NFS_MOUNT_NONLM)) + ctx->flags &= ~(NFS_MOUNT_LOCAL_FLOCK| + NFS_MOUNT_LOCAL_FCNTL); + else + ctx->flags |= (NFS_MOUNT_LOCAL_FLOCK| + NFS_MOUNT_LOCAL_FCNTL); + + /* + * The legacy version 6 binary mount data from userspace has a + * field used only to transport selinux information into the + * kernel. To continue to support that functionality we + * have a touch of selinux knowledge here in the NFS code. The + * userspace code converted context=blah to just blah so we are + * converting back to the full string selinux understands. + */ + if (data->context[0]){ +#ifdef CONFIG_SECURITY_SELINUX + int ret; + + data->context[NFS_MAX_CONTEXT_LEN] = '\0'; + ret = vfs_parse_fs_string(fc, "context", data->context); + if (ret < 0) + return ret; +#else + return -EINVAL; +#endif + } + + break; + default: + goto generic; + } + + ret = nfs_validate_transport_protocol(fc, ctx); + if (ret) + return ret; + + ctx->skip_reconfig_option_check = true; + return 0; + +generic: + return generic_parse_monolithic(fc, data); + +out_no_data: + if (is_remount_fc(fc)) { + ctx->skip_reconfig_option_check = true; + return 0; + } + return nfs_invalf(fc, "NFS: mount program didn't pass any mount data"); + +out_no_v3: + return nfs_invalf(fc, "NFS: nfs_mount_data version does not support v3"); + +out_no_sec: + return nfs_invalf(fc, "NFS: nfs_mount_data version supports only AUTH_SYS"); + +out_nomem: + return -ENOMEM; + +out_no_address: + return nfs_invalf(fc, "NFS: mount program didn't pass remote address"); + +out_invalid_fh: + return nfs_invalf(fc, "NFS: invalid root filehandle"); + +out_invalid_data: + return nfs_invalf(fc, "NFS: invalid binary mount data"); +} + +#if IS_ENABLED(CONFIG_NFS_V4) +struct compat_nfs_string { + compat_uint_t len; + compat_uptr_t data; +}; + +static inline void compat_nfs_string(struct nfs_string *dst, + struct compat_nfs_string *src) +{ + dst->data = compat_ptr(src->data); + dst->len = src->len; +} + +struct compat_nfs4_mount_data_v1 { + compat_int_t version; + compat_int_t flags; + compat_int_t rsize; + compat_int_t wsize; + compat_int_t timeo; + compat_int_t retrans; + compat_int_t acregmin; + compat_int_t acregmax; + compat_int_t acdirmin; + compat_int_t acdirmax; + struct compat_nfs_string client_addr; + struct compat_nfs_string mnt_path; + struct compat_nfs_string hostname; + compat_uint_t host_addrlen; + compat_uptr_t host_addr; + compat_int_t proto; + compat_int_t auth_flavourlen; + compat_uptr_t auth_flavours; +}; + +static void nfs4_compat_mount_data_conv(struct nfs4_mount_data *data) +{ + struct compat_nfs4_mount_data_v1 *compat = + (struct compat_nfs4_mount_data_v1 *)data; + + /* copy the fields backwards */ + data->auth_flavours = compat_ptr(compat->auth_flavours); + data->auth_flavourlen = compat->auth_flavourlen; + data->proto = compat->proto; + data->host_addr = compat_ptr(compat->host_addr); + data->host_addrlen = compat->host_addrlen; + compat_nfs_string(&data->hostname, &compat->hostname); + compat_nfs_string(&data->mnt_path, &compat->mnt_path); + compat_nfs_string(&data->client_addr, &compat->client_addr); + data->acdirmax = compat->acdirmax; + data->acdirmin = compat->acdirmin; + data->acregmax = compat->acregmax; + data->acregmin = compat->acregmin; + data->retrans = compat->retrans; + data->timeo = compat->timeo; + data->wsize = compat->wsize; + data->rsize = compat->rsize; + data->flags = compat->flags; + data->version = compat->version; +} + +/* + * Validate NFSv4 mount options + */ +static int nfs4_parse_monolithic(struct fs_context *fc, + struct nfs4_mount_data *data) +{ + struct nfs_fs_context *ctx = nfs_fc2context(fc); + struct sockaddr_storage *sap = &ctx->nfs_server._address; + int ret; + char *c; + + if (!data) { + if (is_remount_fc(fc)) + goto done; + return nfs_invalf(fc, + "NFS4: mount program didn't pass any mount data"); + } + + ctx->version = 4; + + if (data->version != 1) + return generic_parse_monolithic(fc, data); + + if (in_compat_syscall()) + nfs4_compat_mount_data_conv(data); + + if (data->host_addrlen > sizeof(ctx->nfs_server.address)) + goto out_no_address; + if (data->host_addrlen == 0) + goto out_no_address; + ctx->nfs_server.addrlen = data->host_addrlen; + if (copy_from_user(sap, data->host_addr, data->host_addrlen)) + return -EFAULT; + if (!nfs_verify_server_address(sap)) + goto out_no_address; + ctx->nfs_server.port = ntohs(((struct sockaddr_in *)sap)->sin_port); + + if (data->auth_flavourlen) { + rpc_authflavor_t pseudoflavor; + + if (data->auth_flavourlen > 1) + goto out_inval_auth; + if (copy_from_user(&pseudoflavor, data->auth_flavours, + sizeof(pseudoflavor))) + return -EFAULT; + ctx->selected_flavor = pseudoflavor; + } else { + ctx->selected_flavor = RPC_AUTH_UNIX; + } + + c = strndup_user(data->hostname.data, NFS4_MAXNAMLEN); + if (IS_ERR(c)) + return PTR_ERR(c); + ctx->nfs_server.hostname = c; + + c = strndup_user(data->mnt_path.data, NFS4_MAXPATHLEN); + if (IS_ERR(c)) + return PTR_ERR(c); + ctx->nfs_server.export_path = c; + trace_nfs_mount_path(c); + + c = strndup_user(data->client_addr.data, 16); + if (IS_ERR(c)) + return PTR_ERR(c); + ctx->client_address = c; + + /* + * Translate to nfs_fs_context, which nfs_fill_super + * can deal with. + */ + + ctx->flags = data->flags & NFS4_MOUNT_FLAGMASK; + ctx->rsize = data->rsize; + ctx->wsize = data->wsize; + ctx->timeo = data->timeo; + ctx->retrans = data->retrans; + ctx->acregmin = data->acregmin; + ctx->acregmax = data->acregmax; + ctx->acdirmin = data->acdirmin; + ctx->acdirmax = data->acdirmax; + ctx->nfs_server.protocol = data->proto; + ret = nfs_validate_transport_protocol(fc, ctx); + if (ret) + return ret; +done: + ctx->skip_reconfig_option_check = true; + return 0; + +out_inval_auth: + return nfs_invalf(fc, "NFS4: Invalid number of RPC auth flavours %d", + data->auth_flavourlen); + +out_no_address: + return nfs_invalf(fc, "NFS4: mount program didn't pass remote address"); +} +#endif + +/* + * Parse a monolithic block of data from sys_mount(). + */ +static int nfs_fs_context_parse_monolithic(struct fs_context *fc, + void *data) +{ + if (fc->fs_type == &nfs_fs_type) + return nfs23_parse_monolithic(fc, data); + +#if IS_ENABLED(CONFIG_NFS_V4) + if (fc->fs_type == &nfs4_fs_type) + return nfs4_parse_monolithic(fc, data); +#endif + + return nfs_invalf(fc, "NFS: Unsupported monolithic data version"); +} + +/* + * Validate the preparsed information in the config. + */ +static int nfs_fs_context_validate(struct fs_context *fc) +{ + struct nfs_fs_context *ctx = nfs_fc2context(fc); + struct nfs_subversion *nfs_mod; + struct sockaddr_storage *sap = &ctx->nfs_server._address; + int max_namelen = PAGE_SIZE; + int max_pathlen = NFS_MAXPATHLEN; + int port = 0; + int ret; + + if (!fc->source) + goto out_no_device_name; + + /* Check for sanity first. */ + if (ctx->minorversion && ctx->version != 4) + goto out_minorversion_mismatch; + + if (ctx->options & NFS_OPTION_MIGRATION && + (ctx->version != 4 || ctx->minorversion != 0)) + goto out_migration_misuse; + + /* Verify that any proto=/mountproto= options match the address + * families in the addr=/mountaddr= options. + */ + if (ctx->protofamily != AF_UNSPEC && + ctx->protofamily != ctx->nfs_server.address.sa_family) + goto out_proto_mismatch; + + if (ctx->mountfamily != AF_UNSPEC) { + if (ctx->mount_server.addrlen) { + if (ctx->mountfamily != ctx->mount_server.address.sa_family) + goto out_mountproto_mismatch; + } else { + if (ctx->mountfamily != ctx->nfs_server.address.sa_family) + goto out_mountproto_mismatch; + } + } + + if (!nfs_verify_server_address(sap)) + goto out_no_address; + + ret = nfs_validate_transport_protocol(fc, ctx); + if (ret) + return ret; + + if (ctx->version == 4) { + if (IS_ENABLED(CONFIG_NFS_V4)) { + if (ctx->nfs_server.protocol == XPRT_TRANSPORT_RDMA) + port = NFS_RDMA_PORT; + else + port = NFS_PORT; + max_namelen = NFS4_MAXNAMLEN; + max_pathlen = NFS4_MAXPATHLEN; + ctx->flags &= ~(NFS_MOUNT_NONLM | NFS_MOUNT_NOACL | + NFS_MOUNT_VER3 | NFS_MOUNT_LOCAL_FLOCK | + NFS_MOUNT_LOCAL_FCNTL); + } else { + goto out_v4_not_compiled; + } + } else { + nfs_set_mount_transport_protocol(ctx); + if (ctx->nfs_server.protocol == XPRT_TRANSPORT_RDMA) + port = NFS_RDMA_PORT; + } + + nfs_set_port(sap, &ctx->nfs_server.port, port); + + ret = nfs_parse_source(fc, max_namelen, max_pathlen); + if (ret < 0) + return ret; + + /* Load the NFS protocol module if we haven't done so yet */ + if (!ctx->nfs_mod) { + nfs_mod = find_nfs_version(ctx->version); + if (IS_ERR(nfs_mod)) { + ret = PTR_ERR(nfs_mod); + goto out_version_unavailable; + } + ctx->nfs_mod = nfs_mod; + } + + /* Ensure the filesystem context has the correct fs_type */ + if (fc->fs_type != ctx->nfs_mod->nfs_fs) { + module_put(fc->fs_type->owner); + __module_get(ctx->nfs_mod->nfs_fs->owner); + fc->fs_type = ctx->nfs_mod->nfs_fs; + } + return 0; + +out_no_device_name: + return nfs_invalf(fc, "NFS: Device name not specified"); +out_v4_not_compiled: + nfs_errorf(fc, "NFS: NFSv4 is not compiled into kernel"); + return -EPROTONOSUPPORT; +out_no_address: + return nfs_invalf(fc, "NFS: mount program didn't pass remote address"); +out_mountproto_mismatch: + return nfs_invalf(fc, "NFS: Mount server address does not match mountproto= option"); +out_proto_mismatch: + return nfs_invalf(fc, "NFS: Server address does not match proto= option"); +out_minorversion_mismatch: + return nfs_invalf(fc, "NFS: Mount option vers=%u does not support minorversion=%u", + ctx->version, ctx->minorversion); +out_migration_misuse: + return nfs_invalf(fc, "NFS: 'Migration' not supported for this NFS version"); +out_version_unavailable: + nfs_errorf(fc, "NFS: Version unavailable"); + return ret; +} + +/* + * Create an NFS superblock by the appropriate method. + */ +static int nfs_get_tree(struct fs_context *fc) +{ + struct nfs_fs_context *ctx = nfs_fc2context(fc); + int err = nfs_fs_context_validate(fc); + + if (err) + return err; + if (!ctx->internal) + return ctx->nfs_mod->rpc_ops->try_get_tree(fc); + else + return nfs_get_tree_common(fc); +} + +/* + * Handle duplication of a configuration. The caller copied *src into *sc, but + * it can't deal with resource pointers in the filesystem context, so we have + * to do that. We need to clear pointers, copy data or get extra refs as + * appropriate. + */ +static int nfs_fs_context_dup(struct fs_context *fc, struct fs_context *src_fc) +{ + struct nfs_fs_context *src = nfs_fc2context(src_fc), *ctx; + + ctx = kmemdup(src, sizeof(struct nfs_fs_context), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + ctx->mntfh = nfs_alloc_fhandle(); + if (!ctx->mntfh) { + kfree(ctx); + return -ENOMEM; + } + nfs_copy_fh(ctx->mntfh, src->mntfh); + + get_nfs_version(ctx->nfs_mod); + ctx->client_address = NULL; + ctx->mount_server.hostname = NULL; + ctx->nfs_server.export_path = NULL; + ctx->nfs_server.hostname = NULL; + ctx->fscache_uniq = NULL; + ctx->clone_data.fattr = NULL; + fc->fs_private = ctx; + return 0; +} + +static void nfs_fs_context_free(struct fs_context *fc) +{ + struct nfs_fs_context *ctx = nfs_fc2context(fc); + + if (ctx) { + if (ctx->server) + nfs_free_server(ctx->server); + if (ctx->nfs_mod) + put_nfs_version(ctx->nfs_mod); + kfree(ctx->client_address); + kfree(ctx->mount_server.hostname); + kfree(ctx->nfs_server.export_path); + kfree(ctx->nfs_server.hostname); + kfree(ctx->fscache_uniq); + nfs_free_fhandle(ctx->mntfh); + nfs_free_fattr(ctx->clone_data.fattr); + kfree(ctx); + } +} + +static const struct fs_context_operations nfs_fs_context_ops = { + .free = nfs_fs_context_free, + .dup = nfs_fs_context_dup, + .parse_param = nfs_fs_context_parse_param, + .parse_monolithic = nfs_fs_context_parse_monolithic, + .get_tree = nfs_get_tree, + .reconfigure = nfs_reconfigure, +}; + +/* + * Prepare superblock configuration. We use the namespaces attached to the + * context. This may be the current process's namespaces, or it may be a + * container's namespaces. + */ +static int nfs_init_fs_context(struct fs_context *fc) +{ + struct nfs_fs_context *ctx; + + ctx = kzalloc(sizeof(struct nfs_fs_context), GFP_KERNEL); + if (unlikely(!ctx)) + return -ENOMEM; + + ctx->mntfh = nfs_alloc_fhandle(); + if (unlikely(!ctx->mntfh)) { + kfree(ctx); + return -ENOMEM; + } + + ctx->protofamily = AF_UNSPEC; + ctx->mountfamily = AF_UNSPEC; + ctx->mount_server.port = NFS_UNSPEC_PORT; + + if (fc->root) { + /* reconfigure, start with the current config */ + struct nfs_server *nfss = fc->root->d_sb->s_fs_info; + struct net *net = nfss->nfs_client->cl_net; + + ctx->flags = nfss->flags; + ctx->rsize = nfss->rsize; + ctx->wsize = nfss->wsize; + ctx->retrans = nfss->client->cl_timeout->to_retries; + ctx->selected_flavor = nfss->client->cl_auth->au_flavor; + ctx->acregmin = nfss->acregmin / HZ; + ctx->acregmax = nfss->acregmax / HZ; + ctx->acdirmin = nfss->acdirmin / HZ; + ctx->acdirmax = nfss->acdirmax / HZ; + ctx->timeo = 10U * nfss->client->cl_timeout->to_initval / HZ; + ctx->nfs_server.port = nfss->port; + ctx->nfs_server.addrlen = nfss->nfs_client->cl_addrlen; + ctx->version = nfss->nfs_client->rpc_ops->version; + ctx->minorversion = nfss->nfs_client->cl_minorversion; + + memcpy(&ctx->nfs_server._address, &nfss->nfs_client->cl_addr, + ctx->nfs_server.addrlen); + + if (fc->net_ns != net) { + put_net(fc->net_ns); + fc->net_ns = get_net(net); + } + + ctx->nfs_mod = nfss->nfs_client->cl_nfs_mod; + get_nfs_version(ctx->nfs_mod); + } else { + /* defaults */ + ctx->timeo = NFS_UNSPEC_TIMEO; + ctx->retrans = NFS_UNSPEC_RETRANS; + ctx->acregmin = NFS_DEF_ACREGMIN; + ctx->acregmax = NFS_DEF_ACREGMAX; + ctx->acdirmin = NFS_DEF_ACDIRMIN; + ctx->acdirmax = NFS_DEF_ACDIRMAX; + ctx->nfs_server.port = NFS_UNSPEC_PORT; + ctx->nfs_server.protocol = XPRT_TRANSPORT_TCP; + ctx->selected_flavor = RPC_AUTH_MAXFLAVOR; + ctx->minorversion = 0; + ctx->need_mount = true; + ctx->xprtsec.policy = RPC_XPRTSEC_NONE; + ctx->xprtsec.cert_serial = TLS_NO_CERT; + ctx->xprtsec.privkey_serial = TLS_NO_PRIVKEY; + + if (fc->net_ns != &init_net) + ctx->flags |= NFS_MOUNT_NETUNREACH_FATAL; + + fc->s_iflags |= SB_I_STABLE_WRITES; + } + fc->fs_private = ctx; + fc->ops = &nfs_fs_context_ops; + return 0; +} + +struct file_system_type nfs_fs_type = { + .owner = THIS_MODULE, + .name = "nfs", + .init_fs_context = nfs_init_fs_context, + .parameters = nfs_fs_parameters, + .kill_sb = nfs_kill_super, + .fs_flags = FS_RENAME_DOES_D_MOVE|FS_BINARY_MOUNTDATA, +}; +MODULE_ALIAS_FS("nfs"); +EXPORT_SYMBOL_GPL(nfs_fs_type); + +#if IS_ENABLED(CONFIG_NFS_V4) +struct file_system_type nfs4_fs_type = { + .owner = THIS_MODULE, + .name = "nfs4", + .init_fs_context = nfs_init_fs_context, + .parameters = nfs_fs_parameters, + .kill_sb = nfs_kill_super, + .fs_flags = FS_RENAME_DOES_D_MOVE|FS_BINARY_MOUNTDATA, +}; +MODULE_ALIAS_FS("nfs4"); +MODULE_ALIAS("nfs4"); +EXPORT_SYMBOL_GPL(nfs4_fs_type); +#endif /* CONFIG_NFS_V4 */ diff --git a/fs/nfs/fscache-index.c b/fs/nfs/fscache-index.c deleted file mode 100644 index 777b055063f6..000000000000 --- a/fs/nfs/fscache-index.c +++ /dev/null @@ -1,336 +0,0 @@ -/* NFS FS-Cache index structure definition - * - * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. - * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public Licence - * as published by the Free Software Foundation; either version - * 2 of the Licence, or (at your option) any later version. - */ - -#include <linux/init.h> -#include <linux/kernel.h> -#include <linux/sched.h> -#include <linux/mm.h> -#include <linux/nfs_fs.h> -#include <linux/nfs_fs_sb.h> -#include <linux/in6.h> - -#include "internal.h" -#include "fscache.h" - -#define NFSDBG_FACILITY NFSDBG_FSCACHE - -/* - * Define the NFS filesystem for FS-Cache. Upon registration FS-Cache sticks - * the cookie for the top-level index object for NFS into here. The top-level - * index can than have other cache objects inserted into it. - */ -struct fscache_netfs nfs_fscache_netfs = { - .name = "nfs", - .version = 0, -}; - -/* - * Register NFS for caching - */ -int nfs_fscache_register(void) -{ - return fscache_register_netfs(&nfs_fscache_netfs); -} - -/* - * Unregister NFS for caching - */ -void nfs_fscache_unregister(void) -{ - fscache_unregister_netfs(&nfs_fscache_netfs); -} - -/* - * Layout of the key for an NFS server cache object. - */ -struct nfs_server_key { - uint16_t nfsversion; /* NFS protocol version */ - uint16_t family; /* address family */ - uint16_t port; /* IP port */ - union { - struct in_addr ipv4_addr; /* IPv4 address */ - struct in6_addr ipv6_addr; /* IPv6 address */ - } addr[0]; -}; - -/* - * Generate a key to describe a server in the main NFS index - * - We return the length of the key, or 0 if we can't generate one - */ -static uint16_t nfs_server_get_key(const void *cookie_netfs_data, - void *buffer, uint16_t bufmax) -{ - const struct nfs_client *clp = cookie_netfs_data; - const struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) &clp->cl_addr; - const struct sockaddr_in *sin = (struct sockaddr_in *) &clp->cl_addr; - struct nfs_server_key *key = buffer; - uint16_t len = sizeof(struct nfs_server_key); - - memset(key, 0, len); - key->nfsversion = clp->rpc_ops->version; - key->family = clp->cl_addr.ss_family; - - switch (clp->cl_addr.ss_family) { - case AF_INET: - key->port = sin->sin_port; - key->addr[0].ipv4_addr = sin->sin_addr; - len += sizeof(key->addr[0].ipv4_addr); - break; - - case AF_INET6: - key->port = sin6->sin6_port; - key->addr[0].ipv6_addr = sin6->sin6_addr; - len += sizeof(key->addr[0].ipv6_addr); - break; - - default: - printk(KERN_WARNING "NFS: Unknown network family '%d'\n", - clp->cl_addr.ss_family); - len = 0; - break; - } - - return len; -} - -/* - * Define the server object for FS-Cache. This is used to describe a server - * object to fscache_acquire_cookie(). It is keyed by the NFS protocol and - * server address parameters. - */ -const struct fscache_cookie_def nfs_fscache_server_index_def = { - .name = "NFS.server", - .type = FSCACHE_COOKIE_TYPE_INDEX, - .get_key = nfs_server_get_key, -}; - -/* - * Generate a key to describe a superblock key in the main NFS index - */ -static uint16_t nfs_super_get_key(const void *cookie_netfs_data, - void *buffer, uint16_t bufmax) -{ - const struct nfs_fscache_key *key; - const struct nfs_server *nfss = cookie_netfs_data; - uint16_t len; - - key = nfss->fscache_key; - len = sizeof(key->key) + key->key.uniq_len; - if (len > bufmax) { - len = 0; - } else { - memcpy(buffer, &key->key, sizeof(key->key)); - memcpy(buffer + sizeof(key->key), - key->key.uniquifier, key->key.uniq_len); - } - - return len; -} - -/* - * Define the superblock object for FS-Cache. This is used to describe a - * superblock object to fscache_acquire_cookie(). It is keyed by all the NFS - * parameters that might cause a separate superblock. - */ -const struct fscache_cookie_def nfs_fscache_super_index_def = { - .name = "NFS.super", - .type = FSCACHE_COOKIE_TYPE_INDEX, - .get_key = nfs_super_get_key, -}; - -/* - * Definition of the auxiliary data attached to NFS inode storage objects - * within the cache. - * - * The contents of this struct are recorded in the on-disk local cache in the - * auxiliary data attached to the data storage object backing an inode. This - * permits coherency to be managed when a new inode binds to an already extant - * cache object. - */ -struct nfs_fscache_inode_auxdata { - struct timespec mtime; - struct timespec ctime; - loff_t size; - u64 change_attr; -}; - -/* - * Generate a key to describe an NFS inode in an NFS server's index - */ -static uint16_t nfs_fscache_inode_get_key(const void *cookie_netfs_data, - void *buffer, uint16_t bufmax) -{ - const struct nfs_inode *nfsi = cookie_netfs_data; - uint16_t nsize; - - /* use the inode's NFS filehandle as the key */ - nsize = nfsi->fh.size; - memcpy(buffer, nfsi->fh.data, nsize); - return nsize; -} - -/* - * Get certain file attributes from the netfs data - * - This function can be absent for an index - * - Not permitted to return an error - * - The netfs data from the cookie being used as the source is presented - */ -static void nfs_fscache_inode_get_attr(const void *cookie_netfs_data, - uint64_t *size) -{ - const struct nfs_inode *nfsi = cookie_netfs_data; - - *size = nfsi->vfs_inode.i_size; -} - -/* - * Get the auxiliary data from netfs data - * - This function can be absent if the index carries no state data - * - Should store the auxiliary data in the buffer - * - Should return the amount of amount stored - * - Not permitted to return an error - * - The netfs data from the cookie being used as the source is presented - */ -static uint16_t nfs_fscache_inode_get_aux(const void *cookie_netfs_data, - void *buffer, uint16_t bufmax) -{ - struct nfs_fscache_inode_auxdata auxdata; - const struct nfs_inode *nfsi = cookie_netfs_data; - - memset(&auxdata, 0, sizeof(auxdata)); - auxdata.size = nfsi->vfs_inode.i_size; - auxdata.mtime = nfsi->vfs_inode.i_mtime; - auxdata.ctime = nfsi->vfs_inode.i_ctime; - - if (NFS_SERVER(&nfsi->vfs_inode)->nfs_client->rpc_ops->version == 4) - auxdata.change_attr = nfsi->vfs_inode.i_version; - - if (bufmax > sizeof(auxdata)) - bufmax = sizeof(auxdata); - - memcpy(buffer, &auxdata, bufmax); - return bufmax; -} - -/* - * Consult the netfs about the state of an object - * - This function can be absent if the index carries no state data - * - The netfs data from the cookie being used as the target is - * presented, as is the auxiliary data - */ -static -enum fscache_checkaux nfs_fscache_inode_check_aux(void *cookie_netfs_data, - const void *data, - uint16_t datalen) -{ - struct nfs_fscache_inode_auxdata auxdata; - struct nfs_inode *nfsi = cookie_netfs_data; - - if (datalen != sizeof(auxdata)) - return FSCACHE_CHECKAUX_OBSOLETE; - - memset(&auxdata, 0, sizeof(auxdata)); - auxdata.size = nfsi->vfs_inode.i_size; - auxdata.mtime = nfsi->vfs_inode.i_mtime; - auxdata.ctime = nfsi->vfs_inode.i_ctime; - - if (NFS_SERVER(&nfsi->vfs_inode)->nfs_client->rpc_ops->version == 4) - auxdata.change_attr = nfsi->vfs_inode.i_version; - - if (memcmp(data, &auxdata, datalen) != 0) - return FSCACHE_CHECKAUX_OBSOLETE; - - return FSCACHE_CHECKAUX_OKAY; -} - -/* - * Indication from FS-Cache that the cookie is no longer cached - * - This function is called when the backing store currently caching a cookie - * is removed - * - The netfs should use this to clean up any markers indicating cached pages - * - This is mandatory for any object that may have data - */ -static void nfs_fscache_inode_now_uncached(void *cookie_netfs_data) -{ - struct nfs_inode *nfsi = cookie_netfs_data; - struct pagevec pvec; - pgoff_t first; - int loop, nr_pages; - - pagevec_init(&pvec, 0); - first = 0; - - dprintk("NFS: nfs_inode_now_uncached: nfs_inode 0x%p\n", nfsi); - - for (;;) { - /* grab a bunch of pages to unmark */ - nr_pages = pagevec_lookup(&pvec, - nfsi->vfs_inode.i_mapping, - first, - PAGEVEC_SIZE - pagevec_count(&pvec)); - if (!nr_pages) - break; - - for (loop = 0; loop < nr_pages; loop++) - ClearPageFsCache(pvec.pages[loop]); - - first = pvec.pages[nr_pages - 1]->index + 1; - - pvec.nr = nr_pages; - pagevec_release(&pvec); - cond_resched(); - } -} - -/* - * Get an extra reference on a read context. - * - This function can be absent if the completion function doesn't require a - * context. - * - The read context is passed back to NFS in the event that a data read on the - * cache fails with EIO - in which case the server must be contacted to - * retrieve the data, which requires the read context for security. - */ -static void nfs_fh_get_context(void *cookie_netfs_data, void *context) -{ - get_nfs_open_context(context); -} - -/* - * Release an extra reference on a read context. - * - This function can be absent if the completion function doesn't require a - * context. - */ -static void nfs_fh_put_context(void *cookie_netfs_data, void *context) -{ - if (context) - put_nfs_open_context(context); -} - -/* - * Define the inode object for FS-Cache. This is used to describe an inode - * object to fscache_acquire_cookie(). It is keyed by the NFS file handle for - * an inode. - * - * Coherency is managed by comparing the copies of i_size, i_mtime and i_ctime - * held in the cache auxiliary data for the data storage object with those in - * the inode struct in memory. - */ -const struct fscache_cookie_def nfs_fscache_inode_object_def = { - .name = "NFS.fh", - .type = FSCACHE_COOKIE_TYPE_DATAFILE, - .get_key = nfs_fscache_inode_get_key, - .get_attr = nfs_fscache_inode_get_attr, - .get_aux = nfs_fscache_inode_get_aux, - .check_aux = nfs_fscache_inode_check_aux, - .now_uncached = nfs_fscache_inode_now_uncached, - .get_context = nfs_fh_get_context, - .put_context = nfs_fh_put_context, -}; diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c index d63bea8bbfbb..8b0785178731 100644 --- a/fs/nfs/fscache.c +++ b/fs/nfs/fscache.c @@ -1,12 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* NFS filesystem cache interface * * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public Licence - * as published by the Free Software Foundation; either version - * 2 of the Licence, or (at your option) any later version. */ #include <linux/init.h> @@ -18,15 +14,28 @@ #include <linux/in6.h> #include <linux/seq_file.h> #include <linux/slab.h> +#include <linux/iversion.h> +#include <linux/xarray.h> +#include <linux/fscache.h> +#include <linux/netfs.h> #include "internal.h" #include "iostat.h" #include "fscache.h" +#include "nfstrace.h" -#define NFSDBG_FACILITY NFSDBG_FSCACHE +#define NFS_MAX_KEY_LEN 1000 -static struct rb_root nfs_fscache_keys = RB_ROOT; -static DEFINE_SPINLOCK(nfs_fscache_keys_lock); +static bool nfs_append_int(char *key, int *_len, unsigned long long x) +{ + if (*_len > NFS_MAX_KEY_LEN) + return false; + if (x == 0) + key[(*_len)++] = ','; + else + *_len += sprintf(key + *_len, ",%llx", x); + return true; +} /* * Get the per-client index cookie for an NFS client if the appropriate mount @@ -34,123 +43,106 @@ static DEFINE_SPINLOCK(nfs_fscache_keys_lock); * - We always try and get an index cookie for the client, but get filehandle * cookies on a per-superblock basis, depending on the mount flags */ -void nfs_fscache_get_client_cookie(struct nfs_client *clp) -{ - /* create a cache index for looking up filehandles */ - clp->fscache = fscache_acquire_cookie(nfs_fscache_netfs.primary_index, - &nfs_fscache_server_index_def, - clp, true); - dfprintk(FSCACHE, "NFS: get client cookie (0x%p/0x%p)\n", - clp, clp->fscache); -} - -/* - * Dispose of a per-client cookie - */ -void nfs_fscache_release_client_cookie(struct nfs_client *clp) +static bool nfs_fscache_get_client_key(struct nfs_client *clp, + char *key, int *_len) { - dfprintk(FSCACHE, "NFS: releasing client cookie (0x%p/0x%p)\n", - clp, clp->fscache); + const struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) &clp->cl_addr; + const struct sockaddr_in *sin = (struct sockaddr_in *) &clp->cl_addr; + + *_len += snprintf(key + *_len, NFS_MAX_KEY_LEN - *_len, + ",%u.%u,%x", + clp->rpc_ops->version, + clp->cl_minorversion, + clp->cl_addr.ss_family); + + switch (clp->cl_addr.ss_family) { + case AF_INET: + if (!nfs_append_int(key, _len, sin->sin_port) || + !nfs_append_int(key, _len, sin->sin_addr.s_addr)) + return false; + return true; + + case AF_INET6: + if (!nfs_append_int(key, _len, sin6->sin6_port) || + !nfs_append_int(key, _len, sin6->sin6_addr.s6_addr32[0]) || + !nfs_append_int(key, _len, sin6->sin6_addr.s6_addr32[1]) || + !nfs_append_int(key, _len, sin6->sin6_addr.s6_addr32[2]) || + !nfs_append_int(key, _len, sin6->sin6_addr.s6_addr32[3])) + return false; + return true; - fscache_relinquish_cookie(clp->fscache, 0); - clp->fscache = NULL; + default: + printk(KERN_WARNING "NFS: Unknown network family '%d'\n", + clp->cl_addr.ss_family); + return false; + } } /* - * Get the cache cookie for an NFS superblock. We have to handle - * uniquification here because the cache doesn't do it for us. + * Get the cache cookie for an NFS superblock. * * The default uniquifier is just an empty string, but it may be overridden * either by the 'fsc=xxx' option to mount, or by inheriting it from the parent * superblock across an automount point of some nature. */ -void nfs_fscache_get_super_cookie(struct super_block *sb, const char *uniq, int ulen) +int nfs_fscache_get_super_cookie(struct super_block *sb, const char *uniq, int ulen) { - struct nfs_fscache_key *key, *xkey; + struct fscache_volume *vcookie; struct nfs_server *nfss = NFS_SB(sb); - struct rb_node **p, *parent; - int diff; + unsigned int len = 3; + char *key; - if (!uniq) { - uniq = ""; - ulen = 1; + if (uniq) { + nfss->fscache_uniq = kmemdup_nul(uniq, ulen, GFP_KERNEL); + if (!nfss->fscache_uniq) + return -ENOMEM; } - key = kzalloc(sizeof(*key) + ulen, GFP_KERNEL); + key = kmalloc(NFS_MAX_KEY_LEN + 24, GFP_KERNEL); if (!key) - return; - - key->nfs_client = nfss->nfs_client; - key->key.super.s_flags = sb->s_flags & NFS_MS_MASK; - key->key.nfs_server.flags = nfss->flags; - key->key.nfs_server.rsize = nfss->rsize; - key->key.nfs_server.wsize = nfss->wsize; - key->key.nfs_server.acregmin = nfss->acregmin; - key->key.nfs_server.acregmax = nfss->acregmax; - key->key.nfs_server.acdirmin = nfss->acdirmin; - key->key.nfs_server.acdirmax = nfss->acdirmax; - key->key.nfs_server.fsid = nfss->fsid; - key->key.rpc_auth.au_flavor = nfss->client->cl_auth->au_flavor; - - key->key.uniq_len = ulen; - memcpy(key->key.uniquifier, uniq, ulen); - - spin_lock(&nfs_fscache_keys_lock); - p = &nfs_fscache_keys.rb_node; - parent = NULL; - while (*p) { - parent = *p; - xkey = rb_entry(parent, struct nfs_fscache_key, node); - - if (key->nfs_client < xkey->nfs_client) - goto go_left; - if (key->nfs_client > xkey->nfs_client) - goto go_right; - - diff = memcmp(&key->key, &xkey->key, sizeof(key->key)); - if (diff < 0) - goto go_left; - if (diff > 0) - goto go_right; - - if (key->key.uniq_len == 0) - goto non_unique; - diff = memcmp(key->key.uniquifier, - xkey->key.uniquifier, - key->key.uniq_len); - if (diff < 0) - goto go_left; - if (diff > 0) - goto go_right; - goto non_unique; - - go_left: - p = &(*p)->rb_left; - continue; - go_right: - p = &(*p)->rb_right; + return -ENOMEM; + + memcpy(key, "nfs", 3); + if (!nfs_fscache_get_client_key(nfss->nfs_client, key, &len) || + !nfs_append_int(key, &len, nfss->fsid.major) || + !nfs_append_int(key, &len, nfss->fsid.minor) || + !nfs_append_int(key, &len, sb->s_flags & NFS_SB_MASK) || + !nfs_append_int(key, &len, nfss->flags) || + !nfs_append_int(key, &len, nfss->rsize) || + !nfs_append_int(key, &len, nfss->wsize) || + !nfs_append_int(key, &len, nfss->acregmin) || + !nfs_append_int(key, &len, nfss->acregmax) || + !nfs_append_int(key, &len, nfss->acdirmin) || + !nfs_append_int(key, &len, nfss->acdirmax) || + !nfs_append_int(key, &len, nfss->client->cl_auth->au_flavor)) + goto out; + + if (ulen > 0) { + if (ulen > NFS_MAX_KEY_LEN - len) + goto out; + key[len++] = ','; + memcpy(key + len, uniq, ulen); + len += ulen; } - - rb_link_node(&key->node, parent, p); - rb_insert_color(&key->node, &nfs_fscache_keys); - spin_unlock(&nfs_fscache_keys_lock); - nfss->fscache_key = key; + key[len] = 0; /* create a cache index for looking up filehandles */ - nfss->fscache = fscache_acquire_cookie(nfss->nfs_client->fscache, - &nfs_fscache_super_index_def, - nfss, true); - dfprintk(FSCACHE, "NFS: get superblock cookie (0x%p/0x%p)\n", - nfss, nfss->fscache); - return; - -non_unique: - spin_unlock(&nfs_fscache_keys_lock); + vcookie = fscache_acquire_volume(key, + NULL, /* preferred_cache */ + NULL, 0 /* coherency_data */); + if (IS_ERR(vcookie)) { + if (vcookie != ERR_PTR(-EBUSY)) { + kfree(key); + return PTR_ERR(vcookie); + } + pr_err("NFS: Cache volume key already in use (%s)\n", key); + vcookie = NULL; + } + nfss->fscache = vcookie; + +out: kfree(key); - nfss->fscache_key = NULL; - nfss->fscache = NULL; - printk(KERN_WARNING "NFS:" - " Cache request denied due to non-unique superblock keys\n"); + return 0; } /* @@ -160,19 +152,9 @@ void nfs_fscache_release_super_cookie(struct super_block *sb) { struct nfs_server *nfss = NFS_SB(sb); - dfprintk(FSCACHE, "NFS: releasing superblock cookie (0x%p/0x%p)\n", - nfss, nfss->fscache); - - fscache_relinquish_cookie(nfss->fscache, 0); + fscache_relinquish_volume(nfss->fscache, NULL, false); nfss->fscache = NULL; - - if (nfss->fscache_key) { - spin_lock(&nfs_fscache_keys_lock); - rb_erase(&nfss->fscache_key->node, &nfs_fscache_keys); - spin_unlock(&nfs_fscache_keys_lock); - kfree(nfss->fscache_key); - nfss->fscache_key = NULL; - } + kfree(nfss->fscache_uniq); } /* @@ -180,14 +162,27 @@ void nfs_fscache_release_super_cookie(struct super_block *sb) */ void nfs_fscache_init_inode(struct inode *inode) { + struct nfs_fscache_inode_auxdata auxdata; + struct nfs_server *nfss = NFS_SERVER(inode); struct nfs_inode *nfsi = NFS_I(inode); - nfsi->fscache = NULL; - if (!S_ISREG(inode->i_mode)) + netfs_inode(inode)->cache = NULL; + if (!(nfss->fscache && S_ISREG(inode->i_mode))) return; - nfsi->fscache = fscache_acquire_cookie(NFS_SB(inode->i_sb)->fscache, - &nfs_fscache_inode_object_def, - nfsi, false); + + nfs_fscache_update_auxdata(&auxdata, inode); + + netfs_inode(inode)->cache = fscache_acquire_cookie( + nfss->fscache, + 0, + nfsi->fh.data, /* index_key */ + nfsi->fh.size, + &auxdata, /* aux_data */ + sizeof(auxdata), + i_size_read(inode)); + + if (netfs_inode(inode)->cache) + mapping_set_release_always(inode->i_mapping); } /* @@ -195,20 +190,8 @@ void nfs_fscache_init_inode(struct inode *inode) */ void nfs_fscache_clear_inode(struct inode *inode) { - struct nfs_inode *nfsi = NFS_I(inode); - struct fscache_cookie *cookie = nfs_i_fscache(inode); - - dfprintk(FSCACHE, "NFS: clear cookie (0x%p/0x%p)\n", nfsi, cookie); - - fscache_relinquish_cookie(cookie, false); - nfsi->fscache = NULL; -} - -static bool nfs_fscache_can_enable(void *data) -{ - struct inode *inode = data; - - return !inode_is_open_for_write(inode); + fscache_relinquish_cookie(netfs_i_cookie(netfs_inode(inode)), false); + netfs_inode(inode)->cache = NULL; } /* @@ -232,208 +215,173 @@ static bool nfs_fscache_can_enable(void *data) */ void nfs_fscache_open_file(struct inode *inode, struct file *filp) { - struct nfs_inode *nfsi = NFS_I(inode); - struct fscache_cookie *cookie = nfs_i_fscache(inode); + struct nfs_fscache_inode_auxdata auxdata; + struct fscache_cookie *cookie = netfs_i_cookie(netfs_inode(inode)); + bool open_for_write = inode_is_open_for_write(inode); if (!fscache_cookie_valid(cookie)) return; - if (inode_is_open_for_write(inode)) { - dfprintk(FSCACHE, "NFS: nfsi 0x%p disabling cache\n", nfsi); - clear_bit(NFS_INO_FSCACHE, &nfsi->flags); - fscache_disable_cookie(cookie, true); - fscache_uncache_all_inode_pages(cookie, inode); - } else { - dfprintk(FSCACHE, "NFS: nfsi 0x%p enabling cache\n", nfsi); - fscache_enable_cookie(cookie, nfs_fscache_can_enable, inode); - if (fscache_cookie_enabled(cookie)) - set_bit(NFS_INO_FSCACHE, &NFS_I(inode)->flags); + fscache_use_cookie(cookie, open_for_write); + if (open_for_write) { + nfs_fscache_update_auxdata(&auxdata, inode); + fscache_invalidate(cookie, &auxdata, i_size_read(inode), + FSCACHE_INVAL_DIO_WRITE); } } EXPORT_SYMBOL_GPL(nfs_fscache_open_file); -/* - * Release the caching state associated with a page, if the page isn't busy - * interacting with the cache. - * - Returns true (can release page) or false (page busy). - */ -int nfs_fscache_release_page(struct page *page, gfp_t gfp) +void nfs_fscache_release_file(struct inode *inode, struct file *filp) { - if (PageFsCache(page)) { - struct fscache_cookie *cookie = nfs_i_fscache(page->mapping->host); + struct nfs_fscache_inode_auxdata auxdata; + struct fscache_cookie *cookie = netfs_i_cookie(netfs_inode(inode)); + loff_t i_size = i_size_read(inode); - BUG_ON(!cookie); - dfprintk(FSCACHE, "NFS: fscache releasepage (0x%p/0x%p/0x%p)\n", - cookie, page, NFS_I(page->mapping->host)); + nfs_fscache_update_auxdata(&auxdata, inode); + fscache_unuse_cookie(cookie, &auxdata, &i_size); +} - if (!fscache_maybe_release_page(cookie, page, gfp)) - return 0; +int nfs_netfs_read_folio(struct file *file, struct folio *folio) +{ + if (!netfs_inode(folio_inode(folio))->cache) + return -ENOBUFS; - nfs_inc_fscache_stats(page->mapping->host, - NFSIOS_FSCACHE_PAGES_UNCACHED); - } + return netfs_read_folio(file, folio); +} - return 1; +int nfs_netfs_readahead(struct readahead_control *ractl) +{ + struct inode *inode = ractl->mapping->host; + + if (!netfs_inode(inode)->cache) + return -ENOBUFS; + + netfs_readahead(ractl); + return 0; } -/* - * Release the caching state associated with a page if undergoing complete page - * invalidation. - */ -void __nfs_fscache_invalidate_page(struct page *page, struct inode *inode) +static atomic_t nfs_netfs_debug_id; +static int nfs_netfs_init_request(struct netfs_io_request *rreq, struct file *file) { - struct fscache_cookie *cookie = nfs_i_fscache(inode); + if (!file) { + if (WARN_ON_ONCE(rreq->origin != NETFS_PGPRIV2_COPY_TO_CACHE)) + return -EIO; + return 0; + } - BUG_ON(!cookie); + rreq->netfs_priv = get_nfs_open_context(nfs_file_open_context(file)); + rreq->debug_id = atomic_inc_return(&nfs_netfs_debug_id); + /* [DEPRECATED] Use PG_private_2 to mark folio being written to the cache. */ + __set_bit(NETFS_RREQ_USE_PGPRIV2, &rreq->flags); + rreq->io_streams[0].sreq_max_len = NFS_SB(rreq->inode->i_sb)->rsize; - dfprintk(FSCACHE, "NFS: fscache invalidatepage (0x%p/0x%p/0x%p)\n", - cookie, page, NFS_I(inode)); + return 0; +} - fscache_wait_on_page_write(cookie, page); +static void nfs_netfs_free_request(struct netfs_io_request *rreq) +{ + if (rreq->netfs_priv) + put_nfs_open_context(rreq->netfs_priv); +} - BUG_ON(!PageLocked(page)); - fscache_uncache_page(cookie, page); - nfs_inc_fscache_stats(page->mapping->host, - NFSIOS_FSCACHE_PAGES_UNCACHED); +static struct nfs_netfs_io_data *nfs_netfs_alloc(struct netfs_io_subrequest *sreq) +{ + struct nfs_netfs_io_data *netfs; + + netfs = kzalloc(sizeof(*netfs), GFP_KERNEL_ACCOUNT); + if (!netfs) + return NULL; + netfs->sreq = sreq; + refcount_set(&netfs->refcount, 1); + return netfs; } -/* - * Handle completion of a page being read from the cache. - * - Called in process (keventd) context. - */ -static void nfs_readpage_from_fscache_complete(struct page *page, - void *context, - int error) +static void nfs_netfs_issue_read(struct netfs_io_subrequest *sreq) { - dfprintk(FSCACHE, - "NFS: readpage_from_fscache_complete (0x%p/0x%p/%d)\n", - page, context, error); - - /* if the read completes with an error, we just unlock the page and let - * the VM reissue the readpage */ - if (!error) { - SetPageUptodate(page); - unlock_page(page); - } else { - error = nfs_readpage_async(context, page->mapping->host, page); - if (error) - unlock_page(page); + struct nfs_netfs_io_data *netfs; + struct nfs_pageio_descriptor pgio; + struct inode *inode = sreq->rreq->inode; + struct nfs_open_context *ctx = sreq->rreq->netfs_priv; + struct page *page; + unsigned long idx; + pgoff_t start, last; + int err; + + start = (sreq->start + sreq->transferred) >> PAGE_SHIFT; + last = ((sreq->start + sreq->len - sreq->transferred - 1) >> PAGE_SHIFT); + + nfs_pageio_init_read(&pgio, inode, false, + &nfs_async_read_completion_ops); + + netfs = nfs_netfs_alloc(sreq); + if (!netfs) { + sreq->error = -ENOMEM; + return netfs_read_subreq_terminated(sreq); } + + pgio.pg_netfs = netfs; /* used in completion */ + + xa_for_each_range(&sreq->rreq->mapping->i_pages, idx, page, start, last) { + /* nfs_read_add_folio() may schedule() due to pNFS layout and other RPCs */ + err = nfs_read_add_folio(&pgio, ctx, page_folio(page)); + if (err < 0) { + netfs->error = err; + goto out; + } + } +out: + nfs_pageio_complete_read(&pgio); + nfs_netfs_put(netfs); } -/* - * Retrieve a page from fscache - */ -int __nfs_readpage_from_fscache(struct nfs_open_context *ctx, - struct inode *inode, struct page *page) +void nfs_netfs_initiate_read(struct nfs_pgio_header *hdr) { - int ret; - - dfprintk(FSCACHE, - "NFS: readpage_from_fscache(fsc:%p/p:%p(i:%lx f:%lx)/0x%p)\n", - nfs_i_fscache(inode), page, page->index, page->flags, inode); - - ret = fscache_read_or_alloc_page(nfs_i_fscache(inode), - page, - nfs_readpage_from_fscache_complete, - ctx, - GFP_KERNEL); - - switch (ret) { - case 0: /* read BIO submitted (page in fscache) */ - dfprintk(FSCACHE, - "NFS: readpage_from_fscache: BIO submitted\n"); - nfs_inc_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_OK); - return ret; - - case -ENOBUFS: /* inode not in cache */ - case -ENODATA: /* page not in cache */ - nfs_inc_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_FAIL); - dfprintk(FSCACHE, - "NFS: readpage_from_fscache %d\n", ret); - return 1; + struct nfs_netfs_io_data *netfs = hdr->netfs; - default: - dfprintk(FSCACHE, "NFS: readpage_from_fscache %d\n", ret); - nfs_inc_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_FAIL); - } - return ret; + if (!netfs) + return; + + nfs_netfs_get(netfs); } -/* - * Retrieve a set of pages from fscache - */ -int __nfs_readpages_from_fscache(struct nfs_open_context *ctx, - struct inode *inode, - struct address_space *mapping, - struct list_head *pages, - unsigned *nr_pages) +int nfs_netfs_folio_unlock(struct folio *folio) { - unsigned npages = *nr_pages; - int ret; - - dfprintk(FSCACHE, "NFS: nfs_getpages_from_fscache (0x%p/%u/0x%p)\n", - nfs_i_fscache(inode), npages, inode); - - ret = fscache_read_or_alloc_pages(nfs_i_fscache(inode), - mapping, pages, nr_pages, - nfs_readpage_from_fscache_complete, - ctx, - mapping_gfp_mask(mapping)); - if (*nr_pages < npages) - nfs_add_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_OK, - npages); - if (*nr_pages > 0) - nfs_add_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_FAIL, - *nr_pages); - - switch (ret) { - case 0: /* read submitted to the cache for all pages */ - BUG_ON(!list_empty(pages)); - BUG_ON(*nr_pages != 0); - dfprintk(FSCACHE, - "NFS: nfs_getpages_from_fscache: submitted\n"); - - return ret; - - case -ENOBUFS: /* some pages aren't cached and can't be */ - case -ENODATA: /* some pages aren't cached */ - dfprintk(FSCACHE, - "NFS: nfs_getpages_from_fscache: no page: %d\n", ret); - return 1; + struct inode *inode = folio->mapping->host; - default: - dfprintk(FSCACHE, - "NFS: nfs_getpages_from_fscache: ret %d\n", ret); - } + /* + * If fscache is enabled, netfs will unlock pages. + */ + if (netfs_inode(inode)->cache) + return 0; - return ret; + return 1; } -/* - * Store a newly fetched page in fscache - * - PG_fscache must be set on the page - */ -void __nfs_readpage_to_fscache(struct inode *inode, struct page *page, int sync) +void nfs_netfs_read_completion(struct nfs_pgio_header *hdr) { - int ret; - - dfprintk(FSCACHE, - "NFS: readpage_to_fscache(fsc:%p/p:%p(i:%lx f:%lx)/%d)\n", - nfs_i_fscache(inode), page, page->index, page->flags, sync); - - ret = fscache_write_page(nfs_i_fscache(inode), page, GFP_KERNEL); - dfprintk(FSCACHE, - "NFS: readpage_to_fscache: p:%p(i:%lu f:%lx) ret %d\n", - page, page->index, page->flags, ret); - - if (ret != 0) { - fscache_uncache_page(nfs_i_fscache(inode), page); - nfs_inc_fscache_stats(inode, - NFSIOS_FSCACHE_PAGES_WRITTEN_FAIL); - nfs_inc_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_UNCACHED); - } else { - nfs_inc_fscache_stats(inode, - NFSIOS_FSCACHE_PAGES_WRITTEN_OK); - } + struct nfs_netfs_io_data *netfs = hdr->netfs; + struct netfs_io_subrequest *sreq; + + if (!netfs) + return; + + sreq = netfs->sreq; + if (test_bit(NFS_IOHDR_EOF, &hdr->flags) && + sreq->rreq->origin != NETFS_UNBUFFERED_READ && + sreq->rreq->origin != NETFS_DIO_READ) + __set_bit(NETFS_SREQ_CLEAR_TAIL, &sreq->flags); + + if (hdr->error) + netfs->error = hdr->error; + else + atomic64_add(hdr->res.count, &netfs->transferred); + + nfs_netfs_put(netfs); + hdr->netfs = NULL; } + +const struct netfs_request_ops nfs_netfs_ops = { + .init_request = nfs_netfs_init_request, + .free_request = nfs_netfs_free_request, + .issue_read = nfs_netfs_issue_read, +}; diff --git a/fs/nfs/fscache.h b/fs/nfs/fscache.h index d7fe3e799f2f..9d86868f4998 100644 --- a/fs/nfs/fscache.h +++ b/fs/nfs/fscache.h @@ -1,229 +1,202 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ /* NFS filesystem cache interface definitions * * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public Licence - * as published by the Free Software Foundation; either version - * 2 of the Licence, or (at your option) any later version. */ #ifndef _NFS_FSCACHE_H #define _NFS_FSCACHE_H +#include <linux/swap.h> #include <linux/nfs_fs.h> #include <linux/nfs_mount.h> #include <linux/nfs4_mount.h> #include <linux/fscache.h> +#include <linux/iversion.h> #ifdef CONFIG_NFS_FSCACHE /* - * set of NFS FS-Cache objects that form a superblock key + * Definition of the auxiliary data attached to NFS inode storage objects + * within the cache. + * + * The contents of this struct are recorded in the on-disk local cache in the + * auxiliary data attached to the data storage object backing an inode. This + * permits coherency to be managed when a new inode binds to an already extant + * cache object. */ -struct nfs_fscache_key { - struct rb_node node; - struct nfs_client *nfs_client; /* the server */ - - /* the elements of the unique key - as used by nfs_compare_super() and - * nfs_compare_mount_options() to distinguish superblocks */ - struct { - struct { - unsigned long s_flags; /* various flags - * (& NFS_MS_MASK) */ - } super; - - struct { - struct nfs_fsid fsid; - int flags; - unsigned int rsize; /* read size */ - unsigned int wsize; /* write size */ - unsigned int acregmin; /* attr cache timeouts */ - unsigned int acregmax; - unsigned int acdirmin; - unsigned int acdirmax; - } nfs_server; - - struct { - rpc_authflavor_t au_flavor; - } rpc_auth; - - /* uniquifier - can be used if nfs_server.flags includes - * NFS_MOUNT_UNSHARED */ - u8 uniq_len; - char uniquifier[0]; - } key; +struct nfs_fscache_inode_auxdata { + s64 mtime_sec; + s64 mtime_nsec; + s64 ctime_sec; + s64 ctime_nsec; + u64 change_attr; }; -/* - * fscache-index.c - */ -extern struct fscache_netfs nfs_fscache_netfs; -extern const struct fscache_cookie_def nfs_fscache_server_index_def; -extern const struct fscache_cookie_def nfs_fscache_super_index_def; -extern const struct fscache_cookie_def nfs_fscache_inode_object_def; +struct nfs_netfs_io_data { + /* + * NFS may split a netfs_io_subrequest into multiple RPCs, each + * with their own read completion. In netfs, we can only call + * netfs_subreq_terminated() once for each subrequest. Use the + * refcount here to double as a marker of the last RPC completion, + * and only call netfs via netfs_subreq_terminated() once. + */ + refcount_t refcount; + struct netfs_io_subrequest *sreq; + + /* + * Final disposition of the netfs_io_subrequest, sent in + * netfs_subreq_terminated() + */ + atomic64_t transferred; + int error; +}; + +static inline void nfs_netfs_get(struct nfs_netfs_io_data *netfs) +{ + refcount_inc(&netfs->refcount); +} -extern int nfs_fscache_register(void); -extern void nfs_fscache_unregister(void); +static inline void nfs_netfs_put(struct nfs_netfs_io_data *netfs) +{ + /* Only the last RPC completion should call netfs_subreq_terminated() */ + if (!refcount_dec_and_test(&netfs->refcount)) + return; + + /* + * The NFS pageio interface may read a complete page, even when netfs + * only asked for a partial page. Specifically, this may be seen when + * one thread is truncating a file while another one is reading the last + * page of the file. + * Correct the final length here to be no larger than the netfs subrequest + * length, and thus avoid netfs's "Subreq overread" warning message. + */ + netfs->sreq->transferred = min_t(s64, netfs->sreq->len, + atomic64_read(&netfs->transferred)); + netfs->sreq->error = netfs->error; + netfs_read_subreq_terminated(netfs->sreq); + kfree(netfs); +} +static inline void nfs_netfs_inode_init(struct nfs_inode *nfsi) +{ + netfs_inode_init(&nfsi->netfs, &nfs_netfs_ops, false); +} +extern void nfs_netfs_initiate_read(struct nfs_pgio_header *hdr); +extern void nfs_netfs_read_completion(struct nfs_pgio_header *hdr); +extern int nfs_netfs_folio_unlock(struct folio *folio); /* * fscache.c */ -extern void nfs_fscache_get_client_cookie(struct nfs_client *); -extern void nfs_fscache_release_client_cookie(struct nfs_client *); - -extern void nfs_fscache_get_super_cookie(struct super_block *, const char *, int); +extern int nfs_fscache_get_super_cookie(struct super_block *, const char *, int); extern void nfs_fscache_release_super_cookie(struct super_block *); extern void nfs_fscache_init_inode(struct inode *); extern void nfs_fscache_clear_inode(struct inode *); extern void nfs_fscache_open_file(struct inode *, struct file *); +extern void nfs_fscache_release_file(struct inode *, struct file *); +extern int nfs_netfs_readahead(struct readahead_control *ractl); +extern int nfs_netfs_read_folio(struct file *file, struct folio *folio); -extern void __nfs_fscache_invalidate_page(struct page *, struct inode *); -extern int nfs_fscache_release_page(struct page *, gfp_t); - -extern int __nfs_readpage_from_fscache(struct nfs_open_context *, - struct inode *, struct page *); -extern int __nfs_readpages_from_fscache(struct nfs_open_context *, - struct inode *, struct address_space *, - struct list_head *, unsigned *); -extern void __nfs_readpage_to_fscache(struct inode *, struct page *, int); - -/* - * wait for a page to complete writing to the cache - */ -static inline void nfs_fscache_wait_on_page_write(struct nfs_inode *nfsi, - struct page *page) +static inline bool nfs_fscache_release_folio(struct folio *folio, gfp_t gfp) { - if (PageFsCache(page)) - fscache_wait_on_page_write(nfsi->fscache, page); + if (folio_test_private_2(folio)) { /* [DEPRECATED] */ + if (current_is_kswapd() || !(gfp & __GFP_FS)) + return false; + folio_wait_private_2(folio); + } + fscache_note_page_release(netfs_i_cookie(netfs_inode(folio->mapping->host))); + return true; } -/* - * release the caching state associated with a page if undergoing complete page - * invalidation - */ -static inline void nfs_fscache_invalidate_page(struct page *page, - struct inode *inode) +static inline void nfs_fscache_update_auxdata(struct nfs_fscache_inode_auxdata *auxdata, + struct inode *inode) { - if (PageFsCache(page)) - __nfs_fscache_invalidate_page(page, inode); + memset(auxdata, 0, sizeof(*auxdata)); + auxdata->mtime_sec = inode_get_mtime(inode).tv_sec; + auxdata->mtime_nsec = inode_get_mtime(inode).tv_nsec; + auxdata->ctime_sec = inode_get_ctime(inode).tv_sec; + auxdata->ctime_nsec = inode_get_ctime(inode).tv_nsec; + + if (NFS_SERVER(inode)->nfs_client->rpc_ops->version == 4) + auxdata->change_attr = inode_peek_iversion_raw(inode); } /* - * Retrieve a page from an inode data storage object. + * Invalidate the contents of fscache for this inode. This will not sleep. */ -static inline int nfs_readpage_from_fscache(struct nfs_open_context *ctx, - struct inode *inode, - struct page *page) +static inline void nfs_fscache_invalidate(struct inode *inode, int flags) { - if (NFS_I(inode)->fscache) - return __nfs_readpage_from_fscache(ctx, inode, page); - return -ENOBUFS; -} + struct nfs_fscache_inode_auxdata auxdata; + struct fscache_cookie *cookie = netfs_i_cookie(&NFS_I(inode)->netfs); -/* - * Retrieve a set of pages from an inode data storage object. - */ -static inline int nfs_readpages_from_fscache(struct nfs_open_context *ctx, - struct inode *inode, - struct address_space *mapping, - struct list_head *pages, - unsigned *nr_pages) -{ - if (NFS_I(inode)->fscache) - return __nfs_readpages_from_fscache(ctx, inode, mapping, pages, - nr_pages); - return -ENOBUFS; + nfs_fscache_update_auxdata(&auxdata, inode); + fscache_invalidate(cookie, &auxdata, i_size_read(inode), flags); } /* - * Store a page newly fetched from the server in an inode data storage object - * in the cache. + * indicate the client caching state as readable text */ -static inline void nfs_readpage_to_fscache(struct inode *inode, - struct page *page, - int sync) +static inline const char *nfs_server_fscache_state(struct nfs_server *server) { - if (PageFsCache(page)) - __nfs_readpage_to_fscache(inode, page, sync); + if (server->fscache) + return "yes"; + return "no "; } -/* - * Invalidate the contents of fscache for this inode. This will not sleep. - */ -static inline void nfs_fscache_invalidate(struct inode *inode) +static inline void nfs_netfs_set_pgio_header(struct nfs_pgio_header *hdr, + struct nfs_pageio_descriptor *desc) { - fscache_invalidate(NFS_I(inode)->fscache); + hdr->netfs = desc->pg_netfs; } - -/* - * Wait for an object to finish being invalidated. - */ -static inline void nfs_fscache_wait_on_invalidate(struct inode *inode) +static inline void nfs_netfs_set_pageio_descriptor(struct nfs_pageio_descriptor *desc, + struct nfs_pgio_header *hdr) { - fscache_wait_on_invalidate(NFS_I(inode)->fscache); + desc->pg_netfs = hdr->netfs; } - -/* - * indicate the client caching state as readable text - */ -static inline const char *nfs_server_fscache_state(struct nfs_server *server) +static inline void nfs_netfs_reset_pageio_descriptor(struct nfs_pageio_descriptor *desc) { - if (server->fscache && (server->options & NFS_OPTION_FSCACHE)) - return "yes"; - return "no "; + desc->pg_netfs = NULL; } - #else /* CONFIG_NFS_FSCACHE */ -static inline int nfs_fscache_register(void) { return 0; } -static inline void nfs_fscache_unregister(void) {} - -static inline void nfs_fscache_get_client_cookie(struct nfs_client *clp) {} -static inline void nfs_fscache_release_client_cookie(struct nfs_client *clp) {} - +static inline void nfs_netfs_inode_init(struct nfs_inode *nfsi) {} +static inline void nfs_netfs_initiate_read(struct nfs_pgio_header *hdr) {} +static inline void nfs_netfs_read_completion(struct nfs_pgio_header *hdr) {} +static inline int nfs_netfs_folio_unlock(struct folio *folio) +{ + return 1; +} static inline void nfs_fscache_release_super_cookie(struct super_block *sb) {} static inline void nfs_fscache_init_inode(struct inode *inode) {} static inline void nfs_fscache_clear_inode(struct inode *inode) {} static inline void nfs_fscache_open_file(struct inode *inode, struct file *filp) {} - -static inline int nfs_fscache_release_page(struct page *page, gfp_t gfp) -{ - return 1; /* True: may release page */ -} -static inline void nfs_fscache_invalidate_page(struct page *page, - struct inode *inode) {} -static inline void nfs_fscache_wait_on_page_write(struct nfs_inode *nfsi, - struct page *page) {} - -static inline int nfs_readpage_from_fscache(struct nfs_open_context *ctx, - struct inode *inode, - struct page *page) +static inline void nfs_fscache_release_file(struct inode *inode, struct file *file) {} +static inline int nfs_netfs_readahead(struct readahead_control *ractl) { return -ENOBUFS; } -static inline int nfs_readpages_from_fscache(struct nfs_open_context *ctx, - struct inode *inode, - struct address_space *mapping, - struct list_head *pages, - unsigned *nr_pages) +static inline int nfs_netfs_read_folio(struct file *file, struct folio *folio) { return -ENOBUFS; } -static inline void nfs_readpage_to_fscache(struct inode *inode, - struct page *page, int sync) {} - -static inline void nfs_fscache_invalidate(struct inode *inode) {} -static inline void nfs_fscache_wait_on_invalidate(struct inode *inode) {} +static inline bool nfs_fscache_release_folio(struct folio *folio, gfp_t gfp) +{ + return true; /* may release folio */ +} +static inline void nfs_fscache_invalidate(struct inode *inode, int flags) {} static inline const char *nfs_server_fscache_state(struct nfs_server *server) { return "no "; } - +static inline void nfs_netfs_set_pgio_header(struct nfs_pgio_header *hdr, + struct nfs_pageio_descriptor *desc) {} +static inline void nfs_netfs_set_pageio_descriptor(struct nfs_pageio_descriptor *desc, + struct nfs_pgio_header *hdr) {} +static inline void nfs_netfs_reset_pageio_descriptor(struct nfs_pageio_descriptor *desc) {} #endif /* CONFIG_NFS_FSCACHE */ #endif /* _NFS_FSCACHE_H */ diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c index 391dafaf9182..f13d25d95b85 100644 --- a/fs/nfs/getroot.c +++ b/fs/nfs/getroot.c @@ -1,12 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* getroot.c: get the root dentry for an NFS mount * * Copyright (C) 2006 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. */ #include <linux/module.h> @@ -66,68 +62,103 @@ static int nfs_superblock_set_dummy_root(struct super_block *sb, struct inode *i } /* - * get an NFS2/NFS3 root dentry from the root filehandle + * get a root dentry from the root filehandle */ -struct dentry *nfs_get_root(struct super_block *sb, struct nfs_fh *mntfh, - const char *devname) +int nfs_get_root(struct super_block *s, struct fs_context *fc) { - struct nfs_server *server = NFS_SB(sb); + struct nfs_fs_context *ctx = nfs_fc2context(fc); + struct nfs_server *server = NFS_SB(s), *clone_server; struct nfs_fsinfo fsinfo; - struct dentry *ret; + struct dentry *root; struct inode *inode; - void *name = kstrdup(devname, GFP_KERNEL); - int error; + char *name; + int error = -ENOMEM; + unsigned long kflags = 0, kflags_out = 0; + name = kstrdup(fc->source, GFP_KERNEL); if (!name) - return ERR_PTR(-ENOMEM); + goto out; /* get the actual root for this mount */ - fsinfo.fattr = nfs_alloc_fattr(); - if (fsinfo.fattr == NULL) { - kfree(name); - return ERR_PTR(-ENOMEM); - } + fsinfo.fattr = nfs_alloc_fattr_with_label(server); + if (fsinfo.fattr == NULL) + goto out_name; - error = server->nfs_client->rpc_ops->getroot(server, mntfh, &fsinfo); + error = server->nfs_client->rpc_ops->getroot(server, ctx->mntfh, &fsinfo); if (error < 0) { dprintk("nfs_get_root: getattr error = %d\n", -error); - ret = ERR_PTR(error); - goto out; + nfs_errorf(fc, "NFS: Couldn't getattr on root"); + goto out_fattr; } - inode = nfs_fhget(sb, mntfh, fsinfo.fattr, NULL); + inode = nfs_fhget(s, ctx->mntfh, fsinfo.fattr); if (IS_ERR(inode)) { dprintk("nfs_get_root: get root inode failed\n"); - ret = ERR_CAST(inode); - goto out; + error = PTR_ERR(inode); + nfs_errorf(fc, "NFS: Couldn't get root inode"); + goto out_fattr; } - error = nfs_superblock_set_dummy_root(sb, inode); - if (error != 0) { - ret = ERR_PTR(error); - goto out; - } + error = nfs_superblock_set_dummy_root(s, inode); + if (error != 0) + goto out_fattr; /* root dentries normally start off anonymous and get spliced in later * if the dentry tree reaches them; however if the dentry already * exists, we'll pick it up at this point and use it as the root */ - ret = d_obtain_root(inode); - if (IS_ERR(ret)) { + root = d_obtain_root(inode); + if (IS_ERR(root)) { dprintk("nfs_get_root: get root dentry failed\n"); - goto out; + error = PTR_ERR(root); + nfs_errorf(fc, "NFS: Couldn't get root dentry"); + goto out_fattr; } - security_d_instantiate(ret, inode); - spin_lock(&ret->d_lock); - if (IS_ROOT(ret) && !ret->d_fsdata && - !(ret->d_flags & DCACHE_NFSFS_RENAMED)) { - ret->d_fsdata = name; + security_d_instantiate(root, inode); + spin_lock(&root->d_lock); + if (IS_ROOT(root) && !root->d_fsdata && + !(root->d_flags & DCACHE_NFSFS_RENAMED)) { + root->d_fsdata = name; name = NULL; } - spin_unlock(&ret->d_lock); -out: - kfree(name); + spin_unlock(&root->d_lock); + fc->root = root; + if (server->caps & NFS_CAP_SECURITY_LABEL) + kflags |= SECURITY_LSM_NATIVE_LABELS; + if (ctx->clone_data.sb) { + if (d_inode(fc->root)->i_fop != &nfs_dir_operations) { + error = -ESTALE; + goto error_splat_root; + } + /* clone lsm security options from the parent to the new sb */ + error = security_sb_clone_mnt_opts(ctx->clone_data.sb, + s, kflags, &kflags_out); + if (error) + goto error_splat_root; + clone_server = NFS_SB(ctx->clone_data.sb); + server->has_sec_mnt_opts = clone_server->has_sec_mnt_opts; + } else { + error = security_sb_set_mnt_opts(s, fc->security, + kflags, &kflags_out); + } + if (error) + goto error_splat_root; + if (server->caps & NFS_CAP_SECURITY_LABEL && + !(kflags_out & SECURITY_LSM_NATIVE_LABELS)) + server->caps &= ~NFS_CAP_SECURITY_LABEL; + + nfs_setsecurity(inode, fsinfo.fattr); + error = 0; + +out_fattr: nfs_free_fattr(fsinfo.fattr); - return ret; +out_name: + kfree(name); +out: + return error; +error_splat_root: + dput(fc->root); + fc->root = NULL; + goto out_fattr; } diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 109279d6d91b..f76fe406937a 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * linux/fs/nfs/inode.c * @@ -38,8 +39,8 @@ #include <linux/slab.h> #include <linux/compat.h> #include <linux/freezer.h> - #include <linux/uaccess.h> +#include <linux/iversion.h> #include "nfs4_fs.h" #include "callback.h" @@ -50,6 +51,7 @@ #include "pnfs.h" #include "nfs.h" #include "netns.h" +#include "sysfs.h" #include "nfstrace.h" @@ -60,7 +62,6 @@ /* Default is to see 64-bit inode numbers */ static bool enable_ino64 = NFS_64_BIT_INODE_NUMBERS_ENABLED; -static void nfs_invalidate_inode(struct inode *); static int nfs_update_inode(struct inode *, struct nfs_fattr *); static struct kmem_cache * nfs_inode_cachep; @@ -71,25 +72,17 @@ nfs_fattr_to_ino_t(struct nfs_fattr *fattr) return nfs_fileid_to_ino_t(fattr->fileid); } -static int nfs_wait_killable(int mode) +int nfs_wait_bit_killable(struct wait_bit_key *key, int mode) { - freezable_schedule_unsafe(); + if (unlikely(nfs_current_task_exiting())) + return -EINTR; + schedule(); if (signal_pending_state(mode, current)) return -ERESTARTSYS; return 0; } - -int nfs_wait_bit_killable(struct wait_bit_key *key, int mode) -{ - return nfs_wait_killable(mode); -} EXPORT_SYMBOL_GPL(nfs_wait_bit_killable); -int nfs_wait_atomic_killable(atomic_t *p) -{ - return nfs_wait_killable(TASK_KILLABLE); -} - /** * nfs_compat_user_ino64 - returns the user-visible inode number * @fileid: 64-bit fileid @@ -115,7 +108,7 @@ u64 nfs_compat_user_ino64(u64 fileid) int nfs_drop_inode(struct inode *inode) { - return NFS_STALE(inode) || generic_drop_inode(inode); + return NFS_STALE(inode) || inode_generic_drop(inode); } EXPORT_SYMBOL_GPL(nfs_drop_inode); @@ -148,6 +141,7 @@ EXPORT_SYMBOL_GPL(nfs_sync_inode); /** * nfs_sync_mapping - helper to flush all mmapped dirty data to disk + * @mapping: pointer to struct address_space */ int nfs_sync_mapping(struct address_space *mapping) { @@ -167,46 +161,67 @@ static int nfs_attribute_timeout(struct inode *inode) return !time_in_range_open(jiffies, nfsi->read_cache_jiffies, nfsi->read_cache_jiffies + nfsi->attrtimeo); } -static bool nfs_check_cache_invalid_delegated(struct inode *inode, unsigned long flags) +static bool nfs_check_cache_flags_invalid(struct inode *inode, + unsigned long flags) { unsigned long cache_validity = READ_ONCE(NFS_I(inode)->cache_validity); - /* Special case for the pagecache or access cache */ - if (flags == NFS_INO_REVAL_PAGECACHE && - !(cache_validity & NFS_INO_REVAL_FORCED)) - return false; return (cache_validity & flags) != 0; } -static bool nfs_check_cache_invalid_not_delegated(struct inode *inode, unsigned long flags) +bool nfs_check_cache_invalid(struct inode *inode, unsigned long flags) { - unsigned long cache_validity = READ_ONCE(NFS_I(inode)->cache_validity); - - if ((cache_validity & flags) != 0) - return true; - if (nfs_attribute_timeout(inode)) + if (nfs_check_cache_flags_invalid(inode, flags)) return true; - return false; + return nfs_attribute_cache_expired(inode); } +EXPORT_SYMBOL_GPL(nfs_check_cache_invalid); -bool nfs_check_cache_invalid(struct inode *inode, unsigned long flags) +#ifdef CONFIG_NFS_V4_2 +static bool nfs_has_xattr_cache(const struct nfs_inode *nfsi) { - if (NFS_PROTO(inode)->have_delegation(inode, FMODE_READ)) - return nfs_check_cache_invalid_delegated(inode, flags); - - return nfs_check_cache_invalid_not_delegated(inode, flags); + return nfsi->xattr_cache != NULL; +} +#else +static bool nfs_has_xattr_cache(const struct nfs_inode *nfsi) +{ + return false; } +#endif -static void nfs_set_cache_invalid(struct inode *inode, unsigned long flags) +void nfs_set_cache_invalid(struct inode *inode, unsigned long flags) { struct nfs_inode *nfsi = NFS_I(inode); + if (nfs_have_delegated_attributes(inode)) { + if (!(flags & NFS_INO_REVAL_FORCED)) + flags &= ~(NFS_INO_INVALID_MODE | + NFS_INO_INVALID_OTHER | + NFS_INO_INVALID_BTIME | + NFS_INO_INVALID_XATTR); + flags &= ~(NFS_INO_INVALID_CHANGE | NFS_INO_INVALID_SIZE); + } + + if (!nfs_has_xattr_cache(nfsi)) + flags &= ~NFS_INO_INVALID_XATTR; + if (flags & NFS_INO_INVALID_DATA) + nfs_fscache_invalidate(inode, 0); + flags &= ~NFS_INO_REVAL_FORCED; + + flags |= nfsi->cache_validity; if (inode->i_mapping->nrpages == 0) flags &= ~NFS_INO_INVALID_DATA; - nfsi->cache_validity |= flags; - if (flags & NFS_INO_INVALID_DATA) - nfs_fscache_invalidate(inode); + + /* pairs with nfs_clear_invalid_mapping()'s smp_load_acquire() */ + smp_store_release(&nfsi->cache_validity, flags); + + if (inode->i_mapping->nrpages == 0 || + nfsi->cache_validity & NFS_INO_INVALID_DATA) { + nfs_ooo_clear(nfsi); + } + trace_nfs_set_cache_invalid(inode, 0); } +EXPORT_SYMBOL_GPL(nfs_set_cache_invalid); /* * Invalidate the local caches @@ -221,18 +236,17 @@ static void nfs_zap_caches_locked(struct inode *inode) nfsi->attrtimeo = NFS_MINATTRTIMEO(inode); nfsi->attrtimeo_timestamp = jiffies; - memset(NFS_I(inode)->cookieverf, 0, sizeof(NFS_I(inode)->cookieverf)); - if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) { - nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR - | NFS_INO_INVALID_DATA - | NFS_INO_INVALID_ACCESS - | NFS_INO_INVALID_ACL - | NFS_INO_REVAL_PAGECACHE); - } else - nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR - | NFS_INO_INVALID_ACCESS - | NFS_INO_INVALID_ACL - | NFS_INO_REVAL_PAGECACHE); + if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) + nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR | + NFS_INO_INVALID_DATA | + NFS_INO_INVALID_ACCESS | + NFS_INO_INVALID_ACL | + NFS_INO_INVALID_XATTR); + else + nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR | + NFS_INO_INVALID_ACCESS | + NFS_INO_INVALID_ACL | + NFS_INO_INVALID_XATTR); nfs_zap_label_cache_locked(nfsi); } @@ -267,6 +281,8 @@ EXPORT_SYMBOL_GPL(nfs_zap_acl_cache); void nfs_invalidate_atime(struct inode *inode) { + if (nfs_have_delegated_atime(inode)) + return; spin_lock(&inode->i_lock); nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATIME); spin_unlock(&inode->i_lock); @@ -277,10 +293,18 @@ EXPORT_SYMBOL_GPL(nfs_invalidate_atime); * Invalidate, but do not unhash, the inode. * NB: must be called with inode->i_lock held! */ -static void nfs_invalidate_inode(struct inode *inode) +static void nfs_set_inode_stale_locked(struct inode *inode) { set_bit(NFS_INO_STALE, &NFS_I(inode)->flags); nfs_zap_caches_locked(inode); + trace_nfs_set_inode_stale(inode); +} + +void nfs_set_inode_stale(struct inode *inode) +{ + spin_lock(&inode->i_lock); + nfs_set_inode_stale_locked(inode); + spin_unlock(&inode->i_lock); } struct nfs_find_desc { @@ -297,13 +321,13 @@ struct nfs_find_desc { static int nfs_find_actor(struct inode *inode, void *opaque) { - struct nfs_find_desc *desc = (struct nfs_find_desc *)opaque; + struct nfs_find_desc *desc = opaque; struct nfs_fh *fh = desc->fh; struct nfs_fattr *fattr = desc->fattr; if (NFS_FILEID(inode) != fattr->fileid) return 0; - if ((S_IFMT & inode->i_mode) != (S_IFMT & fattr->mode)) + if (inode_wrong_type(inode, fattr->mode)) return 0; if (nfs_compare_fh(NFS_FH(inode), fh)) return 0; @@ -315,7 +339,7 @@ nfs_find_actor(struct inode *inode, void *opaque) static int nfs_init_locked(struct inode *inode, void *opaque) { - struct nfs_find_desc *desc = (struct nfs_find_desc *)opaque; + struct nfs_find_desc *desc = opaque; struct nfs_fattr *fattr = desc->fattr; set_nfs_fileid(inode, fattr->fileid); @@ -332,37 +356,32 @@ static void nfs_clear_label_invalid(struct inode *inode) spin_unlock(&inode->i_lock); } -void nfs_setsecurity(struct inode *inode, struct nfs_fattr *fattr, - struct nfs4_label *label) +void nfs_setsecurity(struct inode *inode, struct nfs_fattr *fattr) { int error; - if (label == NULL) + if (fattr->label == NULL) return; if ((fattr->valid & NFS_ATTR_FATTR_V4_SECURITY_LABEL) && inode->i_security) { - error = security_inode_notifysecctx(inode, label->label, - label->len); + error = security_inode_notifysecctx(inode, fattr->label->label, + fattr->label->len); if (error) printk(KERN_ERR "%s() %s %d " "security_inode_notifysecctx() %d\n", __func__, - (char *)label->label, - label->len, error); + (char *)fattr->label->label, + fattr->label->len, error); nfs_clear_label_invalid(inode); } } struct nfs4_label *nfs4_label_alloc(struct nfs_server *server, gfp_t flags) { - struct nfs4_label *label = NULL; - int minor_version = server->nfs_client->cl_minorversion; - - if (minor_version < 2) - return label; + struct nfs4_label *label; if (!(server->caps & NFS_CAP_SECURITY_LABEL)) - return label; + return NULL; label = kzalloc(sizeof(struct nfs4_label), flags); if (label == NULL) @@ -379,8 +398,7 @@ struct nfs4_label *nfs4_label_alloc(struct nfs_server *server, gfp_t flags) } EXPORT_SYMBOL_GPL(nfs4_label_alloc); #else -void nfs_setsecurity(struct inode *inode, struct nfs_fattr *fattr, - struct nfs4_label *label) +void nfs_setsecurity(struct inode *inode, struct nfs_fattr *fattr) { } #endif @@ -408,18 +426,36 @@ nfs_ilookup(struct super_block *sb, struct nfs_fattr *fattr, struct nfs_fh *fh) return inode; } +static void nfs_inode_init_regular(struct nfs_inode *nfsi) +{ + atomic_long_set(&nfsi->nrequests, 0); + atomic_long_set(&nfsi->redirtied_pages, 0); + INIT_LIST_HEAD(&nfsi->commit_info.list); + atomic_long_set(&nfsi->commit_info.ncommit, 0); + atomic_set(&nfsi->commit_info.rpcs_out, 0); + mutex_init(&nfsi->commit_mutex); +} + +static void nfs_inode_init_dir(struct nfs_inode *nfsi) +{ + nfsi->cache_change_attribute = 0; + memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf)); + init_rwsem(&nfsi->rmdir_sem); +} + /* * This is our front-end to iget that looks up inodes by file handle * instead of inode number. */ struct inode * -nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, struct nfs4_label *label) +nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) { struct nfs_find_desc desc = { .fh = fh, .fattr = fattr }; struct inode *inode = ERR_PTR(-ENOENT); + u64 fattr_supported = NFS_SB(sb)->fattr_valid; unsigned long hash; nfs_attr_check_mountpoint(sb, fattr); @@ -439,7 +475,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, st goto out_no_inode; } - if (inode->i_state & I_NEW) { + if (inode_state_read_once(inode) & I_NEW) { struct nfs_inode *nfsi = NFS_I(inode); unsigned long now = jiffies; @@ -450,9 +486,10 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, st /* We can't support update_atime(), since the server will reset it */ inode->i_flags |= S_NOATIME|S_NOCMTIME; inode->i_mode = fattr->mode; + nfsi->cache_validity = 0; if ((fattr->valid & NFS_ATTR_FATTR_MODE) == 0 - && nfs_server_capable(inode, NFS_CAP_MODE)) - nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR); + && (fattr_supported & NFS_ATTR_FATTR_MODE)) + nfs_set_cache_invalid(inode, NFS_INO_INVALID_MODE); /* Why so? Because we want revalidate for devices/FIFOs, and * that's precisely what we have in nfs_file_inode_operations. */ @@ -460,10 +497,13 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, st if (S_ISREG(inode->i_mode)) { inode->i_fop = NFS_SB(sb)->nfs_client->rpc_ops->file_ops; inode->i_data.a_ops = &nfs_file_aops; + nfs_inode_init_regular(nfsi); + mapping_set_large_folios(inode->i_mapping); } else if (S_ISDIR(inode->i_mode)) { inode->i_op = NFS_SB(sb)->nfs_client->rpc_ops->dir_inode_ops; inode->i_fop = &nfs_dir_operations; inode->i_data.a_ops = &nfs_dir_aops; + nfs_inode_init_dir(nfsi); /* Deal with crossing mountpoints */ if (fattr->valid & NFS_ATTR_FATTR_MOUNTPOINT || fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) { @@ -480,65 +520,74 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, st } else init_special_inode(inode, inode->i_mode, fattr->rdev); - memset(&inode->i_atime, 0, sizeof(inode->i_atime)); - memset(&inode->i_mtime, 0, sizeof(inode->i_mtime)); - memset(&inode->i_ctime, 0, sizeof(inode->i_ctime)); - inode->i_version = 0; + inode_set_atime(inode, 0, 0); + inode_set_mtime(inode, 0, 0); + inode_set_ctime(inode, 0, 0); + memset(&nfsi->btime, 0, sizeof(nfsi->btime)); + inode_set_iversion_raw(inode, 0); inode->i_size = 0; clear_nlink(inode); inode->i_uid = make_kuid(&init_user_ns, -2); inode->i_gid = make_kgid(&init_user_ns, -2); inode->i_blocks = 0; - memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf)); nfsi->write_io = 0; nfsi->read_io = 0; nfsi->read_cache_jiffies = fattr->time_start; nfsi->attr_gencount = fattr->gencount; if (fattr->valid & NFS_ATTR_FATTR_ATIME) - inode->i_atime = fattr->atime; - else if (nfs_server_capable(inode, NFS_CAP_ATIME)) - nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR); + inode_set_atime_to_ts(inode, fattr->atime); + else if (fattr_supported & NFS_ATTR_FATTR_ATIME) + nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATIME); if (fattr->valid & NFS_ATTR_FATTR_MTIME) - inode->i_mtime = fattr->mtime; - else if (nfs_server_capable(inode, NFS_CAP_MTIME)) - nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR); + inode_set_mtime_to_ts(inode, fattr->mtime); + else if (fattr_supported & NFS_ATTR_FATTR_MTIME) + nfs_set_cache_invalid(inode, NFS_INO_INVALID_MTIME); if (fattr->valid & NFS_ATTR_FATTR_CTIME) - inode->i_ctime = fattr->ctime; - else if (nfs_server_capable(inode, NFS_CAP_CTIME)) - nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR); + inode_set_ctime_to_ts(inode, fattr->ctime); + else if (fattr_supported & NFS_ATTR_FATTR_CTIME) + nfs_set_cache_invalid(inode, NFS_INO_INVALID_CTIME); + if (fattr->valid & NFS_ATTR_FATTR_BTIME) + nfsi->btime = fattr->btime; + else if (fattr_supported & NFS_ATTR_FATTR_BTIME) + nfs_set_cache_invalid(inode, NFS_INO_INVALID_BTIME); if (fattr->valid & NFS_ATTR_FATTR_CHANGE) - inode->i_version = fattr->change_attr; + inode_set_iversion_raw(inode, fattr->change_attr); else - nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR - | NFS_INO_REVAL_PAGECACHE); + nfs_set_cache_invalid(inode, NFS_INO_INVALID_CHANGE); if (fattr->valid & NFS_ATTR_FATTR_SIZE) inode->i_size = nfs_size_to_loff_t(fattr->size); else - nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR - | NFS_INO_REVAL_PAGECACHE); + nfs_set_cache_invalid(inode, NFS_INO_INVALID_SIZE); if (fattr->valid & NFS_ATTR_FATTR_NLINK) set_nlink(inode, fattr->nlink); - else if (nfs_server_capable(inode, NFS_CAP_NLINK)) - nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR); + else if (fattr_supported & NFS_ATTR_FATTR_NLINK) + nfs_set_cache_invalid(inode, NFS_INO_INVALID_NLINK); + else + set_nlink(inode, 1); if (fattr->valid & NFS_ATTR_FATTR_OWNER) inode->i_uid = fattr->uid; - else if (nfs_server_capable(inode, NFS_CAP_OWNER)) - nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR); + else if (fattr_supported & NFS_ATTR_FATTR_OWNER) + nfs_set_cache_invalid(inode, NFS_INO_INVALID_OTHER); if (fattr->valid & NFS_ATTR_FATTR_GROUP) inode->i_gid = fattr->gid; - else if (nfs_server_capable(inode, NFS_CAP_OWNER_GROUP)) - nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR); + else if (fattr_supported & NFS_ATTR_FATTR_GROUP) + nfs_set_cache_invalid(inode, NFS_INO_INVALID_OTHER); if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED) inode->i_blocks = fattr->du.nfs2.blocks; + else if (fattr_supported & NFS_ATTR_FATTR_BLOCKS_USED && + fattr->size != 0) + nfs_set_cache_invalid(inode, NFS_INO_INVALID_BLOCKS); if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED) { /* * report the blocks in 512byte units */ inode->i_blocks = nfs_calc_block_size(fattr->du.nfs3.used); - } + } else if (fattr_supported & NFS_ATTR_FATTR_SPACE_USED && + fattr->size != 0) + nfs_set_cache_invalid(inode, NFS_INO_INVALID_BLOCKS); - nfs_setsecurity(inode, fattr, label); + nfs_setsecurity(inode, fattr); nfsi->attrtimeo = NFS_MINATTRTIMEO(inode); nfsi->attrtimeo_timestamp = now; @@ -559,7 +608,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, st inode->i_sb->s_id, (unsigned long long)NFS_FILEID(inode), nfs_display_fhandle_hash(fh), - atomic_read(&inode->i_count)); + icount_read(inode)); out: return inode; @@ -570,14 +619,107 @@ out_no_inode: } EXPORT_SYMBOL_GPL(nfs_fhget); +static void +nfs_fattr_fixup_delegated(struct inode *inode, struct nfs_fattr *fattr) +{ + unsigned long cache_validity = NFS_I(inode)->cache_validity; + + if (nfs_have_delegated_mtime(inode)) { + if (!(cache_validity & NFS_INO_INVALID_CTIME)) + fattr->valid &= ~(NFS_ATTR_FATTR_PRECTIME | + NFS_ATTR_FATTR_CTIME); + + if (!(cache_validity & NFS_INO_INVALID_MTIME)) + fattr->valid &= ~(NFS_ATTR_FATTR_PREMTIME | + NFS_ATTR_FATTR_MTIME); + + if (!(cache_validity & NFS_INO_INVALID_ATIME)) + fattr->valid &= ~NFS_ATTR_FATTR_ATIME; + } else if (nfs_have_delegated_atime(inode)) { + if (!(cache_validity & NFS_INO_INVALID_ATIME)) + fattr->valid &= ~NFS_ATTR_FATTR_ATIME; + } +} + +static void nfs_set_timestamps_to_ts(struct inode *inode, struct iattr *attr) +{ + unsigned int cache_flags = 0; + + if (attr->ia_valid & ATTR_MTIME_SET) { + struct timespec64 ctime = inode_get_ctime(inode); + struct timespec64 mtime = inode_get_mtime(inode); + struct timespec64 now; + int updated = 0; + + now = inode_set_ctime_current(inode); + if (!timespec64_equal(&now, &ctime)) + updated |= S_CTIME; + + inode_set_mtime_to_ts(inode, attr->ia_mtime); + if (!timespec64_equal(&now, &mtime)) + updated |= S_MTIME; + + inode_maybe_inc_iversion(inode, updated); + cache_flags |= NFS_INO_INVALID_CTIME | NFS_INO_INVALID_MTIME; + } + if (attr->ia_valid & ATTR_ATIME_SET) { + inode_set_atime_to_ts(inode, attr->ia_atime); + cache_flags |= NFS_INO_INVALID_ATIME; + } + NFS_I(inode)->cache_validity &= ~cache_flags; +} + +static void nfs_update_timestamps(struct inode *inode, unsigned int ia_valid) +{ + enum file_time_flags time_flags = 0; + unsigned int cache_flags = 0; + + if (ia_valid & ATTR_MTIME) { + time_flags |= S_MTIME | S_CTIME; + cache_flags |= NFS_INO_INVALID_CTIME | NFS_INO_INVALID_MTIME; + } + if (ia_valid & ATTR_ATIME) { + time_flags |= S_ATIME; + cache_flags |= NFS_INO_INVALID_ATIME; + } + inode_update_timestamps(inode, time_flags); + NFS_I(inode)->cache_validity &= ~cache_flags; +} + +void nfs_update_delegated_atime(struct inode *inode) +{ + spin_lock(&inode->i_lock); + if (nfs_have_delegated_atime(inode)) + nfs_update_timestamps(inode, ATTR_ATIME); + spin_unlock(&inode->i_lock); +} + +void nfs_update_delegated_mtime_locked(struct inode *inode) +{ + if (nfs_have_delegated_mtime(inode)) + nfs_update_timestamps(inode, ATTR_MTIME); +} + +void nfs_update_delegated_mtime(struct inode *inode) +{ + spin_lock(&inode->i_lock); + nfs_update_delegated_mtime_locked(inode); + spin_unlock(&inode->i_lock); +} +EXPORT_SYMBOL_GPL(nfs_update_delegated_mtime); + #define NFS_VALID_ATTRS (ATTR_MODE|ATTR_UID|ATTR_GID|ATTR_SIZE|ATTR_ATIME|ATTR_ATIME_SET|ATTR_MTIME|ATTR_MTIME_SET|ATTR_FILE|ATTR_OPEN) int -nfs_setattr(struct dentry *dentry, struct iattr *attr) +nfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, + struct iattr *attr) { struct inode *inode = d_inode(dentry); struct nfs_fattr *fattr; + loff_t oldsize = i_size_read(inode); int error = 0; + kuid_t task_uid = current_fsuid(); + kuid_t owner_uid = inode->i_uid; nfs_inc_stats(inode, NFSIOS_VFSSETATTR); @@ -592,35 +734,64 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr) if (error) return error; - if (attr->ia_size == i_size_read(inode)) + if (attr->ia_size == oldsize) attr->ia_valid &= ~ATTR_SIZE; } + if (nfs_have_delegated_mtime(inode) && attr->ia_valid & ATTR_MTIME) { + spin_lock(&inode->i_lock); + if (attr->ia_valid & ATTR_MTIME_SET) { + if (uid_eq(task_uid, owner_uid)) { + nfs_set_timestamps_to_ts(inode, attr); + attr->ia_valid &= ~(ATTR_MTIME|ATTR_MTIME_SET| + ATTR_ATIME|ATTR_ATIME_SET); + } + } else { + nfs_update_timestamps(inode, attr->ia_valid); + attr->ia_valid &= ~(ATTR_MTIME|ATTR_ATIME); + } + spin_unlock(&inode->i_lock); + } else if (nfs_have_delegated_atime(inode) && + attr->ia_valid & ATTR_ATIME && + !(attr->ia_valid & ATTR_MTIME)) { + if (attr->ia_valid & ATTR_ATIME_SET) { + if (uid_eq(task_uid, owner_uid)) { + spin_lock(&inode->i_lock); + nfs_set_timestamps_to_ts(inode, attr); + spin_unlock(&inode->i_lock); + attr->ia_valid &= ~(ATTR_ATIME|ATTR_ATIME_SET); + } + } else { + nfs_update_delegated_atime(inode); + attr->ia_valid &= ~ATTR_ATIME; + } + } + /* Optimization: if the end result is no change, don't RPC */ - attr->ia_valid &= NFS_VALID_ATTRS; - if ((attr->ia_valid & ~(ATTR_FILE|ATTR_OPEN)) == 0) + if (((attr->ia_valid & NFS_VALID_ATTRS) & ~(ATTR_FILE|ATTR_OPEN)) == 0) return 0; trace_nfs_setattr_enter(inode); /* Write all dirty data */ - if (S_ISREG(inode->i_mode)) + if (S_ISREG(inode->i_mode)) { + nfs_file_block_o_direct(NFS_I(inode)); nfs_sync_inode(inode); + } - fattr = nfs_alloc_fattr(); + fattr = nfs_alloc_fattr_with_label(NFS_SERVER(inode)); if (fattr == NULL) { error = -ENOMEM; goto out; } - /* - * Return any delegations if we're going to change ACLs - */ - if ((attr->ia_valid & (ATTR_MODE|ATTR_UID|ATTR_GID)) != 0) - NFS_PROTO(inode)->return_delegation(inode); error = NFS_PROTO(inode)->setattr(dentry, fattr, attr); - if (error == 0) + if (error == 0) { + if (attr->ia_valid & ATTR_SIZE) + nfs_truncate_last_folio(inode->i_mapping, oldsize, + attr->ia_size); error = nfs_refresh_inode(inode, fattr); + } nfs_free_fattr(fattr); out: trace_nfs_setattr_exit(inode, error); @@ -646,13 +817,18 @@ static int nfs_vmtruncate(struct inode * inode, loff_t offset) if (err) goto out; + trace_nfs_size_truncate(inode, offset); i_size_write(inode, offset); /* Optimisation */ - if (offset == 0) + if (offset == 0) { NFS_I(inode)->cache_validity &= ~NFS_INO_INVALID_DATA; + nfs_ooo_clear(NFS_I(inode)); + } + NFS_I(inode)->cache_validity &= ~NFS_INO_INVALID_SIZE; spin_unlock(&inode->i_lock); truncate_pagecache(inode, offset); + nfs_update_delegated_mtime_locked(inode); spin_lock(&inode->i_lock); out: return err; @@ -662,6 +838,7 @@ out: * nfs_setattr_update_inode - Update inode metadata after a setattr call. * @inode: pointer to struct inode * @attr: pointer to struct iattr + * @fattr: pointer to struct nfs_fattr * * Note: we do this in the *proc.c in order to ensure that * it works for things like exclusive creates too. @@ -674,7 +851,20 @@ void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr, spin_lock(&inode->i_lock); NFS_I(inode)->attr_gencount = fattr->gencount; + if ((attr->ia_valid & ATTR_SIZE) != 0) { + if (!nfs_have_delegated_mtime(inode)) + nfs_set_cache_invalid(inode, NFS_INO_INVALID_MTIME); + nfs_set_cache_invalid(inode, NFS_INO_INVALID_BLOCKS); + nfs_inc_stats(inode, NFSIOS_SETATTRTRUNC); + nfs_vmtruncate(inode, attr->ia_size); + } if ((attr->ia_valid & (ATTR_MODE|ATTR_UID|ATTR_GID)) != 0) { + NFS_I(inode)->cache_validity &= ~NFS_INO_INVALID_CTIME; + if ((attr->ia_valid & ATTR_KILL_SUID) != 0 && + inode->i_mode & S_ISUID) + inode->i_mode &= ~S_ISUID; + if (setattr_should_drop_sgid(&nop_mnt_idmap, inode)) + inode->i_mode &= ~S_ISGID; if ((attr->ia_valid & ATTR_MODE) != 0) { int mode = attr->ia_mode & S_IALLUGO; mode |= inode->i_mode & ~S_IALLUGO; @@ -684,66 +874,145 @@ void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr, inode->i_uid = attr->ia_uid; if ((attr->ia_valid & ATTR_GID) != 0) inode->i_gid = attr->ia_gid; + if (fattr->valid & NFS_ATTR_FATTR_CTIME) + inode_set_ctime_to_ts(inode, fattr->ctime); + else + nfs_set_cache_invalid(inode, NFS_INO_INVALID_CHANGE + | NFS_INO_INVALID_CTIME); nfs_set_cache_invalid(inode, NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL); } - if ((attr->ia_valid & ATTR_SIZE) != 0) { - nfs_inc_stats(inode, NFSIOS_SETATTRTRUNC); - nfs_vmtruncate(inode, attr->ia_size); + if (attr->ia_valid & (ATTR_ATIME_SET|ATTR_ATIME)) { + NFS_I(inode)->cache_validity &= ~(NFS_INO_INVALID_ATIME + | NFS_INO_INVALID_CTIME); + if (fattr->valid & NFS_ATTR_FATTR_ATIME) + inode_set_atime_to_ts(inode, fattr->atime); + else if (attr->ia_valid & ATTR_ATIME_SET) + inode_set_atime_to_ts(inode, attr->ia_atime); + else + nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATIME); + + if (fattr->valid & NFS_ATTR_FATTR_CTIME) + inode_set_ctime_to_ts(inode, fattr->ctime); + else + nfs_set_cache_invalid(inode, NFS_INO_INVALID_CHANGE + | NFS_INO_INVALID_CTIME); + } + if (attr->ia_valid & (ATTR_MTIME_SET|ATTR_MTIME)) { + NFS_I(inode)->cache_validity &= ~(NFS_INO_INVALID_MTIME + | NFS_INO_INVALID_CTIME); + if (fattr->valid & NFS_ATTR_FATTR_MTIME) + inode_set_mtime_to_ts(inode, fattr->mtime); + else if (attr->ia_valid & ATTR_MTIME_SET) + inode_set_mtime_to_ts(inode, attr->ia_mtime); + else + nfs_set_cache_invalid(inode, NFS_INO_INVALID_MTIME); + + if (fattr->valid & NFS_ATTR_FATTR_CTIME) + inode_set_ctime_to_ts(inode, fattr->ctime); + else + nfs_set_cache_invalid(inode, NFS_INO_INVALID_CHANGE + | NFS_INO_INVALID_CTIME); } if (fattr->valid) nfs_update_inode(inode, fattr); - else - NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ATTR; spin_unlock(&inode->i_lock); } EXPORT_SYMBOL_GPL(nfs_setattr_update_inode); -static void nfs_readdirplus_parent_cache_miss(struct dentry *dentry) +/* + * Don't request help from readdirplus if the file is being written to, + * or if attribute caching is turned off + */ +static bool nfs_getattr_readdirplus_enable(const struct inode *inode) { - struct dentry *parent; - - if (!nfs_server_capable(d_inode(dentry), NFS_CAP_READDIRPLUS)) - return; - parent = dget_parent(dentry); - nfs_force_use_readdirplus(d_inode(parent)); - dput(parent); + return nfs_server_capable(inode, NFS_CAP_READDIRPLUS) && + !nfs_have_writebacks(inode) && NFS_MAXATTRTIMEO(inode) > 5 * HZ; } -static void nfs_readdirplus_parent_cache_hit(struct dentry *dentry) +static void nfs_readdirplus_parent_cache_miss(struct dentry *dentry) { - struct dentry *parent; - - if (!nfs_server_capable(d_inode(dentry), NFS_CAP_READDIRPLUS)) - return; - parent = dget_parent(dentry); - nfs_advise_use_readdirplus(d_inode(parent)); - dput(parent); + if (!IS_ROOT(dentry)) { + struct dentry *parent = dget_parent(dentry); + nfs_readdir_record_entry_cache_miss(d_inode(parent)); + dput(parent); + } } -static bool nfs_need_revalidate_inode(struct inode *inode) +static void nfs_readdirplus_parent_cache_hit(struct dentry *dentry) { - if (NFS_I(inode)->cache_validity & - (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_LABEL)) - return true; - if (nfs_attribute_cache_expired(inode)) - return true; - return false; + if (!IS_ROOT(dentry)) { + struct dentry *parent = dget_parent(dentry); + nfs_readdir_record_entry_cache_hit(d_inode(parent)); + dput(parent); + } } -int nfs_getattr(const struct path *path, struct kstat *stat, - u32 request_mask, unsigned int query_flags) +static u32 nfs_get_valid_attrmask(struct inode *inode) +{ + u64 fattr_valid = NFS_SERVER(inode)->fattr_valid; + unsigned long cache_validity = READ_ONCE(NFS_I(inode)->cache_validity); + u32 reply_mask = STATX_INO | STATX_TYPE; + + if (!(cache_validity & NFS_INO_INVALID_ATIME)) + reply_mask |= STATX_ATIME; + if (!(cache_validity & NFS_INO_INVALID_CTIME)) + reply_mask |= STATX_CTIME; + if (!(cache_validity & NFS_INO_INVALID_MTIME)) + reply_mask |= STATX_MTIME; + if (!(cache_validity & NFS_INO_INVALID_SIZE)) + reply_mask |= STATX_SIZE; + if (!(cache_validity & NFS_INO_INVALID_NLINK)) + reply_mask |= STATX_NLINK; + if (!(cache_validity & NFS_INO_INVALID_MODE)) + reply_mask |= STATX_MODE; + if (!(cache_validity & NFS_INO_INVALID_OTHER)) + reply_mask |= STATX_UID | STATX_GID; + if (!(cache_validity & NFS_INO_INVALID_BLOCKS)) + reply_mask |= STATX_BLOCKS; + if (!(cache_validity & NFS_INO_INVALID_BTIME) && + (fattr_valid & NFS_ATTR_FATTR_BTIME)) + reply_mask |= STATX_BTIME; + if (!(cache_validity & NFS_INO_INVALID_CHANGE)) + reply_mask |= STATX_CHANGE_COOKIE; + return reply_mask; +} + +int nfs_getattr(struct mnt_idmap *idmap, const struct path *path, + struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); - int need_atime = NFS_I(inode)->cache_validity & NFS_INO_INVALID_ATIME; + struct nfs_server *server = NFS_SERVER(inode); + u64 fattr_valid = server->fattr_valid; + unsigned long cache_validity; int err = 0; + bool force_sync = query_flags & AT_STATX_FORCE_SYNC; + bool do_update = false; + bool readdirplus_enabled = nfs_getattr_readdirplus_enable(inode); trace_nfs_getattr_enter(inode); - /* Flush out writes to the server in order to update c/mtime. */ - if (S_ISREG(inode->i_mode)) { - err = filemap_write_and_wait(inode->i_mapping); - if (err) - goto out; + + request_mask &= STATX_TYPE | STATX_MODE | STATX_NLINK | STATX_UID | + STATX_GID | STATX_ATIME | STATX_MTIME | STATX_CTIME | + STATX_INO | STATX_SIZE | STATX_BLOCKS | STATX_BTIME | + STATX_CHANGE_COOKIE; + + if (!(fattr_valid & NFS_ATTR_FATTR_BTIME)) + request_mask &= ~STATX_BTIME; + + if ((query_flags & AT_STATX_DONT_SYNC) && !force_sync) { + if (readdirplus_enabled) + nfs_readdirplus_parent_cache_hit(path->dentry); + goto out_no_revalidate; + } + + /* Flush out writes to the server in order to update c/mtime/version. */ + if ((request_mask & (STATX_CTIME | STATX_MTIME | STATX_CHANGE_COOKIE)) && + S_ISREG(inode->i_mode)) { + if (nfs_have_delegated_mtime(inode)) + filemap_fdatawrite(inode->i_mapping); + else + filemap_write_and_wait(inode->i_mapping); } /* @@ -752,28 +1021,78 @@ int nfs_getattr(const struct path *path, struct kstat *stat, * Note that we only have to check the vfsmount flags here: * - NFS always sets S_NOATIME by so checking it would give a * bogus result - * - NFS never sets MS_NOATIME or MS_NODIRATIME so there is + * - NFS never sets SB_NOATIME or SB_NODIRATIME so there is * no point in checking those. */ if ((path->mnt->mnt_flags & MNT_NOATIME) || ((path->mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))) - need_atime = 0; - - if (need_atime || nfs_need_revalidate_inode(inode)) { - struct nfs_server *server = NFS_SERVER(inode); - - if (!(server->flags & NFS_MOUNT_NOAC)) + request_mask &= ~STATX_ATIME; + + /* Is the user requesting attributes that might need revalidation? */ + if (!(request_mask & (STATX_MODE|STATX_NLINK|STATX_ATIME|STATX_CTIME| + STATX_MTIME|STATX_UID|STATX_GID| + STATX_SIZE|STATX_BLOCKS|STATX_BTIME| + STATX_CHANGE_COOKIE))) + goto out_no_revalidate; + + /* Check whether the cached attributes are stale */ + do_update |= force_sync || nfs_attribute_cache_expired(inode); + cache_validity = READ_ONCE(NFS_I(inode)->cache_validity); + do_update |= cache_validity & NFS_INO_INVALID_CHANGE; + if (request_mask & STATX_ATIME) + do_update |= cache_validity & NFS_INO_INVALID_ATIME; + if (request_mask & STATX_CTIME) + do_update |= cache_validity & NFS_INO_INVALID_CTIME; + if (request_mask & STATX_MTIME) + do_update |= cache_validity & NFS_INO_INVALID_MTIME; + if (request_mask & STATX_SIZE) + do_update |= cache_validity & NFS_INO_INVALID_SIZE; + if (request_mask & STATX_NLINK) + do_update |= cache_validity & NFS_INO_INVALID_NLINK; + if (request_mask & STATX_MODE) + do_update |= cache_validity & NFS_INO_INVALID_MODE; + if (request_mask & (STATX_UID | STATX_GID)) + do_update |= cache_validity & NFS_INO_INVALID_OTHER; + if (request_mask & STATX_BLOCKS) + do_update |= cache_validity & NFS_INO_INVALID_BLOCKS; + if (request_mask & STATX_BTIME) + do_update |= cache_validity & NFS_INO_INVALID_BTIME; + + if (do_update) { + if (readdirplus_enabled) nfs_readdirplus_parent_cache_miss(path->dentry); - else - nfs_readdirplus_parent_cache_hit(path->dentry); err = __nfs_revalidate_inode(server, inode); - } else + if (err) + goto out; + } else if (readdirplus_enabled) nfs_readdirplus_parent_cache_hit(path->dentry); - if (!err) { - generic_fillattr(inode, stat); - stat->ino = nfs_compat_user_ino64(NFS_FILEID(inode)); - if (S_ISDIR(inode->i_mode)) - stat->blksize = NFS_SERVER(inode)->dtsize; +out_no_revalidate: + /* Only return attributes that were revalidated. */ + stat->result_mask = nfs_get_valid_attrmask(inode) | request_mask; + + generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); + stat->ino = nfs_compat_user_ino64(NFS_FILEID(inode)); + stat->change_cookie = inode_peek_iversion_raw(inode); + stat->attributes_mask |= STATX_ATTR_CHANGE_MONOTONIC; + if (server->change_attr_type != NFS4_CHANGE_TYPE_IS_UNDEFINED) + stat->attributes |= STATX_ATTR_CHANGE_MONOTONIC; + if (S_ISDIR(inode->i_mode)) + stat->blksize = NFS_SERVER(inode)->dtsize; + stat->btime = NFS_I(inode)->btime; + + /* Special handling for STATX_DIOALIGN and STATX_DIO_READ_ALIGN + * - NFS doesn't have DIO alignment constraints, avoid getting + * these DIO attrs from remote and just respond with most + * accommodating limits (so client will issue supported DIO). + * - this is unintuitive, but the most coarse-grained + * dio_offset_align is the most accommodating. + */ + if ((request_mask & (STATX_DIOALIGN | STATX_DIO_READ_ALIGN)) && + S_ISREG(inode->i_mode)) { + stat->result_mask |= STATX_DIOALIGN | STATX_DIO_READ_ALIGN; + stat->dio_mem_align = 4; /* 4-byte alignment */ + stat->dio_offset_align = PAGE_SIZE; + stat->dio_read_offset_align = stat->dio_offset_align; } out: trace_nfs_getattr_exit(inode, err); @@ -783,7 +1102,7 @@ EXPORT_SYMBOL_GPL(nfs_getattr); static void nfs_init_lock_context(struct nfs_lock_context *l_ctx) { - atomic_set(&l_ctx->count, 1); + refcount_set(&l_ctx->count, 1); l_ctx->lockowner = current->files; INIT_LIST_HEAD(&l_ctx->list); atomic_set(&l_ctx->io_count, 0); @@ -791,15 +1110,14 @@ static void nfs_init_lock_context(struct nfs_lock_context *l_ctx) static struct nfs_lock_context *__nfs_find_lock_context(struct nfs_open_context *ctx) { - struct nfs_lock_context *head = &ctx->lock_context; - struct nfs_lock_context *pos = head; + struct nfs_lock_context *pos; - do { + list_for_each_entry_rcu(pos, &ctx->lock_context.list, list) { if (pos->lockowner != current->files) continue; - atomic_inc(&pos->count); - return pos; - } while ((pos = list_entry(pos->list.next, typeof(*pos), list)) != head); + if (refcount_inc_not_zero(&pos->count)) + return pos; + } return NULL; } @@ -808,25 +1126,29 @@ struct nfs_lock_context *nfs_get_lock_context(struct nfs_open_context *ctx) struct nfs_lock_context *res, *new = NULL; struct inode *inode = d_inode(ctx->dentry); - spin_lock(&inode->i_lock); + rcu_read_lock(); res = __nfs_find_lock_context(ctx); + rcu_read_unlock(); if (res == NULL) { - spin_unlock(&inode->i_lock); - new = kmalloc(sizeof(*new), GFP_KERNEL); + new = kmalloc(sizeof(*new), GFP_KERNEL_ACCOUNT); if (new == NULL) return ERR_PTR(-ENOMEM); nfs_init_lock_context(new); spin_lock(&inode->i_lock); res = __nfs_find_lock_context(ctx); if (res == NULL) { - list_add_tail(&new->list, &ctx->lock_context.list); - new->open_context = ctx; - res = new; - new = NULL; + new->open_context = get_nfs_open_context(ctx); + if (new->open_context) { + list_add_tail_rcu(&new->list, + &ctx->lock_context.list); + res = new; + new = NULL; + } else + res = ERR_PTR(-EBADF); } + spin_unlock(&inode->i_lock); + kfree(new); } - spin_unlock(&inode->i_lock); - kfree(new); return res; } EXPORT_SYMBOL_GPL(nfs_get_lock_context); @@ -836,11 +1158,12 @@ void nfs_put_lock_context(struct nfs_lock_context *l_ctx) struct nfs_open_context *ctx = l_ctx->open_context; struct inode *inode = d_inode(ctx->dentry); - if (!atomic_dec_and_lock(&l_ctx->count, &inode->i_lock)) + if (!refcount_dec_and_lock(&l_ctx->count, &inode->i_lock)) return; - list_del(&l_ctx->list); + list_del_rcu(&l_ctx->list); spin_unlock(&inode->i_lock); - kfree(l_ctx); + put_nfs_open_context(ctx); + kfree_rcu(l_ctx, rcu_head); } EXPORT_SYMBOL_GPL(nfs_put_lock_context); @@ -857,14 +1180,13 @@ void nfs_close_context(struct nfs_open_context *ctx, int is_sync) { struct nfs_inode *nfsi; struct inode *inode; - struct nfs_server *server; if (!(ctx->mode & FMODE_WRITE)) return; if (!is_sync) return; inode = d_inode(ctx->dentry); - if (NFS_PROTO(inode)->have_delegation(inode, FMODE_READ)) + if (nfs_have_read_or_write_delegation(inode)) return; nfsi = NFS_I(inode); if (inode->i_mapping->nrpages == 0) @@ -873,10 +1195,10 @@ void nfs_close_context(struct nfs_open_context *ctx, int is_sync) return; if (!list_empty(&nfsi->open_files)) return; - server = NFS_SERVER(inode); - if (server->flags & NFS_MOUNT_NOCTO) + if (NFS_SERVER(inode)->flags & NFS_MOUNT_NOCTO) return; - nfs_revalidate_inode(server, inode); + nfs_revalidate_inode(inode, + NFS_INO_INVALID_CHANGE | NFS_INO_INVALID_SIZE); } EXPORT_SYMBOL_GPL(nfs_close_context); @@ -885,18 +1207,17 @@ struct nfs_open_context *alloc_nfs_open_context(struct dentry *dentry, struct file *filp) { struct nfs_open_context *ctx; - struct rpc_cred *cred = rpc_lookup_cred(); - if (IS_ERR(cred)) - return ERR_CAST(cred); - ctx = kmalloc(sizeof(*ctx), GFP_KERNEL); - if (!ctx) { - put_rpccred(cred); + ctx = kmalloc(sizeof(*ctx), GFP_KERNEL_ACCOUNT); + if (!ctx) return ERR_PTR(-ENOMEM); - } nfs_sb_active(dentry->d_sb); ctx->dentry = dget(dentry); - ctx->cred = cred; + if (filp) + ctx->cred = get_cred(filp->f_cred); + else + ctx->cred = get_current_cred(); + rcu_assign_pointer(ctx->ll_cred, NULL); ctx->state = NULL; ctx->mode = f_mode; ctx->flags = 0; @@ -906,15 +1227,17 @@ struct nfs_open_context *alloc_nfs_open_context(struct dentry *dentry, ctx->lock_context.open_context = ctx; INIT_LIST_HEAD(&ctx->list); ctx->mdsthreshold = NULL; + nfs_localio_file_init(&ctx->nfl); + return ctx; } EXPORT_SYMBOL_GPL(alloc_nfs_open_context); struct nfs_open_context *get_nfs_open_context(struct nfs_open_context *ctx) { - if (ctx != NULL) - atomic_inc(&ctx->lock_context.count); - return ctx; + if (ctx != NULL && refcount_inc_not_zero(&ctx->lock_context.count)) + return ctx; + return NULL; } EXPORT_SYMBOL_GPL(get_nfs_open_context); @@ -923,21 +1246,22 @@ static void __put_nfs_open_context(struct nfs_open_context *ctx, int is_sync) struct inode *inode = d_inode(ctx->dentry); struct super_block *sb = ctx->dentry->d_sb; + if (!refcount_dec_and_test(&ctx->lock_context.count)) + return; if (!list_empty(&ctx->list)) { - if (!atomic_dec_and_lock(&ctx->lock_context.count, &inode->i_lock)) - return; - list_del(&ctx->list); + spin_lock(&inode->i_lock); + list_del_rcu(&ctx->list); spin_unlock(&inode->i_lock); - } else if (!atomic_dec_and_test(&ctx->lock_context.count)) - return; + } if (inode != NULL) NFS_PROTO(inode)->close_context(ctx, is_sync); - if (ctx->cred != NULL) - put_rpccred(ctx->cred); + put_cred(ctx->cred); dput(ctx->dentry); nfs_sb_deactive(sb); + put_rpccred(rcu_dereference_protected(ctx->ll_cred, 1)); kfree(ctx->mdsthreshold); - kfree(ctx); + nfs_close_local_fh(&ctx->nfl); + kfree_rcu(ctx, rcu_head); } void put_nfs_open_context(struct nfs_open_context *ctx) @@ -961,10 +1285,11 @@ void nfs_inode_attach_open_context(struct nfs_open_context *ctx) struct nfs_inode *nfsi = NFS_I(inode); spin_lock(&inode->i_lock); - if (ctx->mode & FMODE_WRITE) - list_add(&ctx->list, &nfsi->open_files); - else - list_add_tail(&ctx->list, &nfsi->open_files); + if (list_empty(&nfsi->open_files) && + nfs_ooo_test(nfsi)) + nfs_set_cache_invalid(inode, NFS_INO_INVALID_DATA | + NFS_INO_REVAL_FORCED); + list_add_tail_rcu(&ctx->list, &nfsi->open_files); spin_unlock(&inode->i_lock); } EXPORT_SYMBOL_GPL(nfs_inode_attach_open_context); @@ -972,6 +1297,7 @@ EXPORT_SYMBOL_GPL(nfs_inode_attach_open_context); void nfs_file_set_open_context(struct file *filp, struct nfs_open_context *ctx) { filp->private_data = get_nfs_open_context(ctx); + set_bit(NFS_CONTEXT_FILE_OPEN, &ctx->flags); if (list_empty(&ctx->list)) nfs_inode_attach_open_context(ctx); } @@ -980,21 +1306,24 @@ EXPORT_SYMBOL_GPL(nfs_file_set_open_context); /* * Given an inode, search for an open context with the desired characteristics */ -struct nfs_open_context *nfs_find_open_context(struct inode *inode, struct rpc_cred *cred, fmode_t mode) +struct nfs_open_context *nfs_find_open_context(struct inode *inode, const struct cred *cred, fmode_t mode) { struct nfs_inode *nfsi = NFS_I(inode); struct nfs_open_context *pos, *ctx = NULL; - spin_lock(&inode->i_lock); - list_for_each_entry(pos, &nfsi->open_files, list) { - if (cred != NULL && pos->cred != cred) + rcu_read_lock(); + list_for_each_entry_rcu(pos, &nfsi->open_files, list) { + if (cred != NULL && cred_fscmp(pos->cred, cred) != 0) continue; if ((pos->mode & (FMODE_READ|FMODE_WRITE)) != mode) continue; + if (!test_bit(NFS_CONTEXT_FILE_OPEN, &pos->flags)) + continue; ctx = get_nfs_open_context(pos); - break; + if (ctx) + break; } - spin_unlock(&inode->i_lock); + rcu_read_unlock(); return ctx; } @@ -1005,6 +1334,7 @@ void nfs_file_clear_open_context(struct file *filp) if (ctx) { struct inode *inode = d_inode(ctx->dentry); + clear_bit(NFS_CONTEXT_FILE_OPEN, &ctx->flags); /* * We fatal error on write before. Try to writeback * every page again. @@ -1012,9 +1342,6 @@ void nfs_file_clear_open_context(struct file *filp) if (ctx->error < 0) invalidate_inode_pages2(inode->i_mapping); filp->private_data = NULL; - spin_lock(&inode->i_lock); - list_move_tail(&ctx->list, &NFS_I(inode)->open_files); - spin_unlock(&inode->i_lock); put_nfs_open_context_sync(ctx); } } @@ -1026,7 +1353,8 @@ int nfs_open(struct inode *inode, struct file *filp) { struct nfs_open_context *ctx; - ctx = alloc_nfs_open_context(file_dentry(filp), filp->f_mode, filp); + ctx = alloc_nfs_open_context(file_dentry(filp), + flags_to_mode(filp->f_flags), filp); if (IS_ERR(ctx)) return PTR_ERR(ctx); nfs_file_set_open_context(filp, ctx); @@ -1043,7 +1371,6 @@ int __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) { int status = -ESTALE; - struct nfs4_label *label = NULL; struct nfs_fattr *fattr = NULL; struct nfs_inode *nfsi = NFS_I(inode); @@ -1065,29 +1392,30 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) } status = -ENOMEM; - fattr = nfs_alloc_fattr(); + fattr = nfs_alloc_fattr_with_label(NFS_SERVER(inode)); if (fattr == NULL) goto out; nfs_inc_stats(inode, NFSIOS_INODEREVALIDATE); - label = nfs4_label_alloc(NFS_SERVER(inode), GFP_KERNEL); - if (IS_ERR(label)) { - status = PTR_ERR(label); - goto out; - } - - status = NFS_PROTO(inode)->getattr(server, NFS_FH(inode), fattr, label); + status = NFS_PROTO(inode)->getattr(server, NFS_FH(inode), fattr, inode); if (status != 0) { dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Lu) getattr failed, error=%d\n", inode->i_sb->s_id, (unsigned long long)NFS_FILEID(inode), status); - if (status == -ESTALE) { - nfs_zap_caches(inode); + switch (status) { + case -ETIMEDOUT: + /* A soft timeout occurred. Use cached information? */ + if (server->flags & NFS_MOUNT_SOFTREVAL) + status = 0; + break; + case -ESTALE: if (!S_ISDIR(inode->i_mode)) - set_bit(NFS_INO_STALE, &NFS_I(inode)->flags); + nfs_set_inode_stale(inode); + else + nfs_zap_caches(inode); } - goto err_out; + goto out; } status = nfs_refresh_inode(inode, fattr); @@ -1095,20 +1423,18 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Lu) refresh failed, error=%d\n", inode->i_sb->s_id, (unsigned long long)NFS_FILEID(inode), status); - goto err_out; + goto out; } if (nfsi->cache_validity & NFS_INO_INVALID_ACL) nfs_zap_acl_cache(inode); - nfs_setsecurity(inode, fattr, label); + nfs_setsecurity(inode, fattr); dfprintk(PAGECACHE, "NFS: (%s/%Lu) revalidation complete\n", inode->i_sb->s_id, (unsigned long long)NFS_FILEID(inode)); -err_out: - nfs4_label_free(label); out: nfs_free_fattr(fattr); trace_nfs_revalidate_inode_exit(inode, status); @@ -1124,27 +1450,26 @@ int nfs_attribute_cache_expired(struct inode *inode) /** * nfs_revalidate_inode - Revalidate the inode attributes - * @server - pointer to nfs_server struct - * @inode - pointer to inode struct + * @inode: pointer to inode struct + * @flags: cache flags to check * * Updates inode attribute information by retrieving the data from the server. */ -int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) +int nfs_revalidate_inode(struct inode *inode, unsigned long flags) { - if (!nfs_need_revalidate_inode(inode)) + if (!nfs_check_cache_invalid(inode, flags)) return NFS_STALE(inode) ? -ESTALE : 0; - return __nfs_revalidate_inode(server, inode); + return __nfs_revalidate_inode(NFS_SERVER(inode), inode); } EXPORT_SYMBOL_GPL(nfs_revalidate_inode); static int nfs_invalidate_mapping(struct inode *inode, struct address_space *mapping) { - struct nfs_inode *nfsi = NFS_I(inode); int ret; + nfs_fscache_invalidate(inode, 0); if (mapping->nrpages != 0) { if (S_ISREG(inode->i_mode)) { - unmap_mapping_range(mapping, 0, 0, 0); ret = nfs_sync_mapping(mapping); if (ret < 0) return ret; @@ -1153,13 +1478,7 @@ static int nfs_invalidate_mapping(struct inode *inode, struct address_space *map if (ret < 0) return ret; } - if (S_ISDIR(inode->i_mode)) { - spin_lock(&inode->i_lock); - memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf)); - spin_unlock(&inode->i_lock); - } nfs_inc_stats(inode, NFSIOS_DATAINVALIDATE); - nfs_fscache_wait_on_invalidate(inode); dfprintk(PAGECACHE, "NFS: (%s/%Lu) data cache invalidated\n", inode->i_sb->s_id, @@ -1167,55 +1486,19 @@ static int nfs_invalidate_mapping(struct inode *inode, struct address_space *map return 0; } -bool nfs_mapping_need_revalidate_inode(struct inode *inode) -{ - return nfs_check_cache_invalid(inode, NFS_INO_REVAL_PAGECACHE) || - NFS_STALE(inode); -} - -int nfs_revalidate_mapping_rcu(struct inode *inode) -{ - struct nfs_inode *nfsi = NFS_I(inode); - unsigned long *bitlock = &nfsi->flags; - int ret = 0; - - if (IS_SWAPFILE(inode)) - goto out; - if (nfs_mapping_need_revalidate_inode(inode)) { - ret = -ECHILD; - goto out; - } - spin_lock(&inode->i_lock); - if (test_bit(NFS_INO_INVALIDATING, bitlock) || - (nfsi->cache_validity & NFS_INO_INVALID_DATA)) - ret = -ECHILD; - spin_unlock(&inode->i_lock); -out: - return ret; -} - /** - * nfs_revalidate_mapping - Revalidate the pagecache - * @inode - pointer to host inode - * @mapping - pointer to mapping + * nfs_clear_invalid_mapping - Conditionally clear a mapping + * @mapping: pointer to mapping + * + * If the NFS_INO_INVALID_DATA inode flag is set, clear the mapping. */ -int nfs_revalidate_mapping(struct inode *inode, - struct address_space *mapping) +int nfs_clear_invalid_mapping(struct address_space *mapping) { + struct inode *inode = mapping->host; struct nfs_inode *nfsi = NFS_I(inode); unsigned long *bitlock = &nfsi->flags; int ret = 0; - /* swapfiles are not supposed to be shared. */ - if (IS_SWAPFILE(inode)) - goto out; - - if (nfs_mapping_need_revalidate_inode(inode)) { - ret = __nfs_revalidate_inode(NFS_SERVER(inode), inode); - if (ret < 0) - goto out; - } - /* * We must clear NFS_INO_INVALID_DATA first to ensure that * invalidations that come in while we're shooting down the mappings @@ -1230,9 +1513,17 @@ int nfs_revalidate_mapping(struct inode *inode, */ for (;;) { ret = wait_on_bit_action(bitlock, NFS_INO_INVALIDATING, - nfs_wait_bit_killable, TASK_KILLABLE); + nfs_wait_bit_killable, + TASK_KILLABLE|TASK_FREEZABLE_UNSAFE); if (ret) goto out; + smp_rmb(); /* pairs with smp_wmb() below */ + if (test_bit(NFS_INO_INVALIDATING, bitlock)) + continue; + /* pairs with nfs_set_cache_invalid()'s smp_store_release() */ + if (!(smp_load_acquire(&nfsi->cache_validity) & NFS_INO_INVALID_DATA)) + goto out; + /* Slow-path that double-checks with spinlock held */ spin_lock(&inode->i_lock); if (test_bit(NFS_INO_INVALIDATING, bitlock)) { spin_unlock(&inode->i_lock); @@ -1247,6 +1538,7 @@ int nfs_revalidate_mapping(struct inode *inode, set_bit(NFS_INO_INVALIDATING, bitlock); smp_wmb(); nfsi->cache_validity &= ~NFS_INO_INVALID_DATA; + nfs_ooo_clear(nfsi); spin_unlock(&inode->i_lock); trace_nfs_invalidate_mapping_enter(inode); ret = nfs_invalidate_mapping(inode, mapping); @@ -1259,23 +1551,62 @@ out: return ret; } +bool nfs_mapping_need_revalidate_inode(struct inode *inode) +{ + return nfs_check_cache_invalid(inode, NFS_INO_INVALID_CHANGE) || + NFS_STALE(inode); +} + +int nfs_revalidate_mapping_rcu(struct inode *inode) +{ + struct nfs_inode *nfsi = NFS_I(inode); + unsigned long *bitlock = &nfsi->flags; + int ret = 0; + + if (IS_SWAPFILE(inode)) + goto out; + if (nfs_mapping_need_revalidate_inode(inode)) { + ret = -ECHILD; + goto out; + } + spin_lock(&inode->i_lock); + if (test_bit(NFS_INO_INVALIDATING, bitlock) || + (nfsi->cache_validity & NFS_INO_INVALID_DATA)) + ret = -ECHILD; + spin_unlock(&inode->i_lock); +out: + return ret; +} + +/** + * nfs_revalidate_mapping - Revalidate the pagecache + * @inode: pointer to host inode + * @mapping: pointer to mapping + */ +int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping) +{ + /* swapfiles are not supposed to be shared. */ + if (IS_SWAPFILE(inode)) + return 0; + + if (nfs_mapping_need_revalidate_inode(inode)) { + int ret = __nfs_revalidate_inode(NFS_SERVER(inode), inode); + if (ret < 0) + return ret; + } + + return nfs_clear_invalid_mapping(mapping); +} + static bool nfs_file_has_writers(struct nfs_inode *nfsi) { struct inode *inode = &nfsi->vfs_inode; - assert_spin_locked(&inode->i_lock); - if (!S_ISREG(inode->i_mode)) return false; if (list_empty(&nfsi->open_files)) return false; - /* Note: This relies on nfsi->open_files being ordered with writers - * being placed at the head of the list. - * See nfs_inode_attach_open_context() - */ - return (list_first_entry(&nfsi->open_files, - struct nfs_open_context, - list)->mode & FMODE_WRITE) == FMODE_WRITE; + return inode_is_open_for_write(inode); } static bool nfs_file_has_buffered_writers(struct nfs_inode *nfsi) @@ -1283,50 +1614,46 @@ static bool nfs_file_has_buffered_writers(struct nfs_inode *nfsi) return nfs_file_has_writers(nfsi) && nfs_file_io_is_buffered(nfsi); } -static unsigned long nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr) +static void nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr) { - struct nfs_inode *nfsi = NFS_I(inode); - unsigned long ret = 0; + struct timespec64 ts; if ((fattr->valid & NFS_ATTR_FATTR_PRECHANGE) && (fattr->valid & NFS_ATTR_FATTR_CHANGE) - && inode->i_version == fattr->pre_change_attr) { - inode->i_version = fattr->change_attr; + && inode_eq_iversion_raw(inode, fattr->pre_change_attr)) { + inode_set_iversion_raw(inode, fattr->change_attr); if (S_ISDIR(inode->i_mode)) nfs_set_cache_invalid(inode, NFS_INO_INVALID_DATA); - ret |= NFS_INO_INVALID_ATTR; + else if (nfs_server_capable(inode, NFS_CAP_XATTR)) + nfs_set_cache_invalid(inode, NFS_INO_INVALID_XATTR); } /* If we have atomic WCC data, we may update some attributes */ + ts = inode_get_ctime(inode); if ((fattr->valid & NFS_ATTR_FATTR_PRECTIME) && (fattr->valid & NFS_ATTR_FATTR_CTIME) - && timespec_equal(&inode->i_ctime, &fattr->pre_ctime)) { - memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime)); - ret |= NFS_INO_INVALID_ATTR; + && timespec64_equal(&ts, &fattr->pre_ctime)) { + inode_set_ctime_to_ts(inode, fattr->ctime); } + ts = inode_get_mtime(inode); if ((fattr->valid & NFS_ATTR_FATTR_PREMTIME) && (fattr->valid & NFS_ATTR_FATTR_MTIME) - && timespec_equal(&inode->i_mtime, &fattr->pre_mtime)) { - memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime)); - if (S_ISDIR(inode->i_mode)) - nfs_set_cache_invalid(inode, NFS_INO_INVALID_DATA); - ret |= NFS_INO_INVALID_ATTR; + && timespec64_equal(&ts, &fattr->pre_mtime)) { + inode_set_mtime_to_ts(inode, fattr->mtime); } if ((fattr->valid & NFS_ATTR_FATTR_PRESIZE) && (fattr->valid & NFS_ATTR_FATTR_SIZE) && i_size_read(inode) == nfs_size_to_loff_t(fattr->pre_size) - && nfsi->nrequests == 0) { + && !nfs_have_writebacks(inode)) { + trace_nfs_size_wcc(inode, fattr->size); i_size_write(inode, nfs_size_to_loff_t(fattr->size)); - ret |= NFS_INO_INVALID_ATTR; } - - return ret; } /** * nfs_check_inode_attributes - verify consistency of the inode attribute cache - * @inode - pointer to inode - * @fattr - updated attributes + * @inode: pointer to inode + * @fattr: updated attributes * * Verifies the attribute cache. If we have just changed the attributes, * so that fattr carries weak cache consistency data, then it may @@ -1337,52 +1664,66 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat struct nfs_inode *nfsi = NFS_I(inode); loff_t cur_size, new_isize; unsigned long invalid = 0; - + struct timespec64 ts; if (nfs_have_delegated_attributes(inode)) return 0; + + if (!(fattr->valid & NFS_ATTR_FATTR_FILEID)) { + /* Only a mounted-on-fileid? Just exit */ + if (fattr->valid & NFS_ATTR_FATTR_MOUNTED_ON_FILEID) + return 0; /* Has the inode gone and changed behind our back? */ - if ((fattr->valid & NFS_ATTR_FATTR_FILEID) && nfsi->fileid != fattr->fileid) + } else if (nfsi->fileid != fattr->fileid) { + /* Is this perhaps the mounted-on fileid? */ + if ((fattr->valid & NFS_ATTR_FATTR_MOUNTED_ON_FILEID) && + nfsi->fileid == fattr->mounted_on_fileid) + return 0; return -ESTALE; - if ((fattr->valid & NFS_ATTR_FATTR_TYPE) && (inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT)) + } + if ((fattr->valid & NFS_ATTR_FATTR_TYPE) && inode_wrong_type(inode, fattr->mode)) return -ESTALE; + if (!nfs_file_has_buffered_writers(nfsi)) { /* Verify a few of the more important attributes */ - if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 && inode->i_version != fattr->change_attr) - invalid |= NFS_INO_INVALID_ATTR | NFS_INO_REVAL_PAGECACHE; + if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 && !inode_eq_iversion_raw(inode, fattr->change_attr)) + invalid |= NFS_INO_INVALID_CHANGE; - if ((fattr->valid & NFS_ATTR_FATTR_MTIME) && !timespec_equal(&inode->i_mtime, &fattr->mtime)) - invalid |= NFS_INO_INVALID_ATTR; + ts = inode_get_mtime(inode); + if ((fattr->valid & NFS_ATTR_FATTR_MTIME) && !timespec64_equal(&ts, &fattr->mtime)) + invalid |= NFS_INO_INVALID_MTIME; - if ((fattr->valid & NFS_ATTR_FATTR_CTIME) && !timespec_equal(&inode->i_ctime, &fattr->ctime)) - invalid |= NFS_INO_INVALID_ATTR; + ts = inode_get_ctime(inode); + if ((fattr->valid & NFS_ATTR_FATTR_CTIME) && !timespec64_equal(&ts, &fattr->ctime)) + invalid |= NFS_INO_INVALID_CTIME; if (fattr->valid & NFS_ATTR_FATTR_SIZE) { cur_size = i_size_read(inode); new_isize = nfs_size_to_loff_t(fattr->size); if (cur_size != new_isize) - invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; + invalid |= NFS_INO_INVALID_SIZE; } } /* Have any file permissions changed? */ if ((fattr->valid & NFS_ATTR_FATTR_MODE) && (inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO)) - invalid |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL; + invalid |= NFS_INO_INVALID_MODE; if ((fattr->valid & NFS_ATTR_FATTR_OWNER) && !uid_eq(inode->i_uid, fattr->uid)) - invalid |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL; + invalid |= NFS_INO_INVALID_OTHER; if ((fattr->valid & NFS_ATTR_FATTR_GROUP) && !gid_eq(inode->i_gid, fattr->gid)) - invalid |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL; + invalid |= NFS_INO_INVALID_OTHER; /* Has the link count changed? */ if ((fattr->valid & NFS_ATTR_FATTR_NLINK) && inode->i_nlink != fattr->nlink) - invalid |= NFS_INO_INVALID_ATTR; + invalid |= NFS_INO_INVALID_NLINK; - if ((fattr->valid & NFS_ATTR_FATTR_ATIME) && !timespec_equal(&inode->i_atime, &fattr->atime)) + ts = inode_get_atime(inode); + if ((fattr->valid & NFS_ATTR_FATTR_ATIME) && !timespec64_equal(&ts, &fattr->atime)) invalid |= NFS_INO_INVALID_ATIME; if (invalid != 0) - nfs_set_cache_invalid(inode, invalid | NFS_INO_REVAL_FORCED); + nfs_set_cache_invalid(inode, invalid); nfsi->read_cache_jiffies = fattr->time_start; return 0; @@ -1408,6 +1749,7 @@ void nfs_fattr_init(struct nfs_fattr *fattr) fattr->gencount = nfs_inc_attr_generation_counter(); fattr->owner_name = NULL; fattr->group_name = NULL; + fattr->mdsthreshold = NULL; } EXPORT_SYMBOL_GPL(nfs_fattr_init); @@ -1431,18 +1773,37 @@ struct nfs_fattr *nfs_alloc_fattr(void) { struct nfs_fattr *fattr; - fattr = kmalloc(sizeof(*fattr), GFP_NOFS); - if (fattr != NULL) + fattr = kmalloc(sizeof(*fattr), GFP_KERNEL); + if (fattr != NULL) { nfs_fattr_init(fattr); + fattr->label = NULL; + } return fattr; } EXPORT_SYMBOL_GPL(nfs_alloc_fattr); +struct nfs_fattr *nfs_alloc_fattr_with_label(struct nfs_server *server) +{ + struct nfs_fattr *fattr = nfs_alloc_fattr(); + + if (!fattr) + return NULL; + + fattr->label = nfs4_label_alloc(server, GFP_KERNEL); + if (IS_ERR(fattr->label)) { + kfree(fattr); + return NULL; + } + + return fattr; +} +EXPORT_SYMBOL_GPL(nfs_alloc_fattr_with_label); + struct nfs_fh *nfs_alloc_fhandle(void) { struct nfs_fh *fh; - fh = kmalloc(sizeof(struct nfs_fh), GFP_NOFS); + fh = kmalloc(sizeof(struct nfs_fh), GFP_KERNEL); if (fh != NULL) fh->size = 0; return fh; @@ -1513,42 +1874,209 @@ EXPORT_SYMBOL_GPL(_nfs_display_fhandle); #endif /** - * nfs_inode_attrs_need_update - check if the inode attributes need updating - * @inode - pointer to inode - * @fattr - attributes + * nfs_inode_attrs_cmp_generic - compare attributes + * @fattr: attributes + * @inode: pointer to inode * * Attempt to divine whether or not an RPC call reply carrying stale * attributes got scheduled after another call carrying updated ones. - * - * To do so, the function first assumes that a more recent ctime means - * that the attributes in fattr are newer, however it also attempt to - * catch the case where ctime either didn't change, or went backwards - * (if someone reset the clock on the server) by looking at whether - * or not this RPC call was started after the inode was last updated. * Note also the check for wraparound of 'attr_gencount' * - * The function returns 'true' if it thinks the attributes in 'fattr' are - * more recent than the ones cached in the inode. + * The function returns '1' if it thinks the attributes in @fattr are + * more recent than the ones cached in @inode. Otherwise it returns + * the value '0'. + */ +static int nfs_inode_attrs_cmp_generic(const struct nfs_fattr *fattr, + const struct inode *inode) +{ + unsigned long attr_gencount = NFS_I(inode)->attr_gencount; + + return (long)(fattr->gencount - attr_gencount) > 0 || + (long)(attr_gencount - nfs_read_attr_generation_counter()) > 0; +} + +/** + * nfs_inode_attrs_cmp_monotonic - compare attributes + * @fattr: attributes + * @inode: pointer to inode + * + * Attempt to divine whether or not an RPC call reply carrying stale + * attributes got scheduled after another call carrying updated ones. + * + * We assume that the server observes monotonic semantics for + * the change attribute, so a larger value means that the attributes in + * @fattr are more recent, in which case the function returns the + * value '1'. + * A return value of '0' indicates no measurable change + * A return value of '-1' means that the attributes in @inode are + * more recent. + */ +static int nfs_inode_attrs_cmp_monotonic(const struct nfs_fattr *fattr, + const struct inode *inode) +{ + s64 diff = fattr->change_attr - inode_peek_iversion_raw(inode); + if (diff > 0) + return 1; + return diff == 0 ? 0 : -1; +} + +/** + * nfs_inode_attrs_cmp_strict_monotonic - compare attributes + * @fattr: attributes + * @inode: pointer to inode + * + * Attempt to divine whether or not an RPC call reply carrying stale + * attributes got scheduled after another call carrying updated ones. + * + * We assume that the server observes strictly monotonic semantics for + * the change attribute, so a larger value means that the attributes in + * @fattr are more recent, in which case the function returns the + * value '1'. + * A return value of '-1' means that the attributes in @inode are + * more recent or unchanged. + */ +static int nfs_inode_attrs_cmp_strict_monotonic(const struct nfs_fattr *fattr, + const struct inode *inode) +{ + return nfs_inode_attrs_cmp_monotonic(fattr, inode) > 0 ? 1 : -1; +} + +/** + * nfs_inode_attrs_cmp - compare attributes + * @fattr: attributes + * @inode: pointer to inode * + * This function returns '1' if it thinks the attributes in @fattr are + * more recent than the ones cached in @inode. It returns '-1' if + * the attributes in @inode are more recent than the ones in @fattr, + * and it returns 0 if not sure. */ -static int nfs_inode_attrs_need_update(const struct inode *inode, const struct nfs_fattr *fattr) +static int nfs_inode_attrs_cmp(const struct nfs_fattr *fattr, + const struct inode *inode) { - const struct nfs_inode *nfsi = NFS_I(inode); + if (nfs_inode_attrs_cmp_generic(fattr, inode) > 0) + return 1; + switch (NFS_SERVER(inode)->change_attr_type) { + case NFS4_CHANGE_TYPE_IS_UNDEFINED: + break; + case NFS4_CHANGE_TYPE_IS_TIME_METADATA: + if (!(fattr->valid & NFS_ATTR_FATTR_CHANGE)) + break; + return nfs_inode_attrs_cmp_monotonic(fattr, inode); + default: + if (!(fattr->valid & NFS_ATTR_FATTR_CHANGE)) + break; + return nfs_inode_attrs_cmp_strict_monotonic(fattr, inode); + } + return 0; +} - return ((long)fattr->gencount - (long)nfsi->attr_gencount) > 0 || - ((long)nfsi->attr_gencount - (long)nfs_read_attr_generation_counter() > 0); +/** + * nfs_inode_finish_partial_attr_update - complete a previous inode update + * @fattr: attributes + * @inode: pointer to inode + * + * Returns '1' if the last attribute update left the inode cached + * attributes in a partially unrevalidated state, and @fattr + * matches the change attribute of that partial update. + * Otherwise returns '0'. + */ +static int nfs_inode_finish_partial_attr_update(const struct nfs_fattr *fattr, + const struct inode *inode) +{ + const unsigned long check_valid = + NFS_INO_INVALID_ATIME | NFS_INO_INVALID_CTIME | + NFS_INO_INVALID_MTIME | NFS_INO_INVALID_SIZE | + NFS_INO_INVALID_BLOCKS | NFS_INO_INVALID_OTHER | + NFS_INO_INVALID_NLINK | NFS_INO_INVALID_BTIME; + unsigned long cache_validity = NFS_I(inode)->cache_validity; + enum nfs4_change_attr_type ctype = NFS_SERVER(inode)->change_attr_type; + + if (ctype != NFS4_CHANGE_TYPE_IS_UNDEFINED && + !(cache_validity & NFS_INO_INVALID_CHANGE) && + (cache_validity & check_valid) != 0 && + (fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 && + nfs_inode_attrs_cmp_monotonic(fattr, inode) == 0) + return 1; + return 0; } -static int nfs_refresh_inode_locked(struct inode *inode, struct nfs_fattr *fattr) +static void nfs_ooo_merge(struct nfs_inode *nfsi, + u64 start, u64 end) { - int ret; + int i, cnt; + + if (nfsi->cache_validity & NFS_INO_DATA_INVAL_DEFER) + /* No point merging anything */ + return; + + if (!nfsi->ooo) { + nfsi->ooo = kmalloc(sizeof(*nfsi->ooo), GFP_ATOMIC); + if (!nfsi->ooo) { + nfsi->cache_validity |= NFS_INO_DATA_INVAL_DEFER; + return; + } + nfsi->ooo->cnt = 0; + } + + /* add this range, merging if possible */ + cnt = nfsi->ooo->cnt; + for (i = 0; i < cnt; i++) { + if (end == nfsi->ooo->gap[i].start) + end = nfsi->ooo->gap[i].end; + else if (start == nfsi->ooo->gap[i].end) + start = nfsi->ooo->gap[i].start; + else + continue; + /* Remove 'i' from table and loop to insert the new range */ + cnt -= 1; + nfsi->ooo->gap[i] = nfsi->ooo->gap[cnt]; + i = -1; + } + if (start != end) { + if (cnt >= ARRAY_SIZE(nfsi->ooo->gap)) { + nfsi->cache_validity |= NFS_INO_DATA_INVAL_DEFER; + kfree(nfsi->ooo); + nfsi->ooo = NULL; + return; + } + nfsi->ooo->gap[cnt].start = start; + nfsi->ooo->gap[cnt].end = end; + cnt += 1; + } + nfsi->ooo->cnt = cnt; +} + +static void nfs_ooo_record(struct nfs_inode *nfsi, + struct nfs_fattr *fattr) +{ + /* This reply was out-of-order, so record in the + * pre/post change id, possibly cancelling + * gaps created when iversion was jumpped forward. + */ + if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) && + (fattr->valid & NFS_ATTR_FATTR_PRECHANGE)) + nfs_ooo_merge(nfsi, + fattr->change_attr, + fattr->pre_change_attr); +} + +static int nfs_refresh_inode_locked(struct inode *inode, + struct nfs_fattr *fattr) +{ + int attr_cmp = nfs_inode_attrs_cmp(fattr, inode); + int ret = 0; trace_nfs_refresh_inode_enter(inode); - if (nfs_inode_attrs_need_update(inode, fattr)) + if (attr_cmp > 0 || nfs_inode_finish_partial_attr_update(fattr, inode)) ret = nfs_update_inode(inode, fattr); - else - ret = nfs_check_inode_attributes(inode, fattr); + else { + nfs_ooo_record(NFS_I(inode), fattr); + + if (attr_cmp == 0) + ret = nfs_check_inode_attributes(inode, fattr); + } trace_nfs_refresh_inode_exit(inode, ret); return ret; @@ -1556,8 +2084,8 @@ static int nfs_refresh_inode_locked(struct inode *inode, struct nfs_fattr *fattr /** * nfs_refresh_inode - try to update the inode attribute cache - * @inode - pointer to inode - * @fattr - updated attributes + * @inode: pointer to inode + * @fattr: updated attributes * * Check that an RPC call that returned attributes has not overlapped with * other recent updates of the inode metadata, then decide whether it is @@ -1578,10 +2106,9 @@ int nfs_refresh_inode(struct inode *inode, struct nfs_fattr *fattr) } EXPORT_SYMBOL_GPL(nfs_refresh_inode); -static int nfs_post_op_update_inode_locked(struct inode *inode, struct nfs_fattr *fattr) +static int nfs_post_op_update_inode_locked(struct inode *inode, + struct nfs_fattr *fattr, unsigned int invalid) { - unsigned long invalid = NFS_INO_INVALID_ATTR; - if (S_ISDIR(inode->i_mode)) invalid |= NFS_INO_INVALID_DATA; nfs_set_cache_invalid(inode, invalid); @@ -1592,8 +2119,8 @@ static int nfs_post_op_update_inode_locked(struct inode *inode, struct nfs_fattr /** * nfs_post_op_update_inode - try to update the inode attribute cache - * @inode - pointer to inode - * @fattr - updated attributes + * @inode: pointer to inode + * @fattr: updated attributes * * After an operation that has changed the inode metadata, mark the * attribute cache as being invalid, then try to update it. @@ -1610,7 +2137,10 @@ int nfs_post_op_update_inode(struct inode *inode, struct nfs_fattr *fattr) spin_lock(&inode->i_lock); nfs_fattr_set_barrier(fattr); - status = nfs_post_op_update_inode_locked(inode, fattr); + status = nfs_post_op_update_inode_locked(inode, fattr, + NFS_INO_INVALID_CHANGE + | NFS_INO_INVALID_CTIME + | NFS_INO_REVAL_FORCED); spin_unlock(&inode->i_lock); return status; @@ -1619,8 +2149,8 @@ EXPORT_SYMBOL_GPL(nfs_post_op_update_inode); /** * nfs_post_op_update_inode_force_wcc_locked - update the inode attribute cache - * @inode - pointer to inode - * @fattr - updated attributes + * @inode: pointer to inode + * @fattr: updated attributes * * After an operation that has changed the inode metadata, mark the * attribute cache as being invalid, then try to update it. Fake up @@ -1630,11 +2160,15 @@ EXPORT_SYMBOL_GPL(nfs_post_op_update_inode); */ int nfs_post_op_update_inode_force_wcc_locked(struct inode *inode, struct nfs_fattr *fattr) { + int attr_cmp = nfs_inode_attrs_cmp(fattr, inode); int status; /* Don't do a WCC update if these attributes are already stale */ - if ((fattr->valid & NFS_ATTR_FATTR) == 0 || - !nfs_inode_attrs_need_update(inode, fattr)) { + if (attr_cmp < 0) + return 0; + if ((fattr->valid & NFS_ATTR_FATTR) == 0 || !attr_cmp) { + /* Record the pre/post change info before clearing PRECHANGE */ + nfs_ooo_record(NFS_I(inode), fattr); fattr->valid &= ~(NFS_ATTR_FATTR_PRECHANGE | NFS_ATTR_FATTR_PRESIZE | NFS_ATTR_FATTR_PREMTIME @@ -1643,17 +2177,17 @@ int nfs_post_op_update_inode_force_wcc_locked(struct inode *inode, struct nfs_fa } if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 && (fattr->valid & NFS_ATTR_FATTR_PRECHANGE) == 0) { - fattr->pre_change_attr = inode->i_version; + fattr->pre_change_attr = inode_peek_iversion_raw(inode); fattr->valid |= NFS_ATTR_FATTR_PRECHANGE; } if ((fattr->valid & NFS_ATTR_FATTR_CTIME) != 0 && (fattr->valid & NFS_ATTR_FATTR_PRECTIME) == 0) { - memcpy(&fattr->pre_ctime, &inode->i_ctime, sizeof(fattr->pre_ctime)); + fattr->pre_ctime = inode_get_ctime(inode); fattr->valid |= NFS_ATTR_FATTR_PRECTIME; } if ((fattr->valid & NFS_ATTR_FATTR_MTIME) != 0 && (fattr->valid & NFS_ATTR_FATTR_PREMTIME) == 0) { - memcpy(&fattr->pre_mtime, &inode->i_mtime, sizeof(fattr->pre_mtime)); + fattr->pre_mtime = inode_get_mtime(inode); fattr->valid |= NFS_ATTR_FATTR_PREMTIME; } if ((fattr->valid & NFS_ATTR_FATTR_SIZE) != 0 && @@ -1662,14 +2196,18 @@ int nfs_post_op_update_inode_force_wcc_locked(struct inode *inode, struct nfs_fa fattr->valid |= NFS_ATTR_FATTR_PRESIZE; } out_noforce: - status = nfs_post_op_update_inode_locked(inode, fattr); + status = nfs_post_op_update_inode_locked(inode, fattr, + NFS_INO_INVALID_CHANGE + | NFS_INO_INVALID_CTIME + | NFS_INO_INVALID_MTIME + | NFS_INO_INVALID_BLOCKS); return status; } /** * nfs_post_op_update_inode_force_wcc - try to update the inode attribute cache - * @inode - pointer to inode - * @fattr - updated attributes + * @inode: pointer to inode + * @fattr: updated attributes * * After an operation that has changed the inode metadata, mark the * attribute cache as being invalid, then try to update it. Fake up @@ -1690,18 +2228,6 @@ int nfs_post_op_update_inode_force_wcc(struct inode *inode, struct nfs_fattr *fa EXPORT_SYMBOL_GPL(nfs_post_op_update_inode_force_wcc); -static inline bool nfs_fileid_valid(struct nfs_inode *nfsi, - struct nfs_fattr *fattr) -{ - bool ret1 = true, ret2 = true; - - if (fattr->valid & NFS_ATTR_FATTR_FILEID) - ret1 = (nfsi->fileid == fattr->fileid); - if (fattr->valid & NFS_ATTR_FATTR_MOUNTED_ON_FILEID) - ret2 = (nfsi->fileid == fattr->mounted_on_fileid); - return ret1 || ret2; -} - /* * Many nfs protocol calls return the new file attributes after * an operation. Here we update the inode to reflect the state @@ -1716,21 +2242,33 @@ static inline bool nfs_fileid_valid(struct nfs_inode *nfsi, */ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) { - struct nfs_server *server; + struct nfs_server *server = NFS_SERVER(inode); struct nfs_inode *nfsi = NFS_I(inode); loff_t cur_isize, new_isize; + u64 fattr_supported = server->fattr_valid; unsigned long invalid = 0; unsigned long now = jiffies; unsigned long save_cache_validity; bool have_writers = nfs_file_has_buffered_writers(nfsi); bool cache_revalidated = true; + bool attr_changed = false; + bool have_delegation; - dfprintk(VFS, "NFS: %s(%s/%lu fh_crc=0x%08x ct=%d info=0x%x)\n", + dfprintk(VFS, "NFS: %s(%s/%lu fh_crc=0x%08x ct=%d info=0x%llx)\n", __func__, inode->i_sb->s_id, inode->i_ino, nfs_display_fhandle_hash(NFS_FH(inode)), - atomic_read(&inode->i_count), fattr->valid); + icount_read(inode), fattr->valid); - if (!nfs_fileid_valid(nfsi, fattr)) { + if (!(fattr->valid & NFS_ATTR_FATTR_FILEID)) { + /* Only a mounted-on-fileid? Just exit */ + if (fattr->valid & NFS_ATTR_FATTR_MOUNTED_ON_FILEID) + return 0; + /* Has the inode gone and changed behind our back? */ + } else if (nfsi->fileid != fattr->fileid) { + /* Is this perhaps the mounted-on fileid? */ + if ((fattr->valid & NFS_ATTR_FATTR_MOUNTED_ON_FILEID) && + nfsi->fileid == fattr->mounted_on_fileid) + return 0; printk(KERN_ERR "NFS: server %s error: fileid changed\n" "fsid %s: expected fileid 0x%Lx, got 0x%Lx\n", NFS_SERVER(inode)->nfs_client->cl_hostname, @@ -1742,7 +2280,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) /* * Make sure the inode's type hasn't changed. */ - if ((fattr->valid & NFS_ATTR_FATTR_TYPE) && (inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT)) { + if ((fattr->valid & NFS_ATTR_FATTR_TYPE) && inode_wrong_type(inode, fattr->mode)) { /* * Big trouble! The inode has become a different object. */ @@ -1751,178 +2289,193 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) goto out_err; } - server = NFS_SERVER(inode); /* Update the fsid? */ if (S_ISDIR(inode->i_mode) && (fattr->valid & NFS_ATTR_FATTR_FSID) && !nfs_fsid_equal(&server->fsid, &fattr->fsid) && !IS_AUTOMOUNT(inode)) server->fsid = fattr->fsid; + /* Save the delegation state before clearing cache_validity */ + have_delegation = nfs_have_delegated_attributes(inode); + /* * Update the read time so we don't revalidate too often. */ nfsi->read_cache_jiffies = fattr->time_start; + /* Fix up any delegated attributes in the struct nfs_fattr */ + nfs_fattr_fixup_delegated(inode, fattr); + save_cache_validity = nfsi->cache_validity; nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ATIME | NFS_INO_REVAL_FORCED - | NFS_INO_REVAL_PAGECACHE); + | NFS_INO_INVALID_BLOCKS); /* Do atomic weak cache consistency updates */ - invalid |= nfs_wcc_update_inode(inode, fattr); + nfs_wcc_update_inode(inode, fattr); if (pnfs_layoutcommit_outstanding(inode)) { - nfsi->cache_validity |= save_cache_validity & NFS_INO_INVALID_ATTR; + nfsi->cache_validity |= + save_cache_validity & + (NFS_INO_INVALID_CHANGE | NFS_INO_INVALID_CTIME | + NFS_INO_INVALID_MTIME | NFS_INO_INVALID_SIZE | + NFS_INO_INVALID_BLOCKS); cache_revalidated = false; } /* More cache consistency checks */ if (fattr->valid & NFS_ATTR_FATTR_CHANGE) { - if (inode->i_version != fattr->change_attr) { - dprintk("NFS: change_attr change on server for file %s/%ld\n", - inode->i_sb->s_id, inode->i_ino); + if (!have_writers && nfsi->ooo && nfsi->ooo->cnt == 1 && + nfsi->ooo->gap[0].end == inode_peek_iversion_raw(inode)) { + /* There is one remaining gap that hasn't been + * merged into iversion - do that now. + */ + inode_set_iversion_raw(inode, nfsi->ooo->gap[0].start); + kfree(nfsi->ooo); + nfsi->ooo = NULL; + } + if (!inode_eq_iversion_raw(inode, fattr->change_attr)) { /* Could it be a race with writeback? */ - if (!have_writers) { - invalid |= NFS_INO_INVALID_ATTR - | NFS_INO_INVALID_DATA + if (!(have_writers || have_delegation)) { + invalid |= NFS_INO_INVALID_DATA | NFS_INO_INVALID_ACCESS - | NFS_INO_INVALID_ACL; + | NFS_INO_INVALID_ACL + | NFS_INO_INVALID_XATTR; + /* Force revalidate of all attributes */ + save_cache_validity |= NFS_INO_INVALID_CTIME + | NFS_INO_INVALID_MTIME + | NFS_INO_INVALID_SIZE + | NFS_INO_INVALID_BLOCKS + | NFS_INO_INVALID_NLINK + | NFS_INO_INVALID_MODE + | NFS_INO_INVALID_OTHER + | NFS_INO_INVALID_BTIME; if (S_ISDIR(inode->i_mode)) nfs_force_lookup_revalidate(inode); + attr_changed = true; + dprintk("NFS: change_attr change on server for file %s/%ld\n", + inode->i_sb->s_id, + inode->i_ino); + } else if (!have_delegation) { + nfs_ooo_record(nfsi, fattr); + nfs_ooo_merge(nfsi, inode_peek_iversion_raw(inode), + fattr->change_attr); } - inode->i_version = fattr->change_attr; + inode_set_iversion_raw(inode, fattr->change_attr); } } else { - nfsi->cache_validity |= save_cache_validity; - cache_revalidated = false; + nfsi->cache_validity |= + save_cache_validity & NFS_INO_INVALID_CHANGE; + if (!have_delegation || + (nfsi->cache_validity & NFS_INO_INVALID_CHANGE) != 0) + cache_revalidated = false; } - if (fattr->valid & NFS_ATTR_FATTR_MTIME) { - memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime)); - } else if (server->caps & NFS_CAP_MTIME) { - nfsi->cache_validity |= save_cache_validity & - (NFS_INO_INVALID_ATTR - | NFS_INO_REVAL_FORCED); - cache_revalidated = false; - } + if (fattr->valid & NFS_ATTR_FATTR_MTIME) + inode_set_mtime_to_ts(inode, fattr->mtime); + else if (fattr_supported & NFS_ATTR_FATTR_MTIME) + nfsi->cache_validity |= + save_cache_validity & NFS_INO_INVALID_MTIME; - if (fattr->valid & NFS_ATTR_FATTR_CTIME) { - memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime)); - } else if (server->caps & NFS_CAP_CTIME) { - nfsi->cache_validity |= save_cache_validity & - (NFS_INO_INVALID_ATTR - | NFS_INO_REVAL_FORCED); - cache_revalidated = false; - } + if (fattr->valid & NFS_ATTR_FATTR_CTIME) + inode_set_ctime_to_ts(inode, fattr->ctime); + else if (fattr_supported & NFS_ATTR_FATTR_CTIME) + nfsi->cache_validity |= + save_cache_validity & NFS_INO_INVALID_CTIME; + + if (fattr->valid & NFS_ATTR_FATTR_BTIME) + nfsi->btime = fattr->btime; + else if (fattr_supported & NFS_ATTR_FATTR_BTIME) + nfsi->cache_validity |= + save_cache_validity & NFS_INO_INVALID_BTIME; /* Check if our cached file size is stale */ if (fattr->valid & NFS_ATTR_FATTR_SIZE) { new_isize = nfs_size_to_loff_t(fattr->size); cur_isize = i_size_read(inode); - if (new_isize != cur_isize) { + if (new_isize != cur_isize && !have_delegation) { /* Do we perhaps have any outstanding writes, or has * the file grown beyond our last write? */ - if (nfsi->nrequests == 0 || new_isize > cur_isize) { + if (!nfs_have_writebacks(inode) || new_isize > cur_isize) { + trace_nfs_size_update(inode, new_isize); i_size_write(inode, new_isize); if (!have_writers) - invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA; + invalid |= NFS_INO_INVALID_DATA; } - dprintk("NFS: isize change on server for file %s/%ld " - "(%Ld to %Ld)\n", - inode->i_sb->s_id, - inode->i_ino, - (long long)cur_isize, - (long long)new_isize); } - } else { - nfsi->cache_validity |= save_cache_validity & - (NFS_INO_INVALID_ATTR - | NFS_INO_REVAL_PAGECACHE - | NFS_INO_REVAL_FORCED); - cache_revalidated = false; - } - + if (new_isize == 0 && + !(fattr->valid & (NFS_ATTR_FATTR_SPACE_USED | + NFS_ATTR_FATTR_BLOCKS_USED))) { + fattr->du.nfs3.used = 0; + fattr->valid |= NFS_ATTR_FATTR_SPACE_USED; + } + } else + nfsi->cache_validity |= + save_cache_validity & NFS_INO_INVALID_SIZE; if (fattr->valid & NFS_ATTR_FATTR_ATIME) - memcpy(&inode->i_atime, &fattr->atime, sizeof(inode->i_atime)); - else if (server->caps & NFS_CAP_ATIME) { - nfsi->cache_validity |= save_cache_validity & - (NFS_INO_INVALID_ATIME - | NFS_INO_REVAL_FORCED); - cache_revalidated = false; - } + inode_set_atime_to_ts(inode, fattr->atime); + else if (fattr_supported & NFS_ATTR_FATTR_ATIME) + nfsi->cache_validity |= + save_cache_validity & NFS_INO_INVALID_ATIME; if (fattr->valid & NFS_ATTR_FATTR_MODE) { if ((inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO)) { umode_t newmode = inode->i_mode & S_IFMT; newmode |= fattr->mode & S_IALLUGO; inode->i_mode = newmode; - invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; + invalid |= NFS_INO_INVALID_ACCESS + | NFS_INO_INVALID_ACL; } - } else if (server->caps & NFS_CAP_MODE) { - nfsi->cache_validity |= save_cache_validity & - (NFS_INO_INVALID_ATTR - | NFS_INO_INVALID_ACCESS - | NFS_INO_INVALID_ACL - | NFS_INO_REVAL_FORCED); - cache_revalidated = false; - } + } else if (fattr_supported & NFS_ATTR_FATTR_MODE) + nfsi->cache_validity |= + save_cache_validity & NFS_INO_INVALID_MODE; if (fattr->valid & NFS_ATTR_FATTR_OWNER) { if (!uid_eq(inode->i_uid, fattr->uid)) { - invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; + invalid |= NFS_INO_INVALID_ACCESS + | NFS_INO_INVALID_ACL; inode->i_uid = fattr->uid; } - } else if (server->caps & NFS_CAP_OWNER) { - nfsi->cache_validity |= save_cache_validity & - (NFS_INO_INVALID_ATTR - | NFS_INO_INVALID_ACCESS - | NFS_INO_INVALID_ACL - | NFS_INO_REVAL_FORCED); - cache_revalidated = false; - } + } else if (fattr_supported & NFS_ATTR_FATTR_OWNER) + nfsi->cache_validity |= + save_cache_validity & NFS_INO_INVALID_OTHER; if (fattr->valid & NFS_ATTR_FATTR_GROUP) { if (!gid_eq(inode->i_gid, fattr->gid)) { - invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; + invalid |= NFS_INO_INVALID_ACCESS + | NFS_INO_INVALID_ACL; inode->i_gid = fattr->gid; } - } else if (server->caps & NFS_CAP_OWNER_GROUP) { - nfsi->cache_validity |= save_cache_validity & - (NFS_INO_INVALID_ATTR - | NFS_INO_INVALID_ACCESS - | NFS_INO_INVALID_ACL - | NFS_INO_REVAL_FORCED); - cache_revalidated = false; - } + } else if (fattr_supported & NFS_ATTR_FATTR_GROUP) + nfsi->cache_validity |= + save_cache_validity & NFS_INO_INVALID_OTHER; if (fattr->valid & NFS_ATTR_FATTR_NLINK) { - if (inode->i_nlink != fattr->nlink) { - invalid |= NFS_INO_INVALID_ATTR; - if (S_ISDIR(inode->i_mode)) - invalid |= NFS_INO_INVALID_DATA; + if (inode->i_nlink != fattr->nlink) set_nlink(inode, fattr->nlink); - } - } else if (server->caps & NFS_CAP_NLINK) { - nfsi->cache_validity |= save_cache_validity & - (NFS_INO_INVALID_ATTR - | NFS_INO_REVAL_FORCED); - cache_revalidated = false; - } + } else if (fattr_supported & NFS_ATTR_FATTR_NLINK) + nfsi->cache_validity |= + save_cache_validity & NFS_INO_INVALID_NLINK; if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED) { /* * report the blocks in 512byte units */ inode->i_blocks = nfs_calc_block_size(fattr->du.nfs3.used); - } else if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED) + } else if (fattr_supported & NFS_ATTR_FATTR_SPACE_USED) + nfsi->cache_validity |= + save_cache_validity & NFS_INO_INVALID_BLOCKS; + + if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED) inode->i_blocks = fattr->du.nfs2.blocks; - else - cache_revalidated = false; + else if (fattr_supported & NFS_ATTR_FATTR_BLOCKS_USED) + nfsi->cache_validity |= + save_cache_validity & NFS_INO_INVALID_BLOCKS; /* Update attrtimeo value if we're out of the unstable period */ - if (invalid & NFS_INO_INVALID_ATTR) { + if (attr_changed) { nfs_inc_stats(inode, NFSIOS_ATTRINVALIDATE); nfsi->attrtimeo = NFS_MINATTRTIMEO(inode); nfsi->attrtimeo_timestamp = now; @@ -1939,21 +2492,15 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) nfsi->attrtimeo_timestamp = now; } /* Set the barrier to be more recent than this fattr */ - if ((long)fattr->gencount - (long)nfsi->attr_gencount > 0) + if ((long)(fattr->gencount - nfsi->attr_gencount) > 0) nfsi->attr_gencount = fattr->gencount; } - /* Don't declare attrcache up to date if there were no attrs! */ - if (cache_revalidated) - invalid &= ~NFS_INO_INVALID_ATTR; - /* Don't invalidate the data if we were to blame */ if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))) invalid &= ~NFS_INO_INVALID_DATA; - if (!NFS_PROTO(inode)->have_delegation(inode, FMODE_READ) || - (save_cache_validity & NFS_INO_REVAL_FORCED)) - nfs_set_cache_invalid(inode, invalid); + nfs_set_cache_invalid(inode, invalid); return 0; out_err: @@ -1962,36 +2509,37 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) * lookup validation will know that the inode is bad. * (But we fall through to invalidate the caches.) */ - nfs_invalidate_inode(inode); + nfs_set_inode_stale_locked(inode); return -ESTALE; } struct inode *nfs_alloc_inode(struct super_block *sb) { struct nfs_inode *nfsi; - nfsi = kmem_cache_alloc(nfs_inode_cachep, GFP_KERNEL); + nfsi = alloc_inode_sb(sb, nfs_inode_cachep, GFP_KERNEL); if (!nfsi) return NULL; nfsi->flags = 0UL; nfsi->cache_validity = 0UL; + nfsi->ooo = NULL; #if IS_ENABLED(CONFIG_NFS_V4) nfsi->nfs4_acl = NULL; #endif /* CONFIG_NFS_V4 */ +#ifdef CONFIG_NFS_V4_2 + nfsi->xattr_cache = NULL; +#endif + nfs_netfs_inode_init(nfsi); + return &nfsi->vfs_inode; } EXPORT_SYMBOL_GPL(nfs_alloc_inode); -static void nfs_i_callback(struct rcu_head *head) +void nfs_free_inode(struct inode *inode) { - struct inode *inode = container_of(head, struct inode, i_rcu); + kfree(NFS_I(inode)->ooo); kmem_cache_free(nfs_inode_cachep, NFS_I(inode)); } - -void nfs_destroy_inode(struct inode *inode) -{ - call_rcu(&inode->i_rcu, nfs_i_callback); -} -EXPORT_SYMBOL_GPL(nfs_destroy_inode); +EXPORT_SYMBOL_GPL(nfs_free_inode); static inline void nfs4_init_once(struct nfs_inode *nfsi) { @@ -2005,17 +2553,12 @@ static inline void nfs4_init_once(struct nfs_inode *nfsi) static void init_once(void *foo) { - struct nfs_inode *nfsi = (struct nfs_inode *) foo; + struct nfs_inode *nfsi = foo; inode_init_once(&nfsi->vfs_inode); INIT_LIST_HEAD(&nfsi->open_files); INIT_LIST_HEAD(&nfsi->access_cache_entry_lru); INIT_LIST_HEAD(&nfsi->access_cache_inode_lru); - INIT_LIST_HEAD(&nfsi->commit_info.list); - nfsi->nrequests = 0; - nfsi->commit_info.ncommit = 0; - atomic_set(&nfsi->commit_info.rpcs_out, 0); - init_rwsem(&nfsi->rmdir_sem); nfs4_init_once(nfsi); } @@ -2024,7 +2567,7 @@ static int __init nfs_init_inodecache(void) nfs_inode_cachep = kmem_cache_create("nfs_inode_cache", sizeof(struct nfs_inode), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD|SLAB_ACCOUNT), + SLAB_ACCOUNT), init_once); if (nfs_inode_cachep == NULL) return -ENOMEM; @@ -2042,35 +2585,54 @@ static void nfs_destroy_inodecache(void) kmem_cache_destroy(nfs_inode_cachep); } +struct workqueue_struct *nfslocaliod_workqueue; struct workqueue_struct *nfsiod_workqueue; EXPORT_SYMBOL_GPL(nfsiod_workqueue); /* - * start up the nfsiod workqueue + * Destroy the nfsiod workqueues */ -static int nfsiod_start(void) +static void nfsiod_stop(void) { struct workqueue_struct *wq; - dprintk("RPC: creating workqueue nfsiod\n"); - wq = alloc_workqueue("nfsiod", WQ_MEM_RECLAIM, 0); - if (wq == NULL) - return -ENOMEM; - nfsiod_workqueue = wq; - return 0; + + wq = nfsiod_workqueue; + if (wq != NULL) { + nfsiod_workqueue = NULL; + destroy_workqueue(wq); + } +#if IS_ENABLED(CONFIG_NFS_LOCALIO) + wq = nfslocaliod_workqueue; + if (wq != NULL) { + nfslocaliod_workqueue = NULL; + destroy_workqueue(wq); + } +#endif /* CONFIG_NFS_LOCALIO */ } /* - * Destroy the nfsiod workqueue + * Start the nfsiod workqueues */ -static void nfsiod_stop(void) +static int nfsiod_start(void) { - struct workqueue_struct *wq; - - wq = nfsiod_workqueue; - if (wq == NULL) - return; - nfsiod_workqueue = NULL; - destroy_workqueue(wq); + dprintk("RPC: creating workqueue nfsiod\n"); + nfsiod_workqueue = alloc_workqueue("nfsiod", WQ_MEM_RECLAIM | WQ_UNBOUND, 0); + if (nfsiod_workqueue == NULL) + return -ENOMEM; +#if IS_ENABLED(CONFIG_NFS_LOCALIO) + /* + * localio writes need to use a normal (non-memreclaim) workqueue. + * When we start getting low on space, XFS goes and calls flush_work() on + * a non-memreclaim work queue, which causes a priority inversion problem. + */ + dprintk("RPC: creating workqueue nfslocaliod\n"); + nfslocaliod_workqueue = alloc_workqueue("nfslocaliod", WQ_UNBOUND, 0); + if (unlikely(nfslocaliod_workqueue == NULL)) { + nfsiod_stop(); + return -ENOMEM; + } +#endif /* CONFIG_NFS_LOCALIO */ + return 0; } unsigned int nfs_net_id; @@ -2078,14 +2640,34 @@ EXPORT_SYMBOL_GPL(nfs_net_id); static int nfs_net_init(struct net *net) { + struct nfs_net *nn = net_generic(net, nfs_net_id); + int err; + nfs_clients_init(net); - return nfs_fs_proc_net_init(net); + + if (!rpc_proc_register(net, &nn->rpcstats)) { + err = -ENOMEM; + goto err_proc_rpc; + } + + err = nfs_fs_proc_net_init(net); + if (err) + goto err_proc_nfs; + + return 0; + +err_proc_nfs: + rpc_proc_unregister(net, "nfs"); +err_proc_rpc: + nfs_clients_exit(net); + return err; } static void nfs_net_exit(struct net *net) { + rpc_proc_unregister(net, "nfs"); nfs_fs_proc_net_exit(net); - nfs_cleanup_cb_ident_idr(net); + nfs_clients_exit(net); } static struct pernet_operations nfs_net_ops = { @@ -2095,6 +2677,35 @@ static struct pernet_operations nfs_net_ops = { .size = sizeof(struct nfs_net), }; +#ifdef CONFIG_KEYS +static struct key *nfs_keyring; + +static int __init nfs_init_keyring(void) +{ + nfs_keyring = keyring_alloc(".nfs", + GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, + current_cred(), + (KEY_POS_ALL & ~KEY_POS_SETATTR) | + (KEY_USR_ALL & ~KEY_USR_SETATTR), + KEY_ALLOC_NOT_IN_QUOTA, NULL, NULL); + return PTR_ERR_OR_ZERO(nfs_keyring); +} + +static void nfs_exit_keyring(void) +{ + key_put(nfs_keyring); +} +#else +static inline int nfs_init_keyring(void) +{ + return 0; +} + +static inline void nfs_exit_keyring(void) +{ +} +#endif /* CONFIG_KEYS */ + /* * Initialize NFS */ @@ -2102,13 +2713,17 @@ static int __init init_nfs_fs(void) { int err; - err = register_pernet_subsys(&nfs_net_ops); + err = nfs_init_keyring(); + if (err) + return err; + + err = nfs_sysfs_init(); if (err < 0) - goto out9; + goto out10; - err = nfs_fscache_register(); + err = register_pernet_subsys(&nfs_net_ops); if (err < 0) - goto out8; + goto out9; err = nfsiod_start(); if (err) @@ -2138,15 +2753,12 @@ static int __init init_nfs_fs(void) if (err) goto out1; - rpc_proc_register(&init_net, &nfs_rpcstat); - err = register_nfs_fs(); if (err) goto out0; return 0; out0: - rpc_proc_unregister(&init_net, "nfs"); nfs_destroy_directcache(); out1: nfs_destroy_writepagecache(); @@ -2161,10 +2773,11 @@ out5: out6: nfsiod_stop(); out7: - nfs_fscache_unregister(); -out8: unregister_pernet_subsys(&nfs_net_ops); out9: + nfs_sysfs_exit(); +out10: + nfs_exit_keyring(); return err; } @@ -2175,16 +2788,17 @@ static void __exit exit_nfs_fs(void) nfs_destroy_readpagecache(); nfs_destroy_inodecache(); nfs_destroy_nfspagecache(); - nfs_fscache_unregister(); unregister_pernet_subsys(&nfs_net_ops); - rpc_proc_unregister(&init_net, "nfs"); unregister_nfs_fs(); nfs_fs_proc_exit(); nfsiod_stop(); + nfs_sysfs_exit(); + nfs_exit_keyring(); } /* Not quite true; I just maintain it */ MODULE_AUTHOR("Olaf Kirch <okir@monad.swb.de>"); +MODULE_DESCRIPTION("NFS client support"); MODULE_LICENSE("GPL"); module_param(enable_ino64, bool, 0644); diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index dc456416d2be..2ecd38e1d17a 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -1,27 +1,24 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * NFS internal definitions */ #include "nfs4_fs.h" -#include <linux/mount.h> +#include <linux/fs_context.h> #include <linux/security.h> +#include <linux/compiler_attributes.h> #include <linux/crc32.h> +#include <linux/sunrpc/addr.h> #include <linux/nfs_page.h> +#include <linux/nfslocalio.h> #include <linux/wait_bit.h> -#define NFS_MS_MASK (MS_RDONLY|MS_NOSUID|MS_NODEV|MS_NOEXEC|MS_SYNCHRONOUS) +#define NFS_SB_MASK (SB_NOSUID|SB_NODEV|SB_NOEXEC|SB_SYNCHRONOUS) extern const struct export_operations nfs_export_ops; struct nfs_string; - -/* Maximum number of readahead requests - * FIXME: this should really be a sysctl so that users may tune it to suit - * their needs. People that do NFS over a slow network, might for - * instance want to reduce it to something closer to 1 for improved - * interactive response. - */ -#define NFS_MAX_READAHEAD (RPC_DEF_SLOT_TABLE - 1) +struct nfs_pageio_descriptor; static inline void nfs_attr_check_mountpoint(struct super_block *parent, struct nfs_fattr *fattr) { @@ -38,17 +35,24 @@ static inline int nfs_attr_use_mounted_on_fileid(struct nfs_fattr *fattr) return 1; } -struct nfs_clone_mount { - const struct super_block *sb; - const struct dentry *dentry; - struct nfs_fh *fh; - struct nfs_fattr *fattr; - char *hostname; - char *mnt_path; - struct sockaddr *addr; - size_t addrlen; - rpc_authflavor_t authflavor; -}; +static inline bool nfs_lookup_is_soft_revalidate(const struct dentry *dentry) +{ + if (!(NFS_SB(dentry->d_sb)->flags & NFS_MOUNT_SOFTREVAL)) + return false; + if (!d_is_positive(dentry) || !NFS_FH(d_inode(dentry))->size) + return false; + return true; +} + +static inline fmode_t flags_to_mode(int flags) +{ + fmode_t res = (__force fmode_t)flags & FMODE_EXEC; + if ((flags & O_ACCMODE) != O_WRONLY) + res |= FMODE_READ; + if ((flags & O_ACCMODE) != O_RDONLY) + res |= FMODE_WRITE; + return res; +} /* * Note: RFC 1813 doesn't limit the number of auth flavors that @@ -64,48 +68,59 @@ struct nfs_clone_mount { #define NFS_UNSPEC_RETRANS (UINT_MAX) #define NFS_UNSPEC_TIMEO (UINT_MAX) -/* - * Maximum number of pages that readdir can use for creating - * a vmapped array of pages. - */ -#define NFS_MAX_READDIR_PAGES 8 - struct nfs_client_initdata { unsigned long init_flags; const char *hostname; /* Hostname of the server */ - const struct sockaddr *addr; /* Address of the server */ + const struct sockaddr_storage *addr; /* Address of the server */ const char *nodename; /* Hostname of the client */ const char *ip_addr; /* IP address of the client */ size_t addrlen; struct nfs_subversion *nfs_mod; int proto; u32 minorversion; + unsigned int nconnect; + unsigned int max_connect; struct net *net; const struct rpc_timeout *timeparms; + const struct cred *cred; + struct xprtsec_parms xprtsec; + unsigned long connect_timeout; + unsigned long reconnect_timeout; }; /* * In-kernel mount arguments */ -struct nfs_parsed_mount_data { - int flags; +struct nfs_fs_context { + bool internal; + bool skip_reconfig_option_check; + bool need_mount; + bool sloppy; + unsigned int flags; /* NFS{,4}_MOUNT_* flags */ unsigned int rsize, wsize; unsigned int timeo, retrans; - unsigned int acregmin, acregmax, - acdirmin, acdirmax; + unsigned int acregmin, acregmax; + unsigned int acdirmin, acdirmax; unsigned int namlen; unsigned int options; unsigned int bsize; struct nfs_auth_info auth_info; rpc_authflavor_t selected_flavor; + struct xprtsec_parms xprtsec; char *client_address; unsigned int version; unsigned int minorversion; char *fscache_uniq; - bool need_mount; + unsigned short protofamily; + unsigned short mountfamily; + bool has_sec_mnt_opts; + int lock_status; struct { - struct sockaddr_storage address; + union { + struct sockaddr address; + struct sockaddr_storage _address; + }; size_t addrlen; char *hostname; u32 version; @@ -114,21 +129,71 @@ struct nfs_parsed_mount_data { } mount_server; struct { - struct sockaddr_storage address; + union { + struct sockaddr address; + struct sockaddr_storage _address; + }; size_t addrlen; char *hostname; char *export_path; int port; unsigned short protocol; + unsigned short nconnect; + unsigned short max_connect; + unsigned short export_path_len; } nfs_server; - struct security_mnt_opts lsm_opts; - struct net *net; + struct nfs_fh *mntfh; + struct nfs_server *server; + struct nfs_subversion *nfs_mod; + + /* Information for a cloned mount. */ + struct nfs_clone_mount { + struct super_block *sb; + struct dentry *dentry; + struct nfs_fattr *fattr; + unsigned int inherited_bsize; + } clone_data; +}; + +enum nfs_lock_status { + NFS_LOCK_NOT_SET = 0, + NFS_LOCK_LOCK = 1, + NFS_LOCK_NOLOCK = 2, }; +#define nfs_errorf(fc, fmt, ...) ((fc)->log.log ? \ + errorf(fc, fmt, ## __VA_ARGS__) : \ + ({ dprintk(fmt "\n", ## __VA_ARGS__); })) + +#define nfs_ferrorf(fc, fac, fmt, ...) ((fc)->log.log ? \ + errorf(fc, fmt, ## __VA_ARGS__) : \ + ({ dfprintk(fac, fmt "\n", ## __VA_ARGS__); })) + +#define nfs_invalf(fc, fmt, ...) ((fc)->log.log ? \ + invalf(fc, fmt, ## __VA_ARGS__) : \ + ({ dprintk(fmt "\n", ## __VA_ARGS__); -EINVAL; })) + +#define nfs_finvalf(fc, fac, fmt, ...) ((fc)->log.log ? \ + invalf(fc, fmt, ## __VA_ARGS__) : \ + ({ dfprintk(fac, fmt "\n", ## __VA_ARGS__); -EINVAL; })) + +#define nfs_warnf(fc, fmt, ...) ((fc)->log.log ? \ + warnf(fc, fmt, ## __VA_ARGS__) : \ + ({ dprintk(fmt "\n", ## __VA_ARGS__); })) + +#define nfs_fwarnf(fc, fac, fmt, ...) ((fc)->log.log ? \ + warnf(fc, fmt, ## __VA_ARGS__) : \ + ({ dfprintk(fac, fmt "\n", ## __VA_ARGS__); })) + +static inline struct nfs_fs_context *nfs_fc2context(const struct fs_context *fc) +{ + return fc->fs_private; +} + /* mount_clnt.c */ struct nfs_mount_request { - struct sockaddr *sap; + struct sockaddr_storage *sap; size_t salen; char *hostname; char *dirpath; @@ -141,24 +206,16 @@ struct nfs_mount_request { struct net *net; }; -struct nfs_mount_info { - void (*fill_super)(struct super_block *, struct nfs_mount_info *); - int (*set_security)(struct super_block *, struct dentry *, struct nfs_mount_info *); - struct nfs_parsed_mount_data *parsed; - struct nfs_clone_mount *cloned; - struct nfs_fh *mntfh; -}; - -extern int nfs_mount(struct nfs_mount_request *info); -extern void nfs_umount(const struct nfs_mount_request *info); +extern int nfs_mount(struct nfs_mount_request *info, int timeo, int retrans); /* client.c */ extern const struct rpc_program nfs_program; extern void nfs_clients_init(struct net *net); +extern void nfs_clients_exit(struct net *net); extern struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *); int nfs_create_rpc_client(struct nfs_client *, const struct nfs_client_initdata *, rpc_authflavor_t); struct nfs_client *nfs_get_client(const struct nfs_client_initdata *); -int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *, struct nfs_fattr *); +int nfs_probe_server(struct nfs_server *, struct nfs_fh *); void nfs_server_insert_lists(struct nfs_server *); void nfs_server_remove_lists(struct nfs_server *); void nfs_init_timeout_values(struct rpc_timeout *to, int proto, int timeo, int retrans); @@ -167,22 +224,18 @@ int nfs_init_server_rpcclient(struct nfs_server *, const struct rpc_timeout *t, struct nfs_server *nfs_alloc_server(void); void nfs_server_copy_userdata(struct nfs_server *, struct nfs_server *); -extern void nfs_cleanup_cb_ident_idr(struct net *); extern void nfs_put_client(struct nfs_client *); extern void nfs_free_client(struct nfs_client *); extern struct nfs_client *nfs4_find_client_ident(struct net *, int); extern struct nfs_client * nfs4_find_client_sessionid(struct net *, const struct sockaddr *, struct nfs4_sessionid *, u32); -extern struct nfs_server *nfs_create_server(struct nfs_mount_info *, - struct nfs_subversion *); -extern struct nfs_server *nfs4_create_server( - struct nfs_mount_info *, - struct nfs_subversion *); -extern struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *, - struct nfs_fh *); +extern struct nfs_server *nfs_create_server(struct fs_context *); +extern void nfs_server_set_init_caps(struct nfs_server *); +extern struct nfs_server *nfs4_create_server(struct fs_context *); +extern struct nfs_server *nfs4_create_referral_server(struct fs_context *); extern int nfs4_update_server(struct nfs_server *server, const char *hostname, - struct sockaddr *sap, size_t salen, + struct sockaddr_storage *sap, size_t salen, struct net *net); extern void nfs_free_server(struct nfs_server *server); extern struct nfs_server *nfs_clone_server(struct nfs_server *, @@ -194,7 +247,7 @@ extern int nfs_client_init_status(const struct nfs_client *clp); extern int nfs_wait_client_init_complete(const struct nfs_client *clp); extern void nfs_mark_client_ready(struct nfs_client *clp, int state); extern struct nfs_client *nfs4_set_ds_client(struct nfs_server *mds_srv, - const struct sockaddr *ds_addr, + const struct sockaddr_storage *ds_addr, int ds_addrlen, int ds_proto, unsigned int ds_timeo, unsigned int ds_retrans, @@ -202,7 +255,7 @@ extern struct nfs_client *nfs4_set_ds_client(struct nfs_server *mds_srv, extern struct rpc_clnt *nfs4_find_or_create_ds_client(struct nfs_client *, struct inode *); extern struct nfs_client *nfs3_set_ds_client(struct nfs_server *mds_srv, - const struct sockaddr *ds_addr, int ds_addrlen, + const struct sockaddr_storage *ds_addr, int ds_addrlen, int ds_proto, unsigned int ds_timeo, unsigned int ds_retrans); #ifdef CONFIG_PROC_FS @@ -231,7 +284,9 @@ static inline void nfs_fs_proc_exit(void) extern const struct svc_version nfs4_callback_version1; extern const struct svc_version nfs4_callback_version4; -struct nfs_pageio_descriptor; +/* fs_context.c */ +extern struct file_system_type nfs_fs_type; + /* pagelist.c */ extern int __init nfs_init_nfspagecache(void); extern void nfs_destroy_nfspagecache(void); @@ -251,25 +306,19 @@ int nfs_iocounter_wait(struct nfs_lock_context *l_ctx); extern const struct nfs_pageio_ops nfs_pgio_rw_ops; struct nfs_pgio_header *nfs_pgio_header_alloc(const struct nfs_rw_ops *); void nfs_pgio_header_free(struct nfs_pgio_header *); -void nfs_pgio_data_destroy(struct nfs_pgio_header *); int nfs_generic_pgio(struct nfs_pageio_descriptor *, struct nfs_pgio_header *); int nfs_initiate_pgio(struct rpc_clnt *clnt, struct nfs_pgio_header *hdr, - struct rpc_cred *cred, const struct nfs_rpc_ops *rpc_ops, - const struct rpc_call_ops *call_ops, int how, int flags); + const struct cred *cred, const struct nfs_rpc_ops *rpc_ops, + const struct rpc_call_ops *call_ops, int how, int flags, + struct nfsd_file *localio); void nfs_free_request(struct nfs_page *req); struct nfs_pgio_mirror * nfs_pgio_current_mirror(struct nfs_pageio_descriptor *desc); -static inline bool nfs_pgio_has_mirroring(struct nfs_pageio_descriptor *desc) -{ - WARN_ON_ONCE(desc->pg_mirror_count < 1); - return desc->pg_mirror_count > 1; -} - static inline bool nfs_match_open_context(const struct nfs_open_context *ctx1, const struct nfs_open_context *ctx2) { - return ctx1->cred == ctx2->cred && ctx1->state == ctx2->state; + return cred_fscmp(ctx1->cred, ctx2->cred) == 0 && ctx1->state == ctx2->state; } /* nfs2xdr.c */ @@ -316,14 +365,6 @@ nfs4_label_copy(struct nfs4_label *dst, struct nfs4_label *src) return dst; } -static inline void nfs4_label_free(struct nfs4_label *label) -{ - if (label) { - kfree(label->label); - kfree(label); - } - return; -} static inline void nfs_zap_label_cache_locked(struct nfs_inode *nfsi) { @@ -332,7 +373,6 @@ static inline void nfs_zap_label_cache_locked(struct nfs_inode *nfsi) } #else static inline struct nfs4_label *nfs4_label_alloc(struct nfs_server *server, gfp_t flags) { return NULL; } -static inline void nfs4_label_free(void *label) {} static inline void nfs_zap_label_cache_locked(struct nfs_inode *nfsi) { } @@ -349,81 +389,152 @@ extern struct nfs_client *nfs_init_client(struct nfs_client *clp, const struct nfs_client_initdata *); /* dir.c */ -extern void nfs_advise_use_readdirplus(struct inode *dir); -extern void nfs_force_use_readdirplus(struct inode *dir); +extern void nfs_readdir_record_entry_cache_hit(struct inode *dir); +extern void nfs_readdir_record_entry_cache_miss(struct inode *dir); extern unsigned long nfs_access_cache_count(struct shrinker *shrink, struct shrink_control *sc); extern unsigned long nfs_access_cache_scan(struct shrinker *shrink, struct shrink_control *sc); struct dentry *nfs_lookup(struct inode *, struct dentry *, unsigned int); -int nfs_create(struct inode *, struct dentry *, umode_t, bool); -int nfs_mkdir(struct inode *, struct dentry *, umode_t); +void nfs_d_prune_case_insensitive_aliases(struct inode *inode); +int nfs_create(struct mnt_idmap *, struct inode *, struct dentry *, + umode_t, bool); +struct dentry *nfs_mkdir(struct mnt_idmap *, struct inode *, struct dentry *, + umode_t); int nfs_rmdir(struct inode *, struct dentry *); int nfs_unlink(struct inode *, struct dentry *); -int nfs_symlink(struct inode *, struct dentry *, const char *); +int nfs_symlink(struct mnt_idmap *, struct inode *, struct dentry *, + const char *); int nfs_link(struct dentry *, struct inode *, struct dentry *); -int nfs_mknod(struct inode *, struct dentry *, umode_t, dev_t); -int nfs_rename(struct inode *, struct dentry *, +int nfs_mknod(struct mnt_idmap *, struct inode *, struct dentry *, umode_t, + dev_t); +int nfs_rename(struct mnt_idmap *, struct inode *, struct dentry *, struct inode *, struct dentry *, unsigned int); +#ifdef CONFIG_NFS_V4_2 +static inline __u32 nfs_access_xattr_mask(const struct nfs_server *server) +{ + if (!(server->caps & NFS_CAP_XATTR)) + return 0; + return NFS4_ACCESS_XAREAD | NFS4_ACCESS_XAWRITE | NFS4_ACCESS_XALIST; +} +#else +static inline __u32 nfs_access_xattr_mask(const struct nfs_server *server) +{ + return 0; +} +#endif + /* file.c */ int nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync); loff_t nfs_file_llseek(struct file *, loff_t, int); ssize_t nfs_file_read(struct kiocb *, struct iov_iter *); -int nfs_file_mmap(struct file *, struct vm_area_struct *); +ssize_t nfs_file_splice_read(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe, + size_t len, unsigned int flags); +int nfs_file_mmap_prepare(struct vm_area_desc *); ssize_t nfs_file_write(struct kiocb *, struct iov_iter *); int nfs_file_release(struct inode *, struct file *); int nfs_lock(struct file *, int, struct file_lock *); int nfs_flock(struct file *, int, struct file_lock *); int nfs_check_flags(int); +void nfs_truncate_last_folio(struct address_space *mapping, loff_t from, + loff_t to); /* inode.c */ extern struct workqueue_struct *nfsiod_workqueue; +extern struct workqueue_struct *nfslocaliod_workqueue; extern struct inode *nfs_alloc_inode(struct super_block *sb); -extern void nfs_destroy_inode(struct inode *); +extern void nfs_free_inode(struct inode *); extern int nfs_write_inode(struct inode *, struct writeback_control *); extern int nfs_drop_inode(struct inode *); extern void nfs_clear_inode(struct inode *); extern void nfs_evict_inode(struct inode *); -void nfs_zap_acl_cache(struct inode *inode); +extern void nfs_zap_acl_cache(struct inode *inode); +extern void nfs_set_cache_invalid(struct inode *inode, unsigned long flags); extern bool nfs_check_cache_invalid(struct inode *, unsigned long); extern int nfs_wait_bit_killable(struct wait_bit_key *key, int mode); -extern int nfs_wait_atomic_killable(atomic_t *p); + +#if IS_ENABLED(CONFIG_NFS_LOCALIO) +/* localio.c */ +struct nfs_local_dio { + u32 mem_align; + u32 offset_align; + loff_t middle_offset; + loff_t end_offset; + ssize_t start_len; /* Length for misaligned first extent */ + ssize_t middle_len; /* Length for DIO-aligned middle extent */ + ssize_t end_len; /* Length for misaligned last extent */ +}; + +extern void nfs_local_probe_async(struct nfs_client *); +extern void nfs_local_probe_async_work(struct work_struct *); +extern struct nfsd_file *nfs_local_open_fh(struct nfs_client *, + const struct cred *, + struct nfs_fh *, + struct nfs_file_localio *, + const fmode_t); +extern int nfs_local_doio(struct nfs_client *, + struct nfsd_file *, + struct nfs_pgio_header *, + const struct rpc_call_ops *); +extern int nfs_local_commit(struct nfsd_file *, + struct nfs_commit_data *, + const struct rpc_call_ops *, int); +extern bool nfs_server_is_local(const struct nfs_client *clp); + +#else /* CONFIG_NFS_LOCALIO */ +static inline void nfs_local_probe(struct nfs_client *clp) {} +static inline void nfs_local_probe_async(struct nfs_client *clp) {} +static inline struct nfsd_file * +nfs_local_open_fh(struct nfs_client *clp, const struct cred *cred, + struct nfs_fh *fh, struct nfs_file_localio *nfl, + const fmode_t mode) +{ + return NULL; +} +static inline int nfs_local_doio(struct nfs_client *clp, + struct nfsd_file *localio, + struct nfs_pgio_header *hdr, + const struct rpc_call_ops *call_ops) +{ + return -EINVAL; +} +static inline int nfs_local_commit(struct nfsd_file *localio, + struct nfs_commit_data *data, + const struct rpc_call_ops *call_ops, int how) +{ + return -EINVAL; +} +static inline bool nfs_server_is_local(const struct nfs_client *clp) +{ + return false; +} +#endif /* CONFIG_NFS_LOCALIO */ /* super.c */ extern const struct super_operations nfs_sops; -extern struct file_system_type nfs_fs_type; -extern struct file_system_type nfs_xdev_fs_type; -#if IS_ENABLED(CONFIG_NFS_V4) -extern struct file_system_type nfs4_xdev_fs_type; -extern struct file_system_type nfs4_referral_fs_type; -#endif bool nfs_auth_info_match(const struct nfs_auth_info *, rpc_authflavor_t); -struct dentry *nfs_try_mount(int, const char *, struct nfs_mount_info *, - struct nfs_subversion *); -int nfs_set_sb_security(struct super_block *, struct dentry *, struct nfs_mount_info *); -int nfs_clone_sb_security(struct super_block *, struct dentry *, struct nfs_mount_info *); -struct dentry *nfs_fs_mount_common(struct nfs_server *, int, const char *, - struct nfs_mount_info *, struct nfs_subversion *); -struct dentry *nfs_fs_mount(struct file_system_type *, int, const char *, void *); -struct dentry * nfs_xdev_mount_common(struct file_system_type *, int, - const char *, struct nfs_mount_info *); +int nfs_try_get_tree(struct fs_context *); +int nfs_get_tree_common(struct fs_context *); void nfs_kill_super(struct super_block *); -void nfs_fill_super(struct super_block *, struct nfs_mount_info *); - -extern struct rpc_stat nfs_rpcstat; extern int __init register_nfs_fs(void); extern void __exit unregister_nfs_fs(void); extern bool nfs_sb_active(struct super_block *sb); extern void nfs_sb_deactive(struct super_block *sb); +extern int nfs_client_for_each_server(struct nfs_client *clp, + int (*fn)(struct nfs_server *, void *), + void *data); +#ifdef CONFIG_NFS_FSCACHE +extern const struct netfs_request_ops nfs_netfs_ops; +#endif /* io.c */ -extern void nfs_start_io_read(struct inode *inode); +extern __must_check int nfs_start_io_read(struct inode *inode); extern void nfs_end_io_read(struct inode *inode); -extern void nfs_start_io_write(struct inode *inode); +extern __must_check int nfs_start_io_write(struct inode *inode); extern void nfs_end_io_write(struct inode *inode); -extern void nfs_start_io_direct(struct inode *inode); +extern __must_check int nfs_start_io_direct(struct inode *inode); extern void nfs_end_io_direct(struct inode *inode); static inline bool nfs_file_io_is_buffered(struct nfs_inode *nfsi) @@ -431,32 +542,41 @@ static inline bool nfs_file_io_is_buffered(struct nfs_inode *nfsi) return test_bit(NFS_INO_ODIRECT, &nfsi->flags) == 0; } +/* Must be called with exclusively locked inode->i_rwsem */ +static inline void nfs_file_block_o_direct(struct nfs_inode *nfsi) +{ + if (test_bit(NFS_INO_ODIRECT, &nfsi->flags)) { + clear_bit(NFS_INO_ODIRECT, &nfsi->flags); + inode_dio_wait(&nfsi->vfs_inode); + } +} + + /* namespace.c */ #define NFS_PATH_CANONICAL 1 extern char *nfs_path(char **p, struct dentry *dentry, char *buffer, ssize_t buflen, unsigned flags); extern struct vfsmount *nfs_d_automount(struct path *path); -struct vfsmount *nfs_submount(struct nfs_server *, struct dentry *, - struct nfs_fh *, struct nfs_fattr *); -struct vfsmount *nfs_do_submount(struct dentry *, struct nfs_fh *, - struct nfs_fattr *, rpc_authflavor_t); +int nfs_submount(struct fs_context *, struct nfs_server *); +int nfs_do_submount(struct fs_context *); /* getroot.c */ -extern struct dentry *nfs_get_root(struct super_block *, struct nfs_fh *, - const char *); +extern int nfs_get_root(struct super_block *s, struct fs_context *fc); #if IS_ENABLED(CONFIG_NFS_V4) -extern struct dentry *nfs4_get_root(struct super_block *, struct nfs_fh *, - const char *); - extern int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh, bool); #endif struct nfs_pgio_completion_ops; /* read.c */ +extern const struct nfs_pgio_completion_ops nfs_async_read_completion_ops; extern void nfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, struct inode *inode, bool force_mds, const struct nfs_pgio_completion_ops *compl_ops); -extern void nfs_read_prepare(struct rpc_task *task, void *calldata); +extern bool nfs_read_alloc_scratch(struct nfs_pgio_header *hdr, size_t size); +extern int nfs_read_add_folio(struct nfs_pageio_descriptor *pgio, + struct nfs_open_context *ctx, + struct folio *folio); +extern void nfs_pageio_complete_read(struct nfs_pageio_descriptor *pgio); extern void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio); /* super.c */ @@ -466,7 +586,7 @@ int nfs_show_options(struct seq_file *, struct dentry *); int nfs_show_devname(struct seq_file *, struct dentry *); int nfs_show_path(struct seq_file *, struct dentry *); int nfs_show_stats(struct seq_file *, struct dentry *); -int nfs_remount(struct super_block *sb, int *flags, char *raw_data); +int nfs_reconfigure(struct fs_context *); /* write.c */ extern void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, @@ -474,13 +594,13 @@ extern void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, const struct nfs_pgio_completion_ops *compl_ops); extern void nfs_pageio_reset_write_mds(struct nfs_pageio_descriptor *pgio); extern void nfs_commit_free(struct nfs_commit_data *p); -extern void nfs_write_prepare(struct rpc_task *task, void *calldata); extern void nfs_commit_prepare(struct rpc_task *task, void *calldata); extern int nfs_initiate_commit(struct rpc_clnt *clnt, struct nfs_commit_data *data, const struct nfs_rpc_ops *nfs_ops, const struct rpc_call_ops *call_ops, - int how, int flags); + int how, int flags, + struct nfsd_file *localio); extern void nfs_init_commit(struct nfs_commit_data *data, struct list_head *head, struct pnfs_layout_segment *lseg, @@ -521,13 +641,25 @@ int nfs_filemap_write_and_wait_range(struct address_space *mapping, loff_t lstart, loff_t lend); #ifdef CONFIG_NFS_V4_1 +static inline void +pnfs_bucket_clear_pnfs_ds_commit_verifiers(struct pnfs_commit_bucket *buckets, + unsigned int nbuckets) +{ + unsigned int i; + + for (i = 0; i < nbuckets; i++) + buckets[i].direct_verf.committed = NFS_INVALID_STABLE_HOW; +} static inline void nfs_clear_pnfs_ds_commit_verifiers(struct pnfs_ds_commit_info *cinfo) { - int i; + struct pnfs_commit_array *array; - for (i = 0; i < cinfo->nbuckets; i++) - cinfo->buckets[i].direct_verf.committed = NFS_INVALID_STABLE_HOW; + rcu_read_lock(); + list_for_each_entry_rcu(array, &cinfo->commits, cinfo_list) + pnfs_bucket_clear_pnfs_ds_commit_verifiers(array->buckets, + array->nbuckets); + rcu_read_unlock(); } #else static inline @@ -537,8 +669,10 @@ void nfs_clear_pnfs_ds_commit_verifiers(struct pnfs_ds_commit_info *cinfo) #endif #ifdef CONFIG_MIGRATION -extern int nfs_migrate_page(struct address_space *, - struct page *, struct page *, enum migrate_mode); +int nfs_migrate_folio(struct address_space *, struct folio *dst, + struct folio *src, enum migrate_mode); +#else +#define nfs_migrate_folio NULL #endif static inline int @@ -548,6 +682,49 @@ nfs_write_verifier_cmp(const struct nfs_write_verifier *v1, return memcmp(v1->data, v2->data, sizeof(v1->data)); } +static inline bool +nfs_write_match_verf(const struct nfs_writeverf *verf, + struct nfs_page *req) +{ + return verf->committed > NFS_UNSTABLE && + !nfs_write_verifier_cmp(&req->wb_verf, &verf->verifier); +} + +static inline gfp_t nfs_io_gfp_mask(void) +{ + gfp_t ret = current_gfp_context(GFP_KERNEL); + + /* For workers __GFP_NORETRY only with __GFP_IO or __GFP_FS */ + if ((current->flags & PF_WQ_WORKER) && ret == GFP_KERNEL) + ret |= __GFP_NORETRY | __GFP_NOWARN; + return ret; +} + +/* + * Special version of should_remove_suid() that ignores capabilities. + */ +static inline int nfs_should_remove_suid(const struct inode *inode) +{ + umode_t mode = inode->i_mode; + int kill = 0; + + /* suid always must be killed */ + if (unlikely(mode & S_ISUID)) + kill = ATTR_KILL_SUID; + + /* + * sgid without any exec bits is just a mandatory locking mark; leave + * it alone. If some exec bits are set, it's a real sgid; kill it. + */ + if (unlikely((mode & S_ISGID) && (mode & S_IXGRP))) + kill |= ATTR_KILL_SGID; + + if (unlikely(kill && S_ISREG(mode))) + return kill; + + return 0; +} + /* unlink.c */ extern struct rpc_task * nfs_async_rename(struct inode *old_dir, struct inode *new_dir, @@ -558,29 +735,31 @@ extern int nfs_sillyrename(struct inode *dir, struct dentry *dentry); /* direct.c */ void nfs_init_cinfo_from_dreq(struct nfs_commit_info *cinfo, struct nfs_direct_req *dreq); -extern ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq); +extern ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq, loff_t offset); /* nfs4proc.c */ extern struct nfs_client *nfs4_init_client(struct nfs_client *clp, const struct nfs_client_initdata *); extern int nfs40_walk_client_list(struct nfs_client *clp, struct nfs_client **result, - struct rpc_cred *cred); + const struct cred *cred); extern int nfs41_walk_client_list(struct nfs_client *clp, struct nfs_client **result, - struct rpc_cred *cred); -extern int nfs4_test_session_trunk(struct rpc_clnt *, - struct rpc_xprt *, - void *); + const struct cred *cred); +extern void nfs4_test_session_trunk(struct rpc_clnt *clnt, + struct rpc_xprt *xprt, + void *data); static inline struct inode *nfs_igrab_and_active(struct inode *inode) { - inode = igrab(inode); - if (inode != NULL && !nfs_sb_active(inode->i_sb)) { - iput(inode); - inode = NULL; + struct super_block *sb = inode->i_sb; + + if (sb && nfs_sb_active(sb)) { + if (igrab(inode)) + return inode; + nfs_sb_deactive(sb); } - return inode; + return NULL; } static inline void nfs_iput_and_deactive(struct inode *inode) @@ -613,9 +792,9 @@ unsigned long nfs_block_bits(unsigned long bsize, unsigned char *nrbitsp) if ((bsize & (bsize - 1)) || nrbitsp) { unsigned char nrbits; - for (nrbits = 31; nrbits && !(bsize & (1 << nrbits)); nrbits--) + for (nrbits = 31; nrbits && !(bsize & (1UL << nrbits)); nrbits--) ; - bsize = 1 << nrbits; + bsize = 1UL << nrbits; if (nrbitsp) *nrbitsp = nrbits; } @@ -647,6 +826,22 @@ unsigned long nfs_block_size(unsigned long bsize, unsigned char *nrbitsp) } /* + * Compute and set NFS server rsize / wsize + */ +static inline +unsigned long nfs_io_size(unsigned long iosize, enum xprt_transports proto) +{ + if (iosize < NFS_MIN_FILE_IO_SIZE) + iosize = NFS_DEF_FILE_IO_SIZE; + else if (iosize >= NFS_MAX_FILE_IO_SIZE) + iosize = NFS_MAX_FILE_IO_SIZE; + + if (proto == XPRT_TRANSPORT_UDP || iosize < PAGE_SIZE) + return nfs_block_bits(iosize, NULL); + return iosize & PAGE_MASK; +} + +/* * Determine the maximum file size for a superblock */ static inline @@ -658,16 +853,21 @@ void nfs_super_set_maxbytes(struct super_block *sb, __u64 maxfilesize) } /* - * Record the page as unstable and mark its inode as dirty. + * Record the page as unstable (an extra writeback period) and mark its + * inode as dirty. */ -static inline -void nfs_mark_page_unstable(struct page *page, struct nfs_commit_info *cinfo) -{ - if (!cinfo->dreq) { - struct inode *inode = page_file_mapping(page)->host; - - inc_node_page_state(page, NR_UNSTABLE_NFS); - inc_wb_stat(&inode_to_bdi(inode)->wb, WB_RECLAIMABLE); +static inline void nfs_folio_mark_unstable(struct folio *folio, + struct nfs_commit_info *cinfo) +{ + if (folio && !cinfo->dreq) { + struct inode *inode = folio->mapping->host; + long nr = folio_nr_pages(folio); + + /* This page is really still in write-back - just that the + * writeback is happening on the server now. + */ + node_stat_mod_folio(folio, NR_WRITEBACK, nr); + wb_stat_mod(&inode_to_bdi(inode)->wb, WB_WRITEBACK, nr); __mark_inode_dirty(inode, I_DIRTY_DATASYNC); } } @@ -675,18 +875,17 @@ void nfs_mark_page_unstable(struct page *page, struct nfs_commit_info *cinfo) /* * Determine the number of bytes of data the page contains */ -static inline -unsigned int nfs_page_length(struct page *page) +static inline size_t nfs_folio_length(struct folio *folio) { - loff_t i_size = i_size_read(page_file_mapping(page)->host); + loff_t i_size = i_size_read(folio->mapping->host); if (i_size > 0) { - pgoff_t index = page_index(page); - pgoff_t end_index = (i_size - 1) >> PAGE_SHIFT; + pgoff_t index = folio->index >> folio_order(folio); + pgoff_t end_index = (i_size - 1) >> folio_shift(folio); if (index < end_index) - return PAGE_SIZE; + return folio_size(folio); if (index == end_index) - return ((i_size - 1) & ~PAGE_MASK) + 1; + return offset_in_folio(folio, i_size - 1) + 1; } return 0; } @@ -704,58 +903,41 @@ unsigned char nfs_umode_to_dtype(umode_t mode) * Determine the number of pages in an array of length 'len' and * with a base offset of 'base' */ -static inline -unsigned int nfs_page_array_len(unsigned int base, size_t len) +static inline unsigned int nfs_page_array_len(unsigned int base, size_t len) { - return ((unsigned long)len + (unsigned long)base + - PAGE_SIZE - 1) >> PAGE_SHIFT; + return ((unsigned long)len + (unsigned long)base + PAGE_SIZE - 1) >> + PAGE_SHIFT; } /* - * Convert a struct timespec into a 64-bit change attribute + * Convert a struct timespec64 into a 64-bit change attribute * - * This does approximately the same thing as timespec_to_ns(), + * This does approximately the same thing as timespec64_to_ns(), * but for calculation efficiency, we multiply the seconds by * 1024*1024*1024. */ static inline -u64 nfs_timespec_to_change_attr(const struct timespec *ts) +u64 nfs_timespec_to_change_attr(const struct timespec64 *ts) { return ((u64)ts->tv_sec << 30) + ts->tv_nsec; } -#ifdef CONFIG_CRC32 -/** - * nfs_fhandle_hash - calculate the crc32 hash for the filehandle - * @fh - pointer to filehandle - * - * returns a crc32 hash for the filehandle that is compatible with - * the one displayed by "wireshark". - */ -static inline u32 nfs_fhandle_hash(const struct nfs_fh *fh) -{ - return ~crc32_le(0xFFFFFFFF, &fh->data[0], fh->size); -} static inline u32 nfs_stateid_hash(const nfs4_stateid *stateid) { return ~crc32_le(0xFFFFFFFF, &stateid->other[0], NFS4_STATEID_OTHER_SIZE); } -#else -static inline u32 nfs_fhandle_hash(const struct nfs_fh *fh) -{ - return 0; -} -static inline u32 nfs_stateid_hash(nfs4_stateid *stateid) + +static inline bool nfs_current_task_exiting(void) { - return 0; + return (current->flags & PF_EXITING) != 0; } -#endif static inline bool nfs_error_is_fatal(int err) { switch (err) { case -ERESTARTSYS: + case -EINTR: case -EACCES: case -EDQUOT: case -EFBIG: @@ -764,8 +946,67 @@ static inline bool nfs_error_is_fatal(int err) case -EROFS: case -ESTALE: case -E2BIG: + case -ENOMEM: + case -ETIMEDOUT: return true; default: return false; } } + +static inline bool nfs_error_is_fatal_on_server(int err) +{ + switch (err) { + case 0: + case -ERESTARTSYS: + case -EINTR: + case -ENOMEM: + return false; + } + return nfs_error_is_fatal(err); +} + +/* + * Select between a default port value and a user-specified port value. + * If a zero value is set, then autobind will be used. + */ +static inline void nfs_set_port(struct sockaddr_storage *sap, int *port, + const unsigned short default_port) +{ + if (*port == NFS_UNSPEC_PORT) + *port = default_port; + + rpc_set_port((struct sockaddr *)sap, *port); +} + +struct nfs_direct_req { + struct kref kref; /* release manager */ + + /* I/O parameters */ + struct nfs_open_context *ctx; /* file open context info */ + struct nfs_lock_context *l_ctx; /* Lock context info */ + struct kiocb * iocb; /* controlling i/o request */ + struct inode * inode; /* target file of i/o */ + + /* completion state */ + atomic_t io_count; /* i/os we're waiting for */ + spinlock_t lock; /* protect completion state */ + + loff_t io_start; /* Start offset for I/O */ + ssize_t count, /* bytes actually processed */ + max_count, /* max expected count */ + error; /* any reported error */ + struct completion completion; /* wait for i/o completion */ + + /* commit state */ + struct nfs_mds_commit_info mds_cinfo; /* Storage for cinfo */ + struct pnfs_ds_commit_info ds_cinfo; /* Storage for cinfo */ + struct work_struct work; + int flags; + /* for write */ +#define NFS_ODIRECT_DO_COMMIT (1) /* an unstable reply was received */ +#define NFS_ODIRECT_RESCHED_WRITES (2) /* write verification failed */ + /* for read */ +#define NFS_ODIRECT_SHOULD_DIRTY (3) /* dirty user-space page after read */ +#define NFS_ODIRECT_DONE INT_MAX /* write verification failed */ +}; diff --git a/fs/nfs/io.c b/fs/nfs/io.c index 1fc5d1ce327e..d275b0a250bf 100644 --- a/fs/nfs/io.c +++ b/fs/nfs/io.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2016 Trond Myklebust * @@ -13,18 +14,9 @@ #include "internal.h" -/* Call with exclusively locked inode->i_rwsem */ -static void nfs_block_o_direct(struct nfs_inode *nfsi, struct inode *inode) -{ - if (test_bit(NFS_INO_ODIRECT, &nfsi->flags)) { - clear_bit(NFS_INO_ODIRECT, &nfsi->flags); - inode_dio_wait(inode); - } -} - /** * nfs_start_io_read - declare the file is being used for buffered reads - * @inode - file inode + * @inode: file inode * * Declare that a buffered read operation is about to start, and ensure * that we block all direct I/O. @@ -38,24 +30,33 @@ static void nfs_block_o_direct(struct nfs_inode *nfsi, struct inode *inode) * Note that buffered writes and truncates both take a write lock on * inode->i_rwsem, meaning that those are serialised w.r.t. the reads. */ -void +int nfs_start_io_read(struct inode *inode) { struct nfs_inode *nfsi = NFS_I(inode); + int err; + /* Be an optimist! */ - down_read(&inode->i_rwsem); + err = down_read_killable(&inode->i_rwsem); + if (err) + return err; if (test_bit(NFS_INO_ODIRECT, &nfsi->flags) == 0) - return; + return 0; up_read(&inode->i_rwsem); + /* Slow path.... */ - down_write(&inode->i_rwsem); - nfs_block_o_direct(nfsi, inode); + err = down_write_killable(&inode->i_rwsem); + if (err) + return err; + nfs_file_block_o_direct(nfsi); downgrade_write(&inode->i_rwsem); + + return 0; } /** * nfs_end_io_read - declare that the buffered read operation is done - * @inode - file inode + * @inode: file inode * * Declare that a buffered read operation is done, and release the shared * lock on inode->i_rwsem. @@ -68,21 +69,25 @@ nfs_end_io_read(struct inode *inode) /** * nfs_start_io_write - declare the file is being used for buffered writes - * @inode - file inode + * @inode: file inode * * Declare that a buffered read operation is about to start, and ensure * that we block all direct I/O. */ -void +int nfs_start_io_write(struct inode *inode) { - down_write(&inode->i_rwsem); - nfs_block_o_direct(NFS_I(inode), inode); + int err; + + err = down_write_killable(&inode->i_rwsem); + if (!err) + nfs_file_block_o_direct(NFS_I(inode)); + return err; } /** * nfs_end_io_write - declare that the buffered write operation is done - * @inode - file inode + * @inode: file inode * * Declare that a buffered write operation is done, and release the * lock on inode->i_rwsem. @@ -98,13 +103,13 @@ static void nfs_block_buffered(struct nfs_inode *nfsi, struct inode *inode) { if (!test_bit(NFS_INO_ODIRECT, &nfsi->flags)) { set_bit(NFS_INO_ODIRECT, &nfsi->flags); - nfs_wb_all(inode); + nfs_sync_mapping(inode->i_mapping); } } /** - * nfs_end_io_direct - declare the file is being used for direct i/o - * @inode - file inode + * nfs_start_io_direct - declare the file is being used for direct i/o + * @inode: file inode * * Declare that a direct I/O operation is about to start, and ensure * that we block all buffered I/O. @@ -118,24 +123,33 @@ static void nfs_block_buffered(struct nfs_inode *nfsi, struct inode *inode) * Note that buffered writes and truncates both take a write lock on * inode->i_rwsem, meaning that those are serialised w.r.t. O_DIRECT. */ -void +int nfs_start_io_direct(struct inode *inode) { struct nfs_inode *nfsi = NFS_I(inode); + int err; + /* Be an optimist! */ - down_read(&inode->i_rwsem); + err = down_read_killable(&inode->i_rwsem); + if (err) + return err; if (test_bit(NFS_INO_ODIRECT, &nfsi->flags) != 0) - return; + return 0; up_read(&inode->i_rwsem); + /* Slow path.... */ - down_write(&inode->i_rwsem); + err = down_write_killable(&inode->i_rwsem); + if (err) + return err; nfs_block_buffered(nfsi, inode); downgrade_write(&inode->i_rwsem); + + return 0; } /** * nfs_end_io_direct - declare that the direct i/o operation is done - * @inode - file inode + * @inode: file inode * * Declare that a direct I/O operation is done, and release the shared * lock on inode->i_rwsem. diff --git a/fs/nfs/iostat.h b/fs/nfs/iostat.h index 0cb806fbd4c4..49862c95b224 100644 --- a/fs/nfs/iostat.h +++ b/fs/nfs/iostat.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * linux/fs/nfs/iostat.h * @@ -16,9 +17,6 @@ struct nfs_iostats { unsigned long long bytes[__NFSIOS_BYTESMAX]; -#ifdef CONFIG_NFS_FSCACHE - unsigned long long fscache[__NFSIOS_FSCACHEMAX]; -#endif unsigned long events[__NFSIOS_COUNTSMAX]; } ____cacheline_aligned; @@ -48,24 +46,11 @@ static inline void nfs_add_stats(const struct inode *inode, nfs_add_server_stats(NFS_SERVER(inode), stat, addend); } -#ifdef CONFIG_NFS_FSCACHE -static inline void nfs_add_fscache_stats(struct inode *inode, - enum nfs_stat_fscachecounters stat, - long addend) -{ - this_cpu_add(NFS_SERVER(inode)->io_stats->fscache[stat], addend); -} -static inline void nfs_inc_fscache_stats(struct inode *inode, - enum nfs_stat_fscachecounters stat) -{ - this_cpu_inc(NFS_SERVER(inode)->io_stats->fscache[stat]); -} -#endif - -static inline struct nfs_iostats __percpu *nfs_alloc_iostats(void) -{ - return alloc_percpu(struct nfs_iostats); -} +/* + * This specialized allocator has to be a macro for its allocations to be + * accounted separately (to have a separate alloc_tag). + */ +#define nfs_alloc_iostats() alloc_percpu(struct nfs_iostats) static inline void nfs_free_iostats(struct nfs_iostats __percpu *stats) { diff --git a/fs/nfs/localio.c b/fs/nfs/localio.c new file mode 100644 index 000000000000..f33bfa7b58e6 --- /dev/null +++ b/fs/nfs/localio.c @@ -0,0 +1,1072 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * NFS client support for local clients to bypass network stack + * + * Copyright (C) 2014 Weston Andros Adamson <dros@primarydata.com> + * Copyright (C) 2019 Trond Myklebust <trond.myklebust@hammerspace.com> + * Copyright (C) 2024 Mike Snitzer <snitzer@hammerspace.com> + * Copyright (C) 2024 NeilBrown <neilb@suse.de> + */ + +#include <linux/module.h> +#include <linux/errno.h> +#include <linux/vfs.h> +#include <linux/file.h> +#include <linux/inet.h> +#include <linux/sunrpc/addr.h> +#include <linux/inetdevice.h> +#include <net/addrconf.h> +#include <linux/nfs_common.h> +#include <linux/nfslocalio.h> +#include <linux/bvec.h> + +#include <linux/nfs.h> +#include <linux/nfs_fs.h> +#include <linux/nfs_xdr.h> + +#include "internal.h" +#include "pnfs.h" +#include "nfstrace.h" + +#define NFSDBG_FACILITY NFSDBG_VFS + +#define NFSLOCAL_MAX_IOS 3 + +struct nfs_local_kiocb { + struct kiocb kiocb; + struct bio_vec *bvec; + struct nfs_pgio_header *hdr; + struct work_struct work; + void (*aio_complete_work)(struct work_struct *); + struct nfsd_file *localio; + /* Begin mostly DIO-specific members */ + size_t end_len; + short int end_iter_index; + atomic_t n_iters; + bool iter_is_dio_aligned[NFSLOCAL_MAX_IOS]; + struct iov_iter iters[NFSLOCAL_MAX_IOS] ____cacheline_aligned; + /* End mostly DIO-specific members */ +}; + +struct nfs_local_fsync_ctx { + struct nfsd_file *localio; + struct nfs_commit_data *data; + struct work_struct work; + struct completion *done; +}; + +static bool localio_enabled __read_mostly = true; +module_param(localio_enabled, bool, 0644); + +static inline bool nfs_client_is_local(const struct nfs_client *clp) +{ + return !!rcu_access_pointer(clp->cl_uuid.net); +} + +bool nfs_server_is_local(const struct nfs_client *clp) +{ + return nfs_client_is_local(clp) && localio_enabled; +} +EXPORT_SYMBOL_GPL(nfs_server_is_local); + +/* + * UUID_IS_LOCAL XDR functions + */ + +static void localio_xdr_enc_uuidargs(struct rpc_rqst *req, + struct xdr_stream *xdr, + const void *data) +{ + const u8 *uuid = data; + + encode_opaque_fixed(xdr, uuid, UUID_SIZE); +} + +static int localio_xdr_dec_uuidres(struct rpc_rqst *req, + struct xdr_stream *xdr, + void *result) +{ + /* void return */ + return 0; +} + +static const struct rpc_procinfo nfs_localio_procedures[] = { + [LOCALIOPROC_UUID_IS_LOCAL] = { + .p_proc = LOCALIOPROC_UUID_IS_LOCAL, + .p_encode = localio_xdr_enc_uuidargs, + .p_decode = localio_xdr_dec_uuidres, + .p_arglen = XDR_QUADLEN(UUID_SIZE), + .p_replen = 0, + .p_statidx = LOCALIOPROC_UUID_IS_LOCAL, + .p_name = "UUID_IS_LOCAL", + }, +}; + +static unsigned int nfs_localio_counts[ARRAY_SIZE(nfs_localio_procedures)]; +static const struct rpc_version nfslocalio_version1 = { + .number = 1, + .nrprocs = ARRAY_SIZE(nfs_localio_procedures), + .procs = nfs_localio_procedures, + .counts = nfs_localio_counts, +}; + +static const struct rpc_version *nfslocalio_version[] = { + [1] = &nfslocalio_version1, +}; + +extern const struct rpc_program nfslocalio_program; +static struct rpc_stat nfslocalio_rpcstat = { &nfslocalio_program }; + +const struct rpc_program nfslocalio_program = { + .name = "nfslocalio", + .number = NFS_LOCALIO_PROGRAM, + .nrvers = ARRAY_SIZE(nfslocalio_version), + .version = nfslocalio_version, + .stats = &nfslocalio_rpcstat, +}; + +/* + * nfs_init_localioclient - Initialise an NFS localio client connection + */ +static struct rpc_clnt *nfs_init_localioclient(struct nfs_client *clp) +{ + struct rpc_clnt *rpcclient_localio; + + rpcclient_localio = rpc_bind_new_program(clp->cl_rpcclient, + &nfslocalio_program, 1); + + dprintk_rcu("%s: server (%s) %s NFS LOCALIO.\n", + __func__, rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR), + (IS_ERR(rpcclient_localio) ? "does not support" : "supports")); + + return rpcclient_localio; +} + +static bool nfs_server_uuid_is_local(struct nfs_client *clp) +{ + u8 uuid[UUID_SIZE]; + struct rpc_message msg = { + .rpc_argp = &uuid, + }; + struct rpc_clnt *rpcclient_localio; + int status; + + rpcclient_localio = nfs_init_localioclient(clp); + if (IS_ERR(rpcclient_localio)) + return false; + + export_uuid(uuid, &clp->cl_uuid.uuid); + + msg.rpc_proc = &nfs_localio_procedures[LOCALIOPROC_UUID_IS_LOCAL]; + status = rpc_call_sync(rpcclient_localio, &msg, 0); + dprintk("%s: NFS reply UUID_IS_LOCAL: status=%d\n", + __func__, status); + rpc_shutdown_client(rpcclient_localio); + + /* Server is only local if it initialized required struct members */ + if (status || !rcu_access_pointer(clp->cl_uuid.net) || !clp->cl_uuid.dom) + return false; + + return true; +} + +/* + * nfs_local_probe - probe local i/o support for an nfs_server and nfs_client + * - called after alloc_client and init_client (so cl_rpcclient exists) + * - this function is idempotent, it can be called for old or new clients + */ +static void nfs_local_probe(struct nfs_client *clp) +{ + /* Disallow localio if disabled via sysfs or AUTH_SYS isn't used */ + if (!localio_enabled || + clp->cl_rpcclient->cl_auth->au_flavor != RPC_AUTH_UNIX) { + nfs_localio_disable_client(clp); + return; + } + + if (nfs_client_is_local(clp)) + return; + + if (!nfs_uuid_begin(&clp->cl_uuid)) + return; + if (nfs_server_uuid_is_local(clp)) + nfs_localio_enable_client(clp); + nfs_uuid_end(&clp->cl_uuid); +} + +void nfs_local_probe_async_work(struct work_struct *work) +{ + struct nfs_client *clp = + container_of(work, struct nfs_client, cl_local_probe_work); + + if (!refcount_inc_not_zero(&clp->cl_count)) + return; + nfs_local_probe(clp); + nfs_put_client(clp); +} + +void nfs_local_probe_async(struct nfs_client *clp) +{ + queue_work(nfsiod_workqueue, &clp->cl_local_probe_work); +} +EXPORT_SYMBOL_GPL(nfs_local_probe_async); + +static inline void nfs_local_file_put(struct nfsd_file *localio) +{ + /* nfs_to_nfsd_file_put_local() expects an __rcu pointer + * but we have a __kernel pointer. It is always safe + * to cast a __kernel pointer to an __rcu pointer + * because the cast only weakens what is known about the pointer. + */ + struct nfsd_file __rcu *nf = (struct nfsd_file __rcu*) localio; + + nfs_to_nfsd_file_put_local(&nf); +} + +/* + * __nfs_local_open_fh - open a local filehandle in terms of nfsd_file. + * + * Returns a pointer to a struct nfsd_file or ERR_PTR. + * Caller must release returned nfsd_file with nfs_to_nfsd_file_put_local(). + */ +static struct nfsd_file * +__nfs_local_open_fh(struct nfs_client *clp, const struct cred *cred, + struct nfs_fh *fh, struct nfs_file_localio *nfl, + struct nfsd_file __rcu **pnf, + const fmode_t mode) +{ + int status = 0; + struct nfsd_file *localio; + + localio = nfs_open_local_fh(&clp->cl_uuid, clp->cl_rpcclient, + cred, fh, nfl, pnf, mode); + if (IS_ERR(localio)) { + status = PTR_ERR(localio); + switch (status) { + case -ENOMEM: + case -ENXIO: + case -ENOENT: + /* Revalidate localio */ + nfs_localio_disable_client(clp); + nfs_local_probe(clp); + } + } + trace_nfs_local_open_fh(fh, mode, status); + return localio; +} + +/* + * nfs_local_open_fh - open a local filehandle in terms of nfsd_file. + * First checking if the open nfsd_file is already cached, otherwise + * must __nfs_local_open_fh and insert the nfsd_file in nfs_file_localio. + * + * Returns a pointer to a struct nfsd_file or NULL. + */ +struct nfsd_file * +nfs_local_open_fh(struct nfs_client *clp, const struct cred *cred, + struct nfs_fh *fh, struct nfs_file_localio *nfl, + const fmode_t mode) +{ + struct nfsd_file *nf, __rcu **pnf; + + if (!nfs_server_is_local(clp)) + return NULL; + if (mode & ~(FMODE_READ | FMODE_WRITE)) + return NULL; + + if (mode & FMODE_WRITE) + pnf = &nfl->rw_file; + else + pnf = &nfl->ro_file; + + nf = __nfs_local_open_fh(clp, cred, fh, nfl, pnf, mode); + if (IS_ERR(nf)) + return NULL; + return nf; +} +EXPORT_SYMBOL_GPL(nfs_local_open_fh); + +static void +nfs_local_iocb_free(struct nfs_local_kiocb *iocb) +{ + kfree(iocb->bvec); + kfree(iocb); +} + +static struct nfs_local_kiocb * +nfs_local_iocb_alloc(struct nfs_pgio_header *hdr, + struct file *file, gfp_t flags) +{ + struct nfs_local_kiocb *iocb; + + iocb = kzalloc(sizeof(*iocb), flags); + if (iocb == NULL) + return NULL; + + iocb->bvec = kmalloc_array(hdr->page_array.npages, + sizeof(struct bio_vec), flags); + if (iocb->bvec == NULL) { + kfree(iocb); + return NULL; + } + + init_sync_kiocb(&iocb->kiocb, file); + + iocb->hdr = hdr; + iocb->kiocb.ki_pos = hdr->args.offset; + iocb->kiocb.ki_flags &= ~IOCB_APPEND; + iocb->kiocb.ki_complete = NULL; + iocb->aio_complete_work = NULL; + + iocb->end_iter_index = -1; + + return iocb; +} + +static bool +nfs_is_local_dio_possible(struct nfs_local_kiocb *iocb, int rw, + size_t len, struct nfs_local_dio *local_dio) +{ + struct nfs_pgio_header *hdr = iocb->hdr; + loff_t offset = hdr->args.offset; + u32 nf_dio_mem_align, nf_dio_offset_align, nf_dio_read_offset_align; + loff_t start_end, orig_end, middle_end; + + nfs_to->nfsd_file_dio_alignment(iocb->localio, &nf_dio_mem_align, + &nf_dio_offset_align, &nf_dio_read_offset_align); + if (rw == ITER_DEST) + nf_dio_offset_align = nf_dio_read_offset_align; + + if (unlikely(!nf_dio_mem_align || !nf_dio_offset_align)) + return false; + if (unlikely(nf_dio_offset_align > PAGE_SIZE)) + return false; + if (unlikely(len < nf_dio_offset_align)) + return false; + + local_dio->mem_align = nf_dio_mem_align; + local_dio->offset_align = nf_dio_offset_align; + + start_end = round_up(offset, nf_dio_offset_align); + orig_end = offset + len; + middle_end = round_down(orig_end, nf_dio_offset_align); + + local_dio->middle_offset = start_end; + local_dio->end_offset = middle_end; + + local_dio->start_len = start_end - offset; + local_dio->middle_len = middle_end - start_end; + local_dio->end_len = orig_end - middle_end; + + if (rw == ITER_DEST) + trace_nfs_local_dio_read(hdr->inode, offset, len, local_dio); + else + trace_nfs_local_dio_write(hdr->inode, offset, len, local_dio); + return true; +} + +static bool nfs_iov_iter_aligned_bvec(const struct iov_iter *i, + unsigned int addr_mask, unsigned int len_mask) +{ + const struct bio_vec *bvec = i->bvec; + size_t skip = i->iov_offset; + size_t size = i->count; + + if (size & len_mask) + return false; + do { + size_t len = bvec->bv_len; + + if (len > size) + len = size; + if ((unsigned long)(bvec->bv_offset + skip) & addr_mask) + return false; + bvec++; + size -= len; + skip = 0; + } while (size); + + return true; +} + +static void +nfs_local_iter_setup(struct iov_iter *iter, int rw, struct bio_vec *bvec, + unsigned int nvecs, unsigned long total, + size_t start, size_t len) +{ + iov_iter_bvec(iter, rw, bvec, nvecs, total); + if (start) + iov_iter_advance(iter, start); + iov_iter_truncate(iter, len); +} + +/* + * Setup as many as 3 iov_iter based on extents described by @local_dio. + * Returns the number of iov_iter that were setup. + */ +static int +nfs_local_iters_setup_dio(struct nfs_local_kiocb *iocb, int rw, + unsigned int nvecs, unsigned long total, + struct nfs_local_dio *local_dio) +{ + int n_iters = 0; + struct iov_iter *iters = iocb->iters; + + /* Setup misaligned start? */ + if (local_dio->start_len) { + nfs_local_iter_setup(&iters[n_iters], rw, iocb->bvec, + nvecs, total, 0, local_dio->start_len); + ++n_iters; + } + + /* + * Setup DIO-aligned middle, if there is no misaligned end (below) + * then AIO completion is used, see nfs_local_call_{read,write} + */ + nfs_local_iter_setup(&iters[n_iters], rw, iocb->bvec, nvecs, + total, local_dio->start_len, local_dio->middle_len); + + iocb->iter_is_dio_aligned[n_iters] = + nfs_iov_iter_aligned_bvec(&iters[n_iters], + local_dio->mem_align-1, local_dio->offset_align-1); + + if (unlikely(!iocb->iter_is_dio_aligned[n_iters])) { + trace_nfs_local_dio_misaligned(iocb->hdr->inode, + local_dio->start_len, local_dio->middle_len, local_dio); + return 0; /* no DIO-aligned IO possible */ + } + iocb->end_iter_index = n_iters; + ++n_iters; + + /* Setup misaligned end? */ + if (local_dio->end_len) { + nfs_local_iter_setup(&iters[n_iters], rw, iocb->bvec, + nvecs, total, local_dio->start_len + + local_dio->middle_len, local_dio->end_len); + iocb->end_iter_index = n_iters; + ++n_iters; + } + + atomic_set(&iocb->n_iters, n_iters); + return n_iters; +} + +static noinline_for_stack void +nfs_local_iters_init(struct nfs_local_kiocb *iocb, int rw) +{ + struct nfs_pgio_header *hdr = iocb->hdr; + struct page **pagevec = hdr->page_array.pagevec; + unsigned long v, total; + unsigned int base; + size_t len; + + v = 0; + total = hdr->args.count; + base = hdr->args.pgbase; + while (total && v < hdr->page_array.npages) { + len = min_t(size_t, total, PAGE_SIZE - base); + bvec_set_page(&iocb->bvec[v], *pagevec, len, base); + total -= len; + ++pagevec; + ++v; + base = 0; + } + len = hdr->args.count - total; + + /* + * For each iocb, iocb->n_iters is always at least 1 and we always + * end io after first nfs_local_pgio_done call unless misaligned DIO. + */ + atomic_set(&iocb->n_iters, 1); + + if (test_bit(NFS_IOHDR_ODIRECT, &hdr->flags)) { + struct nfs_local_dio local_dio; + + if (nfs_is_local_dio_possible(iocb, rw, len, &local_dio) && + nfs_local_iters_setup_dio(iocb, rw, v, len, &local_dio) != 0) { + /* Ensure DIO WRITE's IO on stable storage upon completion */ + if (rw == ITER_SOURCE) + iocb->kiocb.ki_flags |= IOCB_DSYNC|IOCB_SYNC; + return; /* is DIO-aligned */ + } + } + + /* Use buffered IO */ + iov_iter_bvec(&iocb->iters[0], rw, iocb->bvec, v, len); +} + +static void +nfs_local_hdr_release(struct nfs_pgio_header *hdr, + const struct rpc_call_ops *call_ops) +{ + call_ops->rpc_call_done(&hdr->task, hdr); + call_ops->rpc_release(hdr); +} + +static void +nfs_local_pgio_init(struct nfs_pgio_header *hdr, + const struct rpc_call_ops *call_ops) +{ + hdr->task.tk_ops = call_ops; + if (!hdr->task.tk_start) + hdr->task.tk_start = ktime_get(); +} + +static bool +nfs_local_pgio_done(struct nfs_local_kiocb *iocb, long status, bool force) +{ + struct nfs_pgio_header *hdr = iocb->hdr; + + /* Must handle partial completions */ + if (status >= 0) { + hdr->res.count += status; + /* @hdr was initialized to 0 (zeroed during allocation) */ + if (hdr->task.tk_status == 0) + hdr->res.op_status = NFS4_OK; + } else { + hdr->res.op_status = nfs_localio_errno_to_nfs4_stat(status); + hdr->task.tk_status = status; + } + + if (force) + return true; + + BUG_ON(atomic_read(&iocb->n_iters) <= 0); + return atomic_dec_and_test(&iocb->n_iters); +} + +static void +nfs_local_iocb_release(struct nfs_local_kiocb *iocb) +{ + nfs_local_file_put(iocb->localio); + nfs_local_iocb_free(iocb); +} + +static void +nfs_local_pgio_release(struct nfs_local_kiocb *iocb) +{ + struct nfs_pgio_header *hdr = iocb->hdr; + + nfs_local_iocb_release(iocb); + nfs_local_hdr_release(hdr, hdr->task.tk_ops); +} + +/* + * Complete the I/O from iocb->kiocb.ki_complete() + * + * Note that this function can be called from a bottom half context, + * hence we need to queue the rpc_call_done() etc to a workqueue + */ +static inline void nfs_local_pgio_aio_complete(struct nfs_local_kiocb *iocb) +{ + INIT_WORK(&iocb->work, iocb->aio_complete_work); + queue_work(nfsiod_workqueue, &iocb->work); +} + +static void nfs_local_read_done(struct nfs_local_kiocb *iocb) +{ + struct nfs_pgio_header *hdr = iocb->hdr; + struct file *filp = iocb->kiocb.ki_filp; + long status = hdr->task.tk_status; + + if ((iocb->kiocb.ki_flags & IOCB_DIRECT) && status == -EINVAL) { + /* Underlying FS will return -EINVAL if misaligned DIO is attempted. */ + pr_info_ratelimited("nfs: Unexpected direct I/O read alignment failure\n"); + } + + /* + * Must clear replen otherwise NFSv3 data corruption will occur + * if/when switching from LOCALIO back to using normal RPC. + */ + hdr->res.replen = 0; + + /* nfs_readpage_result() handles short read */ + + if (hdr->args.offset + hdr->res.count >= i_size_read(file_inode(filp))) + hdr->res.eof = true; + + dprintk("%s: read %ld bytes eof %d.\n", __func__, + status > 0 ? status : 0, hdr->res.eof); +} + +static inline void nfs_local_read_iocb_done(struct nfs_local_kiocb *iocb) +{ + nfs_local_read_done(iocb); + nfs_local_pgio_release(iocb); +} + +static void nfs_local_read_aio_complete_work(struct work_struct *work) +{ + struct nfs_local_kiocb *iocb = + container_of(work, struct nfs_local_kiocb, work); + + nfs_local_read_iocb_done(iocb); +} + +static void nfs_local_read_aio_complete(struct kiocb *kiocb, long ret) +{ + struct nfs_local_kiocb *iocb = + container_of(kiocb, struct nfs_local_kiocb, kiocb); + + /* AIO completion of DIO read should always be last to complete */ + if (unlikely(!nfs_local_pgio_done(iocb, ret, false))) + return; + + nfs_local_pgio_aio_complete(iocb); /* Calls nfs_local_read_aio_complete_work */ +} + +static void nfs_local_call_read(struct work_struct *work) +{ + struct nfs_local_kiocb *iocb = + container_of(work, struct nfs_local_kiocb, work); + struct file *filp = iocb->kiocb.ki_filp; + bool force_done = false; + ssize_t status; + int n_iters; + + n_iters = atomic_read(&iocb->n_iters); + for (int i = 0; i < n_iters ; i++) { + if (iocb->iter_is_dio_aligned[i]) { + iocb->kiocb.ki_flags |= IOCB_DIRECT; + /* Only use AIO completion if DIO-aligned segment is last */ + if (i == iocb->end_iter_index) { + iocb->kiocb.ki_complete = nfs_local_read_aio_complete; + iocb->aio_complete_work = nfs_local_read_aio_complete_work; + } + } else + iocb->kiocb.ki_flags &= ~IOCB_DIRECT; + + scoped_with_creds(filp->f_cred) + status = filp->f_op->read_iter(&iocb->kiocb, &iocb->iters[i]); + + if (status != -EIOCBQUEUED) { + if (unlikely(status >= 0 && status < iocb->iters[i].count)) + force_done = true; /* Partial read */ + if (nfs_local_pgio_done(iocb, status, force_done)) { + nfs_local_read_iocb_done(iocb); + break; + } + } + } +} + +static int +nfs_local_do_read(struct nfs_local_kiocb *iocb, + const struct rpc_call_ops *call_ops) +{ + struct nfs_pgio_header *hdr = iocb->hdr; + + dprintk("%s: vfs_read count=%u pos=%llu\n", + __func__, hdr->args.count, hdr->args.offset); + + nfs_local_pgio_init(hdr, call_ops); + hdr->res.eof = false; + + INIT_WORK(&iocb->work, nfs_local_call_read); + queue_work(nfslocaliod_workqueue, &iocb->work); + + return 0; +} + +static void +nfs_copy_boot_verifier(struct nfs_write_verifier *verifier, struct inode *inode) +{ + struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; + u32 *verf = (u32 *)verifier->data; + unsigned int seq; + + do { + seq = read_seqbegin(&clp->cl_boot_lock); + verf[0] = (u32)clp->cl_nfssvc_boot.tv_sec; + verf[1] = (u32)clp->cl_nfssvc_boot.tv_nsec; + } while (read_seqretry(&clp->cl_boot_lock, seq)); +} + +static void +nfs_reset_boot_verifier(struct inode *inode) +{ + struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; + + write_seqlock(&clp->cl_boot_lock); + ktime_get_real_ts64(&clp->cl_nfssvc_boot); + write_sequnlock(&clp->cl_boot_lock); +} + +static void +nfs_set_local_verifier(struct inode *inode, + struct nfs_writeverf *verf, + enum nfs3_stable_how how) +{ + nfs_copy_boot_verifier(&verf->verifier, inode); + verf->committed = how; +} + +/* Factored out from fs/nfsd/vfs.h:fh_getattr() */ +static int __vfs_getattr(const struct path *p, struct kstat *stat, int version) +{ + u32 request_mask = STATX_BASIC_STATS; + + if (version == 4) + request_mask |= (STATX_BTIME | STATX_CHANGE_COOKIE); + return vfs_getattr(p, stat, request_mask, AT_STATX_SYNC_AS_STAT); +} + +/* Copied from fs/nfsd/nfsfh.c:nfsd4_change_attribute() */ +static u64 __nfsd4_change_attribute(const struct kstat *stat, + const struct inode *inode) +{ + u64 chattr; + + if (stat->result_mask & STATX_CHANGE_COOKIE) { + chattr = stat->change_cookie; + if (S_ISREG(inode->i_mode) && + !(stat->attributes & STATX_ATTR_CHANGE_MONOTONIC)) { + chattr += (u64)stat->ctime.tv_sec << 30; + chattr += stat->ctime.tv_nsec; + } + } else { + chattr = time_to_chattr(&stat->ctime); + } + return chattr; +} + +static void nfs_local_vfs_getattr(struct nfs_local_kiocb *iocb) +{ + struct kstat stat; + struct file *filp = iocb->kiocb.ki_filp; + struct nfs_pgio_header *hdr = iocb->hdr; + struct nfs_fattr *fattr = hdr->res.fattr; + int version = NFS_PROTO(hdr->inode)->version; + + if (unlikely(!fattr) || __vfs_getattr(&filp->f_path, &stat, version)) + return; + + fattr->valid = (NFS_ATTR_FATTR_FILEID | + NFS_ATTR_FATTR_CHANGE | + NFS_ATTR_FATTR_SIZE | + NFS_ATTR_FATTR_ATIME | + NFS_ATTR_FATTR_MTIME | + NFS_ATTR_FATTR_CTIME | + NFS_ATTR_FATTR_SPACE_USED); + + fattr->fileid = stat.ino; + fattr->size = stat.size; + fattr->atime = stat.atime; + fattr->mtime = stat.mtime; + fattr->ctime = stat.ctime; + if (version == 4) { + fattr->change_attr = + __nfsd4_change_attribute(&stat, file_inode(filp)); + } else + fattr->change_attr = nfs_timespec_to_change_attr(&fattr->ctime); + fattr->du.nfs3.used = stat.blocks << 9; +} + +static void nfs_local_write_done(struct nfs_local_kiocb *iocb) +{ + struct nfs_pgio_header *hdr = iocb->hdr; + long status = hdr->task.tk_status; + + dprintk("%s: wrote %ld bytes.\n", __func__, status > 0 ? status : 0); + + if ((iocb->kiocb.ki_flags & IOCB_DIRECT) && status == -EINVAL) { + /* Underlying FS will return -EINVAL if misaligned DIO is attempted. */ + pr_info_ratelimited("nfs: Unexpected direct I/O write alignment failure\n"); + } + + /* Handle short writes as if they are ENOSPC */ + status = hdr->res.count; + if (status > 0 && status < hdr->args.count) { + hdr->mds_offset += status; + hdr->args.offset += status; + hdr->args.pgbase += status; + hdr->args.count -= status; + nfs_set_pgio_error(hdr, -ENOSPC, hdr->args.offset); + status = -ENOSPC; + /* record -ENOSPC in terms of nfs_local_pgio_done */ + (void) nfs_local_pgio_done(iocb, status, true); + } + if (hdr->task.tk_status < 0) + nfs_reset_boot_verifier(hdr->inode); +} + +static inline void nfs_local_write_iocb_done(struct nfs_local_kiocb *iocb) +{ + nfs_local_write_done(iocb); + nfs_local_vfs_getattr(iocb); + nfs_local_pgio_release(iocb); +} + +static void nfs_local_write_aio_complete_work(struct work_struct *work) +{ + struct nfs_local_kiocb *iocb = + container_of(work, struct nfs_local_kiocb, work); + + nfs_local_write_iocb_done(iocb); +} + +static void nfs_local_write_aio_complete(struct kiocb *kiocb, long ret) +{ + struct nfs_local_kiocb *iocb = + container_of(kiocb, struct nfs_local_kiocb, kiocb); + + /* AIO completion of DIO write should always be last to complete */ + if (unlikely(!nfs_local_pgio_done(iocb, ret, false))) + return; + + nfs_local_pgio_aio_complete(iocb); /* Calls nfs_local_write_aio_complete_work */ +} + +static void nfs_local_call_write(struct work_struct *work) +{ + struct nfs_local_kiocb *iocb = + container_of(work, struct nfs_local_kiocb, work); + struct file *filp = iocb->kiocb.ki_filp; + unsigned long old_flags = current->flags; + bool force_done = false; + ssize_t status; + int n_iters; + + current->flags |= PF_LOCAL_THROTTLE | PF_MEMALLOC_NOIO; + + file_start_write(filp); + n_iters = atomic_read(&iocb->n_iters); + for (int i = 0; i < n_iters ; i++) { + if (iocb->iter_is_dio_aligned[i]) { + iocb->kiocb.ki_flags |= IOCB_DIRECT; + /* Only use AIO completion if DIO-aligned segment is last */ + if (i == iocb->end_iter_index) { + iocb->kiocb.ki_complete = nfs_local_write_aio_complete; + iocb->aio_complete_work = nfs_local_write_aio_complete_work; + } + } else + iocb->kiocb.ki_flags &= ~IOCB_DIRECT; + + scoped_with_creds(filp->f_cred) + status = filp->f_op->write_iter(&iocb->kiocb, &iocb->iters[i]); + + if (status != -EIOCBQUEUED) { + if (unlikely(status >= 0 && status < iocb->iters[i].count)) + force_done = true; /* Partial write */ + if (nfs_local_pgio_done(iocb, status, force_done)) { + nfs_local_write_iocb_done(iocb); + break; + } + } + } + file_end_write(filp); + + current->flags = old_flags; +} + +static int +nfs_local_do_write(struct nfs_local_kiocb *iocb, + const struct rpc_call_ops *call_ops) +{ + struct nfs_pgio_header *hdr = iocb->hdr; + + dprintk("%s: vfs_write count=%u pos=%llu %s\n", + __func__, hdr->args.count, hdr->args.offset, + (hdr->args.stable == NFS_UNSTABLE) ? "unstable" : "stable"); + + switch (hdr->args.stable) { + default: + break; + case NFS_DATA_SYNC: + iocb->kiocb.ki_flags |= IOCB_DSYNC; + break; + case NFS_FILE_SYNC: + iocb->kiocb.ki_flags |= IOCB_DSYNC|IOCB_SYNC; + } + + nfs_local_pgio_init(hdr, call_ops); + + nfs_set_local_verifier(hdr->inode, hdr->res.verf, hdr->args.stable); + + INIT_WORK(&iocb->work, nfs_local_call_write); + queue_work(nfslocaliod_workqueue, &iocb->work); + + return 0; +} + +static struct nfs_local_kiocb * +nfs_local_iocb_init(struct nfs_pgio_header *hdr, struct nfsd_file *localio) +{ + struct file *file = nfs_to->nfsd_file_file(localio); + struct nfs_local_kiocb *iocb; + gfp_t gfp_mask; + int rw; + + if (hdr->rw_mode & FMODE_READ) { + if (!file->f_op->read_iter) + return ERR_PTR(-EOPNOTSUPP); + gfp_mask = GFP_KERNEL; + rw = ITER_DEST; + } else { + if (!file->f_op->write_iter) + return ERR_PTR(-EOPNOTSUPP); + gfp_mask = GFP_NOIO; + rw = ITER_SOURCE; + } + + iocb = nfs_local_iocb_alloc(hdr, file, gfp_mask); + if (iocb == NULL) + return ERR_PTR(-ENOMEM); + iocb->hdr = hdr; + iocb->localio = localio; + + nfs_local_iters_init(iocb, rw); + + return iocb; +} + +int nfs_local_doio(struct nfs_client *clp, struct nfsd_file *localio, + struct nfs_pgio_header *hdr, + const struct rpc_call_ops *call_ops) +{ + struct nfs_local_kiocb *iocb; + int status = 0; + + if (!hdr->args.count) + return 0; + + iocb = nfs_local_iocb_init(hdr, localio); + if (IS_ERR(iocb)) + return PTR_ERR(iocb); + + switch (hdr->rw_mode) { + case FMODE_READ: + status = nfs_local_do_read(iocb, call_ops); + break; + case FMODE_WRITE: + status = nfs_local_do_write(iocb, call_ops); + break; + default: + dprintk("%s: invalid mode: %d\n", __func__, + hdr->rw_mode); + status = -EOPNOTSUPP; + } + + if (status != 0) { + if (status == -EAGAIN) + nfs_localio_disable_client(clp); + nfs_local_iocb_release(iocb); + hdr->task.tk_status = status; + nfs_local_hdr_release(hdr, call_ops); + } + return status; +} + +static void +nfs_local_init_commit(struct nfs_commit_data *data, + const struct rpc_call_ops *call_ops) +{ + data->task.tk_ops = call_ops; +} + +static int +nfs_local_run_commit(struct file *filp, struct nfs_commit_data *data) +{ + loff_t start = data->args.offset; + loff_t end = LLONG_MAX; + + if (data->args.count > 0) { + end = start + data->args.count - 1; + if (end < start) + end = LLONG_MAX; + } + + dprintk("%s: commit %llu - %llu\n", __func__, start, end); + return vfs_fsync_range(filp, start, end, 0); +} + +static void +nfs_local_commit_done(struct nfs_commit_data *data, int status) +{ + if (status >= 0) { + nfs_set_local_verifier(data->inode, + data->res.verf, + NFS_FILE_SYNC); + data->res.op_status = NFS4_OK; + data->task.tk_status = 0; + } else { + nfs_reset_boot_verifier(data->inode); + data->res.op_status = nfs_localio_errno_to_nfs4_stat(status); + data->task.tk_status = status; + } +} + +static void +nfs_local_release_commit_data(struct nfsd_file *localio, + struct nfs_commit_data *data, + const struct rpc_call_ops *call_ops) +{ + nfs_local_file_put(localio); + call_ops->rpc_call_done(&data->task, data); + call_ops->rpc_release(data); +} + +static void +nfs_local_fsync_ctx_free(struct nfs_local_fsync_ctx *ctx) +{ + nfs_local_release_commit_data(ctx->localio, ctx->data, + ctx->data->task.tk_ops); + kfree(ctx); +} + +static void +nfs_local_fsync_work(struct work_struct *work) +{ + struct nfs_local_fsync_ctx *ctx; + int status; + + ctx = container_of(work, struct nfs_local_fsync_ctx, work); + + status = nfs_local_run_commit(nfs_to->nfsd_file_file(ctx->localio), + ctx->data); + nfs_local_commit_done(ctx->data, status); + if (ctx->done != NULL) + complete(ctx->done); + nfs_local_fsync_ctx_free(ctx); +} + +static struct nfs_local_fsync_ctx * +nfs_local_fsync_ctx_alloc(struct nfs_commit_data *data, + struct nfsd_file *localio, gfp_t flags) +{ + struct nfs_local_fsync_ctx *ctx = kmalloc(sizeof(*ctx), flags); + + if (ctx != NULL) { + ctx->localio = localio; + ctx->data = data; + INIT_WORK(&ctx->work, nfs_local_fsync_work); + ctx->done = NULL; + } + return ctx; +} + +int nfs_local_commit(struct nfsd_file *localio, + struct nfs_commit_data *data, + const struct rpc_call_ops *call_ops, int how) +{ + struct nfs_local_fsync_ctx *ctx; + + ctx = nfs_local_fsync_ctx_alloc(data, localio, GFP_KERNEL); + if (!ctx) { + nfs_local_commit_done(data, -ENOMEM); + nfs_local_release_commit_data(localio, data, call_ops); + return -ENOMEM; + } + + nfs_local_init_commit(data, call_ops); + + if (how & FLUSH_SYNC) { + DECLARE_COMPLETION_ONSTACK(done); + ctx->done = &done; + queue_work(nfsiod_workqueue, &ctx->work); + wait_for_completion(&done); + } else + queue_work(nfsiod_workqueue, &ctx->work); + + return 0; +} diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c index 60bad882c123..db8dfb920394 100644 --- a/fs/nfs/mount_clnt.c +++ b/fs/nfs/mount_clnt.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * In-kernel MOUNT protocol client * @@ -28,9 +29,8 @@ */ #define encode_dirpath_sz (1 + XDR_QUADLEN(MNTPATHLEN)) #define MNT_status_sz (1) -#define MNT_fhs_status_sz (1) #define MNT_fhandle_sz XDR_QUADLEN(NFS2_FHSIZE) -#define MNT_fhandle3_sz (1 + XDR_QUADLEN(NFS3_FHSIZE)) +#define MNT_fhandlev3_sz XDR_QUADLEN(NFS3_FHSIZE) #define MNT_authflav3_sz (1 + NFS_MAX_SECFLAVORS) /* @@ -38,7 +38,7 @@ */ #define MNT_enc_dirpath_sz encode_dirpath_sz #define MNT_dec_mountres_sz (MNT_status_sz + MNT_fhandle_sz) -#define MNT_dec_mountres3_sz (MNT_status_sz + MNT_fhandle_sz + \ +#define MNT_dec_mountres3_sz (MNT_status_sz + MNT_fhandlev3_sz + \ MNT_authflav3_sz) /* @@ -128,22 +128,19 @@ struct mountres { rpc_authflavor_t *auth_flavors; }; -struct mnt_fhstatus { - u32 status; - struct nfs_fh *fh; -}; - /** * nfs_mount - Obtain an NFS file handle for the given host and path * @info: pointer to mount request arguments + * @timeo: deciseconds the mount waits for a response before it retries + * @retrans: number of times the mount retries a request * - * Uses default timeout parameters specified by underlying transport. On - * successful return, the auth_flavs list and auth_flav_len will be populated - * with the list from the server or a faked-up list if the server didn't - * provide one. + * Uses timeout parameters specified by caller. On successful return, the + * auth_flavs list and auth_flav_len will be populated with the list from the + * server or a faked-up list if the server didn't provide one. */ -int nfs_mount(struct nfs_mount_request *info) +int nfs_mount(struct nfs_mount_request *info, int timeo, int retrans) { + struct rpc_timeout mnt_timeout; struct mountres result = { .fh = info->fh, .auth_count = info->auth_flav_len, @@ -156,12 +153,14 @@ int nfs_mount(struct nfs_mount_request *info) struct rpc_create_args args = { .net = info->net, .protocol = info->protocol, - .address = info->sap, + .address = (struct sockaddr *)info->sap, .addrsize = info->salen, + .timeout = &mnt_timeout, .servername = info->hostname, .program = &mnt_program, .version = info->version, .authflavor = RPC_AUTH_UNIX, + .cred = current_cred(), }; struct rpc_clnt *mnt_clnt; int status; @@ -176,6 +175,7 @@ int nfs_mount(struct nfs_mount_request *info) if (info->noresvport) args.flags |= RPC_CLNT_CREATE_NONPRIVPORT; + nfs_init_timeout_values(&mnt_timeout, info->protocol, timeo, retrans); mnt_clnt = rpc_create(&args); if (IS_ERR(mnt_clnt)) goto out_clnt_err; @@ -223,73 +223,6 @@ out_mnt_err: goto out; } -/** - * nfs_umount - Notify a server that we have unmounted this export - * @info: pointer to umount request arguments - * - * MOUNTPROC_UMNT is advisory, so we set a short timeout, and always - * use UDP. - */ -void nfs_umount(const struct nfs_mount_request *info) -{ - static const struct rpc_timeout nfs_umnt_timeout = { - .to_initval = 1 * HZ, - .to_maxval = 3 * HZ, - .to_retries = 2, - }; - struct rpc_create_args args = { - .net = info->net, - .protocol = IPPROTO_UDP, - .address = info->sap, - .addrsize = info->salen, - .timeout = &nfs_umnt_timeout, - .servername = info->hostname, - .program = &mnt_program, - .version = info->version, - .authflavor = RPC_AUTH_UNIX, - .flags = RPC_CLNT_CREATE_NOPING, - }; - struct rpc_message msg = { - .rpc_argp = info->dirpath, - }; - struct rpc_clnt *clnt; - int status; - - if (strlen(info->dirpath) > MNTPATHLEN) - return; - - if (info->noresvport) - args.flags |= RPC_CLNT_CREATE_NONPRIVPORT; - - clnt = rpc_create(&args); - if (IS_ERR(clnt)) - goto out_clnt_err; - - dprintk("NFS: sending UMNT request for %s:%s\n", - (info->hostname ? info->hostname : "server"), info->dirpath); - - if (info->version == NFS_MNT3_VERSION) - msg.rpc_proc = &clnt->cl_procinfo[MOUNTPROC3_UMNT]; - else - msg.rpc_proc = &clnt->cl_procinfo[MOUNTPROC_UMNT]; - - status = rpc_call_sync(clnt, &msg, 0); - rpc_shutdown_client(clnt); - - if (unlikely(status < 0)) - goto out_call_err; - - return; - -out_clnt_err: - dprintk("NFS: failed to create UMNT RPC client, status=%ld\n", - PTR_ERR(clnt)); - return; - -out_call_err: - dprintk("NFS: UMNT request failed, status=%d\n", status); -} - /* * XDR encode/decode functions for MOUNT */ diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c index e5686be67be8..5a4d193da1a9 100644 --- a/fs/nfs/namespace.c +++ b/fs/nfs/namespace.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * linux/fs/nfs/namespace.c * @@ -18,6 +19,7 @@ #include <linux/vfs.h> #include <linux/sunrpc/gss_api.h> #include "internal.h" +#include "nfs.h" #define NFSDBG_FACILITY NFSDBG_VFS @@ -30,9 +32,9 @@ int nfs_mountpoint_expiry_timeout = 500 * HZ; /* * nfs_path - reconstruct the path given an arbitrary dentry * @base - used to return pointer to the end of devname part of path - * @dentry - pointer to dentry + * @dentry_in - pointer to dentry * @buffer - result buffer - * @buflen - length of buffer + * @buflen_in - length of buffer * @flags - options (see below) * * Helper function for constructing the server pathname @@ -47,15 +49,19 @@ int nfs_mountpoint_expiry_timeout = 500 * HZ; * the original device (export) name * (if unset, the original name is returned verbatim) */ -char *nfs_path(char **p, struct dentry *dentry, char *buffer, ssize_t buflen, - unsigned flags) +char *nfs_path(char **p, struct dentry *dentry_in, char *buffer, + ssize_t buflen_in, unsigned flags) { char *end; int namelen; unsigned seq; const char *base; + struct dentry *dentry; + ssize_t buflen; rename_retry: + buflen = buflen_in; + dentry = dentry_in; end = buffer+buflen; *--end = '\0'; buflen--; @@ -138,49 +144,87 @@ EXPORT_SYMBOL_GPL(nfs_path); */ struct vfsmount *nfs_d_automount(struct path *path) { - struct vfsmount *mnt; - struct nfs_server *server = NFS_SERVER(d_inode(path->dentry)); - struct nfs_fh *fh = NULL; - struct nfs_fattr *fattr = NULL; + struct nfs_fs_context *ctx; + struct fs_context *fc; + struct vfsmount *mnt = ERR_PTR(-ENOMEM); + struct nfs_server *server = NFS_SB(path->dentry->d_sb); + struct nfs_client *client = server->nfs_client; + int timeout = READ_ONCE(nfs_mountpoint_expiry_timeout); + int ret; if (IS_ROOT(path->dentry)) return ERR_PTR(-ESTALE); - mnt = ERR_PTR(-ENOMEM); - fh = nfs_alloc_fhandle(); - fattr = nfs_alloc_fattr(); - if (fh == NULL || fattr == NULL) - goto out; + /* Open a new filesystem context, transferring parameters from the + * parent superblock, including the network namespace. + */ + fc = fs_context_for_submount(path->mnt->mnt_sb->s_type, path->dentry); + if (IS_ERR(fc)) + return ERR_CAST(fc); - mnt = server->nfs_client->rpc_ops->submount(server, path->dentry, fh, fattr); + ctx = nfs_fc2context(fc); + ctx->clone_data.dentry = path->dentry; + ctx->clone_data.sb = path->dentry->d_sb; + ctx->clone_data.fattr = nfs_alloc_fattr(); + if (!ctx->clone_data.fattr) + goto out_fc; + + if (fc->net_ns != client->cl_net) { + put_net(fc->net_ns); + fc->net_ns = get_net(client->cl_net); + } + + /* for submounts we want the same server; referrals will reassign */ + memcpy(&ctx->nfs_server._address, &client->cl_addr, client->cl_addrlen); + ctx->nfs_server.addrlen = client->cl_addrlen; + ctx->nfs_server.port = server->port; + + ctx->version = client->rpc_ops->version; + ctx->minorversion = client->cl_minorversion; + ctx->nfs_mod = client->cl_nfs_mod; + get_nfs_version(ctx->nfs_mod); + + ret = client->rpc_ops->submount(fc, server); + if (ret < 0) { + mnt = ERR_PTR(ret); + goto out_fc; + } + + up_write(&fc->root->d_sb->s_umount); + mnt = vfs_create_mount(fc); if (IS_ERR(mnt)) - goto out; + goto out_fc; + + if (timeout <= 0) + goto out_fc; - mntget(mnt); /* prevent immediate expiration */ mnt_set_expiry(mnt, &nfs_automount_list); - schedule_delayed_work(&nfs_automount_task, nfs_mountpoint_expiry_timeout); + schedule_delayed_work(&nfs_automount_task, timeout); -out: - nfs_free_fattr(fattr); - nfs_free_fhandle(fh); +out_fc: + put_fs_context(fc); return mnt; } static int -nfs_namespace_getattr(const struct path *path, struct kstat *stat, - u32 request_mask, unsigned int query_flags) +nfs_namespace_getattr(struct mnt_idmap *idmap, + const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int query_flags) { if (NFS_FH(d_inode(path->dentry))->size != 0) - return nfs_getattr(path, stat, request_mask, query_flags); - generic_fillattr(d_inode(path->dentry), stat); + return nfs_getattr(idmap, path, stat, request_mask, + query_flags); + generic_fillattr(&nop_mnt_idmap, request_mask, d_inode(path->dentry), + stat); return 0; } static int -nfs_namespace_setattr(struct dentry *dentry, struct iattr *attr) +nfs_namespace_setattr(struct mnt_idmap *idmap, struct dentry *dentry, + struct iattr *attr) { if (NFS_FH(d_inode(dentry))->size != 0) - return nfs_setattr(dentry, attr); + return nfs_setattr(idmap, dentry, attr); return -EACCES; } @@ -197,10 +241,11 @@ const struct inode_operations nfs_referral_inode_operations = { static void nfs_expire_automounts(struct work_struct *work) { struct list_head *list = &nfs_automount_list; + int timeout = READ_ONCE(nfs_mountpoint_expiry_timeout); mark_mounts_for_expiry(list); - if (!list_empty(list)) - schedule_delayed_work(&nfs_automount_task, nfs_mountpoint_expiry_timeout); + if (!list_empty(list) && timeout > 0) + schedule_delayed_work(&nfs_automount_task, timeout); } void nfs_release_automount_timer(void) @@ -209,64 +254,117 @@ void nfs_release_automount_timer(void) cancel_delayed_work(&nfs_automount_task); } -/* - * Clone a mountpoint of the appropriate type - */ -static struct vfsmount *nfs_do_clone_mount(struct nfs_server *server, - const char *devname, - struct nfs_clone_mount *mountdata) -{ - return vfs_submount(mountdata->dentry, &nfs_xdev_fs_type, devname, mountdata); -} - /** * nfs_do_submount - set up mountpoint when crossing a filesystem boundary - * @dentry - parent directory - * @fh - filehandle for new root dentry - * @fattr - attributes for new root inode - * @authflavor - security flavor to use when performing the mount + * @fc: pointer to struct nfs_fs_context * */ -struct vfsmount *nfs_do_submount(struct dentry *dentry, struct nfs_fh *fh, - struct nfs_fattr *fattr, rpc_authflavor_t authflavor) +int nfs_do_submount(struct fs_context *fc) { - struct nfs_clone_mount mountdata = { - .sb = dentry->d_sb, - .dentry = dentry, - .fh = fh, - .fattr = fattr, - .authflavor = authflavor, - }; - struct vfsmount *mnt; - char *page = (char *) __get_free_page(GFP_USER); - char *devname; - - if (page == NULL) - return ERR_PTR(-ENOMEM); - - devname = nfs_devname(dentry, page, PAGE_SIZE); - if (IS_ERR(devname)) - mnt = ERR_CAST(devname); - else - mnt = nfs_do_clone_mount(NFS_SB(dentry->d_sb), devname, &mountdata); - - free_page((unsigned long)page); - return mnt; + struct nfs_fs_context *ctx = nfs_fc2context(fc); + struct dentry *dentry = ctx->clone_data.dentry; + struct nfs_server *server; + char *buffer, *p; + int ret; + + /* create a new volume representation */ + server = ctx->nfs_mod->rpc_ops->clone_server(NFS_SB(ctx->clone_data.sb), + ctx->mntfh, + ctx->clone_data.fattr, + ctx->selected_flavor); + + if (IS_ERR(server)) + return PTR_ERR(server); + + ctx->server = server; + + buffer = kmalloc(4096, GFP_USER); + if (!buffer) + return -ENOMEM; + + ctx->internal = true; + ctx->clone_data.inherited_bsize = ctx->clone_data.sb->s_blocksize_bits; + + p = nfs_devname(dentry, buffer, 4096); + if (IS_ERR(p)) { + nfs_errorf(fc, "NFS: Couldn't determine submount pathname"); + ret = PTR_ERR(p); + } else { + ret = vfs_parse_fs_qstr(fc, "source", + &QSTR_LEN(p, buffer + 4096 - p)); + if (!ret) + ret = vfs_get_tree(fc); + } + kfree(buffer); + return ret; } EXPORT_SYMBOL_GPL(nfs_do_submount); -struct vfsmount *nfs_submount(struct nfs_server *server, struct dentry *dentry, - struct nfs_fh *fh, struct nfs_fattr *fattr) +int nfs_submount(struct fs_context *fc, struct nfs_server *server) { - int err; + struct nfs_fs_context *ctx = nfs_fc2context(fc); + struct dentry *dentry = ctx->clone_data.dentry; struct dentry *parent = dget_parent(dentry); + int err; /* Look it up again to get its attributes */ - err = server->nfs_client->rpc_ops->lookup(d_inode(parent), &dentry->d_name, fh, fattr, NULL); + err = server->nfs_client->rpc_ops->lookup(d_inode(parent), dentry, &dentry->d_name, + ctx->mntfh, ctx->clone_data.fattr); dput(parent); if (err != 0) - return ERR_PTR(err); + return err; - return nfs_do_submount(dentry, fh, fattr, server->client->cl_auth->au_flavor); + ctx->selected_flavor = server->client->cl_auth->au_flavor; + return nfs_do_submount(fc); } EXPORT_SYMBOL_GPL(nfs_submount); + +static int param_set_nfs_timeout(const char *val, const struct kernel_param *kp) +{ + long num; + int ret; + + if (!val) + return -EINVAL; + ret = kstrtol(val, 0, &num); + if (ret) + return -EINVAL; + if (num > 0) { + if (num >= INT_MAX / HZ) + num = INT_MAX; + else + num *= HZ; + *((int *)kp->arg) = num; + if (!list_empty(&nfs_automount_list)) + mod_delayed_work(system_percpu_wq, &nfs_automount_task, num); + } else { + *((int *)kp->arg) = -1*HZ; + cancel_delayed_work(&nfs_automount_task); + } + return 0; +} + +static int param_get_nfs_timeout(char *buffer, const struct kernel_param *kp) +{ + long num = *((int *)kp->arg); + + if (num > 0) { + if (num >= INT_MAX - (HZ - 1)) + num = INT_MAX / HZ; + else + num = (num + (HZ - 1)) / HZ; + } else + num = -1; + return sysfs_emit(buffer, "%li\n", num); +} + +static const struct kernel_param_ops param_ops_nfs_timeout = { + .set = param_set_nfs_timeout, + .get = param_get_nfs_timeout, +}; +#define param_check_nfs_timeout(name, p) __param_check(name, p, int) + +module_param(nfs_mountpoint_expiry_timeout, nfs_timeout, 0644); +MODULE_PARM_DESC(nfs_mountpoint_expiry_timeout, + "Set the NFS automounted mountpoint timeout value (seconds)." + "Values <= 0 turn expiration off."); diff --git a/fs/nfs/netns.h b/fs/nfs/netns.h index 5fbd2bde91ba..6ba3ea39e928 100644 --- a/fs/nfs/netns.h +++ b/fs/nfs/netns.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * NFS-private data for each "struct net". Accessed with net_generic(). */ @@ -8,12 +9,15 @@ #include <linux/nfs4.h> #include <net/net_namespace.h> #include <net/netns/generic.h> +#include <linux/sunrpc/stats.h> struct bl_dev_msg { int32_t status; uint32_t major, minor; }; +struct nfs_netns_client; + struct nfs_net { struct cache_detail *nfs_dns_resolve; struct rpc_pipe *bl_device_pipe; @@ -27,9 +31,15 @@ struct nfs_net { unsigned short nfs_callback_tcpport; unsigned short nfs_callback_tcpport6; int cb_users[NFS4_MAX_MINOR_VERSION + 1]; -#endif +#endif /* CONFIG_NFS_V4 */ +#if IS_ENABLED(CONFIG_NFS_V4_1) + struct list_head nfs4_data_server_cache; + spinlock_t nfs4_data_server_lock; +#endif /* CONFIG_NFS_V4_1 */ + struct nfs_netns_client *nfs_client; spinlock_t nfs_client_lock; ktime_t boot_time; + struct rpc_stat rpcstats; #ifdef CONFIG_PROC_FS struct proc_dir_entry *proc_nfsfs; #endif diff --git a/fs/nfs/nfs.h b/fs/nfs/nfs.h index 43679df56cd0..8a5f51be013a 100644 --- a/fs/nfs/nfs.h +++ b/fs/nfs/nfs.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (c) 2012 Netapp, Inc. All rights reserved. * @@ -17,11 +18,11 @@ struct nfs_subversion { const struct rpc_version *rpc_vers; /* NFS version information */ const struct nfs_rpc_ops *rpc_ops; /* NFS operations */ const struct super_operations *sops; /* NFS Super operations */ - const struct xattr_handler **xattr; /* NFS xattr handlers */ - struct list_head list; /* List of NFS versions */ + const struct xattr_handler * const *xattr; /* NFS xattr handlers */ }; -struct nfs_subversion *get_nfs_version(unsigned int); +struct nfs_subversion *find_nfs_version(unsigned int); +int get_nfs_version(struct nfs_subversion *); void put_nfs_version(struct nfs_subversion *); void register_nfs_version(struct nfs_subversion *); void unregister_nfs_version(struct nfs_subversion *); diff --git a/fs/nfs/nfs2super.c b/fs/nfs/nfs2super.c index 0a9782c9171a..b1badc70bd71 100644 --- a/fs/nfs/nfs2super.c +++ b/fs/nfs/nfs2super.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2012 Netapp, Inc. All rights reserved. */ @@ -25,6 +26,7 @@ static void __exit exit_nfs_v2(void) unregister_nfs_version(&nfs_v2); } +MODULE_DESCRIPTION("NFSv2 client support"); MODULE_LICENSE("GPL"); module_init(init_nfs_v2); diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c index fe68dabfbde6..9eff09158518 100644 --- a/fs/nfs/nfs2xdr.c +++ b/fs/nfs/nfs2xdr.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/nfs/nfs2xdr.c * @@ -21,17 +22,17 @@ #include <linux/nfs.h> #include <linux/nfs2.h> #include <linux/nfs_fs.h> +#include <linux/nfs_common.h> #include "internal.h" +#include "nfstrace.h" #define NFSDBG_FACILITY NFSDBG_XDR -/* Mapping from NFS error code to "errno" error code. */ -#define errno_NFSERR_IO EIO - /* * Declare the space requirements for NFS arguments and replies as * number of 32bit-words */ +#define NFS_pagepad_sz (1) /* Page padding */ #define NFS_fhandle_sz (8) #define NFS_sattr_sz (8) #define NFS_filename_sz (1+(NFS2_MAXNAMLEN>>2)) @@ -54,41 +55,13 @@ #define NFS_attrstat_sz (1+NFS_fattr_sz) #define NFS_diropres_sz (1+NFS_fhandle_sz+NFS_fattr_sz) -#define NFS_readlinkres_sz (2) -#define NFS_readres_sz (1+NFS_fattr_sz+1) +#define NFS_readlinkres_sz (2+NFS_pagepad_sz) +#define NFS_readres_sz (1+NFS_fattr_sz+1+NFS_pagepad_sz) #define NFS_writeres_sz (NFS_attrstat_sz) #define NFS_stat_sz (1) -#define NFS_readdirres_sz (1) +#define NFS_readdirres_sz (1+NFS_pagepad_sz) #define NFS_statfsres_sz (1+NFS_info_sz) -static int nfs_stat_to_errno(enum nfs_stat); - -/* - * While encoding arguments, set up the reply buffer in advance to - * receive reply data directly into the page cache. - */ -static void prepare_reply_buffer(struct rpc_rqst *req, struct page **pages, - unsigned int base, unsigned int len, - unsigned int bufsize) -{ - struct rpc_auth *auth = req->rq_cred->cr_auth; - unsigned int replen; - - replen = RPC_REPHDRSIZE + auth->au_rslack + bufsize; - xdr_inline_pages(&req->rq_rcv_buf, replen << 2, pages, base, len); -} - -/* - * Handle decode buffer overflows out-of-line. - */ -static void print_overflow_msg(const char *func, const struct xdr_stream *xdr) -{ - dprintk("NFS: %s prematurely hit the end of our receive buffer. " - "Remaining buffer length is %tu words.\n", - func, xdr->end - xdr->p); -} - - /* * Encode/decode NFSv2 basic data types * @@ -100,6 +73,20 @@ static void print_overflow_msg(const char *func, const struct xdr_stream *xdr) * or decoded inline. */ +static struct user_namespace *rpc_userns(const struct rpc_clnt *clnt) +{ + if (clnt && clnt->cl_cred) + return clnt->cl_cred->user_ns; + return &init_user_ns; +} + +static struct user_namespace *rpc_rqst_userns(const struct rpc_rqst *rqstp) +{ + if (rqstp->rq_task) + return rpc_userns(rqstp->rq_task->tk_client); + return &init_user_ns; +} + /* * typedef opaque nfsdata<>; */ @@ -109,8 +96,8 @@ static int decode_nfsdata(struct xdr_stream *xdr, struct nfs_pgio_res *result) __be32 *p; p = xdr_inline_decode(xdr, 4); - if (unlikely(p == NULL)) - goto out_overflow; + if (unlikely(!p)) + return -EIO; count = be32_to_cpup(p); recvd = xdr_read_pages(xdr, count); if (unlikely(count > recvd)) @@ -124,9 +111,6 @@ out_cheating: "count %u > recvd %u\n", count, recvd); count = recvd; goto out; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } /* @@ -156,13 +140,16 @@ static int decode_stat(struct xdr_stream *xdr, enum nfs_stat *status) __be32 *p; p = xdr_inline_decode(xdr, 4); - if (unlikely(p == NULL)) - goto out_overflow; + if (unlikely(!p)) + return -EIO; + if (unlikely(*p != cpu_to_be32(NFS_OK))) + goto out_status; + *status = 0; + return 0; +out_status: *status = be32_to_cpup(p); + trace_nfs_xdr_status(xdr, (int)*status); return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } /* @@ -204,14 +191,11 @@ static int decode_fhandle(struct xdr_stream *xdr, struct nfs_fh *fh) __be32 *p; p = xdr_inline_decode(xdr, NFS2_FHSIZE); - if (unlikely(p == NULL)) - goto out_overflow; + if (unlikely(!p)) + return -EIO; fh->size = NFS2_FHSIZE; memcpy(fh->data, p, NFS2_FHSIZE); return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } /* @@ -222,9 +206,9 @@ out_overflow: * unsigned int useconds; * }; */ -static __be32 *xdr_encode_time(__be32 *p, const struct timespec *timep) +static __be32 *xdr_encode_time(__be32 *p, const struct timespec64 *timep) { - *p++ = cpu_to_be32(timep->tv_sec); + *p++ = cpu_to_be32((u32)timep->tv_sec); if (timep->tv_nsec != 0) *p++ = cpu_to_be32(timep->tv_nsec / NSEC_PER_USEC); else @@ -240,14 +224,14 @@ static __be32 *xdr_encode_time(__be32 *p, const struct timespec *timep) * Illustrated" by Brent Callaghan, Addison-Wesley, ISBN 0-201-32750-5. */ static __be32 *xdr_encode_current_server_time(__be32 *p, - const struct timespec *timep) + const struct timespec64 *timep) { *p++ = cpu_to_be32(timep->tv_sec); *p++ = cpu_to_be32(1000000); return p; } -static __be32 *xdr_decode_time(__be32 *p, struct timespec *timep) +static __be32 *xdr_decode_time(__be32 *p, struct timespec64 *timep) { timep->tv_sec = be32_to_cpup(p++); timep->tv_nsec = be32_to_cpup(p++) * NSEC_PER_USEC; @@ -275,14 +259,15 @@ static __be32 *xdr_decode_time(__be32 *p, struct timespec *timep) * }; * */ -static int decode_fattr(struct xdr_stream *xdr, struct nfs_fattr *fattr) +static int decode_fattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, + struct user_namespace *userns) { u32 rdev, type; __be32 *p; p = xdr_inline_decode(xdr, NFS_fattr_sz << 2); - if (unlikely(p == NULL)) - goto out_overflow; + if (unlikely(!p)) + return -EIO; fattr->valid |= NFS_ATTR_FATTR_V2; @@ -290,10 +275,10 @@ static int decode_fattr(struct xdr_stream *xdr, struct nfs_fattr *fattr) fattr->mode = be32_to_cpup(p++); fattr->nlink = be32_to_cpup(p++); - fattr->uid = make_kuid(&init_user_ns, be32_to_cpup(p++)); + fattr->uid = make_kuid(userns, be32_to_cpup(p++)); if (!uid_valid(fattr->uid)) goto out_uid; - fattr->gid = make_kgid(&init_user_ns, be32_to_cpup(p++)); + fattr->gid = make_kgid(userns, be32_to_cpup(p++)); if (!gid_valid(fattr->gid)) goto out_gid; @@ -324,9 +309,6 @@ out_uid: out_gid: dprintk("NFS: returned invalid gid\n"); return -EINVAL; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } /* @@ -351,7 +333,8 @@ static __be32 *xdr_time_not_set(__be32 *p) return p; } -static void encode_sattr(struct xdr_stream *xdr, const struct iattr *attr) +static void encode_sattr(struct xdr_stream *xdr, const struct iattr *attr, + struct user_namespace *userns) { __be32 *p; @@ -362,11 +345,11 @@ static void encode_sattr(struct xdr_stream *xdr, const struct iattr *attr) else *p++ = cpu_to_be32(NFS2_SATTR_NOT_SET); if (attr->ia_valid & ATTR_UID) - *p++ = cpu_to_be32(from_kuid(&init_user_ns, attr->ia_uid)); + *p++ = cpu_to_be32(from_kuid_munged(userns, attr->ia_uid)); else *p++ = cpu_to_be32(NFS2_SATTR_NOT_SET); if (attr->ia_valid & ATTR_GID) - *p++ = cpu_to_be32(from_kgid(&init_user_ns, attr->ia_gid)); + *p++ = cpu_to_be32(from_kgid_munged(userns, attr->ia_gid)); else *p++ = cpu_to_be32(NFS2_SATTR_NOT_SET); if (attr->ia_valid & ATTR_SIZE) @@ -410,23 +393,20 @@ static int decode_filename_inline(struct xdr_stream *xdr, u32 count; p = xdr_inline_decode(xdr, 4); - if (unlikely(p == NULL)) - goto out_overflow; + if (unlikely(!p)) + return -EIO; count = be32_to_cpup(p); if (count > NFS3_MAXNAMLEN) goto out_nametoolong; p = xdr_inline_decode(xdr, count); - if (unlikely(p == NULL)) - goto out_overflow; + if (unlikely(!p)) + return -EIO; *name = (const char *)p; *length = count; return 0; out_nametoolong: dprintk("NFS: returned filename too long: %u\n", count); return -ENAMETOOLONG; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } /* @@ -449,8 +429,8 @@ static int decode_path(struct xdr_stream *xdr) __be32 *p; p = xdr_inline_decode(xdr, 4); - if (unlikely(p == NULL)) - goto out_overflow; + if (unlikely(!p)) + return -EIO; length = be32_to_cpup(p); if (unlikely(length >= xdr->buf->page_len || length > NFS_MAXPATHLEN)) goto out_size; @@ -466,9 +446,6 @@ out_cheating: dprintk("NFS: server cheating in pathname result: " "length %u > received %u\n", length, recvd); return -EIO; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } /* @@ -482,7 +459,8 @@ out_overflow: * }; */ static int decode_attrstat(struct xdr_stream *xdr, struct nfs_fattr *result, - __u32 *op_status) + __u32 *op_status, + struct user_namespace *userns) { enum nfs_stat status; int error; @@ -494,7 +472,7 @@ static int decode_attrstat(struct xdr_stream *xdr, struct nfs_fattr *result, *op_status = status; if (status != NFS_OK) goto out_default; - error = decode_fattr(xdr, result); + error = decode_fattr(xdr, result, userns); out: return error; out_default: @@ -529,19 +507,21 @@ static void encode_diropargs(struct xdr_stream *xdr, const struct nfs_fh *fh, * void; * }; */ -static int decode_diropok(struct xdr_stream *xdr, struct nfs_diropok *result) +static int decode_diropok(struct xdr_stream *xdr, struct nfs_diropok *result, + struct user_namespace *userns) { int error; error = decode_fhandle(xdr, result->fh); if (unlikely(error)) goto out; - error = decode_fattr(xdr, result->fattr); + error = decode_fattr(xdr, result->fattr, userns); out: return error; } -static int decode_diropres(struct xdr_stream *xdr, struct nfs_diropok *result) +static int decode_diropres(struct xdr_stream *xdr, struct nfs_diropok *result, + struct user_namespace *userns) { enum nfs_stat status; int error; @@ -551,7 +531,7 @@ static int decode_diropres(struct xdr_stream *xdr, struct nfs_diropok *result) goto out; if (status != NFS_OK) goto out_default; - error = decode_diropok(xdr, result); + error = decode_diropok(xdr, result, userns); out: return error; out_default: @@ -590,7 +570,7 @@ static void nfs2_xdr_enc_sattrargs(struct rpc_rqst *req, const struct nfs_sattrargs *args = data; encode_fhandle(xdr, args->fh); - encode_sattr(xdr, args->sattr); + encode_sattr(xdr, args->sattr, rpc_rqst_userns(req)); } static void nfs2_xdr_enc_diropargs(struct rpc_rqst *req, @@ -609,8 +589,8 @@ static void nfs2_xdr_enc_readlinkargs(struct rpc_rqst *req, const struct nfs_readlinkargs *args = data; encode_fhandle(xdr, args->fh); - prepare_reply_buffer(req, args->pages, args->pgbase, - args->pglen, NFS_readlinkres_sz); + rpc_prepare_reply_pages(req, args->pages, args->pgbase, args->pglen, + NFS_readlinkres_sz - NFS_pagepad_sz); } /* @@ -645,8 +625,8 @@ static void nfs2_xdr_enc_readargs(struct rpc_rqst *req, const struct nfs_pgio_args *args = data; encode_readargs(xdr, args); - prepare_reply_buffer(req, args->pages, args->pgbase, - args->count, NFS_readres_sz); + rpc_prepare_reply_pages(req, args->pages, args->pgbase, args->count, + NFS_readres_sz - NFS_pagepad_sz); req->rq_rcv_buf.flags |= XDRBUF_READ; } @@ -705,7 +685,7 @@ static void nfs2_xdr_enc_createargs(struct rpc_rqst *req, const struct nfs_createargs *args = data; encode_diropargs(xdr, args->fh, args->name, args->len); - encode_sattr(xdr, args->sattr); + encode_sattr(xdr, args->sattr, rpc_rqst_userns(req)); } static void nfs2_xdr_enc_removeargs(struct rpc_rqst *req, @@ -772,7 +752,7 @@ static void nfs2_xdr_enc_symlinkargs(struct rpc_rqst *req, encode_diropargs(xdr, args->fromfh, args->fromname, args->fromlen); encode_path(xdr, args->pages, args->pathlen); - encode_sattr(xdr, args->sattr); + encode_sattr(xdr, args->sattr, rpc_rqst_userns(req)); } /* @@ -803,8 +783,8 @@ static void nfs2_xdr_enc_readdirargs(struct rpc_rqst *req, const struct nfs_readdirargs *args = data; encode_readdirargs(xdr, args); - prepare_reply_buffer(req, args->pages, 0, - args->count, NFS_readdirres_sz); + rpc_prepare_reply_pages(req, args->pages, 0, args->count, + NFS_readdirres_sz - NFS_pagepad_sz); } /* @@ -834,13 +814,13 @@ out_default: static int nfs2_xdr_dec_attrstat(struct rpc_rqst *req, struct xdr_stream *xdr, void *result) { - return decode_attrstat(xdr, result, NULL); + return decode_attrstat(xdr, result, NULL, rpc_rqst_userns(req)); } static int nfs2_xdr_dec_diropres(struct rpc_rqst *req, struct xdr_stream *xdr, void *result) { - return decode_diropres(xdr, result); + return decode_diropres(xdr, result, rpc_rqst_userns(req)); } /* @@ -895,7 +875,7 @@ static int nfs2_xdr_dec_readres(struct rpc_rqst *req, struct xdr_stream *xdr, result->op_status = status; if (status != NFS_OK) goto out_default; - error = decode_fattr(xdr, result->fattr); + error = decode_fattr(xdr, result->fattr, rpc_rqst_userns(req)); if (unlikely(error)) goto out; error = decode_nfsdata(xdr, result); @@ -912,7 +892,8 @@ static int nfs2_xdr_dec_writeres(struct rpc_rqst *req, struct xdr_stream *xdr, /* All NFSv2 writes are "file sync" writes */ result->verf->committed = NFS_FILE_SYNC; - return decode_attrstat(xdr, result->fattr, &result->op_status); + return decode_attrstat(xdr, result->fattr, &result->op_status, + rpc_rqst_userns(req)); } /** @@ -945,12 +926,12 @@ int nfs2_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, int error; p = xdr_inline_decode(xdr, 4); - if (unlikely(p == NULL)) - goto out_overflow; + if (unlikely(!p)) + return -EAGAIN; if (*p++ == xdr_zero) { p = xdr_inline_decode(xdr, 4); - if (unlikely(p == NULL)) - goto out_overflow; + if (unlikely(!p)) + return -EAGAIN; if (*p++ == xdr_zero) return -EAGAIN; entry->eof = 1; @@ -958,31 +939,26 @@ int nfs2_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, } p = xdr_inline_decode(xdr, 4); - if (unlikely(p == NULL)) - goto out_overflow; + if (unlikely(!p)) + return -EAGAIN; entry->ino = be32_to_cpup(p); error = decode_filename_inline(xdr, &entry->name, &entry->len); if (unlikely(error)) - return error; + return error == -ENAMETOOLONG ? -ENAMETOOLONG : -EAGAIN; /* * The type (size and byte order) of nfscookie isn't defined in * RFC 1094. This implementation assumes that it's an XDR uint32. */ - entry->prev_cookie = entry->cookie; p = xdr_inline_decode(xdr, 4); - if (unlikely(p == NULL)) - goto out_overflow; + if (unlikely(!p)) + return -EAGAIN; entry->cookie = be32_to_cpup(p); entry->d_type = DT_UNKNOWN; return 0; - -out_overflow: - print_overflow_msg(__func__, xdr); - return -EAGAIN; } /* @@ -1046,17 +1022,14 @@ static int decode_info(struct xdr_stream *xdr, struct nfs2_fsstat *result) __be32 *p; p = xdr_inline_decode(xdr, NFS_info_sz << 2); - if (unlikely(p == NULL)) - goto out_overflow; + if (unlikely(!p)) + return -EIO; result->tsize = be32_to_cpup(p++); result->bsize = be32_to_cpup(p++); result->blocks = be32_to_cpup(p++); result->bfree = be32_to_cpup(p++); result->bavail = be32_to_cpup(p); return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int nfs2_xdr_dec_statfsres(struct rpc_rqst *req, struct xdr_stream *xdr, @@ -1077,70 +1050,6 @@ out_default: return nfs_stat_to_errno(status); } - -/* - * We need to translate between nfs status return values and - * the local errno values which may not be the same. - */ -static const struct { - int stat; - int errno; -} nfs_errtbl[] = { - { NFS_OK, 0 }, - { NFSERR_PERM, -EPERM }, - { NFSERR_NOENT, -ENOENT }, - { NFSERR_IO, -errno_NFSERR_IO}, - { NFSERR_NXIO, -ENXIO }, -/* { NFSERR_EAGAIN, -EAGAIN }, */ - { NFSERR_ACCES, -EACCES }, - { NFSERR_EXIST, -EEXIST }, - { NFSERR_XDEV, -EXDEV }, - { NFSERR_NODEV, -ENODEV }, - { NFSERR_NOTDIR, -ENOTDIR }, - { NFSERR_ISDIR, -EISDIR }, - { NFSERR_INVAL, -EINVAL }, - { NFSERR_FBIG, -EFBIG }, - { NFSERR_NOSPC, -ENOSPC }, - { NFSERR_ROFS, -EROFS }, - { NFSERR_MLINK, -EMLINK }, - { NFSERR_NAMETOOLONG, -ENAMETOOLONG }, - { NFSERR_NOTEMPTY, -ENOTEMPTY }, - { NFSERR_DQUOT, -EDQUOT }, - { NFSERR_STALE, -ESTALE }, - { NFSERR_REMOTE, -EREMOTE }, -#ifdef EWFLUSH - { NFSERR_WFLUSH, -EWFLUSH }, -#endif - { NFSERR_BADHANDLE, -EBADHANDLE }, - { NFSERR_NOT_SYNC, -ENOTSYNC }, - { NFSERR_BAD_COOKIE, -EBADCOOKIE }, - { NFSERR_NOTSUPP, -ENOTSUPP }, - { NFSERR_TOOSMALL, -ETOOSMALL }, - { NFSERR_SERVERFAULT, -EREMOTEIO }, - { NFSERR_BADTYPE, -EBADTYPE }, - { NFSERR_JUKEBOX, -EJUKEBOX }, - { -1, -EIO } -}; - -/** - * nfs_stat_to_errno - convert an NFS status code to a local errno - * @status: NFS status code to convert - * - * Returns a local errno value, or -EIO if the NFS status code is - * not recognized. This function is used jointly by NFSv2 and NFSv3. - */ -static int nfs_stat_to_errno(enum nfs_stat status) -{ - int i; - - for (i = 0; nfs_errtbl[i].stat != -1; i++) { - if (nfs_errtbl[i].stat == (int)status) - return nfs_errtbl[i].errno; - } - dprintk("NFS: Unrecognized nfs status value: %u\n", status); - return nfs_errtbl[i].errno; -} - #define PROC(proc, argtype, restype, timer) \ [NFSPROC_##proc] = { \ .p_proc = NFSPROC_##proc, \ diff --git a/fs/nfs/nfs3_fs.h b/fs/nfs/nfs3_fs.h index e134d6548ab7..b333ea119ef5 100644 --- a/fs/nfs/nfs3_fs.h +++ b/fs/nfs/nfs3_fs.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2014 Anna Schumaker. * @@ -10,12 +11,12 @@ * nfs3acl.c */ #ifdef CONFIG_NFS_V3_ACL -extern struct posix_acl *nfs3_get_acl(struct inode *inode, int type); -extern int nfs3_set_acl(struct inode *inode, struct posix_acl *acl, int type); +extern struct posix_acl *nfs3_get_acl(struct inode *inode, int type, bool rcu); +extern int nfs3_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, + struct posix_acl *acl, int type); extern int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl, struct posix_acl *dfacl); extern ssize_t nfs3_listxattr(struct dentry *, char *, size_t); -extern const struct xattr_handler *nfs3_xattr_handlers[]; #else static inline int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl, struct posix_acl *dfacl) @@ -26,7 +27,7 @@ static inline int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl, #endif /* CONFIG_NFS_V3_ACL */ /* nfs3client.c */ -struct nfs_server *nfs3_create_server(struct nfs_mount_info *, struct nfs_subversion *); +struct nfs_server *nfs3_create_server(struct fs_context *); struct nfs_server *nfs3_clone_server(struct nfs_server *, struct nfs_fh *, struct nfs_fattr *, rpc_authflavor_t); diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c index 720d92f5abfb..a126eb31f62f 100644 --- a/fs/nfs/nfs3acl.c +++ b/fs/nfs/nfs3acl.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/fs.h> #include <linux/gfp.h> #include <linux/nfs.h> @@ -20,9 +21,8 @@ static void nfs3_prepare_get_acl(struct posix_acl **p) { struct posix_acl *sentinel = uncached_acl_sentinel(current); - if (cmpxchg(p, ACL_NOT_CACHED, sentinel) != ACL_NOT_CACHED) { - /* Not the first reader or sentinel already in place. */ - } + /* If the ACL isn't being read yet, set our sentinel. */ + cmpxchg(p, ACL_NOT_CACHED, sentinel); } static void nfs3_complete_get_acl(struct posix_acl **p, struct posix_acl *acl) @@ -43,7 +43,7 @@ static void nfs3_abort_get_acl(struct posix_acl **p) cmpxchg(p, sentinel, ACL_NOT_CACHED); } -struct posix_acl *nfs3_get_acl(struct inode *inode, int type) +struct posix_acl *nfs3_get_acl(struct inode *inode, int type, bool rcu) { struct nfs_server *server = NFS_SERVER(inode); struct page *pages[NFSACL_MAXPAGES] = { }; @@ -61,10 +61,13 @@ struct posix_acl *nfs3_get_acl(struct inode *inode, int type) }; int status, count; + if (rcu) + return ERR_PTR(-ECHILD); + if (!nfs_server_capable(inode, NFS_CAP_ACLS)) return ERR_PTR(-EOPNOTSUPP); - status = nfs_revalidate_inode(server, inode); + status = nfs_revalidate_inode(inode, NFS_INO_INVALID_CHANGE); if (status < 0) return ERR_PTR(status); @@ -101,14 +104,16 @@ struct posix_acl *nfs3_get_acl(struct inode *inode, int type) switch (status) { case 0: - status = nfs_refresh_inode(inode, res.fattr); + nfs_refresh_inode(inode, res.fattr); break; case -EPFNOSUPPORT: case -EPROTONOSUPPORT: dprintk("NFS_V3_ACL extension not supported; disabling\n"); server->caps &= ~NFS_CAP_ACLS; + fallthrough; case -ENOTSUPP: status = -EOPNOTSUPP; + goto getout; default: goto getout; } @@ -220,14 +225,13 @@ static int __nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl, switch (status) { case 0: status = nfs_refresh_inode(inode, fattr); - set_cached_acl(inode, ACL_TYPE_ACCESS, acl); - set_cached_acl(inode, ACL_TYPE_DEFAULT, dfacl); break; case -EPFNOSUPPORT: case -EPROTONOSUPPORT: dprintk("NFS_V3_ACL SETACL RPC not supported" "(will not retry)\n"); server->caps &= ~NFS_CAP_ACLS; + fallthrough; case -ENOTSUPP: status = -EOPNOTSUPP; } @@ -250,47 +254,51 @@ int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl, } -int nfs3_set_acl(struct inode *inode, struct posix_acl *acl, int type) +int nfs3_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, + struct posix_acl *acl, int type) { - struct posix_acl *alloc = NULL, *dfacl = NULL; + struct posix_acl *orig = acl, *dfacl = NULL, *alloc; + struct inode *inode = d_inode(dentry); int status; if (S_ISDIR(inode->i_mode)) { switch(type) { case ACL_TYPE_ACCESS: - alloc = dfacl = get_acl(inode, ACL_TYPE_DEFAULT); + alloc = get_inode_acl(inode, ACL_TYPE_DEFAULT); if (IS_ERR(alloc)) goto fail; + dfacl = alloc; break; case ACL_TYPE_DEFAULT: - dfacl = acl; - alloc = acl = get_acl(inode, ACL_TYPE_ACCESS); + alloc = get_inode_acl(inode, ACL_TYPE_ACCESS); if (IS_ERR(alloc)) goto fail; + dfacl = acl; + acl = alloc; break; } } if (acl == NULL) { - alloc = acl = posix_acl_from_mode(inode->i_mode, GFP_KERNEL); + alloc = posix_acl_from_mode(inode->i_mode, GFP_KERNEL); if (IS_ERR(alloc)) goto fail; + acl = alloc; } status = __nfs3_proc_setacls(inode, acl, dfacl); - posix_acl_release(alloc); +out: + if (acl != orig) + posix_acl_release(acl); + if (dfacl != orig) + posix_acl_release(dfacl); return status; fail: - return PTR_ERR(alloc); + status = PTR_ERR(alloc); + goto out; } -const struct xattr_handler *nfs3_xattr_handlers[] = { - &posix_acl_access_xattr_handler, - &posix_acl_default_xattr_handler, - NULL, -}; - static int nfs3_list_one_acl(struct inode *inode, int type, const char *name, void *data, size_t size, ssize_t *result) @@ -298,7 +306,7 @@ nfs3_list_one_acl(struct inode *inode, int type, const char *name, void *data, struct posix_acl *acl; char *p = data + *result; - acl = get_acl(inode, type); + acl = get_inode_acl(inode, type); if (IS_ERR_OR_NULL(acl)) return 0; diff --git a/fs/nfs/nfs3client.c b/fs/nfs/nfs3client.c index 7879f2a0fcfd..5d97c1d38bb6 100644 --- a/fs/nfs/nfs3client.c +++ b/fs/nfs/nfs3client.c @@ -1,8 +1,12 @@ +// SPDX-License-Identifier: GPL-2.0-only #include <linux/nfs_fs.h> #include <linux/nfs_mount.h> #include <linux/sunrpc/addr.h> +#include <net/handshake.h> #include "internal.h" #include "nfs3_fs.h" +#include "netns.h" +#include "sysfs.h" #ifdef CONFIG_NFS_V3_ACL static struct rpc_stat nfsacl_rpcstat = { &nfsacl_program }; @@ -30,6 +34,8 @@ static void nfs_init_server_aclclient(struct nfs_server *server) if (IS_ERR(server->client_acl)) goto out_noacl; + nfs_sysfs_link_rpc_client(server, server->client_acl, NULL); + /* No errors! Assume that Sun nfsacls are supported */ server->caps |= NFS_CAP_ACLS; return; @@ -45,10 +51,10 @@ static inline void nfs_init_server_aclclient(struct nfs_server *server) } #endif -struct nfs_server *nfs3_create_server(struct nfs_mount_info *mount_info, - struct nfs_subversion *nfs_mod) +struct nfs_server *nfs3_create_server(struct fs_context *fc) { - struct nfs_server *server = nfs_create_server(mount_info, nfs_mod); + struct nfs_server *server = nfs_create_server(fc); + /* Create a client RPC handle for the NFS v3 ACL management interface */ if (!IS_ERR(server)) nfs_init_server_aclclient(server); @@ -77,10 +83,11 @@ struct nfs_server *nfs3_clone_server(struct nfs_server *source, * the MDS. */ struct nfs_client *nfs3_set_ds_client(struct nfs_server *mds_srv, - const struct sockaddr *ds_addr, int ds_addrlen, + const struct sockaddr_storage *ds_addr, int ds_addrlen, int ds_proto, unsigned int ds_timeo, unsigned int ds_retrans) { struct rpc_timeout ds_timeout; + unsigned long connect_timeout = ds_timeo * (ds_retrans + 1) * HZ / 10; struct nfs_client *mds_clp = mds_srv->nfs_client; struct nfs_client_initdata cl_init = { .addr = ds_addr, @@ -91,17 +98,42 @@ struct nfs_client *nfs3_set_ds_client(struct nfs_server *mds_srv, .proto = ds_proto, .net = mds_clp->cl_net, .timeparms = &ds_timeout, + .cred = mds_srv->cred, + .xprtsec = { + .policy = RPC_XPRTSEC_NONE, + .cert_serial = TLS_NO_CERT, + .privkey_serial = TLS_NO_PRIVKEY, + }, + .connect_timeout = connect_timeout, + .reconnect_timeout = connect_timeout, }; struct nfs_client *clp; char buf[INET6_ADDRSTRLEN + 1]; /* fake a hostname because lockd wants it */ - if (rpc_ntop(ds_addr, buf, sizeof(buf)) <= 0) + if (rpc_ntop((struct sockaddr *)ds_addr, buf, sizeof(buf)) <= 0) return ERR_PTR(-EINVAL); cl_init.hostname = buf; + switch (ds_proto) { + case XPRT_TRANSPORT_TCP_TLS: + if (mds_clp->cl_xprtsec.policy != RPC_XPRTSEC_NONE) + cl_init.xprtsec = mds_clp->cl_xprtsec; + else + ds_proto = XPRT_TRANSPORT_TCP; + fallthrough; + case XPRT_TRANSPORT_RDMA: + case XPRT_TRANSPORT_TCP: + if (mds_clp->cl_nconnect > 1) + cl_init.nconnect = mds_clp->cl_nconnect; + } + if (mds_srv->flags & NFS_MOUNT_NORESVPORT) - set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags); + __set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags); + if (test_bit(NFS_CS_NETUNREACH_FATAL, &mds_clp->cl_flags)) + __set_bit(NFS_CS_NETUNREACH_FATAL, &cl_init.init_flags); + + __set_bit(NFS_CS_DS, &cl_init.init_flags); /* Use the MDS nfs_client cl_ipaddr. */ nfs_init_timeout_values(&ds_timeout, ds_proto, ds_timeo, ds_retrans); diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index d1e87ec0df84..a4cb67573aa7 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/nfs/nfs3proc.c * @@ -35,9 +36,10 @@ nfs3_rpc_wrapper(struct rpc_clnt *clnt, struct rpc_message *msg, int flags) res = rpc_call_sync(clnt, msg, flags); if (res != -EJUKEBOX) break; - freezable_schedule_timeout_killable_unsafe(NFS_JUKEBOX_RETRY_TIME); + __set_current_state(TASK_KILLABLE|TASK_FREEZABLE_UNSAFE); + schedule_timeout(NFS_JUKEBOX_RETRY_TIME); res = -ERESTARTSYS; - } while (!fatal_signal_pending(current)); + } while (!fatal_signal_pending(current) && !nfs_current_task_exiting()); return res; } @@ -48,8 +50,7 @@ nfs3_async_handle_jukebox(struct rpc_task *task, struct inode *inode) { if (task->tk_status != -EJUKEBOX) return 0; - if (task->tk_status == -EJUKEBOX) - nfs_inc_stats(inode, NFSIOS_DELAY); + nfs_inc_stats(inode, NFSIOS_DELAY); task->tk_status = 0; rpc_restart_call(task); rpc_delay(task, NFS_JUKEBOX_RETRY_TIME); @@ -100,7 +101,7 @@ nfs3_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle, */ static int nfs3_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, - struct nfs_fattr *fattr, struct nfs4_label *label) + struct nfs_fattr *fattr, struct inode *inode) { struct rpc_message msg = { .rpc_proc = &nfs3_procedures[NFS3PROC_GETATTR], @@ -108,10 +109,15 @@ nfs3_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, .rpc_resp = fattr, }; int status; + unsigned short task_flags = 0; + + /* Is this is an attribute revalidation, subject to softreval? */ + if (inode && (server->flags & NFS_MOUNT_SOFTREVAL)) + task_flags |= RPC_TASK_TIMEOUT; dprintk("NFS call getattr\n"); nfs_fattr_init(fattr); - status = rpc_call_sync(server->client, &msg, 0); + status = rpc_call_sync(server->client, &msg, task_flags); dprintk("NFS reply getattr: %d\n", status); return status; } @@ -137,21 +143,24 @@ nfs3_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr, msg.rpc_cred = nfs_file_cred(sattr->ia_file); nfs_fattr_init(fattr); status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); - if (status == 0) + if (status == 0) { nfs_setattr_update_inode(inode, sattr, fattr); + if (NFS_I(inode)->cache_validity & NFS_INO_INVALID_ACL) + nfs_zap_acl_cache(inode); + } dprintk("NFS reply setattr: %d\n", status); return status; } static int -nfs3_proc_lookup(struct inode *dir, const struct qstr *name, - struct nfs_fh *fhandle, struct nfs_fattr *fattr, - struct nfs4_label *label) +__nfs3_proc_lookup(struct inode *dir, const char *name, size_t len, + struct nfs_fh *fhandle, struct nfs_fattr *fattr, + unsigned short task_flags) { struct nfs3_diropargs arg = { .fh = NFS_FH(dir), - .name = name->name, - .len = name->len + .name = name, + .len = len }; struct nfs3_diropres res = { .fh = fhandle, @@ -164,56 +173,70 @@ nfs3_proc_lookup(struct inode *dir, const struct qstr *name, }; int status; - dprintk("NFS call lookup %s\n", name->name); res.dir_attr = nfs_alloc_fattr(); if (res.dir_attr == NULL) return -ENOMEM; nfs_fattr_init(fattr); - status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); + status = rpc_call_sync(NFS_CLIENT(dir), &msg, task_flags); nfs_refresh_inode(dir, res.dir_attr); if (status >= 0 && !(fattr->valid & NFS_ATTR_FATTR)) { msg.rpc_proc = &nfs3_procedures[NFS3PROC_GETATTR]; msg.rpc_argp = fhandle; msg.rpc_resp = fattr; - status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); + status = rpc_call_sync(NFS_CLIENT(dir), &msg, task_flags); } nfs_free_fattr(res.dir_attr); dprintk("NFS reply lookup: %d\n", status); return status; } -static int nfs3_proc_access(struct inode *inode, struct nfs_access_entry *entry) +static int +nfs3_proc_lookup(struct inode *dir, struct dentry *dentry, const struct qstr *name, + struct nfs_fh *fhandle, struct nfs_fattr *fattr) +{ + unsigned short task_flags = 0; + + /* Is this is an attribute revalidation, subject to softreval? */ + if (nfs_lookup_is_soft_revalidate(dentry)) + task_flags |= RPC_TASK_TIMEOUT; + + dprintk("NFS call lookup %pd2\n", dentry); + return __nfs3_proc_lookup(dir, name->name, name->len, fhandle, fattr, + task_flags); +} + +static int nfs3_proc_lookupp(struct inode *inode, struct nfs_fh *fhandle, + struct nfs_fattr *fattr) +{ + const char dotdot[] = ".."; + const size_t len = strlen(dotdot); + unsigned short task_flags = 0; + + if (NFS_SERVER(inode)->flags & NFS_MOUNT_SOFTREVAL) + task_flags |= RPC_TASK_TIMEOUT; + + return __nfs3_proc_lookup(inode, dotdot, len, fhandle, fattr, + task_flags); +} + +static int nfs3_proc_access(struct inode *inode, struct nfs_access_entry *entry, + const struct cred *cred) { struct nfs3_accessargs arg = { .fh = NFS_FH(inode), + .access = entry->mask, }; struct nfs3_accessres res; struct rpc_message msg = { .rpc_proc = &nfs3_procedures[NFS3PROC_ACCESS], .rpc_argp = &arg, .rpc_resp = &res, - .rpc_cred = entry->cred, + .rpc_cred = cred, }; - int mode = entry->mask; int status = -ENOMEM; dprintk("NFS call access\n"); - - if (mode & MAY_READ) - arg.access |= NFS3_ACCESS_READ; - if (S_ISDIR(inode->i_mode)) { - if (mode & MAY_WRITE) - arg.access |= NFS3_ACCESS_MODIFY | NFS3_ACCESS_EXTEND | NFS3_ACCESS_DELETE; - if (mode & MAY_EXEC) - arg.access |= NFS3_ACCESS_LOOKUP; - } else { - if (mode & MAY_WRITE) - arg.access |= NFS3_ACCESS_MODIFY | NFS3_ACCESS_EXTEND; - if (mode & MAY_EXEC) - arg.access |= NFS3_ACCESS_EXECUTE; - } - res.fattr = nfs_alloc_fattr(); if (res.fattr == NULL) goto out; @@ -289,15 +312,17 @@ static struct nfs3_createdata *nfs3_alloc_createdata(void) return data; } -static int nfs3_do_create(struct inode *dir, struct dentry *dentry, struct nfs3_createdata *data) +static struct dentry * +nfs3_do_create(struct inode *dir, struct dentry *dentry, struct nfs3_createdata *data) { int status; status = rpc_call_sync(NFS_CLIENT(dir), &data->msg, 0); nfs_post_op_update_inode(dir, data->res.dir_attr); - if (status == 0) - status = nfs_instantiate(dentry, data->res.fh, data->res.fattr, NULL); - return status; + if (status != 0) + return ERR_PTR(status); + + return nfs_add_or_obtain(dentry, data->res.fh, data->res.fattr); } static void nfs3_free_createdata(struct nfs3_createdata *data) @@ -314,6 +339,7 @@ nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, { struct posix_acl *default_acl, *acl; struct nfs3_createdata *data; + struct dentry *d_alias; int status = -ENOMEM; dprintk("NFS call create %pd\n", dentry); @@ -340,7 +366,8 @@ nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, goto out; for (;;) { - status = nfs3_do_create(dir, dentry, data); + d_alias = nfs3_do_create(dir, dentry, data); + status = PTR_ERR_OR_ZERO(d_alias); if (status != -ENOTSUPP) break; @@ -356,7 +383,7 @@ nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, break; case NFS3_CREATE_UNCHECKED: - goto out; + goto out_release_acls; } nfs_fattr_init(data->res.dir_attr); nfs_fattr_init(data->res.fattr); @@ -365,6 +392,9 @@ nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, if (status != 0) goto out_release_acls; + if (d_alias) + dentry = d_alias; + /* When we created the file with exclusive semantics, make * sure we set the attributes afterwards. */ if (data->arg.create.createmode == NFS3_CREATE_EXCLUSIVE) { @@ -382,11 +412,13 @@ nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, nfs_post_op_update_inode(d_inode(dentry), data->res.fattr); dprintk("NFS reply setattr (post-create): %d\n", status); if (status != 0) - goto out_release_acls; + goto out_dput; } status = nfs3_proc_setacls(d_inode(dentry), acl, default_acl); +out_dput: + dput(d_alias); out_release_acls: posix_acl_release(acl); posix_acl_release(default_acl); @@ -397,11 +429,11 @@ out: } static int -nfs3_proc_remove(struct inode *dir, const struct qstr *name) +nfs3_proc_remove(struct inode *dir, struct dentry *dentry) { struct nfs_removeargs arg = { .fh = NFS_FH(dir), - .name = *name, + .name = dentry->d_name, }; struct nfs_removeres res; struct rpc_message msg = { @@ -411,7 +443,7 @@ nfs3_proc_remove(struct inode *dir, const struct qstr *name) }; int status = -ENOMEM; - dprintk("NFS call remove %s\n", name->name); + dprintk("NFS call remove %pd2\n", dentry); res.dir_attr = nfs_alloc_fattr(); if (res.dir_attr == NULL) goto out; @@ -425,7 +457,9 @@ out: } static void -nfs3_proc_unlink_setup(struct rpc_message *msg, struct inode *dir) +nfs3_proc_unlink_setup(struct rpc_message *msg, + struct dentry *dentry, + struct inode *inode) { msg->rpc_proc = &nfs3_procedures[NFS3PROC_REMOVE]; } @@ -447,7 +481,9 @@ nfs3_proc_unlink_done(struct rpc_task *task, struct inode *dir) } static void -nfs3_proc_rename_setup(struct rpc_message *msg, struct inode *dir) +nfs3_proc_rename_setup(struct rpc_message *msg, + struct dentry *old_dentry, + struct dentry *new_dentry) { msg->rpc_proc = &nfs3_procedures[NFS3PROC_RENAME]; } @@ -506,10 +542,12 @@ out: } static int -nfs3_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page, +nfs3_proc_symlink(struct inode *dir, struct dentry *dentry, struct folio *folio, unsigned int len, struct iattr *sattr) { + struct page *page = &folio->page; struct nfs3_createdata *data; + struct dentry *d_alias; int status = -ENOMEM; if (len > NFS3_MAXPATHLEN) @@ -528,7 +566,11 @@ nfs3_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page, data->arg.symlink.pathlen = len; data->arg.symlink.sattr = sattr; - status = nfs3_do_create(dir, dentry, data); + d_alias = nfs3_do_create(dir, dentry, data); + status = PTR_ERR_OR_ZERO(d_alias); + + if (status == 0) + dput(d_alias); nfs3_free_createdata(data); out: @@ -536,12 +578,13 @@ out: return status; } -static int +static struct dentry * nfs3_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr) { struct posix_acl *default_acl, *acl; struct nfs3_createdata *data; - int status = -ENOMEM; + struct dentry *ret = ERR_PTR(-ENOMEM); + int status; dprintk("NFS call mkdir %pd\n", dentry); @@ -549,8 +592,9 @@ nfs3_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr) if (data == NULL) goto out; - status = posix_acl_create(dir, &sattr->ia_mode, &default_acl, &acl); - if (status) + ret = ERR_PTR(posix_acl_create(dir, &sattr->ia_mode, + &default_acl, &acl)); + if (IS_ERR(ret)) goto out; data->msg.rpc_proc = &nfs3_procedures[NFS3PROC_MKDIR]; @@ -559,19 +603,27 @@ nfs3_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr) data->arg.mkdir.len = dentry->d_name.len; data->arg.mkdir.sattr = sattr; - status = nfs3_do_create(dir, dentry, data); - if (status != 0) + ret = nfs3_do_create(dir, dentry, data); + + if (IS_ERR(ret)) goto out_release_acls; + if (ret) + dentry = ret; + status = nfs3_proc_setacls(d_inode(dentry), acl, default_acl); + if (status) { + dput(ret); + ret = ERR_PTR(status); + } out_release_acls: posix_acl_release(acl); posix_acl_release(default_acl); out: nfs3_free_createdata(data); - dprintk("NFS reply mkdir: %d\n", status); - return status; + dprintk("NFS reply mkdir: %d\n", PTR_ERR_OR_ZERO(ret)); + return ret; } static int @@ -612,37 +664,36 @@ out: * Also note that this implementation handles both plain readdir and * readdirplus. */ -static int -nfs3_proc_readdir(struct dentry *dentry, struct rpc_cred *cred, - u64 cookie, struct page **pages, unsigned int count, bool plus) +static int nfs3_proc_readdir(struct nfs_readdir_arg *nr_arg, + struct nfs_readdir_res *nr_res) { - struct inode *dir = d_inode(dentry); - __be32 *verf = NFS_I(dir)->cookieverf; + struct inode *dir = d_inode(nr_arg->dentry); struct nfs3_readdirargs arg = { .fh = NFS_FH(dir), - .cookie = cookie, - .verf = {verf[0], verf[1]}, - .plus = plus, - .count = count, - .pages = pages + .cookie = nr_arg->cookie, + .plus = nr_arg->plus, + .count = nr_arg->page_len, + .pages = nr_arg->pages }; struct nfs3_readdirres res = { - .verf = verf, - .plus = plus + .verf = nr_res->verf, + .plus = nr_arg->plus, }; struct rpc_message msg = { .rpc_proc = &nfs3_procedures[NFS3PROC_READDIR], .rpc_argp = &arg, .rpc_resp = &res, - .rpc_cred = cred + .rpc_cred = nr_arg->cred, }; int status = -ENOMEM; - if (plus) + if (nr_arg->plus) msg.rpc_proc = &nfs3_procedures[NFS3PROC_READDIRPLUS]; + if (arg.cookie) + memcpy(arg.verf, nr_arg->verf, sizeof(arg.verf)); - dprintk("NFS call readdir%s %d\n", - plus? "plus" : "", (unsigned int) cookie); + dprintk("NFS call readdir%s %llu\n", nr_arg->plus ? "plus" : "", + (unsigned long long)nr_arg->cookie); res.dir_attr = nfs_alloc_fattr(); if (res.dir_attr == NULL) @@ -655,8 +706,8 @@ nfs3_proc_readdir(struct dentry *dentry, struct rpc_cred *cred, nfs_free_fattr(res.dir_attr); out: - dprintk("NFS reply readdir%s: %d\n", - plus? "plus" : "", status); + dprintk("NFS reply readdir%s: %d\n", nr_arg->plus ? "plus" : "", + status); return status; } @@ -666,6 +717,7 @@ nfs3_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr, { struct posix_acl *default_acl, *acl; struct nfs3_createdata *data; + struct dentry *d_alias; int status = -ENOMEM; dprintk("NFS call mknod %pd %u:%u\n", dentry, @@ -701,15 +753,20 @@ nfs3_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr, break; default: status = -EINVAL; - goto out; + goto out_release_acls; } - status = nfs3_do_create(dir, dentry, data); + d_alias = nfs3_do_create(dir, dentry, data); + status = PTR_ERR_OR_ZERO(d_alias); if (status != 0) goto out_release_acls; + if (d_alias) + dentry = d_alias; + status = nfs3_proc_setacls(d_inode(dentry), acl, default_acl); + dput(d_alias); out_release_acls: posix_acl_release(acl); posix_acl_release(default_acl); @@ -789,9 +846,45 @@ nfs3_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle, return status; } +#if IS_ENABLED(CONFIG_NFS_LOCALIO) + +static unsigned nfs3_localio_probe_throttle __read_mostly = 0; +module_param(nfs3_localio_probe_throttle, uint, 0644); +MODULE_PARM_DESC(nfs3_localio_probe_throttle, + "Probe for NFSv3 LOCALIO every N IO requests. Must be power-of-2, defaults to 0 (probing disabled)."); + +static void nfs3_localio_probe(struct nfs_server *server) +{ + struct nfs_client *clp = server->nfs_client; + + /* Throttled to reduce nfs_local_probe_async() frequency */ + if (!nfs3_localio_probe_throttle || nfs_server_is_local(clp)) + return; + + /* + * Try (re)enabling LOCALIO if isn't enabled -- admin deems + * it worthwhile to periodically check if LOCALIO possible by + * setting the 'nfs3_localio_probe_throttle' module parameter. + * + * This is useful if LOCALIO was previously enabled, but was + * disabled due to server restart, and IO has successfully + * completed in terms of normal RPC. + */ + if ((clp->cl_uuid.nfs3_localio_probe_count++ & + (nfs3_localio_probe_throttle - 1)) == 0) { + if (!nfs_server_is_local(clp)) + nfs_local_probe_async(clp); + } +} + +#else +static void nfs3_localio_probe(struct nfs_server *server) {} +#endif + static int nfs3_read_done(struct rpc_task *task, struct nfs_pgio_header *hdr) { struct inode *inode = hdr->inode; + struct nfs_server *server = NFS_SERVER(inode); if (hdr->pgio_done_cb != NULL) return hdr->pgio_done_cb(task, hdr); @@ -799,6 +892,12 @@ static int nfs3_read_done(struct rpc_task *task, struct nfs_pgio_header *hdr) if (nfs3_async_handle_jukebox(task, inode)) return -EAGAIN; + if (task->tk_status >= 0) { + if (!server->read_hdrsize) + cmpxchg(&server->read_hdrsize, 0, hdr->res.replen); + nfs3_localio_probe(server); + } + nfs_invalidate_atime(inode); nfs_refresh_inode(inode, &hdr->fattr); return 0; @@ -808,6 +907,7 @@ static void nfs3_proc_read_setup(struct nfs_pgio_header *hdr, struct rpc_message *msg) { msg->rpc_proc = &nfs3_procedures[NFS3PROC_READ]; + hdr->args.replen = NFS_SERVER(hdr->inode)->read_hdrsize; } static int nfs3_proc_pgio_rpc_prepare(struct rpc_task *task, @@ -826,13 +926,16 @@ static int nfs3_write_done(struct rpc_task *task, struct nfs_pgio_header *hdr) if (nfs3_async_handle_jukebox(task, inode)) return -EAGAIN; - if (task->tk_status >= 0) + if (task->tk_status >= 0) { nfs_writeback_update_inode(hdr); + nfs3_localio_probe(NFS_SERVER(inode)); + } return 0; } static void nfs3_proc_write_setup(struct nfs_pgio_header *hdr, - struct rpc_message *msg) + struct rpc_message *msg, + struct rpc_clnt **clnt) { msg->rpc_proc = &nfs3_procedures[NFS3PROC_WRITE]; } @@ -853,7 +956,8 @@ static int nfs3_commit_done(struct rpc_task *task, struct nfs_commit_data *data) return 0; } -static void nfs3_proc_commit_setup(struct nfs_commit_data *data, struct rpc_message *msg) +static void nfs3_proc_commit_setup(struct nfs_commit_data *data, struct rpc_message *msg, + struct rpc_clnt **clnt) { msg->rpc_proc = &nfs3_procedures[NFS3PROC_COMMIT]; } @@ -887,7 +991,7 @@ static void nfs3_nlm_release_call(void *data) } } -const struct nlmclnt_operations nlmclnt_fl_close_lock_ops = { +static const struct nlmclnt_operations nlmclnt_fl_close_lock_ops = { .nlmclnt_alloc_call = nfs3_nlm_alloc_call, .nlmclnt_unlock_prepare = nfs3_nlm_unlock_prepare, .nlmclnt_release_call = nfs3_nlm_release_call, @@ -901,7 +1005,7 @@ nfs3_proc_lock(struct file *filp, int cmd, struct file_lock *fl) struct nfs_open_context *ctx = nfs_file_open_context(filp); int status; - if (fl->fl_flags & FL_CLOSE) { + if (fl->c.flc_flags & FL_CLOSE) { l_ctx = nfs_get_lock_context(ctx); if (IS_ERR(l_ctx)) l_ctx = NULL; @@ -917,19 +1021,21 @@ nfs3_proc_lock(struct file *filp, int cmd, struct file_lock *fl) return status; } -static int nfs3_have_delegation(struct inode *inode, fmode_t flags) +static int nfs3_have_delegation(struct inode *inode, fmode_t type, int flags) { return 0; } static int nfs3_return_delegation(struct inode *inode) { - nfs_wb_all(inode); + if (S_ISREG(inode->i_mode)) + nfs_wb_all(inode); return 0; } static const struct inode_operations nfs3_dir_inode_operations = { .create = nfs_create, + .atomic_open = nfs_atomic_open_v23, .lookup = nfs_lookup, .link = nfs_link, .unlink = nfs_unlink, @@ -943,7 +1049,7 @@ static const struct inode_operations nfs3_dir_inode_operations = { .setattr = nfs_setattr, #ifdef CONFIG_NFS_V3_ACL .listxattr = nfs3_listxattr, - .get_acl = nfs3_get_acl, + .get_inode_acl = nfs3_get_acl, .set_acl = nfs3_set_acl, #endif }; @@ -954,7 +1060,7 @@ static const struct inode_operations nfs3_file_inode_operations = { .setattr = nfs_setattr, #ifdef CONFIG_NFS_V3_ACL .listxattr = nfs3_listxattr, - .get_acl = nfs3_get_acl, + .get_inode_acl = nfs3_get_acl, .set_acl = nfs3_set_acl, #endif }; @@ -968,10 +1074,11 @@ const struct nfs_rpc_ops nfs_v3_clientops = { .nlmclnt_ops = &nlmclnt_fl_close_lock_ops, .getroot = nfs3_proc_get_root, .submount = nfs_submount, - .try_mount = nfs_try_mount, + .try_get_tree = nfs_try_get_tree, .getattr = nfs3_proc_getattr, .setattr = nfs3_proc_setattr, .lookup = nfs3_proc_lookup, + .lookupp = nfs3_proc_lookupp, .access = nfs3_proc_access, .readlink = nfs3_proc_readlink, .create = nfs3_proc_create, diff --git a/fs/nfs/nfs3super.c b/fs/nfs/nfs3super.c index 5c4394e4656b..20a80478449e 100644 --- a/fs/nfs/nfs3super.c +++ b/fs/nfs/nfs3super.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2012 Netapp, Inc. All rights reserved. */ @@ -13,9 +14,6 @@ struct nfs_subversion nfs_v3 = { .rpc_vers = &nfs_version3, .rpc_ops = &nfs_v3_clientops, .sops = &nfs_sops, -#ifdef CONFIG_NFS_V3_ACL - .xattr = nfs3_xattr_handlers, -#endif }; static int __init init_nfs_v3(void) @@ -29,6 +27,7 @@ static void __exit exit_nfs_v3(void) unregister_nfs_version(&nfs_v3); } +MODULE_DESCRIPTION("NFSv3 client support"); MODULE_LICENSE("GPL"); module_init(init_nfs_v3); diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c index e82c9e553224..e17d72908412 100644 --- a/fs/nfs/nfs3xdr.c +++ b/fs/nfs/nfs3xdr.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/nfs/nfs3xdr.c * @@ -20,19 +21,21 @@ #include <linux/nfs3.h> #include <linux/nfs_fs.h> #include <linux/nfsacl.h> +#include <linux/nfs_common.h> + #include "internal.h" +#include "nfstrace.h" #define NFSDBG_FACILITY NFSDBG_XDR -/* Mapping from NFS error code to "errno" error code. */ -#define errno_NFSERR_IO EIO - /* * Declare the space requirements for NFS arguments and replies as * number of 32bit-words */ +#define NFS3_pagepad_sz (1) /* Page padding */ #define NFS3_fhandle_sz (1+16) #define NFS3_fh_sz (NFS3_fhandle_sz) /* shorthand */ +#define NFS3_post_op_fh_sz (1+NFS3_fh_sz) #define NFS3_sattr_sz (15) #define NFS3_filename_sz (1+(NFS3_MAXNAMLEN>>2)) #define NFS3_path_sz (1+(NFS3_MAXPATHLEN>>2)) @@ -67,13 +70,13 @@ #define NFS3_removeres_sz (NFS3_setattrres_sz) #define NFS3_lookupres_sz (1+NFS3_fh_sz+(2 * NFS3_post_op_attr_sz)) #define NFS3_accessres_sz (1+NFS3_post_op_attr_sz+1) -#define NFS3_readlinkres_sz (1+NFS3_post_op_attr_sz+1) -#define NFS3_readres_sz (1+NFS3_post_op_attr_sz+3) +#define NFS3_readlinkres_sz (1+NFS3_post_op_attr_sz+1+NFS3_pagepad_sz) +#define NFS3_readres_sz (1+NFS3_post_op_attr_sz+3+NFS3_pagepad_sz) #define NFS3_writeres_sz (1+NFS3_wcc_data_sz+4) -#define NFS3_createres_sz (1+NFS3_fh_sz+NFS3_post_op_attr_sz+NFS3_wcc_data_sz) +#define NFS3_createres_sz (1+NFS3_post_op_fh_sz+NFS3_post_op_attr_sz+NFS3_wcc_data_sz) #define NFS3_renameres_sz (1+(2 * NFS3_wcc_data_sz)) #define NFS3_linkres_sz (1+NFS3_post_op_attr_sz+NFS3_wcc_data_sz) -#define NFS3_readdirres_sz (1+NFS3_post_op_attr_sz+2) +#define NFS3_readdirres_sz (1+NFS3_post_op_attr_sz+2+NFS3_pagepad_sz) #define NFS3_fsstatres_sz (1+NFS3_post_op_attr_sz+13) #define NFS3_fsinfores_sz (1+NFS3_post_op_attr_sz+12) #define NFS3_pathconfres_sz (1+NFS3_post_op_attr_sz+6) @@ -83,11 +86,10 @@ #define ACL3_setaclargs_sz (NFS3_fh_sz+1+ \ XDR_QUADLEN(NFS_ACL_INLINE_BUFSIZE)) #define ACL3_getaclres_sz (1+NFS3_post_op_attr_sz+1+ \ - XDR_QUADLEN(NFS_ACL_INLINE_BUFSIZE)) + XDR_QUADLEN(NFS_ACL_INLINE_BUFSIZE)+\ + NFS3_pagepad_sz) #define ACL3_setaclres_sz (1+NFS3_post_op_attr_sz) -static int nfs3_stat_to_errno(enum nfs_stat); - /* * Map file type to S_IFMT bits */ @@ -102,32 +104,20 @@ static const umode_t nfs_type2fmt[] = { [NF3FIFO] = S_IFIFO, }; -/* - * While encoding arguments, set up the reply buffer in advance to - * receive reply data directly into the page cache. - */ -static void prepare_reply_buffer(struct rpc_rqst *req, struct page **pages, - unsigned int base, unsigned int len, - unsigned int bufsize) +static struct user_namespace *rpc_userns(const struct rpc_clnt *clnt) { - struct rpc_auth *auth = req->rq_cred->cr_auth; - unsigned int replen; - - replen = RPC_REPHDRSIZE + auth->au_rslack + bufsize; - xdr_inline_pages(&req->rq_rcv_buf, replen << 2, pages, base, len); + if (clnt && clnt->cl_cred) + return clnt->cl_cred->user_ns; + return &init_user_ns; } -/* - * Handle decode buffer overflows out-of-line. - */ -static void print_overflow_msg(const char *func, const struct xdr_stream *xdr) +static struct user_namespace *rpc_rqst_userns(const struct rpc_rqst *rqstp) { - dprintk("NFS: %s prematurely hit the end of our receive buffer. " - "Remaining buffer length is %tu words.\n", - func, xdr->end - xdr->p); + if (rqstp->rq_task) + return rpc_userns(rqstp->rq_task->tk_client); + return &init_user_ns; } - /* * Encode/decode NFSv3 basic data types * @@ -150,13 +140,10 @@ static int decode_uint32(struct xdr_stream *xdr, u32 *value) __be32 *p; p = xdr_inline_decode(xdr, 4); - if (unlikely(p == NULL)) - goto out_overflow; + if (unlikely(!p)) + return -EIO; *value = be32_to_cpup(p); return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_uint64(struct xdr_stream *xdr, u64 *value) @@ -164,13 +151,10 @@ static int decode_uint64(struct xdr_stream *xdr, u64 *value) __be32 *p; p = xdr_inline_decode(xdr, 8); - if (unlikely(p == NULL)) - goto out_overflow; + if (unlikely(!p)) + return -EIO; xdr_decode_hyper(p, value); return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } /* @@ -210,14 +194,14 @@ static int decode_inline_filename3(struct xdr_stream *xdr, u32 count; p = xdr_inline_decode(xdr, 4); - if (unlikely(p == NULL)) - goto out_overflow; + if (unlikely(!p)) + return -EIO; count = be32_to_cpup(p); if (count > NFS3_MAXNAMLEN) goto out_nametoolong; p = xdr_inline_decode(xdr, count); - if (unlikely(p == NULL)) - goto out_overflow; + if (unlikely(!p)) + return -EIO; *name = (const char *)p; *length = count; return 0; @@ -225,9 +209,6 @@ static int decode_inline_filename3(struct xdr_stream *xdr, out_nametoolong: dprintk("NFS: returned filename too long: %u\n", count); return -ENAMETOOLONG; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } /* @@ -248,8 +229,8 @@ static int decode_nfspath3(struct xdr_stream *xdr) __be32 *p; p = xdr_inline_decode(xdr, 4); - if (unlikely(p == NULL)) - goto out_overflow; + if (unlikely(!p)) + return -EIO; count = be32_to_cpup(p); if (unlikely(count >= xdr->buf->page_len || count > NFS3_MAXPATHLEN)) goto out_nametoolong; @@ -266,9 +247,6 @@ out_cheating: dprintk("NFS: server cheating in pathname result: " "count %u > recvd %u\n", count, recvd); return -EIO; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } /* @@ -302,13 +280,10 @@ static int decode_cookieverf3(struct xdr_stream *xdr, __be32 *verifier) __be32 *p; p = xdr_inline_decode(xdr, NFS3_COOKIEVERFSIZE); - if (unlikely(p == NULL)) - goto out_overflow; + if (unlikely(!p)) + return -EIO; memcpy(verifier, p, NFS3_COOKIEVERFSIZE); return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } /* @@ -329,13 +304,10 @@ static int decode_writeverf3(struct xdr_stream *xdr, struct nfs_write_verifier * __be32 *p; p = xdr_inline_decode(xdr, NFS3_WRITEVERFSIZE); - if (unlikely(p == NULL)) - goto out_overflow; + if (unlikely(!p)) + return -EIO; memcpy(verifier->data, p, NFS3_WRITEVERFSIZE); return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } /* @@ -363,13 +335,16 @@ static int decode_nfsstat3(struct xdr_stream *xdr, enum nfs_stat *status) __be32 *p; p = xdr_inline_decode(xdr, 4); - if (unlikely(p == NULL)) - goto out_overflow; + if (unlikely(!p)) + return -EIO; + if (unlikely(*p != cpu_to_be32(NFS3_OK))) + goto out_status; + *status = 0; + return 0; +out_status: *status = be32_to_cpup(p); + trace_nfs_xdr_status(xdr, (int)*status); return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } /* @@ -452,23 +427,20 @@ static int decode_nfs_fh3(struct xdr_stream *xdr, struct nfs_fh *fh) __be32 *p; p = xdr_inline_decode(xdr, 4); - if (unlikely(p == NULL)) - goto out_overflow; + if (unlikely(!p)) + return -EIO; length = be32_to_cpup(p++); - if (unlikely(length > NFS3_FHSIZE)) + if (unlikely(length > NFS3_FHSIZE || length == 0)) goto out_toobig; p = xdr_inline_decode(xdr, length); - if (unlikely(p == NULL)) - goto out_overflow; + if (unlikely(!p)) + return -EIO; fh->size = length; memcpy(fh->data, p, length); return 0; out_toobig: - dprintk("NFS: file handle size (%u) too big\n", length); + trace_nfs_xdr_bad_filehandle(xdr, NFSERR_BADHANDLE); return -E2BIG; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static void zero_nfs_fh3(struct nfs_fh *fh) @@ -484,14 +456,14 @@ static void zero_nfs_fh3(struct nfs_fh *fh) * uint32 nseconds; * }; */ -static __be32 *xdr_encode_nfstime3(__be32 *p, const struct timespec *timep) +static __be32 *xdr_encode_nfstime3(__be32 *p, const struct timespec64 *timep) { - *p++ = cpu_to_be32(timep->tv_sec); + *p++ = cpu_to_be32((u32)timep->tv_sec); *p++ = cpu_to_be32(timep->tv_nsec); return p; } -static __be32 *xdr_decode_nfstime3(__be32 *p, struct timespec *timep) +static __be32 *xdr_decode_nfstime3(__be32 *p, struct timespec64 *timep) { timep->tv_sec = be32_to_cpup(p++); timep->tv_nsec = be32_to_cpup(p++); @@ -558,7 +530,8 @@ static __be32 *xdr_decode_nfstime3(__be32 *p, struct timespec *timep) * set_mtime mtime; * }; */ -static void encode_sattr3(struct xdr_stream *xdr, const struct iattr *attr) +static void encode_sattr3(struct xdr_stream *xdr, const struct iattr *attr, + struct user_namespace *userns) { u32 nbytes; __be32 *p; @@ -592,13 +565,13 @@ static void encode_sattr3(struct xdr_stream *xdr, const struct iattr *attr) if (attr->ia_valid & ATTR_UID) { *p++ = xdr_one; - *p++ = cpu_to_be32(from_kuid(&init_user_ns, attr->ia_uid)); + *p++ = cpu_to_be32(from_kuid_munged(userns, attr->ia_uid)); } else *p++ = xdr_zero; if (attr->ia_valid & ATTR_GID) { *p++ = xdr_one; - *p++ = cpu_to_be32(from_kgid(&init_user_ns, attr->ia_gid)); + *p++ = cpu_to_be32(from_kgid_munged(userns, attr->ia_gid)); } else *p++ = xdr_zero; @@ -644,23 +617,24 @@ static void encode_sattr3(struct xdr_stream *xdr, const struct iattr *attr) * nfstime3 ctime; * }; */ -static int decode_fattr3(struct xdr_stream *xdr, struct nfs_fattr *fattr) +static int decode_fattr3(struct xdr_stream *xdr, struct nfs_fattr *fattr, + struct user_namespace *userns) { umode_t fmode; __be32 *p; p = xdr_inline_decode(xdr, NFS3_fattr_sz << 2); - if (unlikely(p == NULL)) - goto out_overflow; + if (unlikely(!p)) + return -EIO; p = xdr_decode_ftype3(p, &fmode); fattr->mode = (be32_to_cpup(p++) & ~S_IFMT) | fmode; fattr->nlink = be32_to_cpup(p++); - fattr->uid = make_kuid(&init_user_ns, be32_to_cpup(p++)); + fattr->uid = make_kuid(userns, be32_to_cpup(p++)); if (!uid_valid(fattr->uid)) goto out_uid; - fattr->gid = make_kgid(&init_user_ns, be32_to_cpup(p++)); + fattr->gid = make_kgid(userns, be32_to_cpup(p++)); if (!gid_valid(fattr->gid)) goto out_gid; @@ -685,9 +659,6 @@ out_uid: out_gid: dprintk("NFS: returned invalid gid\n"); return -EINVAL; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } /* @@ -700,19 +671,17 @@ out_overflow: * void; * }; */ -static int decode_post_op_attr(struct xdr_stream *xdr, struct nfs_fattr *fattr) +static int decode_post_op_attr(struct xdr_stream *xdr, struct nfs_fattr *fattr, + struct user_namespace *userns) { __be32 *p; p = xdr_inline_decode(xdr, 4); - if (unlikely(p == NULL)) - goto out_overflow; + if (unlikely(!p)) + return -EIO; if (*p != xdr_zero) - return decode_fattr3(xdr, fattr); + return decode_fattr3(xdr, fattr, userns); return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } /* @@ -728,8 +697,8 @@ static int decode_wcc_attr(struct xdr_stream *xdr, struct nfs_fattr *fattr) __be32 *p; p = xdr_inline_decode(xdr, NFS3_wcc_attr_sz << 2); - if (unlikely(p == NULL)) - goto out_overflow; + if (unlikely(!p)) + return -EIO; fattr->valid |= NFS_ATTR_FATTR_PRESIZE | NFS_ATTR_FATTR_PRECHANGE @@ -742,9 +711,6 @@ static int decode_wcc_attr(struct xdr_stream *xdr, struct nfs_fattr *fattr) fattr->pre_change_attr = nfs_timespec_to_change_attr(&fattr->pre_ctime); return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } /* @@ -768,24 +734,22 @@ static int decode_pre_op_attr(struct xdr_stream *xdr, struct nfs_fattr *fattr) __be32 *p; p = xdr_inline_decode(xdr, 4); - if (unlikely(p == NULL)) - goto out_overflow; + if (unlikely(!p)) + return -EIO; if (*p != xdr_zero) return decode_wcc_attr(xdr, fattr); return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } -static int decode_wcc_data(struct xdr_stream *xdr, struct nfs_fattr *fattr) +static int decode_wcc_data(struct xdr_stream *xdr, struct nfs_fattr *fattr, + struct user_namespace *userns) { int error; error = decode_pre_op_attr(xdr, fattr); if (unlikely(error)) goto out; - error = decode_post_op_attr(xdr, fattr); + error = decode_post_op_attr(xdr, fattr, userns); out: return error; } @@ -803,15 +767,12 @@ out: static int decode_post_op_fh3(struct xdr_stream *xdr, struct nfs_fh *fh) { __be32 *p = xdr_inline_decode(xdr, 4); - if (unlikely(p == NULL)) - goto out_overflow; + if (unlikely(!p)) + return -EIO; if (*p != xdr_zero) return decode_nfs_fh3(xdr, fh); zero_nfs_fh3(fh); return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } /* @@ -890,7 +851,7 @@ static void nfs3_xdr_enc_setattr3args(struct rpc_rqst *req, { const struct nfs3_sattrargs *args = data; encode_nfs_fh3(xdr, args->fh); - encode_sattr3(xdr, args->sattr); + encode_sattr3(xdr, args->sattr, rpc_rqst_userns(req)); encode_sattrguard3(xdr, args); } @@ -948,8 +909,8 @@ static void nfs3_xdr_enc_readlink3args(struct rpc_rqst *req, const struct nfs3_readlinkargs *args = data; encode_nfs_fh3(xdr, args->fh); - prepare_reply_buffer(req, args->pages, args->pgbase, - args->pglen, NFS3_readlinkres_sz); + rpc_prepare_reply_pages(req, args->pages, args->pgbase, args->pglen, + NFS3_readlinkres_sz - NFS3_pagepad_sz); } /* @@ -978,10 +939,12 @@ static void nfs3_xdr_enc_read3args(struct rpc_rqst *req, const void *data) { const struct nfs_pgio_args *args = data; + unsigned int replen = args->replen ? args->replen : + NFS3_readres_sz - NFS3_pagepad_sz; encode_read3args(xdr, args); - prepare_reply_buffer(req, args->pages, args->pgbase, - args->count, NFS3_readres_sz); + rpc_prepare_reply_pages(req, args->pages, args->pgbase, + args->count, replen); req->rq_rcv_buf.flags |= XDRBUF_READ; } @@ -1050,13 +1013,14 @@ static void nfs3_xdr_enc_write3args(struct rpc_rqst *req, * }; */ static void encode_createhow3(struct xdr_stream *xdr, - const struct nfs3_createargs *args) + const struct nfs3_createargs *args, + struct user_namespace *userns) { encode_uint32(xdr, args->createmode); switch (args->createmode) { case NFS3_CREATE_UNCHECKED: case NFS3_CREATE_GUARDED: - encode_sattr3(xdr, args->sattr); + encode_sattr3(xdr, args->sattr, userns); break; case NFS3_CREATE_EXCLUSIVE: encode_createverf3(xdr, args->verifier); @@ -1073,7 +1037,7 @@ static void nfs3_xdr_enc_create3args(struct rpc_rqst *req, const struct nfs3_createargs *args = data; encode_diropargs3(xdr, args->fh, args->name, args->len); - encode_createhow3(xdr, args); + encode_createhow3(xdr, args, rpc_rqst_userns(req)); } /* @@ -1091,7 +1055,7 @@ static void nfs3_xdr_enc_mkdir3args(struct rpc_rqst *req, const struct nfs3_mkdirargs *args = data; encode_diropargs3(xdr, args->fh, args->name, args->len); - encode_sattr3(xdr, args->sattr); + encode_sattr3(xdr, args->sattr, rpc_rqst_userns(req)); } /* @@ -1108,11 +1072,12 @@ static void nfs3_xdr_enc_mkdir3args(struct rpc_rqst *req, * }; */ static void encode_symlinkdata3(struct xdr_stream *xdr, - const void *data) + const void *data, + struct user_namespace *userns) { const struct nfs3_symlinkargs *args = data; - encode_sattr3(xdr, args->sattr); + encode_sattr3(xdr, args->sattr, userns); encode_nfspath3(xdr, args->pages, args->pathlen); } @@ -1123,7 +1088,7 @@ static void nfs3_xdr_enc_symlink3args(struct rpc_rqst *req, const struct nfs3_symlinkargs *args = data; encode_diropargs3(xdr, args->fromfh, args->fromname, args->fromlen); - encode_symlinkdata3(xdr, args); + encode_symlinkdata3(xdr, args, rpc_rqst_userns(req)); xdr->buf->flags |= XDRBUF_WRITE; } @@ -1152,24 +1117,26 @@ static void nfs3_xdr_enc_symlink3args(struct rpc_rqst *req, * }; */ static void encode_devicedata3(struct xdr_stream *xdr, - const struct nfs3_mknodargs *args) + const struct nfs3_mknodargs *args, + struct user_namespace *userns) { - encode_sattr3(xdr, args->sattr); + encode_sattr3(xdr, args->sattr, userns); encode_specdata3(xdr, args->rdev); } static void encode_mknoddata3(struct xdr_stream *xdr, - const struct nfs3_mknodargs *args) + const struct nfs3_mknodargs *args, + struct user_namespace *userns) { encode_ftype3(xdr, args->type); switch (args->type) { case NF3CHR: case NF3BLK: - encode_devicedata3(xdr, args); + encode_devicedata3(xdr, args, userns); break; case NF3SOCK: case NF3FIFO: - encode_sattr3(xdr, args->sattr); + encode_sattr3(xdr, args->sattr, userns); break; case NF3REG: case NF3DIR: @@ -1186,7 +1153,7 @@ static void nfs3_xdr_enc_mknod3args(struct rpc_rqst *req, const struct nfs3_mknodargs *args = data; encode_diropargs3(xdr, args->fh, args->name, args->len); - encode_mknoddata3(xdr, args); + encode_mknoddata3(xdr, args, rpc_rqst_userns(req)); } /* @@ -1273,8 +1240,8 @@ static void nfs3_xdr_enc_readdir3args(struct rpc_rqst *req, const struct nfs3_readdirargs *args = data; encode_readdir3args(xdr, args); - prepare_reply_buffer(req, args->pages, 0, - args->count, NFS3_readdirres_sz); + rpc_prepare_reply_pages(req, args->pages, 0, args->count, + NFS3_readdirres_sz - NFS3_pagepad_sz); } /* @@ -1291,6 +1258,8 @@ static void nfs3_xdr_enc_readdir3args(struct rpc_rqst *req, static void encode_readdirplus3args(struct xdr_stream *xdr, const struct nfs3_readdirargs *args) { + uint32_t dircount = args->count; + uint32_t maxcount = args->count; __be32 *p; encode_nfs_fh3(xdr, args->fh); @@ -1303,9 +1272,8 @@ static void encode_readdirplus3args(struct xdr_stream *xdr, * readdirplus: need dircount + buffer size. * We just make sure we make dircount big enough */ - *p++ = cpu_to_be32(args->count >> 3); - - *p = cpu_to_be32(args->count); + *p++ = cpu_to_be32(dircount); + *p = cpu_to_be32(maxcount); } static void nfs3_xdr_enc_readdirplus3args(struct rpc_rqst *req, @@ -1315,8 +1283,8 @@ static void nfs3_xdr_enc_readdirplus3args(struct rpc_rqst *req, const struct nfs3_readdirargs *args = data; encode_readdirplus3args(xdr, args); - prepare_reply_buffer(req, args->pages, 0, - args->count, NFS3_readdirres_sz); + rpc_prepare_reply_pages(req, args->pages, 0, args->count, + NFS3_readdirres_sz - NFS3_pagepad_sz); } /* @@ -1359,10 +1327,12 @@ static void nfs3_xdr_enc_getacl3args(struct rpc_rqst *req, encode_nfs_fh3(xdr, args->fh); encode_uint32(xdr, args->mask); - if (args->mask & (NFS_ACL | NFS_DFACL)) - prepare_reply_buffer(req, args->pages, 0, + if (args->mask & (NFS_ACL | NFS_DFACL)) { + rpc_prepare_reply_pages(req, args->pages, 0, NFSACL_MAXPAGES << PAGE_SHIFT, - ACL3_getaclres_sz); + ACL3_getaclres_sz - NFS3_pagepad_sz); + req->rq_rcv_buf.flags |= XDRBUF_SPARSE_PAGES; + } } static void nfs3_xdr_enc_setacl3args(struct rpc_rqst *req, @@ -1429,11 +1399,11 @@ static int nfs3_xdr_dec_getattr3res(struct rpc_rqst *req, goto out; if (status != NFS3_OK) goto out_default; - error = decode_fattr3(xdr, result); + error = decode_fattr3(xdr, result, rpc_rqst_userns(req)); out: return error; out_default: - return nfs3_stat_to_errno(status); + return nfs_stat_to_errno(status); } /* @@ -1464,7 +1434,7 @@ static int nfs3_xdr_dec_setattr3res(struct rpc_rqst *req, error = decode_nfsstat3(xdr, &status); if (unlikely(error)) goto out; - error = decode_wcc_data(xdr, result); + error = decode_wcc_data(xdr, result, rpc_rqst_userns(req)); if (unlikely(error)) goto out; if (status != NFS3_OK) @@ -1472,7 +1442,7 @@ static int nfs3_xdr_dec_setattr3res(struct rpc_rqst *req, out: return error; out_status: - return nfs3_stat_to_errno(status); + return nfs_stat_to_errno(status); } /* @@ -1499,6 +1469,7 @@ static int nfs3_xdr_dec_lookup3res(struct rpc_rqst *req, struct xdr_stream *xdr, void *data) { + struct user_namespace *userns = rpc_rqst_userns(req); struct nfs3_diropres *result = data; enum nfs_stat status; int error; @@ -1511,17 +1482,17 @@ static int nfs3_xdr_dec_lookup3res(struct rpc_rqst *req, error = decode_nfs_fh3(xdr, result->fh); if (unlikely(error)) goto out; - error = decode_post_op_attr(xdr, result->fattr); + error = decode_post_op_attr(xdr, result->fattr, userns); if (unlikely(error)) goto out; - error = decode_post_op_attr(xdr, result->dir_attr); + error = decode_post_op_attr(xdr, result->dir_attr, userns); out: return error; out_default: - error = decode_post_op_attr(xdr, result->dir_attr); + error = decode_post_op_attr(xdr, result->dir_attr, userns); if (unlikely(error)) goto out; - return nfs3_stat_to_errno(status); + return nfs_stat_to_errno(status); } /* @@ -1554,7 +1525,7 @@ static int nfs3_xdr_dec_access3res(struct rpc_rqst *req, error = decode_nfsstat3(xdr, &status); if (unlikely(error)) goto out; - error = decode_post_op_attr(xdr, result->fattr); + error = decode_post_op_attr(xdr, result->fattr, rpc_rqst_userns(req)); if (unlikely(error)) goto out; if (status != NFS3_OK) @@ -1563,7 +1534,7 @@ static int nfs3_xdr_dec_access3res(struct rpc_rqst *req, out: return error; out_default: - return nfs3_stat_to_errno(status); + return nfs_stat_to_errno(status); } /* @@ -1595,7 +1566,7 @@ static int nfs3_xdr_dec_readlink3res(struct rpc_rqst *req, error = decode_nfsstat3(xdr, &status); if (unlikely(error)) goto out; - error = decode_post_op_attr(xdr, result); + error = decode_post_op_attr(xdr, result, rpc_rqst_userns(req)); if (unlikely(error)) goto out; if (status != NFS3_OK) @@ -1604,7 +1575,7 @@ static int nfs3_xdr_dec_readlink3res(struct rpc_rqst *req, out: return error; out_default: - return nfs3_stat_to_errno(status); + return nfs_stat_to_errno(status); } /* @@ -1635,8 +1606,8 @@ static int decode_read3resok(struct xdr_stream *xdr, __be32 *p; p = xdr_inline_decode(xdr, 4 + 4 + 4); - if (unlikely(p == NULL)) - goto out_overflow; + if (unlikely(!p)) + return -EIO; count = be32_to_cpup(p++); eof = be32_to_cpup(p++); ocount = be32_to_cpup(p++); @@ -1659,32 +1630,32 @@ out_cheating: count = recvd; eof = 0; goto out; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int nfs3_xdr_dec_read3res(struct rpc_rqst *req, struct xdr_stream *xdr, void *data) { struct nfs_pgio_res *result = data; + unsigned int pos; enum nfs_stat status; int error; + pos = xdr_stream_pos(xdr); error = decode_nfsstat3(xdr, &status); if (unlikely(error)) goto out; - error = decode_post_op_attr(xdr, result->fattr); + error = decode_post_op_attr(xdr, result->fattr, rpc_rqst_userns(req)); if (unlikely(error)) goto out; result->op_status = status; if (status != NFS3_OK) goto out_status; + result->replen = 3 + ((xdr_stream_pos(xdr) - pos) >> 2); error = decode_read3resok(xdr, result); out: return error; out_status: - return nfs3_stat_to_errno(status); + return nfs_stat_to_errno(status); } /* @@ -1720,22 +1691,18 @@ static int decode_write3resok(struct xdr_stream *xdr, __be32 *p; p = xdr_inline_decode(xdr, 4 + 4); - if (unlikely(p == NULL)) - goto out_overflow; + if (unlikely(!p)) + return -EIO; result->count = be32_to_cpup(p++); result->verf->committed = be32_to_cpup(p++); if (unlikely(result->verf->committed > NFS_FILE_SYNC)) goto out_badvalue; if (decode_writeverf3(xdr, &result->verf->verifier)) - goto out_eio; + return -EIO; return result->count; out_badvalue: dprintk("NFS: bad stable_how value: %u\n", result->verf->committed); return -EIO; -out_overflow: - print_overflow_msg(__func__, xdr); -out_eio: - return -EIO; } static int nfs3_xdr_dec_write3res(struct rpc_rqst *req, struct xdr_stream *xdr, @@ -1748,7 +1715,7 @@ static int nfs3_xdr_dec_write3res(struct rpc_rqst *req, struct xdr_stream *xdr, error = decode_nfsstat3(xdr, &status); if (unlikely(error)) goto out; - error = decode_wcc_data(xdr, result->fattr); + error = decode_wcc_data(xdr, result->fattr, rpc_rqst_userns(req)); if (unlikely(error)) goto out; result->op_status = status; @@ -1758,7 +1725,7 @@ static int nfs3_xdr_dec_write3res(struct rpc_rqst *req, struct xdr_stream *xdr, out: return error; out_status: - return nfs3_stat_to_errno(status); + return nfs_stat_to_errno(status); } /* @@ -1782,14 +1749,15 @@ out_status: * }; */ static int decode_create3resok(struct xdr_stream *xdr, - struct nfs3_diropres *result) + struct nfs3_diropres *result, + struct user_namespace *userns) { int error; error = decode_post_op_fh3(xdr, result->fh); if (unlikely(error)) goto out; - error = decode_post_op_attr(xdr, result->fattr); + error = decode_post_op_attr(xdr, result->fattr, userns); if (unlikely(error)) goto out; /* The server isn't required to return a file handle. @@ -1798,7 +1766,7 @@ static int decode_create3resok(struct xdr_stream *xdr, * values for the new object. */ if (result->fh->size == 0) result->fattr->valid = 0; - error = decode_wcc_data(xdr, result->dir_attr); + error = decode_wcc_data(xdr, result->dir_attr, userns); out: return error; } @@ -1807,6 +1775,7 @@ static int nfs3_xdr_dec_create3res(struct rpc_rqst *req, struct xdr_stream *xdr, void *data) { + struct user_namespace *userns = rpc_rqst_userns(req); struct nfs3_diropres *result = data; enum nfs_stat status; int error; @@ -1816,14 +1785,14 @@ static int nfs3_xdr_dec_create3res(struct rpc_rqst *req, goto out; if (status != NFS3_OK) goto out_default; - error = decode_create3resok(xdr, result); + error = decode_create3resok(xdr, result, userns); out: return error; out_default: - error = decode_wcc_data(xdr, result->dir_attr); + error = decode_wcc_data(xdr, result->dir_attr, userns); if (unlikely(error)) goto out; - return nfs3_stat_to_errno(status); + return nfs_stat_to_errno(status); } /* @@ -1855,7 +1824,7 @@ static int nfs3_xdr_dec_remove3res(struct rpc_rqst *req, error = decode_nfsstat3(xdr, &status); if (unlikely(error)) goto out; - error = decode_wcc_data(xdr, result->dir_attr); + error = decode_wcc_data(xdr, result->dir_attr, rpc_rqst_userns(req)); if (unlikely(error)) goto out; if (status != NFS3_OK) @@ -1863,7 +1832,7 @@ static int nfs3_xdr_dec_remove3res(struct rpc_rqst *req, out: return error; out_status: - return nfs3_stat_to_errno(status); + return nfs_stat_to_errno(status); } /* @@ -1890,6 +1859,7 @@ static int nfs3_xdr_dec_rename3res(struct rpc_rqst *req, struct xdr_stream *xdr, void *data) { + struct user_namespace *userns = rpc_rqst_userns(req); struct nfs_renameres *result = data; enum nfs_stat status; int error; @@ -1897,10 +1867,10 @@ static int nfs3_xdr_dec_rename3res(struct rpc_rqst *req, error = decode_nfsstat3(xdr, &status); if (unlikely(error)) goto out; - error = decode_wcc_data(xdr, result->old_fattr); + error = decode_wcc_data(xdr, result->old_fattr, userns); if (unlikely(error)) goto out; - error = decode_wcc_data(xdr, result->new_fattr); + error = decode_wcc_data(xdr, result->new_fattr, userns); if (unlikely(error)) goto out; if (status != NFS3_OK) @@ -1908,7 +1878,7 @@ static int nfs3_xdr_dec_rename3res(struct rpc_rqst *req, out: return error; out_status: - return nfs3_stat_to_errno(status); + return nfs_stat_to_errno(status); } /* @@ -1934,6 +1904,7 @@ out_status: static int nfs3_xdr_dec_link3res(struct rpc_rqst *req, struct xdr_stream *xdr, void *data) { + struct user_namespace *userns = rpc_rqst_userns(req); struct nfs3_linkres *result = data; enum nfs_stat status; int error; @@ -1941,10 +1912,10 @@ static int nfs3_xdr_dec_link3res(struct rpc_rqst *req, struct xdr_stream *xdr, error = decode_nfsstat3(xdr, &status); if (unlikely(error)) goto out; - error = decode_post_op_attr(xdr, result->fattr); + error = decode_post_op_attr(xdr, result->fattr, userns); if (unlikely(error)) goto out; - error = decode_wcc_data(xdr, result->dir_attr); + error = decode_wcc_data(xdr, result->dir_attr, userns); if (unlikely(error)) goto out; if (status != NFS3_OK) @@ -1952,7 +1923,7 @@ static int nfs3_xdr_dec_link3res(struct rpc_rqst *req, struct xdr_stream *xdr, out: return error; out_status: - return nfs3_stat_to_errno(status); + return nfs_stat_to_errno(status); } /** @@ -1993,17 +1964,18 @@ out_status: int nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, bool plus) { - struct nfs_entry old = *entry; + struct user_namespace *userns = rpc_userns(entry->server->client); __be32 *p; int error; + u64 new_cookie; p = xdr_inline_decode(xdr, 4); - if (unlikely(p == NULL)) - goto out_overflow; + if (unlikely(!p)) + return -EAGAIN; if (*p == xdr_zero) { p = xdr_inline_decode(xdr, 4); - if (unlikely(p == NULL)) - goto out_overflow; + if (unlikely(!p)) + return -EAGAIN; if (*p == xdr_zero) return -EAGAIN; entry->eof = 1; @@ -2012,24 +1984,23 @@ int nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, error = decode_fileid3(xdr, &entry->ino); if (unlikely(error)) - return error; + return -EAGAIN; error = decode_inline_filename3(xdr, &entry->name, &entry->len); if (unlikely(error)) - return error; + return error == -ENAMETOOLONG ? -ENAMETOOLONG : -EAGAIN; - entry->prev_cookie = entry->cookie; - error = decode_cookie3(xdr, &entry->cookie); + error = decode_cookie3(xdr, &new_cookie); if (unlikely(error)) - return error; + return -EAGAIN; entry->d_type = DT_UNKNOWN; if (plus) { entry->fattr->valid = 0; - error = decode_post_op_attr(xdr, entry->fattr); + error = decode_post_op_attr(xdr, entry->fattr, userns); if (unlikely(error)) - return error; + return -EAGAIN; if (entry->fattr->valid & NFS_ATTR_FATTR_V3) entry->d_type = nfs_umode_to_dtype(entry->fattr->mode); @@ -2040,28 +2011,19 @@ int nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, /* In fact, a post_op_fh3: */ p = xdr_inline_decode(xdr, 4); - if (unlikely(p == NULL)) - goto out_overflow; + if (unlikely(!p)) + return -EAGAIN; if (*p != xdr_zero) { error = decode_nfs_fh3(xdr, entry->fh); - if (unlikely(error)) { - if (error == -E2BIG) - goto out_truncated; - return error; - } + if (unlikely(error)) + return -EAGAIN; } else zero_nfs_fh3(entry->fh); } - return 0; + entry->cookie = new_cookie; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EAGAIN; -out_truncated: - dprintk("NFS: directory entry contains invalid file handle\n"); - *entry = old; - return -EAGAIN; + return 0; } /* @@ -2099,11 +2061,12 @@ static int decode_dirlist3(struct xdr_stream *xdr) } static int decode_readdir3resok(struct xdr_stream *xdr, - struct nfs3_readdirres *result) + struct nfs3_readdirres *result, + struct user_namespace *userns) { int error; - error = decode_post_op_attr(xdr, result->dir_attr); + error = decode_post_op_attr(xdr, result->dir_attr, userns); if (unlikely(error)) goto out; /* XXX: do we need to check if result->verf != NULL ? */ @@ -2128,14 +2091,14 @@ static int nfs3_xdr_dec_readdir3res(struct rpc_rqst *req, goto out; if (status != NFS3_OK) goto out_default; - error = decode_readdir3resok(xdr, result); + error = decode_readdir3resok(xdr, result, rpc_rqst_userns(req)); out: return error; out_default: - error = decode_post_op_attr(xdr, result->dir_attr); + error = decode_post_op_attr(xdr, result->dir_attr, rpc_rqst_userns(req)); if (unlikely(error)) goto out; - return nfs3_stat_to_errno(status); + return nfs_stat_to_errno(status); } /* @@ -2169,8 +2132,8 @@ static int decode_fsstat3resok(struct xdr_stream *xdr, __be32 *p; p = xdr_inline_decode(xdr, 8 * 6 + 4); - if (unlikely(p == NULL)) - goto out_overflow; + if (unlikely(!p)) + return -EIO; p = xdr_decode_size3(p, &result->tbytes); p = xdr_decode_size3(p, &result->fbytes); p = xdr_decode_size3(p, &result->abytes); @@ -2179,9 +2142,6 @@ static int decode_fsstat3resok(struct xdr_stream *xdr, xdr_decode_size3(p, &result->afiles); /* ignore invarsec */ return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int nfs3_xdr_dec_fsstat3res(struct rpc_rqst *req, @@ -2195,7 +2155,7 @@ static int nfs3_xdr_dec_fsstat3res(struct rpc_rqst *req, error = decode_nfsstat3(xdr, &status); if (unlikely(error)) goto out; - error = decode_post_op_attr(xdr, result->fattr); + error = decode_post_op_attr(xdr, result->fattr, rpc_rqst_userns(req)); if (unlikely(error)) goto out; if (status != NFS3_OK) @@ -2204,7 +2164,7 @@ static int nfs3_xdr_dec_fsstat3res(struct rpc_rqst *req, out: return error; out_status: - return nfs3_stat_to_errno(status); + return nfs_stat_to_errno(status); } /* @@ -2241,8 +2201,8 @@ static int decode_fsinfo3resok(struct xdr_stream *xdr, __be32 *p; p = xdr_inline_decode(xdr, 4 * 7 + 8 + 8 + 4); - if (unlikely(p == NULL)) - goto out_overflow; + if (unlikely(!p)) + return -EIO; result->rtmax = be32_to_cpup(p++); result->rtpref = be32_to_cpup(p++); result->rtmult = be32_to_cpup(p++); @@ -2255,10 +2215,9 @@ static int decode_fsinfo3resok(struct xdr_stream *xdr, /* ignore properties */ result->lease_time = 0; + result->change_attr_type = NFS4_CHANGE_TYPE_IS_UNDEFINED; + result->xattr_support = 0; return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int nfs3_xdr_dec_fsinfo3res(struct rpc_rqst *req, @@ -2272,7 +2231,7 @@ static int nfs3_xdr_dec_fsinfo3res(struct rpc_rqst *req, error = decode_nfsstat3(xdr, &status); if (unlikely(error)) goto out; - error = decode_post_op_attr(xdr, result->fattr); + error = decode_post_op_attr(xdr, result->fattr, rpc_rqst_userns(req)); if (unlikely(error)) goto out; if (status != NFS3_OK) @@ -2281,7 +2240,7 @@ static int nfs3_xdr_dec_fsinfo3res(struct rpc_rqst *req, out: return error; out_status: - return nfs3_stat_to_errno(status); + return nfs_stat_to_errno(status); } /* @@ -2314,15 +2273,12 @@ static int decode_pathconf3resok(struct xdr_stream *xdr, __be32 *p; p = xdr_inline_decode(xdr, 4 * 6); - if (unlikely(p == NULL)) - goto out_overflow; + if (unlikely(!p)) + return -EIO; result->max_link = be32_to_cpup(p++); result->max_namelen = be32_to_cpup(p); /* ignore remaining fields */ return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int nfs3_xdr_dec_pathconf3res(struct rpc_rqst *req, @@ -2336,7 +2292,7 @@ static int nfs3_xdr_dec_pathconf3res(struct rpc_rqst *req, error = decode_nfsstat3(xdr, &status); if (unlikely(error)) goto out; - error = decode_post_op_attr(xdr, result->fattr); + error = decode_post_op_attr(xdr, result->fattr, rpc_rqst_userns(req)); if (unlikely(error)) goto out; if (status != NFS3_OK) @@ -2345,7 +2301,7 @@ static int nfs3_xdr_dec_pathconf3res(struct rpc_rqst *req, out: return error; out_status: - return nfs3_stat_to_errno(status); + return nfs_stat_to_errno(status); } /* @@ -2372,36 +2328,40 @@ static int nfs3_xdr_dec_commit3res(struct rpc_rqst *req, void *data) { struct nfs_commitres *result = data; + struct nfs_writeverf *verf = result->verf; enum nfs_stat status; int error; error = decode_nfsstat3(xdr, &status); if (unlikely(error)) goto out; - error = decode_wcc_data(xdr, result->fattr); + error = decode_wcc_data(xdr, result->fattr, rpc_rqst_userns(req)); if (unlikely(error)) goto out; result->op_status = status; if (status != NFS3_OK) goto out_status; - error = decode_writeverf3(xdr, &result->verf->verifier); + error = decode_writeverf3(xdr, &verf->verifier); + if (!error) + verf->committed = NFS_FILE_SYNC; out: return error; out_status: - return nfs3_stat_to_errno(status); + return nfs_stat_to_errno(status); } #ifdef CONFIG_NFS_V3_ACL static inline int decode_getacl3resok(struct xdr_stream *xdr, - struct nfs3_getaclres *result) + struct nfs3_getaclres *result, + struct user_namespace *userns) { struct posix_acl **acl; unsigned int *aclcnt; size_t hdrlen; int error; - error = decode_post_op_attr(xdr, result->fattr); + error = decode_post_op_attr(xdr, result->fattr, userns); if (unlikely(error)) goto out; error = decode_uint32(xdr, &result->mask); @@ -2449,11 +2409,11 @@ static int nfs3_xdr_dec_getacl3res(struct rpc_rqst *req, goto out; if (status != NFS3_OK) goto out_default; - error = decode_getacl3resok(xdr, result); + error = decode_getacl3resok(xdr, result, rpc_rqst_userns(req)); out: return error; out_default: - return nfs3_stat_to_errno(status); + return nfs_stat_to_errno(status); } static int nfs3_xdr_dec_setacl3res(struct rpc_rqst *req, @@ -2468,80 +2428,15 @@ static int nfs3_xdr_dec_setacl3res(struct rpc_rqst *req, goto out; if (status != NFS3_OK) goto out_default; - error = decode_post_op_attr(xdr, result); + error = decode_post_op_attr(xdr, result, rpc_rqst_userns(req)); out: return error; out_default: - return nfs3_stat_to_errno(status); + return nfs_stat_to_errno(status); } #endif /* CONFIG_NFS_V3_ACL */ - -/* - * We need to translate between nfs status return values and - * the local errno values which may not be the same. - */ -static const struct { - int stat; - int errno; -} nfs_errtbl[] = { - { NFS_OK, 0 }, - { NFSERR_PERM, -EPERM }, - { NFSERR_NOENT, -ENOENT }, - { NFSERR_IO, -errno_NFSERR_IO}, - { NFSERR_NXIO, -ENXIO }, -/* { NFSERR_EAGAIN, -EAGAIN }, */ - { NFSERR_ACCES, -EACCES }, - { NFSERR_EXIST, -EEXIST }, - { NFSERR_XDEV, -EXDEV }, - { NFSERR_NODEV, -ENODEV }, - { NFSERR_NOTDIR, -ENOTDIR }, - { NFSERR_ISDIR, -EISDIR }, - { NFSERR_INVAL, -EINVAL }, - { NFSERR_FBIG, -EFBIG }, - { NFSERR_NOSPC, -ENOSPC }, - { NFSERR_ROFS, -EROFS }, - { NFSERR_MLINK, -EMLINK }, - { NFSERR_NAMETOOLONG, -ENAMETOOLONG }, - { NFSERR_NOTEMPTY, -ENOTEMPTY }, - { NFSERR_DQUOT, -EDQUOT }, - { NFSERR_STALE, -ESTALE }, - { NFSERR_REMOTE, -EREMOTE }, -#ifdef EWFLUSH - { NFSERR_WFLUSH, -EWFLUSH }, -#endif - { NFSERR_BADHANDLE, -EBADHANDLE }, - { NFSERR_NOT_SYNC, -ENOTSYNC }, - { NFSERR_BAD_COOKIE, -EBADCOOKIE }, - { NFSERR_NOTSUPP, -ENOTSUPP }, - { NFSERR_TOOSMALL, -ETOOSMALL }, - { NFSERR_SERVERFAULT, -EREMOTEIO }, - { NFSERR_BADTYPE, -EBADTYPE }, - { NFSERR_JUKEBOX, -EJUKEBOX }, - { -1, -EIO } -}; - -/** - * nfs3_stat_to_errno - convert an NFS status code to a local errno - * @status: NFS status code to convert - * - * Returns a local errno value, or -EIO if the NFS status code is - * not recognized. This function is used jointly by NFSv2 and NFSv3. - */ -static int nfs3_stat_to_errno(enum nfs_stat status) -{ - int i; - - for (i = 0; nfs_errtbl[i].stat != -1; i++) { - if (nfs_errtbl[i].stat == (int)status) - return nfs_errtbl[i].errno; - } - dprintk("NFS: Unrecognized nfs status value: %u\n", status); - return nfs_errtbl[i].errno; -} - - #define PROC(proc, argtype, restype, timer) \ [NFS3PROC_##proc] = { \ .p_proc = NFS3PROC_##proc, \ diff --git a/fs/nfs/nfs42.h b/fs/nfs/nfs42.h index b6cd15314bab..aafd15a4afce 100644 --- a/fs/nfs/nfs42.h +++ b/fs/nfs/nfs42.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (c) 2014 Anna Schumaker <Anna.Schumaker@Netapp.com> */ @@ -5,19 +6,64 @@ #ifndef __LINUX_FS_NFS_NFS4_2_H #define __LINUX_FS_NFS_NFS4_2_H +#include <linux/xattr.h> + /* * FIXME: four LAYOUTSTATS calls per compound at most! Do we need to support * more? Need to consider not to pre-alloc too much for a compound. */ #define PNFS_LAYOUTSTATS_MAXDEV (4) +#define READ_PLUS_SCRATCH_SIZE (16) /* nfs4.2proc.c */ +#ifdef CONFIG_NFS_V4_2 int nfs42_proc_allocate(struct file *, loff_t, loff_t); -ssize_t nfs42_proc_copy(struct file *, loff_t, struct file *, loff_t, size_t); +ssize_t nfs42_proc_copy(struct file *, loff_t, struct file *, loff_t, size_t, + struct nl4_server *, nfs4_stateid *, bool); int nfs42_proc_deallocate(struct file *, loff_t, loff_t); +int nfs42_proc_zero_range(struct file *, loff_t, loff_t); loff_t nfs42_proc_llseek(struct file *, loff_t, int); int nfs42_proc_layoutstats_generic(struct nfs_server *, struct nfs42_layoutstat_data *); int nfs42_proc_clone(struct file *, struct file *, loff_t, loff_t, loff_t); +int nfs42_proc_layouterror(struct pnfs_layout_segment *lseg, + const struct nfs42_layout_error *errors, + size_t n); +int nfs42_proc_copy_notify(struct file *, struct file *, + struct nfs42_copy_notify_res *); +static inline bool nfs42_files_from_same_server(struct file *in, + struct file *out) +{ + struct nfs_client *c_in = (NFS_SERVER(file_inode(in)))->nfs_client; + struct nfs_client *c_out = (NFS_SERVER(file_inode(out)))->nfs_client; + + return nfs4_check_serverowner_major_id(c_in->cl_serverowner, + c_out->cl_serverowner); +} + +ssize_t nfs42_proc_getxattr(struct inode *inode, const char *name, + void *buf, size_t buflen); +int nfs42_proc_setxattr(struct inode *inode, const char *name, + const void *buf, size_t buflen, int flags); +ssize_t nfs42_proc_listxattrs(struct inode *inode, void *buf, + size_t buflen, u64 *cookiep, bool *eofp); +int nfs42_proc_removexattr(struct inode *inode, const char *name); + +/* + * Maximum XDR buffer size needed for a listxattr buffer of buflen size. + * + * The upper boundary is a buffer with all 1-byte sized attribute names. + * They would be 7 bytes long in the eventual buffer ("user.x\0"), and + * 8 bytes long XDR-encoded. + * + * Include the trailing eof word as well and make the result a multiple + * of 4 bytes. + */ +static inline u32 nfs42_listxattr_xdrsize(u32 buflen) +{ + u32 size = 8 * buflen / (XATTR_USER_PREFIX_LEN + 2) + 4; + return (size + 3) & ~3; +} +#endif /* CONFIG_NFS_V4_2 */ #endif /* __LINUX_FS_NFS_NFS4_2_H */ diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index 6c2db51e67a7..d537fb0c230e 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -1,7 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2014 Anna Schumaker <Anna.Schumaker@Netapp.com> */ #include <linux/fs.h> +#include <linux/sunrpc/addr.h> #include <linux/sunrpc/sched.h> #include <linux/nfs.h> #include <linux/nfs3.h> @@ -14,19 +16,44 @@ #include "pnfs.h" #include "nfs4session.h" #include "internal.h" +#include "delegation.h" +#include "nfs4trace.h" #define NFSDBG_FACILITY NFSDBG_PROC +static int nfs42_do_offload_cancel_async(struct file *dst, nfs4_stateid *std); +static int nfs42_proc_offload_status(struct file *file, nfs4_stateid *stateid, + u64 *copied); + +static void nfs42_set_netaddr(struct file *filep, struct nfs42_netaddr *naddr) +{ + struct nfs_client *clp = (NFS_SERVER(file_inode(filep)))->nfs_client; + unsigned short port = 2049; + + rcu_read_lock(); + naddr->netid_len = scnprintf(naddr->netid, + sizeof(naddr->netid), "%s", + rpc_peeraddr2str(clp->cl_rpcclient, + RPC_DISPLAY_NETID)); + naddr->addr_len = scnprintf(naddr->addr, + sizeof(naddr->addr), + "%s.%u.%u", + rpc_peeraddr2str(clp->cl_rpcclient, + RPC_DISPLAY_ADDR), + port >> 8, port & 255); + rcu_read_unlock(); +} static int _nfs42_proc_fallocate(struct rpc_message *msg, struct file *filep, struct nfs_lock_context *lock, loff_t offset, loff_t len) { struct inode *inode = file_inode(filep); struct nfs_server *server = NFS_SERVER(inode); + u32 bitmask[NFS_BITMASK_SZ]; struct nfs42_falloc_args args = { .falloc_fh = NFS_FH(inode), .falloc_offset = offset, .falloc_length = len, - .falloc_bitmask = server->cache_consistency_bitmask, + .falloc_bitmask = bitmask, }; struct nfs42_falloc_res res = { .falloc_server = server, @@ -38,8 +65,14 @@ static int _nfs42_proc_fallocate(struct rpc_message *msg, struct file *filep, status = nfs4_set_rw_stateid(&args.falloc_stateid, lock->open_context, lock, FMODE_WRITE); - if (status) + if (status) { + if (status == -EAGAIN) + status = -NFS4ERR_BAD_STATEID; return status; + } + + nfs4_bitmask_set(bitmask, server->cache_consistency_bitmask, inode, + NFS_INO_INVALID_BLOCKS); res.falloc_fattr = nfs_alloc_fattr(); if (!res.falloc_fattr) @@ -47,9 +80,20 @@ static int _nfs42_proc_fallocate(struct rpc_message *msg, struct file *filep, status = nfs4_call_sync(server->client, server, msg, &args.seq_args, &res.seq_res, 0); - if (status == 0) - status = nfs_post_op_update_inode(inode, res.falloc_fattr); - + if (status == 0) { + if (nfs_should_remove_suid(inode)) { + spin_lock(&inode->i_lock); + nfs_set_cache_invalid(inode, + NFS_INO_REVAL_FORCED | NFS_INO_INVALID_MODE); + spin_unlock(&inode->i_lock); + } + status = nfs_post_op_update_inode_force_wcc(inode, + res.falloc_fattr); + } + if (msg->rpc_proc == &nfs4_procedures[NFSPROC4_CLNT_ALLOCATE]) + trace_nfs4_fallocate(inode, &args, status); + else + trace_nfs4_deallocate(inode, &args, status); kfree(res.falloc_fattr); return status; } @@ -57,7 +101,8 @@ static int _nfs42_proc_fallocate(struct rpc_message *msg, struct file *filep, static int nfs42_proc_fallocate(struct rpc_message *msg, struct file *filep, loff_t offset, loff_t len) { - struct nfs_server *server = NFS_SERVER(file_inode(filep)); + struct inode *inode = file_inode(filep); + struct nfs_server *server = NFS_SERVER(inode); struct nfs4_exception exception = { }; struct nfs_lock_context *lock; int err; @@ -66,9 +111,14 @@ static int nfs42_proc_fallocate(struct rpc_message *msg, struct file *filep, if (IS_ERR(lock)) return PTR_ERR(lock); - exception.inode = file_inode(filep); + exception.inode = inode; exception.state = lock->open_context->state; + nfs_file_block_o_direct(NFS_I(inode)); + err = nfs_sync_inode(inode); + if (err) + goto out; + do { err = _nfs42_proc_fallocate(msg, filep, lock, offset, len); if (err == -ENOTSUPP) { @@ -77,7 +127,7 @@ static int nfs42_proc_fallocate(struct rpc_message *msg, struct file *filep, } err = nfs4_handle_exception(server, err, &exception); } while (exception.retry); - +out: nfs_put_lock_context(lock); return err; } @@ -88,6 +138,7 @@ int nfs42_proc_allocate(struct file *filep, loff_t offset, loff_t len) .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_ALLOCATE], }; struct inode *inode = file_inode(filep); + loff_t oldsize = i_size_read(inode); int err; if (!nfs_server_capable(inode, NFS_CAP_ALLOCATE)) @@ -96,8 +147,13 @@ int nfs42_proc_allocate(struct file *filep, loff_t offset, loff_t len) inode_lock(inode); err = nfs42_proc_fallocate(&msg, filep, offset, len); - if (err == -EOPNOTSUPP) - NFS_SERVER(inode)->caps &= ~NFS_CAP_ALLOCATE; + + if (err == 0) + nfs_truncate_last_folio(inode->i_mapping, oldsize, + offset + len); + else if (err == -EOPNOTSUPP) + NFS_SERVER(inode)->caps &= ~(NFS_CAP_ALLOCATE | + NFS_CAP_ZERO_RANGE); inode_unlock(inode); return err; @@ -115,26 +171,239 @@ int nfs42_proc_deallocate(struct file *filep, loff_t offset, loff_t len) return -EOPNOTSUPP; inode_lock(inode); - err = nfs_sync_inode(inode); - if (err) - goto out_unlock; err = nfs42_proc_fallocate(&msg, filep, offset, len); if (err == 0) truncate_pagecache_range(inode, offset, (offset + len) -1); if (err == -EOPNOTSUPP) - NFS_SERVER(inode)->caps &= ~NFS_CAP_DEALLOCATE; -out_unlock: + NFS_SERVER(inode)->caps &= ~(NFS_CAP_DEALLOCATE | + NFS_CAP_ZERO_RANGE); + inode_unlock(inode); return err; } +int nfs42_proc_zero_range(struct file *filep, loff_t offset, loff_t len) +{ + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_ZERO_RANGE], + }; + struct inode *inode = file_inode(filep); + loff_t oldsize = i_size_read(inode); + int err; + + if (!nfs_server_capable(inode, NFS_CAP_ZERO_RANGE)) + return -EOPNOTSUPP; + + inode_lock(inode); + + err = nfs42_proc_fallocate(&msg, filep, offset, len); + if (err == 0) { + nfs_truncate_last_folio(inode->i_mapping, oldsize, + offset + len); + truncate_pagecache_range(inode, offset, (offset + len) -1); + } else if (err == -EOPNOTSUPP) + NFS_SERVER(inode)->caps &= ~NFS_CAP_ZERO_RANGE; + + inode_unlock(inode); + return err; +} + +static void nfs4_copy_dequeue_callback(struct nfs_server *dst_server, + struct nfs_server *src_server, + struct nfs4_copy_state *copy) +{ + spin_lock(&dst_server->nfs_client->cl_lock); + list_del_init(©->copies); + spin_unlock(&dst_server->nfs_client->cl_lock); + if (dst_server != src_server) { + spin_lock(&src_server->nfs_client->cl_lock); + list_del_init(©->src_copies); + spin_unlock(&src_server->nfs_client->cl_lock); + } +} + +static int handle_async_copy(struct nfs42_copy_res *res, + struct nfs_server *dst_server, + struct nfs_server *src_server, + struct file *src, + struct file *dst, + nfs4_stateid *src_stateid, + bool *restart) +{ + struct nfs4_copy_state *copy, *tmp_copy = NULL, *iter; + struct nfs_open_context *dst_ctx = nfs_file_open_context(dst); + struct nfs_open_context *src_ctx = nfs_file_open_context(src); + struct nfs_client *clp = dst_server->nfs_client; + unsigned long timeout = 3 * HZ; + int status = NFS4_OK; + u64 copied; + + copy = kzalloc(sizeof(struct nfs4_copy_state), GFP_KERNEL); + if (!copy) + return -ENOMEM; + + spin_lock(&dst_server->nfs_client->cl_lock); + list_for_each_entry(iter, + &dst_server->nfs_client->pending_cb_stateids, + copies) { + if (memcmp(&res->write_res.stateid, &iter->stateid, + NFS4_STATEID_SIZE)) + continue; + tmp_copy = iter; + list_del(&iter->copies); + break; + } + if (tmp_copy) { + spin_unlock(&dst_server->nfs_client->cl_lock); + kfree(copy); + copy = tmp_copy; + goto out; + } + + memcpy(©->stateid, &res->write_res.stateid, NFS4_STATEID_SIZE); + init_completion(©->completion); + copy->parent_dst_state = dst_ctx->state; + copy->parent_src_state = src_ctx->state; + + list_add_tail(©->copies, &dst_server->ss_copies); + spin_unlock(&dst_server->nfs_client->cl_lock); + + if (dst_server != src_server) { + spin_lock(&src_server->nfs_client->cl_lock); + list_add_tail(©->src_copies, &src_server->ss_src_copies); + spin_unlock(&src_server->nfs_client->cl_lock); + } + +wait: + status = wait_for_completion_interruptible_timeout(©->completion, + timeout); + if (!status) + goto timeout; + nfs4_copy_dequeue_callback(dst_server, src_server, copy); + if (status == -ERESTARTSYS) { + goto out_cancel; + } else if (copy->flags || copy->error == NFS4ERR_PARTNER_NO_AUTH) { + status = -EAGAIN; + *restart = true; + goto out_cancel; + } +out: + res->write_res.count = copy->count; + /* Copy out the updated write verifier provided by CB_OFFLOAD. */ + memcpy(&res->write_res.verifier, ©->verf, sizeof(copy->verf)); + status = -copy->error; + +out_free: + kfree(copy); + return status; +out_cancel: + nfs42_do_offload_cancel_async(dst, ©->stateid); + if (!nfs42_files_from_same_server(src, dst)) + nfs42_do_offload_cancel_async(src, src_stateid); + goto out_free; +timeout: + timeout <<= 1; + if (timeout > (clp->cl_lease_time >> 1)) + timeout = clp->cl_lease_time >> 1; + status = nfs42_proc_offload_status(dst, ©->stateid, &copied); + if (status == -EINPROGRESS) + goto wait; + nfs4_copy_dequeue_callback(dst_server, src_server, copy); + switch (status) { + case 0: + /* The server recognized the copy stateid, so it hasn't + * rebooted. Don't overwrite the verifier returned in the + * COPY result. */ + res->write_res.count = copied; + goto out_free; + case -EREMOTEIO: + /* COPY operation failed on the server. */ + status = -EOPNOTSUPP; + res->write_res.count = copied; + goto out_free; + case -EBADF: + /* Server did not recognize the copy stateid. It has + * probably restarted and lost the plot. */ + res->write_res.count = 0; + status = -EOPNOTSUPP; + break; + case -EOPNOTSUPP: + /* RFC 7862 REQUIREs server to support OFFLOAD_STATUS when + * it has signed up for an async COPY, so server is not + * spec-compliant. */ + res->write_res.count = 0; + } + goto out_free; +} + +static int process_copy_commit(struct file *dst, loff_t pos_dst, + struct nfs42_copy_res *res) +{ + struct nfs_commitres cres; + int status = -ENOMEM; + + cres.verf = kzalloc(sizeof(struct nfs_writeverf), GFP_KERNEL); + if (!cres.verf) + goto out; + + status = nfs4_proc_commit(dst, pos_dst, res->write_res.count, &cres); + if (status) + goto out_free; + if (nfs_write_verifier_cmp(&res->write_res.verifier.verifier, + &cres.verf->verifier)) { + dprintk("commit verf differs from copy verf\n"); + status = -EAGAIN; + } +out_free: + kfree(cres.verf); +out: + return status; +} + +/** + * nfs42_copy_dest_done - perform inode cache updates after clone/copy offload + * @file: pointer to destination file + * @pos: destination offset + * @len: copy length + * @oldsize: length of the file prior to clone/copy + * + * Punch a hole in the inode page cache, so that the NFS client will + * know to retrieve new data. + * Update the file size if necessary, and then mark the inode as having + * invalid cached values for change attribute, ctime, mtime and space used. + */ +static void nfs42_copy_dest_done(struct file *file, loff_t pos, loff_t len, + loff_t oldsize) +{ + struct inode *inode = file_inode(file); + struct address_space *mapping = file->f_mapping; + loff_t newsize = pos + len; + loff_t end = newsize - 1; + + nfs_truncate_last_folio(mapping, oldsize, pos); + WARN_ON_ONCE(invalidate_inode_pages2_range(mapping, pos >> PAGE_SHIFT, + end >> PAGE_SHIFT)); + + spin_lock(&inode->i_lock); + if (newsize > i_size_read(inode)) + i_size_write(inode, newsize); + nfs_set_cache_invalid(inode, NFS_INO_INVALID_CHANGE | + NFS_INO_INVALID_CTIME | + NFS_INO_INVALID_MTIME | + NFS_INO_INVALID_BLOCKS); + spin_unlock(&inode->i_lock); +} + static ssize_t _nfs42_proc_copy(struct file *src, struct nfs_lock_context *src_lock, struct file *dst, struct nfs_lock_context *dst_lock, struct nfs42_copy_args *args, - struct nfs42_copy_res *res) + struct nfs42_copy_res *res, + struct nl4_server *nss, + nfs4_stateid *cnr_stateid, + bool *restart) { struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_COPY], @@ -142,59 +411,99 @@ static ssize_t _nfs42_proc_copy(struct file *src, .rpc_resp = res, }; struct inode *dst_inode = file_inode(dst); - struct nfs_server *server = NFS_SERVER(dst_inode); + struct inode *src_inode = file_inode(src); + struct nfs_server *dst_server = NFS_SERVER(dst_inode); + struct nfs_server *src_server = NFS_SERVER(src_inode); loff_t pos_src = args->src_pos; loff_t pos_dst = args->dst_pos; + loff_t oldsize_dst = i_size_read(dst_inode); size_t count = args->count; ssize_t status; - status = nfs4_set_rw_stateid(&args->src_stateid, src_lock->open_context, - src_lock, FMODE_READ); - if (status) - return status; - - status = nfs_filemap_write_and_wait_range(file_inode(src)->i_mapping, + if (nss) { + args->cp_src = nss; + nfs4_stateid_copy(&args->src_stateid, cnr_stateid); + } else { + status = nfs4_set_rw_stateid(&args->src_stateid, + src_lock->open_context, src_lock, FMODE_READ); + if (status) { + if (status == -EAGAIN) + status = -NFS4ERR_BAD_STATEID; + return status; + } + } + status = nfs_filemap_write_and_wait_range(src->f_mapping, pos_src, pos_src + (loff_t)count - 1); if (status) return status; status = nfs4_set_rw_stateid(&args->dst_stateid, dst_lock->open_context, dst_lock, FMODE_WRITE); - if (status) + if (status) { + if (status == -EAGAIN) + status = -NFS4ERR_BAD_STATEID; return status; + } + nfs_file_block_o_direct(NFS_I(dst_inode)); status = nfs_sync_inode(dst_inode); if (status) return status; - res->commit_res.verf = kzalloc(sizeof(struct nfs_writeverf), GFP_NOFS); - if (!res->commit_res.verf) - return -ENOMEM; - status = nfs4_call_sync(server->client, server, &msg, + res->commit_res.verf = NULL; + if (args->sync) { + res->commit_res.verf = + kzalloc(sizeof(struct nfs_writeverf), GFP_KERNEL); + if (!res->commit_res.verf) + return -ENOMEM; + } + set_bit(NFS_CLNT_SRC_SSC_COPY_STATE, + &src_lock->open_context->state->flags); + set_bit(NFS_CLNT_DST_SSC_COPY_STATE, + &dst_lock->open_context->state->flags); + + status = nfs4_call_sync(dst_server->client, dst_server, &msg, &args->seq_args, &res->seq_res, 0); + trace_nfs4_copy(src_inode, dst_inode, args, res, nss, status); if (status == -ENOTSUPP) - server->caps &= ~NFS_CAP_COPY; + dst_server->caps &= ~NFS_CAP_COPY; if (status) goto out; - if (nfs_write_verifier_cmp(&res->write_res.verifier.verifier, + if (args->sync && + nfs_write_verifier_cmp(&res->write_res.verifier.verifier, &res->commit_res.verf->verifier)) { status = -EAGAIN; goto out; } - truncate_pagecache_range(dst_inode, pos_dst, - pos_dst + res->write_res.count); + if (!res->synchronous) { + status = handle_async_copy(res, dst_server, src_server, src, + dst, &args->src_stateid, restart); + if (status) + goto out; + } + + if ((!res->synchronous || !args->sync) && + res->write_res.verifier.committed != NFS_FILE_SYNC) { + status = process_copy_commit(dst, pos_dst, res); + if (status) + goto out; + } + nfs42_copy_dest_done(dst, pos_dst, res->write_res.count, oldsize_dst); + nfs_invalidate_atime(src_inode); status = res->write_res.count; out: - kfree(res->commit_res.verf); + if (args->sync) + kfree(res->commit_res.verf); return status; } ssize_t nfs42_proc_copy(struct file *src, loff_t pos_src, - struct file *dst, loff_t pos_dst, - size_t count) + struct file *dst, loff_t pos_dst, size_t count, + struct nl4_server *nss, + nfs4_stateid *cnr_stateid, bool sync) { struct nfs_server *server = NFS_SERVER(file_inode(dst)); struct nfs_lock_context *src_lock; @@ -205,6 +514,7 @@ ssize_t nfs42_proc_copy(struct file *src, loff_t pos_src, .dst_fh = NFS_FH(file_inode(dst)), .dst_pos = pos_dst, .count = count, + .sync = sync, }; struct nfs42_copy_res res; struct nfs4_exception src_exception = { @@ -216,9 +526,7 @@ ssize_t nfs42_proc_copy(struct file *src, loff_t pos_src, .stateid = &args.dst_stateid, }; ssize_t err, err2; - - if (!nfs_server_capable(file_inode(dst), NFS_CAP_COPY)) - return -EOPNOTSUPP; + bool restart = false; src_lock = nfs_get_lock_context(nfs_file_open_context(src)); if (IS_ERR(src_lock)) @@ -238,17 +546,35 @@ ssize_t nfs42_proc_copy(struct file *src, loff_t pos_src, inode_lock(file_inode(dst)); err = _nfs42_proc_copy(src, src_lock, dst, dst_lock, - &args, &res); + &args, &res, + nss, cnr_stateid, &restart); inode_unlock(file_inode(dst)); if (err >= 0) break; - if (err == -ENOTSUPP) { + if ((err == -ENOTSUPP || + err == -NFS4ERR_OFFLOAD_DENIED) && + nfs42_files_from_same_server(src, dst)) { err = -EOPNOTSUPP; break; - } if (err == -EAGAIN) { + } else if (err == -EAGAIN) { + if (!restart) { + dst_exception.retry = 1; + continue; + } + break; + } else if (err == -NFS4ERR_OFFLOAD_NO_REQS && + args.sync != res.synchronous) { + args.sync = res.synchronous; dst_exception.retry = 1; continue; + } else if ((err == -ESTALE || + err == -NFS4ERR_OFFLOAD_DENIED || + err == -ENOTSUPP) && + !nfs42_files_from_same_server(src, dst)) { + nfs42_do_offload_cancel_async(src, &args.src_stateid); + err = -EOPNOTSUPP; + break; } err2 = nfs4_handle_exception(server, err, &src_exception); @@ -263,6 +589,269 @@ out_put_src_lock: return err; } +struct nfs42_offload_data { + struct nfs_server *seq_server; + struct nfs42_offload_status_args args; + struct nfs42_offload_status_res res; +}; + +static void nfs42_offload_prepare(struct rpc_task *task, void *calldata) +{ + struct nfs42_offload_data *data = calldata; + + nfs4_setup_sequence(data->seq_server->nfs_client, + &data->args.osa_seq_args, + &data->res.osr_seq_res, task); +} + +static void nfs42_offload_cancel_done(struct rpc_task *task, void *calldata) +{ + struct nfs42_offload_data *data = calldata; + + trace_nfs4_offload_cancel(&data->args, task->tk_status); + nfs41_sequence_done(task, &data->res.osr_seq_res); + if (task->tk_status && + nfs4_async_handle_error(task, data->seq_server, NULL, + NULL) == -EAGAIN) + rpc_restart_call_prepare(task); +} + +static void nfs42_offload_release(void *data) +{ + kfree(data); +} + +static const struct rpc_call_ops nfs42_offload_cancel_ops = { + .rpc_call_prepare = nfs42_offload_prepare, + .rpc_call_done = nfs42_offload_cancel_done, + .rpc_release = nfs42_offload_release, +}; + +static int nfs42_do_offload_cancel_async(struct file *dst, + nfs4_stateid *stateid) +{ + struct nfs_server *dst_server = NFS_SERVER(file_inode(dst)); + struct nfs42_offload_data *data = NULL; + struct nfs_open_context *ctx = nfs_file_open_context(dst); + struct rpc_task *task; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OFFLOAD_CANCEL], + .rpc_cred = ctx->cred, + }; + struct rpc_task_setup task_setup_data = { + .rpc_client = dst_server->client, + .rpc_message = &msg, + .callback_ops = &nfs42_offload_cancel_ops, + .workqueue = nfsiod_workqueue, + .flags = RPC_TASK_ASYNC | RPC_TASK_MOVEABLE, + }; + int status; + + if (!(dst_server->caps & NFS_CAP_OFFLOAD_CANCEL)) + return -EOPNOTSUPP; + + data = kzalloc(sizeof(struct nfs42_offload_data), GFP_KERNEL); + if (data == NULL) + return -ENOMEM; + + data->seq_server = dst_server; + data->args.osa_src_fh = NFS_FH(file_inode(dst)); + memcpy(&data->args.osa_stateid, stateid, + sizeof(data->args.osa_stateid)); + msg.rpc_argp = &data->args; + msg.rpc_resp = &data->res; + task_setup_data.callback_data = data; + nfs4_init_sequence(&data->args.osa_seq_args, &data->res.osr_seq_res, + 1, 0); + task = rpc_run_task(&task_setup_data); + if (IS_ERR(task)) + return PTR_ERR(task); + status = rpc_wait_for_completion_task(task); + if (status == -ENOTSUPP) + dst_server->caps &= ~NFS_CAP_OFFLOAD_CANCEL; + rpc_put_task(task); + return status; +} + +static int +_nfs42_proc_offload_status(struct nfs_server *server, struct file *file, + struct nfs42_offload_data *data) +{ + struct nfs_open_context *ctx = nfs_file_open_context(file); + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OFFLOAD_STATUS], + .rpc_argp = &data->args, + .rpc_resp = &data->res, + .rpc_cred = ctx->cred, + }; + int status; + + status = nfs4_call_sync(server->client, server, &msg, + &data->args.osa_seq_args, + &data->res.osr_seq_res, 1); + trace_nfs4_offload_status(&data->args, status); + switch (status) { + case 0: + break; + + case -NFS4ERR_ADMIN_REVOKED: + case -NFS4ERR_BAD_STATEID: + case -NFS4ERR_OLD_STATEID: + /* + * Server does not recognize the COPY stateid. CB_OFFLOAD + * could have purged it, or server might have rebooted. + * Since COPY stateids don't have an associated inode, + * avoid triggering state recovery. + */ + status = -EBADF; + break; + case -NFS4ERR_NOTSUPP: + case -ENOTSUPP: + case -EOPNOTSUPP: + server->caps &= ~NFS_CAP_OFFLOAD_STATUS; + status = -EOPNOTSUPP; + break; + } + + return status; +} + +/** + * nfs42_proc_offload_status - Poll completion status of an async copy operation + * @dst: handle of file being copied into + * @stateid: copy stateid (from async COPY result) + * @copied: OUT: number of bytes copied so far + * + * Return values: + * %0: Server returned an NFS4_OK completion status + * %-EINPROGRESS: Server returned no completion status + * %-EREMOTEIO: Server returned an error completion status + * %-EBADF: Server did not recognize the copy stateid + * %-EOPNOTSUPP: Server does not support OFFLOAD_STATUS + * %-ERESTARTSYS: Wait interrupted by signal + * + * Other negative errnos indicate the client could not complete the + * request. + */ +static int +nfs42_proc_offload_status(struct file *dst, nfs4_stateid *stateid, u64 *copied) +{ + struct inode *inode = file_inode(dst); + struct nfs_server *server = NFS_SERVER(inode); + struct nfs4_exception exception = { + .inode = inode, + }; + struct nfs42_offload_data *data; + int status; + + if (!(server->caps & NFS_CAP_OFFLOAD_STATUS)) + return -EOPNOTSUPP; + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + return -ENOMEM; + data->seq_server = server; + data->args.osa_src_fh = NFS_FH(inode); + memcpy(&data->args.osa_stateid, stateid, + sizeof(data->args.osa_stateid)); + exception.stateid = &data->args.osa_stateid; + do { + status = _nfs42_proc_offload_status(server, dst, data); + if (status == -EOPNOTSUPP) + goto out; + status = nfs4_handle_exception(server, status, &exception); + } while (exception.retry); + if (status) + goto out; + + *copied = data->res.osr_count; + if (!data->res.complete_count) + status = -EINPROGRESS; + else if (data->res.osr_complete != NFS_OK) + status = -EREMOTEIO; + +out: + kfree(data); + return status; +} + +static int _nfs42_proc_copy_notify(struct file *src, struct file *dst, + struct nfs42_copy_notify_args *args, + struct nfs42_copy_notify_res *res) +{ + struct nfs_server *src_server = NFS_SERVER(file_inode(src)); + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_COPY_NOTIFY], + .rpc_argp = args, + .rpc_resp = res, + }; + int status; + struct nfs_open_context *ctx; + struct nfs_lock_context *l_ctx; + + ctx = get_nfs_open_context(nfs_file_open_context(src)); + l_ctx = nfs_get_lock_context(ctx); + if (IS_ERR(l_ctx)) { + status = PTR_ERR(l_ctx); + goto out; + } + + status = nfs4_set_rw_stateid(&args->cna_src_stateid, ctx, l_ctx, + FMODE_READ); + nfs_put_lock_context(l_ctx); + if (status) { + if (status == -EAGAIN) + status = -NFS4ERR_BAD_STATEID; + goto out; + } + + status = nfs4_call_sync(src_server->client, src_server, &msg, + &args->cna_seq_args, &res->cnr_seq_res, 0); + trace_nfs4_copy_notify(file_inode(src), args, res, status); + if (status == -ENOTSUPP) + src_server->caps &= ~NFS_CAP_COPY_NOTIFY; + +out: + put_nfs_open_context(nfs_file_open_context(src)); + return status; +} + +int nfs42_proc_copy_notify(struct file *src, struct file *dst, + struct nfs42_copy_notify_res *res) +{ + struct nfs_server *src_server = NFS_SERVER(file_inode(src)); + struct nfs42_copy_notify_args *args; + struct nfs4_exception exception = { + .inode = file_inode(src), + }; + int status; + + if (!(src_server->caps & NFS_CAP_COPY_NOTIFY)) + return -EOPNOTSUPP; + + args = kzalloc(sizeof(struct nfs42_copy_notify_args), GFP_KERNEL); + if (args == NULL) + return -ENOMEM; + + args->cna_src_fh = NFS_FH(file_inode(src)), + args->cna_dst.nl4_type = NL4_NETADDR; + nfs42_set_netaddr(dst, &args->cna_dst.u.nl4_addr); + exception.stateid = &args->cna_src_stateid; + + do { + status = _nfs42_proc_copy_notify(src, dst, args, res); + if (status == -ENOTSUPP) { + status = -EOPNOTSUPP; + goto out; + } + status = nfs4_handle_exception(src_server, status, &exception); + } while (exception.retry); + +out: + kfree(args); + return status; +} + static loff_t _nfs42_proc_llseek(struct file *filep, struct nfs_lock_context *lock, loff_t offset, int whence) { @@ -287,8 +876,11 @@ static loff_t _nfs42_proc_llseek(struct file *filep, status = nfs4_set_rw_stateid(&args.sa_stateid, lock->open_context, lock, FMODE_READ); - if (status) + if (status) { + if (status == -EAGAIN) + status = -NFS4ERR_BAD_STATEID; return status; + } status = nfs_filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX); @@ -297,12 +889,16 @@ static loff_t _nfs42_proc_llseek(struct file *filep, status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0); + trace_nfs4_llseek(inode, &args, &res, status); if (status == -ENOTSUPP) server->caps &= ~NFS_CAP_SEEK; if (status) return status; - return vfs_setpos(filep, res.sr_offset, inode->i_sb->s_maxbytes); + if (whence == SEEK_DATA && res.sr_eof) + return -NFS4ERR_NXIO; + else + return vfs_setpos(filep, res.sr_offset, inode->i_sb->s_maxbytes); } loff_t nfs42_proc_llseek(struct file *filep, loff_t offset, int whence) @@ -368,6 +964,10 @@ nfs42_layoutstat_done(struct rpc_task *task, void *calldata) switch (task->tk_status) { case 0: + return; + case -NFS4ERR_BADHANDLE: + case -ESTALE: + pnfs_destroy_layout(NFS_I(inode)); break; case -NFS4ERR_EXPIRED: case -NFS4ERR_ADMIN_REVOKED: @@ -410,6 +1010,8 @@ nfs42_layoutstat_done(struct rpc_task *task, void *calldata) case -EOPNOTSUPP: NFS_SERVER(inode)->caps &= ~NFS_CAP_LAYOUTSTATS; } + + trace_nfs4_layoutstats(inode, &data->args.stateid, task->tk_status); } static void @@ -452,7 +1054,7 @@ int nfs42_proc_layoutstats_generic(struct nfs_server *server, .rpc_message = &msg, .callback_ops = &nfs42_layoutstat_ops, .callback_data = data, - .flags = RPC_TASK_ASYNC, + .flags = RPC_TASK_ASYNC | RPC_TASK_MOVEABLE, }; struct rpc_task *task; @@ -461,7 +1063,7 @@ int nfs42_proc_layoutstats_generic(struct nfs_server *server, nfs42_layoutstat_release(data); return -EAGAIN; } - nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 0); + nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 0, 0); task = rpc_run_task(&task_setup); if (IS_ERR(task)) return PTR_ERR(task); @@ -469,6 +1071,173 @@ int nfs42_proc_layoutstats_generic(struct nfs_server *server, return 0; } +static struct nfs42_layouterror_data * +nfs42_alloc_layouterror_data(struct pnfs_layout_segment *lseg, gfp_t gfp_flags) +{ + struct nfs42_layouterror_data *data; + struct inode *inode = lseg->pls_layout->plh_inode; + + data = kzalloc(sizeof(*data), gfp_flags); + if (data) { + data->args.inode = data->inode = nfs_igrab_and_active(inode); + if (data->inode) { + data->lseg = pnfs_get_lseg(lseg); + if (data->lseg) + return data; + nfs_iput_and_deactive(data->inode); + } + kfree(data); + } + return NULL; +} + +static void +nfs42_free_layouterror_data(struct nfs42_layouterror_data *data) +{ + pnfs_put_lseg(data->lseg); + nfs_iput_and_deactive(data->inode); + kfree(data); +} + +static void +nfs42_layouterror_prepare(struct rpc_task *task, void *calldata) +{ + struct nfs42_layouterror_data *data = calldata; + struct inode *inode = data->inode; + struct nfs_server *server = NFS_SERVER(inode); + struct pnfs_layout_hdr *lo = data->lseg->pls_layout; + unsigned i; + + spin_lock(&inode->i_lock); + if (!pnfs_layout_is_valid(lo)) { + spin_unlock(&inode->i_lock); + rpc_exit(task, 0); + return; + } + for (i = 0; i < data->args.num_errors; i++) + nfs4_stateid_copy(&data->args.errors[i].stateid, + &lo->plh_stateid); + spin_unlock(&inode->i_lock); + nfs4_setup_sequence(server->nfs_client, &data->args.seq_args, + &data->res.seq_res, task); +} + +static void +nfs42_layouterror_done(struct rpc_task *task, void *calldata) +{ + struct nfs42_layouterror_data *data = calldata; + struct inode *inode = data->inode; + struct pnfs_layout_hdr *lo = data->lseg->pls_layout; + + if (!nfs4_sequence_done(task, &data->res.seq_res)) + return; + + switch (task->tk_status) { + case 0: + return; + case -NFS4ERR_BADHANDLE: + case -ESTALE: + pnfs_destroy_layout(NFS_I(inode)); + break; + case -NFS4ERR_EXPIRED: + case -NFS4ERR_ADMIN_REVOKED: + case -NFS4ERR_DELEG_REVOKED: + case -NFS4ERR_STALE_STATEID: + case -NFS4ERR_BAD_STATEID: + spin_lock(&inode->i_lock); + if (pnfs_layout_is_valid(lo) && + nfs4_stateid_match(&data->args.errors[0].stateid, + &lo->plh_stateid)) { + LIST_HEAD(head); + + /* + * Mark the bad layout state as invalid, then retry + * with the current stateid. + */ + pnfs_mark_layout_stateid_invalid(lo, &head); + spin_unlock(&inode->i_lock); + pnfs_free_lseg_list(&head); + nfs_commit_inode(inode, 0); + } else + spin_unlock(&inode->i_lock); + break; + case -NFS4ERR_OLD_STATEID: + spin_lock(&inode->i_lock); + if (pnfs_layout_is_valid(lo) && + nfs4_stateid_match_other(&data->args.errors[0].stateid, + &lo->plh_stateid)) { + /* Do we need to delay before resending? */ + if (!nfs4_stateid_is_newer(&lo->plh_stateid, + &data->args.errors[0].stateid)) + rpc_delay(task, HZ); + rpc_restart_call_prepare(task); + } + spin_unlock(&inode->i_lock); + break; + case -ENOTSUPP: + case -EOPNOTSUPP: + NFS_SERVER(inode)->caps &= ~NFS_CAP_LAYOUTERROR; + } + + trace_nfs4_layouterror(inode, &data->args.errors[0].stateid, + task->tk_status); +} + +static void +nfs42_layouterror_release(void *calldata) +{ + struct nfs42_layouterror_data *data = calldata; + + nfs42_free_layouterror_data(data); +} + +static const struct rpc_call_ops nfs42_layouterror_ops = { + .rpc_call_prepare = nfs42_layouterror_prepare, + .rpc_call_done = nfs42_layouterror_done, + .rpc_release = nfs42_layouterror_release, +}; + +int nfs42_proc_layouterror(struct pnfs_layout_segment *lseg, + const struct nfs42_layout_error *errors, size_t n) +{ + struct inode *inode = lseg->pls_layout->plh_inode; + struct nfs42_layouterror_data *data; + struct rpc_task *task; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTERROR], + }; + struct rpc_task_setup task_setup = { + .rpc_message = &msg, + .callback_ops = &nfs42_layouterror_ops, + .flags = RPC_TASK_ASYNC | RPC_TASK_MOVEABLE, + }; + unsigned int i; + + if (!nfs_server_capable(inode, NFS_CAP_LAYOUTERROR)) + return -EOPNOTSUPP; + if (n > NFS42_LAYOUTERROR_MAX) + return -EINVAL; + data = nfs42_alloc_layouterror_data(lseg, nfs_io_gfp_mask()); + if (!data) + return -ENOMEM; + for (i = 0; i < n; i++) { + data->args.errors[i] = errors[i]; + data->args.num_errors++; + data->res.num_errors++; + } + msg.rpc_argp = &data->args; + msg.rpc_resp = &data->res; + task_setup.callback_data = data; + task_setup.rpc_client = NFS_SERVER(inode)->client; + nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 0, 0); + task = rpc_run_task(&task_setup); + if (IS_ERR(task)) + return PTR_ERR(task); + rpc_put_task(task); + return 0; +} +EXPORT_SYMBOL_GPL(nfs42_proc_layouterror); + static int _nfs42_proc_clone(struct rpc_message *msg, struct file *src_f, struct file *dst_f, struct nfs_lock_context *src_lock, struct nfs_lock_context *dst_lock, loff_t src_offset, @@ -477,17 +1246,19 @@ static int _nfs42_proc_clone(struct rpc_message *msg, struct file *src_f, struct inode *src_inode = file_inode(src_f); struct inode *dst_inode = file_inode(dst_f); struct nfs_server *server = NFS_SERVER(dst_inode); + __u32 dst_bitmask[NFS_BITMASK_SZ]; struct nfs42_clone_args args = { .src_fh = NFS_FH(src_inode), .dst_fh = NFS_FH(dst_inode), .src_offset = src_offset, .dst_offset = dst_offset, .count = count, - .dst_bitmask = server->cache_consistency_bitmask, + .dst_bitmask = dst_bitmask, }; struct nfs42_clone_res res = { .server = server, }; + loff_t oldsize_dst = i_size_read(dst_inode); int status; msg->rpc_argp = &args; @@ -495,22 +1266,36 @@ static int _nfs42_proc_clone(struct rpc_message *msg, struct file *src_f, status = nfs4_set_rw_stateid(&args.src_stateid, src_lock->open_context, src_lock, FMODE_READ); - if (status) + if (status) { + if (status == -EAGAIN) + status = -NFS4ERR_BAD_STATEID; return status; - + } status = nfs4_set_rw_stateid(&args.dst_stateid, dst_lock->open_context, dst_lock, FMODE_WRITE); - if (status) + if (status) { + if (status == -EAGAIN) + status = -NFS4ERR_BAD_STATEID; return status; + } res.dst_fattr = nfs_alloc_fattr(); if (!res.dst_fattr) return -ENOMEM; + nfs4_bitmask_set(dst_bitmask, server->cache_consistency_bitmask, + dst_inode, NFS_INO_INVALID_BLOCKS); + status = nfs4_call_sync(server->client, server, msg, &args.seq_args, &res.seq_res, 0); - if (status == 0) + trace_nfs4_clone(src_inode, dst_inode, &args, status); + if (status == 0) { + /* a zero-length count means clone to EOF in src */ + if (count == 0 && res.dst_fattr->valid & NFS_ATTR_FATTR_SIZE) + count = nfs_size_to_loff_t(res.dst_fattr->size) - dst_offset; + nfs42_copy_dest_done(dst_f, dst_offset, count, oldsize_dst); status = nfs_post_op_update_inode(dst_inode, res.dst_fattr); + } kfree(res.dst_fattr); return status; @@ -569,3 +1354,302 @@ out_put_src_lock: nfs_put_lock_context(src_lock); return err; } + +#define NFS4XATTR_MAXPAGES DIV_ROUND_UP(XATTR_SIZE_MAX, PAGE_SIZE) + +static int _nfs42_proc_removexattr(struct inode *inode, const char *name) +{ + struct nfs_server *server = NFS_SERVER(inode); + struct nfs42_removexattrargs args = { + .fh = NFS_FH(inode), + .xattr_name = name, + }; + struct nfs42_removexattrres res; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_REMOVEXATTR], + .rpc_argp = &args, + .rpc_resp = &res, + }; + int ret; + unsigned long timestamp = jiffies; + + ret = nfs4_call_sync(server->client, server, &msg, &args.seq_args, + &res.seq_res, 1); + trace_nfs4_removexattr(inode, name, ret); + if (!ret) + nfs4_update_changeattr(inode, &res.cinfo, timestamp, 0); + + return ret; +} + +static int _nfs42_proc_setxattr(struct inode *inode, const char *name, + const void *buf, size_t buflen, int flags) +{ + struct nfs_server *server = NFS_SERVER(inode); + __u32 bitmask[NFS_BITMASK_SZ]; + struct page *pages[NFS4XATTR_MAXPAGES]; + struct nfs42_setxattrargs arg = { + .fh = NFS_FH(inode), + .bitmask = bitmask, + .xattr_pages = pages, + .xattr_len = buflen, + .xattr_name = name, + .xattr_flags = flags, + }; + struct nfs42_setxattrres res = { + .server = server, + }; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETXATTR], + .rpc_argp = &arg, + .rpc_resp = &res, + }; + int ret, np; + unsigned long timestamp = jiffies; + + if (buflen > server->sxasize) + return -ERANGE; + + res.fattr = nfs_alloc_fattr(); + if (!res.fattr) + return -ENOMEM; + + if (buflen > 0) { + np = nfs4_buf_to_pages_noslab(buf, buflen, arg.xattr_pages); + if (np < 0) { + ret = np; + goto out; + } + } else + np = 0; + + nfs4_bitmask_set(bitmask, server->cache_consistency_bitmask, + inode, NFS_INO_INVALID_CHANGE); + + ret = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, + &res.seq_res, 1); + trace_nfs4_setxattr(inode, name, ret); + + for (; np > 0; np--) + put_page(pages[np - 1]); + + if (!ret) { + nfs4_update_changeattr(inode, &res.cinfo, timestamp, 0); + ret = nfs_post_op_update_inode(inode, res.fattr); + } + +out: + kfree(res.fattr); + return ret; +} + +static ssize_t _nfs42_proc_getxattr(struct inode *inode, const char *name, + void *buf, size_t buflen, struct page **pages, + size_t plen) +{ + struct nfs_server *server = NFS_SERVER(inode); + struct nfs42_getxattrargs arg = { + .fh = NFS_FH(inode), + .xattr_name = name, + }; + struct nfs42_getxattrres res; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GETXATTR], + .rpc_argp = &arg, + .rpc_resp = &res, + }; + ssize_t ret; + + arg.xattr_len = plen; + arg.xattr_pages = pages; + + ret = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, + &res.seq_res, 0); + trace_nfs4_getxattr(inode, name, ret); + if (ret < 0) + return ret; + + /* + * Normally, the caching is done one layer up, but for successful + * RPCS, always cache the result here, even if the caller was + * just querying the length, or if the reply was too big for + * the caller. This avoids a second RPC in the case of the + * common query-alloc-retrieve cycle for xattrs. + * + * Note that xattr_len is always capped to XATTR_SIZE_MAX. + */ + + nfs4_xattr_cache_add(inode, name, NULL, pages, res.xattr_len); + + if (buflen) { + if (res.xattr_len > buflen) + return -ERANGE; + _copy_from_pages(buf, pages, 0, res.xattr_len); + } + + return res.xattr_len; +} + +static ssize_t _nfs42_proc_listxattrs(struct inode *inode, void *buf, + size_t buflen, u64 *cookiep, bool *eofp) +{ + struct nfs_server *server = NFS_SERVER(inode); + struct page **pages; + struct nfs42_listxattrsargs arg = { + .fh = NFS_FH(inode), + .cookie = *cookiep, + }; + struct nfs42_listxattrsres res = { + .eof = false, + .xattr_buf = buf, + .xattr_len = buflen, + }; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LISTXATTRS], + .rpc_argp = &arg, + .rpc_resp = &res, + }; + u32 xdrlen; + int ret, np, i; + + + ret = -ENOMEM; + res.scratch = folio_alloc(GFP_KERNEL, 0); + if (!res.scratch) + goto out; + + xdrlen = nfs42_listxattr_xdrsize(buflen); + if (xdrlen > server->lxasize) + xdrlen = server->lxasize; + np = xdrlen / PAGE_SIZE + 1; + + pages = kcalloc(np, sizeof(struct page *), GFP_KERNEL); + if (!pages) + goto out_free_scratch; + for (i = 0; i < np; i++) { + pages[i] = alloc_page(GFP_KERNEL); + if (!pages[i]) + goto out_free_pages; + } + + arg.xattr_pages = pages; + arg.count = xdrlen; + + ret = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, + &res.seq_res, 0); + trace_nfs4_listxattr(inode, ret); + + if (ret >= 0) { + ret = res.copied; + *cookiep = res.cookie; + *eofp = res.eof; + } + +out_free_pages: + while (--np >= 0) { + if (pages[np]) + __free_page(pages[np]); + } + kfree(pages); +out_free_scratch: + folio_put(res.scratch); +out: + return ret; + +} + +ssize_t nfs42_proc_getxattr(struct inode *inode, const char *name, + void *buf, size_t buflen) +{ + struct nfs4_exception exception = { }; + ssize_t err, np, i; + struct page **pages; + + np = nfs_page_array_len(0, buflen ?: XATTR_SIZE_MAX); + pages = kmalloc_array(np, sizeof(*pages), GFP_KERNEL); + if (!pages) + return -ENOMEM; + + for (i = 0; i < np; i++) { + pages[i] = alloc_page(GFP_KERNEL); + if (!pages[i]) { + err = -ENOMEM; + goto out; + } + } + + /* + * The GETXATTR op has no length field in the call, and the + * xattr data is at the end of the reply. + * + * There is no downside in using the page-aligned length. It will + * allow receiving and caching xattrs that are too large for the + * caller but still fit in the page-rounded value. + */ + do { + err = _nfs42_proc_getxattr(inode, name, buf, buflen, + pages, np * PAGE_SIZE); + if (err >= 0) + break; + err = nfs4_handle_exception(NFS_SERVER(inode), err, + &exception); + } while (exception.retry); + +out: + while (--i >= 0) + __free_page(pages[i]); + kfree(pages); + + return err; +} + +int nfs42_proc_setxattr(struct inode *inode, const char *name, + const void *buf, size_t buflen, int flags) +{ + struct nfs4_exception exception = { }; + int err; + + do { + err = _nfs42_proc_setxattr(inode, name, buf, buflen, flags); + if (!err) + break; + err = nfs4_handle_exception(NFS_SERVER(inode), err, + &exception); + } while (exception.retry); + + return err; +} + +ssize_t nfs42_proc_listxattrs(struct inode *inode, void *buf, + size_t buflen, u64 *cookiep, bool *eofp) +{ + struct nfs4_exception exception = { }; + ssize_t err; + + do { + err = _nfs42_proc_listxattrs(inode, buf, buflen, + cookiep, eofp); + if (err >= 0) + break; + err = nfs4_handle_exception(NFS_SERVER(inode), err, + &exception); + } while (exception.retry); + + return err; +} + +int nfs42_proc_removexattr(struct inode *inode, const char *name) +{ + struct nfs4_exception exception = { }; + int err; + + do { + err = _nfs42_proc_removexattr(inode, name); + if (!err) + break; + err = nfs4_handle_exception(NFS_SERVER(inode), err, + &exception); + } while (exception.retry); + + return err; +} diff --git a/fs/nfs/nfs42xattr.c b/fs/nfs/nfs42xattr.c new file mode 100644 index 000000000000..37d79400e5f4 --- /dev/null +++ b/fs/nfs/nfs42xattr.c @@ -0,0 +1,1067 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Copyright 2019, 2020 Amazon.com, Inc. or its affiliates. All rights reserved. + * + * User extended attribute client side cache functions. + * + * Author: Frank van der Linden <fllinden@amazon.com> + */ +#include <linux/errno.h> +#include <linux/nfs_fs.h> +#include <linux/hashtable.h> +#include <linux/refcount.h> +#include <uapi/linux/xattr.h> + +#include "nfs4_fs.h" +#include "internal.h" + +/* + * User extended attributes client side caching is implemented by having + * a cache structure attached to NFS inodes. This structure is allocated + * when needed, and freed when the cache is zapped. + * + * The cache structure contains as hash table of entries, and a pointer + * to a special-cased entry for the listxattr cache. + * + * Accessing and allocating / freeing the caches is done via reference + * counting. The cache entries use a similar refcounting scheme. + * + * This makes freeing a cache, both from the shrinker and from the + * zap cache path, easy. It also means that, in current use cases, + * the large majority of inodes will not waste any memory, as they + * will never have any user extended attributes assigned to them. + * + * Attribute entries are hashed in to a simple hash table. They are + * also part of an LRU. + * + * There are three shrinkers. + * + * Two shrinkers deal with the cache entries themselves: one for + * large entries (> PAGE_SIZE), and one for smaller entries. The + * shrinker for the larger entries works more aggressively than + * those for the smaller entries. + * + * The other shrinker frees the cache structures themselves. + */ + +/* + * 64 buckets is a good default. There is likely no reasonable + * workload that uses more than even 64 user extended attributes. + * You can certainly add a lot more - but you get what you ask for + * in those circumstances. + */ +#define NFS4_XATTR_HASH_SIZE 64 + +#define NFSDBG_FACILITY NFSDBG_XATTRCACHE + +struct nfs4_xattr_cache; +struct nfs4_xattr_entry; + +struct nfs4_xattr_bucket { + spinlock_t lock; + struct hlist_head hlist; + struct nfs4_xattr_cache *cache; + bool draining; +}; + +struct nfs4_xattr_cache { + struct kref ref; + struct nfs4_xattr_bucket buckets[NFS4_XATTR_HASH_SIZE]; + struct list_head lru; + struct list_head dispose; + atomic_long_t nent; + spinlock_t listxattr_lock; + struct inode *inode; + struct nfs4_xattr_entry *listxattr; +}; + +struct nfs4_xattr_entry { + struct kref ref; + struct hlist_node hnode; + struct list_head lru; + struct list_head dispose; + char *xattr_name; + void *xattr_value; + size_t xattr_size; + struct nfs4_xattr_bucket *bucket; + uint32_t flags; +}; + +#define NFS4_XATTR_ENTRY_EXTVAL 0x0001 + +/* + * LRU list of NFS inodes that have xattr caches. + */ +static struct list_lru nfs4_xattr_cache_lru; +static struct list_lru nfs4_xattr_entry_lru; +static struct list_lru nfs4_xattr_large_entry_lru; + +static struct kmem_cache *nfs4_xattr_cache_cachep; + +/* + * Hashing helper functions. + */ +static void +nfs4_xattr_hash_init(struct nfs4_xattr_cache *cache) +{ + unsigned int i; + + for (i = 0; i < NFS4_XATTR_HASH_SIZE; i++) { + INIT_HLIST_HEAD(&cache->buckets[i].hlist); + spin_lock_init(&cache->buckets[i].lock); + cache->buckets[i].cache = cache; + cache->buckets[i].draining = false; + } +} + +/* + * Locking order: + * 1. inode i_lock or bucket lock + * 2. list_lru lock (taken by list_lru_* functions) + */ + +/* + * Wrapper functions to add a cache entry to the right LRU. + */ +static bool +nfs4_xattr_entry_lru_add(struct nfs4_xattr_entry *entry) +{ + struct list_lru *lru; + + lru = (entry->flags & NFS4_XATTR_ENTRY_EXTVAL) ? + &nfs4_xattr_large_entry_lru : &nfs4_xattr_entry_lru; + + return list_lru_add_obj(lru, &entry->lru); +} + +static bool +nfs4_xattr_entry_lru_del(struct nfs4_xattr_entry *entry) +{ + struct list_lru *lru; + + lru = (entry->flags & NFS4_XATTR_ENTRY_EXTVAL) ? + &nfs4_xattr_large_entry_lru : &nfs4_xattr_entry_lru; + + return list_lru_del_obj(lru, &entry->lru); +} + +/* + * This function allocates cache entries. They are the normal + * extended attribute name/value pairs, but may also be a listxattr + * cache. Those allocations use the same entry so that they can be + * treated as one by the memory shrinker. + * + * xattr cache entries are allocated together with names. If the + * value fits in to one page with the entry structure and the name, + * it will also be part of the same allocation (kmalloc). This is + * expected to be the vast majority of cases. Larger allocations + * have a value pointer that is allocated separately by kvmalloc. + * + * Parameters: + * + * @name: Name of the extended attribute. NULL for listxattr cache + * entry. + * @value: Value of attribute, or listxattr cache. NULL if the + * value is to be copied from pages instead. + * @pages: Pages to copy the value from, if not NULL. Passed in to + * make it easier to copy the value after an RPC, even if + * the value will not be passed up to application (e.g. + * for a 'query' getxattr with NULL buffer). + * @len: Length of the value. Can be 0 for zero-length attributes. + * @value and @pages will be NULL if @len is 0. + */ +static struct nfs4_xattr_entry * +nfs4_xattr_alloc_entry(const char *name, const void *value, + struct page **pages, size_t len) +{ + struct nfs4_xattr_entry *entry; + void *valp; + char *namep; + size_t alloclen, slen; + char *buf; + uint32_t flags; + + BUILD_BUG_ON(sizeof(struct nfs4_xattr_entry) + + XATTR_NAME_MAX + 1 > PAGE_SIZE); + + alloclen = sizeof(struct nfs4_xattr_entry); + if (name != NULL) { + slen = strlen(name) + 1; + alloclen += slen; + } else + slen = 0; + + if (alloclen + len <= PAGE_SIZE) { + alloclen += len; + flags = 0; + } else { + flags = NFS4_XATTR_ENTRY_EXTVAL; + } + + buf = kmalloc(alloclen, GFP_KERNEL); + if (buf == NULL) + return NULL; + entry = (struct nfs4_xattr_entry *)buf; + + if (name != NULL) { + namep = buf + sizeof(struct nfs4_xattr_entry); + memcpy(namep, name, slen); + } else { + namep = NULL; + } + + + if (flags & NFS4_XATTR_ENTRY_EXTVAL) { + valp = kvmalloc(len, GFP_KERNEL); + if (valp == NULL) { + kfree(buf); + return NULL; + } + } else if (len != 0) { + valp = buf + sizeof(struct nfs4_xattr_entry) + slen; + } else + valp = NULL; + + if (valp != NULL) { + if (value != NULL) + memcpy(valp, value, len); + else + _copy_from_pages(valp, pages, 0, len); + } + + entry->flags = flags; + entry->xattr_value = valp; + kref_init(&entry->ref); + entry->xattr_name = namep; + entry->xattr_size = len; + entry->bucket = NULL; + INIT_LIST_HEAD(&entry->lru); + INIT_LIST_HEAD(&entry->dispose); + INIT_HLIST_NODE(&entry->hnode); + + return entry; +} + +static void +nfs4_xattr_free_entry(struct nfs4_xattr_entry *entry) +{ + if (entry->flags & NFS4_XATTR_ENTRY_EXTVAL) + kvfree(entry->xattr_value); + kfree(entry); +} + +static void +nfs4_xattr_free_entry_cb(struct kref *kref) +{ + struct nfs4_xattr_entry *entry; + + entry = container_of(kref, struct nfs4_xattr_entry, ref); + + if (WARN_ON(!list_empty(&entry->lru))) + return; + + nfs4_xattr_free_entry(entry); +} + +static void +nfs4_xattr_free_cache_cb(struct kref *kref) +{ + struct nfs4_xattr_cache *cache; + int i; + + cache = container_of(kref, struct nfs4_xattr_cache, ref); + + for (i = 0; i < NFS4_XATTR_HASH_SIZE; i++) { + if (WARN_ON(!hlist_empty(&cache->buckets[i].hlist))) + return; + cache->buckets[i].draining = false; + } + + cache->listxattr = NULL; + + kmem_cache_free(nfs4_xattr_cache_cachep, cache); + +} + +static struct nfs4_xattr_cache * +nfs4_xattr_alloc_cache(void) +{ + struct nfs4_xattr_cache *cache; + + cache = kmem_cache_alloc(nfs4_xattr_cache_cachep, GFP_KERNEL); + if (cache == NULL) + return NULL; + + kref_init(&cache->ref); + atomic_long_set(&cache->nent, 0); + + return cache; +} + +/* + * Set the listxattr cache, which is a special-cased cache entry. + * The special value ERR_PTR(-ESTALE) is used to indicate that + * the cache is being drained - this prevents a new listxattr + * cache from being added to what is now a stale cache. + */ +static int +nfs4_xattr_set_listcache(struct nfs4_xattr_cache *cache, + struct nfs4_xattr_entry *new) +{ + struct nfs4_xattr_entry *old; + int ret = 1; + + spin_lock(&cache->listxattr_lock); + + old = cache->listxattr; + + if (old == ERR_PTR(-ESTALE)) { + ret = 0; + goto out; + } + + cache->listxattr = new; + if (new != NULL && new != ERR_PTR(-ESTALE)) + nfs4_xattr_entry_lru_add(new); + + if (old != NULL) { + nfs4_xattr_entry_lru_del(old); + kref_put(&old->ref, nfs4_xattr_free_entry_cb); + } +out: + spin_unlock(&cache->listxattr_lock); + + return ret; +} + +/* + * Unlink a cache from its parent inode, clearing out an invalid + * cache. Must be called with i_lock held. + */ +static struct nfs4_xattr_cache * +nfs4_xattr_cache_unlink(struct inode *inode) +{ + struct nfs_inode *nfsi; + struct nfs4_xattr_cache *oldcache; + + nfsi = NFS_I(inode); + + oldcache = nfsi->xattr_cache; + if (oldcache != NULL) { + list_lru_del_obj(&nfs4_xattr_cache_lru, &oldcache->lru); + oldcache->inode = NULL; + } + nfsi->xattr_cache = NULL; + nfsi->cache_validity &= ~NFS_INO_INVALID_XATTR; + + return oldcache; + +} + +/* + * Discard a cache. Called by get_cache() if there was an old, + * invalid cache. Can also be called from a shrinker callback. + * + * The cache is dead, it has already been unlinked from its inode, + * and no longer appears on the cache LRU list. + * + * Mark all buckets as draining, so that no new entries are added. This + * could still happen in the unlikely, but possible case that another + * thread had grabbed a reference before it was unlinked from the inode, + * and is still holding it for an add operation. + * + * Remove all entries from the LRU lists, so that there is no longer + * any way to 'find' this cache. Then, remove the entries from the hash + * table. + * + * At that point, the cache will remain empty and can be freed when the final + * reference drops, which is very likely the kref_put at the end of + * this function, or the one called immediately afterwards in the + * shrinker callback. + */ +static void +nfs4_xattr_discard_cache(struct nfs4_xattr_cache *cache) +{ + unsigned int i; + struct nfs4_xattr_entry *entry; + struct nfs4_xattr_bucket *bucket; + struct hlist_node *n; + + nfs4_xattr_set_listcache(cache, ERR_PTR(-ESTALE)); + + for (i = 0; i < NFS4_XATTR_HASH_SIZE; i++) { + bucket = &cache->buckets[i]; + + spin_lock(&bucket->lock); + bucket->draining = true; + hlist_for_each_entry_safe(entry, n, &bucket->hlist, hnode) { + nfs4_xattr_entry_lru_del(entry); + hlist_del_init(&entry->hnode); + kref_put(&entry->ref, nfs4_xattr_free_entry_cb); + } + spin_unlock(&bucket->lock); + } + + atomic_long_set(&cache->nent, 0); + + kref_put(&cache->ref, nfs4_xattr_free_cache_cb); +} + +/* + * Get a referenced copy of the cache structure. Avoid doing allocs + * while holding i_lock. Which means that we do some optimistic allocation, + * and might have to free the result in rare cases. + * + * This function only checks the NFS_INO_INVALID_XATTR cache validity bit + * and acts accordingly, replacing the cache when needed. For the read case + * (!add), this means that the caller must make sure that the cache + * is valid before caling this function. getxattr and listxattr call + * revalidate_inode to do this. The attribute cache timeout (for the + * non-delegated case) is expected to be dealt with in the revalidate + * call. + */ + +static struct nfs4_xattr_cache * +nfs4_xattr_get_cache(struct inode *inode, int add) +{ + struct nfs_inode *nfsi; + struct nfs4_xattr_cache *cache, *oldcache, *newcache; + + nfsi = NFS_I(inode); + + cache = oldcache = NULL; + + spin_lock(&inode->i_lock); + + if (nfsi->cache_validity & NFS_INO_INVALID_XATTR) + oldcache = nfs4_xattr_cache_unlink(inode); + else + cache = nfsi->xattr_cache; + + if (cache != NULL) + kref_get(&cache->ref); + + spin_unlock(&inode->i_lock); + + if (add && cache == NULL) { + newcache = NULL; + + cache = nfs4_xattr_alloc_cache(); + if (cache == NULL) + goto out; + + spin_lock(&inode->i_lock); + if (nfsi->cache_validity & NFS_INO_INVALID_XATTR) { + /* + * The cache was invalidated again. Give up, + * since what we want to enter is now likely + * outdated anyway. + */ + spin_unlock(&inode->i_lock); + kref_put(&cache->ref, nfs4_xattr_free_cache_cb); + cache = NULL; + goto out; + } + + /* + * Check if someone beat us to it. + */ + if (nfsi->xattr_cache != NULL) { + newcache = nfsi->xattr_cache; + kref_get(&newcache->ref); + } else { + kref_get(&cache->ref); + nfsi->xattr_cache = cache; + cache->inode = inode; + list_lru_add_obj(&nfs4_xattr_cache_lru, &cache->lru); + } + + spin_unlock(&inode->i_lock); + + /* + * If there was a race, throw away the cache we just + * allocated, and use the new one allocated by someone + * else. + */ + if (newcache != NULL) { + kref_put(&cache->ref, nfs4_xattr_free_cache_cb); + cache = newcache; + } + } + +out: + /* + * Discard the now orphaned old cache. + */ + if (oldcache != NULL) + nfs4_xattr_discard_cache(oldcache); + + return cache; +} + +static inline struct nfs4_xattr_bucket * +nfs4_xattr_hash_bucket(struct nfs4_xattr_cache *cache, const char *name) +{ + return &cache->buckets[jhash(name, strlen(name), 0) & + (ARRAY_SIZE(cache->buckets) - 1)]; +} + +static struct nfs4_xattr_entry * +nfs4_xattr_get_entry(struct nfs4_xattr_bucket *bucket, const char *name) +{ + struct nfs4_xattr_entry *entry; + + entry = NULL; + + hlist_for_each_entry(entry, &bucket->hlist, hnode) { + if (!strcmp(entry->xattr_name, name)) + break; + } + + return entry; +} + +static int +nfs4_xattr_hash_add(struct nfs4_xattr_cache *cache, + struct nfs4_xattr_entry *entry) +{ + struct nfs4_xattr_bucket *bucket; + struct nfs4_xattr_entry *oldentry = NULL; + int ret = 1; + + bucket = nfs4_xattr_hash_bucket(cache, entry->xattr_name); + entry->bucket = bucket; + + spin_lock(&bucket->lock); + + if (bucket->draining) { + ret = 0; + goto out; + } + + oldentry = nfs4_xattr_get_entry(bucket, entry->xattr_name); + if (oldentry != NULL) { + hlist_del_init(&oldentry->hnode); + nfs4_xattr_entry_lru_del(oldentry); + } else { + atomic_long_inc(&cache->nent); + } + + hlist_add_head(&entry->hnode, &bucket->hlist); + nfs4_xattr_entry_lru_add(entry); + +out: + spin_unlock(&bucket->lock); + + if (oldentry != NULL) + kref_put(&oldentry->ref, nfs4_xattr_free_entry_cb); + + return ret; +} + +static void +nfs4_xattr_hash_remove(struct nfs4_xattr_cache *cache, const char *name) +{ + struct nfs4_xattr_bucket *bucket; + struct nfs4_xattr_entry *entry; + + bucket = nfs4_xattr_hash_bucket(cache, name); + + spin_lock(&bucket->lock); + + entry = nfs4_xattr_get_entry(bucket, name); + if (entry != NULL) { + hlist_del_init(&entry->hnode); + nfs4_xattr_entry_lru_del(entry); + atomic_long_dec(&cache->nent); + } + + spin_unlock(&bucket->lock); + + if (entry != NULL) + kref_put(&entry->ref, nfs4_xattr_free_entry_cb); +} + +static struct nfs4_xattr_entry * +nfs4_xattr_hash_find(struct nfs4_xattr_cache *cache, const char *name) +{ + struct nfs4_xattr_bucket *bucket; + struct nfs4_xattr_entry *entry; + + bucket = nfs4_xattr_hash_bucket(cache, name); + + spin_lock(&bucket->lock); + + entry = nfs4_xattr_get_entry(bucket, name); + if (entry != NULL) + kref_get(&entry->ref); + + spin_unlock(&bucket->lock); + + return entry; +} + +/* + * Entry point to retrieve an entry from the cache. + */ +ssize_t nfs4_xattr_cache_get(struct inode *inode, const char *name, char *buf, + ssize_t buflen) +{ + struct nfs4_xattr_cache *cache; + struct nfs4_xattr_entry *entry; + ssize_t ret; + + cache = nfs4_xattr_get_cache(inode, 0); + if (cache == NULL) + return -ENOENT; + + ret = 0; + entry = nfs4_xattr_hash_find(cache, name); + + if (entry != NULL) { + dprintk("%s: cache hit '%s', len %lu\n", __func__, + entry->xattr_name, (unsigned long)entry->xattr_size); + if (buflen == 0) { + /* Length probe only */ + ret = entry->xattr_size; + } else if (buflen < entry->xattr_size) + ret = -ERANGE; + else { + memcpy(buf, entry->xattr_value, entry->xattr_size); + ret = entry->xattr_size; + } + kref_put(&entry->ref, nfs4_xattr_free_entry_cb); + } else { + dprintk("%s: cache miss '%s'\n", __func__, name); + ret = -ENOENT; + } + + kref_put(&cache->ref, nfs4_xattr_free_cache_cb); + + return ret; +} + +/* + * Retrieve a cached list of xattrs from the cache. + */ +ssize_t nfs4_xattr_cache_list(struct inode *inode, char *buf, ssize_t buflen) +{ + struct nfs4_xattr_cache *cache; + struct nfs4_xattr_entry *entry; + ssize_t ret; + + cache = nfs4_xattr_get_cache(inode, 0); + if (cache == NULL) + return -ENOENT; + + spin_lock(&cache->listxattr_lock); + + entry = cache->listxattr; + + if (entry != NULL && entry != ERR_PTR(-ESTALE)) { + if (buflen == 0) { + /* Length probe only */ + ret = entry->xattr_size; + } else if (entry->xattr_size > buflen) + ret = -ERANGE; + else { + memcpy(buf, entry->xattr_value, entry->xattr_size); + ret = entry->xattr_size; + } + } else { + ret = -ENOENT; + } + + spin_unlock(&cache->listxattr_lock); + + kref_put(&cache->ref, nfs4_xattr_free_cache_cb); + + return ret; +} + +/* + * Add an xattr to the cache. + * + * This also invalidates the xattr list cache. + */ +void nfs4_xattr_cache_add(struct inode *inode, const char *name, + const char *buf, struct page **pages, ssize_t buflen) +{ + struct nfs4_xattr_cache *cache; + struct nfs4_xattr_entry *entry; + + dprintk("%s: add '%s' len %lu\n", __func__, + name, (unsigned long)buflen); + + cache = nfs4_xattr_get_cache(inode, 1); + if (cache == NULL) + return; + + entry = nfs4_xattr_alloc_entry(name, buf, pages, buflen); + if (entry == NULL) + goto out; + + (void)nfs4_xattr_set_listcache(cache, NULL); + + if (!nfs4_xattr_hash_add(cache, entry)) + kref_put(&entry->ref, nfs4_xattr_free_entry_cb); + +out: + kref_put(&cache->ref, nfs4_xattr_free_cache_cb); +} + + +/* + * Remove an xattr from the cache. + * + * This also invalidates the xattr list cache. + */ +void nfs4_xattr_cache_remove(struct inode *inode, const char *name) +{ + struct nfs4_xattr_cache *cache; + + dprintk("%s: remove '%s'\n", __func__, name); + + cache = nfs4_xattr_get_cache(inode, 0); + if (cache == NULL) + return; + + (void)nfs4_xattr_set_listcache(cache, NULL); + nfs4_xattr_hash_remove(cache, name); + + kref_put(&cache->ref, nfs4_xattr_free_cache_cb); +} + +/* + * Cache listxattr output, replacing any possible old one. + */ +void nfs4_xattr_cache_set_list(struct inode *inode, const char *buf, + ssize_t buflen) +{ + struct nfs4_xattr_cache *cache; + struct nfs4_xattr_entry *entry; + + cache = nfs4_xattr_get_cache(inode, 1); + if (cache == NULL) + return; + + entry = nfs4_xattr_alloc_entry(NULL, buf, NULL, buflen); + if (entry == NULL) + goto out; + + /* + * This is just there to be able to get to bucket->cache, + * which is obviously the same for all buckets, so just + * use bucket 0. + */ + entry->bucket = &cache->buckets[0]; + + if (!nfs4_xattr_set_listcache(cache, entry)) + kref_put(&entry->ref, nfs4_xattr_free_entry_cb); + +out: + kref_put(&cache->ref, nfs4_xattr_free_cache_cb); +} + +/* + * Zap the entire cache. Called when an inode is evicted. + */ +void nfs4_xattr_cache_zap(struct inode *inode) +{ + struct nfs4_xattr_cache *oldcache; + + spin_lock(&inode->i_lock); + oldcache = nfs4_xattr_cache_unlink(inode); + spin_unlock(&inode->i_lock); + + if (oldcache) + nfs4_xattr_discard_cache(oldcache); +} + +/* + * The entry LRU is shrunk more aggressively than the cache LRU, + * by settings @seeks to 1. + * + * Cache structures are freed only when they've become empty, after + * pruning all but one entry. + */ + +static unsigned long nfs4_xattr_cache_count(struct shrinker *shrink, + struct shrink_control *sc); +static unsigned long nfs4_xattr_entry_count(struct shrinker *shrink, + struct shrink_control *sc); +static unsigned long nfs4_xattr_cache_scan(struct shrinker *shrink, + struct shrink_control *sc); +static unsigned long nfs4_xattr_entry_scan(struct shrinker *shrink, + struct shrink_control *sc); + +static struct shrinker *nfs4_xattr_cache_shrinker; +static struct shrinker *nfs4_xattr_entry_shrinker; +static struct shrinker *nfs4_xattr_large_entry_shrinker; + +static enum lru_status +cache_lru_isolate(struct list_head *item, + struct list_lru_one *lru, void *arg) +{ + struct list_head *dispose = arg; + struct inode *inode; + struct nfs4_xattr_cache *cache = container_of(item, + struct nfs4_xattr_cache, lru); + + if (atomic_long_read(&cache->nent) > 1) + return LRU_SKIP; + + /* + * If a cache structure is on the LRU list, we know that + * its inode is valid. Try to lock it to break the link. + * Since we're inverting the lock order here, only try. + */ + inode = cache->inode; + + if (!spin_trylock(&inode->i_lock)) + return LRU_SKIP; + + kref_get(&cache->ref); + + cache->inode = NULL; + NFS_I(inode)->xattr_cache = NULL; + NFS_I(inode)->cache_validity &= ~NFS_INO_INVALID_XATTR; + list_lru_isolate(lru, &cache->lru); + + spin_unlock(&inode->i_lock); + + list_add_tail(&cache->dispose, dispose); + return LRU_REMOVED; +} + +static unsigned long +nfs4_xattr_cache_scan(struct shrinker *shrink, struct shrink_control *sc) +{ + LIST_HEAD(dispose); + unsigned long freed; + struct nfs4_xattr_cache *cache; + + freed = list_lru_shrink_walk(&nfs4_xattr_cache_lru, sc, + cache_lru_isolate, &dispose); + while (!list_empty(&dispose)) { + cache = list_first_entry(&dispose, struct nfs4_xattr_cache, + dispose); + list_del_init(&cache->dispose); + nfs4_xattr_discard_cache(cache); + kref_put(&cache->ref, nfs4_xattr_free_cache_cb); + } + + return freed; +} + + +static unsigned long +nfs4_xattr_cache_count(struct shrinker *shrink, struct shrink_control *sc) +{ + unsigned long count; + + count = list_lru_shrink_count(&nfs4_xattr_cache_lru, sc); + return vfs_pressure_ratio(count); +} + +static enum lru_status +entry_lru_isolate(struct list_head *item, + struct list_lru_one *lru, void *arg) +{ + struct list_head *dispose = arg; + struct nfs4_xattr_bucket *bucket; + struct nfs4_xattr_cache *cache; + struct nfs4_xattr_entry *entry = container_of(item, + struct nfs4_xattr_entry, lru); + + bucket = entry->bucket; + cache = bucket->cache; + + /* + * Unhook the entry from its parent (either a cache bucket + * or a cache structure if it's a listxattr buf), so that + * it's no longer found. Then add it to the isolate list, + * to be freed later. + * + * In both cases, we're reverting lock order, so use + * trylock and skip the entry if we can't get the lock. + */ + if (entry->xattr_name != NULL) { + /* Regular cache entry */ + if (!spin_trylock(&bucket->lock)) + return LRU_SKIP; + + kref_get(&entry->ref); + + hlist_del_init(&entry->hnode); + atomic_long_dec(&cache->nent); + list_lru_isolate(lru, &entry->lru); + + spin_unlock(&bucket->lock); + } else { + /* Listxattr cache entry */ + if (!spin_trylock(&cache->listxattr_lock)) + return LRU_SKIP; + + kref_get(&entry->ref); + + cache->listxattr = NULL; + list_lru_isolate(lru, &entry->lru); + + spin_unlock(&cache->listxattr_lock); + } + + list_add_tail(&entry->dispose, dispose); + return LRU_REMOVED; +} + +static unsigned long +nfs4_xattr_entry_scan(struct shrinker *shrink, struct shrink_control *sc) +{ + LIST_HEAD(dispose); + unsigned long freed; + struct nfs4_xattr_entry *entry; + struct list_lru *lru; + + lru = (shrink == nfs4_xattr_large_entry_shrinker) ? + &nfs4_xattr_large_entry_lru : &nfs4_xattr_entry_lru; + + freed = list_lru_shrink_walk(lru, sc, entry_lru_isolate, &dispose); + + while (!list_empty(&dispose)) { + entry = list_first_entry(&dispose, struct nfs4_xattr_entry, + dispose); + list_del_init(&entry->dispose); + + /* + * Drop two references: the one that we just grabbed + * in entry_lru_isolate, and the one that was set + * when the entry was first allocated. + */ + kref_put(&entry->ref, nfs4_xattr_free_entry_cb); + kref_put(&entry->ref, nfs4_xattr_free_entry_cb); + } + + return freed; +} + +static unsigned long +nfs4_xattr_entry_count(struct shrinker *shrink, struct shrink_control *sc) +{ + unsigned long count; + struct list_lru *lru; + + lru = (shrink == nfs4_xattr_large_entry_shrinker) ? + &nfs4_xattr_large_entry_lru : &nfs4_xattr_entry_lru; + + count = list_lru_shrink_count(lru, sc); + return vfs_pressure_ratio(count); +} + + +static void nfs4_xattr_cache_init_once(void *p) +{ + struct nfs4_xattr_cache *cache = p; + + spin_lock_init(&cache->listxattr_lock); + atomic_long_set(&cache->nent, 0); + nfs4_xattr_hash_init(cache); + cache->listxattr = NULL; + INIT_LIST_HEAD(&cache->lru); + INIT_LIST_HEAD(&cache->dispose); +} + +typedef unsigned long (*count_objects_cb)(struct shrinker *s, + struct shrink_control *sc); +typedef unsigned long (*scan_objects_cb)(struct shrinker *s, + struct shrink_control *sc); + +static int __init nfs4_xattr_shrinker_init(struct shrinker **shrinker, + struct list_lru *lru, const char *name, + count_objects_cb count, + scan_objects_cb scan, long batch, int seeks) +{ + int ret; + + *shrinker = shrinker_alloc(SHRINKER_MEMCG_AWARE, name); + if (!*shrinker) + return -ENOMEM; + + ret = list_lru_init_memcg(lru, *shrinker); + if (ret) { + shrinker_free(*shrinker); + return ret; + } + + (*shrinker)->count_objects = count; + (*shrinker)->scan_objects = scan; + (*shrinker)->batch = batch; + (*shrinker)->seeks = seeks; + + shrinker_register(*shrinker); + + return ret; +} + +static void nfs4_xattr_shrinker_destroy(struct shrinker *shrinker, + struct list_lru *lru) +{ + shrinker_free(shrinker); + list_lru_destroy(lru); +} + +int __init nfs4_xattr_cache_init(void) +{ + int ret = 0; + + nfs4_xattr_cache_cachep = kmem_cache_create("nfs4_xattr_cache_cache", + sizeof(struct nfs4_xattr_cache), 0, + (SLAB_RECLAIM_ACCOUNT), + nfs4_xattr_cache_init_once); + if (nfs4_xattr_cache_cachep == NULL) + return -ENOMEM; + + ret = nfs4_xattr_shrinker_init(&nfs4_xattr_cache_shrinker, + &nfs4_xattr_cache_lru, "nfs-xattr_cache", + nfs4_xattr_cache_count, + nfs4_xattr_cache_scan, 0, DEFAULT_SEEKS); + if (ret) + goto out1; + + ret = nfs4_xattr_shrinker_init(&nfs4_xattr_entry_shrinker, + &nfs4_xattr_entry_lru, "nfs-xattr_entry", + nfs4_xattr_entry_count, + nfs4_xattr_entry_scan, 512, DEFAULT_SEEKS); + if (ret) + goto out2; + + ret = nfs4_xattr_shrinker_init(&nfs4_xattr_large_entry_shrinker, + &nfs4_xattr_large_entry_lru, + "nfs-xattr_large_entry", + nfs4_xattr_entry_count, + nfs4_xattr_entry_scan, 512, 1); + if (!ret) + return 0; + + nfs4_xattr_shrinker_destroy(nfs4_xattr_entry_shrinker, + &nfs4_xattr_entry_lru); +out2: + nfs4_xattr_shrinker_destroy(nfs4_xattr_cache_shrinker, + &nfs4_xattr_cache_lru); +out1: + kmem_cache_destroy(nfs4_xattr_cache_cachep); + + return ret; +} + +void nfs4_xattr_cache_exit(void) +{ + nfs4_xattr_shrinker_destroy(nfs4_xattr_large_entry_shrinker, + &nfs4_xattr_large_entry_lru); + nfs4_xattr_shrinker_destroy(nfs4_xattr_entry_shrinker, + &nfs4_xattr_entry_lru); + nfs4_xattr_shrinker_destroy(nfs4_xattr_cache_shrinker, + &nfs4_xattr_cache_lru); + kmem_cache_destroy(nfs4_xattr_cache_cachep); +} diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c index 5ee1b0f0d904..e10d83ba835e 100644 --- a/fs/nfs/nfs42xdr.c +++ b/fs/nfs/nfs42xdr.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2014 Anna Schumaker <Anna.Schumaker@Netapp.com> */ @@ -6,6 +7,9 @@ #include "nfs42.h" +/* Not limited by NFS itself, limited by the generic xattr code */ +#define nfs4_xattr_name_maxsz XDR_QUADLEN(XATTR_NAME_MAX) + #define encode_fallocate_maxsz (encode_stateid_maxsz + \ 2 /* offset */ + \ 2 /* length */) @@ -20,14 +24,51 @@ #define encode_copy_maxsz (op_encode_hdr_maxsz + \ XDR_QUADLEN(NFS4_STATEID_SIZE) + \ XDR_QUADLEN(NFS4_STATEID_SIZE) + \ - 2 + 2 + 2 + 1 + 1 + 1) + 2 + 2 + 2 + 1 + 1 + 1 +\ + 1 + /* One cnr_source_server */\ + 1 + /* nl4_type */ \ + 1 + XDR_QUADLEN(NFS4_OPAQUE_LIMIT)) #define decode_copy_maxsz (op_decode_hdr_maxsz + \ NFS42_WRITE_RES_SIZE + \ 1 /* cr_consecutive */ + \ 1 /* cr_synchronous */) +#define encode_offload_cancel_maxsz (op_encode_hdr_maxsz + \ + XDR_QUADLEN(NFS4_STATEID_SIZE)) +#define decode_offload_cancel_maxsz (op_decode_hdr_maxsz) +#define encode_offload_status_maxsz (op_encode_hdr_maxsz + \ + XDR_QUADLEN(NFS4_STATEID_SIZE)) +#define decode_offload_status_maxsz (op_decode_hdr_maxsz + \ + 2 /* osr_count */ + \ + 2 /* osr_complete */) +#define encode_copy_notify_maxsz (op_encode_hdr_maxsz + \ + XDR_QUADLEN(NFS4_STATEID_SIZE) + \ + 1 + /* nl4_type */ \ + 1 + XDR_QUADLEN(NFS4_OPAQUE_LIMIT)) +#define decode_copy_notify_maxsz (op_decode_hdr_maxsz + \ + 3 + /* cnr_lease_time */\ + XDR_QUADLEN(NFS4_STATEID_SIZE) + \ + 1 + /* Support 1 cnr_source_server */\ + 1 + /* nl4_type */ \ + 1 + XDR_QUADLEN(NFS4_OPAQUE_LIMIT)) #define encode_deallocate_maxsz (op_encode_hdr_maxsz + \ encode_fallocate_maxsz) #define decode_deallocate_maxsz (op_decode_hdr_maxsz) +#define encode_read_plus_maxsz (op_encode_hdr_maxsz + \ + encode_stateid_maxsz + 3) +#define NFS42_READ_PLUS_DATA_SEGMENT_SIZE \ + (1 /* data_content4 */ + \ + 2 /* data_info4.di_offset */ + \ + 1 /* data_info4.di_length */) +#define NFS42_READ_PLUS_HOLE_SEGMENT_SIZE \ + (1 /* data_content4 */ + \ + 2 /* data_info4.di_offset */ + \ + 2 /* data_info4.di_length */) +#define READ_PLUS_SEGMENT_SIZE_DIFF (NFS42_READ_PLUS_HOLE_SEGMENT_SIZE - \ + NFS42_READ_PLUS_DATA_SEGMENT_SIZE) +#define decode_read_plus_maxsz (op_decode_hdr_maxsz + \ + 1 /* rpr_eof */ + \ + 1 /* rpr_contents count */ + \ + NFS42_READ_PLUS_HOLE_SEGMENT_SIZE) #define encode_seek_maxsz (op_encode_hdr_maxsz + \ encode_stateid_maxsz + \ 2 /* offset */ + \ @@ -47,45 +88,118 @@ 1 /* opaque devaddr4 length */ + \ XDR_QUADLEN(PNFS_LAYOUTSTATS_MAXSIZE)) #define decode_layoutstats_maxsz (op_decode_hdr_maxsz) +#define encode_device_error_maxsz (XDR_QUADLEN(NFS4_DEVICEID4_SIZE) + \ + 1 /* status */ + 1 /* opnum */) +#define encode_layouterror_maxsz (op_decode_hdr_maxsz + \ + 2 /* offset */ + \ + 2 /* length */ + \ + encode_stateid_maxsz + \ + 1 /* Array size */ + \ + encode_device_error_maxsz) +#define decode_layouterror_maxsz (op_decode_hdr_maxsz) #define encode_clone_maxsz (encode_stateid_maxsz + \ encode_stateid_maxsz + \ 2 /* src offset */ + \ 2 /* dst offset */ + \ 2 /* count */) #define decode_clone_maxsz (op_decode_hdr_maxsz) +#define encode_getxattr_maxsz (op_encode_hdr_maxsz + 1 + \ + nfs4_xattr_name_maxsz) +#define decode_getxattr_maxsz (op_decode_hdr_maxsz + 1 + pagepad_maxsz) +#define encode_setxattr_maxsz (op_encode_hdr_maxsz + \ + 1 + nfs4_xattr_name_maxsz + 1) +#define decode_setxattr_maxsz (op_decode_hdr_maxsz + decode_change_info_maxsz) +#define encode_listxattrs_maxsz (op_encode_hdr_maxsz + 2 + 1) +#define decode_listxattrs_maxsz (op_decode_hdr_maxsz + 2 + 1 + 1 + 1) +#define encode_removexattr_maxsz (op_encode_hdr_maxsz + 1 + \ + nfs4_xattr_name_maxsz) +#define decode_removexattr_maxsz (op_decode_hdr_maxsz + \ + decode_change_info_maxsz) #define NFS4_enc_allocate_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ encode_putfh_maxsz + \ encode_allocate_maxsz + \ encode_getattr_maxsz) #define NFS4_dec_allocate_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ decode_putfh_maxsz + \ decode_allocate_maxsz + \ decode_getattr_maxsz) #define NFS4_enc_copy_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ encode_putfh_maxsz + \ encode_savefh_maxsz + \ encode_putfh_maxsz + \ encode_copy_maxsz + \ encode_commit_maxsz) #define NFS4_dec_copy_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ decode_putfh_maxsz + \ decode_savefh_maxsz + \ decode_putfh_maxsz + \ decode_copy_maxsz + \ decode_commit_maxsz) +#define NFS4_enc_offload_cancel_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + encode_offload_cancel_maxsz) +#define NFS4_dec_offload_cancel_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_offload_cancel_maxsz) +#define NFS4_enc_offload_status_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + encode_offload_status_maxsz) +#define NFS4_dec_offload_status_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_offload_status_maxsz) +#define NFS4_enc_copy_notify_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + encode_copy_notify_maxsz) +#define NFS4_dec_copy_notify_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_copy_notify_maxsz) #define NFS4_enc_deallocate_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ encode_putfh_maxsz + \ encode_deallocate_maxsz + \ encode_getattr_maxsz) #define NFS4_dec_deallocate_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_deallocate_maxsz + \ + decode_getattr_maxsz) +#define NFS4_enc_zero_range_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + encode_deallocate_maxsz + \ + encode_allocate_maxsz + \ + encode_getattr_maxsz) +#define NFS4_dec_zero_range_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ decode_putfh_maxsz + \ decode_deallocate_maxsz + \ + decode_allocate_maxsz + \ decode_getattr_maxsz) +#define NFS4_enc_read_plus_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + encode_read_plus_maxsz) +#define NFS4_dec_read_plus_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_read_plus_maxsz) #define NFS4_enc_seek_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ encode_putfh_maxsz + \ encode_seek_maxsz) #define NFS4_dec_seek_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ decode_putfh_maxsz + \ decode_seek_maxsz) #define NFS4_enc_layoutstats_sz (compound_encode_hdr_maxsz + \ @@ -96,6 +210,16 @@ decode_sequence_maxsz + \ decode_putfh_maxsz + \ PNFS_LAYOUTSTATS_MAXDEV * decode_layoutstats_maxsz) +#define NFS4_enc_layouterror_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + NFS42_LAYOUTERROR_MAX * \ + encode_layouterror_maxsz) +#define NFS4_dec_layouterror_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + NFS42_LAYOUTERROR_MAX * \ + decode_layouterror_maxsz) #define NFS4_enc_clone_sz (compound_encode_hdr_maxsz + \ encode_sequence_maxsz + \ encode_putfh_maxsz + \ @@ -110,6 +234,63 @@ decode_putfh_maxsz + \ decode_clone_maxsz + \ decode_getattr_maxsz) +#define NFS4_enc_getxattr_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + encode_getxattr_maxsz) +#define NFS4_dec_getxattr_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_getxattr_maxsz) +#define NFS4_enc_setxattr_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + encode_setxattr_maxsz + \ + encode_getattr_maxsz) +#define NFS4_dec_setxattr_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_setxattr_maxsz + \ + decode_getattr_maxsz) +#define NFS4_enc_listxattrs_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + encode_listxattrs_maxsz) +#define NFS4_dec_listxattrs_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_listxattrs_maxsz) +#define NFS4_enc_removexattr_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + encode_removexattr_maxsz) +#define NFS4_dec_removexattr_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_removexattr_maxsz) + +/* + * These values specify the maximum amount of data that is not + * associated with the extended attribute name or extended + * attribute list in the SETXATTR, GETXATTR and LISTXATTR + * respectively. + */ +const u32 nfs42_maxsetxattr_overhead = ((RPC_MAX_HEADER_WITH_AUTH + + compound_encode_hdr_maxsz + + encode_sequence_maxsz + + encode_putfh_maxsz + 1 + + nfs4_xattr_name_maxsz) + * XDR_UNIT); + +const u32 nfs42_maxgetxattr_overhead = ((RPC_MAX_HEADER_WITH_AUTH + + compound_decode_hdr_maxsz + + decode_sequence_maxsz + + decode_putfh_maxsz + 1) * XDR_UNIT); + +const u32 nfs42_maxlistxattrs_overhead = ((RPC_MAX_HEADER_WITH_AUTH + + compound_decode_hdr_maxsz + + decode_sequence_maxsz + + decode_putfh_maxsz + 3) * XDR_UNIT); static void encode_fallocate(struct xdr_stream *xdr, const struct nfs42_falloc_args *args) @@ -127,6 +308,26 @@ static void encode_allocate(struct xdr_stream *xdr, encode_fallocate(xdr, args); } +static void encode_nl4_server(struct xdr_stream *xdr, + const struct nl4_server *ns) +{ + encode_uint32(xdr, ns->nl4_type); + switch (ns->nl4_type) { + case NL4_NAME: + case NL4_URL: + encode_string(xdr, ns->u.nl4_str_sz, ns->u.nl4_str); + break; + case NL4_NETADDR: + encode_string(xdr, ns->u.nl4_addr.netid_len, + ns->u.nl4_addr.netid); + encode_string(xdr, ns->u.nl4_addr.addr_len, + ns->u.nl4_addr.addr); + break; + default: + WARN_ON_ONCE(1); + } +} + static void encode_copy(struct xdr_stream *xdr, const struct nfs42_copy_args *args, struct compound_hdr *hdr) @@ -140,8 +341,50 @@ static void encode_copy(struct xdr_stream *xdr, encode_uint64(xdr, args->count); encode_uint32(xdr, 1); /* consecutive = true */ - encode_uint32(xdr, 1); /* synchronous = true */ - encode_uint32(xdr, 0); /* src server list */ + encode_uint32(xdr, args->sync); + if (args->cp_src == NULL) { /* intra-ssc */ + encode_uint32(xdr, 0); /* no src server list */ + return; + } + encode_uint32(xdr, 1); /* supporting 1 server */ + encode_nl4_server(xdr, args->cp_src); +} + +static void encode_copy_commit(struct xdr_stream *xdr, + const struct nfs42_copy_args *args, + struct compound_hdr *hdr) +{ + __be32 *p; + + encode_op_hdr(xdr, OP_COMMIT, decode_commit_maxsz, hdr); + p = reserve_space(xdr, 12); + p = xdr_encode_hyper(p, args->dst_pos); + *p = cpu_to_be32(args->count); +} + +static void encode_offload_cancel(struct xdr_stream *xdr, + const struct nfs42_offload_status_args *args, + struct compound_hdr *hdr) +{ + encode_op_hdr(xdr, OP_OFFLOAD_CANCEL, decode_offload_cancel_maxsz, hdr); + encode_nfs4_stateid(xdr, &args->osa_stateid); +} + +static void encode_offload_status(struct xdr_stream *xdr, + const struct nfs42_offload_status_args *args, + struct compound_hdr *hdr) +{ + encode_op_hdr(xdr, OP_OFFLOAD_STATUS, decode_offload_status_maxsz, hdr); + encode_nfs4_stateid(xdr, &args->osa_stateid); +} + +static void encode_copy_notify(struct xdr_stream *xdr, + const struct nfs42_copy_notify_args *args, + struct compound_hdr *hdr) +{ + encode_op_hdr(xdr, OP_COPY_NOTIFY, decode_copy_notify_maxsz, hdr); + encode_nfs4_stateid(xdr, &args->cna_src_stateid); + encode_nl4_server(xdr, &args->cna_dst); } static void encode_deallocate(struct xdr_stream *xdr, @@ -152,6 +395,16 @@ static void encode_deallocate(struct xdr_stream *xdr, encode_fallocate(xdr, args); } +static void encode_read_plus(struct xdr_stream *xdr, + const struct nfs_pgio_args *args, + struct compound_hdr *hdr) +{ + encode_op_hdr(xdr, OP_READ_PLUS, decode_read_plus_maxsz, hdr); + encode_nfs4_stateid(xdr, &args->stateid); + encode_uint64(xdr, args->offset); + encode_uint32(xdr, args->count); +} + static void encode_seek(struct xdr_stream *xdr, const struct nfs42_seek_args *args, struct compound_hdr *hdr) @@ -205,6 +458,88 @@ static void encode_clone(struct xdr_stream *xdr, xdr_encode_hyper(p, args->count); } +static void encode_device_error(struct xdr_stream *xdr, + const struct nfs42_device_error *error) +{ + __be32 *p; + + p = reserve_space(xdr, NFS4_DEVICEID4_SIZE + 2*4); + p = xdr_encode_opaque_fixed(p, error->dev_id.data, + NFS4_DEVICEID4_SIZE); + *p++ = cpu_to_be32(error->status); + *p = cpu_to_be32(error->opnum); +} + +static void encode_layouterror(struct xdr_stream *xdr, + const struct nfs42_layout_error *args, + struct compound_hdr *hdr) +{ + __be32 *p; + + encode_op_hdr(xdr, OP_LAYOUTERROR, decode_layouterror_maxsz, hdr); + p = reserve_space(xdr, 8 + 8); + p = xdr_encode_hyper(p, args->offset); + p = xdr_encode_hyper(p, args->length); + encode_nfs4_stateid(xdr, &args->stateid); + p = reserve_space(xdr, 4); + *p = cpu_to_be32(1); + encode_device_error(xdr, &args->errors[0]); +} + +static void encode_setxattr(struct xdr_stream *xdr, + const struct nfs42_setxattrargs *arg, + struct compound_hdr *hdr) +{ + __be32 *p; + + BUILD_BUG_ON(XATTR_CREATE != SETXATTR4_CREATE); + BUILD_BUG_ON(XATTR_REPLACE != SETXATTR4_REPLACE); + + encode_op_hdr(xdr, OP_SETXATTR, decode_setxattr_maxsz, hdr); + p = reserve_space(xdr, 4); + *p = cpu_to_be32(arg->xattr_flags); + encode_string(xdr, strlen(arg->xattr_name), arg->xattr_name); + p = reserve_space(xdr, 4); + *p = cpu_to_be32(arg->xattr_len); + if (arg->xattr_len) + xdr_write_pages(xdr, arg->xattr_pages, 0, arg->xattr_len); +} + +static void encode_getxattr(struct xdr_stream *xdr, const char *name, + struct compound_hdr *hdr) +{ + encode_op_hdr(xdr, OP_GETXATTR, decode_getxattr_maxsz, hdr); + encode_string(xdr, strlen(name), name); +} + +static void encode_removexattr(struct xdr_stream *xdr, const char *name, + struct compound_hdr *hdr) +{ + encode_op_hdr(xdr, OP_REMOVEXATTR, decode_removexattr_maxsz, hdr); + encode_string(xdr, strlen(name), name); +} + +static void encode_listxattrs(struct xdr_stream *xdr, + const struct nfs42_listxattrsargs *arg, + struct compound_hdr *hdr) +{ + __be32 *p; + + encode_op_hdr(xdr, OP_LISTXATTRS, decode_listxattrs_maxsz, hdr); + + p = reserve_space(xdr, 12); + if (unlikely(!p)) + return; + + p = xdr_encode_hyper(p, arg->cookie); + /* + * RFC 8276 says to specify the full max length of the LISTXATTRS + * XDR reply. Count is set to the XDR length of the names array + * plus the EOF marker. So, add the cookie and the names count. + */ + *p = cpu_to_be32(arg->count + 8 + 4); +} + /* * Encode ALLOCATE request */ @@ -225,18 +560,6 @@ static void nfs4_xdr_enc_allocate(struct rpc_rqst *req, encode_nops(&hdr); } -static void encode_copy_commit(struct xdr_stream *xdr, - const struct nfs42_copy_args *args, - struct compound_hdr *hdr) -{ - __be32 *p; - - encode_op_hdr(xdr, OP_COMMIT, decode_commit_maxsz, hdr); - p = reserve_space(xdr, 12); - p = xdr_encode_hyper(p, args->dst_pos); - *p = cpu_to_be32(args->count); -} - /* * Encode COPY request */ @@ -255,7 +578,65 @@ static void nfs4_xdr_enc_copy(struct rpc_rqst *req, encode_savefh(xdr, &hdr); encode_putfh(xdr, args->dst_fh, &hdr); encode_copy(xdr, args, &hdr); - encode_copy_commit(xdr, args, &hdr); + if (args->sync) + encode_copy_commit(xdr, args, &hdr); + encode_nops(&hdr); +} + +/* + * Encode OFFLOAD_CANCEL request + */ +static void nfs4_xdr_enc_offload_cancel(struct rpc_rqst *req, + struct xdr_stream *xdr, + const void *data) +{ + const struct nfs42_offload_status_args *args = data; + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->osa_seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->osa_seq_args, &hdr); + encode_putfh(xdr, args->osa_src_fh, &hdr); + encode_offload_cancel(xdr, args, &hdr); + encode_nops(&hdr); +} + +/* + * Encode OFFLOAD_STATUS request + */ +static void nfs4_xdr_enc_offload_status(struct rpc_rqst *req, + struct xdr_stream *xdr, + const void *data) +{ + const struct nfs42_offload_status_args *args = data; + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->osa_seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->osa_seq_args, &hdr); + encode_putfh(xdr, args->osa_src_fh, &hdr); + encode_offload_status(xdr, args, &hdr); + encode_nops(&hdr); +} + +/* + * Encode COPY_NOTIFY request + */ +static void nfs4_xdr_enc_copy_notify(struct rpc_rqst *req, + struct xdr_stream *xdr, + const void *data) +{ + const struct nfs42_copy_notify_args *args = data; + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->cna_seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->cna_seq_args, &hdr); + encode_putfh(xdr, args->cna_src_fh, &hdr); + encode_copy_notify(xdr, args, &hdr); encode_nops(&hdr); } @@ -280,6 +661,49 @@ static void nfs4_xdr_enc_deallocate(struct rpc_rqst *req, } /* + * Encode ZERO_RANGE request + */ +static void nfs4_xdr_enc_zero_range(struct rpc_rqst *req, + struct xdr_stream *xdr, + const void *data) +{ + const struct nfs42_falloc_args *args = data; + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->falloc_fh, &hdr); + encode_deallocate(xdr, args, &hdr); + encode_allocate(xdr, args, &hdr); + encode_getfattr(xdr, args->falloc_bitmask, &hdr); + encode_nops(&hdr); +} + +/* + * Encode READ_PLUS request + */ +static void nfs4_xdr_enc_read_plus(struct rpc_rqst *req, + struct xdr_stream *xdr, + const void *data) +{ + const struct nfs_pgio_args *args = data; + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_read_plus(xdr, args, &hdr); + + rpc_prepare_reply_pages(req, args->pages, args->pgbase, args->count, + hdr.replen - READ_PLUS_SEGMENT_SIZE_DIFF); + encode_nops(&hdr); +} + +/* * Encode SEEK request */ static void nfs4_xdr_enc_seek(struct rpc_rqst *req, @@ -343,6 +767,111 @@ static void nfs4_xdr_enc_clone(struct rpc_rqst *req, encode_nops(&hdr); } +/* + * Encode LAYOUTERROR request + */ +static void nfs4_xdr_enc_layouterror(struct rpc_rqst *req, + struct xdr_stream *xdr, + const void *data) +{ + const struct nfs42_layouterror_args *args = data; + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + int i; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, NFS_FH(args->inode), &hdr); + for (i = 0; i < args->num_errors; i++) + encode_layouterror(xdr, &args->errors[i], &hdr); + encode_nops(&hdr); +} + +/* + * Encode SETXATTR request + */ +static void nfs4_xdr_enc_setxattr(struct rpc_rqst *req, struct xdr_stream *xdr, + const void *data) +{ + const struct nfs42_setxattrargs *args = data; + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_setxattr(xdr, args, &hdr); + encode_getfattr(xdr, args->bitmask, &hdr); + encode_nops(&hdr); +} + +/* + * Encode GETXATTR request + */ +static void nfs4_xdr_enc_getxattr(struct rpc_rqst *req, struct xdr_stream *xdr, + const void *data) +{ + const struct nfs42_getxattrargs *args = data; + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + uint32_t replen; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + replen = hdr.replen + op_decode_hdr_maxsz + 1; + encode_getxattr(xdr, args->xattr_name, &hdr); + + rpc_prepare_reply_pages(req, args->xattr_pages, 0, args->xattr_len, + replen); + + encode_nops(&hdr); +} + +/* + * Encode LISTXATTR request + */ +static void nfs4_xdr_enc_listxattrs(struct rpc_rqst *req, + struct xdr_stream *xdr, const void *data) +{ + const struct nfs42_listxattrsargs *args = data; + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + uint32_t replen; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + replen = hdr.replen + op_decode_hdr_maxsz + 2 + 1; + encode_listxattrs(xdr, args, &hdr); + + rpc_prepare_reply_pages(req, args->xattr_pages, 0, args->count, replen); + + encode_nops(&hdr); +} + +/* + * Encode REMOVEXATTR request + */ +static void nfs4_xdr_enc_removexattr(struct rpc_rqst *req, + struct xdr_stream *xdr, const void *data) +{ + const struct nfs42_removexattrargs *args = data; + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_removexattr(xdr, args->xattr_name, &hdr); + encode_nops(&hdr); +} + static int decode_allocate(struct xdr_stream *xdr, struct nfs42_falloc_res *res) { return decode_op_hdr(xdr, OP_ALLOCATE); @@ -352,28 +881,78 @@ static int decode_write_response(struct xdr_stream *xdr, struct nfs42_write_res *res) { __be32 *p; + int status, count; - p = xdr_inline_decode(xdr, 4 + 8 + 4); + p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; - - /* - * We never use asynchronous mode, so warn if a server returns - * a stateid. - */ - if (unlikely(*p != 0)) { - pr_err_once("%s: server has set unrequested " - "asynchronous mode\n", __func__); + return -EIO; + count = be32_to_cpup(p); + if (count > 1) return -EREMOTEIO; + else if (count == 1) { + status = decode_opaque_fixed(xdr, &res->stateid, + NFS4_STATEID_SIZE); + if (unlikely(status)) + return -EIO; } - p++; + p = xdr_inline_decode(xdr, 8 + 4); + if (unlikely(!p)) + return -EIO; p = xdr_decode_hyper(p, &res->count); res->verifier.committed = be32_to_cpup(p); return decode_verifier(xdr, &res->verifier.verifier); +} + +static int decode_nl4_server(struct xdr_stream *xdr, struct nl4_server *ns) +{ + struct nfs42_netaddr *naddr; + uint32_t dummy; + char *dummy_str; + __be32 *p; + int status; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; + /* nl_type */ + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + return -EIO; + ns->nl4_type = be32_to_cpup(p); + switch (ns->nl4_type) { + case NL4_NAME: + case NL4_URL: + status = decode_opaque_inline(xdr, &dummy, &dummy_str); + if (unlikely(status)) + return status; + if (unlikely(dummy > NFS4_OPAQUE_LIMIT)) + return -EIO; + memcpy(&ns->u.nl4_str, dummy_str, dummy); + ns->u.nl4_str_sz = dummy; + break; + case NL4_NETADDR: + naddr = &ns->u.nl4_addr; + + /* netid string */ + status = decode_opaque_inline(xdr, &dummy, &dummy_str); + if (unlikely(status)) + return status; + if (unlikely(dummy > RPCBIND_MAXNETIDLEN)) + return -EIO; + naddr->netid_len = dummy; + memcpy(naddr->netid, dummy_str, naddr->netid_len); + + /* uaddr string */ + status = decode_opaque_inline(xdr, &dummy, &dummy_str); + if (unlikely(status)) + return status; + if (unlikely(dummy > RPCBIND_MAXUADDRLEN)) + return -EIO; + naddr->addr_len = dummy; + memcpy(naddr->addr, dummy_str, naddr->addr_len); + break; + default: + WARN_ON_ONCE(1); + return -EIO; + } + return 0; } static int decode_copy_requirements(struct xdr_stream *xdr, @@ -382,14 +961,11 @@ static int decode_copy_requirements(struct xdr_stream *xdr, p = xdr_inline_decode(xdr, 4 + 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; res->consecutive = be32_to_cpup(p++); res->synchronous = be32_to_cpup(p++); return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_copy(struct xdr_stream *xdr, struct nfs42_copy_res *res) @@ -412,11 +988,197 @@ static int decode_copy(struct xdr_stream *xdr, struct nfs42_copy_res *res) return decode_copy_requirements(xdr, res); } +static int decode_offload_cancel(struct xdr_stream *xdr, + struct nfs42_offload_status_res *res) +{ + return decode_op_hdr(xdr, OP_OFFLOAD_CANCEL); +} + +static int decode_offload_status(struct xdr_stream *xdr, + struct nfs42_offload_status_res *res) +{ + ssize_t result; + int status; + + status = decode_op_hdr(xdr, OP_OFFLOAD_STATUS); + if (status) + return status; + /* osr_count */ + if (xdr_stream_decode_u64(xdr, &res->osr_count) < 0) + return -EIO; + /* osr_complete<1> */ + result = xdr_stream_decode_uint32_array(xdr, &res->osr_complete, 1); + if (result < 0) + return -EIO; + res->complete_count = result; + return 0; +} + +static int decode_copy_notify(struct xdr_stream *xdr, + struct nfs42_copy_notify_res *res) +{ + __be32 *p; + int status, count; + + status = decode_op_hdr(xdr, OP_COPY_NOTIFY); + if (status) + return status; + /* cnr_lease_time */ + p = xdr_inline_decode(xdr, 12); + if (unlikely(!p)) + return -EIO; + p = xdr_decode_hyper(p, &res->cnr_lease_time.seconds); + res->cnr_lease_time.nseconds = be32_to_cpup(p); + + status = decode_opaque_fixed(xdr, &res->cnr_stateid, NFS4_STATEID_SIZE); + if (unlikely(status)) + return -EIO; + + /* number of source addresses */ + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + return -EIO; + + count = be32_to_cpup(p); + if (count > 1) + pr_warn("NFS: %s: nsvr %d > Supported. Use first servers\n", + __func__, count); + + status = decode_nl4_server(xdr, &res->cnr_src); + if (unlikely(status)) + return -EIO; + return 0; +} + static int decode_deallocate(struct xdr_stream *xdr, struct nfs42_falloc_res *res) { return decode_op_hdr(xdr, OP_DEALLOCATE); } +struct read_plus_segment { + enum data_content4 type; + uint64_t offset; + union { + struct { + uint64_t length; + } hole; + + struct { + uint32_t length; + unsigned int from; + } data; + }; +}; + +static inline uint64_t read_plus_segment_length(struct read_plus_segment *seg) +{ + return seg->type == NFS4_CONTENT_DATA ? seg->data.length : seg->hole.length; +} + +static int decode_read_plus_segment(struct xdr_stream *xdr, + struct read_plus_segment *seg) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, 4); + if (!p) + return -EIO; + seg->type = be32_to_cpup(p++); + + p = xdr_inline_decode(xdr, seg->type == NFS4_CONTENT_DATA ? 12 : 16); + if (!p) + return -EIO; + p = xdr_decode_hyper(p, &seg->offset); + + if (seg->type == NFS4_CONTENT_DATA) { + struct xdr_buf buf; + uint32_t len = be32_to_cpup(p); + + seg->data.length = len; + seg->data.from = xdr_stream_pos(xdr); + + if (!xdr_stream_subsegment(xdr, &buf, xdr_align_size(len))) + return -EIO; + } else if (seg->type == NFS4_CONTENT_HOLE) { + xdr_decode_hyper(p, &seg->hole.length); + } else + return -EINVAL; + return 0; +} + +static int process_read_plus_segment(struct xdr_stream *xdr, + struct nfs_pgio_args *args, + struct nfs_pgio_res *res, + struct read_plus_segment *seg) +{ + unsigned long offset = seg->offset; + unsigned long length = read_plus_segment_length(seg); + unsigned int bufpos; + + if (offset + length < args->offset) + return 0; + else if (offset > args->offset + args->count) { + res->eof = 0; + return 0; + } else if (offset < args->offset) { + length -= (args->offset - offset); + offset = args->offset; + } else if (offset + length > args->offset + args->count) { + length = (args->offset + args->count) - offset; + res->eof = 0; + } + + bufpos = xdr->buf->head[0].iov_len + (offset - args->offset); + if (seg->type == NFS4_CONTENT_HOLE) + return xdr_stream_zero(xdr, bufpos, length); + else + return xdr_stream_move_subsegment(xdr, seg->data.from, bufpos, length); +} + +static int decode_read_plus(struct xdr_stream *xdr, struct nfs_pgio_res *res) +{ + struct nfs_pgio_header *hdr = + container_of(res, struct nfs_pgio_header, res); + struct nfs_pgio_args *args = &hdr->args; + uint32_t segments; + struct read_plus_segment *segs; + int status, i; + __be32 *p; + + status = decode_op_hdr(xdr, OP_READ_PLUS); + if (status) + return status; + + p = xdr_inline_decode(xdr, 4 + 4); + if (unlikely(!p)) + return -EIO; + + res->count = 0; + res->eof = be32_to_cpup(p++); + segments = be32_to_cpup(p++); + if (segments == 0) + return 0; + + segs = kmalloc_array(segments, sizeof(*segs), GFP_KERNEL); + if (!segs) + return -ENOMEM; + + for (i = 0; i < segments; i++) { + status = decode_read_plus_segment(xdr, &segs[i]); + if (status < 0) + goto out; + } + + xdr_set_pagelen(xdr, xdr_align_size(args->count)); + for (i = segments; i > 0; i--) + res->count += process_read_plus_segment(xdr, args, res, &segs[i-1]); + status = 0; + +out: + kfree(segs); + return status; +} + static int decode_seek(struct xdr_stream *xdr, struct nfs42_seek_res *res) { int status; @@ -428,15 +1190,11 @@ static int decode_seek(struct xdr_stream *xdr, struct nfs42_seek_res *res) p = xdr_inline_decode(xdr, 4 + 8); if (unlikely(!p)) - goto out_overflow; + return -EIO; res->sr_eof = be32_to_cpup(p++); p = xdr_decode_hyper(p, &res->sr_offset); return 0; - -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_layoutstats(struct xdr_stream *xdr) @@ -449,6 +1207,173 @@ static int decode_clone(struct xdr_stream *xdr) return decode_op_hdr(xdr, OP_CLONE); } +static int decode_layouterror(struct xdr_stream *xdr) +{ + return decode_op_hdr(xdr, OP_LAYOUTERROR); +} + +static int decode_setxattr(struct xdr_stream *xdr, + struct nfs4_change_info *cinfo) +{ + int status; + + status = decode_op_hdr(xdr, OP_SETXATTR); + if (status) + goto out; + status = decode_change_info(xdr, cinfo); +out: + return status; +} + +static int decode_getxattr(struct xdr_stream *xdr, + struct nfs42_getxattrres *res, + struct rpc_rqst *req) +{ + int status; + __be32 *p; + u32 len, rdlen; + + status = decode_op_hdr(xdr, OP_GETXATTR); + if (status) + return status; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + return -EIO; + + len = be32_to_cpup(p); + + /* + * Only check against the page length here. The actual + * requested length may be smaller, but that is only + * checked against after possibly caching a valid reply. + */ + if (len > req->rq_rcv_buf.page_len) + return -ERANGE; + + res->xattr_len = len; + + if (len > 0) { + rdlen = xdr_read_pages(xdr, len); + if (rdlen < len) + return -EIO; + } + + return 0; +} + +static int decode_removexattr(struct xdr_stream *xdr, + struct nfs4_change_info *cinfo) +{ + int status; + + status = decode_op_hdr(xdr, OP_REMOVEXATTR); + if (status) + goto out; + + status = decode_change_info(xdr, cinfo); +out: + return status; +} + +static int decode_listxattrs(struct xdr_stream *xdr, + struct nfs42_listxattrsres *res) +{ + int status; + __be32 *p; + u32 count, len, ulen; + size_t left, copied; + char *buf; + + status = decode_op_hdr(xdr, OP_LISTXATTRS); + if (status) { + /* + * Special case: for LISTXATTRS, NFS4ERR_TOOSMALL + * should be translated to ERANGE. + */ + if (status == -ETOOSMALL) + status = -ERANGE; + /* + * Special case: for LISTXATTRS, NFS4ERR_NOXATTR + * should be translated to success with zero-length reply. + */ + if (status == -ENODATA) { + res->eof = true; + status = 0; + } + goto out; + } + + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + return -EIO; + + xdr_decode_hyper(p, &res->cookie); + + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + return -EIO; + + left = res->xattr_len; + buf = res->xattr_buf; + + count = be32_to_cpup(p); + copied = 0; + + /* + * We have asked for enough room to encode the maximum number + * of possible attribute names, so everything should fit. + * + * But, don't rely on that assumption. Just decode entries + * until they don't fit anymore, just in case the server did + * something odd. + */ + while (count--) { + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + return -EIO; + + len = be32_to_cpup(p); + if (len > (XATTR_NAME_MAX - XATTR_USER_PREFIX_LEN)) { + status = -ERANGE; + goto out; + } + + p = xdr_inline_decode(xdr, len); + if (unlikely(!p)) + return -EIO; + + ulen = len + XATTR_USER_PREFIX_LEN + 1; + if (buf) { + if (ulen > left) { + status = -ERANGE; + goto out; + } + + memcpy(buf, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN); + memcpy(buf + XATTR_USER_PREFIX_LEN, p, len); + + buf[ulen - 1] = 0; + buf += ulen; + left -= ulen; + } + copied += ulen; + } + + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + return -EIO; + + res->eof = be32_to_cpup(p); + res->copied = copied; + +out: + if (status == -ERANGE && res->xattr_len == XATTR_LIST_MAX) + status = -E2BIG; + + return status; +} + /* * Decode ALLOCATE request */ @@ -506,7 +1431,86 @@ static int nfs4_xdr_dec_copy(struct rpc_rqst *rqstp, status = decode_copy(xdr, res); if (status) goto out; - status = decode_commit(xdr, &res->commit_res); + if (res->commit_res.verf) + status = decode_commit(xdr, &res->commit_res); +out: + return status; +} + +/* + * Decode OFFLOAD_CANCEL response + */ +static int nfs4_xdr_dec_offload_cancel(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + void *data) +{ + struct nfs42_offload_status_res *res = data; + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->osr_seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_offload_cancel(xdr, res); + +out: + return status; +} + +/* + * Decode OFFLOAD_STATUS response + */ +static int nfs4_xdr_dec_offload_status(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + void *data) +{ + struct nfs42_offload_status_res *res = data; + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->osr_seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_offload_status(xdr, res); + +out: + return status; +} + +/* + * Decode COPY_NOTIFY response + */ +static int nfs4_xdr_dec_copy_notify(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + void *data) +{ + struct nfs42_copy_notify_res *res = data; + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->cnr_seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_copy_notify(xdr, res); + out: return status; } @@ -540,6 +1544,66 @@ out: } /* + * Decode ZERO_RANGE request + */ +static int nfs4_xdr_dec_zero_range(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + void *data) +{ + struct nfs42_falloc_res *res = data; + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_deallocate(xdr, res); + if (status) + goto out; + status = decode_allocate(xdr, res); + if (status) + goto out; + decode_getfattr(xdr, res->falloc_fattr, res->falloc_server); +out: + return status; +} + +/* + * Decode READ_PLUS request + */ +static int nfs4_xdr_dec_read_plus(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + void *data) +{ + struct nfs_pgio_res *res = data; + struct compound_hdr hdr; + int status; + + xdr_set_scratch_buffer(xdr, res->scratch, READ_PLUS_SCRATCH_SIZE); + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_read_plus(xdr, res); + if (!status) + status = res->count; +out: + return status; +} + +/* * Decode SEEK request */ static int nfs4_xdr_dec_seek(struct rpc_rqst *rqstp, @@ -624,11 +1688,137 @@ static int nfs4_xdr_dec_clone(struct rpc_rqst *rqstp, status = decode_clone(xdr); if (status) goto out; - status = decode_getfattr(xdr, res->dst_fattr, res->server); + decode_getfattr(xdr, res->dst_fattr, res->server); +out: + res->rpc_status = status; + return status; +} + +/* + * Decode LAYOUTERROR request + */ +static int nfs4_xdr_dec_layouterror(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + void *data) +{ + struct nfs42_layouterror_res *res = data; + struct compound_hdr hdr; + int status, i; + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(xdr); + + for (i = 0; i < res->num_errors && status == 0; i++) + status = decode_layouterror(xdr); out: res->rpc_status = status; return status; } +/* + * Decode SETXATTR request + */ +static int nfs4_xdr_dec_setxattr(struct rpc_rqst *req, struct xdr_stream *xdr, + void *data) +{ + struct nfs42_setxattrres *res = data; + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, req); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_setxattr(xdr, &res->cinfo); + if (status) + goto out; + status = decode_getfattr(xdr, res->fattr, res->server); +out: + return status; +} + +/* + * Decode GETXATTR request + */ +static int nfs4_xdr_dec_getxattr(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, void *data) +{ + struct nfs42_getxattrres *res = data; + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_getxattr(xdr, res, rqstp); +out: + return status; +} + +/* + * Decode LISTXATTR request + */ +static int nfs4_xdr_dec_listxattrs(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, void *data) +{ + struct nfs42_listxattrsres *res = data; + struct compound_hdr hdr; + int status; + + xdr_set_scratch_folio(xdr, res->scratch); + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_listxattrs(xdr, res); +out: + return status; +} + +/* + * Decode REMOVEXATTR request + */ +static int nfs4_xdr_dec_removexattr(struct rpc_rqst *req, + struct xdr_stream *xdr, void *data) +{ + struct nfs42_removexattrres *res = data; + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, req); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + + status = decode_removexattr(xdr, &res->cinfo); +out: + return status; +} #endif /* __LINUX_FS_NFS_NFS4_2XDR_H */ diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 40bd05f05e74..c34c89af9c7d 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * linux/fs/nfs/nfs4_fs.h * @@ -22,6 +23,7 @@ #define NFS4_MAX_LOOP_ON_RECOVER (10) #include <linux/seqlock.h> +#include <linux/filelock.h> struct idmap; @@ -40,6 +42,12 @@ enum nfs4_client_state { NFS4CLNT_MOVED, NFS4CLNT_LEASE_MOVED, NFS4CLNT_DELEGATION_EXPIRED, + NFS4CLNT_RUN_MANAGER, + NFS4CLNT_MANAGER_AVAILABLE, + NFS4CLNT_RECALL_RUNNING, + NFS4CLNT_RECALL_ANY_LAYOUT_READ, + NFS4CLNT_RECALL_ANY_LAYOUT_RW, + NFS4CLNT_DELEGRETURN_DELAYED, }; #define NFS4_RENEW_TIMEOUT 0x01 @@ -55,14 +63,15 @@ struct nfs4_minor_version_ops { bool (*match_stateid)(const nfs4_stateid *, const nfs4_stateid *); int (*find_root_sec)(struct nfs_server *, struct nfs_fh *, - struct nfs_fsinfo *); + struct nfs_fattr *); void (*free_lock_state)(struct nfs_server *, struct nfs4_lock_state *); int (*test_and_free_expired)(struct nfs_server *, - nfs4_stateid *, struct rpc_cred *); + nfs4_stateid *, const struct cred *); struct nfs_seqid * (*alloc_seqid)(struct nfs_seqid_counter *, gfp_t); - int (*session_trunk)(struct rpc_clnt *, struct rpc_xprt *, void *); + void (*session_trunk)(struct rpc_clnt *clnt, + struct rpc_xprt *xprt, void *data); const struct rpc_call_ops *call_sync_ops; const struct nfs4_state_recovery_ops *reboot_recovery_ops; const struct nfs4_state_recovery_ops *nograce_recovery_ops; @@ -73,7 +82,7 @@ struct nfs4_minor_version_ops { #define NFS_SEQID_CONFIRMED 1 struct nfs_seqid_counter { ktime_t create_time; - int owner_id; + u64 owner_id; int flags; u32 counter; spinlock_t lock; /* Protects the list */ @@ -104,14 +113,13 @@ struct nfs4_state_owner { unsigned long so_expires; struct rb_node so_server_node; - struct rpc_cred *so_cred; /* Associated cred */ + const struct cred *so_cred; /* Associated cred */ spinlock_t so_lock; atomic_t so_count; unsigned long so_flags; struct list_head so_states; struct nfs_seqid_counter so_seqid; - seqcount_t so_reclaim_seqcount; struct mutex so_delegreturn_mutex; }; @@ -141,10 +149,11 @@ struct nfs4_lock_state { struct nfs4_state * ls_state; /* Pointer to open state */ #define NFS_LOCK_INITIALIZED 0 #define NFS_LOCK_LOST 1 +#define NFS_LOCK_UNLOCKING 2 unsigned long ls_flags; struct nfs_seqid_counter ls_seqid; nfs4_stateid ls_stateid; - atomic_t ls_count; + refcount_t ls_count; fl_owner_t ls_owner; }; @@ -161,6 +170,10 @@ enum { NFS_STATE_POSIX_LOCKS, /* Posix locks are supported */ NFS_STATE_RECOVERY_FAILED, /* OPEN stateid state recovery failed */ NFS_STATE_MAY_NOTIFY_LOCK, /* server may CB_NOTIFY_LOCK */ + NFS_STATE_CHANGE_WAIT, /* A state changing operation is outstanding */ + NFS_CLNT_DST_SSC_COPY_STATE, /* dst server open state on client*/ + NFS_CLNT_SRC_SSC_COPY_STATE, /* src server open state on client*/ + NFS_SRV_SSC_COPY_STATE, /* ssc state on the dst server */ }; struct nfs4_state { @@ -183,7 +196,10 @@ struct nfs4_state { unsigned int n_wronly; /* Number of write-only references */ unsigned int n_rdwr; /* Number of read/write references */ fmode_t state; /* State on the server (R,W, or RW) */ - atomic_t count; + refcount_t count; + + wait_queue_head_t waitq; + struct rcu_head rcu_head; }; @@ -192,9 +208,12 @@ struct nfs4_exception { struct inode *inode; nfs4_stateid *stateid; long timeout; + unsigned short retrans; + unsigned char task_is_privileged : 1; unsigned char delay : 1, recovering : 1, retry : 1; + bool interruptible; }; struct nfs4_state_recovery_ops { @@ -202,82 +221,129 @@ struct nfs4_state_recovery_ops { int state_flag_bit; int (*recover_open)(struct nfs4_state_owner *, struct nfs4_state *); int (*recover_lock)(struct nfs4_state *, struct file_lock *); - int (*establish_clid)(struct nfs_client *, struct rpc_cred *); - int (*reclaim_complete)(struct nfs_client *, struct rpc_cred *); + int (*establish_clid)(struct nfs_client *, const struct cred *); + int (*reclaim_complete)(struct nfs_client *, const struct cred *); int (*detect_trunking)(struct nfs_client *, struct nfs_client **, - struct rpc_cred *); + const struct cred *); +}; + +struct nfs4_opendata { + struct kref kref; + struct nfs_openargs o_arg; + struct nfs_openres o_res; + struct nfs_open_confirmargs c_arg; + struct nfs_open_confirmres c_res; + struct nfs4_string owner_name; + struct nfs4_string group_name; + struct nfs4_label *a_label; + struct nfs_fattr f_attr; + struct dentry *dir; + struct dentry *dentry; + struct nfs4_state_owner *owner; + struct nfs4_state *state; + struct iattr attrs; + struct nfs4_layoutget *lgp; + unsigned long timestamp; + bool rpc_done; + bool file_created; + bool is_recover; + bool cancelled; + int rpc_status; }; struct nfs4_add_xprt_data { struct nfs_client *clp; - struct rpc_cred *cred; + const struct cred *cred; }; struct nfs4_state_maintenance_ops { - int (*sched_state_renewal)(struct nfs_client *, struct rpc_cred *, unsigned); - struct rpc_cred * (*get_state_renewal_cred_locked)(struct nfs_client *); - int (*renew_lease)(struct nfs_client *, struct rpc_cred *); + int (*sched_state_renewal)(struct nfs_client *, const struct cred *, unsigned); + const struct cred * (*get_state_renewal_cred)(struct nfs_client *); + int (*renew_lease)(struct nfs_client *, const struct cred *); }; struct nfs4_mig_recovery_ops { - int (*get_locations)(struct inode *, struct nfs4_fs_locations *, - struct page *, struct rpc_cred *); - int (*fsid_present)(struct inode *, struct rpc_cred *); + int (*get_locations)(struct nfs_server *, struct nfs_fh *, + struct nfs4_fs_locations *, struct page *, const struct cred *); + int (*fsid_present)(struct inode *, const struct cred *); }; extern const struct dentry_operations nfs4_dentry_operations; /* dir.c */ int nfs_atomic_open(struct inode *, struct dentry *, struct file *, - unsigned, umode_t, int *); + unsigned, umode_t); -/* super.c */ +/* fs_context.c */ extern struct file_system_type nfs4_fs_type; /* nfs4namespace.c */ struct rpc_clnt *nfs4_negotiate_security(struct rpc_clnt *, struct inode *, const struct qstr *); -struct vfsmount *nfs4_submount(struct nfs_server *, struct dentry *, - struct nfs_fh *, struct nfs_fattr *); +int nfs4_submount(struct fs_context *, struct nfs_server *); int nfs4_replace_transport(struct nfs_server *server, const struct nfs4_fs_locations *locations); - +size_t nfs_parse_server_name(char *string, size_t len, struct sockaddr_storage *ss, + size_t salen, struct net *net, int port); /* nfs4proc.c */ extern int nfs4_handle_exception(struct nfs_server *, int, struct nfs4_exception *); +extern int nfs4_async_handle_error(struct rpc_task *task, + struct nfs_server *server, + struct nfs4_state *state, long *timeout); extern int nfs4_call_sync(struct rpc_clnt *, struct nfs_server *, struct rpc_message *, struct nfs4_sequence_args *, struct nfs4_sequence_res *, int); -extern void nfs4_init_sequence(struct nfs4_sequence_args *, struct nfs4_sequence_res *, int); -extern int nfs4_proc_setclientid(struct nfs_client *, u32, unsigned short, struct rpc_cred *, struct nfs4_setclientid_res *); -extern int nfs4_proc_setclientid_confirm(struct nfs_client *, struct nfs4_setclientid_res *arg, struct rpc_cred *); -extern int nfs4_proc_get_rootfh(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *, bool); -extern int nfs4_proc_bind_conn_to_session(struct nfs_client *, struct rpc_cred *cred); -extern int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred); +extern void nfs4_init_sequence(struct nfs4_sequence_args *, struct nfs4_sequence_res *, int, int); +extern int nfs4_proc_setclientid(struct nfs_client *, u32, unsigned short, const struct cred *, struct nfs4_setclientid_res *); +extern int nfs4_proc_setclientid_confirm(struct nfs_client *, struct nfs4_setclientid_res *arg, const struct cred *); +extern int nfs4_proc_get_rootfh(struct nfs_server *, struct nfs_fh *, + struct nfs_fattr *, bool); +extern int nfs4_proc_bind_conn_to_session(struct nfs_client *, const struct cred *cred); +extern int nfs4_proc_exchange_id(struct nfs_client *clp, const struct cred *cred); extern int nfs4_destroy_clientid(struct nfs_client *clp); -extern int nfs4_init_clientid(struct nfs_client *, struct rpc_cred *); -extern int nfs41_init_clientid(struct nfs_client *, struct rpc_cred *); +extern int nfs4_init_clientid(struct nfs_client *, const struct cred *); +extern int nfs41_init_clientid(struct nfs_client *, const struct cred *); extern int nfs4_do_close(struct nfs4_state *state, gfp_t gfp_mask, int wait); extern int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle); extern int nfs4_proc_fs_locations(struct rpc_clnt *, struct inode *, const struct qstr *, struct nfs4_fs_locations *, struct page *); -extern int nfs4_proc_get_locations(struct inode *, struct nfs4_fs_locations *, - struct page *page, struct rpc_cred *); -extern int nfs4_proc_fsid_present(struct inode *, struct rpc_cred *); -extern struct rpc_clnt *nfs4_proc_lookup_mountpoint(struct inode *, const struct qstr *, - struct nfs_fh *, struct nfs_fattr *); +extern int nfs4_proc_get_locations(struct nfs_server *, struct nfs_fh *, + struct nfs4_fs_locations *, + struct page *page, const struct cred *); +extern int nfs4_proc_fsid_present(struct inode *, const struct cred *); +extern struct rpc_clnt *nfs4_proc_lookup_mountpoint(struct inode *, + struct dentry *, + struct nfs_fh *, + struct nfs_fattr *); extern int nfs4_proc_secinfo(struct inode *, const struct qstr *, struct nfs4_secinfo_flavors *); -extern const struct xattr_handler *nfs4_xattr_handlers[]; +extern const struct xattr_handler * const nfs4_xattr_handlers[]; extern int nfs4_set_rw_stateid(nfs4_stateid *stateid, const struct nfs_open_context *ctx, const struct nfs_lock_context *l_ctx, fmode_t fmode); +extern void nfs4_bitmask_set(__u32 bitmask[], const __u32 src[], + struct inode *inode, unsigned long cache_validity); +extern int nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, + struct nfs_fattr *fattr, struct inode *inode); +extern int update_open_stateid(struct nfs4_state *state, + const nfs4_stateid *open_stateid, + const nfs4_stateid *deleg_stateid, + fmode_t fmode); +extern int nfs4_proc_setlease(struct file *file, int arg, + struct file_lease **lease, void **priv); +extern int nfs4_proc_get_lease_time(struct nfs_client *clp, + struct nfs_fsinfo *fsinfo); +extern void nfs4_update_changeattr(struct inode *dir, + struct nfs4_change_info *cinfo, + unsigned long timestamp, + unsigned long cache_validity); +extern int nfs4_buf_to_pages_noslab(const void *buf, size_t buflen, + struct page **pages); #if defined(CONFIG_NFS_V4_1) extern int nfs41_sequence_done(struct rpc_task *, struct nfs4_sequence_res *); -extern int nfs4_proc_create_session(struct nfs_client *, struct rpc_cred *); -extern int nfs4_proc_destroy_session(struct nfs4_session *, struct rpc_cred *); -extern int nfs4_proc_get_lease_time(struct nfs_client *clp, - struct nfs_fsinfo *fsinfo); +extern int nfs4_proc_create_session(struct nfs_client *, const struct cred *); +extern int nfs4_proc_destroy_session(struct nfs4_session *, const struct cred *); extern int nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data, bool sync); extern int nfs4_detect_session_trunking(struct nfs_client *clp, @@ -300,17 +366,21 @@ static inline bool _nfs4_state_protect(struct nfs_client *clp, unsigned long sp4_mode, struct rpc_clnt **clntp, struct rpc_message *msg) { - struct rpc_cred *newcred = NULL; rpc_authflavor_t flavor; + if (sp4_mode == NFS_SP4_MACH_CRED_CLEANUP || + sp4_mode == NFS_SP4_MACH_CRED_PNFS_CLEANUP) { + /* Using machine creds for cleanup operations + * is only relevent if the client credentials + * might expire. So don't bother for + * RPC_AUTH_UNIX. If file was only exported to + * sec=sys, the PUTFH would fail anyway. + */ + if ((*clntp)->cl_auth->au_flavor == RPC_AUTH_UNIX) + return false; + } if (test_bit(sp4_mode, &clp->cl_sp4_flags)) { - spin_lock(&clp->cl_lock); - if (clp->cl_machine_cred != NULL) - /* don't call get_rpccred on the machine cred - - * a reference will be held for life of clp */ - newcred = clp->cl_machine_cred; - spin_unlock(&clp->cl_lock); - msg->rpc_cred = newcred; + msg->rpc_cred = rpc_machine_cred(); flavor = clp->cl_rpcclient->cl_auth->au_flavor; WARN_ON_ONCE(flavor != RPC_AUTH_GSS_KRB5I && @@ -392,36 +462,38 @@ struct nfs_client *nfs4_alloc_client(const struct nfs_client_initdata *); /* nfs4renewd.c */ extern void nfs4_schedule_state_renewal(struct nfs_client *); -extern void nfs4_renewd_prepare_shutdown(struct nfs_server *); extern void nfs4_kill_renewd(struct nfs_client *); extern void nfs4_renew_state(struct work_struct *); -extern void nfs4_set_lease_period(struct nfs_client *clp, - unsigned long lease, - unsigned long lastrenewed); +extern void nfs4_set_lease_period(struct nfs_client *clp, unsigned long lease); /* nfs4state.c */ -struct rpc_cred *nfs4_get_clid_cred(struct nfs_client *clp); -struct rpc_cred *nfs4_get_machine_cred_locked(struct nfs_client *clp); -struct rpc_cred *nfs4_get_renew_cred_locked(struct nfs_client *clp); +extern const nfs4_stateid current_stateid; + +const struct cred *nfs4_get_clid_cred(struct nfs_client *clp); +const struct cred *nfs4_get_machine_cred(struct nfs_client *clp); +const struct cred *nfs4_get_renew_cred(struct nfs_client *clp); int nfs4_discover_server_trunking(struct nfs_client *clp, struct nfs_client **); int nfs40_discover_server_trunking(struct nfs_client *clp, - struct nfs_client **, struct rpc_cred *); + struct nfs_client **, const struct cred *); #if defined(CONFIG_NFS_V4_1) int nfs41_discover_server_trunking(struct nfs_client *clp, - struct nfs_client **, struct rpc_cred *); + struct nfs_client **, const struct cred *); extern void nfs4_schedule_session_recovery(struct nfs4_session *, int); extern void nfs41_notify_server(struct nfs_client *); +bool nfs4_check_serverowner_major_id(struct nfs41_server_owner *o1, + struct nfs41_server_owner *o2); #else static inline void nfs4_schedule_session_recovery(struct nfs4_session *session, int err) { } #endif /* CONFIG_NFS_V4_1 */ -extern struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *, struct rpc_cred *, gfp_t); +extern struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *, const struct cred *, gfp_t); extern void nfs4_put_state_owner(struct nfs4_state_owner *); -extern void nfs4_purge_state_owners(struct nfs_server *); +extern void nfs4_purge_state_owners(struct nfs_server *, struct list_head *); +extern void nfs4_free_state_owners(struct list_head *head); extern struct nfs4_state * nfs4_get_open_state(struct inode *, struct nfs4_state_owner *); extern void nfs4_put_open_state(struct nfs4_state *); extern void nfs4_close_state(struct nfs4_state *, fmode_t); @@ -445,7 +517,9 @@ extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp); extern int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl); extern int nfs4_select_rw_stateid(struct nfs4_state *, fmode_t, const struct nfs_lock_context *, nfs4_stateid *, - struct rpc_cred **); + const struct cred **); +extern bool nfs4_copy_open_stateid(nfs4_stateid *dst, + struct nfs4_state *state); extern struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter, gfp_t gfp_mask); extern int nfs_wait_on_sequence(struct nfs_seqid *seqid, struct rpc_task *task); @@ -453,7 +527,7 @@ extern void nfs_increment_open_seqid(int status, struct nfs_seqid *seqid); extern void nfs_increment_lock_seqid(int status, struct nfs_seqid *seqid); extern void nfs_release_seqid(struct nfs_seqid *seqid); extern void nfs_free_seqid(struct nfs_seqid *seqid); -extern int nfs4_setup_sequence(const struct nfs_client *client, +extern int nfs4_setup_sequence(struct nfs_client *client, struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, struct rpc_task *task); @@ -461,22 +535,26 @@ extern int nfs4_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res); extern void nfs4_free_lock_state(struct nfs_server *server, struct nfs4_lock_state *lsp); - +extern int nfs4_proc_commit(struct file *dst, __u64 offset, __u32 count, struct nfs_commitres *res); extern const nfs4_stateid zero_stateid; +extern const nfs4_stateid invalid_stateid; /* nfs4super.c */ struct nfs_mount_info; extern struct nfs_subversion nfs_v4; -struct dentry *nfs4_try_mount(int, const char *, struct nfs_mount_info *, struct nfs_subversion *); extern bool nfs4_disable_idmapping; extern unsigned short max_session_slots; extern unsigned short max_session_cb_slots; extern unsigned short send_implementation_id; extern bool recover_lost_locks; +extern short nfs_delay_retrans; #define NFS4_CLIENT_ID_UNIQ_LEN (64) extern char nfs4_client_id_uniquifier[NFS4_CLIENT_ID_UNIQ_LEN]; +extern int nfs4_try_get_tree(struct fs_context *); +extern int nfs4_get_referral_tree(struct fs_context *); + /* nfs4sysctl.c */ #ifdef CONFIG_SYSCTL int nfs4_register_sysctl(void); @@ -495,6 +573,12 @@ static inline void nfs4_unregister_sysctl(void) /* nfs4xdr.c */ extern const struct rpc_procinfo nfs4_procedures[]; +#ifdef CONFIG_NFS_V4_2 +extern const u32 nfs42_maxsetxattr_overhead; +extern const u32 nfs42_maxgetxattr_overhead; +extern const u32 nfs42_maxlistxattrs_overhead; +#endif + struct nfs4_mount_data; /* callback_xdr.c */ @@ -524,6 +608,29 @@ static inline bool nfs4_stateid_is_newer(const nfs4_stateid *s1, const nfs4_stat return (s32)(be32_to_cpu(s1->seqid) - be32_to_cpu(s2->seqid)) > 0; } +static inline bool nfs4_stateid_is_next(const nfs4_stateid *s1, const nfs4_stateid *s2) +{ + u32 seq1 = be32_to_cpu(s1->seqid); + u32 seq2 = be32_to_cpu(s2->seqid); + + return seq2 == seq1 + 1U || (seq2 == 1U && seq1 == 0xffffffffU); +} + +static inline bool nfs4_stateid_match_or_older(const nfs4_stateid *dst, const nfs4_stateid *src) +{ + return nfs4_stateid_match_other(dst, src) && + !(src->seqid && nfs4_stateid_is_newer(dst, src)); +} + +static inline void nfs4_stateid_seqid_inc(nfs4_stateid *s1) +{ + u32 seqid = be32_to_cpu(s1->seqid); + + if (++seqid == 0) + ++seqid; + s1->seqid = cpu_to_be32(seqid); +} + static inline bool nfs4_valid_open_stateid(const struct nfs4_state *state) { return test_bit(NFS_STATE_RECOVERY_FAILED, &state->flags) == 0; @@ -536,12 +643,34 @@ static inline bool nfs4_state_match_open_stateid_other(const struct nfs4_state * nfs4_stateid_match_other(&state->open_stateid, stateid); } +/* nfs42xattr.c */ +#ifdef CONFIG_NFS_V4_2 +extern int __init nfs4_xattr_cache_init(void); +extern void nfs4_xattr_cache_exit(void); +extern void nfs4_xattr_cache_add(struct inode *inode, const char *name, + const char *buf, struct page **pages, + ssize_t buflen); +extern void nfs4_xattr_cache_remove(struct inode *inode, const char *name); +extern ssize_t nfs4_xattr_cache_get(struct inode *inode, const char *name, + char *buf, ssize_t buflen); +extern void nfs4_xattr_cache_set_list(struct inode *inode, const char *buf, + ssize_t buflen); +extern ssize_t nfs4_xattr_cache_list(struct inode *inode, char *buf, + ssize_t buflen); +extern void nfs4_xattr_cache_zap(struct inode *inode); #else +static inline void nfs4_xattr_cache_zap(struct inode *inode) +{ +} +#endif /* CONFIG_NFS_V4_2 */ + +#else /* CONFIG_NFS_V4 */ #define nfs4_close_state(a, b) do { } while (0) #define nfs4_close_sync(a, b) do { } while (0) #define nfs4_state_protect(a, b, c, d) do { } while (0) #define nfs4_state_protect_write(a, b, c, d) do { } while (0) + #endif /* CONFIG_NFS_V4 */ #endif /* __LINUX_FS_NFS_NFS4_FS.H */ diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index e9bea90dc017..3a4baed993c9 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2006 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) @@ -10,6 +11,7 @@ #include <linux/sunrpc/xprt.h> #include <linux/sunrpc/bc_xprt.h> #include <linux/sunrpc/rpc_pipe_fs.h> +#include <net/handshake.h> #include "internal.h" #include "callback.h" #include "delegation.h" @@ -17,6 +19,7 @@ #include "nfs4idmap.h" #include "pnfs.h" #include "netns.h" +#include "sysfs.h" #define NFSDBG_FACILITY NFSDBG_CLIENT @@ -42,7 +45,7 @@ static int nfs_get_cb_ident_idr(struct nfs_client *clp, int minorversion) } #ifdef CONFIG_NFS_V4_1 -/** +/* * Per auth flavor data server rpc clients */ struct nfs4_ds_server { @@ -51,7 +54,9 @@ struct nfs4_ds_server { }; /** - * Common lookup case for DS I/O + * nfs4_find_ds_client - Common lookup case for DS I/O + * @ds_clp: pointer to the DS's nfs_client + * @flavor: rpc auth flavour to match */ static struct nfs4_ds_server * nfs4_find_ds_client(struct nfs_client *ds_clp, rpc_authflavor_t flavor) @@ -118,9 +123,13 @@ nfs4_free_ds_server(struct nfs4_ds_server *dss) } /** -* Find or create a DS rpc client with th MDS server rpc client auth flavor -* in the nfs_client cl_ds_clients list. -*/ + * nfs4_find_or_create_ds_client - Find or create a DS rpc client + * @ds_clp: pointer to the DS's nfs_client + * @inode: pointer to the inode + * + * Find or create a DS rpc client with th MDS server rpc client auth flavor + * in the nfs_client cl_ds_clients list. + */ struct rpc_clnt * nfs4_find_or_create_ds_client(struct nfs_client *ds_clp, struct inode *inode) { @@ -145,7 +154,6 @@ static void nfs4_shutdown_ds_clients(struct nfs_client *clp) { struct nfs4_ds_server *dss; - LIST_HEAD(shutdown_list); while (!list_empty(&clp->cl_ds_clients)) { dss = list_entry(clp->cl_ds_clients.next, @@ -156,9 +164,23 @@ nfs4_shutdown_ds_clients(struct nfs_client *clp) } } +static void +nfs4_cleanup_callback(struct nfs_client *clp) +{ + struct nfs4_copy_state *cp_state; + + while (!list_empty(&clp->pending_cb_stateids)) { + cp_state = list_entry(clp->pending_cb_stateids.next, + struct nfs4_copy_state, copies); + list_del(&cp_state->copies); + kfree(cp_state); + } +} + void nfs41_shutdown_client(struct nfs_client *clp) { if (nfs4_has_session(clp)) { + nfs4_cleanup_callback(clp); nfs4_shutdown_ds_clients(clp); nfs4_destroy_session(clp->cl_session); nfs4_destroy_clientid(clp); @@ -177,8 +199,11 @@ void nfs40_shutdown_client(struct nfs_client *clp) struct nfs_client *nfs4_alloc_client(const struct nfs_client_initdata *cl_init) { - int err; + char buf[INET6_ADDRSTRLEN + 1]; + const char *ip_addr = cl_init->ip_addr; struct nfs_client *clp = nfs_alloc_client(cl_init); + int err; + if (IS_ERR(clp)) return clp; @@ -196,12 +221,54 @@ struct nfs_client *nfs4_alloc_client(const struct nfs_client_initdata *cl_init) INIT_LIST_HEAD(&clp->cl_ds_clients); rpc_init_wait_queue(&clp->cl_rpcwaitq, "NFS client"); clp->cl_state = 1 << NFS4CLNT_LEASE_EXPIRED; - clp->cl_minorversion = cl_init->minorversion; clp->cl_mvops = nfs_v4_minor_ops[cl_init->minorversion]; clp->cl_mig_gen = 1; + clp->cl_last_renewal = jiffies; #if IS_ENABLED(CONFIG_NFS_V4_1) init_waitqueue_head(&clp->cl_lock_waitq); #endif + INIT_LIST_HEAD(&clp->pending_cb_stateids); + + if (cl_init->minorversion != 0) + __set_bit(NFS_CS_INFINITE_SLOTS, &clp->cl_flags); + __set_bit(NFS_CS_DISCRTRY, &clp->cl_flags); + __set_bit(NFS_CS_NO_RETRANS_TIMEOUT, &clp->cl_flags); + if (test_bit(NFS_CS_PNFS, &cl_init->init_flags)) + __set_bit(NFS_CS_PNFS, &clp->cl_flags); + if (test_bit(NFS_CS_NETUNREACH_FATAL, &cl_init->init_flags)) + __set_bit(NFS_CS_NETUNREACH_FATAL, &clp->cl_flags); + /* + * Set up the connection to the server before we add add to the + * global list. + */ + err = nfs_create_rpc_client(clp, cl_init, RPC_AUTH_GSS_KRB5I); + if (err == -EINVAL) + err = nfs_create_rpc_client(clp, cl_init, RPC_AUTH_UNIX); + if (err < 0) + goto error; + + /* If no clientaddr= option was specified, find a usable cb address */ + if (ip_addr == NULL) { + struct sockaddr_storage cb_addr; + struct sockaddr *sap = (struct sockaddr *)&cb_addr; + + err = rpc_localaddr(clp->cl_rpcclient, sap, sizeof(cb_addr)); + if (err < 0) + goto error; + err = rpc_ntop(sap, buf, sizeof(buf)); + if (err < 0) + goto error; + ip_addr = (const char *)buf; + } + strscpy(clp->cl_ipaddr, ip_addr, sizeof(clp->cl_ipaddr)); + + err = nfs_idmap_new(clp); + if (err < 0) { + dprintk("%s: failed to create idmapper. Error = %d\n", + __func__, err); + goto error; + } + __set_bit(NFS_CS_IDMAP, &clp->cl_res_state); return clp; error: @@ -269,7 +336,7 @@ static int nfs4_init_callback(struct nfs_client *clp) /** * nfs40_init_client - nfs_client initialization tasks for NFSv4.0 - * @clp - nfs_client to initialize + * @clp: nfs_client to initialize * * Returns zero on success, or a negative errno if some error occurred. */ @@ -285,6 +352,7 @@ int nfs40_init_client(struct nfs_client *clp) ret = nfs4_setup_slot_table(tbl, NFS4_MAX_SLOT_TABLE, "NFSv4.0 transport Slot table"); if (ret) { + nfs4_shutdown_slot_table(tbl); kfree(tbl); return ret; } @@ -297,7 +365,7 @@ int nfs40_init_client(struct nfs_client *clp) /** * nfs41_init_client - nfs_client initialization tasks for NFSv4.1+ - * @clp - nfs_client to initialize + * @clp: nfs_client to initialize * * Returns zero on success, or a negative errno if some error occurred. */ @@ -341,21 +409,46 @@ static int nfs4_init_client_minor_version(struct nfs_client *clp) return nfs4_init_callback(clp); } +static void nfs4_add_trunk(struct nfs_client *clp, struct nfs_client *old) +{ + struct sockaddr_storage clp_addr, old_addr; + struct sockaddr *clp_sap = (struct sockaddr *)&clp_addr; + struct sockaddr *old_sap = (struct sockaddr *)&old_addr; + size_t clp_salen; + struct xprt_create xprt_args = { + .ident = old->cl_proto, + .net = old->cl_net, + .servername = old->cl_hostname, + }; + int max_connect = test_bit(NFS_CS_PNFS, &clp->cl_flags) ? + clp->cl_max_connect : old->cl_max_connect; + + if (clp->cl_proto != old->cl_proto) + return; + clp_salen = rpc_peeraddr(clp->cl_rpcclient, clp_sap, sizeof(clp_addr)); + rpc_peeraddr(old->cl_rpcclient, old_sap, sizeof(old_addr)); + + if (clp_addr.ss_family != old_addr.ss_family) + return; + + xprt_args.dstaddr = clp_sap; + xprt_args.addrlen = clp_salen; + + rpc_clnt_add_xprt(old->cl_rpcclient, &xprt_args, + rpc_clnt_test_and_add_xprt, &max_connect); +} + /** * nfs4_init_client - Initialise an NFS4 client record * * @clp: nfs_client to initialise - * @timeparms: timeout parameters for underlying RPC transport - * @ip_addr: callback IP address in presentation format - * @authflavor: authentication flavor for underlying RPC transport + * @cl_init: pointer to nfs_client_initdata * * Returns pointer to an NFS client, or an ERR_PTR value. */ struct nfs_client *nfs4_init_client(struct nfs_client *clp, const struct nfs_client_initdata *cl_init) { - char buf[INET6_ADDRSTRLEN + 1]; - const char *ip_addr = cl_init->ip_addr; struct nfs_client *old; int error; @@ -363,58 +456,27 @@ struct nfs_client *nfs4_init_client(struct nfs_client *clp, /* the client is initialised already */ return clp; - /* Check NFS protocol revision and initialize RPC op vector */ - clp->rpc_ops = &nfs_v4_clientops; - - if (clp->cl_minorversion != 0) - __set_bit(NFS_CS_INFINITE_SLOTS, &clp->cl_flags); - __set_bit(NFS_CS_DISCRTRY, &clp->cl_flags); - __set_bit(NFS_CS_NO_RETRANS_TIMEOUT, &clp->cl_flags); - - error = nfs_create_rpc_client(clp, cl_init, RPC_AUTH_GSS_KRB5I); - if (error == -EINVAL) - error = nfs_create_rpc_client(clp, cl_init, RPC_AUTH_UNIX); - if (error < 0) - goto error; - - /* If no clientaddr= option was specified, find a usable cb address */ - if (ip_addr == NULL) { - struct sockaddr_storage cb_addr; - struct sockaddr *sap = (struct sockaddr *)&cb_addr; - - error = rpc_localaddr(clp->cl_rpcclient, sap, sizeof(cb_addr)); - if (error < 0) - goto error; - error = rpc_ntop(sap, buf, sizeof(buf)); - if (error < 0) - goto error; - ip_addr = (const char *)buf; - } - strlcpy(clp->cl_ipaddr, ip_addr, sizeof(clp->cl_ipaddr)); - - error = nfs_idmap_new(clp); - if (error < 0) { - dprintk("%s: failed to create idmapper. Error = %d\n", - __func__, error); - goto error; - } - __set_bit(NFS_CS_IDMAP, &clp->cl_res_state); - error = nfs4_init_client_minor_version(clp); if (error < 0) goto error; - if (!nfs4_has_session(clp)) - nfs_mark_client_ready(clp, NFS_CS_READY); - error = nfs4_discover_server_trunking(clp, &old); if (error < 0) goto error; - if (clp != old) + if (clp != old) { clp->cl_preserve_clid = true; - nfs_put_client(clp); + /* + * Mark the client as having failed initialization so other + * processes walking the nfs_client_list in nfs_match_client() + * won't try to use it. + */ + nfs_mark_client_ready(clp, -EPERM); + if (old->cl_mvops->session_trunk) + nfs4_add_trunk(clp, old); + } clear_bit(NFS_CS_TSM_POSSIBLE, &clp->cl_flags); + nfs_put_client(clp); return old; error: @@ -483,7 +545,7 @@ static int nfs4_match_client(struct nfs_client *pos, struct nfs_client *new, * ID and serverowner fields. Wait for CREATE_SESSION * to finish. */ if (pos->cl_cons_state > NFS_CS_READY) { - atomic_inc(&pos->cl_count); + refcount_inc(&pos->cl_count); spin_unlock(&nn->nfs_client_lock); nfs_put_client(*prev); @@ -526,7 +588,7 @@ static int nfs4_match_client(struct nfs_client *pos, struct nfs_client *new, */ int nfs40_walk_client_list(struct nfs_client *new, struct nfs_client **result, - struct rpc_cred *cred) + const struct cred *cred) { struct nfs_net *nn = net_generic(new->cl_net, nfs_net_id); struct nfs_client *pos, *prev = NULL; @@ -539,6 +601,9 @@ int nfs40_walk_client_list(struct nfs_client *new, spin_lock(&nn->nfs_client_lock); list_for_each_entry(pos, &nn->nfs_client_list, cl_share_link) { + if (pos == new) + goto found; + status = nfs4_match_client(pos, new, &prev, nn); if (status < 0) goto out_unlock; @@ -559,7 +624,8 @@ int nfs40_walk_client_list(struct nfs_client *new, * way that a SETCLIENTID_CONFIRM to pos can succeed is * if new and pos point to the same server: */ - atomic_inc(&pos->cl_count); +found: + refcount_inc(&pos->cl_count); spin_unlock(&nn->nfs_client_lock); nfs_put_client(prev); @@ -572,6 +638,7 @@ int nfs40_walk_client_list(struct nfs_client *new, case 0: nfs4_swap_callback_idents(pos, new); pos->cl_confirm = new->cl_confirm; + nfs_mark_client_ready(pos, NFS_CS_READY); prev = NULL; *result = pos; @@ -582,6 +649,7 @@ int nfs40_walk_client_list(struct nfs_client *new, * changed. Schedule recovery! */ nfs4_schedule_path_down_recovery(pos); + goto out; default: goto out; } @@ -601,7 +669,7 @@ out: /* * Returns true if the server major ids match */ -static bool +bool nfs4_check_serverowner_major_id(struct nfs41_server_owner *o1, struct nfs41_server_owner *o2) { @@ -625,13 +693,13 @@ nfs4_check_server_scope(struct nfs41_server_scope *s1, /** * nfs4_detect_session_trunking - Checks for session trunking. - * - * Called after a successful EXCHANGE_ID on a multi-addr connection. - * Upon success, add the transport. - * * @clp: original mount nfs_client * @res: result structure from an exchange_id using the original mount * nfs_client with a new multi_addr transport + * @xprt: pointer to the transport to add. + * + * Called after a successful EXCHANGE_ID on a multi-addr connection. + * Upon success, add the transport. * * Returns zero on success, otherwise -EINVAL * @@ -687,7 +755,7 @@ out_err: */ int nfs41_walk_client_list(struct nfs_client *new, struct nfs_client **result, - struct rpc_cred *cred) + const struct cred *cred) { struct nfs_net *nn = net_generic(new->cl_net, nfs_net_id); struct nfs_client *pos, *prev = NULL; @@ -715,7 +783,7 @@ int nfs41_walk_client_list(struct nfs_client *new, continue; found: - atomic_inc(&pos->cl_count); + refcount_inc(&pos->cl_count); *result = pos; status = 0; break; @@ -730,9 +798,13 @@ out: static void nfs4_destroy_server(struct nfs_server *server) { + LIST_HEAD(freeme); + nfs_server_return_all_delegations(server); unset_pnfs_layoutdriver(server); - nfs4_purge_state_owners(server); + nfs4_purge_state_owners(server, &freeme); + nfs4_free_state_owners(&freeme); + kfree(server->delegation_hash_table); } /* @@ -749,7 +821,7 @@ nfs4_find_client_ident(struct net *net, int cb_ident) spin_lock(&nn->nfs_client_lock); clp = idr_find(&nn->cb_ident_idr, cb_ident); if (clp) - atomic_inc(&clp->cl_count); + refcount_inc(&clp->cl_count); spin_unlock(&nn->nfs_client_lock); return clp; } @@ -793,7 +865,7 @@ nfs4_find_client_sessionid(struct net *net, const struct sockaddr *addr, spin_lock(&nn->nfs_client_lock); list_for_each_entry(clp, &nn->nfs_client_list, cl_share_link) { - if (nfs4_cb_match_client(addr, clp, minorversion) == false) + if (!nfs4_cb_match_client(addr, clp, minorversion)) continue; if (!nfs4_has_session(clp)) @@ -804,7 +876,7 @@ nfs4_find_client_sessionid(struct net *net, const struct sockaddr *addr, sid->data, NFS4_MAX_SESSIONID_LEN) != 0) continue; - atomic_inc(&clp->cl_count); + refcount_inc(&clp->cl_count); spin_unlock(&nn->nfs_client_lock); return clp; } @@ -826,40 +898,47 @@ nfs4_find_client_sessionid(struct net *net, const struct sockaddr *addr, * Set up an NFS4 client */ static int nfs4_set_client(struct nfs_server *server, - const char *hostname, - const struct sockaddr *addr, - const size_t addrlen, - const char *ip_addr, - int proto, const struct rpc_timeout *timeparms, - u32 minorversion, struct net *net) + struct nfs_client_initdata *cl_init) { - struct nfs_client_initdata cl_init = { - .hostname = hostname, - .addr = addr, - .addrlen = addrlen, - .ip_addr = ip_addr, - .nfs_mod = &nfs_v4, - .proto = proto, - .minorversion = minorversion, - .net = net, - .timeparms = timeparms, - }; struct nfs_client *clp; + cl_init->nfs_mod = &nfs_v4; + cl_init->cred = server->cred; + + if (cl_init->minorversion == 0) { + __set_bit(NFS_CS_REUSEPORT, &cl_init->init_flags); + cl_init->max_connect = 0; + } + + switch (cl_init->proto) { + case XPRT_TRANSPORT_RDMA: + case XPRT_TRANSPORT_TCP: + case XPRT_TRANSPORT_TCP_TLS: + break; + default: + cl_init->nconnect = 0; + } + if (server->flags & NFS_MOUNT_NORESVPORT) - set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags); + __set_bit(NFS_CS_NORESVPORT, &cl_init->init_flags); if (server->options & NFS_OPTION_MIGRATION) - set_bit(NFS_CS_MIGRATION, &cl_init.init_flags); + __set_bit(NFS_CS_MIGRATION, &cl_init->init_flags); if (test_bit(NFS_MIG_TSM_POSSIBLE, &server->mig_status)) - set_bit(NFS_CS_TSM_POSSIBLE, &cl_init.init_flags); + __set_bit(NFS_CS_TSM_POSSIBLE, &cl_init->init_flags); + server->port = rpc_get_port((struct sockaddr *)cl_init->addr); + + if (server->flags & NFS_MOUNT_NETUNREACH_FATAL) + __set_bit(NFS_CS_NETUNREACH_FATAL, &cl_init->init_flags); /* Allocate or find a client reference we can use */ - clp = nfs_get_client(&cl_init); + clp = nfs_get_client(cl_init); if (IS_ERR(clp)) return PTR_ERR(clp); - if (server->nfs_client == clp) + if (server->nfs_client == clp) { + nfs_put_client(clp); return -ELOOP; + } /* * Query for the lease time on clientid setup or renewal @@ -871,6 +950,9 @@ static int nfs4_set_client(struct nfs_server *server, set_bit(NFS_CS_CHECK_LEASE_TIME, &clp->cl_res_state); server->nfs_client = clp; + nfs_sysfs_add_server(server); + nfs_sysfs_link_rpc_client(server, clp->cl_rpcclient, "_state"); + return 0; } @@ -885,7 +967,7 @@ static int nfs4_set_client(struct nfs_server *server, * the MDS. */ struct nfs_client *nfs4_set_ds_client(struct nfs_server *mds_srv, - const struct sockaddr *ds_addr, int ds_addrlen, + const struct sockaddr_storage *ds_addr, int ds_addrlen, int ds_proto, unsigned int ds_timeo, unsigned int ds_retrans, u32 minor_version) { @@ -901,16 +983,41 @@ struct nfs_client *nfs4_set_ds_client(struct nfs_server *mds_srv, .minorversion = minor_version, .net = mds_clp->cl_net, .timeparms = &ds_timeout, + .cred = mds_srv->cred, + .xprtsec = { + .policy = RPC_XPRTSEC_NONE, + .cert_serial = TLS_NO_CERT, + .privkey_serial = TLS_NO_PRIVKEY, + }, }; char buf[INET6_ADDRSTRLEN + 1]; - if (rpc_ntop(ds_addr, buf, sizeof(buf)) <= 0) + if (rpc_ntop((struct sockaddr *)ds_addr, buf, sizeof(buf)) <= 0) return ERR_PTR(-EINVAL); cl_init.hostname = buf; + switch (ds_proto) { + case XPRT_TRANSPORT_TCP_TLS: + if (mds_srv->nfs_client->cl_xprtsec.policy != RPC_XPRTSEC_NONE) + cl_init.xprtsec = mds_srv->nfs_client->cl_xprtsec; + else + ds_proto = XPRT_TRANSPORT_TCP; + fallthrough; + case XPRT_TRANSPORT_RDMA: + case XPRT_TRANSPORT_TCP: + if (mds_clp->cl_nconnect > 1) { + cl_init.nconnect = mds_clp->cl_nconnect; + cl_init.max_connect = NFS_MAX_TRANSPORTS; + } + } + if (mds_srv->flags & NFS_MOUNT_NORESVPORT) __set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags); + if (test_bit(NFS_CS_NETUNREACH_FATAL, &mds_clp->cl_flags)) + __set_bit(NFS_CS_NETUNREACH_FATAL, &cl_init.init_flags); + __set_bit(NFS_CS_PNFS, &cl_init.init_flags); + cl_init.max_connect = NFS_MAX_TRANSPORTS; /* * Set an authflavor equual to the MDS value. Use the MDS nfs_client * cl_ipaddr so as to use the same EXCHANGE_ID co_ownerid as the MDS @@ -923,10 +1030,10 @@ EXPORT_SYMBOL_GPL(nfs4_set_ds_client); /* * Session has been established, and the client marked ready. - * Set the mount rsize and wsize with negotiated fore channel - * attributes which will be bound checked in nfs_server_set_fsinfo. + * Limit the mount rsize, wsize and dtsize using negotiated fore + * channel attributes. */ -static void nfs4_session_set_rwsize(struct nfs_server *server) +static void nfs4_session_limit_rwsize(struct nfs_server *server) { #ifdef CONFIG_NFS_V4_1 struct nfs4_session *sess; @@ -939,60 +1046,81 @@ static void nfs4_session_set_rwsize(struct nfs_server *server) server_resp_sz = sess->fc_attrs.max_resp_sz - nfs41_maxread_overhead; server_rqst_sz = sess->fc_attrs.max_rqst_sz - nfs41_maxwrite_overhead; - if (!server->rsize || server->rsize > server_resp_sz) + if (server->dtsize > server_resp_sz) + server->dtsize = server_resp_sz; + if (server->rsize > server_resp_sz) server->rsize = server_resp_sz; - if (!server->wsize || server->wsize > server_rqst_sz) + if (server->wsize > server_rqst_sz) server->wsize = server_rqst_sz; #endif /* CONFIG_NFS_V4_1 */ } +/* + * Limit xattr sizes using the channel attributes. + */ +static void nfs4_session_limit_xasize(struct nfs_server *server) +{ +#ifdef CONFIG_NFS_V4_2 + struct nfs4_session *sess; + u32 server_gxa_sz; + u32 server_sxa_sz; + u32 server_lxa_sz; + + if (!nfs4_has_session(server->nfs_client)) + return; + + sess = server->nfs_client->cl_session; + + server_gxa_sz = sess->fc_attrs.max_resp_sz - nfs42_maxgetxattr_overhead; + server_sxa_sz = sess->fc_attrs.max_rqst_sz - nfs42_maxsetxattr_overhead; + server_lxa_sz = sess->fc_attrs.max_resp_sz - + nfs42_maxlistxattrs_overhead; + + if (server->gxasize > server_gxa_sz) + server->gxasize = server_gxa_sz; + if (server->sxasize > server_sxa_sz) + server->sxasize = server_sxa_sz; + if (server->lxasize > server_lxa_sz) + server->lxasize = server_lxa_sz; +#endif +} + static int nfs4_server_common_setup(struct nfs_server *server, struct nfs_fh *mntfh, bool auth_probe) { - struct nfs_fattr *fattr; int error; + error = nfs4_delegation_hash_alloc(server); + if (error) + return error; + /* data servers support only a subset of NFSv4.1 */ if (is_ds_only_client(server->nfs_client)) return -EPROTONOSUPPORT; - fattr = nfs_alloc_fattr(); - if (fattr == NULL) - return -ENOMEM; - /* We must ensure the session is initialised first */ error = nfs4_init_session(server->nfs_client); if (error < 0) - goto out; - - /* Set the basic capabilities */ - server->caps |= server->nfs_client->cl_mvops->init_caps; - if (server->flags & NFS_MOUNT_NORDIRPLUS) - server->caps &= ~NFS_CAP_READDIRPLUS; - /* - * Don't use NFS uid/gid mapping if we're using AUTH_SYS or lower - * authentication. - */ - if (nfs4_disable_idmapping && - server->client->cl_auth->au_flavor == RPC_AUTH_UNIX) - server->caps |= NFS_CAP_UIDGID_NOMAP; + return error; + nfs_server_set_init_caps(server); /* Probe the root fh to retrieve its FSID and filehandle */ error = nfs4_get_rootfh(server, mntfh, auth_probe); if (error < 0) - goto out; + return error; dprintk("Server FSID: %llx:%llx\n", (unsigned long long) server->fsid.major, (unsigned long long) server->fsid.minor); nfs_display_fhandle(mntfh, "Pseudo-fs root FH"); - nfs4_session_set_rwsize(server); - - error = nfs_probe_fsinfo(server, mntfh, fattr); + error = nfs_probe_server(server, mntfh); if (error < 0) - goto out; + return error; + + nfs4_session_limit_rwsize(server); + nfs4_session_limit_xasize(server); if (server->namelen == 0 || server->namelen > NFS4_MAXNAMLEN) server->namelen = NFS4_MAXNAMLEN; @@ -1000,73 +1128,74 @@ static int nfs4_server_common_setup(struct nfs_server *server, nfs_server_insert_lists(server); server->mount_time = jiffies; server->destroy = nfs4_destroy_server; -out: - nfs_free_fattr(fattr); - return error; + return 0; } /* * Create a version 4 volume record */ -static int nfs4_init_server(struct nfs_server *server, - struct nfs_parsed_mount_data *data) +static int nfs4_init_server(struct nfs_server *server, struct fs_context *fc) { + struct nfs_fs_context *ctx = nfs_fc2context(fc); struct rpc_timeout timeparms; + struct nfs_client_initdata cl_init = { + .hostname = ctx->nfs_server.hostname, + .addr = &ctx->nfs_server._address, + .addrlen = ctx->nfs_server.addrlen, + .ip_addr = ctx->client_address, + .proto = ctx->nfs_server.protocol, + .minorversion = ctx->minorversion, + .net = fc->net_ns, + .timeparms = &timeparms, + .xprtsec = ctx->xprtsec, + .nconnect = ctx->nfs_server.nconnect, + .max_connect = ctx->nfs_server.max_connect, + }; int error; - nfs_init_timeout_values(&timeparms, data->nfs_server.protocol, - data->timeo, data->retrans); + nfs_init_timeout_values(&timeparms, ctx->nfs_server.protocol, + ctx->timeo, ctx->retrans); /* Initialise the client representation from the mount data */ - server->flags = data->flags; - server->options = data->options; - server->auth_info = data->auth_info; + server->flags = ctx->flags; + server->options = ctx->options; + server->auth_info = ctx->auth_info; /* Use the first specified auth flavor. If this flavor isn't * allowed by the server, use the SECINFO path to try the * other specified flavors */ - if (data->auth_info.flavor_len >= 1) - data->selected_flavor = data->auth_info.flavors[0]; + if (ctx->auth_info.flavor_len >= 1) + ctx->selected_flavor = ctx->auth_info.flavors[0]; else - data->selected_flavor = RPC_AUTH_UNIX; + ctx->selected_flavor = RPC_AUTH_UNIX; /* Get a client record */ - error = nfs4_set_client(server, - data->nfs_server.hostname, - (const struct sockaddr *)&data->nfs_server.address, - data->nfs_server.addrlen, - data->client_address, - data->nfs_server.protocol, - &timeparms, - data->minorversion, - data->net); + error = nfs4_set_client(server, &cl_init); if (error < 0) return error; - if (data->rsize) - server->rsize = nfs_block_size(data->rsize, NULL); - if (data->wsize) - server->wsize = nfs_block_size(data->wsize, NULL); + if (ctx->rsize) + server->rsize = nfs_io_size(ctx->rsize, server->nfs_client->cl_proto); + if (ctx->wsize) + server->wsize = nfs_io_size(ctx->wsize, server->nfs_client->cl_proto); - server->acregmin = data->acregmin * HZ; - server->acregmax = data->acregmax * HZ; - server->acdirmin = data->acdirmin * HZ; - server->acdirmax = data->acdirmax * HZ; - server->port = data->nfs_server.port; + server->acregmin = ctx->acregmin * HZ; + server->acregmax = ctx->acregmax * HZ; + server->acdirmin = ctx->acdirmin * HZ; + server->acdirmax = ctx->acdirmax * HZ; + server->port = ctx->nfs_server.port; return nfs_init_server_rpcclient(server, &timeparms, - data->selected_flavor); + ctx->selected_flavor); } /* * Create a version 4 volume record * - keyed on server and FSID */ -/*struct nfs_server *nfs4_create_server(const struct nfs_parsed_mount_data *data, - struct nfs_fh *mntfh)*/ -struct nfs_server *nfs4_create_server(struct nfs_mount_info *mount_info, - struct nfs_subversion *nfs_mod) +struct nfs_server *nfs4_create_server(struct fs_context *fc) { + struct nfs_fs_context *ctx = nfs_fc2context(fc); struct nfs_server *server; bool auth_probe; int error; @@ -1075,14 +1204,16 @@ struct nfs_server *nfs4_create_server(struct nfs_mount_info *mount_info, if (!server) return ERR_PTR(-ENOMEM); - auth_probe = mount_info->parsed->auth_info.flavor_len < 1; + server->cred = get_cred(fc->cred); + + auth_probe = ctx->auth_info.flavor_len < 1; /* set up the general RPC client */ - error = nfs4_init_server(server, mount_info->parsed); + error = nfs4_init_server(server, fc); if (error < 0) goto error; - error = nfs4_server_common_setup(server, mount_info->mntfh, auth_probe); + error = nfs4_server_common_setup(server, ctx->mntfh, auth_probe); if (error < 0) goto error; @@ -1096,11 +1227,24 @@ error: /* * Create an NFS4 referral server record */ -struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data, - struct nfs_fh *mntfh) +struct nfs_server *nfs4_create_referral_server(struct fs_context *fc) { - struct nfs_client *parent_client; - struct nfs_server *server, *parent_server; + struct nfs_fs_context *ctx = nfs_fc2context(fc); + struct nfs_server *parent_server = NFS_SB(ctx->clone_data.sb); + struct nfs_client *parent_client = parent_server->nfs_client; + struct nfs_client_initdata cl_init = { + .hostname = ctx->nfs_server.hostname, + .addr = &ctx->nfs_server._address, + .addrlen = ctx->nfs_server.addrlen, + .ip_addr = parent_client->cl_ipaddr, + .minorversion = parent_client->cl_mvops->minor_version, + .net = parent_client->cl_net, + .timeparms = parent_server->client->cl_timeout, + .xprtsec = parent_client->cl_xprtsec, + .nconnect = parent_client->cl_nconnect, + .max_connect = parent_client->cl_max_connect, + }; + struct nfs_server *server; bool auth_probe; int error; @@ -1108,32 +1252,39 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data, if (!server) return ERR_PTR(-ENOMEM); - parent_server = NFS_SB(data->sb); - parent_client = parent_server->nfs_client; + server->cred = get_cred(parent_server->cred); /* Initialise the client representation from the parent server */ nfs_server_copy_userdata(server, parent_server); - /* Get a client representation. - * Note: NFSv4 always uses TCP, */ - error = nfs4_set_client(server, data->hostname, - data->addr, - data->addrlen, - parent_client->cl_ipaddr, - rpc_protocol(parent_server->client), - parent_server->client->cl_timeout, - parent_client->cl_mvops->minor_version, - parent_client->cl_net); + /* Get a client representation */ +#if IS_ENABLED(CONFIG_SUNRPC_XPRT_RDMA) + rpc_set_port(&ctx->nfs_server.address, NFS_RDMA_PORT); + cl_init.proto = XPRT_TRANSPORT_RDMA; + error = nfs4_set_client(server, &cl_init); + if (!error) + goto init_server; +#endif /* IS_ENABLED(CONFIG_SUNRPC_XPRT_RDMA) */ + + cl_init.proto = XPRT_TRANSPORT_TCP; + if (parent_client->cl_xprtsec.policy != RPC_XPRTSEC_NONE) + cl_init.proto = XPRT_TRANSPORT_TCP_TLS; + rpc_set_port(&ctx->nfs_server.address, NFS_PORT); + error = nfs4_set_client(server, &cl_init); if (error < 0) goto error; - error = nfs_init_server_rpcclient(server, parent_server->client->cl_timeout, data->authflavor); +#if IS_ENABLED(CONFIG_SUNRPC_XPRT_RDMA) +init_server: +#endif + error = nfs_init_server_rpcclient(server, parent_server->client->cl_timeout, + ctx->selected_flavor); if (error < 0) goto error; auth_probe = parent_server->auth_info.flavor_len < 1; - error = nfs4_server_common_setup(server, mntfh, auth_probe); + error = nfs4_server_common_setup(server, ctx->mntfh, auth_probe); if (error < 0) goto error; @@ -1144,30 +1295,6 @@ error: return ERR_PTR(error); } -/* - * Grab the destination's particulars, including lease expiry time. - * - * Returns zero if probe succeeded and retrieved FSID matches the FSID - * we have cached. - */ -static int nfs_probe_destination(struct nfs_server *server) -{ - struct inode *inode = d_inode(server->super->s_root); - struct nfs_fattr *fattr; - int error; - - fattr = nfs_alloc_fattr(); - if (fattr == NULL) - return -ENOMEM; - - /* Sanity: the probe won't work if the destination server - * does not recognize the migrated FH. */ - error = nfs_probe_fsinfo(server, NFS_FH(inode), fattr); - - nfs_free_fattr(fattr); - return error; -} - /** * nfs4_update_server - Move an nfs_server to a different nfs_client * @@ -1184,20 +1311,34 @@ static int nfs_probe_destination(struct nfs_server *server) * Returns zero on success, or a negative errno value. */ int nfs4_update_server(struct nfs_server *server, const char *hostname, - struct sockaddr *sap, size_t salen, struct net *net) + struct sockaddr_storage *sap, size_t salen, struct net *net) { struct nfs_client *clp = server->nfs_client; struct rpc_clnt *clnt = server->client; struct xprt_create xargs = { .ident = clp->cl_proto, .net = net, - .dstaddr = sap, + .dstaddr = (struct sockaddr *)sap, .addrlen = salen, .servername = hostname, + /* cel: bleh. We might need to pass TLS parameters here */ }; char buf[INET6_ADDRSTRLEN + 1]; struct sockaddr_storage address; struct sockaddr *localaddr = (struct sockaddr *)&address; + struct nfs_client_initdata cl_init = { + .hostname = hostname, + .addr = sap, + .addrlen = salen, + .ip_addr = buf, + .proto = clp->cl_proto, + .minorversion = clp->cl_minorversion, + .net = net, + .timeparms = clnt->cl_timeout, + .xprtsec = clp->cl_xprtsec, + .nconnect = clp->cl_nconnect, + .max_connect = clp->cl_max_connect, + }; int error; error = rpc_switch_client_transport(clnt, &xargs, clnt->cl_timeout); @@ -1213,19 +1354,20 @@ int nfs4_update_server(struct nfs_server *server, const char *hostname, nfs_server_remove_lists(server); set_bit(NFS_MIG_TSM_POSSIBLE, &server->mig_status); - error = nfs4_set_client(server, hostname, sap, salen, buf, - clp->cl_proto, clnt->cl_timeout, - clp->cl_minorversion, net); + error = nfs4_set_client(server, &cl_init); clear_bit(NFS_MIG_TSM_POSSIBLE, &server->mig_status); - nfs_put_client(clp); if (error != 0) { nfs_server_insert_lists(server); return error; } + nfs_put_client(clp); - if (server->nfs_client->cl_hostname == NULL) + if (server->nfs_client->cl_hostname == NULL) { server->nfs_client->cl_hostname = kstrdup(hostname, GFP_KERNEL); + if (server->nfs_client->cl_hostname == NULL) + return -ENOMEM; + } nfs_server_insert_lists(server); - return nfs_probe_destination(server); + return nfs_probe_server(server, NFS_FH(d_inode(server->super->s_root))); } diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index 0efba77789b9..7317f26892c5 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/nfs/file.c * @@ -6,8 +7,10 @@ #include <linux/fs.h> #include <linux/file.h> #include <linux/falloc.h> +#include <linux/mount.h> #include <linux/nfs_fs.h> -#include <uapi/linux/btrfs.h> /* BTRFS_IOC_CLONE/BTRFS_IOC_CLONE_RANGE */ +#include <linux/nfs_ssc.h> +#include <linux/splice.h> #include "delegation.h" #include "internal.h" #include "iostat.h" @@ -48,16 +51,14 @@ nfs4_file_open(struct inode *inode, struct file *filp) if (err) return err; - if ((openflags & O_ACCMODE) == 3) - openflags--; - /* We can't create new files here */ openflags &= ~(O_CREAT|O_EXCL); parent = dget_parent(dentry); dir = d_inode(parent); - ctx = alloc_nfs_open_context(file_dentry(filp), filp->f_mode, filp); + ctx = alloc_nfs_open_context(file_dentry(filp), + flags_to_mode(openflags), filp); err = PTR_ERR(ctx); if (IS_ERR(ctx)) goto out; @@ -73,23 +74,23 @@ nfs4_file_open(struct inode *inode, struct file *filp) if (IS_ERR(inode)) { err = PTR_ERR(inode); switch (err) { - case -EPERM: - case -EACCES: - case -EDQUOT: - case -ENOSPC: - case -EROFS: - goto out_put_ctx; default: + goto out_put_ctx; + case -ENOENT: + case -ESTALE: + case -EISDIR: + case -ENOTDIR: + case -ELOOP: goto out_drop; } } if (inode != d_inode(dentry)) goto out_drop; - nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); nfs_file_set_open_context(filp, ctx); nfs_fscache_open_file(inode, filp); err = 0; + filp->f_mode |= FMODE_CAN_ODIRECT; out_put_ctx: put_nfs_open_context(ctx); @@ -110,6 +111,7 @@ static int nfs4_file_flush(struct file *file, fl_owner_t id) { struct inode *inode = file_inode(file); + errseq_t since; dprintk("NFS: flush(%pD2)\n", file); @@ -125,18 +127,78 @@ nfs4_file_flush(struct file *file, fl_owner_t id) return filemap_fdatawrite(file->f_mapping); /* Flush writes to the server and return any errors */ - return vfs_fsync(file, 0); + since = filemap_sample_wb_err(file->f_mapping); + nfs_wb_all(inode); + return filemap_check_wb_err(file->f_mapping, since); } #ifdef CONFIG_NFS_V4_2 +static ssize_t __nfs4_copy_file_range(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + size_t count, unsigned int flags) +{ + struct nfs42_copy_notify_res *cn_resp = NULL; + struct nl4_server *nss = NULL; + nfs4_stateid *cnrs = NULL; + ssize_t ret; + bool sync = false; + + /* Only offload copy if superblock is the same */ + if (file_in->f_op != &nfs4_file_operations) + return -EXDEV; + if (!nfs_server_capable(file_inode(file_out), NFS_CAP_COPY) || + !nfs_server_capable(file_inode(file_in), NFS_CAP_COPY)) + return -EOPNOTSUPP; + if (file_inode(file_in) == file_inode(file_out)) + return -EOPNOTSUPP; + /* if the copy size if smaller than 2 RPC payloads, make it + * synchronous + */ + if (count <= 2 * NFS_SERVER(file_inode(file_in))->rsize) + sync = true; +retry: + if (!nfs42_files_from_same_server(file_in, file_out)) { + /* + * for inter copy, if copy size is too small + * then fallback to generic copy. + */ + if (sync) + return -EOPNOTSUPP; + cn_resp = kzalloc(sizeof(struct nfs42_copy_notify_res), + GFP_KERNEL); + if (unlikely(cn_resp == NULL)) + return -ENOMEM; + + ret = nfs42_proc_copy_notify(file_in, file_out, cn_resp); + if (ret) { + ret = -EOPNOTSUPP; + goto out; + } + nss = &cn_resp->cnr_src; + cnrs = &cn_resp->cnr_stateid; + } + ret = nfs42_proc_copy(file_in, pos_in, file_out, pos_out, count, + nss, cnrs, sync); +out: + kfree(cn_resp); + + if (ret == -EAGAIN) + goto retry; + return ret; +} + static ssize_t nfs4_copy_file_range(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, size_t count, unsigned int flags) { - if (file_inode(file_in) == file_inode(file_out)) - return -EINVAL; + ssize_t ret; - return nfs42_proc_copy(file_in, pos_in, file_out, pos_out, count); + ret = __nfs4_copy_file_range(file_in, pos_in, file_out, pos_out, count, + flags); + if (ret == -EOPNOTSUPP || ret == -EXDEV) + ret = splice_copy_file_range(file_in, pos_in, file_out, + pos_out, count); + return ret; } static loff_t nfs4_file_llseek(struct file *filep, loff_t offset, int whence) @@ -147,8 +209,9 @@ static loff_t nfs4_file_llseek(struct file *filep, loff_t offset, int whence) case SEEK_HOLE: case SEEK_DATA: ret = nfs42_proc_llseek(filep, offset, whence); - if (ret != -ENOTSUPP) + if (ret != -EOPNOTSUPP) return ret; + fallthrough; default: return nfs_file_llseek(filep, offset, whence); } @@ -162,8 +225,14 @@ static long nfs42_fallocate(struct file *filep, int mode, loff_t offset, loff_t if (!S_ISREG(inode->i_mode)) return -EOPNOTSUPP; - if ((mode != 0) && (mode != (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE))) + switch (mode) { + case 0: + case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE: + case FALLOC_FL_ZERO_RANGE: + break; + default: return -EOPNOTSUPP; + } ret = inode_newsize_ok(inode, offset + len); if (ret < 0) @@ -171,19 +240,31 @@ static long nfs42_fallocate(struct file *filep, int mode, loff_t offset, loff_t if (mode & FALLOC_FL_PUNCH_HOLE) return nfs42_proc_deallocate(filep, offset, len); + else if (mode & FALLOC_FL_ZERO_RANGE) + return nfs42_proc_zero_range(filep, offset ,len); return nfs42_proc_allocate(filep, offset, len); } -static int nfs42_clone_file_range(struct file *src_file, loff_t src_off, - struct file *dst_file, loff_t dst_off, u64 count) +static loff_t nfs42_remap_file_range(struct file *src_file, loff_t src_off, + struct file *dst_file, loff_t dst_off, loff_t count, + unsigned int remap_flags) { struct inode *dst_inode = file_inode(dst_file); struct nfs_server *server = NFS_SERVER(dst_inode); struct inode *src_inode = file_inode(src_file); unsigned int bs = server->clone_blksize; - bool same_inode = false; int ret; + /* NFS does not support deduplication. */ + if (remap_flags & REMAP_FILE_DEDUP) + return -EOPNOTSUPP; + + if (remap_flags & ~REMAP_FILE_ADVISORY) + return -EINVAL; + + if (IS_SWAPFILE(dst_inode) || IS_SWAPFILE(src_inode)) + return -ETXTBSY; + /* check alignment w.r.t. clone_blksize */ ret = -EINVAL; if (bs) { @@ -193,25 +274,15 @@ static int nfs42_clone_file_range(struct file *src_file, loff_t src_off, goto out; } - if (src_inode == dst_inode) - same_inode = true; - /* XXX: do we lock at all? what if server needs CB_RECALL_LAYOUT? */ - if (same_inode) { - inode_lock(src_inode); - } else if (dst_inode < src_inode) { - inode_lock_nested(dst_inode, I_MUTEX_PARENT); - inode_lock_nested(src_inode, I_MUTEX_CHILD); - } else { - inode_lock_nested(src_inode, I_MUTEX_PARENT); - inode_lock_nested(dst_inode, I_MUTEX_CHILD); - } - + lock_two_nondirectories(src_inode, dst_inode); /* flush all pending writes on both src and dst so that server * has the latest data */ + nfs_file_block_o_direct(NFS_I(src_inode)); ret = nfs_sync_inode(src_inode); if (ret) goto out_unlock; + nfs_file_block_o_direct(NFS_I(dst_inode)); ret = nfs_sync_inode(dst_inode); if (ret) goto out_unlock; @@ -224,40 +295,168 @@ static int nfs42_clone_file_range(struct file *src_file, loff_t src_off, truncate_inode_pages_range(&dst_inode->i_data, dst_off, dst_off + count - 1); out_unlock: - if (same_inode) { - inode_unlock(src_inode); - } else if (dst_inode < src_inode) { - inode_unlock(src_inode); - inode_unlock(dst_inode); - } else { - inode_unlock(dst_inode); - inode_unlock(src_inode); + unlock_two_nondirectories(src_inode, dst_inode); +out: + return ret < 0 ? ret : count; +} + +static int read_name_gen = 1; +#define SSC_READ_NAME_BODY "ssc_read_%d" + +static struct file *__nfs42_ssc_open(struct vfsmount *ss_mnt, + struct nfs_fh *src_fh, nfs4_stateid *stateid) +{ + struct nfs_fattr *fattr = nfs_alloc_fattr(); + struct file *filep, *res; + struct nfs_server *server; + struct inode *r_ino = NULL; + struct nfs_open_context *ctx; + struct nfs4_state_owner *sp; + char *read_name = NULL; + int len, status = 0; + + server = NFS_SB(ss_mnt->mnt_sb); + + if (!fattr) + return ERR_PTR(-ENOMEM); + + status = nfs4_proc_getattr(server, src_fh, fattr, NULL); + if (status < 0) { + res = ERR_PTR(status); + goto out; } + + if (!S_ISREG(fattr->mode)) { + res = ERR_PTR(-EBADF); + goto out; + } + + res = ERR_PTR(-ENOMEM); + len = strlen(SSC_READ_NAME_BODY) + 16; + read_name = kzalloc(len, GFP_KERNEL); + if (read_name == NULL) + goto out; + snprintf(read_name, len, SSC_READ_NAME_BODY, read_name_gen++); + + r_ino = nfs_fhget(ss_mnt->mnt_sb, src_fh, fattr); + if (IS_ERR(r_ino)) { + res = ERR_CAST(r_ino); + goto out_free_name; + } + + filep = alloc_file_pseudo(r_ino, ss_mnt, read_name, O_RDONLY, + r_ino->i_fop); + if (IS_ERR(filep)) { + res = ERR_CAST(filep); + iput(r_ino); + goto out_free_name; + } + + ctx = alloc_nfs_open_context(filep->f_path.dentry, + flags_to_mode(filep->f_flags), filep); + if (IS_ERR(ctx)) { + res = ERR_CAST(ctx); + goto out_filep; + } + + res = ERR_PTR(-EINVAL); + sp = nfs4_get_state_owner(server, ctx->cred, GFP_KERNEL); + if (sp == NULL) + goto out_ctx; + + ctx->state = nfs4_get_open_state(r_ino, sp); + if (ctx->state == NULL) + goto out_stateowner; + + set_bit(NFS_SRV_SSC_COPY_STATE, &ctx->state->flags); + memcpy(&ctx->state->open_stateid.other, &stateid->other, + NFS4_STATEID_OTHER_SIZE); + update_open_stateid(ctx->state, stateid, NULL, filep->f_mode); + set_bit(NFS_OPEN_STATE, &ctx->state->flags); + + nfs_file_set_open_context(filep, ctx); + put_nfs_open_context(ctx); + + file_ra_state_init(&filep->f_ra, filep->f_mapping->host->i_mapping); + res = filep; +out_free_name: + kfree(read_name); out: - return ret; + nfs_free_fattr(fattr); + return res; +out_stateowner: + nfs4_put_state_owner(sp); +out_ctx: + put_nfs_open_context(ctx); +out_filep: + fput(filep); + goto out_free_name; +} + +static void __nfs42_ssc_close(struct file *filep) +{ + struct nfs_open_context *ctx = nfs_file_open_context(filep); + + ctx->state->flags = 0; +} + +static const struct nfs4_ssc_client_ops nfs4_ssc_clnt_ops_tbl = { + .sco_open = __nfs42_ssc_open, + .sco_close = __nfs42_ssc_close, +}; + +/** + * nfs42_ssc_register_ops - Wrapper to register NFS_V4 ops in nfs_common + * + * Return values: + * None + */ +void nfs42_ssc_register_ops(void) +{ + nfs42_ssc_register(&nfs4_ssc_clnt_ops_tbl); +} + +/** + * nfs42_ssc_unregister_ops - wrapper to un-register NFS_V4 ops in nfs_common + * + * Return values: + * None. + */ +void nfs42_ssc_unregister_ops(void) +{ + nfs42_ssc_unregister(&nfs4_ssc_clnt_ops_tbl); } #endif /* CONFIG_NFS_V4_2 */ +static int nfs4_setlease(struct file *file, int arg, struct file_lease **lease, + void **priv) +{ + if (!S_ISREG(file_inode(file)->i_mode)) + return -EINVAL; + return nfs4_proc_setlease(file, arg, lease, priv); +} + const struct file_operations nfs4_file_operations = { .read_iter = nfs_file_read, .write_iter = nfs_file_write, - .mmap = nfs_file_mmap, + .mmap_prepare = nfs_file_mmap_prepare, .open = nfs4_file_open, .flush = nfs4_file_flush, .release = nfs_file_release, .fsync = nfs_file_fsync, .lock = nfs_lock, .flock = nfs_flock, - .splice_read = generic_file_splice_read, + .splice_read = nfs_file_splice_read, .splice_write = iter_file_splice_write, .check_flags = nfs_check_flags, - .setlease = simple_nosetlease, + .setlease = nfs4_setlease, #ifdef CONFIG_NFS_V4_2 .copy_file_range = nfs4_copy_file_range, .llseek = nfs4_file_llseek, .fallocate = nfs42_fallocate, - .clone_file_range = nfs42_clone_file_range, + .remap_file_range = nfs42_remap_file_range, #else .llseek = nfs_file_llseek, #endif + .fop_flags = FOP_DONTCACHE, }; diff --git a/fs/nfs/nfs4getroot.c b/fs/nfs/nfs4getroot.c index ac8406018962..e67ea345de69 100644 --- a/fs/nfs/nfs4getroot.c +++ b/fs/nfs/nfs4getroot.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2006 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) @@ -11,30 +12,28 @@ int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh, bool auth_probe) { - struct nfs_fsinfo fsinfo; + struct nfs_fattr *fattr = nfs_alloc_fattr(); int ret = -ENOMEM; - fsinfo.fattr = nfs_alloc_fattr(); - if (fsinfo.fattr == NULL) + if (fattr == NULL) goto out; /* Start by getting the root filehandle from the server */ - ret = nfs4_proc_get_rootfh(server, mntfh, &fsinfo, auth_probe); + ret = nfs4_proc_get_rootfh(server, mntfh, fattr, auth_probe); if (ret < 0) { dprintk("nfs4_get_rootfh: getroot error = %d\n", -ret); goto out; } - if (!(fsinfo.fattr->valid & NFS_ATTR_FATTR_TYPE) - || !S_ISDIR(fsinfo.fattr->mode)) { + if (!(fattr->valid & NFS_ATTR_FATTR_TYPE) || !S_ISDIR(fattr->mode)) { printk(KERN_ERR "nfs4_get_rootfh:" " getroot encountered non-directory\n"); ret = -ENOTDIR; goto out; } - memcpy(&server->fsid, &fsinfo.fattr->fsid, sizeof(server->fsid)); + memcpy(&server->fsid, &fattr->fsid, sizeof(server->fsid)); out: - nfs_free_fattr(fsinfo.fattr); + nfs_free_fattr(fattr); return ret; } diff --git a/fs/nfs/nfs4idmap.c b/fs/nfs/nfs4idmap.c index dd5d27da8c0c..9e1c48c5c0b8 100644 --- a/fs/nfs/nfs4idmap.c +++ b/fs/nfs/nfs4idmap.c @@ -44,7 +44,9 @@ #include <linux/keyctl.h> #include <linux/key-type.h> #include <keys/user-type.h> +#include <keys/request_key_auth-type.h> #include <linux/module.h> +#include <linux/user_namespace.h> #include "internal.h" #include "netns.h" @@ -59,7 +61,7 @@ static struct key_type key_type_id_resolver_legacy; struct idmap_legacy_upcalldata { struct rpc_pipe_msg pipe_msg; struct idmap_msg idmap_msg; - struct key_construction *key_cons; + struct key *authkey; struct idmap *idmap; }; @@ -68,8 +70,16 @@ struct idmap { struct rpc_pipe *idmap_pipe; struct idmap_legacy_upcalldata *idmap_upcall_data; struct mutex idmap_mutex; + struct user_namespace *user_ns; }; +static struct user_namespace *idmap_userns(const struct idmap *idmap) +{ + if (idmap && idmap->user_ns) + return idmap->user_ns; + return &init_user_ns; +} + /** * nfs_fattr_init_names - initialise the nfs_fattr owner_name/group_name fields * @fattr: fully initialised struct nfs_fattr @@ -193,7 +203,7 @@ int nfs_idmap_init(void) printk(KERN_NOTICE "NFS: Registering the %s key type\n", key_type_id_resolver.name); - cred = prepare_kernel_cred(NULL); + cred = prepare_kernel_cred(&init_task); if (!cred) return -ENOMEM; @@ -270,18 +280,19 @@ static struct key *nfs_idmap_request_key(const char *name, size_t namelen, const char *type, struct idmap *idmap) { char *desc; - struct key *rkey; + struct key *rkey = ERR_PTR(-EAGAIN); ssize_t ret; ret = nfs_idmap_get_desc(name, namelen, type, strlen(type), &desc); - if (ret <= 0) + if (ret < 0) return ERR_PTR(ret); - rkey = request_key(&key_type_id_resolver, desc, ""); + if (!idmap->user_ns || idmap->user_ns == &init_user_ns) + rkey = request_key(&key_type_id_resolver, desc, ""); if (IS_ERR(rkey)) { mutex_lock(&idmap->idmap_mutex); rkey = request_key_with_auxdata(&key_type_id_resolver_legacy, - desc, "", 0, idmap); + desc, NULL, "", 0, idmap); mutex_unlock(&idmap->idmap_mutex); } if (!IS_ERR(rkey)) @@ -295,15 +306,12 @@ static ssize_t nfs_idmap_get_key(const char *name, size_t namelen, const char *type, void *data, size_t data_size, struct idmap *idmap) { - const struct cred *saved_cred; struct key *rkey; const struct user_key_payload *payload; ssize_t ret; - saved_cred = override_creds(id_resolver_cache); - rkey = nfs_idmap_request_key(name, namelen, type, idmap); - revert_creds(saved_cred); - + scoped_with_creds(id_resolver_cache) + rkey = nfs_idmap_request_key(name, namelen, type, idmap); if (IS_ERR(rkey)) { ret = PTR_ERR(rkey); goto out; @@ -343,7 +351,7 @@ static ssize_t nfs_idmap_lookup_name(__u32 id, const char *type, char *buf, int id_len; ssize_t ret; - id_len = snprintf(id_str, sizeof(id_str), "%u", id); + id_len = nfs_map_numeric_to_string(id, id_str, sizeof(id_str)); ret = nfs_idmap_get_key(id_str, id_len, type, buf, buflen, idmap); if (ret < 0) return -EINVAL; @@ -384,7 +392,7 @@ static const match_table_t nfs_idmap_tokens = { { Opt_find_err, NULL } }; -static int nfs_idmap_legacy_upcall(struct key_construction *, const char *, void *); +static int nfs_idmap_legacy_upcall(struct key *, void *); static ssize_t idmap_pipe_downcall(struct file *, const char __user *, size_t); static void idmap_release_pipe(struct inode *); @@ -413,26 +421,16 @@ static void nfs_idmap_pipe_destroy(struct dentry *dir, struct rpc_pipe_dir_object *pdo) { struct idmap *idmap = pdo->pdo_data; - struct rpc_pipe *pipe = idmap->idmap_pipe; - if (pipe->dentry) { - rpc_unlink(pipe->dentry); - pipe->dentry = NULL; - } + rpc_unlink(idmap->idmap_pipe); } static int nfs_idmap_pipe_create(struct dentry *dir, struct rpc_pipe_dir_object *pdo) { struct idmap *idmap = pdo->pdo_data; - struct rpc_pipe *pipe = idmap->idmap_pipe; - struct dentry *dentry; - dentry = rpc_mkpipe_dentry(dir, "idmap", idmap, pipe); - if (IS_ERR(dentry)) - return PTR_ERR(dentry); - pipe->dentry = dentry; - return 0; + return rpc_mkpipe_dentry(dir, "idmap", idmap, idmap->idmap_pipe); } static const struct rpc_pipe_dir_object_ops nfs_idmap_pipe_dir_object_ops = { @@ -451,6 +449,9 @@ nfs_idmap_new(struct nfs_client *clp) if (idmap == NULL) return -ENOMEM; + mutex_init(&idmap->idmap_mutex); + idmap->user_ns = get_user_ns(clp->cl_rpcclient->cl_cred->user_ns); + rpc_init_pipe_dir_object(&idmap->idmap_pdo, &nfs_idmap_pipe_dir_object_ops, idmap); @@ -461,7 +462,6 @@ nfs_idmap_new(struct nfs_client *clp) goto err; } idmap->idmap_pipe = pipe; - mutex_init(&idmap->idmap_mutex); error = rpc_add_pipe_dir_object(clp->cl_net, &clp->cl_rpcclient->cl_pipedir_objects, @@ -474,6 +474,7 @@ nfs_idmap_new(struct nfs_client *clp) err_destroy_pipe: rpc_destroy_pipe_data(idmap->idmap_pipe); err: + put_user_ns(idmap->user_ns); kfree(idmap); return error; } @@ -490,6 +491,7 @@ nfs_idmap_delete(struct nfs_client *clp) &clp->cl_rpcclient->cl_pipedir_objects, &idmap->idmap_pdo); rpc_destroy_pipe_data(idmap->idmap_pipe); + put_user_ns(idmap->user_ns); kfree(idmap); } @@ -506,6 +508,7 @@ static int nfs_idmap_prepare_message(char *desc, struct idmap *idmap, switch (token) { case Opt_find_uid: im->im_type = IDMAP_TYPE_USER; + fallthrough; case Opt_find_gid: im->im_conv = IDMAP_CONV_NAMETOID; ret = match_strlcpy(im->im_name, &substr, IDMAP_NAMESZ); @@ -513,9 +516,12 @@ static int nfs_idmap_prepare_message(char *desc, struct idmap *idmap, case Opt_find_user: im->im_type = IDMAP_TYPE_USER; + fallthrough; case Opt_find_group: im->im_conv = IDMAP_CONV_IDTONAME; ret = match_int(&substr, &im->im_id); + if (ret) + goto out; break; default: @@ -542,35 +548,37 @@ nfs_idmap_prepare_pipe_upcall(struct idmap *idmap, return true; } -static void -nfs_idmap_complete_pipe_upcall_locked(struct idmap *idmap, int ret) +static void nfs_idmap_complete_pipe_upcall(struct idmap_legacy_upcalldata *data, + int ret) { - struct key_construction *cons = idmap->idmap_upcall_data->key_cons; - - kfree(idmap->idmap_upcall_data); - idmap->idmap_upcall_data = NULL; - complete_request_key(cons, ret); + complete_request_key(data->authkey, ret); + key_put(data->authkey); + kfree(data); } -static void -nfs_idmap_abort_pipe_upcall(struct idmap *idmap, int ret) +static void nfs_idmap_abort_pipe_upcall(struct idmap *idmap, + struct idmap_legacy_upcalldata *data, + int ret) { - if (idmap->idmap_upcall_data != NULL) - nfs_idmap_complete_pipe_upcall_locked(idmap, ret); + if (cmpxchg(&idmap->idmap_upcall_data, data, NULL) == data) + nfs_idmap_complete_pipe_upcall(data, ret); } -static int nfs_idmap_legacy_upcall(struct key_construction *cons, - const char *op, - void *aux) +static int nfs_idmap_legacy_upcall(struct key *authkey, void *aux) { struct idmap_legacy_upcalldata *data; + struct request_key_auth *rka = get_request_key_auth(authkey); struct rpc_pipe_msg *msg; struct idmap_msg *im; - struct idmap *idmap = (struct idmap *)aux; - struct key *key = cons->key; - int ret = -ENOMEM; + struct idmap *idmap = aux; + struct key *key = rka->target_key; + int ret = -ENOKEY; + + if (!aux) + goto out1; /* msg and im are freed in idmap_pipe_destroy_msg */ + ret = -ENOMEM; data = kzalloc(sizeof(*data), GFP_KERNEL); if (!data) goto out1; @@ -578,7 +586,7 @@ static int nfs_idmap_legacy_upcall(struct key_construction *cons, msg = &data->pipe_msg; im = &data->idmap_msg; data->idmap = idmap; - data->key_cons = cons; + data->authkey = key_get(authkey); ret = nfs_idmap_prepare_message(key->description, idmap, im, msg); if (ret < 0) @@ -590,13 +598,13 @@ static int nfs_idmap_legacy_upcall(struct key_construction *cons, ret = rpc_queue_upcall(idmap->idmap_pipe, msg); if (ret < 0) - nfs_idmap_abort_pipe_upcall(idmap, ret); + nfs_idmap_abort_pipe_upcall(idmap, data, ret); return ret; out2: kfree(data); out1: - complete_request_key(cons, ret); + complete_request_key(authkey, ret); return ret; } @@ -623,7 +631,8 @@ static int nfs_idmap_read_and_verify_message(struct idmap_msg *im, if (strcmp(upcall->im_name, im->im_name) != 0) break; /* Note: here we store the NUL terminator too */ - len = sprintf(id_str, "%d", im->im_id) + 1; + len = 1 + nfs_map_numeric_to_string(im->im_id, id_str, + sizeof(id_str)); ret = nfs_idmap_instantiate(key, authkey, id_str, len); break; case IDMAP_CONV_IDTONAME: @@ -642,9 +651,11 @@ out: static ssize_t idmap_pipe_downcall(struct file *filp, const char __user *src, size_t mlen) { + struct request_key_auth *rka; struct rpc_inode *rpci = RPC_I(file_inode(filp)); struct idmap *idmap = (struct idmap *)rpci->private; - struct key_construction *cons; + struct idmap_legacy_upcalldata *data; + struct key *authkey; struct idmap_msg im; size_t namelen_in; int ret = -ENOKEY; @@ -653,10 +664,12 @@ idmap_pipe_downcall(struct file *filp, const char __user *src, size_t mlen) * will have been woken up and someone else may now have used * idmap_key_cons - so after this point we may no longer touch it. */ - if (idmap->idmap_upcall_data == NULL) + data = xchg(&idmap->idmap_upcall_data, NULL); + if (data == NULL) goto out_noupcall; - cons = idmap->idmap_upcall_data->key_cons; + authkey = data->authkey; + rka = get_request_key_auth(authkey); if (mlen != sizeof(im)) { ret = -ENOSPC; @@ -677,18 +690,17 @@ idmap_pipe_downcall(struct file *filp, const char __user *src, size_t mlen) if (namelen_in == 0 || namelen_in == IDMAP_NAMESZ) { ret = -EINVAL; goto out; -} + } - ret = nfs_idmap_read_and_verify_message(&im, - &idmap->idmap_upcall_data->idmap_msg, - cons->key, cons->authkey); + ret = nfs_idmap_read_and_verify_message(&im, &data->idmap_msg, + rka->target_key, authkey); if (ret >= 0) { - key_set_timeout(cons->key, nfs_idmap_cache_timeout); + key_set_timeout(rka->target_key, nfs_idmap_cache_timeout); ret = mlen; } out: - nfs_idmap_complete_pipe_upcall_locked(idmap, ret); + nfs_idmap_complete_pipe_upcall(data, ret); out_noupcall: return ret; } @@ -702,7 +714,7 @@ idmap_pipe_destroy_msg(struct rpc_pipe_msg *msg) struct idmap *idmap = data->idmap; if (msg->errno) - nfs_idmap_abort_pipe_upcall(idmap, msg->errno); + nfs_idmap_abort_pipe_upcall(idmap, data, msg->errno); } static void @@ -710,8 +722,11 @@ idmap_release_pipe(struct inode *inode) { struct rpc_inode *rpci = RPC_I(inode); struct idmap *idmap = (struct idmap *)rpci->private; + struct idmap_legacy_upcalldata *data; - nfs_idmap_abort_pipe_upcall(idmap, -EPIPE); + data = xchg(&idmap->idmap_upcall_data, NULL); + if (data) + nfs_idmap_complete_pipe_upcall(data, -EPIPE); } int nfs_map_name_to_uid(const struct nfs_server *server, const char *name, size_t namelen, kuid_t *uid) @@ -723,7 +738,7 @@ int nfs_map_name_to_uid(const struct nfs_server *server, const char *name, size_ if (!nfs_map_string_to_numeric(name, namelen, &id)) ret = nfs_idmap_lookup_id(name, namelen, "uid", &id, idmap); if (ret == 0) { - *uid = make_kuid(&init_user_ns, id); + *uid = make_kuid(idmap_userns(idmap), id); if (!uid_valid(*uid)) ret = -ERANGE; } @@ -740,7 +755,7 @@ int nfs_map_group_to_gid(const struct nfs_server *server, const char *name, size if (!nfs_map_string_to_numeric(name, namelen, &id)) ret = nfs_idmap_lookup_id(name, namelen, "gid", &id, idmap); if (ret == 0) { - *gid = make_kgid(&init_user_ns, id); + *gid = make_kgid(idmap_userns(idmap), id); if (!gid_valid(*gid)) ret = -ERANGE; } @@ -754,7 +769,7 @@ int nfs_map_uid_to_name(const struct nfs_server *server, kuid_t uid, char *buf, int ret = -EINVAL; __u32 id; - id = from_kuid(&init_user_ns, uid); + id = from_kuid_munged(idmap_userns(idmap), uid); if (!(server->caps & NFS_CAP_UIDGID_NOMAP)) ret = nfs_idmap_lookup_name(id, "user", buf, buflen, idmap); if (ret < 0) @@ -768,7 +783,7 @@ int nfs_map_gid_to_group(const struct nfs_server *server, kgid_t gid, char *buf, int ret = -EINVAL; __u32 id; - id = from_kgid(&init_user_ns, gid); + id = from_kgid_munged(idmap_userns(idmap), gid); if (!(server->caps & NFS_CAP_UIDGID_NOMAP)) ret = nfs_idmap_lookup_name(id, "group", buf, buflen, idmap); if (ret < 0) diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c index 7d531da1bae3..9a98595bb160 100644 --- a/fs/nfs/nfs4namespace.c +++ b/fs/nfs/nfs4namespace.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/nfs/nfs4namespace.c * @@ -7,6 +8,7 @@ * NFSv4 namespace */ +#include <linux/module.h> #include <linux/dcache.h> #include <linux/mount.h> #include <linux/namei.h> @@ -20,37 +22,64 @@ #include <linux/inet.h> #include "internal.h" #include "nfs4_fs.h" +#include "nfs.h" #include "dns_resolve.h" #define NFSDBG_FACILITY NFSDBG_VFS /* + * Work out the length that an NFSv4 path would render to as a standard posix + * path, with a leading slash but no terminating slash. + */ +static ssize_t nfs4_pathname_len(const struct nfs4_pathname *pathname) +{ + ssize_t len = 0; + int i; + + for (i = 0; i < pathname->ncomponents; i++) { + const struct nfs4_string *component = &pathname->components[i]; + + if (component->len > NAME_MAX) + goto too_long; + len += 1 + component->len; /* Adding "/foo" */ + if (len > PATH_MAX) + goto too_long; + } + return len; + +too_long: + return -ENAMETOOLONG; +} + +/* * Convert the NFSv4 pathname components into a standard posix path. - * - * Note that the resulting string will be placed at the end of the buffer */ -static inline char *nfs4_pathname_string(const struct nfs4_pathname *pathname, - char *buffer, ssize_t buflen) +static char *nfs4_pathname_string(const struct nfs4_pathname *pathname, + unsigned short *_len) { - char *end = buffer + buflen; - int n; + ssize_t len; + char *buf, *p; + int i; + + len = nfs4_pathname_len(pathname); + if (len < 0) + return ERR_PTR(len); + *_len = len; + + p = buf = kmalloc(len + 1, GFP_KERNEL); + if (!buf) + return ERR_PTR(-ENOMEM); + + for (i = 0; i < pathname->ncomponents; i++) { + const struct nfs4_string *component = &pathname->components[i]; - *--end = '\0'; - buflen--; - - n = pathname->ncomponents; - while (--n >= 0) { - const struct nfs4_string *component = &pathname->components[n]; - buflen -= component->len + 1; - if (buflen < 0) - goto Elong; - end -= component->len; - memcpy(end, component->data, component->len); - *--end = '/'; + *p++ = '/'; + memcpy(p, component->data, component->len); + p += component->len; } - return end; -Elong: - return ERR_PTR(-ENAMETOOLONG); + + *p = 0; + return buf; } /* @@ -99,43 +128,65 @@ static char *nfs4_path(struct dentry *dentry, char *buffer, ssize_t buflen) */ static int nfs4_validate_fspath(struct dentry *dentry, const struct nfs4_fs_locations *locations, - char *page, char *page2) + struct nfs_fs_context *ctx) { - const char *path, *fs_path; + const char *path; + char *fs_path; + unsigned short len; + char *buf; + int n; - path = nfs4_path(dentry, page, PAGE_SIZE); - if (IS_ERR(path)) + buf = kmalloc(4096, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + path = nfs4_path(dentry, buf, 4096); + if (IS_ERR(path)) { + kfree(buf); return PTR_ERR(path); + } - fs_path = nfs4_pathname_string(&locations->fs_path, page2, PAGE_SIZE); - if (IS_ERR(fs_path)) + fs_path = nfs4_pathname_string(&locations->fs_path, &len); + if (IS_ERR(fs_path)) { + kfree(buf); return PTR_ERR(fs_path); + } - if (strncmp(path, fs_path, strlen(fs_path)) != 0) { + n = strncmp(path, fs_path, len); + kfree(buf); + kfree(fs_path); + if (n != 0) { dprintk("%s: path %s does not begin with fsroot %s\n", - __func__, path, fs_path); + __func__, path, ctx->nfs_server.export_path); return -ENOENT; } return 0; } -static size_t nfs_parse_server_name(char *string, size_t len, - struct sockaddr *sa, size_t salen, struct net *net) +size_t nfs_parse_server_name(char *string, size_t len, struct sockaddr_storage *ss, + size_t salen, struct net *net, int port) { + struct sockaddr *sa = (struct sockaddr *)ss; ssize_t ret; ret = rpc_pton(net, string, len, sa, salen); if (ret == 0) { - ret = nfs_dns_resolve_name(net, string, len, sa, salen); - if (ret < 0) - ret = 0; + ret = rpc_uaddr2sockaddr(net, string, len, sa, salen); + if (ret == 0) { + ret = nfs_dns_resolve_name(net, string, len, ss, salen); + if (ret < 0) + ret = 0; + } + } else if (port) { + rpc_set_port(sa, port); } return ret; } /** * nfs_find_best_sec - Find a security mechanism supported locally + * @clnt: pointer to rpc_clnt * @server: NFS server struct * @flavors: List of security tuples returned by SECINFO procedure * @@ -234,97 +285,103 @@ out: return new; } -static struct vfsmount *try_location(struct nfs_clone_mount *mountdata, - char *page, char *page2, - const struct nfs4_fs_location *location) +static int try_location(struct fs_context *fc, + const struct nfs4_fs_location *location) { - const size_t addr_bufsize = sizeof(struct sockaddr_storage); - struct net *net = rpc_net_ns(NFS_SB(mountdata->sb)->client); - struct vfsmount *mnt = ERR_PTR(-ENOENT); - char *mnt_path; - unsigned int maxbuflen; - unsigned int s; + struct nfs_fs_context *ctx = nfs_fc2context(fc); + unsigned int len, s; + char *export_path, *source, *p; + int ret = -ENOENT; + + /* Allocate a buffer big enough to hold any of the hostnames plus a + * terminating char and also a buffer big enough to hold the hostname + * plus a colon plus the path. + */ + len = 0; + for (s = 0; s < location->nservers; s++) { + const struct nfs4_string *buf = &location->servers[s]; + if (buf->len > len) + len = buf->len; + } + + kfree(ctx->nfs_server.hostname); + ctx->nfs_server.hostname = kmalloc(len + 1, GFP_KERNEL); + if (!ctx->nfs_server.hostname) + return -ENOMEM; - mnt_path = nfs4_pathname_string(&location->rootpath, page2, PAGE_SIZE); - if (IS_ERR(mnt_path)) - return ERR_CAST(mnt_path); - mountdata->mnt_path = mnt_path; - maxbuflen = mnt_path - 1 - page2; + export_path = nfs4_pathname_string(&location->rootpath, + &ctx->nfs_server.export_path_len); + if (IS_ERR(export_path)) + return PTR_ERR(export_path); - mountdata->addr = kmalloc(addr_bufsize, GFP_KERNEL); - if (mountdata->addr == NULL) - return ERR_PTR(-ENOMEM); + kfree(ctx->nfs_server.export_path); + ctx->nfs_server.export_path = export_path; + source = kmalloc(len + 1 + ctx->nfs_server.export_path_len + 1, + GFP_KERNEL); + if (!source) + return -ENOMEM; + + kfree(fc->source); + fc->source = source; for (s = 0; s < location->nservers; s++) { const struct nfs4_string *buf = &location->servers[s]; - if (buf->len <= 0 || buf->len >= maxbuflen) - continue; - if (memchr(buf->data, IPV6_SCOPE_DELIMITER, buf->len)) continue; - mountdata->addrlen = nfs_parse_server_name(buf->data, buf->len, - mountdata->addr, addr_bufsize, net); - if (mountdata->addrlen == 0) + ctx->nfs_server.addrlen = + nfs_parse_server_name(buf->data, buf->len, + &ctx->nfs_server._address, + sizeof(ctx->nfs_server._address), + fc->net_ns, 0); + if (ctx->nfs_server.addrlen == 0) continue; - rpc_set_port(mountdata->addr, NFS_PORT); + rpc_set_port(&ctx->nfs_server.address, NFS_PORT); - memcpy(page2, buf->data, buf->len); - page2[buf->len] = '\0'; - mountdata->hostname = page2; + memcpy(ctx->nfs_server.hostname, buf->data, buf->len); + ctx->nfs_server.hostname[buf->len] = '\0'; - snprintf(page, PAGE_SIZE, "%s:%s", - mountdata->hostname, - mountdata->mnt_path); + p = source; + memcpy(p, buf->data, buf->len); + p += buf->len; + *p++ = ':'; + memcpy(p, ctx->nfs_server.export_path, ctx->nfs_server.export_path_len); + p += ctx->nfs_server.export_path_len; + *p = 0; - mnt = vfs_submount(mountdata->dentry, &nfs4_referral_fs_type, page, mountdata); - if (!IS_ERR(mnt)) - break; + ret = nfs4_get_referral_tree(fc); + if (ret == 0) + return 0; } - kfree(mountdata->addr); - return mnt; + + return ret; } /** * nfs_follow_referral - set up mountpoint when hitting a referral on moved error - * @dentry - parent directory - * @locations - array of NFSv4 server location information + * @fc: pointer to struct nfs_fs_context + * @locations: array of NFSv4 server location information * */ -static struct vfsmount *nfs_follow_referral(struct dentry *dentry, - const struct nfs4_fs_locations *locations) +static int nfs_follow_referral(struct fs_context *fc, + const struct nfs4_fs_locations *locations) { - struct vfsmount *mnt = ERR_PTR(-ENOENT); - struct nfs_clone_mount mountdata = { - .sb = dentry->d_sb, - .dentry = dentry, - .authflavor = NFS_SB(dentry->d_sb)->client->cl_auth->au_flavor, - }; - char *page = NULL, *page2 = NULL; + struct nfs_fs_context *ctx = nfs_fc2context(fc); int loc, error; if (locations == NULL || locations->nlocations <= 0) - goto out; - - dprintk("%s: referral at %pd2\n", __func__, dentry); - - page = (char *) __get_free_page(GFP_USER); - if (!page) - goto out; + return -ENOENT; - page2 = (char *) __get_free_page(GFP_USER); - if (!page2) - goto out; + dprintk("%s: referral at %pd2\n", __func__, ctx->clone_data.dentry); /* Ensure fs path is a prefix of current dentry path */ - error = nfs4_validate_fspath(dentry, locations, page, page2); - if (error < 0) { - mnt = ERR_PTR(error); - goto out; - } + error = nfs4_validate_fspath(ctx->clone_data.dentry, locations, ctx); + if (error < 0) + return error; + error = -ENOENT; for (loc = 0; loc < locations->nlocations; loc++) { const struct nfs4_fs_location *location = &locations->locations[loc]; @@ -332,15 +389,12 @@ static struct vfsmount *nfs_follow_referral(struct dentry *dentry, location->rootpath.ncomponents == 0) continue; - mnt = try_location(&mountdata, page, page2, location); - if (!IS_ERR(mnt)) - break; + error = try_location(fc, location); + if (error == 0) + return 0; } -out: - free_page((unsigned long) page); - free_page((unsigned long) page2); - return mnt; + return error; } /* @@ -348,71 +402,77 @@ out: * @dentry - dentry of referral * */ -static struct vfsmount *nfs_do_refmount(struct rpc_clnt *client, struct dentry *dentry) +static int nfs_do_refmount(struct fs_context *fc, struct rpc_clnt *client) { - struct vfsmount *mnt = ERR_PTR(-ENOMEM); - struct dentry *parent; + struct nfs_fs_context *ctx = nfs_fc2context(fc); + struct dentry *dentry, *parent; struct nfs4_fs_locations *fs_locations = NULL; struct page *page; - int err; + int err = -ENOMEM; /* BUG_ON(IS_ROOT(dentry)); */ page = alloc_page(GFP_KERNEL); - if (page == NULL) - return mnt; + if (!page) + return -ENOMEM; fs_locations = kmalloc(sizeof(struct nfs4_fs_locations), GFP_KERNEL); - if (fs_locations == NULL) + if (!fs_locations) goto out_free; + fs_locations->fattr = nfs_alloc_fattr(); + if (!fs_locations->fattr) + goto out_free_2; /* Get locations */ - mnt = ERR_PTR(-ENOENT); - + dentry = ctx->clone_data.dentry; parent = dget_parent(dentry); dprintk("%s: getting locations for %pd2\n", __func__, dentry); err = nfs4_proc_fs_locations(client, d_inode(parent), &dentry->d_name, fs_locations, page); dput(parent); - if (err != 0 || - fs_locations->nlocations <= 0 || + if (err != 0) + goto out_free_3; + + err = -ENOENT; + if (fs_locations->nlocations <= 0 || fs_locations->fs_path.ncomponents <= 0) - goto out_free; + goto out_free_3; - mnt = nfs_follow_referral(dentry, fs_locations); + err = nfs_follow_referral(fc, fs_locations); +out_free_3: + kfree(fs_locations->fattr); +out_free_2: + kfree(fs_locations); out_free: __free_page(page); - kfree(fs_locations); - return mnt; + return err; } -struct vfsmount *nfs4_submount(struct nfs_server *server, struct dentry *dentry, - struct nfs_fh *fh, struct nfs_fattr *fattr) +int nfs4_submount(struct fs_context *fc, struct nfs_server *server) { - rpc_authflavor_t flavor = server->client->cl_auth->au_flavor; + struct nfs_fs_context *ctx = nfs_fc2context(fc); + struct dentry *dentry = ctx->clone_data.dentry; struct dentry *parent = dget_parent(dentry); struct inode *dir = d_inode(parent); - const struct qstr *name = &dentry->d_name; struct rpc_clnt *client; - struct vfsmount *mnt; + int ret; /* Look it up again to get its attributes and sec flavor */ - client = nfs4_proc_lookup_mountpoint(dir, name, fh, fattr); + client = nfs4_proc_lookup_mountpoint(dir, dentry, ctx->mntfh, + ctx->clone_data.fattr); dput(parent); if (IS_ERR(client)) - return ERR_CAST(client); + return PTR_ERR(client); - if (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) { - mnt = nfs_do_refmount(client, dentry); - goto out; + ctx->selected_flavor = client->cl_auth->au_flavor; + if (ctx->clone_data.fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) { + ret = nfs_do_refmount(fc, client); + } else { + ret = nfs_do_submount(fc); } - if (client->cl_auth->au_flavor != flavor) - flavor = client->cl_auth->au_flavor; - mnt = nfs_do_submount(dentry, fh, fattr, flavor); -out: rpc_shutdown_client(client); - return mnt; + return ret; } /* @@ -424,14 +484,13 @@ static int nfs4_try_replacing_one_location(struct nfs_server *server, char *page, char *page2, const struct nfs4_fs_location *location) { - const size_t addr_bufsize = sizeof(struct sockaddr_storage); struct net *net = rpc_net_ns(server->client); - struct sockaddr *sap; + struct sockaddr_storage *sap; unsigned int s; size_t salen; int error; - sap = kmalloc(addr_bufsize, GFP_KERNEL); + sap = kmalloc(sizeof(*sap), GFP_KERNEL); if (sap == NULL) return -ENOMEM; @@ -447,13 +506,13 @@ static int nfs4_try_replacing_one_location(struct nfs_server *server, continue; salen = nfs_parse_server_name(buf->data, buf->len, - sap, addr_bufsize, net); + sap, sizeof(*sap), net, 0); if (salen == 0) continue; - rpc_set_port(sap, NFS_PORT); + rpc_set_port((struct sockaddr *)sap, NFS_PORT); error = -ENOMEM; - hostname = kstrndup(buf->data, buf->len, GFP_KERNEL); + hostname = kmemdup_nul(buf->data, buf->len, GFP_KERNEL); if (hostname == NULL) break; diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index ffd2e712595d..93c6ce04332b 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -54,6 +54,7 @@ #include <linux/xattr.h> #include <linux/utsname.h> #include <linux/freezer.h> +#include <linux/iversion.h> #include "nfs4_fs.h" #include "delegation.h" @@ -62,14 +63,18 @@ #include "callback.h" #include "pnfs.h" #include "netns.h" +#include "sysfs.h" #include "nfs4idmap.h" #include "nfs4session.h" #include "fscache.h" +#include "nfs42.h" #include "nfs4trace.h" #define NFSDBG_FACILITY NFSDBG_PROC +#define NFS4_BITMASK_SZ 3 + #define NFS4_POLL_RETRY_MIN (HZ/10) #define NFS4_POLL_RETRY_MAX (15*HZ) @@ -85,21 +90,23 @@ | ATTR_MTIME_SET) struct nfs4_opendata; -static int _nfs4_proc_open(struct nfs4_opendata *data); static int _nfs4_recover_proc_open(struct nfs4_opendata *data); static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *); static void nfs_fixup_referral_attributes(struct nfs_fattr *fattr); -static int nfs4_proc_getattr(struct nfs_server *, struct nfs_fh *, struct nfs_fattr *, struct nfs4_label *label); -static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr, struct nfs4_label *label); -static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, +static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, + struct nfs_fattr *fattr, struct inode *inode); +static int nfs4_do_setattr(struct inode *inode, const struct cred *cred, struct nfs_fattr *fattr, struct iattr *sattr, - struct nfs_open_context *ctx, struct nfs4_label *ilabel, - struct nfs4_label *olabel); + struct nfs_open_context *ctx, struct nfs4_label *ilabel); #ifdef CONFIG_NFS_V4_1 -static int nfs41_test_stateid(struct nfs_server *, nfs4_stateid *, - struct rpc_cred *); -static int nfs41_free_stateid(struct nfs_server *, const nfs4_stateid *, - struct rpc_cred *, bool); +static struct rpc_task *_nfs41_proc_sequence(struct nfs_client *clp, + const struct cred *cred, + struct nfs4_slot *slot, + bool is_privileged); +static int nfs41_test_stateid(struct nfs_server *, const nfs4_stateid *, + const struct cred *); +static int nfs41_free_stateid(struct nfs_server *, nfs4_stateid *, + const struct cred *, bool); #endif #ifdef CONFIG_NFS_V4_SECURITY_LABEL @@ -107,6 +114,7 @@ static inline struct nfs4_label * nfs4_label_init_security(struct inode *dir, struct dentry *dentry, struct iattr *sattr, struct nfs4_label *label) { + struct lsm_context shim; int err; if (label == NULL) @@ -115,18 +123,32 @@ nfs4_label_init_security(struct inode *dir, struct dentry *dentry, if (nfs_server_capable(dir, NFS_CAP_SECURITY_LABEL) == 0) return NULL; + label->lfs = 0; + label->pi = 0; + label->len = 0; + label->label = NULL; + err = security_dentry_init_security(dentry, sattr->ia_mode, - &dentry->d_name, (void **)&label->label, &label->len); - if (err == 0) - return label; + &dentry->d_name, NULL, &shim); + if (err) + return NULL; - return NULL; + label->lsmid = shim.id; + label->label = shim.context; + label->len = shim.len; + return label; } static inline void nfs4_label_release_security(struct nfs4_label *label) { - if (label) - security_release_secctx(label->label, label->len); + struct lsm_context shim; + + if (label) { + shim.context = label->label; + shim.len = label->len; + shim.id = label->lsmid; + security_release_secctx(&shim); + } } static inline u32 *nfs4_bitmask(struct nfs_server *server, struct nfs4_label *label) { @@ -157,6 +179,7 @@ static int nfs4_map_errors(int err) case -NFS4ERR_RESOURCE: case -NFS4ERR_LAYOUTTRYLATER: case -NFS4ERR_RECALLCONFLICT: + case -NFS4ERR_RETURNCONFLICT: return -EREMOTEIO; case -NFS4ERR_WRONGSEC: case -NFS4ERR_WRONG_CRED: @@ -170,6 +193,11 @@ static int nfs4_map_errors(int err) return -EPROTONOSUPPORT; case -NFS4ERR_FILE_OPEN: return -EBUSY; + case -NFS4ERR_NOT_SAME: + return -ENOTSYNC; + case -ENETDOWN: + case -ENETUNREACH: + break; default: dprintk("%s could not handle NFSv4 error %d\n", __func__, -err); @@ -194,6 +222,7 @@ const u32 nfs4_fattr_bitmap[3] = { | FATTR4_WORD1_RAWDEV | FATTR4_WORD1_SPACE_USED | FATTR4_WORD1_TIME_ACCESS + | FATTR4_WORD1_TIME_CREATE | FATTR4_WORD1_TIME_METADATA | FATTR4_WORD1_TIME_MODIFY | FATTR4_WORD1_MOUNTED_ON_FILEID, @@ -215,6 +244,7 @@ static const u32 nfs4_pnfs_open_bitmap[3] = { | FATTR4_WORD1_RAWDEV | FATTR4_WORD1_SPACE_USED | FATTR4_WORD1_TIME_ACCESS + | FATTR4_WORD1_TIME_CREATE | FATTR4_WORD1_TIME_METADATA | FATTR4_WORD1_TIME_MODIFY, FATTR4_WORD2_MDSTHRESHOLD @@ -251,18 +281,17 @@ const u32 nfs4_fsinfo_bitmap[3] = { FATTR4_WORD0_MAXFILESIZE | FATTR4_WORD1_FS_LAYOUT_TYPES, FATTR4_WORD2_LAYOUT_BLKSIZE | FATTR4_WORD2_CLONE_BLKSIZE + | FATTR4_WORD2_CHANGE_ATTR_TYPE + | FATTR4_WORD2_XATTR_SUPPORT }; const u32 nfs4_fs_locations_bitmap[3] = { - FATTR4_WORD0_TYPE - | FATTR4_WORD0_CHANGE + FATTR4_WORD0_CHANGE | FATTR4_WORD0_SIZE | FATTR4_WORD0_FSID | FATTR4_WORD0_FILEID | FATTR4_WORD0_FS_LOCATIONS, - FATTR4_WORD1_MODE - | FATTR4_WORD1_NUMLINKS - | FATTR4_WORD1_OWNER + FATTR4_WORD1_OWNER | FATTR4_WORD1_OWNER_GROUP | FATTR4_WORD1_RAWDEV | FATTR4_WORD1_SPACE_USED @@ -272,6 +301,46 @@ const u32 nfs4_fs_locations_bitmap[3] = { | FATTR4_WORD1_MOUNTED_ON_FILEID, }; +static void nfs4_bitmap_copy_adjust(__u32 *dst, const __u32 *src, + struct inode *inode, unsigned long flags) +{ + unsigned long cache_validity; + + memcpy(dst, src, NFS4_BITMASK_SZ*sizeof(*dst)); + if (!inode || !nfs_have_read_or_write_delegation(inode)) + return; + + cache_validity = READ_ONCE(NFS_I(inode)->cache_validity) | flags; + + /* Remove the attributes over which we have full control */ + dst[1] &= ~FATTR4_WORD1_RAWDEV; + if (!(cache_validity & NFS_INO_INVALID_SIZE)) + dst[0] &= ~FATTR4_WORD0_SIZE; + + if (!(cache_validity & NFS_INO_INVALID_CHANGE)) + dst[0] &= ~FATTR4_WORD0_CHANGE; + + if (!(cache_validity & NFS_INO_INVALID_MODE)) + dst[1] &= ~FATTR4_WORD1_MODE; + if (!(cache_validity & NFS_INO_INVALID_OTHER)) + dst[1] &= ~(FATTR4_WORD1_OWNER | FATTR4_WORD1_OWNER_GROUP); + + if (!(cache_validity & NFS_INO_INVALID_BTIME)) + dst[1] &= ~FATTR4_WORD1_TIME_CREATE; + + if (nfs_have_delegated_mtime(inode)) { + if (!(cache_validity & NFS_INO_INVALID_ATIME)) + dst[1] &= ~(FATTR4_WORD1_TIME_ACCESS|FATTR4_WORD1_TIME_ACCESS_SET); + if (!(cache_validity & NFS_INO_INVALID_MTIME)) + dst[1] &= ~(FATTR4_WORD1_TIME_MODIFY|FATTR4_WORD1_TIME_MODIFY_SET); + if (!(cache_validity & NFS_INO_INVALID_CTIME)) + dst[1] &= ~(FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY_SET); + } else if (nfs_have_delegated_atime(inode)) { + if (!(cache_validity & NFS_INO_INVALID_ATIME)) + dst[1] &= ~(FATTR4_WORD1_TIME_ACCESS|FATTR4_WORD1_TIME_ACCESS_SET); + } +} + static void nfs4_setup_readdir(u64 cookie, __be32 *verifier, struct dentry *dentry, struct nfs4_readdir_arg *readdir) { @@ -322,16 +391,26 @@ static void nfs4_setup_readdir(u64 cookie, __be32 *verifier, struct dentry *dent *p++ = htonl(attrs); /* bitmap */ *p++ = htonl(12); /* attribute buffer length */ *p++ = htonl(NF4DIR); + spin_lock(&dentry->d_lock); p = xdr_encode_hyper(p, NFS_FILEID(d_inode(dentry->d_parent))); + spin_unlock(&dentry->d_lock); readdir->pgbase = (char *)p - (char *)start; readdir->count -= readdir->pgbase; kunmap_atomic(start); } +static void nfs4_fattr_set_prechange(struct nfs_fattr *fattr, u64 version) +{ + if (!(fattr->valid & NFS_ATTR_FATTR_PRECHANGE)) { + fattr->pre_change_attr = version; + fattr->valid |= NFS_ATTR_FATTR_PRECHANGE; + } +} + static void nfs4_test_and_free_stateid(struct nfs_server *server, nfs4_stateid *stateid, - struct rpc_cred *cred) + const struct cred *cred) { const struct nfs4_minor_version_ops *ops = server->nfs_client->cl_mvops; @@ -340,7 +419,7 @@ static void nfs4_test_and_free_stateid(struct nfs_server *server, static void __nfs4_free_revoked_stateid(struct nfs_server *server, nfs4_stateid *stateid, - struct rpc_cred *cred) + const struct cred *cred) { stateid->type = NFS4_REVOKED_STATEID_TYPE; nfs4_test_and_free_stateid(server, stateid, cred); @@ -348,7 +427,7 @@ static void __nfs4_free_revoked_stateid(struct nfs_server *server, static void nfs4_free_revoked_stateid(struct nfs_server *server, const nfs4_stateid *stateid, - struct rpc_cred *cred) + const struct cred *cred) { nfs4_stateid tmp; @@ -370,17 +449,53 @@ static long nfs4_update_delay(long *timeout) return ret; } -static int nfs4_delay(struct rpc_clnt *clnt, long *timeout) +static int nfs4_delay_killable(long *timeout) { - int res = 0; + might_sleep(); + + if (unlikely(nfs_current_task_exiting())) + return -EINTR; + __set_current_state(TASK_KILLABLE|TASK_FREEZABLE_UNSAFE); + schedule_timeout(nfs4_update_delay(timeout)); + if (!__fatal_signal_pending(current)) + return 0; + return -EINTR; +} +static int nfs4_delay_interruptible(long *timeout) +{ might_sleep(); - freezable_schedule_timeout_killable_unsafe( - nfs4_update_delay(timeout)); - if (fatal_signal_pending(current)) - res = -ERESTARTSYS; - return res; + if (unlikely(nfs_current_task_exiting())) + return -EINTR; + __set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE_UNSAFE); + schedule_timeout(nfs4_update_delay(timeout)); + if (!signal_pending(current)) + return 0; + return __fatal_signal_pending(current) ? -EINTR :-ERESTARTSYS; +} + +static int nfs4_delay(long *timeout, bool interruptible) +{ + if (interruptible) + return nfs4_delay_interruptible(timeout); + return nfs4_delay_killable(timeout); +} + +static const nfs4_stateid * +nfs4_recoverable_stateid(const nfs4_stateid *stateid) +{ + if (!stateid) + return NULL; + switch (stateid->type) { + case NFS4_OPEN_STATEID_TYPE: + case NFS4_LOCK_STATEID_TYPE: + case NFS4_DELEGATION_STATEID_TYPE: + return stateid; + default: + break; + } + return NULL; } /* This is the error handling routine for processes that are allowed @@ -391,7 +506,7 @@ static int nfs4_do_handle_exception(struct nfs_server *server, { struct nfs_client *clp = server->nfs_client; struct nfs4_state *state = exception->state; - const nfs4_stateid *stateid = exception->stateid; + const nfs4_stateid *stateid; struct inode *inode = exception->inode; int ret = errorcode; @@ -399,21 +514,29 @@ static int nfs4_do_handle_exception(struct nfs_server *server, exception->recovering = 0; exception->retry = 0; + stateid = nfs4_recoverable_stateid(exception->stateid); if (stateid == NULL && state != NULL) - stateid = &state->stateid; + stateid = nfs4_recoverable_stateid(&state->stateid); switch(errorcode) { case 0: return 0; + case -NFS4ERR_BADHANDLE: + case -ESTALE: + if (inode != NULL && S_ISREG(inode->i_mode)) + pnfs_destroy_layout(NFS_I(inode)); + break; case -NFS4ERR_DELEG_REVOKED: case -NFS4ERR_ADMIN_REVOKED: case -NFS4ERR_EXPIRED: case -NFS4ERR_BAD_STATEID: + case -NFS4ERR_PARTNER_NO_AUTH: if (inode != NULL && stateid != NULL) { nfs_inode_find_state_and_recover(inode, stateid); goto wait_on_recovery; } + fallthrough; case -NFS4ERR_OPENMODE: if (inode) { int err; @@ -453,9 +576,7 @@ static int nfs4_do_handle_exception(struct nfs_server *server, case -NFS4ERR_DEADSESSION: case -NFS4ERR_SEQ_FALSE_RETRY: case -NFS4ERR_SEQ_MISORDERED: - dprintk("%s ERROR: %d Reset session\n", __func__, - errorcode); - nfs4_schedule_session_recovery(clp->cl_session, errorcode); + /* Handled in nfs41_sequence_process() */ goto wait_on_recovery; #endif /* defined(CONFIG_NFS_V4_1) */ case -NFS4ERR_FILE_OPEN: @@ -466,11 +587,14 @@ static int nfs4_do_handle_exception(struct nfs_server *server, ret = -EBUSY; break; } + fallthrough; case -NFS4ERR_DELAY: nfs_inc_server_stats(server, NFSIOS_DELAY); + fallthrough; case -NFS4ERR_GRACE: case -NFS4ERR_LAYOUTTRYLATER: case -NFS4ERR_RECALLCONFLICT: + case -NFS4ERR_RETURNCONFLICT: exception->delay = 1; return 0; @@ -498,6 +622,21 @@ wait_on_recovery: return 0; } +/* + * Track the number of NFS4ERR_DELAY related retransmissions and return + * EAGAIN if the 'softerr' mount option is set, and we've exceeded the limit + * set by 'nfs_delay_retrans'. + */ +static int nfs4_exception_should_retrans(const struct nfs_server *server, + struct nfs4_exception *exception) +{ + if (server->flags & NFS_MOUNT_SOFTERR && nfs_delay_retrans >= 0) { + if (exception->retrans++ >= (unsigned short)nfs_delay_retrans) + return -EAGAIN; + } + return 0; +} + /* This is the error handling routine for processes that are allowed * to sleep. */ @@ -508,10 +647,18 @@ int nfs4_handle_exception(struct nfs_server *server, int errorcode, struct nfs4_ ret = nfs4_do_handle_exception(server, errorcode, exception); if (exception->delay) { - ret = nfs4_delay(server->client, &exception->timeout); + int ret2 = nfs4_exception_should_retrans(server, exception); + if (ret2 < 0) { + exception->retry = 0; + return ret2; + } + ret = nfs4_delay(&exception->timeout, + exception->interruptible); goto out_retry; } if (exception->recovering) { + if (exception->task_is_privileged) + return -EDEADLOCK; ret = nfs4_wait_clnt_recover(clp); if (test_bit(NFS_MIG_FAILED, &server->mig_status)) return -EIO; @@ -531,12 +678,28 @@ nfs4_async_handle_exception(struct rpc_task *task, struct nfs_server *server, struct nfs_client *clp = server->nfs_client; int ret; + if ((task->tk_rpc_status == -ENETDOWN || + task->tk_rpc_status == -ENETUNREACH) && + task->tk_flags & RPC_TASK_NETUNREACH_FATAL) { + exception->delay = 0; + exception->recovering = 0; + exception->retry = 0; + return -EIO; + } + ret = nfs4_do_handle_exception(server, errorcode, exception); if (exception->delay) { + int ret2 = nfs4_exception_should_retrans(server, exception); + if (ret2 < 0) { + exception->retry = 0; + return ret2; + } rpc_delay(task, nfs4_update_delay(&exception->timeout)); goto out_retry; } if (exception->recovering) { + if (exception->task_is_privileged) + return -EDEADLOCK; rpc_sleep_on(&clp->cl_rpcwaitq, task, NULL); if (test_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) == 0) rpc_wake_up_queued_task(&clp->cl_rpcwaitq, task); @@ -546,12 +709,19 @@ nfs4_async_handle_exception(struct rpc_task *task, struct nfs_server *server, ret = -EIO; return ret; out_retry: - if (ret == 0) + if (ret == 0) { exception->retry = 1; + /* + * For NFS4ERR_MOVED, the client transport will need to + * be recomputed after migration recovery has completed. + */ + if (errorcode == -NFS4ERR_MOVED) + rpc_task_release_transport(task); + } return ret; } -static int +int nfs4_async_handle_error(struct rpc_task *task, struct nfs_server *server, struct nfs4_state *state, long *timeout) { @@ -606,20 +776,16 @@ struct nfs4_call_sync_data { }; void nfs4_init_sequence(struct nfs4_sequence_args *args, - struct nfs4_sequence_res *res, int cache_reply) + struct nfs4_sequence_res *res, int cache_reply, + int privileged) { args->sa_slot = NULL; args->sa_cache_this = cache_reply; - args->sa_privileged = 0; + args->sa_privileged = privileged; res->sr_slot = NULL; } -static void nfs4_set_sequence_privileged(struct nfs4_sequence_args *args) -{ - args->sa_privileged = 1; -} - static void nfs40_sequence_free_slot(struct nfs4_sequence_res *res) { struct nfs4_slot *slot = res->sr_slot; @@ -644,13 +810,14 @@ static int nfs40_sequence_done(struct rpc_task *task, #if defined(CONFIG_NFS_V4_1) -static void nfs41_sequence_free_slot(struct nfs4_sequence_res *res) +static void nfs41_release_slot(struct nfs4_slot *slot) { struct nfs4_session *session; struct nfs4_slot_table *tbl; - struct nfs4_slot *slot = res->sr_slot; bool send_new_highest_used_slotid = false; + if (!slot) + return; tbl = slot->table; session = tbl->session; @@ -676,50 +843,69 @@ static void nfs41_sequence_free_slot(struct nfs4_sequence_res *res) send_new_highest_used_slotid = false; out_unlock: spin_unlock(&tbl->slot_tbl_lock); - res->sr_slot = NULL; if (send_new_highest_used_slotid) nfs41_notify_server(session->clp); if (waitqueue_active(&tbl->slot_waitq)) wake_up_all(&tbl->slot_waitq); } +static void nfs41_sequence_free_slot(struct nfs4_sequence_res *res) +{ + nfs41_release_slot(res->sr_slot); + res->sr_slot = NULL; +} + +static void nfs4_slot_sequence_record_sent(struct nfs4_slot *slot, + u32 seqnr) +{ + if ((s32)(seqnr - slot->seq_nr_highest_sent) > 0) + slot->seq_nr_highest_sent = seqnr; +} +static void nfs4_slot_sequence_acked(struct nfs4_slot *slot, u32 seqnr) +{ + nfs4_slot_sequence_record_sent(slot, seqnr); + slot->seq_nr_last_acked = seqnr; +} + +static void nfs4_probe_sequence(struct nfs_client *client, const struct cred *cred, + struct nfs4_slot *slot) +{ + struct rpc_task *task = _nfs41_proc_sequence(client, cred, slot, true); + if (!IS_ERR(task)) + rpc_put_task_async(task); +} + static int nfs41_sequence_process(struct rpc_task *task, struct nfs4_sequence_res *res) { struct nfs4_session *session; struct nfs4_slot *slot = res->sr_slot; struct nfs_client *clp; - bool interrupted = false; + int status; int ret = 1; if (slot == NULL) goto out_noaction; /* don't increment the sequence number if the task wasn't sent */ - if (!RPC_WAS_SENT(task)) + if (!RPC_WAS_SENT(task) || slot->seq_done) goto out; session = slot->table->session; - - if (slot->interrupted) { - if (res->sr_status != -NFS4ERR_DELAY) - slot->interrupted = 0; - interrupted = true; - } + clp = session->clp; trace_nfs4_sequence_done(session, res); + + status = res->sr_status; + if (task->tk_status == -NFS4ERR_DEADSESSION) + status = -NFS4ERR_DEADSESSION; + /* Check the SEQUENCE operation status */ - switch (res->sr_status) { + switch (status) { case 0: - /* If previous op on slot was interrupted and we reused - * the seq# and got a reply from the cache, then retry - */ - if (task->tk_status == -EREMOTEIO && interrupted) { - ++slot->seq_nr; - goto retry_nowait; - } + /* Mark this sequence number as having been acked */ + nfs4_slot_sequence_acked(slot, slot->seq_nr); /* Update the slot's sequence and clientid lease timer */ slot->seq_done = 1; - clp = session->clp; do_renew_lease(clp, res->sr_timestamp); /* Check sequence flags */ nfs41_handle_sequence_flag_errors(clp, res->sr_status_flags, @@ -731,9 +917,9 @@ static int nfs41_sequence_process(struct rpc_task *task, * sr_status remains 1 if an RPC level error occurred. * The server may or may not have processed the sequence * operation.. - * Mark the slot as having hosted an interrupted RPC call. */ - slot->interrupted = 1; + nfs4_slot_sequence_record_sent(slot, slot->seq_nr); + slot->seq_done = 1; goto out; case -NFS4ERR_DELAY: /* The server detected a resend of the RPC call and @@ -745,33 +931,57 @@ static int nfs41_sequence_process(struct rpc_task *task, slot->slot_nr, slot->seq_nr); goto out_retry; + case -NFS4ERR_RETRY_UNCACHED_REP: + case -NFS4ERR_SEQ_FALSE_RETRY: + /* + * The server thinks we tried to replay a request. + * Retry the call after bumping the sequence ID. + */ + nfs4_slot_sequence_acked(slot, slot->seq_nr); + goto retry_new_seq; case -NFS4ERR_BADSLOT: /* * The slot id we used was probably retired. Try again * using a different slot id. */ + if (slot->slot_nr < slot->table->target_highest_slotid) + goto session_recover; goto retry_nowait; case -NFS4ERR_SEQ_MISORDERED: + nfs4_slot_sequence_record_sent(slot, slot->seq_nr); /* - * Was the last operation on this sequence interrupted? - * If so, retry after bumping the sequence number. + * Were one or more calls using this slot interrupted? + * If the server never received the request, then our + * transmitted slot sequence number may be too high. However, + * if the server did receive the request then it might + * accidentally give us a reply with a mismatched operation. + * We can sort this out by sending a lone sequence operation + * to the server on the same slot. */ - if (interrupted) { - ++slot->seq_nr; + if ((s32)(slot->seq_nr - slot->seq_nr_last_acked) > 1) { + slot->seq_nr--; + if (task->tk_msg.rpc_proc != &nfs4_procedures[NFSPROC4_CLNT_SEQUENCE]) { + nfs4_probe_sequence(clp, task->tk_msg.rpc_cred, slot); + res->sr_slot = NULL; + } goto retry_nowait; } /* - * Could this slot have been previously retired? - * If so, then the server may be expecting seq_nr = 1! + * RFC5661: + * A retry might be sent while the original request is + * still in progress on the replier. The replier SHOULD + * deal with the issue by returning NFS4ERR_DELAY as the + * reply to SEQUENCE or CB_SEQUENCE operation, but + * implementations MAY return NFS4ERR_SEQ_MISORDERED. + * + * Restart the search after a delay. */ - if (slot->seq_nr != 1) { - slot->seq_nr = 1; - goto retry_nowait; - } - break; - case -NFS4ERR_SEQ_FALSE_RETRY: - ++slot->seq_nr; - goto retry_nowait; + slot->seq_nr = slot->seq_nr_highest_sent; + goto out_retry; + case -NFS4ERR_BADSESSION: + case -NFS4ERR_DEADSESSION: + case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: + goto session_recover; default: /* Just update the slot sequence no. */ slot->seq_done = 1; @@ -781,6 +991,14 @@ out: dprintk("%s: Error %d free the slot \n", __func__, res->sr_status); out_noaction: return ret; +session_recover: + set_bit(NFS4_SLOT_TBL_DRAINING, &session->fc_slot_table.slot_tbl_state); + nfs4_schedule_session_recovery(session, status); + dprintk("%s ERROR: %d Reset session\n", __func__, status); + nfs41_sequence_free_slot(res); + goto out; +retry_new_seq: + ++slot->seq_nr; retry_nowait: if (rpc_restart_call_prepare(task)) { nfs41_sequence_free_slot(res); @@ -879,7 +1097,27 @@ EXPORT_SYMBOL_GPL(nfs4_sequence_done); #endif /* !CONFIG_NFS_V4_1 */ -int nfs4_setup_sequence(const struct nfs_client *client, +static void nfs41_sequence_res_init(struct nfs4_sequence_res *res) +{ + res->sr_timestamp = jiffies; + res->sr_status_flags = 0; + res->sr_status = 1; +} + +static +void nfs4_sequence_attach_slot(struct nfs4_sequence_args *args, + struct nfs4_sequence_res *res, + struct nfs4_slot *slot) +{ + if (!slot) + return; + slot->privileged = args->sa_privileged ? 1 : 0; + args->sa_slot = slot; + + res->sr_slot = slot; +} + +int nfs4_setup_sequence(struct nfs_client *client, struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, struct rpc_task *task) @@ -892,10 +1130,8 @@ int nfs4_setup_sequence(const struct nfs_client *client, if (res->sr_slot != NULL) goto out_start; - if (session) { + if (session) tbl = &session->fc_slot_table; - task->tk_timeout = 0; - } spin_lock(&tbl->slot_tbl_lock); /* The state manager will wait until the slot table is empty */ @@ -904,32 +1140,33 @@ int nfs4_setup_sequence(const struct nfs_client *client, slot = nfs4_alloc_slot(tbl); if (IS_ERR(slot)) { - /* Try again in 1/4 second */ if (slot == ERR_PTR(-ENOMEM)) - task->tk_timeout = HZ >> 2; + goto out_sleep_timeout; goto out_sleep; } spin_unlock(&tbl->slot_tbl_lock); - slot->privileged = args->sa_privileged ? 1 : 0; - args->sa_slot = slot; - - res->sr_slot = slot; - if (session) { - res->sr_timestamp = jiffies; - res->sr_status_flags = 0; - res->sr_status = 1; - } + nfs4_sequence_attach_slot(args, res, slot); trace_nfs4_setup_sequence(session, args); out_start: + nfs41_sequence_res_init(res); rpc_call_start(task); return 0; - +out_sleep_timeout: + /* Try again in 1/4 second */ + if (args->sa_privileged) + rpc_sleep_on_priority_timeout(&tbl->slot_tbl_waitq, task, + jiffies + (HZ >> 2), RPC_PRIORITY_PRIVILEGED); + else + rpc_sleep_on_timeout(&tbl->slot_tbl_waitq, task, + NULL, jiffies + (HZ >> 2)); + spin_unlock(&tbl->slot_tbl_lock); + return -EAGAIN; out_sleep: if (args->sa_privileged) rpc_sleep_on_priority(&tbl->slot_tbl_waitq, task, - NULL, RPC_PRIORITY_PRIVILEGED); + RPC_PRIORITY_PRIVILEGED); else rpc_sleep_on(&tbl->slot_tbl_waitq, task, NULL); spin_unlock(&tbl->slot_tbl_lock); @@ -955,14 +1192,27 @@ static const struct rpc_call_ops nfs40_call_sync_ops = { .rpc_call_done = nfs40_call_sync_done, }; -static int nfs4_call_sync_sequence(struct rpc_clnt *clnt, - struct nfs_server *server, - struct rpc_message *msg, - struct nfs4_sequence_args *args, - struct nfs4_sequence_res *res) +static int nfs4_call_sync_custom(struct rpc_task_setup *task_setup) { int ret; struct rpc_task *task; + + task = rpc_run_task(task_setup); + if (IS_ERR(task)) + return PTR_ERR(task); + + ret = task->tk_status; + rpc_put_task(task); + return ret; +} + +static int nfs4_do_call_sync(struct rpc_clnt *clnt, + struct nfs_server *server, + struct rpc_message *msg, + struct nfs4_sequence_args *args, + struct nfs4_sequence_res *res, + unsigned short task_flags) +{ struct nfs_client *clp = server->nfs_client; struct nfs4_call_sync_data data = { .seq_server = server, @@ -973,19 +1223,27 @@ static int nfs4_call_sync_sequence(struct rpc_clnt *clnt, .rpc_client = clnt, .rpc_message = msg, .callback_ops = clp->cl_mvops->call_sync_ops, - .callback_data = &data + .callback_data = &data, + .flags = task_flags, }; - task = rpc_run_task(&task_setup); - if (IS_ERR(task)) - ret = PTR_ERR(task); - else { - ret = task->tk_status; - rpc_put_task(task); - } - return ret; + return nfs4_call_sync_custom(&task_setup); +} + +static int nfs4_call_sync_sequence(struct rpc_clnt *clnt, + struct nfs_server *server, + struct rpc_message *msg, + struct nfs4_sequence_args *args, + struct nfs4_sequence_res *res) +{ + unsigned short task_flags = 0; + + if (server->caps & NFS_CAP_MOVEABLE) + task_flags = RPC_TASK_MOVEABLE; + return nfs4_do_call_sync(clnt, server, msg, args, res, task_flags); } + int nfs4_call_sync(struct rpc_clnt *clnt, struct nfs_server *server, struct rpc_message *msg, @@ -993,55 +1251,94 @@ int nfs4_call_sync(struct rpc_clnt *clnt, struct nfs4_sequence_res *res, int cache_reply) { - nfs4_init_sequence(args, res, cache_reply); + nfs4_init_sequence(args, res, cache_reply, 0); return nfs4_call_sync_sequence(clnt, server, msg, args, res); } -static void update_changeattr(struct inode *dir, struct nfs4_change_info *cinfo, - unsigned long timestamp) +static void +nfs4_inc_nlink_locked(struct inode *inode) +{ + nfs_set_cache_invalid(inode, NFS_INO_INVALID_CHANGE | + NFS_INO_INVALID_CTIME | + NFS_INO_INVALID_NLINK); + inc_nlink(inode); +} + +static void +nfs4_inc_nlink(struct inode *inode) { - struct nfs_inode *nfsi = NFS_I(dir); + spin_lock(&inode->i_lock); + nfs4_inc_nlink_locked(inode); + spin_unlock(&inode->i_lock); +} - spin_lock(&dir->i_lock); - nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA; - if (cinfo->atomic && cinfo->before == dir->i_version) { - nfsi->cache_validity &= ~NFS_INO_REVAL_PAGECACHE; - nfsi->attrtimeo_timestamp = jiffies; - } else { - nfs_force_lookup_revalidate(dir); - if (cinfo->before != dir->i_version) - nfsi->cache_validity |= NFS_INO_INVALID_ACCESS | - NFS_INO_INVALID_ACL; +static void +nfs4_dec_nlink_locked(struct inode *inode) +{ + nfs_set_cache_invalid(inode, NFS_INO_INVALID_CHANGE | + NFS_INO_INVALID_CTIME | + NFS_INO_INVALID_NLINK); + drop_nlink(inode); +} + +static void +nfs4_update_changeattr_locked(struct inode *inode, + struct nfs4_change_info *cinfo, + unsigned long timestamp, unsigned long cache_validity) +{ + struct nfs_inode *nfsi = NFS_I(inode); + u64 change_attr = inode_peek_iversion_raw(inode); + + if (!nfs_have_delegated_mtime(inode)) + cache_validity |= NFS_INO_INVALID_CTIME | NFS_INO_INVALID_MTIME; + if (S_ISDIR(inode->i_mode)) + cache_validity |= NFS_INO_INVALID_DATA; + + switch (NFS_SERVER(inode)->change_attr_type) { + case NFS4_CHANGE_TYPE_IS_UNDEFINED: + if (cinfo->after == change_attr) + goto out; + break; + default: + if ((s64)(change_attr - cinfo->after) >= 0) + goto out; + } + + inode_set_iversion_raw(inode, cinfo->after); + if (!cinfo->atomic || cinfo->before != change_attr) { + if (S_ISDIR(inode->i_mode)) + nfs_force_lookup_revalidate(inode); + + if (!nfs_have_delegated_attributes(inode)) + cache_validity |= + NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL | + NFS_INO_INVALID_SIZE | NFS_INO_INVALID_OTHER | + NFS_INO_INVALID_BLOCKS | NFS_INO_INVALID_NLINK | + NFS_INO_INVALID_MODE | NFS_INO_INVALID_BTIME | + NFS_INO_INVALID_XATTR; + nfsi->attrtimeo = NFS_MINATTRTIMEO(inode); } - dir->i_version = cinfo->after; + nfsi->attrtimeo_timestamp = jiffies; nfsi->read_cache_jiffies = timestamp; nfsi->attr_gencount = nfs_inc_attr_generation_counter(); - nfs_fscache_invalidate(dir); + nfsi->cache_validity &= ~NFS_INO_INVALID_CHANGE; +out: + nfs_set_cache_invalid(inode, cache_validity); +} + +void +nfs4_update_changeattr(struct inode *dir, struct nfs4_change_info *cinfo, + unsigned long timestamp, unsigned long cache_validity) +{ + spin_lock(&dir->i_lock); + nfs4_update_changeattr_locked(dir, cinfo, timestamp, cache_validity); spin_unlock(&dir->i_lock); } -struct nfs4_opendata { - struct kref kref; - struct nfs_openargs o_arg; - struct nfs_openres o_res; - struct nfs_open_confirmargs c_arg; - struct nfs_open_confirmres c_res; - struct nfs4_string owner_name; - struct nfs4_string group_name; - struct nfs4_label *a_label; - struct nfs_fattr f_attr; - struct nfs4_label *f_label; - struct dentry *dir; - struct dentry *dentry; - struct nfs4_state_owner *owner; - struct nfs4_state *state; - struct iattr attrs; - unsigned long timestamp; - bool rpc_done; - bool file_created; - bool is_recover; - bool cancelled; - int rpc_status; +struct nfs4_open_createattrs { + struct nfs4_label *label; + struct iattr *sattr; + const __u32 verf[2]; }; static bool nfs4_clear_cap_atomic_open_v1(struct nfs_server *server, @@ -1056,9 +1353,20 @@ static bool nfs4_clear_cap_atomic_open_v1(struct nfs_server *server, return true; } +static fmode_t _nfs4_ctx_to_accessmode(const struct nfs_open_context *ctx) +{ + return ctx->mode & (FMODE_READ|FMODE_WRITE|FMODE_EXEC); +} + +static fmode_t _nfs4_ctx_to_openmode(const struct nfs_open_context *ctx) +{ + fmode_t ret = ctx->mode & (FMODE_READ|FMODE_WRITE); + + return (ctx->mode & FMODE_EXEC) ? FMODE_READ | ret : ret; +} + static u32 -nfs4_map_atomic_open_share(struct nfs_server *server, - fmode_t fmode, int openflags) +nfs4_fmode_to_share_access(fmode_t fmode) { u32 res = 0; @@ -1072,11 +1380,27 @@ nfs4_map_atomic_open_share(struct nfs_server *server, case FMODE_READ|FMODE_WRITE: res = NFS4_SHARE_ACCESS_BOTH; } + return res; +} + +static u32 +nfs4_map_atomic_open_share(struct nfs_server *server, + fmode_t fmode, int openflags) +{ + u32 res = nfs4_fmode_to_share_access(fmode); + if (!(server->caps & NFS_CAP_ATOMIC_OPEN_V1)) goto out; /* Want no delegation if we're using O_DIRECT */ - if (openflags & O_DIRECT) + if (openflags & O_DIRECT) { res |= NFS4_SHARE_WANT_NO_DELEG; + goto out; + } + /* res |= NFS4_SHARE_WANT_NO_PREFERENCE; */ + if (server->caps & NFS_CAP_DELEGTIME) + res |= NFS4_SHARE_WANT_DELEG_TIMESTAMPS; + if (server->caps & NFS_CAP_OPEN_XOR) + res |= NFS4_SHARE_WANT_OPEN_XOR_DELEGATION; out: return res; } @@ -1102,7 +1426,6 @@ nfs4_map_atomic_open_claim(struct nfs_server *server, static void nfs4_init_opendata_res(struct nfs4_opendata *p) { p->o_res.f_attr = &p->f_attr; - p->o_res.f_label = p->f_label; p->o_res.seqid = p->o_arg.seqid; p->c_res.seqid = p->c_arg.seqid; p->o_res.server = p->o_arg.server; @@ -1113,8 +1436,7 @@ static void nfs4_init_opendata_res(struct nfs4_opendata *p) static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry, struct nfs4_state_owner *sp, fmode_t fmode, int flags, - const struct iattr *attrs, - struct nfs4_label *label, + const struct nfs4_open_createattrs *c, enum open_claim_type4 claim, gfp_t gfp_mask) { @@ -1122,14 +1444,15 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry, struct inode *dir = d_inode(parent); struct nfs_server *server = NFS_SERVER(dir); struct nfs_seqid *(*alloc_seqid)(struct nfs_seqid_counter *, gfp_t); + struct nfs4_label *label = (c != NULL) ? c->label : NULL; struct nfs4_opendata *p; p = kzalloc(sizeof(*p), gfp_mask); if (p == NULL) goto err; - p->f_label = nfs4_label_alloc(server, gfp_mask); - if (IS_ERR(p->f_label)) + p->f_attr.label = nfs4_label_alloc(server, gfp_mask); + if (IS_ERR(p->f_attr.label)) goto err_free_p; p->a_label = nfs4_label_alloc(server, gfp_mask); @@ -1147,26 +1470,32 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry, atomic_inc(&sp->so_count); p->o_arg.open_flags = flags; p->o_arg.fmode = fmode & (FMODE_READ|FMODE_WRITE); - p->o_arg.umask = current_umask(); p->o_arg.claim = nfs4_map_atomic_open_claim(server, claim); p->o_arg.share_access = nfs4_map_atomic_open_share(server, fmode, flags); - /* don't put an ACCESS op in OPEN compound if O_EXCL, because ACCESS - * will return permission denied for all bits until close */ - if (!(flags & O_EXCL)) { - /* ask server to check for all possible rights as results - * are cached */ - switch (p->o_arg.claim) { - default: - break; - case NFS4_OPEN_CLAIM_NULL: - case NFS4_OPEN_CLAIM_FH: - p->o_arg.access = NFS4_ACCESS_READ | - NFS4_ACCESS_MODIFY | - NFS4_ACCESS_EXTEND | - NFS4_ACCESS_EXECUTE; + if (flags & O_CREAT) { + p->o_arg.umask = current_umask(); + p->o_arg.label = nfs4_label_copy(p->a_label, label); + if (c->sattr != NULL && c->sattr->ia_valid != 0) { + p->o_arg.u.attrs = &p->attrs; + memcpy(&p->attrs, c->sattr, sizeof(p->attrs)); + + memcpy(p->o_arg.u.verifier.data, c->verf, + sizeof(p->o_arg.u.verifier.data)); } } + /* ask server to check for all possible rights as results + * are cached */ + switch (p->o_arg.claim) { + default: + break; + case NFS4_OPEN_CLAIM_NULL: + case NFS4_OPEN_CLAIM_FH: + p->o_arg.access = NFS4_ACCESS_READ | NFS4_ACCESS_MODIFY | + NFS4_ACCESS_EXTEND | NFS4_ACCESS_DELETE | + NFS4_ACCESS_EXECUTE | + nfs_access_xattr_mask(server); + } p->o_arg.clientid = server->nfs_client->cl_clientid; p->o_arg.id.create_time = ktime_to_ns(sp->so_seqid.create_time); p->o_arg.id.uniquifier = sp->so_seqid.owner_id; @@ -1174,7 +1503,6 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry, p->o_arg.server = server; p->o_arg.bitmask = nfs4_bitmask(server, label); p->o_arg.open_bitmap = &nfs4_fattr_bitmap[0]; - p->o_arg.label = nfs4_label_copy(p->a_label, label); switch (p->o_arg.claim) { case NFS4_OPEN_CLAIM_NULL: case NFS4_OPEN_CLAIM_DELEGATE_CUR: @@ -1187,17 +1515,6 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry, case NFS4_OPEN_CLAIM_DELEG_PREV_FH: p->o_arg.fh = NFS_FH(d_inode(dentry)); } - if (attrs != NULL && attrs->ia_valid != 0) { - __u32 verf[2]; - - p->o_arg.u.attrs = &p->attrs; - memcpy(&p->attrs, attrs, sizeof(p->attrs)); - - verf[0] = jiffies; - verf[1] = current->pid; - memcpy(p->o_arg.u.verifier.data, verf, - sizeof(p->o_arg.u.verifier.data)); - } p->c_arg.fh = &p->o_res.fh; p->c_arg.stateid = &p->o_res.stateid; p->c_arg.seqid = p->o_arg.seqid; @@ -1208,7 +1525,7 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry, err_free_label: nfs4_label_free(p->a_label); err_free_f: - nfs4_label_free(p->f_label); + nfs4_label_free(p->f_attr.label); err_free_p: kfree(p); err: @@ -1222,6 +1539,7 @@ static void nfs4_opendata_free(struct kref *kref) struct nfs4_opendata, kref); struct super_block *sb = p->dentry->d_sb; + nfs4_lgopen_release(p->lgp); nfs_free_seqid(p->o_arg.seqid); nfs4_sequence_free_slot(&p->o_res.seq_res); if (p->state != NULL) @@ -1229,7 +1547,7 @@ static void nfs4_opendata_free(struct kref *kref) nfs4_put_state_owner(p->owner); nfs4_label_free(p->a_label); - nfs4_label_free(p->f_label); + nfs4_label_free(p->f_attr.label); dput(p->dir); dput(p->dentry); @@ -1260,12 +1578,20 @@ static bool nfs4_mode_match_open_stateid(struct nfs4_state *state, return false; } -static int can_open_cached(struct nfs4_state *state, fmode_t mode, int open_mode) +static int can_open_cached(struct nfs4_state *state, fmode_t mode, + int open_mode, enum open_claim_type4 claim) { int ret = 0; if (open_mode & (O_EXCL|O_TRUNC)) goto out; + switch (claim) { + case NFS4_OPEN_CLAIM_NULL: + case NFS4_OPEN_CLAIM_FH: + goto out; + default: + break; + } switch (mode & (FMODE_READ|FMODE_WRITE)) { case FMODE_READ: ret |= test_bit(NFS_O_RDONLY_STATE, &state->flags) != 0 @@ -1290,8 +1616,6 @@ static int can_open_delegated(struct nfs_delegation *delegation, fmode_t fmode, return 0; if ((delegation->type & fmode) != fmode) return 0; - if (test_bit(NFS_DELEGATION_RETURNING, &delegation->flags)) - return 0; switch (claim) { case NFS4_OPEN_CLAIM_NULL: case NFS4_OPEN_CLAIM_FH: @@ -1299,6 +1623,7 @@ static int can_open_delegated(struct nfs_delegation *delegation, fmode_t fmode, case NFS4_OPEN_CLAIM_PREVIOUS: if (!test_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags)) break; + fallthrough; default: return 0; } @@ -1334,6 +1659,12 @@ static bool nfs_open_stateid_recover_openmode(struct nfs4_state *state) } #endif /* CONFIG_NFS_V4_1 */ +static void nfs_state_log_update_open_stateid(struct nfs4_state *state) +{ + if (test_and_clear_bit(NFS_STATE_CHANGE_WAIT, &state->flags)) + wake_up_all(&state->waitq); +} + static void nfs_test_and_clear_all_open_stateid(struct nfs4_state *state) { struct nfs_client *clp = state->owner->so_server->nfs_client; @@ -1349,17 +1680,30 @@ static void nfs_test_and_clear_all_open_stateid(struct nfs4_state *state) nfs4_state_mark_reclaim_nograce(clp, state); } -static bool nfs_need_update_open_stateid(struct nfs4_state *state, - const nfs4_stateid *stateid, nfs4_stateid *freeme) +/* + * Check for whether or not the caller may update the open stateid + * to the value passed in by stateid. + * + * Note: This function relies heavily on the server implementing + * RFC7530 Section 9.1.4.2, and RFC5661 Section 8.2.2 + * correctly. + * i.e. The stateid seqids have to be initialised to 1, and + * are then incremented on every state transition. + */ +static bool nfs_stateid_is_sequential(struct nfs4_state *state, + const nfs4_stateid *stateid) { - if (test_and_set_bit(NFS_OPEN_STATE, &state->flags) == 0) - return true; - if (!nfs4_stateid_match_other(stateid, &state->open_stateid)) { - nfs4_stateid_copy(freeme, &state->open_stateid); - nfs_test_and_clear_all_open_stateid(state); - return true; + if (test_bit(NFS_OPEN_STATE, &state->flags)) { + /* The common case - we're updating to a new sequence number */ + if (nfs4_stateid_match_other(stateid, &state->open_stateid)) { + if (nfs4_stateid_is_next(&state->open_stateid, stateid)) + return true; + return false; + } + /* The server returned a new stateid */ } - if (nfs4_stateid_is_newer(stateid, &state->open_stateid)) + /* This is the first OPEN in this generation */ + if (stateid->seqid == cpu_to_be32(1)) return true; return false; } @@ -1399,11 +1743,14 @@ static void nfs_clear_open_stateid_locked(struct nfs4_state *state, if (nfs4_stateid_match_other(stateid, &state->open_stateid) && !nfs4_stateid_is_newer(stateid, &state->open_stateid)) { nfs_resync_open_stateid_locked(state); - return; + goto out; } if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0) nfs4_stateid_copy(&state->stateid, stateid); nfs4_stateid_copy(&state->open_stateid, stateid); + trace_nfs4_open_stateid_update(state->inode, stateid, 0); +out: + nfs_state_log_update_open_stateid(state); } static void nfs_clear_open_stateid(struct nfs4_state *state, @@ -1420,29 +1767,66 @@ static void nfs_clear_open_stateid(struct nfs4_state *state, } static void nfs_set_open_stateid_locked(struct nfs4_state *state, - const nfs4_stateid *stateid, fmode_t fmode, - nfs4_stateid *freeme) + const nfs4_stateid *stateid, nfs4_stateid *freeme) + __must_hold(&state->owner->so_lock) + __must_hold(&state->seqlock) + __must_hold(RCU) + { - switch (fmode) { - case FMODE_READ: - set_bit(NFS_O_RDONLY_STATE, &state->flags); + DEFINE_WAIT(wait); + int status = 0; + for (;;) { + + if (nfs_stateid_is_sequential(state, stateid)) break; - case FMODE_WRITE: - set_bit(NFS_O_WRONLY_STATE, &state->flags); + + if (status) break; - case FMODE_READ|FMODE_WRITE: - set_bit(NFS_O_RDWR_STATE, &state->flags); + /* Rely on seqids for serialisation with NFSv4.0 */ + if (!nfs4_has_session(NFS_SERVER(state->inode)->nfs_client)) + break; + + set_bit(NFS_STATE_CHANGE_WAIT, &state->flags); + prepare_to_wait(&state->waitq, &wait, TASK_KILLABLE); + /* + * Ensure we process the state changes in the same order + * in which the server processed them by delaying the + * update of the stateid until we are in sequence. + */ + write_sequnlock(&state->seqlock); + spin_unlock(&state->owner->so_lock); + rcu_read_unlock(); + trace_nfs4_open_stateid_update_wait(state->inode, stateid, 0); + + if (!fatal_signal_pending(current) && + !nfs_current_task_exiting()) { + if (schedule_timeout(5*HZ) == 0) + status = -EAGAIN; + else + status = 0; + } else + status = -EINTR; + finish_wait(&state->waitq, &wait); + rcu_read_lock(); + spin_lock(&state->owner->so_lock); + write_seqlock(&state->seqlock); } - if (!nfs_need_update_open_stateid(state, stateid, freeme)) - return; + + if (test_bit(NFS_OPEN_STATE, &state->flags) && + !nfs4_stateid_match_other(stateid, &state->open_stateid)) { + nfs4_stateid_copy(freeme, &state->open_stateid); + nfs_test_and_clear_all_open_stateid(state); + } + if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0) nfs4_stateid_copy(&state->stateid, stateid); nfs4_stateid_copy(&state->open_stateid, stateid); + trace_nfs4_open_stateid_update(state->inode, stateid, status); + nfs_state_log_update_open_stateid(state); } -static void __update_open_stateid(struct nfs4_state *state, +static void nfs_state_set_open_stateid(struct nfs4_state *state, const nfs4_stateid *open_stateid, - const nfs4_stateid *deleg_stateid, fmode_t fmode, nfs4_stateid *freeme) { @@ -1450,20 +1834,53 @@ static void __update_open_stateid(struct nfs4_state *state, * Protect the call to nfs4_state_set_mode_locked and * serialise the stateid update */ - spin_lock(&state->owner->so_lock); write_seqlock(&state->seqlock); - if (deleg_stateid != NULL) { - nfs4_stateid_copy(&state->stateid, deleg_stateid); - set_bit(NFS_DELEGATED_STATE, &state->flags); + nfs_set_open_stateid_locked(state, open_stateid, freeme); + switch (fmode) { + case FMODE_READ: + set_bit(NFS_O_RDONLY_STATE, &state->flags); + break; + case FMODE_WRITE: + set_bit(NFS_O_WRONLY_STATE, &state->flags); + break; + case FMODE_READ|FMODE_WRITE: + set_bit(NFS_O_RDWR_STATE, &state->flags); } - if (open_stateid != NULL) - nfs_set_open_stateid_locked(state, open_stateid, fmode, freeme); + set_bit(NFS_OPEN_STATE, &state->flags); + write_sequnlock(&state->seqlock); +} + +static void nfs_state_clear_open_state_flags(struct nfs4_state *state) +{ + clear_bit(NFS_O_RDWR_STATE, &state->flags); + clear_bit(NFS_O_WRONLY_STATE, &state->flags); + clear_bit(NFS_O_RDONLY_STATE, &state->flags); + clear_bit(NFS_OPEN_STATE, &state->flags); +} + +static void nfs_state_set_delegation(struct nfs4_state *state, + const nfs4_stateid *deleg_stateid, + fmode_t fmode) +{ + /* + * Protect the call to nfs4_state_set_mode_locked and + * serialise the stateid update + */ + write_seqlock(&state->seqlock); + nfs4_stateid_copy(&state->stateid, deleg_stateid); + set_bit(NFS_DELEGATED_STATE, &state->flags); write_sequnlock(&state->seqlock); - update_open_stateflags(state, fmode); - spin_unlock(&state->owner->so_lock); } -static int update_open_stateid(struct nfs4_state *state, +static void nfs_state_clear_delegation(struct nfs4_state *state) +{ + write_seqlock(&state->seqlock); + nfs4_stateid_copy(&state->stateid, &state->open_stateid); + clear_bit(NFS_DELEGATED_STATE, &state->flags); + write_sequnlock(&state->seqlock); +} + +int update_open_stateid(struct nfs4_state *state, const nfs4_stateid *open_stateid, const nfs4_stateid *delegation, fmode_t fmode) @@ -1478,7 +1895,13 @@ static int update_open_stateid(struct nfs4_state *state, fmode &= (FMODE_READ|FMODE_WRITE); rcu_read_lock(); - deleg_cur = rcu_dereference(nfsi->delegation); + spin_lock(&state->owner->so_lock); + if (open_stateid != NULL) { + nfs_state_set_open_stateid(state, open_stateid, fmode, &freeme); + ret = 1; + } + + deleg_cur = nfs4_get_valid_delegation(state->inode); if (deleg_cur == NULL) goto no_delegation; @@ -1490,22 +1913,20 @@ static int update_open_stateid(struct nfs4_state *state, if (delegation == NULL) delegation = &deleg_cur->stateid; - else if (!nfs4_stateid_match(&deleg_cur->stateid, delegation)) + else if (!nfs4_stateid_match_other(&deleg_cur->stateid, delegation)) goto no_delegation_unlock; nfs_mark_delegation_referenced(deleg_cur); - __update_open_stateid(state, open_stateid, &deleg_cur->stateid, - fmode, &freeme); + nfs_state_set_delegation(state, &deleg_cur->stateid, fmode); ret = 1; no_delegation_unlock: spin_unlock(&deleg_cur->lock); no_delegation: + if (ret) + update_open_stateflags(state, fmode); + spin_unlock(&state->owner->so_lock); rcu_read_unlock(); - if (!ret && open_stateid != NULL) { - __update_open_stateid(state, open_stateid, NULL, fmode, &freeme); - ret = 1; - } if (test_bit(NFS_STATE_RECLAIM_NOGRACE, &state->flags)) nfs4_schedule_state_manager(clp); if (freeme.type != 0) @@ -1537,8 +1958,9 @@ static void nfs4_return_incompatible_delegation(struct inode *inode, fmode_t fmo { struct nfs_delegation *delegation; + fmode &= FMODE_READ|FMODE_WRITE; rcu_read_lock(); - delegation = rcu_dereference(NFS_I(inode)->delegation); + delegation = nfs4_get_valid_delegation(inode); if (delegation == NULL || (delegation->type & fmode) == fmode) { rcu_read_unlock(); return; @@ -1550,7 +1972,6 @@ static void nfs4_return_incompatible_delegation(struct inode *inode, fmode_t fmo static struct nfs4_state *nfs4_try_open_cached(struct nfs4_opendata *opendata) { struct nfs4_state *state = opendata->state; - struct nfs_inode *nfsi = NFS_I(state->inode); struct nfs_delegation *delegation; int open_mode = opendata->o_arg.open_flags; fmode_t fmode = opendata->o_arg.fmode; @@ -1560,14 +1981,14 @@ static struct nfs4_state *nfs4_try_open_cached(struct nfs4_opendata *opendata) for (;;) { spin_lock(&state->owner->so_lock); - if (can_open_cached(state, fmode, open_mode)) { + if (can_open_cached(state, fmode, open_mode, claim)) { update_open_stateflags(state, fmode); spin_unlock(&state->owner->so_lock); goto out_return_state; } spin_unlock(&state->owner->so_lock); rcu_read_lock(); - delegation = rcu_dereference(nfsi->delegation); + delegation = nfs4_get_valid_delegation(state->inode); if (!can_open_delegated(delegation, fmode, claim)) { rcu_read_unlock(); break; @@ -1590,41 +2011,46 @@ static struct nfs4_state *nfs4_try_open_cached(struct nfs4_opendata *opendata) out: return ERR_PTR(ret); out_return_state: - atomic_inc(&state->count); + refcount_inc(&state->count); return state; } static void -nfs4_opendata_check_deleg(struct nfs4_opendata *data, struct nfs4_state *state) -{ - struct nfs_client *clp = NFS_SERVER(state->inode)->nfs_client; - struct nfs_delegation *delegation; - int delegation_flags = 0; - - rcu_read_lock(); - delegation = rcu_dereference(NFS_I(state->inode)->delegation); - if (delegation) - delegation_flags = delegation->flags; - rcu_read_unlock(); - switch (data->o_arg.claim) { - default: +nfs4_process_delegation(struct inode *inode, const struct cred *cred, + enum open_claim_type4 claim, + const struct nfs4_open_delegation *delegation) +{ + switch (delegation->open_delegation_type) { + case NFS4_OPEN_DELEGATE_READ: + case NFS4_OPEN_DELEGATE_WRITE: + case NFS4_OPEN_DELEGATE_READ_ATTRS_DELEG: + case NFS4_OPEN_DELEGATE_WRITE_ATTRS_DELEG: break; + default: + return; + } + switch (claim) { case NFS4_OPEN_CLAIM_DELEGATE_CUR: case NFS4_OPEN_CLAIM_DELEG_CUR_FH: pr_err_ratelimited("NFS: Broken NFSv4 server %s is " "returning a delegation for " "OPEN(CLAIM_DELEGATE_CUR)\n", - clp->cl_hostname); - return; + NFS_SERVER(inode)->nfs_client->cl_hostname); + break; + case NFS4_OPEN_CLAIM_PREVIOUS: + nfs_inode_reclaim_delegation(inode, cred, delegation->type, + &delegation->stateid, + delegation->pagemod_limit, + delegation->open_delegation_type); + break; + default: + nfs_inode_set_delegation(inode, cred, delegation->type, + &delegation->stateid, + delegation->pagemod_limit, + delegation->open_delegation_type); } - if ((delegation_flags & 1UL<<NFS_DELEGATION_NEED_RECLAIM) == 0) - nfs_inode_set_delegation(state->inode, - data->owner->so_cred, - &data->o_res); - else - nfs_inode_reclaim_delegation(state->inode, - data->owner->so_cred, - &data->o_res); + if (delegation->do_recall) + nfs_async_inode_return_delegation(inode, &delegation->stateid); } /* @@ -1641,30 +2067,75 @@ _nfs4_opendata_reclaim_to_nfs4_state(struct nfs4_opendata *data) if (!data->rpc_done) { if (data->rpc_status) return ERR_PTR(data->rpc_status); - /* cached opens have already been processed */ - goto update; + return nfs4_try_open_cached(data); } ret = nfs_refresh_inode(inode, &data->f_attr); if (ret) return ERR_PTR(ret); - if (data->o_res.delegation_type != 0) - nfs4_opendata_check_deleg(data, state); -update: - update_open_stateid(state, &data->o_res.stateid, NULL, - data->o_arg.fmode); - atomic_inc(&state->count); + nfs4_process_delegation(state->inode, + data->owner->so_cred, + data->o_arg.claim, + &data->o_res.delegation); + + if (!(data->o_res.rflags & NFS4_OPEN_RESULT_NO_OPEN_STATEID)) { + if (!update_open_stateid(state, &data->o_res.stateid, + NULL, data->o_arg.fmode)) + return ERR_PTR(-EAGAIN); + } else if (!update_open_stateid(state, NULL, NULL, data->o_arg.fmode)) + return ERR_PTR(-EAGAIN); + refcount_inc(&state->count); return state; } +static struct inode * +nfs4_opendata_get_inode(struct nfs4_opendata *data) +{ + struct inode *inode; + + switch (data->o_arg.claim) { + case NFS4_OPEN_CLAIM_NULL: + case NFS4_OPEN_CLAIM_DELEGATE_CUR: + case NFS4_OPEN_CLAIM_DELEGATE_PREV: + if (!(data->f_attr.valid & NFS_ATTR_FATTR)) + return ERR_PTR(-EAGAIN); + inode = nfs_fhget(data->dir->d_sb, &data->o_res.fh, + &data->f_attr); + break; + default: + inode = d_inode(data->dentry); + ihold(inode); + nfs_refresh_inode(inode, &data->f_attr); + } + return inode; +} + static struct nfs4_state * -_nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data) +nfs4_opendata_find_nfs4_state(struct nfs4_opendata *data) { + struct nfs4_state *state; struct inode *inode; - struct nfs4_state *state = NULL; - int ret; + + inode = nfs4_opendata_get_inode(data); + if (IS_ERR(inode)) + return ERR_CAST(inode); + if (data->state != NULL && data->state->inode == inode) { + state = data->state; + refcount_inc(&state->count); + } else + state = nfs4_get_open_state(inode, data->owner); + iput(inode); + if (state == NULL) + state = ERR_PTR(-ENOMEM); + return state; +} + +static struct nfs4_state * +_nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data) +{ + struct nfs4_state *state; if (!data->rpc_done) { state = nfs4_try_open_cached(data); @@ -1672,29 +2143,28 @@ _nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data) goto out; } - ret = -EAGAIN; - if (!(data->f_attr.valid & NFS_ATTR_FATTR)) - goto err; - inode = nfs_fhget(data->dir->d_sb, &data->o_res.fh, &data->f_attr, data->f_label); - ret = PTR_ERR(inode); - if (IS_ERR(inode)) - goto err; - ret = -ENOMEM; - state = nfs4_get_open_state(inode, data->owner); - if (state == NULL) - goto err_put_inode; - if (data->o_res.delegation_type != 0) - nfs4_opendata_check_deleg(data, state); - update_open_stateid(state, &data->o_res.stateid, NULL, - data->o_arg.fmode); - iput(inode); + state = nfs4_opendata_find_nfs4_state(data); + if (IS_ERR(state)) + goto out; + + nfs4_process_delegation(state->inode, + data->owner->so_cred, + data->o_arg.claim, + &data->o_res.delegation); + + if (!(data->o_res.rflags & NFS4_OPEN_RESULT_NO_OPEN_STATEID)) { + if (!update_open_stateid(state, &data->o_res.stateid, + NULL, data->o_arg.fmode)) { + nfs4_put_open_state(state); + state = ERR_PTR(-EAGAIN); + } + } else if (!update_open_stateid(state, NULL, NULL, data->o_arg.fmode)) { + nfs4_put_open_state(state); + state = ERR_PTR(-EAGAIN); + } out: nfs_release_seqid(data->o_arg.seqid); return state; -err_put_inode: - iput(inode); -err: - return ERR_PTR(ret); } static struct nfs4_state * @@ -1710,50 +2180,68 @@ nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data) return ret; } -static struct nfs_open_context *nfs4_state_find_open_context(struct nfs4_state *state) +static struct nfs_open_context * +nfs4_state_find_open_context_mode(struct nfs4_state *state, fmode_t mode) { struct nfs_inode *nfsi = NFS_I(state->inode); struct nfs_open_context *ctx; - spin_lock(&state->inode->i_lock); - list_for_each_entry(ctx, &nfsi->open_files, list) { + rcu_read_lock(); + list_for_each_entry_rcu(ctx, &nfsi->open_files, list) { if (ctx->state != state) continue; - get_nfs_open_context(ctx); - spin_unlock(&state->inode->i_lock); + if ((ctx->mode & mode) != mode) + continue; + if (!get_nfs_open_context(ctx)) + continue; + rcu_read_unlock(); return ctx; } - spin_unlock(&state->inode->i_lock); + rcu_read_unlock(); return ERR_PTR(-ENOENT); } +static struct nfs_open_context * +nfs4_state_find_open_context(struct nfs4_state *state) +{ + struct nfs_open_context *ctx; + + ctx = nfs4_state_find_open_context_mode(state, FMODE_READ|FMODE_WRITE); + if (!IS_ERR(ctx)) + return ctx; + ctx = nfs4_state_find_open_context_mode(state, FMODE_WRITE); + if (!IS_ERR(ctx)) + return ctx; + return nfs4_state_find_open_context_mode(state, FMODE_READ); +} + static struct nfs4_opendata *nfs4_open_recoverdata_alloc(struct nfs_open_context *ctx, struct nfs4_state *state, enum open_claim_type4 claim) { struct nfs4_opendata *opendata; opendata = nfs4_opendata_alloc(ctx->dentry, state->owner, 0, 0, - NULL, NULL, claim, GFP_NOFS); + NULL, claim, GFP_NOFS); if (opendata == NULL) return ERR_PTR(-ENOMEM); opendata->state = state; - atomic_inc(&state->count); + refcount_inc(&state->count); return opendata; } static int nfs4_open_recover_helper(struct nfs4_opendata *opendata, - fmode_t fmode) + fmode_t fmode) { struct nfs4_state *newstate; + struct nfs_server *server = NFS_SB(opendata->dentry->d_sb); + int openflags = opendata->o_arg.open_flags; int ret; if (!nfs4_mode_match_open_stateid(opendata->state, fmode)) return 0; - opendata->o_arg.open_flags = 0; opendata->o_arg.fmode = fmode; - opendata->o_arg.share_access = nfs4_map_atomic_open_share( - NFS_SB(opendata->dentry->d_sb), - fmode, 0); + opendata->o_arg.share_access = + nfs4_map_atomic_open_share(server, fmode, openflags); memset(&opendata->o_res, 0, sizeof(opendata->o_res)); memset(&opendata->c_res, 0, sizeof(opendata->c_res)); nfs4_init_opendata_res(opendata); @@ -1773,13 +2261,7 @@ static int nfs4_open_recover(struct nfs4_opendata *opendata, struct nfs4_state * { int ret; - /* Don't trigger recovery in nfs_test_and_clear_all_open_stateid */ - clear_bit(NFS_O_RDWR_STATE, &state->flags); - clear_bit(NFS_O_WRONLY_STATE, &state->flags); - clear_bit(NFS_O_RDONLY_STATE, &state->flags); /* memory barrier prior to reading state->n_* */ - clear_bit(NFS_DELEGATED_STATE, &state->flags); - clear_bit(NFS_OPEN_STATE, &state->flags); smp_rmb(); ret = nfs4_open_recover_helper(opendata, FMODE_READ|FMODE_WRITE); if (ret != 0) @@ -1812,7 +2294,7 @@ static int _nfs4_do_open_reclaim(struct nfs_open_context *ctx, struct nfs4_state { struct nfs_delegation *delegation; struct nfs4_opendata *opendata; - fmode_t delegation_type = 0; + u32 delegation_type = NFS4_OPEN_DELEGATE_NONE; int status; opendata = nfs4_open_recoverdata_alloc(ctx, state, @@ -1821,8 +2303,20 @@ static int _nfs4_do_open_reclaim(struct nfs_open_context *ctx, struct nfs4_state return PTR_ERR(opendata); rcu_read_lock(); delegation = rcu_dereference(NFS_I(state->inode)->delegation); - if (delegation != NULL && test_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags) != 0) - delegation_type = delegation->type; + if (delegation != NULL && test_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags) != 0) { + switch(delegation->type) { + case FMODE_READ: + delegation_type = NFS4_OPEN_DELEGATE_READ; + if (test_bit(NFS_DELEGATION_DELEGTIME, &delegation->flags)) + delegation_type = NFS4_OPEN_DELEGATE_READ_ATTRS_DELEG; + break; + case FMODE_WRITE: + case FMODE_READ|FMODE_WRITE: + delegation_type = NFS4_OPEN_DELEGATE_WRITE; + if (test_bit(NFS_DELEGATION_DELEGTIME, &delegation->flags)) + delegation_type = NFS4_OPEN_DELEGATE_WRITE_ATTRS_DELEG; + } + } rcu_read_unlock(); opendata->o_arg.u.delegation_type = delegation_type; status = nfs4_open_recover(opendata, state); @@ -1855,33 +2349,34 @@ static int nfs4_open_reclaim(struct nfs4_state_owner *sp, struct nfs4_state *sta ctx = nfs4_state_find_open_context(state); if (IS_ERR(ctx)) return -EAGAIN; + clear_bit(NFS_DELEGATED_STATE, &state->flags); + nfs_state_clear_open_state_flags(state); ret = nfs4_do_open_reclaim(ctx, state); put_nfs_open_context(ctx); return ret; } -static int nfs4_handle_delegation_recall_error(struct nfs_server *server, struct nfs4_state *state, const nfs4_stateid *stateid, int err) +static int nfs4_handle_delegation_recall_error(struct nfs_server *server, struct nfs4_state *state, const nfs4_stateid *stateid, struct file_lock *fl, int err) { switch (err) { default: printk(KERN_ERR "NFS: %s: unhandled error " "%d.\n", __func__, err); + fallthrough; case 0: case -ENOENT: case -EAGAIN: case -ESTALE: + case -ETIMEDOUT: break; case -NFS4ERR_BADSESSION: case -NFS4ERR_BADSLOT: case -NFS4ERR_BAD_HIGH_SLOT: case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: case -NFS4ERR_DEADSESSION: - set_bit(NFS_DELEGATED_STATE, &state->flags); - nfs4_schedule_session_recovery(server->nfs_client->cl_session, err); return -EAGAIN; case -NFS4ERR_STALE_CLIENTID: case -NFS4ERR_STALE_STATEID: - set_bit(NFS_DELEGATED_STATE, &state->flags); /* Don't recall a delegation if it was lost */ nfs4_schedule_lease_recovery(server->nfs_client); return -EAGAIN; @@ -1902,20 +2397,22 @@ static int nfs4_handle_delegation_recall_error(struct nfs_server *server, struct return -EAGAIN; case -NFS4ERR_DELAY: case -NFS4ERR_GRACE: - set_bit(NFS_DELEGATED_STATE, &state->flags); ssleep(1); return -EAGAIN; case -ENOMEM: case -NFS4ERR_DENIED: - /* kill_proc(fl->fl_pid, SIGLOST, 1); */ + if (fl) { + struct nfs4_lock_state *lsp = fl->fl_u.nfs4_fl.owner; + if (lsp) + set_bit(NFS_LOCK_LOST, &lsp->ls_flags); + } return 0; } return err; } int nfs4_open_delegation_recall(struct nfs_open_context *ctx, - struct nfs4_state *state, const nfs4_stateid *stateid, - fmode_t type) + struct nfs4_state *state, const nfs4_stateid *stateid) { struct nfs_server *server = NFS_SERVER(state->inode); struct nfs4_opendata *opendata; @@ -1926,24 +2423,25 @@ int nfs4_open_delegation_recall(struct nfs_open_context *ctx, if (IS_ERR(opendata)) return PTR_ERR(opendata); nfs4_stateid_copy(&opendata->o_arg.u.delegation, stateid); - write_seqlock(&state->seqlock); - nfs4_stateid_copy(&state->stateid, &state->open_stateid); - write_sequnlock(&state->seqlock); - clear_bit(NFS_DELEGATED_STATE, &state->flags); - switch (type & (FMODE_READ|FMODE_WRITE)) { - case FMODE_READ|FMODE_WRITE: - case FMODE_WRITE: + if (!test_bit(NFS_O_RDWR_STATE, &state->flags)) { err = nfs4_open_recover_helper(opendata, FMODE_READ|FMODE_WRITE); if (err) - break; + goto out; + } + if (!test_bit(NFS_O_WRONLY_STATE, &state->flags)) { err = nfs4_open_recover_helper(opendata, FMODE_WRITE); if (err) - break; - case FMODE_READ: + goto out; + } + if (!test_bit(NFS_O_RDONLY_STATE, &state->flags)) { err = nfs4_open_recover_helper(opendata, FMODE_READ); + if (err) + goto out; } + nfs_state_clear_delegation(state); +out: nfs4_opendata_put(opendata); - return nfs4_handle_delegation_recall_error(server, state, stateid, err); + return nfs4_handle_delegation_recall_error(server, state, stateid, NULL, err); } static void nfs4_open_confirm_prepare(struct rpc_task *task, void *calldata) @@ -2012,17 +2510,16 @@ static int _nfs4_proc_open_confirm(struct nfs4_opendata *data) .callback_ops = &nfs4_open_confirm_ops, .callback_data = data, .workqueue = nfsiod_workqueue, - .flags = RPC_TASK_ASYNC, + .flags = RPC_TASK_ASYNC | RPC_TASK_CRED_NOREF, }; int status; - nfs4_init_sequence(&data->c_arg.seq_args, &data->c_res.seq_res, 1); + nfs4_init_sequence(&data->c_arg.seq_args, &data->c_res.seq_res, 1, + data->is_recover); kref_get(&data->kref); data->rpc_done = false; data->rpc_status = 0; data->timestamp = jiffies; - if (data->is_recover) - nfs4_set_sequence_privileged(&data->c_arg.seq_args); task = rpc_run_task(&task_setup_data); if (IS_ERR(task)) return PTR_ERR(task); @@ -2052,10 +2549,11 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata) if (data->state != NULL) { struct nfs_delegation *delegation; - if (can_open_cached(data->state, data->o_arg.fmode, data->o_arg.open_flags)) + if (can_open_cached(data->state, data->o_arg.fmode, + data->o_arg.open_flags, claim)) goto out_no_action; rcu_read_lock(); - delegation = rcu_dereference(NFS_I(data->state->inode)->delegation); + delegation = nfs4_get_valid_delegation(data->state->inode); if (can_open_delegated(delegation, data->o_arg.fmode, claim)) goto unlock_no_action; rcu_read_unlock(); @@ -2069,9 +2567,9 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata) case NFS4_OPEN_CLAIM_DELEG_CUR_FH: case NFS4_OPEN_CLAIM_DELEG_PREV_FH: data->o_arg.open_bitmap = &nfs4_open_noattr_bitmap[0]; + fallthrough; case NFS4_OPEN_CLAIM_FH: task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_NOATTR]; - nfs_copy_fh(&data->o_res.fh, data->o_arg.fh); } data->timestamp = jiffies; if (nfs4_setup_sequence(data->o_arg.server->nfs_client, @@ -2083,11 +2581,15 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata) /* Set the create mode (note dependency on the session type) */ data->o_arg.createmode = NFS4_CREATE_UNCHECKED; if (data->o_arg.open_flags & O_EXCL) { - data->o_arg.createmode = NFS4_CREATE_EXCLUSIVE; - if (nfs4_has_persistent_session(clp)) + data->o_arg.createmode = NFS4_CREATE_EXCLUSIVE4_1; + if (clp->cl_mvops->minor_version == 0) { + data->o_arg.createmode = NFS4_CREATE_EXCLUSIVE; + /* don't put an ACCESS op in OPEN compound if O_EXCL, + * because ACCESS will return permission denied for + * all bits until close */ + data->o_res.access_request = data->o_arg.access = 0; + } else if (nfs4_has_persistent_session(clp)) data->o_arg.createmode = NFS4_CREATE_GUARDED; - else if (clp->cl_mvops->minor_version > 0) - data->o_arg.createmode = NFS4_CREATE_EXCLUSIVE4_1; } return; unlock_no_action: @@ -2135,12 +2637,14 @@ static void nfs4_open_release(void *calldata) struct nfs4_opendata *data = calldata; struct nfs4_state *state = NULL; + /* In case of error, no cleanup! */ + if (data->rpc_status != 0 || !data->rpc_done) { + nfs_release_seqid(data->o_arg.seqid); + goto out_free; + } /* If this request hasn't been cancelled, do nothing */ if (!data->cancelled) goto out_free; - /* In case of error, no cleanup! */ - if (data->rpc_status != 0 || !data->rpc_done) - goto out_free; /* In case we need an open_confirm, no cleanup! */ if (data->o_res.rflags & NFS4_OPEN_RESULT_CONFIRM) goto out_free; @@ -2157,7 +2661,8 @@ static const struct rpc_call_ops nfs4_open_ops = { .rpc_release = nfs4_open_release, }; -static int nfs4_run_open_task(struct nfs4_opendata *data, int isrecover) +static int nfs4_run_open_task(struct nfs4_opendata *data, + struct nfs_open_context *ctx) { struct inode *dir = d_inode(data->dir); struct nfs_server *server = NFS_SERVER(dir); @@ -2176,19 +2681,25 @@ static int nfs4_run_open_task(struct nfs4_opendata *data, int isrecover) .callback_ops = &nfs4_open_ops, .callback_data = data, .workqueue = nfsiod_workqueue, - .flags = RPC_TASK_ASYNC, + .flags = RPC_TASK_ASYNC | RPC_TASK_CRED_NOREF, }; int status; - nfs4_init_sequence(&o_arg->seq_args, &o_res->seq_res, 1); + if (nfs_server_capable(dir, NFS_CAP_MOVEABLE)) + task_setup_data.flags |= RPC_TASK_MOVEABLE; + kref_get(&data->kref); data->rpc_done = false; data->rpc_status = 0; data->cancelled = false; data->is_recover = false; - if (isrecover) { - nfs4_set_sequence_privileged(&o_arg->seq_args); + if (!ctx) { + nfs4_init_sequence(&o_arg->seq_args, &o_res->seq_res, 1, 1); data->is_recover = true; + task_setup_data.flags |= RPC_TASK_TIMEOUT; + } else { + nfs4_init_sequence(&o_arg->seq_args, &o_res->seq_res, 1, 0); + pnfs_lgopen_prepare(data, ctx); } task = rpc_run_task(&task_setup_data); if (IS_ERR(task)) @@ -2210,7 +2721,7 @@ static int _nfs4_recover_proc_open(struct nfs4_opendata *data) struct nfs_openres *o_res = &data->o_res; int status; - status = nfs4_run_open_task(data, 1); + status = nfs4_run_open_task(data, NULL); if (status != 0 || !data->rpc_done) return status; @@ -2230,10 +2741,9 @@ static int _nfs4_recover_proc_open(struct nfs4_opendata *data) * Note that in the non-execute case, we want to turn off permission * checking if we just created a new file (POSIX open() semantics). */ -static int nfs4_opendata_access(struct rpc_cred *cred, +static int nfs4_opendata_access(const struct cred *cred, struct nfs4_opendata *opendata, - struct nfs4_state *state, fmode_t fmode, - int openflags) + struct nfs4_state *state, fmode_t fmode) { struct nfs_access_entry cache; u32 mask, flags; @@ -2244,11 +2754,7 @@ static int nfs4_opendata_access(struct rpc_cred *cred, return 0; mask = 0; - /* - * Use openflags to check for exec, because fmode won't - * always have FMODE_EXEC set when file open for exec. - */ - if (openflags & __FMODE_EXEC) { + if (fmode & FMODE_EXEC) { /* ONLY check for exec rights */ if (S_ISDIR(state->inode->i_mode)) mask = NFS4_ACCESS_LOOKUP; @@ -2257,10 +2763,8 @@ static int nfs4_opendata_access(struct rpc_cred *cred, } else if ((fmode & FMODE_READ) && !opendata->file_created) mask = NFS4_ACCESS_READ; - cache.cred = cred; - cache.jiffies = jiffies; nfs_access_set_mask(&cache, opendata->o_res.access_result); - nfs_access_add_cache(state->inode, &cache); + nfs_access_add_cache(state->inode, &cache, cred); flags = NFS4_ACCESS_READ | NFS4_ACCESS_EXECUTE | NFS4_ACCESS_LOOKUP; if ((mask & ~cache.mask & flags) == 0) @@ -2272,7 +2776,8 @@ static int nfs4_opendata_access(struct rpc_cred *cred, /* * Note: On error, nfs4_proc_open will free the struct nfs4_opendata */ -static int _nfs4_proc_open(struct nfs4_opendata *data) +static int _nfs4_proc_open(struct nfs4_opendata *data, + struct nfs_open_context *ctx) { struct inode *dir = d_inode(data->dir); struct nfs_server *server = NFS_SERVER(dir); @@ -2280,7 +2785,7 @@ static int _nfs4_proc_open(struct nfs4_opendata *data) struct nfs_openres *o_res = &data->o_res; int status; - status = nfs4_run_open_task(data, 0); + status = nfs4_run_open_task(data, ctx); if (!data->rpc_done) return status; if (status != 0) { @@ -2297,9 +2802,11 @@ static int _nfs4_proc_open(struct nfs4_opendata *data) data->file_created = true; else if (o_res->cinfo.before != o_res->cinfo.after) data->file_created = true; - if (data->file_created || dir->i_version != o_res->cinfo.after) - update_changeattr(dir, &o_res->cinfo, - o_res->f_attr->time_start); + if (data->file_created || + inode_peek_iversion_raw(dir) != o_res->cinfo.after) + nfs4_update_changeattr(dir, &o_res->cinfo, + o_res->f_attr->time_start, + NFS_INO_INVALID_DATA); } if ((o_res->rflags & NFS4_OPEN_RESULT_LOCKTYPE_POSIX) == 0) server->caps &= ~NFS_CAP_POSIX_LOCK; @@ -2309,8 +2816,12 @@ static int _nfs4_proc_open(struct nfs4_opendata *data) return status; } if (!(o_res->f_attr->valid & NFS_ATTR_FATTR)) { + struct nfs_fh *fh = &o_res->fh; + nfs4_sequence_free_slot(&o_res->seq_res); - nfs4_proc_getattr(server, &o_res->fh, o_res->f_attr, o_res->f_label); + if (o_arg->claim == NFS4_OPEN_CLAIM_FH) + fh = NFS_FH(d_inode(data->dentry)); + nfs4_proc_getattr(server, fh, o_res->f_attr, NULL); } return 0; } @@ -2325,10 +2836,15 @@ static int _nfs4_open_expired(struct nfs_open_context *ctx, struct nfs4_state *s struct nfs4_opendata *opendata; int ret; - opendata = nfs4_open_recoverdata_alloc(ctx, state, - NFS4_OPEN_CLAIM_FH); + opendata = nfs4_open_recoverdata_alloc(ctx, state, NFS4_OPEN_CLAIM_FH); if (IS_ERR(opendata)) return PTR_ERR(opendata); + /* + * We're not recovering a delegation, so ask for no delegation. + * Otherwise the recovery thread could deadlock with an outstanding + * delegation return. + */ + opendata->o_arg.open_flags = O_DIRECT; ret = nfs4_open_recover(opendata, state); if (ret == -ESTALE) d_drop(ctx->dentry); @@ -2377,10 +2893,7 @@ static void nfs_finish_clear_delegation_stateid(struct nfs4_state *state, const nfs4_stateid *stateid) { nfs_remove_bad_delegation(state->inode, stateid); - write_seqlock(&state->seqlock); - nfs4_stateid_copy(&state->stateid, &state->open_stateid); - write_sequnlock(&state->seqlock); - clear_bit(NFS_DELEGATED_STATE, &state->flags); + nfs_state_clear_delegation(state); } static void nfs40_clear_delegation_stateid(struct nfs4_state *state) @@ -2393,20 +2906,19 @@ static int nfs40_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *st { /* NFSv4.0 doesn't allow for delegation recovery on open expire */ nfs40_clear_delegation_stateid(state); + nfs_state_clear_open_state_flags(state); return nfs4_open_expired(sp, state); } static int nfs40_test_and_free_expired_stateid(struct nfs_server *server, - nfs4_stateid *stateid, - struct rpc_cred *cred) + nfs4_stateid *stateid, const struct cred *cred) { return -NFS4ERR_BAD_STATEID; } #if defined(CONFIG_NFS_V4_1) static int nfs41_test_and_free_expired_stateid(struct nfs_server *server, - nfs4_stateid *stateid, - struct rpc_cred *cred) + nfs4_stateid *stateid, const struct cred *cred) { int status; @@ -2415,6 +2927,7 @@ static int nfs41_test_and_free_expired_stateid(struct nfs_server *server, break; case NFS4_INVALID_STATEID_TYPE: case NFS4_SPECIAL_STATEID_TYPE: + case NFS4_FREED_STATEID_TYPE: return -NFS4ERR_BAD_STATEID; case NFS4_REVOKED_STATEID_TYPE: goto out_free; @@ -2435,39 +2948,59 @@ out_free: return -NFS4ERR_EXPIRED; } -static void nfs41_check_delegation_stateid(struct nfs4_state *state) +static int nfs41_check_delegation_stateid(struct nfs4_state *state) { struct nfs_server *server = NFS_SERVER(state->inode); nfs4_stateid stateid; struct nfs_delegation *delegation; - struct rpc_cred *cred; - int status; + const struct cred *cred = NULL; + int status, ret = NFS_OK; /* Get the delegation credential for use by test/free_stateid */ rcu_read_lock(); delegation = rcu_dereference(NFS_I(state->inode)->delegation); if (delegation == NULL) { rcu_read_unlock(); - return; + nfs_state_clear_delegation(state); + return NFS_OK; } + spin_lock(&delegation->lock); nfs4_stateid_copy(&stateid, &delegation->stateid); - if (test_bit(NFS_DELEGATION_REVOKED, &delegation->flags) || - !test_and_clear_bit(NFS_DELEGATION_TEST_EXPIRED, - &delegation->flags)) { + + if (!test_and_clear_bit(NFS_DELEGATION_TEST_EXPIRED, + &delegation->flags)) { + spin_unlock(&delegation->lock); rcu_read_unlock(); - nfs_finish_clear_delegation_stateid(state, &stateid); - return; + return NFS_OK; } - cred = get_rpccred(delegation->cred); + if (delegation->cred) + cred = get_cred(delegation->cred); + spin_unlock(&delegation->lock); rcu_read_unlock(); status = nfs41_test_and_free_expired_stateid(server, &stateid, cred); trace_nfs4_test_delegation_stateid(state, NULL, status); if (status == -NFS4ERR_EXPIRED || status == -NFS4ERR_BAD_STATEID) nfs_finish_clear_delegation_stateid(state, &stateid); + else + ret = status; - put_rpccred(cred); + put_cred(cred); + return ret; +} + +static void nfs41_delegation_recover_stateid(struct nfs4_state *state) +{ + nfs4_stateid tmp; + + if (test_bit(NFS_DELEGATED_STATE, &state->flags) && + nfs4_copy_delegation_stateid(state->inode, state->state, + &tmp, NULL) && + nfs4_stateid_match_other(&state->stateid, &tmp)) + nfs_state_set_delegation(state, &tmp, state->state); + else + nfs_state_clear_delegation(state); } /** @@ -2490,9 +3023,9 @@ static int nfs41_check_expired_locks(struct nfs4_state *state) spin_lock(&state->state_lock); list_for_each_entry(lsp, &state->lock_states, ls_locks) { if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags)) { - struct rpc_cred *cred = lsp->ls_state->owner->so_cred; + const struct cred *cred = lsp->ls_state->owner->so_cred; - atomic_inc(&lsp->ls_count); + refcount_inc(&lsp->ls_count); spin_unlock(&state->state_lock); nfs4_put_lock_state(prev); @@ -2534,28 +3067,18 @@ static int nfs41_check_open_stateid(struct nfs4_state *state) { struct nfs_server *server = NFS_SERVER(state->inode); nfs4_stateid *stateid = &state->open_stateid; - struct rpc_cred *cred = state->owner->so_cred; + const struct cred *cred = state->owner->so_cred; int status; - if (test_bit(NFS_OPEN_STATE, &state->flags) == 0) { - if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0) { - if (nfs4_have_delegation(state->inode, state->state)) - return NFS_OK; - return -NFS4ERR_OPENMODE; - } + if (test_bit(NFS_OPEN_STATE, &state->flags) == 0) return -NFS4ERR_BAD_STATEID; - } status = nfs41_test_and_free_expired_stateid(server, stateid, cred); trace_nfs4_test_open_stateid(state, NULL, status); if (status == -NFS4ERR_EXPIRED || status == -NFS4ERR_BAD_STATEID) { - clear_bit(NFS_O_RDONLY_STATE, &state->flags); - clear_bit(NFS_O_WRONLY_STATE, &state->flags); - clear_bit(NFS_O_RDWR_STATE, &state->flags); - clear_bit(NFS_OPEN_STATE, &state->flags); + nfs_state_clear_open_state_flags(state); stateid->type = NFS4_INVALID_STATEID_TYPE; - } - if (status != NFS_OK) return status; + } if (nfs_open_stateid_recover_openmode(state)) return -NFS4ERR_OPENMODE; return NFS_OK; @@ -2565,7 +3088,11 @@ static int nfs41_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *st { int status; - nfs41_check_delegation_stateid(state); + status = nfs41_check_delegation_stateid(state); + if (status != NFS_OK) + return status; + nfs41_delegation_recover_stateid(state); + status = nfs41_check_expired_locks(state); if (status != NFS_OK) return status; @@ -2581,48 +3108,61 @@ static int nfs41_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *st * fields corresponding to attributes that were used to store the verifier. * Make sure we clobber those fields in the later setattr call */ -static inline void nfs4_exclusive_attrset(struct nfs4_opendata *opendata, +static unsigned nfs4_exclusive_attrset(struct nfs4_opendata *opendata, struct iattr *sattr, struct nfs4_label **label) { - const u32 *attrset = opendata->o_res.attrset; + const __u32 *bitmask = opendata->o_arg.server->exclcreat_bitmask; + __u32 attrset[3]; + unsigned ret; + unsigned i; - if ((attrset[1] & FATTR4_WORD1_TIME_ACCESS) && - !(sattr->ia_valid & ATTR_ATIME_SET)) - sattr->ia_valid |= ATTR_ATIME; + for (i = 0; i < ARRAY_SIZE(attrset); i++) { + attrset[i] = opendata->o_res.attrset[i]; + if (opendata->o_arg.createmode == NFS4_CREATE_EXCLUSIVE4_1) + attrset[i] &= ~bitmask[i]; + } - if ((attrset[1] & FATTR4_WORD1_TIME_MODIFY) && - !(sattr->ia_valid & ATTR_MTIME_SET)) - sattr->ia_valid |= ATTR_MTIME; + ret = (opendata->o_arg.createmode == NFS4_CREATE_EXCLUSIVE) ? + sattr->ia_valid : 0; - /* Except MODE, it seems harmless of setting twice. */ - if (opendata->o_arg.createmode != NFS4_CREATE_EXCLUSIVE && - (attrset[1] & FATTR4_WORD1_MODE || - attrset[2] & FATTR4_WORD2_MODE_UMASK)) - sattr->ia_valid &= ~ATTR_MODE; + if ((attrset[1] & (FATTR4_WORD1_TIME_ACCESS|FATTR4_WORD1_TIME_ACCESS_SET))) { + if (sattr->ia_valid & ATTR_ATIME_SET) + ret |= ATTR_ATIME_SET; + else + ret |= ATTR_ATIME; + } + + if ((attrset[1] & (FATTR4_WORD1_TIME_MODIFY|FATTR4_WORD1_TIME_MODIFY_SET))) { + if (sattr->ia_valid & ATTR_MTIME_SET) + ret |= ATTR_MTIME_SET; + else + ret |= ATTR_MTIME; + } - if (attrset[2] & FATTR4_WORD2_SECURITY_LABEL) + if (!(attrset[2] & FATTR4_WORD2_SECURITY_LABEL)) *label = NULL; + return ret; } static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata, - fmode_t fmode, - int flags, struct nfs_open_context *ctx) { struct nfs4_state_owner *sp = opendata->owner; struct nfs_server *server = sp->so_server; struct dentry *dentry; struct nfs4_state *state; - unsigned int seq; + fmode_t acc_mode = _nfs4_ctx_to_accessmode(ctx); + struct inode *dir = d_inode(opendata->dir); + unsigned long dir_verifier; int ret; - seq = raw_seqcount_begin(&sp->so_reclaim_seqcount); + dir_verifier = nfs_save_change_attribute(dir); - ret = _nfs4_proc_open(opendata); + ret = _nfs4_proc_open(opendata, ctx); if (ret != 0) goto out; - state = nfs4_opendata_to_nfs4_state(opendata); + state = _nfs4_opendata_to_nfs4_state(opendata); ret = PTR_ERR(state); if (IS_ERR(state)) goto out; @@ -2631,33 +3171,52 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata, set_bit(NFS_STATE_POSIX_LOCKS, &state->flags); if (opendata->o_res.rflags & NFS4_OPEN_RESULT_MAY_NOTIFY_LOCK) set_bit(NFS_STATE_MAY_NOTIFY_LOCK, &state->flags); + if (opendata->o_res.rflags & NFS4_OPEN_RESULT_PRESERVE_UNLINKED) + set_bit(NFS_INO_PRESERVE_UNLINKED, &NFS_I(state->inode)->flags); dentry = opendata->dentry; if (d_really_is_negative(dentry)) { struct dentry *alias; d_drop(dentry); - alias = d_exact_alias(dentry, state->inode); - if (!alias) - alias = d_splice_alias(igrab(state->inode), dentry); + alias = d_splice_alias(igrab(state->inode), dentry); /* d_splice_alias() can't fail here - it's a non-directory */ if (alias) { dput(ctx->dentry); ctx->dentry = dentry = alias; } - nfs_set_verifier(dentry, - nfs_save_change_attribute(d_inode(opendata->dir))); } - ret = nfs4_opendata_access(sp->so_cred, opendata, state, fmode, flags); + switch(opendata->o_arg.claim) { + default: + break; + case NFS4_OPEN_CLAIM_NULL: + case NFS4_OPEN_CLAIM_DELEGATE_CUR: + case NFS4_OPEN_CLAIM_DELEGATE_PREV: + if (!opendata->rpc_done) + break; + if (opendata->o_res.delegation.type != 0) + dir_verifier = nfs_save_change_attribute(dir); + nfs_set_verifier(dentry, dir_verifier); + } + + /* Parse layoutget results before we check for access */ + pnfs_parse_lgopen(state->inode, opendata->lgp, ctx); + + ret = nfs4_opendata_access(sp->so_cred, opendata, state, acc_mode); if (ret != 0) goto out; - if (d_inode(dentry) == state->inode) { + if (d_inode(dentry) == state->inode) nfs_inode_attach_open_context(ctx); - if (read_seqcount_retry(&sp->so_reclaim_seqcount, seq)) - nfs4_schedule_stateid_recovery(server, state); - } + out: + if (!opendata->cancelled) { + if (opendata->lgp) { + nfs4_lgopen_release(opendata->lgp); + opendata->lgp = NULL; + } + nfs4_sequence_free_slot(&opendata->o_res.seq_res); + } return ret; } @@ -2667,8 +3226,7 @@ out: static int _nfs4_do_open(struct inode *dir, struct nfs_open_context *ctx, int flags, - struct iattr *sattr, - struct nfs4_label *label, + const struct nfs4_open_createattrs *c, int *opened) { struct nfs4_state_owner *sp; @@ -2676,11 +3234,12 @@ static int _nfs4_do_open(struct inode *dir, struct nfs_server *server = NFS_SERVER(dir); struct nfs4_opendata *opendata; struct dentry *dentry = ctx->dentry; - struct rpc_cred *cred = ctx->cred; + const struct cred *cred = ctx->cred; struct nfs4_threshold **ctx_th = &ctx->mdsthreshold; - fmode_t fmode = ctx->mode & (FMODE_READ|FMODE_WRITE|FMODE_EXEC); + fmode_t fmode = _nfs4_ctx_to_openmode(ctx); enum open_claim_type4 claim = NFS4_OPEN_CLAIM_NULL; - struct nfs4_label *olabel = NULL; + struct iattr *sattr = c->sattr; + struct nfs4_label *label = c->label; int status; /* Protect against reboot recovery conflicts */ @@ -2698,69 +3257,61 @@ static int _nfs4_do_open(struct inode *dir, status = -ENOMEM; if (d_really_is_positive(dentry)) claim = NFS4_OPEN_CLAIM_FH; - opendata = nfs4_opendata_alloc(dentry, sp, fmode, flags, sattr, - label, claim, GFP_KERNEL); + opendata = nfs4_opendata_alloc(dentry, sp, fmode, flags, + c, claim, GFP_KERNEL); if (opendata == NULL) goto err_put_state_owner; - if (label) { - olabel = nfs4_label_alloc(server, GFP_KERNEL); - if (IS_ERR(olabel)) { - status = PTR_ERR(olabel); - goto err_opendata_put; - } - } - if (server->attr_bitmask[2] & FATTR4_WORD2_MDSTHRESHOLD) { if (!opendata->f_attr.mdsthreshold) { opendata->f_attr.mdsthreshold = pnfs_mdsthreshold_alloc(); if (!opendata->f_attr.mdsthreshold) - goto err_free_label; + goto err_opendata_put; } opendata->o_arg.open_bitmap = &nfs4_pnfs_open_bitmap[0]; } if (d_really_is_positive(dentry)) opendata->state = nfs4_get_open_state(d_inode(dentry), sp); - status = _nfs4_open_and_get_state(opendata, fmode, flags, ctx); + status = _nfs4_open_and_get_state(opendata, ctx); if (status != 0) - goto err_free_label; + goto err_opendata_put; state = ctx->state; if ((opendata->o_arg.open_flags & (O_CREAT|O_EXCL)) == (O_CREAT|O_EXCL) && (opendata->o_arg.createmode != NFS4_CREATE_GUARDED)) { - nfs4_exclusive_attrset(opendata, sattr, &label); + unsigned attrs = nfs4_exclusive_attrset(opendata, sattr, &label); /* * send create attributes which was not set by open * with an extra setattr. */ - if (sattr->ia_valid & NFS4_VALID_ATTRS) { + if (attrs || label) { + unsigned ia_old = sattr->ia_valid; + + sattr->ia_valid = attrs; nfs_fattr_init(opendata->o_res.f_attr); status = nfs4_do_setattr(state->inode, cred, opendata->o_res.f_attr, sattr, - ctx, label, olabel); + ctx, label); if (status == 0) { nfs_setattr_update_inode(state->inode, sattr, opendata->o_res.f_attr); - nfs_setsecurity(state->inode, opendata->o_res.f_attr, olabel); + nfs_setsecurity(state->inode, opendata->o_res.f_attr); } + sattr->ia_valid = ia_old; } } if (opened && opendata->file_created) - *opened |= FILE_CREATED; + *opened = 1; if (pnfs_use_threshold(ctx_th, opendata->f_attr.mdsthreshold, server)) { *ctx_th = opendata->f_attr.mdsthreshold; opendata->f_attr.mdsthreshold = NULL; } - nfs4_label_free(olabel); - nfs4_opendata_put(opendata); nfs4_put_state_owner(sp); return 0; -err_free_label: - nfs4_label_free(olabel); err_opendata_put: nfs4_opendata_put(opendata); err_put_state_owner: @@ -2778,12 +3329,22 @@ static struct nfs4_state *nfs4_do_open(struct inode *dir, int *opened) { struct nfs_server *server = NFS_SERVER(dir); - struct nfs4_exception exception = { }; + struct nfs4_exception exception = { + .interruptible = true, + }; struct nfs4_state *res; + struct nfs4_open_createattrs c = { + .label = label, + .sattr = sattr, + .verf = { + [0] = (__u32)jiffies, + [1] = (__u32)current->pid, + }, + }; int status; do { - status = _nfs4_do_open(dir, ctx, flags, sattr, label, opened); + status = _nfs4_do_open(dir, ctx, flags, &c, opened); res = ctx->state; trace_nfs4_open_file(ctx, flags, status); if (status == 0) @@ -2816,6 +3377,11 @@ static struct nfs4_state *nfs4_do_open(struct inode *dir, exception.retry = 1; continue; } + if (status == -NFS4ERR_EXPIRED) { + nfs4_schedule_lease_recovery(server->nfs_client); + exception.retry = 1; + continue; + } if (status == -EAGAIN) { /* We must have found a delegation */ exception.retry = 1; @@ -2832,7 +3398,7 @@ static struct nfs4_state *nfs4_do_open(struct inode *dir, static int _nfs4_do_setattr(struct inode *inode, struct nfs_setattrargs *arg, struct nfs_setattrres *res, - struct rpc_cred *cred, + const struct cred *cred, struct nfs_open_context *ctx) { struct nfs_server *server = NFS_SERVER(inode); @@ -2842,9 +3408,8 @@ static int _nfs4_do_setattr(struct inode *inode, .rpc_resp = res, .rpc_cred = cred, }; - struct rpc_cred *delegation_cred = NULL; + const struct cred *delegation_cred = NULL; unsigned long timestamp = jiffies; - fmode_t fmode; bool truncate; int status; @@ -2852,11 +3417,14 @@ static int _nfs4_do_setattr(struct inode *inode, /* Servers should only apply open mode checks for file size changes */ truncate = (arg->iap->ia_valid & ATTR_SIZE) ? true : false; - fmode = truncate ? FMODE_WRITE : FMODE_READ; + if (!truncate) { + nfs4_inode_make_writeable(inode); + goto zero_stateid; + } - if (nfs4_copy_delegation_stateid(inode, fmode, &arg->stateid, &delegation_cred)) { + if (nfs4_copy_delegation_stateid(inode, FMODE_WRITE, &arg->stateid, &delegation_cred)) { /* Use that stateid */ - } else if (truncate && ctx != NULL) { + } else if (ctx != NULL && ctx->state) { struct nfs_lock_context *l_ctx; if (!nfs4_valid_open_stateid(ctx->state)) return -EBADF; @@ -2868,37 +3436,40 @@ static int _nfs4_do_setattr(struct inode *inode, nfs_put_lock_context(l_ctx); if (status == -EIO) return -EBADF; - } else + else if (status == -EAGAIN) + goto zero_stateid; + } else { +zero_stateid: nfs4_stateid_copy(&arg->stateid, &zero_stateid); + } if (delegation_cred) msg.rpc_cred = delegation_cred; status = nfs4_call_sync(server->client, server, &msg, &arg->seq_args, &res->seq_res, 1); - put_rpccred(delegation_cred); + put_cred(delegation_cred); if (status == 0 && ctx != NULL) renew_lease(server, timestamp); trace_nfs4_setattr(inode, &arg->stateid, status); return status; } -static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, +static int nfs4_do_setattr(struct inode *inode, const struct cred *cred, struct nfs_fattr *fattr, struct iattr *sattr, - struct nfs_open_context *ctx, struct nfs4_label *ilabel, - struct nfs4_label *olabel) + struct nfs_open_context *ctx, struct nfs4_label *ilabel) { struct nfs_server *server = NFS_SERVER(inode); + __u32 bitmask[NFS4_BITMASK_SZ]; struct nfs4_state *state = ctx ? ctx->state : NULL; struct nfs_setattrargs arg = { .fh = NFS_FH(inode), .iap = sattr, .server = server, - .bitmask = server->attr_bitmask, + .bitmask = bitmask, .label = ilabel, }; struct nfs_setattrres res = { .fattr = fattr, - .label = olabel, .server = server, }; struct nfs4_exception exception = { @@ -2906,13 +3477,23 @@ static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, .inode = inode, .stateid = &arg.stateid, }; + unsigned long adjust_flags = NFS_INO_INVALID_CHANGE | + NFS_INO_INVALID_CTIME; int err; - arg.bitmask = nfs4_bitmask(server, ilabel); - if (ilabel) - arg.bitmask = nfs4_bitmask(server, olabel); + if (sattr->ia_valid & (ATTR_MODE | ATTR_KILL_SUID | ATTR_KILL_SGID)) + adjust_flags |= NFS_INO_INVALID_MODE; + if (sattr->ia_valid & (ATTR_UID | ATTR_GID)) + adjust_flags |= NFS_INO_INVALID_OTHER; + if (sattr->ia_valid & ATTR_ATIME) + adjust_flags |= NFS_INO_INVALID_ATIME; + if (sattr->ia_valid & ATTR_MTIME) + adjust_flags |= NFS_INO_INVALID_MTIME; do { + nfs4_bitmap_copy_adjust(bitmask, nfs4_bitmask(server, fattr->label), + inode, adjust_flags); + err = _nfs4_do_setattr(inode, &arg, &res, cred, ctx); switch (err) { case -NFS4ERR_OPENMODE: @@ -2945,6 +3526,102 @@ nfs4_wait_on_layoutreturn(struct inode *inode, struct rpc_task *task) return pnfs_wait_on_layoutreturn(inode, task); } +/* + * Update the seqid of an open stateid + */ +static void nfs4_sync_open_stateid(nfs4_stateid *dst, + struct nfs4_state *state) +{ + __be32 seqid_open; + u32 dst_seqid; + int seq; + + for (;;) { + if (!nfs4_valid_open_stateid(state)) + break; + seq = read_seqbegin(&state->seqlock); + if (!nfs4_state_match_open_stateid_other(state, dst)) { + nfs4_stateid_copy(dst, &state->open_stateid); + if (read_seqretry(&state->seqlock, seq)) + continue; + break; + } + seqid_open = state->open_stateid.seqid; + if (read_seqretry(&state->seqlock, seq)) + continue; + + dst_seqid = be32_to_cpu(dst->seqid); + if ((s32)(dst_seqid - be32_to_cpu(seqid_open)) < 0) + dst->seqid = seqid_open; + break; + } +} + +/* + * Update the seqid of an open stateid after receiving + * NFS4ERR_OLD_STATEID + */ +static bool nfs4_refresh_open_old_stateid(nfs4_stateid *dst, + struct nfs4_state *state) +{ + __be32 seqid_open; + u32 dst_seqid; + bool ret; + int seq, status = -EAGAIN; + DEFINE_WAIT(wait); + + for (;;) { + ret = false; + if (!nfs4_valid_open_stateid(state)) + break; + seq = read_seqbegin(&state->seqlock); + if (!nfs4_state_match_open_stateid_other(state, dst)) { + if (read_seqretry(&state->seqlock, seq)) + continue; + break; + } + + write_seqlock(&state->seqlock); + seqid_open = state->open_stateid.seqid; + + dst_seqid = be32_to_cpu(dst->seqid); + + /* Did another OPEN bump the state's seqid? try again: */ + if ((s32)(be32_to_cpu(seqid_open) - dst_seqid) > 0) { + dst->seqid = seqid_open; + write_sequnlock(&state->seqlock); + ret = true; + break; + } + + /* server says we're behind but we haven't seen the update yet */ + set_bit(NFS_STATE_CHANGE_WAIT, &state->flags); + prepare_to_wait(&state->waitq, &wait, TASK_KILLABLE); + write_sequnlock(&state->seqlock); + trace_nfs4_close_stateid_update_wait(state->inode, dst, 0); + + if (fatal_signal_pending(current) || nfs_current_task_exiting()) + status = -EINTR; + else + if (schedule_timeout(5*HZ) != 0) + status = 0; + + finish_wait(&state->waitq, &wait); + + if (!status) + continue; + if (status == -EINTR) + break; + + /* we slept the whole 5 seconds, we must have lost a seqid */ + dst->seqid = cpu_to_be32(dst_seqid + 1); + ret = true; + break; + } + + return ret; +} + struct nfs4_closedata { struct inode *inode; struct nfs4_state *state; @@ -2959,6 +3636,7 @@ struct nfs4_closedata { } lr; struct nfs_fattr fattr; unsigned long timestamp; + unsigned short retrans; }; static void nfs4_free_closedata(void *data) @@ -2983,36 +3661,21 @@ static void nfs4_close_done(struct rpc_task *task, void *data) struct nfs4_state *state = calldata->state; struct nfs_server *server = NFS_SERVER(calldata->inode); nfs4_stateid *res_stateid = NULL; + struct nfs4_exception exception = { + .state = state, + .inode = calldata->inode, + .stateid = &calldata->arg.stateid, + .retrans = calldata->retrans, + }; - dprintk("%s: begin!\n", __func__); if (!nfs4_sequence_done(task, &calldata->res.seq_res)) return; trace_nfs4_close(state, &calldata->arg, &calldata->res, task->tk_status); /* Handle Layoutreturn errors */ - if (calldata->arg.lr_args && task->tk_status != 0) { - switch (calldata->res.lr_ret) { - default: - calldata->res.lr_ret = -NFS4ERR_NOMATCHING_LAYOUT; - break; - case 0: - calldata->arg.lr_args = NULL; - calldata->res.lr_res = NULL; - break; - case -NFS4ERR_ADMIN_REVOKED: - case -NFS4ERR_DELEG_REVOKED: - case -NFS4ERR_EXPIRED: - case -NFS4ERR_BAD_STATEID: - case -NFS4ERR_OLD_STATEID: - case -NFS4ERR_UNKNOWN_LAYOUTTYPE: - case -NFS4ERR_WRONG_CRED: - calldata->arg.lr_args = NULL; - calldata->res.lr_res = NULL; - calldata->res.lr_ret = 0; - rpc_restart_call_prepare(task); - return; - } - } + if (pnfs_roc_done(task, &calldata->arg.lr_args, &calldata->res.lr_res, + &calldata->res.lr_ret) == -EAGAIN) + goto out_restart; /* hmm. we are done with the inode, and in the process of freeing * the state_owner. we keep this around to process errors @@ -3026,39 +3689,46 @@ static void nfs4_close_done(struct rpc_task *task, void *data) if (calldata->arg.bitmask != NULL) { calldata->arg.bitmask = NULL; calldata->res.fattr = NULL; - task->tk_status = 0; - rpc_restart_call_prepare(task); - goto out_release; + goto out_restart; } break; + case -NFS4ERR_OLD_STATEID: + /* Did we race with OPEN? */ + if (nfs4_refresh_open_old_stateid(&calldata->arg.stateid, + state)) + goto out_restart; + goto out_release; case -NFS4ERR_ADMIN_REVOKED: case -NFS4ERR_STALE_STATEID: case -NFS4ERR_EXPIRED: nfs4_free_revoked_stateid(server, &calldata->arg.stateid, task->tk_msg.rpc_cred); - case -NFS4ERR_OLD_STATEID: + fallthrough; case -NFS4ERR_BAD_STATEID: - if (!nfs4_stateid_match(&calldata->arg.stateid, - &state->open_stateid)) { - rpc_restart_call_prepare(task); - goto out_release; - } if (calldata->arg.fmode == 0) break; + fallthrough; default: - if (nfs4_async_handle_error(task, server, state, NULL) == -EAGAIN) { - rpc_restart_call_prepare(task); - goto out_release; - } + task->tk_status = nfs4_async_handle_exception(task, + server, task->tk_status, &exception); + calldata->retrans = exception.retrans; + if (exception.retry) + goto out_restart; } nfs_clear_open_stateid(state, &calldata->arg.stateid, res_stateid, calldata->arg.fmode); out_release: + task->tk_status = 0; nfs_release_seqid(calldata->arg.seqid); nfs_refresh_inode(calldata->inode, &calldata->fattr); - dprintk("%s: done, ret = %d!\n", __func__, task->tk_status); + dprintk("%s: ret = %d\n", __func__, task->tk_status); + return; +out_restart: + task->tk_status = 0; + rpc_restart_call_prepare(task); + goto out_release; } static void nfs4_close_prepare(struct rpc_task *task, void *data) @@ -3066,10 +3736,11 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data) struct nfs4_closedata *calldata = data; struct nfs4_state *state = calldata->state; struct inode *inode = calldata->inode; + struct nfs_server *server = NFS_SERVER(inode); + struct pnfs_layout_hdr *lo; bool is_rdonly, is_wronly, is_rdwr; int call_close = 0; - dprintk("%s: begin!\n", __func__); if (nfs_wait_on_sequence(calldata->arg.seqid, task) != 0) goto out_wait; @@ -3078,7 +3749,6 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data) is_rdwr = test_bit(NFS_O_RDWR_STATE, &state->flags); is_rdonly = test_bit(NFS_O_RDONLY_STATE, &state->flags); is_wronly = test_bit(NFS_O_WRONLY_STATE, &state->flags); - nfs4_stateid_copy(&calldata->arg.stateid, &state->open_stateid); /* Calculate the change in open mode */ calldata->arg.fmode = 0; if (state->n_rdwr == 0) { @@ -3095,8 +3765,8 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data) } else if (is_rdwr) calldata->arg.fmode |= FMODE_READ|FMODE_WRITE; - if (!nfs4_valid_open_stateid(state) || - test_bit(NFS_OPEN_STATE, &state->flags) == 0) + nfs4_sync_open_stateid(&calldata->arg.stateid, state); + if (!nfs4_valid_open_stateid(state)) call_close = 0; spin_unlock(&state->owner->so_lock); @@ -3110,20 +3780,28 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data) goto out_wait; } + lo = calldata->arg.lr_args ? calldata->arg.lr_args->layout : NULL; + if (lo && !pnfs_layout_is_valid(lo)) { + calldata->arg.lr_args = NULL; + calldata->res.lr_res = NULL; + } + if (calldata->arg.fmode == 0) task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CLOSE]; if (calldata->arg.fmode == 0 || calldata->arg.fmode == FMODE_READ) { /* Close-to-open cache consistency revalidation */ - if (!nfs4_have_delegation(inode, FMODE_READ)) - calldata->arg.bitmask = NFS_SERVER(inode)->cache_consistency_bitmask; - else + if (!nfs4_have_delegation(inode, FMODE_READ, 0)) { + nfs4_bitmask_set(calldata->arg.bitmask_store, + server->cache_consistency_bitmask, + inode, 0); + calldata->arg.bitmask = calldata->arg.bitmask_store; + } else calldata->arg.bitmask = NULL; } calldata->arg.share_access = - nfs4_map_atomic_open_share(NFS_SERVER(inode), - calldata->arg.fmode, 0); + nfs4_fmode_to_share_access(calldata->arg.fmode); if (calldata->res.fattr == NULL) calldata->arg.bitmask = NULL; @@ -3135,7 +3813,6 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data) &calldata->res.seq_res, task) != 0) nfs_release_seqid(calldata->arg.seqid); - dprintk("%s: done!\n", __func__); return; out_no_action: task->tk_action = NULL; @@ -3176,20 +3853,25 @@ int nfs4_do_close(struct nfs4_state *state, gfp_t gfp_mask, int wait) .rpc_message = &msg, .callback_ops = &nfs4_close_ops, .workqueue = nfsiod_workqueue, - .flags = RPC_TASK_ASYNC, + .flags = RPC_TASK_ASYNC | RPC_TASK_CRED_NOREF, }; int status = -ENOMEM; + if (nfs_server_capable(state->inode, NFS_CAP_MOVEABLE)) + task_setup_data.flags |= RPC_TASK_MOVEABLE; + nfs4_state_protect(server->nfs_client, NFS_SP4_MACH_CRED_CLEANUP, &task_setup_data.rpc_client, &msg); calldata = kzalloc(sizeof(*calldata), gfp_mask); if (calldata == NULL) goto out; - nfs4_init_sequence(&calldata->arg.seq_args, &calldata->res.seq_res, 1); + nfs4_init_sequence(&calldata->arg.seq_args, &calldata->res.seq_res, 1, 0); calldata->inode = state->inode; calldata->state = state; calldata->arg.fh = NFS_FH(state->inode); + if (!nfs4_copy_open_stateid(&calldata->arg.stateid, state)) + goto out_free_calldata; /* Serialization for the sequence id */ alloc_seqid = server->nfs_client->cl_mvops->alloc_seqid; calldata->arg.seqid = alloc_seqid(&state->owner->so_seqid, gfp_mask); @@ -3234,7 +3916,7 @@ nfs4_atomic_open(struct inode *dir, struct nfs_open_context *ctx, int open_flags, struct iattr *attr, int *opened) { struct nfs4_state *state; - struct nfs4_label l = {0, 0, 0, NULL}, *label = NULL; + struct nfs4_label l, *label; label = nfs4_label_init_security(dir, ctx->dentry, attr, &l); @@ -3250,21 +3932,39 @@ nfs4_atomic_open(struct inode *dir, struct nfs_open_context *ctx, static void nfs4_close_context(struct nfs_open_context *ctx, int is_sync) { + struct dentry *dentry = ctx->dentry; if (ctx->state == NULL) return; + if (dentry->d_flags & DCACHE_NFSFS_RENAMED) + nfs4_inode_set_return_delegation_on_close(d_inode(dentry)); if (is_sync) - nfs4_close_sync(ctx->state, ctx->mode); + nfs4_close_sync(ctx->state, _nfs4_ctx_to_openmode(ctx)); else - nfs4_close_state(ctx->state, ctx->mode); + nfs4_close_state(ctx->state, _nfs4_ctx_to_openmode(ctx)); } #define FATTR4_WORD1_NFS40_MASK (2*FATTR4_WORD1_MOUNTED_ON_FILEID - 1UL) #define FATTR4_WORD2_NFS41_MASK (2*FATTR4_WORD2_SUPPATTR_EXCLCREAT - 1UL) -#define FATTR4_WORD2_NFS42_MASK (2*FATTR4_WORD2_MODE_UMASK - 1UL) +#define FATTR4_WORD2_NFS42_MASK (2*FATTR4_WORD2_OPEN_ARGUMENTS - 1UL) + +#define FATTR4_WORD2_NFS42_TIME_DELEG_MASK \ + (FATTR4_WORD2_TIME_DELEG_MODIFY|FATTR4_WORD2_TIME_DELEG_ACCESS) +static bool nfs4_server_delegtime_capable(struct nfs4_server_caps_res *res) +{ + u32 share_access_want = res->open_caps.oa_share_access_want[0]; + u32 attr_bitmask = res->attr_bitmask[2]; + + return (share_access_want & NFS4_SHARE_WANT_DELEG_TIMESTAMPS) && + ((attr_bitmask & FATTR4_WORD2_NFS42_TIME_DELEG_MASK) == + FATTR4_WORD2_NFS42_TIME_DELEG_MASK); +} static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle) { - u32 bitmask[3] = {}, minorversion = server->nfs_client->cl_minorversion; + u32 minorversion = server->nfs_client->cl_minorversion; + u32 bitmask[3] = { + [0] = FATTR4_WORD0_SUPPORTED_ATTRS, + }; struct nfs4_server_caps_arg args = { .fhandle = fhandle, .bitmask = bitmask, @@ -3282,12 +3982,24 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f FATTR4_WORD0_FH_EXPIRE_TYPE | FATTR4_WORD0_LINK_SUPPORT | FATTR4_WORD0_SYMLINK_SUPPORT | - FATTR4_WORD0_ACLSUPPORT; + FATTR4_WORD0_ACLSUPPORT | + FATTR4_WORD0_CASE_INSENSITIVE | + FATTR4_WORD0_CASE_PRESERVING; if (minorversion) bitmask[2] = FATTR4_WORD2_SUPPATTR_EXCLCREAT; + if (minorversion > 1) + bitmask[2] |= FATTR4_WORD2_OPEN_ARGUMENTS; status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0); if (status == 0) { + bitmask[0] = (FATTR4_WORD0_SUPPORTED_ATTRS | + FATTR4_WORD0_FH_EXPIRE_TYPE | + FATTR4_WORD0_LINK_SUPPORT | + FATTR4_WORD0_SYMLINK_SUPPORT | + FATTR4_WORD0_ACLSUPPORT | + FATTR4_WORD0_CASE_INSENSITIVE | + FATTR4_WORD0_CASE_PRESERVING) & + res.attr_bitmask[0]; /* Sanity check the server answers */ switch (minorversion) { case 0: @@ -3296,17 +4008,21 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f break; case 1: res.attr_bitmask[2] &= FATTR4_WORD2_NFS41_MASK; + bitmask[2] = FATTR4_WORD2_SUPPATTR_EXCLCREAT & + res.attr_bitmask[2]; break; case 2: res.attr_bitmask[2] &= FATTR4_WORD2_NFS42_MASK; + bitmask[2] = (FATTR4_WORD2_SUPPATTR_EXCLCREAT | + FATTR4_WORD2_OPEN_ARGUMENTS) & + res.attr_bitmask[2]; } memcpy(server->attr_bitmask, res.attr_bitmask, sizeof(server->attr_bitmask)); - server->caps &= ~(NFS_CAP_ACLS|NFS_CAP_HARDLINKS| - NFS_CAP_SYMLINKS|NFS_CAP_FILEID| - NFS_CAP_MODE|NFS_CAP_NLINK|NFS_CAP_OWNER| - NFS_CAP_OWNER_GROUP|NFS_CAP_ATIME| - NFS_CAP_CTIME|NFS_CAP_MTIME| - NFS_CAP_SECURITY_LABEL); + server->caps &= + ~(NFS_CAP_ACLS | NFS_CAP_HARDLINKS | NFS_CAP_SYMLINKS | + NFS_CAP_SECURITY_LABEL | NFS_CAP_FS_LOCATIONS | + NFS_CAP_OPEN_XOR | NFS_CAP_DELEGTIME); + server->fattr_valid = NFS_ATTR_FATTR_V4; if (res.attr_bitmask[0] & FATTR4_WORD0_ACL && res.acl_bitmask & ACL4_SUPPORT_ALLOW_ACL) server->caps |= NFS_CAP_ACLS; @@ -3314,30 +4030,50 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f server->caps |= NFS_CAP_HARDLINKS; if (res.has_symlinks != 0) server->caps |= NFS_CAP_SYMLINKS; - if (res.attr_bitmask[0] & FATTR4_WORD0_FILEID) - server->caps |= NFS_CAP_FILEID; - if (res.attr_bitmask[1] & FATTR4_WORD1_MODE) - server->caps |= NFS_CAP_MODE; - if (res.attr_bitmask[1] & FATTR4_WORD1_NUMLINKS) - server->caps |= NFS_CAP_NLINK; - if (res.attr_bitmask[1] & FATTR4_WORD1_OWNER) - server->caps |= NFS_CAP_OWNER; - if (res.attr_bitmask[1] & FATTR4_WORD1_OWNER_GROUP) - server->caps |= NFS_CAP_OWNER_GROUP; - if (res.attr_bitmask[1] & FATTR4_WORD1_TIME_ACCESS) - server->caps |= NFS_CAP_ATIME; - if (res.attr_bitmask[1] & FATTR4_WORD1_TIME_METADATA) - server->caps |= NFS_CAP_CTIME; - if (res.attr_bitmask[1] & FATTR4_WORD1_TIME_MODIFY) - server->caps |= NFS_CAP_MTIME; + if (res.case_insensitive) + server->caps |= NFS_CAP_CASE_INSENSITIVE; + if (res.case_preserving) + server->caps |= NFS_CAP_CASE_PRESERVING; #ifdef CONFIG_NFS_V4_SECURITY_LABEL if (res.attr_bitmask[2] & FATTR4_WORD2_SECURITY_LABEL) server->caps |= NFS_CAP_SECURITY_LABEL; #endif + if (res.attr_bitmask[0] & FATTR4_WORD0_FS_LOCATIONS) + server->caps |= NFS_CAP_FS_LOCATIONS; + if (!(res.attr_bitmask[0] & FATTR4_WORD0_FILEID)) + server->fattr_valid &= ~NFS_ATTR_FATTR_FILEID; + if (!(res.attr_bitmask[1] & FATTR4_WORD1_MODE)) + server->fattr_valid &= ~NFS_ATTR_FATTR_MODE; + if (!(res.attr_bitmask[1] & FATTR4_WORD1_NUMLINKS)) + server->fattr_valid &= ~NFS_ATTR_FATTR_NLINK; + if (!(res.attr_bitmask[1] & FATTR4_WORD1_OWNER)) + server->fattr_valid &= ~(NFS_ATTR_FATTR_OWNER | + NFS_ATTR_FATTR_OWNER_NAME); + if (!(res.attr_bitmask[1] & FATTR4_WORD1_OWNER_GROUP)) + server->fattr_valid &= ~(NFS_ATTR_FATTR_GROUP | + NFS_ATTR_FATTR_GROUP_NAME); + if (!(res.attr_bitmask[1] & FATTR4_WORD1_SPACE_USED)) + server->fattr_valid &= ~NFS_ATTR_FATTR_SPACE_USED; + if (!(res.attr_bitmask[1] & FATTR4_WORD1_TIME_ACCESS)) + server->fattr_valid &= ~NFS_ATTR_FATTR_ATIME; + if (!(res.attr_bitmask[1] & FATTR4_WORD1_TIME_METADATA)) + server->fattr_valid &= ~NFS_ATTR_FATTR_CTIME; + if (!(res.attr_bitmask[1] & FATTR4_WORD1_TIME_MODIFY)) + server->fattr_valid &= ~NFS_ATTR_FATTR_MTIME; + if (!(res.attr_bitmask[1] & FATTR4_WORD1_TIME_MODIFY)) + server->fattr_valid &= ~NFS_ATTR_FATTR_MTIME; + if (!(res.attr_bitmask[1] & FATTR4_WORD1_TIME_CREATE)) + server->fattr_valid &= ~NFS_ATTR_FATTR_BTIME; memcpy(server->attr_bitmask_nl, res.attr_bitmask, sizeof(server->attr_bitmask)); server->attr_bitmask_nl[2] &= ~FATTR4_WORD2_SECURITY_LABEL; + if (res.open_caps.oa_share_access_want[0] & + NFS4_SHARE_WANT_OPEN_XOR_DELEGATION) + server->caps |= NFS_CAP_OPEN_XOR; + if (nfs4_server_delegtime_capable(&res)) + server->caps |= NFS_CAP_DELEGTIME; + memcpy(server->cache_consistency_bitmask, res.attr_bitmask, sizeof(server->cache_consistency_bitmask)); server->cache_consistency_bitmask[0] &= FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE; server->cache_consistency_bitmask[1] &= FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY; @@ -3358,8 +4094,11 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle) { - struct nfs4_exception exception = { }; + struct nfs4_exception exception = { + .interruptible = true, + }; int err; + do { err = nfs4_handle_exception(server, _nfs4_server_capabilities(server, fhandle), @@ -3368,16 +4107,157 @@ int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle) return err; } +static void test_fs_location_for_trunking(struct nfs4_fs_location *location, + struct nfs_client *clp, + struct nfs_server *server) +{ + int i; + + for (i = 0; i < location->nservers; i++) { + struct nfs4_string *srv_loc = &location->servers[i]; + struct sockaddr_storage addr; + size_t addrlen; + struct xprt_create xprt_args = { + .ident = 0, + .net = clp->cl_net, + }; + struct nfs4_add_xprt_data xprtdata = { + .clp = clp, + }; + struct rpc_add_xprt_test rpcdata = { + .add_xprt_test = clp->cl_mvops->session_trunk, + .data = &xprtdata, + }; + char *servername = NULL; + + if (!srv_loc->len) + continue; + + addrlen = nfs_parse_server_name(srv_loc->data, srv_loc->len, + &addr, sizeof(addr), + clp->cl_net, server->port); + if (!addrlen) + return; + xprt_args.dstaddr = (struct sockaddr *)&addr; + xprt_args.addrlen = addrlen; + servername = kmalloc(srv_loc->len + 1, GFP_KERNEL); + if (!servername) + return; + memcpy(servername, srv_loc->data, srv_loc->len); + servername[srv_loc->len] = '\0'; + xprt_args.servername = servername; + + xprtdata.cred = nfs4_get_clid_cred(clp); + rpc_clnt_add_xprt(clp->cl_rpcclient, &xprt_args, + rpc_clnt_setup_test_and_add_xprt, + &rpcdata); + if (xprtdata.cred) + put_cred(xprtdata.cred); + kfree(servername); + } +} + +static bool _is_same_nfs4_pathname(struct nfs4_pathname *path1, + struct nfs4_pathname *path2) +{ + int i; + + if (path1->ncomponents != path2->ncomponents) + return false; + for (i = 0; i < path1->ncomponents; i++) { + if (path1->components[i].len != path2->components[i].len) + return false; + if (memcmp(path1->components[i].data, path2->components[i].data, + path1->components[i].len)) + return false; + } + return true; +} + +static int _nfs4_discover_trunking(struct nfs_server *server, + struct nfs_fh *fhandle) +{ + struct nfs4_fs_locations *locations = NULL; + struct page *page; + const struct cred *cred; + struct nfs_client *clp = server->nfs_client; + const struct nfs4_state_maintenance_ops *ops = + clp->cl_mvops->state_renewal_ops; + int status = -ENOMEM, i; + + cred = ops->get_state_renewal_cred(clp); + if (cred == NULL) { + cred = nfs4_get_clid_cred(clp); + if (cred == NULL) + return -ENOKEY; + } + + page = alloc_page(GFP_KERNEL); + if (!page) + goto out_put_cred; + locations = kmalloc(sizeof(struct nfs4_fs_locations), GFP_KERNEL); + if (!locations) + goto out_free; + locations->fattr = nfs_alloc_fattr(); + if (!locations->fattr) + goto out_free_2; + + status = nfs4_proc_get_locations(server, fhandle, locations, page, + cred); + if (status) + goto out_free_3; + + for (i = 0; i < locations->nlocations; i++) { + if (!_is_same_nfs4_pathname(&locations->fs_path, + &locations->locations[i].rootpath)) + continue; + test_fs_location_for_trunking(&locations->locations[i], clp, + server); + } +out_free_3: + kfree(locations->fattr); +out_free_2: + kfree(locations); +out_free: + __free_page(page); +out_put_cred: + put_cred(cred); + return status; +} + +static int nfs4_discover_trunking(struct nfs_server *server, + struct nfs_fh *fhandle) +{ + struct nfs4_exception exception = { + .interruptible = true, + }; + struct nfs_client *clp = server->nfs_client; + int err = 0; + + if (!nfs4_has_session(clp)) + goto out; + do { + err = nfs4_handle_exception(server, + _nfs4_discover_trunking(server, fhandle), + &exception); + } while (exception.retry); +out: + return err; +} + static int _nfs4_lookup_root(struct nfs_server *server, struct nfs_fh *fhandle, - struct nfs_fsinfo *info) + struct nfs_fattr *fattr) { - u32 bitmask[3]; + u32 bitmask[3] = { + [0] = FATTR4_WORD0_TYPE | FATTR4_WORD0_CHANGE | + FATTR4_WORD0_SIZE | FATTR4_WORD0_FSID, + }; struct nfs4_lookup_root_arg args = { .bitmask = bitmask, }; struct nfs4_lookup_res res = { .server = server, - .fattr = info->fattr, + .fattr = fattr, .fh = fhandle, }; struct rpc_message msg = { @@ -3386,25 +4266,20 @@ static int _nfs4_lookup_root(struct nfs_server *server, struct nfs_fh *fhandle, .rpc_resp = &res, }; - bitmask[0] = nfs4_fattr_bitmap[0]; - bitmask[1] = nfs4_fattr_bitmap[1]; - /* - * Process the label in the upcoming getfattr - */ - bitmask[2] = nfs4_fattr_bitmap[2] & ~FATTR4_WORD2_SECURITY_LABEL; - - nfs_fattr_init(info->fattr); + nfs_fattr_init(fattr); return nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0); } static int nfs4_lookup_root(struct nfs_server *server, struct nfs_fh *fhandle, - struct nfs_fsinfo *info) + struct nfs_fattr *fattr) { - struct nfs4_exception exception = { }; + struct nfs4_exception exception = { + .interruptible = true, + }; int err; do { - err = _nfs4_lookup_root(server, fhandle, info); - trace_nfs4_lookup_root(server, fhandle, info->fattr, err); + err = _nfs4_lookup_root(server, fhandle, fattr); + trace_nfs4_lookup_root(server, fhandle, fattr, err); switch (err) { case 0: case -NFS4ERR_WRONGSEC: @@ -3417,8 +4292,9 @@ out: return err; } -static int nfs4_lookup_root_sec(struct nfs_server *server, struct nfs_fh *fhandle, - struct nfs_fsinfo *info, rpc_authflavor_t flavor) +static int nfs4_lookup_root_sec(struct nfs_server *server, + struct nfs_fh *fhandle, struct nfs_fattr *fattr, + rpc_authflavor_t flavor) { struct rpc_auth_create_args auth_args = { .pseudoflavor = flavor, @@ -3428,7 +4304,7 @@ static int nfs4_lookup_root_sec(struct nfs_server *server, struct nfs_fh *fhandl auth = rpcauth_create(&auth_args, server->client); if (IS_ERR(auth)) return -EACCES; - return nfs4_lookup_root(server, fhandle, info); + return nfs4_lookup_root(server, fhandle, fattr); } /* @@ -3441,7 +4317,7 @@ static int nfs4_lookup_root_sec(struct nfs_server *server, struct nfs_fh *fhandl * negative errno value. */ static int nfs4_find_root_sec(struct nfs_server *server, struct nfs_fh *fhandle, - struct nfs_fsinfo *info) + struct nfs_fattr *fattr) { /* Per 3530bis 15.33.5 */ static const rpc_authflavor_t flav_array[] = { @@ -3457,8 +4333,9 @@ static int nfs4_find_root_sec(struct nfs_server *server, struct nfs_fh *fhandle, if (server->auth_info.flavor_len > 0) { /* try each flavor specified by user */ for (i = 0; i < server->auth_info.flavor_len; i++) { - status = nfs4_lookup_root_sec(server, fhandle, info, - server->auth_info.flavors[i]); + status = nfs4_lookup_root_sec( + server, fhandle, fattr, + server->auth_info.flavors[i]); if (status == -NFS4ERR_WRONGSEC || status == -EACCES) continue; break; @@ -3466,7 +4343,7 @@ static int nfs4_find_root_sec(struct nfs_server *server, struct nfs_fh *fhandle, } else { /* no flavors specified by user, try default list */ for (i = 0; i < ARRAY_SIZE(flav_array); i++) { - status = nfs4_lookup_root_sec(server, fhandle, info, + status = nfs4_lookup_root_sec(server, fhandle, fattr, flav_array[i]); if (status == -NFS4ERR_WRONGSEC || status == -EACCES) continue; @@ -3475,7 +4352,7 @@ static int nfs4_find_root_sec(struct nfs_server *server, struct nfs_fh *fhandle, } /* - * -EACCESS could mean that the user doesn't have correct permissions + * -EACCES could mean that the user doesn't have correct permissions * to access the mount. It could also mean that we tried to mount * with a gss auth flavor, but rpc.gssd isn't running. Either way, * existing mount programs don't handle -EACCES very well so it should @@ -3490,28 +4367,22 @@ static int nfs4_find_root_sec(struct nfs_server *server, struct nfs_fh *fhandle, * nfs4_proc_get_rootfh - get file handle for server's pseudoroot * @server: initialized nfs_server handle * @fhandle: we fill in the pseudo-fs root file handle - * @info: we fill in an FSINFO struct + * @fattr: we fill in a bare bones struct fattr * @auth_probe: probe the auth flavours * * Returns zero on success, or a negative errno. */ int nfs4_proc_get_rootfh(struct nfs_server *server, struct nfs_fh *fhandle, - struct nfs_fsinfo *info, - bool auth_probe) + struct nfs_fattr *fattr, bool auth_probe) { int status = 0; if (!auth_probe) - status = nfs4_lookup_root(server, fhandle, info); + status = nfs4_lookup_root(server, fhandle, fattr); if (auth_probe || status == NFS4ERR_WRONGSEC) - status = server->nfs_client->cl_mvops->find_root_sec(server, - fhandle, info); - - if (status == 0) - status = nfs4_server_capabilities(server, fhandle); - if (status == 0) - status = nfs4_do_fsinfo(server, fhandle, info); + status = server->nfs_client->cl_mvops->find_root_sec( + server, fhandle, fattr); return nfs4_map_errors(status); } @@ -3521,7 +4392,6 @@ static int nfs4_proc_get_root(struct nfs_server *server, struct nfs_fh *mntfh, { int error; struct nfs_fattr *fattr = info->fattr; - struct nfs4_label *label = NULL; error = nfs4_server_capabilities(server, mntfh); if (error < 0) { @@ -3529,23 +4399,17 @@ static int nfs4_proc_get_root(struct nfs_server *server, struct nfs_fh *mntfh, return error; } - label = nfs4_label_alloc(server, GFP_KERNEL); - if (IS_ERR(label)) - return PTR_ERR(label); - - error = nfs4_proc_getattr(server, mntfh, fattr, label); + error = nfs4_proc_getattr(server, mntfh, fattr, NULL); if (error < 0) { dprintk("nfs4_get_root: getattr error = %d\n", -error); - goto err_free_label; + goto out; } if (fattr->valid & NFS_ATTR_FATTR_FSID && !nfs_fsid_equal(&server->fsid, &fattr->fsid)) memcpy(&server->fsid, &fattr->fsid, sizeof(server->fsid)); -err_free_label: - nfs4_label_free(label); - +out: return error; } @@ -3569,6 +4433,8 @@ static int nfs4_get_referral(struct rpc_clnt *client, struct inode *dir, if (locations == NULL) goto out; + locations->fattr = fattr; + status = nfs4_proc_fs_locations(client, dir, name, locations, page); if (status != 0) goto out; @@ -3578,17 +4444,14 @@ static int nfs4_get_referral(struct rpc_clnt *client, struct inode *dir, * referral. Cause us to drop into the exception handler, which * will kick off migration recovery. */ - if (nfs_fsid_equal(&NFS_SERVER(dir)->fsid, &locations->fattr.fsid)) { + if (nfs_fsid_equal(&NFS_SERVER(dir)->fsid, &fattr->fsid)) { dprintk("%s: server did not return a different fsid for" " a referral at %s\n", __func__, name->name); status = -NFS4ERR_MOVED; goto out; } /* Fixup attributes for the nfs_lookup() call to nfs_fhget() */ - nfs_fixup_referral_attributes(&locations->fattr); - - /* replace the lookup nfs_fattr with the locations nfs_fattr */ - memcpy(fattr, &locations->fattr, sizeof(struct nfs_fattr)); + nfs_fixup_referral_attributes(fattr); memset(fhandle, 0, sizeof(struct nfs_fh)); out: if (page) @@ -3598,15 +4461,15 @@ out: } static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, - struct nfs_fattr *fattr, struct nfs4_label *label) + struct nfs_fattr *fattr, struct inode *inode) { + __u32 bitmask[NFS4_BITMASK_SZ]; struct nfs4_getattr_arg args = { .fh = fhandle, - .bitmask = server->attr_bitmask, + .bitmask = bitmask, }; struct nfs4_getattr_res res = { .fattr = fattr, - .label = label, .server = server, }; struct rpc_message msg = { @@ -3614,20 +4477,31 @@ static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, .rpc_argp = &args, .rpc_resp = &res, }; + unsigned short task_flags = 0; + + if (nfs4_has_session(server->nfs_client)) + task_flags = RPC_TASK_MOVEABLE; - args.bitmask = nfs4_bitmask(server, label); + /* Is this is an attribute revalidation, subject to softreval? */ + if (inode && (server->flags & NFS_MOUNT_SOFTREVAL)) + task_flags |= RPC_TASK_TIMEOUT; + nfs4_bitmap_copy_adjust(bitmask, nfs4_bitmask(server, fattr->label), inode, 0); nfs_fattr_init(fattr); - return nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0); + nfs4_init_sequence(&args.seq_args, &res.seq_res, 0, 0); + return nfs4_do_call_sync(server->client, server, &msg, + &args.seq_args, &res.seq_res, task_flags); } -static int nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, - struct nfs_fattr *fattr, struct nfs4_label *label) +int nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, + struct nfs_fattr *fattr, struct inode *inode) { - struct nfs4_exception exception = { }; + struct nfs4_exception exception = { + .interruptible = true, + }; int err; do { - err = _nfs4_proc_getattr(server, fhandle, fattr, label); + err = _nfs4_proc_getattr(server, fhandle, fattr, inode); trace_nfs4_getattr(server, fhandle, fattr, err); err = nfs4_handle_exception(server, err, &exception); @@ -3657,9 +4531,8 @@ nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr, struct iattr *sattr) { struct inode *inode = d_inode(dentry); - struct rpc_cred *cred = NULL; + const struct cred *cred = NULL; struct nfs_open_context *ctx = NULL; - struct nfs4_label *label = NULL; int status; if (pnfs_ld_layoutret_on_setattr(inode) && @@ -3685,22 +4558,21 @@ nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr, cred = ctx->cred; } - label = nfs4_label_alloc(NFS_SERVER(inode), GFP_KERNEL); - if (IS_ERR(label)) - return PTR_ERR(label); + /* Return any delegations if we're going to change ACLs */ + if ((sattr->ia_valid & (ATTR_MODE|ATTR_UID|ATTR_GID)) != 0) + nfs4_inode_make_writeable(inode); - status = nfs4_do_setattr(inode, cred, fattr, sattr, ctx, NULL, label); + status = nfs4_do_setattr(inode, cred, fattr, sattr, ctx, NULL); if (status == 0) { nfs_setattr_update_inode(inode, sattr, fattr); - nfs_setsecurity(inode, fattr, label); + nfs_setsecurity(inode, fattr); } - nfs4_label_free(label); return status; } static int _nfs4_proc_lookup(struct rpc_clnt *clnt, struct inode *dir, - const struct qstr *name, struct nfs_fh *fhandle, - struct nfs_fattr *fattr, struct nfs4_label *label) + struct dentry *dentry, const struct qstr *name, + struct nfs_fh *fhandle, struct nfs_fattr *fattr) { struct nfs_server *server = NFS_SERVER(dir); int status; @@ -3712,7 +4584,6 @@ static int _nfs4_proc_lookup(struct rpc_clnt *clnt, struct inode *dir, struct nfs4_lookup_res res = { .server = server, .fattr = fattr, - .label = label, .fh = fhandle, }; struct rpc_message msg = { @@ -3720,13 +4591,23 @@ static int _nfs4_proc_lookup(struct rpc_clnt *clnt, struct inode *dir, .rpc_argp = &args, .rpc_resp = &res, }; + unsigned short task_flags = 0; + + if (nfs_server_capable(dir, NFS_CAP_MOVEABLE)) + task_flags = RPC_TASK_MOVEABLE; - args.bitmask = nfs4_bitmask(server, label); + /* Is this is an attribute revalidation, subject to softreval? */ + if (nfs_lookup_is_soft_revalidate(dentry)) + task_flags |= RPC_TASK_TIMEOUT; + + args.bitmask = nfs4_bitmask(server, fattr->label); nfs_fattr_init(fattr); - dprintk("NFS call lookup %s\n", name->name); - status = nfs4_call_sync(clnt, server, &msg, &args.seq_args, &res.seq_res, 0); + dprintk("NFS call lookup %pd2\n", dentry); + nfs4_init_sequence(&args.seq_args, &res.seq_res, 0, 0); + status = nfs4_do_call_sync(clnt, server, &msg, + &args.seq_args, &res.seq_res, task_flags); dprintk("NFS reply lookup: %d\n", status); return status; } @@ -3740,14 +4621,16 @@ static void nfs_fixup_secinfo_attributes(struct nfs_fattr *fattr) } static int nfs4_proc_lookup_common(struct rpc_clnt **clnt, struct inode *dir, - const struct qstr *name, struct nfs_fh *fhandle, - struct nfs_fattr *fattr, struct nfs4_label *label) + struct dentry *dentry, const struct qstr *name, + struct nfs_fh *fhandle, struct nfs_fattr *fattr) { - struct nfs4_exception exception = { }; + struct nfs4_exception exception = { + .interruptible = true, + }; struct rpc_clnt *client = *clnt; int err; do { - err = _nfs4_proc_lookup(client, dir, name, fhandle, fattr, label); + err = _nfs4_proc_lookup(client, dir, dentry, name, fhandle, fattr); trace_nfs4_lookup(dir, name, err); switch (err) { case -NFS4ERR_BADNAME: @@ -3782,14 +4665,13 @@ out: return err; } -static int nfs4_proc_lookup(struct inode *dir, const struct qstr *name, - struct nfs_fh *fhandle, struct nfs_fattr *fattr, - struct nfs4_label *label) +static int nfs4_proc_lookup(struct inode *dir, struct dentry *dentry, const struct qstr *name, + struct nfs_fh *fhandle, struct nfs_fattr *fattr) { int status; struct rpc_clnt *client = NFS_CLIENT(dir); - status = nfs4_proc_lookup_common(&client, dir, name, fhandle, fattr, label); + status = nfs4_proc_lookup_common(&client, dir, dentry, name, fhandle, fattr); if (client != NFS_CLIENT(dir)) { rpc_shutdown_client(client); nfs_fixup_secinfo_attributes(fattr); @@ -3798,21 +4680,21 @@ static int nfs4_proc_lookup(struct inode *dir, const struct qstr *name, } struct rpc_clnt * -nfs4_proc_lookup_mountpoint(struct inode *dir, const struct qstr *name, +nfs4_proc_lookup_mountpoint(struct inode *dir, struct dentry *dentry, struct nfs_fh *fhandle, struct nfs_fattr *fattr) { struct rpc_clnt *client = NFS_CLIENT(dir); int status; - status = nfs4_proc_lookup_common(&client, dir, name, fhandle, fattr, NULL); + status = nfs4_proc_lookup_common(&client, dir, dentry, &dentry->d_name, + fhandle, fattr); if (status < 0) return ERR_PTR(status); return (client == NFS_CLIENT(dir)) ? rpc_clone_client(client) : client; } static int _nfs4_proc_lookupp(struct inode *inode, - struct nfs_fh *fhandle, struct nfs_fattr *fattr, - struct nfs4_label *label) + struct nfs_fh *fhandle, struct nfs_fattr *fattr) { struct rpc_clnt *clnt = NFS_CLIENT(inode); struct nfs_server *server = NFS_SERVER(inode); @@ -3824,7 +4706,6 @@ static int _nfs4_proc_lookupp(struct inode *inode, struct nfs4_lookupp_res res = { .server = server, .fattr = fattr, - .label = label, .fh = fhandle, }; struct rpc_message msg = { @@ -3832,25 +4713,34 @@ static int _nfs4_proc_lookupp(struct inode *inode, .rpc_argp = &args, .rpc_resp = &res, }; + unsigned short task_flags = 0; + + if (server->flags & NFS_MOUNT_SOFTREVAL) + task_flags |= RPC_TASK_TIMEOUT; + if (server->caps & NFS_CAP_MOVEABLE) + task_flags |= RPC_TASK_MOVEABLE; - args.bitmask = nfs4_bitmask(server, label); + args.bitmask = nfs4_bitmask(server, fattr->label); nfs_fattr_init(fattr); + nfs4_init_sequence(&args.seq_args, &res.seq_res, 0, 0); dprintk("NFS call lookupp ino=0x%lx\n", inode->i_ino); - status = nfs4_call_sync(clnt, server, &msg, &args.seq_args, - &res.seq_res, 0); + status = nfs4_do_call_sync(clnt, server, &msg, &args.seq_args, + &res.seq_res, task_flags); dprintk("NFS reply lookupp: %d\n", status); return status; } static int nfs4_proc_lookupp(struct inode *inode, struct nfs_fh *fhandle, - struct nfs_fattr *fattr, struct nfs4_label *label) + struct nfs_fattr *fattr) { - struct nfs4_exception exception = { }; + struct nfs4_exception exception = { + .interruptible = true, + }; int err; do { - err = _nfs4_proc_lookupp(inode, fhandle, fattr, label); + err = _nfs4_proc_lookupp(inode, fhandle, fattr); trace_nfs4_lookupp(inode, err); err = nfs4_handle_exception(NFS_SERVER(inode), err, &exception); @@ -3858,12 +4748,13 @@ static int nfs4_proc_lookupp(struct inode *inode, struct nfs_fh *fhandle, return err; } -static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry) +static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry, + const struct cred *cred) { struct nfs_server *server = NFS_SERVER(inode); struct nfs4_accessargs args = { .fh = NFS_FH(inode), - .bitmask = server->cache_consistency_bitmask, + .access = entry->mask, }; struct nfs4_accessres res = { .server = server, @@ -3872,47 +4763,35 @@ static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_ACCESS], .rpc_argp = &args, .rpc_resp = &res, - .rpc_cred = entry->cred, + .rpc_cred = cred, }; - int mode = entry->mask; int status = 0; - /* - * Determine which access bits we want to ask for... - */ - if (mode & MAY_READ) - args.access |= NFS4_ACCESS_READ; - if (S_ISDIR(inode->i_mode)) { - if (mode & MAY_WRITE) - args.access |= NFS4_ACCESS_MODIFY | NFS4_ACCESS_EXTEND | NFS4_ACCESS_DELETE; - if (mode & MAY_EXEC) - args.access |= NFS4_ACCESS_LOOKUP; - } else { - if (mode & MAY_WRITE) - args.access |= NFS4_ACCESS_MODIFY | NFS4_ACCESS_EXTEND; - if (mode & MAY_EXEC) - args.access |= NFS4_ACCESS_EXECUTE; + if (!nfs4_have_delegation(inode, FMODE_READ, 0)) { + res.fattr = nfs_alloc_fattr(); + if (res.fattr == NULL) + return -ENOMEM; + args.bitmask = server->cache_consistency_bitmask; } - - res.fattr = nfs_alloc_fattr(); - if (res.fattr == NULL) - return -ENOMEM; - status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0); if (!status) { nfs_access_set_mask(entry, res.access); - nfs_refresh_inode(inode, res.fattr); + if (res.fattr) + nfs_refresh_inode(inode, res.fattr); } nfs_free_fattr(res.fattr); return status; } -static int nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry) +static int nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry, + const struct cred *cred) { - struct nfs4_exception exception = { }; + struct nfs4_exception exception = { + .interruptible = true, + }; int err; do { - err = _nfs4_proc_access(inode, entry); + err = _nfs4_proc_access(inode, entry, cred); trace_nfs4_access(inode, err); err = nfs4_handle_exception(NFS_SERVER(inode), err, &exception); @@ -3964,7 +4843,9 @@ static int _nfs4_proc_readlink(struct inode *inode, struct page *page, static int nfs4_proc_readlink(struct inode *inode, struct page *page, unsigned int pgbase, unsigned int pglen) { - struct nfs4_exception exception = { }; + struct nfs4_exception exception = { + .interruptible = true, + }; int err; do { err = _nfs4_proc_readlink(inode, page, pgbase, pglen); @@ -3983,7 +4864,7 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, int flags) { struct nfs_server *server = NFS_SERVER(dir); - struct nfs4_label l, *ilabel = NULL; + struct nfs4_label l, *ilabel; struct nfs_open_context *ctx; struct nfs4_state *state; int status = 0; @@ -4007,7 +4888,8 @@ out: return status; } -static int _nfs4_proc_remove(struct inode *dir, const struct qstr *name) +static int +_nfs4_proc_remove(struct inode *dir, const struct qstr *name, u32 ftype) { struct nfs_server *server = NFS_SERVER(dir); struct nfs_removeargs args = { @@ -4026,17 +4908,50 @@ static int _nfs4_proc_remove(struct inode *dir, const struct qstr *name) int status; status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 1); - if (status == 0) - update_changeattr(dir, &res.cinfo, timestamp); + if (status == 0) { + spin_lock(&dir->i_lock); + /* Removing a directory decrements nlink in the parent */ + if (ftype == NF4DIR && dir->i_nlink > 2) + nfs4_dec_nlink_locked(dir); + nfs4_update_changeattr_locked(dir, &res.cinfo, timestamp, + NFS_INO_INVALID_DATA); + spin_unlock(&dir->i_lock); + } return status; } -static int nfs4_proc_remove(struct inode *dir, const struct qstr *name) +static int nfs4_proc_remove(struct inode *dir, struct dentry *dentry) { - struct nfs4_exception exception = { }; + struct nfs4_exception exception = { + .interruptible = true, + }; + struct inode *inode = d_inode(dentry); int err; + + if (inode) { + if (inode->i_nlink == 1) + nfs4_inode_return_delegation(inode); + else + nfs4_inode_make_writeable(inode); + } + do { + err = _nfs4_proc_remove(dir, &dentry->d_name, NF4REG); + trace_nfs4_remove(dir, &dentry->d_name, err); + err = nfs4_handle_exception(NFS_SERVER(dir), err, + &exception); + } while (exception.retry); + return err; +} + +static int nfs4_proc_rmdir(struct inode *dir, const struct qstr *name) +{ + struct nfs4_exception exception = { + .interruptible = true, + }; + int err; + do { - err = _nfs4_proc_remove(dir, name); + err = _nfs4_proc_remove(dir, name, NF4DIR); trace_nfs4_remove(dir, name, err); err = nfs4_handle_exception(NFS_SERVER(dir), err, &exception); @@ -4044,17 +4959,23 @@ static int nfs4_proc_remove(struct inode *dir, const struct qstr *name) return err; } -static void nfs4_proc_unlink_setup(struct rpc_message *msg, struct inode *dir) +static void nfs4_proc_unlink_setup(struct rpc_message *msg, + struct dentry *dentry, + struct inode *inode) { - struct nfs_server *server = NFS_SERVER(dir); struct nfs_removeargs *args = msg->rpc_argp; struct nfs_removeres *res = msg->rpc_resp; - res->server = server; + res->server = NFS_SB(dentry->d_sb); msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_REMOVE]; - nfs4_init_sequence(&args->seq_args, &res->seq_res, 1); + nfs4_init_sequence(&args->seq_args, &res->seq_res, 1, 0); nfs_fattr_init(res->dir_attr); + + if (inode) { + nfs4_inode_return_delegation(inode); + nfs_d_prune_case_insensitive_aliases(inode); + } } static void nfs4_proc_unlink_rpc_prepare(struct rpc_task *task, struct nfs_unlinkdata *data) @@ -4076,19 +4997,28 @@ static int nfs4_proc_unlink_done(struct rpc_task *task, struct inode *dir) &data->timeout) == -EAGAIN) return 0; if (task->tk_status == 0) - update_changeattr(dir, &res->cinfo, res->dir_attr->time_start); + nfs4_update_changeattr(dir, &res->cinfo, + res->dir_attr->time_start, + NFS_INO_INVALID_DATA); return 1; } -static void nfs4_proc_rename_setup(struct rpc_message *msg, struct inode *dir) +static void nfs4_proc_rename_setup(struct rpc_message *msg, + struct dentry *old_dentry, + struct dentry *new_dentry) { - struct nfs_server *server = NFS_SERVER(dir); struct nfs_renameargs *arg = msg->rpc_argp; struct nfs_renameres *res = msg->rpc_resp; + struct inode *old_inode = d_inode(old_dentry); + struct inode *new_inode = d_inode(new_dentry); + if (old_inode) + nfs4_inode_make_writeable(old_inode); + if (new_inode) + nfs4_inode_return_delegation(new_inode); msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RENAME]; - res->server = server; - nfs4_init_sequence(&arg->seq_args, &res->seq_res, 1); + res->server = NFS_SB(old_dentry->d_sb); + nfs4_init_sequence(&arg->seq_args, &res->seq_res, 1, 0); } static void nfs4_proc_rename_rpc_prepare(struct rpc_task *task, struct nfs_renamedata *data) @@ -4111,9 +5041,21 @@ static int nfs4_proc_rename_done(struct rpc_task *task, struct inode *old_dir, return 0; if (task->tk_status == 0) { - update_changeattr(old_dir, &res->old_cinfo, res->old_fattr->time_start); - if (new_dir != old_dir) - update_changeattr(new_dir, &res->new_cinfo, res->new_fattr->time_start); + nfs_d_prune_case_insensitive_aliases(d_inode(data->old_dentry)); + if (new_dir != old_dir) { + /* Note: If we moved a directory, nlink will change */ + nfs4_update_changeattr(old_dir, &res->old_cinfo, + res->old_fattr->time_start, + NFS_INO_INVALID_NLINK | + NFS_INO_INVALID_DATA); + nfs4_update_changeattr(new_dir, &res->new_cinfo, + res->new_fattr->time_start, + NFS_INO_INVALID_NLINK | + NFS_INO_INVALID_DATA); + } else + nfs4_update_changeattr(old_dir, &res->old_cinfo, + res->old_fattr->time_start, + NFS_INO_INVALID_DATA); } return 1; } @@ -4121,15 +5063,15 @@ static int nfs4_proc_rename_done(struct rpc_task *task, struct inode *old_dir, static int _nfs4_proc_link(struct inode *inode, struct inode *dir, const struct qstr *name) { struct nfs_server *server = NFS_SERVER(inode); + __u32 bitmask[NFS4_BITMASK_SZ]; struct nfs4_link_arg arg = { .fh = NFS_FH(inode), .dir_fh = NFS_FH(dir), .name = name, - .bitmask = server->attr_bitmask, + .bitmask = bitmask, }; struct nfs4_link_res res = { .server = server, - .label = NULL, }; struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LINK], @@ -4138,28 +5080,24 @@ static int _nfs4_proc_link(struct inode *inode, struct inode *dir, const struct }; int status = -ENOMEM; - res.fattr = nfs_alloc_fattr(); + res.fattr = nfs_alloc_fattr_with_label(server); if (res.fattr == NULL) goto out; - res.label = nfs4_label_alloc(server, GFP_KERNEL); - if (IS_ERR(res.label)) { - status = PTR_ERR(res.label); - goto out; - } - arg.bitmask = nfs4_bitmask(server, res.label); - + nfs4_inode_make_writeable(inode); + nfs4_bitmap_copy_adjust(bitmask, nfs4_bitmask(server, res.fattr->label), + inode, + NFS_INO_INVALID_CHANGE | NFS_INO_INVALID_CTIME); status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1); if (!status) { - update_changeattr(dir, &res.cinfo, res.fattr->time_start); + nfs4_update_changeattr(dir, &res.cinfo, res.fattr->time_start, + NFS_INO_INVALID_DATA); + nfs4_inc_nlink(inode); status = nfs_post_op_update_inode(inode, res.fattr); if (!status) - nfs_setsecurity(inode, res.fattr, res.label); + nfs_setsecurity(inode, res.fattr); } - - nfs4_label_free(res.label); - out: nfs_free_fattr(res.fattr); return status; @@ -4167,7 +5105,9 @@ out: static int nfs4_proc_link(struct inode *inode, struct inode *dir, const struct qstr *name) { - struct nfs4_exception exception = { }; + struct nfs4_exception exception = { + .interruptible = true, + }; int err; do { err = nfs4_handle_exception(NFS_SERVER(inode), @@ -4183,7 +5123,6 @@ struct nfs4_createdata { struct nfs4_create_res res; struct nfs_fh fh; struct nfs_fattr fattr; - struct nfs4_label *label; }; static struct nfs4_createdata *nfs4_alloc_createdata(struct inode *dir, @@ -4195,8 +5134,8 @@ static struct nfs4_createdata *nfs4_alloc_createdata(struct inode *dir, if (data != NULL) { struct nfs_server *server = NFS_SERVER(dir); - data->label = nfs4_label_alloc(server, GFP_KERNEL); - if (IS_ERR(data->label)) + data->fattr.label = nfs4_label_alloc(server, GFP_KERNEL); + if (IS_ERR(data->fattr.label)) goto out_free; data->msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CREATE]; @@ -4207,12 +5146,11 @@ static struct nfs4_createdata *nfs4_alloc_createdata(struct inode *dir, data->arg.name = name; data->arg.attrs = sattr; data->arg.ftype = ftype; - data->arg.bitmask = nfs4_bitmask(server, data->label); + data->arg.bitmask = nfs4_bitmask(server, data->fattr.label); data->arg.umask = current_umask(); data->res.server = server; data->res.fh = &data->fh; data->res.fattr = &data->fattr; - data->res.label = data->label; nfs_fattr_init(data->res.fattr); } return data; @@ -4226,23 +5164,52 @@ static int nfs4_do_create(struct inode *dir, struct dentry *dentry, struct nfs4_ int status = nfs4_call_sync(NFS_SERVER(dir)->client, NFS_SERVER(dir), &data->msg, &data->arg.seq_args, &data->res.seq_res, 1); if (status == 0) { - update_changeattr(dir, &data->res.dir_cinfo, - data->res.fattr->time_start); - status = nfs_instantiate(dentry, data->res.fh, data->res.fattr, data->res.label); + spin_lock(&dir->i_lock); + nfs4_update_changeattr_locked(dir, &data->res.dir_cinfo, + data->res.fattr->time_start, + NFS_INO_INVALID_DATA); + spin_unlock(&dir->i_lock); + status = nfs_instantiate(dentry, data->res.fh, data->res.fattr); } return status; } +static struct dentry *nfs4_do_mkdir(struct inode *dir, struct dentry *dentry, + struct nfs4_createdata *data, int *statusp) +{ + struct dentry *ret; + + *statusp = nfs4_call_sync(NFS_SERVER(dir)->client, NFS_SERVER(dir), &data->msg, + &data->arg.seq_args, &data->res.seq_res, 1); + + if (*statusp) + return NULL; + + spin_lock(&dir->i_lock); + /* Creating a directory bumps nlink in the parent */ + nfs4_inc_nlink_locked(dir); + nfs4_update_changeattr_locked(dir, &data->res.dir_cinfo, + data->res.fattr->time_start, + NFS_INO_INVALID_DATA); + spin_unlock(&dir->i_lock); + ret = nfs_add_or_obtain(dentry, data->res.fh, data->res.fattr); + if (!IS_ERR(ret)) + return ret; + *statusp = PTR_ERR(ret); + return NULL; +} + static void nfs4_free_createdata(struct nfs4_createdata *data) { - nfs4_label_free(data->label); + nfs4_label_free(data->fattr.label); kfree(data); } static int _nfs4_proc_symlink(struct inode *dir, struct dentry *dentry, - struct page *page, unsigned int len, struct iattr *sattr, + struct folio *folio, unsigned int len, struct iattr *sattr, struct nfs4_label *label) { + struct page *page = &folio->page; struct nfs4_createdata *data; int status = -ENAMETOOLONG; @@ -4267,16 +5234,18 @@ out: } static int nfs4_proc_symlink(struct inode *dir, struct dentry *dentry, - struct page *page, unsigned int len, struct iattr *sattr) + struct folio *folio, unsigned int len, struct iattr *sattr) { - struct nfs4_exception exception = { }; - struct nfs4_label l, *label = NULL; + struct nfs4_exception exception = { + .interruptible = true, + }; + struct nfs4_label l, *label; int err; label = nfs4_label_init_security(dir, dentry, sattr, &l); do { - err = _nfs4_proc_symlink(dir, dentry, page, len, sattr, label); + err = _nfs4_proc_symlink(dir, dentry, folio, len, sattr, label); trace_nfs4_symlink(dir, &dentry->d_name, err); err = nfs4_handle_exception(NFS_SERVER(dir), err, &exception); @@ -4286,30 +5255,35 @@ static int nfs4_proc_symlink(struct inode *dir, struct dentry *dentry, return err; } -static int _nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry, - struct iattr *sattr, struct nfs4_label *label) +static struct dentry *_nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry, + struct iattr *sattr, + struct nfs4_label *label, int *statusp) { struct nfs4_createdata *data; - int status = -ENOMEM; + struct dentry *ret = NULL; + *statusp = -ENOMEM; data = nfs4_alloc_createdata(dir, &dentry->d_name, sattr, NF4DIR); if (data == NULL) goto out; data->arg.label = label; - status = nfs4_do_create(dir, dentry, data); + ret = nfs4_do_mkdir(dir, dentry, data, statusp); nfs4_free_createdata(data); out: - return status; + return ret; } -static int nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry, - struct iattr *sattr) +static struct dentry *nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry, + struct iattr *sattr) { struct nfs_server *server = NFS_SERVER(dir); - struct nfs4_exception exception = { }; - struct nfs4_label l, *label = NULL; + struct nfs4_exception exception = { + .interruptible = true, + }; + struct nfs4_label l, *label; + struct dentry *alias; int err; label = nfs4_label_init_security(dir, dentry, sattr, &l); @@ -4317,45 +5291,52 @@ static int nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry, if (!(server->attr_bitmask[2] & FATTR4_WORD2_MODE_UMASK)) sattr->ia_mode &= ~current_umask(); do { - err = _nfs4_proc_mkdir(dir, dentry, sattr, label); + alias = _nfs4_proc_mkdir(dir, dentry, sattr, label, &err); trace_nfs4_mkdir(dir, &dentry->d_name, err); - err = nfs4_handle_exception(NFS_SERVER(dir), err, - &exception); + if (err) + alias = ERR_PTR(nfs4_handle_exception(NFS_SERVER(dir), + err, + &exception)); } while (exception.retry); nfs4_label_release_security(label); - return err; + return alias; } -static int _nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred, - u64 cookie, struct page **pages, unsigned int count, bool plus) +static int _nfs4_proc_readdir(struct nfs_readdir_arg *nr_arg, + struct nfs_readdir_res *nr_res) { - struct inode *dir = d_inode(dentry); + struct inode *dir = d_inode(nr_arg->dentry); + struct nfs_server *server = NFS_SERVER(dir); struct nfs4_readdir_arg args = { .fh = NFS_FH(dir), - .pages = pages, + .pages = nr_arg->pages, .pgbase = 0, - .count = count, - .bitmask = NFS_SERVER(d_inode(dentry))->attr_bitmask, - .plus = plus, + .count = nr_arg->page_len, + .plus = nr_arg->plus, }; struct nfs4_readdir_res res; struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READDIR], .rpc_argp = &args, .rpc_resp = &res, - .rpc_cred = cred, + .rpc_cred = nr_arg->cred, }; int status; - dprintk("%s: dentry = %pd2, cookie = %Lu\n", __func__, - dentry, - (unsigned long long)cookie); - nfs4_setup_readdir(cookie, NFS_I(dir)->cookieverf, dentry, &args); + dprintk("%s: dentry = %pd2, cookie = %llu\n", __func__, + nr_arg->dentry, (unsigned long long)nr_arg->cookie); + if (!(server->caps & NFS_CAP_SECURITY_LABEL)) + args.bitmask = server->attr_bitmask_nl; + else + args.bitmask = server->attr_bitmask; + + nfs4_setup_readdir(nr_arg->cookie, nr_arg->verf, nr_arg->dentry, &args); res.pgbase = args.pgbase; - status = nfs4_call_sync(NFS_SERVER(dir)->client, NFS_SERVER(dir), &msg, &args.seq_args, &res.seq_res, 0); + status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, + &res.seq_res, 0); if (status >= 0) { - memcpy(NFS_I(dir)->cookieverf, res.verifier.data, NFS4_VERIFIER_SIZE); + memcpy(nr_res->verf, res.verifier.data, NFS4_VERIFIER_SIZE); status += args.pgbase; } @@ -4365,17 +5346,18 @@ static int _nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred, return status; } -static int nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred, - u64 cookie, struct page **pages, unsigned int count, bool plus) +static int nfs4_proc_readdir(struct nfs_readdir_arg *arg, + struct nfs_readdir_res *res) { - struct nfs4_exception exception = { }; + struct nfs4_exception exception = { + .interruptible = true, + }; int err; do { - err = _nfs4_proc_readdir(dentry, cred, cookie, - pages, count, plus); - trace_nfs4_readdir(d_inode(dentry), err); - err = nfs4_handle_exception(NFS_SERVER(d_inode(dentry)), err, - &exception); + err = _nfs4_proc_readdir(arg, res); + trace_nfs4_readdir(d_inode(arg->dentry), err); + err = nfs4_handle_exception(NFS_SERVER(d_inode(arg->dentry)), + err, &exception); } while (exception.retry); return err; } @@ -4419,8 +5401,10 @@ static int nfs4_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr, dev_t rdev) { struct nfs_server *server = NFS_SERVER(dir); - struct nfs4_exception exception = { }; - struct nfs4_label l, *label = NULL; + struct nfs4_exception exception = { + .interruptible = true, + }; + struct nfs4_label l, *label; int err; label = nfs4_label_init_security(dir, dentry, sattr, &l); @@ -4461,7 +5445,9 @@ static int _nfs4_proc_statfs(struct nfs_server *server, struct nfs_fh *fhandle, static int nfs4_proc_statfs(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fsstat *fsstat) { - struct nfs4_exception exception = { }; + struct nfs4_exception exception = { + .interruptible = true, + }; int err; do { err = nfs4_handle_exception(server, @@ -4492,17 +5478,16 @@ static int _nfs4_do_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle, static int nfs4_do_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fsinfo *fsinfo) { - struct nfs4_exception exception = { }; - unsigned long now = jiffies; + struct nfs4_exception exception = { + .interruptible = true, + }; int err; do { err = _nfs4_do_fsinfo(server, fhandle, fsinfo); trace_nfs4_fsinfo(server, fhandle, fsinfo->fattr, err); if (err == 0) { - nfs4_set_lease_period(server->nfs_client, - fsinfo->lease_time * HZ, - now); + nfs4_set_lease_period(server->nfs_client, fsinfo->lease_time * HZ); break; } err = nfs4_handle_exception(server, err, &exception); @@ -4554,7 +5539,9 @@ static int _nfs4_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle static int nfs4_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_pathconf *pathconf) { - struct nfs4_exception exception = { }; + struct nfs4_exception exception = { + .interruptible = true, + }; int err; do { @@ -4579,12 +5566,12 @@ static bool nfs4_stateid_is_current(nfs4_stateid *stateid, const struct nfs_lock_context *l_ctx, fmode_t fmode) { - nfs4_stateid current_stateid; + nfs4_stateid _current_stateid; /* If the current stateid represents a lost lock, then exit */ - if (nfs4_set_rw_stateid(¤t_stateid, ctx, l_ctx, fmode) == -EIO) + if (nfs4_set_rw_stateid(&_current_stateid, ctx, l_ctx, fmode) == -EIO) return true; - return nfs4_stateid_match(stateid, ¤t_stateid); + return nfs4_stateid_match(stateid, &_current_stateid); } static bool nfs4_error_stateid_expired(int err) @@ -4612,9 +5599,11 @@ static int nfs4_read_done_cb(struct rpc_task *task, struct nfs_pgio_header *hdr) .inode = hdr->inode, .state = hdr->args.context->state, .stateid = &hdr->args.stateid, + .retrans = hdr->retrans, }; task->tk_status = nfs4_async_handle_exception(task, server, task->tk_status, &exception); + hdr->retrans = exception.retrans; if (exception.retry) { rpc_restart_call_prepare(task); return -EAGAIN; @@ -4640,29 +5629,64 @@ static bool nfs4_read_stateid_changed(struct rpc_task *task, return true; } -static int nfs4_read_done(struct rpc_task *task, struct nfs_pgio_header *hdr) +static bool nfs4_read_plus_not_supported(struct rpc_task *task, + struct nfs_pgio_header *hdr) { + struct nfs_server *server = NFS_SERVER(hdr->inode); + struct rpc_message *msg = &task->tk_msg; - dprintk("--> %s\n", __func__); + if (msg->rpc_proc == &nfs4_procedures[NFSPROC4_CLNT_READ_PLUS] && + task->tk_status == -ENOTSUPP) { + server->caps &= ~NFS_CAP_READ_PLUS; + msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ]; + rpc_restart_call_prepare(task); + return true; + } + return false; +} +static int nfs4_read_done(struct rpc_task *task, struct nfs_pgio_header *hdr) +{ if (!nfs4_sequence_done(task, &hdr->res.seq_res)) return -EAGAIN; if (nfs4_read_stateid_changed(task, &hdr->args)) return -EAGAIN; + if (nfs4_read_plus_not_supported(task, hdr)) + return -EAGAIN; if (task->tk_status > 0) nfs_invalidate_atime(hdr->inode); return hdr->pgio_done_cb ? hdr->pgio_done_cb(task, hdr) : nfs4_read_done_cb(task, hdr); } +#if defined CONFIG_NFS_V4_2 && defined CONFIG_NFS_V4_2_READ_PLUS +static bool nfs42_read_plus_support(struct nfs_pgio_header *hdr, + struct rpc_message *msg) +{ + /* Note: We don't use READ_PLUS with pNFS yet */ + if (nfs_server_capable(hdr->inode, NFS_CAP_READ_PLUS) && !hdr->ds_clp) { + msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ_PLUS]; + return nfs_read_alloc_scratch(hdr, READ_PLUS_SCRATCH_SIZE); + } + return false; +} +#else +static bool nfs42_read_plus_support(struct nfs_pgio_header *hdr, + struct rpc_message *msg) +{ + return false; +} +#endif /* CONFIG_NFS_V4_2 */ + static void nfs4_proc_read_setup(struct nfs_pgio_header *hdr, struct rpc_message *msg) { hdr->timestamp = jiffies; if (!hdr->pgio_done_cb) hdr->pgio_done_cb = nfs4_read_done_cb; - msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ]; - nfs4_init_sequence(&hdr->args.seq_args, &hdr->res.seq_res, 0); + if (!nfs42_read_plus_support(hdr, msg)) + msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ]; + nfs4_init_sequence(&hdr->args.seq_args, &hdr->res.seq_res, 0, 0); } static int nfs4_proc_pgio_rpc_prepare(struct rpc_task *task, @@ -4693,10 +5717,12 @@ static int nfs4_write_done_cb(struct rpc_task *task, .inode = hdr->inode, .state = hdr->args.context->state, .stateid = &hdr->args.stateid, + .retrans = hdr->retrans, }; task->tk_status = nfs4_async_handle_exception(task, NFS_SERVER(inode), task->tk_status, &exception); + hdr->retrans = exception.retrans; if (exception.retry) { rpc_restart_call_prepare(task); return -EAGAIN; @@ -4742,19 +5768,59 @@ bool nfs4_write_need_cache_consistency_data(struct nfs_pgio_header *hdr) /* Otherwise, request attributes if and only if we don't hold * a delegation */ - return nfs4_have_delegation(hdr->inode, FMODE_READ) == 0; + return nfs4_have_delegation(hdr->inode, FMODE_READ, 0) == 0; +} + +void nfs4_bitmask_set(__u32 bitmask[], const __u32 src[], + struct inode *inode, unsigned long cache_validity) +{ + struct nfs_server *server = NFS_SERVER(inode); + unsigned int i; + + memcpy(bitmask, src, sizeof(*bitmask) * NFS4_BITMASK_SZ); + cache_validity |= READ_ONCE(NFS_I(inode)->cache_validity); + + if (cache_validity & NFS_INO_INVALID_CHANGE) + bitmask[0] |= FATTR4_WORD0_CHANGE; + if (cache_validity & NFS_INO_INVALID_ATIME) + bitmask[1] |= FATTR4_WORD1_TIME_ACCESS; + if (cache_validity & NFS_INO_INVALID_MODE) + bitmask[1] |= FATTR4_WORD1_MODE; + if (cache_validity & NFS_INO_INVALID_OTHER) + bitmask[1] |= FATTR4_WORD1_OWNER | FATTR4_WORD1_OWNER_GROUP; + if (cache_validity & NFS_INO_INVALID_NLINK) + bitmask[1] |= FATTR4_WORD1_NUMLINKS; + if (cache_validity & NFS_INO_INVALID_CTIME) + bitmask[1] |= FATTR4_WORD1_TIME_METADATA; + if (cache_validity & NFS_INO_INVALID_MTIME) + bitmask[1] |= FATTR4_WORD1_TIME_MODIFY; + if (cache_validity & NFS_INO_INVALID_BLOCKS) + bitmask[1] |= FATTR4_WORD1_SPACE_USED; + if (cache_validity & NFS_INO_INVALID_BTIME) + bitmask[1] |= FATTR4_WORD1_TIME_CREATE; + + if (cache_validity & NFS_INO_INVALID_SIZE) + bitmask[0] |= FATTR4_WORD0_SIZE; + + for (i = 0; i < NFS4_BITMASK_SZ; i++) + bitmask[i] &= server->attr_bitmask[i]; } static void nfs4_proc_write_setup(struct nfs_pgio_header *hdr, - struct rpc_message *msg) + struct rpc_message *msg, + struct rpc_clnt **clnt) { struct nfs_server *server = NFS_SERVER(hdr->inode); if (!nfs4_write_need_cache_consistency_data(hdr)) { hdr->args.bitmask = NULL; hdr->res.fattr = NULL; - } else - hdr->args.bitmask = server->cache_consistency_bitmask; + } else { + nfs4_bitmask_set(hdr->args.bitmask_store, + server->cache_consistency_bitmask, + hdr->inode, NFS_INO_INVALID_BLOCKS); + hdr->args.bitmask = hdr->args.bitmask_store; + } if (!hdr->pgio_done_cb) hdr->pgio_done_cb = nfs4_write_done_cb; @@ -4762,7 +5828,8 @@ static void nfs4_proc_write_setup(struct nfs_pgio_header *hdr, hdr->timestamp = jiffies; msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_WRITE]; - nfs4_init_sequence(&hdr->args.seq_args, &hdr->res.seq_res, 1); + nfs4_init_sequence(&hdr->args.seq_args, &hdr->res.seq_res, 0, 0); + nfs4_state_protect_write(hdr->ds_clp ? hdr->ds_clp : server->nfs_client, clnt, msg, hdr); } static void nfs4_proc_commit_rpc_prepare(struct rpc_task *task, struct nfs_commit_data *data) @@ -4793,7 +5860,8 @@ static int nfs4_commit_done(struct rpc_task *task, struct nfs_commit_data *data) return data->commit_done_cb(task, data); } -static void nfs4_proc_commit_setup(struct nfs_commit_data *data, struct rpc_message *msg) +static void nfs4_proc_commit_setup(struct nfs_commit_data *data, struct rpc_message *msg, + struct rpc_clnt **clnt) { struct nfs_server *server = NFS_SERVER(data->inode); @@ -4801,7 +5869,43 @@ static void nfs4_proc_commit_setup(struct nfs_commit_data *data, struct rpc_mess data->commit_done_cb = nfs4_commit_done_cb; data->res.server = server; msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_COMMIT]; - nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 1); + nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 1, 0); + nfs4_state_protect(data->ds_clp ? data->ds_clp : server->nfs_client, + NFS_SP4_MACH_CRED_COMMIT, clnt, msg); +} + +static int _nfs4_proc_commit(struct file *dst, struct nfs_commitargs *args, + struct nfs_commitres *res) +{ + struct inode *dst_inode = file_inode(dst); + struct nfs_server *server = NFS_SERVER(dst_inode); + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_COMMIT], + .rpc_argp = args, + .rpc_resp = res, + }; + + args->fh = NFS_FH(dst_inode); + return nfs4_call_sync(server->client, server, &msg, + &args->seq_args, &res->seq_res, 1); +} + +int nfs4_proc_commit(struct file *dst, __u64 offset, __u32 count, struct nfs_commitres *res) +{ + struct nfs_commitargs args = { + .offset = offset, + .count = count, + }; + struct nfs_server *dst_server = NFS_SERVER(file_inode(dst)); + struct nfs4_exception exception = { }; + int status; + + do { + status = _nfs4_proc_commit(dst, &args, res); + status = nfs4_handle_exception(dst_server, status, &exception); + } while (exception.retry); + + return status; } struct nfs4_renewdata { @@ -4818,7 +5922,7 @@ static void nfs4_renew_release(void *calldata) struct nfs4_renewdata *data = calldata; struct nfs_client *clp = data->client; - if (atomic_read(&clp->cl_count) > 1) + if (refcount_read(&clp->cl_count) > 1) nfs4_schedule_state_renewal(clp); nfs_put_client(clp); kfree(data); @@ -4855,7 +5959,7 @@ static const struct rpc_call_ops nfs4_renew_ops = { .rpc_release = nfs4_renew_release, }; -static int nfs4_proc_async_renew(struct nfs_client *clp, struct rpc_cred *cred, unsigned renew_flags) +static int nfs4_proc_async_renew(struct nfs_client *clp, const struct cred *cred, unsigned renew_flags) { struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RENEW], @@ -4866,7 +5970,7 @@ static int nfs4_proc_async_renew(struct nfs_client *clp, struct rpc_cred *cred, if (renew_flags == 0) return 0; - if (!atomic_inc_not_zero(&clp->cl_count)) + if (!refcount_inc_not_zero(&clp->cl_count)) return -EIO; data = kmalloc(sizeof(*data), GFP_NOFS); if (data == NULL) { @@ -4879,7 +5983,7 @@ static int nfs4_proc_async_renew(struct nfs_client *clp, struct rpc_cred *cred, &nfs4_renew_ops, data); } -static int nfs4_proc_renew(struct nfs_client *clp, struct rpc_cred *cred) +static int nfs4_proc_renew(struct nfs_client *clp, const struct cred *cred) { struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RENEW], @@ -4896,9 +6000,17 @@ static int nfs4_proc_renew(struct nfs_client *clp, struct rpc_cred *cred) return 0; } -static inline int nfs4_server_supports_acls(struct nfs_server *server) +static bool nfs4_server_supports_acls(const struct nfs_server *server, + enum nfs4_acl_type type) { - return server->caps & NFS_CAP_ACLS; + switch (type) { + default: + return server->attr_bitmask[0] & FATTR4_WORD0_ACL; + case NFS4ACL_DACL: + return server->attr_bitmask[1] & FATTR4_WORD1_DACL; + case NFS4ACL_SACL: + return server->attr_bitmask[1] & FATTR4_WORD1_SACL; + } } /* Assuming that XATTR_SIZE_MAX is a multiple of PAGE_SIZE, and that @@ -4907,7 +6019,7 @@ static inline int nfs4_server_supports_acls(struct nfs_server *server) */ #define NFS4ACL_MAXPAGES DIV_ROUND_UP(XATTR_SIZE_MAX, PAGE_SIZE) -static int buf_to_pages_noslab(const void *buf, size_t buflen, +int nfs4_buf_to_pages_noslab(const void *buf, size_t buflen, struct page **pages) { struct page *newpage, **spages; @@ -4937,9 +6049,10 @@ unwind: } struct nfs4_cached_acl { + enum nfs4_acl_type type; int cached; size_t len; - char data[0]; + char data[]; }; static void nfs4_set_cached_acl(struct inode *inode, struct nfs4_cached_acl *acl) @@ -4957,7 +6070,8 @@ static void nfs4_zap_acl_attr(struct inode *inode) nfs4_set_cached_acl(inode, NULL); } -static inline ssize_t nfs4_read_cached_acl(struct inode *inode, char *buf, size_t buflen) +static ssize_t nfs4_read_cached_acl(struct inode *inode, char *buf, + size_t buflen, enum nfs4_acl_type type) { struct nfs_inode *nfsi = NFS_I(inode); struct nfs4_cached_acl *acl; @@ -4967,6 +6081,8 @@ static inline ssize_t nfs4_read_cached_acl(struct inode *inode, char *buf, size_ acl = nfsi->nfs4_acl; if (acl == NULL) goto out; + if (acl->type != type) + goto out; if (buf == NULL) /* user is just asking for length */ goto out_len; if (acl->cached == 0) @@ -4982,7 +6098,9 @@ out: return ret; } -static void nfs4_write_cached_acl(struct inode *inode, struct page **pages, size_t pgbase, size_t acl_len) +static void nfs4_write_cached_acl(struct inode *inode, struct page **pages, + size_t pgbase, size_t acl_len, + enum nfs4_acl_type type) { struct nfs4_cached_acl *acl; size_t buflen = sizeof(*acl) + acl_len; @@ -4999,6 +6117,7 @@ static void nfs4_write_cached_acl(struct inode *inode, struct page **pages, size goto out; acl->cached = 0; } + acl->type = type; acl->len = acl_len; out: nfs4_set_cached_acl(inode, acl); @@ -5014,15 +6133,17 @@ out: * length. The next getxattr call will then produce another round trip to * the server, this time with the input buf of the required size. */ -static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t buflen) +static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf, + size_t buflen, enum nfs4_acl_type type) { - struct page *pages[NFS4ACL_MAXPAGES + 1] = {NULL, }; + struct page **pages; struct nfs_getaclargs args = { .fh = NFS_FH(inode), - .acl_pages = pages, + .acl_type = type, .acl_len = buflen, }; struct nfs_getaclres res = { + .acl_type = type, .acl_len = buflen, }; struct rpc_message msg = { @@ -5030,11 +6151,19 @@ static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t bu .rpc_argp = &args, .rpc_resp = &res, }; - unsigned int npages = DIV_ROUND_UP(buflen, PAGE_SIZE) + 1; + unsigned int npages; int ret = -ENOMEM, i; + struct nfs_server *server = NFS_SERVER(inode); - if (npages > ARRAY_SIZE(pages)) - return -ERANGE; + if (buflen == 0) + buflen = server->rsize; + + npages = DIV_ROUND_UP(buflen, PAGE_SIZE) + 1; + pages = kmalloc_array(npages, sizeof(struct page *), GFP_KERNEL); + if (!pages) + return -ENOMEM; + + args.acl_pages = pages; for (i = 0; i < npages; i++) { pages[i] = alloc_page(GFP_KERNEL); @@ -5043,7 +6172,7 @@ static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t bu } /* for decoding across pages */ - res.acl_scratch = alloc_page(GFP_KERNEL); + res.acl_scratch = folio_alloc(GFP_KERNEL, 0); if (!res.acl_scratch) goto out_free; @@ -5064,7 +6193,8 @@ static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t bu ret = -ERANGE; goto out_free; } - nfs4_write_cached_acl(inode, pages, res.acl_data_offset, res.acl_len); + nfs4_write_cached_acl(inode, pages, res.acl_data_offset, res.acl_len, + type); if (buf) { if (res.acl_len > buflen) { ret = -ERANGE; @@ -5075,20 +6205,23 @@ static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t bu out_ok: ret = res.acl_len; out_free: - for (i = 0; i < npages; i++) - if (pages[i]) - __free_page(pages[i]); + while (--i >= 0) + __free_page(pages[i]); if (res.acl_scratch) - __free_page(res.acl_scratch); + folio_put(res.acl_scratch); + kfree(pages); return ret; } -static ssize_t nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t buflen) +static ssize_t nfs4_get_acl_uncached(struct inode *inode, void *buf, + size_t buflen, enum nfs4_acl_type type) { - struct nfs4_exception exception = { }; + struct nfs4_exception exception = { + .interruptible = true, + }; ssize_t ret; do { - ret = __nfs4_get_acl_uncached(inode, buf, buflen); + ret = __nfs4_get_acl_uncached(inode, buf, buflen, type); trace_nfs4_get_acl(inode, ret); if (ret >= 0) break; @@ -5097,34 +6230,39 @@ static ssize_t nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t bufl return ret; } -static ssize_t nfs4_proc_get_acl(struct inode *inode, void *buf, size_t buflen) +static ssize_t nfs4_proc_get_acl(struct inode *inode, void *buf, size_t buflen, + enum nfs4_acl_type type) { struct nfs_server *server = NFS_SERVER(inode); int ret; - if (!nfs4_server_supports_acls(server)) + if (unlikely(NFS_FH(inode)->size == 0)) + return -ENODATA; + if (!nfs4_server_supports_acls(server, type)) return -EOPNOTSUPP; - ret = nfs_revalidate_inode(server, inode); + ret = nfs_revalidate_inode(inode, NFS_INO_INVALID_CHANGE); if (ret < 0) return ret; if (NFS_I(inode)->cache_validity & NFS_INO_INVALID_ACL) nfs_zap_acl_cache(inode); - ret = nfs4_read_cached_acl(inode, buf, buflen); + ret = nfs4_read_cached_acl(inode, buf, buflen, type); if (ret != -ENOENT) /* -ENOENT is returned if there is no ACL or if there is an ACL * but no cached acl data, just the acl length */ return ret; - return nfs4_get_acl_uncached(inode, buf, buflen); + return nfs4_get_acl_uncached(inode, buf, buflen, type); } -static int __nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t buflen) +static int __nfs4_proc_set_acl(struct inode *inode, const void *buf, + size_t buflen, enum nfs4_acl_type type) { struct nfs_server *server = NFS_SERVER(inode); struct page *pages[NFS4ACL_MAXPAGES]; struct nfs_setaclargs arg = { - .fh = NFS_FH(inode), - .acl_pages = pages, - .acl_len = buflen, + .fh = NFS_FH(inode), + .acl_type = type, + .acl_len = buflen, + .acl_pages = pages, }; struct nfs_setaclres res; struct rpc_message msg = { @@ -5135,14 +6273,17 @@ static int __nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t bufl unsigned int npages = DIV_ROUND_UP(buflen, PAGE_SIZE); int ret, i; - if (!nfs4_server_supports_acls(server)) + /* You can't remove system.nfs4_acl: */ + if (buflen == 0) + return -EINVAL; + if (!nfs4_server_supports_acls(server, type)) return -EOPNOTSUPP; if (npages > ARRAY_SIZE(pages)) return -ERANGE; - i = buf_to_pages_noslab(buf, buflen, arg.acl_pages); + i = nfs4_buf_to_pages_noslab(buf, buflen, arg.acl_pages); if (i < 0) return i; - nfs4_inode_return_delegation(inode); + nfs4_inode_make_writeable(inode); ret = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1); /* @@ -5157,20 +6298,34 @@ static int __nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t bufl * so mark the attribute cache invalid. */ spin_lock(&inode->i_lock); - NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ATTR; + nfs_set_cache_invalid(inode, NFS_INO_INVALID_CHANGE | + NFS_INO_INVALID_CTIME | + NFS_INO_REVAL_FORCED); spin_unlock(&inode->i_lock); nfs_access_zap_cache(inode); nfs_zap_acl_cache(inode); return ret; } -static int nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t buflen) +static int nfs4_proc_set_acl(struct inode *inode, const void *buf, + size_t buflen, enum nfs4_acl_type type) { struct nfs4_exception exception = { }; int err; + + if (unlikely(NFS_FH(inode)->size == 0)) + return -ENODATA; do { - err = __nfs4_proc_set_acl(inode, buf, buflen); + err = __nfs4_proc_set_acl(inode, buf, buflen, type); trace_nfs4_set_acl(inode, err); + if (err == -NFS4ERR_BADOWNER || err == -NFS4ERR_BADNAME) { + /* + * no need to retry since the kernel + * isn't involved in encoding the ACEs. + */ + err = -EINVAL; + break; + } err = nfs4_handle_exception(NFS_SERVER(inode), err, &exception); } while (exception.retry); @@ -5182,17 +6337,18 @@ static int _nfs4_get_security_label(struct inode *inode, void *buf, size_t buflen) { struct nfs_server *server = NFS_SERVER(inode); - struct nfs_fattr fattr; - struct nfs4_label label = {0, 0, buflen, buf}; + struct nfs4_label label = {0, 0, 0, buflen, buf}; u32 bitmask[3] = { 0, 0, FATTR4_WORD2_SECURITY_LABEL }; + struct nfs_fattr fattr = { + .label = &label, + }; struct nfs4_getattr_arg arg = { .fh = NFS_FH(inode), .bitmask = bitmask, }; struct nfs4_getattr_res res = { .fattr = &fattr, - .label = &label, .server = server, }; struct rpc_message msg = { @@ -5209,15 +6365,15 @@ static int _nfs4_get_security_label(struct inode *inode, void *buf, return ret; if (!(fattr.valid & NFS_ATTR_FATTR_V4_SECURITY_LABEL)) return -ENOENT; - if (buflen < label.len) - return -ERANGE; - return 0; + return label.len; } static int nfs4_get_security_label(struct inode *inode, void *buf, size_t buflen) { - struct nfs4_exception exception = { }; + struct nfs4_exception exception = { + .interruptible = true, + }; int err; if (!nfs_server_capable(inode, NFS_CAP_SECURITY_LABEL)) @@ -5234,8 +6390,7 @@ static int nfs4_get_security_label(struct inode *inode, void *buf, static int _nfs4_do_set_security_label(struct inode *inode, struct nfs4_label *ilabel, - struct nfs_fattr *fattr, - struct nfs4_label *olabel) + struct nfs_fattr *fattr) { struct iattr sattr = {0}; @@ -5250,7 +6405,6 @@ static int _nfs4_do_set_security_label(struct inode *inode, }; struct nfs_setattrres res = { .fattr = fattr, - .label = olabel, .server = server, }; struct rpc_message msg = { @@ -5271,15 +6425,13 @@ static int _nfs4_do_set_security_label(struct inode *inode, static int nfs4_do_set_security_label(struct inode *inode, struct nfs4_label *ilabel, - struct nfs_fattr *fattr, - struct nfs4_label *olabel) + struct nfs_fattr *fattr) { struct nfs4_exception exception = { }; int err; do { - err = _nfs4_do_set_security_label(inode, ilabel, - fattr, olabel); + err = _nfs4_do_set_security_label(inode, ilabel, fattr); trace_nfs4_set_security_label(inode, err); err = nfs4_handle_exception(NFS_SERVER(inode), err, &exception); @@ -5290,38 +6442,22 @@ static int nfs4_do_set_security_label(struct inode *inode, static int nfs4_set_security_label(struct inode *inode, const void *buf, size_t buflen) { - struct nfs4_label ilabel, *olabel = NULL; - struct nfs_fattr fattr; - struct rpc_cred *cred; + struct nfs4_label ilabel = {0, 0, 0, buflen, (char *)buf }; + struct nfs_fattr *fattr; int status; if (!nfs_server_capable(inode, NFS_CAP_SECURITY_LABEL)) return -EOPNOTSUPP; - nfs_fattr_init(&fattr); - - ilabel.pi = 0; - ilabel.lfs = 0; - ilabel.label = (char *)buf; - ilabel.len = buflen; - - cred = rpc_lookup_cred(); - if (IS_ERR(cred)) - return PTR_ERR(cred); - - olabel = nfs4_label_alloc(NFS_SERVER(inode), GFP_KERNEL); - if (IS_ERR(olabel)) { - status = -PTR_ERR(olabel); - goto out; - } + fattr = nfs_alloc_fattr_with_label(NFS_SERVER(inode)); + if (fattr == NULL) + return -ENOMEM; - status = nfs4_do_set_security_label(inode, &ilabel, &fattr, olabel); + status = nfs4_do_set_security_label(inode, &ilabel, fattr); if (status == 0) - nfs_setsecurity(inode, &fattr, olabel); + nfs_setsecurity(inode, fattr); - nfs4_label_free(olabel); -out: - put_rpccred(cred); + nfs_free_fattr(fattr); return status; } #endif /* CONFIG_NFS_V4_SECURITY_LABEL */ @@ -5347,9 +6483,34 @@ static void nfs4_init_boot_verifier(const struct nfs_client *clp, memcpy(bootverf->data, verf, sizeof(bootverf->data)); } +static size_t +nfs4_get_uniquifier(struct nfs_client *clp, char *buf, size_t buflen) +{ + struct nfs_net *nn = net_generic(clp->cl_net, nfs_net_id); + struct nfs_netns_client *nn_clp = nn->nfs_client; + const char *id; + + buf[0] = '\0'; + + if (nn_clp) { + rcu_read_lock(); + id = rcu_dereference(nn_clp->identifier); + if (id) + strscpy(buf, id, buflen); + rcu_read_unlock(); + } + + if (nfs4_client_id_uniquifier[0] != '\0' && buf[0] == '\0') + strscpy(buf, nfs4_client_id_uniquifier, buflen); + + return strlen(buf); +} + static int nfs4_init_nonuniform_client_string(struct nfs_client *clp) { + char buf[NFS4_CLIENT_ID_UNIQ_LEN]; + size_t buflen; size_t len; char *str; @@ -5357,13 +6518,17 @@ nfs4_init_nonuniform_client_string(struct nfs_client *clp) return 0; rcu_read_lock(); - len = 14 + strlen(clp->cl_ipaddr) + 1 + - strlen(rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR)) + + len = 14 + + strlen(clp->cl_rpcclient->cl_nodename) + 1 + - strlen(rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_PROTO)) + + strlen(rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR)) + 1; rcu_read_unlock(); + buflen = nfs4_get_uniquifier(clp, buf, sizeof(buf)); + if (buflen) + len += buflen + 1; + if (len > NFS4_OPAQUE_LIMIT + 1) return -EINVAL; @@ -5377,10 +6542,16 @@ nfs4_init_nonuniform_client_string(struct nfs_client *clp) return -ENOMEM; rcu_read_lock(); - scnprintf(str, len, "Linux NFSv4.0 %s/%s %s", - clp->cl_ipaddr, - rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR), - rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_PROTO)); + if (buflen) + scnprintf(str, len, "Linux NFSv4.0 %s/%s/%s", + clp->cl_rpcclient->cl_nodename, buf, + rpc_peeraddr2str(clp->cl_rpcclient, + RPC_DISPLAY_ADDR)); + else + scnprintf(str, len, "Linux NFSv4.0 %s/%s", + clp->cl_rpcclient->cl_nodename, + rpc_peeraddr2str(clp->cl_rpcclient, + RPC_DISPLAY_ADDR)); rcu_read_unlock(); clp->cl_owner_id = str; @@ -5388,50 +6559,23 @@ nfs4_init_nonuniform_client_string(struct nfs_client *clp) } static int -nfs4_init_uniquifier_client_string(struct nfs_client *clp) -{ - size_t len; - char *str; - - len = 10 + 10 + 1 + 10 + 1 + - strlen(nfs4_client_id_uniquifier) + 1 + - strlen(clp->cl_rpcclient->cl_nodename) + 1; - - if (len > NFS4_OPAQUE_LIMIT + 1) - return -EINVAL; - - /* - * Since this string is allocated at mount time, and held until the - * nfs_client is destroyed, we can use GFP_KERNEL here w/o worrying - * about a memory-reclaim deadlock. - */ - str = kmalloc(len, GFP_KERNEL); - if (!str) - return -ENOMEM; - - scnprintf(str, len, "Linux NFSv%u.%u %s/%s", - clp->rpc_ops->version, clp->cl_minorversion, - nfs4_client_id_uniquifier, - clp->cl_rpcclient->cl_nodename); - clp->cl_owner_id = str; - return 0; -} - -static int nfs4_init_uniform_client_string(struct nfs_client *clp) { + char buf[NFS4_CLIENT_ID_UNIQ_LEN]; + size_t buflen; size_t len; char *str; if (clp->cl_owner_id != NULL) return 0; - if (nfs4_client_id_uniquifier[0] != '\0') - return nfs4_init_uniquifier_client_string(clp); - len = 10 + 10 + 1 + 10 + 1 + strlen(clp->cl_rpcclient->cl_nodename) + 1; + buflen = nfs4_get_uniquifier(clp, buf, sizeof(buf)); + if (buflen) + len += buflen + 1; + if (len > NFS4_OPAQUE_LIMIT + 1) return -EINVAL; @@ -5444,9 +6588,14 @@ nfs4_init_uniform_client_string(struct nfs_client *clp) if (!str) return -ENOMEM; - scnprintf(str, len, "Linux NFSv%u.%u %s", - clp->rpc_ops->version, clp->cl_minorversion, - clp->cl_rpcclient->cl_nodename); + if (buflen) + scnprintf(str, len, "Linux NFSv%u.%u %s/%s", + clp->rpc_ops->version, clp->cl_minorversion, + buf, clp->cl_rpcclient->cl_nodename); + else + scnprintf(str, len, "Linux NFSv%u.%u %s", + clp->rpc_ops->version, clp->cl_minorversion, + clp->cl_rpcclient->cl_nodename); clp->cl_owner_id = str; return 0; } @@ -5482,13 +6631,13 @@ static const struct rpc_call_ops nfs4_setclientid_ops = { * @clp: state data structure * @program: RPC program for NFSv4 callback service * @port: IP port number for NFS4 callback service - * @cred: RPC credential to use for this call + * @cred: credential to use for this call * @res: where to place the result * * Returns zero, a negative errno, or a negative NFS4ERR status code. */ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program, - unsigned short port, struct rpc_cred *cred, + unsigned short port, const struct cred *cred, struct nfs4_setclientid_res *res) { nfs4_verifier sc_verifier; @@ -5503,14 +6652,14 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program, .rpc_resp = res, .rpc_cred = cred, }; - struct rpc_task *task; struct rpc_task_setup task_setup_data = { .rpc_client = clp->cl_rpcclient, .rpc_message = &msg, .callback_ops = &nfs4_setclientid_ops, .callback_data = &setclientid, - .flags = RPC_TASK_TIMEOUT, + .flags = RPC_TASK_TIMEOUT | RPC_TASK_NO_ROUND_ROBIN, }; + unsigned long now = jiffies; int status; /* nfs_client_id4 */ @@ -5536,17 +6685,16 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program, dprintk("NFS call setclientid auth=%s, '%s'\n", clp->cl_rpcclient->cl_auth->au_ops->au_name, clp->cl_owner_id); - task = rpc_run_task(&task_setup_data); - if (IS_ERR(task)) { - status = PTR_ERR(task); - goto out; - } - status = task->tk_status; + + status = nfs4_call_sync_custom(&task_setup_data); if (setclientid.sc_cred) { + kfree(clp->cl_acceptor); clp->cl_acceptor = rpcauth_stringify_acceptor(setclientid.sc_cred); put_rpccred(setclientid.sc_cred); } - rpc_put_task(task); + + if (status == 0) + do_renew_lease(clp, now); out: trace_nfs4_setclientid(clp, status); dprintk("NFS reply setclientid: %d\n", status); @@ -5556,14 +6704,14 @@ out: /** * nfs4_proc_setclientid_confirm - Confirm client ID * @clp: state data structure - * @res: result of a previous SETCLIENTID - * @cred: RPC credential to use for this call + * @arg: result of a previous SETCLIENTID + * @cred: credential to use for this call * * Returns zero, a negative errno, or a negative NFS4ERR status code. */ int nfs4_proc_setclientid_confirm(struct nfs_client *clp, struct nfs4_setclientid_res *arg, - struct rpc_cred *cred) + const struct cred *cred) { struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETCLIENTID_CONFIRM], @@ -5575,7 +6723,8 @@ int nfs4_proc_setclientid_confirm(struct nfs_client *clp, dprintk("NFS call setclientid_confirm auth=%s, (client ID %llx)\n", clp->cl_rpcclient->cl_auth->au_ops->au_name, clp->cl_clientid); - status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT); + status = rpc_call_sync(clp->cl_rpcclient, &msg, + RPC_TASK_TIMEOUT | RPC_TASK_NO_ROUND_ROBIN); trace_nfs4_setclientid_confirm(clp, status); dprintk("NFS reply setclientid_confirm: %d\n", status); return status; @@ -5587,6 +6736,7 @@ struct nfs4_delegreturndata { struct nfs_fh fh; nfs4_stateid stateid; unsigned long timestamp; + unsigned short retrans; struct { struct nfs4_layoutreturn_args arg; struct nfs4_layoutreturn_res res; @@ -5594,6 +6744,7 @@ struct nfs4_delegreturndata { u32 roc_barrier; bool roc; } lr; + struct nfs4_delegattr sattr; struct nfs_fattr fattr; int rpc_status; struct inode *inode; @@ -5602,6 +6753,12 @@ struct nfs4_delegreturndata { static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata) { struct nfs4_delegreturndata *data = calldata; + struct nfs4_exception exception = { + .inode = data->inode, + .stateid = &data->stateid, + .task_is_privileged = data->args.seq_args.sa_privileged, + .retrans = data->retrans, + }; if (!nfs4_sequence_done(task, &data->res.seq_res)) return; @@ -5609,27 +6766,31 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata) trace_nfs4_delegreturn_exit(&data->args, &data->res, task->tk_status); /* Handle Layoutreturn errors */ - if (data->args.lr_args && task->tk_status != 0) { - switch(data->res.lr_ret) { - default: - data->res.lr_ret = -NFS4ERR_NOMATCHING_LAYOUT; - break; + if (pnfs_roc_done(task, &data->args.lr_args, &data->res.lr_res, + &data->res.lr_ret) == -EAGAIN) + goto out_restart; + + if (data->args.sattr_args && task->tk_status != 0) { + switch(data->res.sattr_ret) { case 0: - data->args.lr_args = NULL; - data->res.lr_res = NULL; + data->args.sattr_args = NULL; + data->res.sattr_res = false; break; case -NFS4ERR_ADMIN_REVOKED: case -NFS4ERR_DELEG_REVOKED: case -NFS4ERR_EXPIRED: case -NFS4ERR_BAD_STATEID: + /* Let the main handler below do stateid recovery */ + break; case -NFS4ERR_OLD_STATEID: - case -NFS4ERR_UNKNOWN_LAYOUTTYPE: - case -NFS4ERR_WRONG_CRED: - data->args.lr_args = NULL; - data->res.lr_res = NULL; - data->res.lr_ret = 0; - rpc_restart_call_prepare(task); - return; + if (nfs4_refresh_delegation_stateid(&data->stateid, + data->inode)) + goto out_restart; + fallthrough; + default: + data->args.sattr_args = NULL; + data->res.sattr_res = false; + goto out_restart; } } @@ -5643,27 +6804,41 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata) nfs4_free_revoked_stateid(data->res.server, data->args.stateid, task->tk_msg.rpc_cred); + fallthrough; case -NFS4ERR_BAD_STATEID: - case -NFS4ERR_OLD_STATEID: case -NFS4ERR_STALE_STATEID: + case -ETIMEDOUT: task->tk_status = 0; break; + case -NFS4ERR_OLD_STATEID: + if (!nfs4_refresh_delegation_stateid(&data->stateid, data->inode)) + nfs4_stateid_seqid_inc(&data->stateid); + if (data->args.bitmask) { + data->args.bitmask = NULL; + data->res.fattr = NULL; + } + goto out_restart; case -NFS4ERR_ACCESS: if (data->args.bitmask) { data->args.bitmask = NULL; data->res.fattr = NULL; - task->tk_status = 0; - rpc_restart_call_prepare(task); - return; + goto out_restart; } + fallthrough; default: - if (nfs4_async_handle_error(task, data->res.server, - NULL, NULL) == -EAGAIN) { - rpc_restart_call_prepare(task); - return; - } + task->tk_status = nfs4_async_handle_exception(task, + data->res.server, task->tk_status, + &exception); + data->retrans = exception.retrans; + if (exception.retry) + goto out_restart; } + nfs_delegation_mark_returned(data->inode, data->args.stateid); data->rpc_status = task->tk_status; + return; +out_restart: + task->tk_status = 0; + rpc_restart_call_prepare(task); } static void nfs4_delegreturn_release(void *calldata) @@ -5671,11 +6846,13 @@ static void nfs4_delegreturn_release(void *calldata) struct nfs4_delegreturndata *data = calldata; struct inode *inode = data->inode; + if (data->lr.roc) + pnfs_roc_release(&data->lr.arg, &data->lr.res, + data->res.lr_ret); if (inode) { - if (data->lr.roc) - pnfs_roc_release(&data->lr.arg, &data->lr.res, - data->res.lr_ret); - nfs_post_op_update_inode_force_wcc(inode, &data->fattr); + nfs4_fattr_set_prechange(&data->fattr, + inode_peek_iversion_raw(inode)); + nfs_refresh_inode(inode, &data->fattr); nfs_iput_and_deactive(inode); } kfree(calldata); @@ -5684,11 +6861,20 @@ static void nfs4_delegreturn_release(void *calldata) static void nfs4_delegreturn_prepare(struct rpc_task *task, void *data) { struct nfs4_delegreturndata *d_data; + struct pnfs_layout_hdr *lo; - d_data = (struct nfs4_delegreturndata *)data; + d_data = data; - if (!d_data->lr.roc && nfs4_wait_on_layoutreturn(d_data->inode, task)) + if (!d_data->lr.roc && nfs4_wait_on_layoutreturn(d_data->inode, task)) { + nfs4_sequence_done(task, &d_data->res.seq_res); return; + } + + lo = d_data->args.lr_args ? d_data->args.lr_args->layout : NULL; + if (lo && !pnfs_layout_is_valid(lo)) { + d_data->args.lr_args = NULL; + d_data->res.lr_res = NULL; + } nfs4_setup_sequence(d_data->res.server->nfs_client, &d_data->args.seq_args, @@ -5702,7 +6888,10 @@ static const struct rpc_call_ops nfs4_delegreturn_ops = { .rpc_release = nfs4_delegreturn_release, }; -static int _nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4_stateid *stateid, int issync) +static int _nfs4_proc_delegreturn(struct inode *inode, const struct cred *cred, + const nfs4_stateid *stateid, + struct nfs_delegation *delegation, + int issync) { struct nfs4_delegreturndata *data; struct nfs_server *server = NFS_SERVER(inode); @@ -5715,14 +6904,16 @@ static int _nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, co .rpc_client = server->client, .rpc_message = &msg, .callback_ops = &nfs4_delegreturn_ops, - .flags = RPC_TASK_ASYNC, + .flags = RPC_TASK_ASYNC | RPC_TASK_TIMEOUT, }; int status = 0; - data = kzalloc(sizeof(*data), GFP_NOFS); + if (nfs_server_capable(inode, NFS_CAP_MOVEABLE)) + task_setup_data.flags |= RPC_TASK_MOVEABLE; + + data = kzalloc(sizeof(*data), GFP_KERNEL); if (data == NULL) return -ENOMEM; - nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 1); nfs4_state_protect(server->nfs_client, NFS_SP4_MACH_CRED_CLEANUP, @@ -5730,7 +6921,9 @@ static int _nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, co data->args.fhandle = &data->fh; data->args.stateid = &data->stateid; - data->args.bitmask = server->cache_consistency_bitmask; + nfs4_bitmask_set(data->args.bitmask_store, + server->cache_consistency_bitmask, inode, 0); + data->args.bitmask = data->args.bitmask_store; nfs_copy_fh(&data->fh, NFS_FH(inode)); nfs4_stateid_copy(&data->stateid, stateid); data->res.fattr = &data->fattr; @@ -5740,18 +6933,37 @@ static int _nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, co nfs_fattr_init(data->res.fattr); data->timestamp = jiffies; data->rpc_status = 0; - data->lr.roc = pnfs_roc(inode, &data->lr.arg, &data->lr.res, cred); data->inode = nfs_igrab_and_active(inode); - if (data->inode) { + if (data->inode || issync) { + data->lr.roc = pnfs_roc(inode, &data->lr.arg, &data->lr.res, + cred); if (data->lr.roc) { data->args.lr_args = &data->lr.arg; data->res.lr_res = &data->lr.res; } - } else if (data->lr.roc) { - pnfs_roc_release(&data->lr.arg, &data->lr.res, 0); - data->lr.roc = false; } + if (delegation && + test_bit(NFS_DELEGATION_DELEGTIME, &delegation->flags)) { + if (delegation->type & FMODE_READ) { + data->sattr.atime = inode_get_atime(inode); + data->sattr.atime_set = true; + } + if (delegation->type & FMODE_WRITE) { + data->sattr.mtime = inode_get_mtime(inode); + data->sattr.mtime_set = true; + } + data->args.sattr_args = &data->sattr; + data->res.sattr_res = true; + } + + if (!data->inode) + nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 1, + 1); + else + nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 1, + 0); + task_setup_data.callback_data = data; msg.rpc_argp = &data->args; msg.rpc_resp = &data->res; @@ -5769,13 +6981,16 @@ out: return status; } -int nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4_stateid *stateid, int issync) +int nfs4_proc_delegreturn(struct inode *inode, const struct cred *cred, + const nfs4_stateid *stateid, + struct nfs_delegation *delegation, int issync) { struct nfs_server *server = NFS_SERVER(inode); struct nfs4_exception exception = { }; int err; do { - err = _nfs4_proc_delegreturn(inode, cred, stateid, issync); + err = _nfs4_proc_delegreturn(inode, cred, stateid, + delegation, issync); trace_nfs4_delegreturn(inode, stateid, err); switch (err) { case -NFS4ERR_STALE_STATEID: @@ -5819,7 +7034,7 @@ static int _nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1); switch (status) { case 0: - request->fl_type = F_UNLCK; + request->c.flc_type = F_UNLCK; break; case -NFS4ERR_DENIED: status = 0; @@ -5832,7 +7047,9 @@ out: static int nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock *request) { - struct nfs4_exception exception = { }; + struct nfs4_exception exception = { + .interruptible = true, + }; int err; do { @@ -5844,6 +7061,42 @@ static int nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock * return err; } +/* + * Update the seqid of a lock stateid after receiving + * NFS4ERR_OLD_STATEID + */ +static bool nfs4_refresh_lock_old_stateid(nfs4_stateid *dst, + struct nfs4_lock_state *lsp) +{ + struct nfs4_state *state = lsp->ls_state; + bool ret = false; + + spin_lock(&state->state_lock); + if (!nfs4_stateid_match_other(dst, &lsp->ls_stateid)) + goto out; + if (!nfs4_stateid_is_newer(&lsp->ls_stateid, dst)) + nfs4_stateid_seqid_inc(dst); + else + dst->seqid = lsp->ls_stateid.seqid; + ret = true; +out: + spin_unlock(&state->state_lock); + return ret; +} + +static bool nfs4_sync_lock_stateid(nfs4_stateid *dst, + struct nfs4_lock_state *lsp) +{ + struct nfs4_state *state = lsp->ls_state; + bool ret; + + spin_lock(&state->state_lock); + ret = !nfs4_stateid_match_other(dst, &lsp->ls_stateid); + nfs4_stateid_copy(dst, &lsp->ls_stateid); + spin_unlock(&state->state_lock); + return ret; +} + struct nfs4_unlockdata { struct nfs_locku_args arg; struct nfs_locku_res res; @@ -5853,6 +7106,7 @@ struct nfs4_unlockdata { struct file_lock fl; struct nfs_server *server; unsigned long timestamp; + unsigned short retrans; }; static struct nfs4_unlockdata *nfs4_alloc_unlockdata(struct file_lock *fl, @@ -5861,22 +7115,33 @@ static struct nfs4_unlockdata *nfs4_alloc_unlockdata(struct file_lock *fl, struct nfs_seqid *seqid) { struct nfs4_unlockdata *p; - struct inode *inode = lsp->ls_state->inode; + struct nfs4_state *state = lsp->ls_state; + struct inode *inode = state->inode; + struct nfs_lock_context *l_ctx; - p = kzalloc(sizeof(*p), GFP_NOFS); + p = kzalloc(sizeof(*p), GFP_KERNEL); if (p == NULL) return NULL; + l_ctx = nfs_get_lock_context(ctx); + if (!IS_ERR(l_ctx)) { + p->l_ctx = l_ctx; + } else { + kfree(p); + return NULL; + } p->arg.fh = NFS_FH(inode); p->arg.fl = &p->fl; p->arg.seqid = seqid; p->res.seqid = seqid; p->lsp = lsp; - atomic_inc(&lsp->ls_count); /* Ensure we don't close file until we're done freeing locks! */ p->ctx = get_nfs_open_context(ctx); - p->l_ctx = nfs_get_lock_context(ctx); - memcpy(&p->fl, fl, sizeof(p->fl)); + locks_init_lock(&p->fl); + locks_copy_lock(&p->fl, fl); p->server = NFS_SERVER(inode); + spin_lock(&state->state_lock); + nfs4_stateid_copy(&p->arg.stateid, &lsp->ls_stateid); + spin_unlock(&state->state_lock); return p; } @@ -5893,6 +7158,11 @@ static void nfs4_locku_release_calldata(void *data) static void nfs4_locku_done(struct rpc_task *task, void *data) { struct nfs4_unlockdata *calldata = data; + struct nfs4_exception exception = { + .inode = calldata->lsp->ls_state->inode, + .stateid = &calldata->arg.stateid, + .retrans = calldata->retrans, + }; if (!nfs4_sequence_done(task, &calldata->res.seq_res)) return; @@ -5903,21 +7173,30 @@ static void nfs4_locku_done(struct rpc_task *task, void *data) if (nfs4_update_lock_stateid(calldata->lsp, &calldata->res.stateid)) break; + fallthrough; case -NFS4ERR_ADMIN_REVOKED: case -NFS4ERR_EXPIRED: nfs4_free_revoked_stateid(calldata->server, &calldata->arg.stateid, task->tk_msg.rpc_cred); + fallthrough; case -NFS4ERR_BAD_STATEID: - case -NFS4ERR_OLD_STATEID: case -NFS4ERR_STALE_STATEID: - if (!nfs4_stateid_match(&calldata->arg.stateid, - &calldata->lsp->ls_stateid)) + if (nfs4_sync_lock_stateid(&calldata->arg.stateid, + calldata->lsp)) + rpc_restart_call_prepare(task); + break; + case -NFS4ERR_OLD_STATEID: + if (nfs4_refresh_lock_old_stateid(&calldata->arg.stateid, + calldata->lsp)) rpc_restart_call_prepare(task); break; default: - if (nfs4_async_handle_error(task, calldata->server, - NULL, NULL) == -EAGAIN) + task->tk_status = nfs4_async_handle_exception(task, + calldata->server, task->tk_status, + &exception); + calldata->retrans = exception.retrans; + if (exception.retry) rpc_restart_call_prepare(task); } nfs_release_seqid(calldata->arg.seqid); @@ -5933,7 +7212,6 @@ static void nfs4_locku_prepare(struct rpc_task *task, void *data) if (nfs_wait_on_sequence(calldata->arg.seqid, task) != 0) goto out_wait; - nfs4_stateid_copy(&calldata->arg.stateid, &calldata->lsp->ls_stateid); if (test_bit(NFS_LOCK_INITIALIZED, &calldata->lsp->ls_flags) == 0) { /* Note: exit _without_ running nfs4_locku_done */ goto out_no_action; @@ -5975,14 +7253,17 @@ static struct rpc_task *nfs4_do_unlck(struct file_lock *fl, .flags = RPC_TASK_ASYNC, }; + if (nfs_server_capable(lsp->ls_state->inode, NFS_CAP_MOVEABLE)) + task_setup_data.flags |= RPC_TASK_MOVEABLE; + nfs4_state_protect(NFS_SERVER(lsp->ls_state->inode)->nfs_client, NFS_SP4_MACH_CRED_CLEANUP, &task_setup_data.rpc_client, &msg); /* Ensure this is an unlock - when canceling a lock, the * canceled lock is passed in, and it won't be an unlock. */ - fl->fl_type = F_UNLCK; - if (fl->fl_flags & FL_CLOSE) + fl->c.flc_type = F_UNLCK; + if (fl->c.flc_flags & FL_CLOSE) set_bit(NFS_CONTEXT_UNLOCK, &ctx->flags); data = nfs4_alloc_unlockdata(fl, ctx, lsp, seqid); @@ -5991,7 +7272,7 @@ static struct rpc_task *nfs4_do_unlck(struct file_lock *fl, return ERR_PTR(-ENOMEM); } - nfs4_init_sequence(&data->arg.seq_args, &data->res.seq_res, 1); + nfs4_init_sequence(&data->arg.seq_args, &data->res.seq_res, 1, 0); msg.rpc_argp = &data->arg; msg.rpc_resp = &data->res; task_setup_data.callback_data = data; @@ -6008,11 +7289,11 @@ static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock * struct rpc_task *task; struct nfs_seqid *(*alloc_seqid)(struct nfs_seqid_counter *, gfp_t); int status = 0; - unsigned char fl_flags = request->fl_flags; + unsigned char saved_flags = request->c.flc_flags; status = nfs4_set_lock_state(state, request); /* Unlock _before_ we do the RPC call */ - request->fl_flags |= FL_EXISTS; + request->c.flc_flags |= FL_EXISTS; /* Exclude nfs_delegation_claim_locks() */ mutex_lock(&sp->so_delegreturn_mutex); /* Exclude nfs4_reclaim_open_stateid() - note nesting! */ @@ -6022,12 +7303,13 @@ static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock * mutex_unlock(&sp->so_delegreturn_mutex); goto out; } + lsp = request->fl_u.nfs4_fl.owner; + set_bit(NFS_LOCK_UNLOCKING, &lsp->ls_flags); up_read(&nfsi->rwsem); mutex_unlock(&sp->so_delegreturn_mutex); if (status != 0) goto out; /* Is this a delegated lock? */ - lsp = request->fl_u.nfs4_fl.owner; if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags) == 0) goto out; alloc_seqid = NFS_SERVER(inode)->nfs_client->cl_mvops->alloc_seqid; @@ -6035,14 +7317,16 @@ static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock * status = -ENOMEM; if (IS_ERR(seqid)) goto out; - task = nfs4_do_unlck(request, nfs_file_open_context(request->fl_file), lsp, seqid); + task = nfs4_do_unlck(request, + nfs_file_open_context(request->c.flc_file), + lsp, seqid); status = PTR_ERR(task); if (IS_ERR(task)) goto out; status = rpc_wait_for_completion_task(task); rpc_put_task(task); out: - request->fl_flags = fl_flags; + request->c.flc_flags = saved_flags; trace_nfs4_unlock(request, state, F_SETLK, status); return status; } @@ -6087,9 +7371,9 @@ static struct nfs4_lockdata *nfs4_alloc_lockdata(struct file_lock *fl, p->res.lock_seqid = p->arg.lock_seqid; p->lsp = lsp; p->server = server; - atomic_inc(&lsp->ls_count); p->ctx = get_nfs_open_context(ctx); - memcpy(&p->fl, fl, sizeof(p->fl)); + locks_init_lock(&p->fl); + locks_copy_lock(&p->fl, fl); return p; out_free_seqid: nfs_free_seqid(p->arg.open_seqid); @@ -6103,7 +7387,6 @@ static void nfs4_lock_prepare(struct rpc_task *task, void *calldata) struct nfs4_lockdata *data = calldata; struct nfs4_state *state = data->lsp->ls_state; - dprintk("%s: begin!\n", __func__); if (nfs_wait_on_sequence(data->arg.lock_seqid, task) != 0) goto out_wait; /* Do we need to do an open_to_lock_owner? */ @@ -6137,7 +7420,7 @@ out_release_lock_seqid: nfs_release_seqid(data->arg.lock_seqid); out_wait: nfs4_sequence_done(task, &data->res.seq_res); - dprintk("%s: done!, ret = %d\n", __func__, data->rpc_status); + dprintk("%s: ret = %d\n", __func__, data->rpc_status); } static void nfs4_lock_done(struct rpc_task *task, void *calldata) @@ -6145,8 +7428,6 @@ static void nfs4_lock_done(struct rpc_task *task, void *calldata) struct nfs4_lockdata *data = calldata; struct nfs4_lock_state *lsp = data->lsp; - dprintk("%s: begin!\n", __func__); - if (!nfs4_sequence_done(task, &data->res.seq_res)) return; @@ -6155,42 +7436,52 @@ static void nfs4_lock_done(struct rpc_task *task, void *calldata) case 0: renew_lease(NFS_SERVER(d_inode(data->ctx->dentry)), data->timestamp); - if (data->arg.new_lock) { - data->fl.fl_flags &= ~(FL_SLEEP | FL_ACCESS); - if (locks_lock_inode_wait(lsp->ls_state->inode, &data->fl) < 0) { - rpc_restart_call_prepare(task); - break; - } + if (data->arg.new_lock && !data->cancelled) { + data->fl.c.flc_flags &= ~(FL_SLEEP | FL_ACCESS); + if (locks_lock_inode_wait(lsp->ls_state->inode, &data->fl) < 0) + goto out_restart; } if (data->arg.new_lock_owner != 0) { nfs_confirm_seqid(&lsp->ls_seqid, 0); nfs4_stateid_copy(&lsp->ls_stateid, &data->res.stateid); set_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags); } else if (!nfs4_update_lock_stateid(lsp, &data->res.stateid)) - rpc_restart_call_prepare(task); + goto out_restart; break; - case -NFS4ERR_BAD_STATEID: case -NFS4ERR_OLD_STATEID: + if (data->arg.new_lock_owner != 0 && + nfs4_refresh_open_old_stateid(&data->arg.open_stateid, + lsp->ls_state)) + goto out_restart; + if (nfs4_refresh_lock_old_stateid(&data->arg.lock_stateid, lsp)) + goto out_restart; + fallthrough; + case -NFS4ERR_BAD_STATEID: case -NFS4ERR_STALE_STATEID: case -NFS4ERR_EXPIRED: if (data->arg.new_lock_owner != 0) { if (!nfs4_stateid_match(&data->arg.open_stateid, &lsp->ls_state->open_stateid)) - rpc_restart_call_prepare(task); + goto out_restart; } else if (!nfs4_stateid_match(&data->arg.lock_stateid, &lsp->ls_stateid)) - rpc_restart_call_prepare(task); + goto out_restart; } - dprintk("%s: done, ret = %d!\n", __func__, data->rpc_status); +out_done: + dprintk("%s: ret = %d!\n", __func__, data->rpc_status); + return; +out_restart: + if (!data->cancelled) + rpc_restart_call_prepare(task); + goto out_done; } static void nfs4_lock_release(void *calldata) { struct nfs4_lockdata *data = calldata; - dprintk("%s: begin!\n", __func__); nfs_free_seqid(data->arg.open_seqid); - if (data->cancelled) { + if (data->cancelled && data->rpc_status == 0) { struct rpc_task *task; task = nfs4_do_unlck(&data->fl, data->ctx, data->lsp, data->arg.lock_seqid); @@ -6202,7 +7493,6 @@ static void nfs4_lock_release(void *calldata) nfs4_put_lock_state(data->lsp); put_nfs_open_context(data->ctx); kfree(data); - dprintk("%s: done!\n", __func__); } static const struct rpc_call_ops nfs4_lock_ops = { @@ -6225,7 +7515,7 @@ static void nfs4_handle_setlk_error(struct nfs_server *server, struct nfs4_lock_ case -NFS4ERR_STALE_STATEID: lsp->ls_seqid.flags &= ~NFS_SEQID_CONFIRMED; nfs4_schedule_lease_recovery(server->nfs_client); - }; + } } static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *fl, int recovery_type) @@ -6241,26 +7531,28 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *f .rpc_message = &msg, .callback_ops = &nfs4_lock_ops, .workqueue = nfsiod_workqueue, - .flags = RPC_TASK_ASYNC, + .flags = RPC_TASK_ASYNC | RPC_TASK_CRED_NOREF, }; int ret; - dprintk("%s: begin!\n", __func__); - data = nfs4_alloc_lockdata(fl, nfs_file_open_context(fl->fl_file), - fl->fl_u.nfs4_fl.owner, - recovery_type == NFS_LOCK_NEW ? GFP_KERNEL : GFP_NOFS); + if (nfs_server_capable(state->inode, NFS_CAP_MOVEABLE)) + task_setup_data.flags |= RPC_TASK_MOVEABLE; + + data = nfs4_alloc_lockdata(fl, + nfs_file_open_context(fl->c.flc_file), + fl->fl_u.nfs4_fl.owner, GFP_KERNEL); if (data == NULL) return -ENOMEM; if (IS_SETLKW(cmd)) data->arg.block = 1; - nfs4_init_sequence(&data->arg.seq_args, &data->res.seq_res, 1); + nfs4_init_sequence(&data->arg.seq_args, &data->res.seq_res, 1, + recovery_type > NFS_LOCK_NEW); msg.rpc_argp = &data->arg; msg.rpc_resp = &data->res; task_setup_data.callback_data = data; if (recovery_type > NFS_LOCK_NEW) { if (recovery_type == NFS_LOCK_RECLAIM) data->arg.reclaim = NFS_LOCK_RECLAIM; - nfs4_set_sequence_privileged(&data->arg.seq_args); } else data->arg.new_lock = 1; task = rpc_run_task(&task_setup_data); @@ -6274,9 +7566,9 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *f data->arg.new_lock_owner, ret); } else data->cancelled = true; - rpc_put_task(task); - dprintk("%s: done, ret = %d!\n", __func__, ret); trace_nfs4_set_lock(fl, state, &data->res.stateid, cmd, ret); + rpc_put_task(task); + dprintk("%s: ret = %d\n", __func__, ret); return ret; } @@ -6353,10 +7645,10 @@ static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock { struct nfs_inode *nfsi = NFS_I(state->inode); struct nfs4_state_owner *sp = state->owner; - unsigned char fl_flags = request->fl_flags; + unsigned char flags = request->c.flc_flags; int status; - request->fl_flags |= FL_ACCESS; + request->c.flc_flags |= FL_ACCESS; status = locks_lock_inode_wait(state->inode, request); if (status < 0) goto out; @@ -6365,7 +7657,7 @@ static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock if (test_bit(NFS_DELEGATED_STATE, &state->flags)) { /* Yes: cache locks! */ /* ...but avoid races with delegation recall... */ - request->fl_flags = fl_flags & ~FL_SLEEP; + request->c.flc_flags = flags & ~FL_SLEEP; status = locks_lock_inode_wait(state->inode, request); up_read(&nfsi->rwsem); mutex_unlock(&sp->so_delegreturn_mutex); @@ -6375,7 +7667,7 @@ static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock mutex_unlock(&sp->so_delegreturn_mutex); status = _nfs4_do_setlk(state, cmd, request, NFS_LOCK_NEW); out: - request->fl_flags = fl_flags; + request->c.flc_flags = flags; return status; } @@ -6384,6 +7676,7 @@ static int nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock * struct nfs4_exception exception = { .state = state, .inode = state->inode, + .interruptible = true, }; int err; @@ -6411,7 +7704,8 @@ nfs4_retry_setlk_simple(struct nfs4_state *state, int cmd, status = nfs4_proc_setlk(state, cmd, request); if ((status != -EAGAIN) || IS_SETLK(cmd)) break; - freezable_schedule_timeout_interruptible(timeout); + __set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE); + schedule_timeout(timeout); timeout *= 2; timeout = min_t(unsigned long, NFS4_LOCK_MAXTIMEOUT, timeout); status = -ERESTARTSYS; @@ -6421,85 +7715,70 @@ nfs4_retry_setlk_simple(struct nfs4_state *state, int cmd, #ifdef CONFIG_NFS_V4_1 struct nfs4_lock_waiter { - struct task_struct *task; struct inode *inode; - struct nfs_lowner *owner; - bool notified; + struct nfs_lowner owner; + wait_queue_entry_t wait; }; static int nfs4_wake_lock_waiter(wait_queue_entry_t *wait, unsigned int mode, int flags, void *key) { - int ret; - struct cb_notify_lock_args *cbnl = key; - struct nfs4_lock_waiter *waiter = wait->private; - struct nfs_lowner *lowner = &cbnl->cbnl_owner, - *wowner = waiter->owner; - - /* Only wake if the callback was for the same owner */ - if (lowner->clientid != wowner->clientid || - lowner->id != wowner->id || - lowner->s_dev != wowner->s_dev) - return 0; + struct nfs4_lock_waiter *waiter = + container_of(wait, struct nfs4_lock_waiter, wait); - /* Make sure it's for the right inode */ - if (nfs_compare_fh(NFS_FH(waiter->inode), &cbnl->cbnl_fh)) - return 0; + /* NULL key means to wake up everyone */ + if (key) { + struct cb_notify_lock_args *cbnl = key; + struct nfs_lowner *lowner = &cbnl->cbnl_owner, + *wowner = &waiter->owner; - waiter->notified = true; + /* Only wake if the callback was for the same owner. */ + if (lowner->id != wowner->id || lowner->s_dev != wowner->s_dev) + return 0; - /* override "private" so we can use default_wake_function */ - wait->private = waiter->task; - ret = autoremove_wake_function(wait, mode, flags, key); - wait->private = waiter; - return ret; + /* Make sure it's for the right inode */ + if (nfs_compare_fh(NFS_FH(waiter->inode), &cbnl->cbnl_fh)) + return 0; + } + + return woken_wake_function(wait, mode, flags, key); } static int nfs4_retry_setlk(struct nfs4_state *state, int cmd, struct file_lock *request) { - int status = -ERESTARTSYS; - unsigned long flags; struct nfs4_lock_state *lsp = request->fl_u.nfs4_fl.owner; struct nfs_server *server = NFS_SERVER(state->inode); struct nfs_client *clp = server->nfs_client; wait_queue_head_t *q = &clp->cl_lock_waitq; - struct nfs_lowner owner = { .clientid = clp->cl_clientid, - .id = lsp->ls_seqid.owner_id, - .s_dev = server->s_dev }; - struct nfs4_lock_waiter waiter = { .task = current, - .inode = state->inode, - .owner = &owner, - .notified = false }; - wait_queue_entry_t wait; + struct nfs4_lock_waiter waiter = { + .inode = state->inode, + .owner = { .clientid = clp->cl_clientid, + .id = lsp->ls_seqid.owner_id, + .s_dev = server->s_dev }, + }; + int status; /* Don't bother with waitqueue if we don't expect a callback */ if (!test_bit(NFS_STATE_MAY_NOTIFY_LOCK, &state->flags)) return nfs4_retry_setlk_simple(state, cmd, request); - init_wait(&wait); - wait.private = &waiter; - wait.func = nfs4_wake_lock_waiter; - add_wait_queue(q, &wait); + init_wait(&waiter.wait); + waiter.wait.func = nfs4_wake_lock_waiter; + add_wait_queue(q, &waiter.wait); - while(!signalled()) { + do { status = nfs4_proc_setlk(state, cmd, request); - if ((status != -EAGAIN) || IS_SETLK(cmd)) + if (status != -EAGAIN || IS_SETLK(cmd)) break; status = -ERESTARTSYS; - spin_lock_irqsave(&q->lock, flags); - if (waiter.notified) { - spin_unlock_irqrestore(&q->lock, flags); - continue; - } - set_current_state(TASK_INTERRUPTIBLE); - spin_unlock_irqrestore(&q->lock, flags); + wait_woken(&waiter.wait, TASK_INTERRUPTIBLE|TASK_FREEZABLE, + NFS4_LOCK_MAXTIMEOUT); + } while (!signalled()); - freezable_schedule_timeout(NFS4_LOCK_MAXTIMEOUT); - } + remove_wait_queue(q, &waiter.wait); - finish_wait(q, &wait); return status; } #else /* !CONFIG_NFS_V4_1 */ @@ -6530,7 +7809,7 @@ nfs4_proc_lock(struct file *filp, int cmd, struct file_lock *request) if (!(IS_SETLK(cmd) || IS_SETLKW(cmd))) return -EINVAL; - if (request->fl_type == F_UNLCK) { + if (lock_is_unlock(request)) { if (state != NULL) return nfs4_proc_unlck(state, cmd, request); return 0; @@ -6539,10 +7818,24 @@ nfs4_proc_lock(struct file *filp, int cmd, struct file_lock *request) if (state == NULL) return -ENOLCK; - if ((request->fl_flags & FL_POSIX) && + if ((request->c.flc_flags & FL_POSIX) && !test_bit(NFS_STATE_POSIX_LOCKS, &state->flags)) return -ENOLCK; + /* + * Don't rely on the VFS having checked the file open mode, + * since it won't do this for flock() locks. + */ + switch (request->c.flc_type) { + case F_RDLCK: + if (!(filp->f_mode & FMODE_READ)) + return -EBADF; + break; + case F_WRLCK: + if (!(filp->f_mode & FMODE_WRITE)) + return -EBADF; + } + status = nfs4_set_lock_state(state, request); if (status != 0) return status; @@ -6550,6 +7843,43 @@ nfs4_proc_lock(struct file *filp, int cmd, struct file_lock *request) return nfs4_retry_setlk(state, cmd, request); } +static int nfs4_delete_lease(struct file *file, void **priv) +{ + return generic_setlease(file, F_UNLCK, NULL, priv); +} + +static int nfs4_add_lease(struct file *file, int arg, struct file_lease **lease, + void **priv) +{ + struct inode *inode = file_inode(file); + fmode_t type = arg == F_RDLCK ? FMODE_READ : FMODE_WRITE; + int ret; + + /* No delegation, no lease */ + if (!nfs4_have_delegation(inode, type, 0)) + return -EAGAIN; + ret = generic_setlease(file, arg, lease, priv); + if (ret || nfs4_have_delegation(inode, type, 0)) + return ret; + /* We raced with a delegation return */ + nfs4_delete_lease(file, priv); + return -EAGAIN; +} + +int nfs4_proc_setlease(struct file *file, int arg, struct file_lease **lease, + void **priv) +{ + switch (arg) { + case F_RDLCK: + case F_WRLCK: + return nfs4_add_lease(file, arg, lease, priv); + case F_UNLCK: + return nfs4_delete_lease(file, priv); + default: + return -EINVAL; + } +} + int nfs4_lock_delegation_recall(struct file_lock *fl, struct nfs4_state *state, const nfs4_stateid *stateid) { struct nfs_server *server = NFS_SERVER(state->inode); @@ -6558,8 +7888,13 @@ int nfs4_lock_delegation_recall(struct file_lock *fl, struct nfs4_state *state, err = nfs4_set_lock_state(state, fl); if (err != 0) return err; - err = _nfs4_do_setlk(state, F_SETLK, fl, NFS_LOCK_NEW); - return nfs4_handle_delegation_recall_error(server, state, stateid, err); + do { + err = _nfs4_do_setlk(state, F_SETLK, fl, NFS_LOCK_NEW); + if (err != -NFS4ERR_DELAY && err != -NFS4ERR_GRACE) + break; + ssleep(1); + } while (err == -NFS4ERR_DELAY || err == -NFSERR_GRACE); + return nfs4_handle_delegation_recall_error(server, state, stateid, fl, err); } struct nfs_release_lockowner_data { @@ -6627,7 +7962,7 @@ nfs4_release_lockowner(struct nfs_server *server, struct nfs4_lock_state *lsp) if (server->nfs_client->cl_mvops->minor_version != 0) return; - data = kmalloc(sizeof(*data), GFP_NOFS); + data = kmalloc(sizeof(*data), GFP_KERNEL); if (!data) return; data->lsp = lsp; @@ -6638,35 +7973,86 @@ nfs4_release_lockowner(struct nfs_server *server, struct nfs4_lock_state *lsp) msg.rpc_argp = &data->args; msg.rpc_resp = &data->res; - nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 0); + nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 0, 0); rpc_call_async(server->client, &msg, 0, &nfs4_release_lockowner_ops, data); } #define XATTR_NAME_NFSV4_ACL "system.nfs4_acl" static int nfs4_xattr_set_nfs4_acl(const struct xattr_handler *handler, + struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *key, const void *buf, size_t buflen, int flags) { - return nfs4_proc_set_acl(inode, buf, buflen); + return nfs4_proc_set_acl(inode, buf, buflen, NFS4ACL_ACL); } static int nfs4_xattr_get_nfs4_acl(const struct xattr_handler *handler, struct dentry *unused, struct inode *inode, const char *key, void *buf, size_t buflen) { - return nfs4_proc_get_acl(inode, buf, buflen); + return nfs4_proc_get_acl(inode, buf, buflen, NFS4ACL_ACL); } static bool nfs4_xattr_list_nfs4_acl(struct dentry *dentry) { - return nfs4_server_supports_acls(NFS_SERVER(d_inode(dentry))); + return nfs4_server_supports_acls(NFS_SB(dentry->d_sb), NFS4ACL_ACL); } +#if defined(CONFIG_NFS_V4_1) +#define XATTR_NAME_NFSV4_DACL "system.nfs4_dacl" + +static int nfs4_xattr_set_nfs4_dacl(const struct xattr_handler *handler, + struct mnt_idmap *idmap, + struct dentry *unused, struct inode *inode, + const char *key, const void *buf, + size_t buflen, int flags) +{ + return nfs4_proc_set_acl(inode, buf, buflen, NFS4ACL_DACL); +} + +static int nfs4_xattr_get_nfs4_dacl(const struct xattr_handler *handler, + struct dentry *unused, struct inode *inode, + const char *key, void *buf, size_t buflen) +{ + return nfs4_proc_get_acl(inode, buf, buflen, NFS4ACL_DACL); +} + +static bool nfs4_xattr_list_nfs4_dacl(struct dentry *dentry) +{ + return nfs4_server_supports_acls(NFS_SB(dentry->d_sb), NFS4ACL_DACL); +} + +#define XATTR_NAME_NFSV4_SACL "system.nfs4_sacl" + +static int nfs4_xattr_set_nfs4_sacl(const struct xattr_handler *handler, + struct mnt_idmap *idmap, + struct dentry *unused, struct inode *inode, + const char *key, const void *buf, + size_t buflen, int flags) +{ + return nfs4_proc_set_acl(inode, buf, buflen, NFS4ACL_SACL); +} + +static int nfs4_xattr_get_nfs4_sacl(const struct xattr_handler *handler, + struct dentry *unused, struct inode *inode, + const char *key, void *buf, size_t buflen) +{ + return nfs4_proc_get_acl(inode, buf, buflen, NFS4ACL_SACL); +} + +static bool nfs4_xattr_list_nfs4_sacl(struct dentry *dentry) +{ + return nfs4_server_supports_acls(NFS_SB(dentry->d_sb), NFS4ACL_SACL); +} + +#endif + #ifdef CONFIG_NFS_V4_SECURITY_LABEL static int nfs4_xattr_set_nfs4_label(const struct xattr_handler *handler, + struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *key, const void *buf, size_t buflen, int flags) @@ -6693,7 +8079,7 @@ nfs4_listxattr_nfs4_label(struct inode *inode, char *list, size_t list_len) if (nfs_server_capable(inode, NFS_CAP_SECURITY_LABEL)) { len = security_inode_listsecurity(inode, list, list_len); - if (list_len && len > list_len) + if (len >= 0 && list_len && len > list_len) return -ERANGE; } return len; @@ -6715,6 +8101,134 @@ nfs4_listxattr_nfs4_label(struct inode *inode, char *list, size_t list_len) #endif +#ifdef CONFIG_NFS_V4_2 +static int nfs4_xattr_set_nfs4_user(const struct xattr_handler *handler, + struct mnt_idmap *idmap, + struct dentry *unused, struct inode *inode, + const char *key, const void *buf, + size_t buflen, int flags) +{ + u32 mask; + int ret; + + if (!nfs_server_capable(inode, NFS_CAP_XATTR)) + return -EOPNOTSUPP; + + /* + * There is no mapping from the MAY_* flags to the NFS_ACCESS_XA* + * flags right now. Handling of xattr operations use the normal + * file read/write permissions. + * + * Just in case the server has other ideas (which RFC 8276 allows), + * do a cached access check for the XA* flags to possibly avoid + * doing an RPC and getting EACCES back. + */ + if (!nfs_access_get_cached(inode, current_cred(), &mask, true)) { + if (!(mask & NFS_ACCESS_XAWRITE)) + return -EACCES; + } + + if (buf == NULL) { + ret = nfs42_proc_removexattr(inode, key); + if (!ret) + nfs4_xattr_cache_remove(inode, key); + } else { + ret = nfs42_proc_setxattr(inode, key, buf, buflen, flags); + if (!ret) + nfs4_xattr_cache_add(inode, key, buf, NULL, buflen); + } + + return ret; +} + +static int nfs4_xattr_get_nfs4_user(const struct xattr_handler *handler, + struct dentry *unused, struct inode *inode, + const char *key, void *buf, size_t buflen) +{ + u32 mask; + ssize_t ret; + + if (!nfs_server_capable(inode, NFS_CAP_XATTR)) + return -EOPNOTSUPP; + + if (!nfs_access_get_cached(inode, current_cred(), &mask, true)) { + if (!(mask & NFS_ACCESS_XAREAD)) + return -EACCES; + } + + ret = nfs_revalidate_inode(inode, NFS_INO_INVALID_CHANGE); + if (ret) + return ret; + + ret = nfs4_xattr_cache_get(inode, key, buf, buflen); + if (ret >= 0 || (ret < 0 && ret != -ENOENT)) + return ret; + + ret = nfs42_proc_getxattr(inode, key, buf, buflen); + + return ret; +} + +static ssize_t +nfs4_listxattr_nfs4_user(struct inode *inode, char *list, size_t list_len) +{ + u64 cookie; + bool eof; + ssize_t ret, size; + char *buf; + size_t buflen; + u32 mask; + + if (!nfs_server_capable(inode, NFS_CAP_XATTR)) + return 0; + + if (!nfs_access_get_cached(inode, current_cred(), &mask, true)) { + if (!(mask & NFS_ACCESS_XALIST)) + return 0; + } + + ret = nfs_revalidate_inode(inode, NFS_INO_INVALID_CHANGE); + if (ret) + return ret; + + ret = nfs4_xattr_cache_list(inode, list, list_len); + if (ret >= 0 || (ret < 0 && ret != -ENOENT)) + return ret; + + cookie = 0; + eof = false; + buflen = list_len ? list_len : XATTR_LIST_MAX; + buf = list_len ? list : NULL; + size = 0; + + while (!eof) { + ret = nfs42_proc_listxattrs(inode, buf, buflen, + &cookie, &eof); + if (ret < 0) + return ret; + + if (list_len) { + buf += ret; + buflen -= ret; + } + size += ret; + } + + if (list_len) + nfs4_xattr_cache_set_list(inode, list, size); + + return size; +} + +#else + +static ssize_t +nfs4_listxattr_nfs4_user(struct inode *inode, char *list, size_t list_len) +{ + return 0; +} +#endif /* CONFIG_NFS_V4_2 */ + /* * nfs_fhget will use either the mounted_on_fileid or the fileid */ @@ -6738,9 +8252,7 @@ static int _nfs4_proc_fs_locations(struct rpc_clnt *client, struct inode *dir, struct page *page) { struct nfs_server *server = NFS_SERVER(dir); - u32 bitmask[3] = { - [0] = FATTR4_WORD0_FSID | FATTR4_WORD0_FS_LOCATIONS, - }; + u32 bitmask[3]; struct nfs4_fs_locations_arg args = { .dir_fh = NFS_FH(dir), .name = name, @@ -6759,14 +8271,17 @@ static int _nfs4_proc_fs_locations(struct rpc_clnt *client, struct inode *dir, dprintk("%s: start\n", __func__); + bitmask[0] = nfs4_fattr_bitmap[0] | FATTR4_WORD0_FS_LOCATIONS; + bitmask[1] = nfs4_fattr_bitmap[1]; + /* Ask for the fileid of the absent filesystem if mounted_on_fileid * is not supported */ if (NFS_SERVER(dir)->attr_bitmask[1] & FATTR4_WORD1_MOUNTED_ON_FILEID) - bitmask[1] |= FATTR4_WORD1_MOUNTED_ON_FILEID; + bitmask[0] &= ~FATTR4_WORD0_FILEID; else - bitmask[0] |= FATTR4_WORD0_FILEID; + bitmask[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID; - nfs_fattr_init(&fs_locations->fattr); + nfs_fattr_init(fs_locations->fattr); fs_locations->server = server; fs_locations->nlocations = 0; status = nfs4_call_sync(client, server, &msg, &args.seq_args, &res.seq_res, 0); @@ -6779,7 +8294,9 @@ int nfs4_proc_fs_locations(struct rpc_clnt *client, struct inode *dir, struct nfs4_fs_locations *fs_locations, struct page *page) { - struct nfs4_exception exception = { }; + struct nfs4_exception exception = { + .interruptible = true, + }; int err; do { err = _nfs4_proc_fs_locations(client, dir, name, @@ -6798,18 +8315,18 @@ int nfs4_proc_fs_locations(struct rpc_clnt *client, struct inode *dir, * appended to this compound to identify the client ID which is * performing recovery. */ -static int _nfs40_proc_get_locations(struct inode *inode, +static int _nfs40_proc_get_locations(struct nfs_server *server, + struct nfs_fh *fhandle, struct nfs4_fs_locations *locations, - struct page *page, struct rpc_cred *cred) + struct page *page, const struct cred *cred) { - struct nfs_server *server = NFS_SERVER(inode); struct rpc_clnt *clnt = server->client; u32 bitmask[2] = { [0] = FATTR4_WORD0_FSID | FATTR4_WORD0_FS_LOCATIONS, }; struct nfs4_fs_locations_arg args = { .clientid = server->nfs_client->cl_clientid, - .fh = NFS_FH(inode), + .fh = fhandle, .page = page, .bitmask = bitmask, .migration = 1, /* skip LOOKUP */ @@ -6829,12 +8346,11 @@ static int _nfs40_proc_get_locations(struct inode *inode, unsigned long now = jiffies; int status; - nfs_fattr_init(&locations->fattr); + nfs_fattr_init(locations->fattr); locations->server = server; locations->nlocations = 0; - nfs4_init_sequence(&args.seq_args, &res.seq_res, 0); - nfs4_set_sequence_privileged(&args.seq_args); + nfs4_init_sequence(&args.seq_args, &res.seq_res, 0, 1); status = nfs4_call_sync_sequence(clnt, server, &msg, &args.seq_args, &res.seq_res); if (status) @@ -6856,17 +8372,17 @@ static int _nfs40_proc_get_locations(struct inode *inode, * When the client supports GETATTR(fs_locations_info), it can * be plumbed in here. */ -static int _nfs41_proc_get_locations(struct inode *inode, +static int _nfs41_proc_get_locations(struct nfs_server *server, + struct nfs_fh *fhandle, struct nfs4_fs_locations *locations, - struct page *page, struct rpc_cred *cred) + struct page *page, const struct cred *cred) { - struct nfs_server *server = NFS_SERVER(inode); struct rpc_clnt *clnt = server->client; u32 bitmask[2] = { [0] = FATTR4_WORD0_FSID | FATTR4_WORD0_FS_LOCATIONS, }; struct nfs4_fs_locations_arg args = { - .fh = NFS_FH(inode), + .fh = fhandle, .page = page, .bitmask = bitmask, .migration = 1, /* skip LOOKUP */ @@ -6881,16 +8397,26 @@ static int _nfs41_proc_get_locations(struct inode *inode, .rpc_resp = &res, .rpc_cred = cred, }; + struct nfs4_call_sync_data data = { + .seq_server = server, + .seq_args = &args.seq_args, + .seq_res = &res.seq_res, + }; + struct rpc_task_setup task_setup_data = { + .rpc_client = clnt, + .rpc_message = &msg, + .callback_ops = server->nfs_client->cl_mvops->call_sync_ops, + .callback_data = &data, + .flags = RPC_TASK_NO_ROUND_ROBIN, + }; int status; - nfs_fattr_init(&locations->fattr); + nfs_fattr_init(locations->fattr); locations->server = server; locations->nlocations = 0; - nfs4_init_sequence(&args.seq_args, &res.seq_res, 0); - nfs4_set_sequence_privileged(&args.seq_args); - status = nfs4_call_sync_sequence(clnt, server, &msg, - &args.seq_args, &res.seq_res); + nfs4_init_sequence(&args.seq_args, &res.seq_res, 0, 1); + status = nfs4_call_sync_custom(&task_setup_data); if (status == NFS4_OK && res.seq_res.sr_status_flags & SEQ4_STATUS_LEASE_MOVED) status = -NFS4ERR_LEASE_MOVED; @@ -6901,7 +8427,8 @@ static int _nfs41_proc_get_locations(struct inode *inode, /** * nfs4_proc_get_locations - discover locations for a migrated FSID - * @inode: inode on FSID that is migrating + * @server: pointer to nfs_server to process + * @fhandle: pointer to the kernel NFS client file handle * @locations: result of query * @page: buffer * @cred: credential to use for this operation @@ -6916,25 +8443,28 @@ static int _nfs41_proc_get_locations(struct inode *inode, * -NFS4ERR_LEASE_MOVED is returned if the server still has leases * from this client that require migration recovery. */ -int nfs4_proc_get_locations(struct inode *inode, +int nfs4_proc_get_locations(struct nfs_server *server, + struct nfs_fh *fhandle, struct nfs4_fs_locations *locations, - struct page *page, struct rpc_cred *cred) + struct page *page, const struct cred *cred) { - struct nfs_server *server = NFS_SERVER(inode); struct nfs_client *clp = server->nfs_client; const struct nfs4_mig_recovery_ops *ops = clp->cl_mvops->mig_recovery_ops; - struct nfs4_exception exception = { }; + struct nfs4_exception exception = { + .interruptible = true, + }; int status; dprintk("%s: FSID %llx:%llx on \"%s\"\n", __func__, (unsigned long long)server->fsid.major, (unsigned long long)server->fsid.minor, clp->cl_hostname); - nfs_display_fhandle(NFS_FH(inode), __func__); + nfs_display_fhandle(fhandle, __func__); do { - status = ops->get_locations(inode, locations, page, cred); + status = ops->get_locations(server, fhandle, locations, page, + cred); if (status != -NFS4ERR_DELAY) break; nfs4_handle_exception(server, status, &exception); @@ -6949,7 +8479,7 @@ int nfs4_proc_get_locations(struct inode *inode, * is appended to this compound to identify the client ID which is * performing recovery. */ -static int _nfs40_proc_fsid_present(struct inode *inode, struct rpc_cred *cred) +static int _nfs40_proc_fsid_present(struct inode *inode, const struct cred *cred) { struct nfs_server *server = NFS_SERVER(inode); struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; @@ -6975,8 +8505,7 @@ static int _nfs40_proc_fsid_present(struct inode *inode, struct rpc_cred *cred) if (res.fh == NULL) return -ENOMEM; - nfs4_init_sequence(&args.seq_args, &res.seq_res, 0); - nfs4_set_sequence_privileged(&args.seq_args); + nfs4_init_sequence(&args.seq_args, &res.seq_res, 0, 1); status = nfs4_call_sync_sequence(clnt, server, &msg, &args.seq_args, &res.seq_res); nfs_free_fhandle(res.fh); @@ -6996,7 +8525,7 @@ static int _nfs40_proc_fsid_present(struct inode *inode, struct rpc_cred *cred) * this operation is identified in the SEQUENCE operation in this * compound. */ -static int _nfs41_proc_fsid_present(struct inode *inode, struct rpc_cred *cred) +static int _nfs41_proc_fsid_present(struct inode *inode, const struct cred *cred) { struct nfs_server *server = NFS_SERVER(inode); struct rpc_clnt *clnt = server->client; @@ -7017,8 +8546,7 @@ static int _nfs41_proc_fsid_present(struct inode *inode, struct rpc_cred *cred) if (res.fh == NULL) return -ENOMEM; - nfs4_init_sequence(&args.seq_args, &res.seq_res, 0); - nfs4_set_sequence_privileged(&args.seq_args); + nfs4_init_sequence(&args.seq_args, &res.seq_res, 0, 1); status = nfs4_call_sync_sequence(clnt, server, &msg, &args.seq_args, &res.seq_res); nfs_free_fhandle(res.fh); @@ -7044,13 +8572,15 @@ static int _nfs41_proc_fsid_present(struct inode *inode, struct rpc_cred *cred) * NFS4ERR code if some error occurred on the server, or a * negative errno if a local failure occurred. */ -int nfs4_proc_fsid_present(struct inode *inode, struct rpc_cred *cred) +int nfs4_proc_fsid_present(struct inode *inode, const struct cred *cred) { struct nfs_server *server = NFS_SERVER(inode); struct nfs_client *clp = server->nfs_client; const struct nfs4_mig_recovery_ops *ops = clp->cl_mvops->mig_recovery_ops; - struct nfs4_exception exception = { }; + struct nfs4_exception exception = { + .interruptible = true, + }; int status; dprintk("%s: FSID %llx:%llx on \"%s\"\n", __func__, @@ -7068,7 +8598,7 @@ int nfs4_proc_fsid_present(struct inode *inode, struct rpc_cred *cred) return status; } -/** +/* * If 'use_integrity' is true and the state managment nfs_client * cl_rpcclient is using krb5i/p, use the integrity protected cl_rpcclient * and the machine credential as per RFC3530bis and RFC5661 Security @@ -7078,6 +8608,8 @@ int nfs4_proc_fsid_present(struct inode *inode, struct rpc_cred *cred) static int _nfs4_proc_secinfo(struct inode *dir, const struct qstr *name, struct nfs4_secinfo_flavors *flavors, bool use_integrity) { int status; + struct rpc_clnt *clnt = NFS_SERVER(dir)->client; + struct nfs_client *clp = NFS_SERVER(dir)->nfs_client; struct nfs4_secinfo_arg args = { .dir_fh = NFS_FH(dir), .name = name, @@ -7090,34 +8622,46 @@ static int _nfs4_proc_secinfo(struct inode *dir, const struct qstr *name, struct .rpc_argp = &args, .rpc_resp = &res, }; - struct rpc_clnt *clnt = NFS_SERVER(dir)->client; - struct rpc_cred *cred = NULL; + struct nfs4_call_sync_data data = { + .seq_server = NFS_SERVER(dir), + .seq_args = &args.seq_args, + .seq_res = &res.seq_res, + }; + struct rpc_task_setup task_setup = { + .rpc_client = clnt, + .rpc_message = &msg, + .callback_ops = clp->cl_mvops->call_sync_ops, + .callback_data = &data, + .flags = RPC_TASK_NO_ROUND_ROBIN, + }; + const struct cred *cred = NULL; if (use_integrity) { - clnt = NFS_SERVER(dir)->nfs_client->cl_rpcclient; - cred = nfs4_get_clid_cred(NFS_SERVER(dir)->nfs_client); + clnt = clp->cl_rpcclient; + task_setup.rpc_client = clnt; + + cred = nfs4_get_clid_cred(clp); msg.rpc_cred = cred; } dprintk("NFS call secinfo %s\n", name->name); - nfs4_state_protect(NFS_SERVER(dir)->nfs_client, - NFS_SP4_MACH_CRED_SECINFO, &clnt, &msg); + nfs4_state_protect(clp, NFS_SP4_MACH_CRED_SECINFO, &clnt, &msg); + nfs4_init_sequence(&args.seq_args, &res.seq_res, 0, 0); + status = nfs4_call_sync_custom(&task_setup); - status = nfs4_call_sync(clnt, NFS_SERVER(dir), &msg, &args.seq_args, - &res.seq_res, 0); dprintk("NFS reply secinfo: %d\n", status); - if (cred) - put_rpccred(cred); - + put_cred(cred); return status; } int nfs4_proc_secinfo(struct inode *dir, const struct qstr *name, struct nfs4_secinfo_flavors *flavors) { - struct nfs4_exception exception = { }; + struct nfs4_exception exception = { + .interruptible = true, + }; int err; do { err = -NFS4ERR_WRONGSEC; @@ -7148,9 +8692,11 @@ int nfs4_proc_secinfo(struct inode *dir, const struct qstr *name, * both PNFS and NON_PNFS flags set, and not having one of NON_PNFS, PNFS, or * DS flags set. */ -static int nfs4_check_cl_exchange_flags(u32 flags) +static int nfs4_check_cl_exchange_flags(u32 flags, u32 version) { - if (flags & ~EXCHGID4_FLAG_MASK_R) + if (version >= 2 && (flags & ~EXCHGID4_2_FLAG_MASK_R)) + goto out_inval; + else if (version < 2 && (flags & ~EXCHGID4_FLAG_MASK_R)) goto out_inval; if ((flags & EXCHGID4_FLAG_USE_PNFS_MDS) && (flags & EXCHGID4_FLAG_USE_NON_PNFS)) @@ -7174,10 +8720,27 @@ nfs41_same_server_scope(struct nfs41_server_scope *a, static void nfs4_bind_one_conn_to_session_done(struct rpc_task *task, void *calldata) { + struct nfs41_bind_conn_to_session_args *args = task->tk_msg.rpc_argp; + struct nfs41_bind_conn_to_session_res *res = task->tk_msg.rpc_resp; + struct nfs_client *clp = args->client; + + switch (task->tk_status) { + case -NFS4ERR_BADSESSION: + case -NFS4ERR_DEADSESSION: + nfs4_schedule_session_recovery(clp->cl_session, + task->tk_status); + return; + } + if (args->dir == NFS4_CDFC4_FORE_OR_BOTH && + res->dir != NFS4_CDFS4_BOTH) { + rpc_task_close_connection(task); + if (args->retries++ < MAX_BIND_CONN_TO_SESSION_RETRIES) + rpc_restart_call(task); + } } static const struct rpc_call_ops nfs4_bind_one_conn_to_session_ops = { - .rpc_call_done = &nfs4_bind_one_conn_to_session_done, + .rpc_call_done = nfs4_bind_one_conn_to_session_done, }; /* @@ -7190,12 +8753,13 @@ static int nfs4_proc_bind_one_conn_to_session(struct rpc_clnt *clnt, struct rpc_xprt *xprt, struct nfs_client *clp, - struct rpc_cred *cred) + const struct cred *cred) { int status; struct nfs41_bind_conn_to_session_args args = { .client = clp, .dir = NFS4_CDFC4_FORE_OR_BOTH, + .retries = 0, }; struct nfs41_bind_conn_to_session_res res; struct rpc_message msg = { @@ -7252,7 +8816,7 @@ int nfs4_proc_bind_one_conn_to_session(struct rpc_clnt *clnt, struct rpc_bind_conn_calldata { struct nfs_client *clp; - struct rpc_cred *cred; + const struct cred *cred; }; static int @@ -7265,7 +8829,7 @@ nfs4_proc_bind_conn_to_session_callback(struct rpc_clnt *clnt, return nfs4_proc_bind_one_conn_to_session(clnt, xprt, p->clp, p->cred); } -int nfs4_proc_bind_conn_to_session(struct nfs_client *clp, struct rpc_cred *cred) +int nfs4_proc_bind_conn_to_session(struct nfs_client *clp, const struct cred *cred) { struct rpc_bind_conn_calldata data = { .clp = clp, @@ -7319,7 +8883,9 @@ static int nfs4_sp4_select_mode(struct nfs_client *clp, 1 << (OP_DESTROY_SESSION - 32) | 1 << (OP_DESTROY_CLIENTID - 32) }; + unsigned long flags = 0; unsigned int i; + int ret = 0; if (sp->how == SP4_MACH_CRED) { /* Print state protect result */ @@ -7335,7 +8901,8 @@ static int nfs4_sp4_select_mode(struct nfs_client *clp, for (i = 0; i < NFS4_OP_MAP_NUM_WORDS; i++) { if (sp->enforce.u.words[i] & ~supported_enforce[i]) { dfprintk(MOUNT, "sp4_mach_cred: disabled\n"); - return -EINVAL; + ret = -EINVAL; + goto out; } } @@ -7354,10 +8921,11 @@ static int nfs4_sp4_select_mode(struct nfs_client *clp, test_bit(OP_DESTROY_CLIENTID, sp->enforce.u.longs)) { dfprintk(MOUNT, "sp4_mach_cred:\n"); dfprintk(MOUNT, " minimal mode enabled\n"); - set_bit(NFS_SP4_MACH_CRED_MINIMAL, &clp->cl_sp4_flags); + __set_bit(NFS_SP4_MACH_CRED_MINIMAL, &flags); } else { dfprintk(MOUNT, "sp4_mach_cred: disabled\n"); - return -EINVAL; + ret = -EINVAL; + goto out; } if (test_bit(OP_CLOSE, sp->allow.u.longs) && @@ -7365,110 +8933,46 @@ static int nfs4_sp4_select_mode(struct nfs_client *clp, test_bit(OP_DELEGRETURN, sp->allow.u.longs) && test_bit(OP_LOCKU, sp->allow.u.longs)) { dfprintk(MOUNT, " cleanup mode enabled\n"); - set_bit(NFS_SP4_MACH_CRED_CLEANUP, &clp->cl_sp4_flags); + __set_bit(NFS_SP4_MACH_CRED_CLEANUP, &flags); } if (test_bit(OP_LAYOUTRETURN, sp->allow.u.longs)) { dfprintk(MOUNT, " pnfs cleanup mode enabled\n"); - set_bit(NFS_SP4_MACH_CRED_PNFS_CLEANUP, - &clp->cl_sp4_flags); + __set_bit(NFS_SP4_MACH_CRED_PNFS_CLEANUP, &flags); } if (test_bit(OP_SECINFO, sp->allow.u.longs) && test_bit(OP_SECINFO_NO_NAME, sp->allow.u.longs)) { dfprintk(MOUNT, " secinfo mode enabled\n"); - set_bit(NFS_SP4_MACH_CRED_SECINFO, &clp->cl_sp4_flags); + __set_bit(NFS_SP4_MACH_CRED_SECINFO, &flags); } if (test_bit(OP_TEST_STATEID, sp->allow.u.longs) && test_bit(OP_FREE_STATEID, sp->allow.u.longs)) { dfprintk(MOUNT, " stateid mode enabled\n"); - set_bit(NFS_SP4_MACH_CRED_STATEID, &clp->cl_sp4_flags); + __set_bit(NFS_SP4_MACH_CRED_STATEID, &flags); } if (test_bit(OP_WRITE, sp->allow.u.longs)) { dfprintk(MOUNT, " write mode enabled\n"); - set_bit(NFS_SP4_MACH_CRED_WRITE, &clp->cl_sp4_flags); + __set_bit(NFS_SP4_MACH_CRED_WRITE, &flags); } if (test_bit(OP_COMMIT, sp->allow.u.longs)) { dfprintk(MOUNT, " commit mode enabled\n"); - set_bit(NFS_SP4_MACH_CRED_COMMIT, &clp->cl_sp4_flags); + __set_bit(NFS_SP4_MACH_CRED_COMMIT, &flags); } } - - return 0; +out: + clp->cl_sp4_flags = flags; + return ret; } struct nfs41_exchange_id_data { struct nfs41_exchange_id_res res; struct nfs41_exchange_id_args args; - struct rpc_xprt *xprt; - int rpc_status; }; -static void nfs4_exchange_id_done(struct rpc_task *task, void *data) -{ - struct nfs41_exchange_id_data *cdata = - (struct nfs41_exchange_id_data *)data; - struct nfs_client *clp = cdata->args.client; - int status = task->tk_status; - - trace_nfs4_exchange_id(clp, status); - - if (status == 0) - status = nfs4_check_cl_exchange_flags(cdata->res.flags); - - if (cdata->xprt && status == 0) { - status = nfs4_detect_session_trunking(clp, &cdata->res, - cdata->xprt); - goto out; - } - - if (status == 0) - status = nfs4_sp4_select_mode(clp, &cdata->res.state_protect); - - if (status == 0) { - clp->cl_clientid = cdata->res.clientid; - clp->cl_exchange_flags = cdata->res.flags; - clp->cl_seqid = cdata->res.seqid; - /* Client ID is not confirmed */ - if (!(cdata->res.flags & EXCHGID4_FLAG_CONFIRMED_R)) - clear_bit(NFS4_SESSION_ESTABLISHED, - &clp->cl_session->session_state); - - kfree(clp->cl_serverowner); - clp->cl_serverowner = cdata->res.server_owner; - cdata->res.server_owner = NULL; - - /* use the most recent implementation id */ - kfree(clp->cl_implid); - clp->cl_implid = cdata->res.impl_id; - cdata->res.impl_id = NULL; - - if (clp->cl_serverscope != NULL && - !nfs41_same_server_scope(clp->cl_serverscope, - cdata->res.server_scope)) { - dprintk("%s: server_scope mismatch detected\n", - __func__); - set_bit(NFS4CLNT_SERVER_SCOPE_MISMATCH, &clp->cl_state); - kfree(clp->cl_serverscope); - clp->cl_serverscope = NULL; - } - - if (clp->cl_serverscope == NULL) { - clp->cl_serverscope = cdata->res.server_scope; - cdata->res.server_scope = NULL; - } - /* Save the EXCHANGE_ID verifier session trunk tests */ - memcpy(clp->cl_confirm.data, cdata->args.verifier.data, - sizeof(clp->cl_confirm.data)); - } -out: - cdata->rpc_status = status; - return; -} - static void nfs4_exchange_id_release(void *data) { struct nfs41_exchange_id_data *cdata = @@ -7482,7 +8986,6 @@ static void nfs4_exchange_id_release(void *data) } static const struct rpc_call_ops nfs4_exchange_id_call_ops = { - .rpc_call_done = nfs4_exchange_id_done, .rpc_release = nfs4_exchange_id_release, }; @@ -7491,7 +8994,8 @@ static const struct rpc_call_ops nfs4_exchange_id_call_ops = { * * Wrapper for EXCHANGE_ID operation. */ -static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred, +static struct rpc_task * +nfs4_run_exchange_id(struct nfs_client *clp, const struct cred *cred, u32 sp4_how, struct rpc_xprt *xprt) { struct rpc_message msg = { @@ -7502,20 +9006,18 @@ static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred, .rpc_client = clp->cl_rpcclient, .callback_ops = &nfs4_exchange_id_call_ops, .rpc_message = &msg, - .flags = RPC_TASK_TIMEOUT, + .flags = RPC_TASK_TIMEOUT | RPC_TASK_NO_ROUND_ROBIN, }; struct nfs41_exchange_id_data *calldata; - struct rpc_task *task; int status; - if (!atomic_inc_not_zero(&clp->cl_count)) - return -EIO; + if (!refcount_inc_not_zero(&clp->cl_count)) + return ERR_PTR(-EIO); + status = -ENOMEM; calldata = kzalloc(sizeof(*calldata), GFP_NOFS); - if (!calldata) { - nfs_put_client(clp); - return -ENOMEM; - } + if (!calldata) + goto out; nfs4_init_boot_verifier(clp, &calldata->args.verifier); @@ -7554,34 +9056,24 @@ static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred, goto out_impl_id; } if (xprt) { - calldata->xprt = xprt; task_setup_data.rpc_xprt = xprt; task_setup_data.flags |= RPC_TASK_SOFTCONN; memcpy(calldata->args.verifier.data, clp->cl_confirm.data, sizeof(calldata->args.verifier.data)); } calldata->args.client = clp; -#ifdef CONFIG_NFS_V4_1_MIGRATION - calldata->args.flags = EXCHGID4_FLAG_SUPP_MOVED_REFER | - EXCHGID4_FLAG_BIND_PRINC_STATEID | - EXCHGID4_FLAG_SUPP_MOVED_MIGR, -#else calldata->args.flags = EXCHGID4_FLAG_SUPP_MOVED_REFER | - EXCHGID4_FLAG_BIND_PRINC_STATEID, + EXCHGID4_FLAG_BIND_PRINC_STATEID; +#ifdef CONFIG_NFS_V4_1_MIGRATION + calldata->args.flags |= EXCHGID4_FLAG_SUPP_MOVED_MIGR; #endif + if (test_bit(NFS_CS_PNFS, &clp->cl_flags)) + calldata->args.flags |= EXCHGID4_FLAG_USE_PNFS_DS; msg.rpc_argp = &calldata->args; msg.rpc_resp = &calldata->res; task_setup_data.callback_data = calldata; - task = rpc_run_task(&task_setup_data); - if (IS_ERR(task)) - return PTR_ERR(task); - - status = calldata->rpc_status; - - rpc_put_task(task); -out: - return status; + return rpc_run_task(&task_setup_data); out_impl_id: kfree(calldata->res.impl_id); @@ -7591,8 +9083,73 @@ out_server_owner: kfree(calldata->res.server_owner); out_calldata: kfree(calldata); +out: nfs_put_client(clp); - goto out; + return ERR_PTR(status); +} + +/* + * _nfs4_proc_exchange_id() + * + * Wrapper for EXCHANGE_ID operation. + */ +static int _nfs4_proc_exchange_id(struct nfs_client *clp, const struct cred *cred, + u32 sp4_how) +{ + struct rpc_task *task; + struct nfs41_exchange_id_args *argp; + struct nfs41_exchange_id_res *resp; + unsigned long now = jiffies; + int status; + + task = nfs4_run_exchange_id(clp, cred, sp4_how, NULL); + if (IS_ERR(task)) + return PTR_ERR(task); + + argp = task->tk_msg.rpc_argp; + resp = task->tk_msg.rpc_resp; + status = task->tk_status; + if (status != 0) + goto out; + + status = nfs4_check_cl_exchange_flags(resp->flags, + clp->cl_mvops->minor_version); + if (status != 0) + goto out; + + status = nfs4_sp4_select_mode(clp, &resp->state_protect); + if (status != 0) + goto out; + + do_renew_lease(clp, now); + + clp->cl_clientid = resp->clientid; + clp->cl_exchange_flags = resp->flags; + clp->cl_seqid = resp->seqid; + /* Client ID is not confirmed */ + if (!(resp->flags & EXCHGID4_FLAG_CONFIRMED_R)) + clear_bit(NFS4_SESSION_ESTABLISHED, + &clp->cl_session->session_state); + + if (clp->cl_serverscope != NULL && + !nfs41_same_server_scope(clp->cl_serverscope, + resp->server_scope)) { + dprintk("%s: server_scope mismatch detected\n", + __func__); + set_bit(NFS4CLNT_SERVER_SCOPE_MISMATCH, &clp->cl_state); + } + + swap(clp->cl_serverowner, resp->server_owner); + swap(clp->cl_serverscope, resp->server_scope); + swap(clp->cl_implid, resp->impl_id); + + /* Save the EXCHANGE_ID verifier session trunk tests */ + memcpy(clp->cl_confirm.data, argp->verifier.data, + sizeof(clp->cl_confirm.data)); +out: + trace_nfs4_exchange_id(clp, status); + rpc_put_task(task); + return status; } /* @@ -7607,7 +9164,7 @@ out_calldata: * * Will attempt to negotiate SP4_MACH_CRED if krb5i / krb5p auth is used. */ -int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred) +int nfs4_proc_exchange_id(struct nfs_client *clp, const struct cred *cred) { rpc_authflavor_t authflavor = clp->cl_rpcclient->cl_auth->au_flavor; int status; @@ -7615,13 +9172,13 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred) /* try SP4_MACH_CRED if krb5i/p */ if (authflavor == RPC_AUTH_GSS_KRB5I || authflavor == RPC_AUTH_GSS_KRB5P) { - status = _nfs4_proc_exchange_id(clp, cred, SP4_MACH_CRED, NULL); + status = _nfs4_proc_exchange_id(clp, cred, SP4_MACH_CRED); if (!status) return 0; } /* try SP4_NONE */ - return _nfs4_proc_exchange_id(clp, cred, SP4_NONE, NULL); + return _nfs4_proc_exchange_id(clp, cred, SP4_NONE); } /** @@ -7639,10 +9196,13 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred) * @xprt: the rpc_xprt to test * @data: call data for _nfs4_proc_exchange_id. */ -int nfs4_test_session_trunk(struct rpc_clnt *clnt, struct rpc_xprt *xprt, +void nfs4_test_session_trunk(struct rpc_clnt *clnt, struct rpc_xprt *xprt, void *data) { - struct nfs4_add_xprt_data *adata = (struct nfs4_add_xprt_data *)data; + struct nfs4_add_xprt_data *adata = data; + struct rpc_task *task; + int status; + u32 sp4_how; dprintk("--> %s try %s\n", __func__, @@ -7650,13 +9210,35 @@ int nfs4_test_session_trunk(struct rpc_clnt *clnt, struct rpc_xprt *xprt, sp4_how = (adata->clp->cl_sp4_flags == 0 ? SP4_NONE : SP4_MACH_CRED); +try_again: /* Test connection for session trunking. Async exchange_id call */ - return _nfs4_proc_exchange_id(adata->clp, adata->cred, sp4_how, xprt); + task = nfs4_run_exchange_id(adata->clp, adata->cred, sp4_how, xprt); + if (IS_ERR(task)) + return; + + status = task->tk_status; + if (status == 0) { + status = nfs4_detect_session_trunking(adata->clp, + task->tk_msg.rpc_resp, xprt); + trace_nfs4_trunked_exchange_id(adata->clp, + xprt->address_strings[RPC_DISPLAY_ADDR], status); + } + if (status == 0) + rpc_clnt_xprt_switch_add_xprt(clnt, xprt); + else if (status != -NFS4ERR_DELAY && rpc_clnt_xprt_switch_has_addr(clnt, + (struct sockaddr *)&xprt->addr)) + rpc_clnt_xprt_switch_remove_xprt(clnt, xprt); + + rpc_put_task(task); + if (status == -NFS4ERR_DELAY) { + ssleep(1); + goto try_again; + } } EXPORT_SYMBOL_GPL(nfs4_test_session_trunk); static int _nfs4_proc_destroy_clientid(struct nfs_client *clp, - struct rpc_cred *cred) + const struct cred *cred) { struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_DESTROY_CLIENTID], @@ -7665,7 +9247,8 @@ static int _nfs4_proc_destroy_clientid(struct nfs_client *clp, }; int status; - status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT); + status = rpc_call_sync(clp->cl_rpcclient, &msg, + RPC_TASK_TIMEOUT | RPC_TASK_NO_ROUND_ROBIN); trace_nfs4_destroy_clientid(clp, status); if (status) dprintk("NFS: Got error %d from the server %s on " @@ -7674,7 +9257,7 @@ static int _nfs4_proc_destroy_clientid(struct nfs_client *clp, } static int nfs4_proc_destroy_clientid(struct nfs_client *clp, - struct rpc_cred *cred) + const struct cred *cred) { unsigned int loop; int ret; @@ -7695,7 +9278,7 @@ static int nfs4_proc_destroy_clientid(struct nfs_client *clp, int nfs4_destroy_clientid(struct nfs_client *clp) { - struct rpc_cred *cred; + const struct cred *cred; int ret = 0; if (clp->cl_mvops->minor_version < 1) @@ -7706,8 +9289,7 @@ int nfs4_destroy_clientid(struct nfs_client *clp) goto out; cred = nfs4_get_clid_cred(clp); ret = nfs4_proc_destroy_clientid(clp, cred); - if (cred) - put_rpccred(cred); + put_cred(cred); switch (ret) { case 0: case -NFS4ERR_STALE_CLIENTID: @@ -7717,6 +9299,8 @@ out: return ret; } +#endif /* CONFIG_NFS_V4_1 */ + struct nfs4_get_lease_time_data { struct nfs4_get_lease_time_args *args; struct nfs4_get_lease_time_res *res; @@ -7729,14 +9313,12 @@ static void nfs4_get_lease_time_prepare(struct rpc_task *task, struct nfs4_get_lease_time_data *data = (struct nfs4_get_lease_time_data *)calldata; - dprintk("--> %s\n", __func__); /* just setup sequence, do not trigger session recovery since we're invoked within one */ nfs4_setup_sequence(data->clp, &data->args->la_seq_args, &data->res->lr_seq_res, task); - dprintk("<-- %s\n", __func__); } /* @@ -7748,21 +9330,18 @@ static void nfs4_get_lease_time_done(struct rpc_task *task, void *calldata) struct nfs4_get_lease_time_data *data = (struct nfs4_get_lease_time_data *)calldata; - dprintk("--> %s\n", __func__); - if (!nfs41_sequence_done(task, &data->res->lr_seq_res)) + if (!nfs4_sequence_done(task, &data->res->lr_seq_res)) return; switch (task->tk_status) { case -NFS4ERR_DELAY: case -NFS4ERR_GRACE: - dprintk("%s Retry: tk_status %d\n", __func__, task->tk_status); rpc_delay(task, NFS4_POLL_RETRY_MIN); task->tk_status = 0; - /* fall through */ + fallthrough; case -NFS4ERR_RETRY_UNCACHED_REP: rpc_restart_call_prepare(task); return; } - dprintk("<-- %s\n", __func__); } static const struct rpc_call_ops nfs4_get_lease_time_ops = { @@ -7772,7 +9351,6 @@ static const struct rpc_call_ops nfs4_get_lease_time_ops = { int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo) { - struct rpc_task *task; struct nfs4_get_lease_time_args args; struct nfs4_get_lease_time_res res = { .lr_fsinfo = fsinfo, @@ -7794,20 +9372,13 @@ int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo) .callback_data = &data, .flags = RPC_TASK_TIMEOUT, }; - int status; - nfs4_init_sequence(&args.la_seq_args, &res.lr_seq_res, 0); - nfs4_set_sequence_privileged(&args.la_seq_args); - task = rpc_run_task(&task_setup); - - if (IS_ERR(task)) - return PTR_ERR(task); - - status = task->tk_status; - rpc_put_task(task); - return status; + nfs4_init_sequence(&args.la_seq_args, &res.lr_seq_res, 0, 1); + return nfs4_call_sync_custom(&task_setup); } +#ifdef CONFIG_NFS_V4_1 + /* * Initialize the values to be used by the client in CREATE_SESSION * If nfs4_init_session set the fore channel request and response sizes, @@ -7822,6 +9393,7 @@ static void nfs4_init_channel_attrs(struct nfs41_create_session_args *args, { unsigned int max_rqst_sz, max_resp_sz; unsigned int max_bc_payload = rpc_max_bc_payload(clnt); + unsigned int max_bc_slots = rpc_num_bc_slots(clnt); max_rqst_sz = NFS_MAX_FILE_IO_SIZE + nfs41_maxwrite_overhead; max_resp_sz = NFS_MAX_FILE_IO_SIZE + nfs41_maxread_overhead; @@ -7843,7 +9415,9 @@ static void nfs4_init_channel_attrs(struct nfs41_create_session_args *args, args->bc_attrs.max_resp_sz = max_bc_payload; args->bc_attrs.max_resp_sz_cached = 0; args->bc_attrs.max_ops = NFS4_MAX_BACK_CHANNEL_OPS; - args->bc_attrs.max_reqs = min_t(unsigned short, max_session_cb_slots, 1); + args->bc_attrs.max_reqs = max_t(unsigned short, max_session_cb_slots, 1); + if (args->bc_attrs.max_reqs > max_bc_slots) + args->bc_attrs.max_reqs = max_bc_slots; dprintk("%s: Back Channel : max_rqst_sz=%u max_resp_sz=%u " "max_resp_sz_cached=%u max_ops=%u max_reqs=%u\n", @@ -7886,7 +9460,7 @@ static int nfs4_verify_back_channel_attrs(struct nfs41_create_session_args *args goto out; if (rcvd->max_rqst_sz > sent->max_rqst_sz) return -EINVAL; - if (rcvd->max_resp_sz < sent->max_resp_sz) + if (rcvd->max_resp_sz > sent->max_resp_sz) return -EINVAL; if (rcvd->max_resp_sz_cached > sent->max_resp_sz_cached) return -EINVAL; @@ -7924,7 +9498,7 @@ static void nfs4_update_session(struct nfs4_session *session, } static int _nfs4_proc_create_session(struct nfs_client *clp, - struct rpc_cred *cred) + const struct cred *cred) { struct nfs4_session *session = clp->cl_session; struct nfs41_create_session_args args = { @@ -7946,7 +9520,8 @@ static int _nfs4_proc_create_session(struct nfs_client *clp, nfs4_init_channel_attrs(&args, clp->cl_rpcclient); args.flags = (SESSION4_PERSIST | SESSION4_BACK_CHAN); - status = rpc_call_sync(session->clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT); + status = rpc_call_sync(session->clp->cl_rpcclient, &msg, + RPC_TASK_TIMEOUT | RPC_TASK_NO_ROUND_ROBIN); trace_nfs4_create_session(clp, status); switch (status) { @@ -7956,7 +9531,7 @@ static int _nfs4_proc_create_session(struct nfs_client *clp, case -EACCES: case -EAGAIN: goto out; - }; + } clp->cl_seqid++; if (!status) { @@ -7976,11 +9551,18 @@ out: * It is the responsibility of the caller to verify the session is * expired before calling this routine. */ -int nfs4_proc_create_session(struct nfs_client *clp, struct rpc_cred *cred) +int nfs4_proc_create_session(struct nfs_client *clp, const struct cred *cred) { int status; unsigned *ptr; struct nfs4_session *session = clp->cl_session; + struct nfs4_add_xprt_data xprtdata = { + .clp = clp, + }; + struct rpc_add_xprt_test rpcdata = { + .add_xprt_test = clp->cl_mvops->session_trunk, + .data = &xprtdata, + }; dprintk("--> %s clp=%p session=%p\n", __func__, clp, session); @@ -7997,8 +9579,8 @@ int nfs4_proc_create_session(struct nfs_client *clp, struct rpc_cred *cred) ptr = (unsigned *)&session->sess_id.data[0]; dprintk("%s client>seqid %d sessionid %u:%u:%u:%u\n", __func__, clp->cl_seqid, ptr[0], ptr[1], ptr[2], ptr[3]); + rpc_clnt_probe_trunked_xprts(clp->cl_rpcclient, &rpcdata); out: - dprintk("<-- %s\n", __func__); return status; } @@ -8007,7 +9589,7 @@ out: * The caller must serialize access to this routine. */ int nfs4_proc_destroy_session(struct nfs4_session *session, - struct rpc_cred *cred) + const struct cred *cred) { struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_DESTROY_SESSION], @@ -8016,20 +9598,18 @@ int nfs4_proc_destroy_session(struct nfs4_session *session, }; int status = 0; - dprintk("--> nfs4_proc_destroy_session\n"); - /* session is still being setup */ if (!test_and_clear_bit(NFS4_SESSION_ESTABLISHED, &session->session_state)) return 0; - status = rpc_call_sync(session->clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT); + status = rpc_call_sync(session->clp->cl_rpcclient, &msg, + RPC_TASK_TIMEOUT | RPC_TASK_NO_ROUND_ROBIN); trace_nfs4_destroy_session(session->clp, status); if (status) dprintk("NFS: Got error %d from the server on DESTROY_SESSION. " "Session has been destroyed regardless...\n", status); - - dprintk("<-- nfs4_proc_destroy_session\n"); + rpc_clnt_manage_trunked_xprts(session->clp->cl_rpcclient); return status; } @@ -8047,7 +9627,7 @@ static void nfs41_sequence_release(void *data) struct nfs4_sequence_data *calldata = data; struct nfs_client *clp = calldata->clp; - if (atomic_read(&clp->cl_count) > 1) + if (refcount_read(&clp->cl_count) > 1) nfs4_schedule_state_renewal(clp); nfs_put_client(clp); kfree(calldata); @@ -8074,10 +9654,10 @@ static void nfs41_sequence_call_done(struct rpc_task *task, void *data) return; trace_nfs4_sequence(clp, task->tk_status); - if (task->tk_status < 0) { + if (task->tk_status < 0 && clp->cl_cons_state >= 0) { dprintk("%s ERROR %d\n", __func__, task->tk_status); - if (atomic_read(&clp->cl_count) == 1) - goto out; + if (refcount_read(&clp->cl_count) == 1) + return; if (nfs41_sequence_handle_errors(task, clp) == -EAGAIN) { rpc_restart_call_prepare(task); @@ -8085,8 +9665,6 @@ static void nfs41_sequence_call_done(struct rpc_task *task, void *data) } } dprintk("%s rpc_cred %p\n", __func__, task->tk_msg.rpc_cred); -out: - dprintk("<-- %s\n", __func__); } static void nfs41_sequence_prepare(struct rpc_task *task, void *data) @@ -8109,7 +9687,8 @@ static const struct rpc_call_ops nfs41_sequence_ops = { }; static struct rpc_task *_nfs41_proc_sequence(struct nfs_client *clp, - struct rpc_cred *cred, + const struct cred *cred, + struct nfs4_slot *slot, bool is_privileged) { struct nfs4_sequence_data *calldata; @@ -8121,35 +9700,44 @@ static struct rpc_task *_nfs41_proc_sequence(struct nfs_client *clp, .rpc_client = clp->cl_rpcclient, .rpc_message = &msg, .callback_ops = &nfs41_sequence_ops, - .flags = RPC_TASK_ASYNC | RPC_TASK_TIMEOUT, + .flags = RPC_TASK_ASYNC | RPC_TASK_TIMEOUT | RPC_TASK_MOVEABLE, }; + struct rpc_task *ret; - if (!atomic_inc_not_zero(&clp->cl_count)) - return ERR_PTR(-EIO); - calldata = kzalloc(sizeof(*calldata), GFP_NOFS); - if (calldata == NULL) { - nfs_put_client(clp); - return ERR_PTR(-ENOMEM); - } - nfs4_init_sequence(&calldata->args, &calldata->res, 0); - if (is_privileged) - nfs4_set_sequence_privileged(&calldata->args); + ret = ERR_PTR(-EIO); + if (!refcount_inc_not_zero(&clp->cl_count)) + goto out_err; + + ret = ERR_PTR(-ENOMEM); + calldata = kzalloc(sizeof(*calldata), GFP_KERNEL); + if (calldata == NULL) + goto out_put_clp; + nfs4_init_sequence(&calldata->args, &calldata->res, 0, is_privileged); + nfs4_sequence_attach_slot(&calldata->args, &calldata->res, slot); msg.rpc_argp = &calldata->args; msg.rpc_resp = &calldata->res; calldata->clp = clp; task_setup_data.callback_data = calldata; - return rpc_run_task(&task_setup_data); + ret = rpc_run_task(&task_setup_data); + if (IS_ERR(ret)) + goto out_err; + return ret; +out_put_clp: + nfs_put_client(clp); +out_err: + nfs41_release_slot(slot); + return ret; } -static int nfs41_proc_async_sequence(struct nfs_client *clp, struct rpc_cred *cred, unsigned renew_flags) +static int nfs41_proc_async_sequence(struct nfs_client *clp, const struct cred *cred, unsigned renew_flags) { struct rpc_task *task; int ret = 0; if ((renew_flags & NFS4_RENEW_TIMEOUT) == 0) return -EAGAIN; - task = _nfs41_proc_sequence(clp, cred, false); + task = _nfs41_proc_sequence(clp, cred, NULL, false); if (IS_ERR(task)) ret = PTR_ERR(task); else @@ -8158,12 +9746,12 @@ static int nfs41_proc_async_sequence(struct nfs_client *clp, struct rpc_cred *cr return ret; } -static int nfs4_proc_sequence(struct nfs_client *clp, struct rpc_cred *cred) +static int nfs4_proc_sequence(struct nfs_client *clp, const struct cred *cred) { struct rpc_task *task; int ret; - task = _nfs41_proc_sequence(clp, cred, true); + task = _nfs41_proc_sequence(clp, cred, NULL, true); if (IS_ERR(task)) { ret = PTR_ERR(task); goto out; @@ -8197,19 +9785,22 @@ static int nfs41_reclaim_complete_handle_errors(struct rpc_task *task, struct nf { switch(task->tk_status) { case 0: + wake_up_all(&clp->cl_lock_waitq); + fallthrough; case -NFS4ERR_COMPLETE_ALREADY: case -NFS4ERR_WRONG_CRED: /* What to do here? */ break; case -NFS4ERR_DELAY: rpc_delay(task, NFS4_POLL_RETRY_MAX); - /* fall through */ + fallthrough; case -NFS4ERR_RETRY_UNCACHED_REP: + case -EACCES: + dprintk("%s: failed to reclaim complete error %d for server %s, retrying\n", + __func__, task->tk_status, clp->cl_hostname); return -EAGAIN; case -NFS4ERR_BADSESSION: case -NFS4ERR_DEADSESSION: case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: - nfs4_schedule_session_recovery(clp->cl_session, - task->tk_status); break; default: nfs4_schedule_lease_recovery(clp); @@ -8223,7 +9814,6 @@ static void nfs4_reclaim_complete_done(struct rpc_task *task, void *data) struct nfs_client *clp = calldata->clp; struct nfs4_sequence_res *res = &calldata->res.seq_res; - dprintk("--> %s\n", __func__); if (!nfs41_sequence_done(task, res)) return; @@ -8232,7 +9822,6 @@ static void nfs4_reclaim_complete_done(struct rpc_task *task, void *data) rpc_restart_call_prepare(task); return; } - dprintk("<-- %s\n", __func__); } static void nfs4_free_reclaim_complete_data(void *data) @@ -8252,10 +9841,9 @@ static const struct rpc_call_ops nfs4_reclaim_complete_call_ops = { * Issue a global reclaim complete. */ static int nfs41_proc_reclaim_complete(struct nfs_client *clp, - struct rpc_cred *cred) + const struct cred *cred) { struct nfs4_reclaim_complete_data *calldata; - struct rpc_task *task; struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RECLAIM_COMPLETE], .rpc_cred = cred, @@ -8264,31 +9852,21 @@ static int nfs41_proc_reclaim_complete(struct nfs_client *clp, .rpc_client = clp->cl_rpcclient, .rpc_message = &msg, .callback_ops = &nfs4_reclaim_complete_call_ops, - .flags = RPC_TASK_ASYNC, + .flags = RPC_TASK_NO_ROUND_ROBIN, }; int status = -ENOMEM; - dprintk("--> %s\n", __func__); calldata = kzalloc(sizeof(*calldata), GFP_NOFS); if (calldata == NULL) goto out; calldata->clp = clp; calldata->arg.one_fs = 0; - nfs4_init_sequence(&calldata->arg.seq_args, &calldata->res.seq_res, 0); - nfs4_set_sequence_privileged(&calldata->arg.seq_args); + nfs4_init_sequence(&calldata->arg.seq_args, &calldata->res.seq_res, 0, 1); msg.rpc_argp = &calldata->arg; msg.rpc_resp = &calldata->res; task_setup_data.callback_data = calldata; - task = rpc_run_task(&task_setup_data); - if (IS_ERR(task)) { - status = PTR_ERR(task); - goto out; - } - status = rpc_wait_for_completion_task(task); - if (status == 0) - status = task->tk_status; - rpc_put_task(task); + status = nfs4_call_sync_custom(&task_setup_data); out: dprintk("<-- %s status=%d\n", __func__, status); return status; @@ -8300,19 +9878,15 @@ nfs4_layoutget_prepare(struct rpc_task *task, void *calldata) struct nfs4_layoutget *lgp = calldata; struct nfs_server *server = NFS_SERVER(lgp->args.inode); - dprintk("--> %s\n", __func__); nfs4_setup_sequence(server->nfs_client, &lgp->args.seq_args, &lgp->res.seq_res, task); - dprintk("<-- %s\n", __func__); } static void nfs4_layoutget_done(struct rpc_task *task, void *calldata) { struct nfs4_layoutget *lgp = calldata; - dprintk("--> %s\n", __func__); nfs41_sequence_process(task, &lgp->res.seq_res); - dprintk("<-- %s\n", __func__); } static int @@ -8321,13 +9895,18 @@ nfs4_layoutget_handle_exception(struct rpc_task *task, { struct inode *inode = lgp->args.inode; struct nfs_server *server = NFS_SERVER(inode); - struct pnfs_layout_hdr *lo; + struct pnfs_layout_hdr *lo = lgp->lo; int nfs4err = task->tk_status; int err, status = 0; LIST_HEAD(head); dprintk("--> %s tk_status => %d\n", __func__, -task->tk_status); + nfs4_sequence_free_slot(&lgp->res.seq_res); + + exception->state = NULL; + exception->stateid = NULL; + switch (nfs4err) { case 0: goto out; @@ -8363,6 +9942,7 @@ nfs4_layoutget_handle_exception(struct rpc_task *task, status = -EBUSY; break; case -NFS4ERR_RECALLCONFLICT: + case -NFS4ERR_RETURNCONFLICT: status = -ERECALLCONFLICT; break; case -NFS4ERR_DELEG_REVOKED: @@ -8371,11 +9951,9 @@ nfs4_layoutget_handle_exception(struct rpc_task *task, case -NFS4ERR_BAD_STATEID: exception->timeout = 0; spin_lock(&inode->i_lock); - lo = NFS_I(inode)->layout; /* If the open stateid was bad, then recover it. */ if (!lo || test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags) || - nfs4_stateid_match_other(&lgp->args.stateid, - &lgp->args.ctx->state->stateid)) { + !nfs4_stateid_match_other(&lgp->args.stateid, &lo->plh_stateid)) { spin_unlock(&inode->i_lock); exception->state = lgp->args.ctx->state; exception->stateid = &lgp->args.stateid; @@ -8393,7 +9971,6 @@ nfs4_layoutget_handle_exception(struct rpc_task *task, goto out; } - nfs4_sequence_free_slot(&lgp->res.seq_res); err = nfs4_handle_exception(server, nfs4err, exception); if (!status) { if (exception->retry) @@ -8402,68 +9979,21 @@ nfs4_layoutget_handle_exception(struct rpc_task *task, status = err; } out: - dprintk("<-- %s\n", __func__); return status; } -static size_t max_response_pages(struct nfs_server *server) +size_t max_response_pages(struct nfs_server *server) { u32 max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz; return nfs_page_array_len(0, max_resp_sz); } -static void nfs4_free_pages(struct page **pages, size_t size) -{ - int i; - - if (!pages) - return; - - for (i = 0; i < size; i++) { - if (!pages[i]) - break; - __free_page(pages[i]); - } - kfree(pages); -} - -static struct page **nfs4_alloc_pages(size_t size, gfp_t gfp_flags) -{ - struct page **pages; - int i; - - pages = kcalloc(size, sizeof(struct page *), gfp_flags); - if (!pages) { - dprintk("%s: can't alloc array of %zu pages\n", __func__, size); - return NULL; - } - - for (i = 0; i < size; i++) { - pages[i] = alloc_page(gfp_flags); - if (!pages[i]) { - dprintk("%s: failed to allocate page\n", __func__); - nfs4_free_pages(pages, size); - return NULL; - } - } - - return pages; -} - static void nfs4_layoutget_release(void *calldata) { struct nfs4_layoutget *lgp = calldata; - struct inode *inode = lgp->args.inode; - struct nfs_server *server = NFS_SERVER(inode); - size_t max_pages = max_response_pages(server); - dprintk("--> %s\n", __func__); nfs4_sequence_free_slot(&lgp->res.seq_res); - nfs4_free_pages(lgp->args.layout.pages, max_pages); - pnfs_put_layout_hdr(NFS_I(inode)->layout); - put_nfs_open_context(lgp->args.ctx); - kfree(calldata); - dprintk("<-- %s\n", __func__); + pnfs_layoutget_free(lgp); } static const struct rpc_call_ops nfs4_layoutget_call_ops = { @@ -8473,11 +10003,11 @@ static const struct rpc_call_ops nfs4_layoutget_call_ops = { }; struct pnfs_layout_segment * -nfs4_proc_layoutget(struct nfs4_layoutget *lgp, long *timeout, gfp_t gfp_flags) +nfs4_proc_layoutget(struct nfs4_layoutget *lgp, + struct nfs4_exception *exception) { struct inode *inode = lgp->args.inode; struct nfs_server *server = NFS_SERVER(inode); - size_t max_pages = max_response_pages(server); struct rpc_task *task; struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTGET], @@ -8490,49 +10020,39 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, long *timeout, gfp_t gfp_flags) .rpc_message = &msg, .callback_ops = &nfs4_layoutget_call_ops, .callback_data = lgp, - .flags = RPC_TASK_ASYNC, + .flags = RPC_TASK_ASYNC | RPC_TASK_CRED_NOREF | + RPC_TASK_MOVEABLE, }; struct pnfs_layout_segment *lseg = NULL; - struct nfs4_exception exception = { - .inode = inode, - .timeout = *timeout, - }; int status = 0; - dprintk("--> %s\n", __func__); - - /* nfs4_layoutget_release calls pnfs_put_layout_hdr */ - pnfs_get_layout_hdr(NFS_I(inode)->layout); - - lgp->args.layout.pages = nfs4_alloc_pages(max_pages, gfp_flags); - if (!lgp->args.layout.pages) { - nfs4_layoutget_release(lgp); - return ERR_PTR(-ENOMEM); - } - lgp->args.layout.pglen = max_pages * PAGE_SIZE; - - lgp->res.layoutp = &lgp->args.layout; - lgp->res.seq_res.sr_slot = NULL; - nfs4_init_sequence(&lgp->args.seq_args, &lgp->res.seq_res, 0); + nfs4_init_sequence(&lgp->args.seq_args, &lgp->res.seq_res, 0, 0); + exception->retry = 0; task = rpc_run_task(&task_setup_data); if (IS_ERR(task)) return ERR_CAST(task); + status = rpc_wait_for_completion_task(task); - if (status == 0) { - status = nfs4_layoutget_handle_exception(task, lgp, &exception); - *timeout = exception.timeout; - } + if (status != 0) + goto out; + if (task->tk_status < 0) { + exception->retry = 1; + status = nfs4_layoutget_handle_exception(task, lgp, exception); + } else if (lgp->res.layoutp->len == 0) { + exception->retry = 1; + status = -EAGAIN; + nfs4_update_delay(&exception->timeout); + } else + lseg = pnfs_layout_process(lgp); +out: trace_nfs4_layoutget(lgp->args.ctx, &lgp->args.range, &lgp->res.range, &lgp->res.stateid, status); - /* if layoutp->len is 0, nfs4_layoutget_prepare called rpc_exit */ - if (status == 0 && lgp->res.layoutp->len) - lseg = pnfs_layout_process(lgp); rpc_put_task(task); dprintk("<-- %s status=%d\n", __func__, status); if (status) @@ -8545,11 +10065,12 @@ nfs4_layoutreturn_prepare(struct rpc_task *task, void *calldata) { struct nfs4_layoutreturn *lrp = calldata; - dprintk("--> %s\n", __func__); nfs4_setup_sequence(lrp->clp, &lrp->args.seq_args, &lrp->res.seq_res, task); + if (!pnfs_layout_is_valid(lrp->args.layout)) + rpc_exit(task, 0); } static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata) @@ -8557,25 +10078,58 @@ static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata) struct nfs4_layoutreturn *lrp = calldata; struct nfs_server *server; - dprintk("--> %s\n", __func__); - if (!nfs41_sequence_process(task, &lrp->res.seq_res)) return; + if (task->tk_rpc_status == -ETIMEDOUT) { + lrp->rpc_status = -EAGAIN; + lrp->res.lrs_present = 0; + return; + } + /* + * Was there an RPC level error? Assume the call succeeded, + * and that we need to release the layout + */ + if (task->tk_rpc_status != 0 && RPC_WAS_SENT(task)) { + lrp->res.lrs_present = 0; + return; + } + server = NFS_SERVER(lrp->args.inode); switch (task->tk_status) { + case -NFS4ERR_OLD_STATEID: + if (nfs4_layout_refresh_old_stateid(&lrp->args.stateid, + &lrp->args.range, + lrp->args.inode)) + goto out_restart; + fallthrough; default: task->tk_status = 0; + lrp->res.lrs_present = 0; + fallthrough; case 0: break; + case -NFS4ERR_BADSESSION: + case -NFS4ERR_DEADSESSION: + case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: + nfs4_schedule_session_recovery(server->nfs_client->cl_session, + task->tk_status); + lrp->res.lrs_present = 0; + lrp->rpc_status = -EAGAIN; + task->tk_status = 0; + break; case -NFS4ERR_DELAY: - if (nfs4_async_handle_error(task, server, NULL, NULL) != -EAGAIN) - break; - nfs4_sequence_free_slot(&lrp->res.seq_res); - rpc_restart_call_prepare(task); - return; + if (nfs4_async_handle_error(task, server, NULL, NULL) == + -EAGAIN) + goto out_restart; + lrp->res.lrs_present = 0; + break; } - dprintk("<-- %s\n", __func__); + return; +out_restart: + task->tk_status = 0; + nfs4_sequence_free_slot(&lrp->res.seq_res); + rpc_restart_call_prepare(task); } static void nfs4_layoutreturn_release(void *calldata) @@ -8583,16 +10137,20 @@ static void nfs4_layoutreturn_release(void *calldata) struct nfs4_layoutreturn *lrp = calldata; struct pnfs_layout_hdr *lo = lrp->args.layout; - dprintk("--> %s\n", __func__); - pnfs_layoutreturn_free_lsegs(lo, &lrp->args.stateid, &lrp->args.range, + if (lrp->rpc_status == 0 || !lrp->inode) + pnfs_layoutreturn_free_lsegs( + lo, &lrp->args.stateid, &lrp->args.range, lrp->res.lrs_present ? &lrp->res.stateid : NULL); + else + pnfs_layoutreturn_retry_later(lo, &lrp->args.stateid, + &lrp->args.range); nfs4_sequence_free_slot(&lrp->res.seq_res); if (lrp->ld_private.ops && lrp->ld_private.ops->free) lrp->ld_private.ops->free(&lrp->ld_private); pnfs_put_layout_hdr(lrp->args.layout); nfs_iput_and_deactive(lrp->inode); + put_cred(lrp->cred); kfree(calldata); - dprintk("<-- %s\n", __func__); } static const struct rpc_call_ops nfs4_layoutreturn_call_ops = { @@ -8601,7 +10159,7 @@ static const struct rpc_call_ops nfs4_layoutreturn_call_ops = { .rpc_release = nfs4_layoutreturn_release, }; -int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool sync) +int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, unsigned int flags) { struct rpc_task *task; struct rpc_message msg = { @@ -8615,6 +10173,7 @@ int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool sync) .rpc_message = &msg, .callback_ops = &nfs4_layoutreturn_call_ops, .callback_data = lrp, + .flags = RPC_TASK_MOVEABLE, }; int status = 0; @@ -8622,20 +10181,26 @@ int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool sync) NFS_SP4_MACH_CRED_PNFS_CLEANUP, &task_setup_data.rpc_client, &msg); - dprintk("--> %s\n", __func__); - if (!sync) { - lrp->inode = nfs_igrab_and_active(lrp->args.inode); + lrp->inode = nfs_igrab_and_active(lrp->args.inode); + if (flags & PNFS_FL_LAYOUTRETURN_ASYNC) { if (!lrp->inode) { nfs4_layoutreturn_release(lrp); return -EAGAIN; } task_setup_data.flags |= RPC_TASK_ASYNC; } - nfs4_init_sequence(&lrp->args.seq_args, &lrp->res.seq_res, 1); + if (!lrp->inode) + flags |= PNFS_FL_LAYOUTRETURN_PRIVILEGED; + if (flags & PNFS_FL_LAYOUTRETURN_PRIVILEGED) + nfs4_init_sequence(&lrp->args.seq_args, &lrp->res.seq_res, 1, + 1); + else + nfs4_init_sequence(&lrp->args.seq_args, &lrp->res.seq_res, 1, + 0); task = rpc_run_task(&task_setup_data); if (IS_ERR(task)) return PTR_ERR(task); - if (sync) + if (!(flags & PNFS_FL_LAYOUTRETURN_ASYNC)) status = task->tk_status; trace_nfs4_layoutreturn(lrp->args.inode, &lrp->args.stateid, status); dprintk("<-- %s status=%d\n", __func__, status); @@ -8646,7 +10211,7 @@ int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool sync) static int _nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev, - struct rpc_cred *cred) + const struct cred *cred) { struct nfs4_getdeviceinfo_args args = { .pdev = pdev, @@ -8664,13 +10229,14 @@ _nfs4_proc_getdeviceinfo(struct nfs_server *server, }; int status; - dprintk("--> %s\n", __func__); status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0); if (res.notification & ~args.notify_types) dprintk("%s: unsupported notification\n", __func__); if (res.notification != args.notify_types) pdev->nocache = 1; + trace_nfs4_getdeviceinfo(server, &pdev->dev_id, status); + dprintk("<-- %s status=%d\n", __func__, status); return status; @@ -8678,7 +10244,7 @@ _nfs4_proc_getdeviceinfo(struct nfs_server *server, int nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev, - struct rpc_cred *cred) + const struct cred *cred) { struct nfs4_exception exception = { }; int err; @@ -8718,6 +10284,7 @@ nfs4_layoutcommit_done(struct rpc_task *task, void *calldata) case -NFS4ERR_BADLAYOUT: /* no layout */ case -NFS4ERR_GRACE: /* loca_recalim always false */ task->tk_status = 0; + break; case 0: break; default: @@ -8735,7 +10302,7 @@ static void nfs4_layoutcommit_release(void *calldata) pnfs_cleanup_layoutcommit(data); nfs_post_op_update_inode_force_wcc(data->args.inode, data->res.fattr); - put_rpccred(data->cred); + put_cred(data->cred); nfs_iput_and_deactive(data->inode); kfree(data); } @@ -8761,6 +10328,7 @@ nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data, bool sync) .rpc_message = &msg, .callback_ops = &nfs4_layoutcommit_ops, .callback_data = data, + .flags = RPC_TASK_MOVEABLE, }; struct rpc_task *task; int status = 0; @@ -8778,7 +10346,7 @@ nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data, bool sync) } task_setup_data.flags = RPC_TASK_ASYNC; } - nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 1); + nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 1, 0); task = rpc_run_task(&task_setup_data); if (IS_ERR(task)) return PTR_ERR(task); @@ -8790,14 +10358,14 @@ nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data, bool sync) return status; } -/** +/* * Use the state managment nfs_client cl_rpcclient, which uses krb5i (if * possible) as per RFC3530bis and RFC5661 Security Considerations sections */ -static int -_nfs41_proc_secinfo_no_name(struct nfs_server *server, struct nfs_fh *fhandle, - struct nfs_fsinfo *info, - struct nfs4_secinfo_flavors *flavors, bool use_integrity) +static int _nfs41_proc_secinfo_no_name(struct nfs_server *server, + struct nfs_fh *fhandle, + struct nfs4_secinfo_flavors *flavors, + bool use_integrity) { struct nfs41_secinfo_no_name_args args = { .style = SECINFO_STYLE_CURRENT_FH, @@ -8810,32 +10378,44 @@ _nfs41_proc_secinfo_no_name(struct nfs_server *server, struct nfs_fh *fhandle, .rpc_argp = &args, .rpc_resp = &res, }; - struct rpc_clnt *clnt = server->client; - struct rpc_cred *cred = NULL; + struct nfs4_call_sync_data data = { + .seq_server = server, + .seq_args = &args.seq_args, + .seq_res = &res.seq_res, + }; + struct rpc_task_setup task_setup = { + .rpc_client = server->client, + .rpc_message = &msg, + .callback_ops = server->nfs_client->cl_mvops->call_sync_ops, + .callback_data = &data, + .flags = RPC_TASK_NO_ROUND_ROBIN, + }; + const struct cred *cred = NULL; int status; if (use_integrity) { - clnt = server->nfs_client->cl_rpcclient; + task_setup.rpc_client = server->nfs_client->cl_rpcclient; + cred = nfs4_get_clid_cred(server->nfs_client); msg.rpc_cred = cred; } - dprintk("--> %s\n", __func__); - status = nfs4_call_sync(clnt, server, &msg, &args.seq_args, - &res.seq_res, 0); + nfs4_init_sequence(&args.seq_args, &res.seq_res, 0, 0); + status = nfs4_call_sync_custom(&task_setup); dprintk("<-- %s status=%d\n", __func__, status); - if (cred) - put_rpccred(cred); + put_cred(cred); return status; } -static int -nfs41_proc_secinfo_no_name(struct nfs_server *server, struct nfs_fh *fhandle, - struct nfs_fsinfo *info, struct nfs4_secinfo_flavors *flavors) +static int nfs41_proc_secinfo_no_name(struct nfs_server *server, + struct nfs_fh *fhandle, + struct nfs4_secinfo_flavors *flavors) { - struct nfs4_exception exception = { }; + struct nfs4_exception exception = { + .interruptible = true, + }; int err; do { /* first try using integrity protection */ @@ -8843,7 +10423,7 @@ nfs41_proc_secinfo_no_name(struct nfs_server *server, struct nfs_fh *fhandle, /* try to use integrity protection with machine cred */ if (_nfs4_is_integrity_protected(server->nfs_client)) - err = _nfs41_proc_secinfo_no_name(server, fhandle, info, + err = _nfs41_proc_secinfo_no_name(server, fhandle, flavors, true); /* @@ -8853,7 +10433,7 @@ nfs41_proc_secinfo_no_name(struct nfs_server *server, struct nfs_fh *fhandle, * the current filesystem's rpc_client and the user cred. */ if (err == -NFS4ERR_WRONGSEC) - err = _nfs41_proc_secinfo_no_name(server, fhandle, info, + err = _nfs41_proc_secinfo_no_name(server, fhandle, flavors, false); switch (err) { @@ -8869,9 +10449,8 @@ out: return err; } -static int -nfs41_find_root_sec(struct nfs_server *server, struct nfs_fh *fhandle, - struct nfs_fsinfo *info) +static int nfs41_find_root_sec(struct nfs_server *server, + struct nfs_fh *fhandle, struct nfs_fattr *fattr) { int err; struct page *page; @@ -8887,14 +10466,14 @@ nfs41_find_root_sec(struct nfs_server *server, struct nfs_fh *fhandle, } flavors = page_address(page); - err = nfs41_proc_secinfo_no_name(server, fhandle, info, flavors); + err = nfs41_proc_secinfo_no_name(server, fhandle, flavors); /* * Fall back on "guess and check" method if * the server doesn't support SECINFO_NO_NAME */ if (err == -NFS4ERR_WRONGSEC || err == -ENOTSUPP) { - err = nfs4_find_root_sec(server, fhandle, info); + err = nfs4_find_root_sec(server, fhandle, fattr); goto out_freepage; } if (err) @@ -8919,8 +10498,8 @@ nfs41_find_root_sec(struct nfs_server *server, struct nfs_fh *fhandle, flavor = RPC_AUTH_MAXFLAVOR; if (flavor != RPC_AUTH_MAXFLAVOR) { - err = nfs4_lookup_root_sec(server, fhandle, - info, flavor); + err = nfs4_lookup_root_sec(server, fhandle, fattr, + flavor); if (!err) break; } @@ -8938,12 +10517,12 @@ out: } static int _nfs41_test_stateid(struct nfs_server *server, - nfs4_stateid *stateid, - struct rpc_cred *cred) + const nfs4_stateid *stateid, + const struct cred *cred) { int status; struct nfs41_test_stateid_args args = { - .stateid = stateid, + .stateid = *stateid, }; struct nfs41_test_stateid_res res; struct rpc_message msg = { @@ -8958,8 +10537,7 @@ static int _nfs41_test_stateid(struct nfs_server *server, &rpc_client, &msg); dprintk("NFS call test_stateid %p\n", stateid); - nfs4_init_sequence(&args.seq_args, &res.seq_res, 0); - nfs4_set_sequence_privileged(&args.seq_args); + nfs4_init_sequence(&args.seq_args, &res.seq_res, 0, 1); status = nfs4_call_sync_sequence(rpc_client, server, &msg, &args.seq_args, &res.seq_res); if (status != NFS_OK) { @@ -9000,10 +10578,12 @@ static void nfs4_handle_delay_or_session_error(struct nfs_server *server, * failed or the state ID is not currently valid. */ static int nfs41_test_stateid(struct nfs_server *server, - nfs4_stateid *stateid, - struct rpc_cred *cred) + const nfs4_stateid *stateid, + const struct cred *cred) { - struct nfs4_exception exception = { }; + struct nfs4_exception exception = { + .interruptible = true, + }; int err; do { err = _nfs41_test_stateid(server, stateid, cred); @@ -9042,6 +10622,10 @@ static void nfs41_free_stateid_done(struct rpc_task *task, void *calldata) static void nfs41_free_stateid_release(void *calldata) { + struct nfs_free_stateid_data *data = calldata; + struct nfs_client *clp = data->server->nfs_client; + + nfs_put_client(clp); kfree(calldata); } @@ -9051,9 +10635,19 @@ static const struct rpc_call_ops nfs41_free_stateid_ops = { .rpc_release = nfs41_free_stateid_release, }; -static struct rpc_task *_nfs41_free_stateid(struct nfs_server *server, - const nfs4_stateid *stateid, - struct rpc_cred *cred, +/** + * nfs41_free_stateid - perform a FREE_STATEID operation + * + * @server: server / transport on which to perform the operation + * @stateid: state ID to release + * @cred: credential + * @privileged: set to true if this call needs to be privileged + * + * Note: this function is always asynchronous. + */ +static int nfs41_free_stateid(struct nfs_server *server, + nfs4_stateid *stateid, + const struct cred *cred, bool privileged) { struct rpc_message msg = { @@ -9064,17 +10658,22 @@ static struct rpc_task *_nfs41_free_stateid(struct nfs_server *server, .rpc_client = server->client, .rpc_message = &msg, .callback_ops = &nfs41_free_stateid_ops, - .flags = RPC_TASK_ASYNC, + .flags = RPC_TASK_ASYNC | RPC_TASK_MOVEABLE, }; struct nfs_free_stateid_data *data; + struct rpc_task *task; + struct nfs_client *clp = server->nfs_client; + + if (!refcount_inc_not_zero(&clp->cl_count)) + return -EIO; nfs4_state_protect(server->nfs_client, NFS_SP4_MACH_CRED_STATEID, &task_setup.rpc_client, &msg); dprintk("NFS call free_stateid %p\n", stateid); - data = kmalloc(sizeof(*data), GFP_NOFS); + data = kmalloc(sizeof(*data), GFP_KERNEL); if (!data) - return ERR_PTR(-ENOMEM); + return -ENOMEM; data->server = server; nfs4_stateid_copy(&data->args.stateid, stateid); @@ -9082,41 +10681,19 @@ static struct rpc_task *_nfs41_free_stateid(struct nfs_server *server, msg.rpc_argp = &data->args; msg.rpc_resp = &data->res; - nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 1); - if (privileged) - nfs4_set_sequence_privileged(&data->args.seq_args); - - return rpc_run_task(&task_setup); -} - -/** - * nfs41_free_stateid - perform a FREE_STATEID operation - * - * @server: server / transport on which to perform the operation - * @stateid: state ID to release - * @cred: credential - * @is_recovery: set to true if this call needs to be privileged - * - * Note: this function is always asynchronous. - */ -static int nfs41_free_stateid(struct nfs_server *server, - const nfs4_stateid *stateid, - struct rpc_cred *cred, - bool is_recovery) -{ - struct rpc_task *task; - - task = _nfs41_free_stateid(server, stateid, cred, is_recovery); + nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 1, privileged); + task = rpc_run_task(&task_setup); if (IS_ERR(task)) return PTR_ERR(task); rpc_put_task(task); + stateid->type = NFS4_FREED_STATEID_TYPE; return 0; } static void nfs41_free_lock_state(struct nfs_server *server, struct nfs4_lock_state *lsp) { - struct rpc_cred *cred = lsp->ls_state->owner->so_cred; + const struct cred *cred = lsp->ls_state->owner->so_cred; nfs41_free_stateid(server, &lsp->ls_stateid, cred, false); nfs4_free_lock_state(server, lsp); @@ -9125,6 +10702,8 @@ nfs41_free_lock_state(struct nfs_server *server, struct nfs4_lock_state *lsp) static bool nfs41_match_stateid(const nfs4_stateid *s1, const nfs4_stateid *s2) { + trace_nfs41_match_stateid(s1, s2); + if (s1->type != s2->type) return false; @@ -9142,6 +10721,8 @@ static bool nfs41_match_stateid(const nfs4_stateid *s1, static bool nfs4_match_stateid(const nfs4_stateid *s1, const nfs4_stateid *s2) { + trace_nfs4_match_stateid(s1, s2); + return nfs4_stateid_match(s1, s2); } @@ -9187,14 +10768,14 @@ static const struct nfs4_state_recovery_ops nfs41_nograce_recovery_ops = { static const struct nfs4_state_maintenance_ops nfs40_state_renewal_ops = { .sched_state_renewal = nfs4_proc_async_renew, - .get_state_renewal_cred_locked = nfs4_get_renew_cred_locked, + .get_state_renewal_cred = nfs4_get_renew_cred, .renew_lease = nfs4_proc_renew, }; #if defined(CONFIG_NFS_V4_1) static const struct nfs4_state_maintenance_ops nfs41_state_renewal_ops = { .sched_state_renewal = nfs41_proc_async_sequence, - .get_state_renewal_cred_locked = nfs4_get_machine_cred_locked, + .get_state_renewal_cred = nfs4_get_machine_cred, .renew_lease = nfs4_proc_sequence, }; #endif @@ -9243,7 +10824,9 @@ static const struct nfs4_minor_version_ops nfs_v4_1_minor_ops = { | NFS_CAP_ATOMIC_OPEN | NFS_CAP_POSIX_LOCK | NFS_CAP_STATEID_NFSV41 - | NFS_CAP_ATOMIC_OPEN_V1, + | NFS_CAP_ATOMIC_OPEN_V1 + | NFS_CAP_LGOPEN + | NFS_CAP_MOVEABLE, .init_client = nfs41_init_client, .shutdown_client = nfs41_shutdown_client, .match_stateid = nfs41_match_stateid, @@ -9268,12 +10851,20 @@ static const struct nfs4_minor_version_ops nfs_v4_2_minor_ops = { | NFS_CAP_POSIX_LOCK | NFS_CAP_STATEID_NFSV41 | NFS_CAP_ATOMIC_OPEN_V1 + | NFS_CAP_LGOPEN | NFS_CAP_ALLOCATE | NFS_CAP_COPY + | NFS_CAP_OFFLOAD_CANCEL + | NFS_CAP_COPY_NOTIFY | NFS_CAP_DEALLOCATE + | NFS_CAP_ZERO_RANGE | NFS_CAP_SEEK | NFS_CAP_LAYOUTSTATS - | NFS_CAP_CLONE, + | NFS_CAP_CLONE + | NFS_CAP_LAYOUTERROR + | NFS_CAP_READ_PLUS + | NFS_CAP_MOVEABLE + | NFS_CAP_OFFLOAD_STATUS, .init_client = nfs41_init_client, .shutdown_client = nfs41_shutdown_client, .match_stateid = nfs41_match_stateid, @@ -9302,20 +10893,66 @@ const struct nfs4_minor_version_ops *nfs_v4_minor_ops[] = { static ssize_t nfs4_listxattr(struct dentry *dentry, char *list, size_t size) { - ssize_t error, error2; + ssize_t error, error2, error3, error4 = 0; + size_t left = size; - error = generic_listxattr(dentry, list, size); + error = generic_listxattr(dentry, list, left); if (error < 0) return error; if (list) { list += error; - size -= error; + left -= error; } - error2 = nfs4_listxattr_nfs4_label(d_inode(dentry), list, size); + error2 = nfs4_listxattr_nfs4_label(d_inode(dentry), list, left); if (error2 < 0) return error2; - return error + error2; + + if (list) { + list += error2; + left -= error2; + } + + error3 = nfs4_listxattr_nfs4_user(d_inode(dentry), list, left); + if (error3 < 0) + return error3; + if (list) { + list += error3; + left -= error3; + } + + if (!nfs_server_capable(d_inode(dentry), NFS_CAP_SECURITY_LABEL)) { + error4 = security_inode_listsecurity(d_inode(dentry), list, left); + if (error4 < 0) + return error4; + } + + error += error2 + error3 + error4; + if (size && error > size) + return -ERANGE; + return error; +} + +static void nfs4_enable_swap(struct inode *inode) +{ + /* The state manager thread must always be running. + * It will notice the client is a swapper, and stay put. + */ + struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; + + nfs4_schedule_state_manager(clp); +} + +static void nfs4_disable_swap(struct inode *inode) +{ + /* The state manager thread will now exit once it is + * woken. + */ + struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; + + set_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state); + clear_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state); + wake_up_var(&clp->cl_state); } static const struct inode_operations nfs4_dir_inode_operations = { @@ -9342,6 +10979,26 @@ static const struct inode_operations nfs4_file_inode_operations = { .listxattr = nfs4_listxattr, }; +static struct nfs_server *nfs4_clone_server(struct nfs_server *source, + struct nfs_fh *fh, struct nfs_fattr *fattr, + rpc_authflavor_t flavor) +{ + struct nfs_server *server; + int error; + + server = nfs_clone_server(source, fh, fattr, flavor); + if (IS_ERR(server)) + return server; + + error = nfs4_delegation_hash_alloc(server); + if (error) { + nfs_free_server(server); + return ERR_PTR(error); + } + + return server; +} + const struct nfs_rpc_ops nfs_v4_clientops = { .version = 4, /* protocol version */ .dentry_ops = &nfs4_dentry_operations, @@ -9350,7 +11007,7 @@ const struct nfs_rpc_ops nfs_v4_clientops = { .file_ops = &nfs4_file_operations, .getroot = nfs4_proc_get_root, .submount = nfs4_submount, - .try_mount = nfs4_try_mount, + .try_get_tree = nfs4_try_get_tree, .getattr = nfs4_proc_getattr, .setattr = nfs4_proc_setattr, .lookup = nfs4_proc_lookup, @@ -9368,7 +11025,7 @@ const struct nfs_rpc_ops nfs_v4_clientops = { .link = nfs4_proc_link, .symlink = nfs4_proc_symlink, .mkdir = nfs4_proc_mkdir, - .rmdir = nfs4_proc_remove, + .rmdir = nfs4_proc_rmdir, .readdir = nfs4_proc_readdir, .mknod = nfs4_proc_mknod, .statfs = nfs4_proc_statfs, @@ -9394,7 +11051,10 @@ const struct nfs_rpc_ops nfs_v4_clientops = { .init_client = nfs4_init_client, .free_client = nfs4_free_client, .create_server = nfs4_create_server, - .clone_server = nfs_clone_server, + .clone_server = nfs4_clone_server, + .discover_trunking = nfs4_discover_trunking, + .enable_swap = nfs4_enable_swap, + .disable_swap = nfs4_disable_swap, }; static const struct xattr_handler nfs4_xattr_nfs4_acl_handler = { @@ -9404,16 +11064,41 @@ static const struct xattr_handler nfs4_xattr_nfs4_acl_handler = { .set = nfs4_xattr_set_nfs4_acl, }; -const struct xattr_handler *nfs4_xattr_handlers[] = { +#if defined(CONFIG_NFS_V4_1) +static const struct xattr_handler nfs4_xattr_nfs4_dacl_handler = { + .name = XATTR_NAME_NFSV4_DACL, + .list = nfs4_xattr_list_nfs4_dacl, + .get = nfs4_xattr_get_nfs4_dacl, + .set = nfs4_xattr_set_nfs4_dacl, +}; + +static const struct xattr_handler nfs4_xattr_nfs4_sacl_handler = { + .name = XATTR_NAME_NFSV4_SACL, + .list = nfs4_xattr_list_nfs4_sacl, + .get = nfs4_xattr_get_nfs4_sacl, + .set = nfs4_xattr_set_nfs4_sacl, +}; +#endif + +#ifdef CONFIG_NFS_V4_2 +static const struct xattr_handler nfs4_xattr_nfs4_user_handler = { + .prefix = XATTR_USER_PREFIX, + .get = nfs4_xattr_get_nfs4_user, + .set = nfs4_xattr_set_nfs4_user, +}; +#endif + +const struct xattr_handler * const nfs4_xattr_handlers[] = { &nfs4_xattr_nfs4_acl_handler, +#if defined(CONFIG_NFS_V4_1) + &nfs4_xattr_nfs4_dacl_handler, + &nfs4_xattr_nfs4_sacl_handler, +#endif #ifdef CONFIG_NFS_V4_SECURITY_LABEL &nfs4_xattr_nfs4_label_handler, #endif +#ifdef CONFIG_NFS_V4_2 + &nfs4_xattr_nfs4_user_handler, +#endif NULL }; - -/* - * Local variables: - * c-basic-offset: 8 - * End: - */ diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c index 1f8c2ae43a8d..18ae614e5a6c 100644 --- a/fs/nfs/nfs4renewd.c +++ b/fs/nfs/nfs4renewd.c @@ -57,7 +57,7 @@ nfs4_renew_state(struct work_struct *work) const struct nfs4_state_maintenance_ops *ops; struct nfs_client *clp = container_of(work, struct nfs_client, cl_renewd.work); - struct rpc_cred *cred; + const struct cred *cred; long lease; unsigned long last, now; unsigned renew_flags = 0; @@ -68,7 +68,6 @@ nfs4_renew_state(struct work_struct *work) if (test_bit(NFS_CS_STOP_RENEW, &clp->cl_res_state)) goto out; - spin_lock(&clp->cl_lock); lease = clp->cl_lease_time; last = clp->cl_last_renewal; now = jiffies; @@ -79,8 +78,7 @@ nfs4_renew_state(struct work_struct *work) renew_flags |= NFS4_RENEW_DELEGATION_CB; if (renew_flags != 0) { - cred = ops->get_state_renewal_cred_locked(clp); - spin_unlock(&clp->cl_lock); + cred = ops->get_state_renewal_cred(clp); if (cred == NULL) { if (!(renew_flags & NFS4_RENEW_DELEGATION_CB)) { set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state); @@ -92,7 +90,7 @@ nfs4_renew_state(struct work_struct *work) /* Queue an asynchronous RENEW. */ ret = ops->sched_state_renewal(clp, cred, renew_flags); - put_rpccred(cred); + put_cred(cred); switch (ret) { default: goto out_exp; @@ -104,7 +102,6 @@ nfs4_renew_state(struct work_struct *work) } else { dprintk("%s: failed to call renewd. Reason: lease not expired \n", __func__); - spin_unlock(&clp->cl_lock); } nfs4_schedule_state_renewal(clp); out_exp: @@ -125,7 +122,7 @@ nfs4_schedule_state_renewal(struct nfs_client *clp) timeout = 5 * HZ; dprintk("%s: requeueing work. Lease period = %ld\n", __func__, (timeout + HZ - 1) / HZ); - mod_delayed_work(system_wq, &clp->cl_renewd, timeout); + mod_delayed_work(system_percpu_wq, &clp->cl_renewd, timeout); set_bit(NFS_CS_RENEWD, &clp->cl_res_state); spin_unlock(&clp->cl_lock); } @@ -141,23 +138,14 @@ nfs4_kill_renewd(struct nfs_client *clp) * * @clp: pointer to nfs_client * @lease: new value for lease period - * @lastrenewed: time at which lease was last renewed */ void nfs4_set_lease_period(struct nfs_client *clp, - unsigned long lease, - unsigned long lastrenewed) + unsigned long lease) { spin_lock(&clp->cl_lock); clp->cl_lease_time = lease; - clp->cl_last_renewal = lastrenewed; spin_unlock(&clp->cl_lock); /* Cap maximum reconnect timeout at 1/2 lease period */ rpc_set_connect_timeout(clp->cl_rpcclient, lease, lease >> 1); } - -/* - * Local variables: - * c-basic-offset: 8 - * End: - */ diff --git a/fs/nfs/nfs4session.c b/fs/nfs/nfs4session.c index 769b85655c4b..5db460476bf2 100644 --- a/fs/nfs/nfs4session.c +++ b/fs/nfs/nfs4session.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * fs/nfs/nfs4session.c * @@ -55,7 +56,7 @@ static void nfs4_shrink_slot_table(struct nfs4_slot_table *tbl, u32 newsize) /** * nfs4_slot_tbl_drain_complete - wake waiters when drain is complete - * @tbl - controlling slot table + * @tbl: controlling slot table * */ void nfs4_slot_tbl_drain_complete(struct nfs4_slot_table *tbl) @@ -110,6 +111,8 @@ static struct nfs4_slot *nfs4_new_slot(struct nfs4_slot_table *tbl, slot->table = tbl; slot->slot_nr = slotid; slot->seq_nr = seq_init; + slot->seq_nr_highest_sent = seq_init; + slot->seq_nr_last_acked = seq_init - 1; } return slot; } @@ -276,7 +279,8 @@ static void nfs4_reset_slot_table(struct nfs4_slot_table *tbl, p = &tbl->slots; while (*p) { (*p)->seq_nr = ivalue; - (*p)->interrupted = 0; + (*p)->seq_nr_highest_sent = ivalue; + (*p)->seq_nr_last_acked = ivalue - 1; p = &(*p)->next; } tbl->highest_used_slotid = NFS4_NO_SLOT; @@ -507,12 +511,16 @@ void nfs41_update_target_slotid(struct nfs4_slot_table *tbl, struct nfs4_slot *slot, struct nfs4_sequence_res *res) { + u32 target_highest_slotid = min(res->sr_target_highest_slotid, + NFS4_MAX_SLOTID); + u32 highest_slotid = min(res->sr_highest_slotid, NFS4_MAX_SLOTID); + spin_lock(&tbl->slot_tbl_lock); - if (!nfs41_is_outlier_target_slotid(tbl, res->sr_target_highest_slotid)) - nfs41_set_target_slotid_locked(tbl, res->sr_target_highest_slotid); + if (!nfs41_is_outlier_target_slotid(tbl, target_highest_slotid)) + nfs41_set_target_slotid_locked(tbl, target_highest_slotid); if (tbl->generation == slot->generation) - nfs41_set_server_slotid_locked(tbl, res->sr_highest_slotid); - nfs41_set_max_slotid_locked(tbl, res->sr_target_highest_slotid); + nfs41_set_server_slotid_locked(tbl, highest_slotid); + nfs41_set_max_slotid_locked(tbl, target_highest_slotid); spin_unlock(&tbl->slot_tbl_lock); } @@ -573,12 +581,11 @@ static void nfs4_destroy_session_slot_tables(struct nfs4_session *session) void nfs4_destroy_session(struct nfs4_session *session) { struct rpc_xprt *xprt; - struct rpc_cred *cred; + const struct cred *cred; cred = nfs4_get_clid_cred(session->clp); nfs4_proc_destroy_session(session, cred); - if (cred) - put_rpccred(cred); + put_cred(cred); rcu_read_lock(); xprt = rcu_dereference(session->clp->cl_rpcclient->cl_xprt); diff --git a/fs/nfs/nfs4session.h b/fs/nfs/nfs4session.h index dfae4880eacb..f9c291e2165c 100644 --- a/fs/nfs/nfs4session.h +++ b/fs/nfs/nfs4session.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * fs/nfs/nfs4session.h * @@ -9,8 +10,9 @@ /* maximum number of slots to use */ #define NFS4_DEF_SLOT_TABLE_SIZE (64U) -#define NFS4_DEF_CB_SLOT_TABLE_SIZE (1U) +#define NFS4_DEF_CB_SLOT_TABLE_SIZE (16U) #define NFS4_MAX_SLOT_TABLE (1024U) +#define NFS4_MAX_SLOTID (NFS4_MAX_SLOT_TABLE - 1U) #define NFS4_NO_SLOT ((u32)-1) #if IS_ENABLED(CONFIG_NFS_V4) @@ -22,8 +24,9 @@ struct nfs4_slot { unsigned long generation; u32 slot_nr; u32 seq_nr; - unsigned int interrupted : 1, - privileged : 1, + u32 seq_nr_last_acked; + u32 seq_nr_highest_sent; + unsigned int privileged : 1, seq_done : 1; }; @@ -32,7 +35,7 @@ enum nfs4_slot_tbl_state { NFS4_SLOT_TBL_DRAINING, }; -#define SLOT_TABLE_SZ DIV_ROUND_UP(NFS4_MAX_SLOT_TABLE, 8*sizeof(long)) +#define SLOT_TABLE_SZ DIV_ROUND_UP(NFS4_MAX_SLOT_TABLE, BITS_PER_LONG) struct nfs4_slot_table { struct nfs4_session *session; /* Parent session */ struct nfs4_slot *slots; /* seqid per slot */ @@ -145,16 +148,12 @@ static inline void nfs4_copy_sessionid(struct nfs4_sessionid *dst, memcpy(dst->data, src->data, NFS4_MAX_SESSIONID_LEN); } -#ifdef CONFIG_CRC32 /* * nfs_session_id_hash - calculate the crc32 hash for the session id * @session - pointer to session */ #define nfs_session_id_hash(sess_id) \ (~crc32_le(0xFFFFFFFF, &(sess_id)->data[0], sizeof((sess_id)->data))) -#else -#define nfs_session_id_hash(session) (0) -#endif #else /* defined(CONFIG_NFS_V4_1) */ static inline int nfs4_init_session(struct nfs_client *clp) diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 0378e2257ca7..01179f7de322 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -49,6 +49,7 @@ #include <linux/workqueue.h> #include <linux/bitops.h> #include <linux/jiffies.h> +#include <linux/sched/mm.h> #include <linux/sunrpc/clnt.h> @@ -60,18 +61,56 @@ #include "nfs4session.h" #include "pnfs.h" #include "netns.h" +#include "nfs4trace.h" #define NFSDBG_FACILITY NFSDBG_STATE #define OPENOWNER_POOL_SIZE 8 +static void nfs4_state_start_reclaim_reboot(struct nfs_client *clp); + const nfs4_stateid zero_stateid = { { .data = { 0 } }, .type = NFS4_SPECIAL_STATEID_TYPE, }; +const nfs4_stateid invalid_stateid = { + { + /* Funky initialiser keeps older gcc versions happy */ + .data = { 0xff, 0xff, 0xff, 0xff, 0 }, + }, + .type = NFS4_INVALID_STATEID_TYPE, +}; + +const nfs4_stateid current_stateid = { + { + /* Funky initialiser keeps older gcc versions happy */ + .data = { 0x0, 0x0, 0x0, 0x1, 0 }, + }, + .type = NFS4_SPECIAL_STATEID_TYPE, +}; + static DEFINE_MUTEX(nfs_clid_init_mutex); -int nfs4_init_clientid(struct nfs_client *clp, struct rpc_cred *cred) +static int nfs4_setup_state_renewal(struct nfs_client *clp) +{ + int status; + struct nfs_fsinfo fsinfo; + + if (!test_bit(NFS_CS_CHECK_LEASE_TIME, &clp->cl_res_state)) { + nfs4_schedule_state_renewal(clp); + return 0; + } + + status = nfs4_proc_get_lease_time(clp, &fsinfo); + if (status == 0) { + nfs4_set_lease_period(clp, fsinfo.lease_time * HZ); + nfs4_schedule_state_renewal(clp); + } + + return status; +} + +int nfs4_init_clientid(struct nfs_client *clp, const struct cred *cred) { struct nfs4_setclientid_res clid = { .clientid = clp->cl_clientid, @@ -98,7 +137,7 @@ do_confirm: if (status != 0) goto out; clear_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state); - nfs4_schedule_state_renewal(clp); + nfs4_setup_state_renewal(clp); out: return status; } @@ -118,7 +157,7 @@ out: */ int nfs40_discover_server_trunking(struct nfs_client *clp, struct nfs_client **result, - struct rpc_cred *cred) + const struct cred *cred) { struct nfs4_setclientid_res clid = { .clientid = clp->cl_clientid, @@ -143,37 +182,32 @@ int nfs40_discover_server_trunking(struct nfs_client *clp, /* Sustain the lease, even if it's empty. If the clientid4 * goes stale it's of no use for trunking discovery. */ nfs4_schedule_state_renewal(*result); + + /* If the client state need to recover, do it. */ + if (clp->cl_state) + nfs4_schedule_state_manager(clp); } out: return status; } -struct rpc_cred *nfs4_get_machine_cred_locked(struct nfs_client *clp) +const struct cred *nfs4_get_machine_cred(struct nfs_client *clp) { - struct rpc_cred *cred = NULL; - - if (clp->cl_machine_cred != NULL) - cred = get_rpccred(clp->cl_machine_cred); - return cred; + return get_cred(rpc_machine_cred()); } static void nfs4_root_machine_cred(struct nfs_client *clp) { - struct rpc_cred *cred, *new; - new = rpc_lookup_machine_cred(NULL); - spin_lock(&clp->cl_lock); - cred = clp->cl_machine_cred; - clp->cl_machine_cred = new; - spin_unlock(&clp->cl_lock); - if (cred != NULL) - put_rpccred(cred); + /* Force root creds instead of machine */ + clp->cl_principal = NULL; + clp->cl_rpcclient->cl_principal = NULL; } -static struct rpc_cred * +static const struct cred * nfs4_get_renew_cred_server_locked(struct nfs_server *server) { - struct rpc_cred *cred = NULL; + const struct cred *cred = NULL; struct nfs4_state_owner *sp; struct rb_node *pos; @@ -183,29 +217,30 @@ nfs4_get_renew_cred_server_locked(struct nfs_server *server) sp = rb_entry(pos, struct nfs4_state_owner, so_server_node); if (list_empty(&sp->so_states)) continue; - cred = get_rpccred(sp->so_cred); + cred = get_cred(sp->so_cred); break; } return cred; } /** - * nfs4_get_renew_cred_locked - Acquire credential for a renew operation + * nfs4_get_renew_cred - Acquire credential for a renew operation * @clp: client state handle * * Returns an rpc_cred with reference count bumped, or NULL. * Caller must hold clp->cl_lock. */ -struct rpc_cred *nfs4_get_renew_cred_locked(struct nfs_client *clp) +const struct cred *nfs4_get_renew_cred(struct nfs_client *clp) { - struct rpc_cred *cred = NULL; + const struct cred *cred = NULL; struct nfs_server *server; /* Use machine credentials if available */ - cred = nfs4_get_machine_cred_locked(clp); + cred = nfs4_get_machine_cred(clp); if (cred != NULL) goto out; + spin_lock(&clp->cl_lock); rcu_read_lock(); list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { cred = nfs4_get_renew_cred_server_locked(server); @@ -213,6 +248,7 @@ struct rpc_cred *nfs4_get_renew_cred_locked(struct nfs_client *clp) break; } rcu_read_unlock(); + spin_unlock(&clp->cl_lock); out: return cred; @@ -258,7 +294,7 @@ static int nfs4_drain_slot_tbl(struct nfs4_slot_table *tbl) static int nfs4_begin_drain_session(struct nfs_client *clp) { struct nfs4_session *ses = clp->cl_session; - int ret = 0; + int ret; if (clp->cl_slot_tbl) return nfs4_drain_slot_tbl(clp->cl_slot_tbl); @@ -273,37 +309,16 @@ static int nfs4_begin_drain_session(struct nfs_client *clp) #if defined(CONFIG_NFS_V4_1) -static int nfs41_setup_state_renewal(struct nfs_client *clp) -{ - int status; - struct nfs_fsinfo fsinfo; - unsigned long now; - - if (!test_bit(NFS_CS_CHECK_LEASE_TIME, &clp->cl_res_state)) { - nfs4_schedule_state_renewal(clp); - return 0; - } - - now = jiffies; - status = nfs4_proc_get_lease_time(clp, &fsinfo); - if (status == 0) { - nfs4_set_lease_period(clp, fsinfo.lease_time * HZ, now); - nfs4_schedule_state_renewal(clp); - } - - return status; -} - static void nfs41_finish_session_reset(struct nfs_client *clp) { clear_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state); clear_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state); /* create_session negotiated new slot table */ clear_bit(NFS4CLNT_BIND_CONN_TO_SESSION, &clp->cl_state); - nfs41_setup_state_renewal(clp); + nfs4_setup_state_renewal(clp); } -int nfs41_init_clientid(struct nfs_client *clp, struct rpc_cred *cred) +int nfs41_init_clientid(struct nfs_client *clp, const struct cred *cred) { int status; @@ -317,6 +332,8 @@ do_confirm: status = nfs4_proc_create_session(clp, cred); if (status != 0) goto out; + if (!(clp->cl_exchange_flags & EXCHGID4_FLAG_CONFIRMED_R)) + nfs4_state_start_reclaim_reboot(clp); nfs41_finish_session_reset(clp); nfs_mark_client_ready(clp, NFS_CS_READY); out: @@ -338,7 +355,7 @@ out: */ int nfs41_discover_server_trunking(struct nfs_client *clp, struct nfs_client **result, - struct rpc_cred *cred) + const struct cred *cred) { int status; @@ -376,32 +393,32 @@ int nfs41_discover_server_trunking(struct nfs_client *clp, * nfs4_get_clid_cred - Acquire credential for a setclientid operation * @clp: client state handle * - * Returns an rpc_cred with reference count bumped, or NULL. + * Returns a cred with reference count bumped, or NULL. */ -struct rpc_cred *nfs4_get_clid_cred(struct nfs_client *clp) +const struct cred *nfs4_get_clid_cred(struct nfs_client *clp) { - struct rpc_cred *cred; + const struct cred *cred; - spin_lock(&clp->cl_lock); - cred = nfs4_get_machine_cred_locked(clp); - spin_unlock(&clp->cl_lock); + cred = nfs4_get_machine_cred(clp); return cred; } static struct nfs4_state_owner * -nfs4_find_state_owner_locked(struct nfs_server *server, struct rpc_cred *cred) +nfs4_find_state_owner_locked(struct nfs_server *server, const struct cred *cred) { struct rb_node **p = &server->state_owners.rb_node, *parent = NULL; struct nfs4_state_owner *sp; + int cmp; while (*p != NULL) { parent = *p; sp = rb_entry(parent, struct nfs4_state_owner, so_server_node); + cmp = cred_fscmp(cred, sp->so_cred); - if (cred < sp->so_cred) + if (cmp < 0) p = &parent->rb_left; - else if (cred > sp->so_cred) + else if (cmp > 0) p = &parent->rb_right; else { if (!list_empty(&sp->so_lru)) @@ -420,15 +437,16 @@ nfs4_insert_state_owner_locked(struct nfs4_state_owner *new) struct rb_node **p = &server->state_owners.rb_node, *parent = NULL; struct nfs4_state_owner *sp; - int err; + int cmp; while (*p != NULL) { parent = *p; sp = rb_entry(parent, struct nfs4_state_owner, so_server_node); + cmp = cred_fscmp(new->so_cred, sp->so_cred); - if (new->so_cred < sp->so_cred) + if (cmp < 0) p = &parent->rb_left; - else if (new->so_cred > sp->so_cred) + else if (cmp > 0) p = &parent->rb_right; else { if (!list_empty(&sp->so_lru)) @@ -437,9 +455,6 @@ nfs4_insert_state_owner_locked(struct nfs4_state_owner *new) return sp; } } - err = ida_get_new(&server->openowner_id, &new->so_seqid.owner_id); - if (err) - return ERR_PTR(err); rb_link_node(&new->so_server_node, parent, p); rb_insert_color(&new->so_server_node, &server->state_owners); return new; @@ -452,7 +467,6 @@ nfs4_remove_state_owner_locked(struct nfs4_state_owner *sp) if (!RB_EMPTY_NODE(&sp->so_server_node)) rb_erase(&sp->so_server_node, &server->state_owners); - ida_remove(&server->openowner_id, sp->so_seqid.owner_id); } static void @@ -479,7 +493,7 @@ nfs4_destroy_seqid_counter(struct nfs_seqid_counter *sc) */ static struct nfs4_state_owner * nfs4_alloc_state_owner(struct nfs_server *server, - struct rpc_cred *cred, + const struct cred *cred, gfp_t gfp_flags) { struct nfs4_state_owner *sp; @@ -487,14 +501,14 @@ nfs4_alloc_state_owner(struct nfs_server *server, sp = kzalloc(sizeof(*sp), gfp_flags); if (!sp) return NULL; + sp->so_seqid.owner_id = atomic64_inc_return(&server->owner_ctr); sp->so_server = server; - sp->so_cred = get_rpccred(cred); + sp->so_cred = get_cred(cred); spin_lock_init(&sp->so_lock); INIT_LIST_HEAD(&sp->so_states); nfs4_init_seqid_counter(&sp->so_seqid); atomic_set(&sp->so_count, 1); INIT_LIST_HEAD(&sp->so_lru); - seqcount_init(&sp->so_reclaim_seqcount); mutex_init(&sp->so_delegreturn_mutex); return sp; } @@ -517,7 +531,7 @@ nfs4_reset_state_owner(struct nfs4_state_owner *sp) static void nfs4_free_state_owner(struct nfs4_state_owner *sp) { nfs4_destroy_seqid_counter(&sp->so_seqid); - put_rpccred(sp->so_cred); + put_cred(sp->so_cred); kfree(sp); } @@ -550,11 +564,12 @@ static void nfs4_gc_state_owners(struct nfs_server *server) * nfs4_get_state_owner - Look up a state owner given a credential * @server: nfs_server to search * @cred: RPC credential to match + * @gfp_flags: allocation mode * * Returns a pointer to an instantiated nfs4_state_owner struct, or NULL. */ struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server, - struct rpc_cred *cred, + const struct cred *cred, gfp_t gfp_flags) { struct nfs_client *clp = server->nfs_client; @@ -568,13 +583,9 @@ struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server, new = nfs4_alloc_state_owner(server, cred, gfp_flags); if (new == NULL) goto out; - do { - if (ida_pre_get(&server->openowner_id, gfp_flags) == 0) - break; - spin_lock(&clp->cl_lock); - sp = nfs4_insert_state_owner_locked(new); - spin_unlock(&clp->cl_lock); - } while (sp == ERR_PTR(-EAGAIN)); + spin_lock(&clp->cl_lock); + sp = nfs4_insert_state_owner_locked(new); + spin_unlock(&clp->cl_lock); if (sp != new) nfs4_free_state_owner(new); out: @@ -610,24 +621,39 @@ void nfs4_put_state_owner(struct nfs4_state_owner *sp) /** * nfs4_purge_state_owners - Release all cached state owners * @server: nfs_server with cached state owners to release + * @head: resulting list of state owners * * Called at umount time. Remaining state owners will be on * the LRU with ref count of zero. + * Note that the state owners are not freed, but are added + * to the list @head, which can later be used as an argument + * to nfs4_free_state_owners. */ -void nfs4_purge_state_owners(struct nfs_server *server) +void nfs4_purge_state_owners(struct nfs_server *server, struct list_head *head) { struct nfs_client *clp = server->nfs_client; struct nfs4_state_owner *sp, *tmp; - LIST_HEAD(doomed); spin_lock(&clp->cl_lock); list_for_each_entry_safe(sp, tmp, &server->state_owners_lru, so_lru) { - list_move(&sp->so_lru, &doomed); + list_move(&sp->so_lru, head); nfs4_remove_state_owner_locked(sp); } spin_unlock(&clp->cl_lock); +} - list_for_each_entry_safe(sp, tmp, &doomed, so_lru) { +/** + * nfs4_free_state_owners - Release all cached state owners + * @head: resulting list of state owners + * + * Frees a list of state owners that was generated by + * nfs4_purge_state_owners + */ +void nfs4_free_state_owners(struct list_head *head) +{ + struct nfs4_state_owner *sp, *tmp; + + list_for_each_entry_safe(sp, tmp, head, so_lru) { list_del(&sp->so_lru); nfs4_free_state_owner(sp); } @@ -638,13 +664,14 @@ nfs4_alloc_open_state(void) { struct nfs4_state *state; - state = kzalloc(sizeof(*state), GFP_NOFS); + state = kzalloc(sizeof(*state), GFP_KERNEL_ACCOUNT); if (!state) return NULL; - atomic_set(&state->count, 1); + refcount_set(&state->count, 1); INIT_LIST_HEAD(&state->lock_states); spin_lock_init(&state->state_lock); seqlock_init(&state->seqlock); + init_waitqueue_head(&state->waitq); return state; } @@ -669,12 +696,12 @@ __nfs4_find_state_byowner(struct inode *inode, struct nfs4_state_owner *owner) struct nfs_inode *nfsi = NFS_I(inode); struct nfs4_state *state; - list_for_each_entry(state, &nfsi->open_states, inode_states) { + list_for_each_entry_rcu(state, &nfsi->open_states, inode_states) { if (state->owner != owner) continue; if (!nfs4_valid_open_stateid(state)) continue; - if (atomic_inc_not_zero(&state->count)) + if (refcount_inc_not_zero(&state->count)) return state; } return NULL; @@ -683,7 +710,7 @@ __nfs4_find_state_byowner(struct inode *inode, struct nfs4_state_owner *owner) static void nfs4_free_open_state(struct nfs4_state *state) { - kfree(state); + kfree_rcu(state, rcu_head); } struct nfs4_state * @@ -692,9 +719,9 @@ nfs4_get_open_state(struct inode *inode, struct nfs4_state_owner *owner) struct nfs4_state *state, *new; struct nfs_inode *nfsi = NFS_I(inode); - spin_lock(&inode->i_lock); + rcu_read_lock(); state = __nfs4_find_state_byowner(inode, owner); - spin_unlock(&inode->i_lock); + rcu_read_unlock(); if (state) goto out; new = nfs4_alloc_open_state(); @@ -705,9 +732,9 @@ nfs4_get_open_state(struct inode *inode, struct nfs4_state_owner *owner) state = new; state->owner = owner; atomic_inc(&owner->so_count); - list_add(&state->inode_states, &nfsi->open_states); ihold(inode); state->inode = inode; + list_add_rcu(&state->inode_states, &nfsi->open_states); spin_unlock(&inode->i_lock); /* Note: The reclaim code dictates that we add stateless * and read-only stateids to the end of the list */ @@ -728,13 +755,14 @@ void nfs4_put_open_state(struct nfs4_state *state) struct inode *inode = state->inode; struct nfs4_state_owner *owner = state->owner; - if (!atomic_dec_and_lock(&state->count, &owner->so_lock)) + if (!refcount_dec_and_lock(&state->count, &owner->so_lock)) return; spin_lock(&inode->i_lock); - list_del(&state->inode_states); + list_del_rcu(&state->inode_states); list_del(&state->open_states); spin_unlock(&inode->i_lock); spin_unlock(&owner->so_lock); + nfs4_inode_return_delegation_on_close(inode); iput(inode); nfs4_free_open_state(state); nfs4_put_state_owner(owner); @@ -790,7 +818,7 @@ static void __nfs4_close(struct nfs4_state *state, void nfs4_close_state(struct nfs4_state *state, fmode_t fmode) { - __nfs4_close(state, fmode, GFP_NOFS, 0); + __nfs4_close(state, fmode, GFP_KERNEL, 0); } void nfs4_close_sync(struct nfs4_state *state, fmode_t fmode) @@ -813,19 +841,19 @@ void nfs4_close_sync(struct nfs4_state *state, fmode_t fmode) */ static struct nfs4_lock_state * __nfs4_find_lock_state(struct nfs4_state *state, - fl_owner_t fl_owner, fl_owner_t fl_owner2) + fl_owner_t owner, fl_owner_t owner2) { struct nfs4_lock_state *pos, *ret = NULL; list_for_each_entry(pos, &state->lock_states, ls_locks) { - if (pos->ls_owner == fl_owner) { + if (pos->ls_owner == owner) { ret = pos; break; } - if (pos->ls_owner == fl_owner2) + if (pos->ls_owner == owner2) ret = pos; } if (ret) - atomic_inc(&ret->ls_count); + refcount_inc(&ret->ls_count); return ret; } @@ -834,31 +862,25 @@ __nfs4_find_lock_state(struct nfs4_state *state, * exists, return an uninitialized one. * */ -static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, fl_owner_t fl_owner) +static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, fl_owner_t owner) { struct nfs4_lock_state *lsp; struct nfs_server *server = state->owner->so_server; - lsp = kzalloc(sizeof(*lsp), GFP_NOFS); + lsp = kzalloc(sizeof(*lsp), GFP_KERNEL_ACCOUNT); if (lsp == NULL) return NULL; nfs4_init_seqid_counter(&lsp->ls_seqid); - atomic_set(&lsp->ls_count, 1); + refcount_set(&lsp->ls_count, 1); lsp->ls_state = state; - lsp->ls_owner = fl_owner; - lsp->ls_seqid.owner_id = ida_simple_get(&server->lockowner_id, 0, 0, GFP_NOFS); - if (lsp->ls_seqid.owner_id < 0) - goto out_free; + lsp->ls_owner = owner; + lsp->ls_seqid.owner_id = atomic64_inc_return(&server->owner_ctr); INIT_LIST_HEAD(&lsp->ls_locks); return lsp; -out_free: - kfree(lsp); - return NULL; } void nfs4_free_lock_state(struct nfs_server *server, struct nfs4_lock_state *lsp) { - ida_simple_remove(&server->lockowner_id, lsp->ls_seqid.owner_id); nfs4_destroy_seqid_counter(&lsp->ls_seqid); kfree(lsp); } @@ -907,7 +929,7 @@ void nfs4_put_lock_state(struct nfs4_lock_state *lsp) if (lsp == NULL) return; state = lsp->ls_state; - if (!atomic_dec_and_lock(&lsp->ls_count, &state->state_lock)) + if (!refcount_dec_and_lock(&lsp->ls_count, &state->state_lock)) return; list_del(&lsp->ls_locks); if (list_empty(&state->lock_states)) @@ -927,7 +949,7 @@ static void nfs4_fl_copy_lock(struct file_lock *dst, struct file_lock *src) struct nfs4_lock_state *lsp = src->fl_u.nfs4_fl.owner; dst->fl_u.nfs4_fl.owner = lsp; - atomic_inc(&lsp->ls_count); + refcount_inc(&lsp->ls_count); } static void nfs4_fl_release_lock(struct file_lock *fl) @@ -946,7 +968,7 @@ int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl) if (fl->fl_ops != NULL) return 0; - lsp = nfs4_get_lock_state(state, fl->fl_owner); + lsp = nfs4_get_lock_state(state, fl->c.flc_owner); if (lsp == NULL) return -ENOMEM; fl->fl_u.nfs4_fl.owner = lsp; @@ -959,7 +981,7 @@ static int nfs4_copy_lock_stateid(nfs4_stateid *dst, const struct nfs_lock_context *l_ctx) { struct nfs4_lock_state *lsp; - fl_owner_t fl_owner, fl_flock_owner; + fl_owner_t owner, fl_flock_owner; int ret = -ENOENT; if (l_ctx == NULL) @@ -968,11 +990,11 @@ static int nfs4_copy_lock_stateid(nfs4_stateid *dst, if (test_bit(LK_STATE_IN_USE, &state->flags) == 0) goto out; - fl_owner = l_ctx->lockowner; + owner = l_ctx->lockowner; fl_flock_owner = l_ctx->open_context->flock_owner; spin_lock(&state->state_lock); - lsp = __nfs4_find_lock_state(state, fl_owner, fl_flock_owner); + lsp = __nfs4_find_lock_state(state, owner, fl_flock_owner); if (lsp && test_bit(NFS_LOCK_LOST, &lsp->ls_flags)) ret = -EIO; else if (lsp != NULL && test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags) != 0) { @@ -985,18 +1007,23 @@ out: return ret; } -static void nfs4_copy_open_stateid(nfs4_stateid *dst, struct nfs4_state *state) +bool nfs4_copy_open_stateid(nfs4_stateid *dst, struct nfs4_state *state) { + bool ret; const nfs4_stateid *src; int seq; do { + ret = false; src = &zero_stateid; seq = read_seqbegin(&state->seqlock); - if (test_bit(NFS_OPEN_STATE, &state->flags)) + if (test_bit(NFS_OPEN_STATE, &state->flags)) { src = &state->open_stateid; + ret = true; + } nfs4_stateid_copy(dst, src); } while (read_seqretry(&state->seqlock, seq)); + return ret; } /* @@ -1005,7 +1032,7 @@ static void nfs4_copy_open_stateid(nfs4_stateid *dst, struct nfs4_state *state) */ int nfs4_select_rw_stateid(struct nfs4_state *state, fmode_t fmode, const struct nfs_lock_context *l_ctx, - nfs4_stateid *dst, struct rpc_cred **cred) + nfs4_stateid *dst, const struct cred **cred) { int ret; @@ -1028,8 +1055,7 @@ int nfs4_select_rw_stateid(struct nfs4_state *state, * choose to use. */ goto out; - nfs4_copy_open_stateid(dst, state); - ret = 0; + ret = nfs4_copy_open_stateid(dst, state) ? 0 : -EAGAIN; out: if (nfs_server_capable(state->inode, NFS_CAP_STATEID_NFSV41)) dst->seqid = 0; @@ -1057,14 +1083,12 @@ void nfs_release_seqid(struct nfs_seqid *seqid) return; sequence = seqid->sequence; spin_lock(&sequence->lock); - list_del_init(&seqid->list); - if (!list_empty(&sequence->list)) { - struct nfs_seqid *next; - - next = list_first_entry(&sequence->list, - struct nfs_seqid, list); + if (list_is_first(&seqid->list, &sequence->list) && + !list_is_singular(&sequence->list)) { + struct nfs_seqid *next = list_next_entry(seqid, list); rpc_wake_up_queued_task(&sequence->wait, next->task); } + list_del_init(&seqid->list); spin_unlock(&sequence->lock); } @@ -1091,6 +1115,7 @@ static void nfs_increment_seqid(int status, struct nfs_seqid *seqid) " sequence-id error on an" " unconfirmed sequence %p!\n", seqid->sequence); + return; case -NFS4ERR_STALE_CLIENTID: case -NFS4ERR_STALE_STATEID: case -NFS4ERR_BAD_STATEID: @@ -1100,7 +1125,7 @@ static void nfs_increment_seqid(int status, struct nfs_seqid *seqid) case -NFS4ERR_MOVED: /* Non-seqid mutating errors */ return; - }; + } /* * Note: no locking needed as we are guaranteed to be first * on the sequence list @@ -1159,10 +1184,7 @@ static int nfs4_run_state_manager(void *); static void nfs4_clear_state_manager_bit(struct nfs_client *clp) { - smp_mb__before_atomic(); - clear_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state); - smp_mb__after_atomic(); - wake_up_bit(&clp->cl_state, NFS4CLNT_MANAGER_RUNNING); + clear_and_wake_up_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state); rpc_wake_up(&clp->cl_rpcwaitq); } @@ -1173,11 +1195,28 @@ void nfs4_schedule_state_manager(struct nfs_client *clp) { struct task_struct *task; char buf[INET6_ADDRSTRLEN + sizeof("-manager") + 1]; + struct rpc_clnt *clnt = clp->cl_rpcclient; + bool swapon = false; + + if (clp->cl_cons_state < 0) + return; + + set_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state); + + if (atomic_read(&clnt->cl_swapper)) { + swapon = !test_and_set_bit(NFS4CLNT_MANAGER_AVAILABLE, + &clp->cl_state); + if (!swapon) { + wake_up_var(&clp->cl_state); + return; + } + } if (test_and_set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) != 0) return; + __module_get(THIS_MODULE); - atomic_inc(&clp->cl_count); + refcount_inc(&clp->cl_count); /* The rcu_read_lock() is not strictly necessary, as the state * manager is the only thread that ever changes the rpc_xprt @@ -1190,6 +1229,10 @@ void nfs4_schedule_state_manager(struct nfs_client *clp) if (IS_ERR(task)) { printk(KERN_ERR "%s: kthread_run: %ld\n", __func__, PTR_ERR(task)); + if (!nfs_client_init_is_complete(clp)) + nfs_mark_client_ready(clp, PTR_ERR(task)); + if (swapon) + clear_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state); nfs4_clear_state_manager_bit(clp); nfs_put_client(clp); module_put(THIS_MODULE); @@ -1269,9 +1312,10 @@ int nfs4_wait_clnt_recover(struct nfs_client *clp) might_sleep(); - atomic_inc(&clp->cl_count); + refcount_inc(&clp->cl_count); res = wait_on_bit_action(&clp->cl_state, NFS4CLNT_MANAGER_RUNNING, - nfs_wait_bit_killable, TASK_KILLABLE); + nfs_wait_bit_killable, + TASK_KILLABLE|TASK_FREEZABLE_UNSAFE); if (res) goto out; if (clp->cl_cons_state < 0) @@ -1354,10 +1398,12 @@ int nfs4_schedule_stateid_recovery(const struct nfs_server *server, struct nfs4_ if (!nfs4_state_mark_reclaim_nograce(clp, state)) return -EBADF; + nfs_inode_find_delegation_state_and_recover(state->inode, + &state->stateid); dprintk("%s: scheduling stateid recovery for server %s\n", __func__, clp->cl_hostname); nfs4_schedule_state_manager(clp); - return 0; + return clp->cl_cons_state < 0 ? clp->cl_cons_state : 0; } EXPORT_SYMBOL_GPL(nfs4_schedule_stateid_recovery); @@ -1370,7 +1416,7 @@ nfs_state_find_lock_state_by_stateid(struct nfs4_state *state, list_for_each_entry(pos, &state->lock_states, ls_locks) { if (!test_bit(NFS_LOCK_INITIALIZED, &pos->ls_flags)) continue; - if (nfs4_stateid_match_other(&pos->ls_stateid, stateid)) + if (nfs4_stateid_match_or_older(&pos->ls_stateid, stateid)) return pos; } return NULL; @@ -1399,12 +1445,18 @@ void nfs_inode_find_state_and_recover(struct inode *inode, struct nfs4_state *state; bool found = false; - spin_lock(&inode->i_lock); - list_for_each_entry(ctx, &nfsi->open_files, list) { + rcu_read_lock(); + list_for_each_entry_rcu(ctx, &nfsi->open_files, list) { state = ctx->state; if (state == NULL) continue; - if (nfs4_stateid_match_other(&state->stateid, stateid) && + if (nfs4_stateid_match_or_older(&state->stateid, stateid) && + nfs4_state_mark_reclaim_nograce(clp, state)) { + found = true; + continue; + } + if (test_bit(NFS_OPEN_STATE, &state->flags) && + nfs4_stateid_match_or_older(&state->open_stateid, stateid) && nfs4_state_mark_reclaim_nograce(clp, state)) { found = true; continue; @@ -1413,32 +1465,34 @@ void nfs_inode_find_state_and_recover(struct inode *inode, nfs4_state_mark_reclaim_nograce(clp, state)) found = true; } - spin_unlock(&inode->i_lock); + rcu_read_unlock(); nfs_inode_find_delegation_state_and_recover(inode, stateid); if (found) nfs4_schedule_state_manager(clp); } -static void nfs4_state_mark_open_context_bad(struct nfs4_state *state) +static void nfs4_state_mark_open_context_bad(struct nfs4_state *state, int err) { struct inode *inode = state->inode; struct nfs_inode *nfsi = NFS_I(inode); struct nfs_open_context *ctx; - spin_lock(&inode->i_lock); - list_for_each_entry(ctx, &nfsi->open_files, list) { + rcu_read_lock(); + list_for_each_entry_rcu(ctx, &nfsi->open_files, list) { if (ctx->state != state) continue; set_bit(NFS_CONTEXT_BAD, &ctx->flags); + pr_warn("NFSv4: state recovery failed for open file %pd2, " + "error = %d\n", ctx->dentry, err); } - spin_unlock(&inode->i_lock); + rcu_read_unlock(); } static void nfs4_state_mark_recovery_failed(struct nfs4_state *state, int error) { set_bit(NFS_STATE_RECOVERY_FAILED, &state->flags); - nfs4_state_mark_open_context_bad(state); + nfs4_state_mark_open_context_bad(state, error); } @@ -1447,8 +1501,9 @@ static int nfs4_reclaim_locks(struct nfs4_state *state, const struct nfs4_state_ struct inode *inode = state->inode; struct nfs_inode *nfsi = NFS_I(inode); struct file_lock *fl; + struct nfs4_lock_state *lsp; int status = 0; - struct file_lock_context *flctx = inode->i_flctx; + struct file_lock_context *flctx = locks_inode_context(inode); struct list_head *list; if (flctx == NULL) @@ -1460,14 +1515,15 @@ static int nfs4_reclaim_locks(struct nfs4_state *state, const struct nfs4_state_ down_write(&nfsi->rwsem); spin_lock(&flctx->flc_lock); restart: - list_for_each_entry(fl, list, fl_list) { - if (nfs_file_open_context(fl->fl_file)->state != state) + for_each_file_lock(fl, list) { + if (nfs_file_open_context(fl->c.flc_file)->state != state) continue; spin_unlock(&flctx->flc_lock); status = ops->recover_lock(state, fl); switch (status) { case 0: break; + case -ETIMEDOUT: case -ESTALE: case -NFS4ERR_ADMIN_REVOKED: case -NFS4ERR_STALE_STATEID: @@ -1483,11 +1539,14 @@ restart: default: pr_err("NFS: %s: unhandled error %d\n", __func__, status); + fallthrough; case -ENOMEM: case -NFS4ERR_DENIED: case -NFS4ERR_RECLAIM_BAD: case -NFS4ERR_RECLAIM_CONFLICT: - /* kill_proc(fl->fl_pid, SIGLOST, 1); */ + lsp = fl->fl_u.nfs4_fl.owner; + if (lsp) + set_bit(NFS_LOCK_LOST, &lsp->ls_flags); status = 0; } spin_lock(&flctx->flc_lock); @@ -1502,11 +1561,88 @@ out: return status; } -static int nfs4_reclaim_open_state(struct nfs4_state_owner *sp, const struct nfs4_state_recovery_ops *ops) +#ifdef CONFIG_NFS_V4_2 +static void nfs42_complete_copies(struct nfs4_state_owner *sp, struct nfs4_state *state) +{ + struct nfs4_copy_state *copy; + + if (!test_bit(NFS_CLNT_DST_SSC_COPY_STATE, &state->flags) && + !test_bit(NFS_CLNT_SRC_SSC_COPY_STATE, &state->flags)) + return; + + spin_lock(&sp->so_server->nfs_client->cl_lock); + list_for_each_entry(copy, &sp->so_server->ss_copies, copies) { + if ((test_bit(NFS_CLNT_DST_SSC_COPY_STATE, &state->flags) && + !nfs4_stateid_match_other(&state->stateid, + ©->parent_dst_state->stateid))) + continue; + copy->flags = 1; + if (test_and_clear_bit(NFS_CLNT_DST_SSC_COPY_STATE, + &state->flags)) { + clear_bit(NFS_CLNT_SRC_SSC_COPY_STATE, &state->flags); + complete(©->completion); + } + } + list_for_each_entry(copy, &sp->so_server->ss_src_copies, src_copies) { + if ((test_bit(NFS_CLNT_SRC_SSC_COPY_STATE, &state->flags) && + !nfs4_stateid_match_other(&state->stateid, + ©->parent_src_state->stateid))) + continue; + copy->flags = 1; + if (test_and_clear_bit(NFS_CLNT_DST_SSC_COPY_STATE, + &state->flags)) + complete(©->completion); + } + spin_unlock(&sp->so_server->nfs_client->cl_lock); +} +#else /* !CONFIG_NFS_V4_2 */ +static inline void nfs42_complete_copies(struct nfs4_state_owner *sp, + struct nfs4_state *state) +{ +} +#endif /* CONFIG_NFS_V4_2 */ + +static int __nfs4_reclaim_open_state(struct nfs4_state_owner *sp, struct nfs4_state *state, + const struct nfs4_state_recovery_ops *ops, + int *lost_locks) { - struct nfs4_state *state; struct nfs4_lock_state *lock; + int status; + + status = ops->recover_open(sp, state); + if (status < 0) + return status; + + status = nfs4_reclaim_locks(state, ops); + if (status < 0) + return status; + + if (!test_bit(NFS_DELEGATED_STATE, &state->flags)) { + spin_lock(&state->state_lock); + list_for_each_entry(lock, &state->lock_states, ls_locks) { + trace_nfs4_state_lock_reclaim(state, lock); + if (!test_bit(NFS_LOCK_INITIALIZED, &lock->ls_flags) && + !test_bit(NFS_LOCK_UNLOCKING, &lock->ls_flags)) + *lost_locks += 1; + } + spin_unlock(&state->state_lock); + } + + nfs42_complete_copies(sp, state); + clear_bit(NFS_STATE_RECLAIM_NOGRACE, &state->flags); + return status; +} + +static int nfs4_reclaim_open_state(struct nfs4_state_owner *sp, + const struct nfs4_state_recovery_ops *ops, + int *lost_locks) +{ + struct nfs4_state *state; + unsigned int loop = 0; int status = 0; +#ifdef CONFIG_NFS_V4_2 + bool found_ssc_copy_state = false; +#endif /* CONFIG_NFS_V4_2 */ /* Note: we rely on the sp->so_states list being ordered * so that we always reclaim open(O_RDWR) and/or open(O_WRITE) @@ -1517,7 +1653,6 @@ static int nfs4_reclaim_open_state(struct nfs4_state_owner *sp, const struct nfs * server that doesn't support a grace period. */ spin_lock(&sp->so_lock); - raw_write_seqcount_begin(&sp->so_reclaim_seqcount); restart: list_for_each_entry(state, &sp->so_states, open_states) { if (!test_and_clear_bit(ops->state_flag_bit, &state->flags)) @@ -1526,73 +1661,74 @@ restart: continue; if (state->state == 0) continue; - atomic_inc(&state->count); - spin_unlock(&sp->so_lock); - status = ops->recover_open(sp, state); - if (status >= 0) { - status = nfs4_reclaim_locks(state, ops); - if (status >= 0) { - if (!test_bit(NFS_DELEGATED_STATE, &state->flags)) { - spin_lock(&state->state_lock); - list_for_each_entry(lock, &state->lock_states, ls_locks) { - if (!test_bit(NFS_LOCK_INITIALIZED, &lock->ls_flags)) - pr_warn_ratelimited("NFS: " - "%s: Lock reclaim " - "failed!\n", __func__); - } - spin_unlock(&state->state_lock); - } - clear_bit(NFS_STATE_RECLAIM_NOGRACE, - &state->flags); - nfs4_put_open_state(state); - spin_lock(&sp->so_lock); - goto restart; - } +#ifdef CONFIG_NFS_V4_2 + if (test_bit(NFS_SRV_SSC_COPY_STATE, &state->flags)) { + nfs4_state_mark_recovery_failed(state, -EIO); + found_ssc_copy_state = true; + continue; } +#endif /* CONFIG_NFS_V4_2 */ + refcount_inc(&state->count); + spin_unlock(&sp->so_lock); + status = __nfs4_reclaim_open_state(sp, state, ops, lost_locks); + switch (status) { - default: - printk(KERN_ERR "NFS: %s: unhandled error %d\n", - __func__, status); - case -ENOENT: - case -ENOMEM: - case -EACCES: - case -EROFS: - case -EIO: - case -ESTALE: - /* Open state on this file cannot be recovered */ - nfs4_state_mark_recovery_failed(state, status); + default: + if (status >= 0) { + loop = 0; break; - case -EAGAIN: - ssleep(1); - case -NFS4ERR_ADMIN_REVOKED: - case -NFS4ERR_STALE_STATEID: - case -NFS4ERR_OLD_STATEID: - case -NFS4ERR_BAD_STATEID: - case -NFS4ERR_RECLAIM_BAD: - case -NFS4ERR_RECLAIM_CONFLICT: - nfs4_state_mark_reclaim_nograce(sp->so_server->nfs_client, state); + } + printk(KERN_ERR "NFS: %s: unhandled error %d\n", __func__, status); + fallthrough; + case -ENOENT: + case -ENOMEM: + case -EACCES: + case -EROFS: + case -EIO: + case -ESTALE: + /* Open state on this file cannot be recovered */ + nfs4_state_mark_recovery_failed(state, status); + break; + case -EAGAIN: + ssleep(1); + if (loop++ < 10) { + set_bit(ops->state_flag_bit, &state->flags); break; - case -NFS4ERR_EXPIRED: - case -NFS4ERR_NO_GRACE: - nfs4_state_mark_reclaim_nograce(sp->so_server->nfs_client, state); - case -NFS4ERR_STALE_CLIENTID: - case -NFS4ERR_BADSESSION: - case -NFS4ERR_BADSLOT: - case -NFS4ERR_BAD_HIGH_SLOT: - case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: - goto out_err; + } + fallthrough; + case -NFS4ERR_ADMIN_REVOKED: + case -NFS4ERR_STALE_STATEID: + case -NFS4ERR_OLD_STATEID: + case -NFS4ERR_BAD_STATEID: + case -NFS4ERR_RECLAIM_BAD: + case -NFS4ERR_RECLAIM_CONFLICT: + nfs4_state_mark_reclaim_nograce(sp->so_server->nfs_client, state); + break; + case -NFS4ERR_EXPIRED: + case -NFS4ERR_NO_GRACE: + nfs4_state_mark_reclaim_nograce(sp->so_server->nfs_client, state); + fallthrough; + case -NFS4ERR_STALE_CLIENTID: + case -NFS4ERR_BADSESSION: + case -NFS4ERR_BADSLOT: + case -NFS4ERR_BAD_HIGH_SLOT: + case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: + case -ETIMEDOUT: + goto out_err; } nfs4_put_open_state(state); spin_lock(&sp->so_lock); goto restart; } - raw_write_seqcount_end(&sp->so_reclaim_seqcount); spin_unlock(&sp->so_lock); +#ifdef CONFIG_NFS_V4_2 + if (found_ssc_copy_state) + return -EIO; +#endif /* CONFIG_NFS_V4_2 */ return 0; out_err: nfs4_put_open_state(state); spin_lock(&sp->so_lock); - raw_write_seqcount_end(&sp->so_reclaim_seqcount); spin_unlock(&sp->so_lock); return status; } @@ -1650,6 +1786,7 @@ static void nfs4_state_mark_reclaim_helper(struct nfs_client *clp, static void nfs4_state_start_reclaim_reboot(struct nfs_client *clp) { + set_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state); /* Mark all delegations for reclaim */ nfs_delegation_mark_reclaim(clp); nfs4_state_mark_reclaim_helper(clp, nfs4_state_mark_reclaim_reboot); @@ -1657,7 +1794,7 @@ static void nfs4_state_start_reclaim_reboot(struct nfs_client *clp) static int nfs4_reclaim_complete(struct nfs_client *clp, const struct nfs4_state_recovery_ops *ops, - struct rpc_cred *cred) + const struct cred *cred) { /* Notify the server we're done reclaiming our state */ if (ops->reclaim_complete) @@ -1708,15 +1845,16 @@ static int nfs4_state_clear_reclaim_reboot(struct nfs_client *clp) static void nfs4_state_end_reclaim_reboot(struct nfs_client *clp) { const struct nfs4_state_recovery_ops *ops; - struct rpc_cred *cred; + const struct cred *cred; int err; if (!nfs4_state_clear_reclaim_reboot(clp)) return; + pnfs_destroy_all_layouts(clp); ops = clp->cl_mvops->reboot_recovery_ops; cred = nfs4_get_clid_cred(clp); err = nfs4_reclaim_complete(clp, ops, cred); - put_rpccred(cred); + put_cred(cred); if (err == -NFS4ERR_CONN_NOT_BOUND_TO_SESSION) set_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state); } @@ -1730,38 +1868,38 @@ static void nfs4_state_start_reclaim_nograce(struct nfs_client *clp) static int nfs4_recovery_handle_error(struct nfs_client *clp, int error) { switch (error) { - case 0: - break; - case -NFS4ERR_CB_PATH_DOWN: - nfs40_handle_cb_pathdown(clp); - break; - case -NFS4ERR_NO_GRACE: - nfs4_state_end_reclaim_reboot(clp); - break; - case -NFS4ERR_STALE_CLIENTID: - set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state); - nfs4_state_start_reclaim_reboot(clp); - break; - case -NFS4ERR_EXPIRED: - set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state); - nfs4_state_start_reclaim_nograce(clp); - break; - case -NFS4ERR_BADSESSION: - case -NFS4ERR_BADSLOT: - case -NFS4ERR_BAD_HIGH_SLOT: - case -NFS4ERR_DEADSESSION: - case -NFS4ERR_SEQ_FALSE_RETRY: - case -NFS4ERR_SEQ_MISORDERED: - set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state); - /* Zero session reset errors */ - break; - case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: - set_bit(NFS4CLNT_BIND_CONN_TO_SESSION, &clp->cl_state); - break; - default: - dprintk("%s: failed to handle error %d for server %s\n", - __func__, error, clp->cl_hostname); - return error; + case 0: + break; + case -NFS4ERR_CB_PATH_DOWN: + nfs40_handle_cb_pathdown(clp); + break; + case -NFS4ERR_NO_GRACE: + nfs4_state_end_reclaim_reboot(clp); + break; + case -NFS4ERR_STALE_CLIENTID: + set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state); + nfs4_state_start_reclaim_reboot(clp); + break; + case -NFS4ERR_EXPIRED: + set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state); + nfs4_state_start_reclaim_nograce(clp); + break; + case -NFS4ERR_BADSESSION: + case -NFS4ERR_BADSLOT: + case -NFS4ERR_BAD_HIGH_SLOT: + case -NFS4ERR_DEADSESSION: + case -NFS4ERR_SEQ_FALSE_RETRY: + case -NFS4ERR_SEQ_MISORDERED: + set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state); + /* Zero session reset errors */ + break; + case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: + set_bit(NFS4CLNT_BIND_CONN_TO_SESSION, &clp->cl_state); + break; + default: + dprintk("%s: failed to handle error %d for server %s\n", + __func__, error, clp->cl_hostname); + return error; } dprintk("%s: handled error %d for server %s\n", __func__, error, clp->cl_hostname); @@ -1773,12 +1911,17 @@ static int nfs4_do_reclaim(struct nfs_client *clp, const struct nfs4_state_recov struct nfs4_state_owner *sp; struct nfs_server *server; struct rb_node *pos; - int status = 0; + LIST_HEAD(freeme); + int lost_locks = 0; + int status; + status = nfs4_begin_drain_session(clp); + if (status < 0) + return status; restart: rcu_read_lock(); list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { - nfs4_purge_state_owners(server); + nfs4_purge_state_owners(server, &freeme); spin_lock(&clp->cl_lock); for (pos = rb_first(&server->state_owners); pos != NULL; @@ -1793,11 +1936,15 @@ restart: spin_unlock(&clp->cl_lock); rcu_read_unlock(); - status = nfs4_reclaim_open_state(sp, ops); + status = nfs4_reclaim_open_state(sp, ops, &lost_locks); if (status < 0) { + if (lost_locks) + pr_warn("NFS: %s: lost %d locks\n", + clp->cl_hostname, lost_locks); set_bit(ops->owner_flag_bit, &sp->so_flags); nfs4_put_state_owner(sp); status = nfs4_recovery_handle_error(clp, status); + nfs4_free_state_owners(&freeme); return (status != 0) ? status : -EAGAIN; } @@ -1807,12 +1954,17 @@ restart: spin_unlock(&clp->cl_lock); } rcu_read_unlock(); + nfs4_free_state_owners(&freeme); + nfs_local_probe_async(clp); + if (lost_locks) + pr_warn("NFS: %s: lost %d locks\n", + clp->cl_hostname, lost_locks); return 0; } static int nfs4_check_lease(struct nfs_client *clp) { - struct rpc_cred *cred; + const struct cred *cred; const struct nfs4_state_maintenance_ops *ops = clp->cl_mvops->state_renewal_ops; int status; @@ -1820,9 +1972,7 @@ static int nfs4_check_lease(struct nfs_client *clp) /* Is the client already known to have an expired lease? */ if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state)) return 0; - spin_lock(&clp->cl_lock); - cred = ops->get_state_renewal_cred_locked(clp); - spin_unlock(&clp->cl_lock); + cred = ops->get_state_renewal_cred(clp); if (cred == NULL) { cred = nfs4_get_clid_cred(clp); status = -ENOKEY; @@ -1830,7 +1980,7 @@ static int nfs4_check_lease(struct nfs_client *clp) goto out; } status = ops->renew_lease(clp, cred); - put_rpccred(cred); + put_cred(cred); if (status == -ETIMEDOUT) { set_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state); return 0; @@ -1862,9 +2012,14 @@ static int nfs4_handle_reclaim_lease_error(struct nfs_client *clp, int status) nfs_mark_client_ready(clp, -EPERM); clear_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state); return -EPERM; + case -ETIMEDOUT: + if (clp->cl_cons_state == NFS_CS_SESSION_INITING) { + nfs_mark_client_ready(clp, -EIO); + return -EIO; + } + fallthrough; case -EACCES: case -NFS4ERR_DELAY: - case -ETIMEDOUT: case -EAGAIN: ssleep(1); break; @@ -1875,6 +2030,10 @@ static int nfs4_handle_reclaim_lease_error(struct nfs_client *clp, int status) dprintk("%s: exit with error %d for server %s\n", __func__, -EPROTONOSUPPORT, clp->cl_hostname); return -EPROTONOSUPPORT; + case -ENOSPC: + if (clp->cl_cons_state == NFS_CS_SESSION_INITING) + nfs_mark_client_ready(clp, -EIO); + return -EIO; case -NFS4ERR_NOT_SAME: /* FixMe: implement recovery * in nfs4_exchange_id */ default: @@ -1890,20 +2049,21 @@ static int nfs4_handle_reclaim_lease_error(struct nfs_client *clp, int status) static int nfs4_establish_lease(struct nfs_client *clp) { - struct rpc_cred *cred; + const struct cred *cred; const struct nfs4_state_recovery_ops *ops = clp->cl_mvops->reboot_recovery_ops; int status; - nfs4_begin_drain_session(clp); + status = nfs4_begin_drain_session(clp); + if (status != 0) + return status; cred = nfs4_get_clid_cred(clp); if (cred == NULL) return -ENOENT; status = ops->establish_clid(clp, cred); - put_rpccred(cred); + put_cred(cred); if (status != 0) return status; - pnfs_destroy_all_layouts(clp); return 0; } @@ -1947,10 +2107,11 @@ static int nfs4_purge_lease(struct nfs_client *clp) * * Returns zero or a negative NFS4ERR status code. */ -static int nfs4_try_migration(struct nfs_server *server, struct rpc_cred *cred) +static int nfs4_try_migration(struct nfs_server *server, const struct cred *cred) { struct nfs_client *clp = server->nfs_client; struct nfs4_fs_locations *locations = NULL; + struct nfs_fattr *fattr; struct inode *inode; struct page *page; int status, result; @@ -1960,16 +2121,19 @@ static int nfs4_try_migration(struct nfs_server *server, struct rpc_cred *cred) (unsigned long long)server->fsid.minor, clp->cl_hostname); - result = 0; page = alloc_page(GFP_KERNEL); locations = kmalloc(sizeof(struct nfs4_fs_locations), GFP_KERNEL); - if (page == NULL || locations == NULL) { + fattr = nfs_alloc_fattr(); + if (page == NULL || locations == NULL || fattr == NULL) { dprintk("<-- %s: no memory\n", __func__); + result = 0; goto out; } + locations->fattr = fattr; inode = d_inode(server->super->s_root); - result = nfs4_proc_get_locations(inode, locations, page, cred); + result = nfs4_proc_get_locations(server, NFS_FH(inode), locations, + page, cred); if (result) { dprintk("<-- %s: failed to retrieve fs_locations: %d\n", __func__, result); @@ -1977,13 +2141,20 @@ static int nfs4_try_migration(struct nfs_server *server, struct rpc_cred *cred) } result = -NFS4ERR_NXIO; - if (!(locations->fattr.valid & NFS_ATTR_FATTR_V4_LOCATIONS)) { + if (!locations->nlocations) + goto out; + + if (!(locations->fattr->valid & NFS_ATTR_FATTR_V4_LOCATIONS)) { dprintk("<-- %s: No fs_locations data, migration skipped\n", __func__); goto out; } - nfs4_begin_drain_session(clp); + status = nfs4_begin_drain_session(clp); + if (status != 0) { + result = status; + goto out; + } status = nfs4_replace_transport(server, locations); if (status != 0) { @@ -1998,6 +2169,8 @@ static int nfs4_try_migration(struct nfs_server *server, struct rpc_cred *cred) out: if (page != NULL) __free_page(page); + if (locations != NULL) + kfree(locations->fattr); kfree(locations); if (result) { pr_err("NFS: migration recovery failed (server %s)\n", @@ -2015,14 +2188,12 @@ static int nfs4_handle_migration(struct nfs_client *clp) const struct nfs4_state_maintenance_ops *ops = clp->cl_mvops->state_renewal_ops; struct nfs_server *server; - struct rpc_cred *cred; + const struct cred *cred; dprintk("%s: migration reported on \"%s\"\n", __func__, clp->cl_hostname); - spin_lock(&clp->cl_lock); - cred = ops->get_state_renewal_cred_locked(clp); - spin_unlock(&clp->cl_lock); + cred = ops->get_state_renewal_cred(clp); if (cred == NULL) return -NFS4ERR_NOENT; @@ -2043,13 +2214,13 @@ restart: rcu_read_unlock(); status = nfs4_try_migration(server, cred); if (status < 0) { - put_rpccred(cred); + put_cred(cred); return status; } goto restart; } rcu_read_unlock(); - put_rpccred(cred); + put_cred(cred); return 0; } @@ -2063,14 +2234,12 @@ static int nfs4_handle_lease_moved(struct nfs_client *clp) const struct nfs4_state_maintenance_ops *ops = clp->cl_mvops->state_renewal_ops; struct nfs_server *server; - struct rpc_cred *cred; + const struct cred *cred; dprintk("%s: lease moved reported on \"%s\"\n", __func__, clp->cl_hostname); - spin_lock(&clp->cl_lock); - cred = ops->get_state_renewal_cred_locked(clp); - spin_unlock(&clp->cl_lock); + cred = ops->get_state_renewal_cred(clp); if (cred == NULL) return -NFS4ERR_NOENT; @@ -2098,7 +2267,7 @@ restart: rcu_read_unlock(); out: - put_rpccred(cred); + put_cred(cred); return 0; } @@ -2121,7 +2290,7 @@ int nfs4_discover_server_trunking(struct nfs_client *clp, const struct nfs4_state_recovery_ops *ops = clp->cl_mvops->reboot_recovery_ops; struct rpc_clnt *clnt; - struct rpc_cred *cred; + const struct cred *cred; int i, status; dprintk("NFS: %s: testing '%s'\n", __func__, clp->cl_hostname); @@ -2137,7 +2306,7 @@ again: goto out_unlock; status = ops->detect_trunking(clp, result, cred); - put_rpccred(cred); + put_cred(cred); switch (status) { case 0: case -EINTR: @@ -2146,9 +2315,11 @@ again: case -ETIMEDOUT: if (clnt->cl_softrtry) break; + fallthrough; case -NFS4ERR_DELAY: case -EAGAIN: ssleep(1); + fallthrough; case -NFS4ERR_STALE_CLIENTID: dprintk("NFS: %s after status %d, retrying\n", __func__, status); @@ -2160,6 +2331,7 @@ again: } if (clnt->cl_auth->au_flavor == RPC_AUTH_UNIX) break; + fallthrough; case -NFS4ERR_CLID_INUSE: case -NFS4ERR_WRONGSEC: /* No point in retrying if we already used RPC_AUTH_UNIX */ @@ -2265,8 +2437,7 @@ static void nfs41_handle_recallable_state_revoked(struct nfs_client *clp) { /* FIXME: For now, we destroy all layouts. */ pnfs_destroy_all_layouts(clp); - /* FIXME: For now, we test all delegations+open state+locks. */ - nfs41_handle_some_state_revoked(clp); + nfs_test_expired_all_delegations(clp); dprintk("%s: Recallable state revoked on server %s!\n", __func__, clp->cl_hostname); } @@ -2325,12 +2496,14 @@ out_recovery: static int nfs4_reset_session(struct nfs_client *clp) { - struct rpc_cred *cred; + const struct cred *cred; int status; if (!nfs4_has_session(clp)) return 0; - nfs4_begin_drain_session(clp); + status = nfs4_begin_drain_session(clp); + if (status != 0) + return status; cred = nfs4_get_clid_cred(clp); status = nfs4_proc_destroy_session(clp->cl_session, cred); switch (status) { @@ -2361,23 +2534,23 @@ static int nfs4_reset_session(struct nfs_client *clp) dprintk("%s: session reset was successful for server %s!\n", __func__, clp->cl_hostname); out: - if (cred) - put_rpccred(cred); + put_cred(cred); return status; } static int nfs4_bind_conn_to_session(struct nfs_client *clp) { - struct rpc_cred *cred; + const struct cred *cred; int ret; if (!nfs4_has_session(clp)) return 0; - nfs4_begin_drain_session(clp); + ret = nfs4_begin_drain_session(clp); + if (ret != 0) + return ret; cred = nfs4_get_clid_cred(clp); ret = nfs4_proc_bind_conn_to_session(clp, cred); - if (cred) - put_rpccred(cred); + put_cred(cred); clear_bit(NFS4CLNT_BIND_CONN_TO_SESSION, &clp->cl_state); switch (ret) { case 0: @@ -2393,6 +2566,21 @@ static int nfs4_bind_conn_to_session(struct nfs_client *clp) } return 0; } + +static void nfs4_layoutreturn_any_run(struct nfs_client *clp) +{ + int iomode = 0; + + if (test_and_clear_bit(NFS4CLNT_RECALL_ANY_LAYOUT_READ, &clp->cl_state)) + iomode += IOMODE_READ; + if (test_and_clear_bit(NFS4CLNT_RECALL_ANY_LAYOUT_RW, &clp->cl_state)) + iomode += IOMODE_RW; + /* Note: IOMODE_READ + IOMODE_RW == IOMODE_ANY */ + if (iomode) { + pnfs_layout_return_unused_byclid(clp, iomode); + set_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state); + } +} #else /* CONFIG_NFS_V4_1 */ static int nfs4_reset_session(struct nfs_client *clp) { return 0; } @@ -2400,15 +2588,29 @@ static int nfs4_bind_conn_to_session(struct nfs_client *clp) { return 0; } + +static void nfs4_layoutreturn_any_run(struct nfs_client *clp) +{ +} #endif /* CONFIG_NFS_V4_1 */ static void nfs4_state_manager(struct nfs_client *clp) { + unsigned int memflags; int status = 0; const char *section = "", *section_sep = ""; + /* + * State recovery can deadlock if the direct reclaim code tries + * start NFS writeback. So ensure memory allocations are all + * GFP_NOFS. + */ + memflags = memalloc_nofs_save(); + /* Ensure exclusive access to NFSv4 state */ do { + trace_nfs4_state_mgr(clp); + clear_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state); if (test_bit(NFS4CLNT_PURGE_STATE, &clp->cl_state)) { section = "purge state"; status = nfs4_purge_lease(clp); @@ -2473,22 +2675,28 @@ static void nfs4_state_manager(struct nfs_client *clp) section = "reclaim reboot"; status = nfs4_do_reclaim(clp, clp->cl_mvops->reboot_recovery_ops); + if (status == 0) + status = pnfs_layout_handle_reboot(clp); if (status == -EAGAIN) continue; if (status < 0) goto out_error; nfs4_state_end_reclaim_reboot(clp); + continue; } /* Detect expired delegations... */ if (test_and_clear_bit(NFS4CLNT_DELEGATION_EXPIRED, &clp->cl_state)) { section = "detect expired delegations"; + status = nfs4_begin_drain_session(clp); + if (status < 0) + goto out_error; nfs_reap_expired_delegations(clp); continue; } /* Now recover expired state... */ - if (test_and_clear_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state)) { + if (test_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state)) { section = "reclaim nograce"; status = nfs4_do_reclaim(clp, clp->cl_mvops->nograce_recovery_ops); @@ -2496,29 +2704,55 @@ static void nfs4_state_manager(struct nfs_client *clp) continue; if (status < 0) goto out_error; + clear_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state); } + memalloc_nofs_restore(memflags); nfs4_end_drain_session(clp); - if (test_and_clear_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state)) { - nfs_client_return_marked_delegations(clp); + nfs4_clear_state_manager_bit(clp); + + if (test_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state) && + !test_and_set_bit(NFS4CLNT_MANAGER_RUNNING, + &clp->cl_state)) { + memflags = memalloc_nofs_save(); continue; } - nfs4_clear_state_manager_bit(clp); - /* Did we race with an attempt to give us more work? */ - if (clp->cl_state == 0) - break; - if (test_and_set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) != 0) - break; - } while (atomic_read(&clp->cl_count) > 1); - return; + if (!test_and_set_bit(NFS4CLNT_RECALL_RUNNING, &clp->cl_state)) { + if (test_and_clear_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state)) { + nfs_client_return_marked_delegations(clp); + set_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state); + } + nfs4_layoutreturn_any_run(clp); + clear_bit(NFS4CLNT_RECALL_RUNNING, &clp->cl_state); + } + + return; + + } while (refcount_read(&clp->cl_count) > 1 && !signalled()); + goto out_drain; + out_error: if (strlen(section)) section_sep = ": "; + trace_nfs4_state_mgr_failed(clp, section, status); pr_warn_ratelimited("NFS: state manager%s%s failed on NFSv4 server %s" " with error %d\n", section_sep, section, clp->cl_hostname, -status); - ssleep(1); + switch (status) { + case -ENETDOWN: + case -ENETUNREACH: + nfs_mark_client_ready(clp, -EIO); + break; + case -EINVAL: + nfs_mark_client_ready(clp, status); + break; + default: + ssleep(1); + break; + } +out_drain: + memalloc_nofs_restore(memflags); nfs4_end_drain_session(clp); nfs4_clear_state_manager_bit(clp); } @@ -2526,16 +2760,35 @@ out_error: static int nfs4_run_state_manager(void *ptr) { struct nfs_client *clp = ptr; + struct rpc_clnt *cl = clp->cl_rpcclient; + + while (cl != cl->cl_parent) + cl = cl->cl_parent; allow_signal(SIGKILL); +again: nfs4_state_manager(clp); + + if (test_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state) && + !test_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state)) { + wait_var_event_interruptible(&clp->cl_state, + test_bit(NFS4CLNT_RUN_MANAGER, + &clp->cl_state)); + if (!atomic_read(&cl->cl_swapper)) + clear_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state); + if (refcount_read(&clp->cl_count) > 1 && !signalled() && + !test_and_set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state)) + goto again; + /* Either no longer a swapper, or were signalled */ + clear_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state); + } + + if (refcount_read(&clp->cl_count) > 1 && !signalled() && + test_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state) && + !test_and_set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state)) + goto again; + nfs_put_client(clp); - module_put_and_exit(0); + module_put_and_kthread_exit(0); return 0; } - -/* - * Local variables: - * c-basic-offset: 8 - * End: - */ diff --git a/fs/nfs/nfs4super.c b/fs/nfs/nfs4super.c index 6fb7cb6b3f4b..5ec9c83f1ef0 100644 --- a/fs/nfs/nfs4super.c +++ b/fs/nfs/nfs4super.c @@ -1,10 +1,13 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2012 Bryan Schumaker <bjschuma@netapp.com> */ #include <linux/init.h> #include <linux/module.h> +#include <linux/mount.h> #include <linux/nfs4_mount.h> #include <linux/nfs_fs.h> +#include <linux/nfs_ssc.h> #include "delegation.h" #include "internal.h" #include "nfs4_fs.h" @@ -17,40 +20,10 @@ static int nfs4_write_inode(struct inode *inode, struct writeback_control *wbc); static void nfs4_evict_inode(struct inode *inode); -static struct dentry *nfs4_remote_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *raw_data); -static struct dentry *nfs4_referral_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *raw_data); -static struct dentry *nfs4_remote_referral_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *raw_data); - -static struct file_system_type nfs4_remote_fs_type = { - .owner = THIS_MODULE, - .name = "nfs4", - .mount = nfs4_remote_mount, - .kill_sb = nfs_kill_super, - .fs_flags = FS_RENAME_DOES_D_MOVE|FS_BINARY_MOUNTDATA, -}; - -static struct file_system_type nfs4_remote_referral_fs_type = { - .owner = THIS_MODULE, - .name = "nfs4", - .mount = nfs4_remote_referral_mount, - .kill_sb = nfs_kill_super, - .fs_flags = FS_RENAME_DOES_D_MOVE|FS_BINARY_MOUNTDATA, -}; - -struct file_system_type nfs4_referral_fs_type = { - .owner = THIS_MODULE, - .name = "nfs4", - .mount = nfs4_referral_mount, - .kill_sb = nfs_kill_super, - .fs_flags = FS_RENAME_DOES_D_MOVE|FS_BINARY_MOUNTDATA, -}; static const struct super_operations nfs4_sops = { .alloc_inode = nfs_alloc_inode, - .destroy_inode = nfs_destroy_inode, + .free_inode = nfs_free_inode, .write_inode = nfs4_write_inode, .drop_inode = nfs_drop_inode, .statfs = nfs_statfs, @@ -60,16 +33,15 @@ static const struct super_operations nfs4_sops = { .show_devname = nfs_show_devname, .show_path = nfs_show_path, .show_stats = nfs_show_stats, - .remount_fs = nfs_remount, }; struct nfs_subversion nfs_v4 = { - .owner = THIS_MODULE, - .nfs_fs = &nfs4_fs_type, - .rpc_vers = &nfs_version4, - .rpc_ops = &nfs_v4_clientops, - .sops = &nfs4_sops, - .xattr = nfs4_xattr_handlers, + .owner = THIS_MODULE, + .nfs_fs = &nfs4_fs_type, + .rpc_vers = &nfs_version4, + .rpc_ops = &nfs_v4_clientops, + .sops = &nfs4_sops, + .xattr = nfs4_xattr_handlers, }; static int nfs4_write_inode(struct inode *inode, struct writeback_control *wbc) @@ -91,60 +63,14 @@ static void nfs4_evict_inode(struct inode *inode) { truncate_inode_pages_final(&inode->i_data); clear_inode(inode); - /* If we are holding a delegation, return it! */ - nfs_inode_return_delegation_noreclaim(inode); + /* If we are holding a delegation, return and free it */ + nfs_inode_evict_delegation(inode); /* Note that above delegreturn would trigger pnfs return-on-close */ pnfs_return_layout(inode); - pnfs_destroy_layout(NFS_I(inode)); + pnfs_destroy_layout_final(NFS_I(inode)); /* First call standard NFS clear_inode() code */ nfs_clear_inode(inode); -} - -/* - * Get the superblock for the NFS4 root partition - */ -static struct dentry * -nfs4_remote_mount(struct file_system_type *fs_type, int flags, - const char *dev_name, void *info) -{ - struct nfs_mount_info *mount_info = info; - struct nfs_server *server; - struct dentry *mntroot = ERR_PTR(-ENOMEM); - - mount_info->set_security = nfs_set_sb_security; - - /* Get a volume representation */ - server = nfs4_create_server(mount_info, &nfs_v4); - if (IS_ERR(server)) { - mntroot = ERR_CAST(server); - goto out; - } - - mntroot = nfs_fs_mount_common(server, flags, dev_name, mount_info, &nfs_v4); - -out: - return mntroot; -} - -static struct vfsmount *nfs_do_root_mount(struct file_system_type *fs_type, - int flags, void *data, const char *hostname) -{ - struct vfsmount *root_mnt; - char *root_devname; - size_t len; - - len = strlen(hostname) + 5; - root_devname = kmalloc(len, GFP_KERNEL); - if (root_devname == NULL) - return ERR_PTR(-ENOMEM); - /* Does hostname needs to be enclosed in brackets? */ - if (strchr(hostname, ':')) - snprintf(root_devname, len, "[%s]:/", hostname); - else - snprintf(root_devname, len, "%s:/", hostname); - root_mnt = vfs_kern_mount(fs_type, flags, root_devname, data); - kfree(root_devname); - return root_mnt; + nfs4_xattr_cache_zap(inode); } struct nfs_referral_count { @@ -213,111 +139,127 @@ static void nfs_referral_loop_unprotect(void) kfree(p); } -static struct dentry *nfs_follow_remote_path(struct vfsmount *root_mnt, - const char *export_path) +static int do_nfs4_mount(struct nfs_server *server, + struct fs_context *fc, + const char *hostname, + const char *export_path) { + struct nfs_fs_context *root_ctx; + struct nfs_fs_context *ctx; + struct fs_context *root_fc; + struct vfsmount *root_mnt; struct dentry *dentry; - int err; + char *source; + int ret; - if (IS_ERR(root_mnt)) - return ERR_CAST(root_mnt); + if (IS_ERR(server)) + return PTR_ERR(server); - err = nfs_referral_loop_protect(); - if (err) { - mntput(root_mnt); - return ERR_PTR(err); + root_fc = vfs_dup_fs_context(fc); + if (IS_ERR(root_fc)) { + nfs_free_server(server); + return PTR_ERR(root_fc); + } + kfree(root_fc->source); + root_fc->source = NULL; + + ctx = nfs_fc2context(fc); + root_ctx = nfs_fc2context(root_fc); + root_ctx->internal = true; + root_ctx->server = server; + + if (ctx->fscache_uniq) { + ret = vfs_parse_fs_string(root_fc, "fsc", ctx->fscache_uniq); + if (ret < 0) { + put_fs_context(root_fc); + return ret; + } } + /* We leave export_path unset as it's not used to find the root. */ - dentry = mount_subtree(root_mnt, export_path); - nfs_referral_loop_unprotect(); + /* Does hostname needs to be enclosed in brackets? */ + if (strchr(hostname, ':')) + source = kasprintf(GFP_KERNEL, "[%s]:/", hostname); + else + source = kasprintf(GFP_KERNEL, "%s:/", hostname); - return dentry; -} + if (!source) { + put_fs_context(root_fc); + return -ENOMEM; + } + ret = vfs_parse_fs_string(root_fc, "source", source); + kfree(source); + if (ret < 0) { + put_fs_context(root_fc); + return ret; + } + root_mnt = fc_mount(root_fc); + put_fs_context(root_fc); -struct dentry *nfs4_try_mount(int flags, const char *dev_name, - struct nfs_mount_info *mount_info, - struct nfs_subversion *nfs_mod) -{ - char *export_path; - struct vfsmount *root_mnt; - struct dentry *res; - struct nfs_parsed_mount_data *data = mount_info->parsed; + if (IS_ERR(root_mnt)) + return PTR_ERR(root_mnt); - dfprintk(MOUNT, "--> nfs4_try_mount()\n"); + ret = nfs_referral_loop_protect(); + if (ret) { + mntput(root_mnt); + return ret; + } - export_path = data->nfs_server.export_path; - data->nfs_server.export_path = "/"; - root_mnt = nfs_do_root_mount(&nfs4_remote_fs_type, flags, mount_info, - data->nfs_server.hostname); - data->nfs_server.export_path = export_path; + dentry = mount_subtree(root_mnt, export_path); + nfs_referral_loop_unprotect(); - res = nfs_follow_remote_path(root_mnt, export_path); + if (IS_ERR(dentry)) + return PTR_ERR(dentry); - dfprintk(MOUNT, "<-- nfs4_try_mount() = %d%s\n", - PTR_ERR_OR_ZERO(res), - IS_ERR(res) ? " [error]" : ""); - return res; + fc->root = dentry; + return 0; } -static struct dentry * -nfs4_remote_referral_mount(struct file_system_type *fs_type, int flags, - const char *dev_name, void *raw_data) +int nfs4_try_get_tree(struct fs_context *fc) { - struct nfs_mount_info mount_info = { - .fill_super = nfs_fill_super, - .set_security = nfs_clone_sb_security, - .cloned = raw_data, - }; - struct nfs_server *server; - struct dentry *mntroot = ERR_PTR(-ENOMEM); - - dprintk("--> nfs4_referral_get_sb()\n"); - - mount_info.mntfh = nfs_alloc_fhandle(); - if (mount_info.cloned == NULL || mount_info.mntfh == NULL) - goto out; + struct nfs_fs_context *ctx = nfs_fc2context(fc); + int err; - /* create a new volume representation */ - server = nfs4_create_referral_server(mount_info.cloned, mount_info.mntfh); - if (IS_ERR(server)) { - mntroot = ERR_CAST(server); - goto out; - } + dfprintk(MOUNT, "--> nfs4_try_get_tree()\n"); - mntroot = nfs_fs_mount_common(server, flags, dev_name, &mount_info, &nfs_v4); -out: - nfs_free_fhandle(mount_info.mntfh); - return mntroot; + /* We create a mount for the server's root, walk to the requested + * location and then create another mount for that. + */ + err= do_nfs4_mount(nfs4_create_server(fc), + fc, ctx->nfs_server.hostname, + ctx->nfs_server.export_path); + if (err) { + nfs_ferrorf(fc, MOUNT, "NFS4: Couldn't follow remote path"); + dfprintk(MOUNT, "<-- nfs4_try_get_tree() = %d [error]\n", err); + } else { + dfprintk(MOUNT, "<-- nfs4_try_get_tree() = 0\n"); + } + return err; } /* * Create an NFS4 server record on referral traversal */ -static struct dentry *nfs4_referral_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *raw_data) +int nfs4_get_referral_tree(struct fs_context *fc) { - struct nfs_clone_mount *data = raw_data; - char *export_path; - struct vfsmount *root_mnt; - struct dentry *res; + struct nfs_fs_context *ctx = nfs_fc2context(fc); + int err; dprintk("--> nfs4_referral_mount()\n"); - export_path = data->mnt_path; - data->mnt_path = "/"; - - root_mnt = nfs_do_root_mount(&nfs4_remote_referral_fs_type, - flags, data, data->hostname); - data->mnt_path = export_path; - - res = nfs_follow_remote_path(root_mnt, export_path); - dprintk("<-- nfs4_referral_mount() = %d%s\n", - PTR_ERR_OR_ZERO(res), - IS_ERR(res) ? " [error]" : ""); - return res; + /* create a new volume representation */ + err = do_nfs4_mount(nfs4_create_referral_server(fc), + fc, ctx->nfs_server.hostname, + ctx->nfs_server.export_path); + if (err) { + nfs_ferrorf(fc, MOUNT, "NFS4: Couldn't follow remote path"); + dfprintk(MOUNT, "<-- nfs4_get_referral_tree() = %d [error]\n", err); + } else { + dfprintk(MOUNT, "<-- nfs4_get_referral_tree() = 0\n"); + } + return err; } - static int __init init_nfs_v4(void) { int err; @@ -330,10 +272,19 @@ static int __init init_nfs_v4(void) if (err) goto out1; +#ifdef CONFIG_NFS_V4_2 + err = nfs4_xattr_cache_init(); + if (err) + goto out2; +#endif + err = nfs4_register_sysctl(); if (err) goto out2; +#ifdef CONFIG_NFS_V4_2 + nfs42_ssc_register_ops(); +#endif register_nfs_version(&nfs_v4); return 0; out2: @@ -350,11 +301,16 @@ static void __exit exit_nfs_v4(void) nfs4_pnfs_v3_ds_connect_unload(); unregister_nfs_version(&nfs_v4); +#ifdef CONFIG_NFS_V4_2 + nfs4_xattr_cache_exit(); + nfs42_ssc_unregister_ops(); +#endif nfs4_unregister_sysctl(); nfs_idmap_quit(); nfs_dns_resolver_destroy(); } +MODULE_DESCRIPTION("NFSv4 client support"); MODULE_LICENSE("GPL"); module_init(init_nfs_v4); diff --git a/fs/nfs/nfs4sysctl.c b/fs/nfs/nfs4sysctl.c index 8693d77c45ea..d1a92d8f8ba4 100644 --- a/fs/nfs/nfs4sysctl.c +++ b/fs/nfs/nfs4sysctl.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/nfs/nfs4sysctl.c * @@ -16,7 +17,7 @@ static const int nfs_set_port_min; static const int nfs_set_port_max = 65535; static struct ctl_table_header *nfs4_callback_sysctl_table; -static struct ctl_table nfs4_cb_sysctls[] = { +static const struct ctl_table nfs4_cb_sysctls[] = { { .procname = "nfs_callback_tcpport", .data = &nfs_callback_set_tcpport, @@ -31,32 +32,14 @@ static struct ctl_table nfs4_cb_sysctls[] = { .data = &nfs_idmap_cache_timeout, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec_jiffies, + .proc_handler = proc_dointvec, }, - { } -}; - -static struct ctl_table nfs4_cb_sysctl_dir[] = { - { - .procname = "nfs", - .mode = 0555, - .child = nfs4_cb_sysctls, - }, - { } -}; - -static struct ctl_table nfs4_cb_sysctl_root[] = { - { - .procname = "fs", - .mode = 0555, - .child = nfs4_cb_sysctl_dir, - }, - { } }; int nfs4_register_sysctl(void) { - nfs4_callback_sysctl_table = register_sysctl_table(nfs4_cb_sysctl_root); + nfs4_callback_sysctl_table = register_sysctl("fs/nfs", + nfs4_cb_sysctls); if (nfs4_callback_sysctl_table == NULL) return -ENOMEM; return 0; diff --git a/fs/nfs/nfs4trace.c b/fs/nfs/nfs4trace.c index 2850bce19244..987c92d6364b 100644 --- a/fs/nfs/nfs4trace.c +++ b/fs/nfs/nfs4trace.c @@ -1,6 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2013 Trond Myklebust <Trond.Myklebust@netapp.com> */ +#include <uapi/linux/pr.h> +#include <linux/blkdev.h> #include <linux/nfs_fs.h> #include "nfs4_fs.h" #include "internal.h" @@ -15,4 +18,25 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(nfs4_pnfs_read); EXPORT_TRACEPOINT_SYMBOL_GPL(nfs4_pnfs_write); EXPORT_TRACEPOINT_SYMBOL_GPL(nfs4_pnfs_commit_ds); + +EXPORT_TRACEPOINT_SYMBOL_GPL(pnfs_mds_fallback_pg_init_read); +EXPORT_TRACEPOINT_SYMBOL_GPL(pnfs_mds_fallback_pg_init_write); +EXPORT_TRACEPOINT_SYMBOL_GPL(pnfs_mds_fallback_pg_get_mirror_count); +EXPORT_TRACEPOINT_SYMBOL_GPL(pnfs_mds_fallback_read_done); +EXPORT_TRACEPOINT_SYMBOL_GPL(pnfs_mds_fallback_write_done); +EXPORT_TRACEPOINT_SYMBOL_GPL(pnfs_mds_fallback_read_pagelist); +EXPORT_TRACEPOINT_SYMBOL_GPL(pnfs_mds_fallback_write_pagelist); +EXPORT_TRACEPOINT_SYMBOL_GPL(pnfs_ds_connect); + +EXPORT_TRACEPOINT_SYMBOL_GPL(ff_layout_read_error); +EXPORT_TRACEPOINT_SYMBOL_GPL(ff_layout_write_error); +EXPORT_TRACEPOINT_SYMBOL_GPL(ff_layout_commit_error); + +EXPORT_TRACEPOINT_SYMBOL_GPL(bl_ext_tree_prepare_commit); +EXPORT_TRACEPOINT_SYMBOL_GPL(bl_pr_key_reg); +EXPORT_TRACEPOINT_SYMBOL_GPL(bl_pr_key_reg_err); +EXPORT_TRACEPOINT_SYMBOL_GPL(bl_pr_key_unreg); +EXPORT_TRACEPOINT_SYMBOL_GPL(bl_pr_key_unreg_err); + +EXPORT_TRACEPOINT_SYMBOL_GPL(fl_getdevinfo); #endif diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h index be1da19c65d6..9776d220cec3 100644 --- a/fs/nfs/nfs4trace.h +++ b/fs/nfs/nfs4trace.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (c) 2013 Trond Myklebust <Trond.Myklebust@netapp.com> */ @@ -8,171 +9,12 @@ #define _TRACE_NFS4_H #include <linux/tracepoint.h> +#include <trace/misc/sunrpc.h> -#define show_nfsv4_errors(error) \ - __print_symbolic(error, \ - { NFS4_OK, "OK" }, \ - /* Mapped by nfs4_stat_to_errno() */ \ - { -EPERM, "EPERM" }, \ - { -ENOENT, "ENOENT" }, \ - { -EIO, "EIO" }, \ - { -ENXIO, "ENXIO" }, \ - { -EACCES, "EACCES" }, \ - { -EEXIST, "EEXIST" }, \ - { -EXDEV, "EXDEV" }, \ - { -ENOTDIR, "ENOTDIR" }, \ - { -EISDIR, "EISDIR" }, \ - { -EFBIG, "EFBIG" }, \ - { -ENOSPC, "ENOSPC" }, \ - { -EROFS, "EROFS" }, \ - { -EMLINK, "EMLINK" }, \ - { -ENAMETOOLONG, "ENAMETOOLONG" }, \ - { -ENOTEMPTY, "ENOTEMPTY" }, \ - { -EDQUOT, "EDQUOT" }, \ - { -ESTALE, "ESTALE" }, \ - { -EBADHANDLE, "EBADHANDLE" }, \ - { -EBADCOOKIE, "EBADCOOKIE" }, \ - { -ENOTSUPP, "ENOTSUPP" }, \ - { -ETOOSMALL, "ETOOSMALL" }, \ - { -EREMOTEIO, "EREMOTEIO" }, \ - { -EBADTYPE, "EBADTYPE" }, \ - { -EAGAIN, "EAGAIN" }, \ - { -ELOOP, "ELOOP" }, \ - { -EOPNOTSUPP, "EOPNOTSUPP" }, \ - { -EDEADLK, "EDEADLK" }, \ - /* RPC errors */ \ - { -ENOMEM, "ENOMEM" }, \ - { -EKEYEXPIRED, "EKEYEXPIRED" }, \ - { -ETIMEDOUT, "ETIMEDOUT" }, \ - { -ERESTARTSYS, "ERESTARTSYS" }, \ - { -ECONNREFUSED, "ECONNREFUSED" }, \ - { -ECONNRESET, "ECONNRESET" }, \ - { -ENETUNREACH, "ENETUNREACH" }, \ - { -EHOSTUNREACH, "EHOSTUNREACH" }, \ - { -EHOSTDOWN, "EHOSTDOWN" }, \ - { -EPIPE, "EPIPE" }, \ - { -EPFNOSUPPORT, "EPFNOSUPPORT" }, \ - { -EPROTONOSUPPORT, "EPROTONOSUPPORT" }, \ - /* NFSv4 native errors */ \ - { -NFS4ERR_ACCESS, "ACCESS" }, \ - { -NFS4ERR_ATTRNOTSUPP, "ATTRNOTSUPP" }, \ - { -NFS4ERR_ADMIN_REVOKED, "ADMIN_REVOKED" }, \ - { -NFS4ERR_BACK_CHAN_BUSY, "BACK_CHAN_BUSY" }, \ - { -NFS4ERR_BADCHAR, "BADCHAR" }, \ - { -NFS4ERR_BADHANDLE, "BADHANDLE" }, \ - { -NFS4ERR_BADIOMODE, "BADIOMODE" }, \ - { -NFS4ERR_BADLAYOUT, "BADLAYOUT" }, \ - { -NFS4ERR_BADLABEL, "BADLABEL" }, \ - { -NFS4ERR_BADNAME, "BADNAME" }, \ - { -NFS4ERR_BADOWNER, "BADOWNER" }, \ - { -NFS4ERR_BADSESSION, "BADSESSION" }, \ - { -NFS4ERR_BADSLOT, "BADSLOT" }, \ - { -NFS4ERR_BADTYPE, "BADTYPE" }, \ - { -NFS4ERR_BADXDR, "BADXDR" }, \ - { -NFS4ERR_BAD_COOKIE, "BAD_COOKIE" }, \ - { -NFS4ERR_BAD_HIGH_SLOT, "BAD_HIGH_SLOT" }, \ - { -NFS4ERR_BAD_RANGE, "BAD_RANGE" }, \ - { -NFS4ERR_BAD_SEQID, "BAD_SEQID" }, \ - { -NFS4ERR_BAD_SESSION_DIGEST, "BAD_SESSION_DIGEST" }, \ - { -NFS4ERR_BAD_STATEID, "BAD_STATEID" }, \ - { -NFS4ERR_CB_PATH_DOWN, "CB_PATH_DOWN" }, \ - { -NFS4ERR_CLID_INUSE, "CLID_INUSE" }, \ - { -NFS4ERR_CLIENTID_BUSY, "CLIENTID_BUSY" }, \ - { -NFS4ERR_COMPLETE_ALREADY, "COMPLETE_ALREADY" }, \ - { -NFS4ERR_CONN_NOT_BOUND_TO_SESSION, \ - "CONN_NOT_BOUND_TO_SESSION" }, \ - { -NFS4ERR_DEADLOCK, "DEADLOCK" }, \ - { -NFS4ERR_DEADSESSION, "DEAD_SESSION" }, \ - { -NFS4ERR_DELAY, "DELAY" }, \ - { -NFS4ERR_DELEG_ALREADY_WANTED, \ - "DELEG_ALREADY_WANTED" }, \ - { -NFS4ERR_DELEG_REVOKED, "DELEG_REVOKED" }, \ - { -NFS4ERR_DENIED, "DENIED" }, \ - { -NFS4ERR_DIRDELEG_UNAVAIL, "DIRDELEG_UNAVAIL" }, \ - { -NFS4ERR_DQUOT, "DQUOT" }, \ - { -NFS4ERR_ENCR_ALG_UNSUPP, "ENCR_ALG_UNSUPP" }, \ - { -NFS4ERR_EXIST, "EXIST" }, \ - { -NFS4ERR_EXPIRED, "EXPIRED" }, \ - { -NFS4ERR_FBIG, "FBIG" }, \ - { -NFS4ERR_FHEXPIRED, "FHEXPIRED" }, \ - { -NFS4ERR_FILE_OPEN, "FILE_OPEN" }, \ - { -NFS4ERR_GRACE, "GRACE" }, \ - { -NFS4ERR_HASH_ALG_UNSUPP, "HASH_ALG_UNSUPP" }, \ - { -NFS4ERR_INVAL, "INVAL" }, \ - { -NFS4ERR_IO, "IO" }, \ - { -NFS4ERR_ISDIR, "ISDIR" }, \ - { -NFS4ERR_LAYOUTTRYLATER, "LAYOUTTRYLATER" }, \ - { -NFS4ERR_LAYOUTUNAVAILABLE, "LAYOUTUNAVAILABLE" }, \ - { -NFS4ERR_LEASE_MOVED, "LEASE_MOVED" }, \ - { -NFS4ERR_LOCKED, "LOCKED" }, \ - { -NFS4ERR_LOCKS_HELD, "LOCKS_HELD" }, \ - { -NFS4ERR_LOCK_RANGE, "LOCK_RANGE" }, \ - { -NFS4ERR_MINOR_VERS_MISMATCH, "MINOR_VERS_MISMATCH" }, \ - { -NFS4ERR_MLINK, "MLINK" }, \ - { -NFS4ERR_MOVED, "MOVED" }, \ - { -NFS4ERR_NAMETOOLONG, "NAMETOOLONG" }, \ - { -NFS4ERR_NOENT, "NOENT" }, \ - { -NFS4ERR_NOFILEHANDLE, "NOFILEHANDLE" }, \ - { -NFS4ERR_NOMATCHING_LAYOUT, "NOMATCHING_LAYOUT" }, \ - { -NFS4ERR_NOSPC, "NOSPC" }, \ - { -NFS4ERR_NOTDIR, "NOTDIR" }, \ - { -NFS4ERR_NOTEMPTY, "NOTEMPTY" }, \ - { -NFS4ERR_NOTSUPP, "NOTSUPP" }, \ - { -NFS4ERR_NOT_ONLY_OP, "NOT_ONLY_OP" }, \ - { -NFS4ERR_NOT_SAME, "NOT_SAME" }, \ - { -NFS4ERR_NO_GRACE, "NO_GRACE" }, \ - { -NFS4ERR_NXIO, "NXIO" }, \ - { -NFS4ERR_OLD_STATEID, "OLD_STATEID" }, \ - { -NFS4ERR_OPENMODE, "OPENMODE" }, \ - { -NFS4ERR_OP_ILLEGAL, "OP_ILLEGAL" }, \ - { -NFS4ERR_OP_NOT_IN_SESSION, "OP_NOT_IN_SESSION" }, \ - { -NFS4ERR_PERM, "PERM" }, \ - { -NFS4ERR_PNFS_IO_HOLE, "PNFS_IO_HOLE" }, \ - { -NFS4ERR_PNFS_NO_LAYOUT, "PNFS_NO_LAYOUT" }, \ - { -NFS4ERR_RECALLCONFLICT, "RECALLCONFLICT" }, \ - { -NFS4ERR_RECLAIM_BAD, "RECLAIM_BAD" }, \ - { -NFS4ERR_RECLAIM_CONFLICT, "RECLAIM_CONFLICT" }, \ - { -NFS4ERR_REJECT_DELEG, "REJECT_DELEG" }, \ - { -NFS4ERR_REP_TOO_BIG, "REP_TOO_BIG" }, \ - { -NFS4ERR_REP_TOO_BIG_TO_CACHE, \ - "REP_TOO_BIG_TO_CACHE" }, \ - { -NFS4ERR_REQ_TOO_BIG, "REQ_TOO_BIG" }, \ - { -NFS4ERR_RESOURCE, "RESOURCE" }, \ - { -NFS4ERR_RESTOREFH, "RESTOREFH" }, \ - { -NFS4ERR_RETRY_UNCACHED_REP, "RETRY_UNCACHED_REP" }, \ - { -NFS4ERR_RETURNCONFLICT, "RETURNCONFLICT" }, \ - { -NFS4ERR_ROFS, "ROFS" }, \ - { -NFS4ERR_SAME, "SAME" }, \ - { -NFS4ERR_SHARE_DENIED, "SHARE_DENIED" }, \ - { -NFS4ERR_SEQUENCE_POS, "SEQUENCE_POS" }, \ - { -NFS4ERR_SEQ_FALSE_RETRY, "SEQ_FALSE_RETRY" }, \ - { -NFS4ERR_SEQ_MISORDERED, "SEQ_MISORDERED" }, \ - { -NFS4ERR_SERVERFAULT, "SERVERFAULT" }, \ - { -NFS4ERR_STALE, "STALE" }, \ - { -NFS4ERR_STALE_CLIENTID, "STALE_CLIENTID" }, \ - { -NFS4ERR_STALE_STATEID, "STALE_STATEID" }, \ - { -NFS4ERR_SYMLINK, "SYMLINK" }, \ - { -NFS4ERR_TOOSMALL, "TOOSMALL" }, \ - { -NFS4ERR_TOO_MANY_OPS, "TOO_MANY_OPS" }, \ - { -NFS4ERR_UNKNOWN_LAYOUTTYPE, "UNKNOWN_LAYOUTTYPE" }, \ - { -NFS4ERR_UNSAFE_COMPOUND, "UNSAFE_COMPOUND" }, \ - { -NFS4ERR_WRONGSEC, "WRONGSEC" }, \ - { -NFS4ERR_WRONG_CRED, "WRONG_CRED" }, \ - { -NFS4ERR_WRONG_TYPE, "WRONG_TYPE" }, \ - { -NFS4ERR_XDEV, "XDEV" }) - -#define show_open_flags(flags) \ - __print_flags(flags, "|", \ - { O_CREAT, "O_CREAT" }, \ - { O_EXCL, "O_EXCL" }, \ - { O_TRUNC, "O_TRUNC" }, \ - { O_DIRECT, "O_DIRECT" }) +#include <trace/misc/fs.h> +#include <trace/misc/nfs.h> -#define show_fmode_flags(mode) \ - __print_flags(mode, "|", \ - { ((__force unsigned long)FMODE_READ), "READ" }, \ - { ((__force unsigned long)FMODE_WRITE), "WRITE" }, \ - { ((__force unsigned long)FMODE_EXEC), "EXEC" }) +#include "delegation.h" #define show_nfs_fattr_flags(valid) \ __print_flags((unsigned long)valid, "|", \ @@ -190,7 +32,8 @@ { NFS_ATTR_FATTR_CTIME, "CTIME" }, \ { NFS_ATTR_FATTR_CHANGE, "CHANGE" }, \ { NFS_ATTR_FATTR_OWNER_NAME, "OWNER_NAME" }, \ - { NFS_ATTR_FATTR_GROUP_NAME, "GROUP_NAME" }) + { NFS_ATTR_FATTR_GROUP_NAME, "GROUP_NAME" }, \ + { NFS_ATTR_FATTR_BTIME, "BTIME" }) DECLARE_EVENT_CLASS(nfs4_clientid_event, TP_PROTO( @@ -201,23 +44,19 @@ DECLARE_EVENT_CLASS(nfs4_clientid_event, TP_ARGS(clp, error), TP_STRUCT__entry( - __string(dstaddr, - rpc_peeraddr2str(clp->cl_rpcclient, - RPC_DISPLAY_ADDR)) - __field(int, error) + __string(dstaddr, clp->cl_hostname) + __field(unsigned long, error) ), TP_fast_assign( - __entry->error = error; - __assign_str(dstaddr, - rpc_peeraddr2str(clp->cl_rpcclient, - RPC_DISPLAY_ADDR)); + __entry->error = error < 0 ? -error : 0; + __assign_str(dstaddr); ), TP_printk( - "error=%d (%s) dstaddr=%s", - __entry->error, - show_nfsv4_errors(__entry->error), + "error=%ld (%s) dstaddr=%s", + -__entry->error, + show_nfs4_status(__entry->error), __get_str(dstaddr) ) ); @@ -241,28 +80,35 @@ DEFINE_NFS4_CLIENTID_EVENT(nfs4_bind_conn_to_session); DEFINE_NFS4_CLIENTID_EVENT(nfs4_sequence); DEFINE_NFS4_CLIENTID_EVENT(nfs4_reclaim_complete); -#define show_nfs4_sequence_status_flags(status) \ - __print_flags((unsigned long)status, "|", \ - { SEQ4_STATUS_CB_PATH_DOWN, "CB_PATH_DOWN" }, \ - { SEQ4_STATUS_CB_GSS_CONTEXTS_EXPIRING, \ - "CB_GSS_CONTEXTS_EXPIRING" }, \ - { SEQ4_STATUS_CB_GSS_CONTEXTS_EXPIRED, \ - "CB_GSS_CONTEXTS_EXPIRED" }, \ - { SEQ4_STATUS_EXPIRED_ALL_STATE_REVOKED, \ - "EXPIRED_ALL_STATE_REVOKED" }, \ - { SEQ4_STATUS_EXPIRED_SOME_STATE_REVOKED, \ - "EXPIRED_SOME_STATE_REVOKED" }, \ - { SEQ4_STATUS_ADMIN_STATE_REVOKED, \ - "ADMIN_STATE_REVOKED" }, \ - { SEQ4_STATUS_RECALLABLE_STATE_REVOKED, \ - "RECALLABLE_STATE_REVOKED" }, \ - { SEQ4_STATUS_LEASE_MOVED, "LEASE_MOVED" }, \ - { SEQ4_STATUS_RESTART_RECLAIM_NEEDED, \ - "RESTART_RECLAIM_NEEDED" }, \ - { SEQ4_STATUS_CB_PATH_DOWN_SESSION, \ - "CB_PATH_DOWN_SESSION" }, \ - { SEQ4_STATUS_BACKCHANNEL_FAULT, \ - "BACKCHANNEL_FAULT" }) +TRACE_EVENT(nfs4_trunked_exchange_id, + TP_PROTO( + const struct nfs_client *clp, + const char *addr, + int error + ), + + TP_ARGS(clp, addr, error), + + TP_STRUCT__entry( + __string(main_addr, clp->cl_hostname) + __string(trunk_addr, addr) + __field(unsigned long, error) + ), + + TP_fast_assign( + __entry->error = error < 0 ? -error : 0; + __assign_str(main_addr); + __assign_str(trunk_addr); + ), + + TP_printk( + "error=%ld (%s) main_addr=%s trunk_addr=%s", + -__entry->error, + show_nfs4_status(__entry->error), + __get_str(main_addr), + __get_str(trunk_addr) + ) +); TRACE_EVENT(nfs4_sequence_done, TP_PROTO( @@ -277,8 +123,8 @@ TRACE_EVENT(nfs4_sequence_done, __field(unsigned int, seq_nr) __field(unsigned int, highest_slotid) __field(unsigned int, target_highest_slotid) - __field(unsigned int, status_flags) - __field(int, error) + __field(unsigned long, status_flags) + __field(unsigned long, error) ), TP_fast_assign( @@ -290,21 +136,22 @@ TRACE_EVENT(nfs4_sequence_done, __entry->target_highest_slotid = res->sr_target_highest_slotid; __entry->status_flags = res->sr_status_flags; - __entry->error = res->sr_status; + __entry->error = res->sr_status < 0 ? + -res->sr_status : 0; ), TP_printk( - "error=%d (%s) session=0x%08x slot_nr=%u seq_nr=%u " + "error=%ld (%s) session=0x%08x slot_nr=%u seq_nr=%u " "highest_slotid=%u target_highest_slotid=%u " - "status_flags=%u (%s)", - __entry->error, - show_nfsv4_errors(__entry->error), + "status_flags=0x%lx (%s)", + -__entry->error, + show_nfs4_status(__entry->error), __entry->session, __entry->slot_nr, __entry->seq_nr, __entry->highest_slotid, __entry->target_highest_slotid, __entry->status_flags, - show_nfs4_sequence_status_flags(__entry->status_flags) + show_nfs4_seq4_status(__entry->status_flags) ) ); @@ -325,7 +172,7 @@ TRACE_EVENT(nfs4_cb_sequence, __field(unsigned int, seq_nr) __field(unsigned int, highest_slotid) __field(unsigned int, cachethis) - __field(int, error) + __field(unsigned long, error) ), TP_fast_assign( @@ -334,20 +181,127 @@ TRACE_EVENT(nfs4_cb_sequence, __entry->seq_nr = args->csa_sequenceid; __entry->highest_slotid = args->csa_highestslotid; __entry->cachethis = args->csa_cachethis; - __entry->error = -be32_to_cpu(status); + __entry->error = be32_to_cpu(status); ), TP_printk( - "error=%d (%s) session=0x%08x slot_nr=%u seq_nr=%u " + "error=%ld (%s) session=0x%08x slot_nr=%u seq_nr=%u " "highest_slotid=%u", - __entry->error, - show_nfsv4_errors(__entry->error), + -__entry->error, + show_nfs4_status(__entry->error), __entry->session, __entry->slot_nr, __entry->seq_nr, __entry->highest_slotid ) ); + +TRACE_EVENT(nfs4_cb_seqid_err, + TP_PROTO( + const struct cb_sequenceargs *args, + __be32 status + ), + TP_ARGS(args, status), + + TP_STRUCT__entry( + __field(unsigned int, session) + __field(unsigned int, slot_nr) + __field(unsigned int, seq_nr) + __field(unsigned int, highest_slotid) + __field(unsigned int, cachethis) + __field(unsigned long, error) + ), + + TP_fast_assign( + __entry->session = nfs_session_id_hash(&args->csa_sessionid); + __entry->slot_nr = args->csa_slotid; + __entry->seq_nr = args->csa_sequenceid; + __entry->highest_slotid = args->csa_highestslotid; + __entry->cachethis = args->csa_cachethis; + __entry->error = be32_to_cpu(status); + ), + + TP_printk( + "error=%ld (%s) session=0x%08x slot_nr=%u seq_nr=%u " + "highest_slotid=%u", + -__entry->error, + show_nfs4_status(__entry->error), + __entry->session, + __entry->slot_nr, + __entry->seq_nr, + __entry->highest_slotid + ) +); + +TRACE_EVENT(nfs4_cb_offload, + TP_PROTO( + const struct nfs_fh *cb_fh, + const nfs4_stateid *cb_stateid, + uint64_t cb_count, + int cb_error, + int cb_how_stable + ), + + TP_ARGS(cb_fh, cb_stateid, cb_count, cb_error, + cb_how_stable), + + TP_STRUCT__entry( + __field(unsigned long, error) + __field(u32, fhandle) + __field(loff_t, cb_count) + __field(int, cb_how) + __field(int, cb_stateid_seq) + __field(u32, cb_stateid_hash) + ), + + TP_fast_assign( + __entry->error = cb_error < 0 ? -cb_error : 0; + __entry->fhandle = nfs_fhandle_hash(cb_fh); + __entry->cb_stateid_seq = + be32_to_cpu(cb_stateid->seqid); + __entry->cb_stateid_hash = + nfs_stateid_hash(cb_stateid); + __entry->cb_count = cb_count; + __entry->cb_how = cb_how_stable; + ), + + TP_printk( + "error=%ld (%s) fhandle=0x%08x cb_stateid=%d:0x%08x " + "cb_count=%llu cb_how=%s", + -__entry->error, + show_nfs4_status(__entry->error), + __entry->fhandle, + __entry->cb_stateid_seq, __entry->cb_stateid_hash, + __entry->cb_count, + show_nfs_stable_how(__entry->cb_how) + ) +); + +TRACE_EVENT(pnfs_ds_connect, + TP_PROTO( + char *ds_remotestr, + int status + ), + + TP_ARGS(ds_remotestr, status), + + TP_STRUCT__entry( + __string(ds_ips, ds_remotestr) + __field(int, status) + ), + + TP_fast_assign( + __assign_str(ds_ips); + __entry->status = status; + ), + + TP_printk( + "ds_ips=%s, status=%d", + __get_str(ds_ips), + __entry->status + ) +); + #endif /* CONFIG_NFS_V4_1 */ TRACE_EVENT(nfs4_setup_sequence, @@ -382,6 +336,221 @@ TRACE_EVENT(nfs4_setup_sequence, ) ); +TRACE_DEFINE_ENUM(NFS4CLNT_MANAGER_RUNNING); +TRACE_DEFINE_ENUM(NFS4CLNT_CHECK_LEASE); +TRACE_DEFINE_ENUM(NFS4CLNT_LEASE_EXPIRED); +TRACE_DEFINE_ENUM(NFS4CLNT_RECLAIM_REBOOT); +TRACE_DEFINE_ENUM(NFS4CLNT_RECLAIM_NOGRACE); +TRACE_DEFINE_ENUM(NFS4CLNT_DELEGRETURN); +TRACE_DEFINE_ENUM(NFS4CLNT_SESSION_RESET); +TRACE_DEFINE_ENUM(NFS4CLNT_LEASE_CONFIRM); +TRACE_DEFINE_ENUM(NFS4CLNT_SERVER_SCOPE_MISMATCH); +TRACE_DEFINE_ENUM(NFS4CLNT_PURGE_STATE); +TRACE_DEFINE_ENUM(NFS4CLNT_BIND_CONN_TO_SESSION); +TRACE_DEFINE_ENUM(NFS4CLNT_MOVED); +TRACE_DEFINE_ENUM(NFS4CLNT_LEASE_MOVED); +TRACE_DEFINE_ENUM(NFS4CLNT_DELEGATION_EXPIRED); +TRACE_DEFINE_ENUM(NFS4CLNT_RUN_MANAGER); +TRACE_DEFINE_ENUM(NFS4CLNT_MANAGER_AVAILABLE); +TRACE_DEFINE_ENUM(NFS4CLNT_RECALL_RUNNING); +TRACE_DEFINE_ENUM(NFS4CLNT_RECALL_ANY_LAYOUT_READ); +TRACE_DEFINE_ENUM(NFS4CLNT_RECALL_ANY_LAYOUT_RW); +TRACE_DEFINE_ENUM(NFS4CLNT_DELEGRETURN_DELAYED); + +#define show_nfs4_clp_state(state) \ + __print_flags(state, "|", \ + { BIT(NFS4CLNT_MANAGER_RUNNING), "MANAGER_RUNNING" }, \ + { BIT(NFS4CLNT_CHECK_LEASE), "CHECK_LEASE" }, \ + { BIT(NFS4CLNT_LEASE_EXPIRED), "LEASE_EXPIRED" }, \ + { BIT(NFS4CLNT_RECLAIM_REBOOT), "RECLAIM_REBOOT" }, \ + { BIT(NFS4CLNT_RECLAIM_NOGRACE), "RECLAIM_NOGRACE" }, \ + { BIT(NFS4CLNT_DELEGRETURN), "DELEGRETURN" }, \ + { BIT(NFS4CLNT_SESSION_RESET), "SESSION_RESET" }, \ + { BIT(NFS4CLNT_LEASE_CONFIRM), "LEASE_CONFIRM" }, \ + { BIT(NFS4CLNT_SERVER_SCOPE_MISMATCH), "SERVER_SCOPE_MISMATCH" }, \ + { BIT(NFS4CLNT_PURGE_STATE), "PURGE_STATE" }, \ + { BIT(NFS4CLNT_BIND_CONN_TO_SESSION), "BIND_CONN_TO_SESSION" }, \ + { BIT(NFS4CLNT_MOVED), "MOVED" }, \ + { BIT(NFS4CLNT_LEASE_MOVED), "LEASE_MOVED" }, \ + { BIT(NFS4CLNT_DELEGATION_EXPIRED), "DELEGATION_EXPIRED" }, \ + { BIT(NFS4CLNT_RUN_MANAGER), "RUN_MANAGER" }, \ + { BIT(NFS4CLNT_MANAGER_AVAILABLE), "MANAGER_AVAILABLE" }, \ + { BIT(NFS4CLNT_RECALL_RUNNING), "RECALL_RUNNING" }, \ + { BIT(NFS4CLNT_RECALL_ANY_LAYOUT_READ), "RECALL_ANY_LAYOUT_READ" }, \ + { BIT(NFS4CLNT_RECALL_ANY_LAYOUT_RW), "RECALL_ANY_LAYOUT_RW" }, \ + { BIT(NFS4CLNT_DELEGRETURN_DELAYED), "DELERETURN_DELAYED" }) + +TRACE_EVENT(nfs4_state_mgr, + TP_PROTO( + const struct nfs_client *clp + ), + + TP_ARGS(clp), + + TP_STRUCT__entry( + __field(unsigned long, state) + __string(hostname, clp->cl_hostname) + ), + + TP_fast_assign( + __entry->state = clp->cl_state; + __assign_str(hostname); + ), + + TP_printk( + "hostname=%s clp state=%s", __get_str(hostname), + show_nfs4_clp_state(__entry->state) + ) +) + +TRACE_EVENT(nfs4_state_mgr_failed, + TP_PROTO( + const struct nfs_client *clp, + const char *section, + int status + ), + + TP_ARGS(clp, section, status), + + TP_STRUCT__entry( + __field(unsigned long, error) + __field(unsigned long, state) + __string(hostname, clp->cl_hostname) + __string(section, section) + ), + + TP_fast_assign( + __entry->error = status < 0 ? -status : 0; + __entry->state = clp->cl_state; + __assign_str(hostname); + __assign_str(section); + ), + + TP_printk( + "hostname=%s clp state=%s error=%ld (%s) section=%s", + __get_str(hostname), + show_nfs4_clp_state(__entry->state), -__entry->error, + show_nfs4_status(__entry->error), __get_str(section) + + ) +) + +TRACE_EVENT(nfs4_xdr_bad_operation, + TP_PROTO( + const struct xdr_stream *xdr, + u32 op, + u32 expected + ), + + TP_ARGS(xdr, op, expected), + + TP_STRUCT__entry( + __field(unsigned int, task_id) + __field(unsigned int, client_id) + __field(u32, xid) + __field(u32, op) + __field(u32, expected) + ), + + TP_fast_assign( + const struct rpc_rqst *rqstp = xdr->rqst; + const struct rpc_task *task = rqstp->rq_task; + + __entry->task_id = task->tk_pid; + __entry->client_id = task->tk_client->cl_clid; + __entry->xid = be32_to_cpu(rqstp->rq_xid); + __entry->op = op; + __entry->expected = expected; + ), + + TP_printk(SUNRPC_TRACE_TASK_SPECIFIER + " xid=0x%08x operation=%u, expected=%u", + __entry->task_id, __entry->client_id, __entry->xid, + __entry->op, __entry->expected + ) +); + +DECLARE_EVENT_CLASS(nfs4_xdr_event, + TP_PROTO( + const struct xdr_stream *xdr, + u32 op, + u32 error + ), + + TP_ARGS(xdr, op, error), + + TP_STRUCT__entry( + __field(unsigned int, task_id) + __field(unsigned int, client_id) + __field(u32, xid) + __field(u32, op) + __field(unsigned long, error) + ), + + TP_fast_assign( + const struct rpc_rqst *rqstp = xdr->rqst; + const struct rpc_task *task = rqstp->rq_task; + + __entry->task_id = task->tk_pid; + __entry->client_id = task->tk_client->cl_clid; + __entry->xid = be32_to_cpu(rqstp->rq_xid); + __entry->op = op; + __entry->error = error; + ), + + TP_printk(SUNRPC_TRACE_TASK_SPECIFIER + " xid=0x%08x error=%ld (%s) operation=%u", + __entry->task_id, __entry->client_id, __entry->xid, + -__entry->error, show_nfs4_status(__entry->error), + __entry->op + ) +); +#define DEFINE_NFS4_XDR_EVENT(name) \ + DEFINE_EVENT(nfs4_xdr_event, name, \ + TP_PROTO( \ + const struct xdr_stream *xdr, \ + u32 op, \ + u32 error \ + ), \ + TP_ARGS(xdr, op, error)) +DEFINE_NFS4_XDR_EVENT(nfs4_xdr_status); +DEFINE_NFS4_XDR_EVENT(nfs4_xdr_bad_filehandle); + +DECLARE_EVENT_CLASS(nfs4_cb_error_class, + TP_PROTO( + __be32 xid, + u32 cb_ident + ), + + TP_ARGS(xid, cb_ident), + + TP_STRUCT__entry( + __field(u32, xid) + __field(u32, cbident) + ), + + TP_fast_assign( + __entry->xid = be32_to_cpu(xid); + __entry->cbident = cb_ident; + ), + + TP_printk( + "xid=0x%08x cb_ident=0x%08x", + __entry->xid, __entry->cbident + ) +); + +#define DEFINE_CB_ERROR_EVENT(name) \ + DEFINE_EVENT(nfs4_cb_error_class, nfs_cb_##name, \ + TP_PROTO( \ + __be32 xid, \ + u32 cb_ident \ + ), \ + TP_ARGS(xid, cb_ident)) + +DEFINE_CB_ERROR_EVENT(no_clp); +DEFINE_CB_ERROR_EVENT(badprinc); + DECLARE_EVENT_CLASS(nfs4_open_event, TP_PROTO( const struct nfs_open_context *ctx, @@ -392,9 +561,9 @@ DECLARE_EVENT_CLASS(nfs4_open_event, TP_ARGS(ctx, flags, error), TP_STRUCT__entry( - __field(int, error) - __field(unsigned int, flags) - __field(unsigned int, fmode) + __field(unsigned long, error) + __field(unsigned long, flags) + __field(unsigned long, fmode) __field(dev_t, dev) __field(u32, fhandle) __field(u64, fileid) @@ -410,9 +579,9 @@ DECLARE_EVENT_CLASS(nfs4_open_event, const struct nfs4_state *state = ctx->state; const struct inode *inode = NULL; - __entry->error = error; + __entry->error = -error; __entry->flags = flags; - __entry->fmode = (__force unsigned int)ctx->mode; + __entry->fmode = (__force unsigned long)ctx->mode; __entry->dev = ctx->dentry->d_sb->s_dev; if (!IS_ERR_OR_NULL(state)) { inode = state->inode; @@ -438,19 +607,19 @@ DECLARE_EVENT_CLASS(nfs4_open_event, __entry->fhandle = 0; } __entry->dir = NFS_FILEID(d_inode(ctx->dentry->d_parent)); - __assign_str(name, ctx->dentry->d_name.name); + __assign_str(name); ), TP_printk( - "error=%d (%s) flags=%d (%s) fmode=%s " + "error=%ld (%s) flags=%lu (%s) fmode=%s " "fileid=%02x:%02x:%llu fhandle=0x%08x " "name=%02x:%02x:%llu/%s stateid=%d:0x%08x " "openstateid=%d:0x%08x", - __entry->error, - show_nfsv4_errors(__entry->error), + -__entry->error, + show_nfs4_status(__entry->error), __entry->flags, - show_open_flags(__entry->flags), - show_fmode_flags(__entry->fmode), + show_fs_fcntl_open_flags(__entry->flags), + show_fs_fmode_flags(__entry->fmode), MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->fileid, __entry->fhandle, @@ -504,7 +673,7 @@ TRACE_EVENT(nfs4_cached_open, TP_printk( "fmode=%s fileid=%02x:%02x:%llu " "fhandle=0x%08x stateid=%d:0x%08x", - __entry->fmode ? show_fmode_flags(__entry->fmode) : + __entry->fmode ? show_fs_fmode_flags(__entry->fmode) : "closed", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->fileid, @@ -528,7 +697,7 @@ TRACE_EVENT(nfs4_close, __field(u32, fhandle) __field(u64, fileid) __field(unsigned int, fmode) - __field(int, error) + __field(unsigned long, error) __field(int, stateid_seq) __field(u32, stateid_hash) ), @@ -540,7 +709,7 @@ TRACE_EVENT(nfs4_close, __entry->fileid = NFS_FILEID(inode); __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode)); __entry->fmode = (__force unsigned int)state->state; - __entry->error = error; + __entry->error = error < 0 ? -error : 0; __entry->stateid_seq = be32_to_cpu(args->stateid.seqid); __entry->stateid_hash = @@ -548,11 +717,11 @@ TRACE_EVENT(nfs4_close, ), TP_printk( - "error=%d (%s) fmode=%s fileid=%02x:%02x:%llu " + "error=%ld (%s) fmode=%s fileid=%02x:%02x:%llu " "fhandle=0x%08x openstateid=%d:0x%08x", - __entry->error, - show_nfsv4_errors(__entry->error), - __entry->fmode ? show_fmode_flags(__entry->fmode) : + -__entry->error, + show_nfs4_status(__entry->error), + __entry->fmode ? show_fs_fmode_flags(__entry->fmode) : "closed", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->fileid, @@ -561,17 +730,6 @@ TRACE_EVENT(nfs4_close, ) ); -#define show_lock_cmd(type) \ - __print_symbolic((int)type, \ - { F_GETLK, "GETLK" }, \ - { F_SETLK, "SETLK" }, \ - { F_SETLKW, "SETLKW" }) -#define show_lock_type(type) \ - __print_symbolic((int)type, \ - { F_RDLCK, "RDLCK" }, \ - { F_WRLCK, "WRLCK" }, \ - { F_UNLCK, "UNLCK" }) - DECLARE_EVENT_CLASS(nfs4_lock_event, TP_PROTO( const struct file_lock *request, @@ -583,9 +741,9 @@ DECLARE_EVENT_CLASS(nfs4_lock_event, TP_ARGS(request, state, cmd, error), TP_STRUCT__entry( - __field(int, error) - __field(int, cmd) - __field(char, type) + __field(unsigned long, error) + __field(unsigned long, cmd) + __field(unsigned long, type) __field(loff_t, start) __field(loff_t, end) __field(dev_t, dev) @@ -598,9 +756,9 @@ DECLARE_EVENT_CLASS(nfs4_lock_event, TP_fast_assign( const struct inode *inode = state->inode; - __entry->error = error; + __entry->error = error < 0 ? -error : 0; __entry->cmd = cmd; - __entry->type = request->fl_type; + __entry->type = request->c.flc_type; __entry->start = request->fl_start; __entry->end = request->fl_end; __entry->dev = inode->i_sb->s_dev; @@ -613,13 +771,13 @@ DECLARE_EVENT_CLASS(nfs4_lock_event, ), TP_printk( - "error=%d (%s) cmd=%s:%s range=%lld:%lld " + "error=%ld (%s) cmd=%s:%s range=%lld:%lld " "fileid=%02x:%02x:%llu fhandle=0x%08x " "stateid=%d:0x%08x", - __entry->error, - show_nfsv4_errors(__entry->error), - show_lock_cmd(__entry->cmd), - show_lock_type(__entry->type), + -__entry->error, + show_nfs4_status(__entry->error), + show_fs_fcntl_cmd(__entry->cmd), + show_fs_fcntl_lock_type(__entry->type), (long long)__entry->start, (long long)__entry->end, MAJOR(__entry->dev), MINOR(__entry->dev), @@ -653,9 +811,9 @@ TRACE_EVENT(nfs4_set_lock, TP_ARGS(request, state, lockstateid, cmd, error), TP_STRUCT__entry( - __field(int, error) - __field(int, cmd) - __field(char, type) + __field(unsigned long, error) + __field(unsigned long, cmd) + __field(unsigned long, type) __field(loff_t, start) __field(loff_t, end) __field(dev_t, dev) @@ -670,9 +828,9 @@ TRACE_EVENT(nfs4_set_lock, TP_fast_assign( const struct inode *inode = state->inode; - __entry->error = error; + __entry->error = error < 0 ? -error : 0; __entry->cmd = cmd; - __entry->type = request->fl_type; + __entry->type = request->c.flc_type; __entry->start = request->fl_start; __entry->end = request->fl_end; __entry->dev = inode->i_sb->s_dev; @@ -689,13 +847,13 @@ TRACE_EVENT(nfs4_set_lock, ), TP_printk( - "error=%d (%s) cmd=%s:%s range=%lld:%lld " + "error=%ld (%s) cmd=%s:%s range=%lld:%lld " "fileid=%02x:%02x:%llu fhandle=0x%08x " "stateid=%d:0x%08x lockstateid=%d:0x%08x", - __entry->error, - show_nfsv4_errors(__entry->error), - show_lock_cmd(__entry->cmd), - show_lock_type(__entry->type), + -__entry->error, + show_nfs4_status(__entry->error), + show_fs_fcntl_cmd(__entry->cmd), + show_fs_fcntl_lock_type(__entry->type), (long long)__entry->start, (long long)__entry->end, MAJOR(__entry->dev), MINOR(__entry->dev), @@ -706,6 +864,88 @@ TRACE_EVENT(nfs4_set_lock, ) ); +TRACE_DEFINE_ENUM(LK_STATE_IN_USE); +TRACE_DEFINE_ENUM(NFS_DELEGATED_STATE); +TRACE_DEFINE_ENUM(NFS_OPEN_STATE); +TRACE_DEFINE_ENUM(NFS_O_RDONLY_STATE); +TRACE_DEFINE_ENUM(NFS_O_WRONLY_STATE); +TRACE_DEFINE_ENUM(NFS_O_RDWR_STATE); +TRACE_DEFINE_ENUM(NFS_STATE_RECLAIM_REBOOT); +TRACE_DEFINE_ENUM(NFS_STATE_RECLAIM_NOGRACE); +TRACE_DEFINE_ENUM(NFS_STATE_POSIX_LOCKS); +TRACE_DEFINE_ENUM(NFS_STATE_RECOVERY_FAILED); +TRACE_DEFINE_ENUM(NFS_STATE_MAY_NOTIFY_LOCK); +TRACE_DEFINE_ENUM(NFS_STATE_CHANGE_WAIT); +TRACE_DEFINE_ENUM(NFS_CLNT_DST_SSC_COPY_STATE); +TRACE_DEFINE_ENUM(NFS_CLNT_SRC_SSC_COPY_STATE); +TRACE_DEFINE_ENUM(NFS_SRV_SSC_COPY_STATE); + +#define show_nfs4_state_flags(flags) \ + __print_flags(flags, "|", \ + { LK_STATE_IN_USE, "IN_USE" }, \ + { NFS_DELEGATED_STATE, "DELEGATED" }, \ + { NFS_OPEN_STATE, "OPEN" }, \ + { NFS_O_RDONLY_STATE, "O_RDONLY" }, \ + { NFS_O_WRONLY_STATE, "O_WRONLY" }, \ + { NFS_O_RDWR_STATE, "O_RDWR" }, \ + { NFS_STATE_RECLAIM_REBOOT, "RECLAIM_REBOOT" }, \ + { NFS_STATE_RECLAIM_NOGRACE, "RECLAIM_NOGRACE" }, \ + { NFS_STATE_POSIX_LOCKS, "POSIX_LOCKS" }, \ + { NFS_STATE_RECOVERY_FAILED, "RECOVERY_FAILED" }, \ + { NFS_STATE_MAY_NOTIFY_LOCK, "MAY_NOTIFY_LOCK" }, \ + { NFS_STATE_CHANGE_WAIT, "CHANGE_WAIT" }, \ + { NFS_CLNT_DST_SSC_COPY_STATE, "CLNT_DST_SSC_COPY" }, \ + { NFS_CLNT_SRC_SSC_COPY_STATE, "CLNT_SRC_SSC_COPY" }, \ + { NFS_SRV_SSC_COPY_STATE, "SRV_SSC_COPY" }) + +#define show_nfs4_lock_flags(flags) \ + __print_flags(flags, "|", \ + { BIT(NFS_LOCK_INITIALIZED), "INITIALIZED" }, \ + { BIT(NFS_LOCK_LOST), "LOST" }) + +TRACE_EVENT(nfs4_state_lock_reclaim, + TP_PROTO( + const struct nfs4_state *state, + const struct nfs4_lock_state *lock + ), + + TP_ARGS(state, lock), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(u32, fhandle) + __field(u64, fileid) + __field(unsigned long, state_flags) + __field(unsigned long, lock_flags) + __field(int, stateid_seq) + __field(u32, stateid_hash) + ), + + TP_fast_assign( + const struct inode *inode = state->inode; + + __entry->dev = inode->i_sb->s_dev; + __entry->fileid = NFS_FILEID(inode); + __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode)); + __entry->state_flags = state->flags; + __entry->lock_flags = lock->ls_flags; + __entry->stateid_seq = + be32_to_cpu(state->stateid.seqid); + __entry->stateid_hash = + nfs_stateid_hash(&state->stateid); + ), + + TP_printk( + "fileid=%02x:%02x:%llu fhandle=0x%08x " + "stateid=%d:0x%08x state_flags=%s lock_flags=%s", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->fileid, __entry->fhandle, + __entry->stateid_seq, __entry->stateid_hash, + show_nfs4_state_flags(__entry->state_flags), + show_nfs4_lock_flags(__entry->lock_flags) + ) +) + DECLARE_EVENT_CLASS(nfs4_set_delegation_event, TP_PROTO( const struct inode *inode, @@ -730,7 +970,7 @@ DECLARE_EVENT_CLASS(nfs4_set_delegation_event, TP_printk( "fmode=%s fileid=%02x:%02x:%llu fhandle=0x%08x", - show_fmode_flags(__entry->fmode), + show_fs_fmode_flags(__entry->fmode), MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->fileid, __entry->fhandle @@ -745,6 +985,52 @@ DECLARE_EVENT_CLASS(nfs4_set_delegation_event, TP_ARGS(inode, fmode)) DEFINE_NFS4_SET_DELEGATION_EVENT(nfs4_set_delegation); DEFINE_NFS4_SET_DELEGATION_EVENT(nfs4_reclaim_delegation); +DEFINE_NFS4_SET_DELEGATION_EVENT(nfs4_detach_delegation); + +#define show_delegation_flags(flags) \ + __print_flags(flags, "|", \ + { BIT(NFS_DELEGATION_NEED_RECLAIM), "NEED_RECLAIM" }, \ + { BIT(NFS_DELEGATION_RETURN), "RETURN" }, \ + { BIT(NFS_DELEGATION_RETURN_IF_CLOSED), "RETURN_IF_CLOSED" }, \ + { BIT(NFS_DELEGATION_REFERENCED), "REFERENCED" }, \ + { BIT(NFS_DELEGATION_RETURNING), "RETURNING" }, \ + { BIT(NFS_DELEGATION_REVOKED), "REVOKED" }, \ + { BIT(NFS_DELEGATION_TEST_EXPIRED), "TEST_EXPIRED" }, \ + { BIT(NFS_DELEGATION_INODE_FREEING), "INODE_FREEING" }, \ + { BIT(NFS_DELEGATION_RETURN_DELAYED), "RETURN_DELAYED" }) + +DECLARE_EVENT_CLASS(nfs4_delegation_event, + TP_PROTO( + const struct nfs_delegation *delegation + ), + + TP_ARGS(delegation), + + TP_STRUCT__entry( + __field(u32, fhandle) + __field(unsigned int, fmode) + __field(unsigned long, flags) + ), + + TP_fast_assign( + __entry->fhandle = nfs_fhandle_hash(NFS_FH(delegation->inode)); + __entry->fmode = delegation->type; + __entry->flags = delegation->flags; + ), + + TP_printk( + "fhandle=0x%08x fmode=%s flags=%s", + __entry->fhandle, show_fs_fmode_flags(__entry->fmode), + show_delegation_flags(__entry->flags) + ) +); +#define DEFINE_NFS4_DELEGATION_EVENT(name) \ + DEFINE_EVENT(nfs4_delegation_event, name, \ + TP_PROTO( \ + const struct nfs_delegation *delegation \ + ), \ + TP_ARGS(delegation)) +DEFINE_NFS4_DELEGATION_EVENT(nfs_delegation_need_return); TRACE_EVENT(nfs4_delegreturn_exit, TP_PROTO( @@ -758,7 +1044,7 @@ TRACE_EVENT(nfs4_delegreturn_exit, TP_STRUCT__entry( __field(dev_t, dev) __field(u32, fhandle) - __field(int, error) + __field(unsigned long, error) __field(int, stateid_seq) __field(u32, stateid_hash) ), @@ -766,7 +1052,7 @@ TRACE_EVENT(nfs4_delegreturn_exit, TP_fast_assign( __entry->dev = res->server->s_dev; __entry->fhandle = nfs_fhandle_hash(args->fhandle); - __entry->error = error; + __entry->error = error < 0 ? -error : 0; __entry->stateid_seq = be32_to_cpu(args->stateid->seqid); __entry->stateid_hash = @@ -774,10 +1060,10 @@ TRACE_EVENT(nfs4_delegreturn_exit, ), TP_printk( - "error=%d (%s) dev=%02x:%02x fhandle=0x%08x " + "error=%ld (%s) dev=%02x:%02x fhandle=0x%08x " "stateid=%d:0x%08x", - __entry->error, - show_nfsv4_errors(__entry->error), + -__entry->error, + show_nfs4_status(__entry->error), MAJOR(__entry->dev), MINOR(__entry->dev), __entry->fhandle, __entry->stateid_seq, __entry->stateid_hash @@ -795,7 +1081,7 @@ DECLARE_EVENT_CLASS(nfs4_test_stateid_event, TP_ARGS(state, lsp, error), TP_STRUCT__entry( - __field(int, error) + __field(unsigned long, error) __field(dev_t, dev) __field(u32, fhandle) __field(u64, fileid) @@ -806,7 +1092,7 @@ DECLARE_EVENT_CLASS(nfs4_test_stateid_event, TP_fast_assign( const struct inode *inode = state->inode; - __entry->error = error; + __entry->error = error < 0 ? -error : 0; __entry->dev = inode->i_sb->s_dev; __entry->fileid = NFS_FILEID(inode); __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode)); @@ -817,10 +1103,10 @@ DECLARE_EVENT_CLASS(nfs4_test_stateid_event, ), TP_printk( - "error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x " + "error=%ld (%s) fileid=%02x:%02x:%llu fhandle=0x%08x " "stateid=%d:0x%08x", - __entry->error, - show_nfsv4_errors(__entry->error), + -__entry->error, + show_nfs4_status(__entry->error), MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->fileid, __entry->fhandle, @@ -852,7 +1138,7 @@ DECLARE_EVENT_CLASS(nfs4_lookup_event, TP_STRUCT__entry( __field(dev_t, dev) - __field(int, error) + __field(unsigned long, error) __field(u64, dir) __string(name, name->name) ), @@ -860,14 +1146,14 @@ DECLARE_EVENT_CLASS(nfs4_lookup_event, TP_fast_assign( __entry->dev = dir->i_sb->s_dev; __entry->dir = NFS_FILEID(dir); - __entry->error = error; - __assign_str(name, name->name); + __entry->error = -error; + __assign_str(name); ), TP_printk( - "error=%d (%s) name=%02x:%02x:%llu/%s", - __entry->error, - show_nfsv4_errors(__entry->error), + "error=%ld (%s) name=%02x:%02x:%llu/%s", + -__entry->error, + show_nfs4_status(__entry->error), MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->dir, __get_str(name) @@ -902,19 +1188,19 @@ TRACE_EVENT(nfs4_lookupp, TP_STRUCT__entry( __field(dev_t, dev) __field(u64, ino) - __field(int, error) + __field(unsigned long, error) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = NFS_FILEID(inode); - __entry->error = error; + __entry->error = error < 0 ? -error : 0; ), TP_printk( - "error=%d (%s) inode=%02x:%02x:%llu", - __entry->error, - show_nfsv4_errors(__entry->error), + "error=%ld (%s) inode=%02x:%02x:%llu", + -__entry->error, + show_nfs4_status(__entry->error), MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->ino ) @@ -933,7 +1219,7 @@ TRACE_EVENT(nfs4_rename, TP_STRUCT__entry( __field(dev_t, dev) - __field(int, error) + __field(unsigned long, error) __field(u64, olddir) __string(oldname, oldname->name) __field(u64, newdir) @@ -944,16 +1230,16 @@ TRACE_EVENT(nfs4_rename, __entry->dev = olddir->i_sb->s_dev; __entry->olddir = NFS_FILEID(olddir); __entry->newdir = NFS_FILEID(newdir); - __entry->error = error; - __assign_str(oldname, oldname->name); - __assign_str(newname, newname->name); + __entry->error = error < 0 ? -error : 0; + __assign_str(oldname); + __assign_str(newname); ), TP_printk( - "error=%d (%s) oldname=%02x:%02x:%llu/%s " + "error=%ld (%s) oldname=%02x:%02x:%llu/%s " "newname=%02x:%02x:%llu/%s", - __entry->error, - show_nfsv4_errors(__entry->error), + -__entry->error, + show_nfs4_status(__entry->error), MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->olddir, __get_str(oldname), @@ -975,20 +1261,20 @@ DECLARE_EVENT_CLASS(nfs4_inode_event, __field(dev_t, dev) __field(u32, fhandle) __field(u64, fileid) - __field(int, error) + __field(unsigned long, error) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->fileid = NFS_FILEID(inode); __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode)); - __entry->error = error; + __entry->error = error < 0 ? -error : 0; ), TP_printk( - "error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x", - __entry->error, - show_nfsv4_errors(__entry->error), + "error=%ld (%s) fileid=%02x:%02x:%llu fhandle=0x%08x", + -__entry->error, + show_nfs4_status(__entry->error), MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->fileid, __entry->fhandle @@ -1026,7 +1312,7 @@ DECLARE_EVENT_CLASS(nfs4_inode_stateid_event, __field(dev_t, dev) __field(u32, fhandle) __field(u64, fileid) - __field(int, error) + __field(unsigned long, error) __field(int, stateid_seq) __field(u32, stateid_hash) ), @@ -1035,7 +1321,7 @@ DECLARE_EVENT_CLASS(nfs4_inode_stateid_event, __entry->dev = inode->i_sb->s_dev; __entry->fileid = NFS_FILEID(inode); __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode)); - __entry->error = error; + __entry->error = error < 0 ? -error : 0; __entry->stateid_seq = be32_to_cpu(stateid->seqid); __entry->stateid_hash = @@ -1043,10 +1329,10 @@ DECLARE_EVENT_CLASS(nfs4_inode_stateid_event, ), TP_printk( - "error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x " + "error=%ld (%s) fileid=%02x:%02x:%llu fhandle=0x%08x " "stateid=%d:0x%08x", - __entry->error, - show_nfsv4_errors(__entry->error), + -__entry->error, + show_nfs4_status(__entry->error), MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->fileid, __entry->fhandle, @@ -1065,6 +1351,9 @@ DECLARE_EVENT_CLASS(nfs4_inode_stateid_event, DEFINE_NFS4_INODE_STATEID_EVENT(nfs4_setattr); DEFINE_NFS4_INODE_STATEID_EVENT(nfs4_delegreturn); +DEFINE_NFS4_INODE_STATEID_EVENT(nfs4_open_stateid_update); +DEFINE_NFS4_INODE_STATEID_EVENT(nfs4_open_stateid_update_wait); +DEFINE_NFS4_INODE_STATEID_EVENT(nfs4_close_stateid_update_wait); DECLARE_EVENT_CLASS(nfs4_getattr_event, TP_PROTO( @@ -1081,7 +1370,7 @@ DECLARE_EVENT_CLASS(nfs4_getattr_event, __field(u32, fhandle) __field(u64, fileid) __field(unsigned int, valid) - __field(int, error) + __field(unsigned long, error) ), TP_fast_assign( @@ -1089,14 +1378,14 @@ DECLARE_EVENT_CLASS(nfs4_getattr_event, __entry->valid = fattr->valid; __entry->fhandle = nfs_fhandle_hash(fhandle); __entry->fileid = (fattr->valid & NFS_ATTR_FATTR_FILEID) ? fattr->fileid : 0; - __entry->error = error; + __entry->error = error < 0 ? -error : 0; ), TP_printk( - "error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x " + "error=%ld (%s) fileid=%02x:%02x:%llu fhandle=0x%08x " "valid=%s", - __entry->error, - show_nfsv4_errors(__entry->error), + -__entry->error, + show_nfs4_status(__entry->error), MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->fileid, __entry->fhandle, @@ -1128,35 +1417,31 @@ DECLARE_EVENT_CLASS(nfs4_inode_callback_event, TP_ARGS(clp, fhandle, inode, error), TP_STRUCT__entry( - __field(int, error) + __field(unsigned long, error) __field(dev_t, dev) __field(u32, fhandle) __field(u64, fileid) - __string(dstaddr, clp ? - rpc_peeraddr2str(clp->cl_rpcclient, - RPC_DISPLAY_ADDR) : "unknown") + __string(dstaddr, clp ? clp->cl_hostname : "unknown") ), TP_fast_assign( - __entry->error = error; + __entry->error = error < 0 ? -error : 0; __entry->fhandle = nfs_fhandle_hash(fhandle); - if (inode != NULL) { + if (!IS_ERR_OR_NULL(inode)) { __entry->fileid = NFS_FILEID(inode); __entry->dev = inode->i_sb->s_dev; } else { __entry->fileid = 0; __entry->dev = 0; } - __assign_str(dstaddr, clp ? - rpc_peeraddr2str(clp->cl_rpcclient, - RPC_DISPLAY_ADDR) : "unknown") + __assign_str(dstaddr); ), TP_printk( - "error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x " + "error=%ld (%s) fileid=%02x:%02x:%llu fhandle=0x%08x " "dstaddr=%s", - __entry->error, - show_nfsv4_errors(__entry->error), + -__entry->error, + show_nfs4_status(__entry->error), MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->fileid, __entry->fhandle, @@ -1187,30 +1472,26 @@ DECLARE_EVENT_CLASS(nfs4_inode_stateid_callback_event, TP_ARGS(clp, fhandle, inode, stateid, error), TP_STRUCT__entry( - __field(int, error) + __field(unsigned long, error) __field(dev_t, dev) __field(u32, fhandle) __field(u64, fileid) - __string(dstaddr, clp ? - rpc_peeraddr2str(clp->cl_rpcclient, - RPC_DISPLAY_ADDR) : "unknown") + __string(dstaddr, clp ? clp->cl_hostname : "unknown") __field(int, stateid_seq) __field(u32, stateid_hash) ), TP_fast_assign( - __entry->error = error; + __entry->error = error < 0 ? -error : 0; __entry->fhandle = nfs_fhandle_hash(fhandle); - if (inode != NULL) { + if (!IS_ERR_OR_NULL(inode)) { __entry->fileid = NFS_FILEID(inode); __entry->dev = inode->i_sb->s_dev; } else { __entry->fileid = 0; __entry->dev = 0; } - __assign_str(dstaddr, clp ? - rpc_peeraddr2str(clp->cl_rpcclient, - RPC_DISPLAY_ADDR) : "unknown") + __assign_str(dstaddr); __entry->stateid_seq = be32_to_cpu(stateid->seqid); __entry->stateid_hash = @@ -1218,10 +1499,10 @@ DECLARE_EVENT_CLASS(nfs4_inode_stateid_callback_event, ), TP_printk( - "error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x " + "error=%ld (%s) fileid=%02x:%02x:%llu fhandle=0x%08x " "stateid=%d:0x%08x dstaddr=%s", - __entry->error, - show_nfsv4_errors(__entry->error), + -__entry->error, + show_nfs4_status(__entry->error), MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->fileid, __entry->fhandle, @@ -1243,6 +1524,63 @@ DECLARE_EVENT_CLASS(nfs4_inode_stateid_callback_event, DEFINE_NFS4_INODE_STATEID_CALLBACK_EVENT(nfs4_cb_recall); DEFINE_NFS4_INODE_STATEID_CALLBACK_EVENT(nfs4_cb_layoutrecall_file); +#define show_stateid_type(type) \ + __print_symbolic(type, \ + { NFS4_INVALID_STATEID_TYPE, "INVALID" }, \ + { NFS4_SPECIAL_STATEID_TYPE, "SPECIAL" }, \ + { NFS4_OPEN_STATEID_TYPE, "OPEN" }, \ + { NFS4_LOCK_STATEID_TYPE, "LOCK" }, \ + { NFS4_DELEGATION_STATEID_TYPE, "DELEGATION" }, \ + { NFS4_LAYOUT_STATEID_TYPE, "LAYOUT" }, \ + { NFS4_PNFS_DS_STATEID_TYPE, "PNFS_DS" }, \ + { NFS4_REVOKED_STATEID_TYPE, "REVOKED" }, \ + { NFS4_FREED_STATEID_TYPE, "FREED" }) + +DECLARE_EVENT_CLASS(nfs4_match_stateid_event, + TP_PROTO( + const nfs4_stateid *s1, + const nfs4_stateid *s2 + ), + + TP_ARGS(s1, s2), + + TP_STRUCT__entry( + __field(int, s1_seq) + __field(int, s2_seq) + __field(u32, s1_hash) + __field(u32, s2_hash) + __field(int, s1_type) + __field(int, s2_type) + ), + + TP_fast_assign( + __entry->s1_seq = s1->seqid; + __entry->s1_hash = nfs_stateid_hash(s1); + __entry->s1_type = s1->type; + __entry->s2_seq = s2->seqid; + __entry->s2_hash = nfs_stateid_hash(s2); + __entry->s2_type = s2->type; + ), + + TP_printk( + "s1=%s:%x:%u s2=%s:%x:%u", + show_stateid_type(__entry->s1_type), + __entry->s1_hash, __entry->s1_seq, + show_stateid_type(__entry->s2_type), + __entry->s2_hash, __entry->s2_seq + ) +); + +#define DEFINE_NFS4_MATCH_STATEID_EVENT(name) \ + DEFINE_EVENT(nfs4_match_stateid_event, name, \ + TP_PROTO( \ + const nfs4_stateid *s1, \ + const nfs4_stateid *s2 \ + ), \ + TP_ARGS(s1, s2)) +DEFINE_NFS4_MATCH_STATEID_EVENT(nfs41_match_stateid); +DEFINE_NFS4_MATCH_STATEID_EVENT(nfs4_match_stateid); + DECLARE_EVENT_CLASS(nfs4_idmap_event, TP_PROTO( const char *name, @@ -1254,7 +1592,7 @@ DECLARE_EVENT_CLASS(nfs4_idmap_event, TP_ARGS(name, len, id, error), TP_STRUCT__entry( - __field(int, error) + __field(unsigned long, error) __field(u32, id) __dynamic_array(char, name, len > 0 ? len + 1 : 1) ), @@ -1269,8 +1607,8 @@ DECLARE_EVENT_CLASS(nfs4_idmap_event, ), TP_printk( - "error=%d id=%u name=%s", - __entry->error, + "error=%ld (%s) id=%u name=%s", + -__entry->error, show_nfs4_status(__entry->error), __entry->id, __get_str(name) ) @@ -1289,6 +1627,13 @@ DEFINE_NFS4_IDMAP_EVENT(nfs4_map_group_to_gid); DEFINE_NFS4_IDMAP_EVENT(nfs4_map_uid_to_name); DEFINE_NFS4_IDMAP_EVENT(nfs4_map_gid_to_group); +#ifdef CONFIG_NFS_V4_1 +#define NFS4_LSEG_LAYOUT_STATEID_HASH(lseg) \ + (lseg ? nfs_stateid_hash(&lseg->pls_layout->plh_stateid) : 0) +#else +#define NFS4_LSEG_LAYOUT_STATEID_HASH(lseg) (0) +#endif + DECLARE_EVENT_CLASS(nfs4_read_event, TP_PROTO( const struct nfs_pgio_header *hdr, @@ -1302,39 +1647,53 @@ DECLARE_EVENT_CLASS(nfs4_read_event, __field(u32, fhandle) __field(u64, fileid) __field(loff_t, offset) - __field(size_t, count) - __field(int, error) + __field(u32, arg_count) + __field(u32, res_count) + __field(unsigned long, error) __field(int, stateid_seq) __field(u32, stateid_hash) + __field(int, layoutstateid_seq) + __field(u32, layoutstateid_hash) ), TP_fast_assign( const struct inode *inode = hdr->inode; + const struct nfs_inode *nfsi = NFS_I(inode); + const struct nfs_fh *fh = hdr->args.fh ? + hdr->args.fh : &nfsi->fh; const struct nfs4_state *state = hdr->args.context->state; + const struct pnfs_layout_segment *lseg = hdr->lseg; + __entry->dev = inode->i_sb->s_dev; - __entry->fileid = NFS_FILEID(inode); - __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode)); + __entry->fileid = nfsi->fileid; + __entry->fhandle = nfs_fhandle_hash(fh); __entry->offset = hdr->args.offset; - __entry->count = hdr->args.count; - __entry->error = error; + __entry->arg_count = hdr->args.count; + __entry->res_count = hdr->res.count; + __entry->error = error < 0 ? -error : 0; __entry->stateid_seq = be32_to_cpu(state->stateid.seqid); __entry->stateid_hash = nfs_stateid_hash(&state->stateid); + __entry->layoutstateid_seq = lseg ? lseg->pls_seq : 0; + __entry->layoutstateid_hash = + NFS4_LSEG_LAYOUT_STATEID_HASH(lseg); ), TP_printk( - "error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x " - "offset=%lld count=%zu stateid=%d:0x%08x", - __entry->error, - show_nfsv4_errors(__entry->error), + "error=%ld (%s) fileid=%02x:%02x:%llu fhandle=0x%08x " + "offset=%lld count=%u res=%u stateid=%d:0x%08x " + "layoutstateid=%d:0x%08x", + -__entry->error, + show_nfs4_status(__entry->error), MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->fileid, __entry->fhandle, (long long)__entry->offset, - __entry->count, - __entry->stateid_seq, __entry->stateid_hash + __entry->arg_count, __entry->res_count, + __entry->stateid_seq, __entry->stateid_hash, + __entry->layoutstateid_seq, __entry->layoutstateid_hash ) ); #define DEFINE_NFS4_READ_EVENT(name) \ @@ -1362,39 +1721,53 @@ DECLARE_EVENT_CLASS(nfs4_write_event, __field(u32, fhandle) __field(u64, fileid) __field(loff_t, offset) - __field(size_t, count) - __field(int, error) + __field(u32, arg_count) + __field(u32, res_count) + __field(unsigned long, error) __field(int, stateid_seq) __field(u32, stateid_hash) + __field(int, layoutstateid_seq) + __field(u32, layoutstateid_hash) ), TP_fast_assign( const struct inode *inode = hdr->inode; + const struct nfs_inode *nfsi = NFS_I(inode); + const struct nfs_fh *fh = hdr->args.fh ? + hdr->args.fh : &nfsi->fh; const struct nfs4_state *state = hdr->args.context->state; + const struct pnfs_layout_segment *lseg = hdr->lseg; + __entry->dev = inode->i_sb->s_dev; - __entry->fileid = NFS_FILEID(inode); - __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode)); + __entry->fileid = nfsi->fileid; + __entry->fhandle = nfs_fhandle_hash(fh); __entry->offset = hdr->args.offset; - __entry->count = hdr->args.count; - __entry->error = error; + __entry->arg_count = hdr->args.count; + __entry->res_count = hdr->res.count; + __entry->error = error < 0 ? -error : 0; __entry->stateid_seq = be32_to_cpu(state->stateid.seqid); __entry->stateid_hash = nfs_stateid_hash(&state->stateid); + __entry->layoutstateid_seq = lseg ? lseg->pls_seq : 0; + __entry->layoutstateid_hash = + NFS4_LSEG_LAYOUT_STATEID_HASH(lseg); ), TP_printk( - "error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x " - "offset=%lld count=%zu stateid=%d:0x%08x", - __entry->error, - show_nfsv4_errors(__entry->error), + "error=%ld (%s) fileid=%02x:%02x:%llu fhandle=0x%08x " + "offset=%lld count=%u res=%u stateid=%d:0x%08x " + "layoutstateid=%d:0x%08x", + -__entry->error, + show_nfs4_status(__entry->error), MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->fileid, __entry->fhandle, (long long)__entry->offset, - __entry->count, - __entry->stateid_seq, __entry->stateid_hash + __entry->arg_count, __entry->res_count, + __entry->stateid_seq, __entry->stateid_hash, + __entry->layoutstateid_seq, __entry->layoutstateid_hash ) ); @@ -1422,31 +1795,42 @@ DECLARE_EVENT_CLASS(nfs4_commit_event, __field(dev_t, dev) __field(u32, fhandle) __field(u64, fileid) + __field(unsigned long, error) __field(loff_t, offset) - __field(size_t, count) - __field(int, error) + __field(u32, count) + __field(int, layoutstateid_seq) + __field(u32, layoutstateid_hash) ), TP_fast_assign( const struct inode *inode = data->inode; + const struct nfs_inode *nfsi = NFS_I(inode); + const struct nfs_fh *fh = data->args.fh ? + data->args.fh : &nfsi->fh; + const struct pnfs_layout_segment *lseg = data->lseg; + __entry->dev = inode->i_sb->s_dev; - __entry->fileid = NFS_FILEID(inode); - __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode)); + __entry->fileid = nfsi->fileid; + __entry->fhandle = nfs_fhandle_hash(fh); __entry->offset = data->args.offset; __entry->count = data->args.count; - __entry->error = error; + __entry->error = error < 0 ? -error : 0; + __entry->layoutstateid_seq = lseg ? lseg->pls_seq : 0; + __entry->layoutstateid_hash = + NFS4_LSEG_LAYOUT_STATEID_HASH(lseg); ), TP_printk( - "error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x " - "offset=%lld count=%zu", - __entry->error, - show_nfsv4_errors(__entry->error), + "error=%ld (%s) fileid=%02x:%02x:%llu fhandle=0x%08x " + "offset=%lld count=%u layoutstateid=%d:0x%08x", + -__entry->error, + show_nfs4_status(__entry->error), MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->fileid, __entry->fhandle, (long long)__entry->offset, - __entry->count + __entry->count, + __entry->layoutstateid_seq, __entry->layoutstateid_hash ) ); #define DEFINE_NFS4_COMMIT_EVENT(name) \ @@ -1460,12 +1844,6 @@ DEFINE_NFS4_COMMIT_EVENT(nfs4_commit); #ifdef CONFIG_NFS_V4_1 DEFINE_NFS4_COMMIT_EVENT(nfs4_pnfs_commit_ds); -#define show_pnfs_iomode(iomode) \ - __print_symbolic(iomode, \ - { IOMODE_READ, "READ" }, \ - { IOMODE_RW, "RW" }, \ - { IOMODE_ANY, "ANY" }) - TRACE_EVENT(nfs4_layoutget, TP_PROTO( const struct nfs_open_context *ctx, @@ -1484,7 +1862,7 @@ TRACE_EVENT(nfs4_layoutget, __field(u32, iomode) __field(u64, offset) __field(u64, count) - __field(int, error) + __field(unsigned long, error) __field(int, stateid_seq) __field(u32, stateid_hash) __field(int, layoutstateid_seq) @@ -1500,7 +1878,7 @@ TRACE_EVENT(nfs4_layoutget, __entry->iomode = args->iomode; __entry->offset = args->offset; __entry->count = args->length; - __entry->error = error; + __entry->error = error < 0 ? -error : 0; __entry->stateid_seq = be32_to_cpu(state->stateid.seqid); __entry->stateid_hash = @@ -1517,15 +1895,15 @@ TRACE_EVENT(nfs4_layoutget, ), TP_printk( - "error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x " + "error=%ld (%s) fileid=%02x:%02x:%llu fhandle=0x%08x " "iomode=%s offset=%llu count=%llu stateid=%d:0x%08x " "layoutstateid=%d:0x%08x", - __entry->error, - show_nfsv4_errors(__entry->error), + -__entry->error, + show_nfs4_status(__entry->error), MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->fileid, __entry->fhandle, - show_pnfs_iomode(__entry->iomode), + show_pnfs_layout_iomode(__entry->iomode), (unsigned long long)__entry->offset, (unsigned long long)__entry->count, __entry->stateid_seq, __entry->stateid_hash, @@ -1535,7 +1913,24 @@ TRACE_EVENT(nfs4_layoutget, DEFINE_NFS4_INODE_STATEID_EVENT(nfs4_layoutcommit); DEFINE_NFS4_INODE_STATEID_EVENT(nfs4_layoutreturn); -DEFINE_NFS4_INODE_EVENT(nfs4_layoutreturn_on_close); +DEFINE_NFS4_INODE_STATEID_EVENT(nfs4_layoutreturn_on_close); +DEFINE_NFS4_INODE_STATEID_EVENT(nfs4_layouterror); +DEFINE_NFS4_INODE_STATEID_EVENT(nfs4_layoutstats); + +TRACE_DEFINE_ENUM(PNFS_UPDATE_LAYOUT_UNKNOWN); +TRACE_DEFINE_ENUM(PNFS_UPDATE_LAYOUT_NO_PNFS); +TRACE_DEFINE_ENUM(PNFS_UPDATE_LAYOUT_RD_ZEROLEN); +TRACE_DEFINE_ENUM(PNFS_UPDATE_LAYOUT_MDSTHRESH); +TRACE_DEFINE_ENUM(PNFS_UPDATE_LAYOUT_NOMEM); +TRACE_DEFINE_ENUM(PNFS_UPDATE_LAYOUT_BULK_RECALL); +TRACE_DEFINE_ENUM(PNFS_UPDATE_LAYOUT_IO_TEST_FAIL); +TRACE_DEFINE_ENUM(PNFS_UPDATE_LAYOUT_FOUND_CACHED); +TRACE_DEFINE_ENUM(PNFS_UPDATE_LAYOUT_RETURN); +TRACE_DEFINE_ENUM(PNFS_UPDATE_LAYOUT_BLOCKED); +TRACE_DEFINE_ENUM(PNFS_UPDATE_LAYOUT_INVALID_OPEN); +TRACE_DEFINE_ENUM(PNFS_UPDATE_LAYOUT_RETRY); +TRACE_DEFINE_ENUM(PNFS_UPDATE_LAYOUT_SEND_LAYOUTGET); +TRACE_DEFINE_ENUM(PNFS_UPDATE_LAYOUT_EXIT); #define show_pnfs_update_layout_reason(reason) \ __print_symbolic(reason, \ @@ -1551,7 +1946,8 @@ DEFINE_NFS4_INODE_EVENT(nfs4_layoutreturn_on_close); { PNFS_UPDATE_LAYOUT_BLOCKED, "layouts blocked" }, \ { PNFS_UPDATE_LAYOUT_INVALID_OPEN, "invalid open" }, \ { PNFS_UPDATE_LAYOUT_RETRY, "retrying" }, \ - { PNFS_UPDATE_LAYOUT_SEND_LAYOUTGET, "sent layoutget" }) + { PNFS_UPDATE_LAYOUT_SEND_LAYOUTGET, "sent layoutget" }, \ + { PNFS_UPDATE_LAYOUT_EXIT, "exit" }) TRACE_EVENT(pnfs_update_layout, TP_PROTO(struct inode *inode, @@ -1583,7 +1979,7 @@ TRACE_EVENT(pnfs_update_layout, __entry->count = count; __entry->iomode = iomode; __entry->reason = reason; - if (lo != NULL) { + if (lo != NULL && pnfs_layout_is_valid(lo)) { __entry->layoutstateid_seq = be32_to_cpu(lo->plh_stateid.seqid); __entry->layoutstateid_hash = @@ -1601,7 +1997,7 @@ TRACE_EVENT(pnfs_update_layout, MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->fileid, __entry->fhandle, - show_pnfs_iomode(__entry->iomode), + show_pnfs_layout_iomode(__entry->iomode), (unsigned long long)__entry->pos, (unsigned long long)__entry->count, __entry->layoutstateid_seq, __entry->layoutstateid_hash, @@ -1610,6 +2006,875 @@ TRACE_EVENT(pnfs_update_layout, ) ); +DECLARE_EVENT_CLASS(pnfs_layout_event, + TP_PROTO(struct inode *inode, + loff_t pos, + u64 count, + enum pnfs_iomode iomode, + struct pnfs_layout_hdr *lo, + struct pnfs_layout_segment *lseg + ), + TP_ARGS(inode, pos, count, iomode, lo, lseg), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(u64, fileid) + __field(u32, fhandle) + __field(loff_t, pos) + __field(u64, count) + __field(enum pnfs_iomode, iomode) + __field(int, layoutstateid_seq) + __field(u32, layoutstateid_hash) + __field(long, lseg) + ), + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->fileid = NFS_FILEID(inode); + __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode)); + __entry->pos = pos; + __entry->count = count; + __entry->iomode = iomode; + if (lo != NULL && pnfs_layout_is_valid(lo)) { + __entry->layoutstateid_seq = + be32_to_cpu(lo->plh_stateid.seqid); + __entry->layoutstateid_hash = + nfs_stateid_hash(&lo->plh_stateid); + } else { + __entry->layoutstateid_seq = 0; + __entry->layoutstateid_hash = 0; + } + __entry->lseg = (long)lseg; + ), + TP_printk( + "fileid=%02x:%02x:%llu fhandle=0x%08x " + "iomode=%s pos=%llu count=%llu " + "layoutstateid=%d:0x%08x lseg=0x%lx", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->fileid, + __entry->fhandle, + show_pnfs_layout_iomode(__entry->iomode), + (unsigned long long)__entry->pos, + (unsigned long long)__entry->count, + __entry->layoutstateid_seq, __entry->layoutstateid_hash, + __entry->lseg + ) +); + +#define DEFINE_PNFS_LAYOUT_EVENT(name) \ + DEFINE_EVENT(pnfs_layout_event, name, \ + TP_PROTO(struct inode *inode, \ + loff_t pos, \ + u64 count, \ + enum pnfs_iomode iomode, \ + struct pnfs_layout_hdr *lo, \ + struct pnfs_layout_segment *lseg \ + ), \ + TP_ARGS(inode, pos, count, iomode, lo, lseg)) + +DEFINE_PNFS_LAYOUT_EVENT(pnfs_mds_fallback_pg_init_read); +DEFINE_PNFS_LAYOUT_EVENT(pnfs_mds_fallback_pg_init_write); +DEFINE_PNFS_LAYOUT_EVENT(pnfs_mds_fallback_pg_get_mirror_count); +DEFINE_PNFS_LAYOUT_EVENT(pnfs_mds_fallback_read_done); +DEFINE_PNFS_LAYOUT_EVENT(pnfs_mds_fallback_write_done); +DEFINE_PNFS_LAYOUT_EVENT(pnfs_mds_fallback_read_pagelist); +DEFINE_PNFS_LAYOUT_EVENT(pnfs_mds_fallback_write_pagelist); + +DECLARE_EVENT_CLASS(nfs4_deviceid_event, + TP_PROTO( + const struct nfs_client *clp, + const struct nfs4_deviceid *deviceid + ), + + TP_ARGS(clp, deviceid), + + TP_STRUCT__entry( + __string(dstaddr, clp->cl_hostname) + __array(unsigned char, deviceid, NFS4_DEVICEID4_SIZE) + ), + + TP_fast_assign( + __assign_str(dstaddr); + memcpy(__entry->deviceid, deviceid->data, + NFS4_DEVICEID4_SIZE); + ), + + TP_printk( + "deviceid=%s, dstaddr=%s", + __print_hex(__entry->deviceid, NFS4_DEVICEID4_SIZE), + __get_str(dstaddr) + ) +); +#define DEFINE_PNFS_DEVICEID_EVENT(name) \ + DEFINE_EVENT(nfs4_deviceid_event, name, \ + TP_PROTO(const struct nfs_client *clp, \ + const struct nfs4_deviceid *deviceid \ + ), \ + TP_ARGS(clp, deviceid)) +DEFINE_PNFS_DEVICEID_EVENT(nfs4_deviceid_free); + +DECLARE_EVENT_CLASS(nfs4_deviceid_status, + TP_PROTO( + const struct nfs_server *server, + const struct nfs4_deviceid *deviceid, + int status + ), + + TP_ARGS(server, deviceid, status), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(int, status) + __string(dstaddr, server->nfs_client->cl_hostname) + __array(unsigned char, deviceid, NFS4_DEVICEID4_SIZE) + ), + + TP_fast_assign( + __entry->dev = server->s_dev; + __entry->status = status; + __assign_str(dstaddr); + memcpy(__entry->deviceid, deviceid->data, + NFS4_DEVICEID4_SIZE); + ), + + TP_printk( + "dev=%02x:%02x: deviceid=%s, dstaddr=%s, status=%d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __print_hex(__entry->deviceid, NFS4_DEVICEID4_SIZE), + __get_str(dstaddr), + __entry->status + ) +); +#define DEFINE_PNFS_DEVICEID_STATUS(name) \ + DEFINE_EVENT(nfs4_deviceid_status, name, \ + TP_PROTO(const struct nfs_server *server, \ + const struct nfs4_deviceid *deviceid, \ + int status \ + ), \ + TP_ARGS(server, deviceid, status)) +DEFINE_PNFS_DEVICEID_STATUS(nfs4_getdeviceinfo); +DEFINE_PNFS_DEVICEID_STATUS(nfs4_find_deviceid); + +TRACE_EVENT(fl_getdevinfo, + TP_PROTO( + const struct nfs_server *server, + const struct nfs4_deviceid *deviceid, + char *ds_remotestr + ), + TP_ARGS(server, deviceid, ds_remotestr), + + TP_STRUCT__entry( + __string(mds_addr, server->nfs_client->cl_hostname) + __array(unsigned char, deviceid, NFS4_DEVICEID4_SIZE) + __string(ds_ips, ds_remotestr) + ), + + TP_fast_assign( + __assign_str(mds_addr); + __assign_str(ds_ips); + memcpy(__entry->deviceid, deviceid->data, + NFS4_DEVICEID4_SIZE); + ), + TP_printk( + "deviceid=%s, mds_addr=%s, ds_ips=%s", + __print_hex(__entry->deviceid, NFS4_DEVICEID4_SIZE), + __get_str(mds_addr), + __get_str(ds_ips) + ) +); + +DECLARE_EVENT_CLASS(nfs4_flexfiles_io_event, + TP_PROTO( + const struct nfs_pgio_header *hdr, + int error + ), + + TP_ARGS(hdr, error), + + TP_STRUCT__entry( + __field(unsigned long, error) + __field(unsigned long, nfs_error) + __field(dev_t, dev) + __field(u32, fhandle) + __field(u64, fileid) + __field(loff_t, offset) + __field(u32, count) + __field(int, stateid_seq) + __field(u32, stateid_hash) + __string(dstaddr, hdr->ds_clp ? + rpc_peeraddr2str(hdr->ds_clp->cl_rpcclient, + RPC_DISPLAY_ADDR) : "unknown") + ), + + TP_fast_assign( + const struct inode *inode = hdr->inode; + + __entry->error = -error; + __entry->nfs_error = hdr->res.op_status; + __entry->fhandle = nfs_fhandle_hash(hdr->args.fh); + __entry->fileid = NFS_FILEID(inode); + __entry->dev = inode->i_sb->s_dev; + __entry->offset = hdr->args.offset; + __entry->count = hdr->args.count; + __entry->stateid_seq = + be32_to_cpu(hdr->args.stateid.seqid); + __entry->stateid_hash = + nfs_stateid_hash(&hdr->args.stateid); + __assign_str(dstaddr); + ), + + TP_printk( + "error=%ld (%s) fileid=%02x:%02x:%llu fhandle=0x%08x " + "offset=%llu count=%u stateid=%d:0x%08x dstaddr=%s " + "nfs_error=%lu (%s)", + -__entry->error, + show_nfs4_status(__entry->error), + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->fileid, + __entry->fhandle, + __entry->offset, __entry->count, + __entry->stateid_seq, __entry->stateid_hash, + __get_str(dstaddr), __entry->nfs_error, + show_nfs4_status(__entry->nfs_error) + ) +); + +#define DEFINE_NFS4_FLEXFILES_IO_EVENT(name) \ + DEFINE_EVENT(nfs4_flexfiles_io_event, name, \ + TP_PROTO( \ + const struct nfs_pgio_header *hdr, \ + int error \ + ), \ + TP_ARGS(hdr, error)) +DEFINE_NFS4_FLEXFILES_IO_EVENT(ff_layout_read_error); +DEFINE_NFS4_FLEXFILES_IO_EVENT(ff_layout_write_error); + +TRACE_EVENT(ff_layout_commit_error, + TP_PROTO( + const struct nfs_commit_data *data, + int error + ), + + TP_ARGS(data, error), + + TP_STRUCT__entry( + __field(unsigned long, error) + __field(unsigned long, nfs_error) + __field(dev_t, dev) + __field(u32, fhandle) + __field(u64, fileid) + __field(loff_t, offset) + __field(u32, count) + __string(dstaddr, data->ds_clp ? + rpc_peeraddr2str(data->ds_clp->cl_rpcclient, + RPC_DISPLAY_ADDR) : "unknown") + ), + + TP_fast_assign( + const struct inode *inode = data->inode; + + __entry->error = -error; + __entry->nfs_error = data->res.op_status; + __entry->fhandle = nfs_fhandle_hash(data->args.fh); + __entry->fileid = NFS_FILEID(inode); + __entry->dev = inode->i_sb->s_dev; + __entry->offset = data->args.offset; + __entry->count = data->args.count; + __assign_str(dstaddr); + ), + + TP_printk( + "error=%ld (%s) fileid=%02x:%02x:%llu fhandle=0x%08x " + "offset=%llu count=%u dstaddr=%s nfs_error=%lu (%s)", + -__entry->error, + show_nfs4_status(__entry->error), + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->fileid, + __entry->fhandle, + __entry->offset, __entry->count, + __get_str(dstaddr), __entry->nfs_error, + show_nfs4_status(__entry->nfs_error) + ) +); + +TRACE_EVENT(bl_ext_tree_prepare_commit, + TP_PROTO( + int ret, + size_t count, + u64 lwb, + bool not_all_ranges + ), + + TP_ARGS(ret, count, lwb, not_all_ranges), + + TP_STRUCT__entry( + __field(int, ret) + __field(size_t, count) + __field(u64, lwb) + __field(bool, not_all_ranges) + ), + + TP_fast_assign( + __entry->ret = ret; + __entry->count = count; + __entry->lwb = lwb; + __entry->not_all_ranges = not_all_ranges; + ), + + TP_printk( + "ret=%d, found %zu ranges, lwb=%llu%s", + __entry->ret, + __entry->count, + __entry->lwb, + __entry->not_all_ranges ? ", not all ranges encoded" : + "" + ) +); + +DECLARE_EVENT_CLASS(pnfs_bl_pr_key_class, + TP_PROTO( + const struct block_device *bdev, + u64 key + ), + TP_ARGS(bdev, key), + TP_STRUCT__entry( + __field(u64, key) + __field(dev_t, dev) + __string(device, bdev->bd_disk->disk_name) + ), + TP_fast_assign( + __entry->key = key; + __entry->dev = bdev->bd_dev; + __assign_str(device); + ), + TP_printk("dev=%d,%d (%s) key=0x%016llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __get_str(device), __entry->key + ) +); + +#define DEFINE_NFS4_BLOCK_PRKEY_EVENT(name) \ + DEFINE_EVENT(pnfs_bl_pr_key_class, name, \ + TP_PROTO( \ + const struct block_device *bdev, \ + u64 key \ + ), \ + TP_ARGS(bdev, key)) +DEFINE_NFS4_BLOCK_PRKEY_EVENT(bl_pr_key_reg); +DEFINE_NFS4_BLOCK_PRKEY_EVENT(bl_pr_key_unreg); + +/* + * From uapi/linux/pr.h + */ +TRACE_DEFINE_ENUM(PR_STS_SUCCESS); +TRACE_DEFINE_ENUM(PR_STS_IOERR); +TRACE_DEFINE_ENUM(PR_STS_RESERVATION_CONFLICT); +TRACE_DEFINE_ENUM(PR_STS_RETRY_PATH_FAILURE); +TRACE_DEFINE_ENUM(PR_STS_PATH_FAST_FAILED); +TRACE_DEFINE_ENUM(PR_STS_PATH_FAILED); + +#define show_pr_status(x) \ + __print_symbolic(x, \ + { PR_STS_SUCCESS, "SUCCESS" }, \ + { PR_STS_IOERR, "IOERR" }, \ + { PR_STS_RESERVATION_CONFLICT, "RESERVATION_CONFLICT" }, \ + { PR_STS_RETRY_PATH_FAILURE, "RETRY_PATH_FAILURE" }, \ + { PR_STS_PATH_FAST_FAILED, "PATH_FAST_FAILED" }, \ + { PR_STS_PATH_FAILED, "PATH_FAILED" }) + +DECLARE_EVENT_CLASS(pnfs_bl_pr_key_err_class, + TP_PROTO( + const struct block_device *bdev, + u64 key, + int status + ), + TP_ARGS(bdev, key, status), + TP_STRUCT__entry( + __field(u64, key) + __field(dev_t, dev) + __field(unsigned long, status) + __string(device, bdev->bd_disk->disk_name) + ), + TP_fast_assign( + __entry->key = key; + __entry->dev = bdev->bd_dev; + __entry->status = status; + __assign_str(device); + ), + TP_printk("dev=%d,%d (%s) key=0x%016llx status=%s", + MAJOR(__entry->dev), MINOR(__entry->dev), + __get_str(device), __entry->key, + show_pr_status(__entry->status) + ) +); + +#define DEFINE_NFS4_BLOCK_PRKEY_ERR_EVENT(name) \ + DEFINE_EVENT(pnfs_bl_pr_key_err_class, name, \ + TP_PROTO( \ + const struct block_device *bdev, \ + u64 key, \ + int status \ + ), \ + TP_ARGS(bdev, key, status)) +DEFINE_NFS4_BLOCK_PRKEY_ERR_EVENT(bl_pr_key_reg_err); +DEFINE_NFS4_BLOCK_PRKEY_ERR_EVENT(bl_pr_key_unreg_err); + +#ifdef CONFIG_NFS_V4_2 +TRACE_DEFINE_ENUM(NFS4_CONTENT_DATA); +TRACE_DEFINE_ENUM(NFS4_CONTENT_HOLE); + +#define show_llseek_mode(what) \ + __print_symbolic(what, \ + { NFS4_CONTENT_DATA, "DATA" }, \ + { NFS4_CONTENT_HOLE, "HOLE" }) + +TRACE_EVENT(nfs4_llseek, + TP_PROTO( + const struct inode *inode, + const struct nfs42_seek_args *args, + const struct nfs42_seek_res *res, + int error + ), + + TP_ARGS(inode, args, res, error), + + TP_STRUCT__entry( + __field(unsigned long, error) + __field(u32, fhandle) + __field(u32, fileid) + __field(dev_t, dev) + __field(int, stateid_seq) + __field(u32, stateid_hash) + __field(loff_t, offset_s) + __field(u32, what) + __field(loff_t, offset_r) + __field(u32, eof) + ), + + TP_fast_assign( + const struct nfs_inode *nfsi = NFS_I(inode); + const struct nfs_fh *fh = args->sa_fh; + + __entry->fileid = nfsi->fileid; + __entry->dev = inode->i_sb->s_dev; + __entry->fhandle = nfs_fhandle_hash(fh); + __entry->offset_s = args->sa_offset; + __entry->stateid_seq = + be32_to_cpu(args->sa_stateid.seqid); + __entry->stateid_hash = + nfs_stateid_hash(&args->sa_stateid); + __entry->what = args->sa_what; + if (error) { + __entry->error = -error; + __entry->offset_r = 0; + __entry->eof = 0; + } else { + __entry->error = 0; + __entry->offset_r = res->sr_offset; + __entry->eof = res->sr_eof; + } + ), + + TP_printk( + "error=%ld (%s) fileid=%02x:%02x:%llu fhandle=0x%08x " + "stateid=%d:0x%08x offset_s=%llu what=%s " + "offset_r=%llu eof=%u", + -__entry->error, + show_nfs4_status(__entry->error), + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->fileid, + __entry->fhandle, + __entry->stateid_seq, __entry->stateid_hash, + __entry->offset_s, + show_llseek_mode(__entry->what), + __entry->offset_r, + __entry->eof + ) +); + +DECLARE_EVENT_CLASS(nfs4_sparse_event, + TP_PROTO( + const struct inode *inode, + const struct nfs42_falloc_args *args, + int error + ), + + TP_ARGS(inode, args, error), + + TP_STRUCT__entry( + __field(unsigned long, error) + __field(loff_t, offset) + __field(loff_t, len) + __field(dev_t, dev) + __field(u32, fhandle) + __field(u64, fileid) + __field(int, stateid_seq) + __field(u32, stateid_hash) + ), + + TP_fast_assign( + __entry->error = error < 0 ? -error : 0; + __entry->offset = args->falloc_offset; + __entry->len = args->falloc_length; + __entry->dev = inode->i_sb->s_dev; + __entry->fileid = NFS_FILEID(inode); + __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode)); + __entry->stateid_seq = + be32_to_cpu(args->falloc_stateid.seqid); + __entry->stateid_hash = + nfs_stateid_hash(&args->falloc_stateid); + ), + + TP_printk( + "error=%ld (%s) fileid=%02x:%02x:%llu fhandle=0x%08x " + "stateid=%d:0x%08x offset=%llu len=%llu", + -__entry->error, + show_nfs4_status(__entry->error), + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->fileid, + __entry->fhandle, + __entry->stateid_seq, __entry->stateid_hash, + (long long)__entry->offset, + (long long)__entry->len + ) +); +#define DEFINE_NFS4_SPARSE_EVENT(name) \ + DEFINE_EVENT(nfs4_sparse_event, name, \ + TP_PROTO( \ + const struct inode *inode, \ + const struct nfs42_falloc_args *args, \ + int error \ + ), \ + TP_ARGS(inode, args, error)) +DEFINE_NFS4_SPARSE_EVENT(nfs4_fallocate); +DEFINE_NFS4_SPARSE_EVENT(nfs4_deallocate); + +TRACE_EVENT(nfs4_copy, + TP_PROTO( + const struct inode *src_inode, + const struct inode *dst_inode, + const struct nfs42_copy_args *args, + const struct nfs42_copy_res *res, + const struct nl4_server *nss, + int error + ), + + TP_ARGS(src_inode, dst_inode, args, res, nss, error), + + TP_STRUCT__entry( + __field(unsigned long, error) + __field(u32, src_fhandle) + __field(u32, src_fileid) + __field(u32, dst_fhandle) + __field(u32, dst_fileid) + __field(dev_t, src_dev) + __field(dev_t, dst_dev) + __field(int, src_stateid_seq) + __field(u32, src_stateid_hash) + __field(int, dst_stateid_seq) + __field(u32, dst_stateid_hash) + __field(loff_t, src_offset) + __field(loff_t, dst_offset) + __field(bool, sync) + __field(loff_t, len) + __field(int, res_stateid_seq) + __field(u32, res_stateid_hash) + __field(loff_t, res_count) + __field(bool, res_sync) + __field(bool, res_cons) + __field(bool, intra) + ), + + TP_fast_assign( + const struct nfs_inode *src_nfsi = NFS_I(src_inode); + const struct nfs_inode *dst_nfsi = NFS_I(dst_inode); + + __entry->src_fileid = src_nfsi->fileid; + __entry->src_dev = src_inode->i_sb->s_dev; + __entry->src_fhandle = nfs_fhandle_hash(args->src_fh); + __entry->src_offset = args->src_pos; + __entry->dst_fileid = dst_nfsi->fileid; + __entry->dst_dev = dst_inode->i_sb->s_dev; + __entry->dst_fhandle = nfs_fhandle_hash(args->dst_fh); + __entry->dst_offset = args->dst_pos; + __entry->len = args->count; + __entry->sync = args->sync; + __entry->src_stateid_seq = + be32_to_cpu(args->src_stateid.seqid); + __entry->src_stateid_hash = + nfs_stateid_hash(&args->src_stateid); + __entry->dst_stateid_seq = + be32_to_cpu(args->dst_stateid.seqid); + __entry->dst_stateid_hash = + nfs_stateid_hash(&args->dst_stateid); + __entry->intra = nss ? 0 : 1; + if (error) { + __entry->error = -error; + __entry->res_stateid_seq = 0; + __entry->res_stateid_hash = 0; + __entry->res_count = 0; + __entry->res_sync = 0; + __entry->res_cons = 0; + } else { + __entry->error = 0; + __entry->res_stateid_seq = + be32_to_cpu(res->write_res.stateid.seqid); + __entry->res_stateid_hash = + nfs_stateid_hash(&res->write_res.stateid); + __entry->res_count = res->write_res.count; + __entry->res_sync = res->synchronous; + __entry->res_cons = res->consecutive; + } + ), + + TP_printk( + "error=%ld (%s) intra=%d src_fileid=%02x:%02x:%llu " + "src_fhandle=0x%08x dst_fileid=%02x:%02x:%llu " + "dst_fhandle=0x%08x src_stateid=%d:0x%08x " + "dst_stateid=%d:0x%08x src_offset=%llu dst_offset=%llu " + "len=%llu sync=%d cb_stateid=%d:0x%08x res_sync=%d " + "res_cons=%d res_count=%llu", + -__entry->error, + show_nfs4_status(__entry->error), + __entry->intra, + MAJOR(__entry->src_dev), MINOR(__entry->src_dev), + (unsigned long long)__entry->src_fileid, + __entry->src_fhandle, + MAJOR(__entry->dst_dev), MINOR(__entry->dst_dev), + (unsigned long long)__entry->dst_fileid, + __entry->dst_fhandle, + __entry->src_stateid_seq, __entry->src_stateid_hash, + __entry->dst_stateid_seq, __entry->dst_stateid_hash, + __entry->src_offset, + __entry->dst_offset, + __entry->len, + __entry->sync, + __entry->res_stateid_seq, __entry->res_stateid_hash, + __entry->res_sync, + __entry->res_cons, + __entry->res_count + ) +); + +TRACE_EVENT(nfs4_clone, + TP_PROTO( + const struct inode *src_inode, + const struct inode *dst_inode, + const struct nfs42_clone_args *args, + int error + ), + + TP_ARGS(src_inode, dst_inode, args, error), + + TP_STRUCT__entry( + __field(unsigned long, error) + __field(u32, src_fhandle) + __field(u32, src_fileid) + __field(u32, dst_fhandle) + __field(u32, dst_fileid) + __field(dev_t, src_dev) + __field(dev_t, dst_dev) + __field(loff_t, src_offset) + __field(loff_t, dst_offset) + __field(int, src_stateid_seq) + __field(u32, src_stateid_hash) + __field(int, dst_stateid_seq) + __field(u32, dst_stateid_hash) + __field(loff_t, len) + ), + + TP_fast_assign( + const struct nfs_inode *src_nfsi = NFS_I(src_inode); + const struct nfs_inode *dst_nfsi = NFS_I(dst_inode); + + __entry->src_fileid = src_nfsi->fileid; + __entry->src_dev = src_inode->i_sb->s_dev; + __entry->src_fhandle = nfs_fhandle_hash(args->src_fh); + __entry->src_offset = args->src_offset; + __entry->dst_fileid = dst_nfsi->fileid; + __entry->dst_dev = dst_inode->i_sb->s_dev; + __entry->dst_fhandle = nfs_fhandle_hash(args->dst_fh); + __entry->dst_offset = args->dst_offset; + __entry->len = args->count; + __entry->error = error < 0 ? -error : 0; + __entry->src_stateid_seq = + be32_to_cpu(args->src_stateid.seqid); + __entry->src_stateid_hash = + nfs_stateid_hash(&args->src_stateid); + __entry->dst_stateid_seq = + be32_to_cpu(args->dst_stateid.seqid); + __entry->dst_stateid_hash = + nfs_stateid_hash(&args->dst_stateid); + ), + + TP_printk( + "error=%ld (%s) src_fileid=%02x:%02x:%llu " + "src_fhandle=0x%08x dst_fileid=%02x:%02x:%llu " + "dst_fhandle=0x%08x src_stateid=%d:0x%08x " + "dst_stateid=%d:0x%08x src_offset=%llu " + "dst_offset=%llu len=%llu", + -__entry->error, + show_nfs4_status(__entry->error), + MAJOR(__entry->src_dev), MINOR(__entry->src_dev), + (unsigned long long)__entry->src_fileid, + __entry->src_fhandle, + MAJOR(__entry->dst_dev), MINOR(__entry->dst_dev), + (unsigned long long)__entry->dst_fileid, + __entry->dst_fhandle, + __entry->src_stateid_seq, __entry->src_stateid_hash, + __entry->dst_stateid_seq, __entry->dst_stateid_hash, + __entry->src_offset, + __entry->dst_offset, + __entry->len + ) +); + +TRACE_EVENT(nfs4_copy_notify, + TP_PROTO( + const struct inode *inode, + const struct nfs42_copy_notify_args *args, + const struct nfs42_copy_notify_res *res, + int error + ), + + TP_ARGS(inode, args, res, error), + + TP_STRUCT__entry( + __field(unsigned long, error) + __field(u32, fhandle) + __field(u32, fileid) + __field(dev_t, dev) + __field(int, stateid_seq) + __field(u32, stateid_hash) + __field(int, res_stateid_seq) + __field(u32, res_stateid_hash) + ), + + TP_fast_assign( + const struct nfs_inode *nfsi = NFS_I(inode); + + __entry->fileid = nfsi->fileid; + __entry->dev = inode->i_sb->s_dev; + __entry->fhandle = nfs_fhandle_hash(args->cna_src_fh); + __entry->stateid_seq = + be32_to_cpu(args->cna_src_stateid.seqid); + __entry->stateid_hash = + nfs_stateid_hash(&args->cna_src_stateid); + if (error) { + __entry->error = -error; + __entry->res_stateid_seq = 0; + __entry->res_stateid_hash = 0; + } else { + __entry->error = 0; + __entry->res_stateid_seq = + be32_to_cpu(res->cnr_stateid.seqid); + __entry->res_stateid_hash = + nfs_stateid_hash(&res->cnr_stateid); + } + ), + + TP_printk( + "error=%ld (%s) fileid=%02x:%02x:%llu fhandle=0x%08x " + "stateid=%d:0x%08x res_stateid=%d:0x%08x", + -__entry->error, + show_nfs4_status(__entry->error), + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->fileid, + __entry->fhandle, + __entry->stateid_seq, __entry->stateid_hash, + __entry->res_stateid_seq, __entry->res_stateid_hash + ) +); + +DECLARE_EVENT_CLASS(nfs4_offload_class, + TP_PROTO( + const struct nfs42_offload_status_args *args, + int error + ), + + TP_ARGS(args, error), + + TP_STRUCT__entry( + __field(unsigned long, error) + __field(u32, fhandle) + __field(int, stateid_seq) + __field(u32, stateid_hash) + ), + + TP_fast_assign( + __entry->fhandle = nfs_fhandle_hash(args->osa_src_fh); + __entry->error = error < 0 ? -error : 0; + __entry->stateid_seq = + be32_to_cpu(args->osa_stateid.seqid); + __entry->stateid_hash = + nfs_stateid_hash(&args->osa_stateid); + ), + + TP_printk( + "error=%ld (%s) fhandle=0x%08x stateid=%d:0x%08x", + -__entry->error, + show_nfs4_status(__entry->error), + __entry->fhandle, + __entry->stateid_seq, __entry->stateid_hash + ) +); +#define DEFINE_NFS4_OFFLOAD_EVENT(name) \ + DEFINE_EVENT(nfs4_offload_class, name, \ + TP_PROTO( \ + const struct nfs42_offload_status_args *args, \ + int error \ + ), \ + TP_ARGS(args, error)) +DEFINE_NFS4_OFFLOAD_EVENT(nfs4_offload_cancel); +DEFINE_NFS4_OFFLOAD_EVENT(nfs4_offload_status); + +DECLARE_EVENT_CLASS(nfs4_xattr_event, + TP_PROTO( + const struct inode *inode, + const char *name, + int error + ), + + TP_ARGS(inode, name, error), + + TP_STRUCT__entry( + __field(unsigned long, error) + __field(dev_t, dev) + __field(u32, fhandle) + __field(u64, fileid) + __string(name, name) + ), + + TP_fast_assign( + __entry->error = error < 0 ? -error : 0; + __entry->dev = inode->i_sb->s_dev; + __entry->fileid = NFS_FILEID(inode); + __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode)); + __assign_str(name); + ), + + TP_printk( + "error=%ld (%s) fileid=%02x:%02x:%llu fhandle=0x%08x " + "name=%s", + -__entry->error, show_nfs4_status(__entry->error), + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->fileid, + __entry->fhandle, __get_str(name) + ) +); +#define DEFINE_NFS4_XATTR_EVENT(name) \ + DEFINE_EVENT(nfs4_xattr_event, name, \ + TP_PROTO( \ + const struct inode *inode, \ + const char *name, \ + int error \ + ), \ + TP_ARGS(inode, name, error)) +DEFINE_NFS4_XATTR_EVENT(nfs4_getxattr); +DEFINE_NFS4_XATTR_EVENT(nfs4_setxattr); +DEFINE_NFS4_XATTR_EVENT(nfs4_removexattr); + +DEFINE_NFS4_INODE_EVENT(nfs4_listxattr); +#endif /* CONFIG_NFS_V4_2 */ + #endif /* CONFIG_NFS_V4_1 */ #endif /* _TRACE_NFS4_H */ diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 37c8af003275..1d0e6c10f921 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -52,9 +52,10 @@ #include <linux/nfs.h> #include <linux/nfs4.h> #include <linux/nfs_fs.h> -#include <linux/fs_struct.h> +#include <linux/nfs_common.h> #include "nfs4_fs.h" +#include "nfs4trace.h" #include "internal.h" #include "nfs4idmap.h" #include "nfs4session.h" @@ -63,10 +64,12 @@ #define NFSDBG_FACILITY NFSDBG_XDR -/* Mapping from NFS error code to "errno" error code. */ -#define errno_NFSERR_IO EIO - -static int nfs4_stat_to_errno(int); +struct compound_hdr; +static void encode_layoutget(struct xdr_stream *xdr, + const struct nfs4_layoutget_args *args, + struct compound_hdr *hdr); +static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req, + struct nfs4_layoutget_res *res); /* NFSv4 COMPOUND tags are only wanted for debugging purposes */ #ifdef DEBUG @@ -78,9 +81,9 @@ static int nfs4_stat_to_errno(int); /* lock,open owner id: * we currently use size 2 (u64) out of (NFS4_OPAQUE_LIMIT >> 2) */ -#define open_owner_id_maxsz (1 + 2 + 1 + 1 + 2) -#define lock_owner_id_maxsz (1 + 1 + 4) -#define decode_lockowner_maxsz (1 + XDR_QUADLEN(IDMAP_NAMESZ)) +#define pagepad_maxsz (1) +#define open_owner_id_maxsz (2 + 1 + 2 + 2) +#define lock_owner_id_maxsz (2 + 1 + 2) #define compound_encode_hdr_maxsz (3 + (NFS4_MAXTAGLEN >> 2)) #define compound_decode_hdr_maxsz (3 + (NFS4_MAXTAGLEN >> 2)) #define op_encode_hdr_maxsz (1) @@ -99,6 +102,7 @@ static int nfs4_stat_to_errno(int); ((3+NFS4_FHSIZE) >> 2)) #define nfs4_fattr_bitmap_maxsz 4 #define encode_getattr_maxsz (op_encode_hdr_maxsz + nfs4_fattr_bitmap_maxsz) +#define nfstime4_maxsz (3) #define nfs4_name_maxsz (1 + ((3 + NFS4_MAXNAMLEN) >> 2)) #define nfs4_path_maxsz (1 + ((3 + NFS4_MAXPATHLEN) >> 2)) #define nfs4_owner_maxsz (1 + XDR_QUADLEN(IDMAP_NAMESZ)) @@ -113,7 +117,8 @@ static int nfs4_stat_to_errno(int); #define decode_mdsthreshold_maxsz (1 + 1 + nfs4_fattr_bitmap_maxsz + 1 + 8) /* This is based on getfattr, which uses the most attributes: */ #define nfs4_fattr_value_maxsz (1 + (1 + 2 + 2 + 4 + 2 + 1 + 1 + 2 + 2 + \ - 3 + 3 + 3 + nfs4_owner_maxsz + \ + 3*nfstime4_maxsz + \ + nfs4_owner_maxsz + \ nfs4_group_maxsz + nfs4_label_maxsz + \ decode_mdsthreshold_maxsz)) #define nfs4_fattr_maxsz (nfs4_fattr_bitmap_maxsz + \ @@ -124,7 +129,8 @@ static int nfs4_stat_to_errno(int); nfs4_owner_maxsz + \ nfs4_group_maxsz + \ nfs4_label_maxsz + \ - 4 + 4) + 1 + nfstime4_maxsz + \ + 1 + nfstime4_maxsz) #define encode_savefh_maxsz (op_encode_hdr_maxsz) #define decode_savefh_maxsz (op_decode_hdr_maxsz) #define encode_restorefh_maxsz (op_encode_hdr_maxsz) @@ -134,7 +140,17 @@ static int nfs4_stat_to_errno(int); * layout types will be returned. */ #define decode_fsinfo_maxsz (op_decode_hdr_maxsz + \ - nfs4_fattr_bitmap_maxsz + 4 + 8 + 5) + nfs4_fattr_bitmap_maxsz + 1 + \ + 1 /* lease time */ + \ + 2 /* max filesize */ + \ + 2 /* max read */ + \ + 2 /* max write */ + \ + nfstime4_maxsz /* time delta */ + \ + 5 /* fs layout types */ + \ + 1 /* layout blksize */ + \ + 1 /* clone blksize */ + \ + 1 /* change attr type */ + \ + 1 /* xattr support */) #define encode_renew_maxsz (op_encode_hdr_maxsz + 3) #define decode_renew_maxsz (op_decode_hdr_maxsz) #define encode_setclientid_maxsz \ @@ -168,7 +184,7 @@ static int nfs4_stat_to_errno(int); #define encode_claim_null_maxsz (1 + nfs4_name_maxsz) #define encode_open_maxsz (op_encode_hdr_maxsz + \ 2 + encode_share_access_maxsz + 2 + \ - open_owner_id_maxsz + \ + 1 + open_owner_id_maxsz + \ encode_opentype_maxsz + \ encode_claim_null_maxsz) #define decode_space_limit_maxsz (3) @@ -204,16 +220,21 @@ static int nfs4_stat_to_errno(int); encode_attrs_maxsz) #define decode_setattr_maxsz (op_decode_hdr_maxsz + \ nfs4_fattr_bitmap_maxsz) +#define encode_delegattr_maxsz (op_encode_hdr_maxsz + \ + encode_stateid_maxsz + \ + nfs4_fattr_bitmap_maxsz + \ + 2*nfstime4_maxsz) +#define decode_delegattr_maxsz (decode_setattr_maxsz) #define encode_read_maxsz (op_encode_hdr_maxsz + \ encode_stateid_maxsz + 3) -#define decode_read_maxsz (op_decode_hdr_maxsz + 2) +#define decode_read_maxsz (op_decode_hdr_maxsz + 2 + pagepad_maxsz) #define encode_readdir_maxsz (op_encode_hdr_maxsz + \ 2 + encode_verifier_maxsz + 5 + \ nfs4_label_maxsz) #define decode_readdir_maxsz (op_decode_hdr_maxsz + \ - decode_verifier_maxsz) + decode_verifier_maxsz + pagepad_maxsz) #define encode_readlink_maxsz (op_encode_hdr_maxsz) -#define decode_readlink_maxsz (op_decode_hdr_maxsz + 1) +#define decode_readlink_maxsz (op_decode_hdr_maxsz + 1 + pagepad_maxsz) #define encode_write_maxsz (op_encode_hdr_maxsz + \ encode_stateid_maxsz + 4) #define decode_write_maxsz (op_decode_hdr_maxsz + \ @@ -233,13 +254,14 @@ static int nfs4_stat_to_errno(int); #define encode_link_maxsz (op_encode_hdr_maxsz + \ nfs4_name_maxsz) #define decode_link_maxsz (op_decode_hdr_maxsz + decode_change_info_maxsz) -#define encode_lockowner_maxsz (7) +#define encode_lockowner_maxsz (2 + 1 + lock_owner_id_maxsz) + #define encode_lock_maxsz (op_encode_hdr_maxsz + \ 7 + \ 1 + encode_stateid_maxsz + 1 + \ encode_lockowner_maxsz) #define decode_lock_denied_maxsz \ - (8 + decode_lockowner_maxsz) + (2 + 2 + 1 + 2 + 1 + lock_owner_id_maxsz) #define decode_lock_maxsz (op_decode_hdr_maxsz + \ decode_lock_denied_maxsz) #define encode_lockt_maxsz (op_encode_hdr_maxsz + 5 + \ @@ -275,14 +297,14 @@ static int nfs4_stat_to_errno(int); #define decode_delegreturn_maxsz (op_decode_hdr_maxsz) #define encode_getacl_maxsz (encode_getattr_maxsz) #define decode_getacl_maxsz (op_decode_hdr_maxsz + \ - nfs4_fattr_bitmap_maxsz + 1) + nfs4_fattr_bitmap_maxsz + 1 + pagepad_maxsz) #define encode_setacl_maxsz (op_encode_hdr_maxsz + \ encode_stateid_maxsz + 3) #define decode_setacl_maxsz (decode_setattr_maxsz) #define encode_fs_locations_maxsz \ (encode_getattr_maxsz) #define decode_fs_locations_maxsz \ - (0) + (pagepad_maxsz) #define encode_secinfo_maxsz (op_encode_hdr_maxsz + nfs4_name_maxsz) #define decode_secinfo_maxsz (op_decode_hdr_maxsz + 1 + ((NFS_MAX_SECFLAVORS * (16 + GSS_OID_MAX_LEN)) / 4)) @@ -383,12 +405,14 @@ static int nfs4_stat_to_errno(int); 1 /* opaque devaddr4 length */ + \ /* devaddr4 payload is read into page */ \ 1 /* notification bitmap length */ + \ - 1 /* notification bitmap, word 0 */) + 1 /* notification bitmap, word 0 */ + \ + pagepad_maxsz /* possible XDR padding */) #define encode_layoutget_maxsz (op_encode_hdr_maxsz + 10 + \ encode_stateid_maxsz) #define decode_layoutget_maxsz (op_decode_hdr_maxsz + 8 + \ decode_stateid_maxsz + \ - XDR_QUADLEN(PNFS_LAYOUT_MAXSIZE)) + XDR_QUADLEN(PNFS_LAYOUT_MAXSIZE) + \ + pagepad_maxsz) #define encode_layoutcommit_maxsz (op_encode_hdr_maxsz + \ 2 /* offset */ + \ 2 /* length */ + \ @@ -422,6 +446,8 @@ static int nfs4_stat_to_errno(int); #define decode_sequence_maxsz 0 #define encode_layoutreturn_maxsz 0 #define decode_layoutreturn_maxsz 0 +#define encode_layoutget_maxsz 0 +#define decode_layoutget_maxsz 0 #endif /* CONFIG_NFS_V4_1 */ #define NFS4_enc_compound_sz (1024) /* XXX: large enough? */ @@ -474,14 +500,16 @@ static int nfs4_stat_to_errno(int); encode_open_maxsz + \ encode_access_maxsz + \ encode_getfh_maxsz + \ - encode_getattr_maxsz) + encode_getattr_maxsz + \ + encode_layoutget_maxsz) #define NFS4_dec_open_sz (compound_decode_hdr_maxsz + \ decode_sequence_maxsz + \ decode_putfh_maxsz + \ decode_open_maxsz + \ decode_access_maxsz + \ decode_getfh_maxsz + \ - decode_getattr_maxsz) + decode_getattr_maxsz + \ + decode_layoutget_maxsz) #define NFS4_enc_open_confirm_sz \ (compound_encode_hdr_maxsz + \ encode_putfh_maxsz + \ @@ -495,13 +523,15 @@ static int nfs4_stat_to_errno(int); encode_putfh_maxsz + \ encode_open_maxsz + \ encode_access_maxsz + \ - encode_getattr_maxsz) + encode_getattr_maxsz + \ + encode_layoutget_maxsz) #define NFS4_dec_open_noattr_sz (compound_decode_hdr_maxsz + \ decode_sequence_maxsz + \ decode_putfh_maxsz + \ decode_open_maxsz + \ decode_access_maxsz + \ - decode_getattr_maxsz) + decode_getattr_maxsz + \ + decode_layoutget_maxsz) #define NFS4_enc_open_downgrade_sz \ (compound_encode_hdr_maxsz + \ encode_sequence_maxsz + \ @@ -587,7 +617,7 @@ static int nfs4_stat_to_errno(int); encode_lockowner_maxsz) #define NFS4_dec_release_lockowner_sz \ (compound_decode_hdr_maxsz + \ - decode_lockowner_maxsz) + decode_release_lockowner_maxsz) #define NFS4_enc_access_sz (compound_encode_hdr_maxsz + \ encode_sequence_maxsz + \ encode_putfh_maxsz + \ @@ -730,12 +760,14 @@ static int nfs4_stat_to_errno(int); encode_sequence_maxsz + \ encode_putfh_maxsz + \ encode_layoutreturn_maxsz + \ + encode_delegattr_maxsz + \ encode_delegreturn_maxsz + \ encode_getattr_maxsz) #define NFS4_dec_delegreturn_sz (compound_decode_hdr_maxsz + \ decode_sequence_maxsz + \ decode_putfh_maxsz + \ decode_layoutreturn_maxsz + \ + decode_delegattr_maxsz + \ decode_delegreturn_maxsz + \ decode_getattr_maxsz) #define NFS4_enc_getacl_sz (compound_encode_hdr_maxsz + \ @@ -821,6 +853,7 @@ static int nfs4_stat_to_errno(int); #define NFS4_dec_sequence_sz \ (compound_decode_hdr_maxsz + \ decode_sequence_maxsz) +#endif #define NFS4_enc_get_lease_time_sz (compound_encode_hdr_maxsz + \ encode_sequence_maxsz + \ encode_putrootfh_maxsz + \ @@ -829,6 +862,7 @@ static int nfs4_stat_to_errno(int); decode_sequence_maxsz + \ decode_putrootfh_maxsz + \ decode_fsinfo_maxsz) +#if defined(CONFIG_NFS_V4_1) #define NFS4_enc_reclaim_complete_sz (compound_encode_hdr_maxsz + \ encode_sequence_maxsz + \ encode_reclaim_complete_maxsz) @@ -938,11 +972,6 @@ static __be32 *reserve_space(struct xdr_stream *xdr, size_t nbytes) return p; } -static void encode_opaque_fixed(struct xdr_stream *xdr, const void *buf, size_t len) -{ - WARN_ON_ONCE(xdr_stream_encode_opaque_fixed(xdr, buf, len) < 0); -} - static void encode_string(struct xdr_stream *xdr, unsigned int len, const char *str) { WARN_ON_ONCE(xdr_stream_encode_opaque(xdr, str, len) < 0); @@ -958,6 +987,35 @@ static void encode_uint64(struct xdr_stream *xdr, u64 n) WARN_ON_ONCE(xdr_stream_encode_u64(xdr, n) < 0); } +static ssize_t xdr_encode_bitmap4(struct xdr_stream *xdr, + const __u32 *bitmap, size_t len) +{ + ssize_t ret; + + /* Trim empty words */ + while (len > 0 && bitmap[len-1] == 0) + len--; + ret = xdr_stream_encode_uint32_array(xdr, bitmap, len); + if (WARN_ON_ONCE(ret < 0)) + return ret; + return len; +} + +static size_t mask_bitmap4(const __u32 *bitmap, const __u32 *mask, + __u32 *res, size_t len) +{ + size_t i; + __u32 tmp; + + while (len > 0 && (bitmap[len-1] == 0 || mask[len-1] == 0)) + len--; + for (i = len; i-- > 0;) { + tmp = bitmap[i] & mask[i]; + res[i] = tmp; + } + return len; +} + static void encode_nfs4_seqid(struct xdr_stream *xdr, const struct nfs_seqid *seqid) { @@ -972,12 +1030,11 @@ static void encode_compound_hdr(struct xdr_stream *xdr, struct compound_hdr *hdr) { __be32 *p; - struct rpc_auth *auth = req->rq_cred->cr_auth; /* initialize running count of expected bytes in reply. * NOTE: the replied tag SHOULD be the same is the one sent, * but this is not required as a MUST for the server to do so. */ - hdr->replen = RPC_REPHDRSIZE + auth->au_rslack + 3 + hdr->taglen; + hdr->replen = 3 + hdr->taglen; WARN_ON_ONCE(hdr->taglen > NFS4_MAXTAGLEN); encode_string(xdr, hdr->taglen, hdr->tag); @@ -1002,9 +1059,10 @@ static void encode_nops(struct compound_hdr *hdr) *hdr->nops_p = htonl(hdr->nops); } -static void encode_nfs4_stateid(struct xdr_stream *xdr, const nfs4_stateid *stateid) +static void encode_nfs4_stateid(struct xdr_stream *xdr, + const nfs4_stateid *stateid) { - encode_opaque_fixed(xdr, stateid, NFS4_STATEID_SIZE); + encode_opaque_fixed(xdr, stateid->data, NFS4_STATEID_SIZE); } static void encode_nfs4_verifier(struct xdr_stream *xdr, const nfs4_verifier *verf) @@ -1012,6 +1070,14 @@ static void encode_nfs4_verifier(struct xdr_stream *xdr, const nfs4_verifier *ve encode_opaque_fixed(xdr, verf->data, NFS4_VERIFIER_SIZE); } +static __be32 * +xdr_encode_nfstime4(__be32 *p, const struct timespec64 *t) +{ + p = xdr_encode_hyper(p, t->tv_sec); + *p++ = cpu_to_be32(t->tv_nsec); + return p; +} + static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const struct nfs4_label *label, const umode_t *umask, @@ -1023,9 +1089,7 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, int owner_namelen = 0; int owner_grouplen = 0; __be32 *p; - unsigned i; uint32_t len = 0; - uint32_t bmval_len; uint32_t bmval[3] = { 0 }; /* @@ -1073,7 +1137,7 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, if (attrmask[1] & FATTR4_WORD1_TIME_ACCESS_SET) { if (iap->ia_valid & ATTR_ATIME_SET) { bmval[1] |= FATTR4_WORD1_TIME_ACCESS_SET; - len += 16; + len += 4 + (nfstime4_maxsz << 2); } else if (iap->ia_valid & ATTR_ATIME) { bmval[1] |= FATTR4_WORD1_TIME_ACCESS_SET; len += 4; @@ -1082,7 +1146,7 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, if (attrmask[1] & FATTR4_WORD1_TIME_MODIFY_SET) { if (iap->ia_valid & ATTR_MTIME_SET) { bmval[1] |= FATTR4_WORD1_TIME_MODIFY_SET; - len += 16; + len += 4 + (nfstime4_maxsz << 2); } else if (iap->ia_valid & ATTR_MTIME) { bmval[1] |= FATTR4_WORD1_TIME_MODIFY_SET; len += 4; @@ -1094,19 +1158,8 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, bmval[2] |= FATTR4_WORD2_SECURITY_LABEL; } - if (bmval[2] != 0) - bmval_len = 3; - else if (bmval[1] != 0) - bmval_len = 2; - else - bmval_len = 1; - - p = reserve_space(xdr, 4 + (bmval_len << 2) + 4 + len); - - *p++ = cpu_to_be32(bmval_len); - for (i = 0; i < bmval_len; i++) - *p++ = cpu_to_be32(bmval[i]); - *p++ = cpu_to_be32(len); + xdr_encode_bitmap4(xdr, bmval, ARRAY_SIZE(bmval)); + xdr_stream_encode_opaque_inline(xdr, (void **)&p, len); if (bmval[0] & FATTR4_WORD0_SIZE) p = xdr_encode_hyper(p, iap->ia_size); @@ -1119,20 +1172,18 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, if (bmval[1] & FATTR4_WORD1_TIME_ACCESS_SET) { if (iap->ia_valid & ATTR_ATIME_SET) { *p++ = cpu_to_be32(NFS4_SET_TO_CLIENT_TIME); - p = xdr_encode_hyper(p, (s64)iap->ia_atime.tv_sec); - *p++ = cpu_to_be32(iap->ia_atime.tv_nsec); + p = xdr_encode_nfstime4(p, &iap->ia_atime); } else *p++ = cpu_to_be32(NFS4_SET_TO_SERVER_TIME); } if (bmval[1] & FATTR4_WORD1_TIME_MODIFY_SET) { if (iap->ia_valid & ATTR_MTIME_SET) { *p++ = cpu_to_be32(NFS4_SET_TO_CLIENT_TIME); - p = xdr_encode_hyper(p, (s64)iap->ia_mtime.tv_sec); - *p++ = cpu_to_be32(iap->ia_mtime.tv_nsec); + p = xdr_encode_nfstime4(p, &iap->ia_mtime); } else *p++ = cpu_to_be32(NFS4_SET_TO_SERVER_TIME); } - if (bmval[2] & FATTR4_WORD2_SECURITY_LABEL) { + if (label && (bmval[2] & FATTR4_WORD2_SECURITY_LABEL)) { *p++ = cpu_to_be32(label->lfs); *p++ = cpu_to_be32(label->pi); *p++ = cpu_to_be32(label->len); @@ -1200,85 +1251,45 @@ static void encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg * create->server, create->server->attr_bitmask); } -static void encode_getattr_one(struct xdr_stream *xdr, uint32_t bitmap, struct compound_hdr *hdr) -{ - __be32 *p; - - encode_op_hdr(xdr, OP_GETATTR, decode_getattr_maxsz, hdr); - p = reserve_space(xdr, 8); - *p++ = cpu_to_be32(1); - *p = cpu_to_be32(bitmap); -} - -static void encode_getattr_two(struct xdr_stream *xdr, uint32_t bm0, uint32_t bm1, struct compound_hdr *hdr) -{ - __be32 *p; - - encode_op_hdr(xdr, OP_GETATTR, decode_getattr_maxsz, hdr); - p = reserve_space(xdr, 12); - *p++ = cpu_to_be32(2); - *p++ = cpu_to_be32(bm0); - *p = cpu_to_be32(bm1); -} - -static void -encode_getattr_three(struct xdr_stream *xdr, - uint32_t bm0, uint32_t bm1, uint32_t bm2, - struct compound_hdr *hdr) +static void encode_getattr(struct xdr_stream *xdr, + const __u32 *bitmap, const __u32 *mask, size_t len, + struct compound_hdr *hdr) { - __be32 *p; + __u32 masked_bitmap[nfs4_fattr_bitmap_maxsz]; encode_op_hdr(xdr, OP_GETATTR, decode_getattr_maxsz, hdr); - if (bm2) { - p = reserve_space(xdr, 16); - *p++ = cpu_to_be32(3); - *p++ = cpu_to_be32(bm0); - *p++ = cpu_to_be32(bm1); - *p = cpu_to_be32(bm2); - } else if (bm1) { - p = reserve_space(xdr, 12); - *p++ = cpu_to_be32(2); - *p++ = cpu_to_be32(bm0); - *p = cpu_to_be32(bm1); - } else { - p = reserve_space(xdr, 8); - *p++ = cpu_to_be32(1); - *p = cpu_to_be32(bm0); + if (mask) { + if (WARN_ON_ONCE(len > ARRAY_SIZE(masked_bitmap))) + len = ARRAY_SIZE(masked_bitmap); + len = mask_bitmap4(bitmap, mask, masked_bitmap, len); + bitmap = masked_bitmap; } + xdr_encode_bitmap4(xdr, bitmap, len); } static void encode_getfattr(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr) { - encode_getattr_three(xdr, bitmask[0] & nfs4_fattr_bitmap[0], - bitmask[1] & nfs4_fattr_bitmap[1], - bitmask[2] & nfs4_fattr_bitmap[2], - hdr); + encode_getattr(xdr, nfs4_fattr_bitmap, bitmask, + ARRAY_SIZE(nfs4_fattr_bitmap), hdr); } static void encode_getfattr_open(struct xdr_stream *xdr, const u32 *bitmask, const u32 *open_bitmap, struct compound_hdr *hdr) { - encode_getattr_three(xdr, - bitmask[0] & open_bitmap[0], - bitmask[1] & open_bitmap[1], - bitmask[2] & open_bitmap[2], - hdr); + encode_getattr(xdr, open_bitmap, bitmask, 3, hdr); } static void encode_fsinfo(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr) { - encode_getattr_three(xdr, - bitmask[0] & nfs4_fsinfo_bitmap[0], - bitmask[1] & nfs4_fsinfo_bitmap[1], - bitmask[2] & nfs4_fsinfo_bitmap[2], - hdr); + encode_getattr(xdr, nfs4_fsinfo_bitmap, bitmask, + ARRAY_SIZE(nfs4_fsinfo_bitmap), hdr); } static void encode_fs_locations(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr) { - encode_getattr_two(xdr, bitmask[0] & nfs4_fs_locations_bitmap[0], - bitmask[1] & nfs4_fs_locations_bitmap[1], hdr); + encode_getattr(xdr, nfs4_fs_locations_bitmap, bitmask, + ARRAY_SIZE(nfs4_fs_locations_bitmap), hdr); } static void encode_getfh(struct xdr_stream *xdr, struct compound_hdr *hdr) @@ -1294,7 +1305,7 @@ static void encode_link(struct xdr_stream *xdr, const struct qstr *name, struct static inline int nfs4_lock_type(struct file_lock *fl, int block) { - if (fl->fl_type == F_RDLCK) + if (lock_is_read(fl)) return block ? NFS4_READW_LT : NFS4_READ_LT; return block ? NFS4_WRITEW_LT : NFS4_WRITE_LT; } @@ -1401,16 +1412,16 @@ static inline void encode_openhdr(struct xdr_stream *xdr, const struct nfs_opena __be32 *p; /* * opcode 4, seqid 4, share_access 4, share_deny 4, clientid 8, ownerlen 4, - * owner 4 = 32 + * owner 28 */ encode_nfs4_seqid(xdr, arg->seqid); encode_share_access(xdr, arg->share_access); - p = reserve_space(xdr, 36); + p = reserve_space(xdr, 40); p = xdr_encode_hyper(p, arg->clientid); - *p++ = cpu_to_be32(24); + *p++ = cpu_to_be32(28); p = xdr_encode_opaque_fixed(p, "open id:", 8); *p++ = cpu_to_be32(arg->server->s_dev); - *p++ = cpu_to_be32(arg->id.uniquifier); + p = xdr_encode_hyper(p, arg->id.uniquifier); xdr_encode_hyper(p, arg->id.create_time); } @@ -1457,20 +1468,18 @@ static void encode_opentype(struct xdr_stream *xdr, const struct nfs_openargs *a } } -static inline void encode_delegation_type(struct xdr_stream *xdr, fmode_t delegation_type) +static inline void encode_delegation_type(struct xdr_stream *xdr, u32 delegation_type) { __be32 *p; p = reserve_space(xdr, 4); switch (delegation_type) { - case 0: - *p = cpu_to_be32(NFS4_OPEN_DELEGATE_NONE); - break; - case FMODE_READ: - *p = cpu_to_be32(NFS4_OPEN_DELEGATE_READ); - break; - case FMODE_WRITE|FMODE_READ: - *p = cpu_to_be32(NFS4_OPEN_DELEGATE_WRITE); + case NFS4_OPEN_DELEGATE_NONE: + case NFS4_OPEN_DELEGATE_READ: + case NFS4_OPEN_DELEGATE_WRITE: + case NFS4_OPEN_DELEGATE_READ_ATTRS_DELEG: + case NFS4_OPEN_DELEGATE_WRITE_ATTRS_DELEG: + *p = cpu_to_be32(delegation_type); break; default: BUG(); @@ -1486,7 +1495,7 @@ static inline void encode_claim_null(struct xdr_stream *xdr, const struct qstr * encode_string(xdr, name->len, name->name); } -static inline void encode_claim_previous(struct xdr_stream *xdr, fmode_t type) +static inline void encode_claim_previous(struct xdr_stream *xdr, u32 type) { __be32 *p; @@ -1591,23 +1600,33 @@ static void encode_read(struct xdr_stream *xdr, const struct nfs_pgio_args *args static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg *readdir, struct rpc_rqst *req, struct compound_hdr *hdr) { uint32_t attrs[3] = { - FATTR4_WORD0_RDATTR_ERROR, + FATTR4_WORD0_TYPE + | FATTR4_WORD0_RDATTR_ERROR, FATTR4_WORD1_MOUNTED_ON_FILEID, }; - uint32_t dircount = readdir->count >> 1; + uint32_t dircount = readdir->count; + uint32_t maxcount = readdir->count; __be32 *p, verf[2]; uint32_t attrlen = 0; unsigned int i; if (readdir->plus) { - attrs[0] |= FATTR4_WORD0_TYPE|FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE| - FATTR4_WORD0_FSID|FATTR4_WORD0_FILEHANDLE|FATTR4_WORD0_FILEID; - attrs[1] |= FATTR4_WORD1_MODE|FATTR4_WORD1_NUMLINKS|FATTR4_WORD1_OWNER| - FATTR4_WORD1_OWNER_GROUP|FATTR4_WORD1_RAWDEV| - FATTR4_WORD1_SPACE_USED|FATTR4_WORD1_TIME_ACCESS| - FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY; + attrs[0] |= FATTR4_WORD0_CHANGE + | FATTR4_WORD0_SIZE + | FATTR4_WORD0_FSID + | FATTR4_WORD0_FILEHANDLE + | FATTR4_WORD0_FILEID; + attrs[1] |= FATTR4_WORD1_MODE + | FATTR4_WORD1_NUMLINKS + | FATTR4_WORD1_OWNER + | FATTR4_WORD1_OWNER_GROUP + | FATTR4_WORD1_RAWDEV + | FATTR4_WORD1_SPACE_USED + | FATTR4_WORD1_TIME_ACCESS + | FATTR4_WORD1_TIME_CREATE + | FATTR4_WORD1_TIME_METADATA + | FATTR4_WORD1_TIME_MODIFY; attrs[2] |= FATTR4_WORD2_SECURITY_LABEL; - dircount >>= 1; } /* Use mounted_on_fileid only if the server supports it */ if (!(readdir->bitmask[1] & FATTR4_WORD1_MOUNTED_ON_FILEID)) @@ -1623,7 +1642,7 @@ static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg encode_nfs4_verifier(xdr, &readdir->verifier); p = reserve_space(xdr, 12 + (attrlen << 2)); *p++ = cpu_to_be32(dircount); - *p++ = cpu_to_be32(readdir->count); + *p++ = cpu_to_be32(maxcount); *p++ = cpu_to_be32(attrlen); for (i = 0; i < attrlen; i++) *p++ = cpu_to_be32(attrs[i]); @@ -1669,19 +1688,35 @@ encode_restorefh(struct xdr_stream *xdr, struct compound_hdr *hdr) encode_op_hdr(xdr, OP_RESTOREFH, decode_restorefh_maxsz, hdr); } -static void -encode_setacl(struct xdr_stream *xdr, const struct nfs_setaclargs *arg, - struct compound_hdr *hdr) +static void nfs4_acltype_to_bitmap(enum nfs4_acl_type type, __u32 bitmap[2]) { - __be32 *p; + switch (type) { + default: + bitmap[0] = FATTR4_WORD0_ACL; + bitmap[1] = 0; + break; + case NFS4ACL_DACL: + bitmap[0] = 0; + bitmap[1] = FATTR4_WORD1_DACL; + break; + case NFS4ACL_SACL: + bitmap[0] = 0; + bitmap[1] = FATTR4_WORD1_SACL; + } +} + +static void encode_setacl(struct xdr_stream *xdr, + const struct nfs_setaclargs *arg, + struct compound_hdr *hdr) +{ + __u32 bitmap[2]; + + nfs4_acltype_to_bitmap(arg->acl_type, bitmap); encode_op_hdr(xdr, OP_SETATTR, decode_setacl_maxsz, hdr); encode_nfs4_stateid(xdr, &zero_stateid); - p = reserve_space(xdr, 2*4); - *p++ = cpu_to_be32(1); - *p = cpu_to_be32(FATTR4_WORD0_ACL); - p = reserve_space(xdr, 4); - *p = cpu_to_be32(arg->acl_len); + xdr_encode_bitmap4(xdr, bitmap, ARRAY_SIZE(bitmap)); + encode_uint32(xdr, arg->acl_len); xdr_write_pages(xdr, arg->acl_pages, 0, arg->acl_len); } @@ -1699,6 +1734,33 @@ static void encode_setattr(struct xdr_stream *xdr, const struct nfs_setattrargs server->attr_bitmask); } +static void encode_delegattr(struct xdr_stream *xdr, + const nfs4_stateid *stateid, + const struct nfs4_delegattr *attr, + struct compound_hdr *hdr) +{ + uint32_t bitmap[3] = { 0 }; + uint32_t len = 0; + __be32 *p; + + encode_op_hdr(xdr, OP_SETATTR, encode_delegattr_maxsz, hdr); + encode_nfs4_stateid(xdr, stateid); + if (attr->atime_set) { + bitmap[2] |= FATTR4_WORD2_TIME_DELEG_ACCESS; + len += (nfstime4_maxsz << 2); + } + if (attr->mtime_set) { + bitmap[2] |= FATTR4_WORD2_TIME_DELEG_MODIFY; + len += (nfstime4_maxsz << 2); + } + xdr_encode_bitmap4(xdr, bitmap, ARRAY_SIZE(bitmap)); + xdr_stream_encode_opaque_inline(xdr, (void **)&p, len); + if (bitmap[2] & FATTR4_WORD2_TIME_DELEG_ACCESS) + p = xdr_encode_nfstime4(p, &attr->atime); + if (bitmap[2] & FATTR4_WORD2_TIME_DELEG_MODIFY) + p = xdr_encode_nfstime4(p, &attr->mtime); +} + static void encode_setclientid(struct xdr_stream *xdr, const struct nfs4_setclientid *setclientid, struct compound_hdr *hdr) { __be32 *p; @@ -1842,8 +1904,8 @@ static void encode_create_session(struct xdr_stream *xdr, * Assumes OPEN is the biggest non-idempotent compound. * 2 is the verifier. */ - max_resp_sz_cached = (NFS4_dec_open_sz + RPC_REPHDRSIZE + - RPC_MAX_AUTH_SIZE + 2) * XDR_UNIT; + max_resp_sz_cached = (NFS4_dec_open_sz + RPC_REPHDRSIZE + 2) + * XDR_UNIT + RPC_MAX_AUTH_SIZE; encode_op_hdr(xdr, OP_CREATE_SESSION, decode_create_session_maxsz, hdr); p = reserve_space(xdr, 16 + 2*28 + 20 + clnt->cl_nodelen + 12); @@ -2069,7 +2131,7 @@ static void encode_test_stateid(struct xdr_stream *xdr, { encode_op_hdr(xdr, OP_TEST_STATEID, decode_test_stateid_maxsz, hdr); encode_uint32(xdr, 1); - encode_nfs4_stateid(xdr, args->stateid); + encode_nfs4_stateid(xdr, &args->stateid); } static void encode_free_stateid(struct xdr_stream *xdr, @@ -2086,6 +2148,13 @@ encode_layoutreturn(struct xdr_stream *xdr, struct compound_hdr *hdr) { } + +static void +encode_layoutget(struct xdr_stream *xdr, + const struct nfs4_layoutget_args *args, + struct compound_hdr *hdr) +{ +} #endif /* CONFIG_NFS_V4_1 */ /* @@ -2117,7 +2186,8 @@ static void nfs4_xdr_enc_access(struct rpc_rqst *req, struct xdr_stream *xdr, encode_sequence(xdr, &args->seq_args, &hdr); encode_putfh(xdr, args->fh, &hdr); encode_access(xdr, args->access, &hdr); - encode_getfattr(xdr, args->bitmask, &hdr); + if (args->bitmask) + encode_getfattr(xdr, args->bitmask, &hdr); encode_nops(&hdr); } @@ -2331,6 +2401,12 @@ static void nfs4_xdr_enc_open(struct rpc_rqst *req, struct xdr_stream *xdr, if (args->access) encode_access(xdr, args->access, &hdr); encode_getfattr_open(xdr, args->bitmask, args->open_bitmap, &hdr); + if (args->lg_args) { + encode_layoutget(xdr, args->lg_args, &hdr); + rpc_prepare_reply_pages(req, args->lg_args->layout.pages, 0, + args->lg_args->layout.pglen, + hdr.replen - pagepad_maxsz); + } encode_nops(&hdr); } @@ -2371,6 +2447,12 @@ static void nfs4_xdr_enc_open_noattr(struct rpc_rqst *req, if (args->access) encode_access(xdr, args->access, &hdr); encode_getfattr_open(xdr, args->bitmask, args->open_bitmap, &hdr); + if (args->lg_args) { + encode_layoutget(xdr, args->lg_args, &hdr); + rpc_prepare_reply_pages(req, args->lg_args->layout.pages, 0, + args->lg_args->layout.pglen, + hdr.replen - pagepad_maxsz); + } encode_nops(&hdr); } @@ -2479,8 +2561,8 @@ static void nfs4_xdr_enc_readlink(struct rpc_rqst *req, struct xdr_stream *xdr, encode_putfh(xdr, args->fh, &hdr); encode_readlink(xdr, args, req, &hdr); - xdr_inline_pages(&req->rq_rcv_buf, hdr.replen << 2, args->pages, - args->pgbase, args->pglen); + rpc_prepare_reply_pages(req, args->pages, args->pgbase, + args->pglen, hdr.replen - pagepad_maxsz); encode_nops(&hdr); } @@ -2500,11 +2582,8 @@ static void nfs4_xdr_enc_readdir(struct rpc_rqst *req, struct xdr_stream *xdr, encode_putfh(xdr, args->fh, &hdr); encode_readdir(xdr, args, req, &hdr); - xdr_inline_pages(&req->rq_rcv_buf, hdr.replen << 2, args->pages, - args->pgbase, args->count); - dprintk("%s: inlined page args = (%u, %p, %u, %u)\n", - __func__, hdr.replen << 2, args->pages, - args->pgbase, args->count); + rpc_prepare_reply_pages(req, args->pages, args->pgbase, + args->count, hdr.replen - pagepad_maxsz); encode_nops(&hdr); } @@ -2524,8 +2603,8 @@ static void nfs4_xdr_enc_read(struct rpc_rqst *req, struct xdr_stream *xdr, encode_putfh(xdr, args->fh, &hdr); encode_read(xdr, args, &hdr); - xdr_inline_pages(&req->rq_rcv_buf, hdr.replen << 2, - args->pages, args->pgbase, args->count); + rpc_prepare_reply_pages(req, args->pages, args->pgbase, + args->count, hdr.replen - pagepad_maxsz); req->rq_rcv_buf.flags |= XDRBUF_READ; encode_nops(&hdr); } @@ -2559,17 +2638,20 @@ static void nfs4_xdr_enc_getacl(struct rpc_rqst *req, struct xdr_stream *xdr, struct compound_hdr hdr = { .minorversion = nfs4_xdr_minorversion(&args->seq_args), }; + __u32 nfs4_acl_bitmap[2]; uint32_t replen; + nfs4_acltype_to_bitmap(args->acl_type, nfs4_acl_bitmap); + encode_compound_hdr(xdr, req, &hdr); encode_sequence(xdr, &args->seq_args, &hdr); encode_putfh(xdr, args->fh, &hdr); replen = hdr.replen + op_decode_hdr_maxsz; - encode_getattr_two(xdr, FATTR4_WORD0_ACL, 0, &hdr); - - xdr_inline_pages(&req->rq_rcv_buf, replen << 2, - args->acl_pages, 0, args->acl_len); + encode_getattr(xdr, nfs4_acl_bitmap, NULL, + ARRAY_SIZE(nfs4_acl_bitmap), &hdr); + rpc_prepare_reply_pages(req, args->acl_pages, 0, + args->acl_len, replen); encode_nops(&hdr); } @@ -2644,8 +2726,8 @@ static void nfs4_xdr_enc_pathconf(struct rpc_rqst *req, struct xdr_stream *xdr, encode_compound_hdr(xdr, req, &hdr); encode_sequence(xdr, &args->seq_args, &hdr); encode_putfh(xdr, args->fh, &hdr); - encode_getattr_one(xdr, args->bitmask[0] & nfs4_pathconf_bitmap[0], - &hdr); + encode_getattr(xdr, nfs4_pathconf_bitmap, args->bitmask, + ARRAY_SIZE(nfs4_pathconf_bitmap), &hdr); encode_nops(&hdr); } @@ -2663,8 +2745,8 @@ static void nfs4_xdr_enc_statfs(struct rpc_rqst *req, struct xdr_stream *xdr, encode_compound_hdr(xdr, req, &hdr); encode_sequence(xdr, &args->seq_args, &hdr); encode_putfh(xdr, args->fh, &hdr); - encode_getattr_two(xdr, args->bitmask[0] & nfs4_statfs_bitmap[0], - args->bitmask[1] & nfs4_statfs_bitmap[1], &hdr); + encode_getattr(xdr, nfs4_statfs_bitmap, args->bitmask, + ARRAY_SIZE(nfs4_statfs_bitmap), &hdr); encode_nops(&hdr); } @@ -2684,7 +2766,7 @@ static void nfs4_xdr_enc_server_caps(struct rpc_rqst *req, encode_compound_hdr(xdr, req, &hdr); encode_sequence(xdr, &args->seq_args, &hdr); encode_putfh(xdr, args->fhandle, &hdr); - encode_getattr_three(xdr, bitmask[0], bitmask[1], bitmask[2], &hdr); + encode_getattr(xdr, bitmask, NULL, 3, &hdr); encode_nops(&hdr); } @@ -2756,6 +2838,8 @@ static void nfs4_xdr_enc_delegreturn(struct rpc_rqst *req, encode_putfh(xdr, args->fhandle, &hdr); if (args->lr_args) encode_layoutreturn(xdr, args->lr_args, &hdr); + if (args->sattr_args) + encode_delegattr(xdr, args->stateid, args->sattr_args, &hdr); if (args->bitmask) encode_getfattr(xdr, args->bitmask, &hdr); encode_delegreturn(xdr, args->stateid, &hdr); @@ -2790,9 +2874,8 @@ static void nfs4_xdr_enc_fs_locations(struct rpc_rqst *req, encode_fs_locations(xdr, args->bitmask, &hdr); } - /* Set up reply kvec to capture returned fs_locations array. */ - xdr_inline_pages(&req->rq_rcv_buf, replen << 2, - (struct page **)&args->page, 0, PAGE_SIZE); + rpc_prepare_reply_pages(req, (struct page **)&args->page, 0, + PAGE_SIZE, replen); encode_nops(&hdr); } @@ -2938,6 +3021,8 @@ static void nfs4_xdr_enc_sequence(struct rpc_rqst *req, struct xdr_stream *xdr, encode_nops(&hdr); } +#endif + /* * a GET_LEASE_TIME request */ @@ -2958,6 +3043,8 @@ static void nfs4_xdr_enc_get_lease_time(struct rpc_rqst *req, encode_nops(&hdr); } +#ifdef CONFIG_NFS_V4_1 + /* * a RECLAIM_COMPLETE request */ @@ -2987,17 +3074,19 @@ static void nfs4_xdr_enc_getdeviceinfo(struct rpc_rqst *req, struct compound_hdr hdr = { .minorversion = nfs4_xdr_minorversion(&args->seq_args), }; + uint32_t replen; encode_compound_hdr(xdr, req, &hdr); encode_sequence(xdr, &args->seq_args, &hdr); - encode_getdeviceinfo(xdr, args, &hdr); - /* set up reply kvec. Subtract notification bitmap max size (2) - * so that notification bitmap is put in xdr_buf tail */ - xdr_inline_pages(&req->rq_rcv_buf, (hdr.replen - 2) << 2, - args->pdev->pages, args->pdev->pgbase, - args->pdev->pglen); + replen = hdr.replen + op_decode_hdr_maxsz + 2; + encode_getdeviceinfo(xdr, args, &hdr); + + /* set up reply kvec. device_addr4 opaque data is read into the + * pages */ + rpc_prepare_reply_pages(req, args->pdev->pages, args->pdev->pgbase, + args->pdev->pglen, replen); encode_nops(&hdr); } @@ -3018,9 +3107,8 @@ static void nfs4_xdr_enc_layoutget(struct rpc_rqst *req, encode_putfh(xdr, NFS_FH(args->inode), &hdr); encode_layoutget(xdr, args, &hdr); - xdr_inline_pages(&req->rq_rcv_buf, hdr.replen << 2, - args->layout.pages, 0, args->layout.pglen); - + rpc_prepare_reply_pages(req, args->layout.pages, 0, + args->layout.pglen, hdr.replen - pagepad_maxsz); encode_nops(&hdr); } @@ -3121,48 +3209,38 @@ static void nfs4_xdr_enc_free_stateid(struct rpc_rqst *req, } #endif /* CONFIG_NFS_V4_1 */ -static void print_overflow_msg(const char *func, const struct xdr_stream *xdr) -{ - dprintk("nfs: %s: prematurely hit end of receive buffer. " - "Remaining buffer length is %tu words.\n", - func, xdr->end - xdr->p); -} - static int decode_opaque_inline(struct xdr_stream *xdr, unsigned int *len, char **string) { ssize_t ret = xdr_stream_decode_opaque_inline(xdr, (void **)string, NFS4_OPAQUE_LIMIT); - if (unlikely(ret < 0)) { - if (ret == -EBADMSG) - print_overflow_msg(__func__, xdr); + if (unlikely(ret < 0)) return -EIO; - } *len = ret; return 0; } static int decode_compound_hdr(struct xdr_stream *xdr, struct compound_hdr *hdr) { - __be32 *p; + ssize_t ret; + void *ptr; + u32 tmp; - p = xdr_inline_decode(xdr, 8); - if (unlikely(!p)) - goto out_overflow; - hdr->status = be32_to_cpup(p++); - hdr->taglen = be32_to_cpup(p); + if (xdr_stream_decode_u32(xdr, &tmp) < 0) + return -EIO; + hdr->status = tmp; - p = xdr_inline_decode(xdr, hdr->taglen + 4); - if (unlikely(!p)) - goto out_overflow; - hdr->tag = (char *)p; - p += XDR_QUADLEN(hdr->taglen); - hdr->nops = be32_to_cpup(p); + ret = xdr_stream_decode_opaque_inline(xdr, &ptr, NFS4_OPAQUE_LIMIT); + if (ret < 0) + return -EIO; + hdr->taglen = ret; + hdr->tag = ptr; + + if (xdr_stream_decode_u32(xdr, &tmp) < 0) + return -EIO; + hdr->nops = tmp; if (unlikely(hdr->nops < 1)) return nfs4_stat_to_errno(hdr->status); return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static bool __decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected, @@ -3178,20 +3256,20 @@ static bool __decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected, opnum = be32_to_cpup(p++); if (unlikely(opnum != expected)) goto out_bad_operation; + if (unlikely(*p != cpu_to_be32(NFS_OK))) + goto out_status; + *nfs_retval = 0; + return true; +out_status: nfserr = be32_to_cpup(p); - if (nfserr == NFS_OK) - *nfs_retval = 0; - else - *nfs_retval = nfs4_stat_to_errno(nfserr); + trace_nfs4_xdr_status(xdr, opnum, nfserr); + *nfs_retval = nfs4_stat_to_errno(nfserr); return true; out_bad_operation: - dprintk("nfs: Server returned operation" - " %d but we issued a request for %d\n", - opnum, expected); + trace_nfs4_xdr_bad_operation(xdr, opnum, expected); *nfs_retval = -EREMOTEIO; return false; out_overflow: - print_overflow_msg(__func__, xdr); *nfs_retval = -EIO; return false; } @@ -3212,38 +3290,29 @@ static int decode_ace(struct xdr_stream *xdr, void *ace) char *str; p = xdr_inline_decode(xdr, 12); - if (likely(p)) - return decode_opaque_inline(xdr, &strlen, &str); - print_overflow_msg(__func__, xdr); - return -EIO; + if (unlikely(!p)) + return -EIO; + return decode_opaque_inline(xdr, &strlen, &str); } -static int decode_attr_bitmap(struct xdr_stream *xdr, uint32_t *bitmap) +static ssize_t +decode_bitmap4(struct xdr_stream *xdr, uint32_t *bitmap, size_t sz) { - uint32_t bmlen; - __be32 *p; + ssize_t ret; - p = xdr_inline_decode(xdr, 4); - if (unlikely(!p)) - goto out_overflow; - bmlen = be32_to_cpup(p); + ret = xdr_stream_decode_uint32_array(xdr, bitmap, sz); + if (likely(ret >= 0)) + return ret; + if (ret != -EMSGSIZE) + return -EIO; + return sz; +} - bitmap[0] = bitmap[1] = bitmap[2] = 0; - p = xdr_inline_decode(xdr, (bmlen << 2)); - if (unlikely(!p)) - goto out_overflow; - if (bmlen > 0) { - bitmap[0] = be32_to_cpup(p++); - if (bmlen > 1) { - bitmap[1] = be32_to_cpup(p++); - if (bmlen > 2) - bitmap[2] = be32_to_cpup(p); - } - } - return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; +static int decode_attr_bitmap(struct xdr_stream *xdr, uint32_t *bitmap) +{ + ssize_t ret; + ret = decode_bitmap4(xdr, bitmap, 3); + return ret < 0 ? ret : 0; } static int decode_attr_length(struct xdr_stream *xdr, uint32_t *attrlen, unsigned int *savep) @@ -3252,13 +3321,10 @@ static int decode_attr_length(struct xdr_stream *xdr, uint32_t *attrlen, unsigne p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; *attrlen = be32_to_cpup(p); *savep = xdr_stream_pos(xdr); return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_attr_supported(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *bitmask) @@ -3287,7 +3353,7 @@ static int decode_attr_type(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t * if (likely(bitmap[0] & FATTR4_WORD0_TYPE)) { p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; *type = be32_to_cpup(p); if (*type < NF4REG || *type > NF4NAMEDATTR) { dprintk("%s: bad type %d\n", __func__, *type); @@ -3298,9 +3364,6 @@ static int decode_attr_type(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t * } dprintk("%s: type=0%o\n", __func__, nfs_type2fmt[*type]); return ret; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_attr_fh_expire_type(struct xdr_stream *xdr, @@ -3314,15 +3377,12 @@ static int decode_attr_fh_expire_type(struct xdr_stream *xdr, if (likely(bitmap[0] & FATTR4_WORD0_FH_EXPIRE_TYPE)) { p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; *type = be32_to_cpup(p); bitmap[0] &= ~FATTR4_WORD0_FH_EXPIRE_TYPE; } dprintk("%s: expire type=0x%x\n", __func__, *type); return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_attr_change(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *change) @@ -3336,7 +3396,7 @@ static int decode_attr_change(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t if (likely(bitmap[0] & FATTR4_WORD0_CHANGE)) { p = xdr_inline_decode(xdr, 8); if (unlikely(!p)) - goto out_overflow; + return -EIO; xdr_decode_hyper(p, change); bitmap[0] &= ~FATTR4_WORD0_CHANGE; ret = NFS_ATTR_FATTR_CHANGE; @@ -3344,9 +3404,6 @@ static int decode_attr_change(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t dprintk("%s: change attribute=%Lu\n", __func__, (unsigned long long)*change); return ret; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_attr_size(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *size) @@ -3360,16 +3417,13 @@ static int decode_attr_size(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t * if (likely(bitmap[0] & FATTR4_WORD0_SIZE)) { p = xdr_inline_decode(xdr, 8); if (unlikely(!p)) - goto out_overflow; + return -EIO; xdr_decode_hyper(p, size); bitmap[0] &= ~FATTR4_WORD0_SIZE; ret = NFS_ATTR_FATTR_SIZE; } dprintk("%s: file size=%Lu\n", __func__, (unsigned long long)*size); return ret; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_attr_link_support(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) @@ -3382,15 +3436,12 @@ static int decode_attr_link_support(struct xdr_stream *xdr, uint32_t *bitmap, ui if (likely(bitmap[0] & FATTR4_WORD0_LINK_SUPPORT)) { p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; *res = be32_to_cpup(p); bitmap[0] &= ~FATTR4_WORD0_LINK_SUPPORT; } - dprintk("%s: link support=%s\n", __func__, *res == 0 ? "false" : "true"); + dprintk("%s: link support=%s\n", __func__, str_false_true(*res == 0)); return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_attr_symlink_support(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) @@ -3403,15 +3454,12 @@ static int decode_attr_symlink_support(struct xdr_stream *xdr, uint32_t *bitmap, if (likely(bitmap[0] & FATTR4_WORD0_SYMLINK_SUPPORT)) { p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; *res = be32_to_cpup(p); bitmap[0] &= ~FATTR4_WORD0_SYMLINK_SUPPORT; } - dprintk("%s: symlink support=%s\n", __func__, *res == 0 ? "false" : "true"); + dprintk("%s: symlink support=%s\n", __func__, str_false_true(*res == 0)); return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_fsid *fsid) @@ -3426,7 +3474,7 @@ static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs if (likely(bitmap[0] & FATTR4_WORD0_FSID)) { p = xdr_inline_decode(xdr, 16); if (unlikely(!p)) - goto out_overflow; + return -EIO; p = xdr_decode_hyper(p, &fsid->major); xdr_decode_hyper(p, &fsid->minor); bitmap[0] &= ~FATTR4_WORD0_FSID; @@ -3436,9 +3484,6 @@ static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs (unsigned long long)fsid->major, (unsigned long long)fsid->minor); return ret; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_attr_lease_time(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) @@ -3451,15 +3496,12 @@ static int decode_attr_lease_time(struct xdr_stream *xdr, uint32_t *bitmap, uint if (likely(bitmap[0] & FATTR4_WORD0_LEASE_TIME)) { p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; *res = be32_to_cpup(p); bitmap[0] &= ~FATTR4_WORD0_LEASE_TIME; } - dprintk("%s: file size=%u\n", __func__, (unsigned int)*res); + dprintk("%s: lease time=%u\n", __func__, (unsigned int)*res); return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_attr_error(struct xdr_stream *xdr, uint32_t *bitmap, int32_t *res) @@ -3471,14 +3513,11 @@ static int decode_attr_error(struct xdr_stream *xdr, uint32_t *bitmap, int32_t * if (likely(bitmap[0] & FATTR4_WORD0_RDATTR_ERROR)) { p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; bitmap[0] &= ~FATTR4_WORD0_RDATTR_ERROR; *res = -be32_to_cpup(p); } return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_attr_exclcreat_supported(struct xdr_stream *xdr, @@ -3500,7 +3539,7 @@ static int decode_attr_exclcreat_supported(struct xdr_stream *xdr, static int decode_attr_filehandle(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_fh *fh) { __be32 *p; - int len; + u32 len; if (fh != NULL) memset(fh, 0, sizeof(*fh)); @@ -3510,13 +3549,16 @@ static int decode_attr_filehandle(struct xdr_stream *xdr, uint32_t *bitmap, stru if (likely(bitmap[0] & FATTR4_WORD0_FILEHANDLE)) { p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; - len = be32_to_cpup(p); - if (len > NFS4_FHSIZE) return -EIO; + len = be32_to_cpup(p); + if (len > NFS4_FHSIZE || len == 0) { + trace_nfs4_xdr_bad_filehandle(xdr, OP_READDIR, + NFS4ERR_BADHANDLE); + return -EREMOTEIO; + } p = xdr_inline_decode(xdr, len); if (unlikely(!p)) - goto out_overflow; + return -EIO; if (fh != NULL) { memcpy(fh->data, p, len); fh->size = len; @@ -3524,9 +3566,6 @@ static int decode_attr_filehandle(struct xdr_stream *xdr, uint32_t *bitmap, stru bitmap[0] &= ~FATTR4_WORD0_FILEHANDLE; } return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_attr_aclsupport(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) @@ -3539,15 +3578,48 @@ static int decode_attr_aclsupport(struct xdr_stream *xdr, uint32_t *bitmap, uint if (likely(bitmap[0] & FATTR4_WORD0_ACLSUPPORT)) { p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; *res = be32_to_cpup(p); bitmap[0] &= ~FATTR4_WORD0_ACLSUPPORT; } dprintk("%s: ACLs supported=%u\n", __func__, (unsigned int)*res); return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; +} + +static int decode_attr_case_insensitive(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) +{ + __be32 *p; + + *res = 0; + if (unlikely(bitmap[0] & (FATTR4_WORD0_CASE_INSENSITIVE - 1U))) + return -EIO; + if (likely(bitmap[0] & FATTR4_WORD0_CASE_INSENSITIVE)) { + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + return -EIO; + *res = be32_to_cpup(p); + bitmap[0] &= ~FATTR4_WORD0_CASE_INSENSITIVE; + } + dprintk("%s: case_insensitive=%s\n", __func__, str_false_true(*res == 0)); + return 0; +} + +static int decode_attr_case_preserving(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) +{ + __be32 *p; + + *res = 0; + if (unlikely(bitmap[0] & (FATTR4_WORD0_CASE_PRESERVING - 1U))) + return -EIO; + if (likely(bitmap[0] & FATTR4_WORD0_CASE_PRESERVING)) { + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + return -EIO; + *res = be32_to_cpup(p); + bitmap[0] &= ~FATTR4_WORD0_CASE_PRESERVING; + } + dprintk("%s: case_preserving=%s\n", __func__, str_false_true(*res == 0)); + return 0; } static int decode_attr_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *fileid) @@ -3561,16 +3633,13 @@ static int decode_attr_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t if (likely(bitmap[0] & FATTR4_WORD0_FILEID)) { p = xdr_inline_decode(xdr, 8); if (unlikely(!p)) - goto out_overflow; + return -EIO; xdr_decode_hyper(p, fileid); bitmap[0] &= ~FATTR4_WORD0_FILEID; ret = NFS_ATTR_FATTR_FILEID; } dprintk("%s: fileid=%Lu\n", __func__, (unsigned long long)*fileid); return ret; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_attr_mounted_on_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *fileid) @@ -3584,16 +3653,13 @@ static int decode_attr_mounted_on_fileid(struct xdr_stream *xdr, uint32_t *bitma if (likely(bitmap[1] & FATTR4_WORD1_MOUNTED_ON_FILEID)) { p = xdr_inline_decode(xdr, 8); if (unlikely(!p)) - goto out_overflow; + return -EIO; xdr_decode_hyper(p, fileid); bitmap[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID; ret = NFS_ATTR_FATTR_MOUNTED_ON_FILEID; } dprintk("%s: fileid=%Lu\n", __func__, (unsigned long long)*fileid); return ret; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_attr_files_avail(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res) @@ -3607,15 +3673,12 @@ static int decode_attr_files_avail(struct xdr_stream *xdr, uint32_t *bitmap, uin if (likely(bitmap[0] & FATTR4_WORD0_FILES_AVAIL)) { p = xdr_inline_decode(xdr, 8); if (unlikely(!p)) - goto out_overflow; + return -EIO; xdr_decode_hyper(p, res); bitmap[0] &= ~FATTR4_WORD0_FILES_AVAIL; } dprintk("%s: files avail=%Lu\n", __func__, (unsigned long long)*res); return status; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_attr_files_free(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res) @@ -3629,15 +3692,12 @@ static int decode_attr_files_free(struct xdr_stream *xdr, uint32_t *bitmap, uint if (likely(bitmap[0] & FATTR4_WORD0_FILES_FREE)) { p = xdr_inline_decode(xdr, 8); if (unlikely(!p)) - goto out_overflow; + return -EIO; xdr_decode_hyper(p, res); bitmap[0] &= ~FATTR4_WORD0_FILES_FREE; } dprintk("%s: files free=%Lu\n", __func__, (unsigned long long)*res); return status; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_attr_files_total(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res) @@ -3651,15 +3711,12 @@ static int decode_attr_files_total(struct xdr_stream *xdr, uint32_t *bitmap, uin if (likely(bitmap[0] & FATTR4_WORD0_FILES_TOTAL)) { p = xdr_inline_decode(xdr, 8); if (unlikely(!p)) - goto out_overflow; + return -EIO; xdr_decode_hyper(p, res); bitmap[0] &= ~FATTR4_WORD0_FILES_TOTAL; } dprintk("%s: files total=%Lu\n", __func__, (unsigned long long)*res); return status; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_pathname(struct xdr_stream *xdr, struct nfs4_pathname *path) @@ -3670,7 +3727,7 @@ static int decode_pathname(struct xdr_stream *xdr, struct nfs4_pathname *path) p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; n = be32_to_cpup(p); if (n == 0) goto root_path; @@ -3702,9 +3759,6 @@ out_eio: dprintk(" status %d", status); status = -EIO; goto out; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs4_fs_locations *res) @@ -3729,10 +3783,8 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st goto out; p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; - n = be32_to_cpup(p); - if (n <= 0) goto out_eio; + n = be32_to_cpup(p); for (res->nlocations = 0; res->nlocations < n; res->nlocations++) { u32 m; struct nfs4_fs_location *loc; @@ -3742,7 +3794,7 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st loc = &res->locations[res->nlocations]; p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + goto out_eio; m = be32_to_cpup(p); dprintk("%s: servers:\n", __func__); @@ -3780,8 +3832,6 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st out: dprintk("%s: fs_locations done, error = %d\n", __func__, status); return status; -out_overflow: - print_overflow_msg(__func__, xdr); out_eio: status = -EIO; goto out; @@ -3798,15 +3848,12 @@ static int decode_attr_maxfilesize(struct xdr_stream *xdr, uint32_t *bitmap, uin if (likely(bitmap[0] & FATTR4_WORD0_MAXFILESIZE)) { p = xdr_inline_decode(xdr, 8); if (unlikely(!p)) - goto out_overflow; + return -EIO; xdr_decode_hyper(p, res); bitmap[0] &= ~FATTR4_WORD0_MAXFILESIZE; } dprintk("%s: maxfilesize=%Lu\n", __func__, (unsigned long long)*res); return status; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_attr_maxlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *maxlink) @@ -3820,15 +3867,12 @@ static int decode_attr_maxlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_ if (likely(bitmap[0] & FATTR4_WORD0_MAXLINK)) { p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; *maxlink = be32_to_cpup(p); bitmap[0] &= ~FATTR4_WORD0_MAXLINK; } dprintk("%s: maxlink=%u\n", __func__, *maxlink); return status; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_attr_maxname(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *maxname) @@ -3842,15 +3886,12 @@ static int decode_attr_maxname(struct xdr_stream *xdr, uint32_t *bitmap, uint32_ if (likely(bitmap[0] & FATTR4_WORD0_MAXNAME)) { p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; *maxname = be32_to_cpup(p); bitmap[0] &= ~FATTR4_WORD0_MAXNAME; } dprintk("%s: maxname=%u\n", __func__, *maxname); return status; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_attr_maxread(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) @@ -3865,7 +3906,7 @@ static int decode_attr_maxread(struct xdr_stream *xdr, uint32_t *bitmap, uint32_ uint64_t maxread; p = xdr_inline_decode(xdr, 8); if (unlikely(!p)) - goto out_overflow; + return -EIO; xdr_decode_hyper(p, &maxread); if (maxread > 0x7FFFFFFF) maxread = 0x7FFFFFFF; @@ -3874,9 +3915,6 @@ static int decode_attr_maxread(struct xdr_stream *xdr, uint32_t *bitmap, uint32_ } dprintk("%s: maxread=%lu\n", __func__, (unsigned long)*res); return status; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_attr_maxwrite(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) @@ -3891,7 +3929,7 @@ static int decode_attr_maxwrite(struct xdr_stream *xdr, uint32_t *bitmap, uint32 uint64_t maxwrite; p = xdr_inline_decode(xdr, 8); if (unlikely(!p)) - goto out_overflow; + return -EIO; xdr_decode_hyper(p, &maxwrite); if (maxwrite > 0x7FFFFFFF) maxwrite = 0x7FFFFFFF; @@ -3900,9 +3938,6 @@ static int decode_attr_maxwrite(struct xdr_stream *xdr, uint32_t *bitmap, uint32 } dprintk("%s: maxwrite=%lu\n", __func__, (unsigned long)*res); return status; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_attr_mode(struct xdr_stream *xdr, uint32_t *bitmap, umode_t *mode) @@ -3917,7 +3952,7 @@ static int decode_attr_mode(struct xdr_stream *xdr, uint32_t *bitmap, umode_t *m if (likely(bitmap[1] & FATTR4_WORD1_MODE)) { p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; tmp = be32_to_cpup(p); *mode = tmp & ~S_IFMT; bitmap[1] &= ~FATTR4_WORD1_MODE; @@ -3925,9 +3960,6 @@ static int decode_attr_mode(struct xdr_stream *xdr, uint32_t *bitmap, umode_t *m } dprintk("%s: file mode=0%o\n", __func__, (unsigned int)*mode); return ret; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_attr_nlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *nlink) @@ -3941,16 +3973,13 @@ static int decode_attr_nlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t if (likely(bitmap[1] & FATTR4_WORD1_NUMLINKS)) { p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; *nlink = be32_to_cpup(p); bitmap[1] &= ~FATTR4_WORD1_NUMLINKS; ret = NFS_ATTR_FATTR_NLINK; } dprintk("%s: nlink=%u\n", __func__, (unsigned int)*nlink); return ret; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static ssize_t decode_nfs4_string(struct xdr_stream *xdr, @@ -3981,7 +4010,7 @@ static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, bitmap[1] &= ~FATTR4_WORD1_OWNER; if (owner_name != NULL) { - len = decode_nfs4_string(xdr, owner_name, GFP_NOWAIT); + len = decode_nfs4_string(xdr, owner_name, GFP_NOIO); if (len <= 0) goto out; dprintk("%s: name=%s\n", __func__, owner_name->data); @@ -3995,10 +4024,9 @@ static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, return NFS_ATTR_FATTR_OWNER; } out: - if (len != -EBADMSG) - return 0; - print_overflow_msg(__func__, xdr); - return -EIO; + if (len == -EBADMSG) + return -EIO; + return 0; } static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, @@ -4016,7 +4044,7 @@ static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, bitmap[1] &= ~FATTR4_WORD1_OWNER_GROUP; if (group_name != NULL) { - len = decode_nfs4_string(xdr, group_name, GFP_NOWAIT); + len = decode_nfs4_string(xdr, group_name, GFP_NOIO); if (len <= 0) goto out; dprintk("%s: name=%s\n", __func__, group_name->data); @@ -4030,10 +4058,9 @@ static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, return NFS_ATTR_FATTR_GROUP; } out: - if (len != -EBADMSG) - return 0; - print_overflow_msg(__func__, xdr); - return -EIO; + if (len == -EBADMSG) + return -EIO; + return 0; } static int decode_attr_rdev(struct xdr_stream *xdr, uint32_t *bitmap, dev_t *rdev) @@ -4050,7 +4077,7 @@ static int decode_attr_rdev(struct xdr_stream *xdr, uint32_t *bitmap, dev_t *rde p = xdr_inline_decode(xdr, 8); if (unlikely(!p)) - goto out_overflow; + return -EIO; major = be32_to_cpup(p++); minor = be32_to_cpup(p); tmp = MKDEV(major, minor); @@ -4061,9 +4088,6 @@ static int decode_attr_rdev(struct xdr_stream *xdr, uint32_t *bitmap, dev_t *rde } dprintk("%s: rdev=(0x%x:0x%x)\n", __func__, major, minor); return ret; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_attr_space_avail(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res) @@ -4077,15 +4101,12 @@ static int decode_attr_space_avail(struct xdr_stream *xdr, uint32_t *bitmap, uin if (likely(bitmap[1] & FATTR4_WORD1_SPACE_AVAIL)) { p = xdr_inline_decode(xdr, 8); if (unlikely(!p)) - goto out_overflow; + return -EIO; xdr_decode_hyper(p, res); bitmap[1] &= ~FATTR4_WORD1_SPACE_AVAIL; } dprintk("%s: space avail=%Lu\n", __func__, (unsigned long long)*res); return status; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_attr_space_free(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res) @@ -4099,15 +4120,12 @@ static int decode_attr_space_free(struct xdr_stream *xdr, uint32_t *bitmap, uint if (likely(bitmap[1] & FATTR4_WORD1_SPACE_FREE)) { p = xdr_inline_decode(xdr, 8); if (unlikely(!p)) - goto out_overflow; + return -EIO; xdr_decode_hyper(p, res); bitmap[1] &= ~FATTR4_WORD1_SPACE_FREE; } dprintk("%s: space free=%Lu\n", __func__, (unsigned long long)*res); return status; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_attr_space_total(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res) @@ -4121,15 +4139,12 @@ static int decode_attr_space_total(struct xdr_stream *xdr, uint32_t *bitmap, uin if (likely(bitmap[1] & FATTR4_WORD1_SPACE_TOTAL)) { p = xdr_inline_decode(xdr, 8); if (unlikely(!p)) - goto out_overflow; + return -EIO; xdr_decode_hyper(p, res); bitmap[1] &= ~FATTR4_WORD1_SPACE_TOTAL; } dprintk("%s: space total=%Lu\n", __func__, (unsigned long long)*res); return status; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_attr_space_used(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *used) @@ -4143,7 +4158,7 @@ static int decode_attr_space_used(struct xdr_stream *xdr, uint32_t *bitmap, uint if (likely(bitmap[1] & FATTR4_WORD1_SPACE_USED)) { p = xdr_inline_decode(xdr, 8); if (unlikely(!p)) - goto out_overflow; + return -EIO; xdr_decode_hyper(p, used); bitmap[1] &= ~FATTR4_WORD1_SPACE_USED; ret = NFS_ATTR_FATTR_SPACE_USED; @@ -4151,31 +4166,31 @@ static int decode_attr_space_used(struct xdr_stream *xdr, uint32_t *bitmap, uint dprintk("%s: space used=%Lu\n", __func__, (unsigned long long)*used); return ret; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } -static int decode_attr_time(struct xdr_stream *xdr, struct timespec *time) +static __be32 * +xdr_decode_nfstime4(__be32 *p, struct timespec64 *t) +{ + __u64 sec; + + p = xdr_decode_hyper(p, &sec); + t-> tv_sec = sec; + t->tv_nsec = be32_to_cpup(p++); + return p; +} + +static int decode_attr_time(struct xdr_stream *xdr, struct timespec64 *time) { __be32 *p; - uint64_t sec; - uint32_t nsec; - p = xdr_inline_decode(xdr, 12); + p = xdr_inline_decode(xdr, nfstime4_maxsz << 2); if (unlikely(!p)) - goto out_overflow; - p = xdr_decode_hyper(p, &sec); - nsec = be32_to_cpup(p); - time->tv_sec = (time_t)sec; - time->tv_nsec = (long)nsec; + return -EIO; + xdr_decode_nfstime4(p, time); return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } -static int decode_attr_time_access(struct xdr_stream *xdr, uint32_t *bitmap, struct timespec *time) +static int decode_attr_time_access(struct xdr_stream *xdr, uint32_t *bitmap, struct timespec64 *time) { int status = 0; @@ -4189,11 +4204,29 @@ static int decode_attr_time_access(struct xdr_stream *xdr, uint32_t *bitmap, str status = NFS_ATTR_FATTR_ATIME; bitmap[1] &= ~FATTR4_WORD1_TIME_ACCESS; } - dprintk("%s: atime=%ld\n", __func__, (long)time->tv_sec); + dprintk("%s: atime=%lld\n", __func__, time->tv_sec); return status; } -static int decode_attr_time_metadata(struct xdr_stream *xdr, uint32_t *bitmap, struct timespec *time) +static int decode_attr_time_create(struct xdr_stream *xdr, uint32_t *bitmap, struct timespec64 *time) +{ + int status = 0; + + time->tv_sec = 0; + time->tv_nsec = 0; + if (unlikely(bitmap[1] & (FATTR4_WORD1_TIME_CREATE - 1U))) + return -EIO; + if (likely(bitmap[1] & FATTR4_WORD1_TIME_CREATE)) { + status = decode_attr_time(xdr, time); + if (status == 0) + status = NFS_ATTR_FATTR_BTIME; + bitmap[1] &= ~FATTR4_WORD1_TIME_CREATE; + } + dprintk("%s: btime=%lld\n", __func__, time->tv_sec); + return status; +} + +static int decode_attr_time_metadata(struct xdr_stream *xdr, uint32_t *bitmap, struct timespec64 *time) { int status = 0; @@ -4207,12 +4240,12 @@ static int decode_attr_time_metadata(struct xdr_stream *xdr, uint32_t *bitmap, s status = NFS_ATTR_FATTR_CTIME; bitmap[1] &= ~FATTR4_WORD1_TIME_METADATA; } - dprintk("%s: ctime=%ld\n", __func__, (long)time->tv_sec); + dprintk("%s: ctime=%lld\n", __func__, time->tv_sec); return status; } static int decode_attr_time_delta(struct xdr_stream *xdr, uint32_t *bitmap, - struct timespec *time) + struct timespec64 *time) { int status = 0; @@ -4224,8 +4257,8 @@ static int decode_attr_time_delta(struct xdr_stream *xdr, uint32_t *bitmap, status = decode_attr_time(xdr, time); bitmap[1] &= ~FATTR4_WORD1_TIME_DELTA; } - dprintk("%s: time_delta=%ld %ld\n", __func__, (long)time->tv_sec, - (long)time->tv_nsec); + dprintk("%s: time_delta=%lld %ld\n", __func__, time->tv_sec, + time->tv_nsec); return status; } @@ -4243,43 +4276,42 @@ static int decode_attr_security_label(struct xdr_stream *xdr, uint32_t *bitmap, if (likely(bitmap[2] & FATTR4_WORD2_SECURITY_LABEL)) { p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; lfs = be32_to_cpup(p++); p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; pi = be32_to_cpup(p++); p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; len = be32_to_cpup(p++); p = xdr_inline_decode(xdr, len); if (unlikely(!p)) - goto out_overflow; + return -EIO; + bitmap[2] &= ~FATTR4_WORD2_SECURITY_LABEL; if (len < NFS4_MAXLABELLEN) { - if (label) { + if (label && label->len) { + if (label->len < len) + return -ERANGE; memcpy(label->label, p, len); label->len = len; label->pi = pi; label->lfs = lfs; status = NFS_ATTR_FATTR_V4_SECURITY_LABEL; } - bitmap[2] &= ~FATTR4_WORD2_SECURITY_LABEL; } else printk(KERN_WARNING "%s: label too long (%u)!\n", __func__, len); + if (label && label->label) + dprintk("%s: label=%.*s, len=%d, PI=%d, LFS=%d\n", + __func__, label->len, (char *)label->label, + label->len, label->pi, label->lfs); } - if (label && label->label) - dprintk("%s: label=%s, len=%d, PI=%d, LFS=%d\n", __func__, - (char *)label->label, label->len, label->pi, label->lfs); return status; - -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } -static int decode_attr_time_modify(struct xdr_stream *xdr, uint32_t *bitmap, struct timespec *time) +static int decode_attr_time_modify(struct xdr_stream *xdr, uint32_t *bitmap, struct timespec64 *time) { int status = 0; @@ -4293,10 +4325,51 @@ static int decode_attr_time_modify(struct xdr_stream *xdr, uint32_t *bitmap, str status = NFS_ATTR_FATTR_MTIME; bitmap[1] &= ~FATTR4_WORD1_TIME_MODIFY; } - dprintk("%s: mtime=%ld\n", __func__, (long)time->tv_sec); + dprintk("%s: mtime=%lld\n", __func__, time->tv_sec); return status; } +static int decode_attr_xattrsupport(struct xdr_stream *xdr, uint32_t *bitmap, + uint32_t *res) +{ + __be32 *p; + + *res = 0; + if (unlikely(bitmap[2] & (FATTR4_WORD2_XATTR_SUPPORT - 1U))) + return -EIO; + if (likely(bitmap[2] & FATTR4_WORD2_XATTR_SUPPORT)) { + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + return -EIO; + *res = be32_to_cpup(p); + bitmap[2] &= ~FATTR4_WORD2_XATTR_SUPPORT; + } + dprintk("%s: XATTR support=%s\n", __func__, str_false_true(*res == 0)); + return 0; +} + +static int decode_attr_open_arguments(struct xdr_stream *xdr, uint32_t *bitmap, + struct nfs4_open_caps *res) +{ + memset(res, 0, sizeof(*res)); + if (unlikely(bitmap[2] & (FATTR4_WORD2_OPEN_ARGUMENTS - 1U))) + return -EIO; + if (likely(bitmap[2] & FATTR4_WORD2_OPEN_ARGUMENTS)) { + if (decode_bitmap4(xdr, res->oa_share_access, ARRAY_SIZE(res->oa_share_access)) < 0) + return -EIO; + if (decode_bitmap4(xdr, res->oa_share_deny, ARRAY_SIZE(res->oa_share_deny)) < 0) + return -EIO; + if (decode_bitmap4(xdr, res->oa_share_access_want, ARRAY_SIZE(res->oa_share_access_want)) < 0) + return -EIO; + if (decode_bitmap4(xdr, res->oa_open_claim, ARRAY_SIZE(res->oa_open_claim)) < 0) + return -EIO; + if (decode_bitmap4(xdr, res->oa_createmode, ARRAY_SIZE(res->oa_createmode)) < 0) + return -EIO; + bitmap[2] &= ~FATTR4_WORD2_OPEN_ARGUMENTS; + } + return 0; +} + static int verify_attr_len(struct xdr_stream *xdr, unsigned int savep, uint32_t attrlen) { unsigned int attrwords = XDR_QUADLEN(attrlen); @@ -4320,14 +4393,11 @@ static int decode_change_info(struct xdr_stream *xdr, struct nfs4_change_info *c p = xdr_inline_decode(xdr, 20); if (unlikely(!p)) - goto out_overflow; + return -EIO; cinfo->atomic = be32_to_cpup(p++); p = xdr_decode_hyper(p, &cinfo->before); xdr_decode_hyper(p, &cinfo->after); return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_access(struct xdr_stream *xdr, u32 *supported, u32 *access) @@ -4341,25 +4411,12 @@ static int decode_access(struct xdr_stream *xdr, u32 *supported, u32 *access) return status; p = xdr_inline_decode(xdr, 8); if (unlikely(!p)) - goto out_overflow; + return -EIO; supp = be32_to_cpup(p++); acc = be32_to_cpup(p); *supported = supp; *access = acc; return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; -} - -static int decode_opaque_fixed(struct xdr_stream *xdr, void *buf, size_t len) -{ - ssize_t ret = xdr_stream_decode_opaque_fixed(xdr, buf, len); - if (unlikely(ret < 0)) { - print_overflow_msg(__func__, xdr); - return -EIO; - } - return 0; } static int decode_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid) @@ -4385,6 +4442,14 @@ static int decode_delegation_stateid(struct xdr_stream *xdr, nfs4_stateid *state return decode_stateid(xdr, stateid); } +static int decode_invalid_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid) +{ + nfs4_stateid dummy; + + nfs4_stateid_copy(stateid, &invalid_stateid); + return decode_stateid(xdr, &dummy); +} + static int decode_close(struct xdr_stream *xdr, struct nfs_closeres *res) { int status; @@ -4393,7 +4458,7 @@ static int decode_close(struct xdr_stream *xdr, struct nfs_closeres *res) if (status != -EIO) nfs_increment_open_seqid(status, res->seqid); if (!status) - status = decode_open_stateid(xdr, &res->stateid); + status = decode_invalid_stateid(xdr, &res->stateid); return status; } @@ -4409,11 +4474,14 @@ static int decode_write_verifier(struct xdr_stream *xdr, struct nfs_write_verifi static int decode_commit(struct xdr_stream *xdr, struct nfs_commitres *res) { + struct nfs_writeverf *verf = res->verf; int status; status = decode_op_hdr(xdr, OP_COMMIT); if (!status) - status = decode_write_verifier(xdr, &res->verf->verifier); + status = decode_write_verifier(xdr, &verf->verifier); + if (!status) + verf->committed = NFS_FILE_SYNC; return status; } @@ -4430,13 +4498,11 @@ static int decode_create(struct xdr_stream *xdr, struct nfs4_change_info *cinfo) return status; p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; bmlen = be32_to_cpup(p); p = xdr_inline_decode(xdr, bmlen << 2); if (likely(p)) return 0; -out_overflow: - print_overflow_msg(__func__, xdr); return -EIO; } @@ -4463,9 +4529,15 @@ static int decode_server_caps(struct xdr_stream *xdr, struct nfs4_server_caps_re goto xdr_error; if ((status = decode_attr_aclsupport(xdr, bitmap, &res->acl_bitmask)) != 0) goto xdr_error; + if ((status = decode_attr_case_insensitive(xdr, bitmap, &res->case_insensitive)) != 0) + goto xdr_error; + if ((status = decode_attr_case_preserving(xdr, bitmap, &res->case_preserving)) != 0) + goto xdr_error; if ((status = decode_attr_exclcreat_supported(xdr, bitmap, res->exclcreat_bitmask)) != 0) goto xdr_error; + if ((status = decode_attr_open_arguments(xdr, bitmap, &res->open_caps)) != 0) + goto xdr_error; status = verify_attr_len(xdr, savep, attrlen); xdr_error: dprintk("%s: xdr returned %d!\n", __func__, -status); @@ -4544,13 +4616,10 @@ static int decode_threshold_hint(struct xdr_stream *xdr, if (likely(bitmap[0] & hint_bit)) { p = xdr_inline_decode(xdr, 8); if (unlikely(!p)) - goto out_overflow; + return -EIO; xdr_decode_hyper(p, res); } return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_first_threshold_item4(struct xdr_stream *xdr, @@ -4563,10 +4632,8 @@ static int decode_first_threshold_item4(struct xdr_stream *xdr, /* layout type */ p = xdr_inline_decode(xdr, 4); - if (unlikely(!p)) { - print_overflow_msg(__func__, xdr); + if (unlikely(!p)) return -EIO; - } res->l_type = be32_to_cpup(p); /* thi_hintset bitmap */ @@ -4624,7 +4691,7 @@ static int decode_attr_mdsthreshold(struct xdr_stream *xdr, return -EREMOTEIO; p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; num = be32_to_cpup(p); if (num == 0) return 0; @@ -4637,15 +4704,11 @@ static int decode_attr_mdsthreshold(struct xdr_stream *xdr, bitmap[2] &= ~FATTR4_WORD2_MDSTHRESHOLD; } return status; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_fattr *fattr, struct nfs_fh *fh, - struct nfs4_fs_locations *fs_loc, struct nfs4_label *label, - const struct nfs_server *server) + struct nfs4_fs_locations *fs_loc, const struct nfs_server *server) { int status; umode_t fmode = 0; @@ -4737,6 +4800,11 @@ static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap, goto xdr_error; fattr->valid |= status; + status = decode_attr_time_create(xdr, bitmap, &fattr->btime); + if (status < 0) + goto xdr_error; + fattr->valid |= status; + status = decode_attr_time_metadata(xdr, bitmap, &fattr->ctime); if (status < 0) goto xdr_error; @@ -4760,12 +4828,10 @@ static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap, if (status < 0) goto xdr_error; - if (label) { - status = decode_attr_security_label(xdr, bitmap, label); - if (status < 0) - goto xdr_error; - fattr->valid |= status; - } + status = decode_attr_security_label(xdr, bitmap, fattr->label); + if (status < 0) + goto xdr_error; + fattr->valid |= status; xdr_error: dprintk("%s: xdr returned %d\n", __func__, -status); @@ -4774,7 +4840,7 @@ xdr_error: static int decode_getfattr_generic(struct xdr_stream *xdr, struct nfs_fattr *fattr, struct nfs_fh *fh, struct nfs4_fs_locations *fs_loc, - struct nfs4_label *label, const struct nfs_server *server) + const struct nfs_server *server) { unsigned int savep; uint32_t attrlen, @@ -4793,8 +4859,7 @@ static int decode_getfattr_generic(struct xdr_stream *xdr, struct nfs_fattr *fat if (status < 0) goto xdr_error; - status = decode_getfattr_attrs(xdr, bitmap, fattr, fh, fs_loc, - label, server); + status = decode_getfattr_attrs(xdr, bitmap, fattr, fh, fs_loc, server); if (status < 0) goto xdr_error; @@ -4804,16 +4869,10 @@ xdr_error: return status; } -static int decode_getfattr_label(struct xdr_stream *xdr, struct nfs_fattr *fattr, - struct nfs4_label *label, const struct nfs_server *server) -{ - return decode_getfattr_generic(xdr, fattr, NULL, NULL, label, server); -} - static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, const struct nfs_server *server) { - return decode_getfattr_generic(xdr, fattr, NULL, NULL, NULL, server); + return decode_getfattr_generic(xdr, fattr, NULL, NULL, server); } /* @@ -4827,7 +4886,7 @@ static int decode_pnfs_layout_types(struct xdr_stream *xdr, p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; fsinfo->nlayouttypes = be32_to_cpup(p); /* pNFS is not supported by the underlying file system */ @@ -4837,7 +4896,7 @@ static int decode_pnfs_layout_types(struct xdr_stream *xdr, /* Decode and set first layout type, move xdr->p past unused types */ p = xdr_inline_decode(xdr, fsinfo->nlayouttypes * 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; /* If we get too many, then just cap it at the max */ if (fsinfo->nlayouttypes > NFS_MAX_LAYOUT_TYPES) { @@ -4849,9 +4908,6 @@ static int decode_pnfs_layout_types(struct xdr_stream *xdr, for(i = 0; i < fsinfo->nlayouttypes; ++i) fsinfo->layouttype[i] = be32_to_cpup(p++); return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } /* @@ -4874,7 +4930,7 @@ static int decode_attr_pnfstype(struct xdr_stream *xdr, uint32_t *bitmap, } /* - * The prefered block size for layout directed io + * The preferred block size for layout directed io */ static int decode_attr_layout_blksize(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) @@ -4885,10 +4941,8 @@ static int decode_attr_layout_blksize(struct xdr_stream *xdr, uint32_t *bitmap, *res = 0; if (bitmap[2] & FATTR4_WORD2_LAYOUT_BLKSIZE) { p = xdr_inline_decode(xdr, 4); - if (unlikely(!p)) { - print_overflow_msg(__func__, xdr); + if (unlikely(!p)) return -EIO; - } *res = be32_to_cpup(p); bitmap[2] &= ~FATTR4_WORD2_LAYOUT_BLKSIZE; } @@ -4907,16 +4961,40 @@ static int decode_attr_clone_blksize(struct xdr_stream *xdr, uint32_t *bitmap, *res = 0; if (bitmap[2] & FATTR4_WORD2_CLONE_BLKSIZE) { p = xdr_inline_decode(xdr, 4); - if (unlikely(!p)) { - print_overflow_msg(__func__, xdr); + if (unlikely(!p)) return -EIO; - } *res = be32_to_cpup(p); bitmap[2] &= ~FATTR4_WORD2_CLONE_BLKSIZE; } return 0; } +static int decode_attr_change_attr_type(struct xdr_stream *xdr, + uint32_t *bitmap, + enum nfs4_change_attr_type *res) +{ + u32 tmp = NFS4_CHANGE_TYPE_IS_UNDEFINED; + + dprintk("%s: bitmap is %x\n", __func__, bitmap[2]); + if (bitmap[2] & FATTR4_WORD2_CHANGE_ATTR_TYPE) { + if (xdr_stream_decode_u32(xdr, &tmp)) + return -EIO; + bitmap[2] &= ~FATTR4_WORD2_CHANGE_ATTR_TYPE; + } + + switch(tmp) { + case NFS4_CHANGE_TYPE_IS_MONOTONIC_INCR: + case NFS4_CHANGE_TYPE_IS_VERSION_COUNTER: + case NFS4_CHANGE_TYPE_IS_VERSION_COUNTER_NOPNFS: + case NFS4_CHANGE_TYPE_IS_TIME_METADATA: + *res = tmp; + break; + default: + *res = NFS4_CHANGE_TYPE_IS_UNDEFINED; + } + return 0; +} + static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo) { unsigned int savep; @@ -4965,6 +5043,16 @@ static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo) if (status) goto xdr_error; + status = decode_attr_change_attr_type(xdr, bitmap, + &fsinfo->change_attr_type); + if (status) + goto xdr_error; + + status = decode_attr_xattrsupport(xdr, bitmap, + &fsinfo->xattr_support); + if (status) + goto xdr_error; + status = verify_attr_len(xdr, savep, attrlen); xdr_error: dprintk("%s: xdr returned %d!\n", __func__, -status); @@ -4986,19 +5074,18 @@ static int decode_getfh(struct xdr_stream *xdr, struct nfs_fh *fh) p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; - len = be32_to_cpup(p); - if (len > NFS4_FHSIZE) return -EIO; + len = be32_to_cpup(p); + if (len > NFS4_FHSIZE || len == 0) { + trace_nfs4_xdr_bad_filehandle(xdr, OP_GETFH, NFS4ERR_BADHANDLE); + return -EREMOTEIO; + } fh->size = len; p = xdr_inline_decode(xdr, len); if (unlikely(!p)) - goto out_overflow; + return -EIO; memcpy(fh->data, p, len); return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_link(struct xdr_stream *xdr, struct nfs4_change_info *cinfo) @@ -5014,7 +5101,7 @@ static int decode_link(struct xdr_stream *xdr, struct nfs4_change_info *cinfo) /* * We create the owner, so we know a proper owner.id length is 4. */ -static int decode_lock_denied (struct xdr_stream *xdr, struct file_lock *fl) +static int decode_lock_denied(struct xdr_stream *xdr, struct file_lock *fl) { uint64_t offset, length, clientid; __be32 *p; @@ -5022,7 +5109,7 @@ static int decode_lock_denied (struct xdr_stream *xdr, struct file_lock *fl) p = xdr_inline_decode(xdr, 32); /* read 32 bytes */ if (unlikely(!p)) - goto out_overflow; + return -EIO; p = xdr_decode_hyper(p, &offset); /* read 2 8-byte long words */ p = xdr_decode_hyper(p, &length); type = be32_to_cpup(p++); /* 4 byte read */ @@ -5031,19 +5118,17 @@ static int decode_lock_denied (struct xdr_stream *xdr, struct file_lock *fl) fl->fl_end = fl->fl_start + (loff_t)length - 1; if (length == ~(uint64_t)0) fl->fl_end = OFFSET_MAX; - fl->fl_type = F_WRLCK; + fl->c.flc_type = F_WRLCK; if (type & 1) - fl->fl_type = F_RDLCK; - fl->fl_pid = 0; + fl->c.flc_type = F_RDLCK; + fl->c.flc_pid = 0; } p = xdr_decode_hyper(p, &clientid); /* read 8 bytes */ namelen = be32_to_cpup(p); /* read 4 bytes */ /* have read all 32 bytes now */ p = xdr_inline_decode(xdr, namelen); /* variable size field */ - if (likely(p)) - return -NFS4ERR_DENIED; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; + if (likely(!p)) + return -EIO; + return -NFS4ERR_DENIED; } static int decode_lock(struct xdr_stream *xdr, struct nfs_lock_res *res) @@ -5112,7 +5197,7 @@ static int decode_space_limit(struct xdr_stream *xdr, p = xdr_inline_decode(xdr, 12); if (unlikely(!p)) - goto out_overflow; + return -EIO; limit_type = be32_to_cpup(p++); switch (limit_type) { case NFS4_LIMIT_SIZE: @@ -5126,85 +5211,77 @@ static int decode_space_limit(struct xdr_stream *xdr, maxsize >>= PAGE_SHIFT; *pagemod_limit = min_t(u64, maxsize, ULONG_MAX); return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_rw_delegation(struct xdr_stream *xdr, - uint32_t delegation_type, - struct nfs_openres *res) + struct nfs4_open_delegation *res) { __be32 *p; int status; - status = decode_delegation_stateid(xdr, &res->delegation); + status = decode_delegation_stateid(xdr, &res->stateid); if (unlikely(status)) return status; p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; res->do_recall = be32_to_cpup(p); - switch (delegation_type) { + switch (res->open_delegation_type) { case NFS4_OPEN_DELEGATE_READ: - res->delegation_type = FMODE_READ; + case NFS4_OPEN_DELEGATE_READ_ATTRS_DELEG: + res->type = FMODE_READ; break; case NFS4_OPEN_DELEGATE_WRITE: - res->delegation_type = FMODE_WRITE|FMODE_READ; + case NFS4_OPEN_DELEGATE_WRITE_ATTRS_DELEG: + res->type = FMODE_WRITE|FMODE_READ; if (decode_space_limit(xdr, &res->pagemod_limit) < 0) return -EIO; } return decode_ace(xdr, NULL); -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } -static int decode_no_delegation(struct xdr_stream *xdr, struct nfs_openres *res) +static int decode_no_delegation(struct xdr_stream *xdr, + struct nfs4_open_delegation *res) { __be32 *p; - uint32_t why_no_delegation; p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; - why_no_delegation = be32_to_cpup(p); - switch (why_no_delegation) { + return -EIO; + res->why_no_delegation = be32_to_cpup(p); + switch (res->why_no_delegation) { case WND4_CONTENTION: case WND4_RESOURCE: - xdr_inline_decode(xdr, 4); - /* Ignore for now */ + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + return -EIO; + res->will_notify = be32_to_cpup(p); } return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } -static int decode_delegation(struct xdr_stream *xdr, struct nfs_openres *res) +static int decode_delegation(struct xdr_stream *xdr, + struct nfs4_open_delegation *res) { __be32 *p; - uint32_t delegation_type; p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; - delegation_type = be32_to_cpup(p); - res->delegation_type = 0; - switch (delegation_type) { + return -EIO; + res->open_delegation_type = be32_to_cpup(p); + switch (res->open_delegation_type) { case NFS4_OPEN_DELEGATE_NONE: return 0; case NFS4_OPEN_DELEGATE_READ: case NFS4_OPEN_DELEGATE_WRITE: - return decode_rw_delegation(xdr, delegation_type, res); + case NFS4_OPEN_DELEGATE_READ_ATTRS_DELEG: + case NFS4_OPEN_DELEGATE_WRITE_ATTRS_DELEG: + return decode_rw_delegation(xdr, res); case NFS4_OPEN_DELEGATE_NONE_EXT: return decode_no_delegation(xdr, res); } return -EIO; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res) @@ -5226,7 +5303,7 @@ static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res) p = xdr_inline_decode(xdr, 8); if (unlikely(!p)) - goto out_overflow; + return -EIO; res->rflags = be32_to_cpup(p++); bmlen = be32_to_cpup(p); if (bmlen > 10) @@ -5234,20 +5311,17 @@ static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res) p = xdr_inline_decode(xdr, bmlen << 2); if (unlikely(!p)) - goto out_overflow; + return -EIO; savewords = min_t(uint32_t, bmlen, NFS4_BITMAP_SIZE); for (i = 0; i < savewords; ++i) res->attrset[i] = be32_to_cpup(p++); for (; i < NFS4_BITMAP_SIZE; i++) res->attrset[i] = 0; - return decode_delegation(xdr, res); + return decode_delegation(xdr, &res->delegation); xdr_error: dprintk("%s: Bitmap too large! Length = %u\n", __func__, bmlen); return -EIO; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_open_confirm(struct xdr_stream *xdr, struct nfs_open_confirmres *res) @@ -5296,7 +5370,7 @@ static int decode_read(struct xdr_stream *xdr, struct rpc_rqst *req, return status; p = xdr_inline_decode(xdr, 8); if (unlikely(!p)) - goto out_overflow; + return -EIO; eof = be32_to_cpup(p++); count = be32_to_cpup(p); recvd = xdr_read_pages(xdr, count); @@ -5309,9 +5383,6 @@ static int decode_read(struct xdr_stream *xdr, struct rpc_rqst *req, res->eof = eof; res->count = count; return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_readdir(struct xdr_stream *xdr, struct rpc_rqst *req, struct nfs4_readdir_res *readdir) @@ -5344,7 +5415,7 @@ static int decode_readlink(struct xdr_stream *xdr, struct rpc_rqst *req) /* Convert length of symlink */ p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; len = be32_to_cpup(p); if (len >= rcvbuf->page_len || len <= 0) { dprintk("nfs: server returned giant symlink!\n"); @@ -5360,14 +5431,11 @@ static int decode_readlink(struct xdr_stream *xdr, struct rpc_rqst *req) * The XDR encode routine has set things up so that * the link text will be copied directly into the * buffer. We just have to do overflow-checking, - * and and null-terminate the text (the VFS expects + * and null-terminate the text (the VFS expects * null-termination). */ xdr_terminate_string(rcvbuf, len); return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_remove(struct xdr_stream *xdr, struct nfs4_change_info *cinfo) @@ -5409,13 +5477,12 @@ decode_restorefh(struct xdr_stream *xdr) } static int decode_getacl(struct xdr_stream *xdr, struct rpc_rqst *req, - struct nfs_getaclres *res) + struct nfs_getaclres *res, enum nfs4_acl_type type) { unsigned int savep; uint32_t attrlen, bitmap[3] = {0}; int status; - unsigned int pg_offset; res->acl_len = 0; if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) @@ -5423,34 +5490,44 @@ static int decode_getacl(struct xdr_stream *xdr, struct rpc_rqst *req, xdr_enter_page(xdr, xdr->buf->page_len); - /* Calculate the offset of the page data */ - pg_offset = xdr->buf->head[0].iov_len; - if ((status = decode_attr_bitmap(xdr, bitmap)) != 0) goto out; if ((status = decode_attr_length(xdr, &attrlen, &savep)) != 0) goto out; - if (unlikely(bitmap[0] & (FATTR4_WORD0_ACL - 1U))) - return -EIO; - if (likely(bitmap[0] & FATTR4_WORD0_ACL)) { - - /* The bitmap (xdr len + bitmaps) and the attr xdr len words - * are stored with the acl data to handle the problem of - * variable length bitmaps.*/ - res->acl_data_offset = xdr_stream_pos(xdr) - pg_offset; - res->acl_len = attrlen; - - /* Check for receive buffer overflow */ - if (res->acl_len > (xdr->nwords << 2) || - res->acl_len + res->acl_data_offset > xdr->buf->page_len) { - res->acl_flags |= NFS4_ACL_TRUNC; - dprintk("NFS: acl reply: attrlen %u > page_len %u\n", - attrlen, xdr->nwords << 2); - } - } else - status = -EOPNOTSUPP; + switch (type) { + default: + if (unlikely(bitmap[0] & (FATTR4_WORD0_ACL - 1U))) + return -EIO; + if (!(bitmap[0] & FATTR4_WORD0_ACL)) + return -EOPNOTSUPP; + break; + case NFS4ACL_DACL: + if (unlikely(bitmap[0] || bitmap[1] & (FATTR4_WORD1_DACL - 1U))) + return -EIO; + if (!(bitmap[1] & FATTR4_WORD1_DACL)) + return -EOPNOTSUPP; + break; + case NFS4ACL_SACL: + if (unlikely(bitmap[0] || bitmap[1] & (FATTR4_WORD1_SACL - 1U))) + return -EIO; + if (!(bitmap[1] & FATTR4_WORD1_SACL)) + return -EOPNOTSUPP; + } + /* The bitmap (xdr len + bitmaps) and the attr xdr len words + * are stored with the acl data to handle the problem of + * variable length bitmaps.*/ + res->acl_data_offset = xdr_page_pos(xdr); + res->acl_len = attrlen; + + /* Check for receive buffer overflow */ + if (res->acl_len > xdr_stream_remaining(xdr) || + res->acl_len + res->acl_data_offset > xdr->buf->page_len) { + res->acl_flags |= NFS4_ACL_TRUNC; + dprintk("NFS: acl reply: attrlen %u > page_len %zu\n", + attrlen, xdr_stream_remaining(xdr)); + } out: return status; } @@ -5463,25 +5540,21 @@ decode_savefh(struct xdr_stream *xdr) static int decode_setattr(struct xdr_stream *xdr) { - __be32 *p; - uint32_t bmlen; int status; status = decode_op_hdr(xdr, OP_SETATTR); if (status) return status; - p = xdr_inline_decode(xdr, 4); - if (unlikely(!p)) - goto out_overflow; - bmlen = be32_to_cpup(p); - p = xdr_inline_decode(xdr, bmlen << 2); - if (likely(p)) + if (decode_bitmap4(xdr, NULL, 0) >= 0) return 0; -out_overflow: - print_overflow_msg(__func__, xdr); return -EIO; } +static int decode_delegattr(struct xdr_stream *xdr) +{ + return decode_setattr(xdr); +} + static int decode_setclientid(struct xdr_stream *xdr, struct nfs4_setclientid_res *res) { __be32 *p; @@ -5490,7 +5563,7 @@ static int decode_setclientid(struct xdr_stream *xdr, struct nfs4_setclientid_re p = xdr_inline_decode(xdr, 8); if (unlikely(!p)) - goto out_overflow; + return -EIO; opnum = be32_to_cpup(p++); if (opnum != OP_SETCLIENTID) { dprintk("nfs: decode_setclientid: Server returned operation" @@ -5501,7 +5574,7 @@ static int decode_setclientid(struct xdr_stream *xdr, struct nfs4_setclientid_re if (nfserr == NFS_OK) { p = xdr_inline_decode(xdr, 8 + NFS4_VERIFIER_SIZE); if (unlikely(!p)) - goto out_overflow; + return -EIO; p = xdr_decode_hyper(p, &res->clientid); memcpy(res->confirm.data, p, NFS4_VERIFIER_SIZE); } else if (nfserr == NFSERR_CLID_INUSE) { @@ -5510,28 +5583,25 @@ static int decode_setclientid(struct xdr_stream *xdr, struct nfs4_setclientid_re /* skip netid string */ p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; len = be32_to_cpup(p); p = xdr_inline_decode(xdr, len); if (unlikely(!p)) - goto out_overflow; + return -EIO; /* skip uaddr string */ p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; len = be32_to_cpup(p); p = xdr_inline_decode(xdr, len); if (unlikely(!p)) - goto out_overflow; + return -EIO; return -NFSERR_CLID_INUSE; } else return nfs4_stat_to_errno(nfserr); return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_setclientid_confirm(struct xdr_stream *xdr) @@ -5550,13 +5620,10 @@ static int decode_write(struct xdr_stream *xdr, struct nfs_pgio_res *res) p = xdr_inline_decode(xdr, 8); if (unlikely(!p)) - goto out_overflow; + return -EIO; res->count = be32_to_cpup(p++); res->verf->committed = be32_to_cpup(p++); return decode_write_verifier(xdr, &res->verf->verifier); -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_delegreturn(struct xdr_stream *xdr) @@ -5572,30 +5639,24 @@ static int decode_secinfo_gss(struct xdr_stream *xdr, p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; oid_len = be32_to_cpup(p); if (oid_len > GSS_OID_MAX_LEN) - goto out_err; + return -EINVAL; p = xdr_inline_decode(xdr, oid_len); if (unlikely(!p)) - goto out_overflow; + return -EIO; memcpy(flavor->flavor_info.oid.data, p, oid_len); flavor->flavor_info.oid.len = oid_len; p = xdr_inline_decode(xdr, 8); if (unlikely(!p)) - goto out_overflow; + return -EIO; flavor->flavor_info.qop = be32_to_cpup(p++); flavor->flavor_info.service = be32_to_cpup(p); return 0; - -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; -out_err: - return -EINVAL; } static int decode_secinfo_common(struct xdr_stream *xdr, struct nfs4_secinfo_res *res) @@ -5607,7 +5668,7 @@ static int decode_secinfo_common(struct xdr_stream *xdr, struct nfs4_secinfo_res p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; res->flavors->num_flavors = 0; num_flavors = be32_to_cpup(p); @@ -5619,7 +5680,7 @@ static int decode_secinfo_common(struct xdr_stream *xdr, struct nfs4_secinfo_res p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; sec_flavor->flavor = be32_to_cpup(p); if (sec_flavor->flavor == RPC_AUTH_GSS) { @@ -5633,9 +5694,6 @@ static int decode_secinfo_common(struct xdr_stream *xdr, struct nfs4_secinfo_res status = 0; out: return status; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_secinfo(struct xdr_stream *xdr, struct nfs4_secinfo_res *res) @@ -5657,20 +5715,9 @@ static int decode_secinfo_no_name(struct xdr_stream *xdr, struct nfs4_secinfo_re static int decode_op_map(struct xdr_stream *xdr, struct nfs4_op_map *op_map) { - __be32 *p; - uint32_t bitmap_words; - unsigned int i; - - p = xdr_inline_decode(xdr, 4); - if (!p) - return -EIO; - bitmap_words = be32_to_cpup(p++); - if (bitmap_words > NFS4_OP_MAP_NUM_WORDS) + if (xdr_stream_decode_uint32_array(xdr, op_map->u.words, + ARRAY_SIZE(op_map->u.words)) < 0) return -EIO; - p = xdr_inline_decode(xdr, 4 * bitmap_words); - for (i = 0; i < bitmap_words; i++) - op_map->u.words[i] = be32_to_cpup(p++); - return 0; } @@ -5689,11 +5736,11 @@ static int decode_exchange_id(struct xdr_stream *xdr, p = xdr_inline_decode(xdr, 8); if (unlikely(!p)) - goto out_overflow; + return -EIO; xdr_decode_hyper(p, &res->clientid); p = xdr_inline_decode(xdr, 12); if (unlikely(!p)) - goto out_overflow; + return -EIO; res->seqid = be32_to_cpup(p++); res->flags = be32_to_cpup(p++); @@ -5717,7 +5764,7 @@ static int decode_exchange_id(struct xdr_stream *xdr, /* server_owner4.so_minor_id */ p = xdr_inline_decode(xdr, 8); if (unlikely(!p)) - goto out_overflow; + return -EIO; p = xdr_decode_hyper(p, &res->server_owner->minor_id); /* server_owner4.so_major_id */ @@ -5737,7 +5784,7 @@ static int decode_exchange_id(struct xdr_stream *xdr, /* Implementation Id */ p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; impl_id_count = be32_to_cpup(p++); if (impl_id_count) { @@ -5756,16 +5803,13 @@ static int decode_exchange_id(struct xdr_stream *xdr, /* nii_date */ p = xdr_inline_decode(xdr, 12); if (unlikely(!p)) - goto out_overflow; + return -EIO; p = xdr_decode_hyper(p, &res->impl_id->date.seconds); res->impl_id->date.nseconds = be32_to_cpup(p); /* if there's more than one entry, ignore the rest */ } return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_chan_attrs(struct xdr_stream *xdr, @@ -5776,7 +5820,7 @@ static int decode_chan_attrs(struct xdr_stream *xdr, p = xdr_inline_decode(xdr, 28); if (unlikely(!p)) - goto out_overflow; + return -EIO; val = be32_to_cpup(p++); /* headerpadsz */ if (val) return -EINVAL; /* no support for header padding yet */ @@ -5794,12 +5838,9 @@ static int decode_chan_attrs(struct xdr_stream *xdr, if (nr_attrs == 1) { p = xdr_inline_decode(xdr, 4); /* skip rdma_attrs */ if (unlikely(!p)) - goto out_overflow; + return -EIO; } return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_sessionid(struct xdr_stream *xdr, struct nfs4_sessionid *sid) @@ -5822,7 +5863,7 @@ static int decode_bind_conn_to_session(struct xdr_stream *xdr, /* dir flags, rdma mode bool */ p = xdr_inline_decode(xdr, 8); if (unlikely(!p)) - goto out_overflow; + return -EIO; res->dir = be32_to_cpup(p++); if (res->dir == 0 || res->dir > NFS4_CDFS4_BOTH) @@ -5833,9 +5874,6 @@ static int decode_bind_conn_to_session(struct xdr_stream *xdr, res->use_conn_in_rdma_mode = true; return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_create_session(struct xdr_stream *xdr, @@ -5853,7 +5891,7 @@ static int decode_create_session(struct xdr_stream *xdr, /* seqid, flags */ p = xdr_inline_decode(xdr, 8); if (unlikely(!p)) - goto out_overflow; + return -EIO; res->seqid = be32_to_cpup(p++); res->flags = be32_to_cpup(p); @@ -5862,9 +5900,6 @@ static int decode_create_session(struct xdr_stream *xdr, if (!status) status = decode_chan_attrs(xdr, &res->bc_attrs); return status; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_destroy_session(struct xdr_stream *xdr, void *dummy) @@ -5945,7 +5980,6 @@ out_err: res->sr_status = status; return status; out_overflow: - print_overflow_msg(__func__, xdr); status = -EIO; goto out_err; #else /* CONFIG_NFS_V4_1 */ @@ -5973,7 +6007,7 @@ static int decode_getdeviceinfo(struct xdr_stream *xdr, if (status == -ETOOSMALL) { p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; pdev->mincount = be32_to_cpup(p); dprintk("%s: Min count too small. mincnt = %u\n", __func__, pdev->mincount); @@ -5983,7 +6017,7 @@ static int decode_getdeviceinfo(struct xdr_stream *xdr, p = xdr_inline_decode(xdr, 8); if (unlikely(!p)) - goto out_overflow; + return -EIO; type = be32_to_cpup(p++); if (type != pdev->layout_type) { dprintk("%s: layout mismatch req: %u pdev: %u\n", @@ -5997,19 +6031,19 @@ static int decode_getdeviceinfo(struct xdr_stream *xdr, */ pdev->mincount = be32_to_cpup(p); if (xdr_read_pages(xdr, pdev->mincount) != pdev->mincount) - goto out_overflow; + return -EIO; /* Parse notification bitmap, verifying that it is zero. */ p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; len = be32_to_cpup(p); if (len) { uint32_t i; p = xdr_inline_decode(xdr, 4 * len); if (unlikely(!p)) - goto out_overflow; + return -EIO; res->notification = be32_to_cpup(p++); for (i = 1; i < len; i++) { @@ -6021,9 +6055,6 @@ static int decode_getdeviceinfo(struct xdr_stream *xdr, } } return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req, @@ -6036,7 +6067,7 @@ static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req, status = decode_op_hdr(xdr, OP_LAYOUTGET); if (status) - return status; + goto out; p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) goto out_overflow; @@ -6049,7 +6080,8 @@ static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req, if (!layout_count) { dprintk("%s: server responded with empty layout array\n", __func__); - return -EINVAL; + status = -EINVAL; + goto out; } p = xdr_inline_decode(xdr, 28); @@ -6074,7 +6106,8 @@ static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req, dprintk("NFS: server cheating in layoutget reply: " "layout len %u > recvd %u\n", res->layoutp->len, recvd); - return -EINVAL; + status = -EINVAL; + goto out; } if (layout_count > 1) { @@ -6087,10 +6120,12 @@ static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req, __func__, layout_count); } - return 0; +out: + res->status = status; + return status; out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; + status = -EIO; + goto out; } static int decode_layoutreturn(struct xdr_stream *xdr, @@ -6104,14 +6139,13 @@ static int decode_layoutreturn(struct xdr_stream *xdr, return status; p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; res->lrs_present = be32_to_cpup(p); if (res->lrs_present) status = decode_layout_stateid(xdr, &res->stateid); + else + nfs4_stateid_copy(&res->stateid, &invalid_stateid); return status; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_layoutcommit(struct xdr_stream *xdr, @@ -6129,19 +6163,16 @@ static int decode_layoutcommit(struct xdr_stream *xdr, p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; sizechanged = be32_to_cpup(p); if (sizechanged) { /* throw away new size */ p = xdr_inline_decode(xdr, 8); if (unlikely(!p)) - goto out_overflow; + return -EIO; } return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_test_stateid(struct xdr_stream *xdr, @@ -6157,21 +6188,17 @@ static int decode_test_stateid(struct xdr_stream *xdr, p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; num_res = be32_to_cpup(p++); if (num_res != 1) - goto out; + return -EIO; p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; res->status = be32_to_cpup(p++); return status; -out_overflow: - print_overflow_msg(__func__, xdr); -out: - return -EIO; } static int decode_free_stateid(struct xdr_stream *xdr, @@ -6187,6 +6214,13 @@ int decode_layoutreturn(struct xdr_stream *xdr, { return 0; } + +static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req, + struct nfs4_layoutget_res *res) +{ + return 0; +} + #endif /* CONFIG_NFS_V4_1 */ /* @@ -6246,7 +6280,8 @@ static int nfs4_xdr_dec_access(struct rpc_rqst *rqstp, struct xdr_stream *xdr, status = decode_access(xdr, &res->supported, &res->access); if (status != 0) goto out; - decode_getfattr(xdr, res->fattr, res->server); + if (res->fattr) + decode_getfattr(xdr, res->fattr, res->server); out: return status; } @@ -6276,7 +6311,7 @@ static int nfs4_xdr_dec_lookup(struct rpc_rqst *rqstp, struct xdr_stream *xdr, status = decode_getfh(xdr, res->fh); if (status) goto out; - status = decode_getfattr_label(xdr, res->fattr, res->label, res->server); + status = decode_getfattr(xdr, res->fattr, res->server); out: return status; } @@ -6306,7 +6341,7 @@ static int nfs4_xdr_dec_lookupp(struct rpc_rqst *rqstp, struct xdr_stream *xdr, status = decode_getfh(xdr, res->fh); if (status) goto out; - status = decode_getfattr_label(xdr, res->fattr, res->label, res->server); + status = decode_getfattr(xdr, res->fattr, res->server); out: return status; } @@ -6333,8 +6368,7 @@ static int nfs4_xdr_dec_lookup_root(struct rpc_rqst *rqstp, goto out; status = decode_getfh(xdr, res->fh); if (status == 0) - status = decode_getfattr_label(xdr, res->fattr, - res->label, res->server); + status = decode_getfattr(xdr, res->fattr, res->server); out: return status; } @@ -6428,7 +6462,7 @@ static int nfs4_xdr_dec_link(struct rpc_rqst *rqstp, struct xdr_stream *xdr, status = decode_restorefh(xdr); if (status) goto out; - decode_getfattr_label(xdr, res->fattr, res->label, res->server); + decode_getfattr(xdr, res->fattr, res->server); out: return status; } @@ -6458,7 +6492,7 @@ static int nfs4_xdr_dec_create(struct rpc_rqst *rqstp, struct xdr_stream *xdr, status = decode_getfh(xdr, res->fh); if (status) goto out; - decode_getfattr_label(xdr, res->fattr, res->label, res->server); + decode_getfattr(xdr, res->fattr, res->server); out: return status; } @@ -6491,7 +6525,7 @@ static int nfs4_xdr_dec_getattr(struct rpc_rqst *rqstp, struct xdr_stream *xdr, status = decode_putfh(xdr); if (status) goto out; - status = decode_getfattr_label(xdr, res->fattr, res->label, res->server); + status = decode_getfattr(xdr, res->fattr, res->server); out: return status; } @@ -6550,10 +6584,8 @@ nfs4_xdr_dec_getacl(struct rpc_rqst *rqstp, struct xdr_stream *xdr, struct compound_hdr hdr; int status; - if (res->acl_scratch != NULL) { - void *p = page_address(res->acl_scratch); - xdr_set_scratch_buffer(xdr, p, PAGE_SIZE); - } + if (res->acl_scratch != NULL) + xdr_set_scratch_folio(xdr, res->acl_scratch); status = decode_compound_hdr(xdr, &hdr); if (status) goto out; @@ -6563,7 +6595,7 @@ nfs4_xdr_dec_getacl(struct rpc_rqst *rqstp, struct xdr_stream *xdr, status = decode_putfh(xdr); if (status) goto out; - status = decode_getacl(xdr, rqstp, res); + status = decode_getacl(xdr, rqstp, res, res->acl_type); out: return status; @@ -6631,7 +6663,9 @@ static int nfs4_xdr_dec_open(struct rpc_rqst *rqstp, struct xdr_stream *xdr, goto out; if (res->access_request) decode_access(xdr, &res->access_supported, &res->access_result); - decode_getfattr_label(xdr, res->f_attr, res->f_label, res->server); + decode_getfattr(xdr, res->f_attr, res->server); + if (res->lg_res) + decode_layoutget(xdr, rqstp, res->lg_res); out: return status; } @@ -6684,6 +6718,8 @@ static int nfs4_xdr_dec_open_noattr(struct rpc_rqst *rqstp, if (res->access_request) decode_access(xdr, &res->access_supported, &res->access_result); decode_getfattr(xdr, res->f_attr, res->server); + if (res->lg_res) + decode_layoutget(xdr, rqstp, res->lg_res); out: return status; } @@ -6711,7 +6747,7 @@ static int nfs4_xdr_dec_setattr(struct rpc_rqst *rqstp, status = decode_setattr(xdr); if (status) goto out; - decode_getfattr_label(xdr, res->fattr, res->label, res->server); + decode_getfattr(xdr, res->fattr, res->server); out: return status; } @@ -7091,6 +7127,12 @@ static int nfs4_xdr_dec_delegreturn(struct rpc_rqst *rqstp, if (status) goto out; } + if (res->sattr_res) { + status = decode_delegattr(xdr); + res->sattr_ret = status; + if (status) + goto out; + } if (res->fattr) { status = decode_getfattr(xdr, res->fattr, res->server); if (status != 0) @@ -7124,9 +7166,9 @@ static int nfs4_xdr_dec_fs_locations(struct rpc_rqst *req, if (res->migration) { xdr_enter_page(xdr, PAGE_SIZE); status = decode_getfattr_generic(xdr, - &res->fs_locations->fattr, + res->fs_locations->fattr, NULL, res->fs_locations, - NULL, res->fs_locations->server); + res->fs_locations->server); if (status) goto out; if (res->renew) @@ -7137,9 +7179,9 @@ static int nfs4_xdr_dec_fs_locations(struct rpc_rqst *req, goto out; xdr_enter_page(xdr, PAGE_SIZE); status = decode_getfattr_generic(xdr, - &res->fs_locations->fattr, + res->fs_locations->fattr, NULL, res->fs_locations, - NULL, res->fs_locations->server); + res->fs_locations->server); } out: return status; @@ -7296,6 +7338,8 @@ static int nfs4_xdr_dec_sequence(struct rpc_rqst *rqstp, return status; } +#endif + /* * Decode GET_LEASE_TIME response */ @@ -7317,6 +7361,8 @@ static int nfs4_xdr_dec_get_lease_time(struct rpc_rqst *rqstp, return status; } +#ifdef CONFIG_NFS_V4_1 + /* * Decode RECLAIM_COMPLETE response */ @@ -7526,13 +7572,14 @@ int nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, unsigned int savep; uint32_t bitmap[3] = {0}; uint32_t len; + uint64_t new_cookie; __be32 *p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EAGAIN; if (*p == xdr_zero) { p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EAGAIN; if (*p == xdr_zero) return -EAGAIN; entry->eof = 1; @@ -7541,14 +7588,13 @@ int nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, p = xdr_inline_decode(xdr, 12); if (unlikely(!p)) - goto out_overflow; - entry->prev_cookie = entry->cookie; - p = xdr_decode_hyper(p, &entry->cookie); + return -EAGAIN; + p = xdr_decode_hyper(p, &new_cookie); entry->len = be32_to_cpup(p); p = xdr_inline_decode(xdr, entry->len); if (unlikely(!p)) - goto out_overflow; + return -EAGAIN; entry->name = (const char *) p; /* @@ -7560,14 +7606,14 @@ int nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, entry->fattr->valid = 0; if (decode_attr_bitmap(xdr, bitmap) < 0) - goto out_overflow; + return -EAGAIN; if (decode_attr_length(xdr, &len, &savep) < 0) - goto out_overflow; + return -EAGAIN; if (decode_getfattr_attrs(xdr, bitmap, entry->fattr, entry->fh, - NULL, entry->label, entry->server) < 0) - goto out_overflow; + NULL, entry->server) < 0) + return -EAGAIN; if (entry->fattr->valid & NFS_ATTR_FATTR_MOUNTED_ON_FILEID) entry->ino = entry->fattr->mounted_on_fileid; else if (entry->fattr->valid & NFS_ATTR_FATTR_FILEID) @@ -7577,75 +7623,9 @@ int nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, if (entry->fattr->valid & NFS_ATTR_FATTR_TYPE) entry->d_type = nfs_umode_to_dtype(entry->fattr->mode); - return 0; + entry->cookie = new_cookie; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EAGAIN; -} - -/* - * We need to translate between nfs status return values and - * the local errno values which may not be the same. - */ -static struct { - int stat; - int errno; -} nfs_errtbl[] = { - { NFS4_OK, 0 }, - { NFS4ERR_PERM, -EPERM }, - { NFS4ERR_NOENT, -ENOENT }, - { NFS4ERR_IO, -errno_NFSERR_IO}, - { NFS4ERR_NXIO, -ENXIO }, - { NFS4ERR_ACCESS, -EACCES }, - { NFS4ERR_EXIST, -EEXIST }, - { NFS4ERR_XDEV, -EXDEV }, - { NFS4ERR_NOTDIR, -ENOTDIR }, - { NFS4ERR_ISDIR, -EISDIR }, - { NFS4ERR_INVAL, -EINVAL }, - { NFS4ERR_FBIG, -EFBIG }, - { NFS4ERR_NOSPC, -ENOSPC }, - { NFS4ERR_ROFS, -EROFS }, - { NFS4ERR_MLINK, -EMLINK }, - { NFS4ERR_NAMETOOLONG, -ENAMETOOLONG }, - { NFS4ERR_NOTEMPTY, -ENOTEMPTY }, - { NFS4ERR_DQUOT, -EDQUOT }, - { NFS4ERR_STALE, -ESTALE }, - { NFS4ERR_BADHANDLE, -EBADHANDLE }, - { NFS4ERR_BAD_COOKIE, -EBADCOOKIE }, - { NFS4ERR_NOTSUPP, -ENOTSUPP }, - { NFS4ERR_TOOSMALL, -ETOOSMALL }, - { NFS4ERR_SERVERFAULT, -EREMOTEIO }, - { NFS4ERR_BADTYPE, -EBADTYPE }, - { NFS4ERR_LOCKED, -EAGAIN }, - { NFS4ERR_SYMLINK, -ELOOP }, - { NFS4ERR_OP_ILLEGAL, -EOPNOTSUPP }, - { NFS4ERR_DEADLOCK, -EDEADLK }, - { -1, -EIO } -}; - -/* - * Convert an NFS error code to a local one. - * This one is used jointly by NFSv2 and NFSv3. - */ -static int -nfs4_stat_to_errno(int stat) -{ - int i; - for (i = 0; nfs_errtbl[i].stat != -1; i++) { - if (nfs_errtbl[i].stat == stat) - return nfs_errtbl[i].errno; - } - if (stat <= 10000 || stat > 10100) { - /* The server is looney tunes. */ - return -EREMOTEIO; - } - /* If we cannot translate the error, the recovery routines should - * handle it. - * Note: remaining NFSv4 error codes have values > 10000, so should - * not conflict with native Linux error codes. - */ - return -stat; + return 0; } #ifdef CONFIG_NFS_V4_2 @@ -7668,6 +7648,22 @@ nfs4_stat_to_errno(int stat) .p_name = #proc, \ } +#if defined(CONFIG_NFS_V4_1) +#define PROC41(proc, argtype, restype) \ + PROC(proc, argtype, restype) +#else +#define PROC41(proc, argtype, restype) \ + STUB(proc) +#endif + +#if defined(CONFIG_NFS_V4_2) +#define PROC42(proc, argtype, restype) \ + PROC(proc, argtype, restype) +#else +#define PROC42(proc, argtype, restype) \ + STUB(proc) +#endif + const struct rpc_procinfo nfs4_procedures[] = { PROC(READ, enc_read, dec_read), PROC(WRITE, enc_write, dec_write), @@ -7688,7 +7684,6 @@ const struct rpc_procinfo nfs4_procedures[] = { PROC(ACCESS, enc_access, dec_access), PROC(GETATTR, enc_getattr, dec_getattr), PROC(LOOKUP, enc_lookup, dec_lookup), - PROC(LOOKUPP, enc_lookupp, dec_lookupp), PROC(LOOKUP_ROOT, enc_lookup_root, dec_lookup_root), PROC(REMOVE, enc_remove, dec_remove), PROC(RENAME, enc_rename, dec_rename), @@ -7707,33 +7702,40 @@ const struct rpc_procinfo nfs4_procedures[] = { PROC(RELEASE_LOCKOWNER, enc_release_lockowner, dec_release_lockowner), PROC(SECINFO, enc_secinfo, dec_secinfo), PROC(FSID_PRESENT, enc_fsid_present, dec_fsid_present), -#if defined(CONFIG_NFS_V4_1) - PROC(EXCHANGE_ID, enc_exchange_id, dec_exchange_id), - PROC(CREATE_SESSION, enc_create_session, dec_create_session), - PROC(DESTROY_SESSION, enc_destroy_session, dec_destroy_session), - PROC(SEQUENCE, enc_sequence, dec_sequence), + PROC41(EXCHANGE_ID, enc_exchange_id, dec_exchange_id), + PROC41(CREATE_SESSION, enc_create_session, dec_create_session), + PROC41(DESTROY_SESSION, enc_destroy_session, dec_destroy_session), + PROC41(SEQUENCE, enc_sequence, dec_sequence), PROC(GET_LEASE_TIME, enc_get_lease_time, dec_get_lease_time), - PROC(RECLAIM_COMPLETE, enc_reclaim_complete, dec_reclaim_complete), - PROC(GETDEVICEINFO, enc_getdeviceinfo, dec_getdeviceinfo), - PROC(LAYOUTGET, enc_layoutget, dec_layoutget), - PROC(LAYOUTCOMMIT, enc_layoutcommit, dec_layoutcommit), - PROC(LAYOUTRETURN, enc_layoutreturn, dec_layoutreturn), - PROC(SECINFO_NO_NAME, enc_secinfo_no_name, dec_secinfo_no_name), - PROC(TEST_STATEID, enc_test_stateid, dec_test_stateid), - PROC(FREE_STATEID, enc_free_stateid, dec_free_stateid), + PROC41(RECLAIM_COMPLETE,enc_reclaim_complete, dec_reclaim_complete), + PROC41(GETDEVICEINFO, enc_getdeviceinfo, dec_getdeviceinfo), + PROC41(LAYOUTGET, enc_layoutget, dec_layoutget), + PROC41(LAYOUTCOMMIT, enc_layoutcommit, dec_layoutcommit), + PROC41(LAYOUTRETURN, enc_layoutreturn, dec_layoutreturn), + PROC41(SECINFO_NO_NAME, enc_secinfo_no_name, dec_secinfo_no_name), + PROC41(TEST_STATEID, enc_test_stateid, dec_test_stateid), + PROC41(FREE_STATEID, enc_free_stateid, dec_free_stateid), STUB(GETDEVICELIST), - PROC(BIND_CONN_TO_SESSION, + PROC41(BIND_CONN_TO_SESSION, enc_bind_conn_to_session, dec_bind_conn_to_session), - PROC(DESTROY_CLIENTID, enc_destroy_clientid, dec_destroy_clientid), -#endif /* CONFIG_NFS_V4_1 */ -#ifdef CONFIG_NFS_V4_2 - PROC(SEEK, enc_seek, dec_seek), - PROC(ALLOCATE, enc_allocate, dec_allocate), - PROC(DEALLOCATE, enc_deallocate, dec_deallocate), - PROC(LAYOUTSTATS, enc_layoutstats, dec_layoutstats), - PROC(CLONE, enc_clone, dec_clone), - PROC(COPY, enc_copy, dec_copy), -#endif /* CONFIG_NFS_V4_2 */ + PROC41(DESTROY_CLIENTID,enc_destroy_clientid, dec_destroy_clientid), + PROC42(SEEK, enc_seek, dec_seek), + PROC42(ALLOCATE, enc_allocate, dec_allocate), + PROC42(DEALLOCATE, enc_deallocate, dec_deallocate), + PROC42(LAYOUTSTATS, enc_layoutstats, dec_layoutstats), + PROC42(CLONE, enc_clone, dec_clone), + PROC42(COPY, enc_copy, dec_copy), + PROC42(OFFLOAD_CANCEL, enc_offload_cancel, dec_offload_cancel), + PROC42(OFFLOAD_STATUS, enc_offload_status, dec_offload_status), + PROC42(COPY_NOTIFY, enc_copy_notify, dec_copy_notify), + PROC(LOOKUPP, enc_lookupp, dec_lookupp), + PROC42(LAYOUTERROR, enc_layouterror, dec_layouterror), + PROC42(GETXATTR, enc_getxattr, dec_getxattr), + PROC42(SETXATTR, enc_setxattr, dec_setxattr), + PROC42(LISTXATTRS, enc_listxattrs, dec_listxattrs), + PROC42(REMOVEXATTR, enc_removexattr, dec_removexattr), + PROC42(READ_PLUS, enc_read_plus, dec_read_plus), + PROC42(ZERO_RANGE, enc_zero_range, dec_zero_range), }; static unsigned int nfs_version4_counts[ARRAY_SIZE(nfs4_procedures)]; @@ -7743,9 +7745,3 @@ const struct rpc_version nfs_version4 = { .procs = nfs4_procedures, .counts = nfs_version4_counts, }; - -/* - * Local variables: - * c-basic-offset: 8 - * End: - */ diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c index 89a15dbe5efc..432612d22437 100644 --- a/fs/nfs/nfsroot.c +++ b/fs/nfs/nfsroot.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 1995, 1996 Gero Kuhlmann <gero@gkminix.han.de> * @@ -87,7 +88,13 @@ #define NFS_ROOT "/tftpboot/%s" /* Default NFSROOT mount options. */ -#define NFS_DEF_OPTIONS "vers=2,udp,rsize=4096,wsize=4096" +#if defined(CONFIG_NFS_V2) +#define NFS_DEF_OPTIONS "vers=2,tcp,rsize=4096,wsize=4096" +#elif defined(CONFIG_NFS_V3) +#define NFS_DEF_OPTIONS "vers=3,tcp,rsize=4096,wsize=4096" +#else +#define NFS_DEF_OPTIONS "vers=4,tcp,rsize=4096,wsize=4096" +#endif /* Parameters passed from the kernel command line */ static char nfs_root_parms[NFS_MAXPATHLEN + 1] __initdata = ""; @@ -132,7 +139,7 @@ static int __init nfs_root_setup(char *line) ROOT_DEV = Root_NFS; if (line[0] == '/' || line[0] == ',' || (line[0] >= '0' && line[0] <= '9')) { - strlcpy(nfs_root_parms, line, sizeof(nfs_root_parms)); + strscpy(nfs_root_parms, line, sizeof(nfs_root_parms)); } else { size_t n = strlen(line) + sizeof(NFS_ROOT) - 1; if (n >= sizeof(nfs_root_parms)) @@ -157,7 +164,7 @@ __setup("nfsroot=", nfs_root_setup); static int __init root_nfs_copy(char *dest, const char *src, const size_t destlen) { - if (strlcpy(dest, src, destlen) > destlen) + if (strscpy(dest, src, destlen) == -E2BIG) return -1; return 0; } @@ -168,10 +175,10 @@ static int __init root_nfs_cat(char *dest, const char *src, size_t len = strlen(dest); if (len && dest[len - 1] != ',') - if (strlcat(dest, ",", destlen) > destlen) + if (strlcat(dest, ",", destlen) >= destlen) return -1; - if (strlcat(dest, src, destlen) > destlen) + if (strlcat(dest, src, destlen) >= destlen) return -1; return 0; } diff --git a/fs/nfs/nfstrace.c b/fs/nfs/nfstrace.c index c74f7af23d77..5d1bfccbb4da 100644 --- a/fs/nfs/nfstrace.c +++ b/fs/nfs/nfstrace.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2013 Trond Myklebust <Trond.Myklebust@netapp.com> */ @@ -10,3 +11,5 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(nfs_fsync_enter); EXPORT_TRACEPOINT_SYMBOL_GPL(nfs_fsync_exit); +EXPORT_TRACEPOINT_SYMBOL_GPL(nfs_xdr_status); +EXPORT_TRACEPOINT_SYMBOL_GPL(nfs_xdr_bad_filehandle); diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h index 2ca9167bc97d..6ce55e8e6b67 100644 --- a/fs/nfs/nfstrace.h +++ b/fs/nfs/nfstrace.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (c) 2013 Trond Myklebust <Trond.Myklebust@netapp.com> */ @@ -8,38 +9,58 @@ #define _TRACE_NFS_H #include <linux/tracepoint.h> +#include <linux/iversion.h> -#define nfs_show_file_type(ftype) \ - __print_symbolic(ftype, \ - { DT_UNKNOWN, "UNKNOWN" }, \ - { DT_FIFO, "FIFO" }, \ - { DT_CHR, "CHR" }, \ - { DT_DIR, "DIR" }, \ - { DT_BLK, "BLK" }, \ - { DT_REG, "REG" }, \ - { DT_LNK, "LNK" }, \ - { DT_SOCK, "SOCK" }, \ - { DT_WHT, "WHT" }) +#include <trace/misc/fs.h> +#include <trace/misc/nfs.h> +#include <trace/misc/sunrpc.h> #define nfs_show_cache_validity(v) \ __print_flags(v, "|", \ - { NFS_INO_INVALID_ATTR, "INVALID_ATTR" }, \ { NFS_INO_INVALID_DATA, "INVALID_DATA" }, \ { NFS_INO_INVALID_ATIME, "INVALID_ATIME" }, \ { NFS_INO_INVALID_ACCESS, "INVALID_ACCESS" }, \ { NFS_INO_INVALID_ACL, "INVALID_ACL" }, \ - { NFS_INO_REVAL_PAGECACHE, "REVAL_PAGECACHE" }, \ { NFS_INO_REVAL_FORCED, "REVAL_FORCED" }, \ - { NFS_INO_INVALID_LABEL, "INVALID_LABEL" }) + { NFS_INO_INVALID_LABEL, "INVALID_LABEL" }, \ + { NFS_INO_INVALID_CHANGE, "INVALID_CHANGE" }, \ + { NFS_INO_INVALID_CTIME, "INVALID_CTIME" }, \ + { NFS_INO_INVALID_MTIME, "INVALID_MTIME" }, \ + { NFS_INO_INVALID_SIZE, "INVALID_SIZE" }, \ + { NFS_INO_INVALID_OTHER, "INVALID_OTHER" }, \ + { NFS_INO_DATA_INVAL_DEFER, "DATA_INVAL_DEFER" }, \ + { NFS_INO_INVALID_BLOCKS, "INVALID_BLOCKS" }, \ + { NFS_INO_INVALID_XATTR, "INVALID_XATTR" }, \ + { NFS_INO_INVALID_NLINK, "INVALID_NLINK" }, \ + { NFS_INO_INVALID_MODE, "INVALID_MODE" }, \ + { NFS_INO_INVALID_BTIME, "INVALID_BTIME" }) #define nfs_show_nfsi_flags(v) \ __print_flags(v, "|", \ - { 1 << NFS_INO_ADVISE_RDPLUS, "ADVISE_RDPLUS" }, \ - { 1 << NFS_INO_STALE, "STALE" }, \ - { 1 << NFS_INO_INVALIDATING, "INVALIDATING" }, \ - { 1 << NFS_INO_FSCACHE, "FSCACHE" }, \ - { 1 << NFS_INO_LAYOUTCOMMIT, "NEED_LAYOUTCOMMIT" }, \ - { 1 << NFS_INO_LAYOUTCOMMITTING, "LAYOUTCOMMIT" }) + { BIT(NFS_INO_STALE), "STALE" }, \ + { BIT(NFS_INO_ACL_LRU_SET), "ACL_LRU_SET" }, \ + { BIT(NFS_INO_INVALIDATING), "INVALIDATING" }, \ + { BIT(NFS_INO_LAYOUTCOMMIT), "NEED_LAYOUTCOMMIT" }, \ + { BIT(NFS_INO_LAYOUTCOMMITTING), "LAYOUTCOMMIT" }, \ + { BIT(NFS_INO_LAYOUTSTATS), "LAYOUTSTATS" }, \ + { BIT(NFS_INO_ODIRECT), "ODIRECT" }) + +#define nfs_show_wb_flags(v) \ + __print_flags(v, "|", \ + { BIT(PG_BUSY), "BUSY" }, \ + { BIT(PG_MAPPED), "MAPPED" }, \ + { BIT(PG_FOLIO), "FOLIO" }, \ + { BIT(PG_CLEAN), "CLEAN" }, \ + { BIT(PG_COMMIT_TO_DS), "COMMIT_TO_DS" }, \ + { BIT(PG_INODE_REF), "INODE_REF" }, \ + { BIT(PG_HEADLOCK), "HEADLOCK" }, \ + { BIT(PG_TEARDOWN), "TEARDOWN" }, \ + { BIT(PG_UNLOCKPAGE), "UNLOCKPAGE" }, \ + { BIT(PG_UPTODATE), "UPTODATE" }, \ + { BIT(PG_WB_END), "WB_END" }, \ + { BIT(PG_REMOVE), "REMOVE" }, \ + { BIT(PG_CONTENDED1), "CONTENDED1" }, \ + { BIT(PG_CONTENDED2), "CONTENDED2" }) DECLARE_EVENT_CLASS(nfs_inode_event, TP_PROTO( @@ -53,6 +74,7 @@ DECLARE_EVENT_CLASS(nfs_inode_event, __field(u32, fhandle) __field(u64, fileid) __field(u64, version) + __field(unsigned long, cache_validity) ), TP_fast_assign( @@ -60,15 +82,18 @@ DECLARE_EVENT_CLASS(nfs_inode_event, __entry->dev = inode->i_sb->s_dev; __entry->fileid = nfsi->fileid; __entry->fhandle = nfs_fhandle_hash(&nfsi->fh); - __entry->version = inode->i_version; + __entry->version = inode_peek_iversion_raw(inode); + __entry->cache_validity = nfsi->cache_validity; ), TP_printk( - "fileid=%02x:%02x:%llu fhandle=0x%08x version=%llu ", + "fileid=%02x:%02x:%llu fhandle=0x%08x version=%llu cache_validity=0x%lx (%s)", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->fileid, __entry->fhandle, - (unsigned long long)__entry->version + (unsigned long long)__entry->version, + __entry->cache_validity, + nfs_show_cache_validity(__entry->cache_validity) ) ); @@ -81,7 +106,7 @@ DECLARE_EVENT_CLASS(nfs_inode_event_done, TP_ARGS(inode, error), TP_STRUCT__entry( - __field(int, error) + __field(unsigned long, error) __field(dev_t, dev) __field(u32, fhandle) __field(unsigned char, type) @@ -94,27 +119,27 @@ DECLARE_EVENT_CLASS(nfs_inode_event_done, TP_fast_assign( const struct nfs_inode *nfsi = NFS_I(inode); - __entry->error = error; + __entry->error = error < 0 ? -error : 0; __entry->dev = inode->i_sb->s_dev; __entry->fileid = nfsi->fileid; __entry->fhandle = nfs_fhandle_hash(&nfsi->fh); __entry->type = nfs_umode_to_dtype(inode->i_mode); - __entry->version = inode->i_version; + __entry->version = inode_peek_iversion_raw(inode); __entry->size = i_size_read(inode); __entry->nfsi_flags = nfsi->flags; __entry->cache_validity = nfsi->cache_validity; ), TP_printk( - "error=%d fileid=%02x:%02x:%llu fhandle=0x%08x " + "error=%ld (%s) fileid=%02x:%02x:%llu fhandle=0x%08x " "type=%u (%s) version=%llu size=%lld " - "cache_validity=%lu (%s) nfs_flags=%ld (%s)", - __entry->error, + "cache_validity=0x%lx (%s) nfs_flags=0x%lx (%s)", + -__entry->error, show_nfs_status(__entry->error), MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->fileid, __entry->fhandle, __entry->type, - nfs_show_file_type(__entry->type), + show_fs_dirent_type(__entry->type), (unsigned long long)__entry->version, (long long)__entry->size, __entry->cache_validity, @@ -137,6 +162,7 @@ DECLARE_EVENT_CLASS(nfs_inode_event_done, int error \ ), \ TP_ARGS(inode, error)) +DEFINE_NFS_INODE_EVENT(nfs_set_inode_stale); DEFINE_NFS_INODE_EVENT(nfs_refresh_inode_enter); DEFINE_NFS_INODE_EVENT_DONE(nfs_refresh_inode_exit); DEFINE_NFS_INODE_EVENT(nfs_revalidate_inode_enter); @@ -147,22 +173,242 @@ DEFINE_NFS_INODE_EVENT(nfs_getattr_enter); DEFINE_NFS_INODE_EVENT_DONE(nfs_getattr_exit); DEFINE_NFS_INODE_EVENT(nfs_setattr_enter); DEFINE_NFS_INODE_EVENT_DONE(nfs_setattr_exit); -DEFINE_NFS_INODE_EVENT(nfs_writeback_page_enter); -DEFINE_NFS_INODE_EVENT_DONE(nfs_writeback_page_exit); DEFINE_NFS_INODE_EVENT(nfs_writeback_inode_enter); DEFINE_NFS_INODE_EVENT_DONE(nfs_writeback_inode_exit); DEFINE_NFS_INODE_EVENT(nfs_fsync_enter); DEFINE_NFS_INODE_EVENT_DONE(nfs_fsync_exit); DEFINE_NFS_INODE_EVENT(nfs_access_enter); -DEFINE_NFS_INODE_EVENT_DONE(nfs_access_exit); +DEFINE_NFS_INODE_EVENT_DONE(nfs_set_cache_invalid); +DEFINE_NFS_INODE_EVENT(nfs_readdir_force_readdirplus); +DEFINE_NFS_INODE_EVENT_DONE(nfs_readdir_cache_fill_done); +DEFINE_NFS_INODE_EVENT_DONE(nfs_readdir_uncached_done); + +TRACE_EVENT(nfs_access_exit, + TP_PROTO( + const struct inode *inode, + unsigned int mask, + unsigned int permitted, + int error + ), + + TP_ARGS(inode, mask, permitted, error), + + TP_STRUCT__entry( + __field(unsigned long, error) + __field(dev_t, dev) + __field(u32, fhandle) + __field(unsigned char, type) + __field(u64, fileid) + __field(u64, version) + __field(loff_t, size) + __field(unsigned long, nfsi_flags) + __field(unsigned long, cache_validity) + __field(unsigned int, mask) + __field(unsigned int, permitted) + ), -#define show_lookup_flags(flags) \ - __print_flags((unsigned long)flags, "|", \ - { LOOKUP_AUTOMOUNT, "AUTOMOUNT" }, \ - { LOOKUP_DIRECTORY, "DIRECTORY" }, \ - { LOOKUP_OPEN, "OPEN" }, \ - { LOOKUP_CREATE, "CREATE" }, \ - { LOOKUP_EXCL, "EXCL" }) + TP_fast_assign( + const struct nfs_inode *nfsi = NFS_I(inode); + __entry->error = error < 0 ? -error : 0; + __entry->dev = inode->i_sb->s_dev; + __entry->fileid = nfsi->fileid; + __entry->fhandle = nfs_fhandle_hash(&nfsi->fh); + __entry->type = nfs_umode_to_dtype(inode->i_mode); + __entry->version = inode_peek_iversion_raw(inode); + __entry->size = i_size_read(inode); + __entry->nfsi_flags = nfsi->flags; + __entry->cache_validity = nfsi->cache_validity; + __entry->mask = mask; + __entry->permitted = permitted; + ), + + TP_printk( + "error=%ld (%s) fileid=%02x:%02x:%llu fhandle=0x%08x " + "type=%u (%s) version=%llu size=%lld " + "cache_validity=0x%lx (%s) nfs_flags=0x%lx (%s) " + "mask=0x%x permitted=0x%x", + -__entry->error, show_nfs_status(__entry->error), + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->fileid, + __entry->fhandle, + __entry->type, + show_fs_dirent_type(__entry->type), + (unsigned long long)__entry->version, + (long long)__entry->size, + __entry->cache_validity, + nfs_show_cache_validity(__entry->cache_validity), + __entry->nfsi_flags, + nfs_show_nfsi_flags(__entry->nfsi_flags), + __entry->mask, __entry->permitted + ) +); + +DECLARE_EVENT_CLASS(nfs_update_size_class, + TP_PROTO( + const struct inode *inode, + loff_t new_size + ), + + TP_ARGS(inode, new_size), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(u32, fhandle) + __field(u64, fileid) + __field(u64, version) + __field(loff_t, cur_size) + __field(loff_t, new_size) + ), + + TP_fast_assign( + const struct nfs_inode *nfsi = NFS_I(inode); + + __entry->dev = inode->i_sb->s_dev; + __entry->fhandle = nfs_fhandle_hash(&nfsi->fh); + __entry->fileid = nfsi->fileid; + __entry->version = inode_peek_iversion_raw(inode); + __entry->cur_size = i_size_read(inode); + __entry->new_size = new_size; + ), + + TP_printk( + "fileid=%02x:%02x:%llu fhandle=0x%08x version=%llu cursize=%lld newsize=%lld", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->fileid, + __entry->fhandle, __entry->version, + __entry->cur_size, __entry->new_size + ) +); + +#define DEFINE_NFS_UPDATE_SIZE_EVENT(name) \ + DEFINE_EVENT(nfs_update_size_class, nfs_size_##name, \ + TP_PROTO( \ + const struct inode *inode, \ + loff_t new_size \ + ), \ + TP_ARGS(inode, new_size)) + +DEFINE_NFS_UPDATE_SIZE_EVENT(truncate); +DEFINE_NFS_UPDATE_SIZE_EVENT(truncate_folio); +DEFINE_NFS_UPDATE_SIZE_EVENT(wcc); +DEFINE_NFS_UPDATE_SIZE_EVENT(update); +DEFINE_NFS_UPDATE_SIZE_EVENT(grow); + +DECLARE_EVENT_CLASS(nfs_inode_range_event, + TP_PROTO( + const struct inode *inode, + loff_t range_start, + loff_t range_end + ), + + TP_ARGS(inode, range_start, range_end), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(u32, fhandle) + __field(u64, fileid) + __field(u64, version) + __field(loff_t, range_start) + __field(loff_t, range_end) + ), + + TP_fast_assign( + const struct nfs_inode *nfsi = NFS_I(inode); + + __entry->dev = inode->i_sb->s_dev; + __entry->fhandle = nfs_fhandle_hash(&nfsi->fh); + __entry->fileid = nfsi->fileid; + __entry->version = inode_peek_iversion_raw(inode); + __entry->range_start = range_start; + __entry->range_end = range_end; + ), + + TP_printk( + "fileid=%02x:%02x:%llu fhandle=0x%08x version=%llu " + "range=[%lld, %lld]", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->fileid, + __entry->fhandle, __entry->version, + __entry->range_start, __entry->range_end + ) +); + +#define DEFINE_NFS_INODE_RANGE_EVENT(name) \ + DEFINE_EVENT(nfs_inode_range_event, name, \ + TP_PROTO( \ + const struct inode *inode, \ + loff_t range_start, \ + loff_t range_end \ + ), \ + TP_ARGS(inode, range_start, range_end)) + +DEFINE_NFS_INODE_RANGE_EVENT(nfs_readdir_invalidate_cache_range); + +DECLARE_EVENT_CLASS(nfs_readdir_event, + TP_PROTO( + const struct file *file, + const __be32 *verifier, + u64 cookie, + pgoff_t page_index, + unsigned int dtsize + ), + + TP_ARGS(file, verifier, cookie, page_index, dtsize), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(u32, fhandle) + __field(u64, fileid) + __field(u64, version) + __array(char, verifier, NFS4_VERIFIER_SIZE) + __field(u64, cookie) + __field(pgoff_t, index) + __field(unsigned int, dtsize) + ), + + TP_fast_assign( + const struct inode *dir = file_inode(file); + const struct nfs_inode *nfsi = NFS_I(dir); + + __entry->dev = dir->i_sb->s_dev; + __entry->fileid = nfsi->fileid; + __entry->fhandle = nfs_fhandle_hash(&nfsi->fh); + __entry->version = inode_peek_iversion_raw(dir); + if (cookie != 0) + memcpy(__entry->verifier, verifier, + NFS4_VERIFIER_SIZE); + else + memset(__entry->verifier, 0, + NFS4_VERIFIER_SIZE); + __entry->cookie = cookie; + __entry->index = page_index; + __entry->dtsize = dtsize; + ), + + TP_printk( + "fileid=%02x:%02x:%llu fhandle=0x%08x version=%llu " + "cookie=%s:0x%llx cache_index=%lu dtsize=%u", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->fileid, __entry->fhandle, + __entry->version, show_nfs4_verifier(__entry->verifier), + (unsigned long long)__entry->cookie, __entry->index, + __entry->dtsize + ) +); + +#define DEFINE_NFS_READDIR_EVENT(name) \ + DEFINE_EVENT(nfs_readdir_event, name, \ + TP_PROTO( \ + const struct file *file, \ + const __be32 *verifier, \ + u64 cookie, \ + pgoff_t page_index, \ + unsigned int dtsize \ + ), \ + TP_ARGS(file, verifier, cookie, page_index, dtsize)) + +DEFINE_NFS_READDIR_EVENT(nfs_readdir_cache_fill); +DEFINE_NFS_READDIR_EVENT(nfs_readdir_uncached); DECLARE_EVENT_CLASS(nfs_lookup_event, TP_PROTO( @@ -174,9 +420,10 @@ DECLARE_EVENT_CLASS(nfs_lookup_event, TP_ARGS(dir, dentry, flags), TP_STRUCT__entry( - __field(unsigned int, flags) + __field(unsigned long, flags) __field(dev_t, dev) __field(u64, dir) + __field(u64, fileid) __string(name, dentry->d_name.name) ), @@ -184,16 +431,18 @@ DECLARE_EVENT_CLASS(nfs_lookup_event, __entry->dev = dir->i_sb->s_dev; __entry->dir = NFS_FILEID(dir); __entry->flags = flags; - __assign_str(name, dentry->d_name.name); + __entry->fileid = d_is_negative(dentry) ? 0 : NFS_FILEID(d_inode(dentry)); + __assign_str(name); ), TP_printk( - "flags=%u (%s) name=%02x:%02x:%llu/%s", + "flags=0x%lx (%s) name=%02x:%02x:%llu/%s fileid=%llu", __entry->flags, - show_lookup_flags(__entry->flags), + show_fs_lookup_flags(__entry->flags), MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->dir, - __get_str(name) + __get_str(name), + __entry->fileid ) ); @@ -217,29 +466,32 @@ DECLARE_EVENT_CLASS(nfs_lookup_event_done, TP_ARGS(dir, dentry, flags, error), TP_STRUCT__entry( - __field(int, error) - __field(unsigned int, flags) + __field(unsigned long, error) + __field(unsigned long, flags) __field(dev_t, dev) __field(u64, dir) + __field(u64, fileid) __string(name, dentry->d_name.name) ), TP_fast_assign( __entry->dev = dir->i_sb->s_dev; __entry->dir = NFS_FILEID(dir); - __entry->error = error; + __entry->error = error < 0 ? -error : 0; __entry->flags = flags; - __assign_str(name, dentry->d_name.name); + __entry->fileid = d_is_negative(dentry) ? 0 : NFS_FILEID(d_inode(dentry)); + __assign_str(name); ), TP_printk( - "error=%d flags=%u (%s) name=%02x:%02x:%llu/%s", - __entry->error, + "error=%ld (%s) flags=0x%lx (%s) name=%02x:%02x:%llu/%s fileid=%llu", + -__entry->error, show_nfs_status(__entry->error), __entry->flags, - show_lookup_flags(__entry->flags), + show_fs_lookup_flags(__entry->flags), MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->dir, - __get_str(name) + __get_str(name), + __entry->fileid ) ); @@ -257,22 +509,9 @@ DEFINE_NFS_LOOKUP_EVENT(nfs_lookup_enter); DEFINE_NFS_LOOKUP_EVENT_DONE(nfs_lookup_exit); DEFINE_NFS_LOOKUP_EVENT(nfs_lookup_revalidate_enter); DEFINE_NFS_LOOKUP_EVENT_DONE(nfs_lookup_revalidate_exit); - -#define show_open_flags(flags) \ - __print_flags((unsigned long)flags, "|", \ - { O_CREAT, "O_CREAT" }, \ - { O_EXCL, "O_EXCL" }, \ - { O_TRUNC, "O_TRUNC" }, \ - { O_APPEND, "O_APPEND" }, \ - { O_DSYNC, "O_DSYNC" }, \ - { O_DIRECT, "O_DIRECT" }, \ - { O_DIRECTORY, "O_DIRECTORY" }) - -#define show_fmode_flags(mode) \ - __print_flags(mode, "|", \ - { ((__force unsigned long)FMODE_READ), "READ" }, \ - { ((__force unsigned long)FMODE_WRITE), "WRITE" }, \ - { ((__force unsigned long)FMODE_EXEC), "EXEC" }) +DEFINE_NFS_LOOKUP_EVENT(nfs_readdir_lookup); +DEFINE_NFS_LOOKUP_EVENT(nfs_readdir_lookup_revalidate_failed); +DEFINE_NFS_LOOKUP_EVENT_DONE(nfs_readdir_lookup_revalidate); TRACE_EVENT(nfs_atomic_open_enter, TP_PROTO( @@ -284,8 +523,8 @@ TRACE_EVENT(nfs_atomic_open_enter, TP_ARGS(dir, ctx, flags), TP_STRUCT__entry( - __field(unsigned int, flags) - __field(unsigned int, fmode) + __field(unsigned long, flags) + __field(unsigned long, fmode) __field(dev_t, dev) __field(u64, dir) __string(name, ctx->dentry->d_name.name) @@ -295,15 +534,15 @@ TRACE_EVENT(nfs_atomic_open_enter, __entry->dev = dir->i_sb->s_dev; __entry->dir = NFS_FILEID(dir); __entry->flags = flags; - __entry->fmode = (__force unsigned int)ctx->mode; - __assign_str(name, ctx->dentry->d_name.name); + __entry->fmode = (__force unsigned long)ctx->mode; + __assign_str(name); ), TP_printk( - "flags=%u (%s) fmode=%s name=%02x:%02x:%llu/%s", + "flags=0x%lx (%s) fmode=%s name=%02x:%02x:%llu/%s", __entry->flags, - show_open_flags(__entry->flags), - show_fmode_flags(__entry->fmode), + show_fs_fcntl_open_flags(__entry->flags), + show_fs_fmode_flags(__entry->fmode), MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->dir, __get_str(name) @@ -321,30 +560,30 @@ TRACE_EVENT(nfs_atomic_open_exit, TP_ARGS(dir, ctx, flags, error), TP_STRUCT__entry( - __field(int, error) - __field(unsigned int, flags) - __field(unsigned int, fmode) + __field(unsigned long, error) + __field(unsigned long, flags) + __field(unsigned long, fmode) __field(dev_t, dev) __field(u64, dir) __string(name, ctx->dentry->d_name.name) ), TP_fast_assign( - __entry->error = error; + __entry->error = -error; __entry->dev = dir->i_sb->s_dev; __entry->dir = NFS_FILEID(dir); __entry->flags = flags; - __entry->fmode = (__force unsigned int)ctx->mode; - __assign_str(name, ctx->dentry->d_name.name); + __entry->fmode = (__force unsigned long)ctx->mode; + __assign_str(name); ), TP_printk( - "error=%d flags=%u (%s) fmode=%s " + "error=%ld (%s) flags=0x%lx (%s) fmode=%s " "name=%02x:%02x:%llu/%s", - __entry->error, + -__entry->error, show_nfs_status(__entry->error), __entry->flags, - show_open_flags(__entry->flags), - show_fmode_flags(__entry->fmode), + show_fs_fcntl_open_flags(__entry->flags), + show_fs_fmode_flags(__entry->fmode), MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->dir, __get_str(name) @@ -361,7 +600,7 @@ TRACE_EVENT(nfs_create_enter, TP_ARGS(dir, dentry, flags), TP_STRUCT__entry( - __field(unsigned int, flags) + __field(unsigned long, flags) __field(dev_t, dev) __field(u64, dir) __string(name, dentry->d_name.name) @@ -371,13 +610,13 @@ TRACE_EVENT(nfs_create_enter, __entry->dev = dir->i_sb->s_dev; __entry->dir = NFS_FILEID(dir); __entry->flags = flags; - __assign_str(name, dentry->d_name.name); + __assign_str(name); ), TP_printk( - "flags=%u (%s) name=%02x:%02x:%llu/%s", + "flags=0x%lx (%s) name=%02x:%02x:%llu/%s", __entry->flags, - show_open_flags(__entry->flags), + show_fs_fcntl_open_flags(__entry->flags), MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->dir, __get_str(name) @@ -395,26 +634,26 @@ TRACE_EVENT(nfs_create_exit, TP_ARGS(dir, dentry, flags, error), TP_STRUCT__entry( - __field(int, error) - __field(unsigned int, flags) + __field(unsigned long, error) + __field(unsigned long, flags) __field(dev_t, dev) __field(u64, dir) __string(name, dentry->d_name.name) ), TP_fast_assign( - __entry->error = error; + __entry->error = -error; __entry->dev = dir->i_sb->s_dev; __entry->dir = NFS_FILEID(dir); __entry->flags = flags; - __assign_str(name, dentry->d_name.name); + __assign_str(name); ), TP_printk( - "error=%d flags=%u (%s) name=%02x:%02x:%llu/%s", - __entry->error, + "error=%ld (%s) flags=0x%lx (%s) name=%02x:%02x:%llu/%s", + -__entry->error, show_nfs_status(__entry->error), __entry->flags, - show_open_flags(__entry->flags), + show_fs_fcntl_open_flags(__entry->flags), MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->dir, __get_str(name) @@ -438,7 +677,7 @@ DECLARE_EVENT_CLASS(nfs_directory_event, TP_fast_assign( __entry->dev = dir->i_sb->s_dev; __entry->dir = NFS_FILEID(dir); - __assign_str(name, dentry->d_name.name); + __assign_str(name); ), TP_printk( @@ -467,7 +706,7 @@ DECLARE_EVENT_CLASS(nfs_directory_event_done, TP_ARGS(dir, dentry, error), TP_STRUCT__entry( - __field(int, error) + __field(unsigned long, error) __field(dev_t, dev) __field(u64, dir) __string(name, dentry->d_name.name) @@ -476,13 +715,13 @@ DECLARE_EVENT_CLASS(nfs_directory_event_done, TP_fast_assign( __entry->dev = dir->i_sb->s_dev; __entry->dir = NFS_FILEID(dir); - __entry->error = error; - __assign_str(name, dentry->d_name.name); + __entry->error = error < 0 ? -error : 0; + __assign_str(name); ), TP_printk( - "error=%d name=%02x:%02x:%llu/%s", - __entry->error, + "error=%ld (%s) name=%02x:%02x:%llu/%s", + -__entry->error, show_nfs_status(__entry->error), MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->dir, __get_str(name) @@ -531,7 +770,7 @@ TRACE_EVENT(nfs_link_enter, __entry->dev = inode->i_sb->s_dev; __entry->fileid = NFS_FILEID(inode); __entry->dir = NFS_FILEID(dir); - __assign_str(name, dentry->d_name.name); + __assign_str(name); ), TP_printk( @@ -555,7 +794,7 @@ TRACE_EVENT(nfs_link_exit, TP_ARGS(inode, dir, dentry, error), TP_STRUCT__entry( - __field(int, error) + __field(unsigned long, error) __field(dev_t, dev) __field(u64, fileid) __field(u64, dir) @@ -566,13 +805,13 @@ TRACE_EVENT(nfs_link_exit, __entry->dev = inode->i_sb->s_dev; __entry->fileid = NFS_FILEID(inode); __entry->dir = NFS_FILEID(dir); - __entry->error = error; - __assign_str(name, dentry->d_name.name); + __entry->error = error < 0 ? -error : 0; + __assign_str(name); ), TP_printk( - "error=%d fileid=%02x:%02x:%llu name=%02x:%02x:%llu/%s", - __entry->error, + "error=%ld (%s) fileid=%02x:%02x:%llu name=%02x:%02x:%llu/%s", + -__entry->error, show_nfs_status(__entry->error), MAJOR(__entry->dev), MINOR(__entry->dev), __entry->fileid, MAJOR(__entry->dev), MINOR(__entry->dev), @@ -603,8 +842,8 @@ DECLARE_EVENT_CLASS(nfs_rename_event, __entry->dev = old_dir->i_sb->s_dev; __entry->old_dir = NFS_FILEID(old_dir); __entry->new_dir = NFS_FILEID(new_dir); - __assign_str(old_name, old_dentry->d_name.name); - __assign_str(new_name, new_dentry->d_name.name); + __assign_str(old_name); + __assign_str(new_name); ), TP_printk( @@ -640,7 +879,7 @@ DECLARE_EVENT_CLASS(nfs_rename_event_done, TP_STRUCT__entry( __field(dev_t, dev) - __field(int, error) + __field(unsigned long, error) __field(u64, old_dir) __string(old_name, old_dentry->d_name.name) __field(u64, new_dir) @@ -649,17 +888,17 @@ DECLARE_EVENT_CLASS(nfs_rename_event_done, TP_fast_assign( __entry->dev = old_dir->i_sb->s_dev; + __entry->error = -error; __entry->old_dir = NFS_FILEID(old_dir); __entry->new_dir = NFS_FILEID(new_dir); - __entry->error = error; - __assign_str(old_name, old_dentry->d_name.name); - __assign_str(new_name, new_dentry->d_name.name); + __assign_str(old_name); + __assign_str(new_name); ), TP_printk( - "error=%d old_name=%02x:%02x:%llu/%s " + "error=%ld (%s) old_name=%02x:%02x:%llu/%s " "new_name=%02x:%02x:%llu/%s", - __entry->error, + -__entry->error, show_nfs_status(__entry->error), MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->old_dir, __get_str(old_name), @@ -683,7 +922,7 @@ DECLARE_EVENT_CLASS(nfs_rename_event_done, DEFINE_NFS_RENAME_EVENT(nfs_rename_enter); DEFINE_NFS_RENAME_EVENT_DONE(nfs_rename_exit); -DEFINE_NFS_RENAME_EVENT_DONE(nfs_sillyrename_rename); +DEFINE_NFS_RENAME_EVENT_DONE(nfs_async_rename_done); TRACE_EVENT(nfs_sillyrename_unlink, TP_PROTO( @@ -695,7 +934,7 @@ TRACE_EVENT(nfs_sillyrename_unlink, TP_STRUCT__entry( __field(dev_t, dev) - __field(int, error) + __field(unsigned long, error) __field(u64, dir) __dynamic_array(char, name, data->args.name.len + 1) ), @@ -705,20 +944,1036 @@ TRACE_EVENT(nfs_sillyrename_unlink, size_t len = data->args.name.len; __entry->dev = dir->i_sb->s_dev; __entry->dir = NFS_FILEID(dir); - __entry->error = error; + __entry->error = -error; memcpy(__get_str(name), data->args.name.name, len); __get_str(name)[len] = 0; ), TP_printk( - "error=%d name=%02x:%02x:%llu/%s", - __entry->error, + "error=%ld (%s) name=%02x:%02x:%llu/%s", + -__entry->error, show_nfs_status(__entry->error), MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->dir, __get_str(name) ) ); + +DECLARE_EVENT_CLASS(nfs_folio_event, + TP_PROTO( + const struct inode *inode, + loff_t offset, + size_t count + ), + + TP_ARGS(inode, offset, count), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(u32, fhandle) + __field(u64, fileid) + __field(u64, version) + __field(loff_t, offset) + __field(size_t, count) + ), + + TP_fast_assign( + const struct nfs_inode *nfsi = NFS_I(inode); + + __entry->dev = inode->i_sb->s_dev; + __entry->fileid = nfsi->fileid; + __entry->fhandle = nfs_fhandle_hash(&nfsi->fh); + __entry->version = inode_peek_iversion_raw(inode); + __entry->offset = offset; + __entry->count = count; + ), + + TP_printk( + "fileid=%02x:%02x:%llu fhandle=0x%08x version=%llu " + "offset=%lld count=%zu", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->fileid, + __entry->fhandle, __entry->version, + __entry->offset, __entry->count + ) +); + +#define DEFINE_NFS_FOLIO_EVENT(name) \ + DEFINE_EVENT(nfs_folio_event, name, \ + TP_PROTO( \ + const struct inode *inode, \ + loff_t offset, \ + size_t count \ + ), \ + TP_ARGS(inode, offset, count)) + +DECLARE_EVENT_CLASS(nfs_folio_event_done, + TP_PROTO( + const struct inode *inode, + loff_t offset, + size_t count, + int ret + ), + + TP_ARGS(inode, offset, count, ret), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(u32, fhandle) + __field(int, ret) + __field(u64, fileid) + __field(u64, version) + __field(loff_t, offset) + __field(size_t, count) + ), + + TP_fast_assign( + const struct nfs_inode *nfsi = NFS_I(inode); + + __entry->dev = inode->i_sb->s_dev; + __entry->fileid = nfsi->fileid; + __entry->fhandle = nfs_fhandle_hash(&nfsi->fh); + __entry->version = inode_peek_iversion_raw(inode); + __entry->offset = offset; + __entry->count = count; + __entry->ret = ret; + ), + + TP_printk( + "fileid=%02x:%02x:%llu fhandle=0x%08x version=%llu " + "offset=%lld count=%zu ret=%d", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->fileid, + __entry->fhandle, __entry->version, + __entry->offset, __entry->count, __entry->ret + ) +); + +#define DEFINE_NFS_FOLIO_EVENT_DONE(name) \ + DEFINE_EVENT(nfs_folio_event_done, name, \ + TP_PROTO( \ + const struct inode *inode, \ + loff_t offset, \ + size_t count, \ + int ret \ + ), \ + TP_ARGS(inode, offset, count, ret)) + +DEFINE_NFS_FOLIO_EVENT(nfs_aop_readpage); +DEFINE_NFS_FOLIO_EVENT_DONE(nfs_aop_readpage_done); + +DEFINE_NFS_FOLIO_EVENT(nfs_writeback_folio); +DEFINE_NFS_FOLIO_EVENT_DONE(nfs_writeback_folio_done); + +DEFINE_NFS_FOLIO_EVENT(nfs_invalidate_folio); +DEFINE_NFS_FOLIO_EVENT_DONE(nfs_launder_folio_done); + +DEFINE_NFS_FOLIO_EVENT(nfs_try_to_update_request); +DEFINE_NFS_FOLIO_EVENT_DONE(nfs_try_to_update_request_done); + +DEFINE_NFS_FOLIO_EVENT(nfs_update_folio); +DEFINE_NFS_FOLIO_EVENT_DONE(nfs_update_folio_done); + +DEFINE_NFS_FOLIO_EVENT(nfs_write_begin); +DEFINE_NFS_FOLIO_EVENT_DONE(nfs_write_begin_done); + +DEFINE_NFS_FOLIO_EVENT(nfs_write_end); +DEFINE_NFS_FOLIO_EVENT_DONE(nfs_write_end_done); + +DEFINE_NFS_FOLIO_EVENT(nfs_writepages); +DEFINE_NFS_FOLIO_EVENT_DONE(nfs_writepages_done); + +DECLARE_EVENT_CLASS(nfs_kiocb_event, + TP_PROTO( + const struct kiocb *iocb, + const struct iov_iter *iter + ), + + TP_ARGS(iocb, iter), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(u32, fhandle) + __field(u64, fileid) + __field(u64, version) + __field(loff_t, offset) + __field(size_t, count) + __field(int, flags) + ), + + TP_fast_assign( + const struct inode *inode = file_inode(iocb->ki_filp); + const struct nfs_inode *nfsi = NFS_I(inode); + + __entry->dev = inode->i_sb->s_dev; + __entry->fileid = nfsi->fileid; + __entry->fhandle = nfs_fhandle_hash(&nfsi->fh); + __entry->version = inode_peek_iversion_raw(inode); + __entry->offset = iocb->ki_pos; + __entry->count = iov_iter_count(iter); + __entry->flags = iocb->ki_flags; + ), + + TP_printk( + "fileid=%02x:%02x:%llu fhandle=0x%08x version=%llu offset=%lld count=%zu ki_flags=%s", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->fileid, + __entry->fhandle, __entry->version, + __entry->offset, __entry->count, + __print_flags(__entry->flags, "|", TRACE_IOCB_STRINGS) + ) +); + +#define DEFINE_NFS_KIOCB_EVENT(name) \ + DEFINE_EVENT(nfs_kiocb_event, name, \ + TP_PROTO( \ + const struct kiocb *iocb, \ + const struct iov_iter *iter \ + ), \ + TP_ARGS(iocb, iter)) + +DEFINE_NFS_KIOCB_EVENT(nfs_file_read); +DEFINE_NFS_KIOCB_EVENT(nfs_file_write); + +TRACE_EVENT(nfs_aop_readahead, + TP_PROTO( + const struct inode *inode, + loff_t pos, + unsigned int nr_pages + ), + + TP_ARGS(inode, pos, nr_pages), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(u32, fhandle) + __field(u64, fileid) + __field(u64, version) + __field(loff_t, offset) + __field(unsigned int, nr_pages) + ), + + TP_fast_assign( + const struct nfs_inode *nfsi = NFS_I(inode); + + __entry->dev = inode->i_sb->s_dev; + __entry->fileid = nfsi->fileid; + __entry->fhandle = nfs_fhandle_hash(&nfsi->fh); + __entry->version = inode_peek_iversion_raw(inode); + __entry->offset = pos; + __entry->nr_pages = nr_pages; + ), + + TP_printk( + "fileid=%02x:%02x:%llu fhandle=0x%08x version=%llu offset=%lld nr_pages=%u", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->fileid, + __entry->fhandle, __entry->version, + __entry->offset, __entry->nr_pages + ) +); + +TRACE_EVENT(nfs_aop_readahead_done, + TP_PROTO( + const struct inode *inode, + unsigned int nr_pages, + int ret + ), + + TP_ARGS(inode, nr_pages, ret), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(u32, fhandle) + __field(int, ret) + __field(u64, fileid) + __field(u64, version) + __field(loff_t, offset) + __field(unsigned int, nr_pages) + ), + + TP_fast_assign( + const struct nfs_inode *nfsi = NFS_I(inode); + + __entry->dev = inode->i_sb->s_dev; + __entry->fileid = nfsi->fileid; + __entry->fhandle = nfs_fhandle_hash(&nfsi->fh); + __entry->version = inode_peek_iversion_raw(inode); + __entry->nr_pages = nr_pages; + __entry->ret = ret; + ), + + TP_printk( + "fileid=%02x:%02x:%llu fhandle=0x%08x version=%llu nr_pages=%u ret=%d", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->fileid, + __entry->fhandle, __entry->version, + __entry->nr_pages, __entry->ret + ) +); + +TRACE_EVENT(nfs_initiate_read, + TP_PROTO( + const struct nfs_pgio_header *hdr + ), + + TP_ARGS(hdr), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(u32, fhandle) + __field(u64, fileid) + __field(loff_t, offset) + __field(u32, count) + ), + + TP_fast_assign( + const struct inode *inode = hdr->inode; + const struct nfs_inode *nfsi = NFS_I(inode); + const struct nfs_fh *fh = hdr->args.fh ? + hdr->args.fh : &nfsi->fh; + + __entry->offset = hdr->args.offset; + __entry->count = hdr->args.count; + __entry->dev = inode->i_sb->s_dev; + __entry->fileid = nfsi->fileid; + __entry->fhandle = nfs_fhandle_hash(fh); + ), + + TP_printk( + "fileid=%02x:%02x:%llu fhandle=0x%08x " + "offset=%lld count=%u", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->fileid, + __entry->fhandle, + (long long)__entry->offset, __entry->count + ) +); + +TRACE_EVENT(nfs_readpage_done, + TP_PROTO( + const struct rpc_task *task, + const struct nfs_pgio_header *hdr + ), + + TP_ARGS(task, hdr), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(u32, fhandle) + __field(u64, fileid) + __field(loff_t, offset) + __field(u32, arg_count) + __field(u32, res_count) + __field(bool, eof) + __field(int, error) + ), + + TP_fast_assign( + const struct inode *inode = hdr->inode; + const struct nfs_inode *nfsi = NFS_I(inode); + const struct nfs_fh *fh = hdr->args.fh ? + hdr->args.fh : &nfsi->fh; + + __entry->error = task->tk_status; + __entry->offset = hdr->args.offset; + __entry->arg_count = hdr->args.count; + __entry->res_count = hdr->res.count; + __entry->eof = hdr->res.eof; + __entry->dev = inode->i_sb->s_dev; + __entry->fileid = nfsi->fileid; + __entry->fhandle = nfs_fhandle_hash(fh); + ), + + TP_printk( + "error=%d fileid=%02x:%02x:%llu fhandle=0x%08x " + "offset=%lld count=%u res=%u%s", __entry->error, + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->fileid, + __entry->fhandle, + (long long)__entry->offset, __entry->arg_count, + __entry->res_count, __entry->eof ? " eof" : "" + ) +); + +TRACE_EVENT(nfs_readpage_short, + TP_PROTO( + const struct rpc_task *task, + const struct nfs_pgio_header *hdr + ), + + TP_ARGS(task, hdr), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(u32, fhandle) + __field(u64, fileid) + __field(loff_t, offset) + __field(u32, arg_count) + __field(u32, res_count) + __field(bool, eof) + __field(int, error) + ), + + TP_fast_assign( + const struct inode *inode = hdr->inode; + const struct nfs_inode *nfsi = NFS_I(inode); + const struct nfs_fh *fh = hdr->args.fh ? + hdr->args.fh : &nfsi->fh; + + __entry->error = task->tk_status; + __entry->offset = hdr->args.offset; + __entry->arg_count = hdr->args.count; + __entry->res_count = hdr->res.count; + __entry->eof = hdr->res.eof; + __entry->dev = inode->i_sb->s_dev; + __entry->fileid = nfsi->fileid; + __entry->fhandle = nfs_fhandle_hash(fh); + ), + + TP_printk( + "error=%d fileid=%02x:%02x:%llu fhandle=0x%08x " + "offset=%lld count=%u res=%u%s", __entry->error, + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->fileid, + __entry->fhandle, + (long long)__entry->offset, __entry->arg_count, + __entry->res_count, __entry->eof ? " eof" : "" + ) +); + + +TRACE_EVENT(nfs_pgio_error, + TP_PROTO( + const struct nfs_pgio_header *hdr, + int error, + loff_t pos + ), + + TP_ARGS(hdr, error, pos), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(u32, fhandle) + __field(u64, fileid) + __field(loff_t, offset) + __field(u32, arg_count) + __field(u32, res_count) + __field(loff_t, pos) + __field(int, error) + ), + + TP_fast_assign( + const struct inode *inode = hdr->inode; + const struct nfs_inode *nfsi = NFS_I(inode); + const struct nfs_fh *fh = hdr->args.fh ? + hdr->args.fh : &nfsi->fh; + + __entry->error = error; + __entry->offset = hdr->args.offset; + __entry->arg_count = hdr->args.count; + __entry->res_count = hdr->res.count; + __entry->dev = inode->i_sb->s_dev; + __entry->fileid = nfsi->fileid; + __entry->fhandle = nfs_fhandle_hash(fh); + ), + + TP_printk("error=%d fileid=%02x:%02x:%llu fhandle=0x%08x " + "offset=%lld count=%u res=%u pos=%llu", __entry->error, + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->fileid, __entry->fhandle, + (long long)__entry->offset, __entry->arg_count, __entry->res_count, + __entry->pos + ) +); + +TRACE_EVENT(nfs_initiate_write, + TP_PROTO( + const struct nfs_pgio_header *hdr + ), + + TP_ARGS(hdr), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(u32, fhandle) + __field(u64, fileid) + __field(loff_t, offset) + __field(u32, count) + __field(unsigned long, stable) + ), + + TP_fast_assign( + const struct inode *inode = hdr->inode; + const struct nfs_inode *nfsi = NFS_I(inode); + const struct nfs_fh *fh = hdr->args.fh ? + hdr->args.fh : &nfsi->fh; + + __entry->offset = hdr->args.offset; + __entry->count = hdr->args.count; + __entry->stable = hdr->args.stable; + __entry->dev = inode->i_sb->s_dev; + __entry->fileid = nfsi->fileid; + __entry->fhandle = nfs_fhandle_hash(fh); + ), + + TP_printk( + "fileid=%02x:%02x:%llu fhandle=0x%08x " + "offset=%lld count=%u stable=%s", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->fileid, + __entry->fhandle, + (long long)__entry->offset, __entry->count, + show_nfs_stable_how(__entry->stable) + ) +); + +TRACE_EVENT(nfs_writeback_done, + TP_PROTO( + const struct rpc_task *task, + const struct nfs_pgio_header *hdr + ), + + TP_ARGS(task, hdr), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(u32, fhandle) + __field(u64, fileid) + __field(loff_t, offset) + __field(u32, arg_count) + __field(u32, res_count) + __field(int, error) + __field(unsigned long, stable) + __array(char, verifier, NFS4_VERIFIER_SIZE) + ), + + TP_fast_assign( + const struct inode *inode = hdr->inode; + const struct nfs_inode *nfsi = NFS_I(inode); + const struct nfs_fh *fh = hdr->args.fh ? + hdr->args.fh : &nfsi->fh; + const struct nfs_writeverf *verf = hdr->res.verf; + + __entry->error = task->tk_status; + __entry->offset = hdr->args.offset; + __entry->arg_count = hdr->args.count; + __entry->res_count = hdr->res.count; + __entry->stable = verf->committed; + memcpy(__entry->verifier, + &verf->verifier, + NFS4_VERIFIER_SIZE); + __entry->dev = inode->i_sb->s_dev; + __entry->fileid = nfsi->fileid; + __entry->fhandle = nfs_fhandle_hash(fh); + ), + + TP_printk( + "error=%d fileid=%02x:%02x:%llu fhandle=0x%08x " + "offset=%lld count=%u res=%u stable=%s " + "verifier=%s", __entry->error, + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->fileid, + __entry->fhandle, + (long long)__entry->offset, __entry->arg_count, + __entry->res_count, + show_nfs_stable_how(__entry->stable), + show_nfs4_verifier(__entry->verifier) + ) +); + +DECLARE_EVENT_CLASS(nfs_page_class, + TP_PROTO( + const struct nfs_page *req + ), + + TP_ARGS(req), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(u32, fhandle) + __field(u64, fileid) + __field(const struct nfs_page *__private, req) + __field(loff_t, offset) + __field(unsigned int, count) + __field(unsigned long, flags) + ), + + TP_fast_assign( + const struct inode *inode = folio_inode(req->wb_folio); + const struct nfs_inode *nfsi = NFS_I(inode); + + __entry->dev = inode->i_sb->s_dev; + __entry->fileid = nfsi->fileid; + __entry->fhandle = nfs_fhandle_hash(&nfsi->fh); + __entry->req = req; + __entry->offset = req_offset(req); + __entry->count = req->wb_bytes; + __entry->flags = req->wb_flags; + ), + + TP_printk( + "fileid=%02x:%02x:%llu fhandle=0x%08x req=%p offset=%lld count=%u flags=%s", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->fileid, __entry->fhandle, + __entry->req, __entry->offset, __entry->count, + nfs_show_wb_flags(__entry->flags) + ) +); + +#define DEFINE_NFS_PAGE_EVENT(name) \ + DEFINE_EVENT(nfs_page_class, name, \ + TP_PROTO( \ + const struct nfs_page *req \ + ), \ + TP_ARGS(req)) + +DEFINE_NFS_PAGE_EVENT(nfs_writepage_setup); +DEFINE_NFS_PAGE_EVENT(nfs_do_writepage); + +DECLARE_EVENT_CLASS(nfs_page_error_class, + TP_PROTO( + const struct inode *inode, + const struct nfs_page *req, + int error + ), + + TP_ARGS(inode, req, error), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(u32, fhandle) + __field(u64, fileid) + __field(loff_t, offset) + __field(unsigned int, count) + __field(int, error) + ), + + TP_fast_assign( + const struct nfs_inode *nfsi = NFS_I(inode); + __entry->dev = inode->i_sb->s_dev; + __entry->fileid = nfsi->fileid; + __entry->fhandle = nfs_fhandle_hash(&nfsi->fh); + __entry->offset = req_offset(req); + __entry->count = req->wb_bytes; + __entry->error = error; + ), + + TP_printk( + "error=%d fileid=%02x:%02x:%llu fhandle=0x%08x " + "offset=%lld count=%u", __entry->error, + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->fileid, + __entry->fhandle, __entry->offset, + __entry->count + ) +); + +#define DEFINE_NFS_PAGEERR_EVENT(name) \ + DEFINE_EVENT(nfs_page_error_class, name, \ + TP_PROTO( \ + const struct inode *inode, \ + const struct nfs_page *req, \ + int error \ + ), \ + TP_ARGS(inode, req, error)) + +DEFINE_NFS_PAGEERR_EVENT(nfs_write_error); +DEFINE_NFS_PAGEERR_EVENT(nfs_comp_error); +DEFINE_NFS_PAGEERR_EVENT(nfs_commit_error); + +TRACE_EVENT(nfs_initiate_commit, + TP_PROTO( + const struct nfs_commit_data *data + ), + + TP_ARGS(data), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(u32, fhandle) + __field(u64, fileid) + __field(loff_t, offset) + __field(u32, count) + ), + + TP_fast_assign( + const struct inode *inode = data->inode; + const struct nfs_inode *nfsi = NFS_I(inode); + const struct nfs_fh *fh = data->args.fh ? + data->args.fh : &nfsi->fh; + + __entry->offset = data->args.offset; + __entry->count = data->args.count; + __entry->dev = inode->i_sb->s_dev; + __entry->fileid = nfsi->fileid; + __entry->fhandle = nfs_fhandle_hash(fh); + ), + + TP_printk( + "fileid=%02x:%02x:%llu fhandle=0x%08x " + "offset=%lld count=%u", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->fileid, + __entry->fhandle, + (long long)__entry->offset, __entry->count + ) +); + +TRACE_EVENT(nfs_commit_done, + TP_PROTO( + const struct rpc_task *task, + const struct nfs_commit_data *data + ), + + TP_ARGS(task, data), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(u32, fhandle) + __field(u64, fileid) + __field(loff_t, offset) + __field(int, error) + __field(unsigned long, stable) + __array(char, verifier, NFS4_VERIFIER_SIZE) + ), + + TP_fast_assign( + const struct inode *inode = data->inode; + const struct nfs_inode *nfsi = NFS_I(inode); + const struct nfs_fh *fh = data->args.fh ? + data->args.fh : &nfsi->fh; + const struct nfs_writeverf *verf = data->res.verf; + + __entry->error = task->tk_status; + __entry->offset = data->args.offset; + __entry->stable = verf->committed; + memcpy(__entry->verifier, + &verf->verifier, + NFS4_VERIFIER_SIZE); + __entry->dev = inode->i_sb->s_dev; + __entry->fileid = nfsi->fileid; + __entry->fhandle = nfs_fhandle_hash(fh); + ), + + TP_printk( + "error=%d fileid=%02x:%02x:%llu fhandle=0x%08x " + "offset=%lld stable=%s verifier=%s", __entry->error, + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->fileid, + __entry->fhandle, + (long long)__entry->offset, + show_nfs_stable_how(__entry->stable), + show_nfs4_verifier(__entry->verifier) + ) +); + +#define nfs_show_direct_req_flags(v) \ + __print_flags(v, "|", \ + { NFS_ODIRECT_DO_COMMIT, "DO_COMMIT" }, \ + { NFS_ODIRECT_RESCHED_WRITES, "RESCHED_WRITES" }, \ + { NFS_ODIRECT_SHOULD_DIRTY, "SHOULD DIRTY" }, \ + { NFS_ODIRECT_DONE, "DONE" } ) + +DECLARE_EVENT_CLASS(nfs_direct_req_class, + TP_PROTO( + const struct nfs_direct_req *dreq + ), + + TP_ARGS(dreq), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(u64, fileid) + __field(u32, fhandle) + __field(loff_t, offset) + __field(ssize_t, count) + __field(ssize_t, error) + __field(int, flags) + ), + + TP_fast_assign( + const struct inode *inode = dreq->inode; + const struct nfs_inode *nfsi = NFS_I(inode); + const struct nfs_fh *fh = &nfsi->fh; + + __entry->dev = inode->i_sb->s_dev; + __entry->fileid = nfsi->fileid; + __entry->fhandle = nfs_fhandle_hash(fh); + __entry->offset = dreq->io_start; + __entry->count = dreq->count; + __entry->error = dreq->error; + __entry->flags = dreq->flags; + ), + + TP_printk( + "error=%zd fileid=%02x:%02x:%llu fhandle=0x%08x " + "offset=%lld count=%zd flags=%s", + __entry->error, MAJOR(__entry->dev), + MINOR(__entry->dev), + (unsigned long long)__entry->fileid, + __entry->fhandle, __entry->offset, + __entry->count, + nfs_show_direct_req_flags(__entry->flags) + ) +); + +#define DEFINE_NFS_DIRECT_REQ_EVENT(name) \ + DEFINE_EVENT(nfs_direct_req_class, name, \ + TP_PROTO( \ + const struct nfs_direct_req *dreq \ + ), \ + TP_ARGS(dreq)) + +DEFINE_NFS_DIRECT_REQ_EVENT(nfs_direct_commit_complete); +DEFINE_NFS_DIRECT_REQ_EVENT(nfs_direct_resched_write); +DEFINE_NFS_DIRECT_REQ_EVENT(nfs_direct_write_complete); +DEFINE_NFS_DIRECT_REQ_EVENT(nfs_direct_write_completion); +DEFINE_NFS_DIRECT_REQ_EVENT(nfs_direct_write_schedule_iovec); +DEFINE_NFS_DIRECT_REQ_EVENT(nfs_direct_write_reschedule_io); + +#if IS_ENABLED(CONFIG_NFS_LOCALIO) + +DECLARE_EVENT_CLASS(nfs_local_dio_class, + TP_PROTO( + const struct inode *inode, + loff_t offset, + ssize_t count, + const struct nfs_local_dio *local_dio + ), + TP_ARGS(inode, offset, count, local_dio), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(u64, fileid) + __field(u32, fhandle) + __field(loff_t, offset) + __field(ssize_t, count) + __field(u32, mem_align) + __field(u32, offset_align) + __field(loff_t, start) + __field(ssize_t, start_len) + __field(loff_t, middle) + __field(ssize_t, middle_len) + __field(loff_t, end) + __field(ssize_t, end_len) + ), + TP_fast_assign( + const struct nfs_inode *nfsi = NFS_I(inode); + const struct nfs_fh *fh = &nfsi->fh; + + __entry->dev = inode->i_sb->s_dev; + __entry->fileid = nfsi->fileid; + __entry->fhandle = nfs_fhandle_hash(fh); + __entry->offset = offset; + __entry->count = count; + __entry->mem_align = local_dio->mem_align; + __entry->offset_align = local_dio->offset_align; + __entry->start = offset; + __entry->start_len = local_dio->start_len; + __entry->middle = local_dio->middle_offset; + __entry->middle_len = local_dio->middle_len; + __entry->end = local_dio->end_offset; + __entry->end_len = local_dio->end_len; + ), + TP_printk("fileid=%02x:%02x:%llu fhandle=0x%08x " + "offset=%lld count=%zd " + "mem_align=%u offset_align=%u " + "start=%llu+%zd middle=%llu+%zd end=%llu+%zd", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->fileid, + __entry->fhandle, __entry->offset, __entry->count, + __entry->mem_align, __entry->offset_align, + __entry->start, __entry->start_len, + __entry->middle, __entry->middle_len, + __entry->end, __entry->end_len) +) + +#define DEFINE_NFS_LOCAL_DIO_EVENT(name) \ +DEFINE_EVENT(nfs_local_dio_class, nfs_local_dio_##name, \ + TP_PROTO(const struct inode *inode, \ + loff_t offset, \ + ssize_t count, \ + const struct nfs_local_dio *local_dio),\ + TP_ARGS(inode, offset, count, local_dio)) + +DEFINE_NFS_LOCAL_DIO_EVENT(read); +DEFINE_NFS_LOCAL_DIO_EVENT(write); +DEFINE_NFS_LOCAL_DIO_EVENT(misaligned); + +#endif /* CONFIG_NFS_LOCALIO */ + +TRACE_EVENT(nfs_fh_to_dentry, + TP_PROTO( + const struct super_block *sb, + const struct nfs_fh *fh, + u64 fileid, + int error + ), + + TP_ARGS(sb, fh, fileid, error), + + TP_STRUCT__entry( + __field(int, error) + __field(dev_t, dev) + __field(u32, fhandle) + __field(u64, fileid) + ), + + TP_fast_assign( + __entry->error = error; + __entry->dev = sb->s_dev; + __entry->fileid = fileid; + __entry->fhandle = nfs_fhandle_hash(fh); + ), + + TP_printk( + "error=%d fileid=%02x:%02x:%llu fhandle=0x%08x ", + __entry->error, + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->fileid, + __entry->fhandle + ) +); + +TRACE_EVENT(nfs_mount_assign, + TP_PROTO( + const char *option, + const char *value + ), + + TP_ARGS(option, value), + + TP_STRUCT__entry( + __string(option, option) + __string(value, value) + ), + + TP_fast_assign( + __assign_str(option); + __assign_str(value); + ), + + TP_printk("option %s=%s", + __get_str(option), __get_str(value) + ) +); + +TRACE_EVENT(nfs_mount_option, + TP_PROTO( + const struct fs_parameter *param + ), + + TP_ARGS(param), + + TP_STRUCT__entry( + __string(option, param->key) + ), + + TP_fast_assign( + __assign_str(option); + ), + + TP_printk("option %s", __get_str(option)) +); + +TRACE_EVENT(nfs_mount_path, + TP_PROTO( + const char *path + ), + + TP_ARGS(path), + + TP_STRUCT__entry( + __string(path, path) + ), + + TP_fast_assign( + __assign_str(path); + ), + + TP_printk("path='%s'", __get_str(path)) +); + +TRACE_EVENT(nfs_local_open_fh, + TP_PROTO( + const struct nfs_fh *fh, + fmode_t fmode, + int error + ), + + TP_ARGS(fh, fmode, error), + + TP_STRUCT__entry( + __field(int, error) + __field(u32, fhandle) + __field(unsigned int, fmode) + ), + + TP_fast_assign( + __entry->error = error; + __entry->fhandle = nfs_fhandle_hash(fh); + __entry->fmode = (__force unsigned int)fmode; + ), + + TP_printk( + "fhandle=0x%08x mode=%s result=%d", + __entry->fhandle, + show_fs_fmode_flags(__entry->fmode), + __entry->error + ) +); + +DECLARE_EVENT_CLASS(nfs_xdr_event, + TP_PROTO( + const struct xdr_stream *xdr, + int error + ), + + TP_ARGS(xdr, error), + + TP_STRUCT__entry( + __field(unsigned int, task_id) + __field(unsigned int, client_id) + __field(u32, xid) + __field(int, version) + __field(unsigned long, error) + __string(program, + xdr->rqst->rq_task->tk_client->cl_program->name) + __string(procedure, + xdr->rqst->rq_task->tk_msg.rpc_proc->p_name) + ), + + TP_fast_assign( + const struct rpc_rqst *rqstp = xdr->rqst; + const struct rpc_task *task = rqstp->rq_task; + + __entry->task_id = task->tk_pid; + __entry->client_id = task->tk_client->cl_clid; + __entry->xid = be32_to_cpu(rqstp->rq_xid); + __entry->version = task->tk_client->cl_vers; + __entry->error = error; + __assign_str(program); + __assign_str(procedure); + ), + + TP_printk(SUNRPC_TRACE_TASK_SPECIFIER + " xid=0x%08x %sv%d %s error=%ld (%s)", + __entry->task_id, __entry->client_id, __entry->xid, + __get_str(program), __entry->version, + __get_str(procedure), -__entry->error, + show_nfs_status(__entry->error) + ) +); +#define DEFINE_NFS_XDR_EVENT(name) \ + DEFINE_EVENT(nfs_xdr_event, name, \ + TP_PROTO( \ + const struct xdr_stream *xdr, \ + int error \ + ), \ + TP_ARGS(xdr, error)) +DEFINE_NFS_XDR_EVENT(nfs_xdr_status); +DEFINE_NFS_XDR_EVENT(nfs_xdr_bad_filehandle); + #endif /* _TRACE_NFS_H */ #undef TRACE_INCLUDE_PATH diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index de9066a92c0d..6e69ce43a13f 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * linux/fs/nfs/pagelist.c * @@ -16,28 +17,81 @@ #include <linux/nfs.h> #include <linux/nfs3.h> #include <linux/nfs4.h> -#include <linux/nfs_page.h> #include <linux/nfs_fs.h> +#include <linux/nfs_page.h> #include <linux/nfs_mount.h> #include <linux/export.h> +#include <linux/filelock.h> #include "internal.h" #include "pnfs.h" +#include "nfstrace.h" +#include "fscache.h" #define NFSDBG_FACILITY NFSDBG_PAGECACHE static struct kmem_cache *nfs_page_cachep; static const struct rpc_call_ops nfs_pgio_common_ops; +struct nfs_page_iter_page { + const struct nfs_page *req; + size_t count; +}; + +static void nfs_page_iter_page_init(struct nfs_page_iter_page *i, + const struct nfs_page *req) +{ + i->req = req; + i->count = 0; +} + +static void nfs_page_iter_page_advance(struct nfs_page_iter_page *i, size_t sz) +{ + const struct nfs_page *req = i->req; + size_t tmp = i->count + sz; + + i->count = (tmp < req->wb_bytes) ? tmp : req->wb_bytes; +} + +static struct page *nfs_page_iter_page_get(struct nfs_page_iter_page *i) +{ + const struct nfs_page *req = i->req; + struct page *page; + + if (i->count != req->wb_bytes) { + size_t base = i->count + req->wb_pgbase; + size_t len = PAGE_SIZE - offset_in_page(base); + + page = nfs_page_to_page(req, base); + nfs_page_iter_page_advance(i, len); + return page; + } + return NULL; +} + +static struct nfs_pgio_mirror * +nfs_pgio_get_mirror(struct nfs_pageio_descriptor *desc, u32 idx) +{ + if (desc->pg_ops->pg_get_mirror) + return desc->pg_ops->pg_get_mirror(desc, idx); + return &desc->pg_mirrors[0]; +} + struct nfs_pgio_mirror * nfs_pgio_current_mirror(struct nfs_pageio_descriptor *desc) { - return nfs_pgio_has_mirroring(desc) ? - &desc->pg_mirrors[desc->pg_mirror_idx] : - &desc->pg_mirrors[0]; + return nfs_pgio_get_mirror(desc, desc->pg_mirror_idx); } EXPORT_SYMBOL_GPL(nfs_pgio_current_mirror); +static u32 +nfs_pgio_set_current_mirror(struct nfs_pageio_descriptor *desc, u32 idx) +{ + if (desc->pg_ops->pg_set_mirror) + return desc->pg_ops->pg_set_mirror(desc, idx); + return desc->pg_mirror_idx; +} + void nfs_pgheader_init(struct nfs_pageio_descriptor *desc, struct nfs_pgio_header *hdr, void (*release)(struct nfs_pgio_header *hdr)) @@ -47,11 +101,12 @@ void nfs_pgheader_init(struct nfs_pageio_descriptor *desc, hdr->req = nfs_list_entry(mirror->pg_list.next); hdr->inode = desc->pg_inode; - hdr->cred = hdr->req->wb_context->cred; + hdr->cred = nfs_req_openctx(hdr->req)->cred; hdr->io_start = req_offset(hdr->req); hdr->good_bytes = mirror->pg_count; hdr->io_completion = desc->pg_io_completion; hdr->dreq = desc->pg_dreq; + nfs_netfs_set_pgio_header(hdr, desc); hdr->release = release; hdr->completion_ops = desc->pg_completion_ops; if (hdr->completion_ops->init_hdr) @@ -63,20 +118,21 @@ EXPORT_SYMBOL_GPL(nfs_pgheader_init); void nfs_set_pgio_error(struct nfs_pgio_header *hdr, int error, loff_t pos) { - spin_lock(&hdr->lock); - if (!test_and_set_bit(NFS_IOHDR_ERROR, &hdr->flags) - || pos < hdr->io_start + hdr->good_bytes) { + unsigned int new = pos - hdr->io_start; + + trace_nfs_pgio_error(hdr, error, pos); + if (hdr->good_bytes > new) { + hdr->good_bytes = new; clear_bit(NFS_IOHDR_EOF, &hdr->flags); - hdr->good_bytes = pos - hdr->io_start; - hdr->error = error; + if (!test_and_set_bit(NFS_IOHDR_ERROR, &hdr->flags)) + hdr->error = error; } - spin_unlock(&hdr->lock); } -static inline struct nfs_page * -nfs_page_alloc(void) +static inline struct nfs_page *nfs_page_alloc(void) { - struct nfs_page *p = kmem_cache_zalloc(nfs_page_cachep, GFP_NOIO); + struct nfs_page *p = + kmem_cache_zalloc(nfs_page_cachep, nfs_io_gfp_mask()); if (p) INIT_LIST_HEAD(&p->wb_list); return p; @@ -98,8 +154,8 @@ nfs_page_free(struct nfs_page *p) int nfs_iocounter_wait(struct nfs_lock_context *l_ctx) { - return wait_on_atomic_t(&l_ctx->io_count, nfs_wait_atomic_killable, - TASK_KILLABLE); + return wait_var_event_killable(&l_ctx->io_count, + !atomic_read(&l_ctx->io_count)); } /** @@ -132,86 +188,79 @@ nfs_async_iocounter_wait(struct rpc_task *task, struct nfs_lock_context *l_ctx) EXPORT_SYMBOL_GPL(nfs_async_iocounter_wait); /* - * nfs_page_group_lock - lock the head of the page group - * @req - request in group that is to be locked - * @nonblock - if true don't block waiting for lock - * - * this lock must be held if modifying the page group list + * nfs_page_set_headlock - set the request PG_HEADLOCK + * @req: request that is to be locked * - * return 0 on success, < 0 on error: -EDELAY if nonblocking or the - * result from wait_on_bit_lock + * this lock must be held when modifying req->wb_head * - * NOTE: calling with nonblock=false should always have set the - * lock bit (see fs/buffer.c and other uses of wait_on_bit_lock - * with TASK_UNINTERRUPTIBLE), so there is no need to check the result. + * return 0 on success, < 0 on error */ int -nfs_page_group_lock(struct nfs_page *req, bool nonblock) +nfs_page_set_headlock(struct nfs_page *req) { - struct nfs_page *head = req->wb_head; - - WARN_ON_ONCE(head != head->wb_head); - - if (!test_and_set_bit(PG_HEADLOCK, &head->wb_flags)) + if (!test_and_set_bit(PG_HEADLOCK, &req->wb_flags)) return 0; - if (!nonblock) { - set_bit(PG_CONTENDED1, &head->wb_flags); - smp_mb__after_atomic(); - return wait_on_bit_lock(&head->wb_flags, PG_HEADLOCK, + set_bit(PG_CONTENDED1, &req->wb_flags); + smp_mb__after_atomic(); + return wait_on_bit_lock(&req->wb_flags, PG_HEADLOCK, TASK_UNINTERRUPTIBLE); - } - - return -EAGAIN; } /* - * nfs_page_group_lock_wait - wait for the lock to clear, but don't grab it - * @req - a request in the group - * - * This is a blocking call to wait for the group lock to be cleared. + * nfs_page_clear_headlock - clear the request PG_HEADLOCK + * @req: request that is to be locked */ void -nfs_page_group_lock_wait(struct nfs_page *req) +nfs_page_clear_headlock(struct nfs_page *req) { - struct nfs_page *head = req->wb_head; + clear_bit_unlock(PG_HEADLOCK, &req->wb_flags); + smp_mb__after_atomic(); + if (!test_bit(PG_CONTENDED1, &req->wb_flags)) + return; + wake_up_bit(&req->wb_flags, PG_HEADLOCK); +} - WARN_ON_ONCE(head != head->wb_head); +/* + * nfs_page_group_lock - lock the head of the page group + * @req: request in group that is to be locked + * + * this lock must be held when traversing or modifying the page + * group list + * + * return 0 on success, < 0 on error + */ +int +nfs_page_group_lock(struct nfs_page *req) +{ + int ret; - if (!test_bit(PG_HEADLOCK, &head->wb_flags)) - return; - set_bit(PG_CONTENDED1, &head->wb_flags); - smp_mb__after_atomic(); - wait_on_bit(&head->wb_flags, PG_HEADLOCK, - TASK_UNINTERRUPTIBLE); + ret = nfs_page_set_headlock(req); + if (ret || req->wb_head == req) + return ret; + return nfs_page_set_headlock(req->wb_head); } /* * nfs_page_group_unlock - unlock the head of the page group - * @req - request in group that is to be unlocked + * @req: request in group that is to be unlocked */ void nfs_page_group_unlock(struct nfs_page *req) { - struct nfs_page *head = req->wb_head; - - WARN_ON_ONCE(head != head->wb_head); - - smp_mb__before_atomic(); - clear_bit(PG_HEADLOCK, &head->wb_flags); - smp_mb__after_atomic(); - if (!test_bit(PG_CONTENDED1, &head->wb_flags)) - return; - wake_up_bit(&head->wb_flags, PG_HEADLOCK); + if (req != req->wb_head) + nfs_page_clear_headlock(req->wb_head); + nfs_page_clear_headlock(req); } -/* - * nfs_page_group_sync_on_bit_locked +/** + * nfs_page_group_sync_on_bit_locked - Test if all requests have @bit set + * @req: request in page group + * @bit: PG_* bit that is used to sync page group * * must be called with page group lock held */ -static bool -nfs_page_group_sync_on_bit_locked(struct nfs_page *req, unsigned int bit) +bool nfs_page_group_sync_on_bit_locked(struct nfs_page *req, unsigned int bit) { struct nfs_page *head = req->wb_head; struct nfs_page *tmp; @@ -246,7 +295,7 @@ bool nfs_page_group_sync_on_bit(struct nfs_page *req, unsigned int bit) { bool ret; - nfs_page_group_lock(req, false); + nfs_page_group_lock(req); ret = nfs_page_group_sync_on_bit_locked(req, bit); nfs_page_group_unlock(req); @@ -285,12 +334,10 @@ nfs_page_group_init(struct nfs_page *req, struct nfs_page *prev) * has extra ref from the write/commit path to handle handoff * between write and commit lists. */ if (test_bit(PG_INODE_REF, &prev->wb_head->wb_flags)) { - inode = page_file_mapping(req->wb_page)->host; + inode = nfs_page_to_inode(req); set_bit(PG_INODE_REF, &req->wb_flags); kref_get(&req->wb_kref); - spin_lock(&inode->i_lock); - NFS_I(inode)->nrequests++; - spin_unlock(&inode->i_lock); + atomic_long_inc(&NFS_I(inode)->nrequests); } } } @@ -306,14 +353,11 @@ static void nfs_page_group_destroy(struct kref *kref) { struct nfs_page *req = container_of(kref, struct nfs_page, wb_kref); + struct nfs_page *head = req->wb_head; struct nfs_page *tmp, *next; - /* subrequests must release the ref on the head request */ - if (req->wb_head != req) - nfs_release_request(req->wb_head); - if (!nfs_page_group_sync_on_bit(req, PG_TEARDOWN)) - return; + goto out; tmp = req; do { @@ -324,27 +368,18 @@ nfs_page_group_destroy(struct kref *kref) nfs_free_request(tmp); tmp = next; } while (tmp != req); +out: + /* subrequests must release the ref on the head request */ + if (head != req) + nfs_release_request(head); } -/** - * nfs_create_request - Create an NFS read/write request. - * @ctx: open context to use - * @page: page to write - * @last: last nfs request created for this page group or NULL if head - * @offset: starting offset within the page for the write - * @count: number of bytes to read/write - * - * The page must be locked by the caller. This makes sure we never - * create two different requests for the same page. - * User should ensure it is safe to sleep in this function. - */ -struct nfs_page * -nfs_create_request(struct nfs_open_context *ctx, struct page *page, - struct nfs_page *last, unsigned int offset, - unsigned int count) +static struct nfs_page *nfs_page_create(struct nfs_lock_context *l_ctx, + unsigned int pgbase, pgoff_t index, + unsigned int offset, unsigned int count) { struct nfs_page *req; - struct nfs_lock_context *l_ctx; + struct nfs_open_context *ctx = l_ctx->open_context; if (test_bit(NFS_CONTEXT_BAD, &ctx->flags)) return ERR_PTR(-EBADF); @@ -353,44 +388,139 @@ nfs_create_request(struct nfs_open_context *ctx, struct page *page, if (req == NULL) return ERR_PTR(-ENOMEM); - /* get lock context early so we can deal with alloc failures */ - l_ctx = nfs_get_lock_context(ctx); - if (IS_ERR(l_ctx)) { - nfs_page_free(req); - return ERR_CAST(l_ctx); - } req->wb_lock_context = l_ctx; + refcount_inc(&l_ctx->count); atomic_inc(&l_ctx->io_count); /* Initialize the request struct. Initially, we assume a * long write-back delay. This will be adjusted in * update_nfs_request below if the region is not locked. */ - req->wb_page = page; - if (page) { - req->wb_index = page_index(page); - get_page(page); - } - req->wb_offset = offset; - req->wb_pgbase = offset; - req->wb_bytes = count; - req->wb_context = get_nfs_open_context(ctx); + req->wb_pgbase = pgbase; + req->wb_index = index; + req->wb_offset = offset; + req->wb_bytes = count; kref_init(&req->wb_kref); - nfs_page_group_init(req, last); + req->wb_nio = 0; return req; } +static void nfs_page_assign_folio(struct nfs_page *req, struct folio *folio) +{ + if (folio != NULL) { + req->wb_folio = folio; + folio_get(folio); + set_bit(PG_FOLIO, &req->wb_flags); + } +} + +static void nfs_page_assign_page(struct nfs_page *req, struct page *page) +{ + if (page != NULL) { + req->wb_page = page; + get_page(page); + } +} + +/** + * nfs_page_create_from_page - Create an NFS read/write request. + * @ctx: open context to use + * @page: page to write + * @pgbase: starting offset within the page for the write + * @offset: file offset for the write + * @count: number of bytes to read/write + * + * The page must be locked by the caller. This makes sure we never + * create two different requests for the same page. + * User should ensure it is safe to sleep in this function. + */ +struct nfs_page *nfs_page_create_from_page(struct nfs_open_context *ctx, + struct page *page, + unsigned int pgbase, loff_t offset, + unsigned int count) +{ + struct nfs_lock_context *l_ctx = nfs_get_lock_context(ctx); + struct nfs_page *ret; + + if (IS_ERR(l_ctx)) + return ERR_CAST(l_ctx); + ret = nfs_page_create(l_ctx, pgbase, offset >> PAGE_SHIFT, + offset_in_page(offset), count); + if (!IS_ERR(ret)) { + nfs_page_assign_page(ret, page); + nfs_page_group_init(ret, NULL); + } + nfs_put_lock_context(l_ctx); + return ret; +} + +/** + * nfs_page_create_from_folio - Create an NFS read/write request. + * @ctx: open context to use + * @folio: folio to write + * @offset: starting offset within the folio for the write + * @count: number of bytes to read/write + * + * The page must be locked by the caller. This makes sure we never + * create two different requests for the same page. + * User should ensure it is safe to sleep in this function. + */ +struct nfs_page *nfs_page_create_from_folio(struct nfs_open_context *ctx, + struct folio *folio, + unsigned int offset, + unsigned int count) +{ + struct nfs_lock_context *l_ctx = nfs_get_lock_context(ctx); + struct nfs_page *ret; + + if (IS_ERR(l_ctx)) + return ERR_CAST(l_ctx); + ret = nfs_page_create(l_ctx, offset, folio->index, offset, count); + if (!IS_ERR(ret)) { + nfs_page_assign_folio(ret, folio); + nfs_page_group_init(ret, NULL); + } + nfs_put_lock_context(l_ctx); + return ret; +} + +static struct nfs_page * +nfs_create_subreq(struct nfs_page *req, + unsigned int pgbase, + unsigned int offset, + unsigned int count) +{ + struct nfs_page *last; + struct nfs_page *ret; + struct folio *folio = nfs_page_to_folio(req); + struct page *page = nfs_page_to_page(req, pgbase); + + ret = nfs_page_create(req->wb_lock_context, pgbase, req->wb_index, + offset, count); + if (!IS_ERR(ret)) { + if (folio) + nfs_page_assign_folio(ret, folio); + else + nfs_page_assign_page(ret, page); + /* find the last request */ + for (last = req->wb_head; + last->wb_this_page != req->wb_head; + last = last->wb_this_page) + ; + + nfs_lock_request(ret); + nfs_page_group_init(ret, last); + ret->wb_nio = req->wb_nio; + } + return ret; +} + /** * nfs_unlock_request - Unlock request and wake up sleepers. - * @req: + * @req: pointer to request */ void nfs_unlock_request(struct nfs_page *req) { - if (!NFS_WBACK_BUSY(req)) { - printk(KERN_ERR "NFS: Invalid unlock attempted\n"); - BUG(); - } - smp_mb__before_atomic(); - clear_bit(PG_BUSY, &req->wb_flags); + clear_bit_unlock(PG_BUSY, &req->wb_flags); smp_mb__after_atomic(); if (!test_bit(PG_CONTENDED2, &req->wb_flags)) return; @@ -399,7 +529,7 @@ void nfs_unlock_request(struct nfs_page *req) /** * nfs_unlock_and_release_request - Unlock request and release the nfs_page - * @req: + * @req: pointer to request */ void nfs_unlock_and_release_request(struct nfs_page *req) { @@ -416,31 +546,33 @@ void nfs_unlock_and_release_request(struct nfs_page *req) */ static void nfs_clear_request(struct nfs_page *req) { + struct folio *folio = nfs_page_to_folio(req); struct page *page = req->wb_page; - struct nfs_open_context *ctx = req->wb_context; struct nfs_lock_context *l_ctx = req->wb_lock_context; + struct nfs_open_context *ctx; - if (page != NULL) { + if (folio != NULL) { + folio_put(folio); + req->wb_folio = NULL; + clear_bit(PG_FOLIO, &req->wb_flags); + } else if (page != NULL) { put_page(page); req->wb_page = NULL; } if (l_ctx != NULL) { if (atomic_dec_and_test(&l_ctx->io_count)) { - wake_up_atomic_t(&l_ctx->io_count); + wake_up_var(&l_ctx->io_count); + ctx = l_ctx->open_context; if (test_bit(NFS_CONTEXT_UNLOCK, &ctx->flags)) rpc_wake_up(&NFS_SERVER(d_inode(ctx->dentry))->uoc_rpcwaitq); } nfs_put_lock_context(l_ctx); req->wb_lock_context = NULL; } - if (ctx != NULL) { - put_nfs_open_context(ctx); - req->wb_context = NULL; - } } /** - * nfs_release_request - Release the count on an NFS read/write request + * nfs_free_request - Release the count on an NFS read/write request * @req: request to release * * Note: Should never be called with the spinlock held! @@ -465,24 +597,7 @@ void nfs_release_request(struct nfs_page *req) { kref_put(&req->wb_kref, nfs_page_group_destroy); } - -/** - * nfs_wait_on_request - Wait for a request to complete. - * @req: request to wait upon. - * - * Interruptible by fatal signals only. - * The user is responsible for holding a count on the request. - */ -int -nfs_wait_on_request(struct nfs_page *req) -{ - if (!test_bit(PG_BUSY, &req->wb_flags)) - return 0; - set_bit(PG_CONTENDED2, &req->wb_flags); - smp_mb__after_atomic(); - return wait_on_bit_io(&req->wb_flags, PG_BUSY, - TASK_UNINTERRUPTIBLE); -} +EXPORT_SYMBOL_GPL(nfs_release_request); /* * nfs_generic_pg_test - determine if requests can be coalesced @@ -490,7 +605,7 @@ nfs_wait_on_request(struct nfs_page *req) * @prev: previous request in desc, or NULL * @req: this request * - * Returns zero if @req can be coalesced into @desc, otherwise it returns + * Returns zero if @req cannot be coalesced into @desc, otherwise it returns * the size of the request. */ size_t nfs_generic_pg_test(struct nfs_pageio_descriptor *desc, @@ -523,23 +638,12 @@ struct nfs_pgio_header *nfs_pgio_header_alloc(const struct nfs_rw_ops *ops) if (hdr) { INIT_LIST_HEAD(&hdr->pages); - spin_lock_init(&hdr->lock); hdr->rw_ops = ops; } return hdr; } EXPORT_SYMBOL_GPL(nfs_pgio_header_alloc); -/* - * nfs_pgio_header_free - Free a read or write header - * @hdr: The header to free - */ -void nfs_pgio_header_free(struct nfs_pgio_header *hdr) -{ - hdr->rw_ops->rw_free_header(hdr); -} -EXPORT_SYMBOL_GPL(nfs_pgio_header_free); - /** * nfs_pgio_data_destroy - make @hdr suitable for reuse * @@ -548,26 +652,36 @@ EXPORT_SYMBOL_GPL(nfs_pgio_header_free); * * @hdr: A header that has had nfs_generic_pgio called */ -void nfs_pgio_data_destroy(struct nfs_pgio_header *hdr) +static void nfs_pgio_data_destroy(struct nfs_pgio_header *hdr) { if (hdr->args.context) put_nfs_open_context(hdr->args.context); if (hdr->page_array.pagevec != hdr->page_array.page_array) kfree(hdr->page_array.pagevec); } -EXPORT_SYMBOL_GPL(nfs_pgio_data_destroy); + +/* + * nfs_pgio_header_free - Free a read or write header + * @hdr: The header to free + */ +void nfs_pgio_header_free(struct nfs_pgio_header *hdr) +{ + nfs_pgio_data_destroy(hdr); + hdr->rw_ops->rw_free_header(hdr); +} +EXPORT_SYMBOL_GPL(nfs_pgio_header_free); /** * nfs_pgio_rpcsetup - Set up arguments for a pageio call * @hdr: The pageio hdr + * @pgbase: base * @count: Number of bytes to read - * @offset: Initial offset * @how: How to commit data (writes only) * @cinfo: Commit information for the call (writes only) */ -static void nfs_pgio_rpcsetup(struct nfs_pgio_header *hdr, - unsigned int count, unsigned int offset, - int how, struct nfs_commit_info *cinfo) +static void nfs_pgio_rpcsetup(struct nfs_pgio_header *hdr, unsigned int pgbase, + unsigned int count, int how, + struct nfs_commit_info *cinfo) { struct nfs_page *req = hdr->req; @@ -575,13 +689,13 @@ static void nfs_pgio_rpcsetup(struct nfs_pgio_header *hdr, * NB: take care not to mess about with hdr->commit et al. */ hdr->args.fh = NFS_FH(hdr->inode); - hdr->args.offset = req_offset(req) + offset; + hdr->args.offset = req_offset(req); /* pnfs_set_layoutcommit needs this */ hdr->mds_offset = hdr->args.offset; - hdr->args.pgbase = req->wb_pgbase + offset; + hdr->args.pgbase = pgbase; hdr->args.pages = hdr->page_array.pagevec; hdr->args.count = count; - hdr->args.context = get_nfs_open_context(req->wb_context); + hdr->args.context = get_nfs_open_context(nfs_req_openctx(req)); hdr->args.lock_context = req->wb_lock_context; hdr->args.stable = NFS_UNSTABLE; switch (how & (FLUSH_STABLE | FLUSH_COND_STABLE)) { @@ -590,12 +704,13 @@ static void nfs_pgio_rpcsetup(struct nfs_pgio_header *hdr, case FLUSH_COND_STABLE: if (nfs_reqs_to_commit(cinfo)) break; + fallthrough; default: hdr->args.stable = NFS_FILE_SYNC; } hdr->res.fattr = &hdr->fattr; - hdr->res.count = count; + hdr->res.count = 0; hdr->res.eof = 0; hdr->res.verf = &hdr->verf; nfs_fattr_init(&hdr->fattr); @@ -616,8 +731,9 @@ static void nfs_pgio_prepare(struct rpc_task *task, void *calldata) } int nfs_initiate_pgio(struct rpc_clnt *clnt, struct nfs_pgio_header *hdr, - struct rpc_cred *cred, const struct nfs_rpc_ops *rpc_ops, - const struct rpc_call_ops *call_ops, int how, int flags) + const struct cred *cred, const struct nfs_rpc_ops *rpc_ops, + const struct rpc_call_ops *call_ops, int how, int flags, + struct nfsd_file *localio) { struct rpc_task *task; struct rpc_message msg = { @@ -634,7 +750,9 @@ int nfs_initiate_pgio(struct rpc_clnt *clnt, struct nfs_pgio_header *hdr, .workqueue = nfsiod_workqueue, .flags = RPC_TASK_ASYNC | flags, }; - int ret = 0; + + if (nfs_server_capable(hdr->inode, NFS_CAP_MOVEABLE)) + task_setup_data.flags |= RPC_TASK_MOVEABLE; hdr->rw_ops->rw_initiate(hdr, &msg, rpc_ops, &task_setup_data, how); @@ -645,31 +763,25 @@ int nfs_initiate_pgio(struct rpc_clnt *clnt, struct nfs_pgio_header *hdr, hdr->args.count, (unsigned long long)hdr->args.offset); + if (localio) + return nfs_local_doio(NFS_SERVER(hdr->inode)->nfs_client, + localio, hdr, call_ops); + task = rpc_run_task(&task_setup_data); - if (IS_ERR(task)) { - ret = PTR_ERR(task); - goto out; - } - if (how & FLUSH_SYNC) { - ret = rpc_wait_for_completion_task(task); - if (ret == 0) - ret = task->tk_status; - } + if (IS_ERR(task)) + return PTR_ERR(task); rpc_put_task(task); -out: - return ret; + return 0; } EXPORT_SYMBOL_GPL(nfs_initiate_pgio); /** * nfs_pgio_error - Clean up from a pageio error - * @desc: IO descriptor * @hdr: pageio header */ static void nfs_pgio_error(struct nfs_pgio_header *hdr) { set_bit(NFS_IOHDR_REDO, &hdr->flags); - nfs_pgio_data_destroy(hdr); hdr->completion_ops->completion(hdr); } @@ -680,7 +792,6 @@ static void nfs_pgio_error(struct nfs_pgio_header *hdr) static void nfs_pgio_release(void *calldata) { struct nfs_pgio_header *hdr = calldata; - nfs_pgio_data_destroy(hdr); hdr->completion_ops->completion(hdr); } @@ -711,12 +822,8 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc, const struct nfs_pgio_completion_ops *compl_ops, const struct nfs_rw_ops *rw_ops, size_t bsize, - int io_flags, - gfp_t gfp_flags) + int io_flags) { - struct nfs_pgio_mirror *new; - int i; - desc->pg_moreio = 0; desc->pg_inode = inode; desc->pg_ops = pg_ops; @@ -727,28 +834,17 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc, desc->pg_lseg = NULL; desc->pg_io_completion = NULL; desc->pg_dreq = NULL; + nfs_netfs_reset_pageio_descriptor(desc); desc->pg_bsize = bsize; desc->pg_mirror_count = 1; desc->pg_mirror_idx = 0; - if (pg_ops->pg_get_mirror_count) { - /* until we have a request, we don't have an lseg and no - * idea how many mirrors there will be */ - new = kcalloc(NFS_PAGEIO_DESCRIPTOR_MIRROR_MAX, - sizeof(struct nfs_pgio_mirror), gfp_flags); - desc->pg_mirrors_dynamic = new; - desc->pg_mirrors = new; - - for (i = 0; i < NFS_PAGEIO_DESCRIPTOR_MIRROR_MAX; i++) - nfs_pageio_mirror_init(&desc->pg_mirrors[i], bsize); - } else { - desc->pg_mirrors_dynamic = NULL; - desc->pg_mirrors = desc->pg_mirrors_static; - nfs_pageio_mirror_init(&desc->pg_mirrors[0], bsize); - } + desc->pg_mirrors_dynamic = NULL; + desc->pg_mirrors = desc->pg_mirrors_static; + nfs_pageio_mirror_init(&desc->pg_mirrors[0], bsize); + desc->pg_maxretrans = 0; } -EXPORT_SYMBOL_GPL(nfs_pageio_init); /** * nfs_pgio_result - Basic pageio error handling @@ -760,9 +856,6 @@ static void nfs_pgio_result(struct rpc_task *task, void *calldata) struct nfs_pgio_header *hdr = calldata; struct inode *inode = hdr->inode; - dprintk("NFS: %s: %5u, (status %d)\n", __func__, - task->tk_pid, task->tk_status); - if (hdr->rw_ops->rw_done(task, hdr, inode) != 0) return; if (task->tk_status < 0) @@ -791,16 +884,15 @@ int nfs_generic_pgio(struct nfs_pageio_descriptor *desc, struct nfs_commit_info cinfo; struct nfs_page_array *pg_array = &hdr->page_array; unsigned int pagecount, pageused; - gfp_t gfp_flags = GFP_KERNEL; + unsigned int pg_base = offset_in_page(mirror->pg_base); + gfp_t gfp_flags = nfs_io_gfp_mask(); - pagecount = nfs_page_array_len(mirror->pg_base, mirror->pg_count); + pagecount = nfs_page_array_len(pg_base, mirror->pg_count); pg_array->npages = pagecount; if (pagecount <= ARRAY_SIZE(pg_array->page_array)) pg_array->pagevec = pg_array->page_array; else { - if (hdr->rw_mode == FMODE_WRITE) - gfp_flags = GFP_NOIO; pg_array->pagevec = kcalloc(pagecount, sizeof(struct page *), gfp_flags); if (!pg_array->pagevec) { pg_array->npages = 0; @@ -815,17 +907,26 @@ int nfs_generic_pgio(struct nfs_pageio_descriptor *desc, last_page = NULL; pageused = 0; while (!list_empty(head)) { - req = nfs_list_entry(head->next); - nfs_list_remove_request(req); - nfs_list_add_request(req, &hdr->pages); + struct nfs_page_iter_page i; + struct page *page; - if (!last_page || last_page != req->wb_page) { - pageused++; - if (pageused > pagecount) - break; - *pages++ = last_page = req->wb_page; + req = nfs_list_entry(head->next); + nfs_list_move_request(req, &hdr->pages); + + if (req->wb_pgbase == 0) + last_page = NULL; + + nfs_page_iter_page_init(&i, req); + while ((page = nfs_page_iter_page_get(&i)) != NULL) { + if (last_page != page) { + pageused++; + if (pageused > pagecount) + goto full; + *pages++ = last_page = page; + } } } +full: if (WARN_ON_ONCE(pageused != pagecount)) { nfs_pgio_error(hdr); desc->pg_error = -EINVAL; @@ -837,7 +938,8 @@ int nfs_generic_pgio(struct nfs_pageio_descriptor *desc, desc->pg_ioflags &= ~FLUSH_COND_STABLE; /* Set up the argument struct */ - nfs_pgio_rpcsetup(hdr, mirror->pg_count, 0, desc->pg_ioflags, &cinfo); + nfs_pgio_rpcsetup(hdr, pg_base, mirror->pg_count, desc->pg_ioflags, + &cinfo); desc->pg_rpc_callops = &nfs_pgio_common_ops; return 0; } @@ -847,6 +949,7 @@ static int nfs_generic_pg_pgios(struct nfs_pageio_descriptor *desc) { struct nfs_pgio_header *hdr; int ret; + unsigned short task_flags = 0; hdr = nfs_pgio_header_alloc(desc->pg_rw_ops); if (!hdr) { @@ -855,13 +958,45 @@ static int nfs_generic_pg_pgios(struct nfs_pageio_descriptor *desc) } nfs_pgheader_init(desc, hdr, nfs_pgio_header_free); ret = nfs_generic_pgio(desc, hdr); - if (ret == 0) + if (ret == 0) { + struct nfs_client *clp = NFS_SERVER(hdr->inode)->nfs_client; + + struct nfsd_file *localio = + nfs_local_open_fh(clp, hdr->cred, hdr->args.fh, + &hdr->args.context->nfl, + hdr->args.context->mode); + + if (NFS_SERVER(hdr->inode)->nfs_client->cl_minorversion) + task_flags = RPC_TASK_MOVEABLE; ret = nfs_initiate_pgio(NFS_CLIENT(hdr->inode), hdr, hdr->cred, NFS_PROTO(hdr->inode), desc->pg_rpc_callops, - desc->pg_ioflags, 0); + desc->pg_ioflags, + RPC_TASK_CRED_NOREF | task_flags, + localio); + } + return ret; +} + +static struct nfs_pgio_mirror * +nfs_pageio_alloc_mirrors(struct nfs_pageio_descriptor *desc, + unsigned int mirror_count) +{ + struct nfs_pgio_mirror *ret; + unsigned int i; + + kfree(desc->pg_mirrors_dynamic); + desc->pg_mirrors_dynamic = NULL; + if (mirror_count == 1) + return desc->pg_mirrors_static; + ret = kmalloc_array(mirror_count, sizeof(*ret), nfs_io_gfp_mask()); + if (ret != NULL) { + for (i = 0; i < mirror_count; i++) + nfs_pageio_mirror_init(&ret[i], desc->pg_bsize); + desc->pg_mirrors_dynamic = ret; + } return ret; } @@ -869,37 +1004,28 @@ static int nfs_generic_pg_pgios(struct nfs_pageio_descriptor *desc) * nfs_pageio_setup_mirroring - determine if mirroring is to be used * by calling the pg_get_mirror_count op */ -static int nfs_pageio_setup_mirroring(struct nfs_pageio_descriptor *pgio, +static void nfs_pageio_setup_mirroring(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) { - int mirror_count = 1; - - if (!pgio->pg_ops->pg_get_mirror_count) - return 0; - - mirror_count = pgio->pg_ops->pg_get_mirror_count(pgio, req); - - if (pgio->pg_error < 0) - return pgio->pg_error; + unsigned int mirror_count = 1; - if (!mirror_count || mirror_count > NFS_PAGEIO_DESCRIPTOR_MIRROR_MAX) - return -EINVAL; + if (pgio->pg_ops->pg_get_mirror_count) + mirror_count = pgio->pg_ops->pg_get_mirror_count(pgio, req); + if (mirror_count == pgio->pg_mirror_count || pgio->pg_error < 0) + return; - if (WARN_ON_ONCE(!pgio->pg_mirrors_dynamic)) - return -EINVAL; + if (!mirror_count || mirror_count > NFS_PAGEIO_DESCRIPTOR_MIRROR_MAX) { + pgio->pg_error = -EINVAL; + return; + } + pgio->pg_mirrors = nfs_pageio_alloc_mirrors(pgio, mirror_count); + if (pgio->pg_mirrors == NULL) { + pgio->pg_error = -ENOMEM; + pgio->pg_mirrors = pgio->pg_mirrors_static; + mirror_count = 1; + } pgio->pg_mirror_count = mirror_count; - - return 0; -} - -/* - * nfs_pageio_stop_mirroring - stop using mirroring (set mirror count to 1) - */ -void nfs_pageio_stop_mirroring(struct nfs_pageio_descriptor *pgio) -{ - pgio->pg_mirror_count = 1; - pgio->pg_mirror_idx = 0; } static void nfs_pageio_cleanup_mirroring(struct nfs_pageio_descriptor *pgio) @@ -917,50 +1043,56 @@ static bool nfs_match_lock_context(const struct nfs_lock_context *l1, return l1->lockowner == l2->lockowner; } +static bool nfs_page_is_contiguous(const struct nfs_page *prev, + const struct nfs_page *req) +{ + size_t prev_end = prev->wb_pgbase + prev->wb_bytes; + + if (req_offset(req) != req_offset(prev) + prev->wb_bytes) + return false; + if (req->wb_pgbase == 0) + return prev_end == nfs_page_max_length(prev); + if (req->wb_pgbase == prev_end) { + struct folio *folio = nfs_page_to_folio(req); + if (folio) + return folio == nfs_page_to_folio(prev); + return req->wb_page == prev->wb_page; + } + return false; +} + /** - * nfs_can_coalesce_requests - test two requests for compatibility + * nfs_coalesce_size - test two requests for compatibility * @prev: pointer to nfs_page * @req: pointer to nfs_page + * @pgio: pointer to nfs_pagio_descriptor * * The nfs_page structures 'prev' and 'req' are compared to ensure that the * page data area they describe is contiguous, and that their RPC * credentials, NFSv4 open state, and lockowners are the same. * - * Return 'true' if this is the case, else return 'false'. + * Returns size of the request that can be coalesced */ -static bool nfs_can_coalesce_requests(struct nfs_page *prev, +static unsigned int nfs_coalesce_size(struct nfs_page *prev, struct nfs_page *req, struct nfs_pageio_descriptor *pgio) { - size_t size; struct file_lock_context *flctx; if (prev) { - if (!nfs_match_open_context(req->wb_context, prev->wb_context)) - return false; - flctx = d_inode(req->wb_context->dentry)->i_flctx; + if (!nfs_match_open_context(nfs_req_openctx(req), nfs_req_openctx(prev))) + return 0; + flctx = locks_inode_context(d_inode(nfs_req_openctx(req)->dentry)); if (flctx != NULL && !(list_empty_careful(&flctx->flc_posix) && list_empty_careful(&flctx->flc_flock)) && !nfs_match_lock_context(req->wb_lock_context, prev->wb_lock_context)) - return false; - if (req_offset(req) != req_offset(prev) + prev->wb_bytes) - return false; - if (req->wb_page == prev->wb_page) { - if (req->wb_pgbase != prev->wb_pgbase + prev->wb_bytes) - return false; - } else { - if (req->wb_pgbase != 0 || - prev->wb_pgbase + prev->wb_bytes != PAGE_SIZE) - return false; - } + return 0; + if (!nfs_page_is_contiguous(prev, req)) + return 0; } - size = pgio->pg_ops->pg_test(pgio, prev, req); - WARN_ON_ONCE(size > req->wb_bytes); - if (size && size < req->wb_bytes) - req->wb_bytes = size; - return size > 0; + return pgio->pg_ops->pg_test(pgio, prev, req); } /** @@ -968,31 +1100,42 @@ static bool nfs_can_coalesce_requests(struct nfs_page *prev, * @desc: destination io descriptor * @req: request * - * Returns true if the request 'req' was successfully coalesced into the - * existing list of pages 'desc'. + * If the request 'req' was successfully coalesced into the existing list + * of pages 'desc', it returns the size of req. */ -static int nfs_pageio_do_add_request(struct nfs_pageio_descriptor *desc, - struct nfs_page *req) +static unsigned int +nfs_pageio_do_add_request(struct nfs_pageio_descriptor *desc, + struct nfs_page *req) { struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc); - struct nfs_page *prev = NULL; + unsigned int size; - if (mirror->pg_count != 0) { - prev = nfs_list_entry(mirror->pg_list.prev); - } else { + if (list_empty(&mirror->pg_list)) { if (desc->pg_ops->pg_init) desc->pg_ops->pg_init(desc, req); if (desc->pg_error < 0) return 0; mirror->pg_base = req->wb_pgbase; - } - if (!nfs_can_coalesce_requests(prev, req, desc)) + mirror->pg_count = 0; + mirror->pg_recoalesce = 0; + } else + prev = nfs_list_entry(mirror->pg_list.prev); + + if (desc->pg_maxretrans && req->wb_nio > desc->pg_maxretrans) { + if (NFS_SERVER(desc->pg_inode)->flags & NFS_MOUNT_SOFTERR) + desc->pg_error = -ETIMEDOUT; + else + desc->pg_error = -EIO; return 0; - nfs_list_remove_request(req); - nfs_list_add_request(req, &mirror->pg_list); + } + + size = nfs_coalesce_size(prev, req, desc); + if (size < req->wb_bytes) + return size; + nfs_list_move_request(req, &mirror->pg_list); mirror->pg_count += req->wb_bytes; - return 1; + return req->wb_bytes; } /* @@ -1002,27 +1145,33 @@ static void nfs_pageio_doio(struct nfs_pageio_descriptor *desc) { struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc); - if (!list_empty(&mirror->pg_list)) { int error = desc->pg_ops->pg_doio(desc); if (error < 0) desc->pg_error = error; - else + if (list_empty(&mirror->pg_list)) mirror->pg_bytes_written += mirror->pg_count; } - if (list_empty(&mirror->pg_list)) { - mirror->pg_count = 0; - mirror->pg_base = 0; - } +} + +static void +nfs_pageio_cleanup_request(struct nfs_pageio_descriptor *desc, + struct nfs_page *req) +{ + LIST_HEAD(head); + + nfs_list_move_request(req, &head); + desc->pg_completion_ops->error_cleanup(&head, desc->pg_error); } /** - * nfs_pageio_add_request - Attempt to coalesce a request into a page list. + * __nfs_pageio_add_request - Attempt to coalesce a request into a page list. * @desc: destination io descriptor * @req: request * * This may split a request into subrequests which are all part of the - * same page group. + * same page group. If so, it will submit @req as the last one, to ensure + * the pointer to @req is still valid in case of failure. * * Returns true if the request 'req' was successfully coalesced into the * existing list of pages 'desc'. @@ -1031,57 +1180,50 @@ static int __nfs_pageio_add_request(struct nfs_pageio_descriptor *desc, struct nfs_page *req) { struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc); - struct nfs_page *subreq; - unsigned int bytes_left = 0; - unsigned int offset, pgbase; + unsigned int size, subreq_size; - nfs_page_group_lock(req, false); + nfs_page_group_lock(req); subreq = req; - bytes_left = subreq->wb_bytes; - offset = subreq->wb_offset; - pgbase = subreq->wb_pgbase; - - do { - if (!nfs_pageio_do_add_request(desc, subreq)) { - /* make sure pg_test call(s) did nothing */ - WARN_ON_ONCE(subreq->wb_bytes != bytes_left); - WARN_ON_ONCE(subreq->wb_offset != offset); - WARN_ON_ONCE(subreq->wb_pgbase != pgbase); - + subreq_size = subreq->wb_bytes; + for(;;) { + size = nfs_pageio_do_add_request(desc, subreq); + if (size == subreq_size) { + /* We successfully submitted a request */ + if (subreq == req) + break; + req->wb_pgbase += size; + req->wb_bytes -= size; + req->wb_offset += size; + subreq_size = req->wb_bytes; + subreq = req; + continue; + } + if (WARN_ON_ONCE(subreq != req)) { + nfs_page_group_unlock(req); + nfs_pageio_cleanup_request(desc, subreq); + subreq = req; + subreq_size = req->wb_bytes; + nfs_page_group_lock(req); + } + if (!size) { + /* Can't coalesce any more, so do I/O */ nfs_page_group_unlock(req); desc->pg_moreio = 1; nfs_pageio_doio(desc); - if (desc->pg_error < 0) - return 0; - if (mirror->pg_recoalesce) + if (desc->pg_error < 0 || mirror->pg_recoalesce) return 0; /* retry add_request for this subreq */ - nfs_page_group_lock(req, false); + nfs_page_group_lock(req); continue; } - - /* check for buggy pg_test call(s) */ - WARN_ON_ONCE(subreq->wb_bytes + subreq->wb_pgbase > PAGE_SIZE); - WARN_ON_ONCE(subreq->wb_bytes > bytes_left); - WARN_ON_ONCE(subreq->wb_bytes == 0); - - bytes_left -= subreq->wb_bytes; - offset += subreq->wb_bytes; - pgbase += subreq->wb_bytes; - - if (bytes_left) { - subreq = nfs_create_request(req->wb_context, - req->wb_page, - subreq, pgbase, bytes_left); - if (IS_ERR(subreq)) - goto err_ptr; - nfs_lock_request(subreq); - subreq->wb_offset = offset; - subreq->wb_index = req->wb_index; - } - } while (bytes_left > 0); + subreq = nfs_create_subreq(req, req->wb_pgbase, + req->wb_offset, size); + if (IS_ERR(subreq)) + goto err_ptr; + subreq_size = size; + } nfs_page_group_unlock(req); return 1; @@ -1098,16 +1240,12 @@ static int nfs_do_recoalesce(struct nfs_pageio_descriptor *desc) do { list_splice_init(&mirror->pg_list, &head); - mirror->pg_bytes_written -= mirror->pg_count; - mirror->pg_count = 0; - mirror->pg_base = 0; mirror->pg_recoalesce = 0; while (!list_empty(&head)) { struct nfs_page *req; req = list_first_entry(&head, struct nfs_page, wb_list); - nfs_list_remove_request(req); if (__nfs_pageio_add_request(desc, req)) continue; if (desc->pg_error < 0) { @@ -1138,12 +1276,27 @@ static int nfs_pageio_add_request_mirror(struct nfs_pageio_descriptor *desc, return ret; } +static void nfs_pageio_error_cleanup(struct nfs_pageio_descriptor *desc) +{ + u32 midx; + struct nfs_pgio_mirror *mirror; + + if (!desc->pg_error) + return; + + for (midx = 0; midx < desc->pg_mirror_count; midx++) { + mirror = nfs_pgio_get_mirror(desc, midx); + desc->pg_completion_ops->error_cleanup(&mirror->pg_list, + desc->pg_error); + } +} + int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc, struct nfs_page *req) { u32 midx; unsigned int pgbase, offset, bytes; - struct nfs_page *dupreq, *lastreq; + struct nfs_page *dupreq; pgbase = req->wb_pgbase; offset = req->wb_offset; @@ -1153,60 +1306,34 @@ int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc, if (desc->pg_error < 0) goto out_failed; - for (midx = 0; midx < desc->pg_mirror_count; midx++) { - if (midx) { - nfs_page_group_lock(req, false); - - /* find the last request */ - for (lastreq = req->wb_head; - lastreq->wb_this_page != req->wb_head; - lastreq = lastreq->wb_this_page) - ; - - dupreq = nfs_create_request(req->wb_context, - req->wb_page, lastreq, pgbase, bytes); - - if (IS_ERR(dupreq)) { - nfs_page_group_unlock(req); - desc->pg_error = PTR_ERR(dupreq); - goto out_failed; - } + /* Create the mirror instances first, and fire them off */ + for (midx = 1; midx < desc->pg_mirror_count; midx++) { + nfs_page_group_lock(req); - nfs_lock_request(dupreq); - nfs_page_group_unlock(req); - dupreq->wb_offset = offset; - dupreq->wb_index = req->wb_index; - } else - dupreq = req; + dupreq = nfs_create_subreq(req, + pgbase, offset, bytes); - if (nfs_pgio_has_mirroring(desc)) - desc->pg_mirror_idx = midx; - if (!nfs_pageio_add_request_mirror(desc, dupreq)) + nfs_page_group_unlock(req); + if (IS_ERR(dupreq)) { + desc->pg_error = PTR_ERR(dupreq); goto out_failed; + } + + nfs_pgio_set_current_mirror(desc, midx); + if (!nfs_pageio_add_request_mirror(desc, dupreq)) + goto out_cleanup_subreq; } + nfs_pgio_set_current_mirror(desc, 0); + if (!nfs_pageio_add_request_mirror(desc, req)) + goto out_failed; + return 1; +out_cleanup_subreq: + nfs_pageio_cleanup_request(desc, dupreq); out_failed: - /* - * We might have failed before sending any reqs over wire. - * Clean up rest of the reqs in mirror pg_list. - */ - if (desc->pg_error) { - struct nfs_pgio_mirror *mirror; - void (*func)(struct list_head *); - - /* remember fatal errors */ - if (nfs_error_is_fatal(desc->pg_error)) - mapping_set_error(desc->pg_inode->i_mapping, - desc->pg_error); - - func = desc->pg_completion_ops->error_cleanup; - for (midx = 0; midx < desc->pg_mirror_count; midx++) { - mirror = &desc->pg_mirrors[midx]; - func(&mirror->pg_list); - } - } + nfs_pageio_error_cleanup(desc); return 0; } @@ -1219,19 +1346,20 @@ out_failed: static void nfs_pageio_complete_mirror(struct nfs_pageio_descriptor *desc, u32 mirror_idx) { - struct nfs_pgio_mirror *mirror = &desc->pg_mirrors[mirror_idx]; - u32 restore_idx = desc->pg_mirror_idx; + struct nfs_pgio_mirror *mirror; + u32 restore_idx; + + restore_idx = nfs_pgio_set_current_mirror(desc, mirror_idx); + mirror = nfs_pgio_current_mirror(desc); - if (nfs_pgio_has_mirroring(desc)) - desc->pg_mirror_idx = mirror_idx; for (;;) { nfs_pageio_doio(desc); - if (!mirror->pg_recoalesce) + if (desc->pg_error < 0 || !mirror->pg_recoalesce) break; if (!nfs_do_recoalesce(desc)) break; } - desc->pg_mirror_idx = restore_idx; + nfs_pgio_set_current_mirror(desc, restore_idx); } /* @@ -1247,21 +1375,24 @@ static void nfs_pageio_complete_mirror(struct nfs_pageio_descriptor *desc, int nfs_pageio_resend(struct nfs_pageio_descriptor *desc, struct nfs_pgio_header *hdr) { - LIST_HEAD(failed); + LIST_HEAD(pages); desc->pg_io_completion = hdr->io_completion; desc->pg_dreq = hdr->dreq; - while (!list_empty(&hdr->pages)) { - struct nfs_page *req = nfs_list_entry(hdr->pages.next); + nfs_netfs_set_pageio_descriptor(desc, hdr); + list_splice_init(&hdr->pages, &pages); + while (!list_empty(&pages)) { + struct nfs_page *req = nfs_list_entry(pages.next); - nfs_list_remove_request(req); if (!nfs_pageio_add_request(desc, req)) - nfs_list_add_request(req, &failed); + break; } nfs_pageio_complete(desc); - if (!list_empty(&failed)) { - list_move(&failed, &hdr->pages); - return desc->pg_error < 0 ? desc->pg_error : -EIO; + if (!list_empty(&pages)) { + int err = desc->pg_error < 0 ? desc->pg_error : -EIO; + hdr->completion_ops->error_cleanup(&pages, err); + nfs_set_pgio_error(hdr, err, hdr->io_start); + return err; } return 0; } @@ -1278,6 +1409,8 @@ void nfs_pageio_complete(struct nfs_pageio_descriptor *desc) for (midx = 0; midx < desc->pg_mirror_count; midx++) nfs_pageio_complete_mirror(desc, midx); + if (desc->pg_error < 0) + nfs_pageio_error_cleanup(desc); if (desc->pg_ops->pg_cleanup) desc->pg_ops->pg_cleanup(desc); nfs_pageio_cleanup_mirroring(desc); @@ -1298,20 +1431,38 @@ void nfs_pageio_cond_complete(struct nfs_pageio_descriptor *desc, pgoff_t index) { struct nfs_pgio_mirror *mirror; struct nfs_page *prev; + struct folio *folio; u32 midx; for (midx = 0; midx < desc->pg_mirror_count; midx++) { - mirror = &desc->pg_mirrors[midx]; + mirror = nfs_pgio_get_mirror(desc, midx); if (!list_empty(&mirror->pg_list)) { prev = nfs_list_entry(mirror->pg_list.prev); - if (index != prev->wb_index + 1) { - nfs_pageio_complete(desc); - break; - } + folio = nfs_page_to_folio(prev); + if (folio) { + if (index == folio_next_index(folio)) + continue; + } else if (index == prev->wb_index + 1) + continue; + /* + * We will submit more requests after these. Indicate + * this to the underlying layers. + */ + desc->pg_moreio = 1; + nfs_pageio_complete(desc); + break; } } } +/* + * nfs_pageio_stop_mirroring - stop using mirroring (set mirror count to 1) + */ +void nfs_pageio_stop_mirroring(struct nfs_pageio_descriptor *pgio) +{ + nfs_pageio_complete(pgio); +} + int __init nfs_init_nfspagecache(void) { nfs_page_cachep = kmem_cache_create("nfs_page", diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index c383d0913b54..f157d43d1312 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -37,6 +37,7 @@ #include "nfs4trace.h" #include "delegation.h" #include "nfs42.h" +#include "nfs4_fs.h" #define NFSDBG_FACILITY NFSDBG_PNFS #define PNFS_LAYOUTGET_RETRY_TIMEOUT (120*HZ) @@ -60,6 +61,7 @@ static void pnfs_free_returned_lsegs(struct pnfs_layout_hdr *lo, u32 seq); static bool pnfs_lseg_dec_and_remove_zero(struct pnfs_layout_segment *lseg, struct list_head *tmp_list); +static int pnfs_layout_return_on_reboot(struct pnfs_layout_hdr *lo); /* Return the registered pnfs layout driver module matching given id */ static struct pnfs_layoutdriver_type * @@ -91,6 +93,17 @@ find_pnfs_driver(u32 id) return local; } +const struct pnfs_layoutdriver_type *pnfs_find_layoutdriver(u32 id) +{ + return find_pnfs_driver(id); +} + +void pnfs_put_layoutdriver(const struct pnfs_layoutdriver_type *ld) +{ + if (ld) + module_put(ld->owner); +} + void unset_pnfs_layoutdriver(struct nfs_server *nfss) { @@ -251,7 +264,7 @@ EXPORT_SYMBOL_GPL(pnfs_unregister_layoutdriver); void pnfs_get_layout_hdr(struct pnfs_layout_hdr *lo) { - atomic_inc(&lo->plh_refcount); + refcount_inc(&lo->plh_refcount); } static struct pnfs_layout_hdr * @@ -267,14 +280,14 @@ pnfs_free_layout_hdr(struct pnfs_layout_hdr *lo) struct nfs_server *server = NFS_SERVER(lo->plh_inode); struct pnfs_layoutdriver_type *ld = server->pnfs_curr_ld; - if (!list_empty(&lo->plh_layouts)) { + if (test_and_clear_bit(NFS_LAYOUT_HASHED, &lo->plh_flags)) { struct nfs_client *clp = server->nfs_client; spin_lock(&clp->cl_lock); - list_del_init(&lo->plh_layouts); + list_del_rcu(&lo->plh_layouts); spin_unlock(&clp->cl_lock); } - put_rpccred(lo->plh_lc_cred); + put_cred(lo->plh_lc_cred); return ld->free_layout_hdr(lo); } @@ -292,19 +305,50 @@ pnfs_detach_layout_hdr(struct pnfs_layout_hdr *lo) void pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo) { - struct inode *inode = lo->plh_inode; + struct inode *inode; + if (!lo) + return; + inode = lo->plh_inode; pnfs_layoutreturn_before_put_layout_hdr(lo); - if (atomic_dec_and_lock(&lo->plh_refcount, &inode->i_lock)) { + if (refcount_dec_and_lock(&lo->plh_refcount, &inode->i_lock)) { if (!list_empty(&lo->plh_segs)) WARN_ONCE(1, "NFS: BUG unfreed layout segments.\n"); pnfs_detach_layout_hdr(lo); + /* Notify pnfs_destroy_layout_final() that we're done */ + if (inode_state_read(inode) & (I_FREEING | I_CLEAR)) + wake_up_var_locked(lo, &inode->i_lock); spin_unlock(&inode->i_lock); pnfs_free_layout_hdr(lo); } } +static struct inode * +pnfs_grab_inode_layout_hdr(struct pnfs_layout_hdr *lo) +{ + struct inode *inode = igrab(lo->plh_inode); + if (inode) + return inode; + set_bit(NFS_LAYOUT_INODE_FREEING, &lo->plh_flags); + return NULL; +} + +/* + * Compare 2 layout stateid sequence ids, to see which is newer, + * taking into account wraparound issues. + */ +static bool pnfs_seqid_is_newer(u32 s1, u32 s2) +{ + return (s32)(s1 - s2) > 0; +} + +static void pnfs_barrier_update(struct pnfs_layout_hdr *lo, u32 newseq) +{ + if (pnfs_seqid_is_newer(newseq, lo->plh_barrier) || !lo->plh_barrier) + lo->plh_barrier = newseq; +} + static void pnfs_set_plh_return_info(struct pnfs_layout_hdr *lo, enum pnfs_iomode iomode, u32 seq) @@ -313,10 +357,15 @@ pnfs_set_plh_return_info(struct pnfs_layout_hdr *lo, enum pnfs_iomode iomode, iomode = IOMODE_ANY; lo->plh_return_iomode = iomode; set_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags); - if (seq != 0) { - WARN_ON_ONCE(lo->plh_return_seq != 0 && lo->plh_return_seq != seq); + /* + * We must set lo->plh_return_seq to avoid livelocks with + * pnfs_layout_need_return() + */ + if (seq == 0) + seq = be32_to_cpu(lo->plh_stateid.seqid); + if (!lo->plh_return_seq || pnfs_seqid_is_newer(seq, lo->plh_return_seq)) lo->plh_return_seq = seq; - } + pnfs_barrier_update(lo, seq); } static void @@ -355,6 +404,48 @@ pnfs_clear_lseg_state(struct pnfs_layout_segment *lseg, } /* + * Update the seqid of a layout stateid after receiving + * NFS4ERR_OLD_STATEID + */ +bool nfs4_layout_refresh_old_stateid(nfs4_stateid *dst, + struct pnfs_layout_range *dst_range, + struct inode *inode) +{ + struct pnfs_layout_hdr *lo; + struct pnfs_layout_range range = { + .iomode = IOMODE_ANY, + .offset = 0, + .length = NFS4_MAX_UINT64, + }; + bool ret = false; + LIST_HEAD(head); + int err; + + spin_lock(&inode->i_lock); + lo = NFS_I(inode)->layout; + if (lo && pnfs_layout_is_valid(lo) && + nfs4_stateid_match_other(dst, &lo->plh_stateid)) { + /* Is our call using the most recent seqid? If so, bump it */ + if (!nfs4_stateid_is_newer(&lo->plh_stateid, dst)) { + nfs4_stateid_seqid_inc(dst); + ret = true; + goto out; + } + /* Try to update the seqid to the most recent */ + err = pnfs_mark_matching_lsegs_return(lo, &head, &range, 0); + if (err != -EBUSY) { + dst->seqid = lo->plh_stateid.seqid; + *dst_range = range; + ret = true; + } + } +out: + spin_unlock(&inode->i_lock); + pnfs_free_lseg_list(&head); + return ret; +} + +/* * Mark a pnfs_layout_hdr and all associated layout segments as invalid * * In order to continue using the pnfs_layout_hdr, a full recovery @@ -377,12 +468,25 @@ pnfs_mark_layout_stateid_invalid(struct pnfs_layout_hdr *lo, pnfs_clear_lseg_state(lseg, lseg_list); pnfs_clear_layoutreturn_info(lo); pnfs_free_returned_lsegs(lo, lseg_list, &range, 0); + set_bit(NFS_LAYOUT_DRAIN, &lo->plh_flags); if (test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags) && !test_and_set_bit(NFS_LAYOUT_RETURN_LOCK, &lo->plh_flags)) pnfs_clear_layoutreturn_waitbit(lo); return !list_empty(&lo->plh_segs); } +static int pnfs_mark_layout_stateid_return(struct pnfs_layout_hdr *lo, + struct list_head *lseg_list, + enum pnfs_iomode iomode, u32 seq) +{ + struct pnfs_layout_range range = { + .iomode = iomode, + .length = NFS4_MAX_UINT64, + }; + + return pnfs_mark_matching_lsegs_return(lo, lseg_list, &range, seq); +} + static int pnfs_iomode_to_fail_bit(u32 iomode) { @@ -395,14 +499,14 @@ pnfs_layout_set_fail_bit(struct pnfs_layout_hdr *lo, int fail_bit) { lo->plh_retry_timestamp = jiffies; if (!test_and_set_bit(fail_bit, &lo->plh_flags)) - atomic_inc(&lo->plh_refcount); + refcount_inc(&lo->plh_refcount); } static void pnfs_layout_clear_fail_bit(struct pnfs_layout_hdr *lo, int fail_bit) { if (test_and_clear_bit(fail_bit, &lo->plh_flags)) - atomic_dec(&lo->plh_refcount); + refcount_dec(&lo->plh_refcount); } static void @@ -418,7 +522,7 @@ pnfs_layout_io_set_failed(struct pnfs_layout_hdr *lo, u32 iomode) spin_lock(&inode->i_lock); pnfs_layout_set_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode)); - pnfs_mark_matching_lsegs_invalid(lo, &head, &range, 0); + pnfs_mark_matching_lsegs_return(lo, &head, &range, 0); spin_unlock(&inode->i_lock); pnfs_free_lseg_list(&head); dprintk("%s Setting layout IOMODE_%s fail bit\n", __func__, @@ -450,7 +554,8 @@ pnfs_init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg, { INIT_LIST_HEAD(&lseg->pls_list); INIT_LIST_HEAD(&lseg->pls_lc_list); - atomic_set(&lseg->pls_refcount, 1); + INIT_LIST_HEAD(&lseg->pls_commits); + refcount_set(&lseg->pls_refcount, 1); set_bit(NFS_LSEG_VALID, &lseg->pls_flags); lseg->pls_layout = lo; lseg->pls_range = *range; @@ -472,7 +577,7 @@ pnfs_layout_remove_lseg(struct pnfs_layout_hdr *lo, WARN_ON(test_bit(NFS_LSEG_VALID, &lseg->pls_flags)); list_del_init(&lseg->pls_list); /* Matched by pnfs_get_layout_hdr in pnfs_layout_insert_lseg */ - atomic_dec(&lo->plh_refcount); + refcount_dec(&lo->plh_refcount); if (test_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags)) return; if (list_empty(&lo->plh_segs) && @@ -507,17 +612,13 @@ pnfs_put_lseg(struct pnfs_layout_segment *lseg) return; dprintk("%s: lseg %p ref %d valid %d\n", __func__, lseg, - atomic_read(&lseg->pls_refcount), + refcount_read(&lseg->pls_refcount), test_bit(NFS_LSEG_VALID, &lseg->pls_flags)); lo = lseg->pls_layout; inode = lo->plh_inode; - if (atomic_dec_and_lock(&lseg->pls_refcount, &inode->i_lock)) { - if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags)) { - spin_unlock(&inode->i_lock); - return; - } + if (refcount_dec_and_lock(&lseg->pls_refcount, &inode->i_lock)) { pnfs_get_layout_hdr(lo); pnfs_layout_remove_lseg(lo, lseg); if (pnfs_cache_lseg_for_layoutreturn(lo, lseg)) @@ -529,47 +630,6 @@ pnfs_put_lseg(struct pnfs_layout_segment *lseg) } EXPORT_SYMBOL_GPL(pnfs_put_lseg); -static void pnfs_free_lseg_async_work(struct work_struct *work) -{ - struct pnfs_layout_segment *lseg; - struct pnfs_layout_hdr *lo; - - lseg = container_of(work, struct pnfs_layout_segment, pls_work); - lo = lseg->pls_layout; - - pnfs_free_lseg(lseg); - pnfs_put_layout_hdr(lo); -} - -static void pnfs_free_lseg_async(struct pnfs_layout_segment *lseg) -{ - INIT_WORK(&lseg->pls_work, pnfs_free_lseg_async_work); - schedule_work(&lseg->pls_work); -} - -void -pnfs_put_lseg_locked(struct pnfs_layout_segment *lseg) -{ - if (!lseg) - return; - - assert_spin_locked(&lseg->pls_layout->plh_inode->i_lock); - - dprintk("%s: lseg %p ref %d valid %d\n", __func__, lseg, - atomic_read(&lseg->pls_refcount), - test_bit(NFS_LSEG_VALID, &lseg->pls_flags)); - if (atomic_dec_and_test(&lseg->pls_refcount)) { - struct pnfs_layout_hdr *lo = lseg->pls_layout; - if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags)) - return; - pnfs_layout_remove_lseg(lo, lseg); - if (!pnfs_cache_lseg_for_layoutreturn(lo, lseg)) { - pnfs_get_layout_hdr(lo); - pnfs_free_lseg_async(lseg); - } - } -} - /* * is l2 fully contained in l1? * start1 end1 @@ -592,7 +652,7 @@ pnfs_lseg_range_contained(const struct pnfs_layout_range *l1, static bool pnfs_lseg_dec_and_remove_zero(struct pnfs_layout_segment *lseg, struct list_head *tmp_list) { - if (!atomic_dec_and_test(&lseg->pls_refcount)) + if (!refcount_dec_and_test(&lseg->pls_refcount)) return false; pnfs_layout_remove_lseg(lseg->pls_layout, lseg); list_add(&lseg->pls_list, tmp_list); @@ -611,22 +671,13 @@ static int mark_lseg_invalid(struct pnfs_layout_segment *lseg, * outstanding io is finished. */ dprintk("%s: lseg %p ref %d\n", __func__, lseg, - atomic_read(&lseg->pls_refcount)); + refcount_read(&lseg->pls_refcount)); if (pnfs_lseg_dec_and_remove_zero(lseg, tmp_list)) rv = 1; } return rv; } -/* - * Compare 2 layout stateid sequence ids, to see which is newer, - * taking into account wraparound issues. - */ -static bool pnfs_seqid_is_newer(u32 s1, u32 s2) -{ - return (s32)(s1 - s2) > 0; -} - static bool pnfs_should_free_range(const struct pnfs_layout_range *lseg_range, const struct pnfs_layout_range *recall_range) @@ -670,6 +721,7 @@ pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo, u32 seq) { struct pnfs_layout_segment *lseg, *next; + struct nfs_server *server = NFS_SERVER(lo->plh_inode); int remaining = 0; dprintk("%s:Begin lo %p\n", __func__, lo); @@ -678,17 +730,27 @@ pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo, return 0; list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list) if (pnfs_match_lseg_recall(lseg, recall_range, seq)) { - dprintk("%s: freeing lseg %p iomode %d seq %u" + dprintk("%s: freeing lseg %p iomode %d seq %u " "offset %llu length %llu\n", __func__, lseg, lseg->pls_range.iomode, lseg->pls_seq, lseg->pls_range.offset, lseg->pls_range.length); - if (!mark_lseg_invalid(lseg, tmp_list)) - remaining++; + if (mark_lseg_invalid(lseg, tmp_list)) + continue; + remaining++; + pnfs_lseg_cancel_io(server, lseg); } dprintk("%s:Return %i\n", __func__, remaining); return remaining; } +static void pnfs_reset_return_info(struct pnfs_layout_hdr *lo) +{ + struct pnfs_layout_segment *lseg; + + list_for_each_entry(lseg, &lo->plh_return_segs, pls_list) + pnfs_set_plh_return_info(lo, lseg->pls_range.iomode, 0); +} + static void pnfs_free_returned_lsegs(struct pnfs_layout_hdr *lo, struct list_head *free_me, @@ -718,8 +780,7 @@ pnfs_free_lseg_list(struct list_head *free_me) } } -void -pnfs_destroy_layout(struct nfs_inode *nfsi) +static struct pnfs_layout_hdr *__pnfs_destroy_layout(struct nfs_inode *nfsi) { struct pnfs_layout_hdr *lo; LIST_HEAD(tmp_list); @@ -737,9 +798,28 @@ pnfs_destroy_layout(struct nfs_inode *nfsi) pnfs_put_layout_hdr(lo); } else spin_unlock(&nfsi->vfs_inode.i_lock); + return lo; +} + +void pnfs_destroy_layout(struct nfs_inode *nfsi) +{ + __pnfs_destroy_layout(nfsi); } EXPORT_SYMBOL_GPL(pnfs_destroy_layout); +void pnfs_destroy_layout_final(struct nfs_inode *nfsi) +{ + struct pnfs_layout_hdr *lo = __pnfs_destroy_layout(nfsi); + struct inode *inode = &nfsi->vfs_inode; + + if (lo) { + spin_lock(&inode->i_lock); + wait_var_event_spinlock(lo, nfsi->layout != lo, + &inode->i_lock); + spin_unlock(&inode->i_lock); + } +} + static bool pnfs_layout_add_bulk_destroy_list(struct inode *inode, struct list_head *layout_list) @@ -763,22 +843,33 @@ static int pnfs_layout_bulk_destroy_byserver_locked(struct nfs_client *clp, struct nfs_server *server, struct list_head *layout_list) + __must_hold(&clp->cl_lock) + __must_hold(RCU) { struct pnfs_layout_hdr *lo, *next; struct inode *inode; list_for_each_entry_safe(lo, next, &server->layouts, plh_layouts) { - if (test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags)) - continue; - inode = igrab(lo->plh_inode); - if (inode == NULL) + if (test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags) || + test_bit(NFS_LAYOUT_INODE_FREEING, &lo->plh_flags) || + !list_empty(&lo->plh_bulk_destroy)) continue; - list_del_init(&lo->plh_layouts); - if (pnfs_layout_add_bulk_destroy_list(inode, layout_list)) - continue; - rcu_read_unlock(); - spin_unlock(&clp->cl_lock); - iput(inode); + /* If the sb is being destroyed, just bail */ + if (!nfs_sb_active(server->super)) + break; + inode = pnfs_grab_inode_layout_hdr(lo); + if (inode != NULL) { + if (pnfs_layout_add_bulk_destroy_list(inode, + layout_list)) + continue; + rcu_read_unlock(); + spin_unlock(&clp->cl_lock); + iput(inode); + } else { + rcu_read_unlock(); + spin_unlock(&clp->cl_lock); + } + nfs_sb_deactive(server->super); spin_lock(&clp->cl_lock); rcu_read_lock(); return -EAGAIN; @@ -788,7 +879,7 @@ pnfs_layout_bulk_destroy_byserver_locked(struct nfs_client *clp, static int pnfs_layout_free_bulk_destroy_list(struct list_head *layout_list, - bool is_bulk_recall) + enum pnfs_layout_destroy_mode mode) { struct pnfs_layout_hdr *lo; struct inode *inode; @@ -806,8 +897,11 @@ pnfs_layout_free_bulk_destroy_list(struct list_head *layout_list, spin_lock(&inode->i_lock); list_del_init(&lo->plh_bulk_destroy); - if (pnfs_mark_layout_stateid_invalid(lo, &lseg_list)) { - if (is_bulk_recall) + if (mode == PNFS_LAYOUT_FILE_BULK_RETURN) { + pnfs_mark_layout_stateid_return(lo, &lseg_list, + IOMODE_ANY, 0); + } else if (pnfs_mark_layout_stateid_invalid(lo, &lseg_list)) { + if (mode == PNFS_LAYOUT_BULK_RETURN) set_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags); ret = -EAGAIN; } @@ -816,15 +910,13 @@ pnfs_layout_free_bulk_destroy_list(struct list_head *layout_list, /* Free all lsegs that are attached to commit buckets */ nfs_commit_inode(inode, 0); pnfs_put_layout_hdr(lo); - iput(inode); + nfs_iput_and_deactive(inode); } return ret; } -int -pnfs_destroy_layouts_byfsid(struct nfs_client *clp, - struct nfs_fsid *fsid, - bool is_recall) +int pnfs_layout_destroy_byfsid(struct nfs_client *clp, struct nfs_fsid *fsid, + enum pnfs_layout_destroy_mode mode) { struct nfs_server *server; LIST_HEAD(layout_list); @@ -843,37 +935,44 @@ restart: rcu_read_unlock(); spin_unlock(&clp->cl_lock); - if (list_empty(&layout_list)) - return 0; - return pnfs_layout_free_bulk_destroy_list(&layout_list, is_recall); + return pnfs_layout_free_bulk_destroy_list(&layout_list, mode); } -int -pnfs_destroy_layouts_byclid(struct nfs_client *clp, - bool is_recall) +static void pnfs_layout_build_destroy_list_byclient(struct nfs_client *clp, + struct list_head *list) { struct nfs_server *server; - LIST_HEAD(layout_list); spin_lock(&clp->cl_lock); rcu_read_lock(); restart: list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { - if (pnfs_layout_bulk_destroy_byserver_locked(clp, - server, - &layout_list) != 0) + if (pnfs_layout_bulk_destroy_byserver_locked(clp, server, + list) != 0) goto restart; } rcu_read_unlock(); spin_unlock(&clp->cl_lock); +} - if (list_empty(&layout_list)) - return 0; - return pnfs_layout_free_bulk_destroy_list(&layout_list, is_recall); +static int pnfs_layout_do_destroy_byclid(struct nfs_client *clp, + struct list_head *list, + enum pnfs_layout_destroy_mode mode) +{ + pnfs_layout_build_destroy_list_byclient(clp, list); + return pnfs_layout_free_bulk_destroy_list(list, mode); +} + +int pnfs_layout_destroy_byclid(struct nfs_client *clp, + enum pnfs_layout_destroy_mode mode) +{ + LIST_HEAD(layout_list); + + return pnfs_layout_do_destroy_byclid(clp, &layout_list, mode); } /* - * Called by the state manger to remove all layouts established under an + * Called by the state manager to remove all layouts established under an * expired lease. */ void @@ -882,40 +981,112 @@ pnfs_destroy_all_layouts(struct nfs_client *clp) nfs4_deviceid_mark_client_invalid(clp); nfs4_deviceid_purge_client(clp); - pnfs_destroy_layouts_byclid(clp, false); + pnfs_layout_destroy_byclid(clp, PNFS_LAYOUT_INVALIDATE); +} + +static void pnfs_layout_build_recover_list_byclient(struct nfs_client *clp, + struct list_head *list) +{ + struct nfs_server *server; + + spin_lock(&clp->cl_lock); + rcu_read_lock(); +restart: + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { + if (!(server->caps & NFS_CAP_REBOOT_LAYOUTRETURN)) + continue; + if (pnfs_layout_bulk_destroy_byserver_locked(clp, server, + list) != 0) + goto restart; + } + rcu_read_unlock(); + spin_unlock(&clp->cl_lock); +} + +static int pnfs_layout_bulk_list_reboot(struct list_head *list) +{ + struct pnfs_layout_hdr *lo; + struct nfs_server *server; + int ret; + + list_for_each_entry(lo, list, plh_bulk_destroy) { + server = NFS_SERVER(lo->plh_inode); + ret = pnfs_layout_return_on_reboot(lo); + switch (ret) { + case 0: + continue; + case -NFS4ERR_BAD_STATEID: + server->caps &= ~NFS_CAP_REBOOT_LAYOUTRETURN; + break; + case -NFS4ERR_NO_GRACE: + break; + default: + goto err; + } + break; + } + return 0; +err: + return ret; +} + +int pnfs_layout_handle_reboot(struct nfs_client *clp) +{ + LIST_HEAD(list); + int ret = 0, ret2; + + pnfs_layout_build_recover_list_byclient(clp, &list); + if (!list_empty(&list)) + ret = pnfs_layout_bulk_list_reboot(&list); + ret2 = pnfs_layout_do_destroy_byclid(clp, &list, + PNFS_LAYOUT_INVALIDATE); + if (!ret) + ret = ret2; + return (ret == 0) ? 0 : -EAGAIN; +} + +static void +pnfs_set_layout_cred(struct pnfs_layout_hdr *lo, const struct cred *cred) +{ + const struct cred *old; + + if (cred && cred_fscmp(lo->plh_lc_cred, cred) != 0) { + old = xchg(&lo->plh_lc_cred, get_cred(cred)); + put_cred(old); + } } /* update lo->plh_stateid with new if is more recent */ void pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new, - bool update_barrier) + const struct cred *cred, bool update_barrier) { - u32 oldseq, newseq, new_barrier = 0; - - oldseq = be32_to_cpu(lo->plh_stateid.seqid); - newseq = be32_to_cpu(new->seqid); + u32 oldseq = be32_to_cpu(lo->plh_stateid.seqid); + u32 newseq = be32_to_cpu(new->seqid); if (!pnfs_layout_is_valid(lo)) { + pnfs_set_layout_cred(lo, cred); nfs4_stateid_copy(&lo->plh_stateid, new); lo->plh_barrier = newseq; pnfs_clear_layoutreturn_info(lo); clear_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags); return; } - if (pnfs_seqid_is_newer(newseq, oldseq)) { + + if (pnfs_seqid_is_newer(newseq, oldseq)) nfs4_stateid_copy(&lo->plh_stateid, new); - /* - * Because of wraparound, we want to keep the barrier - * "close" to the current seqids. - */ - new_barrier = newseq - atomic_read(&lo->plh_outstanding); - } - if (update_barrier) - new_barrier = be32_to_cpu(new->seqid); - else if (new_barrier == 0) + + if (update_barrier) { + pnfs_barrier_update(lo, newseq); return; - if (pnfs_seqid_is_newer(new_barrier, lo->plh_barrier)) - lo->plh_barrier = new_barrier; + } + /* + * Because of wraparound, we want to keep the barrier + * "close" to the current seqids. We really only want to + * get here from a layoutget call. + */ + if (atomic_read(&lo->plh_outstanding) == 1) + pnfs_barrier_update(lo, be32_to_cpu(lo->plh_stateid.seqid)); } static bool @@ -924,7 +1095,7 @@ pnfs_layout_stateid_blocked(const struct pnfs_layout_hdr *lo, { u32 seqid = be32_to_cpu(stateid->seqid); - return !pnfs_seqid_is_newer(seqid, lo->plh_barrier); + return lo->plh_barrier && pnfs_seqid_is_newer(lo->plh_barrier, seqid); } /* lget is set to 1 if called from inside send_layoutget call chain */ @@ -935,45 +1106,106 @@ pnfs_layoutgets_blocked(const struct pnfs_layout_hdr *lo) test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags); } -/* - * Get layout from server. - * for now, assume that whole file layouts are requested. - * arg->offset: 0 - * arg->length: all ones - */ -static struct pnfs_layout_segment * -send_layoutget(struct pnfs_layout_hdr *lo, +static struct nfs_server * +pnfs_find_server(struct inode *inode, struct nfs_open_context *ctx) +{ + struct nfs_server *server; + + if (inode) { + server = NFS_SERVER(inode); + } else { + struct dentry *parent_dir = dget_parent(ctx->dentry); + server = NFS_SERVER(parent_dir->d_inode); + dput(parent_dir); + } + return server; +} + +static void nfs4_free_pages(struct page **pages, size_t size) +{ + int i; + + if (!pages) + return; + + for (i = 0; i < size; i++) { + if (!pages[i]) + break; + __free_page(pages[i]); + } + kfree(pages); +} + +static struct page **nfs4_alloc_pages(size_t size, gfp_t gfp_flags) +{ + struct page **pages; + int i; + + pages = kmalloc_array(size, sizeof(struct page *), gfp_flags); + if (!pages) { + dprintk("%s: can't alloc array of %zu pages\n", __func__, size); + return NULL; + } + + for (i = 0; i < size; i++) { + pages[i] = alloc_page(gfp_flags); + if (!pages[i]) { + dprintk("%s: failed to allocate page\n", __func__); + nfs4_free_pages(pages, i); + return NULL; + } + } + + return pages; +} + +static struct nfs4_layoutget * +pnfs_alloc_init_layoutget_args(struct inode *ino, struct nfs_open_context *ctx, - nfs4_stateid *stateid, + const nfs4_stateid *stateid, const struct pnfs_layout_range *range, - long *timeout, gfp_t gfp_flags) + gfp_t gfp_flags) { - struct inode *ino = lo->plh_inode; - struct nfs_server *server = NFS_SERVER(ino); + struct nfs_server *server = pnfs_find_server(ino, ctx); + size_t max_reply_sz = server->pnfs_curr_ld->max_layoutget_response; + size_t max_pages = max_response_pages(server); struct nfs4_layoutget *lgp; - loff_t i_size; dprintk("--> %s\n", __func__); - /* - * Synchronously retrieve layout information from server and - * store in lseg. If we race with a concurrent seqid morphing - * op, then re-send the LAYOUTGET. - */ lgp = kzalloc(sizeof(*lgp), gfp_flags); if (lgp == NULL) - return ERR_PTR(-ENOMEM); + return NULL; - i_size = i_size_read(ino); + if (max_reply_sz) { + size_t npages = (max_reply_sz + PAGE_SIZE - 1) >> PAGE_SHIFT; + if (npages < max_pages) + max_pages = npages; + } + + lgp->args.layout.pages = nfs4_alloc_pages(max_pages, gfp_flags); + if (!lgp->args.layout.pages) { + kfree(lgp); + return NULL; + } + lgp->args.layout.pglen = max_pages * PAGE_SIZE; + lgp->res.layoutp = &lgp->args.layout; + + /* Don't confuse uninitialised result and success */ + lgp->res.status = -NFS4ERR_DELAY; lgp->args.minlength = PAGE_SIZE; if (lgp->args.minlength > range->length) lgp->args.minlength = range->length; - if (range->iomode == IOMODE_READ) { - if (range->offset >= i_size) - lgp->args.minlength = 0; - else if (i_size - range->offset < lgp->args.minlength) - lgp->args.minlength = i_size - range->offset; + if (ino) { + loff_t i_size = i_size_read(ino); + + if (range->iomode == IOMODE_READ) { + if (range->offset >= i_size) + lgp->args.minlength = 0; + else if (i_size - range->offset < lgp->args.minlength) + lgp->args.minlength = i_size - range->offset; + } } lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE; pnfs_copy_range(&lgp->args.range, range); @@ -982,9 +1214,18 @@ send_layoutget(struct pnfs_layout_hdr *lo, lgp->args.ctx = get_nfs_open_context(ctx); nfs4_stateid_copy(&lgp->args.stateid, stateid); lgp->gfp_flags = gfp_flags; - lgp->cred = lo->plh_lc_cred; + lgp->cred = ctx->cred; + return lgp; +} + +void pnfs_layoutget_free(struct nfs4_layoutget *lgp) +{ + size_t max_pages = lgp->args.layout.pglen / PAGE_SIZE; - return nfs4_proc_layoutget(lgp, timeout, gfp_flags); + nfs4_free_pages(lgp->args.layout.pages, max_pages); + pnfs_put_layout_hdr(lgp->lo); + put_nfs_open_context(lgp->args.ctx); + kfree(lgp); } static void pnfs_clear_layoutcommit(struct inode *inode, @@ -1002,6 +1243,33 @@ static void pnfs_clear_layoutcommit(struct inode *inode, } } +static void +pnfs_layoutreturn_retry_later_locked(struct pnfs_layout_hdr *lo, + const nfs4_stateid *arg_stateid, + const struct pnfs_layout_range *range, + struct list_head *freeme) +{ + if (pnfs_layout_is_valid(lo) && + nfs4_stateid_match_other(&lo->plh_stateid, arg_stateid)) + pnfs_reset_return_info(lo); + else + pnfs_mark_layout_stateid_invalid(lo, freeme); + pnfs_clear_layoutreturn_waitbit(lo); +} + +void pnfs_layoutreturn_retry_later(struct pnfs_layout_hdr *lo, + const nfs4_stateid *arg_stateid, + const struct pnfs_layout_range *range) +{ + struct inode *inode = lo->plh_inode; + LIST_HEAD(freeme); + + spin_lock(&inode->i_lock); + pnfs_layoutreturn_retry_later_locked(lo, arg_stateid, range, &freeme); + spin_unlock(&inode->i_lock); + pnfs_free_lseg_list(&freeme); +} + void pnfs_layoutreturn_free_lsegs(struct pnfs_layout_hdr *lo, const nfs4_stateid *arg_stateid, const struct pnfs_layout_range *range, @@ -1011,15 +1279,15 @@ void pnfs_layoutreturn_free_lsegs(struct pnfs_layout_hdr *lo, LIST_HEAD(freeme); spin_lock(&inode->i_lock); - if (!pnfs_layout_is_valid(lo) || !arg_stateid || - !nfs4_stateid_match_other(&lo->plh_stateid, arg_stateid)) + if (!nfs4_stateid_match_other(&lo->plh_stateid, arg_stateid)) goto out_unlock; - if (stateid) { + if (stateid && pnfs_layout_is_valid(lo)) { u32 seq = be32_to_cpu(arg_stateid->seqid); pnfs_mark_matching_lsegs_invalid(lo, &freeme, range, seq); pnfs_free_returned_lsegs(lo, &freeme, range, seq); - pnfs_set_layout_stateid(lo, stateid, true); + pnfs_set_layout_stateid(lo, stateid, NULL, true); + pnfs_reset_return_info(lo); } else pnfs_mark_layout_stateid_invalid(lo, &freeme); out_unlock: @@ -1032,30 +1300,27 @@ out_unlock: static bool pnfs_prepare_layoutreturn(struct pnfs_layout_hdr *lo, nfs4_stateid *stateid, + const struct cred **cred, enum pnfs_iomode *iomode) { /* Serialise LAYOUTGET/LAYOUTRETURN */ - if (atomic_read(&lo->plh_outstanding) != 0) + if (atomic_read(&lo->plh_outstanding) != 0 && lo->plh_return_seq == 0) return false; if (test_and_set_bit(NFS_LAYOUT_RETURN_LOCK, &lo->plh_flags)) return false; set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags); pnfs_get_layout_hdr(lo); + nfs4_stateid_copy(stateid, &lo->plh_stateid); + *cred = get_cred(lo->plh_lc_cred); if (test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags)) { - if (stateid != NULL) { - nfs4_stateid_copy(stateid, &lo->plh_stateid); - if (lo->plh_return_seq != 0) - stateid->seqid = cpu_to_be32(lo->plh_return_seq); - } + if (lo->plh_return_seq != 0) + stateid->seqid = cpu_to_be32(lo->plh_return_seq); if (iomode != NULL) *iomode = lo->plh_return_iomode; pnfs_clear_layoutreturn_info(lo); - return true; - } - if (stateid != NULL) - nfs4_stateid_copy(stateid, &lo->plh_stateid); - if (iomode != NULL) + } else if (iomode != NULL) *iomode = IOMODE_ANY; + pnfs_barrier_update(lo, be32_to_cpu(stateid->seqid)); return true; } @@ -1077,20 +1342,26 @@ pnfs_init_layoutreturn_args(struct nfs4_layoutreturn_args *args, } static int -pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo, const nfs4_stateid *stateid, - enum pnfs_iomode iomode, bool sync) +pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo, + const nfs4_stateid *stateid, + const struct cred **pcred, + enum pnfs_iomode iomode, + unsigned int flags) { struct inode *ino = lo->plh_inode; struct pnfs_layoutdriver_type *ld = NFS_SERVER(ino)->pnfs_curr_ld; struct nfs4_layoutreturn *lrp; + const struct cred *cred = *pcred; int status = 0; - lrp = kzalloc(sizeof(*lrp), GFP_NOFS); + *pcred = NULL; + lrp = kzalloc(sizeof(*lrp), nfs_io_gfp_mask()); if (unlikely(lrp == NULL)) { status = -ENOMEM; spin_lock(&ino->i_lock); pnfs_clear_layoutreturn_waitbit(lo); spin_unlock(&ino->i_lock); + put_cred(cred); pnfs_put_layout_hdr(lo); goto out; } @@ -1098,11 +1369,11 @@ pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo, const nfs4_stateid *stateid, pnfs_init_layoutreturn_args(&lrp->args, lo, stateid, iomode); lrp->args.ld_private = &lrp->ld_private; lrp->clp = NFS_SERVER(ino)->nfs_client; - lrp->cred = lo->plh_lc_cred; + lrp->cred = cred; if (ld->prepare_layoutreturn) ld->prepare_layoutreturn(&lrp->args); - status = nfs4_proc_layoutreturn(lrp, sync); + status = nfs4_proc_layoutreturn(lrp, flags); out: dprintk("<-- %s status: %d\n", __func__, status); return status; @@ -1112,18 +1383,11 @@ out: static bool pnfs_layout_need_return(struct pnfs_layout_hdr *lo) { - struct pnfs_layout_segment *s; - if (!test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags)) return false; - - /* Defer layoutreturn until all lsegs are done */ - list_for_each_entry(s, &lo->plh_segs, pls_list) { - if (test_bit(NFS_LSEG_LAYOUTRETURN, &s->pls_flags)) - return false; - } - - return true; + return pnfs_mark_layout_stateid_return(lo, &lo->plh_return_segs, + lo->plh_return_iomode, + lo->plh_return_seq) != EBUSY; } static void pnfs_layoutreturn_before_put_layout_hdr(struct pnfs_layout_hdr *lo) @@ -1134,15 +1398,17 @@ static void pnfs_layoutreturn_before_put_layout_hdr(struct pnfs_layout_hdr *lo) return; spin_lock(&inode->i_lock); if (pnfs_layout_need_return(lo)) { + const struct cred *cred; nfs4_stateid stateid; enum pnfs_iomode iomode; bool send; - send = pnfs_prepare_layoutreturn(lo, &stateid, &iomode); + send = pnfs_prepare_layoutreturn(lo, &stateid, &cred, &iomode); spin_unlock(&inode->i_lock); if (send) { /* Send an async layoutreturn so we dont deadlock */ - pnfs_send_layoutreturn(lo, &stateid, iomode, false); + pnfs_send_layoutreturn(lo, &stateid, &cred, iomode, + PNFS_FL_LAYOUTRETURN_ASYNC); } } else spin_unlock(&inode->i_lock); @@ -1161,10 +1427,16 @@ _pnfs_return_layout(struct inode *ino) { struct pnfs_layout_hdr *lo = NULL; struct nfs_inode *nfsi = NFS_I(ino); + struct pnfs_layout_range range = { + .iomode = IOMODE_ANY, + .offset = 0, + .length = NFS4_MAX_UINT64, + }; LIST_HEAD(tmp_list); + const struct cred *cred; nfs4_stateid stateid; int status = 0; - bool send; + bool send, valid_layout; dprintk("NFS: %s for inode %lu\n", __func__, ino->i_ino); @@ -1185,29 +1457,28 @@ _pnfs_return_layout(struct inode *ino) goto out_put_layout_hdr; spin_lock(&ino->i_lock); } + valid_layout = pnfs_layout_is_valid(lo); pnfs_clear_layoutcommit(ino, &tmp_list); - pnfs_mark_matching_lsegs_invalid(lo, &tmp_list, NULL, 0); + pnfs_mark_matching_lsegs_return(lo, &tmp_list, &range, 0); - if (NFS_SERVER(ino)->pnfs_curr_ld->return_range) { - struct pnfs_layout_range range = { - .iomode = IOMODE_ANY, - .offset = 0, - .length = NFS4_MAX_UINT64, - }; + if (NFS_SERVER(ino)->pnfs_curr_ld->return_range) NFS_SERVER(ino)->pnfs_curr_ld->return_range(lo, &range); - } /* Don't send a LAYOUTRETURN if list was initially empty */ - if (!test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags)) { + if (!test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags) || + !valid_layout) { spin_unlock(&ino->i_lock); dprintk("NFS: %s no layout segments to return\n", __func__); - goto out_put_layout_hdr; + goto out_wait_layoutreturn; } - send = pnfs_prepare_layoutreturn(lo, &stateid, NULL); + send = pnfs_prepare_layoutreturn(lo, &stateid, &cred, NULL); spin_unlock(&ino->i_lock); if (send) - status = pnfs_send_layoutreturn(lo, &stateid, IOMODE_ANY, true); + status = pnfs_send_layoutreturn(lo, &stateid, &cred, IOMODE_ANY, + 0); +out_wait_layoutreturn: + wait_on_bit(&lo->plh_flags, NFS_LAYOUT_RETURN, TASK_UNINTERRUPTIBLE); out_put_layout_hdr: pnfs_free_lseg_list(&tmp_list); pnfs_put_layout_hdr(lo); @@ -1243,16 +1514,35 @@ pnfs_commit_and_return_layout(struct inode *inode) return ret; } +static int pnfs_layout_return_on_reboot(struct pnfs_layout_hdr *lo) +{ + struct inode *inode = lo->plh_inode; + const struct cred *cred; + + spin_lock(&inode->i_lock); + if (!pnfs_layout_is_valid(lo)) { + spin_unlock(&inode->i_lock); + return 0; + } + cred = get_cred(lo->plh_lc_cred); + pnfs_get_layout_hdr(lo); + spin_unlock(&inode->i_lock); + + return pnfs_send_layoutreturn(lo, &zero_stateid, &cred, IOMODE_ANY, + PNFS_FL_LAYOUTRETURN_PRIVILEGED); +} + bool pnfs_roc(struct inode *ino, struct nfs4_layoutreturn_args *args, struct nfs4_layoutreturn_res *res, - const struct rpc_cred *cred) + const struct cred *cred) { struct nfs_inode *nfsi = NFS_I(ino); struct nfs_open_context *ctx; struct nfs4_state *state; struct pnfs_layout_hdr *lo; struct pnfs_layout_segment *lseg, *next; + const struct cred *lc_cred; nfs4_stateid stateid; enum pnfs_iomode iomode = 0; bool layoutreturn = false, roc = false; @@ -1261,14 +1551,18 @@ bool pnfs_roc(struct inode *ino, if (!nfs_have_layout(ino)) return false; retry: + rcu_read_lock(); spin_lock(&ino->i_lock); lo = nfsi->layout; if (!lo || !pnfs_layout_is_valid(lo) || - test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) + test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) { + lo = NULL; goto out_noroc; + } + pnfs_get_layout_hdr(lo); if (test_bit(NFS_LAYOUT_RETURN_LOCK, &lo->plh_flags)) { - pnfs_get_layout_hdr(lo); spin_unlock(&ino->i_lock); + rcu_read_unlock(); wait_on_bit(&lo->plh_flags, NFS_LAYOUT_RETURN, TASK_UNINTERRUPTIBLE); pnfs_put_layout_hdr(lo); @@ -1282,7 +1576,7 @@ retry: skip_read = true; } - list_for_each_entry(ctx, &nfsi->open_files, list) { + list_for_each_entry_rcu(ctx, &nfsi->open_files, list) { state = ctx->state; if (state == NULL) continue; @@ -1318,50 +1612,122 @@ retry: * 2. we don't send layoutreturn */ /* lo ref dropped in pnfs_roc_release() */ - layoutreturn = pnfs_prepare_layoutreturn(lo, &stateid, &iomode); + layoutreturn = pnfs_prepare_layoutreturn(lo, &stateid, &lc_cred, &iomode); /* If the creds don't match, we can't compound the layoutreturn */ - if (!layoutreturn || cred != lo->plh_lc_cred) + if (!layoutreturn || cred_fscmp(cred, lc_cred) != 0) goto out_noroc; roc = layoutreturn; pnfs_init_layoutreturn_args(args, lo, &stateid, iomode); res->lrs_present = 0; layoutreturn = false; + put_cred(lc_cred); out_noroc: spin_unlock(&ino->i_lock); + rcu_read_unlock(); pnfs_layoutcommit_inode(ino, true); if (roc) { struct pnfs_layoutdriver_type *ld = NFS_SERVER(ino)->pnfs_curr_ld; if (ld->prepare_layoutreturn) ld->prepare_layoutreturn(args); + pnfs_put_layout_hdr(lo); return true; } if (layoutreturn) - pnfs_send_layoutreturn(lo, &stateid, iomode, true); + pnfs_send_layoutreturn(lo, &stateid, &lc_cred, iomode, 0); + pnfs_put_layout_hdr(lo); return false; } +int pnfs_roc_done(struct rpc_task *task, struct nfs4_layoutreturn_args **argpp, + struct nfs4_layoutreturn_res **respp, int *ret) +{ + struct nfs4_layoutreturn_args *arg = *argpp; + int retval = -EAGAIN; + + if (!arg) + return 0; + /* Handle Layoutreturn errors */ + switch (*ret) { + case 0: + retval = 0; + break; + case -NFS4ERR_NOMATCHING_LAYOUT: + /* Was there an RPC level error? If not, retry */ + if (task->tk_rpc_status == 0) + break; + /* + * Is there a fatal network level error? + * If so release the layout, but flag the error. + */ + if ((task->tk_rpc_status == -ENETDOWN || + task->tk_rpc_status == -ENETUNREACH) && + task->tk_flags & RPC_TASK_NETUNREACH_FATAL) { + *ret = 0; + (*respp)->lrs_present = 0; + retval = -EIO; + break; + } + /* If the call was not sent, let caller handle it */ + if (!RPC_WAS_SENT(task)) + return 0; + /* + * Otherwise, assume the call succeeded and + * that we need to release the layout + */ + *ret = 0; + (*respp)->lrs_present = 0; + retval = 0; + break; + case -NFS4ERR_DELAY: + /* Let the caller handle the retry */ + *ret = -NFS4ERR_NOMATCHING_LAYOUT; + return 0; + case -NFS4ERR_OLD_STATEID: + if (!nfs4_layout_refresh_old_stateid(&arg->stateid, + &arg->range, arg->inode)) + break; + *ret = -NFS4ERR_NOMATCHING_LAYOUT; + return -EAGAIN; + } + *argpp = NULL; + *respp = NULL; + return retval; +} + void pnfs_roc_release(struct nfs4_layoutreturn_args *args, - struct nfs4_layoutreturn_res *res, - int ret) + struct nfs4_layoutreturn_res *res, int ret) { struct pnfs_layout_hdr *lo = args->layout; - const nfs4_stateid *arg_stateid = NULL; + struct inode *inode = args->inode; const nfs4_stateid *res_stateid = NULL; struct nfs4_xdr_opaque_data *ld_private = args->ld_private; + LIST_HEAD(freeme); - if (ret == 0) { - arg_stateid = &args->stateid; + switch (ret) { + case -NFS4ERR_BADSESSION: + case -NFS4ERR_DEADSESSION: + case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: + case -NFS4ERR_NOMATCHING_LAYOUT: + spin_lock(&inode->i_lock); + pnfs_layoutreturn_retry_later_locked(lo, &args->stateid, + &args->range, &freeme); + spin_unlock(&inode->i_lock); + pnfs_free_lseg_list(&freeme); + break; + case 0: if (res->lrs_present) res_stateid = &res->stateid; + fallthrough; + default: + pnfs_layoutreturn_free_lsegs(lo, &args->stateid, &args->range, + res_stateid); } - pnfs_layoutreturn_free_lsegs(lo, arg_stateid, &args->range, - res_stateid); + trace_nfs4_layoutreturn_on_close(args->inode, &args->stateid, ret); if (ld_private && ld_private->ops && ld_private->ops->free) ld_private->ops->free(ld_private); pnfs_put_layout_hdr(lo); - trace_nfs4_layoutreturn_on_close(args->inode, 0); } bool pnfs_wait_on_layoutreturn(struct inode *ino, struct rpc_task *task) @@ -1492,13 +1858,13 @@ alloc_init_layout_hdr(struct inode *ino, lo = pnfs_alloc_layout_hdr(ino, gfp_flags); if (!lo) return NULL; - atomic_set(&lo->plh_refcount, 1); + refcount_set(&lo->plh_refcount, 1); INIT_LIST_HEAD(&lo->plh_layouts); INIT_LIST_HEAD(&lo->plh_segs); INIT_LIST_HEAD(&lo->plh_return_segs); INIT_LIST_HEAD(&lo->plh_bulk_destroy); lo->plh_inode = ino; - lo->plh_lc_cred = get_rpccred(ctx->cred); + lo->plh_lc_cred = get_cred(ctx->cred); lo->plh_flags |= 1 << NFS_LAYOUT_INVALID_STID; return lo; } @@ -1554,9 +1920,9 @@ pnfs_lseg_range_match(const struct pnfs_layout_range *ls_range, if ((range->iomode == IOMODE_RW && ls_range->iomode != IOMODE_RW) || (range->iomode != ls_range->iomode && - strict_iomode == true) || + strict_iomode) || !pnfs_lseg_range_intersecting(ls_range, range)) - return 0; + return false; /* range1 covers only the first byte in the range */ range1 = *range; @@ -1578,7 +1944,6 @@ pnfs_find_lseg(struct pnfs_layout_hdr *lo, list_for_each_entry(lseg, &lo->plh_segs, pls_list) { if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags) && - !test_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags) && pnfs_lseg_range_match(&lseg->pls_range, range, strict_iomode)) { ret = pnfs_get_lseg(lseg); @@ -1587,7 +1952,7 @@ pnfs_find_lseg(struct pnfs_layout_hdr *lo, } dprintk("%s:Return lseg %p ref %d\n", - __func__, ret, ret ? atomic_read(&ret->pls_refcount) : 0); + __func__, ret, ret ? refcount_read(&ret->pls_refcount) : 0); return ret; } @@ -1666,16 +2031,35 @@ static bool pnfs_within_mdsthreshold(struct nfs_open_context *ctx, return ret; } -static bool pnfs_prepare_to_retry_layoutget(struct pnfs_layout_hdr *lo) +static int pnfs_prepare_to_retry_layoutget(struct pnfs_layout_hdr *lo) { /* * send layoutcommit as it can hold up layoutreturn due to lseg * reference */ pnfs_layoutcommit_inode(lo->plh_inode, false); - return !wait_on_bit_action(&lo->plh_flags, NFS_LAYOUT_RETURN, + return wait_on_bit_action(&lo->plh_flags, NFS_LAYOUT_RETURN, nfs_wait_bit_killable, - TASK_UNINTERRUPTIBLE); + TASK_KILLABLE|TASK_FREEZABLE_UNSAFE); +} + +static void nfs_layoutget_begin(struct pnfs_layout_hdr *lo) +{ + atomic_inc(&lo->plh_outstanding); +} + +static void nfs_layoutget_end(struct pnfs_layout_hdr *lo) +{ + if (atomic_dec_and_test(&lo->plh_outstanding) && + test_and_clear_bit(NFS_LAYOUT_DRAIN, &lo->plh_flags)) { + smp_mb__after_atomic(); + wake_up_bit(&lo->plh_flags, NFS_LAYOUT_DRAIN); + } +} + +static bool pnfs_is_first_layoutget(struct pnfs_layout_hdr *lo) +{ + return test_bit(NFS_LAYOUT_FIRST_LAYOUTGET, &lo->plh_flags); } static void pnfs_clear_first_layoutget(struct pnfs_layout_hdr *lo) @@ -1687,6 +2071,21 @@ static void pnfs_clear_first_layoutget(struct pnfs_layout_hdr *lo) wake_up_bit(bitlock, NFS_LAYOUT_FIRST_LAYOUTGET); } +static void _add_to_server_list(struct pnfs_layout_hdr *lo, + struct nfs_server *server) +{ + if (!test_and_set_bit(NFS_LAYOUT_HASHED, &lo->plh_flags)) { + struct nfs_client *clp = server->nfs_client; + + /* The lo must be on the clp list if there is any + * chance of a CB_LAYOUTRECALL(FILE) coming in. + */ + spin_lock(&clp->cl_lock); + list_add_tail_rcu(&lo->plh_layouts, &server->layouts); + spin_unlock(&clp->cl_lock); + } +} + /* * Layout segment is retreived from the server if not cached. * The appropriate layout segment is referenced and returned to the caller. @@ -1705,13 +2104,16 @@ pnfs_update_layout(struct inode *ino, .offset = pos, .length = count, }; - unsigned pg_offset, seq; + unsigned pg_offset; struct nfs_server *server = NFS_SERVER(ino); struct nfs_client *clp = server->nfs_client; struct pnfs_layout_hdr *lo = NULL; struct pnfs_layout_segment *lseg = NULL; + struct nfs4_layoutget *lgp; nfs4_stateid stateid; - long timeout = 0; + struct nfs4_exception exception = { + .inode = ino, + }; unsigned long giveup = jiffies + (clp->cl_lease_time << 1); bool first; @@ -1721,12 +2123,6 @@ pnfs_update_layout(struct inode *ino, goto out; } - if (iomode == IOMODE_READ && i_size_read(ino) == 0) { - trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg, - PNFS_UPDATE_LAYOUT_RD_ZEROLEN); - goto out; - } - if (pnfs_within_mdsthreshold(ctx, ino, iomode)) { trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg, PNFS_UPDATE_LAYOUT_MDSTHRESH); @@ -1734,12 +2130,23 @@ pnfs_update_layout(struct inode *ino, } lookup_again: - nfs4_client_recover_expired_lease(clp); + if (!nfs4_valid_open_stateid(ctx->state)) { + trace_pnfs_update_layout(ino, pos, count, + iomode, lo, lseg, + PNFS_UPDATE_LAYOUT_INVALID_OPEN); + lseg = ERR_PTR(-EIO); + goto out; + } + + lseg = ERR_PTR(nfs4_client_recover_expired_lease(clp)); + if (IS_ERR(lseg)) + goto out; first = false; spin_lock(&ino->i_lock); lo = pnfs_find_alloc_layout(ino, ctx, gfp_flags); if (lo == NULL) { spin_unlock(&ino->i_lock); + lseg = ERR_PTR(-ENOMEM); trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg, PNFS_UPDATE_LAYOUT_NOMEM); goto out; @@ -1760,16 +2167,46 @@ lookup_again: goto out_unlock; } - lseg = pnfs_find_lseg(lo, &arg, strict_iomode); - if (lseg) { + /* + * If the layout segment list is empty, but there are outstanding + * layoutget calls, then they might be subject to a layoutrecall. + */ + if (test_bit(NFS_LAYOUT_DRAIN, &lo->plh_flags) && + atomic_read(&lo->plh_outstanding) != 0) { + spin_unlock(&ino->i_lock); + lseg = ERR_PTR(wait_on_bit(&lo->plh_flags, NFS_LAYOUT_DRAIN, + TASK_KILLABLE)); + if (IS_ERR(lseg)) + goto out_put_layout_hdr; + pnfs_put_layout_hdr(lo); + goto lookup_again; + } + + /* + * Because we free lsegs when sending LAYOUTRETURN, we need to wait + * for LAYOUTRETURN. + */ + if (test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags)) { + spin_unlock(&ino->i_lock); + dprintk("%s wait for layoutreturn\n", __func__); + lseg = ERR_PTR(pnfs_prepare_to_retry_layoutget(lo)); + if (!IS_ERR(lseg)) { + pnfs_put_layout_hdr(lo); + dprintk("%s retrying\n", __func__); + trace_pnfs_update_layout(ino, pos, count, iomode, lo, + lseg, + PNFS_UPDATE_LAYOUT_RETRY); + goto lookup_again; + } trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg, - PNFS_UPDATE_LAYOUT_FOUND_CACHED); - goto out_unlock; + PNFS_UPDATE_LAYOUT_RETURN); + goto out_put_layout_hdr; } - if (!nfs4_valid_open_stateid(ctx->state)) { + lseg = pnfs_find_lseg(lo, &arg, strict_iomode); + if (lseg) { trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg, - PNFS_UPDATE_LAYOUT_INVALID_OPEN); + PNFS_UPDATE_LAYOUT_FOUND_CACHED); goto out_unlock; } @@ -1779,6 +2216,7 @@ lookup_again: * stateid. */ if (test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags)) { + int status; /* * The first layoutget for the file. Need to serialize per @@ -1787,41 +2225,34 @@ lookup_again: if (test_and_set_bit(NFS_LAYOUT_FIRST_LAYOUTGET, &lo->plh_flags)) { spin_unlock(&ino->i_lock); - wait_on_bit(&lo->plh_flags, NFS_LAYOUT_FIRST_LAYOUTGET, - TASK_UNINTERRUPTIBLE); + lseg = ERR_PTR(wait_on_bit(&lo->plh_flags, + NFS_LAYOUT_FIRST_LAYOUTGET, + TASK_KILLABLE)); + if (IS_ERR(lseg)) + goto out_put_layout_hdr; pnfs_put_layout_hdr(lo); dprintk("%s retrying\n", __func__); goto lookup_again; } - first = true; - do { - seq = read_seqbegin(&ctx->state->seqlock); - nfs4_stateid_copy(&stateid, &ctx->state->stateid); - } while (read_seqretry(&ctx->state->seqlock, seq)); - } else { - nfs4_stateid_copy(&stateid, &lo->plh_stateid); - } - - /* - * Because we free lsegs before sending LAYOUTRETURN, we need to wait - * for LAYOUTRETURN even if first is true. - */ - if (test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags)) { spin_unlock(&ino->i_lock); - dprintk("%s wait for layoutreturn\n", __func__); - if (pnfs_prepare_to_retry_layoutget(lo)) { - if (first) - pnfs_clear_first_layoutget(lo); + first = true; + status = nfs4_select_rw_stateid(ctx->state, + iomode == IOMODE_RW ? FMODE_WRITE : FMODE_READ, + NULL, &stateid, NULL); + if (status != 0) { + lseg = ERR_PTR(status); + trace_pnfs_update_layout(ino, pos, count, + iomode, lo, lseg, + PNFS_UPDATE_LAYOUT_INVALID_OPEN); + nfs4_schedule_stateid_recovery(server, ctx->state); + pnfs_clear_first_layoutget(lo); pnfs_put_layout_hdr(lo); - dprintk("%s retrying\n", __func__); - trace_pnfs_update_layout(ino, pos, count, iomode, lo, - lseg, PNFS_UPDATE_LAYOUT_RETRY); goto lookup_again; } - trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg, - PNFS_UPDATE_LAYOUT_RETURN); - goto out_put_layout_hdr; + spin_lock(&ino->i_lock); + } else { + nfs4_stateid_copy(&stateid, &lo->plh_stateid); } if (pnfs_layoutgets_blocked(lo)) { @@ -1829,18 +2260,10 @@ lookup_again: PNFS_UPDATE_LAYOUT_BLOCKED); goto out_unlock; } - atomic_inc(&lo->plh_outstanding); + nfs_layoutget_begin(lo); spin_unlock(&ino->i_lock); - if (list_empty(&lo->plh_layouts)) { - /* The lo must be on the clp list if there is any - * chance of a CB_LAYOUTRECALL(FILE) coming in. - */ - spin_lock(&clp->cl_lock); - if (list_empty(&lo->plh_layouts)) - list_add_tail(&lo->plh_layouts, &server->layouts); - spin_unlock(&clp->cl_lock); - } + _add_to_server_list(lo, server); pg_offset = arg.offset & ~PAGE_MASK; if (pg_offset) { @@ -1850,10 +2273,22 @@ lookup_again: if (arg.length != NFS4_MAX_UINT64) arg.length = PAGE_ALIGN(arg.length); - lseg = send_layoutget(lo, ctx, &stateid, &arg, &timeout, gfp_flags); + lgp = pnfs_alloc_init_layoutget_args(ino, ctx, &stateid, &arg, gfp_flags); + if (!lgp) { + lseg = ERR_PTR(-ENOMEM); + trace_pnfs_update_layout(ino, pos, count, iomode, lo, NULL, + PNFS_UPDATE_LAYOUT_NOMEM); + nfs_layoutget_end(lo); + goto out_put_layout_hdr; + } + + lgp->lo = lo; + pnfs_get_layout_hdr(lo); + + lseg = nfs4_proc_layoutget(lgp, &exception); trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg, PNFS_UPDATE_LAYOUT_SEND_LAYOUTGET); - atomic_dec(&lo->plh_outstanding); + nfs_layoutget_end(lo); if (IS_ERR(lseg)) { switch(PTR_ERR(lseg)) { case -EBUSY: @@ -1861,17 +2296,14 @@ lookup_again: lseg = NULL; break; case -ERECALLCONFLICT: - /* Huh? We hold no layouts, how is there a recall? */ - if (first) { - lseg = NULL; - break; - } - /* Destroy the existing layout and start over */ - if (time_after(jiffies, giveup)) - pnfs_destroy_layout(NFS_I(ino)); - /* Fallthrough */ case -EAGAIN: break; + case -ENODATA: + /* The server returned NFS4ERR_LAYOUTUNAVAILABLE */ + pnfs_layout_set_fail_bit( + lo, pnfs_iomode_to_fail_bit(iomode)); + lseg = NULL; + goto out_put_layout_hdr; default: if (!nfs_error_is_fatal(PTR_ERR(lseg))) { pnfs_layout_clear_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode)); @@ -1880,6 +2312,8 @@ lookup_again: goto out_put_layout_hdr; } if (lseg) { + if (!exception.retry) + goto out_put_layout_hdr; if (first) pnfs_clear_first_layoutget(lo); trace_pnfs_update_layout(ino, pos, count, @@ -1894,6 +2328,8 @@ lookup_again: out_put_layout_hdr: if (first) pnfs_clear_first_layoutget(lo); + trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg, + PNFS_UPDATE_LAYOUT_EXIT); pnfs_put_layout_hdr(lo); out: dprintk("%s: inode %s/%llu pNFS layout segment %s for " @@ -1931,10 +2367,173 @@ pnfs_sanity_check_layout_range(struct pnfs_layout_range *range) return true; } +static struct pnfs_layout_hdr * +_pnfs_grab_empty_layout(struct inode *ino, struct nfs_open_context *ctx) +{ + struct pnfs_layout_hdr *lo; + + spin_lock(&ino->i_lock); + lo = pnfs_find_alloc_layout(ino, ctx, nfs_io_gfp_mask()); + if (!lo) + goto out_unlock; + if (!test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags)) + goto out_unlock; + if (test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags)) + goto out_unlock; + if (pnfs_layoutgets_blocked(lo)) + goto out_unlock; + if (test_and_set_bit(NFS_LAYOUT_FIRST_LAYOUTGET, &lo->plh_flags)) + goto out_unlock; + nfs_layoutget_begin(lo); + spin_unlock(&ino->i_lock); + _add_to_server_list(lo, NFS_SERVER(ino)); + return lo; + +out_unlock: + spin_unlock(&ino->i_lock); + pnfs_put_layout_hdr(lo); + return NULL; +} + +static void _lgopen_prepare_attached(struct nfs4_opendata *data, + struct nfs_open_context *ctx) +{ + struct inode *ino = data->dentry->d_inode; + struct pnfs_layout_range rng = { + .iomode = (data->o_arg.fmode & FMODE_WRITE) ? + IOMODE_RW: IOMODE_READ, + .offset = 0, + .length = NFS4_MAX_UINT64, + }; + struct nfs4_layoutget *lgp; + struct pnfs_layout_hdr *lo; + + /* Heuristic: don't send layoutget if we have cached data */ + if (rng.iomode == IOMODE_READ && + (i_size_read(ino) == 0 || ino->i_mapping->nrpages != 0)) + return; + + lo = _pnfs_grab_empty_layout(ino, ctx); + if (!lo) + return; + lgp = pnfs_alloc_init_layoutget_args(ino, ctx, ¤t_stateid, &rng, + nfs_io_gfp_mask()); + if (!lgp) { + pnfs_clear_first_layoutget(lo); + nfs_layoutget_end(lo); + pnfs_put_layout_hdr(lo); + return; + } + lgp->lo = lo; + data->lgp = lgp; + data->o_arg.lg_args = &lgp->args; + data->o_res.lg_res = &lgp->res; +} + +static void _lgopen_prepare_floating(struct nfs4_opendata *data, + struct nfs_open_context *ctx) +{ + struct inode *ino = data->dentry->d_inode; + struct pnfs_layout_range rng = { + .iomode = (data->o_arg.fmode & FMODE_WRITE) ? + IOMODE_RW: IOMODE_READ, + .offset = 0, + .length = NFS4_MAX_UINT64, + }; + struct nfs4_layoutget *lgp; + + lgp = pnfs_alloc_init_layoutget_args(ino, ctx, ¤t_stateid, &rng, + nfs_io_gfp_mask()); + if (!lgp) + return; + data->lgp = lgp; + data->o_arg.lg_args = &lgp->args; + data->o_res.lg_res = &lgp->res; +} + +void pnfs_lgopen_prepare(struct nfs4_opendata *data, + struct nfs_open_context *ctx) +{ + struct nfs_server *server = NFS_SERVER(data->dir->d_inode); + + if (!(pnfs_enabled_sb(server) && + server->pnfs_curr_ld->flags & PNFS_LAYOUTGET_ON_OPEN)) + return; + /* Could check on max_ops, but currently hardcoded high enough */ + if (!nfs_server_capable(data->dir->d_inode, NFS_CAP_LGOPEN)) + return; + if (data->lgp) + return; + if (data->state) + _lgopen_prepare_attached(data, ctx); + else + _lgopen_prepare_floating(data, ctx); +} + +void pnfs_parse_lgopen(struct inode *ino, struct nfs4_layoutget *lgp, + struct nfs_open_context *ctx) +{ + struct pnfs_layout_hdr *lo; + struct pnfs_layout_segment *lseg; + struct nfs_server *srv = NFS_SERVER(ino); + u32 iomode; + + if (!lgp) + return; + dprintk("%s: entered with status %i\n", __func__, lgp->res.status); + if (lgp->res.status) { + switch (lgp->res.status) { + default: + break; + /* + * Halt lgopen attempts if the server doesn't recognise + * the "current stateid" value, the layout type, or the + * layoutget operation as being valid. + * Also if it complains about too many ops in the compound + * or of the request/reply being too big. + */ + case -NFS4ERR_BAD_STATEID: + case -NFS4ERR_NOTSUPP: + case -NFS4ERR_REP_TOO_BIG: + case -NFS4ERR_REP_TOO_BIG_TO_CACHE: + case -NFS4ERR_REQ_TOO_BIG: + case -NFS4ERR_TOO_MANY_OPS: + case -NFS4ERR_UNKNOWN_LAYOUTTYPE: + srv->caps &= ~NFS_CAP_LGOPEN; + } + return; + } + if (!lgp->lo) { + lo = _pnfs_grab_empty_layout(ino, ctx); + if (!lo) + return; + lgp->lo = lo; + } else + lo = lgp->lo; + + lseg = pnfs_layout_process(lgp); + if (!IS_ERR(lseg)) { + iomode = lgp->args.range.iomode; + pnfs_layout_clear_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode)); + pnfs_put_lseg(lseg); + } +} + +void nfs4_lgopen_release(struct nfs4_layoutget *lgp) +{ + if (lgp != NULL) { + if (lgp->lo) { + pnfs_clear_first_layoutget(lgp->lo); + nfs_layoutget_end(lgp->lo); + } + pnfs_layoutget_free(lgp); + } +} + struct pnfs_layout_segment * pnfs_layout_process(struct nfs4_layoutget *lgp) { - struct pnfs_layout_hdr *lo = NFS_I(lgp->args.inode)->layout; + struct pnfs_layout_hdr *lo = lgp->lo; struct nfs4_layoutget_res *res = &lgp->res; struct pnfs_layout_segment *lseg; struct inode *ino = lo->plh_inode; @@ -1962,23 +2561,33 @@ pnfs_layout_process(struct nfs4_layoutget *lgp) goto out_forget; } - if (!pnfs_layout_is_valid(lo)) { - /* We have a completely new layout */ - pnfs_set_layout_stateid(lo, &res->stateid, true); - } else if (nfs4_stateid_match_other(&lo->plh_stateid, &res->stateid)) { + if (test_bit(NFS_LAYOUT_DRAIN, &lo->plh_flags) && + !pnfs_is_first_layoutget(lo)) + goto out_forget; + + if (nfs4_stateid_match_other(&lo->plh_stateid, &res->stateid)) { /* existing state ID, make sure the sequence number matches. */ if (pnfs_layout_stateid_blocked(lo, &res->stateid)) { + if (!pnfs_layout_is_valid(lo)) + lo->plh_barrier = 0; dprintk("%s forget reply due to sequence\n", __func__); goto out_forget; } - pnfs_set_layout_stateid(lo, &res->stateid, false); - } else { + pnfs_set_layout_stateid(lo, &res->stateid, lgp->cred, false); + } else if (pnfs_layout_is_valid(lo)) { /* * We got an entirely new state ID. Mark all segments for the * inode invalid, and retry the layoutget */ - pnfs_mark_layout_stateid_invalid(lo, &free_me); + struct pnfs_layout_range range = { + .iomode = IOMODE_ANY, + .length = NFS4_MAX_UINT64, + }; + pnfs_mark_matching_lsegs_return(lo, &free_me, &range, 0); goto out_forget; + } else { + /* We have a completely new layout */ + pnfs_set_layout_stateid(lo, &res->stateid, lgp->cred, true); } pnfs_get_lseg(lseg); @@ -1996,8 +2605,6 @@ out_forget: spin_unlock(&ino->i_lock); lseg->pls_layout = lo; NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg); - if (!pnfs_layout_is_valid(lo)) - nfs_commit_inode(ino, 0); return ERR_PTR(-EAGAIN); } @@ -2006,10 +2613,16 @@ out_forget: * @lo: pointer to layout header * @tmp_list: list header to be used with pnfs_free_lseg_list() * @return_range: describe layout segment ranges to be returned + * @seq: stateid seqid to match * * This function is mainly intended for use by layoutrecall. It attempts * to free the layout segment immediately, or else to mark it for return * as soon as its reference count drops to zero. + * + * Returns + * - 0: a layoutreturn needs to be scheduled. + * - EBUSY: there are layout segment that are still in use. + * - ENOENT: there are no layout segments that need to be returned. */ int pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo, @@ -2018,15 +2631,16 @@ pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo, u32 seq) { struct pnfs_layout_segment *lseg, *next; + struct nfs_server *server = NFS_SERVER(lo->plh_inode); int remaining = 0; dprintk("%s:Begin lo %p\n", __func__, lo); - if (list_empty(&lo->plh_segs)) - return 0; - assert_spin_locked(&lo->plh_inode->i_lock); + if (test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags)) + tmp_list = &lo->plh_return_segs; + list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list) if (pnfs_match_lseg_recall(lseg, return_range, seq)) { dprintk("%s: marking lseg %p iomode %d " @@ -2034,99 +2648,230 @@ pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo, lseg, lseg->pls_range.iomode, lseg->pls_range.offset, lseg->pls_range.length); + if (test_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags)) + tmp_list = &lo->plh_return_segs; if (mark_lseg_invalid(lseg, tmp_list)) continue; remaining++; set_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags); + pnfs_lseg_cancel_io(server, lseg); } - if (remaining) + if (remaining) { pnfs_set_plh_return_info(lo, return_range->iomode, seq); + return -EBUSY; + } - return remaining; + if (!list_empty(&lo->plh_return_segs)) { + pnfs_set_plh_return_info(lo, return_range->iomode, seq); + return 0; + } + + return -ENOENT; } -void pnfs_error_mark_layout_for_return(struct inode *inode, - struct pnfs_layout_segment *lseg) +static void +pnfs_mark_layout_for_return(struct inode *inode, + const struct pnfs_layout_range *range) { - struct pnfs_layout_hdr *lo = NFS_I(inode)->layout; - struct pnfs_layout_range range = { - .iomode = lseg->pls_range.iomode, - .offset = 0, - .length = NFS4_MAX_UINT64, - }; + struct pnfs_layout_hdr *lo; bool return_now = false; spin_lock(&inode->i_lock); + lo = NFS_I(inode)->layout; if (!pnfs_layout_is_valid(lo)) { spin_unlock(&inode->i_lock); return; } - pnfs_set_plh_return_info(lo, range.iomode, 0); + pnfs_set_plh_return_info(lo, range->iomode, 0); /* * mark all matching lsegs so that we are sure to have no live * segments at hand when sending layoutreturn. See pnfs_put_lseg() * for how it works. */ - if (!pnfs_mark_matching_lsegs_return(lo, &lo->plh_return_segs, &range, 0)) { + if (pnfs_mark_matching_lsegs_return(lo, &lo->plh_return_segs, range, 0) != -EBUSY) { + const struct cred *cred; nfs4_stateid stateid; enum pnfs_iomode iomode; - return_now = pnfs_prepare_layoutreturn(lo, &stateid, &iomode); + return_now = pnfs_prepare_layoutreturn(lo, &stateid, &cred, &iomode); spin_unlock(&inode->i_lock); if (return_now) - pnfs_send_layoutreturn(lo, &stateid, iomode, false); + pnfs_send_layoutreturn(lo, &stateid, &cred, iomode, + PNFS_FL_LAYOUTRETURN_ASYNC); } else { spin_unlock(&inode->i_lock); nfs_commit_inode(inode, 0); } } + +void pnfs_error_mark_layout_for_return(struct inode *inode, + struct pnfs_layout_segment *lseg) +{ + struct pnfs_layout_range range = { + .iomode = lseg->pls_range.iomode, + .offset = 0, + .length = NFS4_MAX_UINT64, + }; + + pnfs_mark_layout_for_return(inode, &range); +} EXPORT_SYMBOL_GPL(pnfs_error_mark_layout_for_return); +static bool +pnfs_layout_can_be_returned(struct pnfs_layout_hdr *lo) +{ + return pnfs_layout_is_valid(lo) && + !test_bit(NFS_LAYOUT_INODE_FREEING, &lo->plh_flags) && + !test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags); +} + +static struct pnfs_layout_segment * +pnfs_find_first_lseg(struct pnfs_layout_hdr *lo, + const struct pnfs_layout_range *range, + enum pnfs_iomode iomode) +{ + struct pnfs_layout_segment *lseg; + + list_for_each_entry(lseg, &lo->plh_segs, pls_list) { + if (!test_bit(NFS_LSEG_VALID, &lseg->pls_flags)) + continue; + if (test_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags)) + continue; + if (lseg->pls_range.iomode != iomode && iomode != IOMODE_ANY) + continue; + if (pnfs_lseg_range_intersecting(&lseg->pls_range, range)) + return lseg; + } + return NULL; +} + +/* Find open file states whose mode matches that of the range */ +static bool +pnfs_should_return_unused_layout(struct pnfs_layout_hdr *lo, + const struct pnfs_layout_range *range) +{ + struct list_head *head; + struct nfs_open_context *ctx; + fmode_t mode = 0; + + if (!pnfs_layout_can_be_returned(lo) || + !pnfs_find_first_lseg(lo, range, range->iomode)) + return false; + + head = &NFS_I(lo->plh_inode)->open_files; + list_for_each_entry_rcu(ctx, head, list) { + if (ctx->state) + mode |= ctx->state->state & (FMODE_READ|FMODE_WRITE); + } + + switch (range->iomode) { + default: + break; + case IOMODE_READ: + mode &= ~FMODE_WRITE; + break; + case IOMODE_RW: + if (pnfs_find_first_lseg(lo, range, IOMODE_READ)) + mode &= ~FMODE_READ; + } + return mode == 0; +} + +static int pnfs_layout_return_unused_byserver(struct nfs_server *server, + void *data) +{ + const struct pnfs_layout_range *range = data; + const struct cred *cred; + struct pnfs_layout_hdr *lo; + struct inode *inode; + nfs4_stateid stateid; + enum pnfs_iomode iomode; + +restart: + rcu_read_lock(); + list_for_each_entry_rcu(lo, &server->layouts, plh_layouts) { + inode = lo->plh_inode; + if (!inode || !pnfs_layout_can_be_returned(lo) || + test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags)) + continue; + spin_lock(&inode->i_lock); + if (!lo->plh_inode || + !pnfs_should_return_unused_layout(lo, range)) { + spin_unlock(&inode->i_lock); + continue; + } + pnfs_get_layout_hdr(lo); + pnfs_set_plh_return_info(lo, range->iomode, 0); + if (pnfs_mark_matching_lsegs_return(lo, &lo->plh_return_segs, + range, 0) != 0 || + !pnfs_prepare_layoutreturn(lo, &stateid, &cred, &iomode)) { + spin_unlock(&inode->i_lock); + rcu_read_unlock(); + pnfs_put_layout_hdr(lo); + cond_resched(); + goto restart; + } + spin_unlock(&inode->i_lock); + rcu_read_unlock(); + pnfs_send_layoutreturn(lo, &stateid, &cred, iomode, + PNFS_FL_LAYOUTRETURN_ASYNC); + pnfs_put_layout_hdr(lo); + cond_resched(); + goto restart; + } + rcu_read_unlock(); + return 0; +} + +void +pnfs_layout_return_unused_byclid(struct nfs_client *clp, + enum pnfs_iomode iomode) +{ + struct pnfs_layout_range range = { + .iomode = iomode, + .offset = 0, + .length = NFS4_MAX_UINT64, + }; + + nfs_client_for_each_server(clp, pnfs_layout_return_unused_byserver, + &range); +} + +/* Check if we have we have a valid layout but if there isn't an intersection + * between the request and the pgio->pg_lseg, put this pgio->pg_lseg away. + */ void -pnfs_generic_pg_check_layout(struct nfs_pageio_descriptor *pgio) +pnfs_generic_pg_check_layout(struct nfs_pageio_descriptor *pgio, + struct nfs_page *req) { if (pgio->pg_lseg == NULL || - test_bit(NFS_LSEG_VALID, &pgio->pg_lseg->pls_flags)) + (test_bit(NFS_LSEG_VALID, &pgio->pg_lseg->pls_flags) && + pnfs_lseg_request_intersecting(pgio->pg_lseg, req))) return; pnfs_put_lseg(pgio->pg_lseg); pgio->pg_lseg = NULL; } EXPORT_SYMBOL_GPL(pnfs_generic_pg_check_layout); -/* - * Check for any intersection between the request and the pgio->pg_lseg, - * and if none, put this pgio->pg_lseg away. - */ -static void -pnfs_generic_pg_check_range(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) -{ - if (pgio->pg_lseg && !pnfs_lseg_request_intersecting(pgio->pg_lseg, req)) { - pnfs_put_lseg(pgio->pg_lseg); - pgio->pg_lseg = NULL; - } -} - void pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) { - u64 rd_size = req->wb_bytes; + u64 rd_size; - pnfs_generic_pg_check_layout(pgio); - pnfs_generic_pg_check_range(pgio, req); + pnfs_generic_pg_check_layout(pgio, req); if (pgio->pg_lseg == NULL) { if (pgio->pg_dreq == NULL) rd_size = i_size_read(pgio->pg_inode) - req_offset(req); else - rd_size = nfs_dreq_bytes_left(pgio->pg_dreq); - - pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, - req->wb_context, - req_offset(req), - rd_size, - IOMODE_READ, - false, - GFP_KERNEL); + rd_size = nfs_dreq_bytes_left(pgio->pg_dreq, + req_offset(req)); + + pgio->pg_lseg = + pnfs_update_layout(pgio->pg_inode, nfs_req_openctx(req), + req_offset(req), rd_size, + IOMODE_READ, false, + nfs_io_gfp_mask()); if (IS_ERR(pgio->pg_lseg)) { pgio->pg_error = PTR_ERR(pgio->pg_lseg); pgio->pg_lseg = NULL; @@ -2144,16 +2889,12 @@ void pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *req, u64 wb_size) { - pnfs_generic_pg_check_layout(pgio); - pnfs_generic_pg_check_range(pgio, req); + pnfs_generic_pg_check_layout(pgio, req); if (pgio->pg_lseg == NULL) { - pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, - req->wb_context, - req_offset(req), - wb_size, - IOMODE_RW, - false, - GFP_NOFS); + pgio->pg_lseg = + pnfs_update_layout(pgio->pg_inode, nfs_req_openctx(req), + req_offset(req), wb_size, IOMODE_RW, + false, nfs_io_gfp_mask()); if (IS_ERR(pgio->pg_lseg)) { pgio->pg_error = PTR_ERR(pgio->pg_lseg); pgio->pg_lseg = NULL; @@ -2229,7 +2970,6 @@ int pnfs_write_done_resend_to_mds(struct nfs_pgio_header *hdr) /* Resend all requests through the MDS */ nfs_pageio_init_write(&pgio, hdr->inode, FLUSH_STABLE, true, hdr->completion_ops); - set_bit(NFS_CONTEXT_RESEND_WRITES, &hdr->args.context->flags); return nfs_pageio_resend(&pgio, hdr); } EXPORT_SYMBOL_GPL(pnfs_write_done_resend_to_mds); @@ -2274,8 +3014,7 @@ pnfs_write_through_mds(struct nfs_pageio_descriptor *desc, nfs_pageio_reset_write_mds(desc); mirror->pg_recoalesce = 1; } - nfs_pgio_data_destroy(hdr); - hdr->release(hdr); + hdr->completion_ops->completion(hdr); } static enum pnfs_try_status @@ -2311,6 +3050,7 @@ pnfs_do_write(struct nfs_pageio_descriptor *desc, switch (trypnfs) { case PNFS_NOT_ATTEMPTED: pnfs_write_through_mds(desc, hdr); + break; case PNFS_ATTEMPTED: break; case PNFS_TRY_AGAIN: @@ -2398,8 +3138,7 @@ pnfs_read_through_mds(struct nfs_pageio_descriptor *desc, nfs_pageio_reset_read_mds(desc); mirror->pg_recoalesce = 1; } - nfs_pgio_data_destroy(hdr); - hdr->release(hdr); + hdr->completion_ops->completion(hdr); } /* @@ -2427,7 +3166,8 @@ pnfs_try_to_read_data(struct nfs_pgio_header *hdr, } /* Resend all requests through pnfs. */ -void pnfs_read_resend_pnfs(struct nfs_pgio_header *hdr) +void pnfs_read_resend_pnfs(struct nfs_pgio_header *hdr, + unsigned int mirror_idx) { struct nfs_pageio_descriptor pgio; @@ -2438,6 +3178,7 @@ void pnfs_read_resend_pnfs(struct nfs_pgio_header *hdr) nfs_pageio_init_read(&pgio, hdr->inode, false, hdr->completion_ops); + pgio.pg_mirror_idx = mirror_idx; hdr->task.tk_status = nfs_pageio_resend(&pgio, hdr); } } @@ -2454,6 +3195,7 @@ pnfs_do_read(struct nfs_pageio_descriptor *desc, struct nfs_pgio_header *hdr) switch (trypnfs) { case PNFS_NOT_ATTEMPTED: pnfs_read_through_mds(desc, hdr); + break; case PNFS_ATTEMPTED: break; case PNFS_TRY_AGAIN: @@ -2590,6 +3332,7 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync) struct nfs_inode *nfsi = NFS_I(inode); loff_t end_pos; int status; + bool mark_as_dirty = false; if (!pnfs_layoutcommit_outstanding(inode)) return 0; @@ -2603,14 +3346,14 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync) status = wait_on_bit_lock_action(&nfsi->flags, NFS_INO_LAYOUTCOMMITTING, nfs_wait_bit_killable, - TASK_KILLABLE); + TASK_KILLABLE|TASK_FREEZABLE_UNSAFE); if (status) goto out; } status = -ENOMEM; /* Note kzalloc ensures data->res.seq_res.sr_slot == NULL */ - data = kzalloc(sizeof(*data), GFP_NOFS); + data = kzalloc(sizeof(*data), nfs_io_gfp_mask()); if (!data) goto clear_layoutcommitting; @@ -2625,10 +3368,10 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync) end_pos = nfsi->layout->plh_lwb; nfs4_stateid_copy(&data->args.stateid, &nfsi->layout->plh_stateid); + data->cred = get_cred(nfsi->layout->plh_lc_cred); spin_unlock(&inode->i_lock); data->args.inode = inode; - data->cred = get_rpccred(nfsi->layout->plh_lc_cred); nfs_fattr_init(&data->fattr); data->args.bitmask = NFS_SERVER(inode)->cache_consistency_bitmask; data->res.fattr = &data->fattr; @@ -2641,19 +3384,23 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync) if (ld->prepare_layoutcommit) { status = ld->prepare_layoutcommit(&data->args); if (status) { - put_rpccred(data->cred); + if (status != -ENOSPC) + put_cred(data->cred); spin_lock(&inode->i_lock); set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags); if (end_pos > nfsi->layout->plh_lwb) nfsi->layout->plh_lwb = end_pos; - goto out_unlock; + if (status != -ENOSPC) + goto out_unlock; + spin_unlock(&inode->i_lock); + mark_as_dirty = true; } } status = nfs4_proc_layoutcommit(data, sync); out: - if (status) + if (status || mark_as_dirty) mark_inode_dirty_sync(inode); dprintk("<-- %s status %d\n", __func__, status); return status; @@ -2677,7 +3424,7 @@ struct nfs4_threshold *pnfs_mdsthreshold_alloc(void) { struct nfs4_threshold *thp; - thp = kzalloc(sizeof(*thp), GFP_NOFS); + thp = kzalloc(sizeof(*thp), nfs_io_gfp_mask()); if (!thp) { dprintk("%s mdsthreshold allocation failed\n", __func__); return NULL; diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 99731e3e332f..91ff877185c8 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -30,15 +30,20 @@ #ifndef FS_NFS_PNFS_H #define FS_NFS_PNFS_H +#include <linux/refcount.h> #include <linux/nfs_fs.h> #include <linux/nfs_page.h> #include <linux/workqueue.h> +struct nfs4_exception; +struct nfs4_opendata; + enum { NFS_LSEG_VALID = 0, /* cleared when lseg is recalled/returned */ NFS_LSEG_ROC, /* roc bit received from server */ NFS_LSEG_LAYOUTCOMMIT, /* layoutcommit bit set for layoutcommit */ NFS_LSEG_LAYOUTRETURN, /* layoutreturn bit set for layoutreturn */ + NFS_LSEG_UNAVAILABLE, /* unavailable bit set for temporary problem */ }; /* Individual ip address */ @@ -47,14 +52,17 @@ struct nfs4_pnfs_ds_addr { size_t da_addrlen; struct list_head da_node; /* nfs4_pnfs_dev_hlist dev_dslist */ char *da_remotestr; /* human readable addr+port */ + const char *da_netid; + int da_transport; }; struct nfs4_pnfs_ds { struct list_head ds_node; /* nfs4_pnfs_dev_hlist dev_dslist */ char *ds_remotestr; /* comma sep list of addrs */ struct list_head ds_addrs; + const struct net *ds_net; struct nfs_client *ds_clp; - atomic_t ds_count; + refcount_t ds_count; unsigned long ds_state; #define NFS4DS_CONNECTING 0 /* ds is establishing connection */ }; @@ -62,12 +70,12 @@ struct nfs4_pnfs_ds { struct pnfs_layout_segment { struct list_head pls_list; struct list_head pls_lc_list; + struct list_head pls_commits; struct pnfs_layout_range pls_range; - atomic_t pls_refcount; + refcount_t pls_refcount; u32 pls_seq; unsigned long pls_flags; struct pnfs_layout_hdr *pls_layout; - struct work_struct pls_work; }; enum pnfs_try_status { @@ -86,10 +94,7 @@ enum pnfs_try_status { */ #define NFS4_DEF_DS_TIMEO 600 /* in tenths of a second */ #define NFS4_DEF_DS_RETRANS 5 - -/* error codes for internal use */ -#define NFS4ERR_RESET_TO_MDS 12001 -#define NFS4ERR_RESET_TO_PNFS 12002 +#define PNFS_DEVICE_RETRY_TIMEOUT (120*HZ) enum { NFS_LAYOUT_RO_FAILED = 0, /* get ro layout failed stop trying */ @@ -100,6 +105,9 @@ enum { NFS_LAYOUT_RETURN_REQUESTED, /* Return this layout ASAP */ NFS_LAYOUT_INVALID_STID, /* layout stateid id is invalid */ NFS_LAYOUT_FIRST_LAYOUTGET, /* Serialize first layoutget */ + NFS_LAYOUT_INODE_FREEING, /* The inode is being freed */ + NFS_LAYOUT_HASHED, /* The layout visible */ + NFS_LAYOUT_DRAIN, }; enum layoutdriver_policy_flags { @@ -108,6 +116,13 @@ enum layoutdriver_policy_flags { PNFS_LAYOUTRET_ON_SETATTR = 1 << 0, PNFS_LAYOUTRET_ON_ERROR = 1 << 1, PNFS_READ_WHOLE_PAGE = 1 << 2, + PNFS_LAYOUTGET_ON_OPEN = 1 << 3, +}; + +enum pnfs_layout_destroy_mode { + PNFS_LAYOUT_INVALIDATE = 0, + PNFS_LAYOUT_BULK_RETURN, + PNFS_LAYOUT_FILE_BULK_RETURN, }; struct nfs4_deviceid_node; @@ -119,7 +134,7 @@ struct pnfs_layoutdriver_type { const char *name; struct module *owner; unsigned flags; - unsigned max_deviceinfo_size; + unsigned max_layoutget_response; int (*set_layoutdriver) (struct nfs_server *, const struct nfs_fh *); int (*clear_layoutdriver) (struct nfs_server *); @@ -141,22 +156,6 @@ struct pnfs_layoutdriver_type { const struct nfs_pageio_ops *pg_write_ops; struct pnfs_ds_commit_info *(*get_ds_info) (struct inode *inode); - void (*mark_request_commit) (struct nfs_page *req, - struct pnfs_layout_segment *lseg, - struct nfs_commit_info *cinfo, - u32 ds_commit_idx); - void (*clear_request_commit) (struct nfs_page *req, - struct nfs_commit_info *cinfo); - int (*scan_commit_lists) (struct nfs_commit_info *cinfo, - int max); - void (*recover_commit_reqs) (struct list_head *list, - struct nfs_commit_info *cinfo); - struct nfs_page * (*search_commit_reqs)(struct nfs_commit_info *cinfo, - struct page *page); - int (*commit_pagelist)(struct inode *inode, - struct list_head *mds_pages, - int how, - struct nfs_commit_info *cinfo); int (*sync)(struct inode *inode, bool datasync); @@ -177,10 +176,33 @@ struct pnfs_layoutdriver_type { void (*cleanup_layoutcommit) (struct nfs4_layoutcommit_data *data); int (*prepare_layoutcommit) (struct nfs4_layoutcommit_args *args); int (*prepare_layoutstats) (struct nfs42_layoutstat_args *args); + + void (*cancel_io)(struct pnfs_layout_segment *lseg); +}; + +struct pnfs_commit_ops { + void (*setup_ds_info)(struct pnfs_ds_commit_info *, + struct pnfs_layout_segment *); + void (*release_ds_info)(struct pnfs_ds_commit_info *, + struct inode *inode); + int (*commit_pagelist)(struct inode *inode, + struct list_head *mds_pages, + int how, + struct nfs_commit_info *cinfo); + void (*mark_request_commit) (struct nfs_page *req, + struct pnfs_layout_segment *lseg, + struct nfs_commit_info *cinfo, + u32 ds_commit_idx); + void (*clear_request_commit) (struct nfs_page *req, + struct nfs_commit_info *cinfo); + int (*scan_commit_lists) (struct nfs_commit_info *cinfo, + int max); + void (*recover_commit_reqs) (struct list_head *list, + struct nfs_commit_info *cinfo); }; struct pnfs_layout_hdr { - atomic_t plh_refcount; + refcount_t plh_refcount; atomic_t plh_outstanding; /* number of RPCs out */ struct list_head plh_layouts; /* other client layouts */ struct list_head plh_bulk_destroy; @@ -194,8 +216,9 @@ struct pnfs_layout_hdr { u32 plh_return_seq; enum pnfs_iomode plh_return_iomode; loff_t plh_lwb; /* last write byte for layoutcommit */ - struct rpc_cred *plh_lc_cred; /* layoutcommit cred */ + const struct cred *plh_lc_cred; /* layoutcommit cred */ struct inode *plh_inode; + struct rcu_head plh_rcu; }; struct pnfs_device { @@ -219,22 +242,30 @@ struct pnfs_devicelist { extern int pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *); extern void pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *); +extern const struct pnfs_layoutdriver_type *pnfs_find_layoutdriver(u32 id); +extern void pnfs_put_layoutdriver(const struct pnfs_layoutdriver_type *ld); /* nfs4proc.c */ +#define PNFS_FL_LAYOUTRETURN_ASYNC (1U << 0) +#define PNFS_FL_LAYOUTRETURN_PRIVILEGED (1U << 1) + +extern size_t max_response_pages(struct nfs_server *server); extern int nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *dev, - struct rpc_cred *cred); -extern struct pnfs_layout_segment* nfs4_proc_layoutget(struct nfs4_layoutget *lgp, long *timeout, gfp_t gfp_flags); -extern int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool sync); + const struct cred *cred); +extern struct pnfs_layout_segment * +nfs4_proc_layoutget(struct nfs4_layoutget *lgp, + struct nfs4_exception *exception); +extern int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, + unsigned int flags); /* pnfs.c */ void pnfs_get_layout_hdr(struct pnfs_layout_hdr *lo); void pnfs_put_lseg(struct pnfs_layout_segment *lseg); -void pnfs_put_lseg_locked(struct pnfs_layout_segment *lseg); void set_pnfs_layoutdriver(struct nfs_server *, const struct nfs_fh *, struct nfs_fsinfo *); void unset_pnfs_layoutdriver(struct nfs_server *); -void pnfs_generic_pg_check_layout(struct nfs_pageio_descriptor *pgio); +void pnfs_generic_pg_check_layout(struct nfs_pageio_descriptor *pgio, struct nfs_page *req); void pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *, struct nfs_page *); int pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc); void pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio, @@ -245,17 +276,22 @@ size_t pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, struct nfs_page *req); void pnfs_set_lo_fail(struct pnfs_layout_segment *lseg); struct pnfs_layout_segment *pnfs_layout_process(struct nfs4_layoutget *lgp); +void pnfs_layoutget_free(struct nfs4_layoutget *lgp); void pnfs_free_lseg_list(struct list_head *tmp_list); void pnfs_destroy_layout(struct nfs_inode *); +void pnfs_destroy_layout_final(struct nfs_inode *); void pnfs_destroy_all_layouts(struct nfs_client *); -int pnfs_destroy_layouts_byfsid(struct nfs_client *clp, - struct nfs_fsid *fsid, - bool is_recall); -int pnfs_destroy_layouts_byclid(struct nfs_client *clp, - bool is_recall); +int pnfs_layout_destroy_byfsid(struct nfs_client *clp, struct nfs_fsid *fsid, + enum pnfs_layout_destroy_mode mode); +int pnfs_layout_destroy_byclid(struct nfs_client *clp, + enum pnfs_layout_destroy_mode mode); +bool nfs4_layout_refresh_old_stateid(nfs4_stateid *dst, + struct pnfs_layout_range *dst_range, + struct inode *inode); void pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo); void pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new, + const struct cred *cred, bool update_barrier); int pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo, struct list_head *tmp_list, @@ -270,7 +306,9 @@ int pnfs_mark_layout_stateid_invalid(struct pnfs_layout_hdr *lo, bool pnfs_roc(struct inode *ino, struct nfs4_layoutreturn_args *args, struct nfs4_layoutreturn_res *res, - const struct rpc_cred *cred); + const struct cred *cred); +int pnfs_roc_done(struct rpc_task *task, struct nfs4_layoutreturn_args **argpp, + struct nfs4_layoutreturn_res **respp, int *ret); void pnfs_roc_release(struct nfs4_layoutreturn_args *args, struct nfs4_layoutreturn_res *res, int ret); @@ -284,7 +322,7 @@ int _pnfs_return_layout(struct inode *); int pnfs_commit_and_return_layout(struct inode *); void pnfs_ld_write_done(struct nfs_pgio_header *); void pnfs_ld_read_done(struct nfs_pgio_header *); -void pnfs_read_resend_pnfs(struct nfs_pgio_header *); +void pnfs_read_resend_pnfs(struct nfs_pgio_header *, unsigned int mirror_idx); struct pnfs_layout_segment *pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx, loff_t pos, @@ -292,6 +330,9 @@ struct pnfs_layout_segment *pnfs_update_layout(struct inode *ino, enum pnfs_iomode iomode, bool strict_iomode, gfp_t gfp_flags); +void pnfs_layoutreturn_retry_later(struct pnfs_layout_hdr *lo, + const nfs4_stateid *arg_stateid, + const struct pnfs_layout_range *range); void pnfs_layoutreturn_free_lsegs(struct pnfs_layout_hdr *lo, const nfs4_stateid *arg_stateid, const struct pnfs_layout_range *range, @@ -311,6 +352,10 @@ int pnfs_write_done_resend_to_mds(struct nfs_pgio_header *); struct nfs4_threshold *pnfs_mdsthreshold_alloc(void); void pnfs_error_mark_layout_for_return(struct inode *inode, struct pnfs_layout_segment *lseg); +void pnfs_layout_return_unused_byclid(struct nfs_client *clp, + enum pnfs_iomode iomode); +int pnfs_layout_handle_reboot(struct nfs_client *clp); + /* nfs4_deviceid_flags */ enum { NFS_DEVICEID_INVALID = 0, /* set when MDS clientid recalled */ @@ -333,17 +378,28 @@ struct nfs4_deviceid_node { struct nfs4_deviceid_node * nfs4_find_get_deviceid(struct nfs_server *server, - const struct nfs4_deviceid *id, struct rpc_cred *cred, + const struct nfs4_deviceid *id, const struct cred *cred, gfp_t gfp_mask); void nfs4_delete_deviceid(const struct pnfs_layoutdriver_type *, const struct nfs_client *, const struct nfs4_deviceid *); void nfs4_init_deviceid_node(struct nfs4_deviceid_node *, struct nfs_server *, const struct nfs4_deviceid *); bool nfs4_put_deviceid_node(struct nfs4_deviceid_node *); +void nfs4_mark_deviceid_available(struct nfs4_deviceid_node *node); void nfs4_mark_deviceid_unavailable(struct nfs4_deviceid_node *node); bool nfs4_test_deviceid_unavailable(struct nfs4_deviceid_node *node); void nfs4_deviceid_purge_client(const struct nfs_client *); /* pnfs_nfs.c */ +struct pnfs_commit_array *pnfs_alloc_commit_array(size_t n, gfp_t gfp_flags); +void pnfs_free_commit_array(struct pnfs_commit_array *p); +struct pnfs_commit_array *pnfs_add_commit_array(struct pnfs_ds_commit_info *, + struct pnfs_commit_array *, + struct pnfs_layout_segment *); + +void pnfs_generic_ds_cinfo_release_lseg(struct pnfs_ds_commit_info *fl_cinfo, + struct pnfs_layout_segment *lseg); +void pnfs_generic_ds_cinfo_destroy(struct pnfs_ds_commit_info *fl_cinfo); + void pnfs_generic_clear_request_commit(struct nfs_page *req, struct nfs_commit_info *cinfo); void pnfs_generic_commit_release(void *calldata); @@ -360,7 +416,8 @@ int pnfs_generic_commit_pagelist(struct inode *inode, int pnfs_generic_scan_commit_lists(struct nfs_commit_info *cinfo, int max); void pnfs_generic_write_commit_done(struct rpc_task *task, void *data); void nfs4_pnfs_ds_put(struct nfs4_pnfs_ds *ds); -struct nfs4_pnfs_ds *nfs4_pnfs_ds_add(struct list_head *dsaddrs, +struct nfs4_pnfs_ds *nfs4_pnfs_ds_add(const struct net *net, + struct list_head *dsaddrs, gfp_t gfp_flags); void nfs4_pnfs_v3_ds_connect_unload(void); int nfs4_pnfs_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds, @@ -373,6 +430,11 @@ void pnfs_layout_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg, struct nfs_commit_info *cinfo, u32 ds_commit_idx); +void pnfs_lgopen_prepare(struct nfs4_opendata *data, + struct nfs_open_context *ctx); +void pnfs_parse_lgopen(struct inode *ino, struct nfs4_layoutget *lgp, + struct nfs_open_context *ctx); +void nfs4_lgopen_release(struct nfs4_layoutget *lgp); static inline bool nfs_have_layout(struct inode *inode) { @@ -395,7 +457,7 @@ static inline struct pnfs_layout_segment * pnfs_get_lseg(struct pnfs_layout_segment *lseg) { if (lseg) { - atomic_inc(&lseg->pls_refcount); + refcount_inc(&lseg->pls_refcount); smp_mb__after_atomic(); } return lseg; @@ -417,9 +479,11 @@ static inline int pnfs_commit_list(struct inode *inode, struct list_head *mds_pages, int how, struct nfs_commit_info *cinfo) { - if (cinfo->ds == NULL || cinfo->ds->ncommitting == 0) + struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds; + + if (fl_cinfo == NULL || fl_cinfo->ncommitting == 0) return PNFS_NOT_ATTEMPTED; - return NFS_SERVER(inode)->pnfs_curr_ld->commit_pagelist(inode, mds_pages, how, cinfo); + return fl_cinfo->ops->commit_pagelist(inode, mds_pages, how, cinfo); } static inline struct pnfs_ds_commit_info * @@ -433,6 +497,28 @@ pnfs_get_ds_info(struct inode *inode) } static inline void +pnfs_init_ds_commit_info_ops(struct pnfs_ds_commit_info *fl_cinfo, struct inode *inode) +{ + struct pnfs_ds_commit_info *inode_cinfo = pnfs_get_ds_info(inode); + if (inode_cinfo != NULL) + fl_cinfo->ops = inode_cinfo->ops; +} + +static inline void +pnfs_init_ds_commit_info(struct pnfs_ds_commit_info *fl_cinfo) +{ + INIT_LIST_HEAD(&fl_cinfo->commits); + fl_cinfo->ops = NULL; +} + +static inline void +pnfs_release_ds_info(struct pnfs_ds_commit_info *fl_cinfo, struct inode *inode) +{ + if (fl_cinfo->ops != NULL && fl_cinfo->ops->release_ds_info != NULL) + fl_cinfo->ops->release_ds_info(fl_cinfo, inode); +} + +static inline void pnfs_generic_mark_devid_invalid(struct nfs4_deviceid_node *node) { set_bit(NFS_DEVICEID_INVALID, &node->flags); @@ -442,24 +528,22 @@ static inline bool pnfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg, struct nfs_commit_info *cinfo, u32 ds_commit_idx) { - struct inode *inode = d_inode(req->wb_context->dentry); - struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld; + struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds; - if (lseg == NULL || ld->mark_request_commit == NULL) + if (!lseg || !fl_cinfo->ops || !fl_cinfo->ops->mark_request_commit) return false; - ld->mark_request_commit(req, lseg, cinfo, ds_commit_idx); + fl_cinfo->ops->mark_request_commit(req, lseg, cinfo, ds_commit_idx); return true; } static inline bool pnfs_clear_request_commit(struct nfs_page *req, struct nfs_commit_info *cinfo) { - struct inode *inode = d_inode(req->wb_context->dentry); - struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld; + struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds; - if (ld == NULL || ld->clear_request_commit == NULL) + if (!fl_cinfo || !fl_cinfo->ops || !fl_cinfo->ops->clear_request_commit) return false; - ld->clear_request_commit(req, cinfo); + fl_cinfo->ops->clear_request_commit(req, cinfo); return true; } @@ -467,21 +551,20 @@ static inline int pnfs_scan_commit_lists(struct inode *inode, struct nfs_commit_info *cinfo, int max) { - if (cinfo->ds == NULL || cinfo->ds->nwritten == 0) + struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds; + + if (!fl_cinfo || fl_cinfo->nwritten == 0) return 0; - else - return NFS_SERVER(inode)->pnfs_curr_ld->scan_commit_lists(cinfo, max); + return fl_cinfo->ops->scan_commit_lists(cinfo, max); } -static inline struct nfs_page * -pnfs_search_commit_reqs(struct inode *inode, struct nfs_commit_info *cinfo, - struct page *page) +static inline void +pnfs_recover_commit_reqs(struct list_head *head, struct nfs_commit_info *cinfo) { - struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld; + struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds; - if (ld == NULL || ld->search_commit_reqs == NULL) - return NULL; - return ld->search_commit_reqs(cinfo, page); + if (fl_cinfo && fl_cinfo->nwritten != 0) + fl_cinfo->ops->recover_commit_reqs(head, cinfo); } /* Should the pNFS client commit and return the layout upon a setattr */ @@ -524,8 +607,10 @@ static inline int pnfs_return_layout(struct inode *ino) struct nfs_inode *nfsi = NFS_I(ino); struct nfs_server *nfss = NFS_SERVER(ino); - if (pnfs_enabled_sb(nfss) && nfsi->layout) + if (pnfs_enabled_sb(nfss) && nfsi->layout) { + set_bit(NFS_LAYOUT_RETURN_REQUESTED, &nfsi->layout->plh_flags); return _pnfs_return_layout(ino); + } return 0; } @@ -603,6 +688,13 @@ pnfs_lseg_request_intersecting(struct pnfs_layout_segment *lseg, struct nfs_page req_offset(req), req_last); } +static inline void pnfs_lseg_cancel_io(struct nfs_server *server, + struct pnfs_layout_segment *lseg) +{ + if (server->pnfs_curr_ld->cancel_io) + server->pnfs_curr_ld->cancel_io(lseg); +} + extern unsigned int layoutstats_timer; #ifdef NFS_DEBUG @@ -628,6 +720,15 @@ static inline void pnfs_destroy_layout(struct nfs_inode *nfsi) { } +static inline void pnfs_destroy_layout_final(struct nfs_inode *nfsi) +{ +} + +static inline int pnfs_layout_handle_reboot(struct nfs_client *clp) +{ + return 0; +} + static inline struct pnfs_layout_segment * pnfs_get_lseg(struct pnfs_layout_segment *lseg) { @@ -677,11 +778,20 @@ static inline bool pnfs_roc(struct inode *ino, struct nfs4_layoutreturn_args *args, struct nfs4_layoutreturn_res *res, - const struct rpc_cred *cred) + const struct cred *cred) { return false; } +static inline int +pnfs_roc_done(struct rpc_task *task, + struct nfs4_layoutreturn_args **argpp, + struct nfs4_layoutreturn_res **respp, + int *ret) +{ + return 0; +} + static inline void pnfs_roc_release(struct nfs4_layoutreturn_args *args, struct nfs4_layoutreturn_res *res, @@ -718,6 +828,21 @@ pnfs_get_ds_info(struct inode *inode) return NULL; } +static inline void +pnfs_init_ds_commit_info_ops(struct pnfs_ds_commit_info *fl_cinfo, struct inode *inode) +{ +} + +static inline void +pnfs_init_ds_commit_info(struct pnfs_ds_commit_info *fl_cinfo) +{ +} + +static inline void +pnfs_release_ds_info(struct pnfs_ds_commit_info *fl_cinfo, struct inode *inode) +{ +} + static inline bool pnfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg, struct nfs_commit_info *cinfo, u32 ds_commit_idx) @@ -738,11 +863,9 @@ pnfs_scan_commit_lists(struct inode *inode, struct nfs_commit_info *cinfo, return 0; } -static inline struct nfs_page * -pnfs_search_commit_reqs(struct inode *inode, struct nfs_commit_info *cinfo, - struct page *page) +static inline void +pnfs_recover_commit_reqs(struct list_head *head, struct nfs_commit_info *cinfo) { - return NULL; } static inline int pnfs_layoutcommit_inode(struct inode *inode, bool sync) @@ -766,6 +889,33 @@ static inline void nfs4_pnfs_v3_ds_connect_unload(void) { } +static inline bool nfs4_layout_refresh_old_stateid(nfs4_stateid *dst, + struct pnfs_layout_range *dst_range, + struct inode *inode) +{ + return false; +} + +static inline void pnfs_lgopen_prepare(struct nfs4_opendata *data, + struct nfs_open_context *ctx) +{ +} + +static inline void pnfs_parse_lgopen(struct inode *ino, + struct nfs4_layoutget *lgp, + struct nfs_open_context *ctx) +{ +} + +static inline void nfs4_lgopen_release(struct nfs4_layoutget *lgp) +{ +} + +static inline bool pnfs_layout_is_valid(const struct pnfs_layout_hdr *lo) +{ + return false; +} + #endif /* CONFIG_NFS_V4_1 */ #if IS_ENABLED(CONFIG_NFS_V4_2) diff --git a/fs/nfs/pnfs_dev.c b/fs/nfs/pnfs_dev.c index 2961fcd7a2df..bf0f2d67e96c 100644 --- a/fs/nfs/pnfs_dev.c +++ b/fs/nfs/pnfs_dev.c @@ -34,6 +34,8 @@ #include "internal.h" #include "pnfs.h" +#include "nfs4trace.h" + #define NFSDBG_FACILITY NFSDBG_PNFS /* @@ -43,7 +45,6 @@ #define NFS4_DEVICE_ID_HASH_SIZE (1 << NFS4_DEVICE_ID_HASH_BITS) #define NFS4_DEVICE_ID_HASH_MASK (NFS4_DEVICE_ID_HASH_SIZE - 1) -#define PNFS_DEVICE_RETRY_TIMEOUT (120*HZ) static struct hlist_head nfs4_deviceid_cache[NFS4_DEVICE_ID_HASH_SIZE]; static DEFINE_SPINLOCK(nfs4_deviceid_lock); @@ -95,7 +96,7 @@ _lookup_deviceid(const struct pnfs_layoutdriver_type *ld, static struct nfs4_deviceid_node * nfs4_get_device_info(struct nfs_server *server, const struct nfs4_deviceid *dev_id, - struct rpc_cred *cred, gfp_t gfp_flags) + const struct cred *cred, gfp_t gfp_flags) { struct nfs4_deviceid_node *d = NULL; struct pnfs_device *pdev = NULL; @@ -109,9 +110,6 @@ nfs4_get_device_info(struct nfs_server *server, * GETDEVICEINFO's maxcount */ max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz; - if (server->pnfs_curr_ld->max_deviceinfo_size && - server->pnfs_curr_ld->max_deviceinfo_size < max_resp_sz) - max_resp_sz = server->pnfs_curr_ld->max_deviceinfo_size; max_pages = nfs_page_array_len(0, max_resp_sz); dprintk("%s: server %p max_resp_sz %u max_pages %d\n", __func__, server, max_resp_sz, max_pages); @@ -153,7 +151,7 @@ nfs4_get_device_info(struct nfs_server *server, set_bit(NFS_DEVICEID_NOCACHE, &d->flags); out_free_pages: - for (i = 0; i < max_pages; i++) + while (--i >= 0) __free_page(pages[i]); kfree(pages); out_free_pdev: @@ -185,7 +183,7 @@ __nfs4_find_get_deviceid(struct nfs_server *server, struct nfs4_deviceid_node * nfs4_find_get_deviceid(struct nfs_server *server, - const struct nfs4_deviceid *id, struct rpc_cred *cred, + const struct nfs4_deviceid *id, const struct cred *cred, gfp_t gfp_mask) { long hash = nfs4_deviceid_hash(id); @@ -193,24 +191,28 @@ nfs4_find_get_deviceid(struct nfs_server *server, d = __nfs4_find_get_deviceid(server, id, hash); if (d) - return d; + goto found; new = nfs4_get_device_info(server, id, cred, gfp_mask); - if (!new) + if (!new) { + trace_nfs4_find_deviceid(server, id, -ENOENT); return new; + } spin_lock(&nfs4_deviceid_lock); d = __nfs4_find_get_deviceid(server, id, hash); if (d) { spin_unlock(&nfs4_deviceid_lock); server->pnfs_curr_ld->free_deviceid_node(new); - return d; + } else { + atomic_inc(&new->ref); + hlist_add_head_rcu(&new->node, &nfs4_deviceid_cache[hash]); + spin_unlock(&nfs4_deviceid_lock); + d = new; } - hlist_add_head_rcu(&new->node, &nfs4_deviceid_cache[hash]); - atomic_inc(&new->ref); - spin_unlock(&nfs4_deviceid_lock); - - return new; +found: + trace_nfs4_find_deviceid(server, id, 0); + return d; } EXPORT_SYMBOL_GPL(nfs4_find_get_deviceid); @@ -279,16 +281,29 @@ nfs4_put_deviceid_node(struct nfs4_deviceid_node *d) } if (!atomic_dec_and_test(&d->ref)) return false; + trace_nfs4_deviceid_free(d->nfs_client, &d->deviceid); d->ld->free_deviceid_node(d); return true; } EXPORT_SYMBOL_GPL(nfs4_put_deviceid_node); void +nfs4_mark_deviceid_available(struct nfs4_deviceid_node *node) +{ + if (test_bit(NFS_DEVICEID_UNAVAILABLE, &node->flags)) { + clear_bit(NFS_DEVICEID_UNAVAILABLE, &node->flags); + smp_mb__after_atomic(); + } +} +EXPORT_SYMBOL_GPL(nfs4_mark_deviceid_available); + +void nfs4_mark_deviceid_unavailable(struct nfs4_deviceid_node *node) { node->timestamp_unavailable = jiffies; + smp_mb__before_atomic(); set_bit(NFS_DEVICEID_UNAVAILABLE, &node->flags); + smp_mb__after_atomic(); } EXPORT_SYMBOL_GPL(nfs4_mark_deviceid_unavailable); @@ -303,6 +318,7 @@ nfs4_test_deviceid_unavailable(struct nfs4_deviceid_node *node) if (time_in_range(node->timestamp_unavailable, start, end)) return true; clear_bit(NFS_DEVICEID_UNAVAILABLE, &node->flags); + smp_mb__after_atomic(); } return false; } diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c index 25f28fa64c57..9976cc16b689 100644 --- a/fs/nfs/pnfs_nfs.c +++ b/fs/nfs/pnfs_nfs.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Common NFS I/O operations for the pnfs file based * layout drivers. @@ -15,6 +16,8 @@ #include "nfs4session.h" #include "internal.h" #include "pnfs.h" +#include "netns.h" +#include "nfs4trace.h" #define NFSDBG_FACILITY NFSDBG_PNFS @@ -30,12 +33,11 @@ EXPORT_SYMBOL_GPL(pnfs_generic_rw_release); /* Fake up some data that will cause nfs_commit_release to retry the writes. */ void pnfs_generic_prepare_to_resend_writes(struct nfs_commit_data *data) { - struct nfs_page *first = nfs_list_entry(data->pages.next); + struct nfs_writeverf *verf = data->res.verf; data->task.tk_status = 0; - memcpy(&data->verf.verifier, &first->wb_verf, - sizeof(data->verf.verifier)); - data->verf.verifier.data[0]++; /* ensure verifier mismatch */ + memset(&verf->verifier, 0, sizeof(verf->verifier)); + verf->committed = NFS_UNSTABLE; } EXPORT_SYMBOL_GPL(pnfs_generic_prepare_to_resend_writes); @@ -59,226 +61,400 @@ void pnfs_generic_commit_release(void *calldata) } EXPORT_SYMBOL_GPL(pnfs_generic_commit_release); +static struct pnfs_layout_segment * +pnfs_free_bucket_lseg(struct pnfs_commit_bucket *bucket) +{ + if (list_empty(&bucket->committing) && list_empty(&bucket->written)) { + struct pnfs_layout_segment *freeme = bucket->lseg; + bucket->lseg = NULL; + return freeme; + } + return NULL; +} + /* The generic layer is about to remove the req from the commit list. * If this will make the bucket empty, it will need to put the lseg reference. - * Note this must be called holding i_lock + * Note this must be called holding nfsi->commit_mutex */ void pnfs_generic_clear_request_commit(struct nfs_page *req, struct nfs_commit_info *cinfo) { - struct pnfs_layout_segment *freeme = NULL; + struct pnfs_commit_bucket *bucket = NULL; if (!test_and_clear_bit(PG_COMMIT_TO_DS, &req->wb_flags)) goto out; cinfo->ds->nwritten--; - if (list_is_singular(&req->wb_list)) { - struct pnfs_commit_bucket *bucket; - + if (list_is_singular(&req->wb_list)) bucket = list_first_entry(&req->wb_list, - struct pnfs_commit_bucket, - written); - freeme = bucket->wlseg; - bucket->wlseg = NULL; - } + struct pnfs_commit_bucket, written); out: nfs_request_remove_commit_list(req, cinfo); - pnfs_put_lseg_locked(freeme); + if (bucket) + pnfs_put_lseg(pnfs_free_bucket_lseg(bucket)); } EXPORT_SYMBOL_GPL(pnfs_generic_clear_request_commit); -static int -pnfs_generic_transfer_commit_list(struct list_head *src, struct list_head *dst, - struct nfs_commit_info *cinfo, int max) +struct pnfs_commit_array * +pnfs_alloc_commit_array(size_t n, gfp_t gfp_flags) { - struct nfs_page *req, *tmp; - int ret = 0; + struct pnfs_commit_array *p; + struct pnfs_commit_bucket *b; - list_for_each_entry_safe(req, tmp, src, wb_list) { - if (!nfs_lock_request(req)) - continue; - kref_get(&req->wb_kref); - if (cond_resched_lock(&cinfo->inode->i_lock)) - list_safe_reset_next(req, tmp, wb_list); - nfs_request_remove_commit_list(req, cinfo); - clear_bit(PG_COMMIT_TO_DS, &req->wb_flags); - nfs_list_add_request(req, dst); - ret++; - if ((ret == max) && !cinfo->dreq) - break; + p = kmalloc(struct_size(p, buckets, n), gfp_flags); + if (!p) + return NULL; + p->nbuckets = n; + INIT_LIST_HEAD(&p->cinfo_list); + INIT_LIST_HEAD(&p->lseg_list); + p->lseg = NULL; + for (b = &p->buckets[0]; n != 0; b++, n--) { + INIT_LIST_HEAD(&b->written); + INIT_LIST_HEAD(&b->committing); + b->lseg = NULL; + b->direct_verf.committed = NFS_INVALID_STABLE_HOW; } - return ret; + return p; } +EXPORT_SYMBOL_GPL(pnfs_alloc_commit_array); +void +pnfs_free_commit_array(struct pnfs_commit_array *p) +{ + kfree_rcu(p, rcu); +} +EXPORT_SYMBOL_GPL(pnfs_free_commit_array); + +static struct pnfs_commit_array * +pnfs_find_commit_array_by_lseg(struct pnfs_ds_commit_info *fl_cinfo, + struct pnfs_layout_segment *lseg) +{ + struct pnfs_commit_array *array; + + list_for_each_entry_rcu(array, &fl_cinfo->commits, cinfo_list) { + if (array->lseg == lseg) + return array; + } + return NULL; +} + +struct pnfs_commit_array * +pnfs_add_commit_array(struct pnfs_ds_commit_info *fl_cinfo, + struct pnfs_commit_array *new, + struct pnfs_layout_segment *lseg) +{ + struct pnfs_commit_array *array; + + array = pnfs_find_commit_array_by_lseg(fl_cinfo, lseg); + if (array) + return array; + new->lseg = lseg; + refcount_set(&new->refcount, 1); + list_add_rcu(&new->cinfo_list, &fl_cinfo->commits); + list_add(&new->lseg_list, &lseg->pls_commits); + return new; +} +EXPORT_SYMBOL_GPL(pnfs_add_commit_array); + +static struct pnfs_commit_array * +pnfs_lookup_commit_array(struct pnfs_ds_commit_info *fl_cinfo, + struct pnfs_layout_segment *lseg) +{ + struct pnfs_commit_array *array; + + rcu_read_lock(); + array = pnfs_find_commit_array_by_lseg(fl_cinfo, lseg); + if (!array) { + rcu_read_unlock(); + fl_cinfo->ops->setup_ds_info(fl_cinfo, lseg); + rcu_read_lock(); + array = pnfs_find_commit_array_by_lseg(fl_cinfo, lseg); + } + rcu_read_unlock(); + return array; +} + +static void +pnfs_release_commit_array_locked(struct pnfs_commit_array *array) +{ + list_del_rcu(&array->cinfo_list); + list_del(&array->lseg_list); + pnfs_free_commit_array(array); +} + +static void +pnfs_put_commit_array_locked(struct pnfs_commit_array *array) +{ + if (refcount_dec_and_test(&array->refcount)) + pnfs_release_commit_array_locked(array); +} + +static void +pnfs_put_commit_array(struct pnfs_commit_array *array, struct inode *inode) +{ + if (refcount_dec_and_lock(&array->refcount, &inode->i_lock)) { + pnfs_release_commit_array_locked(array); + spin_unlock(&inode->i_lock); + } +} + +static struct pnfs_commit_array * +pnfs_get_commit_array(struct pnfs_commit_array *array) +{ + if (refcount_inc_not_zero(&array->refcount)) + return array; + return NULL; +} + +static void +pnfs_remove_and_free_commit_array(struct pnfs_commit_array *array) +{ + array->lseg = NULL; + list_del_init(&array->lseg_list); + pnfs_put_commit_array_locked(array); +} + +void +pnfs_generic_ds_cinfo_release_lseg(struct pnfs_ds_commit_info *fl_cinfo, + struct pnfs_layout_segment *lseg) +{ + struct pnfs_commit_array *array, *tmp; + + list_for_each_entry_safe(array, tmp, &lseg->pls_commits, lseg_list) + pnfs_remove_and_free_commit_array(array); +} +EXPORT_SYMBOL_GPL(pnfs_generic_ds_cinfo_release_lseg); + +void +pnfs_generic_ds_cinfo_destroy(struct pnfs_ds_commit_info *fl_cinfo) +{ + struct pnfs_commit_array *array, *tmp; + + list_for_each_entry_safe(array, tmp, &fl_cinfo->commits, cinfo_list) + pnfs_remove_and_free_commit_array(array); +} +EXPORT_SYMBOL_GPL(pnfs_generic_ds_cinfo_destroy); + +/* + * Locks the nfs_page requests for commit and moves them to + * @bucket->committing. + */ static int -pnfs_generic_scan_ds_commit_list(struct pnfs_commit_bucket *bucket, - struct nfs_commit_info *cinfo, - int max) +pnfs_bucket_scan_ds_commit_list(struct pnfs_commit_bucket *bucket, + struct nfs_commit_info *cinfo, + int max) { struct list_head *src = &bucket->written; struct list_head *dst = &bucket->committing; int ret; - lockdep_assert_held(&cinfo->inode->i_lock); - ret = pnfs_generic_transfer_commit_list(src, dst, cinfo, max); + lockdep_assert_held(&NFS_I(cinfo->inode)->commit_mutex); + ret = nfs_scan_commit_list(src, dst, cinfo, max); if (ret) { cinfo->ds->nwritten -= ret; cinfo->ds->ncommitting += ret; - if (bucket->clseg == NULL) - bucket->clseg = pnfs_get_lseg(bucket->wlseg); - if (list_empty(src)) { - pnfs_put_lseg_locked(bucket->wlseg); - bucket->wlseg = NULL; - } } return ret; } +static int pnfs_bucket_scan_array(struct nfs_commit_info *cinfo, + struct pnfs_commit_bucket *buckets, + unsigned int nbuckets, + int max) +{ + unsigned int i; + int rv = 0, cnt; + + for (i = 0; i < nbuckets && max != 0; i++) { + cnt = pnfs_bucket_scan_ds_commit_list(&buckets[i], cinfo, max); + rv += cnt; + max -= cnt; + } + return rv; +} + /* Move reqs from written to committing lists, returning count * of number moved. */ -int pnfs_generic_scan_commit_lists(struct nfs_commit_info *cinfo, - int max) +int pnfs_generic_scan_commit_lists(struct nfs_commit_info *cinfo, int max) { - int i, rv = 0, cnt; + struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds; + struct pnfs_commit_array *array; + int rv = 0, cnt; - lockdep_assert_held(&cinfo->inode->i_lock); - for (i = 0; i < cinfo->ds->nbuckets && max != 0; i++) { - cnt = pnfs_generic_scan_ds_commit_list(&cinfo->ds->buckets[i], - cinfo, max); - max -= cnt; + rcu_read_lock(); + list_for_each_entry_rcu(array, &fl_cinfo->commits, cinfo_list) { + if (!array->lseg || !pnfs_get_commit_array(array)) + continue; + rcu_read_unlock(); + cnt = pnfs_bucket_scan_array(cinfo, array->buckets, + array->nbuckets, max); + rcu_read_lock(); + pnfs_put_commit_array(array, cinfo->inode); rv += cnt; + max -= cnt; + if (!max) + break; } + rcu_read_unlock(); return rv; } EXPORT_SYMBOL_GPL(pnfs_generic_scan_commit_lists); -/* Pull everything off the committing lists and dump into @dst. */ -void pnfs_generic_recover_commit_reqs(struct list_head *dst, - struct nfs_commit_info *cinfo) +static unsigned int +pnfs_bucket_recover_commit_reqs(struct list_head *dst, + struct pnfs_commit_bucket *buckets, + unsigned int nbuckets, + struct nfs_commit_info *cinfo) { struct pnfs_commit_bucket *b; struct pnfs_layout_segment *freeme; - int nwritten; - int i; + unsigned int nwritten, ret = 0; + unsigned int i; - lockdep_assert_held(&cinfo->inode->i_lock); restart: - for (i = 0, b = cinfo->ds->buckets; i < cinfo->ds->nbuckets; i++, b++) { - nwritten = pnfs_generic_transfer_commit_list(&b->written, - dst, cinfo, 0); + for (i = 0, b = buckets; i < nbuckets; i++, b++) { + nwritten = nfs_scan_commit_list(&b->written, dst, cinfo, 0); if (!nwritten) continue; - cinfo->ds->nwritten -= nwritten; - if (list_empty(&b->written)) { - freeme = b->wlseg; - b->wlseg = NULL; - spin_unlock(&cinfo->inode->i_lock); + ret += nwritten; + freeme = pnfs_free_bucket_lseg(b); + if (freeme) { pnfs_put_lseg(freeme); - spin_lock(&cinfo->inode->i_lock); goto restart; } } + return ret; } -EXPORT_SYMBOL_GPL(pnfs_generic_recover_commit_reqs); -static void pnfs_generic_retry_commit(struct nfs_commit_info *cinfo, int idx) +/* Pull everything off the committing lists and dump into @dst. */ +void pnfs_generic_recover_commit_reqs(struct list_head *dst, + struct nfs_commit_info *cinfo) { struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds; + struct pnfs_commit_array *array; + unsigned int nwritten; + + lockdep_assert_held(&NFS_I(cinfo->inode)->commit_mutex); + rcu_read_lock(); + list_for_each_entry_rcu(array, &fl_cinfo->commits, cinfo_list) { + if (!array->lseg || !pnfs_get_commit_array(array)) + continue; + rcu_read_unlock(); + nwritten = pnfs_bucket_recover_commit_reqs(dst, + array->buckets, + array->nbuckets, + cinfo); + rcu_read_lock(); + pnfs_put_commit_array(array, cinfo->inode); + fl_cinfo->nwritten -= nwritten; + } + rcu_read_unlock(); +} +EXPORT_SYMBOL_GPL(pnfs_generic_recover_commit_reqs); + +static struct pnfs_layout_segment * +pnfs_bucket_get_committing(struct list_head *head, + struct pnfs_commit_bucket *bucket, + struct nfs_commit_info *cinfo) +{ + struct pnfs_layout_segment *lseg; + struct list_head *pos; + + list_for_each(pos, &bucket->committing) + cinfo->ds->ncommitting--; + list_splice_init(&bucket->committing, head); + lseg = pnfs_free_bucket_lseg(bucket); + if (!lseg) + lseg = pnfs_get_lseg(bucket->lseg); + return lseg; +} + +static struct nfs_commit_data * +pnfs_bucket_fetch_commitdata(struct pnfs_commit_bucket *bucket, + struct nfs_commit_info *cinfo) +{ + struct nfs_commit_data *data = nfs_commitdata_alloc(); + + if (!data) + return NULL; + data->lseg = pnfs_bucket_get_committing(&data->pages, bucket, cinfo); + return data; +} + +static void pnfs_generic_retry_commit(struct pnfs_commit_bucket *buckets, + unsigned int nbuckets, + struct nfs_commit_info *cinfo, + unsigned int idx) +{ struct pnfs_commit_bucket *bucket; struct pnfs_layout_segment *freeme; - struct list_head *pos; LIST_HEAD(pages); - int i; - spin_lock(&cinfo->inode->i_lock); - for (i = idx; i < fl_cinfo->nbuckets; i++) { - bucket = &fl_cinfo->buckets[i]; + for (bucket = buckets; idx < nbuckets; bucket++, idx++) { if (list_empty(&bucket->committing)) continue; - freeme = bucket->clseg; - bucket->clseg = NULL; - list_for_each(pos, &bucket->committing) - cinfo->ds->ncommitting--; - list_splice_init(&bucket->committing, &pages); - spin_unlock(&cinfo->inode->i_lock); - nfs_retry_commit(&pages, freeme, cinfo, i); + mutex_lock(&NFS_I(cinfo->inode)->commit_mutex); + freeme = pnfs_bucket_get_committing(&pages, bucket, cinfo); + mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex); + nfs_retry_commit(&pages, freeme, cinfo, idx); pnfs_put_lseg(freeme); - spin_lock(&cinfo->inode->i_lock); } - spin_unlock(&cinfo->inode->i_lock); } static unsigned int -pnfs_generic_alloc_ds_commits(struct nfs_commit_info *cinfo, - struct list_head *list) +pnfs_bucket_alloc_ds_commits(struct list_head *list, + struct pnfs_commit_bucket *buckets, + unsigned int nbuckets, + struct nfs_commit_info *cinfo) { - struct pnfs_ds_commit_info *fl_cinfo; struct pnfs_commit_bucket *bucket; struct nfs_commit_data *data; - int i; + unsigned int i; unsigned int nreq = 0; - fl_cinfo = cinfo->ds; - bucket = fl_cinfo->buckets; - for (i = 0; i < fl_cinfo->nbuckets; i++, bucket++) { + for (i = 0, bucket = buckets; i < nbuckets; i++, bucket++) { if (list_empty(&bucket->committing)) continue; - data = nfs_commitdata_alloc(false); - if (!data) - break; - data->ds_commit_index = i; - list_add(&data->pages, list); - nreq++; + mutex_lock(&NFS_I(cinfo->inode)->commit_mutex); + if (!list_empty(&bucket->committing)) { + data = pnfs_bucket_fetch_commitdata(bucket, cinfo); + if (!data) + goto out_error; + data->ds_commit_index = i; + list_add_tail(&data->list, list); + nreq++; + } + mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex); } - + return nreq; +out_error: + mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex); /* Clean up on error */ - pnfs_generic_retry_commit(cinfo, i); + pnfs_generic_retry_commit(buckets, nbuckets, cinfo, i); return nreq; } -static inline -void pnfs_fetch_commit_bucket_list(struct list_head *pages, - struct nfs_commit_data *data, - struct nfs_commit_info *cinfo) +static unsigned int +pnfs_alloc_ds_commits_list(struct list_head *list, + struct pnfs_ds_commit_info *fl_cinfo, + struct nfs_commit_info *cinfo) { - struct pnfs_commit_bucket *bucket; - struct list_head *pos; - - bucket = &cinfo->ds->buckets[data->ds_commit_index]; - spin_lock(&cinfo->inode->i_lock); - list_for_each(pos, &bucket->committing) - cinfo->ds->ncommitting--; - list_splice_init(&bucket->committing, pages); - data->lseg = bucket->clseg; - bucket->clseg = NULL; - spin_unlock(&cinfo->inode->i_lock); - -} + struct pnfs_commit_array *array; + unsigned int ret = 0; -/* Helper function for pnfs_generic_commit_pagelist to catch an empty - * page list. This can happen when two commits race. - * - * This must be called instead of nfs_init_commit - call one or the other, but - * not both! - */ -static bool -pnfs_generic_commit_cancel_empty_pagelist(struct list_head *pages, - struct nfs_commit_data *data, - struct nfs_commit_info *cinfo) -{ - if (list_empty(pages)) { - if (atomic_dec_and_test(&cinfo->mds->rpcs_out)) - wake_up_atomic_t(&cinfo->mds->rpcs_out); - /* don't call nfs_commitdata_release - it tries to put - * the open_context which is not acquired until nfs_init_commit - * which has not been called on @data */ - WARN_ON_ONCE(data->context); - nfs_commit_free(data); - return true; + rcu_read_lock(); + list_for_each_entry_rcu(array, &fl_cinfo->commits, cinfo_list) { + if (!array->lseg || !pnfs_get_commit_array(array)) + continue; + rcu_read_unlock(); + ret += pnfs_bucket_alloc_ds_commits(list, array->buckets, + array->nbuckets, cinfo); + rcu_read_lock(); + pnfs_put_commit_array(array, cinfo->inode); } - - return false; + rcu_read_unlock(); + return ret; } /* This follows nfs_commit_list pretty closely */ @@ -288,47 +464,37 @@ pnfs_generic_commit_pagelist(struct inode *inode, struct list_head *mds_pages, int (*initiate_commit)(struct nfs_commit_data *data, int how)) { + struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds; struct nfs_commit_data *data, *tmp; LIST_HEAD(list); unsigned int nreq = 0; if (!list_empty(mds_pages)) { - data = nfs_commitdata_alloc(true); + data = nfs_commitdata_alloc(); + if (!data) { + nfs_retry_commit(mds_pages, NULL, cinfo, -1); + return -ENOMEM; + } data->ds_commit_index = -1; - list_add(&data->pages, &list); + list_splice_init(mds_pages, &data->pages); + list_add_tail(&data->list, &list); nreq++; } - nreq += pnfs_generic_alloc_ds_commits(cinfo, &list); - + nreq += pnfs_alloc_ds_commits_list(&list, fl_cinfo, cinfo); if (nreq == 0) goto out; - atomic_add(nreq, &cinfo->mds->rpcs_out); - - list_for_each_entry_safe(data, tmp, &list, pages) { - list_del_init(&data->pages); + list_for_each_entry_safe(data, tmp, &list, list) { + list_del(&data->list); if (data->ds_commit_index < 0) { - /* another commit raced with us */ - if (pnfs_generic_commit_cancel_empty_pagelist(mds_pages, - data, cinfo)) - continue; - - nfs_init_commit(data, mds_pages, NULL, cinfo); + nfs_init_commit(data, NULL, NULL, cinfo); nfs_initiate_commit(NFS_CLIENT(inode), data, NFS_PROTO(data->inode), - data->mds_ops, how, 0); + data->mds_ops, how, + RPC_TASK_CRED_NOREF, NULL); } else { - LIST_HEAD(pages); - - pnfs_fetch_commit_bucket_list(&pages, data, cinfo); - - /* another commit raced with us */ - if (pnfs_generic_commit_cancel_empty_pagelist(&pages, - data, cinfo)) - continue; - - nfs_init_commit(data, &pages, data->lseg, cinfo); + nfs_init_commit(data, NULL, data->lseg, cinfo); initiate_commit(data, how); } } @@ -340,14 +506,14 @@ EXPORT_SYMBOL_GPL(pnfs_generic_commit_pagelist); /* * Data server cache * - * Data servers can be mapped to different device ids. - * nfs4_pnfs_ds reference counting + * Data servers can be mapped to different device ids, but should + * never be shared between net namespaces. + * + * nfs4_pnfs_ds reference counting: * - set to 1 on allocation * - incremented when a device id maps a data server already in the cache. * - decremented when deviceid is removed from the cache. */ -static DEFINE_SPINLOCK(nfs4_ds_cache_lock); -static LIST_HEAD(nfs4_data_server_cache); /* Debug routines */ static void @@ -362,7 +528,7 @@ print_ds(struct nfs4_pnfs_ds *ds) " client %p\n" " cl_exchange_flags %x\n", ds->ds_remotestr, - atomic_read(&ds->ds_count), ds->ds_clp, + refcount_read(&ds->ds_count), ds->ds_clp, ds->ds_clp ? ds->ds_clp->cl_exchange_flags : 0); } @@ -440,16 +606,31 @@ _same_data_server_addrs_locked(const struct list_head *dsaddrs1, * Lookup DS by addresses. nfs4_ds_cache_lock is held */ static struct nfs4_pnfs_ds * -_data_server_lookup_locked(const struct list_head *dsaddrs) +_data_server_lookup_locked(const struct nfs_net *nn, const struct list_head *dsaddrs) { struct nfs4_pnfs_ds *ds; - list_for_each_entry(ds, &nfs4_data_server_cache, ds_node) + list_for_each_entry(ds, &nn->nfs4_data_server_cache, ds_node) if (_same_data_server_addrs_locked(&ds->ds_addrs, dsaddrs)) return ds; return NULL; } +static struct nfs4_pnfs_ds_addr *nfs4_pnfs_ds_addr_alloc(gfp_t gfp_flags) +{ + struct nfs4_pnfs_ds_addr *da = kzalloc(sizeof(*da), gfp_flags); + if (da) + INIT_LIST_HEAD(&da->da_node); + return da; +} + +static void nfs4_pnfs_ds_addr_free(struct nfs4_pnfs_ds_addr *da) +{ + kfree(da->da_remotestr); + kfree(da->da_netid); + kfree(da); +} + static void destroy_ds(struct nfs4_pnfs_ds *ds) { struct nfs4_pnfs_ds_addr *da; @@ -465,8 +646,7 @@ static void destroy_ds(struct nfs4_pnfs_ds *ds) struct nfs4_pnfs_ds_addr, da_node); list_del_init(&da->da_node); - kfree(da->da_remotestr); - kfree(da); + nfs4_pnfs_ds_addr_free(da); } kfree(ds->ds_remotestr); @@ -475,10 +655,11 @@ static void destroy_ds(struct nfs4_pnfs_ds *ds) void nfs4_pnfs_ds_put(struct nfs4_pnfs_ds *ds) { - if (atomic_dec_and_lock(&ds->ds_count, - &nfs4_ds_cache_lock)) { + struct nfs_net *nn = net_generic(ds->ds_net, nfs_net_id); + + if (refcount_dec_and_lock(&ds->ds_count, &nn->nfs4_data_server_lock)) { list_del_init(&ds->ds_node); - spin_unlock(&nfs4_ds_cache_lock); + spin_unlock(&nn->nfs4_data_server_lock); destroy_ds(ds); } } @@ -538,8 +719,9 @@ out_err: * uncached and return cached struct nfs4_pnfs_ds. */ struct nfs4_pnfs_ds * -nfs4_pnfs_ds_add(struct list_head *dsaddrs, gfp_t gfp_flags) +nfs4_pnfs_ds_add(const struct net *net, struct list_head *dsaddrs, gfp_t gfp_flags) { + struct nfs_net *nn = net_generic(net, nfs_net_id); struct nfs4_pnfs_ds *tmp_ds, *ds = NULL; char *remotestr; @@ -555,51 +737,49 @@ nfs4_pnfs_ds_add(struct list_head *dsaddrs, gfp_t gfp_flags) /* this is only used for debugging, so it's ok if its NULL */ remotestr = nfs4_pnfs_remotestr(dsaddrs, gfp_flags); - spin_lock(&nfs4_ds_cache_lock); - tmp_ds = _data_server_lookup_locked(dsaddrs); + spin_lock(&nn->nfs4_data_server_lock); + tmp_ds = _data_server_lookup_locked(nn, dsaddrs); if (tmp_ds == NULL) { INIT_LIST_HEAD(&ds->ds_addrs); list_splice_init(dsaddrs, &ds->ds_addrs); ds->ds_remotestr = remotestr; - atomic_set(&ds->ds_count, 1); + refcount_set(&ds->ds_count, 1); INIT_LIST_HEAD(&ds->ds_node); + ds->ds_net = net; ds->ds_clp = NULL; - list_add(&ds->ds_node, &nfs4_data_server_cache); + list_add(&ds->ds_node, &nn->nfs4_data_server_cache); dprintk("%s add new data server %s\n", __func__, ds->ds_remotestr); } else { kfree(remotestr); kfree(ds); - atomic_inc(&tmp_ds->ds_count); + refcount_inc(&tmp_ds->ds_count); dprintk("%s data server %s found, inc'ed ds_count to %d\n", __func__, tmp_ds->ds_remotestr, - atomic_read(&tmp_ds->ds_count)); + refcount_read(&tmp_ds->ds_count)); ds = tmp_ds; } - spin_unlock(&nfs4_ds_cache_lock); + spin_unlock(&nn->nfs4_data_server_lock); out: return ds; } EXPORT_SYMBOL_GPL(nfs4_pnfs_ds_add); -static void nfs4_wait_ds_connect(struct nfs4_pnfs_ds *ds) +static int nfs4_wait_ds_connect(struct nfs4_pnfs_ds *ds) { might_sleep(); - wait_on_bit(&ds->ds_state, NFS4DS_CONNECTING, - TASK_KILLABLE); + return wait_on_bit(&ds->ds_state, NFS4DS_CONNECTING, TASK_KILLABLE); } static void nfs4_clear_ds_conn_bit(struct nfs4_pnfs_ds *ds) { smp_mb__before_atomic(); - clear_bit(NFS4DS_CONNECTING, &ds->ds_state); - smp_mb__after_atomic(); - wake_up_bit(&ds->ds_state, NFS4DS_CONNECTING); + clear_and_wake_up_bit(NFS4DS_CONNECTING, &ds->ds_state); } static struct nfs_client *(*get_v3_ds_connect)( struct nfs_server *mds_srv, - const struct sockaddr *ds_addr, + const struct sockaddr_storage *ds_addr, int ds_addrlen, int ds_proto, unsigned int ds_timeo, @@ -629,13 +809,17 @@ static int _nfs4_pnfs_v3_ds_connect(struct nfs_server *mds_srv, unsigned int retrans) { struct nfs_client *clp = ERR_PTR(-EIO); + struct nfs_client *mds_clp = mds_srv->nfs_client; + enum xprtsec_policies xprtsec_policy = mds_clp->cl_xprtsec.policy; struct nfs4_pnfs_ds_addr *da; + unsigned long connect_timeout = timeo * (retrans + 1) * HZ / 10; + int ds_proto; int status = 0; dprintk("--> %s DS %s\n", __func__, ds->ds_remotestr); if (!load_v3_ds_connect()) - goto out; + return -EPROTONOSUPPORT; list_for_each_entry(da, &ds->ds_addrs, da_node) { dprintk("%s: DS %s: trying address %s\n", @@ -643,20 +827,42 @@ static int _nfs4_pnfs_v3_ds_connect(struct nfs_server *mds_srv, if (!IS_ERR(clp)) { struct xprt_create xprt_args = { - .ident = XPRT_TRANSPORT_TCP, + .ident = da->da_transport, .net = clp->cl_net, .dstaddr = (struct sockaddr *)&da->da_addr, .addrlen = da->da_addrlen, .servername = clp->cl_hostname, + .connect_timeout = connect_timeout, + .reconnect_timeout = connect_timeout, + .xprtsec = clp->cl_xprtsec, }; + + if (xprt_args.ident == XPRT_TRANSPORT_TCP && + clp->cl_proto == XPRT_TRANSPORT_TCP_TLS) + xprt_args.ident = XPRT_TRANSPORT_TCP_TLS; + + if (xprt_args.ident != clp->cl_proto) + continue; + if (xprt_args.dstaddr->sa_family != + clp->cl_addr.ss_family) + continue; /* Add this address as an alias */ rpc_clnt_add_xprt(clp->cl_rpcclient, &xprt_args, - rpc_clnt_test_and_add_xprt, NULL); - } else - clp = get_v3_ds_connect(mds_srv, - (struct sockaddr *)&da->da_addr, - da->da_addrlen, IPPROTO_TCP, - timeo, retrans); + rpc_clnt_test_and_add_xprt, NULL); + continue; + } + + ds_proto = da->da_transport; + if (ds_proto == XPRT_TRANSPORT_TCP && + xprtsec_policy != RPC_XPRTSEC_NONE) + ds_proto = XPRT_TRANSPORT_TCP_TLS; + + clp = get_v3_ds_connect(mds_srv, &da->da_addr, da->da_addrlen, + ds_proto, timeo, retrans); + if (IS_ERR(clp)) + continue; + clp->cl_rpcclient->cl_softerr = 0; + clp->cl_rpcclient->cl_softrtry = 0; } if (IS_ERR(clp)) { @@ -665,7 +871,7 @@ static int _nfs4_pnfs_v3_ds_connect(struct nfs_server *mds_srv, } smp_wmb(); - ds->ds_clp = clp; + WRITE_ONCE(ds->ds_clp, clp); dprintk("%s [new] addr: %s\n", __func__, ds->ds_remotestr); out: return status; @@ -678,46 +884,94 @@ static int _nfs4_pnfs_v4_ds_connect(struct nfs_server *mds_srv, u32 minor_version) { struct nfs_client *clp = ERR_PTR(-EIO); + struct nfs_client *mds_clp = mds_srv->nfs_client; + enum xprtsec_policies xprtsec_policy = mds_clp->cl_xprtsec.policy; struct nfs4_pnfs_ds_addr *da; + int ds_proto; int status = 0; dprintk("--> %s DS %s\n", __func__, ds->ds_remotestr); list_for_each_entry(da, &ds->ds_addrs, da_node) { + char servername[48]; + dprintk("%s: DS %s: trying address %s\n", __func__, ds->ds_remotestr, da->da_remotestr); if (!IS_ERR(clp) && clp->cl_mvops->session_trunk) { struct xprt_create xprt_args = { - .ident = XPRT_TRANSPORT_TCP, + .ident = da->da_transport, .net = clp->cl_net, .dstaddr = (struct sockaddr *)&da->da_addr, .addrlen = da->da_addrlen, .servername = clp->cl_hostname, + .xprtsec = clp->cl_xprtsec, }; struct nfs4_add_xprt_data xprtdata = { .clp = clp, - .cred = nfs4_get_clid_cred(clp), }; struct rpc_add_xprt_test rpcdata = { .add_xprt_test = clp->cl_mvops->session_trunk, .data = &xprtdata, }; + if (xprt_args.ident == XPRT_TRANSPORT_TCP && + clp->cl_proto == XPRT_TRANSPORT_TCP_TLS) { + struct sockaddr *addr = + (struct sockaddr *)&da->da_addr; + struct sockaddr_in *sin = + (struct sockaddr_in *)&da->da_addr; + struct sockaddr_in6 *sin6 = + (struct sockaddr_in6 *)&da->da_addr; + + /* for NFS with TLS we need to supply a correct + * servername of the trunked transport, not the + * servername of the main transport stored in + * clp->cl_hostname. And set the protocol to + * indicate to use TLS + */ + servername[0] = '\0'; + switch(addr->sa_family) { + case AF_INET: + snprintf(servername, sizeof(servername), + "%pI4", &sin->sin_addr.s_addr); + break; + case AF_INET6: + snprintf(servername, sizeof(servername), + "%pI6", &sin6->sin6_addr); + break; + default: + /* do not consider this address */ + continue; + } + xprt_args.ident = XPRT_TRANSPORT_TCP_TLS; + xprt_args.servername = servername; + } + if (xprt_args.ident != clp->cl_proto) + continue; + if (xprt_args.dstaddr->sa_family != + clp->cl_addr.ss_family) + continue; + /** * Test this address for session trunking and * add as an alias */ + xprtdata.cred = nfs4_get_clid_cred(clp); rpc_clnt_add_xprt(clp->cl_rpcclient, &xprt_args, rpc_clnt_setup_test_and_add_xprt, &rpcdata); if (xprtdata.cred) - put_rpccred(xprtdata.cred); + put_cred(xprtdata.cred); } else { - clp = nfs4_set_ds_client(mds_srv, - (struct sockaddr *)&da->da_addr, - da->da_addrlen, IPPROTO_TCP, - timeo, retrans, minor_version); + ds_proto = da->da_transport; + if (ds_proto == XPRT_TRANSPORT_TCP && + xprtsec_policy != RPC_XPRTSEC_NONE) + ds_proto = XPRT_TRANSPORT_TCP_TLS; + + clp = nfs4_set_ds_client(mds_srv, &da->da_addr, + da->da_addrlen, ds_proto, + timeo, retrans, minor_version); if (IS_ERR(clp)) continue; @@ -728,7 +982,6 @@ static int _nfs4_pnfs_v4_ds_connect(struct nfs_server *mds_srv, clp = ERR_PTR(-EIO); continue; } - } } @@ -738,7 +991,7 @@ static int _nfs4_pnfs_v4_ds_connect(struct nfs_server *mds_srv, } smp_wmb(); - ds->ds_clp = clp; + WRITE_ONCE(ds->ds_clp, clp); dprintk("%s [new] addr: %s\n", __func__, ds->ds_remotestr); out: return status; @@ -755,30 +1008,35 @@ int nfs4_pnfs_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds, { int err; -again: - err = 0; - if (test_and_set_bit(NFS4DS_CONNECTING, &ds->ds_state) == 0) { - if (version == 3) { - err = _nfs4_pnfs_v3_ds_connect(mds_srv, ds, timeo, - retrans); - } else if (version == 4) { - err = _nfs4_pnfs_v4_ds_connect(mds_srv, ds, timeo, - retrans, minor_version); - } else { - dprintk("%s: unsupported DS version %d\n", __func__, - version); - err = -EPROTONOSUPPORT; + do { + err = nfs4_wait_ds_connect(ds); + if (err || ds->ds_clp) + goto out; + if (nfs4_test_deviceid_unavailable(devid)) { + err = -ENODEV; + goto out; } + } while (test_and_set_bit(NFS4DS_CONNECTING, &ds->ds_state) != 0); - nfs4_clear_ds_conn_bit(ds); - } else { - nfs4_wait_ds_connect(ds); + if (ds->ds_clp) + goto connect_done; - /* what was waited on didn't connect AND didn't mark unavail */ - if (!ds->ds_clp && !nfs4_test_deviceid_unavailable(devid)) - goto again; + switch (version) { + case 3: + err = _nfs4_pnfs_v3_ds_connect(mds_srv, ds, timeo, retrans); + break; + case 4: + err = _nfs4_pnfs_v4_ds_connect(mds_srv, ds, timeo, retrans, + minor_version); + break; + default: + dprintk("%s: unsupported DS version %d\n", __func__, version); + err = -EPROTONOSUPPORT; } +connect_done: + nfs4_clear_ds_conn_bit(ds); +out: /* * At this point the ds->ds_clp should be ready, but it might have * hit an error. @@ -787,11 +1045,12 @@ again: if (!ds->ds_clp || !nfs_client_init_is_complete(ds->ds_clp)) { WARN_ON_ONCE(ds->ds_clp || !nfs4_test_deviceid_unavailable(devid)); - return -EINVAL; - } - err = nfs_client_init_status(ds->ds_clp); + err = -EINVAL; + } else + err = nfs_client_init_status(ds->ds_clp); } + trace_pnfs_ds_connect(ds->ds_remotestr, err); return err; } EXPORT_SYMBOL_GPL(nfs4_pnfs_ds_connect); @@ -805,55 +1064,26 @@ nfs4_decode_mp_ds_addr(struct net *net, struct xdr_stream *xdr, gfp_t gfp_flags) struct nfs4_pnfs_ds_addr *da = NULL; char *buf, *portstr; __be16 port; - int nlen, rlen; + ssize_t nlen, rlen; int tmp[2]; - __be32 *p; - char *netid, *match_netid; - size_t len, match_netid_len; + char *netid; + size_t len; char *startsep = ""; char *endsep = ""; /* r_netid */ - p = xdr_inline_decode(xdr, 4); - if (unlikely(!p)) - goto out_err; - nlen = be32_to_cpup(p++); - - p = xdr_inline_decode(xdr, nlen); - if (unlikely(!p)) + nlen = xdr_stream_decode_string_dup(xdr, &netid, XDR_MAX_NETOBJ, + gfp_flags); + if (unlikely(nlen < 0)) goto out_err; - netid = kmalloc(nlen+1, gfp_flags); - if (unlikely(!netid)) - goto out_err; - - netid[nlen] = '\0'; - memcpy(netid, p, nlen); - /* r_addr: ip/ip6addr with port in dec octets - see RFC 5665 */ - p = xdr_inline_decode(xdr, 4); - if (unlikely(!p)) - goto out_free_netid; - rlen = be32_to_cpup(p); - - p = xdr_inline_decode(xdr, rlen); - if (unlikely(!p)) - goto out_free_netid; - /* port is ".ABC.DEF", 8 chars max */ - if (rlen > INET6_ADDRSTRLEN + IPV6_SCOPE_ID_LEN + 8) { - dprintk("%s: Invalid address, length %d\n", __func__, - rlen); - goto out_free_netid; - } - buf = kmalloc(rlen + 1, gfp_flags); - if (!buf) { - dprintk("%s: Not enough memory\n", __func__); + rlen = xdr_stream_decode_string_dup(xdr, &buf, INET6_ADDRSTRLEN + + IPV6_SCOPE_ID_LEN + 8, gfp_flags); + if (unlikely(rlen < 0)) goto out_free_netid; - } - buf[rlen] = '\0'; - memcpy(buf, p, rlen); /* replace port '.' with '-' */ portstr = strrchr(buf, '.'); @@ -873,12 +1103,10 @@ nfs4_decode_mp_ds_addr(struct net *net, struct xdr_stream *xdr, gfp_t gfp_flags) } *portstr = '\0'; - da = kzalloc(sizeof(*da), gfp_flags); + da = nfs4_pnfs_ds_addr_alloc(gfp_flags); if (unlikely(!da)) goto out_free_buf; - INIT_LIST_HEAD(&da->da_node); - if (!rpc_pton(net, buf, portstr-buf, (struct sockaddr *)&da->da_addr, sizeof(da->da_addr))) { dprintk("%s: error parsing address %s\n", __func__, buf); @@ -893,15 +1121,11 @@ nfs4_decode_mp_ds_addr(struct net *net, struct xdr_stream *xdr, gfp_t gfp_flags) case AF_INET: ((struct sockaddr_in *)&da->da_addr)->sin_port = port; da->da_addrlen = sizeof(struct sockaddr_in); - match_netid = "tcp"; - match_netid_len = 3; break; case AF_INET6: ((struct sockaddr_in6 *)&da->da_addr)->sin6_port = port; da->da_addrlen = sizeof(struct sockaddr_in6); - match_netid = "tcp6"; - match_netid_len = 4; startsep = "["; endsep = "]"; break; @@ -912,12 +1136,15 @@ nfs4_decode_mp_ds_addr(struct net *net, struct xdr_stream *xdr, gfp_t gfp_flags) goto out_free_da; } - if (nlen != match_netid_len || strncmp(netid, match_netid, nlen)) { - dprintk("%s: ERROR: r_netid \"%s\" != \"%s\"\n", - __func__, netid, match_netid); + da->da_transport = xprt_find_transport_ident(netid); + if (da->da_transport < 0) { + dprintk("%s: ERROR: unknown r_netid \"%s\"\n", + __func__, netid); goto out_free_da; } + da->da_netid = netid; + /* save human readable address */ len = strlen(startsep) + strlen(buf) + strlen(endsep) + 7; da->da_remotestr = kzalloc(len, gfp_flags); @@ -929,7 +1156,6 @@ nfs4_decode_mp_ds_addr(struct net *net, struct xdr_stream *xdr, gfp_t gfp_flags) dprintk("%s: Parsed DS addr %s\n", __func__, da->da_remotestr); kfree(buf); - kfree(netid); return da; out_free_da: @@ -951,32 +1177,33 @@ pnfs_layout_mark_request_commit(struct nfs_page *req, u32 ds_commit_idx) { struct list_head *list; - struct pnfs_commit_bucket *buckets; - - spin_lock(&cinfo->inode->i_lock); - buckets = cinfo->ds->buckets; - list = &buckets[ds_commit_idx].written; - if (list_empty(list)) { - if (!pnfs_is_valid_lseg(lseg)) { - spin_unlock(&cinfo->inode->i_lock); - cinfo->completion_ops->resched_write(cinfo, req); - return; - } - /* Non-empty buckets hold a reference on the lseg. That ref - * is normally transferred to the COMMIT call and released - * there. It could also be released if the last req is pulled - * off due to a rewrite, in which case it will be done in - * pnfs_common_clear_request_commit - */ - WARN_ON_ONCE(buckets[ds_commit_idx].wlseg != NULL); - buckets[ds_commit_idx].wlseg = pnfs_get_lseg(lseg); - } + struct pnfs_commit_array *array; + struct pnfs_commit_bucket *bucket; + + mutex_lock(&NFS_I(cinfo->inode)->commit_mutex); + array = pnfs_lookup_commit_array(cinfo->ds, lseg); + if (!array || !pnfs_is_valid_lseg(lseg)) + goto out_resched; + bucket = &array->buckets[ds_commit_idx]; + list = &bucket->written; + /* Non-empty buckets hold a reference on the lseg. That ref + * is normally transferred to the COMMIT call and released + * there. It could also be released if the last req is pulled + * off due to a rewrite, in which case it will be done in + * pnfs_common_clear_request_commit + */ + if (!bucket->lseg) + bucket->lseg = pnfs_get_lseg(lseg); set_bit(PG_COMMIT_TO_DS, &req->wb_flags); cinfo->ds->nwritten++; nfs_request_add_commit_list_locked(req, list, cinfo); - spin_unlock(&cinfo->inode->i_lock); - nfs_mark_page_unstable(req->wb_page, cinfo); + mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex); + nfs_folio_mark_unstable(nfs_page_to_folio(req), cinfo); + return; +out_resched: + mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex); + cinfo->completion_ops->resched_write(cinfo, req); } EXPORT_SYMBOL_GPL(pnfs_layout_mark_request_commit); diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c index 7962e49097c3..63e71310b9f6 100644 --- a/fs/nfs/proc.c +++ b/fs/nfs/proc.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/nfs/proc.c * @@ -90,6 +91,8 @@ nfs_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle, info->dtpref = fsinfo.tsize; info->maxfilesize = 0x7FFFFFFF; info->lease_time = 0; + info->change_attr_type = NFS4_CHANGE_TYPE_IS_UNDEFINED; + info->xattr_support = 0; return 0; } @@ -98,7 +101,7 @@ nfs_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle, */ static int nfs_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, - struct nfs_fattr *fattr, struct nfs4_label *label) + struct nfs_fattr *fattr, struct inode *inode) { struct rpc_message msg = { .rpc_proc = &nfs_procedures[NFSPROC_GETATTR], @@ -106,10 +109,15 @@ nfs_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, .rpc_resp = fattr, }; int status; + unsigned short task_flags = 0; + + /* Is this is an attribute revalidation, subject to softreval? */ + if (inode && (server->flags & NFS_MOUNT_SOFTREVAL)) + task_flags |= RPC_TASK_TIMEOUT; dprintk("NFS call getattr\n"); nfs_fattr_init(fattr); - status = rpc_call_sync(server->client, &msg, 0); + status = rpc_call_sync(server->client, &msg, task_flags); dprintk("NFS reply getattr: %d\n", status); return status; } @@ -145,9 +153,8 @@ nfs_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr, } static int -nfs_proc_lookup(struct inode *dir, const struct qstr *name, - struct nfs_fh *fhandle, struct nfs_fattr *fattr, - struct nfs4_label *label) +nfs_proc_lookup(struct inode *dir, struct dentry *dentry, const struct qstr *name, + struct nfs_fh *fhandle, struct nfs_fattr *fattr) { struct nfs_diropargs arg = { .fh = NFS_FH(dir), @@ -164,10 +171,15 @@ nfs_proc_lookup(struct inode *dir, const struct qstr *name, .rpc_resp = &res, }; int status; + unsigned short task_flags = 0; + + /* Is this is an attribute revalidation, subject to softreval? */ + if (nfs_lookup_is_soft_revalidate(dentry)) + task_flags |= RPC_TASK_TIMEOUT; - dprintk("NFS call lookup %s\n", name->name); + dprintk("NFS call lookup %pd2\n", dentry); nfs_fattr_init(fattr); - status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); + status = rpc_call_sync(NFS_CLIENT(dir), &msg, task_flags); dprintk("NFS reply lookup: %d\n", status); return status; } @@ -244,7 +256,7 @@ nfs_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); nfs_mark_for_revalidate(dir); if (status == 0) - status = nfs_instantiate(dentry, data->res.fh, data->res.fattr, NULL); + status = nfs_instantiate(dentry, data->res.fh, data->res.fattr); nfs_free_createdata(data); out: dprintk("NFS reply create: %d\n", status); @@ -291,7 +303,7 @@ nfs_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr, status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); } if (status == 0) - status = nfs_instantiate(dentry, data->res.fh, data->res.fattr, NULL); + status = nfs_instantiate(dentry, data->res.fh, data->res.fattr); nfs_free_createdata(data); out: dprintk("NFS reply mknod: %d\n", status); @@ -299,11 +311,11 @@ out: } static int -nfs_proc_remove(struct inode *dir, const struct qstr *name) +nfs_proc_remove(struct inode *dir, struct dentry *dentry) { struct nfs_removeargs arg = { .fh = NFS_FH(dir), - .name = *name, + .name = dentry->d_name, }; struct rpc_message msg = { .rpc_proc = &nfs_procedures[NFSPROC_REMOVE], @@ -311,7 +323,7 @@ nfs_proc_remove(struct inode *dir, const struct qstr *name) }; int status; - dprintk("NFS call remove %s\n", name->name); + dprintk("NFS call remove %pd2\n",dentry); status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); nfs_mark_for_revalidate(dir); @@ -320,7 +332,9 @@ nfs_proc_remove(struct inode *dir, const struct qstr *name) } static void -nfs_proc_unlink_setup(struct rpc_message *msg, struct inode *dir) +nfs_proc_unlink_setup(struct rpc_message *msg, + struct dentry *dentry, + struct inode *inode) { msg->rpc_proc = &nfs_procedures[NFSPROC_REMOVE]; } @@ -337,7 +351,9 @@ static int nfs_proc_unlink_done(struct rpc_task *task, struct inode *dir) } static void -nfs_proc_rename_setup(struct rpc_message *msg, struct inode *dir) +nfs_proc_rename_setup(struct rpc_message *msg, + struct dentry *old_dentry, + struct dentry *new_dentry) { msg->rpc_proc = &nfs_procedures[NFSPROC_RENAME]; } @@ -380,9 +396,10 @@ nfs_proc_link(struct inode *inode, struct inode *dir, const struct qstr *name) } static int -nfs_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page, +nfs_proc_symlink(struct inode *dir, struct dentry *dentry, struct folio *folio, unsigned int len, struct iattr *sattr) { + struct page *page = &folio->page; struct nfs_fh *fh; struct nfs_fattr *fattr; struct nfs_symlinkargs arg = { @@ -419,7 +436,7 @@ nfs_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page, * should fill in the data with a LOOKUP call on the wire. */ if (status == 0) - status = nfs_instantiate(dentry, fh, fattr, NULL); + status = nfs_instantiate(dentry, fh, fattr); out_free: nfs_free_fattr(fattr); @@ -429,13 +446,14 @@ out: return status; } -static int +static struct dentry * nfs_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr) { struct nfs_createdata *data; struct rpc_message msg = { .rpc_proc = &nfs_procedures[NFSPROC_MKDIR], }; + struct dentry *alias = NULL; int status = -ENOMEM; dprintk("NFS call mkdir %pd\n", dentry); @@ -447,12 +465,15 @@ nfs_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr) status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); nfs_mark_for_revalidate(dir); - if (status == 0) - status = nfs_instantiate(dentry, data->res.fh, data->res.fattr, NULL); + if (status == 0) { + alias = nfs_add_or_obtain(dentry, data->res.fh, data->res.fattr); + status = PTR_ERR_OR_ZERO(alias); + } else + alias = ERR_PTR(status); nfs_free_createdata(data); out: dprintk("NFS reply mkdir: %d\n", status); - return status; + return alias; } static int @@ -483,26 +504,26 @@ nfs_proc_rmdir(struct inode *dir, const struct qstr *name) * sure it is syntactically correct; the entries itself are decoded * from nfs_readdir by calling the decode_entry function directly. */ -static int -nfs_proc_readdir(struct dentry *dentry, struct rpc_cred *cred, - u64 cookie, struct page **pages, unsigned int count, bool plus) +static int nfs_proc_readdir(struct nfs_readdir_arg *nr_arg, + struct nfs_readdir_res *nr_res) { - struct inode *dir = d_inode(dentry); + struct inode *dir = d_inode(nr_arg->dentry); struct nfs_readdirargs arg = { .fh = NFS_FH(dir), - .cookie = cookie, - .count = count, - .pages = pages, + .cookie = nr_arg->cookie, + .count = nr_arg->page_len, + .pages = nr_arg->pages, }; struct rpc_message msg = { .rpc_proc = &nfs_procedures[NFSPROC_READDIR], .rpc_argp = &arg, - .rpc_cred = cred, + .rpc_cred = nr_arg->cred, }; int status; - dprintk("NFS call readdir %d\n", (unsigned int)cookie); + dprintk("NFS call readdir %llu\n", (unsigned long long)nr_arg->cookie); status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); + nr_res->verf[0] = nr_res->verf[1] = 0; nfs_invalidate_atime(dir); @@ -588,7 +609,8 @@ static int nfs_read_done(struct rpc_task *task, struct nfs_pgio_header *hdr) /* Emulate the eof flag, which isn't normally needed in NFSv2 * as it is guaranteed to always return the file attributes */ - if (hdr->args.offset + hdr->res.count >= hdr->res.fattr->size) + if ((hdr->res.count == 0 && hdr->args.count > 0) || + hdr->args.offset + hdr->res.count >= hdr->res.fattr->size) hdr->res.eof = 1; } return 0; @@ -609,13 +631,16 @@ static int nfs_proc_pgio_rpc_prepare(struct rpc_task *task, static int nfs_write_done(struct rpc_task *task, struct nfs_pgio_header *hdr) { - if (task->tk_status >= 0) + if (task->tk_status >= 0) { + hdr->res.count = hdr->args.count; nfs_writeback_update_inode(hdr); + } return 0; } static void nfs_proc_write_setup(struct nfs_pgio_header *hdr, - struct rpc_message *msg) + struct rpc_message *msg, + struct rpc_clnt **clnt) { /* Note: NFSv2 ignores @stable and always uses NFS_FILE_SYNC */ hdr->args.stable = NFS_FILE_SYNC; @@ -628,7 +653,8 @@ static void nfs_proc_commit_rpc_prepare(struct rpc_task *task, struct nfs_commit } static void -nfs_proc_commit_setup(struct nfs_commit_data *data, struct rpc_message *msg) +nfs_proc_commit_setup(struct nfs_commit_data *data, struct rpc_message *msg, + struct rpc_clnt **clnt) { BUG(); } @@ -665,20 +691,22 @@ out_einval: return -EINVAL; } -static int nfs_have_delegation(struct inode *inode, fmode_t flags) +static int nfs_have_delegation(struct inode *inode, fmode_t type, int flags) { return 0; } static int nfs_return_delegation(struct inode *inode) { - nfs_wb_all(inode); + if (S_ISREG(inode->i_mode)) + nfs_wb_all(inode); return 0; } static const struct inode_operations nfs_dir_inode_operations = { .create = nfs_create, .lookup = nfs_lookup, + .atomic_open = nfs_atomic_open_v23, .link = nfs_link, .unlink = nfs_unlink, .symlink = nfs_symlink, @@ -705,7 +733,7 @@ const struct nfs_rpc_ops nfs_v2_clientops = { .file_ops = &nfs_file_operations, .getroot = nfs_proc_get_root, .submount = nfs_submount, - .try_mount = nfs_try_mount, + .try_get_tree = nfs_try_get_tree, .getattr = nfs_proc_getattr, .setattr = nfs_proc_setattr, .lookup = nfs_proc_lookup, diff --git a/fs/nfs/read.c b/fs/nfs/read.c index a8421d9dab6a..3c1fa320b3f1 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * linux/fs/nfs/read.c * @@ -14,6 +15,7 @@ #include <linux/stat.h> #include <linux/mm.h> #include <linux/slab.h> +#include <linux/task_io_accounting_ops.h> #include <linux/pagemap.h> #include <linux/sunrpc/clnt.h> #include <linux/nfs_fs.h> @@ -25,10 +27,12 @@ #include "iostat.h" #include "fscache.h" #include "pnfs.h" +#include "nfstrace.h" +#include "delegation.h" #define NFSDBG_FACILITY NFSDBG_PAGECACHE -static const struct nfs_pgio_completion_ops nfs_async_read_completion_ops; +const struct nfs_pgio_completion_ops nfs_async_read_completion_ops; static const struct nfs_rw_ops nfs_rw_read_ops; static struct kmem_cache *nfs_rdata_cachep; @@ -44,15 +48,16 @@ static struct nfs_pgio_header *nfs_readhdr_alloc(void) static void nfs_readhdr_free(struct nfs_pgio_header *rhdr) { + kfree(rhdr->res.scratch); kmem_cache_free(nfs_rdata_cachep, rhdr); } -static -int nfs_return_empty_page(struct page *page) +static int nfs_return_empty_folio(struct folio *folio) { - zero_user(page, 0, PAGE_SIZE); - SetPageUptodate(page); - unlock_page(page); + folio_zero_segment(folio, 0, folio_size(folio)); + folio_mark_uptodate(folio); + if (nfs_netfs_folio_unlock(folio)) + folio_unlock(folio); return 0; } @@ -68,10 +73,27 @@ void nfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, pg_ops = server->pnfs_curr_ld->pg_read_ops; #endif nfs_pageio_init(pgio, inode, pg_ops, compl_ops, &nfs_rw_read_ops, - server->rsize, 0, GFP_KERNEL); + server->rsize, 0); } EXPORT_SYMBOL_GPL(nfs_pageio_init_read); +void nfs_pageio_complete_read(struct nfs_pageio_descriptor *pgio) +{ + struct nfs_pgio_mirror *pgm; + unsigned long npages; + + nfs_pageio_complete(pgio); + + /* It doesn't make sense to do mirrored reads! */ + WARN_ON_ONCE(pgio->pg_mirror_count != 1); + + pgm = &pgio->pg_mirrors[0]; + NFS_I(pgio->pg_inode)->read_io += pgm->pg_bytes_written; + npages = (pgm->pg_bytes_written + PAGE_SIZE - 1) >> PAGE_SHIFT; + nfs_add_stats(pgio->pg_inode, NFSIOS_READPAGES, npages); +} + + void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio) { struct nfs_pgio_mirror *mirror; @@ -89,103 +111,77 @@ void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio) } EXPORT_SYMBOL_GPL(nfs_pageio_reset_read_mds); -static void nfs_readpage_release(struct nfs_page *req) +bool nfs_read_alloc_scratch(struct nfs_pgio_header *hdr, size_t size) { - struct inode *inode = d_inode(req->wb_context->dentry); - - dprintk("NFS: read done (%s/%llu %d@%lld)\n", inode->i_sb->s_id, - (unsigned long long)NFS_FILEID(inode), req->wb_bytes, - (long long)req_offset(req)); - - if (nfs_page_group_sync_on_bit(req, PG_UNLOCKPAGE)) { - if (PageUptodate(req->wb_page)) - nfs_readpage_to_fscache(inode, req->wb_page, 0); - - unlock_page(req->wb_page); - } - nfs_release_request(req); + WARN_ON(hdr->res.scratch != NULL); + hdr->res.scratch = kmalloc(size, GFP_KERNEL); + return hdr->res.scratch != NULL; } +EXPORT_SYMBOL_GPL(nfs_read_alloc_scratch); -int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode, - struct page *page) +static void nfs_readpage_release(struct nfs_page *req, int error) { - struct nfs_page *new; - unsigned int len; - struct nfs_pageio_descriptor pgio; - struct nfs_pgio_mirror *pgm; + struct folio *folio = nfs_page_to_folio(req); - len = nfs_page_length(page); - if (len == 0) - return nfs_return_empty_page(page); - new = nfs_create_request(ctx, page, NULL, 0, len); - if (IS_ERR(new)) { - unlock_page(page); - return PTR_ERR(new); - } - if (len < PAGE_SIZE) - zero_user_segment(page, len, PAGE_SIZE); + if (nfs_page_group_sync_on_bit(req, PG_UNLOCKPAGE)) + if (nfs_netfs_folio_unlock(folio)) + folio_unlock(folio); - nfs_pageio_init_read(&pgio, inode, false, - &nfs_async_read_completion_ops); - if (!nfs_pageio_add_request(&pgio, new)) { - nfs_list_remove_request(new); - nfs_readpage_release(new); - } - nfs_pageio_complete(&pgio); - - /* It doesn't make sense to do mirrored reads! */ - WARN_ON_ONCE(pgio.pg_mirror_count != 1); - - pgm = &pgio.pg_mirrors[0]; - NFS_I(inode)->read_io += pgm->pg_bytes_written; - - return pgio.pg_error < 0 ? pgio.pg_error : 0; + nfs_release_request(req); } static void nfs_page_group_set_uptodate(struct nfs_page *req) { if (nfs_page_group_sync_on_bit(req, PG_UPTODATE)) - SetPageUptodate(req->wb_page); + folio_mark_uptodate(nfs_page_to_folio(req)); } static void nfs_read_completion(struct nfs_pgio_header *hdr) { unsigned long bytes = 0; + int error; if (test_bit(NFS_IOHDR_REDO, &hdr->flags)) goto out; while (!list_empty(&hdr->pages)) { struct nfs_page *req = nfs_list_entry(hdr->pages.next); - struct page *page = req->wb_page; + struct folio *folio = nfs_page_to_folio(req); unsigned long start = req->wb_pgbase; unsigned long end = req->wb_pgbase + req->wb_bytes; if (test_bit(NFS_IOHDR_EOF, &hdr->flags)) { /* note: regions of the page not covered by a - * request are zeroed in nfs_readpage_async / - * readpage_async_filler */ + * request are zeroed in nfs_read_add_folio + */ if (bytes > hdr->good_bytes) { /* nothing in this request was good, so zero * the full extent of the request */ - zero_user_segment(page, start, end); + folio_zero_segment(folio, start, end); } else if (hdr->good_bytes - bytes < req->wb_bytes) { /* part of this request has good bytes, but * not all. zero the bad bytes */ start += hdr->good_bytes - bytes; WARN_ON(start < req->wb_pgbase); - zero_user_segment(page, start, end); + folio_zero_segment(folio, start, end); } } + error = 0; bytes += req->wb_bytes; if (test_bit(NFS_IOHDR_ERROR, &hdr->flags)) { if (bytes <= hdr->good_bytes) nfs_page_group_set_uptodate(req); + else { + error = hdr->error; + xchg(&nfs_req_openctx(req)->error, error); + } } else nfs_page_group_set_uptodate(req); nfs_list_remove_request(req); - nfs_readpage_release(req); + nfs_readpage_release(req, error); } + nfs_netfs_read_completion(hdr); + out: hdr->release(hdr); } @@ -195,26 +191,24 @@ static void nfs_initiate_read(struct nfs_pgio_header *hdr, const struct nfs_rpc_ops *rpc_ops, struct rpc_task_setup *task_setup_data, int how) { - struct inode *inode = hdr->inode; - int swap_flags = IS_SWAPFILE(inode) ? NFS_RPC_SWAPFLAGS : 0; - - task_setup_data->flags |= swap_flags; rpc_ops->read_setup(hdr, msg); + nfs_netfs_initiate_read(hdr); + trace_nfs_initiate_read(hdr); } static void -nfs_async_read_error(struct list_head *head) +nfs_async_read_error(struct list_head *head, int error) { struct nfs_page *req; while (!list_empty(head)) { req = nfs_list_entry(head->next); nfs_list_remove_request(req); - nfs_readpage_release(req); + nfs_readpage_release(req, error); } } -static const struct nfs_pgio_completion_ops nfs_async_read_completion_ops = { +const struct nfs_pgio_completion_ops nfs_async_read_completion_ops = { .error_cleanup = nfs_async_read_error, .completion = nfs_read_completion, }; @@ -232,9 +226,10 @@ static int nfs_readpage_done(struct rpc_task *task, return status; nfs_add_stats(inode, NFSIOS_SERVERREADBYTES, hdr->res.count); + trace_nfs_readpage_done(task, hdr); if (task->tk_status == -ESTALE) { - set_bit(NFS_INO_STALE, &NFS_I(inode)->flags); + nfs_set_inode_stale(inode); nfs_mark_for_revalidate(inode); } return 0; @@ -248,6 +243,8 @@ static void nfs_readpage_retry(struct rpc_task *task, /* This is a short read! */ nfs_inc_stats(hdr->inode, NFSIOS_SHORTREAD); + trace_nfs_readpage_short(task, hdr); + /* Has the server at least made some progress? */ if (resp->count == 0) { nfs_set_pgio_error(hdr, -EIO, argp->offset); @@ -265,6 +262,8 @@ static void nfs_readpage_retry(struct rpc_task *task, argp->offset += resp->count; argp->pgbase += resp->count; argp->count -= resp->count; + resp->count = 0; + resp->eof = 0; rpc_restart_call_prepare(task); } @@ -272,169 +271,183 @@ static void nfs_readpage_result(struct rpc_task *task, struct nfs_pgio_header *hdr) { if (hdr->res.eof) { - loff_t bound; + loff_t pos = hdr->args.offset + hdr->res.count; + unsigned int new = pos - hdr->io_start; - bound = hdr->args.offset + hdr->res.count; - spin_lock(&hdr->lock); - if (bound < hdr->io_start + hdr->good_bytes) { + if (hdr->good_bytes > new) { + hdr->good_bytes = new; set_bit(NFS_IOHDR_EOF, &hdr->flags); clear_bit(NFS_IOHDR_ERROR, &hdr->flags); - hdr->good_bytes = bound - hdr->io_start; } - spin_unlock(&hdr->lock); } else if (hdr->res.count < hdr->args.count) nfs_readpage_retry(task, hdr); } +int nfs_read_add_folio(struct nfs_pageio_descriptor *pgio, + struct nfs_open_context *ctx, + struct folio *folio) +{ + struct inode *inode = folio->mapping->host; + struct nfs_server *server = NFS_SERVER(inode); + size_t fsize = folio_size(folio); + unsigned int rsize = server->rsize; + struct nfs_page *new; + unsigned int len, aligned_len; + int error; + + len = nfs_folio_length(folio); + if (len == 0) + return nfs_return_empty_folio(folio); + + aligned_len = min_t(unsigned int, ALIGN(len, rsize), fsize); + + new = nfs_page_create_from_folio(ctx, folio, 0, aligned_len); + if (IS_ERR(new)) { + error = PTR_ERR(new); + if (nfs_netfs_folio_unlock(folio)) + folio_unlock(folio); + goto out; + } + + if (len < fsize) + folio_zero_segment(folio, len, fsize); + if (!nfs_pageio_add_request(pgio, new)) { + nfs_list_remove_request(new); + error = pgio->pg_error; + nfs_readpage_release(new, error); + goto out; + } + return 0; +out: + return error; +} + /* - * Read a page over NFS. - * We read the page synchronously in the following case: - * - The error flag is set for this page. This happens only when a - * previous async read operation failed. + * Actually read a folio over the wire. */ -int nfs_readpage(struct file *file, struct page *page) +static int nfs_do_read_folio(struct file *file, struct folio *folio) { + struct inode *inode = file_inode(file); + struct nfs_pageio_descriptor pgio; struct nfs_open_context *ctx; - struct inode *inode = page_file_mapping(page)->host; - int error; + int ret; + + ctx = get_nfs_open_context(nfs_file_open_context(file)); + + xchg(&ctx->error, 0); + nfs_pageio_init_read(&pgio, inode, false, + &nfs_async_read_completion_ops); + + ret = nfs_read_add_folio(&pgio, ctx, folio); + if (ret) + goto out_put; - dprintk("NFS: nfs_readpage (%p %ld@%lu)\n", - page, PAGE_SIZE, page_index(page)); + nfs_pageio_complete_read(&pgio); + nfs_update_delegated_atime(inode); + if (pgio.pg_error < 0) { + ret = pgio.pg_error; + goto out_put; + } + + ret = folio_wait_locked_killable(folio); + if (!folio_test_uptodate(folio) && !ret) + ret = xchg(&ctx->error, 0); + +out_put: + put_nfs_open_context(ctx); + return ret; +} + +/* + * Synchronously read a folio. + * + * This is not heavily used as most users to try an asynchronous + * large read through ->readahead first. + */ +int nfs_read_folio(struct file *file, struct folio *folio) +{ + struct inode *inode = file_inode(file); + loff_t pos = folio_pos(folio); + size_t len = folio_size(folio); + int ret; + + trace_nfs_aop_readpage(inode, pos, len); nfs_inc_stats(inode, NFSIOS_VFSREADPAGE); - nfs_add_stats(inode, NFSIOS_READPAGES, 1); + task_io_account_read(len); /* * Try to flush any pending writes to the file.. * - * NOTE! Because we own the page lock, there cannot + * NOTE! Because we own the folio lock, there cannot * be any new pending writes generated at this point - * for this page (other pages can be written to). + * for this folio (other folios can be written to). */ - error = nfs_wb_page(inode, page); - if (error) + ret = nfs_wb_folio(inode, folio); + if (ret) goto out_unlock; - if (PageUptodate(page)) + if (folio_test_uptodate(folio)) goto out_unlock; - error = -ESTALE; + ret = -ESTALE; if (NFS_STALE(inode)) goto out_unlock; - if (file == NULL) { - error = -EBADF; - ctx = nfs_find_open_context(inode, NULL, FMODE_READ); - if (ctx == NULL) - goto out_unlock; - } else - ctx = get_nfs_open_context(nfs_file_open_context(file)); - - if (!IS_SYNC(inode)) { - error = nfs_readpage_from_fscache(ctx, inode, page); - if (error == 0) - goto out; - } - - error = nfs_readpage_async(ctx, inode, page); - + ret = nfs_netfs_read_folio(file, folio); + if (ret) + ret = nfs_do_read_folio(file, folio); out: - put_nfs_open_context(ctx); - return error; + trace_nfs_aop_readpage_done(inode, pos, len, ret); + return ret; out_unlock: - unlock_page(page); - return error; + folio_unlock(folio); + goto out; } -struct nfs_readdesc { - struct nfs_pageio_descriptor *pgio; - struct nfs_open_context *ctx; -}; - -static int -readpage_async_filler(void *data, struct page *page) -{ - struct nfs_readdesc *desc = (struct nfs_readdesc *)data; - struct nfs_page *new; - unsigned int len; - int error; - - len = nfs_page_length(page); - if (len == 0) - return nfs_return_empty_page(page); - - new = nfs_create_request(desc->ctx, page, NULL, 0, len); - if (IS_ERR(new)) - goto out_error; - - if (len < PAGE_SIZE) - zero_user_segment(page, len, PAGE_SIZE); - if (!nfs_pageio_add_request(desc->pgio, new)) { - nfs_list_remove_request(new); - nfs_readpage_release(new); - error = desc->pgio->pg_error; - goto out; - } - return 0; -out_error: - error = PTR_ERR(new); - unlock_page(page); -out: - return error; -} - -int nfs_readpages(struct file *filp, struct address_space *mapping, - struct list_head *pages, unsigned nr_pages) +void nfs_readahead(struct readahead_control *ractl) { struct nfs_pageio_descriptor pgio; - struct nfs_pgio_mirror *pgm; - struct nfs_readdesc desc = { - .pgio = &pgio, - }; - struct inode *inode = mapping->host; - unsigned long npages; - int ret = -ESTALE; + struct nfs_open_context *ctx; + unsigned int nr_pages = readahead_count(ractl); + struct file *file = ractl->file; + struct inode *inode = ractl->mapping->host; + struct folio *folio; + int ret; - dprintk("NFS: nfs_readpages (%s/%Lu %d)\n", - inode->i_sb->s_id, - (unsigned long long)NFS_FILEID(inode), - nr_pages); + trace_nfs_aop_readahead(inode, readahead_pos(ractl), nr_pages); nfs_inc_stats(inode, NFSIOS_VFSREADPAGES); + task_io_account_read(readahead_length(ractl)); + ret = -ESTALE; if (NFS_STALE(inode)) goto out; - if (filp == NULL) { - desc.ctx = nfs_find_open_context(inode, NULL, FMODE_READ); - if (desc.ctx == NULL) - return -EBADF; - } else - desc.ctx = get_nfs_open_context(nfs_file_open_context(filp)); + ret = nfs_netfs_readahead(ractl); + if (!ret) + goto out; - /* attempt to read as many of the pages as possible from the cache - * - this returns -ENOBUFS immediately if the cookie is negative - */ - ret = nfs_readpages_from_fscache(desc.ctx, inode, mapping, - pages, &nr_pages); - if (ret == 0) - goto read_complete; /* all pages were read */ + if (file == NULL) { + ret = -EBADF; + ctx = nfs_find_open_context(inode, NULL, FMODE_READ); + if (ctx == NULL) + goto out; + } else + ctx = get_nfs_open_context(nfs_file_open_context(file)); nfs_pageio_init_read(&pgio, inode, false, &nfs_async_read_completion_ops); - ret = read_cache_pages(mapping, pages, readpage_async_filler, &desc); - nfs_pageio_complete(&pgio); + while ((folio = readahead_folio(ractl)) != NULL) { + ret = nfs_read_add_folio(&pgio, ctx, folio); + if (ret) + break; + } - /* It doesn't make sense to do mirrored reads! */ - WARN_ON_ONCE(pgio.pg_mirror_count != 1); - - pgm = &pgio.pg_mirrors[0]; - NFS_I(inode)->read_io += pgm->pg_bytes_written; - npages = (pgm->pg_bytes_written + PAGE_SIZE - 1) >> - PAGE_SHIFT; - nfs_add_stats(inode, NFSIOS_READPAGES, npages); -read_complete: - put_nfs_open_context(desc.ctx); + nfs_pageio_complete_read(&pgio); + nfs_update_delegated_atime(inode); + + put_nfs_open_context(ctx); out: - return ret; + trace_nfs_aop_readahead_done(inode, nr_pages, ret); } int __init nfs_init_readpagecache(void) diff --git a/fs/nfs/super.c b/fs/nfs/super.c index d828ef88e7db..72dee6f3050e 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * linux/fs/nfs/super.c * @@ -46,6 +47,7 @@ #include <linux/vfs.h> #include <linux/inet.h> #include <linux/in6.h> +#include <linux/sched.h> #include <linux/slab.h> #include <net/ipv6.h> #include <linux/netdevice.h> @@ -56,6 +58,9 @@ #include <linux/rcupdate.h> #include <linux/uaccess.h> +#include <linux/nfs_ssc.h> + +#include <uapi/linux/tls.h> #include "nfs4_fs.h" #include "callback.h" @@ -66,250 +71,15 @@ #include "nfs4session.h" #include "pnfs.h" #include "nfs.h" +#include "netns.h" +#include "sysfs.h" +#include "nfs4idmap.h" #define NFSDBG_FACILITY NFSDBG_VFS -#define NFS_TEXT_DATA 1 - -#if IS_ENABLED(CONFIG_NFS_V3) -#define NFS_DEFAULT_VERSION 3 -#else -#define NFS_DEFAULT_VERSION 2 -#endif - -enum { - /* Mount options that take no arguments */ - Opt_soft, Opt_hard, - Opt_posix, Opt_noposix, - Opt_cto, Opt_nocto, - Opt_ac, Opt_noac, - Opt_lock, Opt_nolock, - Opt_udp, Opt_tcp, Opt_rdma, - Opt_acl, Opt_noacl, - Opt_rdirplus, Opt_nordirplus, - Opt_sharecache, Opt_nosharecache, - Opt_resvport, Opt_noresvport, - Opt_fscache, Opt_nofscache, - Opt_migration, Opt_nomigration, - - /* Mount options that take integer arguments */ - Opt_port, - Opt_rsize, Opt_wsize, Opt_bsize, - Opt_timeo, Opt_retrans, - Opt_acregmin, Opt_acregmax, - Opt_acdirmin, Opt_acdirmax, - Opt_actimeo, - Opt_namelen, - Opt_mountport, - Opt_mountvers, - Opt_minorversion, - - /* Mount options that take string arguments */ - Opt_nfsvers, - Opt_sec, Opt_proto, Opt_mountproto, Opt_mounthost, - Opt_addr, Opt_mountaddr, Opt_clientaddr, - Opt_lookupcache, - Opt_fscache_uniq, - Opt_local_lock, - - /* Special mount options */ - Opt_userspace, Opt_deprecated, Opt_sloppy, - - Opt_err -}; - -static const match_table_t nfs_mount_option_tokens = { - { Opt_userspace, "bg" }, - { Opt_userspace, "fg" }, - { Opt_userspace, "retry=%s" }, - - { Opt_sloppy, "sloppy" }, - - { Opt_soft, "soft" }, - { Opt_hard, "hard" }, - { Opt_deprecated, "intr" }, - { Opt_deprecated, "nointr" }, - { Opt_posix, "posix" }, - { Opt_noposix, "noposix" }, - { Opt_cto, "cto" }, - { Opt_nocto, "nocto" }, - { Opt_ac, "ac" }, - { Opt_noac, "noac" }, - { Opt_lock, "lock" }, - { Opt_nolock, "nolock" }, - { Opt_udp, "udp" }, - { Opt_tcp, "tcp" }, - { Opt_rdma, "rdma" }, - { Opt_acl, "acl" }, - { Opt_noacl, "noacl" }, - { Opt_rdirplus, "rdirplus" }, - { Opt_nordirplus, "nordirplus" }, - { Opt_sharecache, "sharecache" }, - { Opt_nosharecache, "nosharecache" }, - { Opt_resvport, "resvport" }, - { Opt_noresvport, "noresvport" }, - { Opt_fscache, "fsc" }, - { Opt_nofscache, "nofsc" }, - { Opt_migration, "migration" }, - { Opt_nomigration, "nomigration" }, - - { Opt_port, "port=%s" }, - { Opt_rsize, "rsize=%s" }, - { Opt_wsize, "wsize=%s" }, - { Opt_bsize, "bsize=%s" }, - { Opt_timeo, "timeo=%s" }, - { Opt_retrans, "retrans=%s" }, - { Opt_acregmin, "acregmin=%s" }, - { Opt_acregmax, "acregmax=%s" }, - { Opt_acdirmin, "acdirmin=%s" }, - { Opt_acdirmax, "acdirmax=%s" }, - { Opt_actimeo, "actimeo=%s" }, - { Opt_namelen, "namlen=%s" }, - { Opt_mountport, "mountport=%s" }, - { Opt_mountvers, "mountvers=%s" }, - { Opt_minorversion, "minorversion=%s" }, - - { Opt_nfsvers, "nfsvers=%s" }, - { Opt_nfsvers, "vers=%s" }, - - { Opt_sec, "sec=%s" }, - { Opt_proto, "proto=%s" }, - { Opt_mountproto, "mountproto=%s" }, - { Opt_addr, "addr=%s" }, - { Opt_clientaddr, "clientaddr=%s" }, - { Opt_mounthost, "mounthost=%s" }, - { Opt_mountaddr, "mountaddr=%s" }, - - { Opt_lookupcache, "lookupcache=%s" }, - { Opt_fscache_uniq, "fsc=%s" }, - { Opt_local_lock, "local_lock=%s" }, - - /* The following needs to be listed after all other options */ - { Opt_nfsvers, "v%s" }, - - { Opt_err, NULL } -}; - -enum { - Opt_xprt_udp, Opt_xprt_udp6, Opt_xprt_tcp, Opt_xprt_tcp6, Opt_xprt_rdma, - Opt_xprt_rdma6, - - Opt_xprt_err -}; - -static const match_table_t nfs_xprt_protocol_tokens = { - { Opt_xprt_udp, "udp" }, - { Opt_xprt_udp6, "udp6" }, - { Opt_xprt_tcp, "tcp" }, - { Opt_xprt_tcp6, "tcp6" }, - { Opt_xprt_rdma, "rdma" }, - { Opt_xprt_rdma6, "rdma6" }, - - { Opt_xprt_err, NULL } -}; - -enum { - Opt_sec_none, Opt_sec_sys, - Opt_sec_krb5, Opt_sec_krb5i, Opt_sec_krb5p, - Opt_sec_lkey, Opt_sec_lkeyi, Opt_sec_lkeyp, - Opt_sec_spkm, Opt_sec_spkmi, Opt_sec_spkmp, - - Opt_sec_err -}; - -static const match_table_t nfs_secflavor_tokens = { - { Opt_sec_none, "none" }, - { Opt_sec_none, "null" }, - { Opt_sec_sys, "sys" }, - - { Opt_sec_krb5, "krb5" }, - { Opt_sec_krb5i, "krb5i" }, - { Opt_sec_krb5p, "krb5p" }, - - { Opt_sec_lkey, "lkey" }, - { Opt_sec_lkeyi, "lkeyi" }, - { Opt_sec_lkeyp, "lkeyp" }, - - { Opt_sec_spkm, "spkm3" }, - { Opt_sec_spkmi, "spkm3i" }, - { Opt_sec_spkmp, "spkm3p" }, - - { Opt_sec_err, NULL } -}; - -enum { - Opt_lookupcache_all, Opt_lookupcache_positive, - Opt_lookupcache_none, - - Opt_lookupcache_err -}; - -static match_table_t nfs_lookupcache_tokens = { - { Opt_lookupcache_all, "all" }, - { Opt_lookupcache_positive, "pos" }, - { Opt_lookupcache_positive, "positive" }, - { Opt_lookupcache_none, "none" }, - - { Opt_lookupcache_err, NULL } -}; - -enum { - Opt_local_lock_all, Opt_local_lock_flock, Opt_local_lock_posix, - Opt_local_lock_none, - - Opt_local_lock_err -}; - -static match_table_t nfs_local_lock_tokens = { - { Opt_local_lock_all, "all" }, - { Opt_local_lock_flock, "flock" }, - { Opt_local_lock_posix, "posix" }, - { Opt_local_lock_none, "none" }, - - { Opt_local_lock_err, NULL } -}; - -enum { - Opt_vers_2, Opt_vers_3, Opt_vers_4, Opt_vers_4_0, - Opt_vers_4_1, Opt_vers_4_2, - - Opt_vers_err -}; - -static match_table_t nfs_vers_tokens = { - { Opt_vers_2, "2" }, - { Opt_vers_3, "3" }, - { Opt_vers_4, "4" }, - { Opt_vers_4_0, "4.0" }, - { Opt_vers_4_1, "4.1" }, - { Opt_vers_4_2, "4.2" }, - - { Opt_vers_err, NULL } -}; - -static struct dentry *nfs_xdev_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *raw_data); - -struct file_system_type nfs_fs_type = { - .owner = THIS_MODULE, - .name = "nfs", - .mount = nfs_fs_mount, - .kill_sb = nfs_kill_super, - .fs_flags = FS_RENAME_DOES_D_MOVE|FS_BINARY_MOUNTDATA, -}; -MODULE_ALIAS_FS("nfs"); -EXPORT_SYMBOL_GPL(nfs_fs_type); - -struct file_system_type nfs_xdev_fs_type = { - .owner = THIS_MODULE, - .name = "nfs", - .mount = nfs_xdev_mount, - .kill_sb = nfs_kill_super, - .fs_flags = FS_RENAME_DOES_D_MOVE|FS_BINARY_MOUNTDATA, -}; const struct super_operations nfs_sops = { .alloc_inode = nfs_alloc_inode, - .destroy_inode = nfs_destroy_inode, + .free_inode = nfs_free_inode, .write_inode = nfs_write_inode, .drop_inode = nfs_drop_inode, .statfs = nfs_statfs, @@ -319,26 +89,16 @@ const struct super_operations nfs_sops = { .show_devname = nfs_show_devname, .show_path = nfs_show_path, .show_stats = nfs_show_stats, - .remount_fs = nfs_remount, }; EXPORT_SYMBOL_GPL(nfs_sops); -#if IS_ENABLED(CONFIG_NFS_V4) -static void nfs4_validate_mount_flags(struct nfs_parsed_mount_data *); -static int nfs4_validate_mount_data(void *options, - struct nfs_parsed_mount_data *args, const char *dev_name); - -struct file_system_type nfs4_fs_type = { - .owner = THIS_MODULE, - .name = "nfs4", - .mount = nfs_fs_mount, - .kill_sb = nfs_kill_super, - .fs_flags = FS_RENAME_DOES_D_MOVE|FS_BINARY_MOUNTDATA, +#ifdef CONFIG_NFS_V4_2 +static const struct nfs_ssc_client_ops nfs_ssc_clnt_ops_tbl = { + .sco_sb_deactive = nfs_sb_deactive, }; -MODULE_ALIAS_FS("nfs4"); -MODULE_ALIAS("nfs4"); -EXPORT_SYMBOL_GPL(nfs4_fs_type); +#endif +#if IS_ENABLED(CONFIG_NFS_V4) static int __init register_nfs4_fs(void) { return register_filesystem(&nfs4_fs_type); @@ -359,11 +119,19 @@ static void unregister_nfs4_fs(void) } #endif -static struct shrinker acl_shrinker = { - .count_objects = nfs_access_cache_count, - .scan_objects = nfs_access_cache_scan, - .seeks = DEFAULT_SEEKS, -}; +#ifdef CONFIG_NFS_V4_2 +static void nfs_ssc_register_ops(void) +{ + nfs_ssc_register(&nfs_ssc_clnt_ops_tbl); +} + +static void nfs_ssc_unregister_ops(void) +{ + nfs_ssc_unregister(&nfs_ssc_clnt_ops_tbl); +} +#endif /* CONFIG_NFS_V4_2 */ + +static struct shrinker *acl_shrinker; /* * Register the NFS filesystems @@ -383,9 +151,21 @@ int __init register_nfs_fs(void) ret = nfs_register_sysctl(); if (ret < 0) goto error_2; - ret = register_shrinker(&acl_shrinker); - if (ret < 0) + + acl_shrinker = shrinker_alloc(0, "nfs-acl"); + if (!acl_shrinker) { + ret = -ENOMEM; goto error_3; + } + + acl_shrinker->count_objects = nfs_access_cache_count; + acl_shrinker->scan_objects = nfs_access_cache_scan; + + shrinker_register(acl_shrinker); + +#ifdef CONFIG_NFS_V4_2 + nfs_ssc_register_ops(); +#endif return 0; error_3: nfs_unregister_sysctl(); @@ -402,9 +182,12 @@ error_0: */ void __exit unregister_nfs_fs(void) { - unregister_shrinker(&acl_shrinker); + shrinker_free(acl_shrinker); nfs_unregister_sysctl(); unregister_nfs4_fs(); +#ifdef CONFIG_NFS_V4_2 + nfs_ssc_unregister_ops(); +#endif unregister_filesystem(&nfs_fs_type); } @@ -429,6 +212,42 @@ void nfs_sb_deactive(struct super_block *sb) } EXPORT_SYMBOL_GPL(nfs_sb_deactive); +static int __nfs_list_for_each_server(struct list_head *head, + int (*fn)(struct nfs_server *, void *), + void *data) +{ + struct nfs_server *server, *last = NULL; + int ret = 0; + + rcu_read_lock(); + list_for_each_entry_rcu(server, head, client_link) { + if (!(server->super && nfs_sb_active(server->super))) + continue; + rcu_read_unlock(); + if (last) + nfs_sb_deactive(last->super); + last = server; + ret = fn(server, data); + if (ret) + goto out; + cond_resched(); + rcu_read_lock(); + } + rcu_read_unlock(); +out: + if (last) + nfs_sb_deactive(last->super); + return ret; +} + +int nfs_client_for_each_server(struct nfs_client *clp, + int (*fn)(struct nfs_server *, void *), + void *data) +{ + return __nfs_list_for_each_server(&clp->cl_superblocks, fn, data); +} +EXPORT_SYMBOL_GPL(nfs_client_for_each_server); + /* * Deliver file system statistics to userspace */ @@ -450,10 +269,8 @@ int nfs_statfs(struct dentry *dentry, struct kstatfs *buf) struct dentry *pd_dentry; pd_dentry = dget_parent(dentry); - if (pd_dentry != NULL) { - nfs_zap_caches(d_inode(pd_dentry)); - dput(pd_dentry); - } + nfs_zap_caches(d_inode(pd_dentry)); + dput(pd_dentry); } nfs_free_fattr(res.fattr); if (error < 0) @@ -580,7 +397,7 @@ static void nfs_show_mountd_options(struct seq_file *m, struct nfs_server *nfss, } default: if (showdefaults) - seq_printf(m, ",mountaddr=unspecified"); + seq_puts(m, ",mountaddr=unspecified"); } if (nfss->mountd_version || showdefaults) @@ -628,15 +445,21 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss, const char *str; const char *nostr; } nfs_info[] = { - { NFS_MOUNT_SOFT, ",soft", ",hard" }, + { NFS_MOUNT_SOFT, ",soft", "" }, + { NFS_MOUNT_SOFTERR, ",softerr", "" }, + { NFS_MOUNT_SOFTREVAL, ",softreval", "" }, { NFS_MOUNT_POSIX, ",posix", "" }, { NFS_MOUNT_NOCTO, ",nocto", "" }, { NFS_MOUNT_NOAC, ",noac", "" }, { NFS_MOUNT_NONLM, ",nolock", "" }, { NFS_MOUNT_NOACL, ",noacl", "" }, { NFS_MOUNT_NORDIRPLUS, ",nordirplus", "" }, + { NFS_MOUNT_FORCE_RDIRPLUS, ",rdirplus=force", "" }, { NFS_MOUNT_UNSHARED, ",nosharecache", "" }, { NFS_MOUNT_NORESVPORT, ",noresvport", "" }, + { NFS_MOUNT_NETUNREACH_FATAL, + ",fatal_neterrors=ENETDOWN:ENETUNREACH", + ",fatal_neterrors=none" }, { 0, NULL, NULL } }; const struct proc_nfs_info *nfs_infop; @@ -658,6 +481,8 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss, seq_printf(m, ",acdirmin=%u", nfss->acdirmin/HZ); if (nfss->acdirmax != NFS_DEF_ACDIRMAX*HZ || showdefaults) seq_printf(m, ",acdirmax=%u", nfss->acdirmax/HZ); + if (!(nfss->flags & (NFS_MOUNT_SOFT|NFS_MOUNT_SOFTERR))) + seq_puts(m, ",hard"); for (nfs_infop = nfs_info; nfs_infop->flag; nfs_infop++) { if (nfss->flags & nfs_infop->flag) seq_puts(m, nfs_infop->str); @@ -668,7 +493,11 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss, seq_printf(m, ",proto=%s", rpc_peeraddr2str(nfss->client, RPC_DISPLAY_NETID)); rcu_read_unlock(); + if (clp->cl_nconnect > 0) + seq_printf(m, ",nconnect=%u", clp->cl_nconnect); if (version == 4) { + if (clp->cl_max_connect > 1) + seq_printf(m, ",max_connect=%u", clp->cl_max_connect); if (nfss->port != NFS_PORT) seq_printf(m, ",port=%u", nfss->port); } else @@ -678,36 +507,64 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss, seq_printf(m, ",timeo=%lu", 10U * nfss->client->cl_timeout->to_initval / HZ); seq_printf(m, ",retrans=%u", nfss->client->cl_timeout->to_retries); seq_printf(m, ",sec=%s", nfs_pseudoflavour_to_name(nfss->client->cl_auth->au_flavor)); + switch (clp->cl_xprtsec.policy) { + case RPC_XPRTSEC_TLS_ANON: + seq_puts(m, ",xprtsec=tls"); + break; + case RPC_XPRTSEC_TLS_X509: + seq_puts(m, ",xprtsec=mtls"); + break; + default: + break; + } if (version != 4) nfs_show_mountd_options(m, nfss, showdefaults); else nfs_show_nfsv4_options(m, nfss, showdefaults); - if (nfss->options & NFS_OPTION_FSCACHE) - seq_printf(m, ",fsc"); + if (nfss->options & NFS_OPTION_FSCACHE) { +#ifdef CONFIG_NFS_FSCACHE + if (nfss->fscache_uniq) + seq_printf(m, ",fsc=%s", nfss->fscache_uniq); + else + seq_puts(m, ",fsc"); +#else + seq_puts(m, ",fsc"); +#endif + } if (nfss->options & NFS_OPTION_MIGRATION) - seq_printf(m, ",migration"); + seq_puts(m, ",migration"); if (nfss->flags & NFS_MOUNT_LOOKUP_CACHE_NONEG) { if (nfss->flags & NFS_MOUNT_LOOKUP_CACHE_NONE) - seq_printf(m, ",lookupcache=none"); + seq_puts(m, ",lookupcache=none"); else - seq_printf(m, ",lookupcache=pos"); + seq_puts(m, ",lookupcache=pos"); } local_flock = nfss->flags & NFS_MOUNT_LOCAL_FLOCK; local_fcntl = nfss->flags & NFS_MOUNT_LOCAL_FCNTL; if (!local_flock && !local_fcntl) - seq_printf(m, ",local_lock=none"); + seq_puts(m, ",local_lock=none"); else if (local_flock && local_fcntl) - seq_printf(m, ",local_lock=all"); + seq_puts(m, ",local_lock=all"); else if (local_flock) - seq_printf(m, ",local_lock=flock"); + seq_puts(m, ",local_lock=flock"); else - seq_printf(m, ",local_lock=posix"); + seq_puts(m, ",local_lock=posix"); + + if (nfss->flags & NFS_MOUNT_NO_ALIGNWRITE) + seq_puts(m, ",noalignwrite"); + + if (nfss->flags & NFS_MOUNT_WRITE_EAGER) { + if (nfss->flags & NFS_MOUNT_WRITE_WAIT) + seq_puts(m, ",write=wait"); + else + seq_puts(m, ",write=eager"); + } } /* @@ -730,11 +587,21 @@ int nfs_show_options(struct seq_file *m, struct dentry *root) EXPORT_SYMBOL_GPL(nfs_show_options); #if IS_ENABLED(CONFIG_NFS_V4) +static void show_lease(struct seq_file *m, struct nfs_server *server) +{ + struct nfs_client *clp = server->nfs_client; + unsigned long expire; + + seq_printf(m, ",lease_time=%ld", clp->cl_lease_time / HZ); + expire = clp->cl_last_renewal + clp->cl_lease_time; + seq_printf(m, ",lease_expired=%ld", + time_after(expire, jiffies) ? 0 : (jiffies - expire) / HZ); +} #ifdef CONFIG_NFS_V4_1 static void show_sessions(struct seq_file *m, struct nfs_server *server) { if (nfs4_has_session(server->nfs_client)) - seq_printf(m, ",sessions"); + seq_puts(m, ",sessions"); } #else static void show_sessions(struct seq_file *m, struct nfs_server *server) {} @@ -811,18 +678,18 @@ int nfs_show_stats(struct seq_file *m, struct dentry *root) /* * Display all mount option settings */ - seq_printf(m, "\n\topts:\t"); - seq_puts(m, root->d_sb->s_flags & MS_RDONLY ? "ro" : "rw"); - seq_puts(m, root->d_sb->s_flags & MS_SYNCHRONOUS ? ",sync" : ""); - seq_puts(m, root->d_sb->s_flags & MS_NOATIME ? ",noatime" : ""); - seq_puts(m, root->d_sb->s_flags & MS_NODIRATIME ? ",nodiratime" : ""); + seq_puts(m, "\n\topts:\t"); + seq_puts(m, sb_rdonly(root->d_sb) ? "ro" : "rw"); + seq_puts(m, root->d_sb->s_flags & SB_SYNCHRONOUS ? ",sync" : ""); + seq_puts(m, root->d_sb->s_flags & SB_NOATIME ? ",noatime" : ""); + seq_puts(m, root->d_sb->s_flags & SB_NODIRATIME ? ",nodiratime" : ""); nfs_show_mount_options(m, nfss, 1); seq_printf(m, "\n\tage:\t%lu", (jiffies - nfss->mount_time) / HZ); show_implementation_id(m, nfss); - seq_printf(m, "\n\tcaps:\t"); + seq_puts(m, "\n\tcaps:\t"); seq_printf(m, "caps=0x%x", nfss->caps); seq_printf(m, ",wtmult=%u", nfss->wtmult); seq_printf(m, ",dtsize=%u", nfss->dtsize); @@ -831,13 +698,14 @@ int nfs_show_stats(struct seq_file *m, struct dentry *root) #if IS_ENABLED(CONFIG_NFS_V4) if (nfss->nfs_client->rpc_ops->version == 4) { - seq_printf(m, "\n\tnfsv4:\t"); + seq_puts(m, "\n\tnfsv4:\t"); seq_printf(m, "bm0=0x%x", nfss->attr_bitmask[0]); seq_printf(m, ",bm1=0x%x", nfss->attr_bitmask[1]); seq_printf(m, ",bm2=0x%x", nfss->attr_bitmask[2]); seq_printf(m, ",acl=0x%x", nfss->acl_bitmask); show_sessions(m, nfss); show_pnfs(m, nfss); + show_lease(m, nfss); } #endif @@ -861,30 +729,19 @@ int nfs_show_stats(struct seq_file *m, struct dentry *root) totals.events[i] += stats->events[i]; for (i = 0; i < __NFSIOS_BYTESMAX; i++) totals.bytes[i] += stats->bytes[i]; -#ifdef CONFIG_NFS_FSCACHE - for (i = 0; i < __NFSIOS_FSCACHEMAX; i++) - totals.fscache[i] += stats->fscache[i]; -#endif preempt_enable(); } - seq_printf(m, "\n\tevents:\t"); + seq_puts(m, "\n\tevents:\t"); for (i = 0; i < __NFSIOS_COUNTSMAX; i++) seq_printf(m, "%lu ", totals.events[i]); - seq_printf(m, "\n\tbytes:\t"); + seq_puts(m, "\n\tbytes:\t"); for (i = 0; i < __NFSIOS_BYTESMAX; i++) seq_printf(m, "%Lu ", totals.bytes[i]); -#ifdef CONFIG_NFS_FSCACHE - if (nfss->options & NFS_OPTION_FSCACHE) { - seq_printf(m, "\n\tfsc:\t"); - for (i = 0; i < __NFSIOS_FSCACHEMAX; i++) - seq_printf(m, "%Lu ", totals.fscache[i]); - } -#endif - seq_printf(m, "\n"); + seq_putc(m, '\n'); - rpc_print_iostats(m, nfss->client); + rpc_clnt_show_stats(m, nfss->client); return 0; } @@ -910,141 +767,6 @@ void nfs_umount_begin(struct super_block *sb) } EXPORT_SYMBOL_GPL(nfs_umount_begin); -static struct nfs_parsed_mount_data *nfs_alloc_parsed_mount_data(void) -{ - struct nfs_parsed_mount_data *data; - - data = kzalloc(sizeof(*data), GFP_KERNEL); - if (data) { - data->timeo = NFS_UNSPEC_TIMEO; - data->retrans = NFS_UNSPEC_RETRANS; - data->acregmin = NFS_DEF_ACREGMIN; - data->acregmax = NFS_DEF_ACREGMAX; - data->acdirmin = NFS_DEF_ACDIRMIN; - data->acdirmax = NFS_DEF_ACDIRMAX; - data->mount_server.port = NFS_UNSPEC_PORT; - data->nfs_server.port = NFS_UNSPEC_PORT; - data->nfs_server.protocol = XPRT_TRANSPORT_TCP; - data->selected_flavor = RPC_AUTH_MAXFLAVOR; - data->minorversion = 0; - data->need_mount = true; - data->net = current->nsproxy->net_ns; - security_init_mnt_opts(&data->lsm_opts); - } - return data; -} - -static void nfs_free_parsed_mount_data(struct nfs_parsed_mount_data *data) -{ - if (data) { - kfree(data->client_address); - kfree(data->mount_server.hostname); - kfree(data->nfs_server.export_path); - kfree(data->nfs_server.hostname); - kfree(data->fscache_uniq); - security_free_mnt_opts(&data->lsm_opts); - kfree(data); - } -} - -/* - * Sanity-check a server address provided by the mount command. - * - * Address family must be initialized, and address must not be - * the ANY address for that family. - */ -static int nfs_verify_server_address(struct sockaddr *addr) -{ - switch (addr->sa_family) { - case AF_INET: { - struct sockaddr_in *sa = (struct sockaddr_in *)addr; - return sa->sin_addr.s_addr != htonl(INADDR_ANY); - } - case AF_INET6: { - struct in6_addr *sa = &((struct sockaddr_in6 *)addr)->sin6_addr; - return !ipv6_addr_any(sa); - } - } - - dfprintk(MOUNT, "NFS: Invalid IP address specified\n"); - return 0; -} - -/* - * Select between a default port value and a user-specified port value. - * If a zero value is set, then autobind will be used. - */ -static void nfs_set_port(struct sockaddr *sap, int *port, - const unsigned short default_port) -{ - if (*port == NFS_UNSPEC_PORT) - *port = default_port; - - rpc_set_port(sap, *port); -} - -/* - * Sanity check the NFS transport protocol. - * - */ -static void nfs_validate_transport_protocol(struct nfs_parsed_mount_data *mnt) -{ - switch (mnt->nfs_server.protocol) { - case XPRT_TRANSPORT_UDP: - case XPRT_TRANSPORT_TCP: - case XPRT_TRANSPORT_RDMA: - break; - default: - mnt->nfs_server.protocol = XPRT_TRANSPORT_TCP; - } -} - -/* - * For text based NFSv2/v3 mounts, the mount protocol transport default - * settings should depend upon the specified NFS transport. - */ -static void nfs_set_mount_transport_protocol(struct nfs_parsed_mount_data *mnt) -{ - nfs_validate_transport_protocol(mnt); - - if (mnt->mount_server.protocol == XPRT_TRANSPORT_UDP || - mnt->mount_server.protocol == XPRT_TRANSPORT_TCP) - return; - switch (mnt->nfs_server.protocol) { - case XPRT_TRANSPORT_UDP: - mnt->mount_server.protocol = XPRT_TRANSPORT_UDP; - break; - case XPRT_TRANSPORT_TCP: - case XPRT_TRANSPORT_RDMA: - mnt->mount_server.protocol = XPRT_TRANSPORT_TCP; - } -} - -/* - * Add 'flavor' to 'auth_info' if not already present. - * Returns true if 'flavor' ends up in the list, false otherwise - */ -static bool nfs_auth_info_add(struct nfs_auth_info *auth_info, - rpc_authflavor_t flavor) -{ - unsigned int i; - unsigned int max_flavor_len = ARRAY_SIZE(auth_info->flavors); - - /* make sure this flavor isn't already in the list */ - for (i = 0; i < auth_info->flavor_len; i++) { - if (flavor == auth_info->flavors[i]) - return true; - } - - if (auth_info->flavor_len + 1 >= max_flavor_len) { - dfprintk(MOUNT, "NFS: too many sec= flavors\n"); - return false; - } - - auth_info->flavors[auth_info->flavor_len++] = flavor; - return true; -} - /* * Return true if 'match' is in auth_info or auth_info is empty. * Return false otherwise. @@ -1066,633 +788,17 @@ bool nfs_auth_info_match(const struct nfs_auth_info *auth_info, EXPORT_SYMBOL_GPL(nfs_auth_info_match); /* - * Parse the value of the 'sec=' option. - */ -static int nfs_parse_security_flavors(char *value, - struct nfs_parsed_mount_data *mnt) -{ - substring_t args[MAX_OPT_ARGS]; - rpc_authflavor_t pseudoflavor; - char *p; - - dfprintk(MOUNT, "NFS: parsing sec=%s option\n", value); - - while ((p = strsep(&value, ":")) != NULL) { - switch (match_token(p, nfs_secflavor_tokens, args)) { - case Opt_sec_none: - pseudoflavor = RPC_AUTH_NULL; - break; - case Opt_sec_sys: - pseudoflavor = RPC_AUTH_UNIX; - break; - case Opt_sec_krb5: - pseudoflavor = RPC_AUTH_GSS_KRB5; - break; - case Opt_sec_krb5i: - pseudoflavor = RPC_AUTH_GSS_KRB5I; - break; - case Opt_sec_krb5p: - pseudoflavor = RPC_AUTH_GSS_KRB5P; - break; - case Opt_sec_lkey: - pseudoflavor = RPC_AUTH_GSS_LKEY; - break; - case Opt_sec_lkeyi: - pseudoflavor = RPC_AUTH_GSS_LKEYI; - break; - case Opt_sec_lkeyp: - pseudoflavor = RPC_AUTH_GSS_LKEYP; - break; - case Opt_sec_spkm: - pseudoflavor = RPC_AUTH_GSS_SPKM; - break; - case Opt_sec_spkmi: - pseudoflavor = RPC_AUTH_GSS_SPKMI; - break; - case Opt_sec_spkmp: - pseudoflavor = RPC_AUTH_GSS_SPKMP; - break; - default: - dfprintk(MOUNT, - "NFS: sec= option '%s' not recognized\n", p); - return 0; - } - - if (!nfs_auth_info_add(&mnt->auth_info, pseudoflavor)) - return 0; - } - - return 1; -} - -static int nfs_parse_version_string(char *string, - struct nfs_parsed_mount_data *mnt, - substring_t *args) -{ - mnt->flags &= ~NFS_MOUNT_VER3; - switch (match_token(string, nfs_vers_tokens, args)) { - case Opt_vers_2: - mnt->version = 2; - break; - case Opt_vers_3: - mnt->flags |= NFS_MOUNT_VER3; - mnt->version = 3; - break; - case Opt_vers_4: - /* Backward compatibility option. In future, - * the mount program should always supply - * a NFSv4 minor version number. - */ - mnt->version = 4; - break; - case Opt_vers_4_0: - mnt->version = 4; - mnt->minorversion = 0; - break; - case Opt_vers_4_1: - mnt->version = 4; - mnt->minorversion = 1; - break; - case Opt_vers_4_2: - mnt->version = 4; - mnt->minorversion = 2; - break; - default: - return 0; - } - return 1; -} - -static int nfs_get_option_str(substring_t args[], char **option) -{ - kfree(*option); - *option = match_strdup(args); - return !*option; -} - -static int nfs_get_option_ul(substring_t args[], unsigned long *option) -{ - int rc; - char *string; - - string = match_strdup(args); - if (string == NULL) - return -ENOMEM; - rc = kstrtoul(string, 10, option); - kfree(string); - - return rc; -} - -static int nfs_get_option_ul_bound(substring_t args[], unsigned long *option, - unsigned long l_bound, unsigned long u_bound) -{ - int ret; - - ret = nfs_get_option_ul(args, option); - if (ret != 0) - return ret; - if (*option < l_bound || *option > u_bound) - return -ERANGE; - return 0; -} - -/* - * Error-check and convert a string of mount options from user space into - * a data structure. The whole mount string is processed; bad options are - * skipped as they are encountered. If there were no errors, return 1; - * otherwise return 0 (zero). - */ -static int nfs_parse_mount_options(char *raw, - struct nfs_parsed_mount_data *mnt) -{ - char *p, *string, *secdata; - int rc, sloppy = 0, invalid_option = 0; - unsigned short protofamily = AF_UNSPEC; - unsigned short mountfamily = AF_UNSPEC; - - if (!raw) { - dfprintk(MOUNT, "NFS: mount options string was NULL.\n"); - return 1; - } - dfprintk(MOUNT, "NFS: nfs mount opts='%s'\n", raw); - - secdata = alloc_secdata(); - if (!secdata) - goto out_nomem; - - rc = security_sb_copy_data(raw, secdata); - if (rc) - goto out_security_failure; - - rc = security_sb_parse_opts_str(secdata, &mnt->lsm_opts); - if (rc) - goto out_security_failure; - - free_secdata(secdata); - - while ((p = strsep(&raw, ",")) != NULL) { - substring_t args[MAX_OPT_ARGS]; - unsigned long option; - int token; - - if (!*p) - continue; - - dfprintk(MOUNT, "NFS: parsing nfs mount option '%s'\n", p); - - token = match_token(p, nfs_mount_option_tokens, args); - switch (token) { - - /* - * boolean options: foo/nofoo - */ - case Opt_soft: - mnt->flags |= NFS_MOUNT_SOFT; - break; - case Opt_hard: - mnt->flags &= ~NFS_MOUNT_SOFT; - break; - case Opt_posix: - mnt->flags |= NFS_MOUNT_POSIX; - break; - case Opt_noposix: - mnt->flags &= ~NFS_MOUNT_POSIX; - break; - case Opt_cto: - mnt->flags &= ~NFS_MOUNT_NOCTO; - break; - case Opt_nocto: - mnt->flags |= NFS_MOUNT_NOCTO; - break; - case Opt_ac: - mnt->flags &= ~NFS_MOUNT_NOAC; - break; - case Opt_noac: - mnt->flags |= NFS_MOUNT_NOAC; - break; - case Opt_lock: - mnt->flags &= ~NFS_MOUNT_NONLM; - mnt->flags &= ~(NFS_MOUNT_LOCAL_FLOCK | - NFS_MOUNT_LOCAL_FCNTL); - break; - case Opt_nolock: - mnt->flags |= NFS_MOUNT_NONLM; - mnt->flags |= (NFS_MOUNT_LOCAL_FLOCK | - NFS_MOUNT_LOCAL_FCNTL); - break; - case Opt_udp: - mnt->flags &= ~NFS_MOUNT_TCP; - mnt->nfs_server.protocol = XPRT_TRANSPORT_UDP; - break; - case Opt_tcp: - mnt->flags |= NFS_MOUNT_TCP; - mnt->nfs_server.protocol = XPRT_TRANSPORT_TCP; - break; - case Opt_rdma: - mnt->flags |= NFS_MOUNT_TCP; /* for side protocols */ - mnt->nfs_server.protocol = XPRT_TRANSPORT_RDMA; - xprt_load_transport(p); - break; - case Opt_acl: - mnt->flags &= ~NFS_MOUNT_NOACL; - break; - case Opt_noacl: - mnt->flags |= NFS_MOUNT_NOACL; - break; - case Opt_rdirplus: - mnt->flags &= ~NFS_MOUNT_NORDIRPLUS; - break; - case Opt_nordirplus: - mnt->flags |= NFS_MOUNT_NORDIRPLUS; - break; - case Opt_sharecache: - mnt->flags &= ~NFS_MOUNT_UNSHARED; - break; - case Opt_nosharecache: - mnt->flags |= NFS_MOUNT_UNSHARED; - break; - case Opt_resvport: - mnt->flags &= ~NFS_MOUNT_NORESVPORT; - break; - case Opt_noresvport: - mnt->flags |= NFS_MOUNT_NORESVPORT; - break; - case Opt_fscache: - mnt->options |= NFS_OPTION_FSCACHE; - kfree(mnt->fscache_uniq); - mnt->fscache_uniq = NULL; - break; - case Opt_nofscache: - mnt->options &= ~NFS_OPTION_FSCACHE; - kfree(mnt->fscache_uniq); - mnt->fscache_uniq = NULL; - break; - case Opt_migration: - mnt->options |= NFS_OPTION_MIGRATION; - break; - case Opt_nomigration: - mnt->options &= NFS_OPTION_MIGRATION; - break; - - /* - * options that take numeric values - */ - case Opt_port: - if (nfs_get_option_ul(args, &option) || - option > USHRT_MAX) - goto out_invalid_value; - mnt->nfs_server.port = option; - break; - case Opt_rsize: - if (nfs_get_option_ul(args, &option)) - goto out_invalid_value; - mnt->rsize = option; - break; - case Opt_wsize: - if (nfs_get_option_ul(args, &option)) - goto out_invalid_value; - mnt->wsize = option; - break; - case Opt_bsize: - if (nfs_get_option_ul(args, &option)) - goto out_invalid_value; - mnt->bsize = option; - break; - case Opt_timeo: - if (nfs_get_option_ul_bound(args, &option, 1, INT_MAX)) - goto out_invalid_value; - mnt->timeo = option; - break; - case Opt_retrans: - if (nfs_get_option_ul_bound(args, &option, 0, INT_MAX)) - goto out_invalid_value; - mnt->retrans = option; - break; - case Opt_acregmin: - if (nfs_get_option_ul(args, &option)) - goto out_invalid_value; - mnt->acregmin = option; - break; - case Opt_acregmax: - if (nfs_get_option_ul(args, &option)) - goto out_invalid_value; - mnt->acregmax = option; - break; - case Opt_acdirmin: - if (nfs_get_option_ul(args, &option)) - goto out_invalid_value; - mnt->acdirmin = option; - break; - case Opt_acdirmax: - if (nfs_get_option_ul(args, &option)) - goto out_invalid_value; - mnt->acdirmax = option; - break; - case Opt_actimeo: - if (nfs_get_option_ul(args, &option)) - goto out_invalid_value; - mnt->acregmin = mnt->acregmax = - mnt->acdirmin = mnt->acdirmax = option; - break; - case Opt_namelen: - if (nfs_get_option_ul(args, &option)) - goto out_invalid_value; - mnt->namlen = option; - break; - case Opt_mountport: - if (nfs_get_option_ul(args, &option) || - option > USHRT_MAX) - goto out_invalid_value; - mnt->mount_server.port = option; - break; - case Opt_mountvers: - if (nfs_get_option_ul(args, &option) || - option < NFS_MNT_VERSION || - option > NFS_MNT3_VERSION) - goto out_invalid_value; - mnt->mount_server.version = option; - break; - case Opt_minorversion: - if (nfs_get_option_ul(args, &option)) - goto out_invalid_value; - if (option > NFS4_MAX_MINOR_VERSION) - goto out_invalid_value; - mnt->minorversion = option; - break; - - /* - * options that take text values - */ - case Opt_nfsvers: - string = match_strdup(args); - if (string == NULL) - goto out_nomem; - rc = nfs_parse_version_string(string, mnt, args); - kfree(string); - if (!rc) - goto out_invalid_value; - break; - case Opt_sec: - string = match_strdup(args); - if (string == NULL) - goto out_nomem; - rc = nfs_parse_security_flavors(string, mnt); - kfree(string); - if (!rc) { - dfprintk(MOUNT, "NFS: unrecognized " - "security flavor\n"); - return 0; - } - break; - case Opt_proto: - string = match_strdup(args); - if (string == NULL) - goto out_nomem; - token = match_token(string, - nfs_xprt_protocol_tokens, args); - - protofamily = AF_INET; - switch (token) { - case Opt_xprt_udp6: - protofamily = AF_INET6; - case Opt_xprt_udp: - mnt->flags &= ~NFS_MOUNT_TCP; - mnt->nfs_server.protocol = XPRT_TRANSPORT_UDP; - break; - case Opt_xprt_tcp6: - protofamily = AF_INET6; - case Opt_xprt_tcp: - mnt->flags |= NFS_MOUNT_TCP; - mnt->nfs_server.protocol = XPRT_TRANSPORT_TCP; - break; - case Opt_xprt_rdma6: - protofamily = AF_INET6; - case Opt_xprt_rdma: - /* vector side protocols to TCP */ - mnt->flags |= NFS_MOUNT_TCP; - mnt->nfs_server.protocol = XPRT_TRANSPORT_RDMA; - xprt_load_transport(string); - break; - default: - dfprintk(MOUNT, "NFS: unrecognized " - "transport protocol\n"); - kfree(string); - return 0; - } - kfree(string); - break; - case Opt_mountproto: - string = match_strdup(args); - if (string == NULL) - goto out_nomem; - token = match_token(string, - nfs_xprt_protocol_tokens, args); - kfree(string); - - mountfamily = AF_INET; - switch (token) { - case Opt_xprt_udp6: - mountfamily = AF_INET6; - case Opt_xprt_udp: - mnt->mount_server.protocol = XPRT_TRANSPORT_UDP; - break; - case Opt_xprt_tcp6: - mountfamily = AF_INET6; - case Opt_xprt_tcp: - mnt->mount_server.protocol = XPRT_TRANSPORT_TCP; - break; - case Opt_xprt_rdma: /* not used for side protocols */ - default: - dfprintk(MOUNT, "NFS: unrecognized " - "transport protocol\n"); - return 0; - } - break; - case Opt_addr: - string = match_strdup(args); - if (string == NULL) - goto out_nomem; - mnt->nfs_server.addrlen = - rpc_pton(mnt->net, string, strlen(string), - (struct sockaddr *) - &mnt->nfs_server.address, - sizeof(mnt->nfs_server.address)); - kfree(string); - if (mnt->nfs_server.addrlen == 0) - goto out_invalid_address; - break; - case Opt_clientaddr: - if (nfs_get_option_str(args, &mnt->client_address)) - goto out_nomem; - break; - case Opt_mounthost: - if (nfs_get_option_str(args, - &mnt->mount_server.hostname)) - goto out_nomem; - break; - case Opt_mountaddr: - string = match_strdup(args); - if (string == NULL) - goto out_nomem; - mnt->mount_server.addrlen = - rpc_pton(mnt->net, string, strlen(string), - (struct sockaddr *) - &mnt->mount_server.address, - sizeof(mnt->mount_server.address)); - kfree(string); - if (mnt->mount_server.addrlen == 0) - goto out_invalid_address; - break; - case Opt_lookupcache: - string = match_strdup(args); - if (string == NULL) - goto out_nomem; - token = match_token(string, - nfs_lookupcache_tokens, args); - kfree(string); - switch (token) { - case Opt_lookupcache_all: - mnt->flags &= ~(NFS_MOUNT_LOOKUP_CACHE_NONEG|NFS_MOUNT_LOOKUP_CACHE_NONE); - break; - case Opt_lookupcache_positive: - mnt->flags &= ~NFS_MOUNT_LOOKUP_CACHE_NONE; - mnt->flags |= NFS_MOUNT_LOOKUP_CACHE_NONEG; - break; - case Opt_lookupcache_none: - mnt->flags |= NFS_MOUNT_LOOKUP_CACHE_NONEG|NFS_MOUNT_LOOKUP_CACHE_NONE; - break; - default: - dfprintk(MOUNT, "NFS: invalid " - "lookupcache argument\n"); - return 0; - }; - break; - case Opt_fscache_uniq: - if (nfs_get_option_str(args, &mnt->fscache_uniq)) - goto out_nomem; - mnt->options |= NFS_OPTION_FSCACHE; - break; - case Opt_local_lock: - string = match_strdup(args); - if (string == NULL) - goto out_nomem; - token = match_token(string, nfs_local_lock_tokens, - args); - kfree(string); - switch (token) { - case Opt_local_lock_all: - mnt->flags |= (NFS_MOUNT_LOCAL_FLOCK | - NFS_MOUNT_LOCAL_FCNTL); - break; - case Opt_local_lock_flock: - mnt->flags |= NFS_MOUNT_LOCAL_FLOCK; - break; - case Opt_local_lock_posix: - mnt->flags |= NFS_MOUNT_LOCAL_FCNTL; - break; - case Opt_local_lock_none: - mnt->flags &= ~(NFS_MOUNT_LOCAL_FLOCK | - NFS_MOUNT_LOCAL_FCNTL); - break; - default: - dfprintk(MOUNT, "NFS: invalid " - "local_lock argument\n"); - return 0; - }; - break; - - /* - * Special options - */ - case Opt_sloppy: - sloppy = 1; - dfprintk(MOUNT, "NFS: relaxing parsing rules\n"); - break; - case Opt_userspace: - case Opt_deprecated: - dfprintk(MOUNT, "NFS: ignoring mount option " - "'%s'\n", p); - break; - - default: - invalid_option = 1; - dfprintk(MOUNT, "NFS: unrecognized mount option " - "'%s'\n", p); - } - } - - if (!sloppy && invalid_option) - return 0; - - if (mnt->minorversion && mnt->version != 4) - goto out_minorversion_mismatch; - - if (mnt->options & NFS_OPTION_MIGRATION && - (mnt->version != 4 || mnt->minorversion != 0)) - goto out_migration_misuse; - - /* - * verify that any proto=/mountproto= options match the address - * families in the addr=/mountaddr= options. - */ - if (protofamily != AF_UNSPEC && - protofamily != mnt->nfs_server.address.ss_family) - goto out_proto_mismatch; - - if (mountfamily != AF_UNSPEC) { - if (mnt->mount_server.addrlen) { - if (mountfamily != mnt->mount_server.address.ss_family) - goto out_mountproto_mismatch; - } else { - if (mountfamily != mnt->nfs_server.address.ss_family) - goto out_mountproto_mismatch; - } - } - - return 1; - -out_mountproto_mismatch: - printk(KERN_INFO "NFS: mount server address does not match mountproto= " - "option\n"); - return 0; -out_proto_mismatch: - printk(KERN_INFO "NFS: server address does not match proto= option\n"); - return 0; -out_invalid_address: - printk(KERN_INFO "NFS: bad IP address specified: %s\n", p); - return 0; -out_invalid_value: - printk(KERN_INFO "NFS: bad mount option value specified: %s\n", p); - return 0; -out_minorversion_mismatch: - printk(KERN_INFO "NFS: mount option vers=%u does not support " - "minorversion=%u\n", mnt->version, mnt->minorversion); - return 0; -out_migration_misuse: - printk(KERN_INFO - "NFS: 'migration' not supported for this NFS version\n"); - return 0; -out_nomem: - printk(KERN_INFO "NFS: not enough memory to parse option\n"); - return 0; -out_security_failure: - free_secdata(secdata); - printk(KERN_INFO "NFS: security options invalid: %d\n", rc); - return 0; -} - -/* - * Ensure that a specified authtype in args->auth_info is supported by - * the server. Returns 0 and sets args->selected_flavor if it's ok, and + * Ensure that a specified authtype in ctx->auth_info is supported by + * the server. Returns 0 and sets ctx->selected_flavor if it's ok, and * -EACCES if not. */ -static int nfs_verify_authflavors(struct nfs_parsed_mount_data *args, - rpc_authflavor_t *server_authlist, unsigned int count) +static int nfs_verify_authflavors(struct nfs_fs_context *ctx, + rpc_authflavor_t *server_authlist, + unsigned int count) { rpc_authflavor_t flavor = RPC_AUTH_MAXFLAVOR; + bool found_auth_null = false; unsigned int i; - int use_auth_null = false; /* * If the sec= mount option is used, the specified flavor or AUTH_NULL @@ -1701,19 +807,23 @@ static int nfs_verify_authflavors(struct nfs_parsed_mount_data *args, * AUTH_NULL has a special meaning when it's in the server list - it * means that the server will ignore the rpc creds, so any flavor * can be used but still use the sec= that was specified. + * + * Note also that the MNT procedure in MNTv1 does not return a list + * of supported security flavors. In this case, nfs_mount() fabricates + * a security flavor list containing just AUTH_NULL. */ for (i = 0; i < count; i++) { flavor = server_authlist[i]; - if (nfs_auth_info_match(&args->auth_info, flavor)) + if (nfs_auth_info_match(&ctx->auth_info, flavor)) goto out; if (flavor == RPC_AUTH_NULL) - use_auth_null = true; + found_auth_null = true; } - if (use_auth_null) { - flavor = RPC_AUTH_NULL; + if (found_auth_null) { + flavor = ctx->auth_info.flavors[0]; goto out; } @@ -1722,8 +832,8 @@ static int nfs_verify_authflavors(struct nfs_parsed_mount_data *args, return -EACCES; out: - args->selected_flavor = flavor; - dfprintk(MOUNT, "NFS: using auth flavor %u\n", args->selected_flavor); + ctx->selected_flavor = flavor; + dfprintk(MOUNT, "NFS: using auth flavor %u\n", ctx->selected_flavor); return 0; } @@ -1731,56 +841,64 @@ out: * Use the remote server's MOUNT service to request the NFS file handle * corresponding to the provided path. */ -static int nfs_request_mount(struct nfs_parsed_mount_data *args, +static int nfs_request_mount(struct fs_context *fc, struct nfs_fh *root_fh, rpc_authflavor_t *server_authlist, unsigned int *server_authlist_len) { + struct nfs_fs_context *ctx = nfs_fc2context(fc); struct nfs_mount_request request = { - .sap = (struct sockaddr *) - &args->mount_server.address, - .dirpath = args->nfs_server.export_path, - .protocol = args->mount_server.protocol, + .sap = &ctx->mount_server._address, + .dirpath = ctx->nfs_server.export_path, + .protocol = ctx->mount_server.protocol, .fh = root_fh, - .noresvport = args->flags & NFS_MOUNT_NORESVPORT, + .noresvport = ctx->flags & NFS_MOUNT_NORESVPORT, .auth_flav_len = server_authlist_len, .auth_flavs = server_authlist, - .net = args->net, + .net = fc->net_ns, }; int status; - if (args->mount_server.version == 0) { - switch (args->version) { + if (ctx->mount_server.version == 0) { + switch (ctx->version) { default: - args->mount_server.version = NFS_MNT3_VERSION; + ctx->mount_server.version = NFS_MNT3_VERSION; break; case 2: - args->mount_server.version = NFS_MNT_VERSION; + ctx->mount_server.version = NFS_MNT_VERSION; } } - request.version = args->mount_server.version; + request.version = ctx->mount_server.version; - if (args->mount_server.hostname) - request.hostname = args->mount_server.hostname; + if (ctx->mount_server.hostname) + request.hostname = ctx->mount_server.hostname; else - request.hostname = args->nfs_server.hostname; + request.hostname = ctx->nfs_server.hostname; /* * Construct the mount server's address. */ - if (args->mount_server.address.ss_family == AF_UNSPEC) { - memcpy(request.sap, &args->nfs_server.address, - args->nfs_server.addrlen); - args->mount_server.addrlen = args->nfs_server.addrlen; + if (ctx->mount_server.address.sa_family == AF_UNSPEC) { + memcpy(request.sap, &ctx->nfs_server._address, + ctx->nfs_server.addrlen); + ctx->mount_server.addrlen = ctx->nfs_server.addrlen; } - request.salen = args->mount_server.addrlen; - nfs_set_port(request.sap, &args->mount_server.port, 0); + request.salen = ctx->mount_server.addrlen; + nfs_set_port(request.sap, &ctx->mount_server.port, 0); /* * Now ask the mount server to map our export path * to a file handle. */ - status = nfs_mount(&request); + if ((request.protocol == XPRT_TRANSPORT_UDP) == + !(ctx->flags & NFS_MOUNT_TCP)) + /* + * NFS protocol and mount protocol are both UDP or neither UDP + * so timeouts are compatible. Use NFS timeouts for MOUNT + */ + status = nfs_mount(&request, ctx->timeo, ctx->retrans); + else + status = nfs_mount(&request, NFS_UNSPEC_TIMEO, NFS_UNSPEC_RETRANS); if (status != 0) { dfprintk(MOUNT, "NFS: unable to mount server %s, error %d\n", request.hostname, status); @@ -1790,20 +908,28 @@ static int nfs_request_mount(struct nfs_parsed_mount_data *args, return 0; } -static struct nfs_server *nfs_try_mount_request(struct nfs_mount_info *mount_info, - struct nfs_subversion *nfs_mod) +static struct nfs_server *nfs_try_mount_request(struct fs_context *fc) { + struct nfs_fs_context *ctx = nfs_fc2context(fc); int status; unsigned int i; bool tried_auth_unix = false; bool auth_null_in_list = false; struct nfs_server *server = ERR_PTR(-EACCES); - struct nfs_parsed_mount_data *args = mount_info->parsed; rpc_authflavor_t authlist[NFS_MAX_SECFLAVORS]; unsigned int authlist_len = ARRAY_SIZE(authlist); - status = nfs_request_mount(args, mount_info->mntfh, authlist, - &authlist_len); + /* make sure 'nolock'/'lock' override the 'local_lock' mount option */ + if (ctx->lock_status) { + if (ctx->lock_status == NFS_LOCK_NOLOCK) { + ctx->flags |= NFS_MOUNT_NONLM; + ctx->flags |= (NFS_MOUNT_LOCAL_FLOCK | NFS_MOUNT_LOCAL_FCNTL); + } else { + ctx->flags &= ~NFS_MOUNT_NONLM; + ctx->flags &= ~(NFS_MOUNT_LOCAL_FLOCK | NFS_MOUNT_LOCAL_FCNTL); + } + } + status = nfs_request_mount(fc, ctx->mntfh, authlist, &authlist_len); if (status) return ERR_PTR(status); @@ -1811,13 +937,13 @@ static struct nfs_server *nfs_try_mount_request(struct nfs_mount_info *mount_inf * Was a sec= authflavor specified in the options? First, verify * whether the server supports it, and then just try to use it if so. */ - if (args->auth_info.flavor_len > 0) { - status = nfs_verify_authflavors(args, authlist, authlist_len); + if (ctx->auth_info.flavor_len > 0) { + status = nfs_verify_authflavors(ctx, authlist, authlist_len); dfprintk(MOUNT, "NFS: using auth flavor %u\n", - args->selected_flavor); + ctx->selected_flavor); if (status) return ERR_PTR(status); - return nfs_mod->rpc_ops->create_server(mount_info, nfs_mod); + return ctx->nfs_mod->rpc_ops->create_server(fc); } /* @@ -1840,11 +966,11 @@ static struct nfs_server *nfs_try_mount_request(struct nfs_mount_info *mount_inf default: if (rpcauth_get_gssinfo(flavor, &info) != 0) continue; - /* Fallthrough */ + break; } dfprintk(MOUNT, "NFS: attempting to use auth flavor %u\n", flavor); - args->selected_flavor = flavor; - server = nfs_mod->rpc_ops->create_server(mount_info, nfs_mod); + ctx->selected_flavor = flavor; + server = ctx->nfs_mod->rpc_ops->create_server(fc); if (!IS_ERR(server)) return server; } @@ -1859,338 +985,23 @@ static struct nfs_server *nfs_try_mount_request(struct nfs_mount_info *mount_inf /* Last chance! Try AUTH_UNIX */ dfprintk(MOUNT, "NFS: attempting to use auth flavor %u\n", RPC_AUTH_UNIX); - args->selected_flavor = RPC_AUTH_UNIX; - return nfs_mod->rpc_ops->create_server(mount_info, nfs_mod); + ctx->selected_flavor = RPC_AUTH_UNIX; + return ctx->nfs_mod->rpc_ops->create_server(fc); } -struct dentry *nfs_try_mount(int flags, const char *dev_name, - struct nfs_mount_info *mount_info, - struct nfs_subversion *nfs_mod) +int nfs_try_get_tree(struct fs_context *fc) { - struct nfs_server *server; + struct nfs_fs_context *ctx = nfs_fc2context(fc); - if (mount_info->parsed->need_mount) - server = nfs_try_mount_request(mount_info, nfs_mod); + if (ctx->need_mount) + ctx->server = nfs_try_mount_request(fc); else - server = nfs_mod->rpc_ops->create_server(mount_info, nfs_mod); + ctx->server = ctx->nfs_mod->rpc_ops->create_server(fc); - if (IS_ERR(server)) - return ERR_CAST(server); - - return nfs_fs_mount_common(server, flags, dev_name, mount_info, nfs_mod); + return nfs_get_tree_common(fc); } -EXPORT_SYMBOL_GPL(nfs_try_mount); +EXPORT_SYMBOL_GPL(nfs_try_get_tree); -/* - * Split "dev_name" into "hostname:export_path". - * - * The leftmost colon demarks the split between the server's hostname - * and the export path. If the hostname starts with a left square - * bracket, then it may contain colons. - * - * Note: caller frees hostname and export path, even on error. - */ -static int nfs_parse_devname(const char *dev_name, - char **hostname, size_t maxnamlen, - char **export_path, size_t maxpathlen) -{ - size_t len; - char *end; - - /* Is the host name protected with square brakcets? */ - if (*dev_name == '[') { - end = strchr(++dev_name, ']'); - if (end == NULL || end[1] != ':') - goto out_bad_devname; - - len = end - dev_name; - end++; - } else { - char *comma; - - end = strchr(dev_name, ':'); - if (end == NULL) - goto out_bad_devname; - len = end - dev_name; - - /* kill possible hostname list: not supported */ - comma = strchr(dev_name, ','); - if (comma != NULL && comma < end) - *comma = 0; - } - - if (len > maxnamlen) - goto out_hostname; - - /* N.B. caller will free nfs_server.hostname in all cases */ - *hostname = kstrndup(dev_name, len, GFP_KERNEL); - if (*hostname == NULL) - goto out_nomem; - len = strlen(++end); - if (len > maxpathlen) - goto out_path; - *export_path = kstrndup(end, len, GFP_KERNEL); - if (!*export_path) - goto out_nomem; - - dfprintk(MOUNT, "NFS: MNTPATH: '%s'\n", *export_path); - return 0; - -out_bad_devname: - dfprintk(MOUNT, "NFS: device name not in host:path format\n"); - return -EINVAL; - -out_nomem: - dfprintk(MOUNT, "NFS: not enough memory to parse device name\n"); - return -ENOMEM; - -out_hostname: - dfprintk(MOUNT, "NFS: server hostname too long\n"); - return -ENAMETOOLONG; - -out_path: - dfprintk(MOUNT, "NFS: export pathname too long\n"); - return -ENAMETOOLONG; -} - -/* - * Validate the NFS2/NFS3 mount data - * - fills in the mount root filehandle - * - * For option strings, user space handles the following behaviors: - * - * + DNS: mapping server host name to IP address ("addr=" option) - * - * + failure mode: how to behave if a mount request can't be handled - * immediately ("fg/bg" option) - * - * + retry: how often to retry a mount request ("retry=" option) - * - * + breaking back: trying proto=udp after proto=tcp, v2 after v3, - * mountproto=tcp after mountproto=udp, and so on - */ -static int nfs23_validate_mount_data(void *options, - struct nfs_parsed_mount_data *args, - struct nfs_fh *mntfh, - const char *dev_name) -{ - struct nfs_mount_data *data = (struct nfs_mount_data *)options; - struct sockaddr *sap = (struct sockaddr *)&args->nfs_server.address; - int extra_flags = NFS_MOUNT_LEGACY_INTERFACE; - - if (data == NULL) - goto out_no_data; - - args->version = NFS_DEFAULT_VERSION; - switch (data->version) { - case 1: - data->namlen = 0; - case 2: - data->bsize = 0; - case 3: - if (data->flags & NFS_MOUNT_VER3) - goto out_no_v3; - data->root.size = NFS2_FHSIZE; - memcpy(data->root.data, data->old_root.data, NFS2_FHSIZE); - /* Turn off security negotiation */ - extra_flags |= NFS_MOUNT_SECFLAVOUR; - case 4: - if (data->flags & NFS_MOUNT_SECFLAVOUR) - goto out_no_sec; - case 5: - memset(data->context, 0, sizeof(data->context)); - case 6: - if (data->flags & NFS_MOUNT_VER3) { - if (data->root.size > NFS3_FHSIZE || data->root.size == 0) - goto out_invalid_fh; - mntfh->size = data->root.size; - args->version = 3; - } else { - mntfh->size = NFS2_FHSIZE; - args->version = 2; - } - - - memcpy(mntfh->data, data->root.data, mntfh->size); - if (mntfh->size < sizeof(mntfh->data)) - memset(mntfh->data + mntfh->size, 0, - sizeof(mntfh->data) - mntfh->size); - - /* - * Translate to nfs_parsed_mount_data, which nfs_fill_super - * can deal with. - */ - args->flags = data->flags & NFS_MOUNT_FLAGMASK; - args->flags |= extra_flags; - args->rsize = data->rsize; - args->wsize = data->wsize; - args->timeo = data->timeo; - args->retrans = data->retrans; - args->acregmin = data->acregmin; - args->acregmax = data->acregmax; - args->acdirmin = data->acdirmin; - args->acdirmax = data->acdirmax; - args->need_mount = false; - - memcpy(sap, &data->addr, sizeof(data->addr)); - args->nfs_server.addrlen = sizeof(data->addr); - args->nfs_server.port = ntohs(data->addr.sin_port); - if (!nfs_verify_server_address(sap)) - goto out_no_address; - - if (!(data->flags & NFS_MOUNT_TCP)) - args->nfs_server.protocol = XPRT_TRANSPORT_UDP; - /* N.B. caller will free nfs_server.hostname in all cases */ - args->nfs_server.hostname = kstrdup(data->hostname, GFP_KERNEL); - args->namlen = data->namlen; - args->bsize = data->bsize; - - if (data->flags & NFS_MOUNT_SECFLAVOUR) - args->selected_flavor = data->pseudoflavor; - else - args->selected_flavor = RPC_AUTH_UNIX; - if (!args->nfs_server.hostname) - goto out_nomem; - - if (!(data->flags & NFS_MOUNT_NONLM)) - args->flags &= ~(NFS_MOUNT_LOCAL_FLOCK| - NFS_MOUNT_LOCAL_FCNTL); - else - args->flags |= (NFS_MOUNT_LOCAL_FLOCK| - NFS_MOUNT_LOCAL_FCNTL); - /* - * The legacy version 6 binary mount data from userspace has a - * field used only to transport selinux information into the - * the kernel. To continue to support that functionality we - * have a touch of selinux knowledge here in the NFS code. The - * userspace code converted context=blah to just blah so we are - * converting back to the full string selinux understands. - */ - if (data->context[0]){ -#ifdef CONFIG_SECURITY_SELINUX - int rc; - char *opts_str = kmalloc(sizeof(data->context) + 8, GFP_KERNEL); - if (!opts_str) - return -ENOMEM; - strcpy(opts_str, "context="); - data->context[NFS_MAX_CONTEXT_LEN] = '\0'; - strcat(opts_str, &data->context[0]); - rc = security_sb_parse_opts_str(opts_str, &args->lsm_opts); - kfree(opts_str); - if (rc) - return rc; -#else - return -EINVAL; -#endif - } - - break; - default: - return NFS_TEXT_DATA; - } - - return 0; - -out_no_data: - dfprintk(MOUNT, "NFS: mount program didn't pass any mount data\n"); - return -EINVAL; - -out_no_v3: - dfprintk(MOUNT, "NFS: nfs_mount_data version %d does not support v3\n", - data->version); - return -EINVAL; - -out_no_sec: - dfprintk(MOUNT, "NFS: nfs_mount_data version supports only AUTH_SYS\n"); - return -EINVAL; - -out_nomem: - dfprintk(MOUNT, "NFS: not enough memory to handle mount options\n"); - return -ENOMEM; - -out_no_address: - dfprintk(MOUNT, "NFS: mount program didn't pass remote address\n"); - return -EINVAL; - -out_invalid_fh: - dfprintk(MOUNT, "NFS: invalid root filehandle\n"); - return -EINVAL; -} - -#if IS_ENABLED(CONFIG_NFS_V4) -static int nfs_validate_mount_data(struct file_system_type *fs_type, - void *options, - struct nfs_parsed_mount_data *args, - struct nfs_fh *mntfh, - const char *dev_name) -{ - if (fs_type == &nfs_fs_type) - return nfs23_validate_mount_data(options, args, mntfh, dev_name); - return nfs4_validate_mount_data(options, args, dev_name); -} -#else -static int nfs_validate_mount_data(struct file_system_type *fs_type, - void *options, - struct nfs_parsed_mount_data *args, - struct nfs_fh *mntfh, - const char *dev_name) -{ - return nfs23_validate_mount_data(options, args, mntfh, dev_name); -} -#endif - -static int nfs_validate_text_mount_data(void *options, - struct nfs_parsed_mount_data *args, - const char *dev_name) -{ - int port = 0; - int max_namelen = PAGE_SIZE; - int max_pathlen = NFS_MAXPATHLEN; - struct sockaddr *sap = (struct sockaddr *)&args->nfs_server.address; - - if (nfs_parse_mount_options((char *)options, args) == 0) - return -EINVAL; - - if (!nfs_verify_server_address(sap)) - goto out_no_address; - - if (args->version == 4) { -#if IS_ENABLED(CONFIG_NFS_V4) - port = NFS_PORT; - max_namelen = NFS4_MAXNAMLEN; - max_pathlen = NFS4_MAXPATHLEN; - nfs_validate_transport_protocol(args); - if (args->nfs_server.protocol == XPRT_TRANSPORT_UDP) - goto out_invalid_transport_udp; - nfs4_validate_mount_flags(args); -#else - goto out_v4_not_compiled; -#endif /* CONFIG_NFS_V4 */ - } else - nfs_set_mount_transport_protocol(args); - - nfs_set_port(sap, &args->nfs_server.port, port); - - return nfs_parse_devname(dev_name, - &args->nfs_server.hostname, - max_namelen, - &args->nfs_server.export_path, - max_pathlen); - -#if !IS_ENABLED(CONFIG_NFS_V4) -out_v4_not_compiled: - dfprintk(MOUNT, "NFS: NFSv4 is not compiled into kernel\n"); - return -EPROTONOSUPPORT; -#else -out_invalid_transport_udp: - dfprintk(MOUNT, "NFSv4: Unsupported transport protocol udp\n"); - return -EINVAL; -#endif /* !CONFIG_NFS_V4 */ - -out_no_address: - dfprintk(MOUNT, "NFS: mount program didn't pass remote address\n"); - return -EINVAL; -} #define NFS_REMOUNT_CMP_FLAGMASK ~(NFS_MOUNT_INTR \ | NFS_MOUNT_SECURE \ @@ -2207,179 +1018,142 @@ out_no_address: static int nfs_compare_remount_data(struct nfs_server *nfss, - struct nfs_parsed_mount_data *data) -{ - if ((data->flags ^ nfss->flags) & NFS_REMOUNT_CMP_FLAGMASK || - data->rsize != nfss->rsize || - data->wsize != nfss->wsize || - data->version != nfss->nfs_client->rpc_ops->version || - data->minorversion != nfss->nfs_client->cl_minorversion || - data->retrans != nfss->client->cl_timeout->to_retries || - !nfs_auth_info_match(&data->auth_info, nfss->client->cl_auth->au_flavor) || - data->acregmin != nfss->acregmin / HZ || - data->acregmax != nfss->acregmax / HZ || - data->acdirmin != nfss->acdirmin / HZ || - data->acdirmax != nfss->acdirmax / HZ || - data->timeo != (10U * nfss->client->cl_timeout->to_initval / HZ) || - data->nfs_server.port != nfss->port || - data->nfs_server.addrlen != nfss->nfs_client->cl_addrlen || - !rpc_cmp_addr((struct sockaddr *)&data->nfs_server.address, + struct nfs_fs_context *ctx) +{ + if ((ctx->flags ^ nfss->flags) & NFS_REMOUNT_CMP_FLAGMASK || + ctx->rsize != nfss->rsize || + ctx->wsize != nfss->wsize || + ctx->version != nfss->nfs_client->rpc_ops->version || + ctx->minorversion != nfss->nfs_client->cl_minorversion || + ctx->retrans != nfss->client->cl_timeout->to_retries || + !nfs_auth_info_match(&ctx->auth_info, nfss->client->cl_auth->au_flavor) || + ctx->acregmin != nfss->acregmin / HZ || + ctx->acregmax != nfss->acregmax / HZ || + ctx->acdirmin != nfss->acdirmin / HZ || + ctx->acdirmax != nfss->acdirmax / HZ || + ctx->timeo != (10U * nfss->client->cl_timeout->to_initval / HZ) || + (ctx->options & NFS_OPTION_FSCACHE) != (nfss->options & NFS_OPTION_FSCACHE) || + ctx->nfs_server.port != nfss->port || + ctx->nfs_server.addrlen != nfss->nfs_client->cl_addrlen || + !rpc_cmp_addr((struct sockaddr *)&ctx->nfs_server.address, (struct sockaddr *)&nfss->nfs_client->cl_addr)) return -EINVAL; return 0; } -int -nfs_remount(struct super_block *sb, int *flags, char *raw_data) +int nfs_reconfigure(struct fs_context *fc) { - int error; + struct nfs_fs_context *ctx = nfs_fc2context(fc); + struct super_block *sb = fc->root->d_sb; struct nfs_server *nfss = sb->s_fs_info; - struct nfs_parsed_mount_data *data; - struct nfs_mount_data *options = (struct nfs_mount_data *)raw_data; - struct nfs4_mount_data *options4 = (struct nfs4_mount_data *)raw_data; - u32 nfsvers = nfss->nfs_client->rpc_ops->version; + int ret; sync_filesystem(sb); /* + * The SB_RDONLY flag has been removed from the superblock during + * mounts to prevent interference between different filesystems. + * Similarly, it is also necessary to ignore the SB_RDONLY flag + * during reconfiguration; otherwise, it may also result in the + * creation of redundant superblocks when mounting a directory with + * different rw and ro flags multiple times. + */ + fc->sb_flags_mask &= ~SB_RDONLY; + + /* * Userspace mount programs that send binary options generally send * them populated with default values. We have no way to know which * ones were explicitly specified. Fall back to legacy behavior and * just return success. */ - if ((nfsvers == 4 && (!options4 || options4->version == 1)) || - (nfsvers <= 3 && (!options || (options->version >= 1 && - options->version <= 6)))) + if (ctx->skip_reconfig_option_check) return 0; - data = kzalloc(sizeof(*data), GFP_KERNEL); - if (data == NULL) - return -ENOMEM; - - /* fill out struct with values from existing mount */ - data->flags = nfss->flags; - data->rsize = nfss->rsize; - data->wsize = nfss->wsize; - data->retrans = nfss->client->cl_timeout->to_retries; - data->selected_flavor = nfss->client->cl_auth->au_flavor; - data->acregmin = nfss->acregmin / HZ; - data->acregmax = nfss->acregmax / HZ; - data->acdirmin = nfss->acdirmin / HZ; - data->acdirmax = nfss->acdirmax / HZ; - data->timeo = 10U * nfss->client->cl_timeout->to_initval / HZ; - data->nfs_server.port = nfss->port; - data->nfs_server.addrlen = nfss->nfs_client->cl_addrlen; - data->version = nfsvers; - data->minorversion = nfss->nfs_client->cl_minorversion; - data->net = current->nsproxy->net_ns; - memcpy(&data->nfs_server.address, &nfss->nfs_client->cl_addr, - data->nfs_server.addrlen); - - /* overwrite those values with any that were specified */ - error = -EINVAL; - if (!nfs_parse_mount_options((char *)options, data)) - goto out; - /* * noac is a special case. It implies -o sync, but that's not - * necessarily reflected in the mtab options. do_remount_sb - * will clear MS_SYNCHRONOUS if -o sync wasn't specified in the + * necessarily reflected in the mtab options. reconfigure_super + * will clear SB_SYNCHRONOUS if -o sync wasn't specified in the * remount options, so we have to explicitly reset it. */ - if (data->flags & NFS_MOUNT_NOAC) - *flags |= MS_SYNCHRONOUS; + if (ctx->flags & NFS_MOUNT_NOAC) { + fc->sb_flags |= SB_SYNCHRONOUS; + fc->sb_flags_mask |= SB_SYNCHRONOUS; + } /* compare new mount options with old ones */ - error = nfs_compare_remount_data(nfss, data); -out: - kfree(data); - return error; -} -EXPORT_SYMBOL_GPL(nfs_remount); - -/* - * Initialise the common bits of the superblock - */ -static void nfs_initialise_sb(struct super_block *sb) -{ - struct nfs_server *server = NFS_SB(sb); - - sb->s_magic = NFS_SUPER_MAGIC; - - /* We probably want something more informative here */ - snprintf(sb->s_id, sizeof(sb->s_id), - "%u:%u", MAJOR(sb->s_dev), MINOR(sb->s_dev)); - - if (sb->s_blocksize == 0) - sb->s_blocksize = nfs_block_bits(server->wsize, - &sb->s_blocksize_bits); + ret = nfs_compare_remount_data(nfss, ctx); + if (ret) + return ret; - nfs_super_set_maxbytes(sb, server->maxfilesize); + return nfs_probe_server(nfss, NFS_FH(d_inode(fc->root))); } +EXPORT_SYMBOL_GPL(nfs_reconfigure); /* - * Finish setting up an NFS2/3 superblock + * Finish setting up an NFS superblock */ -void nfs_fill_super(struct super_block *sb, struct nfs_mount_info *mount_info) +static void nfs_fill_super(struct super_block *sb, struct nfs_fs_context *ctx) { - struct nfs_parsed_mount_data *data = mount_info->parsed; struct nfs_server *server = NFS_SB(sb); sb->s_blocksize_bits = 0; sb->s_blocksize = 0; sb->s_xattr = server->nfs_client->cl_nfs_mod->xattr; sb->s_op = server->nfs_client->cl_nfs_mod->sops; - if (data && data->bsize) - sb->s_blocksize = nfs_block_size(data->bsize, &sb->s_blocksize_bits); + if (ctx->bsize) + sb->s_blocksize = nfs_block_size(ctx->bsize, &sb->s_blocksize_bits); - if (server->nfs_client->rpc_ops->version != 2) { - /* The VFS shouldn't apply the umask to mode bits. We will do - * so ourselves when necessary. + switch (server->nfs_client->rpc_ops->version) { + case 2: + sb->s_time_gran = 1000; + sb->s_time_min = 0; + sb->s_time_max = U32_MAX; + break; + case 3: + /* + * The VFS shouldn't apply the umask to mode bits. + * We will do so ourselves when necessary. */ - sb->s_flags |= MS_POSIXACL; + sb->s_flags |= SB_POSIXACL; sb->s_time_gran = 1; + sb->s_time_min = 0; + sb->s_time_max = U32_MAX; sb->s_export_op = &nfs_export_ops; + break; + case 4: + sb->s_iflags |= SB_I_NOUMASK; + sb->s_time_gran = 1; + sb->s_time_min = S64_MIN; + sb->s_time_max = S64_MAX; + if (server->caps & NFS_CAP_ATOMIC_OPEN_V1) + sb->s_export_op = &nfs_export_ops; + break; } - nfs_initialise_sb(sb); -} -EXPORT_SYMBOL_GPL(nfs_fill_super); + sb->s_magic = NFS_SUPER_MAGIC; -/* - * Finish setting up a cloned NFS2/3/4 superblock - */ -static void nfs_clone_super(struct super_block *sb, - struct nfs_mount_info *mount_info) -{ - const struct super_block *old_sb = mount_info->cloned->sb; - struct nfs_server *server = NFS_SB(sb); + /* We probably want something more informative here */ + snprintf(sb->s_id, sizeof(sb->s_id), + "%u:%u", MAJOR(sb->s_dev), MINOR(sb->s_dev)); - sb->s_blocksize_bits = old_sb->s_blocksize_bits; - sb->s_blocksize = old_sb->s_blocksize; - sb->s_maxbytes = old_sb->s_maxbytes; - sb->s_xattr = old_sb->s_xattr; - sb->s_op = old_sb->s_op; - sb->s_time_gran = 1; - sb->s_export_op = old_sb->s_export_op; - - if (server->nfs_client->rpc_ops->version != 2) { - /* The VFS shouldn't apply the umask to mode bits. We will do - * so ourselves when necessary. - */ - sb->s_flags |= MS_POSIXACL; - } + if (sb->s_blocksize == 0) + sb->s_blocksize = nfs_block_bits(server->wsize, + &sb->s_blocksize_bits); - nfs_initialise_sb(sb); + nfs_super_set_maxbytes(sb, server->maxfilesize); + nfs_sysfs_move_server_to_sb(sb); + server->has_sec_mnt_opts = ctx->has_sec_mnt_opts; } -static int nfs_compare_mount_options(const struct super_block *s, const struct nfs_server *b, int flags) +static int nfs_compare_mount_options(const struct super_block *s, const struct nfs_server *b, + const struct fs_context *fc) { const struct nfs_server *a = s->s_fs_info; const struct rpc_clnt *clnt_a = a->client; const struct rpc_clnt *clnt_b = b->client; - if ((s->s_flags & NFS_MS_MASK) != (flags & NFS_MS_MASK)) + if ((s->s_flags & NFS_SB_MASK) != (fc->sb_flags & NFS_SB_MASK)) goto Ebusy; if (a->nfs_client != b->nfs_client) goto Ebusy; @@ -2397,28 +1171,19 @@ static int nfs_compare_mount_options(const struct super_block *s, const struct n goto Ebusy; if (a->acdirmax != b->acdirmax) goto Ebusy; - if (b->auth_info.flavor_len > 0 && - clnt_a->cl_auth->au_flavor != clnt_b->cl_auth->au_flavor) + if (clnt_a->cl_auth->au_flavor != clnt_b->cl_auth->au_flavor) goto Ebusy; return 1; Ebusy: return 0; } -struct nfs_sb_mountdata { - struct nfs_server *server; - int mntflags; -}; - -static int nfs_set_super(struct super_block *s, void *data) +static int nfs_set_super(struct super_block *s, struct fs_context *fc) { - struct nfs_sb_mountdata *sb_mntdata = data; - struct nfs_server *server = sb_mntdata->server; + struct nfs_server *server = fc->s_fs_info; int ret; - s->s_flags = sb_mntdata->mntflags; - s->s_fs_info = server; - s->s_d_op = server->nfs_client->rpc_ops->dentry_ops; + set_default_d_op(s, server->nfs_client->rpc_ops->dentry_ops); ret = set_anon_super(s, server); if (ret == 0) server->s_dev = s->s_dev; @@ -2467,11 +1232,24 @@ static int nfs_compare_super_address(struct nfs_server *server1, return 1; } -static int nfs_compare_super(struct super_block *sb, void *data) +static int nfs_compare_userns(const struct nfs_server *old, + const struct nfs_server *new) +{ + const struct user_namespace *oldns = &init_user_ns; + const struct user_namespace *newns = &init_user_ns; + + if (old->client && old->client->cl_cred) + oldns = old->client->cl_cred->user_ns; + if (new->client && new->client->cl_cred) + newns = new->client->cl_cred->user_ns; + if (oldns != newns) + return 0; + return 1; +} + +static int nfs_compare_super(struct super_block *sb, struct fs_context *fc) { - struct nfs_sb_mountdata *sb_mntdata = data; - struct nfs_server *server = sb_mntdata->server, *old = NFS_SB(sb); - int mntflags = sb_mntdata->mntflags; + struct nfs_server *server = fc->s_fs_info, *old = NFS_SB(sb); if (!nfs_compare_super_address(old, server)) return 0; @@ -2480,124 +1258,93 @@ static int nfs_compare_super(struct super_block *sb, void *data) return 0; if (memcmp(&old->fsid, &server->fsid, sizeof(old->fsid)) != 0) return 0; - return nfs_compare_mount_options(sb, server, mntflags); + if (!nfs_compare_userns(old, server)) + return 0; + if ((old->has_sec_mnt_opts || fc->security) && + security_sb_mnt_opts_compat(sb, fc->security)) + return 0; + return nfs_compare_mount_options(sb, server, fc); } #ifdef CONFIG_NFS_FSCACHE -static void nfs_get_cache_cookie(struct super_block *sb, - struct nfs_parsed_mount_data *parsed, - struct nfs_clone_mount *cloned) +static int nfs_get_cache_cookie(struct super_block *sb, + struct nfs_fs_context *ctx) { struct nfs_server *nfss = NFS_SB(sb); char *uniq = NULL; int ulen = 0; - nfss->fscache_key = NULL; nfss->fscache = NULL; - if (parsed) { - if (!(parsed->options & NFS_OPTION_FSCACHE)) - return; - if (parsed->fscache_uniq) { - uniq = parsed->fscache_uniq; - ulen = strlen(parsed->fscache_uniq); - } - } else if (cloned) { - struct nfs_server *mnt_s = NFS_SB(cloned->sb); + if (!ctx) + return 0; + + if (ctx->clone_data.sb) { + struct nfs_server *mnt_s = NFS_SB(ctx->clone_data.sb); if (!(mnt_s->options & NFS_OPTION_FSCACHE)) - return; - if (mnt_s->fscache_key) { - uniq = mnt_s->fscache_key->key.uniquifier; - ulen = mnt_s->fscache_key->key.uniq_len; - }; - } else - return; + return 0; + if (mnt_s->fscache_uniq) { + uniq = mnt_s->fscache_uniq; + ulen = strlen(uniq); + } + } else { + if (!(ctx->options & NFS_OPTION_FSCACHE)) + return 0; + if (ctx->fscache_uniq) { + uniq = ctx->fscache_uniq; + ulen = strlen(ctx->fscache_uniq); + } + } - nfs_fscache_get_super_cookie(sb, uniq, ulen); + return nfs_fscache_get_super_cookie(sb, uniq, ulen); } #else -static void nfs_get_cache_cookie(struct super_block *sb, - struct nfs_parsed_mount_data *parsed, - struct nfs_clone_mount *cloned) -{ -} -#endif - -int nfs_set_sb_security(struct super_block *s, struct dentry *mntroot, - struct nfs_mount_info *mount_info) +static int nfs_get_cache_cookie(struct super_block *sb, + struct nfs_fs_context *ctx) { - int error; - unsigned long kflags = 0, kflags_out = 0; - if (NFS_SB(s)->caps & NFS_CAP_SECURITY_LABEL) - kflags |= SECURITY_LSM_NATIVE_LABELS; - - error = security_sb_set_mnt_opts(s, &mount_info->parsed->lsm_opts, - kflags, &kflags_out); - if (error) - goto err; - - if (NFS_SB(s)->caps & NFS_CAP_SECURITY_LABEL && - !(kflags_out & SECURITY_LSM_NATIVE_LABELS)) - NFS_SB(s)->caps &= ~NFS_CAP_SECURITY_LABEL; -err: - return error; -} -EXPORT_SYMBOL_GPL(nfs_set_sb_security); - -int nfs_clone_sb_security(struct super_block *s, struct dentry *mntroot, - struct nfs_mount_info *mount_info) -{ - int error; - unsigned long kflags = 0, kflags_out = 0; - - /* clone any lsm security options from the parent to the new sb */ - if (d_inode(mntroot)->i_op != NFS_SB(s)->nfs_client->rpc_ops->dir_inode_ops) - return -ESTALE; - - if (NFS_SB(s)->caps & NFS_CAP_SECURITY_LABEL) - kflags |= SECURITY_LSM_NATIVE_LABELS; - - error = security_sb_clone_mnt_opts(mount_info->cloned->sb, s, kflags, - &kflags_out); - if (error) - return error; - - if (NFS_SB(s)->caps & NFS_CAP_SECURITY_LABEL && - !(kflags_out & SECURITY_LSM_NATIVE_LABELS)) - NFS_SB(s)->caps &= ~NFS_CAP_SECURITY_LABEL; return 0; } -EXPORT_SYMBOL_GPL(nfs_clone_sb_security); +#endif -struct dentry *nfs_fs_mount_common(struct nfs_server *server, - int flags, const char *dev_name, - struct nfs_mount_info *mount_info, - struct nfs_subversion *nfs_mod) +int nfs_get_tree_common(struct fs_context *fc) { + struct nfs_fs_context *ctx = nfs_fc2context(fc); struct super_block *s; - struct dentry *mntroot = ERR_PTR(-ENOMEM); - int (*compare_super)(struct super_block *, void *) = nfs_compare_super; - struct nfs_sb_mountdata sb_mntdata = { - .mntflags = flags, - .server = server, - }; + int (*compare_super)(struct super_block *, struct fs_context *) = nfs_compare_super; + struct nfs_server *server = ctx->server; int error; + ctx->server = NULL; + if (IS_ERR(server)) + return PTR_ERR(server); + + /* + * When NFS_MOUNT_UNSHARED is not set, NFS forces the sharing of a + * superblock among each filesystem that mounts sub-directories + * belonging to a single exported root path. + * To prevent interference between different filesystems, the + * SB_RDONLY flag should be removed from the superblock. + */ if (server->flags & NFS_MOUNT_UNSHARED) compare_super = NULL; + else + fc->sb_flags &= ~SB_RDONLY; /* -o noac implies -o sync */ if (server->flags & NFS_MOUNT_NOAC) - sb_mntdata.mntflags |= MS_SYNCHRONOUS; + fc->sb_flags |= SB_SYNCHRONOUS; - if (mount_info->cloned != NULL && mount_info->cloned->sb != NULL) - if (mount_info->cloned->sb->s_flags & MS_SYNCHRONOUS) - sb_mntdata.mntflags |= MS_SYNCHRONOUS; + if (ctx->clone_data.sb) + if (ctx->clone_data.sb->s_flags & SB_SYNCHRONOUS) + fc->sb_flags |= SB_SYNCHRONOUS; /* Get a superblock - note that we may end up sharing one that already exists */ - s = sget(nfs_mod->nfs_fs, compare_super, nfs_set_super, flags, &sb_mntdata); + fc->s_fs_info = server; + s = sget_fc(fc, compare_super, nfs_set_super); + fc->s_fs_info = NULL; if (IS_ERR(s)) { - mntroot = ERR_CAST(s); + error = PTR_ERR(s); + nfs_errorf(fc, "NFS: Couldn't get superblock"); goto out_err_nosb; } @@ -2607,248 +1354,63 @@ struct dentry *nfs_fs_mount_common(struct nfs_server *server, } else { error = super_setup_bdi_name(s, "%u:%u", MAJOR(server->s_dev), MINOR(server->s_dev)); - if (error) { - mntroot = ERR_PTR(error); + if (error) goto error_splat_super; - } - s->s_bdi->ra_pages = server->rpages * NFS_MAX_READAHEAD; + s->s_bdi->io_pages = server->rpages; server->super = s; } if (!s->s_root) { + unsigned bsize = ctx->clone_data.inherited_bsize; /* initial superblock/root creation */ - mount_info->fill_super(s, mount_info); - nfs_get_cache_cookie(s, mount_info->parsed, mount_info->cloned); + nfs_fill_super(s, ctx); + if (bsize) { + s->s_blocksize_bits = bsize; + s->s_blocksize = 1U << bsize; + } + error = nfs_get_cache_cookie(s, ctx); + if (error < 0) + goto error_splat_super; } - mntroot = nfs_get_root(s, mount_info->mntfh, dev_name); - if (IS_ERR(mntroot)) + error = nfs_get_root(s, fc); + if (error < 0) { + nfs_errorf(fc, "NFS: Couldn't get root dentry"); goto error_splat_super; + } - error = mount_info->set_security(s, mntroot, mount_info); - if (error) - goto error_splat_root; - - s->s_flags |= MS_ACTIVE; + s->s_flags |= SB_ACTIVE; + error = 0; out: - return mntroot; + return error; out_err_nosb: nfs_free_server(server); goto out; - -error_splat_root: - dput(mntroot); - mntroot = ERR_PTR(error); error_splat_super: deactivate_locked_super(s); goto out; } -EXPORT_SYMBOL_GPL(nfs_fs_mount_common); - -struct dentry *nfs_fs_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *raw_data) -{ - struct nfs_mount_info mount_info = { - .fill_super = nfs_fill_super, - .set_security = nfs_set_sb_security, - }; - struct dentry *mntroot = ERR_PTR(-ENOMEM); - struct nfs_subversion *nfs_mod; - int error; - - mount_info.parsed = nfs_alloc_parsed_mount_data(); - mount_info.mntfh = nfs_alloc_fhandle(); - if (mount_info.parsed == NULL || mount_info.mntfh == NULL) - goto out; - - /* Validate the mount data */ - error = nfs_validate_mount_data(fs_type, raw_data, mount_info.parsed, mount_info.mntfh, dev_name); - if (error == NFS_TEXT_DATA) - error = nfs_validate_text_mount_data(raw_data, mount_info.parsed, dev_name); - if (error < 0) { - mntroot = ERR_PTR(error); - goto out; - } - - nfs_mod = get_nfs_version(mount_info.parsed->version); - if (IS_ERR(nfs_mod)) { - mntroot = ERR_CAST(nfs_mod); - goto out; - } - - mntroot = nfs_mod->rpc_ops->try_mount(flags, dev_name, &mount_info, nfs_mod); - - put_nfs_version(nfs_mod); -out: - nfs_free_parsed_mount_data(mount_info.parsed); - nfs_free_fhandle(mount_info.mntfh); - return mntroot; -} -EXPORT_SYMBOL_GPL(nfs_fs_mount); /* - * Destroy an NFS2/3 superblock + * Destroy an NFS superblock */ void nfs_kill_super(struct super_block *s) { struct nfs_server *server = NFS_SB(s); - dev_t dev = s->s_dev; - generic_shutdown_super(s); + nfs_sysfs_move_sb_to_server(server); + kill_anon_super(s); nfs_fscache_release_super_cookie(s); nfs_free_server(server); - free_anon_bdev(dev); } EXPORT_SYMBOL_GPL(nfs_kill_super); -/* - * Clone an NFS2/3/4 server record on xdev traversal (FSID-change) - */ -static struct dentry * -nfs_xdev_mount(struct file_system_type *fs_type, int flags, - const char *dev_name, void *raw_data) -{ - struct nfs_clone_mount *data = raw_data; - struct nfs_mount_info mount_info = { - .fill_super = nfs_clone_super, - .set_security = nfs_clone_sb_security, - .cloned = data, - }; - struct nfs_server *server; - struct dentry *mntroot = ERR_PTR(-ENOMEM); - struct nfs_subversion *nfs_mod = NFS_SB(data->sb)->nfs_client->cl_nfs_mod; - - dprintk("--> nfs_xdev_mount()\n"); - - mount_info.mntfh = mount_info.cloned->fh; - - /* create a new volume representation */ - server = nfs_mod->rpc_ops->clone_server(NFS_SB(data->sb), data->fh, data->fattr, data->authflavor); - - if (IS_ERR(server)) - mntroot = ERR_CAST(server); - else - mntroot = nfs_fs_mount_common(server, flags, - dev_name, &mount_info, nfs_mod); - - dprintk("<-- nfs_xdev_mount() = %ld\n", - IS_ERR(mntroot) ? PTR_ERR(mntroot) : 0L); - return mntroot; -} - #if IS_ENABLED(CONFIG_NFS_V4) -static void nfs4_validate_mount_flags(struct nfs_parsed_mount_data *args) -{ - args->flags &= ~(NFS_MOUNT_NONLM|NFS_MOUNT_NOACL|NFS_MOUNT_VER3| - NFS_MOUNT_LOCAL_FLOCK|NFS_MOUNT_LOCAL_FCNTL); -} - -/* - * Validate NFSv4 mount options - */ -static int nfs4_validate_mount_data(void *options, - struct nfs_parsed_mount_data *args, - const char *dev_name) -{ - struct sockaddr *sap = (struct sockaddr *)&args->nfs_server.address; - struct nfs4_mount_data *data = (struct nfs4_mount_data *)options; - char *c; - - if (data == NULL) - goto out_no_data; - - args->version = 4; - - switch (data->version) { - case 1: - if (data->host_addrlen > sizeof(args->nfs_server.address)) - goto out_no_address; - if (data->host_addrlen == 0) - goto out_no_address; - args->nfs_server.addrlen = data->host_addrlen; - if (copy_from_user(sap, data->host_addr, data->host_addrlen)) - return -EFAULT; - if (!nfs_verify_server_address(sap)) - goto out_no_address; - args->nfs_server.port = ntohs(((struct sockaddr_in *)sap)->sin_port); - - if (data->auth_flavourlen) { - rpc_authflavor_t pseudoflavor; - if (data->auth_flavourlen > 1) - goto out_inval_auth; - if (copy_from_user(&pseudoflavor, - data->auth_flavours, - sizeof(pseudoflavor))) - return -EFAULT; - args->selected_flavor = pseudoflavor; - } else - args->selected_flavor = RPC_AUTH_UNIX; - - c = strndup_user(data->hostname.data, NFS4_MAXNAMLEN); - if (IS_ERR(c)) - return PTR_ERR(c); - args->nfs_server.hostname = c; - - c = strndup_user(data->mnt_path.data, NFS4_MAXPATHLEN); - if (IS_ERR(c)) - return PTR_ERR(c); - args->nfs_server.export_path = c; - dfprintk(MOUNT, "NFS: MNTPATH: '%s'\n", c); - - c = strndup_user(data->client_addr.data, 16); - if (IS_ERR(c)) - return PTR_ERR(c); - args->client_address = c; - - /* - * Translate to nfs_parsed_mount_data, which nfs4_fill_super - * can deal with. - */ - - args->flags = data->flags & NFS4_MOUNT_FLAGMASK; - args->rsize = data->rsize; - args->wsize = data->wsize; - args->timeo = data->timeo; - args->retrans = data->retrans; - args->acregmin = data->acregmin; - args->acregmax = data->acregmax; - args->acdirmin = data->acdirmin; - args->acdirmax = data->acdirmax; - args->nfs_server.protocol = data->proto; - nfs_validate_transport_protocol(args); - if (args->nfs_server.protocol == XPRT_TRANSPORT_UDP) - goto out_invalid_transport_udp; - - break; - default: - return NFS_TEXT_DATA; - } - - return 0; - -out_no_data: - dfprintk(MOUNT, "NFS4: mount program didn't pass any mount data\n"); - return -EINVAL; - -out_inval_auth: - dfprintk(MOUNT, "NFS4: Invalid number of RPC auth flavours %d\n", - data->auth_flavourlen); - return -EINVAL; - -out_no_address: - dfprintk(MOUNT, "NFS4: mount program didn't pass remote address\n"); - return -EINVAL; - -out_invalid_transport_udp: - dfprintk(MOUNT, "NFSv4: Unsupported transport protocol udp\n"); - return -EINVAL; -} - /* * NFS v4 module parameters need to stay in the * NFS client for backwards compatibility @@ -2864,6 +1426,7 @@ unsigned short max_session_cb_slots = NFS4_DEF_CB_SLOT_TABLE_SIZE; unsigned short send_implementation_id = 1; char nfs4_client_id_uniquifier[NFS4_CLIENT_ID_UNIQ_LEN] = ""; bool recover_lost_locks = false; +short nfs_delay_retrans = -1; EXPORT_SYMBOL_GPL(nfs_callback_nr_threads); EXPORT_SYMBOL_GPL(nfs_callback_set_tcpport); @@ -2874,6 +1437,7 @@ EXPORT_SYMBOL_GPL(max_session_cb_slots); EXPORT_SYMBOL_GPL(send_implementation_id); EXPORT_SYMBOL_GPL(nfs4_client_id_uniquifier); EXPORT_SYMBOL_GPL(recover_lost_locks); +EXPORT_SYMBOL_GPL(nfs_delay_retrans); #define NFS_CALLBACK_MAXPORTNR (65535U) @@ -2885,7 +1449,7 @@ static int param_set_portnr(const char *val, const struct kernel_param *kp) if (!val) return -EINVAL; ret = kstrtoul(val, 0, &num); - if (ret == -EINVAL || num > NFS_CALLBACK_MAXPORTNR) + if (ret || num > NFS_CALLBACK_MAXPORTNR) return -EINVAL; *((unsigned int *)kp->arg) = num; return 0; @@ -2894,7 +1458,7 @@ static const struct kernel_param_ops param_ops_portnr = { .set = param_set_portnr, .get = param_get_uint, }; -#define param_check_portnr(name, p) __param_check(name, p, unsigned int); +#define param_check_portnr(name, p) __param_check(name, p, unsigned int) module_param_named(callback_tcpport, nfs_callback_set_tcpport, portnr, 0644); module_param_named(callback_nr_threads, nfs_callback_nr_threads, ushort, 0644); @@ -2922,5 +1486,9 @@ MODULE_PARM_DESC(recover_lost_locks, "If the server reports that a lock might be lost, " "try to recover it risking data corruption."); - +module_param_named(delay_retrans, nfs_delay_retrans, short, 0644); +MODULE_PARM_DESC(delay_retrans, + "Unless negative, specifies the number of times the NFSv4 " + "client retries a request before returning an EAGAIN error, " + "after a reply of NFS4ERR_DELAY from the server."); #endif /* CONFIG_NFS_V4 */ diff --git a/fs/nfs/symlink.c b/fs/nfs/symlink.c index 5a1d0ded8979..58146e935402 100644 --- a/fs/nfs/symlink.c +++ b/fs/nfs/symlink.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/nfs/symlink.c * @@ -25,52 +26,45 @@ * and straight-forward than readdir caching. */ -static int nfs_symlink_filler(struct inode *inode, struct page *page) +static int nfs_symlink_filler(struct file *file, struct folio *folio) { + struct inode *inode = folio->mapping->host; int error; - error = NFS_PROTO(inode)->readlink(inode, page, 0, PAGE_SIZE); - if (error < 0) - goto error; - SetPageUptodate(page); - unlock_page(page); - return 0; - -error: - SetPageError(page); - unlock_page(page); - return -EIO; + error = NFS_PROTO(inode)->readlink(inode, &folio->page, 0, PAGE_SIZE); + folio_end_read(folio, error == 0); + return error; } static const char *nfs_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *done) { - struct page *page; + struct folio *folio; void *err; if (!dentry) { err = ERR_PTR(nfs_revalidate_mapping_rcu(inode)); if (err) return err; - page = find_get_page(inode->i_mapping, 0); - if (!page) + folio = filemap_get_folio(inode->i_mapping, 0); + if (IS_ERR(folio)) return ERR_PTR(-ECHILD); - if (!PageUptodate(page)) { - put_page(page); + if (!folio_test_uptodate(folio)) { + folio_put(folio); return ERR_PTR(-ECHILD); } } else { err = ERR_PTR(nfs_revalidate_mapping(inode, inode->i_mapping)); if (err) return err; - page = read_cache_page(&inode->i_data, 0, - (filler_t *)nfs_symlink_filler, inode); - if (IS_ERR(page)) - return ERR_CAST(page); + folio = read_cache_folio(&inode->i_data, 0, nfs_symlink_filler, + NULL); + if (IS_ERR(folio)) + return ERR_CAST(folio); } - set_delayed_call(done, page_put_link, page); - return page_address(page); + set_delayed_call(done, page_put_link, folio); + return folio_address(folio); } /* diff --git a/fs/nfs/sysctl.c b/fs/nfs/sysctl.c index bb6ed810fa6f..f579df0e8d67 100644 --- a/fs/nfs/sysctl.c +++ b/fs/nfs/sysctl.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/nfs/sysctl.c * @@ -13,7 +14,7 @@ static struct ctl_table_header *nfs_callback_sysctl_table; -static struct ctl_table nfs_cb_sysctls[] = { +static const struct ctl_table nfs_cb_sysctls[] = { { .procname = "nfs_mountpoint_timeout", .data = &nfs_mountpoint_expiry_timeout, @@ -28,30 +29,11 @@ static struct ctl_table nfs_cb_sysctls[] = { .mode = 0644, .proc_handler = proc_dointvec, }, - { } -}; - -static struct ctl_table nfs_cb_sysctl_dir[] = { - { - .procname = "nfs", - .mode = 0555, - .child = nfs_cb_sysctls, - }, - { } -}; - -static struct ctl_table nfs_cb_sysctl_root[] = { - { - .procname = "fs", - .mode = 0555, - .child = nfs_cb_sysctl_dir, - }, - { } }; int nfs_register_sysctl(void) { - nfs_callback_sysctl_table = register_sysctl_table(nfs_cb_sysctl_root); + nfs_callback_sysctl_table = register_sysctl("fs/nfs", nfs_cb_sysctls); if (nfs_callback_sysctl_table == NULL) return -ENOMEM; return 0; diff --git a/fs/nfs/sysfs.c b/fs/nfs/sysfs.c new file mode 100644 index 000000000000..ea6e6168092b --- /dev/null +++ b/fs/nfs/sysfs.c @@ -0,0 +1,470 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2019 Hammerspace Inc + */ + +#include <linux/module.h> +#include <linux/kobject.h> +#include <linux/sysfs.h> +#include <linux/fs.h> +#include <linux/slab.h> +#include <linux/netdevice.h> +#include <linux/string.h> +#include <linux/nfs_fs.h> +#include <linux/rcupdate.h> +#include <linux/lockd/lockd.h> + +#include "internal.h" +#include "nfs4_fs.h" +#include "netns.h" +#include "sysfs.h" + +static struct kset *nfs_kset; + +static void nfs_kset_release(struct kobject *kobj) +{ + struct kset *kset = container_of(kobj, struct kset, kobj); + kfree(kset); +} + +static const struct kobj_ns_type_operations *nfs_netns_object_child_ns_type( + const struct kobject *kobj) +{ + return &net_ns_type_operations; +} + +static struct kobj_type nfs_kset_type = { + .release = nfs_kset_release, + .sysfs_ops = &kobj_sysfs_ops, + .child_ns_type = nfs_netns_object_child_ns_type, +}; + +int nfs_sysfs_init(void) +{ + int ret; + + nfs_kset = kzalloc(sizeof(*nfs_kset), GFP_KERNEL); + if (!nfs_kset) + return -ENOMEM; + + ret = kobject_set_name(&nfs_kset->kobj, "nfs"); + if (ret) { + kfree(nfs_kset); + return ret; + } + + nfs_kset->kobj.parent = fs_kobj; + nfs_kset->kobj.ktype = &nfs_kset_type; + nfs_kset->kobj.kset = NULL; + + ret = kset_register(nfs_kset); + if (ret) { + kfree(nfs_kset); + return ret; + } + + return 0; +} + +void nfs_sysfs_exit(void) +{ + kset_unregister(nfs_kset); +} + +static ssize_t nfs_netns_identifier_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct nfs_netns_client *c = container_of(kobj, + struct nfs_netns_client, + kobject); + ssize_t ret; + + rcu_read_lock(); + ret = sysfs_emit(buf, "%s\n", rcu_dereference(c->identifier)); + rcu_read_unlock(); + return ret; +} + +/* Strip trailing '\n' */ +static size_t nfs_string_strip(const char *c, size_t len) +{ + while (len > 0 && c[len-1] == '\n') + --len; + return len; +} + +static ssize_t nfs_netns_identifier_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct nfs_netns_client *c = container_of(kobj, + struct nfs_netns_client, + kobject); + const char *old; + char *p; + size_t len; + + len = nfs_string_strip(buf, min_t(size_t, count, CONTAINER_ID_MAXLEN)); + if (!len) + return 0; + p = kmemdup_nul(buf, len, GFP_KERNEL); + if (!p) + return -ENOMEM; + old = rcu_dereference_protected(xchg(&c->identifier, (char __rcu *)p), 1); + if (old) { + synchronize_rcu(); + kfree(old); + } + return count; +} + +static void nfs_netns_client_release(struct kobject *kobj) +{ + struct nfs_netns_client *c = container_of(kobj, + struct nfs_netns_client, + kobject); + + kfree(rcu_dereference_raw(c->identifier)); +} + +static const void *nfs_netns_client_namespace(const struct kobject *kobj) +{ + return container_of(kobj, struct nfs_netns_client, kobject)->net; +} + +static struct kobj_attribute nfs_netns_client_id = __ATTR(identifier, + 0644, nfs_netns_identifier_show, nfs_netns_identifier_store); + +static struct attribute *nfs_netns_client_attrs[] = { + &nfs_netns_client_id.attr, + NULL, +}; +ATTRIBUTE_GROUPS(nfs_netns_client); + +static struct kobj_type nfs_netns_client_type = { + .release = nfs_netns_client_release, + .default_groups = nfs_netns_client_groups, + .sysfs_ops = &kobj_sysfs_ops, + .namespace = nfs_netns_client_namespace, +}; + +static void nfs_netns_object_release(struct kobject *kobj) +{ + struct nfs_netns_client *c = container_of(kobj, + struct nfs_netns_client, + nfs_net_kobj); + kfree(c); +} + +static const void *nfs_netns_namespace(const struct kobject *kobj) +{ + return container_of(kobj, struct nfs_netns_client, nfs_net_kobj)->net; +} + +static struct kobj_type nfs_netns_object_type = { + .release = nfs_netns_object_release, + .sysfs_ops = &kobj_sysfs_ops, + .namespace = nfs_netns_namespace, +}; + +static struct nfs_netns_client *nfs_netns_client_alloc(struct kobject *parent, + struct net *net) +{ + struct nfs_netns_client *p; + + p = kzalloc(sizeof(*p), GFP_KERNEL); + if (p) { + p->net = net; + p->kobject.kset = nfs_kset; + p->nfs_net_kobj.kset = nfs_kset; + + if (kobject_init_and_add(&p->nfs_net_kobj, &nfs_netns_object_type, + parent, "net") != 0) { + kobject_put(&p->nfs_net_kobj); + return NULL; + } + + if (kobject_init_and_add(&p->kobject, &nfs_netns_client_type, + &p->nfs_net_kobj, "nfs_client") == 0) + return p; + + kobject_put(&p->kobject); + kobject_put(&p->nfs_net_kobj); + } + return NULL; +} + +void nfs_netns_sysfs_setup(struct nfs_net *netns, struct net *net) +{ + struct nfs_netns_client *clp; + + clp = nfs_netns_client_alloc(&nfs_kset->kobj, net); + if (clp) { + netns->nfs_client = clp; + kobject_uevent(&clp->kobject, KOBJ_ADD); + } +} + +void nfs_netns_sysfs_destroy(struct nfs_net *netns) +{ + struct nfs_netns_client *clp = netns->nfs_client; + + if (clp) { + kobject_uevent(&clp->kobject, KOBJ_REMOVE); + kobject_del(&clp->kobject); + kobject_put(&clp->kobject); + kobject_del(&clp->nfs_net_kobj); + kobject_put(&clp->nfs_net_kobj); + netns->nfs_client = NULL; + } +} + +static bool shutdown_match_client(const struct rpc_task *task, const void *data) +{ + return true; +} + +static void shutdown_client(struct rpc_clnt *clnt) +{ + clnt->cl_shutdown = 1; + rpc_cancel_tasks(clnt, -EIO, shutdown_match_client, NULL); +} + +/* + * Shut down the nfs_client only once all the superblocks + * have been shut down. + */ +static void shutdown_nfs_client(struct nfs_client *clp) +{ + struct nfs_server *server; + rcu_read_lock(); + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { + if (!(server->flags & NFS_MOUNT_SHUTDOWN)) { + rcu_read_unlock(); + return; + } + } + rcu_read_unlock(); + nfs_mark_client_ready(clp, -EIO); + shutdown_client(clp->cl_rpcclient); +} + +static ssize_t +shutdown_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct nfs_server *server = container_of(kobj, struct nfs_server, kobj); + bool shutdown = server->flags & NFS_MOUNT_SHUTDOWN; + return sysfs_emit(buf, "%d\n", shutdown); +} + +static ssize_t +shutdown_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct nfs_server *server; + int ret, val; + + server = container_of(kobj, struct nfs_server, kobj); + + ret = kstrtoint(buf, 0, &val); + if (ret) + return ret; + + if (val != 1) + return -EINVAL; + + /* already shut down? */ + if (server->flags & NFS_MOUNT_SHUTDOWN) + goto out; + + server->flags |= NFS_MOUNT_SHUTDOWN; + shutdown_client(server->client); + + if (!IS_ERR(server->client_acl)) + shutdown_client(server->client_acl); + + if (server->nlm_host) + shutdown_client(server->nlm_host->h_rpcclnt); +out: + shutdown_nfs_client(server->nfs_client); + return count; +} + +static struct kobj_attribute nfs_sysfs_attr_shutdown = __ATTR_RW(shutdown); + +#if IS_ENABLED(CONFIG_NFS_V4_1) +static ssize_t +implid_domain_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct nfs_server *server = container_of(kobj, struct nfs_server, kobj); + struct nfs41_impl_id *impl_id = server->nfs_client->cl_implid; + + if (!impl_id || strlen(impl_id->domain) == 0) + return 0; //sysfs_emit(buf, ""); + return sysfs_emit(buf, "%s\n", impl_id->domain); +} + +static struct kobj_attribute nfs_sysfs_attr_implid_domain = __ATTR_RO(implid_domain); + + +static ssize_t +implid_name_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct nfs_server *server = container_of(kobj, struct nfs_server, kobj); + struct nfs41_impl_id *impl_id = server->nfs_client->cl_implid; + + if (!impl_id || strlen(impl_id->name) == 0) + return 0; //sysfs_emit(buf, ""); + return sysfs_emit(buf, "%s\n", impl_id->name); +} + +static struct kobj_attribute nfs_sysfs_attr_implid_name = __ATTR_RO(implid_name); + +#endif /* IS_ENABLED(CONFIG_NFS_V4_1) */ + +#define RPC_CLIENT_NAME_SIZE 64 + +void nfs_sysfs_link_rpc_client(struct nfs_server *server, + struct rpc_clnt *clnt, const char *uniq) +{ + char name[RPC_CLIENT_NAME_SIZE]; + int ret; + + strscpy(name, clnt->cl_program->name, sizeof(name)); + strncat(name, uniq ? uniq : "", sizeof(name) - strlen(name) - 1); + strncat(name, "_client", sizeof(name) - strlen(name) - 1); + + ret = sysfs_create_link_nowarn(&server->kobj, + &clnt->cl_sysfs->kobject, name); + if (ret < 0) + pr_warn("NFS: can't create link to %s in sysfs (%d)\n", + name, ret); +} +EXPORT_SYMBOL_GPL(nfs_sysfs_link_rpc_client); + +static void nfs_sysfs_sb_release(struct kobject *kobj) +{ + /* no-op: why? see lib/kobject.c kobject_cleanup() */ +} + +static const void *nfs_netns_server_namespace(const struct kobject *kobj) +{ + return container_of(kobj, struct nfs_server, kobj)->nfs_client->cl_net; +} + +static struct kobj_type nfs_sb_ktype = { + .release = nfs_sysfs_sb_release, + .sysfs_ops = &kobj_sysfs_ops, + .namespace = nfs_netns_server_namespace, + .child_ns_type = nfs_netns_object_child_ns_type, +}; + +#if IS_ENABLED(CONFIG_NFS_V4_1) +static void nfs_sysfs_add_nfsv41_server(struct nfs_server *server) +{ + int ret; + + if (!server->nfs_client->cl_implid) + return; + + ret = sysfs_create_file_ns(&server->kobj, &nfs_sysfs_attr_implid_domain.attr, + nfs_netns_server_namespace(&server->kobj)); + if (ret < 0) + pr_warn("NFS: sysfs_create_file_ns for server-%d failed (%d)\n", + server->s_sysfs_id, ret); + + ret = sysfs_create_file_ns(&server->kobj, &nfs_sysfs_attr_implid_name.attr, + nfs_netns_server_namespace(&server->kobj)); + if (ret < 0) + pr_warn("NFS: sysfs_create_file_ns for server-%d failed (%d)\n", + server->s_sysfs_id, ret); +} +#else /* CONFIG_NFS_V4_1 */ +static inline void nfs_sysfs_add_nfsv41_server(struct nfs_server *server) +{ +} +#endif /* CONFIG_NFS_V4_1 */ + +#if IS_ENABLED(CONFIG_NFS_LOCALIO) + +static ssize_t +localio_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct nfs_server *server = container_of(kobj, struct nfs_server, kobj); + bool localio = nfs_server_is_local(server->nfs_client); + return sysfs_emit(buf, "%d\n", localio); +} + +static struct kobj_attribute nfs_sysfs_attr_localio = __ATTR_RO(localio); + +static void nfs_sysfs_add_nfs_localio_server(struct nfs_server *server) +{ + int ret = sysfs_create_file_ns(&server->kobj, &nfs_sysfs_attr_localio.attr, + nfs_netns_server_namespace(&server->kobj)); + if (ret < 0) + pr_warn("NFS: sysfs_create_file_ns for server-%d failed (%d)\n", + server->s_sysfs_id, ret); +} +#else +static inline void nfs_sysfs_add_nfs_localio_server(struct nfs_server *server) +{ +} +#endif /* IS_ENABLED(CONFIG_NFS_LOCALIO) */ + +void nfs_sysfs_add_server(struct nfs_server *server) +{ + int ret; + + ret = kobject_init_and_add(&server->kobj, &nfs_sb_ktype, + &nfs_kset->kobj, "server-%d", server->s_sysfs_id); + if (ret < 0) { + pr_warn("NFS: nfs sysfs add server-%d failed (%d)\n", + server->s_sysfs_id, ret); + return; + } + ret = sysfs_create_file_ns(&server->kobj, &nfs_sysfs_attr_shutdown.attr, + nfs_netns_server_namespace(&server->kobj)); + if (ret < 0) + pr_warn("NFS: sysfs_create_file_ns for server-%d failed (%d)\n", + server->s_sysfs_id, ret); + + nfs_sysfs_add_nfsv41_server(server); + nfs_sysfs_add_nfs_localio_server(server); +} +EXPORT_SYMBOL_GPL(nfs_sysfs_add_server); + +void nfs_sysfs_move_server_to_sb(struct super_block *s) +{ + struct nfs_server *server = s->s_fs_info; + int ret; + + ret = kobject_rename(&server->kobj, s->s_id); + if (ret < 0) + pr_warn("NFS: rename sysfs %s failed (%d)\n", + server->kobj.name, ret); +} + +void nfs_sysfs_move_sb_to_server(struct nfs_server *server) +{ + const char *s; + int ret = -ENOMEM; + + s = kasprintf(GFP_KERNEL, "server-%d", server->s_sysfs_id); + if (s) { + ret = kobject_rename(&server->kobj, s); + kfree(s); + } + if (ret < 0) + pr_warn("NFS: rename sysfs %s failed (%d)\n", + server->kobj.name, ret); +} + +/* unlink, not dec-ref */ +void nfs_sysfs_remove_server(struct nfs_server *server) +{ + kobject_del(&server->kobj); +} diff --git a/fs/nfs/sysfs.h b/fs/nfs/sysfs.h new file mode 100644 index 000000000000..c5d1990cade5 --- /dev/null +++ b/fs/nfs/sysfs.h @@ -0,0 +1,33 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2019 Hammerspace Inc + */ + +#ifndef __NFS_SYSFS_H +#define __NFS_SYSFS_H + +#define CONTAINER_ID_MAXLEN (64) + +struct nfs_netns_client { + struct kobject kobject; + struct kobject nfs_net_kobj; + struct net *net; + const char __rcu *identifier; +}; + +extern struct kobject *nfs_net_kobj; + +extern int nfs_sysfs_init(void); +extern void nfs_sysfs_exit(void); + +void nfs_netns_sysfs_setup(struct nfs_net *netns, struct net *net); +void nfs_netns_sysfs_destroy(struct nfs_net *netns); + +void nfs_sysfs_link_rpc_client(struct nfs_server *server, + struct rpc_clnt *clnt, const char *sysfs_prefix); +void nfs_sysfs_add_server(struct nfs_server *s); +void nfs_sysfs_move_server_to_sb(struct super_block *s); +void nfs_sysfs_move_sb_to_server(struct nfs_server *s); +void nfs_sysfs_remove_server(struct nfs_server *s); + +#endif diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c index e3949d93085c..b55467911648 100644 --- a/fs/nfs/unlink.c +++ b/fs/nfs/unlink.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/nfs/unlink.c * @@ -30,7 +31,7 @@ static void nfs_free_unlinkdata(struct nfs_unlinkdata *data) { - put_rpccred(data->cred); + put_cred(data->cred); kfree(data->args.name.name); kfree(data); } @@ -38,6 +39,7 @@ nfs_free_unlinkdata(struct nfs_unlinkdata *data) /** * nfs_async_unlink_done - Sillydelete post-processing * @task: rpc_task of the sillydelete + * @calldata: pointer to nfs_unlinkdata * * Do the directory attribute update. */ @@ -53,7 +55,7 @@ static void nfs_async_unlink_done(struct rpc_task *task, void *calldata) /** * nfs_async_unlink_release - Release the sillydelete data. - * @task: rpc_task of the sillydelete + * @calldata: struct nfs_unlinkdata to release * * We need to call nfs_put_unlinkdata as a 'tk_release' task since the * rpc_task would be freed too. @@ -84,7 +86,7 @@ static const struct rpc_call_ops nfs_unlink_ops = { .rpc_call_prepare = nfs_unlink_prepare, }; -static void nfs_do_call_unlink(struct nfs_unlinkdata *data) +static void nfs_do_call_unlink(struct inode *inode, struct nfs_unlinkdata *data) { struct rpc_message msg = { .rpc_argp = &data->args, @@ -96,15 +98,19 @@ static void nfs_do_call_unlink(struct nfs_unlinkdata *data) .callback_ops = &nfs_unlink_ops, .callback_data = data, .workqueue = nfsiod_workqueue, - .flags = RPC_TASK_ASYNC, + .flags = RPC_TASK_ASYNC | RPC_TASK_CRED_NOREF, }; struct rpc_task *task; struct inode *dir = d_inode(data->dentry->d_parent); + + if (nfs_server_capable(inode, NFS_CAP_MOVEABLE)) + task_setup_data.flags |= RPC_TASK_MOVEABLE; + nfs_sb_active(dir->i_sb); data->args.fh = NFS_FH(dir); nfs_fattr_init(data->res.dir_attr); - NFS_PROTO(dir)->unlink_setup(&msg, dir); + NFS_PROTO(dir)->unlink_setup(&msg, data->dentry, inode); task_setup_data.rpc_client = NFS_CLIENT(dir); task = rpc_run_task(&task_setup_data); @@ -112,7 +118,7 @@ static void nfs_do_call_unlink(struct nfs_unlinkdata *data) rpc_put_task_async(task); } -static int nfs_call_unlink(struct dentry *dentry, struct nfs_unlinkdata *data) +static int nfs_call_unlink(struct dentry *dentry, struct inode *inode, struct nfs_unlinkdata *data) { struct inode *dir = d_inode(dentry->d_parent); struct dentry *alias; @@ -133,6 +139,7 @@ static int nfs_call_unlink(struct dentry *dentry, struct nfs_unlinkdata *data) */ spin_lock(&alias->d_lock); if (d_really_is_positive(alias) && + !nfs_compare_fh(NFS_FH(inode), NFS_FH(d_inode(alias))) && !(alias->d_flags & DCACHE_NFSFS_RENAMED)) { devname_garbage = alias->d_fsdata; alias->d_fsdata = data; @@ -152,14 +159,14 @@ static int nfs_call_unlink(struct dentry *dentry, struct nfs_unlinkdata *data) return ret; } data->dentry = alias; - nfs_do_call_unlink(data); + nfs_do_call_unlink(inode, data); return 1; } /** * nfs_async_unlink - asynchronous unlinking of a file - * @dir: parent directory of dentry - * @dentry: dentry to unlink + * @dentry: parent directory of dentry + * @name: name of dentry to unlink */ static int nfs_async_unlink(struct dentry *dentry, const struct qstr *name) @@ -176,11 +183,7 @@ nfs_async_unlink(struct dentry *dentry, const struct qstr *name) goto out_free; data->args.name.len = name->len; - data->cred = rpc_lookup_cred(); - if (IS_ERR(data->cred)) { - status = PTR_ERR(data->cred); - goto out_free_name; - } + data->cred = get_current_cred(); data->res.dir_attr = &data->dir_attr; init_waitqueue_head(&data->wq); @@ -201,8 +204,7 @@ nfs_async_unlink(struct dentry *dentry, const struct qstr *name) return 0; out_unlock: spin_unlock(&dentry->d_lock); - put_rpccred(data->cred); -out_free_name: + put_cred(data->cred); kfree(data->args.name.name); out_free: kfree(data); @@ -230,7 +232,9 @@ nfs_complete_unlink(struct dentry *dentry, struct inode *inode) dentry->d_fsdata = NULL; spin_unlock(&dentry->d_lock); - if (NFS_STALE(inode) || !nfs_call_unlink(dentry, data)) + NFS_PROTO(inode)->return_delegation(inode); + + if (NFS_STALE(inode) || !nfs_call_unlink(dentry, inode, data)) nfs_free_unlinkdata(data); } @@ -265,7 +269,7 @@ static void nfs_async_rename_done(struct rpc_task *task, void *calldata) struct inode *new_dir = data->new_dir; struct dentry *old_dentry = data->old_dentry; - trace_nfs_sillyrename_rename(old_dir, old_dentry, + trace_nfs_async_rename_done(old_dir, old_dentry, new_dir, data->new_dentry, task->tk_status); if (!NFS_PROTO(old_dir)->rename_done(task, old_dir, new_dir)) { rpc_restart_call_prepare(task); @@ -306,7 +310,7 @@ static void nfs_async_rename_release(void *calldata) iput(data->old_dir); iput(data->new_dir); nfs_sb_deactive(sb); - put_rpccred(data->cred); + put_cred(data->cred); kfree(data); } @@ -328,6 +332,7 @@ static const struct rpc_call_ops nfs_rename_ops = { * @new_dir: target directory for the rename * @old_dentry: original dentry to be renamed * @new_dentry: dentry to which the old_dentry should be renamed + * @complete: Function to run on successful completion * * It's expected that valid references to the dentries and inodes are held */ @@ -343,20 +348,20 @@ nfs_async_rename(struct inode *old_dir, struct inode *new_dir, .callback_ops = &nfs_rename_ops, .workqueue = nfsiod_workqueue, .rpc_client = NFS_CLIENT(old_dir), - .flags = RPC_TASK_ASYNC, + .flags = RPC_TASK_ASYNC | RPC_TASK_CRED_NOREF, }; + if (nfs_server_capable(old_dir, NFS_CAP_MOVEABLE) && + nfs_server_capable(new_dir, NFS_CAP_MOVEABLE)) + task_setup_data.flags |= RPC_TASK_MOVEABLE; + data = kzalloc(sizeof(*data), GFP_KERNEL); if (data == NULL) return ERR_PTR(-ENOMEM); + task_setup_data.task = &data->task; task_setup_data.callback_data = data; - data->cred = rpc_lookup_cred(); - if (IS_ERR(data->cred)) { - struct rpc_task *task = ERR_CAST(data->cred); - kfree(data); - return task; - } + data->cred = get_current_cred(); msg.rpc_argp = &data->args; msg.rpc_resp = &data->res; @@ -385,7 +390,7 @@ nfs_async_rename(struct inode *old_dir, struct inode *new_dir, nfs_sb_active(old_dir->i_sb); - NFS_PROTO(data->old_dir)->rename_setup(&msg, old_dir); + NFS_PROTO(data->old_dir)->rename_setup(&msg, old_dentry, new_dentry); return rpc_run_task(&task_setup_data); } @@ -403,12 +408,6 @@ nfs_complete_sillyrename(struct rpc_task *task, struct nfs_renamedata *data) nfs_cancel_async_unlink(dentry); return; } - - /* - * vfs_unlink and the like do not issue this when a file is - * sillyrenamed, so do it here. - */ - fsnotify_nameremove(dentry, 0); } #define SILLYNAME_PREFIX ".nfs" @@ -447,6 +446,7 @@ nfs_sillyrename(struct inode *dir, struct dentry *dentry) unsigned char silly[SILLYNAME_LEN + 1]; unsigned long long fileid; struct dentry *sdentry; + struct inode *inode = d_inode(dentry); struct rpc_task *task; int error = -EBUSY; @@ -462,23 +462,19 @@ nfs_sillyrename(struct inode *dir, struct dentry *dentry) fileid = NFS_FILEID(d_inode(dentry)); - /* Return delegation in anticipation of the rename */ - NFS_PROTO(d_inode(dentry))->return_delegation(d_inode(dentry)); - sdentry = NULL; do { - int slen; dput(sdentry); sillycounter++; - slen = scnprintf(silly, sizeof(silly), - SILLYNAME_PREFIX "%0*llx%0*x", - SILLYNAME_FILEID_LEN, fileid, - SILLYNAME_COUNTER_LEN, sillycounter); + scnprintf(silly, sizeof(silly), + SILLYNAME_PREFIX "%0*llx%0*x", + SILLYNAME_FILEID_LEN, fileid, + SILLYNAME_COUNTER_LEN, sillycounter); dfprintk(VFS, "NFS: trying to rename %pd to %s\n", dentry, silly); - sdentry = lookup_one_len(silly, dentry->d_parent, slen); + sdentry = lookup_noperm(&QSTR(silly), dentry->d_parent); /* * N.B. Better to return EBUSY here ... it could be * dangerous to delete the file while it's in use. @@ -487,6 +483,8 @@ nfs_sillyrename(struct inode *dir, struct dentry *dentry) goto out; } while (d_inode(sdentry) != NULL); /* need negative lookup */ + ihold(inode); + /* queue unlink first. Can't do this from rpc_release as it * has to allocate memory */ @@ -511,6 +509,12 @@ nfs_sillyrename(struct inode *dir, struct dentry *dentry) case 0: /* The rename succeeded */ nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); + spin_lock(&inode->i_lock); + NFS_I(inode)->attr_gencount = nfs_inc_attr_generation_counter(); + nfs_set_cache_invalid(inode, NFS_INO_INVALID_CHANGE | + NFS_INO_INVALID_CTIME | + NFS_INO_REVAL_FORCED); + spin_unlock(&inode->i_lock); d_move(dentry, sdentry); break; case -ERESTARTSYS: @@ -521,6 +525,7 @@ nfs_sillyrename(struct inode *dir, struct dentry *dentry) } rpc_put_task(task); out_dput: + iput(inode); dput(sdentry); out: return error; diff --git a/fs/nfs/write.c b/fs/nfs/write.c index b1af5dee5e0a..336c510f3750 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * linux/fs/nfs/write.c * @@ -23,8 +24,11 @@ #include <linux/export.h> #include <linux/freezer.h> #include <linux/wait.h> +#include <linux/iversion.h> +#include <linux/filelock.h> #include <linux/uaccess.h> +#include <linux/sched/mm.h> #include "delegation.h" #include "internal.h" @@ -54,39 +58,28 @@ static const struct rpc_call_ops nfs_commit_ops; static const struct nfs_pgio_completion_ops nfs_async_write_completion_ops; static const struct nfs_commit_completion_ops nfs_commit_completion_ops; static const struct nfs_rw_ops nfs_rw_write_ops; -static void nfs_clear_request_commit(struct nfs_page *req); +static void nfs_inode_remove_request(struct nfs_page *req); +static void nfs_clear_request_commit(struct nfs_commit_info *cinfo, + struct nfs_page *req); static void nfs_init_cinfo_from_inode(struct nfs_commit_info *cinfo, struct inode *inode); -static struct nfs_page * -nfs_page_search_commits_for_head_request_locked(struct nfs_inode *nfsi, - struct page *page); static struct kmem_cache *nfs_wdata_cachep; static mempool_t *nfs_wdata_mempool; static struct kmem_cache *nfs_cdata_cachep; static mempool_t *nfs_commit_mempool; -struct nfs_commit_data *nfs_commitdata_alloc(bool never_fail) +struct nfs_commit_data *nfs_commitdata_alloc(void) { struct nfs_commit_data *p; - if (never_fail) - p = mempool_alloc(nfs_commit_mempool, GFP_NOIO); - else { - /* It is OK to do some reclaim, not no safe to wait - * for anything to be returned to the pool. - * mempool_alloc() cannot handle that particular combination, - * so we need two separate attempts. - */ + p = kmem_cache_zalloc(nfs_cdata_cachep, nfs_io_gfp_mask()); + if (!p) { p = mempool_alloc(nfs_commit_mempool, GFP_NOWAIT); if (!p) - p = kmem_cache_alloc(nfs_cdata_cachep, GFP_NOIO | - __GFP_NOWARN | __GFP_NORETRY); - if (!p) return NULL; + memset(p, 0, sizeof(*p)); } - - memset(p, 0, sizeof(*p)); INIT_LIST_HEAD(&p->pages); return p; } @@ -100,12 +93,16 @@ EXPORT_SYMBOL_GPL(nfs_commit_free); static struct nfs_pgio_header *nfs_writehdr_alloc(void) { - struct nfs_pgio_header *p = mempool_alloc(nfs_wdata_mempool, GFP_NOIO); + struct nfs_pgio_header *p; - if (p) { + p = kmem_cache_zalloc(nfs_wdata_cachep, nfs_io_gfp_mask()); + if (!p) { + p = mempool_alloc(nfs_wdata_mempool, GFP_NOWAIT); + if (!p) + return NULL; memset(p, 0, sizeof(*p)); - p->rw_mode = FMODE_WRITE; } + p->rw_mode = FMODE_WRITE; return p; } @@ -147,142 +144,110 @@ static void nfs_io_completion_put(struct nfs_io_completion *ioc) kref_put(&ioc->refcount, nfs_io_completion_release); } -static void nfs_context_set_write_error(struct nfs_open_context *ctx, int error) +static void +nfs_page_set_inode_ref(struct nfs_page *req, struct inode *inode) +{ + if (!test_and_set_bit(PG_INODE_REF, &req->wb_flags)) { + kref_get(&req->wb_kref); + atomic_long_inc(&NFS_I(inode)->nrequests); + } +} + +static void nfs_cancel_remove_inode(struct nfs_page *req, struct inode *inode) { - ctx->error = error; - smp_wmb(); - set_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags); + if (test_and_clear_bit(PG_REMOVE, &req->wb_flags)) + nfs_page_set_inode_ref(req, inode); } -/* - * nfs_page_find_head_request_locked - find head request associated with @page +/** + * nfs_folio_find_head_request - find head request associated with a folio + * @folio: pointer to folio * * must be called while holding the inode lock. * * returns matching head request with reference held, or NULL if not found. */ -static struct nfs_page * -nfs_page_find_head_request_locked(struct nfs_inode *nfsi, struct page *page) +static struct nfs_page *nfs_folio_find_head_request(struct folio *folio) { - struct nfs_page *req = NULL; - - if (PagePrivate(page)) - req = (struct nfs_page *)page_private(page); - else if (unlikely(PageSwapCache(page))) - req = nfs_page_search_commits_for_head_request_locked(nfsi, - page); + struct address_space *mapping = folio->mapping; + struct nfs_page *req; + if (!folio_test_private(folio)) + return NULL; + spin_lock(&mapping->i_private_lock); + req = folio->private; if (req) { WARN_ON_ONCE(req->wb_head != req); kref_get(&req->wb_kref); } - - return req; -} - -/* - * nfs_page_find_head_request - find head request associated with @page - * - * returns matching head request with reference held, or NULL if not found. - */ -static struct nfs_page *nfs_page_find_head_request(struct page *page) -{ - struct inode *inode = page_file_mapping(page)->host; - struct nfs_page *req = NULL; - - spin_lock(&inode->i_lock); - req = nfs_page_find_head_request_locked(NFS_I(inode), page); - spin_unlock(&inode->i_lock); + spin_unlock(&mapping->i_private_lock); return req; } /* Adjust the file length if we're writing beyond the end */ -static void nfs_grow_file(struct page *page, unsigned int offset, unsigned int count) +static void nfs_grow_file(struct folio *folio, unsigned int offset, + unsigned int count) { - struct inode *inode = page_file_mapping(page)->host; + struct inode *inode = folio->mapping->host; loff_t end, i_size; pgoff_t end_index; spin_lock(&inode->i_lock); i_size = i_size_read(inode); - end_index = (i_size - 1) >> PAGE_SHIFT; - if (i_size > 0 && page_index(page) < end_index) + end_index = ((i_size - 1) >> folio_shift(folio)) << folio_order(folio); + if (i_size > 0 && folio->index < end_index) goto out; - end = page_file_offset(page) + ((loff_t)offset+count); + end = folio_pos(folio) + (loff_t)offset + (loff_t)count; if (i_size >= end) goto out; + trace_nfs_size_grow(inode, end); i_size_write(inode, end); + NFS_I(inode)->cache_validity &= ~NFS_INO_INVALID_SIZE; nfs_inc_stats(inode, NFSIOS_EXTENDWRITE); out: + /* Atomically update timestamps if they are delegated to us. */ + nfs_update_delegated_mtime_locked(inode); spin_unlock(&inode->i_lock); + nfs_fscache_invalidate(inode, 0); } /* A writeback failed: mark the page as bad, and invalidate the page cache */ -static void nfs_set_pageerror(struct page *page) +static void nfs_set_pageerror(struct address_space *mapping) { - nfs_zap_mapping(page_file_mapping(page)->host, page_file_mapping(page)); + struct inode *inode = mapping->host; + + nfs_zap_mapping(mapping->host, mapping); + /* Force file size revalidation */ + spin_lock(&inode->i_lock); + nfs_set_cache_invalid(inode, NFS_INO_REVAL_FORCED | + NFS_INO_INVALID_CHANGE | + NFS_INO_INVALID_SIZE); + spin_unlock(&inode->i_lock); } -/* - * nfs_page_group_search_locked - * @head - head request of page group - * @page_offset - offset into page - * - * Search page group with head @head to find a request that contains the - * page offset @page_offset. - * - * Returns a pointer to the first matching nfs request, or NULL if no - * match is found. - * - * Must be called with the page group lock held - */ -static struct nfs_page * -nfs_page_group_search_locked(struct nfs_page *head, unsigned int page_offset) +static void nfs_mapping_set_error(struct folio *folio, int error) { - struct nfs_page *req; - - WARN_ON_ONCE(head != head->wb_head); - WARN_ON_ONCE(!test_bit(PG_HEADLOCK, &head->wb_head->wb_flags)); - - req = head; - do { - if (page_offset >= req->wb_pgbase && - page_offset < (req->wb_pgbase + req->wb_bytes)) - return req; + struct address_space *mapping = folio->mapping; - req = req->wb_this_page; - } while (req != head); - - return NULL; + filemap_set_wb_err(mapping, error); + if (mapping->host) + errseq_set(&mapping->host->i_sb->s_wb_err, + error == -ENOSPC ? -ENOSPC : -EIO); + nfs_set_pageerror(mapping); } /* - * nfs_page_group_covers_page - * @head - head request of page group + * nfs_page_covers_folio + * @req: struct nfs_page * - * Return true if the page group with head @head covers the whole page, - * returns false otherwise + * Return true if the request covers the whole folio. + * Note that the caller should ensure all subrequests have been joined */ static bool nfs_page_group_covers_page(struct nfs_page *req) { - struct nfs_page *tmp; - unsigned int pos = 0; - unsigned int len = nfs_page_length(req->wb_page); + unsigned int len = nfs_folio_length(nfs_page_to_folio(req)); - nfs_page_group_lock(req, false); - - do { - tmp = nfs_page_group_search_locked(req->wb_head, pos); - if (tmp) { - /* no way this should happen */ - WARN_ON_ONCE(tmp->wb_pgbase != pos); - pos += tmp->wb_bytes - (pos - tmp->wb_pgbase); - } - } while (tmp && pos < len); - - nfs_page_group_unlock(req); - WARN_ON_ONCE(pos > len); - return pos == len; + return req->wb_pgbase == 0 && req->wb_bytes == len; } /* We can set the PG_uptodate flag if we see that a write request @@ -290,11 +255,13 @@ static bool nfs_page_group_covers_page(struct nfs_page *req) */ static void nfs_mark_uptodate(struct nfs_page *req) { - if (PageUptodate(req->wb_page)) + struct folio *folio = nfs_page_to_folio(req); + + if (folio_test_uptodate(folio)) return; if (!nfs_page_group_covers_page(req)) return; - SetPageUptodate(req->wb_page); + folio_mark_uptodate(folio); } static int wb_priority(struct writeback_control *wbc) @@ -316,93 +283,34 @@ int nfs_congestion_kb; #define NFS_CONGESTION_OFF_THRESH \ (NFS_CONGESTION_ON_THRESH - (NFS_CONGESTION_ON_THRESH >> 2)) -static void nfs_set_page_writeback(struct page *page) +static void nfs_folio_set_writeback(struct folio *folio) { - struct inode *inode = page_file_mapping(page)->host; - struct nfs_server *nfss = NFS_SERVER(inode); - int ret = test_set_page_writeback(page); + struct nfs_server *nfss = NFS_SERVER(folio->mapping->host); - WARN_ON_ONCE(ret != 0); - - if (atomic_long_inc_return(&nfss->writeback) > - NFS_CONGESTION_ON_THRESH) - set_bdi_congested(inode_to_bdi(inode), BLK_RW_ASYNC); + folio_start_writeback(folio); + if (atomic_long_inc_return(&nfss->writeback) > NFS_CONGESTION_ON_THRESH) + nfss->write_congested = 1; } -static void nfs_end_page_writeback(struct nfs_page *req) +static void nfs_folio_end_writeback(struct folio *folio) { - struct inode *inode = page_file_mapping(req->wb_page)->host; - struct nfs_server *nfss = NFS_SERVER(inode); - - if (!nfs_page_group_sync_on_bit(req, PG_WB_END)) - return; - - end_page_writeback(req->wb_page); - if (atomic_long_dec_return(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH) - clear_bdi_congested(inode_to_bdi(inode), BLK_RW_ASYNC); -} - + struct nfs_server *nfss = NFS_SERVER(folio->mapping->host); -/* nfs_page_group_clear_bits - * @req - an nfs request - * clears all page group related bits from @req - */ -static void -nfs_page_group_clear_bits(struct nfs_page *req) -{ - clear_bit(PG_TEARDOWN, &req->wb_flags); - clear_bit(PG_UNLOCKPAGE, &req->wb_flags); - clear_bit(PG_UPTODATE, &req->wb_flags); - clear_bit(PG_WB_END, &req->wb_flags); - clear_bit(PG_REMOVE, &req->wb_flags); + folio_end_writeback_no_dropbehind(folio); + if (atomic_long_dec_return(&nfss->writeback) < + NFS_CONGESTION_OFF_THRESH) { + nfss->write_congested = 0; + wake_up_all(&nfss->write_congestion_wait); + } } - -/* - * nfs_unroll_locks_and_wait - unlock all newly locked reqs and wait on @req - * - * this is a helper function for nfs_lock_and_join_requests - * - * @inode - inode associated with request page group, must be holding inode lock - * @head - head request of page group, must be holding head lock - * @req - request that couldn't lock and needs to wait on the req bit lock - * @nonblock - if true, don't actually wait - * - * NOTE: this must be called holding page_group bit lock and inode spin lock - * and BOTH will be released before returning. - * - * returns 0 on success, < 0 on error. - */ -static int -nfs_unroll_locks_and_wait(struct inode *inode, struct nfs_page *head, - struct nfs_page *req, bool nonblock) - __releases(&inode->i_lock) +static void nfs_page_end_writeback(struct nfs_page *req) { - struct nfs_page *tmp; - int ret; - - /* relinquish all the locks successfully grabbed this run */ - for (tmp = head ; tmp != req; tmp = tmp->wb_this_page) - nfs_unlock_request(tmp); - - WARN_ON_ONCE(test_bit(PG_TEARDOWN, &req->wb_flags)); - - /* grab a ref on the request that will be waited on */ - kref_get(&req->wb_kref); - - nfs_page_group_unlock(head); - spin_unlock(&inode->i_lock); - - /* release ref from nfs_page_find_head_request_locked */ - nfs_release_request(head); - - if (!nonblock) - ret = nfs_wait_on_request(req); - else - ret = -EAGAIN; - nfs_release_request(req); - - return ret; + if (nfs_page_group_sync_on_bit(req, PG_WB_END)) { + nfs_unlock_request(req); + nfs_folio_end_writeback(nfs_page_to_folio(req)); + } else + nfs_unlock_request(req); } /* @@ -417,7 +325,8 @@ nfs_unroll_locks_and_wait(struct inode *inode, struct nfs_page *head, */ static void nfs_destroy_unlinked_subrequests(struct nfs_page *destroy_list, - struct nfs_page *old_head) + struct nfs_page *old_head, + struct inode *inode) { while (destroy_list) { struct nfs_page *subreq = destroy_list; @@ -425,140 +334,84 @@ nfs_destroy_unlinked_subrequests(struct nfs_page *destroy_list, destroy_list = (subreq->wb_this_page == old_head) ? NULL : subreq->wb_this_page; + /* Note: lock subreq in order to change subreq->wb_head */ + nfs_page_set_headlock(subreq); WARN_ON_ONCE(old_head != subreq->wb_head); /* make sure old group is not used */ - subreq->wb_head = subreq; subreq->wb_this_page = subreq; + subreq->wb_head = subreq; - /* subreq is now totally disconnected from page group or any - * write / commit lists. last chance to wake any waiters */ - nfs_unlock_request(subreq); + clear_bit(PG_REMOVE, &subreq->wb_flags); - if (!test_bit(PG_TEARDOWN, &subreq->wb_flags)) { - /* release ref on old head request */ - nfs_release_request(old_head); + /* Note: races with nfs_page_group_destroy() */ + if (!kref_read(&subreq->wb_kref)) { + /* Check if we raced with nfs_page_group_destroy() */ + if (test_and_clear_bit(PG_TEARDOWN, &subreq->wb_flags)) { + nfs_page_clear_headlock(subreq); + nfs_free_request(subreq); + } else + nfs_page_clear_headlock(subreq); + continue; + } + nfs_page_clear_headlock(subreq); - nfs_page_group_clear_bits(subreq); + nfs_release_request(old_head); - /* release the PG_INODE_REF reference */ - if (test_and_clear_bit(PG_INODE_REF, &subreq->wb_flags)) - nfs_release_request(subreq); - else - WARN_ON_ONCE(1); - } else { - WARN_ON_ONCE(test_bit(PG_CLEAN, &subreq->wb_flags)); - /* zombie requests have already released the last - * reference and were waiting on the rest of the - * group to complete. Since it's no longer part of a - * group, simply free the request */ - nfs_page_group_clear_bits(subreq); - nfs_free_request(subreq); + if (test_and_clear_bit(PG_INODE_REF, &subreq->wb_flags)) { + nfs_release_request(subreq); + atomic_long_dec(&NFS_I(inode)->nrequests); } + + /* subreq is now totally disconnected from page group or any + * write / commit lists. last chance to wake any waiters */ + nfs_unlock_and_release_request(subreq); } } /* - * nfs_lock_and_join_requests - join all subreqs to the head req and return - * a locked reference, cancelling any pending - * operations for this page. - * - * @page - the page used to lookup the "page group" of nfs_page structures - * @nonblock - if true, don't block waiting for request locks + * nfs_join_page_group - destroy subrequests of the head req + * @head: the page used to lookup the "page group" of nfs_page structures + * @inode: Inode to which the request belongs. * * This function joins all sub requests to the head request by first * locking all requests in the group, cancelling any pending operations * and finally updating the head request to cover the whole range covered by * the (former) group. All subrequests are removed from any write or commit * lists, unlinked from the group and destroyed. - * - * Returns a locked, referenced pointer to the head request - which after - * this call is guaranteed to be the only request associated with the page. - * Returns NULL if no requests are found for @page, or a ERR_PTR if an - * error was encountered. */ -static struct nfs_page * -nfs_lock_and_join_requests(struct page *page, bool nonblock) +void nfs_join_page_group(struct nfs_page *head, struct nfs_commit_info *cinfo, + struct inode *inode) { - struct inode *inode = page_file_mapping(page)->host; - struct nfs_page *head, *subreq; + struct nfs_page *subreq; struct nfs_page *destroy_list = NULL; - unsigned int total_bytes; - int ret; - -try_again: - total_bytes = 0; - - WARN_ON_ONCE(destroy_list); - - spin_lock(&inode->i_lock); - - /* - * A reference is taken only on the head request which acts as a - * reference to the whole page group - the group will not be destroyed - * until the head reference is released. - */ - head = nfs_page_find_head_request_locked(NFS_I(inode), page); - - if (!head) { - spin_unlock(&inode->i_lock); - return NULL; - } - - /* holding inode lock, so always make a non-blocking call to try the - * page group lock */ - ret = nfs_page_group_lock(head, true); - if (ret < 0) { - spin_unlock(&inode->i_lock); - - if (!nonblock && ret == -EAGAIN) { - nfs_page_group_lock_wait(head); - nfs_release_request(head); - goto try_again; + unsigned int pgbase, off, bytes; + + pgbase = head->wb_pgbase; + bytes = head->wb_bytes; + off = head->wb_offset; + for (subreq = head->wb_this_page; subreq != head; + subreq = subreq->wb_this_page) { + /* Subrequests should always form a contiguous range */ + if (pgbase > subreq->wb_pgbase) { + off -= pgbase - subreq->wb_pgbase; + bytes += pgbase - subreq->wb_pgbase; + pgbase = subreq->wb_pgbase; } - - nfs_release_request(head); - return ERR_PTR(ret); + bytes = max(subreq->wb_pgbase + subreq->wb_bytes + - pgbase, bytes); } - /* lock each request in the page group */ - subreq = head; - do { - /* - * Subrequests are always contiguous, non overlapping - * and in order - but may be repeated (mirrored writes). - */ - if (subreq->wb_offset == (head->wb_offset + total_bytes)) { - /* keep track of how many bytes this group covers */ - total_bytes += subreq->wb_bytes; - } else if (WARN_ON_ONCE(subreq->wb_offset < head->wb_offset || - ((subreq->wb_offset + subreq->wb_bytes) > - (head->wb_offset + total_bytes)))) { - nfs_page_group_unlock(head); - spin_unlock(&inode->i_lock); - return ERR_PTR(-EIO); - } - - if (!nfs_lock_request(subreq)) { - /* releases page group bit lock and - * inode spin lock and all references */ - ret = nfs_unroll_locks_and_wait(inode, head, - subreq, nonblock); - - if (ret == 0) - goto try_again; - - return ERR_PTR(ret); - } - - subreq = subreq->wb_this_page; - } while (subreq != head); + /* Set the head request's range to cover the former page group */ + head->wb_pgbase = pgbase; + head->wb_bytes = bytes; + head->wb_offset = off; /* Now that all requests are locked, make sure they aren't on any list. * Commit list removal accounting is done after locks are dropped */ subreq = head; do { - nfs_clear_request_commit(subreq); + nfs_clear_request_commit(cinfo, subreq); subreq = subreq->wb_this_page; } while (subreq != head); @@ -567,81 +420,186 @@ try_again: /* destroy list will be terminated by head */ destroy_list = head->wb_this_page; head->wb_this_page = head; + } + + nfs_destroy_unlinked_subrequests(destroy_list, head, inode); +} + +/** + * nfs_wait_on_request - Wait for a request to complete. + * @req: request to wait upon. + * + * Interruptible by fatal signals only. + * The user is responsible for holding a count on the request. + */ +static int nfs_wait_on_request(struct nfs_page *req) +{ + if (!test_bit(PG_BUSY, &req->wb_flags)) + return 0; + set_bit(PG_CONTENDED2, &req->wb_flags); + smp_mb__after_atomic(); + return wait_on_bit_io(&req->wb_flags, PG_BUSY, + TASK_UNINTERRUPTIBLE); +} + +/* + * nfs_unroll_locks - unlock all newly locked reqs and wait on @req + * @head: head request of page group, must be holding head lock + * @req: request that couldn't lock and needs to wait on the req bit lock + * + * This is a helper function for nfs_lock_and_join_requests + * returns 0 on success, < 0 on error. + */ +static void +nfs_unroll_locks(struct nfs_page *head, struct nfs_page *req) +{ + struct nfs_page *tmp; - /* change head request to cover whole range that - * the former page group covered */ - head->wb_bytes = total_bytes; + /* relinquish all the locks successfully grabbed this run */ + for (tmp = head->wb_this_page ; tmp != req; tmp = tmp->wb_this_page) { + if (!kref_read(&tmp->wb_kref)) + continue; + nfs_unlock_and_release_request(tmp); } +} - /* - * prepare head request to be added to new pgio descriptor - */ - nfs_page_group_clear_bits(head); +/* + * nfs_page_group_lock_subreq - try to lock a subrequest + * @head: head request of page group + * @subreq: request to lock + * + * This is a helper function for nfs_lock_and_join_requests which + * must be called with the head request and page group both locked. + * On error, it returns with the page group unlocked. + */ +static int +nfs_page_group_lock_subreq(struct nfs_page *head, struct nfs_page *subreq) +{ + int ret; + + if (!kref_get_unless_zero(&subreq->wb_kref)) + return 0; + while (!nfs_lock_request(subreq)) { + nfs_page_group_unlock(head); + ret = nfs_wait_on_request(subreq); + if (!ret) + ret = nfs_page_group_lock(head); + if (ret < 0) { + nfs_unroll_locks(head, subreq); + nfs_release_request(subreq); + return ret; + } + } + return 0; +} + +/* + * nfs_lock_and_join_requests - join all subreqs to the head req + * @folio: the folio used to lookup the "page group" of nfs_page structures + * + * This function joins all sub requests to the head request by first + * locking all requests in the group, cancelling any pending operations + * and finally updating the head request to cover the whole range covered by + * the (former) group. All subrequests are removed from any write or commit + * lists, unlinked from the group and destroyed. + * + * Returns a locked, referenced pointer to the head request - which after + * this call is guaranteed to be the only request associated with the page. + * Returns NULL if no requests are found for @folio, or a ERR_PTR if an + * error was encountered. + */ +static struct nfs_page *nfs_lock_and_join_requests(struct folio *folio) +{ + struct inode *inode = folio->mapping->host; + struct nfs_page *head, *subreq; + struct nfs_commit_info cinfo; + int ret; /* - * some part of the group was still on the inode list - otherwise - * the group wouldn't be involved in async write. - * grab a reference for the head request, iff it needs one. + * A reference is taken only on the head request which acts as a + * reference to the whole page group - the group will not be destroyed + * until the head reference is released. */ - if (!test_and_set_bit(PG_INODE_REF, &head->wb_flags)) - kref_get(&head->wb_kref); +retry: + head = nfs_folio_find_head_request(folio); + if (!head) + return NULL; - nfs_page_group_unlock(head); + while (!nfs_lock_request(head)) { + ret = nfs_wait_on_request(head); + if (ret < 0) { + nfs_release_request(head); + return ERR_PTR(ret); + } + } - /* drop lock to clean uprequests on destroy list */ - spin_unlock(&inode->i_lock); + ret = nfs_page_group_lock(head); + if (ret < 0) + goto out_unlock; + + /* Ensure that nobody removed the request before we locked it */ + if (head != folio->private) { + nfs_page_group_unlock(head); + nfs_unlock_and_release_request(head); + goto retry; + } - nfs_destroy_unlinked_subrequests(destroy_list, head); + nfs_cancel_remove_inode(head, inode); - /* still holds ref on head from nfs_page_find_head_request_locked - * and still has lock on head from lock loop */ + /* lock each request in the page group */ + for (subreq = head->wb_this_page; + subreq != head; + subreq = subreq->wb_this_page) { + ret = nfs_page_group_lock_subreq(head, subreq); + if (ret < 0) + goto out_unlock; + } + + nfs_page_group_unlock(head); + + nfs_init_cinfo_from_inode(&cinfo, inode); + nfs_join_page_group(head, &cinfo, inode); return head; -} -static void nfs_write_error_remove_page(struct nfs_page *req) -{ - nfs_unlock_request(req); - nfs_end_page_writeback(req); - generic_error_remove_page(page_file_mapping(req->wb_page), - req->wb_page); - nfs_release_request(req); +out_unlock: + nfs_unlock_and_release_request(head); + return ERR_PTR(ret); } -static bool -nfs_error_is_fatal_on_server(int err) +static void nfs_write_error(struct nfs_page *req, int error) { - switch (err) { - case 0: - case -ERESTARTSYS: - case -EINTR: - return false; - } - return nfs_error_is_fatal(err); + trace_nfs_write_error(nfs_page_to_inode(req), req, error); + nfs_mapping_set_error(nfs_page_to_folio(req), error); + nfs_inode_remove_request(req); + nfs_page_end_writeback(req); + nfs_release_request(req); } /* * Find an associated nfs write request, and prepare to flush it out * May return an error if the user signalled nfs_wait_on_request(). */ -static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio, - struct page *page, bool nonblock) +static int nfs_do_writepage(struct folio *folio, struct writeback_control *wbc, + struct nfs_pageio_descriptor *pgio) { struct nfs_page *req; - int ret = 0; + int ret; + + nfs_pageio_cond_complete(pgio, folio->index); - req = nfs_lock_and_join_requests(page, nonblock); + req = nfs_lock_and_join_requests(folio); if (!req) - goto out; - ret = PTR_ERR(req); + return 0; if (IS_ERR(req)) - goto out; + return PTR_ERR(req); - nfs_set_page_writeback(page); + trace_nfs_do_writepage(req); + nfs_folio_set_writeback(folio); WARN_ON_ONCE(test_bit(PG_CLEAN, &req->wb_flags)); - ret = 0; /* If there is a fatal error that covers this write, just exit */ - if (nfs_error_is_fatal_on_server(req->wb_context->error)) + ret = pgio->pg_error; + if (nfs_error_is_fatal_on_server(ret)) goto out_launder; if (!nfs_pageio_add_request(pgio, req)) { @@ -649,75 +607,39 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio, /* * Remove the problematic req upon fatal errors on the server */ - if (nfs_error_is_fatal(ret)) { - nfs_context_set_write_error(req->wb_context, ret); - if (nfs_error_is_fatal_on_server(ret)) - goto out_launder; - } + if (nfs_error_is_fatal_on_server(ret)) + goto out_launder; + folio_redirty_for_writepage(wbc, folio); nfs_redirty_request(req); - ret = -EAGAIN; - } else - nfs_add_stats(page_file_mapping(page)->host, - NFSIOS_WRITEPAGES, 1); -out: - return ret; -out_launder: - nfs_write_error_remove_page(req); - return ret; -} + pgio->pg_error = 0; + return ret; + } -static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, - struct nfs_pageio_descriptor *pgio) -{ - int ret; + nfs_add_stats(folio->mapping->host, NFSIOS_WRITEPAGES, 1); + return 0; - nfs_pageio_cond_complete(pgio, page_index(page)); - ret = nfs_page_async_flush(pgio, page, wbc->sync_mode == WB_SYNC_NONE); - if (ret == -EAGAIN) { - redirty_page_for_writepage(wbc, page); - ret = 0; - } - return ret; +out_launder: + nfs_write_error(req, ret); + return 0; } /* * Write an mmapped page to the server. */ -static int nfs_writepage_locked(struct page *page, +static int nfs_writepage_locked(struct folio *folio, struct writeback_control *wbc) { struct nfs_pageio_descriptor pgio; - struct inode *inode = page_file_mapping(page)->host; + struct inode *inode = folio->mapping->host; int err; nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGE); - nfs_pageio_init_write(&pgio, inode, 0, - false, &nfs_async_write_completion_ops); - err = nfs_do_writepage(page, wbc, &pgio); + nfs_pageio_init_write(&pgio, inode, 0, false, + &nfs_async_write_completion_ops); + err = nfs_do_writepage(folio, wbc, &pgio); + pgio.pg_error = 0; nfs_pageio_complete(&pgio); - if (err < 0) - return err; - if (pgio.pg_error < 0) - return pgio.pg_error; - return 0; -} - -int nfs_writepage(struct page *page, struct writeback_control *wbc) -{ - int ret; - - ret = nfs_writepage_locked(page, wbc); - unlock_page(page); - return ret; -} - -static int nfs_writepages_callback(struct page *page, struct writeback_control *wbc, void *data) -{ - int ret; - - ret = nfs_do_writepage(page, wbc, data); - unlock_page(page); - return ret; + return err; } static void nfs_io_completion_commit(void *inode) @@ -729,64 +651,82 @@ int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc) { struct inode *inode = mapping->host; struct nfs_pageio_descriptor pgio; - struct nfs_io_completion *ioc = nfs_io_completion_alloc(GFP_NOFS); + struct nfs_io_completion *ioc = NULL; + unsigned int mntflags = NFS_SERVER(inode)->flags; + struct nfs_server *nfss = NFS_SERVER(inode); + int priority = 0; int err; + trace_nfs_writepages(inode, wbc->range_start, wbc->range_end - wbc->range_start); + + /* Wait with writeback until write congestion eases */ + if (wbc->sync_mode == WB_SYNC_NONE && nfss->write_congested) { + err = wait_event_killable(nfss->write_congestion_wait, + nfss->write_congested == 0); + if (err) + goto out_err; + } + nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGES); - if (ioc) - nfs_io_completion_init(ioc, nfs_io_completion_commit, inode); + if (!(mntflags & NFS_MOUNT_WRITE_EAGER) || wbc->for_kupdate || + wbc->for_background || wbc->for_sync) { + ioc = nfs_io_completion_alloc(GFP_KERNEL); + if (ioc) + nfs_io_completion_init(ioc, nfs_io_completion_commit, + inode); + priority = wb_priority(wbc); + } - nfs_pageio_init_write(&pgio, inode, wb_priority(wbc), false, - &nfs_async_write_completion_ops); - pgio.pg_io_completion = ioc; - err = write_cache_pages(mapping, wbc, nfs_writepages_callback, &pgio); - nfs_pageio_complete(&pgio); + do { + struct folio *folio = NULL; + + nfs_pageio_init_write(&pgio, inode, priority, false, + &nfs_async_write_completion_ops); + pgio.pg_io_completion = ioc; + while ((folio = writeback_iter(mapping, wbc, folio, &err))) { + err = nfs_do_writepage(folio, wbc, &pgio); + folio_unlock(folio); + } + pgio.pg_error = 0; + nfs_pageio_complete(&pgio); + if (err == -EAGAIN && mntflags & NFS_MOUNT_SOFTERR) + break; + } while (err < 0 && !nfs_error_is_fatal(err)); nfs_io_completion_put(ioc); - if (err < 0) - goto out_err; - err = pgio.pg_error; - if (err < 0) - goto out_err; - return 0; + if (err > 0) + err = 0; out_err: + trace_nfs_writepages_done(inode, wbc->range_start, wbc->range_end - wbc->range_start, err); return err; } /* * Insert a write request into an inode */ -static void nfs_inode_add_request(struct inode *inode, struct nfs_page *req) +static void nfs_inode_add_request(struct nfs_page *req) { - struct nfs_inode *nfsi = NFS_I(inode); + struct folio *folio = nfs_page_to_folio(req); + struct address_space *mapping = folio->mapping; + struct nfs_inode *nfsi = NFS_I(mapping->host); WARN_ON_ONCE(req->wb_this_page != req); /* Lock the request! */ nfs_lock_request(req); - - spin_lock(&inode->i_lock); - if (!nfsi->nrequests && - NFS_PROTO(inode)->have_delegation(inode, FMODE_WRITE)) - inode->i_version++; - /* - * Swap-space should not get truncated. Hence no need to plug the race - * with invalidate/truncate. - */ - if (likely(!PageSwapCache(req->wb_page))) { - set_bit(PG_MAPPED, &req->wb_flags); - SetPagePrivate(req->wb_page); - set_page_private(req->wb_page, (unsigned long)req); - } - nfsi->nrequests++; + spin_lock(&mapping->i_private_lock); + set_bit(PG_MAPPED, &req->wb_flags); + folio_set_private(folio); + folio->private = req; + spin_unlock(&mapping->i_private_lock); + atomic_long_inc(&nfsi->nrequests); /* this a head request for a page group - mark it as having an * extra reference so sub groups can follow suit. * This flag also informs pgio layer when to bump nrequests when * adding subrequests. */ WARN_ON(test_and_set_bit(PG_INODE_REF, &req->wb_flags)); kref_get(&req->wb_kref); - spin_unlock(&inode->i_lock); } /* @@ -794,68 +734,36 @@ static void nfs_inode_add_request(struct inode *inode, struct nfs_page *req) */ static void nfs_inode_remove_request(struct nfs_page *req) { - struct inode *inode = d_inode(req->wb_context->dentry); - struct nfs_inode *nfsi = NFS_I(inode); - struct nfs_page *head; + struct nfs_inode *nfsi = NFS_I(nfs_page_to_inode(req)); - if (nfs_page_group_sync_on_bit(req, PG_REMOVE)) { - head = req->wb_head; + nfs_page_group_lock(req); + if (nfs_page_group_sync_on_bit_locked(req, PG_REMOVE)) { + struct folio *folio = nfs_page_to_folio(req->wb_head); + struct address_space *mapping = folio->mapping; - spin_lock(&inode->i_lock); - if (likely(head->wb_page && !PageSwapCache(head->wb_page))) { - set_page_private(head->wb_page, 0); - ClearPagePrivate(head->wb_page); - clear_bit(PG_MAPPED, &head->wb_flags); + spin_lock(&mapping->i_private_lock); + if (likely(folio)) { + folio->private = NULL; + folio_clear_private(folio); + clear_bit(PG_MAPPED, &req->wb_head->wb_flags); } - nfsi->nrequests--; - spin_unlock(&inode->i_lock); - } else { - spin_lock(&inode->i_lock); - nfsi->nrequests--; - spin_unlock(&inode->i_lock); + spin_unlock(&mapping->i_private_lock); + + folio_end_dropbehind(folio); } + nfs_page_group_unlock(req); - if (test_and_clear_bit(PG_INODE_REF, &req->wb_flags)) + if (test_and_clear_bit(PG_INODE_REF, &req->wb_flags)) { + atomic_long_dec(&nfsi->nrequests); nfs_release_request(req); + } } -static void -nfs_mark_request_dirty(struct nfs_page *req) -{ - if (req->wb_page) - __set_page_dirty_nobuffers(req->wb_page); -} - -/* - * nfs_page_search_commits_for_head_request_locked - * - * Search through commit lists on @inode for the head request for @page. - * Must be called while holding the inode (which is cinfo) lock. - * - * Returns the head request if found, or NULL if not found. - */ -static struct nfs_page * -nfs_page_search_commits_for_head_request_locked(struct nfs_inode *nfsi, - struct page *page) +static void nfs_mark_request_dirty(struct nfs_page *req) { - struct nfs_page *freq, *t; - struct nfs_commit_info cinfo; - struct inode *inode = &nfsi->vfs_inode; - - nfs_init_cinfo_from_inode(&cinfo, inode); - - /* search through pnfs commit lists */ - freq = pnfs_search_commit_reqs(inode, &cinfo, page); - if (freq) - return freq->wb_head; - - /* Linearly search the commit list for the correct request */ - list_for_each_entry_safe(freq, t, &cinfo.mds->list, wb_list) { - if (freq->wb_page == page) - return freq->wb_head; - } - - return NULL; + struct folio *folio = nfs_page_to_folio(req); + if (folio) + filemap_dirty_folio(folio_mapping(folio), folio); } /** @@ -868,7 +776,8 @@ nfs_page_search_commits_for_head_request_locked(struct nfs_inode *nfsi, * number of outstanding requests requiring a commit as well as * the MM page stats. * - * The caller must hold cinfo->inode->i_lock, and the nfs_page lock. + * The caller must hold NFS_I(cinfo->inode)->commit_mutex, and the + * nfs_page lock. */ void nfs_request_add_commit_list_locked(struct nfs_page *req, struct list_head *dst, @@ -876,14 +785,13 @@ nfs_request_add_commit_list_locked(struct nfs_page *req, struct list_head *dst, { set_bit(PG_CLEAN, &req->wb_flags); nfs_list_add_request(req, dst); - cinfo->mds->ncommit++; + atomic_long_inc(&cinfo->mds->ncommit); } EXPORT_SYMBOL_GPL(nfs_request_add_commit_list_locked); /** * nfs_request_add_commit_list - add request to a commit list * @req: pointer to a struct nfs_page - * @dst: commit list head * @cinfo: holds list lock and accounting info * * This sets the PG_CLEAN bit, updates the cinfo count of @@ -896,11 +804,10 @@ EXPORT_SYMBOL_GPL(nfs_request_add_commit_list_locked); void nfs_request_add_commit_list(struct nfs_page *req, struct nfs_commit_info *cinfo) { - spin_lock(&cinfo->inode->i_lock); + mutex_lock(&NFS_I(cinfo->inode)->commit_mutex); nfs_request_add_commit_list_locked(req, &cinfo->mds->list, cinfo); - spin_unlock(&cinfo->inode->i_lock); - if (req->wb_page) - nfs_mark_page_unstable(req->wb_page, cinfo); + mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex); + nfs_folio_mark_unstable(nfs_page_to_folio(req), cinfo); } EXPORT_SYMBOL_GPL(nfs_request_add_commit_list); @@ -922,7 +829,7 @@ nfs_request_remove_commit_list(struct nfs_page *req, if (!test_and_clear_bit(PG_CLEAN, &(req)->wb_flags)) return; nfs_list_remove_request(req); - cinfo->mds->ncommit--; + atomic_long_dec(&cinfo->mds->ncommit); } EXPORT_SYMBOL_GPL(nfs_request_remove_commit_list); @@ -959,27 +866,31 @@ nfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg, nfs_request_add_commit_list(req, cinfo); } -static void -nfs_clear_page_commit(struct page *page) +static void nfs_folio_clear_commit(struct folio *folio) { - dec_node_page_state(page, NR_UNSTABLE_NFS); - dec_wb_stat(&inode_to_bdi(page_file_mapping(page)->host)->wb, - WB_RECLAIMABLE); + if (folio) { + long nr = folio_nr_pages(folio); + + node_stat_mod_folio(folio, NR_WRITEBACK, -nr); + wb_stat_mod(&inode_to_bdi(folio->mapping->host)->wb, + WB_WRITEBACK, -nr); + } } -/* Called holding inode (/cinfo) lock */ -static void -nfs_clear_request_commit(struct nfs_page *req) +/* Called holding the request lock on @req */ +static void nfs_clear_request_commit(struct nfs_commit_info *cinfo, + struct nfs_page *req) { if (test_bit(PG_CLEAN, &req->wb_flags)) { - struct inode *inode = d_inode(req->wb_context->dentry); - struct nfs_commit_info cinfo; + struct nfs_open_context *ctx = nfs_req_openctx(req); + struct inode *inode = d_inode(ctx->dentry); - nfs_init_cinfo_from_inode(&cinfo, inode); - if (!pnfs_clear_request_commit(req, &cinfo)) { - nfs_request_remove_commit_list(req, &cinfo); + mutex_lock(&NFS_I(inode)->commit_mutex); + if (!pnfs_clear_request_commit(req, cinfo)) { + nfs_request_remove_commit_list(req, cinfo); } - nfs_clear_page_commit(req->wb_page); + mutex_unlock(&NFS_I(inode)->commit_mutex); + nfs_folio_clear_commit(nfs_page_to_folio(req)); } } @@ -1010,21 +921,23 @@ static void nfs_write_completion(struct nfs_pgio_header *hdr) nfs_list_remove_request(req); if (test_bit(NFS_IOHDR_ERROR, &hdr->flags) && (hdr->good_bytes < bytes)) { - nfs_set_pageerror(req->wb_page); - nfs_context_set_write_error(req->wb_context, hdr->error); + trace_nfs_comp_error(hdr->inode, req, hdr->error); + nfs_mapping_set_error(nfs_page_to_folio(req), + hdr->error); goto remove_req; } if (nfs_write_need_commit(hdr)) { + /* Reset wb_nio, since the write was successful. */ + req->wb_nio = 0; memcpy(&req->wb_verf, &hdr->verf.verifier, sizeof(req->wb_verf)); nfs_mark_request_commit(req, hdr->lseg, &cinfo, - hdr->pgio_mirror_idx); + hdr->ds_commit_idx); goto next; } remove_req: nfs_inode_remove_request(req); next: - nfs_unlock_request(req); - nfs_end_page_writeback(req); + nfs_page_end_writeback(req); nfs_release_request(req); } out: @@ -1035,10 +948,10 @@ out: unsigned long nfs_reqs_to_commit(struct nfs_commit_info *cinfo) { - return cinfo->mds->ncommit; + return atomic_long_read(&cinfo->mds->ncommit); } -/* cinfo->inode->i_lock held by caller */ +/* NFS_I(cinfo->inode)->commit_mutex held by caller */ int nfs_scan_commit_list(struct list_head *src, struct list_head *dst, struct nfs_commit_info *cinfo, int max) @@ -1047,19 +960,22 @@ nfs_scan_commit_list(struct list_head *src, struct list_head *dst, int ret = 0; list_for_each_entry_safe(req, tmp, src, wb_list) { - if (!nfs_lock_request(req)) - continue; kref_get(&req->wb_kref); - if (cond_resched_lock(&cinfo->inode->i_lock)) - list_safe_reset_next(req, tmp, wb_list); + if (!nfs_lock_request(req)) { + nfs_release_request(req); + continue; + } nfs_request_remove_commit_list(req, cinfo); + clear_bit(PG_COMMIT_TO_DS, &req->wb_flags); nfs_list_add_request(req, dst); ret++; if ((ret == max) && !cinfo->dreq) break; + cond_resched(); } return ret; } +EXPORT_SYMBOL_GPL(nfs_scan_commit_list); /* * nfs_scan_commit - Scan an inode for commit requests @@ -1076,15 +992,17 @@ nfs_scan_commit(struct inode *inode, struct list_head *dst, { int ret = 0; - spin_lock(&cinfo->inode->i_lock); - if (cinfo->mds->ncommit > 0) { + if (!atomic_long_read(&cinfo->mds->ncommit)) + return 0; + mutex_lock(&NFS_I(cinfo->inode)->commit_mutex); + if (atomic_long_read(&cinfo->mds->ncommit) > 0) { const int max = INT_MAX; ret = nfs_scan_commit_list(&cinfo->mds->list, dst, cinfo, max); ret += pnfs_scan_commit_lists(inode, cinfo, max - ret); } - spin_unlock(&cinfo->inode->i_lock); + mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex); return ret; } @@ -1095,53 +1013,31 @@ nfs_scan_commit(struct inode *inode, struct list_head *dst, * If the attempt fails, then the existing request is flushed out * to disk. */ -static struct nfs_page *nfs_try_to_update_request(struct inode *inode, - struct page *page, - unsigned int offset, - unsigned int bytes) +static struct nfs_page *nfs_try_to_update_request(struct folio *folio, + unsigned int offset, + unsigned int bytes) { struct nfs_page *req; unsigned int rqend; unsigned int end; int error; - if (!PagePrivate(page)) - return NULL; - + trace_nfs_try_to_update_request(folio_inode(folio), offset, bytes); end = offset + bytes; - spin_lock(&inode->i_lock); - - for (;;) { - req = nfs_page_find_head_request_locked(NFS_I(inode), page); - if (req == NULL) - goto out_unlock; - - /* should be handled by nfs_flush_incompatible */ - WARN_ON_ONCE(req->wb_head != req); - WARN_ON_ONCE(req->wb_this_page != req); - rqend = req->wb_offset + req->wb_bytes; - /* - * Tell the caller to flush out the request if - * the offsets are non-contiguous. - * Note: nfs_flush_incompatible() will already - * have flushed out requests having wrong owners. - */ - if (offset > rqend - || end < req->wb_offset) - goto out_flushme; - - if (nfs_lock_request(req)) - break; + req = nfs_lock_and_join_requests(folio); + if (IS_ERR_OR_NULL(req)) + goto out; - /* The request is locked, so wait and then retry */ - spin_unlock(&inode->i_lock); - error = nfs_wait_on_request(req); - nfs_release_request(req); - if (error != 0) - goto out_err; - spin_lock(&inode->i_lock); - } + rqend = req->wb_offset + req->wb_bytes; + /* + * Tell the caller to flush out the request if + * the offsets are non-contiguous. + * Note: nfs_flush_incompatible() will already + * have flushed out requests having wrong owners. + */ + if (offset > rqend || end < req->wb_offset) + goto out_flushme; /* Okay, the request matches. Update the region */ if (offset < req->wb_offset) { @@ -1152,17 +1048,22 @@ static struct nfs_page *nfs_try_to_update_request(struct inode *inode, req->wb_bytes = end - req->wb_offset; else req->wb_bytes = rqend - req->wb_offset; -out_unlock: - if (req) - nfs_clear_request_commit(req); - spin_unlock(&inode->i_lock); + req->wb_nio = 0; +out: + trace_nfs_try_to_update_request_done(folio_inode(folio), offset, bytes, + PTR_ERR_OR_ZERO(req)); return req; out_flushme: - spin_unlock(&inode->i_lock); - nfs_release_request(req); - error = nfs_wb_page(inode, page); -out_err: - return ERR_PTR(error); + /* + * Note: we mark the request dirty here because + * nfs_lock_and_join_requests() cannot preserve + * commit flags, so we have to replay the write. + */ + nfs_mark_request_dirty(req); + nfs_unlock_and_release_request(req); + error = nfs_wb_folio(folio->mapping->host, folio); + trace_nfs_try_to_update_request_done(folio_inode(folio), offset, bytes, error); + return (error < 0) ? ERR_PTR(error) : NULL; } /* @@ -1172,44 +1073,47 @@ out_err: * if we have to add a new request. Also assumes that the caller has * already called nfs_flush_incompatible() if necessary. */ -static struct nfs_page * nfs_setup_write_request(struct nfs_open_context* ctx, - struct page *page, unsigned int offset, unsigned int bytes) +static struct nfs_page *nfs_setup_write_request(struct nfs_open_context *ctx, + struct folio *folio, + unsigned int offset, + unsigned int bytes) { - struct inode *inode = page_file_mapping(page)->host; - struct nfs_page *req; + struct nfs_page *req; - req = nfs_try_to_update_request(inode, page, offset, bytes); + req = nfs_try_to_update_request(folio, offset, bytes); if (req != NULL) goto out; - req = nfs_create_request(ctx, page, NULL, offset, bytes); + req = nfs_page_create_from_folio(ctx, folio, offset, bytes); if (IS_ERR(req)) goto out; - nfs_inode_add_request(inode, req); + nfs_inode_add_request(req); out: return req; } -static int nfs_writepage_setup(struct nfs_open_context *ctx, struct page *page, - unsigned int offset, unsigned int count) +static int nfs_writepage_setup(struct nfs_open_context *ctx, + struct folio *folio, unsigned int offset, + unsigned int count) { - struct nfs_page *req; + struct nfs_page *req; - req = nfs_setup_write_request(ctx, page, offset, count); + req = nfs_setup_write_request(ctx, folio, offset, count); if (IS_ERR(req)) return PTR_ERR(req); + trace_nfs_writepage_setup(req); /* Update file length */ - nfs_grow_file(page, offset, count); + nfs_grow_file(folio, offset, count); nfs_mark_uptodate(req); nfs_mark_request_dirty(req); nfs_unlock_and_release_request(req); return 0; } -int nfs_flush_incompatible(struct file *file, struct page *page) +int nfs_flush_incompatible(struct file *file, struct folio *folio) { struct nfs_open_context *ctx = nfs_file_open_context(file); struct nfs_lock_context *l_ctx; - struct file_lock_context *flctx = file_inode(file)->i_flctx; + struct file_lock_context *flctx = locks_inode_context(file_inode(file)); struct nfs_page *req; int do_flush, status; /* @@ -1221,14 +1125,12 @@ int nfs_flush_incompatible(struct file *file, struct page *page) * dropped page. */ do { - req = nfs_page_find_head_request(page); + req = nfs_folio_find_head_request(folio); if (req == NULL) return 0; l_ctx = req->wb_lock_context; - do_flush = req->wb_page != page || - !nfs_match_open_context(req->wb_context, ctx); - /* for now, flush if more than 1 request in page_group */ - do_flush |= req->wb_this_page != req; + do_flush = nfs_page_to_folio(req) != folio || + !nfs_match_open_context(nfs_req_openctx(req), ctx); if (l_ctx && flctx && !(list_empty_careful(&flctx->flc_posix) && list_empty_careful(&flctx->flc_flock))) { @@ -1237,7 +1139,7 @@ int nfs_flush_incompatible(struct file *file, struct page *page) nfs_release_request(req); if (!do_flush) return 0; - status = nfs_wb_page(page_file_mapping(page)->host, page); + status = nfs_wb_folio(folio->mapping->host, folio); } while (status == 0); return status; } @@ -1256,9 +1158,12 @@ int nfs_key_timeout_notify(struct file *filp, struct inode *inode) { struct nfs_open_context *ctx = nfs_file_open_context(filp); - struct rpc_auth *auth = NFS_SERVER(inode)->client->cl_auth; - return rpcauth_key_timeout_notify(auth, ctx->cred); + if (nfs_ctx_key_to_expire(ctx, inode) && + !rcu_access_pointer(ctx->ll_cred)) + /* Already expired! */ + return -EACCES; + return 0; } /* @@ -1267,8 +1172,38 @@ nfs_key_timeout_notify(struct file *filp, struct inode *inode) bool nfs_ctx_key_to_expire(struct nfs_open_context *ctx, struct inode *inode) { struct rpc_auth *auth = NFS_SERVER(inode)->client->cl_auth; + struct rpc_cred *cred, *new, *old = NULL; + struct auth_cred acred = { + .cred = ctx->cred, + }; + bool ret = false; + + rcu_read_lock(); + cred = rcu_dereference(ctx->ll_cred); + if (cred && !(cred->cr_ops->crkey_timeout && + cred->cr_ops->crkey_timeout(cred))) + goto out; + rcu_read_unlock(); - return rpcauth_cred_key_to_expire(auth, ctx->cred); + new = auth->au_ops->lookup_cred(auth, &acred, 0); + if (new == cred) { + put_rpccred(new); + return true; + } + if (IS_ERR_OR_NULL(new)) { + new = NULL; + ret = true; + } else if (new->cr_ops->crkey_timeout && + new->cr_ops->crkey_timeout(new)) + ret = true; + + rcu_read_lock(); + old = rcu_dereference_protected(xchg(&ctx->ll_cred, + RCU_INITIALIZER(new)), 1); +out: + rcu_read_unlock(); + put_rpccred(old); + return ret; } /* @@ -1276,28 +1211,30 @@ bool nfs_ctx_key_to_expire(struct nfs_open_context *ctx, struct inode *inode) * the PageUptodate() flag. In this case, we will need to turn off * write optimisations that depend on the page contents being correct. */ -static bool nfs_write_pageuptodate(struct page *page, struct inode *inode) +static bool nfs_folio_write_uptodate(struct folio *folio, unsigned int pagelen) { + struct inode *inode = folio->mapping->host; struct nfs_inode *nfsi = NFS_I(inode); if (nfs_have_delegated_attributes(inode)) goto out; - if (nfsi->cache_validity & NFS_INO_REVAL_PAGECACHE) + if (nfsi->cache_validity & + (NFS_INO_INVALID_CHANGE | NFS_INO_INVALID_SIZE)) return false; smp_rmb(); - if (test_bit(NFS_INO_INVALIDATING, &nfsi->flags)) + if (test_bit(NFS_INO_INVALIDATING, &nfsi->flags) && pagelen != 0) return false; out: - if (nfsi->cache_validity & NFS_INO_INVALID_DATA) + if (nfsi->cache_validity & NFS_INO_INVALID_DATA && pagelen != 0) return false; - return PageUptodate(page) != 0; + return folio_test_uptodate(folio) != 0; } static bool is_whole_file_wrlock(struct file_lock *fl) { return fl->fl_start == 0 && fl->fl_end == OFFSET_MAX && - fl->fl_type == F_WRLCK; + lock_is_write(fl); } /* If we know the page is up to date, and we're not using byte range locks (or @@ -1308,17 +1245,22 @@ is_whole_file_wrlock(struct file_lock *fl) * If the file is opened for synchronous writes then we can just skip the rest * of the checks. */ -static int nfs_can_extend_write(struct file *file, struct page *page, struct inode *inode) +static int nfs_can_extend_write(struct file *file, struct folio *folio, + unsigned int pagelen) { - int ret; - struct file_lock_context *flctx = inode->i_flctx; + struct inode *inode = file_inode(file); + struct file_lock_context *flctx = locks_inode_context(inode); struct file_lock *fl; + int ret; + unsigned int mntflags = NFS_SERVER(inode)->flags; + if (mntflags & NFS_MOUNT_NO_ALIGNWRITE) + return 0; if (file->f_flags & O_DSYNC) return 0; - if (!nfs_write_pageuptodate(page, inode)) + if (!nfs_folio_write_uptodate(folio, pagelen)) return 0; - if (NFS_PROTO(inode)->have_delegation(inode, FMODE_WRITE)) + if (nfs_have_write_delegation(inode)) return 1; if (!flctx || (list_empty_careful(&flctx->flc_flock) && list_empty_careful(&flctx->flc_posix))) @@ -1329,13 +1271,13 @@ static int nfs_can_extend_write(struct file *file, struct page *page, struct ino spin_lock(&flctx->flc_lock); if (!list_empty(&flctx->flc_posix)) { fl = list_first_entry(&flctx->flc_posix, struct file_lock, - fl_list); + c.flc_list); if (is_whole_file_wrlock(fl)) ret = 1; } else if (!list_empty(&flctx->flc_flock)) { fl = list_first_entry(&flctx->flc_flock, struct file_lock, - fl_list); - if (fl->fl_type == F_WRLCK) + c.flc_list); + if (lock_is_write(fl)) ret = 1; } spin_unlock(&flctx->flc_lock); @@ -1348,33 +1290,40 @@ static int nfs_can_extend_write(struct file *file, struct page *page, struct ino * XXX: Keep an eye on generic_file_read to make sure it doesn't do bad * things with a page scheduled for an RPC call (e.g. invalidate it). */ -int nfs_updatepage(struct file *file, struct page *page, - unsigned int offset, unsigned int count) +int nfs_update_folio(struct file *file, struct folio *folio, + unsigned int offset, unsigned int count) { struct nfs_open_context *ctx = nfs_file_open_context(file); - struct inode *inode = page_file_mapping(page)->host; + struct address_space *mapping = folio->mapping; + struct inode *inode = mapping->host; + unsigned int pagelen = nfs_folio_length(folio); int status = 0; nfs_inc_stats(inode, NFSIOS_VFSUPDATEPAGE); - dprintk("NFS: nfs_updatepage(%pD2 %d@%lld)\n", - file, count, (long long)(page_file_offset(page) + offset)); + trace_nfs_update_folio(inode, offset, count); + + dprintk("NFS: nfs_update_folio(%pD2 %d@%lld)\n", file, count, + (long long)(folio_pos(folio) + offset)); if (!count) goto out; - if (nfs_can_extend_write(file, page, inode)) { - count = max(count + offset, nfs_page_length(page)); - offset = 0; + if (nfs_can_extend_write(file, folio, pagelen)) { + unsigned int end = count + offset; + + offset = round_down(offset, PAGE_SIZE); + if (end < pagelen) + end = min(round_up(end, PAGE_SIZE), pagelen); + count = end - offset; } - status = nfs_writepage_setup(ctx, page, offset, count); + status = nfs_writepage_setup(ctx, folio, offset, count); if (status < 0) - nfs_set_pageerror(page); - else - __set_page_dirty_nobuffers(page); + nfs_set_pageerror(mapping); out: - dprintk("NFS: nfs_updatepage returns %d (isize %lld)\n", + trace_nfs_update_folio_done(inode, offset, count, status); + dprintk("NFS: nfs_update_folio returns %d (isize %lld)\n", status, (long long)i_size_read(inode)); return status; } @@ -1397,11 +1346,11 @@ static void nfs_initiate_write(struct nfs_pgio_header *hdr, { int priority = flush_task_priority(how); + if (IS_SWAPFILE(hdr->inode)) + task_setup_data->flags |= RPC_TASK_SWAPPER; task_setup_data->priority = priority; - rpc_ops->write_setup(hdr, msg); - - nfs4_state_protect_write(NFS_SERVER(hdr->inode)->nfs_client, - &task_setup_data->rpc_client, msg, hdr); + rpc_ops->write_setup(hdr, msg, &task_setup_data->rpc_client); + trace_nfs_initiate_write(hdr); } /* If a nfs_flush_* function fails, it should remove reqs from @head and @@ -1410,27 +1359,33 @@ static void nfs_initiate_write(struct nfs_pgio_header *hdr, */ static void nfs_redirty_request(struct nfs_page *req) { + struct nfs_inode *nfsi = NFS_I(nfs_page_to_inode(req)); + + /* Bump the transmission count */ + req->wb_nio++; nfs_mark_request_dirty(req); - set_bit(NFS_CONTEXT_RESEND_WRITES, &req->wb_context->flags); - nfs_unlock_request(req); - nfs_end_page_writeback(req); + atomic_long_inc(&nfsi->redirtied_pages); + nfs_page_end_writeback(req); nfs_release_request(req); } -static void nfs_async_write_error(struct list_head *head) +static void nfs_async_write_error(struct list_head *head, int error) { struct nfs_page *req; while (!list_empty(head)) { req = nfs_list_entry(head->next); nfs_list_remove_request(req); - nfs_redirty_request(req); + if (nfs_error_is_fatal_on_server(error)) + nfs_write_error(req, error); + else + nfs_redirty_request(req); } } static void nfs_async_write_reschedule_io(struct nfs_pgio_header *hdr) { - nfs_async_write_error(&hdr->pages); + nfs_async_write_error(&hdr->pages, 0); } static const struct nfs_pgio_completion_ops nfs_async_write_completion_ops = { @@ -1452,7 +1407,7 @@ void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, pg_ops = server->pnfs_curr_ld->pg_write_ops; #endif nfs_pageio_init(pgio, inode, pg_ops, compl_ops, &nfs_rw_write_ops, - server->wsize, ioflags, GFP_NOIO); + server->wsize, ioflags); } EXPORT_SYMBOL_GPL(nfs_pageio_init_write); @@ -1480,31 +1435,6 @@ void nfs_commit_prepare(struct rpc_task *task, void *calldata) NFS_PROTO(data->inode)->commit_rpc_prepare(task, data); } -/* - * Special version of should_remove_suid() that ignores capabilities. - */ -static int nfs_should_remove_suid(const struct inode *inode) -{ - umode_t mode = inode->i_mode; - int kill = 0; - - /* suid always must be killed */ - if (unlikely(mode & S_ISUID)) - kill = ATTR_KILL_SUID; - - /* - * sgid without any exec bits is just a mandatory locking mark; leave - * it alone. If some exec bits are set, it's a real sgid; kill it. - */ - if (unlikely((mode & S_ISGID) && (mode & S_IXGRP))) - kill |= ATTR_KILL_SGID; - - if (unlikely(kill && S_ISREG(mode))) - return kill; - - return 0; -} - static void nfs_writeback_check_extend(struct nfs_pgio_header *hdr, struct nfs_fattr *fattr) { @@ -1531,6 +1461,13 @@ void nfs_writeback_update_inode(struct nfs_pgio_header *hdr) struct nfs_fattr *fattr = &hdr->fattr; struct inode *inode = hdr->inode; + if (nfs_have_delegated_mtime(inode)) { + spin_lock(&inode->i_lock); + nfs_set_cache_invalid(inode, NFS_INO_INVALID_BLOCKS); + spin_unlock(&inode->i_lock); + return; + } + spin_lock(&inode->i_lock); nfs_writeback_check_extend(hdr, fattr); nfs_post_op_update_inode_force_wcc_locked(inode, fattr); @@ -1557,33 +1494,51 @@ static int nfs_writeback_done(struct rpc_task *task, status = NFS_PROTO(inode)->write_done(task, hdr); if (status != 0) return status; + nfs_add_stats(inode, NFSIOS_SERVERWRITTENBYTES, hdr->res.count); + trace_nfs_writeback_done(task, hdr); - if (hdr->res.verf->committed < hdr->args.stable && - task->tk_status >= 0) { - /* We tried a write call, but the server did not - * commit data to stable storage even though we - * requested it. - * Note: There is a known bug in Tru64 < 5.0 in which - * the server reports NFS_DATA_SYNC, but performs - * NFS_FILE_SYNC. We therefore implement this checking - * as a dprintk() in order to avoid filling syslog. - */ - static unsigned long complain; + if (task->tk_status >= 0) { + enum nfs3_stable_how committed = hdr->res.verf->committed; - /* Note this will print the MDS for a DS write */ - if (time_before(complain, jiffies)) { - dprintk("NFS: faulty NFS server %s:" - " (committed = %d) != (stable = %d)\n", - NFS_SERVER(inode)->nfs_client->cl_hostname, - hdr->res.verf->committed, hdr->args.stable); - complain = jiffies + 300 * HZ; + if (committed == NFS_UNSTABLE) { + /* + * We have some uncommitted data on the server at + * this point, so ensure that we keep track of that + * fact irrespective of what later writes do. + */ + set_bit(NFS_IOHDR_UNSTABLE_WRITES, &hdr->flags); + } + + if (committed < hdr->args.stable) { + /* We tried a write call, but the server did not + * commit data to stable storage even though we + * requested it. + * Note: There is a known bug in Tru64 < 5.0 in which + * the server reports NFS_DATA_SYNC, but performs + * NFS_FILE_SYNC. We therefore implement this checking + * as a dprintk() in order to avoid filling syslog. + */ + static unsigned long complain; + + /* Note this will print the MDS for a DS write */ + if (time_before(complain, jiffies)) { + dprintk("NFS: faulty NFS server %s:" + " (committed = %d) != (stable = %d)\n", + NFS_SERVER(inode)->nfs_client->cl_hostname, + committed, hdr->args.stable); + complain = jiffies + 300 * HZ; + } } } /* Deal with the suid/sgid bit corner case */ - if (nfs_should_remove_suid(inode)) - nfs_mark_for_revalidate(inode); + if (nfs_should_remove_suid(inode)) { + spin_lock(&inode->i_lock); + nfs_set_cache_invalid(inode, NFS_INO_INVALID_MODE + | NFS_INO_REVAL_FORCED); + spin_unlock(&inode->i_lock); + } return 0; } @@ -1634,25 +1589,30 @@ static void nfs_writeback_result(struct rpc_task *task, */ argp->stable = NFS_FILE_SYNC; } + resp->count = 0; + resp->verf->committed = 0; rpc_restart_call_prepare(task); } } static int wait_on_commit(struct nfs_mds_commit_info *cinfo) { - return wait_on_atomic_t(&cinfo->rpcs_out, - nfs_wait_atomic_killable, TASK_KILLABLE); + return wait_var_event_killable(&cinfo->rpcs_out, + !atomic_read(&cinfo->rpcs_out)); } -static void nfs_commit_begin(struct nfs_mds_commit_info *cinfo) +void nfs_commit_begin(struct nfs_mds_commit_info *cinfo) { atomic_inc(&cinfo->rpcs_out); } -static void nfs_commit_end(struct nfs_mds_commit_info *cinfo) +bool nfs_commit_end(struct nfs_mds_commit_info *cinfo) { - if (atomic_dec_and_test(&cinfo->rpcs_out)) - wake_up_atomic_t(&cinfo->rpcs_out); + if (atomic_dec_and_test(&cinfo->rpcs_out)) { + wake_up_var(&cinfo->rpcs_out); + return true; + } + return false; } void nfs_commitdata_release(struct nfs_commit_data *data) @@ -1665,7 +1625,8 @@ EXPORT_SYMBOL_GPL(nfs_commitdata_release); int nfs_initiate_commit(struct rpc_clnt *clnt, struct nfs_commit_data *data, const struct nfs_rpc_ops *nfs_ops, const struct rpc_call_ops *call_ops, - int how, int flags) + int how, int flags, + struct nfsd_file *localio) { struct rpc_task *task; int priority = flush_task_priority(how); @@ -1684,13 +1645,18 @@ int nfs_initiate_commit(struct rpc_clnt *clnt, struct nfs_commit_data *data, .flags = RPC_TASK_ASYNC | flags, .priority = priority, }; + + if (nfs_server_capable(data->inode, NFS_CAP_MOVEABLE)) + task_setup_data.flags |= RPC_TASK_MOVEABLE; + /* Set up the initial task struct. */ - nfs_ops->commit_setup(data, &msg); + nfs_ops->commit_setup(data, &msg, &task_setup_data.rpc_client); + trace_nfs_initiate_commit(data); dprintk("NFS: initiated commit call\n"); - nfs4_state_protect(NFS_SERVER(data->inode)->nfs_client, - NFS_SP4_MACH_CRED_COMMIT, &task_setup_data.rpc_client, &msg); + if (localio) + return nfs_local_commit(localio, data, call_ops, how); task = rpc_run_task(&task_setup_data); if (IS_ERR(task)) @@ -1722,16 +1688,22 @@ void nfs_init_commit(struct nfs_commit_data *data, struct pnfs_layout_segment *lseg, struct nfs_commit_info *cinfo) { - struct nfs_page *first = nfs_list_entry(head->next); - struct inode *inode = d_inode(first->wb_context->dentry); + struct nfs_page *first; + struct nfs_open_context *ctx; + struct inode *inode; /* Set up the RPC argument and reply structs * NB: take care not to mess about with data->commit et al. */ - list_splice_init(head, &data->pages); + if (head) + list_splice_init(head, &data->pages); + + first = nfs_list_entry(data->pages.next); + ctx = nfs_req_openctx(first); + inode = d_inode(ctx->dentry); data->inode = inode; - data->cred = first->wb_context->cred; + data->cred = ctx->cred; data->lseg = lseg; /* reference transferred */ /* only set lwb for pnfs commit */ if (lseg) @@ -1744,10 +1716,11 @@ void nfs_init_commit(struct nfs_commit_data *data, /* Note: we always request a commit of the entire inode */ data->args.offset = 0; data->args.count = 0; - data->context = get_nfs_open_context(first->wb_context); + data->context = get_nfs_open_context(ctx); data->res.fattr = &data->fattr; data->res.verf = &data->verf; nfs_fattr_init(&data->fattr); + nfs_commit_begin(cinfo->mds); } EXPORT_SYMBOL_GPL(nfs_init_commit); @@ -1762,18 +1735,18 @@ void nfs_retry_commit(struct list_head *page_list, req = nfs_list_entry(page_list->next); nfs_list_remove_request(req); nfs_mark_request_commit(req, lseg, cinfo, ds_commit_idx); - if (!cinfo->dreq) - nfs_clear_page_commit(req->wb_page); + nfs_folio_clear_commit(nfs_page_to_folio(req)); nfs_unlock_and_release_request(req); } } EXPORT_SYMBOL_GPL(nfs_retry_commit); -static void -nfs_commit_resched_write(struct nfs_commit_info *cinfo, - struct nfs_page *req) +static void nfs_commit_resched_write(struct nfs_commit_info *cinfo, + struct nfs_page *req) { - __set_page_dirty_nobuffers(req->wb_page); + struct folio *folio = nfs_page_to_folio(req); + + filemap_dirty_folio(folio_mapping(folio), folio); } /* @@ -1784,18 +1757,30 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how, struct nfs_commit_info *cinfo) { struct nfs_commit_data *data; + struct nfsd_file *localio; + unsigned short task_flags = 0; /* another commit raced with us */ if (list_empty(head)) return 0; - data = nfs_commitdata_alloc(true); + data = nfs_commitdata_alloc(); + if (!data) { + nfs_retry_commit(head, NULL, cinfo, -1); + return -ENOMEM; + } /* Set up the argument struct */ nfs_init_commit(data, head, NULL, cinfo); - atomic_inc(&cinfo->mds->rpcs_out); + if (NFS_SERVER(inode)->nfs_client->cl_minorversion) + task_flags = RPC_TASK_MOVEABLE; + + localio = nfs_local_open_fh(NFS_SERVER(inode)->nfs_client, data->cred, + data->args.fh, &data->context->nfl, + data->context->mode); return nfs_initiate_commit(NFS_CLIENT(inode), data, NFS_PROTO(inode), - data->mds_ops, how, 0); + data->mds_ops, how, + RPC_TASK_CRED_NOREF | task_flags, localio); } /* @@ -1805,58 +1790,59 @@ static void nfs_commit_done(struct rpc_task *task, void *calldata) { struct nfs_commit_data *data = calldata; - dprintk("NFS: %5u nfs_commit_done (status %d)\n", - task->tk_pid, task->tk_status); - /* Call the NFS version-specific code */ NFS_PROTO(data->inode)->commit_done(task, data); + trace_nfs_commit_done(task, data); } static void nfs_commit_release_pages(struct nfs_commit_data *data) { + const struct nfs_writeverf *verf = data->res.verf; struct nfs_page *req; int status = data->task.tk_status; struct nfs_commit_info cinfo; - struct nfs_server *nfss; + struct folio *folio; while (!list_empty(&data->pages)) { req = nfs_list_entry(data->pages.next); nfs_list_remove_request(req); - if (req->wb_page) - nfs_clear_page_commit(req->wb_page); + folio = nfs_page_to_folio(req); + nfs_folio_clear_commit(folio); dprintk("NFS: commit (%s/%llu %d@%lld)", - req->wb_context->dentry->d_sb->s_id, - (unsigned long long)NFS_FILEID(d_inode(req->wb_context->dentry)), + nfs_req_openctx(req)->dentry->d_sb->s_id, + (unsigned long long)NFS_FILEID(d_inode(nfs_req_openctx(req)->dentry)), req->wb_bytes, (long long)req_offset(req)); if (status < 0) { - nfs_context_set_write_error(req->wb_context, status); - if (req->wb_page) + if (folio) { + trace_nfs_commit_error(data->inode, req, + status); + nfs_mapping_set_error(folio, status); nfs_inode_remove_request(req); - dprintk_cont(", error = %d\n", status); + } + dprintk(", error = %d\n", status); goto next; } /* Okay, COMMIT succeeded, apparently. Check the verifier * returned by the server against all stored verfs. */ - if (!nfs_write_verifier_cmp(&req->wb_verf, &data->verf.verifier)) { + if (nfs_write_match_verf(verf, req)) { /* We have a match */ - if (req->wb_page) + if (folio) nfs_inode_remove_request(req); - dprintk_cont(" OK\n"); + dprintk(" OK\n"); goto next; } /* We have a mismatch. Write the page again */ - dprintk_cont(" mismatch\n"); + dprintk(" mismatch\n"); nfs_mark_request_dirty(req); - set_bit(NFS_CONTEXT_RESEND_WRITES, &req->wb_context->flags); + atomic_long_inc(&NFS_I(data->inode)->redirtied_pages); next: nfs_unlock_and_release_request(req); + /* Latency breaker */ + cond_resched(); } - nfss = NFS_SERVER(data->inode); - if (atomic_long_read(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH) - clear_bdi_congested(inode_to_bdi(data->inode), BLK_RW_ASYNC); nfs_init_cinfo(&cinfo, data->inode, data->dreq); nfs_commit_end(cinfo.mds); @@ -1892,38 +1878,44 @@ int nfs_generic_commit_list(struct inode *inode, struct list_head *head, return status; } -int nfs_commit_inode(struct inode *inode, int how) +static int __nfs_commit_inode(struct inode *inode, int how, + struct writeback_control *wbc) { LIST_HEAD(head); struct nfs_commit_info cinfo; int may_wait = how & FLUSH_SYNC; - int error = 0; - int res; + int ret, nscan; + how &= ~FLUSH_SYNC; nfs_init_cinfo_from_inode(&cinfo, inode); nfs_commit_begin(cinfo.mds); - res = nfs_scan_commit(inode, &head, &cinfo); - if (res) - error = nfs_generic_commit_list(inode, &head, how, &cinfo); + for (;;) { + ret = nscan = nfs_scan_commit(inode, &head, &cinfo); + if (ret <= 0) + break; + ret = nfs_generic_commit_list(inode, &head, how, &cinfo); + if (ret < 0) + break; + ret = 0; + if (wbc && wbc->sync_mode == WB_SYNC_NONE) { + if (nscan < wbc->nr_to_write) + wbc->nr_to_write -= nscan; + else + wbc->nr_to_write = 0; + } + if (nscan < INT_MAX) + break; + cond_resched(); + } nfs_commit_end(cinfo.mds); - if (error < 0) - goto out_error; - if (!may_wait) - goto out_mark_dirty; - error = wait_on_commit(cinfo.mds); - if (error < 0) - return error; - return res; -out_error: - res = error; - /* Note: If we exit without ensuring that the commit is complete, - * we must mark the inode as dirty. Otherwise, future calls to - * sync_inode() with the WB_SYNC_ALL flag set will fail to ensure - * that the data is on the disk. - */ -out_mark_dirty: - __mark_inode_dirty(inode, I_DIRTY_DATASYNC); - return res; + if (ret || !may_wait) + return ret; + return wait_on_commit(cinfo.mds); +} + +int nfs_commit_inode(struct inode *inode, int how) +{ + return __nfs_commit_inode(inode, how, NULL); } EXPORT_SYMBOL_GPL(nfs_commit_inode); @@ -1933,11 +1925,11 @@ int nfs_write_inode(struct inode *inode, struct writeback_control *wbc) int flags = FLUSH_SYNC; int ret = 0; - /* no commits means nothing needs to be done */ - if (!nfsi->commit_info.ncommit) - return ret; - if (wbc->sync_mode == WB_SYNC_NONE) { + /* no commits means nothing needs to be done */ + if (!atomic_long_read(&nfsi->commit_info.ncommit)) + goto check_requests_outstanding; + /* Don't commit yet if this is a non-blocking flush and there * are a lot of outstanding writes for this mapping. */ @@ -1948,16 +1940,16 @@ int nfs_write_inode(struct inode *inode, struct writeback_control *wbc) flags = 0; } - ret = nfs_commit_inode(inode, flags); - if (ret >= 0) { - if (wbc->sync_mode == WB_SYNC_NONE) { - if (ret < wbc->nr_to_write) - wbc->nr_to_write -= ret; - else - wbc->nr_to_write = 0; - } - return 0; - } + ret = __nfs_commit_inode(inode, flags, wbc); + if (!ret) { + if (flags & FLUSH_SYNC) + return 0; + } else if (atomic_long_read(&nfsi->commit_info.ncommit)) + goto out_mark_dirty; + +check_requests_outstanding: + if (!atomic_read(&nfsi->commit_info.rpcs_out)) + return ret; out_mark_dirty: __mark_inode_dirty(inode, I_DIRTY_DATASYNC); return ret; @@ -2006,87 +1998,100 @@ out: } EXPORT_SYMBOL_GPL(nfs_wb_all); -int nfs_wb_page_cancel(struct inode *inode, struct page *page) +int nfs_wb_folio_cancel(struct inode *inode, struct folio *folio) { struct nfs_page *req; int ret = 0; - wait_on_page_writeback(page); + folio_wait_writeback(folio); /* blocking call to cancel all requests and join to a single (head) * request */ - req = nfs_lock_and_join_requests(page, false); + req = nfs_lock_and_join_requests(folio); if (IS_ERR(req)) { ret = PTR_ERR(req); } else if (req) { - /* all requests from this page have been cancelled by + /* all requests from this folio have been cancelled by * nfs_lock_and_join_requests, so just remove the head * request from the inode / page_private pointer and * release it */ nfs_inode_remove_request(req); nfs_unlock_and_release_request(req); + folio_cancel_dirty(folio); } return ret; } -/* - * Write back all requests on one page - we do this before reading it. +/** + * nfs_wb_folio - Write back all requests on one page + * @inode: pointer to page + * @folio: pointer to folio + * + * Assumes that the folio has been locked by the caller, and will + * not unlock it. */ -int nfs_wb_page(struct inode *inode, struct page *page) +int nfs_wb_folio(struct inode *inode, struct folio *folio) { - loff_t range_start = page_file_offset(page); - loff_t range_end = range_start + (loff_t)(PAGE_SIZE - 1); + loff_t range_start = folio_pos(folio); + size_t len = folio_size(folio); struct writeback_control wbc = { .sync_mode = WB_SYNC_ALL, .nr_to_write = 0, .range_start = range_start, - .range_end = range_end, + .range_end = range_start + len - 1, }; int ret; - trace_nfs_writeback_page_enter(inode); + trace_nfs_writeback_folio(inode, range_start, len); for (;;) { - wait_on_page_writeback(page); - if (clear_page_dirty_for_io(page)) { - ret = nfs_writepage_locked(page, &wbc); + folio_wait_writeback(folio); + if (folio_clear_dirty_for_io(folio)) { + ret = nfs_writepage_locked(folio, &wbc); if (ret < 0) goto out_error; continue; } ret = 0; - if (!PagePrivate(page)) + if (!folio_test_private(folio)) break; ret = nfs_commit_inode(inode, FLUSH_SYNC); if (ret < 0) goto out_error; } out_error: - trace_nfs_writeback_page_exit(inode, ret); + trace_nfs_writeback_folio_done(inode, range_start, len, ret); return ret; } #ifdef CONFIG_MIGRATION -int nfs_migrate_page(struct address_space *mapping, struct page *newpage, - struct page *page, enum migrate_mode mode) +int nfs_migrate_folio(struct address_space *mapping, struct folio *dst, + struct folio *src, enum migrate_mode mode) { /* - * If PagePrivate is set, then the page is currently associated with + * If the private flag is set, the folio is currently associated with * an in-progress read or write request. Don't try to migrate it. * * FIXME: we could do this in principle, but we'll need a way to ensure * that we can safely release the inode reference while holding - * the page lock. + * the folio lock. */ - if (PagePrivate(page)) - return -EBUSY; + if (folio_test_private(src)) { + if (mode == MIGRATE_SYNC) + nfs_wb_folio(src->mapping->host, src); + if (folio_test_private(src)) + return -EBUSY; + } - if (!nfs_fscache_release_page(page, GFP_KERNEL)) - return -EBUSY; + if (folio_test_private_2(src)) { /* [DEPRECATED] */ + if (mode == MIGRATE_ASYNC) + return -EBUSY; + folio_wait_private_2(src); + } - return migrate_page(mapping, newpage, page, mode); + return migrate_folio(mapping, dst, src, mode); } #endif @@ -2132,7 +2137,7 @@ int __init nfs_init_writepagecache(void) * This allows larger machines to have larger/more transfers. * Limit the default to 256M */ - nfs_congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10); + nfs_congestion_kb = (16*int_sqrt(totalram_pages())) << (PAGE_SHIFT-10); if (nfs_congestion_kb > 256*1024) nfs_congestion_kb = 256*1024; |
