Diffstat (limited to 'fs/nfsd')
57 files changed, 24912 insertions, 10986 deletions
diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig index 20b1c17320d5..0b5c1a0bf1cf 100644 --- a/fs/nfsd/Kconfig +++ b/fs/nfsd/Kconfig @@ -1,11 +1,18 @@ +# SPDX-License-Identifier: GPL-2.0-only config NFSD tristate "NFS server support" depends on INET depends on FILE_LOCKING + depends on FSNOTIFY + select CRC32 + select CRYPTO_LIB_MD5 if NFSD_LEGACY_CLIENT_TRACKING + select CRYPTO_LIB_SHA256 if NFSD_V4 select LOCKD select SUNRPC select EXPORTFS + select NFS_COMMON select NFS_ACL_SUPPORT if NFSD_V2_ACL + select NFS_ACL_SUPPORT if NFSD_V3_ACL depends on MULTIUSER help Choose Y here if you want to allow other computers to access @@ -24,28 +31,29 @@ config NFSD Below you can choose which versions of the NFS protocol are available to clients mounting the NFS server on this system. - Support for NFS version 2 (RFC 1094) is always available when + Support for NFS version 3 (RFC 1813) is always available when CONFIG_NFSD is selected. If unsure, say N. -config NFSD_V2_ACL - bool - depends on NFSD - -config NFSD_V3 - bool "NFS server support for NFS version 3" +config NFSD_V2 + bool "NFS server support for NFS version 2 (DEPRECATED)" depends on NFSD + default n help - This option enables support in your system's NFS server for - version 3 of the NFS protocol (RFC 1813). + NFSv2 (RFC 1094) was the first publicly-released version of NFS. + Unless you are hosting ancient (1990's era) NFS clients, you don't + need this. - If unsure, say Y. + If unsure, say N. + +config NFSD_V2_ACL + bool "NFS server support for the NFSv2 ACL protocol extension" + depends on NFSD_V2 config NFSD_V3_ACL bool "NFS server support for the NFSv3 ACL protocol extension" - depends on NFSD_V3 - select NFSD_V2_ACL + depends on NFSD help Solaris NFS servers support an auxiliary NFSv3 ACL protocol that never became an official part of the NFS version 3 protocol. @@ -68,11 +76,11 @@ config NFSD_V3_ACL config NFSD_V4 bool "NFS server support for NFS version 4" depends on NFSD && PROC_FS - select NFSD_V3 select FS_POSIX_ACL - select SUNRPC_GSS - select CRYPTO + select RPCSEC_GSS_KRB5 + select CRYPTO # required by RPCSEC_GSS_KRB5 select GRACE_PERIOD + select NFS_V4_2_SSC_HELPER if NFS_V4_2 help This option enables support in your system's NFS server for version 4 of the NFS protocol (RFC 3530). @@ -94,7 +102,7 @@ config NFSD_BLOCKLAYOUT help This option enables support for the exporting pNFS block layouts in the kernel's NFS server. The pNFS block layout enables NFS - clients to directly perform I/O to block devices accesible to both + clients to directly perform I/O to block devices accessible to both the server and the clients. See RFC 5663 for more details. If unsure, say N. @@ -104,11 +112,10 @@ config NFSD_SCSILAYOUT depends on NFSD_V4 && BLOCK select NFSD_PNFS select EXPORTFS_BLOCK_OPS - select BLK_SCSI_REQUEST help This option enables support for the exporting pNFS SCSI layouts in the kernel's NFS server. The pNFS SCSI layout enables NFS - clients to directly perform I/O to SCSI devices accesible to both + clients to directly perform I/O to SCSI devices accessible to both the server and the clients. See draft-ietf-nfsv4-scsi-layout for more details. @@ -122,7 +129,7 @@ config NFSD_FLEXFILELAYOUT This option enables support for the exporting pNFS Flex File layouts in the kernel's NFS server. The pNFS Flex File layout enables NFS clients to directly perform I/O to NFSv3 devices - accesible to both the server and the clients. See + accessible to both the server and the clients. See draft-ietf-nfsv4-flex-files for more details. 
Warning, this server implements the bare minimum functionality @@ -131,6 +138,16 @@ config NFSD_FLEXFILELAYOUT If unsure, say N. +config NFSD_V4_2_INTER_SSC + bool "NFSv4.2 inter server to server COPY" + depends on NFSD_V4 && NFS_V4_2 + help + This option enables support for NFSv4.2 inter server to + server copy where the destination server calls the NFSv4.2 + client to read the data to copy from the source server. + + If unsure, say N. + config NFSD_V4_SECURITY_LABEL bool "Provide Security Label support for NFSv4 server" depends on NFSD_V4 && SECURITY @@ -144,12 +161,28 @@ config NFSD_V4_SECURITY_LABEL If you do not wish to enable fine-grained security labels SELinux or Smack policies on NFSv4 files, say N. -config NFSD_FAULT_INJECTION - bool "NFS server manual fault injection" - depends on NFSD_V4 && DEBUG_KERNEL && DEBUG_FS +config NFSD_LEGACY_CLIENT_TRACKING + bool "Support legacy NFSv4 client tracking methods (DEPRECATED)" + depends on NFSD_V4 + default n help - This option enables support for manually injecting faults - into the NFS server. This is intended to be used for - testing error recovery on the NFS client. - - If unsure, say N. + The NFSv4 server needs to store a small amount of information on + stable storage in order to handle state recovery after reboot. Most + modern deployments upcall to a userland daemon for this (nfsdcld), + but older NFS servers may store information directly in a + recoverydir, or spawn a process directly using a usermodehelper + upcall. + + These legacy client tracking methods have proven to be problematic + and will be removed in the future. Say Y here if you need support + for them in the interim. + +config NFSD_V4_DELEG_TIMESTAMPS + bool "Support delegated timestamps" + depends on NFSD_V4 + default n + help + NFSD implements delegated timestamps according to + draft-ietf-nfsv4-delstid-08 "Extending the Opening of Files". This + is currently an experimental feature and is therefore left disabled + by default. 
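For orientation, a minimal .config fragment consistent with the reworked options above might look like the following for an NFSv4-only server. This is an illustrative sketch, not part of the diff: it assumes PROC_FS and, for the security-label option, CONFIG_SECURITY are already enabled, and it relies on the select statements shown in the hunk to pull in SUNRPC, LOCKD, EXPORTFS, RPCSEC_GSS_KRB5 and the other dependencies automatically.

# Illustrative .config fragment: NFSv4-only server, with the deprecated
# NFSv2 support and legacy client-tracking methods left disabled.
CONFIG_NFSD=m
CONFIG_NFSD_V4=y
CONFIG_NFSD_V3_ACL=y
CONFIG_NFSD_V4_SECURITY_LABEL=y
# CONFIG_NFSD_V2 is not set
# CONFIG_NFSD_V4_2_INTER_SSC is not set
# CONFIG_NFSD_LEGACY_CLIENT_TRACKING is not set
# CONFIG_NFSD_V4_DELEG_TIMESTAMPS is not set

With CONFIG_NFSD_V2 left unset, the dependent CONFIG_NFSD_V2_ACL prompt does not appear at all, matching the new dependency chain introduced in this Kconfig hunk.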
diff --git a/fs/nfsd/Makefile b/fs/nfsd/Makefile index 2bfb58eefad1..55744bb786c9 100644 --- a/fs/nfsd/Makefile +++ b/fs/nfsd/Makefile @@ -10,15 +10,32 @@ obj-$(CONFIG_NFSD) += nfsd.o # this one should be compiled first, as the tracing macros can easily blow up nfsd-y += trace.o -nfsd-y += nfssvc.o nfsctl.o nfsproc.o nfsfh.o vfs.o \ - export.o auth.o lockd.o nfscache.o nfsxdr.o stats.o -nfsd-$(CONFIG_NFSD_FAULT_INJECTION) += fault_inject.o +nfsd-y += nfssvc.o nfsctl.o nfsfh.o vfs.o \ + export.o auth.o lockd.o nfscache.o \ + stats.o filecache.o nfs3proc.o nfs3xdr.o \ + netlink.o +nfsd-$(CONFIG_NFSD_V2) += nfsproc.o nfsxdr.o nfsd-$(CONFIG_NFSD_V2_ACL) += nfs2acl.o -nfsd-$(CONFIG_NFSD_V3) += nfs3proc.o nfs3xdr.o nfsd-$(CONFIG_NFSD_V3_ACL) += nfs3acl.o nfsd-$(CONFIG_NFSD_V4) += nfs4proc.o nfs4xdr.o nfs4state.o nfs4idmap.o \ - nfs4acl.o nfs4callback.o nfs4recover.o + nfs4acl.o nfs4callback.o nfs4recover.o nfs4xdr_gen.o nfsd-$(CONFIG_NFSD_PNFS) += nfs4layouts.o nfsd-$(CONFIG_NFSD_BLOCKLAYOUT) += blocklayout.o blocklayoutxdr.o nfsd-$(CONFIG_NFSD_SCSILAYOUT) += blocklayout.o blocklayoutxdr.o nfsd-$(CONFIG_NFSD_FLEXFILELAYOUT) += flexfilelayout.o flexfilelayoutxdr.o +nfsd-$(CONFIG_NFS_LOCALIO) += localio.o +nfsd-$(CONFIG_DEBUG_FS) += debugfs.o + + +.PHONY: xdrgen + +xdrgen: ../../include/linux/sunrpc/xdrgen/nfs4_1.h nfs4xdr_gen.h nfs4xdr_gen.c + +../../include/linux/sunrpc/xdrgen/nfs4_1.h: ../../Documentation/sunrpc/xdr/nfs4_1.x + ../../tools/net/sunrpc/xdrgen/xdrgen definitions $< > $@ + +nfs4xdr_gen.h: ../../Documentation/sunrpc/xdr/nfs4_1.x + ../../tools/net/sunrpc/xdrgen/xdrgen declarations $< > $@ + +nfs4xdr_gen.c: ../../Documentation/sunrpc/xdr/nfs4_1.x + ../../tools/net/sunrpc/xdrgen/xdrgen source $< > $@ diff --git a/fs/nfsd/acl.h b/fs/nfsd/acl.h index 4cd7c69a6cb9..4b7324458a94 100644 --- a/fs/nfsd/acl.h +++ b/fs/nfsd/acl.h @@ -38,14 +38,8 @@ struct nfs4_acl; struct svc_fh; struct svc_rqst; - -/* - * Maximum ACL we'll accept from a client; chosen (somewhat - * arbitrarily) so that kmalloc'ing the ACL shouldn't require a - * high-order allocation. 
This allows 204 ACEs on x86_64: - */ -#define NFS4_ACL_MAX ((PAGE_SIZE - sizeof(struct nfs4_acl)) \ - / sizeof(struct nfs4_ace)) +struct nfsd_attrs; +enum nfs_ftype4; int nfs4_acl_bytes(int entries); int nfs4_acl_get_whotype(char *, u32); @@ -53,7 +47,7 @@ __be32 nfs4_acl_write_who(struct xdr_stream *xdr, int who); int nfsd4_get_nfs4_acl(struct svc_rqst *rqstp, struct dentry *dentry, struct nfs4_acl **acl); -__be32 nfsd4_set_nfs4_acl(struct svc_rqst *rqstp, struct svc_fh *fhp, - struct nfs4_acl *acl); +__be32 nfsd4_acl_to_attr(enum nfs_ftype4 type, struct nfs4_acl *acl, + struct nfsd_attrs *attr); #endif /* LINUX_NFS4_ACL_H */ diff --git a/fs/nfsd/auth.c b/fs/nfsd/auth.c index fdf2aad73470..4dc327e02456 100644 --- a/fs/nfsd/auth.c +++ b/fs/nfsd/auth.c @@ -5,39 +5,37 @@ #include "nfsd.h" #include "auth.h" -int nfsexp_flags(struct svc_rqst *rqstp, struct svc_export *exp) +int nfsexp_flags(struct svc_cred *cred, struct svc_export *exp) { struct exp_flavor_info *f; struct exp_flavor_info *end = exp->ex_flavors + exp->ex_nflavors; for (f = exp->ex_flavors; f < end; f++) { - if (f->pseudoflavor == rqstp->rq_cred.cr_flavor) + if (f->pseudoflavor == cred->cr_flavor) return f->flags; } return exp->ex_flags; } -int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp) +int nfsd_setuser(struct svc_cred *cred, struct svc_export *exp) { struct group_info *rqgi; struct group_info *gi; struct cred *new; int i; - int flags = nfsexp_flags(rqstp, exp); - - validate_process_creds(); + int flags = nfsexp_flags(cred, exp); /* discard any old override before preparing the new set */ - revert_creds(get_cred(current_real_cred())); + put_cred(revert_creds(get_cred(current_real_cred()))); new = prepare_creds(); if (!new) return -ENOMEM; - new->fsuid = rqstp->rq_cred.cr_uid; - new->fsgid = rqstp->rq_cred.cr_gid; + new->fsuid = cred->cr_uid; + new->fsgid = cred->cr_gid; - rqgi = rqstp->rq_cred.cr_group_info; + rqgi = cred->cr_group_info; if (flags & NFSEXP_ALLSQUASH) { new->fsuid = exp->ex_anon_uid; @@ -81,10 +79,7 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp) else new->cap_effective = cap_raise_nfsd_set(new->cap_effective, new->cap_permitted); - validate_process_creds(); put_cred(override_creds(new)); - put_cred(new); - validate_process_creds(); return 0; oom: diff --git a/fs/nfsd/auth.h b/fs/nfsd/auth.h index dbd66424f600..8c5031bbbcee 100644 --- a/fs/nfsd/auth.h +++ b/fs/nfsd/auth.h @@ -12,6 +12,6 @@ * Set the current process's fsuid/fsgid etc to those of the NFS * client user */ -int nfsd_setuser(struct svc_rqst *, struct svc_export *); +int nfsd_setuser(struct svc_cred *cred, struct svc_export *exp); #endif /* LINUX_NFSD_AUTH_H */ diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c index 4fb1f72a25fb..afa16d7a8013 100644 --- a/fs/nfsd/blocklayout.c +++ b/fs/nfsd/blocklayout.c @@ -4,136 +4,194 @@ */ #include <linux/exportfs.h> #include <linux/iomap.h> -#include <linux/genhd.h> #include <linux/slab.h> #include <linux/pr.h> #include <linux/nfsd/debug.h> -#include <scsi/scsi_proto.h> -#include <scsi/scsi_common.h> -#include <scsi/scsi_request.h> #include "blocklayoutxdr.h" #include "pnfs.h" +#include "filecache.h" +#include "vfs.h" +#include "trace.h" #define NFSDDBG_FACILITY NFSDDBG_PNFS +/* + * Get an extent from the file system that starts at offset or below + * and may be shorter than the requested length. 
+ */ static __be32 -nfsd4_block_proc_layoutget(struct inode *inode, const struct svc_fh *fhp, - struct nfsd4_layoutget *args) +nfsd4_block_map_extent(struct inode *inode, const struct svc_fh *fhp, + u64 offset, u64 length, u32 iomode, u64 minlength, + struct pnfs_block_extent *bex) { - struct nfsd4_layout_seg *seg = &args->lg_seg; struct super_block *sb = inode->i_sb; - u32 block_size = i_blocksize(inode); - struct pnfs_block_extent *bex; struct iomap iomap; u32 device_generation = 0; int error; - if (seg->offset & (block_size - 1)) { - dprintk("pnfsd: I/O misaligned\n"); - goto out_layoutunavailable; - } - - /* - * Some clients barf on non-zero block numbers for NONE or INVALID - * layouts, so make sure to zero the whole structure. - */ - error = -ENOMEM; - bex = kzalloc(sizeof(*bex), GFP_KERNEL); - if (!bex) - goto out_error; - args->lg_content = bex; - - error = sb->s_export_op->map_blocks(inode, seg->offset, seg->length, - &iomap, seg->iomode != IOMODE_READ, - &device_generation); + error = sb->s_export_op->map_blocks(inode, offset, length, &iomap, + iomode != IOMODE_READ, &device_generation); if (error) { if (error == -ENXIO) - goto out_layoutunavailable; - goto out_error; - } - - if (iomap.length < args->lg_minlength) { - dprintk("pnfsd: extent smaller than minlength\n"); - goto out_layoutunavailable; + return nfserr_layoutunavailable; + return nfserrno(error); } switch (iomap.type) { case IOMAP_MAPPED: - if (seg->iomode == IOMODE_READ) + if (iomode == IOMODE_READ) bex->es = PNFS_BLOCK_READ_DATA; else bex->es = PNFS_BLOCK_READWRITE_DATA; bex->soff = iomap.addr; break; case IOMAP_UNWRITTEN: - if (seg->iomode & IOMODE_RW) { + if (iomode & IOMODE_RW) { /* * Crack monkey special case from section 2.3.1. */ - if (args->lg_minlength == 0) { + if (minlength == 0) { dprintk("pnfsd: no soup for you!\n"); - goto out_layoutunavailable; + return nfserr_layoutunavailable; } bex->es = PNFS_BLOCK_INVALID_DATA; bex->soff = iomap.addr; break; } - /*FALLTHRU*/ + fallthrough; case IOMAP_HOLE: - if (seg->iomode == IOMODE_READ) { + if (iomode == IOMODE_READ) { bex->es = PNFS_BLOCK_NONE_DATA; break; } - /*FALLTHRU*/ + fallthrough; case IOMAP_DELALLOC: default: WARN(1, "pnfsd: filesystem returned %d extent\n", iomap.type); - goto out_layoutunavailable; + return nfserr_layoutunavailable; } error = nfsd4_set_deviceid(&bex->vol_id, fhp, device_generation); if (error) - goto out_error; + return nfserrno(error); + bex->foff = iomap.offset; bex->len = iomap.length; + return nfs_ok; +} - seg->offset = iomap.offset; - seg->length = iomap.length; +static __be32 +nfsd4_block_proc_layoutget(struct svc_rqst *rqstp, struct inode *inode, + const struct svc_fh *fhp, struct nfsd4_layoutget *args) +{ + struct nfsd4_layout_seg *seg = &args->lg_seg; + struct pnfs_block_layout *bl; + struct pnfs_block_extent *first_bex, *last_bex; + u64 offset = seg->offset, length = seg->length; + u32 i, nr_extents_max, block_size = i_blocksize(inode); + __be32 nfserr; - dprintk("GET: 0x%llx:0x%llx %d\n", bex->foff, bex->len, bex->es); - return 0; + if (locks_in_grace(SVC_NET(rqstp))) + return nfserr_grace; + + nfserr = nfserr_layoutunavailable; + if (seg->offset & (block_size - 1)) { + dprintk("pnfsd: I/O misaligned\n"); + goto out_error; + } + + /* + * RFC 8881, section 3.3.17: + * The layout4 data type defines a layout for a file. + * + * RFC 8881, section 18.43.3: + * The loga_maxcount field specifies the maximum layout size + * (in bytes) that the client can handle. 
If the size of the + * layout structure exceeds the size specified by maxcount, + * the metadata server will return the NFS4ERR_TOOSMALL error. + */ + nfserr = nfserr_toosmall; + if (args->lg_maxcount < PNFS_BLOCK_LAYOUT4_SIZE + + PNFS_BLOCK_EXTENT_SIZE) + goto out_error; + + /* + * Limit the maximum layout size to avoid allocating + * a large buffer on the server for each layout request. + */ + nr_extents_max = (min(args->lg_maxcount, PAGE_SIZE) - + PNFS_BLOCK_LAYOUT4_SIZE) / PNFS_BLOCK_EXTENT_SIZE; + + /* + * Some clients barf on non-zero block numbers for NONE or INVALID + * layouts, so make sure to zero the whole structure. + */ + nfserr = nfserrno(-ENOMEM); + bl = kzalloc(struct_size(bl, extents, nr_extents_max), GFP_KERNEL); + if (!bl) + goto out_error; + bl->nr_extents = nr_extents_max; + args->lg_content = bl; + + for (i = 0; i < bl->nr_extents; i++) { + struct pnfs_block_extent *bex = bl->extents + i; + u64 bex_length; + + nfserr = nfsd4_block_map_extent(inode, fhp, offset, length, + seg->iomode, args->lg_minlength, bex); + if (nfserr != nfs_ok) + goto out_error; + + bex_length = bex->len - (offset - bex->foff); + if (bex_length >= length) { + bl->nr_extents = i + 1; + break; + } + + offset = bex->foff + bex->len; + length -= bex_length; + } + + first_bex = bl->extents; + last_bex = bl->extents + bl->nr_extents - 1; + + nfserr = nfserr_layoutunavailable; + length = last_bex->foff + last_bex->len - seg->offset; + if (length < args->lg_minlength) { + dprintk("pnfsd: extent smaller than minlength\n"); + goto out_error; + } + + seg->offset = first_bex->foff; + seg->length = last_bex->foff - first_bex->foff + last_bex->len; + return nfs_ok; out_error: seg->length = 0; - return nfserrno(error); -out_layoutunavailable: - seg->length = 0; - return nfserr_layoutunavailable; + return nfserr; } static __be32 nfsd4_block_commit_blocks(struct inode *inode, struct nfsd4_layoutcommit *lcp, struct iomap *iomaps, int nr_iomaps) { - loff_t new_size = lcp->lc_last_wr + 1; + struct timespec64 mtime = inode_get_mtime(inode); struct iattr iattr = { .ia_valid = 0 }; - struct timespec ts; int error; - ts = timespec64_to_timespec(inode->i_mtime); if (lcp->lc_mtime.tv_nsec == UTIME_NOW || - timespec_compare(&lcp->lc_mtime, &ts) < 0) - lcp->lc_mtime = timespec64_to_timespec(current_time(inode)); + timespec64_compare(&lcp->lc_mtime, &mtime) < 0) + lcp->lc_mtime = current_time(inode); iattr.ia_valid |= ATTR_ATIME | ATTR_CTIME | ATTR_MTIME; - iattr.ia_atime = iattr.ia_ctime = iattr.ia_mtime = timespec_to_timespec64(lcp->lc_mtime); + iattr.ia_atime = iattr.ia_ctime = iattr.ia_mtime = lcp->lc_mtime; - if (new_size > i_size_read(inode)) { + if (lcp->lc_size_chg) { iattr.ia_valid |= ATTR_SIZE; - iattr.ia_size = new_size; + iattr.ia_size = lcp->lc_newsize; } error = inode->i_sb->s_export_op->commit_blocks(inode, iomaps, @@ -150,8 +208,7 @@ nfsd4_block_get_device_info_simple(struct super_block *sb, struct pnfs_block_deviceaddr *dev; struct pnfs_block_volume *b; - dev = kzalloc(sizeof(struct pnfs_block_deviceaddr) + - sizeof(struct pnfs_block_volume), GFP_KERNEL); + dev = kzalloc(struct_size(dev, volumes, 1), GFP_KERNEL); if (!dev) return -ENOMEM; gdp->gd_device = dev; @@ -171,22 +228,26 @@ nfsd4_block_proc_getdeviceinfo(struct super_block *sb, struct nfs4_client *clp, struct nfsd4_getdeviceinfo *gdp) { - if (sb->s_bdev != sb->s_bdev->bd_contains) + if (bdev_is_partition(sb->s_bdev)) return nfserr_inval; return nfserrno(nfsd4_block_get_device_info_simple(sb, gdp)); } static __be32 
-nfsd4_block_proc_layoutcommit(struct inode *inode, +nfsd4_block_proc_layoutcommit(struct inode *inode, struct svc_rqst *rqstp, struct nfsd4_layoutcommit *lcp) { struct iomap *iomaps; int nr_iomaps; + __be32 nfserr; - nr_iomaps = nfsd4_block_decode_layoutupdate(lcp->lc_up_layout, - lcp->lc_up_len, &iomaps, i_blocksize(inode)); - if (nr_iomaps < 0) - return nfserrno(nr_iomaps); + rqstp->rq_arg = lcp->lc_up_layout; + svcxdr_init_decode(rqstp); + + nfserr = nfsd4_block_decode_layoutupdate(&rqstp->rq_arg_stream, + &iomaps, &nr_iomaps, i_blocksize(inode)); + if (nfserr != nfs_ok) + return nfserr; return nfsd4_block_commit_blocks(inode, lcp, iomaps, nr_iomaps); } @@ -212,109 +273,6 @@ const struct nfsd4_layout_ops bl_layout_ops = { #endif /* CONFIG_NFSD_BLOCKLAYOUT */ #ifdef CONFIG_NFSD_SCSILAYOUT -static int nfsd4_scsi_identify_device(struct block_device *bdev, - struct pnfs_block_volume *b) -{ - struct request_queue *q = bdev->bd_disk->queue; - struct request *rq; - struct scsi_request *req; - /* - * The allocation length (passed in bytes 3 and 4 of the INQUIRY - * command descriptor block) specifies the number of bytes that have - * been allocated for the data-in buffer. - * 252 is the highest one-byte value that is a multiple of 4. - * 65532 is the highest two-byte value that is a multiple of 4. - */ - size_t bufflen = 252, maxlen = 65532, len, id_len; - u8 *buf, *d, type, assoc; - int retries = 1, error; - - if (WARN_ON_ONCE(!blk_queue_scsi_passthrough(q))) - return -EINVAL; - -again: - buf = kzalloc(bufflen, GFP_KERNEL); - if (!buf) - return -ENOMEM; - - rq = blk_get_request(q, REQ_OP_SCSI_IN, 0); - if (IS_ERR(rq)) { - error = -ENOMEM; - goto out_free_buf; - } - req = scsi_req(rq); - - error = blk_rq_map_kern(q, rq, buf, bufflen, GFP_KERNEL); - if (error) - goto out_put_request; - - req->cmd[0] = INQUIRY; - req->cmd[1] = 1; - req->cmd[2] = 0x83; - req->cmd[3] = bufflen >> 8; - req->cmd[4] = bufflen & 0xff; - req->cmd_len = COMMAND_SIZE(INQUIRY); - - blk_execute_rq(rq->q, NULL, rq, 1); - if (req->result) { - pr_err("pNFS: INQUIRY 0x83 failed with: %x\n", - req->result); - error = -EIO; - goto out_put_request; - } - - len = (buf[2] << 8) + buf[3] + 4; - if (len > bufflen) { - if (len <= maxlen && retries--) { - blk_put_request(rq); - kfree(buf); - bufflen = len; - goto again; - } - pr_err("pNFS: INQUIRY 0x83 response invalid (len = %zd)\n", - len); - goto out_put_request; - } - - d = buf + 4; - for (d = buf + 4; d < buf + len; d += id_len + 4) { - id_len = d[3]; - type = d[1] & 0xf; - assoc = (d[1] >> 4) & 0x3; - - /* - * We only care about a EUI-64 and NAA designator types - * with LU association. - */ - if (assoc != 0x00) - continue; - if (type != 0x02 && type != 0x03) - continue; - if (id_len != 8 && id_len != 12 && id_len != 16) - continue; - - b->scsi.code_set = PS_CODE_SET_BINARY; - b->scsi.designator_type = type == 0x02 ? - PS_DESIGNATOR_EUI64 : PS_DESIGNATOR_NAA; - b->scsi.designator_len = id_len; - memcpy(b->scsi.designator, d + 4, id_len); - - /* - * If we found a 8 or 12 byte descriptor continue on to - * see if a 16 byte one is available. If we find a - * 16 byte descriptor we're done. 
- */ - if (id_len == 16) - break; - } - -out_put_request: - blk_put_request(rq); -out_free_buf: - kfree(buf); - return error; -} - #define NFSD_MDS_PR_KEY 0x0100000000000000ULL /* @@ -326,6 +284,31 @@ static u64 nfsd4_scsi_pr_key(struct nfs4_client *clp) return ((u64)clp->cl_clientid.cl_boot << 32) | clp->cl_clientid.cl_id; } +static const u8 designator_types[] = { + PS_DESIGNATOR_EUI64, + PS_DESIGNATOR_NAA, +}; + +static int +nfsd4_block_get_unique_id(struct gendisk *disk, struct pnfs_block_volume *b) +{ + int ret, i; + + for (i = 0; i < ARRAY_SIZE(designator_types); i++) { + u8 type = designator_types[i]; + + ret = disk->fops->get_unique_id(disk, b->scsi.designator, type); + if (ret > 0) { + b->scsi.code_set = PS_CODE_SET_BINARY; + b->scsi.designator_type = type; + b->scsi.designator_len = ret; + return 0; + } + } + + return -EINVAL; +} + static int nfsd4_block_get_device_info_scsi(struct super_block *sb, struct nfs4_client *clp, @@ -334,10 +317,9 @@ nfsd4_block_get_device_info_scsi(struct super_block *sb, struct pnfs_block_deviceaddr *dev; struct pnfs_block_volume *b; const struct pr_ops *ops; - int error; + int ret; - dev = kzalloc(sizeof(struct pnfs_block_deviceaddr) + - sizeof(struct pnfs_block_volume), GFP_KERNEL); + dev = kzalloc(struct_size(dev, volumes, 1), GFP_KERNEL); if (!dev) return -ENOMEM; gdp->gd_device = dev; @@ -348,33 +330,39 @@ nfsd4_block_get_device_info_scsi(struct super_block *sb, b->type = PNFS_BLOCK_VOLUME_SCSI; b->scsi.pr_key = nfsd4_scsi_pr_key(clp); - error = nfsd4_scsi_identify_device(sb->s_bdev, b); - if (error) - return error; + ret = nfsd4_block_get_unique_id(sb->s_bdev->bd_disk, b); + if (ret < 0) + goto out_free_dev; + ret = -EINVAL; ops = sb->s_bdev->bd_disk->fops->pr_ops; if (!ops) { pr_err("pNFS: device %s does not support PRs.\n", sb->s_id); - return -EINVAL; + goto out_free_dev; } - error = ops->pr_register(sb->s_bdev, 0, NFSD_MDS_PR_KEY, true); - if (error) { + ret = ops->pr_register(sb->s_bdev, 0, NFSD_MDS_PR_KEY, true); + if (ret) { pr_err("pNFS: failed to register key for device %s.\n", sb->s_id); - return -EINVAL; + goto out_free_dev; } - error = ops->pr_reserve(sb->s_bdev, NFSD_MDS_PR_KEY, + ret = ops->pr_reserve(sb->s_bdev, NFSD_MDS_PR_KEY, PR_EXCLUSIVE_ACCESS_REG_ONLY, 0); - if (error) { + if (ret) { pr_err("pNFS: failed to reserve device %s.\n", sb->s_id); - return -EINVAL; + goto out_free_dev; } return 0; + +out_free_dev: + kfree(dev); + gdp->gd_device = NULL; + return ret; } static __be32 @@ -383,33 +371,40 @@ nfsd4_scsi_proc_getdeviceinfo(struct super_block *sb, struct nfs4_client *clp, struct nfsd4_getdeviceinfo *gdp) { - if (sb->s_bdev != sb->s_bdev->bd_contains) + if (bdev_is_partition(sb->s_bdev)) return nfserr_inval; return nfserrno(nfsd4_block_get_device_info_scsi(sb, clp, gdp)); } static __be32 -nfsd4_scsi_proc_layoutcommit(struct inode *inode, +nfsd4_scsi_proc_layoutcommit(struct inode *inode, struct svc_rqst *rqstp, struct nfsd4_layoutcommit *lcp) { struct iomap *iomaps; int nr_iomaps; + __be32 nfserr; + + rqstp->rq_arg = lcp->lc_up_layout; + svcxdr_init_decode(rqstp); - nr_iomaps = nfsd4_scsi_decode_layoutupdate(lcp->lc_up_layout, - lcp->lc_up_len, &iomaps, i_blocksize(inode)); - if (nr_iomaps < 0) - return nfserrno(nr_iomaps); + nfserr = nfsd4_scsi_decode_layoutupdate(&rqstp->rq_arg_stream, + &iomaps, &nr_iomaps, i_blocksize(inode)); + if (nfserr != nfs_ok) + return nfserr; return nfsd4_block_commit_blocks(inode, lcp, iomaps, nr_iomaps); } static void -nfsd4_scsi_fence_client(struct nfs4_layout_stateid *ls) 
+nfsd4_scsi_fence_client(struct nfs4_layout_stateid *ls, struct nfsd_file *file) { struct nfs4_client *clp = ls->ls_stid.sc_client; - struct block_device *bdev = ls->ls_file->f_path.mnt->mnt_sb->s_bdev; + struct block_device *bdev = file->nf_file->f_path.mnt->mnt_sb->s_bdev; + int status; - bdev->bd_disk->fops->pr_ops->pr_preempt(bdev, NFSD_MDS_PR_KEY, - nfsd4_scsi_pr_key(clp), 0, true); + status = bdev->bd_disk->fops->pr_ops->pr_preempt(bdev, NFSD_MDS_PR_KEY, + nfsd4_scsi_pr_key(clp), + PR_EXCLUSIVE_ACCESS_REG_ONLY, true); + trace_nfsd_pnfs_fence(clp, bdev->bd_disk->disk_name, status); } const struct nfsd4_layout_ops scsi_layout_ops = { diff --git a/fs/nfsd/blocklayoutxdr.c b/fs/nfsd/blocklayoutxdr.c index 442543304930..196ef4245604 100644 --- a/fs/nfsd/blocklayoutxdr.c +++ b/fs/nfsd/blocklayoutxdr.c @@ -9,16 +9,30 @@ #include "nfsd.h" #include "blocklayoutxdr.h" +#include "vfs.h" #define NFSDDBG_FACILITY NFSDDBG_PNFS +/** + * nfsd4_block_encode_layoutget - encode block/scsi layout extent array + * @xdr: stream for data encoding + * @lgp: layoutget content, actually an array of extents to encode + * + * Encode the opaque loc_body field in the layoutget response. Since the + * pnfs_block_layout4 and pnfs_scsi_layout4 structures on the wire are + * the same, this function is used by both layout drivers. + * + * Return values: + * %nfs_ok: Success, all extents encoded into @xdr + * %nfserr_toosmall: Not enough space in @xdr to encode all the data + */ __be32 nfsd4_block_encode_layoutget(struct xdr_stream *xdr, - struct nfsd4_layoutget *lgp) + const struct nfsd4_layoutget *lgp) { - struct pnfs_block_extent *b = lgp->lg_content; - int len = sizeof(__be32) + 5 * sizeof(__be64) + sizeof(__be32); + const struct pnfs_block_layout *bl = lgp->lg_content; + u32 i, len = sizeof(__be32) + bl->nr_extents * PNFS_BLOCK_EXTENT_SIZE; __be32 *p; p = xdr_reserve_space(xdr, sizeof(__be32) + len); @@ -26,15 +40,19 @@ nfsd4_block_encode_layoutget(struct xdr_stream *xdr, return nfserr_toosmall; *p++ = cpu_to_be32(len); - *p++ = cpu_to_be32(1); /* we always return a single extent */ - - p = xdr_encode_opaque_fixed(p, &b->vol_id, - sizeof(struct nfsd4_deviceid)); - p = xdr_encode_hyper(p, b->foff); - p = xdr_encode_hyper(p, b->len); - p = xdr_encode_hyper(p, b->soff); - *p++ = cpu_to_be32(b->es); - return 0; + *p++ = cpu_to_be32(bl->nr_extents); + + for (i = 0; i < bl->nr_extents; i++) { + const struct pnfs_block_extent *bex = bl->extents + i; + + p = svcxdr_encode_deviceid4(p, &bex->vol_id); + p = xdr_encode_hyper(p, bex->foff); + p = xdr_encode_hyper(p, bex->len); + p = xdr_encode_hyper(p, bex->soff); + *p++ = cpu_to_be32(bex->es); + } + + return nfs_ok; } static int @@ -76,12 +94,21 @@ nfsd4_block_encode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b) __be32 nfsd4_block_encode_getdeviceinfo(struct xdr_stream *xdr, - struct nfsd4_getdeviceinfo *gdp) + const struct nfsd4_getdeviceinfo *gdp) { struct pnfs_block_deviceaddr *dev = gdp->gd_device; int len = sizeof(__be32), ret, i; __be32 *p; + /* + * See paragraph 5 of RFC 8881 S18.40.3. 
+ */ + if (!gdp->gd_maxcount) { + if (xdr_stream_encode_u32(xdr, 0) != XDR_UNIT) + return nfserr_resource; + return nfs_ok; + } + p = xdr_reserve_space(xdr, len + sizeof(__be32)); if (!p) return nfserr_resource; @@ -102,64 +129,86 @@ nfsd4_block_encode_getdeviceinfo(struct xdr_stream *xdr, return 0; } -int -nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp, - u32 block_size) +/** + * nfsd4_block_decode_layoutupdate - decode the block layout extent array + * @xdr: subbuf set to the encoded array + * @iomapp: pointer to store the decoded extent array + * @nr_iomapsp: pointer to store the number of extents + * @block_size: alignment of extent offset and length + * + * This function decodes the opaque field of the layoutupdate4 structure + * in a layoutcommit request for the block layout driver. The field is + * actually an array of extents sent by the client. It also checks that + * the file offset, storage offset and length of each extent are aligned + * by @block_size. + * + * Return values: + * %nfs_ok: Successful decoding, @iomapp and @nr_iomapsp are valid + * %nfserr_bad_xdr: The encoded array in @xdr is invalid + * %nfserr_inval: An unaligned extent found + * %nfserr_delay: Failed to allocate memory for @iomapp + */ +__be32 +nfsd4_block_decode_layoutupdate(struct xdr_stream *xdr, struct iomap **iomapp, + int *nr_iomapsp, u32 block_size) { struct iomap *iomaps; - u32 nr_iomaps, i; + u32 nr_iomaps, expected, len, i; + __be32 nfserr; - if (len < sizeof(u32)) { - dprintk("%s: extent array too small: %u\n", __func__, len); - return -EINVAL; - } - len -= sizeof(u32); - if (len % PNFS_BLOCK_EXTENT_SIZE) { - dprintk("%s: extent array invalid: %u\n", __func__, len); - return -EINVAL; - } + if (xdr_stream_decode_u32(xdr, &nr_iomaps)) + return nfserr_bad_xdr; - nr_iomaps = be32_to_cpup(p++); - if (nr_iomaps != len / PNFS_BLOCK_EXTENT_SIZE) { - dprintk("%s: extent array size mismatch: %u/%u\n", - __func__, len, nr_iomaps); - return -EINVAL; - } + len = sizeof(__be32) + xdr_stream_remaining(xdr); + expected = sizeof(__be32) + nr_iomaps * PNFS_BLOCK_EXTENT_SIZE; + if (len != expected) + return nfserr_bad_xdr; iomaps = kcalloc(nr_iomaps, sizeof(*iomaps), GFP_KERNEL); - if (!iomaps) { - dprintk("%s: failed to allocate extent array\n", __func__); - return -ENOMEM; - } + if (!iomaps) + return nfserr_delay; for (i = 0; i < nr_iomaps; i++) { struct pnfs_block_extent bex; - memcpy(&bex.vol_id, p, sizeof(struct nfsd4_deviceid)); - p += XDR_QUADLEN(sizeof(struct nfsd4_deviceid)); + if (nfsd4_decode_deviceid4(xdr, &bex.vol_id)) { + nfserr = nfserr_bad_xdr; + goto fail; + } - p = xdr_decode_hyper(p, &bex.foff); + if (xdr_stream_decode_u64(xdr, &bex.foff)) { + nfserr = nfserr_bad_xdr; + goto fail; + } if (bex.foff & (block_size - 1)) { - dprintk("%s: unaligned offset 0x%llx\n", - __func__, bex.foff); + nfserr = nfserr_inval; + goto fail; + } + + if (xdr_stream_decode_u64(xdr, &bex.len)) { + nfserr = nfserr_bad_xdr; goto fail; } - p = xdr_decode_hyper(p, &bex.len); if (bex.len & (block_size - 1)) { - dprintk("%s: unaligned length 0x%llx\n", - __func__, bex.foff); + nfserr = nfserr_inval; + goto fail; + } + + if (xdr_stream_decode_u64(xdr, &bex.soff)) { + nfserr = nfserr_bad_xdr; goto fail; } - p = xdr_decode_hyper(p, &bex.soff); if (bex.soff & (block_size - 1)) { - dprintk("%s: unaligned disk offset 0x%llx\n", - __func__, bex.soff); + nfserr = nfserr_inval; + goto fail; + } + + if (xdr_stream_decode_u32(xdr, &bex.es)) { + nfserr = nfserr_bad_xdr; goto fail; } - bex.es = be32_to_cpup(p++); 
if (bex.es != PNFS_BLOCK_READWRITE_DATA) { - dprintk("%s: incorrect extent state %d\n", - __func__, bex.es); + nfserr = nfserr_inval; goto fail; } @@ -168,59 +217,79 @@ nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp, } *iomapp = iomaps; - return nr_iomaps; + *nr_iomapsp = nr_iomaps; + return nfs_ok; fail: kfree(iomaps); - return -EINVAL; + return nfserr; } -int -nfsd4_scsi_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp, - u32 block_size) +/** + * nfsd4_scsi_decode_layoutupdate - decode the scsi layout extent array + * @xdr: subbuf set to the encoded array + * @iomapp: pointer to store the decoded extent array + * @nr_iomapsp: pointer to store the number of extents + * @block_size: alignment of extent offset and length + * + * This function decodes the opaque field of the layoutupdate4 structure + * in a layoutcommit request for the scsi layout driver. The field is + * actually an array of extents sent by the client. It also checks that + * the offset and length of each extent are aligned by @block_size. + * + * Return values: + * %nfs_ok: Successful decoding, @iomapp and @nr_iomapsp are valid + * %nfserr_bad_xdr: The encoded array in @xdr is invalid + * %nfserr_inval: An unaligned extent found + * %nfserr_delay: Failed to allocate memory for @iomapp + */ +__be32 +nfsd4_scsi_decode_layoutupdate(struct xdr_stream *xdr, struct iomap **iomapp, + int *nr_iomapsp, u32 block_size) { struct iomap *iomaps; - u32 nr_iomaps, expected, i; + u32 nr_iomaps, expected, len, i; + __be32 nfserr; - if (len < sizeof(u32)) { - dprintk("%s: extent array too small: %u\n", __func__, len); - return -EINVAL; - } + if (xdr_stream_decode_u32(xdr, &nr_iomaps)) + return nfserr_bad_xdr; - nr_iomaps = be32_to_cpup(p++); + len = sizeof(__be32) + xdr_stream_remaining(xdr); expected = sizeof(__be32) + nr_iomaps * PNFS_SCSI_RANGE_SIZE; - if (len != expected) { - dprintk("%s: extent array size mismatch: %u/%u\n", - __func__, len, expected); - return -EINVAL; - } + if (len != expected) + return nfserr_bad_xdr; iomaps = kcalloc(nr_iomaps, sizeof(*iomaps), GFP_KERNEL); - if (!iomaps) { - dprintk("%s: failed to allocate extent array\n", __func__); - return -ENOMEM; - } + if (!iomaps) + return nfserr_delay; for (i = 0; i < nr_iomaps; i++) { u64 val; - p = xdr_decode_hyper(p, &val); + if (xdr_stream_decode_u64(xdr, &val)) { + nfserr = nfserr_bad_xdr; + goto fail; + } if (val & (block_size - 1)) { - dprintk("%s: unaligned offset 0x%llx\n", __func__, val); + nfserr = nfserr_inval; goto fail; } iomaps[i].offset = val; - p = xdr_decode_hyper(p, &val); + if (xdr_stream_decode_u64(xdr, &val)) { + nfserr = nfserr_bad_xdr; + goto fail; + } if (val & (block_size - 1)) { - dprintk("%s: unaligned length 0x%llx\n", __func__, val); + nfserr = nfserr_inval; goto fail; } iomaps[i].length = val; } *iomapp = iomaps; - return nr_iomaps; + *nr_iomapsp = nr_iomaps; + return nfs_ok; fail: kfree(iomaps); - return -EINVAL; + return nfserr; } diff --git a/fs/nfsd/blocklayoutxdr.h b/fs/nfsd/blocklayoutxdr.h index bc5166bfe46b..2e0c6c7d2b42 100644 --- a/fs/nfsd/blocklayoutxdr.h +++ b/fs/nfsd/blocklayoutxdr.h @@ -8,6 +8,15 @@ struct iomap; struct xdr_stream; +/* On the wire size of the layout4 struct with zero number of extents */ +#define PNFS_BLOCK_LAYOUT4_SIZE \ + (sizeof(__be32) * 2 + /* offset4 */ \ + sizeof(__be32) * 2 + /* length4 */ \ + sizeof(__be32) + /* layoutiomode4 */ \ + sizeof(__be32) + /* layouttype4 */ \ + sizeof(__be32) + /* number of bytes */ \ + sizeof(__be32)) /* number of extents */ + struct 
pnfs_block_extent { struct nfsd4_deviceid vol_id; u64 foff; @@ -21,6 +30,11 @@ struct pnfs_block_range { u64 len; }; +struct pnfs_block_layout { + u32 nr_extents; + struct pnfs_block_extent extents[] __counted_by(nr_extents); +}; + /* * Random upper cap for the uuid length to avoid unbounded allocation. * Not actually limited by the protocol. @@ -47,16 +61,16 @@ struct pnfs_block_volume { struct pnfs_block_deviceaddr { u32 nr_volumes; - struct pnfs_block_volume volumes[]; + struct pnfs_block_volume volumes[] __counted_by(nr_volumes); }; __be32 nfsd4_block_encode_getdeviceinfo(struct xdr_stream *xdr, - struct nfsd4_getdeviceinfo *gdp); + const struct nfsd4_getdeviceinfo *gdp); __be32 nfsd4_block_encode_layoutget(struct xdr_stream *xdr, - struct nfsd4_layoutget *lgp); -int nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp, - u32 block_size); -int nfsd4_scsi_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp, - u32 block_size); + const struct nfsd4_layoutget *lgp); +__be32 nfsd4_block_decode_layoutupdate(struct xdr_stream *xdr, + struct iomap **iomapp, int *nr_iomapsp, u32 block_size); +__be32 nfsd4_scsi_decode_layoutupdate(struct xdr_stream *xdr, + struct iomap **iomapp, int *nr_iomapsp, u32 block_size); #endif /* _NFSD_BLOCKLAYOUTXDR_H */ diff --git a/fs/nfsd/cache.h b/fs/nfsd/cache.h index 4a98537efb0f..bb7addef4a31 100644 --- a/fs/nfsd/cache.h +++ b/fs/nfsd/cache.h @@ -10,6 +10,7 @@ #define NFSCACHE_H #include <linux/sunrpc/svc.h> +#include "nfsd.h" /* * Representation of a reply cache entry. @@ -18,7 +19,7 @@ * typical sockaddr_storage. This is for space reasons, since sockaddr_storage * is much larger than a sockaddr_in6. */ -struct svc_cacherep { +struct nfsd_cacherep { struct { /* Keep often-read xid, csum in the same cache line: */ __be32 k_xid; @@ -77,10 +78,14 @@ enum { /* Checksum this amount of the request */ #define RC_CSUMLEN (256U) -int nfsd_reply_cache_init(void); -void nfsd_reply_cache_shutdown(void); -int nfsd_cache_lookup(struct svc_rqst *); -void nfsd_cache_update(struct svc_rqst *, int, __be32 *); -int nfsd_reply_cache_stats_open(struct inode *, struct file *); +int nfsd_drc_slab_create(void); +void nfsd_drc_slab_free(void); +int nfsd_reply_cache_init(struct nfsd_net *); +void nfsd_reply_cache_shutdown(struct nfsd_net *); +int nfsd_cache_lookup(struct svc_rqst *rqstp, unsigned int start, + unsigned int len, struct nfsd_cacherep **cacherep); +void nfsd_cache_update(struct svc_rqst *rqstp, struct nfsd_cacherep *rp, + int cachetype, __be32 *statp); +int nfsd_reply_cache_stats_show(struct seq_file *m, void *v); #endif /* NFSCACHE_H */ diff --git a/fs/nfsd/debugfs.c b/fs/nfsd/debugfs.c new file mode 100644 index 000000000000..7f44689e0a53 --- /dev/null +++ b/fs/nfsd/debugfs.c @@ -0,0 +1,143 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/debugfs.h> + +#include "nfsd.h" + +static struct dentry *nfsd_top_dir __read_mostly; + +/* + * /sys/kernel/debug/nfsd/disable-splice-read + * + * Contents: + * %0: NFS READ is allowed to use page splicing + * %1: NFS READ uses only iov iter read + * + * The default value of this setting is zero (page splicing is + * allowed). This setting takes immediate effect for all NFS + * versions, all exports, and in all NFSD net namespaces. + */ + +static int nfsd_dsr_get(void *data, u64 *val) +{ + *val = nfsd_disable_splice_read ? 
1 : 0; + return 0; +} + +static int nfsd_dsr_set(void *data, u64 val) +{ + nfsd_disable_splice_read = (val > 0); + if (!nfsd_disable_splice_read) { + /* + * Must use buffered I/O if splice_read is enabled. + */ + nfsd_io_cache_read = NFSD_IO_BUFFERED; + } + return 0; +} + +DEFINE_DEBUGFS_ATTRIBUTE(nfsd_dsr_fops, nfsd_dsr_get, nfsd_dsr_set, "%llu\n"); + +/* + * /sys/kernel/debug/nfsd/io_cache_read + * + * Contents: + * %0: NFS READ will use buffered IO + * %1: NFS READ will use dontcache (buffered IO w/ dropbehind) + * %2: NFS READ will use direct IO + * + * This setting takes immediate effect for all NFS versions, + * all exports, and in all NFSD net namespaces. + */ + +static int nfsd_io_cache_read_get(void *data, u64 *val) +{ + *val = nfsd_io_cache_read; + return 0; +} + +static int nfsd_io_cache_read_set(void *data, u64 val) +{ + int ret = 0; + + switch (val) { + case NFSD_IO_BUFFERED: + nfsd_io_cache_read = NFSD_IO_BUFFERED; + break; + case NFSD_IO_DONTCACHE: + case NFSD_IO_DIRECT: + /* + * Must disable splice_read when enabling + * NFSD_IO_DONTCACHE. + */ + nfsd_disable_splice_read = true; + nfsd_io_cache_read = val; + break; + default: + ret = -EINVAL; + break; + } + + return ret; +} + +DEFINE_DEBUGFS_ATTRIBUTE(nfsd_io_cache_read_fops, nfsd_io_cache_read_get, + nfsd_io_cache_read_set, "%llu\n"); + +/* + * /sys/kernel/debug/nfsd/io_cache_write + * + * Contents: + * %0: NFS WRITE will use buffered IO + * %1: NFS WRITE will use dontcache (buffered IO w/ dropbehind) + * + * This setting takes immediate effect for all NFS versions, + * all exports, and in all NFSD net namespaces. + */ + +static int nfsd_io_cache_write_get(void *data, u64 *val) +{ + *val = nfsd_io_cache_write; + return 0; +} + +static int nfsd_io_cache_write_set(void *data, u64 val) +{ + int ret = 0; + + switch (val) { + case NFSD_IO_BUFFERED: + case NFSD_IO_DONTCACHE: + case NFSD_IO_DIRECT: + nfsd_io_cache_write = val; + break; + default: + ret = -EINVAL; + break; + } + + return ret; +} + +DEFINE_DEBUGFS_ATTRIBUTE(nfsd_io_cache_write_fops, nfsd_io_cache_write_get, + nfsd_io_cache_write_set, "%llu\n"); + +void nfsd_debugfs_exit(void) +{ + debugfs_remove_recursive(nfsd_top_dir); + nfsd_top_dir = NULL; +} + +void nfsd_debugfs_init(void) +{ + nfsd_top_dir = debugfs_create_dir("nfsd", NULL); + + debugfs_create_file("disable-splice-read", S_IWUSR | S_IRUGO, + nfsd_top_dir, NULL, &nfsd_dsr_fops); + + debugfs_create_file("io_cache_read", 0644, nfsd_top_dir, NULL, + &nfsd_io_cache_read_fops); + + debugfs_create_file("io_cache_write", 0644, nfsd_top_dir, NULL, + &nfsd_io_cache_write_fops); +} diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index 802993d8912f..9d55512d0cc9 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -22,6 +22,8 @@ #include "nfsfh.h" #include "netns.h" #include "pnfs.h" +#include "filecache.h" +#include "trace.h" #define NFSDDBG_FACILITY NFSDDBG_EXPORT @@ -49,6 +51,11 @@ static void expkey_put(struct kref *ref) kfree_rcu(key, ek_rcu); } +static int expkey_upcall(struct cache_detail *cd, struct cache_head *h) +{ + return sunrpc_cache_pipe_upcall(cd, h); +} + static void expkey_request(struct cache_detail *cd, struct cache_head *h, char **bpp, int *blen) @@ -75,8 +82,7 @@ static int expkey_parse(struct cache_detail *cd, char *mesg, int mlen) int len; struct auth_domain *dom = NULL; int err; - int fsidtype; - char *ep; + u8 fsidtype; struct svc_expkey key; struct svc_expkey *ek = NULL; @@ -90,7 +96,7 @@ static int expkey_parse(struct cache_detail *cd, char *mesg, int mlen) goto out; err = -EINVAL; - if 
((len=qword_get(&mesg, buf, PAGE_SIZE)) <= 0) + if (qword_get(&mesg, buf, PAGE_SIZE) <= 0) goto out; err = -ENOENT; @@ -100,12 +106,11 @@ static int expkey_parse(struct cache_detail *cd, char *mesg, int mlen) dprintk("found domain %s\n", buf); err = -EINVAL; - if ((len=qword_get(&mesg, buf, PAGE_SIZE)) <= 0) + if (qword_get(&mesg, buf, PAGE_SIZE) <= 0) goto out; - fsidtype = simple_strtoul(buf, &ep, 10); - if (*ep) + if (kstrtou8(buf, 10, &fsidtype)) goto out; - dprintk("found fsidtype %d\n", fsidtype); + dprintk("found fsidtype %u\n", fsidtype); if (key_len(fsidtype)==0) /* invalid type */ goto out; if ((len=qword_get(&mesg, buf, PAGE_SIZE)) <= 0) @@ -116,11 +121,11 @@ static int expkey_parse(struct cache_detail *cd, char *mesg, int mlen) /* OK, we seem to have a valid key */ key.h.flags = 0; - key.h.expiry_time = get_expiry(&mesg); - if (key.h.expiry_time == 0) + err = get_expiry(&mesg, &key.h.expiry_time); + if (err) goto out; - key.ek_client = dom; + key.ek_client = dom; key.ek_fsidtype = fsidtype; memcpy(key.ek_fsid, buf, len); @@ -139,7 +144,9 @@ static int expkey_parse(struct cache_detail *cd, char *mesg, int mlen) if (len == 0) { set_bit(CACHE_NEGATIVE, &key.h.flags); ek = svc_expkey_update(cd, &key, ek); - if (!ek) + if (ek) + trace_nfsd_expkey_update(ek, NULL); + else err = -ENOMEM; } else { err = kern_path(buf, 0, &key.ek_path); @@ -149,7 +156,9 @@ static int expkey_parse(struct cache_detail *cd, char *mesg, int mlen) dprintk("Found the path %s\n", buf); ek = svc_expkey_update(cd, &key, ek); - if (!ek) + if (ek) + trace_nfsd_expkey_update(ek, buf); + else err = -ENOMEM; path_put(&key.ek_path); } @@ -232,11 +241,23 @@ static struct cache_head *expkey_alloc(void) return NULL; } +static void expkey_flush(void) +{ + /* + * Take the nfsd_mutex here to ensure that the file cache is not + * destroyed while we're in the middle of flushing. 
+ */ + mutex_lock(&nfsd_mutex); + nfsd_file_cache_purge(current->nsproxy->net_ns); + mutex_unlock(&nfsd_mutex); +} + static const struct cache_detail svc_expkey_cache_template = { .owner = THIS_MODULE, .hash_size = EXPKEY_HASHMAX, .name = "nfsd.fh", .cache_put = expkey_put, + .cache_upcall = expkey_upcall, .cache_request = expkey_request, .cache_parse = expkey_parse, .cache_show = expkey_show, @@ -244,6 +265,7 @@ static const struct cache_detail svc_expkey_cache_template = { .init = expkey_init, .update = expkey_update, .alloc = expkey_alloc, + .flush = expkey_flush, }; static int @@ -307,14 +329,54 @@ static void nfsd4_fslocs_free(struct nfsd4_fs_locations *fsloc) fsloc->locations = NULL; } +static int export_stats_init(struct export_stats *stats) +{ + stats->start_time = ktime_get_seconds(); + return percpu_counter_init_many(stats->counter, 0, GFP_KERNEL, + EXP_STATS_COUNTERS_NUM); +} + +static void export_stats_reset(struct export_stats *stats) +{ + if (stats) { + int i; + + for (i = 0; i < EXP_STATS_COUNTERS_NUM; i++) + percpu_counter_set(&stats->counter[i], 0); + } +} + +static void export_stats_destroy(struct export_stats *stats) +{ + if (stats) + percpu_counter_destroy_many(stats->counter, + EXP_STATS_COUNTERS_NUM); +} + +static void svc_export_release(struct rcu_head *rcu_head) +{ + struct svc_export *exp = container_of(rcu_head, struct svc_export, + ex_rcu); + + nfsd4_fslocs_free(&exp->ex_fslocs); + export_stats_destroy(exp->ex_stats); + kfree(exp->ex_stats); + kfree(exp->ex_uuid); + kfree(exp); +} + static void svc_export_put(struct kref *ref) { struct svc_export *exp = container_of(ref, struct svc_export, h.ref); + path_put(&exp->ex_path); auth_domain_put(exp->ex_client); - nfsd4_fslocs_free(&exp->ex_fslocs); - kfree(exp->ex_uuid); - kfree_rcu(exp, ex_rcu); + call_rcu(&exp->ex_rcu, svc_export_release); +} + +static int svc_export_upcall(struct cache_detail *cd, struct cache_head *h) +{ + return sunrpc_cache_pipe_upcall(cd, h); } static void svc_export_request(struct cache_detail *cd, @@ -340,8 +402,9 @@ static struct svc_export *svc_export_update(struct svc_export *new, struct svc_export *old); static struct svc_export *svc_export_lookup(struct svc_export *); -static int check_export(struct inode *inode, int *flags, unsigned char *uuid) +static int check_export(const struct path *path, int *flags, unsigned char *uuid) { + struct inode *inode = d_inode(path->dentry); /* * We currently export only dirs, regular files, and (for v4 @@ -365,6 +428,7 @@ static int check_export(struct inode *inode, int *flags, unsigned char *uuid) * or an FSID number (so NFSEXP_FSID or ->uuid is needed). * 2: We must be able to find an inode from a filehandle. * This means that s_export_op must be set. + * 3: We must not currently be on an idmapped mount. 
*/ if (!(inode->i_sb->s_type->fs_flags & FS_REQUIRES_DEV) && !(*flags & NFSEXP_FSID) && @@ -373,14 +437,23 @@ static int check_export(struct inode *inode, int *flags, unsigned char *uuid) return -EINVAL; } - if (!inode->i_sb->s_export_op || - !inode->i_sb->s_export_op->fh_to_dentry) { + if (!exportfs_can_decode_fh(inode->i_sb->s_export_op)) { dprintk("exp_export: export of invalid fs type.\n"); return -EINVAL; } - return 0; + if (is_idmapped_mnt(path->mnt)) { + dprintk("exp_export: export of idmapped mounts not yet supported.\n"); + return -EINVAL; + } + if (inode->i_sb->s_export_op->flags & EXPORT_OP_NOSUBTREECHK && + !(*flags & NFSEXP_NOSUBTREECHECK)) { + dprintk("%s: %s does not support subtree checking!\n", + __func__, inode->i_sb->s_type->name); + return -EINVAL; + } + return 0; } #ifdef CONFIG_NFSD_V4 @@ -487,6 +560,29 @@ static inline int secinfo_parse(char **mesg, char *buf, struct svc_export *exp) { return 0; } #endif +static int xprtsec_parse(char **mesg, char *buf, struct svc_export *exp) +{ + unsigned int i, mode, listsize; + int err; + + err = get_uint(mesg, &listsize); + if (err) + return err; + if (listsize > NFSEXP_XPRTSEC_NUM) + return -EINVAL; + + exp->ex_xprtsec_modes = 0; + for (i = 0; i < listsize; i++) { + err = get_uint(mesg, &mode); + if (err) + return err; + if (mode > NFSEXP_XPRTSEC_MTLS) + return -EINVAL; + exp->ex_xprtsec_modes |= mode; + } + return 0; +} + static inline int nfsd_uuid_parse(char **mesg, char *buf, unsigned char **puuid) { @@ -512,7 +608,6 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen) { /* client path expiry [flags anonuid anongid fsid] */ char *buf; - int len; int err; struct auth_domain *dom = NULL; struct svc_export exp = {}, *expp; @@ -528,8 +623,7 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen) /* client */ err = -EINVAL; - len = qword_get(&mesg, buf, PAGE_SIZE); - if (len <= 0) + if (qword_get(&mesg, buf, PAGE_SIZE) <= 0) goto out; err = -ENOENT; @@ -539,7 +633,7 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen) /* path */ err = -EINVAL; - if ((len = qword_get(&mesg, buf, PAGE_SIZE)) <= 0) + if (qword_get(&mesg, buf, PAGE_SIZE) <= 0) goto out1; err = kern_path(buf, 0, &exp.ex_path); @@ -549,11 +643,11 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen) exp.ex_client = dom; exp.cd = cd; exp.ex_devid_map = NULL; + exp.ex_xprtsec_modes = NFSEXP_XPRTSEC_ALL; /* expiry */ - err = -EINVAL; - exp.h.expiry_time = get_expiry(&mesg); - if (exp.h.expiry_time == 0) + err = get_expiry(&mesg, &exp.h.expiry_time); + if (err) goto out3; /* flags */ @@ -565,18 +659,18 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen) if (err || an_int < 0) goto out3; exp.ex_flags= an_int; - + /* anon uid */ err = get_int(&mesg, &an_int); if (err) goto out3; - exp.ex_anon_uid= make_kuid(&init_user_ns, an_int); + exp.ex_anon_uid= make_kuid(current_user_ns(), an_int); /* anon gid */ err = get_int(&mesg, &an_int); if (err) goto out3; - exp.ex_anon_gid= make_kgid(&init_user_ns, an_int); + exp.ex_anon_gid= make_kgid(current_user_ns(), an_int); /* fsid */ err = get_int(&mesg, &an_int); @@ -584,13 +678,15 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen) goto out3; exp.ex_fsid = an_int; - while ((len = qword_get(&mesg, buf, PAGE_SIZE)) > 0) { + while (qword_get(&mesg, buf, PAGE_SIZE) > 0) { if (strcmp(buf, "fsloc") == 0) err = fsloc_parse(&mesg, buf, &exp.ex_fslocs); else if (strcmp(buf, "uuid") == 0) err = 
nfsd_uuid_parse(&mesg, buf, &exp.ex_uuid); else if (strcmp(buf, "secinfo") == 0) err = secinfo_parse(&mesg, buf, &exp); + else if (strcmp(buf, "xprtsec") == 0) + err = xprtsec_parse(&mesg, buf, &exp); else /* quietly ignore unknown words and anything * following. Newer user-space can try to set @@ -601,10 +697,10 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen) goto out4; } - err = check_export(d_inode(exp.ex_path.dentry), &exp.ex_flags, - exp.ex_uuid); + err = check_export(&exp.ex_path, &exp.ex_flags, exp.ex_uuid); if (err) goto out4; + /* * No point caching this if it would immediately expire. * Also, this protects exportfs's dummy export from the @@ -630,15 +726,17 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen) } expp = svc_export_lookup(&exp); - if (expp) - expp = svc_export_update(&exp, expp); - else - err = -ENOMEM; - cache_flush(); - if (expp == NULL) + if (!expp) { err = -ENOMEM; - else + goto out4; + } + expp = svc_export_update(&exp, expp); + if (expp) { + trace_nfsd_export_update(expp); + cache_flush(); exp_put(expp); + } else + err = -ENOMEM; out4: nfsd4_fslocs_free(&exp.ex_fslocs); kfree(exp.ex_uuid); @@ -655,22 +753,49 @@ static void exp_flags(struct seq_file *m, int flag, int fsid, kuid_t anonu, kgid_t anong, struct nfsd4_fs_locations *fslocs); static void show_secinfo(struct seq_file *m, struct svc_export *exp); +static int is_export_stats_file(struct seq_file *m) +{ + /* + * The export_stats file uses the same ops as the exports file. + * We use the file's name to determine the reported info per export. + * There is no rename in nsfdfs, so d_name.name is stable. + */ + return !strcmp(m->file->f_path.dentry->d_name.name, "export_stats"); +} + static int svc_export_show(struct seq_file *m, struct cache_detail *cd, struct cache_head *h) { - struct svc_export *exp ; + struct svc_export *exp; + bool export_stats = is_export_stats_file(m); - if (h ==NULL) { - seq_puts(m, "#path domain(flags)\n"); + if (h == NULL) { + if (export_stats) + seq_puts(m, "#path domain start-time\n#\tstats\n"); + else + seq_puts(m, "#path domain(flags)\n"); return 0; } exp = container_of(h, struct svc_export, h); seq_path(m, &exp->ex_path, " \t\n\\"); seq_putc(m, '\t'); seq_escape(m, exp->ex_client->name, " \t\n\\"); + if (export_stats) { + struct percpu_counter *counter = exp->ex_stats->counter; + + seq_printf(m, "\t%lld\n", exp->ex_stats->start_time); + seq_printf(m, "\tfh_stale: %lld\n", + percpu_counter_sum_positive(&counter[EXP_STATS_FH_STALE])); + seq_printf(m, "\tio_read: %lld\n", + percpu_counter_sum_positive(&counter[EXP_STATS_IO_READ])); + seq_printf(m, "\tio_write: %lld\n", + percpu_counter_sum_positive(&counter[EXP_STATS_IO_WRITE])); + seq_putc(m, '\n'); + return 0; + } seq_putc(m, '('); - if (test_bit(CACHE_VALID, &h->flags) && + if (test_bit(CACHE_VALID, &h->flags) && !test_bit(CACHE_NEGATIVE, &h->flags)) { exp_flags(m, exp->ex_flags, exp->ex_fsid, exp->ex_anon_uid, exp->ex_anon_gid, &exp->ex_fslocs); @@ -711,6 +836,7 @@ static void svc_export_init(struct cache_head *cnew, struct cache_head *citem) new->ex_layout_types = 0; new->ex_uuid = NULL; new->cd = item->cd; + export_stats_reset(new->ex_stats); } static void export_update(struct cache_head *cnew, struct cache_head *citem) @@ -738,15 +864,28 @@ static void export_update(struct cache_head *cnew, struct cache_head *citem) for (i = 0; i < MAX_SECINFO_LIST; i++) { new->ex_flavors[i] = item->ex_flavors[i]; } + new->ex_xprtsec_modes = item->ex_xprtsec_modes; } static struct 
cache_head *svc_export_alloc(void) { struct svc_export *i = kmalloc(sizeof(*i), GFP_KERNEL); - if (i) - return &i->h; - else + if (!i) return NULL; + + i->ex_stats = kmalloc(sizeof(*(i->ex_stats)), GFP_KERNEL); + if (!i->ex_stats) { + kfree(i); + return NULL; + } + + if (export_stats_init(i->ex_stats)) { + kfree(i->ex_stats); + kfree(i); + return NULL; + } + + return &i->h; } static const struct cache_detail svc_export_cache_template = { @@ -754,6 +893,7 @@ static const struct cache_detail svc_export_cache_template = { .hash_size = EXPORT_HASHMAX, .name = "nfsd.export", .cache_put = svc_export_put, + .cache_upcall = svc_export_upcall, .cache_request = svc_export_request, .cache_parse = svc_export_parse, .cache_show = svc_export_show, @@ -819,8 +959,10 @@ exp_find_key(struct cache_detail *cd, struct auth_domain *clp, int fsid_type, if (ek == NULL) return ERR_PTR(-ENOMEM); err = cache_check(cd, &ek->h, reqp); - if (err) + if (err) { + trace_nfsd_exp_find_key(&key, err); return ERR_PTR(err); + } return ek; } @@ -842,8 +984,10 @@ exp_get_by_name(struct cache_detail *cd, struct auth_domain *clp, if (exp == NULL) return ERR_PTR(-ENOMEM); err = cache_check(cd, &exp->h, reqp); - if (err) + if (err) { + trace_nfsd_exp_get_by_name(&key, err); return ERR_PTR(err); + } return exp; } @@ -937,24 +1081,76 @@ static struct svc_export *exp_find(struct cache_detail *cd, return exp; } -__be32 check_nfsd_access(struct svc_export *exp, struct svc_rqst *rqstp) +/** + * check_xprtsec_policy - check if access to export is allowed by the + * xprtsec policy + * @exp: svc_export that is being accessed. + * @rqstp: svc_rqst attempting to access @exp. + * + * Helper function for check_nfsd_access(). Note that callers should be + * using check_nfsd_access() instead of calling this function directly. The + * one exception is __fh_verify() since it has logic that may result in one + * or both of the helpers being skipped. + * + * Return values: + * %nfs_ok if access is granted, or + * %nfserr_wrongsec if access is denied + */ +__be32 check_xprtsec_policy(struct svc_export *exp, struct svc_rqst *rqstp) { - struct exp_flavor_info *f; - struct exp_flavor_info *end = exp->ex_flavors + exp->ex_nflavors; + struct svc_xprt *xprt = rqstp->rq_xprt; + + if (exp->ex_xprtsec_modes & NFSEXP_XPRTSEC_NONE) { + if (!test_bit(XPT_TLS_SESSION, &xprt->xpt_flags)) + return nfs_ok; + } + if (exp->ex_xprtsec_modes & NFSEXP_XPRTSEC_TLS) { + if (test_bit(XPT_TLS_SESSION, &xprt->xpt_flags) && + !test_bit(XPT_PEER_AUTH, &xprt->xpt_flags)) + return nfs_ok; + } + if (exp->ex_xprtsec_modes & NFSEXP_XPRTSEC_MTLS) { + if (test_bit(XPT_TLS_SESSION, &xprt->xpt_flags) && + test_bit(XPT_PEER_AUTH, &xprt->xpt_flags)) + return nfs_ok; + } + return nfserr_wrongsec; +} + +/** + * check_security_flavor - check if access to export is allowed by the + * security flavor + * @exp: svc_export that is being accessed. + * @rqstp: svc_rqst attempting to access @exp. + * @may_bypass_gss: reduce strictness of authorization check + * + * Helper function for check_nfsd_access(). Note that callers should be + * using check_nfsd_access() instead of calling this function directly. The + * one exception is __fh_verify() since it has logic that may result in one + * or both of the helpers being skipped. 
+ * + * Return values: + * %nfs_ok if access is granted, or + * %nfserr_wrongsec if access is denied + */ +__be32 check_security_flavor(struct svc_export *exp, struct svc_rqst *rqstp, + bool may_bypass_gss) +{ + struct exp_flavor_info *f, *end = exp->ex_flavors + exp->ex_nflavors; /* legacy gss-only clients are always OK: */ if (exp->ex_client == rqstp->rq_gssclient) - return 0; + return nfs_ok; /* ip-address based client; check sec= export option: */ for (f = exp->ex_flavors; f < end; f++) { if (f->pseudoflavor == rqstp->rq_cred.cr_flavor) - return 0; + return nfs_ok; } /* defaults in absence of sec= options: */ if (exp->ex_nflavors == 0) { if (rqstp->rq_cred.cr_flavor == RPC_AUTH_NULL || rqstp->rq_cred.cr_flavor == RPC_AUTH_UNIX) - return 0; + return nfs_ok; } /* If the compound op contains a spo_must_allowed op, @@ -964,11 +1160,49 @@ __be32 check_nfsd_access(struct svc_export *exp, struct svc_rqst *rqstp) */ if (nfsd4_spo_must_allow(rqstp)) - return 0; + return nfs_ok; + + /* Some calls may be processed without authentication + * on GSS exports. For example NFS2/3 calls on root + * directory, see section 2.3.2 of rfc 2623. + * For "may_bypass_gss" check that export has really + * enabled some flavor with authentication (GSS or any + * other) and also check that the used auth flavor is + * without authentication (none or sys). + */ + if (may_bypass_gss && ( + rqstp->rq_cred.cr_flavor == RPC_AUTH_NULL || + rqstp->rq_cred.cr_flavor == RPC_AUTH_UNIX)) { + for (f = exp->ex_flavors; f < end; f++) { + if (f->pseudoflavor >= RPC_AUTH_DES) + return 0; + } + } return nfserr_wrongsec; } +/** + * check_nfsd_access - check if access to export is allowed. + * @exp: svc_export that is being accessed. + * @rqstp: svc_rqst attempting to access @exp. + * @may_bypass_gss: reduce strictness of authorization check + * + * Return values: + * %nfs_ok if access is granted, or + * %nfserr_wrongsec if access is denied + */ +__be32 check_nfsd_access(struct svc_export *exp, struct svc_rqst *rqstp, + bool may_bypass_gss) +{ + __be32 status; + + status = check_xprtsec_policy(exp, rqstp); + if (status != nfs_ok) + return status; + return check_security_flavor(exp, rqstp, may_bypass_gss); +} + /* * Uses rq_client and rq_gssclient to find an export; uses rq_client (an * auth_unix client) if it's available and has secinfo information; @@ -979,7 +1213,7 @@ __be32 check_nfsd_access(struct svc_export *exp, struct svc_rqst *rqstp) * use exp_get_by_name() or exp_find(). */ struct svc_export * -rqst_exp_get_by_name(struct svc_rqst *rqstp, struct path *path) +rqst_exp_get_by_name(struct svc_rqst *rqstp, const struct path *path) { struct svc_export *gssexp, *exp = ERR_PTR(-ENOENT); struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); @@ -1009,19 +1243,35 @@ gss: return gssexp; } +/** + * rqst_exp_find - Find an svc_export in the context of a rqst or similar + * @reqp: The handle to be used to suspend the request if a cache-upcall is needed + * If NULL, missing in-cache information will result in failure. + * @net: The network namespace in which the request exists + * @cl: default auth_domain to use for looking up the export + * @gsscl: an alternate auth_domain defined using deprecated gss/krb5 format. + * @fsid_type: The type of fsid to look for + * @fsidv: The actual fsid to look up in the context of either client. + * + * Perform a lookup for @cl/@fsidv in the given @net for an export. If + * none found and @gsscl specified, repeat the lookup. + * + * Returns an export, or an error pointer. 
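+ *
+ * A minimal call sketch, assuming an nfsd thread where the svc_rqst is
+ * available (this mirrors rqst_find_fsidzero_export() below):
+ *
+ *	exp = rqst_exp_find(&rqstp->rq_chandle, SVC_NET(rqstp),
+ *			    rqstp->rq_client, rqstp->rq_gssclient,
+ *			    FSID_NUM, fsidv);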
+ */ struct svc_export * -rqst_exp_find(struct svc_rqst *rqstp, int fsid_type, u32 *fsidv) +rqst_exp_find(struct cache_req *reqp, struct net *net, + struct auth_domain *cl, struct auth_domain *gsscl, + int fsid_type, u32 *fsidv) { + struct nfsd_net *nn = net_generic(net, nfsd_net_id); struct svc_export *gssexp, *exp = ERR_PTR(-ENOENT); - struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); struct cache_detail *cd = nn->svc_export_cache; - if (rqstp->rq_client == NULL) + if (!cl) goto gss; /* First try the auth_unix client: */ - exp = exp_find(cd, rqstp->rq_client, fsid_type, - fsidv, &rqstp->rq_chandle); + exp = exp_find(cd, cl, fsid_type, fsidv, reqp); if (PTR_ERR(exp) == -ENOENT) goto gss; if (IS_ERR(exp)) @@ -1031,10 +1281,9 @@ rqst_exp_find(struct svc_rqst *rqstp, int fsid_type, u32 *fsidv) return exp; gss: /* Otherwise, try falling back on gss client */ - if (rqstp->rq_gssclient == NULL) + if (!gsscl) return exp; - gssexp = exp_find(cd, rqstp->rq_gssclient, fsid_type, fsidv, - &rqstp->rq_chandle); + gssexp = exp_find(cd, gsscl, fsid_type, fsidv, reqp); if (PTR_ERR(gssexp) == -ENOENT) return exp; if (!IS_ERR(exp)) @@ -1065,7 +1314,9 @@ struct svc_export *rqst_find_fsidzero_export(struct svc_rqst *rqstp) mk_fsid(FSID_NUM, fsidv, 0, 0, 0, NULL); - return rqst_exp_find(rqstp, FSID_NUM, fsidv); + return rqst_exp_find(&rqstp->rq_chandle, SVC_NET(rqstp), + rqstp->rq_client, rqstp->rq_gssclient, + FSID_NUM, fsidv); } /* @@ -1170,15 +1421,17 @@ static void show_secinfo(struct seq_file *m, struct svc_export *exp) static void exp_flags(struct seq_file *m, int flag, int fsid, kuid_t anonu, kgid_t anong, struct nfsd4_fs_locations *fsloc) { + struct user_namespace *userns = m->file->f_cred->user_ns; + show_expflags(m, flag, NFSEXP_ALLFLAGS); if (flag & NFSEXP_FSID) seq_printf(m, ",fsid=%d", fsid); - if (!uid_eq(anonu, make_kuid(&init_user_ns, (uid_t)-2)) && - !uid_eq(anonu, make_kuid(&init_user_ns, 0x10000-2))) - seq_printf(m, ",anonuid=%u", from_kuid(&init_user_ns, anonu)); - if (!gid_eq(anong, make_kgid(&init_user_ns, (gid_t)-2)) && - !gid_eq(anong, make_kgid(&init_user_ns, 0x10000-2))) - seq_printf(m, ",anongid=%u", from_kgid(&init_user_ns, anong)); + if (!uid_eq(anonu, make_kuid(userns, (uid_t)-2)) && + !uid_eq(anonu, make_kuid(userns, 0x10000-2))) + seq_printf(m, ",anonuid=%u", from_kuid_munged(userns, anonu)); + if (!gid_eq(anong, make_kgid(userns, (gid_t)-2)) && + !gid_eq(anong, make_kgid(userns, 0x10000-2))) + seq_printf(m, ",anongid=%u", from_kgid_munged(userns, anong)); if (fsloc && fsloc->locations_count > 0) { char *loctype = (fsloc->migrated) ? 
"refer" : "replicas"; int i; @@ -1201,17 +1454,20 @@ static int e_show(struct seq_file *m, void *p) struct cache_head *cp = p; struct svc_export *exp = container_of(cp, struct svc_export, h); struct cache_detail *cd = m->private; + bool export_stats = is_export_stats_file(m); if (p == SEQ_START_TOKEN) { seq_puts(m, "# Version 1.1\n"); - seq_puts(m, "# Path Client(Flags) # IPs\n"); + if (export_stats) + seq_puts(m, "# Path Client Start-time\n#\tStats\n"); + else + seq_puts(m, "# Path Client(Flags) # IPs\n"); return 0; } - exp_get(exp); - if (cache_check(cd, &exp->h, NULL)) + if (cache_check_rcu(cd, &exp->h, NULL)) return 0; - exp_put(exp); + return svc_export_show(m, cd, cp); } diff --git a/fs/nfsd/export.h b/fs/nfsd/export.h index e7daa1f246f0..d2b09cd76145 100644 --- a/fs/nfsd/export.h +++ b/fs/nfsd/export.h @@ -6,6 +6,7 @@ #define NFSD_EXPORT_H #include <linux/sunrpc/cache.h> +#include <linux/percpu_counter.h> #include <uapi/linux/nfsd/export.h> #include <linux/nfs4.h> @@ -46,14 +47,27 @@ struct exp_flavor_info { u32 flags; }; +/* Per-export stats */ +enum { + EXP_STATS_FH_STALE, + EXP_STATS_IO_READ, + EXP_STATS_IO_WRITE, + EXP_STATS_COUNTERS_NUM +}; + +struct export_stats { + time64_t start_time; + struct percpu_counter counter[EXP_STATS_COUNTERS_NUM]; +}; + struct svc_export { struct cache_head h; struct auth_domain * ex_client; int ex_flags; + int ex_fsid; struct path ex_path; kuid_t ex_anon_uid; kgid_t ex_anon_gid; - int ex_fsid; unsigned char * ex_uuid; /* 16 byte fsid */ struct nfsd4_fs_locations ex_fslocs; uint32_t ex_nflavors; @@ -62,6 +76,8 @@ struct svc_export { struct nfsd4_deviceid_map *ex_devid_map; struct cache_detail *cd; struct rcu_head ex_rcu; + unsigned long ex_xprtsec_modes; + struct export_stats *ex_stats; }; /* an "export key" (expkey) maps a filehandlefragement to an @@ -72,7 +88,7 @@ struct svc_expkey { struct cache_head h; struct auth_domain * ek_client; - int ek_fsidtype; + u8 ek_fsidtype; u32 ek_fsid[6]; struct path ek_path; @@ -83,8 +99,13 @@ struct svc_expkey { #define EX_NOHIDE(exp) ((exp)->ex_flags & NFSEXP_NOHIDE) #define EX_WGATHER(exp) ((exp)->ex_flags & NFSEXP_GATHERED_WRITES) -int nfsexp_flags(struct svc_rqst *rqstp, struct svc_export *exp); -__be32 check_nfsd_access(struct svc_export *exp, struct svc_rqst *rqstp); +struct svc_cred; +int nfsexp_flags(struct svc_cred *cred, struct svc_export *exp); +__be32 check_xprtsec_policy(struct svc_export *exp, struct svc_rqst *rqstp); +__be32 check_security_flavor(struct svc_export *exp, struct svc_rqst *rqstp, + bool may_bypass_gss); +__be32 check_nfsd_access(struct svc_export *exp, struct svc_rqst *rqstp, + bool may_bypass_gss); /* * Function declarations @@ -93,14 +114,13 @@ int nfsd_export_init(struct net *); void nfsd_export_shutdown(struct net *); void nfsd_export_flush(struct net *); struct svc_export * rqst_exp_get_by_name(struct svc_rqst *, - struct path *); + const struct path *); struct svc_export * rqst_exp_parent(struct svc_rqst *, struct path *); struct svc_export * rqst_find_fsidzero_export(struct svc_rqst *); int exp_rootfh(struct net *, struct auth_domain *, char *path, struct knfsd_fh *, int maxsize); __be32 exp_pseudoroot(struct svc_rqst *, struct svc_fh *); -__be32 nfserrno(int errno); static inline void exp_put(struct svc_export *exp) { @@ -112,6 +132,8 @@ static inline struct svc_export *exp_get(struct svc_export *exp) cache_get(&exp->h); return exp; } -struct svc_export * rqst_exp_find(struct svc_rqst *, int, u32 *); +struct svc_export *rqst_exp_find(struct cache_req *reqp, struct net *net, 
+ struct auth_domain *cl, struct auth_domain *gsscl, + int fsid_type, u32 *fsidv); #endif /* NFSD_EXPORT_H */ diff --git a/fs/nfsd/fault_inject.c b/fs/nfsd/fault_inject.c deleted file mode 100644 index 84831253203d..000000000000 --- a/fs/nfsd/fault_inject.c +++ /dev/null @@ -1,150 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (c) 2011 Bryan Schumaker <bjschuma@netapp.com> - * - * Uses debugfs to create fault injection points for client testing - */ - -#include <linux/types.h> -#include <linux/fs.h> -#include <linux/debugfs.h> -#include <linux/module.h> -#include <linux/nsproxy.h> -#include <linux/sunrpc/addr.h> -#include <linux/uaccess.h> -#include <linux/kernel.h> - -#include "state.h" -#include "netns.h" - -struct nfsd_fault_inject_op { - char *file; - u64 (*get)(void); - u64 (*set_val)(u64); - u64 (*set_clnt)(struct sockaddr_storage *, size_t); -}; - -static struct dentry *debug_dir; - -static ssize_t fault_inject_read(struct file *file, char __user *buf, - size_t len, loff_t *ppos) -{ - static u64 val; - char read_buf[25]; - size_t size; - loff_t pos = *ppos; - struct nfsd_fault_inject_op *op = file_inode(file)->i_private; - - if (!pos) - val = op->get(); - size = scnprintf(read_buf, sizeof(read_buf), "%llu\n", val); - - return simple_read_from_buffer(buf, len, ppos, read_buf, size); -} - -static ssize_t fault_inject_write(struct file *file, const char __user *buf, - size_t len, loff_t *ppos) -{ - char write_buf[INET6_ADDRSTRLEN]; - size_t size = min(sizeof(write_buf) - 1, len); - struct net *net = current->nsproxy->net_ns; - struct sockaddr_storage sa; - struct nfsd_fault_inject_op *op = file_inode(file)->i_private; - u64 val; - char *nl; - - if (copy_from_user(write_buf, buf, size)) - return -EFAULT; - write_buf[size] = '\0'; - - /* Deal with any embedded newlines in the string */ - nl = strchr(write_buf, '\n'); - if (nl) { - size = nl - write_buf; - *nl = '\0'; - } - - size = rpc_pton(net, write_buf, size, (struct sockaddr *)&sa, sizeof(sa)); - if (size > 0) { - val = op->set_clnt(&sa, size); - if (val) - pr_info("NFSD [%s]: Client %s had %llu state object(s)\n", - op->file, write_buf, val); - } else { - val = simple_strtoll(write_buf, NULL, 0); - if (val == 0) - pr_info("NFSD Fault Injection: %s (all)", op->file); - else - pr_info("NFSD Fault Injection: %s (n = %llu)", - op->file, val); - val = op->set_val(val); - pr_info("NFSD: %s: found %llu", op->file, val); - } - return len; /* on success, claim we got the whole input */ -} - -static const struct file_operations fops_nfsd = { - .owner = THIS_MODULE, - .read = fault_inject_read, - .write = fault_inject_write, -}; - -void nfsd_fault_inject_cleanup(void) -{ - debugfs_remove_recursive(debug_dir); -} - -static struct nfsd_fault_inject_op inject_ops[] = { - { - .file = "forget_clients", - .get = nfsd_inject_print_clients, - .set_val = nfsd_inject_forget_clients, - .set_clnt = nfsd_inject_forget_client, - }, - { - .file = "forget_locks", - .get = nfsd_inject_print_locks, - .set_val = nfsd_inject_forget_locks, - .set_clnt = nfsd_inject_forget_client_locks, - }, - { - .file = "forget_openowners", - .get = nfsd_inject_print_openowners, - .set_val = nfsd_inject_forget_openowners, - .set_clnt = nfsd_inject_forget_client_openowners, - }, - { - .file = "forget_delegations", - .get = nfsd_inject_print_delegations, - .set_val = nfsd_inject_forget_delegations, - .set_clnt = nfsd_inject_forget_client_delegations, - }, - { - .file = "recall_delegations", - .get = nfsd_inject_print_delegations, - .set_val = 
nfsd_inject_recall_delegations, - .set_clnt = nfsd_inject_recall_client_delegations, - }, -}; - -int nfsd_fault_inject_init(void) -{ - unsigned int i; - struct nfsd_fault_inject_op *op; - umode_t mode = S_IFREG | S_IRUSR | S_IWUSR; - - debug_dir = debugfs_create_dir("nfsd", NULL); - if (!debug_dir) - goto fail; - - for (i = 0; i < ARRAY_SIZE(inject_ops); i++) { - op = &inject_ops[i]; - if (!debugfs_create_file(op->file, mode, debug_dir, op, &fops_nfsd)) - goto fail; - } - return 0; - -fail: - nfsd_fault_inject_cleanup(); - return -ENOMEM; -} diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c new file mode 100644 index 000000000000..93798575b807 --- /dev/null +++ b/fs/nfsd/filecache.c @@ -0,0 +1,1430 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * The NFSD open file cache. + * + * (c) 2015 - Jeff Layton <jeff.layton@primarydata.com> + * + * An nfsd_file object is a per-file collection of open state that binds + * together: + * - a struct file * + * - a user credential + * - a network namespace + * - a read-ahead context + * - monitoring for writeback errors + * + * nfsd_file objects are reference-counted. Consumers acquire a new + * object via the nfsd_file_acquire API. They manage their interest in + * the acquired object, and hence the object's reference count, via + * nfsd_file_get and nfsd_file_put. There are two varieties of nfsd_file + * object: + * + * * non-garbage-collected: When a consumer wants to precisely control + * the lifetime of a file's open state, it acquires a non-garbage- + * collected nfsd_file. The final nfsd_file_put releases the open + * state immediately. + * + * * garbage-collected: When a consumer does not control the lifetime + * of open state, it acquires a garbage-collected nfsd_file. The + * final nfsd_file_put allows the open state to linger for a period + * during which it may be re-used. 
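+ *
+ * A minimal consumer sketch, assuming an nfsd thread that holds a
+ * verified filehandle (local variable names are illustrative only):
+ *
+ *	struct nfsd_file *nf;
+ *	__be32 status;
+ *
+ *	status = nfsd_file_acquire_gc(rqstp, fhp, NFSD_MAY_READ, &nf);
+ *	if (status != nfs_ok)
+ *		return status;
+ *	... issue I/O against nfsd_file_file(nf) ...
+ *	nfsd_file_put(nf);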
+ */ + +#include <linux/hash.h> +#include <linux/slab.h> +#include <linux/file.h> +#include <linux/pagemap.h> +#include <linux/sched.h> +#include <linux/list_lru.h> +#include <linux/fsnotify_backend.h> +#include <linux/fsnotify.h> +#include <linux/seq_file.h> +#include <linux/rhashtable.h> +#include <linux/nfslocalio.h> + +#include "vfs.h" +#include "nfsd.h" +#include "nfsfh.h" +#include "netns.h" +#include "filecache.h" +#include "trace.h" + +#define NFSD_LAUNDRETTE_DELAY (2 * HZ) + +#define NFSD_FILE_CACHE_UP (0) + +/* We only care about NFSD_MAY_READ/WRITE for this cache */ +#define NFSD_FILE_MAY_MASK (NFSD_MAY_READ|NFSD_MAY_WRITE|NFSD_MAY_LOCALIO) + +static DEFINE_PER_CPU(unsigned long, nfsd_file_cache_hits); +static DEFINE_PER_CPU(unsigned long, nfsd_file_acquisitions); +static DEFINE_PER_CPU(unsigned long, nfsd_file_allocations); +static DEFINE_PER_CPU(unsigned long, nfsd_file_releases); +static DEFINE_PER_CPU(unsigned long, nfsd_file_total_age); +static DEFINE_PER_CPU(unsigned long, nfsd_file_evictions); + +struct nfsd_fcache_disposal { + spinlock_t lock; + struct list_head freeme; +}; + +static struct kmem_cache *nfsd_file_slab; +static struct kmem_cache *nfsd_file_mark_slab; +static struct list_lru nfsd_file_lru; +static unsigned long nfsd_file_flags; +static struct fsnotify_group *nfsd_file_fsnotify_group; +static struct delayed_work nfsd_filecache_laundrette; +static struct rhltable nfsd_file_rhltable + ____cacheline_aligned_in_smp; + +static bool +nfsd_match_cred(const struct cred *c1, const struct cred *c2) +{ + int i; + + if (!uid_eq(c1->fsuid, c2->fsuid)) + return false; + if (!gid_eq(c1->fsgid, c2->fsgid)) + return false; + if (c1->group_info == NULL || c2->group_info == NULL) + return c1->group_info == c2->group_info; + if (c1->group_info->ngroups != c2->group_info->ngroups) + return false; + for (i = 0; i < c1->group_info->ngroups; i++) { + if (!gid_eq(c1->group_info->gid[i], c2->group_info->gid[i])) + return false; + } + return true; +} + +static const struct rhashtable_params nfsd_file_rhash_params = { + .key_len = sizeof_field(struct nfsd_file, nf_inode), + .key_offset = offsetof(struct nfsd_file, nf_inode), + .head_offset = offsetof(struct nfsd_file, nf_rlist), + + /* + * Start with a single page hash table to reduce resizing churn + * on light workloads. 
+ */ + .min_size = 256, + .automatic_shrinking = true, +}; + +static void +nfsd_file_schedule_laundrette(void) +{ + if (test_bit(NFSD_FILE_CACHE_UP, &nfsd_file_flags)) + queue_delayed_work(system_dfl_wq, &nfsd_filecache_laundrette, + NFSD_LAUNDRETTE_DELAY); +} + +static void +nfsd_file_slab_free(struct rcu_head *rcu) +{ + struct nfsd_file *nf = container_of(rcu, struct nfsd_file, nf_rcu); + + put_cred(nf->nf_cred); + kmem_cache_free(nfsd_file_slab, nf); +} + +static void +nfsd_file_mark_free(struct fsnotify_mark *mark) +{ + struct nfsd_file_mark *nfm = container_of(mark, struct nfsd_file_mark, + nfm_mark); + + kmem_cache_free(nfsd_file_mark_slab, nfm); +} + +static struct nfsd_file_mark * +nfsd_file_mark_get(struct nfsd_file_mark *nfm) +{ + if (!refcount_inc_not_zero(&nfm->nfm_ref)) + return NULL; + return nfm; +} + +static void +nfsd_file_mark_put(struct nfsd_file_mark *nfm) +{ + if (refcount_dec_and_test(&nfm->nfm_ref)) { + fsnotify_destroy_mark(&nfm->nfm_mark, nfsd_file_fsnotify_group); + fsnotify_put_mark(&nfm->nfm_mark); + } +} + +static struct nfsd_file_mark * +nfsd_file_mark_find_or_create(struct inode *inode) +{ + int err; + struct fsnotify_mark *mark; + struct nfsd_file_mark *nfm = NULL, *new; + + do { + fsnotify_group_lock(nfsd_file_fsnotify_group); + mark = fsnotify_find_inode_mark(inode, + nfsd_file_fsnotify_group); + if (mark) { + nfm = nfsd_file_mark_get(container_of(mark, + struct nfsd_file_mark, + nfm_mark)); + fsnotify_group_unlock(nfsd_file_fsnotify_group); + if (nfm) { + fsnotify_put_mark(mark); + break; + } + /* Avoid soft lockup race with nfsd_file_mark_put() */ + fsnotify_destroy_mark(mark, nfsd_file_fsnotify_group); + fsnotify_put_mark(mark); + } else { + fsnotify_group_unlock(nfsd_file_fsnotify_group); + } + + /* allocate a new nfm */ + new = kmem_cache_alloc(nfsd_file_mark_slab, GFP_KERNEL); + if (!new) + return NULL; + fsnotify_init_mark(&new->nfm_mark, nfsd_file_fsnotify_group); + new->nfm_mark.mask = FS_ATTRIB|FS_DELETE_SELF; + refcount_set(&new->nfm_ref, 1); + + err = fsnotify_add_inode_mark(&new->nfm_mark, inode, 0); + + /* + * If the add was successful, then return the object. + * Otherwise, we need to put the reference we hold on the + * nfm_mark. The fsnotify code will take a reference and put + * it on failure, so we can't just free it directly. It's also + * not safe to call fsnotify_destroy_mark on it as the + * mark->group will be NULL. Thus, we can't let the nfm_ref + * counter drive the destruction at this point. + */ + if (likely(!err)) + nfm = new; + else + fsnotify_put_mark(&new->nfm_mark); + } while (unlikely(err == -EEXIST)); + + return nfm; +} + +static struct nfsd_file * +nfsd_file_alloc(struct net *net, struct inode *inode, unsigned char need, + bool want_gc) +{ + struct nfsd_file *nf; + + nf = kmem_cache_alloc(nfsd_file_slab, GFP_KERNEL); + if (unlikely(!nf)) + return NULL; + + this_cpu_inc(nfsd_file_allocations); + INIT_LIST_HEAD(&nf->nf_lru); + INIT_LIST_HEAD(&nf->nf_gc); + nf->nf_birthtime = ktime_get(); + nf->nf_file = NULL; + nf->nf_cred = get_current_cred(); + nf->nf_net = net; + nf->nf_flags = want_gc ? 
+ BIT(NFSD_FILE_HASHED) | BIT(NFSD_FILE_PENDING) | BIT(NFSD_FILE_GC) : + BIT(NFSD_FILE_HASHED) | BIT(NFSD_FILE_PENDING); + nf->nf_inode = inode; + refcount_set(&nf->nf_ref, 1); + nf->nf_may = need; + nf->nf_mark = NULL; + nf->nf_dio_mem_align = 0; + nf->nf_dio_offset_align = 0; + nf->nf_dio_read_offset_align = 0; + return nf; +} + +/** + * nfsd_file_check_write_error - check for writeback errors on a file + * @nf: nfsd_file to check for writeback errors + * + * Check whether a nfsd_file has an unseen error. Reset the write + * verifier if so. + */ +static void +nfsd_file_check_write_error(struct nfsd_file *nf) +{ + struct file *file = nf->nf_file; + + if ((file->f_mode & FMODE_WRITE) && + filemap_check_wb_err(file->f_mapping, READ_ONCE(file->f_wb_err))) + nfsd_reset_write_verifier(net_generic(nf->nf_net, nfsd_net_id)); +} + +static void +nfsd_file_hash_remove(struct nfsd_file *nf) +{ + trace_nfsd_file_unhash(nf); + rhltable_remove(&nfsd_file_rhltable, &nf->nf_rlist, + nfsd_file_rhash_params); +} + +static bool +nfsd_file_unhash(struct nfsd_file *nf) +{ + if (test_and_clear_bit(NFSD_FILE_HASHED, &nf->nf_flags)) { + nfsd_file_hash_remove(nf); + return true; + } + return false; +} + +static void +nfsd_file_free(struct nfsd_file *nf) +{ + s64 age = ktime_to_ms(ktime_sub(ktime_get(), nf->nf_birthtime)); + + trace_nfsd_file_free(nf); + + this_cpu_inc(nfsd_file_releases); + this_cpu_add(nfsd_file_total_age, age); + + nfsd_file_unhash(nf); + if (nf->nf_mark) + nfsd_file_mark_put(nf->nf_mark); + if (nf->nf_file) { + nfsd_file_check_write_error(nf); + nfsd_filp_close(nf->nf_file); + } + + /* + * If this item is still linked via nf_lru, that's a bug. + * WARN and leak it to preserve system stability. + */ + if (WARN_ON_ONCE(!list_empty(&nf->nf_lru))) + return; + + call_rcu(&nf->nf_rcu, nfsd_file_slab_free); +} + +static bool +nfsd_file_check_writeback(struct nfsd_file *nf) +{ + struct file *file = nf->nf_file; + struct address_space *mapping; + + /* File not open for write? */ + if (!(file->f_mode & FMODE_WRITE)) + return false; + + /* + * Some filesystems (e.g. NFS) flush all dirty data on close. + * On others, there is no need to wait for writeback. + */ + if (!(file_inode(file)->i_sb->s_export_op->flags & EXPORT_OP_FLUSH_ON_CLOSE)) + return false; + + mapping = file->f_mapping; + return mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) || + mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK); +} + +static void nfsd_file_lru_add(struct nfsd_file *nf) +{ + refcount_inc(&nf->nf_ref); + if (list_lru_add_obj(&nfsd_file_lru, &nf->nf_lru)) + trace_nfsd_file_lru_add(nf); + else + WARN_ON(1); + nfsd_file_schedule_laundrette(); +} + +static bool nfsd_file_lru_remove(struct nfsd_file *nf) +{ + if (list_lru_del_obj(&nfsd_file_lru, &nf->nf_lru)) { + trace_nfsd_file_lru_del(nf); + return true; + } + return false; +} + +struct nfsd_file * +nfsd_file_get(struct nfsd_file *nf) +{ + if (nf && refcount_inc_not_zero(&nf->nf_ref)) + return nf; + return NULL; +} + +/** + * nfsd_file_put - put the reference to a nfsd_file + * @nf: nfsd_file of which to put the reference + * + * Put a reference to a nfsd_file. In the non-GC case, we just put the + * reference immediately. In the GC case, if the reference would be + * the last one, the put it on the LRU instead to be cleaned up later. 
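+ *
+ * A short pairing sketch, assuming @nf was returned with a boosted
+ * reference by one of the nfsd_file_acquire variants:
+ *
+ *	... use nf->nf_file ...
+ *	nfsd_file_put(nf);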
+ */ +void +nfsd_file_put(struct nfsd_file *nf) +{ + might_sleep(); + trace_nfsd_file_put(nf); + + if (test_bit(NFSD_FILE_GC, &nf->nf_flags) && + test_bit(NFSD_FILE_HASHED, &nf->nf_flags)) { + set_bit(NFSD_FILE_REFERENCED, &nf->nf_flags); + set_bit(NFSD_FILE_RECENT, &nf->nf_flags); + } + + if (refcount_dec_and_test(&nf->nf_ref)) + nfsd_file_free(nf); +} + +/** + * nfsd_file_put_local - put nfsd_file reference and arm nfsd_net_put in caller + * @pnf: nfsd_file of which to put the reference + * + * First save the associated net to return to caller, then put + * the reference of the nfsd_file. + */ +struct net * +nfsd_file_put_local(struct nfsd_file __rcu **pnf) +{ + struct nfsd_file *nf; + struct net *net = NULL; + + nf = unrcu_pointer(xchg(pnf, NULL)); + if (nf) { + net = nf->nf_net; + nfsd_file_put(nf); + } + return net; +} + +/** + * nfsd_file_file - get the backing file of an nfsd_file + * @nf: nfsd_file of which to access the backing file. + * + * Return backing file for @nf. + */ +struct file * +nfsd_file_file(struct nfsd_file *nf) +{ + return nf->nf_file; +} + +static void +nfsd_file_dispose_list(struct list_head *dispose) +{ + struct nfsd_file *nf; + + while (!list_empty(dispose)) { + nf = list_first_entry(dispose, struct nfsd_file, nf_gc); + list_del_init(&nf->nf_gc); + nfsd_file_free(nf); + } +} + +/** + * nfsd_file_dispose_list_delayed - move list of dead files to net's freeme list + * @dispose: list of nfsd_files to be disposed + * + * Transfers each file to the "freeme" list for its nfsd_net, to eventually + * be disposed of by the per-net garbage collector. + */ +static void +nfsd_file_dispose_list_delayed(struct list_head *dispose) +{ + while(!list_empty(dispose)) { + struct nfsd_file *nf = list_first_entry(dispose, + struct nfsd_file, nf_gc); + struct nfsd_net *nn = net_generic(nf->nf_net, nfsd_net_id); + struct nfsd_fcache_disposal *l = nn->fcache_disposal; + struct svc_serv *serv; + + spin_lock(&l->lock); + list_move_tail(&nf->nf_gc, &l->freeme); + spin_unlock(&l->lock); + + /* + * The filecache laundrette is shut down after the + * nn->nfsd_serv pointer is cleared, but before the + * svc_serv is freed. + */ + serv = nn->nfsd_serv; + if (serv) + svc_wake_up(serv); + } +} + +/** + * nfsd_file_net_dispose - deal with nfsd_files waiting to be disposed. + * @nn: nfsd_net in which to find files to be disposed. + * + * When files held open for nfsv3 are removed from the filecache, whether + * due to memory pressure or garbage collection, they are queued to + * a per-net-ns queue. This function completes the disposal, either + * directly or by waking another nfsd thread to help with the work. + */ +void nfsd_file_net_dispose(struct nfsd_net *nn) +{ + struct nfsd_fcache_disposal *l = nn->fcache_disposal; + + if (!list_empty(&l->freeme)) { + LIST_HEAD(dispose); + int i; + + spin_lock(&l->lock); + for (i = 0; i < 8 && !list_empty(&l->freeme); i++) + list_move(l->freeme.next, &dispose); + spin_unlock(&l->lock); + if (!list_empty(&l->freeme)) + /* Wake up another thread to share the work + * *before* doing any actual disposing. 
+ */ + svc_wake_up(nn->nfsd_serv); + nfsd_file_dispose_list(&dispose); + } +} + +/** + * nfsd_file_lru_cb - Examine an entry on the LRU list + * @item: LRU entry to examine + * @lru: controlling LRU + * @arg: dispose list + * + * Return values: + * %LRU_REMOVED: @item was removed from the LRU + * %LRU_ROTATE: @item is to be moved to the LRU tail + * %LRU_SKIP: @item cannot be evicted + */ +static enum lru_status +nfsd_file_lru_cb(struct list_head *item, struct list_lru_one *lru, + void *arg) +{ + struct list_head *head = arg; + struct nfsd_file *nf = list_entry(item, struct nfsd_file, nf_lru); + + /* We should only be dealing with GC entries here */ + WARN_ON_ONCE(!test_bit(NFSD_FILE_GC, &nf->nf_flags)); + + /* + * Don't throw out files that are still undergoing I/O or + * that have uncleared errors pending. + */ + if (nfsd_file_check_writeback(nf)) { + trace_nfsd_file_gc_writeback(nf); + return LRU_SKIP; + } + + /* If it was recently added to the list, skip it */ + if (test_and_clear_bit(NFSD_FILE_REFERENCED, &nf->nf_flags)) { + trace_nfsd_file_gc_referenced(nf); + return LRU_ROTATE; + } + + /* + * Put the reference held on behalf of the LRU if it is the last + * reference, else rotate. + */ + if (!refcount_dec_if_one(&nf->nf_ref)) { + trace_nfsd_file_gc_in_use(nf); + return LRU_ROTATE; + } + + /* Refcount went to zero. Unhash it and queue it to the dispose list */ + nfsd_file_unhash(nf); + list_lru_isolate(lru, &nf->nf_lru); + list_add(&nf->nf_gc, head); + this_cpu_inc(nfsd_file_evictions); + trace_nfsd_file_gc_disposed(nf); + return LRU_REMOVED; +} + +static enum lru_status +nfsd_file_gc_cb(struct list_head *item, struct list_lru_one *lru, + void *arg) +{ + struct nfsd_file *nf = list_entry(item, struct nfsd_file, nf_lru); + + if (test_and_clear_bit(NFSD_FILE_RECENT, &nf->nf_flags)) { + /* + * "REFERENCED" really means "should be at the end of the + * LRU. As we are putting it there we can clear the flag. + */ + clear_bit(NFSD_FILE_REFERENCED, &nf->nf_flags); + trace_nfsd_file_gc_aged(nf); + return LRU_ROTATE; + } + return nfsd_file_lru_cb(item, lru, arg); +} + +/* If the shrinker runs between calls to list_lru_walk_node() in + * nfsd_file_gc(), the "remaining" count will be wrong. This could + * result in premature freeing of some files. This may not matter much + * but is easy to fix with this spinlock which temporarily disables + * the shrinker. 
+ */ +static DEFINE_SPINLOCK(nfsd_gc_lock); +static void +nfsd_file_gc(void) +{ + unsigned long ret = 0; + LIST_HEAD(dispose); + int nid; + + spin_lock(&nfsd_gc_lock); + for_each_node_state(nid, N_NORMAL_MEMORY) { + unsigned long remaining = list_lru_count_node(&nfsd_file_lru, nid); + + while (remaining > 0) { + unsigned long nr = min(remaining, NFSD_FILE_GC_BATCH); + + remaining -= nr; + ret += list_lru_walk_node(&nfsd_file_lru, nid, nfsd_file_gc_cb, + &dispose, &nr); + if (nr) + /* walk aborted early */ + remaining = 0; + } + } + spin_unlock(&nfsd_gc_lock); + trace_nfsd_file_gc_removed(ret, list_lru_count(&nfsd_file_lru)); + nfsd_file_dispose_list_delayed(&dispose); +} + +static void +nfsd_file_gc_worker(struct work_struct *work) +{ + if (list_lru_count(&nfsd_file_lru)) + nfsd_file_gc(); + nfsd_file_schedule_laundrette(); +} + +static unsigned long +nfsd_file_lru_count(struct shrinker *s, struct shrink_control *sc) +{ + return list_lru_count(&nfsd_file_lru); +} + +static unsigned long +nfsd_file_lru_scan(struct shrinker *s, struct shrink_control *sc) +{ + LIST_HEAD(dispose); + unsigned long ret; + + if (!spin_trylock(&nfsd_gc_lock)) + return SHRINK_STOP; + + ret = list_lru_shrink_walk(&nfsd_file_lru, sc, + nfsd_file_lru_cb, &dispose); + spin_unlock(&nfsd_gc_lock); + trace_nfsd_file_shrinker_removed(ret, list_lru_count(&nfsd_file_lru)); + nfsd_file_dispose_list_delayed(&dispose); + return ret; +} + +static struct shrinker *nfsd_file_shrinker; + +/** + * nfsd_file_cond_queue - conditionally unhash and queue a nfsd_file + * @nf: nfsd_file to attempt to queue + * @dispose: private list to queue successfully-put objects + * + * Unhash an nfsd_file, try to get a reference to it, and then put that + * reference. If it's the last reference, queue it to the dispose list. + */ +static void +nfsd_file_cond_queue(struct nfsd_file *nf, struct list_head *dispose) + __must_hold(RCU) +{ + int decrement = 1; + + /* If we raced with someone else unhashing, ignore it */ + if (!nfsd_file_unhash(nf)) + return; + + /* If we can't get a reference, ignore it */ + if (!nfsd_file_get(nf)) + return; + + /* Extra decrement if we remove from the LRU */ + if (nfsd_file_lru_remove(nf)) + ++decrement; + + /* If refcount goes to 0, then put on the dispose list */ + if (refcount_sub_and_test(decrement, &nf->nf_ref)) { + list_add(&nf->nf_gc, dispose); + trace_nfsd_file_closing(nf); + } +} + +/** + * nfsd_file_queue_for_close: try to close out any open nfsd_files for an inode + * @inode: inode on which to close out nfsd_files + * @dispose: list on which to gather nfsd_files to close out + * + * An nfsd_file represents a struct file being held open on behalf of nfsd. + * An open file however can block other activity (such as leases), or cause + * undesirable behavior (e.g. spurious silly-renames when reexporting NFS). + * + * This function is intended to find open nfsd_files when this sort of + * conflicting access occurs and then attempt to close those files out. + * + * Populates the dispose list with entries that have already had their + * refcounts go to zero. The actual free of an nfsd_file can be expensive, + * so we leave it up to the caller whether it wants to wait or not. 
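+ *
+ * A usage sketch for the synchronous flavour (this is essentially what
+ * nfsd_file_close_inode_sync() below does):
+ *
+ *	LIST_HEAD(dispose);
+ *
+ *	nfsd_file_queue_for_close(inode, &dispose);
+ *	nfsd_file_dispose_list(&dispose);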
+ */ +static void +nfsd_file_queue_for_close(struct inode *inode, struct list_head *dispose) +{ + struct rhlist_head *tmp, *list; + struct nfsd_file *nf; + + rcu_read_lock(); + list = rhltable_lookup(&nfsd_file_rhltable, &inode, + nfsd_file_rhash_params); + rhl_for_each_entry_rcu(nf, tmp, list, nf_rlist) { + if (!test_bit(NFSD_FILE_GC, &nf->nf_flags)) + continue; + nfsd_file_cond_queue(nf, dispose); + } + rcu_read_unlock(); +} + +/** + * nfsd_file_close_inode - attempt a delayed close of a nfsd_file + * @inode: inode of the file to attempt to remove + * + * Close out any open nfsd_files that can be reaped for @inode. The + * actual freeing is deferred to the dispose_list_delayed infrastructure. + * + * This is used by the fsnotify callbacks and setlease notifier. + */ +static void +nfsd_file_close_inode(struct inode *inode) +{ + LIST_HEAD(dispose); + + nfsd_file_queue_for_close(inode, &dispose); + nfsd_file_dispose_list_delayed(&dispose); +} + +/** + * nfsd_file_close_inode_sync - attempt to forcibly close a nfsd_file + * @inode: inode of the file to attempt to remove + * + * Close out any open nfsd_files that can be reaped for @inode. The + * nfsd_files are closed out synchronously. + * + * This is called from nfsd_rename and nfsd_unlink to avoid silly-renames + * when reexporting NFS. + */ +void +nfsd_file_close_inode_sync(struct inode *inode) +{ + LIST_HEAD(dispose); + + trace_nfsd_file_close(inode); + + nfsd_file_queue_for_close(inode, &dispose); + nfsd_file_dispose_list(&dispose); +} + +static int +nfsd_file_lease_notifier_call(struct notifier_block *nb, unsigned long arg, + void *data) +{ + struct file_lease *fl = data; + + /* Only close files for F_SETLEASE leases */ + if (fl->c.flc_flags & FL_LEASE) + nfsd_file_close_inode(file_inode(fl->c.flc_file)); + return 0; +} + +static struct notifier_block nfsd_file_lease_notifier = { + .notifier_call = nfsd_file_lease_notifier_call, +}; + +static int +nfsd_file_fsnotify_handle_event(struct fsnotify_mark *mark, u32 mask, + struct inode *inode, struct inode *dir, + const struct qstr *name, u32 cookie) +{ + if (WARN_ON_ONCE(!inode)) + return 0; + + trace_nfsd_file_fsnotify_handle_event(inode, mask); + + /* Should be no marks on non-regular files */ + if (!S_ISREG(inode->i_mode)) { + WARN_ON_ONCE(1); + return 0; + } + + /* don't close files if this was not the last link */ + if (mask & FS_ATTRIB) { + if (inode->i_nlink) + return 0; + } + + nfsd_file_close_inode(inode); + return 0; +} + + +static const struct fsnotify_ops nfsd_file_fsnotify_ops = { + .handle_inode_event = nfsd_file_fsnotify_handle_event, + .free_mark = nfsd_file_mark_free, +}; + +int +nfsd_file_cache_init(void) +{ + int ret; + + lockdep_assert_held(&nfsd_mutex); + if (test_and_set_bit(NFSD_FILE_CACHE_UP, &nfsd_file_flags) == 1) + return 0; + + ret = rhltable_init(&nfsd_file_rhltable, &nfsd_file_rhash_params); + if (ret) + goto out; + + ret = -ENOMEM; + nfsd_file_slab = KMEM_CACHE(nfsd_file, 0); + if (!nfsd_file_slab) { + pr_err("nfsd: unable to create nfsd_file_slab\n"); + goto out_err; + } + + nfsd_file_mark_slab = KMEM_CACHE(nfsd_file_mark, 0); + if (!nfsd_file_mark_slab) { + pr_err("nfsd: unable to create nfsd_file_mark_slab\n"); + goto out_err; + } + + ret = list_lru_init(&nfsd_file_lru); + if (ret) { + pr_err("nfsd: failed to init nfsd_file_lru: %d\n", ret); + goto out_err; + } + + nfsd_file_shrinker = shrinker_alloc(0, "nfsd-filecache"); + if (!nfsd_file_shrinker) { + ret = -ENOMEM; + pr_err("nfsd: failed to allocate nfsd_file_shrinker\n"); + goto out_lru; + } + + 
nfsd_file_shrinker->count_objects = nfsd_file_lru_count; + nfsd_file_shrinker->scan_objects = nfsd_file_lru_scan; + nfsd_file_shrinker->seeks = 1; + + shrinker_register(nfsd_file_shrinker); + + ret = lease_register_notifier(&nfsd_file_lease_notifier); + if (ret) { + pr_err("nfsd: unable to register lease notifier: %d\n", ret); + goto out_shrinker; + } + + nfsd_file_fsnotify_group = fsnotify_alloc_group(&nfsd_file_fsnotify_ops, + 0); + if (IS_ERR(nfsd_file_fsnotify_group)) { + pr_err("nfsd: unable to create fsnotify group: %ld\n", + PTR_ERR(nfsd_file_fsnotify_group)); + ret = PTR_ERR(nfsd_file_fsnotify_group); + nfsd_file_fsnotify_group = NULL; + goto out_notifier; + } + + INIT_DELAYED_WORK(&nfsd_filecache_laundrette, nfsd_file_gc_worker); +out: + if (ret) + clear_bit(NFSD_FILE_CACHE_UP, &nfsd_file_flags); + return ret; +out_notifier: + lease_unregister_notifier(&nfsd_file_lease_notifier); +out_shrinker: + shrinker_free(nfsd_file_shrinker); +out_lru: + list_lru_destroy(&nfsd_file_lru); +out_err: + kmem_cache_destroy(nfsd_file_slab); + nfsd_file_slab = NULL; + kmem_cache_destroy(nfsd_file_mark_slab); + nfsd_file_mark_slab = NULL; + rhltable_destroy(&nfsd_file_rhltable); + goto out; +} + +/** + * __nfsd_file_cache_purge: clean out the cache for shutdown + * @net: net-namespace to shut down the cache (may be NULL) + * + * Walk the nfsd_file cache and close out any that match @net. If @net is NULL, + * then close out everything. Called when an nfsd instance is being shut down, + * and when the exports table is flushed. + */ +static void +__nfsd_file_cache_purge(struct net *net) +{ + struct rhashtable_iter iter; + struct nfsd_file *nf; + LIST_HEAD(dispose); + +#if IS_ENABLED(CONFIG_NFS_LOCALIO) + if (net) { + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + nfs_localio_invalidate_clients(&nn->local_clients, + &nn->local_clients_lock); + } +#endif + + rhltable_walk_enter(&nfsd_file_rhltable, &iter); + do { + rhashtable_walk_start(&iter); + + nf = rhashtable_walk_next(&iter); + while (!IS_ERR_OR_NULL(nf)) { + if (!net || nf->nf_net == net) + nfsd_file_cond_queue(nf, &dispose); + nf = rhashtable_walk_next(&iter); + } + + rhashtable_walk_stop(&iter); + } while (nf == ERR_PTR(-EAGAIN)); + rhashtable_walk_exit(&iter); + + nfsd_file_dispose_list(&dispose); +} + +static struct nfsd_fcache_disposal * +nfsd_alloc_fcache_disposal(void) +{ + struct nfsd_fcache_disposal *l; + + l = kmalloc(sizeof(*l), GFP_KERNEL); + if (!l) + return NULL; + spin_lock_init(&l->lock); + INIT_LIST_HEAD(&l->freeme); + return l; +} + +static void +nfsd_free_fcache_disposal(struct nfsd_fcache_disposal *l) +{ + nfsd_file_dispose_list(&l->freeme); + kfree(l); +} + +static void +nfsd_free_fcache_disposal_net(struct net *net) +{ + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + struct nfsd_fcache_disposal *l = nn->fcache_disposal; + + nfsd_free_fcache_disposal(l); +} + +int +nfsd_file_cache_start_net(struct net *net) +{ + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + + nn->fcache_disposal = nfsd_alloc_fcache_disposal(); + return nn->fcache_disposal ? 
0 : -ENOMEM; +} + +/** + * nfsd_file_cache_purge - Remove all cache items associated with @net + * @net: target net namespace + * + */ +void +nfsd_file_cache_purge(struct net *net) +{ + lockdep_assert_held(&nfsd_mutex); + if (test_bit(NFSD_FILE_CACHE_UP, &nfsd_file_flags) == 1) + __nfsd_file_cache_purge(net); +} + +void +nfsd_file_cache_shutdown_net(struct net *net) +{ + nfsd_file_cache_purge(net); + nfsd_free_fcache_disposal_net(net); +} + +void +nfsd_file_cache_shutdown(void) +{ + int i; + + lockdep_assert_held(&nfsd_mutex); + if (test_and_clear_bit(NFSD_FILE_CACHE_UP, &nfsd_file_flags) == 0) + return; + + lease_unregister_notifier(&nfsd_file_lease_notifier); + shrinker_free(nfsd_file_shrinker); + /* + * make sure all callers of nfsd_file_lru_cb are done before + * calling nfsd_file_cache_purge + */ + cancel_delayed_work_sync(&nfsd_filecache_laundrette); + __nfsd_file_cache_purge(NULL); + list_lru_destroy(&nfsd_file_lru); + rcu_barrier(); + fsnotify_put_group(nfsd_file_fsnotify_group); + nfsd_file_fsnotify_group = NULL; + kmem_cache_destroy(nfsd_file_slab); + nfsd_file_slab = NULL; + fsnotify_wait_marks_destroyed(); + kmem_cache_destroy(nfsd_file_mark_slab); + nfsd_file_mark_slab = NULL; + rhltable_destroy(&nfsd_file_rhltable); + + for_each_possible_cpu(i) { + per_cpu(nfsd_file_cache_hits, i) = 0; + per_cpu(nfsd_file_acquisitions, i) = 0; + per_cpu(nfsd_file_allocations, i) = 0; + per_cpu(nfsd_file_releases, i) = 0; + per_cpu(nfsd_file_total_age, i) = 0; + per_cpu(nfsd_file_evictions, i) = 0; + } +} + +static struct nfsd_file * +nfsd_file_lookup_locked(const struct net *net, const struct cred *cred, + struct inode *inode, unsigned char need, + bool want_gc) +{ + struct rhlist_head *tmp, *list; + struct nfsd_file *nf; + + list = rhltable_lookup(&nfsd_file_rhltable, &inode, + nfsd_file_rhash_params); + rhl_for_each_entry_rcu(nf, tmp, list, nf_rlist) { + if (nf->nf_may != need) + continue; + if (nf->nf_net != net) + continue; + if (!nfsd_match_cred(nf->nf_cred, cred)) + continue; + if (test_bit(NFSD_FILE_GC, &nf->nf_flags) != want_gc) + continue; + if (test_bit(NFSD_FILE_HASHED, &nf->nf_flags) == 0) + continue; + + if (!nfsd_file_get(nf)) + continue; + return nf; + } + return NULL; +} + +/** + * nfsd_file_is_cached - are there any cached open files for this inode? + * @inode: inode to check + * + * The lookup matches inodes in all net namespaces and is atomic wrt + * nfsd_file_acquire(). 
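+ *
+ * A minimal caller sketch (hypothetical; only GC'ed entries are
+ * reported):
+ *
+ *	if (nfsd_file_is_cached(inode))
+ *		... an nfsd-held open of this inode already exists ...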
+ * + * Return values: + * %true: filecache contains at least one file matching this inode + * %false: filecache contains no files matching this inode + */ +bool +nfsd_file_is_cached(struct inode *inode) +{ + struct rhlist_head *tmp, *list; + struct nfsd_file *nf; + bool ret = false; + + rcu_read_lock(); + list = rhltable_lookup(&nfsd_file_rhltable, &inode, + nfsd_file_rhash_params); + rhl_for_each_entry_rcu(nf, tmp, list, nf_rlist) + if (test_bit(NFSD_FILE_GC, &nf->nf_flags)) { + ret = true; + break; + } + rcu_read_unlock(); + + trace_nfsd_file_is_cached(inode, (int)ret); + return ret; +} + +static __be32 +nfsd_file_get_dio_attrs(const struct svc_fh *fhp, struct nfsd_file *nf) +{ + struct inode *inode = file_inode(nf->nf_file); + struct kstat stat; + __be32 status; + + /* Currently only need to get DIO alignment info for regular files */ + if (!S_ISREG(inode->i_mode)) + return nfs_ok; + + status = fh_getattr(fhp, &stat); + if (status != nfs_ok) + return status; + + trace_nfsd_file_get_dio_attrs(inode, &stat); + + if (stat.result_mask & STATX_DIOALIGN) { + nf->nf_dio_mem_align = stat.dio_mem_align; + nf->nf_dio_offset_align = stat.dio_offset_align; + } + if (stat.result_mask & STATX_DIO_READ_ALIGN) + nf->nf_dio_read_offset_align = stat.dio_read_offset_align; + else + nf->nf_dio_read_offset_align = nf->nf_dio_offset_align; + + return nfs_ok; +} + +static __be32 +nfsd_file_do_acquire(struct svc_rqst *rqstp, struct net *net, + struct svc_cred *cred, + struct auth_domain *client, + struct svc_fh *fhp, + unsigned int may_flags, struct file *file, + umode_t type, bool want_gc, struct nfsd_file **pnf) +{ + unsigned char need = may_flags & NFSD_FILE_MAY_MASK; + struct nfsd_file *new, *nf; + bool stale_retry = true; + bool open_retry = true; + struct inode *inode; + __be32 status; + int ret; + +retry: + if (rqstp) + status = fh_verify(rqstp, fhp, type, + may_flags|NFSD_MAY_OWNER_OVERRIDE); + else + status = fh_verify_local(net, cred, client, fhp, type, + may_flags|NFSD_MAY_OWNER_OVERRIDE); + + if (status != nfs_ok) + return status; + inode = d_inode(fhp->fh_dentry); + + rcu_read_lock(); + nf = nfsd_file_lookup_locked(net, current_cred(), inode, need, want_gc); + rcu_read_unlock(); + + if (nf) + goto wait_for_construction; + + new = nfsd_file_alloc(net, inode, need, want_gc); + if (!new) { + status = nfserr_jukebox; + goto out; + } + + rcu_read_lock(); + spin_lock(&inode->i_lock); + nf = nfsd_file_lookup_locked(net, current_cred(), inode, need, want_gc); + if (unlikely(nf)) { + spin_unlock(&inode->i_lock); + rcu_read_unlock(); + nfsd_file_free(new); + goto wait_for_construction; + } + nf = new; + ret = rhltable_insert(&nfsd_file_rhltable, &nf->nf_rlist, + nfsd_file_rhash_params); + spin_unlock(&inode->i_lock); + rcu_read_unlock(); + if (likely(ret == 0)) + goto open_file; + + trace_nfsd_file_insert_err(rqstp, inode, may_flags, ret); + status = nfserr_jukebox; + goto construction_err; + +wait_for_construction: + wait_on_bit(&nf->nf_flags, NFSD_FILE_PENDING, TASK_UNINTERRUPTIBLE); + + /* Did construction of this file fail? 
*/ + if (!test_bit(NFSD_FILE_HASHED, &nf->nf_flags)) { + trace_nfsd_file_cons_err(rqstp, inode, may_flags, nf); + if (!open_retry) { + status = nfserr_jukebox; + goto construction_err; + } + nfsd_file_put(nf); + open_retry = false; + fh_put(fhp); + goto retry; + } + this_cpu_inc(nfsd_file_cache_hits); + + status = nfserrno(nfsd_open_break_lease(file_inode(nf->nf_file), may_flags)); + if (status != nfs_ok) { + nfsd_file_put(nf); + nf = NULL; + } + +out: + if (status == nfs_ok) { + this_cpu_inc(nfsd_file_acquisitions); + nfsd_file_check_write_error(nf); + *pnf = nf; + } + trace_nfsd_file_acquire(rqstp, inode, may_flags, nf, status); + return status; + +open_file: + trace_nfsd_file_alloc(nf); + + if (type == S_IFREG) + nf->nf_mark = nfsd_file_mark_find_or_create(inode); + + if (type != S_IFREG || nf->nf_mark) { + if (file) { + get_file(file); + nf->nf_file = file; + status = nfs_ok; + trace_nfsd_file_opened(nf, status); + } else { + ret = nfsd_open_verified(fhp, type, may_flags, &nf->nf_file); + if (ret == -EOPENSTALE && stale_retry) { + stale_retry = false; + nfsd_file_unhash(nf); + clear_and_wake_up_bit(NFSD_FILE_PENDING, + &nf->nf_flags); + if (refcount_dec_and_test(&nf->nf_ref)) + nfsd_file_free(nf); + nf = NULL; + fh_put(fhp); + goto retry; + } + status = nfserrno(ret); + trace_nfsd_file_open(nf, status); + if (status == nfs_ok) + status = nfsd_file_get_dio_attrs(fhp, nf); + } + } else + status = nfserr_jukebox; + /* + * If construction failed, or we raced with a call to unlink() + * then unhash. + */ + if (status != nfs_ok || inode->i_nlink == 0) + nfsd_file_unhash(nf); + else if (want_gc) + nfsd_file_lru_add(nf); + + clear_and_wake_up_bit(NFSD_FILE_PENDING, &nf->nf_flags); + if (status == nfs_ok) + goto out; + +construction_err: + if (refcount_dec_and_test(&nf->nf_ref)) + nfsd_file_free(nf); + nf = NULL; + goto out; +} + +/** + * nfsd_file_acquire_gc - Get a struct nfsd_file with an open file + * @rqstp: the RPC transaction being executed + * @fhp: the NFS filehandle of the file to be opened + * @may_flags: NFSD_MAY_ settings for the file + * @pnf: OUT: new or found "struct nfsd_file" object + * + * The nfsd_file object returned by this API is reference-counted + * and garbage-collected. The object is retained for a few + * seconds after the final nfsd_file_put() in case the caller + * wants to re-use it. + * + * Return values: + * %nfs_ok - @pnf points to an nfsd_file with its reference + * count boosted. + * + * On error, an nfsstat value in network byte order is returned. + */ +__be32 +nfsd_file_acquire_gc(struct svc_rqst *rqstp, struct svc_fh *fhp, + unsigned int may_flags, struct nfsd_file **pnf) +{ + return nfsd_file_do_acquire(rqstp, SVC_NET(rqstp), NULL, NULL, + fhp, may_flags, NULL, S_IFREG, true, pnf); +} + +/** + * nfsd_file_acquire - Get a struct nfsd_file with an open file + * @rqstp: the RPC transaction being executed + * @fhp: the NFS filehandle of the file to be opened + * @may_flags: NFSD_MAY_ settings for the file + * @pnf: OUT: new or found "struct nfsd_file" object + * + * The nfsd_file_object returned by this API is reference-counted + * but not garbage-collected. The object is unhashed after the + * final nfsd_file_put(). + * + * Return values: + * %nfs_ok - @pnf points to an nfsd_file with its reference + * count boosted. + * + * On error, an nfsstat value in network byte order is returned. 
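+ *
+ * A brief sketch of the non-garbage-collected pattern, assuming a caller
+ * that pins the open state for exactly as long as it needs it (variable
+ * names are illustrative):
+ *
+ *	status = nfsd_file_acquire(rqstp, fhp, NFSD_MAY_WRITE, &nf);
+ *	if (status != nfs_ok)
+ *		return status;
+ *	... use nf->nf_file ...
+ *	nfsd_file_put(nf);
+ *
+ * Unlike the _gc variant, the final put here closes the file right away.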
+ */ +__be32 +nfsd_file_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp, + unsigned int may_flags, struct nfsd_file **pnf) +{ + return nfsd_file_do_acquire(rqstp, SVC_NET(rqstp), NULL, NULL, + fhp, may_flags, NULL, S_IFREG, false, pnf); +} + +/** + * nfsd_file_acquire_local - Get a struct nfsd_file with an open file for localio + * @net: The network namespace in which to perform a lookup + * @cred: the user credential with which to validate access + * @client: the auth_domain for LOCALIO lookup + * @fhp: the NFS filehandle of the file to be opened + * @may_flags: NFSD_MAY_ settings for the file + * @pnf: OUT: new or found "struct nfsd_file" object + * + * This file lookup interface provide access to a file given the + * filehandle and credential. No connection-based authorisation + * is performed and in that way it is quite different to other + * file access mediated by nfsd. It allows a kernel module such as the NFS + * client to reach across network and filesystem namespaces to access + * a file. The security implications of this should be carefully + * considered before use. + * + * The nfsd_file_object returned by this API is reference-counted + * but not garbage-collected. The object is unhashed after the + * final nfsd_file_put(). + * + * Return values: + * %nfs_ok - @pnf points to an nfsd_file with its reference + * count boosted. + * + * On error, an nfsstat value in network byte order is returned. + */ +__be32 +nfsd_file_acquire_local(struct net *net, struct svc_cred *cred, + struct auth_domain *client, struct svc_fh *fhp, + unsigned int may_flags, struct nfsd_file **pnf) +{ + /* + * Save creds before calling nfsd_file_do_acquire() (which calls + * nfsd_setuser). Important because caller (LOCALIO) is from + * client context. + */ + const struct cred *save_cred = get_current_cred(); + __be32 beres; + + beres = nfsd_file_do_acquire(NULL, net, cred, client, fhp, may_flags, + NULL, S_IFREG, false, pnf); + put_cred(revert_creds(save_cred)); + return beres; +} + +/** + * nfsd_file_acquire_opened - Get a struct nfsd_file using existing open file + * @rqstp: the RPC transaction being executed + * @fhp: the NFS filehandle of the file just created + * @may_flags: NFSD_MAY_ settings for the file + * @file: cached, already-open file (may be NULL) + * @pnf: OUT: new or found "struct nfsd_file" object + * + * Acquire a nfsd_file object that is not GC'ed. If one doesn't already exist, + * and @file is non-NULL, use it to instantiate a new nfsd_file instead of + * opening a new one. + * + * Return values: + * %nfs_ok - @pnf points to an nfsd_file with its reference + * count boosted. + * + * On error, an nfsstat value in network byte order is returned. + */ +__be32 +nfsd_file_acquire_opened(struct svc_rqst *rqstp, struct svc_fh *fhp, + unsigned int may_flags, struct file *file, + struct nfsd_file **pnf) +{ + return nfsd_file_do_acquire(rqstp, SVC_NET(rqstp), NULL, NULL, + fhp, may_flags, file, S_IFREG, false, pnf); +} + +/** + * nfsd_file_acquire_dir - Get a struct nfsd_file with an open directory + * @rqstp: the RPC transaction being executed + * @fhp: the NFS filehandle of the file to be opened + * @pnf: OUT: new or found "struct nfsd_file" object + * + * The nfsd_file_object returned by this API is reference-counted + * but not garbage-collected. The object is unhashed after the + * final nfsd_file_put(). This opens directories only, and only + * in O_RDONLY mode. + * + * Return values: + * %nfs_ok - @pnf points to an nfsd_file with its reference + * count boosted. 
+ * + * On error, an nfsstat value in network byte order is returned. + */ +__be32 +nfsd_file_acquire_dir(struct svc_rqst *rqstp, struct svc_fh *fhp, + struct nfsd_file **pnf) +{ + return nfsd_file_do_acquire(rqstp, SVC_NET(rqstp), NULL, NULL, fhp, + NFSD_MAY_READ|NFSD_MAY_64BIT_COOKIE, + NULL, S_IFDIR, false, pnf); +} + +/* + * Note that fields may be added, removed or reordered in the future. Programs + * scraping this file for info should test the labels to ensure they're + * getting the correct field. + */ +int nfsd_file_cache_stats_show(struct seq_file *m, void *v) +{ + unsigned long allocations = 0, releases = 0, evictions = 0; + unsigned long hits = 0, acquisitions = 0; + unsigned int i, count = 0, buckets = 0; + unsigned long lru = 0, total_age = 0; + + /* Serialize with server shutdown */ + mutex_lock(&nfsd_mutex); + if (test_bit(NFSD_FILE_CACHE_UP, &nfsd_file_flags) == 1) { + struct bucket_table *tbl; + struct rhashtable *ht; + + lru = list_lru_count(&nfsd_file_lru); + + rcu_read_lock(); + ht = &nfsd_file_rhltable.ht; + count = atomic_read(&ht->nelems); + tbl = rht_dereference_rcu(ht->tbl, ht); + buckets = tbl->size; + rcu_read_unlock(); + } + mutex_unlock(&nfsd_mutex); + + for_each_possible_cpu(i) { + hits += per_cpu(nfsd_file_cache_hits, i); + acquisitions += per_cpu(nfsd_file_acquisitions, i); + allocations += per_cpu(nfsd_file_allocations, i); + releases += per_cpu(nfsd_file_releases, i); + total_age += per_cpu(nfsd_file_total_age, i); + evictions += per_cpu(nfsd_file_evictions, i); + } + + seq_printf(m, "total inodes: %u\n", count); + seq_printf(m, "hash buckets: %u\n", buckets); + seq_printf(m, "lru entries: %lu\n", lru); + seq_printf(m, "cache hits: %lu\n", hits); + seq_printf(m, "acquisitions: %lu\n", acquisitions); + seq_printf(m, "allocations: %lu\n", allocations); + seq_printf(m, "releases: %lu\n", releases); + seq_printf(m, "evictions: %lu\n", evictions); + if (releases) + seq_printf(m, "mean age (ms): %ld\n", total_age / releases); + else + seq_printf(m, "mean age (ms): -\n"); + return 0; +} diff --git a/fs/nfsd/filecache.h b/fs/nfsd/filecache.h new file mode 100644 index 000000000000..b383dbc5b921 --- /dev/null +++ b/fs/nfsd/filecache.h @@ -0,0 +1,88 @@ +#ifndef _FS_NFSD_FILECACHE_H +#define _FS_NFSD_FILECACHE_H + +#include <linux/fsnotify_backend.h> + +/* + * Limit the time that the list_lru_one lock is held during + * an LRU scan. + */ +#define NFSD_FILE_GC_BATCH (16UL) + +/* + * This is the fsnotify_mark container that nfsd attaches to the files that it + * is holding open. Note that we have a separate refcount here aside from the + * one in the fsnotify_mark. We only want a single fsnotify_mark attached to + * the inode, and for each nfsd_file to hold a reference to it. + * + * The fsnotify_mark is itself refcounted, but that's not sufficient to tell us + * how to put that reference. If there are still outstanding nfsd_files that + * reference the mark, then we would want to call fsnotify_put_mark on it. + * If there were not, then we'd need to call fsnotify_destroy_mark. Since we + * can't really tell the difference, we use the nfm_mark to keep track of how + * many nfsd_files hold references to the mark. When that counter goes to zero + * then we know to call fsnotify_destroy_mark on it. + */ +struct nfsd_file_mark { + struct fsnotify_mark nfm_mark; + refcount_t nfm_ref; +}; + +/* + * A representation of a file that has been opened by knfsd. These are hashed + * in the hashtable by inode pointer value. 
Note that this object doesn't + * hold a reference to the inode by itself, so the nf_inode pointer should + * never be dereferenced, only used for comparison. + */ +struct nfsd_file { + struct rhlist_head nf_rlist; + void *nf_inode; + struct file *nf_file; + const struct cred *nf_cred; + struct net *nf_net; +#define NFSD_FILE_HASHED (0) +#define NFSD_FILE_PENDING (1) +#define NFSD_FILE_REFERENCED (2) +#define NFSD_FILE_GC (3) +#define NFSD_FILE_RECENT (4) + unsigned long nf_flags; + refcount_t nf_ref; + unsigned char nf_may; + + struct nfsd_file_mark *nf_mark; + struct list_head nf_lru; + struct list_head nf_gc; + struct rcu_head nf_rcu; + ktime_t nf_birthtime; + + u32 nf_dio_mem_align; + u32 nf_dio_offset_align; + u32 nf_dio_read_offset_align; +}; + +int nfsd_file_cache_init(void); +void nfsd_file_cache_purge(struct net *); +void nfsd_file_cache_shutdown(void); +int nfsd_file_cache_start_net(struct net *net); +void nfsd_file_cache_shutdown_net(struct net *net); +void nfsd_file_put(struct nfsd_file *nf); +struct net *nfsd_file_put_local(struct nfsd_file __rcu **nf); +struct nfsd_file *nfsd_file_get(struct nfsd_file *nf); +struct file *nfsd_file_file(struct nfsd_file *nf); +void nfsd_file_close_inode_sync(struct inode *inode); +void nfsd_file_net_dispose(struct nfsd_net *nn); +bool nfsd_file_is_cached(struct inode *inode); +__be32 nfsd_file_acquire_gc(struct svc_rqst *rqstp, struct svc_fh *fhp, + unsigned int may_flags, struct nfsd_file **nfp); +__be32 nfsd_file_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp, + unsigned int may_flags, struct nfsd_file **nfp); +__be32 nfsd_file_acquire_opened(struct svc_rqst *rqstp, struct svc_fh *fhp, + unsigned int may_flags, struct file *file, + struct nfsd_file **nfp); +__be32 nfsd_file_acquire_local(struct net *net, struct svc_cred *cred, + struct auth_domain *client, struct svc_fh *fhp, + unsigned int may_flags, struct nfsd_file **pnf); +__be32 nfsd_file_acquire_dir(struct svc_rqst *rqstp, struct svc_fh *fhp, + struct nfsd_file **pnf); +int nfsd_file_cache_stats_show(struct seq_file *m, void *v); +#endif /* _FS_NFSD_FILECACHE_H */ diff --git a/fs/nfsd/flexfilelayout.c b/fs/nfsd/flexfilelayout.c index db7ef07ae50c..0f1a35400cd5 100644 --- a/fs/nfsd/flexfilelayout.c +++ b/fs/nfsd/flexfilelayout.c @@ -15,12 +15,13 @@ #include "flexfilelayoutxdr.h" #include "pnfs.h" +#include "vfs.h" #define NFSDDBG_FACILITY NFSDDBG_PNFS static __be32 -nfsd4_ff_proc_layoutget(struct inode *inode, const struct svc_fh *fhp, - struct nfsd4_layoutget *args) +nfsd4_ff_proc_layoutget(struct svc_rqst *rqstp, struct inode *inode, + const struct svc_fh *fhp, struct nfsd4_layoutget *args) { struct nfsd4_layout_seg *seg = &args->lg_seg; u32 device_generation = 0; @@ -61,7 +62,7 @@ nfsd4_ff_proc_layoutget(struct inode *inode, const struct svc_fh *fhp, goto out_error; fl->fh.size = fhp->fh_handle.fh_size; - memcpy(fl->fh.data, &fhp->fh_handle.fh_base, fl->fh.size); + memcpy(fl->fh.data, &fhp->fh_handle.fh_raw, fl->fh.size); /* Give whole file layout segments */ seg->offset = 0; @@ -117,13 +118,20 @@ nfsd4_ff_proc_getdeviceinfo(struct super_block *sb, struct svc_rqst *rqstp, da->netaddr.addr_len = snprintf(da->netaddr.addr, FF_ADDR_LEN + 1, - "%s.%hhu.%hhu", addr, port >> 8, port & 0xff); + "%s.%d.%d", addr, port >> 8, port & 0xff); da->tightly_coupled = false; return 0; } +static __be32 +nfsd4_ff_proc_layoutcommit(struct inode *inode, struct svc_rqst *rqstp, + struct nfsd4_layoutcommit *lcp) +{ + return nfs_ok; +} + const struct nfsd4_layout_ops ff_layout_ops = { .notify_types = 
NOTIFY_DEVICEID4_DELETE | NOTIFY_DEVICEID4_CHANGE, @@ -132,4 +140,5 @@ const struct nfsd4_layout_ops ff_layout_ops = { .encode_getdeviceinfo = nfsd4_ff_encode_getdeviceinfo, .proc_layoutget = nfsd4_ff_proc_layoutget, .encode_layoutget = nfsd4_ff_encode_layoutget, + .proc_layoutcommit = nfsd4_ff_proc_layoutcommit, }; diff --git a/fs/nfsd/flexfilelayoutxdr.c b/fs/nfsd/flexfilelayoutxdr.c index e81d2a5cf381..f9f7e38cba13 100644 --- a/fs/nfsd/flexfilelayoutxdr.c +++ b/fs/nfsd/flexfilelayoutxdr.c @@ -17,9 +17,9 @@ struct ff_idmap { __be32 nfsd4_ff_encode_layoutget(struct xdr_stream *xdr, - struct nfsd4_layoutget *lgp) + const struct nfsd4_layoutget *lgp) { - struct pnfs_ff_layout *fl = lgp->lg_content; + const struct pnfs_ff_layout *fl = lgp->lg_content; int len, mirror_len, ds_len, fh_len; __be32 *p; @@ -54,8 +54,7 @@ nfsd4_ff_encode_layoutget(struct xdr_stream *xdr, *p++ = cpu_to_be32(1); /* single mirror */ *p++ = cpu_to_be32(1); /* single data server */ - p = xdr_encode_opaque_fixed(p, &fl->deviceid, - sizeof(struct nfsd4_deviceid)); + p = svcxdr_encode_deviceid4(p, &fl->deviceid); *p++ = cpu_to_be32(1); /* efficiency */ @@ -77,7 +76,7 @@ nfsd4_ff_encode_layoutget(struct xdr_stream *xdr, __be32 nfsd4_ff_encode_getdeviceinfo(struct xdr_stream *xdr, - struct nfsd4_getdeviceinfo *gdp) + const struct nfsd4_getdeviceinfo *gdp) { struct pnfs_ff_device_addr *da = gdp->gd_device; int len; @@ -85,6 +84,15 @@ nfsd4_ff_encode_getdeviceinfo(struct xdr_stream *xdr, int addr_len; __be32 *p; + /* + * See paragraph 5 of RFC 8881 S18.40.3. + */ + if (!gdp->gd_maxcount) { + if (xdr_stream_encode_u32(xdr, 0) != XDR_UNIT) + return nfserr_resource; + return nfs_ok; + } + /* len + padding for two strings */ addr_len = 16 + da->netaddr.netid_len + da->netaddr.addr_len; ver_len = 20; diff --git a/fs/nfsd/flexfilelayoutxdr.h b/fs/nfsd/flexfilelayoutxdr.h index 8e195aeca023..6d5a1066a903 100644 --- a/fs/nfsd/flexfilelayoutxdr.h +++ b/fs/nfsd/flexfilelayoutxdr.h @@ -43,8 +43,8 @@ struct pnfs_ff_layout { }; __be32 nfsd4_ff_encode_getdeviceinfo(struct xdr_stream *xdr, - struct nfsd4_getdeviceinfo *gdp); + const struct nfsd4_getdeviceinfo *gdp); __be32 nfsd4_ff_encode_layoutget(struct xdr_stream *xdr, - struct nfsd4_layoutget *lgp); + const struct nfsd4_layoutget *lgp); #endif /* _NFSD_FLEXFILELAYOUTXDR_H */ diff --git a/fs/nfsd/localio.c b/fs/nfsd/localio.c new file mode 100644 index 000000000000..be710d809a3b --- /dev/null +++ b/fs/nfsd/localio.c @@ -0,0 +1,217 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * NFS server support for local clients to bypass network stack + * + * Copyright (C) 2014 Weston Andros Adamson <dros@primarydata.com> + * Copyright (C) 2019 Trond Myklebust <trond.myklebust@hammerspace.com> + * Copyright (C) 2024 Mike Snitzer <snitzer@hammerspace.com> + * Copyright (C) 2024 NeilBrown <neilb@suse.de> + */ + +#include <linux/exportfs.h> +#include <linux/sunrpc/svcauth.h> +#include <linux/sunrpc/clnt.h> +#include <linux/nfs.h> +#include <linux/nfs_common.h> +#include <linux/nfslocalio.h> +#include <linux/nfs_fs.h> +#include <linux/nfs_xdr.h> +#include <linux/string.h> + +#include "nfsd.h" +#include "vfs.h" +#include "netns.h" +#include "filecache.h" +#include "cache.h" + +/** + * nfsd_open_local_fh - lookup a local filehandle @nfs_fh and map to nfsd_file + * + * @net: 'struct net' to get the proper nfsd_net required for LOCALIO access + * @dom: 'struct auth_domain' required for LOCALIO access + * @rpc_clnt: rpc_clnt that the client established + * @cred: cred that the client established + * 
@nfs_fh: filehandle to lookup + * @pnf: place to find the nfsd_file, or store it if it was non-NULL + * @fmode: fmode_t to use for open + * + * This function maps a local fh to a path on a local filesystem. + * This is useful when the nfs client has the local server mounted - it can + * avoid all the NFS overhead with reads, writes and commits. + * + * On successful return, returned nfsd_file will have its nf_net member + * set. Caller (NFS client) is responsible for calling nfsd_net_put and + * nfsd_file_put (via nfs_to_nfsd_file_put_local). + */ +static struct nfsd_file * +nfsd_open_local_fh(struct net *net, struct auth_domain *dom, + struct rpc_clnt *rpc_clnt, const struct cred *cred, + const struct nfs_fh *nfs_fh, struct nfsd_file __rcu **pnf, + const fmode_t fmode) +{ + int mayflags = NFSD_MAY_LOCALIO; + struct svc_cred rq_cred; + struct svc_fh fh; + struct nfsd_file *localio; + __be32 beres; + + if (nfs_fh->size > NFS4_FHSIZE) + return ERR_PTR(-EINVAL); + + if (!nfsd_net_try_get(net)) + return ERR_PTR(-ENXIO); + + rcu_read_lock(); + localio = nfsd_file_get(rcu_dereference(*pnf)); + rcu_read_unlock(); + if (localio) + return localio; + + /* nfs_fh -> svc_fh */ + fh_init(&fh, NFS4_FHSIZE); + fh.fh_handle.fh_size = nfs_fh->size; + memcpy(fh.fh_handle.fh_raw, nfs_fh->data, nfs_fh->size); + + if (fmode & FMODE_READ) + mayflags |= NFSD_MAY_READ; + if (fmode & FMODE_WRITE) + mayflags |= NFSD_MAY_WRITE; + + svcauth_map_clnt_to_svc_cred_local(rpc_clnt, cred, &rq_cred); + + beres = nfsd_file_acquire_local(net, &rq_cred, dom, + &fh, mayflags, &localio); + if (beres) + localio = ERR_PTR(nfs_stat_to_errno(be32_to_cpu(beres))); + + fh_put(&fh); + if (rq_cred.cr_group_info) + put_group_info(rq_cred.cr_group_info); + + if (!IS_ERR(localio)) { + struct nfsd_file *new; + if (!nfsd_net_try_get(net)) { + nfsd_file_put(localio); + nfsd_net_put(net); + return ERR_PTR(-ENXIO); + } + nfsd_file_get(localio); + again: + new = unrcu_pointer(cmpxchg(pnf, NULL, RCU_INITIALIZER(localio))); + if (new) { + /* Some other thread installed an nfsd_file */ + if (nfsd_file_get(new) == NULL) + goto again; + /* + * Drop the ref we were going to install (both file and + * net) and the one we were going to return (only file). 
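+			 * If nfsd_file_get(new) failed above, the racing entry was
+			 * already on its way out, so the install is simply retried.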
+ */ + nfsd_file_put(localio); + nfsd_net_put(net); + nfsd_file_put(localio); + localio = new; + } + } else + nfsd_net_put(net); + + return localio; +} + +static void nfsd_file_dio_alignment(struct nfsd_file *nf, + u32 *nf_dio_mem_align, + u32 *nf_dio_offset_align, + u32 *nf_dio_read_offset_align) +{ + *nf_dio_mem_align = nf->nf_dio_mem_align; + *nf_dio_offset_align = nf->nf_dio_offset_align; + *nf_dio_read_offset_align = nf->nf_dio_read_offset_align; +} + +static const struct nfsd_localio_operations nfsd_localio_ops = { + .nfsd_net_try_get = nfsd_net_try_get, + .nfsd_net_put = nfsd_net_put, + .nfsd_open_local_fh = nfsd_open_local_fh, + .nfsd_file_put_local = nfsd_file_put_local, + .nfsd_file_file = nfsd_file_file, + .nfsd_file_dio_alignment = nfsd_file_dio_alignment, +}; + +void nfsd_localio_ops_init(void) +{ + nfs_to = &nfsd_localio_ops; +} + +/* + * UUID_IS_LOCAL XDR functions + */ + +static __be32 localio_proc_null(struct svc_rqst *rqstp) +{ + return rpc_success; +} + +struct localio_uuidarg { + uuid_t uuid; +}; + +static __be32 localio_proc_uuid_is_local(struct svc_rqst *rqstp) +{ + struct localio_uuidarg *argp = rqstp->rq_argp; + struct net *net = SVC_NET(rqstp); + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + + nfs_uuid_is_local(&argp->uuid, &nn->local_clients, + &nn->local_clients_lock, + net, rqstp->rq_client, THIS_MODULE); + + return rpc_success; +} + +static bool localio_decode_uuidarg(struct svc_rqst *rqstp, + struct xdr_stream *xdr) +{ + struct localio_uuidarg *argp = rqstp->rq_argp; + u8 uuid[UUID_SIZE]; + + if (decode_opaque_fixed(xdr, uuid, UUID_SIZE)) + return false; + import_uuid(&argp->uuid, uuid); + + return true; +} + +static const struct svc_procedure localio_procedures1[] = { + [LOCALIOPROC_NULL] = { + .pc_func = localio_proc_null, + .pc_decode = nfssvc_decode_voidarg, + .pc_encode = nfssvc_encode_voidres, + .pc_argsize = sizeof(struct nfsd_voidargs), + .pc_ressize = sizeof(struct nfsd_voidres), + .pc_cachetype = RC_NOCACHE, + .pc_xdrressize = 0, + .pc_name = "NULL", + }, + [LOCALIOPROC_UUID_IS_LOCAL] = { + .pc_func = localio_proc_uuid_is_local, + .pc_decode = localio_decode_uuidarg, + .pc_encode = nfssvc_encode_voidres, + .pc_argsize = sizeof(struct localio_uuidarg), + .pc_argzero = sizeof(struct localio_uuidarg), + .pc_ressize = sizeof(struct nfsd_voidres), + .pc_cachetype = RC_NOCACHE, + .pc_name = "UUID_IS_LOCAL", + }, +}; + +#define LOCALIO_NR_PROCEDURES ARRAY_SIZE(localio_procedures1) +static DEFINE_PER_CPU_ALIGNED(unsigned long, + localio_count[LOCALIO_NR_PROCEDURES]); +const struct svc_version localio_version1 = { + .vs_vers = 1, + .vs_nproc = LOCALIO_NR_PROCEDURES, + .vs_proc = localio_procedures1, + .vs_dispatch = nfsd_dispatch, + .vs_count = localio_count, + .vs_xdrsize = XDR_QUADLEN(UUID_SIZE), + .vs_hidden = true, +}; diff --git a/fs/nfsd/lockd.c b/fs/nfsd/lockd.c index 3f5b3d7b62b7..c774ce9aa296 100644 --- a/fs/nfsd/lockd.c +++ b/fs/nfsd/lockd.c @@ -25,26 +25,52 @@ * Note: we hold the dentry use count while the file is open. */ static __be32 -nlm_fopen(struct svc_rqst *rqstp, struct nfs_fh *f, struct file **filp) +nlm_fopen(struct svc_rqst *rqstp, struct nfs_fh *f, struct file **filp, + int mode) { __be32 nfserr; + int access; struct svc_fh fh; /* must initialize before using! 
but maxsize doesn't matter */ fh_init(&fh,0); fh.fh_handle.fh_size = f->size; - memcpy((char*)&fh.fh_handle.fh_base, f->data, f->size); + memcpy(&fh.fh_handle.fh_raw, f->data, f->size); fh.fh_export = NULL; - nfserr = nfsd_open(rqstp, &fh, S_IFREG, NFSD_MAY_LOCK, filp); + /* + * Allow BYPASS_GSS as some client implementations use AUTH_SYS + * for NLM even when GSS is used for NFS. + * Allow OWNER_OVERRIDE as permission might have been changed + * after the file was opened. + * Pass MAY_NLM so that authentication can be completely bypassed + * if NFSEXP_NOAUTHNLM is set. Some older clients use AUTH_NULL + * for NLM requests. + */ + access = (mode == O_WRONLY) ? NFSD_MAY_WRITE : NFSD_MAY_READ; + access |= NFSD_MAY_NLM | NFSD_MAY_OWNER_OVERRIDE | NFSD_MAY_BYPASS_GSS; + nfserr = nfsd_open(rqstp, &fh, S_IFREG, access, filp); fh_put(&fh); - /* We return nlm error codes as nlm doesn't know + /* We return nlm error codes as nlm doesn't know * about nfsd, but nfsd does know about nlm.. */ switch (nfserr) { case nfs_ok: return 0; - case nfserr_dropit: + case nfserr_jukebox: + /* this error can indicate a presence of a conflicting + * delegation to an NLM lock request. Options are: + * (1) For now, drop this request and make the client + * retry. When delegation is returned, client's lock retry + * will complete. + * (2) NLM4_DENIED as per "spec" signals to the client + * that the lock is unavailable now but client can retry. + * Linux client implementation does not. It treats + * NLM4_DENIED same as NLM4_FAILED and errors the request. + * (3) For the future, treat this as blocked lock and try + * to callback when the delegation is returned but might + * not have a proper lock request to block on. + */ return nlm_drop_reply; case nfserr_stale: return nlm_stale_fh; diff --git a/fs/nfsd/netlink.c b/fs/nfsd/netlink.c new file mode 100644 index 000000000000..ac51a44e1065 --- /dev/null +++ b/fs/nfsd/netlink.c @@ -0,0 +1,114 @@ +// SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) +/* Do not edit directly, auto-generated from: */ +/* Documentation/netlink/specs/nfsd.yaml */ +/* YNL-GEN kernel source */ +/* To regenerate run: tools/net/ynl/ynl-regen.sh */ + +#include <net/netlink.h> +#include <net/genetlink.h> + +#include "netlink.h" + +#include <uapi/linux/nfsd_netlink.h> + +/* Common nested types */ +const struct nla_policy nfsd_sock_nl_policy[NFSD_A_SOCK_TRANSPORT_NAME + 1] = { + [NFSD_A_SOCK_ADDR] = { .type = NLA_BINARY, }, + [NFSD_A_SOCK_TRANSPORT_NAME] = { .type = NLA_NUL_STRING, }, +}; + +const struct nla_policy nfsd_version_nl_policy[NFSD_A_VERSION_ENABLED + 1] = { + [NFSD_A_VERSION_MAJOR] = { .type = NLA_U32, }, + [NFSD_A_VERSION_MINOR] = { .type = NLA_U32, }, + [NFSD_A_VERSION_ENABLED] = { .type = NLA_FLAG, }, +}; + +/* NFSD_CMD_THREADS_SET - do */ +static const struct nla_policy nfsd_threads_set_nl_policy[NFSD_A_SERVER_SCOPE + 1] = { + [NFSD_A_SERVER_THREADS] = { .type = NLA_U32, }, + [NFSD_A_SERVER_GRACETIME] = { .type = NLA_U32, }, + [NFSD_A_SERVER_LEASETIME] = { .type = NLA_U32, }, + [NFSD_A_SERVER_SCOPE] = { .type = NLA_NUL_STRING, }, +}; + +/* NFSD_CMD_VERSION_SET - do */ +static const struct nla_policy nfsd_version_set_nl_policy[NFSD_A_SERVER_PROTO_VERSION + 1] = { + [NFSD_A_SERVER_PROTO_VERSION] = NLA_POLICY_NESTED(nfsd_version_nl_policy), +}; + +/* NFSD_CMD_LISTENER_SET - do */ +static const struct nla_policy nfsd_listener_set_nl_policy[NFSD_A_SERVER_SOCK_ADDR + 1] = { + [NFSD_A_SERVER_SOCK_ADDR] = NLA_POLICY_NESTED(nfsd_sock_nl_policy), +}; + +/* 
NFSD_CMD_POOL_MODE_SET - do */ +static const struct nla_policy nfsd_pool_mode_set_nl_policy[NFSD_A_POOL_MODE_MODE + 1] = { + [NFSD_A_POOL_MODE_MODE] = { .type = NLA_NUL_STRING, }, +}; + +/* Ops table for nfsd */ +static const struct genl_split_ops nfsd_nl_ops[] = { + { + .cmd = NFSD_CMD_RPC_STATUS_GET, + .dumpit = nfsd_nl_rpc_status_get_dumpit, + .flags = GENL_CMD_CAP_DUMP, + }, + { + .cmd = NFSD_CMD_THREADS_SET, + .doit = nfsd_nl_threads_set_doit, + .policy = nfsd_threads_set_nl_policy, + .maxattr = NFSD_A_SERVER_SCOPE, + .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO, + }, + { + .cmd = NFSD_CMD_THREADS_GET, + .doit = nfsd_nl_threads_get_doit, + .flags = GENL_CMD_CAP_DO, + }, + { + .cmd = NFSD_CMD_VERSION_SET, + .doit = nfsd_nl_version_set_doit, + .policy = nfsd_version_set_nl_policy, + .maxattr = NFSD_A_SERVER_PROTO_VERSION, + .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO, + }, + { + .cmd = NFSD_CMD_VERSION_GET, + .doit = nfsd_nl_version_get_doit, + .flags = GENL_CMD_CAP_DO, + }, + { + .cmd = NFSD_CMD_LISTENER_SET, + .doit = nfsd_nl_listener_set_doit, + .policy = nfsd_listener_set_nl_policy, + .maxattr = NFSD_A_SERVER_SOCK_ADDR, + .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO, + }, + { + .cmd = NFSD_CMD_LISTENER_GET, + .doit = nfsd_nl_listener_get_doit, + .flags = GENL_CMD_CAP_DO, + }, + { + .cmd = NFSD_CMD_POOL_MODE_SET, + .doit = nfsd_nl_pool_mode_set_doit, + .policy = nfsd_pool_mode_set_nl_policy, + .maxattr = NFSD_A_POOL_MODE_MODE, + .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO, + }, + { + .cmd = NFSD_CMD_POOL_MODE_GET, + .doit = nfsd_nl_pool_mode_get_doit, + .flags = GENL_CMD_CAP_DO, + }, +}; + +struct genl_family nfsd_nl_family __ro_after_init = { + .name = NFSD_FAMILY_NAME, + .version = NFSD_FAMILY_VERSION, + .netnsok = true, + .parallel_ops = true, + .module = THIS_MODULE, + .split_ops = nfsd_nl_ops, + .n_split_ops = ARRAY_SIZE(nfsd_nl_ops), +}; diff --git a/fs/nfsd/netlink.h b/fs/nfsd/netlink.h new file mode 100644 index 000000000000..478117ff6b8c --- /dev/null +++ b/fs/nfsd/netlink.h @@ -0,0 +1,32 @@ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */ +/* Do not edit directly, auto-generated from: */ +/* Documentation/netlink/specs/nfsd.yaml */ +/* YNL-GEN kernel header */ +/* To regenerate run: tools/net/ynl/ynl-regen.sh */ + +#ifndef _LINUX_NFSD_GEN_H +#define _LINUX_NFSD_GEN_H + +#include <net/netlink.h> +#include <net/genetlink.h> + +#include <uapi/linux/nfsd_netlink.h> + +/* Common nested types */ +extern const struct nla_policy nfsd_sock_nl_policy[NFSD_A_SOCK_TRANSPORT_NAME + 1]; +extern const struct nla_policy nfsd_version_nl_policy[NFSD_A_VERSION_ENABLED + 1]; + +int nfsd_nl_rpc_status_get_dumpit(struct sk_buff *skb, + struct netlink_callback *cb); +int nfsd_nl_threads_set_doit(struct sk_buff *skb, struct genl_info *info); +int nfsd_nl_threads_get_doit(struct sk_buff *skb, struct genl_info *info); +int nfsd_nl_version_set_doit(struct sk_buff *skb, struct genl_info *info); +int nfsd_nl_version_get_doit(struct sk_buff *skb, struct genl_info *info); +int nfsd_nl_listener_set_doit(struct sk_buff *skb, struct genl_info *info); +int nfsd_nl_listener_get_doit(struct sk_buff *skb, struct genl_info *info); +int nfsd_nl_pool_mode_set_doit(struct sk_buff *skb, struct genl_info *info); +int nfsd_nl_pool_mode_get_doit(struct sk_buff *skb, struct genl_info *info); + +extern struct genl_family nfsd_nl_family; + +#endif /* _LINUX_NFSD_GEN_H */ diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h index 32cb8c027483..3e2d0fde80a7 100644 --- a/fs/nfsd/netns.h +++ 
b/fs/nfsd/netns.h @@ -1,21 +1,8 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ /* * per net namespace data structures for nfsd * * Copyright (C) 2012, Jeff Layton <jlayton@redhat.com> - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the Free - * Software Foundation; either version 2 of the License, or (at your option) - * any later version. - * - * This program is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - * - * You should have received a copy of the GNU General Public License along with - * this program; if not, write to the Free Software Foundation, Inc., 51 - * Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ #ifndef __NFSD_NETNS_H__ @@ -23,6 +10,12 @@ #include <net/net_namespace.h> #include <net/netns/generic.h> +#include <linux/filelock.h> +#include <linux/nfs4.h> +#include <linux/percpu_counter.h> +#include <linux/percpu-refcount.h> +#include <linux/siphash.h> +#include <linux/sunrpc/stats.h> /* Hash tables for nfs4_clientid state */ #define CLIENT_HASH_BITS 4 @@ -34,6 +27,26 @@ struct cld_net; struct nfsd4_client_tracking_ops; +enum { + /* cache misses due only to checksum comparison failures */ + NFSD_STATS_PAYLOAD_MISSES, + /* amount of memory (in bytes) currently consumed by the DRC */ + NFSD_STATS_DRC_MEM_USAGE, + NFSD_STATS_RC_HITS, /* repcache hits */ + NFSD_STATS_RC_MISSES, /* repcache misses */ + NFSD_STATS_RC_NOCACHE, /* uncached reqs */ + NFSD_STATS_FH_STALE, /* FH stale error */ + NFSD_STATS_IO_READ, /* bytes returned to read requests */ + NFSD_STATS_IO_WRITE, /* bytes passed in write requests */ +#ifdef CONFIG_NFSD_V4 + NFSD_STATS_FIRST_NFS4_OP, /* count of individual nfsv4 operations */ + NFSD_STATS_LAST_NFS4_OP = NFSD_STATS_FIRST_NFS4_OP + LAST_NFS4_OP, +#define NFSD_STATS_NFS4_OP(op) (NFSD_STATS_FIRST_NFS4_OP + (op)) + NFSD_STATS_WDELEG_GETATTR, /* count of getattr conflict with wdeleg */ +#endif + NFSD_STATS_COUNTERS_NUM +}; + /* * Represents a nfsd "container". With respect to nfsv4 state tracking, the * fields of interest are the *_id_hashtbls and the *_name_tree. These track @@ -53,7 +66,9 @@ struct nfsd_net { struct lock_manager nfsd4_manager; bool grace_ended; - time_t boot_time; + time64_t boot_time; + + struct dentry *nfsd_client_dir; /* * reclaim_str_hashtbl[] holds known client info from previous reset/reboot @@ -100,29 +115,29 @@ struct nfsd_net { bool in_grace; const struct nfsd4_client_tracking_ops *client_tracking_ops; - time_t nfsd4_lease; - time_t nfsd4_grace; + time64_t nfsd4_lease; + time64_t nfsd4_grace; bool somebody_reclaimed; + bool track_reclaim_completes; + atomic_t nr_reclaim_complete; + bool nfsd_net_up; bool lockd_up; - /* Time of server startup */ - struct timespec64 nfssvc_boot; - - /* - * Max number of connections this nfsd container will allow. Defaults - * to '0' which is means that it bases this on the number of threads. 
- */ - unsigned int max_connections; + seqlock_t writeverf_lock; + unsigned char writeverf[8]; + u32 clientid_base; u32 clientid_counter; u32 clverifier_counter; - struct svc_serv *nfsd_serv; + struct svc_info nfsd_info; +#define nfsd_serv nfsd_info.serv - wait_queue_head_t ntf_wq; - atomic_t ntf_refcnt; + struct percpu_ref nfsd_net_ref; + struct completion nfsd_net_confirm_done; + struct completion nfsd_net_free_done; /* * clientid and stateid data for construction of net unique COPY @@ -131,10 +146,88 @@ struct nfsd_net { u32 s2s_cp_cl_id; struct idr s2s_cp_stateids; spinlock_t s2s_cp_lock; + atomic_t pending_async_copies; + + /* + * Version information + */ + bool nfsd_versions[NFSD_MAXVERS + 1]; + bool nfsd4_minorversions[NFSD_SUPPORTED_MINOR_VERSION + 1]; + + /* + * Duplicate reply cache + */ + struct nfsd_drc_bucket *drc_hashtbl; + + /* max number of entries allowed in the cache */ + unsigned int max_drc_entries; + + /* number of significant bits in the hash value */ + unsigned int maskbits; + unsigned int drc_hashsize; + + /* + * Stats and other tracking of on the duplicate reply cache. + * The longest_chain* fields are modified with only the per-bucket + * cache lock, which isn't really safe and should be fixed if we want + * these statistics to be completely accurate. + */ + + /* total number of entries */ + atomic_t num_drc_entries; + + /* Per-netns stats counters */ + struct percpu_counter counter[NFSD_STATS_COUNTERS_NUM]; + + /* sunrpc svc stats */ + struct svc_stat nfsd_svcstats; + + /* longest hash chain seen */ + unsigned int longest_chain; + + /* size of cache when we saw the longest hash chain */ + unsigned int longest_chain_cachesize; + + struct shrinker *nfsd_reply_cache_shrinker; + + /* tracking server-to-server copy mounts */ + spinlock_t nfsd_ssc_lock; + struct list_head nfsd_ssc_mount_list; + wait_queue_head_t nfsd_ssc_waitq; + + /* utsname taken from the process that starts the server */ + char nfsd_name[UNX_MAXNODENAME+1]; + + struct nfsd_fcache_disposal *fcache_disposal; + + siphash_key_t siphash_key; + + atomic_t nfs4_client_count; + int nfs4_max_clients; + + atomic_t nfsd_courtesy_clients; + struct shrinker *nfsd_client_shrinker; + struct work_struct nfsd_shrinker_work; + + /* last time an admin-revoke happened for NFSv4.0 */ + time64_t nfs40_last_revoke; + +#if IS_ENABLED(CONFIG_NFS_LOCALIO) + /* Local clients to be invalidated when net is shut down */ + spinlock_t local_clients_lock; + struct list_head local_clients; +#endif }; /* Simple check to find out if a given net was properly initialized */ #define nfsd_netns_ready(nn) ((nn)->sessionid_hashtbl) +extern bool nfsd_support_version(int vers); extern unsigned int nfsd_net_id; + +bool nfsd_net_try_get(struct net *net); +void nfsd_net_put(struct net *net); + +void nfsd_copy_write_verifier(__be32 verf[2], struct nfsd_net *nn); +void nfsd_reset_write_verifier(struct nfsd_net *nn); #endif /* __NFSD_NETNS_H__ */ diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c index cbab1d2d8a75..5fb202acb0fd 100644 --- a/fs/nfsd/nfs2acl.c +++ b/fs/nfsd/nfs2acl.c @@ -14,7 +14,6 @@ #include "vfs.h" #define NFSDDBG_FACILITY NFSDDBG_PROC -#define RETURN_STATUS(st) { resp->status = (st); return (st); } /* * NULL call. 
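/*
 * The conversion that begins here (and runs through the rest of the v2/v3
 * ACL and NFSv3 procedures below) replaces the old RETURN_STATUS() macro:
 * a handler now parks the NFS status in its response structure for the XDR
 * encoder and reports only an RPC-level verdict to nfsd_dispatch().  A
 * minimal sketch of the converted shape, using a hypothetical example_proc()
 * purely for illustration:
 */
static __be32 example_proc(struct svc_rqst *rqstp)
{
	struct nfsd_fhandle *argp = rqstp->rq_argp;
	struct nfsd_attrstat *resp = rqstp->rq_resp;

	fh_copy(&resp->fh, &argp->fh);
	/* The NFS-level result travels in resp->status ... */
	resp->status = fh_verify(rqstp, &resp->fh, 0, NFSD_MAY_NOP);
	/* ... while the dispatcher only ever sees an RPC-level status. */
	return rpc_success;
}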
@@ -22,7 +21,7 @@ static __be32 nfsacld_proc_null(struct svc_rqst *rqstp) { - return nfs_ok; + return rpc_success; } /* @@ -35,33 +34,34 @@ static __be32 nfsacld_proc_getacl(struct svc_rqst *rqstp) struct posix_acl *acl; struct inode *inode; svc_fh *fh; - __be32 nfserr = 0; dprintk("nfsd: GETACL(2acl) %s\n", SVCFH_fmt(&argp->fh)); fh = fh_copy(&resp->fh, &argp->fh); - nfserr = fh_verify(rqstp, &resp->fh, 0, NFSD_MAY_NOP); - if (nfserr) - RETURN_STATUS(nfserr); + resp->status = fh_verify(rqstp, &resp->fh, 0, NFSD_MAY_NOP); + if (resp->status != nfs_ok) + goto out; inode = d_inode(fh->fh_dentry); - if (argp->mask & ~NFS_ACL_MASK) - RETURN_STATUS(nfserr_inval); + if (argp->mask & ~NFS_ACL_MASK) { + resp->status = nfserr_inval; + goto out; + } resp->mask = argp->mask; - nfserr = fh_getattr(fh, &resp->stat); - if (nfserr) - RETURN_STATUS(nfserr); + resp->status = fh_getattr(fh, &resp->stat); + if (resp->status != nfs_ok) + goto out; if (resp->mask & (NFS_ACL|NFS_ACLCNT)) { - acl = get_acl(inode, ACL_TYPE_ACCESS); + acl = get_inode_acl(inode, ACL_TYPE_ACCESS); if (acl == NULL) { /* Solaris returns the inode's minimum ACL. */ acl = posix_acl_from_mode(inode->i_mode, GFP_KERNEL); } if (IS_ERR(acl)) { - nfserr = nfserrno(PTR_ERR(acl)); + resp->status = nfserrno(PTR_ERR(acl)); goto fail; } resp->acl_access = acl; @@ -69,21 +69,24 @@ static __be32 nfsacld_proc_getacl(struct svc_rqst *rqstp) if (resp->mask & (NFS_DFACL|NFS_DFACLCNT)) { /* Check how Solaris handles requests for the Default ACL of a non-directory! */ - acl = get_acl(inode, ACL_TYPE_DEFAULT); + acl = get_inode_acl(inode, ACL_TYPE_DEFAULT); if (IS_ERR(acl)) { - nfserr = nfserrno(PTR_ERR(acl)); + resp->status = nfserrno(PTR_ERR(acl)); goto fail; } resp->acl_default = acl; } /* resp->acl_{access,default} are released in nfssvc_release_getacl. */ - RETURN_STATUS(0); +out: + return rpc_success; fail: posix_acl_release(resp->acl_access); posix_acl_release(resp->acl_default); - RETURN_STATUS(nfserr); + resp->acl_access = NULL; + resp->acl_default = NULL; + goto out; } /* @@ -95,14 +98,13 @@ static __be32 nfsacld_proc_setacl(struct svc_rqst *rqstp) struct nfsd_attrstat *resp = rqstp->rq_resp; struct inode *inode; svc_fh *fh; - __be32 nfserr = 0; int error; dprintk("nfsd: SETACL(2acl) %s\n", SVCFH_fmt(&argp->fh)); fh = fh_copy(&resp->fh, &argp->fh); - nfserr = fh_verify(rqstp, &resp->fh, 0, NFSD_MAY_SATTR); - if (nfserr) + resp->status = fh_verify(rqstp, &resp->fh, 0, NFSD_MAY_SATTR); + if (resp->status != nfs_ok) goto out; inode = d_inode(fh->fh_dentry); @@ -111,32 +113,35 @@ static __be32 nfsacld_proc_setacl(struct svc_rqst *rqstp) if (error) goto out_errno; - fh_lock(fh); + inode_lock(inode); - error = set_posix_acl(inode, ACL_TYPE_ACCESS, argp->acl_access); + error = set_posix_acl(&nop_mnt_idmap, fh->fh_dentry, ACL_TYPE_ACCESS, + argp->acl_access); if (error) goto out_drop_lock; - error = set_posix_acl(inode, ACL_TYPE_DEFAULT, argp->acl_default); + error = set_posix_acl(&nop_mnt_idmap, fh->fh_dentry, ACL_TYPE_DEFAULT, + argp->acl_default); if (error) goto out_drop_lock; - fh_unlock(fh); + inode_unlock(inode); fh_drop_write(fh); - nfserr = fh_getattr(fh, &resp->stat); + resp->status = fh_getattr(fh, &resp->stat); out: /* argp->acl_{access,default} may have been allocated in nfssvc_decode_setaclargs. 
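	   (posix_acl_release() is NULL-safe, so both pointers can be dropped unconditionally.)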
*/ posix_acl_release(argp->acl_access); posix_acl_release(argp->acl_default); - return nfserr; + return rpc_success; + out_drop_lock: - fh_unlock(fh); + inode_unlock(inode); fh_drop_write(fh); out_errno: - nfserr = nfserrno(error); + resp->status = nfserrno(error); goto out; } @@ -147,15 +152,16 @@ static __be32 nfsacld_proc_getattr(struct svc_rqst *rqstp) { struct nfsd_fhandle *argp = rqstp->rq_argp; struct nfsd_attrstat *resp = rqstp->rq_resp; - __be32 nfserr; + dprintk("nfsd: GETATTR %s\n", SVCFH_fmt(&argp->fh)); fh_copy(&resp->fh, &argp->fh); - nfserr = fh_verify(rqstp, &resp->fh, 0, NFSD_MAY_NOP); - if (nfserr) - return nfserr; - nfserr = fh_getattr(&resp->fh, &resp->stat); - return nfserr; + resp->status = fh_verify(rqstp, &resp->fh, 0, NFSD_MAY_NOP); + if (resp->status != nfs_ok) + goto out; + resp->status = fh_getattr(&resp->fh, &resp->stat); +out: + return rpc_success; } /* @@ -165,7 +171,6 @@ static __be32 nfsacld_proc_access(struct svc_rqst *rqstp) { struct nfsd3_accessargs *argp = rqstp->rq_argp; struct nfsd3_accessres *resp = rqstp->rq_resp; - __be32 nfserr; dprintk("nfsd: ACCESS(2acl) %s 0x%x\n", SVCFH_fmt(&argp->fh), @@ -173,152 +178,117 @@ static __be32 nfsacld_proc_access(struct svc_rqst *rqstp) fh_copy(&resp->fh, &argp->fh); resp->access = argp->access; - nfserr = nfsd_access(rqstp, &resp->fh, &resp->access, NULL); - if (nfserr) - return nfserr; - nfserr = fh_getattr(&resp->fh, &resp->stat); - return nfserr; + resp->status = nfsd_access(rqstp, &resp->fh, &resp->access, NULL); + if (resp->status != nfs_ok) + goto out; + resp->status = fh_getattr(&resp->fh, &resp->stat); +out: + return rpc_success; } /* * XDR decode functions */ -static int nfsaclsvc_decode_getaclargs(struct svc_rqst *rqstp, __be32 *p) + +static bool +nfsaclsvc_decode_getaclargs(struct svc_rqst *rqstp, struct xdr_stream *xdr) { struct nfsd3_getaclargs *argp = rqstp->rq_argp; - p = nfs2svc_decode_fh(p, &argp->fh); - if (!p) - return 0; - argp->mask = ntohl(*p); p++; + if (!svcxdr_decode_fhandle(xdr, &argp->fh)) + return false; + if (xdr_stream_decode_u32(xdr, &argp->mask) < 0) + return false; - return xdr_argsize_check(rqstp, p); + return true; } - -static int nfsaclsvc_decode_setaclargs(struct svc_rqst *rqstp, __be32 *p) +static bool +nfsaclsvc_decode_setaclargs(struct svc_rqst *rqstp, struct xdr_stream *xdr) { struct nfsd3_setaclargs *argp = rqstp->rq_argp; - struct kvec *head = rqstp->rq_arg.head; - unsigned int base; - int n; - - p = nfs2svc_decode_fh(p, &argp->fh); - if (!p) - return 0; - argp->mask = ntohl(*p++); - if (argp->mask & ~NFS_ACL_MASK || - !xdr_argsize_check(rqstp, p)) - return 0; - - base = (char *)p - (char *)head->iov_base; - n = nfsacl_decode(&rqstp->rq_arg, base, NULL, - (argp->mask & NFS_ACL) ? - &argp->acl_access : NULL); - if (n > 0) - n = nfsacl_decode(&rqstp->rq_arg, base + n, NULL, - (argp->mask & NFS_DFACL) ? - &argp->acl_default : NULL); - return (n > 0); -} -static int nfsaclsvc_decode_fhandleargs(struct svc_rqst *rqstp, __be32 *p) -{ - struct nfsd_fhandle *argp = rqstp->rq_argp; - - p = nfs2svc_decode_fh(p, &argp->fh); - if (!p) - return 0; - return xdr_argsize_check(rqstp, p); + if (!svcxdr_decode_fhandle(xdr, &argp->fh)) + return false; + if (xdr_stream_decode_u32(xdr, &argp->mask) < 0) + return false; + if (argp->mask & ~NFS_ACL_MASK) + return false; + if (!nfs_stream_decode_acl(xdr, NULL, (argp->mask & NFS_ACL) ? + &argp->acl_access : NULL)) + return false; + if (!nfs_stream_decode_acl(xdr, NULL, (argp->mask & NFS_DFACL) ? 
+ &argp->acl_default : NULL)) + return false; + + return true; } -static int nfsaclsvc_decode_accessargs(struct svc_rqst *rqstp, __be32 *p) +static bool +nfsaclsvc_decode_accessargs(struct svc_rqst *rqstp, struct xdr_stream *xdr) { - struct nfsd3_accessargs *argp = rqstp->rq_argp; + struct nfsd3_accessargs *args = rqstp->rq_argp; - p = nfs2svc_decode_fh(p, &argp->fh); - if (!p) - return 0; - argp->access = ntohl(*p++); + if (!svcxdr_decode_fhandle(xdr, &args->fh)) + return false; + if (xdr_stream_decode_u32(xdr, &args->access) < 0) + return false; - return xdr_argsize_check(rqstp, p); + return true; } /* * XDR encode functions */ -/* - * There must be an encoding function for void results so svc_process - * will work properly. - */ -static int nfsaclsvc_encode_voidres(struct svc_rqst *rqstp, __be32 *p) -{ - return xdr_ressize_check(rqstp, p); -} - /* GETACL */ -static int nfsaclsvc_encode_getaclres(struct svc_rqst *rqstp, __be32 *p) +static bool +nfsaclsvc_encode_getaclres(struct svc_rqst *rqstp, struct xdr_stream *xdr) { struct nfsd3_getaclres *resp = rqstp->rq_resp; struct dentry *dentry = resp->fh.fh_dentry; struct inode *inode; - struct kvec *head = rqstp->rq_res.head; - unsigned int base; - int n; - int w; - - /* - * Since this is version 2, the check for nfserr in - * nfsd_dispatch actually ensures the following cannot happen. - * However, it seems fragile to depend on that. - */ + + if (!svcxdr_encode_stat(xdr, resp->status)) + return false; + if (dentry == NULL || d_really_is_negative(dentry)) - return 0; + return true; inode = d_inode(dentry); - p = nfs2svc_encode_fattr(rqstp, p, &resp->fh, &resp->stat); - *p++ = htonl(resp->mask); - if (!xdr_ressize_check(rqstp, p)) - return 0; - base = (char *)p - (char *)head->iov_base; - - rqstp->rq_res.page_len = w = nfsacl_size( - (resp->mask & NFS_ACL) ? resp->acl_access : NULL, - (resp->mask & NFS_DFACL) ? 
resp->acl_default : NULL); - while (w > 0) { - if (!*(rqstp->rq_next_page++)) - return 0; - w -= PAGE_SIZE; - } - - n = nfsacl_encode(&rqstp->rq_res, base, inode, - resp->acl_access, - resp->mask & NFS_ACL, 0); - if (n > 0) - n = nfsacl_encode(&rqstp->rq_res, base + n, inode, - resp->acl_default, - resp->mask & NFS_DFACL, - NFS_ACL_DEFAULT); - return (n > 0); -} + if (!svcxdr_encode_fattr(rqstp, xdr, &resp->fh, &resp->stat)) + return false; + if (xdr_stream_encode_u32(xdr, resp->mask) < 0) + return false; -static int nfsaclsvc_encode_attrstatres(struct svc_rqst *rqstp, __be32 *p) -{ - struct nfsd_attrstat *resp = rqstp->rq_resp; + if (!nfs_stream_encode_acl(xdr, inode, resp->acl_access, + resp->mask & NFS_ACL, 0)) + return false; + if (!nfs_stream_encode_acl(xdr, inode, resp->acl_default, + resp->mask & NFS_DFACL, NFS_ACL_DEFAULT)) + return false; - p = nfs2svc_encode_fattr(rqstp, p, &resp->fh, &resp->stat); - return xdr_ressize_check(rqstp, p); + return true; } /* ACCESS */ -static int nfsaclsvc_encode_accessres(struct svc_rqst *rqstp, __be32 *p) +static bool +nfsaclsvc_encode_accessres(struct svc_rqst *rqstp, struct xdr_stream *xdr) { struct nfsd3_accessres *resp = rqstp->rq_resp; - p = nfs2svc_encode_fattr(rqstp, p, &resp->fh, &resp->stat); - *p++ = htonl(resp->access); - return xdr_ressize_check(rqstp, p); + if (!svcxdr_encode_stat(xdr, resp->status)) + return false; + switch (resp->status) { + case nfs_ok: + if (!svcxdr_encode_fattr(rqstp, xdr, &resp->fh, &resp->stat)) + return false; + if (xdr_stream_encode_u32(xdr, resp->access) < 0) + return false; + break; + } + + return true; } /* @@ -333,13 +303,6 @@ static void nfsaclsvc_release_getacl(struct svc_rqst *rqstp) posix_acl_release(resp->acl_default); } -static void nfsaclsvc_release_attrstat(struct svc_rqst *rqstp) -{ - struct nfsd_attrstat *resp = rqstp->rq_resp; - - fh_put(&resp->fh); -} - static void nfsaclsvc_release_access(struct svc_rqst *rqstp) { struct nfsd3_accessres *resp = rqstp->rq_resp; @@ -347,42 +310,78 @@ static void nfsaclsvc_release_access(struct svc_rqst *rqstp) fh_put(&resp->fh); } -#define nfsaclsvc_decode_voidargs NULL -#define nfsaclsvc_release_void NULL -#define nfsd3_fhandleargs nfsd_fhandle -#define nfsd3_attrstatres nfsd_attrstat -#define nfsd3_voidres nfsd3_voidargs -struct nfsd3_voidargs { int dummy; }; - -#define PROC(name, argt, rest, relt, cache, respsize) \ -{ \ - .pc_func = nfsacld_proc_##name, \ - .pc_decode = nfsaclsvc_decode_##argt##args, \ - .pc_encode = nfsaclsvc_encode_##rest##res, \ - .pc_release = nfsaclsvc_release_##relt, \ - .pc_argsize = sizeof(struct nfsd3_##argt##args), \ - .pc_ressize = sizeof(struct nfsd3_##rest##res), \ - .pc_cachetype = cache, \ - .pc_xdrressize = respsize, \ -} - #define ST 1 /* status*/ #define AT 21 /* attributes */ #define pAT (1+AT) /* post attributes - conditional */ #define ACL (1+NFS_ACL_MAX_ENTRIES*3) /* Access Control List */ -static const struct svc_procedure nfsd_acl_procedures2[] = { - PROC(null, void, void, void, RC_NOCACHE, ST), - PROC(getacl, getacl, getacl, getacl, RC_NOCACHE, ST+1+2*(1+ACL)), - PROC(setacl, setacl, attrstat, attrstat, RC_NOCACHE, ST+AT), - PROC(getattr, fhandle, attrstat, attrstat, RC_NOCACHE, ST+AT), - PROC(access, access, access, access, RC_NOCACHE, ST+AT+1), +static const struct svc_procedure nfsd_acl_procedures2[5] = { + [ACLPROC2_NULL] = { + .pc_func = nfsacld_proc_null, + .pc_decode = nfssvc_decode_voidarg, + .pc_encode = nfssvc_encode_voidres, + .pc_argsize = sizeof(struct nfsd_voidargs), + .pc_argzero = sizeof(struct 
nfsd_voidargs), + .pc_ressize = sizeof(struct nfsd_voidres), + .pc_cachetype = RC_NOCACHE, + .pc_xdrressize = ST, + .pc_name = "NULL", + }, + [ACLPROC2_GETACL] = { + .pc_func = nfsacld_proc_getacl, + .pc_decode = nfsaclsvc_decode_getaclargs, + .pc_encode = nfsaclsvc_encode_getaclres, + .pc_release = nfsaclsvc_release_getacl, + .pc_argsize = sizeof(struct nfsd3_getaclargs), + .pc_argzero = sizeof(struct nfsd3_getaclargs), + .pc_ressize = sizeof(struct nfsd3_getaclres), + .pc_cachetype = RC_NOCACHE, + .pc_xdrressize = ST+1+2*(1+ACL), + .pc_name = "GETACL", + }, + [ACLPROC2_SETACL] = { + .pc_func = nfsacld_proc_setacl, + .pc_decode = nfsaclsvc_decode_setaclargs, + .pc_encode = nfssvc_encode_attrstatres, + .pc_release = nfssvc_release_attrstat, + .pc_argsize = sizeof(struct nfsd3_setaclargs), + .pc_argzero = sizeof(struct nfsd3_setaclargs), + .pc_ressize = sizeof(struct nfsd_attrstat), + .pc_cachetype = RC_NOCACHE, + .pc_xdrressize = ST+AT, + .pc_name = "SETACL", + }, + [ACLPROC2_GETATTR] = { + .pc_func = nfsacld_proc_getattr, + .pc_decode = nfssvc_decode_fhandleargs, + .pc_encode = nfssvc_encode_attrstatres, + .pc_release = nfssvc_release_attrstat, + .pc_argsize = sizeof(struct nfsd_fhandle), + .pc_argzero = sizeof(struct nfsd_fhandle), + .pc_ressize = sizeof(struct nfsd_attrstat), + .pc_cachetype = RC_NOCACHE, + .pc_xdrressize = ST+AT, + .pc_name = "GETATTR", + }, + [ACLPROC2_ACCESS] = { + .pc_func = nfsacld_proc_access, + .pc_decode = nfsaclsvc_decode_accessargs, + .pc_encode = nfsaclsvc_encode_accessres, + .pc_release = nfsaclsvc_release_access, + .pc_argsize = sizeof(struct nfsd3_accessargs), + .pc_argzero = sizeof(struct nfsd3_accessargs), + .pc_ressize = sizeof(struct nfsd3_accessres), + .pc_cachetype = RC_NOCACHE, + .pc_xdrressize = ST+AT+1, + .pc_name = "SETATTR", + }, }; -static unsigned int nfsd_acl_count2[ARRAY_SIZE(nfsd_acl_procedures2)]; +static DEFINE_PER_CPU_ALIGNED(unsigned long, + nfsd_acl_count2[ARRAY_SIZE(nfsd_acl_procedures2)]); const struct svc_version nfsd_acl_version2 = { .vs_vers = 2, - .vs_nproc = 5, + .vs_nproc = ARRAY_SIZE(nfsd_acl_procedures2), .vs_proc = nfsd_acl_procedures2, .vs_count = nfsd_acl_count2, .vs_dispatch = nfsd_dispatch, diff --git a/fs/nfsd/nfs3acl.c b/fs/nfsd/nfs3acl.c index 13bca4a2f89d..7b5433bd3019 100644 --- a/fs/nfsd/nfs3acl.c +++ b/fs/nfsd/nfs3acl.c @@ -13,15 +13,13 @@ #include "xdr3.h" #include "vfs.h" -#define RETURN_STATUS(st) { resp->status = (st); return (st); } - /* * NULL call. */ static __be32 nfsd3_proc_null(struct svc_rqst *rqstp) { - return nfs_ok; + return rpc_success; } /* @@ -34,27 +32,28 @@ static __be32 nfsd3_proc_getacl(struct svc_rqst *rqstp) struct posix_acl *acl; struct inode *inode; svc_fh *fh; - __be32 nfserr = 0; fh = fh_copy(&resp->fh, &argp->fh); - nfserr = fh_verify(rqstp, &resp->fh, 0, NFSD_MAY_NOP); - if (nfserr) - RETURN_STATUS(nfserr); + resp->status = fh_verify(rqstp, &resp->fh, 0, NFSD_MAY_NOP); + if (resp->status != nfs_ok) + goto out; inode = d_inode(fh->fh_dentry); - if (argp->mask & ~NFS_ACL_MASK) - RETURN_STATUS(nfserr_inval); + if (argp->mask & ~NFS_ACL_MASK) { + resp->status = nfserr_inval; + goto out; + } resp->mask = argp->mask; if (resp->mask & (NFS_ACL|NFS_ACLCNT)) { - acl = get_acl(inode, ACL_TYPE_ACCESS); + acl = get_inode_acl(inode, ACL_TYPE_ACCESS); if (acl == NULL) { /* Solaris returns the inode's minimum ACL. 
*/ acl = posix_acl_from_mode(inode->i_mode, GFP_KERNEL); } if (IS_ERR(acl)) { - nfserr = nfserrno(PTR_ERR(acl)); + resp->status = nfserrno(PTR_ERR(acl)); goto fail; } resp->acl_access = acl; @@ -62,21 +61,24 @@ static __be32 nfsd3_proc_getacl(struct svc_rqst *rqstp) if (resp->mask & (NFS_DFACL|NFS_DFACLCNT)) { /* Check how Solaris handles requests for the Default ACL of a non-directory! */ - acl = get_acl(inode, ACL_TYPE_DEFAULT); + acl = get_inode_acl(inode, ACL_TYPE_DEFAULT); if (IS_ERR(acl)) { - nfserr = nfserrno(PTR_ERR(acl)); + resp->status = nfserrno(PTR_ERR(acl)); goto fail; } resp->acl_default = acl; } /* resp->acl_{access,default} are released in nfs3svc_release_getacl. */ - RETURN_STATUS(0); +out: + return rpc_success; fail: posix_acl_release(resp->acl_access); posix_acl_release(resp->acl_default); - RETURN_STATUS(nfserr); + resp->acl_access = NULL; + resp->acl_default = NULL; + goto out; } /* @@ -88,12 +90,11 @@ static __be32 nfsd3_proc_setacl(struct svc_rqst *rqstp) struct nfsd3_attrstat *resp = rqstp->rq_resp; struct inode *inode; svc_fh *fh; - __be32 nfserr = 0; int error; fh = fh_copy(&resp->fh, &argp->fh); - nfserr = fh_verify(rqstp, &resp->fh, 0, NFSD_MAY_SATTR); - if (nfserr) + resp->status = fh_verify(rqstp, &resp->fh, 0, NFSD_MAY_SATTR); + if (resp->status != nfs_ok) goto out; inode = d_inode(fh->fh_dentry); @@ -102,66 +103,64 @@ static __be32 nfsd3_proc_setacl(struct svc_rqst *rqstp) if (error) goto out_errno; - fh_lock(fh); + inode_lock(inode); - error = set_posix_acl(inode, ACL_TYPE_ACCESS, argp->acl_access); + error = set_posix_acl(&nop_mnt_idmap, fh->fh_dentry, ACL_TYPE_ACCESS, + argp->acl_access); if (error) goto out_drop_lock; - error = set_posix_acl(inode, ACL_TYPE_DEFAULT, argp->acl_default); + error = set_posix_acl(&nop_mnt_idmap, fh->fh_dentry, ACL_TYPE_DEFAULT, + argp->acl_default); out_drop_lock: - fh_unlock(fh); + inode_unlock(inode); fh_drop_write(fh); out_errno: - nfserr = nfserrno(error); + resp->status = nfserrno(error); out: /* argp->acl_{access,default} may have been allocated in nfs3svc_decode_setaclargs. */ posix_acl_release(argp->acl_access); posix_acl_release(argp->acl_default); - RETURN_STATUS(nfserr); + return rpc_success; } /* * XDR decode functions */ -static int nfs3svc_decode_getaclargs(struct svc_rqst *rqstp, __be32 *p) + +static bool +nfs3svc_decode_getaclargs(struct svc_rqst *rqstp, struct xdr_stream *xdr) { struct nfsd3_getaclargs *args = rqstp->rq_argp; - p = nfs3svc_decode_fh(p, &args->fh); - if (!p) - return 0; - args->mask = ntohl(*p); p++; + if (!svcxdr_decode_nfs_fh3(xdr, &args->fh)) + return false; + if (xdr_stream_decode_u32(xdr, &args->mask) < 0) + return false; - return xdr_argsize_check(rqstp, p); + return true; } - -static int nfs3svc_decode_setaclargs(struct svc_rqst *rqstp, __be32 *p) +static bool +nfs3svc_decode_setaclargs(struct svc_rqst *rqstp, struct xdr_stream *xdr) { - struct nfsd3_setaclargs *args = rqstp->rq_argp; - struct kvec *head = rqstp->rq_arg.head; - unsigned int base; - int n; - - p = nfs3svc_decode_fh(p, &args->fh); - if (!p) - return 0; - args->mask = ntohl(*p++); - if (args->mask & ~NFS_ACL_MASK || - !xdr_argsize_check(rqstp, p)) - return 0; - - base = (char *)p - (char *)head->iov_base; - n = nfsacl_decode(&rqstp->rq_arg, base, NULL, - (args->mask & NFS_ACL) ? - &args->acl_access : NULL); - if (n > 0) - n = nfsacl_decode(&rqstp->rq_arg, base + n, NULL, - (args->mask & NFS_DFACL) ? 
- &args->acl_default : NULL); - return (n > 0); + struct nfsd3_setaclargs *argp = rqstp->rq_argp; + + if (!svcxdr_decode_nfs_fh3(xdr, &argp->fh)) + return false; + if (xdr_stream_decode_u32(xdr, &argp->mask) < 0) + return false; + if (argp->mask & ~NFS_ACL_MASK) + return false; + if (!nfs_stream_decode_acl(xdr, NULL, (argp->mask & NFS_ACL) ? + &argp->acl_access : NULL)) + return false; + if (!nfs_stream_decode_acl(xdr, NULL, (argp->mask & NFS_DFACL) ? + &argp->acl_default : NULL)) + return false; + + return true; } /* @@ -169,58 +168,47 @@ static int nfs3svc_decode_setaclargs(struct svc_rqst *rqstp, __be32 *p) */ /* GETACL */ -static int nfs3svc_encode_getaclres(struct svc_rqst *rqstp, __be32 *p) +static bool +nfs3svc_encode_getaclres(struct svc_rqst *rqstp, struct xdr_stream *xdr) { struct nfsd3_getaclres *resp = rqstp->rq_resp; struct dentry *dentry = resp->fh.fh_dentry; + struct inode *inode; - p = nfs3svc_encode_post_op_attr(rqstp, p, &resp->fh); - if (resp->status == 0 && dentry && d_really_is_positive(dentry)) { - struct inode *inode = d_inode(dentry); - struct kvec *head = rqstp->rq_res.head; - unsigned int base; - int n; - int w; - - *p++ = htonl(resp->mask); - if (!xdr_ressize_check(rqstp, p)) - return 0; - base = (char *)p - (char *)head->iov_base; - - rqstp->rq_res.page_len = w = nfsacl_size( - (resp->mask & NFS_ACL) ? resp->acl_access : NULL, - (resp->mask & NFS_DFACL) ? resp->acl_default : NULL); - while (w > 0) { - if (!*(rqstp->rq_next_page++)) - return 0; - w -= PAGE_SIZE; - } + if (!svcxdr_encode_nfsstat3(xdr, resp->status)) + return false; + switch (resp->status) { + case nfs_ok: + inode = d_inode(dentry); + if (!svcxdr_encode_post_op_attr(rqstp, xdr, &resp->fh)) + return false; + if (xdr_stream_encode_u32(xdr, resp->mask) < 0) + return false; + + if (!nfs_stream_encode_acl(xdr, inode, resp->acl_access, + resp->mask & NFS_ACL, 0)) + return false; + if (!nfs_stream_encode_acl(xdr, inode, resp->acl_default, + resp->mask & NFS_DFACL, + NFS_ACL_DEFAULT)) + return false; + break; + default: + if (!svcxdr_encode_post_op_attr(rqstp, xdr, &resp->fh)) + return false; + } - n = nfsacl_encode(&rqstp->rq_res, base, inode, - resp->acl_access, - resp->mask & NFS_ACL, 0); - if (n > 0) - n = nfsacl_encode(&rqstp->rq_res, base + n, inode, - resp->acl_default, - resp->mask & NFS_DFACL, - NFS_ACL_DEFAULT); - if (n <= 0) - return 0; - } else - if (!xdr_ressize_check(rqstp, p)) - return 0; - - return 1; + return true; } /* SETACL */ -static int nfs3svc_encode_setaclres(struct svc_rqst *rqstp, __be32 *p) +static bool +nfs3svc_encode_setaclres(struct svc_rqst *rqstp, struct xdr_stream *xdr) { struct nfsd3_attrstat *resp = rqstp->rq_resp; - p = nfs3svc_encode_post_op_attr(rqstp, p, &resp->fh); - - return xdr_ressize_check(rqstp, p); + return svcxdr_encode_nfsstat3(xdr, resp->status) && + svcxdr_encode_post_op_attr(rqstp, xdr, &resp->fh); } /* @@ -235,39 +223,54 @@ static void nfs3svc_release_getacl(struct svc_rqst *rqstp) posix_acl_release(resp->acl_default); } -#define nfs3svc_decode_voidargs NULL -#define nfs3svc_release_void NULL -#define nfsd3_setaclres nfsd3_attrstat -#define nfsd3_voidres nfsd3_voidargs -struct nfsd3_voidargs { int dummy; }; - -#define PROC(name, argt, rest, relt, cache, respsize) \ -{ \ - .pc_func = nfsd3_proc_##name, \ - .pc_decode = nfs3svc_decode_##argt##args, \ - .pc_encode = nfs3svc_encode_##rest##res, \ - .pc_release = nfs3svc_release_##relt, \ - .pc_argsize = sizeof(struct nfsd3_##argt##args), \ - .pc_ressize = sizeof(struct nfsd3_##rest##res), \ - .pc_cachetype = 
cache, \ - .pc_xdrressize = respsize, \ -} - #define ST 1 /* status*/ #define AT 21 /* attributes */ #define pAT (1+AT) /* post attributes - conditional */ #define ACL (1+NFS_ACL_MAX_ENTRIES*3) /* Access Control List */ -static const struct svc_procedure nfsd_acl_procedures3[] = { - PROC(null, void, void, void, RC_NOCACHE, ST), - PROC(getacl, getacl, getacl, getacl, RC_NOCACHE, ST+1+2*(1+ACL)), - PROC(setacl, setacl, setacl, fhandle, RC_NOCACHE, ST+pAT), +static const struct svc_procedure nfsd_acl_procedures3[3] = { + [ACLPROC3_NULL] = { + .pc_func = nfsd3_proc_null, + .pc_decode = nfssvc_decode_voidarg, + .pc_encode = nfssvc_encode_voidres, + .pc_argsize = sizeof(struct nfsd_voidargs), + .pc_argzero = sizeof(struct nfsd_voidargs), + .pc_ressize = sizeof(struct nfsd_voidres), + .pc_cachetype = RC_NOCACHE, + .pc_xdrressize = ST, + .pc_name = "NULL", + }, + [ACLPROC3_GETACL] = { + .pc_func = nfsd3_proc_getacl, + .pc_decode = nfs3svc_decode_getaclargs, + .pc_encode = nfs3svc_encode_getaclres, + .pc_release = nfs3svc_release_getacl, + .pc_argsize = sizeof(struct nfsd3_getaclargs), + .pc_argzero = sizeof(struct nfsd3_getaclargs), + .pc_ressize = sizeof(struct nfsd3_getaclres), + .pc_cachetype = RC_NOCACHE, + .pc_xdrressize = ST+1+2*(1+ACL), + .pc_name = "GETACL", + }, + [ACLPROC3_SETACL] = { + .pc_func = nfsd3_proc_setacl, + .pc_decode = nfs3svc_decode_setaclargs, + .pc_encode = nfs3svc_encode_setaclres, + .pc_release = nfs3svc_release_fhandle, + .pc_argsize = sizeof(struct nfsd3_setaclargs), + .pc_argzero = sizeof(struct nfsd3_setaclargs), + .pc_ressize = sizeof(struct nfsd3_attrstat), + .pc_cachetype = RC_NOCACHE, + .pc_xdrressize = ST+pAT, + .pc_name = "SETACL", + }, }; -static unsigned int nfsd_acl_count3[ARRAY_SIZE(nfsd_acl_procedures3)]; +static DEFINE_PER_CPU_ALIGNED(unsigned long, + nfsd_acl_count3[ARRAY_SIZE(nfsd_acl_procedures3)]); const struct svc_version nfsd_acl_version3 = { .vs_vers = 3, - .vs_nproc = 3, + .vs_nproc = ARRAY_SIZE(nfsd_acl_procedures3), .vs_proc = nfsd_acl_procedures3, .vs_count = nfsd_acl_count3, .vs_dispatch = nfsd_dispatch, diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c index 9eb8086ea841..42adc5461db0 100644 --- a/fs/nfsd/nfs3proc.c +++ b/fs/nfsd/nfs3proc.c @@ -8,15 +8,16 @@ #include <linux/fs.h> #include <linux/ext2_fs.h> #include <linux/magic.h> +#include <linux/namei.h> #include "cache.h" #include "xdr3.h" #include "vfs.h" +#include "filecache.h" +#include "trace.h" #define NFSDDBG_FACILITY NFSDDBG_PROC -#define RETURN_STATUS(st) { resp->status = (st); return (st); } - static int nfs3_ftypes[] = { 0, /* NF3NON */ S_IFREG, /* NF3REG */ @@ -28,13 +29,36 @@ static int nfs3_ftypes[] = { S_IFIFO, /* NF3FIFO */ }; +static __be32 nfsd3_map_status(__be32 status) +{ + switch (status) { + case nfs_ok: + break; + case nfserr_nofilehandle: + status = nfserr_badhandle; + break; + case nfserr_wrongsec: + case nfserr_file_open: + status = nfserr_acces; + break; + case nfserr_symlink_not_dir: + status = nfserr_notdir; + break; + case nfserr_symlink: + case nfserr_wrong_type: + status = nfserr_inval; + break; + } + return status; +} + /* * NULL call. 
*/ static __be32 nfsd3_proc_null(struct svc_rqst *rqstp) { - return nfs_ok; + return rpc_success; } /* @@ -45,20 +69,19 @@ nfsd3_proc_getattr(struct svc_rqst *rqstp) { struct nfsd_fhandle *argp = rqstp->rq_argp; struct nfsd3_attrstat *resp = rqstp->rq_resp; - __be32 nfserr; - dprintk("nfsd: GETATTR(3) %s\n", - SVCFH_fmt(&argp->fh)); + trace_nfsd_vfs_getattr(rqstp, &argp->fh); fh_copy(&resp->fh, &argp->fh); - nfserr = fh_verify(rqstp, &resp->fh, 0, - NFSD_MAY_NOP | NFSD_MAY_BYPASS_GSS_ON_ROOT); - if (nfserr) - RETURN_STATUS(nfserr); - - nfserr = fh_getattr(&resp->fh, &resp->stat); - - RETURN_STATUS(nfserr); + resp->status = fh_verify(rqstp, &resp->fh, 0, + NFSD_MAY_NOP | NFSD_MAY_BYPASS_GSS_ON_ROOT); + if (resp->status != nfs_ok) + goto out; + + resp->status = fh_getattr(&resp->fh, &resp->stat); +out: + resp->status = nfsd3_map_status(resp->status); + return rpc_success; } /* @@ -69,15 +92,20 @@ nfsd3_proc_setattr(struct svc_rqst *rqstp) { struct nfsd3_sattrargs *argp = rqstp->rq_argp; struct nfsd3_attrstat *resp = rqstp->rq_resp; - __be32 nfserr; + struct nfsd_attrs attrs = { + .na_iattr = &argp->attrs, + }; + const struct timespec64 *guardtime = NULL; dprintk("nfsd: SETATTR(3) %s\n", SVCFH_fmt(&argp->fh)); fh_copy(&resp->fh, &argp->fh); - nfserr = nfsd_setattr(rqstp, &resp->fh, &argp->attrs, - argp->check_guard, argp->guardtime); - RETURN_STATUS(nfserr); + if (argp->check_guard) + guardtime = &argp->guardtime; + resp->status = nfsd_setattr(rqstp, &resp->fh, &attrs, guardtime); + resp->status = nfsd3_map_status(resp->status); + return rpc_success; } /* @@ -88,7 +116,6 @@ nfsd3_proc_lookup(struct svc_rqst *rqstp) { struct nfsd3_diropargs *argp = rqstp->rq_argp; struct nfsd3_diropres *resp = rqstp->rq_resp; - __be32 nfserr; dprintk("nfsd: LOOKUP(3) %s %.*s\n", SVCFH_fmt(&argp->fh), @@ -98,11 +125,11 @@ nfsd3_proc_lookup(struct svc_rqst *rqstp) fh_copy(&resp->dirfh, &argp->fh); fh_init(&resp->fh, NFS3_FHSIZE); - nfserr = nfsd_lookup(rqstp, &resp->dirfh, - argp->name, - argp->len, - &resp->fh); - RETURN_STATUS(nfserr); + resp->status = nfsd_lookup(rqstp, &resp->dirfh, + argp->name, argp->len, + &resp->fh); + resp->status = nfsd3_map_status(resp->status); + return rpc_success; } /* @@ -113,7 +140,6 @@ nfsd3_proc_access(struct svc_rqst *rqstp) { struct nfsd3_accessargs *argp = rqstp->rq_argp; struct nfsd3_accessres *resp = rqstp->rq_resp; - __be32 nfserr; dprintk("nfsd: ACCESS(3) %s 0x%x\n", SVCFH_fmt(&argp->fh), @@ -121,8 +147,9 @@ nfsd3_proc_access(struct svc_rqst *rqstp) fh_copy(&resp->fh, &argp->fh); resp->access = argp->access; - nfserr = nfsd_access(rqstp, &resp->fh, &resp->access, NULL); - RETURN_STATUS(nfserr); + resp->status = nfsd_access(rqstp, &resp->fh, &resp->access, NULL); + resp->status = nfsd3_map_status(resp->status); + return rpc_success; } /* @@ -131,17 +158,19 @@ nfsd3_proc_access(struct svc_rqst *rqstp) static __be32 nfsd3_proc_readlink(struct svc_rqst *rqstp) { - struct nfsd3_readlinkargs *argp = rqstp->rq_argp; + struct nfsd_fhandle *argp = rqstp->rq_argp; struct nfsd3_readlinkres *resp = rqstp->rq_resp; - __be32 nfserr; dprintk("nfsd: READLINK(3) %s\n", SVCFH_fmt(&argp->fh)); /* Read the symlink. 
*/ fh_copy(&resp->fh, &argp->fh); resp->len = NFS3_MAXPATHLEN; - nfserr = nfsd_readlink(rqstp, &resp->fh, argp->buffer, &resp->len); - RETURN_STATUS(nfserr); + resp->pages = rqstp->rq_next_page++; + resp->status = nfsd_readlink(rqstp, &resp->fh, + page_address(*resp->pages), &resp->len); + resp->status = nfsd3_map_status(resp->status); + return rpc_success; } /* @@ -152,34 +181,34 @@ nfsd3_proc_read(struct svc_rqst *rqstp) { struct nfsd3_readargs *argp = rqstp->rq_argp; struct nfsd3_readres *resp = rqstp->rq_resp; - __be32 nfserr; - u32 max_blocksize = svc_max_payload(rqstp); - unsigned long cnt = min(argp->count, max_blocksize); dprintk("nfsd: READ(3) %s %lu bytes at %Lu\n", SVCFH_fmt(&argp->fh), (unsigned long) argp->count, (unsigned long long) argp->offset); + argp->count = min_t(u32, argp->count, svc_max_payload(rqstp)); + argp->count = min_t(u32, argp->count, rqstp->rq_res.buflen); + if (argp->offset > (u64)OFFSET_MAX) + argp->offset = (u64)OFFSET_MAX; + if (argp->offset + argp->count > (u64)OFFSET_MAX) + argp->count = (u64)OFFSET_MAX - argp->offset; + + resp->pages = rqstp->rq_next_page; + /* Obtain buffer pointer for payload. * 1 (status) + 22 (post_op_attr) + 1 (count) + 1 (eof) * + 1 (xdr opaque byte count) = 26 */ - resp->count = cnt; - svc_reserve_auth(rqstp, ((1 + NFS3_POST_OP_ATTR_WORDS + 3)<<2) + resp->count +4); + resp->count = argp->count; + svc_reserve_auth(rqstp, ((1 + NFS3_POST_OP_ATTR_WORDS + 3) << 2) + + resp->count + 4); fh_copy(&resp->fh, &argp->fh); - nfserr = nfsd_read(rqstp, &resp->fh, - argp->offset, - rqstp->rq_vec, argp->vlen, - &resp->count); - if (nfserr == 0) { - struct inode *inode = d_inode(resp->fh.fh_dentry); - resp->eof = nfsd_eof_on_read(cnt, resp->count, argp->offset, - inode->i_size); - } - - RETURN_STATUS(nfserr); + resp->status = nfsd_read(rqstp, &resp->fh, argp->offset, + &resp->count, &resp->eof); + resp->status = nfsd3_map_status(resp->status); + return rpc_success; } /* @@ -190,67 +219,170 @@ nfsd3_proc_write(struct svc_rqst *rqstp) { struct nfsd3_writeargs *argp = rqstp->rq_argp; struct nfsd3_writeres *resp = rqstp->rq_resp; - __be32 nfserr; unsigned long cnt = argp->len; - unsigned int nvecs; dprintk("nfsd: WRITE(3) %s %d bytes at %Lu%s\n", SVCFH_fmt(&argp->fh), argp->len, (unsigned long long) argp->offset, - argp->stable? " stable" : ""); + argp->stable ? " stable" : ""); + + resp->status = nfserr_fbig; + if (argp->offset > (u64)OFFSET_MAX || + argp->offset + argp->len > (u64)OFFSET_MAX) + return rpc_success; fh_copy(&resp->fh, &argp->fh); resp->committed = argp->stable; - nvecs = svc_fill_write_vector(rqstp, rqstp->rq_arg.pages, - &argp->first, cnt); - if (!nvecs) - RETURN_STATUS(nfserr_io); - nfserr = nfsd_write(rqstp, &resp->fh, argp->offset, - rqstp->rq_vec, nvecs, &cnt, - resp->committed); + resp->status = nfsd_write(rqstp, &resp->fh, argp->offset, + &argp->payload, &cnt, + resp->committed, resp->verf); resp->count = cnt; - RETURN_STATUS(nfserr); + resp->status = nfsd3_map_status(resp->status); + return rpc_success; } /* - * With NFSv3, CREATE processing is a lot easier than with NFSv2. - * At least in theory; we'll see how it fares in practice when the - * first reports about SunOS compatibility problems start to pour in... + * Implement NFSv3's unchecked, guarded, and exclusive CREATE + * semantics for regular files. Except for the created file, + * this operation is stateless on the server. + * + * Upon return, caller must release @fhp and @resfhp. 
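+ * For an exclusive create, the client's verifier is remembered in the new
+ * file's atime and mtime, which is why those timestamps are compared with
+ * the verifier words and then forced below.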
*/ static __be32 +nfsd3_create_file(struct svc_rqst *rqstp, struct svc_fh *fhp, + struct svc_fh *resfhp, struct nfsd3_createargs *argp) +{ + struct iattr *iap = &argp->attrs; + struct dentry *parent, *child; + struct nfsd_attrs attrs = { + .na_iattr = iap, + }; + __u32 v_mtime, v_atime; + struct inode *inode; + __be32 status; + int host_err; + + trace_nfsd_vfs_create(rqstp, fhp, S_IFREG, argp->name, argp->len); + + if (isdotent(argp->name, argp->len)) + return nfserr_exist; + if (!(iap->ia_valid & ATTR_MODE)) + iap->ia_mode = 0; + + status = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_EXEC); + if (status != nfs_ok) + return status; + + parent = fhp->fh_dentry; + inode = d_inode(parent); + + host_err = fh_want_write(fhp); + if (host_err) + return nfserrno(host_err); + + child = start_creating(&nop_mnt_idmap, parent, + &QSTR_LEN(argp->name, argp->len)); + if (IS_ERR(child)) { + status = nfserrno(PTR_ERR(child)); + goto out_write; + } + + if (d_really_is_negative(child)) { + status = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_CREATE); + if (status != nfs_ok) + goto out; + } + + status = fh_compose(resfhp, fhp->fh_export, child, fhp); + if (status != nfs_ok) + goto out; + + v_mtime = 0; + v_atime = 0; + if (argp->createmode == NFS3_CREATE_EXCLUSIVE) { + u32 *verifier = (u32 *)argp->verf; + + /* + * Solaris 7 gets confused (bugid 4218508) if these have + * the high bit set, as do xfs filesystems without the + * "bigtime" feature. So just clear the high bits. + */ + v_mtime = verifier[0] & 0x7fffffff; + v_atime = verifier[1] & 0x7fffffff; + } + + if (d_really_is_positive(child)) { + status = nfs_ok; + + switch (argp->createmode) { + case NFS3_CREATE_UNCHECKED: + if (!d_is_reg(child)) + break; + iap->ia_valid &= ATTR_SIZE; + goto set_attr; + case NFS3_CREATE_GUARDED: + status = nfserr_exist; + break; + case NFS3_CREATE_EXCLUSIVE: + if (inode_get_mtime_sec(d_inode(child)) == v_mtime && + inode_get_atime_sec(d_inode(child)) == v_atime && + d_inode(child)->i_size == 0) { + break; + } + status = nfserr_exist; + } + goto out; + } + + if (!IS_POSIXACL(inode)) + iap->ia_mode &= ~current_umask(); + + status = fh_fill_pre_attrs(fhp); + if (status != nfs_ok) + goto out; + host_err = vfs_create(&nop_mnt_idmap, child, iap->ia_mode, NULL); + if (host_err < 0) { + status = nfserrno(host_err); + goto out; + } + fh_fill_post_attrs(fhp); + + /* A newly created file already has a file size of zero. 
*/ + if ((iap->ia_valid & ATTR_SIZE) && (iap->ia_size == 0)) + iap->ia_valid &= ~ATTR_SIZE; + if (argp->createmode == NFS3_CREATE_EXCLUSIVE) { + iap->ia_valid = ATTR_MTIME | ATTR_ATIME | + ATTR_MTIME_SET | ATTR_ATIME_SET; + iap->ia_mtime.tv_sec = v_mtime; + iap->ia_atime.tv_sec = v_atime; + iap->ia_mtime.tv_nsec = 0; + iap->ia_atime.tv_nsec = 0; + } + +set_attr: + status = nfsd_create_setattr(rqstp, fhp, resfhp, &attrs); + +out: + end_creating(child); +out_write: + fh_drop_write(fhp); + return status; +} + +static __be32 nfsd3_proc_create(struct svc_rqst *rqstp) { struct nfsd3_createargs *argp = rqstp->rq_argp; struct nfsd3_diropres *resp = rqstp->rq_resp; - svc_fh *dirfhp, *newfhp = NULL; - struct iattr *attr; - __be32 nfserr; - - dprintk("nfsd: CREATE(3) %s %.*s\n", - SVCFH_fmt(&argp->fh), - argp->len, - argp->name); + svc_fh *dirfhp, *newfhp; dirfhp = fh_copy(&resp->dirfh, &argp->fh); newfhp = fh_init(&resp->fh, NFS3_FHSIZE); - attr = &argp->attrs; - - /* Unfudge the mode bits */ - attr->ia_mode &= ~S_IFMT; - if (!(attr->ia_valid & ATTR_MODE)) { - attr->ia_valid |= ATTR_MODE; - attr->ia_mode = S_IFREG; - } else { - attr->ia_mode = (attr->ia_mode & ~S_IFMT) | S_IFREG; - } - /* Now create the file and set attributes */ - nfserr = do_nfsd_create(rqstp, dirfhp, argp->name, argp->len, - attr, newfhp, - argp->createmode, (u32 *)argp->verf, NULL, NULL); - - RETURN_STATUS(nfserr); + resp->status = nfsd3_create_file(rqstp, dirfhp, newfhp, argp); + resp->status = nfsd3_map_status(resp->status); + return rpc_success; } /* @@ -261,20 +393,17 @@ nfsd3_proc_mkdir(struct svc_rqst *rqstp) { struct nfsd3_createargs *argp = rqstp->rq_argp; struct nfsd3_diropres *resp = rqstp->rq_resp; - __be32 nfserr; - - dprintk("nfsd: MKDIR(3) %s %.*s\n", - SVCFH_fmt(&argp->fh), - argp->len, - argp->name); + struct nfsd_attrs attrs = { + .na_iattr = &argp->attrs, + }; argp->attrs.ia_valid &= ~ATTR_SIZE; fh_copy(&resp->dirfh, &argp->fh); fh_init(&resp->fh, NFS3_FHSIZE); - nfserr = nfsd_create(rqstp, &resp->dirfh, argp->name, argp->len, - &argp->attrs, S_IFDIR, 0, &resp->fh); - fh_unlock(&resp->dirfh); - RETURN_STATUS(nfserr); + resp->status = nfsd_create(rqstp, &resp->dirfh, argp->name, argp->len, + &attrs, S_IFDIR, 0, &resp->fh); + resp->status = nfsd3_map_status(resp->status); + return rpc_success; } static __be32 @@ -282,30 +411,35 @@ nfsd3_proc_symlink(struct svc_rqst *rqstp) { struct nfsd3_symlinkargs *argp = rqstp->rq_argp; struct nfsd3_diropres *resp = rqstp->rq_resp; - __be32 nfserr; + struct nfsd_attrs attrs = { + .na_iattr = &argp->attrs, + }; - if (argp->tlen == 0) - RETURN_STATUS(nfserr_inval); - if (argp->tlen > NFS3_MAXPATHLEN) - RETURN_STATUS(nfserr_nametoolong); + if (argp->tlen == 0) { + resp->status = nfserr_inval; + goto out; + } + if (argp->tlen > NFS3_MAXPATHLEN) { + resp->status = nfserr_nametoolong; + goto out; + } argp->tname = svc_fill_symlink_pathname(rqstp, &argp->first, page_address(rqstp->rq_arg.pages[0]), argp->tlen); - if (IS_ERR(argp->tname)) - RETURN_STATUS(nfserrno(PTR_ERR(argp->tname))); - - dprintk("nfsd: SYMLINK(3) %s %.*s -> %.*s\n", - SVCFH_fmt(&argp->ffh), - argp->flen, argp->fname, - argp->tlen, argp->tname); + if (IS_ERR(argp->tname)) { + resp->status = nfserrno(PTR_ERR(argp->tname)); + goto out; + } fh_copy(&resp->dirfh, &argp->ffh); fh_init(&resp->fh, NFS3_FHSIZE); - nfserr = nfsd_symlink(rqstp, &resp->dirfh, argp->fname, argp->flen, - argp->tname, &resp->fh); + resp->status = nfsd_symlink(rqstp, &resp->dirfh, argp->fname, + argp->flen, argp->tname, &attrs, &resp->fh); 
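SYMLINK above rejects an empty target with nfserr_inval and an over-long one with nfserr_nametoolong before the target is copied out of the request pages; the name component of every dirop is separately validated at decode time (see svcxdr_decode_filename3() later in this patch), which refuses empty or over-long names and any name containing '/' or NUL. Both rules appear in the small sketch below; the constants mirror NFS3_MAXNAMLEN (255) and NFS3_MAXPATHLEN (1024), and the enum values are illustrative rather than the on-the-wire status codes.

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

#define NFS3_MAXNAMLEN  255     /* mirrors the protocol limits */
#define NFS3_MAXPATHLEN 1024

enum nfs3_status { NFS3_OK, NFS3ERR_INVAL, NFS3ERR_NAMETOOLONG };

/* Filename rule enforced at decode time: non-empty, bounded, no '/' or NUL. */
static bool filename_ok(const char *name, size_t len)
{
        size_t i;

        if (len == 0 || len > NFS3_MAXNAMLEN)
                return false;
        for (i = 0; i < len; i++)
                if (name[i] == '\0' || name[i] == '/')
                        return false;
        return true;
}

/* Target-length rule enforced in the SYMLINK procedure itself. */
static enum nfs3_status symlink_target_status(size_t tlen)
{
        if (tlen == 0)
                return NFS3ERR_INVAL;
        if (tlen > NFS3_MAXPATHLEN)
                return NFS3ERR_NAMETOOLONG;
        return NFS3_OK;
}

int main(void)
{
        printf("%d %d\n", filename_ok("a/b", 3), symlink_target_status(0));
        return 0;
}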
kfree(argp->tname); - RETURN_STATUS(nfserr); +out: + resp->status = nfsd3_map_status(resp->status); + return rpc_success; } /* @@ -316,34 +450,33 @@ nfsd3_proc_mknod(struct svc_rqst *rqstp) { struct nfsd3_mknodargs *argp = rqstp->rq_argp; struct nfsd3_diropres *resp = rqstp->rq_resp; - __be32 nfserr; + struct nfsd_attrs attrs = { + .na_iattr = &argp->attrs, + }; int type; dev_t rdev = 0; - dprintk("nfsd: MKNOD(3) %s %.*s\n", - SVCFH_fmt(&argp->fh), - argp->len, - argp->name); - fh_copy(&resp->dirfh, &argp->fh); fh_init(&resp->fh, NFS3_FHSIZE); - if (argp->ftype == 0 || argp->ftype >= NF3BAD) - RETURN_STATUS(nfserr_inval); if (argp->ftype == NF3CHR || argp->ftype == NF3BLK) { rdev = MKDEV(argp->major, argp->minor); if (MAJOR(rdev) != argp->major || - MINOR(rdev) != argp->minor) - RETURN_STATUS(nfserr_inval); - } else - if (argp->ftype != NF3SOCK && argp->ftype != NF3FIFO) - RETURN_STATUS(nfserr_inval); + MINOR(rdev) != argp->minor) { + resp->status = nfserr_inval; + goto out; + } + } else if (argp->ftype != NF3SOCK && argp->ftype != NF3FIFO) { + resp->status = nfserr_badtype; + goto out; + } type = nfs3_ftypes[argp->ftype]; - nfserr = nfsd_create(rqstp, &resp->dirfh, argp->name, argp->len, - &argp->attrs, type, rdev, &resp->fh); - fh_unlock(&resp->dirfh); - RETURN_STATUS(nfserr); + resp->status = nfsd_create(rqstp, &resp->dirfh, argp->name, argp->len, + &attrs, type, rdev, &resp->fh); +out: + resp->status = nfsd3_map_status(resp->status); + return rpc_success; } /* @@ -354,18 +487,13 @@ nfsd3_proc_remove(struct svc_rqst *rqstp) { struct nfsd3_diropargs *argp = rqstp->rq_argp; struct nfsd3_attrstat *resp = rqstp->rq_resp; - __be32 nfserr; - - dprintk("nfsd: REMOVE(3) %s %.*s\n", - SVCFH_fmt(&argp->fh), - argp->len, - argp->name); /* Unlink. -S_IFDIR means file must not be a directory */ fh_copy(&resp->fh, &argp->fh); - nfserr = nfsd_unlink(rqstp, &resp->fh, -S_IFDIR, argp->name, argp->len); - fh_unlock(&resp->fh); - RETURN_STATUS(nfserr); + resp->status = nfsd_unlink(rqstp, &resp->fh, -S_IFDIR, + argp->name, argp->len); + resp->status = nfsd3_map_status(resp->status); + return rpc_success; } /* @@ -376,17 +504,12 @@ nfsd3_proc_rmdir(struct svc_rqst *rqstp) { struct nfsd3_diropargs *argp = rqstp->rq_argp; struct nfsd3_attrstat *resp = rqstp->rq_resp; - __be32 nfserr; - - dprintk("nfsd: RMDIR(3) %s %.*s\n", - SVCFH_fmt(&argp->fh), - argp->len, - argp->name); fh_copy(&resp->fh, &argp->fh); - nfserr = nfsd_unlink(rqstp, &resp->fh, S_IFDIR, argp->name, argp->len); - fh_unlock(&resp->fh); - RETURN_STATUS(nfserr); + resp->status = nfsd_unlink(rqstp, &resp->fh, S_IFDIR, + argp->name, argp->len); + resp->status = nfsd3_map_status(resp->status); + return rpc_success; } static __be32 @@ -394,22 +517,13 @@ nfsd3_proc_rename(struct svc_rqst *rqstp) { struct nfsd3_renameargs *argp = rqstp->rq_argp; struct nfsd3_renameres *resp = rqstp->rq_resp; - __be32 nfserr; - - dprintk("nfsd: RENAME(3) %s %.*s ->\n", - SVCFH_fmt(&argp->ffh), - argp->flen, - argp->fname); - dprintk("nfsd: -> %s %.*s\n", - SVCFH_fmt(&argp->tfh), - argp->tlen, - argp->tname); fh_copy(&resp->ffh, &argp->ffh); fh_copy(&resp->tfh, &argp->tfh); - nfserr = nfsd_rename(rqstp, &resp->ffh, argp->fname, argp->flen, - &resp->tfh, argp->tname, argp->tlen); - RETURN_STATUS(nfserr); + resp->status = nfsd_rename(rqstp, &resp->ffh, argp->fname, argp->flen, + &resp->tfh, argp->tname, argp->tlen); + resp->status = nfsd3_map_status(resp->status); + return rpc_success; } static __be32 @@ -417,20 +531,33 @@ nfsd3_proc_link(struct svc_rqst *rqstp) { struct 
nfsd3_linkargs *argp = rqstp->rq_argp; struct nfsd3_linkres *resp = rqstp->rq_resp; - __be32 nfserr; - - dprintk("nfsd: LINK(3) %s ->\n", - SVCFH_fmt(&argp->ffh)); - dprintk("nfsd: -> %s %.*s\n", - SVCFH_fmt(&argp->tfh), - argp->tlen, - argp->tname); fh_copy(&resp->fh, &argp->ffh); fh_copy(&resp->tfh, &argp->tfh); - nfserr = nfsd_link(rqstp, &resp->tfh, argp->tname, argp->tlen, - &resp->fh); - RETURN_STATUS(nfserr); + resp->status = nfsd_link(rqstp, &resp->tfh, argp->tname, argp->tlen, + &resp->fh); + resp->status = nfsd3_map_status(resp->status); + return rpc_success; +} + +static void nfsd3_init_dirlist_pages(struct svc_rqst *rqstp, + struct nfsd3_readdirres *resp, + u32 count) +{ + struct xdr_buf *buf = &resp->dirlist; + struct xdr_stream *xdr = &resp->xdr; + unsigned int sendbuf = min_t(unsigned int, rqstp->rq_res.buflen, + svc_max_payload(rqstp)); + + memset(buf, 0, sizeof(*buf)); + + /* Reserve room for the NULL ptr & eof flag (-2 words) */ + buf->buflen = clamp(count, (u32)(XDR_UNIT * 2), sendbuf); + buf->buflen -= XDR_UNIT * 2; + buf->pages = rqstp->rq_next_page; + rqstp->rq_next_page += (buf->buflen + PAGE_SIZE - 1) >> PAGE_SHIFT; + + xdr_init_encode_pages(xdr, buf); } /* @@ -441,32 +568,27 @@ nfsd3_proc_readdir(struct svc_rqst *rqstp) { struct nfsd3_readdirargs *argp = rqstp->rq_argp; struct nfsd3_readdirres *resp = rqstp->rq_resp; - __be32 nfserr; - int count; + loff_t offset; - dprintk("nfsd: READDIR(3) %s %d bytes at %d\n", - SVCFH_fmt(&argp->fh), - argp->count, (u32) argp->cookie); + trace_nfsd_vfs_readdir(rqstp, &argp->fh, argp->count, argp->cookie); - /* Make sure we've room for the NULL ptr & eof flag, and shrink to - * client read size */ - count = (argp->count >> 2) - 2; + nfsd3_init_dirlist_pages(rqstp, resp, argp->count); - /* Read directory and encode entries on the fly */ fh_copy(&resp->fh, &argp->fh); - - resp->buflen = count; resp->common.err = nfs_ok; - resp->buffer = argp->buffer; + resp->cookie_offset = 0; resp->rqstp = rqstp; - nfserr = nfsd_readdir(rqstp, &resp->fh, (loff_t*) &argp->cookie, - &resp->common, nfs3svc_encode_entry); + offset = argp->cookie; + resp->status = nfsd_readdir(rqstp, &resp->fh, &offset, + &resp->common, nfs3svc_encode_entry3); memcpy(resp->verf, argp->verf, 8); - resp->count = resp->buffer - argp->buffer; - if (resp->offset) - xdr_encode_hyper(resp->offset, argp->cookie); + nfs3svc_encode_cookie3(resp, offset); - RETURN_STATUS(nfserr); + /* Recycle only pages that were part of the reply */ + rqstp->rq_next_page = resp->xdr.page_ptr + 1; + + resp->status = nfsd3_map_status(resp->status); + return rpc_success; } /* @@ -478,64 +600,38 @@ nfsd3_proc_readdirplus(struct svc_rqst *rqstp) { struct nfsd3_readdirargs *argp = rqstp->rq_argp; struct nfsd3_readdirres *resp = rqstp->rq_resp; - __be32 nfserr; - int count = 0; loff_t offset; - struct page **p; - caddr_t page_addr = NULL; - dprintk("nfsd: READDIR+(3) %s %d bytes at %d\n", - SVCFH_fmt(&argp->fh), - argp->count, (u32) argp->cookie); + trace_nfsd_vfs_readdir(rqstp, &argp->fh, argp->count, argp->cookie); - /* Convert byte count to number of words (i.e. 
>> 2), - * and reserve room for the NULL ptr & eof flag (-2 words) */ - resp->count = (argp->count >> 2) - 2; + nfsd3_init_dirlist_pages(rqstp, resp, argp->count); - /* Read directory and encode entries on the fly */ fh_copy(&resp->fh, &argp->fh); - resp->common.err = nfs_ok; - resp->buffer = argp->buffer; - resp->buflen = resp->count; + resp->cookie_offset = 0; resp->rqstp = rqstp; offset = argp->cookie; - nfserr = fh_verify(rqstp, &resp->fh, S_IFDIR, NFSD_MAY_NOP); - if (nfserr) - RETURN_STATUS(nfserr); + resp->status = fh_verify(rqstp, &resp->fh, S_IFDIR, NFSD_MAY_NOP); + if (resp->status != nfs_ok) + goto out; - if (resp->fh.fh_export->ex_flags & NFSEXP_NOREADDIRPLUS) - RETURN_STATUS(nfserr_notsupp); + if (resp->fh.fh_export->ex_flags & NFSEXP_NOREADDIRPLUS) { + resp->status = nfserr_notsupp; + goto out; + } - nfserr = nfsd_readdir(rqstp, &resp->fh, - &offset, - &resp->common, - nfs3svc_encode_entry_plus); + resp->status = nfsd_readdir(rqstp, &resp->fh, &offset, + &resp->common, nfs3svc_encode_entryplus3); memcpy(resp->verf, argp->verf, 8); - for (p = rqstp->rq_respages + 1; p < rqstp->rq_next_page; p++) { - page_addr = page_address(*p); + nfs3svc_encode_cookie3(resp, offset); - if (((caddr_t)resp->buffer >= page_addr) && - ((caddr_t)resp->buffer < page_addr + PAGE_SIZE)) { - count += (caddr_t)resp->buffer - page_addr; - break; - } - count += PAGE_SIZE; - } - resp->count = count >> 2; - if (resp->offset) { - if (unlikely(resp->offset1)) { - /* we ended up with offset on a page boundary */ - *resp->offset = htonl(offset >> 32); - *resp->offset1 = htonl(offset & 0xffffffff); - resp->offset1 = NULL; - } else { - xdr_encode_hyper(resp->offset, offset); - } - } + /* Recycle only pages that were part of the reply */ + rqstp->rq_next_page = resp->xdr.page_ptr + 1; - RETURN_STATUS(nfserr); +out: + resp->status = nfsd3_map_status(resp->status); + return rpc_success; } /* @@ -546,14 +642,11 @@ nfsd3_proc_fsstat(struct svc_rqst *rqstp) { struct nfsd_fhandle *argp = rqstp->rq_argp; struct nfsd3_fsstatres *resp = rqstp->rq_resp; - __be32 nfserr; - - dprintk("nfsd: FSSTAT(3) %s\n", - SVCFH_fmt(&argp->fh)); - nfserr = nfsd_statfs(rqstp, &argp->fh, &resp->stats, 0); + resp->status = nfsd_statfs(rqstp, &argp->fh, &resp->stats, 0); fh_put(&argp->fh); - RETURN_STATUS(nfserr); + resp->status = nfsd3_map_status(resp->status); + return rpc_success; } /* @@ -564,7 +657,6 @@ nfsd3_proc_fsinfo(struct svc_rqst *rqstp) { struct nfsd_fhandle *argp = rqstp->rq_argp; struct nfsd3_fsinfores *resp = rqstp->rq_resp; - __be32 nfserr; u32 max_blocksize = svc_max_payload(rqstp); dprintk("nfsd: FSINFO(3) %s\n", @@ -576,17 +668,17 @@ nfsd3_proc_fsinfo(struct svc_rqst *rqstp) resp->f_wtmax = max_blocksize; resp->f_wtpref = max_blocksize; resp->f_wtmult = PAGE_SIZE; - resp->f_dtpref = PAGE_SIZE; + resp->f_dtpref = max_blocksize; resp->f_maxfilesize = ~(u32) 0; resp->f_properties = NFS3_FSF_DEFAULT; - nfserr = fh_verify(rqstp, &argp->fh, 0, - NFSD_MAY_NOP | NFSD_MAY_BYPASS_GSS_ON_ROOT); + resp->status = fh_verify(rqstp, &argp->fh, 0, + NFSD_MAY_NOP | NFSD_MAY_BYPASS_GSS_ON_ROOT); /* Check special features of the file system. 
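Both READDIR and READDIRPLUS now size their reply through nfsd3_init_dirlist_pages() (shown a little earlier): the client's count is clamped between two XDR words and the usable send buffer, two words are then kept back for the final "no entry follows" and EOF booleans, and enough reply pages are claimed to hold what remains. The same arithmetic as a stand-alone sketch, with XDR_UNIT and PAGE_SIZE defined locally and sample limits in main():

#include <stdint.h>
#include <stdio.h>

#define XDR_UNIT        4u
#define PAGE_SIZE       4096u

struct dirlist_plan {
        uint32_t buflen;        /* bytes available for dirlist entries */
        uint32_t npages;        /* reply pages to reserve for them */
};

static uint32_t clampu(uint32_t v, uint32_t lo, uint32_t hi)
{
        return v < lo ? lo : (v > hi ? hi : v);
}

/*
 * Mirror the sizing above: clamp the client's count to the send buffer,
 * hold back the trailing "no entry follows" and eof words, and round
 * the remainder up to whole pages.
 */
static struct dirlist_plan plan_dirlist(uint32_t count, uint32_t res_buflen,
                                        uint32_t max_payload)
{
        uint32_t sendbuf = res_buflen < max_payload ? res_buflen : max_payload;
        struct dirlist_plan p;

        p.buflen = clampu(count, XDR_UNIT * 2, sendbuf) - XDR_UNIT * 2;
        p.npages = (p.buflen + PAGE_SIZE - 1) / PAGE_SIZE;
        return p;
}

int main(void)
{
        struct dirlist_plan p = plan_dirlist(32768, 1 << 20, 1 << 20);

        printf("buflen=%u pages=%u\n", p.buflen, p.npages);
        return 0;
}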
May request * different read/write sizes for file systems known to have * problems with large blocks */ - if (nfserr == 0) { + if (resp->status == nfs_ok) { struct super_block *sb = argp->fh.fh_dentry->d_sb; /* Note that we don't care for remote fs's here */ @@ -597,7 +689,8 @@ nfsd3_proc_fsinfo(struct svc_rqst *rqstp) } fh_put(&argp->fh); - RETURN_STATUS(nfserr); + resp->status = nfsd3_map_status(resp->status); + return rpc_success; } /* @@ -608,7 +701,6 @@ nfsd3_proc_pathconf(struct svc_rqst *rqstp) { struct nfsd_fhandle *argp = rqstp->rq_argp; struct nfsd3_pathconfres *resp = rqstp->rq_resp; - __be32 nfserr; dprintk("nfsd: PATHCONF(3) %s\n", SVCFH_fmt(&argp->fh)); @@ -621,9 +713,9 @@ nfsd3_proc_pathconf(struct svc_rqst *rqstp) resp->p_case_insensitive = 0; resp->p_case_preserving = 1; - nfserr = fh_verify(rqstp, &argp->fh, 0, NFSD_MAY_NOP); + resp->status = fh_verify(rqstp, &argp->fh, 0, NFSD_MAY_NOP); - if (nfserr == 0) { + if (resp->status == nfs_ok) { struct super_block *sb = argp->fh.fh_dentry->d_sb; /* Note that we don't care for remote fs's here */ @@ -640,10 +732,10 @@ nfsd3_proc_pathconf(struct svc_rqst *rqstp) } fh_put(&argp->fh); - RETURN_STATUS(nfserr); + resp->status = nfsd3_map_status(resp->status); + return rpc_success; } - /* * Commit a file (range) to stable storage. */ @@ -652,20 +744,24 @@ nfsd3_proc_commit(struct svc_rqst *rqstp) { struct nfsd3_commitargs *argp = rqstp->rq_argp; struct nfsd3_commitres *resp = rqstp->rq_resp; - __be32 nfserr; + struct nfsd_file *nf; dprintk("nfsd: COMMIT(3) %s %u@%Lu\n", SVCFH_fmt(&argp->fh), argp->count, (unsigned long long) argp->offset); - if (argp->offset > NFS_OFFSET_MAX) - RETURN_STATUS(nfserr_inval); - fh_copy(&resp->fh, &argp->fh); - nfserr = nfsd_commit(rqstp, &resp->fh, argp->offset, argp->count); - - RETURN_STATUS(nfserr); + resp->status = nfsd_file_acquire_gc(rqstp, &resp->fh, NFSD_MAY_WRITE | + NFSD_MAY_NOT_BREAK_LEASE, &nf); + if (resp->status) + goto out; + resp->status = nfsd_commit(rqstp, &resp->fh, nf, argp->offset, + argp->count, resp->verf); + nfsd_file_put(nf); +out: + resp->status = nfsd3_map_status(resp->status); + return rpc_success; } @@ -673,18 +769,14 @@ nfsd3_proc_commit(struct svc_rqst *rqstp) * NFSv3 Server procedures. * Only the results of non-idempotent operations are cached. 
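COMMIT above now takes a reference to a cached nfsd_file and hands the client's (offset, count) pair plus a write verifier to nfsd_commit(). Per RFC 1813, a count of zero asks the server to flush everything from offset to the end of the file. A sketch of that rule only, turning the wire arguments into an inclusive byte range; the helper and range representation are illustrative, not how nfsd_commit() is written.

#include <stdint.h>
#include <stdio.h>

/*
 * RFC 1813: a COMMIT count of zero means flush from offset through
 * the end of the file.
 */
struct byte_range {
        uint64_t start;
        uint64_t end;           /* inclusive; UINT64_MAX == end of file */
};

static struct byte_range commit_range(uint64_t offset, uint32_t count)
{
        struct byte_range r = { .start = offset, .end = UINT64_MAX };

        if (count)
                r.end = offset + count - 1;
        return r;
}

int main(void)
{
        struct byte_range a = commit_range(4096, 0);
        struct byte_range b = commit_range(4096, 8192);

        printf("a=[%llu,%llu] b=[%llu,%llu]\n",
               (unsigned long long)a.start, (unsigned long long)a.end,
               (unsigned long long)b.start, (unsigned long long)b.end);
        return 0;
}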
*/ -#define nfs3svc_decode_fhandleargs nfs3svc_decode_fhandle #define nfs3svc_encode_attrstatres nfs3svc_encode_attrstat #define nfs3svc_encode_wccstatres nfs3svc_encode_wccstat #define nfsd3_mkdirargs nfsd3_createargs #define nfsd3_readdirplusargs nfsd3_readdirargs #define nfsd3_fhandleargs nfsd_fhandle -#define nfsd3_fhandleres nfsd3_attrstat #define nfsd3_attrstatres nfsd3_attrstat #define nfsd3_wccstatres nfsd3_attrstat #define nfsd3_createres nfsd3_diropres -#define nfsd3_voidres nfsd3_voidargs -struct nfsd3_voidargs { int dummy; }; #define ST 1 /* status*/ #define FH 17 /* filehandle with length */ @@ -695,21 +787,26 @@ struct nfsd3_voidargs { int dummy; }; static const struct svc_procedure nfsd_procedures3[22] = { [NFS3PROC_NULL] = { .pc_func = nfsd3_proc_null, - .pc_encode = nfs3svc_encode_voidres, - .pc_argsize = sizeof(struct nfsd3_voidargs), - .pc_ressize = sizeof(struct nfsd3_voidres), + .pc_decode = nfssvc_decode_voidarg, + .pc_encode = nfssvc_encode_voidres, + .pc_argsize = sizeof(struct nfsd_voidargs), + .pc_argzero = sizeof(struct nfsd_voidargs), + .pc_ressize = sizeof(struct nfsd_voidres), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = ST, + .pc_name = "NULL", }, [NFS3PROC_GETATTR] = { .pc_func = nfsd3_proc_getattr, .pc_decode = nfs3svc_decode_fhandleargs, - .pc_encode = nfs3svc_encode_attrstatres, + .pc_encode = nfs3svc_encode_getattrres, .pc_release = nfs3svc_release_fhandle, - .pc_argsize = sizeof(struct nfsd3_fhandleargs), + .pc_argsize = sizeof(struct nfsd_fhandle), + .pc_argzero = sizeof(struct nfsd_fhandle), .pc_ressize = sizeof(struct nfsd3_attrstatres), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = ST+AT, + .pc_name = "GETATTR", }, [NFS3PROC_SETATTR] = { .pc_func = nfsd3_proc_setattr, @@ -717,19 +814,23 @@ static const struct svc_procedure nfsd_procedures3[22] = { .pc_encode = nfs3svc_encode_wccstatres, .pc_release = nfs3svc_release_fhandle, .pc_argsize = sizeof(struct nfsd3_sattrargs), + .pc_argzero = sizeof(struct nfsd3_sattrargs), .pc_ressize = sizeof(struct nfsd3_wccstatres), .pc_cachetype = RC_REPLBUFF, .pc_xdrressize = ST+WC, + .pc_name = "SETATTR", }, [NFS3PROC_LOOKUP] = { .pc_func = nfsd3_proc_lookup, .pc_decode = nfs3svc_decode_diropargs, - .pc_encode = nfs3svc_encode_diropres, + .pc_encode = nfs3svc_encode_lookupres, .pc_release = nfs3svc_release_fhandle2, .pc_argsize = sizeof(struct nfsd3_diropargs), + .pc_argzero = sizeof(struct nfsd3_diropargs), .pc_ressize = sizeof(struct nfsd3_diropres), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = ST+FH+pAT+pAT, + .pc_name = "LOOKUP", }, [NFS3PROC_ACCESS] = { .pc_func = nfsd3_proc_access, @@ -737,19 +838,23 @@ static const struct svc_procedure nfsd_procedures3[22] = { .pc_encode = nfs3svc_encode_accessres, .pc_release = nfs3svc_release_fhandle, .pc_argsize = sizeof(struct nfsd3_accessargs), + .pc_argzero = sizeof(struct nfsd3_accessargs), .pc_ressize = sizeof(struct nfsd3_accessres), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = ST+pAT+1, + .pc_name = "ACCESS", }, [NFS3PROC_READLINK] = { .pc_func = nfsd3_proc_readlink, - .pc_decode = nfs3svc_decode_readlinkargs, + .pc_decode = nfs3svc_decode_fhandleargs, .pc_encode = nfs3svc_encode_readlinkres, .pc_release = nfs3svc_release_fhandle, - .pc_argsize = sizeof(struct nfsd3_readlinkargs), + .pc_argsize = sizeof(struct nfsd_fhandle), + .pc_argzero = sizeof(struct nfsd_fhandle), .pc_ressize = sizeof(struct nfsd3_readlinkres), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = ST+pAT+1+NFS3_MAXPATHLEN/4, + .pc_name = "READLINK", }, [NFS3PROC_READ] = { .pc_func = nfsd3_proc_read, @@ 
-757,9 +862,11 @@ static const struct svc_procedure nfsd_procedures3[22] = { .pc_encode = nfs3svc_encode_readres, .pc_release = nfs3svc_release_fhandle, .pc_argsize = sizeof(struct nfsd3_readargs), + .pc_argzero = sizeof(struct nfsd3_readargs), .pc_ressize = sizeof(struct nfsd3_readres), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = ST+pAT+4+NFSSVC_MAXBLKSIZE/4, + .pc_name = "READ", }, [NFS3PROC_WRITE] = { .pc_func = nfsd3_proc_write, @@ -767,9 +874,11 @@ static const struct svc_procedure nfsd_procedures3[22] = { .pc_encode = nfs3svc_encode_writeres, .pc_release = nfs3svc_release_fhandle, .pc_argsize = sizeof(struct nfsd3_writeargs), + .pc_argzero = sizeof(struct nfsd3_writeargs), .pc_ressize = sizeof(struct nfsd3_writeres), .pc_cachetype = RC_REPLBUFF, .pc_xdrressize = ST+WC+4, + .pc_name = "WRITE", }, [NFS3PROC_CREATE] = { .pc_func = nfsd3_proc_create, @@ -777,9 +886,11 @@ static const struct svc_procedure nfsd_procedures3[22] = { .pc_encode = nfs3svc_encode_createres, .pc_release = nfs3svc_release_fhandle2, .pc_argsize = sizeof(struct nfsd3_createargs), + .pc_argzero = sizeof(struct nfsd3_createargs), .pc_ressize = sizeof(struct nfsd3_createres), .pc_cachetype = RC_REPLBUFF, .pc_xdrressize = ST+(1+FH+pAT)+WC, + .pc_name = "CREATE", }, [NFS3PROC_MKDIR] = { .pc_func = nfsd3_proc_mkdir, @@ -787,9 +898,11 @@ static const struct svc_procedure nfsd_procedures3[22] = { .pc_encode = nfs3svc_encode_createres, .pc_release = nfs3svc_release_fhandle2, .pc_argsize = sizeof(struct nfsd3_mkdirargs), + .pc_argzero = sizeof(struct nfsd3_mkdirargs), .pc_ressize = sizeof(struct nfsd3_createres), .pc_cachetype = RC_REPLBUFF, .pc_xdrressize = ST+(1+FH+pAT)+WC, + .pc_name = "MKDIR", }, [NFS3PROC_SYMLINK] = { .pc_func = nfsd3_proc_symlink, @@ -797,9 +910,11 @@ static const struct svc_procedure nfsd_procedures3[22] = { .pc_encode = nfs3svc_encode_createres, .pc_release = nfs3svc_release_fhandle2, .pc_argsize = sizeof(struct nfsd3_symlinkargs), + .pc_argzero = sizeof(struct nfsd3_symlinkargs), .pc_ressize = sizeof(struct nfsd3_createres), .pc_cachetype = RC_REPLBUFF, .pc_xdrressize = ST+(1+FH+pAT)+WC, + .pc_name = "SYMLINK", }, [NFS3PROC_MKNOD] = { .pc_func = nfsd3_proc_mknod, @@ -807,9 +922,11 @@ static const struct svc_procedure nfsd_procedures3[22] = { .pc_encode = nfs3svc_encode_createres, .pc_release = nfs3svc_release_fhandle2, .pc_argsize = sizeof(struct nfsd3_mknodargs), + .pc_argzero = sizeof(struct nfsd3_mknodargs), .pc_ressize = sizeof(struct nfsd3_createres), .pc_cachetype = RC_REPLBUFF, .pc_xdrressize = ST+(1+FH+pAT)+WC, + .pc_name = "MKNOD", }, [NFS3PROC_REMOVE] = { .pc_func = nfsd3_proc_remove, @@ -817,9 +934,11 @@ static const struct svc_procedure nfsd_procedures3[22] = { .pc_encode = nfs3svc_encode_wccstatres, .pc_release = nfs3svc_release_fhandle, .pc_argsize = sizeof(struct nfsd3_diropargs), + .pc_argzero = sizeof(struct nfsd3_diropargs), .pc_ressize = sizeof(struct nfsd3_wccstatres), .pc_cachetype = RC_REPLBUFF, .pc_xdrressize = ST+WC, + .pc_name = "REMOVE", }, [NFS3PROC_RMDIR] = { .pc_func = nfsd3_proc_rmdir, @@ -827,9 +946,11 @@ static const struct svc_procedure nfsd_procedures3[22] = { .pc_encode = nfs3svc_encode_wccstatres, .pc_release = nfs3svc_release_fhandle, .pc_argsize = sizeof(struct nfsd3_diropargs), + .pc_argzero = sizeof(struct nfsd3_diropargs), .pc_ressize = sizeof(struct nfsd3_wccstatres), .pc_cachetype = RC_REPLBUFF, .pc_xdrressize = ST+WC, + .pc_name = "RMDIR", }, [NFS3PROC_RENAME] = { .pc_func = nfsd3_proc_rename, @@ -837,9 +958,11 @@ static const struct svc_procedure 
nfsd_procedures3[22] = { .pc_encode = nfs3svc_encode_renameres, .pc_release = nfs3svc_release_fhandle2, .pc_argsize = sizeof(struct nfsd3_renameargs), + .pc_argzero = sizeof(struct nfsd3_renameargs), .pc_ressize = sizeof(struct nfsd3_renameres), .pc_cachetype = RC_REPLBUFF, .pc_xdrressize = ST+WC+WC, + .pc_name = "RENAME", }, [NFS3PROC_LINK] = { .pc_func = nfsd3_proc_link, @@ -847,9 +970,11 @@ static const struct svc_procedure nfsd_procedures3[22] = { .pc_encode = nfs3svc_encode_linkres, .pc_release = nfs3svc_release_fhandle2, .pc_argsize = sizeof(struct nfsd3_linkargs), + .pc_argzero = sizeof(struct nfsd3_linkargs), .pc_ressize = sizeof(struct nfsd3_linkres), .pc_cachetype = RC_REPLBUFF, .pc_xdrressize = ST+pAT+WC, + .pc_name = "LINK", }, [NFS3PROC_READDIR] = { .pc_func = nfsd3_proc_readdir, @@ -857,8 +982,10 @@ static const struct svc_procedure nfsd_procedures3[22] = { .pc_encode = nfs3svc_encode_readdirres, .pc_release = nfs3svc_release_fhandle, .pc_argsize = sizeof(struct nfsd3_readdirargs), + .pc_argzero = sizeof(struct nfsd3_readdirargs), .pc_ressize = sizeof(struct nfsd3_readdirres), .pc_cachetype = RC_NOCACHE, + .pc_name = "READDIR", }, [NFS3PROC_READDIRPLUS] = { .pc_func = nfsd3_proc_readdirplus, @@ -866,35 +993,43 @@ static const struct svc_procedure nfsd_procedures3[22] = { .pc_encode = nfs3svc_encode_readdirres, .pc_release = nfs3svc_release_fhandle, .pc_argsize = sizeof(struct nfsd3_readdirplusargs), + .pc_argzero = sizeof(struct nfsd3_readdirplusargs), .pc_ressize = sizeof(struct nfsd3_readdirres), .pc_cachetype = RC_NOCACHE, + .pc_name = "READDIRPLUS", }, [NFS3PROC_FSSTAT] = { .pc_func = nfsd3_proc_fsstat, .pc_decode = nfs3svc_decode_fhandleargs, .pc_encode = nfs3svc_encode_fsstatres, .pc_argsize = sizeof(struct nfsd3_fhandleargs), + .pc_argzero = sizeof(struct nfsd3_fhandleargs), .pc_ressize = sizeof(struct nfsd3_fsstatres), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = ST+pAT+2*6+1, + .pc_name = "FSSTAT", }, [NFS3PROC_FSINFO] = { .pc_func = nfsd3_proc_fsinfo, .pc_decode = nfs3svc_decode_fhandleargs, .pc_encode = nfs3svc_encode_fsinfores, .pc_argsize = sizeof(struct nfsd3_fhandleargs), + .pc_argzero = sizeof(struct nfsd3_fhandleargs), .pc_ressize = sizeof(struct nfsd3_fsinfores), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = ST+pAT+12, + .pc_name = "FSINFO", }, [NFS3PROC_PATHCONF] = { .pc_func = nfsd3_proc_pathconf, .pc_decode = nfs3svc_decode_fhandleargs, .pc_encode = nfs3svc_encode_pathconfres, .pc_argsize = sizeof(struct nfsd3_fhandleargs), + .pc_argzero = sizeof(struct nfsd3_fhandleargs), .pc_ressize = sizeof(struct nfsd3_pathconfres), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = ST+pAT+6, + .pc_name = "PATHCONF", }, [NFS3PROC_COMMIT] = { .pc_func = nfsd3_proc_commit, @@ -902,16 +1037,19 @@ static const struct svc_procedure nfsd_procedures3[22] = { .pc_encode = nfs3svc_encode_commitres, .pc_release = nfs3svc_release_fhandle, .pc_argsize = sizeof(struct nfsd3_commitargs), + .pc_argzero = sizeof(struct nfsd3_commitargs), .pc_ressize = sizeof(struct nfsd3_commitres), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = ST+WC+2, + .pc_name = "COMMIT", }, }; -static unsigned int nfsd_count3[ARRAY_SIZE(nfsd_procedures3)]; +static DEFINE_PER_CPU_ALIGNED(unsigned long, + nfsd_count3[ARRAY_SIZE(nfsd_procedures3)]); const struct svc_version nfsd_version3 = { .vs_vers = 3, - .vs_nproc = 22, + .vs_nproc = ARRAY_SIZE(nfsd_procedures3), .vs_proc = nfsd_procedures3, .vs_dispatch = nfsd_dispatch, .vs_count = nfsd_count3, diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c index 
9b973f4f7d01..ef4971d71ac4 100644 --- a/fs/nfsd/nfs3xdr.c +++ b/fs/nfsd/nfs3xdr.c @@ -14,820 +14,966 @@ #include "netns.h" #include "vfs.h" -#define NFSDDBG_FACILITY NFSDDBG_XDR +/* + * Force construction of an empty post-op attr + */ +static const struct svc_fh nfs3svc_null_fh = { + .fh_no_wcc = true, +}; +/* + * time_delta. {1, 0} means the server is accurate only + * to the nearest second. + */ +static const struct timespec64 nfs3svc_time_delta = { + .tv_sec = 1, + .tv_nsec = 0, +}; /* * Mapping of S_IF* types to NFS file types */ -static u32 nfs3_ftypes[] = { +static const u32 nfs3_ftypes[] = { NF3NON, NF3FIFO, NF3CHR, NF3BAD, NF3DIR, NF3BAD, NF3BLK, NF3BAD, NF3REG, NF3BAD, NF3LNK, NF3BAD, NF3SOCK, NF3BAD, NF3LNK, NF3BAD, }; + /* - * XDR functions for basic NFS types + * Basic NFSv3 data types (RFC 1813 Sections 2.5 and 2.6) */ + static __be32 * -encode_time3(__be32 *p, struct timespec *time) +encode_nfstime3(__be32 *p, const struct timespec64 *time) { - *p++ = htonl((u32) time->tv_sec); *p++ = htonl(time->tv_nsec); + *p++ = cpu_to_be32((u32)time->tv_sec); + *p++ = cpu_to_be32(time->tv_nsec); + return p; } -static __be32 * -decode_time3(__be32 *p, struct timespec *time) +static bool +svcxdr_decode_nfstime3(struct xdr_stream *xdr, struct timespec64 *timep) { - time->tv_sec = ntohl(*p++); - time->tv_nsec = ntohl(*p++); - return p; + __be32 *p; + + p = xdr_inline_decode(xdr, XDR_UNIT * 2); + if (!p) + return false; + timep->tv_sec = be32_to_cpup(p++); + timep->tv_nsec = be32_to_cpup(p); + + return true; } -static __be32 * -decode_fh(__be32 *p, struct svc_fh *fhp) +/** + * svcxdr_decode_nfs_fh3 - Decode an NFSv3 file handle + * @xdr: XDR stream positioned at an undecoded NFSv3 FH + * @fhp: OUT: filled-in server file handle + * + * Return values: + * %false: The encoded file handle was not valid + * %true: @fhp has been initialized + */ +bool +svcxdr_decode_nfs_fh3(struct xdr_stream *xdr, struct svc_fh *fhp) { - unsigned int size; + __be32 *p; + u32 size; + + if (xdr_stream_decode_u32(xdr, &size) < 0) + return false; + if (size == 0 || size > NFS3_FHSIZE) + return false; + p = xdr_inline_decode(xdr, size); + if (!p) + return false; fh_init(fhp, NFS3_FHSIZE); - size = ntohl(*p++); - if (size > NFS3_FHSIZE) - return NULL; - - memcpy(&fhp->fh_handle.fh_base, p, size); fhp->fh_handle.fh_size = size; - return p + XDR_QUADLEN(size); + memcpy(&fhp->fh_handle.fh_raw, p, size); + + return true; } -/* Helper function for NFSv3 ACL code */ -__be32 *nfs3svc_decode_fh(__be32 *p, struct svc_fh *fhp) +/** + * svcxdr_encode_nfsstat3 - Encode an NFSv3 status code + * @xdr: XDR stream + * @status: status value to encode + * + * Return values: + * %false: Send buffer space was exhausted + * %true: Success + */ +bool +svcxdr_encode_nfsstat3(struct xdr_stream *xdr, __be32 status) { - return decode_fh(p, fhp); + __be32 *p; + + p = xdr_reserve_space(xdr, sizeof(status)); + if (!p) + return false; + *p = status; + + return true; } -static __be32 * -encode_fh(__be32 *p, struct svc_fh *fhp) +static bool +svcxdr_encode_nfs_fh3(struct xdr_stream *xdr, const struct svc_fh *fhp) +{ + u32 size = fhp->fh_handle.fh_size; + __be32 *p; + + p = xdr_reserve_space(xdr, XDR_UNIT + size); + if (!p) + return false; + *p++ = cpu_to_be32(size); + if (size) + p[XDR_QUADLEN(size) - 1] = 0; + memcpy(p, &fhp->fh_handle.fh_raw, size); + + return true; +} + +static bool +svcxdr_encode_post_op_fh3(struct xdr_stream *xdr, const struct svc_fh *fhp) { - unsigned int size = fhp->fh_handle.fh_size; - *p++ = htonl(size); - if (size) 
p[XDR_QUADLEN(size)-1]=0; - memcpy(p, &fhp->fh_handle.fh_base, size); - return p + XDR_QUADLEN(size); + if (xdr_stream_encode_item_present(xdr) < 0) + return false; + if (!svcxdr_encode_nfs_fh3(xdr, fhp)) + return false; + + return true; } -/* - * Decode a file name and make sure that the path contains - * no slashes or null bytes. - */ -static __be32 * -decode_filename(__be32 *p, char **namp, unsigned int *lenp) +static bool +svcxdr_encode_cookieverf3(struct xdr_stream *xdr, const __be32 *verf) { - char *name; - unsigned int i; - - if ((p = xdr_decode_string_inplace(p, namp, lenp, NFS3_MAXNAMLEN)) != NULL) { - for (i = 0, name = *namp; i < *lenp; i++, name++) { - if (*name == '\0' || *name == '/') - return NULL; - } + __be32 *p; + + p = xdr_reserve_space(xdr, NFS3_COOKIEVERFSIZE); + if (!p) + return false; + memcpy(p, verf, NFS3_COOKIEVERFSIZE); + + return true; +} + +static bool +svcxdr_encode_writeverf3(struct xdr_stream *xdr, const __be32 *verf) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, NFS3_WRITEVERFSIZE); + if (!p) + return false; + memcpy(p, verf, NFS3_WRITEVERFSIZE); + + return true; +} + +static bool +svcxdr_decode_filename3(struct xdr_stream *xdr, char **name, unsigned int *len) +{ + u32 size, i; + __be32 *p; + char *c; + + if (xdr_stream_decode_u32(xdr, &size) < 0) + return false; + if (size == 0 || size > NFS3_MAXNAMLEN) + return false; + p = xdr_inline_decode(xdr, size); + if (!p) + return false; + + *len = size; + *name = (char *)p; + for (i = 0, c = *name; i < size; i++, c++) { + if (*c == '\0' || *c == '/') + return false; } - return p; + return true; } -static __be32 * -decode_sattr3(__be32 *p, struct iattr *iap) +static bool +svcxdr_decode_diropargs3(struct xdr_stream *xdr, struct svc_fh *fhp, + char **name, unsigned int *len) +{ + return svcxdr_decode_nfs_fh3(xdr, fhp) && + svcxdr_decode_filename3(xdr, name, len); +} + +static bool +svcxdr_decode_sattr3(struct svc_rqst *rqstp, struct xdr_stream *xdr, + struct iattr *iap) { - u32 tmp; + u32 set_it; iap->ia_valid = 0; - if (*p++) { + if (xdr_stream_decode_bool(xdr, &set_it) < 0) + return false; + if (set_it) { + u32 mode; + + if (xdr_stream_decode_u32(xdr, &mode) < 0) + return false; iap->ia_valid |= ATTR_MODE; - iap->ia_mode = ntohl(*p++); + iap->ia_mode = mode; } - if (*p++) { - iap->ia_uid = make_kuid(&init_user_ns, ntohl(*p++)); + if (xdr_stream_decode_bool(xdr, &set_it) < 0) + return false; + if (set_it) { + u32 uid; + + if (xdr_stream_decode_u32(xdr, &uid) < 0) + return false; + iap->ia_uid = make_kuid(nfsd_user_namespace(rqstp), uid); if (uid_valid(iap->ia_uid)) iap->ia_valid |= ATTR_UID; } - if (*p++) { - iap->ia_gid = make_kgid(&init_user_ns, ntohl(*p++)); + if (xdr_stream_decode_bool(xdr, &set_it) < 0) + return false; + if (set_it) { + u32 gid; + + if (xdr_stream_decode_u32(xdr, &gid) < 0) + return false; + iap->ia_gid = make_kgid(nfsd_user_namespace(rqstp), gid); if (gid_valid(iap->ia_gid)) iap->ia_valid |= ATTR_GID; } - if (*p++) { - u64 newsize; + if (xdr_stream_decode_bool(xdr, &set_it) < 0) + return false; + if (set_it) { + u64 newsize; + if (xdr_stream_decode_u64(xdr, &newsize) < 0) + return false; iap->ia_valid |= ATTR_SIZE; - p = xdr_decode_hyper(p, &newsize); - iap->ia_size = min_t(u64, newsize, NFS_OFFSET_MAX); + iap->ia_size = newsize; } - if ((tmp = ntohl(*p++)) == 1) { /* set to server time */ + if (xdr_stream_decode_u32(xdr, &set_it) < 0) + return false; + switch (set_it) { + case DONT_CHANGE: + break; + case SET_TO_SERVER_TIME: iap->ia_valid |= ATTR_ATIME; - } else if (tmp == 2) { /* set to 
client time */ + break; + case SET_TO_CLIENT_TIME: + if (!svcxdr_decode_nfstime3(xdr, &iap->ia_atime)) + return false; iap->ia_valid |= ATTR_ATIME | ATTR_ATIME_SET; - iap->ia_atime.tv_sec = ntohl(*p++); - iap->ia_atime.tv_nsec = ntohl(*p++); + break; + default: + return false; } - if ((tmp = ntohl(*p++)) == 1) { /* set to server time */ + if (xdr_stream_decode_u32(xdr, &set_it) < 0) + return false; + switch (set_it) { + case DONT_CHANGE: + break; + case SET_TO_SERVER_TIME: iap->ia_valid |= ATTR_MTIME; - } else if (tmp == 2) { /* set to client time */ + break; + case SET_TO_CLIENT_TIME: + if (!svcxdr_decode_nfstime3(xdr, &iap->ia_mtime)) + return false; iap->ia_valid |= ATTR_MTIME | ATTR_MTIME_SET; - iap->ia_mtime.tv_sec = ntohl(*p++); - iap->ia_mtime.tv_nsec = ntohl(*p++); + break; + default: + return false; } - return p; + + return true; } -static __be32 *encode_fsid(__be32 *p, struct svc_fh *fhp) +static bool +svcxdr_decode_sattrguard3(struct xdr_stream *xdr, struct nfsd3_sattrargs *args) { - u64 f; - switch(fsid_source(fhp)) { - default: - case FSIDSOURCE_DEV: - p = xdr_encode_hyper(p, (u64)huge_encode_dev - (fhp->fh_dentry->d_sb->s_dev)); - break; - case FSIDSOURCE_FSID: - p = xdr_encode_hyper(p, (u64) fhp->fh_export->ex_fsid); - break; - case FSIDSOURCE_UUID: - f = ((u64*)fhp->fh_export->ex_uuid)[0]; - f ^= ((u64*)fhp->fh_export->ex_uuid)[1]; - p = xdr_encode_hyper(p, f); - break; - } - return p; + u32 check; + + if (xdr_stream_decode_bool(xdr, &check) < 0) + return false; + if (check) { + if (!svcxdr_decode_nfstime3(xdr, &args->guardtime)) + return false; + args->check_guard = 1; + } else + args->check_guard = 0; + + return true; } -static __be32 * -encode_fattr3(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp, - struct kstat *stat) +static bool +svcxdr_decode_specdata3(struct xdr_stream *xdr, struct nfsd3_mknodargs *args) { - struct timespec ts; - *p++ = htonl(nfs3_ftypes[(stat->mode & S_IFMT) >> 12]); - *p++ = htonl((u32) (stat->mode & S_IALLUGO)); - *p++ = htonl((u32) stat->nlink); - *p++ = htonl((u32) from_kuid(&init_user_ns, stat->uid)); - *p++ = htonl((u32) from_kgid(&init_user_ns, stat->gid)); - if (S_ISLNK(stat->mode) && stat->size > NFS3_MAXPATHLEN) { - p = xdr_encode_hyper(p, (u64) NFS3_MAXPATHLEN); - } else { - p = xdr_encode_hyper(p, (u64) stat->size); - } - p = xdr_encode_hyper(p, ((u64)stat->blocks) << 9); - *p++ = htonl((u32) MAJOR(stat->rdev)); - *p++ = htonl((u32) MINOR(stat->rdev)); - p = encode_fsid(p, fhp); - p = xdr_encode_hyper(p, stat->ino); - ts = timespec64_to_timespec(stat->atime); - p = encode_time3(p, &ts); - ts = timespec64_to_timespec(stat->mtime); - p = encode_time3(p, &ts); - ts = timespec64_to_timespec(stat->ctime); - p = encode_time3(p, &ts); + __be32 *p; - return p; + p = xdr_inline_decode(xdr, XDR_UNIT * 2); + if (!p) + return false; + args->major = be32_to_cpup(p++); + args->minor = be32_to_cpup(p); + + return true; } -static __be32 * -encode_saved_post_attr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp) +static bool +svcxdr_decode_devicedata3(struct svc_rqst *rqstp, struct xdr_stream *xdr, + struct nfsd3_mknodargs *args) { - /* Attributes to follow */ - *p++ = xdr_one; - return encode_fattr3(rqstp, p, fhp, &fhp->fh_post_attr); + return svcxdr_decode_sattr3(rqstp, xdr, &args->attrs) && + svcxdr_decode_specdata3(xdr, args); } -/* - * Encode post-operation attributes. - * The inode may be NULL if the call failed because of a stale file - * handle. In this case, no attributes are returned. 
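svcxdr_decode_sattr3() above decodes the sattr3 timestamps as discriminated unions: DONT_CHANGE carries no body, SET_TO_SERVER_TIME carries none either but asks for the server's clock, and only SET_TO_CLIENT_TIME is followed by an nfstime3. Below is a compact sketch of that union decode over big-endian words, using a toy cursor in place of the kernel's xdr_stream; the discriminant values follow RFC 1813.

#include <stdbool.h>
#include <stdint.h>
#include <arpa/inet.h>  /* ntohl(), htonl() */
#include <stdio.h>

enum time_how { DONT_CHANGE = 0, SET_TO_SERVER_TIME = 1, SET_TO_CLIENT_TIME = 2 };

struct xdr_cursor { const uint32_t *p, *end; };

static bool get_u32(struct xdr_cursor *c, uint32_t *v)
{
        if (c->p >= c->end)
                return false;
        *v = ntohl(*c->p++);
        return true;
}

/*
 * Decode one set_atime/set_mtime arm: returns false on malformed XDR,
 * reports whether to set the time and whose clock to use, and fills
 * sec/nsec only for the client-time arm.
 */
static bool decode_set_time(struct xdr_cursor *c, bool *set,
                            bool *use_client_time,
                            uint32_t *sec, uint32_t *nsec)
{
        uint32_t how;

        if (!get_u32(c, &how))
                return false;
        *set = (how != DONT_CHANGE);
        *use_client_time = (how == SET_TO_CLIENT_TIME);
        switch (how) {
        case DONT_CHANGE:
        case SET_TO_SERVER_TIME:
                return true;
        case SET_TO_CLIENT_TIME:
                return get_u32(c, sec) && get_u32(c, nsec);
        default:
                return false;   /* bogus discriminant */
        }
}

int main(void)
{
        uint32_t wire[] = { htonl(SET_TO_CLIENT_TIME), htonl(1700000000), htonl(0) };
        struct xdr_cursor c = { wire, wire + 3 };
        bool set, client;
        uint32_t sec = 0, nsec = 0;

        if (decode_set_time(&c, &set, &client, &sec, &nsec))
                printf("set=%d client=%d sec=%u nsec=%u\n", set, client, sec, nsec);
        return 0;
}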
- */ -static __be32 * -encode_post_op_attr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp) +static bool +svcxdr_encode_fattr3(struct svc_rqst *rqstp, struct xdr_stream *xdr, + const struct svc_fh *fhp, const struct kstat *stat) { - struct dentry *dentry = fhp->fh_dentry; - if (dentry && d_really_is_positive(dentry)) { - __be32 err; - struct kstat stat; - - err = fh_getattr(fhp, &stat); - if (!err) { - *p++ = xdr_one; /* attributes follow */ - lease_get_mtime(d_inode(dentry), &stat.mtime); - return encode_fattr3(rqstp, p, fhp, &stat); - } + struct user_namespace *userns = nfsd_user_namespace(rqstp); + __be32 *p; + u64 fsid; + + p = xdr_reserve_space(xdr, XDR_UNIT * 21); + if (!p) + return false; + + *p++ = cpu_to_be32(nfs3_ftypes[(stat->mode & S_IFMT) >> 12]); + *p++ = cpu_to_be32((u32)(stat->mode & S_IALLUGO)); + *p++ = cpu_to_be32((u32)stat->nlink); + *p++ = cpu_to_be32((u32)from_kuid_munged(userns, stat->uid)); + *p++ = cpu_to_be32((u32)from_kgid_munged(userns, stat->gid)); + if (S_ISLNK(stat->mode) && stat->size > NFS3_MAXPATHLEN) + p = xdr_encode_hyper(p, (u64)NFS3_MAXPATHLEN); + else + p = xdr_encode_hyper(p, (u64)stat->size); + + /* used */ + p = xdr_encode_hyper(p, ((u64)stat->blocks) << 9); + + /* rdev */ + *p++ = cpu_to_be32((u32)MAJOR(stat->rdev)); + *p++ = cpu_to_be32((u32)MINOR(stat->rdev)); + + switch(fsid_source(fhp)) { + case FSIDSOURCE_FSID: + fsid = (u64)fhp->fh_export->ex_fsid; + break; + case FSIDSOURCE_UUID: + fsid = ((u64 *)fhp->fh_export->ex_uuid)[0]; + fsid ^= ((u64 *)fhp->fh_export->ex_uuid)[1]; + break; + default: + fsid = (u64)huge_encode_dev(fhp->fh_dentry->d_sb->s_dev); } - *p++ = xdr_zero; - return p; + p = xdr_encode_hyper(p, fsid); + + /* fileid */ + p = xdr_encode_hyper(p, stat->ino); + + p = encode_nfstime3(p, &stat->atime); + p = encode_nfstime3(p, &stat->mtime); + encode_nfstime3(p, &stat->ctime); + + return true; } -/* Helper for NFSv3 ACLs */ -__be32 * -nfs3svc_encode_post_op_attr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp) +static bool +svcxdr_encode_wcc_attr(struct xdr_stream *xdr, const struct svc_fh *fhp) { - return encode_post_op_attr(rqstp, p, fhp); + __be32 *p; + + p = xdr_reserve_space(xdr, XDR_UNIT * 6); + if (!p) + return false; + p = xdr_encode_hyper(p, (u64)fhp->fh_pre_size); + p = encode_nfstime3(p, &fhp->fh_pre_mtime); + encode_nfstime3(p, &fhp->fh_pre_ctime); + + return true; } -/* - * Enocde weak cache consistency data - */ -static __be32 * -encode_wcc_data(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp) +static bool +svcxdr_encode_pre_op_attr(struct xdr_stream *xdr, const struct svc_fh *fhp) { - struct dentry *dentry = fhp->fh_dentry; - - if (dentry && d_really_is_positive(dentry) && fhp->fh_post_saved) { - if (fhp->fh_pre_saved) { - *p++ = xdr_one; - p = xdr_encode_hyper(p, (u64) fhp->fh_pre_size); - p = encode_time3(p, &fhp->fh_pre_mtime); - p = encode_time3(p, &fhp->fh_pre_ctime); - } else { - *p++ = xdr_zero; - } - return encode_saved_post_attr(rqstp, p, fhp); + if (!fhp->fh_pre_saved) { + if (xdr_stream_encode_item_absent(xdr) < 0) + return false; + return true; } - /* no pre- or post-attrs */ - *p++ = xdr_zero; - return encode_post_op_attr(rqstp, p, fhp); + + if (xdr_stream_encode_item_present(xdr) < 0) + return false; + return svcxdr_encode_wcc_attr(xdr, fhp); } -/* - * Fill in the pre_op attr for the wcc data +/** + * svcxdr_encode_post_op_attr - Encode NFSv3 post-op attributes + * @rqstp: Context of a completed RPC transaction + * @xdr: XDR stream + * @fhp: File handle to encode + * + * Return values: + 
* %false: Send buffer space was exhausted + * %true: Success */ -void fill_pre_wcc(struct svc_fh *fhp) +bool +svcxdr_encode_post_op_attr(struct svc_rqst *rqstp, struct xdr_stream *xdr, + const struct svc_fh *fhp) { - struct inode *inode; - struct kstat stat; - __be32 err; + struct dentry *dentry = fhp->fh_dentry; + struct kstat stat; - if (fhp->fh_pre_saved) - return; + /* + * The inode may be NULL if the call failed because of a + * stale file handle. In this case, no attributes are + * returned. + */ + if (fhp->fh_no_wcc || !dentry || !d_really_is_positive(dentry)) + goto no_post_op_attrs; + if (fh_getattr(fhp, &stat) != nfs_ok) + goto no_post_op_attrs; - inode = d_inode(fhp->fh_dentry); - err = fh_getattr(fhp, &stat); - if (err) { - /* Grab the times from inode anyway */ - stat.mtime = inode->i_mtime; - stat.ctime = inode->i_ctime; - stat.size = inode->i_size; - } + if (xdr_stream_encode_item_present(xdr) < 0) + return false; + lease_get_mtime(d_inode(dentry), &stat.mtime); + if (!svcxdr_encode_fattr3(rqstp, xdr, fhp, &stat)) + return false; + + return true; - fhp->fh_pre_mtime = timespec64_to_timespec(stat.mtime); - fhp->fh_pre_ctime = timespec64_to_timespec(stat.ctime); - fhp->fh_pre_size = stat.size; - fhp->fh_pre_change = nfsd4_change_attribute(&stat, inode); - fhp->fh_pre_saved = true; +no_post_op_attrs: + return xdr_stream_encode_item_absent(xdr) > 0; } /* - * Fill in the post_op attr for the wcc data + * Encode weak cache consistency data */ -void fill_post_wcc(struct svc_fh *fhp) +static bool +svcxdr_encode_wcc_data(struct svc_rqst *rqstp, struct xdr_stream *xdr, + const struct svc_fh *fhp) { - __be32 err; - - if (fhp->fh_post_saved) - printk("nfsd: inode locked twice during operation.\n"); - - err = fh_getattr(fhp, &fhp->fh_post_attr); - fhp->fh_post_change = nfsd4_change_attribute(&fhp->fh_post_attr, - d_inode(fhp->fh_dentry)); - if (err) { - fhp->fh_post_saved = false; - /* Grab the ctime anyway - set_change_info might use it */ - fhp->fh_post_attr.ctime = d_inode(fhp->fh_dentry)->i_ctime; - } else - fhp->fh_post_saved = true; + struct dentry *dentry = fhp->fh_dentry; + + if (!dentry || !d_really_is_positive(dentry) || !fhp->fh_post_saved) + goto neither; + + /* before */ + if (!svcxdr_encode_pre_op_attr(xdr, fhp)) + return false; + + /* after */ + if (xdr_stream_encode_item_present(xdr) < 0) + return false; + if (!svcxdr_encode_fattr3(rqstp, xdr, fhp, &fhp->fh_post_attr)) + return false; + + return true; + +neither: + if (xdr_stream_encode_item_absent(xdr) < 0) + return false; + if (!svcxdr_encode_post_op_attr(rqstp, xdr, fhp)) + return false; + + return true; } /* * XDR decode functions */ -int -nfs3svc_decode_fhandle(struct svc_rqst *rqstp, __be32 *p) + +bool +nfs3svc_decode_fhandleargs(struct svc_rqst *rqstp, struct xdr_stream *xdr) { struct nfsd_fhandle *args = rqstp->rq_argp; - p = decode_fh(p, &args->fh); - if (!p) - return 0; - return xdr_argsize_check(rqstp, p); + return svcxdr_decode_nfs_fh3(xdr, &args->fh); } -int -nfs3svc_decode_sattrargs(struct svc_rqst *rqstp, __be32 *p) +bool +nfs3svc_decode_sattrargs(struct svc_rqst *rqstp, struct xdr_stream *xdr) { struct nfsd3_sattrargs *args = rqstp->rq_argp; - p = decode_fh(p, &args->fh); - if (!p) - return 0; - p = decode_sattr3(p, &args->attrs); - - if ((args->check_guard = ntohl(*p++)) != 0) { - struct timespec time; - p = decode_time3(p, &time); - args->guardtime = time.tv_sec; - } - - return xdr_argsize_check(rqstp, p); + return svcxdr_decode_nfs_fh3(xdr, &args->fh) && + svcxdr_decode_sattr3(rqstp, xdr, &args->attrs) 
&& + svcxdr_decode_sattrguard3(xdr, args); } -int -nfs3svc_decode_diropargs(struct svc_rqst *rqstp, __be32 *p) +bool +nfs3svc_decode_diropargs(struct svc_rqst *rqstp, struct xdr_stream *xdr) { struct nfsd3_diropargs *args = rqstp->rq_argp; - if (!(p = decode_fh(p, &args->fh)) - || !(p = decode_filename(p, &args->name, &args->len))) - return 0; - - return xdr_argsize_check(rqstp, p); + return svcxdr_decode_diropargs3(xdr, &args->fh, &args->name, &args->len); } -int -nfs3svc_decode_accessargs(struct svc_rqst *rqstp, __be32 *p) +bool +nfs3svc_decode_accessargs(struct svc_rqst *rqstp, struct xdr_stream *xdr) { struct nfsd3_accessargs *args = rqstp->rq_argp; - p = decode_fh(p, &args->fh); - if (!p) - return 0; - args->access = ntohl(*p++); + if (!svcxdr_decode_nfs_fh3(xdr, &args->fh)) + return false; + if (xdr_stream_decode_u32(xdr, &args->access) < 0) + return false; - return xdr_argsize_check(rqstp, p); + return true; } -int -nfs3svc_decode_readargs(struct svc_rqst *rqstp, __be32 *p) +bool +nfs3svc_decode_readargs(struct svc_rqst *rqstp, struct xdr_stream *xdr) { struct nfsd3_readargs *args = rqstp->rq_argp; - unsigned int len; - int v; - u32 max_blocksize = svc_max_payload(rqstp); - - p = decode_fh(p, &args->fh); - if (!p) - return 0; - p = xdr_decode_hyper(p, &args->offset); - args->count = ntohl(*p++); - len = min(args->count, max_blocksize); + if (!svcxdr_decode_nfs_fh3(xdr, &args->fh)) + return false; + if (xdr_stream_decode_u64(xdr, &args->offset) < 0) + return false; + if (xdr_stream_decode_u32(xdr, &args->count) < 0) + return false; - /* set up the kvec */ - v=0; - while (len > 0) { - struct page *p = *(rqstp->rq_next_page++); - - rqstp->rq_vec[v].iov_base = page_address(p); - rqstp->rq_vec[v].iov_len = min_t(unsigned int, len, PAGE_SIZE); - len -= rqstp->rq_vec[v].iov_len; - v++; - } - args->vlen = v; - return xdr_argsize_check(rqstp, p); + return true; } -int -nfs3svc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p) +bool +nfs3svc_decode_writeargs(struct svc_rqst *rqstp, struct xdr_stream *xdr) { struct nfsd3_writeargs *args = rqstp->rq_argp; - unsigned int len, hdr, dlen; u32 max_blocksize = svc_max_payload(rqstp); - struct kvec *head = rqstp->rq_arg.head; - struct kvec *tail = rqstp->rq_arg.tail; - p = decode_fh(p, &args->fh); - if (!p) - return 0; - p = xdr_decode_hyper(p, &args->offset); - - args->count = ntohl(*p++); - args->stable = ntohl(*p++); - len = args->len = ntohl(*p++); - if ((void *)p > head->iov_base + head->iov_len) - return 0; - /* - * The count must equal the amount of data passed. - */ - if (args->count != args->len) - return 0; + if (!svcxdr_decode_nfs_fh3(xdr, &args->fh)) + return false; + if (xdr_stream_decode_u64(xdr, &args->offset) < 0) + return false; + if (xdr_stream_decode_u32(xdr, &args->count) < 0) + return false; + if (xdr_stream_decode_u32(xdr, &args->stable) < 0) + return false; - /* - * Check to make sure that we got the right number of - * bytes. - */ - hdr = (void*)p - head->iov_base; - dlen = head->iov_len + rqstp->rq_arg.page_len + tail->iov_len - hdr; - /* - * Round the length of the data which was specified up to - * the next multiple of XDR units and then compare that - * against the length which was actually received. - * Note that when RPCSEC/GSS (for example) is used, the - * data buffer can be padded so dlen might be larger - * than required. It must never be smaller. 
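The removed WRITE decoder above spelled out its sanity rule: the opaque length must match the stated count, and the request must have carried at least that many bytes once rounded up to whole four-byte XDR units, with any extra padding (RPCSEC_GSS, for example) tolerated. The replacement relies on xdr_stream_subsegment() failing when the payload is short. The rounding rule itself as a sketch, with XDR_QUADLEN defined locally:

#include <stdbool.h>
#include <stdint.h>
#include <stddef.h>
#include <stdio.h>

/* Number of 4-byte XDR units needed to carry len bytes of opaque data. */
#define XDR_QUADLEN(len)        (((len) + 3) >> 2)

/*
 * An NFSv3 WRITE is well-formed only if the stated count matches the
 * opaque length and the receive buffer actually holds the rounded-up
 * payload; padding beyond that is allowed.
 */
static bool write_args_sane(uint32_t count, uint32_t len, size_t received)
{
        return count == len && received >= (size_t)XDR_QUADLEN(len) * 4;
}

int main(void)
{
        printf("%d %d\n",
               write_args_sane(5, 5, 8),        /* 5 bytes padded to 8: ok */
               write_args_sane(5, 5, 4));       /* truncated: reject */
        return 0;
}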
- */ - if (dlen < XDR_QUADLEN(len)*4) - return 0; + /* opaque data */ + if (xdr_stream_decode_u32(xdr, &args->len) < 0) + return false; + /* request sanity */ + if (args->count != args->len) + return false; if (args->count > max_blocksize) { args->count = max_blocksize; - len = args->len = max_blocksize; + args->len = max_blocksize; } - args->first.iov_base = (void *)p; - args->first.iov_len = head->iov_len - hdr; - return 1; + return xdr_stream_subsegment(xdr, &args->payload, args->count); } -int -nfs3svc_decode_createargs(struct svc_rqst *rqstp, __be32 *p) +bool +nfs3svc_decode_createargs(struct svc_rqst *rqstp, struct xdr_stream *xdr) { struct nfsd3_createargs *args = rqstp->rq_argp; - if (!(p = decode_fh(p, &args->fh)) - || !(p = decode_filename(p, &args->name, &args->len))) - return 0; - - switch (args->createmode = ntohl(*p++)) { + if (!svcxdr_decode_diropargs3(xdr, &args->fh, &args->name, &args->len)) + return false; + if (xdr_stream_decode_u32(xdr, &args->createmode) < 0) + return false; + switch (args->createmode) { case NFS3_CREATE_UNCHECKED: case NFS3_CREATE_GUARDED: - p = decode_sattr3(p, &args->attrs); - break; + return svcxdr_decode_sattr3(rqstp, xdr, &args->attrs); case NFS3_CREATE_EXCLUSIVE: - args->verf = p; - p += 2; + args->verf = xdr_inline_decode(xdr, NFS3_CREATEVERFSIZE); + if (!args->verf) + return false; break; default: - return 0; + return false; } - - return xdr_argsize_check(rqstp, p); + return true; } -int -nfs3svc_decode_mkdirargs(struct svc_rqst *rqstp, __be32 *p) +bool +nfs3svc_decode_mkdirargs(struct svc_rqst *rqstp, struct xdr_stream *xdr) { struct nfsd3_createargs *args = rqstp->rq_argp; - if (!(p = decode_fh(p, &args->fh)) || - !(p = decode_filename(p, &args->name, &args->len))) - return 0; - p = decode_sattr3(p, &args->attrs); - - return xdr_argsize_check(rqstp, p); + return svcxdr_decode_diropargs3(xdr, &args->fh, + &args->name, &args->len) && + svcxdr_decode_sattr3(rqstp, xdr, &args->attrs); } -int -nfs3svc_decode_symlinkargs(struct svc_rqst *rqstp, __be32 *p) +bool +nfs3svc_decode_symlinkargs(struct svc_rqst *rqstp, struct xdr_stream *xdr) { struct nfsd3_symlinkargs *args = rqstp->rq_argp; - char *base = (char *)p; - size_t dlen; - - if (!(p = decode_fh(p, &args->ffh)) || - !(p = decode_filename(p, &args->fname, &args->flen))) - return 0; - p = decode_sattr3(p, &args->attrs); - - args->tlen = ntohl(*p++); - - args->first.iov_base = p; - args->first.iov_len = rqstp->rq_arg.head[0].iov_len; - args->first.iov_len -= (char *)p - base; + struct kvec *head = rqstp->rq_arg.head; - dlen = args->first.iov_len + rqstp->rq_arg.page_len + - rqstp->rq_arg.tail[0].iov_len; - if (dlen < XDR_QUADLEN(args->tlen) << 2) - return 0; - return 1; + if (!svcxdr_decode_diropargs3(xdr, &args->ffh, &args->fname, &args->flen)) + return false; + if (!svcxdr_decode_sattr3(rqstp, xdr, &args->attrs)) + return false; + if (xdr_stream_decode_u32(xdr, &args->tlen) < 0) + return false; + + /* symlink_data */ + args->first.iov_len = head->iov_len - xdr_stream_pos(xdr); + args->first.iov_base = xdr_inline_decode(xdr, args->tlen); + return args->first.iov_base != NULL; } -int -nfs3svc_decode_mknodargs(struct svc_rqst *rqstp, __be32 *p) +bool +nfs3svc_decode_mknodargs(struct svc_rqst *rqstp, struct xdr_stream *xdr) { struct nfsd3_mknodargs *args = rqstp->rq_argp; - if (!(p = decode_fh(p, &args->fh)) - || !(p = decode_filename(p, &args->name, &args->len))) - return 0; - - args->ftype = ntohl(*p++); - - if (args->ftype == NF3BLK || args->ftype == NF3CHR - || args->ftype == NF3SOCK || 
args->ftype == NF3FIFO) - p = decode_sattr3(p, &args->attrs); - - if (args->ftype == NF3BLK || args->ftype == NF3CHR) { - args->major = ntohl(*p++); - args->minor = ntohl(*p++); + if (!svcxdr_decode_diropargs3(xdr, &args->fh, &args->name, &args->len)) + return false; + if (xdr_stream_decode_u32(xdr, &args->ftype) < 0) + return false; + switch (args->ftype) { + case NF3CHR: + case NF3BLK: + return svcxdr_decode_devicedata3(rqstp, xdr, args); + case NF3SOCK: + case NF3FIFO: + return svcxdr_decode_sattr3(rqstp, xdr, &args->attrs); + case NF3REG: + case NF3DIR: + case NF3LNK: + /* Valid XDR but illegal file types */ + break; + default: + return false; } - return xdr_argsize_check(rqstp, p); + return true; } -int -nfs3svc_decode_renameargs(struct svc_rqst *rqstp, __be32 *p) +bool +nfs3svc_decode_renameargs(struct svc_rqst *rqstp, struct xdr_stream *xdr) { struct nfsd3_renameargs *args = rqstp->rq_argp; - if (!(p = decode_fh(p, &args->ffh)) - || !(p = decode_filename(p, &args->fname, &args->flen)) - || !(p = decode_fh(p, &args->tfh)) - || !(p = decode_filename(p, &args->tname, &args->tlen))) - return 0; - - return xdr_argsize_check(rqstp, p); + return svcxdr_decode_diropargs3(xdr, &args->ffh, + &args->fname, &args->flen) && + svcxdr_decode_diropargs3(xdr, &args->tfh, + &args->tname, &args->tlen); } -int -nfs3svc_decode_readlinkargs(struct svc_rqst *rqstp, __be32 *p) -{ - struct nfsd3_readlinkargs *args = rqstp->rq_argp; - - p = decode_fh(p, &args->fh); - if (!p) - return 0; - args->buffer = page_address(*(rqstp->rq_next_page++)); - - return xdr_argsize_check(rqstp, p); -} - -int -nfs3svc_decode_linkargs(struct svc_rqst *rqstp, __be32 *p) +bool +nfs3svc_decode_linkargs(struct svc_rqst *rqstp, struct xdr_stream *xdr) { struct nfsd3_linkargs *args = rqstp->rq_argp; - if (!(p = decode_fh(p, &args->ffh)) - || !(p = decode_fh(p, &args->tfh)) - || !(p = decode_filename(p, &args->tname, &args->tlen))) - return 0; - - return xdr_argsize_check(rqstp, p); + return svcxdr_decode_nfs_fh3(xdr, &args->ffh) && + svcxdr_decode_diropargs3(xdr, &args->tfh, + &args->tname, &args->tlen); } -int -nfs3svc_decode_readdirargs(struct svc_rqst *rqstp, __be32 *p) +bool +nfs3svc_decode_readdirargs(struct svc_rqst *rqstp, struct xdr_stream *xdr) { struct nfsd3_readdirargs *args = rqstp->rq_argp; - p = decode_fh(p, &args->fh); - if (!p) - return 0; - p = xdr_decode_hyper(p, &args->cookie); - args->verf = p; p += 2; - args->dircount = ~0; - args->count = ntohl(*p++); - args->count = min_t(u32, args->count, PAGE_SIZE); - args->buffer = page_address(*(rqstp->rq_next_page++)); - - return xdr_argsize_check(rqstp, p); + + if (!svcxdr_decode_nfs_fh3(xdr, &args->fh)) + return false; + if (xdr_stream_decode_u64(xdr, &args->cookie) < 0) + return false; + args->verf = xdr_inline_decode(xdr, NFS3_COOKIEVERFSIZE); + if (!args->verf) + return false; + if (xdr_stream_decode_u32(xdr, &args->count) < 0) + return false; + + return true; } -int -nfs3svc_decode_readdirplusargs(struct svc_rqst *rqstp, __be32 *p) +bool +nfs3svc_decode_readdirplusargs(struct svc_rqst *rqstp, struct xdr_stream *xdr) { struct nfsd3_readdirargs *args = rqstp->rq_argp; - int len; - u32 max_blocksize = svc_max_payload(rqstp); - - p = decode_fh(p, &args->fh); - if (!p) - return 0; - p = xdr_decode_hyper(p, &args->cookie); - args->verf = p; p += 2; - args->dircount = ntohl(*p++); - args->count = ntohl(*p++); - - len = args->count = min(args->count, max_blocksize); - while (len > 0) { - struct page *p = *(rqstp->rq_next_page++); - if (!args->buffer) - args->buffer = 
page_address(p); - len -= PAGE_SIZE; - } - - return xdr_argsize_check(rqstp, p); + u32 dircount; + + if (!svcxdr_decode_nfs_fh3(xdr, &args->fh)) + return false; + if (xdr_stream_decode_u64(xdr, &args->cookie) < 0) + return false; + args->verf = xdr_inline_decode(xdr, NFS3_COOKIEVERFSIZE); + if (!args->verf) + return false; + /* dircount is ignored */ + if (xdr_stream_decode_u32(xdr, &dircount) < 0) + return false; + if (xdr_stream_decode_u32(xdr, &args->count) < 0) + return false; + + return true; } -int -nfs3svc_decode_commitargs(struct svc_rqst *rqstp, __be32 *p) +bool +nfs3svc_decode_commitargs(struct svc_rqst *rqstp, struct xdr_stream *xdr) { struct nfsd3_commitargs *args = rqstp->rq_argp; - p = decode_fh(p, &args->fh); - if (!p) - return 0; - p = xdr_decode_hyper(p, &args->offset); - args->count = ntohl(*p++); - return xdr_argsize_check(rqstp, p); + if (!svcxdr_decode_nfs_fh3(xdr, &args->fh)) + return false; + if (xdr_stream_decode_u64(xdr, &args->offset) < 0) + return false; + if (xdr_stream_decode_u32(xdr, &args->count) < 0) + return false; + + return true; } /* * XDR encode functions */ -/* - * There must be an encoding function for void results so svc_process - * will work properly. - */ -int -nfs3svc_encode_voidres(struct svc_rqst *rqstp, __be32 *p) -{ - return xdr_ressize_check(rqstp, p); -} /* GETATTR */ -int -nfs3svc_encode_attrstat(struct svc_rqst *rqstp, __be32 *p) +bool +nfs3svc_encode_getattrres(struct svc_rqst *rqstp, struct xdr_stream *xdr) { struct nfsd3_attrstat *resp = rqstp->rq_resp; - if (resp->status == 0) { - lease_get_mtime(d_inode(resp->fh.fh_dentry), - &resp->stat.mtime); - p = encode_fattr3(rqstp, p, &resp->fh, &resp->stat); + if (!svcxdr_encode_nfsstat3(xdr, resp->status)) + return false; + switch (resp->status) { + case nfs_ok: + lease_get_mtime(d_inode(resp->fh.fh_dentry), &resp->stat.mtime); + if (!svcxdr_encode_fattr3(rqstp, xdr, &resp->fh, &resp->stat)) + return false; + break; } - return xdr_ressize_check(rqstp, p); + + return true; } /* SETATTR, REMOVE, RMDIR */ -int -nfs3svc_encode_wccstat(struct svc_rqst *rqstp, __be32 *p) +bool +nfs3svc_encode_wccstat(struct svc_rqst *rqstp, struct xdr_stream *xdr) { struct nfsd3_attrstat *resp = rqstp->rq_resp; - p = encode_wcc_data(rqstp, p, &resp->fh); - return xdr_ressize_check(rqstp, p); + return svcxdr_encode_nfsstat3(xdr, resp->status) && + svcxdr_encode_wcc_data(rqstp, xdr, &resp->fh); } /* LOOKUP */ -int -nfs3svc_encode_diropres(struct svc_rqst *rqstp, __be32 *p) +bool +nfs3svc_encode_lookupres(struct svc_rqst *rqstp, struct xdr_stream *xdr) { struct nfsd3_diropres *resp = rqstp->rq_resp; - if (resp->status == 0) { - p = encode_fh(p, &resp->fh); - p = encode_post_op_attr(rqstp, p, &resp->fh); + if (!svcxdr_encode_nfsstat3(xdr, resp->status)) + return false; + switch (resp->status) { + case nfs_ok: + if (!svcxdr_encode_nfs_fh3(xdr, &resp->fh)) + return false; + if (!svcxdr_encode_post_op_attr(rqstp, xdr, &resp->fh)) + return false; + if (!svcxdr_encode_post_op_attr(rqstp, xdr, &resp->dirfh)) + return false; + break; + default: + if (!svcxdr_encode_post_op_attr(rqstp, xdr, &resp->dirfh)) + return false; } - p = encode_post_op_attr(rqstp, p, &resp->dirfh); - return xdr_ressize_check(rqstp, p); + + return true; } /* ACCESS */ -int -nfs3svc_encode_accessres(struct svc_rqst *rqstp, __be32 *p) +bool +nfs3svc_encode_accessres(struct svc_rqst *rqstp, struct xdr_stream *xdr) { struct nfsd3_accessres *resp = rqstp->rq_resp; - p = encode_post_op_attr(rqstp, p, &resp->fh); - if (resp->status == 0) - *p++ = 
htonl(resp->access); - return xdr_ressize_check(rqstp, p); + if (!svcxdr_encode_nfsstat3(xdr, resp->status)) + return false; + switch (resp->status) { + case nfs_ok: + if (!svcxdr_encode_post_op_attr(rqstp, xdr, &resp->fh)) + return false; + if (xdr_stream_encode_u32(xdr, resp->access) < 0) + return false; + break; + default: + if (!svcxdr_encode_post_op_attr(rqstp, xdr, &resp->fh)) + return false; + } + + return true; } /* READLINK */ -int -nfs3svc_encode_readlinkres(struct svc_rqst *rqstp, __be32 *p) +bool +nfs3svc_encode_readlinkres(struct svc_rqst *rqstp, struct xdr_stream *xdr) { struct nfsd3_readlinkres *resp = rqstp->rq_resp; + struct kvec *head = rqstp->rq_res.head; + + if (!svcxdr_encode_nfsstat3(xdr, resp->status)) + return false; + switch (resp->status) { + case nfs_ok: + if (!svcxdr_encode_post_op_attr(rqstp, xdr, &resp->fh)) + return false; + if (xdr_stream_encode_u32(xdr, resp->len) < 0) + return false; + svcxdr_encode_opaque_pages(rqstp, xdr, resp->pages, 0, + resp->len); + if (svc_encode_result_payload(rqstp, head->iov_len, resp->len) < 0) + return false; + break; + default: + if (!svcxdr_encode_post_op_attr(rqstp, xdr, &resp->fh)) + return false; + } - p = encode_post_op_attr(rqstp, p, &resp->fh); - if (resp->status == 0) { - *p++ = htonl(resp->len); - xdr_ressize_check(rqstp, p); - rqstp->rq_res.page_len = resp->len; - if (resp->len & 3) { - /* need to pad the tail */ - rqstp->rq_res.tail[0].iov_base = p; - *p = 0; - rqstp->rq_res.tail[0].iov_len = 4 - (resp->len&3); - } - return 1; - } else - return xdr_ressize_check(rqstp, p); + return true; } /* READ */ -int -nfs3svc_encode_readres(struct svc_rqst *rqstp, __be32 *p) +bool +nfs3svc_encode_readres(struct svc_rqst *rqstp, struct xdr_stream *xdr) { struct nfsd3_readres *resp = rqstp->rq_resp; + struct kvec *head = rqstp->rq_res.head; + + if (!svcxdr_encode_nfsstat3(xdr, resp->status)) + return false; + switch (resp->status) { + case nfs_ok: + if (!svcxdr_encode_post_op_attr(rqstp, xdr, &resp->fh)) + return false; + if (xdr_stream_encode_u32(xdr, resp->count) < 0) + return false; + if (xdr_stream_encode_bool(xdr, resp->eof) < 0) + return false; + if (xdr_stream_encode_u32(xdr, resp->count) < 0) + return false; + svcxdr_encode_opaque_pages(rqstp, xdr, resp->pages, + rqstp->rq_res.page_base, + resp->count); + if (svc_encode_result_payload(rqstp, head->iov_len, resp->count) < 0) + return false; + break; + default: + if (!svcxdr_encode_post_op_attr(rqstp, xdr, &resp->fh)) + return false; + } - p = encode_post_op_attr(rqstp, p, &resp->fh); - if (resp->status == 0) { - *p++ = htonl(resp->count); - *p++ = htonl(resp->eof); - *p++ = htonl(resp->count); /* xdr opaque count */ - xdr_ressize_check(rqstp, p); - /* now update rqstp->rq_res to reflect data as well */ - rqstp->rq_res.page_len = resp->count; - if (resp->count & 3) { - /* need to pad the tail */ - rqstp->rq_res.tail[0].iov_base = p; - *p = 0; - rqstp->rq_res.tail[0].iov_len = 4 - (resp->count & 3); - } - return 1; - } else - return xdr_ressize_check(rqstp, p); + return true; } /* WRITE */ -int -nfs3svc_encode_writeres(struct svc_rqst *rqstp, __be32 *p) +bool +nfs3svc_encode_writeres(struct svc_rqst *rqstp, struct xdr_stream *xdr) { struct nfsd3_writeres *resp = rqstp->rq_resp; - struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); - - p = encode_wcc_data(rqstp, p, &resp->fh); - if (resp->status == 0) { - *p++ = htonl(resp->count); - *p++ = htonl(resp->committed); - /* unique identifier, y2038 overflow can be ignored */ - *p++ = htonl((u32)nn->nfssvc_boot.tv_sec); 
- *p++ = htonl(nn->nfssvc_boot.tv_nsec); + + if (!svcxdr_encode_nfsstat3(xdr, resp->status)) + return false; + switch (resp->status) { + case nfs_ok: + if (!svcxdr_encode_wcc_data(rqstp, xdr, &resp->fh)) + return false; + if (xdr_stream_encode_u32(xdr, resp->count) < 0) + return false; + if (xdr_stream_encode_u32(xdr, resp->committed) < 0) + return false; + if (!svcxdr_encode_writeverf3(xdr, resp->verf)) + return false; + break; + default: + if (!svcxdr_encode_wcc_data(rqstp, xdr, &resp->fh)) + return false; } - return xdr_ressize_check(rqstp, p); + + return true; } /* CREATE, MKDIR, SYMLINK, MKNOD */ -int -nfs3svc_encode_createres(struct svc_rqst *rqstp, __be32 *p) +bool +nfs3svc_encode_createres(struct svc_rqst *rqstp, struct xdr_stream *xdr) { struct nfsd3_diropres *resp = rqstp->rq_resp; - if (resp->status == 0) { - *p++ = xdr_one; - p = encode_fh(p, &resp->fh); - p = encode_post_op_attr(rqstp, p, &resp->fh); + if (!svcxdr_encode_nfsstat3(xdr, resp->status)) + return false; + switch (resp->status) { + case nfs_ok: + if (!svcxdr_encode_post_op_fh3(xdr, &resp->fh)) + return false; + if (!svcxdr_encode_post_op_attr(rqstp, xdr, &resp->fh)) + return false; + if (!svcxdr_encode_wcc_data(rqstp, xdr, &resp->dirfh)) + return false; + break; + default: + if (!svcxdr_encode_wcc_data(rqstp, xdr, &resp->dirfh)) + return false; } - p = encode_wcc_data(rqstp, p, &resp->dirfh); - return xdr_ressize_check(rqstp, p); + + return true; } /* RENAME */ -int -nfs3svc_encode_renameres(struct svc_rqst *rqstp, __be32 *p) +bool +nfs3svc_encode_renameres(struct svc_rqst *rqstp, struct xdr_stream *xdr) { struct nfsd3_renameres *resp = rqstp->rq_resp; - p = encode_wcc_data(rqstp, p, &resp->ffh); - p = encode_wcc_data(rqstp, p, &resp->tfh); - return xdr_ressize_check(rqstp, p); + return svcxdr_encode_nfsstat3(xdr, resp->status) && + svcxdr_encode_wcc_data(rqstp, xdr, &resp->ffh) && + svcxdr_encode_wcc_data(rqstp, xdr, &resp->tfh); } /* LINK */ -int -nfs3svc_encode_linkres(struct svc_rqst *rqstp, __be32 *p) +bool +nfs3svc_encode_linkres(struct svc_rqst *rqstp, struct xdr_stream *xdr) { struct nfsd3_linkres *resp = rqstp->rq_resp; - p = encode_post_op_attr(rqstp, p, &resp->fh); - p = encode_wcc_data(rqstp, p, &resp->tfh); - return xdr_ressize_check(rqstp, p); + return svcxdr_encode_nfsstat3(xdr, resp->status) && + svcxdr_encode_post_op_attr(rqstp, xdr, &resp->fh) && + svcxdr_encode_wcc_data(rqstp, xdr, &resp->tfh); } /* READDIR */ -int -nfs3svc_encode_readdirres(struct svc_rqst *rqstp, __be32 *p) +bool +nfs3svc_encode_readdirres(struct svc_rqst *rqstp, struct xdr_stream *xdr) { struct nfsd3_readdirres *resp = rqstp->rq_resp; + struct xdr_buf *dirlist = &resp->dirlist; + + if (!svcxdr_encode_nfsstat3(xdr, resp->status)) + return false; + switch (resp->status) { + case nfs_ok: + if (!svcxdr_encode_post_op_attr(rqstp, xdr, &resp->fh)) + return false; + if (!svcxdr_encode_cookieverf3(xdr, resp->verf)) + return false; + svcxdr_encode_opaque_pages(rqstp, xdr, dirlist->pages, 0, + dirlist->len); + /* no more entries */ + if (xdr_stream_encode_item_absent(xdr) < 0) + return false; + if (xdr_stream_encode_bool(xdr, resp->common.err == nfserr_eof) < 0) + return false; + break; + default: + if (!svcxdr_encode_post_op_attr(rqstp, xdr, &resp->fh)) + return false; + } - p = encode_post_op_attr(rqstp, p, &resp->fh); - - if (resp->status == 0) { - /* stupid readdir cookie */ - memcpy(p, resp->verf, 8); p += 2; - xdr_ressize_check(rqstp, p); - if (rqstp->rq_res.head[0].iov_len + (2<<2) > PAGE_SIZE) - return 1; /*No room for trailer */ 
- rqstp->rq_res.page_len = (resp->count) << 2; - - /* add the 'tail' to the end of the 'head' page - page 0. */ - rqstp->rq_res.tail[0].iov_base = p; - *p++ = 0; /* no more entries */ - *p++ = htonl(resp->common.err == nfserr_eof); - rqstp->rq_res.tail[0].iov_len = 2<<2; - return 1; - } else - return xdr_ressize_check(rqstp, p); -} - -static __be32 * -encode_entry_baggage(struct nfsd3_readdirres *cd, __be32 *p, const char *name, - int namlen, u64 ino) -{ - *p++ = xdr_one; /* mark entry present */ - p = xdr_encode_hyper(p, ino); /* file id */ - p = xdr_encode_array(p, name, namlen);/* name length & name */ - - cd->offset = p; /* remember pointer */ - p = xdr_encode_hyper(p, NFS_OFFSET_MAX);/* offset of next entry */ - - return p; + return true; } static __be32 @@ -844,19 +990,24 @@ compose_entry_fh(struct nfsd3_readdirres *cd, struct svc_fh *fhp, if (isdotent(name, namlen)) { if (namlen == 2) { dchild = dget_parent(dparent); - /* filesystem root - cannot return filehandle for ".." */ + /* + * Don't return filehandle for ".." if we're at + * the filesystem or export root: + */ if (dchild == dparent) goto out; + if (dparent == exp->ex_path.dentry) + goto out; } else dchild = dget(dparent); } else - dchild = lookup_one_len_unlocked(name, dparent, namlen); + dchild = lookup_one_positive_unlocked(&nop_mnt_idmap, + &QSTR_LEN(name, namlen), + dparent); if (IS_ERR(dchild)) return rv; if (d_mountpoint(dchild)) goto out; - if (d_really_is_negative(dchild)) - goto out; if (dchild->d_inode->i_ino != ino) goto out; rv = fh_compose(fhp, exp, dchild, &cd->fh); @@ -865,264 +1016,323 @@ out: return rv; } -static __be32 *encode_entryplus_baggage(struct nfsd3_readdirres *cd, __be32 *p, const char *name, int namlen, u64 ino) -{ - struct svc_fh *fh = &cd->scratch; - __be32 err; - - fh_init(fh, NFS3_FHSIZE); - err = compose_entry_fh(cd, fh, name, namlen, ino); - if (err) { - *p++ = 0; - *p++ = 0; - goto out; - } - p = encode_post_op_attr(cd->rqstp, p, fh); - *p++ = xdr_one; /* yes, a file handle follows */ - p = encode_fh(p, fh); -out: - fh_put(fh); - return p; -} - -/* - * Encode a directory entry. This one works for both normal readdir - * and readdirplus. - * The normal readdir reply requires 2 (fileid) + 1 (stringlen) - * + string + 2 (cookie) + 1 (next) words, i.e. 6 + strlen. - * - * The readdirplus baggage is 1+21 words for post_op_attr, plus the - * file handle. +/** + * nfs3svc_encode_cookie3 - Encode a directory offset cookie + * @resp: readdir result context + * @offset: offset cookie to encode + * + * The buffer space for the offset cookie has already been reserved + * by svcxdr_encode_entry3_common(). 
*/ - -#define NFS3_ENTRY_BAGGAGE (2 + 1 + 2 + 1) -#define NFS3_ENTRYPLUS_BAGGAGE (1 + 21 + 1 + (NFS3_FHSIZE >> 2)) -static int -encode_entry(struct readdir_cd *ccd, const char *name, int namlen, - loff_t offset, u64 ino, unsigned int d_type, int plus) +void nfs3svc_encode_cookie3(struct nfsd3_readdirres *resp, u64 offset) { - struct nfsd3_readdirres *cd = container_of(ccd, struct nfsd3_readdirres, - common); - __be32 *p = cd->buffer; - caddr_t curr_page_addr = NULL; - struct page ** page; - int slen; /* string (name) length */ - int elen; /* estimated entry length in words */ - int num_entry_words = 0; /* actual number of words */ - - if (cd->offset) { - u64 offset64 = offset; - - if (unlikely(cd->offset1)) { - /* we ended up with offset on a page boundary */ - *cd->offset = htonl(offset64 >> 32); - *cd->offset1 = htonl(offset64 & 0xffffffff); - cd->offset1 = NULL; - } else { - xdr_encode_hyper(cd->offset, offset64); - } - } - - /* - dprintk("encode_entry(%.*s @%ld%s)\n", - namlen, name, (long) offset, plus? " plus" : ""); - */ - - /* truncate filename if too long */ - namlen = min(namlen, NFS3_MAXNAMLEN); + __be64 cookie = cpu_to_be64(offset); - slen = XDR_QUADLEN(namlen); - elen = slen + NFS3_ENTRY_BAGGAGE - + (plus? NFS3_ENTRYPLUS_BAGGAGE : 0); + if (!resp->cookie_offset) + return; + write_bytes_to_xdr_buf(&resp->dirlist, resp->cookie_offset, &cookie, + sizeof(cookie)); + resp->cookie_offset = 0; +} - if (cd->buflen < elen) { - cd->common.err = nfserr_toosmall; - return -EINVAL; - } +static bool +svcxdr_encode_entry3_common(struct nfsd3_readdirres *resp, const char *name, + int namlen, loff_t offset, u64 ino) +{ + struct xdr_buf *dirlist = &resp->dirlist; + struct xdr_stream *xdr = &resp->xdr; + + if (xdr_stream_encode_item_present(xdr) < 0) + return false; + /* fileid */ + if (xdr_stream_encode_u64(xdr, ino) < 0) + return false; + /* name */ + if (xdr_stream_encode_opaque(xdr, name, min(namlen, NFS3_MAXNAMLEN)) < 0) + return false; + /* cookie */ + resp->cookie_offset = dirlist->len; + if (xdr_stream_encode_u64(xdr, OFFSET_MAX) < 0) + return false; + + return true; +} - /* determine which page in rq_respages[] we are currently filling */ - for (page = cd->rqstp->rq_respages + 1; - page < cd->rqstp->rq_next_page; page++) { - curr_page_addr = page_address(*page); +/** + * nfs3svc_encode_entry3 - encode one NFSv3 READDIR entry + * @data: directory context + * @name: name of the object to be encoded + * @namlen: length of that name, in bytes + * @offset: the offset of the previous entry + * @ino: the fileid of this entry + * @d_type: unused + * + * Return values: + * %0: Entry was successfully encoded. 
+ * %-EINVAL: An encoding problem occured, secondary status code in resp->common.err + * + * On exit, the following fields are updated: + * - resp->xdr + * - resp->common.err + * - resp->cookie_offset + */ +int nfs3svc_encode_entry3(void *data, const char *name, int namlen, + loff_t offset, u64 ino, unsigned int d_type) +{ + struct readdir_cd *ccd = data; + struct nfsd3_readdirres *resp = container_of(ccd, + struct nfsd3_readdirres, + common); + unsigned int starting_length = resp->dirlist.len; - if (((caddr_t)cd->buffer >= curr_page_addr) && - ((caddr_t)cd->buffer < curr_page_addr + PAGE_SIZE)) - break; - } + /* The offset cookie for the previous entry */ + nfs3svc_encode_cookie3(resp, offset); - if ((caddr_t)(cd->buffer + elen) < (curr_page_addr + PAGE_SIZE)) { - /* encode entry in current page */ + if (!svcxdr_encode_entry3_common(resp, name, namlen, offset, ino)) + goto out_toosmall; - p = encode_entry_baggage(cd, p, name, namlen, ino); + xdr_commit_encode(&resp->xdr); + resp->common.err = nfs_ok; + return 0; - if (plus) - p = encode_entryplus_baggage(cd, p, name, namlen, ino); - num_entry_words = p - cd->buffer; - } else if (*(page+1) != NULL) { - /* temporarily encode entry into next page, then move back to - * current and next page in rq_respages[] */ - __be32 *p1, *tmp; - int len1, len2; +out_toosmall: + resp->cookie_offset = 0; + resp->common.err = nfserr_toosmall; + resp->dirlist.len = starting_length; + return -EINVAL; +} - /* grab next page for temporary storage of entry */ - p1 = tmp = page_address(*(page+1)); +static bool +svcxdr_encode_entry3_plus(struct nfsd3_readdirres *resp, const char *name, + int namlen, u64 ino) +{ + struct xdr_stream *xdr = &resp->xdr; + struct svc_fh *fhp = &resp->scratch; + bool result; - p1 = encode_entry_baggage(cd, p1, name, namlen, ino); + result = false; + fh_init(fhp, NFS3_FHSIZE); + if (compose_entry_fh(resp, fhp, name, namlen, ino) != nfs_ok) + goto out_noattrs; - if (plus) - p1 = encode_entryplus_baggage(cd, p1, name, namlen, ino); + if (!svcxdr_encode_post_op_attr(resp->rqstp, xdr, fhp)) + goto out; + if (!svcxdr_encode_post_op_fh3(xdr, fhp)) + goto out; + result = true; - /* determine entry word length and lengths to go in pages */ - num_entry_words = p1 - tmp; - len1 = curr_page_addr + PAGE_SIZE - (caddr_t)cd->buffer; - if ((num_entry_words << 2) < len1) { - /* the actual number of words in the entry is less - * than elen and can still fit in the current page - */ - memmove(p, tmp, num_entry_words << 2); - p += num_entry_words; - - /* update offset */ - cd->offset = cd->buffer + (cd->offset - tmp); - } else { - unsigned int offset_r = (cd->offset - tmp) << 2; - - /* update pointer to offset location. 
- * This is a 64bit quantity, so we need to - * deal with 3 cases: - * - entirely in first page - * - entirely in second page - * - 4 bytes in each page - */ - if (offset_r + 8 <= len1) { - cd->offset = p + (cd->offset - tmp); - } else if (offset_r >= len1) { - cd->offset -= len1 >> 2; - } else { - /* sitting on the fence */ - BUG_ON(offset_r != len1 - 4); - cd->offset = p + (cd->offset - tmp); - cd->offset1 = tmp; - } - - len2 = (num_entry_words << 2) - len1; - - /* move from temp page to current and next pages */ - memmove(p, tmp, len1); - memmove(tmp, (caddr_t)tmp+len1, len2); - - p = tmp + (len2 >> 2); - } - } - else { - cd->common.err = nfserr_toosmall; - return -EINVAL; - } +out: + fh_put(fhp); + return result; + +out_noattrs: + if (xdr_stream_encode_item_absent(xdr) < 0) + return false; + if (xdr_stream_encode_item_absent(xdr) < 0) + return false; + return true; +} - cd->buflen -= num_entry_words; - cd->buffer = p; - cd->common.err = nfs_ok; +/** + * nfs3svc_encode_entryplus3 - encode one NFSv3 READDIRPLUS entry + * @data: directory context + * @name: name of the object to be encoded + * @namlen: length of that name, in bytes + * @offset: the offset of the previous entry + * @ino: the fileid of this entry + * @d_type: unused + * + * Return values: + * %0: Entry was successfully encoded. + * %-EINVAL: An encoding problem occured, secondary status code in resp->common.err + * + * On exit, the following fields are updated: + * - resp->xdr + * - resp->common.err + * - resp->cookie_offset + */ +int nfs3svc_encode_entryplus3(void *data, const char *name, int namlen, + loff_t offset, u64 ino, unsigned int d_type) +{ + struct readdir_cd *ccd = data; + struct nfsd3_readdirres *resp = container_of(ccd, + struct nfsd3_readdirres, + common); + unsigned int starting_length = resp->dirlist.len; + + /* The offset cookie for the previous entry */ + nfs3svc_encode_cookie3(resp, offset); + + if (!svcxdr_encode_entry3_common(resp, name, namlen, offset, ino)) + goto out_toosmall; + if (!svcxdr_encode_entry3_plus(resp, name, namlen, ino)) + goto out_toosmall; + + xdr_commit_encode(&resp->xdr); + resp->common.err = nfs_ok; return 0; +out_toosmall: + resp->cookie_offset = 0; + resp->common.err = nfserr_toosmall; + resp->dirlist.len = starting_length; + return -EINVAL; } -int -nfs3svc_encode_entry(void *cd, const char *name, - int namlen, loff_t offset, u64 ino, unsigned int d_type) +static bool +svcxdr_encode_fsstat3resok(struct xdr_stream *xdr, + const struct nfsd3_fsstatres *resp) { - return encode_entry(cd, name, namlen, offset, ino, d_type, 0); -} + const struct kstatfs *s = &resp->stats; + u64 bs = s->f_bsize; + __be32 *p; -int -nfs3svc_encode_entry_plus(void *cd, const char *name, - int namlen, loff_t offset, u64 ino, - unsigned int d_type) -{ - return encode_entry(cd, name, namlen, offset, ino, d_type, 1); + p = xdr_reserve_space(xdr, XDR_UNIT * 13); + if (!p) + return false; + p = xdr_encode_hyper(p, bs * s->f_blocks); /* total bytes */ + p = xdr_encode_hyper(p, bs * s->f_bfree); /* free bytes */ + p = xdr_encode_hyper(p, bs * s->f_bavail); /* user available bytes */ + p = xdr_encode_hyper(p, s->f_files); /* total inodes */ + p = xdr_encode_hyper(p, s->f_ffree); /* free inodes */ + p = xdr_encode_hyper(p, s->f_ffree); /* user available inodes */ + *p = cpu_to_be32(resp->invarsec); /* mean unchanged time */ + + return true; } /* FSSTAT */ -int -nfs3svc_encode_fsstatres(struct svc_rqst *rqstp, __be32 *p) +bool +nfs3svc_encode_fsstatres(struct svc_rqst *rqstp, struct xdr_stream *xdr) { struct 
nfsd3_fsstatres *resp = rqstp->rq_resp; - struct kstatfs *s = &resp->stats; - u64 bs = s->f_bsize; - - *p++ = xdr_zero; /* no post_op_attr */ - - if (resp->status == 0) { - p = xdr_encode_hyper(p, bs * s->f_blocks); /* total bytes */ - p = xdr_encode_hyper(p, bs * s->f_bfree); /* free bytes */ - p = xdr_encode_hyper(p, bs * s->f_bavail); /* user available bytes */ - p = xdr_encode_hyper(p, s->f_files); /* total inodes */ - p = xdr_encode_hyper(p, s->f_ffree); /* free inodes */ - p = xdr_encode_hyper(p, s->f_ffree); /* user available inodes */ - *p++ = htonl(resp->invarsec); /* mean unchanged time */ + + if (!svcxdr_encode_nfsstat3(xdr, resp->status)) + return false; + switch (resp->status) { + case nfs_ok: + if (!svcxdr_encode_post_op_attr(rqstp, xdr, &nfs3svc_null_fh)) + return false; + if (!svcxdr_encode_fsstat3resok(xdr, resp)) + return false; + break; + default: + if (!svcxdr_encode_post_op_attr(rqstp, xdr, &nfs3svc_null_fh)) + return false; } - return xdr_ressize_check(rqstp, p); + + return true; +} + +static bool +svcxdr_encode_fsinfo3resok(struct xdr_stream *xdr, + const struct nfsd3_fsinfores *resp) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, XDR_UNIT * 12); + if (!p) + return false; + *p++ = cpu_to_be32(resp->f_rtmax); + *p++ = cpu_to_be32(resp->f_rtpref); + *p++ = cpu_to_be32(resp->f_rtmult); + *p++ = cpu_to_be32(resp->f_wtmax); + *p++ = cpu_to_be32(resp->f_wtpref); + *p++ = cpu_to_be32(resp->f_wtmult); + *p++ = cpu_to_be32(resp->f_dtpref); + p = xdr_encode_hyper(p, resp->f_maxfilesize); + p = encode_nfstime3(p, &nfs3svc_time_delta); + *p = cpu_to_be32(resp->f_properties); + + return true; } /* FSINFO */ -int -nfs3svc_encode_fsinfores(struct svc_rqst *rqstp, __be32 *p) +bool +nfs3svc_encode_fsinfores(struct svc_rqst *rqstp, struct xdr_stream *xdr) { struct nfsd3_fsinfores *resp = rqstp->rq_resp; - *p++ = xdr_zero; /* no post_op_attr */ - - if (resp->status == 0) { - *p++ = htonl(resp->f_rtmax); - *p++ = htonl(resp->f_rtpref); - *p++ = htonl(resp->f_rtmult); - *p++ = htonl(resp->f_wtmax); - *p++ = htonl(resp->f_wtpref); - *p++ = htonl(resp->f_wtmult); - *p++ = htonl(resp->f_dtpref); - p = xdr_encode_hyper(p, resp->f_maxfilesize); - *p++ = xdr_one; - *p++ = xdr_zero; - *p++ = htonl(resp->f_properties); + if (!svcxdr_encode_nfsstat3(xdr, resp->status)) + return false; + switch (resp->status) { + case nfs_ok: + if (!svcxdr_encode_post_op_attr(rqstp, xdr, &nfs3svc_null_fh)) + return false; + if (!svcxdr_encode_fsinfo3resok(xdr, resp)) + return false; + break; + default: + if (!svcxdr_encode_post_op_attr(rqstp, xdr, &nfs3svc_null_fh)) + return false; } - return xdr_ressize_check(rqstp, p); + return true; +} + +static bool +svcxdr_encode_pathconf3resok(struct xdr_stream *xdr, + const struct nfsd3_pathconfres *resp) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, XDR_UNIT * 6); + if (!p) + return false; + *p++ = cpu_to_be32(resp->p_link_max); + *p++ = cpu_to_be32(resp->p_name_max); + p = xdr_encode_bool(p, resp->p_no_trunc); + p = xdr_encode_bool(p, resp->p_chown_restricted); + p = xdr_encode_bool(p, resp->p_case_insensitive); + xdr_encode_bool(p, resp->p_case_preserving); + + return true; } /* PATHCONF */ -int -nfs3svc_encode_pathconfres(struct svc_rqst *rqstp, __be32 *p) +bool +nfs3svc_encode_pathconfres(struct svc_rqst *rqstp, struct xdr_stream *xdr) { struct nfsd3_pathconfres *resp = rqstp->rq_resp; - *p++ = xdr_zero; /* no post_op_attr */ - - if (resp->status == 0) { - *p++ = htonl(resp->p_link_max); - *p++ = htonl(resp->p_name_max); - *p++ = htonl(resp->p_no_trunc); - *p++ = 
htonl(resp->p_chown_restricted); - *p++ = htonl(resp->p_case_insensitive); - *p++ = htonl(resp->p_case_preserving); + if (!svcxdr_encode_nfsstat3(xdr, resp->status)) + return false; + switch (resp->status) { + case nfs_ok: + if (!svcxdr_encode_post_op_attr(rqstp, xdr, &nfs3svc_null_fh)) + return false; + if (!svcxdr_encode_pathconf3resok(xdr, resp)) + return false; + break; + default: + if (!svcxdr_encode_post_op_attr(rqstp, xdr, &nfs3svc_null_fh)) + return false; } - return xdr_ressize_check(rqstp, p); + return true; } /* COMMIT */ -int -nfs3svc_encode_commitres(struct svc_rqst *rqstp, __be32 *p) +bool +nfs3svc_encode_commitres(struct svc_rqst *rqstp, struct xdr_stream *xdr) { struct nfsd3_commitres *resp = rqstp->rq_resp; - struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); - - p = encode_wcc_data(rqstp, p, &resp->fh); - /* Write verifier */ - if (resp->status == 0) { - /* unique identifier, y2038 overflow can be ignored */ - *p++ = htonl((u32)nn->nfssvc_boot.tv_sec); - *p++ = htonl(nn->nfssvc_boot.tv_nsec); + + if (!svcxdr_encode_nfsstat3(xdr, resp->status)) + return false; + switch (resp->status) { + case nfs_ok: + if (!svcxdr_encode_wcc_data(rqstp, xdr, &resp->fh)) + return false; + if (!svcxdr_encode_writeverf3(xdr, resp->verf)) + return false; + break; + default: + if (!svcxdr_encode_wcc_data(rqstp, xdr, &resp->fh)) + return false; } - return xdr_ressize_check(rqstp, p); + + return true; } /* diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c index 71292a0d6f09..936ea1ad9586 100644 --- a/fs/nfsd/nfs4acl.c +++ b/fs/nfsd/nfs4acl.c @@ -135,7 +135,7 @@ nfsd4_get_nfs4_acl(struct svc_rqst *rqstp, struct dentry *dentry, unsigned int flags = 0; int size = 0; - pacl = get_acl(inode, ACL_TYPE_ACCESS); + pacl = get_inode_acl(inode, ACL_TYPE_ACCESS); if (!pacl) pacl = posix_acl_from_mode(inode->i_mode, GFP_KERNEL); @@ -147,7 +147,7 @@ nfsd4_get_nfs4_acl(struct svc_rqst *rqstp, struct dentry *dentry, if (S_ISDIR(inode->i_mode)) { flags = NFS4_ACL_DIR; - dpacl = get_acl(inode, ACL_TYPE_DEFAULT); + dpacl = get_inode_acl(inode, ACL_TYPE_DEFAULT); if (IS_ERR(dpacl)) { error = PTR_ERR(dpacl); goto rel_pacl; @@ -198,8 +198,6 @@ summarize_posix_acl(struct posix_acl *acl, struct posix_acl_summary *pas) memset(pas, 0, sizeof(*pas)); pas->mask = 07; - pe = acl->a_entries + acl->a_count; - FOREACH_ACL_ENTRY(pa, acl, pe) { switch (pa->e_tag) { case ACL_USER_OBJ: @@ -441,7 +439,7 @@ struct posix_ace_state_array { * calculated so far: */ struct posix_acl_state { - int empty; + unsigned char valid; struct posix_ace_state owner; struct posix_ace_state group; struct posix_ace_state other; @@ -457,7 +455,6 @@ init_state(struct posix_acl_state *state, int cnt) int alloc; memset(state, 0, sizeof(struct posix_acl_state)); - state->empty = 1; /* * In the worst case, each individual acl could be for a distinct * named user or group, but we don't know which, so we allocate @@ -500,7 +497,7 @@ posix_state_to_acl(struct posix_acl_state *state, unsigned int flags) * and effective cases: when there are no inheritable ACEs, * calls ->set_acl with a NULL ACL structure. 
*/ - if (state->empty && (flags & NFS4_ACL_TYPE_DEFAULT)) + if (!state->valid && (flags & NFS4_ACL_TYPE_DEFAULT)) return NULL; /* @@ -622,11 +619,12 @@ static void process_one_v4_ace(struct posix_acl_state *state, struct nfs4_ace *ace) { u32 mask = ace->access_mask; + short type = ace2type(ace); int i; - state->empty = 0; + state->valid |= type; - switch (ace2type(ace)) { + switch (type) { case ACL_USER_OBJ: if (ace->type == NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE) { allow_bits(&state->owner, mask); @@ -726,6 +724,30 @@ static int nfs4_acl_nfsv4_to_posix(struct nfs4_acl *acl, if (!(ace->flag & NFS4_ACE_INHERIT_ONLY_ACE)) process_one_v4_ace(&effective_acl_state, ace); } + + /* + * At this point, the default ACL may have zeroed-out entries for owner, + * group and other. That usually results in a non-sensical resulting ACL + * that denies all access except to any ACE that was explicitly added. + * + * The setfacl command solves a similar problem with this logic: + * + * "If a Default ACL entry is created, and the Default ACL contains + * no owner, owning group, or others entry, a copy of the ACL + * owner, owning group, or others entry is added to the Default ACL." + * + * Copy any missing ACEs from the effective set, if any ACEs were + * explicitly set. + */ + if (default_acl_state.valid) { + if (!(default_acl_state.valid & ACL_USER_OBJ)) + default_acl_state.owner = effective_acl_state.owner; + if (!(default_acl_state.valid & ACL_GROUP_OBJ)) + default_acl_state.group = effective_acl_state.group; + if (!(default_acl_state.valid & ACL_OTHER)) + default_acl_state.other = effective_acl_state.other; + } + *pacl = posix_state_to_acl(&effective_acl_state, flags); if (IS_ERR(*pacl)) { ret = PTR_ERR(*pacl); @@ -751,57 +773,26 @@ out_estate: return ret; } -__be32 -nfsd4_set_nfs4_acl(struct svc_rqst *rqstp, struct svc_fh *fhp, - struct nfs4_acl *acl) +__be32 nfsd4_acl_to_attr(enum nfs_ftype4 type, struct nfs4_acl *acl, + struct nfsd_attrs *attr) { - __be32 error; int host_error; - struct dentry *dentry; - struct inode *inode; - struct posix_acl *pacl = NULL, *dpacl = NULL; unsigned int flags = 0; - /* Get inode */ - error = fh_verify(rqstp, fhp, 0, NFSD_MAY_SATTR); - if (error) - return error; - - dentry = fhp->fh_dentry; - inode = d_inode(dentry); + if (!acl) + return nfs_ok; - if (S_ISDIR(inode->i_mode)) + if (type == NF4DIR) flags = NFS4_ACL_DIR; - host_error = nfs4_acl_nfsv4_to_posix(acl, &pacl, &dpacl, flags); + host_error = nfs4_acl_nfsv4_to_posix(acl, &attr->na_pacl, + &attr->na_dpacl, flags); if (host_error == -EINVAL) return nfserr_attrnotsupp; - if (host_error < 0) - goto out_nfserr; - - fh_lock(fhp); - - host_error = set_posix_acl(inode, ACL_TYPE_ACCESS, pacl); - if (host_error < 0) - goto out_drop_lock; - - if (S_ISDIR(inode->i_mode)) { - host_error = set_posix_acl(inode, ACL_TYPE_DEFAULT, dpacl); - } - -out_drop_lock: - fh_unlock(fhp); - - posix_acl_release(pacl); - posix_acl_release(dpacl); -out_nfserr: - if (host_error == -EOPNOTSUPP) - return nfserr_attrnotsupp; else return nfserrno(host_error); } - static short ace2type(struct nfs4_ace *ace) { diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index c74e4538d0eb..e00b2aea8da2 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -31,6 +31,7 @@ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ +#include <linux/nfs4.h> #include <linux/sunrpc/clnt.h> #include <linux/sunrpc/xprt.h> #include <linux/sunrpc/svc_xprt.h> @@ -38,13 +39,13 @@ #include "nfsd.h" #include "state.h" #include "netns.h" +#include "trace.h" #include "xdr4cb.h" #include "xdr4.h" +#include "nfs4xdr_gen.h" #define NFSDDBG_FACILITY NFSDDBG_PROC -static void nfsd4_mark_cb_fault(struct nfs4_client *, int reason); - #define NFSPROC4_CB_NULL 0 #define NFSPROC4_CB_COMPOUND 1 @@ -60,16 +61,6 @@ struct nfs4_cb_compound_hdr { int status; }; -/* - * Handle decode buffer overflows out-of-line. - */ -static void print_overflow_msg(const char *func, const struct xdr_stream *xdr) -{ - dprintk("NFS: %s prematurely hit the end of our receive buffer. " - "Remaining buffer length is %tu words.\n", - func, xdr->end - xdr->p); -} - static __be32 *xdr_encode_empty_array(__be32 *p) { *p++ = xdr_zero; @@ -85,30 +76,53 @@ static __be32 *xdr_encode_empty_array(__be32 *p) * 1 Protocol" */ -/* - * nfs_cb_opnum4 - * - * enum nfs_cb_opnum4 { - * OP_CB_GETATTR = 3, - * ... - * }; - */ -enum nfs_cb_opnum4 { - OP_CB_GETATTR = 3, - OP_CB_RECALL = 4, - OP_CB_LAYOUTRECALL = 5, - OP_CB_NOTIFY = 6, - OP_CB_PUSH_DELEG = 7, - OP_CB_RECALL_ANY = 8, - OP_CB_RECALLABLE_OBJ_AVAIL = 9, - OP_CB_RECALL_SLOT = 10, - OP_CB_SEQUENCE = 11, - OP_CB_WANTS_CANCELLED = 12, - OP_CB_NOTIFY_LOCK = 13, - OP_CB_NOTIFY_DEVICEID = 14, - OP_CB_OFFLOAD = 15, - OP_CB_ILLEGAL = 10044 -}; +static void encode_uint32(struct xdr_stream *xdr, u32 n) +{ + WARN_ON_ONCE(xdr_stream_encode_u32(xdr, n) < 0); +} + +static void encode_bitmap4(struct xdr_stream *xdr, const __u32 *bitmap, + size_t len) +{ + xdr_stream_encode_uint32_array(xdr, bitmap, len); +} + +static int decode_cb_fattr4(struct xdr_stream *xdr, uint32_t *bitmap, + struct nfs4_cb_fattr *fattr) +{ + fattr->ncf_cb_change = 0; + fattr->ncf_cb_fsize = 0; + fattr->ncf_cb_atime.tv_sec = 0; + fattr->ncf_cb_atime.tv_nsec = 0; + fattr->ncf_cb_mtime.tv_sec = 0; + fattr->ncf_cb_mtime.tv_nsec = 0; + + if (bitmap[0] & FATTR4_WORD0_CHANGE) + if (xdr_stream_decode_u64(xdr, &fattr->ncf_cb_change) < 0) + return -EIO; + if (bitmap[0] & FATTR4_WORD0_SIZE) + if (xdr_stream_decode_u64(xdr, &fattr->ncf_cb_fsize) < 0) + return -EIO; + if (bitmap[2] & FATTR4_WORD2_TIME_DELEG_ACCESS) { + fattr4_time_deleg_access access; + + if (!xdrgen_decode_fattr4_time_deleg_access(xdr, &access)) + return -EIO; + fattr->ncf_cb_atime.tv_sec = access.seconds; + fattr->ncf_cb_atime.tv_nsec = access.nseconds; + + } + if (bitmap[2] & FATTR4_WORD2_TIME_DELEG_MODIFY) { + fattr4_time_deleg_modify modify; + + if (!xdrgen_decode_fattr4_time_deleg_modify(xdr, &modify)) + return -EIO; + fattr->ncf_cb_mtime.tv_sec = modify.seconds; + fattr->ncf_cb_mtime.tv_nsec = modify.nseconds; + + } + return 0; +} static void encode_nfs_cb_opnum4(struct xdr_stream *xdr, enum nfs_cb_opnum4 op) { @@ -130,7 +144,7 @@ static void encode_nfs_fh4(struct xdr_stream *xdr, const struct knfsd_fh *fh) BUG_ON(length > NFS4_FHSIZE); p = xdr_reserve_space(xdr, 4 + length); - xdr_encode_opaque(p, &fh->fh_base, length); + xdr_encode_opaque(p, &fh->fh_raw, length); } /* @@ -240,7 +254,6 @@ static int decode_cb_op_status(struct xdr_stream *xdr, *status = nfs_cb_stat_to_errno(be32_to_cpup(p)); return 0; out_overflow: - print_overflow_msg(__func__, xdr); return -EIO; out_unexpected: dprintk("NFSD: Callback server returned operation %d but " @@ -296,20 +309,19 @@ static int decode_cb_compound4res(struct xdr_stream *xdr, u32 length; __be32 *p; - p = xdr_inline_decode(xdr, 4 + 4); + p = xdr_inline_decode(xdr, 
XDR_UNIT); if (unlikely(p == NULL)) goto out_overflow; - hdr->status = be32_to_cpup(p++); + hdr->status = be32_to_cpup(p); /* Ignore the tag */ - length = be32_to_cpup(p++); - p = xdr_inline_decode(xdr, length + 4); - if (unlikely(p == NULL)) + if (xdr_stream_decode_u32(xdr, &length) < 0) + goto out_overflow; + if (xdr_inline_decode(xdr, length) == NULL) + goto out_overflow; + if (xdr_stream_decode_u32(xdr, &hdr->nops) < 0) goto out_overflow; - p += XDR_QUADLEN(length); - hdr->nops = be32_to_cpup(p); return 0; out_overflow: - print_overflow_msg(__func__, xdr); return -EIO; } @@ -340,6 +352,95 @@ static void encode_cb_recall4args(struct xdr_stream *xdr, } /* + * CB_RECALLANY4args + * + * struct CB_RECALLANY4args { + * uint32_t craa_objects_to_keep; + * bitmap4 craa_type_mask; + * }; + */ +static void +encode_cb_recallany4args(struct xdr_stream *xdr, + struct nfs4_cb_compound_hdr *hdr, struct nfsd4_cb_recall_any *ra) +{ + encode_nfs_cb_opnum4(xdr, OP_CB_RECALL_ANY); + encode_uint32(xdr, ra->ra_keep); + encode_bitmap4(xdr, ra->ra_bmval, ARRAY_SIZE(ra->ra_bmval)); + hdr->nops++; +} + +/* + * CB_GETATTR4args + * struct CB_GETATTR4args { + * nfs_fh4 fh; + * bitmap4 attr_request; + * }; + * + * The size and change attributes are the only one + * guaranteed to be serviced by the client. + */ +static void +encode_cb_getattr4args(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr, + struct nfs4_cb_fattr *fattr) +{ + struct nfs4_delegation *dp = container_of(fattr, struct nfs4_delegation, dl_cb_fattr); + struct knfsd_fh *fh = &dp->dl_stid.sc_file->fi_fhandle; + struct nfs4_cb_fattr *ncf = &dp->dl_cb_fattr; + u32 bmap_size = 1; + u32 bmap[3]; + + bmap[0] = FATTR4_WORD0_SIZE; + if (!ncf->ncf_file_modified) + bmap[0] |= FATTR4_WORD0_CHANGE; + + if (deleg_attrs_deleg(dp->dl_type)) { + bmap[1] = 0; + bmap[2] = FATTR4_WORD2_TIME_DELEG_ACCESS | FATTR4_WORD2_TIME_DELEG_MODIFY; + bmap_size = 3; + } + encode_nfs_cb_opnum4(xdr, OP_CB_GETATTR); + encode_nfs_fh4(xdr, fh); + encode_bitmap4(xdr, bmap, bmap_size); + hdr->nops++; +} + +static u32 highest_slotid(struct nfsd4_session *ses) +{ + u32 idx; + + spin_lock(&ses->se_lock); + idx = fls(~ses->se_cb_slot_avail); + if (idx > 0) + --idx; + idx = max(idx, ses->se_cb_highest_slot); + spin_unlock(&ses->se_lock); + return idx; +} + +static void +encode_referring_call4(struct xdr_stream *xdr, + const struct nfsd4_referring_call *rc) +{ + encode_uint32(xdr, rc->rc_sequenceid); + encode_uint32(xdr, rc->rc_slotid); +} + +static void +encode_referring_call_list4(struct xdr_stream *xdr, + const struct nfsd4_referring_call_list *rcl) +{ + struct nfsd4_referring_call *rc; + __be32 *p; + + p = xdr_reserve_space(xdr, NFS4_MAX_SESSIONID_LEN); + xdr_encode_opaque_fixed(p, rcl->rcl_sessionid.data, + NFS4_MAX_SESSIONID_LEN); + encode_uint32(xdr, rcl->__nr_referring_calls); + list_for_each_entry(rc, &rcl->rcl_referring_calls, __list) + encode_referring_call4(xdr, rc); +} + +/* * CB_SEQUENCE4args * * struct CB_SEQUENCE4args { @@ -356,6 +457,7 @@ static void encode_cb_sequence4args(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr) { struct nfsd4_session *session = cb->cb_clp->cl_cb_session; + struct nfsd4_referring_call_list *rcl; __be32 *p; if (hdr->minorversion == 0) @@ -364,16 +466,45 @@ static void encode_cb_sequence4args(struct xdr_stream *xdr, encode_nfs_cb_opnum4(xdr, OP_CB_SEQUENCE); encode_sessionid4(xdr, session); - p = xdr_reserve_space(xdr, 4 + 4 + 4 + 4 + 4); - *p++ = cpu_to_be32(session->se_cb_seq_nr); /* csa_sequenceid */ - *p++ = xdr_zero; /* csa_slotid 
*/ - *p++ = xdr_zero; /* csa_highest_slotid */ + p = xdr_reserve_space(xdr, XDR_UNIT * 4); + *p++ = cpu_to_be32(session->se_cb_seq_nr[cb->cb_held_slot]); /* csa_sequenceid */ + *p++ = cpu_to_be32(cb->cb_held_slot); /* csa_slotid */ + *p++ = cpu_to_be32(highest_slotid(session)); /* csa_highest_slotid */ *p++ = xdr_zero; /* csa_cachethis */ - xdr_encode_empty_array(p); /* csa_referring_call_lists */ + + /* csa_referring_call_lists */ + encode_uint32(xdr, cb->cb_nr_referring_call_list); + list_for_each_entry(rcl, &cb->cb_referring_call_list, __list) + encode_referring_call_list4(xdr, rcl); hdr->nops++; } +static void update_cb_slot_table(struct nfsd4_session *ses, u32 target) +{ + /* No need to do anything if nothing changed */ + if (likely(target == READ_ONCE(ses->se_cb_highest_slot))) + return; + + spin_lock(&ses->se_lock); + if (target > ses->se_cb_highest_slot) { + int i; + + target = min(target, NFSD_BC_SLOT_TABLE_SIZE - 1); + + /* + * Growing the slot table. Reset any new sequences to 1. + * + * NB: There is some debate about whether the RFC requires this, + * but the Linux client expects it. + */ + for (i = ses->se_cb_highest_slot + 1; i <= target; ++i) + ses->se_cb_seq_nr[i] = 1; + } + ses->se_cb_highest_slot = target; + spin_unlock(&ses->se_lock); +} + /* * CB_SEQUENCE4resok * @@ -401,7 +532,7 @@ static int decode_cb_sequence4resok(struct xdr_stream *xdr, struct nfsd4_session *session = cb->cb_clp->cl_cb_session; int status = -ESERVERFAULT; __be32 *p; - u32 dummy; + u32 seqid, slotid, target; /* * If the server returns different values for sessionID, slotID or @@ -417,27 +548,27 @@ static int decode_cb_sequence4resok(struct xdr_stream *xdr, } p += XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN); - dummy = be32_to_cpup(p++); - if (dummy != session->se_cb_seq_nr) { + seqid = be32_to_cpup(p++); + if (seqid != session->se_cb_seq_nr[cb->cb_held_slot]) { dprintk("NFS: %s Invalid sequence number\n", __func__); goto out; } - dummy = be32_to_cpup(p++); - if (dummy != 0) { + slotid = be32_to_cpup(p++); + if (slotid != cb->cb_held_slot) { dprintk("NFS: %s Invalid slotid\n", __func__); goto out; } - /* - * FIXME: process highest slotid and target highest slotid - */ + p++; // ignore current highest slot value + + target = be32_to_cpup(p++); + update_cb_slot_table(session, target); status = 0; out: cb->cb_seq_status = status; return status; out_overflow: - print_overflow_msg(__func__, xdr); status = -EIO; goto out; } @@ -476,6 +607,26 @@ static void nfs4_xdr_enc_cb_null(struct rpc_rqst *req, struct xdr_stream *xdr, } /* + * 20.1. Operation 3: CB_GETATTR - Get Attributes + */ +static void nfs4_xdr_enc_cb_getattr(struct rpc_rqst *req, + struct xdr_stream *xdr, const void *data) +{ + const struct nfsd4_callback *cb = data; + struct nfs4_cb_fattr *ncf = + container_of(cb, struct nfs4_cb_fattr, ncf_getattr); + struct nfs4_cb_compound_hdr hdr = { + .ident = cb->cb_clp->cl_cb_ident, + .minorversion = cb->cb_clp->cl_minorversion, + }; + + encode_cb_compound4args(xdr, &hdr); + encode_cb_sequence4args(xdr, cb, &hdr); + encode_cb_getattr4args(xdr, &hdr, ncf); + encode_cb_nops(&hdr); +} + +/* * 20.2. Operation 4: CB_RECALL - Recall a Delegation */ static void nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, struct xdr_stream *xdr, @@ -494,6 +645,26 @@ static void nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, struct xdr_stream *xdr, encode_cb_nops(&hdr); } +/* + * 20.6. 
Operation 8: CB_RECALL_ANY - Keep Any N Recallable Objects + */ +static void +nfs4_xdr_enc_cb_recall_any(struct rpc_rqst *req, + struct xdr_stream *xdr, const void *data) +{ + const struct nfsd4_callback *cb = data; + struct nfsd4_cb_recall_any *ra; + struct nfs4_cb_compound_hdr hdr = { + .ident = cb->cb_clp->cl_cb_ident, + .minorversion = cb->cb_clp->cl_minorversion, + }; + + ra = container_of(cb, struct nfsd4_cb_recall_any, ra_cb); + encode_cb_compound4args(xdr, &hdr); + encode_cb_sequence4args(xdr, cb, &hdr); + encode_cb_recallany4args(xdr, &hdr, ra); + encode_cb_nops(&hdr); +} /* * NFSv4.0 and NFSv4.1 XDR decode functions @@ -511,6 +682,46 @@ static int nfs4_xdr_dec_cb_null(struct rpc_rqst *req, struct xdr_stream *xdr, } /* + * 20.1. Operation 3: CB_GETATTR - Get Attributes + */ +static int nfs4_xdr_dec_cb_getattr(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + void *data) +{ + struct nfsd4_callback *cb = data; + struct nfs4_cb_compound_hdr hdr; + int status; + u32 bitmap[3] = {0}; + u32 attrlen, maxlen; + struct nfs4_cb_fattr *ncf = + container_of(cb, struct nfs4_cb_fattr, ncf_getattr); + + status = decode_cb_compound4res(xdr, &hdr); + if (unlikely(status)) + return status; + + status = decode_cb_sequence4res(xdr, cb); + if (unlikely(status || cb->cb_seq_status)) + return status; + + status = decode_cb_op_status(xdr, OP_CB_GETATTR, &cb->cb_status); + if (unlikely(status || cb->cb_status)) + return status; + if (xdr_stream_decode_uint32_array(xdr, bitmap, 3) < 0) + return -EIO; + if (xdr_stream_decode_u32(xdr, &attrlen) < 0) + return -EIO; + maxlen = sizeof(ncf->ncf_cb_change) + sizeof(ncf->ncf_cb_fsize); + if (bitmap[2] != 0) + maxlen += (sizeof(ncf->ncf_cb_mtime.tv_sec) + + sizeof(ncf->ncf_cb_mtime.tv_nsec)) * 2; + if (attrlen > maxlen) + return -EIO; + status = decode_cb_fattr4(xdr, bitmap, ncf); + return status; +} + +/* * 20.2. Operation 4: CB_RECALL - Recall a Delegation */ static int nfs4_xdr_dec_cb_recall(struct rpc_rqst *rqstp, @@ -525,15 +736,35 @@ static int nfs4_xdr_dec_cb_recall(struct rpc_rqst *rqstp, if (unlikely(status)) return status; - if (cb != NULL) { - status = decode_cb_sequence4res(xdr, cb); - if (unlikely(status || cb->cb_seq_status)) - return status; - } + status = decode_cb_sequence4res(xdr, cb); + if (unlikely(status || cb->cb_seq_status)) + return status; return decode_cb_op_status(xdr, OP_CB_RECALL, &cb->cb_status); } +/* + * 20.6. 
Operation 8: CB_RECALL_ANY - Keep Any N Recallable Objects + */ +static int +nfs4_xdr_dec_cb_recall_any(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + void *data) +{ + struct nfsd4_callback *cb = data; + struct nfs4_cb_compound_hdr hdr; + int status; + + status = decode_cb_compound4res(xdr, &hdr); + if (unlikely(status)) + return status; + status = decode_cb_sequence4res(xdr, cb); + if (unlikely(status || cb->cb_seq_status)) + return status; + status = decode_cb_op_status(xdr, OP_CB_RECALL_ANY, &cb->cb_status); + return status; +} + #ifdef CONFIG_NFSD_PNFS /* * CB_LAYOUTRECALL4args @@ -617,11 +848,10 @@ static int nfs4_xdr_dec_cb_layout(struct rpc_rqst *rqstp, if (unlikely(status)) return status; - if (cb) { - status = decode_cb_sequence4res(xdr, cb); - if (unlikely(status || cb->cb_seq_status)) - return status; - } + status = decode_cb_sequence4res(xdr, cb); + if (unlikely(status || cb->cb_seq_status)) + return status; + return decode_cb_op_status(xdr, OP_CB_LAYOUTRECALL, &cb->cb_status); } #endif /* CONFIG_NFSD_PNFS */ @@ -642,7 +872,7 @@ static void nfs4_xdr_enc_cb_notify_lock(struct rpc_rqst *req, const struct nfsd4_callback *cb = data; const struct nfsd4_blocked_lock *nbl = container_of(cb, struct nfsd4_blocked_lock, nbl_cb); - struct nfs4_lockowner *lo = (struct nfs4_lockowner *)nbl->nbl_lock.fl_owner; + struct nfs4_lockowner *lo = (struct nfs4_lockowner *)nbl->nbl_lock.c.flc_owner; struct nfs4_cb_compound_hdr hdr = { .ident = 0, .minorversion = cb->cb_clp->cl_minorversion, @@ -676,11 +906,10 @@ static int nfs4_xdr_dec_cb_notify_lock(struct rpc_rqst *rqstp, if (unlikely(status)) return status; - if (cb) { - status = decode_cb_sequence4res(xdr, cb); - if (unlikely(status || cb->cb_seq_status)) - return status; - } + status = decode_cb_sequence4res(xdr, cb); + if (unlikely(status || cb->cb_seq_status)) + return status; + return decode_cb_op_status(xdr, OP_CB_NOTIFY_LOCK, &cb->cb_status); } @@ -695,7 +924,7 @@ static int nfs4_xdr_dec_cb_notify_lock(struct rpc_rqst *rqstp, * case NFS4_OK: * write_response4 coa_resok4; * default: - * length4 coa_bytes_copied; + * length4 coa_bytes_copied; * }; * struct CB_OFFLOAD4args { * nfs_fh4 coa_fh; @@ -704,21 +933,22 @@ static int nfs4_xdr_dec_cb_notify_lock(struct rpc_rqst *rqstp, * }; */ static void encode_offload_info4(struct xdr_stream *xdr, - __be32 nfserr, - const struct nfsd4_copy *cp) + const struct nfsd4_cb_offload *cbo) { __be32 *p; p = xdr_reserve_space(xdr, 4); - *p++ = nfserr; - if (!nfserr) { + *p = cbo->co_nfserr; + switch (cbo->co_nfserr) { + case nfs_ok: p = xdr_reserve_space(xdr, 4 + 8 + 4 + NFS4_VERIFIER_SIZE); p = xdr_encode_empty_array(p); - p = xdr_encode_hyper(p, cp->cp_res.wr_bytes_written); - *p++ = cpu_to_be32(cp->cp_res.wr_stable_how); - p = xdr_encode_opaque_fixed(p, cp->cp_res.wr_verifier.data, + p = xdr_encode_hyper(p, cbo->co_res.wr_bytes_written); + *p++ = cpu_to_be32(cbo->co_res.wr_stable_how); + p = xdr_encode_opaque_fixed(p, cbo->co_res.wr_verifier.data, NFS4_VERIFIER_SIZE); - } else { + break; + default: p = xdr_reserve_space(xdr, 8); /* We always return success if bytes were written */ p = xdr_encode_hyper(p, 0); @@ -726,18 +956,16 @@ static void encode_offload_info4(struct xdr_stream *xdr, } static void encode_cb_offload4args(struct xdr_stream *xdr, - __be32 nfserr, - const struct knfsd_fh *fh, - const struct nfsd4_copy *cp, + const struct nfsd4_cb_offload *cbo, struct nfs4_cb_compound_hdr *hdr) { __be32 *p; p = xdr_reserve_space(xdr, 4); - *p++ = cpu_to_be32(OP_CB_OFFLOAD); - encode_nfs_fh4(xdr, fh); - 
encode_stateid4(xdr, &cp->cp_res.cb_stateid); - encode_offload_info4(xdr, nfserr, cp); + *p = cpu_to_be32(OP_CB_OFFLOAD); + encode_nfs_fh4(xdr, &cbo->co_fh); + encode_stateid4(xdr, &cbo->co_res.cb_stateid); + encode_offload_info4(xdr, cbo); hdr->nops++; } @@ -747,8 +975,8 @@ static void nfs4_xdr_enc_cb_offload(struct rpc_rqst *req, const void *data) { const struct nfsd4_callback *cb = data; - const struct nfsd4_copy *cp = - container_of(cb, struct nfsd4_copy, cp_cb); + const struct nfsd4_cb_offload *cbo = + container_of(cb, struct nfsd4_cb_offload, co_cb); struct nfs4_cb_compound_hdr hdr = { .ident = 0, .minorversion = cb->cb_clp->cl_minorversion, @@ -756,7 +984,7 @@ static void nfs4_xdr_enc_cb_offload(struct rpc_rqst *req, encode_cb_compound4args(xdr, &hdr); encode_cb_sequence4args(xdr, cb, &hdr); - encode_cb_offload4args(xdr, cp->nfserr, &cp->fh, cp, &hdr); + encode_cb_offload4args(xdr, cbo, &hdr); encode_cb_nops(&hdr); } @@ -772,11 +1000,10 @@ static int nfs4_xdr_dec_cb_offload(struct rpc_rqst *rqstp, if (unlikely(status)) return status; - if (cb) { - status = decode_cb_sequence4res(xdr, cb); - if (unlikely(status || cb->cb_seq_status)) - return status; - } + status = decode_cb_sequence4res(xdr, cb); + if (unlikely(status || cb->cb_seq_status)) + return status; + return decode_cb_op_status(xdr, OP_CB_OFFLOAD, &cb->cb_status); } /* @@ -801,6 +1028,8 @@ static const struct rpc_procinfo nfs4_cb_procedures[] = { #endif PROC(CB_NOTIFY_LOCK, COMPOUND, cb_notify_lock, cb_notify_lock), PROC(CB_OFFLOAD, COMPOUND, cb_offload, cb_offload), + PROC(CB_RECALL_ANY, COMPOUND, cb_recall_any, cb_recall_any), + PROC(CB_GETATTR, COMPOUND, cb_getattr, cb_getattr), }; static unsigned int nfs4_cb_counts[ARRAY_SIZE(nfs4_cb_procedures)]; @@ -841,7 +1070,52 @@ static const struct rpc_program cb_program = { static int max_cb_time(struct net *net) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); - return max(nn->nfsd4_lease/10, (time_t)1) * HZ; + + /* + * nfsd4_lease is set to at most one hour in __nfsd4_write_time, + * so we can use 32-bit math on it. Warn if that assumption + * ever stops being true. 
+ */ + if (WARN_ON_ONCE(nn->nfsd4_lease > 3600)) + return 360 * HZ; + + return max(((u32)nn->nfsd4_lease)/10, 1u) * HZ; +} + +static bool nfsd4_queue_cb(struct nfsd4_callback *cb) +{ + struct nfs4_client *clp = cb->cb_clp; + + trace_nfsd_cb_queue(clp, cb); + return queue_work(clp->cl_callback_wq, &cb->cb_work); +} + +static void nfsd4_requeue_cb(struct rpc_task *task, struct nfsd4_callback *cb) +{ + struct nfs4_client *clp = cb->cb_clp; + + if (!test_bit(NFSD4_CLIENT_CB_KILL, &clp->cl_flags)) { + trace_nfsd_cb_restart(clp, cb); + task->tk_status = 0; + set_bit(NFSD4_CALLBACK_REQUEUE, &cb->cb_flags); + } +} + +static void nfsd41_cb_inflight_begin(struct nfs4_client *clp) +{ + atomic_inc(&clp->cl_cb_inflight); +} + +static void nfsd41_cb_inflight_end(struct nfs4_client *clp) +{ + + atomic_dec_and_wake_up(&clp->cl_cb_inflight); +} + +static void nfsd41_cb_inflight_wait_complete(struct nfs4_client *clp) +{ + wait_var_event(&clp->cl_cb_inflight, + !atomic_read(&clp->cl_cb_inflight)); } static const struct cred *get_backchannel_cred(struct nfs4_client *clp, struct rpc_clnt *client, struct nfsd4_session *ses) @@ -854,12 +1128,12 @@ static const struct cred *get_backchannel_cred(struct nfs4_client *clp, struct r } else { struct cred *kcred; - kcred = prepare_kernel_cred(NULL); + kcred = prepare_kernel_cred(&init_task); if (!kcred) return NULL; - kcred->uid = ses->se_cb_sec.uid; - kcred->gid = ses->se_cb_sec.gid; + kcred->fsuid = ses->se_cb_sec.uid; + kcred->fsgid = ses->se_cb_sec.gid; return kcred; } } @@ -881,23 +1155,25 @@ static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *c .program = &cb_program, .version = 1, .flags = (RPC_CLNT_CREATE_NOPING | RPC_CLNT_CREATE_QUIET), + .cred = current_cred(), }; struct rpc_clnt *client; const struct cred *cred; if (clp->cl_minorversion == 0) { if (!clp->cl_cred.cr_principal && - (clp->cl_cred.cr_flavor >= RPC_AUTH_GSS_KRB5)) + (clp->cl_cred.cr_flavor >= RPC_AUTH_GSS_KRB5)) { + trace_nfsd_cb_setup_err(clp, -EINVAL); return -EINVAL; + } args.client_name = clp->cl_cred.cr_principal; args.prognumber = conn->cb_prog; args.protocol = XPRT_TRANSPORT_TCP; args.authflavor = clp->cl_cred.cr_flavor; clp->cl_cb_ident = conn->cb_ident; } else { - if (!conn->cb_xprt) + if (!conn->cb_xprt || !ses) return -EINVAL; - clp->cl_cb_conn.cb_xprt = conn->cb_xprt; clp->cl_cb_session = ses; args.bc_xprt = conn->cb_xprt; args.prognumber = clp->cl_cb_session->se_cb_prog; @@ -908,40 +1184,47 @@ static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *c /* Create RPC client */ client = rpc_create(&args); if (IS_ERR(client)) { - dprintk("NFSD: couldn't create callback client: %ld\n", - PTR_ERR(client)); + trace_nfsd_cb_setup_err(clp, PTR_ERR(client)); return PTR_ERR(client); } cred = get_backchannel_cred(clp, client, ses); - if (IS_ERR(cred)) { + if (!cred) { + trace_nfsd_cb_setup_err(clp, -ENOMEM); rpc_shutdown_client(client); - return PTR_ERR(cred); + return -ENOMEM; } + + if (clp->cl_minorversion != 0) + clp->cl_cb_conn.cb_xprt = conn->cb_xprt; clp->cl_cb_client = client; clp->cl_cb_cred = cred; + rcu_read_lock(); + trace_nfsd_cb_setup(clp, rpc_peeraddr2str(client, RPC_DISPLAY_NETID), + args.authflavor); + rcu_read_unlock(); return 0; } -static void warn_no_callback_path(struct nfs4_client *clp, int reason) +static void nfsd4_mark_cb_state(struct nfs4_client *clp, int newstate) { - dprintk("NFSD: warning: no callback path to client %.*s: error %d\n", - (int)clp->cl_name.len, clp->cl_name.data, reason); + if (clp->cl_cb_state != 
newstate) { + clp->cl_cb_state = newstate; + trace_nfsd_cb_new_state(clp); + } } -static void nfsd4_mark_cb_down(struct nfs4_client *clp, int reason) +static void nfsd4_mark_cb_down(struct nfs4_client *clp) { if (test_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_flags)) return; - clp->cl_cb_state = NFSD4_CB_DOWN; - warn_no_callback_path(clp, reason); + nfsd4_mark_cb_state(clp, NFSD4_CB_DOWN); } -static void nfsd4_mark_cb_fault(struct nfs4_client *clp, int reason) +static void nfsd4_mark_cb_fault(struct nfs4_client *clp) { if (test_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_flags)) return; - clp->cl_cb_state = NFSD4_CB_FAULT; - warn_no_callback_path(clp, reason); + nfsd4_mark_cb_state(clp, NFSD4_CB_FAULT); } static void nfsd4_cb_probe_done(struct rpc_task *task, void *calldata) @@ -949,26 +1232,34 @@ static void nfsd4_cb_probe_done(struct rpc_task *task, void *calldata) struct nfs4_client *clp = container_of(calldata, struct nfs4_client, cl_cb_null); if (task->tk_status) - nfsd4_mark_cb_down(clp, task->tk_status); + nfsd4_mark_cb_down(clp); else - clp->cl_cb_state = NFSD4_CB_UP; + nfsd4_mark_cb_state(clp, NFSD4_CB_UP); +} + +static void nfsd4_cb_probe_release(void *calldata) +{ + struct nfs4_client *clp = container_of(calldata, struct nfs4_client, cl_cb_null); + + nfsd41_cb_inflight_end(clp); + } static const struct rpc_call_ops nfsd4_cb_probe_ops = { /* XXX: release method to ensure we set the cb channel down if * necessary on early failure? */ .rpc_call_done = nfsd4_cb_probe_done, + .rpc_release = nfsd4_cb_probe_release, }; -static struct workqueue_struct *callback_wq; - /* * Poke the callback thread to process any updates to the callback * parameters, and send a null probe. */ void nfsd4_probe_callback(struct nfs4_client *clp) { - clp->cl_cb_state = NFSD4_CB_UNKNOWN; + trace_nfsd_cb_probe(clp); + nfsd4_mark_cb_state(clp, NFSD4_CB_UNKNOWN); set_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_flags); nfsd4_run_cb(&clp->cl_cb_null); } @@ -976,40 +1267,184 @@ void nfsd4_probe_callback(struct nfs4_client *clp) void nfsd4_probe_callback_sync(struct nfs4_client *clp) { nfsd4_probe_callback(clp); - flush_workqueue(callback_wq); + flush_workqueue(clp->cl_callback_wq); } void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *conn) { - clp->cl_cb_state = NFSD4_CB_UNKNOWN; + nfsd4_mark_cb_state(clp, NFSD4_CB_UNKNOWN); spin_lock(&clp->cl_lock); memcpy(&clp->cl_cb_conn, conn, sizeof(struct nfs4_cb_conn)); spin_unlock(&clp->cl_lock); } +static int grab_slot(struct nfsd4_session *ses) +{ + int idx; + + spin_lock(&ses->se_lock); + idx = ffs(ses->se_cb_slot_avail) - 1; + if (idx < 0 || idx > ses->se_cb_highest_slot) { + spin_unlock(&ses->se_lock); + return -1; + } + /* clear the bit for the slot */ + ses->se_cb_slot_avail &= ~BIT(idx); + spin_unlock(&ses->se_lock); + return idx; +} + /* * There's currently a single callback channel slot. * If the slot is available, then mark it busy. Otherwise, set the * thread for sleeping on the callback RPC wait queue. 
*/ -static bool nfsd41_cb_get_slot(struct nfs4_client *clp, struct rpc_task *task) +static bool nfsd41_cb_get_slot(struct nfsd4_callback *cb, struct rpc_task *task) { - if (test_and_set_bit(0, &clp->cl_cb_slot_busy) != 0) { + struct nfs4_client *clp = cb->cb_clp; + struct nfsd4_session *ses = clp->cl_cb_session; + + if (cb->cb_held_slot >= 0) + return true; + cb->cb_held_slot = grab_slot(ses); + if (cb->cb_held_slot < 0) { rpc_sleep_on(&clp->cl_cb_waitq, task, NULL); /* Race breaker */ - if (test_and_set_bit(0, &clp->cl_cb_slot_busy) != 0) { - dprintk("%s slot is busy\n", __func__); + cb->cb_held_slot = grab_slot(ses); + if (cb->cb_held_slot < 0) return false; - } rpc_wake_up_queued_task(&clp->cl_cb_waitq, task); } return true; } -/* - * TODO: cb_sequence should support referring call lists, cachethis, multiple - * slots, and mark callback channel down on communication errors. +static void nfsd41_cb_release_slot(struct nfsd4_callback *cb) +{ + struct nfs4_client *clp = cb->cb_clp; + struct nfsd4_session *ses = clp->cl_cb_session; + + if (cb->cb_held_slot >= 0) { + spin_lock(&ses->se_lock); + ses->se_cb_slot_avail |= BIT(cb->cb_held_slot); + spin_unlock(&ses->se_lock); + cb->cb_held_slot = -1; + rpc_wake_up_next(&clp->cl_cb_waitq); + } +} + +static void nfsd41_destroy_cb(struct nfsd4_callback *cb) +{ + struct nfs4_client *clp = cb->cb_clp; + + trace_nfsd_cb_destroy(clp, cb); + nfsd41_cb_release_slot(cb); + if (test_bit(NFSD4_CALLBACK_WAKE, &cb->cb_flags)) + clear_and_wake_up_bit(NFSD4_CALLBACK_RUNNING, &cb->cb_flags); + else + clear_bit(NFSD4_CALLBACK_RUNNING, &cb->cb_flags); + + if (cb->cb_ops && cb->cb_ops->release) + cb->cb_ops->release(cb); + nfsd41_cb_inflight_end(clp); +} + +/** + * nfsd41_cb_referring_call - add a referring call to a callback operation + * @cb: context of callback to add the rc to + * @sessionid: referring call's session ID + * @slotid: referring call's session slot index + * @seqno: referring call's slot sequence number + * + * Caller serializes access to @cb. + * + * NB: If memory allocation fails, the referring call is not added. 
+ */ +void nfsd41_cb_referring_call(struct nfsd4_callback *cb, + struct nfs4_sessionid *sessionid, + u32 slotid, u32 seqno) +{ + struct nfsd4_referring_call_list *rcl; + struct nfsd4_referring_call *rc; + bool found; + + might_sleep(); + + found = false; + list_for_each_entry(rcl, &cb->cb_referring_call_list, __list) { + if (!memcmp(rcl->rcl_sessionid.data, sessionid->data, + NFS4_MAX_SESSIONID_LEN)) { + found = true; + break; + } + } + if (!found) { + rcl = kmalloc(sizeof(*rcl), GFP_KERNEL); + if (!rcl) + return; + memcpy(rcl->rcl_sessionid.data, sessionid->data, + NFS4_MAX_SESSIONID_LEN); + rcl->__nr_referring_calls = 0; + INIT_LIST_HEAD(&rcl->rcl_referring_calls); + list_add(&rcl->__list, &cb->cb_referring_call_list); + cb->cb_nr_referring_call_list++; + } + + found = false; + list_for_each_entry(rc, &rcl->rcl_referring_calls, __list) { + if (rc->rc_sequenceid == seqno && rc->rc_slotid == slotid) { + found = true; + break; + } + } + if (!found) { + rc = kmalloc(sizeof(*rc), GFP_KERNEL); + if (!rc) + goto out; + rc->rc_sequenceid = seqno; + rc->rc_slotid = slotid; + rcl->__nr_referring_calls++; + list_add(&rc->__list, &rcl->rcl_referring_calls); + } + +out: + if (!rcl->__nr_referring_calls) { + cb->cb_nr_referring_call_list--; + list_del(&rcl->__list); + kfree(rcl); + } +} + +/** + * nfsd41_cb_destroy_referring_call_list - release referring call info + * @cb: context of a callback that has completed + * + * Callers who allocate referring calls using nfsd41_cb_referring_call() must + * release those resources by calling nfsd41_cb_destroy_referring_call_list. + * + * Caller serializes access to @cb. */ +void nfsd41_cb_destroy_referring_call_list(struct nfsd4_callback *cb) +{ + struct nfsd4_referring_call_list *rcl; + struct nfsd4_referring_call *rc; + + while (!list_empty(&cb->cb_referring_call_list)) { + rcl = list_first_entry(&cb->cb_referring_call_list, + struct nfsd4_referring_call_list, + __list); + + while (!list_empty(&rcl->rcl_referring_calls)) { + rc = list_first_entry(&rcl->rcl_referring_calls, + struct nfsd4_referring_call, + __list); + list_del(&rc->__list); + kfree(rc); + } + list_del(&rcl->__list); + kfree(rcl); + } +} + static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata) { struct nfsd4_callback *cb = calldata; @@ -1020,37 +1455,25 @@ static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata) * cb_seq_status is only set in decode_cb_sequence4res, * and so will remain 1 if an rpc level failure occurs. */ + trace_nfsd_cb_rpc_prepare(clp); cb->cb_seq_status = 1; cb->cb_status = 0; - if (minorversion) { - if (!nfsd41_cb_get_slot(clp, task)) - return; - } + if (minorversion && !nfsd41_cb_get_slot(cb, task)) + return; rpc_call_start(task); } +/* Returns true if CB_COMPOUND processing should continue */ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback *cb) { - struct nfs4_client *clp = cb->cb_clp; - struct nfsd4_session *session = clp->cl_cb_session; - bool ret = true; - - if (!clp->cl_minorversion) { - /* - * If the backchannel connection was shut down while this - * task was queued, we need to resubmit it after setting up - * a new backchannel connection. - * - * Note that if we lost our callback connection permanently - * the submission code will error out, so we don't need to - * handle that case here. 
- */ - if (task->tk_flags & RPC_TASK_KILLED) - goto need_restart; + struct nfsd4_session *session = cb->cb_clp->cl_cb_session; + bool ret = false; - return true; - } + if (cb->cb_held_slot < 0) + goto requeue; + /* This is the operation status code for CB_SEQUENCE */ + trace_nfsd_cb_seq_status(task, cb); switch (cb->cb_seq_status) { case 0: /* @@ -1060,51 +1483,64 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback * If CB_SEQUENCE returns an error, then the state of the slot * (sequence ID, cached reply) MUST NOT change. */ - ++session->se_cb_seq_nr; + ++session->se_cb_seq_nr[cb->cb_held_slot]; + ret = true; break; case -ESERVERFAULT: - ++session->se_cb_seq_nr; - /* Fall through */ + /* + * Call succeeded, but the session, slot index, or slot + * sequence number in the response do not match the same + * in the server's call. The sequence information is thus + * untrustworthy. + */ + nfsd4_mark_cb_fault(cb->cb_clp); + break; case 1: + /* + * cb_seq_status remains 1 if an RPC Reply was never + * received. NFSD can't know if the client processed + * the CB_SEQUENCE operation. Ask the client to send a + * DESTROY_SESSION to recover. + */ + fallthrough; case -NFS4ERR_BADSESSION: - nfsd4_mark_cb_fault(cb->cb_clp, cb->cb_seq_status); - ret = false; - break; + nfsd4_mark_cb_fault(cb->cb_clp); + goto requeue; case -NFS4ERR_DELAY: - if (!rpc_restart_call(task)) - goto out; - + cb->cb_seq_status = 1; + if (RPC_SIGNALLED(task) || !rpc_restart_call(task)) + goto requeue; rpc_delay(task, 2 * HZ); return false; + case -NFS4ERR_SEQ_MISORDERED: case -NFS4ERR_BADSLOT: + /* + * A SEQ_MISORDERED or BADSLOT error means that the client and + * server are out of sync as to the backchannel parameters. Mark + * the backchannel faulty and restart the RPC, but leak the slot + * so that it's no longer used. + */ + nfsd4_mark_cb_fault(cb->cb_clp); + cb->cb_held_slot = -1; goto retry_nowait; - case -NFS4ERR_SEQ_MISORDERED: - if (session->se_cb_seq_nr != 1) { - session->se_cb_seq_nr = 1; - goto retry_nowait; - } - break; default: - dprintk("%s: unprocessed error %d\n", __func__, - cb->cb_seq_status); + nfsd4_mark_cb_fault(cb->cb_clp); } - - clear_bit(0, &clp->cl_cb_slot_busy); - rpc_wake_up_next(&clp->cl_cb_waitq); - dprintk("%s: freed slot, new seqid=%d\n", __func__, - clp->cl_cb_session->se_cb_seq_nr); - - if (task->tk_flags & RPC_TASK_KILLED) - goto need_restart; -out: + trace_nfsd_cb_free_slot(task, cb); + nfsd41_cb_release_slot(cb); return ret; retry_nowait: - if (rpc_restart_call_prepare(task)) - ret = false; - goto out; -need_restart: - task->tk_status = 0; - cb->cb_need_restart = true; + /* + * RPC_SIGNALLED() means that the rpc_client is being torn down and + * (possibly) recreated. Requeue the call in that case. + */ + if (!RPC_SIGNALLED(task)) { + if (rpc_restart_call_prepare(task)) + return false; + } +requeue: + nfsd41_cb_release_slot(cb); + nfsd4_requeue_cb(task, cb); return false; } @@ -1113,14 +1549,28 @@ static void nfsd4_cb_done(struct rpc_task *task, void *calldata) struct nfsd4_callback *cb = calldata; struct nfs4_client *clp = cb->cb_clp; - dprintk("%s: minorversion=%d\n", __func__, - clp->cl_minorversion); + trace_nfsd_cb_rpc_done(clp); - if (!nfsd4_cb_sequence_done(task, cb)) + if (!clp->cl_minorversion) { + /* + * If the backchannel connection was shut down while this + * task was queued, we need to resubmit it after setting up + * a new backchannel connection. 
+ * + * Note that if we lost our callback connection permanently + * the submission code will error out, so we don't need to + * handle that case here. + */ + if (RPC_SIGNALLED(task)) + nfsd4_requeue_cb(task, cb); + } else if (!nfsd4_cb_sequence_done(task, cb)) { return; + } if (cb->cb_status) { - WARN_ON_ONCE(task->tk_status); + WARN_ONCE(task->tk_status, + "cb_status=%d tk_status=%d cb_opcode=%d", + cb->cb_status, task->tk_status, cb->cb_ops->opcode); task->tk_status = cb->cb_status; } @@ -1130,10 +1580,12 @@ static void nfsd4_cb_done(struct rpc_task *task, void *calldata) rpc_restart_call_prepare(task); return; case 1: - break; - case -1: - /* Network partition? */ - nfsd4_mark_cb_down(clp, task->tk_status); + switch (task->tk_status) { + case -EIO: + case -ETIMEDOUT: + case -EACCES: + nfsd4_mark_cb_down(clp); + } break; default: BUG(); @@ -1144,10 +1596,12 @@ static void nfsd4_cb_release(void *calldata) { struct nfsd4_callback *cb = calldata; - if (cb->cb_need_restart) - nfsd4_run_cb(cb); + trace_nfsd_cb_rpc_release(cb->cb_clp); + + if (test_bit(NFSD4_CALLBACK_REQUEUE, &cb->cb_flags)) + nfsd4_queue_cb(cb); else - cb->cb_ops->release(cb); + nfsd41_destroy_cb(cb); } @@ -1157,22 +1611,12 @@ static const struct rpc_call_ops nfsd4_cb_ops = { .rpc_release = nfsd4_cb_release, }; -int nfsd4_create_callback_queue(void) -{ - callback_wq = alloc_ordered_workqueue("nfsd4_callbacks", 0); - if (!callback_wq) - return -ENOMEM; - return 0; -} - -void nfsd4_destroy_callback_queue(void) -{ - destroy_workqueue(callback_wq); -} - /* must be called under the state lock */ void nfsd4_shutdown_callback(struct nfs4_client *clp) { + if (clp->cl_cb_state != NFSD4_CB_UNKNOWN) + trace_nfsd_cb_shutdown(clp); + set_bit(NFSD4_CLIENT_CB_KILL, &clp->cl_flags); /* * Note this won't actually result in a null callback; @@ -1180,15 +1624,17 @@ void nfsd4_shutdown_callback(struct nfs4_client *clp) * client, destroy the rpc client, and stop: */ nfsd4_run_cb(&clp->cl_cb_null); - flush_workqueue(callback_wq); + flush_workqueue(clp->cl_callback_wq); + nfsd41_cb_inflight_wait_complete(clp); } -/* requires cl_lock: */ static struct nfsd4_conn * __nfsd4_find_backchannel(struct nfs4_client *clp) { struct nfsd4_session *s; struct nfsd4_conn *c; + lockdep_assert_held(&clp->cl_lock); + list_for_each_entry(s, &clp->cl_sessions, se_perclnt) { list_for_each_entry(c, &s->se_conns, cn_persession) { if (c->cn_flags & NFS4_CDFC4_BACK) @@ -1198,6 +1644,12 @@ static struct nfsd4_conn * __nfsd4_find_backchannel(struct nfs4_client *clp) return NULL; } +/* + * Note there isn't a lot of locking in this code; instead we depend on + * the fact that it is run from clp->cl_callback_wq, which won't run two + * work items at once. So, for example, clp->cl_callback_wq handles all + * access of cl_cb_client and all calls to rpc_create or rpc_shutdown_client. 
+ */ static void nfsd4_process_cb_update(struct nfsd4_callback *cb) { struct nfs4_cb_conn conn; @@ -1206,11 +1658,14 @@ static void nfsd4_process_cb_update(struct nfsd4_callback *cb) struct nfsd4_conn *c; int err; + trace_nfsd_cb_bc_update(clp, cb); + /* * This is either an update, or the client dying; in either case, * kill the old client: */ if (clp->cl_cb_client) { + trace_nfsd_cb_bc_shutdown(clp, cb); rpc_shutdown_client(clp->cl_cb_client); clp->cl_cb_client = NULL; put_cred(clp->cl_cb_cred); @@ -1222,13 +1677,15 @@ static void nfsd4_process_cb_update(struct nfsd4_callback *cb) } if (test_bit(NFSD4_CLIENT_CB_KILL, &clp->cl_flags)) return; + spin_lock(&clp->cl_lock); /* * Only serialized callback code is allowed to clear these * flags; main nfsd code can only set them: */ - BUG_ON(!(clp->cl_flags & NFSD4_CLIENT_CB_FLAG_MASK)); + WARN_ON(!(clp->cl_flags & NFSD4_CLIENT_CB_FLAG_MASK)); clear_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_flags); + memcpy(&conn, &cb->cb_clp->cl_cb_conn, sizeof(struct nfs4_cb_conn)); c = __nfsd4_find_backchannel(clp); if (c) { @@ -1240,7 +1697,9 @@ static void nfsd4_process_cb_update(struct nfsd4_callback *cb) err = setup_callback_client(clp, &conn, ses); if (err) { - nfsd4_mark_cb_down(clp, err); + nfsd4_mark_cb_down(clp); + if (c) + svc_xprt_put(c->cn_xprt); return; } } @@ -1252,22 +1711,20 @@ nfsd4_run_cb_work(struct work_struct *work) container_of(work, struct nfsd4_callback, cb_work); struct nfs4_client *clp = cb->cb_clp; struct rpc_clnt *clnt; + int flags, ret; - if (cb->cb_need_restart) { - cb->cb_need_restart = false; - } else { - if (cb->cb_ops && cb->cb_ops->prepare) - cb->cb_ops->prepare(cb); - } + trace_nfsd_cb_start(clp); if (clp->cl_flags & NFSD4_CLIENT_CB_FLAG_MASK) nfsd4_process_cb_update(cb); clnt = clp->cl_cb_client; - if (!clnt) { - /* Callback channel broken, or client killed; give up: */ - if (cb->cb_ops && cb->cb_ops->release) - cb->cb_ops->release(cb); + if (!clnt || clp->cl_state == NFSD4_COURTESY) { + /* + * Callback channel broken, client killed or + * nfs4_client in courtesy state; give up. + */ + nfsd41_destroy_cb(cb); return; } @@ -1275,13 +1732,24 @@ nfsd4_run_cb_work(struct work_struct *work) * Don't send probe messages for 4.1 or later. */ if (!cb->cb_ops && clp->cl_minorversion) { - clp->cl_cb_state = NFSD4_CB_UP; + nfsd4_mark_cb_state(clp, NFSD4_CB_UP); + nfsd41_destroy_cb(cb); return; } + if (!test_and_clear_bit(NFSD4_CALLBACK_REQUEUE, &cb->cb_flags)) { + if (cb->cb_ops && cb->cb_ops->prepare) + cb->cb_ops->prepare(cb); + } + cb->cb_msg.rpc_cred = clp->cl_cb_cred; - rpc_call_async(clnt, &cb->cb_msg, RPC_TASK_SOFT | RPC_TASK_SOFTCONN, - cb->cb_ops ? &nfsd4_cb_ops : &nfsd4_cb_probe_ops, cb); + flags = clp->cl_minorversion ? RPC_TASK_NOCONNECT : RPC_TASK_SOFTCONN; + ret = rpc_call_async(clnt, &cb->cb_msg, RPC_TASK_SOFT | flags, + cb->cb_ops ? 
&nfsd4_cb_ops : &nfsd4_cb_probe_ops, cb); + if (ret != 0) { + set_bit(NFSD4_CALLBACK_REQUEUE, &cb->cb_flags); + nfsd4_queue_cb(cb); + } } void nfsd4_init_cb(struct nfsd4_callback *cb, struct nfs4_client *clp, @@ -1291,14 +1759,30 @@ void nfsd4_init_cb(struct nfsd4_callback *cb, struct nfs4_client *clp, cb->cb_msg.rpc_proc = &nfs4_cb_procedures[op]; cb->cb_msg.rpc_argp = cb; cb->cb_msg.rpc_resp = cb; + cb->cb_flags = 0; cb->cb_ops = ops; INIT_WORK(&cb->cb_work, nfsd4_run_cb_work); - cb->cb_seq_status = 1; cb->cb_status = 0; - cb->cb_need_restart = false; + cb->cb_held_slot = -1; + cb->cb_nr_referring_call_list = 0; + INIT_LIST_HEAD(&cb->cb_referring_call_list); } -void nfsd4_run_cb(struct nfsd4_callback *cb) +/** + * nfsd4_run_cb - queue up a callback job to run + * @cb: callback to queue + * + * Kick off a callback to do its thing. Returns false if it was already + * on a queue, true otherwise. + */ +bool nfsd4_run_cb(struct nfsd4_callback *cb) { - queue_work(callback_wq, &cb->cb_work); + struct nfs4_client *clp = cb->cb_clp; + bool queued; + + nfsd41_cb_inflight_begin(clp); + queued = nfsd4_queue_cb(cb); + if (!queued) + nfsd41_cb_inflight_end(clp); + return queued; } diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c index bf137fec33ff..8cca1329f348 100644 --- a/fs/nfsd/nfs4idmap.c +++ b/fs/nfsd/nfs4idmap.c @@ -41,6 +41,7 @@ #include "idmap.h" #include "nfsd.h" #include "netns.h" +#include "vfs.h" /* * Turn off idmapping when using AUTH_SYS. @@ -82,8 +83,8 @@ ent_init(struct cache_head *cnew, struct cache_head *citm) new->id = itm->id; new->type = itm->type; - strlcpy(new->name, itm->name, sizeof(new->name)); - strlcpy(new->authname, itm->authname, sizeof(new->name)); + strscpy(new->name, itm->name, sizeof(new->name)); + strscpy(new->authname, itm->authname, sizeof(new->authname)); } static void @@ -122,6 +123,12 @@ idtoname_hash(struct ent *ent) return hash; } +static int +idtoname_upcall(struct cache_detail *cd, struct cache_head *h) +{ + return sunrpc_cache_pipe_upcall_timeout(cd, h); +} + static void idtoname_request(struct cache_detail *cd, struct cache_head *ch, char **bpp, int *blen) @@ -162,7 +169,7 @@ idtoname_show(struct seq_file *m, struct cache_detail *cd, struct cache_head *h) ent->id); if (test_bit(CACHE_VALID, &h->flags)) seq_printf(m, " %s", ent->name); - seq_printf(m, "\n"); + seq_putc(m, '\n'); return 0; } @@ -184,6 +191,7 @@ static const struct cache_detail idtoname_cache_template = { .hash_size = ENT_HASHMAX, .name = "nfs4.idtoname", .cache_put = ent_put, + .cache_upcall = idtoname_upcall, .cache_request = idtoname_request, .cache_parse = idtoname_parse, .cache_show = idtoname_show, @@ -232,8 +240,8 @@ idtoname_parse(struct cache_detail *cd, char *buf, int buflen) goto out; /* expiry */ - ent.h.expiry_time = get_expiry(&buf); - if (ent.h.expiry_time == 0) + error = get_expiry(&buf, &ent.h.expiry_time); + if (error) goto out; error = -ENOMEM; @@ -295,6 +303,12 @@ nametoid_hash(struct ent *ent) return hash_str(ent->name, ENT_HASHBITS); } +static int +nametoid_upcall(struct cache_detail *cd, struct cache_head *h) +{ + return sunrpc_cache_pipe_upcall_timeout(cd, h); +} + static void nametoid_request(struct cache_detail *cd, struct cache_head *ch, char **bpp, int *blen) @@ -333,7 +347,7 @@ nametoid_show(struct seq_file *m, struct cache_detail *cd, struct cache_head *h) ent->name); if (test_bit(CACHE_VALID, &h->flags)) seq_printf(m, " %u", ent->id); - seq_printf(m, "\n"); + seq_putc(m, '\n'); return 0; } @@ -347,6 +361,7 @@ static const struct cache_detail 
nametoid_cache_template = { .hash_size = ENT_HASHMAX, .name = "nfs4.nametoid", .cache_put = ent_put, + .cache_upcall = nametoid_upcall, .cache_request = nametoid_request, .cache_parse = nametoid_parse, .cache_show = nametoid_show, @@ -393,8 +408,8 @@ nametoid_parse(struct cache_detail *cd, char *buf, int buflen) memcpy(ent.name, buf1, sizeof(ent.name)); /* expiry */ - ent.h.expiry_time = get_expiry(&buf); - if (ent.h.expiry_time == 0) + error = get_expiry(&buf, &ent.h.expiry_time); + if (error) goto out; /* ID */ @@ -534,7 +549,7 @@ idmap_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen return nfserr_badowner; memcpy(key.name, name, namelen); key.name[namelen] = '\0'; - strlcpy(key.authname, rqst_authname(rqstp), sizeof(key.authname)); + strscpy(key.authname, rqst_authname(rqstp), sizeof(key.authname)); ret = idmap_lookup(rqstp, nametoid_lookup, &key, nn->nametoid_cache, &item); if (ret == -ENOENT) return nfserr_badowner; @@ -566,11 +581,12 @@ static __be32 idmap_id_to_name(struct xdr_stream *xdr, .id = id, .type = type, }; + __be32 status = nfs_ok; __be32 *p; int ret; struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); - strlcpy(key.authname, rqst_authname(rqstp), sizeof(key.authname)); + strscpy(key.authname, rqst_authname(rqstp), sizeof(key.authname)); ret = idmap_lookup(rqstp, idtoname_lookup, &key, nn->idtoname_cache, &item); if (ret == -ENOENT) return encode_ascii_id(xdr, id); @@ -578,12 +594,16 @@ static __be32 idmap_id_to_name(struct xdr_stream *xdr, return nfserrno(ret); ret = strlen(item->name); WARN_ON_ONCE(ret > IDMAP_NAMESZ); + p = xdr_reserve_space(xdr, ret + 4); - if (!p) - return nfserr_resource; - p = xdr_encode_opaque(p, item->name, ret); + if (unlikely(!p)) { + status = nfserr_resource; + goto out_put; + } + xdr_encode_opaque(p, item->name, ret); +out_put: cache_put(&item->h, nn->idtoname_cache); - return 0; + return status; } static bool @@ -634,7 +654,7 @@ nfsd_map_name_to_uid(struct svc_rqst *rqstp, const char *name, size_t namelen, return nfserr_inval; status = do_name_to_id(rqstp, IDMAP_TYPE_USER, name, namelen, &id); - *uid = make_kuid(&init_user_ns, id); + *uid = make_kuid(nfsd_user_namespace(rqstp), id); if (!uid_valid(*uid)) status = nfserr_badowner; return status; @@ -651,7 +671,7 @@ nfsd_map_name_to_gid(struct svc_rqst *rqstp, const char *name, size_t namelen, return nfserr_inval; status = do_name_to_id(rqstp, IDMAP_TYPE_GROUP, name, namelen, &id); - *gid = make_kgid(&init_user_ns, id); + *gid = make_kgid(nfsd_user_namespace(rqstp), id); if (!gid_valid(*gid)) status = nfserr_badowner; return status; @@ -660,13 +680,13 @@ nfsd_map_name_to_gid(struct svc_rqst *rqstp, const char *name, size_t namelen, __be32 nfsd4_encode_user(struct xdr_stream *xdr, struct svc_rqst *rqstp, kuid_t uid) { - u32 id = from_kuid(&init_user_ns, uid); + u32 id = from_kuid_munged(nfsd_user_namespace(rqstp), uid); return encode_name_from_id(xdr, rqstp, IDMAP_TYPE_USER, id); } __be32 nfsd4_encode_group(struct xdr_stream *xdr, struct svc_rqst *rqstp, kgid_t gid) { - u32 id = from_kgid(&init_user_ns, gid); + u32 id = from_kgid_munged(nfsd_user_namespace(rqstp), gid); return encode_name_from_id(xdr, rqstp, IDMAP_TYPE_GROUP, id); } diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c index 44517fb5c0de..683bd1130afe 100644 --- a/fs/nfsd/nfs4layouts.c +++ b/fs/nfsd/nfs4layouts.c @@ -25,7 +25,7 @@ static struct kmem_cache *nfs4_layout_cache; static struct kmem_cache *nfs4_layout_stateid_cache; static const struct nfsd4_callback_ops nfsd4_cb_layout_ops; 
-static const struct lock_manager_operations nfsd4_layouts_lm_ops; +static const struct lease_manager_operations nfsd4_layouts_lm_ops; const struct nfsd4_layout_ops *nfsd4_layout_ops[LAYOUT_TYPE_MAX] = { #ifdef CONFIG_NFSD_FLEXFILELAYOUT @@ -65,7 +65,7 @@ nfsd4_alloc_devid_map(const struct svc_fh *fhp) return; map->fsid_type = fh->fh_fsid_type; - memcpy(&map->fsid, fh->fh_fsid, fsid_len); + memcpy(&map->fsid, fh_fsid(fh), fsid_len); spin_lock(&nfsd_devid_lock); if (fhp->fh_export->ex_devid_map) @@ -75,7 +75,7 @@ nfsd4_alloc_devid_map(const struct svc_fh *fhp) list_for_each_entry(old, &nfsd_devid_hash[i], hash) { if (old->fsid_type != fh->fh_fsid_type) continue; - if (memcmp(old->fsid, fh->fh_fsid, + if (memcmp(old->fsid, fh_fsid(fh), key_len(old->fsid_type))) continue; @@ -120,7 +120,6 @@ nfsd4_set_deviceid(struct nfsd4_deviceid *id, const struct svc_fh *fhp, id->fsid_idx = fhp->fh_export->ex_devid_map->idx; id->generation = device_generation; - id->pad = 0; return 0; } @@ -145,12 +144,30 @@ void nfsd4_setup_layout_type(struct svc_export *exp) #ifdef CONFIG_NFSD_SCSILAYOUT if (sb->s_export_op->map_blocks && sb->s_export_op->commit_blocks && - sb->s_bdev && sb->s_bdev->bd_disk->fops->pr_ops && - blk_queue_scsi_passthrough(sb->s_bdev->bd_disk->queue)) + sb->s_bdev && + sb->s_bdev->bd_disk->fops->pr_ops && + sb->s_bdev->bd_disk->fops->get_unique_id) exp->ex_layout_types |= 1 << LAYOUT_SCSI; #endif } +void nfsd4_close_layout(struct nfs4_layout_stateid *ls) +{ + struct nfsd_file *fl; + + spin_lock(&ls->ls_stid.sc_file->fi_lock); + fl = ls->ls_file; + ls->ls_file = NULL; + spin_unlock(&ls->ls_stid.sc_file->fi_lock); + + if (fl) { + if (!nfsd4_layout_ops[ls->ls_layout_type]->disable_recalls) + kernel_setlease(fl->nf_file, F_UNLCK, NULL, + (void **)&ls); + nfsd_file_put(fl); + } +} + static void nfsd4_free_layout_stateid(struct nfs4_stid *stid) { @@ -168,9 +185,7 @@ nfsd4_free_layout_stateid(struct nfs4_stid *stid) list_del_init(&ls->ls_perfile); spin_unlock(&fp->fi_lock); - if (!nfsd4_layout_ops[ls->ls_layout_type]->disable_recalls) - vfs_setlease(ls->ls_file, F_UNLCK, NULL, (void **)&ls); - fput(ls->ls_file); + nfsd4_close_layout(ls); if (ls->ls_recalled) atomic_dec(&ls->ls_stid.sc_file->fi_lo_recalls); @@ -181,27 +196,26 @@ nfsd4_free_layout_stateid(struct nfs4_stid *stid) static int nfsd4_layout_setlease(struct nfs4_layout_stateid *ls) { - struct file_lock *fl; + struct file_lease *fl; int status; if (nfsd4_layout_ops[ls->ls_layout_type]->disable_recalls) return 0; - fl = locks_alloc_lock(); + fl = locks_alloc_lease(); if (!fl) return -ENOMEM; - locks_init_lock(fl); + locks_init_lease(fl); fl->fl_lmops = &nfsd4_layouts_lm_ops; - fl->fl_flags = FL_LAYOUT; - fl->fl_type = F_RDLCK; - fl->fl_end = OFFSET_MAX; - fl->fl_owner = ls; - fl->fl_pid = current->tgid; - fl->fl_file = ls->ls_file; - - status = vfs_setlease(fl->fl_file, fl->fl_type, &fl, NULL); + fl->c.flc_flags = FL_LAYOUT; + fl->c.flc_type = F_RDLCK; + fl->c.flc_owner = ls; + fl->c.flc_pid = current->tgid; + fl->c.flc_file = ls->ls_file->nf_file; + + status = kernel_setlease(fl->c.flc_file, fl->c.flc_type, &fl, NULL); if (status) { - locks_free_lock(fl); + locks_free_lease(fl); return status; } BUG_ON(fl != NULL); @@ -235,21 +249,21 @@ nfsd4_alloc_layout_stateid(struct nfsd4_compound_state *cstate, nfsd4_init_cb(&ls->ls_recall, clp, &nfsd4_cb_layout_ops, NFSPROC4_CLNT_CB_LAYOUT); - if (parent->sc_type == NFS4_DELEG_STID) - ls->ls_file = get_file(fp->fi_deleg_file); + if (parent->sc_type == SC_TYPE_DELEG) + ls->ls_file = 
nfsd_file_get(fp->fi_deleg_file); else ls->ls_file = find_any_file(fp); BUG_ON(!ls->ls_file); if (nfsd4_layout_setlease(ls)) { - fput(ls->ls_file); + nfsd_file_put(ls->ls_file); put_nfs4_file(fp); kmem_cache_free(nfs4_layout_stateid_cache, ls); return NULL; } spin_lock(&clp->cl_lock); - stp->sc_type = NFS4_LAYOUT_STID; + stp->sc_type = SC_TYPE_LAYOUT; list_add(&ls->ls_perclnt, &clp->cl_lo_states); spin_unlock(&clp->cl_lock); @@ -268,13 +282,13 @@ nfsd4_preprocess_layout_stateid(struct svc_rqst *rqstp, { struct nfs4_layout_stateid *ls; struct nfs4_stid *stid; - unsigned char typemask = NFS4_LAYOUT_STID; + unsigned short typemask = SC_TYPE_LAYOUT; __be32 status; if (create) - typemask |= (NFS4_OPEN_STID | NFS4_LOCK_STID | NFS4_DELEG_STID); + typemask |= (SC_TYPE_OPEN | SC_TYPE_LOCK | SC_TYPE_DELEG); - status = nfsd4_lookup_stateid(cstate, stateid, typemask, &stid, + status = nfsd4_lookup_stateid(cstate, stateid, typemask, 0, &stid, net_generic(SVC_NET(rqstp), nfsd_net_id)); if (status) goto out; @@ -285,7 +299,7 @@ nfsd4_preprocess_layout_stateid(struct svc_rqst *rqstp, goto out_put_stid; } - if (stid->sc_type != NFS4_LAYOUT_STID) { + if (stid->sc_type != SC_TYPE_LAYOUT) { ls = nfsd4_alloc_layout_stateid(cstate, stid, layout_type); nfs4_put_stid(stid); @@ -322,16 +336,17 @@ nfsd4_recall_file_layout(struct nfs4_layout_stateid *ls) if (ls->ls_recalled) goto out_unlock; - ls->ls_recalled = true; - atomic_inc(&ls->ls_stid.sc_file->fi_lo_recalls); if (list_empty(&ls->ls_layouts)) goto out_unlock; + ls->ls_recalled = true; + atomic_inc(&ls->ls_stid.sc_file->fi_lo_recalls); trace_nfsd_layout_recall(&ls->ls_stid.sc_stateid); - refcount_inc(&ls->ls_stid.sc_count); - nfsd4_run_cb(&ls->ls_recall); - + if (!test_and_set_bit(NFSD4_CALLBACK_RUNNING, &ls->ls_recall.cb_flags)) { + refcount_inc(&ls->ls_stid.sc_count); + nfsd4_run_cb(&ls->ls_recall); + } out_unlock: spin_unlock(&ls->ls_lock); } @@ -421,7 +436,7 @@ nfsd4_insert_layout(struct nfsd4_layoutget *lgp, struct nfs4_layout_stateid *ls) new = kmem_cache_alloc(nfs4_layout_cache, GFP_KERNEL); if (!new) return nfserr_jukebox; - memcpy(&new->lo_seg, seg, sizeof(lp->lo_seg)); + memcpy(&new->lo_seg, seg, sizeof(new->lo_seg)); new->lo_state = ls; spin_lock(&fp->fi_lock); @@ -514,11 +529,11 @@ nfsd4_return_file_layouts(struct svc_rqst *rqstp, if (!list_empty(&ls->ls_layouts)) { if (found) nfs4_inc_and_copy_stateid(&lrp->lr_sid, &ls->ls_stid); - lrp->lrs_present = 1; + lrp->lrs_present = true; } else { trace_nfsd_layoutstate_unhash(&ls->ls_stid.sc_stateid); - nfs4_unhash_stid(&ls->ls_stid); - lrp->lrs_present = 0; + ls->ls_stid.sc_status |= SC_STATUS_CLOSED; + lrp->lrs_present = false; } spin_unlock(&ls->ls_lock); @@ -538,7 +553,7 @@ nfsd4_return_client_layouts(struct svc_rqst *rqstp, struct nfs4_layout *lp, *t; LIST_HEAD(reaplist); - lrp->lrs_present = 0; + lrp->lrs_present = false; spin_lock(&clp->cl_lock); list_for_each_entry_safe(ls, n, &clp->cl_lo_states, ls_perclnt) { @@ -604,7 +619,7 @@ nfsd4_return_all_file_layouts(struct nfs4_client *clp, struct nfs4_file *fp) } static void -nfsd4_cb_layout_fail(struct nfs4_layout_stateid *ls) +nfsd4_cb_layout_fail(struct nfs4_layout_stateid *ls, struct nfsd_file *file) { struct nfs4_client *clp = ls->ls_stid.sc_client; char addr_str[INET6_ADDRSTRLEN]; @@ -626,7 +641,7 @@ nfsd4_cb_layout_fail(struct nfs4_layout_stateid *ls) argv[0] = (char *)nfsd_recall_failed; argv[1] = addr_str; - argv[2] = ls->ls_file->f_path.mnt->mnt_sb->s_id; + argv[2] = file->nf_file->f_path.mnt->mnt_sb->s_id; argv[3] = NULL; error = 
call_usermodehelper(nfsd_recall_failed, argv, envp, @@ -656,8 +671,9 @@ nfsd4_cb_layout_done(struct nfsd4_callback *cb, struct rpc_task *task) struct nfsd_net *nn; ktime_t now, cutoff; const struct nfsd4_layout_ops *ops; + struct nfsd_file *fl; - + trace_nfsd_cb_layout_done(&ls->ls_stid.sc_stateid, task); switch (task->tk_status) { case 0: case -NFS4ERR_DELAY: @@ -675,25 +691,30 @@ nfsd4_cb_layout_done(struct nfsd4_callback *cb, struct rpc_task *task) /* Client gets 2 lease periods to return it */ cutoff = ktime_add_ns(task->tk_start, - nn->nfsd4_lease * NSEC_PER_SEC * 2); + (u64)nn->nfsd4_lease * NSEC_PER_SEC * 2); if (ktime_before(now, cutoff)) { rpc_delay(task, HZ/100); /* 10 mili-seconds */ return 0; } - /* Fallthrough */ + fallthrough; default: /* * Unknown error or non-responding client, we'll need to fence. */ trace_nfsd_layout_recall_fail(&ls->ls_stid.sc_stateid); - - ops = nfsd4_layout_ops[ls->ls_layout_type]; - if (ops->fence_client) - ops->fence_client(ls); - else - nfsd4_cb_layout_fail(ls); - return -1; + rcu_read_lock(); + fl = nfsd_file_get(ls->ls_file); + rcu_read_unlock(); + if (fl) { + ops = nfsd4_layout_ops[ls->ls_layout_type]; + if (ops->fence_client) + ops->fence_client(ls, fl); + else + nfsd4_cb_layout_fail(ls, fl); + nfsd_file_put(fl); + } + return 1; case -NFS4ERR_NOMATCHING_LAYOUT: trace_nfsd_layout_recall_done(&ls->ls_stid.sc_stateid); task->tk_status = 0; @@ -719,10 +740,11 @@ static const struct nfsd4_callback_ops nfsd4_cb_layout_ops = { .prepare = nfsd4_cb_layout_prepare, .done = nfsd4_cb_layout_done, .release = nfsd4_cb_layout_release, + .opcode = OP_CB_LAYOUTRECALL, }; static bool -nfsd4_layout_lm_break(struct file_lock *fl) +nfsd4_layout_lm_break(struct file_lease *fl) { /* * We don't want the locks code to timeout the lease for us; @@ -730,19 +752,19 @@ nfsd4_layout_lm_break(struct file_lock *fl) * in time: */ fl->fl_break_time = 0; - nfsd4_recall_file_layout(fl->fl_owner); + nfsd4_recall_file_layout(fl->c.flc_owner); return false; } static int -nfsd4_layout_lm_change(struct file_lock *onlist, int arg, +nfsd4_layout_lm_change(struct file_lease *onlist, int arg, struct list_head *dispose) { BUG_ON(!(arg & F_UNLCK)); return lease_modify(onlist, arg, dispose); } -static const struct lock_manager_operations nfsd4_layouts_lm_ops = { +static const struct lease_manager_operations nfsd4_layouts_lm_ops = { .lm_break = nfsd4_layout_lm_break, .lm_change = nfsd4_layout_lm_change, }; @@ -755,13 +777,11 @@ nfsd4_init_pnfs(void) for (i = 0; i < DEVID_HASH_SIZE; i++) INIT_LIST_HEAD(&nfsd_devid_hash[i]); - nfs4_layout_cache = kmem_cache_create("nfs4_layout", - sizeof(struct nfs4_layout), 0, 0, NULL); + nfs4_layout_cache = KMEM_CACHE(nfs4_layout, 0); if (!nfs4_layout_cache) return -ENOMEM; - nfs4_layout_stateid_cache = kmem_cache_create("nfs4_layout_stateid", - sizeof(struct nfs4_layout_stateid), 0, 0, NULL); + nfs4_layout_stateid_cache = KMEM_CACHE(nfs4_layout_stateid, 0); if (!nfs4_layout_stateid_cache) { kmem_cache_destroy(nfs4_layout_cache); return -ENOMEM; diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 0cfd257ffdaf..b74800917583 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -37,6 +37,10 @@ #include <linux/falloc.h> #include <linux/slab.h> #include <linux/kthread.h> +#include <linux/namei.h> + +#include <linux/sunrpc/addr.h> +#include <linux/nfs_ssc.h> #include "idmap.h" #include "cache.h" @@ -48,34 +52,18 @@ #include "pnfs.h" #include "trace.h" -#ifdef CONFIG_NFSD_V4_SECURITY_LABEL -#include <linux/security.h> - -static inline void 
-nfsd4_security_inode_setsecctx(struct svc_fh *resfh, struct xdr_netobj *label, u32 *bmval) -{ - struct inode *inode = d_inode(resfh->fh_dentry); - int status; - - inode_lock(inode); - status = security_inode_setsecctx(resfh->fh_dentry, - label->data, label->len); - inode_unlock(inode); +static bool inter_copy_offload_enable; +module_param(inter_copy_offload_enable, bool, 0644); +MODULE_PARM_DESC(inter_copy_offload_enable, + "Enable inter server to server copy offload. Default: false"); - if (status) - /* - * XXX: We should really fail the whole open, but we may - * already have created a new file, so it may be too - * late. For now this seems the least of evils: - */ - bmval[2] &= ~FATTR4_WORD2_SECURITY_LABEL; +static void cleanup_async_copy(struct nfsd4_copy *copy); - return; -} -#else -static inline void -nfsd4_security_inode_setsecctx(struct svc_fh *resfh, struct xdr_netobj *label, u32 *bmval) -{ } +#ifdef CONFIG_NFSD_V4_2_INTER_SSC +static int nfsd4_ssc_umount_timeout = 900000; /* default to 15 mins */ +module_param(nfsd4_ssc_umount_timeout, int, 0644); +MODULE_PARM_DESC(nfsd4_ssc_umount_timeout, + "idle msecs before unmount export from source server"); #endif #define NFSDDBG_FACILITY NFSDDBG_PROC @@ -142,26 +130,6 @@ is_create_with_attrs(struct nfsd4_open *open) || open->op_createmode == NFS4_CREATE_EXCLUSIVE4_1); } -/* - * if error occurs when setting the acl, just clear the acl bit - * in the returned attr bitmap. - */ -static void -do_set_nfs4_acl(struct svc_rqst *rqstp, struct svc_fh *fhp, - struct nfs4_acl *acl, u32 *bmval) -{ - __be32 status; - - status = nfsd4_set_nfs4_acl(rqstp, fhp, acl); - if (status) - /* - * We should probably fail the whole open at this point, - * but we've already created the file, so it's too late; - * So this seems the least of evils: - */ - bmval[0] &= ~FATTR4_WORD0_ACL; -} - static inline void fh_dup2(struct svc_fh *dst, struct svc_fh *src) { @@ -175,7 +143,6 @@ fh_dup2(struct svc_fh *dst, struct svc_fh *src) static __be32 do_open_permission(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open, int accmode) { - __be32 status; if (open->op_truncate && !(open->op_share_access & NFS4_SHARE_ACCESS_WRITE)) @@ -190,12 +157,10 @@ do_open_permission(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfs if (open->op_share_deny & NFS4_SHARE_DENY_READ) accmode |= NFSD_MAY_WRITE; - status = fh_verify(rqstp, current_fh, S_IFREG, accmode); - - return status; + return fh_verify(rqstp, current_fh, S_IFREG, accmode); } -static __be32 nfsd_check_obj_isreg(struct svc_fh *fh) +static __be32 nfsd_check_obj_isreg(struct svc_fh *fh, u32 minor_version) { umode_t mode = d_inode(fh->fh_dentry)->i_mode; @@ -203,14 +168,15 @@ static __be32 nfsd_check_obj_isreg(struct svc_fh *fh) return nfs_ok; if (S_ISDIR(mode)) return nfserr_isdir; - /* - * Using err_symlink as our catch-all case may look odd; but - * there's no other obvious error for this case in 4.0, and we - * happen to know that it will cause the linux v4 client to do - * the right thing on attempts to open something other than a - * regular file. 
- */ - return nfserr_symlink; + if (S_ISLNK(mode)) + return nfserr_symlink; + + /* RFC 7530 - 16.16.6 */ + if (minor_version == 0) + return nfserr_symlink; + else + return nfserr_wrong_type; + } static void nfsd4_set_open_owner_reply_cache(struct nfsd4_compound_state *cstate, struct nfsd4_open *open, struct svc_fh *resfh) @@ -221,6 +187,234 @@ static void nfsd4_set_open_owner_reply_cache(struct nfsd4_compound_state *cstate &resfh->fh_handle); } +static inline bool nfsd4_create_is_exclusive(int createmode) +{ + return createmode == NFS4_CREATE_EXCLUSIVE || + createmode == NFS4_CREATE_EXCLUSIVE4_1; +} + +static __be32 +nfsd4_vfs_create(struct svc_fh *fhp, struct dentry *child, + struct nfsd4_open *open) +{ + struct file *filp; + struct path path; + int oflags; + + oflags = O_CREAT | O_LARGEFILE; + switch (open->op_share_access & NFS4_SHARE_ACCESS_BOTH) { + case NFS4_SHARE_ACCESS_WRITE: + oflags |= O_WRONLY; + break; + case NFS4_SHARE_ACCESS_BOTH: + oflags |= O_RDWR; + break; + default: + oflags |= O_RDONLY; + } + + path.mnt = fhp->fh_export->ex_path.mnt; + path.dentry = child; + filp = dentry_create(&path, oflags, open->op_iattr.ia_mode, + current_cred()); + if (IS_ERR(filp)) + return nfserrno(PTR_ERR(filp)); + + open->op_filp = filp; + return nfs_ok; +} + +/* + * Implement NFSv4's unchecked, guarded, and exclusive create + * semantics for regular files. Open state for this new file is + * subsequently fabricated in nfsd4_process_open2(). + * + * Upon return, caller must release @fhp and @resfhp. + */ +static __be32 +nfsd4_create_file(struct svc_rqst *rqstp, struct svc_fh *fhp, + struct svc_fh *resfhp, struct nfsd4_open *open) +{ + struct iattr *iap = &open->op_iattr; + struct nfsd_attrs attrs = { + .na_iattr = iap, + .na_seclabel = &open->op_label, + }; + struct dentry *parent, *child; + __u32 v_mtime, v_atime; + struct inode *inode; + __be32 status; + int host_err; + + if (isdotent(open->op_fname, open->op_fnamelen)) + return nfserr_exist; + if (!(iap->ia_valid & ATTR_MODE)) + iap->ia_mode = 0; + + status = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_EXEC); + if (status != nfs_ok) + return status; + parent = fhp->fh_dentry; + inode = d_inode(parent); + + host_err = fh_want_write(fhp); + if (host_err) + return nfserrno(host_err); + + if (is_create_with_attrs(open)) + nfsd4_acl_to_attr(NF4REG, open->op_acl, &attrs); + + child = start_creating(&nop_mnt_idmap, parent, + &QSTR_LEN(open->op_fname, open->op_fnamelen)); + if (IS_ERR(child)) { + status = nfserrno(PTR_ERR(child)); + goto out_write; + } + + if (d_really_is_negative(child)) { + status = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_CREATE); + if (status != nfs_ok) + goto out; + } + + status = fh_compose(resfhp, fhp->fh_export, child, fhp); + if (status != nfs_ok) + goto out; + + v_mtime = 0; + v_atime = 0; + if (nfsd4_create_is_exclusive(open->op_createmode)) { + u32 *verifier = (u32 *)open->op_verf.data; + + /* + * Solaris 7 gets confused (bugid 4218508) if these have + * the high bit set, as do xfs filesystems without the + * "bigtime" feature. So just clear the high bits. If this + * is ever changed to use different attrs for storing the + * verifier, then do_open_lookup() will also need to be + * fixed accordingly. + */ + v_mtime = verifier[0] & 0x7fffffff; + v_atime = verifier[1] & 0x7fffffff; + } + + if (d_really_is_positive(child)) { + /* NFSv4 protocol requires change attributes even though + * no change happened. 
+ */ + status = fh_fill_both_attrs(fhp); + if (status != nfs_ok) + goto out; + + switch (open->op_createmode) { + case NFS4_CREATE_UNCHECKED: + if (!d_is_reg(child)) + break; + + /* + * In NFSv4, we don't want to truncate the file + * now. This would be wrong if the OPEN fails for + * some other reason. Furthermore, if the size is + * nonzero, we should ignore it according to spec! + */ + open->op_truncate = (iap->ia_valid & ATTR_SIZE) && + !iap->ia_size; + break; + case NFS4_CREATE_GUARDED: + status = nfserr_exist; + break; + case NFS4_CREATE_EXCLUSIVE: + if (inode_get_mtime_sec(d_inode(child)) == v_mtime && + inode_get_atime_sec(d_inode(child)) == v_atime && + d_inode(child)->i_size == 0) { + open->op_created = true; + break; /* subtle */ + } + status = nfserr_exist; + break; + case NFS4_CREATE_EXCLUSIVE4_1: + if (inode_get_mtime_sec(d_inode(child)) == v_mtime && + inode_get_atime_sec(d_inode(child)) == v_atime && + d_inode(child)->i_size == 0) { + open->op_created = true; + goto set_attr; /* subtle */ + } + status = nfserr_exist; + } + goto out; + } + + if (!IS_POSIXACL(inode)) + iap->ia_mode &= ~current_umask(); + + status = fh_fill_pre_attrs(fhp); + if (status != nfs_ok) + goto out; + status = nfsd4_vfs_create(fhp, child, open); + if (status != nfs_ok) + goto out; + open->op_created = true; + fh_fill_post_attrs(fhp); + + /* A newly created file already has a file size of zero. */ + if ((iap->ia_valid & ATTR_SIZE) && (iap->ia_size == 0)) + iap->ia_valid &= ~ATTR_SIZE; + if (nfsd4_create_is_exclusive(open->op_createmode)) { + iap->ia_valid = ATTR_MTIME | ATTR_ATIME | + ATTR_MTIME_SET|ATTR_ATIME_SET; + iap->ia_mtime.tv_sec = v_mtime; + iap->ia_atime.tv_sec = v_atime; + iap->ia_mtime.tv_nsec = 0; + iap->ia_atime.tv_nsec = 0; + } + +set_attr: + status = nfsd_create_setattr(rqstp, fhp, resfhp, &attrs); + + if (attrs.na_labelerr) + open->op_bmval[2] &= ~FATTR4_WORD2_SECURITY_LABEL; + if (attrs.na_aclerr) + open->op_bmval[0] &= ~FATTR4_WORD0_ACL; +out: + end_creating(child); + nfsd_attrs_free(&attrs); +out_write: + fh_drop_write(fhp); + return status; +} + +/** + * set_change_info - set up the change_info4 for a reply + * @cinfo: pointer to nfsd4_change_info to be populated + * @fhp: pointer to svc_fh to use as source + * + * Many operations in NFSv4 require change_info4 in the reply. This function + * populates that from the info that we (should!) have already collected. In + * the event that we didn't get any pre-attrs, just zero out both. + */ +static void +set_change_info(struct nfsd4_change_info *cinfo, struct svc_fh *fhp) +{ + cinfo->atomic = (u32)(fhp->fh_pre_saved && fhp->fh_post_saved && !fhp->fh_no_atomic_attr); + cinfo->before_change = fhp->fh_pre_change; + cinfo->after_change = fhp->fh_post_change; + + /* + * If fetching the pre-change attributes failed, then we should + * have already failed the whole operation. We could have still + * failed to fetch post-change attributes however. + * + * If we didn't get post-op attrs, just zero-out the after + * field since we don't know what it should be. If the pre_saved + * field isn't set for some reason, throw warning and just copy + * whatever is in the after field. 
+ */ + if (WARN_ON_ONCE(!fhp->fh_pre_saved)) + cinfo->before_change = 0; + if (!fhp->fh_post_saved) + cinfo->after_change = cinfo->before_change + 1; +} + static __be32 do_open_lookup(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_open *open, struct svc_fh **resfh) { @@ -232,7 +426,7 @@ do_open_lookup(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, stru if (!*resfh) return nfserr_jukebox; fh_init(*resfh, NFS4_FHSIZE); - open->op_truncate = 0; + open->op_truncate = false; if (open->op_create) { /* FIXME: check session persistence and pnfs flags. @@ -250,47 +444,33 @@ do_open_lookup(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, stru * yes | yes | GUARDED4 | GUARDED4 */ - /* - * Note: create modes (UNCHECKED,GUARDED...) are the same - * in NFSv4 as in v3 except EXCLUSIVE4_1. - */ current->fs->umask = open->op_umask; - status = do_nfsd_create(rqstp, current_fh, open->op_fname.data, - open->op_fname.len, &open->op_iattr, - *resfh, open->op_createmode, - (u32 *)open->op_verf.data, - &open->op_truncate, &open->op_created); + status = nfsd4_create_file(rqstp, current_fh, *resfh, open); current->fs->umask = 0; - if (!status && open->op_label.len) - nfsd4_security_inode_setsecctx(*resfh, &open->op_label, open->op_bmval); - /* * Following rfc 3530 14.2.16, and rfc 5661 18.16.4 * use the returned bitmask to indicate which attributes * we used to store the verifier: */ - if (nfsd_create_is_exclusive(open->op_createmode) && status == 0) + if (nfsd4_create_is_exclusive(open->op_createmode) && status == 0) open->op_bmval[1] |= (FATTR4_WORD1_TIME_ACCESS | FATTR4_WORD1_TIME_MODIFY); - } else - /* - * Note this may exit with the parent still locked. - * We will hold the lock until nfsd4_open's final - * lookup, to prevent renames or unlinks until we've had - * a chance to an acquire a delegation if appropriate. - */ + } else { status = nfsd_lookup(rqstp, current_fh, - open->op_fname.data, open->op_fname.len, *resfh); + open->op_fname, open->op_fnamelen, *resfh); + if (status == nfs_ok) + /* NFSv4 protocol requires change attributes even though + * no change happened. 
+ */ + status = fh_fill_both_attrs(current_fh); + } if (status) goto out; - status = nfsd_check_obj_isreg(*resfh); + status = nfsd_check_obj_isreg(*resfh, cstate->minorversion); if (status) goto out; - if (is_create_with_attrs(open) && open->op_acl != NULL) - do_set_nfs4_acl(rqstp, *resfh, open->op_acl, open->op_bmval); - nfsd4_set_open_owner_reply_cache(cstate, open, *resfh); accmode = NFSD_MAY_NOP; if (open->op_created || @@ -306,7 +486,6 @@ static __be32 do_open_fhandle(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_open *open) { struct svc_fh *current_fh = &cstate->current_fh; - __be32 status; int accmode = 0; /* We don't know the target directory, and therefore can not @@ -331,9 +510,7 @@ do_open_fhandle(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, str if (open->op_claim_type == NFS4_OPEN_CLAIM_DELEG_CUR_FH) accmode = NFSD_MAY_OWNER_OVERRIDE; - status = do_open_permission(rqstp, current_fh, open, accmode); - - return status; + return do_open_permission(rqstp, current_fh, open, accmode); } static void @@ -358,21 +535,23 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, bool reclaim = false; dprintk("NFSD: nfsd4_open filename %.*s op_openowner %p\n", - (int)open->op_fname.len, open->op_fname.data, + (int)open->op_fnamelen, open->op_fname, open->op_openowner); + open->op_filp = NULL; + open->op_rqstp = rqstp; + /* This check required by spec. */ if (open->op_create && open->op_claim_type != NFS4_OPEN_CLAIM_NULL) return nfserr_inval; - open->op_created = 0; + open->op_created = false; /* * RFC5661 18.51.3 * Before RECLAIM_COMPLETE done, server should deny new lock */ if (nfsd4_has_session(cstate) && - !test_bit(NFSD4_CLIENT_RECLAIM_COMPLETE, - &cstate->session->se_client->cl_flags) && + !test_bit(NFSD4_CLIENT_RECLAIM_COMPLETE, &cstate->clp->cl_flags) && open->op_claim_type != NFS4_OPEN_CLAIM_PREVIOUS) return nfserr_grace; @@ -414,50 +593,46 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, goto out; switch (open->op_claim_type) { - case NFS4_OPEN_CLAIM_DELEGATE_CUR: - case NFS4_OPEN_CLAIM_NULL: - status = do_open_lookup(rqstp, cstate, open, &resfh); - if (status) - goto out; - break; - case NFS4_OPEN_CLAIM_PREVIOUS: - status = nfs4_check_open_reclaim(&open->op_clientid, - cstate, nn); - if (status) - goto out; - open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED; - reclaim = true; - case NFS4_OPEN_CLAIM_FH: - case NFS4_OPEN_CLAIM_DELEG_CUR_FH: - status = do_open_fhandle(rqstp, cstate, open); - if (status) - goto out; - resfh = &cstate->current_fh; - break; - case NFS4_OPEN_CLAIM_DELEG_PREV_FH: - case NFS4_OPEN_CLAIM_DELEGATE_PREV: - dprintk("NFSD: unsupported OPEN claim type %d\n", - open->op_claim_type); - status = nfserr_notsupp; + case NFS4_OPEN_CLAIM_DELEGATE_CUR: + case NFS4_OPEN_CLAIM_NULL: + status = do_open_lookup(rqstp, cstate, open, &resfh); + if (status) goto out; - default: - dprintk("NFSD: Invalid OPEN claim type %d\n", - open->op_claim_type); - status = nfserr_inval; + break; + case NFS4_OPEN_CLAIM_PREVIOUS: + status = nfs4_check_open_reclaim(cstate->clp); + if (status) + goto out; + open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED; + reclaim = true; + fallthrough; + case NFS4_OPEN_CLAIM_FH: + case NFS4_OPEN_CLAIM_DELEG_CUR_FH: + status = do_open_fhandle(rqstp, cstate, open); + if (status) goto out; + resfh = &cstate->current_fh; + break; + case NFS4_OPEN_CLAIM_DELEG_PREV_FH: + case NFS4_OPEN_CLAIM_DELEGATE_PREV: + status = nfserr_notsupp; + goto out; + default: + status = 
nfserr_inval; + goto out; } - /* - * nfsd4_process_open2() does the actual opening of the file. If - * successful, it (1) truncates the file if open->op_truncate was - * set, (2) sets open->op_stateid, (3) sets open->op_delegation. - */ + status = nfsd4_process_open2(rqstp, resfh, open); - WARN(status && open->op_created, - "nfsd4_process_open2 failed to open newly-created file! status=%u\n", - be32_to_cpu(status)); + if (status && open->op_created) + pr_warn("nfsd4_process_open2 failed to open newly-created file: status=%u\n", + be32_to_cpu(status)); if (reclaim && !status) nn->somebody_reclaimed = true; out: + if (open->op_filp) { + fput(open->op_filp); + open->op_filp = NULL; + } if (resfh && resfh != &cstate->current_fh) { fh_dup2(&cstate->current_fh, resfh); fh_put(resfh); @@ -502,23 +677,29 @@ nfsd4_putfh(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) { struct nfsd4_putfh *putfh = &u->putfh; + __be32 ret; fh_put(&cstate->current_fh); cstate->current_fh.fh_handle.fh_size = putfh->pf_fhlen; - memcpy(&cstate->current_fh.fh_handle.fh_base, putfh->pf_fhval, + memcpy(&cstate->current_fh.fh_handle.fh_raw, putfh->pf_fhval, putfh->pf_fhlen); - return fh_verify(rqstp, &cstate->current_fh, 0, NFSD_MAY_BYPASS_GSS); + ret = fh_verify(rqstp, &cstate->current_fh, 0, NFSD_MAY_BYPASS_GSS); +#ifdef CONFIG_NFSD_V4_2_INTER_SSC + if (ret == nfserr_stale && putfh->no_verify) { + SET_FH_FLAG(&cstate->current_fh, NFSD4_FH_FOREIGN); + ret = 0; + } +#endif + return ret; } static __be32 nfsd4_putrootfh(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) { - __be32 status; - fh_put(&cstate->current_fh); - status = exp_pseudoroot(rqstp, &cstate->current_fh); - return status; + + return exp_pseudoroot(rqstp, &cstate->current_fh); } static __be32 @@ -529,9 +710,9 @@ nfsd4_restorefh(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, return nfserr_restorefh; fh_dup2(&cstate->current_fh, &cstate->save_fh); - if (HAS_STATE_ID(cstate, SAVED_STATE_ID_FLAG)) { + if (HAS_CSTATE_FLAG(cstate, SAVED_STATE_ID_FLAG)) { memcpy(&cstate->current_stateid, &cstate->save_stateid, sizeof(stateid_t)); - SET_STATE_ID(cstate, CURRENT_STATE_ID_FLAG); + SET_CSTATE_FLAG(cstate, CURRENT_STATE_ID_FLAG); } return nfs_ok; } @@ -541,9 +722,9 @@ nfsd4_savefh(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) { fh_dup2(&cstate->save_fh, &cstate->current_fh); - if (HAS_STATE_ID(cstate, CURRENT_STATE_ID_FLAG)) { + if (HAS_CSTATE_FLAG(cstate, CURRENT_STATE_ID_FLAG)) { memcpy(&cstate->save_stateid, &cstate->current_stateid, sizeof(stateid_t)); - SET_STATE_ID(cstate, SAVED_STATE_ID_FLAG); + SET_CSTATE_FLAG(cstate, SAVED_STATE_ID_FLAG); } return nfs_ok; } @@ -556,8 +737,14 @@ nfsd4_access(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) { struct nfsd4_access *access = &u->access; + u32 access_full; + + access_full = NFS3_ACCESS_FULL; + if (cstate->minorversion >= 2) + access_full |= NFS4_ACCESS_XALIST | NFS4_ACCESS_XAREAD | + NFS4_ACCESS_XAWRITE; - if (access->ac_req_access & ~NFS3_ACCESS_FULL) + if (access->ac_req_access & ~access_full) return nfserr_inval; access->ac_resp_access = access->ac_req_access; @@ -565,30 +752,24 @@ nfsd4_access(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, &access->ac_supported); } -static void gen_boot_verifier(nfs4_verifier *verifier, struct net *net) -{ - __be32 verf[2]; - struct nfsd_net *nn = net_generic(net, nfsd_net_id); - - /* - * This is opaque to client, so no 
need to byte-swap. Use - * __force to keep sparse happy. y2038 time_t overflow is - * irrelevant in this usage. - */ - verf[0] = (__force __be32)nn->nfssvc_boot.tv_sec; - verf[1] = (__force __be32)nn->nfssvc_boot.tv_nsec; - memcpy(verifier->data, verf, sizeof(verifier->data)); -} - static __be32 nfsd4_commit(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) { struct nfsd4_commit *commit = &u->commit; + struct nfsd_file *nf; + __be32 status; + + status = nfsd_file_acquire(rqstp, &cstate->current_fh, NFSD_MAY_WRITE | + NFSD_MAY_NOT_BREAK_LEASE, &nf); + if (status != nfs_ok) + return status; - gen_boot_verifier(&commit->co_verf, SVC_NET(rqstp)); - return nfsd_commit(rqstp, &cstate->current_fh, commit->co_offset, - commit->co_count); + status = nfsd_commit(rqstp, &cstate->current_fh, nf, commit->co_offset, + commit->co_count, + (__be32 *)commit->co_verf.data); + nfsd_file_put(nf); + return status; } static __be32 @@ -596,6 +777,10 @@ nfsd4_create(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) { struct nfsd4_create *create = &u->create; + struct nfsd_attrs attrs = { + .na_iattr = &create->cr_iattr, + .na_seclabel = &create->cr_label, + }; struct svc_fh resfh; __be32 status; dev_t rdev; @@ -611,12 +796,13 @@ nfsd4_create(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (status) return status; + status = nfsd4_acl_to_attr(create->cr_type, create->cr_acl, &attrs); current->fs->umask = create->cr_umask; switch (create->cr_type) { case NF4LNK: status = nfsd_symlink(rqstp, &cstate->current_fh, create->cr_name, create->cr_namelen, - create->cr_data, &resfh); + create->cr_data, &attrs, &resfh); break; case NF4BLK: @@ -627,7 +813,7 @@ nfsd4_create(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, goto out_umask; status = nfsd_create(rqstp, &cstate->current_fh, create->cr_name, create->cr_namelen, - &create->cr_iattr, S_IFBLK, rdev, &resfh); + &attrs, S_IFBLK, rdev, &resfh); break; case NF4CHR: @@ -638,26 +824,26 @@ nfsd4_create(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, goto out_umask; status = nfsd_create(rqstp, &cstate->current_fh, create->cr_name, create->cr_namelen, - &create->cr_iattr,S_IFCHR, rdev, &resfh); + &attrs, S_IFCHR, rdev, &resfh); break; case NF4SOCK: status = nfsd_create(rqstp, &cstate->current_fh, create->cr_name, create->cr_namelen, - &create->cr_iattr, S_IFSOCK, 0, &resfh); + &attrs, S_IFSOCK, 0, &resfh); break; case NF4FIFO: status = nfsd_create(rqstp, &cstate->current_fh, create->cr_name, create->cr_namelen, - &create->cr_iattr, S_IFIFO, 0, &resfh); + &attrs, S_IFIFO, 0, &resfh); break; case NF4DIR: create->cr_iattr.ia_valid &= ~ATTR_SIZE; status = nfsd_create(rqstp, &cstate->current_fh, create->cr_name, create->cr_namelen, - &create->cr_iattr, S_IFDIR, 0, &resfh); + &attrs, S_IFDIR, 0, &resfh); break; default: @@ -667,20 +853,17 @@ nfsd4_create(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (status) goto out; - if (create->cr_label.len) - nfsd4_security_inode_setsecctx(&resfh, &create->cr_label, create->cr_bmval); - - if (create->cr_acl != NULL) - do_set_nfs4_acl(rqstp, &resfh, create->cr_acl, - create->cr_bmval); - - fh_unlock(&cstate->current_fh); + if (attrs.na_labelerr) + create->cr_bmval[2] &= ~FATTR4_WORD2_SECURITY_LABEL; + if (attrs.na_aclerr) + create->cr_bmval[0] &= ~FATTR4_WORD0_ACL; set_change_info(&create->cr_cinfo, &cstate->current_fh); fh_dup2(&cstate->current_fh, &resfh); out: fh_put(&resfh); out_umask: current->fs->umask = 0; + 
nfsd_attrs_free(&attrs); return status; } @@ -691,6 +874,8 @@ nfsd4_getattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_getattr *getattr = &u->getattr; __be32 status; + trace_nfsd_vfs_getattr(rqstp, &cstate->current_fh); + status = fh_verify(rqstp, &cstate->current_fh, 0, NFSD_MAY_NOP); if (status) return status; @@ -760,13 +945,17 @@ nfsd4_read(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_read *read = &u->read; __be32 status; - read->rd_filp = NULL; - if (read->rd_offset >= OFFSET_MAX) - return nfserr_inval; + read->rd_nf = NULL; trace_nfsd_read_start(rqstp, &cstate->current_fh, read->rd_offset, read->rd_length); + read->rd_length = min_t(u32, read->rd_length, svc_max_payload(rqstp)); + if (read->rd_offset > (u64)OFFSET_MAX) + read->rd_offset = (u64)OFFSET_MAX; + if (read->rd_offset + read->rd_length > (u64)OFFSET_MAX) + read->rd_length = (u64)OFFSET_MAX - read->rd_offset; + /* * If we do a zero copy read, then a client will see read data * that reflects the state of the file *after* performing the @@ -775,19 +964,17 @@ nfsd4_read(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, * To ensure proper ordering, we therefore turn off zero copy if * the client wants us to do more in this compound: */ - if (!nfsd4_last_compound_op(rqstp)) - clear_bit(RQ_SPLICE_OK, &rqstp->rq_flags); + if (!nfsd4_last_compound_op(rqstp)) { + struct nfsd4_compoundargs *argp = rqstp->rq_argp; + + argp->splice_ok = false; + } /* check stateid */ status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh, &read->rd_stateid, RD_STATE, - &read->rd_filp, &read->rd_tmp_file); - if (status) { - dprintk("NFSD: nfsd4_read: couldn't process stateid!\n"); - goto out; - } - status = nfs_ok; -out: + &read->rd_nf, NULL); + read->rd_rqstp = rqstp; read->rd_fhp = &cstate->current_fh; return status; @@ -797,10 +984,11 @@ out: static void nfsd4_read_release(union nfsd4_op_u *u) { - if (u->read.rd_filp) - fput(u->read.rd_filp); - trace_nfsd_read_done(u->read.rd_rqstp, u->read.rd_fhp, - u->read.rd_offset, u->read.rd_length); + if (u->read.rd_nf) { + trace_nfsd_read_done(u->read.rd_rqstp, u->read.rd_fhp, + u->read.rd_offset, u->read.rd_length); + nfsd_file_put(u->read.rd_nf); + } } static __be32 @@ -811,6 +999,9 @@ nfsd4_readdir(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, u64 cookie = readdir->rd_cookie; static const nfs4_verifier zeroverf; + trace_nfsd_vfs_readdir(rqstp, &cstate->current_fh, + readdir->rd_maxcount, readdir->rd_cookie); + /* no need to check permission - this will be done in nfsd_readdir() */ if (readdir->rd_bmval[1] & NFSD_WRITEONLY_ATTRS_WORD1) @@ -849,10 +1040,8 @@ nfsd4_remove(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, return nfserr_grace; status = nfsd_unlink(rqstp, &cstate->current_fh, 0, remove->rm_name, remove->rm_namelen); - if (!status) { - fh_unlock(&cstate->current_fh); + if (!status) set_change_info(&remove->rm_cinfo, &cstate->current_fh); - } return status; } @@ -870,8 +1059,8 @@ nfsd4_rename(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, rename->rn_tname, rename->rn_tnamelen); if (status) return status; - set_change_info(&rename->rn_sinfo, &cstate->current_fh); - set_change_info(&rename->rn_tinfo, &cstate->save_fh); + set_change_info(&rename->rn_sinfo, &cstate->save_fh); + set_change_info(&rename->rn_tinfo, &cstate->current_fh); return nfs_ok; } @@ -892,7 +1081,6 @@ nfsd4_secinfo(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, &exp, &dentry); if (err) return 
err; - fh_unlock(&cstate->current_fh); if (d_really_is_negative(dentry)) { exp_put(exp); err = nfserr_noent; @@ -942,23 +1130,83 @@ nfsd4_secinfo_no_name_release(union nfsd4_op_u *u) exp_put(u->secinfo_no_name.sin_exp); } +/* + * Validate that the requested timestamps are within the acceptable range. If + * timestamp appears to be in the future, then it will be clamped to + * current_time(). + */ +static void +vet_deleg_attrs(struct nfsd4_setattr *setattr, struct nfs4_delegation *dp) +{ + struct timespec64 now = current_time(dp->dl_stid.sc_file->fi_inode); + struct iattr *iattr = &setattr->sa_iattr; + + if ((setattr->sa_bmval[2] & FATTR4_WORD2_TIME_DELEG_ACCESS) && + !nfsd4_vet_deleg_time(&iattr->ia_atime, &dp->dl_atime, &now)) + iattr->ia_valid &= ~(ATTR_ATIME | ATTR_ATIME_SET); + + if (setattr->sa_bmval[2] & FATTR4_WORD2_TIME_DELEG_MODIFY) { + if (nfsd4_vet_deleg_time(&iattr->ia_mtime, &dp->dl_mtime, &now)) { + iattr->ia_ctime = iattr->ia_mtime; + if (nfsd4_vet_deleg_time(&iattr->ia_ctime, &dp->dl_ctime, &now)) + dp->dl_setattr = true; + else + iattr->ia_valid &= ~(ATTR_CTIME | ATTR_CTIME_SET); + } else { + iattr->ia_valid &= ~(ATTR_CTIME | ATTR_CTIME_SET | + ATTR_MTIME | ATTR_MTIME_SET); + } + } +} + static __be32 nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) { struct nfsd4_setattr *setattr = &u->setattr; + struct nfsd_attrs attrs = { + .na_iattr = &setattr->sa_iattr, + .na_seclabel = &setattr->sa_label, + }; + bool save_no_wcc, deleg_attrs; + struct nfs4_stid *st = NULL; + struct inode *inode; __be32 status = nfs_ok; int err; - if (setattr->sa_iattr.ia_valid & ATTR_SIZE) { + deleg_attrs = setattr->sa_bmval[2] & (FATTR4_WORD2_TIME_DELEG_ACCESS | + FATTR4_WORD2_TIME_DELEG_MODIFY); + + if (deleg_attrs || (setattr->sa_iattr.ia_valid & ATTR_SIZE)) { + int flags = WR_STATE; + + if (setattr->sa_bmval[2] & FATTR4_WORD2_TIME_DELEG_ACCESS) + flags |= RD_STATE; + status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh, &setattr->sa_stateid, - WR_STATE, NULL, NULL); - if (status) { - dprintk("NFSD: nfsd4_setattr: couldn't process stateid!\n"); + flags, NULL, &st); + if (status) return status; + } + + if (deleg_attrs) { + status = nfserr_bad_stateid; + if (st->sc_type & SC_TYPE_DELEG) { + struct nfs4_delegation *dp = delegstateid(st); + + /* Only for *_ATTRS_DELEG flavors */ + if (deleg_attrs_deleg(dp->dl_type)) { + vet_deleg_attrs(setattr, dp); + status = nfs_ok; + } } } + if (st) + nfs4_put_stid(st); + if (status) + return status; + err = fh_want_write(&cstate->current_fh); if (err) return nfserrno(err); @@ -969,58 +1217,73 @@ nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (status) goto out; - if (setattr->sa_acl != NULL) - status = nfsd4_set_nfs4_acl(rqstp, &cstate->current_fh, - setattr->sa_acl); - if (status) - goto out; - if (setattr->sa_label.len) - status = nfsd4_set_nfs4_label(rqstp, &cstate->current_fh, - &setattr->sa_label); + inode = cstate->current_fh.fh_dentry->d_inode; + status = nfsd4_acl_to_attr(S_ISDIR(inode->i_mode) ? 
NF4DIR : NF4REG, + setattr->sa_acl, &attrs); + if (status) goto out; - status = nfsd_setattr(rqstp, &cstate->current_fh, &setattr->sa_iattr, - 0, (time_t)0); + save_no_wcc = cstate->current_fh.fh_no_wcc; + cstate->current_fh.fh_no_wcc = true; + status = nfsd_setattr(rqstp, &cstate->current_fh, &attrs, NULL); + cstate->current_fh.fh_no_wcc = save_no_wcc; + if (!status) + status = nfserrno(attrs.na_labelerr); + if (!status) + status = nfserrno(attrs.na_aclerr); out: + nfsd_attrs_free(&attrs); fh_drop_write(&cstate->current_fh); return status; } +static void nfsd4_file_mark_deleg_written(struct nfs4_file *fi) +{ + spin_lock(&fi->fi_lock); + if (!list_empty(&fi->fi_delegations)) { + struct nfs4_delegation *dp = list_first_entry(&fi->fi_delegations, + struct nfs4_delegation, dl_perfile); + + if (dp->dl_type == OPEN_DELEGATE_WRITE_ATTRS_DELEG) + dp->dl_written = true; + } + spin_unlock(&fi->fi_lock); +} + static __be32 nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) { struct nfsd4_write *write = &u->write; stateid_t *stateid = &write->wr_stateid; - struct file *filp = NULL; + struct nfs4_stid *stid = NULL; + struct nfsd_file *nf = NULL; __be32 status = nfs_ok; unsigned long cnt; - int nvecs; - if (write->wr_offset >= OFFSET_MAX) - return nfserr_inval; + if (write->wr_offset > (u64)OFFSET_MAX || + write->wr_offset + write->wr_buflen > (u64)OFFSET_MAX) + return nfserr_fbig; cnt = write->wr_buflen; trace_nfsd_write_start(rqstp, &cstate->current_fh, write->wr_offset, cnt); status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh, - stateid, WR_STATE, &filp, NULL); - if (status) { - dprintk("NFSD: nfsd4_write: couldn't process stateid!\n"); + stateid, WR_STATE, &nf, &stid); + if (status) return status; + + if (stid) { + nfsd4_file_mark_deleg_written(stid->sc_file); + nfs4_put_stid(stid); } write->wr_how_written = write->wr_stable_how; - gen_boot_verifier(&write->wr_verifier, SVC_NET(rqstp)); - - nvecs = svc_fill_write_vector(rqstp, write->wr_pagelist, - &write->wr_head, write->wr_buflen); - WARN_ON_ONCE(nvecs > ARRAY_SIZE(rqstp->rq_vec)); - - status = nfsd_vfs_write(rqstp, &cstate->current_fh, filp, - write->wr_offset, rqstp->rq_vec, nvecs, &cnt, - write->wr_how_written); - fput(filp); + status = nfsd_vfs_write(rqstp, &cstate->current_fh, nf, + write->wr_offset, &write->wr_payload, + &cnt, write->wr_how_written, + (__be32 *)write->wr_verifier.data); + nfsd_file_put(nf); write->wr_bytes_written = cnt; trace_nfsd_write_done(rqstp, &cstate->current_fh, @@ -1030,8 +1293,8 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, static __be32 nfsd4_verify_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, - stateid_t *src_stateid, struct file **src, - stateid_t *dst_stateid, struct file **dst) + stateid_t *src_stateid, struct nfsd_file **src, + stateid_t *dst_stateid, struct nfsd_file **dst) { __be32 status; @@ -1040,21 +1303,17 @@ nfsd4_verify_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->save_fh, src_stateid, RD_STATE, src, NULL); - if (status) { - dprintk("NFSD: %s: couldn't process src stateid!\n", __func__); + if (status) goto out; - } status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh, dst_stateid, WR_STATE, dst, NULL); - if (status) { - dprintk("NFSD: %s: couldn't process dst stateid!\n", __func__); + if (status) goto out_put_src; - } /* fix up for NFS-specific error code */ - if 
(!S_ISREG(file_inode(*src)->i_mode) || - !S_ISREG(file_inode(*dst)->i_mode)) { + if (!S_ISREG(file_inode((*src)->nf_file)->i_mode) || + !S_ISREG(file_inode((*dst)->nf_file)->i_mode)) { status = nfserr_wrong_type; goto out_put_dst; } @@ -1062,9 +1321,11 @@ nfsd4_verify_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, out: return status; out_put_dst: - fput(*dst); + nfsd_file_put(*dst); + *dst = NULL; out_put_src: - fput(*src); + nfsd_file_put(*src); + *src = NULL; goto out; } @@ -1073,7 +1334,7 @@ nfsd4_clone(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) { struct nfsd4_clone *clone = &u->clone; - struct file *src, *dst; + struct nfsd_file *src, *dst; __be32 status; status = nfsd4_verify_copy(rqstp, cstate, &clone->cl_src_stateid, &src, @@ -1081,44 +1342,101 @@ nfsd4_clone(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (status) goto out; - status = nfsd4_clone_file_range(src, clone->cl_src_pos, - dst, clone->cl_dst_pos, clone->cl_count); + status = nfsd4_clone_file_range(rqstp, src, clone->cl_src_pos, + dst, clone->cl_dst_pos, clone->cl_count, + EX_ISSYNC(cstate->current_fh.fh_export)); - fput(dst); - fput(src); + nfsd_file_put(dst); + nfsd_file_put(src); out: return status; } -void nfs4_put_copy(struct nfsd4_copy *copy) +/** + * nfsd4_has_active_async_copies - Check for ongoing copy operations + * @clp: Client to be checked + * + * NFSD maintains state for async COPY operations after they complete, + * and this state remains in the nfs4_client's async_copies list. + * Ongoing copies should block the destruction of the nfs4_client, but + * completed copies should not. + * + * Return values: + * %true: At least one active async COPY is ongoing + * %false: No active async COPY operations were found + */ +bool nfsd4_has_active_async_copies(struct nfs4_client *clp) { - if (!refcount_dec_and_test(©->refcount)) - return; - kfree(copy); + struct nfsd4_copy *copy; + bool result = false; + + spin_lock(&clp->async_lock); + list_for_each_entry(copy, &clp->async_copies, copies) { + if (!test_bit(NFSD4_COPY_F_COMPLETED, ©->cp_flags) && + !test_bit(NFSD4_COPY_F_STOPPED, ©->cp_flags)) { + result = true; + break; + } + } + spin_unlock(&clp->async_lock); + return result; } -static bool -check_and_set_stop_copy(struct nfsd4_copy *copy) +/** + * nfsd4_async_copy_reaper - Purge completed copies + * @nn: Network namespace with possible active copy information + */ +void nfsd4_async_copy_reaper(struct nfsd_net *nn) { - bool value; + struct nfs4_client *clp; + struct nfsd4_copy *copy; + LIST_HEAD(reaplist); + + spin_lock(&nn->client_lock); + list_for_each_entry(clp, &nn->client_lru, cl_lru) { + struct list_head *pos, *next; + + spin_lock(&clp->async_lock); + list_for_each_safe(pos, next, &clp->async_copies) { + copy = list_entry(pos, struct nfsd4_copy, copies); + if (test_bit(NFSD4_COPY_F_OFFLOAD_DONE, ©->cp_flags)) { + if (--copy->cp_ttl) { + list_del_init(©->copies); + list_add(©->copies, &reaplist); + } + } + } + spin_unlock(&clp->async_lock); + } + spin_unlock(&nn->client_lock); + + while (!list_empty(&reaplist)) { + copy = list_first_entry(&reaplist, struct nfsd4_copy, copies); + list_del_init(©->copies); + cleanup_async_copy(copy); + } +} - spin_lock(©->cp_clp->async_lock); - value = copy->stopped; - if (!copy->stopped) - copy->stopped = true; - spin_unlock(©->cp_clp->async_lock); - return value; +static void nfs4_put_copy(struct nfsd4_copy *copy) +{ + if (!refcount_dec_and_test(©->refcount)) + return; + kfree(copy->cp_src); + kfree(copy); 
} static void nfsd4_stop_copy(struct nfsd4_copy *copy) { - /* only 1 thread should stop the copy */ - if (!check_and_set_stop_copy(copy)) + trace_nfsd_copy_async_cancel(copy); + if (!test_and_set_bit(NFSD4_COPY_F_STOPPED, ©->cp_flags)) { kthread_stop(copy->copy_task); + copy->nfserr = nfs_ok; + set_bit(NFSD4_COPY_F_COMPLETED, ©->cp_flags); + } nfs4_put_copy(copy); } -static struct nfsd4_copy *nfsd4_get_copy(struct nfs4_client *clp) +static struct nfsd4_copy *nfsd4_unhash_copy(struct nfs4_client *clp) { struct nfsd4_copy *copy = NULL; @@ -1127,6 +1445,9 @@ static struct nfsd4_copy *nfsd4_get_copy(struct nfs4_client *clp) copy = list_first_entry(&clp->async_copies, struct nfsd4_copy, copies); refcount_inc(©->refcount); + copy->cp_clp = NULL; + if (!list_empty(©->copies)) + list_del_init(©->copies); } spin_unlock(&clp->async_lock); return copy; @@ -1136,63 +1457,382 @@ void nfsd4_shutdown_copy(struct nfs4_client *clp) { struct nfsd4_copy *copy; - while ((copy = nfsd4_get_copy(clp)) != NULL) + while ((copy = nfsd4_unhash_copy(clp)) != NULL) nfsd4_stop_copy(copy); } +#ifdef CONFIG_NFSD_V4_2_INTER_SSC + +extern struct file *nfs42_ssc_open(struct vfsmount *ss_mnt, + struct nfs_fh *src_fh, + nfs4_stateid *stateid); +extern void nfs42_ssc_close(struct file *filep); + +extern void nfs_sb_deactive(struct super_block *sb); + +#define NFSD42_INTERSSC_MOUNTOPS "vers=4.2,addr=%s,sec=sys" + +/* + * setup a work entry in the ssc delayed unmount list. + */ +static __be32 nfsd4_ssc_setup_dul(struct nfsd_net *nn, char *ipaddr, + struct nfsd4_ssc_umount_item **nsui, + struct svc_rqst *rqstp) +{ + struct nfsd4_ssc_umount_item *ni = NULL; + struct nfsd4_ssc_umount_item *work = NULL; + struct nfsd4_ssc_umount_item *tmp; + DEFINE_WAIT(wait); + __be32 status = 0; + + *nsui = NULL; + work = kzalloc(sizeof(*work), GFP_KERNEL); +try_again: + spin_lock(&nn->nfsd_ssc_lock); + list_for_each_entry_safe(ni, tmp, &nn->nfsd_ssc_mount_list, nsui_list) { + if (strncmp(ni->nsui_ipaddr, ipaddr, sizeof(ni->nsui_ipaddr))) + continue; + /* found a match */ + if (ni->nsui_busy) { + /* wait - and try again */ + prepare_to_wait(&nn->nfsd_ssc_waitq, &wait, TASK_IDLE); + spin_unlock(&nn->nfsd_ssc_lock); + + /* allow 20secs for mount/unmount for now - revisit */ + if (svc_thread_should_stop(rqstp) || + (schedule_timeout(20*HZ) == 0)) { + finish_wait(&nn->nfsd_ssc_waitq, &wait); + kfree(work); + return nfserr_eagain; + } + finish_wait(&nn->nfsd_ssc_waitq, &wait); + goto try_again; + } + *nsui = ni; + refcount_inc(&ni->nsui_refcnt); + spin_unlock(&nn->nfsd_ssc_lock); + kfree(work); + + /* return vfsmount in (*nsui)->nsui_vfsmount */ + return 0; + } + if (work) { + strscpy(work->nsui_ipaddr, ipaddr, sizeof(work->nsui_ipaddr)); + refcount_set(&work->nsui_refcnt, 2); + work->nsui_busy = true; + list_add_tail(&work->nsui_list, &nn->nfsd_ssc_mount_list); + *nsui = work; + } else + status = nfserr_resource; + spin_unlock(&nn->nfsd_ssc_lock); + return status; +} + +static void nfsd4_ssc_update_dul(struct nfsd_net *nn, + struct nfsd4_ssc_umount_item *nsui, + struct vfsmount *ss_mnt) +{ + spin_lock(&nn->nfsd_ssc_lock); + nsui->nsui_vfsmount = ss_mnt; + nsui->nsui_busy = false; + wake_up_all(&nn->nfsd_ssc_waitq); + spin_unlock(&nn->nfsd_ssc_lock); +} + +static void nfsd4_ssc_cancel_dul(struct nfsd_net *nn, + struct nfsd4_ssc_umount_item *nsui) +{ + spin_lock(&nn->nfsd_ssc_lock); + list_del(&nsui->nsui_list); + wake_up_all(&nn->nfsd_ssc_waitq); + spin_unlock(&nn->nfsd_ssc_lock); + kfree(nsui); +} + +/* + * Support one copy source server for now. 
+ */ +static __be32 +nfsd4_interssc_connect(struct nl4_server *nss, struct svc_rqst *rqstp, + struct nfsd4_ssc_umount_item **nsui) +{ + struct file_system_type *type; + struct vfsmount *ss_mnt; + struct nfs42_netaddr *naddr; + struct sockaddr_storage tmp_addr; + size_t tmp_addrlen, match_netid_len = 3; + char *startsep = "", *endsep = "", *match_netid = "tcp"; + char *ipaddr, *dev_name, *raw_data; + int len, raw_len; + __be32 status = nfserr_inval; + struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); + + naddr = &nss->u.nl4_addr; + tmp_addrlen = rpc_uaddr2sockaddr(SVC_NET(rqstp), naddr->addr, + naddr->addr_len, + (struct sockaddr *)&tmp_addr, + sizeof(tmp_addr)); + *nsui = NULL; + if (tmp_addrlen == 0) + goto out_err; + + if (tmp_addr.ss_family == AF_INET6) { + startsep = "["; + endsep = "]"; + match_netid = "tcp6"; + match_netid_len = 4; + } + + if (naddr->netid_len != match_netid_len || + strncmp(naddr->netid, match_netid, naddr->netid_len)) + goto out_err; + + /* Construct the raw data for the vfs_kern_mount call */ + len = RPC_MAX_ADDRBUFLEN + 1; + ipaddr = kzalloc(len, GFP_KERNEL); + if (!ipaddr) + goto out_err; + + rpc_ntop((struct sockaddr *)&tmp_addr, ipaddr, len); + + /* 2 for ipv6 endsep and startsep. 3 for ":/" and trailing '/0'*/ + + raw_len = strlen(NFSD42_INTERSSC_MOUNTOPS) + strlen(ipaddr); + raw_data = kzalloc(raw_len, GFP_KERNEL); + if (!raw_data) + goto out_free_ipaddr; + + snprintf(raw_data, raw_len, NFSD42_INTERSSC_MOUNTOPS, ipaddr); + + status = nfserr_nodev; + type = get_fs_type("nfs"); + if (!type) + goto out_free_rawdata; + + /* Set the server:<export> for the vfs_kern_mount call */ + dev_name = kzalloc(len + 5, GFP_KERNEL); + if (!dev_name) + goto out_free_rawdata; + snprintf(dev_name, len + 5, "%s%s%s:/", startsep, ipaddr, endsep); + + status = nfsd4_ssc_setup_dul(nn, ipaddr, nsui, rqstp); + if (status) + goto out_free_devname; + if ((*nsui)->nsui_vfsmount) + goto out_done; + + /* Use an 'internal' mount: SB_KERNMOUNT -> MNT_INTERNAL */ + ss_mnt = vfs_kern_mount(type, SB_KERNMOUNT, dev_name, raw_data); + module_put(type->owner); + if (IS_ERR(ss_mnt)) { + status = nfserr_nodev; + nfsd4_ssc_cancel_dul(nn, *nsui); + goto out_free_devname; + } + nfsd4_ssc_update_dul(nn, *nsui, ss_mnt); +out_done: + status = 0; + +out_free_devname: + kfree(dev_name); +out_free_rawdata: + kfree(raw_data); +out_free_ipaddr: + kfree(ipaddr); +out_err: + return status; +} + +/* + * Verify COPY destination stateid. + * + * Connect to the source server with NFSv4.1. + * Create the source struct file for nfsd_copy_range. 
+ * Called with COPY cstate: + * SAVED_FH: source filehandle + * CURRENT_FH: destination filehandle + */ +static __be32 +nfsd4_setup_inter_ssc(struct svc_rqst *rqstp, + struct nfsd4_compound_state *cstate, + struct nfsd4_copy *copy) +{ + struct svc_fh *s_fh = NULL; + stateid_t *s_stid = ©->cp_src_stateid; + __be32 status = nfserr_inval; + + /* Verify the destination stateid and set dst struct file*/ + status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh, + ©->cp_dst_stateid, + WR_STATE, ©->nf_dst, NULL); + if (status) + goto out; + + status = nfsd4_interssc_connect(copy->cp_src, rqstp, ©->ss_nsui); + if (status) + goto out; + + s_fh = &cstate->save_fh; + + copy->c_fh.size = s_fh->fh_handle.fh_size; + memcpy(copy->c_fh.data, &s_fh->fh_handle.fh_raw, copy->c_fh.size); + copy->stateid.seqid = cpu_to_be32(s_stid->si_generation); + memcpy(copy->stateid.other, (void *)&s_stid->si_opaque, + sizeof(stateid_opaque_t)); + + status = 0; +out: + return status; +} + +static void +nfsd4_cleanup_inter_ssc(struct nfsd4_ssc_umount_item *nsui, struct file *filp, + struct nfsd_file *dst) +{ + struct nfsd_net *nn = net_generic(dst->nf_net, nfsd_net_id); + long timeout = msecs_to_jiffies(nfsd4_ssc_umount_timeout); + + nfs42_ssc_close(filp); + fput(filp); + + spin_lock(&nn->nfsd_ssc_lock); + list_del(&nsui->nsui_list); + /* + * vfsmount can be shared by multiple exports, + * decrement refcnt. If the count drops to 1 it + * will be unmounted when nsui_expire expires. + */ + refcount_dec(&nsui->nsui_refcnt); + nsui->nsui_expire = jiffies + timeout; + list_add_tail(&nsui->nsui_list, &nn->nfsd_ssc_mount_list); + spin_unlock(&nn->nfsd_ssc_lock); +} + +#else /* CONFIG_NFSD_V4_2_INTER_SSC */ + +static __be32 +nfsd4_setup_inter_ssc(struct svc_rqst *rqstp, + struct nfsd4_compound_state *cstate, + struct nfsd4_copy *copy) +{ + return nfserr_inval; +} + +static void +nfsd4_cleanup_inter_ssc(struct nfsd4_ssc_umount_item *nsui, struct file *filp, + struct nfsd_file *dst) +{ +} + +static struct file *nfs42_ssc_open(struct vfsmount *ss_mnt, + struct nfs_fh *src_fh, + nfs4_stateid *stateid) +{ + return NULL; +} +#endif /* CONFIG_NFSD_V4_2_INTER_SSC */ + +static __be32 +nfsd4_setup_intra_ssc(struct svc_rqst *rqstp, + struct nfsd4_compound_state *cstate, + struct nfsd4_copy *copy) +{ + return nfsd4_verify_copy(rqstp, cstate, ©->cp_src_stateid, + ©->nf_src, ©->cp_dst_stateid, + ©->nf_dst); +} static void nfsd4_cb_offload_release(struct nfsd4_callback *cb) { - struct nfsd4_copy *copy = container_of(cb, struct nfsd4_copy, cp_cb); + struct nfsd4_cb_offload *cbo = + container_of(cb, struct nfsd4_cb_offload, co_cb); + struct nfsd4_copy *copy = + container_of(cbo, struct nfsd4_copy, cp_cb_offload); - nfs4_put_copy(copy); + set_bit(NFSD4_COPY_F_OFFLOAD_DONE, ©->cp_flags); } static int nfsd4_cb_offload_done(struct nfsd4_callback *cb, struct rpc_task *task) { + struct nfsd4_cb_offload *cbo = + container_of(cb, struct nfsd4_cb_offload, co_cb); + + trace_nfsd_cb_offload_done(&cbo->co_res.cb_stateid, task); + switch (task->tk_status) { + case -NFS4ERR_DELAY: + if (cbo->co_retries--) { + rpc_delay(task, HZ / 5); + return 0; + } + } + nfsd41_cb_destroy_referring_call_list(cb); return 1; } static const struct nfsd4_callback_ops nfsd4_cb_offload_ops = { .release = nfsd4_cb_offload_release, - .done = nfsd4_cb_offload_done + .done = nfsd4_cb_offload_done, + .opcode = OP_CB_OFFLOAD, }; static void nfsd4_init_copy_res(struct nfsd4_copy *copy, bool sync) { - copy->cp_res.wr_stable_how = NFS_UNSTABLE; - copy->cp_synchronous = sync; - 
gen_boot_verifier(©->cp_res.wr_verifier, copy->cp_clp->net); + copy->cp_res.wr_stable_how = + test_bit(NFSD4_COPY_F_COMMITTED, ©->cp_flags) ? + NFS_FILE_SYNC : NFS_UNSTABLE; + nfsd4_copy_set_sync(copy, sync); } -static ssize_t _nfsd_copy_file_range(struct nfsd4_copy *copy) +static ssize_t _nfsd_copy_file_range(struct nfsd4_copy *copy, + struct file *dst, + struct file *src) { + errseq_t since; ssize_t bytes_copied = 0; - size_t bytes_total = copy->cp_count; + u64 bytes_total = copy->cp_count; u64 src_pos = copy->cp_src_pos; u64 dst_pos = copy->cp_dst_pos; + int status; + loff_t end; + /* See RFC 7862 p.67: */ + if (bytes_total == 0) + bytes_total = ULLONG_MAX; do { + /* Only async copies can be stopped here */ if (kthread_should_stop()) break; - bytes_copied = nfsd_copy_file_range(copy->file_src, src_pos, - copy->file_dst, dst_pos, bytes_total); + bytes_copied = nfsd_copy_file_range(src, src_pos, dst, dst_pos, + bytes_total); if (bytes_copied <= 0) break; bytes_total -= bytes_copied; copy->cp_res.wr_bytes_written += bytes_copied; src_pos += bytes_copied; dst_pos += bytes_copied; - } while (bytes_total > 0 && !copy->cp_synchronous); + } while (bytes_total > 0 && nfsd4_copy_is_async(copy)); + /* for a non-zero asynchronous copy do a commit of data */ + if (nfsd4_copy_is_async(copy) && copy->cp_res.wr_bytes_written > 0) { + since = READ_ONCE(dst->f_wb_err); + end = copy->cp_dst_pos + copy->cp_res.wr_bytes_written - 1; + status = vfs_fsync_range(dst, copy->cp_dst_pos, end, 0); + if (!status) + status = filemap_check_wb_err(dst->f_mapping, since); + if (!status) + set_bit(NFSD4_COPY_F_COMMITTED, ©->cp_flags); + } return bytes_copied; } -static __be32 nfsd4_do_copy(struct nfsd4_copy *copy, bool sync) +static __be32 nfsd4_do_copy(struct nfsd4_copy *copy, + struct file *src, struct file *dst, + bool sync) { __be32 status; ssize_t bytes; - bytes = _nfsd_copy_file_range(copy); + bytes = _nfsd_copy_file_range(copy, dst, src); + /* for async copy, we ignore the error, client can always retry * to get the error */ @@ -1202,9 +1842,6 @@ static __be32 nfsd4_do_copy(struct nfsd4_copy *copy, bool sync) nfsd4_init_copy_res(copy, sync); status = nfs_ok; } - - fput(copy->file_src); - fput(copy->file_dst); return status; } @@ -1213,44 +1850,107 @@ static void dup_copy_fields(struct nfsd4_copy *src, struct nfsd4_copy *dst) dst->cp_src_pos = src->cp_src_pos; dst->cp_dst_pos = src->cp_dst_pos; dst->cp_count = src->cp_count; - dst->cp_synchronous = src->cp_synchronous; + dst->cp_flags = src->cp_flags; memcpy(&dst->cp_res, &src->cp_res, sizeof(src->cp_res)); memcpy(&dst->fh, &src->fh, sizeof(src->fh)); dst->cp_clp = src->cp_clp; - dst->file_dst = get_file(src->file_dst); - dst->file_src = get_file(src->file_src); + dst->nf_dst = nfsd_file_get(src->nf_dst); + /* for inter, nf_src doesn't exist yet */ + if (!nfsd4_ssc_is_inter(src)) + dst->nf_src = nfsd_file_get(src->nf_src); + memcpy(&dst->cp_stateid, &src->cp_stateid, sizeof(src->cp_stateid)); + memcpy(dst->cp_src, src->cp_src, sizeof(struct nl4_server)); + memcpy(&dst->stateid, &src->stateid, sizeof(src->stateid)); + memcpy(&dst->c_fh, &src->c_fh, sizeof(src->c_fh)); + dst->ss_nsui = src->ss_nsui; +} + +static void release_copy_files(struct nfsd4_copy *copy) +{ + if (copy->nf_src) + nfsd_file_put(copy->nf_src); + if (copy->nf_dst) + nfsd_file_put(copy->nf_dst); } static void cleanup_async_copy(struct nfsd4_copy *copy) { - nfs4_free_cp_state(copy); - fput(copy->file_dst); - fput(copy->file_src); - spin_lock(©->cp_clp->async_lock); - list_del(©->copies); - 
spin_unlock(©->cp_clp->async_lock); + nfs4_free_copy_state(copy); + release_copy_files(copy); + if (copy->cp_clp) { + spin_lock(©->cp_clp->async_lock); + if (!list_empty(©->copies)) + list_del_init(©->copies); + spin_unlock(©->cp_clp->async_lock); + } nfs4_put_copy(copy); } +static void nfsd4_send_cb_offload(struct nfsd4_copy *copy) +{ + struct nfsd4_cb_offload *cbo = ©->cp_cb_offload; + + memcpy(&cbo->co_res, ©->cp_res, sizeof(copy->cp_res)); + memcpy(&cbo->co_fh, ©->fh, sizeof(copy->fh)); + cbo->co_nfserr = copy->nfserr; + cbo->co_retries = 5; + + nfsd4_init_cb(&cbo->co_cb, copy->cp_clp, &nfsd4_cb_offload_ops, + NFSPROC4_CLNT_CB_OFFLOAD); + nfsd41_cb_referring_call(&cbo->co_cb, &cbo->co_referring_sessionid, + cbo->co_referring_slotid, + cbo->co_referring_seqno); + trace_nfsd_cb_offload(copy->cp_clp, &cbo->co_res.cb_stateid, + &cbo->co_fh, copy->cp_count, copy->nfserr); + nfsd4_try_run_cb(&cbo->co_cb); +} + +/** + * nfsd4_do_async_copy - kthread function for background server-side COPY + * @data: arguments for COPY operation + * + * Return values: + * %0: Copy operation is done. + */ static int nfsd4_do_async_copy(void *data) { struct nfsd4_copy *copy = (struct nfsd4_copy *)data; - struct nfsd4_copy *cb_copy; - copy->nfserr = nfsd4_do_copy(copy, 0); - cb_copy = kzalloc(sizeof(struct nfsd4_copy), GFP_KERNEL); - if (!cb_copy) - goto out; - memcpy(&cb_copy->cp_res, ©->cp_res, sizeof(copy->cp_res)); - cb_copy->cp_clp = copy->cp_clp; - cb_copy->nfserr = copy->nfserr; - memcpy(&cb_copy->fh, ©->fh, sizeof(copy->fh)); - nfsd4_init_cb(&cb_copy->cp_cb, cb_copy->cp_clp, - &nfsd4_cb_offload_ops, NFSPROC4_CLNT_CB_OFFLOAD); - nfsd4_run_cb(&cb_copy->cp_cb); -out: - cleanup_async_copy(copy); + trace_nfsd_copy_async(copy); + if (nfsd4_ssc_is_inter(copy)) { + struct file *filp; + + filp = nfs42_ssc_open(copy->ss_nsui->nsui_vfsmount, + ©->c_fh, ©->stateid); + if (IS_ERR(filp)) { + switch (PTR_ERR(filp)) { + case -EBADF: + copy->nfserr = nfserr_wrong_type; + break; + default: + copy->nfserr = nfserr_offload_denied; + } + /* ss_mnt will be unmounted by the laundromat */ + goto do_callback; + } + copy->nfserr = nfsd4_do_copy(copy, filp, copy->nf_dst->nf_file, + false); + nfsd4_cleanup_inter_ssc(copy->ss_nsui, filp, copy->nf_dst); + } else { + copy->nfserr = nfsd4_do_copy(copy, copy->nf_src->nf_file, + copy->nf_dst->nf_file, false); + } + +do_callback: + /* The kthread exits forthwith. Ensure that a subsequent + * OFFLOAD_CANCEL won't try to kill it again. 
*/ + set_bit(NFSD4_COPY_F_STOPPED, ©->cp_flags); + + set_bit(NFSD4_COPY_F_COMPLETED, ©->cp_flags); + trace_nfsd_copy_async_done(copy); + nfsd4_send_cb_offload(copy); + atomic_dec(©->cp_nn->pending_async_copies); return 0; } @@ -1258,86 +1958,190 @@ static __be32 nfsd4_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) { + struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); + struct nfsd4_copy *async_copy = NULL; struct nfsd4_copy *copy = &u->copy; + struct nfsd42_write_res *result; __be32 status; - struct nfsd4_copy *async_copy = NULL; - status = nfsd4_verify_copy(rqstp, cstate, ©->cp_src_stateid, - ©->file_src, ©->cp_dst_stateid, - ©->file_dst); - if (status) - goto out; + result = ©->cp_res; + nfsd_copy_write_verifier((__be32 *)&result->wr_verifier.data, nn); copy->cp_clp = cstate->clp; + if (nfsd4_ssc_is_inter(copy)) { + trace_nfsd_copy_inter(copy); + if (!inter_copy_offload_enable || nfsd4_copy_is_sync(copy)) { + status = nfserr_notsupp; + goto out; + } + status = nfsd4_setup_inter_ssc(rqstp, cstate, copy); + if (status) { + trace_nfsd_copy_done(copy, status); + return nfserr_offload_denied; + } + } else { + trace_nfsd_copy_intra(copy); + status = nfsd4_setup_intra_ssc(rqstp, cstate, copy); + if (status) { + trace_nfsd_copy_done(copy, status); + return status; + } + } + memcpy(©->fh, &cstate->current_fh.fh_handle, sizeof(struct knfsd_fh)); - if (!copy->cp_synchronous) { - struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); - - status = nfserrno(-ENOMEM); + if (nfsd4_copy_is_async(copy)) { async_copy = kzalloc(sizeof(struct nfsd4_copy), GFP_KERNEL); if (!async_copy) - goto out; - if (!nfs4_init_cp_state(nn, copy)) { - kfree(async_copy); - goto out; - } + goto out_err; + async_copy->cp_nn = nn; + INIT_LIST_HEAD(&async_copy->copies); refcount_set(&async_copy->refcount, 1); - memcpy(©->cp_res.cb_stateid, ©->cp_stateid, - sizeof(copy->cp_stateid)); + async_copy->cp_ttl = NFSD_COPY_INITIAL_TTL; + /* Arbitrary cap on number of pending async copy operations */ + if (atomic_inc_return(&nn->pending_async_copies) > + (int)rqstp->rq_pool->sp_nrthreads) + goto out_dec_async_copy_err; + async_copy->cp_src = kmalloc(sizeof(*async_copy->cp_src), GFP_KERNEL); + if (!async_copy->cp_src) + goto out_dec_async_copy_err; + if (!nfs4_init_copy_state(nn, copy)) + goto out_dec_async_copy_err; + memcpy(&result->cb_stateid, ©->cp_stateid.cs_stid, + sizeof(result->cb_stateid)); dup_copy_fields(copy, async_copy); + memcpy(async_copy->cp_cb_offload.co_referring_sessionid.data, + cstate->session->se_sessionid.data, + NFS4_MAX_SESSIONID_LEN); + async_copy->cp_cb_offload.co_referring_slotid = cstate->slot->sl_index; + async_copy->cp_cb_offload.co_referring_seqno = cstate->slot->sl_seqid; async_copy->copy_task = kthread_create(nfsd4_do_async_copy, async_copy, "%s", "copy thread"); if (IS_ERR(async_copy->copy_task)) - goto out_err; + goto out_dec_async_copy_err; spin_lock(&async_copy->cp_clp->async_lock); list_add(&async_copy->copies, &async_copy->cp_clp->async_copies); spin_unlock(&async_copy->cp_clp->async_lock); wake_up_process(async_copy->copy_task); status = nfs_ok; - } else - status = nfsd4_do_copy(copy, 1); + } else { + status = nfsd4_do_copy(copy, copy->nf_src->nf_file, + copy->nf_dst->nf_file, true); + } out: + trace_nfsd_copy_done(copy, status); + release_copy_files(copy); return status; +out_dec_async_copy_err: + if (async_copy) + atomic_dec(&nn->pending_async_copies); out_err: - cleanup_async_copy(async_copy); + if (nfsd4_ssc_is_inter(copy)) { + /* 
+ * Source's vfsmount of inter-copy will be unmounted + * by the laundromat. Use copy instead of async_copy + * since async_copy->ss_nsui might not be set yet. + */ + refcount_dec(©->ss_nsui->nsui_refcnt); + } + if (async_copy) + cleanup_async_copy(async_copy); + status = nfserr_jukebox; goto out; } -struct nfsd4_copy * -find_async_copy(struct nfs4_client *clp, stateid_t *stateid) +static struct nfsd4_copy * +find_async_copy_locked(struct nfs4_client *clp, stateid_t *stateid) { struct nfsd4_copy *copy; - spin_lock(&clp->async_lock); + lockdep_assert_held(&clp->async_lock); + list_for_each_entry(copy, &clp->async_copies, copies) { - if (memcmp(©->cp_stateid, stateid, NFS4_STATEID_SIZE)) + if (memcmp(©->cp_stateid.cs_stid, stateid, NFS4_STATEID_SIZE)) continue; - refcount_inc(©->refcount); - spin_unlock(&clp->async_lock); return copy; } - spin_unlock(&clp->async_lock); return NULL; } +static struct nfsd4_copy * +find_async_copy(struct nfs4_client *clp, stateid_t *stateid) +{ + struct nfsd4_copy *copy; + + spin_lock(&clp->async_lock); + copy = find_async_copy_locked(clp, stateid); + if (copy) + refcount_inc(©->refcount); + spin_unlock(&clp->async_lock); + return copy; +} + static __be32 nfsd4_offload_cancel(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) { struct nfsd4_offload_status *os = &u->offload_status; - __be32 status = 0; struct nfsd4_copy *copy; struct nfs4_client *clp = cstate->clp; copy = find_async_copy(clp, &os->stateid); - if (copy) + if (!copy) { + struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); + + return manage_cpntf_state(nn, &os->stateid, clp, NULL); + } else nfsd4_stop_copy(copy); - else - status = nfserr_bad_stateid; + return nfs_ok; +} + +static __be32 +nfsd4_copy_notify(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + union nfsd4_op_u *u) +{ + struct nfsd4_copy_notify *cn = &u->copy_notify; + __be32 status; + struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); + struct nfs4_stid *stid = NULL; + struct nfs4_cpntf_state *cps; + struct nfs4_client *clp = cstate->clp; + + status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh, + &cn->cpn_src_stateid, RD_STATE, NULL, + &stid); + if (status) + return status; + if (!stid) + return nfserr_bad_stateid; + + cn->cpn_lease_time.tv_sec = nn->nfsd4_lease; + cn->cpn_lease_time.tv_nsec = 0; + + status = nfserrno(-ENOMEM); + cps = nfs4_alloc_init_cpntf_state(nn, stid); + if (!cps) + goto out; + memcpy(&cn->cpn_cnr_stateid, &cps->cp_stateid.cs_stid, sizeof(stateid_t)); + memcpy(&cps->cp_p_stateid, &stid->sc_stateid, sizeof(stateid_t)); + memcpy(&cps->cp_p_clid, &clp->cl_clientid, sizeof(clientid_t)); + + /* For now, only return one server address in cpn_src, the + * address used by the client to connect to this server. 
+ */ + cn->cpn_src->nl4_type = NL4_NETADDR; + status = nfsd4_set_netaddr((struct sockaddr *)&rqstp->rq_daddr, + &cn->cpn_src->u.nl4_addr); + WARN_ON_ONCE(status); + if (status) { + nfs4_put_cpntf_state(nn, cps); + goto out; + } +out: + nfs4_put_stid(stid); return status; } @@ -1346,39 +2150,44 @@ nfsd4_fallocate(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_fallocate *fallocate, int flags) { __be32 status; - struct file *file; + struct nfsd_file *nf; status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh, &fallocate->falloc_stateid, - WR_STATE, &file, NULL); - if (status != nfs_ok) { - dprintk("NFSD: nfsd4_fallocate: couldn't process stateid!\n"); + WR_STATE, &nf, NULL); + if (status != nfs_ok) return status; - } - status = nfsd4_vfs_fallocate(rqstp, &cstate->current_fh, file, + status = nfsd4_vfs_fallocate(rqstp, &cstate->current_fh, nf->nf_file, fallocate->falloc_offset, fallocate->falloc_length, flags); - fput(file); + nfsd_file_put(nf); return status; } + static __be32 nfsd4_offload_status(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) { struct nfsd4_offload_status *os = &u->offload_status; - __be32 status = 0; + __be32 status = nfs_ok; struct nfsd4_copy *copy; struct nfs4_client *clp = cstate->clp; - copy = find_async_copy(clp, &os->stateid); + os->completed = false; + spin_lock(&clp->async_lock); + copy = find_async_copy_locked(clp, &os->stateid); if (copy) { os->count = copy->cp_res.wr_bytes_written; - nfs4_put_copy(copy); + if (test_bit(NFSD4_COPY_F_COMPLETED, ©->cp_flags)) { + os->completed = true; + os->status = copy->nfserr; + } } else status = nfserr_bad_stateid; + spin_unlock(&clp->async_lock); return status; } @@ -1405,15 +2214,13 @@ nfsd4_seek(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_seek *seek = &u->seek; int whence; __be32 status; - struct file *file; + struct nfsd_file *nf; status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh, &seek->seek_stateid, - RD_STATE, &file, NULL); - if (status) { - dprintk("NFSD: nfsd4_seek: couldn't process stateid!\n"); + RD_STATE, &nf, NULL); + if (status) return status; - } switch (seek->seek_whence) { case NFS4_CONTENT_DATA: @@ -1431,14 +2238,14 @@ nfsd4_seek(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, * Note: This call does change file->f_pos, but nothing in NFSD * should ever file->f_pos. */ - seek->seek_pos = vfs_llseek(file, seek->seek_offset, whence); + seek->seek_pos = vfs_llseek(nf->nf_file, seek->seek_offset, whence); if (seek->seek_pos < 0) status = nfserrno(seek->seek_pos); - else if (seek->seek_pos >= i_size_read(file_inode(file))) + else if (seek->seek_pos >= i_size_read(file_inode(nf->nf_file))) seek->seek_eof = true; out: - fput(file); + nfsd_file_put(nf); return status; } @@ -1525,6 +2332,49 @@ nfsd4_verify(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, return status == nfserr_same ? nfs_ok : status; } +static __be32 +nfsd4_get_dir_delegation(struct svc_rqst *rqstp, + struct nfsd4_compound_state *cstate, + union nfsd4_op_u *u) +{ + struct nfsd4_get_dir_delegation *gdd = &u->get_dir_delegation; + struct nfs4_delegation *dd; + struct nfsd_file *nf; + __be32 status; + + status = nfsd_file_acquire_dir(rqstp, &cstate->current_fh, &nf); + if (status != nfs_ok) + return status; + + /* + * RFC 8881, section 18.39.3 says: + * + * "The server may refuse to grant the delegation. In that case, the + * server will return NFS4ERR_DIRDELEG_UNAVAIL." 
+ * + * This is sub-optimal, since it means that the server would need to + * abort compound processing just because the delegation wasn't + * available. RFC8881bis should change this to allow the server to + * return NFS4_OK with a non-fatal status of GDD4_UNAVAIL in this + * situation. + */ + dd = nfsd_get_dir_deleg(cstate, gdd, nf); + nfsd_file_put(nf); + if (IS_ERR(dd)) { + int err = PTR_ERR(dd); + + if (err != -EAGAIN) + return nfserrno(err); + gdd->gddrnf_status = GDD4_UNAVAIL; + return nfs_ok; + } + + gdd->gddrnf_status = GDD4_OK; + memcpy(&gdd->gddr_stateid, &dd->dl_stid.sc_stateid, sizeof(gdd->gddr_stateid)); + nfs4_put_stid(&dd->dl_stid); + return nfs_ok; +} + #ifdef CONFIG_NFSD_PNFS static const struct nfsd4_layout_ops * nfsd4_layout_verify(struct svc_export *exp, unsigned int layout_type) @@ -1567,7 +2417,9 @@ nfsd4_getdeviceinfo(struct svc_rqst *rqstp, return nfserr_noent; } - exp = rqst_exp_find(rqstp, map->fsid_type, map->fsid); + exp = rqst_exp_find(&rqstp->rq_chandle, SVC_NET(rqstp), + rqstp->rq_client, rqstp->rq_gssclient, + map->fsid_type, map->fsid); if (IS_ERR(exp)) { dprintk("%s: could not find device id\n", __func__); return nfserr_noent; @@ -1581,7 +2433,7 @@ nfsd4_getdeviceinfo(struct svc_rqst *rqstp, nfserr = nfs_ok; if (gdp->gd_maxcount != 0) { nfserr = ops->proc_getdeviceinfo(exp->ex_path.mnt->mnt_sb, - rqstp, cstate->session->se_client, gdp); + rqstp, cstate->clp, gdp); } gdp->gd_notify_types &= ops->notify_types; @@ -1605,7 +2457,7 @@ nfsd4_layoutget(struct svc_rqst *rqstp, const struct nfsd4_layout_ops *ops; struct nfs4_layout_stateid *ls; __be32 nfserr; - int accmode = NFSD_MAY_READ_IF_EXEC; + int accmode = NFSD_MAY_READ_IF_EXEC | NFSD_MAY_OWNER_OVERRIDE; switch (lgp->lg_seg.iomode) { case IOMODE_READ: @@ -1662,7 +2514,7 @@ nfsd4_layoutget(struct svc_rqst *rqstp, if (atomic_read(&ls->ls_stid.sc_file->fi_lo_recalls)) goto out_put_stid; - nfserr = ops->proc_layoutget(d_inode(current_fh->fh_dentry), + nfserr = ops->proc_layoutget(rqstp, d_inode(current_fh->fh_dentry), current_fh, lgp); if (nfserr) goto out_put_stid; @@ -1686,16 +2538,17 @@ static __be32 nfsd4_layoutcommit(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) { + struct net *net = SVC_NET(rqstp); struct nfsd4_layoutcommit *lcp = &u->layoutcommit; const struct nfsd4_layout_seg *seg = &lcp->lc_seg; struct svc_fh *current_fh = &cstate->current_fh; const struct nfsd4_layout_ops *ops; - loff_t new_size = lcp->lc_last_wr + 1; struct inode *inode; struct nfs4_layout_stateid *ls; __be32 nfserr; - nfserr = fh_verify(rqstp, current_fh, 0, NFSD_MAY_WRITE); + nfserr = fh_verify(rqstp, current_fh, 0, + NFSD_MAY_WRITE | NFSD_MAY_OWNER_OVERRIDE); if (nfserr) goto out; @@ -1705,43 +2558,50 @@ nfsd4_layoutcommit(struct svc_rqst *rqstp, goto out; inode = d_inode(current_fh->fh_dentry); - nfserr = nfserr_inval; - if (new_size <= seg->offset) { - dprintk("pnfsd: last write before layout segment\n"); - goto out; + lcp->lc_size_chg = false; + if (lcp->lc_newoffset) { + loff_t new_size = lcp->lc_last_wr + 1; + + nfserr = nfserr_inval; + if (new_size <= seg->offset) + goto out; + if (new_size > seg->offset + seg->length) + goto out; + + if (new_size > i_size_read(inode)) { + lcp->lc_size_chg = true; + lcp->lc_newsize = new_size; + } } - if (new_size > seg->offset + seg->length) { - dprintk("pnfsd: last write beyond layout segment\n"); + + nfserr = nfserr_grace; + if (locks_in_grace(net) && !lcp->lc_reclaim) goto out; - } - if (!lcp->lc_newoffset && new_size > i_size_read(inode)) { - 
dprintk("pnfsd: layoutcommit beyond EOF\n"); + nfserr = nfserr_no_grace; + if (!locks_in_grace(net) && lcp->lc_reclaim) goto out; - } - nfserr = nfsd4_preprocess_layout_stateid(rqstp, cstate, &lcp->lc_sid, - false, lcp->lc_layout_type, - &ls); - if (nfserr) { - trace_nfsd_layout_commit_lookup_fail(&lcp->lc_sid); - /* fixup error code as per RFC5661 */ - if (nfserr == nfserr_bad_stateid) - nfserr = nfserr_badlayout; - goto out; + if (!lcp->lc_reclaim) { + nfserr = nfsd4_preprocess_layout_stateid(rqstp, cstate, + &lcp->lc_sid, false, lcp->lc_layout_type, &ls); + if (nfserr) { + trace_nfsd_layout_commit_lookup_fail(&lcp->lc_sid); + /* fixup error code as per RFC5661 */ + if (nfserr == nfserr_bad_stateid) + nfserr = nfserr_badlayout; + goto out; + } + + /* LAYOUTCOMMIT does not require any serialization */ + mutex_unlock(&ls->ls_mutex); } - /* LAYOUTCOMMIT does not require any serialization */ - mutex_unlock(&ls->ls_mutex); + nfserr = ops->proc_layoutcommit(inode, rqstp, lcp); - if (new_size > i_size_read(inode)) { - lcp->lc_size_chg = 1; - lcp->lc_newsize = new_size; - } else { - lcp->lc_size_chg = 0; + if (!lcp->lc_reclaim) { + nfsd4_file_mark_deleg_written(ls->ls_stid.sc_file); + nfs4_put_stid(&ls->ls_stid); } - - nfserr = ops->proc_layoutcommit(inode, lcp); - nfs4_put_stid(&ls->ls_stid); out: return nfserr; } @@ -1793,19 +2653,81 @@ out: } #endif /* CONFIG_NFSD_PNFS */ +static __be32 +nfsd4_getxattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + union nfsd4_op_u *u) +{ + struct nfsd4_getxattr *getxattr = &u->getxattr; + + return nfsd_getxattr(rqstp, &cstate->current_fh, + getxattr->getxa_name, &getxattr->getxa_buf, + &getxattr->getxa_len); +} + +static __be32 +nfsd4_setxattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + union nfsd4_op_u *u) +{ + struct nfsd4_setxattr *setxattr = &u->setxattr; + __be32 ret; + + if (opens_in_grace(SVC_NET(rqstp))) + return nfserr_grace; + + ret = nfsd_setxattr(rqstp, &cstate->current_fh, setxattr->setxa_name, + setxattr->setxa_buf, setxattr->setxa_len, + setxattr->setxa_flags); + + if (!ret) + set_change_info(&setxattr->setxa_cinfo, &cstate->current_fh); + + return ret; +} + +static __be32 +nfsd4_listxattrs(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + union nfsd4_op_u *u) +{ + /* + * Get the entire list, then copy out only the user attributes + * in the encode function. + */ + return nfsd_listxattr(rqstp, &cstate->current_fh, + &u->listxattrs.lsxa_buf, &u->listxattrs.lsxa_len); +} + +static __be32 +nfsd4_removexattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + union nfsd4_op_u *u) +{ + struct nfsd4_removexattr *removexattr = &u->removexattr; + __be32 ret; + + if (opens_in_grace(SVC_NET(rqstp))) + return nfserr_grace; + + ret = nfsd_removexattr(rqstp, &cstate->current_fh, + removexattr->rmxa_name); + + if (!ret) + set_change_info(&removexattr->rmxa_cinfo, &cstate->current_fh); + + return ret; +} + /* * NULL call. 
*/ static __be32 nfsd4_proc_null(struct svc_rqst *rqstp) { - return nfs_ok; + return rpc_success; } -static inline void nfsd4_increment_op_stats(u32 opnum) +static inline void nfsd4_increment_op_stats(struct nfsd_net *nn, u32 opnum) { if (opnum >= FIRST_NFS4_OP && opnum <= LAST_NFS4_OP) - nfsdstats.nfs4_opcount[opnum]++; + percpu_counter_inc(&nn->counter[NFSD_STATS_NFS4_OP(opnum)]); } static const struct nfsd4_operation nfsd4_ops[]; @@ -1895,24 +2817,44 @@ static bool need_wrongsec_check(struct svc_rqst *rqstp) return !(nextd->op_flags & OP_HANDLES_WRONGSEC); } -static void svcxdr_init_encode(struct svc_rqst *rqstp, - struct nfsd4_compoundres *resp) +#ifdef CONFIG_NFSD_V4_2_INTER_SSC +static void +check_if_stalefh_allowed(struct nfsd4_compoundargs *args) { - struct xdr_stream *xdr = &resp->xdr; - struct xdr_buf *buf = &rqstp->rq_res; - struct kvec *head = buf->head; + struct nfsd4_op *op, *current_op = NULL, *saved_op = NULL; + struct nfsd4_copy *copy; + struct nfsd4_putfh *putfh; + int i; - xdr->buf = buf; - xdr->iov = head; - xdr->p = head->iov_base + head->iov_len; - xdr->end = head->iov_base + PAGE_SIZE - rqstp->rq_auth_slack; - /* Tail and page_len should be zero at this point: */ - buf->len = buf->head[0].iov_len; - xdr->scratch.iov_len = 0; - xdr->page_ptr = buf->pages - 1; - buf->buflen = PAGE_SIZE * (1 + rqstp->rq_page_end - buf->pages) - - rqstp->rq_auth_slack; + /* traverse all operation and if it's a COPY compound, mark the + * source filehandle to skip verification + */ + for (i = 0; i < args->opcnt; i++) { + op = &args->ops[i]; + if (op->opnum == OP_PUTFH) + current_op = op; + else if (op->opnum == OP_SAVEFH) + saved_op = current_op; + else if (op->opnum == OP_RESTOREFH) + current_op = saved_op; + else if (op->opnum == OP_COPY) { + copy = (struct nfsd4_copy *)&op->u; + if (!saved_op) { + op->status = nfserr_nofilehandle; + return; + } + putfh = (struct nfsd4_putfh *)&saved_op->u; + if (nfsd4_ssc_is_inter(copy)) + putfh->no_verify = true; + } + } +} +#else +static void +check_if_stalefh_allowed(struct nfsd4_compoundargs *args) +{ } +#endif /* * COMPOUND call. @@ -1926,12 +2868,17 @@ nfsd4_proc_compound(struct svc_rqst *rqstp) struct nfsd4_compound_state *cstate = &resp->cstate; struct svc_fh *current_fh = &cstate->current_fh; struct svc_fh *save_fh = &cstate->save_fh; + struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); __be32 status; - svcxdr_init_encode(rqstp, resp); - resp->tagp = resp->xdr.p; + resp->xdr = &rqstp->rq_res_stream; + resp->statusp = resp->xdr->p; + + /* reserve space for: NFS status code */ + xdr_reserve_space(resp->xdr, XDR_UNIT); + /* reserve space for: taglen, tag, and opcnt */ - xdr_reserve_space(&resp->xdr, 8 + args->taglen); + xdr_reserve_space(resp->xdr, XDR_UNIT * 2 + args->taglen); resp->taglen = args->taglen; resp->tag = args->tag; resp->rqstp = rqstp; @@ -1948,10 +2895,7 @@ nfsd4_proc_compound(struct svc_rqst *rqstp) * According to RFC3010, this takes precedence over all other errors. 
*/ status = nfserr_minor_vers_mismatch; - if (nfsd_minorversion(args->minorversion, NFSD_TEST) <= 0) - goto out; - status = nfserr_resource; - if (args->opcnt > NFSD_MAX_OPS_PER_COMPOUND) + if (nfsd_minorversion(nn, args->minorversion, NFSD_TEST) <= 0) goto out; status = nfs41_check_op_ordering(args); @@ -1961,11 +2905,24 @@ nfsd4_proc_compound(struct svc_rqst *rqstp) resp->opcnt = 1; goto encode_op; } + check_if_stalefh_allowed(args); + + rqstp->rq_lease_breaker = (void **)&cstate->clp; - trace_nfsd_compound(rqstp, args->opcnt); + trace_nfsd_compound(rqstp, args->tag, args->taglen, args->client_opcnt); while (!status && resp->opcnt < args->opcnt) { op = &args->ops[resp->opcnt++]; + if (unlikely(resp->opcnt == NFSD_MAX_OPS_PER_COMPOUND)) { + /* If there are still more operations to process, + * stop here and report NFS4ERR_RESOURCE. */ + if (cstate->minorversion == 0 && + args->client_opcnt > resp->opcnt) { + op->status = nfserr_resource; + goto encode_op; + } + } + /* * The XDR decode routines may have pre-set op->status; * for example, if there is a miscellaneous XDR error @@ -1976,25 +2933,26 @@ nfsd4_proc_compound(struct svc_rqst *rqstp) op->status = nfsd4_open_omfg(rqstp, cstate, op); goto encode_op; } - - if (!current_fh->fh_dentry) { + if (!current_fh->fh_dentry && + !HAS_FH_FLAG(current_fh, NFSD4_FH_FOREIGN)) { if (!(op->opdesc->op_flags & ALLOWED_WITHOUT_FH)) { op->status = nfserr_nofilehandle; goto encode_op; } - } else if (current_fh->fh_export->ex_fslocs.migrated && + } else if (current_fh->fh_export && + current_fh->fh_export->ex_fslocs.migrated && !(op->opdesc->op_flags & ALLOWED_ON_ABSENT_FS)) { op->status = nfserr_moved; goto encode_op; } - fh_clear_wcc(current_fh); + fh_clear_pre_post_attrs(current_fh); /* If op is non-idempotent */ if (op->opdesc->op_flags & OP_MODIFIES_SOMETHING) { /* * Don't execute this op if we couldn't encode a - * succesful reply: + * successful reply: */ u32 plen = op->opdesc->op_rsize_bop(rqstp, op); /* @@ -2012,6 +2970,7 @@ nfsd4_proc_compound(struct svc_rqst *rqstp) if (op->opdesc->op_get_currentstateid) op->opdesc->op_get_currentstateid(cstate, &op->u); op->status = op->opdesc->op_func(rqstp, cstate, &op->u); + trace_nfsd_compound_op_err(rqstp, op->opnum, op->status); /* Only from SEQUENCE */ if (cstate->status == nfserr_replay_cache) { @@ -2026,35 +2985,35 @@ nfsd4_proc_compound(struct svc_rqst *rqstp) if (op->opdesc->op_flags & OP_CLEAR_STATEID) clear_current_stateid(cstate); - if (need_wrongsec_check(rqstp)) - op->status = check_nfsd_access(current_fh->fh_export, rqstp); + if (current_fh->fh_export && + need_wrongsec_check(rqstp)) + op->status = check_nfsd_access(current_fh->fh_export, rqstp, false); } encode_op: if (op->status == nfserr_replay_me) { op->replay = &cstate->replay_owner->so_replay; - nfsd4_encode_replay(&resp->xdr, op); + nfsd4_encode_replay(resp->xdr, op); status = op->status = op->replay->rp_status; } else { nfsd4_encode_operation(resp, op); status = op->status; } - trace_nfsd_compound_status(args->opcnt, resp->opcnt, status, - nfsd4_op_name(op->opnum)); + trace_nfsd_compound_status(args->client_opcnt, resp->opcnt, + status, nfsd4_op_name(op->opnum)); nfsd4_cstate_clear_replay(cstate); - nfsd4_increment_op_stats(op->opnum); + nfsd4_increment_op_stats(nn, op->opnum); } - cstate->status = status; fh_put(current_fh); fh_put(save_fh); BUG_ON(cstate->replay_owner); out: + cstate->status = status; /* Reset deferral mechanism for RPC deferrals */ set_bit(RQ_USEDEFERRAL, &rqstp->rq_flags); - dprintk("nfsv4 compound returned %d\n", 
ntohl(status)); - return status; + return rpc_success; } #define op_encode_hdr_size (2) @@ -2075,28 +3034,49 @@ out: #define op_encode_channel_attrs_maxsz (6 + 1 + 1) -static inline u32 nfsd4_only_status_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +/* + * The _rsize() helpers are invoked by the NFSv4 COMPOUND decoder, which + * is called before sunrpc sets rq_res.buflen. Thus we have to compute + * the maximum payload size here, based on transport limits and the size + * of the remaining space in the rq_pages array. + */ +static u32 nfsd4_max_payload(const struct svc_rqst *rqstp) +{ + u32 buflen; + + buflen = (rqstp->rq_page_end - rqstp->rq_next_page) * PAGE_SIZE; + buflen -= rqstp->rq_auth_slack; + buflen -= rqstp->rq_res.head[0].iov_len; + return min_t(u32, buflen, svc_max_payload(rqstp)); +} + +static u32 nfsd4_only_status_rsize(const struct svc_rqst *rqstp, + const struct nfsd4_op *op) { return (op_encode_hdr_size) * sizeof(__be32); } -static inline u32 nfsd4_status_stateid_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +static u32 nfsd4_status_stateid_rsize(const struct svc_rqst *rqstp, + const struct nfsd4_op *op) { return (op_encode_hdr_size + op_encode_stateid_maxsz)* sizeof(__be32); } -static inline u32 nfsd4_access_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +static u32 nfsd4_access_rsize(const struct svc_rqst *rqstp, + const struct nfsd4_op *op) { /* ac_supported, ac_resp_access */ return (op_encode_hdr_size + 2)* sizeof(__be32); } -static inline u32 nfsd4_commit_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +static u32 nfsd4_commit_rsize(const struct svc_rqst *rqstp, + const struct nfsd4_op *op) { return (op_encode_hdr_size + op_encode_verifier_maxsz) * sizeof(__be32); } -static inline u32 nfsd4_create_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +static u32 nfsd4_create_rsize(const struct svc_rqst *rqstp, + const struct nfsd4_op *op) { return (op_encode_hdr_size + op_encode_change_info_maxsz + nfs4_fattr_bitmap_maxsz) * sizeof(__be32); @@ -2107,17 +3087,17 @@ static inline u32 nfsd4_create_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op * the op prematurely if the estimate is too large. We may turn off splice * reads unnecessarily. 
*/ -static inline u32 nfsd4_getattr_rsize(struct svc_rqst *rqstp, - struct nfsd4_op *op) +static u32 nfsd4_getattr_rsize(const struct svc_rqst *rqstp, + const struct nfsd4_op *op) { - u32 *bmap = op->u.getattr.ga_bmval; + const u32 *bmap = op->u.getattr.ga_bmval; u32 bmap0 = bmap[0], bmap1 = bmap[1], bmap2 = bmap[2]; u32 ret = 0; if (bmap0 & FATTR4_WORD0_ACL) - return svc_max_payload(rqstp); + return nfsd4_max_payload(rqstp); if (bmap0 & FATTR4_WORD0_FS_LOCATIONS) - return svc_max_payload(rqstp); + return nfsd4_max_payload(rqstp); if (bmap1 & FATTR4_WORD1_OWNER) { ret += IDMAP_NAMESZ + 4; @@ -2145,24 +3125,28 @@ static inline u32 nfsd4_getattr_rsize(struct svc_rqst *rqstp, return ret; } -static inline u32 nfsd4_getfh_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +static u32 nfsd4_getfh_rsize(const struct svc_rqst *rqstp, + const struct nfsd4_op *op) { return (op_encode_hdr_size + 1) * sizeof(__be32) + NFS4_FHSIZE; } -static inline u32 nfsd4_link_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +static u32 nfsd4_link_rsize(const struct svc_rqst *rqstp, + const struct nfsd4_op *op) { return (op_encode_hdr_size + op_encode_change_info_maxsz) * sizeof(__be32); } -static inline u32 nfsd4_lock_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +static u32 nfsd4_lock_rsize(const struct svc_rqst *rqstp, + const struct nfsd4_op *op) { return (op_encode_hdr_size + op_encode_lock_denied_maxsz) * sizeof(__be32); } -static inline u32 nfsd4_open_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +static u32 nfsd4_open_rsize(const struct svc_rqst *rqstp, + const struct nfsd4_op *op) { return (op_encode_hdr_size + op_encode_stateid_maxsz + op_encode_change_info_maxsz + 1 @@ -2170,80 +3154,99 @@ static inline u32 nfsd4_open_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) + op_encode_delegation_maxsz) * sizeof(__be32); } -static inline u32 nfsd4_read_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +static u32 nfsd4_read_rsize(const struct svc_rqst *rqstp, + const struct nfsd4_op *op) { - u32 maxcount = 0, rlen = 0; - - maxcount = svc_max_payload(rqstp); - rlen = min(op->u.read.rd_length, maxcount); + u32 rlen = min(op->u.read.rd_length, nfsd4_max_payload(rqstp)); return (op_encode_hdr_size + 2 + XDR_QUADLEN(rlen)) * sizeof(__be32); } -static inline u32 nfsd4_readdir_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +static u32 nfsd4_read_plus_rsize(const struct svc_rqst *rqstp, + const struct nfsd4_op *op) { - u32 maxcount = 0, rlen = 0; + u32 rlen = min(op->u.read.rd_length, nfsd4_max_payload(rqstp)); + /* + * If we detect that the file changed during hole encoding, then we + * recover by encoding the remaining reply as data. This means we need + * to set aside enough room to encode two data segments. 
+ */ + u32 seg_len = 2 * (1 + 2 + 1); - maxcount = svc_max_payload(rqstp); - rlen = min(op->u.readdir.rd_maxcount, maxcount); + return (op_encode_hdr_size + 2 + seg_len + XDR_QUADLEN(rlen)) * sizeof(__be32); +} + +static u32 nfsd4_readdir_rsize(const struct svc_rqst *rqstp, + const struct nfsd4_op *op) +{ + u32 rlen = min(op->u.readdir.rd_maxcount, nfsd4_max_payload(rqstp)); return (op_encode_hdr_size + op_encode_verifier_maxsz + XDR_QUADLEN(rlen)) * sizeof(__be32); } -static inline u32 nfsd4_readlink_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +static u32 nfsd4_readlink_rsize(const struct svc_rqst *rqstp, + const struct nfsd4_op *op) { return (op_encode_hdr_size + 1) * sizeof(__be32) + PAGE_SIZE; } -static inline u32 nfsd4_remove_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +static u32 nfsd4_remove_rsize(const struct svc_rqst *rqstp, + const struct nfsd4_op *op) { return (op_encode_hdr_size + op_encode_change_info_maxsz) * sizeof(__be32); } -static inline u32 nfsd4_rename_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +static u32 nfsd4_rename_rsize(const struct svc_rqst *rqstp, + const struct nfsd4_op *op) { return (op_encode_hdr_size + op_encode_change_info_maxsz + op_encode_change_info_maxsz) * sizeof(__be32); } -static inline u32 nfsd4_sequence_rsize(struct svc_rqst *rqstp, - struct nfsd4_op *op) +static u32 nfsd4_sequence_rsize(const struct svc_rqst *rqstp, + const struct nfsd4_op *op) { return (op_encode_hdr_size + XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + 5) * sizeof(__be32); } -static inline u32 nfsd4_test_stateid_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +static u32 nfsd4_test_stateid_rsize(const struct svc_rqst *rqstp, + const struct nfsd4_op *op) { return (op_encode_hdr_size + 1 + op->u.test_stateid.ts_num_ids) * sizeof(__be32); } -static inline u32 nfsd4_setattr_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +static u32 nfsd4_setattr_rsize(const struct svc_rqst *rqstp, + const struct nfsd4_op *op) { return (op_encode_hdr_size + nfs4_fattr_bitmap_maxsz) * sizeof(__be32); } -static inline u32 nfsd4_secinfo_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +static u32 nfsd4_secinfo_rsize(const struct svc_rqst *rqstp, + const struct nfsd4_op *op) { return (op_encode_hdr_size + RPC_AUTH_MAXFLAVOR * (4 + XDR_QUADLEN(GSS_OID_MAX_LEN))) * sizeof(__be32); } -static inline u32 nfsd4_setclientid_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +static u32 nfsd4_setclientid_rsize(const struct svc_rqst *rqstp, + const struct nfsd4_op *op) { return (op_encode_hdr_size + 2 + XDR_QUADLEN(NFS4_VERIFIER_SIZE)) * sizeof(__be32); } -static inline u32 nfsd4_write_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +static u32 nfsd4_write_rsize(const struct svc_rqst *rqstp, + const struct nfsd4_op *op) { return (op_encode_hdr_size + 2 + op_encode_verifier_maxsz) * sizeof(__be32); } -static inline u32 nfsd4_exchange_id_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +static u32 nfsd4_exchange_id_rsize(const struct svc_rqst *rqstp, + const struct nfsd4_op *op) { return (op_encode_hdr_size + 2 + 1 + /* eir_clientid, eir_sequenceid */\ 1 + 1 + /* eir_flags, spr_how */\ @@ -2257,14 +3260,16 @@ static inline u32 nfsd4_exchange_id_rsize(struct svc_rqst *rqstp, struct nfsd4_o 0 /* ignored eir_server_impl_id contents */) * sizeof(__be32); } -static inline u32 nfsd4_bind_conn_to_session_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +static u32 nfsd4_bind_conn_to_session_rsize(const struct svc_rqst *rqstp, + const struct nfsd4_op *op) { return (op_encode_hdr_size + \ 
XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + /* bctsr_sessid */\ 2 /* bctsr_dir, use_conn_in_rdma_mode */) * sizeof(__be32); } -static inline u32 nfsd4_create_session_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +static u32 nfsd4_create_session_rsize(const struct svc_rqst *rqstp, + const struct nfsd4_op *op) { return (op_encode_hdr_size + \ XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + /* sessionid */\ @@ -2273,7 +3278,8 @@ static inline u32 nfsd4_create_session_rsize(struct svc_rqst *rqstp, struct nfsd op_encode_channel_attrs_maxsz) * sizeof(__be32); } -static inline u32 nfsd4_copy_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +static u32 nfsd4_copy_rsize(const struct svc_rqst *rqstp, + const struct nfsd4_op *op) { return (op_encode_hdr_size + 1 /* wr_callback */ + @@ -2285,21 +3291,46 @@ static inline u32 nfsd4_copy_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) 1 /* cr_synchronous */) * sizeof(__be32); } -static inline u32 nfsd4_offload_status_rsize(struct svc_rqst *rqstp, - struct nfsd4_op *op) +static u32 nfsd4_offload_status_rsize(const struct svc_rqst *rqstp, + const struct nfsd4_op *op) { return (op_encode_hdr_size + 2 /* osr_count */ + 1 /* osr_complete<1> optional 0 for now */) * sizeof(__be32); } -#ifdef CONFIG_NFSD_PNFS -static inline u32 nfsd4_getdeviceinfo_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +static u32 nfsd4_copy_notify_rsize(const struct svc_rqst *rqstp, + const struct nfsd4_op *op) { - u32 maxcount = 0, rlen = 0; + return (op_encode_hdr_size + + 3 /* cnr_lease_time */ + + 1 /* We support one cnr_source_server */ + + 1 /* cnr_stateid seq */ + + op_encode_stateid_maxsz /* cnr_stateid */ + + 1 /* num cnr_source_server*/ + + 1 /* nl4_type */ + + 1 /* nl4 size */ + + XDR_QUADLEN(NFS4_OPAQUE_LIMIT) /*nl4_loc + nl4_loc_sz */) + * sizeof(__be32); +} - maxcount = svc_max_payload(rqstp); - rlen = min(op->u.getdeviceinfo.gd_maxcount, maxcount); +static u32 nfsd4_get_dir_delegation_rsize(const struct svc_rqst *rqstp, + const struct nfsd4_op *op) +{ + return (op_encode_hdr_size + + 1 /* gddr_status */ + + op_encode_verifier_maxsz + + op_encode_stateid_maxsz + + 2 /* gddr_notification */ + + 2 /* gddr_child_attributes */ + + 2 /* gddr_dir_attributes */); +} + +#ifdef CONFIG_NFSD_PNFS +static u32 nfsd4_getdeviceinfo_rsize(const struct svc_rqst *rqstp, + const struct nfsd4_op *op) +{ + u32 rlen = min(op->u.getdeviceinfo.gd_maxcount, nfsd4_max_payload(rqstp)); return (op_encode_hdr_size + 1 /* gd_layout_type*/ + @@ -2312,7 +3343,8 @@ static inline u32 nfsd4_getdeviceinfo_rsize(struct svc_rqst *rqstp, struct nfsd4 * so we need to define an arbitrary upper bound here. 
*/ #define MAX_LAYOUT_SIZE 128 -static inline u32 nfsd4_layoutget_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +static u32 nfsd4_layoutget_rsize(const struct svc_rqst *rqstp, + const struct nfsd4_op *op) { return (op_encode_hdr_size + 1 /* logr_return_on_close */ + @@ -2321,14 +3353,16 @@ static inline u32 nfsd4_layoutget_rsize(struct svc_rqst *rqstp, struct nfsd4_op MAX_LAYOUT_SIZE) * sizeof(__be32); } -static inline u32 nfsd4_layoutcommit_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +static u32 nfsd4_layoutcommit_rsize(const struct svc_rqst *rqstp, + const struct nfsd4_op *op) { return (op_encode_hdr_size + 1 /* locr_newsize */ + 2 /* ns_size */) * sizeof(__be32); } -static inline u32 nfsd4_layoutreturn_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +static u32 nfsd4_layoutreturn_rsize(const struct svc_rqst *rqstp, + const struct nfsd4_op *op) { return (op_encode_hdr_size + 1 /* lrs_stateid */ + @@ -2337,11 +3371,42 @@ static inline u32 nfsd4_layoutreturn_rsize(struct svc_rqst *rqstp, struct nfsd4_ #endif /* CONFIG_NFSD_PNFS */ -static inline u32 nfsd4_seek_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +static u32 nfsd4_seek_rsize(const struct svc_rqst *rqstp, + const struct nfsd4_op *op) { return (op_encode_hdr_size + 3) * sizeof(__be32); } +static u32 nfsd4_getxattr_rsize(const struct svc_rqst *rqstp, + const struct nfsd4_op *op) +{ + u32 rlen = min_t(u32, XATTR_SIZE_MAX, nfsd4_max_payload(rqstp)); + + return (op_encode_hdr_size + 1 + XDR_QUADLEN(rlen)) * sizeof(__be32); +} + +static u32 nfsd4_setxattr_rsize(const struct svc_rqst *rqstp, + const struct nfsd4_op *op) +{ + return (op_encode_hdr_size + op_encode_change_info_maxsz) + * sizeof(__be32); +} +static u32 nfsd4_listxattrs_rsize(const struct svc_rqst *rqstp, + const struct nfsd4_op *op) +{ + u32 rlen = min(op->u.listxattrs.lsxa_maxcount, nfsd4_max_payload(rqstp)); + + return (op_encode_hdr_size + 4 + XDR_QUADLEN(rlen)) * sizeof(__be32); +} + +static u32 nfsd4_removexattr_rsize(const struct svc_rqst *rqstp, + const struct nfsd4_op *op) +{ + return (op_encode_hdr_size + op_encode_change_info_maxsz) + * sizeof(__be32); +} + + static const struct nfsd4_operation nfsd4_ops[] = { [OP_ACCESS] = { .op_func = nfsd4_access, @@ -2395,6 +3460,7 @@ static const struct nfsd4_operation nfsd4_ops[] = { }, [OP_LOCK] = { .op_func = nfsd4_lock, + .op_release = nfsd4_lock_release, .op_flags = OP_MODIFIES_SOMETHING | OP_NONTRIVIAL_ERROR_ENCODE, .op_name = "OP_LOCK", @@ -2403,6 +3469,7 @@ static const struct nfsd4_operation nfsd4_ops[] = { }, [OP_LOCKT] = { .op_func = nfsd4_lockt, + .op_release = nfsd4_lockt_release, .op_flags = OP_NONTRIVIAL_ERROR_ENCODE, .op_name = "OP_LOCKT", .op_rsize_bop = nfsd4_lock_rsize, @@ -2576,6 +3643,7 @@ static const struct nfsd4_operation nfsd4_ops[] = { /* NFSv4.1 operations */ [OP_EXCHANGE_ID] = { .op_func = nfsd4_exchange_id, + .op_release = nfsd4_exchange_id_release, .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP | OP_MODIFIES_SOMETHING, .op_name = "OP_EXCHANGE_ID", @@ -2647,6 +3715,12 @@ static const struct nfsd4_operation nfsd4_ops[] = { .op_get_currentstateid = nfsd4_get_freestateid, .op_rsize_bop = nfsd4_only_status_rsize, }, + [OP_GET_DIR_DELEGATION] = { + .op_func = nfsd4_get_dir_delegation, + .op_flags = OP_MODIFIES_SOMETHING, + .op_name = "OP_GET_DIR_DELEGATION", + .op_rsize_bop = nfsd4_get_dir_delegation_rsize, + }, #ifdef CONFIG_NFSD_PNFS [OP_GETDEVICEINFO] = { .op_func = nfsd4_getdeviceinfo, @@ -2701,6 +3775,13 @@ static const struct nfsd4_operation nfsd4_ops[] = { .op_name = 
"OP_COPY", .op_rsize_bop = nfsd4_copy_rsize, }, + [OP_READ_PLUS] = { + .op_func = nfsd4_read, + .op_release = nfsd4_read_release, + .op_name = "OP_READ_PLUS", + .op_rsize_bop = nfsd4_read_plus_rsize, + .op_get_currentstateid = nfsd4_get_readstateid, + }, [OP_SEEK] = { .op_func = nfsd4_seek, .op_name = "OP_SEEK", @@ -2717,6 +3798,34 @@ static const struct nfsd4_operation nfsd4_ops[] = { .op_name = "OP_OFFLOAD_CANCEL", .op_rsize_bop = nfsd4_only_status_rsize, }, + [OP_COPY_NOTIFY] = { + .op_func = nfsd4_copy_notify, + .op_flags = OP_MODIFIES_SOMETHING, + .op_name = "OP_COPY_NOTIFY", + .op_rsize_bop = nfsd4_copy_notify_rsize, + }, + [OP_GETXATTR] = { + .op_func = nfsd4_getxattr, + .op_name = "OP_GETXATTR", + .op_rsize_bop = nfsd4_getxattr_rsize, + }, + [OP_SETXATTR] = { + .op_func = nfsd4_setxattr, + .op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME, + .op_name = "OP_SETXATTR", + .op_rsize_bop = nfsd4_setxattr_rsize, + }, + [OP_LISTXATTRS] = { + .op_func = nfsd4_listxattrs, + .op_name = "OP_LISTXATTRS", + .op_rsize_bop = nfsd4_listxattrs_rsize, + }, + [OP_REMOVEXATTR] = { + .op_func = nfsd4_removexattr, + .op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME, + .op_name = "OP_REMOVEXATTR", + .op_rsize_bop = nfsd4_removexattr_rsize, + }, }; /** @@ -2733,15 +3842,16 @@ bool nfsd4_spo_must_allow(struct svc_rqst *rqstp) { struct nfsd4_compoundres *resp = rqstp->rq_resp; struct nfsd4_compoundargs *argp = rqstp->rq_argp; - struct nfsd4_op *this = &argp->ops[resp->opcnt - 1]; + struct nfsd4_op *this; struct nfsd4_compound_state *cstate = &resp->cstate; struct nfs4_op_map *allow = &cstate->clp->cl_spo_must_allow; u32 opiter; - if (!cstate->minorversion) + if (rqstp->rq_procinfo != &nfsd_version4.vs_proc[NFSPROC4_COMPOUND] || + cstate->minorversion == 0) return false; - if (cstate->spo_must_allowed == true) + if (cstate->spo_must_allowed) return true; opiter = resp->opcnt; @@ -2770,7 +3880,7 @@ int nfsd4_max_reply(struct svc_rqst *rqstp, struct nfsd4_op *op) void warn_on_nonidempotent_op(struct nfsd4_op *op) { if (OPDESC(op)->op_flags & OP_MODIFIES_SOMETHING) { - pr_err("unable to encode reply to nonidempotent op %d (%s)\n", + pr_err("unable to encode reply to nonidempotent op %u (%s)\n", op->opnum, nfsd4_op_name(op->opnum)); WARN_ON_ONCE(1); } @@ -2783,44 +3893,41 @@ static const char *nfsd4_op_name(unsigned opnum) return "unknown_operation"; } -#define nfsd4_voidres nfsd4_voidargs -struct nfsd4_voidargs { int dummy; }; - static const struct svc_procedure nfsd_procedures4[2] = { [NFSPROC4_NULL] = { .pc_func = nfsd4_proc_null, - .pc_encode = nfs4svc_encode_voidres, - .pc_argsize = sizeof(struct nfsd4_voidargs), - .pc_ressize = sizeof(struct nfsd4_voidres), + .pc_decode = nfssvc_decode_voidarg, + .pc_encode = nfssvc_encode_voidres, + .pc_argsize = sizeof(struct nfsd_voidargs), + .pc_argzero = sizeof(struct nfsd_voidargs), + .pc_ressize = sizeof(struct nfsd_voidres), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = 1, + .pc_name = "NULL", }, [NFSPROC4_COMPOUND] = { .pc_func = nfsd4_proc_compound, .pc_decode = nfs4svc_decode_compoundargs, .pc_encode = nfs4svc_encode_compoundres, .pc_argsize = sizeof(struct nfsd4_compoundargs), + .pc_argzero = offsetof(struct nfsd4_compoundargs, iops), .pc_ressize = sizeof(struct nfsd4_compoundres), .pc_release = nfsd4_release_compoundargs, .pc_cachetype = RC_NOCACHE, - .pc_xdrressize = NFSD_BUFSIZE/4, + .pc_xdrressize = 3+NFSSVC_MAXBLKSIZE/4, + .pc_name = "COMPOUND", }, }; -static unsigned int nfsd_count3[ARRAY_SIZE(nfsd_procedures4)]; +static DEFINE_PER_CPU_ALIGNED(unsigned long, + 
nfsd_count4[ARRAY_SIZE(nfsd_procedures4)]); const struct svc_version nfsd_version4 = { .vs_vers = 4, - .vs_nproc = 2, + .vs_nproc = ARRAY_SIZE(nfsd_procedures4), .vs_proc = nfsd_procedures4, - .vs_count = nfsd_count3, + .vs_count = nfsd_count4, .vs_dispatch = nfsd_dispatch, .vs_xdrsize = NFS4_SVC_XDRSIZE, .vs_rpcb_optnl = true, .vs_need_cong_ctrl = true, }; - -/* - * Local variables: - * c-basic-offset: 8 - * End: - */ diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index 5188f9f70c78..441dfbfe2d2b 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -32,7 +32,8 @@ * */ -#include <crypto/hash.h> +#include <crypto/md5.h> +#include <crypto/sha2.h> #include <linux/file.h> #include <linux/slab.h> #include <linux/namei.h> @@ -59,8 +60,14 @@ struct nfsd4_client_tracking_ops { void (*remove)(struct nfs4_client *); int (*check)(struct nfs4_client *); void (*grace_done)(struct nfsd_net *); + uint8_t version; + size_t msglen; }; +static const struct nfsd4_client_tracking_ops nfsd4_cld_tracking_ops; +static const struct nfsd4_client_tracking_ops nfsd4_cld_tracking_ops_v2; + +#ifdef CONFIG_NFSD_LEGACY_CLIENT_TRACKING /* Globals */ static char user_recovery_dirname[PATH_MAX] = "/var/lib/nfs/v4recovery"; @@ -76,97 +83,39 @@ nfs4_save_creds(const struct cred **original_creds) new->fsuid = GLOBAL_ROOT_UID; new->fsgid = GLOBAL_ROOT_GID; *original_creds = override_creds(new); - put_cred(new); return 0; } static void nfs4_reset_creds(const struct cred *original) { - revert_creds(original); + put_cred(revert_creds(original)); } static void -md5_to_hex(char *out, char *md5) -{ - int i; - - for (i=0; i<16; i++) { - unsigned char c = md5[i]; - - *out++ = '0' + ((c&0xf0)>>4) + (c>=0xa0)*('a'-'9'-1); - *out++ = '0' + (c&0x0f) + ((c&0x0f)>=0x0a)*('a'-'9'-1); - } - *out = '\0'; -} - -static int -nfs4_make_rec_clidname(char *dname, const struct xdr_netobj *clname) +nfs4_make_rec_clidname(char dname[HEXDIR_LEN], const struct xdr_netobj *clname) { - struct xdr_netobj cksum; - struct crypto_shash *tfm; - int status; + u8 digest[MD5_DIGEST_SIZE]; dprintk("NFSD: nfs4_make_rec_clidname for %.*s\n", clname->len, clname->data); - tfm = crypto_alloc_shash("md5", 0, 0); - if (IS_ERR(tfm)) { - status = PTR_ERR(tfm); - goto out_no_tfm; - } - cksum.len = crypto_shash_digestsize(tfm); - cksum.data = kmalloc(cksum.len, GFP_KERNEL); - if (cksum.data == NULL) { - status = -ENOMEM; - goto out; - } + md5(clname->data, clname->len, digest); - { - SHASH_DESC_ON_STACK(desc, tfm); - - desc->tfm = tfm; - desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP; - - status = crypto_shash_digest(desc, clname->data, clname->len, - cksum.data); - shash_desc_zero(desc); - } - - if (status) - goto out; - - md5_to_hex(dname, cksum.data); - - status = 0; -out: - kfree(cksum.data); - crypto_free_shash(tfm); -out_no_tfm: - return status; + static_assert(HEXDIR_LEN == 2 * MD5_DIGEST_SIZE + 1); + sprintf(dname, "%*phN", MD5_DIGEST_SIZE, digest); } -/* - * If we had an error generating the recdir name for the legacy tracker - * then warn the admin. If the error doesn't appear to be transient, - * then disable recovery tracking. 
- */ static void -legacy_recdir_name_error(struct nfs4_client *clp, int error) +__nfsd4_create_reclaim_record_grace(struct nfs4_client *clp, + char *dname, struct nfsd_net *nn) { - printk(KERN_ERR "NFSD: unable to generate recoverydir " - "name (%d).\n", error); + struct xdr_netobj name = { .len = strlen(dname), .data = dname }; + struct xdr_netobj princhash = { .len = 0, .data = NULL }; + struct nfs4_client_reclaim *crp; - /* - * if the algorithm just doesn't exist, then disable the recovery - * tracker altogether. The crypto libs will generally return this if - * FIPS is enabled as well. - */ - if (error == -ENOENT) { - printk(KERN_ERR "NFSD: disabling legacy clientid tracking. " - "Reboot recovery will not function correctly!\n"); - nfsd4_client_tracking_exit(clp->net); - } + crp = nfs4_client_to_reclaim(name, princhash, nn); + crp->cr_clp = clp; } static void @@ -175,7 +124,6 @@ nfsd4_create_clid_dir(struct nfs4_client *clp) const struct cred *original_cred; char dname[HEXDIR_LEN]; struct dentry *dir, *dentry; - struct nfs4_client_reclaim *crp; int status; struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); @@ -184,9 +132,7 @@ nfsd4_create_clid_dir(struct nfs4_client *clp) if (!nn->rec_file) return; - status = nfs4_make_rec_clidname(dname, &clp->cl_name); - if (status) - return legacy_recdir_name_error(clp, status); + nfs4_make_rec_clidname(dname, &clp->cl_name); status = nfs4_save_creds(&original_cred); if (status < 0) @@ -197,13 +143,11 @@ nfsd4_create_clid_dir(struct nfs4_client *clp) goto out_creds; dir = nn->rec_file->f_path.dentry; - /* lock the parent */ - inode_lock(d_inode(dir)); - dentry = lookup_one_len(dname, dir, HEXDIR_LEN-1); + dentry = start_creating(&nop_mnt_idmap, dir, &QSTR(dname)); if (IS_ERR(dentry)) { status = PTR_ERR(dentry); - goto out_unlock; + goto out; } if (d_really_is_positive(dentry)) /* @@ -214,18 +158,16 @@ nfsd4_create_clid_dir(struct nfs4_client *clp) * In the 4.0 case, we should never get here; but we may * as well be forgiving and just succeed silently. 
*/ - goto out_put; - status = vfs_mkdir(d_inode(dir), dentry, S_IRWXU); -out_put: - dput(dentry); -out_unlock: - inode_unlock(d_inode(dir)); + goto out_end; + dentry = vfs_mkdir(&nop_mnt_idmap, d_inode(dir), dentry, 0700, NULL); + if (IS_ERR(dentry)) + status = PTR_ERR(dentry); +out_end: + end_creating(dentry); +out: if (status == 0) { - if (nn->in_grace) { - crp = nfs4_client_to_reclaim(dname, nn); - if (crp) - crp->cr_clp = clp; - } + if (nn->in_grace) + __nfsd4_create_reclaim_record_grace(clp, dname, nn); vfs_fsync(nn->rec_file, 0); } else { printk(KERN_ERR "NFSD: failed to write recovery record" @@ -238,7 +180,7 @@ out_creds: nfs4_reset_creds(original_cred); } -typedef int (recdir_func)(struct dentry *, struct dentry *, struct nfsd_net *); +typedef int (recdir_func)(struct dentry *, char *, struct nfsd_net *); struct name_list { char name[HEXDIR_LEN]; @@ -250,7 +192,7 @@ struct nfs4_dir_ctx { struct list_head names; }; -static int +static bool nfsd4_build_namelist(struct dir_context *__ctx, const char *name, int namlen, loff_t offset, u64 ino, unsigned int d_type) { @@ -259,14 +201,14 @@ nfsd4_build_namelist(struct dir_context *__ctx, const char *name, int namlen, struct name_list *entry; if (namlen != HEXDIR_LEN - 1) - return 0; + return true; entry = kmalloc(sizeof(struct name_list), GFP_KERNEL); if (entry == NULL) - return -ENOMEM; + return false; memcpy(entry->name, name, HEXDIR_LEN - 1); entry->name[HEXDIR_LEN - 1] = '\0'; list_add(&entry->list, &ctx->names); - return 0; + return true; } static int @@ -292,23 +234,14 @@ nfsd4_list_rec_dir(recdir_func *f, struct nfsd_net *nn) } status = iterate_dir(nn->rec_file, &ctx.ctx); - inode_lock_nested(d_inode(dir), I_MUTEX_PARENT); list_for_each_entry_safe(entry, tmp, &ctx.names, list) { - if (!status) { - struct dentry *dentry; - dentry = lookup_one_len(entry->name, dir, HEXDIR_LEN-1); - if (IS_ERR(dentry)) { - status = PTR_ERR(dentry); - break; - } - status = f(dir, dentry, nn); - dput(dentry); - } + if (!status) + status = f(dir, entry->name, nn); + list_del(&entry->list); kfree(entry); } - inode_unlock(d_inode(dir)); nfs4_reset_creds(original_cred); list_for_each_entry_safe(entry, tmp, &ctx.names, list) { @@ -320,36 +253,47 @@ nfsd4_list_rec_dir(recdir_func *f, struct nfsd_net *nn) } static int -nfsd4_unlink_clid_dir(char *name, int namlen, struct nfsd_net *nn) +nfsd4_unlink_clid_dir(char *name, struct nfsd_net *nn) { struct dentry *dir, *dentry; int status; - dprintk("NFSD: nfsd4_unlink_clid_dir. name %.*s\n", namlen, name); + dprintk("NFSD: nfsd4_unlink_clid_dir. 
name %s\n", name); dir = nn->rec_file->f_path.dentry; - inode_lock_nested(d_inode(dir), I_MUTEX_PARENT); - dentry = lookup_one_len(name, dir, namlen); - if (IS_ERR(dentry)) { - status = PTR_ERR(dentry); - goto out_unlock; - } - status = -ENOENT; - if (d_really_is_negative(dentry)) - goto out; - status = vfs_rmdir(d_inode(dir), dentry); -out: - dput(dentry); -out_unlock: - inode_unlock(d_inode(dir)); + dentry = start_removing(&nop_mnt_idmap, dir, &QSTR(name)); + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + + status = vfs_rmdir(&nop_mnt_idmap, d_inode(dir), dentry, NULL); + end_removing(dentry); return status; } static void +__nfsd4_remove_reclaim_record_grace(const char *dname, int len, + struct nfsd_net *nn) +{ + struct xdr_netobj name; + struct nfs4_client_reclaim *crp; + + name.data = kmemdup(dname, len, GFP_KERNEL); + if (!name.data) { + dprintk("%s: failed to allocate memory for name.data!\n", + __func__); + return; + } + name.len = len; + crp = nfsd4_find_reclaim_client(name, nn); + kfree(name.data); + if (crp) + nfs4_remove_reclaim_record(crp, nn); +} + +static void nfsd4_remove_clid_dir(struct nfs4_client *clp) { const struct cred *original_cred; - struct nfs4_client_reclaim *crp; char dname[HEXDIR_LEN]; int status; struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); @@ -357,9 +301,7 @@ nfsd4_remove_clid_dir(struct nfs4_client *clp) if (!nn->rec_file || !test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags)) return; - status = nfs4_make_rec_clidname(dname, &clp->cl_name); - if (status) - return legacy_recdir_name_error(clp, status); + nfs4_make_rec_clidname(dname, &clp->cl_name); status = mnt_want_write_file(nn->rec_file); if (status) @@ -370,16 +312,13 @@ nfsd4_remove_clid_dir(struct nfs4_client *clp) if (status < 0) goto out_drop_write; - status = nfsd4_unlink_clid_dir(dname, HEXDIR_LEN-1, nn); + status = nfsd4_unlink_clid_dir(dname, nn); nfs4_reset_creds(original_cred); if (status == 0) { vfs_fsync(nn->rec_file, 0); - if (nn->in_grace) { - /* remove reclaim record */ - crp = nfsd4_find_reclaim_client(dname, nn); - if (crp) - nfs4_remove_reclaim_record(crp, nn); - } + if (nn->in_grace) + __nfsd4_remove_reclaim_record_grace(dname, + HEXDIR_LEN, nn); } out_drop_write: mnt_drop_write_file(nn->rec_file); @@ -390,17 +329,42 @@ out: } static int -purge_old(struct dentry *parent, struct dentry *child, struct nfsd_net *nn) +purge_old(struct dentry *parent, char *cname, struct nfsd_net *nn) { int status; + struct dentry *child; + struct xdr_netobj name; - if (nfs4_has_reclaimed_state(child->d_name.name, nn)) + if (strlen(cname) != HEXDIR_LEN - 1) { + printk("%s: illegal name %s in recovery directory\n", + __func__, cname); + /* Keep trying; maybe the others are OK: */ return 0; + } + name.data = kstrdup(cname, GFP_KERNEL); + if (!name.data) { + dprintk("%s: failed to allocate memory for name.data!\n", + __func__); + goto out; + } + name.len = HEXDIR_LEN; + if (nfs4_has_reclaimed_state(name, nn)) + goto out_free; + + inode_lock_nested(d_inode(parent), I_MUTEX_PARENT); + child = lookup_one(&nop_mnt_idmap, &QSTR(cname), parent); + if (!IS_ERR(child)) { + status = vfs_rmdir(&nop_mnt_idmap, d_inode(parent), child, NULL); + if (status) + printk("failed to remove client recovery directory %pd\n", + child); + dput(child); + } + inode_unlock(d_inode(parent)); - status = vfs_rmdir(d_inode(parent), child); - if (status) - printk("failed to remove client recovery directory %pd\n", - child); +out_free: + kfree(name.data); +out: /* Keep trying, success or failure: */ return 0; } @@ -428,15 +392,18 @@ 
out: } static int -load_recdir(struct dentry *parent, struct dentry *child, struct nfsd_net *nn) +load_recdir(struct dentry *parent, char *cname, struct nfsd_net *nn) { - if (child->d_name.len != HEXDIR_LEN - 1) { - printk("nfsd4: illegal name %pd in recovery directory\n", - child); + struct xdr_netobj name = { .len = HEXDIR_LEN, .data = cname }; + struct xdr_netobj princhash = { .len = 0, .data = NULL }; + + if (strlen(cname) != HEXDIR_LEN - 1) { + printk("%s: illegal name %s in recovery directory\n", + __func__, cname); /* Keep trying; maybe the others are OK: */ return 0; } - nfs4_client_to_reclaim(child->d_name.name, nn); + nfs4_client_to_reclaim(name, princhash, nn); return 0; } @@ -565,6 +532,7 @@ nfsd4_legacy_tracking_init(struct net *net) status = nfsd4_load_reboot_recovery_data(net); if (status) goto err; + pr_info("NFSD: Using legacy client tracking operations.\n"); return 0; err: @@ -596,7 +564,8 @@ nfs4_reset_recoverydir(char *recdir) return status; status = -ENOTDIR; if (d_is_dir(path.dentry)) { - strcpy(user_recovery_dirname, recdir); + strscpy(user_recovery_dirname, recdir, + sizeof(user_recovery_dirname)); status = 0; } path_put(&path); @@ -612,29 +581,34 @@ nfs4_recoverydir(void) static int nfsd4_check_legacy_client(struct nfs4_client *clp) { - int status; char dname[HEXDIR_LEN]; struct nfs4_client_reclaim *crp; struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); + struct xdr_netobj name; /* did we already find that this client is stable? */ if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags)) return 0; - status = nfs4_make_rec_clidname(dname, &clp->cl_name); - if (status) { - legacy_recdir_name_error(clp, status); - return status; - } + nfs4_make_rec_clidname(dname, &clp->cl_name); /* look for it in the reclaim hashtable otherwise */ - crp = nfsd4_find_reclaim_client(dname, nn); + name.data = kmemdup(dname, HEXDIR_LEN, GFP_KERNEL); + if (!name.data) { + dprintk("%s: failed to allocate memory for name.data!\n", + __func__); + goto out_enoent; + } + name.len = HEXDIR_LEN; + crp = nfsd4_find_reclaim_client(name, nn); + kfree(name.data); if (crp) { set_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags); crp->cr_clp = clp; return 0; } +out_enoent: return -ENOENT; } @@ -645,7 +619,10 @@ static const struct nfsd4_client_tracking_ops nfsd4_legacy_tracking_ops = { .remove = nfsd4_remove_clid_dir, .check = nfsd4_check_legacy_client, .grace_done = nfsd4_recdir_purge_old, + .version = 1, + .msglen = 0, }; +#endif /* CONFIG_NFSD_LEGACY_CLIENT_TRACKING */ /* Globals */ #define NFSD_PIPE_DIR "nfsd" @@ -657,25 +634,32 @@ struct cld_net { spinlock_t cn_lock; struct list_head cn_list; unsigned int cn_xid; +#ifdef CONFIG_NFSD_LEGACY_CLIENT_TRACKING + bool cn_has_legacy; +#endif }; struct cld_upcall { struct list_head cu_list; struct cld_net *cu_net; struct completion cu_done; - struct cld_msg cu_msg; + union { + struct cld_msg_hdr cu_hdr; + struct cld_msg cu_msg; + struct cld_msg_v2 cu_msg_v2; + } cu_u; }; static int -__cld_pipe_upcall(struct rpc_pipe *pipe, struct cld_msg *cmsg) +__cld_pipe_upcall(struct rpc_pipe *pipe, void *cmsg, struct nfsd_net *nn) { int ret; struct rpc_pipe_msg msg; - struct cld_upcall *cup = container_of(cmsg, struct cld_upcall, cu_msg); + struct cld_upcall *cup = container_of(cmsg, struct cld_upcall, cu_u); memset(&msg, 0, sizeof(msg)); msg.data = cmsg; - msg.len = sizeof(*cmsg); + msg.len = nn->client_tracking_ops->msglen; ret = rpc_queue_upcall(pipe, &msg); if (ret < 0) { @@ -691,7 +675,7 @@ out: } static int -cld_pipe_upcall(struct rpc_pipe *pipe, struct cld_msg *cmsg) 
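(Illustrative aside, not part of the patch: the cu_u union introduced above works because cld_msg and cld_msg_v2 share the same leading header fields, mirrored by struct cld_msg_hdr, so the downcall code can read cm_cmd/cm_xid/cm_status before it knows which version the daemon negotiated; the only per-version difference the transport needs is client_tracking_ops->msglen. A minimal userspace analogue of that layout, with all names invented for the example:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct demo_hdr {		/* stands in for struct cld_msg_hdr */
	uint8_t  cmd;
	int16_t  status;
	uint32_t xid;
};

struct demo_msg_v1 {		/* stands in for struct cld_msg */
	struct demo_hdr hdr;
	char name[64];
};

struct demo_msg_v2 {		/* stands in for struct cld_msg_v2 */
	struct demo_hdr hdr;
	char name[64];
	uint8_t princhash[32];
};

union demo_upcall {		/* stands in for the cu_u union in cld_upcall */
	struct demo_hdr hdr;
	struct demo_msg_v1 v1;
	struct demo_msg_v2 v2;
};

/* Only the negotiated length differs per version; the common header is
 * always readable through the union. */
static size_t demo_msglen(int version)
{
	return version >= 2 ? sizeof(struct demo_msg_v2)
			    : sizeof(struct demo_msg_v1);
}

int main(void)
{
	union demo_upcall up;

	memset(&up, 0, sizeof(up));
	up.v2.hdr.cmd = 1;
	up.v2.hdr.xid = 42;

	/* Peeking at the shared header works for either version. */
	printf("cmd=%u xid=%u\n", (unsigned)up.hdr.cmd, (unsigned)up.hdr.xid);
	printf("v1 msglen=%zu v2 msglen=%zu\n", demo_msglen(1), demo_msglen(2));
	return 0;
}

The same trick is what lets the downcall path peek at the xid and status through cld_msg_hdr first and only then decide how much message body to copy.)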
+cld_pipe_upcall(struct rpc_pipe *pipe, void *cmsg, struct nfsd_net *nn) { int ret; @@ -700,41 +684,127 @@ cld_pipe_upcall(struct rpc_pipe *pipe, struct cld_msg *cmsg) * upcalls queued. */ do { - ret = __cld_pipe_upcall(pipe, cmsg); + ret = __cld_pipe_upcall(pipe, cmsg, nn); } while (ret == -EAGAIN); return ret; } static ssize_t +__cld_pipe_inprogress_downcall(const struct cld_msg_v2 __user *cmsg, + struct nfsd_net *nn) +{ + uint8_t cmd, princhashlen; + struct xdr_netobj name, princhash = { .len = 0, .data = NULL }; + char *namecopy __free(kfree) = NULL; + char *princhashcopy __free(kfree) = NULL; + uint16_t namelen; + + if (get_user(cmd, &cmsg->cm_cmd)) { + dprintk("%s: error when copying cmd from userspace", __func__); + return -EFAULT; + } + if (cmd == Cld_GraceStart) { + if (nn->client_tracking_ops->version >= 2) { + const struct cld_clntinfo __user *ci; + + ci = &cmsg->cm_u.cm_clntinfo; + if (get_user(namelen, &ci->cc_name.cn_len)) + return -EFAULT; + if (namelen == 0 || namelen > NFS4_OPAQUE_LIMIT) { + dprintk("%s: invalid namelen (%u)", __func__, namelen); + return -EINVAL; + } + namecopy = memdup_user(&ci->cc_name.cn_id, namelen); + if (IS_ERR(namecopy)) + return PTR_ERR(namecopy); + name.data = namecopy; + name.len = namelen; + get_user(princhashlen, &ci->cc_princhash.cp_len); + if (princhashlen > 0) { + princhashcopy = memdup_user( + &ci->cc_princhash.cp_data, + princhashlen); + if (IS_ERR(princhashcopy)) + return PTR_ERR(princhashcopy); + princhash.data = princhashcopy; + princhash.len = princhashlen; + } else + princhash.len = 0; + } else { + const struct cld_name __user *cnm; + + cnm = &cmsg->cm_u.cm_name; + if (get_user(namelen, &cnm->cn_len)) + return -EFAULT; + if (namelen == 0 || namelen > NFS4_OPAQUE_LIMIT) { + dprintk("%s: invalid namelen (%u)", __func__, namelen); + return -EINVAL; + } + namecopy = memdup_user(&cnm->cn_id, namelen); + if (IS_ERR(namecopy)) + return PTR_ERR(namecopy); + name.data = namecopy; + name.len = namelen; + } +#ifdef CONFIG_NFSD_LEGACY_CLIENT_TRACKING + if (name.len > 5 && memcmp(name.data, "hash:", 5) == 0) { + struct cld_net *cn = nn->cld_net; + + name.len = name.len - 5; + name.data = name.data + 5; + cn->cn_has_legacy = true; + } +#endif + if (!nfs4_client_to_reclaim(name, princhash, nn)) + return -EFAULT; + return nn->client_tracking_ops->msglen; + } + return -EFAULT; +} + +static ssize_t cld_pipe_downcall(struct file *filp, const char __user *src, size_t mlen) { struct cld_upcall *tmp, *cup; - struct cld_msg __user *cmsg = (struct cld_msg __user *)src; + struct cld_msg_hdr __user *hdr = (struct cld_msg_hdr __user *)src; + struct cld_msg_v2 __user *cmsg = (struct cld_msg_v2 __user *)src; uint32_t xid; struct nfsd_net *nn = net_generic(file_inode(filp)->i_sb->s_fs_info, nfsd_net_id); struct cld_net *cn = nn->cld_net; + int16_t status; - if (mlen != sizeof(*cmsg)) { + if (mlen != nn->client_tracking_ops->msglen) { dprintk("%s: got %zu bytes, expected %zu\n", __func__, mlen, - sizeof(*cmsg)); + nn->client_tracking_ops->msglen); return -EINVAL; } /* copy just the xid so we can try to find that */ - if (copy_from_user(&xid, &cmsg->cm_xid, sizeof(xid)) != 0) { + if (copy_from_user(&xid, &hdr->cm_xid, sizeof(xid)) != 0) { dprintk("%s: error when copying xid from userspace", __func__); return -EFAULT; } + /* + * copy the status so we know whether to remove the upcall from the + * list (for -EINPROGRESS, we just want to make sure the xid is + * valid, not remove the upcall from the list) + */ + if (get_user(status, &hdr->cm_status)) { + dprintk("%s: 
error when copying status from userspace", __func__); + return -EFAULT; + } + /* walk the list and find corresponding xid */ cup = NULL; spin_lock(&cn->cn_lock); list_for_each_entry(tmp, &cn->cn_list, cu_list) { - if (get_unaligned(&tmp->cu_msg.cm_xid) == xid) { + if (get_unaligned(&tmp->cu_u.cu_hdr.cm_xid) == xid) { cup = tmp; - list_del_init(&cup->cu_list); + if (status != -EINPROGRESS) + list_del_init(&cup->cu_list); break; } } @@ -746,7 +816,10 @@ cld_pipe_downcall(struct file *filp, const char __user *src, size_t mlen) return -EINVAL; } - if (copy_from_user(&cup->cu_msg, src, mlen) != 0) + if (status == -EINPROGRESS) + return __cld_pipe_inprogress_downcall(cmsg, nn); + + if (copy_from_user(&cup->cu_u.cu_msg_v2, src, mlen) != 0) return -EFAULT; complete(&cup->cu_done); @@ -758,7 +831,7 @@ cld_pipe_destroy_msg(struct rpc_pipe_msg *msg) { struct cld_msg *cmsg = msg->data; struct cld_upcall *cup = container_of(cmsg, struct cld_upcall, - cu_msg); + cu_u.cu_msg); /* errno >= 0 means we got a downcall */ if (msg->errno >= 0) @@ -773,38 +846,32 @@ static const struct rpc_pipe_ops cld_upcall_ops = { .destroy_msg = cld_pipe_destroy_msg, }; -static struct dentry * +static int nfsd4_cld_register_sb(struct super_block *sb, struct rpc_pipe *pipe) { - struct dentry *dir, *dentry; + struct dentry *dir; + int err; dir = rpc_d_lookup_sb(sb, NFSD_PIPE_DIR); if (dir == NULL) - return ERR_PTR(-ENOENT); - dentry = rpc_mkpipe_dentry(dir, NFSD_CLD_PIPE, NULL, pipe); + return -ENOENT; + err = rpc_mkpipe_dentry(dir, NFSD_CLD_PIPE, NULL, pipe); dput(dir); - return dentry; + return err; } -static void -nfsd4_cld_unregister_sb(struct rpc_pipe *pipe) -{ - if (pipe->dentry) - rpc_unlink(pipe->dentry); -} - -static struct dentry * +static int nfsd4_cld_register_net(struct net *net, struct rpc_pipe *pipe) { struct super_block *sb; - struct dentry *dentry; + int err; sb = rpc_get_sb_net(net); if (!sb) - return NULL; - dentry = nfsd4_cld_register_sb(sb, pipe); + return 0; + err = nfsd4_cld_register_sb(sb, pipe); rpc_put_sb_net(net); - return dentry; + return err; } static void @@ -814,17 +881,16 @@ nfsd4_cld_unregister_net(struct net *net, struct rpc_pipe *pipe) sb = rpc_get_sb_net(net); if (sb) { - nfsd4_cld_unregister_sb(pipe); + rpc_unlink(pipe); rpc_put_sb_net(net); } } /* Initialize rpc_pipefs pipe for communication with client tracking daemon */ static int -nfsd4_init_cld_pipe(struct net *net) +__nfsd4_init_cld_pipe(struct net *net) { int ret; - struct dentry *dentry; struct nfsd_net *nn = net_generic(net, nfsd_net_id); struct cld_net *cn; @@ -845,13 +911,13 @@ nfsd4_init_cld_pipe(struct net *net) spin_lock_init(&cn->cn_lock); INIT_LIST_HEAD(&cn->cn_list); - dentry = nfsd4_cld_register_net(net, cn->cn_pipe); - if (IS_ERR(dentry)) { - ret = PTR_ERR(dentry); + ret = nfsd4_cld_register_net(net, cn->cn_pipe); + if (unlikely(ret)) goto err_destroy_data; - } - cn->cn_pipe->dentry = dentry; +#ifdef CONFIG_NFSD_LEGACY_CLIENT_TRACKING + cn->cn_has_legacy = false; +#endif nn->cld_net = cn; return 0; @@ -864,6 +930,17 @@ err: return ret; } +static int +nfsd4_init_cld_pipe(struct net *net) +{ + int status; + + status = __nfsd4_init_cld_pipe(net); + if (!status) + pr_info("NFSD: Using old nfsdcld client tracking operations.\n"); + return status; +} + static void nfsd4_remove_cld_pipe(struct net *net) { @@ -877,9 +954,10 @@ nfsd4_remove_cld_pipe(struct net *net) } static struct cld_upcall * -alloc_cld_upcall(struct cld_net *cn) +alloc_cld_upcall(struct nfsd_net *nn) { struct cld_upcall *new, *tmp; + struct cld_net *cn = 
nn->cld_net; new = kzalloc(sizeof(*new), GFP_KERNEL); if (!new) @@ -889,20 +967,20 @@ alloc_cld_upcall(struct cld_net *cn) restart_search: spin_lock(&cn->cn_lock); list_for_each_entry(tmp, &cn->cn_list, cu_list) { - if (tmp->cu_msg.cm_xid == cn->cn_xid) { + if (tmp->cu_u.cu_msg.cm_xid == cn->cn_xid) { cn->cn_xid++; spin_unlock(&cn->cn_lock); goto restart_search; } } init_completion(&new->cu_done); - new->cu_msg.cm_vers = CLD_UPCALL_VERSION; - put_unaligned(cn->cn_xid++, &new->cu_msg.cm_xid); + new->cu_u.cu_msg.cm_vers = nn->client_tracking_ops->version; + put_unaligned(cn->cn_xid++, &new->cu_u.cu_msg.cm_xid); new->cu_net = cn; list_add(&new->cu_list, &cn->cn_list); spin_unlock(&cn->cn_lock); - dprintk("%s: allocated xid %u\n", __func__, new->cu_msg.cm_xid); + dprintk("%s: allocated xid %u\n", __func__, new->cu_u.cu_msg.cm_xid); return new; } @@ -931,20 +1009,20 @@ nfsd4_cld_create(struct nfs4_client *clp) if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags)) return; - cup = alloc_cld_upcall(cn); + cup = alloc_cld_upcall(nn); if (!cup) { ret = -ENOMEM; goto out_err; } - cup->cu_msg.cm_cmd = Cld_Create; - cup->cu_msg.cm_u.cm_name.cn_len = clp->cl_name.len; - memcpy(cup->cu_msg.cm_u.cm_name.cn_id, clp->cl_name.data, + cup->cu_u.cu_msg.cm_cmd = Cld_Create; + cup->cu_u.cu_msg.cm_u.cm_name.cn_len = clp->cl_name.len; + memcpy(cup->cu_u.cu_msg.cm_u.cm_name.cn_id, clp->cl_name.data, clp->cl_name.len); - ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_msg); + ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_u.cu_msg, nn); if (!ret) { - ret = cup->cu_msg.cm_status; + ret = cup->cu_u.cu_msg.cm_status; set_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags); } @@ -957,6 +1035,56 @@ out_err: /* Ask daemon to create a new record */ static void +nfsd4_cld_create_v2(struct nfs4_client *clp) +{ + int ret; + struct cld_upcall *cup; + struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); + struct cld_net *cn = nn->cld_net; + struct cld_msg_v2 *cmsg; + char *principal = NULL; + + /* Don't upcall if it's already stored */ + if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags)) + return; + + cup = alloc_cld_upcall(nn); + if (!cup) { + ret = -ENOMEM; + goto out_err; + } + + cmsg = &cup->cu_u.cu_msg_v2; + cmsg->cm_cmd = Cld_Create; + cmsg->cm_u.cm_clntinfo.cc_name.cn_len = clp->cl_name.len; + memcpy(cmsg->cm_u.cm_clntinfo.cc_name.cn_id, clp->cl_name.data, + clp->cl_name.len); + if (clp->cl_cred.cr_raw_principal) + principal = clp->cl_cred.cr_raw_principal; + else if (clp->cl_cred.cr_principal) + principal = clp->cl_cred.cr_principal; + if (principal) { + sha256(principal, strlen(principal), + cmsg->cm_u.cm_clntinfo.cc_princhash.cp_data); + cmsg->cm_u.cm_clntinfo.cc_princhash.cp_len = SHA256_DIGEST_SIZE; + } else + cmsg->cm_u.cm_clntinfo.cc_princhash.cp_len = 0; + + ret = cld_pipe_upcall(cn->cn_pipe, cmsg, nn); + if (!ret) { + ret = cmsg->cm_status; + set_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags); + } + + free_cld_upcall(cup); +out_err: + if (ret) + pr_err("NFSD: Unable to create client record on stable storage: %d\n", + ret); +} + +/* Ask daemon to create a new record */ +static void nfsd4_cld_remove(struct nfs4_client *clp) { int ret; @@ -968,20 +1096,20 @@ nfsd4_cld_remove(struct nfs4_client *clp) if (!test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags)) return; - cup = alloc_cld_upcall(cn); + cup = alloc_cld_upcall(nn); if (!cup) { ret = -ENOMEM; goto out_err; } - cup->cu_msg.cm_cmd = Cld_Remove; - cup->cu_msg.cm_u.cm_name.cn_len = clp->cl_name.len; - memcpy(cup->cu_msg.cm_u.cm_name.cn_id, clp->cl_name.data, + cup->cu_u.cu_msg.cm_cmd = 
Cld_Remove; + cup->cu_u.cu_msg.cm_u.cm_name.cn_len = clp->cl_name.len; + memcpy(cup->cu_u.cu_msg.cm_u.cm_name.cn_id, clp->cl_name.data, clp->cl_name.len); - ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_msg); + ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_u.cu_msg, nn); if (!ret) { - ret = cup->cu_msg.cm_status; + ret = cup->cu_u.cu_msg.cm_status; clear_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags); } @@ -992,9 +1120,14 @@ out_err: "record from stable storage: %d\n", ret); } -/* Check for presence of a record, and update its timestamp */ +/* + * For older nfsdcld's that do not allow us to "slurp" the clients + * from the tracking database during startup. + * + * Check for presence of a record, and update its timestamp + */ static int -nfsd4_cld_check(struct nfs4_client *clp) +nfsd4_cld_check_v0(struct nfs4_client *clp) { int ret; struct cld_upcall *cup; @@ -1005,21 +1138,21 @@ nfsd4_cld_check(struct nfs4_client *clp) if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags)) return 0; - cup = alloc_cld_upcall(cn); + cup = alloc_cld_upcall(nn); if (!cup) { printk(KERN_ERR "NFSD: Unable to check client record on " "stable storage: %d\n", -ENOMEM); return -ENOMEM; } - cup->cu_msg.cm_cmd = Cld_Check; - cup->cu_msg.cm_u.cm_name.cn_len = clp->cl_name.len; - memcpy(cup->cu_msg.cm_u.cm_name.cn_id, clp->cl_name.data, + cup->cu_u.cu_msg.cm_cmd = Cld_Check; + cup->cu_u.cu_msg.cm_u.cm_name.cn_len = clp->cl_name.len; + memcpy(cup->cu_u.cu_msg.cm_u.cm_name.cn_id, clp->cl_name.data, clp->cl_name.len); - ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_msg); + ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_u.cu_msg, nn); if (!ret) { - ret = cup->cu_msg.cm_status; + ret = cup->cu_u.cu_msg.cm_status; set_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags); } @@ -1027,6 +1160,170 @@ nfsd4_cld_check(struct nfs4_client *clp) return ret; } +/* + * For newer nfsdcld's that allow us to "slurp" the clients + * from the tracking database during startup. + * + * Check for presence of a record in the reclaim_str_hashtbl + */ +static int +nfsd4_cld_check(struct nfs4_client *clp) +{ + struct nfs4_client_reclaim *crp; + struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); + + /* did we already find that this client is stable? */ + if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags)) + return 0; + + /* look for it in the reclaim hashtable otherwise */ + crp = nfsd4_find_reclaim_client(clp->cl_name, nn); + if (crp) + goto found; + +#ifdef CONFIG_NFSD_LEGACY_CLIENT_TRACKING + if (nn->cld_net->cn_has_legacy) { + char dname[HEXDIR_LEN]; + struct xdr_netobj name; + + nfs4_make_rec_clidname(dname, &clp->cl_name); + + name.data = kmemdup(dname, HEXDIR_LEN, GFP_KERNEL); + if (!name.data) { + dprintk("%s: failed to allocate memory for name.data!\n", + __func__); + return -ENOENT; + } + name.len = HEXDIR_LEN; + crp = nfsd4_find_reclaim_client(name, nn); + kfree(name.data); + if (crp) + goto found; + + } +#endif + return -ENOENT; +found: + crp->cr_clp = clp; + return 0; +} + +static int +nfsd4_cld_check_v2(struct nfs4_client *clp) +{ + struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); +#ifdef CONFIG_NFSD_LEGACY_CLIENT_TRACKING + struct cld_net *cn = nn->cld_net; +#endif + struct nfs4_client_reclaim *crp; + char *principal = NULL; + + /* did we already find that this client is stable? 
*/ + if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags)) + return 0; + + /* look for it in the reclaim hashtable otherwise */ + crp = nfsd4_find_reclaim_client(clp->cl_name, nn); + if (crp) + goto found; + +#ifdef CONFIG_NFSD_LEGACY_CLIENT_TRACKING + if (cn->cn_has_legacy) { + struct xdr_netobj name; + char dname[HEXDIR_LEN]; + + nfs4_make_rec_clidname(dname, &clp->cl_name); + + name.data = kmemdup(dname, HEXDIR_LEN, GFP_KERNEL); + if (!name.data) { + dprintk("%s: failed to allocate memory for name.data\n", + __func__); + return -ENOENT; + } + name.len = HEXDIR_LEN; + crp = nfsd4_find_reclaim_client(name, nn); + kfree(name.data); + if (crp) + goto found; + + } +#endif + return -ENOENT; +found: + if (crp->cr_princhash.len) { + u8 digest[SHA256_DIGEST_SIZE]; + + if (clp->cl_cred.cr_raw_principal) + principal = clp->cl_cred.cr_raw_principal; + else if (clp->cl_cred.cr_principal) + principal = clp->cl_cred.cr_principal; + if (principal == NULL) + return -ENOENT; + sha256(principal, strlen(principal), digest); + if (memcmp(crp->cr_princhash.data, digest, + crp->cr_princhash.len)) + return -ENOENT; + } + crp->cr_clp = clp; + return 0; +} + +static int +nfsd4_cld_grace_start(struct nfsd_net *nn) +{ + int ret; + struct cld_upcall *cup; + struct cld_net *cn = nn->cld_net; + + cup = alloc_cld_upcall(nn); + if (!cup) { + ret = -ENOMEM; + goto out_err; + } + + cup->cu_u.cu_msg.cm_cmd = Cld_GraceStart; + ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_u.cu_msg, nn); + if (!ret) + ret = cup->cu_u.cu_msg.cm_status; + + free_cld_upcall(cup); +out_err: + if (ret) + dprintk("%s: Unable to get clients from userspace: %d\n", + __func__, ret); + return ret; +} + +/* For older nfsdcld's that need cm_gracetime */ +static void +nfsd4_cld_grace_done_v0(struct nfsd_net *nn) +{ + int ret; + struct cld_upcall *cup; + struct cld_net *cn = nn->cld_net; + + cup = alloc_cld_upcall(nn); + if (!cup) { + ret = -ENOMEM; + goto out_err; + } + + cup->cu_u.cu_msg.cm_cmd = Cld_GraceDone; + cup->cu_u.cu_msg.cm_u.cm_gracetime = nn->boot_time; + ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_u.cu_msg, nn); + if (!ret) + ret = cup->cu_u.cu_msg.cm_status; + + free_cld_upcall(cup); +out_err: + if (ret) + printk(KERN_ERR "NFSD: Unable to end grace period: %d\n", ret); +} + +/* + * For newer nfsdcld's that do not need cm_gracetime. We also need to call + * nfs4_release_reclaim() to clear out the reclaim_str_hashtbl. 
+ */ static void nfsd4_cld_grace_done(struct nfsd_net *nn) { @@ -1034,33 +1331,210 @@ nfsd4_cld_grace_done(struct nfsd_net *nn) struct cld_upcall *cup; struct cld_net *cn = nn->cld_net; - cup = alloc_cld_upcall(cn); + cup = alloc_cld_upcall(nn); if (!cup) { ret = -ENOMEM; goto out_err; } - cup->cu_msg.cm_cmd = Cld_GraceDone; - cup->cu_msg.cm_u.cm_gracetime = (int64_t)nn->boot_time; - ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_msg); + cup->cu_u.cu_msg.cm_cmd = Cld_GraceDone; + ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_u.cu_msg, nn); if (!ret) - ret = cup->cu_msg.cm_status; + ret = cup->cu_u.cu_msg.cm_status; free_cld_upcall(cup); out_err: + nfs4_release_reclaim(nn); if (ret) printk(KERN_ERR "NFSD: Unable to end grace period: %d\n", ret); } -static const struct nfsd4_client_tracking_ops nfsd4_cld_tracking_ops = { +static int +nfs4_cld_state_init(struct net *net) +{ + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + int i; + + nn->reclaim_str_hashtbl = kmalloc_array(CLIENT_HASH_SIZE, + sizeof(struct list_head), + GFP_KERNEL); + if (!nn->reclaim_str_hashtbl) + return -ENOMEM; + + for (i = 0; i < CLIENT_HASH_SIZE; i++) + INIT_LIST_HEAD(&nn->reclaim_str_hashtbl[i]); + nn->reclaim_str_hashtbl_size = 0; + nn->track_reclaim_completes = true; + atomic_set(&nn->nr_reclaim_complete, 0); + + return 0; +} + +static void +nfs4_cld_state_shutdown(struct net *net) +{ + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + + nn->track_reclaim_completes = false; + kfree(nn->reclaim_str_hashtbl); +} + +static bool +cld_running(struct nfsd_net *nn) +{ + struct cld_net *cn = nn->cld_net; + struct rpc_pipe *pipe = cn->cn_pipe; + + return pipe->nreaders || pipe->nwriters; +} + +static int +nfsd4_cld_get_version(struct nfsd_net *nn) +{ + int ret = 0; + struct cld_upcall *cup; + struct cld_net *cn = nn->cld_net; + uint8_t version; + + cup = alloc_cld_upcall(nn); + if (!cup) { + ret = -ENOMEM; + goto out_err; + } + cup->cu_u.cu_msg.cm_cmd = Cld_GetVersion; + ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_u.cu_msg, nn); + if (!ret) { + ret = cup->cu_u.cu_msg.cm_status; + if (ret) + goto out_free; + version = cup->cu_u.cu_msg.cm_u.cm_version; + dprintk("%s: userspace returned version %u\n", + __func__, version); + if (version < 1) + version = 1; + else if (version > CLD_UPCALL_VERSION) + version = CLD_UPCALL_VERSION; + + switch (version) { + case 1: + nn->client_tracking_ops = &nfsd4_cld_tracking_ops; + break; + case 2: + nn->client_tracking_ops = &nfsd4_cld_tracking_ops_v2; + break; + default: + break; + } + } +out_free: + free_cld_upcall(cup); +out_err: + if (ret) + dprintk("%s: Unable to get version from userspace: %d\n", + __func__, ret); + return ret; +} + +static int +nfsd4_cld_tracking_init(struct net *net) +{ + int status; + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + bool running; + int retries = 10; + + status = nfs4_cld_state_init(net); + if (status) + return status; + + status = __nfsd4_init_cld_pipe(net); + if (status) + goto err_shutdown; + + /* + * rpc pipe upcalls take 30 seconds to time out, so we don't want to + * queue an upcall unless we know that nfsdcld is running (because we + * want this to fail fast so that nfsd4_client_tracking_init() can try + * the next client tracking method). nfsdcld should already be running + * before nfsd is started, so the wait here is for nfsdcld to open the + * pipefs file we just created. 
+ */ + while (!(running = cld_running(nn)) && retries--) + msleep(100); + + if (!running) { + status = -ETIMEDOUT; + goto err_remove; + } + + status = nfsd4_cld_get_version(nn); + if (status == -EOPNOTSUPP) + pr_warn("NFSD: nfsdcld GetVersion upcall failed. Please upgrade nfsdcld.\n"); + + status = nfsd4_cld_grace_start(nn); + if (status) { + if (status == -EOPNOTSUPP) + pr_warn("NFSD: nfsdcld GraceStart upcall failed. Please upgrade nfsdcld.\n"); + nfs4_release_reclaim(nn); + goto err_remove; + } else + pr_info("NFSD: Using nfsdcld client tracking operations.\n"); + return 0; + +err_remove: + nfsd4_remove_cld_pipe(net); +err_shutdown: + nfs4_cld_state_shutdown(net); + return status; +} + +static void +nfsd4_cld_tracking_exit(struct net *net) +{ + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + + nfs4_release_reclaim(nn); + nfsd4_remove_cld_pipe(net); + nfs4_cld_state_shutdown(net); +} + +/* For older nfsdcld's */ +static const struct nfsd4_client_tracking_ops nfsd4_cld_tracking_ops_v0 = { .init = nfsd4_init_cld_pipe, .exit = nfsd4_remove_cld_pipe, .create = nfsd4_cld_create, .remove = nfsd4_cld_remove, + .check = nfsd4_cld_check_v0, + .grace_done = nfsd4_cld_grace_done_v0, + .version = 1, + .msglen = sizeof(struct cld_msg), +}; + +/* For newer nfsdcld's */ +static const struct nfsd4_client_tracking_ops nfsd4_cld_tracking_ops = { + .init = nfsd4_cld_tracking_init, + .exit = nfsd4_cld_tracking_exit, + .create = nfsd4_cld_create, + .remove = nfsd4_cld_remove, .check = nfsd4_cld_check, .grace_done = nfsd4_cld_grace_done, + .version = 1, + .msglen = sizeof(struct cld_msg), }; +/* v2 create/check ops include the principal, if available */ +static const struct nfsd4_client_tracking_ops nfsd4_cld_tracking_ops_v2 = { + .init = nfsd4_cld_tracking_init, + .exit = nfsd4_cld_tracking_exit, + .create = nfsd4_cld_create_v2, + .remove = nfsd4_cld_remove, + .check = nfsd4_cld_check_v2, + .grace_done = nfsd4_cld_grace_done, + .version = 2, + .msglen = sizeof(struct cld_msg_v2), +}; + +#ifdef CONFIG_NFSD_LEGACY_CLIENT_TRACKING /* upcall via usermodehelper */ static char cltrack_prog[PATH_MAX] = "/sbin/nfsdcltrack"; module_param_string(cltrack_prog, cltrack_prog, sizeof(cltrack_prog), @@ -1131,11 +1605,7 @@ nfsd4_cltrack_legacy_recdir(const struct xdr_netobj *name) return NULL; } - copied = nfs4_make_rec_clidname(result + copied, name); - if (copied) { - kfree(result); - return NULL; - } + nfs4_make_rec_clidname(result + copied, name); return result; } @@ -1166,7 +1636,7 @@ nfsd4_cltrack_client_has_session(struct nfs4_client *clp) } static char * -nfsd4_cltrack_grace_start(time_t grace_start) +nfsd4_cltrack_grace_start(time64_t grace_start) { int copied; size_t len; @@ -1179,7 +1649,7 @@ nfsd4_cltrack_grace_start(time_t grace_start) if (!result) return result; - copied = snprintf(result, len, GRACE_START_ENV_PREFIX "%ld", + copied = snprintf(result, len, GRACE_START_ENV_PREFIX "%lld", grace_start); if (copied >= len) { /* just return nothing if output was truncated */ @@ -1236,19 +1706,14 @@ nfsd4_umh_cltrack_upcall(char *cmd, char *arg, char *env0, char *env1) static char * bin_to_hex_dup(const unsigned char *src, int srclen) { - int i; - char *buf, *hex; + char *buf; /* +1 for terminating NULL */ - buf = kmalloc((srclen * 2) + 1, GFP_KERNEL); + buf = kzalloc((srclen * 2) + 1, GFP_KERNEL); if (!buf) return buf; - hex = buf; - for (i = 0; i < srclen; i++) { - sprintf(hex, "%2.2x", *src++); - hex += 2; - } + bin2hex(buf, src, srclen); return buf; } @@ -1268,6 +1733,8 @@ nfsd4_umh_cltrack_init(struct 
net *net) ret = nfsd4_umh_cltrack_upcall("init", NULL, grace_start, NULL); kfree(grace_start); + if (!ret) + pr_info("NFSD: Using UMH upcall client tracking operations.\n"); return ret; } @@ -1281,10 +1748,7 @@ nfsd4_cltrack_upcall_lock(struct nfs4_client *clp) static void nfsd4_cltrack_upcall_unlock(struct nfs4_client *clp) { - smp_mb__before_atomic(); - clear_bit(NFSD4_CLIENT_UPCALL_LOCK, &clp->cl_flags); - smp_mb__after_atomic(); - wake_up_bit(&clp->cl_flags, NFSD4_CLIENT_UPCALL_LOCK); + clear_and_wake_up_bit(NFSD4_CLIENT_UPCALL_LOCK, &clp->cl_flags); } static void @@ -1391,7 +1855,7 @@ nfsd4_umh_cltrack_grace_done(struct nfsd_net *nn) char *legacy; char timestr[22]; /* FIXME: better way to determine max size? */ - sprintf(timestr, "%ld", nn->boot_time); + sprintf(timestr, "%lld", nn->boot_time); legacy = nfsd4_cltrack_legacy_topdir(); nfsd4_umh_cltrack_upcall("gracedone", timestr, legacy, NULL); kfree(legacy); @@ -1404,22 +1868,17 @@ static const struct nfsd4_client_tracking_ops nfsd4_umh_tracking_ops = { .remove = nfsd4_umh_cltrack_remove, .check = nfsd4_umh_cltrack_check, .grace_done = nfsd4_umh_cltrack_grace_done, + .version = 1, + .msglen = 0, }; -int -nfsd4_client_tracking_init(struct net *net) +static inline int check_for_legacy_methods(int status, struct net *net) { - int status; - struct path path; struct nfsd_net *nn = net_generic(net, nfsd_net_id); - - /* just run the init if it the method is already decided */ - if (nn->client_tracking_ops) - goto do_init; + struct path path; /* - * First, try a UMH upcall. It should succeed or fail quickly, so - * there's little harm in trying that first. + * Next, try the UMH upcall. */ nn->client_tracking_ops = &nfsd4_umh_tracking_ops; status = nn->client_tracking_ops->init(net); @@ -1427,28 +1886,57 @@ nfsd4_client_tracking_init(struct net *net) return status; /* - * See if the recoverydir exists and is a directory. If it is, - * then use the legacy ops. + * Finally, See if the recoverydir exists and is a directory. + * If it is, then use the legacy ops. */ nn->client_tracking_ops = &nfsd4_legacy_tracking_ops; status = kern_path(nfs4_recoverydir(), LOOKUP_FOLLOW, &path); if (!status) { - status = d_is_dir(path.dentry); + status = !d_is_dir(path.dentry); path_put(&path); if (status) - goto do_init; + return -ENOTDIR; } + return status; +} +#else +static inline int check_for_legacy_methods(int status, struct net *net) +{ + return status; +} +#endif /* CONFIG_LEGACY_NFSD_CLIENT_TRACKING */ + +int +nfsd4_client_tracking_init(struct net *net) +{ + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + int status; + + /* just run the init if it the method is already decided */ + if (nn->client_tracking_ops) + goto do_init; - /* Finally, try to use nfsdcld */ + /* First, try to use nfsdcld */ nn->client_tracking_ops = &nfsd4_cld_tracking_ops; - printk(KERN_WARNING "NFSD: the nfsdcld client tracking upcall will be " - "removed in 3.10. Please transition to using " - "nfsdcltrack.\n"); + status = nn->client_tracking_ops->init(net); + if (!status) + return status; + if (status != -ETIMEDOUT) { + nn->client_tracking_ops = &nfsd4_cld_tracking_ops_v0; + status = nn->client_tracking_ops->init(net); + if (!status) + return status; + } + + status = check_for_legacy_methods(status, net); + if (status) + goto out; do_init: status = nn->client_tracking_ops->init(net); +out: if (status) { - printk(KERN_WARNING "NFSD: Unable to initialize client " - "recovery tracking! (%d)\n", status); + pr_warn("NFSD: Unable to initialize client recovery tracking! 
(%d)\n", status); + pr_warn("NFSD: Is nfsdcld running? If not, enable CONFIG_NFSD_LEGACY_CLIENT_TRACKING.\n"); nn->client_tracking_ops = NULL; } return status; @@ -1509,7 +1997,6 @@ rpc_pipefs_event(struct notifier_block *nb, unsigned long event, void *ptr) struct net *net = sb->s_fs_info; struct nfsd_net *nn = net_generic(net, nfsd_net_id); struct cld_net *cn = nn->cld_net; - struct dentry *dentry; int ret = 0; if (!try_module_get(THIS_MODULE)) @@ -1522,16 +2009,10 @@ rpc_pipefs_event(struct notifier_block *nb, unsigned long event, void *ptr) switch (event) { case RPC_PIPEFS_MOUNT: - dentry = nfsd4_cld_register_sb(sb, cn->cn_pipe); - if (IS_ERR(dentry)) { - ret = PTR_ERR(dentry); - break; - } - cn->cn_pipe->dentry = dentry; + ret = nfsd4_cld_register_sb(sb, cn->cn_pipe); break; case RPC_PIPEFS_UMOUNT: - if (cn->cn_pipe->dentry) - nfsd4_cld_unregister_sb(cn->cn_pipe); + rpc_unlink(cn->cn_pipe); break; default: ret = -ENOTSUPP; @@ -1548,6 +2029,7 @@ static struct notifier_block nfsd4_cld_block = { int register_cld_notifier(void) { + WARN_ON(!nfsd_net_id); return rpc_pipefs_notifier_register(&nfsd4_cld_block); } diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index fb3c9844c82a..808c24fb5c9a 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -42,6 +42,11 @@ #include <linux/sunrpc/svcauth_gss.h> #include <linux/sunrpc/addr.h> #include <linux/jhash.h> +#include <linux/string_helpers.h> +#include <linux/fsnotify.h> +#include <linux/rhashtable.h> +#include <linux/nfs_ssc.h> + #include "xdr4.h" #include "xdr4cb.h" #include "vfs.h" @@ -49,10 +54,12 @@ #include "netns.h" #include "pnfs.h" +#include "filecache.h" +#include "trace.h" #define NFSDDBG_FACILITY NFSDDBG_PROC -#define all_ones {{~0,~0},~0} +#define all_ones {{ ~0, ~0}, ~0} static const stateid_t one_stateid = { .si_generation = ~0, .si_opaque = all_ones, @@ -77,6 +84,10 @@ static u64 current_sessionid = 1; /* forward declarations */ static bool check_for_locks(struct nfs4_file *fp, struct nfs4_lockowner *lowner); static void nfs4_free_ol_stateid(struct nfs4_stid *stid); +void nfsd4_end_grace(struct nfsd_net *nn); +static void _free_cpntf_state_locked(struct nfsd_net *nn, struct nfs4_cpntf_state *cps); +static void nfsd4_file_hash_remove(struct nfs4_file *fi); +static void deleg_reaper(struct nfsd_net *nn); /* Locking: */ @@ -98,6 +109,13 @@ enum nfsd4_st_mutex_lock_subclass { */ static DECLARE_WAIT_QUEUE_HEAD(close_wq); +/* + * A waitqueue where a writer to clients/#/ctl destroying a client can + * wait for cl_rpc_users to drop to 0 and then for the client to be + * unhashed. 
+ */ +static DECLARE_WAIT_QUEUE_HEAD(expiry_wq); + static struct kmem_cache *client_slab; static struct kmem_cache *openowner_slab; static struct kmem_cache *lockowner_slab; @@ -110,17 +128,35 @@ static void free_session(struct nfsd4_session *); static const struct nfsd4_callback_ops nfsd4_cb_recall_ops; static const struct nfsd4_callback_ops nfsd4_cb_notify_lock_ops; +static const struct nfsd4_callback_ops nfsd4_cb_getattr_ops; + +static struct workqueue_struct *laundry_wq; + +int nfsd4_create_laundry_wq(void) +{ + int rc = 0; + + laundry_wq = alloc_workqueue("%s", WQ_UNBOUND, 0, "nfsd4"); + if (laundry_wq == NULL) + rc = -ENOMEM; + return rc; +} + +void nfsd4_destroy_laundry_wq(void) +{ + destroy_workqueue(laundry_wq); +} static bool is_session_dead(struct nfsd4_session *ses) { - return ses->se_flags & NFS4_SESSION_DEAD; + return ses->se_dead; } static __be32 mark_session_dead_locked(struct nfsd4_session *ses, int ref_held_by_me) { if (atomic_read(&ses->se_ref) > ref_held_by_me) return nfserr_jukebox; - ses->se_flags |= NFS4_SESSION_DEAD; + ses->se_dead = true; return nfs_ok; } @@ -129,6 +165,13 @@ static bool is_client_expired(struct nfs4_client *clp) return clp->cl_time == 0; } +static void nfsd4_dec_courtesy_client_count(struct nfsd_net *nn, + struct nfs4_client *clp) +{ + if (clp->cl_state != NFSD4_ACTIVE) + atomic_add_unless(&nn->nfsd_courtesy_clients, -1, 0); +} + static __be32 get_client_locked(struct nfs4_client *clp) { struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); @@ -137,7 +180,9 @@ static __be32 get_client_locked(struct nfs4_client *clp) if (is_client_expired(clp)) return nfserr_expired; - atomic_inc(&clp->cl_refcount); + atomic_inc(&clp->cl_rpc_users); + nfsd4_dec_courtesy_client_count(nn, clp); + clp->cl_state = NFSD4_ACTIVE; return nfs_ok; } @@ -156,11 +201,10 @@ renew_client_locked(struct nfs4_client *clp) return; } - dprintk("renewing client (clientid %08x/%08x)\n", - clp->cl_clientid.cl_boot, - clp->cl_clientid.cl_id); list_move_tail(&clp->cl_lru, &nn->client_lru); - clp->cl_time = get_seconds(); + clp->cl_time = ktime_get_boottime_seconds(); + nfsd4_dec_courtesy_client_count(nn, clp); + clp->cl_state = NFSD4_ACTIVE; } static void put_client_renew_locked(struct nfs4_client *clp) @@ -169,20 +213,24 @@ static void put_client_renew_locked(struct nfs4_client *clp) lockdep_assert_held(&nn->client_lock); - if (!atomic_dec_and_test(&clp->cl_refcount)) + if (!atomic_dec_and_test(&clp->cl_rpc_users)) return; if (!is_client_expired(clp)) renew_client_locked(clp); + else + wake_up_all(&expiry_wq); } static void put_client_renew(struct nfs4_client *clp) { struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); - if (!atomic_dec_and_lock(&clp->cl_refcount, &nn->client_lock)) + if (!atomic_dec_and_lock(&clp->cl_rpc_users, &nn->client_lock)) return; if (!is_client_expired(clp)) renew_client_locked(clp); + else + wake_up_all(&expiry_wq); spin_unlock(&nn->client_lock); } @@ -231,6 +279,7 @@ find_blocked_lock(struct nfs4_lockowner *lo, struct knfsd_fh *fh, list_for_each_entry(cur, &lo->lo_blocked, nbl_list) { if (fh_match(fh, &cur->nbl_fh)) { list_del_init(&cur->nbl_list); + WARN_ON(list_empty(&cur->nbl_lru)); list_del_init(&cur->nbl_lru); found = cur; break; @@ -250,10 +299,13 @@ find_or_allocate_block(struct nfs4_lockowner *lo, struct knfsd_fh *fh, nbl = find_blocked_lock(lo, fh, nn); if (!nbl) { - nbl= kmalloc(sizeof(*nbl), GFP_KERNEL); + nbl = kmalloc(sizeof(*nbl), GFP_KERNEL); if (nbl) { + INIT_LIST_HEAD(&nbl->nbl_list); + INIT_LIST_HEAD(&nbl->nbl_lru); 
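(Illustrative aside, not part of the patch: the expiry_wq added above pairs with the put_client_renew{,_locked}() changes, where the final drop of cl_rpc_users on an already-expired client does wake_up_all() so the writer destroying the client can sleep until it goes idle. In userspace terms that is a reference count whose last put signals a condition variable; a minimal pthreads analogue, all names invented:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  idle = PTHREAD_COND_INITIALIZER;	/* plays the role of expiry_wq */
static int rpc_users = 3;				/* plays the role of cl_rpc_users */

static void put_user(void)
{
	pthread_mutex_lock(&lock);
	if (--rpc_users == 0)
		pthread_cond_broadcast(&idle);	/* like wake_up_all(&expiry_wq) */
	pthread_mutex_unlock(&lock);
}

static void *worker(void *arg)
{
	(void)arg;
	put_user();
	return NULL;
}

int main(void)
{
	pthread_t t[3];

	for (int i = 0; i < 3; i++)
		pthread_create(&t[i], NULL, worker, NULL);

	/* The "destroy this client" writer waits until the last user drops. */
	pthread_mutex_lock(&lock);
	while (rpc_users != 0)
		pthread_cond_wait(&idle, &lock);
	pthread_mutex_unlock(&lock);
	printf("client is idle, safe to tear down\n");

	for (int i = 0; i < 3; i++)
		pthread_join(t[i], NULL);
	return 0;
}

Build with -pthread; the main thread blocks until all three workers have dropped their references, just as the ctl writer waits for cl_rpc_users to reach zero.)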
fh_copy_shallow(&nbl->nbl_fh, fh); locks_init_lock(&nbl->nbl_lock); + kref_init(&nbl->nbl_kref); nfsd4_init_cb(&nbl->nbl_cb, lo->lo_owner.so_client, &nfsd4_cb_notify_lock_ops, NFSPROC4_CLNT_CB_NOTIFY_LOCK); @@ -263,13 +315,23 @@ find_or_allocate_block(struct nfs4_lockowner *lo, struct knfsd_fh *fh, } static void -free_blocked_lock(struct nfsd4_blocked_lock *nbl) +free_nbl(struct kref *kref) { + struct nfsd4_blocked_lock *nbl; + + nbl = container_of(kref, struct nfsd4_blocked_lock, nbl_kref); locks_release_private(&nbl->nbl_lock); kfree(nbl); } static void +free_blocked_lock(struct nfsd4_blocked_lock *nbl) +{ + locks_delete_block(&nbl->nbl_lock); + kref_put(&nbl->nbl_kref, free_nbl); +} + +static void remove_blocked_locks(struct nfs4_lockowner *lo) { struct nfs4_client *clp = lo->lo_owner.so_client; @@ -284,6 +346,7 @@ remove_blocked_locks(struct nfs4_lockowner *lo) struct nfsd4_blocked_lock, nbl_list); list_del_init(&nbl->nbl_list); + WARN_ON(list_empty(&nbl->nbl_lru)); list_move(&nbl->nbl_lru, &reaplist); } spin_unlock(&nn->blocked_locks_lock); @@ -293,14 +356,23 @@ remove_blocked_locks(struct nfs4_lockowner *lo) nbl = list_first_entry(&reaplist, struct nfsd4_blocked_lock, nbl_lru); list_del_init(&nbl->nbl_lru); - locks_delete_block(&nbl->nbl_lock); free_blocked_lock(nbl); } } +static void +nfsd4_cb_notify_lock_prepare(struct nfsd4_callback *cb) +{ + struct nfsd4_blocked_lock *nbl = container_of(cb, + struct nfsd4_blocked_lock, nbl_cb); + locks_delete_block(&nbl->nbl_lock); +} + static int nfsd4_cb_notify_lock_done(struct nfsd4_callback *cb, struct rpc_task *task) { + trace_nfsd_cb_notify_lock_done(&zero_stateid, task); + /* * Since this is just an optimization, we don't try very hard if it * turns out not to succeed. We'll requeue it on NFS4ERR_DELAY, and @@ -325,10 +397,136 @@ nfsd4_cb_notify_lock_release(struct nfsd4_callback *cb) } static const struct nfsd4_callback_ops nfsd4_cb_notify_lock_ops = { + .prepare = nfsd4_cb_notify_lock_prepare, .done = nfsd4_cb_notify_lock_done, .release = nfsd4_cb_notify_lock_release, + .opcode = OP_CB_NOTIFY_LOCK, }; +/* + * We store the NONE, READ, WRITE, and BOTH bits separately in the + * st_{access,deny}_bmap field of the stateid, in order to track not + * only what share bits are currently in force, but also what + * combinations of share bits previous opens have used. This allows us + * to enforce the recommendation in + * https://datatracker.ietf.org/doc/html/rfc7530#section-16.19.4 that + * the server return an error if the client attempt to downgrade to a + * combination of share bits not explicable by closing some of its + * previous opens. + * + * This enforcement is arguably incomplete, since we don't keep + * track of access/deny bit combinations; so, e.g., we allow: + * + * OPEN allow read, deny write + * OPEN allow both, deny none + * DOWNGRADE allow read, deny none + * + * which we should reject. + * + * But you could also argue that our current code is already overkill, + * since it only exists to return NFS4ERR_INVAL on incorrect client + * behavior. 
+ */ +static unsigned int +bmap_to_share_mode(unsigned long bmap) +{ + int i; + unsigned int access = 0; + + for (i = 1; i < 4; i++) { + if (test_bit(i, &bmap)) + access |= i; + } + return access; +} + +/* set share access for a given stateid */ +static inline void +set_access(u32 access, struct nfs4_ol_stateid *stp) +{ + unsigned char mask = 1 << access; + + WARN_ON_ONCE(access > NFS4_SHARE_ACCESS_BOTH); + stp->st_access_bmap |= mask; +} + +/* clear share access for a given stateid */ +static inline void +clear_access(u32 access, struct nfs4_ol_stateid *stp) +{ + unsigned char mask = 1 << access; + + WARN_ON_ONCE(access > NFS4_SHARE_ACCESS_BOTH); + stp->st_access_bmap &= ~mask; +} + +/* test whether a given stateid has access */ +static inline bool +test_access(u32 access, struct nfs4_ol_stateid *stp) +{ + unsigned char mask = 1 << access; + + return (bool)(stp->st_access_bmap & mask); +} + +/* set share deny for a given stateid */ +static inline void +set_deny(u32 deny, struct nfs4_ol_stateid *stp) +{ + unsigned char mask = 1 << deny; + + WARN_ON_ONCE(deny > NFS4_SHARE_DENY_BOTH); + stp->st_deny_bmap |= mask; +} + +/* clear share deny for a given stateid */ +static inline void +clear_deny(u32 deny, struct nfs4_ol_stateid *stp) +{ + unsigned char mask = 1 << deny; + + WARN_ON_ONCE(deny > NFS4_SHARE_DENY_BOTH); + stp->st_deny_bmap &= ~mask; +} + +/* test whether a given stateid is denying specific access */ +static inline bool +test_deny(u32 deny, struct nfs4_ol_stateid *stp) +{ + unsigned char mask = 1 << deny; + + return (bool)(stp->st_deny_bmap & mask); +} + +static int nfs4_access_to_omode(u32 access) +{ + switch (access & NFS4_SHARE_ACCESS_BOTH) { + case NFS4_SHARE_ACCESS_READ: + return O_RDONLY; + case NFS4_SHARE_ACCESS_WRITE: + return O_WRONLY; + case NFS4_SHARE_ACCESS_BOTH: + return O_RDWR; + } + WARN_ON_ONCE(1); + return O_RDONLY; +} + +static inline int +access_permit_read(struct nfs4_ol_stateid *stp) +{ + return test_access(NFS4_SHARE_ACCESS_READ, stp) || + test_access(NFS4_SHARE_ACCESS_BOTH, stp) || + test_access(NFS4_SHARE_ACCESS_WRITE, stp); +} + +static inline int +access_permit_write(struct nfs4_ol_stateid *stp) +{ + return test_access(NFS4_SHARE_ACCESS_WRITE, stp) || + test_access(NFS4_SHARE_ACCESS_BOTH, stp); +} + static inline struct nfs4_stateowner * nfs4_get_stateowner(struct nfs4_stateowner *sop) { @@ -344,7 +542,7 @@ same_owner_str(struct nfs4_stateowner *sop, struct xdr_netobj *owner) } static struct nfs4_openowner * -find_openstateowner_str_locked(unsigned int hashval, struct nfsd4_open *open, +find_openstateowner_str(unsigned int hashval, struct nfsd4_open *open, struct nfs4_client *clp) { struct nfs4_stateowner *so; @@ -361,18 +559,6 @@ find_openstateowner_str_locked(unsigned int hashval, struct nfsd4_open *open, return NULL; } -static struct nfs4_openowner * -find_openstateowner_str(unsigned int hashval, struct nfsd4_open *open, - struct nfs4_client *clp) -{ - struct nfs4_openowner *oo; - - spin_lock(&clp->cl_lock); - oo = find_openstateowner_str_locked(hashval, open, clp); - spin_unlock(&clp->cl_lock); - return oo; -} - static inline u32 opaque_hashval(const void *ptr, int nbytes) { @@ -386,52 +572,34 @@ opaque_hashval(const void *ptr, int nbytes) return x; } -static void nfsd4_free_file_rcu(struct rcu_head *rcu) -{ - struct nfs4_file *fp = container_of(rcu, struct nfs4_file, fi_rcu); - - kmem_cache_free(file_slab, fp); -} - void put_nfs4_file(struct nfs4_file *fi) { - might_lock(&state_lock); - - if (refcount_dec_and_lock(&fi->fi_ref, &state_lock)) { - 
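(Illustrative aside, not part of the patch: st_access_bmap records every share-access value an open stateid has ever used, not just the one currently in force, which is what the OPEN_DOWNGRADE check described in the comment above relies on. This standalone sketch uses the usual NFSv4 share-access values (READ=1, WRITE=2, BOTH=3) to show how the bitmap accumulates:

#include <stdio.h>

#define NFS4_SHARE_ACCESS_READ	1u
#define NFS4_SHARE_ACCESS_WRITE	2u
#define NFS4_SHARE_ACCESS_BOTH	3u

/* Same idea as set_access(): remember that this access value was used. */
static void set_access(unsigned int access, unsigned long *bmap)
{
	*bmap |= 1ul << access;
}

/* Same idea as bmap_to_share_mode(): OR together every access value whose
 * bit is set, yielding the union of modes ever granted on this stateid. */
static unsigned int bmap_to_share_mode(unsigned long bmap)
{
	unsigned int i, access = 0;

	for (i = 1; i < 4; i++)
		if (bmap & (1ul << i))
			access |= i;
	return access;
}

int main(void)
{
	unsigned long bmap = 0;

	set_access(NFS4_SHARE_ACCESS_READ, &bmap);
	printf("after READ open: mode=%u\n", bmap_to_share_mode(bmap));	/* 1 */

	set_access(NFS4_SHARE_ACCESS_BOTH, &bmap);
	printf("after BOTH open: mode=%u\n", bmap_to_share_mode(bmap));	/* 3 */

	/* A downgrade to WRITE-only is justifiable only if the WRITE or BOTH
	 * bit is present; the READ bit alone would not explain it. */
	return 0;
}

Running it prints mode 1 after the READ open and mode 3 once BOTH has also been used, which is exactly the record the downgrade check consults.)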
hlist_del_rcu(&fi->fi_hash); - spin_unlock(&state_lock); + if (refcount_dec_and_test(&fi->fi_ref)) { + nfsd4_file_hash_remove(fi); WARN_ON_ONCE(!list_empty(&fi->fi_clnt_odstate)); WARN_ON_ONCE(!list_empty(&fi->fi_delegations)); - call_rcu(&fi->fi_rcu, nfsd4_free_file_rcu); + kfree_rcu(fi, fi_rcu); } } -static struct file * -__nfs4_get_fd(struct nfs4_file *f, int oflag) -{ - if (f->fi_fds[oflag]) - return get_file(f->fi_fds[oflag]); - return NULL; -} - -static struct file * +static struct nfsd_file * find_writeable_file_locked(struct nfs4_file *f) { - struct file *ret; + struct nfsd_file *ret; lockdep_assert_held(&f->fi_lock); - ret = __nfs4_get_fd(f, O_WRONLY); + ret = nfsd_file_get(f->fi_fds[O_WRONLY]); if (!ret) - ret = __nfs4_get_fd(f, O_RDWR); + ret = nfsd_file_get(f->fi_fds[O_RDWR]); return ret; } -static struct file * +static struct nfsd_file * find_writeable_file(struct nfs4_file *f) { - struct file *ret; + struct nfsd_file *ret; spin_lock(&f->fi_lock); ret = find_writeable_file_locked(f); @@ -440,22 +608,23 @@ find_writeable_file(struct nfs4_file *f) return ret; } -static struct file *find_readable_file_locked(struct nfs4_file *f) +static struct nfsd_file * +find_readable_file_locked(struct nfs4_file *f) { - struct file *ret; + struct nfsd_file *ret; lockdep_assert_held(&f->fi_lock); - ret = __nfs4_get_fd(f, O_RDONLY); + ret = nfsd_file_get(f->fi_fds[O_RDONLY]); if (!ret) - ret = __nfs4_get_fd(f, O_RDWR); + ret = nfsd_file_get(f->fi_fds[O_RDWR]); return ret; } -static struct file * +static struct nfsd_file * find_readable_file(struct nfs4_file *f) { - struct file *ret; + struct nfsd_file *ret; spin_lock(&f->fi_lock); ret = find_readable_file_locked(f); @@ -464,22 +633,37 @@ find_readable_file(struct nfs4_file *f) return ret; } -struct file * +struct nfsd_file * find_any_file(struct nfs4_file *f) { - struct file *ret; + struct nfsd_file *ret; + if (!f) + return NULL; spin_lock(&f->fi_lock); - ret = __nfs4_get_fd(f, O_RDWR); + ret = nfsd_file_get(f->fi_fds[O_RDWR]); if (!ret) { - ret = __nfs4_get_fd(f, O_WRONLY); + ret = nfsd_file_get(f->fi_fds[O_WRONLY]); if (!ret) - ret = __nfs4_get_fd(f, O_RDONLY); + ret = nfsd_file_get(f->fi_fds[O_RDONLY]); } spin_unlock(&f->fi_lock); return ret; } +static struct nfsd_file *find_any_file_locked(struct nfs4_file *f) +{ + lockdep_assert_held(&f->fi_lock); + + if (f->fi_fds[O_RDWR]) + return f->fi_fds[O_RDWR]; + if (f->fi_fds[O_WRONLY]) + return f->fi_fds[O_WRONLY]; + if (f->fi_fds[O_RDONLY]) + return f->fi_fds[O_RDONLY]; + return NULL; +} + static atomic_long_t num_delegations; unsigned long max_delegations; @@ -500,21 +684,71 @@ static unsigned int ownerstr_hashval(struct xdr_netobj *ownername) return ret & OWNER_HASH_MASK; } -/* hash table for nfs4_file */ -#define FILE_HASH_BITS 8 -#define FILE_HASH_SIZE (1 << FILE_HASH_BITS) +static struct rhltable nfs4_file_rhltable ____cacheline_aligned_in_smp; -static unsigned int nfsd_fh_hashval(struct knfsd_fh *fh) -{ - return jhash2(fh->fh_base.fh_pad, XDR_QUADLEN(fh->fh_size), 0); -} +static const struct rhashtable_params nfs4_file_rhash_params = { + .key_len = sizeof_field(struct nfs4_file, fi_inode), + .key_offset = offsetof(struct nfs4_file, fi_inode), + .head_offset = offsetof(struct nfs4_file, fi_rlist), -static unsigned int file_hashval(struct knfsd_fh *fh) + /* + * Start with a single page hash table to reduce resizing churn + * on light workloads. 
+ */ + .min_size = 256, + .automatic_shrinking = true, +}; + +/* + * Check if courtesy clients have conflicting access and resolve it if possible + * + * access: is op_share_access if share_access is true. + * Check if access mode, op_share_access, would conflict with + * the current deny mode of the file 'fp'. + * access: is op_share_deny if share_access is false. + * Check if the deny mode, op_share_deny, would conflict with + * current access of the file 'fp'. + * stp: skip checking this entry. + * new_stp: normal open, not open upgrade. + * + * Function returns: + * false - access/deny mode conflict with normal client. + * true - no conflict or conflict with courtesy client(s) is resolved. + */ +static bool +nfs4_resolve_deny_conflicts_locked(struct nfs4_file *fp, bool new_stp, + struct nfs4_ol_stateid *stp, u32 access, bool share_access) { - return nfsd_fh_hashval(fh) & (FILE_HASH_SIZE - 1); -} + struct nfs4_ol_stateid *st; + bool resolvable = true; + unsigned char bmap; + struct nfsd_net *nn; + struct nfs4_client *clp; -static struct hlist_head file_hashtbl[FILE_HASH_SIZE]; + lockdep_assert_held(&fp->fi_lock); + list_for_each_entry(st, &fp->fi_stateids, st_perfile) { + /* ignore lock stateid */ + if (st->st_openstp) + continue; + if (st == stp && new_stp) + continue; + /* check file access against deny mode or vice versa */ + bmap = share_access ? st->st_deny_bmap : st->st_access_bmap; + if (!(access & bmap_to_share_mode(bmap))) + continue; + clp = st->st_stid.sc_client; + if (try_to_expire_client(clp)) + continue; + resolvable = false; + break; + } + if (resolvable) { + clp = stp->st_stid.sc_client; + nn = net_generic(clp->net, nfsd_net_id); + mod_delayed_work(laundry_wq, &nn->laundromat_work, 0); + } + return resolvable; +} static void __nfs4_file_get_access(struct nfs4_file *fp, u32 access) @@ -568,17 +802,17 @@ static void __nfs4_file_put_access(struct nfs4_file *fp, int oflag) might_lock(&fp->fi_lock); if (atomic_dec_and_lock(&fp->fi_access[oflag], &fp->fi_lock)) { - struct file *f1 = NULL; - struct file *f2 = NULL; + struct nfsd_file *f1 = NULL; + struct nfsd_file *f2 = NULL; swap(f1, fp->fi_fds[oflag]); if (atomic_read(&fp->fi_access[1 - oflag]) == 0) swap(f2, fp->fi_fds[O_RDWR]); spin_unlock(&fp->fi_lock); if (f1) - fput(f1); + nfsd_file_put(f1); if (f2) - fput(f2); + nfsd_file_put(f2); } } @@ -684,7 +918,8 @@ struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl, struct kmem_cache *sla idr_preload(GFP_KERNEL); spin_lock(&cl->cl_lock); - new_id = idr_alloc_cyclic(&cl->cl_stateids, stid, 0, 0, GFP_NOWAIT); + /* Reserving 0 for start of file in nfsdfs "states" file: */ + new_id = idr_alloc_cyclic(&cl->cl_stateids, stid, 1, 0, GFP_NOWAIT); spin_unlock(&cl->cl_lock); idr_preload_end(); if (new_id < 0) @@ -697,16 +932,8 @@ struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl, struct kmem_cache *sla /* Will be incremented before return to client: */ refcount_set(&stid->sc_count, 1); spin_lock_init(&stid->sc_lock); + INIT_LIST_HEAD(&stid->sc_cp_list); - /* - * It shouldn't be a problem to reuse an opaque stateid value. - * I don't think it is for 4.1. But with 4.0 I worry that, for - * example, a stray write retransmission could be accepted by - * the server when it should have been rejected. 
Therefore, - * adopt a trick from the sctp code to attempt to maximize the - * amount of time until an id is reused, by ensuring they always - * "increase" (mod INT_MAX): - */ return stid; out_free: kmem_cache_free(slab, stid); @@ -716,30 +943,78 @@ out_free: /* * Create a unique stateid_t to represent each COPY. */ -int nfs4_init_cp_state(struct nfsd_net *nn, struct nfsd4_copy *copy) +static int nfs4_init_cp_state(struct nfsd_net *nn, copy_stateid_t *stid, + unsigned char cs_type) { int new_id; + stid->cs_stid.si_opaque.so_clid.cl_boot = (u32)nn->boot_time; + stid->cs_stid.si_opaque.so_clid.cl_id = nn->s2s_cp_cl_id; + idr_preload(GFP_KERNEL); spin_lock(&nn->s2s_cp_lock); - new_id = idr_alloc_cyclic(&nn->s2s_cp_stateids, copy, 0, 0, GFP_NOWAIT); + new_id = idr_alloc_cyclic(&nn->s2s_cp_stateids, stid, 0, 0, GFP_NOWAIT); + stid->cs_stid.si_opaque.so_id = new_id; + stid->cs_stid.si_generation = 1; spin_unlock(&nn->s2s_cp_lock); idr_preload_end(); if (new_id < 0) return 0; - copy->cp_stateid.si_opaque.so_id = new_id; - copy->cp_stateid.si_opaque.so_clid.cl_boot = nn->boot_time; - copy->cp_stateid.si_opaque.so_clid.cl_id = nn->s2s_cp_cl_id; + stid->cs_type = cs_type; return 1; } -void nfs4_free_cp_state(struct nfsd4_copy *copy) +int nfs4_init_copy_state(struct nfsd_net *nn, struct nfsd4_copy *copy) +{ + return nfs4_init_cp_state(nn, ©->cp_stateid, NFS4_COPY_STID); +} + +struct nfs4_cpntf_state *nfs4_alloc_init_cpntf_state(struct nfsd_net *nn, + struct nfs4_stid *p_stid) +{ + struct nfs4_cpntf_state *cps; + + cps = kzalloc(sizeof(struct nfs4_cpntf_state), GFP_KERNEL); + if (!cps) + return NULL; + cps->cpntf_time = ktime_get_boottime_seconds(); + refcount_set(&cps->cp_stateid.cs_count, 1); + if (!nfs4_init_cp_state(nn, &cps->cp_stateid, NFS4_COPYNOTIFY_STID)) + goto out_free; + spin_lock(&nn->s2s_cp_lock); + list_add(&cps->cp_list, &p_stid->sc_cp_list); + spin_unlock(&nn->s2s_cp_lock); + return cps; +out_free: + kfree(cps); + return NULL; +} + +void nfs4_free_copy_state(struct nfsd4_copy *copy) { struct nfsd_net *nn; + if (copy->cp_stateid.cs_type != NFS4_COPY_STID) + return; nn = net_generic(copy->cp_clp->net, nfsd_net_id); spin_lock(&nn->s2s_cp_lock); - idr_remove(&nn->s2s_cp_stateids, copy->cp_stateid.si_opaque.so_id); + idr_remove(&nn->s2s_cp_stateids, + copy->cp_stateid.cs_stid.si_opaque.so_id); + spin_unlock(&nn->s2s_cp_lock); +} + +static void nfs4_free_cpntf_statelist(struct net *net, struct nfs4_stid *stid) +{ + struct nfs4_cpntf_state *cps; + struct nfsd_net *nn; + + nn = net_generic(net, nfsd_net_id); + spin_lock(&nn->s2s_cp_lock); + while (!list_empty(&stid->sc_cp_list)) { + cps = list_first_entry(&stid->sc_cp_list, + struct nfs4_cpntf_state, cp_list); + _free_cpntf_state_locked(nn, cps); + } spin_unlock(&nn->s2s_cp_lock); } @@ -754,8 +1029,20 @@ static struct nfs4_ol_stateid * nfs4_alloc_open_stateid(struct nfs4_client *clp) return openlockstateid(stid); } +/* + * As the sc_free callback of deleg, this may be called by nfs4_put_stid + * in nfsd_break_one_deleg. + * Considering nfsd_break_one_deleg is called with the flc->flc_lock held, + * this function mustn't ever sleep. 
+ */ static void nfs4_free_deleg(struct nfs4_stid *stid) { + struct nfs4_delegation *dp = delegstateid(stid); + + WARN_ON_ONCE(!list_empty(&stid->sc_cp_list)); + WARN_ON_ONCE(!list_empty(&dp->dl_perfile)); + WARN_ON_ONCE(!list_empty(&dp->dl_perclnt)); + WARN_ON_ONCE(!list_empty(&dp->dl_recall_lru)); kmem_cache_free(deleg_slab, stid); atomic_long_dec(&num_delegations); } @@ -769,7 +1056,8 @@ static void nfs4_free_deleg(struct nfs4_stid *stid) * When a delegation is recalled, the filehandle is stored in the "new" * filter. * Every 30 seconds we swap the filters and clear the "new" one, - * unless both are empty of course. + * unless both are empty of course. This results in delegations for a + * given filehandle being blocked for between 30 and 60 seconds. * * Each filter is 256 bits. We hash the filehandle to 32bit and use the * low 3 bytes as hash-table indices. @@ -781,7 +1069,7 @@ static void nfs4_free_deleg(struct nfs4_stid *stid) static DEFINE_SPINLOCK(blocked_delegations_lock); static struct bloom_pair { int entries, old_entries; - time_t swap_time; + time64_t swap_time; int new; /* index into 'set' */ DECLARE_BITMAP(set[2], 256); } blocked_delegations; @@ -793,19 +1081,19 @@ static int delegation_blocked(struct knfsd_fh *fh) if (bd->entries == 0) return 0; - if (seconds_since_boot() - bd->swap_time > 30) { + if (ktime_get_seconds() - bd->swap_time > 30) { spin_lock(&blocked_delegations_lock); - if (seconds_since_boot() - bd->swap_time > 30) { + if (ktime_get_seconds() - bd->swap_time > 30) { bd->entries -= bd->old_entries; bd->old_entries = bd->entries; + bd->new = 1-bd->new; memset(bd->set[bd->new], 0, sizeof(bd->set[0])); - bd->new = 1-bd->new; - bd->swap_time = seconds_since_boot(); + bd->swap_time = ktime_get_seconds(); } spin_unlock(&blocked_delegations_lock); } - hash = jhash(&fh->fh_base, fh->fh_size, 0); + hash = jhash(&fh->fh_raw, fh->fh_size, 0); if (test_bit(hash&255, bd->set[0]) && test_bit((hash>>8)&255, bd->set[0]) && test_bit((hash>>16)&255, bd->set[0])) @@ -824,35 +1112,36 @@ static void block_delegations(struct knfsd_fh *fh) u32 hash; struct bloom_pair *bd = &blocked_delegations; - hash = jhash(&fh->fh_base, fh->fh_size, 0); + hash = jhash(&fh->fh_raw, fh->fh_size, 0); spin_lock(&blocked_delegations_lock); __set_bit(hash&255, bd->set[bd->new]); __set_bit((hash>>8)&255, bd->set[bd->new]); __set_bit((hash>>16)&255, bd->set[bd->new]); if (bd->entries == 0) - bd->swap_time = seconds_since_boot(); + bd->swap_time = ktime_get_seconds(); bd->entries += 1; spin_unlock(&blocked_delegations_lock); } static struct nfs4_delegation * alloc_init_deleg(struct nfs4_client *clp, struct nfs4_file *fp, - struct svc_fh *current_fh, - struct nfs4_clnt_odstate *odstate) + struct nfs4_clnt_odstate *odstate, u32 dl_type) { struct nfs4_delegation *dp; + struct nfs4_stid *stid; long n; dprintk("NFSD alloc_init_deleg\n"); n = atomic_long_inc_return(&num_delegations); if (n < 0 || n > max_delegations) goto out_dec; - if (delegation_blocked(¤t_fh->fh_handle)) + if (delegation_blocked(&fp->fi_fhandle)) goto out_dec; - dp = delegstateid(nfs4_alloc_stid(clp, deleg_slab, nfs4_free_deleg)); - if (dp == NULL) + stid = nfs4_alloc_stid(clp, deleg_slab, nfs4_free_deleg); + if (stid == NULL) goto out_dec; + dp = delegstateid(stid); /* * delegation seqid's are never incremented. 
The 4.1 special @@ -865,10 +1154,14 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_file *fp, INIT_LIST_HEAD(&dp->dl_recall_lru); dp->dl_clnt_odstate = odstate; get_clnt_odstate(odstate); - dp->dl_type = NFS4_OPEN_DELEGATE_READ; + dp->dl_type = dl_type; dp->dl_retries = 1; + dp->dl_recalled = false; nfsd4_init_cb(&dp->dl_recall, dp->dl_stid.sc_client, &nfsd4_cb_recall_ops, NFSPROC4_CLNT_CB_RECALL); + nfsd4_init_cb(&dp->dl_cb_fattr.ncf_getattr, dp->dl_stid.sc_client, + &nfsd4_cb_getattr_ops, NFSPROC4_CLNT_CB_GETATTR); + dp->dl_cb_fattr.ncf_file_modified = false; get_nfs4_file(fp); dp->dl_stid.sc_file = fp; return dp; @@ -890,6 +1183,9 @@ nfs4_put_stid(struct nfs4_stid *s) return; } idr_remove(&clp->cl_stateids, s->sc_stateid.si_opaque.so_id); + if (s->sc_status & SC_STATUS_ADMIN_REVOKED) + atomic_dec(&s->sc_client->cl_admin_revoked); + nfs4_free_cpntf_statelist(clp->net, s); spin_unlock(&clp->cl_lock); s->sc_free(s); if (fp) @@ -910,25 +1206,67 @@ nfs4_inc_and_copy_stateid(stateid_t *dst, struct nfs4_stid *stid) static void put_deleg_file(struct nfs4_file *fp) { - struct file *filp = NULL; + struct nfsd_file *rnf = NULL; + struct nfsd_file *nf = NULL; spin_lock(&fp->fi_lock); - if (--fp->fi_delegees == 0) - swap(filp, fp->fi_deleg_file); + if (--fp->fi_delegees == 0) { + swap(nf, fp->fi_deleg_file); + swap(rnf, fp->fi_rdeleg_file); + } spin_unlock(&fp->fi_lock); - if (filp) - fput(filp); + if (nf) + nfsd_file_put(nf); + if (rnf) + nfs4_file_put_access(fp, NFS4_SHARE_ACCESS_READ); +} + +static void nfsd4_finalize_deleg_timestamps(struct nfs4_delegation *dp, struct file *f) +{ + struct iattr ia = { .ia_valid = ATTR_ATIME | ATTR_CTIME | ATTR_MTIME }; + struct inode *inode = file_inode(f); + int ret; + + /* don't do anything if FMODE_NOCMTIME isn't set */ + if ((READ_ONCE(f->f_mode) & FMODE_NOCMTIME) == 0) + return; + + spin_lock(&f->f_lock); + f->f_mode &= ~FMODE_NOCMTIME; + spin_unlock(&f->f_lock); + + /* was it never written? */ + if (!dp->dl_written) + return; + + /* did it get a setattr for the timestamps at some point? 
*/ + if (dp->dl_setattr) + return; + + /* Stamp everything to "now" */ + inode_lock(inode); + ret = notify_change(&nop_mnt_idmap, f->f_path.dentry, &ia, NULL); + inode_unlock(inode); + if (ret) { + struct inode *inode = file_inode(f); + + pr_notice_ratelimited("Unable to update timestamps on inode %02x:%02x:%lu: %d\n", + MAJOR(inode->i_sb->s_dev), + MINOR(inode->i_sb->s_dev), + inode->i_ino, ret); + } } static void nfs4_unlock_deleg_lease(struct nfs4_delegation *dp) { struct nfs4_file *fp = dp->dl_stid.sc_file; - struct file *filp = fp->fi_deleg_file; + struct nfsd_file *nf = fp->fi_deleg_file; WARN_ON_ONCE(!fp->fi_delegees); - vfs_setlease(filp, F_UNLCK, NULL, (void **)&dp); + nfsd4_finalize_deleg_timestamps(dp, nf->nf_file); + kernel_setlease(nf->nf_file, F_UNLCK, NULL, (void **)&dp); put_deleg_file(fp); } @@ -939,11 +1277,6 @@ static void destroy_unhashed_deleg(struct nfs4_delegation *dp) nfs4_put_stid(&dp->dl_stid); } -void nfs4_unhash_stid(struct nfs4_stid *s) -{ - s->sc_type = 0; -} - /** * nfs4_delegation_exists - Discover if this delegation already exists * @clp: a pointer to the nfs4_client we're granting a delegation to @@ -991,27 +1324,39 @@ hash_delegation_locked(struct nfs4_delegation *dp, struct nfs4_file *fp) lockdep_assert_held(&state_lock); lockdep_assert_held(&fp->fi_lock); + lockdep_assert_held(&clp->cl_lock); if (nfs4_delegation_exists(clp, fp)) return -EAGAIN; refcount_inc(&dp->dl_stid.sc_count); - dp->dl_stid.sc_type = NFS4_DELEG_STID; + dp->dl_stid.sc_type = SC_TYPE_DELEG; list_add(&dp->dl_perfile, &fp->fi_delegations); list_add(&dp->dl_perclnt, &clp->cl_delegations); return 0; } +static bool delegation_hashed(struct nfs4_delegation *dp) +{ + return !(list_empty(&dp->dl_perfile)); +} + static bool -unhash_delegation_locked(struct nfs4_delegation *dp) +unhash_delegation_locked(struct nfs4_delegation *dp, unsigned short statusmask) { struct nfs4_file *fp = dp->dl_stid.sc_file; lockdep_assert_held(&state_lock); - if (list_empty(&dp->dl_perfile)) + if (!delegation_hashed(dp)) return false; - dp->dl_stid.sc_type = NFS4_CLOSED_DELEG_STID; + if (statusmask == SC_STATUS_REVOKED && + dp->dl_stid.sc_client->cl_minorversion == 0) + statusmask = SC_STATUS_CLOSED; + dp->dl_stid.sc_status |= statusmask; + if (statusmask & SC_STATUS_ADMIN_REVOKED) + atomic_inc(&dp->dl_stid.sc_client->cl_admin_revoked); + /* Ensure that deleg break won't try to requeue it */ ++dp->dl_time; spin_lock(&fp->fi_lock); @@ -1027,30 +1372,59 @@ static void destroy_delegation(struct nfs4_delegation *dp) bool unhashed; spin_lock(&state_lock); - unhashed = unhash_delegation_locked(dp); + unhashed = unhash_delegation_locked(dp, SC_STATUS_CLOSED); spin_unlock(&state_lock); if (unhashed) destroy_unhashed_deleg(dp); } +/** + * revoke_delegation - perform nfs4 delegation structure cleanup + * @dp: pointer to the delegation + * + * This function assumes that it's called either from the administrative + * interface (nfsd4_revoke_states()) that's revoking a specific delegation + * stateid or it's called from a laundromat thread (nfsd4_landromat()) that + * determined that this specific state has expired and needs to be revoked + * (both mark state with the appropriate stid sc_status mode). It is also + * assumed that a reference was taken on the @dp state. + * + * If this function finds that the @dp state is SC_STATUS_FREED it means + * that a FREE_STATEID operation for this stateid has been processed and + * we can proceed to removing it from recalled list. 
However, if @dp state + * isn't marked SC_STATUS_FREED, it means we need place it on the cl_revoked + * list and wait for the FREE_STATEID to arrive from the client. At the same + * time, we need to mark it as SC_STATUS_FREEABLE to indicate to the + * nfsd4_free_stateid() function that this stateid has already been added + * to the cl_revoked list and that nfsd4_free_stateid() is now responsible + * for removing it from the list. Inspection of where the delegation state + * in the revocation process is protected by the clp->cl_lock. + */ static void revoke_delegation(struct nfs4_delegation *dp) { struct nfs4_client *clp = dp->dl_stid.sc_client; WARN_ON(!list_empty(&dp->dl_recall_lru)); + WARN_ON_ONCE(dp->dl_stid.sc_client->cl_minorversion > 0 && + !(dp->dl_stid.sc_status & + (SC_STATUS_REVOKED | SC_STATUS_ADMIN_REVOKED))); - if (clp->cl_minorversion) { - dp->dl_stid.sc_type = NFS4_REVOKED_DELEG_STID; - refcount_inc(&dp->dl_stid.sc_count); - spin_lock(&clp->cl_lock); - list_add(&dp->dl_recall_lru, &clp->cl_revoked); - spin_unlock(&clp->cl_lock); + trace_nfsd_stid_revoke(&dp->dl_stid); + + spin_lock(&clp->cl_lock); + if (dp->dl_stid.sc_status & SC_STATUS_FREED) { + list_del_init(&dp->dl_recall_lru); + goto out; } + list_add(&dp->dl_recall_lru, &clp->cl_revoked); + dp->dl_stid.sc_status |= SC_STATUS_FREEABLE; +out: + spin_unlock(&clp->cl_lock); destroy_unhashed_deleg(dp); } -/* - * SETCLIENTID state +/* + * SETCLIENTID state */ static unsigned int clientid_hashval(u32 id) @@ -1058,111 +1432,9 @@ static unsigned int clientid_hashval(u32 id) return id & CLIENT_HASH_MASK; } -static unsigned int clientstr_hashval(const char *name) +static unsigned int clientstr_hashval(struct xdr_netobj name) { - return opaque_hashval(name, 8) & CLIENT_HASH_MASK; -} - -/* - * We store the NONE, READ, WRITE, and BOTH bits separately in the - * st_{access,deny}_bmap field of the stateid, in order to track not - * only what share bits are currently in force, but also what - * combinations of share bits previous opens have used. This allows us - * to enforce the recommendation of rfc 3530 14.2.19 that the server - * return an error if the client attempt to downgrade to a combination - * of share bits not explicable by closing some of its previous opens. - * - * XXX: This enforcement is actually incomplete, since we don't keep - * track of access/deny bit combinations; so, e.g., we allow: - * - * OPEN allow read, deny write - * OPEN allow both, deny none - * DOWNGRADE allow read, deny none - * - * which we should reject. 
- */ -static unsigned int -bmap_to_share_mode(unsigned long bmap) { - int i; - unsigned int access = 0; - - for (i = 1; i < 4; i++) { - if (test_bit(i, &bmap)) - access |= i; - } - return access; -} - -/* set share access for a given stateid */ -static inline void -set_access(u32 access, struct nfs4_ol_stateid *stp) -{ - unsigned char mask = 1 << access; - - WARN_ON_ONCE(access > NFS4_SHARE_ACCESS_BOTH); - stp->st_access_bmap |= mask; -} - -/* clear share access for a given stateid */ -static inline void -clear_access(u32 access, struct nfs4_ol_stateid *stp) -{ - unsigned char mask = 1 << access; - - WARN_ON_ONCE(access > NFS4_SHARE_ACCESS_BOTH); - stp->st_access_bmap &= ~mask; -} - -/* test whether a given stateid has access */ -static inline bool -test_access(u32 access, struct nfs4_ol_stateid *stp) -{ - unsigned char mask = 1 << access; - - return (bool)(stp->st_access_bmap & mask); -} - -/* set share deny for a given stateid */ -static inline void -set_deny(u32 deny, struct nfs4_ol_stateid *stp) -{ - unsigned char mask = 1 << deny; - - WARN_ON_ONCE(deny > NFS4_SHARE_DENY_BOTH); - stp->st_deny_bmap |= mask; -} - -/* clear share deny for a given stateid */ -static inline void -clear_deny(u32 deny, struct nfs4_ol_stateid *stp) -{ - unsigned char mask = 1 << deny; - - WARN_ON_ONCE(deny > NFS4_SHARE_DENY_BOTH); - stp->st_deny_bmap &= ~mask; -} - -/* test whether a given stateid is denying specific access */ -static inline bool -test_deny(u32 deny, struct nfs4_ol_stateid *stp) -{ - unsigned char mask = 1 << deny; - - return (bool)(stp->st_deny_bmap & mask); -} - -static int nfs4_access_to_omode(u32 access) -{ - switch (access & NFS4_SHARE_ACCESS_BOTH) { - case NFS4_SHARE_ACCESS_READ: - return O_RDONLY; - case NFS4_SHARE_ACCESS_WRITE: - return O_WRONLY; - case NFS4_SHARE_ACCESS_BOTH: - return O_RDWR; - } - WARN_ON_ONCE(1); - return O_RDONLY; + return opaque_hashval(name.data, 8) & CLIENT_HASH_MASK; } /* @@ -1173,11 +1445,16 @@ static void recalculate_deny_mode(struct nfs4_file *fp) { struct nfs4_ol_stateid *stp; + u32 old_deny; spin_lock(&fp->fi_lock); + old_deny = fp->fi_share_deny; fp->fi_share_deny = 0; - list_for_each_entry(stp, &fp->fi_stateids, st_perfile) + list_for_each_entry(stp, &fp->fi_stateids, st_perfile) { fp->fi_share_deny |= bmap_to_share_mode(stp->st_deny_bmap); + if (fp->fi_share_deny == old_deny) + break; + } spin_unlock(&fp->fi_lock); } @@ -1235,6 +1512,12 @@ static void nfs4_put_stateowner(struct nfs4_stateowner *sop) nfs4_free_stateowner(sop); } +static bool +nfs4_ol_stateid_unhashed(const struct nfs4_ol_stateid *stp) +{ + return list_empty(&stp->st_perfile); +} + static bool unhash_ol_stateid(struct nfs4_ol_stateid *stp) { struct nfs4_file *fp = stp->st_stid.sc_file; @@ -1259,6 +1542,8 @@ static void nfs4_free_ol_stateid(struct nfs4_stid *stid) release_all_access(stp); if (stp->st_stateowner) nfs4_put_stateowner(stp->st_stateowner); + if (!list_empty(&stid->sc_cp_list)) + nfs4_free_cpntf_statelist(stid->sc_client->net, stid); kmem_cache_free(stateid_slab, stid); } @@ -1266,11 +1551,14 @@ static void nfs4_free_lock_stateid(struct nfs4_stid *stid) { struct nfs4_ol_stateid *stp = openlockstateid(stid); struct nfs4_lockowner *lo = lockowner(stp->st_stateowner); - struct file *file; + struct nfsd_file *nf; - file = find_any_file(stp->st_stid.sc_file); - if (file) - filp_close(file, (fl_owner_t)lo); + nf = find_any_file(stp->st_stid.sc_file); + if (nf) { + get_file(nf->nf_file); + filp_close(nf->nf_file, (fl_owner_t)lo); + nfsd_file_put(nf); + } nfs4_free_ol_stateid(stid); } 
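The set_access()/set_deny() helpers moved above keep one bit per share mode (READ = 1, WRITE = 2, BOTH = 3) in st_access_bmap/st_deny_bmap, so every combination requested by earlier OPENs stays visible and an OPEN_DOWNGRADE can be checked against it. As a minimal, self-contained userspace sketch of that bookkeeping and of the downgrade check it enables (helper names here are illustrative only, not code from this patch):

#include <stdbool.h>
#include <stdio.h>

#define SHARE_ACCESS_READ  1
#define SHARE_ACCESS_WRITE 2
#define SHARE_ACCESS_BOTH  3

/* One bit per share-mode combination ever used by an OPEN on this stateid. */
struct share_bmap { unsigned char bits; };

static void bmap_set(struct share_bmap *b, unsigned int mode)
{
	b->bits |= 1 << mode;		/* record that this combination was used */
}

static bool bmap_test(const struct share_bmap *b, unsigned int mode)
{
	return b->bits & (1 << mode);
}

/* Collapse the bitmap back into a plain READ|WRITE mask, analogous to
 * what bmap_to_share_mode() does above. */
static unsigned int bmap_to_mode(const struct share_bmap *b)
{
	unsigned int i, mode = 0;

	for (i = 1; i <= SHARE_ACCESS_BOTH; i++)
		if (bmap_test(b, i))
			mode |= i;
	return mode;
}

/* A downgrade to 'new_mode' is only explicable by closing earlier opens
 * if that exact mode was requested by some earlier OPEN, i.e. its bit
 * is already set; otherwise the server returns an error. */
static bool downgrade_allowed(const struct share_bmap *b, unsigned int new_mode)
{
	return bmap_test(b, new_mode);
}

int main(void)
{
	struct share_bmap access = { 0 };

	bmap_set(&access, SHARE_ACCESS_READ);	/* OPEN: allow read */
	bmap_set(&access, SHARE_ACCESS_BOTH);	/* OPEN: allow both */

	/* Downgrade to read-only is fine: a read-only OPEN was seen. */
	printf("downgrade to READ:  %s\n",
	       downgrade_allowed(&access, SHARE_ACCESS_READ) ? "ok" : "NFS4ERR_INVAL");
	/* No earlier OPEN asked for write-only, so this downgrade is rejected. */
	printf("downgrade to WRITE: %s\n",
	       downgrade_allowed(&access, SHARE_ACCESS_WRITE) ? "ok" : "NFS4ERR_INVAL");
	printf("effective access mask: %u\n", bmap_to_mode(&access));
	return 0;
}

As the comment above notes, this per-field tracking is intentionally incomplete: access and deny bitmaps are kept separately, so combinations across the two are not enforced.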
@@ -1295,6 +1583,8 @@ static void put_ol_stateid_locked(struct nfs4_ol_stateid *stp, } idr_remove(&clp->cl_stateids, s->sc_stateid.si_opaque.so_id); + if (s->sc_status & SC_STATUS_ADMIN_REVOKED) + atomic_dec(&s->sc_client->cl_admin_revoked); list_add(&stp->st_locks, reaplist); } @@ -1302,9 +1592,11 @@ static bool unhash_lock_stateid(struct nfs4_ol_stateid *stp) { lockdep_assert_held(&stp->st_stid.sc_client->cl_lock); + if (!unhash_ol_stateid(stp)) + return false; list_del_init(&stp->st_locks); - nfs4_unhash_stid(&stp->st_stid); - return unhash_ol_stateid(stp); + stp->st_stid.sc_status |= SC_STATUS_CLOSED; + return true; } static void release_lock_stateid(struct nfs4_ol_stateid *stp) @@ -1361,7 +1653,7 @@ static void release_open_stateid_locks(struct nfs4_ol_stateid *open_stp, while (!list_empty(&open_stp->st_locks)) { stp = list_entry(open_stp->st_locks.next, struct nfs4_ol_stateid, st_locks); - WARN_ON(!unhash_lock_stateid(stp)); + unhash_lock_stateid(stp); put_ol_stateid_locked(stp, reaplist); } } @@ -1369,13 +1661,12 @@ static void release_open_stateid_locks(struct nfs4_ol_stateid *open_stp, static bool unhash_open_stateid(struct nfs4_ol_stateid *stp, struct list_head *reaplist) { - bool unhashed; - lockdep_assert_held(&stp->st_stid.sc_client->cl_lock); - unhashed = unhash_ol_stateid(stp); + if (!unhash_ol_stateid(stp)) + return false; release_open_stateid_locks(stp, reaplist); - return unhashed; + return true; } static void release_open_stateid(struct nfs4_ol_stateid *stp) @@ -1383,12 +1674,21 @@ static void release_open_stateid(struct nfs4_ol_stateid *stp) LIST_HEAD(reaplist); spin_lock(&stp->st_stid.sc_client->cl_lock); + stp->st_stid.sc_status |= SC_STATUS_CLOSED; if (unhash_open_stateid(stp, &reaplist)) put_ol_stateid_locked(stp, &reaplist); spin_unlock(&stp->st_stid.sc_client->cl_lock); free_ol_stateid_reaplist(&reaplist); } +static bool nfs4_openowner_unhashed(struct nfs4_openowner *oo) +{ + lockdep_assert_held(&oo->oo_owner.so_client->cl_lock); + + return list_empty(&oo->oo_owner.so_strhash) && + list_empty(&oo->oo_perclient); +} + static void unhash_openowner_locked(struct nfs4_openowner *oo) { struct nfs4_client *clp = oo->oo_owner.so_client; @@ -1420,9 +1720,7 @@ static void release_openowner(struct nfs4_openowner *oo) { struct nfs4_ol_stateid *stp; struct nfs4_client *clp = oo->oo_owner.so_client; - struct list_head reaplist; - - INIT_LIST_HEAD(&reaplist); + LIST_HEAD(reaplist); spin_lock(&clp->cl_lock); unhash_openowner_locked(oo); @@ -1438,6 +1736,137 @@ static void release_openowner(struct nfs4_openowner *oo) nfs4_put_stateowner(&oo->oo_owner); } +static struct nfs4_stid *find_one_sb_stid(struct nfs4_client *clp, + struct super_block *sb, + unsigned int sc_types) +{ + unsigned long id, tmp; + struct nfs4_stid *stid; + + spin_lock(&clp->cl_lock); + idr_for_each_entry_ul(&clp->cl_stateids, stid, tmp, id) + if ((stid->sc_type & sc_types) && + stid->sc_status == 0 && + stid->sc_file->fi_inode->i_sb == sb) { + refcount_inc(&stid->sc_count); + break; + } + spin_unlock(&clp->cl_lock); + return stid; +} + +/** + * nfsd4_revoke_states - revoke all nfsv4 states associated with given filesystem + * @net: used to identify instance of nfsd (there is one per net namespace) + * @sb: super_block used to identify target filesystem + * + * All nfs4 states (open, lock, delegation, layout) held by the server instance + * and associated with a file on the given filesystem will be revoked resulting + * in any files being closed and so all references from nfsd to the filesystem + * being 
released. Thus nfsd will no longer prevent the filesystem from being + * unmounted. + * + * The clients which own the states will subsequently being notified that the + * states have been "admin-revoked". + */ +void nfsd4_revoke_states(struct net *net, struct super_block *sb) +{ + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + unsigned int idhashval; + unsigned int sc_types; + + sc_types = SC_TYPE_OPEN | SC_TYPE_LOCK | SC_TYPE_DELEG | SC_TYPE_LAYOUT; + + spin_lock(&nn->client_lock); + for (idhashval = 0; idhashval < CLIENT_HASH_MASK; idhashval++) { + struct list_head *head = &nn->conf_id_hashtbl[idhashval]; + struct nfs4_client *clp; + retry: + list_for_each_entry(clp, head, cl_idhash) { + struct nfs4_stid *stid = find_one_sb_stid(clp, sb, + sc_types); + if (stid) { + struct nfs4_ol_stateid *stp; + struct nfs4_delegation *dp; + struct nfs4_layout_stateid *ls; + + spin_unlock(&nn->client_lock); + switch (stid->sc_type) { + case SC_TYPE_OPEN: + stp = openlockstateid(stid); + mutex_lock_nested(&stp->st_mutex, + OPEN_STATEID_MUTEX); + + spin_lock(&clp->cl_lock); + if (stid->sc_status == 0) { + stid->sc_status |= + SC_STATUS_ADMIN_REVOKED; + atomic_inc(&clp->cl_admin_revoked); + spin_unlock(&clp->cl_lock); + release_all_access(stp); + } else + spin_unlock(&clp->cl_lock); + mutex_unlock(&stp->st_mutex); + break; + case SC_TYPE_LOCK: + stp = openlockstateid(stid); + mutex_lock_nested(&stp->st_mutex, + LOCK_STATEID_MUTEX); + spin_lock(&clp->cl_lock); + if (stid->sc_status == 0) { + struct nfs4_lockowner *lo = + lockowner(stp->st_stateowner); + struct nfsd_file *nf; + + stid->sc_status |= + SC_STATUS_ADMIN_REVOKED; + atomic_inc(&clp->cl_admin_revoked); + spin_unlock(&clp->cl_lock); + nf = find_any_file(stp->st_stid.sc_file); + if (nf) { + get_file(nf->nf_file); + filp_close(nf->nf_file, + (fl_owner_t)lo); + nfsd_file_put(nf); + } + release_all_access(stp); + } else + spin_unlock(&clp->cl_lock); + mutex_unlock(&stp->st_mutex); + break; + case SC_TYPE_DELEG: + refcount_inc(&stid->sc_count); + dp = delegstateid(stid); + spin_lock(&state_lock); + if (!unhash_delegation_locked( + dp, SC_STATUS_ADMIN_REVOKED)) + dp = NULL; + spin_unlock(&state_lock); + if (dp) + revoke_delegation(dp); + break; + case SC_TYPE_LAYOUT: + ls = layoutstateid(stid); + nfsd4_close_layout(ls); + break; + } + nfs4_put_stid(stid); + spin_lock(&nn->client_lock); + if (clp->cl_minorversion == 0) + /* Allow cleanup after a lease period. + * store_release ensures cleanup will + * see any newly revoked states if it + * sees the time updated. + */ + nn->nfs40_last_revoke = + ktime_get_boottime_seconds(); + goto retry; + } + } + } + spin_unlock(&nn->client_lock); +} + static inline int hash_sessionid(struct nfs4_sessionid *sessionid) { @@ -1509,96 +1938,145 @@ gen_sessionid(struct nfsd4_session *ses) */ #define NFSD_MIN_HDR_SEQ_SZ (24 + 12 + 44) +static struct shrinker *nfsd_slot_shrinker; +static DEFINE_SPINLOCK(nfsd_session_list_lock); +static LIST_HEAD(nfsd_session_list); +/* The sum of "target_slots-1" on every session. The shrinker can push this + * down, though it can take a little while for the memory to actually + * be freed. The "-1" is because we can never free slot 0 while the + * session is active. 
+ */ +static atomic_t nfsd_total_target_slots = ATOMIC_INIT(0); + static void -free_session_slots(struct nfsd4_session *ses) +free_session_slots(struct nfsd4_session *ses, int from) { int i; - for (i = 0; i < ses->se_fchannel.maxreqs; i++) { - free_svc_cred(&ses->se_slots[i]->sl_cred); - kfree(ses->se_slots[i]); + if (from >= ses->se_fchannel.maxreqs) + return; + + for (i = from; i < ses->se_fchannel.maxreqs; i++) { + struct nfsd4_slot *slot = xa_load(&ses->se_slots, i); + + /* + * Save the seqid in case we reactivate this slot. + * This will never require a memory allocation so GFP + * flag is irrelevant + */ + xa_store(&ses->se_slots, i, xa_mk_value(slot->sl_seqid), 0); + free_svc_cred(&slot->sl_cred); + kfree(slot); + } + ses->se_fchannel.maxreqs = from; + if (ses->se_target_maxslots > from) { + int new_target = from ?: 1; + atomic_sub(ses->se_target_maxslots - new_target, &nfsd_total_target_slots); + ses->se_target_maxslots = new_target; } } -/* - * We don't actually need to cache the rpc and session headers, so we - * can allocate a little less for each slot: +/** + * reduce_session_slots - reduce the target max-slots of a session if possible + * @ses: The session to affect + * @dec: how much to decrease the target by + * + * This interface can be used by a shrinker to reduce the target max-slots + * for a session so that some slots can eventually be freed. + * It uses spin_trylock() as it may be called in a context where another + * spinlock is held that has a dependency on client_lock. As shrinkers are + * best-effort, skiping a session is client_lock is already held has no + * great coast + * + * Return value: + * The number of slots that the target was reduced by. */ -static inline u32 slot_bytes(struct nfsd4_channel_attrs *ca) +static int +reduce_session_slots(struct nfsd4_session *ses, int dec) { - u32 size; + struct nfsd_net *nn = net_generic(ses->se_client->net, + nfsd_net_id); + int ret = 0; - if (ca->maxresp_cached < NFSD_MIN_HDR_SEQ_SZ) - size = 0; - else - size = ca->maxresp_cached - NFSD_MIN_HDR_SEQ_SZ; - return size + sizeof(struct nfsd4_slot); + if (ses->se_target_maxslots <= 1) + return ret; + if (!spin_trylock(&nn->client_lock)) + return ret; + ret = min(dec, ses->se_target_maxslots-1); + ses->se_target_maxslots -= ret; + atomic_sub(ret, &nfsd_total_target_slots); + ses->se_slot_gen += 1; + if (ses->se_slot_gen == 0) { + int i; + ses->se_slot_gen = 1; + for (i = 0; i < ses->se_fchannel.maxreqs; i++) { + struct nfsd4_slot *slot = xa_load(&ses->se_slots, i); + slot->sl_generation = 0; + } + } + spin_unlock(&nn->client_lock); + return ret; } -/* - * XXX: If we run out of reserved DRC memory we could (up to a point) - * re-negotiate active sessions and reduce their slot usage to make - * room for new connections. For now we just fail the create session. - */ -static u32 nfsd4_get_drc_mem(struct nfsd4_channel_attrs *ca) +static struct nfsd4_slot *nfsd4_alloc_slot(struct nfsd4_channel_attrs *fattrs, + int index, gfp_t gfp) { - u32 slotsize = slot_bytes(ca); - u32 num = ca->maxreqs; - int avail; + struct nfsd4_slot *slot; + size_t size; - spin_lock(&nfsd_drc_lock); - avail = min((unsigned long)NFSD_MAX_MEM_PER_SESSION, - nfsd_drc_max_mem - nfsd_drc_mem_used); /* - * Never use more than a third of the remaining memory, - * unless it's the only way to give this client a slot: + * The RPC and NFS session headers are never saved in + * the slot reply cache buffer. 
*/ - avail = clamp_t(int, avail, slotsize, avail/3); - num = min_t(int, num, avail / slotsize); - nfsd_drc_mem_used += num * slotsize; - spin_unlock(&nfsd_drc_lock); - - return num; -} + size = fattrs->maxresp_cached < NFSD_MIN_HDR_SEQ_SZ ? + 0 : fattrs->maxresp_cached - NFSD_MIN_HDR_SEQ_SZ; -static void nfsd4_put_drc_mem(struct nfsd4_channel_attrs *ca) -{ - int slotsize = slot_bytes(ca); - - spin_lock(&nfsd_drc_lock); - nfsd_drc_mem_used -= slotsize * ca->maxreqs; - spin_unlock(&nfsd_drc_lock); + slot = kzalloc(struct_size(slot, sl_data, size), gfp); + if (!slot) + return NULL; + slot->sl_index = index; + return slot; } static struct nfsd4_session *alloc_session(struct nfsd4_channel_attrs *fattrs, struct nfsd4_channel_attrs *battrs) { int numslots = fattrs->maxreqs; - int slotsize = slot_bytes(fattrs); struct nfsd4_session *new; - int mem, i; - - BUILD_BUG_ON(NFSD_MAX_SLOTS_PER_SESSION * sizeof(struct nfsd4_slot *) - + sizeof(struct nfsd4_session) > PAGE_SIZE); - mem = numslots * sizeof(struct nfsd4_slot *); + struct nfsd4_slot *slot; + int i; - new = kzalloc(sizeof(*new) + mem, GFP_KERNEL); + new = kzalloc(sizeof(*new), GFP_KERNEL); if (!new) return NULL; - /* allocate each struct nfsd4_slot and data cache in one piece */ - for (i = 0; i < numslots; i++) { - new->se_slots[i] = kzalloc(slotsize, GFP_KERNEL); - if (!new->se_slots[i]) - goto out_free; - } + xa_init(&new->se_slots); - memcpy(&new->se_fchannel, fattrs, sizeof(struct nfsd4_channel_attrs)); - memcpy(&new->se_bchannel, battrs, sizeof(struct nfsd4_channel_attrs)); + slot = nfsd4_alloc_slot(fattrs, 0, GFP_KERNEL); + if (!slot || xa_is_err(xa_store(&new->se_slots, 0, slot, GFP_KERNEL))) + goto out_free; + for (i = 1; i < numslots; i++) { + const gfp_t gfp = GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN; + slot = nfsd4_alloc_slot(fattrs, i, gfp); + if (!slot) + break; + if (xa_is_err(xa_store(&new->se_slots, i, slot, gfp))) { + kfree(slot); + break; + } + } + fattrs->maxreqs = i; + memcpy(&new->se_fchannel, fattrs, sizeof(struct nfsd4_channel_attrs)); + new->se_target_maxslots = i; + atomic_add(i - 1, &nfsd_total_target_slots); + new->se_cb_slot_avail = ~0U; + new->se_cb_highest_slot = min(battrs->maxreqs - 1, + NFSD_BC_SLOT_TABLE_SIZE - 1); + spin_lock_init(&new->se_lock); return new; out_free: - while (i--) - kfree(new->se_slots[i]); + kfree(slot); + xa_destroy(&new->se_slots); kfree(new); return NULL; } @@ -1614,6 +2092,8 @@ static void nfsd4_conn_lost(struct svc_xpt_user *u) struct nfsd4_conn *c = container_of(u, struct nfsd4_conn, cn_xpt_user); struct nfs4_client *clp = c->cn_session->se_client; + trace_nfsd_cb_lost(clp); + spin_lock(&clp->cl_lock); if (!list_empty(&c->cn_persession)) { list_del(&c->cn_persession); @@ -1702,17 +2182,47 @@ static void nfsd4_del_conns(struct nfsd4_session *s) static void __free_session(struct nfsd4_session *ses) { - free_session_slots(ses); + free_session_slots(ses, 0); + xa_destroy(&ses->se_slots); kfree(ses); } static void free_session(struct nfsd4_session *ses) { nfsd4_del_conns(ses); - nfsd4_put_drc_mem(&ses->se_fchannel); __free_session(ses); } +static unsigned long +nfsd_slot_count(struct shrinker *s, struct shrink_control *sc) +{ + unsigned long cnt = atomic_read(&nfsd_total_target_slots); + + return cnt ? 
cnt : SHRINK_EMPTY; +} + +static unsigned long +nfsd_slot_scan(struct shrinker *s, struct shrink_control *sc) +{ + struct nfsd4_session *ses; + unsigned long scanned = 0; + unsigned long freed = 0; + + spin_lock(&nfsd_session_list_lock); + list_for_each_entry(ses, &nfsd_session_list, se_all_sessions) { + freed += reduce_session_slots(ses, 1); + scanned += 1; + if (scanned >= sc->nr_to_scan) { + /* Move starting point for next scan */ + list_move(&nfsd_session_list, &ses->se_all_sessions); + break; + } + } + spin_unlock(&nfsd_session_list_lock); + sc->nr_scanned = scanned; + return freed; +} + static void init_session(struct svc_rqst *rqstp, struct nfsd4_session *new, struct nfs4_client *clp, struct nfsd4_create_session *cses) { int idx; @@ -1723,17 +2233,24 @@ static void init_session(struct svc_rqst *rqstp, struct nfsd4_session *new, stru INIT_LIST_HEAD(&new->se_conns); - new->se_cb_seq_nr = 1; - new->se_flags = cses->flags; + atomic_set(&new->se_ref, 0); + new->se_dead = false; new->se_cb_prog = cses->callback_prog; new->se_cb_sec = cses->cb_sec; - atomic_set(&new->se_ref, 0); + + for (idx = 0; idx < NFSD_BC_SLOT_TABLE_SIZE; ++idx) + new->se_cb_seq_nr[idx] = 1; + idx = hash_sessionid(&new->se_sessionid); list_add(&new->se_hash, &nn->sessionid_hashtbl[idx]); spin_lock(&clp->cl_lock); list_add(&new->se_perclnt, &clp->cl_sessions); spin_unlock(&clp->cl_lock); + spin_lock(&nfsd_session_list_lock); + list_add_tail(&new->se_all_sessions, &nfsd_session_list); + spin_unlock(&nfsd_session_list_lock); + { struct sockaddr *sa = svc_addr(rqstp); /* @@ -1803,6 +2320,9 @@ unhash_session(struct nfsd4_session *ses) spin_lock(&ses->se_client->cl_lock); list_del(&ses->se_perclnt); spin_unlock(&ses->se_client->cl_lock); + spin_lock(&nfsd_session_list_lock); + list_del(&ses->se_all_sessions); + spin_unlock(&nfsd_session_list_lock); } /* SETCLIENTID and SETCLIENTID_CONFIRM Helper functions */ @@ -1816,25 +2336,24 @@ STALE_CLIENTID(clientid_t *clid, struct nfsd_net *nn) */ if (clid->cl_boot == (u32)nn->boot_time) return 0; - dprintk("NFSD stale clientid (%08x/%08x) boot_time %08lx\n", - clid->cl_boot, clid->cl_id, nn->boot_time); + trace_nfsd_clid_stale(clid); return 1; } -/* - * XXX Should we use a slab cache ? - * This type of memory management is somewhat inefficient, but we use it - * anyway since SETCLIENTID is not a common operation. 
- */ -static struct nfs4_client *alloc_client(struct xdr_netobj name) +static struct nfs4_client *alloc_client(struct xdr_netobj name, + struct nfsd_net *nn) { struct nfs4_client *clp; int i; + if (atomic_read(&nn->nfs4_client_count) >= nn->nfs4_max_clients && + atomic_read(&nn->nfsd_courtesy_clients) > 0) + mod_delayed_work(laundry_wq, &nn->laundromat_work, 0); + clp = kmem_cache_zalloc(client_slab, GFP_KERNEL); if (clp == NULL) return NULL; - clp->cl_name.data = kmemdup(name.data, name.len, GFP_KERNEL); + xdr_netobj_dup(&clp->cl_name, &name, GFP_KERNEL); if (clp->cl_name.data == NULL) goto err_no_name; clp->cl_ownerstr_hashtbl = kmalloc_array(OWNER_HASH_SIZE, @@ -1842,13 +2361,19 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name) GFP_KERNEL); if (!clp->cl_ownerstr_hashtbl) goto err_no_hashtbl; + clp->cl_callback_wq = alloc_ordered_workqueue("nfsd4_callbacks", 0); + if (!clp->cl_callback_wq) + goto err_no_callback_wq; + for (i = 0; i < OWNER_HASH_SIZE; i++) INIT_LIST_HEAD(&clp->cl_ownerstr_hashtbl[i]); - clp->cl_name.len = name.len; INIT_LIST_HEAD(&clp->cl_sessions); idr_init(&clp->cl_stateids); - atomic_set(&clp->cl_refcount, 0); + atomic_set(&clp->cl_rpc_users, 0); clp->cl_cb_state = NFSD4_CB_UNKNOWN; + clp->cl_state = NFSD4_ACTIVE; + atomic_inc(&nn->nfs4_client_count); + atomic_set(&clp->cl_delegs_in_recall, 0); INIT_LIST_HEAD(&clp->cl_idhash); INIT_LIST_HEAD(&clp->cl_openowners); INIT_LIST_HEAD(&clp->cl_delegations); @@ -1862,6 +2387,8 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name) spin_lock_init(&clp->cl_lock); rpc_init_wait_queue(&clp->cl_cb_waitq, "Backchannel slot table"); return clp; +err_no_callback_wq: + kfree(clp->cl_ownerstr_hashtbl); err_no_hashtbl: kfree(clp->cl_name.data); err_no_name: @@ -1869,6 +2396,27 @@ err_no_name: return NULL; } +static void __free_client(struct kref *k) +{ + struct nfsdfs_client *c = container_of(k, struct nfsdfs_client, cl_ref); + struct nfs4_client *clp = container_of(c, struct nfs4_client, cl_nfsdfs); + + free_svc_cred(&clp->cl_cred); + destroy_workqueue(clp->cl_callback_wq); + kfree(clp->cl_ownerstr_hashtbl); + kfree(clp->cl_name.data); + kfree(clp->cl_nii_domain.data); + kfree(clp->cl_nii_name.data); + idr_destroy(&clp->cl_stateids); + kfree(clp->cl_ra); + kmem_cache_free(client_slab, clp); +} + +static void drop_client(struct nfs4_client *clp) +{ + kref_put(&clp->cl_nfsdfs.cl_ref, __free_client); +} + static void free_client(struct nfs4_client *clp) { @@ -1881,11 +2429,12 @@ free_client(struct nfs4_client *clp) free_session(ses); } rpc_destroy_wait_queue(&clp->cl_cb_waitq); - free_svc_cred(&clp->cl_cred); - kfree(clp->cl_ownerstr_hashtbl); - kfree(clp->cl_name.data); - idr_destroy(&clp->cl_stateids); - kmem_cache_free(client_slab, clp); + if (clp->cl_nfsd_dentry) { + nfsd_client_rmdir(clp->cl_nfsd_dentry); + clp->cl_nfsd_dentry = NULL; + wake_up_all(&expiry_wq); + } + drop_client(clp); } /* must be called under the client_lock */ @@ -1909,8 +2458,12 @@ unhash_client_locked(struct nfs4_client *clp) } list_del_init(&clp->cl_lru); spin_lock(&clp->cl_lock); - list_for_each_entry(ses, &clp->cl_sessions, se_perclnt) + spin_lock(&nfsd_session_list_lock); + list_for_each_entry(ses, &clp->cl_sessions, se_perclnt) { list_del_init(&ses->se_hash); + list_del_init(&ses->se_all_sessions); + } + spin_unlock(&nfsd_session_list_lock); spin_unlock(&clp->cl_lock); } @@ -1926,7 +2479,11 @@ unhash_client(struct nfs4_client *clp) static __be32 mark_client_expired_locked(struct nfs4_client *clp) { - if 
(atomic_read(&clp->cl_refcount)) + int users = atomic_read(&clp->cl_rpc_users); + + trace_nfsd_mark_client_expired(clp, users); + + if (users) return nfserr_jukebox; unhash_client_locked(clp); return nfs_ok; @@ -1935,16 +2492,16 @@ static __be32 mark_client_expired_locked(struct nfs4_client *clp) static void __destroy_client(struct nfs4_client *clp) { + struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); int i; struct nfs4_openowner *oo; struct nfs4_delegation *dp; - struct list_head reaplist; + LIST_HEAD(reaplist); - INIT_LIST_HEAD(&reaplist); spin_lock(&state_lock); while (!list_empty(&clp->cl_delegations)) { dp = list_entry(clp->cl_delegations.next, struct nfs4_delegation, dl_perclnt); - WARN_ON(!unhash_delegation_locked(dp)); + unhash_delegation_locked(dp, SC_STATUS_CLOSED); list_add(&dp->dl_recall_lru, &reaplist); } spin_unlock(&state_lock); @@ -1978,7 +2535,10 @@ __destroy_client(struct nfs4_client *clp) nfsd4_shutdown_callback(clp); if (clp->cl_cb_conn.cb_xprt) svc_xprt_put(clp->cl_cb_conn.cb_xprt); + atomic_add_unless(&nn->nfs4_client_count, -1, 0); + nfsd4_dec_courtesy_client_count(nn, clp); free_client(clp); + wake_up_all(&expiry_wq); } static void @@ -1988,6 +2548,22 @@ destroy_client(struct nfs4_client *clp) __destroy_client(clp); } +static void inc_reclaim_complete(struct nfs4_client *clp) +{ + struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); + + if (!nn->track_reclaim_completes) + return; + if (!nfsd4_find_reclaim_client(clp->cl_name, nn)) + return; + if (atomic_inc_return(&nn->nr_reclaim_complete) == + nn->reclaim_str_hashtbl_size) { + printk(KERN_INFO "NFSD: all clients done reclaiming, ending NFSv4 grace period (net %x)\n", + clp->net->ns.inum); + nfsd4_end_grace(nn); + } +} + static void expire_client(struct nfs4_client *clp) { unhash_client(clp); @@ -2039,11 +2615,6 @@ compare_blob(const struct xdr_netobj *o1, const struct xdr_netobj *o2) return memcmp(o1->data, o2->data, o1->len); } -static int same_name(const char *n1, const char *n2) -{ - return 0 == memcmp(n1, n2, HEXDIR_LEN); -} - static int same_verf(nfs4_verifier *v1, nfs4_verifier *v2) { @@ -2138,14 +2709,14 @@ static void gen_confirm(struct nfs4_client *clp, struct nfsd_net *nn) * This is opaque to client, so no need to byte-swap. 
Use * __force to keep sparse happy */ - verf[0] = (__force __be32)get_seconds(); + verf[0] = (__force __be32)(u32)ktime_get_real_seconds(); verf[1] = (__force __be32)nn->clverifier_counter++; memcpy(clp->cl_confirm.data, verf, sizeof(clp->cl_confirm.data)); } static void gen_clid(struct nfs4_client *clp, struct nfsd_net *nn) { - clp->cl_clientid.cl_boot = nn->boot_time; + clp->cl_clientid.cl_boot = (u32)nn->boot_time; clp->cl_clientid.cl_id = nn->clientid_counter++; gen_confirm(clp, nn); } @@ -2162,14 +2733,16 @@ find_stateid_locked(struct nfs4_client *cl, stateid_t *t) } static struct nfs4_stid * -find_stateid_by_type(struct nfs4_client *cl, stateid_t *t, char typemask) +find_stateid_by_type(struct nfs4_client *cl, stateid_t *t, + unsigned short typemask, unsigned short ok_states) { struct nfs4_stid *s; spin_lock(&cl->cl_lock); s = find_stateid_locked(cl, t); if (s != NULL) { - if (typemask & s->sc_type) + if ((s->sc_status & ~ok_states) == 0 && + (typemask & s->sc_type)) refcount_inc(&s->sc_count); else s = NULL; @@ -2178,6 +2751,519 @@ find_stateid_by_type(struct nfs4_client *cl, stateid_t *t, char typemask) return s; } +static struct nfs4_client *get_nfsdfs_clp(struct inode *inode) +{ + struct nfsdfs_client *nc; + nc = get_nfsdfs_client(inode); + if (!nc) + return NULL; + return container_of(nc, struct nfs4_client, cl_nfsdfs); +} + +static void seq_quote_mem(struct seq_file *m, char *data, int len) +{ + seq_puts(m, "\""); + seq_escape_mem(m, data, len, ESCAPE_HEX | ESCAPE_NAP | ESCAPE_APPEND, "\"\\"); + seq_puts(m, "\""); +} + +static const char *cb_state2str(int state) +{ + switch (state) { + case NFSD4_CB_UP: + return "UP"; + case NFSD4_CB_UNKNOWN: + return "UNKNOWN"; + case NFSD4_CB_DOWN: + return "DOWN"; + case NFSD4_CB_FAULT: + return "FAULT"; + } + return "UNDEFINED"; +} + +static int client_info_show(struct seq_file *m, void *v) +{ + struct inode *inode = file_inode(m->file); + struct nfsd4_session *ses; + struct nfs4_client *clp; + u64 clid; + + clp = get_nfsdfs_clp(inode); + if (!clp) + return -ENXIO; + memcpy(&clid, &clp->cl_clientid, sizeof(clid)); + seq_printf(m, "clientid: 0x%llx\n", clid); + seq_printf(m, "address: \"%pISpc\"\n", (struct sockaddr *)&clp->cl_addr); + + if (clp->cl_state == NFSD4_COURTESY) + seq_puts(m, "status: courtesy\n"); + else if (clp->cl_state == NFSD4_EXPIRABLE) + seq_puts(m, "status: expirable\n"); + else if (test_bit(NFSD4_CLIENT_CONFIRMED, &clp->cl_flags)) + seq_puts(m, "status: confirmed\n"); + else + seq_puts(m, "status: unconfirmed\n"); + seq_printf(m, "seconds from last renew: %lld\n", + ktime_get_boottime_seconds() - clp->cl_time); + seq_puts(m, "name: "); + seq_quote_mem(m, clp->cl_name.data, clp->cl_name.len); + seq_printf(m, "\nminor version: %d\n", clp->cl_minorversion); + if (clp->cl_nii_domain.data) { + seq_puts(m, "Implementation domain: "); + seq_quote_mem(m, clp->cl_nii_domain.data, + clp->cl_nii_domain.len); + seq_puts(m, "\nImplementation name: "); + seq_quote_mem(m, clp->cl_nii_name.data, clp->cl_nii_name.len); + seq_printf(m, "\nImplementation time: [%lld, %ld]\n", + clp->cl_nii_time.tv_sec, clp->cl_nii_time.tv_nsec); + } + seq_printf(m, "callback state: %s\n", cb_state2str(clp->cl_cb_state)); + seq_printf(m, "callback address: \"%pISpc\"\n", &clp->cl_cb_conn.cb_addr); + seq_printf(m, "admin-revoked states: %d\n", + atomic_read(&clp->cl_admin_revoked)); + spin_lock(&clp->cl_lock); + seq_printf(m, "session slots:"); + list_for_each_entry(ses, &clp->cl_sessions, se_perclnt) + seq_printf(m, " %u", ses->se_fchannel.maxreqs); + 
seq_printf(m, "\nsession target slots:"); + list_for_each_entry(ses, &clp->cl_sessions, se_perclnt) + seq_printf(m, " %u", ses->se_target_maxslots); + spin_unlock(&clp->cl_lock); + seq_puts(m, "\n"); + + drop_client(clp); + + return 0; +} + +DEFINE_SHOW_ATTRIBUTE(client_info); + +static void *states_start(struct seq_file *s, loff_t *pos) + __acquires(&clp->cl_lock) +{ + struct nfs4_client *clp = s->private; + unsigned long id = *pos; + void *ret; + + spin_lock(&clp->cl_lock); + ret = idr_get_next_ul(&clp->cl_stateids, &id); + *pos = id; + return ret; +} + +static void *states_next(struct seq_file *s, void *v, loff_t *pos) +{ + struct nfs4_client *clp = s->private; + unsigned long id = *pos; + void *ret; + + id = *pos; + id++; + ret = idr_get_next_ul(&clp->cl_stateids, &id); + *pos = id; + return ret; +} + +static void states_stop(struct seq_file *s, void *v) + __releases(&clp->cl_lock) +{ + struct nfs4_client *clp = s->private; + + spin_unlock(&clp->cl_lock); +} + +static void nfs4_show_fname(struct seq_file *s, struct nfsd_file *f) +{ + seq_printf(s, "filename: \"%pD2\"", f->nf_file); +} + +static void nfs4_show_superblock(struct seq_file *s, struct nfsd_file *f) +{ + struct inode *inode = file_inode(f->nf_file); + + seq_printf(s, "superblock: \"%02x:%02x:%ld\"", + MAJOR(inode->i_sb->s_dev), + MINOR(inode->i_sb->s_dev), + inode->i_ino); +} + +static void nfs4_show_owner(struct seq_file *s, struct nfs4_stateowner *oo) +{ + seq_puts(s, "owner: "); + seq_quote_mem(s, oo->so_owner.data, oo->so_owner.len); +} + +static void nfs4_show_stateid(struct seq_file *s, stateid_t *stid) +{ + seq_printf(s, "0x%.8x", stid->si_generation); + seq_printf(s, "%12phN", &stid->si_opaque); +} + +static int nfs4_show_open(struct seq_file *s, struct nfs4_stid *st) +{ + struct nfs4_ol_stateid *ols; + struct nfs4_file *nf; + struct nfsd_file *file; + struct nfs4_stateowner *oo; + unsigned int access, deny; + + ols = openlockstateid(st); + oo = ols->st_stateowner; + nf = st->sc_file; + + seq_puts(s, "- "); + nfs4_show_stateid(s, &st->sc_stateid); + seq_puts(s, ": { type: open, "); + + access = bmap_to_share_mode(ols->st_access_bmap); + deny = bmap_to_share_mode(ols->st_deny_bmap); + + seq_printf(s, "access: %s%s, ", + access & NFS4_SHARE_ACCESS_READ ? "r" : "-", + access & NFS4_SHARE_ACCESS_WRITE ? "w" : "-"); + seq_printf(s, "deny: %s%s, ", + deny & NFS4_SHARE_ACCESS_READ ? "r" : "-", + deny & NFS4_SHARE_ACCESS_WRITE ? "w" : "-"); + + if (nf) { + spin_lock(&nf->fi_lock); + file = find_any_file_locked(nf); + if (file) { + nfs4_show_superblock(s, file); + seq_puts(s, ", "); + nfs4_show_fname(s, file); + seq_puts(s, ", "); + } + spin_unlock(&nf->fi_lock); + } else + seq_puts(s, "closed, "); + nfs4_show_owner(s, oo); + if (st->sc_status & SC_STATUS_ADMIN_REVOKED) + seq_puts(s, ", admin-revoked"); + seq_puts(s, " }\n"); + return 0; +} + +static int nfs4_show_lock(struct seq_file *s, struct nfs4_stid *st) +{ + struct nfs4_ol_stateid *ols; + struct nfs4_file *nf; + struct nfsd_file *file; + struct nfs4_stateowner *oo; + + ols = openlockstateid(st); + oo = ols->st_stateowner; + nf = st->sc_file; + + seq_puts(s, "- "); + nfs4_show_stateid(s, &st->sc_stateid); + seq_puts(s, ": { type: lock, "); + + spin_lock(&nf->fi_lock); + file = find_any_file_locked(nf); + if (file) { + /* + * Note: a lock stateid isn't really the same thing as a lock, + * it's the locking state held by one owner on a file, and there + * may be multiple (or no) lock ranges associated with it. + * (Same for the matter is true of open stateids.) 
+ */ + + nfs4_show_superblock(s, file); + /* XXX: open stateid? */ + seq_puts(s, ", "); + nfs4_show_fname(s, file); + seq_puts(s, ", "); + } + nfs4_show_owner(s, oo); + if (st->sc_status & SC_STATUS_ADMIN_REVOKED) + seq_puts(s, ", admin-revoked"); + seq_puts(s, " }\n"); + spin_unlock(&nf->fi_lock); + return 0; +} + +static char *nfs4_show_deleg_type(u32 dl_type) +{ + switch (dl_type) { + case OPEN_DELEGATE_READ: + return "r"; + case OPEN_DELEGATE_WRITE: + return "w"; + case OPEN_DELEGATE_READ_ATTRS_DELEG: + return "ra"; + case OPEN_DELEGATE_WRITE_ATTRS_DELEG: + return "wa"; + } + return "?"; +} + +static int nfs4_show_deleg(struct seq_file *s, struct nfs4_stid *st) +{ + struct nfs4_delegation *ds; + struct nfs4_file *nf; + struct nfsd_file *file; + + ds = delegstateid(st); + nf = st->sc_file; + + seq_puts(s, "- "); + nfs4_show_stateid(s, &st->sc_stateid); + seq_puts(s, ": { type: deleg, "); + + seq_printf(s, "access: %s", nfs4_show_deleg_type(ds->dl_type)); + + /* XXX: lease time, whether it's being recalled. */ + + spin_lock(&nf->fi_lock); + file = nf->fi_deleg_file; + if (file) { + seq_puts(s, ", "); + nfs4_show_superblock(s, file); + seq_puts(s, ", "); + nfs4_show_fname(s, file); + } + spin_unlock(&nf->fi_lock); + if (st->sc_status & SC_STATUS_ADMIN_REVOKED) + seq_puts(s, ", admin-revoked"); + seq_puts(s, " }\n"); + return 0; +} + +static int nfs4_show_layout(struct seq_file *s, struct nfs4_stid *st) +{ + struct nfs4_layout_stateid *ls; + struct nfsd_file *file; + + ls = container_of(st, struct nfs4_layout_stateid, ls_stid); + + seq_puts(s, "- "); + nfs4_show_stateid(s, &st->sc_stateid); + seq_puts(s, ": { type: layout"); + + /* XXX: What else would be useful? */ + + spin_lock(&ls->ls_stid.sc_file->fi_lock); + file = ls->ls_file; + if (file) { + seq_puts(s, ", "); + nfs4_show_superblock(s, file); + seq_puts(s, ", "); + nfs4_show_fname(s, file); + } + spin_unlock(&ls->ls_stid.sc_file->fi_lock); + if (st->sc_status & SC_STATUS_ADMIN_REVOKED) + seq_puts(s, ", admin-revoked"); + seq_puts(s, " }\n"); + + return 0; +} + +static int states_show(struct seq_file *s, void *v) +{ + struct nfs4_stid *st = v; + + switch (st->sc_type) { + case SC_TYPE_OPEN: + return nfs4_show_open(s, st); + case SC_TYPE_LOCK: + return nfs4_show_lock(s, st); + case SC_TYPE_DELEG: + return nfs4_show_deleg(s, st); + case SC_TYPE_LAYOUT: + return nfs4_show_layout(s, st); + default: + return 0; /* XXX: or SEQ_SKIP? */ + } + /* XXX: copy stateids? */ +} + +static struct seq_operations states_seq_ops = { + .start = states_start, + .next = states_next, + .stop = states_stop, + .show = states_show +}; + +static int client_states_open(struct inode *inode, struct file *file) +{ + struct seq_file *s; + struct nfs4_client *clp; + int ret; + + clp = get_nfsdfs_clp(inode); + if (!clp) + return -ENXIO; + + ret = seq_open(file, &states_seq_ops); + if (ret) + return ret; + s = file->private_data; + s->private = clp; + return 0; +} + +static int client_opens_release(struct inode *inode, struct file *file) +{ + struct seq_file *m = file->private_data; + struct nfs4_client *clp = m->private; + + /* XXX: alternatively, we could get/drop in seq start/stop */ + drop_client(clp); + return seq_release(inode, file); +} + +static const struct file_operations client_states_fops = { + .open = client_states_open, + .read = seq_read, + .llseek = seq_lseek, + .release = client_opens_release, +}; + +/* + * Normally we refuse to destroy clients that are in use, but here the + * administrator is telling us to just do it. 
We also want to wait + * so the caller has a guarantee that the client's locks are gone by + * the time the write returns: + */ +static void force_expire_client(struct nfs4_client *clp) +{ + struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); + bool already_expired; + + trace_nfsd_clid_admin_expired(&clp->cl_clientid); + + spin_lock(&nn->client_lock); + clp->cl_time = 0; + spin_unlock(&nn->client_lock); + + wait_event(expiry_wq, atomic_read(&clp->cl_rpc_users) == 0); + spin_lock(&nn->client_lock); + already_expired = list_empty(&clp->cl_lru); + if (!already_expired) + unhash_client_locked(clp); + spin_unlock(&nn->client_lock); + + if (!already_expired) + expire_client(clp); + else + wait_event(expiry_wq, clp->cl_nfsd_dentry == NULL); +} + +static ssize_t client_ctl_write(struct file *file, const char __user *buf, + size_t size, loff_t *pos) +{ + char *data; + struct nfs4_client *clp; + + data = simple_transaction_get(file, buf, size); + if (IS_ERR(data)) + return PTR_ERR(data); + if (size != 7 || 0 != memcmp(data, "expire\n", 7)) + return -EINVAL; + clp = get_nfsdfs_clp(file_inode(file)); + if (!clp) + return -ENXIO; + force_expire_client(clp); + drop_client(clp); + return 7; +} + +static const struct file_operations client_ctl_fops = { + .write = client_ctl_write, + .release = simple_transaction_release, +}; + +static const struct tree_descr client_files[] = { + [0] = {"info", &client_info_fops, S_IRUSR}, + [1] = {"states", &client_states_fops, S_IRUSR}, + [2] = {"ctl", &client_ctl_fops, S_IWUSR}, + [3] = {""}, +}; + +static int +nfsd4_cb_recall_any_done(struct nfsd4_callback *cb, + struct rpc_task *task) +{ + trace_nfsd_cb_recall_any_done(cb, task); + switch (task->tk_status) { + case -NFS4ERR_DELAY: + rpc_delay(task, 2 * HZ); + return 0; + default: + return 1; + } +} + +static void +nfsd4_cb_recall_any_release(struct nfsd4_callback *cb) +{ + struct nfs4_client *clp = cb->cb_clp; + + drop_client(clp); +} + +static int +nfsd4_cb_getattr_done(struct nfsd4_callback *cb, struct rpc_task *task) +{ + struct nfs4_cb_fattr *ncf = + container_of(cb, struct nfs4_cb_fattr, ncf_getattr); + struct nfs4_delegation *dp = + container_of(ncf, struct nfs4_delegation, dl_cb_fattr); + + trace_nfsd_cb_getattr_done(&dp->dl_stid.sc_stateid, task); + ncf->ncf_cb_status = task->tk_status; + switch (task->tk_status) { + case -NFS4ERR_DELAY: + rpc_delay(task, 2 * HZ); + return 0; + default: + return 1; + } +} + +static void +nfsd4_cb_getattr_release(struct nfsd4_callback *cb) +{ + struct nfs4_cb_fattr *ncf = + container_of(cb, struct nfs4_cb_fattr, ncf_getattr); + struct nfs4_delegation *dp = + container_of(ncf, struct nfs4_delegation, dl_cb_fattr); + + nfs4_put_stid(&dp->dl_stid); +} + +static const struct nfsd4_callback_ops nfsd4_cb_recall_any_ops = { + .done = nfsd4_cb_recall_any_done, + .release = nfsd4_cb_recall_any_release, + .opcode = OP_CB_RECALL_ANY, +}; + +static const struct nfsd4_callback_ops nfsd4_cb_getattr_ops = { + .done = nfsd4_cb_getattr_done, + .release = nfsd4_cb_getattr_release, + .opcode = OP_CB_GETATTR, +}; + +static void nfs4_cb_getattr(struct nfs4_cb_fattr *ncf) +{ + struct nfs4_delegation *dp = + container_of(ncf, struct nfs4_delegation, dl_cb_fattr); + + if (test_and_set_bit(NFSD4_CALLBACK_RUNNING, &ncf->ncf_getattr.cb_flags)) + return; + + /* set to proper status when nfsd4_cb_getattr_done runs */ + ncf->ncf_cb_status = NFS4ERR_IO; + + /* ensure that wake_bit is done when RUNNING is cleared */ + set_bit(NFSD4_CALLBACK_WAKE, &ncf->ncf_getattr.cb_flags); + + 
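[Editor's note] client_ctl_write() above accepts exactly the 7-byte string "expire\n", so forcing a client's state away from user space is a single write to its "ctl" file. A sketch, again assuming the conventional /proc/fs/nfsd mount and a hypothetical client id:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

/*
 * Ask nfsd to expire client 4 immediately. Per client_ctl_write(), the
 * payload must be exactly "expire\n" in a single write; the call does
 * not return until the client's state is gone.
 */
int main(void)
{
	const char *path = "/proc/fs/nfsd/clients/4/ctl";
	int fd = open(path, O_WRONLY);

	if (fd < 0) {
		perror(path);
		return 1;
	}
	if (write(fd, "expire\n", 7) != 7) {
		perror("write");
		close(fd);
		return 1;
	}
	close(fd);
	return 0;
}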
refcount_inc(&dp->dl_stid.sc_count); + nfsd4_run_cb(&ncf->ncf_getattr); +} + static struct nfs4_client *create_client(struct xdr_netobj name, struct svc_rqst *rqstp, nfs4_verifier *verf) { @@ -2185,8 +3271,10 @@ static struct nfs4_client *create_client(struct xdr_netobj name, struct sockaddr *sa = svc_addr(rqstp); int ret; struct net *net = SVC_NET(rqstp); + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + struct dentry *dentries[ARRAY_SIZE(client_files)]; - clp = alloc_client(name); + clp = alloc_client(name, nn); if (clp == NULL) return NULL; @@ -2195,13 +3283,31 @@ static struct nfs4_client *create_client(struct xdr_netobj name, free_client(clp); return NULL; } + gen_clid(clp, nn); + kref_init(&clp->cl_nfsdfs.cl_ref); nfsd4_init_cb(&clp->cl_cb_null, clp, NULL, NFSPROC4_CLNT_CB_NULL); - clp->cl_time = get_seconds(); - clear_bit(0, &clp->cl_cb_slot_busy); + clp->cl_time = ktime_get_boottime_seconds(); copy_verf(clp, verf); - rpc_copy_addr((struct sockaddr *) &clp->cl_addr, sa); + memcpy(&clp->cl_addr, sa, sizeof(struct sockaddr_storage)); clp->cl_cb_session = NULL; clp->net = net; + clp->cl_nfsd_dentry = nfsd_client_mkdir( + nn, &clp->cl_nfsdfs, + clp->cl_clientid.cl_id - nn->clientid_base, + client_files, dentries); + clp->cl_nfsd_info_dentry = dentries[0]; + if (!clp->cl_nfsd_dentry) { + free_client(clp); + return NULL; + } + clp->cl_ra = kzalloc(sizeof(*clp->cl_ra), GFP_KERNEL); + if (!clp->cl_ra) { + free_client(clp); + return NULL; + } + clp->cl_ra_time = 0; + nfsd4_init_cb(&clp->cl_ra->ra_cb, clp, &nfsd4_cb_recall_any_ops, + NFSPROC4_CLNT_CB_RECALL_ANY); return clp; } @@ -2268,11 +3374,11 @@ move_to_confirmed(struct nfs4_client *clp) lockdep_assert_held(&nn->client_lock); - dprintk("NFSD: move_to_confirm nfs4_client %p\n", clp); list_move(&clp->cl_idhash, &nn->conf_id_hashtbl[idhashval]); rb_erase(&clp->cl_namenode, &nn->unconf_name_tree); add_clp_to_name_tree(clp, &nn->conf_name_tree); set_bit(NFSD4_CLIENT_CONFIRMED, &clp->cl_flags); + trace_nfsd_clid_confirmed(&clp->cl_clientid); renew_client_locked(clp); } @@ -2362,14 +3468,12 @@ gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se, struct svc_r conn->cb_prog = se->se_callback_prog; conn->cb_ident = se->se_callback_ident; memcpy(&conn->cb_saddr, &rqstp->rq_daddr, rqstp->rq_daddrlen); + trace_nfsd_cb_args(clp, conn); return; out_err: conn->cb_addr.ss_family = AF_UNSPEC; conn->cb_addrlen = 0; - dprintk("NFSD: this client (clientid %08x/%08x) " - "will not receive delegations\n", - clp->cl_clientid.cl_boot, clp->cl_clientid.cl_id); - + trace_nfsd_cb_nodelegs(clp); return; } @@ -2379,11 +3483,24 @@ out_err: static void nfsd4_store_cache_entry(struct nfsd4_compoundres *resp) { - struct xdr_buf *buf = resp->xdr.buf; + struct xdr_buf *buf = resp->xdr->buf; struct nfsd4_slot *slot = resp->cstate.slot; unsigned int base; - dprintk("--> %s slot %p\n", __func__, slot); + /* + * RFC 5661 Section 2.10.6.1.2: + * + * Any time SEQUENCE ... returns an error ... [t]he replier MUST NOT + * modify the reply cache entry for the slot whenever an error is + * returned from SEQUENCE ... + * + * Because nfsd4_store_cache_entry is called only by + * nfsd4_sequence_done(), nfsd4_store_cache_entry() is called only + * when a SEQUENCE operation was part of the COMPOUND. + * nfs41_check_op_ordering() ensures SEQUENCE is the first op. 
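[Editor's note] The early return added to nfsd4_store_cache_entry() implements the RFC 5661 rule quoted in its comment: a compound that got no further than a failed SEQUENCE must leave the slot's cached reply alone. A stand-alone sketch of just that predicate (boolean status instead of the kernel's __be32 codes):

#include <stdbool.h>
#include <stdio.h>

/*
 * Should a compound reply be stored in the session slot's reply cache?
 * Mirrors the check added to nfsd4_store_cache_entry(): if the compound
 * got no further than the SEQUENCE op and that op failed, the cached
 * entry must be left untouched (RFC 5661, Section 2.10.6.1.2).
 */
static bool store_in_slot_cache(unsigned int ops_encoded, bool sequence_ok)
{
	if (ops_encoded == 1 && !sequence_ok)
		return false;
	return true;
}

int main(void)
{
	printf("failed solo SEQUENCE cached? %d\n", store_in_slot_cache(1, false)); /* 0 */
	printf("successful compound cached?  %d\n", store_in_slot_cache(3, true));  /* 1 */
	return 0;
}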
+ */ + if (resp->opcnt == 1 && resp->cstate.status != nfs_ok) + return; slot->sl_flags |= NFSD4_SLOT_INITIALIZED; slot->sl_opcnt = resp->opcnt; @@ -2391,7 +3508,7 @@ nfsd4_store_cache_entry(struct nfsd4_compoundres *resp) free_svc_cred(&slot->sl_cred); copy_cred(&slot->sl_cred, &resp->rqstp->rq_cred); - if (!nfsd4_cache_this(resp)) { + if (!(resp->cstate.slot->sl_flags & NFSD4_SLOT_CACHETHIS)) { slot->sl_flags &= ~NFSD4_SLOT_CACHED; return; } @@ -2406,41 +3523,6 @@ nfsd4_store_cache_entry(struct nfsd4_compoundres *resp) } /* - * Encode the replay sequence operation from the slot values. - * If cachethis is FALSE encode the uncached rep error on the next - * operation which sets resp->p and increments resp->opcnt for - * nfs4svc_encode_compoundres. - * - */ -static __be32 -nfsd4_enc_sequence_replay(struct nfsd4_compoundargs *args, - struct nfsd4_compoundres *resp) -{ - struct nfsd4_op *op; - struct nfsd4_slot *slot = resp->cstate.slot; - - /* Encode the replayed sequence operation */ - op = &args->ops[resp->opcnt - 1]; - nfsd4_encode_operation(resp, op); - - if (slot->sl_flags & NFSD4_SLOT_CACHED) - return op->status; - if (args->opcnt == 1) { - /* - * The original operation wasn't a solo sequence--we - * always cache those--so this retry must not match the - * original: - */ - op->status = nfserr_seq_false_retry; - } else { - op = &args->ops[resp->opcnt++]; - op->status = nfserr_retry_uncached_rep; - nfsd4_encode_operation(resp, op); - } - return op->status; -} - -/* * The sequence operation is not cached because we can use the slot and * session values. */ @@ -2448,17 +3530,30 @@ static __be32 nfsd4_replay_cache_entry(struct nfsd4_compoundres *resp, struct nfsd4_sequence *seq) { + struct nfsd4_compoundargs *args = resp->rqstp->rq_argp; struct nfsd4_slot *slot = resp->cstate.slot; - struct xdr_stream *xdr = &resp->xdr; + struct xdr_stream *xdr = resp->xdr; __be32 *p; - __be32 status; dprintk("--> %s slot %p\n", __func__, slot); - status = nfsd4_enc_sequence_replay(resp->rqstp->rq_argp, resp); - if (status) - return status; + /* Always encode the SEQUENCE response. */ + nfsd4_encode_operation(resp, &args->ops[0]); + if (args->opcnt == 1) + /* A solo SEQUENCE - nothing was cached */ + return args->ops[0].status; + + if (!(slot->sl_flags & NFSD4_SLOT_CACHED)) { + /* We weren't asked to cache this. 
*/ + struct nfsd4_op *op; + + op = &args->ops[resp->opcnt++]; + op->status = nfserr_retry_uncached_rep; + nfsd4_encode_operation(resp, op); + return op->status; + } + /* return reply from cache */ p = xdr_reserve_space(xdr, slot->sl_datalen); if (!p) { WARN_ON_ONCE(1); @@ -2509,7 +3604,22 @@ static bool client_has_state(struct nfs4_client *clp) #endif || !list_empty(&clp->cl_delegations) || !list_empty(&clp->cl_sessions) - || !list_empty(&clp->async_copies); + || nfsd4_has_active_async_copies(clp); +} + +static __be32 copy_impl_id(struct nfs4_client *clp, + struct nfsd4_exchange_id *exid) +{ + if (!exid->nii_domain.data) + return 0; + xdr_netobj_dup(&clp->cl_nii_domain, &exid->nii_domain, GFP_KERNEL); + if (!clp->cl_nii_domain.data) + return nfserr_jukebox; + xdr_netobj_dup(&clp->cl_nii_name, &exid->nii_name, GFP_KERNEL); + if (!clp->cl_nii_name.data) + return nfserr_jukebox; + clp->cl_nii_time = exid->nii_time; + return 0; } __be32 @@ -2528,16 +3638,25 @@ nfsd4_exchange_id(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, rpc_ntop(sa, addr_str, sizeof(addr_str)); dprintk("%s rqstp=%p exid=%p clname.len=%u clname.data=%p " - "ip_addr=%s flags %x, spa_how %d\n", + "ip_addr=%s flags %x, spa_how %u\n", __func__, rqstp, exid, exid->clname.len, exid->clname.data, addr_str, exid->flags, exid->spa_how); + exid->server_impl_name = kasprintf(GFP_KERNEL, "%s %s %s %s", + utsname()->sysname, utsname()->release, + utsname()->version, utsname()->machine); + if (!exid->server_impl_name) + return nfserr_jukebox; + if (exid->flags & ~EXCHGID4_FLAG_MASK_A) return nfserr_inval; new = create_client(exid->clname, rqstp, &verf); if (new == NULL) return nfserr_jukebox; + status = copy_impl_id(new, exid); + if (status) + goto out_nolock; switch (exid->spa_how) { case SP4_MACH_CRED: @@ -2572,10 +3691,12 @@ nfsd4_exchange_id(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, goto out_nolock; } new->cl_mach_cred = true; + break; case SP4_NONE: break; default: /* checked by xdr code */ WARN_ON_ONCE(1); + fallthrough; case SP4_SSV: status = nfserr_encr_alg_unsupp; goto out_nolock; @@ -2607,20 +3728,24 @@ nfsd4_exchange_id(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, } /* case 6 */ exid->flags |= EXCHGID4_FLAG_CONFIRMED_R; + trace_nfsd_clid_confirmed_r(conf); goto out_copy; } if (!creds_match) { /* case 3 */ if (client_has_state(conf)) { status = nfserr_clid_inuse; + trace_nfsd_clid_cred_mismatch(conf, rqstp); goto out; } goto out_new; } if (verfs_match) { /* case 2 */ conf->cl_exchange_flags |= EXCHGID4_FLAG_CONFIRMED_R; + trace_nfsd_clid_confirmed_r(conf); goto out_copy; } /* case 5, client reboot */ + trace_nfsd_clid_verf_mismatch(conf, rqstp, &verf); conf = NULL; goto out_new; } @@ -2630,22 +3755,27 @@ nfsd4_exchange_id(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, goto out; } - unconf = find_unconfirmed_client_by_name(&exid->clname, nn); + unconf = find_unconfirmed_client_by_name(&exid->clname, nn); if (unconf) /* case 4, possible retry or client restart */ unhash_client_locked(unconf); - /* case 1 (normal case) */ + /* case 1, new owner ID */ + trace_nfsd_clid_fresh(new); + out_new: if (conf) { status = mark_client_expired_locked(conf); if (status) goto out; + trace_nfsd_clid_replaced(&conf->cl_clientid); } new->cl_minorversion = cstate->minorversion; new->cl_spo_must_allow.u.words[0] = exid->spo_must_allow[0]; new->cl_spo_must_allow.u.words[1] = exid->spo_must_allow[1]; - gen_clid(new, nn); + /* Contrived initial CREATE_SESSION response */ + 
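[Editor's note] The EXCHANGE_ID hunk above assembles server_impl_name from the utsname fields with kasprintf(). A user-space equivalent using uname(2); the buffer size and output format here are illustrative only (the NFS4_OPAQUE_LIMIT truncation appears a little further down):

#include <stdio.h>
#include <sys/utsname.h>

/*
 * Build an implementation-name string in the same "sysname release
 * version machine" form that the EXCHANGE_ID code reports in nii_name.
 */
int main(void)
{
	struct utsname u;
	char impl[512];

	if (uname(&u) != 0) {
		perror("uname");
		return 1;
	}
	snprintf(impl, sizeof(impl), "%s %s %s %s",
		 u.sysname, u.release, u.version, u.machine);
	printf("server_impl_name: \"%s\"\n", impl);
	return 0;
}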
new->cl_cs_slot.sl_status = nfserr_seq_misordered; + add_to_unconfirmed(new); swap(new, conf); out_copy: @@ -2655,6 +3785,23 @@ out_copy: exid->seqid = conf->cl_cs_slot.sl_seqid + 1; nfsd4_set_ex_flags(conf, exid); + exid->nii_domain.len = sizeof("kernel.org") - 1; + exid->nii_domain.data = "kernel.org"; + + /* + * Note that RFC 8881 places no length limit on + * nii_name, but this implementation permits no + * more than NFS4_OPAQUE_LIMIT bytes. + */ + exid->nii_name.len = strlen(exid->server_impl_name); + if (exid->nii_name.len > NFS4_OPAQUE_LIMIT) + exid->nii_name.len = NFS4_OPAQUE_LIMIT; + exid->nii_name.data = exid->server_impl_name; + + /* just send zeros - the date is in nii_name */ + exid->nii_time.tv_sec = 0; + exid->nii_time.tv_nsec = 0; + dprintk("nfsd4_exchange_id seqid %d flags %x\n", conf->cl_cs_slot.sl_seqid, conf->cl_exchange_flags); status = nfs_ok; @@ -2664,19 +3811,25 @@ out: out_nolock: if (new) expire_client(new); - if (unconf) + if (unconf) { + trace_nfsd_clid_expire_unconf(&unconf->cl_clientid); expire_client(unconf); + } return status; } -static __be32 -check_slot_seqid(u32 seqid, u32 slot_seqid, int slot_inuse) +void +nfsd4_exchange_id_release(union nfsd4_op_u *u) { - dprintk("%s enter. seqid %d slot_seqid %d\n", __func__, seqid, - slot_seqid); + struct nfsd4_exchange_id *exid = &u->exchange_id; + + kfree(exid->server_impl_name); +} +static __be32 check_slot_seqid(u32 seqid, u32 slot_seqid, u8 flags) +{ /* The slot is in use, and no response has been sent. */ - if (slot_inuse) { + if (flags & NFSD4_SLOT_INUSE) { if (seqid == slot_seqid) return nfserr_jukebox; else @@ -2685,6 +3838,8 @@ check_slot_seqid(u32 seqid, u32 slot_seqid, int slot_inuse) /* Note unsigned 32-bit arithmetic handles wraparound: */ if (likely(seqid == slot_seqid + 1)) return nfs_ok; + if ((flags & NFSD4_SLOT_REUSED) && seqid == 1) + return nfs_ok; if (seqid == slot_seqid) return nfserr_replay_cache; return nfserr_seq_misordered; @@ -2743,17 +3898,6 @@ static __be32 check_forechannel_attrs(struct nfsd4_channel_attrs *ca, struct nfs ca->maxresp_cached = min_t(u32, ca->maxresp_cached, NFSD_SLOT_CACHE_SIZE + NFSD_MIN_HDR_SEQ_SZ); ca->maxreqs = min_t(u32, ca->maxreqs, NFSD_MAX_SLOTS_PER_SESSION); - /* - * Note decreasing slot size below client's request may make it - * difficult for client to function correctly, whereas - * decreasing the number of slots will (just?) affect - * performance. When short on memory we therefore prefer to - * decrease number of slots instead of their size. 
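[Editor's note] check_slot_seqid() leans on plain unsigned 32-bit arithmetic for its "is this the next request?" test, so the comparison keeps working across the seqid wrap. A two-case demonstration of that arithmetic:

#include <stdint.h>
#include <stdio.h>

/*
 * The "is this the next request?" test from check_slot_seqid(): unsigned
 * 32-bit arithmetic makes the comparison behave across the wrap point.
 */
static int is_next_seqid(uint32_t seqid, uint32_t slot_seqid)
{
	return seqid == (uint32_t)(slot_seqid + 1);	/* wraps modulo 2^32 */
}

int main(void)
{
	printf("%d\n", is_next_seqid(8, 7));		/* 1: normal case */
	printf("%d\n", is_next_seqid(0, UINT32_MAX));	/* 1: wrapped */
	printf("%d\n", is_next_seqid(7, 7));		/* 0: looks like a replay */
	return 0;
}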
Clients that - * request larger slots than they need will get poor results: - */ - ca->maxreqs = nfsd4_get_drc_mem(ca); - if (!ca->maxreqs) - return nfserr_jukebox; return nfs_ok; } @@ -2814,10 +3958,10 @@ nfsd4_create_session(struct svc_rqst *rqstp, struct nfsd4_create_session *cr_ses = &u->create_session; struct sockaddr *sa = svc_addr(rqstp); struct nfs4_client *conf, *unconf; + struct nfsd4_clid_slot *cs_slot; struct nfs4_client *old = NULL; struct nfsd4_session *new; struct nfsd4_conn *conn; - struct nfsd4_clid_slot *cs_slot = NULL; __be32 status = 0; struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); @@ -2831,93 +3975,120 @@ nfsd4_create_session(struct svc_rqst *rqstp, return status; status = check_backchannel_attrs(&cr_ses->back_channel); if (status) - goto out_release_drc_mem; + goto out_err; status = nfserr_jukebox; new = alloc_session(&cr_ses->fore_channel, &cr_ses->back_channel); if (!new) - goto out_release_drc_mem; + goto out_err; conn = alloc_conn_from_crses(rqstp, cr_ses); if (!conn) goto out_free_session; spin_lock(&nn->client_lock); + + /* RFC 8881 Section 18.36.4 Phase 1: Client record look-up. */ unconf = find_unconfirmed_client(&cr_ses->clientid, true, nn); conf = find_confirmed_client(&cr_ses->clientid, true, nn); - WARN_ON_ONCE(conf && unconf); + if (!conf && !unconf) { + status = nfserr_stale_clientid; + goto out_free_conn; + } + + /* RFC 8881 Section 18.36.4 Phase 2: Sequence ID processing. */ + if (conf) { + cs_slot = &conf->cl_cs_slot; + trace_nfsd_slot_seqid_conf(conf, cr_ses); + } else { + cs_slot = &unconf->cl_cs_slot; + trace_nfsd_slot_seqid_unconf(unconf, cr_ses); + } + status = check_slot_seqid(cr_ses->seqid, cs_slot->sl_seqid, 0); + switch (status) { + case nfs_ok: + cs_slot->sl_seqid++; + cr_ses->seqid = cs_slot->sl_seqid; + break; + case nfserr_replay_cache: + status = nfsd4_replay_create_session(cr_ses, cs_slot); + fallthrough; + case nfserr_jukebox: + /* The server MUST NOT cache NFS4ERR_DELAY */ + goto out_free_conn; + default: + goto out_cache_error; + } + /* RFC 8881 Section 18.36.4 Phase 3: Client ID confirmation. */ if (conf) { status = nfserr_wrong_cred; if (!nfsd4_mach_creds_match(conf, rqstp)) - goto out_free_conn; - cs_slot = &conf->cl_cs_slot; - status = check_slot_seqid(cr_ses->seqid, cs_slot->sl_seqid, 0); - if (status) { - if (status == nfserr_replay_cache) - status = nfsd4_replay_create_session(cr_ses, cs_slot); - goto out_free_conn; - } - } else if (unconf) { + goto out_cache_error; + } else { + status = nfserr_clid_inuse; if (!same_creds(&unconf->cl_cred, &rqstp->rq_cred) || !rpc_cmp_addr(sa, (struct sockaddr *) &unconf->cl_addr)) { - status = nfserr_clid_inuse; - goto out_free_conn; + trace_nfsd_clid_cred_mismatch(unconf, rqstp); + goto out_cache_error; } status = nfserr_wrong_cred; if (!nfsd4_mach_creds_match(unconf, rqstp)) - goto out_free_conn; - cs_slot = &unconf->cl_cs_slot; - status = check_slot_seqid(cr_ses->seqid, cs_slot->sl_seqid, 0); - if (status) { - /* an unconfirmed replay returns misordered */ - status = nfserr_seq_misordered; - goto out_free_conn; - } + goto out_cache_error; old = find_confirmed_client_by_name(&unconf->cl_name, nn); if (old) { status = mark_client_expired_locked(old); - if (status) { - old = NULL; - goto out_free_conn; - } + if (status) + goto out_expired_error; + trace_nfsd_clid_replaced(&old->cl_clientid); } move_to_confirmed(unconf); conf = unconf; - } else { - status = nfserr_stale_clientid; - goto out_free_conn; } + + /* RFC 8881 Section 18.36.4 Phase 4: Session creation. 
*/ status = nfs_ok; /* Persistent sessions are not supported */ cr_ses->flags &= ~SESSION4_PERSIST; /* Upshifting from TCP to RDMA is not supported */ cr_ses->flags &= ~SESSION4_RDMA; + /* Report the correct number of backchannel slots */ + cr_ses->back_channel.maxreqs = new->se_cb_highest_slot + 1; init_session(rqstp, new, conf, cr_ses); nfsd4_get_session_locked(new); memcpy(cr_ses->sessionid.data, new->se_sessionid.data, NFS4_MAX_SESSIONID_LEN); - cs_slot->sl_seqid++; - cr_ses->seqid = cs_slot->sl_seqid; /* cache solo and embedded create sessions under the client_lock */ nfsd4_cache_create_session(cr_ses, cs_slot, status); spin_unlock(&nn->client_lock); + if (conf == unconf) + fsnotify_dentry(conf->cl_nfsd_info_dentry, FS_MODIFY); /* init connection and backchannel */ nfsd4_init_conn(rqstp, conn, new); nfsd4_put_session(new); if (old) expire_client(old); return status; + +out_expired_error: + /* + * Revert the slot seq_nr change so the server will process + * the client's resend instead of returning a cached response. + */ + if (status == nfserr_jukebox) { + cs_slot->sl_seqid--; + cr_ses->seqid = cs_slot->sl_seqid; + goto out_free_conn; + } +out_cache_error: + nfsd4_cache_create_session(cr_ses, cs_slot, status); out_free_conn: spin_unlock(&nn->client_lock); free_conn(conn); - if (old) - expire_client(old); out_free_session: __free_session(new); -out_release_drc_mem: - nfsd4_put_drc_mem(&cr_ses->fore_channel); +out_err: return status; } @@ -2931,7 +4102,7 @@ static __be32 nfsd4_map_bcts_dir(u32 *dir) case NFS4_CDFC4_BACK_OR_BOTH: *dir = NFS4_CDFC4_BOTH; return nfs_ok; - }; + } return nfserr_inval; } @@ -2957,6 +4128,47 @@ __be32 nfsd4_backchannel_ctl(struct svc_rqst *rqstp, return nfs_ok; } +static struct nfsd4_conn *__nfsd4_find_conn(struct svc_xprt *xpt, struct nfsd4_session *s) +{ + struct nfsd4_conn *c; + + list_for_each_entry(c, &s->se_conns, cn_persession) { + if (c->cn_xprt == xpt) { + return c; + } + } + return NULL; +} + +static __be32 nfsd4_match_existing_connection(struct svc_rqst *rqst, + struct nfsd4_session *session, u32 req, struct nfsd4_conn **conn) +{ + struct nfs4_client *clp = session->se_client; + struct svc_xprt *xpt = rqst->rq_xprt; + struct nfsd4_conn *c; + __be32 status; + + /* Following the last paragraph of RFC 5661 Section 18.34.3: */ + spin_lock(&clp->cl_lock); + c = __nfsd4_find_conn(xpt, session); + if (!c) + status = nfserr_noent; + else if (req == c->cn_flags) + status = nfs_ok; + else if (req == NFS4_CDFC4_FORE_OR_BOTH && + c->cn_flags != NFS4_CDFC4_BACK) + status = nfs_ok; + else if (req == NFS4_CDFC4_BACK_OR_BOTH && + c->cn_flags != NFS4_CDFC4_FORE) + status = nfs_ok; + else + status = nfserr_inval; + spin_unlock(&clp->cl_lock); + if (status == nfs_ok && conn) + *conn = c; + return status; +} + __be32 nfsd4_bind_conn_to_session(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) @@ -2978,6 +4190,17 @@ __be32 nfsd4_bind_conn_to_session(struct svc_rqst *rqstp, status = nfserr_wrong_cred; if (!nfsd4_mach_creds_match(session->se_client, rqstp)) goto out; + status = nfsd4_match_existing_connection(rqstp, session, + bcts->dir, &conn); + if (status == nfs_ok) { + if (bcts->dir == NFS4_CDFC4_FORE_OR_BOTH || + bcts->dir == NFS4_CDFC4_BACK) + conn->cn_flags |= NFS4_CDFC4_BACK; + nfsd4_probe_callback(session->se_client); + goto out; + } + if (status == nfserr_inval) + goto out; status = nfsd4_map_bcts_dir(&bcts->dir); if (status) goto out; @@ -3043,18 +4266,6 @@ out: return status; } -static struct nfsd4_conn *__nfsd4_find_conn(struct 
svc_xprt *xpt, struct nfsd4_session *s) -{ - struct nfsd4_conn *c; - - list_for_each_entry(c, &s->se_conns, cn_persession) { - if (c->cn_xprt == xpt) { - return c; - } - } - return NULL; -} - static __be32 nfsd4_sequence_check_conn(struct nfsd4_conn *new, struct nfsd4_session *ses) { struct nfs4_client *clp = ses->se_client; @@ -3106,12 +4317,17 @@ static bool replay_matches_cache(struct svc_rqst *rqstp, (bool)seq->cachethis) return false; /* - * If there's an error than the reply can have fewer ops than - * the call. But if we cached a reply with *more* ops than the - * call you're sending us now, then this new call is clearly not - * really a replay of the old one: + * If there's an error then the reply can have fewer ops than + * the call. */ - if (slot->sl_opcnt < argp->opcnt) + if (slot->sl_opcnt < argp->opcnt && !slot->sl_status) + return false; + /* + * But if we cached a reply with *more* ops than the call you're + * sending us now, then this new call is clearly not really a + * replay of the old one: + */ + if (slot->sl_opcnt > argp->opcnt) return false; /* This is the only check explicitly called by spec: */ if (!same_creds(&rqstp->rq_cred, &slot->sl_cred)) @@ -3125,13 +4341,43 @@ static bool replay_matches_cache(struct svc_rqst *rqstp, return true; } +/* + * Note that the response is constructed here both for the case + * of a new SEQUENCE request and for a replayed SEQUENCE request. + * We do not cache SEQUENCE responses as SEQUENCE is idempotent. + */ +static void nfsd4_construct_sequence_response(struct nfsd4_session *session, + struct nfsd4_sequence *seq) +{ + struct nfs4_client *clp = session->se_client; + + seq->maxslots_response = max(session->se_target_maxslots, + seq->maxslots); + seq->target_maxslots = session->se_target_maxslots; + + switch (clp->cl_cb_state) { + case NFSD4_CB_DOWN: + seq->status_flags = SEQ4_STATUS_CB_PATH_DOWN; + break; + case NFSD4_CB_FAULT: + seq->status_flags = SEQ4_STATUS_BACKCHANNEL_FAULT; + break; + default: + seq->status_flags = 0; + } + if (!list_empty(&clp->cl_revoked)) + seq->status_flags |= SEQ4_STATUS_RECALLABLE_STATE_REVOKED; + if (atomic_read(&clp->cl_admin_revoked)) + seq->status_flags |= SEQ4_STATUS_ADMIN_STATE_REVOKED; +} + __be32 nfsd4_sequence(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) { struct nfsd4_sequence *seq = &u->sequence; struct nfsd4_compoundres *resp = rqstp->rq_resp; - struct xdr_stream *xdr = &resp->xdr; + struct xdr_stream *xdr = resp->xdr; struct nfsd4_session *session; struct nfs4_client *clp; struct nfsd4_slot *slot; @@ -3170,16 +4416,14 @@ nfsd4_sequence(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (seq->slotid >= session->se_fchannel.maxreqs) goto out_put_session; - slot = session->se_slots[seq->slotid]; + slot = xa_load(&session->se_slots, seq->slotid); dprintk("%s: slotid %d\n", __func__, seq->slotid); - /* We do not negotiate the number of slots yet, so set the - * maxslots to the session maxreqs which is used to encode - * sr_highest_slotid and the sr_target_slot id to maxslots */ - seq->maxslots = session->se_fchannel.maxreqs; + trace_nfsd_slot_seqid_sequence(clp, seq, slot); + + nfsd4_construct_sequence_response(session, seq); - status = check_slot_seqid(seq->seqid, slot->sl_seqid, - slot->sl_flags & NFSD4_SLOT_INUSE); + status = check_slot_seqid(seq->seqid, slot->sl_seqid, slot->sl_flags); if (status == nfserr_replay_cache) { status = nfserr_seq_misordered; if (!(slot->sl_flags & NFSD4_SLOT_INITIALIZED)) @@ -3204,6 +4448,12 @@ nfsd4_sequence(struct 
svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (status) goto out_put_session; + if (session->se_target_maxslots < session->se_fchannel.maxreqs && + slot->sl_generation == session->se_slot_gen && + seq->maxslots <= session->se_target_maxslots) + /* Client acknowledged our reduce maxreqs */ + free_session_slots(session, session->se_target_maxslots); + buflen = (seq->cachethis) ? session->se_fchannel.maxresp_cached : session->se_fchannel.maxresp_sz; @@ -3211,12 +4461,14 @@ nfsd4_sequence(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, nfserr_rep_too_big; if (xdr_restrict_buflen(xdr, buflen - rqstp->rq_auth_slack)) goto out_put_session; - svc_reserve(rqstp, buflen); + svc_reserve_auth(rqstp, buflen); status = nfs_ok; - /* Success! bump slot seqid */ + /* Success! accept new slot seqid */ slot->sl_seqid = seq->seqid; + slot->sl_flags &= ~NFSD4_SLOT_REUSED; slot->sl_flags |= NFSD4_SLOT_INUSE; + slot->sl_generation = session->se_slot_gen; if (seq->cachethis) slot->sl_flags |= NFSD4_SLOT_CACHETHIS; else @@ -3226,19 +4478,49 @@ nfsd4_sequence(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, cstate->session = session; cstate->clp = clp; -out: - switch (clp->cl_cb_state) { - case NFSD4_CB_DOWN: - seq->status_flags = SEQ4_STATUS_CB_PATH_DOWN; - break; - case NFSD4_CB_FAULT: - seq->status_flags = SEQ4_STATUS_BACKCHANNEL_FAULT; - break; - default: - seq->status_flags = 0; + /* + * If the client ever uses the highest available slot, + * gently try to allocate another 20%. This allows + * fairly quick growth without grossly over-shooting what + * the client might use. + */ + if (seq->slotid == session->se_fchannel.maxreqs - 1 && + session->se_target_maxslots >= session->se_fchannel.maxreqs && + session->se_fchannel.maxreqs < NFSD_MAX_SLOTS_PER_SESSION) { + int s = session->se_fchannel.maxreqs; + int cnt = DIV_ROUND_UP(s, 5); + void *prev_slot; + + do { + /* + * GFP_NOWAIT both allows allocation under a + * spinlock, and only succeeds if there is + * plenty of memory. 
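[Editor's note] The slot-growth policy introduced here adds at most DIV_ROUND_UP(maxreqs, 5) slots, roughly 20%, whenever the client touches its highest slot. A quick user-space model of the resulting trajectory; the 2048 cap merely stands in for NFSD_MAX_SLOTS_PER_SESSION, whose real value may differ:

#include <stdio.h>

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

/*
 * Model of the session-slot growth step: grow by roughly 20% of the
 * current table each time the highest slot is used, up to an
 * illustrative cap of 2048 slots.
 */
int main(void)
{
	unsigned int maxreqs = 32;	/* illustrative starting slot count */
	int round;

	for (round = 1; round <= 10; round++) {
		unsigned int grow = DIV_ROUND_UP(maxreqs, 5);

		if (maxreqs + grow > 2048)
			grow = 2048 - maxreqs;
		maxreqs += grow;
		printf("after growth round %d: %u slots\n", round, maxreqs);
	}
	return 0;
}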
+ */ + slot = nfsd4_alloc_slot(&session->se_fchannel, s, + GFP_NOWAIT); + prev_slot = xa_load(&session->se_slots, s); + if (xa_is_value(prev_slot) && slot) { + slot->sl_seqid = xa_to_value(prev_slot); + slot->sl_flags |= NFSD4_SLOT_REUSED; + } + if (slot && + !xa_is_err(xa_store(&session->se_slots, s, slot, + GFP_NOWAIT))) { + s += 1; + session->se_fchannel.maxreqs = s; + atomic_add(s - session->se_target_maxslots, + &nfsd_total_target_slots); + session->se_target_maxslots = s; + } else { + kfree(slot); + slot = NULL; + } + } while (slot && --cnt > 0); } - if (!list_empty(&clp->cl_revoked)) - seq->status_flags |= SEQ4_STATUS_RECALLABLE_STATE_REVOKED; + +out: + trace_nfsd_seq4_status(rqstp, seq); out_no_session: if (conn) free_conn(conn); @@ -3301,6 +4583,7 @@ nfsd4_destroy_clientid(struct svc_rqst *rqstp, status = nfserr_wrong_cred; goto out; } + trace_nfsd_clid_destroyed(&clp->cl_clientid); unhash_client_locked(clp); out: spin_unlock(&nn->client_lock); @@ -3314,6 +4597,7 @@ nfsd4_reclaim_complete(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) { struct nfsd4_reclaim_complete *rc = &u->reclaim_complete; + struct nfs4_client *clp = cstate->clp; __be32 status = 0; if (rc->rca_one_fs) { @@ -3327,12 +4611,11 @@ nfsd4_reclaim_complete(struct svc_rqst *rqstp, } status = nfserr_complete_already; - if (test_and_set_bit(NFSD4_CLIENT_RECLAIM_COMPLETE, - &cstate->session->se_client->cl_flags)) + if (test_and_set_bit(NFSD4_CLIENT_RECLAIM_COMPLETE, &clp->cl_flags)) goto out; status = nfserr_stale_clientid; - if (is_client_expired(cstate->session->se_client)) + if (is_client_expired(clp)) /* * The following error isn't really legal. * But we only get here if the client just explicitly @@ -3343,7 +4626,9 @@ nfsd4_reclaim_complete(struct svc_rqst *rqstp, goto out; status = nfs_ok; - nfsd4_client_record_create(cstate->session->se_client); + trace_nfsd_clid_reclaim_complete(&clp->cl_clientid); + nfsd4_client_record_create(clp); + inc_reclaim_complete(clp); out: return status; } @@ -3363,32 +4648,29 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, new = create_client(clname, rqstp, &clverifier); if (new == NULL) return nfserr_jukebox; - /* Cases below refer to rfc 3530 section 14.2.33: */ spin_lock(&nn->client_lock); conf = find_confirmed_client_by_name(&clname, nn); if (conf && client_has_state(conf)) { - /* case 0: */ status = nfserr_clid_inuse; if (clp_used_exchangeid(conf)) goto out; if (!same_creds(&conf->cl_cred, &rqstp->rq_cred)) { - char addr_str[INET6_ADDRSTRLEN]; - rpc_ntop((struct sockaddr *) &conf->cl_addr, addr_str, - sizeof(addr_str)); - dprintk("NFSD: setclientid: string in use by client " - "at %s\n", addr_str); + trace_nfsd_clid_cred_mismatch(conf, rqstp); goto out; } } unconf = find_unconfirmed_client_by_name(&clname, nn); if (unconf) unhash_client_locked(unconf); - if (conf && same_verf(&conf->cl_verifier, &clverifier)) { - /* case 1: probable callback update */ - copy_clid(new, conf); - gen_confirm(new, nn); - } else /* case 4 (new client) or cases 2, 3 (client reboot): */ - gen_clid(new, nn); + if (conf) { + if (same_verf(&conf->cl_verifier, &clverifier)) { + copy_clid(new, conf); + gen_confirm(new, nn); + } else + trace_nfsd_clid_verf_mismatch(conf, rqstp, + &clverifier); + } else + trace_nfsd_clid_fresh(new); new->cl_minorversion = 0; gen_callback(new, setclid, rqstp); add_to_unconfirmed(new); @@ -3401,12 +4683,13 @@ out: spin_unlock(&nn->client_lock); if (new) free_client(new); - if (unconf) + if (unconf) { + 
trace_nfsd_clid_expire_unconf(&unconf->cl_clientid); expire_client(unconf); + } return status; } - __be32 nfsd4_setclientid_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, @@ -3435,43 +4718,60 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp, * Nevertheless, RFC 7530 recommends INUSE for this case: */ status = nfserr_clid_inuse; - if (unconf && !same_creds(&unconf->cl_cred, &rqstp->rq_cred)) + if (unconf && !same_creds(&unconf->cl_cred, &rqstp->rq_cred)) { + trace_nfsd_clid_cred_mismatch(unconf, rqstp); goto out; - if (conf && !same_creds(&conf->cl_cred, &rqstp->rq_cred)) + } + if (conf && !same_creds(&conf->cl_cred, &rqstp->rq_cred)) { + trace_nfsd_clid_cred_mismatch(conf, rqstp); goto out; - /* cases below refer to rfc 3530 section 14.2.34: */ + } if (!unconf || !same_verf(&confirm, &unconf->cl_confirm)) { if (conf && same_verf(&confirm, &conf->cl_confirm)) { - /* case 2: probable retransmit */ status = nfs_ok; - } else /* case 4: client hasn't noticed we rebooted yet? */ + } else status = nfserr_stale_clientid; goto out; } status = nfs_ok; - if (conf) { /* case 1: callback update */ - old = unconf; - unhash_client_locked(old); - nfsd4_change_callback(conf, &unconf->cl_cb_conn); - } else { /* case 3: normal case; new or rebooted client */ + if (conf) { + if (get_client_locked(conf) == nfs_ok) { + old = unconf; + unhash_client_locked(old); + nfsd4_change_callback(conf, &unconf->cl_cb_conn); + } else { + conf = NULL; + } + } + + if (!conf) { old = find_confirmed_client_by_name(&unconf->cl_name, nn); if (old) { status = nfserr_clid_inuse; if (client_has_state(old) && !same_creds(&unconf->cl_cred, - &old->cl_cred)) + &old->cl_cred)) { + old = NULL; goto out; + } status = mark_client_expired_locked(old); if (status) { old = NULL; goto out; } + trace_nfsd_clid_replaced(&old->cl_clientid); + } + status = get_client_locked(unconf); + if (status != nfs_ok) { + old = NULL; + goto out; } move_to_confirmed(unconf); conf = unconf; } - get_client_locked(conf); spin_unlock(&nn->client_lock); + if (conf == unconf) + fsnotify_dentry(conf->cl_nfsd_info_dentry, FS_MODIFY); nfsd4_probe_callback(conf); spin_lock(&nn->client_lock); put_client_renew_locked(conf); @@ -3488,27 +4788,27 @@ static struct nfs4_file *nfsd4_alloc_file(void) } /* OPEN Share state helper functions */ -static void nfsd4_init_file(struct knfsd_fh *fh, unsigned int hashval, - struct nfs4_file *fp) -{ - lockdep_assert_held(&state_lock); +static void nfsd4_file_init(const struct svc_fh *fh, struct nfs4_file *fp) +{ refcount_set(&fp->fi_ref, 1); spin_lock_init(&fp->fi_lock); INIT_LIST_HEAD(&fp->fi_stateids); INIT_LIST_HEAD(&fp->fi_delegations); INIT_LIST_HEAD(&fp->fi_clnt_odstate); - fh_copy_shallow(&fp->fi_fhandle, fh); + fh_copy_shallow(&fp->fi_fhandle, &fh->fh_handle); fp->fi_deleg_file = NULL; + fp->fi_rdeleg_file = NULL; fp->fi_had_conflict = false; fp->fi_share_deny = 0; memset(fp->fi_fds, 0, sizeof(fp->fi_fds)); memset(fp->fi_access, 0, sizeof(fp->fi_access)); + fp->fi_aliased = false; + fp->fi_inode = d_inode(fh->fh_dentry); #ifdef CONFIG_NFSD_PNFS INIT_LIST_HEAD(&fp->fi_lo_states); atomic_set(&fp->fi_lo_recalls, 0); #endif - hlist_add_head_rcu(&fp->fi_hash, &file_hashtbl[hashval]); } void @@ -3526,32 +4826,25 @@ nfsd4_free_slabs(void) int nfsd4_init_slabs(void) { - client_slab = kmem_cache_create("nfsd4_clients", - sizeof(struct nfs4_client), 0, 0, NULL); + client_slab = KMEM_CACHE(nfs4_client, 0); if (client_slab == NULL) goto out; - openowner_slab = kmem_cache_create("nfsd4_openowners", - sizeof(struct 
nfs4_openowner), 0, 0, NULL); + openowner_slab = KMEM_CACHE(nfs4_openowner, 0); if (openowner_slab == NULL) goto out_free_client_slab; - lockowner_slab = kmem_cache_create("nfsd4_lockowners", - sizeof(struct nfs4_lockowner), 0, 0, NULL); + lockowner_slab = KMEM_CACHE(nfs4_lockowner, 0); if (lockowner_slab == NULL) goto out_free_openowner_slab; - file_slab = kmem_cache_create("nfsd4_files", - sizeof(struct nfs4_file), 0, 0, NULL); + file_slab = KMEM_CACHE(nfs4_file, 0); if (file_slab == NULL) goto out_free_lockowner_slab; - stateid_slab = kmem_cache_create("nfsd4_stateids", - sizeof(struct nfs4_ol_stateid), 0, 0, NULL); + stateid_slab = KMEM_CACHE(nfs4_ol_stateid, 0); if (stateid_slab == NULL) goto out_free_file_slab; - deleg_slab = kmem_cache_create("nfsd4_delegations", - sizeof(struct nfs4_delegation), 0, 0, NULL); + deleg_slab = KMEM_CACHE(nfs4_delegation, 0); if (deleg_slab == NULL) goto out_free_stateid_slab; - odstate_slab = kmem_cache_create("nfsd4_odstate", - sizeof(struct nfs4_clnt_odstate), 0, 0, NULL); + odstate_slab = KMEM_CACHE(nfs4_clnt_odstate, 0); if (odstate_slab == NULL) goto out_free_deleg_slab; return 0; @@ -3569,25 +4862,79 @@ out_free_openowner_slab: out_free_client_slab: kmem_cache_destroy(client_slab); out: - dprintk("nfsd4: out of memory while initializing nfsv4\n"); return -ENOMEM; } +static unsigned long +nfsd4_state_shrinker_count(struct shrinker *shrink, struct shrink_control *sc) +{ + struct nfsd_net *nn = shrink->private_data; + long count; + + count = atomic_read(&nn->nfsd_courtesy_clients); + if (!count) + count = atomic_long_read(&num_delegations); + if (count) + queue_work(laundry_wq, &nn->nfsd_shrinker_work); + return (unsigned long)count; +} + +static unsigned long +nfsd4_state_shrinker_scan(struct shrinker *shrink, struct shrink_control *sc) +{ + return SHRINK_STOP; +} + +void +nfsd4_init_leases_net(struct nfsd_net *nn) +{ + struct sysinfo si; + u64 max_clients; + + nn->nfsd4_lease = 90; /* default lease time */ + nn->nfsd4_grace = 90; + nn->somebody_reclaimed = false; + nn->track_reclaim_completes = false; + nn->clverifier_counter = get_random_u32(); + nn->clientid_base = get_random_u32(); + nn->clientid_counter = nn->clientid_base + 1; + nn->s2s_cp_cl_id = nn->clientid_counter++; + + atomic_set(&nn->nfs4_client_count, 0); + si_meminfo(&si); + max_clients = (u64)si.totalram * si.mem_unit / (1024 * 1024 * 1024); + max_clients *= NFS4_CLIENTS_PER_GB; + nn->nfs4_max_clients = max_t(int, max_clients, NFS4_CLIENTS_PER_GB); + + atomic_set(&nn->nfsd_courtesy_clients, 0); +} + +enum rp_lock { + RP_UNLOCKED, + RP_LOCKED, + RP_UNHASHED, +}; + static void init_nfs4_replay(struct nfs4_replay *rp) { rp->rp_status = nfserr_serverfault; rp->rp_buflen = 0; rp->rp_buf = rp->rp_ibuf; - mutex_init(&rp->rp_mutex); + rp->rp_locked = RP_UNLOCKED; } -static void nfsd4_cstate_assign_replay(struct nfsd4_compound_state *cstate, - struct nfs4_stateowner *so) +static int nfsd4_cstate_assign_replay(struct nfsd4_compound_state *cstate, + struct nfs4_stateowner *so) { if (!nfsd4_has_session(cstate)) { - mutex_lock(&so->so_replay.rp_mutex); + wait_var_event(&so->so_replay.rp_locked, + cmpxchg(&so->so_replay.rp_locked, + RP_UNLOCKED, RP_LOCKED) != RP_LOCKED); + if (so->so_replay.rp_locked == RP_UNHASHED) + return -EAGAIN; cstate->replay_owner = nfs4_get_stateowner(so); } + return 0; } void nfsd4_cstate_clear_replay(struct nfsd4_compound_state *cstate) @@ -3596,7 +4943,7 @@ void nfsd4_cstate_clear_replay(struct nfsd4_compound_state *cstate) if (so != NULL) { cstate->replay_owner = 
NULL; - mutex_unlock(&so->so_replay.rp_mutex); + store_release_wake_up(&so->so_replay.rp_locked, RP_UNLOCKED); nfs4_put_stateowner(so); } } @@ -3609,12 +4956,11 @@ static inline void *alloc_stateowner(struct kmem_cache *slab, struct xdr_netobj if (!sop) return NULL; - sop->so_owner.data = kmemdup(owner->data, owner->len, GFP_KERNEL); + xdr_netobj_dup(&sop->so_owner, owner, GFP_KERNEL); if (!sop->so_owner.data) { kmem_cache_free(slab, sop); return NULL; } - sop->so_owner.len = owner->len; INIT_LIST_HEAD(&sop->so_stateids); sop->so_client = clp; @@ -3663,7 +5009,8 @@ nfsd4_find_existing_open(struct nfs4_file *fp, struct nfsd4_open *open) continue; if (local->st_stateowner != &oo->oo_owner) continue; - if (local->st_stid.sc_type == NFS4_OPEN_STID) { + if (local->st_stid.sc_type == SC_TYPE_OPEN && + !local->st_stid.sc_status) { ret = local; refcount_inc(&ret->st_stid.sc_count); break; @@ -3672,22 +5019,75 @@ nfsd4_find_existing_open(struct nfs4_file *fp, struct nfsd4_open *open) return ret; } -static __be32 -nfsd4_verify_open_stid(struct nfs4_stid *s) +static void nfsd4_drop_revoked_stid(struct nfs4_stid *s) + __releases(&s->sc_client->cl_lock) { - __be32 ret = nfs_ok; + struct nfs4_client *cl = s->sc_client; + LIST_HEAD(reaplist); + struct nfs4_ol_stateid *stp; + struct nfs4_delegation *dp; + bool unhashed; switch (s->sc_type) { - default: + case SC_TYPE_OPEN: + stp = openlockstateid(s); + if (unhash_open_stateid(stp, &reaplist)) + put_ol_stateid_locked(stp, &reaplist); + spin_unlock(&cl->cl_lock); + free_ol_stateid_reaplist(&reaplist); break; - case 0: - case NFS4_CLOSED_STID: - case NFS4_CLOSED_DELEG_STID: - ret = nfserr_bad_stateid; + case SC_TYPE_LOCK: + stp = openlockstateid(s); + unhashed = unhash_lock_stateid(stp); + spin_unlock(&cl->cl_lock); + if (unhashed) + nfs4_put_stid(s); break; - case NFS4_REVOKED_DELEG_STID: - ret = nfserr_deleg_revoked; + case SC_TYPE_DELEG: + dp = delegstateid(s); + list_del_init(&dp->dl_recall_lru); + spin_unlock(&cl->cl_lock); + nfs4_put_stid(s); + break; + default: + spin_unlock(&cl->cl_lock); } +} + +static void nfsd40_drop_revoked_stid(struct nfs4_client *cl, + stateid_t *stid) +{ + /* NFSv4.0 has no way for the client to tell the server + * that it can forget an admin-revoked stateid. + * So we keep it around until the first time that the + * client uses it, and drop it the first time + * nfserr_admin_revoked is returned. + * For v4.1 and later we wait until explicitly told + * to free the stateid. 
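[Editor's note] A few hunks back, nfsd4_init_leases_net() scales nn->nfs4_max_clients with installed RAM. The same arithmetic in user-space terms, with NFS4_CLIENTS_PER_GB assumed here to be 1024:

#include <stdio.h>
#include <unistd.h>

#define NFS4_CLIENTS_PER_GB	1024	/* assumed value of the kernel constant */

/*
 * Reproduce the nfs4_max_clients sizing: one "GB allowance" of clients
 * per gigabyte of RAM, never less than a single gigabyte's allowance.
 */
int main(void)
{
	long pages = sysconf(_SC_PHYS_PAGES);
	long page_size = sysconf(_SC_PAGE_SIZE);
	unsigned long long ram_gb =
		(unsigned long long)pages * page_size / (1024ULL * 1024 * 1024);
	unsigned long long max_clients = ram_gb * NFS4_CLIENTS_PER_GB;

	if (max_clients < NFS4_CLIENTS_PER_GB)
		max_clients = NFS4_CLIENTS_PER_GB;
	printf("RAM: %lluGiB -> nfs4_max_clients: %llu\n", ram_gb, max_clients);
	return 0;
}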
+ */ + if (cl->cl_minorversion == 0) { + struct nfs4_stid *st; + + spin_lock(&cl->cl_lock); + st = find_stateid_locked(cl, stid); + if (st) + nfsd4_drop_revoked_stid(st); + else + spin_unlock(&cl->cl_lock); + } +} + +static __be32 +nfsd4_verify_open_stid(struct nfs4_stid *s) +{ + __be32 ret = nfs_ok; + + if (s->sc_status & SC_STATUS_ADMIN_REVOKED) + ret = nfserr_admin_revoked; + else if (s->sc_status & SC_STATUS_REVOKED) + ret = nfserr_deleg_revoked; + else if (s->sc_status & SC_STATUS_CLOSED) + ret = nfserr_bad_stateid; return ret; } @@ -3699,6 +5099,10 @@ nfsd4_lock_ol_stateid(struct nfs4_ol_stateid *stp) mutex_lock_nested(&stp->st_mutex, LOCK_STATEID_MUTEX); ret = nfsd4_verify_open_stid(&stp->st_stid); + if (ret == nfserr_admin_revoked) + nfsd40_drop_revoked_stid(stp->st_stid.sc_client, + &stp->st_stid.sc_stateid); + if (ret != nfs_ok) mutex_unlock(&stp->st_mutex); return ret; @@ -3720,34 +5124,46 @@ nfsd4_find_and_lock_existing_open(struct nfs4_file *fp, struct nfsd4_open *open) } static struct nfs4_openowner * -alloc_init_open_stateowner(unsigned int strhashval, struct nfsd4_open *open, - struct nfsd4_compound_state *cstate) +find_or_alloc_open_stateowner(unsigned int strhashval, struct nfsd4_open *open, + struct nfsd4_compound_state *cstate) { struct nfs4_client *clp = cstate->clp; - struct nfs4_openowner *oo, *ret; + struct nfs4_openowner *oo, *new = NULL; - oo = alloc_stateowner(openowner_slab, &open->op_owner, clp); - if (!oo) - return NULL; - oo->oo_owner.so_ops = &openowner_ops; - oo->oo_owner.so_is_open_owner = 1; - oo->oo_owner.so_seqid = open->op_seqid; - oo->oo_flags = 0; - if (nfsd4_has_session(cstate)) - oo->oo_flags |= NFS4_OO_CONFIRMED; - oo->oo_time = 0; - oo->oo_last_closed_stid = NULL; - INIT_LIST_HEAD(&oo->oo_close_lru); +retry: spin_lock(&clp->cl_lock); - ret = find_openstateowner_str_locked(strhashval, open, clp); - if (ret == NULL) { - hash_openowner(oo, clp, strhashval); - ret = oo; - } else - nfs4_free_stateowner(&oo->oo_owner); - + oo = find_openstateowner_str(strhashval, open, clp); + if (!oo && new) { + hash_openowner(new, clp, strhashval); + spin_unlock(&clp->cl_lock); + return new; + } spin_unlock(&clp->cl_lock); - return ret; + + if (oo && !(oo->oo_flags & NFS4_OO_CONFIRMED)) { + /* Replace unconfirmed owners without checking for replay. */ + release_openowner(oo); + oo = NULL; + } + if (oo) { + if (new) + nfs4_free_stateowner(&new->oo_owner); + return oo; + } + + new = alloc_stateowner(openowner_slab, &open->op_owner, clp); + if (!new) + return NULL; + new->oo_owner.so_ops = &openowner_ops; + new->oo_owner.so_is_open_owner = 1; + new->oo_owner.so_seqid = open->op_seqid; + new->oo_flags = 0; + if (nfsd4_has_session(cstate)) + new->oo_flags |= NFS4_OO_CONFIRMED; + new->oo_time = 0; + new->oo_last_closed_stid = NULL; + INIT_LIST_HEAD(&new->oo_close_lru); + goto retry; } static struct nfs4_ol_stateid * @@ -3767,13 +5183,19 @@ retry: spin_lock(&oo->oo_owner.so_client->cl_lock); spin_lock(&fp->fi_lock); + if (nfs4_openowner_unhashed(oo)) { + mutex_unlock(&stp->st_mutex); + stp = NULL; + goto out_unlock; + } + retstp = nfsd4_find_existing_open(fp, open); if (retstp) goto out_unlock; open->op_stp = NULL; refcount_inc(&stp->st_stid.sc_count); - stp->st_stid.sc_type = NFS4_OPEN_STID; + stp->st_stid.sc_type = SC_TYPE_OPEN; INIT_LIST_HEAD(&stp->st_locks); stp->st_stateowner = nfs4_get_stateowner(&oo->oo_owner); get_nfs4_file(fp); @@ -3823,7 +5245,10 @@ move_to_close_lru(struct nfs4_ol_stateid *s, struct net *net) * Wait for the refcount to drop to 2. 
Since it has been unhashed, * there should be no danger of the refcount going back up again at * this point. + * Some threads with a reference might be waiting for rp_locked, + * so tell them to stop waiting. */ + store_release_wake_up(&oo->oo_owner.so_replay.rp_locked, RP_UNHASHED); wait_event(close_wq, refcount_read(&s->st_stid.sc_count) == 2); release_all_access(s); @@ -3836,60 +5261,86 @@ move_to_close_lru(struct nfs4_ol_stateid *s, struct net *net) last = oo->oo_last_closed_stid; oo->oo_last_closed_stid = s; list_move_tail(&oo->oo_close_lru, &nn->close_lru); - oo->oo_time = get_seconds(); + oo->oo_time = ktime_get_boottime_seconds(); spin_unlock(&nn->client_lock); if (last) nfs4_put_stid(&last->st_stid); } -/* search file_hashtbl[] for file */ -static struct nfs4_file * -find_file_locked(struct knfsd_fh *fh, unsigned int hashval) +static noinline_for_stack struct nfs4_file * +nfsd4_file_hash_lookup(const struct svc_fh *fhp) { - struct nfs4_file *fp; + struct inode *inode = d_inode(fhp->fh_dentry); + struct rhlist_head *tmp, *list; + struct nfs4_file *fi; - hlist_for_each_entry_rcu(fp, &file_hashtbl[hashval], fi_hash) { - if (fh_match(&fp->fi_fhandle, fh)) { - if (refcount_inc_not_zero(&fp->fi_ref)) - return fp; + rcu_read_lock(); + list = rhltable_lookup(&nfs4_file_rhltable, &inode, + nfs4_file_rhash_params); + rhl_for_each_entry_rcu(fi, tmp, list, fi_rlist) { + if (fh_match(&fi->fi_fhandle, &fhp->fh_handle)) { + if (refcount_inc_not_zero(&fi->fi_ref)) { + rcu_read_unlock(); + return fi; + } } } + rcu_read_unlock(); return NULL; } -struct nfs4_file * -find_file(struct knfsd_fh *fh) -{ - struct nfs4_file *fp; - unsigned int hashval = file_hashval(fh); +/* + * On hash insertion, identify entries with the same inode but + * distinct filehandles. They will all be on the list returned + * by rhltable_lookup(). + * + * inode->i_lock prevents racing insertions from adding an entry + * for the same inode/fhp pair twice. 
+ */ +static noinline_for_stack struct nfs4_file * +nfsd4_file_hash_insert(struct nfs4_file *new, const struct svc_fh *fhp) +{ + struct inode *inode = d_inode(fhp->fh_dentry); + struct rhlist_head *tmp, *list; + struct nfs4_file *ret = NULL; + bool alias_found = false; + struct nfs4_file *fi; + int err; rcu_read_lock(); - fp = find_file_locked(fh, hashval); - rcu_read_unlock(); - return fp; -} + spin_lock(&inode->i_lock); -static struct nfs4_file * -find_or_add_file(struct nfs4_file *new, struct knfsd_fh *fh) -{ - struct nfs4_file *fp; - unsigned int hashval = file_hashval(fh); + list = rhltable_lookup(&nfs4_file_rhltable, &inode, + nfs4_file_rhash_params); + rhl_for_each_entry_rcu(fi, tmp, list, fi_rlist) { + if (fh_match(&fi->fi_fhandle, &fhp->fh_handle)) { + if (refcount_inc_not_zero(&fi->fi_ref)) + ret = fi; + } else + fi->fi_aliased = alias_found = true; + } + if (ret) + goto out_unlock; - rcu_read_lock(); - fp = find_file_locked(fh, hashval); - rcu_read_unlock(); - if (fp) - return fp; + nfsd4_file_init(fhp, new); + err = rhltable_insert(&nfs4_file_rhltable, &new->fi_rlist, + nfs4_file_rhash_params); + if (err) + goto out_unlock; - spin_lock(&state_lock); - fp = find_file_locked(fh, hashval); - if (likely(fp == NULL)) { - nfsd4_init_file(fh, hashval, new); - fp = new; - } - spin_unlock(&state_lock); + new->fi_aliased = alias_found; + ret = new; - return fp; +out_unlock: + spin_unlock(&inode->i_lock); + rcu_read_unlock(); + return ret; +} + +static noinline_for_stack void nfsd4_file_hash_remove(struct nfs4_file *fi) +{ + rhltable_remove(&nfs4_file_rhltable, &fi->fi_rlist, + nfs4_file_rhash_params); } /* @@ -3902,9 +5353,10 @@ nfs4_share_conflict(struct svc_fh *current_fh, unsigned int deny_type) struct nfs4_file *fp; __be32 ret = nfs_ok; - fp = find_file(¤t_fh->fh_handle); + fp = nfsd4_file_hash_lookup(current_fh); if (!fp) return ret; + /* Check for conflicting share reservations */ spin_lock(&fp->fi_lock); if (fp->fi_share_deny & deny_type) @@ -3914,6 +5366,35 @@ nfs4_share_conflict(struct svc_fh *current_fh, unsigned int deny_type) return ret; } +static bool nfsd4_deleg_present(const struct inode *inode) +{ + struct file_lock_context *ctx = locks_inode_context(inode); + + return ctx && !list_empty_careful(&ctx->flc_lease); +} + +/** + * nfsd_wait_for_delegreturn - wait for delegations to be returned + * @rqstp: the RPC transaction being executed + * @inode: in-core inode of the file being waited for + * + * The timeout prevents deadlock if all nfsd threads happen to be + * tied up waiting for returning delegations. + * + * Return values: + * %true: delegation was returned + * %false: timed out waiting for delegreturn + */ +bool nfsd_wait_for_delegreturn(struct svc_rqst *rqstp, struct inode *inode) +{ + long __maybe_unused timeo; + + timeo = wait_var_event_timeout(inode, !nfsd4_deleg_present(inode), + NFSD_DELEGRETURN_TIMEOUT); + trace_nfsd_delegret_wakeup(rqstp, inode, timeo); + return timeo > 0; +} + static void nfsd4_cb_recall_prepare(struct nfsd4_callback *cb) { struct nfs4_delegation *dp = cb_to_delegation(cb); @@ -3930,8 +5411,8 @@ static void nfsd4_cb_recall_prepare(struct nfsd4_callback *cb) * queued for a lease break. Don't queue it again. 
*/ spin_lock(&state_lock); - if (dp->dl_time == 0) { - dp->dl_time = get_seconds(); + if (delegation_hashed(dp) && dp->dl_time == 0) { + dp->dl_time = ktime_get_boottime_seconds(); list_add_tail(&dp->dl_recall_lru, &nn->del_recall_lru); } spin_unlock(&state_lock); @@ -3942,12 +5423,18 @@ static int nfsd4_cb_recall_done(struct nfsd4_callback *cb, { struct nfs4_delegation *dp = cb_to_delegation(cb); - if (dp->dl_stid.sc_type == NFS4_CLOSED_DELEG_STID) - return 1; + trace_nfsd_cb_recall_done(&dp->dl_stid.sc_stateid, task); + + if (dp->dl_stid.sc_status) + /* CLOSED or REVOKED */ + return 1; switch (task->tk_status) { case 0: return 1; + case -NFS4ERR_DELAY: + rpc_delay(task, 2 * HZ); + return 0; case -EBADHANDLE: case -NFS4ERR_BAD_STATEID: /* @@ -3958,9 +5445,9 @@ static int nfsd4_cb_recall_done(struct nfsd4_callback *cb, rpc_delay(task, 2 * HZ); return 0; } - /*FALLTHRU*/ + fallthrough; default: - return -1; + return 1; } } @@ -3975,28 +5462,47 @@ static const struct nfsd4_callback_ops nfsd4_cb_recall_ops = { .prepare = nfsd4_cb_recall_prepare, .done = nfsd4_cb_recall_done, .release = nfsd4_cb_recall_release, + .opcode = OP_CB_RECALL, }; static void nfsd_break_one_deleg(struct nfs4_delegation *dp) { + bool queued; + + if (test_and_set_bit(NFSD4_CALLBACK_RUNNING, &dp->dl_recall.cb_flags)) + return; + /* * We're assuming the state code never drops its reference * without first removing the lease. Since we're in this lease * callback (and since the lease code is serialized by the - * i_lock) we know the server hasn't removed the lease yet, and + * flc_lock) we know the server hasn't removed the lease yet, and * we know it's safe to take a reference. */ refcount_inc(&dp->dl_stid.sc_count); - nfsd4_run_cb(&dp->dl_recall); + queued = nfsd4_run_cb(&dp->dl_recall); + WARN_ON_ONCE(!queued); + if (!queued) + refcount_dec(&dp->dl_stid.sc_count); } -/* Called from break_lease() with i_lock held. */ +/* Called from break_lease() with flc_lock held. */ static bool -nfsd_break_deleg_cb(struct file_lock *fl) +nfsd_break_deleg_cb(struct file_lease *fl) { - bool ret = false; - struct nfs4_delegation *dp = (struct nfs4_delegation *)fl->fl_owner; + struct nfs4_delegation *dp = (struct nfs4_delegation *) fl->c.flc_owner; struct nfs4_file *fp = dp->dl_stid.sc_file; + struct nfs4_client *clp = dp->dl_stid.sc_client; + struct nfsd_net *nn; + + trace_nfsd_cb_recall(&dp->dl_stid); + + dp->dl_recalled = true; + atomic_inc(&clp->cl_delegs_in_recall); + if (try_to_expire_client(clp)) { + nn = net_generic(clp->net, nfsd_net_id); + mod_delayed_work(laundry_wq, &nn->laundromat_work, 0); + } /* * We don't want the locks code to timeout the lease for us; @@ -4005,24 +5511,49 @@ nfsd_break_deleg_cb(struct file_lock *fl) */ fl->fl_break_time = 0; - spin_lock(&fp->fi_lock); fp->fi_had_conflict = true; nfsd_break_one_deleg(dp); - spin_unlock(&fp->fi_lock); - return ret; + return false; +} + +/** + * nfsd_breaker_owns_lease - Check if lease conflict was resolved + * @fl: Lock state to check + * + * Return values: + * %true: Lease conflict was resolved + * %false: Lease conflict was not resolved. 
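[Editor's note] nfsd_breaker_owns_lease() above treats a lease conflict as already resolved only when the thread breaking the lease is an NFSv4 request from the very client that holds the delegation. A toy model of that decision with illustrative stand-in structures:

#include <stdbool.h>
#include <stdio.h>

/* Illustrative stand-ins for the kernel structures involved. */
struct client { int id; };
struct delegation { struct client *owner; };
struct request { bool is_nfsv4; struct client *client; };

/*
 * The decision made by nfsd_breaker_owns_lease(): the conflict counts as
 * resolved only when the lease breaker is an NFSv4 request issued by the
 * same client that holds the delegation.
 */
static bool breaker_owns_lease(const struct request *rq, const struct delegation *dl)
{
	if (!rq || !rq->is_nfsv4)
		return false;
	return rq->client == dl->owner;
}

int main(void)
{
	struct client c1 = { 1 }, c2 = { 2 };
	struct delegation dl = { &c1 };
	struct request same = { true, &c1 }, other = { true, &c2 };

	printf("%d %d\n", breaker_owns_lease(&same, &dl),
			  breaker_owns_lease(&other, &dl));	/* 1 0 */
	return 0;
}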
+ */ +static bool nfsd_breaker_owns_lease(struct file_lease *fl) +{ + struct nfs4_delegation *dl = fl->c.flc_owner; + struct svc_rqst *rqst; + struct nfs4_client *clp; + + rqst = nfsd_current_rqst(); + if (!nfsd_v4client(rqst)) + return false; + clp = *(rqst->rq_lease_breaker); + return dl->dl_stid.sc_client == clp; } static int -nfsd_change_deleg_cb(struct file_lock *onlist, int arg, +nfsd_change_deleg_cb(struct file_lease *onlist, int arg, struct list_head *dispose) { - if (arg & F_UNLCK) + struct nfs4_delegation *dp = (struct nfs4_delegation *) onlist->c.flc_owner; + struct nfs4_client *clp = dp->dl_stid.sc_client; + + if (arg & F_UNLCK) { + if (dp->dl_recalled) + atomic_dec(&clp->cl_delegs_in_recall); return lease_modify(onlist, arg, dispose); - else + } else return -EAGAIN; } -static const struct lock_manager_operations nfsd_lease_mng_ops = { +static const struct lease_manager_operations nfsd_lease_mng_ops = { + .lm_breaker_owns_lease = nfsd_breaker_owns_lease, .lm_break = nfsd_break_deleg_cb, .lm_change = nfsd_change_deleg_cb, }; @@ -4038,39 +5569,37 @@ static __be32 nfsd4_check_seqid(struct nfsd4_compound_state *cstate, struct nfs4 return nfserr_bad_seqid; } -static __be32 lookup_clientid(clientid_t *clid, - struct nfsd4_compound_state *cstate, - struct nfsd_net *nn) +static struct nfs4_client *lookup_clientid(clientid_t *clid, bool sessions, + struct nfsd_net *nn) { struct nfs4_client *found; + spin_lock(&nn->client_lock); + found = find_confirmed_client(clid, sessions, nn); + if (found) + atomic_inc(&found->cl_rpc_users); + spin_unlock(&nn->client_lock); + return found; +} + +static __be32 set_client(clientid_t *clid, + struct nfsd4_compound_state *cstate, + struct nfsd_net *nn) +{ if (cstate->clp) { - found = cstate->clp; - if (!same_clid(&found->cl_clientid, clid)) + if (!same_clid(&cstate->clp->cl_clientid, clid)) return nfserr_stale_clientid; return nfs_ok; } - if (STALE_CLIENTID(clid, nn)) return nfserr_stale_clientid; - /* - * For v4.1+ we get the client in the SEQUENCE op. If we don't have one - * cached already then we know this is for is for v4.0 and "sessions" - * will be false. + * We're in the 4.0 case (otherwise the SEQUENCE op would have + * set cstate->clp), so session = false: */ - WARN_ON_ONCE(cstate->session); - spin_lock(&nn->client_lock); - found = find_confirmed_client(clid, false, nn); - if (!found) { - spin_unlock(&nn->client_lock); + cstate->clp = lookup_clientid(clid, false, nn); + if (!cstate->clp) return nfserr_expired; - } - atomic_inc(&found->cl_refcount); - spin_unlock(&nn->client_lock); - - /* Cache the nfs4_client in cstate! 
*/ - cstate->clp = found; return nfs_ok; } @@ -4084,8 +5613,6 @@ nfsd4_process_open1(struct nfsd4_compound_state *cstate, struct nfs4_openowner *oo = NULL; __be32 status; - if (STALE_CLIENTID(&open->op_clientid, nn)) - return nfserr_stale_clientid; /* * In case we need it later, after we've already created the * file and don't want to risk a further failure: @@ -4094,33 +5621,25 @@ nfsd4_process_open1(struct nfsd4_compound_state *cstate, if (open->op_file == NULL) return nfserr_jukebox; - status = lookup_clientid(clientid, cstate, nn); + status = set_client(clientid, cstate, nn); if (status) return status; clp = cstate->clp; strhashval = ownerstr_hashval(&open->op_owner); - oo = find_openstateowner_str(strhashval, open, clp); +retry: + oo = find_or_alloc_open_stateowner(strhashval, open, cstate); open->op_openowner = oo; - if (!oo) { - goto new_owner; - } - if (!(oo->oo_flags & NFS4_OO_CONFIRMED)) { - /* Replace unconfirmed owners without checking for replay. */ - release_openowner(oo); - open->op_openowner = NULL; - goto new_owner; + if (!oo) + return nfserr_jukebox; + if (nfsd4_cstate_assign_replay(cstate, &oo->oo_owner) == -EAGAIN) { + nfs4_put_stateowner(&oo->oo_owner); + goto retry; } status = nfsd4_check_seqid(cstate, &oo->oo_owner, open->op_seqid); if (status) return status; - goto alloc_stateid; -new_owner: - oo = alloc_init_open_stateowner(strhashval, open, cstate); - if (oo == NULL) - return nfserr_jukebox; - open->op_openowner = oo; -alloc_stateid: + open->op_stp = nfs4_alloc_open_stateid(clp); if (!open->op_stp) return nfserr_jukebox; @@ -4138,7 +5657,7 @@ alloc_stateid: static inline __be32 nfs4_check_delegmode(struct nfs4_delegation *dp, int flags) { - if ((flags & WR_STATE) && (dp->dl_type == NFS4_OPEN_DELEGATE_READ)) + if (!(flags & RD_STATE) && deleg_is_read(dp->dl_type)) return nfserr_openmode; else return nfs_ok; @@ -4149,12 +5668,12 @@ static int share_access_to_flags(u32 share_access) return share_access == NFS4_SHARE_ACCESS_READ ? 
RD_STATE : WR_STATE; } -static struct nfs4_delegation *find_deleg_stateid(struct nfs4_client *cl, stateid_t *s) +static struct nfs4_delegation *find_deleg_stateid(struct nfs4_client *cl, + stateid_t *s) { struct nfs4_stid *ret; - ret = find_stateid_by_type(cl, s, - NFS4_DELEG_STID|NFS4_REVOKED_DELEG_STID); + ret = find_stateid_by_type(cl, s, SC_TYPE_DELEG, SC_STATUS_REVOKED); if (!ret) return NULL; return delegstateid(ret); @@ -4177,10 +5696,15 @@ nfs4_check_deleg(struct nfs4_client *cl, struct nfsd4_open *open, deleg = find_deleg_stateid(cl, &open->op_delegate_stateid); if (deleg == NULL) goto out; - if (deleg->dl_stid.sc_type == NFS4_REVOKED_DELEG_STID) { + if (deleg->dl_stid.sc_status & SC_STATUS_ADMIN_REVOKED) { nfs4_put_stid(&deleg->dl_stid); - if (cl->cl_minorversion) - status = nfserr_deleg_revoked; + status = nfserr_admin_revoked; + goto out; + } + if (deleg->dl_stid.sc_status & SC_STATUS_REVOKED) { + nfs4_put_stid(&deleg->dl_stid); + nfsd40_drop_revoked_stid(cl, &open->op_delegate_stateid); + status = nfserr_deleg_revoked; goto out; } flags = share_access_to_flags(open->op_share_access); @@ -4218,18 +5742,21 @@ nfsd4_truncate(struct svc_rqst *rqstp, struct svc_fh *fh, .ia_valid = ATTR_SIZE, .ia_size = 0, }; + struct nfsd_attrs attrs = { + .na_iattr = &iattr, + }; if (!open->op_truncate) return 0; if (!(open->op_share_access & NFS4_SHARE_ACCESS_WRITE)) return nfserr_inval; - return nfsd_setattr(rqstp, fh, &iattr, 0, (time_t)0); + return nfsd_setattr(rqstp, fh, &attrs, NULL); } static __be32 nfs4_get_vfs_file(struct svc_rqst *rqstp, struct nfs4_file *fp, struct svc_fh *cur_fh, struct nfs4_ol_stateid *stp, - struct nfsd4_open *open) + struct nfsd4_open *open, bool new_stp) { - struct file *filp = NULL; + struct nfsd_file *nf = NULL; __be32 status; int oflag = nfs4_access_to_omode(open->op_share_access); int access = nfs4_access_to_access(open->op_share_access); @@ -4243,6 +5770,13 @@ static __be32 nfs4_get_vfs_file(struct svc_rqst *rqstp, struct nfs4_file *fp, */ status = nfs4_file_check_deny(fp, open->op_share_deny); if (status != nfs_ok) { + if (status != nfserr_share_denied) { + spin_unlock(&fp->fi_lock); + goto out; + } + if (nfs4_resolve_deny_conflicts_locked(fp, new_stp, + stp, open->op_share_deny, false)) + status = nfserr_jukebox; spin_unlock(&fp->fi_lock); goto out; } @@ -4250,6 +5784,13 @@ static __be32 nfs4_get_vfs_file(struct svc_rqst *rqstp, struct nfs4_file *fp, /* set access to the file */ status = nfs4_file_get_access(fp, open->op_share_access); if (status != nfs_ok) { + if (status != nfserr_share_denied) { + spin_unlock(&fp->fi_lock); + goto out; + } + if (nfs4_resolve_deny_conflicts_locked(fp, new_stp, + stp, open->op_share_access, true)) + status = nfserr_jukebox; spin_unlock(&fp->fi_lock); goto out; } @@ -4265,18 +5806,26 @@ static __be32 nfs4_get_vfs_file(struct svc_rqst *rqstp, struct nfs4_file *fp, if (!fp->fi_fds[oflag]) { spin_unlock(&fp->fi_lock); - status = nfsd_open(rqstp, cur_fh, S_IFREG, access, &filp); - if (status) + + status = nfsd_file_acquire_opened(rqstp, cur_fh, access, + open->op_filp, &nf); + if (status != nfs_ok) goto out_put_access; + spin_lock(&fp->fi_lock); if (!fp->fi_fds[oflag]) { - fp->fi_fds[oflag] = filp; - filp = NULL; + fp->fi_fds[oflag] = nf; + nf = NULL; } } spin_unlock(&fp->fi_lock); - if (filp) - fput(filp); + if (nf) + nfsd_file_put(nf); + + status = nfserrno(nfsd_open_break_lease(cur_fh->fh_dentry->d_inode, + access)); + if (status) + goto out_put_access; status = nfsd4_truncate(rqstp, cur_fh, open); if (status) @@ -4291,21 
+5840,30 @@ out_put_access: } static __be32 -nfs4_upgrade_open(struct svc_rqst *rqstp, struct nfs4_file *fp, struct svc_fh *cur_fh, struct nfs4_ol_stateid *stp, struct nfsd4_open *open) +nfs4_upgrade_open(struct svc_rqst *rqstp, struct nfs4_file *fp, + struct svc_fh *cur_fh, struct nfs4_ol_stateid *stp, + struct nfsd4_open *open) { __be32 status; unsigned char old_deny_bmap = stp->st_deny_bmap; if (!test_access(open->op_share_access, stp)) - return nfs4_get_vfs_file(rqstp, fp, cur_fh, stp, open); + return nfs4_get_vfs_file(rqstp, fp, cur_fh, stp, open, false); /* test and set deny mode */ spin_lock(&fp->fi_lock); status = nfs4_file_check_deny(fp, open->op_share_deny); - if (status == nfs_ok) { + switch (status) { + case nfs_ok: set_deny(open->op_share_deny, stp); fp->fi_share_deny |= - (open->op_share_deny & NFS4_SHARE_DENY_BOTH); + (open->op_share_deny & NFS4_SHARE_DENY_BOTH); + break; + case nfserr_share_denied: + if (nfs4_resolve_deny_conflicts_locked(fp, false, + stp, open->op_share_deny, false)) + status = nfserr_jukebox; + break; } spin_unlock(&fp->fi_lock); @@ -4331,32 +5889,151 @@ static bool nfsd4_cb_channel_good(struct nfs4_client *clp) return clp->cl_minorversion && clp->cl_cb_state == NFSD4_CB_UNKNOWN; } -static struct file_lock *nfs4_alloc_init_lease(struct nfs4_delegation *dp, - int flag) +static struct file_lease *nfs4_alloc_init_lease(struct nfs4_delegation *dp) { - struct file_lock *fl; + struct file_lease *fl; - fl = locks_alloc_lock(); + fl = locks_alloc_lease(); if (!fl) return NULL; fl->fl_lmops = &nfsd_lease_mng_ops; - fl->fl_flags = FL_DELEG; - fl->fl_type = flag == NFS4_OPEN_DELEGATE_READ? F_RDLCK: F_WRLCK; - fl->fl_end = OFFSET_MAX; - fl->fl_owner = (fl_owner_t)dp; - fl->fl_pid = current->tgid; - fl->fl_file = dp->dl_stid.sc_file->fi_deleg_file; + fl->c.flc_flags = FL_DELEG; + fl->c.flc_type = deleg_is_read(dp->dl_type) ? F_RDLCK : F_WRLCK; + fl->c.flc_owner = (fl_owner_t)dp; + fl->c.flc_pid = current->tgid; + fl->c.flc_file = dp->dl_stid.sc_file->fi_deleg_file->nf_file; return fl; } +static int nfsd4_check_conflicting_opens(struct nfs4_client *clp, + struct nfs4_file *fp) +{ + struct nfs4_ol_stateid *st; + struct file *f = fp->fi_deleg_file->nf_file; + struct inode *ino = file_inode(f); + int writes; + + writes = atomic_read(&ino->i_writecount); + if (!writes) + return 0; + /* + * There could be multiple filehandles (hence multiple + * nfs4_files) referencing this file, but that's not too + * common; let's just give up in that case rather than + * trying to go look up all the clients using that other + * nfs4_file as well: + */ + if (fp->fi_aliased) + return -EAGAIN; + /* + * If there's a close in progress, make sure that we see it + * clear any fi_fds[] entries before we see it decrement + * i_writecount: + */ + smp_mb__after_atomic(); + + if (fp->fi_fds[O_WRONLY]) + writes--; + if (fp->fi_fds[O_RDWR]) + writes--; + if (writes > 0) + return -EAGAIN; /* There may be non-NFSv4 writers */ + /* + * It's possible there are non-NFSv4 write opens in progress, + * but if they haven't incremented i_writecount yet then they + * also haven't called break lease yet; so, they'll break this + * lease soon enough. 
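As an aside on the i_writecount arithmetic in this hunk: the idea is that the server subtracts the write references it knows it holds itself, and only refuses the delegation if unexplained writers remain. A small userspace model of that accounting, using invented names rather than the kernel structures:

#include <stdbool.h>
#include <stdio.h>

struct open_counts {
	int i_writecount;	/* all write references on the inode */
	bool have_wronly;	/* server holds an O_WRONLY open of its own */
	bool have_rdwr;		/* server holds an O_RDWR open of its own */
};

/* True when every write reference is one the NFS server itself holds;
 * anything left over may be a local or non-NFSv4 writer, so no delegation. */
static bool writes_all_accounted_for(const struct open_counts *c)
{
	int writes = c->i_writecount;

	if (c->have_wronly)
		writes--;
	if (c->have_rdwr)
		writes--;
	return writes <= 0;
}

int main(void)
{
	struct open_counts c = { .i_writecount = 2, .have_wronly = true, .have_rdwr = true };

	printf("delegation still possible: %d\n", writes_all_accounted_for(&c));
	return 0;
}

The function itself, continued in the next hunk, then walks the file's NFSv4 stateids to rule out write opens from other clients.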
So, all that's left to check for is NFSv4 + * opens: + */ + spin_lock(&fp->fi_lock); + list_for_each_entry(st, &fp->fi_stateids, st_perfile) { + if (st->st_openstp == NULL /* it's an open */ && + access_permit_write(st) && + st->st_stid.sc_client != clp) { + spin_unlock(&fp->fi_lock); + return -EAGAIN; + } + } + spin_unlock(&fp->fi_lock); + /* + * There's a small chance that we could be racing with another + * NFSv4 open. However, any open that hasn't added itself to + * the fi_stateids list also hasn't called break_lease yet; so, + * they'll break this lease soon enough. + */ + return 0; +} + +/* + * It's possible that between opening the dentry and setting the delegation, + * that it has been renamed or unlinked. Redo the lookup to verify that this + * hasn't happened. + */ +static int +nfsd4_verify_deleg_dentry(struct nfsd4_open *open, struct nfs4_file *fp, + struct svc_fh *parent) +{ + struct svc_export *exp; + struct dentry *child; + __be32 err; + + err = nfsd_lookup_dentry(open->op_rqstp, parent, + open->op_fname, open->op_fnamelen, + &exp, &child); + + if (err) + return -EAGAIN; + + exp_put(exp); + dput(child); + if (child != file_dentry(fp->fi_deleg_file->nf_file)) + return -EAGAIN; + + return 0; +} + +/* + * We avoid breaking delegations held by a client due to its own activity, but + * clearing setuid/setgid bits on a write is an implicit activity and the client + * may not notice and continue using the old mode. Avoid giving out a delegation + * on setuid/setgid files when the client is requesting an open for write. + */ +static int +nfsd4_verify_setuid_write(struct nfsd4_open *open, struct nfsd_file *nf) +{ + struct inode *inode = file_inode(nf->nf_file); + + if ((open->op_share_access & NFS4_SHARE_ACCESS_WRITE) && + (inode->i_mode & (S_ISUID|S_ISGID))) + return -EAGAIN; + return 0; +} + +#ifdef CONFIG_NFSD_V4_DELEG_TIMESTAMPS +static bool nfsd4_want_deleg_timestamps(const struct nfsd4_open *open) +{ + return open->op_deleg_want & OPEN4_SHARE_ACCESS_WANT_DELEG_TIMESTAMPS; +} +#else /* CONFIG_NFSD_V4_DELEG_TIMESTAMPS */ +static bool nfsd4_want_deleg_timestamps(const struct nfsd4_open *open) +{ + return false; +} +#endif /* CONFIG NFSD_V4_DELEG_TIMESTAMPS */ + static struct nfs4_delegation * -nfs4_set_delegation(struct nfs4_client *clp, struct svc_fh *fh, - struct nfs4_file *fp, struct nfs4_clnt_odstate *odstate) +nfs4_set_delegation(struct nfsd4_open *open, struct nfs4_ol_stateid *stp, + struct svc_fh *parent) { - int status = 0; + bool deleg_ts = nfsd4_want_deleg_timestamps(open); + struct nfs4_client *clp = stp->st_stid.sc_client; + struct nfs4_file *fp = stp->st_stid.sc_file; + struct nfs4_clnt_odstate *odstate = stp->st_clnt_odstate; struct nfs4_delegation *dp; - struct file *filp; - struct file_lock *fl; + struct nfsd_file *nf = NULL; + struct file_lease *fl; + int status = 0; + u32 dl_type; /* * The fi_had_conflict and nfs_get_existing_delegation checks @@ -4366,53 +6043,114 @@ nfs4_set_delegation(struct nfs4_client *clp, struct svc_fh *fh, if (fp->fi_had_conflict) return ERR_PTR(-EAGAIN); - filp = find_readable_file(fp); - if (!filp) { - /* We should always have a readable file here */ - WARN_ON_ONCE(1); - return ERR_PTR(-EBADF); + /* + * Try for a write delegation first. RFC8881 section 10.4 says: + * + * "An OPEN_DELEGATE_WRITE delegation allows the client to handle, + * on its own, all opens." 
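The preference being spelled out in this comment, offer a write delegation whenever the OPEN asks for write access and fall back to a read delegation otherwise, reduces to a short decision function. A hedged sketch using the protocol's share-access bit values (the enum and helper names are invented):

#include <stdio.h>

#define SHARE_ACCESS_READ	0x1	/* protocol share-access bit (NFS4_SHARE_ACCESS_READ) */
#define SHARE_ACCESS_WRITE	0x2	/* protocol share-access bit (NFS4_SHARE_ACCESS_WRITE) */

enum deleg_choice { CHOICE_NONE, CHOICE_READ, CHOICE_WRITE };

/* Write first, read as the fallback; a real server also needs an open
 * struct file of the matching kind to back whichever it picks. */
static enum deleg_choice pick_delegation(unsigned int share_access,
					 int have_writeable, int have_readable)
{
	if ((share_access & SHARE_ACCESS_WRITE) && have_writeable)
		return CHOICE_WRITE;
	if ((share_access & SHARE_ACCESS_READ) && have_readable)
		return CHOICE_READ;
	return CHOICE_NONE;
}

int main(void)
{
	printf("%d %d\n",
	       pick_delegation(SHARE_ACCESS_WRITE, 1, 1),	/* 2: write */
	       pick_delegation(SHARE_ACCESS_READ, 0, 1));	/* 1: read */
	return 0;
}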
+ * + * Furthermore, section 9.1.2 says: + * + * "In the case of READ, the server may perform the corresponding + * check on the access mode, or it may choose to allow READ for + * OPEN4_SHARE_ACCESS_WRITE, to accommodate clients whose WRITE + * implementation may unavoidably do reads (e.g., due to buffer + * cache constraints)." + * + * We choose to offer a write delegation for OPEN with the + * OPEN4_SHARE_ACCESS_WRITE access mode to accommodate such clients. + */ + if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE) { + nf = find_writeable_file(fp); + dl_type = deleg_ts ? OPEN_DELEGATE_WRITE_ATTRS_DELEG : OPEN_DELEGATE_WRITE; + } + + /* + * If the file is being opened O_RDONLY or we couldn't get a O_RDWR + * file for some reason, then try for a read delegation instead. + */ + if (!nf && (open->op_share_access & NFS4_SHARE_ACCESS_READ)) { + nf = find_readable_file(fp); + dl_type = deleg_ts ? OPEN_DELEGATE_READ_ATTRS_DELEG : OPEN_DELEGATE_READ; + } + + if (!nf) + return ERR_PTR(-EAGAIN); + + /* + * File delegations and associated locks cannot be recovered if the + * export is from an NFS proxy server. + */ + if (exportfs_cannot_lock(nf->nf_file->f_path.mnt->mnt_sb->s_export_op)) { + nfsd_file_put(nf); + return ERR_PTR(-EOPNOTSUPP); } + spin_lock(&state_lock); spin_lock(&fp->fi_lock); if (nfs4_delegation_exists(clp, fp)) status = -EAGAIN; + else if (nfsd4_verify_setuid_write(open, nf)) + status = -EAGAIN; else if (!fp->fi_deleg_file) { - fp->fi_deleg_file = filp; + fp->fi_deleg_file = nf; /* increment early to prevent fi_deleg_file from being * cleared */ fp->fi_delegees = 1; - filp = NULL; + nf = NULL; } else fp->fi_delegees++; spin_unlock(&fp->fi_lock); spin_unlock(&state_lock); - if (filp) - fput(filp); + if (nf) + nfsd_file_put(nf); if (status) return ERR_PTR(status); status = -ENOMEM; - dp = alloc_init_deleg(clp, fp, fh, odstate); + dp = alloc_init_deleg(clp, fp, odstate, dl_type); if (!dp) goto out_delegees; - fl = nfs4_alloc_init_lease(dp, NFS4_OPEN_DELEGATE_READ); + fl = nfs4_alloc_init_lease(dp); if (!fl) goto out_clnt_odstate; - status = vfs_setlease(fp->fi_deleg_file, fl->fl_type, &fl, NULL); + status = kernel_setlease(fp->fi_deleg_file->nf_file, + fl->c.flc_type, &fl, NULL); if (fl) - locks_free_lock(fl); + locks_free_lease(fl); if (status) goto out_clnt_odstate; + if (parent) { + status = nfsd4_verify_deleg_dentry(open, fp, parent); + if (status) + goto out_unlock; + } + + status = nfsd4_check_conflicting_opens(clp, fp); + if (status) + goto out_unlock; + + /* + * Now that the deleg is set, check again to ensure that nothing + * raced in and changed the mode while we weren't looking. 
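The setuid/setgid screen referred to here (nfsd4_verify_setuid_write above) amounts to a one-line predicate; the point of repeating it after the lease is installed is that the mode may have changed in the meantime. A standalone sketch of the predicate, assuming only the standard mode bits:

#include <stdbool.h>
#include <stdio.h>
#include <sys/stat.h>

/* Refuse a delegation when the open wants write access and the file is
 * setuid or setgid: a client write would silently strip those bits. */
static bool deny_deleg_for_setuid(mode_t mode, bool open_wants_write)
{
	return open_wants_write && (mode & (S_ISUID | S_ISGID));
}

int main(void)
{
	printf("%d %d\n",
	       deny_deleg_for_setuid(S_IFREG | S_ISUID | 0755, true),	/* 1 */
	       deny_deleg_for_setuid(S_IFREG | 0644, true));		/* 0 */
	return 0;
}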
+ */ + status = nfsd4_verify_setuid_write(open, fp->fi_deleg_file); + if (status) + goto out_unlock; + + status = -EAGAIN; + if (fp->fi_had_conflict) + goto out_unlock; + spin_lock(&state_lock); + spin_lock(&clp->cl_lock); spin_lock(&fp->fi_lock); - if (fp->fi_had_conflict) - status = -EAGAIN; - else - status = hash_delegation_locked(dp, fp); + status = hash_delegation_locked(dp, fp); spin_unlock(&fp->fi_lock); + spin_unlock(&clp->cl_lock); spin_unlock(&state_lock); if (status) @@ -4420,7 +6158,7 @@ nfs4_set_delegation(struct nfs4_client *clp, struct svc_fh *fh, return dp; out_unlock: - vfs_setlease(fp->fi_deleg_file, F_UNLCK, NULL, (void **)&dp); + kernel_setlease(fp->fi_deleg_file->nf_file, F_UNLCK, NULL, (void **)&dp); out_clnt_odstate: put_clnt_odstate(dp->dl_clnt_odstate); nfs4_put_stid(&dp->dl_stid); @@ -4431,51 +6169,124 @@ out_delegees: static void nfsd4_open_deleg_none_ext(struct nfsd4_open *open, int status) { - open->op_delegate_type = NFS4_OPEN_DELEGATE_NONE_EXT; + open->op_delegate_type = OPEN_DELEGATE_NONE_EXT; if (status == -EAGAIN) open->op_why_no_deleg = WND4_CONTENTION; else { open->op_why_no_deleg = WND4_RESOURCE; switch (open->op_deleg_want) { - case NFS4_SHARE_WANT_READ_DELEG: - case NFS4_SHARE_WANT_WRITE_DELEG: - case NFS4_SHARE_WANT_ANY_DELEG: + case OPEN4_SHARE_ACCESS_WANT_READ_DELEG: + case OPEN4_SHARE_ACCESS_WANT_WRITE_DELEG: + case OPEN4_SHARE_ACCESS_WANT_ANY_DELEG: break; - case NFS4_SHARE_WANT_CANCEL: + case OPEN4_SHARE_ACCESS_WANT_CANCEL: open->op_why_no_deleg = WND4_CANCELLED; break; - case NFS4_SHARE_WANT_NO_DELEG: + case OPEN4_SHARE_ACCESS_WANT_NO_DELEG: WARN_ON_ONCE(1); } } } +static bool +nfs4_delegation_stat(struct nfs4_delegation *dp, struct svc_fh *currentfh, + struct kstat *stat) +{ + struct nfsd_file *nf = find_writeable_file(dp->dl_stid.sc_file); + struct path path; + int rc; + + if (!nf) + return false; + + path.mnt = currentfh->fh_export->ex_path.mnt; + path.dentry = file_dentry(nf->nf_file); + + rc = vfs_getattr(&path, stat, + STATX_MODE | STATX_SIZE | STATX_ATIME | + STATX_MTIME | STATX_CTIME | STATX_CHANGE_COOKIE, + AT_STATX_SYNC_AS_STAT); + + nfsd_file_put(nf); + return rc == 0; +} + +/* + * Add NFS4_SHARE_ACCESS_READ to the write delegation granted on OPEN + * with NFS4_SHARE_ACCESS_WRITE by allocating separate nfsd_file and + * struct file to be used for read with delegation stateid. + * + */ +static bool +nfsd4_add_rdaccess_to_wrdeleg(struct svc_rqst *rqstp, struct nfsd4_open *open, + struct svc_fh *fh, struct nfs4_ol_stateid *stp) +{ + struct nfs4_file *fp; + struct nfsd_file *nf = NULL; + + if ((open->op_share_access & NFS4_SHARE_ACCESS_BOTH) == + NFS4_SHARE_ACCESS_WRITE) { + if (nfsd_file_acquire_opened(rqstp, fh, NFSD_MAY_READ, NULL, &nf)) + return (false); + fp = stp->st_stid.sc_file; + spin_lock(&fp->fi_lock); + __nfs4_file_get_access(fp, NFS4_SHARE_ACCESS_READ); + fp = stp->st_stid.sc_file; + fp->fi_fds[O_RDONLY] = nf; + fp->fi_rdeleg_file = nf; + spin_unlock(&fp->fi_lock); + } + return true; +} + /* - * Attempt to hand out a delegation. + * The Linux NFS server does not offer write delegations to NFSv4.0 + * clients in order to avoid conflicts between write delegations and + * GETATTRs requesting CHANGE or SIZE attributes. + * + * With NFSv4.1 and later minorversions, the SEQUENCE operation that + * begins each COMPOUND contains a client ID. Delegation recall can + * be avoided when the server recognizes the client sending a + * GETATTR also holds write delegation it conflicts with. 
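The policy this comment describes can be summarized as: when the OPEN requests write access, a delegation is only worth offering to a client that speaks NFSv4.1 or later. A minimal model of that gate (names are invented; the share-access value follows the protocol):

#include <stdbool.h>
#include <stdio.h>

#define SHARE_ACCESS_WRITE	0x2	/* protocol share-access bit */

/* No delegation at all for a v4.0 OPEN that requests write access; for
 * 4.1+ the SEQUENCE op identifies the caller, so the GETATTR conflict
 * described above cannot wedge the client. */
static bool deleg_worth_offering(unsigned int share_access, int minorversion)
{
	if ((share_access & SHARE_ACCESS_WRITE) && minorversion == 0)
		return false;
	return true;
}

int main(void)
{
	printf("v4.0 write open: %d, v4.1 write open: %d\n",
	       deleg_worth_offering(SHARE_ACCESS_WRITE, 0),
	       deleg_worth_offering(SHARE_ACCESS_WRITE, 1));
	return 0;
}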
* - * Note we don't support write delegations, and won't until the vfs has - * proper support for them. + * However, the NFSv4.0 protocol does not enable a server to + * determine that a GETATTR originated from the client holding the + * conflicting delegation versus coming from some other client. Per + * RFC 7530 Section 16.7.5, the server must recall or send a + * CB_GETATTR even when the GETATTR originates from the client that + * holds the conflicting delegation. + * + * An NFSv4.0 client can trigger a pathological situation if it + * always sends a DELEGRETURN preceded by a conflicting GETATTR in + * the same COMPOUND. COMPOUND execution will always stop at the + * GETATTR and the DELEGRETURN will never get executed. The server + * eventually revokes the delegation, which can result in loss of + * open or lock state. */ static void -nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, - struct nfs4_ol_stateid *stp) +nfs4_open_delegation(struct svc_rqst *rqstp, struct nfsd4_open *open, + struct nfs4_ol_stateid *stp, struct svc_fh *currentfh, + struct svc_fh *fh) { - struct nfs4_delegation *dp; struct nfs4_openowner *oo = openowner(stp->st_stateowner); + bool deleg_ts = nfsd4_want_deleg_timestamps(open); struct nfs4_client *clp = stp->st_stid.sc_client; - int cb_up; + struct svc_fh *parent = NULL; + struct nfs4_delegation *dp; + struct kstat stat; int status = 0; + int cb_up; cb_up = nfsd4_cb_channel_good(oo->oo_owner.so_client); - open->op_recall = 0; + open->op_recall = false; switch (open->op_claim_type) { case NFS4_OPEN_CLAIM_PREVIOUS: if (!cb_up) - open->op_recall = 1; - if (open->op_delegate_type != NFS4_OPEN_DELEGATE_READ) - goto out_no_deleg; + open->op_recall = true; break; case NFS4_OPEN_CLAIM_NULL: + parent = currentfh; + fallthrough; case NFS4_OPEN_CLAIM_FH: /* * Let's not give out any delegations till everyone's @@ -4486,39 +6297,49 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, goto out_no_deleg; if (!cb_up || !(oo->oo_flags & NFS4_OO_CONFIRMED)) goto out_no_deleg; - /* - * Also, if the file was opened for write or - * create, there's a good chance the client's - * about to write to it, resulting in an - * immediate recall (since we don't support - * write delegations): - */ - if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE) - goto out_no_deleg; - if (open->op_create == NFS4_OPEN_CREATE) + if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE && + !clp->cl_minorversion) goto out_no_deleg; break; default: goto out_no_deleg; } - dp = nfs4_set_delegation(clp, fh, stp->st_stid.sc_file, stp->st_clnt_odstate); + dp = nfs4_set_delegation(open, stp, parent); if (IS_ERR(dp)) goto out_no_deleg; memcpy(&open->op_delegate_stateid, &dp->dl_stid.sc_stateid, sizeof(dp->dl_stid.sc_stateid)); - dprintk("NFSD: delegation stateid=" STATEID_FMT "\n", - STATEID_VAL(&dp->dl_stid.sc_stateid)); - open->op_delegate_type = NFS4_OPEN_DELEGATE_READ; + if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE) { + struct file *f = dp->dl_stid.sc_file->fi_deleg_file->nf_file; + + if (!nfsd4_add_rdaccess_to_wrdeleg(rqstp, open, fh, stp) || + !nfs4_delegation_stat(dp, currentfh, &stat)) { + nfs4_put_stid(&dp->dl_stid); + destroy_delegation(dp); + goto out_no_deleg; + } + open->op_delegate_type = deleg_ts ? 
OPEN_DELEGATE_WRITE_ATTRS_DELEG : + OPEN_DELEGATE_WRITE; + dp->dl_cb_fattr.ncf_cur_fsize = stat.size; + dp->dl_cb_fattr.ncf_initial_cinfo = nfsd4_change_attribute(&stat); + dp->dl_atime = stat.atime; + dp->dl_ctime = stat.ctime; + dp->dl_mtime = stat.mtime; + spin_lock(&f->f_lock); + f->f_mode |= FMODE_NOCMTIME; + spin_unlock(&f->f_lock); + trace_nfsd_deleg_write(&dp->dl_stid.sc_stateid); + } else { + open->op_delegate_type = deleg_ts && nfs4_delegation_stat(dp, currentfh, &stat) ? + OPEN_DELEGATE_READ_ATTRS_DELEG : OPEN_DELEGATE_READ; + dp->dl_atime = stat.atime; + trace_nfsd_deleg_read(&dp->dl_stid.sc_stateid); + } nfs4_put_stid(&dp->dl_stid); return; out_no_deleg: - open->op_delegate_type = NFS4_OPEN_DELEGATE_NONE; - if (open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS && - open->op_delegate_type != NFS4_OPEN_DELEGATE_NONE) { - dprintk("NFSD: WARNING: refusing delegation reclaim\n"); - open->op_recall = 1; - } + open->op_delegate_type = OPEN_DELEGATE_NONE; /* 4.1 client asking for a delegation? */ if (open->op_deleg_want) @@ -4529,21 +6350,44 @@ out_no_deleg: static void nfsd4_deleg_xgrade_none_ext(struct nfsd4_open *open, struct nfs4_delegation *dp) { - if (open->op_deleg_want == NFS4_SHARE_WANT_READ_DELEG && - dp->dl_type == NFS4_OPEN_DELEGATE_WRITE) { - open->op_delegate_type = NFS4_OPEN_DELEGATE_NONE_EXT; - open->op_why_no_deleg = WND4_NOT_SUPP_DOWNGRADE; - } else if (open->op_deleg_want == NFS4_SHARE_WANT_WRITE_DELEG && - dp->dl_type == NFS4_OPEN_DELEGATE_WRITE) { - open->op_delegate_type = NFS4_OPEN_DELEGATE_NONE_EXT; - open->op_why_no_deleg = WND4_NOT_SUPP_UPGRADE; + if (deleg_is_write(dp->dl_type)) { + if (open->op_deleg_want & OPEN4_SHARE_ACCESS_WANT_READ_DELEG) { + open->op_delegate_type = OPEN_DELEGATE_NONE_EXT; + open->op_why_no_deleg = WND4_NOT_SUPP_DOWNGRADE; + } else if (open->op_deleg_want & OPEN4_SHARE_ACCESS_WANT_WRITE_DELEG) { + open->op_delegate_type = OPEN_DELEGATE_NONE_EXT; + open->op_why_no_deleg = WND4_NOT_SUPP_UPGRADE; + } } /* Otherwise the client must be confused wanting a delegation * it already has, therefore we don't return - * NFS4_OPEN_DELEGATE_NONE_EXT and reason. + * OPEN_DELEGATE_NONE_EXT and reason. */ } +/* Are we returning only a delegation stateid? */ +static bool open_xor_delegation(struct nfsd4_open *open) +{ + if (!(open->op_deleg_want & OPEN4_SHARE_ACCESS_WANT_OPEN_XOR_DELEGATION)) + return false; + /* Did we actually get a delegation? */ + if (!deleg_is_read(open->op_delegate_type) && !deleg_is_write(open->op_delegate_type)) + return false; + return true; +} + +/** + * nfsd4_process_open2 - finish open processing + * @rqstp: the RPC transaction being executed + * @current_fh: NFSv4 COMPOUND's current filehandle + * @open: OPEN arguments + * + * If successful, (1) truncate the file if open->op_truncate was + * set, (2) set open->op_stateid, (3) set open->op_delegation. + * + * Returns %nfs_ok on success; otherwise an nfs4stat value in + * network byte order is returned. + */ __be32 nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open) { @@ -4560,11 +6404,27 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf * and check for delegations in the process of being recalled. 
* If not found, create the nfs4_file struct */ - fp = find_or_add_file(open->op_file, ¤t_fh->fh_handle); + fp = nfsd4_file_hash_insert(open->op_file, current_fh); + if (unlikely(!fp)) + return nfserr_jukebox; if (fp != open->op_file) { status = nfs4_check_deleg(cl, open, &dp); if (status) goto out; + if (dp && nfsd4_is_deleg_cur(open) && + (dp->dl_stid.sc_file != fp)) { + /* + * RFC8881 section 8.2.4 mandates the server to return + * NFS4ERR_BAD_STATEID if the selected table entry does + * not match the current filehandle. However returning + * NFS4ERR_BAD_STATEID in the OPEN can cause the client + * to repeatedly retry the operation with the same + * stateid, since the stateid itself is valid. To avoid + * this situation NFSD returns NFS4ERR_INVAL instead. + */ + status = nfserr_inval; + goto out; + } stp = nfsd4_find_and_lock_existing_open(fp, open); } else { open->op_file = NULL; @@ -4575,6 +6435,11 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf if (!stp) { stp = init_open_stateid(fp, open); + if (!stp) { + status = nfserr_jukebox; + goto out; + } + if (!open->op_stp) new_stp = true; } @@ -4593,9 +6458,8 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf goto out; } } else { - status = nfs4_get_vfs_file(rqstp, fp, current_fh, stp, open); + status = nfs4_get_vfs_file(rqstp, fp, current_fh, stp, open, true); if (status) { - stp->st_stid.sc_type = NFS4_CLOSED_STID; release_open_stateid(stp); mutex_unlock(&stp->st_mutex); goto out; @@ -4611,8 +6475,8 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf mutex_unlock(&stp->st_mutex); if (nfsd4_has_session(&resp->cstate)) { - if (open->op_deleg_want & NFS4_SHARE_WANT_NO_DELEG) { - open->op_delegate_type = NFS4_OPEN_DELEGATE_NONE_EXT; + if (open->op_deleg_want & OPEN4_SHARE_ACCESS_WANT_NO_DELEG) { + open->op_delegate_type = OPEN_DELEGATE_NONE_EXT; open->op_why_no_deleg = WND4_NOT_WANTED; goto nodeleg; } @@ -4622,15 +6486,25 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf * Attempt to hand out a delegation. No error return, because the * OPEN succeeds even if we fail. */ - nfs4_open_delegation(current_fh, open, stp); + nfs4_open_delegation(rqstp, open, stp, + &resp->cstate.current_fh, current_fh); + + /* + * If there is an existing open stateid, it must be updated and + * returned. Only respect WANT_OPEN_XOR_DELEGATION when a new + * open stateid would have to be created. + */ + if (new_stp && open_xor_delegation(open)) { + memcpy(&open->op_stateid, &zero_stateid, sizeof(open->op_stateid)); + open->op_rflags |= OPEN4_RESULT_NO_OPEN_STATEID; + release_open_stateid(stp); + } nodeleg: status = nfs_ok; - - dprintk("%s: stateid=" STATEID_FMT "\n", __func__, - STATEID_VAL(&stp->st_stid.sc_stateid)); + trace_nfsd_open(&stp->st_stid.sc_stateid); out: /* 4.1 client trying to upgrade/downgrade delegation? */ - if (open->op_delegate_type == NFS4_OPEN_DELEGATE_NONE && dp && + if (open->op_delegate_type == OPEN_DELEGATE_NONE && dp && open->op_deleg_want) nfsd4_deleg_xgrade_none_ext(open, dp); @@ -4641,7 +6515,7 @@ out: /* * To finish the open response, we just need to set the rflags. 
*/ - open->op_rflags = NFS4_OPEN_RESULT_LOCKTYPE_POSIX; + open->op_rflags |= NFS4_OPEN_RESULT_LOCKTYPE_POSIX; if (nfsd4_has_session(&resp->cstate)) open->op_rflags |= NFS4_OPEN_RESULT_MAY_NOTIFY_LOCK; else if (!(open->op_openowner->oo_flags & NFS4_OO_CONFIRMED)) @@ -4658,12 +6532,8 @@ out: void nfsd4_cleanup_open_state(struct nfsd4_compound_state *cstate, struct nfsd4_open *open) { - if (open->op_openowner) { - struct nfs4_stateowner *so = &open->op_openowner->oo_owner; - - nfsd4_cstate_assign_replay(cstate, so); - nfs4_put_stateowner(so); - } + if (open->op_openowner) + nfs4_put_stateowner(&open->op_openowner->oo_owner); if (open->op_file) kmem_cache_free(file_slab, open->op_file); if (open->op_stp) @@ -4681,19 +6551,15 @@ nfsd4_renew(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, __be32 status; struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); - dprintk("process_renew(%08x/%08x): starting\n", - clid->cl_boot, clid->cl_id); - status = lookup_clientid(clid, cstate, nn); + trace_nfsd_clid_renew(clid); + status = set_client(clid, cstate, nn); if (status) - goto out; + return status; clp = cstate->clp; - status = nfserr_cb_path_down; if (!list_empty(&clp->cl_delegations) && clp->cl_cb_state != NFSD4_CB_UP) - goto out; - status = nfs_ok; -out: - return status; + return nfserr_cb_path_down; + return nfs_ok; } void @@ -4703,7 +6569,7 @@ nfsd4_end_grace(struct nfsd_net *nn) if (nn->grace_ended) return; - dprintk("NFSD: end of grace period\n"); + trace_nfsd_grace_complete(nn); nn->grace_ended = true; /* * If the server goes down again right now, an NFSv4 @@ -4735,10 +6601,13 @@ nfsd4_end_grace(struct nfsd_net *nn) */ static bool clients_still_reclaiming(struct nfsd_net *nn) { - unsigned long now = get_seconds(); - unsigned long double_grace_period_end = nn->boot_time + - 2 * nn->nfsd4_lease; + time64_t double_grace_period_end = nn->boot_time + + 2 * nn->nfsd4_lease; + if (nn->track_reclaim_completes && + atomic_read(&nn->nr_reclaim_complete) == + nn->reclaim_str_hashtbl_size) + return false; if (!nn->somebody_reclaimed) return false; nn->somebody_reclaimed = false; @@ -4746,63 +6615,291 @@ static bool clients_still_reclaiming(struct nfsd_net *nn) * If we've given them *two* lease times to reclaim, and they're * still not done, give up: */ - if (time_after(now, double_grace_period_end)) + if (ktime_get_boottime_seconds() > double_grace_period_end) return false; return true; } -static time_t -nfs4_laundromat(struct nfsd_net *nn) +struct laundry_time { + time64_t cutoff; + time64_t new_timeo; +}; + +static bool state_expired(struct laundry_time *lt, time64_t last_refresh) { - struct nfs4_client *clp; - struct nfs4_openowner *oo; - struct nfs4_delegation *dp; + time64_t time_remaining; + + if (last_refresh < lt->cutoff) + return true; + time_remaining = last_refresh - lt->cutoff; + lt->new_timeo = min(lt->new_timeo, time_remaining); + return false; +} + +#ifdef CONFIG_NFSD_V4_2_INTER_SSC +void nfsd4_ssc_init_umount_work(struct nfsd_net *nn) +{ + spin_lock_init(&nn->nfsd_ssc_lock); + INIT_LIST_HEAD(&nn->nfsd_ssc_mount_list); + init_waitqueue_head(&nn->nfsd_ssc_waitq); +} + +/* + * This is called when nfsd is being shutdown, after all inter_ssc + * cleanup were done, to destroy the ssc delayed unmount list. 
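The laundry_time bookkeeping introduced just above drives everything that follows: each piece of state is either expired against a single cutoff or it shortens the interval until the next laundromat run. A standalone model with a worked example (plain integers stand in for ktime_get_boottime_seconds(), and the struct mirrors the one in the patch):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct laundry_time {
	int64_t cutoff;		/* now minus the lease period */
	int64_t new_timeo;	/* seconds until the next laundromat run */
};

/* Anything refreshed before the cutoff is expired; otherwise its
 * remaining lifetime shrinks new_timeo so the laundromat wakes in time. */
static bool state_expired(struct laundry_time *lt, int64_t last_refresh)
{
	if (last_refresh < lt->cutoff)
		return true;
	if (last_refresh - lt->cutoff < lt->new_timeo)
		lt->new_timeo = last_refresh - lt->cutoff;
	return false;
}

int main(void)
{
	int64_t now = 1000, lease = 90;
	struct laundry_time lt = { .cutoff = now - lease, .new_timeo = lease };
	bool old_gone, fresh_gone;

	old_gone = state_expired(&lt, 900);	/* refreshed too long ago: expired */
	fresh_gone = state_expired(&lt, 970);	/* 60s of lease left: kept */
	printf("%d %d next=%llds\n", old_gone, fresh_gone, (long long)lt.new_timeo);
	return 0;
}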
+ */ +static void nfsd4_ssc_shutdown_umount(struct nfsd_net *nn) +{ + struct nfsd4_ssc_umount_item *ni = NULL; + struct nfsd4_ssc_umount_item *tmp; + + spin_lock(&nn->nfsd_ssc_lock); + list_for_each_entry_safe(ni, tmp, &nn->nfsd_ssc_mount_list, nsui_list) { + list_del(&ni->nsui_list); + spin_unlock(&nn->nfsd_ssc_lock); + mntput(ni->nsui_vfsmount); + kfree(ni); + spin_lock(&nn->nfsd_ssc_lock); + } + spin_unlock(&nn->nfsd_ssc_lock); +} + +static void nfsd4_ssc_expire_umount(struct nfsd_net *nn) +{ + bool do_wakeup = false; + struct nfsd4_ssc_umount_item *ni = NULL; + struct nfsd4_ssc_umount_item *tmp; + + spin_lock(&nn->nfsd_ssc_lock); + list_for_each_entry_safe(ni, tmp, &nn->nfsd_ssc_mount_list, nsui_list) { + if (time_after(jiffies, ni->nsui_expire)) { + if (refcount_read(&ni->nsui_refcnt) > 1) + continue; + + /* mark being unmount */ + ni->nsui_busy = true; + spin_unlock(&nn->nfsd_ssc_lock); + mntput(ni->nsui_vfsmount); + spin_lock(&nn->nfsd_ssc_lock); + + /* waiters need to start from begin of list */ + list_del(&ni->nsui_list); + kfree(ni); + + /* wakeup ssc_connect waiters */ + do_wakeup = true; + continue; + } + break; + } + if (do_wakeup) + wake_up_all(&nn->nfsd_ssc_waitq); + spin_unlock(&nn->nfsd_ssc_lock); +} +#endif + +/* Check if any lock belonging to this lockowner has any blockers */ +static bool +nfs4_lockowner_has_blockers(struct nfs4_lockowner *lo) +{ + struct file_lock_context *ctx; struct nfs4_ol_stateid *stp; - struct nfsd4_blocked_lock *nbl; - struct list_head *pos, *next, reaplist; - time_t cutoff = get_seconds() - nn->nfsd4_lease; - time_t t, new_timeo = nn->nfsd4_lease; + struct nfs4_file *nf; - dprintk("NFSD: laundromat service - starting\n"); + list_for_each_entry(stp, &lo->lo_owner.so_stateids, st_perstateowner) { + nf = stp->st_stid.sc_file; + ctx = locks_inode_context(nf->fi_inode); + if (!ctx) + continue; + if (locks_owner_has_blockers(ctx, lo)) + return true; + } + return false; +} - if (clients_still_reclaiming(nn)) { - new_timeo = 0; - goto out; +static bool +nfs4_anylock_blockers(struct nfs4_client *clp) +{ + int i; + struct nfs4_stateowner *so; + struct nfs4_lockowner *lo; + + if (atomic_read(&clp->cl_delegs_in_recall)) + return true; + spin_lock(&clp->cl_lock); + for (i = 0; i < OWNER_HASH_SIZE; i++) { + list_for_each_entry(so, &clp->cl_ownerstr_hashtbl[i], + so_strhash) { + if (so->so_is_open_owner) + continue; + lo = lockowner(so); + if (nfs4_lockowner_has_blockers(lo)) { + spin_unlock(&clp->cl_lock); + return true; + } + } } - nfsd4_end_grace(nn); - INIT_LIST_HEAD(&reaplist); + spin_unlock(&clp->cl_lock); + return false; +} + +static void +nfs4_get_client_reaplist(struct nfsd_net *nn, struct list_head *reaplist, + struct laundry_time *lt) +{ + unsigned int maxreap, reapcnt = 0; + struct list_head *pos, *next; + struct nfs4_client *clp; + + maxreap = (atomic_read(&nn->nfs4_client_count) >= nn->nfs4_max_clients) ? 
+ NFSD_CLIENT_MAX_TRIM_PER_RUN : 0; + INIT_LIST_HEAD(reaplist); spin_lock(&nn->client_lock); list_for_each_safe(pos, next, &nn->client_lru) { clp = list_entry(pos, struct nfs4_client, cl_lru); - if (time_after((unsigned long)clp->cl_time, (unsigned long)cutoff)) { - t = clp->cl_time - cutoff; - new_timeo = min(new_timeo, t); + if (clp->cl_state == NFSD4_EXPIRABLE) + goto exp_client; + if (!state_expired(lt, clp->cl_time)) break; + if (!atomic_read(&clp->cl_rpc_users)) { + if (clp->cl_state == NFSD4_ACTIVE) + atomic_inc(&nn->nfsd_courtesy_clients); + clp->cl_state = NFSD4_COURTESY; } - if (mark_client_expired_locked(clp)) { - dprintk("NFSD: client in use (clientid %08x)\n", - clp->cl_clientid.cl_id); - continue; + if (!client_has_state(clp)) + goto exp_client; + if (!nfs4_anylock_blockers(clp)) + if (reapcnt >= maxreap) + continue; +exp_client: + if (!mark_client_expired_locked(clp)) { + list_add(&clp->cl_lru, reaplist); + reapcnt++; } - list_add(&clp->cl_lru, &reaplist); } spin_unlock(&nn->client_lock); - list_for_each_safe(pos, next, &reaplist) { +} + +static void +nfs4_get_courtesy_client_reaplist(struct nfsd_net *nn, + struct list_head *reaplist) +{ + unsigned int maxreap = 0, reapcnt = 0; + struct list_head *pos, *next; + struct nfs4_client *clp; + + maxreap = NFSD_CLIENT_MAX_TRIM_PER_RUN; + INIT_LIST_HEAD(reaplist); + + spin_lock(&nn->client_lock); + list_for_each_safe(pos, next, &nn->client_lru) { clp = list_entry(pos, struct nfs4_client, cl_lru); - dprintk("NFSD: purging unused client (clientid %08x)\n", - clp->cl_clientid.cl_id); + if (clp->cl_state == NFSD4_ACTIVE) + break; + if (reapcnt >= maxreap) + break; + if (!mark_client_expired_locked(clp)) { + list_add(&clp->cl_lru, reaplist); + reapcnt++; + } + } + spin_unlock(&nn->client_lock); +} + +static void +nfs4_process_client_reaplist(struct list_head *reaplist) +{ + struct list_head *pos, *next; + struct nfs4_client *clp; + + list_for_each_safe(pos, next, reaplist) { + clp = list_entry(pos, struct nfs4_client, cl_lru); + trace_nfsd_clid_purged(&clp->cl_clientid); list_del_init(&clp->cl_lru); expire_client(clp); } +} + +static void nfs40_clean_admin_revoked(struct nfsd_net *nn, + struct laundry_time *lt) +{ + struct nfs4_client *clp; + + spin_lock(&nn->client_lock); + if (nn->nfs40_last_revoke == 0 || + nn->nfs40_last_revoke > lt->cutoff) { + spin_unlock(&nn->client_lock); + return; + } + nn->nfs40_last_revoke = 0; + +retry: + list_for_each_entry(clp, &nn->client_lru, cl_lru) { + unsigned long id, tmp; + struct nfs4_stid *stid; + + if (atomic_read(&clp->cl_admin_revoked) == 0) + continue; + + spin_lock(&clp->cl_lock); + idr_for_each_entry_ul(&clp->cl_stateids, stid, tmp, id) + if (stid->sc_status & SC_STATUS_ADMIN_REVOKED) { + refcount_inc(&stid->sc_count); + spin_unlock(&nn->client_lock); + /* this function drops ->cl_lock */ + nfsd4_drop_revoked_stid(stid); + nfs4_put_stid(stid); + spin_lock(&nn->client_lock); + goto retry; + } + spin_unlock(&clp->cl_lock); + } + spin_unlock(&nn->client_lock); +} + +static time64_t +nfs4_laundromat(struct nfsd_net *nn) +{ + struct nfs4_openowner *oo; + struct nfs4_delegation *dp; + struct nfs4_ol_stateid *stp; + struct nfsd4_blocked_lock *nbl; + struct list_head *pos, *next, reaplist; + struct laundry_time lt = { + .cutoff = ktime_get_boottime_seconds() - nn->nfsd4_lease, + .new_timeo = nn->nfsd4_lease + }; + struct nfs4_cpntf_state *cps; + copy_stateid_t *cps_t; + int i; + + if (clients_still_reclaiming(nn)) { + lt.new_timeo = 0; + goto out; + } + nfsd4_end_grace(nn); + + 
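The per-client decision inside nfs4_get_client_reaplist(), defined a few hunks above and called just below, may be easier to follow reduced to a table of outcomes. This is a deliberately simplified model: it ignores the cl_rpc_users check, the possibility that mark_client_expired_locked() fails, and the fact that the trim budget only opens up once the client count exceeds nfs4_max_clients.

#include <stdbool.h>
#include <stdio.h>

enum client_fate { FATE_KEEP, FATE_COURTESY, FATE_REAP };

struct client_view {
	bool marked_expirable;	/* already flagged EXPIRABLE elsewhere */
	bool lease_expired;	/* cl_time is past the laundromat cutoff */
	bool has_state;		/* opens, locks or delegations outstanding */
	bool blocks_others;	/* some other client waits on its locks */
};

static enum client_fate judge_client(const struct client_view *c, int budget_left)
{
	if (c->marked_expirable)
		return FATE_REAP;
	if (!c->lease_expired)
		return FATE_KEEP;
	if (!c->has_state)
		return FATE_REAP;
	if (c->blocks_others)
		return FATE_REAP;
	/* An idle courtesy client is only trimmed when the server is over
	 * its client budget for this laundromat run. */
	return budget_left > 0 ? FATE_REAP : FATE_COURTESY;
}

int main(void)
{
	struct client_view idle = { .lease_expired = true, .has_state = true };
	struct client_view blocker = { .lease_expired = true, .has_state = true,
				       .blocks_others = true };

	printf("idle: %d, blocker: %d\n",
	       judge_client(&idle, 0), judge_client(&blocker, 0));
	return 0;
}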
spin_lock(&nn->s2s_cp_lock); + idr_for_each_entry(&nn->s2s_cp_stateids, cps_t, i) { + cps = container_of(cps_t, struct nfs4_cpntf_state, cp_stateid); + if (cps->cp_stateid.cs_type == NFS4_COPYNOTIFY_STID && + state_expired(<, cps->cpntf_time)) + _free_cpntf_state_locked(nn, cps); + } + spin_unlock(&nn->s2s_cp_lock); + nfsd4_async_copy_reaper(nn); + nfs4_get_client_reaplist(nn, &reaplist, <); + nfs4_process_client_reaplist(&reaplist); + + nfs40_clean_admin_revoked(nn, <); + spin_lock(&state_lock); list_for_each_safe(pos, next, &nn->del_recall_lru) { dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru); - if (time_after((unsigned long)dp->dl_time, (unsigned long)cutoff)) { - t = dp->dl_time - cutoff; - new_timeo = min(new_timeo, t); + if (!state_expired(<, dp->dl_time)) break; - } - WARN_ON(!unhash_delegation_locked(dp)); + refcount_inc(&dp->dl_stid.sc_count); + unhash_delegation_locked(dp, SC_STATUS_REVOKED); list_add(&dp->dl_recall_lru, &reaplist); } spin_unlock(&state_lock); @@ -4817,12 +6914,8 @@ nfs4_laundromat(struct nfsd_net *nn) while (!list_empty(&nn->close_lru)) { oo = list_first_entry(&nn->close_lru, struct nfs4_openowner, oo_close_lru); - if (time_after((unsigned long)oo->oo_time, - (unsigned long)cutoff)) { - t = oo->oo_time - cutoff; - new_timeo = min(new_timeo, t); + if (!state_expired(<, oo->oo_time)) break; - } list_del_init(&oo->oo_close_lru); stp = oo->oo_last_closed_stid; oo->oo_last_closed_stid = NULL; @@ -4848,12 +6941,8 @@ nfs4_laundromat(struct nfsd_net *nn) while (!list_empty(&nn->blocked_locks_lru)) { nbl = list_first_entry(&nn->blocked_locks_lru, struct nfsd4_blocked_lock, nbl_lru); - if (time_after((unsigned long)nbl->nbl_time, - (unsigned long)cutoff)) { - t = nbl->nbl_time - cutoff; - new_timeo = min(new_timeo, t); + if (!state_expired(<, nbl->nbl_time)) break; - } list_move(&nbl->nbl_lru, &reaplist); list_del_init(&nbl->nbl_list); } @@ -4863,50 +6952,91 @@ nfs4_laundromat(struct nfsd_net *nn) nbl = list_first_entry(&reaplist, struct nfsd4_blocked_lock, nbl_lru); list_del_init(&nbl->nbl_lru); - locks_delete_block(&nbl->nbl_lock); free_blocked_lock(nbl); } +#ifdef CONFIG_NFSD_V4_2_INTER_SSC + /* service the server-to-server copy delayed unmount list */ + nfsd4_ssc_expire_umount(nn); +#endif + if (atomic_long_read(&num_delegations) >= max_delegations) + deleg_reaper(nn); out: - new_timeo = max_t(time_t, new_timeo, NFSD_LAUNDROMAT_MINTIMEOUT); - return new_timeo; + return max_t(time64_t, lt.new_timeo, NFSD_LAUNDROMAT_MINTIMEOUT); } -static struct workqueue_struct *laundry_wq; static void laundromat_main(struct work_struct *); static void laundromat_main(struct work_struct *laundry) { - time_t t; + time64_t t; struct delayed_work *dwork = to_delayed_work(laundry); struct nfsd_net *nn = container_of(dwork, struct nfsd_net, laundromat_work); t = nfs4_laundromat(nn); - dprintk("NFSD: laundromat_main - sleeping for %ld seconds\n", t); queue_delayed_work(laundry_wq, &nn->laundromat_work, t*HZ); } -static inline __be32 nfs4_check_fh(struct svc_fh *fhp, struct nfs4_stid *stp) +static void +courtesy_client_reaper(struct nfsd_net *nn) { - if (!fh_match(&fhp->fh_handle, &stp->sc_file->fi_fhandle)) - return nfserr_bad_stateid; - return nfs_ok; + struct list_head reaplist; + + nfs4_get_courtesy_client_reaplist(nn, &reaplist); + nfs4_process_client_reaplist(&reaplist); } -static inline int -access_permit_read(struct nfs4_ol_stateid *stp) +static void +deleg_reaper(struct nfsd_net *nn) { - return test_access(NFS4_SHARE_ACCESS_READ, stp) || - 
test_access(NFS4_SHARE_ACCESS_BOTH, stp) || - test_access(NFS4_SHARE_ACCESS_WRITE, stp); + struct list_head *pos, *next; + struct nfs4_client *clp; + + spin_lock(&nn->client_lock); + list_for_each_safe(pos, next, &nn->client_lru) { + clp = list_entry(pos, struct nfs4_client, cl_lru); + + if (clp->cl_state != NFSD4_ACTIVE) + continue; + if (list_empty(&clp->cl_delegations)) + continue; + if (atomic_read(&clp->cl_delegs_in_recall)) + continue; + if (test_and_set_bit(NFSD4_CALLBACK_RUNNING, &clp->cl_ra->ra_cb.cb_flags)) + continue; + if (ktime_get_boottime_seconds() - clp->cl_ra_time < 5) + continue; + if (clp->cl_cb_state != NFSD4_CB_UP) + continue; + + /* release in nfsd4_cb_recall_any_release */ + kref_get(&clp->cl_nfsdfs.cl_ref); + clp->cl_ra_time = ktime_get_boottime_seconds(); + clp->cl_ra->ra_keep = 0; + clp->cl_ra->ra_bmval[0] = BIT(RCA4_TYPE_MASK_RDATA_DLG) | + BIT(RCA4_TYPE_MASK_WDATA_DLG); + trace_nfsd_cb_recall_any(clp->cl_ra); + nfsd4_run_cb(&clp->cl_ra->ra_cb); + } + spin_unlock(&nn->client_lock); } -static inline int -access_permit_write(struct nfs4_ol_stateid *stp) +static void +nfsd4_state_shrinker_worker(struct work_struct *work) { - return test_access(NFS4_SHARE_ACCESS_WRITE, stp) || - test_access(NFS4_SHARE_ACCESS_BOTH, stp); + struct nfsd_net *nn = container_of(work, struct nfsd_net, + nfsd_shrinker_work); + + courtesy_client_reaper(nn); + deleg_reaper(nn); +} + +static inline __be32 nfs4_check_fh(struct svc_fh *fhp, struct nfs4_stid *stp) +{ + if (!fh_match(&fhp->fh_handle, &stp->sc_file->fi_fhandle)) + return nfserr_bad_stateid; + return nfs_ok; } static @@ -4943,16 +7073,6 @@ check_special_stateids(struct net *net, svc_fh *current_fh, stateid_t *stateid, NFS4_SHARE_DENY_READ); } -/* - * Allow READ/WRITE during grace period on recovered state only for files - * that are not able to provide mandatory locking. - */ -static inline int -grace_disallows_io(struct net *net, struct inode *inode) -{ - return opens_in_grace(net) && mandatory_lock(inode); -} - static __be32 check_stateid_generation(stateid_t *in, stateid_t *ref, bool has_session) { /* @@ -4990,6 +7110,9 @@ static __be32 nfsd4_stid_check_stateid_generation(stateid_t *in, struct nfs4_sti if (ret == nfs_ok) ret = check_stateid_generation(in, &s->sc_stateid, has_session); spin_unlock(&s->sc_lock); + if (ret == nfserr_admin_revoked) + nfsd40_drop_revoked_stid(s->sc_client, + &s->sc_stateid); return ret; } @@ -5009,15 +7132,6 @@ static __be32 nfsd4_validate_stateid(struct nfs4_client *cl, stateid_t *stateid) if (ZERO_STATEID(stateid) || ONE_STATEID(stateid) || CLOSE_STATEID(stateid)) return status; - /* Client debugging aid. 
*/ - if (!same_clid(&stateid->si_opaque.so_clid, &cl->cl_clientid)) { - char addr_str[INET6_ADDRSTRLEN]; - rpc_ntop((struct sockaddr *)&cl->cl_addr, addr_str, - sizeof(addr_str)); - pr_warn_ratelimited("NFSD: client %s testing state ID " - "with incorrect client ID\n", addr_str); - return status; - } spin_lock(&cl->cl_lock); s = find_stateid_locked(cl, stateid); if (!s) @@ -5025,50 +7139,57 @@ static __be32 nfsd4_validate_stateid(struct nfs4_client *cl, stateid_t *stateid) status = nfsd4_stid_check_stateid_generation(stateid, s, 1); if (status) goto out_unlock; + status = nfsd4_verify_open_stid(s); + if (status) + goto out_unlock; + switch (s->sc_type) { - case NFS4_DELEG_STID: + case SC_TYPE_DELEG: status = nfs_ok; break; - case NFS4_REVOKED_DELEG_STID: - status = nfserr_deleg_revoked; - break; - case NFS4_OPEN_STID: - case NFS4_LOCK_STID: + case SC_TYPE_OPEN: + case SC_TYPE_LOCK: status = nfsd4_check_openowner_confirmed(openlockstateid(s)); break; default: printk("unknown stateid type %x\n", s->sc_type); - /* Fallthrough */ - case NFS4_CLOSED_STID: - case NFS4_CLOSED_DELEG_STID: status = nfserr_bad_stateid; } out_unlock: spin_unlock(&cl->cl_lock); + if (status == nfserr_admin_revoked) + nfsd40_drop_revoked_stid(cl, stateid); return status; } __be32 nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate, - stateid_t *stateid, unsigned char typemask, + stateid_t *stateid, + unsigned short typemask, unsigned short statusmask, struct nfs4_stid **s, struct nfsd_net *nn) { __be32 status; + struct nfs4_stid *stid; bool return_revoked = false; /* * only return revoked delegations if explicitly asked. * otherwise we report revoked or bad_stateid status. */ - if (typemask & NFS4_REVOKED_DELEG_STID) + if (statusmask & SC_STATUS_REVOKED) return_revoked = true; - else if (typemask & NFS4_DELEG_STID) - typemask |= NFS4_REVOKED_DELEG_STID; + if (typemask & SC_TYPE_DELEG) + /* Always allow REVOKED for DELEG so we can + * return the appropriate error. 
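The typemask/statusmask pair used throughout this lookup appears to act as a two-part filter: the stateid's type must be one the caller listed, and it must carry no status bit the caller did not explicitly tolerate. Assuming that reading, here is a sketch with made-up bit values (the kernel's SC_TYPE_* and SC_STATUS_* constants differ):

#include <stdbool.h>
#include <stdio.h>

#define T_OPEN			0x1
#define T_LOCK			0x2
#define T_DELEG			0x4
#define ST_REVOKED		0x1
#define ST_ADMIN_REVOKED	0x2

/* A stateid is acceptable when its type is in typemask and it carries
 * no status bit outside statusmask. */
static bool stid_acceptable(unsigned short type, unsigned short status,
			    unsigned short typemask, unsigned short statusmask)
{
	return (type & typemask) && !(status & ~statusmask);
}

int main(void)
{
	/* Revoked delegation with REVOKED tolerated: passes the filter, so
	 * the caller can turn it into the nfserr_deleg_revoked error. */
	printf("%d\n", stid_acceptable(T_DELEG, ST_REVOKED, T_DELEG, ST_REVOKED));
	/* Same stateid but nothing tolerated: filtered out. */
	printf("%d\n", stid_acceptable(T_DELEG, ST_REVOKED, T_DELEG, 0));
	return 0;
}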
+ */ + statusmask |= SC_STATUS_REVOKED; + + statusmask |= SC_STATUS_ADMIN_REVOKED | SC_STATUS_FREEABLE; if (ZERO_STATEID(stateid) || ONE_STATEID(stateid) || CLOSE_STATEID(stateid)) return nfserr_bad_stateid; - status = lookup_clientid(&stateid->si_opaque.so_clid, cstate, nn); + status = set_client(&stateid->si_opaque.so_clid, cstate, nn); if (status == nfserr_stale_clientid) { if (cstate->session) return nfserr_bad_stateid; @@ -5076,39 +7197,41 @@ nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate, } if (status) return status; - *s = find_stateid_by_type(cstate->clp, stateid, typemask); - if (!*s) - return nfserr_bad_stateid; - if (((*s)->sc_type == NFS4_REVOKED_DELEG_STID) && !return_revoked) { - nfs4_put_stid(*s); - if (cstate->minorversion) - return nfserr_deleg_revoked; + stid = find_stateid_by_type(cstate->clp, stateid, typemask, statusmask); + if (!stid) return nfserr_bad_stateid; + if ((stid->sc_status & SC_STATUS_REVOKED) && !return_revoked) { + nfs4_put_stid(stid); + return nfserr_deleg_revoked; + } + if (stid->sc_status & SC_STATUS_ADMIN_REVOKED) { + nfsd40_drop_revoked_stid(cstate->clp, stateid); + nfs4_put_stid(stid); + return nfserr_admin_revoked; } + *s = stid; return nfs_ok; } -static struct file * +static struct nfsd_file * nfs4_find_file(struct nfs4_stid *s, int flags) { - if (!s) + struct nfsd_file *ret = NULL; + + if (!s || s->sc_status) return NULL; switch (s->sc_type) { - case NFS4_DELEG_STID: - if (WARN_ON_ONCE(!s->sc_file->fi_deleg_file)) - return NULL; - return get_file(s->sc_file->fi_deleg_file); - case NFS4_OPEN_STID: - case NFS4_LOCK_STID: + case SC_TYPE_DELEG: + case SC_TYPE_OPEN: + case SC_TYPE_LOCK: if (flags & RD_STATE) - return find_readable_file(s->sc_file); + ret = find_readable_file(s->sc_file); else - return find_writeable_file(s->sc_file); - break; + ret = find_writeable_file(s->sc_file); } - return NULL; + return ret; } static __be32 @@ -5124,55 +7247,144 @@ nfs4_check_olstateid(struct nfs4_ol_stateid *ols, int flags) static __be32 nfs4_check_file(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfs4_stid *s, - struct file **filpp, bool *tmp_file, int flags) + struct nfsd_file **nfp, int flags) { int acc = (flags & RD_STATE) ? NFSD_MAY_READ : NFSD_MAY_WRITE; - struct file *file; + struct nfsd_file *nf; __be32 status; - file = nfs4_find_file(s, flags); - if (file) { - status = nfsd_permission(rqstp, fhp->fh_export, fhp->fh_dentry, + nf = nfs4_find_file(s, flags); + if (nf) { + status = nfsd_permission(&rqstp->rq_cred, + fhp->fh_export, fhp->fh_dentry, acc | NFSD_MAY_OWNER_OVERRIDE); if (status) { - fput(file); - return status; + nfsd_file_put(nf); + goto out; } - - *filpp = file; } else { - status = nfsd_open(rqstp, fhp, S_IFREG, acc, filpp); + status = nfsd_file_acquire(rqstp, fhp, acc, &nf); if (status) return status; - - if (tmp_file) - *tmp_file = true; } + *nfp = nf; +out: + return status; +} +static void +_free_cpntf_state_locked(struct nfsd_net *nn, struct nfs4_cpntf_state *cps) +{ + WARN_ON_ONCE(cps->cp_stateid.cs_type != NFS4_COPYNOTIFY_STID); + if (!refcount_dec_and_test(&cps->cp_stateid.cs_count)) + return; + list_del(&cps->cp_list); + idr_remove(&nn->s2s_cp_stateids, + cps->cp_stateid.cs_stid.si_opaque.so_id); + kfree(cps); +} +/* + * A READ from an inter server to server COPY will have a + * copy stateid. Look up the copy notify stateid from the + * idr structure and take a reference on it. 
+ */ +__be32 manage_cpntf_state(struct nfsd_net *nn, stateid_t *st, + struct nfs4_client *clp, + struct nfs4_cpntf_state **cps) +{ + copy_stateid_t *cps_t; + struct nfs4_cpntf_state *state = NULL; + if (st->si_opaque.so_clid.cl_id != nn->s2s_cp_cl_id) + return nfserr_bad_stateid; + spin_lock(&nn->s2s_cp_lock); + cps_t = idr_find(&nn->s2s_cp_stateids, st->si_opaque.so_id); + if (cps_t) { + state = container_of(cps_t, struct nfs4_cpntf_state, + cp_stateid); + if (state->cp_stateid.cs_type != NFS4_COPYNOTIFY_STID) { + state = NULL; + goto unlock; + } + if (!clp) + refcount_inc(&state->cp_stateid.cs_count); + else + _free_cpntf_state_locked(nn, state); + } +unlock: + spin_unlock(&nn->s2s_cp_lock); + if (!state) + return nfserr_bad_stateid; + if (!clp) + *cps = state; return 0; } -/* - * Checks for stateid operations +static __be32 find_cpntf_state(struct nfsd_net *nn, stateid_t *st, + struct nfs4_stid **stid) +{ + __be32 status; + struct nfs4_cpntf_state *cps = NULL; + struct nfs4_client *found; + + status = manage_cpntf_state(nn, st, NULL, &cps); + if (status) + return status; + + cps->cpntf_time = ktime_get_boottime_seconds(); + + status = nfserr_expired; + found = lookup_clientid(&cps->cp_p_clid, true, nn); + if (!found) + goto out; + + *stid = find_stateid_by_type(found, &cps->cp_p_stateid, + SC_TYPE_DELEG|SC_TYPE_OPEN|SC_TYPE_LOCK, + 0); + if (*stid) + status = nfs_ok; + else + status = nfserr_bad_stateid; + + put_client_renew(found); +out: + nfs4_put_cpntf_state(nn, cps); + return status; +} + +void nfs4_put_cpntf_state(struct nfsd_net *nn, struct nfs4_cpntf_state *cps) +{ + spin_lock(&nn->s2s_cp_lock); + _free_cpntf_state_locked(nn, cps); + spin_unlock(&nn->s2s_cp_lock); +} + +/** + * nfs4_preprocess_stateid_op - find and prep stateid for an operation + * @rqstp: incoming request from client + * @cstate: current compound state + * @fhp: filehandle associated with requested stateid + * @stateid: stateid (provided by client) + * @flags: flags describing type of operation to be done + * @nfp: optional nfsd_file return pointer (may be NULL) + * @cstid: optional returned nfs4_stid pointer (may be NULL) + * + * Given info from the client, look up a nfs4_stid for the operation. On + * success, it returns a reference to the nfs4_stid and/or the nfsd_file + * associated with it. 
*/ __be32 nfs4_preprocess_stateid_op(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct svc_fh *fhp, - stateid_t *stateid, int flags, struct file **filpp, bool *tmp_file) + stateid_t *stateid, int flags, struct nfsd_file **nfp, + struct nfs4_stid **cstid) { - struct inode *ino = d_inode(fhp->fh_dentry); struct net *net = SVC_NET(rqstp); struct nfsd_net *nn = net_generic(net, nfsd_net_id); struct nfs4_stid *s = NULL; __be32 status; - if (filpp) - *filpp = NULL; - if (tmp_file) - *tmp_file = false; - - if (grace_disallows_io(net, ino)) - return nfserr_grace; + if (nfp) + *nfp = NULL; if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) { status = check_special_stateids(net, fhp, stateid, flags); @@ -5180,8 +7392,10 @@ nfs4_preprocess_stateid_op(struct svc_rqst *rqstp, } status = nfsd4_lookup_stateid(cstate, stateid, - NFS4_DELEG_STID|NFS4_OPEN_STID|NFS4_LOCK_STID, - &s, nn); + SC_TYPE_DELEG|SC_TYPE_OPEN|SC_TYPE_LOCK, + 0, &s, nn); + if (status == nfserr_bad_stateid) + status = find_cpntf_state(nn, stateid, &s); if (status) return status; status = nfsd4_stid_check_stateid_generation(stateid, s, @@ -5190,27 +7404,28 @@ nfs4_preprocess_stateid_op(struct svc_rqst *rqstp, goto out; switch (s->sc_type) { - case NFS4_DELEG_STID: + case SC_TYPE_DELEG: status = nfs4_check_delegmode(delegstateid(s), flags); break; - case NFS4_OPEN_STID: - case NFS4_LOCK_STID: + case SC_TYPE_OPEN: + case SC_TYPE_LOCK: status = nfs4_check_olstateid(openlockstateid(s), flags); break; - default: - status = nfserr_bad_stateid; - break; } if (status) goto out; status = nfs4_check_fh(fhp, s); done: - if (!status && filpp) - status = nfs4_check_file(rqstp, fhp, s, filpp, tmp_file, flags); + if (status == nfs_ok && nfp) + status = nfs4_check_file(rqstp, fhp, s, nfp, flags); out: - if (s) - nfs4_put_stid(s); + if (s) { + if (!status && cstid) + *cstid = s; + else + nfs4_put_stid(s); + } return status; } @@ -5223,7 +7438,7 @@ nfsd4_test_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, { struct nfsd4_test_stateid *test_stateid = &u->test_stateid; struct nfsd4_test_stateid_id *stateid; - struct nfs4_client *cl = cstate->session->se_client; + struct nfs4_client *cl = cstate->clp; list_for_each_entry(stateid, &test_stateid->ts_stateid_list, ts_id_list) stateid->ts_id_status = @@ -5269,39 +7484,47 @@ nfsd4_free_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, stateid_t *stateid = &free_stateid->fr_stateid; struct nfs4_stid *s; struct nfs4_delegation *dp; - struct nfs4_client *cl = cstate->session->se_client; + struct nfs4_client *cl = cstate->clp; __be32 ret = nfserr_bad_stateid; spin_lock(&cl->cl_lock); s = find_stateid_locked(cl, stateid); - if (!s) + if (!s || s->sc_status & SC_STATUS_CLOSED) goto out_unlock; + if (s->sc_status & SC_STATUS_ADMIN_REVOKED) { + nfsd4_drop_revoked_stid(s); + ret = nfs_ok; + goto out; + } spin_lock(&s->sc_lock); switch (s->sc_type) { - case NFS4_DELEG_STID: + case SC_TYPE_DELEG: + if (s->sc_status & SC_STATUS_REVOKED) { + s->sc_status |= SC_STATUS_CLOSED; + spin_unlock(&s->sc_lock); + dp = delegstateid(s); + if (s->sc_status & SC_STATUS_FREEABLE) + list_del_init(&dp->dl_recall_lru); + s->sc_status |= SC_STATUS_FREED; + spin_unlock(&cl->cl_lock); + nfs4_put_stid(s); + ret = nfs_ok; + goto out; + } ret = nfserr_locks_held; break; - case NFS4_OPEN_STID: + case SC_TYPE_OPEN: ret = check_stateid_generation(stateid, &s->sc_stateid, 1); if (ret) break; ret = nfserr_locks_held; break; - case NFS4_LOCK_STID: + case SC_TYPE_LOCK: spin_unlock(&s->sc_lock); 
refcount_inc(&s->sc_count); spin_unlock(&cl->cl_lock); ret = nfsd4_free_lock_stateid(stateid, s); goto out; - case NFS4_REVOKED_DELEG_STID: - spin_unlock(&s->sc_lock); - dp = delegstateid(s); - list_del_init(&dp->dl_recall_lru); - spin_unlock(&cl->cl_lock); - nfs4_put_stid(s); - ret = nfs_ok; - goto out; - /* Default falls through and returns nfserr_bad_stateid */ } spin_unlock(&s->sc_lock); out_unlock: @@ -5337,12 +7560,24 @@ static __be32 nfs4_seqid_op_checks(struct nfsd4_compound_state *cstate, stateid_ return status; } -/* - * Checks for sequence id mutating operations. +/** + * nfs4_preprocess_seqid_op - find and prep an ol_stateid for a seqid-morphing op + * @cstate: compund state + * @seqid: seqid (provided by client) + * @stateid: stateid (provided by client) + * @typemask: mask of allowable types for this operation + * @statusmask: mask of allowed states: 0 or STID_CLOSED + * @stpp: return pointer for the stateid found + * @nn: net namespace for request + * + * Given a stateid+seqid from a client, look up an nfs4_ol_stateid and + * return it in @stpp. On a nfs_ok return, the returned stateid will + * have its st_mutex locked. */ static __be32 nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid, - stateid_t *stateid, char typemask, + stateid_t *stateid, + unsigned short typemask, unsigned short statusmask, struct nfs4_ol_stateid **stpp, struct nfsd_net *nn) { @@ -5350,15 +7585,19 @@ nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid, struct nfs4_stid *s; struct nfs4_ol_stateid *stp = NULL; - dprintk("NFSD: %s: seqid=%d stateid = " STATEID_FMT "\n", __func__, - seqid, STATEID_VAL(stateid)); + trace_nfsd_preprocess(seqid, stateid); *stpp = NULL; - status = nfsd4_lookup_stateid(cstate, stateid, typemask, &s, nn); +retry: + status = nfsd4_lookup_stateid(cstate, stateid, + typemask, statusmask, &s, nn); if (status) return status; stp = openlockstateid(s); - nfsd4_cstate_assign_replay(cstate, stp->st_stateowner); + if (nfsd4_cstate_assign_replay(cstate, stp->st_stateowner) == -EAGAIN) { + nfs4_put_stateowner(stp->st_stateowner); + goto retry; + } status = nfs4_seqid_op_checks(cstate, stateid, seqid, stp); if (!status) @@ -5376,7 +7615,7 @@ static __be32 nfs4_preprocess_confirmed_seqid_op(struct nfsd4_compound_state *cs struct nfs4_ol_stateid *stp; status = nfs4_preprocess_seqid_op(cstate, seqid, stateid, - NFS4_OPEN_STID, &stp, nn); + SC_TYPE_OPEN, 0, &stp, nn); if (status) return status; oo = openowner(stp->st_stateowner); @@ -5407,8 +7646,8 @@ nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, return status; status = nfs4_preprocess_seqid_op(cstate, - oc->oc_seqid, &oc->oc_req_stateid, - NFS4_OPEN_STID, &stp, nn); + oc->oc_seqid, &oc->oc_req_stateid, + SC_TYPE_OPEN, 0, &stp, nn); if (status) goto out; oo = openowner(stp->st_stateowner); @@ -5420,9 +7659,7 @@ nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, oo->oo_flags |= NFS4_OO_CONFIRMED; nfs4_inc_and_copy_stateid(&oc->oc_resp_stateid, &stp->st_stid); mutex_unlock(&stp->st_mutex); - dprintk("NFSD: %s: success, seqid=%d stateid=" STATEID_FMT "\n", - __func__, oc->oc_seqid, STATEID_VAL(&stp->st_stid.sc_stateid)); - + trace_nfsd_open_confirm(oc->oc_seqid, &stp->st_stid.sc_stateid); nfsd4_client_record_create(oo->oo_owner.so_client); status = nfs_ok; put_stateid: @@ -5502,11 +7739,12 @@ out: return status; } -static void nfsd4_close_open_stateid(struct nfs4_ol_stateid *s) +static bool nfsd4_close_open_stateid(struct nfs4_ol_stateid *s) { 
struct nfs4_client *clp = s->st_stid.sc_client; bool unhashed; LIST_HEAD(reaplist); + struct nfs4_ol_stateid *stp; spin_lock(&clp->cl_lock); unhashed = unhash_open_stateid(s, &reaplist); @@ -5515,12 +7753,14 @@ static void nfsd4_close_open_stateid(struct nfs4_ol_stateid *s) if (unhashed) put_ol_stateid_locked(s, &reaplist); spin_unlock(&clp->cl_lock); + list_for_each_entry(stp, &reaplist, st_locks) + nfs4_free_cpntf_statelist(clp->net, &stp->st_stid); free_ol_stateid_reaplist(&reaplist); + return false; } else { spin_unlock(&clp->cl_lock); free_ol_stateid_reaplist(&reaplist); - if (unhashed) - move_to_close_lru(s, clp->net); + return unhashed; } } @@ -5536,19 +7776,22 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfs4_ol_stateid *stp; struct net *net = SVC_NET(rqstp); struct nfsd_net *nn = net_generic(net, nfsd_net_id); + bool need_move_to_close_list; - dprintk("NFSD: nfsd4_close on file %pd\n", + dprintk("NFSD: nfsd4_close on file %pd\n", cstate->current_fh.fh_dentry); status = nfs4_preprocess_seqid_op(cstate, close->cl_seqid, - &close->cl_stateid, - NFS4_OPEN_STID|NFS4_CLOSED_STID, - &stp, nn); + &close->cl_stateid, + SC_TYPE_OPEN, SC_STATUS_CLOSED, + &stp, nn); nfsd4_bump_seqid(cstate, status); if (status) - goto out; + goto out; - stp->st_stid.sc_type = NFS4_CLOSED_STID; + spin_lock(&stp->st_stid.sc_client->cl_lock); + stp->st_stid.sc_status |= SC_STATUS_CLOSED; + spin_unlock(&stp->st_stid.sc_client->cl_lock); /* * Technically we don't _really_ have to increment or copy it, since @@ -5558,8 +7801,10 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, */ nfs4_inc_and_copy_stateid(&close->cl_stateid, &stp->st_stid); - nfsd4_close_open_stateid(stp); + need_move_to_close_list = nfsd4_close_open_stateid(stp); mutex_unlock(&stp->st_mutex); + if (need_move_to_close_list) + move_to_close_lru(stp, net); /* v4.1+ suggests that we send a special stateid in here, since the * clients should just ignore this anyway. Since this is not useful @@ -5587,10 +7832,11 @@ nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, __be32 status; struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); - if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0))) + status = fh_verify(rqstp, &cstate->current_fh, 0, 0); + if (status) return status; - status = nfsd4_lookup_stateid(cstate, stateid, NFS4_DELEG_STID, &s, nn); + status = nfsd4_lookup_stateid(cstate, stateid, SC_TYPE_DELEG, SC_STATUS_REVOKED, &s, nn); if (status) goto out; dp = delegstateid(s); @@ -5598,22 +7844,16 @@ nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (status) goto put_stateid; + trace_nfsd_deleg_return(stateid); destroy_delegation(dp); + smp_mb__after_atomic(); + wake_up_var(d_inode(cstate->current_fh.fh_dentry)); put_stateid: nfs4_put_stid(&dp->dl_stid); out: return status; } -static inline u64 -end_offset(u64 start, u64 len) -{ - u64 end; - - end = start + len; - return end >= start ? 
end: NFS4_MAX_UINT64; -} - /* last octet in a range */ static inline u64 last_byte_offset(u64 start, u64 len) @@ -5643,7 +7883,7 @@ nfs4_transform_lock_offset(struct file_lock *lock) } static fl_owner_t -nfsd4_fl_get_owner(fl_owner_t owner) +nfsd4_lm_get_owner(fl_owner_t owner) { struct nfs4_lockowner *lo = (struct nfs4_lockowner *)owner; @@ -5652,7 +7892,7 @@ nfsd4_fl_get_owner(fl_owner_t owner) } static void -nfsd4_fl_put_owner(fl_owner_t owner) +nfsd4_lm_put_owner(fl_owner_t owner) { struct nfs4_lockowner *lo = (struct nfs4_lockowner *)owner; @@ -5660,10 +7900,33 @@ nfsd4_fl_put_owner(fl_owner_t owner) nfs4_put_stateowner(&lo->lo_owner); } +/* return true if the client that owns this lock is expirable */ +static bool +nfsd4_lm_lock_expirable(struct file_lock *cfl) +{ + struct nfs4_lockowner *lo = (struct nfs4_lockowner *) cfl->c.flc_owner; + struct nfs4_client *clp = lo->lo_owner.so_client; + struct nfsd_net *nn; + + if (try_to_expire_client(clp)) { + nn = net_generic(clp->net, nfsd_net_id); + mod_delayed_work(laundry_wq, &nn->laundromat_work, 0); + return true; + } + return false; +} + +/* schedule laundromat to run immediately and wait for it to complete */ +static void +nfsd4_lm_expire_lock(void) +{ + flush_workqueue(laundry_wq); +} + static void nfsd4_lm_notify(struct file_lock *fl) { - struct nfs4_lockowner *lo = (struct nfs4_lockowner *)fl->fl_owner; + struct nfs4_lockowner *lo = (struct nfs4_lockowner *) fl->c.flc_owner; struct net *net = lo->lo_owner.so_client->net; struct nfsd_net *nn = net_generic(net, nfsd_net_id); struct nfsd4_blocked_lock *nbl = container_of(fl, @@ -5679,14 +7942,19 @@ nfsd4_lm_notify(struct file_lock *fl) } spin_unlock(&nn->blocked_locks_lock); - if (queue) - nfsd4_run_cb(&nbl->nbl_cb); + if (queue) { + trace_nfsd_cb_notify_lock(lo, nbl); + nfsd4_try_run_cb(&nbl->nbl_cb); + } } static const struct lock_manager_operations nfsd_posix_mng_ops = { + .lm_mod_owner = THIS_MODULE, .lm_notify = nfsd4_lm_notify, - .lm_get_owner = nfsd4_fl_get_owner, - .lm_put_owner = nfsd4_fl_put_owner, + .lm_get_owner = nfsd4_lm_get_owner, + .lm_put_owner = nfsd4_lm_put_owner, + .lm_lock_expirable = nfsd4_lm_lock_expirable, + .lm_expire_lock = nfsd4_lm_expire_lock, }; static inline void @@ -5695,13 +7963,12 @@ nfs4_set_lock_denied(struct file_lock *fl, struct nfsd4_lock_denied *deny) struct nfs4_lockowner *lo; if (fl->fl_lmops == &nfsd_posix_mng_ops) { - lo = (struct nfs4_lockowner *) fl->fl_owner; - deny->ld_owner.data = kmemdup(lo->lo_owner.so_owner.data, - lo->lo_owner.so_owner.len, GFP_KERNEL); + lo = (struct nfs4_lockowner *) fl->c.flc_owner; + xdr_netobj_dup(&deny->ld_owner, &lo->lo_owner.so_owner, + GFP_KERNEL); if (!deny->ld_owner.data) /* We just don't care that much */ goto nevermind; - deny->ld_owner.len = lo->lo_owner.so_owner.len; deny->ld_clientid = lo->lo_owner.so_client->cl_clientid; } else { nevermind: @@ -5715,7 +7982,7 @@ nevermind: if (fl->fl_end != NFS4_MAX_UINT64) deny->ld_length = fl->fl_end - fl->fl_start + 1; deny->ld_type = NFS4_READ_LT; - if (fl->fl_type != F_RDLCK) + if (fl->c.flc_type != F_RDLCK) deny->ld_type = NFS4_WRITE_LT; } @@ -5801,21 +8068,21 @@ alloc_init_lock_stateowner(unsigned int strhashval, struct nfs4_client *clp, } static struct nfs4_ol_stateid * -find_lock_stateid(struct nfs4_lockowner *lo, struct nfs4_file *fp) +find_lock_stateid(const struct nfs4_lockowner *lo, + const struct nfs4_ol_stateid *ost) { struct nfs4_ol_stateid *lst; - struct nfs4_client *clp = lo->lo_owner.so_client; - lockdep_assert_held(&clp->cl_lock); +
lockdep_assert_held(&ost->st_stid.sc_client->cl_lock); - list_for_each_entry(lst, &lo->lo_owner.so_stateids, st_perstateowner) { - if (lst->st_stid.sc_type != NFS4_LOCK_STID) - continue; - if (lst->st_stid.sc_file == fp) { - refcount_inc(&lst->st_stid.sc_count); - return lst; + /* If ost is not hashed, ost->st_locks will not be valid */ + if (!nfs4_ol_stateid_unhashed(ost)) + list_for_each_entry(lst, &ost->st_locks, st_locks) { + if (lst->st_stateowner == &lo->lo_owner) { + refcount_inc(&lst->st_stid.sc_count); + return lst; + } } - } return NULL; } @@ -5831,35 +8098,39 @@ init_lock_stateid(struct nfs4_ol_stateid *stp, struct nfs4_lockowner *lo, mutex_lock_nested(&stp->st_mutex, OPEN_STATEID_MUTEX); retry: spin_lock(&clp->cl_lock); - spin_lock(&fp->fi_lock); - retstp = find_lock_stateid(lo, fp); + if (nfs4_ol_stateid_unhashed(open_stp)) + goto out_close; + retstp = find_lock_stateid(lo, open_stp); if (retstp) - goto out_unlock; - + goto out_found; refcount_inc(&stp->st_stid.sc_count); - stp->st_stid.sc_type = NFS4_LOCK_STID; + stp->st_stid.sc_type = SC_TYPE_LOCK; stp->st_stateowner = nfs4_get_stateowner(&lo->lo_owner); get_nfs4_file(fp); stp->st_stid.sc_file = fp; stp->st_access_bmap = 0; stp->st_deny_bmap = open_stp->st_deny_bmap; stp->st_openstp = open_stp; + spin_lock(&fp->fi_lock); list_add(&stp->st_locks, &open_stp->st_locks); list_add(&stp->st_perstateowner, &lo->lo_owner.so_stateids); list_add(&stp->st_perfile, &fp->fi_stateids); -out_unlock: spin_unlock(&fp->fi_lock); spin_unlock(&clp->cl_lock); - if (retstp) { - if (nfsd4_lock_ol_stateid(retstp) != nfs_ok) { - nfs4_put_stid(&retstp->st_stid); - goto retry; - } - /* To keep mutex tracking happy */ - mutex_unlock(&stp->st_mutex); - stp = retstp; - } return stp; +out_found: + spin_unlock(&clp->cl_lock); + if (nfsd4_lock_ol_stateid(retstp) != nfs_ok) { + nfs4_put_stid(&retstp->st_stid); + goto retry; + } + /* To keep mutex tracking happy */ + mutex_unlock(&stp->st_mutex); + return retstp; +out_close: + spin_unlock(&clp->cl_lock); + mutex_unlock(&stp->st_mutex); + return NULL; } static struct nfs4_ol_stateid * @@ -5874,7 +8145,7 @@ find_or_create_lock_stateid(struct nfs4_lockowner *lo, struct nfs4_file *fi, *new = false; spin_lock(&clp->cl_lock); - lst = find_lock_stateid(lo, fi); + lst = find_lock_stateid(lo, ost); spin_unlock(&clp->cl_lock); if (lst != NULL) { if (nfsd4_lock_ol_stateid(lst) == nfs_ok) @@ -5968,7 +8239,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfs4_ol_stateid *lock_stp = NULL; struct nfs4_ol_stateid *open_stp = NULL; struct nfs4_file *fp; - struct file *filp = NULL; + struct nfsd_file *nf = NULL; struct nfsd4_blocked_lock *nbl = NULL; struct file_lock *file_lock = NULL; struct file_lock *conflock = NULL; @@ -5976,8 +8247,8 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, int lkflg; int err; bool new = false; - unsigned char fl_type; - unsigned int fl_flags = FL_POSIX; + unsigned char type; + unsigned int flags = FL_POSIX; struct net *net = SVC_NET(rqstp); struct nfsd_net *nn = net_generic(net, nfsd_net_id); @@ -5988,23 +8259,21 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (check_lock_length(lock->lk_offset, lock->lk_length)) return nfserr_inval; - if ((status = fh_verify(rqstp, &cstate->current_fh, - S_IFREG, NFSD_MAY_LOCK))) { - dprintk("NFSD: nfsd4_lock: permission denied!\n"); + status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0); + if (status != nfs_ok) return status; + if 
(exportfs_cannot_lock(cstate->current_fh.fh_dentry->d_sb->s_export_op)) { + status = nfserr_notsupp; + goto out; } if (lock->lk_is_new) { if (nfsd4_has_session(cstate)) /* See rfc 5661 18.10.3: given clientid is ignored: */ memcpy(&lock->lk_new_clientid, - &cstate->session->se_client->cl_clientid, + &cstate->clp->cl_clientid, sizeof(clientid_t)); - status = nfserr_stale_clientid; - if (STALE_CLIENTID(&lock->lk_new_clientid, nn)) - goto out; - /* validate and update open stateid and open seqid */ status = nfs4_preprocess_confirmed_seqid_op(cstate, lock->lk_new_open_seqid, @@ -6022,9 +8291,10 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, &lock_stp, &new); } else { status = nfs4_preprocess_seqid_op(cstate, - lock->lk_old_lock_seqid, - &lock->lk_old_lock_stateid, - NFS4_LOCK_STID, &lock_stp, nn); + lock->lk_old_lock_seqid, + &lock->lk_old_lock_stateid, + SC_TYPE_LOCK, 0, &lock_stp, + nn); } if (status) goto out; @@ -6042,42 +8312,46 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (!locks_in_grace(net) && lock->lk_reclaim) goto out; + if (lock->lk_reclaim) + flags |= FL_RECLAIM; + fp = lock_stp->st_stid.sc_file; switch (lock->lk_type) { case NFS4_READW_LT: - if (nfsd4_has_session(cstate)) - fl_flags |= FL_SLEEP; - /* Fallthrough */ + fallthrough; case NFS4_READ_LT: spin_lock(&fp->fi_lock); - filp = find_readable_file_locked(fp); - if (filp) + nf = find_readable_file_locked(fp); + if (nf) get_lock_access(lock_stp, NFS4_SHARE_ACCESS_READ); spin_unlock(&fp->fi_lock); - fl_type = F_RDLCK; + type = F_RDLCK; break; case NFS4_WRITEW_LT: - if (nfsd4_has_session(cstate)) - fl_flags |= FL_SLEEP; - /* Fallthrough */ + fallthrough; case NFS4_WRITE_LT: spin_lock(&fp->fi_lock); - filp = find_writeable_file_locked(fp); - if (filp) + nf = find_writeable_file_locked(fp); + if (nf) get_lock_access(lock_stp, NFS4_SHARE_ACCESS_WRITE); spin_unlock(&fp->fi_lock); - fl_type = F_WRLCK; + type = F_WRLCK; break; default: status = nfserr_inval; goto out; } - if (!filp) { + if (!nf) { status = nfserr_openmode; goto out; } + if (lock->lk_type & (NFS4_READW_LT | NFS4_WRITEW_LT) && + nfsd4_has_session(cstate) && + locks_can_async_lock(nf->nf_file->f_op)) + flags |= FL_SLEEP; + nbl = find_or_allocate_block(lock_sop, &fp->fi_fhandle, nn); if (!nbl) { dprintk("NFSD: %s: unable to allocate block!\n", __func__); @@ -6086,11 +8360,11 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, } file_lock = &nbl->nbl_lock; - file_lock->fl_type = fl_type; - file_lock->fl_owner = (fl_owner_t)lockowner(nfs4_get_stateowner(&lock_sop->lo_owner)); - file_lock->fl_pid = current->tgid; - file_lock->fl_file = filp; - file_lock->fl_flags = fl_flags; + file_lock->c.flc_type = type; + file_lock->c.flc_owner = (fl_owner_t)lockowner(nfs4_get_stateowner(&lock_sop->lo_owner)); + file_lock->c.flc_pid = current->tgid; + file_lock->c.flc_file = nf->nf_file; + file_lock->c.flc_flags = flags; file_lock->fl_lmops = &nfsd_posix_mng_ops; file_lock->fl_start = lock->lk_offset; file_lock->fl_end = last_byte_offset(lock->lk_offset, lock->lk_length); @@ -6103,15 +8377,16 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, goto out; } - if (fl_flags & FL_SLEEP) { - nbl->nbl_time = jiffies; + if (flags & FL_SLEEP) { + nbl->nbl_time = ktime_get_boottime_seconds(); spin_lock(&nn->blocked_locks_lock); list_add_tail(&nbl->nbl_list, &lock_sop->lo_blocked); list_add_tail(&nbl->nbl_lru, &nn->blocked_locks_lru); + kref_get(&nbl->nbl_kref); 
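+		/* The lists now hold their own reference to the blocked lock; it is dropped again when the block is dequeued (see the out: label below). */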
spin_unlock(&nn->blocked_locks_lock); } - err = vfs_lock_file(filp, F_SETLK, file_lock, conflock); + err = vfs_lock_file(nf->nf_file, F_SETLK, file_lock, conflock); switch (err) { case 0: /* success! */ nfs4_inc_and_copy_stateid(&lock->lk_resp_stateid, &lock_stp->st_stid); @@ -6120,8 +8395,9 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, nn->somebody_reclaimed = true; break; case FILE_LOCK_DEFERRED: + kref_put(&nbl->nbl_kref, free_nbl); nbl = NULL; - /* Fallthrough */ + fallthrough; case -EAGAIN: /* conflock holds conflicting lock */ status = nfserr_denied; dprintk("NFSD: nfsd4_lock: conflicting lock found!\n"); @@ -6138,16 +8414,21 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, out: if (nbl) { /* dequeue it if we queued it before */ - if (fl_flags & FL_SLEEP) { + if (flags & FL_SLEEP) { spin_lock(&nn->blocked_locks_lock); - list_del_init(&nbl->nbl_list); - list_del_init(&nbl->nbl_lru); + if (!list_empty(&nbl->nbl_list) && + !list_empty(&nbl->nbl_lru)) { + list_del_init(&nbl->nbl_list); + list_del_init(&nbl->nbl_lru); + kref_put(&nbl->nbl_kref, free_nbl); + } + /* nbl can use one of lists to be linked to reaplist */ spin_unlock(&nn->blocked_locks_lock); } free_blocked_lock(nbl); } - if (filp) - fput(filp); + if (nf) + nfsd_file_put(nf); if (lock_stp) { /* Bump seqid manually if the 4.0 replay owner is openowner */ if (cstate->replay_owner && @@ -6174,20 +8455,39 @@ out: return status; } +void nfsd4_lock_release(union nfsd4_op_u *u) +{ + struct nfsd4_lock *lock = &u->lock; + struct nfsd4_lock_denied *deny = &lock->lk_denied; + + kfree(deny->ld_owner.data); +} + /* * The NFSv4 spec allows a client to do a LOCKT without holding an OPEN, * so we do a temporary open here just to get an open file to pass to - * vfs_test_lock. (Arguably perhaps test_lock should be done with an - * inode operation.) + * vfs_test_lock. 
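+ * A struct nfsd_file is acquired from the file cache, and the inode is kept locked across the lease break and vfs_test_lock() so that no new lease can be set up in between.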
*/ static __be32 nfsd_test_lock(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file_lock *lock) { - struct file *file; - __be32 err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_READ, &file); - if (!err) { - err = nfserrno(vfs_test_lock(file, lock)); - fput(file); - } + struct nfsd_file *nf; + struct inode *inode; + __be32 err; + + err = nfsd_file_acquire(rqstp, fhp, NFSD_MAY_READ, &nf); + if (err) + return err; + inode = fhp->fh_dentry->d_inode; + inode_lock(inode); /* to block new leases till after test_lock: */ + err = nfserrno(nfsd_open_break_lease(inode, NFSD_MAY_READ)); + if (err) + goto out; + lock->c.flc_file = nf->nf_file; + err = nfserrno(vfs_test_lock(nf->nf_file, lock)); + lock->c.flc_file = NULL; +out: + inode_unlock(inode); + nfsd_file_put(nf); return err; } @@ -6211,7 +8511,7 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, return nfserr_inval; if (!nfsd4_has_session(cstate)) { - status = lookup_clientid(&lockt->lt_clientid, cstate, nn); + status = set_client(&lockt->lt_clientid, cstate, nn); if (status) goto out; } @@ -6229,11 +8529,11 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, switch (lockt->lt_type) { case NFS4_READ_LT: case NFS4_READW_LT: - file_lock->fl_type = F_RDLCK; + file_lock->c.flc_type = F_RDLCK; break; case NFS4_WRITE_LT: case NFS4_WRITEW_LT: - file_lock->fl_type = F_WRLCK; + file_lock->c.flc_type = F_WRLCK; break; default: dprintk("NFSD: nfs4_lockt: bad lock type!\n"); @@ -6243,9 +8543,9 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, lo = find_lockowner_str(cstate->clp, &lockt->lt_owner); if (lo) - file_lock->fl_owner = (fl_owner_t)lo; - file_lock->fl_pid = current->tgid; - file_lock->fl_flags = FL_POSIX; + file_lock->c.flc_owner = (fl_owner_t)lo; + file_lock->c.flc_pid = current->tgid; + file_lock->c.flc_flags = FL_POSIX; file_lock->fl_start = lockt->lt_offset; file_lock->fl_end = last_byte_offset(lockt->lt_offset, lockt->lt_length); @@ -6256,7 +8556,7 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (status) goto out; - if (file_lock->fl_type != F_UNLCK) { + if (file_lock->c.flc_type != F_UNLCK) { status = nfserr_denied; nfs4_set_lock_denied(file_lock, &lockt->lt_denied); } @@ -6268,13 +8568,21 @@ out: return status; } +void nfsd4_lockt_release(union nfsd4_op_u *u) +{ + struct nfsd4_lockt *lockt = &u->lockt; + struct nfsd4_lock_denied *deny = &lockt->lt_denied; + + kfree(deny->ld_owner.data); +} + __be32 nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) { struct nfsd4_locku *locku = &u->locku; struct nfs4_ol_stateid *stp; - struct file *filp = NULL; + struct nfsd_file *nf = NULL; struct file_lock *file_lock = NULL; __be32 status; int err; @@ -6288,27 +8596,32 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, return nfserr_inval; status = nfs4_preprocess_seqid_op(cstate, locku->lu_seqid, - &locku->lu_stateid, NFS4_LOCK_STID, - &stp, nn); + &locku->lu_stateid, SC_TYPE_LOCK, 0, + &stp, nn); if (status) goto out; - filp = find_any_file(stp->st_stid.sc_file); - if (!filp) { + nf = find_any_file(stp->st_stid.sc_file); + if (!nf) { status = nfserr_lock_range; goto put_stateid; } + if (exportfs_cannot_lock(nf->nf_file->f_path.mnt->mnt_sb->s_export_op)) { + status = nfserr_notsupp; + goto put_file; + } + file_lock = locks_alloc_lock(); if (!file_lock) { dprintk("NFSD: %s: unable to allocate lock!\n", __func__); status = nfserr_jukebox; - goto fput; + goto put_file; } - 
file_lock->fl_type = F_UNLCK; - file_lock->fl_owner = (fl_owner_t)lockowner(nfs4_get_stateowner(stp->st_stateowner)); - file_lock->fl_pid = current->tgid; - file_lock->fl_file = filp; - file_lock->fl_flags = FL_POSIX; + file_lock->c.flc_type = F_UNLCK; + file_lock->c.flc_owner = (fl_owner_t)lockowner(nfs4_get_stateowner(stp->st_stateowner)); + file_lock->c.flc_pid = current->tgid; + file_lock->c.flc_file = nf->nf_file; + file_lock->c.flc_flags = FL_POSIX; file_lock->fl_lmops = &nfsd_posix_mng_ops; file_lock->fl_start = locku->lu_offset; @@ -6316,14 +8629,14 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, locku->lu_length); nfs4_transform_lock_offset(file_lock); - err = vfs_lock_file(filp, F_SETLK, file_lock, NULL); + err = vfs_lock_file(nf->nf_file, F_SETLK, file_lock, NULL); if (err) { dprintk("NFSD: nfs4_locku: vfs_lock_file failed!\n"); goto out_nfserr; } nfs4_inc_and_copy_stateid(&locku->lu_stateid, &stp->st_stid); -fput: - fput(filp); +put_file: + nfsd_file_put(nf); put_stateid: mutex_unlock(&stp->st_mutex); nfs4_put_stid(&stp->st_stid); @@ -6335,7 +8648,7 @@ out: out_nfserr: status = nfserrno(err); - goto fput; + goto put_file; } /* @@ -6348,98 +8661,101 @@ check_for_locks(struct nfs4_file *fp, struct nfs4_lockowner *lowner) { struct file_lock *fl; int status = false; - struct file *filp = find_any_file(fp); + struct nfsd_file *nf; struct inode *inode; struct file_lock_context *flctx; - if (!filp) { + spin_lock(&fp->fi_lock); + nf = find_any_file_locked(fp); + if (!nf) { /* Any valid lock stateid should have some sort of access */ WARN_ON_ONCE(1); - return status; + goto out; } - inode = locks_inode(filp); - flctx = inode->i_flctx; + inode = file_inode(nf->nf_file); + flctx = locks_inode_context(inode); if (flctx && !list_empty_careful(&flctx->flc_posix)) { spin_lock(&flctx->flc_lock); - list_for_each_entry(fl, &flctx->flc_posix, fl_list) { - if (fl->fl_owner == (fl_owner_t)lowner) { + for_each_file_lock(fl, &flctx->flc_posix) { + if (fl->c.flc_owner == (fl_owner_t)lowner) { status = true; break; } } spin_unlock(&flctx->flc_lock); } - fput(filp); +out: + spin_unlock(&fp->fi_lock); return status; } +/** + * nfsd4_release_lockowner - process NFSv4.0 RELEASE_LOCKOWNER operations + * @rqstp: RPC transaction + * @cstate: NFSv4 COMPOUND state + * @u: RELEASE_LOCKOWNER arguments + * + * Check if there are any locks still held and if not, free the lockowner + * and any lock state that is owned. 
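+ * RELEASE_LOCKOWNER is an NFSv4.0-only operation; NFSv4.1 and later clients free unused lock state with FREE_STATEID instead.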
+ * + * Return values: + * %nfs_ok: lockowner released or not found + * %nfserr_locks_held: lockowner still in use + * %nfserr_stale_clientid: clientid no longer active + * %nfserr_expired: clientid not recognized + */ __be32 nfsd4_release_lockowner(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) { struct nfsd4_release_lockowner *rlockowner = &u->release_lockowner; + struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); clientid_t *clid = &rlockowner->rl_clientid; - struct nfs4_stateowner *sop; - struct nfs4_lockowner *lo = NULL; struct nfs4_ol_stateid *stp; - struct xdr_netobj *owner = &rlockowner->rl_owner; - unsigned int hashval = ownerstr_hashval(owner); - __be32 status; - struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); + struct nfs4_lockowner *lo; struct nfs4_client *clp; - LIST_HEAD (reaplist); + LIST_HEAD(reaplist); + __be32 status; dprintk("nfsd4_release_lockowner clientid: (%08x/%08x):\n", clid->cl_boot, clid->cl_id); - status = lookup_clientid(clid, cstate, nn); + status = set_client(clid, cstate, nn); if (status) return status; - clp = cstate->clp; - /* Find the matching lock stateowner */ - spin_lock(&clp->cl_lock); - list_for_each_entry(sop, &clp->cl_ownerstr_hashtbl[hashval], - so_strhash) { - - if (sop->so_is_open_owner || !same_owner_str(sop, owner)) - continue; - - /* see if there are still any locks associated with it */ - lo = lockowner(sop); - list_for_each_entry(stp, &sop->so_stateids, st_perstateowner) { - if (check_for_locks(stp->st_stid.sc_file, lo)) { - status = nfserr_locks_held; - spin_unlock(&clp->cl_lock); - return status; - } - } - nfs4_get_stateowner(sop); - break; - } + spin_lock(&clp->cl_lock); + lo = find_lockowner_str_locked(clp, &rlockowner->rl_owner); if (!lo) { spin_unlock(&clp->cl_lock); - return status; + return nfs_ok; } + list_for_each_entry(stp, &lo->lo_owner.so_stateids, st_perstateowner) { + if (check_for_locks(stp->st_stid.sc_file, lo)) { + spin_unlock(&clp->cl_lock); + nfs4_put_stateowner(&lo->lo_owner); + return nfserr_locks_held; + } + } unhash_lockowner_locked(lo); while (!list_empty(&lo->lo_owner.so_stateids)) { stp = list_first_entry(&lo->lo_owner.so_stateids, struct nfs4_ol_stateid, st_perstateowner); - WARN_ON(!unhash_lock_stateid(stp)); + unhash_lock_stateid(stp); put_ol_stateid_locked(stp, &reaplist); } spin_unlock(&clp->cl_lock); + free_ol_stateid_reaplist(&reaplist); remove_blocked_locks(lo); nfs4_put_stateowner(&lo->lo_owner); - - return status; + return nfs_ok; } static inline struct nfs4_client_reclaim * @@ -6449,7 +8765,7 @@ alloc_reclaim(void) } bool -nfs4_has_reclaimed_state(const char *name, struct nfsd_net *nn) +nfs4_has_reclaimed_state(struct xdr_netobj name, struct nfsd_net *nn) { struct nfs4_client_reclaim *crp; @@ -6461,20 +8777,42 @@ nfs4_has_reclaimed_state(const char *name, struct nfsd_net *nn) * failure => all reset bets are off, nfserr_no_grace... 
*/ struct nfs4_client_reclaim * -nfs4_client_to_reclaim(const char *name, struct nfsd_net *nn) +nfs4_client_to_reclaim(struct xdr_netobj name, struct xdr_netobj princhash, + struct nfsd_net *nn) { unsigned int strhashval; struct nfs4_client_reclaim *crp; - dprintk("NFSD nfs4_client_to_reclaim NAME: %.*s\n", HEXDIR_LEN, name); + name.data = kmemdup(name.data, name.len, GFP_KERNEL); + if (!name.data) { + dprintk("%s: failed to allocate memory for name.data!\n", + __func__); + return NULL; + } + if (princhash.len) { + princhash.data = kmemdup(princhash.data, princhash.len, GFP_KERNEL); + if (!princhash.data) { + dprintk("%s: failed to allocate memory for princhash.data!\n", + __func__); + kfree(name.data); + return NULL; + } + } else + princhash.data = NULL; crp = alloc_reclaim(); if (crp) { strhashval = clientstr_hashval(name); INIT_LIST_HEAD(&crp->cr_strhash); list_add(&crp->cr_strhash, &nn->reclaim_str_hashtbl[strhashval]); - memcpy(crp->cr_recdir, name, HEXDIR_LEN); + crp->cr_name.data = name.data; + crp->cr_name.len = name.len; + crp->cr_princhash.data = princhash.data; + crp->cr_princhash.len = princhash.len; crp->cr_clp = NULL; nn->reclaim_str_hashtbl_size++; + } else { + kfree(name.data); + kfree(princhash.data); } return crp; } @@ -6483,6 +8821,8 @@ void nfs4_remove_reclaim_record(struct nfs4_client_reclaim *crp, struct nfsd_net *nn) { list_del(&crp->cr_strhash); + kfree(crp->cr_name.data); + kfree(crp->cr_princhash.data); kfree(crp); nn->reclaim_str_hashtbl_size--; } @@ -6506,636 +8846,32 @@ nfs4_release_reclaim(struct nfsd_net *nn) /* * called from OPEN, CLAIM_PREVIOUS with a new clientid. */ struct nfs4_client_reclaim * -nfsd4_find_reclaim_client(const char *recdir, struct nfsd_net *nn) +nfsd4_find_reclaim_client(struct xdr_netobj name, struct nfsd_net *nn) { unsigned int strhashval; struct nfs4_client_reclaim *crp = NULL; - dprintk("NFSD: nfs4_find_reclaim_client for recdir %s\n", recdir); - - strhashval = clientstr_hashval(recdir); + strhashval = clientstr_hashval(name); list_for_each_entry(crp, &nn->reclaim_str_hashtbl[strhashval], cr_strhash) { - if (same_name(crp->cr_recdir, recdir)) { + if (compare_blob(&crp->cr_name, &name) == 0) { return crp; } } return NULL; } -/* -* Called from OPEN. Look for clientid in reclaim list. 
-*/ __be32 -nfs4_check_open_reclaim(clientid_t *clid, - struct nfsd4_compound_state *cstate, - struct nfsd_net *nn) +nfs4_check_open_reclaim(struct nfs4_client *clp) { - __be32 status; - - /* find clientid in conf_id_hashtbl */ - status = lookup_clientid(clid, cstate, nn); - if (status) - return nfserr_reclaim_bad; - - if (test_bit(NFSD4_CLIENT_RECLAIM_COMPLETE, &cstate->clp->cl_flags)) + if (test_bit(NFSD4_CLIENT_RECLAIM_COMPLETE, &clp->cl_flags)) return nfserr_no_grace; - if (nfsd4_client_record_check(cstate->clp)) + if (nfsd4_client_record_check(clp)) return nfserr_reclaim_bad; return nfs_ok; } -#ifdef CONFIG_NFSD_FAULT_INJECTION -static inline void -put_client(struct nfs4_client *clp) -{ - atomic_dec(&clp->cl_refcount); -} - -static struct nfs4_client * -nfsd_find_client(struct sockaddr_storage *addr, size_t addr_size) -{ - struct nfs4_client *clp; - struct nfsd_net *nn = net_generic(current->nsproxy->net_ns, - nfsd_net_id); - - if (!nfsd_netns_ready(nn)) - return NULL; - - list_for_each_entry(clp, &nn->client_lru, cl_lru) { - if (memcmp(&clp->cl_addr, addr, addr_size) == 0) - return clp; - } - return NULL; -} - -u64 -nfsd_inject_print_clients(void) -{ - struct nfs4_client *clp; - u64 count = 0; - struct nfsd_net *nn = net_generic(current->nsproxy->net_ns, - nfsd_net_id); - char buf[INET6_ADDRSTRLEN]; - - if (!nfsd_netns_ready(nn)) - return 0; - - spin_lock(&nn->client_lock); - list_for_each_entry(clp, &nn->client_lru, cl_lru) { - rpc_ntop((struct sockaddr *)&clp->cl_addr, buf, sizeof(buf)); - pr_info("NFS Client: %s\n", buf); - ++count; - } - spin_unlock(&nn->client_lock); - - return count; -} - -u64 -nfsd_inject_forget_client(struct sockaddr_storage *addr, size_t addr_size) -{ - u64 count = 0; - struct nfs4_client *clp; - struct nfsd_net *nn = net_generic(current->nsproxy->net_ns, - nfsd_net_id); - - if (!nfsd_netns_ready(nn)) - return count; - - spin_lock(&nn->client_lock); - clp = nfsd_find_client(addr, addr_size); - if (clp) { - if (mark_client_expired_locked(clp) == nfs_ok) - ++count; - else - clp = NULL; - } - spin_unlock(&nn->client_lock); - - if (clp) - expire_client(clp); - - return count; -} - -u64 -nfsd_inject_forget_clients(u64 max) -{ - u64 count = 0; - struct nfs4_client *clp, *next; - struct nfsd_net *nn = net_generic(current->nsproxy->net_ns, - nfsd_net_id); - LIST_HEAD(reaplist); - - if (!nfsd_netns_ready(nn)) - return count; - - spin_lock(&nn->client_lock); - list_for_each_entry_safe(clp, next, &nn->client_lru, cl_lru) { - if (mark_client_expired_locked(clp) == nfs_ok) { - list_add(&clp->cl_lru, &reaplist); - if (max != 0 && ++count >= max) - break; - } - } - spin_unlock(&nn->client_lock); - - list_for_each_entry_safe(clp, next, &reaplist, cl_lru) - expire_client(clp); - - return count; -} - -static void nfsd_print_count(struct nfs4_client *clp, unsigned int count, - const char *type) -{ - char buf[INET6_ADDRSTRLEN]; - rpc_ntop((struct sockaddr *)&clp->cl_addr, buf, sizeof(buf)); - printk(KERN_INFO "NFS Client: %s has %u %s\n", buf, count, type); -} - -static void -nfsd_inject_add_lock_to_list(struct nfs4_ol_stateid *lst, - struct list_head *collect) -{ - struct nfs4_client *clp = lst->st_stid.sc_client; - struct nfsd_net *nn = net_generic(current->nsproxy->net_ns, - nfsd_net_id); - - if (!collect) - return; - - lockdep_assert_held(&nn->client_lock); - atomic_inc(&clp->cl_refcount); - list_add(&lst->st_locks, collect); -} - -static u64 nfsd_foreach_client_lock(struct nfs4_client *clp, u64 max, - struct list_head *collect, - bool (*func)(struct nfs4_ol_stateid *)) -{ 
- struct nfs4_openowner *oop; - struct nfs4_ol_stateid *stp, *st_next; - struct nfs4_ol_stateid *lst, *lst_next; - u64 count = 0; - - spin_lock(&clp->cl_lock); - list_for_each_entry(oop, &clp->cl_openowners, oo_perclient) { - list_for_each_entry_safe(stp, st_next, - &oop->oo_owner.so_stateids, st_perstateowner) { - list_for_each_entry_safe(lst, lst_next, - &stp->st_locks, st_locks) { - if (func) { - if (func(lst)) - nfsd_inject_add_lock_to_list(lst, - collect); - } - ++count; - /* - * Despite the fact that these functions deal - * with 64-bit integers for "count", we must - * ensure that it doesn't blow up the - * clp->cl_refcount. Throw a warning if we - * start to approach INT_MAX here. - */ - WARN_ON_ONCE(count == (INT_MAX / 2)); - if (count == max) - goto out; - } - } - } -out: - spin_unlock(&clp->cl_lock); - - return count; -} - -static u64 -nfsd_collect_client_locks(struct nfs4_client *clp, struct list_head *collect, - u64 max) -{ - return nfsd_foreach_client_lock(clp, max, collect, unhash_lock_stateid); -} - -static u64 -nfsd_print_client_locks(struct nfs4_client *clp) -{ - u64 count = nfsd_foreach_client_lock(clp, 0, NULL, NULL); - nfsd_print_count(clp, count, "locked files"); - return count; -} - -u64 -nfsd_inject_print_locks(void) -{ - struct nfs4_client *clp; - u64 count = 0; - struct nfsd_net *nn = net_generic(current->nsproxy->net_ns, - nfsd_net_id); - - if (!nfsd_netns_ready(nn)) - return 0; - - spin_lock(&nn->client_lock); - list_for_each_entry(clp, &nn->client_lru, cl_lru) - count += nfsd_print_client_locks(clp); - spin_unlock(&nn->client_lock); - - return count; -} - -static void -nfsd_reap_locks(struct list_head *reaplist) -{ - struct nfs4_client *clp; - struct nfs4_ol_stateid *stp, *next; - - list_for_each_entry_safe(stp, next, reaplist, st_locks) { - list_del_init(&stp->st_locks); - clp = stp->st_stid.sc_client; - nfs4_put_stid(&stp->st_stid); - put_client(clp); - } -} - -u64 -nfsd_inject_forget_client_locks(struct sockaddr_storage *addr, size_t addr_size) -{ - unsigned int count = 0; - struct nfs4_client *clp; - struct nfsd_net *nn = net_generic(current->nsproxy->net_ns, - nfsd_net_id); - LIST_HEAD(reaplist); - - if (!nfsd_netns_ready(nn)) - return count; - - spin_lock(&nn->client_lock); - clp = nfsd_find_client(addr, addr_size); - if (clp) - count = nfsd_collect_client_locks(clp, &reaplist, 0); - spin_unlock(&nn->client_lock); - nfsd_reap_locks(&reaplist); - return count; -} - -u64 -nfsd_inject_forget_locks(u64 max) -{ - u64 count = 0; - struct nfs4_client *clp; - struct nfsd_net *nn = net_generic(current->nsproxy->net_ns, - nfsd_net_id); - LIST_HEAD(reaplist); - - if (!nfsd_netns_ready(nn)) - return count; - - spin_lock(&nn->client_lock); - list_for_each_entry(clp, &nn->client_lru, cl_lru) { - count += nfsd_collect_client_locks(clp, &reaplist, max - count); - if (max != 0 && count >= max) - break; - } - spin_unlock(&nn->client_lock); - nfsd_reap_locks(&reaplist); - return count; -} - -static u64 -nfsd_foreach_client_openowner(struct nfs4_client *clp, u64 max, - struct list_head *collect, - void (*func)(struct nfs4_openowner *)) -{ - struct nfs4_openowner *oop, *next; - struct nfsd_net *nn = net_generic(current->nsproxy->net_ns, - nfsd_net_id); - u64 count = 0; - - lockdep_assert_held(&nn->client_lock); - - spin_lock(&clp->cl_lock); - list_for_each_entry_safe(oop, next, &clp->cl_openowners, oo_perclient) { - if (func) { - func(oop); - if (collect) { - atomic_inc(&clp->cl_refcount); - list_add(&oop->oo_perclient, collect); - } - } - ++count; - /* - * Despite the fact 
that these functions deal with - * 64-bit integers for "count", we must ensure that - * it doesn't blow up the clp->cl_refcount. Throw a - * warning if we start to approach INT_MAX here. - */ - WARN_ON_ONCE(count == (INT_MAX / 2)); - if (count == max) - break; - } - spin_unlock(&clp->cl_lock); - - return count; -} - -static u64 -nfsd_print_client_openowners(struct nfs4_client *clp) -{ - u64 count = nfsd_foreach_client_openowner(clp, 0, NULL, NULL); - - nfsd_print_count(clp, count, "openowners"); - return count; -} - -static u64 -nfsd_collect_client_openowners(struct nfs4_client *clp, - struct list_head *collect, u64 max) -{ - return nfsd_foreach_client_openowner(clp, max, collect, - unhash_openowner_locked); -} - -u64 -nfsd_inject_print_openowners(void) -{ - struct nfs4_client *clp; - u64 count = 0; - struct nfsd_net *nn = net_generic(current->nsproxy->net_ns, - nfsd_net_id); - - if (!nfsd_netns_ready(nn)) - return 0; - - spin_lock(&nn->client_lock); - list_for_each_entry(clp, &nn->client_lru, cl_lru) - count += nfsd_print_client_openowners(clp); - spin_unlock(&nn->client_lock); - - return count; -} - -static void -nfsd_reap_openowners(struct list_head *reaplist) -{ - struct nfs4_client *clp; - struct nfs4_openowner *oop, *next; - - list_for_each_entry_safe(oop, next, reaplist, oo_perclient) { - list_del_init(&oop->oo_perclient); - clp = oop->oo_owner.so_client; - release_openowner(oop); - put_client(clp); - } -} - -u64 -nfsd_inject_forget_client_openowners(struct sockaddr_storage *addr, - size_t addr_size) -{ - unsigned int count = 0; - struct nfs4_client *clp; - struct nfsd_net *nn = net_generic(current->nsproxy->net_ns, - nfsd_net_id); - LIST_HEAD(reaplist); - - if (!nfsd_netns_ready(nn)) - return count; - - spin_lock(&nn->client_lock); - clp = nfsd_find_client(addr, addr_size); - if (clp) - count = nfsd_collect_client_openowners(clp, &reaplist, 0); - spin_unlock(&nn->client_lock); - nfsd_reap_openowners(&reaplist); - return count; -} - -u64 -nfsd_inject_forget_openowners(u64 max) -{ - u64 count = 0; - struct nfs4_client *clp; - struct nfsd_net *nn = net_generic(current->nsproxy->net_ns, - nfsd_net_id); - LIST_HEAD(reaplist); - - if (!nfsd_netns_ready(nn)) - return count; - - spin_lock(&nn->client_lock); - list_for_each_entry(clp, &nn->client_lru, cl_lru) { - count += nfsd_collect_client_openowners(clp, &reaplist, - max - count); - if (max != 0 && count >= max) - break; - } - spin_unlock(&nn->client_lock); - nfsd_reap_openowners(&reaplist); - return count; -} - -static u64 nfsd_find_all_delegations(struct nfs4_client *clp, u64 max, - struct list_head *victims) -{ - struct nfs4_delegation *dp, *next; - struct nfsd_net *nn = net_generic(current->nsproxy->net_ns, - nfsd_net_id); - u64 count = 0; - - lockdep_assert_held(&nn->client_lock); - - spin_lock(&state_lock); - list_for_each_entry_safe(dp, next, &clp->cl_delegations, dl_perclnt) { - if (victims) { - /* - * It's not safe to mess with delegations that have a - * non-zero dl_time. They might have already been broken - * and could be processed by the laundromat outside of - * the state_lock. Just leave them be. - */ - if (dp->dl_time != 0) - continue; - - atomic_inc(&clp->cl_refcount); - WARN_ON(!unhash_delegation_locked(dp)); - list_add(&dp->dl_recall_lru, victims); - } - ++count; - /* - * Despite the fact that these functions deal with - * 64-bit integers for "count", we must ensure that - * it doesn't blow up the clp->cl_refcount. Throw a - * warning if we start to approach INT_MAX here. 
- */ - WARN_ON_ONCE(count == (INT_MAX / 2)); - if (count == max) - break; - } - spin_unlock(&state_lock); - return count; -} - -static u64 -nfsd_print_client_delegations(struct nfs4_client *clp) -{ - u64 count = nfsd_find_all_delegations(clp, 0, NULL); - - nfsd_print_count(clp, count, "delegations"); - return count; -} - -u64 -nfsd_inject_print_delegations(void) -{ - struct nfs4_client *clp; - u64 count = 0; - struct nfsd_net *nn = net_generic(current->nsproxy->net_ns, - nfsd_net_id); - - if (!nfsd_netns_ready(nn)) - return 0; - - spin_lock(&nn->client_lock); - list_for_each_entry(clp, &nn->client_lru, cl_lru) - count += nfsd_print_client_delegations(clp); - spin_unlock(&nn->client_lock); - - return count; -} - -static void -nfsd_forget_delegations(struct list_head *reaplist) -{ - struct nfs4_client *clp; - struct nfs4_delegation *dp, *next; - - list_for_each_entry_safe(dp, next, reaplist, dl_recall_lru) { - list_del_init(&dp->dl_recall_lru); - clp = dp->dl_stid.sc_client; - revoke_delegation(dp); - put_client(clp); - } -} - -u64 -nfsd_inject_forget_client_delegations(struct sockaddr_storage *addr, - size_t addr_size) -{ - u64 count = 0; - struct nfs4_client *clp; - struct nfsd_net *nn = net_generic(current->nsproxy->net_ns, - nfsd_net_id); - LIST_HEAD(reaplist); - - if (!nfsd_netns_ready(nn)) - return count; - - spin_lock(&nn->client_lock); - clp = nfsd_find_client(addr, addr_size); - if (clp) - count = nfsd_find_all_delegations(clp, 0, &reaplist); - spin_unlock(&nn->client_lock); - - nfsd_forget_delegations(&reaplist); - return count; -} - -u64 -nfsd_inject_forget_delegations(u64 max) -{ - u64 count = 0; - struct nfs4_client *clp; - struct nfsd_net *nn = net_generic(current->nsproxy->net_ns, - nfsd_net_id); - LIST_HEAD(reaplist); - - if (!nfsd_netns_ready(nn)) - return count; - - spin_lock(&nn->client_lock); - list_for_each_entry(clp, &nn->client_lru, cl_lru) { - count += nfsd_find_all_delegations(clp, max - count, &reaplist); - if (max != 0 && count >= max) - break; - } - spin_unlock(&nn->client_lock); - nfsd_forget_delegations(&reaplist); - return count; -} - -static void -nfsd_recall_delegations(struct list_head *reaplist) -{ - struct nfs4_client *clp; - struct nfs4_delegation *dp, *next; - - list_for_each_entry_safe(dp, next, reaplist, dl_recall_lru) { - list_del_init(&dp->dl_recall_lru); - clp = dp->dl_stid.sc_client; - /* - * We skipped all entries that had a zero dl_time before, - * so we can now reset the dl_time back to 0. If a delegation - * break comes in now, then it won't make any difference since - * we're recalling it either way. 
- */ - spin_lock(&state_lock); - dp->dl_time = 0; - spin_unlock(&state_lock); - nfsd_break_one_deleg(dp); - put_client(clp); - } -} - -u64 -nfsd_inject_recall_client_delegations(struct sockaddr_storage *addr, - size_t addr_size) -{ - u64 count = 0; - struct nfs4_client *clp; - struct nfsd_net *nn = net_generic(current->nsproxy->net_ns, - nfsd_net_id); - LIST_HEAD(reaplist); - - if (!nfsd_netns_ready(nn)) - return count; - - spin_lock(&nn->client_lock); - clp = nfsd_find_client(addr, addr_size); - if (clp) - count = nfsd_find_all_delegations(clp, 0, &reaplist); - spin_unlock(&nn->client_lock); - - nfsd_recall_delegations(&reaplist); - return count; -} - -u64 -nfsd_inject_recall_delegations(u64 max) -{ - u64 count = 0; - struct nfs4_client *clp, *next; - struct nfsd_net *nn = net_generic(current->nsproxy->net_ns, - nfsd_net_id); - LIST_HEAD(reaplist); - - if (!nfsd_netns_ready(nn)) - return count; - - spin_lock(&nn->client_lock); - list_for_each_entry_safe(clp, next, &nn->client_lru, cl_lru) { - count += nfsd_find_all_delegations(clp, max - count, &reaplist); - if (max != 0 && ++count >= max) - break; - } - spin_unlock(&nn->client_lock); - nfsd_recall_delegations(&reaplist); - return count; -} -#endif /* CONFIG_NFSD_FAULT_INJECTION */ - /* * Since the lifetime of a delegation isn't limited to that of an open, a * client may quite reasonably hang on to a delegation as long as it has @@ -7186,7 +8922,7 @@ static int nfs4_state_create_net(struct net *net) INIT_LIST_HEAD(&nn->sessionid_hashtbl[i]); nn->conf_name_tree = RB_ROOT; nn->unconf_name_tree = RB_ROOT; - nn->boot_time = get_seconds(); + nn->boot_time = ktime_get_real_seconds(); nn->grace_ended = false; nn->nfsd4_manager.block_opens = true; INIT_LIST_HEAD(&nn->nfsd4_manager.list); @@ -7196,15 +8932,30 @@ static int nfs4_state_create_net(struct net *net) spin_lock_init(&nn->client_lock); spin_lock_init(&nn->s2s_cp_lock); idr_init(&nn->s2s_cp_stateids); + atomic_set(&nn->pending_async_copies, 0); spin_lock_init(&nn->blocked_locks_lock); INIT_LIST_HEAD(&nn->blocked_locks_lru); INIT_DELAYED_WORK(&nn->laundromat_work, laundromat_main); + INIT_WORK(&nn->nfsd_shrinker_work, nfsd4_state_shrinker_worker); get_net(net); + nn->nfsd_client_shrinker = shrinker_alloc(0, "nfsd-client"); + if (!nn->nfsd_client_shrinker) + goto err_shrinker; + + nn->nfsd_client_shrinker->scan_objects = nfsd4_state_shrinker_scan; + nn->nfsd_client_shrinker->count_objects = nfsd4_state_shrinker_count; + nn->nfsd_client_shrinker->private_data = nn; + + shrinker_register(nn->nfsd_client_shrinker); + return 0; +err_shrinker: + put_net(net); + kfree(nn->sessionid_hashtbl); err_sessionid: kfree(nn->unconf_id_hashtbl); err_unconf_id: @@ -7253,35 +9004,43 @@ nfs4_state_start_net(struct net *net) return ret; locks_start_grace(net, &nn->nfsd4_manager); nfsd4_client_tracking_init(net); - printk(KERN_INFO "NFSD: starting %ld-second grace period (net %x)\n", + if (nn->track_reclaim_completes && nn->reclaim_str_hashtbl_size == 0) + goto skip_grace; + printk(KERN_INFO "NFSD: starting %lld-second grace period (net %x)\n", nn->nfsd4_grace, net->ns.inum); + trace_nfsd_grace_start(nn); queue_delayed_work(laundry_wq, &nn->laundromat_work, nn->nfsd4_grace * HZ); return 0; + +skip_grace: + printk(KERN_INFO "NFSD: no clients to reclaim, skipping NFSv4 grace period (net %x)\n", + net->ns.inum); + queue_delayed_work(laundry_wq, &nn->laundromat_work, nn->nfsd4_lease * HZ); + nfsd4_end_grace(nn); + return 0; } /* initialization to perform when the nfsd service is started: */ - int 
nfs4_state_start(void) { int ret; - laundry_wq = alloc_workqueue("%s", WQ_UNBOUND, 0, "nfsd4"); - if (laundry_wq == NULL) { - ret = -ENOMEM; - goto out; - } - ret = nfsd4_create_callback_queue(); + ret = rhltable_init(&nfs4_file_rhltable, &nfs4_file_rhash_params); if (ret) - goto out_free_laundry; + return ret; + + nfsd_slot_shrinker = shrinker_alloc(0, "nfsd-DRC-slot"); + if (!nfsd_slot_shrinker) { + rhltable_destroy(&nfs4_file_rhltable); + return -ENOMEM; + } + nfsd_slot_shrinker->count_objects = nfsd_slot_count; + nfsd_slot_shrinker->scan_objects = nfsd_slot_scan; + shrinker_register(nfsd_slot_shrinker); set_max_delegations(); return 0; - -out_free_laundry: - destroy_workqueue(laundry_wq); -out: - return ret; } void @@ -7291,6 +9050,8 @@ nfs4_state_shutdown_net(struct net *net) struct list_head *pos, *next, reaplist; struct nfsd_net *nn = net_generic(net, nfsd_net_id); + shrinker_free(nn->nfsd_client_shrinker); + cancel_work_sync(&nn->nfsd_shrinker_work); cancel_delayed_work_sync(&nn->laundromat_work); locks_end_grace(&nn->nfsd4_manager); @@ -7298,7 +9059,7 @@ nfs4_state_shutdown_net(struct net *net) spin_lock(&state_lock); list_for_each_safe(pos, next, &nn->del_recall_lru) { dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru); - WARN_ON(!unhash_delegation_locked(dp)); + unhash_delegation_locked(dp, SC_STATUS_CLOSED); list_add(&dp->dl_recall_lru, &reaplist); } spin_unlock(&state_lock); @@ -7310,19 +9071,23 @@ nfs4_state_shutdown_net(struct net *net) nfsd4_client_tracking_exit(net); nfs4_state_destroy_net(net); +#ifdef CONFIG_NFSD_V4_2_INTER_SSC + nfsd4_ssc_shutdown_umount(nn); +#endif } void nfs4_state_shutdown(void) { - destroy_workqueue(laundry_wq); - nfsd4_destroy_callback_queue(); + rhltable_destroy(&nfs4_file_rhltable); + shrinker_free(nfsd_slot_shrinker); } static void get_stateid(struct nfsd4_compound_state *cstate, stateid_t *stateid) { - if (HAS_STATE_ID(cstate, CURRENT_STATE_ID_FLAG) && CURRENT_STATEID(stateid)) + if (HAS_CSTATE_FLAG(cstate, CURRENT_STATE_ID_FLAG) && + CURRENT_STATEID(stateid)) memcpy(stateid, &cstate->current_stateid, sizeof(stateid_t)); } @@ -7331,14 +9096,14 @@ put_stateid(struct nfsd4_compound_state *cstate, stateid_t *stateid) { if (cstate->minorversion) { memcpy(&cstate->current_stateid, stateid, sizeof(stateid_t)); - SET_STATE_ID(cstate, CURRENT_STATE_ID_FLAG); + SET_CSTATE_FLAG(cstate, CURRENT_STATE_ID_FLAG); } } void clear_current_stateid(struct nfsd4_compound_state *cstate) { - CLEAR_STATE_ID(cstate, CURRENT_STATE_ID_FLAG); + CLEAR_CSTATE_FLAG(cstate, CURRENT_STATE_ID_FLAG); } /* @@ -7431,3 +9196,274 @@ nfsd4_get_writestateid(struct nfsd4_compound_state *cstate, { get_stateid(cstate, &u->write.wr_stateid); } + +/** + * nfsd4_vet_deleg_time - vet and set the timespec for a delegated timestamp update + * @req: timestamp from the client + * @orig: original timestamp in the inode + * @now: current time + * + * Given a timestamp from the client response, check it against the + * current timestamp in the inode and the current time. Returns true + * if the inode's timestamp needs to be updated, and false otherwise. + * @req may also be changed if the timestamp needs to be clamped. + */ +bool nfsd4_vet_deleg_time(struct timespec64 *req, const struct timespec64 *orig, + const struct timespec64 *now) +{ + + /* + * "When the time presented is before the original time, then the + * update is ignored." Also no need to update if there is no change. 
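+	 * For example, if the inode's timestamp is 100s, a presented time of 90s (or of exactly 100s) leaves the inode untouched.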
+ */ + if (timespec64_compare(req, orig) <= 0) + return false; + + /* + * "When the time presented is in the future, the server can either + * clamp the new time to the current time, or it may + * return NFS4ERR_DELAY to the client, allowing it to retry." + */ + if (timespec64_compare(req, now) > 0) + *req = *now; + + return true; +} + +static int cb_getattr_update_times(struct dentry *dentry, struct nfs4_delegation *dp) +{ + struct inode *inode = d_inode(dentry); + struct nfs4_cb_fattr *ncf = &dp->dl_cb_fattr; + struct iattr attrs = { }; + int ret; + + if (deleg_attrs_deleg(dp->dl_type)) { + struct timespec64 now = current_time(inode); + + attrs.ia_atime = ncf->ncf_cb_atime; + attrs.ia_mtime = ncf->ncf_cb_mtime; + + if (nfsd4_vet_deleg_time(&attrs.ia_atime, &dp->dl_atime, &now)) + attrs.ia_valid |= ATTR_ATIME | ATTR_ATIME_SET; + + if (nfsd4_vet_deleg_time(&attrs.ia_mtime, &dp->dl_mtime, &now)) { + attrs.ia_valid |= ATTR_MTIME | ATTR_MTIME_SET; + attrs.ia_ctime = attrs.ia_mtime; + if (nfsd4_vet_deleg_time(&attrs.ia_ctime, &dp->dl_ctime, &now)) + attrs.ia_valid |= ATTR_CTIME | ATTR_CTIME_SET; + } + } else { + attrs.ia_valid |= ATTR_MTIME | ATTR_CTIME; + } + + if (!attrs.ia_valid) + return 0; + + attrs.ia_valid |= ATTR_DELEG; + inode_lock(inode); + ret = notify_change(&nop_mnt_idmap, dentry, &attrs, NULL); + inode_unlock(inode); + return ret; +} + +/** + * nfsd4_deleg_getattr_conflict - Recall if GETATTR causes conflict + * @rqstp: RPC transaction context + * @dentry: dentry of inode to be checked for a conflict + * @pdp: returned WRITE delegation, if one was found + * + * This function is called when there is a conflict between a write + * delegation and a change/size GETATTR from another client. The server + * must either use the CB_GETATTR to get the current values of the + * attributes from the client that holds the delegation or recall the + * delegation before replying to the GETATTR. See RFC 8881 section + * 18.7.4. + * + * Returns 0 if there is no conflict; otherwise an nfs_stat + * code is returned. If @pdp is set to a non-NULL value, then the + * caller must put the reference. 
+ */ +__be32 +nfsd4_deleg_getattr_conflict(struct svc_rqst *rqstp, struct dentry *dentry, + struct nfs4_delegation **pdp) +{ + __be32 status; + struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); + struct file_lock_context *ctx; + struct nfs4_delegation *dp = NULL; + struct file_lease *fl; + struct nfs4_cb_fattr *ncf; + struct inode *inode = d_inode(dentry); + + ctx = locks_inode_context(inode); + if (!ctx) + return nfs_ok; + +#define NON_NFSD_LEASE ((void *)1) + + spin_lock(&ctx->flc_lock); + for_each_file_lock(fl, &ctx->flc_lease) { + if (fl->c.flc_flags == FL_LAYOUT) + continue; + if (fl->c.flc_type == F_WRLCK) { + if (fl->fl_lmops == &nfsd_lease_mng_ops) + dp = fl->c.flc_owner; + else + dp = NON_NFSD_LEASE; + } + break; + } + if (dp == NULL || dp == NON_NFSD_LEASE || + dp->dl_recall.cb_clp == *(rqstp->rq_lease_breaker)) { + spin_unlock(&ctx->flc_lock); + if (dp == NON_NFSD_LEASE) { + status = nfserrno(nfsd_open_break_lease(inode, + NFSD_MAY_READ)); + if (status != nfserr_jukebox || + !nfsd_wait_for_delegreturn(rqstp, inode)) + return status; + } + return 0; + } + + nfsd_stats_wdeleg_getattr_inc(nn); + refcount_inc(&dp->dl_stid.sc_count); + ncf = &dp->dl_cb_fattr; + nfs4_cb_getattr(&dp->dl_cb_fattr); + spin_unlock(&ctx->flc_lock); + + wait_on_bit_timeout(&ncf->ncf_getattr.cb_flags, NFSD4_CALLBACK_RUNNING, + TASK_UNINTERRUPTIBLE, NFSD_CB_GETATTR_TIMEOUT); + if (ncf->ncf_cb_status) { + /* Recall delegation only if client didn't respond */ + status = nfserrno(nfsd_open_break_lease(inode, NFSD_MAY_READ)); + if (status != nfserr_jukebox || + !nfsd_wait_for_delegreturn(rqstp, inode)) + goto out_status; + } + if (!ncf->ncf_file_modified && + (ncf->ncf_initial_cinfo != ncf->ncf_cb_change || + ncf->ncf_cur_fsize != ncf->ncf_cb_fsize)) + ncf->ncf_file_modified = true; + if (ncf->ncf_file_modified) { + int err; + + /* + * Per section 10.4.3 of RFC 8881, the server would + * not update the file's metadata with the client's + * modified size + */ + err = cb_getattr_update_times(dentry, dp); + if (err) { + status = nfserrno(err); + goto out_status; + } + ncf->ncf_cur_fsize = ncf->ncf_cb_fsize; + *pdp = dp; + return nfs_ok; + } + status = nfs_ok; +out_status: + nfs4_put_stid(&dp->dl_stid); + return status; +} + +/** + * nfsd_get_dir_deleg - attempt to get a directory delegation + * @cstate: compound state + * @gdd: GET_DIR_DELEGATION arg/resp structure + * @nf: nfsd_file opened on the directory + * + * Given a GET_DIR_DELEGATION request @gdd, attempt to acquire a delegation + * on the directory to which @nf refers. Note that this does not set up any + * sort of async notifications for the delegation. + */ +struct nfs4_delegation * +nfsd_get_dir_deleg(struct nfsd4_compound_state *cstate, + struct nfsd4_get_dir_delegation *gdd, + struct nfsd_file *nf) +{ + struct nfs4_client *clp = cstate->clp; + struct nfs4_delegation *dp; + struct file_lease *fl; + struct nfs4_file *fp, *rfp; + int status = 0; + + fp = nfsd4_alloc_file(); + if (!fp) + return ERR_PTR(-ENOMEM); + + nfsd4_file_init(&cstate->current_fh, fp); + + rfp = nfsd4_file_hash_insert(fp, &cstate->current_fh); + if (unlikely(!rfp)) { + put_nfs4_file(fp); + return ERR_PTR(-ENOMEM); + } + + if (rfp != fp) { + put_nfs4_file(fp); + fp = rfp; + } + + /* if this client already has one, return that it's unavailable */ + spin_lock(&state_lock); + spin_lock(&fp->fi_lock); + /* existing delegation? 
*/ + if (nfs4_delegation_exists(clp, fp)) { + status = -EAGAIN; + } else if (!fp->fi_deleg_file) { + fp->fi_deleg_file = nfsd_file_get(nf); + fp->fi_delegees = 1; + } else { + ++fp->fi_delegees; + } + spin_unlock(&fp->fi_lock); + spin_unlock(&state_lock); + + if (status) { + put_nfs4_file(fp); + return ERR_PTR(status); + } + + /* Try to set up the lease */ + status = -ENOMEM; + dp = alloc_init_deleg(clp, fp, NULL, NFS4_OPEN_DELEGATE_READ); + if (!dp) + goto out_delegees; + + fl = nfs4_alloc_init_lease(dp); + if (!fl) + goto out_put_stid; + + status = kernel_setlease(nf->nf_file, + fl->c.flc_type, &fl, NULL); + if (fl) + locks_free_lease(fl); + if (status) + goto out_put_stid; + + /* + * Now, try to hash it. This can fail if we race another nfsd task + * trying to set a delegation on the same file. If that happens, + * then just say UNAVAIL. + */ + spin_lock(&state_lock); + spin_lock(&clp->cl_lock); + spin_lock(&fp->fi_lock); + status = hash_delegation_locked(dp, fp); + spin_unlock(&fp->fi_lock); + spin_unlock(&clp->cl_lock); + spin_unlock(&state_lock); + + if (!status) + return dp; + + /* Something failed. Drop the lease and clean up the stid */ + kernel_setlease(fp->fi_deleg_file->nf_file, F_UNLCK, NULL, (void **)&dp); +out_put_stid: + nfs4_put_stid(&dp->dl_stid); +out_delegees: + put_deleg_file(fp); + return ERR_PTR(status); +} diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 3de42a729093..30ce5851fe4c 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -40,6 +40,11 @@ #include <linux/utsname.h> #include <linux/pagemap.h> #include <linux/sunrpc/svcauth_gss.h> +#include <linux/sunrpc/addr.h> +#include <linux/xattr.h> +#include <linux/vmalloc.h> + +#include <uapi/linux/xattr.h> #include "idmap.h" #include "acl.h" @@ -49,6 +54,10 @@ #include "cache.h" #include "netns.h" #include "pnfs.h" +#include "filecache.h" +#include "nfs4xdr_gen.h" + +#include "trace.h" #ifdef CONFIG_NFSD_V4_SECURITY_LABEL #include <linux/security.h> @@ -86,6 +95,8 @@ check_filename(char *str, int len) if (len == 0) return nfserr_inval; + if (len > NFS4_MAXNAMLEN) + return nfserr_nametoolong; if (isdotent(str, len)) return nfserr_badname; for (i = 0; i < len; i++) @@ -94,115 +105,6 @@ check_filename(char *str, int len) return 0; } -#define DECODE_HEAD \ - __be32 *p; \ - __be32 status -#define DECODE_TAIL \ - status = 0; \ -out: \ - return status; \ -xdr_error: \ - dprintk("NFSD: xdr error (%s:%d)\n", \ - __FILE__, __LINE__); \ - status = nfserr_bad_xdr; \ - goto out - -#define READMEM(x,nbytes) do { \ - x = (char *)p; \ - p += XDR_QUADLEN(nbytes); \ -} while (0) -#define SAVEMEM(x,nbytes) do { \ - if (!(x = (p==argp->tmp || p == argp->tmpp) ? 
\ - savemem(argp, p, nbytes) : \ - (char *)p)) { \ - dprintk("NFSD: xdr error (%s:%d)\n", \ - __FILE__, __LINE__); \ - goto xdr_error; \ - } \ - p += XDR_QUADLEN(nbytes); \ -} while (0) -#define COPYMEM(x,nbytes) do { \ - memcpy((x), p, nbytes); \ - p += XDR_QUADLEN(nbytes); \ -} while (0) - -/* READ_BUF, read_buf(): nbytes must be <= PAGE_SIZE */ -#define READ_BUF(nbytes) do { \ - if (nbytes <= (u32)((char *)argp->end - (char *)argp->p)) { \ - p = argp->p; \ - argp->p += XDR_QUADLEN(nbytes); \ - } else if (!(p = read_buf(argp, nbytes))) { \ - dprintk("NFSD: xdr error (%s:%d)\n", \ - __FILE__, __LINE__); \ - goto xdr_error; \ - } \ -} while (0) - -static void next_decode_page(struct nfsd4_compoundargs *argp) -{ - argp->p = page_address(argp->pagelist[0]); - argp->pagelist++; - if (argp->pagelen < PAGE_SIZE) { - argp->end = argp->p + XDR_QUADLEN(argp->pagelen); - argp->pagelen = 0; - } else { - argp->end = argp->p + (PAGE_SIZE>>2); - argp->pagelen -= PAGE_SIZE; - } -} - -static __be32 *read_buf(struct nfsd4_compoundargs *argp, u32 nbytes) -{ - /* We want more bytes than seem to be available. - * Maybe we need a new page, maybe we have just run out - */ - unsigned int avail = (char *)argp->end - (char *)argp->p; - __be32 *p; - - if (argp->pagelen == 0) { - struct kvec *vec = &argp->rqstp->rq_arg.tail[0]; - - if (!argp->tail) { - argp->tail = true; - avail = vec->iov_len; - argp->p = vec->iov_base; - argp->end = vec->iov_base + avail; - } - - if (avail < nbytes) - return NULL; - - p = argp->p; - argp->p += XDR_QUADLEN(nbytes); - return p; - } - - if (avail + argp->pagelen < nbytes) - return NULL; - if (avail + PAGE_SIZE < nbytes) /* need more than a page !! */ - return NULL; - /* ok, we can do it with the current plus the next page */ - if (nbytes <= sizeof(argp->tmp)) - p = argp->tmp; - else { - kfree(argp->tmpp); - p = argp->tmpp = kmalloc(nbytes, GFP_KERNEL); - if (!p) - return NULL; - - } - /* - * The following memcpy is safe because read_buf is always - * called with nbytes > avail, and the two cases above both - * guarantee p points to at least nbytes bytes. - */ - memcpy(p, argp->p, avail); - next_decode_page(argp); - memcpy(((char*)p)+avail, argp->p, (nbytes - avail)); - argp->p += XDR_QUADLEN(nbytes - avail); - return p; -} - static int zero_clientid(clientid_t *clid) { return (clid->cl_boot == 0) && (clid->cl_id == 0); @@ -211,17 +113,17 @@ static int zero_clientid(clientid_t *clid) /** * svcxdr_tmpalloc - allocate memory to be freed after compound processing * @argp: NFSv4 compound argument structure - * @p: pointer to be freed (with kfree()) + * @len: length of buffer to allocate * - * Marks @p to be freed when processing the compound operation - * described in @argp finishes. + * Allocates a buffer of size @len to be freed when processing the compound + * operation described in @argp finishes. */ static void * -svcxdr_tmpalloc(struct nfsd4_compoundargs *argp, u32 len) +svcxdr_tmpalloc(struct nfsd4_compoundargs *argp, size_t len) { struct svcxdr_tmpbuf *tb; - tb = kmalloc(sizeof(*tb) + len, GFP_KERNEL); + tb = kmalloc(struct_size(tb, buf, len), GFP_KERNEL); if (!tb) return NULL; tb->next = argp->to_free; @@ -237,9 +139,9 @@ svcxdr_tmpalloc(struct nfsd4_compoundargs *argp, u32 len) * buffer might end on a page boundary. 
*/ static char * -svcxdr_dupstr(struct nfsd4_compoundargs *argp, void *buf, u32 len) +svcxdr_dupstr(struct nfsd4_compoundargs *argp, void *buf, size_t len) { - char *p = svcxdr_tmpalloc(argp, len + 1); + char *p = svcxdr_tmpalloc(argp, size_add(len, 1)); if (!p) return NULL; @@ -248,87 +150,246 @@ svcxdr_dupstr(struct nfsd4_compoundargs *argp, void *buf, u32 len) return p; } -/** - * savemem - duplicate a chunk of memory for later processing - * @argp: NFSv4 compound argument structure to be freed with - * @p: pointer to be duplicated - * @nbytes: length to be duplicated - * - * Returns a pointer to a copy of @nbytes bytes of memory at @p - * that are preserved until processing of the NFSv4 compound - * operation described by @argp finishes. - */ -static char *savemem(struct nfsd4_compoundargs *argp, __be32 *p, int nbytes) +static void * +svcxdr_savemem(struct nfsd4_compoundargs *argp, __be32 *p, size_t len) { - void *ret; + __be32 *tmp; - ret = svcxdr_tmpalloc(argp, nbytes); - if (!ret) + /* + * The location of the decoded data item is stable, + * so @p is OK to use. This is the common case. + */ + if (p != argp->xdr->scratch.iov_base) + return p; + + tmp = svcxdr_tmpalloc(argp, len); + if (!tmp) return NULL; - memcpy(ret, p, nbytes); - return ret; + memcpy(tmp, p, len); + return tmp; } /* - * We require the high 32 bits of 'seconds' to be 0, and - * we ignore all 32 bits of 'nseconds'. + * NFSv4 basic data type decoders + */ + +/* + * This helper handles variable-length opaques which belong to protocol + * elements that this implementation does not support. */ static __be32 -nfsd4_decode_time(struct nfsd4_compoundargs *argp, struct timespec *tv) +nfsd4_decode_ignored_string(struct nfsd4_compoundargs *argp, u32 maxlen) { - DECODE_HEAD; - u64 sec; + u32 len; + + if (xdr_stream_decode_u32(argp->xdr, &len) < 0) + return nfserr_bad_xdr; + if (maxlen && len > maxlen) + return nfserr_bad_xdr; + if (!xdr_inline_decode(argp->xdr, len)) + return nfserr_bad_xdr; + + return nfs_ok; +} + +static __be32 +nfsd4_decode_opaque(struct nfsd4_compoundargs *argp, struct xdr_netobj *o) +{ + __be32 *p; + u32 len; + + if (xdr_stream_decode_u32(argp->xdr, &len) < 0) + return nfserr_bad_xdr; + if (len == 0 || len > NFS4_OPAQUE_LIMIT) + return nfserr_bad_xdr; + p = xdr_inline_decode(argp->xdr, len); + if (!p) + return nfserr_bad_xdr; + o->data = svcxdr_savemem(argp, p, len); + if (!o->data) + return nfserr_jukebox; + o->len = len; + + return nfs_ok; +} + +static __be32 +nfsd4_decode_component4(struct nfsd4_compoundargs *argp, char **namp, u32 *lenp) +{ + __be32 *p, status; + + if (xdr_stream_decode_u32(argp->xdr, lenp) < 0) + return nfserr_bad_xdr; + p = xdr_inline_decode(argp->xdr, *lenp); + if (!p) + return nfserr_bad_xdr; + status = check_filename((char *)p, *lenp); + if (status) + return status; + *namp = svcxdr_savemem(argp, p, *lenp); + if (!*namp) + return nfserr_jukebox; + + return nfs_ok; +} + +static __be32 +nfsd4_decode_nfstime4(struct nfsd4_compoundargs *argp, struct timespec64 *tv) +{ + __be32 *p; - READ_BUF(12); - p = xdr_decode_hyper(p, &sec); - tv->tv_sec = sec; + p = xdr_inline_decode(argp->xdr, XDR_UNIT * 3); + if (!p) + return nfserr_bad_xdr; + p = xdr_decode_hyper(p, &tv->tv_sec); tv->tv_nsec = be32_to_cpup(p++); if (tv->tv_nsec >= (u32)1000000000) return nfserr_inval; + return nfs_ok; +} + +static __be32 +nfsd4_decode_verifier4(struct nfsd4_compoundargs *argp, nfs4_verifier *verf) +{ + __be32 *p; - DECODE_TAIL; + p = xdr_inline_decode(argp->xdr, NFS4_VERIFIER_SIZE); + if (!p) + return 
nfserr_bad_xdr; + memcpy(verf->data, p, sizeof(verf->data)); + return nfs_ok; } +/** + * nfsd4_decode_bitmap4 - Decode an NFSv4 bitmap4 + * @argp: NFSv4 compound argument structure + * @bmval: pointer to an array of u32's to decode into + * @bmlen: size of the @bmval array + * + * The server needs to return nfs_ok rather than nfserr_bad_xdr when + * encountering bitmaps containing bits it does not recognize. This + * includes bits in bitmap words past WORDn, where WORDn is the last + * bitmap WORD the implementation currently supports. Thus we are + * careful here to simply ignore bits in bitmap words that this + * implementation has yet to support explicitly. + * + * Return values: + * %nfs_ok: @bmval populated successfully + * %nfserr_bad_xdr: the encoded bitmap was invalid + */ static __be32 -nfsd4_decode_bitmap(struct nfsd4_compoundargs *argp, u32 *bmval) +nfsd4_decode_bitmap4(struct nfsd4_compoundargs *argp, u32 *bmval, u32 bmlen) { - u32 bmlen; - DECODE_HEAD; + ssize_t status; - bmval[0] = 0; - bmval[1] = 0; - bmval[2] = 0; + status = xdr_stream_decode_uint32_array(argp->xdr, bmval, bmlen); + return status == -EBADMSG ? nfserr_bad_xdr : nfs_ok; +} - READ_BUF(4); - bmlen = be32_to_cpup(p++); - if (bmlen > 1000) - goto xdr_error; +static __be32 +nfsd4_decode_nfsace4(struct nfsd4_compoundargs *argp, struct nfs4_ace *ace) +{ + __be32 *p, status; + u32 length; + + if (xdr_stream_decode_u32(argp->xdr, &ace->type) < 0) + return nfserr_bad_xdr; + if (xdr_stream_decode_u32(argp->xdr, &ace->flag) < 0) + return nfserr_bad_xdr; + if (xdr_stream_decode_u32(argp->xdr, &ace->access_mask) < 0) + return nfserr_bad_xdr; - READ_BUF(bmlen << 2); - if (bmlen > 0) - bmval[0] = be32_to_cpup(p++); - if (bmlen > 1) - bmval[1] = be32_to_cpup(p++); - if (bmlen > 2) - bmval[2] = be32_to_cpup(p++); + if (xdr_stream_decode_u32(argp->xdr, &length) < 0) + return nfserr_bad_xdr; + p = xdr_inline_decode(argp->xdr, length); + if (!p) + return nfserr_bad_xdr; + ace->whotype = nfs4_acl_get_whotype((char *)p, length); + if (ace->whotype != NFS4_ACL_WHO_NAMED) + status = nfs_ok; + else if (ace->flag & NFS4_ACE_IDENTIFIER_GROUP) + status = nfsd_map_name_to_gid(argp->rqstp, + (char *)p, length, &ace->who_gid); + else + status = nfsd_map_name_to_uid(argp->rqstp, + (char *)p, length, &ace->who_uid); - DECODE_TAIL; + return status; +} + +/* A counted array of nfsace4's */ +static noinline __be32 +nfsd4_decode_acl(struct nfsd4_compoundargs *argp, struct nfs4_acl **acl) +{ + struct nfs4_ace *ace; + __be32 status; + u32 count; + + if (xdr_stream_decode_u32(argp->xdr, &count) < 0) + return nfserr_bad_xdr; + + if (count > xdr_stream_remaining(argp->xdr) / 20) + /* + * Even with 4-byte names there wouldn't be + * space for that many aces; something fishy is + * going on: + */ + return nfserr_fbig; + + *acl = svcxdr_tmpalloc(argp, nfs4_acl_bytes(count)); + if (*acl == NULL) + return nfserr_jukebox; + + (*acl)->naces = count; + for (ace = (*acl)->aces; ace < (*acl)->aces + count; ace++) { + status = nfsd4_decode_nfsace4(argp, ace); + if (status) + return status; + } + + return nfs_ok; +} + +static noinline __be32 +nfsd4_decode_security_label(struct nfsd4_compoundargs *argp, + struct xdr_netobj *label) +{ + u32 lfs, pi, length; + __be32 *p; + + if (xdr_stream_decode_u32(argp->xdr, &lfs) < 0) + return nfserr_bad_xdr; + if (xdr_stream_decode_u32(argp->xdr, &pi) < 0) + return nfserr_bad_xdr; + + if (xdr_stream_decode_u32(argp->xdr, &length) < 0) + return nfserr_bad_xdr; + if (length > NFS4_MAXLABELLEN) + return nfserr_badlabel; + p = 
xdr_inline_decode(argp->xdr, length); + if (!p) + return nfserr_bad_xdr; + label->len = length; + label->data = svcxdr_dupstr(argp, p, length); + if (!label->data) + return nfserr_jukebox; + + return nfs_ok; } static __be32 -nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, - struct iattr *iattr, struct nfs4_acl **acl, - struct xdr_netobj *label, int *umask) +nfsd4_decode_fattr4(struct nfsd4_compoundargs *argp, u32 *bmval, u32 bmlen, + struct iattr *iattr, struct nfs4_acl **acl, + struct xdr_netobj *label, int *umask) { - struct timespec ts; - int expected_len, len = 0; - u32 dummy32; - char *buf; + unsigned int starting_pos; + u32 attrlist4_count; + __be32 *p, status; - DECODE_HEAD; iattr->ia_valid = 0; - if ((status = nfsd4_decode_bitmap(argp, bmval))) - return status; + status = nfsd4_decode_bitmap4(argp, bmval, bmlen); + if (status) + return nfserr_bad_xdr; if (bmval[0] & ~NFSD_WRITEABLE_ATTRS_WORD0 || bmval[1] & ~NFSD_WRITEABLE_ATTRS_WORD1 @@ -338,92 +399,69 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, return nfserr_attrnotsupp; } - READ_BUF(4); - expected_len = be32_to_cpup(p++); + if (xdr_stream_decode_u32(argp->xdr, &attrlist4_count) < 0) + return nfserr_bad_xdr; + starting_pos = xdr_stream_pos(argp->xdr); if (bmval[0] & FATTR4_WORD0_SIZE) { - READ_BUF(8); - len += 8; - p = xdr_decode_hyper(p, &iattr->ia_size); + u64 size; + + if (xdr_stream_decode_u64(argp->xdr, &size) < 0) + return nfserr_bad_xdr; + iattr->ia_size = size; iattr->ia_valid |= ATTR_SIZE; } if (bmval[0] & FATTR4_WORD0_ACL) { - u32 nace; - struct nfs4_ace *ace; - - READ_BUF(4); len += 4; - nace = be32_to_cpup(p++); - - if (nace > NFS4_ACL_MAX) - return nfserr_fbig; - - *acl = svcxdr_tmpalloc(argp, nfs4_acl_bytes(nace)); - if (*acl == NULL) - return nfserr_jukebox; - - (*acl)->naces = nace; - for (ace = (*acl)->aces; ace < (*acl)->aces + nace; ace++) { - READ_BUF(16); len += 16; - ace->type = be32_to_cpup(p++); - ace->flag = be32_to_cpup(p++); - ace->access_mask = be32_to_cpup(p++); - dummy32 = be32_to_cpup(p++); - READ_BUF(dummy32); - len += XDR_QUADLEN(dummy32) << 2; - READMEM(buf, dummy32); - ace->whotype = nfs4_acl_get_whotype(buf, dummy32); - status = nfs_ok; - if (ace->whotype != NFS4_ACL_WHO_NAMED) - ; - else if (ace->flag & NFS4_ACE_IDENTIFIER_GROUP) - status = nfsd_map_name_to_gid(argp->rqstp, - buf, dummy32, &ace->who_gid); - else - status = nfsd_map_name_to_uid(argp->rqstp, - buf, dummy32, &ace->who_uid); - if (status) - return status; - } + status = nfsd4_decode_acl(argp, acl); + if (status) + return status; } else *acl = NULL; if (bmval[1] & FATTR4_WORD1_MODE) { - READ_BUF(4); - len += 4; - iattr->ia_mode = be32_to_cpup(p++); + u32 mode; + + if (xdr_stream_decode_u32(argp->xdr, &mode) < 0) + return nfserr_bad_xdr; + iattr->ia_mode = mode; iattr->ia_mode &= (S_IFMT | S_IALLUGO); iattr->ia_valid |= ATTR_MODE; } if (bmval[1] & FATTR4_WORD1_OWNER) { - READ_BUF(4); - len += 4; - dummy32 = be32_to_cpup(p++); - READ_BUF(dummy32); - len += (XDR_QUADLEN(dummy32) << 2); - READMEM(buf, dummy32); - if ((status = nfsd_map_name_to_uid(argp->rqstp, buf, dummy32, &iattr->ia_uid))) + u32 length; + + if (xdr_stream_decode_u32(argp->xdr, &length) < 0) + return nfserr_bad_xdr; + p = xdr_inline_decode(argp->xdr, length); + if (!p) + return nfserr_bad_xdr; + status = nfsd_map_name_to_uid(argp->rqstp, (char *)p, length, + &iattr->ia_uid); + if (status) return status; iattr->ia_valid |= ATTR_UID; } if (bmval[1] & FATTR4_WORD1_OWNER_GROUP) { - READ_BUF(4); - len += 4; - dummy32 = 
be32_to_cpup(p++); - READ_BUF(dummy32); - len += (XDR_QUADLEN(dummy32) << 2); - READMEM(buf, dummy32); - if ((status = nfsd_map_name_to_gid(argp->rqstp, buf, dummy32, &iattr->ia_gid))) + u32 length; + + if (xdr_stream_decode_u32(argp->xdr, &length) < 0) + return nfserr_bad_xdr; + p = xdr_inline_decode(argp->xdr, length); + if (!p) + return nfserr_bad_xdr; + status = nfsd_map_name_to_gid(argp->rqstp, (char *)p, length, + &iattr->ia_gid); + if (status) return status; iattr->ia_valid |= ATTR_GID; } if (bmval[1] & FATTR4_WORD1_TIME_ACCESS_SET) { - READ_BUF(4); - len += 4; - dummy32 = be32_to_cpup(p++); - switch (dummy32) { + u32 set_it; + + if (xdr_stream_decode_u32(argp->xdr, &set_it) < 0) + return nfserr_bad_xdr; + switch (set_it) { case NFS4_SET_TO_CLIENT_TIME: - len += 12; - status = nfsd4_decode_time(argp, &ts); - iattr->ia_atime = timespec_to_timespec64(ts); + status = nfsd4_decode_nfstime4(argp, &iattr->ia_atime); if (status) return status; iattr->ia_valid |= (ATTR_ATIME | ATTR_ATIME_SET); @@ -432,18 +470,26 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, iattr->ia_valid |= ATTR_ATIME; break; default: - goto xdr_error; + return nfserr_bad_xdr; } } + if (bmval[1] & FATTR4_WORD1_TIME_CREATE) { + struct timespec64 ts; + + /* No Linux filesystem supports setting this attribute. */ + bmval[1] &= ~FATTR4_WORD1_TIME_CREATE; + status = nfsd4_decode_nfstime4(argp, &ts); + if (status) + return status; + } if (bmval[1] & FATTR4_WORD1_TIME_MODIFY_SET) { - READ_BUF(4); - len += 4; - dummy32 = be32_to_cpup(p++); - switch (dummy32) { + u32 set_it; + + if (xdr_stream_decode_u32(argp->xdr, &set_it) < 0) + return nfserr_bad_xdr; + switch (set_it) { case NFS4_SET_TO_CLIENT_TIME: - len += 12; - status = nfsd4_decode_time(argp, &ts); - iattr->ia_mtime = timespec_to_timespec64(ts); + status = nfsd4_decode_nfstime4(argp, &iattr->ia_mtime); if (status) return status; iattr->ia_valid |= (ATTR_MTIME | ATTR_MTIME_SET); @@ -452,221 +498,343 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, iattr->ia_valid |= ATTR_MTIME; break; default: - goto xdr_error; + return nfserr_bad_xdr; } } - label->len = 0; if (IS_ENABLED(CONFIG_NFSD_V4_SECURITY_LABEL) && bmval[2] & FATTR4_WORD2_SECURITY_LABEL) { - READ_BUF(4); - len += 4; - dummy32 = be32_to_cpup(p++); /* lfs: we don't use it */ - READ_BUF(4); - len += 4; - dummy32 = be32_to_cpup(p++); /* pi: we don't use it either */ - READ_BUF(4); - len += 4; - dummy32 = be32_to_cpup(p++); - READ_BUF(dummy32); - if (dummy32 > NFS4_MAXLABELLEN) - return nfserr_badlabel; - len += (XDR_QUADLEN(dummy32) << 2); - READMEM(buf, dummy32); - label->len = dummy32; - label->data = svcxdr_dupstr(argp, buf, dummy32); - if (!label->data) - return nfserr_jukebox; + status = nfsd4_decode_security_label(argp, label); + if (status) + return status; } if (bmval[2] & FATTR4_WORD2_MODE_UMASK) { + u32 mode, mask; + if (!umask) - goto xdr_error; - READ_BUF(8); - len += 8; - dummy32 = be32_to_cpup(p++); - iattr->ia_mode = dummy32 & (S_IFMT | S_IALLUGO); - dummy32 = be32_to_cpup(p++); - *umask = dummy32 & S_IRWXUGO; + return nfserr_bad_xdr; + if (xdr_stream_decode_u32(argp->xdr, &mode) < 0) + return nfserr_bad_xdr; + iattr->ia_mode = mode & (S_IFMT | S_IALLUGO); + if (xdr_stream_decode_u32(argp->xdr, &mask) < 0) + return nfserr_bad_xdr; + *umask = mask & S_IRWXUGO; iattr->ia_valid |= ATTR_MODE; } - if (len != expected_len) - goto xdr_error; + if (bmval[2] & FATTR4_WORD2_TIME_DELEG_ACCESS) { + fattr4_time_deleg_access access; - DECODE_TAIL; + if 
(!xdrgen_decode_fattr4_time_deleg_access(argp->xdr, &access)) + return nfserr_bad_xdr; + iattr->ia_atime.tv_sec = access.seconds; + iattr->ia_atime.tv_nsec = access.nseconds; + iattr->ia_valid |= ATTR_ATIME | ATTR_ATIME_SET | ATTR_DELEG; + } + if (bmval[2] & FATTR4_WORD2_TIME_DELEG_MODIFY) { + fattr4_time_deleg_modify modify; + + if (!xdrgen_decode_fattr4_time_deleg_modify(argp->xdr, &modify)) + return nfserr_bad_xdr; + iattr->ia_mtime.tv_sec = modify.seconds; + iattr->ia_mtime.tv_nsec = modify.nseconds; + iattr->ia_ctime.tv_sec = modify.seconds; + iattr->ia_ctime.tv_nsec = modify.nseconds; + iattr->ia_valid |= ATTR_CTIME | ATTR_CTIME_SET | + ATTR_MTIME | ATTR_MTIME_SET | ATTR_DELEG; + } + + /* request sanity: did attrlist4 contain the expected number of words? */ + if (attrlist4_count != xdr_stream_pos(argp->xdr) - starting_pos) + return nfserr_bad_xdr; + + return nfs_ok; } static __be32 -nfsd4_decode_stateid(struct nfsd4_compoundargs *argp, stateid_t *sid) +nfsd4_decode_stateid4(struct nfsd4_compoundargs *argp, stateid_t *sid) { - DECODE_HEAD; + __be32 *p; - READ_BUF(sizeof(stateid_t)); + p = xdr_inline_decode(argp->xdr, NFS4_STATEID_SIZE); + if (!p) + return nfserr_bad_xdr; sid->si_generation = be32_to_cpup(p++); - COPYMEM(&sid->si_opaque, sizeof(stateid_opaque_t)); + memcpy(&sid->si_opaque, p, sizeof(sid->si_opaque)); + return nfs_ok; +} + +static __be32 +nfsd4_decode_clientid4(struct nfsd4_compoundargs *argp, clientid_t *clientid) +{ + __be32 *p; - DECODE_TAIL; + p = xdr_inline_decode(argp->xdr, sizeof(__be64)); + if (!p) + return nfserr_bad_xdr; + memcpy(clientid, p, sizeof(*clientid)); + return nfs_ok; +} + +static __be32 +nfsd4_decode_state_owner4(struct nfsd4_compoundargs *argp, + clientid_t *clientid, struct xdr_netobj *owner) +{ + __be32 status; + + status = nfsd4_decode_clientid4(argp, clientid); + if (status) + return status; + return nfsd4_decode_opaque(argp, owner); } +#ifdef CONFIG_NFSD_PNFS + static __be32 -nfsd4_decode_access(struct nfsd4_compoundargs *argp, struct nfsd4_access *access) +nfsd4_decode_layoutupdate4(struct nfsd4_compoundargs *argp, + struct nfsd4_layoutcommit *lcp) { - DECODE_HEAD; + u32 len; - READ_BUF(4); - access->ac_req_access = be32_to_cpup(p++); + if (xdr_stream_decode_u32(argp->xdr, &lcp->lc_layout_type) < 0) + return nfserr_bad_xdr; + if (lcp->lc_layout_type < LAYOUT_NFSV4_1_FILES) + return nfserr_bad_xdr; + if (lcp->lc_layout_type >= LAYOUT_TYPE_MAX) + return nfserr_bad_xdr; + + if (xdr_stream_decode_u32(argp->xdr, &len) < 0) + return nfserr_bad_xdr; + if (!xdr_stream_subsegment(argp->xdr, &lcp->lc_up_layout, len)) + return nfserr_bad_xdr; - DECODE_TAIL; + return nfs_ok; } -static __be32 nfsd4_decode_cb_sec(struct nfsd4_compoundargs *argp, struct nfsd4_cb_sec *cbs) +static __be32 +nfsd4_decode_layoutreturn4(struct nfsd4_compoundargs *argp, + struct nfsd4_layoutreturn *lrp) { - DECODE_HEAD; - u32 dummy, uid, gid; - char *machine_name; - int i; - int nr_secflavs; + __be32 status; + + if (xdr_stream_decode_u32(argp->xdr, &lrp->lr_return_type) < 0) + return nfserr_bad_xdr; + switch (lrp->lr_return_type) { + case RETURN_FILE: + if (xdr_stream_decode_u64(argp->xdr, &lrp->lr_seg.offset) < 0) + return nfserr_bad_xdr; + if (xdr_stream_decode_u64(argp->xdr, &lrp->lr_seg.length) < 0) + return nfserr_bad_xdr; + status = nfsd4_decode_stateid4(argp, &lrp->lr_sid); + if (status) + return status; + if (xdr_stream_decode_u32(argp->xdr, &lrp->lrf_body_len) < 0) + return nfserr_bad_xdr; + if (lrp->lrf_body_len > 0) { + lrp->lrf_body = xdr_inline_decode(argp->xdr, 
lrp->lrf_body_len); + if (!lrp->lrf_body) + return nfserr_bad_xdr; + } + break; + case RETURN_FSID: + case RETURN_ALL: + lrp->lr_seg.offset = 0; + lrp->lr_seg.length = NFS4_MAX_UINT64; + break; + default: + return nfserr_bad_xdr; + } + + return nfs_ok; +} + +#endif /* CONFIG_NFSD_PNFS */ + +static __be32 +nfsd4_decode_sessionid4(struct nfsd4_compoundargs *argp, + struct nfs4_sessionid *sessionid) +{ + __be32 *p; + + p = xdr_inline_decode(argp->xdr, NFS4_MAX_SESSIONID_LEN); + if (!p) + return nfserr_bad_xdr; + memcpy(sessionid->data, p, sizeof(sessionid->data)); + return nfs_ok; +} + +/* Defined in Appendix A of RFC 5531 */ +static __be32 +nfsd4_decode_authsys_parms(struct nfsd4_compoundargs *argp, + struct nfsd4_cb_sec *cbs) +{ + u32 stamp, gidcount, uid, gid; + __be32 *p, status; + + if (xdr_stream_decode_u32(argp->xdr, &stamp) < 0) + return nfserr_bad_xdr; + /* machine name */ + status = nfsd4_decode_ignored_string(argp, 255); + if (status) + return status; + if (xdr_stream_decode_u32(argp->xdr, &uid) < 0) + return nfserr_bad_xdr; + if (xdr_stream_decode_u32(argp->xdr, &gid) < 0) + return nfserr_bad_xdr; + if (xdr_stream_decode_u32(argp->xdr, &gidcount) < 0) + return nfserr_bad_xdr; + if (gidcount > 16) + return nfserr_bad_xdr; + p = xdr_inline_decode(argp->xdr, gidcount << 2); + if (!p) + return nfserr_bad_xdr; + if (cbs->flavor == (u32)(-1)) { + struct user_namespace *userns = nfsd_user_namespace(argp->rqstp); + + kuid_t kuid = make_kuid(userns, uid); + kgid_t kgid = make_kgid(userns, gid); + if (uid_valid(kuid) && gid_valid(kgid)) { + cbs->uid = kuid; + cbs->gid = kgid; + cbs->flavor = RPC_AUTH_UNIX; + } else { + dprintk("RPC_AUTH_UNIX with invalid uid or gid, ignoring!\n"); + } + } + + return nfs_ok; +} + +static __be32 +nfsd4_decode_gss_cb_handles4(struct nfsd4_compoundargs *argp, + struct nfsd4_cb_sec *cbs) +{ + __be32 status; + u32 service; + + dprintk("RPC_AUTH_GSS callback secflavor not supported!\n"); + + if (xdr_stream_decode_u32(argp->xdr, &service) < 0) + return nfserr_bad_xdr; + if (service < RPC_GSS_SVC_NONE || service > RPC_GSS_SVC_PRIVACY) + return nfserr_bad_xdr; + /* gcbp_handle_from_server */ + status = nfsd4_decode_ignored_string(argp, 0); + if (status) + return status; + /* gcbp_handle_from_client */ + status = nfsd4_decode_ignored_string(argp, 0); + if (status) + return status; + + return nfs_ok; +} + +/* a counted array of callback_sec_parms4 items */ +static __be32 +nfsd4_decode_cb_sec(struct nfsd4_compoundargs *argp, struct nfsd4_cb_sec *cbs) +{ + u32 i, secflavor, nr_secflavs; + __be32 status; /* callback_sec_params4 */ - READ_BUF(4); - nr_secflavs = be32_to_cpup(p++); + if (xdr_stream_decode_u32(argp->xdr, &nr_secflavs) < 0) + return nfserr_bad_xdr; if (nr_secflavs) cbs->flavor = (u32)(-1); else /* Is this legal? 
Be generous, take it to mean AUTH_NONE: */ cbs->flavor = 0; + for (i = 0; i < nr_secflavs; ++i) { - READ_BUF(4); - dummy = be32_to_cpup(p++); - switch (dummy) { + if (xdr_stream_decode_u32(argp->xdr, &secflavor) < 0) + return nfserr_bad_xdr; + switch (secflavor) { case RPC_AUTH_NULL: - /* Nothing to read */ + /* void */ if (cbs->flavor == (u32)(-1)) cbs->flavor = RPC_AUTH_NULL; break; case RPC_AUTH_UNIX: - READ_BUF(8); - /* stamp */ - dummy = be32_to_cpup(p++); - - /* machine name */ - dummy = be32_to_cpup(p++); - READ_BUF(dummy); - SAVEMEM(machine_name, dummy); - - /* uid, gid */ - READ_BUF(8); - uid = be32_to_cpup(p++); - gid = be32_to_cpup(p++); - - /* more gids */ - READ_BUF(4); - dummy = be32_to_cpup(p++); - READ_BUF(dummy * 4); - if (cbs->flavor == (u32)(-1)) { - kuid_t kuid = make_kuid(&init_user_ns, uid); - kgid_t kgid = make_kgid(&init_user_ns, gid); - if (uid_valid(kuid) && gid_valid(kgid)) { - cbs->uid = kuid; - cbs->gid = kgid; - cbs->flavor = RPC_AUTH_UNIX; - } else { - dprintk("RPC_AUTH_UNIX with invalid" - "uid or gid ignoring!\n"); - } - } + status = nfsd4_decode_authsys_parms(argp, cbs); + if (status) + return status; break; case RPC_AUTH_GSS: - dprintk("RPC_AUTH_GSS callback secflavor " - "not supported!\n"); - READ_BUF(8); - /* gcbp_service */ - dummy = be32_to_cpup(p++); - /* gcbp_handle_from_server */ - dummy = be32_to_cpup(p++); - READ_BUF(dummy); - p += XDR_QUADLEN(dummy); - /* gcbp_handle_from_client */ - READ_BUF(4); - dummy = be32_to_cpup(p++); - READ_BUF(dummy); + status = nfsd4_decode_gss_cb_handles4(argp, cbs); + if (status) + return status; break; default: - dprintk("Illegal callback secflavor\n"); return nfserr_inval; } } - DECODE_TAIL; -} -static __be32 nfsd4_decode_backchannel_ctl(struct nfsd4_compoundargs *argp, struct nfsd4_backchannel_ctl *bc) -{ - DECODE_HEAD; + return nfs_ok; +} - READ_BUF(4); - bc->bc_cb_program = be32_to_cpup(p++); - nfsd4_decode_cb_sec(argp, &bc->bc_cb_sec); - DECODE_TAIL; -} +/* + * NFSv4 operation argument decoders + */ -static __be32 nfsd4_decode_bind_conn_to_session(struct nfsd4_compoundargs *argp, struct nfsd4_bind_conn_to_session *bcts) +static __be32 +nfsd4_decode_access(struct nfsd4_compoundargs *argp, + union nfsd4_op_u *u) { - DECODE_HEAD; - - READ_BUF(NFS4_MAX_SESSIONID_LEN + 8); - COPYMEM(bcts->sessionid.data, NFS4_MAX_SESSIONID_LEN); - bcts->dir = be32_to_cpup(p++); - /* XXX: skipping ctsa_use_conn_in_rdma_mode. Perhaps Tom Tucker - * could help us figure out we should be using it. 
*/ - DECODE_TAIL; + struct nfsd4_access *access = &u->access; + if (xdr_stream_decode_u32(argp->xdr, &access->ac_req_access) < 0) + return nfserr_bad_xdr; + return nfs_ok; } static __be32 -nfsd4_decode_close(struct nfsd4_compoundargs *argp, struct nfsd4_close *close) +nfsd4_decode_close(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u) { - DECODE_HEAD; - - READ_BUF(4); - close->cl_seqid = be32_to_cpup(p++); - return nfsd4_decode_stateid(argp, &close->cl_stateid); - - DECODE_TAIL; + struct nfsd4_close *close = &u->close; + if (xdr_stream_decode_u32(argp->xdr, &close->cl_seqid) < 0) + return nfserr_bad_xdr; + return nfsd4_decode_stateid4(argp, &close->cl_stateid); } static __be32 -nfsd4_decode_commit(struct nfsd4_compoundargs *argp, struct nfsd4_commit *commit) +nfsd4_decode_commit(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u) { - DECODE_HEAD; - - READ_BUF(12); - p = xdr_decode_hyper(p, &commit->co_offset); - commit->co_count = be32_to_cpup(p++); - - DECODE_TAIL; + struct nfsd4_commit *commit = &u->commit; + if (xdr_stream_decode_u64(argp->xdr, &commit->co_offset) < 0) + return nfserr_bad_xdr; + if (xdr_stream_decode_u32(argp->xdr, &commit->co_count) < 0) + return nfserr_bad_xdr; + memset(&commit->co_verf, 0, sizeof(commit->co_verf)); + return nfs_ok; } static __be32 -nfsd4_decode_create(struct nfsd4_compoundargs *argp, struct nfsd4_create *create) +nfsd4_decode_create(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u) { - DECODE_HEAD; + struct nfsd4_create *create = &u->create; + __be32 *p, status; - READ_BUF(4); - create->cr_type = be32_to_cpup(p++); + memset(create, 0, sizeof(*create)); + if (xdr_stream_decode_u32(argp->xdr, &create->cr_type) < 0) + return nfserr_bad_xdr; switch (create->cr_type) { case NF4LNK: - READ_BUF(4); - create->cr_datalen = be32_to_cpup(p++); - READ_BUF(create->cr_datalen); + if (xdr_stream_decode_u32(argp->xdr, &create->cr_datalen) < 0) + return nfserr_bad_xdr; + p = xdr_inline_decode(argp->xdr, create->cr_datalen); + if (!p) + return nfserr_bad_xdr; create->cr_data = svcxdr_dupstr(argp, p, create->cr_datalen); if (!create->cr_data) return nfserr_jukebox; break; case NF4BLK: case NF4CHR: - READ_BUF(8); - create->cr_specdata1 = be32_to_cpup(p++); - create->cr_specdata2 = be32_to_cpup(p++); + if (xdr_stream_decode_u32(argp->xdr, &create->cr_specdata1) < 0) + return nfserr_bad_xdr; + if (xdr_stream_decode_u32(argp->xdr, &create->cr_specdata2) < 0) + return nfserr_bad_xdr; break; case NF4SOCK: case NF4FIFO: @@ -674,151 +842,221 @@ nfsd4_decode_create(struct nfsd4_compoundargs *argp, struct nfsd4_create *create default: break; } - - READ_BUF(4); - create->cr_namelen = be32_to_cpup(p++); - READ_BUF(create->cr_namelen); - SAVEMEM(create->cr_name, create->cr_namelen); - if ((status = check_filename(create->cr_name, create->cr_namelen))) + status = nfsd4_decode_component4(argp, &create->cr_name, + &create->cr_namelen); + if (status) return status; - - status = nfsd4_decode_fattr(argp, create->cr_bmval, &create->cr_iattr, - &create->cr_acl, &create->cr_label, - &create->cr_umask); + status = nfsd4_decode_fattr4(argp, create->cr_bmval, + ARRAY_SIZE(create->cr_bmval), + &create->cr_iattr, &create->cr_acl, + &create->cr_label, &create->cr_umask); if (status) - goto out; + return status; - DECODE_TAIL; + return nfs_ok; } static inline __be32 -nfsd4_decode_delegreturn(struct nfsd4_compoundargs *argp, struct nfsd4_delegreturn *dr) +nfsd4_decode_delegreturn(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u) { - return nfsd4_decode_stateid(argp, &dr->dr_stateid); 
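/*
 * Illustrative only, not part of the patch: a minimal sketch of the
 * decoder shape this series converts to, assuming the sunrpc
 * xdr_stream helpers (xdr_stream_decode_u32/u64, xdr_inline_decode)
 * and the svcxdr_savemem() helper introduced earlier in this file.
 * The struct nfsd4_example and nfsd4_decode_example() names are
 * hypothetical and exist only to show the pattern.
 */
struct nfsd4_example {
	u64	ex_offset;	/* fixed-width field */
	u32	ex_len;		/* length of the variable part */
	char	*ex_data;	/* variable-length opaque */
};

static __be32
nfsd4_decode_example(struct nfsd4_compoundargs *argp, struct nfsd4_example *ex)
{
	__be32 *p;

	/* Fixed-width items: each helper bounds-checks the stream itself. */
	if (xdr_stream_decode_u64(argp->xdr, &ex->ex_offset) < 0)
		return nfserr_bad_xdr;
	if (xdr_stream_decode_u32(argp->xdr, &ex->ex_len) < 0)
		return nfserr_bad_xdr;

	/* Variable-length opaque: reserve @ex_len bytes in the stream... */
	p = xdr_inline_decode(argp->xdr, ex->ex_len);
	if (!p)
		return nfserr_bad_xdr;
	/*
	 * ...then keep the bytes for the life of the compound; a copy is
	 * made only when the item was decoded into the scratch buffer.
	 */
	ex->ex_data = svcxdr_savemem(argp, p, ex->ex_len);
	if (!ex->ex_data)
		return nfserr_jukebox;

	return nfs_ok;
}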
+ struct nfsd4_delegreturn *dr = &u->delegreturn; + return nfsd4_decode_stateid4(argp, &dr->dr_stateid); } static inline __be32 -nfsd4_decode_getattr(struct nfsd4_compoundargs *argp, struct nfsd4_getattr *getattr) +nfsd4_decode_getattr(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u) { - return nfsd4_decode_bitmap(argp, getattr->ga_bmval); + struct nfsd4_getattr *getattr = &u->getattr; + memset(getattr, 0, sizeof(*getattr)); + return nfsd4_decode_bitmap4(argp, getattr->ga_bmval, + ARRAY_SIZE(getattr->ga_bmval)); } static __be32 -nfsd4_decode_link(struct nfsd4_compoundargs *argp, struct nfsd4_link *link) +nfsd4_decode_link(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u) { - DECODE_HEAD; + struct nfsd4_link *link = &u->link; + memset(link, 0, sizeof(*link)); + return nfsd4_decode_component4(argp, &link->li_name, &link->li_namelen); +} - READ_BUF(4); - link->li_namelen = be32_to_cpup(p++); - READ_BUF(link->li_namelen); - SAVEMEM(link->li_name, link->li_namelen); - if ((status = check_filename(link->li_name, link->li_namelen))) - return status; +static __be32 +nfsd4_decode_open_to_lock_owner4(struct nfsd4_compoundargs *argp, + struct nfsd4_lock *lock) +{ + __be32 status; - DECODE_TAIL; + if (xdr_stream_decode_u32(argp->xdr, &lock->lk_new_open_seqid) < 0) + return nfserr_bad_xdr; + status = nfsd4_decode_stateid4(argp, &lock->lk_new_open_stateid); + if (status) + return status; + if (xdr_stream_decode_u32(argp->xdr, &lock->lk_new_lock_seqid) < 0) + return nfserr_bad_xdr; + return nfsd4_decode_state_owner4(argp, &lock->lk_new_clientid, + &lock->lk_new_owner); } static __be32 -nfsd4_decode_lock(struct nfsd4_compoundargs *argp, struct nfsd4_lock *lock) +nfsd4_decode_exist_lock_owner4(struct nfsd4_compoundargs *argp, + struct nfsd4_lock *lock) { - DECODE_HEAD; + __be32 status; - /* - * type, reclaim(boolean), offset, length, new_lock_owner(boolean) - */ - READ_BUF(28); - lock->lk_type = be32_to_cpup(p++); - if ((lock->lk_type < NFS4_READ_LT) || (lock->lk_type > NFS4_WRITEW_LT)) - goto xdr_error; - lock->lk_reclaim = be32_to_cpup(p++); - p = xdr_decode_hyper(p, &lock->lk_offset); - p = xdr_decode_hyper(p, &lock->lk_length); - lock->lk_is_new = be32_to_cpup(p++); - - if (lock->lk_is_new) { - READ_BUF(4); - lock->lk_new_open_seqid = be32_to_cpup(p++); - status = nfsd4_decode_stateid(argp, &lock->lk_new_open_stateid); - if (status) - return status; - READ_BUF(8 + sizeof(clientid_t)); - lock->lk_new_lock_seqid = be32_to_cpup(p++); - COPYMEM(&lock->lk_new_clientid, sizeof(clientid_t)); - lock->lk_new_owner.len = be32_to_cpup(p++); - READ_BUF(lock->lk_new_owner.len); - READMEM(lock->lk_new_owner.data, lock->lk_new_owner.len); - } else { - status = nfsd4_decode_stateid(argp, &lock->lk_old_lock_stateid); - if (status) - return status; - READ_BUF(4); - lock->lk_old_lock_seqid = be32_to_cpup(p++); - } + status = nfsd4_decode_stateid4(argp, &lock->lk_old_lock_stateid); + if (status) + return status; + if (xdr_stream_decode_u32(argp->xdr, &lock->lk_old_lock_seqid) < 0) + return nfserr_bad_xdr; - DECODE_TAIL; + return nfs_ok; } static __be32 -nfsd4_decode_lockt(struct nfsd4_compoundargs *argp, struct nfsd4_lockt *lockt) +nfsd4_decode_locker4(struct nfsd4_compoundargs *argp, struct nfsd4_lock *lock) { - DECODE_HEAD; - - READ_BUF(32); - lockt->lt_type = be32_to_cpup(p++); - if((lockt->lt_type < NFS4_READ_LT) || (lockt->lt_type > NFS4_WRITEW_LT)) - goto xdr_error; - p = xdr_decode_hyper(p, &lockt->lt_offset); - p = xdr_decode_hyper(p, &lockt->lt_length); - COPYMEM(&lockt->lt_clientid, 8); - 
lockt->lt_owner.len = be32_to_cpup(p++); - READ_BUF(lockt->lt_owner.len); - READMEM(lockt->lt_owner.data, lockt->lt_owner.len); + if (xdr_stream_decode_bool(argp->xdr, &lock->lk_is_new) < 0) + return nfserr_bad_xdr; + if (lock->lk_is_new) + return nfsd4_decode_open_to_lock_owner4(argp, lock); + return nfsd4_decode_exist_lock_owner4(argp, lock); +} + +static __be32 +nfsd4_decode_lock(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u) +{ + struct nfsd4_lock *lock = &u->lock; + memset(lock, 0, sizeof(*lock)); + if (xdr_stream_decode_u32(argp->xdr, &lock->lk_type) < 0) + return nfserr_bad_xdr; + if ((lock->lk_type < NFS4_READ_LT) || (lock->lk_type > NFS4_WRITEW_LT)) + return nfserr_bad_xdr; + if (xdr_stream_decode_bool(argp->xdr, &lock->lk_reclaim) < 0) + return nfserr_bad_xdr; + if (xdr_stream_decode_u64(argp->xdr, &lock->lk_offset) < 0) + return nfserr_bad_xdr; + if (xdr_stream_decode_u64(argp->xdr, &lock->lk_length) < 0) + return nfserr_bad_xdr; + return nfsd4_decode_locker4(argp, lock); +} - DECODE_TAIL; +static __be32 +nfsd4_decode_lockt(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u) +{ + struct nfsd4_lockt *lockt = &u->lockt; + memset(lockt, 0, sizeof(*lockt)); + if (xdr_stream_decode_u32(argp->xdr, &lockt->lt_type) < 0) + return nfserr_bad_xdr; + if ((lockt->lt_type < NFS4_READ_LT) || (lockt->lt_type > NFS4_WRITEW_LT)) + return nfserr_bad_xdr; + if (xdr_stream_decode_u64(argp->xdr, &lockt->lt_offset) < 0) + return nfserr_bad_xdr; + if (xdr_stream_decode_u64(argp->xdr, &lockt->lt_length) < 0) + return nfserr_bad_xdr; + return nfsd4_decode_state_owner4(argp, &lockt->lt_clientid, + &lockt->lt_owner); } static __be32 -nfsd4_decode_locku(struct nfsd4_compoundargs *argp, struct nfsd4_locku *locku) +nfsd4_decode_locku(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u) { - DECODE_HEAD; + struct nfsd4_locku *locku = &u->locku; + __be32 status; - READ_BUF(8); - locku->lu_type = be32_to_cpup(p++); + if (xdr_stream_decode_u32(argp->xdr, &locku->lu_type) < 0) + return nfserr_bad_xdr; if ((locku->lu_type < NFS4_READ_LT) || (locku->lu_type > NFS4_WRITEW_LT)) - goto xdr_error; - locku->lu_seqid = be32_to_cpup(p++); - status = nfsd4_decode_stateid(argp, &locku->lu_stateid); + return nfserr_bad_xdr; + if (xdr_stream_decode_u32(argp->xdr, &locku->lu_seqid) < 0) + return nfserr_bad_xdr; + status = nfsd4_decode_stateid4(argp, &locku->lu_stateid); if (status) return status; - READ_BUF(16); - p = xdr_decode_hyper(p, &locku->lu_offset); - p = xdr_decode_hyper(p, &locku->lu_length); + if (xdr_stream_decode_u64(argp->xdr, &locku->lu_offset) < 0) + return nfserr_bad_xdr; + if (xdr_stream_decode_u64(argp->xdr, &locku->lu_length) < 0) + return nfserr_bad_xdr; - DECODE_TAIL; + return nfs_ok; } static __be32 -nfsd4_decode_lookup(struct nfsd4_compoundargs *argp, struct nfsd4_lookup *lookup) +nfsd4_decode_lookup(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u) { - DECODE_HEAD; + struct nfsd4_lookup *lookup = &u->lookup; + return nfsd4_decode_component4(argp, &lookup->lo_name, &lookup->lo_len); +} - READ_BUF(4); - lookup->lo_len = be32_to_cpup(p++); - READ_BUF(lookup->lo_len); - SAVEMEM(lookup->lo_name, lookup->lo_len); - if ((status = check_filename(lookup->lo_name, lookup->lo_len))) - return status; +static __be32 +nfsd4_decode_createhow4(struct nfsd4_compoundargs *argp, struct nfsd4_open *open) +{ + __be32 status; + + if (xdr_stream_decode_u32(argp->xdr, &open->op_createmode) < 0) + return nfserr_bad_xdr; + switch (open->op_createmode) { + case NFS4_CREATE_UNCHECKED: + case NFS4_CREATE_GUARDED: + 
status = nfsd4_decode_fattr4(argp, open->op_bmval, + ARRAY_SIZE(open->op_bmval), + &open->op_iattr, &open->op_acl, + &open->op_label, &open->op_umask); + if (status) + return status; + break; + case NFS4_CREATE_EXCLUSIVE: + status = nfsd4_decode_verifier4(argp, &open->op_verf); + if (status) + return status; + break; + case NFS4_CREATE_EXCLUSIVE4_1: + if (argp->minorversion < 1) + return nfserr_bad_xdr; + status = nfsd4_decode_verifier4(argp, &open->op_verf); + if (status) + return status; + status = nfsd4_decode_fattr4(argp, open->op_bmval, + ARRAY_SIZE(open->op_bmval), + &open->op_iattr, &open->op_acl, + &open->op_label, &open->op_umask); + if (status) + return status; + break; + default: + return nfserr_bad_xdr; + } - DECODE_TAIL; + return nfs_ok; +} + +static __be32 +nfsd4_decode_openflag4(struct nfsd4_compoundargs *argp, struct nfsd4_open *open) +{ + __be32 status; + + if (xdr_stream_decode_u32(argp->xdr, &open->op_create) < 0) + return nfserr_bad_xdr; + switch (open->op_create) { + case NFS4_OPEN_NOCREATE: + break; + case NFS4_OPEN_CREATE: + status = nfsd4_decode_createhow4(argp, open); + if (status) + return status; + break; + default: + return nfserr_bad_xdr; + } + + return nfs_ok; } static __be32 nfsd4_decode_share_access(struct nfsd4_compoundargs *argp, u32 *share_access, u32 *deleg_want, u32 *deleg_when) { - __be32 *p; u32 w; - READ_BUF(4); - w = be32_to_cpup(p++); + if (xdr_stream_decode_u32(argp->xdr, &w) < 0) + return nfserr_bad_xdr; *share_access = w & NFS4_SHARE_ACCESS_MASK; *deleg_want = w & NFS4_SHARE_WANT_MASK; if (deleg_when) @@ -837,13 +1075,13 @@ static __be32 nfsd4_decode_share_access(struct nfsd4_compoundargs *argp, u32 *sh return nfs_ok; if (!argp->minorversion) return nfserr_bad_xdr; - switch (w & NFS4_SHARE_WANT_MASK) { - case NFS4_SHARE_WANT_NO_PREFERENCE: - case NFS4_SHARE_WANT_READ_DELEG: - case NFS4_SHARE_WANT_WRITE_DELEG: - case NFS4_SHARE_WANT_ANY_DELEG: - case NFS4_SHARE_WANT_NO_DELEG: - case NFS4_SHARE_WANT_CANCEL: + switch (w & NFS4_SHARE_WANT_TYPE_MASK) { + case OPEN4_SHARE_ACCESS_WANT_NO_PREFERENCE: + case OPEN4_SHARE_ACCESS_WANT_READ_DELEG: + case OPEN4_SHARE_ACCESS_WANT_WRITE_DELEG: + case OPEN4_SHARE_ACCESS_WANT_ANY_DELEG: + case OPEN4_SHARE_ACCESS_WANT_NO_DELEG: + case OPEN4_SHARE_ACCESS_WANT_CANCEL: break; default: return nfserr_bad_xdr; @@ -861,1031 +1099,1349 @@ static __be32 nfsd4_decode_share_access(struct nfsd4_compoundargs *argp, u32 *sh NFS4_SHARE_PUSH_DELEG_WHEN_UNCONTENDED): return nfs_ok; } -xdr_error: return nfserr_bad_xdr; } static __be32 nfsd4_decode_share_deny(struct nfsd4_compoundargs *argp, u32 *x) { - __be32 *p; - - READ_BUF(4); - *x = be32_to_cpup(p++); - /* Note: unlinke access bits, deny bits may be zero. */ - if (*x & ~NFS4_SHARE_DENY_BOTH) + if (xdr_stream_decode_u32(argp->xdr, x) < 0) return nfserr_bad_xdr; - return nfs_ok; -xdr_error: - return nfserr_bad_xdr; -} - -static __be32 nfsd4_decode_opaque(struct nfsd4_compoundargs *argp, struct xdr_netobj *o) -{ - __be32 *p; - - READ_BUF(4); - o->len = be32_to_cpup(p++); - - if (o->len == 0 || o->len > NFS4_OPAQUE_LIMIT) + /* Note: unlike access bits, deny bits may be zero. 
*/ + if (*x & ~NFS4_SHARE_DENY_BOTH) return nfserr_bad_xdr; - READ_BUF(o->len); - SAVEMEM(o->data, o->len); return nfs_ok; -xdr_error: - return nfserr_bad_xdr; } static __be32 -nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open) +nfsd4_decode_open_claim4(struct nfsd4_compoundargs *argp, + struct nfsd4_open *open) { - DECODE_HEAD; - u32 dummy; - - memset(open->op_bmval, 0, sizeof(open->op_bmval)); - open->op_iattr.ia_valid = 0; - open->op_openowner = NULL; - - open->op_xdr_error = 0; - /* seqid, share_access, share_deny, clientid, ownerlen */ - READ_BUF(4); - open->op_seqid = be32_to_cpup(p++); - /* decode, yet ignore deleg_when until supported */ - status = nfsd4_decode_share_access(argp, &open->op_share_access, - &open->op_deleg_want, &dummy); - if (status) - goto xdr_error; - status = nfsd4_decode_share_deny(argp, &open->op_share_deny); - if (status) - goto xdr_error; - READ_BUF(sizeof(clientid_t)); - COPYMEM(&open->op_clientid, sizeof(clientid_t)); - status = nfsd4_decode_opaque(argp, &open->op_owner); - if (status) - goto xdr_error; - READ_BUF(4); - open->op_create = be32_to_cpup(p++); - switch (open->op_create) { - case NFS4_OPEN_NOCREATE: - break; - case NFS4_OPEN_CREATE: - READ_BUF(4); - open->op_createmode = be32_to_cpup(p++); - switch (open->op_createmode) { - case NFS4_CREATE_UNCHECKED: - case NFS4_CREATE_GUARDED: - status = nfsd4_decode_fattr(argp, open->op_bmval, - &open->op_iattr, &open->op_acl, &open->op_label, - &open->op_umask); - if (status) - goto out; - break; - case NFS4_CREATE_EXCLUSIVE: - READ_BUF(NFS4_VERIFIER_SIZE); - COPYMEM(open->op_verf.data, NFS4_VERIFIER_SIZE); - break; - case NFS4_CREATE_EXCLUSIVE4_1: - if (argp->minorversion < 1) - goto xdr_error; - READ_BUF(NFS4_VERIFIER_SIZE); - COPYMEM(open->op_verf.data, NFS4_VERIFIER_SIZE); - status = nfsd4_decode_fattr(argp, open->op_bmval, - &open->op_iattr, &open->op_acl, &open->op_label, - &open->op_umask); - if (status) - goto out; - break; - default: - goto xdr_error; - } - break; - default: - goto xdr_error; - } + __be32 status; - /* open_claim */ - READ_BUF(4); - open->op_claim_type = be32_to_cpup(p++); + if (xdr_stream_decode_u32(argp->xdr, &open->op_claim_type) < 0) + return nfserr_bad_xdr; switch (open->op_claim_type) { case NFS4_OPEN_CLAIM_NULL: case NFS4_OPEN_CLAIM_DELEGATE_PREV: - READ_BUF(4); - open->op_fname.len = be32_to_cpup(p++); - READ_BUF(open->op_fname.len); - SAVEMEM(open->op_fname.data, open->op_fname.len); - if ((status = check_filename(open->op_fname.data, open->op_fname.len))) + status = nfsd4_decode_component4(argp, &open->op_fname, + &open->op_fnamelen); + if (status) return status; break; case NFS4_OPEN_CLAIM_PREVIOUS: - READ_BUF(4); - open->op_delegate_type = be32_to_cpup(p++); + if (xdr_stream_decode_u32(argp->xdr, &open->op_delegate_type) < 0) + return nfserr_bad_xdr; break; case NFS4_OPEN_CLAIM_DELEGATE_CUR: - status = nfsd4_decode_stateid(argp, &open->op_delegate_stateid); + status = nfsd4_decode_stateid4(argp, &open->op_delegate_stateid); if (status) return status; - READ_BUF(4); - open->op_fname.len = be32_to_cpup(p++); - READ_BUF(open->op_fname.len); - SAVEMEM(open->op_fname.data, open->op_fname.len); - if ((status = check_filename(open->op_fname.data, open->op_fname.len))) + status = nfsd4_decode_component4(argp, &open->op_fname, + &open->op_fnamelen); + if (status) return status; break; case NFS4_OPEN_CLAIM_FH: case NFS4_OPEN_CLAIM_DELEG_PREV_FH: if (argp->minorversion < 1) - goto xdr_error; + return nfserr_bad_xdr; /* void */ break; case 
NFS4_OPEN_CLAIM_DELEG_CUR_FH: if (argp->minorversion < 1) - goto xdr_error; - status = nfsd4_decode_stateid(argp, &open->op_delegate_stateid); + return nfserr_bad_xdr; + status = nfsd4_decode_stateid4(argp, &open->op_delegate_stateid); if (status) return status; break; default: - goto xdr_error; + return nfserr_bad_xdr; } - DECODE_TAIL; + return nfs_ok; } static __be32 -nfsd4_decode_open_confirm(struct nfsd4_compoundargs *argp, struct nfsd4_open_confirm *open_conf) +nfsd4_decode_open(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u) { - DECODE_HEAD; + struct nfsd4_open *open = &u->open; + __be32 status; + u32 dummy; - if (argp->minorversion >= 1) - return nfserr_notsupp; + memset(open, 0, sizeof(*open)); - status = nfsd4_decode_stateid(argp, &open_conf->oc_req_stateid); + if (xdr_stream_decode_u32(argp->xdr, &open->op_seqid) < 0) + return nfserr_bad_xdr; + /* deleg_want is ignored */ + status = nfsd4_decode_share_access(argp, &open->op_share_access, + &open->op_deleg_want, &dummy); if (status) return status; - READ_BUF(4); - open_conf->oc_seqid = be32_to_cpup(p++); - - DECODE_TAIL; -} - -static __be32 -nfsd4_decode_open_downgrade(struct nfsd4_compoundargs *argp, struct nfsd4_open_downgrade *open_down) -{ - DECODE_HEAD; - - status = nfsd4_decode_stateid(argp, &open_down->od_stateid); + status = nfsd4_decode_share_deny(argp, &open->op_share_deny); if (status) return status; - READ_BUF(4); - open_down->od_seqid = be32_to_cpup(p++); - status = nfsd4_decode_share_access(argp, &open_down->od_share_access, - &open_down->od_deleg_want, NULL); + status = nfsd4_decode_state_owner4(argp, &open->op_clientid, + &open->op_owner); if (status) return status; - status = nfsd4_decode_share_deny(argp, &open_down->od_share_deny); + status = nfsd4_decode_openflag4(argp, open); if (status) return status; - DECODE_TAIL; + return nfsd4_decode_open_claim4(argp, open); } static __be32 -nfsd4_decode_putfh(struct nfsd4_compoundargs *argp, struct nfsd4_putfh *putfh) +nfsd4_decode_open_confirm(struct nfsd4_compoundargs *argp, + union nfsd4_op_u *u) { - DECODE_HEAD; + struct nfsd4_open_confirm *open_conf = &u->open_confirm; + __be32 status; - READ_BUF(4); - putfh->pf_fhlen = be32_to_cpup(p++); - if (putfh->pf_fhlen > NFS4_FHSIZE) - goto xdr_error; - READ_BUF(putfh->pf_fhlen); - SAVEMEM(putfh->pf_fhval, putfh->pf_fhlen); + if (argp->minorversion >= 1) + return nfserr_notsupp; - DECODE_TAIL; -} + status = nfsd4_decode_stateid4(argp, &open_conf->oc_req_stateid); + if (status) + return status; + if (xdr_stream_decode_u32(argp->xdr, &open_conf->oc_seqid) < 0) + return nfserr_bad_xdr; -static __be32 -nfsd4_decode_putpubfh(struct nfsd4_compoundargs *argp, void *p) -{ - if (argp->minorversion == 0) - return nfs_ok; - return nfserr_notsupp; + memset(&open_conf->oc_resp_stateid, 0, + sizeof(open_conf->oc_resp_stateid)); + return nfs_ok; } static __be32 -nfsd4_decode_read(struct nfsd4_compoundargs *argp, struct nfsd4_read *read) +nfsd4_decode_open_downgrade(struct nfsd4_compoundargs *argp, + union nfsd4_op_u *u) { - DECODE_HEAD; + struct nfsd4_open_downgrade *open_down = &u->open_downgrade; + __be32 status; - status = nfsd4_decode_stateid(argp, &read->rd_stateid); + memset(open_down, 0, sizeof(*open_down)); + status = nfsd4_decode_stateid4(argp, &open_down->od_stateid); if (status) return status; - READ_BUF(12); - p = xdr_decode_hyper(p, &read->rd_offset); - read->rd_length = be32_to_cpup(p++); - - DECODE_TAIL; + if (xdr_stream_decode_u32(argp->xdr, &open_down->od_seqid) < 0) + return nfserr_bad_xdr; + /* deleg_want is ignored 
*/ + status = nfsd4_decode_share_access(argp, &open_down->od_share_access, + &open_down->od_deleg_want, NULL); + if (status) + return status; + return nfsd4_decode_share_deny(argp, &open_down->od_share_deny); } static __be32 -nfsd4_decode_readdir(struct nfsd4_compoundargs *argp, struct nfsd4_readdir *readdir) +nfsd4_decode_putfh(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u) { - DECODE_HEAD; + struct nfsd4_putfh *putfh = &u->putfh; + __be32 *p; - READ_BUF(24); - p = xdr_decode_hyper(p, &readdir->rd_cookie); - COPYMEM(readdir->rd_verf.data, sizeof(readdir->rd_verf.data)); - readdir->rd_dircount = be32_to_cpup(p++); - readdir->rd_maxcount = be32_to_cpup(p++); - if ((status = nfsd4_decode_bitmap(argp, readdir->rd_bmval))) - goto out; + if (xdr_stream_decode_u32(argp->xdr, &putfh->pf_fhlen) < 0) + return nfserr_bad_xdr; + if (putfh->pf_fhlen > NFS4_FHSIZE) + return nfserr_bad_xdr; + p = xdr_inline_decode(argp->xdr, putfh->pf_fhlen); + if (!p) + return nfserr_bad_xdr; + putfh->pf_fhval = svcxdr_savemem(argp, p, putfh->pf_fhlen); + if (!putfh->pf_fhval) + return nfserr_jukebox; - DECODE_TAIL; + putfh->no_verify = false; + return nfs_ok; } static __be32 -nfsd4_decode_remove(struct nfsd4_compoundargs *argp, struct nfsd4_remove *remove) +nfsd4_decode_read(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u) { - DECODE_HEAD; + struct nfsd4_read *read = &u->read; + __be32 status; - READ_BUF(4); - remove->rm_namelen = be32_to_cpup(p++); - READ_BUF(remove->rm_namelen); - SAVEMEM(remove->rm_name, remove->rm_namelen); - if ((status = check_filename(remove->rm_name, remove->rm_namelen))) + memset(read, 0, sizeof(*read)); + status = nfsd4_decode_stateid4(argp, &read->rd_stateid); + if (status) return status; + if (xdr_stream_decode_u64(argp->xdr, &read->rd_offset) < 0) + return nfserr_bad_xdr; + if (xdr_stream_decode_u32(argp->xdr, &read->rd_length) < 0) + return nfserr_bad_xdr; - DECODE_TAIL; + return nfs_ok; } static __be32 -nfsd4_decode_rename(struct nfsd4_compoundargs *argp, struct nfsd4_rename *rename) +nfsd4_decode_readdir(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u) { - DECODE_HEAD; + struct nfsd4_readdir *readdir = &u->readdir; + __be32 status; - READ_BUF(4); - rename->rn_snamelen = be32_to_cpup(p++); - READ_BUF(rename->rn_snamelen); - SAVEMEM(rename->rn_sname, rename->rn_snamelen); - READ_BUF(4); - rename->rn_tnamelen = be32_to_cpup(p++); - READ_BUF(rename->rn_tnamelen); - SAVEMEM(rename->rn_tname, rename->rn_tnamelen); - if ((status = check_filename(rename->rn_sname, rename->rn_snamelen))) - return status; - if ((status = check_filename(rename->rn_tname, rename->rn_tnamelen))) + memset(readdir, 0, sizeof(*readdir)); + if (xdr_stream_decode_u64(argp->xdr, &readdir->rd_cookie) < 0) + return nfserr_bad_xdr; + status = nfsd4_decode_verifier4(argp, &readdir->rd_verf); + if (status) return status; + if (xdr_stream_decode_u32(argp->xdr, &readdir->rd_dircount) < 0) + return nfserr_bad_xdr; + if (xdr_stream_decode_u32(argp->xdr, &readdir->rd_maxcount) < 0) + return nfserr_bad_xdr; + if (xdr_stream_decode_uint32_array(argp->xdr, readdir->rd_bmval, + ARRAY_SIZE(readdir->rd_bmval)) < 0) + return nfserr_bad_xdr; - DECODE_TAIL; + return nfs_ok; } static __be32 -nfsd4_decode_renew(struct nfsd4_compoundargs *argp, clientid_t *clientid) +nfsd4_decode_remove(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u) { - DECODE_HEAD; - - if (argp->minorversion >= 1) - return nfserr_notsupp; - - READ_BUF(sizeof(clientid_t)); - COPYMEM(clientid, sizeof(clientid_t)); - - DECODE_TAIL; + struct nfsd4_remove 
*remove = &u->remove; + memset(&remove->rm_cinfo, 0, sizeof(remove->rm_cinfo)); + return nfsd4_decode_component4(argp, &remove->rm_name, &remove->rm_namelen); } static __be32 -nfsd4_decode_secinfo(struct nfsd4_compoundargs *argp, - struct nfsd4_secinfo *secinfo) +nfsd4_decode_rename(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u) { - DECODE_HEAD; + struct nfsd4_rename *rename = &u->rename; + __be32 status; - READ_BUF(4); - secinfo->si_namelen = be32_to_cpup(p++); - READ_BUF(secinfo->si_namelen); - SAVEMEM(secinfo->si_name, secinfo->si_namelen); - status = check_filename(secinfo->si_name, secinfo->si_namelen); + memset(rename, 0, sizeof(*rename)); + status = nfsd4_decode_component4(argp, &rename->rn_sname, &rename->rn_snamelen); if (status) return status; - DECODE_TAIL; + return nfsd4_decode_component4(argp, &rename->rn_tname, &rename->rn_tnamelen); } static __be32 -nfsd4_decode_secinfo_no_name(struct nfsd4_compoundargs *argp, - struct nfsd4_secinfo_no_name *sin) +nfsd4_decode_renew(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u) { - DECODE_HEAD; + clientid_t *clientid = &u->renew; + return nfsd4_decode_clientid4(argp, clientid); +} - READ_BUF(4); - sin->sin_style = be32_to_cpup(p++); - DECODE_TAIL; +static __be32 +nfsd4_decode_secinfo(struct nfsd4_compoundargs *argp, + union nfsd4_op_u *u) +{ + struct nfsd4_secinfo *secinfo = &u->secinfo; + secinfo->si_exp = NULL; + return nfsd4_decode_component4(argp, &secinfo->si_name, &secinfo->si_namelen); } static __be32 -nfsd4_decode_setattr(struct nfsd4_compoundargs *argp, struct nfsd4_setattr *setattr) +nfsd4_decode_setattr(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u) { + struct nfsd4_setattr *setattr = &u->setattr; __be32 status; - status = nfsd4_decode_stateid(argp, &setattr->sa_stateid); + memset(setattr, 0, sizeof(*setattr)); + status = nfsd4_decode_stateid4(argp, &setattr->sa_stateid); if (status) return status; - return nfsd4_decode_fattr(argp, setattr->sa_bmval, &setattr->sa_iattr, - &setattr->sa_acl, &setattr->sa_label, NULL); + return nfsd4_decode_fattr4(argp, setattr->sa_bmval, + ARRAY_SIZE(setattr->sa_bmval), + &setattr->sa_iattr, &setattr->sa_acl, + &setattr->sa_label, NULL); } static __be32 -nfsd4_decode_setclientid(struct nfsd4_compoundargs *argp, struct nfsd4_setclientid *setclientid) +nfsd4_decode_setclientid(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u) { - DECODE_HEAD; + struct nfsd4_setclientid *setclientid = &u->setclientid; + __be32 *p, status; + + memset(setclientid, 0, sizeof(*setclientid)); if (argp->minorversion >= 1) return nfserr_notsupp; - READ_BUF(NFS4_VERIFIER_SIZE); - COPYMEM(setclientid->se_verf.data, NFS4_VERIFIER_SIZE); - + status = nfsd4_decode_verifier4(argp, &setclientid->se_verf); + if (status) + return status; status = nfsd4_decode_opaque(argp, &setclientid->se_name); if (status) + return status; + if (xdr_stream_decode_u32(argp->xdr, &setclientid->se_callback_prog) < 0) + return nfserr_bad_xdr; + if (xdr_stream_decode_u32(argp->xdr, &setclientid->se_callback_netid_len) < 0) return nfserr_bad_xdr; - READ_BUF(8); - setclientid->se_callback_prog = be32_to_cpup(p++); - setclientid->se_callback_netid_len = be32_to_cpup(p++); - READ_BUF(setclientid->se_callback_netid_len); - SAVEMEM(setclientid->se_callback_netid_val, setclientid->se_callback_netid_len); - READ_BUF(4); - setclientid->se_callback_addr_len = be32_to_cpup(p++); + p = xdr_inline_decode(argp->xdr, setclientid->se_callback_netid_len); + if (!p) + return nfserr_bad_xdr; + setclientid->se_callback_netid_val = 
svcxdr_savemem(argp, p, + setclientid->se_callback_netid_len); + if (!setclientid->se_callback_netid_val) + return nfserr_jukebox; - READ_BUF(setclientid->se_callback_addr_len); - SAVEMEM(setclientid->se_callback_addr_val, setclientid->se_callback_addr_len); - READ_BUF(4); - setclientid->se_callback_ident = be32_to_cpup(p++); + if (xdr_stream_decode_u32(argp->xdr, &setclientid->se_callback_addr_len) < 0) + return nfserr_bad_xdr; + p = xdr_inline_decode(argp->xdr, setclientid->se_callback_addr_len); + if (!p) + return nfserr_bad_xdr; + setclientid->se_callback_addr_val = svcxdr_savemem(argp, p, + setclientid->se_callback_addr_len); + if (!setclientid->se_callback_addr_val) + return nfserr_jukebox; + if (xdr_stream_decode_u32(argp->xdr, &setclientid->se_callback_ident) < 0) + return nfserr_bad_xdr; - DECODE_TAIL; + return nfs_ok; } static __be32 -nfsd4_decode_setclientid_confirm(struct nfsd4_compoundargs *argp, struct nfsd4_setclientid_confirm *scd_c) +nfsd4_decode_setclientid_confirm(struct nfsd4_compoundargs *argp, + union nfsd4_op_u *u) { - DECODE_HEAD; + struct nfsd4_setclientid_confirm *scd_c = &u->setclientid_confirm; + __be32 status; if (argp->minorversion >= 1) return nfserr_notsupp; - READ_BUF(8 + NFS4_VERIFIER_SIZE); - COPYMEM(&scd_c->sc_clientid, 8); - COPYMEM(&scd_c->sc_confirm, NFS4_VERIFIER_SIZE); - - DECODE_TAIL; + status = nfsd4_decode_clientid4(argp, &scd_c->sc_clientid); + if (status) + return status; + return nfsd4_decode_verifier4(argp, &scd_c->sc_confirm); } /* Also used for NVERIFY */ static __be32 -nfsd4_decode_verify(struct nfsd4_compoundargs *argp, struct nfsd4_verify *verify) +nfsd4_decode_verify(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u) { - DECODE_HEAD; + struct nfsd4_verify *verify = &u->verify; + __be32 *p, status; - if ((status = nfsd4_decode_bitmap(argp, verify->ve_bmval))) - goto out; + memset(verify, 0, sizeof(*verify)); + + status = nfsd4_decode_bitmap4(argp, verify->ve_bmval, + ARRAY_SIZE(verify->ve_bmval)); + if (status) + return status; /* For convenience's sake, we compare raw xdr'd attributes in * nfsd4_proc_verify */ - READ_BUF(4); - verify->ve_attrlen = be32_to_cpup(p++); - READ_BUF(verify->ve_attrlen); - SAVEMEM(verify->ve_attrval, verify->ve_attrlen); + if (xdr_stream_decode_u32(argp->xdr, &verify->ve_attrlen) < 0) + return nfserr_bad_xdr; + p = xdr_inline_decode(argp->xdr, verify->ve_attrlen); + if (!p) + return nfserr_bad_xdr; + verify->ve_attrval = svcxdr_savemem(argp, p, verify->ve_attrlen); + if (!verify->ve_attrval) + return nfserr_jukebox; - DECODE_TAIL; + return nfs_ok; } static __be32 -nfsd4_decode_write(struct nfsd4_compoundargs *argp, struct nfsd4_write *write) +nfsd4_decode_write(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u) { - int avail; - int len; - DECODE_HEAD; + struct nfsd4_write *write = &u->write; + __be32 status; - status = nfsd4_decode_stateid(argp, &write->wr_stateid); + status = nfsd4_decode_stateid4(argp, &write->wr_stateid); if (status) return status; - READ_BUF(16); - p = xdr_decode_hyper(p, &write->wr_offset); - write->wr_stable_how = be32_to_cpup(p++); + if (xdr_stream_decode_u64(argp->xdr, &write->wr_offset) < 0) + return nfserr_bad_xdr; + if (xdr_stream_decode_u32(argp->xdr, &write->wr_stable_how) < 0) + return nfserr_bad_xdr; if (write->wr_stable_how > NFS_FILE_SYNC) - goto xdr_error; - write->wr_buflen = be32_to_cpup(p++); - - /* Sorry .. no magic macros for this.. 
* - * READ_BUF(write->wr_buflen); - * SAVEMEM(write->wr_buf, write->wr_buflen); - */ - avail = (char*)argp->end - (char*)argp->p; - if (avail + argp->pagelen < write->wr_buflen) { - dprintk("NFSD: xdr error (%s:%d)\n", - __FILE__, __LINE__); - goto xdr_error; - } - write->wr_head.iov_base = p; - write->wr_head.iov_len = avail; - write->wr_pagelist = argp->pagelist; - - len = XDR_QUADLEN(write->wr_buflen) << 2; - if (len >= avail) { - int pages; - - len -= avail; - - pages = len >> PAGE_SHIFT; - argp->pagelist += pages; - argp->pagelen -= pages * PAGE_SIZE; - len -= pages * PAGE_SIZE; - - next_decode_page(argp); - } - argp->p += XDR_QUADLEN(len); + return nfserr_bad_xdr; + if (xdr_stream_decode_u32(argp->xdr, &write->wr_buflen) < 0) + return nfserr_bad_xdr; + if (!xdr_stream_subsegment(argp->xdr, &write->wr_payload, write->wr_buflen)) + return nfserr_bad_xdr; - DECODE_TAIL; + write->wr_bytes_written = 0; + write->wr_how_written = 0; + memset(&write->wr_verifier, 0, sizeof(write->wr_verifier)); + return nfs_ok; } static __be32 -nfsd4_decode_release_lockowner(struct nfsd4_compoundargs *argp, struct nfsd4_release_lockowner *rlockowner) +nfsd4_decode_release_lockowner(struct nfsd4_compoundargs *argp, + union nfsd4_op_u *u) { - DECODE_HEAD; + struct nfsd4_release_lockowner *rlockowner = &u->release_lockowner; + __be32 status; if (argp->minorversion >= 1) return nfserr_notsupp; - READ_BUF(12); - COPYMEM(&rlockowner->rl_clientid, sizeof(clientid_t)); - rlockowner->rl_owner.len = be32_to_cpup(p++); - READ_BUF(rlockowner->rl_owner.len); - READMEM(rlockowner->rl_owner.data, rlockowner->rl_owner.len); + status = nfsd4_decode_state_owner4(argp, &rlockowner->rl_clientid, + &rlockowner->rl_owner); + if (status) + return status; if (argp->minorversion && !zero_clientid(&rlockowner->rl_clientid)) return nfserr_inval; - DECODE_TAIL; + + return nfs_ok; +} + +static __be32 nfsd4_decode_backchannel_ctl(struct nfsd4_compoundargs *argp, + union nfsd4_op_u *u) +{ + struct nfsd4_backchannel_ctl *bc = &u->backchannel_ctl; + memset(bc, 0, sizeof(*bc)); + if (xdr_stream_decode_u32(argp->xdr, &bc->bc_cb_program) < 0) + return nfserr_bad_xdr; + return nfsd4_decode_cb_sec(argp, &bc->bc_cb_sec); +} + +static __be32 nfsd4_decode_bind_conn_to_session(struct nfsd4_compoundargs *argp, + union nfsd4_op_u *u) +{ + struct nfsd4_bind_conn_to_session *bcts = &u->bind_conn_to_session; + u32 use_conn_in_rdma_mode; + __be32 status; + + memset(bcts, 0, sizeof(*bcts)); + status = nfsd4_decode_sessionid4(argp, &bcts->sessionid); + if (status) + return status; + if (xdr_stream_decode_u32(argp->xdr, &bcts->dir) < 0) + return nfserr_bad_xdr; + if (xdr_stream_decode_u32(argp->xdr, &use_conn_in_rdma_mode) < 0) + return nfserr_bad_xdr; + + return nfs_ok; } static __be32 -nfsd4_decode_exchange_id(struct nfsd4_compoundargs *argp, - struct nfsd4_exchange_id *exid) +nfsd4_decode_state_protect_ops(struct nfsd4_compoundargs *argp, + struct nfsd4_exchange_id *exid) { - int dummy, tmp; - DECODE_HEAD; + __be32 status; + + status = nfsd4_decode_bitmap4(argp, exid->spo_must_enforce, + ARRAY_SIZE(exid->spo_must_enforce)); + if (status) + return nfserr_bad_xdr; + status = nfsd4_decode_bitmap4(argp, exid->spo_must_allow, + ARRAY_SIZE(exid->spo_must_allow)); + if (status) + return nfserr_bad_xdr; - READ_BUF(NFS4_VERIFIER_SIZE); - COPYMEM(exid->verifier.data, NFS4_VERIFIER_SIZE); + return nfs_ok; +} - status = nfsd4_decode_opaque(argp, &exid->clname); +/* + * This implementation currently does not support SP4_SSV. 
+ * This decoder simply skips over these arguments. + */ +static noinline __be32 +nfsd4_decode_ssv_sp_parms(struct nfsd4_compoundargs *argp, + struct nfsd4_exchange_id *exid) +{ + u32 count, window, num_gss_handles; + __be32 status; + + /* ssp_ops */ + status = nfsd4_decode_state_protect_ops(argp, exid); if (status) + return status; + + /* ssp_hash_algs<> */ + if (xdr_stream_decode_u32(argp->xdr, &count) < 0) return nfserr_bad_xdr; + while (count--) { + status = nfsd4_decode_ignored_string(argp, 0); + if (status) + return status; + } - READ_BUF(4); - exid->flags = be32_to_cpup(p++); + /* ssp_encr_algs<> */ + if (xdr_stream_decode_u32(argp->xdr, &count) < 0) + return nfserr_bad_xdr; + while (count--) { + status = nfsd4_decode_ignored_string(argp, 0); + if (status) + return status; + } - /* Ignore state_protect4_a */ - READ_BUF(4); - exid->spa_how = be32_to_cpup(p++); + if (xdr_stream_decode_u32(argp->xdr, &window) < 0) + return nfserr_bad_xdr; + if (xdr_stream_decode_u32(argp->xdr, &num_gss_handles) < 0) + return nfserr_bad_xdr; + + return nfs_ok; +} + +static __be32 +nfsd4_decode_state_protect4_a(struct nfsd4_compoundargs *argp, + struct nfsd4_exchange_id *exid) +{ + __be32 status; + + if (xdr_stream_decode_u32(argp->xdr, &exid->spa_how) < 0) + return nfserr_bad_xdr; switch (exid->spa_how) { case SP4_NONE: break; case SP4_MACH_CRED: - /* spo_must_enforce */ - status = nfsd4_decode_bitmap(argp, - exid->spo_must_enforce); - if (status) - goto out; - /* spo_must_allow */ - status = nfsd4_decode_bitmap(argp, exid->spo_must_allow); + status = nfsd4_decode_state_protect_ops(argp, exid); if (status) - goto out; + return status; break; case SP4_SSV: - /* ssp_ops */ - READ_BUF(4); - dummy = be32_to_cpup(p++); - READ_BUF(dummy * 4); - p += dummy; - - READ_BUF(4); - dummy = be32_to_cpup(p++); - READ_BUF(dummy * 4); - p += dummy; - - /* ssp_hash_algs<> */ - READ_BUF(4); - tmp = be32_to_cpup(p++); - while (tmp--) { - READ_BUF(4); - dummy = be32_to_cpup(p++); - READ_BUF(dummy); - p += XDR_QUADLEN(dummy); - } + status = nfsd4_decode_ssv_sp_parms(argp, exid); + if (status) + return status; + break; + default: + return nfserr_bad_xdr; + } - /* ssp_encr_algs<> */ - READ_BUF(4); - tmp = be32_to_cpup(p++); - while (tmp--) { - READ_BUF(4); - dummy = be32_to_cpup(p++); - READ_BUF(dummy); - p += XDR_QUADLEN(dummy); - } + return nfs_ok; +} + +static __be32 +nfsd4_decode_nfs_impl_id4(struct nfsd4_compoundargs *argp, + struct nfsd4_exchange_id *exid) +{ + __be32 status; + u32 count; - /* ignore ssp_window and ssp_num_gss_handles: */ - READ_BUF(8); + if (xdr_stream_decode_u32(argp->xdr, &count) < 0) + return nfserr_bad_xdr; + switch (count) { + case 0: + break; + case 1: + /* Note that RFC 8881 places no length limit on + * nii_domain, but this implementation permits no + * more than NFS4_OPAQUE_LIMIT bytes */ + status = nfsd4_decode_opaque(argp, &exid->nii_domain); + if (status) + return status; + /* Note that RFC 8881 places no length limit on + * nii_name, but this implementation permits no + * more than NFS4_OPAQUE_LIMIT bytes */ + status = nfsd4_decode_opaque(argp, &exid->nii_name); + if (status) + return status; + status = nfsd4_decode_nfstime4(argp, &exid->nii_time); + if (status) + return status; break; default: - goto xdr_error; + return nfserr_bad_xdr; } - /* Ignore Implementation ID */ - READ_BUF(4); /* nfs_impl_id4 array length */ - dummy = be32_to_cpup(p++); - - if (dummy > 1) - goto xdr_error; - - if (dummy == 1) { - /* nii_domain */ - READ_BUF(4); - dummy = be32_to_cpup(p++); - READ_BUF(dummy); - p += 
XDR_QUADLEN(dummy); - - /* nii_name */ - READ_BUF(4); - dummy = be32_to_cpup(p++); - READ_BUF(dummy); - p += XDR_QUADLEN(dummy); - - /* nii_date */ - READ_BUF(12); - p += 3; - } - DECODE_TAIL; + return nfs_ok; } static __be32 -nfsd4_decode_create_session(struct nfsd4_compoundargs *argp, - struct nfsd4_create_session *sess) +nfsd4_decode_exchange_id(struct nfsd4_compoundargs *argp, + union nfsd4_op_u *u) { - DECODE_HEAD; - u32 dummy; + struct nfsd4_exchange_id *exid = &u->exchange_id; + __be32 status; - READ_BUF(16); - COPYMEM(&sess->clientid, 8); - sess->seqid = be32_to_cpup(p++); - sess->flags = be32_to_cpup(p++); - - /* Fore channel attrs */ - READ_BUF(28); - dummy = be32_to_cpup(p++); /* headerpadsz is always 0 */ - sess->fore_channel.maxreq_sz = be32_to_cpup(p++); - sess->fore_channel.maxresp_sz = be32_to_cpup(p++); - sess->fore_channel.maxresp_cached = be32_to_cpup(p++); - sess->fore_channel.maxops = be32_to_cpup(p++); - sess->fore_channel.maxreqs = be32_to_cpup(p++); - sess->fore_channel.nr_rdma_attrs = be32_to_cpup(p++); - if (sess->fore_channel.nr_rdma_attrs == 1) { - READ_BUF(4); - sess->fore_channel.rdma_attrs = be32_to_cpup(p++); - } else if (sess->fore_channel.nr_rdma_attrs > 1) { - dprintk("Too many fore channel attr bitmaps!\n"); - goto xdr_error; - } + memset(exid, 0, sizeof(*exid)); + status = nfsd4_decode_verifier4(argp, &exid->verifier); + if (status) + return status; + status = nfsd4_decode_opaque(argp, &exid->clname); + if (status) + return status; + if (xdr_stream_decode_u32(argp->xdr, &exid->flags) < 0) + return nfserr_bad_xdr; + status = nfsd4_decode_state_protect4_a(argp, exid); + if (status) + return status; + return nfsd4_decode_nfs_impl_id4(argp, exid); +} - /* Back channel attrs */ - READ_BUF(28); - dummy = be32_to_cpup(p++); /* headerpadsz is always 0 */ - sess->back_channel.maxreq_sz = be32_to_cpup(p++); - sess->back_channel.maxresp_sz = be32_to_cpup(p++); - sess->back_channel.maxresp_cached = be32_to_cpup(p++); - sess->back_channel.maxops = be32_to_cpup(p++); - sess->back_channel.maxreqs = be32_to_cpup(p++); - sess->back_channel.nr_rdma_attrs = be32_to_cpup(p++); - if (sess->back_channel.nr_rdma_attrs == 1) { - READ_BUF(4); - sess->back_channel.rdma_attrs = be32_to_cpup(p++); - } else if (sess->back_channel.nr_rdma_attrs > 1) { - dprintk("Too many back channel attr bitmaps!\n"); - goto xdr_error; +static __be32 +nfsd4_decode_channel_attrs4(struct nfsd4_compoundargs *argp, + struct nfsd4_channel_attrs *ca) +{ + __be32 *p; + + p = xdr_inline_decode(argp->xdr, XDR_UNIT * 7); + if (!p) + return nfserr_bad_xdr; + + /* headerpadsz is ignored */ + p++; + ca->maxreq_sz = be32_to_cpup(p++); + ca->maxresp_sz = be32_to_cpup(p++); + ca->maxresp_cached = be32_to_cpup(p++); + ca->maxops = be32_to_cpup(p++); + ca->maxreqs = be32_to_cpup(p++); + ca->nr_rdma_attrs = be32_to_cpup(p); + switch (ca->nr_rdma_attrs) { + case 0: + break; + case 1: + if (xdr_stream_decode_u32(argp->xdr, &ca->rdma_attrs) < 0) + return nfserr_bad_xdr; + break; + default: + return nfserr_bad_xdr; } - READ_BUF(4); - sess->callback_prog = be32_to_cpup(p++); - nfsd4_decode_cb_sec(argp, &sess->cb_sec); - DECODE_TAIL; + return nfs_ok; } static __be32 -nfsd4_decode_destroy_session(struct nfsd4_compoundargs *argp, - struct nfsd4_destroy_session *destroy_session) +nfsd4_decode_create_session(struct nfsd4_compoundargs *argp, + union nfsd4_op_u *u) { - DECODE_HEAD; - READ_BUF(NFS4_MAX_SESSIONID_LEN); - COPYMEM(destroy_session->sessionid.data, NFS4_MAX_SESSIONID_LEN); + struct nfsd4_create_session *sess = 
&u->create_session; + __be32 status; - DECODE_TAIL; + memset(sess, 0, sizeof(*sess)); + status = nfsd4_decode_clientid4(argp, &sess->clientid); + if (status) + return status; + if (xdr_stream_decode_u32(argp->xdr, &sess->seqid) < 0) + return nfserr_bad_xdr; + if (xdr_stream_decode_u32(argp->xdr, &sess->flags) < 0) + return nfserr_bad_xdr; + status = nfsd4_decode_channel_attrs4(argp, &sess->fore_channel); + if (status) + return status; + status = nfsd4_decode_channel_attrs4(argp, &sess->back_channel); + if (status) + return status; + if (xdr_stream_decode_u32(argp->xdr, &sess->callback_prog) < 0) + return nfserr_bad_xdr; + return nfsd4_decode_cb_sec(argp, &sess->cb_sec); +} + +static __be32 +nfsd4_decode_destroy_session(struct nfsd4_compoundargs *argp, + union nfsd4_op_u *u) +{ + struct nfsd4_destroy_session *destroy_session = &u->destroy_session; + return nfsd4_decode_sessionid4(argp, &destroy_session->sessionid); } static __be32 nfsd4_decode_free_stateid(struct nfsd4_compoundargs *argp, - struct nfsd4_free_stateid *free_stateid) + union nfsd4_op_u *u) { - DECODE_HEAD; + struct nfsd4_free_stateid *free_stateid = &u->free_stateid; + return nfsd4_decode_stateid4(argp, &free_stateid->fr_stateid); +} - READ_BUF(sizeof(stateid_t)); - free_stateid->fr_stateid.si_generation = be32_to_cpup(p++); - COPYMEM(&free_stateid->fr_stateid.si_opaque, sizeof(stateid_opaque_t)); +static __be32 +nfsd4_decode_get_dir_delegation(struct nfsd4_compoundargs *argp, + union nfsd4_op_u *u) +{ + struct nfsd4_get_dir_delegation *gdd = &u->get_dir_delegation; + __be32 status; - DECODE_TAIL; + memset(gdd, 0, sizeof(*gdd)); + + if (xdr_stream_decode_bool(argp->xdr, &gdd->gdda_signal_deleg_avail) < 0) + return nfserr_bad_xdr; + status = nfsd4_decode_bitmap4(argp, gdd->gdda_notification_types, + ARRAY_SIZE(gdd->gdda_notification_types)); + if (status) + return status; + status = nfsd4_decode_nfstime4(argp, &gdd->gdda_child_attr_delay); + if (status) + return status; + status = nfsd4_decode_nfstime4(argp, &gdd->gdda_dir_attr_delay); + if (status) + return status; + status = nfsd4_decode_bitmap4(argp, gdd->gdda_child_attributes, + ARRAY_SIZE(gdd->gdda_child_attributes)); + if (status) + return status; + return nfsd4_decode_bitmap4(argp, gdd->gdda_dir_attributes, + ARRAY_SIZE(gdd->gdda_dir_attributes)); +} + +#ifdef CONFIG_NFSD_PNFS +static __be32 +nfsd4_decode_getdeviceinfo(struct nfsd4_compoundargs *argp, + union nfsd4_op_u *u) +{ + struct nfsd4_getdeviceinfo *gdev = &u->getdeviceinfo; + __be32 status; + + memset(gdev, 0, sizeof(*gdev)); + status = nfsd4_decode_deviceid4(argp->xdr, &gdev->gd_devid); + if (status) + return status; + if (xdr_stream_decode_u32(argp->xdr, &gdev->gd_layout_type) < 0) + return nfserr_bad_xdr; + if (xdr_stream_decode_u32(argp->xdr, &gdev->gd_maxcount) < 0) + return nfserr_bad_xdr; + if (xdr_stream_decode_uint32_array(argp->xdr, + &gdev->gd_notify_types, 1) < 0) + return nfserr_bad_xdr; + + return nfs_ok; +} + +static __be32 +nfsd4_decode_layoutcommit(struct nfsd4_compoundargs *argp, + union nfsd4_op_u *u) +{ + struct nfsd4_layoutcommit *lcp = &u->layoutcommit; + __be32 *p, status; + + memset(lcp, 0, sizeof(*lcp)); + if (xdr_stream_decode_u64(argp->xdr, &lcp->lc_seg.offset) < 0) + return nfserr_bad_xdr; + if (xdr_stream_decode_u64(argp->xdr, &lcp->lc_seg.length) < 0) + return nfserr_bad_xdr; + if (xdr_stream_decode_bool(argp->xdr, &lcp->lc_reclaim) < 0) + return nfserr_bad_xdr; + status = nfsd4_decode_stateid4(argp, &lcp->lc_sid); + if (status) + return status; + if 
(xdr_stream_decode_bool(argp->xdr, &lcp->lc_newoffset) < 0) + return nfserr_bad_xdr; + if (lcp->lc_newoffset) { + if (xdr_stream_decode_u64(argp->xdr, &lcp->lc_last_wr) < 0) + return nfserr_bad_xdr; + } else + lcp->lc_last_wr = 0; + p = xdr_inline_decode(argp->xdr, XDR_UNIT); + if (!p) + return nfserr_bad_xdr; + if (xdr_item_is_present(p)) { + status = nfsd4_decode_nfstime4(argp, &lcp->lc_mtime); + if (status) + return status; + } else { + lcp->lc_mtime.tv_nsec = UTIME_NOW; + } + return nfsd4_decode_layoutupdate4(argp, lcp); +} + +static __be32 +nfsd4_decode_layoutget(struct nfsd4_compoundargs *argp, + union nfsd4_op_u *u) +{ + struct nfsd4_layoutget *lgp = &u->layoutget; + __be32 status; + + memset(lgp, 0, sizeof(*lgp)); + if (xdr_stream_decode_u32(argp->xdr, &lgp->lg_signal) < 0) + return nfserr_bad_xdr; + if (xdr_stream_decode_u32(argp->xdr, &lgp->lg_layout_type) < 0) + return nfserr_bad_xdr; + if (xdr_stream_decode_u32(argp->xdr, &lgp->lg_seg.iomode) < 0) + return nfserr_bad_xdr; + if (xdr_stream_decode_u64(argp->xdr, &lgp->lg_seg.offset) < 0) + return nfserr_bad_xdr; + if (xdr_stream_decode_u64(argp->xdr, &lgp->lg_seg.length) < 0) + return nfserr_bad_xdr; + if (xdr_stream_decode_u64(argp->xdr, &lgp->lg_minlength) < 0) + return nfserr_bad_xdr; + status = nfsd4_decode_stateid4(argp, &lgp->lg_sid); + if (status) + return status; + if (xdr_stream_decode_u32(argp->xdr, &lgp->lg_maxcount) < 0) + return nfserr_bad_xdr; + + return nfs_ok; +} + +static __be32 +nfsd4_decode_layoutreturn(struct nfsd4_compoundargs *argp, + union nfsd4_op_u *u) +{ + struct nfsd4_layoutreturn *lrp = &u->layoutreturn; + memset(lrp, 0, sizeof(*lrp)); + if (xdr_stream_decode_bool(argp->xdr, &lrp->lr_reclaim) < 0) + return nfserr_bad_xdr; + if (xdr_stream_decode_u32(argp->xdr, &lrp->lr_layout_type) < 0) + return nfserr_bad_xdr; + if (xdr_stream_decode_u32(argp->xdr, &lrp->lr_seg.iomode) < 0) + return nfserr_bad_xdr; + return nfsd4_decode_layoutreturn4(argp, lrp); +} +#endif /* CONFIG_NFSD_PNFS */ + +static __be32 nfsd4_decode_secinfo_no_name(struct nfsd4_compoundargs *argp, + union nfsd4_op_u *u) +{ + struct nfsd4_secinfo_no_name *sin = &u->secinfo_no_name; + if (xdr_stream_decode_u32(argp->xdr, &sin->sin_style) < 0) + return nfserr_bad_xdr; + + sin->sin_exp = NULL; + return nfs_ok; } static __be32 nfsd4_decode_sequence(struct nfsd4_compoundargs *argp, - struct nfsd4_sequence *seq) + union nfsd4_op_u *u) { - DECODE_HEAD; + struct nfsd4_sequence *seq = &u->sequence; + __be32 *p, status; - READ_BUF(NFS4_MAX_SESSIONID_LEN + 16); - COPYMEM(seq->sessionid.data, NFS4_MAX_SESSIONID_LEN); + status = nfsd4_decode_sessionid4(argp, &seq->sessionid); + if (status) + return status; + p = xdr_inline_decode(argp->xdr, XDR_UNIT * 4); + if (!p) + return nfserr_bad_xdr; seq->seqid = be32_to_cpup(p++); seq->slotid = be32_to_cpup(p++); - seq->maxslots = be32_to_cpup(p++); - seq->cachethis = be32_to_cpup(p++); + /* sa_highest_slotid counts from 0 but maxslots counts from 1 ... 
*/ + seq->maxslots = be32_to_cpup(p++) + 1; + seq->cachethis = be32_to_cpup(p); - DECODE_TAIL; + seq->status_flags = 0; + return nfs_ok; } static __be32 -nfsd4_decode_test_stateid(struct nfsd4_compoundargs *argp, struct nfsd4_test_stateid *test_stateid) +nfsd4_decode_test_stateid(struct nfsd4_compoundargs *argp, + union nfsd4_op_u *u) { - int i; - __be32 *p, status; + struct nfsd4_test_stateid *test_stateid = &u->test_stateid; struct nfsd4_test_stateid_id *stateid; + __be32 status; + u32 i; - READ_BUF(4); - test_stateid->ts_num_ids = ntohl(*p++); + memset(test_stateid, 0, sizeof(*test_stateid)); + if (xdr_stream_decode_u32(argp->xdr, &test_stateid->ts_num_ids) < 0) + return nfserr_bad_xdr; INIT_LIST_HEAD(&test_stateid->ts_stateid_list); - for (i = 0; i < test_stateid->ts_num_ids; i++) { stateid = svcxdr_tmpalloc(argp, sizeof(*stateid)); - if (!stateid) { - status = nfserrno(-ENOMEM); - goto out; - } - + if (!stateid) + return nfserr_jukebox; INIT_LIST_HEAD(&stateid->ts_id_list); list_add_tail(&stateid->ts_id_list, &test_stateid->ts_stateid_list); - - status = nfsd4_decode_stateid(argp, &stateid->ts_id_stateid); + status = nfsd4_decode_stateid4(argp, &stateid->ts_id_stateid); if (status) - goto out; + return status; } - status = 0; -out: - return status; -xdr_error: - dprintk("NFSD: xdr error (%s:%d)\n", __FILE__, __LINE__); - status = nfserr_bad_xdr; - goto out; + return nfs_ok; +} + +static __be32 nfsd4_decode_destroy_clientid(struct nfsd4_compoundargs *argp, + union nfsd4_op_u *u) +{ + struct nfsd4_destroy_clientid *dc = &u->destroy_clientid; + return nfsd4_decode_clientid4(argp, &dc->clientid); } -static __be32 nfsd4_decode_destroy_clientid(struct nfsd4_compoundargs *argp, struct nfsd4_destroy_clientid *dc) +static __be32 nfsd4_decode_reclaim_complete(struct nfsd4_compoundargs *argp, + union nfsd4_op_u *u) +{ + struct nfsd4_reclaim_complete *rc = &u->reclaim_complete; + if (xdr_stream_decode_bool(argp->xdr, &rc->rca_one_fs) < 0) + return nfserr_bad_xdr; + return nfs_ok; +} + +static __be32 +nfsd4_decode_fallocate(struct nfsd4_compoundargs *argp, + union nfsd4_op_u *u) { - DECODE_HEAD; + struct nfsd4_fallocate *fallocate = &u->allocate; + __be32 status; - READ_BUF(8); - COPYMEM(&dc->clientid, 8); + status = nfsd4_decode_stateid4(argp, &fallocate->falloc_stateid); + if (status) + return status; + if (xdr_stream_decode_u64(argp->xdr, &fallocate->falloc_offset) < 0) + return nfserr_bad_xdr; + if (xdr_stream_decode_u64(argp->xdr, &fallocate->falloc_length) < 0) + return nfserr_bad_xdr; - DECODE_TAIL; + return nfs_ok; } -static __be32 nfsd4_decode_reclaim_complete(struct nfsd4_compoundargs *argp, struct nfsd4_reclaim_complete *rc) +static __be32 nfsd4_decode_nl4_server(struct nfsd4_compoundargs *argp, + struct nl4_server *ns) { - DECODE_HEAD; + struct nfs42_netaddr *naddr; + __be32 *p; + + if (xdr_stream_decode_u32(argp->xdr, &ns->nl4_type) < 0) + return nfserr_bad_xdr; - READ_BUF(4); - rc->rca_one_fs = be32_to_cpup(p++); + /* currently support for 1 inter-server source server */ + switch (ns->nl4_type) { + case NL4_NETADDR: + naddr = &ns->u.nl4_addr; - DECODE_TAIL; + if (xdr_stream_decode_u32(argp->xdr, &naddr->netid_len) < 0) + return nfserr_bad_xdr; + if (naddr->netid_len > RPCBIND_MAXNETIDLEN) + return nfserr_bad_xdr; + + p = xdr_inline_decode(argp->xdr, naddr->netid_len); + if (!p) + return nfserr_bad_xdr; + memcpy(naddr->netid, p, naddr->netid_len); + + if (xdr_stream_decode_u32(argp->xdr, &naddr->addr_len) < 0) + return nfserr_bad_xdr; + if (naddr->addr_len > RPCBIND_MAXUADDRLEN) + 
return nfserr_bad_xdr; + + p = xdr_inline_decode(argp->xdr, naddr->addr_len); + if (!p) + return nfserr_bad_xdr; + memcpy(naddr->addr, p, naddr->addr_len); + break; + default: + return nfserr_bad_xdr; + } + + return nfs_ok; } -#ifdef CONFIG_NFSD_PNFS static __be32 -nfsd4_decode_getdeviceinfo(struct nfsd4_compoundargs *argp, - struct nfsd4_getdeviceinfo *gdev) -{ - DECODE_HEAD; - u32 num, i; - - READ_BUF(sizeof(struct nfsd4_deviceid) + 3 * 4); - COPYMEM(&gdev->gd_devid, sizeof(struct nfsd4_deviceid)); - gdev->gd_layout_type = be32_to_cpup(p++); - gdev->gd_maxcount = be32_to_cpup(p++); - num = be32_to_cpup(p++); - if (num) { - if (num > 1000) - goto xdr_error; - READ_BUF(4 * num); - gdev->gd_notify_types = be32_to_cpup(p++); - for (i = 1; i < num; i++) { - if (be32_to_cpup(p++)) { - status = nfserr_inval; - goto out; - } +nfsd4_decode_copy(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u) +{ + struct nfsd4_copy *copy = &u->copy; + u32 consecutive, i, count, sync; + struct nl4_server *ns_dummy; + __be32 status; + + memset(copy, 0, sizeof(*copy)); + status = nfsd4_decode_stateid4(argp, &copy->cp_src_stateid); + if (status) + return status; + status = nfsd4_decode_stateid4(argp, &copy->cp_dst_stateid); + if (status) + return status; + if (xdr_stream_decode_u64(argp->xdr, &copy->cp_src_pos) < 0) + return nfserr_bad_xdr; + if (xdr_stream_decode_u64(argp->xdr, &copy->cp_dst_pos) < 0) + return nfserr_bad_xdr; + if (xdr_stream_decode_u64(argp->xdr, &copy->cp_count) < 0) + return nfserr_bad_xdr; + /* ca_consecutive: we always do consecutive copies */ + if (xdr_stream_decode_u32(argp->xdr, &consecutive) < 0) + return nfserr_bad_xdr; + if (xdr_stream_decode_bool(argp->xdr, &sync) < 0) + return nfserr_bad_xdr; + nfsd4_copy_set_sync(copy, sync); + + if (xdr_stream_decode_u32(argp->xdr, &count) < 0) + return nfserr_bad_xdr; + copy->cp_src = svcxdr_tmpalloc(argp, sizeof(*copy->cp_src)); + if (copy->cp_src == NULL) + return nfserr_jukebox; + if (count == 0) { /* intra-server copy */ + __set_bit(NFSD4_COPY_F_INTRA, &copy->cp_flags); + return nfs_ok; + } + + /* decode all the supplied server addresses but use only the first */ + status = nfsd4_decode_nl4_server(argp, copy->cp_src); + if (status) + return status; + + ns_dummy = kmalloc(sizeof(struct nl4_server), GFP_KERNEL); + if (ns_dummy == NULL) + return nfserr_jukebox; + for (i = 0; i < count - 1; i++) { + status = nfsd4_decode_nl4_server(argp, ns_dummy); + if (status) { + kfree(ns_dummy); + return status; } } - DECODE_TAIL; + kfree(ns_dummy); + + return nfs_ok; } static __be32 -nfsd4_decode_layoutget(struct nfsd4_compoundargs *argp, - struct nfsd4_layoutget *lgp) +nfsd4_decode_copy_notify(struct nfsd4_compoundargs *argp, + union nfsd4_op_u *u) { - DECODE_HEAD; + struct nfsd4_copy_notify *cn = &u->copy_notify; + __be32 status; - READ_BUF(36); - lgp->lg_signal = be32_to_cpup(p++); - lgp->lg_layout_type = be32_to_cpup(p++); - lgp->lg_seg.iomode = be32_to_cpup(p++); - p = xdr_decode_hyper(p, &lgp->lg_seg.offset); - p = xdr_decode_hyper(p, &lgp->lg_seg.length); - p = xdr_decode_hyper(p, &lgp->lg_minlength); + memset(cn, 0, sizeof(*cn)); + cn->cpn_src = svcxdr_tmpalloc(argp, sizeof(*cn->cpn_src)); + if (cn->cpn_src == NULL) + return nfserr_jukebox; + cn->cpn_dst = svcxdr_tmpalloc(argp, sizeof(*cn->cpn_dst)); + if (cn->cpn_dst == NULL) + return nfserr_jukebox; - status = nfsd4_decode_stateid(argp, &lgp->lg_sid); + status = nfsd4_decode_stateid4(argp, &cn->cpn_src_stateid); if (status) return status; - - READ_BUF(4); - lgp->lg_maxcount = be32_to_cpup(p++); - - DECODE_TAIL; + return 
nfsd4_decode_nl4_server(argp, cn->cpn_dst); } static __be32 -nfsd4_decode_layoutcommit(struct nfsd4_compoundargs *argp, - struct nfsd4_layoutcommit *lcp) +nfsd4_decode_offload_status(struct nfsd4_compoundargs *argp, + union nfsd4_op_u *u) { - DECODE_HEAD; - u32 timechange; + struct nfsd4_offload_status *os = &u->offload_status; + os->count = 0; + os->status = 0; + return nfsd4_decode_stateid4(argp, &os->stateid); +} - READ_BUF(20); - p = xdr_decode_hyper(p, &lcp->lc_seg.offset); - p = xdr_decode_hyper(p, &lcp->lc_seg.length); - lcp->lc_reclaim = be32_to_cpup(p++); +static __be32 +nfsd4_decode_seek(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u) +{ + struct nfsd4_seek *seek = &u->seek; + __be32 status; - status = nfsd4_decode_stateid(argp, &lcp->lc_sid); + status = nfsd4_decode_stateid4(argp, &seek->seek_stateid); if (status) return status; + if (xdr_stream_decode_u64(argp->xdr, &seek->seek_offset) < 0) + return nfserr_bad_xdr; + if (xdr_stream_decode_u32(argp->xdr, &seek->seek_whence) < 0) + return nfserr_bad_xdr; - READ_BUF(4); - lcp->lc_newoffset = be32_to_cpup(p++); - if (lcp->lc_newoffset) { - READ_BUF(8); - p = xdr_decode_hyper(p, &lcp->lc_last_wr); - } else - lcp->lc_last_wr = 0; - READ_BUF(4); - timechange = be32_to_cpup(p++); - if (timechange) { - status = nfsd4_decode_time(argp, &lcp->lc_mtime); - if (status) - return status; - } else { - lcp->lc_mtime.tv_nsec = UTIME_NOW; - } - READ_BUF(8); - lcp->lc_layout_type = be32_to_cpup(p++); + seek->seek_eof = 0; + seek->seek_pos = 0; + return nfs_ok; +} - /* - * Save the layout update in XDR format and let the layout driver deal - * with it later. - */ - lcp->lc_up_len = be32_to_cpup(p++); - if (lcp->lc_up_len > 0) { - READ_BUF(lcp->lc_up_len); - READMEM(lcp->lc_up_layout, lcp->lc_up_len); - } +static __be32 +nfsd4_decode_clone(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u) +{ + struct nfsd4_clone *clone = &u->clone; + __be32 status; - DECODE_TAIL; + status = nfsd4_decode_stateid4(argp, &clone->cl_src_stateid); + if (status) + return status; + status = nfsd4_decode_stateid4(argp, &clone->cl_dst_stateid); + if (status) + return status; + if (xdr_stream_decode_u64(argp->xdr, &clone->cl_src_pos) < 0) + return nfserr_bad_xdr; + if (xdr_stream_decode_u64(argp->xdr, &clone->cl_dst_pos) < 0) + return nfserr_bad_xdr; + if (xdr_stream_decode_u64(argp->xdr, &clone->cl_count) < 0) + return nfserr_bad_xdr; + + return nfs_ok; } +/* + * XDR data that is more than PAGE_SIZE in size is normally part of a + * read or write. However, the size of extended attributes is limited + * by the maximum request size, and then further limited by the underlying + * filesystem limits. This can exceed PAGE_SIZE (currently, XATTR_SIZE_MAX + * is 64k). Since there is no kvec- or page-based interface to xattrs, + * and we're not dealing with contiguous pages, we need to do some copying. + */ + +/* + * Decode data into buffer. 
+ */ static __be32 -nfsd4_decode_layoutreturn(struct nfsd4_compoundargs *argp, - struct nfsd4_layoutreturn *lrp) +nfsd4_vbuf_from_vector(struct nfsd4_compoundargs *argp, struct xdr_buf *xdr, + char **bufp, size_t buflen) { - DECODE_HEAD; + struct page **pages = xdr->pages; + struct kvec *head = xdr->head; + char *tmp, *dp; + u32 len; - READ_BUF(16); - lrp->lr_reclaim = be32_to_cpup(p++); - lrp->lr_layout_type = be32_to_cpup(p++); - lrp->lr_seg.iomode = be32_to_cpup(p++); - lrp->lr_return_type = be32_to_cpup(p++); - if (lrp->lr_return_type == RETURN_FILE) { - READ_BUF(16); - p = xdr_decode_hyper(p, &lrp->lr_seg.offset); - p = xdr_decode_hyper(p, &lrp->lr_seg.length); + if (buflen <= head->iov_len) { + /* + * We're in luck, the head has enough space. Just return + * the head, no need for copying. + */ + *bufp = head->iov_base; + return 0; + } - status = nfsd4_decode_stateid(argp, &lrp->lr_sid); - if (status) - return status; + tmp = svcxdr_tmpalloc(argp, buflen); + if (tmp == NULL) + return nfserr_jukebox; - READ_BUF(4); - lrp->lrf_body_len = be32_to_cpup(p++); - if (lrp->lrf_body_len > 0) { - READ_BUF(lrp->lrf_body_len); - READMEM(lrp->lrf_body, lrp->lrf_body_len); - } - } else { - lrp->lr_seg.offset = 0; - lrp->lr_seg.length = NFS4_MAX_UINT64; + dp = tmp; + memcpy(dp, head->iov_base, head->iov_len); + buflen -= head->iov_len; + dp += head->iov_len; + + while (buflen > 0) { + len = min_t(u32, buflen, PAGE_SIZE); + memcpy(dp, page_address(*pages), len); + + buflen -= len; + dp += len; + pages++; } - DECODE_TAIL; + *bufp = tmp; + return 0; } -#endif /* CONFIG_NFSD_PNFS */ +/* + * Get a user extended attribute name from the XDR buffer. + * It will not have the "user." prefix, so prepend it. + * Lastly, check for nul characters in the name. + */ static __be32 -nfsd4_decode_fallocate(struct nfsd4_compoundargs *argp, - struct nfsd4_fallocate *fallocate) +nfsd4_decode_xattr_name(struct nfsd4_compoundargs *argp, char **namep) { - DECODE_HEAD; + char *name, *sp, *dp; + u32 namelen, cnt; + __be32 *p; - status = nfsd4_decode_stateid(argp, &fallocate->falloc_stateid); - if (status) - return status; + if (xdr_stream_decode_u32(argp->xdr, &namelen) < 0) + return nfserr_bad_xdr; + if (namelen > (XATTR_NAME_MAX - XATTR_USER_PREFIX_LEN)) + return nfserr_nametoolong; + if (namelen == 0) + return nfserr_bad_xdr; + p = xdr_inline_decode(argp->xdr, namelen); + if (!p) + return nfserr_bad_xdr; + name = svcxdr_tmpalloc(argp, namelen + XATTR_USER_PREFIX_LEN + 1); + if (!name) + return nfserr_jukebox; + memcpy(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN); - READ_BUF(16); - p = xdr_decode_hyper(p, &fallocate->falloc_offset); - xdr_decode_hyper(p, &fallocate->falloc_length); + /* + * Copy the extended attribute name over while checking for 0 + * characters. + */ + sp = (char *)p; + dp = name + XATTR_USER_PREFIX_LEN; + cnt = namelen; + + while (cnt-- > 0) { + if (*sp == '\0') + return nfserr_bad_xdr; + *dp++ = *sp++; + } + *dp = '\0'; - DECODE_TAIL; + *namep = name; + + return nfs_ok; } +/* + * A GETXATTR op request comes without a length specifier. We just set the + * maximum length for the reply based on XATTR_SIZE_MAX and the maximum + * channel reply size. nfsd_getxattr will probe the length of the xattr, + * check it against getxa_len, and allocate + return the value. 
+ */ static __be32 -nfsd4_decode_clone(struct nfsd4_compoundargs *argp, struct nfsd4_clone *clone) +nfsd4_decode_getxattr(struct nfsd4_compoundargs *argp, + union nfsd4_op_u *u) { - DECODE_HEAD; + struct nfsd4_getxattr *getxattr = &u->getxattr; + __be32 status; + u32 maxcount; - status = nfsd4_decode_stateid(argp, &clone->cl_src_stateid); - if (status) - return status; - status = nfsd4_decode_stateid(argp, &clone->cl_dst_stateid); + memset(getxattr, 0, sizeof(*getxattr)); + status = nfsd4_decode_xattr_name(argp, &getxattr->getxa_name); if (status) return status; - READ_BUF(8 + 8 + 8); - p = xdr_decode_hyper(p, &clone->cl_src_pos); - p = xdr_decode_hyper(p, &clone->cl_dst_pos); - p = xdr_decode_hyper(p, &clone->cl_count); - DECODE_TAIL; + maxcount = svc_max_payload(argp->rqstp); + maxcount = min_t(u32, XATTR_SIZE_MAX, maxcount); + + getxattr->getxa_len = maxcount; + return nfs_ok; } static __be32 -nfsd4_decode_copy(struct nfsd4_compoundargs *argp, struct nfsd4_copy *copy) +nfsd4_decode_setxattr(struct nfsd4_compoundargs *argp, + union nfsd4_op_u *u) { - DECODE_HEAD; - unsigned int tmp; + struct nfsd4_setxattr *setxattr = &u->setxattr; + u32 flags, maxcount, size; + __be32 status; - status = nfsd4_decode_stateid(argp, &copy->cp_src_stateid); - if (status) - return status; - status = nfsd4_decode_stateid(argp, &copy->cp_dst_stateid); + memset(setxattr, 0, sizeof(*setxattr)); + + if (xdr_stream_decode_u32(argp->xdr, &flags) < 0) + return nfserr_bad_xdr; + + if (flags > SETXATTR4_REPLACE) + return nfserr_inval; + setxattr->setxa_flags = flags; + + status = nfsd4_decode_xattr_name(argp, &setxattr->setxa_name); if (status) return status; - READ_BUF(8 + 8 + 8 + 4 + 4 + 4); - p = xdr_decode_hyper(p, &copy->cp_src_pos); - p = xdr_decode_hyper(p, &copy->cp_dst_pos); - p = xdr_decode_hyper(p, &copy->cp_count); - p++; /* ca_consecutive: we always do consecutive copies */ - copy->cp_synchronous = be32_to_cpup(p++); - tmp = be32_to_cpup(p); /* Source server list not supported */ + maxcount = svc_max_payload(argp->rqstp); + maxcount = min_t(u32, XATTR_SIZE_MAX, maxcount); - DECODE_TAIL; -} + if (xdr_stream_decode_u32(argp->xdr, &size) < 0) + return nfserr_bad_xdr; + if (size > maxcount) + return nfserr_xattr2big; -static __be32 -nfsd4_decode_offload_status(struct nfsd4_compoundargs *argp, - struct nfsd4_offload_status *os) -{ - return nfsd4_decode_stateid(argp, &os->stateid); + setxattr->setxa_len = size; + if (size > 0) { + struct xdr_buf payload; + + if (!xdr_stream_subsegment(argp->xdr, &payload, size)) + return nfserr_bad_xdr; + status = nfsd4_vbuf_from_vector(argp, &payload, + &setxattr->setxa_buf, size); + } + + return nfs_ok; } static __be32 -nfsd4_decode_seek(struct nfsd4_compoundargs *argp, struct nfsd4_seek *seek) +nfsd4_decode_listxattrs(struct nfsd4_compoundargs *argp, + union nfsd4_op_u *u) { - DECODE_HEAD; + struct nfsd4_listxattrs *listxattrs = &u->listxattrs; + u32 maxcount; - status = nfsd4_decode_stateid(argp, &seek->seek_stateid); - if (status) - return status; + memset(listxattrs, 0, sizeof(*listxattrs)); + + if (xdr_stream_decode_u64(argp->xdr, &listxattrs->lsxa_cookie) < 0) + return nfserr_bad_xdr; + + /* + * If the cookie is too large to have even one user.x attribute + * plus trailing '\0' left in a maximum size buffer, it's invalid. 
+ */ + if (listxattrs->lsxa_cookie >= + (XATTR_LIST_MAX / (XATTR_USER_PREFIX_LEN + 2))) + return nfserr_badcookie; + + if (xdr_stream_decode_u32(argp->xdr, &maxcount) < 0) + return nfserr_bad_xdr; + if (maxcount < 8) + /* Always need at least 2 words (length and one character) */ + return nfserr_inval; - READ_BUF(8 + 4); - p = xdr_decode_hyper(p, &seek->seek_offset); - seek->seek_whence = be32_to_cpup(p); + maxcount = min(maxcount, svc_max_payload(argp->rqstp)); + listxattrs->lsxa_maxcount = maxcount; - DECODE_TAIL; + return nfs_ok; +} + +static __be32 +nfsd4_decode_removexattr(struct nfsd4_compoundargs *argp, + union nfsd4_op_u *u) +{ + struct nfsd4_removexattr *removexattr = &u->removexattr; + memset(removexattr, 0, sizeof(*removexattr)); + return nfsd4_decode_xattr_name(argp, &removexattr->rmxa_name); } static __be32 -nfsd4_decode_noop(struct nfsd4_compoundargs *argp, void *p) +nfsd4_decode_noop(struct nfsd4_compoundargs *argp, union nfsd4_op_u *p) { return nfs_ok; } static __be32 -nfsd4_decode_notsupp(struct nfsd4_compoundargs *argp, void *p) +nfsd4_decode_notsupp(struct nfsd4_compoundargs *argp, union nfsd4_op_u *p) { return nfserr_notsupp; } -typedef __be32(*nfsd4_dec)(struct nfsd4_compoundargs *argp, void *); +typedef __be32(*nfsd4_dec)(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u); static const nfsd4_dec nfsd4_dec_ops[] = { - [OP_ACCESS] = (nfsd4_dec)nfsd4_decode_access, - [OP_CLOSE] = (nfsd4_dec)nfsd4_decode_close, - [OP_COMMIT] = (nfsd4_dec)nfsd4_decode_commit, - [OP_CREATE] = (nfsd4_dec)nfsd4_decode_create, - [OP_DELEGPURGE] = (nfsd4_dec)nfsd4_decode_notsupp, - [OP_DELEGRETURN] = (nfsd4_dec)nfsd4_decode_delegreturn, - [OP_GETATTR] = (nfsd4_dec)nfsd4_decode_getattr, - [OP_GETFH] = (nfsd4_dec)nfsd4_decode_noop, - [OP_LINK] = (nfsd4_dec)nfsd4_decode_link, - [OP_LOCK] = (nfsd4_dec)nfsd4_decode_lock, - [OP_LOCKT] = (nfsd4_dec)nfsd4_decode_lockt, - [OP_LOCKU] = (nfsd4_dec)nfsd4_decode_locku, - [OP_LOOKUP] = (nfsd4_dec)nfsd4_decode_lookup, - [OP_LOOKUPP] = (nfsd4_dec)nfsd4_decode_noop, - [OP_NVERIFY] = (nfsd4_dec)nfsd4_decode_verify, - [OP_OPEN] = (nfsd4_dec)nfsd4_decode_open, - [OP_OPENATTR] = (nfsd4_dec)nfsd4_decode_notsupp, - [OP_OPEN_CONFIRM] = (nfsd4_dec)nfsd4_decode_open_confirm, - [OP_OPEN_DOWNGRADE] = (nfsd4_dec)nfsd4_decode_open_downgrade, - [OP_PUTFH] = (nfsd4_dec)nfsd4_decode_putfh, - [OP_PUTPUBFH] = (nfsd4_dec)nfsd4_decode_putpubfh, - [OP_PUTROOTFH] = (nfsd4_dec)nfsd4_decode_noop, - [OP_READ] = (nfsd4_dec)nfsd4_decode_read, - [OP_READDIR] = (nfsd4_dec)nfsd4_decode_readdir, - [OP_READLINK] = (nfsd4_dec)nfsd4_decode_noop, - [OP_REMOVE] = (nfsd4_dec)nfsd4_decode_remove, - [OP_RENAME] = (nfsd4_dec)nfsd4_decode_rename, - [OP_RENEW] = (nfsd4_dec)nfsd4_decode_renew, - [OP_RESTOREFH] = (nfsd4_dec)nfsd4_decode_noop, - [OP_SAVEFH] = (nfsd4_dec)nfsd4_decode_noop, - [OP_SECINFO] = (nfsd4_dec)nfsd4_decode_secinfo, - [OP_SETATTR] = (nfsd4_dec)nfsd4_decode_setattr, - [OP_SETCLIENTID] = (nfsd4_dec)nfsd4_decode_setclientid, - [OP_SETCLIENTID_CONFIRM] = (nfsd4_dec)nfsd4_decode_setclientid_confirm, - [OP_VERIFY] = (nfsd4_dec)nfsd4_decode_verify, - [OP_WRITE] = (nfsd4_dec)nfsd4_decode_write, - [OP_RELEASE_LOCKOWNER] = (nfsd4_dec)nfsd4_decode_release_lockowner, + [OP_ACCESS] = nfsd4_decode_access, + [OP_CLOSE] = nfsd4_decode_close, + [OP_COMMIT] = nfsd4_decode_commit, + [OP_CREATE] = nfsd4_decode_create, + [OP_DELEGPURGE] = nfsd4_decode_notsupp, + [OP_DELEGRETURN] = nfsd4_decode_delegreturn, + [OP_GETATTR] = nfsd4_decode_getattr, + [OP_GETFH] = nfsd4_decode_noop, + [OP_LINK] = 
nfsd4_decode_link, + [OP_LOCK] = nfsd4_decode_lock, + [OP_LOCKT] = nfsd4_decode_lockt, + [OP_LOCKU] = nfsd4_decode_locku, + [OP_LOOKUP] = nfsd4_decode_lookup, + [OP_LOOKUPP] = nfsd4_decode_noop, + [OP_NVERIFY] = nfsd4_decode_verify, + [OP_OPEN] = nfsd4_decode_open, + [OP_OPENATTR] = nfsd4_decode_notsupp, + [OP_OPEN_CONFIRM] = nfsd4_decode_open_confirm, + [OP_OPEN_DOWNGRADE] = nfsd4_decode_open_downgrade, + [OP_PUTFH] = nfsd4_decode_putfh, + [OP_PUTPUBFH] = nfsd4_decode_noop, + [OP_PUTROOTFH] = nfsd4_decode_noop, + [OP_READ] = nfsd4_decode_read, + [OP_READDIR] = nfsd4_decode_readdir, + [OP_READLINK] = nfsd4_decode_noop, + [OP_REMOVE] = nfsd4_decode_remove, + [OP_RENAME] = nfsd4_decode_rename, + [OP_RENEW] = nfsd4_decode_renew, + [OP_RESTOREFH] = nfsd4_decode_noop, + [OP_SAVEFH] = nfsd4_decode_noop, + [OP_SECINFO] = nfsd4_decode_secinfo, + [OP_SETATTR] = nfsd4_decode_setattr, + [OP_SETCLIENTID] = nfsd4_decode_setclientid, + [OP_SETCLIENTID_CONFIRM] = nfsd4_decode_setclientid_confirm, + [OP_VERIFY] = nfsd4_decode_verify, + [OP_WRITE] = nfsd4_decode_write, + [OP_RELEASE_LOCKOWNER] = nfsd4_decode_release_lockowner, /* new operations for NFSv4.1 */ - [OP_BACKCHANNEL_CTL] = (nfsd4_dec)nfsd4_decode_backchannel_ctl, - [OP_BIND_CONN_TO_SESSION]= (nfsd4_dec)nfsd4_decode_bind_conn_to_session, - [OP_EXCHANGE_ID] = (nfsd4_dec)nfsd4_decode_exchange_id, - [OP_CREATE_SESSION] = (nfsd4_dec)nfsd4_decode_create_session, - [OP_DESTROY_SESSION] = (nfsd4_dec)nfsd4_decode_destroy_session, - [OP_FREE_STATEID] = (nfsd4_dec)nfsd4_decode_free_stateid, - [OP_GET_DIR_DELEGATION] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_BACKCHANNEL_CTL] = nfsd4_decode_backchannel_ctl, + [OP_BIND_CONN_TO_SESSION] = nfsd4_decode_bind_conn_to_session, + [OP_EXCHANGE_ID] = nfsd4_decode_exchange_id, + [OP_CREATE_SESSION] = nfsd4_decode_create_session, + [OP_DESTROY_SESSION] = nfsd4_decode_destroy_session, + [OP_FREE_STATEID] = nfsd4_decode_free_stateid, + [OP_GET_DIR_DELEGATION] = nfsd4_decode_get_dir_delegation, #ifdef CONFIG_NFSD_PNFS - [OP_GETDEVICEINFO] = (nfsd4_dec)nfsd4_decode_getdeviceinfo, - [OP_GETDEVICELIST] = (nfsd4_dec)nfsd4_decode_notsupp, - [OP_LAYOUTCOMMIT] = (nfsd4_dec)nfsd4_decode_layoutcommit, - [OP_LAYOUTGET] = (nfsd4_dec)nfsd4_decode_layoutget, - [OP_LAYOUTRETURN] = (nfsd4_dec)nfsd4_decode_layoutreturn, + [OP_GETDEVICEINFO] = nfsd4_decode_getdeviceinfo, + [OP_GETDEVICELIST] = nfsd4_decode_notsupp, + [OP_LAYOUTCOMMIT] = nfsd4_decode_layoutcommit, + [OP_LAYOUTGET] = nfsd4_decode_layoutget, + [OP_LAYOUTRETURN] = nfsd4_decode_layoutreturn, #else - [OP_GETDEVICEINFO] = (nfsd4_dec)nfsd4_decode_notsupp, - [OP_GETDEVICELIST] = (nfsd4_dec)nfsd4_decode_notsupp, - [OP_LAYOUTCOMMIT] = (nfsd4_dec)nfsd4_decode_notsupp, - [OP_LAYOUTGET] = (nfsd4_dec)nfsd4_decode_notsupp, - [OP_LAYOUTRETURN] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_GETDEVICEINFO] = nfsd4_decode_notsupp, + [OP_GETDEVICELIST] = nfsd4_decode_notsupp, + [OP_LAYOUTCOMMIT] = nfsd4_decode_notsupp, + [OP_LAYOUTGET] = nfsd4_decode_notsupp, + [OP_LAYOUTRETURN] = nfsd4_decode_notsupp, #endif - [OP_SECINFO_NO_NAME] = (nfsd4_dec)nfsd4_decode_secinfo_no_name, - [OP_SEQUENCE] = (nfsd4_dec)nfsd4_decode_sequence, - [OP_SET_SSV] = (nfsd4_dec)nfsd4_decode_notsupp, - [OP_TEST_STATEID] = (nfsd4_dec)nfsd4_decode_test_stateid, - [OP_WANT_DELEGATION] = (nfsd4_dec)nfsd4_decode_notsupp, - [OP_DESTROY_CLIENTID] = (nfsd4_dec)nfsd4_decode_destroy_clientid, - [OP_RECLAIM_COMPLETE] = (nfsd4_dec)nfsd4_decode_reclaim_complete, + [OP_SECINFO_NO_NAME] = nfsd4_decode_secinfo_no_name, + [OP_SEQUENCE] = 
nfsd4_decode_sequence, + [OP_SET_SSV] = nfsd4_decode_notsupp, + [OP_TEST_STATEID] = nfsd4_decode_test_stateid, + [OP_WANT_DELEGATION] = nfsd4_decode_notsupp, + [OP_DESTROY_CLIENTID] = nfsd4_decode_destroy_clientid, + [OP_RECLAIM_COMPLETE] = nfsd4_decode_reclaim_complete, /* new operations for NFSv4.2 */ - [OP_ALLOCATE] = (nfsd4_dec)nfsd4_decode_fallocate, - [OP_COPY] = (nfsd4_dec)nfsd4_decode_copy, - [OP_COPY_NOTIFY] = (nfsd4_dec)nfsd4_decode_notsupp, - [OP_DEALLOCATE] = (nfsd4_dec)nfsd4_decode_fallocate, - [OP_IO_ADVISE] = (nfsd4_dec)nfsd4_decode_notsupp, - [OP_LAYOUTERROR] = (nfsd4_dec)nfsd4_decode_notsupp, - [OP_LAYOUTSTATS] = (nfsd4_dec)nfsd4_decode_notsupp, - [OP_OFFLOAD_CANCEL] = (nfsd4_dec)nfsd4_decode_offload_status, - [OP_OFFLOAD_STATUS] = (nfsd4_dec)nfsd4_decode_offload_status, - [OP_READ_PLUS] = (nfsd4_dec)nfsd4_decode_notsupp, - [OP_SEEK] = (nfsd4_dec)nfsd4_decode_seek, - [OP_WRITE_SAME] = (nfsd4_dec)nfsd4_decode_notsupp, - [OP_CLONE] = (nfsd4_dec)nfsd4_decode_clone, + [OP_ALLOCATE] = nfsd4_decode_fallocate, + [OP_COPY] = nfsd4_decode_copy, + [OP_COPY_NOTIFY] = nfsd4_decode_copy_notify, + [OP_DEALLOCATE] = nfsd4_decode_fallocate, + [OP_IO_ADVISE] = nfsd4_decode_notsupp, + [OP_LAYOUTERROR] = nfsd4_decode_notsupp, + [OP_LAYOUTSTATS] = nfsd4_decode_notsupp, + [OP_OFFLOAD_CANCEL] = nfsd4_decode_offload_status, + [OP_OFFLOAD_STATUS] = nfsd4_decode_offload_status, + [OP_READ_PLUS] = nfsd4_decode_read, + [OP_SEEK] = nfsd4_decode_seek, + [OP_WRITE_SAME] = nfsd4_decode_notsupp, + [OP_CLONE] = nfsd4_decode_clone, + /* RFC 8276 extended atributes operations */ + [OP_GETXATTR] = nfsd4_decode_getxattr, + [OP_SETXATTR] = nfsd4_decode_setxattr, + [OP_LISTXATTRS] = nfsd4_decode_listxattrs, + [OP_REMOVEXATTR] = nfsd4_decode_removexattr, }; static inline bool @@ -1902,43 +2458,46 @@ nfsd4_opnum_in_range(struct nfsd4_compoundargs *argp, struct nfsd4_op *op) return true; } -static __be32 +static bool nfsd4_decode_compound(struct nfsd4_compoundargs *argp) { - DECODE_HEAD; struct nfsd4_op *op; bool cachethis = false; int auth_slack= argp->rqstp->rq_auth_slack; int max_reply = auth_slack + 8; /* opcnt, status */ int readcount = 0; int readbytes = 0; + __be32 *p; int i; - READ_BUF(4); - argp->taglen = be32_to_cpup(p++); - READ_BUF(argp->taglen); - SAVEMEM(argp->tag, argp->taglen); - READ_BUF(8); - argp->minorversion = be32_to_cpup(p++); - argp->opcnt = be32_to_cpup(p++); - max_reply += 4 + (XDR_QUADLEN(argp->taglen) << 2); - - if (argp->taglen > NFSD4_MAX_TAGLEN) - goto xdr_error; - /* - * NFS4ERR_RESOURCE is a more helpful error than GARBAGE_ARGS - * here, so we return success at the xdr level so that - * nfsd4_proc can handle this is an NFS-level error. 
- */ - if (argp->opcnt > NFSD_MAX_OPS_PER_COMPOUND) - return 0; + if (xdr_stream_decode_u32(argp->xdr, &argp->taglen) < 0) + return false; + max_reply += XDR_UNIT; + argp->tag = NULL; + if (unlikely(argp->taglen)) { + if (argp->taglen > NFSD4_MAX_TAGLEN) + return false; + p = xdr_inline_decode(argp->xdr, argp->taglen); + if (!p) + return false; + argp->tag = svcxdr_savemem(argp, p, argp->taglen); + if (!argp->tag) + return false; + max_reply += xdr_align_size(argp->taglen); + } + + if (xdr_stream_decode_u32(argp->xdr, &argp->minorversion) < 0) + return false; + if (xdr_stream_decode_u32(argp->xdr, &argp->client_opcnt) < 0) + return false; + argp->opcnt = min_t(u32, argp->client_opcnt, + NFSD_MAX_OPS_PER_COMPOUND); if (argp->opcnt > ARRAY_SIZE(argp->iops)) { - argp->ops = kzalloc(argp->opcnt * sizeof(*argp->ops), GFP_KERNEL); + argp->ops = vcalloc(argp->opcnt, sizeof(*argp->ops)); if (!argp->ops) { argp->ops = argp->iops; - dprintk("nfsd: couldn't allocate room for COMPOUND\n"); - goto xdr_error; + return false; } } @@ -1948,24 +2507,30 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp) for (i = 0; i < argp->opcnt; i++) { op = &argp->ops[i]; op->replay = NULL; + op->opdesc = NULL; - READ_BUF(4); - op->opnum = be32_to_cpup(p++); - - if (nfsd4_opnum_in_range(argp, op)) + if (xdr_stream_decode_u32(argp->xdr, &op->opnum) < 0) + return false; + if (nfsd4_opnum_in_range(argp, op)) { + op->opdesc = OPDESC(op); op->status = nfsd4_dec_ops[op->opnum](argp, &op->u); - else { + if (op->status != nfs_ok) + trace_nfsd_compound_decode_err(argp->rqstp, + argp->opcnt, i, + op->opnum, + op->status); + } else { op->opnum = OP_ILLEGAL; op->status = nfserr_op_illegal; } - op->opdesc = OPDESC(op); + /* * We'll try to cache the result in the DRC if any one * op in the compound wants to be cached: */ cachethis |= nfsd4_cache_this_op(op); - if (op->opnum == OP_READ) { + if (op->opnum == OP_READ || op->opnum == OP_READ_PLUS) { readcount++; readbytes += nfsd4_max_reply(argp->rqstp, op); } else @@ -1987,68 +2552,72 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp) /* Sessions make the DRC unnecessary: */ if (argp->minorversion) cachethis = false; - svc_reserve(argp->rqstp, max_reply + readbytes); + svc_reserve_auth(argp->rqstp, max_reply + readbytes); argp->rqstp->rq_cachetype = cachethis ? RC_REPLBUFF : RC_NOCACHE; + argp->splice_ok = nfsd_read_splice_ok(argp->rqstp); if (readcount > 1 || max_reply > PAGE_SIZE - auth_slack) - clear_bit(RQ_SPLICE_OK, &argp->rqstp->rq_flags); + argp->splice_ok = false; - DECODE_TAIL; + return true; } -static __be32 *encode_change(__be32 *p, struct kstat *stat, struct inode *inode, - struct svc_export *exp) +static __be32 nfsd4_encode_nfs_fh4(struct xdr_stream *xdr, + struct knfsd_fh *fh_handle) { - if (exp->ex_flags & NFSEXP_V4ROOT) { - *p++ = cpu_to_be32(convert_to_wallclock(exp->cd->flush_time)); - *p++ = 0; - } else if (IS_I_VERSION(inode)) { - p = xdr_encode_hyper(p, nfsd4_change_attribute(stat, inode)); - } else { - *p++ = cpu_to_be32(stat->ctime.tv_sec); - *p++ = cpu_to_be32(stat->ctime.tv_nsec); - } - return p; + return nfsd4_encode_opaque(xdr, fh_handle->fh_raw, fh_handle->fh_size); } -/* - * ctime (in NFSv4, time_metadata) is not writeable, and the client - * doesn't really care what resolution could theoretically be stored by - * the filesystem. - * - * The client cares how close together changes can be while still - * guaranteeing ctime changes. 
For most filesystems (which have - * timestamps with nanosecond fields) that is limited by the resolution - * of the time returned from current_time() (which I'm assuming to be - * 1/HZ). - */ -static __be32 *encode_time_delta(__be32 *p, struct inode *inode) +/* This is a frequently-encoded type; open-coded for speed */ +static __be32 nfsd4_encode_nfstime4(struct xdr_stream *xdr, + const struct timespec64 *tv) { - struct timespec ts; - u32 ns; + __be32 *p; - ns = max_t(u32, NSEC_PER_SEC/HZ, inode->i_sb->s_time_gran); - ts = ns_to_timespec(ns); + p = xdr_reserve_space(xdr, XDR_UNIT * 3); + if (!p) + return nfserr_resource; + p = xdr_encode_hyper(p, tv->tv_sec); + *p = cpu_to_be32(tv->tv_nsec); + return nfs_ok; +} - p = xdr_encode_hyper(p, ts.tv_sec); - *p++ = cpu_to_be32(ts.tv_nsec); +static __be32 nfsd4_encode_specdata4(struct xdr_stream *xdr, + unsigned int major, unsigned int minor) +{ + __be32 status; - return p; + status = nfsd4_encode_uint32_t(xdr, major); + if (status != nfs_ok) + return status; + return nfsd4_encode_uint32_t(xdr, minor); } -static __be32 *encode_cinfo(__be32 *p, struct nfsd4_change_info *c) +static __be32 +nfsd4_encode_change_info4(struct xdr_stream *xdr, const struct nfsd4_change_info *c) { - *p++ = cpu_to_be32(c->atomic); - if (c->change_supported) { - p = xdr_encode_hyper(p, c->before_change); - p = xdr_encode_hyper(p, c->after_change); - } else { - *p++ = cpu_to_be32(c->before_ctime_sec); - *p++ = cpu_to_be32(c->before_ctime_nsec); - *p++ = cpu_to_be32(c->after_ctime_sec); - *p++ = cpu_to_be32(c->after_ctime_nsec); - } - return p; + __be32 status; + + status = nfsd4_encode_bool(xdr, c->atomic); + if (status != nfs_ok) + return status; + status = nfsd4_encode_changeid4(xdr, c->before_change); + if (status != nfs_ok) + return status; + return nfsd4_encode_changeid4(xdr, c->after_change); +} + +static __be32 nfsd4_encode_netaddr4(struct xdr_stream *xdr, + const struct nfs42_netaddr *addr) +{ + __be32 status; + + /* na_r_netid */ + status = nfsd4_encode_opaque(xdr, addr->netid, addr->netid_len); + if (status != nfs_ok) + return status; + /* na_r_addr */ + return nfsd4_encode_opaque(xdr, addr->addr, addr->addr_len); } /* Encode as an array of strings the string given with components @@ -2061,10 +2630,8 @@ static __be32 nfsd4_encode_components_esc(struct xdr_stream *xdr, char sep, __be32 *p; __be32 pathlen; int pathlen_offset; - int strlen, count=0; char *str, *end, *next; - - dprintk("nfsd4_encode_components(%s)\n", components); + int count = 0; pathlen_offset = xdr->buf->len; p = xdr_reserve_space(xdr, 4); @@ -2091,15 +2658,11 @@ static __be32 nfsd4_encode_components_esc(struct xdr_stream *xdr, char sep, for (; *end && (*end != sep); end++) /* find sep or end of string */; - strlen = end - str; - if (strlen) { - p = xdr_reserve_space(xdr, strlen + 4); - if (!p) + if (end > str) { + if (xdr_stream_encode_opaque(xdr, str, end - str) < 0) return nfserr_resource; - p = xdr_encode_opaque(p, str, strlen); count++; - } - else + } else end++; if (found_esc) end = next; @@ -2120,9 +2683,6 @@ static __be32 nfsd4_encode_components(struct xdr_stream *xdr, char sep, return nfsd4_encode_components_esc(xdr, sep, components, 0, 0); } -/* - * encode a location element of a fs_locations structure - */ static __be32 nfsd4_encode_fs_location4(struct xdr_stream *xdr, struct nfsd4_fs_location *location) { @@ -2135,18 +2695,14 @@ static __be32 nfsd4_encode_fs_location4(struct xdr_stream *xdr, status = nfsd4_encode_components(xdr, '/', location->path); if (status) return status; - return 
0; + return nfs_ok; } -/* - * Encode a path in RFC3530 'pathname4' format - */ -static __be32 nfsd4_encode_path(struct xdr_stream *xdr, - const struct path *root, - const struct path *path) +static __be32 nfsd4_encode_pathname4(struct xdr_stream *xdr, + const struct path *root, + const struct path *path) { struct path cur = *path; - __be32 *p; struct dentry **components = NULL; unsigned int ncomponents = 0; __be32 err = nfserr_jukebox; @@ -2177,24 +2733,19 @@ static __be32 nfsd4_encode_path(struct xdr_stream *xdr, components[ncomponents++] = cur.dentry; cur.dentry = dget_parent(cur.dentry); } + err = nfserr_resource; - p = xdr_reserve_space(xdr, 4); - if (!p) + if (xdr_stream_encode_u32(xdr, ncomponents) != XDR_UNIT) goto out_free; - *p++ = cpu_to_be32(ncomponents); - while (ncomponents) { struct dentry *dentry = components[ncomponents - 1]; - unsigned int len; spin_lock(&dentry->d_lock); - len = dentry->d_name.len; - p = xdr_reserve_space(xdr, len + 4); - if (!p) { + if (xdr_stream_encode_opaque(xdr, dentry->d_name.name, + dentry->d_name.len) < 0) { spin_unlock(&dentry->d_lock); goto out_free; } - p = xdr_encode_opaque(p, dentry->d_name.name, len); dprintk("/%pd", dentry); spin_unlock(&dentry->d_lock); dput(dentry); @@ -2211,89 +2762,59 @@ out_free: return err; } -static __be32 nfsd4_encode_fsloc_fsroot(struct xdr_stream *xdr, - struct svc_rqst *rqstp, const struct path *path) +static __be32 nfsd4_encode_fs_locations4(struct xdr_stream *xdr, + struct svc_rqst *rqstp, + struct svc_export *exp) { + struct nfsd4_fs_locations *fslocs = &exp->ex_fslocs; struct svc_export *exp_ps; - __be32 res; + unsigned int i; + __be32 status; + /* fs_root */ exp_ps = rqst_find_fsidzero_export(rqstp); if (IS_ERR(exp_ps)) return nfserrno(PTR_ERR(exp_ps)); - res = nfsd4_encode_path(xdr, &exp_ps->ex_path, path); + status = nfsd4_encode_pathname4(xdr, &exp_ps->ex_path, &exp->ex_path); exp_put(exp_ps); - return res; -} - -/* - * encode a fs_locations structure - */ -static __be32 nfsd4_encode_fs_locations(struct xdr_stream *xdr, - struct svc_rqst *rqstp, struct svc_export *exp) -{ - __be32 status; - int i; - __be32 *p; - struct nfsd4_fs_locations *fslocs = &exp->ex_fslocs; - - status = nfsd4_encode_fsloc_fsroot(xdr, rqstp, &exp->ex_path); - if (status) + if (status != nfs_ok) return status; - p = xdr_reserve_space(xdr, 4); - if (!p) + + /* locations<> */ + if (xdr_stream_encode_u32(xdr, fslocs->locations_count) != XDR_UNIT) return nfserr_resource; - *p++ = cpu_to_be32(fslocs->locations_count); - for (i=0; i<fslocs->locations_count; i++) { + for (i = 0; i < fslocs->locations_count; i++) { status = nfsd4_encode_fs_location4(xdr, &fslocs->locations[i]); - if (status) + if (status != nfs_ok) return status; } - return 0; -} -static u32 nfs4_file_type(umode_t mode) -{ - switch (mode & S_IFMT) { - case S_IFIFO: return NF4FIFO; - case S_IFCHR: return NF4CHR; - case S_IFDIR: return NF4DIR; - case S_IFBLK: return NF4BLK; - case S_IFLNK: return NF4LNK; - case S_IFREG: return NF4REG; - case S_IFSOCK: return NF4SOCK; - default: return NF4BAD; - }; + return nfs_ok; } -static inline __be32 -nfsd4_encode_aclname(struct xdr_stream *xdr, struct svc_rqst *rqstp, - struct nfs4_ace *ace) +static __be32 nfsd4_encode_nfsace4(struct xdr_stream *xdr, struct svc_rqst *rqstp, + struct nfs4_ace *ace) { + __be32 status; + + /* type */ + status = nfsd4_encode_acetype4(xdr, ace->type); + if (status != nfs_ok) + return nfserr_resource; + /* flag */ + status = nfsd4_encode_aceflag4(xdr, ace->flag); + if (status != nfs_ok) + return 
nfserr_resource; + /* access mask */ + status = nfsd4_encode_acemask4(xdr, ace->access_mask & NFS4_ACE_MASK_ALL); + if (status != nfs_ok) + return nfserr_resource; + /* who */ if (ace->whotype != NFS4_ACL_WHO_NAMED) return nfs4_acl_write_who(xdr, ace->whotype); - else if (ace->flag & NFS4_ACE_IDENTIFIER_GROUP) + if (ace->flag & NFS4_ACE_IDENTIFIER_GROUP) return nfsd4_encode_group(xdr, rqstp, ace->who_gid); - else - return nfsd4_encode_user(xdr, rqstp, ace->who_uid); -} - -static inline __be32 -nfsd4_encode_layout_types(struct xdr_stream *xdr, u32 layout_types) -{ - __be32 *p; - unsigned long i = hweight_long(layout_types); - - p = xdr_reserve_space(xdr, 4 + 4 * i); - if (!p) - return nfserr_resource; - - *p++ = cpu_to_be32(i); - - for (i = LAYOUT_NFSV4_1_FILES; i < LAYOUT_TYPE_MAX; ++i) - if (layout_types & (1 << i)) - *p++ = cpu_to_be32(i); - - return 0; + return nfsd4_encode_user(xdr, rqstp, ace->who_uid); } #define WORD0_ABSENT_FS_ATTRS (FATTR4_WORD0_FS_LOCATIONS | FATTR4_WORD0_FSID | \ @@ -2304,11 +2825,11 @@ nfsd4_encode_layout_types(struct xdr_stream *xdr, u32 layout_types) #ifdef CONFIG_NFSD_V4_SECURITY_LABEL static inline __be32 nfsd4_encode_security_label(struct xdr_stream *xdr, struct svc_rqst *rqstp, - void *context, int len) + const struct lsm_context *context) { __be32 *p; - p = xdr_reserve_space(xdr, len + 4 + 4 + 4); + p = xdr_reserve_space(xdr, context->len + 4 + 4 + 4); if (!p) return nfserr_resource; @@ -2318,13 +2839,13 @@ nfsd4_encode_security_label(struct xdr_stream *xdr, struct svc_rqst *rqstp, */ *p++ = cpu_to_be32(0); /* lfs */ *p++ = cpu_to_be32(0); /* pi */ - p = xdr_encode_opaque(p, context, len); + p = xdr_encode_opaque(p, context->context, context->len); return 0; } #else static inline __be32 nfsd4_encode_security_label(struct xdr_stream *xdr, struct svc_rqst *rqstp, - void *context, int len) + struct lsm_context *context) { return 0; } #endif @@ -2346,9 +2867,10 @@ static __be32 fattr_handle_absent_fs(u32 *bmval0, u32 *bmval1, u32 *bmval2, u32 } -static int get_parent_attributes(struct svc_export *exp, struct kstat *stat) +static int nfsd4_get_mounted_on_ino(struct svc_export *exp, u64 *pino) { struct path path = exp->ex_path; + struct kstat stat; int err; path_get(&path); @@ -2356,18 +2878,20 @@ static int get_parent_attributes(struct svc_export *exp, struct kstat *stat) if (path.dentry != path.mnt->mnt_root) break; } - err = vfs_getattr(&path, stat, STATX_BASIC_STATS, AT_STATX_SYNC_AS_STAT); + err = vfs_getattr(&path, &stat, STATX_INO, AT_STATX_SYNC_AS_STAT); path_put(&path); + if (!err) + *pino = stat.ino; return err; } static __be32 -nfsd4_encode_bitmap(struct xdr_stream *xdr, u32 bmval0, u32 bmval1, u32 bmval2) +nfsd4_encode_bitmap4(struct xdr_stream *xdr, u32 bmval0, u32 bmval1, u32 bmval2) { __be32 *p; if (bmval2) { - p = xdr_reserve_space(xdr, 16); + p = xdr_reserve_space(xdr, XDR_UNIT * 4); if (!p) goto out_resource; *p++ = cpu_to_be32(3); @@ -2375,83 +2899,778 @@ nfsd4_encode_bitmap(struct xdr_stream *xdr, u32 bmval0, u32 bmval1, u32 bmval2) *p++ = cpu_to_be32(bmval1); *p++ = cpu_to_be32(bmval2); } else if (bmval1) { - p = xdr_reserve_space(xdr, 12); + p = xdr_reserve_space(xdr, XDR_UNIT * 3); if (!p) goto out_resource; *p++ = cpu_to_be32(2); *p++ = cpu_to_be32(bmval0); *p++ = cpu_to_be32(bmval1); } else { - p = xdr_reserve_space(xdr, 8); + p = xdr_reserve_space(xdr, XDR_UNIT * 2); if (!p) goto out_resource; *p++ = cpu_to_be32(1); *p++ = cpu_to_be32(bmval0); } - return 0; + return nfs_ok; out_resource: return nfserr_resource; } +struct 
nfsd4_fattr_args { + struct svc_rqst *rqstp; + struct svc_fh *fhp; + struct svc_export *exp; + struct dentry *dentry; + struct kstat stat; + struct kstatfs statfs; + struct nfs4_acl *acl; + u64 change_attr; +#ifdef CONFIG_NFSD_V4_SECURITY_LABEL + struct lsm_context context; +#endif + u32 rdattr_err; + bool contextsupport; + bool ignore_crossmnt; +}; + +typedef __be32(*nfsd4_enc_attr)(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args); + +static __be32 nfsd4_encode_fattr4__inval(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args) +{ + return nfserr_inval; +} + +static __be32 nfsd4_encode_fattr4__noop(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args) +{ + return nfs_ok; +} + +static __be32 nfsd4_encode_fattr4__true(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args) +{ + return nfsd4_encode_bool(xdr, true); +} + +static __be32 nfsd4_encode_fattr4__false(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args) +{ + return nfsd4_encode_bool(xdr, false); +} + +static __be32 nfsd4_encode_fattr4_supported_attrs(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args) +{ + struct nfsd4_compoundres *resp = args->rqstp->rq_resp; + u32 minorversion = resp->cstate.minorversion; + u32 supp[3]; + + memcpy(supp, nfsd_suppattrs[minorversion], sizeof(supp)); + if (!IS_POSIXACL(d_inode(args->dentry))) + supp[0] &= ~FATTR4_WORD0_ACL; + if (!args->contextsupport) + supp[2] &= ~FATTR4_WORD2_SECURITY_LABEL; + + return nfsd4_encode_bitmap4(xdr, supp[0], supp[1], supp[2]); +} + +static __be32 nfsd4_encode_fattr4_type(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, XDR_UNIT); + if (!p) + return nfserr_resource; + + switch (args->stat.mode & S_IFMT) { + case S_IFIFO: + *p = cpu_to_be32(NF4FIFO); + break; + case S_IFCHR: + *p = cpu_to_be32(NF4CHR); + break; + case S_IFDIR: + *p = cpu_to_be32(NF4DIR); + break; + case S_IFBLK: + *p = cpu_to_be32(NF4BLK); + break; + case S_IFLNK: + *p = cpu_to_be32(NF4LNK); + break; + case S_IFREG: + *p = cpu_to_be32(NF4REG); + break; + case S_IFSOCK: + *p = cpu_to_be32(NF4SOCK); + break; + default: + return nfserr_serverfault; + } + + return nfs_ok; +} + +static __be32 nfsd4_encode_fattr4_fh_expire_type(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args) +{ + u32 mask; + + mask = NFS4_FH_PERSISTENT; + if (!(args->exp->ex_flags & NFSEXP_NOSUBTREECHECK)) + mask |= NFS4_FH_VOL_RENAME; + return nfsd4_encode_uint32_t(xdr, mask); +} + +static __be32 nfsd4_encode_fattr4_change(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args) +{ + const struct svc_export *exp = args->exp; + + if (unlikely(exp->ex_flags & NFSEXP_V4ROOT)) { + u32 flush_time = convert_to_wallclock(exp->cd->flush_time); + + if (xdr_stream_encode_u32(xdr, flush_time) != XDR_UNIT) + return nfserr_resource; + if (xdr_stream_encode_u32(xdr, 0) != XDR_UNIT) + return nfserr_resource; + return nfs_ok; + } + return nfsd4_encode_changeid4(xdr, args->change_attr); +} + +static __be32 nfsd4_encode_fattr4_size(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args) +{ + return nfsd4_encode_uint64_t(xdr, args->stat.size); +} + +static __be32 nfsd4_encode_fattr4_fsid(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, XDR_UNIT * 2 + XDR_UNIT * 2); + if (!p) + return nfserr_resource; + + if (unlikely(args->exp->ex_fslocs.migrated)) { + p = xdr_encode_hyper(p, NFS4_REFERRAL_FSID_MAJOR); + xdr_encode_hyper(p, NFS4_REFERRAL_FSID_MINOR); + 
return nfs_ok; + } + switch (fsid_source(args->fhp)) { + case FSIDSOURCE_FSID: + p = xdr_encode_hyper(p, (u64)args->exp->ex_fsid); + xdr_encode_hyper(p, (u64)0); + break; + case FSIDSOURCE_DEV: + *p++ = xdr_zero; + *p++ = cpu_to_be32(MAJOR(args->stat.dev)); + *p++ = xdr_zero; + *p = cpu_to_be32(MINOR(args->stat.dev)); + break; + case FSIDSOURCE_UUID: + xdr_encode_opaque_fixed(p, args->exp->ex_uuid, EX_UUID_LEN); + break; + } + + return nfs_ok; +} + +static __be32 nfsd4_encode_fattr4_lease_time(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args) +{ + struct nfsd_net *nn = net_generic(SVC_NET(args->rqstp), nfsd_net_id); + + return nfsd4_encode_nfs_lease4(xdr, nn->nfsd4_lease); +} + +static __be32 nfsd4_encode_fattr4_rdattr_error(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args) +{ + return nfsd4_encode_uint32_t(xdr, args->rdattr_err); +} + +static __be32 nfsd4_encode_fattr4_aclsupport(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args) +{ + u32 mask; + + mask = 0; + if (IS_POSIXACL(d_inode(args->dentry))) + mask = ACL4_SUPPORT_ALLOW_ACL | ACL4_SUPPORT_DENY_ACL; + return nfsd4_encode_uint32_t(xdr, mask); +} + +static __be32 nfsd4_encode_fattr4_acl(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args) +{ + struct nfs4_acl *acl = args->acl; + struct nfs4_ace *ace; + __be32 status; + + /* nfsace4<> */ + if (!acl) { + if (xdr_stream_encode_u32(xdr, 0) != XDR_UNIT) + return nfserr_resource; + } else { + if (xdr_stream_encode_u32(xdr, acl->naces) != XDR_UNIT) + return nfserr_resource; + for (ace = acl->aces; ace < acl->aces + acl->naces; ace++) { + status = nfsd4_encode_nfsace4(xdr, args->rqstp, ace); + if (status != nfs_ok) + return status; + } + } + return nfs_ok; +} + +static __be32 nfsd4_encode_fattr4_filehandle(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args) +{ + return nfsd4_encode_nfs_fh4(xdr, &args->fhp->fh_handle); +} + +static __be32 nfsd4_encode_fattr4_fileid(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args) +{ + return nfsd4_encode_uint64_t(xdr, args->stat.ino); +} + +static __be32 nfsd4_encode_fattr4_files_avail(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args) +{ + return nfsd4_encode_uint64_t(xdr, args->statfs.f_ffree); +} + +static __be32 nfsd4_encode_fattr4_files_free(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args) +{ + return nfsd4_encode_uint64_t(xdr, args->statfs.f_ffree); +} + +static __be32 nfsd4_encode_fattr4_files_total(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args) +{ + return nfsd4_encode_uint64_t(xdr, args->statfs.f_files); +} + +static __be32 nfsd4_encode_fattr4_fs_locations(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args) +{ + return nfsd4_encode_fs_locations4(xdr, args->rqstp, args->exp); +} + +static __be32 nfsd4_encode_fattr4_maxfilesize(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args) +{ + struct super_block *sb = args->exp->ex_path.mnt->mnt_sb; + + return nfsd4_encode_uint64_t(xdr, sb->s_maxbytes); +} + +static __be32 nfsd4_encode_fattr4_maxlink(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args) +{ + return nfsd4_encode_uint32_t(xdr, 255); +} + +static __be32 nfsd4_encode_fattr4_maxname(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args) +{ + return nfsd4_encode_uint32_t(xdr, args->statfs.f_namelen); +} + +static __be32 nfsd4_encode_fattr4_maxread(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args) +{ + return nfsd4_encode_uint64_t(xdr, svc_max_payload(args->rqstp)); +} + +static __be32 
nfsd4_encode_fattr4_maxwrite(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args) +{ + return nfsd4_encode_uint64_t(xdr, svc_max_payload(args->rqstp)); +} + +static __be32 nfsd4_encode_fattr4_mode(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args) +{ + return nfsd4_encode_mode4(xdr, args->stat.mode & S_IALLUGO); +} + +static __be32 nfsd4_encode_fattr4_numlinks(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args) +{ + return nfsd4_encode_uint32_t(xdr, args->stat.nlink); +} + +static __be32 nfsd4_encode_fattr4_owner(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args) +{ + return nfsd4_encode_user(xdr, args->rqstp, args->stat.uid); +} + +static __be32 nfsd4_encode_fattr4_owner_group(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args) +{ + return nfsd4_encode_group(xdr, args->rqstp, args->stat.gid); +} + +static __be32 nfsd4_encode_fattr4_rawdev(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args) +{ + return nfsd4_encode_specdata4(xdr, MAJOR(args->stat.rdev), + MINOR(args->stat.rdev)); +} + +static __be32 nfsd4_encode_fattr4_space_avail(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args) +{ + u64 avail = (u64)args->statfs.f_bavail * (u64)args->statfs.f_bsize; + + return nfsd4_encode_uint64_t(xdr, avail); +} + +static __be32 nfsd4_encode_fattr4_space_free(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args) +{ + u64 free = (u64)args->statfs.f_bfree * (u64)args->statfs.f_bsize; + + return nfsd4_encode_uint64_t(xdr, free); +} + +static __be32 nfsd4_encode_fattr4_space_total(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args) +{ + u64 total = (u64)args->statfs.f_blocks * (u64)args->statfs.f_bsize; + + return nfsd4_encode_uint64_t(xdr, total); +} + +static __be32 nfsd4_encode_fattr4_space_used(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args) +{ + return nfsd4_encode_uint64_t(xdr, (u64)args->stat.blocks << 9); +} + +static __be32 nfsd4_encode_fattr4_time_access(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args) +{ + return nfsd4_encode_nfstime4(xdr, &args->stat.atime); +} + +static __be32 nfsd4_encode_fattr4_time_create(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args) +{ + return nfsd4_encode_nfstime4(xdr, &args->stat.btime); +} + +/* + * ctime (in NFSv4, time_metadata) is not writeable, and the client + * doesn't really care what resolution could theoretically be stored by + * the filesystem. + * + * The client cares how close together changes can be while still + * guaranteeing ctime changes. For most filesystems (which have + * timestamps with nanosecond fields) that is limited by the resolution + * of the time returned from current_time() (which I'm assuming to be + * 1/HZ). 
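The space_avail/space_free/space_total helpers above scale statfs block counts into bytes, while space_used shifts stat.blocks left by 9 because that field is counted in 512-byte sectors. A userspace sketch of the same arithmetic, using statvfs(3) and stat(2) as stand-ins for the kernel's vfs_statfs()/vfs_getattr() data (note that statvfs reports f_frsize where kstatfs has f_bsize):

    #include <stdint.h>
    #include <stdio.h>
    #include <sys/stat.h>
    #include <sys/statvfs.h>

    int main(int argc, char **argv)
    {
            const char *path = argc > 1 ? argv[1] : ".";
            struct statvfs sv;
            struct stat st;

            if (statvfs(path, &sv) || stat(path, &st))
                    return 1;

            /* space_avail/space_free/space_total: block counts scaled to bytes */
            uint64_t avail = (uint64_t)sv.f_bavail * sv.f_frsize;
            uint64_t free_ = (uint64_t)sv.f_bfree  * sv.f_frsize;
            uint64_t total = (uint64_t)sv.f_blocks * sv.f_frsize;

            /* space_used: st_blocks counts 512-byte units, hence the << 9 */
            uint64_t used = (uint64_t)st.st_blocks << 9;

            printf("avail=%llu free=%llu total=%llu used=%llu\n",
                   (unsigned long long)avail, (unsigned long long)free_,
                   (unsigned long long)total, (unsigned long long)used);
            return 0;
    }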
+ */ +static __be32 nfsd4_encode_fattr4_time_delta(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args) +{ + const struct inode *inode = d_inode(args->dentry); + u32 ns = max_t(u32, NSEC_PER_SEC/HZ, inode->i_sb->s_time_gran); + struct timespec64 ts = ns_to_timespec64(ns); + + return nfsd4_encode_nfstime4(xdr, &ts); +} + +static __be32 nfsd4_encode_fattr4_time_metadata(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args) +{ + return nfsd4_encode_nfstime4(xdr, &args->stat.ctime); +} + +static __be32 nfsd4_encode_fattr4_time_modify(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args) +{ + return nfsd4_encode_nfstime4(xdr, &args->stat.mtime); +} + +static __be32 nfsd4_encode_fattr4_mounted_on_fileid(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args) +{ + u64 ino; + int err; + + if (!args->ignore_crossmnt && + args->dentry == args->exp->ex_path.mnt->mnt_root) { + err = nfsd4_get_mounted_on_ino(args->exp, &ino); + if (err) + return nfserrno(err); + } else + ino = args->stat.ino; + + return nfsd4_encode_uint64_t(xdr, ino); +} + +#ifdef CONFIG_NFSD_PNFS + +static __be32 nfsd4_encode_fattr4_fs_layout_types(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args) +{ + unsigned long mask = args->exp->ex_layout_types; + int i; + + /* Hamming weight of @mask is the number of layout types to return */ + if (xdr_stream_encode_u32(xdr, hweight_long(mask)) != XDR_UNIT) + return nfserr_resource; + for (i = LAYOUT_NFSV4_1_FILES; i < LAYOUT_TYPE_MAX; ++i) + if (mask & BIT(i)) { + /* layouttype4 */ + if (xdr_stream_encode_u32(xdr, i) != XDR_UNIT) + return nfserr_resource; + } + return nfs_ok; +} + +static __be32 nfsd4_encode_fattr4_layout_types(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args) +{ + unsigned long mask = args->exp->ex_layout_types; + int i; + + /* Hamming weight of @mask is the number of layout types to return */ + if (xdr_stream_encode_u32(xdr, hweight_long(mask)) != XDR_UNIT) + return nfserr_resource; + for (i = LAYOUT_NFSV4_1_FILES; i < LAYOUT_TYPE_MAX; ++i) + if (mask & BIT(i)) { + /* layouttype4 */ + if (xdr_stream_encode_u32(xdr, i) != XDR_UNIT) + return nfserr_resource; + } + return nfs_ok; +} + +static __be32 nfsd4_encode_fattr4_layout_blksize(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args) +{ + return nfsd4_encode_uint32_t(xdr, args->stat.blksize); +} + +#endif + +static __be32 nfsd4_encode_fattr4_suppattr_exclcreat(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args) +{ + struct nfsd4_compoundres *resp = args->rqstp->rq_resp; + u32 supp[3]; + + memcpy(supp, nfsd_suppattrs[resp->cstate.minorversion], sizeof(supp)); + supp[0] &= NFSD_SUPPATTR_EXCLCREAT_WORD0; + supp[1] &= NFSD_SUPPATTR_EXCLCREAT_WORD1; + supp[2] &= NFSD_SUPPATTR_EXCLCREAT_WORD2; + + return nfsd4_encode_bitmap4(xdr, supp[0], supp[1], supp[2]); +} + +/* + * Copied from generic_remap_checks/generic_remap_file_range_prep. + * + * These generic functions use the file system's s_blocksize, but + * individual file systems aren't required to use + * generic_remap_file_range_prep. Until there is a mechanism for + * determining a particular file system's (or file's) clone block + * size, this is the best NFSD can do. 
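The two layout-type encoders above emit an XDR array whose element count is the Hamming weight of the export's layout-type mask, followed by one word per set bit. A standalone sketch of that count-then-emit pattern; the enum values are invented and __builtin_popcountl() stands in for the kernel's hweight_long():

    #include <stdio.h>

    enum { LAYOUT_FILES = 1, LAYOUT_BLOCK = 3, LAYOUT_FLEX = 4, LAYOUT_MAX = 8 };

    static void emit_layout_types(unsigned long mask)
    {
            /* Array length first: the number of set bits in the mask */
            printf("count=%d\n", __builtin_popcountl(mask));

            /* Then one entry per layout type that is actually enabled */
            for (int i = LAYOUT_FILES; i < LAYOUT_MAX; i++)
                    if (mask & (1ul << i))
                            printf("layouttype=%d\n", i);
    }

    int main(void)
    {
            emit_layout_types((1ul << LAYOUT_BLOCK) | (1ul << LAYOUT_FLEX));
            return 0;
    }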
+ */ +static __be32 nfsd4_encode_fattr4_clone_blksize(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args) +{ + struct inode *inode = d_inode(args->dentry); + + return nfsd4_encode_uint32_t(xdr, inode->i_sb->s_blocksize); +} + +#ifdef CONFIG_NFSD_V4_SECURITY_LABEL +static __be32 nfsd4_encode_fattr4_sec_label(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args) +{ + return nfsd4_encode_security_label(xdr, args->rqstp, &args->context); +} +#endif + +static __be32 nfsd4_encode_fattr4_xattr_support(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args) +{ + int err = xattr_supports_user_prefix(d_inode(args->dentry)); + + return nfsd4_encode_bool(xdr, err == 0); +} + +#define NFSD_OA_SHARE_ACCESS (BIT(OPEN_ARGS_SHARE_ACCESS_READ) | \ + BIT(OPEN_ARGS_SHARE_ACCESS_WRITE) | \ + BIT(OPEN_ARGS_SHARE_ACCESS_BOTH)) + +#define NFSD_OA_SHARE_DENY (BIT(OPEN_ARGS_SHARE_DENY_NONE) | \ + BIT(OPEN_ARGS_SHARE_DENY_READ) | \ + BIT(OPEN_ARGS_SHARE_DENY_WRITE) | \ + BIT(OPEN_ARGS_SHARE_DENY_BOTH)) + +#define NFSD_OA_SHARE_ACCESS_WANT (BIT(OPEN_ARGS_SHARE_ACCESS_WANT_ANY_DELEG) | \ + BIT(OPEN_ARGS_SHARE_ACCESS_WANT_NO_DELEG) | \ + BIT(OPEN_ARGS_SHARE_ACCESS_WANT_CANCEL) | \ + BIT(OPEN_ARGS_SHARE_ACCESS_WANT_DELEG_TIMESTAMPS) | \ + BIT(OPEN_ARGS_SHARE_ACCESS_WANT_OPEN_XOR_DELEGATION)) + +#define NFSD_OA_OPEN_CLAIM (BIT(OPEN_ARGS_OPEN_CLAIM_NULL) | \ + BIT(OPEN_ARGS_OPEN_CLAIM_PREVIOUS) | \ + BIT(OPEN_ARGS_OPEN_CLAIM_DELEGATE_CUR) | \ + BIT(OPEN_ARGS_OPEN_CLAIM_DELEGATE_PREV)| \ + BIT(OPEN_ARGS_OPEN_CLAIM_FH) | \ + BIT(OPEN_ARGS_OPEN_CLAIM_DELEG_CUR_FH) | \ + BIT(OPEN_ARGS_OPEN_CLAIM_DELEG_PREV_FH)) + +#define NFSD_OA_CREATE_MODE (BIT(OPEN_ARGS_CREATEMODE_UNCHECKED4) | \ + BIT(OPEN_ARGS_CREATE_MODE_GUARDED) | \ + BIT(OPEN_ARGS_CREATEMODE_EXCLUSIVE4) | \ + BIT(OPEN_ARGS_CREATE_MODE_EXCLUSIVE4_1)) + +static uint32_t oa_share_access = NFSD_OA_SHARE_ACCESS; +static uint32_t oa_share_deny = NFSD_OA_SHARE_DENY; +static uint32_t oa_share_access_want = NFSD_OA_SHARE_ACCESS_WANT; +static uint32_t oa_open_claim = NFSD_OA_OPEN_CLAIM; +static uint32_t oa_create_mode = NFSD_OA_CREATE_MODE; + +static const struct open_arguments4 nfsd_open_arguments = { + .oa_share_access = { .count = 1, .element = &oa_share_access }, + .oa_share_deny = { .count = 1, .element = &oa_share_deny }, + .oa_share_access_want = { .count = 1, .element = &oa_share_access_want }, + .oa_open_claim = { .count = 1, .element = &oa_open_claim }, + .oa_create_mode = { .count = 1, .element = &oa_create_mode }, +}; + +static __be32 nfsd4_encode_fattr4_open_arguments(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args) +{ + if (!xdrgen_encode_fattr4_open_arguments(xdr, &nfsd_open_arguments)) + return nfserr_resource; + return nfs_ok; +} + +static const nfsd4_enc_attr nfsd4_enc_fattr4_encode_ops[] = { + [FATTR4_SUPPORTED_ATTRS] = nfsd4_encode_fattr4_supported_attrs, + [FATTR4_TYPE] = nfsd4_encode_fattr4_type, + [FATTR4_FH_EXPIRE_TYPE] = nfsd4_encode_fattr4_fh_expire_type, + [FATTR4_CHANGE] = nfsd4_encode_fattr4_change, + [FATTR4_SIZE] = nfsd4_encode_fattr4_size, + [FATTR4_LINK_SUPPORT] = nfsd4_encode_fattr4__true, + [FATTR4_SYMLINK_SUPPORT] = nfsd4_encode_fattr4__true, + [FATTR4_NAMED_ATTR] = nfsd4_encode_fattr4__false, + [FATTR4_FSID] = nfsd4_encode_fattr4_fsid, + [FATTR4_UNIQUE_HANDLES] = nfsd4_encode_fattr4__true, + [FATTR4_LEASE_TIME] = nfsd4_encode_fattr4_lease_time, + [FATTR4_RDATTR_ERROR] = nfsd4_encode_fattr4_rdattr_error, + [FATTR4_ACL] = nfsd4_encode_fattr4_acl, + [FATTR4_ACLSUPPORT] = nfsd4_encode_fattr4_aclsupport, + 
[FATTR4_ARCHIVE] = nfsd4_encode_fattr4__noop, + [FATTR4_CANSETTIME] = nfsd4_encode_fattr4__true, + [FATTR4_CASE_INSENSITIVE] = nfsd4_encode_fattr4__false, + [FATTR4_CASE_PRESERVING] = nfsd4_encode_fattr4__true, + [FATTR4_CHOWN_RESTRICTED] = nfsd4_encode_fattr4__true, + [FATTR4_FILEHANDLE] = nfsd4_encode_fattr4_filehandle, + [FATTR4_FILEID] = nfsd4_encode_fattr4_fileid, + [FATTR4_FILES_AVAIL] = nfsd4_encode_fattr4_files_avail, + [FATTR4_FILES_FREE] = nfsd4_encode_fattr4_files_free, + [FATTR4_FILES_TOTAL] = nfsd4_encode_fattr4_files_total, + [FATTR4_FS_LOCATIONS] = nfsd4_encode_fattr4_fs_locations, + [FATTR4_HIDDEN] = nfsd4_encode_fattr4__noop, + [FATTR4_HOMOGENEOUS] = nfsd4_encode_fattr4__true, + [FATTR4_MAXFILESIZE] = nfsd4_encode_fattr4_maxfilesize, + [FATTR4_MAXLINK] = nfsd4_encode_fattr4_maxlink, + [FATTR4_MAXNAME] = nfsd4_encode_fattr4_maxname, + [FATTR4_MAXREAD] = nfsd4_encode_fattr4_maxread, + [FATTR4_MAXWRITE] = nfsd4_encode_fattr4_maxwrite, + [FATTR4_MIMETYPE] = nfsd4_encode_fattr4__noop, + [FATTR4_MODE] = nfsd4_encode_fattr4_mode, + [FATTR4_NO_TRUNC] = nfsd4_encode_fattr4__true, + [FATTR4_NUMLINKS] = nfsd4_encode_fattr4_numlinks, + [FATTR4_OWNER] = nfsd4_encode_fattr4_owner, + [FATTR4_OWNER_GROUP] = nfsd4_encode_fattr4_owner_group, + [FATTR4_QUOTA_AVAIL_HARD] = nfsd4_encode_fattr4__noop, + [FATTR4_QUOTA_AVAIL_SOFT] = nfsd4_encode_fattr4__noop, + [FATTR4_QUOTA_USED] = nfsd4_encode_fattr4__noop, + [FATTR4_RAWDEV] = nfsd4_encode_fattr4_rawdev, + [FATTR4_SPACE_AVAIL] = nfsd4_encode_fattr4_space_avail, + [FATTR4_SPACE_FREE] = nfsd4_encode_fattr4_space_free, + [FATTR4_SPACE_TOTAL] = nfsd4_encode_fattr4_space_total, + [FATTR4_SPACE_USED] = nfsd4_encode_fattr4_space_used, + [FATTR4_SYSTEM] = nfsd4_encode_fattr4__noop, + [FATTR4_TIME_ACCESS] = nfsd4_encode_fattr4_time_access, + [FATTR4_TIME_ACCESS_SET] = nfsd4_encode_fattr4__noop, + [FATTR4_TIME_BACKUP] = nfsd4_encode_fattr4__noop, + [FATTR4_TIME_CREATE] = nfsd4_encode_fattr4_time_create, + [FATTR4_TIME_DELTA] = nfsd4_encode_fattr4_time_delta, + [FATTR4_TIME_METADATA] = nfsd4_encode_fattr4_time_metadata, + [FATTR4_TIME_MODIFY] = nfsd4_encode_fattr4_time_modify, + [FATTR4_TIME_MODIFY_SET] = nfsd4_encode_fattr4__noop, + [FATTR4_MOUNTED_ON_FILEID] = nfsd4_encode_fattr4_mounted_on_fileid, + [FATTR4_DIR_NOTIF_DELAY] = nfsd4_encode_fattr4__noop, + [FATTR4_DIRENT_NOTIF_DELAY] = nfsd4_encode_fattr4__noop, + [FATTR4_DACL] = nfsd4_encode_fattr4__noop, + [FATTR4_SACL] = nfsd4_encode_fattr4__noop, + [FATTR4_CHANGE_POLICY] = nfsd4_encode_fattr4__noop, + [FATTR4_FS_STATUS] = nfsd4_encode_fattr4__noop, + +#ifdef CONFIG_NFSD_PNFS + [FATTR4_FS_LAYOUT_TYPES] = nfsd4_encode_fattr4_fs_layout_types, + [FATTR4_LAYOUT_HINT] = nfsd4_encode_fattr4__noop, + [FATTR4_LAYOUT_TYPES] = nfsd4_encode_fattr4_layout_types, + [FATTR4_LAYOUT_BLKSIZE] = nfsd4_encode_fattr4_layout_blksize, + [FATTR4_LAYOUT_ALIGNMENT] = nfsd4_encode_fattr4__noop, +#else + [FATTR4_FS_LAYOUT_TYPES] = nfsd4_encode_fattr4__noop, + [FATTR4_LAYOUT_HINT] = nfsd4_encode_fattr4__noop, + [FATTR4_LAYOUT_TYPES] = nfsd4_encode_fattr4__noop, + [FATTR4_LAYOUT_BLKSIZE] = nfsd4_encode_fattr4__noop, + [FATTR4_LAYOUT_ALIGNMENT] = nfsd4_encode_fattr4__noop, +#endif + + [FATTR4_FS_LOCATIONS_INFO] = nfsd4_encode_fattr4__noop, + [FATTR4_MDSTHRESHOLD] = nfsd4_encode_fattr4__noop, + [FATTR4_RETENTION_GET] = nfsd4_encode_fattr4__noop, + [FATTR4_RETENTION_SET] = nfsd4_encode_fattr4__noop, + [FATTR4_RETENTEVT_GET] = nfsd4_encode_fattr4__noop, + [FATTR4_RETENTEVT_SET] = nfsd4_encode_fattr4__noop, + [FATTR4_RETENTION_HOLD] = 
nfsd4_encode_fattr4__noop, + [FATTR4_MODE_SET_MASKED] = nfsd4_encode_fattr4__noop, + [FATTR4_SUPPATTR_EXCLCREAT] = nfsd4_encode_fattr4_suppattr_exclcreat, + [FATTR4_FS_CHARSET_CAP] = nfsd4_encode_fattr4__noop, + [FATTR4_CLONE_BLKSIZE] = nfsd4_encode_fattr4_clone_blksize, + [FATTR4_SPACE_FREED] = nfsd4_encode_fattr4__noop, + [FATTR4_CHANGE_ATTR_TYPE] = nfsd4_encode_fattr4__noop, + +#ifdef CONFIG_NFSD_V4_SECURITY_LABEL + [FATTR4_SEC_LABEL] = nfsd4_encode_fattr4_sec_label, +#else + [FATTR4_SEC_LABEL] = nfsd4_encode_fattr4__noop, +#endif + + [FATTR4_MODE_UMASK] = nfsd4_encode_fattr4__noop, + [FATTR4_XATTR_SUPPORT] = nfsd4_encode_fattr4_xattr_support, + [FATTR4_TIME_DELEG_ACCESS] = nfsd4_encode_fattr4__inval, + [FATTR4_TIME_DELEG_MODIFY] = nfsd4_encode_fattr4__inval, + [FATTR4_OPEN_ARGUMENTS] = nfsd4_encode_fattr4_open_arguments, +}; + /* * Note: @fhp can be NULL; in this case, we might have to compose the filehandle * ourselves. */ static __be32 -nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp, - struct svc_export *exp, - struct dentry *dentry, u32 *bmval, - struct svc_rqst *rqstp, int ignore_crossmnt) -{ - u32 bmval0 = bmval[0]; - u32 bmval1 = bmval[1]; - u32 bmval2 = bmval[2]; - struct kstat stat; +nfsd4_encode_fattr4(struct svc_rqst *rqstp, struct xdr_stream *xdr, + struct svc_fh *fhp, struct svc_export *exp, + struct dentry *dentry, const u32 *bmval, + int ignore_crossmnt) +{ + DECLARE_BITMAP(attr_bitmap, ARRAY_SIZE(nfsd4_enc_fattr4_encode_ops)); + struct nfs4_delegation *dp = NULL; + struct nfsd4_fattr_args args; struct svc_fh *tempfh = NULL; - struct kstatfs statfs; - __be32 *p; int starting_len = xdr->buf->len; - int attrlen_offset; - __be32 attrlen; - u32 dummy; - u64 dummy64; - u32 rdattr_err = 0; - __be32 status; + unsigned int attrlen_offset; + __be32 attrlen, status; + u32 attrmask[3]; int err; - struct nfs4_acl *acl = NULL; - void *context = NULL; - int contextlen; - bool contextsupport = false; struct nfsd4_compoundres *resp = rqstp->rq_resp; u32 minorversion = resp->cstate.minorversion; struct path path = { .mnt = exp->ex_path.mnt, .dentry = dentry, }; - struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); + unsigned long bit; - BUG_ON(bmval1 & NFSD_WRITEONLY_ATTRS_WORD1); - BUG_ON(!nfsd_attrs_supported(minorversion, bmval)); + WARN_ON_ONCE(bmval[1] & NFSD_WRITEONLY_ATTRS_WORD1); + WARN_ON_ONCE(!nfsd_attrs_supported(minorversion, bmval)); + args.rqstp = rqstp; + args.exp = exp; + args.dentry = dentry; + args.ignore_crossmnt = (ignore_crossmnt != 0); + args.acl = NULL; +#ifdef CONFIG_NFSD_V4_SECURITY_LABEL + args.context.context = NULL; +#endif + + /* + * Make a local copy of the attribute bitmap that can be modified. 
+ */ + attrmask[0] = bmval[0]; + attrmask[1] = bmval[1]; + attrmask[2] = bmval[2]; + + args.rdattr_err = 0; if (exp->ex_fslocs.migrated) { - status = fattr_handle_absent_fs(&bmval0, &bmval1, &bmval2, &rdattr_err); + status = fattr_handle_absent_fs(&attrmask[0], &attrmask[1], + &attrmask[2], &args.rdattr_err); + if (status) + goto out; + } + if ((attrmask[0] & (FATTR4_WORD0_CHANGE | + FATTR4_WORD0_SIZE)) || + (attrmask[1] & (FATTR4_WORD1_TIME_ACCESS | + FATTR4_WORD1_TIME_MODIFY | + FATTR4_WORD1_TIME_METADATA))) { + status = nfsd4_deleg_getattr_conflict(rqstp, dentry, &dp); if (status) goto out; } - err = vfs_getattr(&path, &stat, STATX_BASIC_STATS, AT_STATX_SYNC_AS_STAT); + err = vfs_getattr(&path, &args.stat, + STATX_BASIC_STATS | STATX_BTIME | STATX_CHANGE_COOKIE, + AT_STATX_SYNC_AS_STAT); + if (dp) { + struct nfs4_cb_fattr *ncf = &dp->dl_cb_fattr; + + if (ncf->ncf_file_modified) { + ++ncf->ncf_initial_cinfo; + args.stat.size = ncf->ncf_cur_fsize; + if (!timespec64_is_epoch(&ncf->ncf_cb_mtime)) + args.stat.mtime = ncf->ncf_cb_mtime; + } + args.change_attr = ncf->ncf_initial_cinfo; + + if (!timespec64_is_epoch(&ncf->ncf_cb_atime)) + args.stat.atime = ncf->ncf_cb_atime; + + nfs4_put_stid(&dp->dl_stid); + } else { + args.change_attr = nfsd4_change_attribute(&args.stat); + } + if (err) goto out_nfserr; - if ((bmval0 & (FATTR4_WORD0_FILES_AVAIL | FATTR4_WORD0_FILES_FREE | + + if (!(args.stat.result_mask & STATX_BTIME)) + /* underlying FS does not offer btime so we can't share it */ + attrmask[1] &= ~FATTR4_WORD1_TIME_CREATE; + if ((attrmask[0] & (FATTR4_WORD0_FILES_AVAIL | FATTR4_WORD0_FILES_FREE | FATTR4_WORD0_FILES_TOTAL | FATTR4_WORD0_MAXNAME)) || - (bmval1 & (FATTR4_WORD1_SPACE_AVAIL | FATTR4_WORD1_SPACE_FREE | + (attrmask[1] & (FATTR4_WORD1_SPACE_AVAIL | FATTR4_WORD1_SPACE_FREE | FATTR4_WORD1_SPACE_TOTAL))) { - err = vfs_statfs(&path, &statfs); + err = vfs_statfs(&path, &args.statfs); if (err) goto out_nfserr; } - if ((bmval0 & (FATTR4_WORD0_FILEHANDLE | FATTR4_WORD0_FSID)) && !fhp) { + if ((attrmask[0] & (FATTR4_WORD0_FILEHANDLE | FATTR4_WORD0_FSID)) && + !fhp) { tempfh = kmalloc(sizeof(struct svc_fh), GFP_KERNEL); status = nfserr_jukebox; if (!tempfh) @@ -2460,12 +3679,14 @@ nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp, status = fh_compose(tempfh, exp, dentry, NULL); if (status) goto out; - fhp = tempfh; - } - if (bmval0 & FATTR4_WORD0_ACL) { - err = nfsd4_get_nfs4_acl(rqstp, dentry, &acl); + args.fhp = tempfh; + } else + args.fhp = fhp; + + if (attrmask[0] & FATTR4_WORD0_ACL) { + err = nfsd4_get_nfs4_acl(rqstp, dentry, &args.acl); if (err == -EOPNOTSUPP) - bmval0 &= ~FATTR4_WORD0_ACL; + attrmask[0] &= ~FATTR4_WORD0_ACL; else if (err == -EINVAL) { status = nfserr_attrnotsupp; goto out; @@ -2473,456 +3694,54 @@ nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp, goto out_nfserr; } + args.contextsupport = false; + #ifdef CONFIG_NFSD_V4_SECURITY_LABEL - if ((bmval2 & FATTR4_WORD2_SECURITY_LABEL) || - bmval0 & FATTR4_WORD0_SUPPORTED_ATTRS) { + if ((attrmask[2] & FATTR4_WORD2_SECURITY_LABEL) || + attrmask[0] & FATTR4_WORD0_SUPPORTED_ATTRS) { if (exp->ex_flags & NFSEXP_SECURITY_LABEL) err = security_inode_getsecctx(d_inode(dentry), - &context, &contextlen); + &args.context); else err = -EOPNOTSUPP; - contextsupport = (err == 0); - if (bmval2 & FATTR4_WORD2_SECURITY_LABEL) { + args.contextsupport = (err == 0); + if (attrmask[2] & FATTR4_WORD2_SECURITY_LABEL) { if (err == -EOPNOTSUPP) - bmval2 &= ~FATTR4_WORD2_SECURITY_LABEL; + attrmask[2] &= 
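Because TIME_CREATE can only be answered when the filesystem actually reported a birth time, the getattr path above checks stat.result_mask for STATX_BTIME before leaving that bit in the reply bitmap. A userspace illustration of the same mask check via statx(2) (assumes glibc 2.28+; the kernel clears FATTR4_WORD1_TIME_CREATE instead of printing):

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/stat.h>

    int main(int argc, char **argv)
    {
            const char *path = argc > 1 ? argv[1] : ".";
            struct statx stx;

            if (statx(AT_FDCWD, path, 0, STATX_BASIC_STATS | STATX_BTIME, &stx))
                    return 1;

            /* Only trust stx_btime if the filesystem says it filled it in */
            if (stx.stx_mask & STATX_BTIME)
                    printf("btime=%lld\n", (long long)stx.stx_btime.tv_sec);
            else
                    printf("no btime: would drop TIME_CREATE from the reply bitmap\n");
            return 0;
    }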
~FATTR4_WORD2_SECURITY_LABEL; else if (err) goto out_nfserr; } } #endif /* CONFIG_NFSD_V4_SECURITY_LABEL */ - status = nfsd4_encode_bitmap(xdr, bmval0, bmval1, bmval2); + /* attrmask */ + status = nfsd4_encode_bitmap4(xdr, attrmask[0], attrmask[1], + attrmask[2]); if (status) goto out; + /* attr_vals */ attrlen_offset = xdr->buf->len; - p = xdr_reserve_space(xdr, 4); - if (!p) + if (unlikely(!xdr_reserve_space(xdr, XDR_UNIT))) goto out_resource; - p++; /* to be backfilled later */ - - if (bmval0 & FATTR4_WORD0_SUPPORTED_ATTRS) { - u32 supp[3]; - - memcpy(supp, nfsd_suppattrs[minorversion], sizeof(supp)); - - if (!IS_POSIXACL(dentry->d_inode)) - supp[0] &= ~FATTR4_WORD0_ACL; - if (!contextsupport) - supp[2] &= ~FATTR4_WORD2_SECURITY_LABEL; - if (!supp[2]) { - p = xdr_reserve_space(xdr, 12); - if (!p) - goto out_resource; - *p++ = cpu_to_be32(2); - *p++ = cpu_to_be32(supp[0]); - *p++ = cpu_to_be32(supp[1]); - } else { - p = xdr_reserve_space(xdr, 16); - if (!p) - goto out_resource; - *p++ = cpu_to_be32(3); - *p++ = cpu_to_be32(supp[0]); - *p++ = cpu_to_be32(supp[1]); - *p++ = cpu_to_be32(supp[2]); - } - } - if (bmval0 & FATTR4_WORD0_TYPE) { - p = xdr_reserve_space(xdr, 4); - if (!p) - goto out_resource; - dummy = nfs4_file_type(stat.mode); - if (dummy == NF4BAD) { - status = nfserr_serverfault; - goto out; - } - *p++ = cpu_to_be32(dummy); - } - if (bmval0 & FATTR4_WORD0_FH_EXPIRE_TYPE) { - p = xdr_reserve_space(xdr, 4); - if (!p) - goto out_resource; - if (exp->ex_flags & NFSEXP_NOSUBTREECHECK) - *p++ = cpu_to_be32(NFS4_FH_PERSISTENT); - else - *p++ = cpu_to_be32(NFS4_FH_PERSISTENT| - NFS4_FH_VOL_RENAME); - } - if (bmval0 & FATTR4_WORD0_CHANGE) { - p = xdr_reserve_space(xdr, 8); - if (!p) - goto out_resource; - p = encode_change(p, &stat, d_inode(dentry), exp); - } - if (bmval0 & FATTR4_WORD0_SIZE) { - p = xdr_reserve_space(xdr, 8); - if (!p) - goto out_resource; - p = xdr_encode_hyper(p, stat.size); - } - if (bmval0 & FATTR4_WORD0_LINK_SUPPORT) { - p = xdr_reserve_space(xdr, 4); - if (!p) - goto out_resource; - *p++ = cpu_to_be32(1); - } - if (bmval0 & FATTR4_WORD0_SYMLINK_SUPPORT) { - p = xdr_reserve_space(xdr, 4); - if (!p) - goto out_resource; - *p++ = cpu_to_be32(1); - } - if (bmval0 & FATTR4_WORD0_NAMED_ATTR) { - p = xdr_reserve_space(xdr, 4); - if (!p) - goto out_resource; - *p++ = cpu_to_be32(0); - } - if (bmval0 & FATTR4_WORD0_FSID) { - p = xdr_reserve_space(xdr, 16); - if (!p) - goto out_resource; - if (exp->ex_fslocs.migrated) { - p = xdr_encode_hyper(p, NFS4_REFERRAL_FSID_MAJOR); - p = xdr_encode_hyper(p, NFS4_REFERRAL_FSID_MINOR); - } else switch(fsid_source(fhp)) { - case FSIDSOURCE_FSID: - p = xdr_encode_hyper(p, (u64)exp->ex_fsid); - p = xdr_encode_hyper(p, (u64)0); - break; - case FSIDSOURCE_DEV: - *p++ = cpu_to_be32(0); - *p++ = cpu_to_be32(MAJOR(stat.dev)); - *p++ = cpu_to_be32(0); - *p++ = cpu_to_be32(MINOR(stat.dev)); - break; - case FSIDSOURCE_UUID: - p = xdr_encode_opaque_fixed(p, exp->ex_uuid, - EX_UUID_LEN); - break; - } - } - if (bmval0 & FATTR4_WORD0_UNIQUE_HANDLES) { - p = xdr_reserve_space(xdr, 4); - if (!p) - goto out_resource; - *p++ = cpu_to_be32(0); - } - if (bmval0 & FATTR4_WORD0_LEASE_TIME) { - p = xdr_reserve_space(xdr, 4); - if (!p) - goto out_resource; - *p++ = cpu_to_be32(nn->nfsd4_lease); - } - if (bmval0 & FATTR4_WORD0_RDATTR_ERROR) { - p = xdr_reserve_space(xdr, 4); - if (!p) - goto out_resource; - *p++ = cpu_to_be32(rdattr_err); - } - if (bmval0 & FATTR4_WORD0_ACL) { - struct nfs4_ace *ace; - - if (acl == NULL) { - p = xdr_reserve_space(xdr, 4); - 
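The attr_vals length is handled with a reserve-then-backfill pattern: one XDR word is reserved above (at attrlen_offset) before the per-attribute loop runs, and once the encoded size is known the word is patched in with write_bytes_to_xdr_buf() a little further down in the patch. A minimal standalone sketch of that pattern over a plain byte buffer (names here are invented):

    #include <arpa/inet.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    static unsigned char buf[256];
    static size_t buflen;

    /* Advance the write cursor and hand back the old offset */
    static size_t reserve(size_t n) { size_t off = buflen; buflen += n; return off; }

    int main(void)
    {
            size_t len_off = reserve(4);          /* hole for the length word  */

            const char payload[] = "attribute values go here";
            memcpy(buf + reserve(sizeof(payload)), payload, sizeof(payload));

            uint32_t attrlen = htonl((uint32_t)(buflen - len_off - 4));
            memcpy(buf + len_off, &attrlen, 4);   /* backfill once size is known */

            printf("encoded %zu bytes, attrlen=%u\n", buflen, ntohl(attrlen));
            return 0;
    }

The same trick appears again later in the patch for READ's eof/count words and for READLINK's length.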
if (!p) - goto out_resource; - - *p++ = cpu_to_be32(0); - goto out_acl; - } - p = xdr_reserve_space(xdr, 4); - if (!p) - goto out_resource; - *p++ = cpu_to_be32(acl->naces); - - for (ace = acl->aces; ace < acl->aces + acl->naces; ace++) { - p = xdr_reserve_space(xdr, 4*3); - if (!p) - goto out_resource; - *p++ = cpu_to_be32(ace->type); - *p++ = cpu_to_be32(ace->flag); - *p++ = cpu_to_be32(ace->access_mask & - NFS4_ACE_MASK_ALL); - status = nfsd4_encode_aclname(xdr, rqstp, ace); - if (status) - goto out; - } - } -out_acl: - if (bmval0 & FATTR4_WORD0_ACLSUPPORT) { - p = xdr_reserve_space(xdr, 4); - if (!p) - goto out_resource; - *p++ = cpu_to_be32(IS_POSIXACL(dentry->d_inode) ? - ACL4_SUPPORT_ALLOW_ACL|ACL4_SUPPORT_DENY_ACL : 0); - } - if (bmval0 & FATTR4_WORD0_CANSETTIME) { - p = xdr_reserve_space(xdr, 4); - if (!p) - goto out_resource; - *p++ = cpu_to_be32(1); - } - if (bmval0 & FATTR4_WORD0_CASE_INSENSITIVE) { - p = xdr_reserve_space(xdr, 4); - if (!p) - goto out_resource; - *p++ = cpu_to_be32(0); - } - if (bmval0 & FATTR4_WORD0_CASE_PRESERVING) { - p = xdr_reserve_space(xdr, 4); - if (!p) - goto out_resource; - *p++ = cpu_to_be32(1); - } - if (bmval0 & FATTR4_WORD0_CHOWN_RESTRICTED) { - p = xdr_reserve_space(xdr, 4); - if (!p) - goto out_resource; - *p++ = cpu_to_be32(1); - } - if (bmval0 & FATTR4_WORD0_FILEHANDLE) { - p = xdr_reserve_space(xdr, fhp->fh_handle.fh_size + 4); - if (!p) - goto out_resource; - p = xdr_encode_opaque(p, &fhp->fh_handle.fh_base, - fhp->fh_handle.fh_size); - } - if (bmval0 & FATTR4_WORD0_FILEID) { - p = xdr_reserve_space(xdr, 8); - if (!p) - goto out_resource; - p = xdr_encode_hyper(p, stat.ino); - } - if (bmval0 & FATTR4_WORD0_FILES_AVAIL) { - p = xdr_reserve_space(xdr, 8); - if (!p) - goto out_resource; - p = xdr_encode_hyper(p, (u64) statfs.f_ffree); - } - if (bmval0 & FATTR4_WORD0_FILES_FREE) { - p = xdr_reserve_space(xdr, 8); - if (!p) - goto out_resource; - p = xdr_encode_hyper(p, (u64) statfs.f_ffree); - } - if (bmval0 & FATTR4_WORD0_FILES_TOTAL) { - p = xdr_reserve_space(xdr, 8); - if (!p) - goto out_resource; - p = xdr_encode_hyper(p, (u64) statfs.f_files); - } - if (bmval0 & FATTR4_WORD0_FS_LOCATIONS) { - status = nfsd4_encode_fs_locations(xdr, rqstp, exp); - if (status) + bitmap_from_arr32(attr_bitmap, attrmask, + ARRAY_SIZE(nfsd4_enc_fattr4_encode_ops)); + for_each_set_bit(bit, attr_bitmap, + ARRAY_SIZE(nfsd4_enc_fattr4_encode_ops)) { + status = nfsd4_enc_fattr4_encode_ops[bit](xdr, &args); + if (status != nfs_ok) goto out; } - if (bmval0 & FATTR4_WORD0_HOMOGENEOUS) { - p = xdr_reserve_space(xdr, 4); - if (!p) - goto out_resource; - *p++ = cpu_to_be32(1); - } - if (bmval0 & FATTR4_WORD0_MAXFILESIZE) { - p = xdr_reserve_space(xdr, 8); - if (!p) - goto out_resource; - p = xdr_encode_hyper(p, exp->ex_path.mnt->mnt_sb->s_maxbytes); - } - if (bmval0 & FATTR4_WORD0_MAXLINK) { - p = xdr_reserve_space(xdr, 4); - if (!p) - goto out_resource; - *p++ = cpu_to_be32(255); - } - if (bmval0 & FATTR4_WORD0_MAXNAME) { - p = xdr_reserve_space(xdr, 4); - if (!p) - goto out_resource; - *p++ = cpu_to_be32(statfs.f_namelen); - } - if (bmval0 & FATTR4_WORD0_MAXREAD) { - p = xdr_reserve_space(xdr, 8); - if (!p) - goto out_resource; - p = xdr_encode_hyper(p, (u64) svc_max_payload(rqstp)); - } - if (bmval0 & FATTR4_WORD0_MAXWRITE) { - p = xdr_reserve_space(xdr, 8); - if (!p) - goto out_resource; - p = xdr_encode_hyper(p, (u64) svc_max_payload(rqstp)); - } - if (bmval1 & FATTR4_WORD1_MODE) { - p = xdr_reserve_space(xdr, 4); - if (!p) - goto out_resource; - *p++ = 
cpu_to_be32(stat.mode & S_IALLUGO); - } - if (bmval1 & FATTR4_WORD1_NO_TRUNC) { - p = xdr_reserve_space(xdr, 4); - if (!p) - goto out_resource; - *p++ = cpu_to_be32(1); - } - if (bmval1 & FATTR4_WORD1_NUMLINKS) { - p = xdr_reserve_space(xdr, 4); - if (!p) - goto out_resource; - *p++ = cpu_to_be32(stat.nlink); - } - if (bmval1 & FATTR4_WORD1_OWNER) { - status = nfsd4_encode_user(xdr, rqstp, stat.uid); - if (status) - goto out; - } - if (bmval1 & FATTR4_WORD1_OWNER_GROUP) { - status = nfsd4_encode_group(xdr, rqstp, stat.gid); - if (status) - goto out; - } - if (bmval1 & FATTR4_WORD1_RAWDEV) { - p = xdr_reserve_space(xdr, 8); - if (!p) - goto out_resource; - *p++ = cpu_to_be32((u32) MAJOR(stat.rdev)); - *p++ = cpu_to_be32((u32) MINOR(stat.rdev)); - } - if (bmval1 & FATTR4_WORD1_SPACE_AVAIL) { - p = xdr_reserve_space(xdr, 8); - if (!p) - goto out_resource; - dummy64 = (u64)statfs.f_bavail * (u64)statfs.f_bsize; - p = xdr_encode_hyper(p, dummy64); - } - if (bmval1 & FATTR4_WORD1_SPACE_FREE) { - p = xdr_reserve_space(xdr, 8); - if (!p) - goto out_resource; - dummy64 = (u64)statfs.f_bfree * (u64)statfs.f_bsize; - p = xdr_encode_hyper(p, dummy64); - } - if (bmval1 & FATTR4_WORD1_SPACE_TOTAL) { - p = xdr_reserve_space(xdr, 8); - if (!p) - goto out_resource; - dummy64 = (u64)statfs.f_blocks * (u64)statfs.f_bsize; - p = xdr_encode_hyper(p, dummy64); - } - if (bmval1 & FATTR4_WORD1_SPACE_USED) { - p = xdr_reserve_space(xdr, 8); - if (!p) - goto out_resource; - dummy64 = (u64)stat.blocks << 9; - p = xdr_encode_hyper(p, dummy64); - } - if (bmval1 & FATTR4_WORD1_TIME_ACCESS) { - p = xdr_reserve_space(xdr, 12); - if (!p) - goto out_resource; - p = xdr_encode_hyper(p, (s64)stat.atime.tv_sec); - *p++ = cpu_to_be32(stat.atime.tv_nsec); - } - if (bmval1 & FATTR4_WORD1_TIME_DELTA) { - p = xdr_reserve_space(xdr, 12); - if (!p) - goto out_resource; - p = encode_time_delta(p, d_inode(dentry)); - } - if (bmval1 & FATTR4_WORD1_TIME_METADATA) { - p = xdr_reserve_space(xdr, 12); - if (!p) - goto out_resource; - p = xdr_encode_hyper(p, (s64)stat.ctime.tv_sec); - *p++ = cpu_to_be32(stat.ctime.tv_nsec); - } - if (bmval1 & FATTR4_WORD1_TIME_MODIFY) { - p = xdr_reserve_space(xdr, 12); - if (!p) - goto out_resource; - p = xdr_encode_hyper(p, (s64)stat.mtime.tv_sec); - *p++ = cpu_to_be32(stat.mtime.tv_nsec); - } - if (bmval1 & FATTR4_WORD1_MOUNTED_ON_FILEID) { - struct kstat parent_stat; - u64 ino = stat.ino; - - p = xdr_reserve_space(xdr, 8); - if (!p) - goto out_resource; - /* - * Get parent's attributes if not ignoring crossmount - * and this is the root of a cross-mounted filesystem. 
- */ - if (ignore_crossmnt == 0 && - dentry == exp->ex_path.mnt->mnt_root) { - err = get_parent_attributes(exp, &parent_stat); - if (err) - goto out_nfserr; - ino = parent_stat.ino; - } - p = xdr_encode_hyper(p, ino); - } -#ifdef CONFIG_NFSD_PNFS - if (bmval1 & FATTR4_WORD1_FS_LAYOUT_TYPES) { - status = nfsd4_encode_layout_types(xdr, exp->ex_layout_types); - if (status) - goto out; - } - - if (bmval2 & FATTR4_WORD2_LAYOUT_TYPES) { - status = nfsd4_encode_layout_types(xdr, exp->ex_layout_types); - if (status) - goto out; - } - - if (bmval2 & FATTR4_WORD2_LAYOUT_BLKSIZE) { - p = xdr_reserve_space(xdr, 4); - if (!p) - goto out_resource; - *p++ = cpu_to_be32(stat.blksize); - } -#endif /* CONFIG_NFSD_PNFS */ - if (bmval2 & FATTR4_WORD2_SUPPATTR_EXCLCREAT) { - u32 supp[3]; - - memcpy(supp, nfsd_suppattrs[minorversion], sizeof(supp)); - supp[0] &= NFSD_SUPPATTR_EXCLCREAT_WORD0; - supp[1] &= NFSD_SUPPATTR_EXCLCREAT_WORD1; - supp[2] &= NFSD_SUPPATTR_EXCLCREAT_WORD2; - - status = nfsd4_encode_bitmap(xdr, supp[0], supp[1], supp[2]); - if (status) - goto out; - } - - if (bmval2 & FATTR4_WORD2_CHANGE_ATTR_TYPE) { - p = xdr_reserve_space(xdr, 4); - if (!p) - goto out_resource; - if (IS_I_VERSION(d_inode(dentry))) - *p++ = cpu_to_be32(NFS4_CHANGE_TYPE_IS_MONOTONIC_INCR); - else - *p++ = cpu_to_be32(NFS4_CHANGE_TYPE_IS_TIME_METADATA); - } - - if (bmval2 & FATTR4_WORD2_SECURITY_LABEL) { - status = nfsd4_encode_security_label(xdr, rqstp, context, - contextlen); - if (status) - goto out; - } - - attrlen = htonl(xdr->buf->len - attrlen_offset - 4); - write_bytes_to_xdr_buf(xdr->buf, attrlen_offset, &attrlen, 4); + attrlen = cpu_to_be32(xdr->buf->len - attrlen_offset - XDR_UNIT); + write_bytes_to_xdr_buf(xdr->buf, attrlen_offset, &attrlen, XDR_UNIT); status = nfs_ok; out: #ifdef CONFIG_NFSD_V4_SECURITY_LABEL - if (context) - security_release_secctx(context, contextlen); + if (args.context.context) + security_release_secctx(&args.context); #endif /* CONFIG_NFSD_V4_SECURITY_LABEL */ - kfree(acl); + kfree(args.acl); if (tempfh) { fh_put(tempfh); kfree(tempfh); @@ -2963,12 +3782,28 @@ __be32 nfsd4_encode_fattr_to_buf(__be32 **p, int words, __be32 ret; svcxdr_init_encode_from_buffer(&xdr, &dummy, *p, words << 2); - ret = nfsd4_encode_fattr(&xdr, fhp, exp, dentry, bmval, rqstp, - ignore_crossmnt); + ret = nfsd4_encode_fattr4(rqstp, &xdr, fhp, exp, dentry, bmval, + ignore_crossmnt); *p = xdr.p; return ret; } +/* + * The buffer space for this field was reserved during a previous + * call to nfsd4_encode_entry4(). 
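A directory entry's cookie is the offset of the entry that follows it, which is unknown while the entry itself is being encoded; the helper above therefore patches the previously reserved 8-byte slot once the next offset arrives. A simplified sketch of that deferred-cookie trick (htobe64() from <endian.h> is assumed to be available, and real entries of course encode far more than a name length):

    #include <endian.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    static unsigned char buf[256];
    static size_t buflen;
    static size_t cookie_offset;   /* 0 means "no previous entry yet" */

    static void patch_prev_cookie(uint64_t next_offset)
    {
            uint64_t wire = htobe64(next_offset);

            if (!cookie_offset)
                    return;
            memcpy(buf + cookie_offset, &wire, sizeof(wire));
    }

    static void add_entry(const char *name, uint64_t offset_of_this_entry)
    {
            /* The previous entry's cookie is this entry's offset */
            patch_prev_cookie(offset_of_this_entry);

            cookie_offset = buflen;        /* reserve this entry's cookie slot */
            buflen += sizeof(uint64_t);
            buflen += strlen(name);        /* stand-in for the encoded name    */
            printf("entry %-8s cookie slot at %zu\n", name, cookie_offset);
    }

    int main(void)
    {
            add_entry("alpha", 1);
            add_entry("beta", 2);          /* patches alpha's cookie with 2 */
            return 0;
    }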
+ */ +static void nfsd4_encode_entry4_nfs_cookie4(const struct nfsd4_readdir *readdir, + u64 offset) +{ + __be64 cookie = cpu_to_be64(offset); + struct xdr_stream *xdr = readdir->xdr; + + if (!readdir->cookie_offset) + return; + write_bytes_to_xdr_buf(xdr->buf, readdir->cookie_offset, &cookie, + sizeof(cookie)); +} + static inline int attributes_need_mount(u32 *bmval) { if (bmval[0] & ~(FATTR4_WORD0_RDATTR_ERROR | FATTR4_WORD0_LEASE_TIME)) @@ -2979,26 +3814,19 @@ static inline int attributes_need_mount(u32 *bmval) } static __be32 -nfsd4_encode_dirent_fattr(struct xdr_stream *xdr, struct nfsd4_readdir *cd, - const char *name, int namlen) +nfsd4_encode_entry4_fattr(struct nfsd4_readdir *cd, const char *name, + int namlen) { struct svc_export *exp = cd->rd_fhp->fh_export; struct dentry *dentry; __be32 nfserr; int ignore_crossmnt = 0; - dentry = lookup_one_len_unlocked(name, cd->rd_fhp->fh_dentry, namlen); + dentry = lookup_one_positive_unlocked(&nop_mnt_idmap, + &QSTR_LEN(name, namlen), + cd->rd_fhp->fh_dentry); if (IS_ERR(dentry)) return nfserrno(PTR_ERR(dentry)); - if (d_really_is_negative(dentry)) { - /* - * we're not holding the i_mutex here, so there's - * a window where this directory entry could have gone - * away. - */ - dput(dentry); - return nfserr_noent; - } exp_get(exp); /* @@ -3026,39 +3854,40 @@ nfsd4_encode_dirent_fattr(struct xdr_stream *xdr, struct nfsd4_readdir *cd, nfserr = nfserrno(err); goto out_put; } - nfserr = check_nfsd_access(exp, cd->rd_rqstp); + nfserr = check_nfsd_access(exp, cd->rd_rqstp, false); if (nfserr) goto out_put; } out_encode: - nfserr = nfsd4_encode_fattr(xdr, NULL, exp, dentry, cd->rd_bmval, - cd->rd_rqstp, ignore_crossmnt); + nfserr = nfsd4_encode_fattr4(cd->rd_rqstp, cd->xdr, NULL, exp, dentry, + cd->rd_bmval, ignore_crossmnt); out_put: dput(dentry); exp_put(exp); return nfserr; } -static __be32 * -nfsd4_encode_rdattr_error(struct xdr_stream *xdr, __be32 nfserr) +static __be32 +nfsd4_encode_entry4_rdattr_error(struct xdr_stream *xdr, __be32 nfserr) { - __be32 *p; - - p = xdr_reserve_space(xdr, 20); - if (!p) - return NULL; - *p++ = htonl(2); - *p++ = htonl(FATTR4_WORD0_RDATTR_ERROR); /* bmval0 */ - *p++ = htonl(0); /* bmval1 */ + __be32 status; - *p++ = htonl(4); /* attribute length */ - *p++ = nfserr; /* no htonl */ - return p; + /* attrmask */ + status = nfsd4_encode_bitmap4(xdr, FATTR4_WORD0_RDATTR_ERROR, 0, 0); + if (status != nfs_ok) + return status; + /* attr_vals */ + if (xdr_stream_encode_u32(xdr, XDR_UNIT) != XDR_UNIT) + return nfserr_resource; + /* rdattr_error */ + if (xdr_stream_encode_be32(xdr, nfserr) != XDR_UNIT) + return nfserr_resource; + return nfs_ok; } static int -nfsd4_encode_dirent(void *ccdv, const char *name, int namlen, +nfsd4_encode_entry4(void *ccdv, const char *name, int namlen, loff_t offset, u64 ino, unsigned int d_type) { struct readdir_cd *ccd = ccdv; @@ -3069,8 +3898,6 @@ nfsd4_encode_dirent(void *ccdv, const char *name, int namlen, u32 name_and_cookie; int entry_bytes; __be32 nfserr = nfserr_toosmall; - __be64 wire_offset; - __be32 *p; /* In nfsv4, "." and ".." never make it onto the wire.. 
*/ if (name && isdotent(name, namlen)) { @@ -3078,24 +3905,19 @@ nfsd4_encode_dirent(void *ccdv, const char *name, int namlen, return 0; } - if (cd->cookie_offset) { - wire_offset = cpu_to_be64(offset); - write_bytes_to_xdr_buf(xdr->buf, cd->cookie_offset, - &wire_offset, 8); - } + /* Encode the previous entry's cookie value */ + nfsd4_encode_entry4_nfs_cookie4(cd, offset); - p = xdr_reserve_space(xdr, 4); - if (!p) + if (xdr_stream_encode_item_present(xdr) != XDR_UNIT) goto fail; - *p++ = xdr_one; /* mark entry present */ + + /* Reserve send buffer space for this entry's cookie value. */ cookie_offset = xdr->buf->len; - p = xdr_reserve_space(xdr, 3*4 + namlen); - if (!p) + if (nfsd4_encode_nfs_cookie4(xdr, OFFSET_MAX) != nfs_ok) goto fail; - p = xdr_encode_hyper(p, NFS_OFFSET_MAX); /* offset of next entry */ - p = xdr_encode_array(p, name, namlen); /* name length & name */ - - nfserr = nfsd4_encode_dirent_fattr(xdr, cd, name, namlen); + if (nfsd4_encode_component4(xdr, name, namlen) != nfs_ok) + goto fail; + nfserr = nfsd4_encode_entry4_fattr(cd, name, namlen); switch (nfserr) { case nfs_ok: break; @@ -3105,6 +3927,17 @@ nfsd4_encode_dirent(void *ccdv, const char *name, int namlen, case nfserr_noent: xdr_truncate_encode(xdr, start_offset); goto skip_entry; + case nfserr_jukebox: + /* + * The pseudoroot should only display dentries that lead to + * exports. If we get EJUKEBOX here, then we can't tell whether + * this entry should be included. Just fail the whole READDIR + * with NFS4ERR_DELAY in that case, and hope that the situation + * will resolve itself by the client's next attempt. + */ + if (cd->rd_fhp->fh_export->ex_flags & NFSEXP_V4ROOT) + goto fail; + fallthrough; default: /* * If the client requested the RDATTR_ERROR attribute, @@ -3115,8 +3948,7 @@ nfsd4_encode_dirent(void *ccdv, const char *name, int namlen, */ if (!(cd->rd_bmval[0] & FATTR4_WORD0_RDATTR_ERROR)) goto fail; - p = nfsd4_encode_rdattr_error(xdr, nfserr); - if (p == NULL) { + if (nfsd4_encode_entry4_rdattr_error(xdr, nfserr)) { nfserr = nfserr_toosmall; goto fail; } @@ -3127,15 +3959,18 @@ nfsd4_encode_dirent(void *ccdv, const char *name, int namlen, goto fail; cd->rd_maxcount -= entry_bytes; /* - * RFC 3530 14.2.24 describes rd_dircount as only a "hint", so - * let's always let through the first entry, at least: + * RFC 3530 14.2.24 describes rd_dircount as only a "hint", and + * notes that it could be zero. If it is zero, then the server + * should enforce only the rd_maxcount value. 
*/ - if (!cd->rd_dircount) - goto fail; - name_and_cookie = 4 + 4 * XDR_QUADLEN(namlen) + 8; - if (name_and_cookie > cd->rd_dircount && cd->cookie_offset) - goto fail; - cd->rd_dircount -= min(cd->rd_dircount, name_and_cookie); + if (cd->rd_dircount) { + name_and_cookie = 4 + 4 * XDR_QUADLEN(namlen) + 8; + if (name_and_cookie > cd->rd_dircount && cd->cookie_offset) + goto fail; + cd->rd_dircount -= min(cd->rd_dircount, name_and_cookie); + if (!cd->rd_dircount) + cd->rd_maxcount = 0; + } cd->cookie_offset = cookie_offset; skip_entry: @@ -3148,647 +3983,782 @@ fail: } static __be32 -nfsd4_encode_stateid(struct xdr_stream *xdr, stateid_t *sid) +nfsd4_encode_verifier4(struct xdr_stream *xdr, const nfs4_verifier *verf) { __be32 *p; - p = xdr_reserve_space(xdr, sizeof(stateid_t)); + p = xdr_reserve_space(xdr, NFS4_VERIFIER_SIZE); if (!p) return nfserr_resource; - *p++ = cpu_to_be32(sid->si_generation); - p = xdr_encode_opaque_fixed(p, &sid->si_opaque, - sizeof(stateid_opaque_t)); - return 0; + memcpy(p, verf->data, sizeof(verf->data)); + return nfs_ok; } static __be32 -nfsd4_encode_access(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_access *access) +nfsd4_encode_clientid4(struct xdr_stream *xdr, const clientid_t *clientid) { - struct xdr_stream *xdr = &resp->xdr; __be32 *p; - p = xdr_reserve_space(xdr, 8); + p = xdr_reserve_space(xdr, sizeof(__be64)); if (!p) return nfserr_resource; - *p++ = cpu_to_be32(access->ac_supported); - *p++ = cpu_to_be32(access->ac_resp_access); - return 0; + memcpy(p, clientid, sizeof(*clientid)); + return nfs_ok; } -static __be32 nfsd4_encode_bind_conn_to_session(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_bind_conn_to_session *bcts) +/* This is a frequently-encoded item; open-coded for speed */ +static __be32 +nfsd4_encode_stateid4(struct xdr_stream *xdr, const stateid_t *sid) { - struct xdr_stream *xdr = &resp->xdr; __be32 *p; - p = xdr_reserve_space(xdr, NFS4_MAX_SESSIONID_LEN + 8); + p = xdr_reserve_space(xdr, NFS4_STATEID_SIZE); if (!p) return nfserr_resource; - p = xdr_encode_opaque_fixed(p, bcts->sessionid.data, - NFS4_MAX_SESSIONID_LEN); - *p++ = cpu_to_be32(bcts->dir); - /* Upshifting from TCP to RDMA is not supported */ - *p++ = cpu_to_be32(0); - return 0; + *p++ = cpu_to_be32(sid->si_generation); + memcpy(p, &sid->si_opaque, sizeof(sid->si_opaque)); + return nfs_ok; } static __be32 -nfsd4_encode_close(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_close *close) +nfsd4_encode_sessionid4(struct xdr_stream *xdr, + const struct nfs4_sessionid *sessionid) { - struct xdr_stream *xdr = &resp->xdr; + return nfsd4_encode_opaque_fixed(xdr, sessionid->data, + NFS4_MAX_SESSIONID_LEN); +} - return nfsd4_encode_stateid(xdr, &close->cl_stateid); +static __be32 +nfsd4_encode_access(struct nfsd4_compoundres *resp, __be32 nfserr, + union nfsd4_op_u *u) +{ + struct nfsd4_access *access = &u->access; + struct xdr_stream *xdr = resp->xdr; + __be32 status; + + /* supported */ + status = nfsd4_encode_uint32_t(xdr, access->ac_supported); + if (status != nfs_ok) + return status; + /* access */ + return nfsd4_encode_uint32_t(xdr, access->ac_resp_access); } +static __be32 nfsd4_encode_bind_conn_to_session(struct nfsd4_compoundres *resp, __be32 nfserr, + union nfsd4_op_u *u) +{ + struct nfsd4_bind_conn_to_session *bcts = &u->bind_conn_to_session; + struct xdr_stream *xdr = resp->xdr; + + /* bctsr_sessid */ + nfserr = nfsd4_encode_sessionid4(xdr, &bcts->sessionid); + if (nfserr != nfs_ok) + return nfserr; + /* bctsr_dir */ + if 
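The rd_dircount hint above is charged 4 bytes for the name-length word, the name rounded up to a 4-byte boundary, and 8 bytes for the cookie; a zero hint now means "enforce rd_maxcount only". A simplified accounting sketch (it drops the kernel's allowance for always letting a first oversized entry through):

    #include <stdbool.h>
    #include <stdio.h>
    #include <string.h>

    struct rd_state {
            unsigned int dircount;   /* client's directory-size hint (0 = none) */
            unsigned int maxcount;   /* reply-size budget in bytes              */
    };

    static bool charge_entry(struct rd_state *rd, const char *name)
    {
            unsigned int namlen = (unsigned int)strlen(name);
            unsigned int cost = 4 + ((namlen + 3) & ~3u) + 8;

            if (!rd->dircount)            /* zero hint: only maxcount applies      */
                    return true;
            if (cost > rd->dircount)
                    return false;         /* hint exhausted: stop adding entries   */
            rd->dircount -= cost;
            if (!rd->dircount)
                    rd->maxcount = 0;     /* nothing further fits either way       */
            return true;
    }

    int main(void)
    {
            struct rd_state rd = { .dircount = 40, .maxcount = 4096 };

            printf("%d\n", charge_entry(&rd, "README"));             /* costs 20 */
            printf("%d\n", charge_entry(&rd, "a-much-longer-name")); /* costs 32, exceeds remaining 20 */
            return 0;
    }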
(xdr_stream_encode_u32(xdr, bcts->dir) != XDR_UNIT) + return nfserr_resource; + /* bctsr_use_conn_in_rdma_mode */ + return nfsd4_encode_bool(xdr, false); +} static __be32 -nfsd4_encode_commit(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_commit *commit) +nfsd4_encode_close(struct nfsd4_compoundres *resp, __be32 nfserr, + union nfsd4_op_u *u) { - struct xdr_stream *xdr = &resp->xdr; - __be32 *p; + struct nfsd4_close *close = &u->close; + struct xdr_stream *xdr = resp->xdr; - p = xdr_reserve_space(xdr, NFS4_VERIFIER_SIZE); - if (!p) - return nfserr_resource; - p = xdr_encode_opaque_fixed(p, commit->co_verf.data, - NFS4_VERIFIER_SIZE); - return 0; + /* open_stateid */ + return nfsd4_encode_stateid4(xdr, &close->cl_stateid); } + static __be32 -nfsd4_encode_create(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_create *create) +nfsd4_encode_commit(struct nfsd4_compoundres *resp, __be32 nfserr, + union nfsd4_op_u *u) { - struct xdr_stream *xdr = &resp->xdr; - __be32 *p; + struct nfsd4_commit *commit = &u->commit; - p = xdr_reserve_space(xdr, 20); - if (!p) - return nfserr_resource; - encode_cinfo(p, &create->cr_cinfo); - nfserr = nfsd4_encode_bitmap(xdr, create->cr_bmval[0], - create->cr_bmval[1], create->cr_bmval[2]); - return 0; + return nfsd4_encode_verifier4(resp->xdr, &commit->co_verf); } static __be32 -nfsd4_encode_getattr(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_getattr *getattr) +nfsd4_encode_create(struct nfsd4_compoundres *resp, __be32 nfserr, + union nfsd4_op_u *u) { + struct nfsd4_create *create = &u->create; + struct xdr_stream *xdr = resp->xdr; + + /* cinfo */ + nfserr = nfsd4_encode_change_info4(xdr, &create->cr_cinfo); + if (nfserr) + return nfserr; + /* attrset */ + return nfsd4_encode_bitmap4(xdr, create->cr_bmval[0], + create->cr_bmval[1], create->cr_bmval[2]); +} + +static __be32 +nfsd4_encode_getattr(struct nfsd4_compoundres *resp, __be32 nfserr, + union nfsd4_op_u *u) +{ + struct nfsd4_getattr *getattr = &u->getattr; struct svc_fh *fhp = getattr->ga_fhp; - struct xdr_stream *xdr = &resp->xdr; + struct xdr_stream *xdr = resp->xdr; - return nfsd4_encode_fattr(xdr, fhp, fhp->fh_export, fhp->fh_dentry, - getattr->ga_bmval, resp->rqstp, 0); + /* obj_attributes */ + return nfsd4_encode_fattr4(resp->rqstp, xdr, fhp, fhp->fh_export, + fhp->fh_dentry, getattr->ga_bmval, 0); } static __be32 -nfsd4_encode_getfh(struct nfsd4_compoundres *resp, __be32 nfserr, struct svc_fh **fhpp) +nfsd4_encode_getfh(struct nfsd4_compoundres *resp, __be32 nfserr, + union nfsd4_op_u *u) { - struct xdr_stream *xdr = &resp->xdr; - struct svc_fh *fhp = *fhpp; - unsigned int len; - __be32 *p; + struct xdr_stream *xdr = resp->xdr; + struct svc_fh *fhp = u->getfh; - len = fhp->fh_handle.fh_size; - p = xdr_reserve_space(xdr, len + 4); - if (!p) - return nfserr_resource; - p = xdr_encode_opaque(p, &fhp->fh_handle.fh_base, len); - return 0; + /* object */ + return nfsd4_encode_nfs_fh4(xdr, &fhp->fh_handle); } -/* -* Including all fields other than the name, a LOCK4denied structure requires -* 8(clientid) + 4(namelen) + 8(offset) + 8(length) + 4(type) = 32 bytes. 
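nfsd4_encode_stateid4() above is open-coded because stateids appear in nearly every reply: on the wire they are a fixed 16 bytes, a 4-byte seqid in network order followed by 12 opaque bytes with no length prefix. A standalone sketch of that layout (the struct and buffer handling are invented for illustration):

    #include <arpa/inet.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    struct stateid {
            uint32_t seqid;
            unsigned char other[12];   /* the opaque "other" part */
    };

    static size_t encode_stateid(unsigned char *p, const struct stateid *sid)
    {
            uint32_t seq = htonl(sid->seqid);

            memcpy(p, &seq, 4);            /* 4-byte seqid, big-endian   */
            memcpy(p + 4, sid->other, 12); /* 12 opaque bytes, no length */
            return 16;
    }

    int main(void)
    {
            struct stateid sid = { .seqid = 1, .other = { 0xde, 0xad } };
            unsigned char wire[16];

            printf("encoded %zu bytes\n", encode_stateid(wire, &sid));
            return 0;
    }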
-*/ static __be32 -nfsd4_encode_lock_denied(struct xdr_stream *xdr, struct nfsd4_lock_denied *ld) +nfsd4_encode_lock_owner4(struct xdr_stream *xdr, const clientid_t *clientid, + const struct xdr_netobj *owner) { - struct xdr_netobj *conf = &ld->ld_owner; - __be32 *p; + __be32 status; -again: - p = xdr_reserve_space(xdr, 32 + XDR_LEN(conf->len)); - if (!p) { - /* - * Don't fail to return the result just because we can't - * return the conflicting open: - */ - if (conf->len) { - kfree(conf->data); - conf->len = 0; - conf->data = NULL; - goto again; - } + /* clientid */ + status = nfsd4_encode_clientid4(xdr, clientid); + if (status != nfs_ok) + return status; + /* owner */ + return nfsd4_encode_opaque(xdr, owner->data, owner->len); +} + +static __be32 +nfsd4_encode_lock4denied(struct xdr_stream *xdr, + const struct nfsd4_lock_denied *ld) +{ + __be32 status; + + /* offset */ + status = nfsd4_encode_offset4(xdr, ld->ld_start); + if (status != nfs_ok) + return status; + /* length */ + status = nfsd4_encode_length4(xdr, ld->ld_length); + if (status != nfs_ok) + return status; + /* locktype */ + if (xdr_stream_encode_u32(xdr, ld->ld_type) != XDR_UNIT) return nfserr_resource; - } - p = xdr_encode_hyper(p, ld->ld_start); - p = xdr_encode_hyper(p, ld->ld_length); - *p++ = cpu_to_be32(ld->ld_type); - if (conf->len) { - p = xdr_encode_opaque_fixed(p, &ld->ld_clientid, 8); - p = xdr_encode_opaque(p, conf->data, conf->len); - kfree(conf->data); - } else { /* non - nfsv4 lock in conflict, no clientid nor owner */ - p = xdr_encode_hyper(p, (u64)0); /* clientid */ - *p++ = cpu_to_be32(0); /* length of owner name */ - } - return nfserr_denied; + /* owner */ + return nfsd4_encode_lock_owner4(xdr, &ld->ld_clientid, + &ld->ld_owner); } static __be32 -nfsd4_encode_lock(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_lock *lock) +nfsd4_encode_lock(struct nfsd4_compoundres *resp, __be32 nfserr, + union nfsd4_op_u *u) { - struct xdr_stream *xdr = &resp->xdr; + struct nfsd4_lock *lock = &u->lock; + struct xdr_stream *xdr = resp->xdr; + __be32 status; - if (!nfserr) - nfserr = nfsd4_encode_stateid(xdr, &lock->lk_resp_stateid); - else if (nfserr == nfserr_denied) - nfserr = nfsd4_encode_lock_denied(xdr, &lock->lk_denied); + switch (nfserr) { + case nfs_ok: + /* resok4 */ + status = nfsd4_encode_stateid4(xdr, &lock->lk_resp_stateid); + break; + case nfserr_denied: + /* denied */ + status = nfsd4_encode_lock4denied(xdr, &lock->lk_denied); + break; + default: + return nfserr; + } + return status != nfs_ok ? 
status : nfserr; +} + +static __be32 +nfsd4_encode_lockt(struct nfsd4_compoundres *resp, __be32 nfserr, + union nfsd4_op_u *u) +{ + struct nfsd4_lockt *lockt = &u->lockt; + struct xdr_stream *xdr = resp->xdr; + __be32 status; + if (nfserr == nfserr_denied) { + /* denied */ + status = nfsd4_encode_lock4denied(xdr, &lockt->lt_denied); + if (status != nfs_ok) + return status; + } return nfserr; } static __be32 -nfsd4_encode_lockt(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_lockt *lockt) +nfsd4_encode_locku(struct nfsd4_compoundres *resp, __be32 nfserr, + union nfsd4_op_u *u) { - struct xdr_stream *xdr = &resp->xdr; + struct nfsd4_locku *locku = &u->locku; + struct xdr_stream *xdr = resp->xdr; - if (nfserr == nfserr_denied) - nfsd4_encode_lock_denied(xdr, &lockt->lt_denied); - return nfserr; + /* lock_stateid */ + return nfsd4_encode_stateid4(xdr, &locku->lu_stateid); } + static __be32 -nfsd4_encode_locku(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_locku *locku) +nfsd4_encode_link(struct nfsd4_compoundres *resp, __be32 nfserr, + union nfsd4_op_u *u) { - struct xdr_stream *xdr = &resp->xdr; + struct nfsd4_link *link = &u->link; + struct xdr_stream *xdr = resp->xdr; - return nfsd4_encode_stateid(xdr, &locku->lu_stateid); + return nfsd4_encode_change_info4(xdr, &link->li_cinfo); } +/* + * This implementation does not yet support returning an ACE in an + * OPEN that offers a delegation. + */ +static __be32 +nfsd4_encode_open_nfsace4(struct xdr_stream *xdr) +{ + __be32 status; + + /* type */ + status = nfsd4_encode_acetype4(xdr, NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE); + if (status != nfs_ok) + return nfserr_resource; + /* flag */ + status = nfsd4_encode_aceflag4(xdr, 0); + if (status != nfs_ok) + return nfserr_resource; + /* access mask */ + status = nfsd4_encode_acemask4(xdr, 0); + if (status != nfs_ok) + return nfserr_resource; + /* who - empty for now */ + if (xdr_stream_encode_u32(xdr, 0) != XDR_UNIT) + return nfserr_resource; + return nfs_ok; +} static __be32 -nfsd4_encode_link(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_link *link) +nfsd4_encode_open_read_delegation4(struct xdr_stream *xdr, struct nfsd4_open *open) { - struct xdr_stream *xdr = &resp->xdr; - __be32 *p; + __be32 status; - p = xdr_reserve_space(xdr, 20); - if (!p) + /* stateid */ + status = nfsd4_encode_stateid4(xdr, &open->op_delegate_stateid); + if (status != nfs_ok) + return status; + /* recall */ + status = nfsd4_encode_bool(xdr, open->op_recall); + if (status != nfs_ok) + return status; + /* permissions */ + return nfsd4_encode_open_nfsace4(xdr); +} + +static __be32 +nfsd4_encode_nfs_space_limit4(struct xdr_stream *xdr, u64 filesize) +{ + /* limitby */ + if (xdr_stream_encode_u32(xdr, NFS4_LIMIT_SIZE) != XDR_UNIT) return nfserr_resource; - p = encode_cinfo(p, &link->li_cinfo); - return 0; + /* filesize */ + return nfsd4_encode_uint64_t(xdr, filesize); } +static __be32 +nfsd4_encode_open_write_delegation4(struct xdr_stream *xdr, + struct nfsd4_open *open) +{ + __be32 status; + + /* stateid */ + status = nfsd4_encode_stateid4(xdr, &open->op_delegate_stateid); + if (status != nfs_ok) + return status; + /* recall */ + status = nfsd4_encode_bool(xdr, open->op_recall); + if (status != nfs_ok) + return status; + /* space_limit */ + status = nfsd4_encode_nfs_space_limit4(xdr, 0); + if (status != nfs_ok) + return status; + return nfsd4_encode_open_nfsace4(xdr); +} static __be32 -nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open *open) 
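A write delegation reply above carries a stateid, a recall flag, an nfs_space_limit4 (limitby plus filesize), and a placeholder ACE, all selected by the delegation_type discriminant that the switch further below writes first. A byte-buffer sketch of that discriminant-then-arm shape; the constants and helpers here are illustrative, not the NFSv4 ones:

    #include <arpa/inet.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    enum { DELEG_NONE = 0, DELEG_READ = 1, DELEG_WRITE = 2, LIMIT_SIZE = 1 };

    static unsigned char buf[64];
    static size_t buflen;

    static void put32(uint32_t v) { v = htonl(v); memcpy(buf + buflen, &v, 4); buflen += 4; }
    static void put64(uint64_t v) { put32((uint32_t)(v >> 32)); put32((uint32_t)v); }

    static void encode_write_delegation(uint64_t size_limit)
    {
            put32(DELEG_WRITE);       /* discriminant: delegation_type          */
            /* ... the stateid and recall flag would be encoded here ...        */
            put32(LIMIT_SIZE);        /* nfs_space_limit4.limitby               */
            put64(size_limit);        /* nfs_space_limit4.filesize              */
            /* ... followed by the placeholder ACE ...                          */
    }

    int main(void)
    {
            encode_write_delegation(0);
            printf("encoded %zu bytes\n", buflen);
            return 0;
    }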
+nfsd4_encode_open_none_delegation4(struct xdr_stream *xdr, + struct nfsd4_open *open) { - struct xdr_stream *xdr = &resp->xdr; - __be32 *p; + __be32 status = nfs_ok; - nfserr = nfsd4_encode_stateid(xdr, &open->op_stateid); - if (nfserr) - return nfserr; - p = xdr_reserve_space(xdr, 24); - if (!p) + /* ond_why */ + if (xdr_stream_encode_u32(xdr, open->op_why_no_deleg) != XDR_UNIT) return nfserr_resource; - p = encode_cinfo(p, &open->op_cinfo); - *p++ = cpu_to_be32(open->op_rflags); + switch (open->op_why_no_deleg) { + case WND4_CONTENTION: + /* ond_server_will_push_deleg */ + status = nfsd4_encode_bool(xdr, false); + break; + case WND4_RESOURCE: + /* ond_server_will_signal_avail */ + status = nfsd4_encode_bool(xdr, false); + } + return status; +} - nfserr = nfsd4_encode_bitmap(xdr, open->op_bmval[0], open->op_bmval[1], - open->op_bmval[2]); - if (nfserr) - return nfserr; +static __be32 +nfsd4_encode_open_delegation4(struct xdr_stream *xdr, struct nfsd4_open *open) +{ + __be32 status; - p = xdr_reserve_space(xdr, 4); - if (!p) + /* delegation_type */ + if (xdr_stream_encode_u32(xdr, open->op_delegate_type) != XDR_UNIT) return nfserr_resource; - - *p++ = cpu_to_be32(open->op_delegate_type); switch (open->op_delegate_type) { - case NFS4_OPEN_DELEGATE_NONE: + case OPEN_DELEGATE_NONE: + status = nfs_ok; break; - case NFS4_OPEN_DELEGATE_READ: - nfserr = nfsd4_encode_stateid(xdr, &open->op_delegate_stateid); - if (nfserr) - return nfserr; - p = xdr_reserve_space(xdr, 20); - if (!p) - return nfserr_resource; - *p++ = cpu_to_be32(open->op_recall); - - /* - * TODO: ACE's in delegations - */ - *p++ = cpu_to_be32(NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE); - *p++ = cpu_to_be32(0); - *p++ = cpu_to_be32(0); - *p++ = cpu_to_be32(0); /* XXX: is NULL principal ok? */ + case OPEN_DELEGATE_READ: + case OPEN_DELEGATE_READ_ATTRS_DELEG: + /* read */ + status = nfsd4_encode_open_read_delegation4(xdr, open); break; - case NFS4_OPEN_DELEGATE_WRITE: - nfserr = nfsd4_encode_stateid(xdr, &open->op_delegate_stateid); - if (nfserr) - return nfserr; - p = xdr_reserve_space(xdr, 32); - if (!p) - return nfserr_resource; - *p++ = cpu_to_be32(0); - - /* - * TODO: space_limit's in delegations - */ - *p++ = cpu_to_be32(NFS4_LIMIT_SIZE); - *p++ = cpu_to_be32(~(u32)0); - *p++ = cpu_to_be32(~(u32)0); - - /* - * TODO: ACE's in delegations - */ - *p++ = cpu_to_be32(NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE); - *p++ = cpu_to_be32(0); - *p++ = cpu_to_be32(0); - *p++ = cpu_to_be32(0); /* XXX: is NULL principal ok? 
*/ + case OPEN_DELEGATE_WRITE: + case OPEN_DELEGATE_WRITE_ATTRS_DELEG: + /* write */ + status = nfsd4_encode_open_write_delegation4(xdr, open); break; - case NFS4_OPEN_DELEGATE_NONE_EXT: /* 4.1 */ - switch (open->op_why_no_deleg) { - case WND4_CONTENTION: - case WND4_RESOURCE: - p = xdr_reserve_space(xdr, 8); - if (!p) - return nfserr_resource; - *p++ = cpu_to_be32(open->op_why_no_deleg); - /* deleg signaling not supported yet: */ - *p++ = cpu_to_be32(0); - break; - default: - p = xdr_reserve_space(xdr, 4); - if (!p) - return nfserr_resource; - *p++ = cpu_to_be32(open->op_why_no_deleg); - } + case OPEN_DELEGATE_NONE_EXT: + /* od_whynone */ + status = nfsd4_encode_open_none_delegation4(xdr, open); break; default: - BUG(); + status = nfserr_serverfault; } - /* XXX save filehandle here */ - return 0; + + return status; +} + +static __be32 +nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, + union nfsd4_op_u *u) +{ + struct nfsd4_open *open = &u->open; + struct xdr_stream *xdr = resp->xdr; + + /* stateid */ + nfserr = nfsd4_encode_stateid4(xdr, &open->op_stateid); + if (nfserr != nfs_ok) + return nfserr; + /* cinfo */ + nfserr = nfsd4_encode_change_info4(xdr, &open->op_cinfo); + if (nfserr != nfs_ok) + return nfserr; + /* rflags */ + nfserr = nfsd4_encode_uint32_t(xdr, open->op_rflags); + if (nfserr != nfs_ok) + return nfserr; + /* attrset */ + nfserr = nfsd4_encode_bitmap4(xdr, open->op_bmval[0], + open->op_bmval[1], open->op_bmval[2]); + if (nfserr != nfs_ok) + return nfserr; + /* delegation */ + return nfsd4_encode_open_delegation4(xdr, open); } static __be32 -nfsd4_encode_open_confirm(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open_confirm *oc) +nfsd4_encode_open_confirm(struct nfsd4_compoundres *resp, __be32 nfserr, + union nfsd4_op_u *u) { - struct xdr_stream *xdr = &resp->xdr; + struct nfsd4_open_confirm *oc = &u->open_confirm; + struct xdr_stream *xdr = resp->xdr; - return nfsd4_encode_stateid(xdr, &oc->oc_resp_stateid); + /* open_stateid */ + return nfsd4_encode_stateid4(xdr, &oc->oc_resp_stateid); } static __be32 -nfsd4_encode_open_downgrade(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open_downgrade *od) +nfsd4_encode_open_downgrade(struct nfsd4_compoundres *resp, __be32 nfserr, + union nfsd4_op_u *u) { - struct xdr_stream *xdr = &resp->xdr; + struct nfsd4_open_downgrade *od = &u->open_downgrade; + struct xdr_stream *xdr = resp->xdr; - return nfsd4_encode_stateid(xdr, &od->od_stateid); + /* open_stateid */ + return nfsd4_encode_stateid4(xdr, &od->od_stateid); } +/* + * The operation of this function assumes that this is the only + * READ operation in the COMPOUND. If there are multiple READs, + * we use nfsd4_encode_readv(). + */ static __be32 nfsd4_encode_splice_read( struct nfsd4_compoundres *resp, struct nfsd4_read *read, struct file *file, unsigned long maxcount) { - struct xdr_stream *xdr = &resp->xdr; + struct xdr_stream *xdr = resp->xdr; struct xdr_buf *buf = xdr->buf; - u32 eof; - long len; - int space_left; + int status, space_left; __be32 nfserr; - __be32 *p = xdr->p - 2; - /* Make sure there will be room for padding if needed */ - if (xdr->end - xdr->p < 1) + /* + * Splice read doesn't work if encoding has already wandered + * into the XDR buf's page array. + */ + if (unlikely(xdr->buf->page_len)) { + WARN_ON_ONCE(1); + return nfserr_serverfault; + } + + /* + * Make sure there is room at the end of buf->head for + * svcxdr_encode_opaque_pages() to create a tail buffer + * to XDR-pad the payload. 
+ */ + if (xdr->iov != xdr->buf->head || xdr->end - xdr->p < 1) return nfserr_resource; - len = maxcount; nfserr = nfsd_splice_read(read->rd_rqstp, read->rd_fhp, - file, read->rd_offset, &maxcount); + file, read->rd_offset, &maxcount, + &read->rd_eof); read->rd_length = maxcount; - if (nfserr) { - /* - * nfsd_splice_actor may have already messed with the - * page length; reset it so as not to confuse - * xdr_truncate_encode: - */ - buf->page_len = 0; - return nfserr; - } - - eof = nfsd_eof_on_read(len, maxcount, read->rd_offset, - d_inode(read->rd_fhp->fh_dentry)->i_size); - - *(p++) = htonl(eof); - *(p++) = htonl(maxcount); - - buf->page_len = maxcount; - buf->len += maxcount; - xdr->page_ptr += (buf->page_base + maxcount + PAGE_SIZE - 1) - / PAGE_SIZE; - - /* Use rest of head for padding and remaining ops: */ - buf->tail[0].iov_base = xdr->p; - buf->tail[0].iov_len = 0; - xdr->iov = buf->tail; - if (maxcount&3) { - int pad = 4 - (maxcount&3); - - *(xdr->p++) = 0; - - buf->tail[0].iov_base += maxcount&3; - buf->tail[0].iov_len = pad; - buf->len += pad; + if (nfserr) + goto out_err; + svcxdr_encode_opaque_pages(read->rd_rqstp, xdr, buf->pages, + buf->page_base, maxcount); + status = svc_encode_result_payload(read->rd_rqstp, + buf->head[0].iov_len, maxcount); + if (status) { + nfserr = nfserrno(status); + goto out_err; } + /* + * Prepare to encode subsequent operations. + * + * xdr_truncate_encode() is not safe to use after a successful + * splice read has been done, so the following stream + * manipulations are open-coded. + */ space_left = min_t(int, (void *)xdr->end - (void *)xdr->p, buf->buflen - buf->len); buf->buflen = buf->len + space_left; xdr->end = (__be32 *)((void *)xdr->end + space_left); - return 0; + return nfs_ok; + +out_err: + /* + * nfsd_splice_actor may have already messed with the + * page length; reset it so as not to confuse + * xdr_truncate_encode in our caller. 
+ */ + buf->page_len = 0; + return nfserr; } static __be32 nfsd4_encode_readv(struct nfsd4_compoundres *resp, struct nfsd4_read *read, - struct file *file, unsigned long maxcount) -{ - struct xdr_stream *xdr = &resp->xdr; - u32 eof; - int v; - int starting_len = xdr->buf->len - 8; - long len; - int thislen; + unsigned long maxcount) +{ + struct xdr_stream *xdr = resp->xdr; + unsigned int base = xdr->buf->page_len & ~PAGE_MASK; + unsigned int starting_len = xdr->buf->len; + __be32 zero = xdr_zero; __be32 nfserr; - __be32 tmp; - __be32 *p; - u32 zzz = 0; - int pad; - - len = maxcount; - v = 0; - - thislen = min_t(long, len, ((void *)xdr->end - (void *)xdr->p)); - p = xdr_reserve_space(xdr, (thislen+3)&~3); - WARN_ON_ONCE(!p); - resp->rqstp->rq_vec[v].iov_base = p; - resp->rqstp->rq_vec[v].iov_len = thislen; - v++; - len -= thislen; - - while (len) { - thislen = min_t(long, len, PAGE_SIZE); - p = xdr_reserve_space(xdr, (thislen+3)&~3); - WARN_ON_ONCE(!p); - resp->rqstp->rq_vec[v].iov_base = p; - resp->rqstp->rq_vec[v].iov_len = thislen; - v++; - len -= thislen; - } - read->rd_vlen = v; - len = maxcount; - nfserr = nfsd_readv(resp->rqstp, read->rd_fhp, file, read->rd_offset, - resp->rqstp->rq_vec, read->rd_vlen, &maxcount); + nfserr = nfsd_iter_read(resp->rqstp, read->rd_fhp, read->rd_nf, + read->rd_offset, &maxcount, base, + &read->rd_eof); read->rd_length = maxcount; if (nfserr) return nfserr; - xdr_truncate_encode(xdr, starting_len + 8 + ((maxcount+3)&~3)); - - eof = nfsd_eof_on_read(len, maxcount, read->rd_offset, - d_inode(read->rd_fhp->fh_dentry)->i_size); - tmp = htonl(eof); - write_bytes_to_xdr_buf(xdr->buf, starting_len , &tmp, 4); - tmp = htonl(maxcount); - write_bytes_to_xdr_buf(xdr->buf, starting_len + 4, &tmp, 4); + /* + * svcxdr_encode_opaque_pages() is not used here because + * we don't want to encode subsequent results in this + * COMPOUND into the xdr->buf's tail, but rather those + * results should follow the NFS READ payload in the + * buf's pages. + */ + if (xdr_reserve_space_vec(xdr, maxcount) < 0) + return nfserr_resource; - pad = (maxcount&3) ? 4 - (maxcount&3) : 0; - write_bytes_to_xdr_buf(xdr->buf, starting_len + 8 + maxcount, - &zzz, pad); - return 0; + /* + * Mark the buffer location of the NFS READ payload so that + * direct placement-capable transports send only the + * payload bytes out-of-band. 
+ */ + if (svc_encode_result_payload(resp->rqstp, starting_len, maxcount)) + return nfserr_io; + write_bytes_to_xdr_buf(xdr->buf, starting_len + maxcount, &zero, + xdr_pad_size(maxcount)); + return nfs_ok; } static __be32 nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr, - struct nfsd4_read *read) + union nfsd4_op_u *u) { + struct nfsd4_compoundargs *argp = resp->rqstp->rq_argp; + struct nfsd4_read *read = &u->read; + struct xdr_stream *xdr = resp->xdr; + bool splice_ok = argp->splice_ok; + unsigned int eof_offset; unsigned long maxcount; - struct xdr_stream *xdr = &resp->xdr; - struct file *file = read->rd_filp; - int starting_len = xdr->buf->len; - struct raparms *ra = NULL; - __be32 *p; + __be32 wire_data[2]; + struct file *file; - p = xdr_reserve_space(xdr, 8); /* eof flag and byte count */ - if (!p) { - WARN_ON_ONCE(test_bit(RQ_SPLICE_OK, &resp->rqstp->rq_flags)); - return nfserr_resource; - } - if (resp->xdr.buf->page_len && - test_bit(RQ_SPLICE_OK, &resp->rqstp->rq_flags)) { - WARN_ON_ONCE(1); + if (nfserr) + return nfserr; + + eof_offset = xdr->buf->len; + file = read->rd_nf->nf_file; + + /* Reserve space for the eof flag and byte count */ + if (unlikely(!xdr_reserve_space(xdr, XDR_UNIT * 2))) { + WARN_ON_ONCE(splice_ok); return nfserr_resource; } xdr_commit_encode(xdr); - maxcount = svc_max_payload(resp->rqstp); - maxcount = min_t(unsigned long, maxcount, + maxcount = min_t(unsigned long, read->rd_length, (xdr->buf->buflen - xdr->buf->len)); - maxcount = min_t(unsigned long, maxcount, read->rd_length); - - if (read->rd_tmp_file) - ra = nfsd_init_raparms(file); - if (file->f_op->splice_read && - test_bit(RQ_SPLICE_OK, &resp->rqstp->rq_flags)) + if (file->f_op->splice_read && splice_ok) nfserr = nfsd4_encode_splice_read(resp, read, file, maxcount); else - nfserr = nfsd4_encode_readv(resp, read, file, maxcount); - - if (ra) - nfsd_put_raparams(file, ra); - - if (nfserr) - xdr_truncate_encode(xdr, starting_len); + nfserr = nfsd4_encode_readv(resp, read, maxcount); + if (nfserr) { + xdr_truncate_encode(xdr, eof_offset); + return nfserr; + } - return nfserr; + wire_data[0] = read->rd_eof ? xdr_one : xdr_zero; + wire_data[1] = cpu_to_be32(read->rd_length); + write_bytes_to_xdr_buf(xdr->buf, eof_offset, &wire_data, XDR_UNIT * 2); + return nfs_ok; } static __be32 -nfsd4_encode_readlink(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_readlink *readlink) -{ - int maxcount; - __be32 wire_count; - int zero = 0; - struct xdr_stream *xdr = &resp->xdr; - int length_offset = xdr->buf->len; - __be32 *p; - - p = xdr_reserve_space(xdr, 4); - if (!p) +nfsd4_encode_readlink(struct nfsd4_compoundres *resp, __be32 nfserr, + union nfsd4_op_u *u) +{ + struct nfsd4_readlink *readlink = &u->readlink; + __be32 *p, wire_count, zero = xdr_zero; + struct xdr_stream *xdr = resp->xdr; + unsigned int length_offset; + int maxcount, status; + + /* linktext4.count */ + length_offset = xdr->buf->len; + if (unlikely(!xdr_reserve_space(xdr, XDR_UNIT))) return nfserr_resource; - maxcount = PAGE_SIZE; + /* linktext4.data */ + maxcount = PAGE_SIZE; p = xdr_reserve_space(xdr, maxcount); if (!p) return nfserr_resource; - /* - * XXX: By default, vfs_readlink() will truncate symlinks if they - * would overflow the buffer. Is this kosher in NFSv4? If not, one - * easy fix is: if vfs_readlink() precisely fills the buffer, assume - * that truncation occurred, and return NFS4ERR_RESOURCE. 
- */ nfserr = nfsd_readlink(readlink->rl_rqstp, readlink->rl_fhp, (char *)p, &maxcount); if (nfserr == nfserr_isdir) nfserr = nfserr_inval; - if (nfserr) { - xdr_truncate_encode(xdr, length_offset); - return nfserr; - } + if (nfserr) + goto out_err; + status = svc_encode_result_payload(readlink->rl_rqstp, length_offset, + maxcount); + if (status) { + nfserr = nfserrno(status); + goto out_err; + } + + wire_count = cpu_to_be32(maxcount); + write_bytes_to_xdr_buf(xdr->buf, length_offset, &wire_count, XDR_UNIT); + xdr_truncate_encode(xdr, length_offset + 4 + xdr_align_size(maxcount)); + write_bytes_to_xdr_buf(xdr->buf, length_offset + 4 + maxcount, &zero, + xdr_pad_size(maxcount)); + return nfs_ok; - wire_count = htonl(maxcount); - write_bytes_to_xdr_buf(xdr->buf, length_offset, &wire_count, 4); - xdr_truncate_encode(xdr, length_offset + 4 + ALIGN(maxcount, 4)); - if (maxcount & 3) - write_bytes_to_xdr_buf(xdr->buf, length_offset + 4 + maxcount, - &zero, 4 - (maxcount&3)); - return 0; +out_err: + xdr_truncate_encode(xdr, length_offset); + return nfserr; } -static __be32 -nfsd4_encode_readdir(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_readdir *readdir) +static __be32 nfsd4_encode_dirlist4(struct xdr_stream *xdr, + struct nfsd4_readdir *readdir, + u32 max_payload) { - int maxcount; - int bytes_left; + int bytes_left, maxcount, starting_len = xdr->buf->len; loff_t offset; - __be64 wire_offset; - struct xdr_stream *xdr = &resp->xdr; - int starting_len = xdr->buf->len; - __be32 *p; - - p = xdr_reserve_space(xdr, NFS4_VERIFIER_SIZE); - if (!p) - return nfserr_resource; - - /* XXX: Following NFSv3, we ignore the READDIR verifier for now. */ - *p++ = cpu_to_be32(0); - *p++ = cpu_to_be32(0); - resp->xdr.buf->head[0].iov_len = ((char *)resp->xdr.p) - - (char *)resp->xdr.buf->head[0].iov_base; + __be32 status; /* * Number of bytes left for directory entries allowing for the - * final 8 bytes of the readdir and a following failed op: + * final 8 bytes of the readdir and a following failed op. */ - bytes_left = xdr->buf->buflen - xdr->buf->len - - COMPOUND_ERR_SLACK_SPACE - 8; - if (bytes_left < 0) { - nfserr = nfserr_resource; - goto err_no_verf; - } - maxcount = svc_max_payload(resp->rqstp); - maxcount = min_t(u32, readdir->rd_maxcount, maxcount); + bytes_left = xdr->buf->buflen - xdr->buf->len - + COMPOUND_ERR_SLACK_SPACE - XDR_UNIT * 2; + if (bytes_left < 0) + return nfserr_resource; + maxcount = min_t(u32, readdir->rd_maxcount, max_payload); + /* - * Note the rfc defines rd_maxcount as the size of the - * READDIR4resok structure, which includes the verifier above - * and the 8 bytes encoded at the end of this function: + * The RFC defines rd_maxcount as the size of the + * READDIR4resok structure, which includes the verifier + * and the 8 bytes encoded at the end of this function. 
*/ - if (maxcount < 16) { - nfserr = nfserr_toosmall; - goto err_no_verf; - } - maxcount = min_t(int, maxcount-16, bytes_left); + if (maxcount < XDR_UNIT * 4) + return nfserr_toosmall; + maxcount = min_t(int, maxcount - XDR_UNIT * 4, bytes_left); - /* RFC 3530 14.2.24 allows us to ignore dircount when it's 0: */ + /* RFC 3530 14.2.24 allows us to ignore dircount when it's 0 */ if (!readdir->rd_dircount) - readdir->rd_dircount = svc_max_payload(resp->rqstp); + readdir->rd_dircount = max_payload; + /* *entries */ readdir->xdr = xdr; readdir->rd_maxcount = maxcount; readdir->common.err = 0; readdir->cookie_offset = 0; - offset = readdir->rd_cookie; - nfserr = nfsd_readdir(readdir->rd_rqstp, readdir->rd_fhp, - &offset, - &readdir->common, nfsd4_encode_dirent); - if (nfserr == nfs_ok && - readdir->common.err == nfserr_toosmall && - xdr->buf->len == starting_len + 8) { - /* nothing encoded; which limit did we hit?: */ - if (maxcount - 16 < bytes_left) - /* It was the fault of rd_maxcount: */ - nfserr = nfserr_toosmall; - else - /* We ran out of buffer space: */ - nfserr = nfserr_resource; + status = nfsd_readdir(readdir->rd_rqstp, readdir->rd_fhp, &offset, + &readdir->common, nfsd4_encode_entry4); + if (status) + return status; + if (readdir->common.err == nfserr_toosmall && + xdr->buf->len == starting_len) { + /* No entries were encoded. Which limit did we hit? */ + if (maxcount - XDR_UNIT * 4 < bytes_left) + /* It was the fault of rd_maxcount */ + return nfserr_toosmall; + /* We ran out of buffer space */ + return nfserr_resource; } - if (nfserr) - goto err_no_verf; + /* Encode the final entry's cookie value */ + nfsd4_encode_entry4_nfs_cookie4(readdir, offset); + /* No entries follow */ + if (xdr_stream_encode_item_absent(xdr) != XDR_UNIT) + return nfserr_resource; - if (readdir->cookie_offset) { - wire_offset = cpu_to_be64(offset); - write_bytes_to_xdr_buf(xdr->buf, readdir->cookie_offset, - &wire_offset, 8); - } + /* eof */ + return nfsd4_encode_bool(xdr, readdir->common.err == nfserr_eof); +} - p = xdr_reserve_space(xdr, 8); - if (!p) { - WARN_ON_ONCE(1); - goto err_no_verf; - } - *p++ = 0; /* no more entries */ - *p++ = htonl(readdir->common.err == nfserr_eof); +static __be32 +nfsd4_encode_readdir(struct nfsd4_compoundres *resp, __be32 nfserr, + union nfsd4_op_u *u) +{ + struct nfsd4_readdir *readdir = &u->readdir; + struct xdr_stream *xdr = resp->xdr; + int starting_len = xdr->buf->len; - return 0; -err_no_verf: - xdr_truncate_encode(xdr, starting_len); + /* cookieverf */ + nfserr = nfsd4_encode_verifier4(xdr, &readdir->rd_verf); + if (nfserr != nfs_ok) + return nfserr; + + /* reply */ + nfserr = nfsd4_encode_dirlist4(xdr, readdir, svc_max_payload(resp->rqstp)); + if (nfserr != nfs_ok) + xdr_truncate_encode(xdr, starting_len); return nfserr; } static __be32 -nfsd4_encode_remove(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_remove *remove) +nfsd4_encode_remove(struct nfsd4_compoundres *resp, __be32 nfserr, + union nfsd4_op_u *u) { - struct xdr_stream *xdr = &resp->xdr; - __be32 *p; + struct nfsd4_remove *remove = &u->remove; + struct xdr_stream *xdr = resp->xdr; - p = xdr_reserve_space(xdr, 20); - if (!p) - return nfserr_resource; - p = encode_cinfo(p, &remove->rm_cinfo); - return 0; + return nfsd4_encode_change_info4(xdr, &remove->rm_cinfo); } static __be32 -nfsd4_encode_rename(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_rename *rename) +nfsd4_encode_rename(struct nfsd4_compoundres *resp, __be32 nfserr, + union nfsd4_op_u *u) { - struct xdr_stream *xdr = 
&resp->xdr; - __be32 *p; + struct nfsd4_rename *rename = &u->rename; + struct xdr_stream *xdr = resp->xdr; - p = xdr_reserve_space(xdr, 40); - if (!p) + nfserr = nfsd4_encode_change_info4(xdr, &rename->rn_sinfo); + if (nfserr) + return nfserr; + return nfsd4_encode_change_info4(xdr, &rename->rn_tinfo); +} + +static __be32 +nfsd4_encode_rpcsec_gss_info(struct xdr_stream *xdr, + struct rpcsec_gss_info *info) +{ + __be32 status; + + /* oid */ + if (xdr_stream_encode_opaque(xdr, info->oid.data, info->oid.len) < 0) return nfserr_resource; - p = encode_cinfo(p, &rename->rn_sinfo); - p = encode_cinfo(p, &rename->rn_tinfo); - return 0; + /* qop */ + status = nfsd4_encode_qop4(xdr, info->qop); + if (status != nfs_ok) + return status; + /* service */ + if (xdr_stream_encode_u32(xdr, info->service) != XDR_UNIT) + return nfserr_resource; + + return nfs_ok; +} + +static __be32 +nfsd4_encode_secinfo4(struct xdr_stream *xdr, rpc_authflavor_t pf, + u32 *supported) +{ + struct rpcsec_gss_info info; + __be32 status; + + if (rpcauth_get_gssinfo(pf, &info) == 0) { + (*supported)++; + + /* flavor */ + status = nfsd4_encode_uint32_t(xdr, RPC_AUTH_GSS); + if (status != nfs_ok) + return status; + /* flavor_info */ + status = nfsd4_encode_rpcsec_gss_info(xdr, &info); + if (status != nfs_ok) + return status; + } else if (pf < RPC_AUTH_MAXFLAVOR) { + (*supported)++; + + /* flavor */ + status = nfsd4_encode_uint32_t(xdr, pf); + if (status != nfs_ok) + return status; + } + return nfs_ok; } static __be32 -nfsd4_do_encode_secinfo(struct xdr_stream *xdr, struct svc_export *exp) +nfsd4_encode_SECINFO4resok(struct xdr_stream *xdr, struct svc_export *exp) { u32 i, nflavs, supported; struct exp_flavor_info *flavs; struct exp_flavor_info def_flavs[2]; - __be32 *p, *flavorsp; - static bool report = true; + unsigned int count_offset; + __be32 status, wire_count; if (exp->ex_nflavors) { flavs = exp->ex_flavors; @@ -3810,508 +4780,965 @@ nfsd4_do_encode_secinfo(struct xdr_stream *xdr, struct svc_export *exp) } } - supported = 0; - p = xdr_reserve_space(xdr, 4); - if (!p) + count_offset = xdr->buf->len; + if (unlikely(!xdr_reserve_space(xdr, XDR_UNIT))) return nfserr_resource; - flavorsp = p++; /* to be backfilled later */ - - for (i = 0; i < nflavs; i++) { - rpc_authflavor_t pf = flavs[i].pseudoflavor; - struct rpcsec_gss_info info; - if (rpcauth_get_gssinfo(pf, &info) == 0) { - supported++; - p = xdr_reserve_space(xdr, 4 + 4 + - XDR_LEN(info.oid.len) + 4 + 4); - if (!p) - return nfserr_resource; - *p++ = cpu_to_be32(RPC_AUTH_GSS); - p = xdr_encode_opaque(p, info.oid.data, info.oid.len); - *p++ = cpu_to_be32(info.qop); - *p++ = cpu_to_be32(info.service); - } else if (pf < RPC_AUTH_MAXFLAVOR) { - supported++; - p = xdr_reserve_space(xdr, 4); - if (!p) - return nfserr_resource; - *p++ = cpu_to_be32(pf); - } else { - if (report) - pr_warn("NFS: SECINFO: security flavor %u " - "is not supported\n", pf); - } + for (i = 0, supported = 0; i < nflavs; i++) { + status = nfsd4_encode_secinfo4(xdr, flavs[i].pseudoflavor, + &supported); + if (status != nfs_ok) + return status; } - if (nflavs != supported) - report = false; - *flavorsp = htonl(supported); + wire_count = cpu_to_be32(supported); + write_bytes_to_xdr_buf(xdr->buf, count_offset, &wire_count, + XDR_UNIT); return 0; } static __be32 nfsd4_encode_secinfo(struct nfsd4_compoundres *resp, __be32 nfserr, - struct nfsd4_secinfo *secinfo) + union nfsd4_op_u *u) { - struct xdr_stream *xdr = &resp->xdr; + struct nfsd4_secinfo *secinfo = &u->secinfo; + struct xdr_stream *xdr = resp->xdr; - 
return nfsd4_do_encode_secinfo(xdr, secinfo->si_exp); + return nfsd4_encode_SECINFO4resok(xdr, secinfo->si_exp); } static __be32 nfsd4_encode_secinfo_no_name(struct nfsd4_compoundres *resp, __be32 nfserr, - struct nfsd4_secinfo_no_name *secinfo) + union nfsd4_op_u *u) { - struct xdr_stream *xdr = &resp->xdr; + struct nfsd4_secinfo_no_name *secinfo = &u->secinfo_no_name; + struct xdr_stream *xdr = resp->xdr; - return nfsd4_do_encode_secinfo(xdr, secinfo->sin_exp); + return nfsd4_encode_SECINFO4resok(xdr, secinfo->sin_exp); } -/* - * The SETATTR encode routine is special -- it always encodes a bitmap, - * regardless of the error status. - */ static __be32 -nfsd4_encode_setattr(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_setattr *setattr) +nfsd4_encode_setattr(struct nfsd4_compoundres *resp, __be32 nfserr, + union nfsd4_op_u *u) { - struct xdr_stream *xdr = &resp->xdr; - __be32 *p; + struct nfsd4_setattr *setattr = &u->setattr; + __be32 status; - p = xdr_reserve_space(xdr, 16); - if (!p) - return nfserr_resource; - if (nfserr) { - *p++ = cpu_to_be32(3); - *p++ = cpu_to_be32(0); - *p++ = cpu_to_be32(0); - *p++ = cpu_to_be32(0); - } - else { - *p++ = cpu_to_be32(3); - *p++ = cpu_to_be32(setattr->sa_bmval[0]); - *p++ = cpu_to_be32(setattr->sa_bmval[1]); - *p++ = cpu_to_be32(setattr->sa_bmval[2]); + switch (nfserr) { + case nfs_ok: + /* attrsset */ + status = nfsd4_encode_bitmap4(resp->xdr, setattr->sa_bmval[0], + setattr->sa_bmval[1], + setattr->sa_bmval[2]); + break; + default: + /* attrsset */ + status = nfsd4_encode_bitmap4(resp->xdr, 0, 0, 0); } - return nfserr; + return status != nfs_ok ? status : nfserr; } static __be32 -nfsd4_encode_setclientid(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_setclientid *scd) +nfsd4_encode_setclientid(struct nfsd4_compoundres *resp, __be32 nfserr, + union nfsd4_op_u *u) { - struct xdr_stream *xdr = &resp->xdr; - __be32 *p; + struct nfsd4_setclientid *scd = &u->setclientid; + struct xdr_stream *xdr = resp->xdr; if (!nfserr) { - p = xdr_reserve_space(xdr, 8 + NFS4_VERIFIER_SIZE); - if (!p) - return nfserr_resource; - p = xdr_encode_opaque_fixed(p, &scd->se_clientid, 8); - p = xdr_encode_opaque_fixed(p, &scd->se_confirm, - NFS4_VERIFIER_SIZE); - } - else if (nfserr == nfserr_clid_inuse) { - p = xdr_reserve_space(xdr, 8); - if (!p) - return nfserr_resource; - *p++ = cpu_to_be32(0); - *p++ = cpu_to_be32(0); + nfserr = nfsd4_encode_clientid4(xdr, &scd->se_clientid); + if (nfserr != nfs_ok) + goto out; + nfserr = nfsd4_encode_verifier4(xdr, &scd->se_confirm); + } else if (nfserr == nfserr_clid_inuse) { + /* empty network id */ + if (xdr_stream_encode_u32(xdr, 0) < 0) { + nfserr = nfserr_resource; + goto out; + } + /* empty universal address */ + if (xdr_stream_encode_u32(xdr, 0) < 0) { + nfserr = nfserr_resource; + goto out; + } } +out: return nfserr; } static __be32 -nfsd4_encode_write(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_write *write) +nfsd4_encode_write(struct nfsd4_compoundres *resp, __be32 nfserr, + union nfsd4_op_u *u) { - struct xdr_stream *xdr = &resp->xdr; - __be32 *p; + struct nfsd4_write *write = &u->write; + struct xdr_stream *xdr = resp->xdr; - p = xdr_reserve_space(xdr, 16); - if (!p) + /* count */ + nfserr = nfsd4_encode_count4(xdr, write->wr_bytes_written); + if (nfserr) + return nfserr; + /* committed */ + if (xdr_stream_encode_u32(xdr, write->wr_how_written) != XDR_UNIT) return nfserr_resource; - *p++ = cpu_to_be32(write->wr_bytes_written); - *p++ = cpu_to_be32(write->wr_how_written); - p = 
xdr_encode_opaque_fixed(p, write->wr_verifier.data, - NFS4_VERIFIER_SIZE); - return 0; + /* writeverf */ + return nfsd4_encode_verifier4(xdr, &write->wr_verifier); } static __be32 -nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, __be32 nfserr, - struct nfsd4_exchange_id *exid) +nfsd4_encode_state_protect_ops4(struct xdr_stream *xdr, + struct nfsd4_exchange_id *exid) { - struct xdr_stream *xdr = &resp->xdr; - __be32 *p; - char *major_id; - char *server_scope; - int major_id_sz; - int server_scope_sz; - uint64_t minor_id = 0; - - major_id = utsname()->nodename; - major_id_sz = strlen(major_id); - server_scope = utsname()->nodename; - server_scope_sz = strlen(server_scope); - - p = xdr_reserve_space(xdr, - 8 /* eir_clientid */ + - 4 /* eir_sequenceid */ + - 4 /* eir_flags */ + - 4 /* spr_how */); - if (!p) - return nfserr_resource; + __be32 status; - p = xdr_encode_opaque_fixed(p, &exid->clientid, 8); - *p++ = cpu_to_be32(exid->seqid); - *p++ = cpu_to_be32(exid->flags); + /* spo_must_enforce */ + status = nfsd4_encode_bitmap4(xdr, exid->spo_must_enforce[0], + exid->spo_must_enforce[1], + exid->spo_must_enforce[2]); + if (status != nfs_ok) + return status; + /* spo_must_allow */ + return nfsd4_encode_bitmap4(xdr, exid->spo_must_allow[0], + exid->spo_must_allow[1], + exid->spo_must_allow[2]); +} - *p++ = cpu_to_be32(exid->spa_how); +static __be32 +nfsd4_encode_state_protect4_r(struct xdr_stream *xdr, struct nfsd4_exchange_id *exid) +{ + __be32 status; + if (xdr_stream_encode_u32(xdr, exid->spa_how) != XDR_UNIT) + return nfserr_resource; switch (exid->spa_how) { case SP4_NONE: + status = nfs_ok; break; case SP4_MACH_CRED: - /* spo_must_enforce bitmap: */ - nfserr = nfsd4_encode_bitmap(xdr, - exid->spo_must_enforce[0], - exid->spo_must_enforce[1], - exid->spo_must_enforce[2]); - if (nfserr) - return nfserr; - /* spo_must_allow bitmap: */ - nfserr = nfsd4_encode_bitmap(xdr, - exid->spo_must_allow[0], - exid->spo_must_allow[1], - exid->spo_must_allow[2]); - if (nfserr) - return nfserr; + /* spr_mach_ops */ + status = nfsd4_encode_state_protect_ops4(xdr, exid); break; default: - WARN_ON_ONCE(1); + status = nfserr_serverfault; } + return status; +} - p = xdr_reserve_space(xdr, - 8 /* so_minor_id */ + - 4 /* so_major_id.len */ + - (XDR_QUADLEN(major_id_sz) * 4) + - 4 /* eir_server_scope.len */ + - (XDR_QUADLEN(server_scope_sz) * 4) + - 4 /* eir_server_impl_id.count (0) */); - if (!p) - return nfserr_resource; +static __be32 +nfsd4_encode_server_owner4(struct xdr_stream *xdr, struct svc_rqst *rqstp) +{ + struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); + __be32 status; - /* The server_owner struct */ - p = xdr_encode_hyper(p, minor_id); /* Minor id */ - /* major id */ - p = xdr_encode_opaque(p, major_id, major_id_sz); + /* so_minor_id */ + status = nfsd4_encode_uint64_t(xdr, 0); + if (status != nfs_ok) + return status; + /* so_major_id */ + return nfsd4_encode_opaque(xdr, nn->nfsd_name, strlen(nn->nfsd_name)); +} - /* Server scope */ - p = xdr_encode_opaque(p, server_scope, server_scope_sz); +static __be32 +nfsd4_encode_nfs_impl_id4(struct xdr_stream *xdr, struct nfsd4_exchange_id *exid) +{ + __be32 status; - /* Implementation id */ - *p++ = cpu_to_be32(0); /* zero length nfs_impl_id4 array */ - return 0; + /* nii_domain */ + status = nfsd4_encode_opaque(xdr, exid->nii_domain.data, + exid->nii_domain.len); + if (status != nfs_ok) + return status; + /* nii_name */ + status = nfsd4_encode_opaque(xdr, exid->nii_name.data, + exid->nii_name.len); + if (status != nfs_ok) + return 
status; + /* nii_time */ + return nfsd4_encode_nfstime4(xdr, &exid->nii_time); } static __be32 -nfsd4_encode_create_session(struct nfsd4_compoundres *resp, __be32 nfserr, - struct nfsd4_create_session *sess) +nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, __be32 nfserr, + union nfsd4_op_u *u) { - struct xdr_stream *xdr = &resp->xdr; - __be32 *p; + struct nfsd_net *nn = net_generic(SVC_NET(resp->rqstp), nfsd_net_id); + struct nfsd4_exchange_id *exid = &u->exchange_id; + struct xdr_stream *xdr = resp->xdr; - p = xdr_reserve_space(xdr, 24); - if (!p) + /* eir_clientid */ + nfserr = nfsd4_encode_clientid4(xdr, &exid->clientid); + if (nfserr != nfs_ok) + return nfserr; + /* eir_sequenceid */ + nfserr = nfsd4_encode_sequenceid4(xdr, exid->seqid); + if (nfserr != nfs_ok) + return nfserr; + /* eir_flags */ + nfserr = nfsd4_encode_uint32_t(xdr, exid->flags); + if (nfserr != nfs_ok) + return nfserr; + /* eir_state_protect */ + nfserr = nfsd4_encode_state_protect4_r(xdr, exid); + if (nfserr != nfs_ok) + return nfserr; + /* eir_server_owner */ + nfserr = nfsd4_encode_server_owner4(xdr, resp->rqstp); + if (nfserr != nfs_ok) + return nfserr; + /* eir_server_scope */ + nfserr = nfsd4_encode_opaque(xdr, nn->nfsd_name, + strlen(nn->nfsd_name)); + if (nfserr != nfs_ok) + return nfserr; + /* eir_server_impl_id<1> */ + if (xdr_stream_encode_u32(xdr, 1) != XDR_UNIT) return nfserr_resource; - p = xdr_encode_opaque_fixed(p, sess->sessionid.data, - NFS4_MAX_SESSIONID_LEN); - *p++ = cpu_to_be32(sess->seqid); - *p++ = cpu_to_be32(sess->flags); + nfserr = nfsd4_encode_nfs_impl_id4(xdr, exid); + if (nfserr != nfs_ok) + return nfserr; - p = xdr_reserve_space(xdr, 28); - if (!p) - return nfserr_resource; - *p++ = cpu_to_be32(0); /* headerpadsz */ - *p++ = cpu_to_be32(sess->fore_channel.maxreq_sz); - *p++ = cpu_to_be32(sess->fore_channel.maxresp_sz); - *p++ = cpu_to_be32(sess->fore_channel.maxresp_cached); - *p++ = cpu_to_be32(sess->fore_channel.maxops); - *p++ = cpu_to_be32(sess->fore_channel.maxreqs); - *p++ = cpu_to_be32(sess->fore_channel.nr_rdma_attrs); - - if (sess->fore_channel.nr_rdma_attrs) { - p = xdr_reserve_space(xdr, 4); - if (!p) - return nfserr_resource; - *p++ = cpu_to_be32(sess->fore_channel.rdma_attrs); - } + return nfs_ok; +} - p = xdr_reserve_space(xdr, 28); - if (!p) +static __be32 +nfsd4_encode_channel_attrs4(struct xdr_stream *xdr, + const struct nfsd4_channel_attrs *attrs) +{ + __be32 status; + + /* ca_headerpadsize */ + status = nfsd4_encode_count4(xdr, 0); + if (status != nfs_ok) + return status; + /* ca_maxrequestsize */ + status = nfsd4_encode_count4(xdr, attrs->maxreq_sz); + if (status != nfs_ok) + return status; + /* ca_maxresponsesize */ + status = nfsd4_encode_count4(xdr, attrs->maxresp_sz); + if (status != nfs_ok) + return status; + /* ca_maxresponsesize_cached */ + status = nfsd4_encode_count4(xdr, attrs->maxresp_cached); + if (status != nfs_ok) + return status; + /* ca_maxoperations */ + status = nfsd4_encode_count4(xdr, attrs->maxops); + if (status != nfs_ok) + return status; + /* ca_maxrequests */ + status = nfsd4_encode_count4(xdr, attrs->maxreqs); + if (status != nfs_ok) + return status; + /* ca_rdma_ird<1> */ + if (xdr_stream_encode_u32(xdr, attrs->nr_rdma_attrs) != XDR_UNIT) return nfserr_resource; - *p++ = cpu_to_be32(0); /* headerpadsz */ - *p++ = cpu_to_be32(sess->back_channel.maxreq_sz); - *p++ = cpu_to_be32(sess->back_channel.maxresp_sz); - *p++ = cpu_to_be32(sess->back_channel.maxresp_cached); - *p++ = cpu_to_be32(sess->back_channel.maxops); - *p++ = 
cpu_to_be32(sess->back_channel.maxreqs); - *p++ = cpu_to_be32(sess->back_channel.nr_rdma_attrs); - - if (sess->back_channel.nr_rdma_attrs) { - p = xdr_reserve_space(xdr, 4); - if (!p) - return nfserr_resource; - *p++ = cpu_to_be32(sess->back_channel.rdma_attrs); - } - return 0; + if (attrs->nr_rdma_attrs) + return nfsd4_encode_uint32_t(xdr, attrs->rdma_attrs); + return nfs_ok; +} + +static __be32 +nfsd4_encode_create_session(struct nfsd4_compoundres *resp, __be32 nfserr, + union nfsd4_op_u *u) +{ + struct nfsd4_create_session *sess = &u->create_session; + struct xdr_stream *xdr = resp->xdr; + + /* csr_sessionid */ + nfserr = nfsd4_encode_sessionid4(xdr, &sess->sessionid); + if (nfserr != nfs_ok) + return nfserr; + /* csr_sequence */ + nfserr = nfsd4_encode_sequenceid4(xdr, sess->seqid); + if (nfserr != nfs_ok) + return nfserr; + /* csr_flags */ + nfserr = nfsd4_encode_uint32_t(xdr, sess->flags); + if (nfserr != nfs_ok) + return nfserr; + /* csr_fore_chan_attrs */ + nfserr = nfsd4_encode_channel_attrs4(xdr, &sess->fore_channel); + if (nfserr != nfs_ok) + return nfserr; + /* csr_back_chan_attrs */ + return nfsd4_encode_channel_attrs4(xdr, &sess->back_channel); } static __be32 nfsd4_encode_sequence(struct nfsd4_compoundres *resp, __be32 nfserr, - struct nfsd4_sequence *seq) + union nfsd4_op_u *u) { - struct xdr_stream *xdr = &resp->xdr; - __be32 *p; + struct nfsd4_sequence *seq = &u->sequence; + struct xdr_stream *xdr = resp->xdr; - p = xdr_reserve_space(xdr, NFS4_MAX_SESSIONID_LEN + 20); - if (!p) - return nfserr_resource; - p = xdr_encode_opaque_fixed(p, seq->sessionid.data, - NFS4_MAX_SESSIONID_LEN); - *p++ = cpu_to_be32(seq->seqid); - *p++ = cpu_to_be32(seq->slotid); + /* sr_sessionid */ + nfserr = nfsd4_encode_sessionid4(xdr, &seq->sessionid); + if (nfserr != nfs_ok) + return nfserr; + /* sr_sequenceid */ + nfserr = nfsd4_encode_sequenceid4(xdr, seq->seqid); + if (nfserr != nfs_ok) + return nfserr; + /* sr_slotid */ + nfserr = nfsd4_encode_slotid4(xdr, seq->slotid); + if (nfserr != nfs_ok) + return nfserr; /* Note slotid's are numbered from zero: */ - *p++ = cpu_to_be32(seq->maxslots - 1); /* sr_highest_slotid */ - *p++ = cpu_to_be32(seq->maxslots - 1); /* sr_target_highest_slotid */ - *p++ = cpu_to_be32(seq->status_flags); + /* sr_highest_slotid */ + nfserr = nfsd4_encode_slotid4(xdr, seq->maxslots_response - 1); + if (nfserr != nfs_ok) + return nfserr; + /* sr_target_highest_slotid */ + nfserr = nfsd4_encode_slotid4(xdr, seq->target_maxslots - 1); + if (nfserr != nfs_ok) + return nfserr; + /* sr_status_flags */ + nfserr = nfsd4_encode_uint32_t(xdr, seq->status_flags); + if (nfserr != nfs_ok) + return nfserr; resp->cstate.data_offset = xdr->buf->len; /* DRC cache data pointer */ - return 0; + return nfs_ok; } static __be32 nfsd4_encode_test_stateid(struct nfsd4_compoundres *resp, __be32 nfserr, - struct nfsd4_test_stateid *test_stateid) + union nfsd4_op_u *u) { - struct xdr_stream *xdr = &resp->xdr; + struct nfsd4_test_stateid *test_stateid = &u->test_stateid; struct nfsd4_test_stateid_id *stateid, *next; - __be32 *p; + struct xdr_stream *xdr = resp->xdr; - p = xdr_reserve_space(xdr, 4 + (4 * test_stateid->ts_num_ids)); - if (!p) + /* tsr_status_codes<> */ + if (xdr_stream_encode_u32(xdr, test_stateid->ts_num_ids) != XDR_UNIT) return nfserr_resource; - *p++ = htonl(test_stateid->ts_num_ids); - - list_for_each_entry_safe(stateid, next, &test_stateid->ts_stateid_list, ts_id_list) { - *p++ = stateid->ts_id_status; + list_for_each_entry_safe(stateid, next, + &test_stateid->ts_stateid_list, 
ts_id_list) { + if (xdr_stream_encode_be32(xdr, stateid->ts_id_status) != XDR_UNIT) + return nfserr_resource; } + return nfs_ok; +} - return 0; +static __be32 +nfsd4_encode_get_dir_delegation(struct nfsd4_compoundres *resp, __be32 nfserr, + union nfsd4_op_u *u) +{ + struct nfsd4_get_dir_delegation *gdd = &u->get_dir_delegation; + struct xdr_stream *xdr = resp->xdr; + __be32 status = nfserr_resource; + + switch(gdd->gddrnf_status) { + case GDD4_OK: + if (xdr_stream_encode_u32(xdr, GDD4_OK) != XDR_UNIT) + break; + status = nfsd4_encode_verifier4(xdr, &gdd->gddr_cookieverf); + if (status) + break; + status = nfsd4_encode_stateid4(xdr, &gdd->gddr_stateid); + if (status) + break; + status = nfsd4_encode_bitmap4(xdr, gdd->gddr_notification[0], 0, 0); + if (status) + break; + status = nfsd4_encode_bitmap4(xdr, gdd->gddr_child_attributes[0], + gdd->gddr_child_attributes[1], + gdd->gddr_child_attributes[2]); + if (status) + break; + status = nfsd4_encode_bitmap4(xdr, gdd->gddr_dir_attributes[0], + gdd->gddr_dir_attributes[1], + gdd->gddr_dir_attributes[2]); + break; + default: + pr_warn("nfsd: bad gddrnf_status (%u)\n", gdd->gddrnf_status); + gdd->gddrnf_will_signal_deleg_avail = 0; + fallthrough; + case GDD4_UNAVAIL: + if (xdr_stream_encode_u32(xdr, GDD4_UNAVAIL) != XDR_UNIT) + break; + status = nfsd4_encode_bool(xdr, gdd->gddrnf_will_signal_deleg_avail); + break; + } + return status; } #ifdef CONFIG_NFSD_PNFS static __be32 -nfsd4_encode_getdeviceinfo(struct nfsd4_compoundres *resp, __be32 nfserr, - struct nfsd4_getdeviceinfo *gdev) +nfsd4_encode_device_addr4(struct xdr_stream *xdr, + const struct nfsd4_getdeviceinfo *gdev) { - struct xdr_stream *xdr = &resp->xdr; + u32 needed_len, starting_len = xdr->buf->len; const struct nfsd4_layout_ops *ops; - u32 starting_len = xdr->buf->len, needed_len; - __be32 *p; + __be32 status; - p = xdr_reserve_space(xdr, 4); - if (!p) + /* da_layout_type */ + if (xdr_stream_encode_u32(xdr, gdev->gd_layout_type) != XDR_UNIT) return nfserr_resource; - - *p++ = cpu_to_be32(gdev->gd_layout_type); - - /* If maxcount is 0 then just update notifications */ - if (gdev->gd_maxcount != 0) { - ops = nfsd4_layout_ops[gdev->gd_layout_type]; - nfserr = ops->encode_getdeviceinfo(xdr, gdev); - if (nfserr) { - /* - * We don't bother to burden the layout drivers with - * enforcing gd_maxcount, just tell the client to - * come back with a bigger buffer if it's not enough. - */ - if (xdr->buf->len + 4 > gdev->gd_maxcount) - goto toosmall; - return nfserr; - } + /* da_addr_body */ + ops = nfsd4_layout_ops[gdev->gd_layout_type]; + status = ops->encode_getdeviceinfo(xdr, gdev); + if (status != nfs_ok) { + /* + * Don't burden the layout drivers with enforcing + * gd_maxcount. Just tell the client to come back + * with a bigger buffer if it's not enough. 
+ */ + if (xdr->buf->len + XDR_UNIT > gdev->gd_maxcount) + goto toosmall; + return status; } - if (gdev->gd_notify_types) { - p = xdr_reserve_space(xdr, 4 + 4); - if (!p) - return nfserr_resource; - *p++ = cpu_to_be32(1); /* bitmap length */ - *p++ = cpu_to_be32(gdev->gd_notify_types); - } else { - p = xdr_reserve_space(xdr, 4); - if (!p) - return nfserr_resource; - *p++ = 0; - } + return nfs_ok; - return 0; toosmall: - dprintk("%s: maxcount too small\n", __func__); - needed_len = xdr->buf->len + 4 /* notifications */; + needed_len = xdr->buf->len + XDR_UNIT; /* notifications */ xdr_truncate_encode(xdr, starting_len); - p = xdr_reserve_space(xdr, 4); - if (!p) - return nfserr_resource; - *p++ = cpu_to_be32(needed_len); + + status = nfsd4_encode_count4(xdr, needed_len); + if (status != nfs_ok) + return status; return nfserr_toosmall; } static __be32 -nfsd4_encode_layoutget(struct nfsd4_compoundres *resp, __be32 nfserr, - struct nfsd4_layoutget *lgp) +nfsd4_encode_getdeviceinfo(struct nfsd4_compoundres *resp, __be32 nfserr, + union nfsd4_op_u *u) { - struct xdr_stream *xdr = &resp->xdr; - const struct nfsd4_layout_ops *ops; - __be32 *p; + struct nfsd4_getdeviceinfo *gdev = &u->getdeviceinfo; + struct xdr_stream *xdr = resp->xdr; - p = xdr_reserve_space(xdr, 36 + sizeof(stateid_opaque_t)); - if (!p) - return nfserr_resource; - - *p++ = cpu_to_be32(1); /* we always set return-on-close */ - *p++ = cpu_to_be32(lgp->lg_sid.si_generation); - p = xdr_encode_opaque_fixed(p, &lgp->lg_sid.si_opaque, - sizeof(stateid_opaque_t)); + /* gdir_device_addr */ + nfserr = nfsd4_encode_device_addr4(xdr, gdev); + if (nfserr) + return nfserr; + /* gdir_notification */ + return nfsd4_encode_bitmap4(xdr, gdev->gd_notify_types, 0, 0); +} - *p++ = cpu_to_be32(1); /* we always return a single layout */ - p = xdr_encode_hyper(p, lgp->lg_seg.offset); - p = xdr_encode_hyper(p, lgp->lg_seg.length); - *p++ = cpu_to_be32(lgp->lg_seg.iomode); - *p++ = cpu_to_be32(lgp->lg_layout_type); +static __be32 +nfsd4_encode_layout4(struct xdr_stream *xdr, const struct nfsd4_layoutget *lgp) +{ + const struct nfsd4_layout_ops *ops = nfsd4_layout_ops[lgp->lg_layout_type]; + __be32 status; - ops = nfsd4_layout_ops[lgp->lg_layout_type]; + /* lo_offset */ + status = nfsd4_encode_offset4(xdr, lgp->lg_seg.offset); + if (status != nfs_ok) + return status; + /* lo_length */ + status = nfsd4_encode_length4(xdr, lgp->lg_seg.length); + if (status != nfs_ok) + return status; + /* lo_iomode */ + if (xdr_stream_encode_u32(xdr, lgp->lg_seg.iomode) != XDR_UNIT) + return nfserr_resource; + /* lo_content */ + if (xdr_stream_encode_u32(xdr, lgp->lg_layout_type) != XDR_UNIT) + return nfserr_resource; return ops->encode_layoutget(xdr, lgp); } static __be32 -nfsd4_encode_layoutcommit(struct nfsd4_compoundres *resp, __be32 nfserr, - struct nfsd4_layoutcommit *lcp) +nfsd4_encode_layoutget(struct nfsd4_compoundres *resp, __be32 nfserr, + union nfsd4_op_u *u) { - struct xdr_stream *xdr = &resp->xdr; - __be32 *p; + struct nfsd4_layoutget *lgp = &u->layoutget; + struct xdr_stream *xdr = resp->xdr; - p = xdr_reserve_space(xdr, 4); - if (!p) + /* logr_return_on_close */ + nfserr = nfsd4_encode_bool(xdr, true); + if (nfserr != nfs_ok) + return nfserr; + /* logr_stateid */ + nfserr = nfsd4_encode_stateid4(xdr, &lgp->lg_sid); + if (nfserr != nfs_ok) + return nfserr; + /* logr_layout<> */ + if (xdr_stream_encode_u32(xdr, 1) != XDR_UNIT) return nfserr_resource; - *p++ = cpu_to_be32(lcp->lc_size_chg); - if (lcp->lc_size_chg) { - p = xdr_reserve_space(xdr, 8); - if (!p) - 
return nfserr_resource; - p = xdr_encode_hyper(p, lcp->lc_newsize); - } + return nfsd4_encode_layout4(xdr, lgp); +} - return 0; +static __be32 +nfsd4_encode_layoutcommit(struct nfsd4_compoundres *resp, __be32 nfserr, + union nfsd4_op_u *u) +{ + struct nfsd4_layoutcommit *lcp = &u->layoutcommit; + struct xdr_stream *xdr = resp->xdr; + + /* ns_sizechanged */ + nfserr = nfsd4_encode_bool(xdr, lcp->lc_size_chg); + if (nfserr != nfs_ok) + return nfserr; + if (lcp->lc_size_chg) + /* ns_size */ + return nfsd4_encode_length4(xdr, lcp->lc_newsize); + return nfs_ok; } static __be32 nfsd4_encode_layoutreturn(struct nfsd4_compoundres *resp, __be32 nfserr, - struct nfsd4_layoutreturn *lrp) + union nfsd4_op_u *u) { - struct xdr_stream *xdr = &resp->xdr; - __be32 *p; + struct nfsd4_layoutreturn *lrp = &u->layoutreturn; + struct xdr_stream *xdr = resp->xdr; - p = xdr_reserve_space(xdr, 4); - if (!p) - return nfserr_resource; - *p++ = cpu_to_be32(lrp->lrs_present); + /* lrs_present */ + nfserr = nfsd4_encode_bool(xdr, lrp->lrs_present); + if (nfserr != nfs_ok) + return nfserr; if (lrp->lrs_present) - return nfsd4_encode_stateid(xdr, &lrp->lr_sid); - return 0; + /* lrs_stateid */ + return nfsd4_encode_stateid4(xdr, &lrp->lr_sid); + return nfs_ok; } #endif /* CONFIG_NFSD_PNFS */ static __be32 -nfsd42_encode_write_res(struct nfsd4_compoundres *resp, - struct nfsd42_write_res *write, bool sync) +nfsd4_encode_write_response4(struct xdr_stream *xdr, + const struct nfsd4_copy *copy) { - __be32 *p; - p = xdr_reserve_space(&resp->xdr, 4); - if (!p) - return nfserr_resource; + const struct nfsd42_write_res *write = ©->cp_res; + u32 count = nfsd4_copy_is_sync(copy) ? 0 : 1; + __be32 status; - if (sync) - *p++ = cpu_to_be32(0); - else { - __be32 nfserr; - *p++ = cpu_to_be32(1); - nfserr = nfsd4_encode_stateid(&resp->xdr, &write->cb_stateid); - if (nfserr) - return nfserr; + /* wr_callback_id<1> */ + if (xdr_stream_encode_u32(xdr, count) != XDR_UNIT) + return nfserr_resource; + if (count) { + status = nfsd4_encode_stateid4(xdr, &write->cb_stateid); + if (status != nfs_ok) + return status; } - p = xdr_reserve_space(&resp->xdr, 8 + 4 + NFS4_VERIFIER_SIZE); - if (!p) + + /* wr_count */ + status = nfsd4_encode_length4(xdr, write->wr_bytes_written); + if (status != nfs_ok) + return status; + /* wr_committed */ + if (xdr_stream_encode_u32(xdr, write->wr_stable_how) != XDR_UNIT) return nfserr_resource; + /* wr_writeverf */ + return nfsd4_encode_verifier4(xdr, &write->wr_verifier); +} - p = xdr_encode_hyper(p, write->wr_bytes_written); - *p++ = cpu_to_be32(write->wr_stable_how); - p = xdr_encode_opaque_fixed(p, write->wr_verifier.data, - NFS4_VERIFIER_SIZE); - return nfs_ok; +static __be32 nfsd4_encode_copy_requirements4(struct xdr_stream *xdr, + const struct nfsd4_copy *copy) +{ + __be32 status; + + /* cr_consecutive */ + status = nfsd4_encode_bool(xdr, true); + if (status != nfs_ok) + return status; + /* cr_synchronous */ + return nfsd4_encode_bool(xdr, nfsd4_copy_is_sync(copy)); } static __be32 nfsd4_encode_copy(struct nfsd4_compoundres *resp, __be32 nfserr, - struct nfsd4_copy *copy) + union nfsd4_op_u *u) { - __be32 *p; + struct nfsd4_copy *copy = &u->copy; - nfserr = nfsd42_encode_write_res(resp, ©->cp_res, - copy->cp_synchronous); - if (nfserr) + nfserr = nfsd4_encode_write_response4(resp->xdr, copy); + if (nfserr != nfs_ok) return nfserr; + return nfsd4_encode_copy_requirements4(resp->xdr, copy); +} - p = xdr_reserve_space(&resp->xdr, 4 + 4); - *p++ = xdr_one; /* cr_consecutive */ - *p++ = 
cpu_to_be32(copy->cp_synchronous); - return 0; +static __be32 +nfsd4_encode_netloc4(struct xdr_stream *xdr, const struct nl4_server *ns) +{ + __be32 status; + + if (xdr_stream_encode_u32(xdr, ns->nl4_type) != XDR_UNIT) + return nfserr_resource; + switch (ns->nl4_type) { + case NL4_NETADDR: + /* nl_addr */ + status = nfsd4_encode_netaddr4(xdr, &ns->u.nl4_addr); + break; + default: + status = nfserr_serverfault; + } + return status; +} + +static __be32 +nfsd4_encode_copy_notify(struct nfsd4_compoundres *resp, __be32 nfserr, + union nfsd4_op_u *u) +{ + struct nfsd4_copy_notify *cn = &u->copy_notify; + struct xdr_stream *xdr = resp->xdr; + + /* cnr_lease_time */ + nfserr = nfsd4_encode_nfstime4(xdr, &cn->cpn_lease_time); + if (nfserr) + return nfserr; + /* cnr_stateid */ + nfserr = nfsd4_encode_stateid4(xdr, &cn->cpn_cnr_stateid); + if (nfserr) + return nfserr; + /* cnr_source_server<> */ + if (xdr_stream_encode_u32(xdr, 1) != XDR_UNIT) + return nfserr_resource; + return nfsd4_encode_netloc4(xdr, cn->cpn_src); } static __be32 nfsd4_encode_offload_status(struct nfsd4_compoundres *resp, __be32 nfserr, - struct nfsd4_offload_status *os) + union nfsd4_op_u *u) { - struct xdr_stream *xdr = &resp->xdr; - __be32 *p; + struct nfsd4_offload_status *os = &u->offload_status; + struct xdr_stream *xdr = resp->xdr; - p = xdr_reserve_space(xdr, 8 + 4); - if (!p) + /* osr_count */ + nfserr = nfsd4_encode_length4(xdr, os->count); + if (nfserr != nfs_ok) + return nfserr; + /* osr_complete<1> */ + if (os->completed) { + if (xdr_stream_encode_u32(xdr, 1) != XDR_UNIT) + return nfserr_resource; + if (xdr_stream_encode_be32(xdr, os->status) != XDR_UNIT) + return nfserr_resource; + } else if (xdr_stream_encode_u32(xdr, 0) != XDR_UNIT) return nfserr_resource; - p = xdr_encode_hyper(p, os->count); - *p++ = cpu_to_be32(0); + return nfs_ok; +} +static __be32 +nfsd4_encode_read_plus_data(struct nfsd4_compoundres *resp, + struct nfsd4_read *read) +{ + struct nfsd4_compoundargs *argp = resp->rqstp->rq_argp; + struct file *file = read->rd_nf->nf_file; + struct xdr_stream *xdr = resp->xdr; + bool splice_ok = argp->splice_ok; + unsigned int offset_offset; + __be32 nfserr, wire_count; + unsigned long maxcount; + __be64 wire_offset; + + if (xdr_stream_encode_u32(xdr, NFS4_CONTENT_DATA) != XDR_UNIT) + return nfserr_io; + + offset_offset = xdr->buf->len; + + /* Reserve space for the byte offset and count */ + if (unlikely(!xdr_reserve_space(xdr, XDR_UNIT * 3))) + return nfserr_io; + xdr_commit_encode(xdr); + + maxcount = min_t(unsigned long, read->rd_length, + (xdr->buf->buflen - xdr->buf->len)); + + if (file->f_op->splice_read && splice_ok) + nfserr = nfsd4_encode_splice_read(resp, read, file, maxcount); + else + nfserr = nfsd4_encode_readv(resp, read, maxcount); + if (nfserr) + return nfserr; + + wire_offset = cpu_to_be64(read->rd_offset); + write_bytes_to_xdr_buf(xdr->buf, offset_offset, &wire_offset, + XDR_UNIT * 2); + wire_count = cpu_to_be32(read->rd_length); + write_bytes_to_xdr_buf(xdr->buf, offset_offset + XDR_UNIT * 2, + &wire_count, XDR_UNIT); + return nfs_ok; +} + +static __be32 +nfsd4_encode_read_plus(struct nfsd4_compoundres *resp, __be32 nfserr, + union nfsd4_op_u *u) +{ + struct nfsd4_read *read = &u->read; + struct file *file = read->rd_nf->nf_file; + struct xdr_stream *xdr = resp->xdr; + unsigned int eof_offset; + __be32 wire_data[2]; + u32 segments = 0; + + if (nfserr) + return nfserr; + + eof_offset = xdr->buf->len; + + /* Reserve space for the eof flag and segment count */ + if 
(unlikely(!xdr_reserve_space(xdr, XDR_UNIT * 2))) + return nfserr_io; + xdr_commit_encode(xdr); + + read->rd_eof = read->rd_offset >= i_size_read(file_inode(file)); + if (read->rd_eof) + goto out; + + nfserr = nfsd4_encode_read_plus_data(resp, read); + if (nfserr) { + xdr_truncate_encode(xdr, eof_offset); + return nfserr; + } + + segments++; + +out: + wire_data[0] = read->rd_eof ? xdr_one : xdr_zero; + wire_data[1] = cpu_to_be32(segments); + write_bytes_to_xdr_buf(xdr->buf, eof_offset, &wire_data, XDR_UNIT * 2); return nfserr; } static __be32 nfsd4_encode_seek(struct nfsd4_compoundres *resp, __be32 nfserr, - struct nfsd4_seek *seek) + union nfsd4_op_u *u) +{ + struct nfsd4_seek *seek = &u->seek; + struct xdr_stream *xdr = resp->xdr; + + /* sr_eof */ + nfserr = nfsd4_encode_bool(xdr, seek->seek_eof); + if (nfserr != nfs_ok) + return nfserr; + /* sr_offset */ + return nfsd4_encode_offset4(xdr, seek->seek_pos); +} + +static __be32 +nfsd4_encode_noop(struct nfsd4_compoundres *resp, __be32 nfserr, + union nfsd4_op_u *p) +{ + return nfserr; +} + +/* + * Encode kmalloc-ed buffer in to XDR stream. + */ +static __be32 +nfsd4_vbuf_to_stream(struct xdr_stream *xdr, char *buf, u32 buflen) { + u32 cplen; __be32 *p; - p = xdr_reserve_space(&resp->xdr, 4 + 8); - *p++ = cpu_to_be32(seek->seek_eof); - p = xdr_encode_hyper(p, seek->seek_pos); + cplen = min_t(unsigned long, buflen, + ((void *)xdr->end - (void *)xdr->p)); + p = xdr_reserve_space(xdr, cplen); + if (!p) + return nfserr_resource; + + memcpy(p, buf, cplen); + buf += cplen; + buflen -= cplen; + + while (buflen) { + cplen = min_t(u32, buflen, PAGE_SIZE); + p = xdr_reserve_space(xdr, cplen); + if (!p) + return nfserr_resource; + + memcpy(p, buf, cplen); + + if (cplen < PAGE_SIZE) { + /* + * We're done, with a length that wasn't page + * aligned, so possibly not word aligned. Pad + * any trailing bytes with 0. + */ + xdr_encode_opaque_fixed(p, NULL, cplen); + break; + } + + buflen -= PAGE_SIZE; + buf += PAGE_SIZE; + } return 0; } static __be32 -nfsd4_encode_noop(struct nfsd4_compoundres *resp, __be32 nfserr, void *p) +nfsd4_encode_getxattr(struct nfsd4_compoundres *resp, __be32 nfserr, + union nfsd4_op_u *u) { - return nfserr; + struct nfsd4_getxattr *getxattr = &u->getxattr; + struct xdr_stream *xdr = resp->xdr; + __be32 *p, err; + + p = xdr_reserve_space(xdr, 4); + if (!p) + return nfserr_resource; + + *p = cpu_to_be32(getxattr->getxa_len); + + if (getxattr->getxa_len == 0) + return 0; + + err = nfsd4_vbuf_to_stream(xdr, getxattr->getxa_buf, + getxattr->getxa_len); + + kvfree(getxattr->getxa_buf); + + return err; +} + +static __be32 +nfsd4_encode_setxattr(struct nfsd4_compoundres *resp, __be32 nfserr, + union nfsd4_op_u *u) +{ + struct nfsd4_setxattr *setxattr = &u->setxattr; + struct xdr_stream *xdr = resp->xdr; + + return nfsd4_encode_change_info4(xdr, &setxattr->setxa_cinfo); +} + +/* + * See if there are cookie values that can be rejected outright. + */ +static __be32 +nfsd4_listxattr_validate_cookie(struct nfsd4_listxattrs *listxattrs, + u32 *offsetp) +{ + u64 cookie = listxattrs->lsxa_cookie; + + /* + * If the cookie is larger than the maximum number we can fit + * in the buffer we just got back from vfs_listxattr, it's invalid. 
+ */ + if (cookie > (listxattrs->lsxa_len) / (XATTR_USER_PREFIX_LEN + 2)) + return nfserr_badcookie; + + *offsetp = (u32)cookie; + return 0; +} + +static __be32 +nfsd4_encode_listxattrs(struct nfsd4_compoundres *resp, __be32 nfserr, + union nfsd4_op_u *u) +{ + struct nfsd4_listxattrs *listxattrs = &u->listxattrs; + struct xdr_stream *xdr = resp->xdr; + u32 cookie_offset, count_offset, eof; + u32 left, xdrleft, slen, count; + u32 xdrlen, offset; + u64 cookie; + char *sp; + __be32 status, tmp; + __be64 wire_cookie; + __be32 *p; + u32 nuser; + + eof = 1; + + status = nfsd4_listxattr_validate_cookie(listxattrs, &offset); + if (status) + goto out; + + /* + * Reserve space for the cookie and the name array count. Record + * the offsets to save them later. + */ + cookie_offset = xdr->buf->len; + count_offset = cookie_offset + 8; + p = xdr_reserve_space(xdr, XDR_UNIT * 3); + if (!p) { + status = nfserr_resource; + goto out; + } + + count = 0; + left = listxattrs->lsxa_len; + sp = listxattrs->lsxa_buf; + nuser = 0; + + /* Bytes left is maxcount - 8 (cookie) - 4 (array count) */ + xdrleft = listxattrs->lsxa_maxcount - XDR_UNIT * 3; + + while (left > 0 && xdrleft > 0) { + slen = strlen(sp); + + /* + * Check if this is a "user." attribute, skip it if not. + */ + if (strncmp(sp, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN)) + goto contloop; + + slen -= XATTR_USER_PREFIX_LEN; + xdrlen = 4 + ((slen + 3) & ~3); + /* Check if both entry and eof can fit in the XDR buffer */ + if (xdrlen + XDR_UNIT > xdrleft) { + if (count == 0) { + /* + * Can't even fit the first attribute name. + */ + status = nfserr_toosmall; + goto out; + } + eof = 0; + goto wreof; + } + + left -= XATTR_USER_PREFIX_LEN; + sp += XATTR_USER_PREFIX_LEN; + if (nuser++ < offset) + goto contloop; + + + p = xdr_reserve_space(xdr, xdrlen); + if (!p) { + status = nfserr_resource; + goto out; + } + + xdr_encode_opaque(p, sp, slen); + + xdrleft -= xdrlen; + count++; +contloop: + sp += slen + 1; + left -= slen + 1; + } + + /* + * If there were user attributes to copy, but we didn't copy + * any, the offset was too large (e.g. the cookie was invalid). + */ + if (nuser > 0 && count == 0) { + status = nfserr_badcookie; + goto out; + } + +wreof: + p = xdr_reserve_space(xdr, 4); + if (!p) { + status = nfserr_resource; + goto out; + } + *p = cpu_to_be32(eof); + + cookie = offset + count; + + wire_cookie = cpu_to_be64(cookie); + write_bytes_to_xdr_buf(xdr->buf, cookie_offset, &wire_cookie, 8); + tmp = cpu_to_be32(count); + write_bytes_to_xdr_buf(xdr->buf, count_offset, &tmp, 4); +out: + if (listxattrs->lsxa_len) + kvfree(listxattrs->lsxa_buf); + return status; } -typedef __be32(* nfsd4_enc)(struct nfsd4_compoundres *, __be32, void *); +static __be32 +nfsd4_encode_removexattr(struct nfsd4_compoundres *resp, __be32 nfserr, + union nfsd4_op_u *u) +{ + struct nfsd4_removexattr *removexattr = &u->removexattr; + struct xdr_stream *xdr = resp->xdr; + + return nfsd4_encode_change_info4(xdr, &removexattr->rmxa_cinfo); +} + +typedef __be32(*nfsd4_enc)(struct nfsd4_compoundres *, __be32, union nfsd4_op_u *u); /* * Note: nfsd4_enc_ops vector is shared for v4.0 and v4.1 @@ -4319,87 +5746,93 @@ typedef __be32(* nfsd4_enc)(struct nfsd4_compoundres *, __be32, void *); * done in the decoding phase. 
*/ static const nfsd4_enc nfsd4_enc_ops[] = { - [OP_ACCESS] = (nfsd4_enc)nfsd4_encode_access, - [OP_CLOSE] = (nfsd4_enc)nfsd4_encode_close, - [OP_COMMIT] = (nfsd4_enc)nfsd4_encode_commit, - [OP_CREATE] = (nfsd4_enc)nfsd4_encode_create, - [OP_DELEGPURGE] = (nfsd4_enc)nfsd4_encode_noop, - [OP_DELEGRETURN] = (nfsd4_enc)nfsd4_encode_noop, - [OP_GETATTR] = (nfsd4_enc)nfsd4_encode_getattr, - [OP_GETFH] = (nfsd4_enc)nfsd4_encode_getfh, - [OP_LINK] = (nfsd4_enc)nfsd4_encode_link, - [OP_LOCK] = (nfsd4_enc)nfsd4_encode_lock, - [OP_LOCKT] = (nfsd4_enc)nfsd4_encode_lockt, - [OP_LOCKU] = (nfsd4_enc)nfsd4_encode_locku, - [OP_LOOKUP] = (nfsd4_enc)nfsd4_encode_noop, - [OP_LOOKUPP] = (nfsd4_enc)nfsd4_encode_noop, - [OP_NVERIFY] = (nfsd4_enc)nfsd4_encode_noop, - [OP_OPEN] = (nfsd4_enc)nfsd4_encode_open, - [OP_OPENATTR] = (nfsd4_enc)nfsd4_encode_noop, - [OP_OPEN_CONFIRM] = (nfsd4_enc)nfsd4_encode_open_confirm, - [OP_OPEN_DOWNGRADE] = (nfsd4_enc)nfsd4_encode_open_downgrade, - [OP_PUTFH] = (nfsd4_enc)nfsd4_encode_noop, - [OP_PUTPUBFH] = (nfsd4_enc)nfsd4_encode_noop, - [OP_PUTROOTFH] = (nfsd4_enc)nfsd4_encode_noop, - [OP_READ] = (nfsd4_enc)nfsd4_encode_read, - [OP_READDIR] = (nfsd4_enc)nfsd4_encode_readdir, - [OP_READLINK] = (nfsd4_enc)nfsd4_encode_readlink, - [OP_REMOVE] = (nfsd4_enc)nfsd4_encode_remove, - [OP_RENAME] = (nfsd4_enc)nfsd4_encode_rename, - [OP_RENEW] = (nfsd4_enc)nfsd4_encode_noop, - [OP_RESTOREFH] = (nfsd4_enc)nfsd4_encode_noop, - [OP_SAVEFH] = (nfsd4_enc)nfsd4_encode_noop, - [OP_SECINFO] = (nfsd4_enc)nfsd4_encode_secinfo, - [OP_SETATTR] = (nfsd4_enc)nfsd4_encode_setattr, - [OP_SETCLIENTID] = (nfsd4_enc)nfsd4_encode_setclientid, - [OP_SETCLIENTID_CONFIRM] = (nfsd4_enc)nfsd4_encode_noop, - [OP_VERIFY] = (nfsd4_enc)nfsd4_encode_noop, - [OP_WRITE] = (nfsd4_enc)nfsd4_encode_write, - [OP_RELEASE_LOCKOWNER] = (nfsd4_enc)nfsd4_encode_noop, + [OP_ACCESS] = nfsd4_encode_access, + [OP_CLOSE] = nfsd4_encode_close, + [OP_COMMIT] = nfsd4_encode_commit, + [OP_CREATE] = nfsd4_encode_create, + [OP_DELEGPURGE] = nfsd4_encode_noop, + [OP_DELEGRETURN] = nfsd4_encode_noop, + [OP_GETATTR] = nfsd4_encode_getattr, + [OP_GETFH] = nfsd4_encode_getfh, + [OP_LINK] = nfsd4_encode_link, + [OP_LOCK] = nfsd4_encode_lock, + [OP_LOCKT] = nfsd4_encode_lockt, + [OP_LOCKU] = nfsd4_encode_locku, + [OP_LOOKUP] = nfsd4_encode_noop, + [OP_LOOKUPP] = nfsd4_encode_noop, + [OP_NVERIFY] = nfsd4_encode_noop, + [OP_OPEN] = nfsd4_encode_open, + [OP_OPENATTR] = nfsd4_encode_noop, + [OP_OPEN_CONFIRM] = nfsd4_encode_open_confirm, + [OP_OPEN_DOWNGRADE] = nfsd4_encode_open_downgrade, + [OP_PUTFH] = nfsd4_encode_noop, + [OP_PUTPUBFH] = nfsd4_encode_noop, + [OP_PUTROOTFH] = nfsd4_encode_noop, + [OP_READ] = nfsd4_encode_read, + [OP_READDIR] = nfsd4_encode_readdir, + [OP_READLINK] = nfsd4_encode_readlink, + [OP_REMOVE] = nfsd4_encode_remove, + [OP_RENAME] = nfsd4_encode_rename, + [OP_RENEW] = nfsd4_encode_noop, + [OP_RESTOREFH] = nfsd4_encode_noop, + [OP_SAVEFH] = nfsd4_encode_noop, + [OP_SECINFO] = nfsd4_encode_secinfo, + [OP_SETATTR] = nfsd4_encode_setattr, + [OP_SETCLIENTID] = nfsd4_encode_setclientid, + [OP_SETCLIENTID_CONFIRM] = nfsd4_encode_noop, + [OP_VERIFY] = nfsd4_encode_noop, + [OP_WRITE] = nfsd4_encode_write, + [OP_RELEASE_LOCKOWNER] = nfsd4_encode_noop, /* NFSv4.1 operations */ - [OP_BACKCHANNEL_CTL] = (nfsd4_enc)nfsd4_encode_noop, - [OP_BIND_CONN_TO_SESSION] = (nfsd4_enc)nfsd4_encode_bind_conn_to_session, - [OP_EXCHANGE_ID] = (nfsd4_enc)nfsd4_encode_exchange_id, - [OP_CREATE_SESSION] = (nfsd4_enc)nfsd4_encode_create_session, - 
[OP_DESTROY_SESSION] = (nfsd4_enc)nfsd4_encode_noop, - [OP_FREE_STATEID] = (nfsd4_enc)nfsd4_encode_noop, - [OP_GET_DIR_DELEGATION] = (nfsd4_enc)nfsd4_encode_noop, + [OP_BACKCHANNEL_CTL] = nfsd4_encode_noop, + [OP_BIND_CONN_TO_SESSION] = nfsd4_encode_bind_conn_to_session, + [OP_EXCHANGE_ID] = nfsd4_encode_exchange_id, + [OP_CREATE_SESSION] = nfsd4_encode_create_session, + [OP_DESTROY_SESSION] = nfsd4_encode_noop, + [OP_FREE_STATEID] = nfsd4_encode_noop, + [OP_GET_DIR_DELEGATION] = nfsd4_encode_get_dir_delegation, #ifdef CONFIG_NFSD_PNFS - [OP_GETDEVICEINFO] = (nfsd4_enc)nfsd4_encode_getdeviceinfo, - [OP_GETDEVICELIST] = (nfsd4_enc)nfsd4_encode_noop, - [OP_LAYOUTCOMMIT] = (nfsd4_enc)nfsd4_encode_layoutcommit, - [OP_LAYOUTGET] = (nfsd4_enc)nfsd4_encode_layoutget, - [OP_LAYOUTRETURN] = (nfsd4_enc)nfsd4_encode_layoutreturn, + [OP_GETDEVICEINFO] = nfsd4_encode_getdeviceinfo, + [OP_GETDEVICELIST] = nfsd4_encode_noop, + [OP_LAYOUTCOMMIT] = nfsd4_encode_layoutcommit, + [OP_LAYOUTGET] = nfsd4_encode_layoutget, + [OP_LAYOUTRETURN] = nfsd4_encode_layoutreturn, #else - [OP_GETDEVICEINFO] = (nfsd4_enc)nfsd4_encode_noop, - [OP_GETDEVICELIST] = (nfsd4_enc)nfsd4_encode_noop, - [OP_LAYOUTCOMMIT] = (nfsd4_enc)nfsd4_encode_noop, - [OP_LAYOUTGET] = (nfsd4_enc)nfsd4_encode_noop, - [OP_LAYOUTRETURN] = (nfsd4_enc)nfsd4_encode_noop, + [OP_GETDEVICEINFO] = nfsd4_encode_noop, + [OP_GETDEVICELIST] = nfsd4_encode_noop, + [OP_LAYOUTCOMMIT] = nfsd4_encode_noop, + [OP_LAYOUTGET] = nfsd4_encode_noop, + [OP_LAYOUTRETURN] = nfsd4_encode_noop, #endif - [OP_SECINFO_NO_NAME] = (nfsd4_enc)nfsd4_encode_secinfo_no_name, - [OP_SEQUENCE] = (nfsd4_enc)nfsd4_encode_sequence, - [OP_SET_SSV] = (nfsd4_enc)nfsd4_encode_noop, - [OP_TEST_STATEID] = (nfsd4_enc)nfsd4_encode_test_stateid, - [OP_WANT_DELEGATION] = (nfsd4_enc)nfsd4_encode_noop, - [OP_DESTROY_CLIENTID] = (nfsd4_enc)nfsd4_encode_noop, - [OP_RECLAIM_COMPLETE] = (nfsd4_enc)nfsd4_encode_noop, + [OP_SECINFO_NO_NAME] = nfsd4_encode_secinfo_no_name, + [OP_SEQUENCE] = nfsd4_encode_sequence, + [OP_SET_SSV] = nfsd4_encode_noop, + [OP_TEST_STATEID] = nfsd4_encode_test_stateid, + [OP_WANT_DELEGATION] = nfsd4_encode_noop, + [OP_DESTROY_CLIENTID] = nfsd4_encode_noop, + [OP_RECLAIM_COMPLETE] = nfsd4_encode_noop, /* NFSv4.2 operations */ - [OP_ALLOCATE] = (nfsd4_enc)nfsd4_encode_noop, - [OP_COPY] = (nfsd4_enc)nfsd4_encode_copy, - [OP_COPY_NOTIFY] = (nfsd4_enc)nfsd4_encode_noop, - [OP_DEALLOCATE] = (nfsd4_enc)nfsd4_encode_noop, - [OP_IO_ADVISE] = (nfsd4_enc)nfsd4_encode_noop, - [OP_LAYOUTERROR] = (nfsd4_enc)nfsd4_encode_noop, - [OP_LAYOUTSTATS] = (nfsd4_enc)nfsd4_encode_noop, - [OP_OFFLOAD_CANCEL] = (nfsd4_enc)nfsd4_encode_noop, - [OP_OFFLOAD_STATUS] = (nfsd4_enc)nfsd4_encode_offload_status, - [OP_READ_PLUS] = (nfsd4_enc)nfsd4_encode_noop, - [OP_SEEK] = (nfsd4_enc)nfsd4_encode_seek, - [OP_WRITE_SAME] = (nfsd4_enc)nfsd4_encode_noop, - [OP_CLONE] = (nfsd4_enc)nfsd4_encode_noop, + [OP_ALLOCATE] = nfsd4_encode_noop, + [OP_COPY] = nfsd4_encode_copy, + [OP_COPY_NOTIFY] = nfsd4_encode_copy_notify, + [OP_DEALLOCATE] = nfsd4_encode_noop, + [OP_IO_ADVISE] = nfsd4_encode_noop, + [OP_LAYOUTERROR] = nfsd4_encode_noop, + [OP_LAYOUTSTATS] = nfsd4_encode_noop, + [OP_OFFLOAD_CANCEL] = nfsd4_encode_noop, + [OP_OFFLOAD_STATUS] = nfsd4_encode_offload_status, + [OP_READ_PLUS] = nfsd4_encode_read_plus, + [OP_SEEK] = nfsd4_encode_seek, + [OP_WRITE_SAME] = nfsd4_encode_noop, + [OP_CLONE] = nfsd4_encode_noop, + + /* RFC 8276 extended atributes operations */ + [OP_GETXATTR] = nfsd4_encode_getxattr, + [OP_SETXATTR] = 
nfsd4_encode_setxattr, + [OP_LISTXATTRS] = nfsd4_encode_listxattrs, + [OP_REMOVEXATTR] = nfsd4_encode_removexattr, }; /* @@ -4431,36 +5864,50 @@ __be32 nfsd4_check_resp_size(struct nfsd4_compoundres *resp, u32 respsize) return nfserr_rep_too_big; } +static __be32 nfsd4_map_status(__be32 status, u32 minor) +{ + switch (status) { + case nfs_ok: + break; + case nfserr_wrong_type: + /* RFC 8881 - 15.1.2.9 */ + if (minor == 0) + status = nfserr_inval; + break; + case nfserr_symlink_not_dir: + status = nfserr_symlink; + break; + } + return status; +} + void nfsd4_encode_operation(struct nfsd4_compoundres *resp, struct nfsd4_op *op) { - struct xdr_stream *xdr = &resp->xdr; + struct xdr_stream *xdr = resp->xdr; struct nfs4_stateowner *so = resp->cstate.replay_owner; struct svc_rqst *rqstp = resp->rqstp; const struct nfsd4_operation *opdesc = op->opdesc; - int post_err_offset; + unsigned int op_status_offset; nfsd4_enc encoder; - __be32 *p; - p = xdr_reserve_space(xdr, 8); - if (!p) { - WARN_ON_ONCE(1); - return; - } - *p++ = cpu_to_be32(op->opnum); - post_err_offset = xdr->buf->len; + if (xdr_stream_encode_u32(xdr, op->opnum) != XDR_UNIT) + goto release; + op_status_offset = xdr->buf->len; + if (!xdr_reserve_space(xdr, XDR_UNIT)) + goto release; if (op->opnum == OP_ILLEGAL) goto status; if (op->status && opdesc && !(opdesc->op_flags & OP_NONTRIVIAL_ERROR_ENCODE)) goto status; - BUG_ON(op->opnum < 0 || op->opnum >= ARRAY_SIZE(nfsd4_enc_ops) || + BUG_ON(op->opnum >= ARRAY_SIZE(nfsd4_enc_ops) || !nfsd4_enc_ops[op->opnum]); encoder = nfsd4_enc_ops[op->opnum]; op->status = encoder(resp, op->status, &op->u); - if (opdesc && opdesc->op_release) - opdesc->op_release(&op->u); + if (op->status) + trace_nfsd_compound_encode_err(rqstp, op->opnum, op->status); xdr_commit_encode(xdr); /* nfsd4_check_resp_size guarantees enough room for error status */ @@ -4489,50 +5936,49 @@ nfsd4_encode_operation(struct nfsd4_compoundres *resp, struct nfsd4_op *op) * bug if we had to do this on a non-idempotent op: */ warn_on_nonidempotent_op(op); - xdr_truncate_encode(xdr, post_err_offset); - } - if (so) { - int len = xdr->buf->len - post_err_offset; + xdr_truncate_encode(xdr, op_status_offset + XDR_UNIT); + } else if (so) { + int len = xdr->buf->len - (op_status_offset + XDR_UNIT); so->so_replay.rp_status = op->status; so->so_replay.rp_buflen = len; - read_bytes_from_xdr_buf(xdr->buf, post_err_offset, + read_bytes_from_xdr_buf(xdr->buf, op_status_offset + XDR_UNIT, so->so_replay.rp_buf, len); } status: - /* Note that op->status is already in network byte order: */ - write_bytes_to_xdr_buf(xdr->buf, post_err_offset - 4, &op->status, 4); + op->status = nfsd4_map_status(op->status, + resp->cstate.minorversion); + write_bytes_to_xdr_buf(xdr->buf, op_status_offset, + &op->status, XDR_UNIT); +release: + if (opdesc && opdesc->op_release) + opdesc->op_release(&op->u); + + /* + * Account for pages consumed while encoding this operation. + * The xdr_stream primitives don't manage rq_next_page. + */ + rqstp->rq_next_page = xdr->page_ptr + 1; } -/* - * Encode the reply stored in the stateowner reply cache - * - * XDR note: do not encode rp->rp_buflen: the buffer contains the - * previously sent already encoded operation. +/** + * nfsd4_encode_replay - encode a result stored in the stateowner reply cache + * @xdr: send buffer's XDR stream + * @op: operation being replayed + * + * @op->replay->rp_buf contains the previously-sent already-encoded result. 
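[Editorial sketch] The rewritten nfsd4_encode_operation() above records op_status_offset, encodes the operation body, and only afterwards writes the (possibly remapped) status back into the reserved XDR_UNIT. Below is a minimal userspace sketch of that reserve-then-backfill pattern over a flat buffer; struct xdrbuf and all helper names here are hypothetical stand-ins, not kernel APIs, and bounds checking is omitted for brevity.

#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <arpa/inet.h>

struct xdrbuf {
	unsigned char	data[1024];
	size_t		len;
};

/* Reserve a 4-byte slot and remember its offset so it can be filled later. */
static size_t reserve_u32(struct xdrbuf *b)
{
	size_t off = b->len;

	memset(b->data + off, 0, 4);
	b->len += 4;
	return off;
}

static void encode_u32(struct xdrbuf *b, uint32_t val)
{
	uint32_t be = htonl(val);

	memcpy(b->data + b->len, &be, 4);
	b->len += 4;
}

static void backfill_u32(struct xdrbuf *b, size_t off, uint32_t val)
{
	uint32_t be = htonl(val);

	memcpy(b->data + off, &be, 4);
}

/*
 * Encode the op number, reserve the status slot, encode the result body,
 * then patch the final status in at the remembered offset.
 */
static void encode_result(struct xdrbuf *b, uint32_t opnum,
			  uint32_t body, uint32_t status)
{
	size_t status_off;

	encode_u32(b, opnum);
	status_off = reserve_u32(b);
	encode_u32(b, body);
	backfill_u32(b, status_off, status);
}

int main(void)
{
	struct xdrbuf buf = { .len = 0 };

	encode_result(&buf, 3, 0x2f, 0);
	printf("encoded %zu bytes\n", buf.len);
	return 0;
}

Writing the status last lets the encoder decide, after the body has been produced, whether the status needs remapping or the partially encoded result must be truncated.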
*/ -void -nfsd4_encode_replay(struct xdr_stream *xdr, struct nfsd4_op *op) +void nfsd4_encode_replay(struct xdr_stream *xdr, struct nfsd4_op *op) { - __be32 *p; struct nfs4_replay *rp = op->replay; - BUG_ON(!rp); + trace_nfsd_stateowner_replay(op->opnum, rp); - p = xdr_reserve_space(xdr, 8 + rp->rp_buflen); - if (!p) { - WARN_ON_ONCE(1); + if (xdr_stream_encode_u32(xdr, op->opnum) != XDR_UNIT) return; - } - *p++ = cpu_to_be32(op->opnum); - *p++ = rp->rp_status; /* already xdr'ed */ - - p = xdr_encode_opaque_fixed(p, rp->rp_buf, rp->rp_buflen); -} - -int -nfs4svc_encode_voidres(struct svc_rqst *rqstp, __be32 *p) -{ - return xdr_ressize_check(rqstp, p); + if (xdr_stream_encode_be32(xdr, rp->rp_status) != XDR_UNIT) + return; + xdr_stream_encode_opaque_fixed(xdr, rp->rp_buf, rp->rp_buflen); } void nfsd4_release_compoundargs(struct svc_rqst *rqstp) @@ -4540,11 +5986,9 @@ void nfsd4_release_compoundargs(struct svc_rqst *rqstp) struct nfsd4_compoundargs *args = rqstp->rq_argp; if (args->ops != args->iops) { - kfree(args->ops); + vfree(args->ops); args->ops = args->iops; } - kfree(args->tmpp); - args->tmpp = NULL; while (args->to_free) { struct svcxdr_tmpbuf *tb = args->to_free; args->to_free = tb->next; @@ -4552,56 +5996,39 @@ void nfsd4_release_compoundargs(struct svc_rqst *rqstp) } } -int -nfs4svc_decode_compoundargs(struct svc_rqst *rqstp, __be32 *p) +bool +nfs4svc_decode_compoundargs(struct svc_rqst *rqstp, struct xdr_stream *xdr) { struct nfsd4_compoundargs *args = rqstp->rq_argp; - if (rqstp->rq_arg.head[0].iov_len % 4) { - /* client is nuts */ - dprintk("%s: compound not properly padded! (peeraddr=%pISc xid=0x%x)", - __func__, svc_addr(rqstp), be32_to_cpu(rqstp->rq_xid)); - return 0; - } - args->p = p; - args->end = rqstp->rq_arg.head[0].iov_base + rqstp->rq_arg.head[0].iov_len; - args->pagelist = rqstp->rq_arg.pages; - args->pagelen = rqstp->rq_arg.page_len; - args->tail = false; - args->tmpp = NULL; + /* svcxdr_tmp_alloc */ args->to_free = NULL; + + args->xdr = xdr; args->ops = args->iops; args->rqstp = rqstp; - return !nfsd4_decode_compound(args); + return nfsd4_decode_compound(args); } -int -nfs4svc_encode_compoundres(struct svc_rqst *rqstp, __be32 *p) +bool +nfs4svc_encode_compoundres(struct svc_rqst *rqstp, struct xdr_stream *xdr) { - /* - * All that remains is to write the tag and operation count... - */ struct nfsd4_compoundres *resp = rqstp->rq_resp; - struct xdr_buf *buf = resp->xdr.buf; - - WARN_ON_ONCE(buf->len != buf->head[0].iov_len + buf->page_len + - buf->tail[0].iov_len); + __be32 *p; - rqstp->rq_next_page = resp->xdr.page_ptr + 1; + /* + * Send buffer space for the following items is reserved + * at the top of nfsd4_proc_compound(). + */ + p = resp->statusp; - p = resp->tagp; + *p++ = resp->cstate.status; *p++ = htonl(resp->taglen); memcpy(p, resp->tag, resp->taglen); p += XDR_QUADLEN(resp->taglen); *p++ = htonl(resp->opcnt); nfsd4_sequence_done(resp); - return 1; + return true; } - -/* - * Local variables: - * c-basic-offset: 8 - * End: - */ diff --git a/fs/nfsd/nfs4xdr_gen.c b/fs/nfsd/nfs4xdr_gen.c new file mode 100644 index 000000000000..a17b5d8e60b3 --- /dev/null +++ b/fs/nfsd/nfs4xdr_gen.c @@ -0,0 +1,256 @@ +// SPDX-License-Identifier: GPL-2.0 +// Generated by xdrgen. Manual edits will be lost. 
+// XDR specification file: ../../Documentation/sunrpc/xdr/nfs4_1.x +// XDR specification modification time: Mon Oct 14 09:10:13 2024 + +#include <linux/sunrpc/svc.h> + +#include "nfs4xdr_gen.h" + +static bool __maybe_unused +xdrgen_decode_int64_t(struct xdr_stream *xdr, int64_t *ptr) +{ + return xdrgen_decode_hyper(xdr, ptr); +}; + +static bool __maybe_unused +xdrgen_decode_uint32_t(struct xdr_stream *xdr, uint32_t *ptr) +{ + return xdrgen_decode_unsigned_int(xdr, ptr); +}; + +static bool __maybe_unused +xdrgen_decode_bitmap4(struct xdr_stream *xdr, bitmap4 *ptr) +{ + if (xdr_stream_decode_u32(xdr, &ptr->count) < 0) + return false; + for (u32 i = 0; i < ptr->count; i++) + if (!xdrgen_decode_uint32_t(xdr, &ptr->element[i])) + return false; + return true; +}; + +static bool __maybe_unused +xdrgen_decode_nfstime4(struct xdr_stream *xdr, struct nfstime4 *ptr) +{ + if (!xdrgen_decode_int64_t(xdr, &ptr->seconds)) + return false; + if (!xdrgen_decode_uint32_t(xdr, &ptr->nseconds)) + return false; + return true; +}; + +static bool __maybe_unused +xdrgen_decode_fattr4_offline(struct xdr_stream *xdr, fattr4_offline *ptr) +{ + return xdrgen_decode_bool(xdr, ptr); +}; + +static bool __maybe_unused +xdrgen_decode_open_arguments4(struct xdr_stream *xdr, struct open_arguments4 *ptr) +{ + if (!xdrgen_decode_bitmap4(xdr, &ptr->oa_share_access)) + return false; + if (!xdrgen_decode_bitmap4(xdr, &ptr->oa_share_deny)) + return false; + if (!xdrgen_decode_bitmap4(xdr, &ptr->oa_share_access_want)) + return false; + if (!xdrgen_decode_bitmap4(xdr, &ptr->oa_open_claim)) + return false; + if (!xdrgen_decode_bitmap4(xdr, &ptr->oa_create_mode)) + return false; + return true; +}; + +static bool __maybe_unused +xdrgen_decode_open_args_share_access4(struct xdr_stream *xdr, open_args_share_access4 *ptr) +{ + u32 val; + + if (xdr_stream_decode_u32(xdr, &val) < 0) + return false; + *ptr = val; + return true; +} + +static bool __maybe_unused +xdrgen_decode_open_args_share_deny4(struct xdr_stream *xdr, open_args_share_deny4 *ptr) +{ + u32 val; + + if (xdr_stream_decode_u32(xdr, &val) < 0) + return false; + *ptr = val; + return true; +} + +static bool __maybe_unused +xdrgen_decode_open_args_share_access_want4(struct xdr_stream *xdr, open_args_share_access_want4 *ptr) +{ + u32 val; + + if (xdr_stream_decode_u32(xdr, &val) < 0) + return false; + *ptr = val; + return true; +} + +static bool __maybe_unused +xdrgen_decode_open_args_open_claim4(struct xdr_stream *xdr, open_args_open_claim4 *ptr) +{ + u32 val; + + if (xdr_stream_decode_u32(xdr, &val) < 0) + return false; + *ptr = val; + return true; +} + +static bool __maybe_unused +xdrgen_decode_open_args_createmode4(struct xdr_stream *xdr, open_args_createmode4 *ptr) +{ + u32 val; + + if (xdr_stream_decode_u32(xdr, &val) < 0) + return false; + *ptr = val; + return true; +} + +bool +xdrgen_decode_fattr4_open_arguments(struct xdr_stream *xdr, fattr4_open_arguments *ptr) +{ + return xdrgen_decode_open_arguments4(xdr, ptr); +}; + +bool +xdrgen_decode_fattr4_time_deleg_access(struct xdr_stream *xdr, fattr4_time_deleg_access *ptr) +{ + return xdrgen_decode_nfstime4(xdr, ptr); +}; + +bool +xdrgen_decode_fattr4_time_deleg_modify(struct xdr_stream *xdr, fattr4_time_deleg_modify *ptr) +{ + return xdrgen_decode_nfstime4(xdr, ptr); +}; + +static bool __maybe_unused +xdrgen_decode_open_delegation_type4(struct xdr_stream *xdr, open_delegation_type4 *ptr) +{ + u32 val; + + if (xdr_stream_decode_u32(xdr, &val) < 0) + return false; + *ptr = val; + return true; +} + +static bool __maybe_unused 
+xdrgen_encode_int64_t(struct xdr_stream *xdr, const int64_t value) +{ + return xdrgen_encode_hyper(xdr, value); +}; + +static bool __maybe_unused +xdrgen_encode_uint32_t(struct xdr_stream *xdr, const uint32_t value) +{ + return xdrgen_encode_unsigned_int(xdr, value); +}; + +static bool __maybe_unused +xdrgen_encode_bitmap4(struct xdr_stream *xdr, const bitmap4 value) +{ + if (xdr_stream_encode_u32(xdr, value.count) != XDR_UNIT) + return false; + for (u32 i = 0; i < value.count; i++) + if (!xdrgen_encode_uint32_t(xdr, value.element[i])) + return false; + return true; +}; + +static bool __maybe_unused +xdrgen_encode_nfstime4(struct xdr_stream *xdr, const struct nfstime4 *value) +{ + if (!xdrgen_encode_int64_t(xdr, value->seconds)) + return false; + if (!xdrgen_encode_uint32_t(xdr, value->nseconds)) + return false; + return true; +}; + +static bool __maybe_unused +xdrgen_encode_fattr4_offline(struct xdr_stream *xdr, const fattr4_offline value) +{ + return xdrgen_encode_bool(xdr, value); +}; + +static bool __maybe_unused +xdrgen_encode_open_arguments4(struct xdr_stream *xdr, const struct open_arguments4 *value) +{ + if (!xdrgen_encode_bitmap4(xdr, value->oa_share_access)) + return false; + if (!xdrgen_encode_bitmap4(xdr, value->oa_share_deny)) + return false; + if (!xdrgen_encode_bitmap4(xdr, value->oa_share_access_want)) + return false; + if (!xdrgen_encode_bitmap4(xdr, value->oa_open_claim)) + return false; + if (!xdrgen_encode_bitmap4(xdr, value->oa_create_mode)) + return false; + return true; +}; + +static bool __maybe_unused +xdrgen_encode_open_args_share_access4(struct xdr_stream *xdr, open_args_share_access4 value) +{ + return xdr_stream_encode_u32(xdr, value) == XDR_UNIT; +} + +static bool __maybe_unused +xdrgen_encode_open_args_share_deny4(struct xdr_stream *xdr, open_args_share_deny4 value) +{ + return xdr_stream_encode_u32(xdr, value) == XDR_UNIT; +} + +static bool __maybe_unused +xdrgen_encode_open_args_share_access_want4(struct xdr_stream *xdr, open_args_share_access_want4 value) +{ + return xdr_stream_encode_u32(xdr, value) == XDR_UNIT; +} + +static bool __maybe_unused +xdrgen_encode_open_args_open_claim4(struct xdr_stream *xdr, open_args_open_claim4 value) +{ + return xdr_stream_encode_u32(xdr, value) == XDR_UNIT; +} + +static bool __maybe_unused +xdrgen_encode_open_args_createmode4(struct xdr_stream *xdr, open_args_createmode4 value) +{ + return xdr_stream_encode_u32(xdr, value) == XDR_UNIT; +} + +bool +xdrgen_encode_fattr4_open_arguments(struct xdr_stream *xdr, const fattr4_open_arguments *value) +{ + return xdrgen_encode_open_arguments4(xdr, value); +}; + +bool +xdrgen_encode_fattr4_time_deleg_access(struct xdr_stream *xdr, const fattr4_time_deleg_access *value) +{ + return xdrgen_encode_nfstime4(xdr, value); +}; + +bool +xdrgen_encode_fattr4_time_deleg_modify(struct xdr_stream *xdr, const fattr4_time_deleg_modify *value) +{ + return xdrgen_encode_nfstime4(xdr, value); +}; + +static bool __maybe_unused +xdrgen_encode_open_delegation_type4(struct xdr_stream *xdr, open_delegation_type4 value) +{ + return xdr_stream_encode_u32(xdr, value) == XDR_UNIT; +} diff --git a/fs/nfsd/nfs4xdr_gen.h b/fs/nfsd/nfs4xdr_gen.h new file mode 100644 index 000000000000..41a0033b7256 --- /dev/null +++ b/fs/nfsd/nfs4xdr_gen.h @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Generated by xdrgen. Manual edits will be lost. 
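[Editorial sketch] The generated xdrgen helpers decode bitmap4 as a word count followed by that many 32-bit mask words, and nfstime4 as a 64-bit seconds value plus a 32-bit nseconds field. A standalone sketch of those two wire formats over a simple big-endian byte cursor follows; struct cursor, get_u32() and the other names are illustrative, not the kernel's xdr_stream API.

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

struct nfstime { int64_t seconds; uint32_t nseconds; };
struct bitmap  { uint32_t count; uint32_t element[8]; };

struct cursor { const unsigned char *p; size_t remaining; };

static bool get_u32(struct cursor *c, uint32_t *out)
{
	if (c->remaining < 4)
		return false;
	*out = ((uint32_t)c->p[0] << 24) | ((uint32_t)c->p[1] << 16) |
	       ((uint32_t)c->p[2] << 8)  |  (uint32_t)c->p[3];
	c->p += 4;
	c->remaining -= 4;
	return true;
}

/* XDR hyper: two big-endian 32-bit words, most significant first. */
static bool get_i64(struct cursor *c, int64_t *out)
{
	uint32_t hi, lo;

	if (!get_u32(c, &hi) || !get_u32(c, &lo))
		return false;
	*out = (int64_t)(((uint64_t)hi << 32) | lo);
	return true;
}

/* bitmap4: a word count followed by that many 32-bit mask words. */
static bool get_bitmap(struct cursor *c, struct bitmap *bm)
{
	if (!get_u32(c, &bm->count) || bm->count > 8)
		return false;
	for (uint32_t i = 0; i < bm->count; i++)
		if (!get_u32(c, &bm->element[i]))
			return false;
	return true;
}

/* nfstime4: int64 seconds followed by uint32 nseconds. */
static bool get_nfstime(struct cursor *c, struct nfstime *t)
{
	return get_i64(c, &t->seconds) && get_u32(c, &t->nseconds);
}

int main(void)
{
	/* bitmap4 { count = 1, element[0] = 0x18 }, nfstime4 { 1 s, 500 ns } */
	static const unsigned char wire[] = {
		0, 0, 0, 1,  0, 0, 0, 0x18,
		0, 0, 0, 0,  0, 0, 0, 1,  0, 0, 0x01, 0xf4,
	};
	struct cursor c = { wire, sizeof(wire) };
	struct bitmap bm;
	struct nfstime t;

	return (get_bitmap(&c, &bm) && get_nfstime(&c, &t)) ? 0 : 1;
}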
*/ +/* XDR specification file: ../../Documentation/sunrpc/xdr/nfs4_1.x */ +/* XDR specification modification time: Mon Oct 14 09:10:13 2024 */ + +#ifndef _LINUX_XDRGEN_NFS4_1_DECL_H +#define _LINUX_XDRGEN_NFS4_1_DECL_H + +#include <linux/types.h> + +#include <linux/sunrpc/xdr.h> +#include <linux/sunrpc/xdrgen/_defs.h> +#include <linux/sunrpc/xdrgen/_builtins.h> +#include <linux/sunrpc/xdrgen/nfs4_1.h> + +bool xdrgen_decode_fattr4_open_arguments(struct xdr_stream *xdr, fattr4_open_arguments *ptr); +bool xdrgen_encode_fattr4_open_arguments(struct xdr_stream *xdr, const fattr4_open_arguments *value); + +bool xdrgen_decode_fattr4_time_deleg_access(struct xdr_stream *xdr, fattr4_time_deleg_access *ptr); +bool xdrgen_encode_fattr4_time_deleg_access(struct xdr_stream *xdr, const fattr4_time_deleg_access *value); + +bool xdrgen_decode_fattr4_time_deleg_modify(struct xdr_stream *xdr, fattr4_time_deleg_modify *ptr); +bool xdrgen_encode_fattr4_time_deleg_modify(struct xdr_stream *xdr, const fattr4_time_deleg_modify *value); + +#endif /* _LINUX_XDRGEN_NFS4_1_DECL_H */ diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c index da52b594362a..ab13ee9c7fd8 100644 --- a/fs/nfsd/nfscache.c +++ b/fs/nfsd/nfscache.c @@ -9,6 +9,7 @@ * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de> */ +#include <linux/sunrpc/svc_xprt.h> #include <linux/slab.h> #include <linux/vmalloc.h> #include <linux/sunrpc/addr.h> @@ -19,15 +20,14 @@ #include "nfsd.h" #include "cache.h" - -#define NFSDDBG_FACILITY NFSDDBG_REPCACHE +#include "trace.h" /* * We use this value to determine the number of hash buckets from the max * cache size, the idea being that when the cache is at its maximum number * of entries, then this should be the average number of entries per bucket. */ -#define TARGET_BUCKET_SIZE 64 +#define TARGET_BUCKET_SIZE 8 struct nfsd_drc_bucket { struct rb_root rb_head; @@ -35,48 +35,14 @@ struct nfsd_drc_bucket { spinlock_t cache_lock; }; -static struct nfsd_drc_bucket *drc_hashtbl; static struct kmem_cache *drc_slab; -/* max number of entries allowed in the cache */ -static unsigned int max_drc_entries; - -/* number of significant bits in the hash value */ -static unsigned int maskbits; -static unsigned int drc_hashsize; - -/* - * Stats and other tracking of on the duplicate reply cache. All of these and - * the "rc" fields in nfsdstats are protected by the cache_lock - */ - -/* total number of entries */ -static atomic_t num_drc_entries; - -/* cache misses due only to checksum comparison failures */ -static unsigned int payload_misses; - -/* amount of memory (in bytes) currently consumed by the DRC */ -static unsigned int drc_mem_usage; - -/* longest hash chain seen */ -static unsigned int longest_chain; - -/* size of cache when we saw the longest hash chain */ -static unsigned int longest_chain_cachesize; - static int nfsd_cache_append(struct svc_rqst *rqstp, struct kvec *vec); static unsigned long nfsd_reply_cache_count(struct shrinker *shrink, struct shrink_control *sc); static unsigned long nfsd_reply_cache_scan(struct shrinker *shrink, struct shrink_control *sc); -static struct shrinker nfsd_reply_cache_shrinker = { - .scan_objects = nfsd_reply_cache_scan, - .count_objects = nfsd_reply_cache_count, - .seeks = 1, -}; - /* * Put a cap on the size of the DRC based on the amount of available * low memory in the machine. @@ -94,6 +60,9 @@ static struct shrinker nfsd_reply_cache_shrinker = { * ...with a hard cap of 256k entries. 
In the worst case, each entry will be * ~1k, so the above numbers should give a rough max of the amount of memory * used in k. + * + * XXX: these limits are per-container, so memory used will increase + * linearly with number of containers. Maybe that's OK. */ static unsigned int nfsd_cache_size_limit(void) @@ -115,16 +84,11 @@ nfsd_hashsize(unsigned int limit) return roundup_pow_of_two(limit / TARGET_BUCKET_SIZE); } -static u32 -nfsd_cache_hash(__be32 xid) -{ - return hash_32(be32_to_cpu(xid), maskbits); -} - -static struct svc_cacherep * -nfsd_reply_cache_alloc(struct svc_rqst *rqstp, __wsum csum) +static struct nfsd_cacherep * +nfsd_cacherep_alloc(struct svc_rqst *rqstp, __wsum csum, + struct nfsd_net *nn) { - struct svc_cacherep *rp; + struct nfsd_cacherep *rp; rp = kmem_cache_alloc(drc_slab, GFP_KERNEL); if (rp) { @@ -146,187 +110,276 @@ nfsd_reply_cache_alloc(struct svc_rqst *rqstp, __wsum csum) return rp; } -static void -nfsd_reply_cache_free_locked(struct nfsd_drc_bucket *b, struct svc_cacherep *rp) +static void nfsd_cacherep_free(struct nfsd_cacherep *rp) { - if (rp->c_type == RC_REPLBUFF && rp->c_replvec.iov_base) { - drc_mem_usage -= rp->c_replvec.iov_len; + if (rp->c_type == RC_REPLBUFF) kfree(rp->c_replvec.iov_base); + kmem_cache_free(drc_slab, rp); +} + +static unsigned long +nfsd_cacherep_dispose(struct list_head *dispose) +{ + struct nfsd_cacherep *rp; + unsigned long freed = 0; + + while (!list_empty(dispose)) { + rp = list_first_entry(dispose, struct nfsd_cacherep, c_lru); + list_del(&rp->c_lru); + nfsd_cacherep_free(rp); + freed++; } + return freed; +} + +static void +nfsd_cacherep_unlink_locked(struct nfsd_net *nn, struct nfsd_drc_bucket *b, + struct nfsd_cacherep *rp) +{ + if (rp->c_type == RC_REPLBUFF && rp->c_replvec.iov_base) + nfsd_stats_drc_mem_usage_sub(nn, rp->c_replvec.iov_len); if (rp->c_state != RC_UNUSED) { rb_erase(&rp->c_node, &b->rb_head); list_del(&rp->c_lru); - atomic_dec(&num_drc_entries); - drc_mem_usage -= sizeof(*rp); + atomic_dec(&nn->num_drc_entries); + nfsd_stats_drc_mem_usage_sub(nn, sizeof(*rp)); } - kmem_cache_free(drc_slab, rp); } static void -nfsd_reply_cache_free(struct nfsd_drc_bucket *b, struct svc_cacherep *rp) +nfsd_reply_cache_free_locked(struct nfsd_drc_bucket *b, struct nfsd_cacherep *rp, + struct nfsd_net *nn) +{ + nfsd_cacherep_unlink_locked(nn, b, rp); + nfsd_cacherep_free(rp); +} + +static void +nfsd_reply_cache_free(struct nfsd_drc_bucket *b, struct nfsd_cacherep *rp, + struct nfsd_net *nn) { spin_lock(&b->cache_lock); - nfsd_reply_cache_free_locked(b, rp); + nfsd_cacherep_unlink_locked(nn, b, rp); spin_unlock(&b->cache_lock); + nfsd_cacherep_free(rp); +} + +int nfsd_drc_slab_create(void) +{ + drc_slab = KMEM_CACHE(nfsd_cacherep, 0); + return drc_slab ? 
0: -ENOMEM; +} + +void nfsd_drc_slab_free(void) +{ + kmem_cache_destroy(drc_slab); } -int nfsd_reply_cache_init(void) +int nfsd_reply_cache_init(struct nfsd_net *nn) { unsigned int hashsize; unsigned int i; - int status = 0; - - max_drc_entries = nfsd_cache_size_limit(); - atomic_set(&num_drc_entries, 0); - hashsize = nfsd_hashsize(max_drc_entries); - maskbits = ilog2(hashsize); - - status = register_shrinker(&nfsd_reply_cache_shrinker); - if (status) - return status; - - drc_slab = kmem_cache_create("nfsd_drc", sizeof(struct svc_cacherep), - 0, 0, NULL); - if (!drc_slab) - goto out_nomem; - - drc_hashtbl = kcalloc(hashsize, sizeof(*drc_hashtbl), GFP_KERNEL); - if (!drc_hashtbl) { - drc_hashtbl = vzalloc(array_size(hashsize, - sizeof(*drc_hashtbl))); - if (!drc_hashtbl) - goto out_nomem; - } + + nn->max_drc_entries = nfsd_cache_size_limit(); + atomic_set(&nn->num_drc_entries, 0); + hashsize = nfsd_hashsize(nn->max_drc_entries); + nn->maskbits = ilog2(hashsize); + + nn->drc_hashtbl = kvzalloc(array_size(hashsize, + sizeof(*nn->drc_hashtbl)), GFP_KERNEL); + if (!nn->drc_hashtbl) + return -ENOMEM; + + nn->nfsd_reply_cache_shrinker = shrinker_alloc(0, "nfsd-reply:%s", + nn->nfsd_name); + if (!nn->nfsd_reply_cache_shrinker) + goto out_shrinker; + + nn->nfsd_reply_cache_shrinker->scan_objects = nfsd_reply_cache_scan; + nn->nfsd_reply_cache_shrinker->count_objects = nfsd_reply_cache_count; + nn->nfsd_reply_cache_shrinker->seeks = 1; + nn->nfsd_reply_cache_shrinker->private_data = nn; + + shrinker_register(nn->nfsd_reply_cache_shrinker); for (i = 0; i < hashsize; i++) { - INIT_LIST_HEAD(&drc_hashtbl[i].lru_head); - spin_lock_init(&drc_hashtbl[i].cache_lock); + INIT_LIST_HEAD(&nn->drc_hashtbl[i].lru_head); + spin_lock_init(&nn->drc_hashtbl[i].cache_lock); } - drc_hashsize = hashsize; + nn->drc_hashsize = hashsize; return 0; -out_nomem: +out_shrinker: + kvfree(nn->drc_hashtbl); printk(KERN_ERR "nfsd: failed to allocate reply cache\n"); - nfsd_reply_cache_shutdown(); return -ENOMEM; } -void nfsd_reply_cache_shutdown(void) +void nfsd_reply_cache_shutdown(struct nfsd_net *nn) { - struct svc_cacherep *rp; + struct nfsd_cacherep *rp; unsigned int i; - unregister_shrinker(&nfsd_reply_cache_shrinker); + shrinker_free(nn->nfsd_reply_cache_shrinker); - for (i = 0; i < drc_hashsize; i++) { - struct list_head *head = &drc_hashtbl[i].lru_head; + for (i = 0; i < nn->drc_hashsize; i++) { + struct list_head *head = &nn->drc_hashtbl[i].lru_head; while (!list_empty(head)) { - rp = list_first_entry(head, struct svc_cacherep, c_lru); - nfsd_reply_cache_free_locked(&drc_hashtbl[i], rp); + rp = list_first_entry(head, struct nfsd_cacherep, c_lru); + nfsd_reply_cache_free_locked(&nn->drc_hashtbl[i], + rp, nn); } } - kvfree(drc_hashtbl); - drc_hashtbl = NULL; - drc_hashsize = 0; + kvfree(nn->drc_hashtbl); + nn->drc_hashtbl = NULL; + nn->drc_hashsize = 0; - kmem_cache_destroy(drc_slab); - drc_slab = NULL; } -/* - * Move cache entry to end of LRU list, and queue the cleaner to run if it's - * not already scheduled. 
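[Editorial sketch] nfsd_reply_cache_init() sizes the per-net hash table from the entry limit (roundup_pow_of_two(limit / TARGET_BUCKET_SIZE)) and derives maskbits from it; nfsd_cache_bucket_find() later hashes the RPC XID into a bucket with hash_32(). A hedged userspace sketch of that sizing and bucket selection is below; hash32() only approximates the kernel helper and the constant and function names are illustrative.

#include <stdint.h>
#include <stdio.h>

#define TARGET_BUCKET_SIZE 8	/* average entries per bucket at max size */

/* Smallest power of two >= x (for x >= 1). */
static unsigned int roundup_pow2(unsigned int x)
{
	unsigned int r = 1;

	while (r < x)
		r <<= 1;
	return r;
}

static unsigned int ilog2_u32(unsigned int x)
{
	unsigned int r = 0;

	while (x >>= 1)
		r++;
	return r;
}

/* Multiplicative hash truncated to @bits, in the style of hash_32(). */
static uint32_t hash32(uint32_t val, unsigned int bits)
{
	if (bits == 0)
		return 0;
	return (uint32_t)(val * 0x61C88647u) >> (32 - bits);
}

/* Pick a DRC bucket for an RPC XID, given the per-net entry limit. */
static unsigned int xid_to_bucket(uint32_t xid, unsigned int max_entries)
{
	unsigned int hashsize = roundup_pow2(max_entries / TARGET_BUCKET_SIZE);
	unsigned int maskbits = ilog2_u32(hashsize);

	return hash32(xid, maskbits);
}

int main(void)
{
	printf("bucket %u of %u\n", xid_to_bucket(0x12345678u, 1024),
	       roundup_pow2(1024 / TARGET_BUCKET_SIZE));
	return 0;
}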
- */ static void -lru_put_end(struct nfsd_drc_bucket *b, struct svc_cacherep *rp) +lru_put_end(struct nfsd_drc_bucket *b, struct nfsd_cacherep *rp) { rp->c_timestamp = jiffies; list_move_tail(&rp->c_lru, &b->lru_head); } -static long -prune_bucket(struct nfsd_drc_bucket *b) +static noinline struct nfsd_drc_bucket * +nfsd_cache_bucket_find(__be32 xid, struct nfsd_net *nn) { - struct svc_cacherep *rp, *tmp; - long freed = 0; + unsigned int hash = hash_32((__force u32)xid, nn->maskbits); - list_for_each_entry_safe(rp, tmp, &b->lru_head, c_lru) { - /* - * Don't free entries attached to calls that are still - * in-progress, but do keep scanning the list. - */ - if (rp->c_state == RC_INPROG) - continue; - if (atomic_read(&num_drc_entries) <= max_drc_entries && - time_before(jiffies, rp->c_timestamp + RC_EXPIRE)) - break; - nfsd_reply_cache_free_locked(b, rp); - freed++; - } - return freed; + return &nn->drc_hashtbl[hash]; } /* - * Walk the LRU list and prune off entries that are older than RC_EXPIRE. - * Also prune the oldest ones when the total exceeds the max number of entries. + * Remove and return no more than @max expired entries in bucket @b. + * If @max is zero, do not limit the number of removed entries. */ -static long -prune_cache_entries(void) +static void +nfsd_prune_bucket_locked(struct nfsd_net *nn, struct nfsd_drc_bucket *b, + unsigned int max, struct list_head *dispose) { - unsigned int i; - long freed = 0; + unsigned long expiry = jiffies - RC_EXPIRE; + struct nfsd_cacherep *rp, *tmp; + unsigned int freed = 0; - for (i = 0; i < drc_hashsize; i++) { - struct nfsd_drc_bucket *b = &drc_hashtbl[i]; + lockdep_assert_held(&b->cache_lock); - if (list_empty(&b->lru_head)) - continue; - spin_lock(&b->cache_lock); - freed += prune_bucket(b); - spin_unlock(&b->cache_lock); + /* The bucket LRU is ordered oldest-first. */ + list_for_each_entry_safe(rp, tmp, &b->lru_head, c_lru) { + if (atomic_read(&nn->num_drc_entries) <= nn->max_drc_entries && + time_before(expiry, rp->c_timestamp)) + break; + + nfsd_cacherep_unlink_locked(nn, b, rp); + list_add(&rp->c_lru, dispose); + + if (max && ++freed > max) + break; } - return freed; } +/** + * nfsd_reply_cache_count - count_objects method for the DRC shrinker + * @shrink: our registered shrinker context + * @sc: garbage collection parameters + * + * Returns the total number of entries in the duplicate reply cache. To + * keep things simple and quick, this is not the number of expired entries + * in the cache (ie, the number that would be removed by a call to + * nfsd_reply_cache_scan). + */ static unsigned long nfsd_reply_cache_count(struct shrinker *shrink, struct shrink_control *sc) { - return atomic_read(&num_drc_entries); + struct nfsd_net *nn = shrink->private_data; + + return atomic_read(&nn->num_drc_entries); } +/** + * nfsd_reply_cache_scan - scan_objects method for the DRC shrinker + * @shrink: our registered shrinker context + * @sc: garbage collection parameters + * + * Free expired entries on each bucket's LRU list until we've released + * nr_to_scan freed objects. Nothing will be released if the cache + * has not exceeded it's max_drc_entries limit. + * + * Returns the number of entries released by this call. 
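[Editorial sketch] nfsd_prune_bucket_locked() walks the oldest-first LRU under the bucket lock, unlinks expired entries onto a dispose list, and leaves the actual freeing to the caller after the lock is dropped. A minimal sketch of that collect-then-dispose pattern with a plain singly linked list follows; struct entry and prune_expired() are hypothetical, and the expiry/limit policy is simplified.

#include <stddef.h>
#include <time.h>

struct entry {
	struct entry	*next;
	time_t		timestamp;
};

/*
 * Move up to @max expired entries (timestamp older than @expiry) from the
 * oldest-first list @head onto @dispose, so the caller can free them after
 * dropping the bucket lock.  @max == 0 means no limit.
 */
static void prune_expired(struct entry **head, struct entry **dispose,
			  time_t expiry, unsigned int max)
{
	unsigned int freed = 0;

	while (*head && (*head)->timestamp < expiry) {
		struct entry *victim = *head;

		*head = victim->next;		/* unlink from the LRU */
		victim->next = *dispose;
		*dispose = victim;		/* defer the actual free */

		if (max && ++freed >= max)
			break;
	}
}

int main(void)
{
	struct entry e2 = { NULL, 200 };
	struct entry e1 = { &e2, 100 };
	struct entry *lru = &e1, *dispose = NULL;

	prune_expired(&lru, &dispose, 150, 0);	/* expires only e1 */
	return (dispose == &e1 && lru == &e2) ? 0 : 1;
}

Collecting victims under the lock and freeing them afterwards keeps the critical section short, which is the point of the dispose-list change in the patch.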
+ */ static unsigned long nfsd_reply_cache_scan(struct shrinker *shrink, struct shrink_control *sc) { - return prune_cache_entries(); + struct nfsd_net *nn = shrink->private_data; + unsigned long freed = 0; + LIST_HEAD(dispose); + unsigned int i; + + for (i = 0; i < nn->drc_hashsize; i++) { + struct nfsd_drc_bucket *b = &nn->drc_hashtbl[i]; + + if (list_empty(&b->lru_head)) + continue; + + spin_lock(&b->cache_lock); + nfsd_prune_bucket_locked(nn, b, 0, &dispose); + spin_unlock(&b->cache_lock); + + freed += nfsd_cacherep_dispose(&dispose); + if (freed > sc->nr_to_scan) + break; + } + return freed; } -/* - * Walk an xdr_buf and get a CRC for at most the first RC_CSUMLEN bytes + +/** + * nfsd_cache_csum - Checksum incoming NFS Call arguments + * @buf: buffer containing a whole RPC Call message + * @start: starting byte of the NFS Call header + * @remaining: size of the NFS Call header, in bytes + * + * Compute a weak checksum of the leading bytes of an NFS procedure + * call header to help verify that a retransmitted Call matches an + * entry in the duplicate reply cache. + * + * To avoid assumptions about how the RPC message is laid out in + * @buf and what else it might contain (eg, a GSS MIC suffix), the + * caller passes us the exact location and length of the NFS Call + * header. + * + * Returns a 32-bit checksum value, as defined in RFC 793. */ -static __wsum -nfsd_cache_csum(struct svc_rqst *rqstp) +static __wsum nfsd_cache_csum(struct xdr_buf *buf, unsigned int start, + unsigned int remaining) { + unsigned int base, len; + struct xdr_buf subbuf; + __wsum csum = 0; + void *p; int idx; - unsigned int base; - __wsum csum; - struct xdr_buf *buf = &rqstp->rq_arg; - const unsigned char *p = buf->head[0].iov_base; - size_t csum_len = min_t(size_t, buf->head[0].iov_len + buf->page_len, - RC_CSUMLEN); - size_t len = min(buf->head[0].iov_len, csum_len); + + if (remaining > RC_CSUMLEN) + remaining = RC_CSUMLEN; + if (xdr_buf_subsegment(buf, &subbuf, start, remaining)) + return csum; /* rq_arg.head first */ - csum = csum_partial(p, len, 0); - csum_len -= len; + if (subbuf.head[0].iov_len) { + len = min_t(unsigned int, subbuf.head[0].iov_len, remaining); + csum = csum_partial(subbuf.head[0].iov_base, len, csum); + remaining -= len; + } /* Continue into page array */ - idx = buf->page_base / PAGE_SIZE; - base = buf->page_base & ~PAGE_MASK; - while (csum_len) { - p = page_address(buf->pages[idx]) + base; - len = min_t(size_t, PAGE_SIZE - base, csum_len); + idx = subbuf.page_base / PAGE_SIZE; + base = subbuf.page_base & ~PAGE_MASK; + while (remaining) { + p = page_address(subbuf.pages[idx]) + base; + len = min_t(unsigned int, PAGE_SIZE - base, remaining); csum = csum_partial(p, len, csum); - csum_len -= len; + remaining -= len; base = 0; ++idx; } @@ -334,11 +387,14 @@ nfsd_cache_csum(struct svc_rqst *rqstp) } static int -nfsd_cache_key_cmp(const struct svc_cacherep *key, const struct svc_cacherep *rp) +nfsd_cache_key_cmp(const struct nfsd_cacherep *key, + const struct nfsd_cacherep *rp, struct nfsd_net *nn) { if (key->c_key.k_xid == rp->c_key.k_xid && - key->c_key.k_csum != rp->c_key.k_csum) - ++payload_misses; + key->c_key.k_csum != rp->c_key.k_csum) { + nfsd_stats_payload_misses_inc(nn); + trace_nfsd_drc_mismatch(nn, key, rp); + } return memcmp(&key->c_key, &rp->c_key, sizeof(key->c_key)); } @@ -348,10 +404,11 @@ nfsd_cache_key_cmp(const struct svc_cacherep *key, const struct svc_cacherep *rp * Must be called with cache_lock held. Returns the found entry or * inserts an empty key on failure. 
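[Editorial sketch] nfsd_cache_csum() checksums at most RC_CSUMLEN leading bytes of the Call header, walking the head kvec first and then the page array. Below is a standalone approximation using a simple ones'-complement sum over two flat segments; csum_seg() is a stand-in written for this sketch, not the kernel's csum_partial().

#include <stdint.h>
#include <stdio.h>
#include <stddef.h>

#define CSUM_LEN 256	/* cap on checksummed bytes, like RC_CSUMLEN */

/* Ones'-complement accumulation over one flat segment. */
static uint32_t csum_seg(const unsigned char *p, size_t len, uint32_t sum)
{
	while (len >= 2) {
		sum += (uint32_t)((p[0] << 8) | p[1]);
		p += 2;
		len -= 2;
	}
	if (len)
		sum += (uint32_t)(p[0] << 8);
	while (sum >> 16)
		sum = (sum & 0xffff) + (sum >> 16);
	return sum;
}

/*
 * Checksum at most CSUM_LEN leading bytes of a message split across a
 * header segment and a payload segment, in the spirit of walking the
 * head kvec and then the page array.
 */
static uint32_t weak_csum(const unsigned char *head, size_t head_len,
			  const unsigned char *pages, size_t page_len)
{
	size_t want = CSUM_LEN, take;
	uint32_t sum = 0;

	take = head_len < want ? head_len : want;
	sum = csum_seg(head, take, sum);
	want -= take;

	take = page_len < want ? page_len : want;
	return csum_seg(pages, take, sum);
}

int main(void)
{
	static const unsigned char head[] = "RPC call header";
	static const unsigned char body[] = "NFS procedure arguments";

	printf("csum %04x\n", (unsigned int)weak_csum(head, sizeof(head) - 1,
						      body, sizeof(body) - 1));
	return 0;
}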
*/ -static struct svc_cacherep * -nfsd_cache_insert(struct nfsd_drc_bucket *b, struct svc_cacherep *key) +static struct nfsd_cacherep * +nfsd_cache_insert(struct nfsd_drc_bucket *b, struct nfsd_cacherep *key, + struct nfsd_net *nn) { - struct svc_cacherep *rp, *ret = key; + struct nfsd_cacherep *rp, *ret = key; struct rb_node **p = &b->rb_head.rb_node, *parent = NULL; unsigned int entries = 0; @@ -360,9 +417,9 @@ nfsd_cache_insert(struct nfsd_drc_bucket *b, struct svc_cacherep *key) while (*p != NULL) { ++entries; parent = *p; - rp = rb_entry(parent, struct svc_cacherep, c_node); + rp = rb_entry(parent, struct nfsd_cacherep, c_node); - cmp = nfsd_cache_key_cmp(key, rp); + cmp = nfsd_cache_key_cmp(key, rp, nn); if (cmp < 0) p = &parent->rb_left; else if (cmp > 0) @@ -376,117 +433,130 @@ nfsd_cache_insert(struct nfsd_drc_bucket *b, struct svc_cacherep *key) rb_insert_color(&key->c_node, &b->rb_head); out: /* tally hash chain length stats */ - if (entries > longest_chain) { - longest_chain = entries; - longest_chain_cachesize = atomic_read(&num_drc_entries); - } else if (entries == longest_chain) { + if (entries > nn->longest_chain) { + nn->longest_chain = entries; + nn->longest_chain_cachesize = atomic_read(&nn->num_drc_entries); + } else if (entries == nn->longest_chain) { /* prefer to keep the smallest cachesize possible here */ - longest_chain_cachesize = min_t(unsigned int, - longest_chain_cachesize, - atomic_read(&num_drc_entries)); + nn->longest_chain_cachesize = min_t(unsigned int, + nn->longest_chain_cachesize, + atomic_read(&nn->num_drc_entries)); } - - lru_put_end(b, ret); return ret; } -/* +/** + * nfsd_cache_lookup - Find an entry in the duplicate reply cache + * @rqstp: Incoming Call to find + * @start: starting byte in @rqstp->rq_arg of the NFS Call header + * @len: size of the NFS Call header, in bytes + * @cacherep: OUT: DRC entry for this request + * * Try to find an entry matching the current call in the cache. When none * is found, we try to grab the oldest expired entry off the LRU list. If * a suitable one isn't there, then drop the cache_lock and allocate a * new one, then search again in case one got inserted while this thread * didn't hold the lock. + * + * Return values: + * %RC_DOIT: Process the request normally + * %RC_REPLY: Reply from cache + * %RC_DROPIT: Do not process the request further */ -int -nfsd_cache_lookup(struct svc_rqst *rqstp) +int nfsd_cache_lookup(struct svc_rqst *rqstp, unsigned int start, + unsigned int len, struct nfsd_cacherep **cacherep) { - struct svc_cacherep *rp, *found; - __be32 xid = rqstp->rq_xid; + struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); + struct nfsd_cacherep *rp, *found; __wsum csum; - u32 hash = nfsd_cache_hash(xid); - struct nfsd_drc_bucket *b = &drc_hashtbl[hash]; + struct nfsd_drc_bucket *b; int type = rqstp->rq_cachetype; + LIST_HEAD(dispose); int rtn = RC_DOIT; - rqstp->rq_cacherep = NULL; if (type == RC_NOCACHE) { - nfsdstats.rcnocache++; - return rtn; + nfsd_stats_rc_nocache_inc(nn); + goto out; } - csum = nfsd_cache_csum(rqstp); + csum = nfsd_cache_csum(&rqstp->rq_arg, start, len); /* * Since the common case is a cache miss followed by an insert, * preallocate an entry. 
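[Editorial sketch] nfsd_cache_lookup() reports one of three outcomes: RC_DOIT on a miss, RC_DROPIT when a matching entry is still RC_INPROG, and RC_REPLY when a completed entry can be replayed. A compact sketch of just that decision, with the secure-flag and reply-type handling deliberately left out:

#include <stdio.h>

enum rc_action { RC_DOIT, RC_REPLY, RC_DROPIT };
enum rc_state  { RC_UNUSED, RC_INPROG, RC_DONE };

/*
 * The three outcomes of a duplicate reply cache lookup: process the call
 * normally, replay the cached reply, or drop the retransmission because
 * the original call is still being processed.
 */
static enum rc_action drc_decide(int found, enum rc_state state)
{
	if (!found)
		return RC_DOIT;		/* miss: execute and cache the result */
	if (state == RC_INPROG)
		return RC_DROPIT;	/* original still running: drop the dup */
	return RC_REPLY;		/* completed earlier: replay from cache */
}

int main(void)
{
	printf("%d\n", drc_decide(1, RC_INPROG) == RC_DROPIT);
	return 0;
}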
*/ - rp = nfsd_reply_cache_alloc(rqstp, csum); - if (!rp) { - dprintk("nfsd: unable to allocate DRC entry!\n"); - return rtn; - } + rp = nfsd_cacherep_alloc(rqstp, csum, nn); + if (!rp) + goto out; + b = nfsd_cache_bucket_find(rqstp->rq_xid, nn); spin_lock(&b->cache_lock); - found = nfsd_cache_insert(b, rp); - if (found != rp) { - nfsd_reply_cache_free_locked(NULL, rp); - rp = found; + found = nfsd_cache_insert(b, rp, nn); + if (found != rp) goto found_entry; - } - - nfsdstats.rcmisses++; - rqstp->rq_cacherep = rp; + *cacherep = rp; rp->c_state = RC_INPROG; + nfsd_prune_bucket_locked(nn, b, 3, &dispose); + spin_unlock(&b->cache_lock); - atomic_inc(&num_drc_entries); - drc_mem_usage += sizeof(*rp); + nfsd_cacherep_dispose(&dispose); - /* go ahead and prune the cache */ - prune_bucket(b); - out: - spin_unlock(&b->cache_lock); - return rtn; + nfsd_stats_rc_misses_inc(nn); + atomic_inc(&nn->num_drc_entries); + nfsd_stats_drc_mem_usage_add(nn, sizeof(*rp)); + goto out; found_entry: /* We found a matching entry which is either in progress or done. */ - nfsdstats.rchits++; + nfsd_reply_cache_free_locked(NULL, rp, nn); + nfsd_stats_rc_hits_inc(nn); rtn = RC_DROPIT; + rp = found; /* Request being processed */ if (rp->c_state == RC_INPROG) - goto out; + goto out_trace; /* From the hall of fame of impractical attacks: * Is this a user who tries to snoop on the cache? */ rtn = RC_DOIT; if (!test_bit(RQ_SECURE, &rqstp->rq_flags) && rp->c_secure) - goto out; + goto out_trace; /* Compose RPC reply header */ switch (rp->c_type) { case RC_NOCACHE: break; case RC_REPLSTAT: - svc_putu32(&rqstp->rq_res.head[0], rp->c_replstat); + xdr_stream_encode_be32(&rqstp->rq_res_stream, rp->c_replstat); rtn = RC_REPLY; break; case RC_REPLBUFF: if (!nfsd_cache_append(rqstp, &rp->c_replvec)) - goto out; /* should not happen */ + goto out_unlock; /* should not happen */ rtn = RC_REPLY; break; default: - printk(KERN_WARNING "nfsd: bad repcache type %d\n", rp->c_type); - nfsd_reply_cache_free_locked(b, rp); + WARN_ONCE(1, "nfsd: bad repcache type %d\n", rp->c_type); } - goto out; +out_trace: + trace_nfsd_drc_found(nn, rqstp, rtn); +out_unlock: + spin_unlock(&b->cache_lock); +out: + return rtn; } -/* - * Update a cache entry. This is called from nfsd_dispatch when - * the procedure has been executed and the complete reply is in - * rqstp->rq_res. +/** + * nfsd_cache_update - Update an entry in the duplicate reply cache. + * @rqstp: svc_rqst with a finished Reply + * @rp: IN: DRC entry for this request + * @cachetype: which cache to update + * @statp: pointer to Reply's NFS status code, or NULL + * + * This is called from nfsd_dispatch when the procedure has been + * executed and the complete reply is in rqstp->rq_res. * * We're copying around data here rather than swapping buffers because * the toplevel loop requires max-sized buffers, which would be a waste @@ -499,12 +569,11 @@ found_entry: * nfsd failed to encode a reply that otherwise would have been cached. * In this case, nfsd_cache_update is called with statp == NULL. 
*/ -void -nfsd_cache_update(struct svc_rqst *rqstp, int cachetype, __be32 *statp) +void nfsd_cache_update(struct svc_rqst *rqstp, struct nfsd_cacherep *rp, + int cachetype, __be32 *statp) { - struct svc_cacherep *rp = rqstp->rq_cacherep; + struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); struct kvec *resv = &rqstp->rq_res.head[0], *cachv; - u32 hash; struct nfsd_drc_bucket *b; int len; size_t bufsize = 0; @@ -512,15 +581,14 @@ nfsd_cache_update(struct svc_rqst *rqstp, int cachetype, __be32 *statp) if (!rp) return; - hash = nfsd_cache_hash(rp->c_key.k_xid); - b = &drc_hashtbl[hash]; + b = nfsd_cache_bucket_find(rp->c_key.k_xid, nn); len = resv->iov_len - ((char*)statp - (char*)resv->iov_base); len >>= 2; /* Don't cache excessive amounts of data and XDR failures */ if (!statp || len > (256 >> 2)) { - nfsd_reply_cache_free(b, rp); + nfsd_reply_cache_free(b, rp, nn); return; } @@ -535,18 +603,18 @@ nfsd_cache_update(struct svc_rqst *rqstp, int cachetype, __be32 *statp) bufsize = len << 2; cachv->iov_base = kmalloc(bufsize, GFP_KERNEL); if (!cachv->iov_base) { - nfsd_reply_cache_free(b, rp); + nfsd_reply_cache_free(b, rp, nn); return; } cachv->iov_len = bufsize; memcpy(cachv->iov_base, statp, bufsize); break; case RC_NOCACHE: - nfsd_reply_cache_free(b, rp); + nfsd_reply_cache_free(b, rp, nn); return; } spin_lock(&b->cache_lock); - drc_mem_usage += bufsize; + nfsd_stats_drc_mem_usage_add(nn, bufsize); lru_put_end(b, rp); rp->c_secure = test_bit(RQ_SECURE, &rqstp->rq_flags); rp->c_type = cachetype; @@ -555,24 +623,17 @@ nfsd_cache_update(struct svc_rqst *rqstp, int cachetype, __be32 *statp) return; } -/* - * Copy cached reply to current reply buffer. Should always fit. - * FIXME as reply is in a page, we should just attach the page, and - * keep a refcount.... - */ static int nfsd_cache_append(struct svc_rqst *rqstp, struct kvec *data) { - struct kvec *vec = &rqstp->rq_res.head[0]; - - if (vec->iov_len + data->iov_len > PAGE_SIZE) { - printk(KERN_WARNING "nfsd: cached reply too large (%zd).\n", - data->iov_len); - return 0; - } - memcpy((char*)vec->iov_base + vec->iov_len, data->iov_base, data->iov_len); - vec->iov_len += data->iov_len; - return 1; + __be32 *p; + + p = xdr_reserve_space(&rqstp->rq_res_stream, data->iov_len); + if (unlikely(!p)) + return false; + memcpy(p, data->iov_base, data->iov_len); + xdr_commit_encode(&rqstp->rq_res_stream); + return true; } /* @@ -580,23 +641,26 @@ nfsd_cache_append(struct svc_rqst *rqstp, struct kvec *data) * scraping this file for info should test the labels to ensure they're * getting the correct field. 
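[Editorial sketch] nfsd_cache_update() caches either the single status word (RC_REPLSTAT) or a small copy of the reply (RC_REPLBUFF), and refuses to cache oversized replies or XDR failures. A simplified userspace sketch of that policy follows; struct cached_reply is hypothetical and the cut-off is expressed in bytes rather than XDR words.

#include <stdlib.h>
#include <string.h>

enum cache_type { RC_NOCACHE, RC_REPLSTAT, RC_REPLBUFF };

struct cached_reply {
	enum cache_type	type;
	unsigned int	status;		/* RC_REPLSTAT: the single status word */
	void		*buf;		/* RC_REPLBUFF: copy of the short reply */
	size_t		buflen;
};

/*
 * Keep either the status word or a small copy of the reply; anything
 * larger than the cut-off is simply not cached.
 */
static int cache_reply(struct cached_reply *rp, enum cache_type type,
		       unsigned int status, const void *reply, size_t len)
{
	if (len > 256)
		return -1;		/* too big or bogus: do not cache */

	rp->type = type;
	switch (type) {
	case RC_REPLSTAT:
		rp->status = status;
		break;
	case RC_REPLBUFF:
		rp->buf = malloc(len);
		if (!rp->buf)
			return -1;
		memcpy(rp->buf, reply, len);
		rp->buflen = len;
		break;
	case RC_NOCACHE:
		break;
	}
	return 0;
}

int main(void)
{
	struct cached_reply rp = { RC_NOCACHE, 0, NULL, 0 };

	return cache_reply(&rp, RC_REPLSTAT, 0, NULL, 4);
}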
*/ -static int nfsd_reply_cache_stats_show(struct seq_file *m, void *v) +int nfsd_reply_cache_stats_show(struct seq_file *m, void *v) { - seq_printf(m, "max entries: %u\n", max_drc_entries); + struct nfsd_net *nn = net_generic(file_inode(m->file)->i_sb->s_fs_info, + nfsd_net_id); + + seq_printf(m, "max entries: %u\n", nn->max_drc_entries); seq_printf(m, "num entries: %u\n", - atomic_read(&num_drc_entries)); - seq_printf(m, "hash buckets: %u\n", 1 << maskbits); - seq_printf(m, "mem usage: %u\n", drc_mem_usage); - seq_printf(m, "cache hits: %u\n", nfsdstats.rchits); - seq_printf(m, "cache misses: %u\n", nfsdstats.rcmisses); - seq_printf(m, "not cached: %u\n", nfsdstats.rcnocache); - seq_printf(m, "payload misses: %u\n", payload_misses); - seq_printf(m, "longest chain len: %u\n", longest_chain); - seq_printf(m, "cachesize at longest: %u\n", longest_chain_cachesize); + atomic_read(&nn->num_drc_entries)); + seq_printf(m, "hash buckets: %u\n", 1 << nn->maskbits); + seq_printf(m, "mem usage: %lld\n", + percpu_counter_sum_positive(&nn->counter[NFSD_STATS_DRC_MEM_USAGE])); + seq_printf(m, "cache hits: %lld\n", + percpu_counter_sum_positive(&nn->counter[NFSD_STATS_RC_HITS])); + seq_printf(m, "cache misses: %lld\n", + percpu_counter_sum_positive(&nn->counter[NFSD_STATS_RC_MISSES])); + seq_printf(m, "not cached: %lld\n", + percpu_counter_sum_positive(&nn->counter[NFSD_STATS_RC_NOCACHE])); + seq_printf(m, "payload misses: %lld\n", + percpu_counter_sum_positive(&nn->counter[NFSD_STATS_PAYLOAD_MISSES])); + seq_printf(m, "longest chain len: %u\n", nn->longest_chain); + seq_printf(m, "cachesize at longest: %u\n", nn->longest_chain_cachesize); return 0; } - -int nfsd_reply_cache_stats_open(struct inode *inode, struct file *file) -{ - return single_open(file, nfsd_reply_cache_stats_show, NULL); -} diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index b33f9785b756..5ce9a49e76ba 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Syscall interface to knfsd. * @@ -7,14 +8,17 @@ #include <linux/slab.h> #include <linux/namei.h> #include <linux/ctype.h> +#include <linux/fs_context.h> #include <linux/sunrpc/svcsock.h> #include <linux/lockd/lockd.h> #include <linux/sunrpc/addr.h> #include <linux/sunrpc/gss_api.h> -#include <linux/sunrpc/gss_krb5_enctypes.h> #include <linux/sunrpc/rpc_pipe_fs.h> +#include <linux/sunrpc/svc.h> #include <linux/module.h> +#include <linux/fsnotify.h> +#include <linux/nfslocalio.h> #include "idmap.h" #include "nfsd.h" @@ -22,6 +26,9 @@ #include "state.h" #include "netns.h" #include "pnfs.h" +#include "filecache.h" +#include "trace.h" +#include "netlink.h" /* * We have a single directory with several nodes in it. @@ -29,6 +36,7 @@ enum { NFSD_Root = 1, NFSD_List, + NFSD_Export_Stats, NFSD_Export_features, NFSD_Fh, NFSD_FO_UnlockIP, @@ -40,18 +48,12 @@ enum { NFSD_Versions, NFSD_Ports, NFSD_MaxBlkSize, - NFSD_MaxConnections, - NFSD_SupportedEnctypes, - /* - * The below MUST come last. 
Otherwise we leave a hole in nfsd_files[] - * with !CONFIG_NFSD_V4 and simple_fill_super() goes oops - */ -#ifdef CONFIG_NFSD_V4 + NFSD_Filecache, NFSD_Leasetime, NFSD_Gracetime, NFSD_RecoveryDir, NFSD_V4EndGrace, -#endif + NFSD_MaxReserved }; /* @@ -65,11 +67,12 @@ static ssize_t write_pool_threads(struct file *file, char *buf, size_t size); static ssize_t write_versions(struct file *file, char *buf, size_t size); static ssize_t write_ports(struct file *file, char *buf, size_t size); static ssize_t write_maxblksize(struct file *file, char *buf, size_t size); -static ssize_t write_maxconn(struct file *file, char *buf, size_t size); #ifdef CONFIG_NFSD_V4 static ssize_t write_leasetime(struct file *file, char *buf, size_t size); static ssize_t write_gracetime(struct file *file, char *buf, size_t size); +#ifdef CONFIG_NFSD_LEGACY_CLIENT_TRACKING static ssize_t write_recoverydir(struct file *file, char *buf, size_t size); +#endif static ssize_t write_v4_end_grace(struct file *file, char *buf, size_t size); #endif @@ -82,11 +85,12 @@ static ssize_t (*const write_op[])(struct file *, char *, size_t) = { [NFSD_Versions] = write_versions, [NFSD_Ports] = write_ports, [NFSD_MaxBlkSize] = write_maxblksize, - [NFSD_MaxConnections] = write_maxconn, #ifdef CONFIG_NFSD_V4 [NFSD_Leasetime] = write_leasetime, [NFSD_Gracetime] = write_gracetime, +#ifdef CONFIG_NFSD_LEGACY_CLIENT_TRACKING [NFSD_RecoveryDir] = write_recoverydir, +#endif [NFSD_V4EndGrace] = write_v4_end_grace, #endif }; @@ -104,12 +108,12 @@ static ssize_t nfsctl_transaction_write(struct file *file, const char __user *bu if (IS_ERR(data)) return PTR_ERR(data); - rv = write_op[ino](file, data, size); - if (rv >= 0) { - simple_transaction_set(file, rv); - rv = size; - } - return rv; + rv = write_op[ino](file, data, size); + if (rv < 0) + return rv; + + simple_transaction_set(file, rv); + return size; } static ssize_t nfsctl_transaction_read(struct file *file, char __user *buf, size_t size, loff_t *pos) @@ -148,18 +152,6 @@ static int exports_net_open(struct net *net, struct file *file) return 0; } -static int exports_proc_open(struct inode *inode, struct file *file) -{ - return exports_net_open(current->nsproxy->net_ns, file); -} - -static const struct file_operations exports_proc_operations = { - .open = exports_proc_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - static int exports_nfsd_open(struct inode *inode, struct file *file) { return exports_net_open(inode->i_sb->s_fs_info, file); @@ -178,51 +170,25 @@ static int export_features_show(struct seq_file *m, void *v) return 0; } -static int export_features_open(struct inode *inode, struct file *file) -{ - return single_open(file, export_features_show, NULL); -} - -static const struct file_operations export_features_operations = { - .open = export_features_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; +DEFINE_SHOW_ATTRIBUTE(export_features); -#if defined(CONFIG_SUNRPC_GSS) || defined(CONFIG_SUNRPC_GSS_MODULE) -static int supported_enctypes_show(struct seq_file *m, void *v) +static int nfsd_pool_stats_open(struct inode *inode, struct file *file) { - seq_printf(m, KRB5_SUPPORTED_ENCTYPES); - return 0; -} + struct nfsd_net *nn = net_generic(inode->i_sb->s_fs_info, nfsd_net_id); -static int supported_enctypes_open(struct inode *inode, struct file *file) -{ - return single_open(file, supported_enctypes_show, NULL); + return svc_pool_stats_open(&nn->nfsd_info, file); } -static const struct file_operations supported_enctypes_ops = { 
- .open = supported_enctypes_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; -#endif /* CONFIG_SUNRPC_GSS or CONFIG_SUNRPC_GSS_MODULE */ - static const struct file_operations pool_stats_operations = { .open = nfsd_pool_stats_open, .read = seq_read, .llseek = seq_lseek, - .release = nfsd_pool_stats_release, + .release = seq_release, }; -static const struct file_operations reply_cache_stats_operations = { - .open = nfsd_reply_cache_stats_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; +DEFINE_SHOW_ATTRIBUTE(nfsd_reply_cache_stats); + +DEFINE_SHOW_ATTRIBUTE(nfsd_file_cache_stats); /*----------------------------------------------------------------------------*/ /* @@ -234,7 +200,7 @@ static inline struct net *netns(struct file *file) return file_inode(file)->i_sb->s_fs_info; } -/** +/* * write_unlock_ip - Release all locks used by a client * * Experimental. @@ -270,10 +236,11 @@ static ssize_t write_unlock_ip(struct file *file, char *buf, size_t size) if (rpc_pton(net, fo_path, size, sap, salen) == 0) return -EINVAL; + trace_nfsd_ctl_unlock_ip(net, buf); return nlmsvc_unlock_all_by_ip(sap); } -/** +/* * write_unlock_fs - Release all locks on a local file system * * Experimental. @@ -303,7 +270,7 @@ static ssize_t write_unlock_fs(struct file *file, char *buf, size_t size) fo_path = buf; if (qword_get(&buf, fo_path, size) < 0) return -EINVAL; - + trace_nfsd_ctl_unlock_fs(netns(file), fo_path); error = kern_path(fo_path, 0, &path); if (error) return error; @@ -318,12 +285,13 @@ static ssize_t write_unlock_fs(struct file *file, char *buf, size_t size) * 3. Is that directory the root of an exported file system? */ error = nlmsvc_unlock_all_by_sb(path.dentry->d_sb); + nfsd4_revoke_states(netns(file), path.dentry->d_sb); path_put(&path); return error; } -/** +/* * write_filehandle - Get a variable-length NFS file handle by path * * On input, the buffer contains a '\n'-terminated C string comprised of @@ -347,7 +315,7 @@ static ssize_t write_unlock_fs(struct file *file, char *buf, size_t size) static ssize_t write_filehandle(struct file *file, char *buf, size_t size) { char *dname, *path; - int uninitialized_var(maxsize); + int maxsize; char *mesg = buf; int len; struct auth_domain *dom; @@ -364,7 +332,7 @@ static ssize_t write_filehandle(struct file *file, char *buf, size_t size) len = qword_get(&mesg, dname, size); if (len <= 0) return -EINVAL; - + path = dname+len+1; len = qword_get(&mesg, path, size); if (len <= 0) @@ -378,27 +346,29 @@ static ssize_t write_filehandle(struct file *file, char *buf, size_t size) return -EINVAL; maxsize = min(maxsize, NFS3_FHSIZE); - if (qword_get(&mesg, mesg, size)>0) + if (qword_get(&mesg, mesg, size) > 0) return -EINVAL; + trace_nfsd_ctl_filehandle(netns(file), dname, path, maxsize); + /* we have all the words, they are in buf.. 
*/ dom = unix_domain_find(dname); if (!dom) return -ENOMEM; - len = exp_rootfh(netns(file), dom, path, &fh, maxsize); + len = exp_rootfh(netns(file), dom, path, &fh, maxsize); auth_domain_put(dom); if (len) return len; - + mesg = buf; len = SIMPLE_TRANSACTION_LIMIT; - qword_addhex(&mesg, &len, (char*)&fh.fh_base, fh.fh_size); + qword_addhex(&mesg, &len, fh.fh_raw, fh.fh_size); mesg[-1] = '\n'; - return mesg - buf; + return mesg - buf; } -/** +/* * write_threads - Start NFSD, or report the current number of running threads * * Input: @@ -439,7 +409,10 @@ static ssize_t write_threads(struct file *file, char *buf, size_t size) return rv; if (newthreads < 0) return -EINVAL; - rv = nfsd_svc(newthreads, net); + trace_nfsd_ctl_threads(net, newthreads); + mutex_lock(&nfsd_mutex); + rv = nfsd_svc(1, &newthreads, net, file->f_cred, NULL); + mutex_unlock(&nfsd_mutex); if (rv < 0) return rv; } else @@ -448,7 +421,7 @@ static ssize_t write_threads(struct file *file, char *buf, size_t size) return scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, "%d\n", rv); } -/** +/* * write_pool_threads - Set or report the current number of threads per pool * * Input: @@ -458,8 +431,8 @@ static ssize_t write_threads(struct file *file, char *buf, size_t size) * OR * * Input: - * buf: C string containing whitespace- - * separated unsigned integer values + * buf: C string containing whitespace- + * separated unsigned integer values * representing the number of NFSD * threads to start in each pool * size: non-zero length of C string in @buf @@ -511,7 +484,16 @@ static ssize_t write_pool_threads(struct file *file, char *buf, size_t size) rv = -EINVAL; if (nthreads[i] < 0) goto out_free; + trace_nfsd_ctl_pool_threads(net, i, nthreads[i]); } + + /* + * There must always be a thread in pool 0; the admin + * can't shut down NFS completely using pool_threads. + */ + if (nthreads[0] == 0) + nthreads[0] = 1; + rv = nfsd_set_nrthreads(i, nthreads, net); if (rv) goto out_free; @@ -537,14 +519,14 @@ out_free: } static ssize_t -nfsd_print_version_support(char *buf, int remaining, const char *sep, - unsigned vers, int minor) +nfsd_print_version_support(struct nfsd_net *nn, char *buf, int remaining, + const char *sep, unsigned vers, int minor) { const char *format = minor < 0 ? "%s%c%u" : "%s%c%u.%u"; - bool supported = !!nfsd_vers(vers, NFSD_TEST); + bool supported = !!nfsd_vers(nn, vers, NFSD_TEST); if (vers == 4 && minor >= 0 && - !nfsd_minorversion(minor, NFSD_TEST)) + !nfsd_minorversion(nn, minor, NFSD_TEST)) supported = false; if (minor == 0 && supported) /* @@ -566,7 +548,7 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size) char *sep; struct nfsd_net *nn = net_generic(netns(file), nfsd_net_id); - if (size>0) { + if (size > 0) { if (nn->nfsd_serv) /* Cannot change versions without updating * nn->nfsd_serv->sv_xdrsize, and reallocing @@ -576,6 +558,7 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size) if (buf[size-1] != '\n') return -EINVAL; buf[size-1] = 0; + trace_nfsd_ctl_version(netns(file), buf); vers = mesg; len = qword_get(&mesg, vers, size); @@ -597,48 +580,51 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size) cmd = sign == '-' ? 
NFSD_CLEAR : NFSD_SET; switch(num) { +#ifdef CONFIG_NFSD_V2 case 2: +#endif case 3: - nfsd_vers(num, cmd); + nfsd_vers(nn, num, cmd); break; case 4: if (*minorp == '.') { - if (nfsd_minorversion(minor, cmd) < 0) + if (nfsd_minorversion(nn, minor, cmd) < 0) return -EINVAL; - } else if ((cmd == NFSD_SET) != nfsd_vers(num, NFSD_TEST)) { + } else if ((cmd == NFSD_SET) != nfsd_vers(nn, num, NFSD_TEST)) { /* * Either we have +4 and no minors are enabled, * or we have -4 and at least one minor is enabled. * In either case, propagate 'cmd' to all minors. */ minor = 0; - while (nfsd_minorversion(minor, cmd) >= 0) + while (nfsd_minorversion(nn, minor, cmd) >= 0) minor++; } break; default: - return -EINVAL; + /* Ignore requests to disable non-existent versions */ + if (cmd == NFSD_SET) + return -EINVAL; } vers += len + 1; } while ((len = qword_get(&mesg, vers, size)) > 0); /* If all get turned off, turn them back on, as * having no versions is BAD */ - nfsd_reset_versions(); + nfsd_reset_versions(nn); } /* Now write current state into reply buffer */ - len = 0; sep = ""; remaining = SIMPLE_TRANSACTION_LIMIT; for (num=2 ; num <= 4 ; num++) { int minor; - if (!nfsd_vers(num, NFSD_AVAIL)) + if (!nfsd_vers(nn, num, NFSD_AVAIL)) continue; minor = -1; do { - len = nfsd_print_version_support(buf, remaining, + len = nfsd_print_version_support(nn, buf, remaining, sep, num, minor); if (len >= remaining) goto out; @@ -657,7 +643,7 @@ out: return tlen + len; } -/** +/* * write_versions - Set or report the available NFS protocol versions * * Input: @@ -674,11 +660,11 @@ out: * OR * * Input: - * buf: C string containing whitespace- - * separated positive or negative - * integer values representing NFS - * protocol versions to enable ("+n") - * or disable ("-n") + * buf: C string containing whitespace- + * separated positive or negative + * integer values representing NFS + * protocol versions to enable ("+n") + * or disable ("-n") * size: non-zero length of C string in @buf * Output: * On success: status of zero or more protocol versions has @@ -717,78 +703,76 @@ static ssize_t __write_ports_names(char *buf, struct net *net) * a socket of a supported family/protocol, and we use it as an * nfsd listener. */ -static ssize_t __write_ports_addfd(char *buf, struct net *net) +static ssize_t __write_ports_addfd(char *buf, struct net *net, const struct cred *cred) { char *mesg = buf; int fd, err; struct nfsd_net *nn = net_generic(net, nfsd_net_id); + struct svc_serv *serv; err = get_int(&mesg, &fd); if (err != 0 || fd < 0) return -EINVAL; - - if (svc_alien_sock(net, fd)) { - printk(KERN_ERR "%s: socket net is different to NFSd's one\n", __func__); - return -EINVAL; - } + trace_nfsd_ctl_ports_addfd(net, fd); err = nfsd_create_serv(net); if (err != 0) return err; - err = svc_addsock(nn->nfsd_serv, fd, buf, SIMPLE_TRANSACTION_LIMIT); - if (err < 0) { - nfsd_destroy(net); - return err; - } + serv = nn->nfsd_serv; + err = svc_addsock(serv, net, fd, buf, SIMPLE_TRANSACTION_LIMIT, cred); + + if (!serv->sv_nrthreads && list_empty(&nn->nfsd_serv->sv_permsocks)) + nfsd_destroy_serv(net); - /* Decrease the count, but don't shut down the service */ - nn->nfsd_serv->sv_nrthreads--; return err; } /* - * A transport listener is added by writing it's transport name and + * A transport listener is added by writing its transport name and * a port number. 
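[Editorial sketch] __write_versions() parses whitespace-separated tokens such as "+3", "-4.1" or "+4.2": a sign selecting NFSD_SET or NFSD_CLEAR, a major version, and an optional minor. A standalone sketch of that token format using only the C library is shown below; parse_version_token() is hypothetical and does not reproduce the kernel's qword_get() handling.

#include <stdio.h>
#include <stdlib.h>

/*
 * Parse one token of the nfsd "versions" control string: a sign, a major
 * version, and an optional ".minor".  Returns 0 on success, -1 on a
 * malformed token.  @minor is set to -1 when no minor is given.
 */
static int parse_version_token(const char *tok, int *enable,
			       unsigned int *major, int *minor)
{
	char *end;

	if (tok[0] != '+' && tok[0] != '-')
		return -1;
	*enable = (tok[0] == '+');

	*major = (unsigned int)strtoul(tok + 1, &end, 10);
	if (end == tok + 1)
		return -1;

	if (*end == '.') {
		*minor = (int)strtol(end + 1, &end, 10);
		if (*end != '\0')
			return -1;
	} else if (*end == '\0') {
		*minor = -1;	/* no minor given */
	} else {
		return -1;
	}
	return 0;
}

int main(void)
{
	int enable, minor;
	unsigned int major;

	if (parse_version_token("-4.1", &enable, &major, &minor) == 0)
		printf("%s %u minor %d\n", enable ? "set" : "clear",
		       major, minor);
	return 0;
}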
*/ -static ssize_t __write_ports_addxprt(char *buf, struct net *net) +static ssize_t __write_ports_addxprt(char *buf, struct net *net, const struct cred *cred) { char transport[16]; struct svc_xprt *xprt; int port, err; struct nfsd_net *nn = net_generic(net, nfsd_net_id); + struct svc_serv *serv; if (sscanf(buf, "%15s %5u", transport, &port) != 2) return -EINVAL; if (port < 1 || port > USHRT_MAX) return -EINVAL; + trace_nfsd_ctl_ports_addxprt(net, transport, port); err = nfsd_create_serv(net); if (err != 0) return err; - err = svc_create_xprt(nn->nfsd_serv, transport, net, - PF_INET, port, SVC_SOCK_ANONYMOUS); + serv = nn->nfsd_serv; + err = svc_xprt_create(serv, transport, net, + PF_INET, port, SVC_SOCK_ANONYMOUS, cred); if (err < 0) goto out_err; - err = svc_create_xprt(nn->nfsd_serv, transport, net, - PF_INET6, port, SVC_SOCK_ANONYMOUS); + err = svc_xprt_create(serv, transport, net, + PF_INET6, port, SVC_SOCK_ANONYMOUS, cred); if (err < 0 && err != -EAFNOSUPPORT) goto out_close; - /* Decrease the count, but don't shut down the service */ - nn->nfsd_serv->sv_nrthreads--; return 0; out_close: - xprt = svc_find_xprt(nn->nfsd_serv, transport, net, PF_INET, port); + xprt = svc_find_xprt(serv, transport, net, PF_INET, port); if (xprt != NULL) { - svc_close_xprt(xprt); + svc_xprt_close(xprt); svc_xprt_put(xprt); } out_err: - nfsd_destroy(net); + if (!serv->sv_nrthreads && list_empty(&nn->nfsd_serv->sv_permsocks)) + nfsd_destroy_serv(net); + return err; } @@ -799,15 +783,15 @@ static ssize_t __write_ports(struct file *file, char *buf, size_t size, return __write_ports_names(buf, net); if (isdigit(buf[0])) - return __write_ports_addfd(buf, net); + return __write_ports_addfd(buf, net, file->f_cred); if (isalpha(buf[0])) - return __write_ports_addxprt(buf, net); + return __write_ports_addxprt(buf, net, file->f_cred); return -EINVAL; } -/** +/* * write_ports - Pass a socket file descriptor or transport name to listen on * * Input: @@ -863,7 +847,7 @@ static ssize_t write_ports(struct file *file, char *buf, size_t size) int nfsd_max_blksize; -/** +/* * write_maxblksize - Set or report the current NFS blksize * * Input: @@ -873,9 +857,9 @@ int nfsd_max_blksize; * OR * * Input: - * buf: C string containing an unsigned - * integer value representing the new - * NFS blksize + * buf: C string containing an unsigned + * integer value representing the new + * NFS blksize * size: non-zero length of C string in @buf * Output: * On success: passed-in buffer filled with '\n'-terminated C string @@ -894,6 +878,8 @@ static ssize_t write_maxblksize(struct file *file, char *buf, size_t size) int rv = get_int(&mesg, &bsize); if (rv) return rv; + trace_nfsd_ctl_maxblksize(netns(file), bsize); + /* force bsize into allowed range and * required alignment. 
*/ @@ -913,47 +899,11 @@ static ssize_t write_maxblksize(struct file *file, char *buf, size_t size) nfsd_max_blksize); } -/** - * write_maxconn - Set or report the current max number of connections - * - * Input: - * buf: ignored - * size: zero - * OR - * - * Input: - * buf: C string containing an unsigned - * integer value representing the new - * number of max connections - * size: non-zero length of C string in @buf - * Output: - * On success: passed-in buffer filled with '\n'-terminated C string - * containing numeric value of max_connections setting - * for this net namespace; - * return code is the size in bytes of the string - * On error: return code is zero or a negative errno value - */ -static ssize_t write_maxconn(struct file *file, char *buf, size_t size) -{ - char *mesg = buf; - struct nfsd_net *nn = net_generic(netns(file), nfsd_net_id); - unsigned int maxconn = nn->max_connections; - - if (size > 0) { - int rv = get_uint(&mesg, &maxconn); - - if (rv) - return rv; - nn->max_connections = maxconn; - } - - return scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, "%u\n", maxconn); -} - #ifdef CONFIG_NFSD_V4 static ssize_t __nfsd4_write_time(struct file *file, char *buf, size_t size, - time_t *time, struct nfsd_net *nn) + time64_t *time, struct nfsd_net *nn) { + struct dentry *dentry = file_dentry(file); char *mesg = buf; int rv, i; @@ -963,6 +913,9 @@ static ssize_t __nfsd4_write_time(struct file *file, char *buf, size_t size, rv = get_int(&mesg, &i); if (rv) return rv; + trace_nfsd_ctl_time(netns(file), dentry->d_name.name, + dentry->d_name.len, i); + /* * Some sanity checking. We don't have a reason for * these particular numbers, but problems with the @@ -980,11 +933,11 @@ static ssize_t __nfsd4_write_time(struct file *file, char *buf, size_t size, *time = i; } - return scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, "%ld\n", *time); + return scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, "%lld\n", *time); } static ssize_t nfsd4_write_time(struct file *file, char *buf, size_t size, - time_t *time, struct nfsd_net *nn) + time64_t *time, struct nfsd_net *nn) { ssize_t rv; @@ -994,7 +947,7 @@ static ssize_t nfsd4_write_time(struct file *file, char *buf, size_t size, return rv; } -/** +/* * write_leasetime - Set or report the current NFSv4 lease time * * Input: @@ -1021,7 +974,7 @@ static ssize_t write_leasetime(struct file *file, char *buf, size_t size) return nfsd4_write_time(file, buf, size, &nn->nfsd4_lease, nn); } -/** +/* * write_gracetime - Set or report current NFSv4 grace period time * * As above, but sets the time of the NFSv4 grace period. 
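
The control-file handlers reworked in the hunks above (versions, portlist, max_block_size, nfsv4leasetime, nfsv4gracetime) all share the same transaction convention: userspace writes a request string to the file and reads the resulting setting back on the same descriptor. The C sketch below is illustrative only and is not part of this patch; the /proc/fs/nfsd mount point and the write-then-read-back behaviour are assumptions based on the simple_transaction interface these handlers are built on.

/*
 * Illustrative user-space sketch (not part of this patch): driving the
 * transaction-style nfsd control files. Assumes the nfsd filesystem is
 * mounted at its conventional /proc/fs/nfsd location and that a read on
 * the same descriptor after a write returns the handler's reply, per
 * the simple_transaction convention these files use.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static int nfsd_ctl_transaction(const char *file, const char *request,
                                char *reply, size_t replylen)
{
        char path[256];
        ssize_t n;
        int fd;

        snprintf(path, sizeof(path), "/proc/fs/nfsd/%s", file);
        fd = open(path, O_RDWR);
        if (fd < 0)
                return -1;
        if (write(fd, request, strlen(request)) < 0) {
                close(fd);
                return -1;
        }
        /* Read back on the same fd to get the '\n'-terminated reply. */
        n = read(fd, reply, replylen - 1);
        close(fd);
        if (n < 0)
                return -1;
        reply[n] = '\0';
        return 0;
}

int main(void)
{
        char reply[128];

        /* Enable v3 and v4.2, disable v2, then request a 90 second lease. */
        if (!nfsd_ctl_transaction("versions", "-2 +3 +4 +4.2", reply, sizeof(reply)))
                printf("versions: %s", reply);
        if (!nfsd_ctl_transaction("nfsv4leasetime", "90", reply, sizeof(reply)))
                printf("lease: %s", reply);
        return 0;
}

Writing "-2 +3 +4 +4.2" to versions, for instance, exercises the "+n"/"-n" grammar parsed in the write_versions hunk above; the handler replies with the resulting version list, and typically fails with -EBUSY while nfsd threads are already running. The nfsv4leasetime write goes through the __nfsd4_write_time path shown here, which range-checks the value before storing it as a time64_t.
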
@@ -1037,6 +990,7 @@ static ssize_t write_gracetime(struct file *file, char *buf, size_t size) return nfsd4_write_time(file, buf, size, &nn->nfsd4_grace, nn); } +#ifdef CONFIG_NFSD_LEGACY_CLIENT_TRACKING static ssize_t __write_recoverydir(struct file *file, char *buf, size_t size, struct nfsd_net *nn) { @@ -1055,6 +1009,7 @@ static ssize_t __write_recoverydir(struct file *file, char *buf, size_t size, len = qword_get(&mesg, recdir, size); if (len <= 0) return -EINVAL; + trace_nfsd_ctl_recoverydir(netns(file), recdir); status = nfs4_reset_recoverydir(recdir); if (status) @@ -1065,7 +1020,7 @@ static ssize_t __write_recoverydir(struct file *file, char *buf, size_t size, nfs4_recoverydir()); } -/** +/* * write_recoverydir - Set or report the pathname of the recovery directory * * Input: @@ -1096,8 +1051,9 @@ static ssize_t write_recoverydir(struct file *file, char *buf, size_t size) mutex_unlock(&nfsd_mutex); return rv; } +#endif -/** +/* * write_v4_end_grace - release grace period for nfsd's v4.x lock manager * * Input: @@ -1106,7 +1062,7 @@ static ssize_t write_recoverydir(struct file *file, char *buf, size_t size) * OR * * Input: - * buf: any value + * buf: any value * size: non-zero length of C string in @buf * Output: * passed-in buffer filled with "Y" or "N" with a newline @@ -1126,8 +1082,9 @@ static ssize_t write_v4_end_grace(struct file *file, char *buf, size_t size) case 'Y': case 'y': case '1': - if (nn->nfsd_serv) + if (!nn->nfsd_serv) return -EBUSY; + trace_nfsd_end_grace(netns(file)); nfsd4_end_grace(nn); break; default: @@ -1146,12 +1103,182 @@ static ssize_t write_v4_end_grace(struct file *file, char *buf, size_t size) * populating the filesystem. */ -static int nfsd_fill_super(struct super_block * sb, void * data, int silent) +static struct inode *nfsd_get_inode(struct super_block *sb, umode_t mode) { + struct inode *inode = new_inode(sb); + if (inode) { + /* Following advice from simple_fill_super documentation: */ + inode->i_ino = iunique(sb, NFSD_MaxReserved); + inode->i_mode = mode; + simple_inode_init_ts(inode); + } + return inode; +} + +static struct dentry *nfsd_mkdir(struct dentry *parent, struct nfsdfs_client *ncl, char *name) +{ + struct inode *dir = parent->d_inode; + struct dentry *dentry; + struct inode *inode; + + inode = nfsd_get_inode(parent->d_sb, S_IFDIR | 0600); + if (!inode) + return ERR_PTR(-ENOMEM); + + dentry = simple_start_creating(parent, name); + if (IS_ERR(dentry)) { + iput(inode); + return dentry; + } + inode->i_fop = &simple_dir_operations; + inode->i_op = &simple_dir_inode_operations; + inc_nlink(inode); + if (ncl) { + inode->i_private = ncl; + kref_get(&ncl->cl_ref); + } + d_make_persistent(dentry, inode); + inc_nlink(dir); + fsnotify_mkdir(dir, dentry); + simple_done_creating(dentry); + return dentry; // borrowed +} + +#if IS_ENABLED(CONFIG_SUNRPC_GSS) +/* + * @content is assumed to be a NUL-terminated string that lives + * longer than the symlink itself. 
+ */ +static void _nfsd_symlink(struct dentry *parent, const char *name, + const char *content) +{ + struct inode *dir = parent->d_inode; + struct inode *inode; + struct dentry *dentry; + + inode = nfsd_get_inode(dir->i_sb, S_IFLNK | 0777); + if (!inode) + return; + + dentry = simple_start_creating(parent, name); + if (IS_ERR(dentry)) { + iput(inode); + return; + } + + inode->i_op = &simple_symlink_inode_operations; + inode->i_link = (char *)content; + inode->i_size = strlen(content); + + d_make_persistent(dentry, inode); + fsnotify_create(dir, dentry); + simple_done_creating(dentry); +} +#else +static inline void _nfsd_symlink(struct dentry *parent, const char *name, + const char *content) +{ +} + +#endif + +static void clear_ncl(struct dentry *dentry) +{ + struct inode *inode = d_inode(dentry); + struct nfsdfs_client *ncl = inode->i_private; + + spin_lock(&inode->i_lock); + inode->i_private = NULL; + spin_unlock(&inode->i_lock); + kref_put(&ncl->cl_ref, ncl->cl_release); +} + +struct nfsdfs_client *get_nfsdfs_client(struct inode *inode) +{ + struct nfsdfs_client *nc; + + spin_lock(&inode->i_lock); + nc = inode->i_private; + if (nc) + kref_get(&nc->cl_ref); + spin_unlock(&inode->i_lock); + return nc; +} + +/* XXX: cut'n'paste from simple_fill_super; figure out if we could share + * code instead. */ +static int nfsdfs_create_files(struct dentry *root, + const struct tree_descr *files, + struct nfsdfs_client *ncl, + struct dentry **fdentries) +{ + struct inode *dir = d_inode(root); + struct dentry *dentry; + + for (int i = 0; files->name && files->name[0]; i++, files++) { + struct inode *inode = nfsd_get_inode(root->d_sb, + S_IFREG | files->mode); + if (!inode) + return -ENOMEM; + dentry = simple_start_creating(root, files->name); + if (IS_ERR(dentry)) { + iput(inode); + return PTR_ERR(dentry); + } + kref_get(&ncl->cl_ref); + inode->i_fop = files->ops; + inode->i_private = ncl; + d_make_persistent(dentry, inode); + fsnotify_create(dir, dentry); + if (fdentries) + fdentries[i] = dentry; // borrowed + simple_done_creating(dentry); + } + return 0; +} + +/* on success, returns positive number unique to that client. */ +struct dentry *nfsd_client_mkdir(struct nfsd_net *nn, + struct nfsdfs_client *ncl, u32 id, + const struct tree_descr *files, + struct dentry **fdentries) +{ + struct dentry *dentry; + char name[11]; + int ret; + + sprintf(name, "%u", id); + + dentry = nfsd_mkdir(nn->nfsd_client_dir, ncl, name); + if (IS_ERR(dentry)) /* XXX: tossing errors? 
*/ + return NULL; + ret = nfsdfs_create_files(dentry, files, ncl, fdentries); + if (ret) { + nfsd_client_rmdir(dentry); + return NULL; + } + return dentry; +} + +/* Taken from __rpc_rmdir: */ +void nfsd_client_rmdir(struct dentry *dentry) +{ + simple_recursive_removal(dentry, clear_ncl); +} + +static int nfsd_fill_super(struct super_block *sb, struct fs_context *fc) +{ + struct nfsd_net *nn = net_generic(current->nsproxy->net_ns, + nfsd_net_id); + struct dentry *dentry; + int ret; + static const struct tree_descr nfsd_files[] = { [NFSD_List] = {"exports", &exports_nfsd_operations, S_IRUGO}, + /* Per-export io stats use same ops as exports file */ + [NFSD_Export_Stats] = {"export_stats", &exports_nfsd_operations, S_IRUGO}, [NFSD_Export_features] = {"export_features", - &export_features_operations, S_IRUGO}, + &export_features_fops, S_IRUGO}, [NFSD_FO_UnlockIP] = {"unlock_ip", &transaction_ops, S_IWUSR|S_IRUSR}, [NFSD_FO_UnlockFS] = {"unlock_filesystem", @@ -1160,50 +1287,91 @@ static int nfsd_fill_super(struct super_block * sb, void * data, int silent) [NFSD_Threads] = {"threads", &transaction_ops, S_IWUSR|S_IRUSR}, [NFSD_Pool_Threads] = {"pool_threads", &transaction_ops, S_IWUSR|S_IRUSR}, [NFSD_Pool_Stats] = {"pool_stats", &pool_stats_operations, S_IRUGO}, - [NFSD_Reply_Cache_Stats] = {"reply_cache_stats", &reply_cache_stats_operations, S_IRUGO}, + [NFSD_Reply_Cache_Stats] = {"reply_cache_stats", + &nfsd_reply_cache_stats_fops, S_IRUGO}, [NFSD_Versions] = {"versions", &transaction_ops, S_IWUSR|S_IRUSR}, [NFSD_Ports] = {"portlist", &transaction_ops, S_IWUSR|S_IRUGO}, [NFSD_MaxBlkSize] = {"max_block_size", &transaction_ops, S_IWUSR|S_IRUGO}, - [NFSD_MaxConnections] = {"max_connections", &transaction_ops, S_IWUSR|S_IRUGO}, -#if defined(CONFIG_SUNRPC_GSS) || defined(CONFIG_SUNRPC_GSS_MODULE) - [NFSD_SupportedEnctypes] = {"supported_krb5_enctypes", &supported_enctypes_ops, S_IRUGO}, -#endif /* CONFIG_SUNRPC_GSS or CONFIG_SUNRPC_GSS_MODULE */ + [NFSD_Filecache] = {"filecache", &nfsd_file_cache_stats_fops, S_IRUGO}, #ifdef CONFIG_NFSD_V4 [NFSD_Leasetime] = {"nfsv4leasetime", &transaction_ops, S_IWUSR|S_IRUSR}, [NFSD_Gracetime] = {"nfsv4gracetime", &transaction_ops, S_IWUSR|S_IRUSR}, +#ifdef CONFIG_NFSD_LEGACY_CLIENT_TRACKING [NFSD_RecoveryDir] = {"nfsv4recoverydir", &transaction_ops, S_IWUSR|S_IRUSR}, +#endif [NFSD_V4EndGrace] = {"v4_end_grace", &transaction_ops, S_IWUSR|S_IRUGO}, #endif /* last one */ {""} }; - get_net(sb->s_fs_info); - return simple_fill_super(sb, 0x6e667364, nfsd_files); + + ret = simple_fill_super(sb, 0x6e667364, nfsd_files); + if (ret) + return ret; + _nfsd_symlink(sb->s_root, "supported_krb5_enctypes", + "/proc/net/rpc/gss_krb5_enctypes"); + dentry = nfsd_mkdir(sb->s_root, NULL, "clients"); + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + nn->nfsd_client_dir = dentry; + return 0; } -static struct dentry *nfsd_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int nfsd_fs_get_tree(struct fs_context *fc) { - struct net *net = current->nsproxy->net_ns; - return mount_ns(fs_type, flags, data, net, net->user_ns, nfsd_fill_super); + return get_tree_keyed(fc, nfsd_fill_super, get_net(fc->net_ns)); +} + +static void nfsd_fs_free_fc(struct fs_context *fc) +{ + if (fc->s_fs_info) + put_net(fc->s_fs_info); +} + +static const struct fs_context_operations nfsd_fs_context_ops = { + .free = nfsd_fs_free_fc, + .get_tree = nfsd_fs_get_tree, +}; + +static int nfsd_init_fs_context(struct fs_context *fc) +{ + put_user_ns(fc->user_ns); + fc->user_ns 
= get_user_ns(fc->net_ns->user_ns); + fc->ops = &nfsd_fs_context_ops; + return 0; } static void nfsd_umount(struct super_block *sb) { struct net *net = sb->s_fs_info; - kill_litter_super(sb); + nfsd_shutdown_threads(net); + + kill_anon_super(sb); put_net(net); } static struct file_system_type nfsd_fs_type = { .owner = THIS_MODULE, .name = "nfsd", - .mount = nfsd_mount, + .init_fs_context = nfsd_init_fs_context, .kill_sb = nfsd_umount, }; MODULE_ALIAS_FS("nfsd"); #ifdef CONFIG_PROC_FS + +static int exports_proc_open(struct inode *inode, struct file *file) +{ + return exports_net_open(current->nsproxy->net_ns, file); +} + +static const struct proc_ops exports_proc_ops = { + .proc_open = exports_proc_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_release = seq_release, +}; + static int create_proc_exports_entry(void) { struct proc_dir_entry *entry; @@ -1211,8 +1379,7 @@ static int create_proc_exports_entry(void) entry = proc_mkdir("fs/nfs", NULL); if (!entry) return -ENOMEM; - entry = proc_create("exports", 0, entry, - &exports_proc_operations); + entry = proc_create("exports", 0, entry, &exports_proc_ops); if (!entry) { remove_proc_entry("fs/nfs", NULL); return -ENOMEM; @@ -1228,10 +1395,755 @@ static int create_proc_exports_entry(void) unsigned int nfsd_net_id; -static __net_init int nfsd_init_net(struct net *net) +static int nfsd_genl_rpc_status_compose_msg(struct sk_buff *skb, + struct netlink_callback *cb, + struct nfsd_genl_rqstp *genl_rqstp) +{ + void *hdr; + u32 i; + + hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, + &nfsd_nl_family, 0, NFSD_CMD_RPC_STATUS_GET); + if (!hdr) + return -ENOBUFS; + + if (nla_put_be32(skb, NFSD_A_RPC_STATUS_XID, genl_rqstp->rq_xid) || + nla_put_u32(skb, NFSD_A_RPC_STATUS_FLAGS, genl_rqstp->rq_flags) || + nla_put_u32(skb, NFSD_A_RPC_STATUS_PROG, genl_rqstp->rq_prog) || + nla_put_u32(skb, NFSD_A_RPC_STATUS_PROC, genl_rqstp->rq_proc) || + nla_put_u8(skb, NFSD_A_RPC_STATUS_VERSION, genl_rqstp->rq_vers) || + nla_put_s64(skb, NFSD_A_RPC_STATUS_SERVICE_TIME, + ktime_to_us(genl_rqstp->rq_stime), + NFSD_A_RPC_STATUS_PAD)) + return -ENOBUFS; + + switch (genl_rqstp->rq_saddr.sa_family) { + case AF_INET: { + const struct sockaddr_in *s_in, *d_in; + + s_in = (const struct sockaddr_in *)&genl_rqstp->rq_saddr; + d_in = (const struct sockaddr_in *)&genl_rqstp->rq_daddr; + if (nla_put_in_addr(skb, NFSD_A_RPC_STATUS_SADDR4, + s_in->sin_addr.s_addr) || + nla_put_in_addr(skb, NFSD_A_RPC_STATUS_DADDR4, + d_in->sin_addr.s_addr) || + nla_put_be16(skb, NFSD_A_RPC_STATUS_SPORT, + s_in->sin_port) || + nla_put_be16(skb, NFSD_A_RPC_STATUS_DPORT, + d_in->sin_port)) + return -ENOBUFS; + break; + } + case AF_INET6: { + const struct sockaddr_in6 *s_in, *d_in; + + s_in = (const struct sockaddr_in6 *)&genl_rqstp->rq_saddr; + d_in = (const struct sockaddr_in6 *)&genl_rqstp->rq_daddr; + if (nla_put_in6_addr(skb, NFSD_A_RPC_STATUS_SADDR6, + &s_in->sin6_addr) || + nla_put_in6_addr(skb, NFSD_A_RPC_STATUS_DADDR6, + &d_in->sin6_addr) || + nla_put_be16(skb, NFSD_A_RPC_STATUS_SPORT, + s_in->sin6_port) || + nla_put_be16(skb, NFSD_A_RPC_STATUS_DPORT, + d_in->sin6_port)) + return -ENOBUFS; + break; + } + } + + for (i = 0; i < genl_rqstp->rq_opcnt; i++) + if (nla_put_u32(skb, NFSD_A_RPC_STATUS_COMPOUND_OPS, + genl_rqstp->rq_opnum[i])) + return -ENOBUFS; + + genlmsg_end(skb, hdr); + return 0; +} + +/** + * nfsd_nl_rpc_status_get_dumpit - Handle rpc_status_get dumpit + * @skb: reply buffer + * @cb: netlink metadata and command arguments + * + * Returns the size 
of the reply or a negative errno. + */ +int nfsd_nl_rpc_status_get_dumpit(struct sk_buff *skb, + struct netlink_callback *cb) +{ + int i, ret, rqstp_index = 0; + struct nfsd_net *nn; + + mutex_lock(&nfsd_mutex); + + nn = net_generic(sock_net(skb->sk), nfsd_net_id); + if (!nn->nfsd_serv) { + ret = -ENODEV; + goto out_unlock; + } + + rcu_read_lock(); + + for (i = 0; i < nn->nfsd_serv->sv_nrpools; i++) { + struct svc_rqst *rqstp; + + if (i < cb->args[0]) /* already consumed */ + continue; + + rqstp_index = 0; + list_for_each_entry_rcu(rqstp, + &nn->nfsd_serv->sv_pools[i].sp_all_threads, + rq_all) { + struct nfsd_genl_rqstp genl_rqstp; + unsigned int status_counter; + + if (rqstp_index++ < cb->args[1]) /* already consumed */ + continue; + /* + * Acquire rq_status_counter before parsing the rqst + * fields. rq_status_counter is set to an odd value in + * order to notify the consumers the rqstp fields are + * meaningful. + */ + status_counter = + smp_load_acquire(&rqstp->rq_status_counter); + if (!(status_counter & 1)) + continue; + + genl_rqstp.rq_xid = rqstp->rq_xid; + genl_rqstp.rq_flags = rqstp->rq_flags; + genl_rqstp.rq_vers = rqstp->rq_vers; + genl_rqstp.rq_prog = rqstp->rq_prog; + genl_rqstp.rq_proc = rqstp->rq_proc; + genl_rqstp.rq_stime = rqstp->rq_stime; + genl_rqstp.rq_opcnt = 0; + memcpy(&genl_rqstp.rq_daddr, svc_daddr(rqstp), + sizeof(struct sockaddr)); + memcpy(&genl_rqstp.rq_saddr, svc_addr(rqstp), + sizeof(struct sockaddr)); + +#ifdef CONFIG_NFSD_V4 + if (rqstp->rq_vers == NFS4_VERSION && + rqstp->rq_proc == NFSPROC4_COMPOUND) { + /* NFSv4 compound */ + struct nfsd4_compoundargs *args; + int j; + + args = rqstp->rq_argp; + genl_rqstp.rq_opcnt = min_t(u32, args->opcnt, + ARRAY_SIZE(genl_rqstp.rq_opnum)); + for (j = 0; j < genl_rqstp.rq_opcnt; j++) + genl_rqstp.rq_opnum[j] = + args->ops[j].opnum; + } +#endif /* CONFIG_NFSD_V4 */ + + /* + * Acquire rq_status_counter before reporting the rqst + * fields to the user. + */ + if (smp_load_acquire(&rqstp->rq_status_counter) != + status_counter) + continue; + + ret = nfsd_genl_rpc_status_compose_msg(skb, cb, + &genl_rqstp); + if (ret) + goto out; + } + } + + cb->args[0] = i; + cb->args[1] = rqstp_index; + ret = skb->len; +out: + rcu_read_unlock(); +out_unlock: + mutex_unlock(&nfsd_mutex); + + return ret; +} + +/** + * nfsd_nl_threads_set_doit - set the number of running threads + * @skb: reply buffer + * @info: netlink metadata and command arguments + * + * Return 0 on success or a negative errno. 
+ */ +int nfsd_nl_threads_set_doit(struct sk_buff *skb, struct genl_info *info) +{ + int *nthreads, nrpools = 0, i, ret = -EOPNOTSUPP, rem; + struct net *net = genl_info_net(info); + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + const struct nlattr *attr; + const char *scope = NULL; + + if (GENL_REQ_ATTR_CHECK(info, NFSD_A_SERVER_THREADS)) + return -EINVAL; + + /* count number of SERVER_THREADS values */ + nlmsg_for_each_attr_type(attr, NFSD_A_SERVER_THREADS, info->nlhdr, + GENL_HDRLEN, rem) + nrpools++; + + mutex_lock(&nfsd_mutex); + + nthreads = kcalloc(nrpools, sizeof(int), GFP_KERNEL); + if (!nthreads) { + ret = -ENOMEM; + goto out_unlock; + } + + i = 0; + nlmsg_for_each_attr_type(attr, NFSD_A_SERVER_THREADS, info->nlhdr, + GENL_HDRLEN, rem) { + nthreads[i++] = nla_get_u32(attr); + if (i >= nrpools) + break; + } + + if (info->attrs[NFSD_A_SERVER_GRACETIME] || + info->attrs[NFSD_A_SERVER_LEASETIME] || + info->attrs[NFSD_A_SERVER_SCOPE]) { + ret = -EBUSY; + if (nn->nfsd_serv && nn->nfsd_serv->sv_nrthreads) + goto out_unlock; + + ret = -EINVAL; + attr = info->attrs[NFSD_A_SERVER_GRACETIME]; + if (attr) { + u32 gracetime = nla_get_u32(attr); + + if (gracetime < 10 || gracetime > 3600) + goto out_unlock; + + nn->nfsd4_grace = gracetime; + } + + attr = info->attrs[NFSD_A_SERVER_LEASETIME]; + if (attr) { + u32 leasetime = nla_get_u32(attr); + + if (leasetime < 10 || leasetime > 3600) + goto out_unlock; + + nn->nfsd4_lease = leasetime; + } + + attr = info->attrs[NFSD_A_SERVER_SCOPE]; + if (attr) + scope = nla_data(attr); + } + + ret = nfsd_svc(nrpools, nthreads, net, get_current_cred(), scope); + if (ret > 0) + ret = 0; +out_unlock: + mutex_unlock(&nfsd_mutex); + kfree(nthreads); + return ret; +} + +/** + * nfsd_nl_threads_get_doit - get the number of running threads + * @skb: reply buffer + * @info: netlink metadata and command arguments + * + * Return 0 on success or a negative errno. + */ +int nfsd_nl_threads_get_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct net *net = genl_info_net(info); + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + void *hdr; + int err; + + skb = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!skb) + return -ENOMEM; + + hdr = genlmsg_iput(skb, info); + if (!hdr) { + err = -EMSGSIZE; + goto err_free_msg; + } + + mutex_lock(&nfsd_mutex); + + err = nla_put_u32(skb, NFSD_A_SERVER_GRACETIME, + nn->nfsd4_grace) || + nla_put_u32(skb, NFSD_A_SERVER_LEASETIME, + nn->nfsd4_lease) || + nla_put_string(skb, NFSD_A_SERVER_SCOPE, + nn->nfsd_name); + if (err) + goto err_unlock; + + if (nn->nfsd_serv) { + int i; + + for (i = 0; i < nfsd_nrpools(net); ++i) { + struct svc_pool *sp = &nn->nfsd_serv->sv_pools[i]; + + err = nla_put_u32(skb, NFSD_A_SERVER_THREADS, + sp->sp_nrthreads); + if (err) + goto err_unlock; + } + } else { + err = nla_put_u32(skb, NFSD_A_SERVER_THREADS, 0); + if (err) + goto err_unlock; + } + + mutex_unlock(&nfsd_mutex); + + genlmsg_end(skb, hdr); + + return genlmsg_reply(skb, info); + +err_unlock: + mutex_unlock(&nfsd_mutex); +err_free_msg: + nlmsg_free(skb); + + return err; +} + +/** + * nfsd_nl_version_set_doit - set the nfs enabled versions + * @skb: reply buffer + * @info: netlink metadata and command arguments + * + * Return 0 on success or a negative errno. 
+ */ +int nfsd_nl_version_set_doit(struct sk_buff *skb, struct genl_info *info) +{ + const struct nlattr *attr; + struct nfsd_net *nn; + int i, rem; + + if (GENL_REQ_ATTR_CHECK(info, NFSD_A_SERVER_PROTO_VERSION)) + return -EINVAL; + + mutex_lock(&nfsd_mutex); + + nn = net_generic(genl_info_net(info), nfsd_net_id); + if (nn->nfsd_serv) { + mutex_unlock(&nfsd_mutex); + return -EBUSY; + } + + /* clear current supported versions. */ + nfsd_vers(nn, 2, NFSD_CLEAR); + nfsd_vers(nn, 3, NFSD_CLEAR); + for (i = 0; i <= NFSD_SUPPORTED_MINOR_VERSION; i++) + nfsd_minorversion(nn, i, NFSD_CLEAR); + + nlmsg_for_each_attr_type(attr, NFSD_A_SERVER_PROTO_VERSION, info->nlhdr, + GENL_HDRLEN, rem) { + struct nlattr *tb[NFSD_A_VERSION_MAX + 1]; + u32 major, minor = 0; + bool enabled; + + if (nla_parse_nested(tb, NFSD_A_VERSION_MAX, attr, + nfsd_version_nl_policy, info->extack) < 0) + continue; + + if (!tb[NFSD_A_VERSION_MAJOR]) + continue; + + major = nla_get_u32(tb[NFSD_A_VERSION_MAJOR]); + if (tb[NFSD_A_VERSION_MINOR]) + minor = nla_get_u32(tb[NFSD_A_VERSION_MINOR]); + + enabled = nla_get_flag(tb[NFSD_A_VERSION_ENABLED]); + + switch (major) { + case 4: + nfsd_minorversion(nn, minor, enabled ? NFSD_SET : NFSD_CLEAR); + break; + case 3: + case 2: + if (!minor) + nfsd_vers(nn, major, enabled ? NFSD_SET : NFSD_CLEAR); + break; + default: + break; + } + } + + mutex_unlock(&nfsd_mutex); + + return 0; +} + +/** + * nfsd_nl_version_get_doit - get the enabled status for all supported nfs versions + * @skb: reply buffer + * @info: netlink metadata and command arguments + * + * Return 0 on success or a negative errno. + */ +int nfsd_nl_version_get_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct nfsd_net *nn; + int i, err; + void *hdr; + + skb = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!skb) + return -ENOMEM; + + hdr = genlmsg_iput(skb, info); + if (!hdr) { + err = -EMSGSIZE; + goto err_free_msg; + } + + mutex_lock(&nfsd_mutex); + nn = net_generic(genl_info_net(info), nfsd_net_id); + + for (i = 2; i <= 4; i++) { + int j; + + for (j = 0; j <= NFSD_SUPPORTED_MINOR_VERSION; j++) { + struct nlattr *attr; + + /* Don't record any versions the kernel doesn't have + * compiled in + */ + if (!nfsd_support_version(i)) + continue; + + /* NFSv{2,3} does not support minor numbers */ + if (i < 4 && j) + continue; + + attr = nla_nest_start(skb, + NFSD_A_SERVER_PROTO_VERSION); + if (!attr) { + err = -EINVAL; + goto err_nfsd_unlock; + } + + if (nla_put_u32(skb, NFSD_A_VERSION_MAJOR, i) || + nla_put_u32(skb, NFSD_A_VERSION_MINOR, j)) { + err = -EINVAL; + goto err_nfsd_unlock; + } + + /* Set the enabled flag if the version is enabled */ + if (nfsd_vers(nn, i, NFSD_TEST) && + (i < 4 || nfsd_minorversion(nn, j, NFSD_TEST)) && + nla_put_flag(skb, NFSD_A_VERSION_ENABLED)) { + err = -EINVAL; + goto err_nfsd_unlock; + } + + nla_nest_end(skb, attr); + } + } + + mutex_unlock(&nfsd_mutex); + genlmsg_end(skb, hdr); + + return genlmsg_reply(skb, info); + +err_nfsd_unlock: + mutex_unlock(&nfsd_mutex); +err_free_msg: + nlmsg_free(skb); + + return err; +} + +/** + * nfsd_nl_listener_set_doit - set the nfs running sockets + * @skb: reply buffer + * @info: netlink metadata and command arguments + * + * Return 0 on success or a negative errno. 
+ */ +int nfsd_nl_listener_set_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct net *net = genl_info_net(info); + struct svc_xprt *xprt, *tmp; + const struct nlattr *attr; + struct svc_serv *serv; + LIST_HEAD(permsocks); + struct nfsd_net *nn; + bool delete = false; + int err, rem; + + mutex_lock(&nfsd_mutex); + + err = nfsd_create_serv(net); + if (err) { + mutex_unlock(&nfsd_mutex); + return err; + } + + nn = net_generic(net, nfsd_net_id); + serv = nn->nfsd_serv; + + spin_lock_bh(&serv->sv_lock); + + /* Move all of the old listener sockets to a temp list */ + list_splice_init(&serv->sv_permsocks, &permsocks); + + /* + * Walk the list of server_socks from userland and move any that match + * back to sv_permsocks + */ + nlmsg_for_each_attr_type(attr, NFSD_A_SERVER_SOCK_ADDR, info->nlhdr, + GENL_HDRLEN, rem) { + struct nlattr *tb[NFSD_A_SOCK_MAX + 1]; + const char *xcl_name; + struct sockaddr *sa; + + if (nla_parse_nested(tb, NFSD_A_SOCK_MAX, attr, + nfsd_sock_nl_policy, info->extack) < 0) + continue; + + if (!tb[NFSD_A_SOCK_ADDR] || !tb[NFSD_A_SOCK_TRANSPORT_NAME]) + continue; + + if (nla_len(tb[NFSD_A_SOCK_ADDR]) < sizeof(*sa)) + continue; + + xcl_name = nla_data(tb[NFSD_A_SOCK_TRANSPORT_NAME]); + sa = nla_data(tb[NFSD_A_SOCK_ADDR]); + + /* Put back any matching sockets */ + list_for_each_entry_safe(xprt, tmp, &permsocks, xpt_list) { + /* This shouldn't be possible */ + if (WARN_ON_ONCE(xprt->xpt_net != net)) { + list_move(&xprt->xpt_list, &serv->sv_permsocks); + continue; + } + + /* If everything matches, put it back */ + if (!strcmp(xprt->xpt_class->xcl_name, xcl_name) && + rpc_cmp_addr_port(sa, (struct sockaddr *)&xprt->xpt_local)) { + list_move(&xprt->xpt_list, &serv->sv_permsocks); + break; + } + } + } + + /* + * If there are listener transports remaining on the permsocks list, + * it means we were asked to remove a listener. + */ + if (!list_empty(&permsocks)) { + list_splice_init(&permsocks, &serv->sv_permsocks); + delete = true; + } + spin_unlock_bh(&serv->sv_lock); + + /* Do not remove listeners while there are active threads. */ + if (serv->sv_nrthreads) { + err = -EBUSY; + goto out_unlock_mtx; + } + + /* + * Since we can't delete an arbitrary llist entry, destroy the + * remaining listeners and recreate the list. 
+ */ + if (delete) + svc_xprt_destroy_all(serv, net, false); + + /* walk list of addrs again, open any that still don't exist */ + nlmsg_for_each_attr_type(attr, NFSD_A_SERVER_SOCK_ADDR, info->nlhdr, + GENL_HDRLEN, rem) { + struct nlattr *tb[NFSD_A_SOCK_MAX + 1]; + const char *xcl_name; + struct sockaddr *sa; + int ret; + + if (nla_parse_nested(tb, NFSD_A_SOCK_MAX, attr, + nfsd_sock_nl_policy, info->extack) < 0) + continue; + + if (!tb[NFSD_A_SOCK_ADDR] || !tb[NFSD_A_SOCK_TRANSPORT_NAME]) + continue; + + if (nla_len(tb[NFSD_A_SOCK_ADDR]) < sizeof(*sa)) + continue; + + xcl_name = nla_data(tb[NFSD_A_SOCK_TRANSPORT_NAME]); + sa = nla_data(tb[NFSD_A_SOCK_ADDR]); + + xprt = svc_find_listener(serv, xcl_name, net, sa); + if (xprt) { + if (delete) + WARN_ONCE(1, "Transport type=%s already exists\n", + xcl_name); + svc_xprt_put(xprt); + continue; + } + + ret = svc_xprt_create_from_sa(serv, xcl_name, net, sa, 0, + get_current_cred()); + /* always save the latest error */ + if (ret < 0) + err = ret; + } + + if (!serv->sv_nrthreads && list_empty(&nn->nfsd_serv->sv_permsocks)) + nfsd_destroy_serv(net); + +out_unlock_mtx: + mutex_unlock(&nfsd_mutex); + + return err; +} + +/** + * nfsd_nl_listener_get_doit - get the nfs running listeners + * @skb: reply buffer + * @info: netlink metadata and command arguments + * + * Return 0 on success or a negative errno. + */ +int nfsd_nl_listener_get_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct svc_xprt *xprt; + struct svc_serv *serv; + struct nfsd_net *nn; + void *hdr; + int err; + + skb = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!skb) + return -ENOMEM; + + hdr = genlmsg_iput(skb, info); + if (!hdr) { + err = -EMSGSIZE; + goto err_free_msg; + } + + mutex_lock(&nfsd_mutex); + nn = net_generic(genl_info_net(info), nfsd_net_id); + + /* no nfs server? Just send empty socket list */ + if (!nn->nfsd_serv) + goto out_unlock_mtx; + + serv = nn->nfsd_serv; + spin_lock_bh(&serv->sv_lock); + list_for_each_entry(xprt, &serv->sv_permsocks, xpt_list) { + struct nlattr *attr; + + attr = nla_nest_start(skb, NFSD_A_SERVER_SOCK_ADDR); + if (!attr) { + err = -EINVAL; + goto err_serv_unlock; + } + + if (nla_put_string(skb, NFSD_A_SOCK_TRANSPORT_NAME, + xprt->xpt_class->xcl_name) || + nla_put(skb, NFSD_A_SOCK_ADDR, + sizeof(struct sockaddr_storage), + &xprt->xpt_local)) { + err = -EINVAL; + goto err_serv_unlock; + } + + nla_nest_end(skb, attr); + } + spin_unlock_bh(&serv->sv_lock); +out_unlock_mtx: + mutex_unlock(&nfsd_mutex); + genlmsg_end(skb, hdr); + + return genlmsg_reply(skb, info); + +err_serv_unlock: + spin_unlock_bh(&serv->sv_lock); + mutex_unlock(&nfsd_mutex); +err_free_msg: + nlmsg_free(skb); + + return err; +} + +/** + * nfsd_nl_pool_mode_set_doit - set the number of running threads + * @skb: reply buffer + * @info: netlink metadata and command arguments + * + * Return 0 on success or a negative errno. + */ +int nfsd_nl_pool_mode_set_doit(struct sk_buff *skb, struct genl_info *info) +{ + const struct nlattr *attr; + + if (GENL_REQ_ATTR_CHECK(info, NFSD_A_POOL_MODE_MODE)) + return -EINVAL; + + attr = info->attrs[NFSD_A_POOL_MODE_MODE]; + return sunrpc_set_pool_mode(nla_data(attr)); +} + +/** + * nfsd_nl_pool_mode_get_doit - get info about pool_mode + * @skb: reply buffer + * @info: netlink metadata and command arguments + * + * Return 0 on success or a negative errno. 
+ */ +int nfsd_nl_pool_mode_get_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct net *net = genl_info_net(info); + char buf[16]; + void *hdr; + int err; + + if (sunrpc_get_pool_mode(buf, ARRAY_SIZE(buf)) >= ARRAY_SIZE(buf)) + return -ERANGE; + + skb = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!skb) + return -ENOMEM; + + err = -EMSGSIZE; + hdr = genlmsg_iput(skb, info); + if (!hdr) + goto err_free_msg; + + err = nla_put_string(skb, NFSD_A_POOL_MODE_MODE, buf) | + nla_put_u32(skb, NFSD_A_POOL_MODE_NPOOLS, nfsd_nrpools(net)); + if (err) + goto err_free_msg; + + genlmsg_end(skb, hdr); + return genlmsg_reply(skb, info); + +err_free_msg: + nlmsg_free(skb); + return err; +} + +/** + * nfsd_net_init - Prepare the nfsd_net portion of a new net namespace + * @net: a freshly-created network namespace + * + * This information stays around as long as the network namespace is + * alive whether or not there is an NFSD instance running in the + * namespace. + * + * Returns zero on success, or a negative errno otherwise. + */ +static __net_init int nfsd_net_init(struct net *net) { - int retval; struct nfsd_net *nn = net_generic(net, nfsd_net_id); + int retval; + int i; retval = nfsd_export_init(net); if (retval) @@ -1239,32 +2151,81 @@ static __net_init int nfsd_init_net(struct net *net) retval = nfsd_idmap_init(net); if (retval) goto out_idmap_error; - nn->nfsd4_lease = 45; /* default lease time */ - nn->nfsd4_grace = 45; - nn->somebody_reclaimed = false; - nn->clverifier_counter = prandom_u32(); - nn->clientid_counter = prandom_u32(); - nn->s2s_cp_cl_id = nn->clientid_counter++; - - atomic_set(&nn->ntf_refcnt, 0); - init_waitqueue_head(&nn->ntf_wq); + retval = percpu_counter_init_many(nn->counter, 0, GFP_KERNEL, + NFSD_STATS_COUNTERS_NUM); + if (retval) + goto out_repcache_error; + + memset(&nn->nfsd_svcstats, 0, sizeof(nn->nfsd_svcstats)); + nn->nfsd_svcstats.program = &nfsd_programs[0]; + if (!nfsd_proc_stat_init(net)) { + retval = -ENOMEM; + goto out_proc_error; + } + + for (i = 0; i < sizeof(nn->nfsd_versions); i++) + nn->nfsd_versions[i] = nfsd_support_version(i); + for (i = 0; i < sizeof(nn->nfsd4_minorversions); i++) + nn->nfsd4_minorversions[i] = nfsd_support_version(4); + nn->nfsd_info.mutex = &nfsd_mutex; + nn->nfsd_serv = NULL; + nfsd4_init_leases_net(nn); + get_random_bytes(&nn->siphash_key, sizeof(nn->siphash_key)); + seqlock_init(&nn->writeverf_lock); +#if IS_ENABLED(CONFIG_NFS_LOCALIO) + spin_lock_init(&nn->local_clients_lock); + INIT_LIST_HEAD(&nn->local_clients); +#endif return 0; +out_proc_error: + percpu_counter_destroy_many(nn->counter, NFSD_STATS_COUNTERS_NUM); +out_repcache_error: + nfsd_idmap_shutdown(net); out_idmap_error: nfsd_export_shutdown(net); out_export_error: return retval; } -static __net_exit void nfsd_exit_net(struct net *net) +#if IS_ENABLED(CONFIG_NFS_LOCALIO) +/** + * nfsd_net_pre_exit - Disconnect localio clients from net namespace + * @net: a network namespace that is about to be destroyed + * + * This invalidates ->net pointers held by localio clients + * while they can still safely access nn->counter. 
+ */ +static __net_exit void nfsd_net_pre_exit(struct net *net) +{ + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + + nfs_localio_invalidate_clients(&nn->local_clients, + &nn->local_clients_lock); +} +#endif + +/** + * nfsd_net_exit - Release the nfsd_net portion of a net namespace + * @net: a network namespace that is about to be destroyed + * + */ +static __net_exit void nfsd_net_exit(struct net *net) { + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + + nfsd_proc_stat_shutdown(net); + percpu_counter_destroy_many(nn->counter, NFSD_STATS_COUNTERS_NUM); nfsd_idmap_shutdown(net); nfsd_export_shutdown(net); } static struct pernet_operations nfsd_net_ops = { - .init = nfsd_init_net, - .exit = nfsd_exit_net, + .init = nfsd_net_init, +#if IS_ENABLED(CONFIG_NFS_LOCALIO) + .pre_exit = nfsd_net_pre_exit, +#endif + .exit = nfsd_net_exit, .id = &nfsd_net_id, .size = sizeof(struct nfsd_net), }; @@ -1272,71 +2233,79 @@ static struct pernet_operations nfsd_net_ops = { static int __init init_nfsd(void) { int retval; - printk(KERN_INFO "Installing knfsd (copyright (C) 1996 okir@monad.swb.de).\n"); - retval = register_pernet_subsys(&nfsd_net_ops); - if (retval < 0) - return retval; - retval = register_cld_notifier(); - if (retval) - goto out_unregister_pernet; + nfsd_debugfs_init(); + retval = nfsd4_init_slabs(); if (retval) - goto out_unregister_notifier; + return retval; retval = nfsd4_init_pnfs(); if (retval) goto out_free_slabs; - retval = nfsd_fault_inject_init(); /* nfsd fault injection controls */ + retval = nfsd_drc_slab_create(); if (retval) - goto out_exit_pnfs; - nfsd_stat_init(); /* Statistics */ - retval = nfsd_reply_cache_init(); - if (retval) - goto out_free_stat; + goto out_free_pnfs; nfsd_lockd_init(); /* lockd->nfsd callbacks */ - retval = create_proc_exports_entry(); - if (retval) + retval = register_pernet_subsys(&nfsd_net_ops); + if (retval < 0) goto out_free_lockd; + retval = register_cld_notifier(); + if (retval) + goto out_free_subsys; + retval = nfsd4_create_laundry_wq(); + if (retval) + goto out_free_cld; retval = register_filesystem(&nfsd_fs_type); if (retval) + goto out_free_nfsd4; + retval = genl_register_family(&nfsd_nl_family); + if (retval) + goto out_free_filesystem; + retval = create_proc_exports_entry(); + if (retval) goto out_free_all; + nfsd_localio_ops_init(); + return 0; out_free_all: - remove_proc_entry("fs/nfs/exports", NULL); - remove_proc_entry("fs/nfs", NULL); + genl_unregister_family(&nfsd_nl_family); +out_free_filesystem: + unregister_filesystem(&nfsd_fs_type); +out_free_nfsd4: + nfsd4_destroy_laundry_wq(); +out_free_cld: + unregister_cld_notifier(); +out_free_subsys: + unregister_pernet_subsys(&nfsd_net_ops); out_free_lockd: nfsd_lockd_shutdown(); - nfsd_reply_cache_shutdown(); -out_free_stat: - nfsd_stat_shutdown(); - nfsd_fault_inject_cleanup(); -out_exit_pnfs: + nfsd_drc_slab_free(); +out_free_pnfs: nfsd4_exit_pnfs(); out_free_slabs: nfsd4_free_slabs(); -out_unregister_notifier: - unregister_cld_notifier(); -out_unregister_pernet: - unregister_pernet_subsys(&nfsd_net_ops); + nfsd_debugfs_exit(); return retval; } static void __exit exit_nfsd(void) { - nfsd_reply_cache_shutdown(); remove_proc_entry("fs/nfs/exports", NULL); remove_proc_entry("fs/nfs", NULL); - nfsd_stat_shutdown(); - nfsd_lockd_shutdown(); - nfsd4_free_slabs(); - nfsd4_exit_pnfs(); - nfsd_fault_inject_cleanup(); + genl_unregister_family(&nfsd_nl_family); unregister_filesystem(&nfsd_fs_type); + nfsd4_destroy_laundry_wq(); unregister_cld_notifier(); 
unregister_pernet_subsys(&nfsd_net_ops); + nfsd_drc_slab_free(); + nfsd_lockd_shutdown(); + nfsd4_free_slabs(); + nfsd4_exit_pnfs(); + nfsd_debugfs_exit(); } MODULE_AUTHOR("Olaf Kirch <okir@monad.swb.de>"); +MODULE_DESCRIPTION("In-kernel NFS server"); MODULE_LICENSE("GPL"); module_init(init_nfsd) module_exit(exit_nfsd) diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h index 066899929863..e4263326ca4a 100644 --- a/fs/nfsd/nfsd.h +++ b/fs/nfsd/nfsd.h @@ -17,11 +17,12 @@ #include <linux/nfs3.h> #include <linux/nfs4.h> #include <linux/sunrpc/svc.h> +#include <linux/sunrpc/svc_xprt.h> #include <linux/sunrpc/msg_prot.h> +#include <linux/sunrpc/addr.h> #include <uapi/linux/nfsd/debug.h> -#include "stats.h" #include "export.h" #undef ifdebug @@ -34,56 +35,90 @@ /* * nfsd version */ +#define NFSD_MINVERS 2 +#define NFSD_MAXVERS 4 #define NFSD_SUPPORTED_MINOR_VERSION 2 -/* - * Maximum blocksizes supported by daemon under various circumstances. - */ -#define NFSSVC_MAXBLKSIZE RPCSVC_MAXPAYLOAD -/* NFSv2 is limited by the protocol specification, see RFC 1094 */ -#define NFSSVC_MAXBLKSIZE_V2 (8*1024) +bool nfsd_support_version(int vers); +#include "netns.h" +#include "stats.h" /* - * Largest number of bytes we need to allocate for an NFS - * call or reply. Used to control buffer sizes. We use - * the length of v3 WRITE, READDIR and READDIR replies - * which are an RPC header, up to 26 XDR units of reply - * data, and some page data. - * - * Note that accuracy here doesn't matter too much as the - * size is rounded up to a page size when allocating space. + * Default and maximum payload size (NFS READ or WRITE), in bytes. + * The default is historical, and the maximum is an implementation + * limit. */ -#define NFSD_BUFSIZE ((RPC_MAX_HEADER_WITH_AUTH+26)*XDR_UNIT + NFSSVC_MAXBLKSIZE) +enum { + NFSSVC_DEFBLKSIZE = 1 * 1024 * 1024, + NFSSVC_MAXBLKSIZE = RPCSVC_MAXPAYLOAD, +}; struct readdir_cd { __be32 err; /* 0, nfserr, or nfserr_eof */ }; +/* Maximum number of operations per session compound */ +#define NFSD_MAX_OPS_PER_COMPOUND 200 + +struct nfsd_genl_rqstp { + struct sockaddr rq_daddr; + struct sockaddr rq_saddr; + unsigned long rq_flags; + ktime_t rq_stime; + __be32 rq_xid; + u32 rq_vers; + u32 rq_prog; + u32 rq_proc; + + /* NFSv4 compound */ + u32 rq_opcnt; + u32 rq_opnum[16]; +}; -extern struct svc_program nfsd_program; -extern const struct svc_version nfsd_version2, nfsd_version3, - nfsd_version4; +extern struct svc_program nfsd_programs[]; +extern const struct svc_version nfsd_version2, nfsd_version3, nfsd_version4; extern struct mutex nfsd_mutex; -extern spinlock_t nfsd_drc_lock; -extern unsigned long nfsd_drc_max_mem; -extern unsigned long nfsd_drc_mem_used; +extern atomic_t nfsd_th_cnt; /* number of available threads */ extern const struct seq_operations nfs_exports_op; /* + * Common void argument and result helpers + */ +struct nfsd_voidargs { }; +struct nfsd_voidres { }; +bool nfssvc_decode_voidarg(struct svc_rqst *rqstp, + struct xdr_stream *xdr); +bool nfssvc_encode_voidres(struct svc_rqst *rqstp, + struct xdr_stream *xdr); + +/* * Function prototypes. 
*/ -int nfsd_svc(int nrservs, struct net *net); -int nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp); +int nfsd_svc(int n, int *nservers, struct net *net, + const struct cred *cred, const char *scope); +int nfsd_dispatch(struct svc_rqst *rqstp); int nfsd_nrthreads(struct net *); int nfsd_nrpools(struct net *); int nfsd_get_nrthreads(int n, int *, struct net *); int nfsd_set_nrthreads(int n, int *, struct net *); -int nfsd_pool_stats_open(struct inode *, struct file *); -int nfsd_pool_stats_release(struct inode *, struct file *); +void nfsd_shutdown_threads(struct net *net); + +struct svc_rqst *nfsd_current_rqst(void); + +struct nfsdfs_client { + struct kref cl_ref; + void (*cl_release)(struct kref *kref); +}; + +struct nfsdfs_client *get_nfsdfs_client(struct inode *); +struct dentry *nfsd_client_mkdir(struct nfsd_net *nn, + struct nfsdfs_client *ncl, u32 id, + const struct tree_descr *, + struct dentry **fdentries); +void nfsd_client_rmdir(struct dentry *dentry); -void nfsd_destroy(struct net *net); #if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) #ifdef CONFIG_NFSD_V2_ACL @@ -98,17 +133,50 @@ extern const struct svc_version nfsd_acl_version3; #endif #endif +#if IS_ENABLED(CONFIG_NFS_LOCALIO) +extern const struct svc_version localio_version1; +#endif + +struct nfsd_net; + enum vers_op {NFSD_SET, NFSD_CLEAR, NFSD_TEST, NFSD_AVAIL }; -int nfsd_vers(int vers, enum vers_op change); -int nfsd_minorversion(u32 minorversion, enum vers_op change); -void nfsd_reset_versions(void); +int nfsd_vers(struct nfsd_net *nn, int vers, enum vers_op change); +int nfsd_minorversion(struct nfsd_net *nn, u32 minorversion, enum vers_op change); +void nfsd_reset_versions(struct nfsd_net *nn); int nfsd_create_serv(struct net *net); +void nfsd_destroy_serv(struct net *net); + +#ifdef CONFIG_DEBUG_FS +void nfsd_debugfs_init(void); +void nfsd_debugfs_exit(void); +#else +static inline void nfsd_debugfs_init(void) {} +static inline void nfsd_debugfs_exit(void) {} +#endif + +extern bool nfsd_disable_splice_read __read_mostly; + +enum { + /* Any new NFSD_IO enum value must be added at the end */ + NFSD_IO_BUFFERED, + NFSD_IO_DONTCACHE, + NFSD_IO_DIRECT, +}; + +extern u64 nfsd_io_cache_read __read_mostly; +extern u64 nfsd_io_cache_write __read_mostly; extern int nfsd_max_blksize; static inline int nfsd_v4client(struct svc_rqst *rq) { - return rq->rq_prog == NFS_PROGRAM && rq->rq_vers == 4; + return rq && rq->rq_prog == NFS_PROGRAM && rq->rq_vers == 4; +} +static inline struct user_namespace * +nfsd_user_namespace(const struct svc_rqst *rqstp) +{ + const struct cred *cred = rqstp->rq_xprt->xpt_cred; + return cred ? 
cred->user_ns : &init_user_ns; } /* @@ -122,10 +190,12 @@ int nfs4_state_start(void); int nfs4_state_start_net(struct net *net); void nfs4_state_shutdown(void); void nfs4_state_shutdown_net(struct net *net); -void nfs4_reset_lease(time_t leasetime); int nfs4_reset_recoverydir(char *recdir); char * nfs4_recoverydir(void); bool nfsd4_spo_must_allow(struct svc_rqst *rqstp); +int nfsd4_create_laundry_wq(void); +void nfsd4_destroy_laundry_wq(void); +bool nfsd_wait_for_delegreturn(struct svc_rqst *rqstp, struct inode *inode); #else static inline int nfsd4_init_slabs(void) { return 0; } static inline void nfsd4_free_slabs(void) { } @@ -133,13 +203,19 @@ static inline int nfs4_state_start(void) { return 0; } static inline int nfs4_state_start_net(struct net *net) { return 0; } static inline void nfs4_state_shutdown(void) { } static inline void nfs4_state_shutdown_net(struct net *net) { } -static inline void nfs4_reset_lease(time_t leasetime) { } static inline int nfs4_reset_recoverydir(char *recdir) { return 0; } static inline char * nfs4_recoverydir(void) {return NULL; } static inline bool nfsd4_spo_must_allow(struct svc_rqst *rqstp) { return false; } +static inline int nfsd4_create_laundry_wq(void) { return 0; }; +static inline void nfsd4_destroy_laundry_wq(void) {}; +static inline bool nfsd_wait_for_delegreturn(struct svc_rqst *rqstp, + struct inode *inode) +{ + return false; +} #endif /* @@ -169,7 +245,6 @@ void nfsd_lockd_shutdown(void); #define nfserr_nospc cpu_to_be32(NFSERR_NOSPC) #define nfserr_rofs cpu_to_be32(NFSERR_ROFS) #define nfserr_mlink cpu_to_be32(NFSERR_MLINK) -#define nfserr_opnotsupp cpu_to_be32(NFSERR_OPNOTSUPP) #define nfserr_nametoolong cpu_to_be32(NFSERR_NAMETOOLONG) #define nfserr_notempty cpu_to_be32(NFSERR_NOTEMPTY) #define nfserr_dquot cpu_to_be32(NFSERR_DQUOT) @@ -214,9 +289,11 @@ void nfsd_lockd_shutdown(void); #define nfserr_no_grace cpu_to_be32(NFSERR_NO_GRACE) #define nfserr_reclaim_bad cpu_to_be32(NFSERR_RECLAIM_BAD) #define nfserr_badname cpu_to_be32(NFSERR_BADNAME) +#define nfserr_admin_revoked cpu_to_be32(NFS4ERR_ADMIN_REVOKED) #define nfserr_cb_path_down cpu_to_be32(NFSERR_CB_PATH_DOWN) #define nfserr_locked cpu_to_be32(NFSERR_LOCKED) #define nfserr_wrongsec cpu_to_be32(NFSERR_WRONGSEC) +#define nfserr_delay cpu_to_be32(NFS4ERR_DELAY) #define nfserr_badiomode cpu_to_be32(NFS4ERR_BADIOMODE) #define nfserr_badlayout cpu_to_be32(NFS4ERR_BADLAYOUT) #define nfserr_bad_session_digest cpu_to_be32(NFS4ERR_BAD_SESSION_DIGEST) @@ -260,19 +337,35 @@ void nfsd_lockd_shutdown(void); #define nfserr_union_notsupp cpu_to_be32(NFS4ERR_UNION_NOTSUPP) #define nfserr_offload_denied cpu_to_be32(NFS4ERR_OFFLOAD_DENIED) #define nfserr_wrong_lfs cpu_to_be32(NFS4ERR_WRONG_LFS) -#define nfserr_badlabel cpu_to_be32(NFS4ERR_BADLABEL) +#define nfserr_badlabel cpu_to_be32(NFS4ERR_BADLABEL) +#define nfserr_file_open cpu_to_be32(NFS4ERR_FILE_OPEN) +#define nfserr_xattr2big cpu_to_be32(NFS4ERR_XATTR2BIG) +#define nfserr_noxattr cpu_to_be32(NFS4ERR_NOXATTR) -/* error codes for internal use */ -/* if a request fails due to kmalloc failure, it gets dropped. - * Client should resend eventually +/* + * Error codes for internal use. We use enum to choose numbers that are + * not already assigned, then covert to be32 resulting in a number that + * cannot conflict with any existing be32 nfserr value. 
*/ -#define nfserr_dropit cpu_to_be32(30000) +enum { /* end-of-file indicator in readdir */ -#define nfserr_eof cpu_to_be32(30001) + NFSERR_EOF = NFS4ERR_FIRST_FREE, +#define nfserr_eof cpu_to_be32(NFSERR_EOF) + /* replay detected */ -#define nfserr_replay_me cpu_to_be32(11001) + NFSERR_REPLAY_ME, +#define nfserr_replay_me cpu_to_be32(NFSERR_REPLAY_ME) + /* nfs41 replay detected */ -#define nfserr_replay_cache cpu_to_be32(11002) + NFSERR_REPLAY_CACHE, +#define nfserr_replay_cache cpu_to_be32(NFSERR_REPLAY_CACHE) + +/* symlink found where dir expected - handled differently to + * other symlink found errors by NFSv3. + */ + NFSERR_SYMLINK_NOT_DIR, +#define nfserr_symlink_not_dir cpu_to_be32(NFSERR_SYMLINK_NOT_DIR) +}; /* Check for dir entries '.' and '..' */ #define isdotent(n, l) (l < 3 && n[0] == '.' && (l == 1 || n[1] == '.')) @@ -298,16 +391,20 @@ void nfsd_lockd_shutdown(void); #define COMPOUND_ERR_SLACK_SPACE 16 /* OP_SETATTR */ #define NFSD_LAUNDROMAT_MINTIMEOUT 1 /* seconds */ +#define NFSD_COURTESY_CLIENT_TIMEOUT (24 * 60 * 60) /* seconds */ +#define NFSD_CLIENT_MAX_TRIM_PER_RUN 128 +#define NFS4_CLIENTS_PER_GB 1024 +#define NFSD_DELEGRETURN_TIMEOUT (HZ / 34) /* 30ms */ +#define NFSD_CB_GETATTR_TIMEOUT NFSD_DELEGRETURN_TIMEOUT /* - * The following attributes are currently not supported by the NFSv4 server: + * The following attributes are not implemented by NFSD: * ARCHIVE (deprecated anyway) * HIDDEN (unlikely to be supported any time soon) * MIMETYPE (unlikely to be supported any time soon) * QUOTA_* (will be supported in a forthcoming patch) * SYSTEM (unlikely to be supported any time soon) * TIME_BACKUP (unlikely to be supported any time soon) - * TIME_CREATE (unlikely to be supported any time soon) */ #define NFSD4_SUPPORTED_ATTRS_WORD0 \ (FATTR4_WORD0_SUPPORTED_ATTRS | FATTR4_WORD0_TYPE | FATTR4_WORD0_FH_EXPIRE_TYPE \ @@ -326,7 +423,7 @@ void nfsd_lockd_shutdown(void); | FATTR4_WORD1_OWNER | FATTR4_WORD1_OWNER_GROUP | FATTR4_WORD1_RAWDEV \ | FATTR4_WORD1_SPACE_AVAIL | FATTR4_WORD1_SPACE_FREE | FATTR4_WORD1_SPACE_TOTAL \ | FATTR4_WORD1_SPACE_USED | FATTR4_WORD1_TIME_ACCESS | FATTR4_WORD1_TIME_ACCESS_SET \ - | FATTR4_WORD1_TIME_DELTA | FATTR4_WORD1_TIME_METADATA \ + | FATTR4_WORD1_TIME_DELTA | FATTR4_WORD1_TIME_METADATA | FATTR4_WORD1_TIME_CREATE \ | FATTR4_WORD1_TIME_MODIFY | FATTR4_WORD1_TIME_MODIFY_SET | FATTR4_WORD1_MOUNTED_ON_FILEID) #define NFSD4_SUPPORTED_ATTRS_WORD2 0 @@ -360,12 +457,47 @@ void nfsd_lockd_shutdown(void); #define NFSD4_2_SUPPORTED_ATTRS_WORD2 \ (NFSD4_1_SUPPORTED_ATTRS_WORD2 | \ - FATTR4_WORD2_CHANGE_ATTR_TYPE | \ FATTR4_WORD2_MODE_UMASK | \ - NFSD4_2_SECURITY_ATTRS) + FATTR4_WORD2_CLONE_BLKSIZE | \ + NFSD4_2_SECURITY_ATTRS | \ + FATTR4_WORD2_XATTR_SUPPORT | \ + FATTR4_WORD2_TIME_DELEG_ACCESS | \ + FATTR4_WORD2_TIME_DELEG_MODIFY | \ + FATTR4_WORD2_OPEN_ARGUMENTS) extern const u32 nfsd_suppattrs[3][3]; +static inline __be32 nfsd4_set_netaddr(struct sockaddr *addr, + struct nfs42_netaddr *netaddr) +{ + struct sockaddr_in *sin = (struct sockaddr_in *)addr; + struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)addr; + unsigned int port; + size_t ret_addr, ret_port; + + switch (addr->sa_family) { + case AF_INET: + port = ntohs(sin->sin_port); + sprintf(netaddr->netid, "tcp"); + netaddr->netid_len = 3; + break; + case AF_INET6: + port = ntohs(sin6->sin6_port); + sprintf(netaddr->netid, "tcp6"); + netaddr->netid_len = 4; + break; + default: + return nfserr_inval; + } + ret_addr = rpc_ntop(addr, netaddr->addr, sizeof(netaddr->addr)); + ret_port = 
snprintf(netaddr->addr + ret_addr, + RPCBIND_MAXUADDRLEN + 1 - ret_addr, + ".%u.%u", port >> 8, port & 0xff); + WARN_ON(ret_port >= RPCBIND_MAXUADDRLEN + 1 - ret_addr); + netaddr->addr_len = ret_addr + ret_port; + return 0; +} + static inline bool bmval_is_subset(const u32 *bm1, const u32 *bm2) { return !((bm1[0] & ~bm2[0]) || @@ -391,7 +523,8 @@ static inline bool nfsd_attrs_supported(u32 minorversion, const u32 *bmval) (FATTR4_WORD0_SIZE | FATTR4_WORD0_ACL) #define NFSD_WRITEABLE_ATTRS_WORD1 \ (FATTR4_WORD1_MODE | FATTR4_WORD1_OWNER | FATTR4_WORD1_OWNER_GROUP \ - | FATTR4_WORD1_TIME_ACCESS_SET | FATTR4_WORD1_TIME_MODIFY_SET) + | FATTR4_WORD1_TIME_ACCESS_SET | FATTR4_WORD1_TIME_CREATE \ + | FATTR4_WORD1_TIME_MODIFY_SET) #ifdef CONFIG_NFSD_V4_SECURITY_LABEL #define MAYBE_FATTR4_WORD2_SECURITY_LABEL \ FATTR4_WORD2_SECURITY_LABEL @@ -400,7 +533,10 @@ static inline bool nfsd_attrs_supported(u32 minorversion, const u32 *bmval) #endif #define NFSD_WRITEABLE_ATTRS_WORD2 \ (FATTR4_WORD2_MODE_UMASK \ - | MAYBE_FATTR4_WORD2_SECURITY_LABEL) + | MAYBE_FATTR4_WORD2_SECURITY_LABEL \ + | FATTR4_WORD2_TIME_DELEG_ACCESS \ + | FATTR4_WORD2_TIME_DELEG_MODIFY \ + ) #define NFSD_SUPPATTR_EXCLCREAT_WORD0 \ NFSD_WRITEABLE_ATTRS_WORD0 @@ -417,12 +553,20 @@ static inline bool nfsd_attrs_supported(u32 minorversion, const u32 *bmval) extern int nfsd4_is_junction(struct dentry *dentry); extern int register_cld_notifier(void); extern void unregister_cld_notifier(void); +#ifdef CONFIG_NFSD_V4_2_INTER_SSC +extern void nfsd4_ssc_init_umount_work(struct nfsd_net *nn); +#endif + +extern void nfsd4_init_leases_net(struct nfsd_net *nn); + #else /* CONFIG_NFSD_V4 */ static inline int nfsd4_is_junction(struct dentry *dentry) { return 0; } +static inline void nfsd4_init_leases_net(struct nfsd_net *nn) { }; + #define register_cld_notifier() 0 #define unregister_cld_notifier() do { } while(0) diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index b319080288c3..ed85dd43da18 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -14,6 +14,7 @@ #include "nfsd.h" #include "vfs.h" #include "auth.h" +#include "trace.h" #define NFSDDBG_FACILITY NFSDDBG_FH @@ -39,7 +40,8 @@ static int nfsd_acceptable(void *expv, struct dentry *dentry) /* make sure parents give x permission to user */ int err; parent = dget_parent(tdentry); - err = inode_permission(d_inode(parent), MAY_EXEC); + err = inode_permission(&nop_mnt_idmap, + d_inode(parent), MAY_EXEC); if (err < 0) { dput(parent); break; @@ -60,8 +62,7 @@ static int nfsd_acceptable(void *expv, struct dentry *dentry) * the write call). 
*/ static inline __be32 -nfsd_mode_check(struct svc_rqst *rqstp, struct dentry *dentry, - umode_t requested) +nfsd_mode_check(struct dentry *dentry, umode_t requested) { umode_t mode = d_inode(dentry)->i_mode & S_IFMT; @@ -74,36 +75,36 @@ nfsd_mode_check(struct svc_rqst *rqstp, struct dentry *dentry, } return nfs_ok; } - /* - * v4 has an error more specific than err_notdir which we should - * return in preference to err_notdir: - */ - if (rqstp->rq_vers == 4 && mode == S_IFLNK) + if (mode == S_IFLNK) { + if (requested == S_IFDIR) + return nfserr_symlink_not_dir; return nfserr_symlink; + } if (requested == S_IFDIR) return nfserr_notdir; if (mode == S_IFDIR) return nfserr_isdir; - return nfserr_inval; + return nfserr_wrong_type; } -static bool nfsd_originating_port_ok(struct svc_rqst *rqstp, int flags) +static bool nfsd_originating_port_ok(struct svc_rqst *rqstp, + struct svc_cred *cred, + struct svc_export *exp) { - if (flags & NFSEXP_INSECURE_PORT) + if (nfsexp_flags(cred, exp) & NFSEXP_INSECURE_PORT) return true; /* We don't require gss requests to use low ports: */ - if (rqstp->rq_cred.cr_flavor >= RPC_AUTH_GSS) + if (cred->cr_flavor >= RPC_AUTH_GSS) return true; return test_bit(RQ_SECURE, &rqstp->rq_flags); } static __be32 nfsd_setuser_and_check_port(struct svc_rqst *rqstp, + struct svc_cred *cred, struct svc_export *exp) { - int flags = nfsexp_flags(rqstp, exp); - /* Check if the request originated from a secure port. */ - if (!nfsd_originating_port_ok(rqstp, flags)) { + if (rqstp && !nfsd_originating_port_ok(rqstp, cred, exp)) { RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]); dprintk("nfsd: request from insecure port %s!\n", svc_print_addr(rqstp, buf, sizeof(buf))); @@ -111,23 +112,15 @@ static __be32 nfsd_setuser_and_check_port(struct svc_rqst *rqstp, } /* Set user creds for this exportpoint */ - return nfserrno(nfsd_setuser(rqstp, exp)); + return nfserrno(nfsd_setuser(cred, exp)); } -static inline __be32 check_pseudo_root(struct svc_rqst *rqstp, - struct dentry *dentry, struct svc_export *exp) +static inline __be32 check_pseudo_root(struct dentry *dentry, + struct svc_export *exp) { if (!(exp->ex_flags & NFSEXP_V4ROOT)) return nfs_ok; /* - * v2/v3 clients have no need for the V4ROOT export--they use - * the mount protocl instead; also, further V4ROOT checks may be - * in v4-specific code, in which case v2/v3 clients could bypass - * them. - */ - if (!nfsd_v4client(rqstp)) - return nfserr_stale; - /* * We're exposing only the directories and symlinks that have to be * traversed on the way to real exports: */ @@ -149,71 +142,68 @@ static inline __be32 check_pseudo_root(struct svc_rqst *rqstp, * dentry. On success, the results are used to set fh_export and * fh_dentry. 
*/ -static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp) +static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct net *net, + struct svc_cred *cred, + struct auth_domain *client, + struct auth_domain *gssclient, + struct svc_fh *fhp) { struct knfsd_fh *fh = &fhp->fh_handle; - struct fid *fid = NULL, sfid; + struct fid *fid = NULL; struct svc_export *exp; struct dentry *dentry; int fileid_type; int data_left = fh->fh_size/4; + int len; __be32 error; - error = nfserr_stale; - if (rqstp->rq_vers > 2) - error = nfserr_badhandle; - if (rqstp->rq_vers == 4 && fh->fh_size == 0) + error = nfserr_badhandle; + if (fh->fh_size == 0) return nfserr_nofilehandle; - if (fh->fh_version == 1) { - int len; + if (fh->fh_version != 1) + return error; - if (--data_left < 0) - return error; - if (fh->fh_auth_type != 0) - return error; - len = key_len(fh->fh_fsid_type) / 4; - if (len == 0) - return error; - if (fh->fh_fsid_type == FSID_MAJOR_MINOR) { - /* deprecated, convert to type 3 */ - len = key_len(FSID_ENCODE_DEV)/4; - fh->fh_fsid_type = FSID_ENCODE_DEV; - /* - * struct knfsd_fh uses host-endian fields, which are - * sometimes used to hold net-endian values. This - * confuses sparse, so we must use __force here to - * keep it from complaining. - */ - fh->fh_fsid[0] = new_encode_dev(MKDEV(ntohl((__force __be32)fh->fh_fsid[0]), - ntohl((__force __be32)fh->fh_fsid[1]))); - fh->fh_fsid[1] = fh->fh_fsid[2]; - } - data_left -= len; - if (data_left < 0) - return error; - exp = rqst_exp_find(rqstp, fh->fh_fsid_type, fh->fh_fsid); - fid = (struct fid *)(fh->fh_fsid + len); - } else { - __u32 tfh[2]; - dev_t xdev; - ino_t xino; + if (--data_left < 0) + return error; + if (fh->fh_auth_type != 0) + return error; + len = key_len(fh->fh_fsid_type) / 4; + if (len == 0) + return error; + if (fh->fh_fsid_type == FSID_MAJOR_MINOR) { + u32 *fsid = fh_fsid(fh); - if (fh->fh_size != NFS_FHSIZE) - return error; - /* assume old filehandle format */ - xdev = old_decode_dev(fh->ofh_xdev); - xino = u32_to_ino_t(fh->ofh_xino); - mk_fsid(FSID_DEV, tfh, xdev, xino, 0, NULL); - exp = rqst_exp_find(rqstp, FSID_DEV, tfh); + /* deprecated, convert to type 3 */ + len = key_len(FSID_ENCODE_DEV)/4; + fh->fh_fsid_type = FSID_ENCODE_DEV; + /* + * struct knfsd_fh uses host-endian fields, which are + * sometimes used to hold net-endian values. This + * confuses sparse, so we must use __force here to + * keep it from complaining. + */ + fsid[0] = new_encode_dev(MKDEV(ntohl((__force __be32)fsid[0]), + ntohl((__force __be32)fsid[1]))); + fsid[1] = fsid[2]; } + data_left -= len; + if (data_left < 0) + return error; + exp = rqst_exp_find(rqstp ? 
&rqstp->rq_chandle : NULL, + net, client, gssclient, + fh->fh_fsid_type, fh_fsid(fh)); + fid = (struct fid *)(fh_fsid(fh) + len); error = nfserr_stale; - if (PTR_ERR(exp) == -ENOENT) - return error; + if (IS_ERR(exp)) { + trace_nfsd_set_fh_dentry_badexport(rqstp, fhp, PTR_ERR(exp)); + + if (PTR_ERR(exp) == -ENOENT) + return error; - if (IS_ERR(exp)) return nfserrno(PTR_ERR(exp)); + } if (exp->ex_flags & NFSEXP_NOSUBTREECHECK) { /* Elevate privileges so that the lack of 'r' or 'x' @@ -234,9 +224,8 @@ static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp) cap_raise_nfsd_set(new->cap_effective, new->cap_permitted); put_cred(override_creds(new)); - put_cred(new); } else { - error = nfsd_setuser_and_check_port(rqstp, exp); + error = nfsd_setuser_and_check_port(rqstp, cred, exp); if (error) goto out; } @@ -244,29 +233,27 @@ static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp) /* * Look up the dentry using the NFS file handle. */ - error = nfserr_stale; - if (rqstp->rq_vers > 2) - error = nfserr_badhandle; - - if (fh->fh_version != 1) { - sfid.i32.ino = fh->ofh_ino; - sfid.i32.gen = fh->ofh_generation; - sfid.i32.parent_ino = fh->ofh_dirino; - fid = &sfid; - data_left = 3; - if (fh->ofh_dirino == 0) - fileid_type = FILEID_INO32_GEN; - else - fileid_type = FILEID_INO32_GEN_PARENT; - } else - fileid_type = fh->fh_fileid_type; + error = nfserr_badhandle; + + fileid_type = fh->fh_fileid_type; if (fileid_type == FILEID_ROOT) dentry = dget(exp->ex_path.dentry); else { - dentry = exportfs_decode_fh(exp->ex_path.mnt, fid, - data_left, fileid_type, - nfsd_acceptable, exp); + dentry = exportfs_decode_fh_raw(exp->ex_path.mnt, fid, + data_left, fileid_type, 0, + nfsd_acceptable, exp); + if (IS_ERR_OR_NULL(dentry)) { + trace_nfsd_set_fh_dentry_badhandle(rqstp, fhp, + dentry ? PTR_ERR(dentry) : -ESTALE); + switch (PTR_ERR(dentry)) { + case -ENOMEM: + case -ETIMEDOUT: + break; + default: + dentry = ERR_PTR(-ESTALE); + } + } } if (dentry == NULL) goto out; @@ -282,8 +269,30 @@ static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp) dentry); } + switch (fhp->fh_maxsize) { + case NFS4_FHSIZE: + if (dentry->d_sb->s_export_op->flags & EXPORT_OP_NOATOMIC_ATTR) + fhp->fh_no_atomic_attr = true; + fhp->fh_64bit_cookies = true; + break; + case NFS3_FHSIZE: + if (dentry->d_sb->s_export_op->flags & EXPORT_OP_NOWCC) + fhp->fh_no_wcc = true; + fhp->fh_64bit_cookies = true; + if (exp->ex_flags & NFSEXP_V4ROOT) + goto out; + break; + case NFS_FHSIZE: + fhp->fh_no_wcc = true; + if (EX_WGATHER(exp)) + fhp->fh_use_wgather = true; + if (exp->ex_flags & NFSEXP_V4ROOT) + goto out; + } + fhp->fh_dentry = dentry; fhp->fh_export = exp; + return 0; out: exp_put(exp); @@ -291,48 +300,42 @@ out: } /** - * fh_verify - filehandle lookup and access checking - * @rqstp: pointer to current rpc request + * __fh_verify - filehandle lookup and access checking + * @rqstp: RPC transaction context, or NULL + * @net: net namespace in which to perform the export lookup + * @cred: RPC user credential + * @client: RPC auth domain + * @gssclient: RPC GSS auth domain, or NULL * @fhp: filehandle to be verified * @type: expected type of object pointed to by filehandle * @access: type of access needed to object * - * Look up a dentry from the on-the-wire filehandle, check the client's - * access to the export, and set the current task's credentials. - * - * Regardless of success or failure of fh_verify(), fh_put() should be - * called on @fhp when the caller is finished with the filehandle. 
- * - * fh_verify() may be called multiple times on a given filehandle, for - * example, when processing an NFSv4 compound. The first call will look - * up a dentry using the on-the-wire filehandle. Subsequent calls will - * skip the lookup and just perform the other checks and possibly change - * the current task's credentials. - * - * @type specifies the type of object expected using one of the S_IF* - * constants defined in include/linux/stat.h. The caller may use zero - * to indicate that it doesn't care, or a negative integer to indicate - * that it expects something not of the given type. - * - * @access is formed from the NFSD_MAY_* constants defined in - * fs/nfsd/vfs.h. + * See fh_verify() for further descriptions of @fhp, @type, and @access. */ -__be32 -fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, int access) +static __be32 +__fh_verify(struct svc_rqst *rqstp, + struct net *net, struct svc_cred *cred, + struct auth_domain *client, + struct auth_domain *gssclient, + struct svc_fh *fhp, umode_t type, int access) { - struct svc_export *exp; + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + struct svc_export *exp = NULL; + bool may_bypass_gss = false; struct dentry *dentry; __be32 error; - dprintk("nfsd: fh_verify(%s)\n", SVCFH_fmt(fhp)); - if (!fhp->fh_dentry) { - error = nfsd_set_fh_dentry(rqstp, fhp); + error = nfsd_set_fh_dentry(rqstp, net, cred, client, + gssclient, fhp); if (error) goto out; } dentry = fhp->fh_dentry; exp = fhp->fh_export; + + trace_nfsd_fh_verify(rqstp, fhp, type, access); + /* * We still have to do all these permission checks, even when * fh_dentry is already set: @@ -349,25 +352,44 @@ fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, int access) * (for example, if different id-squashing options are in * effect on the new filesystem). */ - error = check_pseudo_root(rqstp, dentry, exp); + error = check_pseudo_root(dentry, exp); if (error) goto out; - error = nfsd_setuser_and_check_port(rqstp, exp); + error = nfsd_setuser_and_check_port(rqstp, cred, exp); if (error) goto out; - error = nfsd_mode_check(rqstp, dentry, type); + error = nfsd_mode_check(dentry, type); if (error) goto out; /* - * pseudoflavor restrictions are not enforced on NLM, - * which clients virtually always use auth_sys for, - * even while using RPCSEC_GSS for NFS. + * If rqstp is NULL, this is a LOCALIO request which will only + * ever use a filehandle/credential pair for which access has + * been affirmed (by ACCESS or OPEN NFS requests) over the + * wire. Skip both the xprtsec policy and the security flavor + * checks. */ - if (access & NFSD_MAY_LOCK || access & NFSD_MAY_BYPASS_GSS) - goto skip_pseudoflavor_check; + if (!rqstp) + goto check_permissions; + + if ((access & NFSD_MAY_NLM) && (exp->ex_flags & NFSEXP_NOAUTHNLM)) + /* NLM is allowed to fully bypass authentication */ + goto out; + + /* + * NLM is allowed to bypass the xprtsec policy check because lockd + * doesn't support xprtsec. 
+ */ + if (!(access & NFSD_MAY_NLM)) { + error = check_xprtsec_policy(exp, rqstp); + if (error) + goto out; + } + + if (access & NFSD_MAY_BYPASS_GSS) + may_bypass_gss = true; /* * Clients may expect to be able to use auth_sys during mount, * even if they use gss for everything else; see section 2.3.2 @@ -375,28 +397,81 @@ fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, int access) */ if (access & NFSD_MAY_BYPASS_GSS_ON_ROOT && exp->ex_path.dentry == dentry) - goto skip_pseudoflavor_check; + may_bypass_gss = true; - error = check_nfsd_access(exp, rqstp); + error = check_security_flavor(exp, rqstp, may_bypass_gss); if (error) goto out; -skip_pseudoflavor_check: - /* Finally, check access permissions. */ - error = nfsd_permission(rqstp, exp, dentry, access); + svc_xprt_set_valid(rqstp->rq_xprt); - if (error) { - dprintk("fh_verify: %pd2 permission failure, " - "acc=%x, error=%d\n", - dentry, - access, ntohl(error)); - } +check_permissions: + /* Finally, check access permissions. */ + error = nfsd_permission(cred, exp, dentry, access); out: + trace_nfsd_fh_verify_err(rqstp, fhp, type, access, error); if (error == nfserr_stale) - nfsdstats.fh_stale++; + nfsd_stats_fh_stale_inc(nn, exp); return error; } +/** + * fh_verify_local - filehandle lookup and access checking + * @net: net namespace in which to perform the export lookup + * @cred: RPC user credential + * @client: RPC auth domain + * @fhp: filehandle to be verified + * @type: expected type of object pointed to by filehandle + * @access: type of access needed to object + * + * This API can be used by callers who do not have an RPC + * transaction context (ie are not running in an nfsd thread). + * + * See fh_verify() for further descriptions of @fhp, @type, and @access. + */ +__be32 +fh_verify_local(struct net *net, struct svc_cred *cred, + struct auth_domain *client, struct svc_fh *fhp, + umode_t type, int access) +{ + return __fh_verify(NULL, net, cred, client, NULL, + fhp, type, access); +} + +/** + * fh_verify - filehandle lookup and access checking + * @rqstp: pointer to current rpc request + * @fhp: filehandle to be verified + * @type: expected type of object pointed to by filehandle + * @access: type of access needed to object + * + * Look up a dentry from the on-the-wire filehandle, check the client's + * access to the export, and set the current task's credentials. + * + * Regardless of success or failure of fh_verify(), fh_put() should be + * called on @fhp when the caller is finished with the filehandle. + * + * fh_verify() may be called multiple times on a given filehandle, for + * example, when processing an NFSv4 compound. The first call will look + * up a dentry using the on-the-wire filehandle. Subsequent calls will + * skip the lookup and just perform the other checks and possibly change + * the current task's credentials. + * + * @type specifies the type of object expected using one of the S_IF* + * constants defined in include/linux/stat.h. The caller may use zero + * to indicate that it doesn't care, or a negative integer to indicate + * that it expects something not of the given type. + * + * @access is formed from the NFSD_MAY_* constants defined in + * fs/nfsd/vfs.h. + */ +__be32 +fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, int access) +{ + return __fh_verify(rqstp, SVC_NET(rqstp), &rqstp->rq_cred, + rqstp->rq_client, rqstp->rq_gssclient, + fhp, type, access); +} /* * Compose a file handle for an NFS reply. 
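 *
 * (Reader's sketch of the flow implemented below, for orientation only:
 * fh_compose() picks the version-1 layout via set_version_and_fsid_type(),
 * writes the fsid words with mk_fsid(), and _fh_update() asks the exporting
 * filesystem to encode the fileid through exportfs_encode_fh(); if that
 * encoding fails, the fileid type is FILEID_INVALID and the handle is
 * dropped with nfserr_stale.)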
@@ -410,32 +485,21 @@ static void _fh_update(struct svc_fh *fhp, struct svc_export *exp, { if (dentry != exp->ex_path.dentry) { struct fid *fid = (struct fid *) - (fhp->fh_handle.fh_fsid + fhp->fh_handle.fh_size/4 - 1); + (fh_fsid(&fhp->fh_handle) + fhp->fh_handle.fh_size/4 - 1); int maxsize = (fhp->fh_maxsize - fhp->fh_handle.fh_size)/4; - int subtreecheck = !(exp->ex_flags & NFSEXP_NOSUBTREECHECK); + int fh_flags = (exp->ex_flags & NFSEXP_NOSUBTREECHECK) ? 0 : + EXPORT_FH_CONNECTABLE; + int fileid_type = + exportfs_encode_fh(dentry, fid, &maxsize, fh_flags); fhp->fh_handle.fh_fileid_type = - exportfs_encode_fh(dentry, fid, &maxsize, subtreecheck); + fileid_type > 0 ? fileid_type : FILEID_INVALID; fhp->fh_handle.fh_size += maxsize * 4; } else { fhp->fh_handle.fh_fileid_type = FILEID_ROOT; } } -/* - * for composing old style file handles - */ -static inline void _fh_update_old(struct dentry *dentry, - struct svc_export *exp, - struct knfsd_fh *fh) -{ - fh->ofh_ino = ino_t_to_u32(d_inode(dentry)->i_ino); - fh->ofh_generation = d_inode(dentry)->i_generation; - if (d_is_dir(dentry) || - (exp->ex_flags & NFSEXP_NOSUBTREECHECK)) - fh->ofh_dirino = 0; -} - static bool is_root_export(struct svc_export *exp) { return exp->ex_path.dentry == exp->ex_path.dentry->d_sb->s_root; @@ -452,7 +516,7 @@ static bool fsid_type_ok_for_exp(u8 fsid_type, struct svc_export *exp) case FSID_DEV: if (!old_valid_dev(exp_sb(exp)->s_dev)) return false; - /* FALL THROUGH */ + fallthrough; case FSID_MAJOR_MINOR: case FSID_ENCODE_DEV: return exp_sb(exp)->s_type->fs_flags & FS_REQUIRES_DEV; @@ -462,7 +526,7 @@ static bool fsid_type_ok_for_exp(u8 fsid_type, struct svc_export *exp) case FSID_UUID16: if (!is_root_export(exp)) return false; - /* fall through */ + fallthrough; case FSID_UUID4_INUM: case FSID_UUID16_INUM: return exp->ex_uuid != NULL; @@ -532,9 +596,6 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry, /* ref_fh is a reference file handle. * if it is non-null and for the same filesystem, then we should compose * a filehandle which is of the same version, where possible. - * Currently, that means that if ref_fh->fh_handle.fh_version == 0xca - * Then create a 32byte filehandle using nfs_fhbase_old - * */ struct inode * inode = d_inode(dentry); @@ -552,10 +613,13 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry, */ set_version_and_fsid_type(fhp, exp, ref_fh); + /* If we have a ref_fh, then copy the fh_no_wcc setting from it. */ + fhp->fh_no_wcc = ref_fh ? 
ref_fh->fh_no_wcc : false; + if (ref_fh == fhp) fh_put(ref_fh); - if (fhp->fh_locked || fhp->fh_dentry) { + if (fhp->fh_dentry) { printk(KERN_ERR "fh_compose: fh %pd2 not initialized!\n", dentry); } @@ -567,35 +631,21 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry, fhp->fh_dentry = dget(dentry); /* our internal copy */ fhp->fh_export = exp_get(exp); - if (fhp->fh_handle.fh_version == 0xca) { - /* old style filehandle please */ - memset(&fhp->fh_handle.fh_base, 0, NFS_FHSIZE); - fhp->fh_handle.fh_size = NFS_FHSIZE; - fhp->fh_handle.ofh_dcookie = 0xfeebbaca; - fhp->fh_handle.ofh_dev = old_encode_dev(ex_dev); - fhp->fh_handle.ofh_xdev = fhp->fh_handle.ofh_dev; - fhp->fh_handle.ofh_xino = - ino_t_to_u32(d_inode(exp->ex_path.dentry)->i_ino); - fhp->fh_handle.ofh_dirino = ino_t_to_u32(parent_ino(dentry)); - if (inode) - _fh_update_old(dentry, exp, &fhp->fh_handle); - } else { - fhp->fh_handle.fh_size = - key_len(fhp->fh_handle.fh_fsid_type) + 4; - fhp->fh_handle.fh_auth_type = 0; - - mk_fsid(fhp->fh_handle.fh_fsid_type, - fhp->fh_handle.fh_fsid, - ex_dev, - d_inode(exp->ex_path.dentry)->i_ino, - exp->ex_fsid, exp->ex_uuid); - - if (inode) - _fh_update(fhp, exp, dentry); - if (fhp->fh_handle.fh_fileid_type == FILEID_INVALID) { - fh_put(fhp); - return nfserr_opnotsupp; - } + fhp->fh_handle.fh_size = + key_len(fhp->fh_handle.fh_fsid_type) + 4; + fhp->fh_handle.fh_auth_type = 0; + + mk_fsid(fhp->fh_handle.fh_fsid_type, + fh_fsid(&fhp->fh_handle), + ex_dev, + d_inode(exp->ex_path.dentry)->i_ino, + exp->ex_fsid, exp->ex_uuid); + + if (inode) + _fh_update(fhp, exp, dentry); + if (fhp->fh_handle.fh_fileid_type == FILEID_INVALID) { + fh_put(fhp); + return nfserr_stale; } return 0; @@ -616,16 +666,12 @@ fh_update(struct svc_fh *fhp) dentry = fhp->fh_dentry; if (d_really_is_negative(dentry)) goto out_negative; - if (fhp->fh_handle.fh_version != 1) { - _fh_update_old(dentry, fhp->fh_export, &fhp->fh_handle); - } else { - if (fhp->fh_handle.fh_fileid_type != FILEID_ROOT) - return 0; + if (fhp->fh_handle.fh_fileid_type != FILEID_ROOT) + return 0; - _fh_update(fhp, fhp->fh_export, dentry); - if (fhp->fh_handle.fh_fileid_type == FILEID_INVALID) - return nfserr_opnotsupp; - } + _fh_update(fhp, fhp->fh_export, dentry); + if (fhp->fh_handle.fh_fileid_type == FILEID_INVALID) + return nfserr_stale; return 0; out_bad: printk(KERN_ERR "fh_update: fh not verified!\n"); @@ -636,6 +682,111 @@ out_negative: return nfserr_serverfault; } +/** + * fh_getattr - Retrieve attributes on a local file + * @fhp: File handle of target file + * @stat: Caller-supplied kstat buffer to be filled in + * + * Returns nfs_ok on success, otherwise an NFS status code is + * returned. 
+ */ +__be32 fh_getattr(const struct svc_fh *fhp, struct kstat *stat) +{ + struct path p = { + .mnt = fhp->fh_export->ex_path.mnt, + .dentry = fhp->fh_dentry, + }; + struct inode *inode = d_inode(p.dentry); + u32 request_mask = STATX_BASIC_STATS; + + if (S_ISREG(inode->i_mode)) + request_mask |= (STATX_DIOALIGN | STATX_DIO_READ_ALIGN); + + if (fhp->fh_maxsize == NFS4_FHSIZE) + request_mask |= (STATX_BTIME | STATX_CHANGE_COOKIE); + + return nfserrno(vfs_getattr(&p, stat, request_mask, + AT_STATX_SYNC_AS_STAT)); +} + +/** + * fh_fill_pre_attrs - Fill in pre-op attributes + * @fhp: file handle to be updated + * + */ +__be32 __must_check fh_fill_pre_attrs(struct svc_fh *fhp) +{ + bool v4 = (fhp->fh_maxsize == NFS4_FHSIZE); + struct kstat stat; + __be32 err; + + if (fhp->fh_no_wcc || fhp->fh_pre_saved) + return nfs_ok; + + err = fh_getattr(fhp, &stat); + if (err) + return err; + + if (v4) + fhp->fh_pre_change = nfsd4_change_attribute(&stat); + + fhp->fh_pre_mtime = stat.mtime; + fhp->fh_pre_ctime = stat.ctime; + fhp->fh_pre_size = stat.size; + fhp->fh_pre_saved = true; + return nfs_ok; +} + +/** + * fh_fill_post_attrs - Fill in post-op attributes + * @fhp: file handle to be updated + * + */ +__be32 fh_fill_post_attrs(struct svc_fh *fhp) +{ + bool v4 = (fhp->fh_maxsize == NFS4_FHSIZE); + __be32 err; + + if (fhp->fh_no_wcc) + return nfs_ok; + + if (fhp->fh_post_saved) + printk("nfsd: inode locked twice during operation.\n"); + + err = fh_getattr(fhp, &fhp->fh_post_attr); + if (err) + return err; + + fhp->fh_post_saved = true; + if (v4) + fhp->fh_post_change = + nfsd4_change_attribute(&fhp->fh_post_attr); + return nfs_ok; +} + +/** + * fh_fill_both_attrs - Fill pre-op and post-op attributes + * @fhp: file handle to be updated + * + * This is used when the directory wasn't changed, but wcc attributes + * are needed anyway. + */ +__be32 __must_check fh_fill_both_attrs(struct svc_fh *fhp) +{ + __be32 err; + + err = fh_fill_post_attrs(fhp); + if (err) + return err; + + fhp->fh_pre_change = fhp->fh_post_change; + fhp->fh_pre_mtime = fhp->fh_post_attr.mtime; + fhp->fh_pre_ctime = fhp->fh_post_attr.ctime; + fhp->fh_pre_size = fhp->fh_post_attr.size; + fhp->fh_pre_saved = true; + return nfs_ok; +} + /* * Release a file handle. 
*/ @@ -645,16 +796,16 @@ fh_put(struct svc_fh *fhp) struct dentry * dentry = fhp->fh_dentry; struct svc_export * exp = fhp->fh_export; if (dentry) { - fh_unlock(fhp); fhp->fh_dentry = NULL; dput(dentry); - fh_clear_wcc(fhp); + fh_clear_pre_post_attrs(fhp); } fh_drop_write(fhp); if (exp) { exp_put(exp); fhp->fh_export = NULL; } + fhp->fh_no_wcc = false; return; } @@ -664,20 +815,15 @@ fh_put(struct svc_fh *fhp) char * SVCFH_fmt(struct svc_fh *fhp) { struct knfsd_fh *fh = &fhp->fh_handle; + static char buf[2+1+1+64*3+1]; - static char buf[80]; - sprintf(buf, "%d: %08x %08x %08x %08x %08x %08x", - fh->fh_size, - fh->fh_base.fh_pad[0], - fh->fh_base.fh_pad[1], - fh->fh_base.fh_pad[2], - fh->fh_base.fh_pad[3], - fh->fh_base.fh_pad[4], - fh->fh_base.fh_pad[5]); + if (fh->fh_size > 64) + return "bad-fh"; + sprintf(buf, "%d: %*ph", fh->fh_size, fh->fh_size, fh->fh_raw); return buf; } -enum fsid_source fsid_source(struct svc_fh *fhp) +enum fsid_source fsid_source(const struct svc_fh *fhp) { if (fhp->fh_handle.fh_version != 1) return FSIDSOURCE_DEV; @@ -704,3 +850,44 @@ enum fsid_source fsid_source(struct svc_fh *fhp) return FSIDSOURCE_UUID; return FSIDSOURCE_DEV; } + +/** + * nfsd4_change_attribute - Generate an NFSv4 change_attribute value + * @stat: inode attributes + * + * Caller must fill in @stat before calling, typically by invoking + * vfs_getattr() with STATX_MODE, STATX_CTIME, and STATX_CHANGE_COOKIE. + * Returns an unsigned 64-bit changeid4 value (RFC 8881 Section 3.2). + * + * We could use i_version alone as the change attribute. However, i_version + * can go backwards on a regular file after an unclean shutdown. On its own + * that doesn't necessarily cause a problem, but if i_version goes backwards + * and then is incremented again it could reuse a value that was previously + * used before boot, and a client who queried the two values might incorrectly + * assume nothing changed. + * + * By using both ctime and the i_version counter we guarantee that as long as + * time doesn't go backwards we never reuse an old value. If the filesystem + * advertises STATX_ATTR_CHANGE_MONOTONIC, then this mitigation is not + * needed. + * + * We only need to do this for regular files as well. For directories, we + * assume that the new change attr is always logged to stable storage in some + * fashion before the results can be seen. + */ +u64 nfsd4_change_attribute(const struct kstat *stat) +{ + u64 chattr; + + if (stat->result_mask & STATX_CHANGE_COOKIE) { + chattr = stat->change_cookie; + if (S_ISREG(stat->mode) && + !(stat->attributes & STATX_ATTR_CHANGE_MONOTONIC)) { + chattr += (u64)stat->ctime.tv_sec << 30; + chattr += stat->ctime.tv_nsec; + } + } else { + chattr = time_to_chattr(&stat->ctime); + } + return chattr; +} diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h index 755e256a9103..5ef7191f8ad8 100644 --- a/fs/nfsd/nfsfh.h +++ b/fs/nfsd/nfsfh.h @@ -10,8 +10,59 @@ #include <linux/crc32.h> #include <linux/sunrpc/svc.h> -#include <uapi/linux/nfsd/nfsfh.h> #include <linux/iversion.h> +#include <linux/exportfs.h> +#include <linux/nfs4.h> + +#include "export.h" + +/* + * The file handle starts with a sequence of four-byte words. + * The first word contains a version number (1) and three descriptor bytes + * that tell how the remaining 3 variable length fields should be handled. + * These three bytes are auth_type, fsid_type and fileid_type. + * + * All four-byte values are in host-byte-order. + * + * The auth_type field is deprecated and must be set to 0. 
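+ *
+ * As a purely illustrative example of the resulting layout (using one of
+ * the fsid_type and fileid_type encodings described below): a handle with
+ * a single 4-byte fsid word and a 4-byte-inode-plus-4-byte-generation
+ * fileid occupies 16 bytes in total: the version/descriptor word, one
+ * fsid word, and two fileid words.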
+ * + * The fsid_type identifies how the filesystem (or export point) is + * encoded. + * Current values: + * 0 - 4 byte device id (ms-2-bytes major, ls-2-bytes minor), 4 byte inode number + * NOTE: we cannot use the kdev_t device id value, because kdev_t.h + * says we mustn't. We must break it up and reassemble. + * 1 - 4 byte user specified identifier + * 2 - 4 byte major, 4 byte minor, 4 byte inode number - DEPRECATED + * 3 - 4 byte device id, encoded for user-space, 4 byte inode number + * 4 - 4 byte inode number and 4 byte uuid + * 5 - 8 byte uuid + * 6 - 16 byte uuid + * 7 - 8 byte inode number and 16 byte uuid + * + * The fileid_type identifies how the file within the filesystem is encoded. + * The values for this field are filesystem specific, except that + * filesystems must not use the values '0' or '0xff'. See 'enum fid_type' + * in include/linux/exportfs.h for currently registered values. + */ + +struct knfsd_fh { + unsigned int fh_size; /* + * Points to the current size while + * building a new file handle. + */ + u8 fh_raw[NFS4_FHSIZE]; +}; + +#define fh_version fh_raw[0] +#define fh_auth_type fh_raw[1] +#define fh_fsid_type fh_raw[2] +#define fh_fileid_type fh_raw[3] + +static inline u32 *fh_fsid(const struct knfsd_fh *fh) +{ + return (u32 *)&fh->fh_raw[4]; +} static inline __u32 ino_t_to_u32(ino_t ino) { @@ -33,29 +84,36 @@ typedef struct svc_fh { struct dentry * fh_dentry; /* validated dentry */ struct svc_export * fh_export; /* export pointer */ - bool fh_locked; /* inode locked by us */ bool fh_want_write; /* remount protection taken */ - -#ifdef CONFIG_NFSD_V3 + bool fh_no_wcc; /* no wcc data needed */ + bool fh_no_atomic_attr; + /* + * wcc data is not atomic with + * operation + */ + bool fh_use_wgather; /* NFSv2 wgather option */ + bool fh_64bit_cookies;/* readdir cookie size */ + int fh_flags; /* FH flags */ bool fh_post_saved; /* post-op attrs saved */ bool fh_pre_saved; /* pre-op attrs saved */ - /* Pre-op attributes saved during fh_lock */ + /* Pre-op attributes saved when inode is locked */ __u64 fh_pre_size; /* size before operation */ - struct timespec fh_pre_mtime; /* mtime before oper */ - struct timespec fh_pre_ctime; /* ctime before oper */ + struct timespec64 fh_pre_mtime; /* mtime before oper */ + struct timespec64 fh_pre_ctime; /* ctime before oper */ /* * pre-op nfsv4 change attr: note must check IS_I_VERSION(inode) * to find out if it is valid.
*/ u64 fh_pre_change; - /* Post-op attributes saved in fh_unlock */ + /* Post-op attributes saved in fh_fill_post_attrs() */ struct kstat fh_post_attr; /* full attrs after operation */ u64 fh_post_change; /* nfsv4 change; see above */ -#endif /* CONFIG_NFSD_V3 */ - } svc_fh; +#define NFSD4_FH_FOREIGN (1<<0) +#define SET_FH_FLAG(c, f) ((c)->fh_flags |= (f)) +#define HAS_FH_FLAG(c, f) ((c)->fh_flags & (f)) enum nfsd_fsid { FSID_DEV = 0, @@ -73,7 +131,7 @@ enum fsid_source { FSIDSOURCE_FSID, FSIDSOURCE_UUID, }; -extern enum fsid_source fsid_source(struct svc_fh *fhp); +extern enum fsid_source fsid_source(const struct svc_fh *fhp); /* @@ -162,24 +220,27 @@ extern char * SVCFH_fmt(struct svc_fh *fhp); * Function prototypes */ __be32 fh_verify(struct svc_rqst *, struct svc_fh *, umode_t, int); +__be32 fh_verify_local(struct net *, struct svc_cred *, struct auth_domain *, + struct svc_fh *, umode_t, int); +__be32 fh_getattr(const struct svc_fh *fhp, struct kstat *stat); __be32 fh_compose(struct svc_fh *, struct svc_export *, struct dentry *, struct svc_fh *); __be32 fh_update(struct svc_fh *); void fh_put(struct svc_fh *); static __inline__ struct svc_fh * -fh_copy(struct svc_fh *dst, struct svc_fh *src) +fh_copy(struct svc_fh *dst, const struct svc_fh *src) { - WARN_ON(src->fh_dentry || src->fh_locked); - + WARN_ON(src->fh_dentry); + *dst = *src; return dst; } static inline void -fh_copy_shallow(struct knfsd_fh *dst, struct knfsd_fh *src) +fh_copy_shallow(struct knfsd_fh *dst, const struct knfsd_fh *src) { dst->fh_size = src->fh_size; - memcpy(&dst->fh_base, &src->fh_base, src->fh_size); + memcpy(&dst->fh_raw, &src->fh_raw, src->fh_size); } static __inline__ struct svc_fh * @@ -190,134 +251,89 @@ fh_init(struct svc_fh *fhp, int maxsize) return fhp; } -static inline bool fh_match(struct knfsd_fh *fh1, struct knfsd_fh *fh2) +static inline bool fh_match(const struct knfsd_fh *fh1, + const struct knfsd_fh *fh2) { if (fh1->fh_size != fh2->fh_size) return false; - if (memcmp(fh1->fh_base.fh_pad, fh2->fh_base.fh_pad, fh1->fh_size) != 0) + if (memcmp(fh1->fh_raw, fh2->fh_raw, fh1->fh_size) != 0) return false; return true; } -static inline bool fh_fsid_match(struct knfsd_fh *fh1, struct knfsd_fh *fh2) +static inline bool fh_fsid_match(const struct knfsd_fh *fh1, + const struct knfsd_fh *fh2) { + u32 *fsid1 = fh_fsid(fh1); + u32 *fsid2 = fh_fsid(fh2); + if (fh1->fh_fsid_type != fh2->fh_fsid_type) return false; - if (memcmp(fh1->fh_fsid, fh2->fh_fsid, key_len(fh1->fh_fsid_type)) != 0) + if (memcmp(fsid1, fsid2, key_len(fh1->fh_fsid_type)) != 0) return false; return true; } -#ifdef CONFIG_CRC32 /** - * knfsd_fh_hash - calculate the crc32 hash for the filehandle - * @fh - pointer to filehandle + * fh_want_write - Get write access to an export + * @fhp: File handle of file to be written * - * returns a crc32 hash for the filehandle that is compatible with - * the one displayed by "wireshark". - */ - -static inline u32 -knfsd_fh_hash(struct knfsd_fh *fh) -{ - return ~crc32_le(0xFFFFFFFF, (unsigned char *)&fh->fh_base, fh->fh_size); -} -#else -static inline u32 -knfsd_fh_hash(struct knfsd_fh *fh) -{ - return 0; -} -#endif - -#ifdef CONFIG_NFSD_V3 -/* - * The wcc data stored in current_fh should be cleared - * between compound ops. - */ -static inline void -fh_clear_wcc(struct svc_fh *fhp) -{ - fhp->fh_post_saved = false; - fhp->fh_pre_saved = false; -} - -/* - * We could use i_version alone as the change attribute. However, - * i_version can go backwards after a reboot. 
On its own that doesn't - * necessarily cause a problem, but if i_version goes backwards and then - * is incremented again it could reuse a value that was previously used - * before boot, and a client who queried the two values might - * incorrectly assume nothing changed. + * Caller must invoke fh_drop_write() when its write operation + * is complete. * - * By using both ctime and the i_version counter we guarantee that as - * long as time doesn't go backwards we never reuse an old value. + * Returns 0 if the file handle's export can be written to. Otherwise + * the export is not prepared for updates, and the returned negative + * errno value reflects the reason for the failure. */ -static inline u64 nfsd4_change_attribute(struct kstat *stat, - struct inode *inode) +static inline int fh_want_write(struct svc_fh *fhp) { - u64 chattr; - - chattr = stat->ctime.tv_sec; - chattr <<= 30; - chattr += stat->ctime.tv_nsec; - chattr += inode_query_iversion(inode); - return chattr; + int ret; + + if (fhp->fh_want_write) + return 0; + ret = mnt_want_write(fhp->fh_export->ex_path.mnt); + if (!ret) + fhp->fh_want_write = true; + return ret; } -extern void fill_pre_wcc(struct svc_fh *fhp); -extern void fill_post_wcc(struct svc_fh *fhp); -#else -#define fh_clear_wcc(ignored) -#define fill_pre_wcc(ignored) -#define fill_post_wcc(notused) -#endif /* CONFIG_NFSD_V3 */ - - -/* - * Lock a file handle/inode - * NOTE: both fh_lock and fh_unlock are done "by hand" in - * vfs.c:nfsd_rename as it needs to grab 2 i_mutex's at once - * so, any changes here should be reflected there. +/** + * fh_drop_write - Release write access on an export + * @fhp: File handle of file on which fh_want_write() was previously called */ - -static inline void -fh_lock_nested(struct svc_fh *fhp, unsigned int subclass) +static inline void fh_drop_write(struct svc_fh *fhp) { - struct dentry *dentry = fhp->fh_dentry; - struct inode *inode; - - BUG_ON(!dentry); - - if (fhp->fh_locked) { - printk(KERN_WARNING "fh_lock: %pd2 already locked!\n", - dentry); - return; + if (fhp->fh_want_write) { + fhp->fh_want_write = false; + mnt_drop_write(fhp->fh_export->ex_path.mnt); } - - inode = d_inode(dentry); - inode_lock_nested(inode, subclass); - fill_pre_wcc(fhp); - fhp->fh_locked = true; } -static inline void -fh_lock(struct svc_fh *fhp) +/** + * knfsd_fh_hash - calculate the crc32 hash for the filehandle + * @fh - pointer to filehandle + * + * returns a crc32 hash for the filehandle that is compatible with + * the one displayed by "wireshark". 
+ */ +static inline u32 knfsd_fh_hash(const struct knfsd_fh *fh) { - fh_lock_nested(fhp, I_MUTEX_NORMAL); + return ~crc32_le(0xFFFFFFFF, fh->fh_raw, fh->fh_size); } -/* - * Unlock a file handle/inode +/** + * fh_clear_pre_post_attrs - Reset pre/post attributes + * @fhp: file handle to be updated + * */ -static inline void -fh_unlock(struct svc_fh *fhp) +static inline void fh_clear_pre_post_attrs(struct svc_fh *fhp) { - if (fhp->fh_locked) { - fill_post_wcc(fhp); - inode_unlock(d_inode(fhp->fh_dentry)); - fhp->fh_locked = false; - } + fhp->fh_post_saved = false; + fhp->fh_pre_saved = false; } +u64 nfsd4_change_attribute(const struct kstat *stat); +__be32 __must_check fh_fill_pre_attrs(struct svc_fh *fhp); +__be32 fh_fill_post_attrs(struct svc_fh *fhp); +__be32 __must_check fh_fill_both_attrs(struct svc_fh *fhp); #endif /* _LINUX_NFSD_NFSFH_H */ diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c index 0d20fd161225..481e789a7697 100644 --- a/fs/nfsd/nfsproc.c +++ b/fs/nfsd/nfsproc.c @@ -10,31 +10,41 @@ #include "cache.h" #include "xdr.h" #include "vfs.h" - -typedef struct svc_rqst svc_rqst; -typedef struct svc_buf svc_buf; +#include "trace.h" #define NFSDDBG_FACILITY NFSDDBG_PROC - -static __be32 -nfsd_proc_null(struct svc_rqst *rqstp) +static __be32 nfsd_map_status(__be32 status) { - return nfs_ok; + switch (status) { + case nfs_ok: + break; + case nfserr_nofilehandle: + case nfserr_badhandle: + status = nfserr_stale; + break; + case nfserr_wrongsec: + case nfserr_xdev: + case nfserr_file_open: + status = nfserr_acces; + break; + case nfserr_symlink_not_dir: + status = nfserr_notdir; + break; + case nfserr_symlink: + case nfserr_wrong_type: + status = nfserr_inval; + break; + } + return status; } static __be32 -nfsd_return_attrs(__be32 err, struct nfsd_attrstat *resp) -{ - if (err) return err; - return fh_getattr(&resp->fh, &resp->stat); -} -static __be32 -nfsd_return_dirop(__be32 err, struct nfsd_diropres *resp) +nfsd_proc_null(struct svc_rqst *rqstp) { - if (err) return err; - return fh_getattr(&resp->fh, &resp->stat); + return rpc_success; } + /* * Get a file's attributes * N.B. After this call resp->fh needs an fh_put @@ -44,13 +54,18 @@ nfsd_proc_getattr(struct svc_rqst *rqstp) { struct nfsd_fhandle *argp = rqstp->rq_argp; struct nfsd_attrstat *resp = rqstp->rq_resp; - __be32 nfserr; - dprintk("nfsd: GETATTR %s\n", SVCFH_fmt(&argp->fh)); + + trace_nfsd_vfs_getattr(rqstp, &argp->fh); fh_copy(&resp->fh, &argp->fh); - nfserr = fh_verify(rqstp, &resp->fh, 0, - NFSD_MAY_NOP | NFSD_MAY_BYPASS_GSS_ON_ROOT); - return nfsd_return_attrs(nfserr, resp); + resp->status = fh_verify(rqstp, &resp->fh, 0, + NFSD_MAY_NOP | NFSD_MAY_BYPASS_GSS_ON_ROOT); + if (resp->status != nfs_ok) + goto out; + resp->status = fh_getattr(&resp->fh, &resp->stat); +out: + resp->status = nfsd_map_status(resp->status); + return rpc_success; } /* @@ -63,8 +78,10 @@ nfsd_proc_setattr(struct svc_rqst *rqstp) struct nfsd_sattrargs *argp = rqstp->rq_argp; struct nfsd_attrstat *resp = rqstp->rq_resp; struct iattr *iap = &argp->attrs; + struct nfsd_attrs attrs = { + .na_iattr = iap, + }; struct svc_fh *fhp; - __be32 nfserr; dprintk("nfsd: SETATTR %s, valid=%x, size=%ld\n", SVCFH_fmt(&argp->fh), @@ -94,16 +111,16 @@ nfsd_proc_setattr(struct svc_rqst *rqstp) * Solaris, at least, doesn't seem to care what the time * request is. We require it be within 30 minutes of now. 
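	 * (Worked example, illustrative only: a client "touch" typically
	 * sends a SETATTR whose atime/mtime lie within a few minutes of the
	 * server's clock; that falls inside the window, so the
	 * ATTR_[AM]TIME_SET bits are cleared below and notify_change()
	 * stamps the current server time rather than rejecting the explicit
	 * time update.)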
*/ - time_t delta = iap->ia_atime.tv_sec - get_seconds(); + time64_t delta = iap->ia_atime.tv_sec - ktime_get_real_seconds(); - nfserr = fh_verify(rqstp, fhp, 0, NFSD_MAY_NOP); - if (nfserr) - goto done; + resp->status = fh_verify(rqstp, fhp, 0, NFSD_MAY_NOP); + if (resp->status != nfs_ok) + goto out; if (delta < 0) delta = -delta; if (delta < MAX_TOUCH_TIME_ERROR && - setattr_prepare(fhp->fh_dentry, iap) != 0) { + setattr_prepare(&nop_mnt_idmap, fhp->fh_dentry, iap) != 0) { /* * Turn off ATTR_[AM]TIME_SET but leave ATTR_[AM]TIME. * This will cause notify_change to set these times @@ -113,9 +130,21 @@ nfsd_proc_setattr(struct svc_rqst *rqstp) } } - nfserr = nfsd_setattr(rqstp, fhp, iap, 0, (time_t)0); -done: - return nfsd_return_attrs(nfserr, resp); + resp->status = nfsd_setattr(rqstp, fhp, &attrs, NULL); + if (resp->status != nfs_ok) + goto out; + + resp->status = fh_getattr(&resp->fh, &resp->stat); +out: + resp->status = nfsd_map_status(resp->status); + return rpc_success; +} + +/* Obsolete, replaced by MNTPROC_MNT. */ +static __be32 +nfsd_proc_root(struct svc_rqst *rqstp) +{ + return rpc_success; } /* @@ -129,17 +158,21 @@ nfsd_proc_lookup(struct svc_rqst *rqstp) { struct nfsd_diropargs *argp = rqstp->rq_argp; struct nfsd_diropres *resp = rqstp->rq_resp; - __be32 nfserr; dprintk("nfsd: LOOKUP %s %.*s\n", SVCFH_fmt(&argp->fh), argp->len, argp->name); fh_init(&resp->fh, NFS_FHSIZE); - nfserr = nfsd_lookup(rqstp, &argp->fh, argp->name, argp->len, - &resp->fh); - + resp->status = nfsd_lookup(rqstp, &argp->fh, argp->name, argp->len, + &resp->fh); fh_put(&argp->fh); - return nfsd_return_dirop(nfserr, resp); + if (resp->status != nfs_ok) + goto out; + + resp->status = fh_getattr(&resp->fh, &resp->stat); +out: + resp->status = nfsd_map_status(resp->status); + return rpc_success; } /* @@ -148,18 +181,20 @@ nfsd_proc_lookup(struct svc_rqst *rqstp) static __be32 nfsd_proc_readlink(struct svc_rqst *rqstp) { - struct nfsd_readlinkargs *argp = rqstp->rq_argp; + struct nfsd_fhandle *argp = rqstp->rq_argp; struct nfsd_readlinkres *resp = rqstp->rq_resp; - __be32 nfserr; dprintk("nfsd: READLINK %s\n", SVCFH_fmt(&argp->fh)); /* Read the symlink. */ resp->len = NFS_MAXPATHLEN; - nfserr = nfsd_readlink(rqstp, &argp->fh, argp->buffer, &resp->len); + resp->page = *(rqstp->rq_next_page++); + resp->status = nfsd_readlink(rqstp, &argp->fh, + page_address(resp->page), &resp->len); fh_put(&argp->fh); - return nfserr; + resp->status = nfsd_map_status(resp->status); + return rpc_success; } /* @@ -171,34 +206,39 @@ nfsd_proc_read(struct svc_rqst *rqstp) { struct nfsd_readargs *argp = rqstp->rq_argp; struct nfsd_readres *resp = rqstp->rq_resp; - __be32 nfserr; + u32 eof; dprintk("nfsd: READ %s %d bytes at %d\n", SVCFH_fmt(&argp->fh), argp->count, argp->offset); + argp->count = min_t(u32, argp->count, NFS_MAXDATA); + argp->count = min_t(u32, argp->count, rqstp->rq_res.buflen); + + resp->pages = rqstp->rq_next_page; + /* Obtain buffer pointer for payload. 19 is 1 word for * status, 17 words for fattr, and 1 word for the byte count. 
*/ - - if (NFSSVC_MAXBLKSIZE_V2 < argp->count) { - char buf[RPC_MAX_ADDRBUFLEN]; - printk(KERN_NOTICE - "oversized read request from %s (%d bytes)\n", - svc_print_addr(rqstp, buf, sizeof(buf)), - argp->count); - argp->count = NFSSVC_MAXBLKSIZE_V2; - } svc_reserve_auth(rqstp, (19<<2) + argp->count + 4); resp->count = argp->count; - nfserr = nfsd_read(rqstp, fh_copy(&resp->fh, &argp->fh), - argp->offset, - rqstp->rq_vec, argp->vlen, - &resp->count); + fh_copy(&resp->fh, &argp->fh); + resp->status = nfsd_read(rqstp, &resp->fh, argp->offset, + &resp->count, &eof); + if (resp->status == nfs_ok) + resp->status = fh_getattr(&resp->fh, &resp->stat); + else if (resp->status == nfserr_jukebox) + set_bit(RQ_DROPME, &rqstp->rq_flags); + resp->status = nfsd_map_status(resp->status); + return rpc_success; +} - if (nfserr) return nfserr; - return fh_getattr(&resp->fh, &resp->stat); +/* Reserved */ +static __be32 +nfsd_proc_writecache(struct svc_rqst *rqstp) +{ + return rpc_success; } /* @@ -210,22 +250,21 @@ nfsd_proc_write(struct svc_rqst *rqstp) { struct nfsd_writeargs *argp = rqstp->rq_argp; struct nfsd_attrstat *resp = rqstp->rq_resp; - __be32 nfserr; unsigned long cnt = argp->len; - unsigned int nvecs; - dprintk("nfsd: WRITE %s %d bytes at %d\n", + dprintk("nfsd: WRITE %s %u bytes at %d\n", SVCFH_fmt(&argp->fh), argp->len, argp->offset); - nvecs = svc_fill_write_vector(rqstp, rqstp->rq_arg.pages, - &argp->first, cnt); - if (!nvecs) - return nfserr_io; - nfserr = nfsd_write(rqstp, fh_copy(&resp->fh, &argp->fh), - argp->offset, rqstp->rq_vec, nvecs, - &cnt, NFS_DATA_SYNC); - return nfsd_return_attrs(nfserr, resp); + fh_copy(&resp->fh, &argp->fh); + resp->status = nfsd_write(rqstp, &resp->fh, argp->offset, + &argp->payload, &cnt, NFS_DATA_SYNC, NULL); + if (resp->status == nfs_ok) + resp->status = fh_getattr(&resp->fh, &resp->stat); + else if (resp->status == nfserr_jukebox) + set_bit(RQ_DROPME, &rqstp->rq_flags); + resp->status = nfsd_map_status(resp->status); + return rpc_success; } /* @@ -242,53 +281,51 @@ nfsd_proc_create(struct svc_rqst *rqstp) svc_fh *dirfhp = &argp->fh; svc_fh *newfhp = &resp->fh; struct iattr *attr = &argp->attrs; + struct nfsd_attrs attrs = { + .na_iattr = attr, + }; struct inode *inode; struct dentry *dchild; int type, mode; - __be32 nfserr; int hosterr; dev_t rdev = 0, wanted = new_decode_dev(attr->ia_size); - dprintk("nfsd: CREATE %s %.*s\n", - SVCFH_fmt(dirfhp), argp->len, argp->name); - /* First verify the parent file handle */ - nfserr = fh_verify(rqstp, dirfhp, S_IFDIR, NFSD_MAY_EXEC); - if (nfserr) + resp->status = fh_verify(rqstp, dirfhp, S_IFDIR, NFSD_MAY_EXEC); + if (resp->status != nfs_ok) goto done; /* must fh_put dirfhp even on error */ /* Check for NFSD_MAY_WRITE in nfsd_create if necessary */ - nfserr = nfserr_exist; + resp->status = nfserr_exist; if (isdotent(argp->name, argp->len)) goto done; hosterr = fh_want_write(dirfhp); if (hosterr) { - nfserr = nfserrno(hosterr); + resp->status = nfserrno(hosterr); goto done; } - fh_lock_nested(dirfhp, I_MUTEX_PARENT); - dchild = lookup_one_len(argp->name, dirfhp->fh_dentry, argp->len); + dchild = start_creating(&nop_mnt_idmap, dirfhp->fh_dentry, + &QSTR_LEN(argp->name, argp->len)); if (IS_ERR(dchild)) { - nfserr = nfserrno(PTR_ERR(dchild)); - goto out_unlock; + resp->status = nfserrno(PTR_ERR(dchild)); + goto out_write; } fh_init(newfhp, NFS_FHSIZE); - nfserr = fh_compose(newfhp, dirfhp->fh_export, dchild, dirfhp); - if (!nfserr && d_really_is_negative(dchild)) - nfserr = nfserr_noent; - dput(dchild); - if (nfserr) { - 
if (nfserr != nfserr_noent) + resp->status = fh_compose(newfhp, dirfhp->fh_export, dchild, dirfhp); + if (!resp->status && d_really_is_negative(dchild)) + resp->status = nfserr_noent; + if (resp->status) { + if (resp->status != nfserr_noent) goto out_unlock; /* * If the new file handle wasn't verified, we can't tell * whether the file exists or not. Time to bail ... */ - nfserr = nfserr_acces; + resp->status = nfserr_acces; if (!newfhp->fh_dentry) { - printk(KERN_WARNING + printk(KERN_WARNING "nfsd_proc_create: file handle not verified\n"); goto out_unlock; } @@ -312,18 +349,19 @@ nfsd_proc_create(struct svc_rqst *rqstp) rdev = inode->i_rdev; attr->ia_valid |= ATTR_SIZE; - /* FALLTHROUGH */ + fallthrough; case S_IFIFO: /* this is probably a permission check.. * at least IRIX implements perm checking on * echo thing > device-special-file-or-pipe * by doing a CREATE with type==0 */ - nfserr = nfsd_permission(rqstp, - newfhp->fh_export, - newfhp->fh_dentry, - NFSD_MAY_WRITE|NFSD_MAY_LOCAL_ACCESS); - if (nfserr && nfserr != nfserr_rofs) + resp->status = nfsd_permission( + &rqstp->rq_cred, + newfhp->fh_export, + newfhp->fh_dentry, + NFSD_MAY_WRITE|NFSD_MAY_LOCAL_ACCESS); + if (resp->status && resp->status != nfserr_rofs) goto out_unlock; } } else @@ -359,16 +397,19 @@ nfsd_proc_create(struct svc_rqst *rqstp) attr->ia_valid &= ~ATTR_SIZE; /* Make sure the type and device matches */ - nfserr = nfserr_exist; - if (inode && type != (inode->i_mode & S_IFMT)) + resp->status = nfserr_exist; + if (inode && inode_wrong_type(inode, type)) goto out_unlock; } - nfserr = 0; + resp->status = nfs_ok; if (!inode) { /* File doesn't exist. Create it and set attrs */ - nfserr = nfsd_create_locked(rqstp, dirfhp, argp->name, - argp->len, attr, type, rdev, newfhp); + resp->status = nfsd_create_locked(rqstp, dirfhp, &attrs, type, + rdev, newfhp); + /* nfsd_create_locked() unlocked the parent */ + dput(dchild); + goto out_write; } else if (type == S_IFREG) { dprintk("nfsd: existing %s, valid=%x, size=%ld\n", argp->name, attr->ia_valid, (long) attr->ia_size); @@ -378,99 +419,99 @@ nfsd_proc_create(struct svc_rqst *rqstp) */ attr->ia_valid &= ATTR_SIZE; if (attr->ia_valid) - nfserr = nfsd_setattr(rqstp, newfhp, attr, 0, (time_t)0); + resp->status = nfsd_setattr(rqstp, newfhp, &attrs, + NULL); } out_unlock: - /* We don't really need to unlock, as fh_put does it. */ - fh_unlock(dirfhp); + end_creating(dchild); +out_write: fh_drop_write(dirfhp); done: fh_put(dirfhp); - return nfsd_return_dirop(nfserr, resp); + if (resp->status != nfs_ok) + goto out; + resp->status = fh_getattr(&resp->fh, &resp->stat); +out: + resp->status = nfsd_map_status(resp->status); + return rpc_success; } static __be32 nfsd_proc_remove(struct svc_rqst *rqstp) { struct nfsd_diropargs *argp = rqstp->rq_argp; - __be32 nfserr; - - dprintk("nfsd: REMOVE %s %.*s\n", SVCFH_fmt(&argp->fh), - argp->len, argp->name); + struct nfsd_stat *resp = rqstp->rq_resp; /* Unlink. 
-SIFDIR means file must not be a directory */ - nfserr = nfsd_unlink(rqstp, &argp->fh, -S_IFDIR, argp->name, argp->len); + resp->status = nfsd_unlink(rqstp, &argp->fh, -S_IFDIR, + argp->name, argp->len); fh_put(&argp->fh); - return nfserr; + resp->status = nfsd_map_status(resp->status); + return rpc_success; } static __be32 nfsd_proc_rename(struct svc_rqst *rqstp) { struct nfsd_renameargs *argp = rqstp->rq_argp; - __be32 nfserr; + struct nfsd_stat *resp = rqstp->rq_resp; - dprintk("nfsd: RENAME %s %.*s -> \n", - SVCFH_fmt(&argp->ffh), argp->flen, argp->fname); - dprintk("nfsd: -> %s %.*s\n", - SVCFH_fmt(&argp->tfh), argp->tlen, argp->tname); - - nfserr = nfsd_rename(rqstp, &argp->ffh, argp->fname, argp->flen, - &argp->tfh, argp->tname, argp->tlen); + resp->status = nfsd_rename(rqstp, &argp->ffh, argp->fname, argp->flen, + &argp->tfh, argp->tname, argp->tlen); fh_put(&argp->ffh); fh_put(&argp->tfh); - return nfserr; + resp->status = nfsd_map_status(resp->status); + return rpc_success; } static __be32 nfsd_proc_link(struct svc_rqst *rqstp) { struct nfsd_linkargs *argp = rqstp->rq_argp; - __be32 nfserr; - - dprintk("nfsd: LINK %s ->\n", - SVCFH_fmt(&argp->ffh)); - dprintk("nfsd: %s %.*s\n", - SVCFH_fmt(&argp->tfh), - argp->tlen, - argp->tname); + struct nfsd_stat *resp = rqstp->rq_resp; - nfserr = nfsd_link(rqstp, &argp->tfh, argp->tname, argp->tlen, - &argp->ffh); + resp->status = nfsd_link(rqstp, &argp->tfh, argp->tname, argp->tlen, + &argp->ffh); fh_put(&argp->ffh); fh_put(&argp->tfh); - return nfserr; + resp->status = nfsd_map_status(resp->status); + return rpc_success; } static __be32 nfsd_proc_symlink(struct svc_rqst *rqstp) { struct nfsd_symlinkargs *argp = rqstp->rq_argp; + struct nfsd_stat *resp = rqstp->rq_resp; + struct nfsd_attrs attrs = { + .na_iattr = &argp->attrs, + }; struct svc_fh newfh; - __be32 nfserr; - if (argp->tlen > NFS_MAXPATHLEN) - return nfserr_nametoolong; + if (argp->tlen > NFS_MAXPATHLEN) { + resp->status = nfserr_nametoolong; + goto out; + } argp->tname = svc_fill_symlink_pathname(rqstp, &argp->first, page_address(rqstp->rq_arg.pages[0]), argp->tlen); - if (IS_ERR(argp->tname)) - return nfserrno(PTR_ERR(argp->tname)); - - dprintk("nfsd: SYMLINK %s %.*s -> %.*s\n", - SVCFH_fmt(&argp->ffh), argp->flen, argp->fname, - argp->tlen, argp->tname); + if (IS_ERR(argp->tname)) { + resp->status = nfserrno(PTR_ERR(argp->tname)); + goto out; + } fh_init(&newfh, NFS_FHSIZE); - nfserr = nfsd_symlink(rqstp, &argp->ffh, argp->fname, argp->flen, - argp->tname, &newfh); + resp->status = nfsd_symlink(rqstp, &argp->ffh, argp->fname, argp->flen, + argp->tname, &attrs, &newfh); kfree(argp->tname); fh_put(&argp->ffh); fh_put(&newfh); - return nfserr; +out: + resp->status = nfsd_map_status(resp->status); + return rpc_success; } /* @@ -482,9 +523,9 @@ nfsd_proc_mkdir(struct svc_rqst *rqstp) { struct nfsd_createargs *argp = rqstp->rq_argp; struct nfsd_diropres *resp = rqstp->rq_resp; - __be32 nfserr; - - dprintk("nfsd: MKDIR %s %.*s\n", SVCFH_fmt(&argp->fh), argp->len, argp->name); + struct nfsd_attrs attrs = { + .na_iattr = &argp->attrs, + }; if (resp->fh.fh_dentry) { printk(KERN_WARNING @@ -493,10 +534,16 @@ nfsd_proc_mkdir(struct svc_rqst *rqstp) argp->attrs.ia_valid &= ~ATTR_SIZE; fh_init(&resp->fh, NFS_FHSIZE); - nfserr = nfsd_create(rqstp, &argp->fh, argp->name, argp->len, - &argp->attrs, S_IFDIR, 0, &resp->fh); + resp->status = nfsd_create(rqstp, &argp->fh, argp->name, argp->len, + &attrs, S_IFDIR, 0, &resp->fh); fh_put(&argp->fh); - return nfsd_return_dirop(nfserr, resp); + if 
(resp->status != nfs_ok) + goto out; + + resp->status = fh_getattr(&resp->fh, &resp->stat); +out: + resp->status = nfsd_map_status(resp->status); + return rpc_success; } /* @@ -506,13 +553,31 @@ static __be32 nfsd_proc_rmdir(struct svc_rqst *rqstp) { struct nfsd_diropargs *argp = rqstp->rq_argp; - __be32 nfserr; + struct nfsd_stat *resp = rqstp->rq_resp; - dprintk("nfsd: RMDIR %s %.*s\n", SVCFH_fmt(&argp->fh), argp->len, argp->name); - - nfserr = nfsd_unlink(rqstp, &argp->fh, S_IFDIR, argp->name, argp->len); + resp->status = nfsd_unlink(rqstp, &argp->fh, S_IFDIR, + argp->name, argp->len); fh_put(&argp->fh); - return nfserr; + resp->status = nfsd_map_status(resp->status); + return rpc_success; +} + +static void nfsd_init_dirlist_pages(struct svc_rqst *rqstp, + struct nfsd_readdirres *resp, + u32 count) +{ + struct xdr_buf *buf = &resp->dirlist; + struct xdr_stream *xdr = &resp->xdr; + + memset(buf, 0, sizeof(*buf)); + + /* Reserve room for the NULL ptr & eof flag (-2 words) */ + buf->buflen = clamp(count, (u32)(XDR_UNIT * 2), (u32)PAGE_SIZE); + buf->buflen -= XDR_UNIT * 2; + buf->pages = rqstp->rq_next_page; + rqstp->rq_next_page++; + + xdr_init_encode_pages(xdr, buf); } /* @@ -523,37 +588,22 @@ nfsd_proc_readdir(struct svc_rqst *rqstp) { struct nfsd_readdirargs *argp = rqstp->rq_argp; struct nfsd_readdirres *resp = rqstp->rq_resp; - int count; - __be32 nfserr; loff_t offset; - dprintk("nfsd: READDIR %s %d bytes at %d\n", - SVCFH_fmt(&argp->fh), - argp->count, argp->cookie); - - /* Shrink to the client read size */ - count = (argp->count >> 2) - 2; + trace_nfsd_vfs_readdir(rqstp, &argp->fh, argp->count, argp->cookie); - /* Make sure we've room for the NULL ptr & eof flag */ - count -= 2; - if (count < 0) - count = 0; + nfsd_init_dirlist_pages(rqstp, resp, argp->count); - resp->buffer = argp->buffer; - resp->offset = NULL; - resp->buflen = count; resp->common.err = nfs_ok; - /* Read directory and encode entries on the fly */ + resp->cookie_offset = 0; offset = argp->cookie; - nfserr = nfsd_readdir(rqstp, &argp->fh, &offset, - &resp->common, nfssvc_encode_entry); - - resp->count = resp->buffer - argp->buffer; - if (resp->offset) - *resp->offset = htonl(offset); + resp->status = nfsd_readdir(rqstp, &argp->fh, &offset, + &resp->common, nfssvc_encode_entry); + nfssvc_encode_nfscookie(resp, offset); fh_put(&argp->fh); - return nfserr; + resp->status = nfsd_map_status(resp->status); + return rpc_success; } /* @@ -564,21 +614,18 @@ nfsd_proc_statfs(struct svc_rqst *rqstp) { struct nfsd_fhandle *argp = rqstp->rq_argp; struct nfsd_statfsres *resp = rqstp->rq_resp; - __be32 nfserr; - - dprintk("nfsd: STATFS %s\n", SVCFH_fmt(&argp->fh)); - nfserr = nfsd_statfs(rqstp, &argp->fh, &resp->stats, - NFSD_MAY_BYPASS_GSS_ON_ROOT); + resp->status = nfsd_statfs(rqstp, &argp->fh, &resp->stats, + NFSD_MAY_BYPASS_GSS_ON_ROOT); fh_put(&argp->fh); - return nfserr; + resp->status = nfsd_map_status(resp->status); + return rpc_success; } /* * NFSv2 Server procedures. * Only the results of non-idempotent operations are cached. 
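 * For example, in the procedure table below the reply buffers of SETATTR,
 * WRITE, CREATE and MKDIR are cached (RC_REPLBUFF), the reply status of
 * REMOVE, RENAME, LINK, SYMLINK and RMDIR is cached (RC_REPLSTAT), and
 * idempotent procedures such as GETATTR, LOOKUP, READ, READDIR and STATFS
 * are not cached (RC_NOCACHE).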
*/ -struct nfsd_void { int dummy; }; #define ST 1 /* status */ #define FH 8 /* filehandle */ @@ -587,237 +634,217 @@ struct nfsd_void { int dummy; }; static const struct svc_procedure nfsd_procedures2[18] = { [NFSPROC_NULL] = { .pc_func = nfsd_proc_null, - .pc_decode = nfssvc_decode_void, - .pc_encode = nfssvc_encode_void, - .pc_argsize = sizeof(struct nfsd_void), - .pc_ressize = sizeof(struct nfsd_void), + .pc_decode = nfssvc_decode_voidarg, + .pc_encode = nfssvc_encode_voidres, + .pc_argsize = sizeof(struct nfsd_voidargs), + .pc_argzero = sizeof(struct nfsd_voidargs), + .pc_ressize = sizeof(struct nfsd_voidres), .pc_cachetype = RC_NOCACHE, - .pc_xdrressize = ST, + .pc_xdrressize = 0, + .pc_name = "NULL", }, [NFSPROC_GETATTR] = { .pc_func = nfsd_proc_getattr, - .pc_decode = nfssvc_decode_fhandle, - .pc_encode = nfssvc_encode_attrstat, - .pc_release = nfssvc_release_fhandle, + .pc_decode = nfssvc_decode_fhandleargs, + .pc_encode = nfssvc_encode_attrstatres, + .pc_release = nfssvc_release_attrstat, .pc_argsize = sizeof(struct nfsd_fhandle), + .pc_argzero = sizeof(struct nfsd_fhandle), .pc_ressize = sizeof(struct nfsd_attrstat), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = ST+AT, + .pc_name = "GETATTR", }, [NFSPROC_SETATTR] = { .pc_func = nfsd_proc_setattr, .pc_decode = nfssvc_decode_sattrargs, - .pc_encode = nfssvc_encode_attrstat, - .pc_release = nfssvc_release_fhandle, + .pc_encode = nfssvc_encode_attrstatres, + .pc_release = nfssvc_release_attrstat, .pc_argsize = sizeof(struct nfsd_sattrargs), + .pc_argzero = sizeof(struct nfsd_sattrargs), .pc_ressize = sizeof(struct nfsd_attrstat), .pc_cachetype = RC_REPLBUFF, .pc_xdrressize = ST+AT, + .pc_name = "SETATTR", }, [NFSPROC_ROOT] = { - .pc_decode = nfssvc_decode_void, - .pc_encode = nfssvc_encode_void, - .pc_argsize = sizeof(struct nfsd_void), - .pc_ressize = sizeof(struct nfsd_void), + .pc_func = nfsd_proc_root, + .pc_decode = nfssvc_decode_voidarg, + .pc_encode = nfssvc_encode_voidres, + .pc_argsize = sizeof(struct nfsd_voidargs), + .pc_argzero = sizeof(struct nfsd_voidargs), + .pc_ressize = sizeof(struct nfsd_voidres), .pc_cachetype = RC_NOCACHE, - .pc_xdrressize = ST, + .pc_xdrressize = 0, + .pc_name = "ROOT", }, [NFSPROC_LOOKUP] = { .pc_func = nfsd_proc_lookup, .pc_decode = nfssvc_decode_diropargs, .pc_encode = nfssvc_encode_diropres, - .pc_release = nfssvc_release_fhandle, + .pc_release = nfssvc_release_diropres, .pc_argsize = sizeof(struct nfsd_diropargs), + .pc_argzero = sizeof(struct nfsd_diropargs), .pc_ressize = sizeof(struct nfsd_diropres), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = ST+FH+AT, + .pc_name = "LOOKUP", }, [NFSPROC_READLINK] = { .pc_func = nfsd_proc_readlink, - .pc_decode = nfssvc_decode_readlinkargs, + .pc_decode = nfssvc_decode_fhandleargs, .pc_encode = nfssvc_encode_readlinkres, - .pc_argsize = sizeof(struct nfsd_readlinkargs), + .pc_argsize = sizeof(struct nfsd_fhandle), + .pc_argzero = sizeof(struct nfsd_fhandle), .pc_ressize = sizeof(struct nfsd_readlinkres), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = ST+1+NFS_MAXPATHLEN/4, + .pc_name = "READLINK", }, [NFSPROC_READ] = { .pc_func = nfsd_proc_read, .pc_decode = nfssvc_decode_readargs, .pc_encode = nfssvc_encode_readres, - .pc_release = nfssvc_release_fhandle, + .pc_release = nfssvc_release_readres, .pc_argsize = sizeof(struct nfsd_readargs), + .pc_argzero = sizeof(struct nfsd_readargs), .pc_ressize = sizeof(struct nfsd_readres), .pc_cachetype = RC_NOCACHE, - .pc_xdrressize = ST+AT+1+NFSSVC_MAXBLKSIZE_V2/4, + .pc_xdrressize = ST+AT+1+NFS_MAXDATA/4, + .pc_name = 
"READ", }, [NFSPROC_WRITECACHE] = { - .pc_decode = nfssvc_decode_void, - .pc_encode = nfssvc_encode_void, - .pc_argsize = sizeof(struct nfsd_void), - .pc_ressize = sizeof(struct nfsd_void), + .pc_func = nfsd_proc_writecache, + .pc_decode = nfssvc_decode_voidarg, + .pc_encode = nfssvc_encode_voidres, + .pc_argsize = sizeof(struct nfsd_voidargs), + .pc_argzero = sizeof(struct nfsd_voidargs), + .pc_ressize = sizeof(struct nfsd_voidres), .pc_cachetype = RC_NOCACHE, - .pc_xdrressize = ST, + .pc_xdrressize = 0, + .pc_name = "WRITECACHE", }, [NFSPROC_WRITE] = { .pc_func = nfsd_proc_write, .pc_decode = nfssvc_decode_writeargs, - .pc_encode = nfssvc_encode_attrstat, - .pc_release = nfssvc_release_fhandle, + .pc_encode = nfssvc_encode_attrstatres, + .pc_release = nfssvc_release_attrstat, .pc_argsize = sizeof(struct nfsd_writeargs), + .pc_argzero = sizeof(struct nfsd_writeargs), .pc_ressize = sizeof(struct nfsd_attrstat), .pc_cachetype = RC_REPLBUFF, .pc_xdrressize = ST+AT, + .pc_name = "WRITE", }, [NFSPROC_CREATE] = { .pc_func = nfsd_proc_create, .pc_decode = nfssvc_decode_createargs, .pc_encode = nfssvc_encode_diropres, - .pc_release = nfssvc_release_fhandle, + .pc_release = nfssvc_release_diropres, .pc_argsize = sizeof(struct nfsd_createargs), + .pc_argzero = sizeof(struct nfsd_createargs), .pc_ressize = sizeof(struct nfsd_diropres), .pc_cachetype = RC_REPLBUFF, .pc_xdrressize = ST+FH+AT, + .pc_name = "CREATE", }, [NFSPROC_REMOVE] = { .pc_func = nfsd_proc_remove, .pc_decode = nfssvc_decode_diropargs, - .pc_encode = nfssvc_encode_void, + .pc_encode = nfssvc_encode_statres, .pc_argsize = sizeof(struct nfsd_diropargs), - .pc_ressize = sizeof(struct nfsd_void), + .pc_argzero = sizeof(struct nfsd_diropargs), + .pc_ressize = sizeof(struct nfsd_stat), .pc_cachetype = RC_REPLSTAT, .pc_xdrressize = ST, + .pc_name = "REMOVE", }, [NFSPROC_RENAME] = { .pc_func = nfsd_proc_rename, .pc_decode = nfssvc_decode_renameargs, - .pc_encode = nfssvc_encode_void, + .pc_encode = nfssvc_encode_statres, .pc_argsize = sizeof(struct nfsd_renameargs), - .pc_ressize = sizeof(struct nfsd_void), + .pc_argzero = sizeof(struct nfsd_renameargs), + .pc_ressize = sizeof(struct nfsd_stat), .pc_cachetype = RC_REPLSTAT, .pc_xdrressize = ST, + .pc_name = "RENAME", }, [NFSPROC_LINK] = { .pc_func = nfsd_proc_link, .pc_decode = nfssvc_decode_linkargs, - .pc_encode = nfssvc_encode_void, + .pc_encode = nfssvc_encode_statres, .pc_argsize = sizeof(struct nfsd_linkargs), - .pc_ressize = sizeof(struct nfsd_void), + .pc_argzero = sizeof(struct nfsd_linkargs), + .pc_ressize = sizeof(struct nfsd_stat), .pc_cachetype = RC_REPLSTAT, .pc_xdrressize = ST, + .pc_name = "LINK", }, [NFSPROC_SYMLINK] = { .pc_func = nfsd_proc_symlink, .pc_decode = nfssvc_decode_symlinkargs, - .pc_encode = nfssvc_encode_void, + .pc_encode = nfssvc_encode_statres, .pc_argsize = sizeof(struct nfsd_symlinkargs), - .pc_ressize = sizeof(struct nfsd_void), + .pc_argzero = sizeof(struct nfsd_symlinkargs), + .pc_ressize = sizeof(struct nfsd_stat), .pc_cachetype = RC_REPLSTAT, .pc_xdrressize = ST, + .pc_name = "SYMLINK", }, [NFSPROC_MKDIR] = { .pc_func = nfsd_proc_mkdir, .pc_decode = nfssvc_decode_createargs, .pc_encode = nfssvc_encode_diropres, - .pc_release = nfssvc_release_fhandle, + .pc_release = nfssvc_release_diropres, .pc_argsize = sizeof(struct nfsd_createargs), + .pc_argzero = sizeof(struct nfsd_createargs), .pc_ressize = sizeof(struct nfsd_diropres), .pc_cachetype = RC_REPLBUFF, .pc_xdrressize = ST+FH+AT, + .pc_name = "MKDIR", }, [NFSPROC_RMDIR] = { .pc_func = 
nfsd_proc_rmdir, .pc_decode = nfssvc_decode_diropargs, - .pc_encode = nfssvc_encode_void, + .pc_encode = nfssvc_encode_statres, .pc_argsize = sizeof(struct nfsd_diropargs), - .pc_ressize = sizeof(struct nfsd_void), + .pc_argzero = sizeof(struct nfsd_diropargs), + .pc_ressize = sizeof(struct nfsd_stat), .pc_cachetype = RC_REPLSTAT, .pc_xdrressize = ST, + .pc_name = "RMDIR", }, [NFSPROC_READDIR] = { .pc_func = nfsd_proc_readdir, .pc_decode = nfssvc_decode_readdirargs, .pc_encode = nfssvc_encode_readdirres, .pc_argsize = sizeof(struct nfsd_readdirargs), + .pc_argzero = sizeof(struct nfsd_readdirargs), .pc_ressize = sizeof(struct nfsd_readdirres), .pc_cachetype = RC_NOCACHE, + .pc_name = "READDIR", }, [NFSPROC_STATFS] = { .pc_func = nfsd_proc_statfs, - .pc_decode = nfssvc_decode_fhandle, + .pc_decode = nfssvc_decode_fhandleargs, .pc_encode = nfssvc_encode_statfsres, .pc_argsize = sizeof(struct nfsd_fhandle), + .pc_argzero = sizeof(struct nfsd_fhandle), .pc_ressize = sizeof(struct nfsd_statfsres), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = ST+5, + .pc_name = "STATFS", }, }; - -static unsigned int nfsd_count2[ARRAY_SIZE(nfsd_procedures2)]; +static DEFINE_PER_CPU_ALIGNED(unsigned long, + nfsd_count2[ARRAY_SIZE(nfsd_procedures2)]); const struct svc_version nfsd_version2 = { .vs_vers = 2, - .vs_nproc = 18, + .vs_nproc = ARRAY_SIZE(nfsd_procedures2), .vs_proc = nfsd_procedures2, .vs_count = nfsd_count2, .vs_dispatch = nfsd_dispatch, .vs_xdrsize = NFS2_SVC_XDRSIZE, }; - -/* - * Map errnos to NFS errnos. - */ -__be32 -nfserrno (int errno) -{ - static struct { - __be32 nfserr; - int syserr; - } nfs_errtbl[] = { - { nfs_ok, 0 }, - { nfserr_perm, -EPERM }, - { nfserr_noent, -ENOENT }, - { nfserr_io, -EIO }, - { nfserr_nxio, -ENXIO }, - { nfserr_fbig, -E2BIG }, - { nfserr_acces, -EACCES }, - { nfserr_exist, -EEXIST }, - { nfserr_xdev, -EXDEV }, - { nfserr_mlink, -EMLINK }, - { nfserr_nodev, -ENODEV }, - { nfserr_notdir, -ENOTDIR }, - { nfserr_isdir, -EISDIR }, - { nfserr_inval, -EINVAL }, - { nfserr_fbig, -EFBIG }, - { nfserr_nospc, -ENOSPC }, - { nfserr_rofs, -EROFS }, - { nfserr_mlink, -EMLINK }, - { nfserr_nametoolong, -ENAMETOOLONG }, - { nfserr_notempty, -ENOTEMPTY }, -#ifdef EDQUOT - { nfserr_dquot, -EDQUOT }, -#endif - { nfserr_stale, -ESTALE }, - { nfserr_jukebox, -ETIMEDOUT }, - { nfserr_jukebox, -ERESTARTSYS }, - { nfserr_jukebox, -EAGAIN }, - { nfserr_jukebox, -EWOULDBLOCK }, - { nfserr_jukebox, -ENOMEM }, - { nfserr_io, -ETXTBSY }, - { nfserr_notsupp, -EOPNOTSUPP }, - { nfserr_toosmall, -ETOOSMALL }, - { nfserr_serverfault, -ESERVERFAULT }, - { nfserr_serverfault, -ENFILE }, - { nfserr_io, -EUCLEAN }, - { nfserr_perm, -ENOKEY }, - }; - int i; - - for (i = 0; i < ARRAY_SIZE(nfs_errtbl); i++) { - if (nfs_errtbl[i].syserr == errno) - return nfs_errtbl[i].nfserr; - } - WARN_ONCE(1, "nfsd: non-standard errno: %d\n", errno); - return nfserr_io; -} - diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 89cb484f1cfb..b08ae85d53ef 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -12,12 +12,14 @@ #include <linux/module.h> #include <linux/fs_struct.h> #include <linux/swap.h> +#include <linux/siphash.h> #include <linux/sunrpc/stats.h> #include <linux/sunrpc/svcsock.h> #include <linux/sunrpc/svc_xprt.h> #include <linux/lockd/bind.h> #include <linux/nfsacl.h> +#include <linux/nfslocalio.h> #include <linux/seq_file.h> #include <linux/inetdevice.h> #include <net/addrconf.h> @@ -27,25 +29,36 @@ #include "cache.h" #include "vfs.h" #include "netns.h" +#include "filecache.h" + +#include "trace.h" 
#define NFSDDBG_FACILITY NFSDDBG_SVC -extern struct svc_program nfsd_program; +atomic_t nfsd_th_cnt = ATOMIC_INIT(0); static int nfsd(void *vrqstp); +#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) +static int nfsd_acl_rpcbind_set(struct net *, + const struct svc_program *, + u32, int, + unsigned short, + unsigned short); +static __be32 nfsd_acl_init_request(struct svc_rqst *, + const struct svc_program *, + struct svc_process_info *); +#endif +static int nfsd_rpcbind_set(struct net *, + const struct svc_program *, + u32, int, + unsigned short, + unsigned short); +static __be32 nfsd_init_request(struct svc_rqst *, + const struct svc_program *, + struct svc_process_info *); /* - * nfsd_mutex protects nn->nfsd_serv -- both the pointer itself and the members - * of the svc_serv struct. In particular, ->sv_nrthreads but also to some - * extent ->sv_temp_socks and ->sv_permsocks. It also protects nfsdstats.th_cnt - * - * If (out side the lock) nn->nfsd_serv is non-NULL, then it must point to a - * properly initialised 'struct svc_serv' with ->sv_nrthreads > 0. That number - * of nfsd threads must exist and each must listed in ->sp_all_threads in each - * entry of ->sv_pools[]. - * - * Transitions of the thread count between zero and non-zero are of particular - * interest since the svc_serv needs to be created and initialized at that - * point, or freed. + * nfsd_mutex protects nn->nfsd_serv -- both the pointer itself and some members + * of the svc_serv struct such as ->sv_temp_socks and ->sv_permsocks. * * Finally, the nfsd_mutex also protects some of the global variables that are * accessed when nfsd starts and that are settable via the write_* routines in @@ -57,137 +70,168 @@ static int nfsd(void *vrqstp); */ DEFINE_MUTEX(nfsd_mutex); -/* - * nfsd_drc_lock protects nfsd_drc_max_pages and nfsd_drc_pages_used. - * nfsd_drc_max_pages limits the total amount of memory available for - * version 4.1 DRC caches. - * nfsd_drc_pages_used tracks the current version 4.1 DRC memory usage. 
- */ -spinlock_t nfsd_drc_lock; -unsigned long nfsd_drc_max_mem; -unsigned long nfsd_drc_mem_used; +#if IS_ENABLED(CONFIG_NFS_LOCALIO) +static const struct svc_version *localio_versions[] = { + [1] = &localio_version1, +}; + +#define NFSD_LOCALIO_NRVERS ARRAY_SIZE(localio_versions) + +#endif /* CONFIG_NFS_LOCALIO */ #if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) -static struct svc_stat nfsd_acl_svcstats; static const struct svc_version *nfsd_acl_version[] = { +# if defined(CONFIG_NFSD_V2_ACL) [2] = &nfsd_acl_version2, +# endif +# if defined(CONFIG_NFSD_V3_ACL) [3] = &nfsd_acl_version3, +# endif }; -#define NFSD_ACL_MINVERS 2 +#define NFSD_ACL_MINVERS 2 #define NFSD_ACL_NRVERS ARRAY_SIZE(nfsd_acl_version) -static const struct svc_version *nfsd_acl_versions[NFSD_ACL_NRVERS]; -static struct svc_program nfsd_acl_program = { - .pg_prog = NFS_ACL_PROGRAM, - .pg_nvers = NFSD_ACL_NRVERS, - .pg_vers = nfsd_acl_versions, - .pg_name = "nfsacl", - .pg_class = "nfsd", - .pg_stats = &nfsd_acl_svcstats, - .pg_authenticate = &svc_set_client, -}; - -static struct svc_stat nfsd_acl_svcstats = { - .program = &nfsd_acl_program, -}; #endif /* defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) */ -static const struct svc_version *nfsd_version[] = { +static const struct svc_version *nfsd_version[NFSD_MAXVERS+1] = { +#if defined(CONFIG_NFSD_V2) [2] = &nfsd_version2, -#if defined(CONFIG_NFSD_V3) - [3] = &nfsd_version3, #endif + [3] = &nfsd_version3, #if defined(CONFIG_NFSD_V4) [4] = &nfsd_version4, #endif }; -#define NFSD_MINVERS 2 -#define NFSD_NRVERS ARRAY_SIZE(nfsd_version) -static const struct svc_version *nfsd_versions[NFSD_NRVERS]; - -struct svc_program nfsd_program = { -#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) - .pg_next = &nfsd_acl_program, -#endif +struct svc_program nfsd_programs[] = { + { .pg_prog = NFS_PROGRAM, /* program number */ - .pg_nvers = NFSD_NRVERS, /* nr of entries in nfsd_version */ - .pg_vers = nfsd_versions, /* version table */ + .pg_nvers = NFSD_MAXVERS+1, /* nr of entries in nfsd_version */ + .pg_vers = nfsd_version, /* version table */ .pg_name = "nfsd", /* program name */ .pg_class = "nfsd", /* authentication class */ - .pg_stats = &nfsd_svcstats, /* version table */ - .pg_authenticate = &svc_set_client, /* export authentication */ - + .pg_authenticate = svc_set_client, /* export authentication */ + .pg_init_request = nfsd_init_request, + .pg_rpcbind_set = nfsd_rpcbind_set, + }, +#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) + { + .pg_prog = NFS_ACL_PROGRAM, + .pg_nvers = NFSD_ACL_NRVERS, + .pg_vers = nfsd_acl_version, + .pg_name = "nfsacl", + .pg_class = "nfsd", + .pg_authenticate = svc_set_client, + .pg_init_request = nfsd_acl_init_request, + .pg_rpcbind_set = nfsd_acl_rpcbind_set, + }, +#endif /* defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) */ +#if IS_ENABLED(CONFIG_NFS_LOCALIO) + { + .pg_prog = NFS_LOCALIO_PROGRAM, + .pg_nvers = NFSD_LOCALIO_NRVERS, + .pg_vers = localio_versions, + .pg_name = "nfslocalio", + .pg_class = "nfsd", + .pg_authenticate = svc_set_client, + .pg_init_request = svc_generic_init_request, + .pg_rpcbind_set = svc_generic_rpcbind_set, + } +#endif /* CONFIG_NFS_LOCALIO */ }; -static bool nfsd_supported_minorversions[NFSD_SUPPORTED_MINOR_VERSION + 1] = { - [0] = 1, - [1] = 1, - [2] = 1, -}; +bool nfsd_support_version(int vers) +{ + if (vers >= NFSD_MINVERS && vers <= NFSD_MAXVERS) + return nfsd_version[vers] != NULL; + return false; +} -int nfsd_vers(int vers, enum vers_op change) +int 
nfsd_vers(struct nfsd_net *nn, int vers, enum vers_op change) { - if (vers < NFSD_MINVERS || vers >= NFSD_NRVERS) + if (vers < NFSD_MINVERS || vers > NFSD_MAXVERS) return 0; switch(change) { case NFSD_SET: - nfsd_versions[vers] = nfsd_version[vers]; -#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) - if (vers < NFSD_ACL_NRVERS) - nfsd_acl_versions[vers] = nfsd_acl_version[vers]; -#endif + nn->nfsd_versions[vers] = nfsd_support_version(vers); break; case NFSD_CLEAR: - nfsd_versions[vers] = NULL; -#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) - if (vers < NFSD_ACL_NRVERS) - nfsd_acl_versions[vers] = NULL; -#endif + nn->nfsd_versions[vers] = false; break; case NFSD_TEST: - return nfsd_versions[vers] != NULL; + return nn->nfsd_versions[vers]; case NFSD_AVAIL: - return nfsd_version[vers] != NULL; + return nfsd_support_version(vers); } return 0; } static void -nfsd_adjust_nfsd_versions4(void) +nfsd_adjust_nfsd_versions4(struct nfsd_net *nn) { unsigned i; for (i = 0; i <= NFSD_SUPPORTED_MINOR_VERSION; i++) { - if (nfsd_supported_minorversions[i]) + if (nn->nfsd4_minorversions[i]) return; } - nfsd_vers(4, NFSD_CLEAR); + nfsd_vers(nn, 4, NFSD_CLEAR); } -int nfsd_minorversion(u32 minorversion, enum vers_op change) +int nfsd_minorversion(struct nfsd_net *nn, u32 minorversion, enum vers_op change) { if (minorversion > NFSD_SUPPORTED_MINOR_VERSION && change != NFSD_AVAIL) return -1; + switch(change) { case NFSD_SET: - nfsd_supported_minorversions[minorversion] = true; - nfsd_vers(4, NFSD_SET); + nfsd_vers(nn, 4, NFSD_SET); + nn->nfsd4_minorversions[minorversion] = + nfsd_vers(nn, 4, NFSD_TEST); break; case NFSD_CLEAR: - nfsd_supported_minorversions[minorversion] = false; - nfsd_adjust_nfsd_versions4(); + nn->nfsd4_minorversions[minorversion] = false; + nfsd_adjust_nfsd_versions4(nn); break; case NFSD_TEST: - return nfsd_supported_minorversions[minorversion]; + return nn->nfsd4_minorversions[minorversion]; case NFSD_AVAIL: - return minorversion <= NFSD_SUPPORTED_MINOR_VERSION; + return minorversion <= NFSD_SUPPORTED_MINOR_VERSION && + nfsd_vers(nn, 4, NFSD_AVAIL); } return 0; } +bool nfsd_net_try_get(struct net *net) __must_hold(rcu) +{ + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + + return (nn && percpu_ref_tryget_live(&nn->nfsd_net_ref)); +} + +void nfsd_net_put(struct net *net) __must_hold(rcu) +{ + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + + percpu_ref_put(&nn->nfsd_net_ref); +} + +static void nfsd_net_done(struct percpu_ref *ref) +{ + struct nfsd_net *nn = container_of(ref, struct nfsd_net, nfsd_net_ref); + + complete(&nn->nfsd_net_confirm_done); +} + +static void nfsd_net_free(struct percpu_ref *ref) +{ + struct nfsd_net *nn = container_of(ref, struct nfsd_net, nfsd_net_ref); + + complete(&nn->nfsd_net_free_done); +} + /* * Maximum number of nfsd processes */ @@ -205,52 +249,26 @@ int nfsd_nrthreads(struct net *net) return rv; } -static int nfsd_init_socks(struct net *net) -{ - int error; - struct nfsd_net *nn = net_generic(net, nfsd_net_id); - - if (!list_empty(&nn->nfsd_serv->sv_permsocks)) - return 0; - - error = svc_create_xprt(nn->nfsd_serv, "udp", net, PF_INET, NFS_PORT, - SVC_SOCK_DEFAULTS); - if (error < 0) - return error; - - error = svc_create_xprt(nn->nfsd_serv, "tcp", net, PF_INET, NFS_PORT, - SVC_SOCK_DEFAULTS); - if (error < 0) - return error; - - return 0; -} - static int nfsd_users = 0; -static int nfsd_startup_generic(int nrservs) +static int nfsd_startup_generic(void) { int ret; if (nfsd_users++) return 0; - /* - * Readahead 
param cache - will no-op if it already exists. - * (Note therefore results will be suboptimal if number of - * threads is modified after nfsd start.) - */ - ret = nfsd_racache_init(2*nrservs); + ret = nfsd_file_cache_init(); if (ret) goto dec_users; ret = nfs4_state_start(); if (ret) - goto out_racache; + goto out_file_cache; return 0; -out_racache: - nfsd_racache_shutdown(); +out_file_cache: + nfsd_file_cache_shutdown(); dec_users: nfsd_users--; return ret; @@ -262,19 +280,72 @@ static void nfsd_shutdown_generic(void) return; nfs4_state_shutdown(); - nfsd_racache_shutdown(); + nfsd_file_cache_shutdown(); } -static bool nfsd_needs_lockd(void) +static bool nfsd_needs_lockd(struct nfsd_net *nn) { -#if defined(CONFIG_NFSD_V3) - return (nfsd_versions[2] != NULL) || (nfsd_versions[3] != NULL); -#else - return (nfsd_versions[2] != NULL); -#endif + return nfsd_vers(nn, 2, NFSD_TEST) || nfsd_vers(nn, 3, NFSD_TEST); +} + +/** + * nfsd_copy_write_verifier - Atomically copy a write verifier + * @verf: buffer in which to receive the verifier cookie + * @nn: NFS net namespace + * + * This function provides a wait-free mechanism for copying the + * namespace's write verifier without tearing it. + */ +void nfsd_copy_write_verifier(__be32 verf[2], struct nfsd_net *nn) +{ + unsigned int seq; + + do { + seq = read_seqbegin(&nn->writeverf_lock); + memcpy(verf, nn->writeverf, sizeof(nn->writeverf)); + } while (read_seqretry(&nn->writeverf_lock, seq)); +} + +static void nfsd_reset_write_verifier_locked(struct nfsd_net *nn) +{ + struct timespec64 now; + u64 verf; + + /* + * Because the time value is hashed, y2038 time_t overflow + * is irrelevant in this usage. + */ + ktime_get_raw_ts64(&now); + verf = siphash_2u64(now.tv_sec, now.tv_nsec, &nn->siphash_key); + memcpy(nn->writeverf, &verf, sizeof(nn->writeverf)); +} + +/** + * nfsd_reset_write_verifier - Generate a new write verifier + * @nn: NFS net namespace + * + * This function updates the ->writeverf field of @nn. This field + * contains an opaque cookie that, according to Section 18.32.3 of + * RFC 8881, "the client can use to determine whether a server has + * changed instance state (e.g., server restart) between a call to + * WRITE and a subsequent call to either WRITE or COMMIT. This + * cookie MUST be unchanged during a single instance of the NFSv4.1 + * server and MUST be unique between instances of the NFSv4.1 + * server." + */ +void nfsd_reset_write_verifier(struct nfsd_net *nn) +{ + write_seqlock(&nn->writeverf_lock); + nfsd_reset_write_verifier_locked(nn); + write_sequnlock(&nn->writeverf_lock); } -static int nfsd_startup_net(int nrservs, struct net *net) +/* + * Crank up a set of per-namespace resources for a new NFSD instance, + * including lockd, a duplicate reply cache, an open file cache + * instance, and a cache of NFSv4 state objects. 
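[Editor's note] nfsd_copy_write_verifier() and nfsd_reset_write_verifier(), introduced above, pair the per-reply readers and the rare writer of the 8-byte cookie through nn->writeverf_lock, so a WRITE or COMMIT reply never carries a half-updated verifier. A minimal sketch of the intended pairing, using only the two helpers added here (the wrapper itself is hypothetical):

/*
 * Hypothetical illustration: simulate an instance change, then stamp a
 * reply buffer with the fresh verifier. The reset takes the seqlock
 * write side; the copy loops on read_seqretry() so both 32-bit words
 * always come from the same generation of the cookie.
 */
static void example_bump_and_copy_verifier(struct nfsd_net *nn, __be32 verf[2])
{
	nfsd_reset_write_verifier(nn);
	nfsd_copy_write_verifier(verf, nn);
}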
+ */ +static int nfsd_startup_net(struct net *net, const struct cred *cred) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); int ret; @@ -282,31 +353,49 @@ static int nfsd_startup_net(int nrservs, struct net *net) if (nn->nfsd_net_up) return 0; - ret = nfsd_startup_generic(nrservs); + ret = nfsd_startup_generic(); if (ret) return ret; - ret = nfsd_init_socks(net); - if (ret) + + if (list_empty(&nn->nfsd_serv->sv_permsocks)) { + pr_warn("NFSD: Failed to start, no listeners configured.\n"); + ret = -EIO; goto out_socks; + } - if (nfsd_needs_lockd() && !nn->lockd_up) { - ret = lockd_up(net); + if (nfsd_needs_lockd(nn) && !nn->lockd_up) { + ret = lockd_up(net, cred); if (ret) goto out_socks; - nn->lockd_up = 1; + nn->lockd_up = true; } - ret = nfs4_state_start_net(net); + ret = nfsd_file_cache_start_net(net); if (ret) goto out_lockd; + ret = nfsd_reply_cache_init(nn); + if (ret) + goto out_filecache; + +#ifdef CONFIG_NFSD_V4_2_INTER_SSC + nfsd4_ssc_init_umount_work(nn); +#endif + ret = nfs4_state_start_net(net); + if (ret) + goto out_reply_cache; + nn->nfsd_net_up = true; return 0; +out_reply_cache: + nfsd_reply_cache_shutdown(nn); +out_filecache: + nfsd_file_cache_shutdown_net(net); out_lockd: if (nn->lockd_up) { lockd_down(net); - nn->lockd_up = 0; + nn->lockd_up = false; } out_socks: nfsd_shutdown_generic(); @@ -317,15 +406,29 @@ static void nfsd_shutdown_net(struct net *net) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); + if (!nn->nfsd_net_up) + return; + + percpu_ref_kill_and_confirm(&nn->nfsd_net_ref, nfsd_net_done); + wait_for_completion(&nn->nfsd_net_confirm_done); + + nfsd_export_flush(net); nfs4_state_shutdown_net(net); + nfsd_reply_cache_shutdown(nn); + nfsd_file_cache_shutdown_net(net); if (nn->lockd_up) { lockd_down(net); - nn->lockd_up = 0; + nn->lockd_up = false; } + + wait_for_completion(&nn->nfsd_net_free_done); + percpu_ref_exit(&nn->nfsd_net_ref); + nn->nfsd_net_up = false; nfsd_shutdown_generic(); } +static DEFINE_SPINLOCK(nfsd_notifier_lock); static int nfsd_inetaddr_event(struct notifier_block *this, unsigned long event, void *ptr) { @@ -335,18 +438,17 @@ static int nfsd_inetaddr_event(struct notifier_block *this, unsigned long event, struct nfsd_net *nn = net_generic(net, nfsd_net_id); struct sockaddr_in sin; - if ((event != NETDEV_DOWN) || - !atomic_inc_not_zero(&nn->ntf_refcnt)) + if (event != NETDEV_DOWN || !nn->nfsd_serv) goto out; + spin_lock(&nfsd_notifier_lock); if (nn->nfsd_serv) { dprintk("nfsd_inetaddr_event: removed %pI4\n", &ifa->ifa_local); sin.sin_family = AF_INET; sin.sin_addr.s_addr = ifa->ifa_local; svc_age_temp_xprts_now(nn->nfsd_serv, (struct sockaddr *)&sin); } - atomic_dec(&nn->ntf_refcnt); - wake_up(&nn->ntf_wq); + spin_unlock(&nfsd_notifier_lock); out: return NOTIFY_DONE; @@ -366,10 +468,10 @@ static int nfsd_inet6addr_event(struct notifier_block *this, struct nfsd_net *nn = net_generic(net, nfsd_net_id); struct sockaddr_in6 sin6; - if ((event != NETDEV_DOWN) || - !atomic_inc_not_zero(&nn->ntf_refcnt)) + if (event != NETDEV_DOWN || !nn->nfsd_serv) goto out; + spin_lock(&nfsd_notifier_lock); if (nn->nfsd_serv) { dprintk("nfsd_inet6addr_event: removed %pI6\n", &ifa->addr); sin6.sin6_family = AF_INET6; @@ -378,8 +480,8 @@ static int nfsd_inet6addr_event(struct notifier_block *this, sin6.sin6_scope_id = ifa->idev->dev->ifindex; svc_age_temp_xprts_now(nn->nfsd_serv, (struct sockaddr *)&sin6); } - atomic_dec(&nn->ntf_refcnt); - wake_up(&nn->ntf_wq); + spin_unlock(&nfsd_notifier_lock); + out: return NOTIFY_DONE; } @@ -392,11 +494,21 @@ 
static struct notifier_block nfsd_inet6addr_notifier = { /* Only used under nfsd_mutex, so this atomic may be overkill: */ static atomic_t nfsd_notifier_refcount = ATOMIC_INIT(0); -static void nfsd_last_thread(struct svc_serv *serv, struct net *net) +/** + * nfsd_destroy_serv - tear down NFSD's svc_serv for a namespace + * @net: network namespace the NFS service is associated with + */ +void nfsd_destroy_serv(struct net *net) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); + struct svc_serv *serv = nn->nfsd_serv; + + lockdep_assert_held(&nfsd_mutex); + + spin_lock(&nfsd_notifier_lock); + nn->nfsd_serv = NULL; + spin_unlock(&nfsd_notifier_lock); - atomic_dec(&nn->ntf_refcnt); /* check if the notifier still has clients */ if (atomic_dec_return(&nfsd_notifier_refcount) == 0) { unregister_inetaddr_notifier(&nfsd_inetaddr_notifier); @@ -404,64 +516,36 @@ static void nfsd_last_thread(struct svc_serv *serv, struct net *net) unregister_inet6addr_notifier(&nfsd_inet6addr_notifier); #endif } - wait_event(nn->ntf_wq, atomic_read(&nn->ntf_refcnt) == 0); /* * write_ports can create the server without actually starting - * any threads--if we get shut down before any threads are - * started, then nfsd_last_thread will be run before any of this + * any threads. If we get shut down before any threads are + * started, then nfsd_destroy_serv will be run before any of this * other initialization has been done except the rpcb information. */ - svc_rpcb_cleanup(serv, net); - if (!nn->nfsd_net_up) - return; - + svc_xprt_destroy_all(serv, net, true); nfsd_shutdown_net(net); - printk(KERN_WARNING "nfsd: last server has exited, flushing export " - "cache\n"); - nfsd_export_flush(net); + svc_destroy(&serv); } -void nfsd_reset_versions(void) +void nfsd_reset_versions(struct nfsd_net *nn) { int i; - for (i = 0; i < NFSD_NRVERS; i++) - if (nfsd_vers(i, NFSD_TEST)) + for (i = 0; i <= NFSD_MAXVERS; i++) + if (nfsd_vers(nn, i, NFSD_TEST)) return; - for (i = 0; i < NFSD_NRVERS; i++) + for (i = 0; i <= NFSD_MAXVERS; i++) if (i != 4) - nfsd_vers(i, NFSD_SET); + nfsd_vers(nn, i, NFSD_SET); else { int minor = 0; - while (nfsd_minorversion(minor, NFSD_SET) >= 0) + while (nfsd_minorversion(nn, minor, NFSD_SET) >= 0) minor++; } } -/* - * Each session guarantees a negotiated per slot memory cache for replies - * which in turn consumes memory beyond the v2/v3/v4.0 server. A dedicated - * NFSv4.1 server might want to use more memory for a DRC than a machine - * with mutiple services. - * - * Impose a hard limit on the number of pages for the DRC which varies - * according to the machines free pages. This is of course only a default. - * - * For now this is a #defined shift which could be under admin control - * in the future. 
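[Editor's note] For reference, the set_max_drc() heuristic removed just below capped NFSv4.1 DRC memory at 1/128 of the free buffer pages (a shift of 7). A worked example of the old default, assuming 4 KiB pages:

	nfsd_drc_max_mem = (nr_free_buffer_pages() >> 7) * PAGE_SIZE;
	/* e.g. 1,048,576 free pages (4 GiB) >> 7 = 8192 pages = 32 MiB cap */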
- */ -static void set_max_drc(void) -{ - #define NFSD_DRC_SIZE_SHIFT 7 - nfsd_drc_max_mem = (nr_free_buffer_pages() - >> NFSD_DRC_SIZE_SHIFT) * PAGE_SIZE; - nfsd_drc_mem_used = 0; - spin_lock_init(&nfsd_drc_lock); - dprintk("%s nfsd_drc_max_mem %lu \n", __func__, nfsd_drc_max_mem); -} - static int nfsd_get_default_max_blksize(void) { struct sysinfo i; @@ -477,46 +561,72 @@ static int nfsd_get_default_max_blksize(void) */ target >>= 12; - ret = NFSSVC_MAXBLKSIZE; + ret = NFSSVC_DEFBLKSIZE; while (ret > target && ret >= 8*1024*2) ret /= 2; return ret; } -static const struct svc_serv_ops nfsd_thread_sv_ops = { - .svo_shutdown = nfsd_last_thread, - .svo_function = nfsd, - .svo_enqueue_xprt = svc_xprt_do_enqueue, - .svo_setup = svc_set_num_threads, - .svo_module = THIS_MODULE, -}; +void nfsd_shutdown_threads(struct net *net) +{ + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + struct svc_serv *serv; + + mutex_lock(&nfsd_mutex); + serv = nn->nfsd_serv; + if (serv == NULL) { + mutex_unlock(&nfsd_mutex); + return; + } + + /* Kill outstanding nfsd threads */ + svc_set_num_threads(serv, NULL, 0); + nfsd_destroy_serv(net); + mutex_unlock(&nfsd_mutex); +} + +struct svc_rqst *nfsd_current_rqst(void) +{ + if (kthread_func(current) == nfsd) + return kthread_data(current); + return NULL; +} int nfsd_create_serv(struct net *net) { int error; struct nfsd_net *nn = net_generic(net, nfsd_net_id); + struct svc_serv *serv; WARN_ON(!mutex_is_locked(&nfsd_mutex)); - if (nn->nfsd_serv) { - svc_get(nn->nfsd_serv); + if (nn->nfsd_serv) return 0; - } + + error = percpu_ref_init(&nn->nfsd_net_ref, nfsd_net_free, + 0, GFP_KERNEL); + if (error) + return error; + init_completion(&nn->nfsd_net_free_done); + init_completion(&nn->nfsd_net_confirm_done); + if (nfsd_max_blksize == 0) nfsd_max_blksize = nfsd_get_default_max_blksize(); - nfsd_reset_versions(); - nn->nfsd_serv = svc_create_pooled(&nfsd_program, nfsd_max_blksize, - &nfsd_thread_sv_ops); - if (nn->nfsd_serv == NULL) + nfsd_reset_versions(nn); + serv = svc_create_pooled(nfsd_programs, ARRAY_SIZE(nfsd_programs), + &nn->nfsd_svcstats, + nfsd_max_blksize, nfsd); + if (serv == NULL) return -ENOMEM; - nn->nfsd_serv->sv_maxconn = nn->max_connections; - error = svc_bind(nn->nfsd_serv, net); + error = svc_bind(serv, net); if (error < 0) { - svc_destroy(nn->nfsd_serv); + svc_destroy(&serv); return error; } + spin_lock(&nfsd_notifier_lock); + nn->nfsd_serv = serv; + spin_unlock(&nfsd_notifier_lock); - set_max_drc(); /* check if the notifier is already set */ if (atomic_inc_return(&nfsd_notifier_refcount) == 1) { register_inetaddr_notifier(&nfsd_inetaddr_notifier); @@ -524,8 +634,7 @@ int nfsd_create_serv(struct net *net) register_inet6addr_notifier(&nfsd_inet6addr_notifier); #endif } - atomic_inc(&nn->ntf_refcnt); - ktime_get_real_ts64(&nn->nfssvc_boot); /* record boot time */ + nfsd_reset_write_verifier(nn); return 0; } @@ -541,29 +650,29 @@ int nfsd_nrpools(struct net *net) int nfsd_get_nrthreads(int n, int *nthreads, struct net *net) { - int i = 0; struct nfsd_net *nn = net_generic(net, nfsd_net_id); + struct svc_serv *serv = nn->nfsd_serv; + int i; - if (nn->nfsd_serv != NULL) { - for (i = 0; i < nn->nfsd_serv->sv_nrpools && i < n; i++) - nthreads[i] = nn->nfsd_serv->sv_pools[i].sp_nrthreads; - } - + if (serv) + for (i = 0; i < serv->sv_nrpools && i < n; i++) + nthreads[i] = serv->sv_pools[i].sp_nrthreads; return 0; } -void nfsd_destroy(struct net *net) -{ - struct nfsd_net *nn = net_generic(net, nfsd_net_id); - int destroy = (nn->nfsd_serv->sv_nrthreads == 1); - 
- if (destroy) - svc_shutdown_net(nn->nfsd_serv, net); - svc_destroy(nn->nfsd_serv); - if (destroy) - nn->nfsd_serv = NULL; -} - +/** + * nfsd_set_nrthreads - set the number of running threads in the net's service + * @n: number of array members in @nthreads + * @nthreads: array of thread counts for each pool + * @net: network namespace to operate within + * + * This function alters the number of running threads for the given network + * namespace in each pool. If passed an array longer then the number of pools + * the extra pool settings are ignored. If passed an array shorter than the + * number of pools, the missing values are interpreted as 0's. + * + * Returns 0 on success or a negative errno on error. + */ int nfsd_set_nrthreads(int n, int *nthreads, struct net *net) { int i = 0; @@ -571,11 +680,18 @@ int nfsd_set_nrthreads(int n, int *nthreads, struct net *net) int err = 0; struct nfsd_net *nn = net_generic(net, nfsd_net_id); - WARN_ON(!mutex_is_locked(&nfsd_mutex)); + lockdep_assert_held(&nfsd_mutex); if (nn->nfsd_serv == NULL || n <= 0) return 0; + /* + * Special case: When n == 1, pass in NULL for the pool, so that the + * change is distributed equally among them. + */ + if (n == 1) + return svc_set_num_threads(nn->nfsd_serv, NULL, nthreads[0]); + if (n > nn->nfsd_serv->sv_nrpools) n = nn->nfsd_serv->sv_nrpools; @@ -588,7 +704,7 @@ int nfsd_set_nrthreads(int n, int *nthreads, struct net *net) if (tot > NFSD_MAXSERVS) { /* total too large: scale down requested numbers */ for (i = 0; i < n && tot > 0; i++) { - int new = nthreads[i] * NFSD_MAXSERVS / tot; + int new = nthreads[i] * NFSD_MAXSERVS / tot; tot -= (nthreads[i] - new); nthreads[i] = new; } @@ -598,75 +714,166 @@ int nfsd_set_nrthreads(int n, int *nthreads, struct net *net) } } - /* - * There must always be a thread in pool 0; the admin - * can't shut down NFS completely using pool_threads. - */ - if (nthreads[0] == 0) - nthreads[0] = 1; - /* apply the new numbers */ - svc_get(nn->nfsd_serv); for (i = 0; i < n; i++) { - err = nn->nfsd_serv->sv_ops->svo_setup(nn->nfsd_serv, - &nn->nfsd_serv->sv_pools[i], nthreads[i]); + err = svc_set_num_threads(nn->nfsd_serv, + &nn->nfsd_serv->sv_pools[i], + nthreads[i]); if (err) - break; + goto out; } - nfsd_destroy(net); + + /* Anything undefined in array is considered to be 0 */ + for (i = n; i < nn->nfsd_serv->sv_nrpools; ++i) { + err = svc_set_num_threads(nn->nfsd_serv, + &nn->nfsd_serv->sv_pools[i], + 0); + if (err) + goto out; + } +out: return err; } -/* - * Adjust the number of threads and return the new number of threads. - * This is also the function that starts the server if necessary, if - * this is the first time nrservs is nonzero. +/** + * nfsd_svc: start up or shut down the nfsd server + * @n: number of array members in @nthreads + * @nthreads: array of thread counts for each pool + * @net: network namespace to operate within + * @cred: credentials to use for xprt creation + * @scope: server scope value (defaults to nodename) + * + * Adjust the number of threads in each pool and return the new + * total number of threads in the service. 
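[Editor's note] nfsd_svc(), continued just below, now takes the per-pool thread counts, the creation credentials, and an optional server scope directly, and expects the caller to hold nfsd_mutex. A minimal caller sketch in the spirit of the nfsctl paths (the wrapper is hypothetical; the real write_threads handler does more bookkeeping):

/* Hypothetical caller: bring up one pool with 8 nfsd threads in @net. */
static int example_start_nfsd(struct net *net, const struct cred *cred)
{
	int nthreads[1] = { 8 };
	int ret;

	mutex_lock(&nfsd_mutex);		/* nfsd_svc() asserts this is held */
	ret = nfsd_svc(1, nthreads, net, cred, NULL);	/* NULL scope => nodename */
	mutex_unlock(&nfsd_mutex);

	return ret < 0 ? ret : 0;	/* ret >= 0 is the running thread total */
}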
*/ int -nfsd_svc(int nrservs, struct net *net) +nfsd_svc(int n, int *nthreads, struct net *net, const struct cred *cred, const char *scope) { int error; - bool nfsd_up_before; struct nfsd_net *nn = net_generic(net, nfsd_net_id); + struct svc_serv *serv; + + lockdep_assert_held(&nfsd_mutex); - mutex_lock(&nfsd_mutex); dprintk("nfsd: creating service\n"); - nrservs = max(nrservs, 0); - nrservs = min(nrservs, NFSD_MAXSERVS); - error = 0; - - if (nrservs == 0 && nn->nfsd_serv == NULL) - goto out; + strscpy(nn->nfsd_name, scope ? scope : utsname()->nodename, + sizeof(nn->nfsd_name)); error = nfsd_create_serv(net); if (error) goto out; + serv = nn->nfsd_serv; - nfsd_up_before = nn->nfsd_net_up; - - error = nfsd_startup_net(nrservs, net); + error = nfsd_startup_net(net, cred); if (error) - goto out_destroy; - error = nn->nfsd_serv->sv_ops->svo_setup(nn->nfsd_serv, - NULL, nrservs); + goto out_put; + error = nfsd_set_nrthreads(n, nthreads, net); if (error) - goto out_shutdown; - /* We are holding a reference to nn->nfsd_serv which - * we don't want to count in the return value, - * so subtract 1 - */ - error = nn->nfsd_serv->sv_nrthreads - 1; -out_shutdown: - if (error < 0 && !nfsd_up_before) - nfsd_shutdown_net(net); -out_destroy: - nfsd_destroy(net); /* Release server */ + goto out_put; + error = serv->sv_nrthreads; +out_put: + if (serv->sv_nrthreads == 0) + nfsd_destroy_serv(net); out: - mutex_unlock(&nfsd_mutex); return error; } +#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) +static bool +nfsd_support_acl_version(int vers) +{ + if (vers >= NFSD_ACL_MINVERS && vers < NFSD_ACL_NRVERS) + return nfsd_acl_version[vers] != NULL; + return false; +} + +static int +nfsd_acl_rpcbind_set(struct net *net, const struct svc_program *progp, + u32 version, int family, unsigned short proto, + unsigned short port) +{ + if (!nfsd_support_acl_version(version) || + !nfsd_vers(net_generic(net, nfsd_net_id), version, NFSD_TEST)) + return 0; + return svc_generic_rpcbind_set(net, progp, version, family, + proto, port); +} + +static __be32 +nfsd_acl_init_request(struct svc_rqst *rqstp, + const struct svc_program *progp, + struct svc_process_info *ret) +{ + struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); + int i; + + if (likely(nfsd_support_acl_version(rqstp->rq_vers) && + nfsd_vers(nn, rqstp->rq_vers, NFSD_TEST))) + return svc_generic_init_request(rqstp, progp, ret); + + ret->mismatch.lovers = NFSD_ACL_NRVERS; + for (i = NFSD_ACL_MINVERS; i < NFSD_ACL_NRVERS; i++) { + if (nfsd_support_acl_version(rqstp->rq_vers) && + nfsd_vers(nn, i, NFSD_TEST)) { + ret->mismatch.lovers = i; + break; + } + } + if (ret->mismatch.lovers == NFSD_ACL_NRVERS) + return rpc_prog_unavail; + ret->mismatch.hivers = NFSD_ACL_MINVERS; + for (i = NFSD_ACL_NRVERS - 1; i >= NFSD_ACL_MINVERS; i--) { + if (nfsd_support_acl_version(rqstp->rq_vers) && + nfsd_vers(nn, i, NFSD_TEST)) { + ret->mismatch.hivers = i; + break; + } + } + return rpc_prog_mismatch; +} +#endif + +static int +nfsd_rpcbind_set(struct net *net, const struct svc_program *progp, + u32 version, int family, unsigned short proto, + unsigned short port) +{ + if (!nfsd_vers(net_generic(net, nfsd_net_id), version, NFSD_TEST)) + return 0; + return svc_generic_rpcbind_set(net, progp, version, family, + proto, port); +} + +static __be32 +nfsd_init_request(struct svc_rqst *rqstp, + const struct svc_program *progp, + struct svc_process_info *ret) +{ + struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); + int i; + + if (likely(nfsd_vers(nn, 
rqstp->rq_vers, NFSD_TEST))) + return svc_generic_init_request(rqstp, progp, ret); + + ret->mismatch.lovers = NFSD_MAXVERS + 1; + for (i = NFSD_MINVERS; i <= NFSD_MAXVERS; i++) { + if (nfsd_vers(nn, i, NFSD_TEST)) { + ret->mismatch.lovers = i; + break; + } + } + if (ret->mismatch.lovers > NFSD_MAXVERS) + return rpc_prog_unavail; + ret->mismatch.hivers = NFSD_MINVERS; + for (i = NFSD_MAXVERS; i >= NFSD_MINVERS; i--) { + if (nfsd_vers(nn, i, NFSD_TEST)) { + ret->mismatch.hivers = i; + break; + } + } + return rpc_prog_mismatch; +} /* * This is the NFS server kernel thread @@ -678,216 +885,146 @@ nfsd(void *vrqstp) struct svc_xprt *perm_sock = list_entry(rqstp->rq_server->sv_permsocks.next, typeof(struct svc_xprt), xpt_list); struct net *net = perm_sock->xpt_net; struct nfsd_net *nn = net_generic(net, nfsd_net_id); - int err; - - /* Lock module and set up kernel thread */ - mutex_lock(&nfsd_mutex); /* At this point, the thread shares current->fs * with the init process. We need to create files with the - * umask as defined by the client instead of init's umask. */ - if (unshare_fs_struct() < 0) { - printk("Unable to start nfsd thread: out of memory\n"); - goto out; - } + * umask as defined by the client instead of init's umask. + */ + svc_thread_init_status(rqstp, unshare_fs_struct()); current->fs->umask = 0; - /* - * thread is spawned with all signals set to SIG_IGN, re-enable - * the ones that will bring down the thread - */ - allow_signal(SIGKILL); - allow_signal(SIGHUP); - allow_signal(SIGINT); - allow_signal(SIGQUIT); - - nfsdstats.th_cnt++; - mutex_unlock(&nfsd_mutex); + atomic_inc(&nfsd_th_cnt); set_freezable(); /* * The main request loop */ - for (;;) { - /* Update sv_maxconn if it has changed */ - rqstp->rq_server->sv_maxconn = nn->max_connections; - - /* - * Find a socket with data available and call its - * recvfrom routine. - */ - while ((err = svc_recv(rqstp, 60*60*HZ)) == -EAGAIN) - ; - if (err == -EINTR) - break; - validate_process_creds(); - svc_process(rqstp); - validate_process_creds(); + while (!svc_thread_should_stop(rqstp)) { + svc_recv(rqstp); + nfsd_file_net_dispose(nn); } - /* Clear signals before calling svc_exit_thread() */ - flush_signals(current); - - mutex_lock(&nfsd_mutex); - nfsdstats.th_cnt --; - -out: - rqstp->rq_server = NULL; + atomic_dec(&nfsd_th_cnt); /* Release the thread */ svc_exit_thread(rqstp); - - nfsd_destroy(net); - - /* Release module */ - mutex_unlock(&nfsd_mutex); - module_put_and_exit(0); return 0; } -static __be32 map_new_errors(u32 vers, __be32 nfserr) -{ - if (nfserr == nfserr_jukebox && vers == 2) - return nfserr_dropit; - if (nfserr == nfserr_wrongsec && vers < 4) - return nfserr_acces; - return nfserr; -} - -/* - * A write procedure can have a large argument, and a read procedure can - * have a large reply, but no NFSv2 or NFSv3 procedure has argument and - * reply that can both be larger than a page. The xdr code has taken - * advantage of this assumption to be a sloppy about bounds checking in - * some cases. Pending a rewrite of the NFSv2/v3 xdr code to fix that - * problem, we enforce these assumptions here: +/** + * nfsd_dispatch - Process an NFS or NFSACL or LOCALIO Request + * @rqstp: incoming request + * + * This RPC dispatcher integrates the NFS server's duplicate reply cache. 
+ * + * Return values: + * %0: Processing complete; do not send a Reply + * %1: Processing complete; send Reply in rqstp->rq_res */ -static bool nfs_request_too_big(struct svc_rqst *rqstp, - const struct svc_procedure *proc) +int nfsd_dispatch(struct svc_rqst *rqstp) { + const struct svc_procedure *proc = rqstp->rq_procinfo; + __be32 *statp = rqstp->rq_accept_statp; + struct nfsd_cacherep *rp; + unsigned int start, len; + __be32 *nfs_reply; + /* - * The ACL code has more careful bounds-checking and is not - * susceptible to this problem: + * Give the xdr decoder a chance to change this if it wants + * (necessary in the NFSv4.0 compound case) */ - if (rqstp->rq_prog != NFS_PROGRAM) - return false; + rqstp->rq_cachetype = proc->pc_cachetype; + /* - * Ditto NFSv4 (which can in theory have argument and reply both - * more than a page): + * ->pc_decode advances the argument stream past the NFS + * Call header, so grab the header's starting location and + * size now for the call to nfsd_cache_lookup(). */ - if (rqstp->rq_vers >= 4) - return false; - /* The reply will be small, we're OK: */ - if (proc->pc_xdrressize > 0 && - proc->pc_xdrressize < XDR_QUADLEN(PAGE_SIZE)) - return false; - - return rqstp->rq_arg.len > PAGE_SIZE; -} - -int -nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp) -{ - const struct svc_procedure *proc; - __be32 nfserr; - __be32 *nfserrp; - - dprintk("nfsd_dispatch: vers %d proc %d\n", - rqstp->rq_vers, rqstp->rq_proc); - proc = rqstp->rq_procinfo; + start = xdr_stream_pos(&rqstp->rq_arg_stream); + len = xdr_stream_remaining(&rqstp->rq_arg_stream); + if (!proc->pc_decode(rqstp, &rqstp->rq_arg_stream)) + goto out_decode_err; - if (nfs_request_too_big(rqstp, proc)) { - dprintk("nfsd: NFSv%d argument too large\n", rqstp->rq_vers); - *statp = rpc_garbage_args; - return 1; - } /* - * Give the xdr decoder a chance to change this if it wants - * (necessary in the NFSv4.0 compound case) + * Release rq_status_counter setting it to an odd value after the rpc + * request has been properly parsed. rq_status_counter is used to + * notify the consumers if the rqstp fields are stable + * (rq_status_counter is odd) or not meaningful (rq_status_counter + * is even). */ - rqstp->rq_cachetype = proc->pc_cachetype; - /* Decode arguments */ - if (proc->pc_decode && - !proc->pc_decode(rqstp, (__be32*)rqstp->rq_arg.head[0].iov_base)) { - dprintk("nfsd: failed to decode arguments!\n"); - *statp = rpc_garbage_args; - return 1; - } + smp_store_release(&rqstp->rq_status_counter, rqstp->rq_status_counter | 1); - /* Check whether we have this call in the cache. */ - switch (nfsd_cache_lookup(rqstp)) { - case RC_DROPIT: - return 0; + rp = NULL; + switch (nfsd_cache_lookup(rqstp, start, len, &rp)) { + case RC_DOIT: + break; case RC_REPLY: - return 1; - case RC_DOIT:; - /* do it */ + goto out_cached_reply; + case RC_DROPIT: + goto out_dropit; } - /* need to grab the location to store the status, as - * nfsv4 does some encoding while processing - */ - nfserrp = rqstp->rq_res.head[0].iov_base - + rqstp->rq_res.head[0].iov_len; - rqstp->rq_res.head[0].iov_len += sizeof(__be32); - - /* Now call the procedure handler, and encode NFS status. 
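[Editor's note] The rq_status_counter protocol described in the comments above works like a seqcount: odd means the request has been fully decoded and its fields are stable, even means they are not meaningful. A hypothetical consumer sketch, assuming the field is an unsigned int as declared in struct svc_rqst:

/*
 * Sample rqstp fields only while the counter is odd and unchanged
 * across the read; pairs with the smp_store_release() calls above.
 */
static bool example_sample_rqst_version(struct svc_rqst *rqstp, u32 *vers)
{
	unsigned int c = smp_load_acquire(&rqstp->rq_status_counter);

	if (!(c & 1))
		return false;			/* between requests */
	*vers = rqstp->rq_vers;
	return smp_load_acquire(&rqstp->rq_status_counter) == c;
}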
*/ - nfserr = proc->pc_func(rqstp); - nfserr = map_new_errors(rqstp->rq_vers, nfserr); - if (nfserr == nfserr_dropit || test_bit(RQ_DROPME, &rqstp->rq_flags)) { - dprintk("nfsd: Dropping request; may be revisited later\n"); - nfsd_cache_update(rqstp, RC_NOCACHE, NULL); - return 0; - } + nfs_reply = xdr_inline_decode(&rqstp->rq_res_stream, 0); + *statp = proc->pc_func(rqstp); + if (test_bit(RQ_DROPME, &rqstp->rq_flags)) + goto out_update_drop; - if (rqstp->rq_proc != 0) - *nfserrp++ = nfserr; + if (!proc->pc_encode(rqstp, &rqstp->rq_res_stream)) + goto out_encode_err; - /* Encode result. - * For NFSv2, additional info is never returned in case of an error. + /* + * Release rq_status_counter setting it to an even value after the rpc + * request has been properly processed. */ - if (!(nfserr && rqstp->rq_vers == 2)) { - if (proc->pc_encode && !proc->pc_encode(rqstp, nfserrp)) { - /* Failed to encode result. Release cache entry */ - dprintk("nfsd: failed to encode result!\n"); - nfsd_cache_update(rqstp, RC_NOCACHE, NULL); - *statp = rpc_system_err; - return 1; - } - } + smp_store_release(&rqstp->rq_status_counter, rqstp->rq_status_counter + 1); - /* Store reply in cache. */ - nfsd_cache_update(rqstp, rqstp->rq_cachetype, statp + 1); + nfsd_cache_update(rqstp, rp, rqstp->rq_cachetype, nfs_reply); +out_cached_reply: return 1; -} -int nfsd_pool_stats_open(struct inode *inode, struct file *file) -{ - int ret; - struct nfsd_net *nn = net_generic(inode->i_sb->s_fs_info, nfsd_net_id); +out_decode_err: + trace_nfsd_garbage_args_err(rqstp); + *statp = rpc_garbage_args; + return 1; - mutex_lock(&nfsd_mutex); - if (nn->nfsd_serv == NULL) { - mutex_unlock(&nfsd_mutex); - return -ENODEV; - } - /* bump up the psudo refcount while traversing */ - svc_get(nn->nfsd_serv); - ret = svc_pool_stats_open(nn->nfsd_serv, file); - mutex_unlock(&nfsd_mutex); - return ret; +out_update_drop: + nfsd_cache_update(rqstp, rp, RC_NOCACHE, NULL); +out_dropit: + return 0; + +out_encode_err: + trace_nfsd_cant_encode_err(rqstp); + nfsd_cache_update(rqstp, rp, RC_NOCACHE, NULL); + *statp = rpc_system_err; + return 1; } -int nfsd_pool_stats_release(struct inode *inode, struct file *file) +/** + * nfssvc_decode_voidarg - Decode void arguments + * @rqstp: Server RPC transaction context + * @xdr: XDR stream positioned at arguments to decode + * + * Return values: + * %false: Arguments were not valid + * %true: Decoding was successful + */ +bool nfssvc_decode_voidarg(struct svc_rqst *rqstp, struct xdr_stream *xdr) { - int ret = seq_release(inode, file); - struct net *net = inode->i_sb->s_fs_info; + return true; +} - mutex_lock(&nfsd_mutex); - /* this function really, really should have been called svc_put() */ - nfsd_destroy(net); - mutex_unlock(&nfsd_mutex); - return ret; +/** + * nfssvc_encode_voidres - Encode void results + * @rqstp: Server RPC transaction context + * @xdr: XDR stream into which to encode results + * + * Return values: + * %false: Local error while encoding + * %true: Encoding was successful + */ +bool nfssvc_encode_voidres(struct svc_rqst *rqstp, struct xdr_stream *xdr) +{ + return true; } diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c index 6b2e8b73d36e..fc262ceafca9 100644 --- a/fs/nfsd/nfsxdr.c +++ b/fs/nfsd/nfsxdr.c @@ -9,12 +9,10 @@ #include "xdr.h" #include "auth.h" -#define NFSDDBG_FACILITY NFSDDBG_XDR - /* * Mapping of S_IF* types to NFS file types */ -static u32 nfs_ftypes[] = { +static const u32 nfs_ftypes[] = { NFNON, NFCHR, NFCHR, NFBAD, NFDIR, NFBAD, NFBLK, NFBAD, NFREG, NFBAD, NFLNK, NFBAD, @@ 
-23,93 +21,168 @@ static u32 nfs_ftypes[] = { /* - * XDR functions for basic NFS types + * Basic NFSv2 data types (RFC 1094 Section 2.3) */ -static __be32 * -decode_fh(__be32 *p, struct svc_fh *fhp) + +/** + * svcxdr_encode_stat - Encode an NFSv2 status code + * @xdr: XDR stream + * @status: status value to encode + * + * Return values: + * %false: Send buffer space was exhausted + * %true: Success + */ +bool +svcxdr_encode_stat(struct xdr_stream *xdr, __be32 status) { + __be32 *p; + + p = xdr_reserve_space(xdr, sizeof(status)); + if (!p) + return false; + *p = status; + + return true; +} + +/** + * svcxdr_decode_fhandle - Decode an NFSv2 file handle + * @xdr: XDR stream positioned at an encoded NFSv2 FH + * @fhp: OUT: filled-in server file handle + * + * Return values: + * %false: The encoded file handle was not valid + * %true: @fhp has been initialized + */ +bool +svcxdr_decode_fhandle(struct xdr_stream *xdr, struct svc_fh *fhp) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, NFS_FHSIZE); + if (!p) + return false; fh_init(fhp, NFS_FHSIZE); - memcpy(&fhp->fh_handle.fh_base, p, NFS_FHSIZE); + memcpy(&fhp->fh_handle.fh_raw, p, NFS_FHSIZE); fhp->fh_handle.fh_size = NFS_FHSIZE; - /* FIXME: Look up export pointer here and verify - * Sun Secure RPC if requested */ - return p + (NFS_FHSIZE >> 2); + return true; } -/* Helper function for NFSv2 ACL code */ -__be32 *nfs2svc_decode_fh(__be32 *p, struct svc_fh *fhp) +static bool +svcxdr_encode_fhandle(struct xdr_stream *xdr, const struct svc_fh *fhp) { - return decode_fh(p, fhp); + __be32 *p; + + p = xdr_reserve_space(xdr, NFS_FHSIZE); + if (!p) + return false; + memcpy(p, &fhp->fh_handle.fh_raw, NFS_FHSIZE); + + return true; } static __be32 * -encode_fh(__be32 *p, struct svc_fh *fhp) +encode_timeval(__be32 *p, const struct timespec64 *time) { - memcpy(p, &fhp->fh_handle.fh_base, NFS_FHSIZE); - return p + (NFS_FHSIZE>> 2); + *p++ = cpu_to_be32((u32)time->tv_sec); + if (time->tv_nsec) + *p++ = cpu_to_be32(time->tv_nsec / NSEC_PER_USEC); + else + *p++ = xdr_zero; + return p; } -/* - * Decode a file name and make sure that the path contains - * no slashes or null bytes. 
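[Editor's note] The encode_timeval() helper introduced above converts the kernel's nanosecond timestamps to the NFSv2 wire format of {seconds, microseconds}, and svcxdr_decode_sattr() performs the inverse. A small worked illustration, assuming NSEC_PER_USEC is 1000 as usual:

/* Hypothetical round trip: 1,500,000 ns becomes 1500 us on the wire,
 * and comes back as 1,500,000 ns when a client sets the same time. */
static void example_timeval_roundtrip(void)
{
	struct timespec64 ts = { .tv_sec = 5, .tv_nsec = 1500000 };
	u32 wire_usec = ts.tv_nsec / NSEC_PER_USEC;	/* 1500 */
	s64 back_nsec = (s64)wire_usec * NSEC_PER_USEC;	/* 1500000 */

	(void)back_nsec;
}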
- */ -static __be32 * -decode_filename(__be32 *p, char **namp, unsigned int *lenp) +static bool +svcxdr_decode_filename(struct xdr_stream *xdr, char **name, unsigned int *len) { - char *name; - unsigned int i; - - if ((p = xdr_decode_string_inplace(p, namp, lenp, NFS_MAXNAMLEN)) != NULL) { - for (i = 0, name = *namp; i < *lenp; i++, name++) { - if (*name == '\0' || *name == '/') - return NULL; - } - } + u32 size, i; + __be32 *p; + char *c; + + if (xdr_stream_decode_u32(xdr, &size) < 0) + return false; + if (size == 0 || size > NFS_MAXNAMLEN) + return false; + p = xdr_inline_decode(xdr, size); + if (!p) + return false; - return p; + *len = size; + *name = (char *)p; + for (i = 0, c = *name; i < size; i++, c++) + if (*c == '\0' || *c == '/') + return false; + + return true; } -static __be32 * -decode_sattr(__be32 *p, struct iattr *iap) +static bool +svcxdr_decode_diropargs(struct xdr_stream *xdr, struct svc_fh *fhp, + char **name, unsigned int *len) { - u32 tmp, tmp1; + return svcxdr_decode_fhandle(xdr, fhp) && + svcxdr_decode_filename(xdr, name, len); +} + +static bool +svcxdr_decode_sattr(struct svc_rqst *rqstp, struct xdr_stream *xdr, + struct iattr *iap) +{ + u32 tmp1, tmp2; + __be32 *p; + + p = xdr_inline_decode(xdr, XDR_UNIT * 8); + if (!p) + return false; iap->ia_valid = 0; - /* Sun client bug compatibility check: some sun clients seem to - * put 0xffff in the mode field when they mean 0xffffffff. - * Quoting the 4.4BSD nfs server code: Nah nah nah nah na nah. + /* + * Some Sun clients put 0xffff in the mode field when they + * mean 0xffffffff. */ - if ((tmp = ntohl(*p++)) != (u32)-1 && tmp != 0xffff) { + tmp1 = be32_to_cpup(p++); + if (tmp1 != (u32)-1 && tmp1 != 0xffff) { iap->ia_valid |= ATTR_MODE; - iap->ia_mode = tmp; + iap->ia_mode = tmp1; } - if ((tmp = ntohl(*p++)) != (u32)-1) { - iap->ia_uid = make_kuid(&init_user_ns, tmp); + + tmp1 = be32_to_cpup(p++); + if (tmp1 != (u32)-1) { + iap->ia_uid = make_kuid(nfsd_user_namespace(rqstp), tmp1); if (uid_valid(iap->ia_uid)) iap->ia_valid |= ATTR_UID; } - if ((tmp = ntohl(*p++)) != (u32)-1) { - iap->ia_gid = make_kgid(&init_user_ns, tmp); + + tmp1 = be32_to_cpup(p++); + if (tmp1 != (u32)-1) { + iap->ia_gid = make_kgid(nfsd_user_namespace(rqstp), tmp1); if (gid_valid(iap->ia_gid)) iap->ia_valid |= ATTR_GID; } - if ((tmp = ntohl(*p++)) != (u32)-1) { + + tmp1 = be32_to_cpup(p++); + if (tmp1 != (u32)-1) { iap->ia_valid |= ATTR_SIZE; - iap->ia_size = tmp; + iap->ia_size = tmp1; } - tmp = ntohl(*p++); tmp1 = ntohl(*p++); - if (tmp != (u32)-1 && tmp1 != (u32)-1) { + + tmp1 = be32_to_cpup(p++); + tmp2 = be32_to_cpup(p++); + if (tmp1 != (u32)-1 && tmp2 != (u32)-1) { iap->ia_valid |= ATTR_ATIME | ATTR_ATIME_SET; - iap->ia_atime.tv_sec = tmp; - iap->ia_atime.tv_nsec = tmp1 * 1000; + iap->ia_atime.tv_sec = tmp1; + iap->ia_atime.tv_nsec = tmp2 * NSEC_PER_USEC; } - tmp = ntohl(*p++); tmp1 = ntohl(*p++); - if (tmp != (u32)-1 && tmp1 != (u32)-1) { + + tmp1 = be32_to_cpup(p++); + tmp2 = be32_to_cpup(p++); + if (tmp1 != (u32)-1 && tmp2 != (u32)-1) { iap->ia_valid |= ATTR_MTIME | ATTR_MTIME_SET; - iap->ia_mtime.tv_sec = tmp; - iap->ia_mtime.tv_nsec = tmp1 * 1000; + iap->ia_mtime.tv_sec = tmp1; + iap->ia_mtime.tv_nsec = tmp2 * NSEC_PER_USEC; /* * Passing the invalid value useconds=1000000 for mtime * is a Sun convention for "set both mtime and atime to @@ -119,451 +192,472 @@ decode_sattr(__be32 *p, struct iattr *iap) * sattr in section 6.1 of "NFS Illustrated" by * Brent Callaghan, Addison-Wesley, ISBN 0-201-32750-5 */ - if (tmp1 == 1000000) + if (tmp2 == 
1000000) iap->ia_valid &= ~(ATTR_ATIME_SET|ATTR_MTIME_SET); } - return p; + + return true; } -static __be32 * -encode_fattr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp, - struct kstat *stat) +/** + * svcxdr_encode_fattr - Encode NFSv2 file attributes + * @rqstp: Context of a completed RPC transaction + * @xdr: XDR stream + * @fhp: File handle to encode + * @stat: Attributes to encode + * + * Return values: + * %false: Send buffer space was exhausted + * %true: Success + */ +bool +svcxdr_encode_fattr(struct svc_rqst *rqstp, struct xdr_stream *xdr, + const struct svc_fh *fhp, const struct kstat *stat) { - struct dentry *dentry = fhp->fh_dentry; - int type; + struct user_namespace *userns = nfsd_user_namespace(rqstp); + struct dentry *dentry = fhp->fh_dentry; + int type = stat->mode & S_IFMT; struct timespec64 time; - u32 f; + __be32 *p; + u32 fsid; - type = (stat->mode & S_IFMT); + p = xdr_reserve_space(xdr, XDR_UNIT * 17); + if (!p) + return false; - *p++ = htonl(nfs_ftypes[type >> 12]); - *p++ = htonl((u32) stat->mode); - *p++ = htonl((u32) stat->nlink); - *p++ = htonl((u32) from_kuid(&init_user_ns, stat->uid)); - *p++ = htonl((u32) from_kgid(&init_user_ns, stat->gid)); + *p++ = cpu_to_be32(nfs_ftypes[type >> 12]); + *p++ = cpu_to_be32((u32)stat->mode); + *p++ = cpu_to_be32((u32)stat->nlink); + *p++ = cpu_to_be32((u32)from_kuid_munged(userns, stat->uid)); + *p++ = cpu_to_be32((u32)from_kgid_munged(userns, stat->gid)); - if (S_ISLNK(type) && stat->size > NFS_MAXPATHLEN) { - *p++ = htonl(NFS_MAXPATHLEN); - } else { - *p++ = htonl((u32) stat->size); - } - *p++ = htonl((u32) stat->blksize); + if (S_ISLNK(type) && stat->size > NFS_MAXPATHLEN) + *p++ = cpu_to_be32(NFS_MAXPATHLEN); + else + *p++ = cpu_to_be32((u32) stat->size); + *p++ = cpu_to_be32((u32) stat->blksize); if (S_ISCHR(type) || S_ISBLK(type)) - *p++ = htonl(new_encode_dev(stat->rdev)); + *p++ = cpu_to_be32(new_encode_dev(stat->rdev)); else - *p++ = htonl(0xffffffff); - *p++ = htonl((u32) stat->blocks); + *p++ = cpu_to_be32(0xffffffff); + *p++ = cpu_to_be32((u32)stat->blocks); + switch (fsid_source(fhp)) { - default: - case FSIDSOURCE_DEV: - *p++ = htonl(new_encode_dev(stat->dev)); - break; case FSIDSOURCE_FSID: - *p++ = htonl((u32) fhp->fh_export->ex_fsid); + fsid = (u32)fhp->fh_export->ex_fsid; break; case FSIDSOURCE_UUID: - f = ((u32*)fhp->fh_export->ex_uuid)[0]; - f ^= ((u32*)fhp->fh_export->ex_uuid)[1]; - f ^= ((u32*)fhp->fh_export->ex_uuid)[2]; - f ^= ((u32*)fhp->fh_export->ex_uuid)[3]; - *p++ = htonl(f); + fsid = ((u32 *)fhp->fh_export->ex_uuid)[0]; + fsid ^= ((u32 *)fhp->fh_export->ex_uuid)[1]; + fsid ^= ((u32 *)fhp->fh_export->ex_uuid)[2]; + fsid ^= ((u32 *)fhp->fh_export->ex_uuid)[3]; + break; + default: + fsid = new_encode_dev(stat->dev); break; } - *p++ = htonl((u32) stat->ino); - *p++ = htonl((u32) stat->atime.tv_sec); - *p++ = htonl(stat->atime.tv_nsec ? stat->atime.tv_nsec / 1000 : 0); + *p++ = cpu_to_be32(fsid); + + *p++ = cpu_to_be32((u32)stat->ino); + p = encode_timeval(p, &stat->atime); time = stat->mtime; - lease_get_mtime(d_inode(dentry), &time); - *p++ = htonl((u32) time.tv_sec); - *p++ = htonl(time.tv_nsec ? time.tv_nsec / 1000 : 0); - *p++ = htonl((u32) stat->ctime.tv_sec); - *p++ = htonl(stat->ctime.tv_nsec ? 
stat->ctime.tv_nsec / 1000 : 0); + lease_get_mtime(d_inode(dentry), &time); + p = encode_timeval(p, &time); + encode_timeval(p, &stat->ctime); - return p; -} - -/* Helper function for NFSv2 ACL code */ -__be32 *nfs2svc_encode_fattr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp, struct kstat *stat) -{ - return encode_fattr(rqstp, p, fhp, stat); + return true; } /* * XDR decode functions */ -int -nfssvc_decode_void(struct svc_rqst *rqstp, __be32 *p) -{ - return xdr_argsize_check(rqstp, p); -} -int -nfssvc_decode_fhandle(struct svc_rqst *rqstp, __be32 *p) +bool +nfssvc_decode_fhandleargs(struct svc_rqst *rqstp, struct xdr_stream *xdr) { struct nfsd_fhandle *args = rqstp->rq_argp; - p = decode_fh(p, &args->fh); - if (!p) - return 0; - return xdr_argsize_check(rqstp, p); + return svcxdr_decode_fhandle(xdr, &args->fh); } -int -nfssvc_decode_sattrargs(struct svc_rqst *rqstp, __be32 *p) +bool +nfssvc_decode_sattrargs(struct svc_rqst *rqstp, struct xdr_stream *xdr) { struct nfsd_sattrargs *args = rqstp->rq_argp; - p = decode_fh(p, &args->fh); - if (!p) - return 0; - p = decode_sattr(p, &args->attrs); - - return xdr_argsize_check(rqstp, p); + return svcxdr_decode_fhandle(xdr, &args->fh) && + svcxdr_decode_sattr(rqstp, xdr, &args->attrs); } -int -nfssvc_decode_diropargs(struct svc_rqst *rqstp, __be32 *p) +bool +nfssvc_decode_diropargs(struct svc_rqst *rqstp, struct xdr_stream *xdr) { struct nfsd_diropargs *args = rqstp->rq_argp; - if (!(p = decode_fh(p, &args->fh)) - || !(p = decode_filename(p, &args->name, &args->len))) - return 0; - - return xdr_argsize_check(rqstp, p); + return svcxdr_decode_diropargs(xdr, &args->fh, &args->name, &args->len); } -int -nfssvc_decode_readargs(struct svc_rqst *rqstp, __be32 *p) +bool +nfssvc_decode_readargs(struct svc_rqst *rqstp, struct xdr_stream *xdr) { struct nfsd_readargs *args = rqstp->rq_argp; - unsigned int len; - int v; - p = decode_fh(p, &args->fh); - if (!p) - return 0; - - args->offset = ntohl(*p++); - len = args->count = ntohl(*p++); - p++; /* totalcount - unused */ - - len = min_t(unsigned int, len, NFSSVC_MAXBLKSIZE_V2); - - /* set up somewhere to store response. - * We take pages, put them on reslist and include in iovec - */ - v=0; - while (len > 0) { - struct page *p = *(rqstp->rq_next_page++); - - rqstp->rq_vec[v].iov_base = page_address(p); - rqstp->rq_vec[v].iov_len = min_t(unsigned int, len, PAGE_SIZE); - len -= rqstp->rq_vec[v].iov_len; - v++; - } - args->vlen = v; - return xdr_argsize_check(rqstp, p); + u32 totalcount; + + if (!svcxdr_decode_fhandle(xdr, &args->fh)) + return false; + if (xdr_stream_decode_u32(xdr, &args->offset) < 0) + return false; + if (xdr_stream_decode_u32(xdr, &args->count) < 0) + return false; + /* totalcount is ignored */ + if (xdr_stream_decode_u32(xdr, &totalcount) < 0) + return false; + + return true; } -int -nfssvc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p) +bool +nfssvc_decode_writeargs(struct svc_rqst *rqstp, struct xdr_stream *xdr) { struct nfsd_writeargs *args = rqstp->rq_argp; - unsigned int len, hdr, dlen; - struct kvec *head = rqstp->rq_arg.head; - - p = decode_fh(p, &args->fh); - if (!p) - return 0; - - p++; /* beginoffset */ - args->offset = ntohl(*p++); /* offset */ - p++; /* totalcount */ - len = args->len = ntohl(*p++); - /* - * The protocol specifies a maximum of 8192 bytes. - */ - if (len > NFSSVC_MAXBLKSIZE_V2) - return 0; - - /* - * Check to make sure that we got the right number of - * bytes. 
- */ - hdr = (void*)p - head->iov_base; - if (hdr > head->iov_len) - return 0; - dlen = head->iov_len + rqstp->rq_arg.page_len - hdr; - - /* - * Round the length of the data which was specified up to - * the next multiple of XDR units and then compare that - * against the length which was actually received. - * Note that when RPCSEC/GSS (for example) is used, the - * data buffer can be padded so dlen might be larger - * than required. It must never be smaller. - */ - if (dlen < XDR_QUADLEN(len)*4) - return 0; - - args->first.iov_base = (void *)p; - args->first.iov_len = head->iov_len - hdr; - return 1; + u32 beginoffset, totalcount; + + if (!svcxdr_decode_fhandle(xdr, &args->fh)) + return false; + /* beginoffset is ignored */ + if (xdr_stream_decode_u32(xdr, &beginoffset) < 0) + return false; + if (xdr_stream_decode_u32(xdr, &args->offset) < 0) + return false; + /* totalcount is ignored */ + if (xdr_stream_decode_u32(xdr, &totalcount) < 0) + return false; + + /* opaque data */ + if (xdr_stream_decode_u32(xdr, &args->len) < 0) + return false; + if (args->len > NFS_MAXDATA) + return false; + + return xdr_stream_subsegment(xdr, &args->payload, args->len); } -int -nfssvc_decode_createargs(struct svc_rqst *rqstp, __be32 *p) +bool +nfssvc_decode_createargs(struct svc_rqst *rqstp, struct xdr_stream *xdr) { struct nfsd_createargs *args = rqstp->rq_argp; - if ( !(p = decode_fh(p, &args->fh)) - || !(p = decode_filename(p, &args->name, &args->len))) - return 0; - p = decode_sattr(p, &args->attrs); - - return xdr_argsize_check(rqstp, p); + return svcxdr_decode_diropargs(xdr, &args->fh, + &args->name, &args->len) && + svcxdr_decode_sattr(rqstp, xdr, &args->attrs); } -int -nfssvc_decode_renameargs(struct svc_rqst *rqstp, __be32 *p) +bool +nfssvc_decode_renameargs(struct svc_rqst *rqstp, struct xdr_stream *xdr) { struct nfsd_renameargs *args = rqstp->rq_argp; - if (!(p = decode_fh(p, &args->ffh)) - || !(p = decode_filename(p, &args->fname, &args->flen)) - || !(p = decode_fh(p, &args->tfh)) - || !(p = decode_filename(p, &args->tname, &args->tlen))) - return 0; - - return xdr_argsize_check(rqstp, p); + return svcxdr_decode_diropargs(xdr, &args->ffh, + &args->fname, &args->flen) && + svcxdr_decode_diropargs(xdr, &args->tfh, + &args->tname, &args->tlen); } -int -nfssvc_decode_readlinkargs(struct svc_rqst *rqstp, __be32 *p) -{ - struct nfsd_readlinkargs *args = rqstp->rq_argp; - - p = decode_fh(p, &args->fh); - if (!p) - return 0; - args->buffer = page_address(*(rqstp->rq_next_page++)); - - return xdr_argsize_check(rqstp, p); -} - -int -nfssvc_decode_linkargs(struct svc_rqst *rqstp, __be32 *p) +bool +nfssvc_decode_linkargs(struct svc_rqst *rqstp, struct xdr_stream *xdr) { struct nfsd_linkargs *args = rqstp->rq_argp; - if (!(p = decode_fh(p, &args->ffh)) - || !(p = decode_fh(p, &args->tfh)) - || !(p = decode_filename(p, &args->tname, &args->tlen))) - return 0; - - return xdr_argsize_check(rqstp, p); + return svcxdr_decode_fhandle(xdr, &args->ffh) && + svcxdr_decode_diropargs(xdr, &args->tfh, + &args->tname, &args->tlen); } -int -nfssvc_decode_symlinkargs(struct svc_rqst *rqstp, __be32 *p) +bool +nfssvc_decode_symlinkargs(struct svc_rqst *rqstp, struct xdr_stream *xdr) { struct nfsd_symlinkargs *args = rqstp->rq_argp; - char *base = (char *)p; - size_t xdrlen; - - if ( !(p = decode_fh(p, &args->ffh)) - || !(p = decode_filename(p, &args->fname, &args->flen))) - return 0; + struct kvec *head = rqstp->rq_arg.head; - args->tlen = ntohl(*p++); + if (!svcxdr_decode_diropargs(xdr, &args->ffh, &args->fname, 
&args->flen)) + return false; + if (xdr_stream_decode_u32(xdr, &args->tlen) < 0) + return false; if (args->tlen == 0) - return 0; + return false; - args->first.iov_base = p; - args->first.iov_len = rqstp->rq_arg.head[0].iov_len; - args->first.iov_len -= (char *)p - base; - - /* This request is never larger than a page. Therefore, - * transport will deliver either: - * 1. pathname in the pagelist -> sattr is in the tail. - * 2. everything in the head buffer -> sattr is in the head. - */ - if (rqstp->rq_arg.page_len) { - if (args->tlen != rqstp->rq_arg.page_len) - return 0; - p = rqstp->rq_arg.tail[0].iov_base; - } else { - xdrlen = XDR_QUADLEN(args->tlen); - if (xdrlen > args->first.iov_len - (8 * sizeof(__be32))) - return 0; - p += xdrlen; - } - decode_sattr(p, &args->attrs); - - return 1; + args->first.iov_len = head->iov_len - xdr_stream_pos(xdr); + args->first.iov_base = xdr_inline_decode(xdr, args->tlen); + if (!args->first.iov_base) + return false; + return svcxdr_decode_sattr(rqstp, xdr, &args->attrs); } -int -nfssvc_decode_readdirargs(struct svc_rqst *rqstp, __be32 *p) +bool +nfssvc_decode_readdirargs(struct svc_rqst *rqstp, struct xdr_stream *xdr) { struct nfsd_readdirargs *args = rqstp->rq_argp; - p = decode_fh(p, &args->fh); - if (!p) - return 0; - args->cookie = ntohl(*p++); - args->count = ntohl(*p++); - args->count = min_t(u32, args->count, PAGE_SIZE); - args->buffer = page_address(*(rqstp->rq_next_page++)); + if (!svcxdr_decode_fhandle(xdr, &args->fh)) + return false; + if (xdr_stream_decode_u32(xdr, &args->cookie) < 0) + return false; + if (xdr_stream_decode_u32(xdr, &args->count) < 0) + return false; - return xdr_argsize_check(rqstp, p); + return true; } /* * XDR encode functions */ -int -nfssvc_encode_void(struct svc_rqst *rqstp, __be32 *p) + +bool +nfssvc_encode_statres(struct svc_rqst *rqstp, struct xdr_stream *xdr) { - return xdr_ressize_check(rqstp, p); + struct nfsd_stat *resp = rqstp->rq_resp; + + return svcxdr_encode_stat(xdr, resp->status); } -int -nfssvc_encode_attrstat(struct svc_rqst *rqstp, __be32 *p) +bool +nfssvc_encode_attrstatres(struct svc_rqst *rqstp, struct xdr_stream *xdr) { struct nfsd_attrstat *resp = rqstp->rq_resp; - p = encode_fattr(rqstp, p, &resp->fh, &resp->stat); - return xdr_ressize_check(rqstp, p); + if (!svcxdr_encode_stat(xdr, resp->status)) + return false; + switch (resp->status) { + case nfs_ok: + if (!svcxdr_encode_fattr(rqstp, xdr, &resp->fh, &resp->stat)) + return false; + break; + } + + return true; } -int -nfssvc_encode_diropres(struct svc_rqst *rqstp, __be32 *p) +bool +nfssvc_encode_diropres(struct svc_rqst *rqstp, struct xdr_stream *xdr) { struct nfsd_diropres *resp = rqstp->rq_resp; - p = encode_fh(p, &resp->fh); - p = encode_fattr(rqstp, p, &resp->fh, &resp->stat); - return xdr_ressize_check(rqstp, p); + if (!svcxdr_encode_stat(xdr, resp->status)) + return false; + switch (resp->status) { + case nfs_ok: + if (!svcxdr_encode_fhandle(xdr, &resp->fh)) + return false; + if (!svcxdr_encode_fattr(rqstp, xdr, &resp->fh, &resp->stat)) + return false; + break; + } + + return true; } -int -nfssvc_encode_readlinkres(struct svc_rqst *rqstp, __be32 *p) +bool +nfssvc_encode_readlinkres(struct svc_rqst *rqstp, struct xdr_stream *xdr) { struct nfsd_readlinkres *resp = rqstp->rq_resp; - - *p++ = htonl(resp->len); - xdr_ressize_check(rqstp, p); - rqstp->rq_res.page_len = resp->len; - if (resp->len & 3) { - /* need to pad the tail */ - rqstp->rq_res.tail[0].iov_base = p; - *p = 0; - rqstp->rq_res.tail[0].iov_len = 4 - (resp->len&3); + struct kvec 
*head = rqstp->rq_res.head; + + if (!svcxdr_encode_stat(xdr, resp->status)) + return false; + switch (resp->status) { + case nfs_ok: + if (xdr_stream_encode_u32(xdr, resp->len) < 0) + return false; + svcxdr_encode_opaque_pages(rqstp, xdr, &resp->page, 0, + resp->len); + if (svc_encode_result_payload(rqstp, head->iov_len, resp->len) < 0) + return false; + break; } - return 1; + + return true; } -int -nfssvc_encode_readres(struct svc_rqst *rqstp, __be32 *p) +bool +nfssvc_encode_readres(struct svc_rqst *rqstp, struct xdr_stream *xdr) { struct nfsd_readres *resp = rqstp->rq_resp; - - p = encode_fattr(rqstp, p, &resp->fh, &resp->stat); - *p++ = htonl(resp->count); - xdr_ressize_check(rqstp, p); - - /* now update rqstp->rq_res to reflect data as well */ - rqstp->rq_res.page_len = resp->count; - if (resp->count & 3) { - /* need to pad the tail */ - rqstp->rq_res.tail[0].iov_base = p; - *p = 0; - rqstp->rq_res.tail[0].iov_len = 4 - (resp->count&3); + struct kvec *head = rqstp->rq_res.head; + + if (!svcxdr_encode_stat(xdr, resp->status)) + return false; + switch (resp->status) { + case nfs_ok: + if (!svcxdr_encode_fattr(rqstp, xdr, &resp->fh, &resp->stat)) + return false; + if (xdr_stream_encode_u32(xdr, resp->count) < 0) + return false; + svcxdr_encode_opaque_pages(rqstp, xdr, resp->pages, + rqstp->rq_res.page_base, + resp->count); + if (svc_encode_result_payload(rqstp, head->iov_len, resp->count) < 0) + return false; + break; } - return 1; + + return true; } -int -nfssvc_encode_readdirres(struct svc_rqst *rqstp, __be32 *p) +bool +nfssvc_encode_readdirres(struct svc_rqst *rqstp, struct xdr_stream *xdr) { struct nfsd_readdirres *resp = rqstp->rq_resp; + struct xdr_buf *dirlist = &resp->dirlist; + + if (!svcxdr_encode_stat(xdr, resp->status)) + return false; + switch (resp->status) { + case nfs_ok: + svcxdr_encode_opaque_pages(rqstp, xdr, dirlist->pages, 0, + dirlist->len); + /* no more entries */ + if (xdr_stream_encode_item_absent(xdr) < 0) + return false; + if (xdr_stream_encode_bool(xdr, resp->common.err == nfserr_eof) < 0) + return false; + break; + } - xdr_ressize_check(rqstp, p); - p = resp->buffer; - *p++ = 0; /* no more entries */ - *p++ = htonl((resp->common.err == nfserr_eof)); - rqstp->rq_res.page_len = (((unsigned long)p-1) & ~PAGE_MASK)+1; - - return 1; + return true; } -int -nfssvc_encode_statfsres(struct svc_rqst *rqstp, __be32 *p) +bool +nfssvc_encode_statfsres(struct svc_rqst *rqstp, struct xdr_stream *xdr) { struct nfsd_statfsres *resp = rqstp->rq_resp; struct kstatfs *stat = &resp->stats; + __be32 *p; + + if (!svcxdr_encode_stat(xdr, resp->status)) + return false; + switch (resp->status) { + case nfs_ok: + p = xdr_reserve_space(xdr, XDR_UNIT * 5); + if (!p) + return false; + *p++ = cpu_to_be32(NFS_MAXDATA); + *p++ = cpu_to_be32(stat->f_bsize); + *p++ = cpu_to_be32(stat->f_blocks); + *p++ = cpu_to_be32(stat->f_bfree); + *p = cpu_to_be32(stat->f_bavail); + break; + } - *p++ = htonl(NFSSVC_MAXBLKSIZE_V2); /* max transfer size */ - *p++ = htonl(stat->f_bsize); - *p++ = htonl(stat->f_blocks); - *p++ = htonl(stat->f_bfree); - *p++ = htonl(stat->f_bavail); - return xdr_ressize_check(rqstp, p); + return true; } -int -nfssvc_encode_entry(void *ccdv, const char *name, - int namlen, loff_t offset, u64 ino, unsigned int d_type) +/** + * nfssvc_encode_nfscookie - Encode a directory offset cookie + * @resp: readdir result context + * @offset: offset cookie to encode + * + * The buffer space for the offset cookie has already been reserved + * by svcxdr_encode_entry_common(). 
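+ * If @resp->cookie_offset is zero, no entry is currently awaiting its
+ * cookie and this call returns without writing anything; otherwise the
+ * reserved word is overwritten in place and the offset is cleared.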
+ */ +void nfssvc_encode_nfscookie(struct nfsd_readdirres *resp, u32 offset) { - struct readdir_cd *ccd = ccdv; - struct nfsd_readdirres *cd = container_of(ccd, struct nfsd_readdirres, common); - __be32 *p = cd->buffer; - int buflen, slen; + __be32 cookie = cpu_to_be32(offset); - /* - dprintk("nfsd: entry(%.*s off %ld ino %ld)\n", - namlen, name, offset, ino); - */ + if (!resp->cookie_offset) + return; - if (offset > ~((u32) 0)) { - cd->common.err = nfserr_fbig; - return -EINVAL; - } - if (cd->offset) - *cd->offset = htonl(offset); + write_bytes_to_xdr_buf(&resp->dirlist, resp->cookie_offset, &cookie, + sizeof(cookie)); + resp->cookie_offset = 0; +} - /* truncate filename */ - namlen = min(namlen, NFS2_MAXNAMLEN); - slen = XDR_QUADLEN(namlen); +static bool +svcxdr_encode_entry_common(struct nfsd_readdirres *resp, const char *name, + int namlen, loff_t offset, u64 ino) +{ + struct xdr_buf *dirlist = &resp->dirlist; + struct xdr_stream *xdr = &resp->xdr; + + if (xdr_stream_encode_item_present(xdr) < 0) + return false; + /* fileid */ + if (xdr_stream_encode_u32(xdr, (u32)ino) < 0) + return false; + /* name */ + if (xdr_stream_encode_opaque(xdr, name, min(namlen, NFS2_MAXNAMLEN)) < 0) + return false; + /* cookie */ + resp->cookie_offset = dirlist->len; + if (xdr_stream_encode_u32(xdr, ~0U) < 0) + return false; + + return true; +} - if ((buflen = cd->buflen - slen - 4) < 0) { - cd->common.err = nfserr_toosmall; - return -EINVAL; - } - if (ino > ~((u32) 0)) { - cd->common.err = nfserr_fbig; - return -EINVAL; - } - *p++ = xdr_one; /* mark entry present */ - *p++ = htonl((u32) ino); /* file id */ - p = xdr_encode_array(p, name, namlen);/* name length & name */ - cd->offset = p; /* remember pointer */ - *p++ = htonl(~0U); /* offset of next entry */ - - cd->buflen = buflen; - cd->buffer = p; - cd->common.err = nfs_ok; +/** + * nfssvc_encode_entry - encode one NFSv2 READDIR entry + * @data: directory context + * @name: name of the object to be encoded + * @namlen: length of that name, in bytes + * @offset: the offset of the previous entry + * @ino: the fileid of this entry + * @d_type: unused + * + * Return values: + * %0: Entry was successfully encoded. 
+ * %-EINVAL: An encoding problem occured, secondary status code in resp->common.err + * + * On exit, the following fields are updated: + * - resp->xdr + * - resp->common.err + * - resp->cookie_offset + */ +int nfssvc_encode_entry(void *data, const char *name, int namlen, + loff_t offset, u64 ino, unsigned int d_type) +{ + struct readdir_cd *ccd = data; + struct nfsd_readdirres *resp = container_of(ccd, + struct nfsd_readdirres, + common); + unsigned int starting_length = resp->dirlist.len; + + /* The offset cookie for the previous entry */ + nfssvc_encode_nfscookie(resp, offset); + + if (!svcxdr_encode_entry_common(resp, name, namlen, offset, ino)) + goto out_toosmall; + + xdr_commit_encode(&resp->xdr); + resp->common.err = nfs_ok; return 0; + +out_toosmall: + resp->cookie_offset = 0; + resp->common.err = nfserr_toosmall; + resp->dirlist.len = starting_length; + return -EINVAL; } /* * XDR release functions */ -void -nfssvc_release_fhandle(struct svc_rqst *rqstp) +void nfssvc_release_attrstat(struct svc_rqst *rqstp) +{ + struct nfsd_attrstat *resp = rqstp->rq_resp; + + fh_put(&resp->fh); +} + +void nfssvc_release_diropres(struct svc_rqst *rqstp) { - struct nfsd_fhandle *resp = rqstp->rq_resp; + struct nfsd_diropres *resp = rqstp->rq_resp; + + fh_put(&resp->fh); +} + +void nfssvc_release_readres(struct svc_rqst *rqstp) +{ + struct nfsd_readres *resp = rqstp->rq_resp; fh_put(&resp->fh); } diff --git a/fs/nfsd/pnfs.h b/fs/nfsd/pnfs.h index 4f4282d4eeca..db9af780438b 100644 --- a/fs/nfsd/pnfs.h +++ b/fs/nfsd/pnfs.h @@ -27,17 +27,19 @@ struct nfsd4_layout_ops { struct nfs4_client *clp, struct nfsd4_getdeviceinfo *gdevp); __be32 (*encode_getdeviceinfo)(struct xdr_stream *xdr, - struct nfsd4_getdeviceinfo *gdevp); + const struct nfsd4_getdeviceinfo *gdevp); - __be32 (*proc_layoutget)(struct inode *, const struct svc_fh *fhp, - struct nfsd4_layoutget *lgp); - __be32 (*encode_layoutget)(struct xdr_stream *, - struct nfsd4_layoutget *lgp); + __be32 (*proc_layoutget)(struct svc_rqst *rqstp, struct inode *inode, + const struct svc_fh *fhp, struct nfsd4_layoutget *lgp); + __be32 (*encode_layoutget)(struct xdr_stream *xdr, + const struct nfsd4_layoutget *lgp); __be32 (*proc_layoutcommit)(struct inode *inode, + struct svc_rqst *rqstp, struct nfsd4_layoutcommit *lcp); - void (*fence_client)(struct nfs4_layout_stateid *ls); + void (*fence_client)(struct nfs4_layout_stateid *ls, + struct nfsd_file *file); }; extern const struct nfsd4_layout_ops *nfsd4_layout_ops[]; @@ -72,11 +74,13 @@ void nfsd4_setup_layout_type(struct svc_export *exp); void nfsd4_return_all_client_layouts(struct nfs4_client *); void nfsd4_return_all_file_layouts(struct nfs4_client *clp, struct nfs4_file *fp); +void nfsd4_close_layout(struct nfs4_layout_stateid *ls); int nfsd4_init_pnfs(void); void nfsd4_exit_pnfs(void); #else struct nfs4_client; struct nfs4_file; +struct nfs4_layout_stateid; static inline void nfsd4_setup_layout_type(struct svc_export *exp) { @@ -89,6 +93,9 @@ static inline void nfsd4_return_all_file_layouts(struct nfs4_client *clp, struct nfs4_file *fp) { } +static inline void nfsd4_close_layout(struct nfs4_layout_stateid *ls) +{ +} static inline void nfsd4_exit_pnfs(void) { } diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 396c76755b03..b052c1effdc5 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -35,10 +35,12 @@ #ifndef _NFSD4_STATE_H #define _NFSD4_STATE_H +#include <crypto/md5.h> #include <linux/idr.h> #include <linux/refcount.h> #include <linux/sunrpc/svc_xprt.h> #include "nfsfh.h" +#include "nfsd.h" 
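The <crypto/md5.h> include added above presumably backs the HEXDIR_LEN definition later in this header (hex of the 16-byte MD5 digest of cl_name, plus a terminating NUL). A minimal sketch of just that hex step, assuming a hypothetical helper name (nfsd4_hexname) that is not part of this patch:

/* Illustrative only: hex-encode a 16-byte digest into the 33-byte
 * (2 * MD5_DIGEST_SIZE + 1) buffer that HEXDIR_LEN implies.
 */
#include <stdio.h>

#define MD5_DIGEST_SIZE	16
#define HEXDIR_LEN	(2 * MD5_DIGEST_SIZE + 1)

static void nfsd4_hexname(const unsigned char digest[MD5_DIGEST_SIZE],
			  char name[HEXDIR_LEN])
{
	int i;

	for (i = 0; i < MD5_DIGEST_SIZE; i++)
		sprintf(name + 2 * i, "%02x", digest[i]);
	name[2 * MD5_DIGEST_SIZE] = '\0';	/* 32 hex chars + NUL = 33 */
}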
typedef struct { u32 cl_boot; @@ -55,27 +57,51 @@ typedef struct { stateid_opaque_t si_opaque; } stateid_t; -#define STATEID_FMT "(%08x/%08x/%08x/%08x)" -#define STATEID_VAL(s) \ - (s)->si_opaque.so_clid.cl_boot, \ - (s)->si_opaque.so_clid.cl_id, \ - (s)->si_opaque.so_id, \ - (s)->si_generation +typedef struct { + stateid_t cs_stid; +#define NFS4_COPY_STID 1 +#define NFS4_COPYNOTIFY_STID 2 + unsigned char cs_type; + refcount_t cs_count; +} copy_stateid_t; + +struct nfsd4_referring_call { + struct list_head __list; + + u32 rc_sequenceid; + u32 rc_slotid; +}; + +struct nfsd4_referring_call_list { + struct list_head __list; + + struct nfs4_sessionid rcl_sessionid; + int __nr_referring_calls; + struct list_head rcl_referring_calls; +}; struct nfsd4_callback { struct nfs4_client *cb_clp; struct rpc_message cb_msg; +#define NFSD4_CALLBACK_RUNNING (0) +#define NFSD4_CALLBACK_WAKE (1) +#define NFSD4_CALLBACK_REQUEUE (2) + unsigned long cb_flags; const struct nfsd4_callback_ops *cb_ops; struct work_struct cb_work; int cb_seq_status; int cb_status; - bool cb_need_restart; + int cb_held_slot; + + int cb_nr_referring_call_list; + struct list_head cb_referring_call_list; }; struct nfsd4_callback_ops { void (*prepare)(struct nfsd4_callback *); int (*done)(struct nfsd4_callback *, struct rpc_task *); void (*release)(struct nfsd4_callback *); + uint32_t opcode; }; /* @@ -85,16 +111,36 @@ struct nfsd4_callback_ops { */ struct nfs4_stid { refcount_t sc_count; -#define NFS4_OPEN_STID 1 -#define NFS4_LOCK_STID 2 -#define NFS4_DELEG_STID 4 -/* For an open stateid kept around *only* to process close replays: */ -#define NFS4_CLOSED_STID 8 + + /* A new stateid is added to the cl_stateids idr early before it + * is fully initialised. Its sc_type is then zero. After + * initialisation the sc_type it set under cl_lock, and then + * never changes. + */ +#define SC_TYPE_OPEN BIT(0) +#define SC_TYPE_LOCK BIT(1) +#define SC_TYPE_DELEG BIT(2) +#define SC_TYPE_LAYOUT BIT(3) + unsigned short sc_type; + +/* state_lock protects sc_status for delegation stateids. + * ->cl_lock protects sc_status for open and lock stateids. + * ->st_mutex also protect sc_status for open stateids. + * ->ls_lock protects sc_status for layout stateids. + */ +/* + * For an open stateid kept around *only* to process close replays. + * For deleg stateid, kept in idr until last reference is dropped. + */ +#define SC_STATUS_CLOSED BIT(0) /* For a deleg stateid kept around only to process free_stateid's: */ -#define NFS4_REVOKED_DELEG_STID 16 -#define NFS4_CLOSED_DELEG_STID 32 -#define NFS4_LAYOUT_STID 64 - unsigned char sc_type; +#define SC_STATUS_REVOKED BIT(1) +#define SC_STATUS_ADMIN_REVOKED BIT(2) +#define SC_STATUS_FREEABLE BIT(3) +#define SC_STATUS_FREED BIT(4) + unsigned short sc_status; + + struct list_head sc_cp_list; stateid_t sc_stateid; spinlock_t sc_lock; struct nfs4_client *sc_client; @@ -102,6 +148,47 @@ struct nfs4_stid { void (*sc_free)(struct nfs4_stid *); }; +/* Keep a list of stateids issued by the COPY_NOTIFY, associate it with the + * parent OPEN/LOCK/DELEG stateid. 
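+ * Each entry is linked on the parent stid's sc_cp_list via cp_list, and
+ * cpntf_time records the last time the stateid was used so that stale
+ * copy-notify state can be aged out.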
+ */ +struct nfs4_cpntf_state { + copy_stateid_t cp_stateid; + struct list_head cp_list; /* per parent nfs4_stid */ + stateid_t cp_p_stateid; /* copy of parent's stateid */ + clientid_t cp_p_clid; /* copy of parent's clid */ + time64_t cpntf_time; /* last time stateid used */ +}; + +/* + * RFC 7862 Section 4.8 states: + * + * | A copy offload stateid will be valid until either (A) the client + * | or server restarts or (B) the client returns the resource by + * | issuing an OFFLOAD_CANCEL operation or the client replies to a + * | CB_OFFLOAD operation. + * + * Because a client might not reply to a CB_OFFLOAD, or a reply + * might get lost due to connection loss, NFSD purges async copy + * state after a short period to prevent it from accumulating + * over time. + */ +#define NFSD_COPY_INITIAL_TTL 10 + +struct nfs4_cb_fattr { + struct nfsd4_callback ncf_getattr; + u32 ncf_cb_status; + + /* from CB_GETATTR reply */ + u64 ncf_cb_change; + u64 ncf_cb_fsize; + struct timespec64 ncf_cb_mtime; + struct timespec64 ncf_cb_atime; + + bool ncf_file_modified; + u64 ncf_initial_cinfo; + u64 ncf_cur_fsize; +}; + /* * Represents a delegation stateid. The nfs4_client holds references to these * and they are put when it is being destroyed or when the delegation is @@ -129,13 +216,43 @@ struct nfs4_delegation { struct list_head dl_perclnt; struct list_head dl_recall_lru; /* delegation recalled */ struct nfs4_clnt_odstate *dl_clnt_odstate; + time64_t dl_time; u32 dl_type; - time_t dl_time; -/* For recall: */ + /* For recall: */ int dl_retries; struct nfsd4_callback dl_recall; + bool dl_recalled; + bool dl_written; + bool dl_setattr; + + /* for CB_GETATTR */ + struct nfs4_cb_fattr dl_cb_fattr; + + /* For delegated timestamps */ + struct timespec64 dl_atime; + struct timespec64 dl_mtime; + struct timespec64 dl_ctime; }; +static inline bool deleg_is_read(u32 dl_type) +{ + return (dl_type == OPEN_DELEGATE_READ || dl_type == OPEN_DELEGATE_READ_ATTRS_DELEG); +} + +static inline bool deleg_is_write(u32 dl_type) +{ + return (dl_type == OPEN_DELEGATE_WRITE || dl_type == OPEN_DELEGATE_WRITE_ATTRS_DELEG); +} + +static inline bool deleg_attrs_deleg(u32 dl_type) +{ + return dl_type == OPEN_DELEGATE_READ_ATTRS_DELEG || + dl_type == OPEN_DELEGATE_WRITE_ATTRS_DELEG; +} + +bool nfsd4_vet_deleg_time(struct timespec64 *cb, const struct timespec64 *orig, + const struct timespec64 *now); + #define cb_to_delegation(cb) \ container_of(cb, struct nfs4_delegation, dl_recall) @@ -156,10 +273,11 @@ static inline struct nfs4_delegation *delegstateid(struct nfs4_stid *s) return container_of(s, struct nfs4_delegation, dl_stid); } -/* Maximum number of slots per session. 160 is useful for long haul TCP */ -#define NFSD_MAX_SLOTS_PER_SESSION 160 -/* Maximum number of operations per session compound */ -#define NFSD_MAX_OPS_PER_COMPOUND 16 +/* Maximum number of slots per session. This is for sanity-check only. + * It could be increased if we had a mechanism to shutdown misbehaving clients. + * A large number can be needed to get good throughput on high-latency servers. 
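+ * For example, with 1 MB requests and a 50 ms round trip, keeping a
+ * 2 GB/s link busy already requires on the order of 100 slots in
+ * flight (bandwidth-delay product divided by request size).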
+ */ +#define NFSD_MAX_SLOTS_PER_SESSION 2048 /* Maximum session per slot cache size */ #define NFSD_SLOT_CACHE_SIZE 2048 /* Maximum number of NFSD_SLOT_CACHE_SIZE slots per session */ @@ -171,12 +289,15 @@ struct nfsd4_slot { u32 sl_seqid; __be32 sl_status; struct svc_cred sl_cred; + u32 sl_index; u32 sl_datalen; u16 sl_opcnt; + u16 sl_generation; #define NFSD4_SLOT_INUSE (1 << 0) #define NFSD4_SLOT_CACHETHIS (1 << 1) #define NFSD4_SLOT_INITIALIZED (1 << 2) #define NFSD4_SLOT_CACHED (1 << 3) +#define NFSD4_SLOT_REUSED (1 << 4) u8 sl_flags; char sl_data[]; }; @@ -235,6 +356,9 @@ struct nfsd4_conn { unsigned char cn_flags; }; +/* Maximum number of slots that nfsd will use in the backchannel */ +#define NFSD_BC_SLOT_TABLE_SIZE (sizeof(u32) * 8) + /* * Representation of a v4.1+ session. These are refcounted in a similar fashion * to the nfs4_client. References are only taken when the server is actively @@ -242,20 +366,23 @@ struct nfsd4_conn { */ struct nfsd4_session { atomic_t se_ref; + spinlock_t se_lock; + u32 se_cb_slot_avail; /* bitmap of available slots */ + u32 se_cb_highest_slot; /* highest slot client wants */ + u32 se_cb_prog; struct list_head se_hash; /* hash by sessionid */ struct list_head se_perclnt; -/* See SESSION4_PERSIST, etc. for standard flags; this is internal-only: */ -#define NFS4_SESSION_DEAD 0x010 - u32 se_flags; + struct list_head se_all_sessions;/* global list of sessions */ struct nfs4_client *se_client; struct nfs4_sessionid se_sessionid; struct nfsd4_channel_attrs se_fchannel; - struct nfsd4_channel_attrs se_bchannel; struct nfsd4_cb_sec se_cb_sec; struct list_head se_conns; - u32 se_cb_prog; - u32 se_cb_seq_nr; - struct nfsd4_slot *se_slots[]; /* forward channel slots */ + u32 se_cb_seq_nr[NFSD_BC_SLOT_TABLE_SIZE]; + struct xarray se_slots; /* forward channel slots */ + u16 se_slot_gen; + bool se_dead; + u32 se_target_maxslots; }; /* formatted contents of nfs4_sessionid */ @@ -265,7 +392,30 @@ struct nfsd4_sessionid { u32 reserved; }; -#define HEXDIR_LEN 33 /* hex version of 16 byte md5 of cl_name plus '\0' */ +/* Length of MD5 digest as hex, plus terminating '\0' */ +#define HEXDIR_LEN (2 * MD5_DIGEST_SIZE + 1) + +/* + * State Meaning Where set + * -------------------------------------------------------------------------- + * | NFSD4_ACTIVE | Confirmed, active | Default | + * |------------------- ----------------------------------------------------| + * | NFSD4_COURTESY | Courtesy state. | nfs4_get_client_reaplist | + * | | Lease/lock/share | | + * | | reservation conflict | | + * | | can cause Courtesy | | + * | | client to be expired | | + * |------------------------------------------------------------------------| + * | NFSD4_EXPIRABLE | Courtesy client to be| nfs4_laundromat | + * | | expired by Laundromat| try_to_expire_client | + * | | due to conflict | | + * |------------------------------------------------------------------------| + */ +enum { + NFSD4_ACTIVE = 0, + NFSD4_COURTESY, + NFSD4_EXPIRABLE, +}; /* * struct nfs4_client - one per client. Clientids live here. @@ -281,8 +431,9 @@ struct nfsd4_sessionid { * 0. If they are not renewed within a lease period, they become eligible for * destruction by the laundromat. * - * These objects can also be destroyed prematurely by the fault injection code, - * or if the client sends certain forms of SETCLIENTID or EXCHANGE_ID updates. + * These objects can also be destroyed if the client sends certain forms of + * SETCLIENTID or EXCHANGE_ID operations. 
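+ * (one such form is an EXCHANGE_ID that presents a new boot verifier
+ * for an existing client identity, which replaces the old client record).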
+ * * Care is taken *not* to do this however when the objects have an elevated * refcount. * @@ -290,7 +441,7 @@ struct nfsd4_sessionid { * * o Each nfs4_clients is also hashed by name (the opaque quantity initially * sent by the client to identify itself). - * + * * o cl_perclient list is used to ensure no dangling stateowner references * when we expire the nfs4_client */ @@ -308,13 +459,18 @@ struct nfs4_client { #endif struct xdr_netobj cl_name; /* id generated by client */ nfs4_verifier cl_verifier; /* generated by client */ - time_t cl_time; /* time of last lease renewal */ + time64_t cl_time; /* time of last lease renewal */ struct sockaddr_storage cl_addr; /* client ipaddress */ bool cl_mach_cred; /* SP4_MACH_CRED in force */ struct svc_cred cl_cred; /* setclientid principal */ clientid_t cl_clientid; /* generated by server */ nfs4_verifier cl_confirm; /* generated by server */ u32 cl_minorversion; + atomic_t cl_admin_revoked; /* count of admin-revoked states */ + /* NFSv4.1 client implementation id: */ + struct xdr_netobj cl_nii_domain; + struct xdr_netobj cl_nii_name; + struct timespec64 cl_nii_time; /* for v4.0 and v4.1 callbacks: */ struct nfs4_cb_conn cl_cb_conn; @@ -327,6 +483,8 @@ struct nfs4_client { #define NFSD4_CLIENT_CB_FLAG_MASK (1 << NFSD4_CLIENT_CB_UPDATE | \ 1 << NFSD4_CLIENT_CB_KILL) unsigned long cl_flags; + + struct workqueue_struct *cl_callback_wq; const struct cred *cl_cb_cred; struct rpc_clnt *cl_cb_client; u32 cl_cb_ident; @@ -346,17 +504,29 @@ struct nfs4_client { struct nfsd4_clid_slot cl_cs_slot; /* create_session slot */ u32 cl_exchange_flags; /* number of rpc's in progress over an associated session: */ - atomic_t cl_refcount; + atomic_t cl_rpc_users; + struct nfsdfs_client cl_nfsdfs; struct nfs4_op_map cl_spo_must_allow; - /* for nfs41 callbacks */ - /* We currently support a single back channel with a single slot */ - unsigned long cl_cb_slot_busy; + /* debugging info directory under nfsd/clients/ : */ + struct dentry *cl_nfsd_dentry; + /* 'info' file within that directory. 
Ref is not counted, + * but will remain valid iff cl_nfsd_dentry != NULL + */ + struct dentry *cl_nfsd_info_dentry; + struct rpc_wait_queue cl_cb_waitq; /* backchannel callers may */ /* wait here for slots */ struct net *net; struct list_head async_copies; /* list of async copies */ spinlock_t async_lock; /* lock for async copies */ + atomic_t cl_cb_inflight; /* Outstanding callbacks */ + + unsigned int cl_state; + atomic_t cl_delegs_in_recall; + + struct nfsd4_cb_recall_any *cl_ra; + time64_t cl_ra_time; }; /* struct nfs4_client_reset @@ -367,7 +537,8 @@ struct nfs4_client { struct nfs4_client_reclaim { struct list_head cr_strhash; /* hash by cr_name */ struct nfs4_client *cr_clp; /* pointer to associated clp */ - char cr_recdir[HEXDIR_LEN]; /* recover dir */ + struct xdr_netobj cr_name; /* recovery dir name */ + struct xdr_netobj cr_princhash; }; /* A reasonable value for REPLAY_ISIZE was estimated as follows: @@ -388,7 +559,7 @@ struct nfs4_replay { unsigned int rp_buflen; char *rp_buf; struct knfsd_fh rp_openfh; - struct mutex rp_mutex; + int rp_locked; char rp_ibuf[NFSD4_REPLAY_ISIZE]; }; @@ -437,7 +608,7 @@ struct nfs4_openowner { */ struct list_head oo_close_lru; struct nfs4_ol_stateid *oo_last_closed_stid; - time_t oo_time; /* time of placement on so_close_lru */ + time64_t oo_time; /* time of placement on so_close_lru */ #define NFS4_OO_CONFIRMED 1 unsigned char oo_flags; }; @@ -481,14 +652,13 @@ struct nfs4_clnt_odstate { * inode can have multiple filehandles associated with it, so there is * (potentially) a many to one relationship between this struct and struct * inode. - * - * These are hashed by filehandle in the file_hashtbl, which is protected by - * the global state_lock spinlock. */ struct nfs4_file { refcount_t fi_ref; + struct inode * fi_inode; + bool fi_aliased; spinlock_t fi_lock; - struct hlist_node fi_hash; /* hash on fi_fhandle */ + struct rhlist_head fi_rlist; struct list_head fi_stateids; union { struct list_head fi_delegations; @@ -496,7 +666,7 @@ struct nfs4_file { }; struct list_head fi_clnt_odstate; /* One each for O_RDONLY, O_WRONLY, O_RDWR: */ - struct file * fi_fds[3]; + struct nfsd_file *fi_fds[3]; /* * Each open or lock stateid contributes 0-4 to the counts * below depending on which bits are set in st_access_bitmap: @@ -506,7 +676,8 @@ struct nfs4_file { */ atomic_t fi_access[2]; u32 fi_share_deny; - struct file *fi_deleg_file; + struct nfsd_file *fi_deleg_file; + struct nfsd_file *fi_rdeleg_file; int fi_delegees; struct knfsd_fh fi_fhandle; bool fi_had_conflict; @@ -537,6 +708,10 @@ struct nfs4_ol_stateid { struct list_head st_locks; struct nfs4_stateowner *st_stateowner; struct nfs4_clnt_odstate *st_clnt_odstate; +/* + * These bitmasks use 3 separate bits for READ, ALLOW, and BOTH; see the + * comment above bmap_to_share_mode() for explanation: + */ unsigned char st_access_bmap; unsigned char st_deny_bmap; struct nfs4_ol_stateid *st_openstp; @@ -555,7 +730,7 @@ struct nfs4_layout_stateid { spinlock_t ls_lock; struct list_head ls_layouts; u32 ls_layout_type; - struct file *ls_file; + struct nfsd_file *ls_file; struct nfsd4_callback ls_recall; stateid_t ls_recall_sid; bool ls_recalled; @@ -578,6 +753,8 @@ enum nfsd4_cb_op { NFSPROC4_CLNT_CB_OFFLOAD, NFSPROC4_CLNT_CB_SEQUENCE, NFSPROC4_CLNT_CB_NOTIFY_LOCK, + NFSPROC4_CLNT_CB_RECALL_ANY, + NFSPROC4_CLNT_CB_GETATTR, }; /* Returns true iff a is later than b: */ @@ -594,10 +771,11 @@ static inline bool nfsd4_stateid_generation_after(stateid_t *a, stateid_t *b) struct nfsd4_blocked_lock { struct list_head 
nbl_list; struct list_head nbl_lru; - unsigned long nbl_time; + time64_t nbl_time; struct file_lock nbl_lock; struct knfsd_fh nbl_fh; struct nfsd4_callback nbl_cb; + struct kref nbl_kref; }; struct nfsd4_compound_state; @@ -606,48 +784,69 @@ struct nfsd4_copy; extern __be32 nfs4_preprocess_stateid_op(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct svc_fh *fhp, - stateid_t *stateid, int flags, struct file **filp, bool *tmp_file); + stateid_t *stateid, int flags, struct nfsd_file **filp, + struct nfs4_stid **cstid); __be32 nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate, - stateid_t *stateid, unsigned char typemask, - struct nfs4_stid **s, struct nfsd_net *nn); + stateid_t *stateid, unsigned short typemask, + unsigned short statusmask, + struct nfs4_stid **s, struct nfsd_net *nn); struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl, struct kmem_cache *slab, void (*sc_free)(struct nfs4_stid *)); -int nfs4_init_cp_state(struct nfsd_net *nn, struct nfsd4_copy *copy); -void nfs4_free_cp_state(struct nfsd4_copy *copy); -void nfs4_unhash_stid(struct nfs4_stid *s); +int nfs4_init_copy_state(struct nfsd_net *nn, struct nfsd4_copy *copy); +void nfs4_free_copy_state(struct nfsd4_copy *copy); +struct nfs4_cpntf_state *nfs4_alloc_init_cpntf_state(struct nfsd_net *nn, + struct nfs4_stid *p_stid); void nfs4_put_stid(struct nfs4_stid *s); void nfs4_inc_and_copy_stateid(stateid_t *dst, struct nfs4_stid *stid); void nfs4_remove_reclaim_record(struct nfs4_client_reclaim *, struct nfsd_net *); extern void nfs4_release_reclaim(struct nfsd_net *); -extern struct nfs4_client_reclaim *nfsd4_find_reclaim_client(const char *recdir, +extern struct nfs4_client_reclaim *nfsd4_find_reclaim_client(struct xdr_netobj name, struct nfsd_net *nn); -extern __be32 nfs4_check_open_reclaim(clientid_t *clid, - struct nfsd4_compound_state *cstate, struct nfsd_net *nn); +extern __be32 nfs4_check_open_reclaim(struct nfs4_client *); extern void nfsd4_probe_callback(struct nfs4_client *clp); extern void nfsd4_probe_callback_sync(struct nfs4_client *clp); extern void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *); +extern void nfsd41_cb_referring_call(struct nfsd4_callback *cb, + struct nfs4_sessionid *sessionid, + u32 slotid, u32 seqno); +extern void nfsd41_cb_destroy_referring_call_list(struct nfsd4_callback *cb); extern void nfsd4_init_cb(struct nfsd4_callback *cb, struct nfs4_client *clp, const struct nfsd4_callback_ops *ops, enum nfsd4_cb_op op); -extern void nfsd4_run_cb(struct nfsd4_callback *cb); -extern int nfsd4_create_callback_queue(void); -extern void nfsd4_destroy_callback_queue(void); +extern bool nfsd4_run_cb(struct nfsd4_callback *cb); + +static inline void nfsd4_try_run_cb(struct nfsd4_callback *cb) +{ + if (!test_and_set_bit(NFSD4_CALLBACK_RUNNING, &cb->cb_flags)) + WARN_ON_ONCE(!nfsd4_run_cb(cb)); +} + extern void nfsd4_shutdown_callback(struct nfs4_client *); extern void nfsd4_shutdown_copy(struct nfs4_client *clp); -extern void nfsd4_prepare_cb_recall(struct nfs4_delegation *dp); -extern struct nfs4_client_reclaim *nfs4_client_to_reclaim(const char *name, - struct nfsd_net *nn); -extern bool nfs4_has_reclaimed_state(const char *name, struct nfsd_net *nn); +void nfsd4_async_copy_reaper(struct nfsd_net *nn); +bool nfsd4_has_active_async_copies(struct nfs4_client *clp); +extern struct nfs4_client_reclaim *nfs4_client_to_reclaim(struct xdr_netobj name, + struct xdr_netobj princhash, struct nfsd_net *nn); +extern bool nfs4_has_reclaimed_state(struct xdr_netobj 
name, struct nfsd_net *nn); -struct nfs4_file *find_file(struct knfsd_fh *fh); void put_nfs4_file(struct nfs4_file *fi); -extern void nfs4_put_copy(struct nfsd4_copy *copy); -extern struct nfsd4_copy * -find_async_copy(struct nfs4_client *clp, stateid_t *staetid); +extern void nfs4_put_cpntf_state(struct nfsd_net *nn, + struct nfs4_cpntf_state *cps); +extern __be32 manage_cpntf_state(struct nfsd_net *nn, stateid_t *st, + struct nfs4_client *clp, + struct nfs4_cpntf_state **cps); static inline void get_nfs4_file(struct nfs4_file *fi) { refcount_inc(&fi->fi_ref); } -struct file *find_any_file(struct nfs4_file *f); +struct nfsd_file *find_any_file(struct nfs4_file *f); + +#ifdef CONFIG_NFSD_V4 +void nfsd4_revoke_states(struct net *net, struct super_block *sb); +#else +static inline void nfsd4_revoke_states(struct net *net, struct super_block *sb) +{ +} +#endif /* grace period management */ void nfsd4_end_grace(struct nfsd_net *nn); @@ -660,31 +859,17 @@ extern void nfsd4_client_record_remove(struct nfs4_client *clp); extern int nfsd4_client_record_check(struct nfs4_client *clp); extern void nfsd4_record_grace_done(struct nfsd_net *nn); -/* nfs fault injection functions */ -#ifdef CONFIG_NFSD_FAULT_INJECTION -int nfsd_fault_inject_init(void); -void nfsd_fault_inject_cleanup(void); - -u64 nfsd_inject_print_clients(void); -u64 nfsd_inject_forget_client(struct sockaddr_storage *, size_t); -u64 nfsd_inject_forget_clients(u64); - -u64 nfsd_inject_print_locks(void); -u64 nfsd_inject_forget_client_locks(struct sockaddr_storage *, size_t); -u64 nfsd_inject_forget_locks(u64); - -u64 nfsd_inject_print_openowners(void); -u64 nfsd_inject_forget_client_openowners(struct sockaddr_storage *, size_t); -u64 nfsd_inject_forget_openowners(u64); - -u64 nfsd_inject_print_delegations(void); -u64 nfsd_inject_forget_client_delegations(struct sockaddr_storage *, size_t); -u64 nfsd_inject_forget_delegations(u64); -u64 nfsd_inject_recall_client_delegations(struct sockaddr_storage *, size_t); -u64 nfsd_inject_recall_delegations(u64); -#else /* CONFIG_NFSD_FAULT_INJECTION */ -static inline int nfsd_fault_inject_init(void) { return 0; } -static inline void nfsd_fault_inject_cleanup(void) {} -#endif /* CONFIG_NFSD_FAULT_INJECTION */ +static inline bool try_to_expire_client(struct nfs4_client *clp) +{ + cmpxchg(&clp->cl_state, NFSD4_COURTESY, NFSD4_EXPIRABLE); + return clp->cl_state == NFSD4_EXPIRABLE; +} + +extern __be32 nfsd4_deleg_getattr_conflict(struct svc_rqst *rqstp, + struct dentry *dentry, struct nfs4_delegation **pdp); +struct nfsd4_get_dir_delegation; +struct nfs4_delegation *nfsd_get_dir_deleg(struct nfsd4_compound_state *cstate, + struct nfsd4_get_dir_delegation *gdd, + struct nfsd_file *nf); #endif /* NFSD4_STATE_H */ diff --git a/fs/nfsd/stats.c b/fs/nfsd/stats.c index 9bce3b913189..f7eaf95e20fc 100644 --- a/fs/nfsd/stats.c +++ b/fs/nfsd/stats.c @@ -7,16 +7,14 @@ * Format: * rc <hits> <misses> <nocache> * Statistsics for the reply cache - * fh <stale> <total-lookups> <anonlookups> <dir-not-in-dcache> <nondir-not-in-dcache> + * fh <stale> <deprecated filehandle cache stats> * statistics for filehandle lookup * io <bytes-read> <bytes-written> * statistics for IO throughput - * th <threads> <fullcnt> <10%-20%> <20%-30%> ... <90%-100%> <100%> - * time (seconds) when nfsd thread usage above thresholds - * and number of times that all threads were in use - * ra cache-size <10% <20% <30% ... <100% not-found - * number of times that read-ahead entry was found that deep in - * the cache. 
+ * th <threads> <deprecated thread usage histogram stats> + * number of threads + * ra <deprecated ra-cache stats> + * * plus generic RPC stats (see net/sunrpc/stats.c) * * Copyright (C) 1995, 1996, 1997 Olaf Kirch <okir@monad.swb.de> @@ -29,49 +27,43 @@ #include "nfsd.h" -struct nfsd_stats nfsdstats; -struct svc_stat nfsd_svcstats = { - .program = &nfsd_program, -}; - -static int nfsd_proc_show(struct seq_file *seq, void *v) +static int nfsd_show(struct seq_file *seq, void *v) { + struct net *net = pde_data(file_inode(seq->file)); + struct nfsd_net *nn = net_generic(net, nfsd_net_id); int i; - seq_printf(seq, "rc %u %u %u\nfh %u %u %u %u %u\nio %u %u\n", - nfsdstats.rchits, - nfsdstats.rcmisses, - nfsdstats.rcnocache, - nfsdstats.fh_stale, - nfsdstats.fh_lookup, - nfsdstats.fh_anon, - nfsdstats.fh_nocache_dir, - nfsdstats.fh_nocache_nondir, - nfsdstats.io_read, - nfsdstats.io_write); + seq_printf(seq, "rc %lld %lld %lld\nfh %lld 0 0 0 0\nio %lld %lld\n", + percpu_counter_sum_positive(&nn->counter[NFSD_STATS_RC_HITS]), + percpu_counter_sum_positive(&nn->counter[NFSD_STATS_RC_MISSES]), + percpu_counter_sum_positive(&nn->counter[NFSD_STATS_RC_NOCACHE]), + percpu_counter_sum_positive(&nn->counter[NFSD_STATS_FH_STALE]), + percpu_counter_sum_positive(&nn->counter[NFSD_STATS_IO_READ]), + percpu_counter_sum_positive(&nn->counter[NFSD_STATS_IO_WRITE])); + /* thread usage: */ - seq_printf(seq, "th %u %u", nfsdstats.th_cnt, nfsdstats.th_fullcnt); - for (i=0; i<10; i++) { - unsigned int jifs = nfsdstats.th_usage[i]; - unsigned int sec = jifs / HZ, msec = (jifs % HZ)*1000/HZ; - seq_printf(seq, " %u.%03u", sec, msec); - } + seq_printf(seq, "th %u 0", atomic_read(&nfsd_th_cnt)); + + /* deprecated thread usage histogram stats */ + for (i = 0; i < 10; i++) + seq_puts(seq, " 0.000"); + + /* deprecated ra-cache stats */ + seq_puts(seq, "\nra 0 0 0 0 0 0 0 0 0 0 0 0\n"); - /* newline and ra-cache */ - seq_printf(seq, "\nra %u", nfsdstats.ra_size); - for (i=0; i<11; i++) - seq_printf(seq, " %u", nfsdstats.ra_depth[i]); - seq_putc(seq, '\n'); - /* show my rpc info */ - svc_seq_show(seq, &nfsd_svcstats); + svc_seq_show(seq, &nn->nfsd_svcstats); #ifdef CONFIG_NFSD_V4 /* Show count for individual nfsv4 operations */ /* Writing operation numbers 0 1 2 also for maintaining uniformity */ - seq_printf(seq,"proc4ops %u", LAST_NFS4_OP + 1); - for (i = 0; i <= LAST_NFS4_OP; i++) - seq_printf(seq, " %u", nfsdstats.nfs4_opcount[i]); + seq_printf(seq, "proc4ops %u", LAST_NFS4_OP + 1); + for (i = 0; i <= LAST_NFS4_OP; i++) { + seq_printf(seq, " %lld", + percpu_counter_sum_positive(&nn->counter[NFSD_STATS_NFS4_OP(i)])); + } + seq_printf(seq, "\nwdeleg_getattr %lld", + percpu_counter_sum_positive(&nn->counter[NFSD_STATS_WDELEG_GETATTR])); seq_putc(seq, '\n'); #endif @@ -79,26 +71,16 @@ static int nfsd_proc_show(struct seq_file *seq, void *v) return 0; } -static int nfsd_proc_open(struct inode *inode, struct file *file) -{ - return single_open(file, nfsd_proc_show, NULL); -} - -static const struct file_operations nfsd_proc_fops = { - .open = nfsd_proc_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; +DEFINE_PROC_SHOW_ATTRIBUTE(nfsd); -void -nfsd_stat_init(void) +struct proc_dir_entry *nfsd_proc_stat_init(struct net *net) { - svc_proc_register(&init_net, &nfsd_svcstats, &nfsd_proc_fops); + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + + return svc_proc_register(net, &nn->nfsd_svcstats, &nfsd_proc_ops); } -void -nfsd_stat_shutdown(void) +void nfsd_proc_stat_shutdown(struct net *net) { - 
svc_proc_unregister(&init_net, "nfsd"); + svc_proc_unregister(net, "nfsd"); } diff --git a/fs/nfsd/stats.h b/fs/nfsd/stats.h index b23fdac69820..e4efb0e4e56d 100644 --- a/fs/nfsd/stats.h +++ b/fs/nfsd/stats.h @@ -8,37 +8,69 @@ #define _NFSD_STATS_H #include <uapi/linux/nfsd/stats.h> +#include <linux/percpu_counter.h> +struct proc_dir_entry *nfsd_proc_stat_init(struct net *net); +void nfsd_proc_stat_shutdown(struct net *net); -struct nfsd_stats { - unsigned int rchits; /* repcache hits */ - unsigned int rcmisses; /* repcache hits */ - unsigned int rcnocache; /* uncached reqs */ - unsigned int fh_stale; /* FH stale error */ - unsigned int fh_lookup; /* dentry cached */ - unsigned int fh_anon; /* anon file dentry returned */ - unsigned int fh_nocache_dir; /* filehandle not found in dcache */ - unsigned int fh_nocache_nondir; /* filehandle not found in dcache */ - unsigned int io_read; /* bytes returned to read requests */ - unsigned int io_write; /* bytes passed in write requests */ - unsigned int th_cnt; /* number of available threads */ - unsigned int th_usage[10]; /* number of ticks during which n perdeciles - * of available threads were in use */ - unsigned int th_fullcnt; /* number of times last free thread was used */ - unsigned int ra_size; /* size of ra cache */ - unsigned int ra_depth[11]; /* number of times ra entry was found that deep - * in the cache (10percentiles). [10] = not found */ -#ifdef CONFIG_NFSD_V4 - unsigned int nfs4_opcount[LAST_NFS4_OP + 1]; /* count of individual nfsv4 operations */ -#endif +static inline void nfsd_stats_rc_hits_inc(struct nfsd_net *nn) +{ + percpu_counter_inc(&nn->counter[NFSD_STATS_RC_HITS]); +} + +static inline void nfsd_stats_rc_misses_inc(struct nfsd_net *nn) +{ + percpu_counter_inc(&nn->counter[NFSD_STATS_RC_MISSES]); +} + +static inline void nfsd_stats_rc_nocache_inc(struct nfsd_net *nn) +{ + percpu_counter_inc(&nn->counter[NFSD_STATS_RC_NOCACHE]); +} -}; +static inline void nfsd_stats_fh_stale_inc(struct nfsd_net *nn, + struct svc_export *exp) +{ + percpu_counter_inc(&nn->counter[NFSD_STATS_FH_STALE]); + if (exp && exp->ex_stats) + percpu_counter_inc(&exp->ex_stats->counter[EXP_STATS_FH_STALE]); +} +static inline void nfsd_stats_io_read_add(struct nfsd_net *nn, + struct svc_export *exp, s64 amount) +{ + percpu_counter_add(&nn->counter[NFSD_STATS_IO_READ], amount); + if (exp && exp->ex_stats) + percpu_counter_add(&exp->ex_stats->counter[EXP_STATS_IO_READ], amount); +} -extern struct nfsd_stats nfsdstats; -extern struct svc_stat nfsd_svcstats; +static inline void nfsd_stats_io_write_add(struct nfsd_net *nn, + struct svc_export *exp, s64 amount) +{ + percpu_counter_add(&nn->counter[NFSD_STATS_IO_WRITE], amount); + if (exp && exp->ex_stats) + percpu_counter_add(&exp->ex_stats->counter[EXP_STATS_IO_WRITE], amount); +} -void nfsd_stat_init(void); -void nfsd_stat_shutdown(void); +static inline void nfsd_stats_payload_misses_inc(struct nfsd_net *nn) +{ + percpu_counter_inc(&nn->counter[NFSD_STATS_PAYLOAD_MISSES]); +} +static inline void nfsd_stats_drc_mem_usage_add(struct nfsd_net *nn, s64 amount) +{ + percpu_counter_add(&nn->counter[NFSD_STATS_DRC_MEM_USAGE], amount); +} + +static inline void nfsd_stats_drc_mem_usage_sub(struct nfsd_net *nn, s64 amount) +{ + percpu_counter_sub(&nn->counter[NFSD_STATS_DRC_MEM_USAGE], amount); +} + +#ifdef CONFIG_NFSD_V4 +static inline void nfsd_stats_wdeleg_getattr_inc(struct nfsd_net *nn) +{ + percpu_counter_inc(&nn->counter[NFSD_STATS_WDELEG_GETATTR]); +} +#endif #endif /* _NFSD_STATS_H */ diff --git 
a/fs/nfsd/trace.c b/fs/nfsd/trace.c index 90967466a1e5..f008b95ceec2 100644 --- a/fs/nfsd/trace.c +++ b/fs/nfsd/trace.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #define CREATE_TRACE_POINTS #include "trace.h" diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h index 80933e4334d8..5ae2a611e57f 100644 --- a/fs/nfsd/trace.h +++ b/fs/nfsd/trace.h @@ -9,22 +9,126 @@ #define _NFSD_TRACE_H #include <linux/tracepoint.h> +#include <linux/sunrpc/clnt.h> +#include <linux/sunrpc/xprt.h> +#include <trace/misc/fs.h> +#include <trace/misc/nfs.h> +#include <trace/misc/sunrpc.h> + +#include "export.h" #include "nfsfh.h" +#include "xdr4.h" + +#define NFSD_TRACE_PROC_CALL_FIELDS(r) \ + __field(unsigned int, netns_ino) \ + __field(u32, xid) \ + __sockaddr(server, (r)->rq_xprt->xpt_locallen) \ + __sockaddr(client, (r)->rq_xprt->xpt_remotelen) + +#define NFSD_TRACE_PROC_CALL_ASSIGNMENTS(r) \ + do { \ + struct svc_xprt *xprt = (r)->rq_xprt; \ + __entry->netns_ino = SVC_NET(r)->ns.inum; \ + __entry->xid = be32_to_cpu((r)->rq_xid); \ + __assign_sockaddr(server, &xprt->xpt_local, \ + xprt->xpt_locallen); \ + __assign_sockaddr(client, &xprt->xpt_remote, \ + xprt->xpt_remotelen); \ + } while (0) + +#define NFSD_TRACE_PROC_RES_FIELDS(r) \ + __field(unsigned int, netns_ino) \ + __field(u32, xid) \ + __field(unsigned long, status) \ + __sockaddr(server, (r)->rq_xprt->xpt_locallen) \ + __sockaddr(client, (r)->rq_xprt->xpt_remotelen) + +#define NFSD_TRACE_PROC_RES_ASSIGNMENTS(r, error) \ + do { \ + struct svc_xprt *xprt = (r)->rq_xprt; \ + __entry->netns_ino = SVC_NET(r)->ns.inum; \ + __entry->xid = be32_to_cpu((r)->rq_xid); \ + __entry->status = be32_to_cpu(error); \ + __assign_sockaddr(server, &xprt->xpt_local, \ + xprt->xpt_locallen); \ + __assign_sockaddr(client, &xprt->xpt_remote, \ + xprt->xpt_remotelen); \ + } while (0); + +DECLARE_EVENT_CLASS(nfsd_xdr_err_class, + TP_PROTO( + const struct svc_rqst *rqstp + ), + TP_ARGS(rqstp), + TP_STRUCT__entry( + __field(unsigned int, netns_ino) + __field(u32, xid) + __field(u32, vers) + __field(u32, proc) + __sockaddr(server, rqstp->rq_xprt->xpt_locallen) + __sockaddr(client, rqstp->rq_xprt->xpt_remotelen) + ), + TP_fast_assign( + const struct svc_xprt *xprt = rqstp->rq_xprt; + + __entry->netns_ino = xprt->xpt_net->ns.inum; + __entry->xid = be32_to_cpu(rqstp->rq_xid); + __entry->vers = rqstp->rq_vers; + __entry->proc = rqstp->rq_proc; + __assign_sockaddr(server, &xprt->xpt_local, xprt->xpt_locallen); + __assign_sockaddr(client, &xprt->xpt_remote, xprt->xpt_remotelen); + ), + TP_printk("xid=0x%08x vers=%u proc=%u", + __entry->xid, __entry->vers, __entry->proc + ) +); + +#define DEFINE_NFSD_XDR_ERR_EVENT(name) \ +DEFINE_EVENT(nfsd_xdr_err_class, nfsd_##name##_err, \ + TP_PROTO(const struct svc_rqst *rqstp), \ + TP_ARGS(rqstp)) + +DEFINE_NFSD_XDR_ERR_EVENT(garbage_args); +DEFINE_NFSD_XDR_ERR_EVENT(cant_encode); + +#define show_nfsd_may_flags(x) \ + __print_flags(x, "|", \ + { NFSD_MAY_EXEC, "EXEC" }, \ + { NFSD_MAY_WRITE, "WRITE" }, \ + { NFSD_MAY_READ, "READ" }, \ + { NFSD_MAY_SATTR, "SATTR" }, \ + { NFSD_MAY_TRUNC, "TRUNC" }, \ + { NFSD_MAY_NLM, "NLM" }, \ + { NFSD_MAY_OWNER_OVERRIDE, "OWNER_OVERRIDE" }, \ + { NFSD_MAY_LOCAL_ACCESS, "LOCAL_ACCESS" }, \ + { NFSD_MAY_BYPASS_GSS_ON_ROOT, "BYPASS_GSS_ON_ROOT" }, \ + { NFSD_MAY_NOT_BREAK_LEASE, "NOT_BREAK_LEASE" }, \ + { NFSD_MAY_BYPASS_GSS, "BYPASS_GSS" }, \ + { NFSD_MAY_READ_IF_EXEC, "READ_IF_EXEC" }, \ + { NFSD_MAY_64BIT_COOKIE, "64BIT_COOKIE" }, \ + { NFSD_MAY_LOCALIO, "LOCALIO" }) TRACE_EVENT(nfsd_compound, - 
TP_PROTO(const struct svc_rqst *rqst, - u32 args_opcnt), - TP_ARGS(rqst, args_opcnt), + TP_PROTO( + const struct svc_rqst *rqst, + const char *tag, + u32 taglen, + u32 opcnt + ), + TP_ARGS(rqst, tag, taglen, opcnt), TP_STRUCT__entry( __field(u32, xid) - __field(u32, args_opcnt) + __field(u32, opcnt) + __string_len(tag, tag, taglen) ), TP_fast_assign( __entry->xid = be32_to_cpu(rqst->rq_xid); - __entry->args_opcnt = args_opcnt; + __entry->opcnt = opcnt; + __assign_str(tag); ), - TP_printk("xid=0x%08x opcnt=%u", - __entry->xid, __entry->args_opcnt) + TP_printk("xid=0x%08x opcnt=%u tag=%s", + __entry->xid, __entry->opcnt, __get_str(tag) + ) ) TRACE_EVENT(nfsd_compound_status, @@ -43,24 +147,300 @@ TRACE_EVENT(nfsd_compound_status, __entry->args_opcnt = args_opcnt; __entry->resp_opcnt = resp_opcnt; __entry->status = be32_to_cpu(status); - __assign_str(name, name); + __assign_str(name); ), TP_printk("op=%u/%u %s status=%d", __entry->resp_opcnt, __entry->args_opcnt, __get_str(name), __entry->status) ) +TRACE_EVENT(nfsd_compound_decode_err, + TP_PROTO( + const struct svc_rqst *rqstp, + u32 args_opcnt, + u32 resp_opcnt, + u32 opnum, + __be32 status + ), + TP_ARGS(rqstp, args_opcnt, resp_opcnt, opnum, status), + TP_STRUCT__entry( + NFSD_TRACE_PROC_RES_FIELDS(rqstp) + + __field(u32, args_opcnt) + __field(u32, resp_opcnt) + __field(u32, opnum) + ), + TP_fast_assign( + NFSD_TRACE_PROC_RES_ASSIGNMENTS(rqstp, status) + + __entry->args_opcnt = args_opcnt; + __entry->resp_opcnt = resp_opcnt; + __entry->opnum = opnum; + ), + TP_printk("op=%u/%u opnum=%u status=%lu", + __entry->resp_opcnt, __entry->args_opcnt, + __entry->opnum, __entry->status) +); + +DECLARE_EVENT_CLASS(nfsd_compound_err_class, + TP_PROTO( + const struct svc_rqst *rqstp, + u32 opnum, + __be32 status + ), + TP_ARGS(rqstp, opnum, status), + TP_STRUCT__entry( + NFSD_TRACE_PROC_RES_FIELDS(rqstp) + + __field(u32, opnum) + ), + TP_fast_assign( + NFSD_TRACE_PROC_RES_ASSIGNMENTS(rqstp, status) + + __entry->opnum = opnum; + ), + TP_printk("opnum=%u status=%lu", + __entry->opnum, __entry->status) +); + +#define DEFINE_NFSD_COMPOUND_ERR_EVENT(name) \ +DEFINE_EVENT(nfsd_compound_err_class, nfsd_compound_##name##_err, \ + TP_PROTO( \ + const struct svc_rqst *rqstp, \ + u32 opnum, \ + __be32 status \ + ), \ + TP_ARGS(rqstp, opnum, status)) + +DEFINE_NFSD_COMPOUND_ERR_EVENT(op); +DEFINE_NFSD_COMPOUND_ERR_EVENT(encode); + +#define show_fs_file_type(x) \ + __print_symbolic(x, \ + { S_IFLNK, "LNK" }, \ + { S_IFREG, "REG" }, \ + { S_IFDIR, "DIR" }, \ + { S_IFCHR, "CHR" }, \ + { S_IFBLK, "BLK" }, \ + { S_IFIFO, "FIFO" }, \ + { S_IFSOCK, "SOCK" }) + +TRACE_EVENT_CONDITION(nfsd_fh_verify, + TP_PROTO( + const struct svc_rqst *rqstp, + const struct svc_fh *fhp, + umode_t type, + int access + ), + TP_ARGS(rqstp, fhp, type, access), + TP_CONDITION(rqstp != NULL), + TP_STRUCT__entry( + __field(unsigned int, netns_ino) + __sockaddr(server, rqstp->rq_xprt->xpt_remotelen) + __sockaddr(client, rqstp->rq_xprt->xpt_remotelen) + __field(u32, xid) + __field(u32, fh_hash) + __field(const void *, inode) + __field(unsigned long, type) + __field(unsigned long, access) + ), + TP_fast_assign( + __entry->netns_ino = SVC_NET(rqstp)->ns.inum; + __assign_sockaddr(server, &rqstp->rq_xprt->xpt_local, + rqstp->rq_xprt->xpt_locallen); + __assign_sockaddr(client, &rqstp->rq_xprt->xpt_remote, + rqstp->rq_xprt->xpt_remotelen); + __entry->xid = be32_to_cpu(rqstp->rq_xid); + __entry->fh_hash = knfsd_fh_hash(&fhp->fh_handle); + __entry->inode = d_inode(fhp->fh_dentry); + __entry->type = type; 
+ __entry->access = access; + ), + TP_printk("xid=0x%08x fh_hash=0x%08x type=%s access=%s", + __entry->xid, __entry->fh_hash, + show_fs_file_type(__entry->type), + show_nfsd_may_flags(__entry->access) + ) +); + +TRACE_EVENT_CONDITION(nfsd_fh_verify_err, + TP_PROTO( + const struct svc_rqst *rqstp, + const struct svc_fh *fhp, + umode_t type, + int access, + __be32 error + ), + TP_ARGS(rqstp, fhp, type, access, error), + TP_CONDITION(rqstp != NULL && error), + TP_STRUCT__entry( + __field(unsigned int, netns_ino) + __sockaddr(server, rqstp->rq_xprt->xpt_remotelen) + __sockaddr(client, rqstp->rq_xprt->xpt_remotelen) + __field(u32, xid) + __field(u32, fh_hash) + __field(const void *, inode) + __field(unsigned long, type) + __field(unsigned long, access) + __field(int, error) + ), + TP_fast_assign( + __entry->netns_ino = SVC_NET(rqstp)->ns.inum; + __assign_sockaddr(server, &rqstp->rq_xprt->xpt_local, + rqstp->rq_xprt->xpt_locallen); + __assign_sockaddr(client, &rqstp->rq_xprt->xpt_remote, + rqstp->rq_xprt->xpt_remotelen); + __entry->xid = be32_to_cpu(rqstp->rq_xid); + __entry->fh_hash = knfsd_fh_hash(&fhp->fh_handle); + if (fhp->fh_dentry) + __entry->inode = d_inode(fhp->fh_dentry); + else + __entry->inode = NULL; + __entry->type = type; + __entry->access = access; + __entry->error = be32_to_cpu(error); + ), + TP_printk("xid=0x%08x fh_hash=0x%08x type=%s access=%s error=%d", + __entry->xid, __entry->fh_hash, + show_fs_file_type(__entry->type), + show_nfsd_may_flags(__entry->access), + __entry->error + ) +); + +DECLARE_EVENT_CLASS(nfsd_fh_err_class, + TP_PROTO(struct svc_rqst *rqstp, + struct svc_fh *fhp, + int status), + TP_ARGS(rqstp, fhp, status), + TP_STRUCT__entry( + __field(u32, xid) + __field(u32, fh_hash) + __field(int, status) + ), + TP_fast_assign( + __entry->xid = be32_to_cpu(rqstp->rq_xid); + __entry->fh_hash = knfsd_fh_hash(&fhp->fh_handle); + __entry->status = status; + ), + TP_printk("xid=0x%08x fh_hash=0x%08x status=%d", + __entry->xid, __entry->fh_hash, + __entry->status) +) + +#define DEFINE_NFSD_FH_ERR_EVENT(name) \ +DEFINE_EVENT_CONDITION(nfsd_fh_err_class, nfsd_##name, \ + TP_PROTO(struct svc_rqst *rqstp, \ + struct svc_fh *fhp, \ + int status), \ + TP_ARGS(rqstp, fhp, status), \ + TP_CONDITION(rqstp != NULL)) + +DEFINE_NFSD_FH_ERR_EVENT(set_fh_dentry_badexport); +DEFINE_NFSD_FH_ERR_EVENT(set_fh_dentry_badhandle); + +TRACE_EVENT(nfsd_exp_find_key, + TP_PROTO(const struct svc_expkey *key, + int status), + TP_ARGS(key, status), + TP_STRUCT__entry( + __field(u8, fsidtype) + __array(u32, fsid, 6) + __string(auth_domain, key->ek_client->name) + __field(int, status) + ), + TP_fast_assign( + __entry->fsidtype = key->ek_fsidtype; + memcpy(__entry->fsid, key->ek_fsid, 4*6); + __assign_str(auth_domain); + __entry->status = status; + ), + TP_printk("fsid=%x::%s domain=%s status=%d", + __entry->fsidtype, + __print_array(__entry->fsid, 6, 4), + __get_str(auth_domain), + __entry->status + ) +); + +TRACE_EVENT(nfsd_expkey_update, + TP_PROTO(const struct svc_expkey *key, const char *exp_path), + TP_ARGS(key, exp_path), + TP_STRUCT__entry( + __field(u8, fsidtype) + __array(u32, fsid, 6) + __string(auth_domain, key->ek_client->name) + __string(path, exp_path) + __field(bool, cache) + ), + TP_fast_assign( + __entry->fsidtype = key->ek_fsidtype; + memcpy(__entry->fsid, key->ek_fsid, 4*6); + __assign_str(auth_domain); + __assign_str(path); + __entry->cache = !test_bit(CACHE_NEGATIVE, &key->h.flags); + ), + TP_printk("fsid=%x::%s domain=%s path=%s cache=%s", + __entry->fsidtype, + 
__print_array(__entry->fsid, 6, 4), + __get_str(auth_domain), + __get_str(path), + __entry->cache ? "pos" : "neg" + ) +); + +TRACE_EVENT(nfsd_exp_get_by_name, + TP_PROTO(const struct svc_export *key, + int status), + TP_ARGS(key, status), + TP_STRUCT__entry( + __string(path, key->ex_path.dentry->d_name.name) + __string(auth_domain, key->ex_client->name) + __field(int, status) + ), + TP_fast_assign( + __assign_str(path); + __assign_str(auth_domain); + __entry->status = status; + ), + TP_printk("path=%s domain=%s status=%d", + __get_str(path), + __get_str(auth_domain), + __entry->status + ) +); + +TRACE_EVENT(nfsd_export_update, + TP_PROTO(const struct svc_export *key), + TP_ARGS(key), + TP_STRUCT__entry( + __string(path, key->ex_path.dentry->d_name.name) + __string(auth_domain, key->ex_client->name) + __field(bool, cache) + ), + TP_fast_assign( + __assign_str(path); + __assign_str(auth_domain); + __entry->cache = !test_bit(CACHE_NEGATIVE, &key->h.flags); + ), + TP_printk("path=%s domain=%s cache=%s", + __get_str(path), + __get_str(auth_domain), + __entry->cache ? "pos" : "neg" + ) +); + DECLARE_EVENT_CLASS(nfsd_io_class, TP_PROTO(struct svc_rqst *rqstp, struct svc_fh *fhp, - loff_t offset, - unsigned long len), + u64 offset, + u32 len), TP_ARGS(rqstp, fhp, offset, len), TP_STRUCT__entry( __field(u32, xid) __field(u32, fh_hash) - __field(loff_t, offset) - __field(unsigned long, len) + __field(u64, offset) + __field(u32, len) ), TP_fast_assign( __entry->xid = be32_to_cpu(rqstp->rq_xid); @@ -68,7 +448,7 @@ DECLARE_EVENT_CLASS(nfsd_io_class, __entry->offset = offset; __entry->len = len; ), - TP_printk("xid=0x%08x fh_hash=0x%08x offset=%lld len=%lu", + TP_printk("xid=0x%08x fh_hash=0x%08x offset=%llu len=%u", __entry->xid, __entry->fh_hash, __entry->offset, __entry->len) ) @@ -77,19 +457,24 @@ DECLARE_EVENT_CLASS(nfsd_io_class, DEFINE_EVENT(nfsd_io_class, nfsd_##name, \ TP_PROTO(struct svc_rqst *rqstp, \ struct svc_fh *fhp, \ - loff_t offset, \ - unsigned long len), \ + u64 offset, \ + u32 len), \ TP_ARGS(rqstp, fhp, offset, len)) DEFINE_NFSD_IO_EVENT(read_start); DEFINE_NFSD_IO_EVENT(read_splice); DEFINE_NFSD_IO_EVENT(read_vector); +DEFINE_NFSD_IO_EVENT(read_direct); DEFINE_NFSD_IO_EVENT(read_io_done); DEFINE_NFSD_IO_EVENT(read_done); DEFINE_NFSD_IO_EVENT(write_start); DEFINE_NFSD_IO_EVENT(write_opened); +DEFINE_NFSD_IO_EVENT(write_direct); +DEFINE_NFSD_IO_EVENT(write_vector); DEFINE_NFSD_IO_EVENT(write_io_done); DEFINE_NFSD_IO_EVENT(write_done); +DEFINE_NFSD_IO_EVENT(commit_start); +DEFINE_NFSD_IO_EVENT(commit_done); DECLARE_EVENT_CLASS(nfsd_err_class, TP_PROTO(struct svc_rqst *rqstp, @@ -125,7 +510,103 @@ DEFINE_EVENT(nfsd_err_class, nfsd_##name, \ DEFINE_NFSD_ERR_EVENT(read_err); DEFINE_NFSD_ERR_EVENT(write_err); +TRACE_EVENT(nfsd_dirent, + TP_PROTO(struct svc_fh *fhp, + u64 ino, + const char *name, + int namlen), + TP_ARGS(fhp, ino, name, namlen), + TP_STRUCT__entry( + __field(u32, fh_hash) + __field(u64, ino) + __string_len(name, name, namlen) + ), + TP_fast_assign( + __entry->fh_hash = fhp ? 
knfsd_fh_hash(&fhp->fh_handle) : 0; + __entry->ino = ino; + __assign_str(name); + ), + TP_printk("fh_hash=0x%08x ino=%llu name=%s", + __entry->fh_hash, __entry->ino, __get_str(name) + ) +) + +DECLARE_EVENT_CLASS(nfsd_copy_err_class, + TP_PROTO(struct svc_rqst *rqstp, + struct svc_fh *src_fhp, + loff_t src_offset, + struct svc_fh *dst_fhp, + loff_t dst_offset, + u64 count, + int status), + TP_ARGS(rqstp, src_fhp, src_offset, dst_fhp, dst_offset, count, status), + TP_STRUCT__entry( + __field(u32, xid) + __field(u32, src_fh_hash) + __field(loff_t, src_offset) + __field(u32, dst_fh_hash) + __field(loff_t, dst_offset) + __field(u64, count) + __field(int, status) + ), + TP_fast_assign( + __entry->xid = be32_to_cpu(rqstp->rq_xid); + __entry->src_fh_hash = knfsd_fh_hash(&src_fhp->fh_handle); + __entry->src_offset = src_offset; + __entry->dst_fh_hash = knfsd_fh_hash(&dst_fhp->fh_handle); + __entry->dst_offset = dst_offset; + __entry->count = count; + __entry->status = status; + ), + TP_printk("xid=0x%08x src_fh_hash=0x%08x src_offset=%lld " + "dst_fh_hash=0x%08x dst_offset=%lld " + "count=%llu status=%d", + __entry->xid, __entry->src_fh_hash, __entry->src_offset, + __entry->dst_fh_hash, __entry->dst_offset, + (unsigned long long)__entry->count, + __entry->status) +) + +#define DEFINE_NFSD_COPY_ERR_EVENT(name) \ +DEFINE_EVENT(nfsd_copy_err_class, nfsd_##name, \ + TP_PROTO(struct svc_rqst *rqstp, \ + struct svc_fh *src_fhp, \ + loff_t src_offset, \ + struct svc_fh *dst_fhp, \ + loff_t dst_offset, \ + u64 count, \ + int status), \ + TP_ARGS(rqstp, src_fhp, src_offset, dst_fhp, dst_offset, \ + count, status)) + +DEFINE_NFSD_COPY_ERR_EVENT(clone_file_range_err); + #include "state.h" +#include "filecache.h" +#include "vfs.h" + +TRACE_EVENT(nfsd_delegret_wakeup, + TP_PROTO( + const struct svc_rqst *rqstp, + const struct inode *inode, + long timeo + ), + TP_ARGS(rqstp, inode, timeo), + TP_STRUCT__entry( + __field(u32, xid) + __field(const void *, inode) + __field(long, timeo) + ), + TP_fast_assign( + __entry->xid = be32_to_cpu(rqstp->rq_xid); + __entry->inode = inode; + __entry->timeo = timeo; + ), + TP_printk("xid=0x%08x inode=%p%s", + __entry->xid, __entry->inode, + __entry->timeo == 0 ? 
" (timed out)" : "" + ) +); DECLARE_EVENT_CLASS(nfsd_stateid_class, TP_PROTO(stateid_t *stp), @@ -153,6 +634,7 @@ DECLARE_EVENT_CLASS(nfsd_stateid_class, DEFINE_EVENT(nfsd_stateid_class, nfsd_##name, \ TP_PROTO(stateid_t *stp), \ TP_ARGS(stp)) + DEFINE_STATEID_EVENT(layoutstate_alloc); DEFINE_STATEID_EVENT(layoutstate_unhash); DEFINE_STATEID_EVENT(layoutstate_free); @@ -164,6 +646,2014 @@ DEFINE_STATEID_EVENT(layout_recall_done); DEFINE_STATEID_EVENT(layout_recall_fail); DEFINE_STATEID_EVENT(layout_recall_release); +DEFINE_STATEID_EVENT(open); +DEFINE_STATEID_EVENT(deleg_read); +DEFINE_STATEID_EVENT(deleg_write); +DEFINE_STATEID_EVENT(deleg_return); + +DECLARE_EVENT_CLASS(nfsd_stateseqid_class, + TP_PROTO(u32 seqid, const stateid_t *stp), + TP_ARGS(seqid, stp), + TP_STRUCT__entry( + __field(u32, seqid) + __field(u32, cl_boot) + __field(u32, cl_id) + __field(u32, si_id) + __field(u32, si_generation) + ), + TP_fast_assign( + __entry->seqid = seqid; + __entry->cl_boot = stp->si_opaque.so_clid.cl_boot; + __entry->cl_id = stp->si_opaque.so_clid.cl_id; + __entry->si_id = stp->si_opaque.so_id; + __entry->si_generation = stp->si_generation; + ), + TP_printk("seqid=%u client %08x:%08x stateid %08x:%08x", + __entry->seqid, __entry->cl_boot, __entry->cl_id, + __entry->si_id, __entry->si_generation) +) + +#define DEFINE_STATESEQID_EVENT(name) \ +DEFINE_EVENT(nfsd_stateseqid_class, nfsd_##name, \ + TP_PROTO(u32 seqid, const stateid_t *stp), \ + TP_ARGS(seqid, stp)) + +DEFINE_STATESEQID_EVENT(preprocess); +DEFINE_STATESEQID_EVENT(open_confirm); + +#define show_stid_type(x) \ + __print_flags(x, "|", \ + { SC_TYPE_OPEN, "OPEN" }, \ + { SC_TYPE_LOCK, "LOCK" }, \ + { SC_TYPE_DELEG, "DELEG" }, \ + { SC_TYPE_LAYOUT, "LAYOUT" }) + +#define show_stid_status(x) \ + __print_flags(x, "|", \ + { SC_STATUS_CLOSED, "CLOSED" }, \ + { SC_STATUS_REVOKED, "REVOKED" }, \ + { SC_STATUS_ADMIN_REVOKED, "ADMIN_REVOKED" }) + +DECLARE_EVENT_CLASS(nfsd_stid_class, + TP_PROTO( + const struct nfs4_stid *stid + ), + TP_ARGS(stid), + TP_STRUCT__entry( + __field(unsigned long, sc_type) + __field(unsigned long, sc_status) + __field(int, sc_count) + __field(u32, cl_boot) + __field(u32, cl_id) + __field(u32, si_id) + __field(u32, si_generation) + ), + TP_fast_assign( + const stateid_t *stp = &stid->sc_stateid; + + __entry->sc_type = stid->sc_type; + __entry->sc_status = stid->sc_status; + __entry->sc_count = refcount_read(&stid->sc_count); + __entry->cl_boot = stp->si_opaque.so_clid.cl_boot; + __entry->cl_id = stp->si_opaque.so_clid.cl_id; + __entry->si_id = stp->si_opaque.so_id; + __entry->si_generation = stp->si_generation; + ), + TP_printk("client %08x:%08x stateid %08x:%08x ref=%d type=%s state=%s", + __entry->cl_boot, __entry->cl_id, + __entry->si_id, __entry->si_generation, + __entry->sc_count, show_stid_type(__entry->sc_type), + show_stid_status(__entry->sc_status) + ) +); + +#define DEFINE_STID_EVENT(name) \ +DEFINE_EVENT(nfsd_stid_class, nfsd_stid_##name, \ + TP_PROTO(const struct nfs4_stid *stid), \ + TP_ARGS(stid)) + +DEFINE_STID_EVENT(revoke); + +TRACE_EVENT(nfsd_stateowner_replay, + TP_PROTO( + u32 opnum, + const struct nfs4_replay *rp + ), + TP_ARGS(opnum, rp), + TP_STRUCT__entry( + __field(unsigned long, status) + __field(u32, opnum) + ), + TP_fast_assign( + __entry->status = be32_to_cpu(rp->rp_status); + __entry->opnum = opnum; + ), + TP_printk("opnum=%u status=%lu", + __entry->opnum, __entry->status) +); + +TRACE_EVENT_CONDITION(nfsd_seq4_status, + TP_PROTO( + const struct svc_rqst *rqstp, + const struct nfsd4_sequence 
*sequence + ), + TP_ARGS(rqstp, sequence), + TP_CONDITION(sequence->status_flags), + TP_STRUCT__entry( + __field(unsigned int, netns_ino) + __field(u32, xid) + __field(u32, cl_boot) + __field(u32, cl_id) + __field(u32, seqno) + __field(u32, reserved) + __field(unsigned long, status_flags) + ), + TP_fast_assign( + const struct nfsd4_sessionid *sid = + (struct nfsd4_sessionid *)&sequence->sessionid; + + __entry->netns_ino = SVC_NET(rqstp)->ns.inum; + __entry->xid = be32_to_cpu(rqstp->rq_xid); + __entry->cl_boot = sid->clientid.cl_boot; + __entry->cl_id = sid->clientid.cl_id; + __entry->seqno = sid->sequence; + __entry->reserved = sid->reserved; + __entry->status_flags = sequence->status_flags; + ), + TP_printk("xid=0x%08x sessionid=%08x:%08x:%08x:%08x status_flags=%s", + __entry->xid, __entry->cl_boot, __entry->cl_id, + __entry->seqno, __entry->reserved, + show_nfs4_seq4_status(__entry->status_flags) + ) +); + +DECLARE_EVENT_CLASS(nfsd_cs_slot_class, + TP_PROTO( + const struct nfs4_client *clp, + const struct nfsd4_create_session *cs + ), + TP_ARGS(clp, cs), + TP_STRUCT__entry( + __field(u32, seqid) + __field(u32, slot_seqid) + __field(u32, cl_boot) + __field(u32, cl_id) + __sockaddr(addr, clp->cl_cb_conn.cb_addrlen) + ), + TP_fast_assign( + const struct nfsd4_clid_slot *slot = &clp->cl_cs_slot; + + __entry->cl_boot = clp->cl_clientid.cl_boot; + __entry->cl_id = clp->cl_clientid.cl_id; + __assign_sockaddr(addr, &clp->cl_cb_conn.cb_addr, + clp->cl_cb_conn.cb_addrlen); + __entry->seqid = cs->seqid; + __entry->slot_seqid = slot->sl_seqid; + ), + TP_printk("addr=%pISpc client %08x:%08x seqid=%u slot_seqid=%u", + __get_sockaddr(addr), __entry->cl_boot, __entry->cl_id, + __entry->seqid, __entry->slot_seqid + ) +); + +#define DEFINE_CS_SLOT_EVENT(name) \ +DEFINE_EVENT(nfsd_cs_slot_class, nfsd_##name, \ + TP_PROTO( \ + const struct nfs4_client *clp, \ + const struct nfsd4_create_session *cs \ + ), \ + TP_ARGS(clp, cs)) + +DEFINE_CS_SLOT_EVENT(slot_seqid_conf); +DEFINE_CS_SLOT_EVENT(slot_seqid_unconf); + +#define show_nfs_slot_flags(val) \ + __print_flags(val, "|", \ + { NFSD4_SLOT_INUSE, "INUSE" }, \ + { NFSD4_SLOT_CACHETHIS, "CACHETHIS" }, \ + { NFSD4_SLOT_INITIALIZED, "INITIALIZED" }, \ + { NFSD4_SLOT_CACHED, "CACHED" }, \ + { NFSD4_SLOT_REUSED, "REUSED" }) + +TRACE_EVENT(nfsd_slot_seqid_sequence, + TP_PROTO( + const struct nfs4_client *clp, + const struct nfsd4_sequence *seq, + const struct nfsd4_slot *slot + ), + TP_ARGS(clp, seq, slot), + TP_STRUCT__entry( + __field(u32, seqid) + __field(u32, slot_seqid) + __field(u32, slot_index) + __field(unsigned long, slot_flags) + __field(u32, cl_boot) + __field(u32, cl_id) + __sockaddr(addr, clp->cl_cb_conn.cb_addrlen) + ), + TP_fast_assign( + __entry->cl_boot = clp->cl_clientid.cl_boot; + __entry->cl_id = clp->cl_clientid.cl_id; + __assign_sockaddr(addr, &clp->cl_cb_conn.cb_addr, + clp->cl_cb_conn.cb_addrlen); + __entry->seqid = seq->seqid; + __entry->slot_seqid = slot->sl_seqid; + __entry->slot_index = seq->slotid; + __entry->slot_flags = slot->sl_flags; + ), + TP_printk("addr=%pISpc client %08x:%08x idx=%u seqid=%u slot_seqid=%u flags=%s", + __get_sockaddr(addr), __entry->cl_boot, __entry->cl_id, + __entry->slot_index, __entry->seqid, __entry->slot_seqid, + show_nfs_slot_flags(__entry->slot_flags) + ) +); + +DECLARE_EVENT_CLASS(nfsd_clientid_class, + TP_PROTO(const clientid_t *clid), + TP_ARGS(clid), + TP_STRUCT__entry( + __field(u32, cl_boot) + __field(u32, cl_id) + ), + TP_fast_assign( + __entry->cl_boot = clid->cl_boot; + __entry->cl_id = 
clid->cl_id; + ), + TP_printk("client %08x:%08x", __entry->cl_boot, __entry->cl_id) +) + +#define DEFINE_CLIENTID_EVENT(name) \ +DEFINE_EVENT(nfsd_clientid_class, nfsd_clid_##name, \ + TP_PROTO(const clientid_t *clid), \ + TP_ARGS(clid)) + +DEFINE_CLIENTID_EVENT(expire_unconf); +DEFINE_CLIENTID_EVENT(reclaim_complete); +DEFINE_CLIENTID_EVENT(confirmed); +DEFINE_CLIENTID_EVENT(destroyed); +DEFINE_CLIENTID_EVENT(admin_expired); +DEFINE_CLIENTID_EVENT(replaced); +DEFINE_CLIENTID_EVENT(purged); +DEFINE_CLIENTID_EVENT(renew); +DEFINE_CLIENTID_EVENT(stale); + +TRACE_EVENT(nfsd_mark_client_expired, + TP_PROTO( + const struct nfs4_client *clp, + int cl_rpc_users + ), + TP_ARGS(clp, cl_rpc_users), + TP_STRUCT__entry( + __field(int, cl_rpc_users) + __field(u32, cl_boot) + __field(u32, cl_id) + __sockaddr(addr, clp->cl_cb_conn.cb_addrlen) + ), + TP_fast_assign( + __entry->cl_rpc_users = cl_rpc_users; + __entry->cl_boot = clp->cl_clientid.cl_boot; + __entry->cl_id = clp->cl_clientid.cl_id; + __assign_sockaddr(addr, &clp->cl_cb_conn.cb_addr, + clp->cl_cb_conn.cb_addrlen) + ), + TP_printk("addr=%pISpc client %08x:%08x cl_rpc_users=%d", + __get_sockaddr(addr), __entry->cl_boot, __entry->cl_id, + __entry->cl_rpc_users) +); + +DECLARE_EVENT_CLASS(nfsd_net_class, + TP_PROTO(const struct nfsd_net *nn), + TP_ARGS(nn), + TP_STRUCT__entry( + __field(unsigned long long, boot_time) + ), + TP_fast_assign( + __entry->boot_time = nn->boot_time; + ), + TP_printk("boot_time=%16llx", __entry->boot_time) +) + +#define DEFINE_NET_EVENT(name) \ +DEFINE_EVENT(nfsd_net_class, nfsd_##name, \ + TP_PROTO(const struct nfsd_net *nn), \ + TP_ARGS(nn)) + +DEFINE_NET_EVENT(grace_start); +DEFINE_NET_EVENT(grace_complete); + +TRACE_EVENT(nfsd_writeverf_reset, + TP_PROTO( + const struct nfsd_net *nn, + const struct svc_rqst *rqstp, + int error + ), + TP_ARGS(nn, rqstp, error), + TP_STRUCT__entry( + __field(unsigned long long, boot_time) + __field(u32, xid) + __field(int, error) + __array(unsigned char, verifier, NFS4_VERIFIER_SIZE) + ), + TP_fast_assign( + __entry->boot_time = nn->boot_time; + __entry->xid = be32_to_cpu(rqstp->rq_xid); + __entry->error = error; + + /* avoid seqlock inside TP_fast_assign */ + memcpy(__entry->verifier, nn->writeverf, + NFS4_VERIFIER_SIZE); + ), + TP_printk("boot_time=%16llx xid=0x%08x error=%d new verifier=0x%s", + __entry->boot_time, __entry->xid, __entry->error, + __print_hex_str(__entry->verifier, NFS4_VERIFIER_SIZE) + ) +); + +TRACE_EVENT(nfsd_clid_cred_mismatch, + TP_PROTO( + const struct nfs4_client *clp, + const struct svc_rqst *rqstp + ), + TP_ARGS(clp, rqstp), + TP_STRUCT__entry( + __field(u32, cl_boot) + __field(u32, cl_id) + __field(unsigned long, cl_flavor) + __field(unsigned long, new_flavor) + __sockaddr(addr, rqstp->rq_xprt->xpt_remotelen) + ), + TP_fast_assign( + __entry->cl_boot = clp->cl_clientid.cl_boot; + __entry->cl_id = clp->cl_clientid.cl_id; + __entry->cl_flavor = clp->cl_cred.cr_flavor; + __entry->new_flavor = rqstp->rq_cred.cr_flavor; + __assign_sockaddr(addr, &rqstp->rq_xprt->xpt_remote, + rqstp->rq_xprt->xpt_remotelen); + ), + TP_printk("client %08x:%08x flavor=%s, conflict=%s from addr=%pISpc", + __entry->cl_boot, __entry->cl_id, + show_nfsd_authflavor(__entry->cl_flavor), + show_nfsd_authflavor(__entry->new_flavor), + __get_sockaddr(addr) + ) +) + +TRACE_EVENT(nfsd_clid_verf_mismatch, + TP_PROTO( + const struct nfs4_client *clp, + const struct svc_rqst *rqstp, + const nfs4_verifier *verf + ), + TP_ARGS(clp, rqstp, verf), + TP_STRUCT__entry( + __field(u32, cl_boot) + 
__field(u32, cl_id) + __array(unsigned char, cl_verifier, NFS4_VERIFIER_SIZE) + __array(unsigned char, new_verifier, NFS4_VERIFIER_SIZE) + __sockaddr(addr, rqstp->rq_xprt->xpt_remotelen) + ), + TP_fast_assign( + __entry->cl_boot = clp->cl_clientid.cl_boot; + __entry->cl_id = clp->cl_clientid.cl_id; + memcpy(__entry->cl_verifier, (void *)&clp->cl_verifier, + NFS4_VERIFIER_SIZE); + memcpy(__entry->new_verifier, (void *)verf, + NFS4_VERIFIER_SIZE); + __assign_sockaddr(addr, &rqstp->rq_xprt->xpt_remote, + rqstp->rq_xprt->xpt_remotelen); + ), + TP_printk("client %08x:%08x verf=0x%s, updated=0x%s from addr=%pISpc", + __entry->cl_boot, __entry->cl_id, + __print_hex_str(__entry->cl_verifier, NFS4_VERIFIER_SIZE), + __print_hex_str(__entry->new_verifier, NFS4_VERIFIER_SIZE), + __get_sockaddr(addr) + ) +); + +DECLARE_EVENT_CLASS(nfsd_clid_class, + TP_PROTO(const struct nfs4_client *clp), + TP_ARGS(clp), + TP_STRUCT__entry( + __field(u32, cl_boot) + __field(u32, cl_id) + __array(unsigned char, addr, sizeof(struct sockaddr_in6)) + __field(unsigned long, flavor) + __array(unsigned char, verifier, NFS4_VERIFIER_SIZE) + __string_len(name, clp->cl_name.data, clp->cl_name.len) + ), + TP_fast_assign( + __entry->cl_boot = clp->cl_clientid.cl_boot; + __entry->cl_id = clp->cl_clientid.cl_id; + memcpy(__entry->addr, &clp->cl_addr, + sizeof(struct sockaddr_in6)); + __entry->flavor = clp->cl_cred.cr_flavor; + memcpy(__entry->verifier, (void *)&clp->cl_verifier, + NFS4_VERIFIER_SIZE); + __assign_str(name); + ), + TP_printk("addr=%pISpc name='%s' verifier=0x%s flavor=%s client=%08x:%08x", + __entry->addr, __get_str(name), + __print_hex_str(__entry->verifier, NFS4_VERIFIER_SIZE), + show_nfsd_authflavor(__entry->flavor), + __entry->cl_boot, __entry->cl_id) +); + +#define DEFINE_CLID_EVENT(name) \ +DEFINE_EVENT(nfsd_clid_class, nfsd_clid_##name, \ + TP_PROTO(const struct nfs4_client *clp), \ + TP_ARGS(clp)) + +DEFINE_CLID_EVENT(fresh); +DEFINE_CLID_EVENT(confirmed_r); + +/* + * from fs/nfsd/filecache.h + */ +#define show_nf_flags(val) \ + __print_flags(val, "|", \ + { 1 << NFSD_FILE_HASHED, "HASHED" }, \ + { 1 << NFSD_FILE_PENDING, "PENDING" }, \ + { 1 << NFSD_FILE_REFERENCED, "REFERENCED" }, \ + { 1 << NFSD_FILE_RECENT, "RECENT" }, \ + { 1 << NFSD_FILE_GC, "GC" }) + +DECLARE_EVENT_CLASS(nfsd_file_class, + TP_PROTO(struct nfsd_file *nf), + TP_ARGS(nf), + TP_STRUCT__entry( + __field(void *, nf_inode) + __field(int, nf_ref) + __field(unsigned long, nf_flags) + __field(unsigned char, nf_may) + __field(struct file *, nf_file) + ), + TP_fast_assign( + __entry->nf_inode = nf->nf_inode; + __entry->nf_ref = refcount_read(&nf->nf_ref); + __entry->nf_flags = nf->nf_flags; + __entry->nf_may = nf->nf_may; + __entry->nf_file = nf->nf_file; + ), + TP_printk("inode=%p ref=%d flags=%s may=%s nf_file=%p", + __entry->nf_inode, + __entry->nf_ref, + show_nf_flags(__entry->nf_flags), + show_nfsd_may_flags(__entry->nf_may), + __entry->nf_file) +) + +#define DEFINE_NFSD_FILE_EVENT(name) \ +DEFINE_EVENT(nfsd_file_class, name, \ + TP_PROTO(struct nfsd_file *nf), \ + TP_ARGS(nf)) + +DEFINE_NFSD_FILE_EVENT(nfsd_file_free); +DEFINE_NFSD_FILE_EVENT(nfsd_file_unhash); +DEFINE_NFSD_FILE_EVENT(nfsd_file_put); +DEFINE_NFSD_FILE_EVENT(nfsd_file_closing); + +TRACE_EVENT(nfsd_file_alloc, + TP_PROTO( + const struct nfsd_file *nf + ), + TP_ARGS(nf), + TP_STRUCT__entry( + __field(const void *, nf_inode) + __field(unsigned long, nf_flags) + __field(unsigned long, nf_may) + __field(unsigned int, nf_ref) + ), + TP_fast_assign( + __entry->nf_inode = 
nf->nf_inode; + __entry->nf_flags = nf->nf_flags; + __entry->nf_ref = refcount_read(&nf->nf_ref); + __entry->nf_may = nf->nf_may; + ), + TP_printk("inode=%p ref=%u flags=%s may=%s", + __entry->nf_inode, __entry->nf_ref, + show_nf_flags(__entry->nf_flags), + show_nfsd_may_flags(__entry->nf_may) + ) +); + +TRACE_EVENT(nfsd_file_get_dio_attrs, + TP_PROTO( + const struct inode *inode, + const struct kstat *stat + ), + TP_ARGS(inode, stat), + TP_STRUCT__entry( + __field(const void *, inode) + __field(unsigned long, mask) + __field(u32, mem_align) + __field(u32, offset_align) + __field(u32, read_offset_align) + ), + TP_fast_assign( + __entry->inode = inode; + __entry->mask = stat->result_mask; + __entry->mem_align = stat->dio_mem_align; + __entry->offset_align = stat->dio_offset_align; + __entry->read_offset_align = stat->dio_read_offset_align; + ), + TP_printk("inode=%p flags=%s mem_align=%u offset_align=%u read_offset_align=%u", + __entry->inode, show_statx_mask(__entry->mask), + __entry->mem_align, __entry->offset_align, + __entry->read_offset_align + ) +); + +TRACE_EVENT(nfsd_file_acquire, + TP_PROTO( + const struct svc_rqst *rqstp, + const struct inode *inode, + unsigned int may_flags, + const struct nfsd_file *nf, + __be32 status + ), + + TP_ARGS(rqstp, inode, may_flags, nf, status), + + TP_STRUCT__entry( + __field(u32, xid) + __field(const void *, inode) + __field(unsigned long, may_flags) + __field(unsigned int, nf_ref) + __field(unsigned long, nf_flags) + __field(unsigned long, nf_may) + __field(const void *, nf_file) + __field(u32, status) + ), + + TP_fast_assign( + __entry->xid = rqstp ? be32_to_cpu(rqstp->rq_xid) : 0; + __entry->inode = inode; + __entry->may_flags = may_flags; + __entry->nf_ref = nf ? refcount_read(&nf->nf_ref) : 0; + __entry->nf_flags = nf ? nf->nf_flags : 0; + __entry->nf_may = nf ? nf->nf_may : 0; + __entry->nf_file = nf ? nf->nf_file : NULL; + __entry->status = be32_to_cpu(status); + ), + + TP_printk("xid=0x%x inode=%p may_flags=%s ref=%u nf_flags=%s nf_may=%s nf_file=%p status=%u", + __entry->xid, __entry->inode, + show_nfsd_may_flags(__entry->may_flags), + __entry->nf_ref, show_nf_flags(__entry->nf_flags), + show_nfsd_may_flags(__entry->nf_may), + __entry->nf_file, __entry->status + ) +); + +TRACE_EVENT(nfsd_file_insert_err, + TP_PROTO( + const struct svc_rqst *rqstp, + const struct inode *inode, + unsigned int may_flags, + long error + ), + TP_ARGS(rqstp, inode, may_flags, error), + TP_STRUCT__entry( + __field(u32, xid) + __field(const void *, inode) + __field(unsigned long, may_flags) + __field(long, error) + ), + TP_fast_assign( + __entry->xid = rqstp ? be32_to_cpu(rqstp->rq_xid) : 0; + __entry->inode = inode; + __entry->may_flags = may_flags; + __entry->error = error; + ), + TP_printk("xid=0x%x inode=%p may_flags=%s error=%ld", + __entry->xid, __entry->inode, + show_nfsd_may_flags(__entry->may_flags), + __entry->error + ) +); + +TRACE_EVENT(nfsd_file_cons_err, + TP_PROTO( + const struct svc_rqst *rqstp, + const struct inode *inode, + unsigned int may_flags, + const struct nfsd_file *nf + ), + TP_ARGS(rqstp, inode, may_flags, nf), + TP_STRUCT__entry( + __field(u32, xid) + __field(const void *, inode) + __field(unsigned long, may_flags) + __field(unsigned int, nf_ref) + __field(unsigned long, nf_flags) + __field(unsigned long, nf_may) + __field(const void *, nf_file) + ), + TP_fast_assign( + __entry->xid = rqstp ? 
be32_to_cpu(rqstp->rq_xid) : 0; + __entry->inode = inode; + __entry->may_flags = may_flags; + __entry->nf_ref = refcount_read(&nf->nf_ref); + __entry->nf_flags = nf->nf_flags; + __entry->nf_may = nf->nf_may; + __entry->nf_file = nf->nf_file; + ), + TP_printk("xid=0x%x inode=%p may_flags=%s ref=%u nf_flags=%s nf_may=%s nf_file=%p", + __entry->xid, __entry->inode, + show_nfsd_may_flags(__entry->may_flags), __entry->nf_ref, + show_nf_flags(__entry->nf_flags), + show_nfsd_may_flags(__entry->nf_may), __entry->nf_file + ) +); + +DECLARE_EVENT_CLASS(nfsd_file_open_class, + TP_PROTO(const struct nfsd_file *nf, __be32 status), + TP_ARGS(nf, status), + TP_STRUCT__entry( + __field(void *, nf_inode) /* cannot be dereferenced */ + __field(int, nf_ref) + __field(unsigned long, nf_flags) + __field(unsigned long, nf_may) + __field(void *, nf_file) /* cannot be dereferenced */ + ), + TP_fast_assign( + __entry->nf_inode = nf->nf_inode; + __entry->nf_ref = refcount_read(&nf->nf_ref); + __entry->nf_flags = nf->nf_flags; + __entry->nf_may = nf->nf_may; + __entry->nf_file = nf->nf_file; + ), + TP_printk("inode=%p ref=%d flags=%s may=%s file=%p", + __entry->nf_inode, + __entry->nf_ref, + show_nf_flags(__entry->nf_flags), + show_nfsd_may_flags(__entry->nf_may), + __entry->nf_file) +) + +#define DEFINE_NFSD_FILE_OPEN_EVENT(name) \ +DEFINE_EVENT(nfsd_file_open_class, name, \ + TP_PROTO( \ + const struct nfsd_file *nf, \ + __be32 status \ + ), \ + TP_ARGS(nf, status)) + +DEFINE_NFSD_FILE_OPEN_EVENT(nfsd_file_open); +DEFINE_NFSD_FILE_OPEN_EVENT(nfsd_file_opened); + +TRACE_EVENT(nfsd_file_is_cached, + TP_PROTO( + const struct inode *inode, + int found + ), + TP_ARGS(inode, found), + TP_STRUCT__entry( + __field(const struct inode *, inode) + __field(int, found) + ), + TP_fast_assign( + __entry->inode = inode; + __entry->found = found; + ), + TP_printk("inode=%p is %scached", + __entry->inode, + __entry->found ? 
"" : "not " + ) +); + +TRACE_EVENT(nfsd_file_fsnotify_handle_event, + TP_PROTO(struct inode *inode, u32 mask), + TP_ARGS(inode, mask), + TP_STRUCT__entry( + __field(struct inode *, inode) + __field(unsigned int, nlink) + __field(umode_t, mode) + __field(u32, mask) + ), + TP_fast_assign( + __entry->inode = inode; + __entry->nlink = inode->i_nlink; + __entry->mode = inode->i_mode; + __entry->mask = mask; + ), + TP_printk("inode=%p nlink=%u mode=0%ho mask=0x%x", __entry->inode, + __entry->nlink, __entry->mode, __entry->mask) +); + +DECLARE_EVENT_CLASS(nfsd_file_gc_class, + TP_PROTO( + const struct nfsd_file *nf + ), + TP_ARGS(nf), + TP_STRUCT__entry( + __field(void *, nf_inode) + __field(void *, nf_file) + __field(int, nf_ref) + __field(unsigned long, nf_flags) + ), + TP_fast_assign( + __entry->nf_inode = nf->nf_inode; + __entry->nf_file = nf->nf_file; + __entry->nf_ref = refcount_read(&nf->nf_ref); + __entry->nf_flags = nf->nf_flags; + ), + TP_printk("inode=%p ref=%d nf_flags=%s nf_file=%p", + __entry->nf_inode, __entry->nf_ref, + show_nf_flags(__entry->nf_flags), + __entry->nf_file + ) +); + +#define DEFINE_NFSD_FILE_GC_EVENT(name) \ +DEFINE_EVENT(nfsd_file_gc_class, name, \ + TP_PROTO( \ + const struct nfsd_file *nf \ + ), \ + TP_ARGS(nf)) + +DEFINE_NFSD_FILE_GC_EVENT(nfsd_file_lru_add); +DEFINE_NFSD_FILE_GC_EVENT(nfsd_file_lru_del); +DEFINE_NFSD_FILE_GC_EVENT(nfsd_file_gc_in_use); +DEFINE_NFSD_FILE_GC_EVENT(nfsd_file_gc_writeback); +DEFINE_NFSD_FILE_GC_EVENT(nfsd_file_gc_referenced); +DEFINE_NFSD_FILE_GC_EVENT(nfsd_file_gc_aged); +DEFINE_NFSD_FILE_GC_EVENT(nfsd_file_gc_disposed); + +DECLARE_EVENT_CLASS(nfsd_file_lruwalk_class, + TP_PROTO( + unsigned long removed, + unsigned long remaining + ), + TP_ARGS(removed, remaining), + TP_STRUCT__entry( + __field(unsigned long, removed) + __field(unsigned long, remaining) + ), + TP_fast_assign( + __entry->removed = removed; + __entry->remaining = remaining; + ), + TP_printk("%lu entries removed, %lu remaining", + __entry->removed, __entry->remaining) +); + +#define DEFINE_NFSD_FILE_LRUWALK_EVENT(name) \ +DEFINE_EVENT(nfsd_file_lruwalk_class, name, \ + TP_PROTO( \ + unsigned long removed, \ + unsigned long remaining \ + ), \ + TP_ARGS(removed, remaining)) + +DEFINE_NFSD_FILE_LRUWALK_EVENT(nfsd_file_gc_removed); +DEFINE_NFSD_FILE_LRUWALK_EVENT(nfsd_file_shrinker_removed); + +TRACE_EVENT(nfsd_file_close, + TP_PROTO( + const struct inode *inode + ), + TP_ARGS(inode), + TP_STRUCT__entry( + __field(const void *, inode) + ), + TP_fast_assign( + __entry->inode = inode; + ), + TP_printk("inode=%p", + __entry->inode + ) +); + +#include "cache.h" + +TRACE_DEFINE_ENUM(RC_DROPIT); +TRACE_DEFINE_ENUM(RC_REPLY); +TRACE_DEFINE_ENUM(RC_DOIT); + +#define show_drc_retval(x) \ + __print_symbolic(x, \ + { RC_DROPIT, "DROPIT" }, \ + { RC_REPLY, "REPLY" }, \ + { RC_DOIT, "DOIT" }) + +TRACE_EVENT(nfsd_drc_found, + TP_PROTO( + const struct nfsd_net *nn, + const struct svc_rqst *rqstp, + int result + ), + TP_ARGS(nn, rqstp, result), + TP_STRUCT__entry( + __field(unsigned long long, boot_time) + __field(unsigned long, result) + __field(u32, xid) + ), + TP_fast_assign( + __entry->boot_time = nn->boot_time; + __entry->result = result; + __entry->xid = be32_to_cpu(rqstp->rq_xid); + ), + TP_printk("boot_time=%16llx xid=0x%08x result=%s", + __entry->boot_time, __entry->xid, + show_drc_retval(__entry->result)) + +); + +TRACE_EVENT(nfsd_drc_mismatch, + TP_PROTO( + const struct nfsd_net *nn, + const struct nfsd_cacherep *key, + const struct nfsd_cacherep *rp + ), + TP_ARGS(nn, key, 
rp), + TP_STRUCT__entry( + __field(unsigned long long, boot_time) + __field(u32, xid) + __field(u32, cached) + __field(u32, ingress) + ), + TP_fast_assign( + __entry->boot_time = nn->boot_time; + __entry->xid = be32_to_cpu(key->c_key.k_xid); + __entry->cached = (__force u32)key->c_key.k_csum; + __entry->ingress = (__force u32)rp->c_key.k_csum; + ), + TP_printk("boot_time=%16llx xid=0x%08x cached-csum=0x%08x ingress-csum=0x%08x", + __entry->boot_time, __entry->xid, __entry->cached, + __entry->ingress) +); + +TRACE_EVENT(nfsd_cb_args, + TP_PROTO( + const struct nfs4_client *clp, + const struct nfs4_cb_conn *conn + ), + TP_ARGS(clp, conn), + TP_STRUCT__entry( + __field(u32, cl_boot) + __field(u32, cl_id) + __field(u32, prog) + __field(u32, ident) + __sockaddr(addr, conn->cb_addrlen) + ), + TP_fast_assign( + __entry->cl_boot = clp->cl_clientid.cl_boot; + __entry->cl_id = clp->cl_clientid.cl_id; + __entry->prog = conn->cb_prog; + __entry->ident = conn->cb_ident; + __assign_sockaddr(addr, &conn->cb_addr, conn->cb_addrlen); + ), + TP_printk("addr=%pISpc client %08x:%08x prog=%u ident=%u", + __get_sockaddr(addr), __entry->cl_boot, __entry->cl_id, + __entry->prog, __entry->ident) +); + +TRACE_EVENT(nfsd_cb_nodelegs, + TP_PROTO(const struct nfs4_client *clp), + TP_ARGS(clp), + TP_STRUCT__entry( + __field(u32, cl_boot) + __field(u32, cl_id) + ), + TP_fast_assign( + __entry->cl_boot = clp->cl_clientid.cl_boot; + __entry->cl_id = clp->cl_clientid.cl_id; + ), + TP_printk("client %08x:%08x", __entry->cl_boot, __entry->cl_id) +) + +#define show_cb_state(val) \ + __print_symbolic(val, \ + { NFSD4_CB_UP, "UP" }, \ + { NFSD4_CB_UNKNOWN, "UNKNOWN" }, \ + { NFSD4_CB_DOWN, "DOWN" }, \ + { NFSD4_CB_FAULT, "FAULT"}) + +DECLARE_EVENT_CLASS(nfsd_cb_class, + TP_PROTO(const struct nfs4_client *clp), + TP_ARGS(clp), + TP_STRUCT__entry( + __field(unsigned long, state) + __field(u32, cl_boot) + __field(u32, cl_id) + __sockaddr(addr, clp->cl_cb_conn.cb_addrlen) + ), + TP_fast_assign( + __entry->state = clp->cl_cb_state; + __entry->cl_boot = clp->cl_clientid.cl_boot; + __entry->cl_id = clp->cl_clientid.cl_id; + __assign_sockaddr(addr, &clp->cl_cb_conn.cb_addr, + clp->cl_cb_conn.cb_addrlen) + ), + TP_printk("addr=%pISpc client %08x:%08x state=%s", + __get_sockaddr(addr), __entry->cl_boot, __entry->cl_id, + show_cb_state(__entry->state)) +); + +#define DEFINE_NFSD_CB_EVENT(name) \ +DEFINE_EVENT(nfsd_cb_class, nfsd_cb_##name, \ + TP_PROTO(const struct nfs4_client *clp), \ + TP_ARGS(clp)) + +DEFINE_NFSD_CB_EVENT(start); +DEFINE_NFSD_CB_EVENT(new_state); +DEFINE_NFSD_CB_EVENT(probe); +DEFINE_NFSD_CB_EVENT(lost); +DEFINE_NFSD_CB_EVENT(shutdown); +DEFINE_NFSD_CB_EVENT(rpc_prepare); +DEFINE_NFSD_CB_EVENT(rpc_done); +DEFINE_NFSD_CB_EVENT(rpc_release); + +TRACE_DEFINE_ENUM(RPC_AUTH_NULL); +TRACE_DEFINE_ENUM(RPC_AUTH_UNIX); +TRACE_DEFINE_ENUM(RPC_AUTH_GSS); +TRACE_DEFINE_ENUM(RPC_AUTH_GSS_KRB5); +TRACE_DEFINE_ENUM(RPC_AUTH_GSS_KRB5I); +TRACE_DEFINE_ENUM(RPC_AUTH_GSS_KRB5P); + +#define show_nfsd_authflavor(val) \ + __print_symbolic(val, \ + { RPC_AUTH_NULL, "none" }, \ + { RPC_AUTH_UNIX, "sys" }, \ + { RPC_AUTH_GSS, "gss" }, \ + { RPC_AUTH_GSS_KRB5, "krb5" }, \ + { RPC_AUTH_GSS_KRB5I, "krb5i" }, \ + { RPC_AUTH_GSS_KRB5P, "krb5p" }) + +TRACE_EVENT(nfsd_cb_setup, + TP_PROTO(const struct nfs4_client *clp, + const char *netid, + rpc_authflavor_t authflavor + ), + TP_ARGS(clp, netid, authflavor), + TP_STRUCT__entry( + __field(u32, cl_boot) + __field(u32, cl_id) + __field(unsigned long, authflavor) + __sockaddr(addr, 
clp->cl_cb_conn.cb_addrlen) + __string(netid, netid) + ), + TP_fast_assign( + __entry->cl_boot = clp->cl_clientid.cl_boot; + __entry->cl_id = clp->cl_clientid.cl_id; + __assign_str(netid); + __entry->authflavor = authflavor; + __assign_sockaddr(addr, &clp->cl_cb_conn.cb_addr, + clp->cl_cb_conn.cb_addrlen) + ), + TP_printk("addr=%pISpc client %08x:%08x proto=%s flavor=%s", + __get_sockaddr(addr), __entry->cl_boot, __entry->cl_id, + __get_str(netid), show_nfsd_authflavor(__entry->authflavor)) +); + +TRACE_EVENT(nfsd_cb_setup_err, + TP_PROTO( + const struct nfs4_client *clp, + long error + ), + TP_ARGS(clp, error), + TP_STRUCT__entry( + __field(long, error) + __field(u32, cl_boot) + __field(u32, cl_id) + __sockaddr(addr, clp->cl_cb_conn.cb_addrlen) + ), + TP_fast_assign( + __entry->error = error; + __entry->cl_boot = clp->cl_clientid.cl_boot; + __entry->cl_id = clp->cl_clientid.cl_id; + __assign_sockaddr(addr, &clp->cl_cb_conn.cb_addr, + clp->cl_cb_conn.cb_addrlen) + ), + TP_printk("addr=%pISpc client %08x:%08x error=%ld", + __get_sockaddr(addr), __entry->cl_boot, __entry->cl_id, + __entry->error) +); + +/* Not a real opcode, but there is no 0 operation. */ +#define _CB_NULL 0 + +#define show_nfsd_cb_opcode(val) \ + __print_symbolic(val, \ + { _CB_NULL, "CB_NULL" }, \ + { OP_CB_GETATTR, "CB_GETATTR" }, \ + { OP_CB_RECALL, "CB_RECALL" }, \ + { OP_CB_LAYOUTRECALL, "CB_LAYOUTRECALL" }, \ + { OP_CB_RECALL_ANY, "CB_RECALL_ANY" }, \ + { OP_CB_NOTIFY_LOCK, "CB_NOTIFY_LOCK" }, \ + { OP_CB_OFFLOAD, "CB_OFFLOAD" }) + +DECLARE_EVENT_CLASS(nfsd_cb_lifetime_class, + TP_PROTO( + const struct nfs4_client *clp, + const struct nfsd4_callback *cb + ), + TP_ARGS(clp, cb), + TP_STRUCT__entry( + __field(u32, cl_boot) + __field(u32, cl_id) + __field(const void *, cb) + __field(unsigned long, opcode) + __field(bool, need_restart) + __sockaddr(addr, clp->cl_cb_conn.cb_addrlen) + ), + TP_fast_assign( + __entry->cl_boot = clp->cl_clientid.cl_boot; + __entry->cl_id = clp->cl_clientid.cl_id; + __entry->cb = cb; + __entry->opcode = cb->cb_ops ? cb->cb_ops->opcode : _CB_NULL; + __entry->need_restart = test_bit(NFSD4_CALLBACK_REQUEUE, &cb->cb_flags); + __assign_sockaddr(addr, &clp->cl_cb_conn.cb_addr, + clp->cl_cb_conn.cb_addrlen) + ), + TP_printk("addr=%pISpc client %08x:%08x cb=%p%s opcode=%s", + __get_sockaddr(addr), __entry->cl_boot, __entry->cl_id, __entry->cb, + __entry->need_restart ? 
" (need restart)" : " (first try)", + show_nfsd_cb_opcode(__entry->opcode) + ) +); + +#define DEFINE_NFSD_CB_LIFETIME_EVENT(name) \ +DEFINE_EVENT(nfsd_cb_lifetime_class, nfsd_cb_##name, \ + TP_PROTO( \ + const struct nfs4_client *clp, \ + const struct nfsd4_callback *cb \ + ), \ + TP_ARGS(clp, cb)) + +DEFINE_NFSD_CB_LIFETIME_EVENT(queue); +DEFINE_NFSD_CB_LIFETIME_EVENT(destroy); +DEFINE_NFSD_CB_LIFETIME_EVENT(restart); +DEFINE_NFSD_CB_LIFETIME_EVENT(bc_update); +DEFINE_NFSD_CB_LIFETIME_EVENT(bc_shutdown); + +TRACE_EVENT(nfsd_cb_seq_status, + TP_PROTO( + const struct rpc_task *task, + const struct nfsd4_callback *cb + ), + TP_ARGS(task, cb), + TP_STRUCT__entry( + __field(unsigned int, task_id) + __field(unsigned int, client_id) + __field(u32, cl_boot) + __field(u32, cl_id) + __field(u32, seqno) + __field(u32, reserved) + __field(int, tk_status) + __field(int, seq_status) + ), + TP_fast_assign( + const struct nfs4_client *clp = cb->cb_clp; + const struct nfsd4_session *session = clp->cl_cb_session; + const struct nfsd4_sessionid *sid = + (struct nfsd4_sessionid *)&session->se_sessionid; + + __entry->task_id = task->tk_pid; + __entry->client_id = task->tk_client ? + task->tk_client->cl_clid : -1; + __entry->cl_boot = sid->clientid.cl_boot; + __entry->cl_id = sid->clientid.cl_id; + __entry->seqno = sid->sequence; + __entry->reserved = sid->reserved; + __entry->tk_status = task->tk_status; + __entry->seq_status = cb->cb_seq_status; + ), + TP_printk(SUNRPC_TRACE_TASK_SPECIFIER + " sessionid=%08x:%08x:%08x:%08x tk_status=%d seq_status=%d", + __entry->task_id, __entry->client_id, + __entry->cl_boot, __entry->cl_id, + __entry->seqno, __entry->reserved, + __entry->tk_status, __entry->seq_status + ) +); + +TRACE_EVENT(nfsd_cb_free_slot, + TP_PROTO( + const struct rpc_task *task, + const struct nfsd4_callback *cb + ), + TP_ARGS(task, cb), + TP_STRUCT__entry( + __field(unsigned int, task_id) + __field(unsigned int, client_id) + __field(u32, cl_boot) + __field(u32, cl_id) + __field(u32, seqno) + __field(u32, reserved) + __field(u32, slot_seqno) + ), + TP_fast_assign( + const struct nfs4_client *clp = cb->cb_clp; + const struct nfsd4_session *session = clp->cl_cb_session; + const struct nfsd4_sessionid *sid = + (struct nfsd4_sessionid *)&session->se_sessionid; + + __entry->task_id = task->tk_pid; + __entry->client_id = task->tk_client ? 
+ task->tk_client->cl_clid : -1; + __entry->cl_boot = sid->clientid.cl_boot; + __entry->cl_id = sid->clientid.cl_id; + __entry->seqno = sid->sequence; + __entry->reserved = sid->reserved; + __entry->slot_seqno = session->se_cb_seq_nr[cb->cb_held_slot]; + ), + TP_printk(SUNRPC_TRACE_TASK_SPECIFIER + " sessionid=%08x:%08x:%08x:%08x new slot seqno=%u", + __entry->task_id, __entry->client_id, + __entry->cl_boot, __entry->cl_id, + __entry->seqno, __entry->reserved, + __entry->slot_seqno + ) +); + +TRACE_EVENT_CONDITION(nfsd_cb_recall, + TP_PROTO( + const struct nfs4_stid *stid + ), + TP_ARGS(stid), + TP_CONDITION(stid->sc_client), + TP_STRUCT__entry( + __field(u32, cl_boot) + __field(u32, cl_id) + __field(u32, si_id) + __field(u32, si_generation) + __sockaddr(addr, stid->sc_client->cl_cb_conn.cb_addrlen) + ), + TP_fast_assign( + const stateid_t *stp = &stid->sc_stateid; + const struct nfs4_client *clp = stid->sc_client; + + __entry->cl_boot = stp->si_opaque.so_clid.cl_boot; + __entry->cl_id = stp->si_opaque.so_clid.cl_id; + __entry->si_id = stp->si_opaque.so_id; + __entry->si_generation = stp->si_generation; + __assign_sockaddr(addr, &clp->cl_cb_conn.cb_addr, + clp->cl_cb_conn.cb_addrlen) + ), + TP_printk("addr=%pISpc client %08x:%08x stateid %08x:%08x", + __get_sockaddr(addr), __entry->cl_boot, __entry->cl_id, + __entry->si_id, __entry->si_generation) +); + +TRACE_EVENT(nfsd_cb_notify_lock, + TP_PROTO( + const struct nfs4_lockowner *lo, + const struct nfsd4_blocked_lock *nbl + ), + TP_ARGS(lo, nbl), + TP_STRUCT__entry( + __field(u32, cl_boot) + __field(u32, cl_id) + __field(u32, fh_hash) + __sockaddr(addr, lo->lo_owner.so_client->cl_cb_conn.cb_addrlen) + ), + TP_fast_assign( + const struct nfs4_client *clp = lo->lo_owner.so_client; + + __entry->cl_boot = clp->cl_clientid.cl_boot; + __entry->cl_id = clp->cl_clientid.cl_id; + __entry->fh_hash = knfsd_fh_hash(&nbl->nbl_fh); + __assign_sockaddr(addr, &clp->cl_cb_conn.cb_addr, + clp->cl_cb_conn.cb_addrlen) + ), + TP_printk("addr=%pISpc client %08x:%08x fh_hash=0x%08x", + __get_sockaddr(addr), __entry->cl_boot, __entry->cl_id, + __entry->fh_hash) +); + +TRACE_EVENT(nfsd_cb_offload, + TP_PROTO( + const struct nfs4_client *clp, + const stateid_t *stp, + const struct knfsd_fh *fh, + u64 count, + __be32 status + ), + TP_ARGS(clp, stp, fh, count, status), + TP_STRUCT__entry( + __field(u32, cl_boot) + __field(u32, cl_id) + __field(u32, si_id) + __field(u32, si_generation) + __field(u32, fh_hash) + __field(int, status) + __field(u64, count) + __sockaddr(addr, clp->cl_cb_conn.cb_addrlen) + ), + TP_fast_assign( + __entry->cl_boot = stp->si_opaque.so_clid.cl_boot; + __entry->cl_id = stp->si_opaque.so_clid.cl_id; + __entry->si_id = stp->si_opaque.so_id; + __entry->si_generation = stp->si_generation; + __entry->fh_hash = knfsd_fh_hash(fh); + __entry->status = be32_to_cpu(status); + __entry->count = count; + __assign_sockaddr(addr, &clp->cl_cb_conn.cb_addr, + clp->cl_cb_conn.cb_addrlen) + ), + TP_printk("addr=%pISpc client %08x:%08x stateid %08x:%08x fh_hash=0x%08x count=%llu status=%d", + __get_sockaddr(addr), __entry->cl_boot, __entry->cl_id, + __entry->si_id, __entry->si_generation, + __entry->fh_hash, __entry->count, __entry->status) +); + +TRACE_EVENT(nfsd_cb_recall_any, + TP_PROTO( + const struct nfsd4_cb_recall_any *ra + ), + TP_ARGS(ra), + TP_STRUCT__entry( + __field(u32, cl_boot) + __field(u32, cl_id) + __field(u32, keep) + __field(unsigned long, bmval0) + __sockaddr(addr, ra->ra_cb.cb_clp->cl_cb_conn.cb_addrlen) + ), + TP_fast_assign( + 
__entry->cl_boot = ra->ra_cb.cb_clp->cl_clientid.cl_boot; + __entry->cl_id = ra->ra_cb.cb_clp->cl_clientid.cl_id; + __entry->keep = ra->ra_keep; + __entry->bmval0 = ra->ra_bmval[0]; + __assign_sockaddr(addr, &ra->ra_cb.cb_clp->cl_addr, + ra->ra_cb.cb_clp->cl_cb_conn.cb_addrlen); + ), + TP_printk("addr=%pISpc client %08x:%08x keep=%u bmval0=%s", + __get_sockaddr(addr), __entry->cl_boot, __entry->cl_id, + __entry->keep, show_rca_mask(__entry->bmval0) + ) +); + +DECLARE_EVENT_CLASS(nfsd_cb_done_class, + TP_PROTO( + const stateid_t *stp, + const struct rpc_task *task + ), + TP_ARGS(stp, task), + TP_STRUCT__entry( + __field(u32, cl_boot) + __field(u32, cl_id) + __field(u32, si_id) + __field(u32, si_generation) + __field(int, status) + ), + TP_fast_assign( + __entry->cl_boot = stp->si_opaque.so_clid.cl_boot; + __entry->cl_id = stp->si_opaque.so_clid.cl_id; + __entry->si_id = stp->si_opaque.so_id; + __entry->si_generation = stp->si_generation; + __entry->status = task->tk_status; + ), + TP_printk("client %08x:%08x stateid %08x:%08x status=%d", + __entry->cl_boot, __entry->cl_id, __entry->si_id, + __entry->si_generation, __entry->status + ) +); + +#define DEFINE_NFSD_CB_DONE_EVENT(name) \ +DEFINE_EVENT(nfsd_cb_done_class, name, \ + TP_PROTO( \ + const stateid_t *stp, \ + const struct rpc_task *task \ + ), \ + TP_ARGS(stp, task)) + +DEFINE_NFSD_CB_DONE_EVENT(nfsd_cb_recall_done); +DEFINE_NFSD_CB_DONE_EVENT(nfsd_cb_notify_lock_done); +DEFINE_NFSD_CB_DONE_EVENT(nfsd_cb_layout_done); +DEFINE_NFSD_CB_DONE_EVENT(nfsd_cb_offload_done); +DEFINE_NFSD_CB_DONE_EVENT(nfsd_cb_getattr_done); + +TRACE_EVENT(nfsd_cb_recall_any_done, + TP_PROTO( + const struct nfsd4_callback *cb, + const struct rpc_task *task + ), + TP_ARGS(cb, task), + TP_STRUCT__entry( + __field(u32, cl_boot) + __field(u32, cl_id) + __field(int, status) + ), + TP_fast_assign( + __entry->status = task->tk_status; + __entry->cl_boot = cb->cb_clp->cl_clientid.cl_boot; + __entry->cl_id = cb->cb_clp->cl_clientid.cl_id; + ), + TP_printk("client %08x:%08x status=%d", + __entry->cl_boot, __entry->cl_id, __entry->status + ) +); + +TRACE_EVENT(nfsd_ctl_unlock_ip, + TP_PROTO( + const struct net *net, + const char *address + ), + TP_ARGS(net, address), + TP_STRUCT__entry( + __field(unsigned int, netns_ino) + __string(address, address) + ), + TP_fast_assign( + __entry->netns_ino = net->ns.inum; + __assign_str(address); + ), + TP_printk("address=%s", + __get_str(address) + ) +); + +TRACE_EVENT(nfsd_ctl_unlock_fs, + TP_PROTO( + const struct net *net, + const char *path + ), + TP_ARGS(net, path), + TP_STRUCT__entry( + __field(unsigned int, netns_ino) + __string(path, path) + ), + TP_fast_assign( + __entry->netns_ino = net->ns.inum; + __assign_str(path); + ), + TP_printk("path=%s", + __get_str(path) + ) +); + +TRACE_EVENT(nfsd_ctl_filehandle, + TP_PROTO( + const struct net *net, + const char *domain, + const char *path, + int maxsize + ), + TP_ARGS(net, domain, path, maxsize), + TP_STRUCT__entry( + __field(unsigned int, netns_ino) + __field(int, maxsize) + __string(domain, domain) + __string(path, path) + ), + TP_fast_assign( + __entry->netns_ino = net->ns.inum; + __entry->maxsize = maxsize; + __assign_str(domain); + __assign_str(path); + ), + TP_printk("domain=%s path=%s maxsize=%d", + __get_str(domain), __get_str(path), __entry->maxsize + ) +); + +TRACE_EVENT(nfsd_ctl_threads, + TP_PROTO( + const struct net *net, + int newthreads + ), + TP_ARGS(net, newthreads), + TP_STRUCT__entry( + __field(unsigned int, netns_ino) + __field(int, newthreads) + ), + 
TP_fast_assign( + __entry->netns_ino = net->ns.inum; + __entry->newthreads = newthreads; + ), + TP_printk("newthreads=%d", + __entry->newthreads + ) +); + +TRACE_EVENT(nfsd_ctl_pool_threads, + TP_PROTO( + const struct net *net, + int pool, + int nrthreads + ), + TP_ARGS(net, pool, nrthreads), + TP_STRUCT__entry( + __field(unsigned int, netns_ino) + __field(int, pool) + __field(int, nrthreads) + ), + TP_fast_assign( + __entry->netns_ino = net->ns.inum; + __entry->pool = pool; + __entry->nrthreads = nrthreads; + ), + TP_printk("pool=%d nrthreads=%d", + __entry->pool, __entry->nrthreads + ) +); + +TRACE_EVENT(nfsd_ctl_version, + TP_PROTO( + const struct net *net, + const char *mesg + ), + TP_ARGS(net, mesg), + TP_STRUCT__entry( + __field(unsigned int, netns_ino) + __string(mesg, mesg) + ), + TP_fast_assign( + __entry->netns_ino = net->ns.inum; + __assign_str(mesg); + ), + TP_printk("%s", + __get_str(mesg) + ) +); + +TRACE_EVENT(nfsd_ctl_ports_addfd, + TP_PROTO( + const struct net *net, + int fd + ), + TP_ARGS(net, fd), + TP_STRUCT__entry( + __field(unsigned int, netns_ino) + __field(int, fd) + ), + TP_fast_assign( + __entry->netns_ino = net->ns.inum; + __entry->fd = fd; + ), + TP_printk("fd=%d", + __entry->fd + ) +); + +TRACE_EVENT(nfsd_ctl_ports_addxprt, + TP_PROTO( + const struct net *net, + const char *transport, + int port + ), + TP_ARGS(net, transport, port), + TP_STRUCT__entry( + __field(unsigned int, netns_ino) + __field(int, port) + __string(transport, transport) + ), + TP_fast_assign( + __entry->netns_ino = net->ns.inum; + __entry->port = port; + __assign_str(transport); + ), + TP_printk("transport=%s port=%d", + __get_str(transport), __entry->port + ) +); + +TRACE_EVENT(nfsd_ctl_maxblksize, + TP_PROTO( + const struct net *net, + int bsize + ), + TP_ARGS(net, bsize), + TP_STRUCT__entry( + __field(unsigned int, netns_ino) + __field(int, bsize) + ), + TP_fast_assign( + __entry->netns_ino = net->ns.inum; + __entry->bsize = bsize; + ), + TP_printk("bsize=%d", + __entry->bsize + ) +); + +TRACE_EVENT(nfsd_ctl_time, + TP_PROTO( + const struct net *net, + const char *name, + size_t namelen, + int time + ), + TP_ARGS(net, name, namelen, time), + TP_STRUCT__entry( + __field(unsigned int, netns_ino) + __field(int, time) + __string_len(name, name, namelen) + ), + TP_fast_assign( + __entry->netns_ino = net->ns.inum; + __entry->time = time; + __assign_str(name); + ), + TP_printk("file=%s time=%d", + __get_str(name), __entry->time + ) +); + +TRACE_EVENT(nfsd_ctl_recoverydir, + TP_PROTO( + const struct net *net, + const char *recdir + ), + TP_ARGS(net, recdir), + TP_STRUCT__entry( + __field(unsigned int, netns_ino) + __string(recdir, recdir) + ), + TP_fast_assign( + __entry->netns_ino = net->ns.inum; + __assign_str(recdir); + ), + TP_printk("recdir=%s", + __get_str(recdir) + ) +); + +TRACE_EVENT(nfsd_end_grace, + TP_PROTO( + const struct net *net + ), + TP_ARGS(net), + TP_STRUCT__entry( + __field(unsigned int, netns_ino) + ), + TP_fast_assign( + __entry->netns_ino = net->ns.inum; + ), + TP_printk("nn=%d", __entry->netns_ino + ) +); + +DECLARE_EVENT_CLASS(nfsd_copy_class, + TP_PROTO( + const struct nfsd4_copy *copy + ), + TP_ARGS(copy), + TP_STRUCT__entry( + __field(bool, intra) + __field(bool, async) + __field(u32, src_cl_boot) + __field(u32, src_cl_id) + __field(u32, src_so_id) + __field(u32, src_si_generation) + __field(u32, dst_cl_boot) + __field(u32, dst_cl_id) + __field(u32, dst_so_id) + __field(u32, dst_si_generation) + __field(u32, cb_cl_boot) + __field(u32, cb_cl_id) + __field(u32, cb_so_id) 
+ __field(u32, cb_si_generation) + __field(u64, src_cp_pos) + __field(u64, dst_cp_pos) + __field(u64, cp_count) + __sockaddr(addr, sizeof(struct sockaddr_in6)) + ), + TP_fast_assign( + const stateid_t *src_stp = &copy->cp_src_stateid; + const stateid_t *dst_stp = &copy->cp_dst_stateid; + const stateid_t *cb_stp = &copy->cp_res.cb_stateid; + + __entry->intra = test_bit(NFSD4_COPY_F_INTRA, &copy->cp_flags); + __entry->async = !test_bit(NFSD4_COPY_F_SYNCHRONOUS, &copy->cp_flags); + __entry->src_cl_boot = src_stp->si_opaque.so_clid.cl_boot; + __entry->src_cl_id = src_stp->si_opaque.so_clid.cl_id; + __entry->src_so_id = src_stp->si_opaque.so_id; + __entry->src_si_generation = src_stp->si_generation; + __entry->dst_cl_boot = dst_stp->si_opaque.so_clid.cl_boot; + __entry->dst_cl_id = dst_stp->si_opaque.so_clid.cl_id; + __entry->dst_so_id = dst_stp->si_opaque.so_id; + __entry->dst_si_generation = dst_stp->si_generation; + __entry->cb_cl_boot = cb_stp->si_opaque.so_clid.cl_boot; + __entry->cb_cl_id = cb_stp->si_opaque.so_clid.cl_id; + __entry->cb_so_id = cb_stp->si_opaque.so_id; + __entry->cb_si_generation = cb_stp->si_generation; + __entry->src_cp_pos = copy->cp_src_pos; + __entry->dst_cp_pos = copy->cp_dst_pos; + __entry->cp_count = copy->cp_count; + __assign_sockaddr(addr, &copy->cp_clp->cl_addr, + sizeof(struct sockaddr_in6)); + ), + TP_printk("client=%pISpc intra=%d async=%d " + "src_client %08x:%08x src_stateid %08x:%08x " + "dst_client %08x:%08x dst_stateid %08x:%08x " + "cb_client %08x:%08x cb_stateid %08x:%08x " + "cp_src_pos=%llu cp_dst_pos=%llu cp_count=%llu", + __get_sockaddr(addr), __entry->intra, __entry->async, + __entry->src_cl_boot, __entry->src_cl_id, + __entry->src_so_id, __entry->src_si_generation, + __entry->dst_cl_boot, __entry->dst_cl_id, + __entry->dst_so_id, __entry->dst_si_generation, + __entry->cb_cl_boot, __entry->cb_cl_id, + __entry->cb_so_id, __entry->cb_si_generation, + __entry->src_cp_pos, __entry->dst_cp_pos, __entry->cp_count + ) +); + +#define DEFINE_COPY_EVENT(name) \ +DEFINE_EVENT(nfsd_copy_class, nfsd_copy_##name, \ + TP_PROTO(const struct nfsd4_copy *copy), \ + TP_ARGS(copy)) + +DEFINE_COPY_EVENT(inter); +DEFINE_COPY_EVENT(intra); +DEFINE_COPY_EVENT(async); + +TRACE_EVENT(nfsd_copy_done, + TP_PROTO( + const struct nfsd4_copy *copy, + __be32 status + ), + TP_ARGS(copy, status), + TP_STRUCT__entry( + __field(int, status) + __field(bool, intra) + __field(bool, async) + __sockaddr(addr, sizeof(struct sockaddr_in6)) + ), + TP_fast_assign( + __entry->status = be32_to_cpu(status); + __entry->intra = test_bit(NFSD4_COPY_F_INTRA, &copy->cp_flags); + __entry->async = !test_bit(NFSD4_COPY_F_SYNCHRONOUS, &copy->cp_flags); + __assign_sockaddr(addr, &copy->cp_clp->cl_addr, + sizeof(struct sockaddr_in6)); + ), + TP_printk("addr=%pISpc status=%d intra=%d async=%d", + __get_sockaddr(addr), __entry->status, __entry->intra, __entry->async + ) +); + +DECLARE_EVENT_CLASS(nfsd_copy_async_done_class, + TP_PROTO( + const struct nfsd4_copy *copy + ), + TP_ARGS(copy), + TP_STRUCT__entry( + __field(int, status) + __field(bool, intra) + __field(bool, async) + __field(u32, src_cl_boot) + __field(u32, src_cl_id) + __field(u32, src_so_id) + __field(u32, src_si_generation) + __field(u32, dst_cl_boot) + __field(u32, dst_cl_id) + __field(u32, dst_so_id) + __field(u32, dst_si_generation) + __field(u32, cb_cl_boot) + __field(u32, cb_cl_id) + __field(u32, cb_so_id) 
TP_fast_assign( + const stateid_t *src_stp = &copy->cp_src_stateid; + const stateid_t *dst_stp = &copy->cp_dst_stateid; + const stateid_t *cb_stp = &copy->cp_res.cb_stateid; + + __entry->status = be32_to_cpu(copy->nfserr); + __entry->intra = test_bit(NFSD4_COPY_F_INTRA, &copy->cp_flags); + __entry->async = !test_bit(NFSD4_COPY_F_SYNCHRONOUS, &copy->cp_flags); + __entry->src_cl_boot = src_stp->si_opaque.so_clid.cl_boot; + __entry->src_cl_id = src_stp->si_opaque.so_clid.cl_id; + __entry->src_so_id = src_stp->si_opaque.so_id; + __entry->src_si_generation = src_stp->si_generation; + __entry->dst_cl_boot = dst_stp->si_opaque.so_clid.cl_boot; + __entry->dst_cl_id = dst_stp->si_opaque.so_clid.cl_id; + __entry->dst_so_id = dst_stp->si_opaque.so_id; + __entry->dst_si_generation = dst_stp->si_generation; + __entry->cb_cl_boot = cb_stp->si_opaque.so_clid.cl_boot; + __entry->cb_cl_id = cb_stp->si_opaque.so_clid.cl_id; + __entry->cb_so_id = cb_stp->si_opaque.so_id; + __entry->cb_si_generation = cb_stp->si_generation; + __entry->src_cp_pos = copy->cp_src_pos; + __entry->dst_cp_pos = copy->cp_dst_pos; + __entry->cp_count = copy->cp_count; + __assign_sockaddr(addr, &copy->cp_clp->cl_addr, + sizeof(struct sockaddr_in6)); + ), + TP_printk("client=%pISpc status=%d intra=%d async=%d " + "src_client %08x:%08x src_stateid %08x:%08x " + "dst_client %08x:%08x dst_stateid %08x:%08x " + "cb_client %08x:%08x cb_stateid %08x:%08x " + "cp_src_pos=%llu cp_dst_pos=%llu cp_count=%llu", + __get_sockaddr(addr), + __entry->status, __entry->intra, __entry->async, + __entry->src_cl_boot, __entry->src_cl_id, + __entry->src_so_id, __entry->src_si_generation, + __entry->dst_cl_boot, __entry->dst_cl_id, + __entry->dst_so_id, __entry->dst_si_generation, + __entry->cb_cl_boot, __entry->cb_cl_id, + __entry->cb_so_id, __entry->cb_si_generation, + __entry->src_cp_pos, __entry->dst_cp_pos, __entry->cp_count + ) +); + +#define DEFINE_COPY_ASYNC_DONE_EVENT(name) \ +DEFINE_EVENT(nfsd_copy_async_done_class, \ + nfsd_copy_async_##name, \ + TP_PROTO(const struct nfsd4_copy *copy), \ + TP_ARGS(copy)) + +DEFINE_COPY_ASYNC_DONE_EVENT(done); +DEFINE_COPY_ASYNC_DONE_EVENT(cancel); + +TRACE_EVENT(nfsd_vfs_setattr, + TP_PROTO( + const struct svc_rqst *rqstp, + const struct svc_fh *fhp, + const struct iattr *iap, + const struct timespec64 *guardtime + ), + TP_ARGS(rqstp, fhp, iap, guardtime), + TP_STRUCT__entry( + NFSD_TRACE_PROC_CALL_FIELDS(rqstp) + __field(u32, fh_hash) + __field(s64, gtime_tv_sec) + __field(u32, gtime_tv_nsec) + __field(unsigned int, ia_valid) + __field(loff_t, ia_size) + __field(uid_t, ia_uid) + __field(gid_t, ia_gid) + __field(umode_t, ia_mode) + ), + TP_fast_assign( + NFSD_TRACE_PROC_CALL_ASSIGNMENTS(rqstp); + __entry->fh_hash = knfsd_fh_hash(&fhp->fh_handle); + __entry->gtime_tv_sec = guardtime ? guardtime->tv_sec : 0; + __entry->gtime_tv_nsec = guardtime ? 
guardtime->tv_nsec : 0; + __entry->ia_valid = iap->ia_valid; + __entry->ia_size = iap->ia_size; + __entry->ia_uid = __kuid_val(iap->ia_uid); + __entry->ia_gid = __kgid_val(iap->ia_gid); + __entry->ia_mode = iap->ia_mode; + ), + TP_printk( + "xid=0x%08x fh_hash=0x%08x ia_valid=%s ia_size=%llu ia_mode=0%o ia_uid=%u ia_gid=%u guard_time=%lld.%u", + __entry->xid, __entry->fh_hash, show_ia_valid_flags(__entry->ia_valid), + __entry->ia_size, __entry->ia_mode, __entry->ia_uid, __entry->ia_gid, + __entry->gtime_tv_sec, __entry->gtime_tv_nsec + ) +) + +TRACE_EVENT(nfsd_vfs_lookup, + TP_PROTO( + const struct svc_rqst *rqstp, + const struct svc_fh *fhp, + const char *name, + unsigned int len + ), + TP_ARGS(rqstp, fhp, name, len), + TP_STRUCT__entry( + NFSD_TRACE_PROC_CALL_FIELDS(rqstp) + __field(u32, fh_hash) + __string_len(name, name, len) + ), + TP_fast_assign( + NFSD_TRACE_PROC_CALL_ASSIGNMENTS(rqstp); + __entry->fh_hash = knfsd_fh_hash(&fhp->fh_handle); + __assign_str(name); + ), + TP_printk("xid=0x%08x fh_hash=0x%08x name=%s", + __entry->xid, __entry->fh_hash, __get_str(name) + ) +); + +TRACE_EVENT(nfsd_vfs_create, + TP_PROTO( + const struct svc_rqst *rqstp, + const struct svc_fh *fhp, + umode_t type, + const char *name, + unsigned int len + ), + TP_ARGS(rqstp, fhp, type, name, len), + TP_STRUCT__entry( + NFSD_TRACE_PROC_CALL_FIELDS(rqstp) + __field(u32, fh_hash) + __field(umode_t, type) + __string_len(name, name, len) + ), + TP_fast_assign( + NFSD_TRACE_PROC_CALL_ASSIGNMENTS(rqstp); + __entry->fh_hash = knfsd_fh_hash(&fhp->fh_handle); + __entry->type = type; + __assign_str(name); + ), + TP_printk("xid=0x%08x fh_hash=0x%08x type=%s name=%s", + __entry->xid, __entry->fh_hash, + show_fs_file_type(__entry->type), __get_str(name) + ) +); + +TRACE_EVENT(nfsd_vfs_symlink, + TP_PROTO( + const struct svc_rqst *rqstp, + const struct svc_fh *fhp, + const char *name, + unsigned int namelen, + const char *target + ), + TP_ARGS(rqstp, fhp, name, namelen, target), + TP_STRUCT__entry( + NFSD_TRACE_PROC_CALL_FIELDS(rqstp) + __field(u32, fh_hash) + __string_len(name, name, namelen) + __string(target, target) + ), + TP_fast_assign( + NFSD_TRACE_PROC_CALL_ASSIGNMENTS(rqstp); + __entry->fh_hash = knfsd_fh_hash(&fhp->fh_handle); + __assign_str(name); + __assign_str(target); + ), + TP_printk("xid=0x%08x fh_hash=0x%08x name=%s target=%s", + __entry->xid, __entry->fh_hash, + __get_str(name), __get_str(target) + ) +); + +TRACE_EVENT(nfsd_vfs_link, + TP_PROTO( + const struct svc_rqst *rqstp, + const struct svc_fh *sfhp, + const struct svc_fh *tfhp, + const char *name, + unsigned int namelen + ), + TP_ARGS(rqstp, sfhp, tfhp, name, namelen), + TP_STRUCT__entry( + NFSD_TRACE_PROC_CALL_FIELDS(rqstp) + __field(u32, sfh_hash) + __field(u32, tfh_hash) + __string_len(name, name, namelen) + ), + TP_fast_assign( + NFSD_TRACE_PROC_CALL_ASSIGNMENTS(rqstp); + __entry->sfh_hash = knfsd_fh_hash(&sfhp->fh_handle); + __entry->tfh_hash = knfsd_fh_hash(&tfhp->fh_handle); + __assign_str(name); + ), + TP_printk("xid=0x%08x src_fh=0x%08x tgt_fh=0x%08x name=%s", + __entry->xid, __entry->sfh_hash, __entry->tfh_hash, + __get_str(name) + ) +); + +TRACE_EVENT(nfsd_vfs_unlink, + TP_PROTO( + const struct svc_rqst *rqstp, + const struct svc_fh *fhp, + const char *name, + unsigned int len + ), + TP_ARGS(rqstp, fhp, name, len), + TP_STRUCT__entry( + NFSD_TRACE_PROC_CALL_FIELDS(rqstp) + __field(u32, fh_hash) + __string_len(name, name, len) + ), + TP_fast_assign( + NFSD_TRACE_PROC_CALL_ASSIGNMENTS(rqstp); + __entry->fh_hash = 
knfsd_fh_hash(&fhp->fh_handle); + __assign_str(name); + ), + TP_printk("xid=0x%08x fh_hash=0x%08x name=%s", + __entry->xid, __entry->fh_hash, + __get_str(name) + ) +); + +TRACE_EVENT(nfsd_vfs_rename, + TP_PROTO( + const struct svc_rqst *rqstp, + const struct svc_fh *sfhp, + const struct svc_fh *tfhp, + const char *source, + unsigned int sourcelen, + const char *target, + unsigned int targetlen + ), + TP_ARGS(rqstp, sfhp, tfhp, source, sourcelen, target, targetlen), + TP_STRUCT__entry( + NFSD_TRACE_PROC_CALL_FIELDS(rqstp) + __field(u32, sfh_hash) + __field(u32, tfh_hash) + __string_len(source, source, sourcelen) + __string_len(target, target, targetlen) + ), + TP_fast_assign( + NFSD_TRACE_PROC_CALL_ASSIGNMENTS(rqstp); + __entry->sfh_hash = knfsd_fh_hash(&sfhp->fh_handle); + __entry->tfh_hash = knfsd_fh_hash(&tfhp->fh_handle); + __assign_str(source); + __assign_str(target); + ), + TP_printk("xid=0x%08x sfh_hash=0x%08x tfh_hash=0x%08x source=%s target=%s", + __entry->xid, __entry->sfh_hash, __entry->tfh_hash, + __get_str(source), __get_str(target) + ) +); + +TRACE_EVENT(nfsd_vfs_readdir, + TP_PROTO( + const struct svc_rqst *rqstp, + const struct svc_fh *fhp, + u32 count, + u64 offset + ), + TP_ARGS(rqstp, fhp, count, offset), + TP_STRUCT__entry( + NFSD_TRACE_PROC_CALL_FIELDS(rqstp) + __field(u32, fh_hash) + __field(u32, count) + __field(u64, offset) + ), + TP_fast_assign( + NFSD_TRACE_PROC_CALL_ASSIGNMENTS(rqstp); + __entry->fh_hash = knfsd_fh_hash(&fhp->fh_handle); + __entry->count = count; + __entry->offset = offset; + ), + TP_printk("xid=0x%08x fh_hash=0x%08x offset=%llu count=%u", + __entry->xid, __entry->fh_hash, + __entry->offset, __entry->count + ) +); + +DECLARE_EVENT_CLASS(nfsd_vfs_getattr_class, + TP_PROTO( + const struct svc_rqst *rqstp, + const struct svc_fh *fhp + ), + TP_ARGS(rqstp, fhp), + TP_STRUCT__entry( + NFSD_TRACE_PROC_CALL_FIELDS(rqstp) + __field(u32, fh_hash) + ), + TP_fast_assign( + NFSD_TRACE_PROC_CALL_ASSIGNMENTS(rqstp); + __entry->fh_hash = knfsd_fh_hash(&fhp->fh_handle); + ), + TP_printk("xid=0x%08x fh_hash=0x%08x", + __entry->xid, __entry->fh_hash + ) +); + +#define DEFINE_NFSD_VFS_GETATTR_EVENT(__name) \ +DEFINE_EVENT(nfsd_vfs_getattr_class, __name, \ + TP_PROTO( \ + const struct svc_rqst *rqstp, \ + const struct svc_fh *fhp \ + ), \ + TP_ARGS(rqstp, fhp)) + +DEFINE_NFSD_VFS_GETATTR_EVENT(nfsd_vfs_getattr); +DEFINE_NFSD_VFS_GETATTR_EVENT(nfsd_vfs_statfs); + +DECLARE_EVENT_CLASS(nfsd_pnfs_class, + TP_PROTO( + const struct nfs4_client *clp, + const char *dev, + int error + ), + TP_ARGS(clp, dev, error), + TP_STRUCT__entry( + __sockaddr(addr, sizeof(struct sockaddr_in6)) + __field(unsigned int, netns_ino) + __string(dev, dev) + __field(int, error) + ), + TP_fast_assign( + __assign_sockaddr(addr, &clp->cl_addr, + sizeof(struct sockaddr_in6)); + __entry->netns_ino = clp->net->ns.inum; + __assign_str(dev); + __entry->error = error; + ), + TP_printk("client=%pISpc nn=%d dev=%s error=%d", + __get_sockaddr(addr), + __entry->netns_ino, + __get_str(dev), + __entry->error + ) +); + +#define DEFINE_NFSD_PNFS_ERR_EVENT(name) \ +DEFINE_EVENT(nfsd_pnfs_class, nfsd_pnfs_##name, \ + TP_PROTO( \ + const struct nfs4_client *clp, \ + const char *dev, \ + int error \ + ), \ + TP_ARGS(clp, dev, error)) + +DEFINE_NFSD_PNFS_ERR_EVENT(fence); #endif /* _NFSD_TRACE_H */ #undef TRACE_INCLUDE_PATH diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 9824e32b2f23..964cf922ad83 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -25,56 +25,95 @@ #include <linux/posix_acl_xattr.h> #include 
<linux/xattr.h> #include <linux/jhash.h> -#include <linux/ima.h> +#include <linux/pagemap.h> #include <linux/slab.h> #include <linux/uaccess.h> #include <linux/exportfs.h> #include <linux/writeback.h> #include <linux/security.h> +#include <linux/sunrpc/xdr.h> -#ifdef CONFIG_NFSD_V3 #include "xdr3.h" -#endif /* CONFIG_NFSD_V3 */ #ifdef CONFIG_NFSD_V4 -#include "../internal.h" #include "acl.h" #include "idmap.h" +#include "xdr4.h" #endif /* CONFIG_NFSD_V4 */ #include "nfsd.h" #include "vfs.h" +#include "filecache.h" #include "trace.h" #define NFSDDBG_FACILITY NFSDDBG_FILEOP +bool nfsd_disable_splice_read __read_mostly; +u64 nfsd_io_cache_read __read_mostly = NFSD_IO_BUFFERED; +u64 nfsd_io_cache_write __read_mostly = NFSD_IO_BUFFERED; -/* - * This is a cache of readahead params that help us choose the proper - * readahead strategy. Initially, we set all readahead parameters to 0 - * and let the VFS handle things. - * If you increase the number of cached files very much, you'll need to - * add a hash table here. +/** + * nfserrno - Map Linux errnos to NFS errnos + * @errno: POSIX(-ish) error code to be mapped + * + * Returns the appropriate (net-endian) nfserr_* (or nfs_ok if errno is 0). If + * it's an error we don't expect, log it once and return nfserr_io. */ -struct raparms { - struct raparms *p_next; - unsigned int p_count; - ino_t p_ino; - dev_t p_dev; - int p_set; - struct file_ra_state p_ra; - unsigned int p_hindex; -}; - -struct raparm_hbucket { - struct raparms *pb_head; - spinlock_t pb_lock; -} ____cacheline_aligned_in_smp; +__be32 +nfserrno (int errno) +{ + static struct { + __be32 nfserr; + int syserr; + } nfs_errtbl[] = { + { nfs_ok, 0 }, + { nfserr_perm, -EPERM }, + { nfserr_noent, -ENOENT }, + { nfserr_io, -EIO }, + { nfserr_nxio, -ENXIO }, + { nfserr_fbig, -E2BIG }, + { nfserr_stale, -EBADF }, + { nfserr_acces, -EACCES }, + { nfserr_exist, -EEXIST }, + { nfserr_xdev, -EXDEV }, + { nfserr_nodev, -ENODEV }, + { nfserr_notdir, -ENOTDIR }, + { nfserr_isdir, -EISDIR }, + { nfserr_inval, -EINVAL }, + { nfserr_fbig, -EFBIG }, + { nfserr_nospc, -ENOSPC }, + { nfserr_rofs, -EROFS }, + { nfserr_mlink, -EMLINK }, + { nfserr_nametoolong, -ENAMETOOLONG }, + { nfserr_notempty, -ENOTEMPTY }, + { nfserr_dquot, -EDQUOT }, + { nfserr_stale, -ESTALE }, + { nfserr_jukebox, -ETIMEDOUT }, + { nfserr_jukebox, -ERESTARTSYS }, + { nfserr_jukebox, -EAGAIN }, + { nfserr_jukebox, -EWOULDBLOCK }, + { nfserr_jukebox, -ENOMEM }, + { nfserr_io, -ETXTBSY }, + { nfserr_notsupp, -EOPNOTSUPP }, + { nfserr_toosmall, -ETOOSMALL }, + { nfserr_serverfault, -ESERVERFAULT }, + { nfserr_serverfault, -ENFILE }, + { nfserr_io, -EREMOTEIO }, + { nfserr_stale, -EOPENSTALE }, + { nfserr_io, -EUCLEAN }, + { nfserr_perm, -ENOKEY }, + { nfserr_no_grace, -ENOGRACE}, + { nfserr_io, -EBADMSG }, + }; + int i; -#define RAPARM_HASH_BITS 4 -#define RAPARM_HASH_SIZE (1<<RAPARM_HASH_BITS) -#define RAPARM_HASH_MASK (RAPARM_HASH_SIZE-1) -static struct raparm_hbucket raparm_hash[RAPARM_HASH_SIZE]; + for (i = 0; i < ARRAY_SIZE(nfs_errtbl); i++) { + if (nfs_errtbl[i].syserr == errno) + return nfs_errtbl[i].nfserr; + } + WARN_ONCE(1, "nfsd: non-standard errno: %d\n", errno); + return nfserr_io; +} /* * Called from nfsd_lookup and encode_dirent. 
Check if we have crossed @@ -90,9 +129,13 @@ nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp, struct dentry *dentry = *dpp; struct path path = {.mnt = mntget(exp->ex_path.mnt), .dentry = dget(dentry)}; + unsigned int follow_flags = 0; int err = 0; - err = follow_down(&path); + if (exp->ex_flags & NFSEXP_CROSSMOUNT) + follow_flags = LOOKUP_AUTOMOUNT; + + err = follow_down(&path, follow_flags); if (err < 0) goto out; if (path.mnt == exp->ex_path.mnt && path.dentry == dentry && @@ -187,7 +230,7 @@ int nfsd_mountpoint(struct dentry *dentry, struct svc_export *exp) return 1; if (nfsd4_is_junction(dentry)) return 1; - if (d_mountpoint(dentry)) + if (d_managed(dentry)) /* * Might only be a mountpoint in a different namespace, * but we need to check. @@ -206,7 +249,7 @@ nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp, struct dentry *dentry; int host_err; - dprintk("nfsd: nfsd_lookup(fh %s, %.*s)\n", SVCFH_fmt(fhp), len,name); + trace_nfsd_vfs_lookup(rqstp, fhp, name, len); dparent = fhp->fh_dentry; exp = exp_get(fhp->fh_export); @@ -226,27 +269,14 @@ nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp, goto out_nfserr; } } else { - /* - * In the nfsd4_open() case, this may be held across - * subsequent open and delegation acquisition which may - * need to take the child's i_mutex: - */ - fh_lock_nested(fhp, I_MUTEX_PARENT); - dentry = lookup_one_len(name, dparent, len); + dentry = lookup_one_unlocked(&nop_mnt_idmap, + &QSTR_LEN(name, len), dparent); host_err = PTR_ERR(dentry); if (IS_ERR(dentry)) goto out_nfserr; if (nfsd_mountpoint(dentry, exp)) { - /* - * We don't need the i_mutex after all. It's - * still possible we could open this (regular - * files can be mountpoints too), but the - * i_mutex is just there to prevent renames of - * something that we might be about to delegate, - * and a mountpoint won't be renamed: - */ - fh_unlock(fhp); - if ((host_err = nfsd_cross_mnt(rqstp, &dentry, &exp))) { + host_err = nfsd_cross_mnt(rqstp, &dentry, &exp); + if (host_err) { dput(dentry); goto out_nfserr; } @@ -261,7 +291,15 @@ out_nfserr: return nfserrno(host_err); } -/* +/** + * nfsd_lookup - look up a single path component for nfsd + * + * @rqstp: the request context + * @fhp: the file handle of the directory + * @name: the component name, or %NULL to look up parent + * @len: length of name to examine + * @resfh: pointer to pre-initialised filehandle to hold result. + * * Look up one component of a pathname. * N.B. After this call _both_ fhp and resfh need an fh_put * @@ -271,11 +309,11 @@ out_nfserr: * returned. Otherwise the covered directory is returned. 
* NOTE: this mountpoint crossing is not supported properly by all * clients and is explicitly disallowed for NFSv3 - * NeilBrown <neilb@cse.unsw.edu.au> + * */ __be32 nfsd_lookup(struct svc_rqst *rqstp, struct svc_fh *fhp, const char *name, - unsigned int len, struct svc_fh *resfh) + unsigned int len, struct svc_fh *resfh) { struct svc_export *exp; struct dentry *dentry; @@ -287,7 +325,7 @@ nfsd_lookup(struct svc_rqst *rqstp, struct svc_fh *fhp, const char *name, err = nfsd_lookup_dentry(rqstp, fhp, name, len, &exp, &dentry); if (err) return err; - err = check_nfsd_access(exp, rqstp); + err = check_nfsd_access(exp, rqstp, false); if (err) goto out; /* @@ -303,23 +341,47 @@ out: return err; } +static void +commit_reset_write_verifier(struct nfsd_net *nn, struct svc_rqst *rqstp, + int err) +{ + switch (err) { + case -EAGAIN: + case -ESTALE: + /* + * Neither of these are the result of a problem with + * durable storage, so avoid a write verifier reset. + */ + break; + default: + nfsd_reset_write_verifier(nn); + trace_nfsd_writeverf_reset(nn, rqstp, err); + } +} + /* * Commit metadata changes to stable storage. */ static int -commit_metadata(struct svc_fh *fhp) +commit_inode_metadata(struct inode *inode) { - struct inode *inode = d_inode(fhp->fh_dentry); const struct export_operations *export_ops = inode->i_sb->s_export_op; - if (!EX_ISSYNC(fhp->fh_export)) - return 0; - if (export_ops->commit_metadata) return export_ops->commit_metadata(inode); return sync_inode_metadata(inode, 1); } +static int +commit_metadata(struct svc_fh *fhp) +{ + struct inode *inode = d_inode(fhp->fh_dentry); + + if (!EX_ISSYNC(fhp->fh_export)) + return 0; + return commit_inode_metadata(inode); +} + /* * Go over the attributes and take care of the small differences between * NFS semantics and what Linux expects. @@ -327,6 +389,10 @@ commit_metadata(struct svc_fh *fhp) static void nfsd_sanitize_attrs(struct inode *inode, struct iattr *iap) { + /* Ignore mode updates on symlinks */ + if (S_ISLNK(inode->i_mode)) + iap->ia_valid &= ~ATTR_MODE; + /* sanitize the mode change */ if (iap->ia_valid & ATTR_MODE) { iap->ia_mode &= S_IALLUGO; @@ -344,7 +410,9 @@ nfsd_sanitize_attrs(struct inode *inode, struct iattr *iap) iap->ia_mode &= ~S_ISGID; } else { /* set ATTR_KILL_* bits and let VFS handle it */ - iap->ia_valid |= (ATTR_KILL_SUID | ATTR_KILL_SGID); + iap->ia_valid |= ATTR_KILL_SUID; + iap->ia_valid |= + setattr_should_drop_sgid(&nop_mnt_idmap, inode); } } } @@ -354,47 +422,98 @@ nfsd_get_write_access(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap) { struct inode *inode = d_inode(fhp->fh_dentry); - int host_err; if (iap->ia_size < inode->i_size) { __be32 err; - err = nfsd_permission(rqstp, fhp->fh_export, fhp->fh_dentry, - NFSD_MAY_TRUNC | NFSD_MAY_OWNER_OVERRIDE); + err = nfsd_permission(&rqstp->rq_cred, + fhp->fh_export, fhp->fh_dentry, + NFSD_MAY_TRUNC | NFSD_MAY_OWNER_OVERRIDE); if (err) return err; } + return nfserrno(get_write_access(inode)); +} + +static int __nfsd_setattr(struct dentry *dentry, struct iattr *iap) +{ + int host_err; + + if (iap->ia_valid & ATTR_SIZE) { + /* + * RFC5661, Section 18.30.4: + * Changing the size of a file with SETATTR indirectly + * changes the time_modify and change attributes. 
+ * + * (and similar for the older RFCs) + */ + struct iattr size_attr = { + .ia_valid = ATTR_SIZE | ATTR_CTIME | ATTR_MTIME, + .ia_size = iap->ia_size, + }; + + if (iap->ia_size < 0) + return -EFBIG; + + host_err = notify_change(&nop_mnt_idmap, dentry, &size_attr, NULL); + if (host_err) + return host_err; + iap->ia_valid &= ~ATTR_SIZE; + + /* + * Avoid the additional setattr call below if the only other + * attribute that the client sends is the mtime, as we update + * it as part of the size change above. + */ + if ((iap->ia_valid & ~ATTR_MTIME) == 0) + return 0; + } - host_err = get_write_access(inode); - if (host_err) - goto out_nfserrno; + if ((iap->ia_valid & ~ATTR_DELEG) == 0) + return 0; - host_err = locks_verify_truncate(inode, NULL, iap->ia_size); - if (host_err) - goto out_put_write_access; - return 0; + /* + * If ATTR_DELEG is set, then this is an update from a client that + * holds a delegation. If this is an update for only the atime, the + * ctime should not be changed. If the update contains the mtime + * too, then ATTR_CTIME should already be set. + */ + if (!(iap->ia_valid & ATTR_DELEG)) + iap->ia_valid |= ATTR_CTIME; -out_put_write_access: - put_write_access(inode); -out_nfserrno: - return nfserrno(host_err); + return notify_change(&nop_mnt_idmap, dentry, iap, NULL); } -/* - * Set various file attributes. After this call fhp needs an fh_put. +/** + * nfsd_setattr - Set various file attributes. + * @rqstp: controlling RPC transaction + * @fhp: filehandle of target + * @attr: attributes to set + * @guardtime: do not act if ctime.tv_sec does not match this timestamp + * + * This call may adjust the contents of @attr (in particular, this + * call may change the bits in the na_iattr.ia_valid field). + * + * Returns nfs_ok on success, otherwise an NFS status code is + * returned. Caller must release @fhp by calling fh_put in either + * case. */ __be32 -nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap, - int check_guard, time_t guardtime) +nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, + struct nfsd_attrs *attr, const struct timespec64 *guardtime) { struct dentry *dentry; struct inode *inode; + struct iattr *iap = attr->na_iattr; int accmode = NFSD_MAY_SATTR; umode_t ftype = 0; __be32 err; - int host_err; + int host_err = 0; bool get_write_count; bool size_change = (iap->ia_valid & ATTR_SIZE); + int retries; + + trace_nfsd_vfs_setattr(rqstp, fhp, iap, guardtime); if (iap->ia_valid & ATTR_SIZE) { accmode |= NFSD_MAY_WRITE|NFSD_MAY_OWNER_OVERRIDE; @@ -404,7 +523,7 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap, /* * If utimes(2) and friends are called with times not NULL, we should * not set NFSD_MAY_WRITE bit. Otherwise fh_verify->nfsd_permission - * will return EACCESS, when the caller's effective UID does not match + * will return EACCES, when the caller's effective UID does not match * the owner of the file, and the caller is not privileged. In this * situation, we should return EPERM(notify_change will return this). 
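The __nfsd_setattr() helper introduced in this hunk submits a size change as its own notify_change() pass, carrying ATTR_CTIME and ATTR_MTIME as RFC 5661 Section 18.30.4 implies, and then skips the follow-up pass when only the mtime would remain. A simplified control-flow model of that decision, assuming local M_* bit macros in place of the kernel's ATTR_* flags and a setattr_passes() name invented for the sketch:

#include <stdio.h>

/* Local stand-ins for the iattr ia_valid bits used in the hunk. */
#define M_SIZE  0x1u
#define M_MTIME 0x2u
#define M_MODE  0x4u

/* Returns how many notify_change()-style passes the model would make. */
static int setattr_passes(unsigned int valid)
{
	int passes = 0;

	if (valid & M_SIZE) {
		passes++;               /* size + implied ctime/mtime */
		valid &= ~M_SIZE;
		if ((valid & ~M_MTIME) == 0)
			return passes;  /* mtime already folded into pass 1 */
	}
	if (valid)
		passes++;               /* everything else */
	return passes;
}

int main(void)
{
	printf("%d\n", setattr_passes(M_SIZE | M_MTIME)); /* 1 */
	printf("%d\n", setattr_passes(M_SIZE | M_MODE));  /* 2 */
	printf("%d\n", setattr_passes(M_MODE));           /* 1 */
	return 0;
}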
*/ @@ -430,18 +549,8 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap, dentry = fhp->fh_dentry; inode = d_inode(dentry); - /* Ignore any mode updates on symlinks */ - if (S_ISLNK(inode->i_mode)) - iap->ia_valid &= ~ATTR_MODE; - - if (!iap->ia_valid) - return 0; - nfsd_sanitize_attrs(inode, iap); - if (check_guard && guardtime != inode->i_ctime.tv_sec) - return nfserr_notsync; - /* * The size case is special, it changes the file in addition to the * attributes, and file systems don't expect it to be mixed with @@ -455,45 +564,64 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap, return err; } - fh_lock(fhp); - if (size_change) { - /* - * RFC5661, Section 18.30.4: - * Changing the size of a file with SETATTR indirectly - * changes the time_modify and change attributes. - * - * (and similar for the older RFCs) - */ - struct iattr size_attr = { - .ia_valid = ATTR_SIZE | ATTR_CTIME | ATTR_MTIME, - .ia_size = iap->ia_size, - }; + inode_lock(inode); + err = fh_fill_pre_attrs(fhp); + if (err) + goto out_unlock; + + if (guardtime) { + struct timespec64 ctime = inode_get_ctime(inode); + if ((u32)guardtime->tv_sec != (u32)ctime.tv_sec || + guardtime->tv_nsec != ctime.tv_nsec) { + err = nfserr_notsync; + goto out_fill_attrs; + } + } - host_err = notify_change(dentry, &size_attr, NULL); - if (host_err) - goto out_unlock; - iap->ia_valid &= ~ATTR_SIZE; + for (retries = 1;;) { + struct iattr attrs; /* - * Avoid the additional setattr call below if the only other - * attribute that the client sends is the mtime, as we update - * it as part of the size change above. + * notify_change() can alter its iattr argument, making + * @iap unsuitable for submission multiple times. Make a + * copy for every loop iteration. */ - if ((iap->ia_valid & ~ATTR_MTIME) == 0) - goto out_unlock; + attrs = *iap; + host_err = __nfsd_setattr(dentry, &attrs); + if (host_err != -EAGAIN || !retries--) + break; + if (!nfsd_wait_for_delegreturn(rqstp, inode)) + break; } - - iap->ia_valid |= ATTR_CTIME; - host_err = notify_change(dentry, iap, NULL); - + if (attr->na_seclabel && attr->na_seclabel->len) + attr->na_labelerr = security_inode_setsecctx(dentry, + attr->na_seclabel->data, attr->na_seclabel->len); + if (IS_ENABLED(CONFIG_FS_POSIX_ACL) && attr->na_pacl) + attr->na_aclerr = set_posix_acl(&nop_mnt_idmap, + dentry, ACL_TYPE_ACCESS, + attr->na_pacl); + if (IS_ENABLED(CONFIG_FS_POSIX_ACL) && + !attr->na_aclerr && attr->na_dpacl && S_ISDIR(inode->i_mode)) + attr->na_aclerr = set_posix_acl(&nop_mnt_idmap, + dentry, ACL_TYPE_DEFAULT, + attr->na_dpacl); +out_fill_attrs: + /* + * RFC 1813 Section 3.3.2 does not mandate that an NFS server + * returns wcc_data for SETATTR. Some client implementations + * depend on receiving wcc_data, however, to sort out partial + * updates (eg., the client requested that size and mode be + * modified, but the server changed only the file mode). + */ + fh_fill_post_attrs(fhp); out_unlock: - fh_unlock(fhp); + inode_unlock(inode); if (size_change) put_write_access(inode); out: if (!host_err) host_err = commit_metadata(fhp); - return nfserrno(host_err); + return err != 0 ? 
err : nfserrno(host_err); } #if defined(CONFIG_NFSD_V4) @@ -520,51 +648,68 @@ int nfsd4_is_junction(struct dentry *dentry) return 0; if (!(inode->i_mode & S_ISVTX)) return 0; - if (vfs_getxattr(dentry, NFSD_JUNCTION_XATTR_NAME, NULL, 0) <= 0) + if (vfs_getxattr(&nop_mnt_idmap, dentry, NFSD_JUNCTION_XATTR_NAME, + NULL, 0) <= 0) return 0; return 1; } -#ifdef CONFIG_NFSD_V4_SECURITY_LABEL -__be32 nfsd4_set_nfs4_label(struct svc_rqst *rqstp, struct svc_fh *fhp, - struct xdr_netobj *label) -{ - __be32 error; - int host_error; - struct dentry *dentry; - - error = fh_verify(rqstp, fhp, 0 /* S_IFREG */, NFSD_MAY_SATTR); - if (error) - return error; - - dentry = fhp->fh_dentry; - inode_lock(d_inode(dentry)); - host_error = security_inode_setsecctx(dentry, label->data, label->len); - inode_unlock(d_inode(dentry)); - return nfserrno(host_error); -} -#else -__be32 nfsd4_set_nfs4_label(struct svc_rqst *rqstp, struct svc_fh *fhp, - struct xdr_netobj *label) +static struct nfsd4_compound_state *nfsd4_get_cstate(struct svc_rqst *rqstp) { - return nfserr_notsupp; + return &((struct nfsd4_compoundres *)rqstp->rq_resp)->cstate; } -#endif -__be32 nfsd4_clone_file_range(struct file *src, u64 src_pos, struct file *dst, - u64 dst_pos, u64 count) +__be32 nfsd4_clone_file_range(struct svc_rqst *rqstp, + struct nfsd_file *nf_src, u64 src_pos, + struct nfsd_file *nf_dst, u64 dst_pos, + u64 count, bool sync) { + struct file *src = nf_src->nf_file; + struct file *dst = nf_dst->nf_file; + errseq_t since; loff_t cloned; + __be32 ret = 0; + since = READ_ONCE(dst->f_wb_err); cloned = vfs_clone_file_range(src, src_pos, dst, dst_pos, count, 0); - if (count && cloned != count) - cloned = -EINVAL; - return nfserrno(cloned < 0 ? cloned : 0); + if (cloned < 0) { + ret = nfserrno(cloned); + goto out_err; + } + if (count && cloned != count) { + ret = nfserrno(-EINVAL); + goto out_err; + } + if (sync) { + loff_t dst_end = count ? dst_pos + count - 1 : LLONG_MAX; + int status = vfs_fsync_range(dst, dst_pos, dst_end, 0); + + if (!status) + status = filemap_check_wb_err(dst->f_mapping, since); + if (!status) + status = commit_inode_metadata(file_inode(src)); + if (status < 0) { + struct nfsd_net *nn = net_generic(nf_dst->nf_net, + nfsd_net_id); + + trace_nfsd_clone_file_range_err(rqstp, + &nfsd4_get_cstate(rqstp)->save_fh, + src_pos, + &nfsd4_get_cstate(rqstp)->current_fh, + dst_pos, + count, status); + commit_reset_write_verifier(nn, rqstp, status); + ret = nfserrno(status); + } + } +out_err: + return ret; } ssize_t nfsd_copy_file_range(struct file *src, u64 src_pos, struct file *dst, u64 dst_pos, u64 count) { + ssize_t ret; /* * Limit copy to 4MB to prevent indefinitely blocking an nfsd @@ -575,7 +720,12 @@ ssize_t nfsd_copy_file_range(struct file *src, u64 src_pos, struct file *dst, * limit like this and pipeline multiple COPY requests. 
*/ count = min_t(u64, count, 1 << 22); - return vfs_copy_file_range(src, src_pos, dst, dst_pos, count, 0); + ret = vfs_copy_file_range(src, src_pos, dst, dst_pos, count, 0); + + if (ret == -EOPNOTSUPP || ret == -EXDEV) + ret = vfs_copy_file_range(src, src_pos, dst, dst_pos, count, + COPY_FILE_SPLICE); + return ret; } __be32 nfsd4_vfs_fallocate(struct svc_rqst *rqstp, struct svc_fh *fhp, @@ -595,7 +745,6 @@ __be32 nfsd4_vfs_fallocate(struct svc_rqst *rqstp, struct svc_fh *fhp, } #endif /* defined(CONFIG_NFSD_V4) */ -#ifdef CONFIG_NFSD_V3 /* * Check server access rights to a file system object */ @@ -609,6 +758,12 @@ static struct accessmap nfs3_regaccess[] = { { NFS3_ACCESS_MODIFY, NFSD_MAY_WRITE|NFSD_MAY_TRUNC }, { NFS3_ACCESS_EXTEND, NFSD_MAY_WRITE }, +#ifdef CONFIG_NFSD_V4 + { NFS4_ACCESS_XAREAD, NFSD_MAY_READ }, + { NFS4_ACCESS_XAWRITE, NFSD_MAY_WRITE }, + { NFS4_ACCESS_XALIST, NFSD_MAY_READ }, +#endif + { 0, 0 } }; @@ -619,6 +774,12 @@ static struct accessmap nfs3_diraccess[] = { { NFS3_ACCESS_EXTEND, NFSD_MAY_EXEC|NFSD_MAY_WRITE }, { NFS3_ACCESS_DELETE, NFSD_MAY_REMOVE }, +#ifdef CONFIG_NFSD_V4 + { NFS4_ACCESS_XAREAD, NFSD_MAY_READ }, + { NFS4_ACCESS_XAWRITE, NFSD_MAY_WRITE }, + { NFS4_ACCESS_XALIST, NFSD_MAY_READ }, +#endif + { 0, 0 } }; @@ -669,7 +830,8 @@ nfsd_access(struct svc_rqst *rqstp, struct svc_fh *fhp, u32 *access, u32 *suppor sresult |= map->access; - err2 = nfsd_permission(rqstp, export, dentry, map->how); + err2 = nfsd_permission(&rqstp->rq_cred, export, + dentry, map->how); switch (err2) { case nfs_ok: result |= map->access; @@ -695,9 +857,8 @@ nfsd_access(struct svc_rqst *rqstp, struct svc_fh *fhp, u32 *access, u32 *suppor out: return error; } -#endif /* CONFIG_NFSD_V3 */ -static int nfsd_open_break_lease(struct inode *inode, int access) +int nfsd_open_break_lease(struct inode *inode, int access) { unsigned int mode; @@ -713,60 +874,28 @@ static int nfsd_open_break_lease(struct inode *inode, int access) * and additional flags. * N.B. After this call fhp needs an fh_put */ -__be32 -nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, - int may_flags, struct file **filp) +static int +__nfsd_open(struct svc_fh *fhp, umode_t type, int may_flags, struct file **filp) { struct path path; struct inode *inode; struct file *file; int flags = O_RDONLY|O_LARGEFILE; - __be32 err; - int host_err = 0; - - validate_process_creds(); - - /* - * If we get here, then the client has already done an "open", - * and (hopefully) checked permission - so allow OWNER_OVERRIDE - * in case a chmod has now revoked permission. - * - * Arguably we should also allow the owner override for - * directories, but we never have and it doesn't seem to have - * caused anyone a problem. If we were to change this, note - * also that our filldir callbacks would need a variant of - * lookup_one_len that doesn't check permissions. - */ - if (type == S_IFREG) - may_flags |= NFSD_MAY_OWNER_OVERRIDE; - err = fh_verify(rqstp, fhp, type, may_flags); - if (err) - goto out; + int host_err = -EPERM; path.mnt = fhp->fh_export->ex_path.mnt; path.dentry = fhp->fh_dentry; inode = d_inode(path.dentry); - /* Disallow write access to files with the append-only bit set - * or any access when mandatory locking enabled - */ - err = nfserr_perm; if (IS_APPEND(inode) && (may_flags & NFSD_MAY_WRITE)) goto out; - /* - * We must ignore files (but only files) which might have mandatory - * locks on them because there is no way to know if the accesser has - * the lock. 
- */ - if (S_ISREG((inode)->i_mode) && mandatory_lock(inode)) - goto out; if (!inode->i_fop) goto out; host_err = nfsd_open_break_lease(inode, may_flags); if (host_err) /* NOMEM or WOULDBLOCK */ - goto out_nfserr; + goto out; if (may_flags & NFSD_MAY_WRITE) { if (may_flags & NFSD_MAY_READ) @@ -778,119 +907,105 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, file = dentry_open(&path, flags, current_cred()); if (IS_ERR(file)) { host_err = PTR_ERR(file); - goto out_nfserr; + goto out; } - host_err = ima_file_check(file, may_flags); + host_err = security_file_post_open(file, may_flags); if (host_err) { fput(file); - goto out_nfserr; + goto out; } - if (may_flags & NFSD_MAY_64BIT_COOKIE) - file->f_mode |= FMODE_64BITHASH; - else - file->f_mode |= FMODE_32BITHASH; - *filp = file; -out_nfserr: - err = nfserrno(host_err); out: - validate_process_creds(); + return host_err; +} + +__be32 +nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, + int may_flags, struct file **filp) +{ + __be32 err; + int host_err; + bool retried = false; + + /* + * If we get here, then the client has already done an "open", + * and (hopefully) checked permission - so allow OWNER_OVERRIDE + * in case a chmod has now revoked permission. + * + * Arguably we should also allow the owner override for + * directories, but we never have and it doesn't seem to have + * caused anyone a problem. If we were to change this, note + * also that our filldir callbacks would need a variant of + * lookup_one_positive_unlocked() that doesn't check permissions. + */ + if (type == S_IFREG) + may_flags |= NFSD_MAY_OWNER_OVERRIDE; +retry: + err = fh_verify(rqstp, fhp, type, may_flags); + if (!err) { + host_err = __nfsd_open(fhp, type, may_flags, filp); + if (host_err == -EOPENSTALE && !retried) { + retried = true; + fh_put(fhp); + goto retry; + } + err = nfserrno(host_err); + } return err; } -struct raparms * -nfsd_init_raparms(struct file *file) +/** + * nfsd_open_verified - Open a regular file for the filecache + * @fhp: NFS filehandle of the file to open + * @type: S_IFMT inode type allowed (0 means any type is allowed) + * @may_flags: internal permission flags + * @filp: OUT: open "struct file *" + * + * Returns zero on success, or a negative errno value. 
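The reworked nfsd_open() above retries fh_verify() plus __nfsd_open() exactly once when the open fails with -EOPENSTALE, dropping the stale filehandle in between. A minimal sketch of that retry-once shape, where stub_verify(), stub_open(), and the MODEL_EOPENSTALE value are stand-ins invented for the example:

#include <stdio.h>

#define MODEL_EOPENSTALE (-129)  /* stand-in for the kernel's internal -EOPENSTALE */

/* Stubs: the first open attempt hits a stale handle, the second succeeds. */
static int stub_verify(void) { return 0; }
static int stub_open(int attempt) { return attempt == 0 ? MODEL_EOPENSTALE : 0; }

static int model_open(void)
{
	int attempt = 0, err;

retry:
	err = stub_verify();
	if (!err) {
		err = stub_open(attempt);
		if (err == MODEL_EOPENSTALE && attempt == 0) {
			attempt++;       /* kernel: fh_put(fhp), then retry once */
			goto retry;
		}
	}
	return err;
}

int main(void)
{
	printf("open result: %d\n", model_open()); /* 0 after one retry */
	return 0;
}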
+ */ +int +nfsd_open_verified(struct svc_fh *fhp, umode_t type, int may_flags, struct file **filp) { - struct inode *inode = file_inode(file); - dev_t dev = inode->i_sb->s_dev; - ino_t ino = inode->i_ino; - struct raparms *ra, **rap, **frap = NULL; - int depth = 0; - unsigned int hash; - struct raparm_hbucket *rab; - - hash = jhash_2words(dev, ino, 0xfeedbeef) & RAPARM_HASH_MASK; - rab = &raparm_hash[hash]; - - spin_lock(&rab->pb_lock); - for (rap = &rab->pb_head; (ra = *rap); rap = &ra->p_next) { - if (ra->p_ino == ino && ra->p_dev == dev) - goto found; - depth++; - if (ra->p_count == 0) - frap = rap; - } - depth = nfsdstats.ra_size; - if (!frap) { - spin_unlock(&rab->pb_lock); - return NULL; - } - rap = frap; - ra = *frap; - ra->p_dev = dev; - ra->p_ino = ino; - ra->p_set = 0; - ra->p_hindex = hash; -found: - if (rap != &rab->pb_head) { - *rap = ra->p_next; - ra->p_next = rab->pb_head; - rab->pb_head = ra; - } - ra->p_count++; - nfsdstats.ra_depth[depth*10/nfsdstats.ra_size]++; - spin_unlock(&rab->pb_lock); - - if (ra->p_set) - file->f_ra = ra->p_ra; - return ra; -} - -void nfsd_put_raparams(struct file *file, struct raparms *ra) -{ - struct raparm_hbucket *rab = &raparm_hash[ra->p_hindex]; - - spin_lock(&rab->pb_lock); - ra->p_ra = file->f_ra; - ra->p_set = 1; - ra->p_count--; - spin_unlock(&rab->pb_lock); + return __nfsd_open(fhp, type, may_flags, filp); } /* * Grab and keep cached pages associated with a file in the svc_rqst - * so that they can be passed to the network sendmsg/sendpage routines + * so that they can be passed to the network sendmsg routines * directly. They will be released after the sending has completed. + * + * Return values: Number of bytes consumed, or -EIO if there are no + * remaining pages in rqstp->rq_pages. */ static int nfsd_splice_actor(struct pipe_inode_info *pipe, struct pipe_buffer *buf, struct splice_desc *sd) { struct svc_rqst *rqstp = sd->u.data; - struct page **pp = rqstp->rq_next_page; - struct page *page = buf->page; - size_t size; - - size = sd->len; - - if (rqstp->rq_res.page_len == 0) { - get_page(page); - put_page(*rqstp->rq_next_page); - *(rqstp->rq_next_page++) = page; - rqstp->rq_res.page_base = buf->offset; - rqstp->rq_res.page_len = size; - } else if (page != pp[-1]) { - get_page(page); - if (*rqstp->rq_next_page) - put_page(*rqstp->rq_next_page); - *(rqstp->rq_next_page++) = page; - rqstp->rq_res.page_len += size; - } else - rqstp->rq_res.page_len += size; + struct page *page = buf->page; // may be a compound one + unsigned offset = buf->offset; + struct page *last_page; - return size; + last_page = page + (offset + sd->len - 1) / PAGE_SIZE; + for (page += offset / PAGE_SIZE; page <= last_page; page++) { + /* + * Skip page replacement when extending the contents of the + * current page. But note that we may get two zero_pages in a + * row from shmem. 
+ */ + if (page == *(rqstp->rq_next_page - 1) && + offset_in_page(rqstp->rq_res.page_base + + rqstp->rq_res.page_len)) + continue; + if (unlikely(!svc_rqst_replace_page(rqstp, page))) + return -EIO; + } + if (rqstp->rq_res.page_len == 0) // first call + rqstp->rq_res.page_base = offset % PAGE_SIZE; + rqstp->rq_res.page_len += sd->len; + return sd->len; } static int nfsd_direct_splice_actor(struct pipe_inode_info *pipe, @@ -899,12 +1014,25 @@ static int nfsd_direct_splice_actor(struct pipe_inode_info *pipe, return __splice_from_pipe(pipe, sd, nfsd_splice_actor); } +static u32 nfsd_eof_on_read(struct file *file, loff_t offset, ssize_t len, + size_t expected) +{ + if (expected != 0 && len == 0) + return 1; + if (offset+len >= i_size_read(file_inode(file))) + return 1; + return 0; +} + static __be32 nfsd_finish_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, loff_t offset, - unsigned long *count, int host_err) + unsigned long *count, u32 *eof, ssize_t host_err) { if (host_err >= 0) { - nfsdstats.io_read += host_err; + struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); + + nfsd_stats_io_read_add(nn, fhp->fh_export, host_err); + *eof = nfsd_eof_on_read(file, offset, host_err, *count); *count = host_err; fsnotify_access(file); trace_nfsd_read_io_done(rqstp, fhp, offset, *count); @@ -915,8 +1043,21 @@ static __be32 nfsd_finish_read(struct svc_rqst *rqstp, struct svc_fh *fhp, } } +/** + * nfsd_splice_read - Perform a VFS read using a splice pipe + * @rqstp: RPC transaction context + * @fhp: file handle of file to be read + * @file: opened struct file of file to be read + * @offset: starting byte offset + * @count: IN: requested number of bytes; OUT: number of bytes read + * @eof: OUT: set non-zero if operation reached the end of the file + * + * Returns nfs_ok on success, otherwise an nfserr stat value is + * returned. + */ __be32 nfsd_splice_read(struct svc_rqst *rqstp, struct svc_fh *fhp, - struct file *file, loff_t offset, unsigned long *count) + struct file *file, loff_t offset, unsigned long *count, + u32 *eof) { struct splice_desc sd = { .len = 0, @@ -924,25 +1065,157 @@ __be32 nfsd_splice_read(struct svc_rqst *rqstp, struct svc_fh *fhp, .pos = offset, .u.data = rqstp, }; - int host_err; + ssize_t host_err; trace_nfsd_read_splice(rqstp, fhp, offset, *count); - rqstp->rq_next_page = rqstp->rq_respages + 1; - host_err = splice_direct_to_actor(file, &sd, nfsd_direct_splice_actor); - return nfsd_finish_read(rqstp, fhp, file, offset, count, host_err); + host_err = rw_verify_area(READ, file, &offset, *count); + if (!host_err) + host_err = splice_direct_to_actor(file, &sd, + nfsd_direct_splice_actor); + return nfsd_finish_read(rqstp, fhp, file, offset, count, eof, host_err); } -__be32 nfsd_readv(struct svc_rqst *rqstp, struct svc_fh *fhp, - struct file *file, loff_t offset, - struct kvec *vec, int vlen, unsigned long *count) +/* + * The byte range of the client's READ request is expanded on both ends + * until it meets the underlying file system's direct I/O alignment + * requirements. After the internal read is complete, the byte range of + * the NFS READ payload is reduced to the byte range that was originally + * requested. + * + * Note that a direct read can be done only when the xdr_buf containing + * the NFS READ reply does not already have contents in its .pages array. + * This is due to potentially restrictive alignment requirements on the + * read buffer. When .page_len and @base are zero, the .pages array is + * guaranteed to be page-aligned. 
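nfsd_finish_read() now reports EOF through the new nfsd_eof_on_read() predicate: a zero-length result for a non-zero request, or a read that reaches the end of the file, sets the eof flag. The same check as a standalone sketch, with the inode size passed in directly instead of read via i_size_read():

#include <stdio.h>

/* Mirrors the nfsd_eof_on_read() logic, with the file size as a plain argument. */
static unsigned int model_eof_on_read(long long offset, long long len,
				      unsigned long expected, long long isize)
{
	if (expected != 0 && len == 0)
		return 1;               /* asked for bytes, got none */
	if (offset + len >= isize)
		return 1;               /* read reached (or passed) EOF */
	return 0;
}

int main(void)
{
	printf("%u\n", model_eof_on_read(0, 4096, 4096, 8192));    /* 0 */
	printf("%u\n", model_eof_on_read(4096, 4096, 4096, 8192)); /* 1 */
	printf("%u\n", model_eof_on_read(8192, 0, 4096, 8192));    /* 1 */
	return 0;
}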
+ */ +static noinline_for_stack __be32 +nfsd_direct_read(struct svc_rqst *rqstp, struct svc_fh *fhp, + struct nfsd_file *nf, loff_t offset, unsigned long *count, + u32 *eof) { + u64 dio_start, dio_end; + unsigned long v, total; struct iov_iter iter; - int host_err; + struct kiocb kiocb; + ssize_t host_err; + size_t len; + + init_sync_kiocb(&kiocb, nf->nf_file); + kiocb.ki_flags |= IOCB_DIRECT; + + /* Read a properly-aligned region of bytes into rq_bvec */ + dio_start = round_down(offset, nf->nf_dio_read_offset_align); + dio_end = round_up((u64)offset + *count, nf->nf_dio_read_offset_align); + + kiocb.ki_pos = dio_start; + + v = 0; + total = dio_end - dio_start; + while (total && v < rqstp->rq_maxpages && + rqstp->rq_next_page < rqstp->rq_page_end) { + len = min_t(size_t, total, PAGE_SIZE); + bvec_set_page(&rqstp->rq_bvec[v], *rqstp->rq_next_page, + len, 0); + + total -= len; + ++rqstp->rq_next_page; + ++v; + } + + trace_nfsd_read_direct(rqstp, fhp, offset, *count - total); + iov_iter_bvec(&iter, ITER_DEST, rqstp->rq_bvec, v, + dio_end - dio_start - total); + + host_err = vfs_iocb_iter_read(nf->nf_file, &kiocb, &iter); + if (host_err >= 0) { + unsigned int pad = offset - dio_start; + + /* The returned payload starts after the pad */ + rqstp->rq_res.page_base = pad; + + /* Compute the count of bytes to be returned */ + if (host_err > pad + *count) + host_err = *count; + else if (host_err > pad) + host_err -= pad; + else + host_err = 0; + } else if (unlikely(host_err == -EINVAL)) { + struct inode *inode = d_inode(fhp->fh_dentry); + + pr_info_ratelimited("nfsd: Direct I/O alignment failure on %s/%ld\n", + inode->i_sb->s_id, inode->i_ino); + host_err = -ESERVERFAULT; + } - trace_nfsd_read_vector(rqstp, fhp, offset, *count); - iov_iter_kvec(&iter, READ, vec, vlen, *count); - host_err = vfs_iter_read(file, &iter, &offset, 0); - return nfsd_finish_read(rqstp, fhp, file, offset, count, host_err); + return nfsd_finish_read(rqstp, fhp, nf->nf_file, offset, count, + eof, host_err); +} + +/** + * nfsd_iter_read - Perform a VFS read using an iterator + * @rqstp: RPC transaction context + * @fhp: file handle of file to be read + * @nf: opened struct nfsd_file of file to be read + * @offset: starting byte offset + * @count: IN: requested number of bytes; OUT: number of bytes read + * @base: offset in first page of read buffer + * @eof: OUT: set non-zero if operation reached the end of the file + * + * Some filesystems or situations cannot use nfsd_splice_read. This + * function is the slightly less-performant fallback for those cases. + * + * Returns nfs_ok on success, otherwise an nfserr stat value is + * returned. 
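nfsd_direct_read() above widens the client's (offset, count) range to the file system's reported DIO offset alignment, reads from the rounded-down start, and afterwards drops the leading pad and clamps the byte count returned to the client. The arithmetic in isolation, using local power-of-two rounding helpers equivalent to round_down()/round_up() and hypothetical offset, count, and alignment values:

#include <stdio.h>

typedef unsigned long long u64;

/* Power-of-two rounding helpers equivalent to the kernel macros used here. */
static u64 rdown(u64 x, u64 a) { return x & ~(a - 1); }
static u64 rup(u64 x, u64 a)   { return (x + a - 1) & ~(a - 1); }

int main(void)
{
	u64 offset = 3000, count = 10000, dio_align = 4096;
	u64 dio_start = rdown(offset, dio_align);
	u64 dio_end   = rup(offset + count, dio_align);
	u64 got       = dio_end - dio_start;  /* assume the aligned read completed in full */
	u64 pad       = offset - dio_start;

	/* Trim the reply the same way the hunk does after vfs_iocb_iter_read(). */
	if (got > pad + count)
		got = count;
	else if (got > pad)
		got -= pad;
	else
		got = 0;

	printf("aligned read [%llu, %llu), pad %llu, returned count %llu\n",
	       dio_start, dio_end, pad, got);
	return 0;
}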
+ */ +__be32 nfsd_iter_read(struct svc_rqst *rqstp, struct svc_fh *fhp, + struct nfsd_file *nf, loff_t offset, unsigned long *count, + unsigned int base, u32 *eof) +{ + struct file *file = nf->nf_file; + unsigned long v, total; + struct iov_iter iter; + struct kiocb kiocb; + ssize_t host_err; + size_t len; + + init_sync_kiocb(&kiocb, file); + + switch (nfsd_io_cache_read) { + case NFSD_IO_BUFFERED: + break; + case NFSD_IO_DIRECT: + /* When dio_read_offset_align is zero, dio is not supported */ + if (nf->nf_dio_read_offset_align && !rqstp->rq_res.page_len) + return nfsd_direct_read(rqstp, fhp, nf, offset, + count, eof); + fallthrough; + case NFSD_IO_DONTCACHE: + if (file->f_op->fop_flags & FOP_DONTCACHE) + kiocb.ki_flags = IOCB_DONTCACHE; + break; + } + + kiocb.ki_pos = offset; + + v = 0; + total = *count; + while (total && v < rqstp->rq_maxpages && + rqstp->rq_next_page < rqstp->rq_page_end) { + len = min_t(size_t, total, PAGE_SIZE - base); + bvec_set_page(&rqstp->rq_bvec[v], *rqstp->rq_next_page, + len, base); + + total -= len; + ++rqstp->rq_next_page; + ++v; + base = 0; + } + + trace_nfsd_read_vector(rqstp, fhp, offset, *count - total); + iov_iter_bvec(&iter, ITER_DEST, rqstp->rq_bvec, v, *count - total); + host_err = vfs_iocb_iter_read(file, &kiocb, &iter); + return nfsd_finish_read(rqstp, fhp, file, offset, count, eof, host_err); } /* @@ -973,7 +1246,7 @@ static int wait_for_concurrent_writes(struct file *file) dprintk("nfsd: write resume %d\n", task_pid_nr(current)); } - if (inode->i_state & I_DIRTY) { + if (inode_state_read_once(inode) & I_DIRTY) { dprintk("nfsd: write sync %d\n", task_pid_nr(current)); err = vfs_fsync(file, 0); } @@ -982,49 +1255,247 @@ static int wait_for_concurrent_writes(struct file *file) return err; } +struct nfsd_write_dio_seg { + struct iov_iter iter; + int flags; +}; + +static unsigned long +iov_iter_bvec_offset(const struct iov_iter *iter) +{ + return (unsigned long)(iter->bvec->bv_offset + iter->iov_offset); +} + +static void +nfsd_write_dio_seg_init(struct nfsd_write_dio_seg *segment, + struct bio_vec *bvec, unsigned int nvecs, + unsigned long total, size_t start, size_t len, + struct kiocb *iocb) +{ + iov_iter_bvec(&segment->iter, ITER_SOURCE, bvec, nvecs, total); + if (start) + iov_iter_advance(&segment->iter, start); + iov_iter_truncate(&segment->iter, len); + segment->flags = iocb->ki_flags; +} + +static unsigned int +nfsd_write_dio_iters_init(struct nfsd_file *nf, struct bio_vec *bvec, + unsigned int nvecs, struct kiocb *iocb, + unsigned long total, + struct nfsd_write_dio_seg segments[3]) +{ + u32 offset_align = nf->nf_dio_offset_align; + loff_t prefix_end, orig_end, middle_end; + u32 mem_align = nf->nf_dio_mem_align; + size_t prefix, middle, suffix; + loff_t offset = iocb->ki_pos; + unsigned int nsegs = 0; + + /* + * Check if direct I/O is feasible for this write request. + * If alignments are not available, the write is too small, + * or no alignment can be found, fall back to buffered I/O. 
+ */ + if (unlikely(!mem_align || !offset_align) || + unlikely(total < max(offset_align, mem_align))) + goto no_dio; + + prefix_end = round_up(offset, offset_align); + orig_end = offset + total; + middle_end = round_down(orig_end, offset_align); + + prefix = prefix_end - offset; + middle = middle_end - prefix_end; + suffix = orig_end - middle_end; + + if (!middle) + goto no_dio; + + if (prefix) + nfsd_write_dio_seg_init(&segments[nsegs++], bvec, + nvecs, total, 0, prefix, iocb); + + nfsd_write_dio_seg_init(&segments[nsegs], bvec, nvecs, + total, prefix, middle, iocb); + + /* + * Check if the bvec iterator is aligned for direct I/O. + * + * bvecs generated from RPC receive buffers are contiguous: After + * the first bvec, all subsequent bvecs start at bv_offset zero + * (page-aligned). Therefore, only the first bvec is checked. + */ + if (iov_iter_bvec_offset(&segments[nsegs].iter) & (mem_align - 1)) + goto no_dio; + segments[nsegs].flags |= IOCB_DIRECT; + nsegs++; + + if (suffix) + nfsd_write_dio_seg_init(&segments[nsegs++], bvec, nvecs, total, + prefix + middle, suffix, iocb); + + return nsegs; + +no_dio: + /* No DIO alignment possible - pack into single non-DIO segment. */ + nfsd_write_dio_seg_init(&segments[0], bvec, nvecs, total, 0, + total, iocb); + return 1; +} + +static noinline_for_stack int +nfsd_direct_write(struct svc_rqst *rqstp, struct svc_fh *fhp, + struct nfsd_file *nf, unsigned int nvecs, + unsigned long *cnt, struct kiocb *kiocb) +{ + struct nfsd_write_dio_seg segments[3]; + struct file *file = nf->nf_file; + unsigned int nsegs, i; + ssize_t host_err; + + nsegs = nfsd_write_dio_iters_init(nf, rqstp->rq_bvec, nvecs, + kiocb, *cnt, segments); + + *cnt = 0; + for (i = 0; i < nsegs; i++) { + kiocb->ki_flags = segments[i].flags; + if (kiocb->ki_flags & IOCB_DIRECT) + trace_nfsd_write_direct(rqstp, fhp, kiocb->ki_pos, + segments[i].iter.count); + else { + trace_nfsd_write_vector(rqstp, fhp, kiocb->ki_pos, + segments[i].iter.count); + /* + * Mark the I/O buffer as evict-able to reduce + * memory contention. + */ + if (nf->nf_file->f_op->fop_flags & FOP_DONTCACHE) + kiocb->ki_flags |= IOCB_DONTCACHE; + } + + host_err = vfs_iocb_iter_write(file, kiocb, &segments[i].iter); + if (host_err < 0) + return host_err; + *cnt += host_err; + if (host_err < segments[i].iter.count) + break; /* partial write */ + } + + return 0; +} + +/** + * nfsd_vfs_write - write data to an already-open file + * @rqstp: RPC execution context + * @fhp: File handle of file to write into + * @nf: An open file matching @fhp + * @offset: Byte offset of start + * @payload: xdr_buf containing the write payload + * @cnt: IN: number of bytes to write, OUT: number of bytes actually written + * @stable: An NFS stable_how value + * @verf: NFS WRITE verifier + * + * Upon return, caller must invoke fh_put on @fhp. + * + * Return values: + * An nfsstat value in network byte order. 
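nfsd_write_dio_iters_init() above carves a WRITE payload into at most three segments: an unaligned prefix, an offset-aligned middle eligible for IOCB_DIRECT, and an unaligned suffix, falling back to a single buffered segment when no aligned middle exists. A sketch of just that segment arithmetic, again with local rounding helpers and made-up offset, length, and alignment values:

#include <stdio.h>

typedef unsigned long long u64;

static u64 rdown(u64 x, u64 a) { return x & ~(a - 1); }
static u64 rup(u64 x, u64 a)   { return (x + a - 1) & ~(a - 1); }

static void split_write(u64 offset, u64 total, u64 offset_align)
{
	u64 prefix_end, orig_end, middle_end, prefix, middle, suffix;

	/* The hunk bails out to buffered I/O for short or unalignable writes. */
	if (!offset_align || total < offset_align) {
		printf("no DIO: one buffered segment of %llu bytes\n", total);
		return;
	}

	prefix_end = rup(offset, offset_align);
	orig_end   = offset + total;
	middle_end = rdown(orig_end, offset_align);
	prefix = prefix_end - offset;
	middle = middle_end - prefix_end;
	suffix = orig_end - middle_end;

	if (!middle) {
		printf("no DIO: one buffered segment of %llu bytes\n", total);
		return;
	}
	printf("prefix %llu (buffered), middle %llu (O_DIRECT), suffix %llu (buffered)\n",
	       prefix, middle, suffix);
}

int main(void)
{
	split_write(3000, 20000, 4096); /* 1096 / 16384 / 2520 */
	split_write(100, 1000, 4096);   /* too small, stays buffered */
	return 0;
}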
+ */ __be32 -nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, - loff_t offset, struct kvec *vec, int vlen, - unsigned long *cnt, int stable) +nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, + struct nfsd_file *nf, loff_t offset, + const struct xdr_buf *payload, unsigned long *cnt, + int stable, __be32 *verf) { + struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); + struct file *file = nf->nf_file; + struct super_block *sb = file_inode(file)->i_sb; + struct kiocb kiocb; struct svc_export *exp; struct iov_iter iter; + errseq_t since; __be32 nfserr; int host_err; - int use_wgather; - loff_t pos = offset; + unsigned long exp_op_flags = 0; unsigned int pflags = current->flags; - rwf_t flags = 0; + bool restore_flags = false; + unsigned int nvecs; trace_nfsd_write_opened(rqstp, fhp, offset, *cnt); - if (test_bit(RQ_LOCAL, &rqstp->rq_flags)) + if (sb->s_export_op) + exp_op_flags = sb->s_export_op->flags; + + if (test_bit(RQ_LOCAL, &rqstp->rq_flags) && + !(exp_op_flags & EXPORT_OP_REMOTE_FS)) { /* - * We want less throttling in balance_dirty_pages() - * and shrink_inactive_list() so that nfs to + * We want throttling in balance_dirty_pages() + * and shrink_inactive_list() to only consider + * the backingdev we are writing to, so that nfs to * localhost doesn't cause nfsd to lock up due to all * the client's dirty pages or its congested queue. */ - current->flags |= PF_LESS_THROTTLE; + current->flags |= PF_LOCAL_THROTTLE; + restore_flags = true; + } exp = fhp->fh_export; - use_wgather = (rqstp->rq_vers == 2) && EX_WGATHER(exp); if (!EX_ISSYNC(exp)) stable = NFS_UNSTABLE; + init_sync_kiocb(&kiocb, file); + kiocb.ki_pos = offset; + if (likely(!fhp->fh_use_wgather)) { + switch (stable) { + case NFS_FILE_SYNC: + /* persist data and timestamps */ + kiocb.ki_flags |= IOCB_DSYNC | IOCB_SYNC; + break; + case NFS_DATA_SYNC: + /* persist data only */ + kiocb.ki_flags |= IOCB_DSYNC; + break; + } + } - if (stable && !use_wgather) - flags |= RWF_SYNC; + nvecs = xdr_buf_to_bvec(rqstp->rq_bvec, rqstp->rq_maxpages, payload); - iov_iter_kvec(&iter, WRITE, vec, vlen, *cnt); - host_err = vfs_iter_write(file, &iter, &pos, flags); - if (host_err < 0) + since = READ_ONCE(file->f_wb_err); + if (verf) + nfsd_copy_write_verifier(verf, nn); + + switch (nfsd_io_cache_write) { + case NFSD_IO_DIRECT: + host_err = nfsd_direct_write(rqstp, fhp, nf, nvecs, + cnt, &kiocb); + break; + case NFSD_IO_DONTCACHE: + if (file->f_op->fop_flags & FOP_DONTCACHE) + kiocb.ki_flags |= IOCB_DONTCACHE; + fallthrough; + case NFSD_IO_BUFFERED: + iov_iter_bvec(&iter, ITER_SOURCE, rqstp->rq_bvec, nvecs, *cnt); + host_err = vfs_iocb_iter_write(file, &kiocb, &iter); + if (host_err < 0) + break; + *cnt = host_err; + break; + } + if (host_err < 0) { + commit_reset_write_verifier(nn, rqstp, host_err); goto out_nfserr; - nfsdstats.io_write += *cnt; + } + nfsd_stats_io_write_add(nn, exp, *cnt); fsnotify_modify(file); + host_err = filemap_check_wb_err(file->f_mapping, since); + if (host_err < 0) + goto out_nfserr; - if (stable && use_wgather) + if (stable && fhp->fh_use_wgather) { host_err = wait_for_concurrent_writes(file); + if (host_err < 0) + commit_reset_write_verifier(nn, rqstp, host_err); + } out_nfserr: if (host_err >= 0) { @@ -1034,122 +1505,207 @@ out_nfserr: trace_nfsd_write_err(rqstp, fhp, offset, host_err); nfserr = nfserrno(host_err); } - if (test_bit(RQ_LOCAL, &rqstp->rq_flags)) - current_restore_flags(pflags, PF_LESS_THROTTLE); + if (restore_flags) + current_restore_flags(pflags, 
PF_LOCAL_THROTTLE); return nfserr; } -/* - * Read data from a file. count must contain the requested read count - * on entry. On return, *count contains the number of bytes actually read. +/** + * nfsd_read_splice_ok - check if spliced reading is supported + * @rqstp: RPC transaction context + * + * Return values: + * %true: nfsd_splice_read() may be used + * %false: nfsd_splice_read() must not be used + * + * NFS READ normally uses splice to send data in-place. However the + * data in cache can change after the reply's MIC is computed but + * before the RPC reply is sent. To prevent the client from + * rejecting the server-computed MIC in this somewhat rare case, do + * not use splice with the GSS integrity and privacy services. + */ +bool nfsd_read_splice_ok(struct svc_rqst *rqstp) +{ + if (nfsd_disable_splice_read) + return false; + switch (svc_auth_flavor(rqstp)) { + case RPC_AUTH_GSS_KRB5I: + case RPC_AUTH_GSS_KRB5P: + return false; + } + return true; +} + +/** + * nfsd_read - Read data from a file + * @rqstp: RPC transaction context + * @fhp: file handle of file to be read + * @offset: starting byte offset + * @count: IN: requested number of bytes; OUT: number of bytes read + * @eof: OUT: set non-zero if operation reached the end of the file + * + * The caller must verify that there is enough space in @rqstp.rq_res + * to perform this operation. + * * N.B. After this call fhp needs an fh_put + * + * Returns nfs_ok on success, otherwise an nfserr stat value is + * returned. */ __be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp, - loff_t offset, struct kvec *vec, int vlen, unsigned long *count) + loff_t offset, unsigned long *count, u32 *eof) { + struct nfsd_file *nf; struct file *file; - struct raparms *ra; __be32 err; trace_nfsd_read_start(rqstp, fhp, offset, *count); - err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_READ, &file); + err = nfsd_file_acquire_gc(rqstp, fhp, NFSD_MAY_READ, &nf); if (err) return err; - ra = nfsd_init_raparms(file); - - if (file->f_op->splice_read && test_bit(RQ_SPLICE_OK, &rqstp->rq_flags)) - err = nfsd_splice_read(rqstp, fhp, file, offset, count); + file = nf->nf_file; + if (file->f_op->splice_read && nfsd_read_splice_ok(rqstp)) + err = nfsd_splice_read(rqstp, fhp, file, offset, count, eof); else - err = nfsd_readv(rqstp, fhp, file, offset, vec, vlen, count); - - if (ra) - nfsd_put_raparams(file, ra); - fput(file); + err = nfsd_iter_read(rqstp, fhp, nf, offset, count, 0, eof); + nfsd_file_put(nf); trace_nfsd_read_done(rqstp, fhp, offset, *count); - return err; } -/* - * Write data to a file. - * The stable flag requests synchronous writes. - * N.B. After this call fhp needs an fh_put +/** + * nfsd_write - open a file and write data to it + * @rqstp: RPC execution context + * @fhp: File handle of file to write into; nfsd_write() may modify it + * @offset: Byte offset of start + * @payload: xdr_buf containing the write payload + * @cnt: IN: number of bytes to write, OUT: number of bytes actually written + * @stable: An NFS stable_how value + * @verf: NFS WRITE verifier + * + * Upon return, caller must invoke fh_put on @fhp. + * + * Return values: + * An nfsstat value in network byte order. 
*/ __be32 nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t offset, - struct kvec *vec, int vlen, unsigned long *cnt, int stable) + const struct xdr_buf *payload, unsigned long *cnt, int stable, + __be32 *verf) { - struct file *file = NULL; - __be32 err = 0; + struct nfsd_file *nf; + __be32 err; trace_nfsd_write_start(rqstp, fhp, offset, *cnt); - err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_WRITE, &file); + err = nfsd_file_acquire_gc(rqstp, fhp, NFSD_MAY_WRITE, &nf); if (err) goto out; - err = nfsd_vfs_write(rqstp, fhp, file, offset, vec, vlen, cnt, stable); - fput(file); + err = nfsd_vfs_write(rqstp, fhp, nf, offset, payload, cnt, + stable, verf); + nfsd_file_put(nf); out: trace_nfsd_write_done(rqstp, fhp, offset, *cnt); return err; } -#ifdef CONFIG_NFSD_V3 -/* - * Commit all pending writes to stable storage. +/** + * nfsd_commit - Commit pending writes to stable storage + * @rqstp: RPC request being processed + * @fhp: NFS filehandle + * @nf: target file + * @offset: raw offset from beginning of file + * @count: raw count of bytes to sync + * @verf: filled in with the server's current write verifier * - * Note: we only guarantee that data that lies within the range specified - * by the 'offset' and 'count' parameters will be synced. + * Note: we guarantee that data that lies within the range specified + * by the 'offset' and 'count' parameters will be synced. The server + * is permitted to sync data that lies outside this range at the + * same time. * * Unfortunately we cannot lock the file to make sure we return full WCC * data to the client, as locking happens lower down in the filesystem. + * + * Return values: + * An nfsstat value in network byte order. */ __be32 -nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp, - loff_t offset, unsigned long count) +nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf, + u64 offset, u32 count, __be32 *verf) { - struct file *file; - loff_t end = LLONG_MAX; - __be32 err = nfserr_inval; + __be32 err = nfs_ok; + u64 maxbytes; + loff_t start, end; + struct nfsd_net *nn; - if (offset < 0) - goto out; - if (count != 0) { - end = offset + (loff_t)count - 1; - if (end < offset) - goto out; + trace_nfsd_commit_start(rqstp, fhp, offset, count); + + /* + * Convert the client-provided (offset, count) range to a + * (start, end) range. If the client-provided range falls + * outside the maximum file size of the underlying FS, + * clamp the sync range appropriately. 
+ */ + start = 0; + end = LLONG_MAX; + maxbytes = (u64)fhp->fh_dentry->d_sb->s_maxbytes; + if (offset < maxbytes) { + start = offset; + if (count && (offset + count - 1 < maxbytes)) + end = offset + count - 1; } - err = nfsd_open(rqstp, fhp, S_IFREG, - NFSD_MAY_WRITE|NFSD_MAY_NOT_BREAK_LEASE, &file); - if (err) - goto out; + nn = net_generic(nf->nf_net, nfsd_net_id); if (EX_ISSYNC(fhp->fh_export)) { - int err2 = vfs_fsync_range(file, offset, end, 0); - - if (err2 != -EINVAL) + errseq_t since = READ_ONCE(nf->nf_file->f_wb_err); + int err2; + + err2 = vfs_fsync_range(nf->nf_file, start, end, 0); + switch (err2) { + case 0: + nfsd_copy_write_verifier(verf, nn); + err2 = filemap_check_wb_err(nf->nf_file->f_mapping, + since); err = nfserrno(err2); - else + break; + case -EINVAL: err = nfserr_notsupp; - } + break; + default: + commit_reset_write_verifier(nn, rqstp, err2); + err = nfserrno(err2); + } + } else + nfsd_copy_write_verifier(verf, nn); - fput(file); -out: + trace_nfsd_commit_done(rqstp, fhp, offset, count); return err; } -#endif /* CONFIG_NFSD_V3 */ -static __be32 -nfsd_create_setattr(struct svc_rqst *rqstp, struct svc_fh *resfhp, - struct iattr *iap) +/** + * nfsd_create_setattr - Set a created file's attributes + * @rqstp: RPC transaction being executed + * @fhp: NFS filehandle of parent directory + * @resfhp: NFS filehandle of new object + * @attrs: requested attributes of new object + * + * Returns nfs_ok on success, or an nfsstat in network byte order. + */ +__be32 +nfsd_create_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, + struct svc_fh *resfhp, struct nfsd_attrs *attrs) { + struct iattr *iap = attrs->na_iattr; + __be32 status; + /* - * Mode has already been set earlier in create: + * Mode has already been set by file creation. */ iap->ia_valid &= ~ATTR_MODE; + /* * Setting uid/gid works only for root. Irix appears to * send along the gid on create when it tries to implement @@ -1157,10 +1713,31 @@ nfsd_create_setattr(struct svc_rqst *rqstp, struct svc_fh *resfhp, */ if (!uid_eq(current_fsuid(), GLOBAL_ROOT_UID)) iap->ia_valid &= ~(ATTR_UID|ATTR_GID); - if (iap->ia_valid) - return nfsd_setattr(rqstp, resfhp, iap, 0, (time_t)0); - /* Callers expect file metadata to be committed here */ - return nfserrno(commit_metadata(resfhp)); + + /* + * Callers expect new file metadata to be committed even + * if the attributes have not changed. + */ + if (nfsd_attrs_valid(attrs)) + status = nfsd_setattr(rqstp, resfhp, attrs, NULL); + else + status = nfserrno(commit_metadata(resfhp)); + + /* + * Transactional filesystems had a chance to commit changes + * for both parent and child simultaneously making the + * following commit_metadata a noop in many cases. + */ + if (!status) + status = nfserrno(commit_metadata(fhp)); + + /* + * Update the new filehandle to pick up the new attributes. + */ + if (!status) + status = fh_update(resfhp); + + return status; } /* HPUX client sometimes creates a file in mode 000, and sets size to 0. 
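nfsd_commit() above converts the client's (offset, count) pair into a (start, end) range for vfs_fsync_range(), clamping against the file system's maximum file size: an offset at or beyond s_maxbytes degenerates to syncing everything, and a zero count or a range that would cross s_maxbytes leaves end at LLONG_MAX. The clamping on its own, with s_maxbytes supplied as a plain parameter:

#include <limits.h>
#include <stdio.h>

static void commit_range(unsigned long long offset, unsigned int count,
			 unsigned long long maxbytes)
{
	long long start = 0, end = LLONG_MAX;

	if (offset < maxbytes) {
		start = offset;
		if (count && (offset + count - 1 < maxbytes))
			end = offset + count - 1;
	}
	printf("sync [%lld, %lld]\n", start, end);
}

int main(void)
{
	commit_range(4096, 8192, 1ULL << 53);      /* [4096, 12287] */
	commit_range(4096, 0, 1ULL << 53);         /* [4096, LLONG_MAX] */
	commit_range(1ULL << 60, 512, 1ULL << 53); /* [0, LLONG_MAX] */
	return 0;
}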
@@ -1178,30 +1755,24 @@ nfsd_check_ignore_resizing(struct iattr *iap) iap->ia_valid &= ~ATTR_SIZE; } -/* The parent directory should already be locked: */ +/* The parent directory should already be locked - we will unlock */ __be32 nfsd_create_locked(struct svc_rqst *rqstp, struct svc_fh *fhp, - char *fname, int flen, struct iattr *iap, - int type, dev_t rdev, struct svc_fh *resfhp) + struct nfsd_attrs *attrs, + int type, dev_t rdev, struct svc_fh *resfhp) { struct dentry *dentry, *dchild; struct inode *dirp; + struct iattr *iap = attrs->na_iattr; __be32 err; - __be32 err2; - int host_err; + int host_err = 0; dentry = fhp->fh_dentry; dirp = d_inode(dentry); dchild = dget(resfhp->fh_dentry); - if (!fhp->fh_locked) { - WARN_ONCE(1, "nfsd_create: parent %pd2 not locked!\n", - dentry); - err = nfserr_io; - goto out; - } - - err = nfsd_permission(rqstp, fhp->fh_export, dentry, NFSD_MAY_CREATE); + err = nfsd_permission(&rqstp->rq_cred, fhp->fh_export, dentry, + NFSD_MAY_CREATE); if (err) goto out; @@ -1209,44 +1780,34 @@ nfsd_create_locked(struct svc_rqst *rqstp, struct svc_fh *fhp, iap->ia_mode = 0; iap->ia_mode = (iap->ia_mode & S_IALLUGO) | type; + if (!IS_POSIXACL(dirp)) + iap->ia_mode &= ~current_umask(); + err = 0; - host_err = 0; switch (type) { case S_IFREG: - host_err = vfs_create(dirp, dchild, iap->ia_mode, true); + host_err = vfs_create(&nop_mnt_idmap, dchild, iap->ia_mode, NULL); if (!host_err) nfsd_check_ignore_resizing(iap); break; case S_IFDIR: - host_err = vfs_mkdir(dirp, dchild, iap->ia_mode); - if (!host_err && unlikely(d_unhashed(dchild))) { - struct dentry *d; - d = lookup_one_len(dchild->d_name.name, - dchild->d_parent, - dchild->d_name.len); - if (IS_ERR(d)) { - host_err = PTR_ERR(d); - break; - } - if (unlikely(d_is_negative(d))) { - dput(d); - err = nfserr_serverfault; - goto out; - } + dchild = vfs_mkdir(&nop_mnt_idmap, dirp, dchild, iap->ia_mode, NULL); + if (IS_ERR(dchild)) { + host_err = PTR_ERR(dchild); + } else if (d_is_negative(dchild)) { + err = nfserr_serverfault; + goto out; + } else if (unlikely(dchild != resfhp->fh_dentry)) { dput(resfhp->fh_dentry); - resfhp->fh_dentry = dget(d); - err = fh_update(resfhp); - dput(dchild); - dchild = d; - if (err) - goto out; + resfhp->fh_dentry = dget(dchild); } break; case S_IFCHR: case S_IFBLK: case S_IFIFO: case S_IFSOCK: - host_err = vfs_mknod(dirp, dchild, iap->ia_mode, rdev); + host_err = vfs_mknod(&nop_mnt_idmap, dirp, dchild, + iap->ia_mode, rdev, NULL); break; default: printk(KERN_WARNING "nfsd: bad file type %o in nfsd_create\n", @@ -1256,24 +1817,12 @@ nfsd_create_locked(struct svc_rqst *rqstp, struct svc_fh *fhp, if (host_err < 0) goto out_nfserr; - err = nfsd_create_setattr(rqstp, resfhp, iap); + err = nfsd_create_setattr(rqstp, fhp, resfhp, attrs); - /* - * nfsd_create_setattr already committed the child. Transactional - * filesystems had a chance to commit changes for both parent and - * child simultaneously making the following commit_metadata a - * noop. - */ - err2 = nfserrno(commit_metadata(fhp)); - if (err2) - err = err2; - /* - * Update the file handle to get the new inode info. 
- */ - if (!err) - err = fh_update(resfhp); out: - dput(dchild); + if (!err) + fh_fill_post_attrs(fhp); + end_creating(dchild); return err; out_nfserr: @@ -1289,13 +1838,15 @@ out_nfserr: */ __be32 nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, - char *fname, int flen, struct iattr *iap, - int type, dev_t rdev, struct svc_fh *resfhp) + char *fname, int flen, struct nfsd_attrs *attrs, + int type, dev_t rdev, struct svc_fh *resfhp) { struct dentry *dentry, *dchild = NULL; __be32 err; int host_err; + trace_nfsd_vfs_create(rqstp, fhp, type, fname, flen); + if (isdotent(fname, flen)) return nfserr_exist; @@ -1309,187 +1860,26 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, if (host_err) return nfserrno(host_err); - fh_lock_nested(fhp, I_MUTEX_PARENT); - dchild = lookup_one_len(fname, dentry, flen); + dchild = start_creating(&nop_mnt_idmap, dentry, &QSTR_LEN(fname, flen)); host_err = PTR_ERR(dchild); if (IS_ERR(dchild)) return nfserrno(host_err); - err = fh_compose(resfhp, fhp->fh_export, dchild, fhp); - /* - * We unconditionally drop our ref to dchild as fh_compose will have - * already grabbed its own ref for it. - */ - dput(dchild); - if (err) - return err; - return nfsd_create_locked(rqstp, fhp, fname, flen, iap, type, - rdev, resfhp); -} - -#ifdef CONFIG_NFSD_V3 - -/* - * NFSv3 and NFSv4 version of nfsd_create - */ -__be32 -do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, - char *fname, int flen, struct iattr *iap, - struct svc_fh *resfhp, int createmode, u32 *verifier, - bool *truncp, bool *created) -{ - struct dentry *dentry, *dchild = NULL; - struct inode *dirp; - __be32 err; - int host_err; - __u32 v_mtime=0, v_atime=0; - - err = nfserr_perm; - if (!flen) - goto out; - err = nfserr_exist; - if (isdotent(fname, flen)) - goto out; - if (!(iap->ia_valid & ATTR_MODE)) - iap->ia_mode = 0; - err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_EXEC); - if (err) - goto out; - - dentry = fhp->fh_dentry; - dirp = d_inode(dentry); - - host_err = fh_want_write(fhp); - if (host_err) - goto out_nfserr; - - fh_lock_nested(fhp, I_MUTEX_PARENT); - - /* - * Compose the response file handle. - */ - dchild = lookup_one_len(fname, dentry, flen); - host_err = PTR_ERR(dchild); - if (IS_ERR(dchild)) - goto out_nfserr; - - /* If file doesn't exist, check for permissions to create one */ - if (d_really_is_negative(dchild)) { - err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_CREATE); - if (err) - goto out; - } err = fh_compose(resfhp, fhp->fh_export, dchild, fhp); if (err) - goto out; - - if (nfsd_create_is_exclusive(createmode)) { - /* solaris7 gets confused (bugid 4218508) if these have - * the high bit set, so just clear the high bits. If this is - * ever changed to use different attrs for storing the - * verifier, then do_open_lookup() will also need to be fixed - * accordingly. - */ - v_mtime = verifier[0]&0x7fffffff; - v_atime = verifier[1]&0x7fffffff; - } - - if (d_really_is_positive(dchild)) { - err = 0; - - switch (createmode) { - case NFS3_CREATE_UNCHECKED: - if (! d_is_reg(dchild)) - goto out; - else if (truncp) { - /* in nfsv4, we need to treat this case a little - * differently. we don't want to truncate the - * file now; this would be wrong if the OPEN - * fails for some other reason. furthermore, - * if the size is nonzero, we should ignore it - * according to spec! 
- */ - *truncp = (iap->ia_valid & ATTR_SIZE) && !iap->ia_size; - } - else { - iap->ia_valid &= ATTR_SIZE; - goto set_attr; - } - break; - case NFS3_CREATE_EXCLUSIVE: - if ( d_inode(dchild)->i_mtime.tv_sec == v_mtime - && d_inode(dchild)->i_atime.tv_sec == v_atime - && d_inode(dchild)->i_size == 0 ) { - if (created) - *created = 1; - break; - } - /* fall through */ - case NFS4_CREATE_EXCLUSIVE4_1: - if ( d_inode(dchild)->i_mtime.tv_sec == v_mtime - && d_inode(dchild)->i_atime.tv_sec == v_atime - && d_inode(dchild)->i_size == 0 ) { - if (created) - *created = 1; - goto set_attr; - } - /* fall through */ - case NFS3_CREATE_GUARDED: - err = nfserr_exist; - } - fh_drop_write(fhp); - goto out; - } - - host_err = vfs_create(dirp, dchild, iap->ia_mode, true); - if (host_err < 0) { - fh_drop_write(fhp); - goto out_nfserr; - } - if (created) - *created = 1; - - nfsd_check_ignore_resizing(iap); - - if (nfsd_create_is_exclusive(createmode)) { - /* Cram the verifier into atime/mtime */ - iap->ia_valid = ATTR_MTIME|ATTR_ATIME - | ATTR_MTIME_SET|ATTR_ATIME_SET; - /* XXX someone who knows this better please fix it for nsec */ - iap->ia_mtime.tv_sec = v_mtime; - iap->ia_atime.tv_sec = v_atime; - iap->ia_mtime.tv_nsec = 0; - iap->ia_atime.tv_nsec = 0; - } - - set_attr: - err = nfsd_create_setattr(rqstp, resfhp, iap); - - /* - * nfsd_create_setattr already committed the child - * (and possibly also the parent). - */ - if (!err) - err = nfserrno(commit_metadata(fhp)); - - /* - * Update the filehandle to get the new inode info. - */ - if (!err) - err = fh_update(resfhp); + goto out_unlock; + err = fh_fill_pre_attrs(fhp); + if (err != nfs_ok) + goto out_unlock; + err = nfsd_create_locked(rqstp, fhp, attrs, type, rdev, resfhp); + /* nfsd_create_locked() unlocked the parent */ + dput(dchild); + return err; - out: - fh_unlock(fhp); - if (dchild && !IS_ERR(dchild)) - dput(dchild); - fh_drop_write(fhp); - return err; - - out_nfserr: - err = nfserrno(host_err); - goto out; +out_unlock: + end_creating(dchild); + return err; } -#endif /* CONFIG_NFSD_V3 */ /* * Read a symlink. On entry, *lenp must contain the maximum path length that @@ -1529,20 +1919,32 @@ nfsd_readlink(struct svc_rqst *rqstp, struct svc_fh *fhp, char *buf, int *lenp) return 0; } -/* - * Create a symlink and look up its inode +/** + * nfsd_symlink - Create a symlink and look up its inode + * @rqstp: RPC transaction being executed + * @fhp: NFS filehandle of parent directory + * @fname: filename of the new symlink + * @flen: length of @fname + * @path: content of the new symlink (NUL-terminated) + * @attrs: requested attributes of new object + * @resfhp: NFS filehandle of new object + * * N.B. After this call _both_ fhp and resfhp need an fh_put + * + * Returns nfs_ok on success, or an nfsstat in network byte order. 
*/ __be32 nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp, - char *fname, int flen, - char *path, - struct svc_fh *resfhp) + char *fname, int flen, + char *path, struct nfsd_attrs *attrs, + struct svc_fh *resfhp) { struct dentry *dentry, *dnew; __be32 err, cerr; int host_err; + trace_nfsd_vfs_symlink(rqstp, fhp, fname, flen, path); + err = nfserr_noent; if (!flen || path[0] == '\0') goto out; @@ -1555,38 +1957,49 @@ nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp, goto out; host_err = fh_want_write(fhp); - if (host_err) - goto out_nfserr; + if (host_err) { + err = nfserrno(host_err); + goto out; + } - fh_lock(fhp); dentry = fhp->fh_dentry; - dnew = lookup_one_len(fname, dentry, flen); - host_err = PTR_ERR(dnew); - if (IS_ERR(dnew)) - goto out_nfserr; - - host_err = vfs_symlink(d_inode(dentry), dnew, path); + dnew = start_creating(&nop_mnt_idmap, dentry, &QSTR_LEN(fname, flen)); + if (IS_ERR(dnew)) { + err = nfserrno(PTR_ERR(dnew)); + goto out_drop_write; + } + err = fh_fill_pre_attrs(fhp); + if (err != nfs_ok) + goto out_unlock; + host_err = vfs_symlink(&nop_mnt_idmap, d_inode(dentry), dnew, path, NULL); err = nfserrno(host_err); + cerr = fh_compose(resfhp, fhp->fh_export, dnew, fhp); + if (!err) + nfsd_create_setattr(rqstp, fhp, resfhp, attrs); + fh_fill_post_attrs(fhp); +out_unlock: + end_creating(dnew); if (!err) err = nfserrno(commit_metadata(fhp)); - fh_unlock(fhp); - + if (!err) + err = cerr; +out_drop_write: fh_drop_write(fhp); - - cerr = fh_compose(resfhp, fhp->fh_export, dnew, fhp); - dput(dnew); - if (err==0) err = cerr; out: return err; - -out_nfserr: - err = nfserrno(host_err); - goto out; } -/* - * Create a hardlink - * N.B. After this call _both_ ffhp and tfhp need an fh_put +/** + * nfsd_link - create a link + * @rqstp: RPC transaction context + * @ffhp: the file handle of the directory where the new link is to be created + * @name: the filename of the new link + * @len: the length of @name in octets + * @tfhp: the file handle of an existing file object + * + * After this call _both_ ffhp and tfhp need an fh_put. + * + * Returns a generic NFS status code in network byte-order. 
*/ __be32 nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp, @@ -1594,9 +2007,12 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp, { struct dentry *ddir, *dnew, *dold; struct inode *dirp; + int type; __be32 err; int host_err; + trace_nfsd_vfs_link(rqstp, ffhp, tfhp, name, len); + err = fh_verify(rqstp, ffhp, S_IFDIR, NFSD_MAY_CREATE); if (err) goto out; @@ -1613,62 +2029,101 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp, if (isdotent(name, len)) goto out; + err = nfs_ok; + type = d_inode(tfhp->fh_dentry)->i_mode & S_IFMT; host_err = fh_want_write(tfhp); - if (host_err) { - err = nfserrno(host_err); + if (host_err) goto out; - } - fh_lock_nested(ffhp, I_MUTEX_PARENT); ddir = ffhp->fh_dentry; dirp = d_inode(ddir); + dnew = start_creating(&nop_mnt_idmap, ddir, &QSTR_LEN(name, len)); - dnew = lookup_one_len(name, ddir, len); - host_err = PTR_ERR(dnew); - if (IS_ERR(dnew)) - goto out_nfserr; + if (IS_ERR(dnew)) { + host_err = PTR_ERR(dnew); + goto out_drop_write; + } dold = tfhp->fh_dentry; err = nfserr_noent; if (d_really_is_negative(dold)) - goto out_dput; - host_err = vfs_link(dold, dirp, dnew, NULL); + goto out_unlock; + err = fh_fill_pre_attrs(ffhp); + if (err != nfs_ok) + goto out_unlock; + host_err = vfs_link(dold, &nop_mnt_idmap, dirp, dnew, NULL); + fh_fill_post_attrs(ffhp); +out_unlock: + end_creating(dnew); if (!host_err) { - err = nfserrno(commit_metadata(ffhp)); - if (!err) - err = nfserrno(commit_metadata(tfhp)); - } else { - if (host_err == -EXDEV && rqstp->rq_vers == 2) - err = nfserr_acces; - else - err = nfserrno(host_err); + host_err = commit_metadata(ffhp); + if (!host_err) + host_err = commit_metadata(tfhp); } -out_dput: - dput(dnew); -out_unlock: - fh_unlock(ffhp); + +out_drop_write: fh_drop_write(tfhp); + if (host_err == -EBUSY) { + /* + * See RFC 8881 Section 18.9.4 para 1-2: NFSv4 LINK + * wants a status unique to the object type. + */ + if (type != S_IFDIR) + err = nfserr_file_open; + else + err = nfserr_acces; + } out: - return err; + return err != nfs_ok ? err : nfserrno(host_err); +} -out_nfserr: - err = nfserrno(host_err); - goto out_unlock; +static void +nfsd_close_cached_files(struct dentry *dentry) +{ + struct inode *inode = d_inode(dentry); + + if (inode && S_ISREG(inode->i_mode)) + nfsd_file_close_inode_sync(inode); } -/* - * Rename a file - * N.B. After this call _both_ ffhp and tfhp need an fh_put +static bool +nfsd_has_cached_files(struct dentry *dentry) +{ + bool ret = false; + struct inode *inode = d_inode(dentry); + + if (inode && S_ISREG(inode->i_mode)) + ret = nfsd_file_is_cached(inode); + return ret; +} + +/** + * nfsd_rename - rename a directory entry + * @rqstp: RPC transaction context + * @ffhp: the file handle of parent directory containing the entry to be renamed + * @fname: the filename of directory entry to be renamed + * @flen: the length of @fname in octets + * @tfhp: the file handle of parent directory to contain the renamed entry + * @tname: the filename of the new entry + * @tlen: the length of @tlen in octets + * + * After this call _both_ ffhp and tfhp need an fh_put. + * + * Returns a generic NFS status code in network byte-order. 
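nfsd_link() above (and nfsd_rename() just below) now translates an -EBUSY result from the VFS into a type-dependent NFS status, following the RFC 8881 guidance cited in the hunks: NFS4ERR_FILE_OPEN for non-directories and NFS4ERR_ACCESS for directories. The mapping reduced to a sketch, with a local enum standing in for the wire status codes:

#include <errno.h>
#include <stdio.h>

enum model_status { ST_OK, ST_ACCES, ST_FILE_OPEN, ST_IO };

/* is_dir reflects the S_IFMT check the hunks perform on the target object. */
static enum model_status map_busy(int host_err, int is_dir)
{
	if (host_err == 0)
		return ST_OK;
	if (host_err == -EBUSY)
		return is_dir ? ST_ACCES : ST_FILE_OPEN;
	return ST_IO;   /* everything else still goes through nfserrno() */
}

int main(void)
{
	printf("%d %d\n", map_busy(-EBUSY, 0), map_busy(-EBUSY, 1)); /* 2 1 */
	return 0;
}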
*/ __be32 nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen, struct svc_fh *tfhp, char *tname, int tlen) { - struct dentry *fdentry, *tdentry, *odentry, *ndentry, *trap; - struct inode *fdir, *tdir; + struct dentry *fdentry, *tdentry; + int type = S_IFDIR; + struct renamedata rd = {}; __be32 err; int host_err; + struct dentry *close_cached; + + trace_nfsd_vfs_rename(rqstp, ffhp, tfhp, fname, flen, tname, tlen); err = fh_verify(rqstp, ffhp, S_IFDIR, NFSD_MAY_REMOVE); if (err) @@ -1678,84 +2133,118 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen, goto out; fdentry = ffhp->fh_dentry; - fdir = d_inode(fdentry); tdentry = tfhp->fh_dentry; - tdir = d_inode(tdentry); err = nfserr_perm; if (!flen || isdotent(fname, flen) || !tlen || isdotent(tname, tlen)) goto out; + err = nfserr_xdev; + if (ffhp->fh_export->ex_path.mnt != tfhp->fh_export->ex_path.mnt) + goto out; + if (ffhp->fh_export->ex_path.dentry != tfhp->fh_export->ex_path.dentry) + goto out; + +retry: + close_cached = NULL; host_err = fh_want_write(ffhp); if (host_err) { err = nfserrno(host_err); goto out; } - /* cannot use fh_lock as we need deadlock protective ordering - * so do it by hand */ - trap = lock_rename(tdentry, fdentry); - ffhp->fh_locked = tfhp->fh_locked = true; - fill_pre_wcc(ffhp); - fill_pre_wcc(tfhp); + rd.mnt_idmap = &nop_mnt_idmap; + rd.old_parent = fdentry; + rd.new_parent = tdentry; - odentry = lookup_one_len(fname, fdentry, flen); - host_err = PTR_ERR(odentry); - if (IS_ERR(odentry)) - goto out_nfserr; + host_err = start_renaming(&rd, 0, &QSTR_LEN(fname, flen), + &QSTR_LEN(tname, tlen)); - host_err = -ENOENT; - if (d_really_is_negative(odentry)) - goto out_dput_old; - host_err = -EINVAL; - if (odentry == trap) - goto out_dput_old; - - ndentry = lookup_one_len(tname, tdentry, tlen); - host_err = PTR_ERR(ndentry); - if (IS_ERR(ndentry)) - goto out_dput_old; - host_err = -ENOTEMPTY; - if (ndentry == trap) - goto out_dput_new; - - host_err = -EXDEV; - if (ffhp->fh_export->ex_path.mnt != tfhp->fh_export->ex_path.mnt) - goto out_dput_new; - if (ffhp->fh_export->ex_path.dentry != tfhp->fh_export->ex_path.dentry) - goto out_dput_new; + if (host_err) { + err = nfserrno(host_err); + goto out_want_write; + } + err = fh_fill_pre_attrs(ffhp); + if (err != nfs_ok) + goto out_unlock; + err = fh_fill_pre_attrs(tfhp); + if (err != nfs_ok) + goto out_unlock; + + type = d_inode(rd.old_dentry)->i_mode & S_IFMT; + + if (d_inode(rd.new_dentry)) + type = d_inode(rd.new_dentry)->i_mode & S_IFMT; + + if ((rd.new_dentry->d_sb->s_export_op->flags & EXPORT_OP_CLOSE_BEFORE_UNLINK) && + nfsd_has_cached_files(rd.new_dentry)) { + close_cached = dget(rd.new_dentry); + goto out_unlock; + } else { + int retries; - host_err = vfs_rename(fdir, odentry, tdir, ndentry, NULL, 0); - if (!host_err) { - host_err = commit_metadata(tfhp); - if (!host_err) - host_err = commit_metadata(ffhp); + for (retries = 1;;) { + host_err = vfs_rename(&rd); + if (host_err != -EAGAIN || !retries--) + break; + if (!nfsd_wait_for_delegreturn(rqstp, d_inode(rd.old_dentry))) + break; + } + if (!host_err) { + host_err = commit_metadata(tfhp); + if (!host_err) + host_err = commit_metadata(ffhp); + } } - out_dput_new: - dput(ndentry); - out_dput_old: - dput(odentry); - out_nfserr: - err = nfserrno(host_err); - /* - * We cannot rely on fh_unlock on the two filehandles, - * as that would do the wrong thing if the two directories - * were the same, so again we do it by hand. 
- */ - fill_post_wcc(ffhp); - fill_post_wcc(tfhp); - unlock_rename(tdentry, fdentry); - ffhp->fh_locked = tfhp->fh_locked = false; + if (host_err == -EBUSY) { + /* + * See RFC 8881 Section 18.26.4 para 1-3: NFSv4 RENAME + * wants a status unique to the object type. + */ + if (type != S_IFDIR) + err = nfserr_file_open; + else + err = nfserr_acces; + } else { + err = nfserrno(host_err); + } + + if (!close_cached) { + fh_fill_post_attrs(ffhp); + fh_fill_post_attrs(tfhp); + } +out_unlock: + end_renaming(&rd); +out_want_write: fh_drop_write(ffhp); + /* + * If the target dentry has cached open files, then we need to + * try to close them prior to doing the rename. Final fput + * shouldn't be done with locks held however, so we delay it + * until this point and then reattempt the whole shebang. + */ + if (close_cached) { + nfsd_close_cached_files(close_cached); + dput(close_cached); + goto retry; + } out: return err; } -/* - * Unlink a file or directory - * N.B. After this call fhp needs an fh_put +/** + * nfsd_unlink - remove a directory entry + * @rqstp: RPC transaction context + * @fhp: the file handle of the parent directory to be modified + * @type: enforced file type of the object to be removed + * @fname: the name of directory entry to be removed + * @flen: length of @fname in octets + * + * After this call fhp needs an fh_put. + * + * Returns a generic NFS status code in network byte-order. */ __be32 nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, @@ -1763,9 +2252,12 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, { struct dentry *dentry, *rdentry; struct inode *dirp; + struct inode *rinode = NULL; __be32 err; int host_err; + trace_nfsd_vfs_unlink(rqstp, fhp, fname, flen); + err = nfserr_acces; if (!flen || isdotent(fname, flen)) goto out; @@ -1777,36 +2269,65 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, if (host_err) goto out_nfserr; - fh_lock_nested(fhp, I_MUTEX_PARENT); dentry = fhp->fh_dentry; dirp = d_inode(dentry); - rdentry = lookup_one_len(fname, dentry, flen); + rdentry = start_removing(&nop_mnt_idmap, dentry, &QSTR_LEN(fname, flen)); + host_err = PTR_ERR(rdentry); if (IS_ERR(rdentry)) - goto out_nfserr; + goto out_drop_write; - if (d_really_is_negative(rdentry)) { - dput(rdentry); - err = nfserr_noent; - goto out; - } + err = fh_fill_pre_attrs(fhp); + if (err != nfs_ok) + goto out_unlock; + + rinode = d_inode(rdentry); + /* Prevent truncation until after locks dropped */ + ihold(rinode); if (!type) type = d_inode(rdentry)->i_mode & S_IFMT; - if (type != S_IFDIR) - host_err = vfs_unlink(dirp, rdentry, NULL); - else - host_err = vfs_rmdir(dirp, rdentry); - if (!host_err) + if (type != S_IFDIR) { + int retries; + + if (rdentry->d_sb->s_export_op->flags & EXPORT_OP_CLOSE_BEFORE_UNLINK) + nfsd_close_cached_files(rdentry); + + for (retries = 1;;) { + host_err = vfs_unlink(&nop_mnt_idmap, dirp, rdentry, NULL); + if (host_err != -EAGAIN || !retries--) + break; + if (!nfsd_wait_for_delegreturn(rqstp, rinode)) + break; + } + } else { + host_err = vfs_rmdir(&nop_mnt_idmap, dirp, rdentry, NULL); + } + fh_fill_post_attrs(fhp); + +out_unlock: + end_removing(rdentry); + if (!err && !host_err) host_err = commit_metadata(fhp); - dput(rdentry); + iput(rinode); /* truncate the inode here */ +out_drop_write: + fh_drop_write(fhp); out_nfserr: - err = nfserrno(host_err); + if (host_err == -EBUSY) { + /* + * See RFC 8881 Section 18.25.4 para 4: NFSv4 REMOVE + * wants a status unique to the object type. 
+ */ + if (type != S_IFDIR) + err = nfserr_file_open; + else + err = nfserr_acces; + } out: - return err; + return err != nfs_ok ? err : nfserrno(host_err); } /* @@ -1831,7 +2352,7 @@ struct readdir_data { int full; }; -static int nfsd_buffered_filldir(struct dir_context *ctx, const char *name, +static bool nfsd_buffered_filldir(struct dir_context *ctx, const char *name, int namlen, loff_t offset, u64 ino, unsigned int d_type) { @@ -1843,7 +2364,7 @@ static int nfsd_buffered_filldir(struct dir_context *ctx, const char *name, reclen = ALIGN(sizeof(struct buffered_dirent) + namlen, sizeof(u64)); if (buf->used + reclen > PAGE_SIZE) { buf->full = 1; - return -EINVAL; + return false; } de->namlen = namlen; @@ -1853,11 +2374,12 @@ static int nfsd_buffered_filldir(struct dir_context *ctx, const char *name, memcpy(de->name, name, namlen); buf->used += reclen; - return 0; + return true; } -static __be32 nfsd_buffered_readdir(struct file *file, nfsd_filldir_t func, - struct readdir_cd *cdp, loff_t *offsetp) +static __be32 nfsd_buffered_readdir(struct file *file, struct svc_fh *fhp, + nfsd_filldir_t func, struct readdir_cd *cdp, + loff_t *offsetp) { struct buffered_dirent *de; int host_err; @@ -1903,6 +2425,8 @@ static __be32 nfsd_buffered_readdir(struct file *file, nfsd_filldir_t func, if (cdp->err != nfs_ok) break; + trace_nfsd_dirent(fhp, de->ino, de->name, de->namlen); + reclen = ALIGN(sizeof(*de) + de->namlen, sizeof(u64)); size -= reclen; @@ -1923,9 +2447,23 @@ static __be32 nfsd_buffered_readdir(struct file *file, nfsd_filldir_t func, return cdp->err; } -/* - * Read entries from a directory. - * The NFSv3/4 verifier we ignore for now. +/** + * nfsd_readdir - Read entries from a directory + * @rqstp: RPC transaction context + * @fhp: NFS file handle of directory to be read + * @offsetp: OUT: seek offset of final entry that was read + * @cdp: OUT: an eof error value + * @func: entry filler actor + * + * This implementation ignores the NFSv3/4 verifier cookie. + * + * NB: normal system calls hold file->f_pos_lock when calling + * ->iterate_shared and ->llseek, but nfsd_readdir() does not. + * Because the struct file acquired here is not visible to other + * threads, its internal state does not need mutex protection. + * + * Returns nfs_ok on success, otherwise an nfsstat code is + * returned. */ __be32 nfsd_readdir(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t *offsetp, @@ -1936,30 +2474,63 @@ nfsd_readdir(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t *offsetp, loff_t offset = *offsetp; int may_flags = NFSD_MAY_READ; - /* NFSv2 only supports 32 bit cookies */ - if (rqstp->rq_vers > 2) - may_flags |= NFSD_MAY_64BIT_COOKIE; - err = nfsd_open(rqstp, fhp, S_IFDIR, may_flags, &file); if (err) goto out; + if (fhp->fh_64bit_cookies) + file->f_mode |= FMODE_64BITHASH; + else + file->f_mode |= FMODE_32BITHASH; + offset = vfs_llseek(file, offset, SEEK_SET); if (offset < 0) { err = nfserrno((int)offset); goto out_close; } - err = nfsd_buffered_readdir(file, func, cdp, offsetp); + err = nfsd_buffered_readdir(file, fhp, func, cdp, offsetp); if (err == nfserr_eof || err == nfserr_toosmall) err = nfs_ok; /* can still be found in ->err */ out_close: - fput(file); + nfsd_filp_close(file); out: return err; } +/** + * nfsd_filp_close: close a file synchronously + * @fp: the file to close + * + * nfsd_filp_close() is similar in behaviour to filp_close().
+ * The difference is that if this is the final close on the + * file, then that finalisation happens immediately, rather than + * being handed over to a work_queue, as is the case for + * filp_close(). + * When a user-space process closes a file (even when using + * filp_close()) the finalisation happens before returning to + * userspace, so it is effectively synchronous. When a kernel thread + * uses filp_close(), on the other hand, the handling is completely + * asynchronous. This means that any cost imposed by that finalisation + * is not imposed on the nfsd thread, and nfsd could potentially + * close files more quickly than the work queue finalises the close, + * which would lead to unbounded growth in the queue. + * + * In some contexts it is not safe to synchronously wait for + * close finalisation (see comment for __fput_sync()), but nfsd + * does not match those contexts. In particular it does not, at the + * time that this function is called, hold any locks, and no finalisation + * of any file, socket, or device driver would have any cause to wait + * for nfsd to make progress. + */ +void nfsd_filp_close(struct file *fp) +{ + get_file(fp); + filp_close(fp, NULL); + __fput_sync(fp); +} + /* * Get file system stats * N.B. After this call fhp needs an fh_put @@ -1969,6 +2540,8 @@ nfsd_statfs(struct svc_rqst *rqstp, struct svc_fh *fhp, struct kstatfs *stat, in { __be32 err; + trace_nfsd_vfs_statfs(rqstp, fhp); + err = fh_verify(rqstp, fhp, 0, NFSD_MAY_NOP | access); if (!err) { struct path path = { @@ -1981,17 +2554,254 @@ nfsd_statfs(struct svc_rqst *rqstp, struct svc_fh *fhp, struct kstatfs *stat, in return err; } -static int exp_rdonly(struct svc_rqst *rqstp, struct svc_export *exp) +static int exp_rdonly(struct svc_cred *cred, struct svc_export *exp) { - return nfsexp_flags(rqstp, exp) & NFSEXP_READONLY; + return nfsexp_flags(cred, exp) & NFSEXP_READONLY; } +#ifdef CONFIG_NFSD_V4 +/* + * Helper function to translate error numbers. In the case of xattr operations, + * some error codes need to be translated outside of the standard translations. + * + * ENODATA needs to be translated to nfserr_noxattr. + * E2BIG to nfserr_xattr2big. + * + * Additionally, vfs_listxattr can return -ERANGE. This means that the + * file has too many extended attributes to retrieve inside an + * XATTR_LIST_MAX sized buffer. This is a bug in the xattr implementation: + * filesystems will allow the adding of extended attributes until they hit + * their own internal limit. This limit may be larger than XATTR_LIST_MAX. + * So, at that point, the attributes are present and valid, but can't + * be retrieved using listxattr, since the upper level xattr code enforces + * the XATTR_LIST_MAX limit. + * + * This bug means that we need to deal with listxattr returning -ERANGE. The + * best mapping is to return TOOSMALL. + */ +static __be32 +nfsd_xattr_errno(int err) +{ + switch (err) { + case -ENODATA: + return nfserr_noxattr; + case -E2BIG: + return nfserr_xattr2big; + case -ERANGE: + return nfserr_toosmall; + } + return nfserrno(err); } + +/* + * Retrieve the specified user extended attribute. To avoid always + * having to allocate the maximum size (since we are not getting + * a maximum size from the RPC), do a probe + alloc. Hold a reader + * lock on i_rwsem to prevent the extended attribute from changing + * size while we're doing this.
+ */ +__be32 +nfsd_getxattr(struct svc_rqst *rqstp, struct svc_fh *fhp, char *name, + void **bufp, int *lenp) +{ + ssize_t len; + __be32 err; + char *buf; + struct inode *inode; + struct dentry *dentry; + + err = fh_verify(rqstp, fhp, 0, NFSD_MAY_READ); + if (err) + return err; + + err = nfs_ok; + dentry = fhp->fh_dentry; + inode = d_inode(dentry); + + inode_lock_shared(inode); + + len = vfs_getxattr(&nop_mnt_idmap, dentry, name, NULL, 0); + + /* + * Zero-length attribute, just return. + */ + if (len == 0) { + *bufp = NULL; + *lenp = 0; + goto out; + } + + if (len < 0) { + err = nfsd_xattr_errno(len); + goto out; + } + + if (len > *lenp) { + err = nfserr_toosmall; + goto out; + } + + buf = kvmalloc(len, GFP_KERNEL); + if (buf == NULL) { + err = nfserr_jukebox; + goto out; + } + + len = vfs_getxattr(&nop_mnt_idmap, dentry, name, buf, len); + if (len <= 0) { + kvfree(buf); + buf = NULL; + err = nfsd_xattr_errno(len); + } + + *lenp = len; + *bufp = buf; + +out: + inode_unlock_shared(inode); + + return err; +} + +/* + * Retrieve the xattr names. Since we can't know how many are + * user extended attributes, we must get all attributes here, + * and have the XDR encode filter out the "user." ones. + * + * While this could always just allocate an XATTR_LIST_MAX + * buffer, that's a waste, so do a probe + allocate. To + * avoid any changes between the probe and allocate, wrap + * this in inode_lock. + */ +__be32 +nfsd_listxattr(struct svc_rqst *rqstp, struct svc_fh *fhp, char **bufp, + int *lenp) +{ + ssize_t len; + __be32 err; + char *buf; + struct inode *inode; + struct dentry *dentry; + + err = fh_verify(rqstp, fhp, 0, NFSD_MAY_READ); + if (err) + return err; + + dentry = fhp->fh_dentry; + inode = d_inode(dentry); + *lenp = 0; + + inode_lock_shared(inode); + + len = vfs_listxattr(dentry, NULL, 0); + if (len <= 0) { + err = nfsd_xattr_errno(len); + goto out; + } + + if (len > XATTR_LIST_MAX) { + err = nfserr_xattr2big; + goto out; + } + + buf = kvmalloc(len, GFP_KERNEL); + if (buf == NULL) { + err = nfserr_jukebox; + goto out; + } + + len = vfs_listxattr(dentry, buf, len); + if (len <= 0) { + kvfree(buf); + err = nfsd_xattr_errno(len); + goto out; + } + + *lenp = len; + *bufp = buf; + + err = nfs_ok; +out: + inode_unlock_shared(inode); + + return err; +} + +/** + * nfsd_removexattr - Remove an extended attribute + * @rqstp: RPC transaction being executed + * @fhp: NFS filehandle of object with xattr to remove + * @name: name of xattr to remove (NUL-terminate) + * + * Pass in a NULL pointer for delegated_inode, and let the client deal + * with NFS4ERR_DELAY (same as with e.g. setattr and remove). + * + * Returns nfs_ok on success, or an nfsstat in network byte order. 
+ */ +__be32 +nfsd_removexattr(struct svc_rqst *rqstp, struct svc_fh *fhp, char *name) +{ + __be32 err; + int ret; + + err = fh_verify(rqstp, fhp, 0, NFSD_MAY_WRITE); + if (err) + return err; + + ret = fh_want_write(fhp); + if (ret) + return nfserrno(ret); + + inode_lock(fhp->fh_dentry->d_inode); + err = fh_fill_pre_attrs(fhp); + if (err != nfs_ok) + goto out_unlock; + ret = __vfs_removexattr_locked(&nop_mnt_idmap, fhp->fh_dentry, + name, NULL); + err = nfsd_xattr_errno(ret); + fh_fill_post_attrs(fhp); +out_unlock: + inode_unlock(fhp->fh_dentry->d_inode); + fh_drop_write(fhp); + + return err; +} + +__be32 +nfsd_setxattr(struct svc_rqst *rqstp, struct svc_fh *fhp, char *name, + void *buf, u32 len, u32 flags) +{ + __be32 err; + int ret; + + err = fh_verify(rqstp, fhp, 0, NFSD_MAY_WRITE); + if (err) + return err; + + ret = fh_want_write(fhp); + if (ret) + return nfserrno(ret); + inode_lock(fhp->fh_dentry->d_inode); + err = fh_fill_pre_attrs(fhp); + if (err != nfs_ok) + goto out_unlock; + ret = __vfs_setxattr_locked(&nop_mnt_idmap, fhp->fh_dentry, + name, buf, len, flags, NULL); + fh_fill_post_attrs(fhp); + err = nfsd_xattr_errno(ret); +out_unlock: + inode_unlock(fhp->fh_dentry->d_inode); + fh_drop_write(fhp); + return err; +} +#endif + /* * Check for a user's access permissions to this inode. */ __be32 -nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp, - struct dentry *dentry, int acc) +nfsd_permission(struct svc_cred *cred, struct svc_export *exp, + struct dentry *dentry, int acc) { struct inode *inode = d_inode(dentry); int err; @@ -2006,7 +2816,7 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp, (acc & NFSD_MAY_EXEC)? " exec" : "", (acc & NFSD_MAY_SATTR)? " sattr" : "", (acc & NFSD_MAY_TRUNC)? " trunc" : "", - (acc & NFSD_MAY_LOCK)? " lock" : "", + (acc & NFSD_MAY_NLM)? " nlm" : "", (acc & NFSD_MAY_OWNER_OVERRIDE)? " owneroverride" : "", inode->i_mode, IS_IMMUTABLE(inode)? " immut" : "", @@ -2022,7 +2832,7 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp, */ if (!(acc & NFSD_MAY_LOCAL_ACCESS)) if (acc & (NFSD_MAY_WRITE | NFSD_MAY_SATTR | NFSD_MAY_TRUNC)) { - if (exp_rdonly(rqstp, exp) || + if (exp_rdonly(cred, exp) || __mnt_is_readonly(exp->ex_path.mnt)) return nfserr_rofs; if (/* (acc & NFSD_MAY_WRITE) && */ IS_IMMUTABLE(inode)) @@ -2031,16 +2841,6 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp, if ((acc & NFSD_MAY_TRUNC) && IS_APPEND(inode)) return nfserr_perm; - if (acc & NFSD_MAY_LOCK) { - /* If we cannot rely on authentication in NLM requests, - * just allow locks, otherwise require read permission, or - * ownership - */ - if (exp->ex_flags & NFSEXP_NOAUTHNLM) - return 0; - else - acc = NFSD_MAY_READ | NFSD_MAY_OWNER_OVERRIDE; - } /* * The file owner always gets access permission for accesses that * would normally be checked at open time. This is to make @@ -2060,73 +2860,14 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp, return 0; /* This assumes NFSD_MAY_{READ,WRITE,EXEC} == MAY_{READ,WRITE,EXEC} */ - err = inode_permission(inode, acc & (MAY_READ|MAY_WRITE|MAY_EXEC)); + err = inode_permission(&nop_mnt_idmap, inode, + acc & (MAY_READ | MAY_WRITE | MAY_EXEC)); /* Allow read access to binaries even when mode 111 */ if (err == -EACCES && S_ISREG(inode->i_mode) && (acc == (NFSD_MAY_READ | NFSD_MAY_OWNER_OVERRIDE) || acc == (NFSD_MAY_READ | NFSD_MAY_READ_IF_EXEC))) - err = inode_permission(inode, MAY_EXEC); + err = inode_permission(&nop_mnt_idmap, inode, MAY_EXEC); return err? 
nfserrno(err) : 0; } - -void -nfsd_racache_shutdown(void) -{ - struct raparms *raparm, *last_raparm; - unsigned int i; - - dprintk("nfsd: freeing readahead buffers.\n"); - - for (i = 0; i < RAPARM_HASH_SIZE; i++) { - raparm = raparm_hash[i].pb_head; - while(raparm) { - last_raparm = raparm; - raparm = raparm->p_next; - kfree(last_raparm); - } - raparm_hash[i].pb_head = NULL; - } -} -/* - * Initialize readahead param cache - */ -int -nfsd_racache_init(int cache_size) -{ - int i; - int j = 0; - int nperbucket; - struct raparms **raparm = NULL; - - - if (raparm_hash[0].pb_head) - return 0; - nperbucket = DIV_ROUND_UP(cache_size, RAPARM_HASH_SIZE); - nperbucket = max(2, nperbucket); - cache_size = nperbucket * RAPARM_HASH_SIZE; - - dprintk("nfsd: allocating %d readahead buffers.\n", cache_size); - - for (i = 0; i < RAPARM_HASH_SIZE; i++) { - spin_lock_init(&raparm_hash[i].pb_lock); - - raparm = &raparm_hash[i].pb_head; - for (j = 0; j < nperbucket; j++) { - *raparm = kzalloc(sizeof(struct raparms), GFP_KERNEL); - if (!*raparm) - goto out_nomem; - raparm = &(*raparm)->p_next; - } - *raparm = NULL; - } - - nfsdstats.ra_size = cache_size; - return 0; - -out_nomem: - dprintk("nfsd: kmalloc failed, freeing readahead buffers\n"); - nfsd_racache_shutdown(); - return -ENOMEM; -} diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h index a7e107309f76..ded2900d423f 100644 --- a/fs/nfsd/vfs.h +++ b/fs/nfsd/vfs.h @@ -6,6 +6,8 @@ #ifndef LINUX_NFSD_VFS_H #define LINUX_NFSD_VFS_H +#include <linux/fs.h> +#include <linux/posix_acl.h> #include "nfsfh.h" #include "nfsd.h" @@ -18,7 +20,7 @@ #define NFSD_MAY_READ 0x004 /* == MAY_READ */ #define NFSD_MAY_SATTR 0x008 #define NFSD_MAY_TRUNC 0x010 -#define NFSD_MAY_LOCK 0x020 +#define NFSD_MAY_NLM 0x020 /* request is from lockd */ #define NFSD_MAY_MASK 0x03f /* extra hints to permission and open routines: */ @@ -31,17 +33,44 @@ #define NFSD_MAY_64BIT_COOKIE 0x1000 /* 64 bit readdir cookies for >= NFSv3 */ +#define NFSD_MAY_LOCALIO 0x2000 /* for tracing, reflects when localio used */ + #define NFSD_MAY_CREATE (NFSD_MAY_EXEC|NFSD_MAY_WRITE) #define NFSD_MAY_REMOVE (NFSD_MAY_EXEC|NFSD_MAY_WRITE|NFSD_MAY_TRUNC) +struct nfsd_file; + /* * Callback function for readdir */ typedef int (*nfsd_filldir_t)(void *, const char *, int, loff_t, u64, unsigned); /* nfsd/vfs.c */ -int nfsd_racache_init(int); -void nfsd_racache_shutdown(void); +struct nfsd_attrs { + struct iattr *na_iattr; /* input */ + struct xdr_netobj *na_seclabel; /* input */ + struct posix_acl *na_pacl; /* input */ + struct posix_acl *na_dpacl; /* input */ + + int na_labelerr; /* output */ + int na_aclerr; /* output */ +}; + +static inline void nfsd_attrs_free(struct nfsd_attrs *attrs) +{ + posix_acl_release(attrs->na_pacl); + posix_acl_release(attrs->na_dpacl); +} + +static inline bool nfsd_attrs_valid(struct nfsd_attrs *attrs) +{ + struct iattr *iap = attrs->na_iattr; + + return (iap->ia_valid || (attrs->na_seclabel && + attrs->na_seclabel->len)); +} + +__be32 nfserrno (int errno); int nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp, struct svc_export **expp); __be32 nfsd_lookup(struct svc_rqst *, struct svc_fh *, @@ -50,54 +79,68 @@ __be32 nfsd_lookup_dentry(struct svc_rqst *, struct svc_fh *, const char *, unsigned int, struct svc_export **, struct dentry **); __be32 nfsd_setattr(struct svc_rqst *, struct svc_fh *, - struct iattr *, int, time_t); + struct nfsd_attrs *, const struct timespec64 *); int nfsd_mountpoint(struct dentry *, struct svc_export *); #ifdef CONFIG_NFSD_V4 -__be32 
nfsd4_set_nfs4_label(struct svc_rqst *, struct svc_fh *, - struct xdr_netobj *); __be32 nfsd4_vfs_fallocate(struct svc_rqst *, struct svc_fh *, struct file *, loff_t, loff_t, int); -__be32 nfsd4_clone_file_range(struct file *, u64, struct file *, - u64, u64); +__be32 nfsd4_clone_file_range(struct svc_rqst *rqstp, + struct nfsd_file *nf_src, u64 src_pos, + struct nfsd_file *nf_dst, u64 dst_pos, + u64 count, bool sync); #endif /* CONFIG_NFSD_V4 */ __be32 nfsd_create_locked(struct svc_rqst *, struct svc_fh *, - char *name, int len, struct iattr *attrs, - int type, dev_t rdev, struct svc_fh *res); + struct nfsd_attrs *attrs, int type, dev_t rdev, + struct svc_fh *res); __be32 nfsd_create(struct svc_rqst *, struct svc_fh *, - char *name, int len, struct iattr *attrs, + char *name, int len, struct nfsd_attrs *attrs, int type, dev_t rdev, struct svc_fh *res); -#ifdef CONFIG_NFSD_V3 __be32 nfsd_access(struct svc_rqst *, struct svc_fh *, u32 *, u32 *); -__be32 do_nfsd_create(struct svc_rqst *, struct svc_fh *, - char *name, int len, struct iattr *attrs, - struct svc_fh *res, int createmode, - u32 *verifier, bool *truncp, bool *created); -__be32 nfsd_commit(struct svc_rqst *, struct svc_fh *, - loff_t, unsigned long); -#endif /* CONFIG_NFSD_V3 */ +__be32 nfsd_create_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, + struct svc_fh *resfhp, struct nfsd_attrs *iap); +__be32 nfsd_commit(struct svc_rqst *rqst, struct svc_fh *fhp, + struct nfsd_file *nf, u64 offset, u32 count, + __be32 *verf); +#ifdef CONFIG_NFSD_V4 +__be32 nfsd_getxattr(struct svc_rqst *rqstp, struct svc_fh *fhp, + char *name, void **bufp, int *lenp); +__be32 nfsd_listxattr(struct svc_rqst *rqstp, struct svc_fh *fhp, + char **bufp, int *lenp); +__be32 nfsd_removexattr(struct svc_rqst *rqstp, struct svc_fh *fhp, + char *name); +__be32 nfsd_setxattr(struct svc_rqst *rqstp, struct svc_fh *fhp, + char *name, void *buf, u32 len, u32 flags); +#endif +int nfsd_open_break_lease(struct inode *, int); __be32 nfsd_open(struct svc_rqst *, struct svc_fh *, umode_t, int, struct file **); -struct raparms; +int nfsd_open_verified(struct svc_fh *fhp, umode_t type, int may_flags, + struct file **filp); __be32 nfsd_splice_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, loff_t offset, - unsigned long *count); -__be32 nfsd_readv(struct svc_rqst *rqstp, struct svc_fh *fhp, - struct file *file, loff_t offset, - struct kvec *vec, int vlen, - unsigned long *count); -__be32 nfsd_read(struct svc_rqst *, struct svc_fh *, - loff_t, struct kvec *, int, unsigned long *); -__be32 nfsd_write(struct svc_rqst *, struct svc_fh *, loff_t, - struct kvec *, int, unsigned long *, int); + unsigned long *count, + u32 *eof); +__be32 nfsd_iter_read(struct svc_rqst *rqstp, struct svc_fh *fhp, + struct nfsd_file *nf, loff_t offset, + unsigned long *count, unsigned int base, + u32 *eof); +bool nfsd_read_splice_ok(struct svc_rqst *rqstp); +__be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp, + loff_t offset, unsigned long *count, + u32 *eof); +__be32 nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, + loff_t offset, const struct xdr_buf *payload, + unsigned long *cnt, int stable, __be32 *verf); __be32 nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, - struct file *file, loff_t offset, - struct kvec *vec, int vlen, unsigned long *cnt, - int stable); + struct nfsd_file *nf, loff_t offset, + const struct xdr_buf *payload, + unsigned long *cnt, int stable, __be32 *verf); __be32 nfsd_readlink(struct svc_rqst *, struct svc_fh *, char *, int *); 
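[Editorial note] The new struct nfsd_attrs bundles the iattr, security label and POSIX ACLs that the create-family helpers above now take in place of a bare struct iattr. A minimal sketch of how a caller might use it, assuming the usual request context (rqstp, fhp, resfhp) is already set up; the wrapper name example_create and the literal filename are illustrative only and are not part of this patch:

static __be32 example_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
			     struct svc_fh *resfhp)
{
	struct iattr iattr = {
		.ia_valid = ATTR_MODE,
		.ia_mode  = S_IFREG | 0644,
	};
	struct nfsd_attrs attrs = {
		.na_iattr = &iattr,	/* plain attributes; no label or ACLs */
	};
	char name[] = "file";
	__be32 status;

	status = nfsd_create(rqstp, fhp, name, 4, &attrs,
			     S_IFREG, 0, resfhp);
	nfsd_attrs_free(&attrs);	/* releases na_pacl/na_dpacl if they were set */
	return status;
}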
__be32 nfsd_symlink(struct svc_rqst *, struct svc_fh *, - char *name, int len, char *path, - struct svc_fh *res); + char *name, int len, char *path, + struct nfsd_attrs *attrs, + struct svc_fh *res); __be32 nfsd_link(struct svc_rqst *, struct svc_fh *, char *, int, struct svc_fh *); ssize_t nfsd_copy_file_range(struct file *, u64, @@ -112,60 +155,9 @@ __be32 nfsd_readdir(struct svc_rqst *, struct svc_fh *, __be32 nfsd_statfs(struct svc_rqst *, struct svc_fh *, struct kstatfs *, int access); -__be32 nfsd_permission(struct svc_rqst *, struct svc_export *, - struct dentry *, int); +__be32 nfsd_permission(struct svc_cred *cred, struct svc_export *exp, + struct dentry *dentry, int acc); -struct raparms *nfsd_init_raparms(struct file *file); -void nfsd_put_raparams(struct file *file, struct raparms *ra); - -static inline int fh_want_write(struct svc_fh *fh) -{ - int ret = mnt_want_write(fh->fh_export->ex_path.mnt); - - if (!ret) - fh->fh_want_write = true; - return ret; -} - -static inline void fh_drop_write(struct svc_fh *fh) -{ - if (fh->fh_want_write) { - fh->fh_want_write = false; - mnt_drop_write(fh->fh_export->ex_path.mnt); - } -} - -static inline __be32 fh_getattr(struct svc_fh *fh, struct kstat *stat) -{ - struct path p = {.mnt = fh->fh_export->ex_path.mnt, - .dentry = fh->fh_dentry}; - return nfserrno(vfs_getattr(&p, stat, STATX_BASIC_STATS, - AT_STATX_SYNC_AS_STAT)); -} - -static inline int nfsd_create_is_exclusive(int createmode) -{ - return createmode == NFS3_CREATE_EXCLUSIVE - || createmode == NFS4_CREATE_EXCLUSIVE4_1; -} - -static inline bool nfsd_eof_on_read(long requested, long read, - loff_t offset, loff_t size) -{ - /* We assume a short read means eof: */ - if (requested > read) - return true; - /* - * A non-short read might also reach end of file. The spec - * still requires us to set eof in that case. - * - * Further operations may have modified the file size since - * the read, so the following check is not atomic with the read. - * We've only seen that cause a problem for a client in the case - * where the read returned a count of 0 without setting eof. - * That case was fixed by the addition of the above check. 
- */ - return (offset + read >= size); -} +void nfsd_filp_close(struct file *fp); #endif /* LINUX_NFSD_VFS_H */ diff --git a/fs/nfsd/xdr.h b/fs/nfsd/xdr.h index ea7cca3a64b7..852f71580bd0 100644 --- a/fs/nfsd/xdr.h +++ b/fs/nfsd/xdr.h @@ -27,14 +27,13 @@ struct nfsd_readargs { struct svc_fh fh; __u32 offset; __u32 count; - int vlen; }; struct nfsd_writeargs { svc_fh fh; __u32 offset; - int len; - struct kvec first; + __u32 len; + struct xdr_buf payload; }; struct nfsd_createargs { @@ -53,11 +52,6 @@ struct nfsd_renameargs { unsigned int tlen; }; -struct nfsd_readlinkargs { - struct svc_fh fh; - char * buffer; -}; - struct nfsd_linkargs { struct svc_fh ffh; struct svc_fh tfh; @@ -79,39 +73,53 @@ struct nfsd_readdirargs { struct svc_fh fh; __u32 cookie; __u32 count; - __be32 * buffer; +}; + +struct nfsd_stat { + __be32 status; }; struct nfsd_attrstat { + __be32 status; struct svc_fh fh; struct kstat stat; }; struct nfsd_diropres { + __be32 status; struct svc_fh fh; struct kstat stat; }; struct nfsd_readlinkres { + __be32 status; int len; + struct page *page; }; struct nfsd_readres { + __be32 status; struct svc_fh fh; unsigned long count; struct kstat stat; + struct page **pages; }; struct nfsd_readdirres { + /* Components of the reply */ + __be32 status; + int count; + /* Used to encode the reply's entry list */ + struct xdr_stream xdr; + struct xdr_buf dirlist; struct readdir_cd common; - __be32 * buffer; - int buflen; - __be32 * offset; + unsigned int cookie_offset; }; struct nfsd_statfsres { + __be32 status; struct kstatfs stats; }; @@ -133,33 +141,37 @@ union nfsd_xdrstore { #define NFS2_SVC_XDRSIZE sizeof(union nfsd_xdrstore) -int nfssvc_decode_void(struct svc_rqst *, __be32 *); -int nfssvc_decode_fhandle(struct svc_rqst *, __be32 *); -int nfssvc_decode_sattrargs(struct svc_rqst *, __be32 *); -int nfssvc_decode_diropargs(struct svc_rqst *, __be32 *); -int nfssvc_decode_readargs(struct svc_rqst *, __be32 *); -int nfssvc_decode_writeargs(struct svc_rqst *, __be32 *); -int nfssvc_decode_createargs(struct svc_rqst *, __be32 *); -int nfssvc_decode_renameargs(struct svc_rqst *, __be32 *); -int nfssvc_decode_readlinkargs(struct svc_rqst *, __be32 *); -int nfssvc_decode_linkargs(struct svc_rqst *, __be32 *); -int nfssvc_decode_symlinkargs(struct svc_rqst *, __be32 *); -int nfssvc_decode_readdirargs(struct svc_rqst *, __be32 *); -int nfssvc_encode_void(struct svc_rqst *, __be32 *); -int nfssvc_encode_attrstat(struct svc_rqst *, __be32 *); -int nfssvc_encode_diropres(struct svc_rqst *, __be32 *); -int nfssvc_encode_readlinkres(struct svc_rqst *, __be32 *); -int nfssvc_encode_readres(struct svc_rqst *, __be32 *); -int nfssvc_encode_statfsres(struct svc_rqst *, __be32 *); -int nfssvc_encode_readdirres(struct svc_rqst *, __be32 *); - -int nfssvc_encode_entry(void *, const char *name, - int namlen, loff_t offset, u64 ino, unsigned int); - -void nfssvc_release_fhandle(struct svc_rqst *); +bool nfssvc_decode_fhandleargs(struct svc_rqst *rqstp, struct xdr_stream *xdr); +bool nfssvc_decode_sattrargs(struct svc_rqst *rqstp, struct xdr_stream *xdr); +bool nfssvc_decode_diropargs(struct svc_rqst *rqstp, struct xdr_stream *xdr); +bool nfssvc_decode_readargs(struct svc_rqst *rqstp, struct xdr_stream *xdr); +bool nfssvc_decode_writeargs(struct svc_rqst *rqstp, struct xdr_stream *xdr); +bool nfssvc_decode_createargs(struct svc_rqst *rqstp, struct xdr_stream *xdr); +bool nfssvc_decode_renameargs(struct svc_rqst *rqstp, struct xdr_stream *xdr); +bool nfssvc_decode_linkargs(struct svc_rqst *rqstp, struct 
xdr_stream *xdr); +bool nfssvc_decode_symlinkargs(struct svc_rqst *rqstp, struct xdr_stream *xdr); +bool nfssvc_decode_readdirargs(struct svc_rqst *rqstp, struct xdr_stream *xdr); + +bool nfssvc_encode_statres(struct svc_rqst *rqstp, struct xdr_stream *xdr); +bool nfssvc_encode_attrstatres(struct svc_rqst *rqstp, struct xdr_stream *xdr); +bool nfssvc_encode_diropres(struct svc_rqst *rqstp, struct xdr_stream *xdr); +bool nfssvc_encode_readlinkres(struct svc_rqst *rqstp, struct xdr_stream *xdr); +bool nfssvc_encode_readres(struct svc_rqst *rqstp, struct xdr_stream *xdr); +bool nfssvc_encode_statfsres(struct svc_rqst *rqstp, struct xdr_stream *xdr); +bool nfssvc_encode_readdirres(struct svc_rqst *rqstp, struct xdr_stream *xdr); + +void nfssvc_encode_nfscookie(struct nfsd_readdirres *resp, u32 offset); +int nfssvc_encode_entry(void *data, const char *name, int namlen, + loff_t offset, u64 ino, unsigned int d_type); + +void nfssvc_release_attrstat(struct svc_rqst *rqstp); +void nfssvc_release_diropres(struct svc_rqst *rqstp); +void nfssvc_release_readres(struct svc_rqst *rqstp); /* Helper functions for NFSv2 ACL code */ -__be32 *nfs2svc_encode_fattr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp, struct kstat *stat); -__be32 *nfs2svc_decode_fh(__be32 *p, struct svc_fh *fhp); +bool svcxdr_decode_fhandle(struct xdr_stream *xdr, struct svc_fh *fhp); +bool svcxdr_encode_stat(struct xdr_stream *xdr, __be32 status); +bool svcxdr_encode_fattr(struct svc_rqst *rqstp, struct xdr_stream *xdr, + const struct svc_fh *fhp, const struct kstat *stat); #endif /* LINUX_NFSD_H */ diff --git a/fs/nfsd/xdr3.h b/fs/nfsd/xdr3.h index 2cb29e961a76..522067b7fd75 100644 --- a/fs/nfsd/xdr3.h +++ b/fs/nfsd/xdr3.h @@ -14,7 +14,7 @@ struct nfsd3_sattrargs { struct svc_fh fh; struct iattr attrs; int check_guard; - time_t guardtime; + struct timespec64 guardtime; }; struct nfsd3_diropargs { @@ -25,14 +25,13 @@ struct nfsd3_diropargs { struct nfsd3_accessargs { struct svc_fh fh; - unsigned int access; + __u32 access; }; struct nfsd3_readargs { struct svc_fh fh; __u64 offset; __u32 count; - int vlen; }; struct nfsd3_writeargs { @@ -41,7 +40,7 @@ struct nfsd3_writeargs { __u32 count; int stable; __u32 len; - struct kvec first; + struct xdr_buf payload; }; struct nfsd3_createargs { @@ -71,11 +70,6 @@ struct nfsd3_renameargs { unsigned int tlen; }; -struct nfsd3_readlinkargs { - struct svc_fh fh; - char * buffer; -}; - struct nfsd3_linkargs { struct svc_fh ffh; struct svc_fh tfh; @@ -96,10 +90,8 @@ struct nfsd3_symlinkargs { struct nfsd3_readdirargs { struct svc_fh fh; __u64 cookie; - __u32 dircount; __u32 count; __be32 * verf; - __be32 * buffer; }; struct nfsd3_commitargs { @@ -110,13 +102,13 @@ struct nfsd3_commitargs { struct nfsd3_getaclargs { struct svc_fh fh; - int mask; + __u32 mask; }; struct posix_acl; struct nfsd3_setaclargs { struct svc_fh fh; - int mask; + __u32 mask; struct posix_acl *acl_access; struct posix_acl *acl_default; }; @@ -145,13 +137,15 @@ struct nfsd3_readlinkres { __be32 status; struct svc_fh fh; __u32 len; + struct page **pages; }; struct nfsd3_readres { __be32 status; struct svc_fh fh; unsigned long count; - int eof; + __u32 eof; + struct page **pages; }; struct nfsd3_writeres { @@ -159,6 +153,7 @@ struct nfsd3_writeres { struct svc_fh fh; unsigned long count; int committed; + __be32 verf[2]; }; struct nfsd3_renameres { @@ -174,19 +169,17 @@ struct nfsd3_linkres { }; struct nfsd3_readdirres { + /* Components of the reply */ __be32 status; struct svc_fh fh; - /* Just to save kmalloc on every 
readdirplus entry (svc_fh is a - * little large for the stack): */ - struct svc_fh scratch; - int count; __be32 verf[2]; + /* Used to encode the reply's entry list */ + struct xdr_stream xdr; + struct xdr_buf dirlist; + struct svc_fh scratch; struct readdir_cd common; - __be32 * buffer; - int buflen; - __be32 * offset; - __be32 * offset1; + unsigned int cookie_offset; struct svc_rqst * rqstp; }; @@ -223,6 +216,7 @@ struct nfsd3_pathconfres { struct nfsd3_commitres { __be32 status; struct svc_fh fh; + __be32 verf[2]; }; struct nfsd3_getaclres { @@ -271,51 +265,50 @@ union nfsd3_xdrstore { #define NFS3_SVC_XDRSIZE sizeof(union nfsd3_xdrstore) -int nfs3svc_decode_fhandle(struct svc_rqst *, __be32 *); -int nfs3svc_decode_sattrargs(struct svc_rqst *, __be32 *); -int nfs3svc_decode_diropargs(struct svc_rqst *, __be32 *); -int nfs3svc_decode_accessargs(struct svc_rqst *, __be32 *); -int nfs3svc_decode_readargs(struct svc_rqst *, __be32 *); -int nfs3svc_decode_writeargs(struct svc_rqst *, __be32 *); -int nfs3svc_decode_createargs(struct svc_rqst *, __be32 *); -int nfs3svc_decode_mkdirargs(struct svc_rqst *, __be32 *); -int nfs3svc_decode_mknodargs(struct svc_rqst *, __be32 *); -int nfs3svc_decode_renameargs(struct svc_rqst *, __be32 *); -int nfs3svc_decode_readlinkargs(struct svc_rqst *, __be32 *); -int nfs3svc_decode_linkargs(struct svc_rqst *, __be32 *); -int nfs3svc_decode_symlinkargs(struct svc_rqst *, __be32 *); -int nfs3svc_decode_readdirargs(struct svc_rqst *, __be32 *); -int nfs3svc_decode_readdirplusargs(struct svc_rqst *, __be32 *); -int nfs3svc_decode_commitargs(struct svc_rqst *, __be32 *); -int nfs3svc_encode_voidres(struct svc_rqst *, __be32 *); -int nfs3svc_encode_attrstat(struct svc_rqst *, __be32 *); -int nfs3svc_encode_wccstat(struct svc_rqst *, __be32 *); -int nfs3svc_encode_diropres(struct svc_rqst *, __be32 *); -int nfs3svc_encode_accessres(struct svc_rqst *, __be32 *); -int nfs3svc_encode_readlinkres(struct svc_rqst *, __be32 *); -int nfs3svc_encode_readres(struct svc_rqst *, __be32 *); -int nfs3svc_encode_writeres(struct svc_rqst *, __be32 *); -int nfs3svc_encode_createres(struct svc_rqst *, __be32 *); -int nfs3svc_encode_renameres(struct svc_rqst *, __be32 *); -int nfs3svc_encode_linkres(struct svc_rqst *, __be32 *); -int nfs3svc_encode_readdirres(struct svc_rqst *, __be32 *); -int nfs3svc_encode_fsstatres(struct svc_rqst *, __be32 *); -int nfs3svc_encode_fsinfores(struct svc_rqst *, __be32 *); -int nfs3svc_encode_pathconfres(struct svc_rqst *, __be32 *); -int nfs3svc_encode_commitres(struct svc_rqst *, __be32 *); +bool nfs3svc_decode_fhandleargs(struct svc_rqst *rqstp, struct xdr_stream *xdr); +bool nfs3svc_decode_sattrargs(struct svc_rqst *rqstp, struct xdr_stream *xdr); +bool nfs3svc_decode_diropargs(struct svc_rqst *rqstp, struct xdr_stream *xdr); +bool nfs3svc_decode_accessargs(struct svc_rqst *rqstp, struct xdr_stream *xdr); +bool nfs3svc_decode_readargs(struct svc_rqst *rqstp, struct xdr_stream *xdr); +bool nfs3svc_decode_writeargs(struct svc_rqst *rqstp, struct xdr_stream *xdr); +bool nfs3svc_decode_createargs(struct svc_rqst *rqstp, struct xdr_stream *xdr); +bool nfs3svc_decode_mkdirargs(struct svc_rqst *rqstp, struct xdr_stream *xdr); +bool nfs3svc_decode_mknodargs(struct svc_rqst *rqstp, struct xdr_stream *xdr); +bool nfs3svc_decode_renameargs(struct svc_rqst *rqstp, struct xdr_stream *xdr); +bool nfs3svc_decode_linkargs(struct svc_rqst *rqstp, struct xdr_stream *xdr); +bool nfs3svc_decode_symlinkargs(struct svc_rqst *rqstp, struct xdr_stream *xdr); +bool 
nfs3svc_decode_readdirargs(struct svc_rqst *rqstp, struct xdr_stream *xdr); +bool nfs3svc_decode_readdirplusargs(struct svc_rqst *rqstp, struct xdr_stream *xdr); +bool nfs3svc_decode_commitargs(struct svc_rqst *rqstp, struct xdr_stream *xdr); + +bool nfs3svc_encode_getattrres(struct svc_rqst *rqstp, struct xdr_stream *xdr); +bool nfs3svc_encode_wccstat(struct svc_rqst *rqstp, struct xdr_stream *xdr); +bool nfs3svc_encode_lookupres(struct svc_rqst *rqstp, struct xdr_stream *xdr); +bool nfs3svc_encode_accessres(struct svc_rqst *rqstp, struct xdr_stream *xdr); +bool nfs3svc_encode_readlinkres(struct svc_rqst *rqstp, struct xdr_stream *xdr); +bool nfs3svc_encode_readres(struct svc_rqst *rqstp, struct xdr_stream *xdr); +bool nfs3svc_encode_writeres(struct svc_rqst *rqstp, struct xdr_stream *xdr); +bool nfs3svc_encode_createres(struct svc_rqst *rqstp, struct xdr_stream *xdr); +bool nfs3svc_encode_renameres(struct svc_rqst *rqstp, struct xdr_stream *xdr); +bool nfs3svc_encode_linkres(struct svc_rqst *rqstp, struct xdr_stream *xdr); +bool nfs3svc_encode_readdirres(struct svc_rqst *rqstp, struct xdr_stream *xdr); +bool nfs3svc_encode_fsstatres(struct svc_rqst *rqstp, struct xdr_stream *xdr); +bool nfs3svc_encode_fsinfores(struct svc_rqst *rqstp, struct xdr_stream *xdr); +bool nfs3svc_encode_pathconfres(struct svc_rqst *rqstp, struct xdr_stream *xdr); +bool nfs3svc_encode_commitres(struct svc_rqst *rqstp, struct xdr_stream *xdr); void nfs3svc_release_fhandle(struct svc_rqst *); void nfs3svc_release_fhandle2(struct svc_rqst *); -int nfs3svc_encode_entry(void *, const char *name, - int namlen, loff_t offset, u64 ino, - unsigned int); -int nfs3svc_encode_entry_plus(void *, const char *name, - int namlen, loff_t offset, u64 ino, - unsigned int); -/* Helper functions for NFSv3 ACL code */ -__be32 *nfs3svc_encode_post_op_attr(struct svc_rqst *rqstp, __be32 *p, - struct svc_fh *fhp); -__be32 *nfs3svc_decode_fh(__be32 *p, struct svc_fh *fhp); +void nfs3svc_encode_cookie3(struct nfsd3_readdirres *resp, u64 offset); +int nfs3svc_encode_entry3(void *data, const char *name, int namlen, + loff_t offset, u64 ino, unsigned int d_type); +int nfs3svc_encode_entryplus3(void *data, const char *name, int namlen, + loff_t offset, u64 ino, unsigned int d_type); +/* Helper functions for NFSv3 ACL code */ +bool svcxdr_decode_nfs_fh3(struct xdr_stream *xdr, struct svc_fh *fhp); +bool svcxdr_encode_nfsstat3(struct xdr_stream *xdr, __be32 status); +bool svcxdr_encode_post_op_attr(struct svc_rqst *rqstp, struct xdr_stream *xdr, + const struct svc_fh *fhp); #endif /* _LINUX_NFSD_XDR3_H */ diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index feeb6d4bdffd..ae75846b3cd7 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -46,9 +46,137 @@ #define CURRENT_STATE_ID_FLAG (1<<0) #define SAVED_STATE_ID_FLAG (1<<1) -#define SET_STATE_ID(c, f) ((c)->sid_flags |= (f)) -#define HAS_STATE_ID(c, f) ((c)->sid_flags & (f)) -#define CLEAR_STATE_ID(c, f) ((c)->sid_flags &= ~(f)) +#define SET_CSTATE_FLAG(c, f) ((c)->sid_flags |= (f)) +#define HAS_CSTATE_FLAG(c, f) ((c)->sid_flags & (f)) +#define CLEAR_CSTATE_FLAG(c, f) ((c)->sid_flags &= ~(f)) + +/** + * nfsd4_encode_bool - Encode an XDR bool type result + * @xdr: target XDR stream + * @val: boolean value to encode + * + * Return values: + * %nfs_ok: @val encoded; @xdr advanced to next position + * %nfserr_resource: stream buffer space exhausted + */ +static __always_inline __be32 +nfsd4_encode_bool(struct xdr_stream *xdr, bool val) +{ + __be32 *p = xdr_reserve_space(xdr, XDR_UNIT); + + if 
(unlikely(p == NULL)) + return nfserr_resource; + *p = val ? xdr_one : xdr_zero; + return nfs_ok; +} + +/** + * nfsd4_encode_uint32_t - Encode an XDR uint32_t type result + * @xdr: target XDR stream + * @val: integer value to encode + * + * Return values: + * %nfs_ok: @val encoded; @xdr advanced to next position + * %nfserr_resource: stream buffer space exhausted + */ +static __always_inline __be32 +nfsd4_encode_uint32_t(struct xdr_stream *xdr, u32 val) +{ + __be32 *p = xdr_reserve_space(xdr, XDR_UNIT); + + if (unlikely(p == NULL)) + return nfserr_resource; + *p = cpu_to_be32(val); + return nfs_ok; +} + +#define nfsd4_encode_aceflag4(x, v) nfsd4_encode_uint32_t(x, v) +#define nfsd4_encode_acemask4(x, v) nfsd4_encode_uint32_t(x, v) +#define nfsd4_encode_acetype4(x, v) nfsd4_encode_uint32_t(x, v) +#define nfsd4_encode_count4(x, v) nfsd4_encode_uint32_t(x, v) +#define nfsd4_encode_mode4(x, v) nfsd4_encode_uint32_t(x, v) +#define nfsd4_encode_nfs_lease4(x, v) nfsd4_encode_uint32_t(x, v) +#define nfsd4_encode_qop4(x, v) nfsd4_encode_uint32_t(x, v) +#define nfsd4_encode_sequenceid4(x, v) nfsd4_encode_uint32_t(x, v) +#define nfsd4_encode_slotid4(x, v) nfsd4_encode_uint32_t(x, v) + +/** + * nfsd4_encode_uint64_t - Encode an XDR uint64_t type result + * @xdr: target XDR stream + * @val: integer value to encode + * + * Return values: + * %nfs_ok: @val encoded; @xdr advanced to next position + * %nfserr_resource: stream buffer space exhausted + */ +static __always_inline __be32 +nfsd4_encode_uint64_t(struct xdr_stream *xdr, u64 val) +{ + __be32 *p = xdr_reserve_space(xdr, XDR_UNIT * 2); + + if (unlikely(p == NULL)) + return nfserr_resource; + put_unaligned_be64(val, p); + return nfs_ok; +} + +#define nfsd4_encode_changeid4(x, v) nfsd4_encode_uint64_t(x, v) +#define nfsd4_encode_nfs_cookie4(x, v) nfsd4_encode_uint64_t(x, v) +#define nfsd4_encode_length4(x, v) nfsd4_encode_uint64_t(x, v) +#define nfsd4_encode_offset4(x, v) nfsd4_encode_uint64_t(x, v) + +/** + * nfsd4_encode_opaque_fixed - Encode a fixed-length XDR opaque type result + * @xdr: target XDR stream + * @data: pointer to data + * @size: length of data in bytes + * + * Return values: + * %nfs_ok: @data encoded; @xdr advanced to next position + * %nfserr_resource: stream buffer space exhausted + */ +static __always_inline __be32 +nfsd4_encode_opaque_fixed(struct xdr_stream *xdr, const void *data, + size_t size) +{ + __be32 *p = xdr_reserve_space(xdr, xdr_align_size(size)); + size_t pad = xdr_pad_size(size); + + if (unlikely(p == NULL)) + return nfserr_resource; + memcpy(p, data, size); + if (pad) + memset((char *)p + size, 0, pad); + return nfs_ok; +} + +/** + * nfsd4_encode_opaque - Encode a variable-length XDR opaque type result + * @xdr: target XDR stream + * @data: pointer to data + * @size: length of data in bytes + * + * Return values: + * %nfs_ok: @data encoded; @xdr advanced to next position + * %nfserr_resource: stream buffer space exhausted + */ +static __always_inline __be32 +nfsd4_encode_opaque(struct xdr_stream *xdr, const void *data, size_t size) +{ + size_t pad = xdr_pad_size(size); + __be32 *p; + + p = xdr_reserve_space(xdr, XDR_UNIT + xdr_align_size(size)); + if (unlikely(p == NULL)) + return nfserr_resource; + *p++ = cpu_to_be32(size); + memcpy(p, data, size); + if (pad) + memset((char *)p + size, 0, pad); + return nfs_ok; +} + +#define nfsd4_encode_component4(x, d, s) nfsd4_encode_opaque(x, d, s) struct nfsd4_compound_state { struct svc_fh current_fh; @@ -76,12 +204,7 @@ static inline bool nfsd4_has_session(struct 
nfsd4_compound_state *cs) struct nfsd4_change_info { u32 atomic; - bool change_supported; - u32 before_ctime_sec; - u32 before_ctime_nsec; u64 before_change; - u32 after_ctime_sec; - u32 after_ctime_nsec; u64 after_change; }; @@ -175,12 +298,8 @@ struct nfsd4_lock { } v; /* response */ - union { - struct { - stateid_t stateid; - } ok; - struct nfsd4_lock_denied denied; - } u; + stateid_t lk_resp_stateid; + struct nfsd4_lock_denied lk_denied; }; #define lk_new_open_seqid v.new.open_seqid #define lk_new_open_stateid v.new.open_stateid @@ -190,20 +309,15 @@ struct nfsd4_lock { #define lk_old_lock_stateid v.old.lock_stateid #define lk_old_lock_seqid v.old.lock_seqid -#define lk_resp_stateid u.ok.stateid -#define lk_denied u.denied - - struct nfsd4_lockt { u32 lt_type; clientid_t lt_clientid; struct xdr_netobj lt_owner; u64 lt_offset; u64 lt_length; - struct nfsd4_lock_denied lt_denied; + struct nfsd4_lock_denied lt_denied; }; - struct nfsd4_locku { u32 lu_type; u32 lu_seqid; @@ -221,11 +335,39 @@ struct nfsd4_lookup { struct nfsd4_putfh { u32 pf_fhlen; /* request */ char *pf_fhval; /* request */ + bool no_verify; /* represents foreigh fh */ +}; + +struct nfsd4_getxattr { + char *getxa_name; /* request */ + u32 getxa_len; /* request */ + void *getxa_buf; +}; + +struct nfsd4_setxattr { + u32 setxa_flags; /* request */ + char *setxa_name; /* request */ + char *setxa_buf; /* request */ + u32 setxa_len; /* request */ + struct nfsd4_change_info setxa_cinfo; /* response */ +}; + +struct nfsd4_removexattr { + char *rmxa_name; /* request */ + struct nfsd4_change_info rmxa_cinfo; /* response */ +}; + +struct nfsd4_listxattrs { + u64 lsxa_cookie; /* request */ + u32 lsxa_maxcount; /* request */ + char *lsxa_buf; /* unfiltered buffer (reply) */ + u32 lsxa_len; /* unfiltered len (reply) */ }; struct nfsd4_open { u32 op_claim_type; /* request */ - struct xdr_netobj op_fname; /* request - everything but CLAIM_PREV */ + u32 op_fnamelen; + char * op_fname; /* request - everything but CLAIM_PREV */ u32 op_delegate_type; /* request - CLAIM_PREV only */ stateid_t op_delegate_stateid; /* request - response */ u32 op_why_no_deleg; /* response - DELEG_NONE_EXT only */ @@ -244,17 +386,19 @@ struct nfsd4_open { u32 op_deleg_want; /* request */ stateid_t op_stateid; /* response */ __be32 op_xdr_error; /* see nfsd4_open_omfg() */ - u32 op_recall; /* recall */ struct nfsd4_change_info op_cinfo; /* response */ u32 op_rflags; /* response */ + bool op_recall; /* response */ bool op_truncate; /* used during processing */ bool op_created; /* used during processing */ struct nfs4_openowner *op_openowner; /* used during processing */ + struct file *op_filp; /* used during processing */ struct nfs4_file *op_file; /* used during processing */ struct nfs4_ol_stateid *op_stp; /* used during processing */ struct nfs4_clnt_odstate *op_odstate; /* used during processing */ struct nfs4_acl *op_acl; struct xdr_netobj op_label; + struct svc_rqst *op_rqstp; }; struct nfsd4_open_confirm { @@ -273,15 +417,15 @@ struct nfsd4_open_downgrade { struct nfsd4_read { - stateid_t rd_stateid; /* request */ - u64 rd_offset; /* request */ - u32 rd_length; /* request */ - int rd_vlen; - struct file *rd_filp; - bool rd_tmp_file; - - struct svc_rqst *rd_rqstp; /* response */ - struct svc_fh * rd_fhp; /* response */ + stateid_t rd_stateid; /* request */ + u64 rd_offset; /* request */ + u32 rd_length; /* request */ + int rd_vlen; + struct nfsd_file *rd_nf; + + struct svc_rqst *rd_rqstp; /* response */ + struct svc_fh *rd_fhp; /* response */ + u32 rd_eof; 
/* response */ }; struct nfsd4_readdir { @@ -359,13 +503,6 @@ struct nfsd4_setclientid_confirm { nfs4_verifier sc_confirm; }; -struct nfsd4_saved_compoundargs { - __be32 *p; - __be32 *end; - int pagelen; - struct page **pagelist; -}; - struct nfsd4_test_stateid_id { __be32 ts_id_status; stateid_t ts_id_stateid; @@ -381,6 +518,24 @@ struct nfsd4_free_stateid { stateid_t fr_stateid; /* request */ }; +struct nfsd4_get_dir_delegation { + /* request */ + u32 gdda_signal_deleg_avail; + u32 gdda_notification_types[1]; + struct timespec64 gdda_child_attr_delay; + struct timespec64 gdda_dir_attr_delay; + u32 gdda_child_attributes[3]; + u32 gdda_dir_attributes[3]; + /* response */ + u32 gddrnf_status; + nfs4_verifier gddr_cookieverf; + stateid_t gddr_stateid; + u32 gddr_notification[1]; + u32 gddr_child_attributes[3]; + u32 gddr_dir_attributes[3]; + bool gddrnf_will_signal_deleg_avail; +}; + /* also used for NVERIFY */ struct nfsd4_verify { u32 ve_bmval[3]; /* request */ @@ -393,8 +548,7 @@ struct nfsd4_write { u64 wr_offset; /* request */ u32 wr_stable_how; /* request */ u32 wr_buflen; /* request */ - struct kvec wr_head; - struct page ** wr_pagelist; /* request */ + struct xdr_buf wr_payload; /* request */ u32 wr_bytes_written; /* response */ u32 wr_how_written; /* response */ @@ -407,20 +561,23 @@ struct nfsd4_exchange_id { u32 flags; clientid_t clientid; u32 seqid; - int spa_how; + u32 spa_how; u32 spo_must_enforce[3]; u32 spo_must_allow[3]; + struct xdr_netobj nii_domain; + struct xdr_netobj nii_name; + struct timespec64 nii_time; + char *server_impl_name; }; struct nfsd4_sequence { struct nfs4_sessionid sessionid; /* request/response */ u32 seqid; /* request/response */ u32 slotid; /* request/response */ - u32 maxslots; /* request/response */ + u32 maxslots; /* request */ u32 cachethis; /* request */ -#if 0 + u32 maxslots_response; /* response */ u32 target_maxslots; /* response */ -#endif /* not yet */ u32 status_flags; /* response */ }; @@ -439,9 +596,43 @@ struct nfsd4_reclaim_complete { struct nfsd4_deviceid { u64 fsid_idx; u32 generation; - u32 pad; }; +static inline __be32 * +svcxdr_encode_deviceid4(__be32 *p, const struct nfsd4_deviceid *devid) +{ + __be64 *q = (__be64 *)p; + + *q = (__force __be64)devid->fsid_idx; + p += 2; + *p++ = (__force __be32)devid->generation; + *p++ = xdr_zero; + return p; +} + +static inline __be32 * +svcxdr_decode_deviceid4(__be32 *p, struct nfsd4_deviceid *devid) +{ + __be64 *q = (__be64 *)p; + + devid->fsid_idx = (__force u64)(*q); + p += 2; + devid->generation = (__force u32)(*p++); + p++; /* NFSD does not use the remaining octets */ + return p; +} + +static inline __be32 +nfsd4_decode_deviceid4(struct xdr_stream *xdr, struct nfsd4_deviceid *devid) +{ + __be32 *p = xdr_inline_decode(xdr, NFS4_DEVICEID4_SIZE); + + if (unlikely(!p)) + return nfserr_bad_xdr; + svcxdr_decode_deviceid4(p, devid); + return nfs_ok; +} + struct nfsd4_layout_seg { u32 iomode; u64 offset; @@ -472,11 +663,10 @@ struct nfsd4_layoutcommit { u32 lc_reclaim; /* request */ u32 lc_newoffset; /* request */ u64 lc_last_wr; /* request */ - struct timespec lc_mtime; /* request */ + struct timespec64 lc_mtime; /* request */ u32 lc_layout_type; /* request */ - u32 lc_up_len; /* layout length */ - void *lc_up_layout; /* decoded by callback */ - u32 lc_size_chg; /* boolean for response */ + struct xdr_buf lc_up_layout; /* decoded by callback */ + bool lc_size_chg; /* response */ u64 lc_newsize; /* response */ }; @@ -488,7 +678,7 @@ struct nfsd4_layoutreturn { u32 lrf_body_len; /* request */ void 
*lrf_body; /* request */ stateid_t lr_sid; /* request/response */ - u32 lrs_present; /* response */ + bool lrs_present; /* response */ }; struct nfsd4_fallocate { @@ -514,38 +704,84 @@ struct nfsd42_write_res { stateid_t cb_stateid; }; +struct nfsd4_cb_offload { + struct nfsd4_callback co_cb; + struct nfsd42_write_res co_res; + __be32 co_nfserr; + unsigned int co_retries; + struct knfsd_fh co_fh; + + struct nfs4_sessionid co_referring_sessionid; + u32 co_referring_slotid; + u32 co_referring_seqno; +}; + struct nfsd4_copy { /* request */ - stateid_t cp_src_stateid; - stateid_t cp_dst_stateid; - u64 cp_src_pos; - u64 cp_dst_pos; - u64 cp_count; - - /* both */ - bool cp_synchronous; + stateid_t cp_src_stateid; + stateid_t cp_dst_stateid; + u64 cp_src_pos; + u64 cp_dst_pos; + u64 cp_count; + struct nl4_server *cp_src; + + unsigned long cp_flags; +#define NFSD4_COPY_F_STOPPED (0) +#define NFSD4_COPY_F_INTRA (1) +#define NFSD4_COPY_F_SYNCHRONOUS (2) +#define NFSD4_COPY_F_COMMITTED (3) +#define NFSD4_COPY_F_COMPLETED (4) +#define NFSD4_COPY_F_OFFLOAD_DONE (5) /* response */ - struct nfsd42_write_res cp_res; - - /* for cb_offload */ - struct nfsd4_callback cp_cb; __be32 nfserr; + struct nfsd42_write_res cp_res; struct knfsd_fh fh; + /* offload callback */ + struct nfsd4_cb_offload cp_cb_offload; + struct nfs4_client *cp_clp; - struct file *file_src; - struct file *file_dst; + struct nfsd_file *nf_src; + struct nfsd_file *nf_dst; - stateid_t cp_stateid; + copy_stateid_t cp_stateid; struct list_head copies; struct task_struct *copy_task; refcount_t refcount; - bool stopped; + unsigned int cp_ttl; + + struct nfsd4_ssc_umount_item *ss_nsui; + struct nfs_fh c_fh; + nfs4_stateid stateid; + struct nfsd_net *cp_nn; }; +static inline void nfsd4_copy_set_sync(struct nfsd4_copy *copy, bool sync) +{ + if (sync) + set_bit(NFSD4_COPY_F_SYNCHRONOUS, ©->cp_flags); + else + clear_bit(NFSD4_COPY_F_SYNCHRONOUS, ©->cp_flags); +} + +static inline bool nfsd4_copy_is_sync(const struct nfsd4_copy *copy) +{ + return test_bit(NFSD4_COPY_F_SYNCHRONOUS, ©->cp_flags); +} + +static inline bool nfsd4_copy_is_async(const struct nfsd4_copy *copy) +{ + return !test_bit(NFSD4_COPY_F_SYNCHRONOUS, ©->cp_flags); +} + +static inline bool nfsd4_ssc_is_inter(const struct nfsd4_copy *copy) +{ + return !test_bit(NFSD4_COPY_F_INTRA, ©->cp_flags); +} + struct nfsd4_seek { /* request */ stateid_t seek_stateid; @@ -563,13 +799,26 @@ struct nfsd4_offload_status { /* response */ u64 count; - u32 status; + __be32 status; + bool completed; +}; + +struct nfsd4_copy_notify { + /* request */ + stateid_t cpn_src_stateid; + struct nl4_server *cpn_dst; + + /* response */ + stateid_t cpn_cnr_stateid; + struct timespec64 cpn_lease_time; + struct nl4_server *cpn_src; }; struct nfsd4_op { - int opnum; - const struct nfsd4_operation * opdesc; + u32 opnum; __be32 status; + const struct nfsd4_operation *opdesc; + struct nfs4_replay *replay; union nfsd4_op_u { struct nfsd4_access access; struct nfsd4_close close; @@ -613,6 +862,7 @@ struct nfsd4_op { struct nfsd4_reclaim_complete reclaim_complete; struct nfsd4_test_stateid test_stateid; struct nfsd4_free_stateid free_stateid; + struct nfsd4_get_dir_delegation get_dir_delegation; struct nfsd4_getdeviceinfo getdeviceinfo; struct nfsd4_layoutget layoutget; struct nfsd4_layoutcommit layoutcommit; @@ -625,9 +875,14 @@ struct nfsd4_op { struct nfsd4_clone clone; struct nfsd4_copy copy; struct nfsd4_offload_status offload_status; + struct nfsd4_copy_notify copy_notify; struct nfsd4_seek seek; + + struct nfsd4_getxattr 
getxattr; + struct nfsd4_setxattr setxattr; + struct nfsd4_listxattrs listxattrs; + struct nfsd4_removexattr removexattr; } u; - struct nfs4_replay * replay; }; bool nfsd4_cache_this_op(struct nfsd4_op *); @@ -642,59 +897,33 @@ struct svcxdr_tmpbuf { struct nfsd4_compoundargs { /* scratch variables for XDR decode */ - __be32 * p; - __be32 * end; - struct page ** pagelist; - int pagelen; - bool tail; - __be32 tmp[8]; - __be32 * tmpp; + struct xdr_stream *xdr; struct svcxdr_tmpbuf *to_free; - struct svc_rqst *rqstp; - u32 taglen; char * tag; + u32 taglen; u32 minorversion; + u32 client_opcnt; u32 opcnt; + bool splice_ok; struct nfsd4_op *ops; struct nfsd4_op iops[8]; - int cachetype; }; struct nfsd4_compoundres { /* scratch variables for XDR encode */ - struct xdr_stream xdr; + struct xdr_stream *xdr; struct svc_rqst * rqstp; - u32 taglen; + __be32 *statusp; char * tag; + u32 taglen; u32 opcnt; - __be32 * tagp; /* tag, opcount encode location */ + struct nfsd4_compound_state cstate; }; -static inline bool nfsd4_is_solo_sequence(struct nfsd4_compoundres *resp) -{ - struct nfsd4_compoundargs *args = resp->rqstp->rq_argp; - return resp->opcnt == 1 && args->ops[0].opnum == OP_SEQUENCE; -} - -/* - * The session reply cache only needs to cache replies that the client - * actually asked us to. But it's almost free for us to cache compounds - * consisting of only a SEQUENCE op, so we may as well cache those too. - * Also, the protocol doesn't give us a convenient response in the case - * of a replay of a solo SEQUENCE op that wasn't cached - * (RETRY_UNCACHED_REP can only be returned in the second op of a - * compound). - */ -static inline bool nfsd4_cache_this(struct nfsd4_compoundres *resp) -{ - return (resp->cstate.slot->sl_flags & NFSD4_SLOT_CACHETHIS) - || nfsd4_is_solo_sequence(resp); -} - static inline bool nfsd4_last_compound_op(struct svc_rqst *rqstp) { struct nfsd4_compoundres *resp = rqstp->rq_resp; @@ -709,27 +938,9 @@ void warn_on_nonidempotent_op(struct nfsd4_op *op); #define NFS4_SVC_XDRSIZE sizeof(struct nfsd4_compoundargs) -static inline void -set_change_info(struct nfsd4_change_info *cinfo, struct svc_fh *fhp) -{ - BUG_ON(!fhp->fh_pre_saved); - cinfo->atomic = (u32)fhp->fh_post_saved; - cinfo->change_supported = IS_I_VERSION(d_inode(fhp->fh_dentry)); - - cinfo->before_change = fhp->fh_pre_change; - cinfo->after_change = fhp->fh_post_change; - cinfo->before_ctime_sec = fhp->fh_pre_ctime.tv_sec; - cinfo->before_ctime_nsec = fhp->fh_pre_ctime.tv_nsec; - cinfo->after_ctime_sec = fhp->fh_post_attr.ctime.tv_sec; - cinfo->after_ctime_nsec = fhp->fh_post_attr.ctime.tv_nsec; - -} - - bool nfsd4_mach_creds_match(struct nfs4_client *cl, struct svc_rqst *rqstp); -int nfs4svc_encode_voidres(struct svc_rqst *, __be32 *); -int nfs4svc_decode_compoundargs(struct svc_rqst *, __be32 *); -int nfs4svc_encode_compoundres(struct svc_rqst *, __be32 *); +bool nfs4svc_decode_compoundargs(struct svc_rqst *rqstp, struct xdr_stream *xdr); +bool nfs4svc_encode_compoundres(struct svc_rqst *rqstp, struct xdr_stream *xdr); __be32 nfsd4_check_resp_size(struct nfsd4_compoundres *, u32); void nfsd4_encode_operation(struct nfsd4_compoundres *, struct nfsd4_op *); void nfsd4_encode_replay(struct xdr_stream *xdr, struct nfsd4_op *op); @@ -741,6 +952,7 @@ extern __be32 nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *, union nfsd4_op_u *u); extern __be32 nfsd4_setclientid_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *, union nfsd4_op_u *u); +void nfsd4_exchange_id_release(union 
nfsd4_op_u *u); extern __be32 nfsd4_exchange_id(struct svc_rqst *rqstp, struct nfsd4_compound_state *, union nfsd4_op_u *u); extern __be32 nfsd4_backchannel_ctl(struct svc_rqst *, @@ -773,8 +985,10 @@ extern __be32 nfsd4_open_downgrade(struct svc_rqst *rqstp, struct nfsd4_compound_state *, union nfsd4_op_u *u); extern __be32 nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *, union nfsd4_op_u *u); +extern void nfsd4_lock_release(union nfsd4_op_u *u); extern __be32 nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *, union nfsd4_op_u *u); +extern void nfsd4_lockt_release(union nfsd4_op_u *u); extern __be32 nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *, union nfsd4_op_u *u); extern __be32 @@ -830,18 +1044,18 @@ struct nfsd4_operation { u32 op_flags; char *op_name; /* Try to get response size before operation */ - u32 (*op_rsize_bop)(struct svc_rqst *, struct nfsd4_op *); + u32 (*op_rsize_bop)(const struct svc_rqst *rqstp, + const struct nfsd4_op *op); void (*op_get_currentstateid)(struct nfsd4_compound_state *, union nfsd4_op_u *); void (*op_set_currentstateid)(struct nfsd4_compound_state *, union nfsd4_op_u *); }; +struct nfsd4_cb_recall_any { + struct nfsd4_callback ra_cb; + u32 ra_keep; + u32 ra_bmval[1]; +}; #endif - -/* - * Local variables: - * c-basic-offset: 8 - * End: - */ diff --git a/fs/nfsd/xdr4cb.h b/fs/nfsd/xdr4cb.h index 547cf07cf4e0..f4e29c0c701c 100644 --- a/fs/nfsd/xdr4cb.h +++ b/fs/nfsd/xdr4cb.h @@ -6,8 +6,11 @@ #define cb_compound_enc_hdr_sz 4 #define cb_compound_dec_hdr_sz (3 + (NFS4_MAXTAGLEN >> 2)) #define sessionid_sz (NFS4_MAX_SESSIONID_LEN >> 2) +#define enc_referring_call4_sz (1 + 1) +#define enc_referring_call_list4_sz (sessionid_sz + 1 + \ + enc_referring_call4_sz) #define cb_sequence_enc_sz (sessionid_sz + 4 + \ - 1 /* no referring calls list yet */) + enc_referring_call_list4_sz) #define cb_sequence_dec_sz (op_dec_sz + sessionid_sz + 4) #define op_enc_sz 1 @@ -48,3 +51,31 @@ #define NFS4_dec_cb_offload_sz (cb_compound_dec_hdr_sz + \ cb_sequence_dec_sz + \ op_dec_sz) +#define NFS4_enc_cb_recall_any_sz (cb_compound_enc_hdr_sz + \ + cb_sequence_enc_sz + \ + 1 + 1 + 1) +#define NFS4_dec_cb_recall_any_sz (cb_compound_dec_hdr_sz + \ + cb_sequence_dec_sz + \ + op_dec_sz) + +/* + * 1: CB_GETATTR opcode (32-bit) + * N: file_handle + * 1: number of entry in attribute array (32-bit) + * 3: entry 0-2 in attribute array (32-bit * 3) + */ +#define NFS4_enc_cb_getattr_sz (cb_compound_enc_hdr_sz + \ + cb_sequence_enc_sz + \ + 1 + enc_nfs4_fh_sz + 1 + 3) +/* + * 4: fattr_bitmap_maxsz + * 1: attribute array len + * 2: change attr (64-bit) + * 2: size (64-bit) + * 2: atime.seconds (64-bit) + * 1: atime.nanoseconds (32-bit) + * 2: mtime.seconds (64-bit) + * 1: mtime.nanoseconds (32-bit) + */ +#define NFS4_dec_cb_getattr_sz (cb_compound_dec_hdr_sz + \ + cb_sequence_dec_sz + 4 + 1 + 2 + 2 + 2 + 1 + 2 + 1 + op_dec_sz)
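
Note on the size macros in the xdr4cb.h hunk above: they count 32-bit XDR words, not bytes. The standalone C sketch below (not kernel code) redoes the NFS4_enc_cb_getattr_sz arithmetic; NFS4_MAX_SESSIONID_LEN (16 bytes) and the 128-byte maximum file handle behind enc_nfs4_fh_sz are assumed values for illustration only, not taken from this diff.

/*
 * Standalone sketch, not kernel code: the *_sz macros count 32-bit
 * XDR words.  NFS4_MAX_SESSIONID_LEN and the file-handle size used
 * for enc_nfs4_fh_sz are assumptions made for this illustration.
 */
#include <stdio.h>

#define NFS4_MAX_SESSIONID_LEN      16                  /* assumed, bytes */
#define enc_nfs4_fh_sz              (1 + (128 >> 2))    /* assumed: length word + fh body */

#define cb_compound_enc_hdr_sz      4
#define sessionid_sz                (NFS4_MAX_SESSIONID_LEN >> 2)
#define enc_referring_call4_sz      (1 + 1)
#define enc_referring_call_list4_sz (sessionid_sz + 1 + enc_referring_call4_sz)
#define cb_sequence_enc_sz          (sessionid_sz + 4 + enc_referring_call_list4_sz)

#define NFS4_enc_cb_getattr_sz      (cb_compound_enc_hdr_sz + \
                                     cb_sequence_enc_sz + \
                                     1 + enc_nfs4_fh_sz + 1 + 3)

int main(void)
{
        /* With the assumed constants this prints 57 words (228 bytes). */
        printf("CB_GETATTR encode size: %d XDR words (%d bytes)\n",
               NFS4_enc_cb_getattr_sz, NFS4_enc_cb_getattr_sz * 4);
        return 0;
}

With those assumptions the total is 57 words, i.e. a 228-byte reservation for the CB_GETATTR encode buffer; the actual value depends on the real enc_nfs4_fh_sz definition elsewhere in the tree.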

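Note on the nfsd4_copy changes in the xdr4.h hunk earlier in this diff: the old cp_synchronous and stopped booleans are replaced by NFSD4_COPY_F_* bit numbers in a single cp_flags word, manipulated through inline helpers such as nfsd4_copy_set_sync() and nfsd4_copy_is_sync(). The standalone sketch below (not kernel code) mirrors that pattern, with plain C bit arithmetic standing in for the kernel's atomic set_bit()/clear_bit()/test_bit().

/*
 * Standalone sketch, not kernel code: illustrates the cp_flags
 * bit-flag pattern introduced for struct nfsd4_copy.  Names such as
 * copy_sketch and copy_set_sync() are hypothetical stand-ins.
 */
#include <stdbool.h>
#include <stdio.h>

#define NFSD4_COPY_F_INTRA        1
#define NFSD4_COPY_F_SYNCHRONOUS  2

struct copy_sketch {
        unsigned long cp_flags;         /* one bit per NFSD4_COPY_F_* flag */
};

static void copy_set_sync(struct copy_sketch *copy, bool sync)
{
        if (sync)
                copy->cp_flags |= 1UL << NFSD4_COPY_F_SYNCHRONOUS;
        else
                copy->cp_flags &= ~(1UL << NFSD4_COPY_F_SYNCHRONOUS);
}

static bool copy_is_sync(const struct copy_sketch *copy)
{
        return copy->cp_flags & (1UL << NFSD4_COPY_F_SYNCHRONOUS);
}

static bool copy_is_inter(const struct copy_sketch *copy)
{
        /* inter-server copy == the INTRA bit is not set */
        return !(copy->cp_flags & (1UL << NFSD4_COPY_F_INTRA));
}

int main(void)
{
        struct copy_sketch c = { .cp_flags = 1UL << NFSD4_COPY_F_INTRA };

        copy_set_sync(&c, true);
        printf("synchronous: %d, inter-server: %d\n",
               copy_is_sync(&c), copy_is_inter(&c));
        return 0;
}

Packing the copy state into one flags word lets the kernel helpers test and update it with atomic bit operations; the sketch only illustrates the bit layout, not the locking or async copy machinery.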