diff options
Diffstat (limited to 'fs/afs')
52 files changed, 21631 insertions, 9313 deletions
diff --git a/fs/afs/Kconfig b/fs/afs/Kconfig index ebba3b18e5da..682bd8ec2c10 100644 --- a/fs/afs/Kconfig +++ b/fs/afs/Kconfig @@ -1,13 +1,16 @@ +# SPDX-License-Identifier: GPL-2.0-only config AFS_FS tristate "Andrew File System support (AFS)" depends on INET select AF_RXRPC select DNS_RESOLVER + select NETFS_SUPPORT + select CRYPTO_KRB5 help If you say Y here, you will get an experimental Andrew File System driver. It currently only supports unsecured read-only AFS access. - See <file:Documentation/filesystems/afs.txt> for more information. + See <file:Documentation/filesystems/afs.rst> for more information. If unsure, say N. @@ -17,7 +20,7 @@ config AFS_DEBUG help Say Y here to make runtime controllable debugging messages appear. - See <file:Documentation/filesystems/afs.txt> for more information. + See <file:Documentation/filesystems/afs.rst> for more information. If unsure, say N. @@ -27,3 +30,15 @@ config AFS_FSCACHE help Say Y here if you want AFS data to be cached locally on disk through the generic filesystem cache manager + +config AFS_DEBUG_CURSOR + bool "AFS server cursor debugging" + depends on AFS_FS + help + Say Y here to cause the contents of a server cursor to be dumped to + the dmesg log if the server rotation algorithm fails to successfully + contact a server. + + See <file:Documentation/filesystems/afs.rst> for more information. + + If unsure, say N. diff --git a/fs/afs/Makefile b/fs/afs/Makefile index 4f64b95d57bd..b49b8fe682f3 100644 --- a/fs/afs/Makefile +++ b/fs/afs/Makefile @@ -1,32 +1,45 @@ +# SPDX-License-Identifier: GPL-2.0 # # Makefile for Red Hat Linux AFS client. # -afs-cache-$(CONFIG_AFS_FSCACHE) := cache.o - -kafs-objs := \ - $(afs-cache-y) \ +kafs-y := \ + addr_list.o \ + addr_prefs.o \ callback.o \ cell.o \ + cm_security.o \ cmservice.o \ dir.o \ + dir_edit.o \ + dir_search.o \ + dir_silly.o \ + dynroot.o \ file.o \ flock.o \ fsclient.o \ + fs_operation.o \ + fs_probe.o \ inode.o \ main.o \ misc.o \ mntpt.o \ - proc.o \ + rotate.o \ rxrpc.o \ security.o \ server.o \ + server_list.o \ super.o \ - netdevices.o \ + validation.o \ vlclient.o \ - vlocation.o \ - vnode.o \ + vl_alias.o \ + vl_list.o \ + vl_probe.o \ + vl_rotate.o \ volume.o \ - write.o + write.o \ + xattr.o \ + yfsclient.o +kafs-$(CONFIG_PROC_FS) += proc.o obj-$(CONFIG_AFS_FS) := kafs.o diff --git a/fs/afs/addr_list.c b/fs/afs/addr_list.c new file mode 100644 index 000000000000..e941da5b6dd9 --- /dev/null +++ b/fs/afs/addr_list.c @@ -0,0 +1,414 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* Server address list management + * + * Copyright (C) 2017 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#include <linux/slab.h> +#include <linux/ctype.h> +#include <linux/dns_resolver.h> +#include <linux/inet.h> +#include <keys/rxrpc-type.h> +#include "internal.h" +#include "afs_fs.h" + +static void afs_free_addrlist(struct rcu_head *rcu) +{ + struct afs_addr_list *alist = container_of(rcu, struct afs_addr_list, rcu); + unsigned int i; + + for (i = 0; i < alist->nr_addrs; i++) + rxrpc_kernel_put_peer(alist->addrs[i].peer); + trace_afs_alist(alist->debug_id, refcount_read(&alist->usage), afs_alist_trace_free); + kfree(alist); +} + +/* + * Release an address list. + */ +void afs_put_addrlist(struct afs_addr_list *alist, enum afs_alist_trace reason) +{ + unsigned int debug_id; + bool dead; + int r; + + if (!alist) + return; + debug_id = alist->debug_id; + dead = __refcount_dec_and_test(&alist->usage, &r); + trace_afs_alist(debug_id, r - 1, reason); + if (dead) + call_rcu(&alist->rcu, afs_free_addrlist); +} + +struct afs_addr_list *afs_get_addrlist(struct afs_addr_list *alist, enum afs_alist_trace reason) +{ + int r; + + if (alist) { + __refcount_inc(&alist->usage, &r); + trace_afs_alist(alist->debug_id, r + 1, reason); + } + return alist; +} + +/* + * Allocate an address list. + */ +struct afs_addr_list *afs_alloc_addrlist(unsigned int nr) +{ + struct afs_addr_list *alist; + static atomic_t debug_id; + + _enter("%u", nr); + + if (nr > AFS_MAX_ADDRESSES) + nr = AFS_MAX_ADDRESSES; + + alist = kzalloc(struct_size(alist, addrs, nr), GFP_KERNEL); + if (!alist) + return NULL; + + refcount_set(&alist->usage, 1); + alist->max_addrs = nr; + alist->debug_id = atomic_inc_return(&debug_id); + trace_afs_alist(alist->debug_id, 1, afs_alist_trace_alloc); + return alist; +} + +/* + * Parse a text string consisting of delimited addresses. + */ +struct afs_vlserver_list *afs_parse_text_addrs(struct afs_net *net, + const char *text, size_t len, + char delim, + unsigned short service, + unsigned short port) +{ + struct afs_vlserver_list *vllist; + struct afs_addr_list *alist; + const char *p, *end = text + len; + const char *problem; + unsigned int nr = 0; + int ret = -ENOMEM; + + _enter("%*.*s,%c", (int)len, (int)len, text, delim); + + if (!len) { + _leave(" = -EDESTADDRREQ [empty]"); + return ERR_PTR(-EDESTADDRREQ); + } + + if (delim == ':' && (memchr(text, ',', len) || !memchr(text, '.', len))) + delim = ','; + + /* Count the addresses */ + p = text; + do { + if (!*p) { + problem = "nul"; + goto inval; + } + if (*p == delim) + continue; + nr++; + if (*p == '[') { + p++; + if (p == end) { + problem = "brace1"; + goto inval; + } + p = memchr(p, ']', end - p); + if (!p) { + problem = "brace2"; + goto inval; + } + p++; + if (p >= end) + break; + } + + p = memchr(p, delim, end - p); + if (!p) + break; + p++; + } while (p < end); + + _debug("%u/%u addresses", nr, AFS_MAX_ADDRESSES); + + vllist = afs_alloc_vlserver_list(1); + if (!vllist) + return ERR_PTR(-ENOMEM); + + vllist->nr_servers = 1; + vllist->servers[0].server = afs_alloc_vlserver("<dummy>", 7, AFS_VL_PORT); + if (!vllist->servers[0].server) + goto error_vl; + + alist = afs_alloc_addrlist(nr); + if (!alist) + goto error; + + /* Extract the addresses */ + p = text; + do { + const char *q, *stop; + unsigned int xport = port; + __be32 x[4]; + int family; + + if (*p == delim) { + p++; + continue; + } + + if (*p == '[') { + p++; + q = memchr(p, ']', end - p); + } else { + for (q = p; q < end; q++) + if (*q == '+' || *q == delim) + break; + } + + if (in4_pton(p, q - p, (u8 *)&x[0], -1, &stop)) { + family = AF_INET; + } else if (in6_pton(p, q - p, (u8 *)x, -1, &stop)) { + family = AF_INET6; + } else { + problem = "family"; + goto bad_address; + } + + p = q; + if (stop != p) { + problem = "nostop"; + goto bad_address; + } + + if (q < end && *q == ']') + p++; + + if (p < end) { + if (*p == '+') { + /* Port number specification "+1234" */ + xport = 0; + p++; + if (p >= end || !isdigit(*p)) { + problem = "port"; + goto bad_address; + } + do { + xport *= 10; + xport += *p - '0'; + if (xport > 65535) { + problem = "pval"; + goto bad_address; + } + p++; + } while (p < end && isdigit(*p)); + } else if (*p == delim) { + p++; + } else { + problem = "weird"; + goto bad_address; + } + } + + if (family == AF_INET) + ret = afs_merge_fs_addr4(net, alist, x[0], xport); + else + ret = afs_merge_fs_addr6(net, alist, x, xport); + if (ret < 0) + goto error; + + } while (p < end); + + rcu_assign_pointer(vllist->servers[0].server->addresses, alist); + _leave(" = [nr %u]", alist->nr_addrs); + return vllist; + +inval: + _leave(" = -EINVAL [%s %zu %*.*s]", + problem, p - text, (int)len, (int)len, text); + return ERR_PTR(-EINVAL); +bad_address: + _leave(" = -EINVAL [%s %zu %*.*s]", + problem, p - text, (int)len, (int)len, text); + ret = -EINVAL; +error: + afs_put_addrlist(alist, afs_alist_trace_put_parse_error); +error_vl: + afs_put_vlserverlist(net, vllist); + return ERR_PTR(ret); +} + +/* + * Perform a DNS query for VL servers and build a up an address list. + */ +struct afs_vlserver_list *afs_dns_query(struct afs_cell *cell, time64_t *_expiry) +{ + struct afs_vlserver_list *vllist; + char *result = NULL; + int ret; + + _enter("%s", cell->name); + + ret = dns_query(cell->net->net, "afsdb", cell->name, cell->name_len, + "srv=1", &result, _expiry, true); + if (ret < 0) { + _leave(" = %d [dns]", ret); + return ERR_PTR(ret); + } + + if (*_expiry == 0) + *_expiry = ktime_get_real_seconds() + 60; + + if (ret > 1 && result[0] == 0) + vllist = afs_extract_vlserver_list(cell, result, ret); + else + vllist = afs_parse_text_addrs(cell->net, result, ret, ',', + VL_SERVICE, AFS_VL_PORT); + kfree(result); + if (IS_ERR(vllist) && vllist != ERR_PTR(-ENOMEM)) + pr_err("Failed to parse DNS data %ld\n", PTR_ERR(vllist)); + + return vllist; +} + +/* + * Merge an IPv4 entry into a fileserver address list. + */ +int afs_merge_fs_addr4(struct afs_net *net, struct afs_addr_list *alist, + __be32 xdr, u16 port) +{ + struct sockaddr_rxrpc srx; + struct rxrpc_peer *peer; + int i; + + if (alist->nr_addrs >= alist->max_addrs) + return 0; + + srx.srx_family = AF_RXRPC; + srx.transport_type = SOCK_DGRAM; + srx.transport_len = sizeof(srx.transport.sin); + srx.transport.sin.sin_family = AF_INET; + srx.transport.sin.sin_port = htons(port); + srx.transport.sin.sin_addr.s_addr = xdr; + + peer = rxrpc_kernel_lookup_peer(net->socket, &srx, GFP_KERNEL); + if (!peer) + return -ENOMEM; + + for (i = 0; i < alist->nr_ipv4; i++) { + if (peer == alist->addrs[i].peer) { + rxrpc_kernel_put_peer(peer); + return 0; + } + if (peer <= alist->addrs[i].peer) + break; + } + + if (i < alist->nr_addrs) + memmove(alist->addrs + i + 1, + alist->addrs + i, + sizeof(alist->addrs[0]) * (alist->nr_addrs - i)); + + alist->addrs[i].peer = peer; + alist->nr_ipv4++; + alist->nr_addrs++; + return 0; +} + +/* + * Merge an IPv6 entry into a fileserver address list. + */ +int afs_merge_fs_addr6(struct afs_net *net, struct afs_addr_list *alist, + __be32 *xdr, u16 port) +{ + struct sockaddr_rxrpc srx; + struct rxrpc_peer *peer; + int i; + + if (alist->nr_addrs >= alist->max_addrs) + return 0; + + srx.srx_family = AF_RXRPC; + srx.transport_type = SOCK_DGRAM; + srx.transport_len = sizeof(srx.transport.sin6); + srx.transport.sin6.sin6_family = AF_INET6; + srx.transport.sin6.sin6_port = htons(port); + memcpy(&srx.transport.sin6.sin6_addr, xdr, 16); + + peer = rxrpc_kernel_lookup_peer(net->socket, &srx, GFP_KERNEL); + if (!peer) + return -ENOMEM; + + for (i = alist->nr_ipv4; i < alist->nr_addrs; i++) { + if (peer == alist->addrs[i].peer) { + rxrpc_kernel_put_peer(peer); + return 0; + } + if (peer <= alist->addrs[i].peer) + break; + } + + if (i < alist->nr_addrs) + memmove(alist->addrs + i + 1, + alist->addrs + i, + sizeof(alist->addrs[0]) * (alist->nr_addrs - i)); + alist->addrs[i].peer = peer; + alist->nr_addrs++; + return 0; +} + +/* + * Set the app data on the rxrpc peers an address list points to + */ +void afs_set_peer_appdata(struct afs_server *server, + struct afs_addr_list *old_alist, + struct afs_addr_list *new_alist) +{ + unsigned long data = (unsigned long)server; + int n = 0, o = 0; + + if (!old_alist) { + /* New server. Just set all. */ + for (; n < new_alist->nr_addrs; n++) + rxrpc_kernel_set_peer_data(new_alist->addrs[n].peer, data); + return; + } + if (!new_alist) { + /* Dead server. Just remove all. */ + for (; o < old_alist->nr_addrs; o++) + rxrpc_kernel_set_peer_data(old_alist->addrs[o].peer, 0); + return; + } + + /* Walk through the two lists simultaneously, setting new peers and + * clearing old ones. The two lists are ordered by pointer to peer + * record. + */ + while (n < new_alist->nr_addrs && o < old_alist->nr_addrs) { + struct rxrpc_peer *pn = new_alist->addrs[n].peer; + struct rxrpc_peer *po = old_alist->addrs[o].peer; + + if (pn == po) + continue; + if (pn < po) { + rxrpc_kernel_set_peer_data(pn, data); + n++; + } else { + rxrpc_kernel_set_peer_data(po, 0); + o++; + } + } + + if (n < new_alist->nr_addrs) + for (; n < new_alist->nr_addrs; n++) + rxrpc_kernel_set_peer_data(new_alist->addrs[n].peer, data); + if (o < old_alist->nr_addrs) + for (; o < old_alist->nr_addrs; o++) + rxrpc_kernel_set_peer_data(old_alist->addrs[o].peer, 0); +} diff --git a/fs/afs/addr_prefs.c b/fs/afs/addr_prefs.c new file mode 100644 index 000000000000..133736412c3d --- /dev/null +++ b/fs/afs/addr_prefs.c @@ -0,0 +1,533 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* Address preferences management + * + * Copyright (C) 2023 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": addr_prefs: " fmt +#include <linux/slab.h> +#include <linux/ctype.h> +#include <linux/inet.h> +#include <linux/seq_file.h> +#include <keys/rxrpc-type.h> +#include "internal.h" + +static inline struct afs_net *afs_seq2net_single(struct seq_file *m) +{ + return afs_net(seq_file_single_net(m)); +} + +/* + * Split a NUL-terminated string up to the first newline around spaces. The + * source string will be modified to have NUL-terminations inserted. + */ +static int afs_split_string(char **pbuf, char *strv[], unsigned int maxstrv) +{ + unsigned int count = 0; + char *p = *pbuf; + + maxstrv--; /* Allow for terminal NULL */ + for (;;) { + /* Skip over spaces */ + while (isspace(*p)) { + if (*p == '\n') { + p++; + break; + } + p++; + } + if (!*p) + break; + + /* Mark start of word */ + if (count >= maxstrv) { + pr_warn("Too many elements in string\n"); + return -EINVAL; + } + strv[count++] = p; + + /* Skip over word */ + while (!isspace(*p) && *p) + p++; + if (!*p) + break; + + /* Mark end of word */ + if (*p == '\n') { + *p++ = 0; + break; + } + *p++ = 0; + } + + *pbuf = p; + strv[count] = NULL; + return count; +} + +/* + * Parse an address with an optional subnet mask. + */ +static int afs_parse_address(char *p, struct afs_addr_preference *pref) +{ + const char *stop; + unsigned long mask, tmp; + char *end = p + strlen(p); + bool bracket = false; + + if (*p == '[') { + p++; + bracket = true; + } + +#if 0 + if (*p == '[') { + p++; + q = memchr(p, ']', end - p); + if (!q) { + pr_warn("Can't find closing ']'\n"); + return -EINVAL; + } + } else { + for (q = p; q < end; q++) + if (*q == '/') + break; + } +#endif + + if (in4_pton(p, end - p, (u8 *)&pref->ipv4_addr, -1, &stop)) { + pref->family = AF_INET; + mask = 32; + } else if (in6_pton(p, end - p, (u8 *)&pref->ipv6_addr, -1, &stop)) { + pref->family = AF_INET6; + mask = 128; + } else { + pr_warn("Can't determine address family\n"); + return -EINVAL; + } + + p = (char *)stop; + if (bracket) { + if (*p != ']') { + pr_warn("Can't find closing ']'\n"); + return -EINVAL; + } + p++; + } + + if (*p == '/') { + p++; + tmp = simple_strtoul(p, &p, 10); + if (tmp > mask) { + pr_warn("Subnet mask too large\n"); + return -EINVAL; + } + if (tmp == 0) { + pr_warn("Subnet mask too small\n"); + return -EINVAL; + } + mask = tmp; + } + + if (*p) { + pr_warn("Invalid address\n"); + return -EINVAL; + } + + pref->subnet_mask = mask; + return 0; +} + +enum cmp_ret { + CONTINUE_SEARCH, + INSERT_HERE, + EXACT_MATCH, + SUBNET_MATCH, +}; + +/* + * See if a candidate address matches a listed address. + */ +static enum cmp_ret afs_cmp_address_pref(const struct afs_addr_preference *a, + const struct afs_addr_preference *b) +{ + int subnet = min(a->subnet_mask, b->subnet_mask); + const __be32 *pa, *pb; + u32 mask, na, nb; + int diff; + + if (a->family != b->family) + return INSERT_HERE; + + switch (a->family) { + case AF_INET6: + pa = a->ipv6_addr.s6_addr32; + pb = b->ipv6_addr.s6_addr32; + break; + case AF_INET: + pa = &a->ipv4_addr.s_addr; + pb = &b->ipv4_addr.s_addr; + break; + } + + while (subnet > 32) { + diff = ntohl(*pa++) - ntohl(*pb++); + if (diff < 0) + return INSERT_HERE; /* a<b */ + if (diff > 0) + return CONTINUE_SEARCH; /* a>b */ + subnet -= 32; + } + + if (subnet == 0) + return EXACT_MATCH; + + mask = 0xffffffffU << (32 - subnet); + na = ntohl(*pa); + nb = ntohl(*pb); + diff = (na & mask) - (nb & mask); + //kdebug("diff %08x %08x %08x %d", na, nb, mask, diff); + if (diff < 0) + return INSERT_HERE; /* a<b */ + if (diff > 0) + return CONTINUE_SEARCH; /* a>b */ + if (a->subnet_mask == b->subnet_mask) + return EXACT_MATCH; + if (a->subnet_mask > b->subnet_mask) + return SUBNET_MATCH; /* a binds tighter than b */ + return CONTINUE_SEARCH; /* b binds tighter than a */ +} + +/* + * Insert an address preference. + */ +static int afs_insert_address_pref(struct afs_addr_preference_list **_preflist, + struct afs_addr_preference *pref, + int index) +{ + struct afs_addr_preference_list *preflist = *_preflist, *old = preflist; + size_t size, max_prefs; + + _enter("{%u/%u/%u},%u", preflist->ipv6_off, preflist->nr, preflist->max_prefs, index); + + if (preflist->nr == 255) + return -ENOSPC; + if (preflist->nr >= preflist->max_prefs) { + max_prefs = preflist->max_prefs + 1; + size = struct_size(preflist, prefs, max_prefs); + size = roundup_pow_of_two(size); + max_prefs = min_t(size_t, (size - sizeof(*preflist)) / sizeof(*pref), 255); + preflist = kmalloc(size, GFP_KERNEL); + if (!preflist) + return -ENOMEM; + *preflist = **_preflist; + preflist->max_prefs = max_prefs; + *_preflist = preflist; + + if (index < preflist->nr) + memcpy(preflist->prefs + index + 1, old->prefs + index, + sizeof(*pref) * (preflist->nr - index)); + if (index > 0) + memcpy(preflist->prefs, old->prefs, sizeof(*pref) * index); + } else { + if (index < preflist->nr) + memmove(preflist->prefs + index + 1, preflist->prefs + index, + sizeof(*pref) * (preflist->nr - index)); + } + + preflist->prefs[index] = *pref; + preflist->nr++; + if (pref->family == AF_INET) + preflist->ipv6_off++; + return 0; +} + +/* + * Add an address preference. + * echo "add <proto> <IP>[/<mask>] <prior>" >/proc/fs/afs/addr_prefs + */ +static int afs_add_address_pref(struct afs_net *net, struct afs_addr_preference_list **_preflist, + int argc, char **argv) +{ + struct afs_addr_preference_list *preflist = *_preflist; + struct afs_addr_preference pref; + enum cmp_ret cmp; + int ret, i, stop; + + if (argc != 3) { + pr_warn("Wrong number of params\n"); + return -EINVAL; + } + + if (strcmp(argv[0], "udp") != 0) { + pr_warn("Unsupported protocol\n"); + return -EINVAL; + } + + ret = afs_parse_address(argv[1], &pref); + if (ret < 0) + return ret; + + ret = kstrtou16(argv[2], 10, &pref.prio); + if (ret < 0) { + pr_warn("Invalid priority\n"); + return ret; + } + + if (pref.family == AF_INET) { + i = 0; + stop = preflist->ipv6_off; + } else { + i = preflist->ipv6_off; + stop = preflist->nr; + } + + for (; i < stop; i++) { + cmp = afs_cmp_address_pref(&pref, &preflist->prefs[i]); + switch (cmp) { + case CONTINUE_SEARCH: + continue; + case INSERT_HERE: + case SUBNET_MATCH: + return afs_insert_address_pref(_preflist, &pref, i); + case EXACT_MATCH: + preflist->prefs[i].prio = pref.prio; + return 0; + } + } + + return afs_insert_address_pref(_preflist, &pref, i); +} + +/* + * Delete an address preference. + */ +static int afs_delete_address_pref(struct afs_addr_preference_list **_preflist, + int index) +{ + struct afs_addr_preference_list *preflist = *_preflist; + + _enter("{%u/%u/%u},%u", preflist->ipv6_off, preflist->nr, preflist->max_prefs, index); + + if (preflist->nr == 0) + return -ENOENT; + + if (index < preflist->nr - 1) + memmove(preflist->prefs + index, preflist->prefs + index + 1, + sizeof(preflist->prefs[0]) * (preflist->nr - index - 1)); + + if (index < preflist->ipv6_off) + preflist->ipv6_off--; + preflist->nr--; + return 0; +} + +/* + * Delete an address preference. + * echo "del <proto> <IP>[/<mask>]" >/proc/fs/afs/addr_prefs + */ +static int afs_del_address_pref(struct afs_net *net, struct afs_addr_preference_list **_preflist, + int argc, char **argv) +{ + struct afs_addr_preference_list *preflist = *_preflist; + struct afs_addr_preference pref; + enum cmp_ret cmp; + int ret, i, stop; + + if (argc != 2) { + pr_warn("Wrong number of params\n"); + return -EINVAL; + } + + if (strcmp(argv[0], "udp") != 0) { + pr_warn("Unsupported protocol\n"); + return -EINVAL; + } + + ret = afs_parse_address(argv[1], &pref); + if (ret < 0) + return ret; + + if (pref.family == AF_INET) { + i = 0; + stop = preflist->ipv6_off; + } else { + i = preflist->ipv6_off; + stop = preflist->nr; + } + + for (; i < stop; i++) { + cmp = afs_cmp_address_pref(&pref, &preflist->prefs[i]); + switch (cmp) { + case CONTINUE_SEARCH: + continue; + case INSERT_HERE: + case SUBNET_MATCH: + return 0; + case EXACT_MATCH: + return afs_delete_address_pref(_preflist, i); + } + } + + return -ENOANO; +} + +/* + * Handle writes to /proc/fs/afs/addr_prefs + */ +int afs_proc_addr_prefs_write(struct file *file, char *buf, size_t size) +{ + struct afs_addr_preference_list *preflist, *old; + struct seq_file *m = file->private_data; + struct afs_net *net = afs_seq2net_single(m); + size_t psize; + char *argv[5]; + int ret, argc, max_prefs; + + inode_lock(file_inode(file)); + + /* Allocate a candidate new list and initialise it from the old. */ + old = rcu_dereference_protected(net->address_prefs, + lockdep_is_held(&file_inode(file)->i_rwsem)); + + if (old) + max_prefs = old->nr + 1; + else + max_prefs = 1; + + psize = struct_size(old, prefs, max_prefs); + psize = roundup_pow_of_two(psize); + max_prefs = min_t(size_t, (psize - sizeof(*old)) / sizeof(old->prefs[0]), 255); + + ret = -ENOMEM; + preflist = kmalloc(struct_size(preflist, prefs, max_prefs), GFP_KERNEL); + if (!preflist) + goto done; + + if (old) + memcpy(preflist, old, struct_size(preflist, prefs, old->nr)); + else + memset(preflist, 0, sizeof(*preflist)); + preflist->max_prefs = max_prefs; + + do { + argc = afs_split_string(&buf, argv, ARRAY_SIZE(argv)); + if (argc < 0) { + ret = argc; + goto done; + } + if (argc < 2) + goto inval; + + if (strcmp(argv[0], "add") == 0) + ret = afs_add_address_pref(net, &preflist, argc - 1, argv + 1); + else if (strcmp(argv[0], "del") == 0) + ret = afs_del_address_pref(net, &preflist, argc - 1, argv + 1); + else + goto inval; + if (ret < 0) + goto done; + } while (*buf); + + preflist->version++; + rcu_assign_pointer(net->address_prefs, preflist); + /* Store prefs before version */ + smp_store_release(&net->address_pref_version, preflist->version); + kfree_rcu(old, rcu); + preflist = NULL; + ret = 0; + +done: + kfree(preflist); + inode_unlock(file_inode(file)); + _leave(" = %d", ret); + return ret; + +inval: + pr_warn("Invalid Command\n"); + ret = -EINVAL; + goto done; +} + +/* + * Mark the priorities on an address list if the address preferences table has + * changed. The caller must hold the RCU read lock. + */ +void afs_get_address_preferences_rcu(struct afs_net *net, struct afs_addr_list *alist) +{ + const struct afs_addr_preference_list *preflist = + rcu_dereference(net->address_prefs); + const struct sockaddr_in6 *sin6; + const struct sockaddr_in *sin; + const struct sockaddr *sa; + struct afs_addr_preference test; + enum cmp_ret cmp; + int i, j; + + if (!preflist || !preflist->nr || !alist->nr_addrs || + smp_load_acquire(&alist->addr_pref_version) == preflist->version) + return; + + test.family = AF_INET; + test.subnet_mask = 32; + test.prio = 0; + for (i = 0; i < alist->nr_ipv4; i++) { + sa = rxrpc_kernel_remote_addr(alist->addrs[i].peer); + sin = (const struct sockaddr_in *)sa; + test.ipv4_addr = sin->sin_addr; + for (j = 0; j < preflist->ipv6_off; j++) { + cmp = afs_cmp_address_pref(&test, &preflist->prefs[j]); + switch (cmp) { + case CONTINUE_SEARCH: + continue; + case INSERT_HERE: + break; + case EXACT_MATCH: + case SUBNET_MATCH: + WRITE_ONCE(alist->addrs[i].prio, preflist->prefs[j].prio); + break; + } + } + } + + test.family = AF_INET6; + test.subnet_mask = 128; + test.prio = 0; + for (; i < alist->nr_addrs; i++) { + sa = rxrpc_kernel_remote_addr(alist->addrs[i].peer); + sin6 = (const struct sockaddr_in6 *)sa; + test.ipv6_addr = sin6->sin6_addr; + for (j = preflist->ipv6_off; j < preflist->nr; j++) { + cmp = afs_cmp_address_pref(&test, &preflist->prefs[j]); + switch (cmp) { + case CONTINUE_SEARCH: + continue; + case INSERT_HERE: + break; + case EXACT_MATCH: + case SUBNET_MATCH: + WRITE_ONCE(alist->addrs[i].prio, preflist->prefs[j].prio); + break; + } + } + } + + smp_store_release(&alist->addr_pref_version, preflist->version); +} + +/* + * Mark the priorities on an address list if the address preferences table has + * changed. Avoid taking the RCU read lock if we can. + */ +void afs_get_address_preferences(struct afs_net *net, struct afs_addr_list *alist) +{ + if (!net->address_prefs || + /* Load version before prefs */ + smp_load_acquire(&net->address_pref_version) == alist->addr_pref_version) + return; + + rcu_read_lock(); + afs_get_address_preferences_rcu(net, alist); + rcu_read_unlock(); +} diff --git a/fs/afs/afs.h b/fs/afs/afs.h index 3c462ff6db63..ec3db00bd081 100644 --- a/fs/afs/afs.h +++ b/fs/afs/afs.h @@ -1,12 +1,8 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ /* AFS common types * * Copyright (C) 2002, 2007 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. */ #ifndef AFS_H @@ -14,15 +10,21 @@ #include <linux/in.h> -#define AFS_MAXCELLNAME 64 /* maximum length of a cell name */ -#define AFS_MAXVOLNAME 64 /* maximum length of a volume name */ -#define AFSNAMEMAX 256 /* maximum length of a filename plus NUL */ -#define AFSPATHMAX 1024 /* maximum length of a pathname plus NUL */ -#define AFSOPAQUEMAX 1024 /* maximum length of an opaque field */ +#define AFS_MAXCELLNAME 253 /* Maximum length of a cell name (DNS limited) */ +#define AFS_MAXVOLNAME 64 /* Maximum length of a volume name */ +#define AFS_MAXNSERVERS 8 /* Maximum servers in a basic volume record */ +#define AFS_NMAXNSERVERS 13 /* Maximum servers in a N/U-class volume record */ +#define AFS_MAXTYPES 3 /* Maximum number of volume types */ +#define AFSNAMEMAX 256 /* Maximum length of a filename plus NUL */ +#define AFSPATHMAX 1024 /* Maximum length of a pathname plus NUL */ +#define AFSOPAQUEMAX 1024 /* Maximum length of an opaque field */ + +#define AFS_VL_MAX_LIFESPAN 120 +#define AFS_PROBE_MAX_LIFESPAN 30 -typedef unsigned afs_volid_t; -typedef unsigned afs_vnodeid_t; -typedef unsigned long long afs_dataversion_t; +typedef u64 afs_volid_t; +typedef u64 afs_vnodeid_t; +typedef u64 afs_dataversion_t; typedef enum { AFSVL_RWVOL, /* read/write volume */ @@ -49,8 +51,9 @@ typedef enum { */ struct afs_fid { afs_volid_t vid; /* volume ID */ - afs_vnodeid_t vnode; /* file index within volume */ - unsigned unique; /* unique ID number (file index version) */ + afs_vnodeid_t vnode; /* Lower 64-bits of file index within volume */ + u32 vnode_hi; /* Upper 32-bits of file index */ + u32 unique; /* unique ID number (file index version) */ }; /* @@ -64,14 +67,27 @@ typedef enum { } afs_callback_type_t; struct afs_callback { - struct afs_fid fid; /* file identifier */ - unsigned version; /* callback version */ - unsigned expiry; /* time at which expires */ - afs_callback_type_t type; /* type of callback */ + time64_t expires_at; /* Time at which expires */ + //unsigned version; /* Callback version */ + //afs_callback_type_t type; /* Type of callback */ +}; + +struct afs_callback_break { + struct afs_fid fid; /* File identifier */ + //struct afs_callback cb; /* Callback details */ }; #define AFSCBMAX 50 /* maximum callbacks transferred per bulk op */ +struct afs_uuid { + __be32 time_low; /* low part of timestamp */ + __be16 time_mid; /* mid part of timestamp */ + __be16 time_hi_and_version; /* high part of timestamp and version */ + __s8 clock_seq_hi_and_reserved; /* clock seq hi and variant */ + __s8 clock_seq_low; /* clock seq low */ + __s8 node[6]; /* spatially unique node ID (MAC addr) */ +}; + /* * AFS volume information */ @@ -111,23 +127,28 @@ typedef u32 afs_access_t; * AFS file status information */ struct afs_file_status { - unsigned if_version; /* interface version */ -#define AFS_FSTATUS_VERSION 1 - - afs_file_type_t type; /* file type */ - unsigned nlink; /* link count */ u64 size; /* file size */ afs_dataversion_t data_version; /* current data version */ - u32 author; /* author ID */ - kuid_t owner; /* owner ID */ - kgid_t group; /* group ID */ + struct timespec64 mtime_client; /* Last time client changed data */ + struct timespec64 mtime_server; /* Last time server changed data */ + s64 author; /* author ID */ + s64 owner; /* owner ID */ + s64 group; /* group ID */ afs_access_t caller_access; /* access rights for authenticated caller */ afs_access_t anon_access; /* access rights for unauthenticated caller */ umode_t mode; /* UNIX mode */ - struct afs_fid parent; /* parent dir ID for non-dirs only */ - time_t mtime_client; /* last time client changed data */ - time_t mtime_server; /* last time server changed data */ + afs_file_type_t type; /* file type */ + u32 nlink; /* link count */ s32 lock_count; /* file lock count (0=UNLK -1=WRLCK +ve=#RDLCK */ + u32 abort_code; /* Abort if bulk-fetching this failed */ +}; + +struct afs_status_cb { + struct afs_file_status status; + struct afs_callback callback; + bool have_status; /* True if status record was retrieved */ + bool have_cb; /* True if cb record was retrieved */ + bool have_error; /* True if status.abort_code indicates an error */ }; /* @@ -144,27 +165,42 @@ struct afs_file_status { * AFS volume synchronisation information */ struct afs_volsync { - time_t creation; /* volume creation time */ + time64_t creation; /* Volume creation time (or TIME64_MIN) */ + time64_t update; /* Volume update time (or TIME64_MIN) */ }; /* * AFS volume status record */ struct afs_volume_status { - u32 vid; /* volume ID */ - u32 parent_id; /* parent volume ID */ + afs_volid_t vid; /* volume ID */ + afs_volid_t parent_id; /* parent volume ID */ u8 online; /* true if volume currently online and available */ u8 in_service; /* true if volume currently in service */ u8 blessed; /* same as in_service */ u8 needs_salvage; /* true if consistency checking required */ u32 type; /* volume type (afs_voltype_t) */ - u32 min_quota; /* minimum space set aside (blocks) */ - u32 max_quota; /* maximum space this volume may occupy (blocks) */ - u32 blocks_in_use; /* space this volume currently occupies (blocks) */ - u32 part_blocks_avail; /* space available in volume's partition */ - u32 part_max_blocks; /* size of volume's partition */ + u64 min_quota; /* minimum space set aside (blocks) */ + u64 max_quota; /* maximum space this volume may occupy (blocks) */ + u64 blocks_in_use; /* space this volume currently occupies (blocks) */ + u64 part_blocks_avail; /* space available in volume's partition */ + u64 part_max_blocks; /* size of volume's partition */ + s64 vol_copy_date; + s64 vol_backup_date; }; #define AFS_BLOCK_SIZE 1024 +/* + * XDR encoding of UUID in AFS. + */ +struct afs_uuid__xdr { + __be32 time_low; + __be32 time_mid; + __be32 time_hi_and_version; + __be32 clock_seq_hi_and_reserved; + __be32 clock_seq_low; + __be32 node[6]; +}; + #endif /* AFS_H */ diff --git a/fs/afs/afs_cm.h b/fs/afs/afs_cm.h index 255f5dd6040c..565cbe0a8af6 100644 --- a/fs/afs/afs_cm.h +++ b/fs/afs/afs_cm.h @@ -1,12 +1,8 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ /* AFS Cache Manager definitions * * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. */ #ifndef AFS_CM_H diff --git a/fs/afs/afs_fs.h b/fs/afs/afs_fs.h index eb647323d8f0..20ab344baf9d 100644 --- a/fs/afs/afs_fs.h +++ b/fs/afs/afs_fs.h @@ -1,12 +1,8 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ /* AFS File Service definitions * * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. */ #ifndef AFS_FS_H @@ -17,8 +13,10 @@ enum AFS_FS_Operations { FSFETCHDATA = 130, /* AFS Fetch file data */ + FSFETCHACL = 131, /* AFS Fetch file ACL */ FSFETCHSTATUS = 132, /* AFS Fetch file status */ FSSTOREDATA = 133, /* AFS Store file data */ + FSSTOREACL = 134, /* AFS Store file ACL */ FSSTORESTATUS = 135, /* AFS Store file status */ FSREMOVEFILE = 136, /* AFS Remove a file */ FSCREATEFILE = 137, /* AFS Create a file */ @@ -31,15 +29,20 @@ enum AFS_FS_Operations { FSGETVOLUMEINFO = 148, /* AFS Get information about a volume */ FSGETVOLUMESTATUS = 149, /* AFS Get volume status information */ FSGETROOTVOLUME = 151, /* AFS Get root volume name */ + FSBULKSTATUS = 155, /* AFS Fetch multiple file statuses */ FSSETLOCK = 156, /* AFS Request a file lock */ FSEXTENDLOCK = 157, /* AFS Extend a file lock */ FSRELEASELOCK = 158, /* AFS Release a file lock */ FSLOOKUP = 161, /* AFS lookup file in directory */ + FSINLINEBULKSTATUS = 65536, /* AFS Fetch multiple file statuses with inline errors */ FSFETCHDATA64 = 65537, /* AFS Fetch file data */ FSSTOREDATA64 = 65538, /* AFS Store file data */ + FSGIVEUPALLCALLBACKS = 65539, /* AFS Give up all outstanding callbacks on a server */ + FSGETCAPABILITIES = 65540, /* Probe and get the capabilities of a fileserver */ }; enum AFS_FS_Errors { + VRESTARTING = -100, /* Server is restarting */ VSALVAGE = 101, /* volume needs salvaging */ VNOVNODE = 102, /* no such file/dir (vnode) */ VNOVOL = 103, /* no such volume or volume unavailable */ @@ -51,6 +54,9 @@ enum AFS_FS_Errors { VOVERQUOTA = 109, /* volume's maximum quota exceeded */ VBUSY = 110, /* volume is temporarily unavailable */ VMOVED = 111, /* volume moved to new server - ask this FS where */ + VIO = 112, /* I/O error in volume */ + VSALVAGING = 113, /* Volume is being salvaged */ + VRESTRICTED = 120, /* Volume is restricted from using */ }; #endif /* AFS_FS_H */ diff --git a/fs/afs/afs_vl.h b/fs/afs/afs_vl.h index 800f607ffaf5..b835e25a2c02 100644 --- a/fs/afs/afs_vl.h +++ b/fs/afs/afs_vl.h @@ -1,12 +1,8 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ /* AFS Volume Location Service client interface * * Copyright (C) 2002, 2007 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. */ #ifndef AFS_VL_H @@ -16,11 +12,19 @@ #define AFS_VL_PORT 7003 /* volume location service port */ #define VL_SERVICE 52 /* RxRPC service ID for the Volume Location service */ +#define YFS_VL_SERVICE 2503 /* Service ID for AuriStor upgraded VL service */ +#define YFS_VL_MAXCELLNAME 256 /* Maximum length of a cell name in YFS protocol */ enum AFSVL_Operations { - VLGETENTRYBYID = 503, /* AFS Get Cache Entry By ID operation ID */ - VLGETENTRYBYNAME = 504, /* AFS Get Cache Entry By Name operation ID */ - VLPROBE = 514, /* AFS Probe Volume Location Service operation ID */ + VLGETENTRYBYID = 503, /* AFS Get VLDB entry by ID */ + VLGETENTRYBYNAME = 504, /* AFS Get VLDB entry by name */ + VLPROBE = 514, /* AFS probe VL service */ + VLGETENTRYBYIDU = 526, /* AFS Get VLDB entry by ID (UUID-variant) */ + VLGETENTRYBYNAMEU = 527, /* AFS Get VLDB entry by name (UUID-variant) */ + VLGETADDRSU = 533, /* AFS Get addrs for fileserver */ + YVLGETENDPOINTS = 64002, /* YFS Get endpoints for file/volume server */ + YVLGETCELLNAME = 64014, /* YFS Get actual cell name */ + VLGETCAPABILITIES = 65537, /* AFS Get server capabilities */ }; enum AFSVL_Errors { @@ -54,6 +58,19 @@ enum AFSVL_Errors { AFSVL_NOMEM = 363547, /* malloc/realloc failed to alloc enough memory */ }; +enum { + YFS_SERVER_INDEX = 0, + YFS_SERVER_UUID = 1, + YFS_SERVER_ENDPOINT = 2, +}; + +enum { + YFS_ENDPOINT_IPV4 = 0, + YFS_ENDPOINT_IPV6 = 1, +}; + +#define YFS_MAXENDPOINTS 16 + /* * maps to "struct vldbentry" in vvl-spec.pdf */ @@ -74,11 +91,48 @@ struct afs_vldbentry { struct in_addr addr; /* server address */ unsigned partition; /* partition ID on this server */ unsigned flags; /* server specific flags */ -#define AFS_VLSF_NEWREPSITE 0x0001 /* unused */ +#define AFS_VLSF_NEWREPSITE 0x0001 /* Ignore all 'non-new' servers */ #define AFS_VLSF_ROVOL 0x0002 /* this server holds a R/O instance of the volume */ #define AFS_VLSF_RWVOL 0x0004 /* this server holds a R/W instance of the volume */ #define AFS_VLSF_BACKVOL 0x0008 /* this server holds a backup instance of the volume */ +#define AFS_VLSF_UUID 0x0010 /* This server is referred to by its UUID */ +#define AFS_VLSF_DONTUSE 0x0020 /* This server ref should be ignored */ } servers[8]; }; +#define AFS_VLDB_MAXNAMELEN 65 + + +struct afs_ListAddrByAttributes__xdr { + __be32 Mask; +#define AFS_VLADDR_IPADDR 0x1 /* Match by ->ipaddr */ +#define AFS_VLADDR_INDEX 0x2 /* Match by ->index */ +#define AFS_VLADDR_UUID 0x4 /* Match by ->uuid */ + __be32 ipaddr; + __be32 index; + __be32 spare; + struct afs_uuid__xdr uuid; +}; + +struct afs_uvldbentry__xdr { + __be32 name[AFS_VLDB_MAXNAMELEN]; + __be32 nServers; + struct afs_uuid__xdr serverNumber[AFS_NMAXNSERVERS]; + __be32 serverUnique[AFS_NMAXNSERVERS]; + __be32 serverPartition[AFS_NMAXNSERVERS]; + __be32 serverFlags[AFS_NMAXNSERVERS]; + __be32 volumeId[AFS_MAXTYPES]; + __be32 cloneId; + __be32 flags; + __be32 spares1; + __be32 spares2; + __be32 spares3; + __be32 spares4; + __be32 spares5; + __be32 spares6; + __be32 spares7; + __be32 spares8; + __be32 spares9; +}; + #endif /* AFS_VL_H */ diff --git a/fs/afs/cache.c b/fs/afs/cache.c deleted file mode 100644 index 577763c3d88b..000000000000 --- a/fs/afs/cache.c +++ /dev/null @@ -1,402 +0,0 @@ -/* AFS caching stuff - * - * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. - * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#include <linux/sched.h> -#include "internal.h" - -static uint16_t afs_cell_cache_get_key(const void *cookie_netfs_data, - void *buffer, uint16_t buflen); -static uint16_t afs_cell_cache_get_aux(const void *cookie_netfs_data, - void *buffer, uint16_t buflen); -static enum fscache_checkaux afs_cell_cache_check_aux(void *cookie_netfs_data, - const void *buffer, - uint16_t buflen); - -static uint16_t afs_vlocation_cache_get_key(const void *cookie_netfs_data, - void *buffer, uint16_t buflen); -static uint16_t afs_vlocation_cache_get_aux(const void *cookie_netfs_data, - void *buffer, uint16_t buflen); -static enum fscache_checkaux afs_vlocation_cache_check_aux( - void *cookie_netfs_data, const void *buffer, uint16_t buflen); - -static uint16_t afs_volume_cache_get_key(const void *cookie_netfs_data, - void *buffer, uint16_t buflen); - -static uint16_t afs_vnode_cache_get_key(const void *cookie_netfs_data, - void *buffer, uint16_t buflen); -static void afs_vnode_cache_get_attr(const void *cookie_netfs_data, - uint64_t *size); -static uint16_t afs_vnode_cache_get_aux(const void *cookie_netfs_data, - void *buffer, uint16_t buflen); -static enum fscache_checkaux afs_vnode_cache_check_aux(void *cookie_netfs_data, - const void *buffer, - uint16_t buflen); -static void afs_vnode_cache_now_uncached(void *cookie_netfs_data); - -struct fscache_netfs afs_cache_netfs = { - .name = "afs", - .version = 0, -}; - -struct fscache_cookie_def afs_cell_cache_index_def = { - .name = "AFS.cell", - .type = FSCACHE_COOKIE_TYPE_INDEX, - .get_key = afs_cell_cache_get_key, - .get_aux = afs_cell_cache_get_aux, - .check_aux = afs_cell_cache_check_aux, -}; - -struct fscache_cookie_def afs_vlocation_cache_index_def = { - .name = "AFS.vldb", - .type = FSCACHE_COOKIE_TYPE_INDEX, - .get_key = afs_vlocation_cache_get_key, - .get_aux = afs_vlocation_cache_get_aux, - .check_aux = afs_vlocation_cache_check_aux, -}; - -struct fscache_cookie_def afs_volume_cache_index_def = { - .name = "AFS.volume", - .type = FSCACHE_COOKIE_TYPE_INDEX, - .get_key = afs_volume_cache_get_key, -}; - -struct fscache_cookie_def afs_vnode_cache_index_def = { - .name = "AFS.vnode", - .type = FSCACHE_COOKIE_TYPE_DATAFILE, - .get_key = afs_vnode_cache_get_key, - .get_attr = afs_vnode_cache_get_attr, - .get_aux = afs_vnode_cache_get_aux, - .check_aux = afs_vnode_cache_check_aux, - .now_uncached = afs_vnode_cache_now_uncached, -}; - -/* - * set the key for the index entry - */ -static uint16_t afs_cell_cache_get_key(const void *cookie_netfs_data, - void *buffer, uint16_t bufmax) -{ - const struct afs_cell *cell = cookie_netfs_data; - uint16_t klen; - - _enter("%p,%p,%u", cell, buffer, bufmax); - - klen = strlen(cell->name); - if (klen > bufmax) - return 0; - - memcpy(buffer, cell->name, klen); - return klen; -} - -/* - * provide new auxiliary cache data - */ -static uint16_t afs_cell_cache_get_aux(const void *cookie_netfs_data, - void *buffer, uint16_t bufmax) -{ - const struct afs_cell *cell = cookie_netfs_data; - uint16_t dlen; - - _enter("%p,%p,%u", cell, buffer, bufmax); - - dlen = cell->vl_naddrs * sizeof(cell->vl_addrs[0]); - dlen = min(dlen, bufmax); - dlen &= ~(sizeof(cell->vl_addrs[0]) - 1); - - memcpy(buffer, cell->vl_addrs, dlen); - return dlen; -} - -/* - * check that the auxiliary data indicates that the entry is still valid - */ -static enum fscache_checkaux afs_cell_cache_check_aux(void *cookie_netfs_data, - const void *buffer, - uint16_t buflen) -{ - _leave(" = OKAY"); - return FSCACHE_CHECKAUX_OKAY; -} - -/*****************************************************************************/ -/* - * set the key for the index entry - */ -static uint16_t afs_vlocation_cache_get_key(const void *cookie_netfs_data, - void *buffer, uint16_t bufmax) -{ - const struct afs_vlocation *vlocation = cookie_netfs_data; - uint16_t klen; - - _enter("{%s},%p,%u", vlocation->vldb.name, buffer, bufmax); - - klen = strnlen(vlocation->vldb.name, sizeof(vlocation->vldb.name)); - if (klen > bufmax) - return 0; - - memcpy(buffer, vlocation->vldb.name, klen); - - _leave(" = %u", klen); - return klen; -} - -/* - * provide new auxiliary cache data - */ -static uint16_t afs_vlocation_cache_get_aux(const void *cookie_netfs_data, - void *buffer, uint16_t bufmax) -{ - const struct afs_vlocation *vlocation = cookie_netfs_data; - uint16_t dlen; - - _enter("{%s},%p,%u", vlocation->vldb.name, buffer, bufmax); - - dlen = sizeof(struct afs_cache_vlocation); - dlen -= offsetof(struct afs_cache_vlocation, nservers); - if (dlen > bufmax) - return 0; - - memcpy(buffer, (uint8_t *)&vlocation->vldb.nservers, dlen); - - _leave(" = %u", dlen); - return dlen; -} - -/* - * check that the auxiliary data indicates that the entry is still valid - */ -static -enum fscache_checkaux afs_vlocation_cache_check_aux(void *cookie_netfs_data, - const void *buffer, - uint16_t buflen) -{ - const struct afs_cache_vlocation *cvldb; - struct afs_vlocation *vlocation = cookie_netfs_data; - uint16_t dlen; - - _enter("{%s},%p,%u", vlocation->vldb.name, buffer, buflen); - - /* check the size of the data is what we're expecting */ - dlen = sizeof(struct afs_cache_vlocation); - dlen -= offsetof(struct afs_cache_vlocation, nservers); - if (dlen != buflen) - return FSCACHE_CHECKAUX_OBSOLETE; - - cvldb = container_of(buffer, struct afs_cache_vlocation, nservers); - - /* if what's on disk is more valid than what's in memory, then use the - * VL record from the cache */ - if (!vlocation->valid || vlocation->vldb.rtime == cvldb->rtime) { - memcpy((uint8_t *)&vlocation->vldb.nservers, buffer, dlen); - vlocation->valid = 1; - _leave(" = SUCCESS [c->m]"); - return FSCACHE_CHECKAUX_OKAY; - } - - /* need to update the cache if the cached info differs */ - if (memcmp(&vlocation->vldb, buffer, dlen) != 0) { - /* delete if the volume IDs for this name differ */ - if (memcmp(&vlocation->vldb.vid, &cvldb->vid, - sizeof(cvldb->vid)) != 0 - ) { - _leave(" = OBSOLETE"); - return FSCACHE_CHECKAUX_OBSOLETE; - } - - _leave(" = UPDATE"); - return FSCACHE_CHECKAUX_NEEDS_UPDATE; - } - - _leave(" = OKAY"); - return FSCACHE_CHECKAUX_OKAY; -} - -/*****************************************************************************/ -/* - * set the key for the volume index entry - */ -static uint16_t afs_volume_cache_get_key(const void *cookie_netfs_data, - void *buffer, uint16_t bufmax) -{ - const struct afs_volume *volume = cookie_netfs_data; - uint16_t klen; - - _enter("{%u},%p,%u", volume->type, buffer, bufmax); - - klen = sizeof(volume->type); - if (klen > bufmax) - return 0; - - memcpy(buffer, &volume->type, sizeof(volume->type)); - - _leave(" = %u", klen); - return klen; - -} - -/*****************************************************************************/ -/* - * set the key for the index entry - */ -static uint16_t afs_vnode_cache_get_key(const void *cookie_netfs_data, - void *buffer, uint16_t bufmax) -{ - const struct afs_vnode *vnode = cookie_netfs_data; - uint16_t klen; - - _enter("{%x,%x,%llx},%p,%u", - vnode->fid.vnode, vnode->fid.unique, vnode->status.data_version, - buffer, bufmax); - - klen = sizeof(vnode->fid.vnode); - if (klen > bufmax) - return 0; - - memcpy(buffer, &vnode->fid.vnode, sizeof(vnode->fid.vnode)); - - _leave(" = %u", klen); - return klen; -} - -/* - * provide updated file attributes - */ -static void afs_vnode_cache_get_attr(const void *cookie_netfs_data, - uint64_t *size) -{ - const struct afs_vnode *vnode = cookie_netfs_data; - - _enter("{%x,%x,%llx},", - vnode->fid.vnode, vnode->fid.unique, - vnode->status.data_version); - - *size = vnode->status.size; -} - -/* - * provide new auxiliary cache data - */ -static uint16_t afs_vnode_cache_get_aux(const void *cookie_netfs_data, - void *buffer, uint16_t bufmax) -{ - const struct afs_vnode *vnode = cookie_netfs_data; - uint16_t dlen; - - _enter("{%x,%x,%Lx},%p,%u", - vnode->fid.vnode, vnode->fid.unique, vnode->status.data_version, - buffer, bufmax); - - dlen = sizeof(vnode->fid.unique) + sizeof(vnode->status.data_version); - if (dlen > bufmax) - return 0; - - memcpy(buffer, &vnode->fid.unique, sizeof(vnode->fid.unique)); - buffer += sizeof(vnode->fid.unique); - memcpy(buffer, &vnode->status.data_version, - sizeof(vnode->status.data_version)); - - _leave(" = %u", dlen); - return dlen; -} - -/* - * check that the auxiliary data indicates that the entry is still valid - */ -static enum fscache_checkaux afs_vnode_cache_check_aux(void *cookie_netfs_data, - const void *buffer, - uint16_t buflen) -{ - struct afs_vnode *vnode = cookie_netfs_data; - uint16_t dlen; - - _enter("{%x,%x,%llx},%p,%u", - vnode->fid.vnode, vnode->fid.unique, vnode->status.data_version, - buffer, buflen); - - /* check the size of the data is what we're expecting */ - dlen = sizeof(vnode->fid.unique) + sizeof(vnode->status.data_version); - if (dlen != buflen) { - _leave(" = OBSOLETE [len %hx != %hx]", dlen, buflen); - return FSCACHE_CHECKAUX_OBSOLETE; - } - - if (memcmp(buffer, - &vnode->fid.unique, - sizeof(vnode->fid.unique) - ) != 0) { - unsigned unique; - - memcpy(&unique, buffer, sizeof(unique)); - - _leave(" = OBSOLETE [uniq %x != %x]", - unique, vnode->fid.unique); - return FSCACHE_CHECKAUX_OBSOLETE; - } - - if (memcmp(buffer + sizeof(vnode->fid.unique), - &vnode->status.data_version, - sizeof(vnode->status.data_version) - ) != 0) { - afs_dataversion_t version; - - memcpy(&version, buffer + sizeof(vnode->fid.unique), - sizeof(version)); - - _leave(" = OBSOLETE [vers %llx != %llx]", - version, vnode->status.data_version); - return FSCACHE_CHECKAUX_OBSOLETE; - } - - _leave(" = SUCCESS"); - return FSCACHE_CHECKAUX_OKAY; -} - -/* - * indication the cookie is no longer uncached - * - this function is called when the backing store currently caching a cookie - * is removed - * - the netfs should use this to clean up any markers indicating cached pages - * - this is mandatory for any object that may have data - */ -static void afs_vnode_cache_now_uncached(void *cookie_netfs_data) -{ - struct afs_vnode *vnode = cookie_netfs_data; - struct pagevec pvec; - pgoff_t first; - int loop, nr_pages; - - _enter("{%x,%x,%Lx}", - vnode->fid.vnode, vnode->fid.unique, vnode->status.data_version); - - pagevec_init(&pvec, 0); - first = 0; - - for (;;) { - /* grab a bunch of pages to clean */ - nr_pages = pagevec_lookup(&pvec, vnode->vfs_inode.i_mapping, - first, - PAGEVEC_SIZE - pagevec_count(&pvec)); - if (!nr_pages) - break; - - for (loop = 0; loop < nr_pages; loop++) - ClearPageFsCache(pvec.pages[loop]); - - first = pvec.pages[nr_pages - 1]->index + 1; - - pvec.nr = nr_pages; - pagevec_release(&pvec); - cond_resched(); - } - - _leave(""); -} diff --git a/fs/afs/callback.c b/fs/afs/callback.c index 7ef637d7f3a5..894d2bad6b6c 100644 --- a/fs/afs/callback.c +++ b/fs/afs/callback.c @@ -20,456 +20,246 @@ #include <linux/sched.h> #include "internal.h" -#if 0 -unsigned afs_vnode_update_timeout = 10; -#endif /* 0 */ - -#define afs_breakring_space(server) \ - CIRC_SPACE((server)->cb_break_head, (server)->cb_break_tail, \ - ARRAY_SIZE((server)->cb_break)) - -//static void afs_callback_updater(struct work_struct *); - -static struct workqueue_struct *afs_callback_update_worker; - /* - * allow the fileserver to request callback state (re-)initialisation + * Handle invalidation of an mmap'd file. We invalidate all the PTEs referring + * to the pages in this file's pagecache, forcing the kernel to go through + * ->fault() or ->page_mkwrite() - at which point we can handle invalidation + * more fully. */ -void afs_init_callback_state(struct afs_server *server) +void afs_invalidate_mmap_work(struct work_struct *work) { - struct afs_vnode *vnode; + struct afs_vnode *vnode = container_of(work, struct afs_vnode, cb_work); + + unmap_mapping_pages(vnode->netfs.inode.i_mapping, 0, 0, false); +} - _enter("{%p}", server); +static void afs_volume_init_callback(struct afs_volume *volume) +{ + struct afs_vnode *vnode; - spin_lock(&server->cb_lock); + down_read(&volume->open_mmaps_lock); - /* kill all the promises on record from this server */ - while (!RB_EMPTY_ROOT(&server->cb_promises)) { - vnode = rb_entry(server->cb_promises.rb_node, - struct afs_vnode, cb_promise); - _debug("UNPROMISE { vid=%x:%u uq=%u}", - vnode->fid.vid, vnode->fid.vnode, vnode->fid.unique); - rb_erase(&vnode->cb_promise, &server->cb_promises); - vnode->cb_promised = false; + list_for_each_entry(vnode, &volume->open_mmaps, cb_mmap_link) { + if (vnode->cb_v_check != atomic_read(&volume->cb_v_break)) { + afs_clear_cb_promise(vnode, afs_cb_promise_clear_vol_init_cb); + queue_work(system_dfl_wq, &vnode->cb_work); + } } - spin_unlock(&server->cb_lock); - _leave(""); + up_read(&volume->open_mmaps_lock); } /* - * handle the data invalidation side of a callback being broken + * Allow the fileserver to request callback state (re-)initialisation. + * Unfortunately, UUIDs are not guaranteed unique. */ -void afs_broken_callback_work(struct work_struct *work) +void afs_init_callback_state(struct afs_server *server) { - struct afs_vnode *vnode = - container_of(work, struct afs_vnode, cb_broken_work); - - _enter(""); - - if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) - return; + struct afs_server_entry *se; - /* we're only interested in dealing with a broken callback on *this* - * vnode and only if no-one else has dealt with it yet */ - if (!mutex_trylock(&vnode->validate_lock)) - return; /* someone else is dealing with it */ + down_read(&server->cell->vs_lock); - if (test_bit(AFS_VNODE_CB_BROKEN, &vnode->flags)) { - if (S_ISDIR(vnode->vfs_inode.i_mode)) - afs_clear_permits(vnode); - - if (afs_vnode_fetch_status(vnode, NULL, NULL) < 0) - goto out; - - if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) - goto out; - - /* if the vnode's data version number changed then its contents - * are different */ - if (test_and_clear_bit(AFS_VNODE_ZAP_DATA, &vnode->flags)) - afs_zap_data(vnode); + list_for_each_entry(se, &server->volumes, slink) { + se->cb_expires_at = AFS_NO_CB_PROMISE; + se->volume->cb_expires_at = AFS_NO_CB_PROMISE; + trace_afs_cb_v_break(se->volume->vid, atomic_read(&se->volume->cb_v_break), + afs_cb_break_for_s_reinit); + if (!list_empty(&se->volume->open_mmaps)) + afs_volume_init_callback(se->volume); } -out: - mutex_unlock(&vnode->validate_lock); - - /* avoid the potential race whereby the mutex_trylock() in this - * function happens again between the clear_bit() and the - * mutex_unlock() */ - if (test_bit(AFS_VNODE_CB_BROKEN, &vnode->flags)) { - _debug("requeue"); - queue_work(afs_callback_update_worker, &vnode->cb_broken_work); - } - _leave(""); + up_read(&server->cell->vs_lock); } /* * actually break a callback */ -static void afs_break_callback(struct afs_server *server, - struct afs_vnode *vnode) +void __afs_break_callback(struct afs_vnode *vnode, enum afs_cb_break_reason reason) { _enter(""); - set_bit(AFS_VNODE_CB_BROKEN, &vnode->flags); - - if (vnode->cb_promised) { - spin_lock(&vnode->lock); - - _debug("break callback"); - - spin_lock(&server->cb_lock); - if (vnode->cb_promised) { - rb_erase(&vnode->cb_promise, &server->cb_promises); - vnode->cb_promised = false; - } - spin_unlock(&server->cb_lock); + clear_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags); + if (afs_clear_cb_promise(vnode, afs_cb_promise_clear_cb_break)) { + vnode->cb_break++; + vnode->cb_v_check = atomic_read(&vnode->volume->cb_v_break); + afs_clear_permits(vnode); - queue_work(afs_callback_update_worker, &vnode->cb_broken_work); - if (list_empty(&vnode->granted_locks) && - !list_empty(&vnode->pending_locks)) + if (vnode->lock_state == AFS_VNODE_LOCK_WAITING_FOR_CB) afs_lock_may_be_available(vnode); - spin_unlock(&vnode->lock); - } -} -/* - * allow the fileserver to explicitly break one callback - * - happens when - * - the backing file is changed - * - a lock is released - */ -static void afs_break_one_callback(struct afs_server *server, - struct afs_fid *fid) -{ - struct afs_vnode *vnode; - struct rb_node *p; - - _debug("find"); - spin_lock(&server->fs_lock); - p = server->fs_vnodes.rb_node; - while (p) { - vnode = rb_entry(p, struct afs_vnode, server_rb); - if (fid->vid < vnode->fid.vid) - p = p->rb_left; - else if (fid->vid > vnode->fid.vid) - p = p->rb_right; - else if (fid->vnode < vnode->fid.vnode) - p = p->rb_left; - else if (fid->vnode > vnode->fid.vnode) - p = p->rb_right; - else if (fid->unique < vnode->fid.unique) - p = p->rb_left; - else if (fid->unique > vnode->fid.unique) - p = p->rb_right; - else - goto found; - } + if (reason != afs_cb_break_for_deleted && + vnode->status.type == AFS_FTYPE_FILE && + atomic_read(&vnode->cb_nr_mmap)) + queue_work(system_dfl_wq, &vnode->cb_work); - /* not found so we just ignore it (it may have moved to another - * server) */ -not_available: - _debug("not avail"); - spin_unlock(&server->fs_lock); - _leave(""); - return; - -found: - _debug("found"); - ASSERTCMP(server, ==, vnode->server); - - if (!igrab(AFS_VNODE_TO_I(vnode))) - goto not_available; - spin_unlock(&server->fs_lock); - - afs_break_callback(server, vnode); - iput(&vnode->vfs_inode); - _leave(""); -} - -/* - * allow the fileserver to break callback promises - */ -void afs_break_callbacks(struct afs_server *server, size_t count, - struct afs_callback callbacks[]) -{ - _enter("%p,%zu,", server, count); - - ASSERT(server != NULL); - ASSERTCMP(count, <=, AFSCBMAX); - - for (; count > 0; callbacks++, count--) { - _debug("- Fid { vl=%08x n=%u u=%u } CB { v=%u x=%u t=%u }", - callbacks->fid.vid, - callbacks->fid.vnode, - callbacks->fid.unique, - callbacks->version, - callbacks->expiry, - callbacks->type - ); - afs_break_one_callback(server, &callbacks->fid); + trace_afs_cb_break(&vnode->fid, vnode->cb_break, reason, true); + } else { + trace_afs_cb_break(&vnode->fid, vnode->cb_break, reason, false); } - - _leave(""); - return; } -/* - * record the callback for breaking - * - the caller must hold server->cb_lock - */ -static void afs_do_give_up_callback(struct afs_server *server, - struct afs_vnode *vnode) +void afs_break_callback(struct afs_vnode *vnode, enum afs_cb_break_reason reason) { - struct afs_callback *cb; - - _enter("%p,%p", server, vnode); - - cb = &server->cb_break[server->cb_break_head]; - cb->fid = vnode->fid; - cb->version = vnode->cb_version; - cb->expiry = vnode->cb_expiry; - cb->type = vnode->cb_type; - smp_wmb(); - server->cb_break_head = - (server->cb_break_head + 1) & - (ARRAY_SIZE(server->cb_break) - 1); - - /* defer the breaking of callbacks to try and collect as many as - * possible to ship in one operation */ - switch (atomic_inc_return(&server->cb_break_n)) { - case 1 ... AFSCBMAX - 1: - queue_delayed_work(afs_callback_update_worker, - &server->cb_break_work, HZ * 2); - break; - case AFSCBMAX: - afs_flush_callback_breaks(server); - break; - default: - break; - } - - ASSERT(server->cb_promises.rb_node != NULL); - rb_erase(&vnode->cb_promise, &server->cb_promises); - vnode->cb_promised = false; - _leave(""); + write_seqlock(&vnode->cb_lock); + __afs_break_callback(vnode, reason); + write_sequnlock(&vnode->cb_lock); } /* - * discard the callback on a deleted item + * Look up a volume by volume ID under RCU conditions. */ -void afs_discard_callback_on_delete(struct afs_vnode *vnode) +static struct afs_volume *afs_lookup_volume_rcu(struct afs_cell *cell, + afs_volid_t vid) { - struct afs_server *server = vnode->server; + struct afs_volume *volume = NULL; + struct rb_node *p; + int seq = 1; - _enter("%d", vnode->cb_promised); + for (;;) { + /* Unfortunately, rbtree walking doesn't give reliable results + * under just the RCU read lock, so we have to check for + * changes. + */ + seq++; /* 2 on the 1st/lockless path, otherwise odd */ + read_seqbegin_or_lock(&cell->volume_lock, &seq); + + p = rcu_dereference_raw(cell->volumes.rb_node); + while (p) { + volume = rb_entry(p, struct afs_volume, cell_node); + + if (volume->vid < vid) + p = rcu_dereference_raw(p->rb_left); + else if (volume->vid > vid) + p = rcu_dereference_raw(p->rb_right); + else + break; + volume = NULL; + } - if (!vnode->cb_promised) { - _leave(" [not promised]"); - return; + if (volume && afs_try_get_volume(volume, afs_volume_trace_get_callback)) + break; + if (!need_seqretry(&cell->volume_lock, seq)) + break; + seq |= 1; /* Want a lock next time */ } - ASSERT(server != NULL); - - spin_lock(&server->cb_lock); - if (vnode->cb_promised) { - ASSERT(server->cb_promises.rb_node != NULL); - rb_erase(&vnode->cb_promise, &server->cb_promises); - vnode->cb_promised = false; - } - spin_unlock(&server->cb_lock); - _leave(""); + done_seqretry(&cell->volume_lock, seq); + return volume; } /* - * give up the callback registered for a vnode on the file server when the - * inode is being cleared + * Allow the fileserver to break callbacks at the volume-level. This is + * typically done when, for example, a R/W volume is snapshotted to a R/O + * volume (the only way to change an R/O volume). It may also, however, happen + * when a volserver takes control of a volume (offlining it, moving it, etc.). + * + * Every file in that volume will need to be reevaluated. */ -void afs_give_up_callback(struct afs_vnode *vnode) +static void afs_break_volume_callback(struct afs_server *server, + struct afs_volume *volume) + __releases(RCU) { - struct afs_server *server = vnode->server; - - DECLARE_WAITQUEUE(myself, current); - - _enter("%d", vnode->cb_promised); - - _debug("GIVE UP INODE %p", &vnode->vfs_inode); + struct afs_server_list *slist = rcu_dereference(volume->servers); + unsigned int i, cb_v_break; - if (!vnode->cb_promised) { - _leave(" [not promised]"); - return; - } + write_lock(&volume->cb_v_break_lock); - ASSERT(server != NULL); + for (i = 0; i < slist->nr_servers; i++) + if (slist->servers[i].server == server) + slist->servers[i].cb_expires_at = AFS_NO_CB_PROMISE; + volume->cb_expires_at = AFS_NO_CB_PROMISE; - spin_lock(&server->cb_lock); - if (vnode->cb_promised && afs_breakring_space(server) == 0) { - add_wait_queue(&server->cb_break_waitq, &myself); - for (;;) { - set_current_state(TASK_UNINTERRUPTIBLE); - if (!vnode->cb_promised || - afs_breakring_space(server) != 0) - break; - spin_unlock(&server->cb_lock); - schedule(); - spin_lock(&server->cb_lock); - } - remove_wait_queue(&server->cb_break_waitq, &myself); - __set_current_state(TASK_RUNNING); - } + cb_v_break = atomic_inc_return_release(&volume->cb_v_break); + trace_afs_cb_v_break(volume->vid, cb_v_break, afs_cb_break_for_volume_callback); - /* of course, it's always possible for the server to break this vnode's - * callback first... */ - if (vnode->cb_promised) - afs_do_give_up_callback(server, vnode); + write_unlock(&volume->cb_v_break_lock); + rcu_read_unlock(); - spin_unlock(&server->cb_lock); - _leave(""); + if (!list_empty(&volume->open_mmaps)) + afs_volume_init_callback(volume); } /* - * dispatch a deferred give up callbacks operation + * allow the fileserver to explicitly break one callback + * - happens when + * - the backing file is changed + * - a lock is released */ -void afs_dispatch_give_up_callbacks(struct work_struct *work) +static void afs_break_one_callback(struct afs_server *server, + struct afs_volume *volume, + struct afs_fid *fid) { - struct afs_server *server = - container_of(work, struct afs_server, cb_break_work.work); - - _enter(""); + struct super_block *sb; + struct afs_vnode *vnode; + struct inode *inode; - /* tell the fileserver to discard the callback promises it has - * - in the event of ENOMEM or some other error, we just forget that we - * had callbacks entirely, and the server will call us later to break - * them + /* See if we can find a matching inode - even an I_NEW inode needs to + * be marked as it can have its callback broken before we finish + * setting up the local inode. */ - afs_fs_give_up_callbacks(server, &afs_async_call); -} + sb = rcu_dereference(volume->sb); + if (!sb) + return; -/* - * flush the outstanding callback breaks on a server - */ -void afs_flush_callback_breaks(struct afs_server *server) -{ - mod_delayed_work(afs_callback_update_worker, &server->cb_break_work, 0); + inode = find_inode_rcu(sb, fid->vnode, afs_ilookup5_test_by_fid, fid); + if (inode) { + vnode = AFS_FS_I(inode); + afs_break_callback(vnode, afs_cb_break_for_callback); + } else { + trace_afs_cb_miss(fid, afs_cb_break_for_callback); + } } -#if 0 -/* - * update a bunch of callbacks - */ -static void afs_callback_updater(struct work_struct *work) +static void afs_break_some_callbacks(struct afs_server *server, + struct afs_callback_break *cbb, + size_t *_count) { - struct afs_server *server; - struct afs_vnode *vnode, *xvnode; - time_t now; - long timeout; - int ret; - - server = container_of(work, struct afs_server, updater); - - _enter(""); - - now = get_seconds(); + struct afs_callback_break *residue = cbb; + struct afs_volume *volume; + afs_volid_t vid = cbb->fid.vid; + size_t i; + + rcu_read_lock(); + volume = afs_lookup_volume_rcu(server->cell, vid); + if (cbb->fid.vnode == 0 && cbb->fid.unique == 0) { + afs_break_volume_callback(server, volume); + *_count -= 1; + if (*_count) + memmove(cbb, cbb + 1, sizeof(*cbb) * *_count); + } else { + /* TODO: Find all matching volumes if we couldn't match the server and + * break them anyway. + */ - /* find the first vnode to update */ - spin_lock(&server->cb_lock); - for (;;) { - if (RB_EMPTY_ROOT(&server->cb_promises)) { - spin_unlock(&server->cb_lock); - _leave(" [nothing]"); - return; + for (i = *_count; i > 0; cbb++, i--) { + if (cbb->fid.vid == vid) { + _debug("- Fid { vl=%08llx n=%llu u=%u }", + cbb->fid.vid, + cbb->fid.vnode, + cbb->fid.unique); + --*_count; + if (volume) + afs_break_one_callback(server, volume, &cbb->fid); + } else { + *residue++ = *cbb; + } } - - vnode = rb_entry(rb_first(&server->cb_promises), - struct afs_vnode, cb_promise); - if (atomic_read(&vnode->usage) > 0) - break; - rb_erase(&vnode->cb_promise, &server->cb_promises); - vnode->cb_promised = false; - } - - timeout = vnode->update_at - now; - if (timeout > 0) { - queue_delayed_work(afs_vnode_update_worker, - &afs_vnode_update, timeout * HZ); - spin_unlock(&server->cb_lock); - _leave(" [nothing]"); - return; - } - - list_del_init(&vnode->update); - atomic_inc(&vnode->usage); - spin_unlock(&server->cb_lock); - - /* we can now perform the update */ - _debug("update %s", vnode->vldb.name); - vnode->state = AFS_VL_UPDATING; - vnode->upd_rej_cnt = 0; - vnode->upd_busy_cnt = 0; - - ret = afs_vnode_update_record(vl, &vldb); - switch (ret) { - case 0: - afs_vnode_apply_update(vl, &vldb); - vnode->state = AFS_VL_UPDATING; - break; - case -ENOMEDIUM: - vnode->state = AFS_VL_VOLUME_DELETED; - break; - default: - vnode->state = AFS_VL_UNCERTAIN; - break; + rcu_read_unlock(); } - /* and then reschedule */ - _debug("reschedule"); - vnode->update_at = get_seconds() + afs_vnode_update_timeout; - - spin_lock(&server->cb_lock); - - if (!list_empty(&server->cb_promises)) { - /* next update in 10 minutes, but wait at least 1 second more - * than the newest record already queued so that we don't spam - * the VL server suddenly with lots of requests - */ - xvnode = list_entry(server->cb_promises.prev, - struct afs_vnode, update); - if (vnode->update_at <= xvnode->update_at) - vnode->update_at = xvnode->update_at + 1; - xvnode = list_entry(server->cb_promises.next, - struct afs_vnode, update); - timeout = xvnode->update_at - now; - if (timeout < 0) - timeout = 0; - } else { - timeout = afs_vnode_update_timeout; - } - - list_add_tail(&vnode->update, &server->cb_promises); - - _debug("timeout %ld", timeout); - queue_delayed_work(afs_vnode_update_worker, - &afs_vnode_update, timeout * HZ); - spin_unlock(&server->cb_lock); - afs_put_vnode(vl); + afs_put_volume(volume, afs_volume_trace_put_callback); } -#endif /* - * initialise the callback update process + * allow the fileserver to break callback promises */ -int __init afs_callback_update_init(void) +void afs_break_callbacks(struct afs_server *server, size_t count, + struct afs_callback_break *callbacks) { - afs_callback_update_worker = - create_singlethread_workqueue("kafs_callbackd"); - return afs_callback_update_worker ? 0 : -ENOMEM; -} + _enter("%p,%zu,", server, count); -/* - * shut down the callback update process - */ -void afs_callback_update_kill(void) -{ - destroy_workqueue(afs_callback_update_worker); + ASSERT(server != NULL); + + while (count > 0) + afs_break_some_callbacks(server, callbacks, &count); } diff --git a/fs/afs/cell.c b/fs/afs/cell.c index 3c090b7555ea..71c10a05cebe 100644 --- a/fs/afs/cell.c +++ b/fs/afs/cell.c @@ -1,221 +1,403 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* AFS cell and server record management * - * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved. + * Copyright (C) 2002, 2017 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. */ -#include <linux/module.h> #include <linux/slab.h> #include <linux/key.h> #include <linux/ctype.h> #include <linux/dns_resolver.h> #include <linux/sched.h> +#include <linux/inet.h> +#include <linux/namei.h> #include <keys/rxrpc-type.h> #include "internal.h" -DECLARE_RWSEM(afs_proc_cells_sem); -LIST_HEAD(afs_proc_cells); +static unsigned __read_mostly afs_cell_gc_delay = 10; +static unsigned __read_mostly afs_cell_min_ttl = 10 * 60; +static unsigned __read_mostly afs_cell_max_ttl = 24 * 60 * 60; +static atomic_t cell_debug_id; + +static void afs_cell_timer(struct timer_list *timer); +static void afs_destroy_cell_work(struct work_struct *work); +static void afs_manage_cell_work(struct work_struct *work); + +static void afs_dec_cells_outstanding(struct afs_net *net) +{ + if (atomic_dec_and_test(&net->cells_outstanding)) + wake_up_var(&net->cells_outstanding); +} -static LIST_HEAD(afs_cells); -static DEFINE_RWLOCK(afs_cells_lock); -static DECLARE_RWSEM(afs_cells_sem); /* add/remove serialisation */ -static DECLARE_WAIT_QUEUE_HEAD(afs_cells_freeable_wq); -static struct afs_cell *afs_cell_root; +static void afs_set_cell_state(struct afs_cell *cell, enum afs_cell_state state) +{ + smp_store_release(&cell->state, state); /* Commit cell changes before state */ + smp_wmb(); /* Set cell state before task state */ + wake_up_var(&cell->state); +} /* - * allocate a cell record and fill in its name, VL server address list and - * allocate an anonymous key + * Look up and get an activation reference on a cell record. The caller must + * hold net->cells_lock at least read-locked. */ -static struct afs_cell *afs_cell_alloc(const char *name, unsigned namelen, - char *vllist) +static struct afs_cell *afs_find_cell_locked(struct afs_net *net, + const char *name, unsigned int namesz, + enum afs_cell_trace reason) +{ + struct afs_cell *cell = NULL; + struct rb_node *p; + int n; + + _enter("%*.*s", namesz, namesz, name); + + if (name && namesz == 0) + return ERR_PTR(-EINVAL); + if (namesz > AFS_MAXCELLNAME) + return ERR_PTR(-ENAMETOOLONG); + + if (!name) { + cell = rcu_dereference_protected(net->ws_cell, + lockdep_is_held(&net->cells_lock)); + if (!cell) + return ERR_PTR(-EDESTADDRREQ); + goto found; + } + + p = net->cells.rb_node; + while (p) { + cell = rb_entry(p, struct afs_cell, net_node); + + n = strncasecmp(cell->name, name, + min_t(size_t, cell->name_len, namesz)); + if (n == 0) + n = cell->name_len - namesz; + if (n < 0) + p = p->rb_left; + else if (n > 0) + p = p->rb_right; + else + goto found; + } + + return ERR_PTR(-ENOENT); + +found: + return afs_use_cell(cell, reason); +} + +/* + * Look up and get an activation reference on a cell record. + */ +struct afs_cell *afs_find_cell(struct afs_net *net, + const char *name, unsigned int namesz, + enum afs_cell_trace reason) { struct afs_cell *cell; - struct key *key; - char keyname[4 + AFS_MAXCELLNAME + 1], *cp, *dp, *next; - char *dvllist = NULL, *_vllist = NULL; - char delimiter = ':'; - int ret; - _enter("%*.*s,%s", namelen, namelen, name ?: "", vllist); + down_read(&net->cells_lock); + cell = afs_find_cell_locked(net, name, namesz, reason); + up_read(&net->cells_lock); + return cell; +} - BUG_ON(!name); /* TODO: want to look up "this cell" in the cache */ +/* + * Set up a cell record and fill in its name, VL server address list and + * allocate an anonymous key + */ +static struct afs_cell *afs_alloc_cell(struct afs_net *net, + const char *name, unsigned int namelen, + const char *addresses) +{ + struct afs_vlserver_list *vllist = NULL; + struct afs_cell *cell; + int i, ret; + ASSERT(name); + if (namelen == 0) + return ERR_PTR(-EINVAL); if (namelen > AFS_MAXCELLNAME) { _leave(" = -ENAMETOOLONG"); return ERR_PTR(-ENAMETOOLONG); } - /* allocate and initialise a cell record */ - cell = kzalloc(sizeof(struct afs_cell) + namelen + 1, GFP_KERNEL); + /* Prohibit cell names that contain unprintable chars, '/' and '@' or + * that begin with a dot. This also precludes "@cell". + */ + if (name[0] == '.') + return ERR_PTR(-EINVAL); + for (i = 0; i < namelen; i++) { + char ch = name[i]; + if (!isprint(ch) || ch == '/' || ch == '@') + return ERR_PTR(-EINVAL); + } + + _enter("%*.*s,%s", namelen, namelen, name, addresses); + + cell = kzalloc(sizeof(struct afs_cell), GFP_KERNEL); if (!cell) { _leave(" = -ENOMEM"); return ERR_PTR(-ENOMEM); } - memcpy(cell->name, name, namelen); - cell->name[namelen] = 0; - - atomic_set(&cell->usage, 1); - INIT_LIST_HEAD(&cell->link); - rwlock_init(&cell->servers_lock); - INIT_LIST_HEAD(&cell->servers); - init_rwsem(&cell->vl_sem); - INIT_LIST_HEAD(&cell->vl_list); - spin_lock_init(&cell->vl_lock); - - /* if the ip address is invalid, try dns query */ - if (!vllist || strlen(vllist) < 7) { - ret = dns_query("afsdb", name, namelen, "ipv4", &dvllist, NULL); - if (ret < 0) { - if (ret == -ENODATA || ret == -EAGAIN || ret == -ENOKEY) - /* translate these errors into something - * userspace might understand */ - ret = -EDESTADDRREQ; - _leave(" = %d", ret); - return ERR_PTR(ret); - } - _vllist = dvllist; + /* Allocate the cell name and the key name in one go. */ + cell->name = kmalloc(1 + namelen + 1 + + 4 + namelen + 1, GFP_KERNEL); + if (!cell->name) { + kfree(cell); + return ERR_PTR(-ENOMEM); + } - /* change the delimiter for user-space reply */ - delimiter = ','; + cell->name[0] = '.'; + cell->name++; + cell->name_len = namelen; + for (i = 0; i < namelen; i++) + cell->name[i] = tolower(name[i]); + cell->name[i++] = 0; + + cell->key_desc = cell->name + i; + memcpy(cell->key_desc, "afs@", 4); + memcpy(cell->key_desc + 4, cell->name, cell->name_len + 1); + + cell->net = net; + refcount_set(&cell->ref, 1); + atomic_set(&cell->active, 0); + INIT_WORK(&cell->destroyer, afs_destroy_cell_work); + INIT_WORK(&cell->manager, afs_manage_cell_work); + timer_setup(&cell->management_timer, afs_cell_timer, 0); + init_rwsem(&cell->vs_lock); + cell->volumes = RB_ROOT; + INIT_HLIST_HEAD(&cell->proc_volumes); + seqlock_init(&cell->volume_lock); + cell->fs_servers = RB_ROOT; + init_rwsem(&cell->fs_lock); + rwlock_init(&cell->vl_servers_lock); + cell->flags = (1 << AFS_CELL_FL_CHECK_ALIAS); + + /* Provide a VL server list, filling it in if we were given a list of + * addresses to use. + */ + if (addresses) { + vllist = afs_parse_text_addrs(net, + addresses, strlen(addresses), ':', + VL_SERVICE, AFS_VL_PORT); + if (IS_ERR(vllist)) { + ret = PTR_ERR(vllist); + vllist = NULL; + goto parse_failed; + } + vllist->source = DNS_RECORD_FROM_CONFIG; + vllist->status = DNS_LOOKUP_NOT_DONE; + cell->dns_expiry = TIME64_MAX; } else { - _vllist = vllist; + ret = -ENOMEM; + vllist = afs_alloc_vlserver_list(0); + if (!vllist) + goto error; + vllist->source = DNS_RECORD_UNAVAILABLE; + vllist->status = DNS_LOOKUP_NOT_DONE; + cell->dns_expiry = ktime_get_real_seconds(); } - /* fill in the VL server list from the rest of the string */ - do { - unsigned a, b, c, d; - - next = strchr(_vllist, delimiter); - if (next) - *next++ = 0; - - if (sscanf(_vllist, "%u.%u.%u.%u", &a, &b, &c, &d) != 4) - goto bad_address; - - if (a > 255 || b > 255 || c > 255 || d > 255) - goto bad_address; - - cell->vl_addrs[cell->vl_naddrs++].s_addr = - htonl((a << 24) | (b << 16) | (c << 8) | d); + rcu_assign_pointer(cell->vl_servers, vllist); - } while (cell->vl_naddrs < AFS_CELL_MAX_ADDRS && (_vllist = next)); - - /* create a key to represent an anonymous user */ - memcpy(keyname, "afs@", 4); - dp = keyname + 4; - cp = cell->name; - do { - *dp++ = toupper(*cp); - } while (*cp++); - - key = rxrpc_get_null_key(keyname); - if (IS_ERR(key)) { - _debug("no key"); - ret = PTR_ERR(key); + cell->dns_source = vllist->source; + cell->dns_status = vllist->status; + smp_store_release(&cell->dns_lookup_count, 1); /* vs source/status */ + atomic_inc(&net->cells_outstanding); + ret = idr_alloc_cyclic(&net->cells_dyn_ino, cell, + 2, INT_MAX / 2, GFP_KERNEL); + if (ret < 0) goto error; - } - cell->anonymous_key = key; + cell->dynroot_ino = ret; + cell->debug_id = atomic_inc_return(&cell_debug_id); - _debug("anon key %p{%x}", - cell->anonymous_key, key_serial(cell->anonymous_key)); + trace_afs_cell(cell->debug_id, 1, 0, afs_cell_trace_alloc); _leave(" = %p", cell); return cell; -bad_address: - printk(KERN_ERR "kAFS: bad VL server IP address\n"); - ret = -EINVAL; +parse_failed: + if (ret == -EINVAL) + printk(KERN_ERR "kAFS: bad VL server IP address\n"); error: - key_put(cell->anonymous_key); - kfree(dvllist); + afs_put_vlserverlist(cell->net, vllist); + kfree(cell->name - 1); kfree(cell); _leave(" = %d", ret); return ERR_PTR(ret); } /* - * afs_cell_crate() - create a cell record - * @name: is the name of the cell. - * @namsesz: is the strlen of the cell name. - * @vllist: is a colon separated list of IP addresses in "a.b.c.d" format. - * @retref: is T to return the cell reference when the cell exists. + * afs_lookup_cell - Look up or create a cell record. + * @net: The network namespace + * @name: The name of the cell. + * @namesz: The strlen of the cell name. + * @vllist: A colon/comma separated list of numeric IP addresses or NULL. + * @reason: The reason we're doing the lookup + * @trace: The reason to be logged if the lookup is successful. + * + * Look up a cell record by name and query the DNS for VL server addresses if + * needed. Note that that actual DNS query is punted off to the manager thread + * so that this function can return immediately if interrupted whilst allowing + * cell records to be shared even if not yet fully constructed. */ -struct afs_cell *afs_cell_create(const char *name, unsigned namesz, - char *vllist, bool retref) +struct afs_cell *afs_lookup_cell(struct afs_net *net, + const char *name, unsigned int namesz, + const char *vllist, + enum afs_lookup_cell_for reason, + enum afs_cell_trace trace) { - struct afs_cell *cell; - int ret; - - _enter("%*.*s,%s", namesz, namesz, name ?: "", vllist); - - down_write(&afs_cells_sem); - read_lock(&afs_cells_lock); - list_for_each_entry(cell, &afs_cells, link) { - if (strncasecmp(cell->name, name, namesz) == 0) - goto duplicate_name; + struct afs_cell *cell, *candidate, *cursor; + struct rb_node *parent, **pp; + enum afs_cell_state state; + int ret, n; + + _enter("%s,%s,%u", name, vllist, reason); + + if (reason != AFS_LOOKUP_CELL_PRELOAD) { + cell = afs_find_cell(net, name, namesz, trace); + if (!IS_ERR(cell)) { + if (reason == AFS_LOOKUP_CELL_DYNROOT) + goto no_wait; + if (cell->state == AFS_CELL_SETTING_UP || + cell->state == AFS_CELL_UNLOOKED) + goto lookup_cell; + goto wait_for_cell; + } } - read_unlock(&afs_cells_lock); - cell = afs_cell_alloc(name, namesz, vllist); - if (IS_ERR(cell)) { - _leave(" = %ld", PTR_ERR(cell)); - up_write(&afs_cells_sem); - return cell; + /* Assume we're probably going to create a cell and preallocate and + * mostly set up a candidate record. We can then use this to stash the + * name, the net namespace and VL server addresses. + * + * We also want to do this before we hold any locks as it may involve + * upcalling to userspace to make DNS queries. + */ + candidate = afs_alloc_cell(net, name, namesz, vllist); + if (IS_ERR(candidate)) { + _leave(" = %ld", PTR_ERR(candidate)); + return candidate; } - /* add a proc directory for this cell */ - ret = afs_proc_cell_setup(cell); - if (ret < 0) - goto error; + /* Find the insertion point and check to see if someone else added a + * cell whilst we were allocating. + */ + down_write(&net->cells_lock); + + pp = &net->cells.rb_node; + parent = NULL; + while (*pp) { + parent = *pp; + cursor = rb_entry(parent, struct afs_cell, net_node); + + n = strncasecmp(cursor->name, name, + min_t(size_t, cursor->name_len, namesz)); + if (n == 0) + n = cursor->name_len - namesz; + if (n < 0) + pp = &(*pp)->rb_left; + else if (n > 0) + pp = &(*pp)->rb_right; + else + goto cell_already_exists; + } -#ifdef CONFIG_AFS_FSCACHE - /* put it up for caching (this never returns an error) */ - cell->cache = fscache_acquire_cookie(afs_cache_netfs.primary_index, - &afs_cell_cache_index_def, - cell); -#endif + cell = candidate; + candidate = NULL; + afs_use_cell(cell, trace); + rb_link_node_rcu(&cell->net_node, parent, pp); + rb_insert_color(&cell->net_node, &net->cells); + up_write(&net->cells_lock); + +lookup_cell: + if (reason != AFS_LOOKUP_CELL_PRELOAD && + reason != AFS_LOOKUP_CELL_ROOTCELL) { + set_bit(AFS_CELL_FL_DO_LOOKUP, &cell->flags); + afs_queue_cell(cell, afs_cell_trace_queue_new); + } - /* add to the cell lists */ - write_lock(&afs_cells_lock); - list_add_tail(&cell->link, &afs_cells); - write_unlock(&afs_cells_lock); +wait_for_cell: + state = smp_load_acquire(&cell->state); /* vs error */ + switch (state) { + case AFS_CELL_ACTIVE: + case AFS_CELL_DEAD: + break; + case AFS_CELL_UNLOOKED: + default: + if (reason == AFS_LOOKUP_CELL_PRELOAD || + reason == AFS_LOOKUP_CELL_ROOTCELL) + break; + _debug("wait_for_cell"); + afs_see_cell(cell, afs_cell_trace_wait); + wait_var_event(&cell->state, + ({ + state = smp_load_acquire(&cell->state); /* vs error */ + state == AFS_CELL_ACTIVE || state == AFS_CELL_DEAD; + })); + _debug("waited_for_cell %d %d", cell->state, cell->error); + } - down_write(&afs_proc_cells_sem); - list_add_tail(&cell->proc_link, &afs_proc_cells); - up_write(&afs_proc_cells_sem); - up_write(&afs_cells_sem); +no_wait: + /* Check the state obtained from the wait check. */ + state = smp_load_acquire(&cell->state); /* vs error */ + if (state == AFS_CELL_DEAD) { + ret = cell->error; + goto error; + } + if (state == AFS_CELL_ACTIVE) { + switch (cell->dns_status) { + case DNS_LOOKUP_NOT_DONE: + if (cell->dns_source == DNS_RECORD_FROM_CONFIG) { + ret = 0; + break; + } + fallthrough; + default: + ret = -EIO; + goto error; + case DNS_LOOKUP_GOOD: + case DNS_LOOKUP_GOOD_WITH_BAD: + ret = 0; + break; + case DNS_LOOKUP_GOT_NOT_FOUND: + ret = -ENOENT; + goto error; + case DNS_LOOKUP_BAD: + ret = -EREMOTEIO; + goto error; + case DNS_LOOKUP_GOT_LOCAL_FAILURE: + case DNS_LOOKUP_GOT_TEMP_FAILURE: + case DNS_LOOKUP_GOT_NS_FAILURE: + ret = -EDESTADDRREQ; + goto error; + } + } - _leave(" = %p", cell); + _leave(" = %p [cell]", cell); return cell; +cell_already_exists: + _debug("cell exists"); + cell = cursor; + if (reason == AFS_LOOKUP_CELL_PRELOAD) { + ret = -EEXIST; + } else { + afs_use_cell(cursor, trace); + ret = 0; + } + up_write(&net->cells_lock); + if (candidate) + afs_put_cell(candidate, afs_cell_trace_put_candidate); + if (ret == 0) + goto wait_for_cell; + goto error_noput; error: - up_write(&afs_cells_sem); - key_put(cell->anonymous_key); - kfree(cell); - _leave(" = %d", ret); + afs_unuse_cell(cell, afs_cell_trace_unuse_lookup_error); +error_noput: + _leave(" = %d [error]", ret); return ERR_PTR(ret); - -duplicate_name: - if (retref && !IS_ERR(cell)) - afs_get_cell(cell); - - read_unlock(&afs_cells_lock); - up_write(&afs_cells_sem); - - if (retref) { - _leave(" = %p", cell); - return cell; - } - - _leave(" = -EEXIST"); - return ERR_PTR(-EEXIST); } /* @@ -223,10 +405,11 @@ duplicate_name: * - can be called with a module parameter string * - can be called from a write to /proc/fs/afs/rootcell */ -int afs_cell_init(char *rootcell) +int afs_cell_init(struct afs_net *net, const char *rootcell) { struct afs_cell *old_root, *new_root; - char *cp; + const char *cp, *vllist; + size_t len; _enter(""); @@ -239,222 +422,503 @@ int afs_cell_init(char *rootcell) } cp = strchr(rootcell, ':'); - if (!cp) + if (!cp) { _debug("kAFS: no VL server IP addresses specified"); - else - *cp++ = 0; + vllist = NULL; + len = strlen(rootcell); + } else { + vllist = cp + 1; + len = cp - rootcell; + } - /* allocate a cell record for the root cell */ - new_root = afs_cell_create(rootcell, strlen(rootcell), cp, false); + if (len == 0 || !rootcell[0] || rootcell[0] == '.' || rootcell[len - 1] == '.') + return -EINVAL; + if (memchr(rootcell, '/', len)) + return -EINVAL; + cp = strstr(rootcell, ".."); + if (cp && cp < rootcell + len) + return -EINVAL; + + /* allocate a cell record for the root/workstation cell */ + new_root = afs_lookup_cell(net, rootcell, len, vllist, + AFS_LOOKUP_CELL_ROOTCELL, + afs_cell_trace_use_lookup_ws); if (IS_ERR(new_root)) { _leave(" = %ld", PTR_ERR(new_root)); return PTR_ERR(new_root); } + if (!test_and_set_bit(AFS_CELL_FL_NO_GC, &new_root->flags)) + afs_use_cell(new_root, afs_cell_trace_use_pin); + /* install the new cell */ - write_lock(&afs_cells_lock); - old_root = afs_cell_root; - afs_cell_root = new_root; - write_unlock(&afs_cells_lock); - afs_put_cell(old_root); + down_write(&net->cells_lock); + old_root = rcu_replace_pointer(net->ws_cell, new_root, + lockdep_is_held(&net->cells_lock)); + up_write(&net->cells_lock); + afs_unuse_cell(old_root, afs_cell_trace_unuse_ws); _leave(" = 0"); return 0; } /* - * lookup a cell record + * Update a cell's VL server address list from the DNS. */ -struct afs_cell *afs_cell_lookup(const char *name, unsigned namesz, - bool dns_cell) +static int afs_update_cell(struct afs_cell *cell) { - struct afs_cell *cell; - - _enter("\"%*.*s\",", namesz, namesz, name ?: ""); - - down_read(&afs_cells_sem); - read_lock(&afs_cells_lock); + struct afs_vlserver_list *vllist, *old = NULL, *p; + unsigned int min_ttl = READ_ONCE(afs_cell_min_ttl); + unsigned int max_ttl = READ_ONCE(afs_cell_max_ttl); + time64_t now, expiry = 0; + int ret = 0; + + _enter("%s", cell->name); + + vllist = afs_dns_query(cell, &expiry); + if (IS_ERR(vllist)) { + ret = PTR_ERR(vllist); + + _debug("%s: fail %d", cell->name, ret); + if (ret == -ENOMEM) + goto out_wake; + + vllist = afs_alloc_vlserver_list(0); + if (!vllist) { + if (ret >= 0) + ret = -ENOMEM; + goto out_wake; + } - if (name) { - /* if the cell was named, look for it in the cell record list */ - list_for_each_entry(cell, &afs_cells, link) { - if (strncmp(cell->name, name, namesz) == 0) { - afs_get_cell(cell); - goto found; - } + switch (ret) { + case -ENODATA: + case -EDESTADDRREQ: + vllist->status = DNS_LOOKUP_GOT_NOT_FOUND; + break; + case -EAGAIN: + case -ECONNREFUSED: + vllist->status = DNS_LOOKUP_GOT_TEMP_FAILURE; + break; + default: + vllist->status = DNS_LOOKUP_GOT_LOCAL_FAILURE; + break; } - cell = ERR_PTR(-ENOENT); - if (dns_cell) - goto create_cell; - found: - ; - } else { - cell = afs_cell_root; - if (!cell) { - /* this should not happen unless user tries to mount - * when root cell is not set. Return an impossibly - * bizarre errno to alert the user. Things like - * ENOENT might be "more appropriate" but they happen - * for other reasons. + } + + _debug("%s: got list %d %d", cell->name, vllist->source, vllist->status); + cell->dns_status = vllist->status; + + now = ktime_get_real_seconds(); + if (min_ttl > max_ttl) + max_ttl = min_ttl; + if (expiry < now + min_ttl) + expiry = now + min_ttl; + else if (expiry > now + max_ttl) + expiry = now + max_ttl; + + _debug("%s: status %d", cell->name, vllist->status); + if (vllist->source == DNS_RECORD_UNAVAILABLE) { + switch (vllist->status) { + case DNS_LOOKUP_GOT_NOT_FOUND: + /* The DNS said that the cell does not exist or there + * weren't any addresses to be had. */ - cell = ERR_PTR(-EDESTADDRREQ); - } else { - afs_get_cell(cell); + cell->dns_expiry = expiry; + break; + + case DNS_LOOKUP_BAD: + case DNS_LOOKUP_GOT_LOCAL_FAILURE: + case DNS_LOOKUP_GOT_TEMP_FAILURE: + case DNS_LOOKUP_GOT_NS_FAILURE: + default: + cell->dns_expiry = now + 10; + break; } + } else { + cell->dns_expiry = expiry; + } + /* Replace the VL server list if the new record has servers or the old + * record doesn't. + */ + write_lock(&cell->vl_servers_lock); + p = rcu_dereference_protected(cell->vl_servers, true); + if (vllist->nr_servers > 0 || p->nr_servers == 0) { + rcu_assign_pointer(cell->vl_servers, vllist); + cell->dns_source = vllist->source; + old = p; } + write_unlock(&cell->vl_servers_lock); + afs_put_vlserverlist(cell->net, old); - read_unlock(&afs_cells_lock); - up_read(&afs_cells_sem); - _leave(" = %p", cell); - return cell; +out_wake: + smp_store_release(&cell->dns_lookup_count, + cell->dns_lookup_count + 1); /* vs source/status */ + wake_up_var(&cell->dns_lookup_count); + _leave(" = %d", ret); + return ret; +} -create_cell: - read_unlock(&afs_cells_lock); - up_read(&afs_cells_sem); +/* + * Destroy a cell record + */ +static void afs_cell_destroy(struct rcu_head *rcu) +{ + struct afs_cell *cell = container_of(rcu, struct afs_cell, rcu); + struct afs_net *net = cell->net; + int r; - cell = afs_cell_create(name, namesz, NULL, true); + _enter("%p{%s}", cell, cell->name); - _leave(" = %p", cell); + r = refcount_read(&cell->ref); + ASSERTCMP(r, ==, 0); + trace_afs_cell(cell->debug_id, r, atomic_read(&cell->active), afs_cell_trace_free); + + afs_put_vlserverlist(net, rcu_access_pointer(cell->vl_servers)); + afs_unuse_cell(cell->alias_of, afs_cell_trace_unuse_alias); + key_put(cell->anonymous_key); + idr_remove(&net->cells_dyn_ino, cell->dynroot_ino); + kfree(cell->name - 1); + kfree(cell); + + afs_dec_cells_outstanding(net); + _leave(" [destroyed]"); +} + +static void afs_destroy_cell_work(struct work_struct *work) +{ + struct afs_cell *cell = container_of(work, struct afs_cell, destroyer); + + afs_see_cell(cell, afs_cell_trace_destroy); + timer_delete_sync(&cell->management_timer); + cancel_work_sync(&cell->manager); + call_rcu(&cell->rcu, afs_cell_destroy); +} + +/* + * Get a reference on a cell record. + */ +struct afs_cell *afs_get_cell(struct afs_cell *cell, enum afs_cell_trace reason) +{ + int r; + + __refcount_inc(&cell->ref, &r); + trace_afs_cell(cell->debug_id, r + 1, atomic_read(&cell->active), reason); return cell; } -#if 0 /* - * try and get a cell record + * Drop a reference on a cell record. */ -struct afs_cell *afs_get_cell_maybe(struct afs_cell *cell) +void afs_put_cell(struct afs_cell *cell, enum afs_cell_trace reason) { - write_lock(&afs_cells_lock); + if (cell) { + unsigned int debug_id = cell->debug_id; + unsigned int a; + bool zero; + int r; + + a = atomic_read(&cell->active); + zero = __refcount_dec_and_test(&cell->ref, &r); + trace_afs_cell(debug_id, r - 1, a, reason); + if (zero) { + a = atomic_read(&cell->active); + WARN(a != 0, "Cell active count %u > 0\n", a); + WARN_ON(!queue_work(afs_wq, &cell->destroyer)); + } + } +} - if (cell && !list_empty(&cell->link)) - afs_get_cell(cell); - else - cell = NULL; +/* + * Note a cell becoming more active. + */ +struct afs_cell *afs_use_cell(struct afs_cell *cell, enum afs_cell_trace reason) +{ + int r, a; - write_unlock(&afs_cells_lock); + __refcount_inc(&cell->ref, &r); + a = atomic_inc_return(&cell->active); + trace_afs_cell(cell->debug_id, r + 1, a, reason); return cell; } -#endif /* 0 */ /* - * destroy a cell record + * Record a cell becoming less active. When the active counter reaches 1, it + * is scheduled for destruction, but may get reactivated. */ -void afs_put_cell(struct afs_cell *cell) +void afs_unuse_cell(struct afs_cell *cell, enum afs_cell_trace reason) { + unsigned int debug_id; + time64_t now, expire_delay; + bool zero; + int r, a; + if (!cell) return; - _enter("%p{%d,%s}", cell, atomic_read(&cell->usage), cell->name); - - ASSERTCMP(atomic_read(&cell->usage), >, 0); + _enter("%s", cell->name); - /* to prevent a race, the decrement and the dequeue must be effectively - * atomic */ - write_lock(&afs_cells_lock); + now = ktime_get_real_seconds(); + cell->last_inactive = now; + expire_delay = 0; + if (cell->vl_servers->nr_servers) + expire_delay = afs_cell_gc_delay; - if (likely(!atomic_dec_and_test(&cell->usage))) { - write_unlock(&afs_cells_lock); - _leave(""); - return; - } + debug_id = cell->debug_id; + a = atomic_dec_return(&cell->active); + if (!a) + /* 'cell' may now be garbage collected. */ + afs_set_cell_timer(cell, expire_delay); - ASSERT(list_empty(&cell->servers)); - ASSERT(list_empty(&cell->vl_list)); + zero = __refcount_dec_and_test(&cell->ref, &r); + trace_afs_cell(debug_id, r - 1, a, reason); + if (zero) + WARN_ON(!queue_work(afs_wq, &cell->destroyer)); +} - write_unlock(&afs_cells_lock); +/* + * Note that a cell has been seen. + */ +void afs_see_cell(struct afs_cell *cell, enum afs_cell_trace reason) +{ + int r, a; - wake_up(&afs_cells_freeable_wq); + r = refcount_read(&cell->ref); + a = atomic_read(&cell->active); + trace_afs_cell(cell->debug_id, r, a, reason); +} - _leave(" [unused]"); +/* + * Queue a cell for management, giving the workqueue a ref to hold. + */ +void afs_queue_cell(struct afs_cell *cell, enum afs_cell_trace reason) +{ + queue_work(afs_wq, &cell->manager); } /* - * destroy a cell record - * - must be called with the afs_cells_sem write-locked - * - cell->link should have been broken by the caller + * Cell-specific management timer. */ -static void afs_cell_destroy(struct afs_cell *cell) +static void afs_cell_timer(struct timer_list *timer) { - _enter("%p{%d,%s}", cell, atomic_read(&cell->usage), cell->name); + struct afs_cell *cell = container_of(timer, struct afs_cell, management_timer); - ASSERTCMP(atomic_read(&cell->usage), >=, 0); - ASSERT(list_empty(&cell->link)); + afs_see_cell(cell, afs_cell_trace_see_mgmt_timer); + if (refcount_read(&cell->ref) > 0 && cell->net->live) + queue_work(afs_wq, &cell->manager); +} - /* wait for everyone to stop using the cell */ - if (atomic_read(&cell->usage) > 0) { - DECLARE_WAITQUEUE(myself, current); +/* + * Set/reduce the cell timer. + */ +void afs_set_cell_timer(struct afs_cell *cell, unsigned int delay_secs) +{ + timer_reduce(&cell->management_timer, jiffies + delay_secs * HZ); +} - _debug("wait for cell %s", cell->name); - set_current_state(TASK_UNINTERRUPTIBLE); - add_wait_queue(&afs_cells_freeable_wq, &myself); +/* + * Activate a cell. + */ +static int afs_activate_cell(struct afs_net *net, struct afs_cell *cell) +{ + struct hlist_node **p; + struct afs_cell *pcell; + int ret; - while (atomic_read(&cell->usage) > 0) { - schedule(); - set_current_state(TASK_UNINTERRUPTIBLE); - } + ret = afs_proc_cell_setup(cell); + if (ret < 0) + return ret; - remove_wait_queue(&afs_cells_freeable_wq, &myself); - set_current_state(TASK_RUNNING); + mutex_lock(&net->proc_cells_lock); + for (p = &net->proc_cells.first; *p; p = &(*p)->next) { + pcell = hlist_entry(*p, struct afs_cell, proc_link); + if (strcmp(cell->name, pcell->name) < 0) + break; } - _debug("cell dead"); - ASSERTCMP(atomic_read(&cell->usage), ==, 0); - ASSERT(list_empty(&cell->servers)); - ASSERT(list_empty(&cell->vl_list)); + cell->proc_link.pprev = p; + cell->proc_link.next = *p; + rcu_assign_pointer(*p, &cell->proc_link.next); + if (cell->proc_link.next) + cell->proc_link.next->pprev = &cell->proc_link.next; + + mutex_unlock(&net->proc_cells_lock); + return 0; +} + +/* + * Deactivate a cell. + */ +static void afs_deactivate_cell(struct afs_net *net, struct afs_cell *cell) +{ + _enter("%s", cell->name); afs_proc_cell_remove(cell); - down_write(&afs_proc_cells_sem); - list_del_init(&cell->proc_link); - up_write(&afs_proc_cells_sem); + mutex_lock(&net->proc_cells_lock); + if (!hlist_unhashed(&cell->proc_link)) + hlist_del_rcu(&cell->proc_link); + mutex_unlock(&net->proc_cells_lock); -#ifdef CONFIG_AFS_FSCACHE - fscache_relinquish_cookie(cell->cache, 0); -#endif - key_put(cell->anonymous_key); - kfree(cell); + _leave(""); +} - _leave(" [destroyed]"); +static bool afs_has_cell_expired(struct afs_cell *cell, time64_t *_next_manage) +{ + const struct afs_vlserver_list *vllist; + time64_t expire_at = cell->last_inactive; + time64_t now = ktime_get_real_seconds(); + + if (atomic_read(&cell->active)) + return false; + if (!cell->net->live) + return true; + + vllist = rcu_dereference_protected(cell->vl_servers, true); + if (vllist && vllist->nr_servers > 0) + expire_at += afs_cell_gc_delay; + + if (expire_at <= now) + return true; + if (expire_at < *_next_manage) + *_next_manage = expire_at; + return false; } /* - * purge in-memory cell database on module unload or afs_init() failure - * - the timeout daemon is stopped before calling this + * Manage a cell record, initialising and destroying it, maintaining its DNS + * records. */ -void afs_cell_purge(void) +static bool afs_manage_cell(struct afs_cell *cell) { - struct afs_cell *cell; + struct afs_net *net = cell->net; + time64_t next_manage = TIME64_MAX; + int ret; - _enter(""); + _enter("%s", cell->name); + + _debug("state %u", cell->state); + switch (cell->state) { + case AFS_CELL_SETTING_UP: + goto set_up_cell; + case AFS_CELL_UNLOOKED: + case AFS_CELL_ACTIVE: + goto cell_is_active; + case AFS_CELL_REMOVING: + WARN_ON_ONCE(1); + return false; + case AFS_CELL_DEAD: + return false; + default: + _debug("bad state %u", cell->state); + WARN_ON_ONCE(1); /* Unhandled state */ + return false; + } - afs_put_cell(afs_cell_root); +set_up_cell: + ret = afs_activate_cell(net, cell); + if (ret < 0) { + cell->error = ret; + goto remove_cell; + } - down_write(&afs_cells_sem); + afs_set_cell_state(cell, AFS_CELL_UNLOOKED); - while (!list_empty(&afs_cells)) { - cell = NULL; +cell_is_active: + if (afs_has_cell_expired(cell, &next_manage)) + goto remove_cell; - /* remove the next cell from the front of the list */ - write_lock(&afs_cells_lock); + if (test_and_clear_bit(AFS_CELL_FL_DO_LOOKUP, &cell->flags)) { + ret = afs_update_cell(cell); + if (ret < 0) + cell->error = ret; + if (cell->state == AFS_CELL_UNLOOKED) + afs_set_cell_state(cell, AFS_CELL_ACTIVE); + } - if (!list_empty(&afs_cells)) { - cell = list_entry(afs_cells.next, - struct afs_cell, link); - list_del_init(&cell->link); - } + if (next_manage < TIME64_MAX && cell->net->live) { + time64_t now = ktime_get_real_seconds(); - write_unlock(&afs_cells_lock); + if (next_manage - now <= 0) + afs_queue_cell(cell, afs_cell_trace_queue_again); + else + afs_set_cell_timer(cell, next_manage - now); + } + _leave(" [done %u]", cell->state); + return false; - if (cell) { - _debug("PURGING CELL %s (%d)", - cell->name, atomic_read(&cell->usage)); +remove_cell: + down_write(&net->cells_lock); - /* now the cell should be left with no references */ - afs_cell_destroy(cell); - } + if (atomic_read(&cell->active)) { + up_write(&net->cells_lock); + goto cell_is_active; + } + + /* Make sure that the expiring server records are going to see the fact + * that the cell is caput. + */ + afs_set_cell_state(cell, AFS_CELL_REMOVING); + + afs_deactivate_cell(net, cell); + afs_purge_servers(cell); + + rb_erase(&cell->net_node, &net->cells); + afs_see_cell(cell, afs_cell_trace_unuse_delete); + up_write(&net->cells_lock); + + /* The root volume is pinning the cell */ + afs_put_volume(cell->root_volume, afs_volume_trace_put_cell_root); + cell->root_volume = NULL; + + afs_set_cell_state(cell, AFS_CELL_DEAD); + return true; +} + +static void afs_manage_cell_work(struct work_struct *work) +{ + struct afs_cell *cell = container_of(work, struct afs_cell, manager); + bool final_put; + + afs_see_cell(cell, afs_cell_trace_manage); + final_put = afs_manage_cell(cell); + afs_see_cell(cell, afs_cell_trace_managed); + if (final_put) + afs_put_cell(cell, afs_cell_trace_put_final); +} + +/* + * Purge in-memory cell database. + */ +void afs_cell_purge(struct afs_net *net) +{ + struct afs_cell *ws; + struct rb_node *cursor; + + _enter(""); + + down_write(&net->cells_lock); + ws = rcu_replace_pointer(net->ws_cell, NULL, + lockdep_is_held(&net->cells_lock)); + up_write(&net->cells_lock); + afs_unuse_cell(ws, afs_cell_trace_unuse_ws); + + _debug("kick cells"); + down_read(&net->cells_lock); + for (cursor = rb_first(&net->cells); cursor; cursor = rb_next(cursor)) { + struct afs_cell *cell = rb_entry(cursor, struct afs_cell, net_node); + + afs_see_cell(cell, afs_cell_trace_purge); + + if (test_and_clear_bit(AFS_CELL_FL_NO_GC, &cell->flags)) + afs_unuse_cell(cell, afs_cell_trace_unuse_pin); + + afs_queue_cell(cell, afs_cell_trace_queue_purge); } + up_read(&net->cells_lock); - up_write(&afs_cells_sem); + _debug("wait"); + wait_var_event(&net->cells_outstanding, + !atomic_read(&net->cells_outstanding)); _leave(""); } diff --git a/fs/afs/cm_security.c b/fs/afs/cm_security.c new file mode 100644 index 000000000000..edcbd249d202 --- /dev/null +++ b/fs/afs/cm_security.c @@ -0,0 +1,340 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* Cache manager security. + * + * Copyright (C) 2025 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#include <linux/slab.h> +#include <crypto/krb5.h> +#include "internal.h" +#include "afs_cm.h" +#include "afs_fs.h" +#include "protocol_yfs.h" +#define RXRPC_TRACE_ONLY_DEFINE_ENUMS +#include <trace/events/rxrpc.h> + +#define RXGK_SERVER_ENC_TOKEN 1036U // 0x40c +#define xdr_round_up(x) (round_up((x), sizeof(__be32))) +#define xdr_len_object(x) (4 + round_up((x), sizeof(__be32))) + +#ifdef CONFIG_RXGK +static int afs_create_yfs_cm_token(struct sk_buff *challenge, + struct afs_server *server); +#endif + +/* + * Respond to an RxGK challenge, adding appdata. + */ +static int afs_respond_to_challenge(struct sk_buff *challenge) +{ +#ifdef CONFIG_RXGK + struct krb5_buffer appdata = {}; + struct afs_server *server; +#endif + struct rxrpc_peer *peer; + unsigned long peer_data; + u16 service_id; + u8 security_index; + + rxrpc_kernel_query_challenge(challenge, &peer, &peer_data, + &service_id, &security_index); + + _enter("%u,%u", service_id, security_index); + + switch (service_id) { + /* We don't send CM_SERVICE RPCs, so don't expect a challenge + * therefrom. + */ + case FS_SERVICE: + case VL_SERVICE: + case YFS_FS_SERVICE: + case YFS_VL_SERVICE: + break; + default: + pr_warn("Can't respond to unknown challenge %u:%u", + service_id, security_index); + return rxrpc_kernel_reject_challenge(challenge, RX_USER_ABORT, -EPROTO, + afs_abort_unsupported_sec_class); + } + + switch (security_index) { +#ifdef CONFIG_RXKAD + case RXRPC_SECURITY_RXKAD: + return rxkad_kernel_respond_to_challenge(challenge); +#endif + +#ifdef CONFIG_RXGK + case RXRPC_SECURITY_RXGK: + return rxgk_kernel_respond_to_challenge(challenge, &appdata); + + case RXRPC_SECURITY_YFS_RXGK: + switch (service_id) { + case FS_SERVICE: + case YFS_FS_SERVICE: + server = (struct afs_server *)peer_data; + if (!server->cm_rxgk_appdata.data) { + mutex_lock(&server->cm_token_lock); + if (!server->cm_rxgk_appdata.data) + afs_create_yfs_cm_token(challenge, server); + mutex_unlock(&server->cm_token_lock); + } + if (server->cm_rxgk_appdata.data) + appdata = server->cm_rxgk_appdata; + break; + } + return rxgk_kernel_respond_to_challenge(challenge, &appdata); +#endif + + default: + return rxrpc_kernel_reject_challenge(challenge, RX_USER_ABORT, -EPROTO, + afs_abort_unsupported_sec_class); + } +} + +/* + * Process the OOB message queue, processing challenge packets. + */ +void afs_process_oob_queue(struct work_struct *work) +{ + struct afs_net *net = container_of(work, struct afs_net, rx_oob_work); + struct sk_buff *oob; + enum rxrpc_oob_type type; + + while ((oob = rxrpc_kernel_dequeue_oob(net->socket, &type))) { + switch (type) { + case RXRPC_OOB_CHALLENGE: + afs_respond_to_challenge(oob); + break; + } + rxrpc_kernel_free_oob(oob); + } +} + +#ifdef CONFIG_RXGK +/* + * Create a securities keyring for the cache manager and attach a key to it for + * the RxGK tokens we want to use to secure the callback connection back from + * the fileserver. + */ +int afs_create_token_key(struct afs_net *net, struct socket *socket) +{ + const struct krb5_enctype *krb5; + struct key *ring; + key_ref_t key; + char K0[32], *desc; + int ret; + + ring = keyring_alloc("kafs", + GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, current_cred(), + KEY_POS_SEARCH | KEY_POS_WRITE | + KEY_USR_VIEW | KEY_USR_READ | KEY_USR_SEARCH, + KEY_ALLOC_NOT_IN_QUOTA, + NULL, NULL); + if (IS_ERR(ring)) + return PTR_ERR(ring); + + ret = rxrpc_sock_set_security_keyring(socket->sk, ring); + if (ret < 0) + goto out; + + ret = -ENOPKG; + krb5 = crypto_krb5_find_enctype(KRB5_ENCTYPE_AES128_CTS_HMAC_SHA1_96); + if (!krb5) + goto out; + + if (WARN_ON_ONCE(krb5->key_len > sizeof(K0))) + goto out; + + ret = -ENOMEM; + desc = kasprintf(GFP_KERNEL, "%u:%u:%u:%u", + YFS_CM_SERVICE, RXRPC_SECURITY_YFS_RXGK, 1, krb5->etype); + if (!desc) + goto out; + + wait_for_random_bytes(); + get_random_bytes(K0, krb5->key_len); + + key = key_create(make_key_ref(ring, true), + "rxrpc_s", desc, + K0, krb5->key_len, + KEY_POS_VIEW | KEY_POS_READ | KEY_POS_SEARCH | KEY_USR_VIEW, + KEY_ALLOC_NOT_IN_QUOTA); + kfree(desc); + if (IS_ERR(key)) { + ret = PTR_ERR(key); + goto out; + } + + net->fs_cm_token_key = key_ref_to_ptr(key); + ret = 0; +out: + key_put(ring); + return ret; +} + +/* + * Create an YFS RxGK GSS token to use as a ticket to the specified fileserver. + */ +static int afs_create_yfs_cm_token(struct sk_buff *challenge, + struct afs_server *server) +{ + const struct krb5_enctype *conn_krb5, *token_krb5; + const struct krb5_buffer *token_key; + struct crypto_aead *aead; + struct scatterlist sg; + struct afs_net *net = server->cell->net; + const struct key *key = net->fs_cm_token_key; + size_t keysize, uuidsize, authsize, toksize, encsize, contsize, adatasize, offset; + __be32 caps[1] = { + [0] = htonl(AFS_CAP_ERROR_TRANSLATION), + }; + __be32 *xdr; + void *appdata, *K0, *encbase; + u32 enctype; + int ret; + + if (!key) + return -ENOKEY; + + /* Assume that the fileserver is happy to use the same encoding type as + * we were told to use by the token obtained by the user. + */ + enctype = rxgk_kernel_query_challenge(challenge); + + conn_krb5 = crypto_krb5_find_enctype(enctype); + if (!conn_krb5) + return -ENOPKG; + token_krb5 = key->payload.data[0]; + token_key = (const struct krb5_buffer *)&key->payload.data[2]; + + /* struct rxgk_key { + * afs_uint32 enctype; + * opaque key<>; + * }; + */ + keysize = 4 + xdr_len_object(conn_krb5->key_len); + + /* struct RXGK_AuthName { + * afs_int32 kind; + * opaque data<AUTHDATAMAX>; + * opaque display<AUTHPRINTABLEMAX>; + * }; + */ + uuidsize = sizeof(server->uuid); + authsize = 4 + xdr_len_object(uuidsize) + xdr_len_object(0); + + /* struct RXGK_Token { + * rxgk_key K0; + * RXGK_Level level; + * rxgkTime starttime; + * afs_int32 lifetime; + * afs_int32 bytelife; + * rxgkTime expirationtime; + * struct RXGK_AuthName identities<>; + * }; + */ + toksize = keysize + 8 + 4 + 4 + 8 + xdr_len_object(authsize); + + offset = 0; + encsize = crypto_krb5_how_much_buffer(token_krb5, KRB5_ENCRYPT_MODE, toksize, &offset); + + /* struct RXGK_TokenContainer { + * afs_int32 kvno; + * afs_int32 enctype; + * opaque encrypted_token<>; + * }; + */ + contsize = 4 + 4 + xdr_len_object(encsize); + + /* struct YFSAppData { + * opr_uuid initiatorUuid; + * opr_uuid acceptorUuid; + * Capabilities caps; + * afs_int32 enctype; + * opaque callbackKey<>; + * opaque callbackToken<>; + * }; + */ + adatasize = 16 + 16 + + xdr_len_object(sizeof(caps)) + + 4 + + xdr_len_object(conn_krb5->key_len) + + xdr_len_object(contsize); + + ret = -ENOMEM; + appdata = kzalloc(adatasize, GFP_KERNEL); + if (!appdata) + goto out; + xdr = appdata; + + memcpy(xdr, &net->uuid, 16); /* appdata.initiatorUuid */ + xdr += 16 / 4; + memcpy(xdr, &server->uuid, 16); /* appdata.acceptorUuid */ + xdr += 16 / 4; + *xdr++ = htonl(ARRAY_SIZE(caps)); /* appdata.caps.len */ + memcpy(xdr, &caps, sizeof(caps)); /* appdata.caps */ + xdr += ARRAY_SIZE(caps); + *xdr++ = htonl(conn_krb5->etype); /* appdata.enctype */ + + *xdr++ = htonl(conn_krb5->key_len); /* appdata.callbackKey.len */ + K0 = xdr; + get_random_bytes(K0, conn_krb5->key_len); /* appdata.callbackKey.data */ + xdr += xdr_round_up(conn_krb5->key_len) / 4; + + *xdr++ = htonl(contsize); /* appdata.callbackToken.len */ + *xdr++ = htonl(1); /* cont.kvno */ + *xdr++ = htonl(token_krb5->etype); /* cont.enctype */ + *xdr++ = htonl(encsize); /* cont.encrypted_token.len */ + + encbase = xdr; + xdr += offset / 4; + *xdr++ = htonl(conn_krb5->etype); /* token.K0.enctype */ + *xdr++ = htonl(conn_krb5->key_len); /* token.K0.key.len */ + memcpy(xdr, K0, conn_krb5->key_len); /* token.K0.key.data */ + xdr += xdr_round_up(conn_krb5->key_len) / 4; + + *xdr++ = htonl(RXRPC_SECURITY_ENCRYPT); /* token.level */ + *xdr++ = htonl(0); /* token.starttime */ + *xdr++ = htonl(0); /* " */ + *xdr++ = htonl(0); /* token.lifetime */ + *xdr++ = htonl(0); /* token.bytelife */ + *xdr++ = htonl(0); /* token.expirationtime */ + *xdr++ = htonl(0); /* " */ + *xdr++ = htonl(1); /* token.identities.count */ + *xdr++ = htonl(0); /* token.identities[0].kind */ + *xdr++ = htonl(uuidsize); /* token.identities[0].data.len */ + memcpy(xdr, &server->uuid, uuidsize); + xdr += xdr_round_up(uuidsize) / 4; + *xdr++ = htonl(0); /* token.identities[0].display.len */ + + xdr = encbase + xdr_round_up(encsize); + + if ((unsigned long)xdr - (unsigned long)appdata != adatasize) + pr_err("Appdata size incorrect %lx != %zx\n", + (unsigned long)xdr - (unsigned long)appdata, adatasize); + + aead = crypto_krb5_prepare_encryption(token_krb5, token_key, RXGK_SERVER_ENC_TOKEN, + GFP_KERNEL); + if (IS_ERR(aead)) { + ret = PTR_ERR(aead); + goto out_token; + } + + sg_init_one(&sg, encbase, encsize); + ret = crypto_krb5_encrypt(token_krb5, aead, &sg, 1, encsize, offset, toksize, false); + if (ret < 0) + goto out_aead; + + server->cm_rxgk_appdata.len = adatasize; + server->cm_rxgk_appdata.data = appdata; + appdata = NULL; + +out_aead: + crypto_free_aead(aead); +out_token: + kfree(appdata); +out: + return ret; +} +#endif /* CONFIG_RXGK */ diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c index 1c8c6cc6de30..1a906805a9e3 100644 --- a/fs/afs/cmservice.c +++ b/fs/afs/cmservice.c @@ -1,12 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* AFS Cache Manager Service * * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. */ #include <linux/module.h> @@ -16,21 +12,24 @@ #include <linux/ip.h> #include "internal.h" #include "afs_cm.h" - -#if 0 -struct workqueue_struct *afs_cm_workqueue; -#endif /* 0 */ - -static int afs_deliver_cb_init_call_back_state(struct afs_call *, - struct sk_buff *, bool); -static int afs_deliver_cb_init_call_back_state3(struct afs_call *, - struct sk_buff *, bool); -static int afs_deliver_cb_probe(struct afs_call *, struct sk_buff *, bool); -static int afs_deliver_cb_callback(struct afs_call *, struct sk_buff *, bool); -static int afs_deliver_cb_probe_uuid(struct afs_call *, struct sk_buff *, bool); -static int afs_deliver_cb_tell_me_about_yourself(struct afs_call *, - struct sk_buff *, bool); +#include "protocol_yfs.h" +#define RXRPC_TRACE_ONLY_DEFINE_ENUMS +#include <trace/events/rxrpc.h> + +static int afs_deliver_cb_init_call_back_state(struct afs_call *); +static int afs_deliver_cb_init_call_back_state3(struct afs_call *); +static int afs_deliver_cb_probe(struct afs_call *); +static int afs_deliver_cb_callback(struct afs_call *); +static int afs_deliver_cb_probe_uuid(struct afs_call *); +static int afs_deliver_cb_tell_me_about_yourself(struct afs_call *); static void afs_cm_destructor(struct afs_call *); +static void SRXAFSCB_CallBack(struct work_struct *); +static void SRXAFSCB_InitCallBackState(struct work_struct *); +static void SRXAFSCB_Probe(struct work_struct *); +static void SRXAFSCB_ProbeUuid(struct work_struct *); +static void SRXAFSCB_TellMeAboutYourself(struct work_struct *); + +static int afs_deliver_yfs_cb_callback(struct afs_call *); /* * CB.CallBack operation type @@ -38,8 +37,8 @@ static void afs_cm_destructor(struct afs_call *); static const struct afs_call_type afs_SRXCBCallBack = { .name = "CB.CallBack", .deliver = afs_deliver_cb_callback, - .abort_to_error = afs_abort_to_error, .destructor = afs_cm_destructor, + .work = SRXAFSCB_CallBack, }; /* @@ -48,8 +47,8 @@ static const struct afs_call_type afs_SRXCBCallBack = { static const struct afs_call_type afs_SRXCBInitCallBackState = { .name = "CB.InitCallBackState", .deliver = afs_deliver_cb_init_call_back_state, - .abort_to_error = afs_abort_to_error, .destructor = afs_cm_destructor, + .work = SRXAFSCB_InitCallBackState, }; /* @@ -58,8 +57,8 @@ static const struct afs_call_type afs_SRXCBInitCallBackState = { static const struct afs_call_type afs_SRXCBInitCallBackState3 = { .name = "CB.InitCallBackState3", .deliver = afs_deliver_cb_init_call_back_state3, - .abort_to_error = afs_abort_to_error, .destructor = afs_cm_destructor, + .work = SRXAFSCB_InitCallBackState, }; /* @@ -68,8 +67,8 @@ static const struct afs_call_type afs_SRXCBInitCallBackState3 = { static const struct afs_call_type afs_SRXCBProbe = { .name = "CB.Probe", .deliver = afs_deliver_cb_probe, - .abort_to_error = afs_abort_to_error, .destructor = afs_cm_destructor, + .work = SRXAFSCB_Probe, }; /* @@ -78,8 +77,8 @@ static const struct afs_call_type afs_SRXCBProbe = { static const struct afs_call_type afs_SRXCBProbeUuid = { .name = "CB.ProbeUuid", .deliver = afs_deliver_cb_probe_uuid, - .abort_to_error = afs_abort_to_error, .destructor = afs_cm_destructor, + .work = SRXAFSCB_ProbeUuid, }; /* @@ -88,8 +87,18 @@ static const struct afs_call_type afs_SRXCBProbeUuid = { static const struct afs_call_type afs_SRXCBTellMeAboutYourself = { .name = "CB.TellMeAboutYourself", .deliver = afs_deliver_cb_tell_me_about_yourself, - .abort_to_error = afs_abort_to_error, .destructor = afs_cm_destructor, + .work = SRXAFSCB_TellMeAboutYourself, +}; + +/* + * YFS CB.CallBack operation type + */ +static const struct afs_call_type afs_SRXYFSCB_CallBack = { + .name = "YFSCB.CallBack", + .deliver = afs_deliver_yfs_cb_callback, + .destructor = afs_cm_destructor, + .work = SRXAFSCB_CallBack, }; /* @@ -98,11 +107,9 @@ static const struct afs_call_type afs_SRXCBTellMeAboutYourself = { */ bool afs_cm_incoming_call(struct afs_call *call) { - u32 operation_id = ntohl(call->operation_ID); - - _enter("{CB.OP %u}", operation_id); + _enter("{%u, CB.OP %u}", call->service_id, call->operation_ID); - switch (operation_id) { + switch (call->operation_ID) { case CBCallBack: call->type = &afs_SRXCBCallBack; return true; @@ -115,29 +122,44 @@ bool afs_cm_incoming_call(struct afs_call *call) case CBProbe: call->type = &afs_SRXCBProbe; return true; + case CBProbeUuid: + call->type = &afs_SRXCBProbeUuid; + return true; case CBTellMeAboutYourself: call->type = &afs_SRXCBTellMeAboutYourself; return true; + case YFSCBCallBack: + if (call->service_id != YFS_CM_SERVICE) + return false; + call->type = &afs_SRXYFSCB_CallBack; + return true; default: return false; } } /* - * clean up a cache manager call + * Clean up a cache manager call. */ static void afs_cm_destructor(struct afs_call *call) { - _enter(""); - - afs_put_server(call->server); - call->server = NULL; kfree(call->buffer); call->buffer = NULL; } /* - * allow the fileserver to see if the cache manager is still alive + * Abort a service call from within an action function. + */ +static void afs_abort_service_call(struct afs_call *call, u32 abort_code, int error, + enum rxrpc_abort_reason why) +{ + rxrpc_kernel_abort_call(call->net->socket, call->rxcall, + abort_code, error, why); + afs_set_call_complete(call, error, 0); +} + +/* + * The server supplied a list of callbacks that it wanted to break. */ static void SRXAFSCB_CallBack(struct work_struct *work) { @@ -145,71 +167,69 @@ static void SRXAFSCB_CallBack(struct work_struct *work) _enter(""); - /* be sure to send the reply *before* attempting to spam the AFS server - * with FSFetchStatus requests on the vnodes with broken callbacks lest - * the AFS server get into a vicious cycle of trying to break further - * callbacks because it hadn't received completion of the CBCallBack op - * yet */ - afs_send_empty_reply(call); + /* We need to break the callbacks before sending the reply as the + * server holds up change visibility till it receives our reply so as + * to maintain cache coherency. + */ + if (call->server) { + trace_afs_server(call->server->debug_id, + refcount_read(&call->server->ref), + atomic_read(&call->server->active), + afs_server_trace_callback); + afs_break_callbacks(call->server, call->count, call->request); + } - afs_break_callbacks(call->server, call->count, call->request); + afs_send_empty_reply(call); + afs_put_call(call); _leave(""); } /* * deliver request data to a CB.CallBack call */ -static int afs_deliver_cb_callback(struct afs_call *call, struct sk_buff *skb, - bool last) +static int afs_deliver_cb_callback(struct afs_call *call) { - struct afs_callback *cb; - struct afs_server *server; - struct in_addr addr; + struct afs_callback_break *cb; __be32 *bp; - u32 tmp; int ret, loop; - _enter("{%u},{%u},%d", call->unmarshall, skb->len, last); + _enter("{%u}", call->unmarshall); switch (call->unmarshall) { case 0: - call->offset = 0; + afs_extract_to_tmp(call); call->unmarshall++; /* extract the FID array and its count in two steps */ + fallthrough; case 1: _debug("extract FID count"); - ret = afs_extract_data(call, skb, last, &call->tmp, 4); - switch (ret) { - case 0: break; - case -EAGAIN: return 0; - default: return ret; - } + ret = afs_extract_data(call, true); + if (ret < 0) + return ret; call->count = ntohl(call->tmp); _debug("FID count: %u", call->count); if (call->count > AFSCBMAX) - return -EBADMSG; + return afs_protocol_error(call, afs_eproto_cb_fid_count); - call->buffer = kmalloc(call->count * 3 * 4, GFP_KERNEL); + call->buffer = kmalloc(array3_size(call->count, 3, 4), + GFP_KERNEL); if (!call->buffer) return -ENOMEM; - call->offset = 0; + afs_extract_to_buf(call, call->count * 3 * 4); call->unmarshall++; + fallthrough; case 2: _debug("extract FID array"); - ret = afs_extract_data(call, skb, last, call->buffer, - call->count * 3 * 4); - switch (ret) { - case 0: break; - case -EAGAIN: return 0; - default: return ret; - } + ret = afs_extract_data(call, true); + if (ret < 0) + return ret; _debug("unmarshall FID array"); call->request = kcalloc(call->count, - sizeof(struct afs_callback), + sizeof(struct afs_callback_break), GFP_KERNEL); if (!call->request) return -ENOMEM; @@ -220,76 +240,45 @@ static int afs_deliver_cb_callback(struct afs_call *call, struct sk_buff *skb, cb->fid.vid = ntohl(*bp++); cb->fid.vnode = ntohl(*bp++); cb->fid.unique = ntohl(*bp++); - cb->type = AFSCM_CB_UNTYPED; } - call->offset = 0; + afs_extract_to_tmp(call); call->unmarshall++; /* extract the callback array and its count in two steps */ + fallthrough; case 3: _debug("extract CB count"); - ret = afs_extract_data(call, skb, last, &call->tmp, 4); - switch (ret) { - case 0: break; - case -EAGAIN: return 0; - default: return ret; - } - - tmp = ntohl(call->tmp); - _debug("CB count: %u", tmp); - if (tmp != call->count && tmp != 0) - return -EBADMSG; - call->offset = 0; + ret = afs_extract_data(call, true); + if (ret < 0) + return ret; + + call->count2 = ntohl(call->tmp); + _debug("CB count: %u", call->count2); + if (call->count2 != call->count && call->count2 != 0) + return afs_protocol_error(call, afs_eproto_cb_count); + call->iter = &call->def_iter; + iov_iter_discard(&call->def_iter, ITER_DEST, call->count2 * 3 * 4); call->unmarshall++; - if (tmp == 0) - goto empty_cb_array; + fallthrough; case 4: - _debug("extract CB array"); - ret = afs_extract_data(call, skb, last, call->request, - call->count * 3 * 4); - switch (ret) { - case 0: break; - case -EAGAIN: return 0; - default: return ret; - } + _debug("extract discard %zu/%u", + iov_iter_count(call->iter), call->count2 * 3 * 4); - _debug("unmarshall CB array"); - cb = call->request; - bp = call->buffer; - for (loop = call->count; loop > 0; loop--, cb++) { - cb->version = ntohl(*bp++); - cb->expiry = ntohl(*bp++); - cb->type = ntohl(*bp++); - } + ret = afs_extract_data(call, false); + if (ret < 0) + return ret; - empty_cb_array: - call->offset = 0; call->unmarshall++; + fallthrough; case 5: - _debug("trailer"); - if (skb->len != 0) - return -EBADMSG; break; } - if (!last) - return 0; - - call->state = AFS_CALL_REPLYING; - - /* we'll need the file server record as that tells us which set of - * vnodes to operate upon */ - memcpy(&addr, &ip_hdr(skb)->saddr, 4); - server = afs_find_server(&addr); - if (!server) - return -ENOTCONN; - call->server = server; - - INIT_WORK(&call->work, SRXAFSCB_CallBack); - queue_work(afs_wq, &call->work); + if (!afs_check_call_state(call, AFS_CALL_SV_REPLYING)) + return afs_io_error(call, afs_io_error_cm_reply); return 0; } @@ -302,72 +291,86 @@ static void SRXAFSCB_InitCallBackState(struct work_struct *work) _enter("{%p}", call->server); - afs_init_callback_state(call->server); + if (call->server) + afs_init_callback_state(call->server); afs_send_empty_reply(call); + afs_put_call(call); _leave(""); } /* * deliver request data to a CB.InitCallBackState call */ -static int afs_deliver_cb_init_call_back_state(struct afs_call *call, - struct sk_buff *skb, - bool last) +static int afs_deliver_cb_init_call_back_state(struct afs_call *call) { - struct afs_server *server; - struct in_addr addr; - - _enter(",{%u},%d", skb->len, last); - - if (skb->len > 0) - return -EBADMSG; - if (!last) - return 0; - - /* no unmarshalling required */ - call->state = AFS_CALL_REPLYING; - - /* we'll need the file server record as that tells us which set of - * vnodes to operate upon */ - memcpy(&addr, &ip_hdr(skb)->saddr, 4); - server = afs_find_server(&addr); - if (!server) - return -ENOTCONN; - call->server = server; + _enter(""); - INIT_WORK(&call->work, SRXAFSCB_InitCallBackState); - queue_work(afs_wq, &call->work); - return 0; + afs_extract_discard(call, 0); + return afs_extract_data(call, false); } /* * deliver request data to a CB.InitCallBackState3 call */ -static int afs_deliver_cb_init_call_back_state3(struct afs_call *call, - struct sk_buff *skb, - bool last) +static int afs_deliver_cb_init_call_back_state3(struct afs_call *call) { - struct afs_server *server; - struct in_addr addr; + struct afs_uuid *r; + unsigned loop; + __be32 *b; + int ret; - _enter(",{%u},%d", skb->len, last); + _enter("{%u}", call->unmarshall); - if (!last) - return 0; + switch (call->unmarshall) { + case 0: + call->buffer = kmalloc_array(11, sizeof(__be32), GFP_KERNEL); + if (!call->buffer) + return -ENOMEM; + afs_extract_to_buf(call, 11 * sizeof(__be32)); + call->unmarshall++; + + fallthrough; + case 1: + _debug("extract UUID"); + ret = afs_extract_data(call, false); + switch (ret) { + case 0: break; + case -EAGAIN: return 0; + default: return ret; + } + + _debug("unmarshall UUID"); + call->request = kmalloc(sizeof(struct afs_uuid), GFP_KERNEL); + if (!call->request) + return -ENOMEM; + + b = call->buffer; + r = call->request; + r->time_low = b[0]; + r->time_mid = htons(ntohl(b[1])); + r->time_hi_and_version = htons(ntohl(b[2])); + r->clock_seq_hi_and_reserved = ntohl(b[3]); + r->clock_seq_low = ntohl(b[4]); - /* no unmarshalling required */ - call->state = AFS_CALL_REPLYING; + for (loop = 0; loop < 6; loop++) + r->node[loop] = ntohl(b[loop + 5]); - /* we'll need the file server record as that tells us which set of - * vnodes to operate upon */ - memcpy(&addr, &ip_hdr(skb)->saddr, 4); - server = afs_find_server(&addr); - if (!server) - return -ENOTCONN; - call->server = server; + call->unmarshall++; + fallthrough; + + case 2: + break; + } + + if (!afs_check_call_state(call, AFS_CALL_SV_REPLYING)) + return afs_io_error(call, afs_io_error_cm_reply); + + if (memcmp(call->request, &call->server->_uuid, sizeof(call->server->_uuid)) != 0) { + pr_notice("Callback UUID does not match fileserver UUID\n"); + trace_afs_cm_no_server_u(call, call->request); + return 0; + } - INIT_WORK(&call->work, SRXAFSCB_InitCallBackState); - queue_work(afs_wq, &call->work); return 0; } @@ -380,84 +383,73 @@ static void SRXAFSCB_Probe(struct work_struct *work) _enter(""); afs_send_empty_reply(call); + afs_put_call(call); _leave(""); } /* * deliver request data to a CB.Probe call */ -static int afs_deliver_cb_probe(struct afs_call *call, struct sk_buff *skb, - bool last) +static int afs_deliver_cb_probe(struct afs_call *call) { - _enter(",{%u},%d", skb->len, last); + int ret; - if (skb->len > 0) - return -EBADMSG; - if (!last) - return 0; + _enter(""); - /* no unmarshalling required */ - call->state = AFS_CALL_REPLYING; + afs_extract_discard(call, 0); + ret = afs_extract_data(call, false); + if (ret < 0) + return ret; - INIT_WORK(&call->work, SRXAFSCB_Probe); - queue_work(afs_wq, &call->work); + if (!afs_check_call_state(call, AFS_CALL_SV_REPLYING)) + return afs_io_error(call, afs_io_error_cm_reply); return 0; } /* - * allow the fileserver to quickly find out if the fileserver has been rebooted + * Allow the fileserver to quickly find out if the cache manager has been + * rebooted. */ static void SRXAFSCB_ProbeUuid(struct work_struct *work) { struct afs_call *call = container_of(work, struct afs_call, work); struct afs_uuid *r = call->request; - struct { - __be32 match; - } reply; - _enter(""); - - if (memcmp(r, &afs_uuid, sizeof(afs_uuid)) == 0) - reply.match = htonl(0); + if (memcmp(r, &call->net->uuid, sizeof(call->net->uuid)) == 0) + afs_send_empty_reply(call); else - reply.match = htonl(1); + afs_abort_service_call(call, 1, 1, afs_abort_probeuuid_negative); - afs_send_simple_reply(call, &reply, sizeof(reply)); + afs_put_call(call); _leave(""); } /* * deliver request data to a CB.ProbeUuid call */ -static int afs_deliver_cb_probe_uuid(struct afs_call *call, struct sk_buff *skb, - bool last) +static int afs_deliver_cb_probe_uuid(struct afs_call *call) { struct afs_uuid *r; unsigned loop; __be32 *b; int ret; - _enter("{%u},{%u},%d", call->unmarshall, skb->len, last); - - if (skb->len > 0) - return -EBADMSG; - if (!last) - return 0; + _enter("{%u}", call->unmarshall); switch (call->unmarshall) { case 0: - call->offset = 0; - call->buffer = kmalloc(11 * sizeof(__be32), GFP_KERNEL); + call->buffer = kmalloc_array(11, sizeof(__be32), GFP_KERNEL); if (!call->buffer) return -ENOMEM; + afs_extract_to_buf(call, 11 * sizeof(__be32)); call->unmarshall++; + fallthrough; case 1: _debug("extract UUID"); - ret = afs_extract_data(call, skb, last, call->buffer, - 11 * sizeof(__be32)); + ret = afs_extract_data(call, false); switch (ret) { case 0: break; case -EAGAIN: return 0; @@ -471,32 +463,24 @@ static int afs_deliver_cb_probe_uuid(struct afs_call *call, struct sk_buff *skb, b = call->buffer; r = call->request; - r->time_low = ntohl(b[0]); - r->time_mid = ntohl(b[1]); - r->time_hi_and_version = ntohl(b[2]); + r->time_low = b[0]; + r->time_mid = htons(ntohl(b[1])); + r->time_hi_and_version = htons(ntohl(b[2])); r->clock_seq_hi_and_reserved = ntohl(b[3]); r->clock_seq_low = ntohl(b[4]); for (loop = 0; loop < 6; loop++) r->node[loop] = ntohl(b[loop + 5]); - call->offset = 0; call->unmarshall++; + fallthrough; case 2: - _debug("trailer"); - if (skb->len != 0) - return -EBADMSG; break; } - if (!last) - return 0; - - call->state = AFS_CALL_REPLYING; - - INIT_WORK(&call->work, SRXAFSCB_ProbeUuid); - queue_work(afs_wq, &call->work); + if (!afs_check_call_state(call, AFS_CALL_SV_REPLYING)) + return afs_io_error(call, afs_io_error_cm_reply); return 0; } @@ -505,9 +489,8 @@ static int afs_deliver_cb_probe_uuid(struct afs_call *call, struct sk_buff *skb, */ static void SRXAFSCB_TellMeAboutYourself(struct work_struct *work) { - struct afs_interface *ifs; struct afs_call *call = container_of(work, struct afs_call, work); - int loop, nifs; + int loop; struct { struct /* InterfaceAddr */ { @@ -525,61 +508,112 @@ static void SRXAFSCB_TellMeAboutYourself(struct work_struct *work) _enter(""); - nifs = 0; - ifs = kcalloc(32, sizeof(*ifs), GFP_KERNEL); - if (ifs) { - nifs = afs_get_ipv4_interfaces(ifs, 32, false); - if (nifs < 0) { - kfree(ifs); - ifs = NULL; - nifs = 0; - } - } - memset(&reply, 0, sizeof(reply)); - reply.ia.nifs = htonl(nifs); - reply.ia.uuid[0] = htonl(afs_uuid.time_low); - reply.ia.uuid[1] = htonl(afs_uuid.time_mid); - reply.ia.uuid[2] = htonl(afs_uuid.time_hi_and_version); - reply.ia.uuid[3] = htonl((s8) afs_uuid.clock_seq_hi_and_reserved); - reply.ia.uuid[4] = htonl((s8) afs_uuid.clock_seq_low); + reply.ia.uuid[0] = call->net->uuid.time_low; + reply.ia.uuid[1] = htonl(ntohs(call->net->uuid.time_mid)); + reply.ia.uuid[2] = htonl(ntohs(call->net->uuid.time_hi_and_version)); + reply.ia.uuid[3] = htonl((s8) call->net->uuid.clock_seq_hi_and_reserved); + reply.ia.uuid[4] = htonl((s8) call->net->uuid.clock_seq_low); for (loop = 0; loop < 6; loop++) - reply.ia.uuid[loop + 5] = htonl((s8) afs_uuid.node[loop]); - - if (ifs) { - for (loop = 0; loop < nifs; loop++) { - reply.ia.ifaddr[loop] = ifs[loop].address.s_addr; - reply.ia.netmask[loop] = ifs[loop].netmask.s_addr; - reply.ia.mtu[loop] = htonl(ifs[loop].mtu); - } - kfree(ifs); - } + reply.ia.uuid[loop + 5] = htonl((s8) call->net->uuid.node[loop]); reply.cap.capcount = htonl(1); reply.cap.caps[0] = htonl(AFS_CAP_ERROR_TRANSLATION); afs_send_simple_reply(call, &reply, sizeof(reply)); - + afs_put_call(call); _leave(""); } /* * deliver request data to a CB.TellMeAboutYourself call */ -static int afs_deliver_cb_tell_me_about_yourself(struct afs_call *call, - struct sk_buff *skb, bool last) +static int afs_deliver_cb_tell_me_about_yourself(struct afs_call *call) { - _enter(",{%u},%d", skb->len, last); + int ret; - if (skb->len > 0) - return -EBADMSG; - if (!last) - return 0; + _enter(""); + + afs_extract_discard(call, 0); + ret = afs_extract_data(call, false); + if (ret < 0) + return ret; + + if (!afs_check_call_state(call, AFS_CALL_SV_REPLYING)) + return afs_io_error(call, afs_io_error_cm_reply); + return 0; +} + +/* + * deliver request data to a YFS CB.CallBack call + */ +static int afs_deliver_yfs_cb_callback(struct afs_call *call) +{ + struct afs_callback_break *cb; + struct yfs_xdr_YFSFid *bp; + size_t size; + int ret, loop; + + _enter("{%u}", call->unmarshall); + + switch (call->unmarshall) { + case 0: + afs_extract_to_tmp(call); + call->unmarshall++; - /* no unmarshalling required */ - call->state = AFS_CALL_REPLYING; + /* extract the FID array and its count in two steps */ + fallthrough; + case 1: + _debug("extract FID count"); + ret = afs_extract_data(call, true); + if (ret < 0) + return ret; + + call->count = ntohl(call->tmp); + _debug("FID count: %u", call->count); + if (call->count > YFSCBMAX) + return afs_protocol_error(call, afs_eproto_cb_fid_count); + + size = array_size(call->count, sizeof(struct yfs_xdr_YFSFid)); + call->buffer = kmalloc(size, GFP_KERNEL); + if (!call->buffer) + return -ENOMEM; + afs_extract_to_buf(call, size); + call->unmarshall++; + + fallthrough; + case 2: + _debug("extract FID array"); + ret = afs_extract_data(call, false); + if (ret < 0) + return ret; + + _debug("unmarshall FID array"); + call->request = kcalloc(call->count, + sizeof(struct afs_callback_break), + GFP_KERNEL); + if (!call->request) + return -ENOMEM; + + cb = call->request; + bp = call->buffer; + for (loop = call->count; loop > 0; loop--, cb++) { + cb->fid.vid = xdr_to_u64(bp->volume); + cb->fid.vnode = xdr_to_u64(bp->vnode.lo); + cb->fid.vnode_hi = ntohl(bp->vnode.hi); + cb->fid.unique = ntohl(bp->vnode.unique); + bp++; + } + + afs_extract_to_tmp(call); + call->unmarshall++; + fallthrough; + + case 3: + break; + } - INIT_WORK(&call->work, SRXAFSCB_TellMeAboutYourself); - queue_work(afs_wq, &call->work); + if (!afs_check_call_state(call, AFS_CALL_SV_REPLYING)) + return afs_io_error(call, afs_io_error_cm_reply); return 0; } diff --git a/fs/afs/dir.c b/fs/afs/dir.c index 34494fbead0a..f4e9e12373ac 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -1,49 +1,54 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* dir.c: AFS filesystem directory handling * - * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved. + * Copyright (C) 2002, 2018 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. */ #include <linux/kernel.h> -#include <linux/module.h> -#include <linux/init.h> #include <linux/fs.h> #include <linux/namei.h> #include <linux/pagemap.h> +#include <linux/swap.h> #include <linux/ctype.h> #include <linux/sched.h> +#include <linux/iversion.h> +#include <linux/iov_iter.h> +#include <linux/task_io_accounting_ops.h> #include "internal.h" +#include "afs_fs.h" +#include "xdr_fs.h" static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags); static int afs_dir_open(struct inode *inode, struct file *file); static int afs_readdir(struct file *file, struct dir_context *ctx); -static int afs_d_revalidate(struct dentry *dentry, unsigned int flags); +static int afs_d_revalidate(struct inode *dir, const struct qstr *name, + struct dentry *dentry, unsigned int flags); static int afs_d_delete(const struct dentry *dentry); -static void afs_d_release(struct dentry *dentry); -static int afs_lookup_filldir(void *_cookie, const char *name, int nlen, +static void afs_d_iput(struct dentry *dentry, struct inode *inode); +static bool afs_lookup_one_filldir(struct dir_context *ctx, const char *name, int nlen, loff_t fpos, u64 ino, unsigned dtype); -static int afs_create(struct inode *dir, struct dentry *dentry, umode_t mode, - bool excl); -static int afs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode); +static bool afs_lookup_filldir(struct dir_context *ctx, const char *name, int nlen, + loff_t fpos, u64 ino, unsigned dtype); +static int afs_create(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode, bool excl); +static struct dentry *afs_mkdir(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode); static int afs_rmdir(struct inode *dir, struct dentry *dentry); static int afs_unlink(struct inode *dir, struct dentry *dentry); static int afs_link(struct dentry *from, struct inode *dir, struct dentry *dentry); -static int afs_symlink(struct inode *dir, struct dentry *dentry, - const char *content); -static int afs_rename(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry); +static int afs_symlink(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, const char *content); +static int afs_rename(struct mnt_idmap *idmap, struct inode *old_dir, + struct dentry *old_dentry, struct inode *new_dir, + struct dentry *new_dentry, unsigned int flags); const struct file_operations afs_dir_file_operations = { .open = afs_dir_open, .release = afs_release, - .iterate = afs_readdir, + .iterate_shared = afs_readdir, .lock = afs_lock, .llseek = generic_file_llseek, }; @@ -62,151 +67,146 @@ const struct inode_operations afs_dir_inode_operations = { .setattr = afs_setattr, }; +const struct address_space_operations afs_dir_aops = { + .writepages = afs_single_writepages, +}; + const struct dentry_operations afs_fs_dentry_operations = { .d_revalidate = afs_d_revalidate, .d_delete = afs_d_delete, .d_release = afs_d_release, .d_automount = afs_d_automount, + .d_iput = afs_d_iput, }; -#define AFS_DIR_HASHTBL_SIZE 128 -#define AFS_DIR_DIRENT_SIZE 32 -#define AFS_DIRENT_PER_BLOCK 64 - -union afs_dirent { - struct { - uint8_t valid; - uint8_t unused[1]; - __be16 hash_next; - __be32 vnode; - __be32 unique; - uint8_t name[16]; - uint8_t overflow[4]; /* if any char of the name (inc - * NUL) reaches here, consume - * the next dirent too */ - } u; - uint8_t extended_name[32]; +struct afs_lookup_one_cookie { + struct dir_context ctx; + struct qstr name; + bool found; + struct afs_fid fid; }; -/* AFS directory page header (one at the beginning of every 2048-byte chunk) */ -struct afs_dir_pagehdr { - __be16 npages; - __be16 magic; -#define AFS_DIR_MAGIC htons(1234) - uint8_t nentries; - uint8_t bitmap[8]; - uint8_t pad[19]; +struct afs_lookup_cookie { + struct dir_context ctx; + struct qstr name; + unsigned short nr_fids; + struct afs_fid fids[50]; }; -/* directory block layout */ -union afs_dir_block { - - struct afs_dir_pagehdr pagehdr; - - struct { - struct afs_dir_pagehdr pagehdr; - uint8_t alloc_ctrs[128]; - /* dir hash table */ - uint16_t hashtable[AFS_DIR_HASHTBL_SIZE]; - } hdr; +static void afs_dir_unuse_cookie(struct afs_vnode *dvnode, int ret) +{ + if (ret == 0) { + struct afs_vnode_cache_aux aux; + loff_t i_size = i_size_read(&dvnode->netfs.inode); + + afs_set_cache_aux(dvnode, &aux); + fscache_unuse_cookie(afs_vnode_cache(dvnode), &aux, &i_size); + } else { + fscache_unuse_cookie(afs_vnode_cache(dvnode), NULL, NULL); + } +} - union afs_dirent dirents[AFS_DIRENT_PER_BLOCK]; -}; +/* + * Iterate through a kmapped directory segment, dumping a summary of + * the contents. + */ +static size_t afs_dir_dump_step(void *iter_base, size_t progress, size_t len, + void *priv, void *priv2) +{ + do { + union afs_xdr_dir_block *block = iter_base; -/* layout on a linux VM page */ -struct afs_dir_page { - union afs_dir_block blocks[PAGE_SIZE / sizeof(union afs_dir_block)]; -}; + pr_warn("[%05zx] %32phN\n", progress, block); + iter_base += AFS_DIR_BLOCK_SIZE; + progress += AFS_DIR_BLOCK_SIZE; + len -= AFS_DIR_BLOCK_SIZE; + } while (len > 0); -struct afs_lookup_cookie { - struct dir_context ctx; - struct afs_fid fid; - struct qstr name; - int found; -}; + return len; +} /* - * check that a directory page is valid + * Dump the contents of a directory. */ -static inline void afs_dir_check_page(struct inode *dir, struct page *page) +static void afs_dir_dump(struct afs_vnode *dvnode) { - struct afs_dir_page *dbuf; - loff_t latter; - int tmp, qty; - -#if 0 - /* check the page count */ - qty = desc.size / sizeof(dbuf->blocks[0]); - if (qty == 0) - goto error; + struct iov_iter iter; + unsigned long long i_size = i_size_read(&dvnode->netfs.inode); - if (page->index == 0 && qty != ntohs(dbuf->blocks[0].pagehdr.npages)) { - printk("kAFS: %s(%lu): wrong number of dir blocks %d!=%hu\n", - __func__, dir->i_ino, qty, - ntohs(dbuf->blocks[0].pagehdr.npages)); - goto error; - } -#endif + pr_warn("DIR %llx:%llx is=%llx\n", + dvnode->fid.vid, dvnode->fid.vnode, i_size); - /* determine how many magic numbers there should be in this page */ - latter = dir->i_size - page_offset(page); - if (latter >= PAGE_SIZE) - qty = PAGE_SIZE; - else - qty = latter; - qty /= sizeof(union afs_dir_block); - - /* check them */ - dbuf = page_address(page); - for (tmp = 0; tmp < qty; tmp++) { - if (dbuf->blocks[tmp].pagehdr.magic != AFS_DIR_MAGIC) { - printk("kAFS: %s(%lu): bad magic %d/%d is %04hx\n", - __func__, dir->i_ino, tmp, qty, - ntohs(dbuf->blocks[tmp].pagehdr.magic)); - goto error; - } - } + iov_iter_folio_queue(&iter, ITER_SOURCE, dvnode->directory, 0, 0, i_size); + iterate_folioq(&iter, iov_iter_count(&iter), NULL, NULL, + afs_dir_dump_step); +} - SetPageChecked(page); - return; +/* + * check that a directory folio is valid + */ +static bool afs_dir_check_block(struct afs_vnode *dvnode, size_t progress, + union afs_xdr_dir_block *block) +{ + if (block->hdr.magic != AFS_DIR_MAGIC) { + pr_warn("%s(%lx): [%zx] bad magic %04x\n", + __func__, dvnode->netfs.inode.i_ino, + progress, ntohs(block->hdr.magic)); + trace_afs_dir_check_failed(dvnode, progress); + trace_afs_file_error(dvnode, -EIO, afs_file_error_dir_bad_magic); + return false; + } -error: - SetPageChecked(page); - SetPageError(page); + /* Make sure each block is NUL terminated so we can reasonably + * use string functions on it. The filenames in the folio + * *should* be NUL-terminated anyway. + */ + ((u8 *)block)[AFS_DIR_BLOCK_SIZE - 1] = 0; + afs_stat_v(dvnode, n_read_dir); + return true; } /* - * discard a page cached in the pagecache + * Iterate through a kmapped directory segment, checking the content. */ -static inline void afs_dir_put_page(struct page *page) +static size_t afs_dir_check_step(void *iter_base, size_t progress, size_t len, + void *priv, void *priv2) { - kunmap(page); - page_cache_release(page); + struct afs_vnode *dvnode = priv; + + if (WARN_ON_ONCE(progress % AFS_DIR_BLOCK_SIZE || + len % AFS_DIR_BLOCK_SIZE)) + return len; + + do { + if (!afs_dir_check_block(dvnode, progress, iter_base)) + break; + iter_base += AFS_DIR_BLOCK_SIZE; + len -= AFS_DIR_BLOCK_SIZE; + } while (len > 0); + + return len; } /* - * get a page into the pagecache + * Check all the blocks in a directory. */ -static struct page *afs_dir_get_page(struct inode *dir, unsigned long index, - struct key *key) +static int afs_dir_check(struct afs_vnode *dvnode) { - struct page *page; - _enter("{%lu},%lu", dir->i_ino, index); - - page = read_cache_page(dir->i_mapping, index, afs_page_filler, key); - if (!IS_ERR(page)) { - kmap(page); - if (!PageChecked(page)) - afs_dir_check_page(dir, page); - if (PageError(page)) - goto fail; - } - return page; + struct iov_iter iter; + unsigned long long i_size = i_size_read(&dvnode->netfs.inode); + size_t checked = 0; -fail: - afs_dir_put_page(page); - _leave(" = -EIO"); - return ERR_PTR(-EIO); + if (unlikely(!i_size)) + return 0; + + iov_iter_folio_queue(&iter, ITER_SOURCE, dvnode->directory, 0, 0, i_size); + checked = iterate_folioq(&iter, iov_iter_count(&iter), dvnode, NULL, + afs_dir_check_step); + if (checked != i_size) { + afs_dir_dump(dvnode); + return -EIO; + } + return 0; } /* @@ -216,8 +216,8 @@ static int afs_dir_open(struct inode *inode, struct file *file) { _enter("{%lu}", inode->i_ino); - BUILD_BUG_ON(sizeof(union afs_dir_block) != 2048); - BUILD_BUG_ON(sizeof(union afs_dirent) != 32); + BUILD_BUG_ON(sizeof(union afs_xdr_dir_block) != 2048); + BUILD_BUG_ON(sizeof(union afs_xdr_dirent) != 32); if (test_bit(AFS_VNODE_DELETED, &AFS_FS_I(inode)->flags)) return -ENOENT; @@ -226,153 +226,327 @@ static int afs_dir_open(struct inode *inode, struct file *file) } /* + * Read a file in a single download. + */ +static ssize_t afs_do_read_single(struct afs_vnode *dvnode, struct file *file) +{ + struct iov_iter iter; + ssize_t ret; + loff_t i_size; + bool is_dir = (S_ISDIR(dvnode->netfs.inode.i_mode) && + !test_bit(AFS_VNODE_MOUNTPOINT, &dvnode->flags)); + + i_size = i_size_read(&dvnode->netfs.inode); + if (is_dir) { + if (i_size < AFS_DIR_BLOCK_SIZE) + return afs_bad(dvnode, afs_file_error_dir_small); + if (i_size > AFS_DIR_BLOCK_SIZE * 1024) { + trace_afs_file_error(dvnode, -EFBIG, afs_file_error_dir_big); + return -EFBIG; + } + } else { + if (i_size > AFSPATHMAX) { + trace_afs_file_error(dvnode, -EFBIG, afs_file_error_dir_big); + return -EFBIG; + } + } + + /* Expand the storage. TODO: Shrink the storage too. */ + if (dvnode->directory_size < i_size) { + size_t cur_size = dvnode->directory_size; + + ret = netfs_alloc_folioq_buffer(NULL, + &dvnode->directory, &cur_size, i_size, + mapping_gfp_mask(dvnode->netfs.inode.i_mapping)); + dvnode->directory_size = cur_size; + if (ret < 0) + return ret; + } + + iov_iter_folio_queue(&iter, ITER_DEST, dvnode->directory, 0, 0, dvnode->directory_size); + + /* AFS requires us to perform the read of a directory synchronously as + * a single unit to avoid issues with the directory contents being + * changed between reads. + */ + ret = netfs_read_single(&dvnode->netfs.inode, file, &iter); + if (ret >= 0) { + i_size = i_size_read(&dvnode->netfs.inode); + if (i_size > ret) { + /* The content has grown, so we need to expand the + * buffer. + */ + ret = -ESTALE; + } else if (is_dir) { + int ret2 = afs_dir_check(dvnode); + + if (ret2 < 0) + ret = ret2; + } else if (i_size < folioq_folio_size(dvnode->directory, 0)) { + /* NUL-terminate a symlink. */ + char *symlink = kmap_local_folio(folioq_folio(dvnode->directory, 0), 0); + + symlink[i_size] = 0; + kunmap_local(symlink); + } + } + + return ret; +} + +ssize_t afs_read_single(struct afs_vnode *dvnode, struct file *file) +{ + ssize_t ret; + + fscache_use_cookie(afs_vnode_cache(dvnode), false); + ret = afs_do_read_single(dvnode, file); + fscache_unuse_cookie(afs_vnode_cache(dvnode), NULL, NULL); + return ret; +} + +/* + * Read the directory into a folio_queue buffer in one go, scrubbing the + * previous contents. We return -ESTALE if the caller needs to call us again. + */ +ssize_t afs_read_dir(struct afs_vnode *dvnode, struct file *file) + __acquires(&dvnode->validate_lock) +{ + ssize_t ret; + loff_t i_size; + + i_size = i_size_read(&dvnode->netfs.inode); + + ret = -ERESTARTSYS; + if (down_read_killable(&dvnode->validate_lock) < 0) + goto error; + + /* We only need to reread the data if it became invalid - or if we + * haven't read it yet. + */ + if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags) && + test_bit(AFS_VNODE_DIR_READ, &dvnode->flags)) { + ret = i_size; + goto valid; + } + + up_read(&dvnode->validate_lock); + if (down_write_killable(&dvnode->validate_lock) < 0) + goto error; + + if (!test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags)) + afs_invalidate_cache(dvnode, 0); + + if (!test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags) || + !test_bit(AFS_VNODE_DIR_READ, &dvnode->flags)) { + trace_afs_reload_dir(dvnode); + ret = afs_read_single(dvnode, file); + if (ret < 0) + goto error_unlock; + + // TODO: Trim excess pages + + set_bit(AFS_VNODE_DIR_VALID, &dvnode->flags); + set_bit(AFS_VNODE_DIR_READ, &dvnode->flags); + } else { + ret = i_size; + } + + downgrade_write(&dvnode->validate_lock); +valid: + return ret; + +error_unlock: + up_write(&dvnode->validate_lock); +error: + _leave(" = %zd", ret); + return ret; +} + +/* * deal with one block in an AFS directory */ -static int afs_dir_iterate_block(struct dir_context *ctx, - union afs_dir_block *block, - unsigned blkoff) +static int afs_dir_iterate_block(struct afs_vnode *dvnode, + struct dir_context *ctx, + union afs_xdr_dir_block *block) { - union afs_dirent *dire; - unsigned offset, next, curr; + union afs_xdr_dirent *dire; + unsigned int blknum, base, hdr, pos, next, nr_slots; size_t nlen; int tmp; - _enter("%u,%x,%p,,",(unsigned)ctx->pos,blkoff,block); + blknum = ctx->pos / AFS_DIR_BLOCK_SIZE; + base = blknum * AFS_DIR_SLOTS_PER_BLOCK; + hdr = (blknum == 0 ? AFS_DIR_RESV_BLOCKS0 : AFS_DIR_RESV_BLOCKS); + pos = DIV_ROUND_UP(ctx->pos, AFS_DIR_DIRENT_SIZE) - base; - curr = (ctx->pos - blkoff) / sizeof(union afs_dirent); + _enter("%llx,%x", ctx->pos, blknum); /* walk through the block, an entry at a time */ - for (offset = AFS_DIRENT_PER_BLOCK - block->pagehdr.nentries; - offset < AFS_DIRENT_PER_BLOCK; - offset = next - ) { - next = offset + 1; - + for (unsigned int slot = hdr; slot < AFS_DIR_SLOTS_PER_BLOCK; slot = next) { /* skip entries marked unused in the bitmap */ - if (!(block->pagehdr.bitmap[offset / 8] & - (1 << (offset % 8)))) { - _debug("ENT[%Zu.%u]: unused", - blkoff / sizeof(union afs_dir_block), offset); - if (offset >= curr) - ctx->pos = blkoff + - next * sizeof(union afs_dirent); + if (!(block->hdr.bitmap[slot / 8] & + (1 << (slot % 8)))) { + _debug("ENT[%x]: Unused", base + slot); + next = slot + 1; + if (next >= pos) + ctx->pos = (base + next) * sizeof(union afs_xdr_dirent); continue; } /* got a valid entry */ - dire = &block->dirents[offset]; + dire = &block->dirents[slot]; nlen = strnlen(dire->u.name, - sizeof(*block) - - offset * sizeof(union afs_dirent)); + (unsigned long)(block + 1) - (unsigned long)dire->u.name - 1); + if (nlen > AFSNAMEMAX - 1) { + _debug("ENT[%x]: Name too long (len %zx)", + base + slot, nlen); + return afs_bad(dvnode, afs_file_error_dir_name_too_long); + } - _debug("ENT[%Zu.%u]: %s %Zu \"%s\"", - blkoff / sizeof(union afs_dir_block), offset, - (offset < curr ? "skip" : "fill"), + _debug("ENT[%x]: %s %zx \"%s\"", + base + slot, (slot < pos ? "skip" : "fill"), nlen, dire->u.name); - /* work out where the next possible entry is */ - for (tmp = nlen; tmp > 15; tmp -= sizeof(union afs_dirent)) { - if (next >= AFS_DIRENT_PER_BLOCK) { - _debug("ENT[%Zu.%u]:" - " %u travelled beyond end dir block" - " (len %u/%Zu)", - blkoff / sizeof(union afs_dir_block), - offset, next, tmp, nlen); - return -EIO; - } - if (!(block->pagehdr.bitmap[next / 8] & - (1 << (next % 8)))) { - _debug("ENT[%Zu.%u]:" - " %u unmarked extension (len %u/%Zu)", - blkoff / sizeof(union afs_dir_block), - offset, next, tmp, nlen); - return -EIO; - } + nr_slots = afs_dir_calc_slots(nlen); + next = slot + nr_slots; + if (next > AFS_DIR_SLOTS_PER_BLOCK) { + _debug("ENT[%x]: extends beyond end dir block (len %zx)", + base + slot, nlen); + return afs_bad(dvnode, afs_file_error_dir_over_end); + } - _debug("ENT[%Zu.%u]: ext %u/%Zu", - blkoff / sizeof(union afs_dir_block), - next, tmp, nlen); - next++; + /* Check that the name-extension dirents are all allocated */ + for (tmp = 1; tmp < nr_slots; tmp++) { + unsigned int xslot = slot + tmp; + + if (!(block->hdr.bitmap[xslot / 8] & (1 << (xslot % 8)))) { + _debug("ENT[%x]: Unmarked extension (%x/%x)", + base + slot, tmp, nr_slots); + return afs_bad(dvnode, afs_file_error_dir_unmarked_ext); + } } /* skip if starts before the current position */ - if (offset < curr) + if (slot < pos) { + if (next > pos) + ctx->pos = (base + next) * sizeof(union afs_xdr_dirent); continue; + } /* found the next entry */ if (!dir_emit(ctx, dire->u.name, nlen, ntohl(dire->u.vnode), - ctx->actor == afs_lookup_filldir ? + (ctx->actor == afs_lookup_filldir || + ctx->actor == afs_lookup_one_filldir)? ntohl(dire->u.unique) : DT_UNKNOWN)) { _leave(" = 0 [full]"); return 0; } - ctx->pos = blkoff + next * sizeof(union afs_dirent); + ctx->pos = (base + next) * sizeof(union afs_xdr_dirent); } _leave(" = 1 [more]"); return 1; } +struct afs_dir_iteration_ctx { + struct dir_context *dir_ctx; + int error; +}; + /* - * iterate through the data blob that lists the contents of an AFS directory + * Iterate through a kmapped directory segment. */ -static int afs_dir_iterate(struct inode *dir, struct dir_context *ctx, - struct key *key) +static size_t afs_dir_iterate_step(void *iter_base, size_t progress, size_t len, + void *priv, void *priv2) { - union afs_dir_block *dblock; - struct afs_dir_page *dbuf; - struct page *page; - unsigned blkoff, limit; + struct afs_dir_iteration_ctx *ctx = priv2; + struct afs_vnode *dvnode = priv; int ret; - _enter("{%lu},%u,,", dir->i_ino, (unsigned)ctx->pos); - - if (test_bit(AFS_VNODE_DELETED, &AFS_FS_I(dir)->flags)) { - _leave(" = -ESTALE"); - return -ESTALE; + if (WARN_ON_ONCE(progress % AFS_DIR_BLOCK_SIZE || + len % AFS_DIR_BLOCK_SIZE)) { + pr_err("Mis-iteration prog=%zx len=%zx\n", + progress % AFS_DIR_BLOCK_SIZE, + len % AFS_DIR_BLOCK_SIZE); + return len; } - /* round the file position up to the next entry boundary */ - ctx->pos += sizeof(union afs_dirent) - 1; - ctx->pos &= ~(sizeof(union afs_dirent) - 1); + do { + ret = afs_dir_iterate_block(dvnode, ctx->dir_ctx, iter_base); + if (ret != 1) + break; - /* walk through the blocks in sequence */ - ret = 0; - while (ctx->pos < dir->i_size) { - blkoff = ctx->pos & ~(sizeof(union afs_dir_block) - 1); + ctx->dir_ctx->pos = round_up(ctx->dir_ctx->pos, AFS_DIR_BLOCK_SIZE); + iter_base += AFS_DIR_BLOCK_SIZE; + len -= AFS_DIR_BLOCK_SIZE; + } while (len > 0); - /* fetch the appropriate page from the directory */ - page = afs_dir_get_page(dir, blkoff / PAGE_SIZE, key); - if (IS_ERR(page)) { - ret = PTR_ERR(page); - break; - } + return len; +} - limit = blkoff & ~(PAGE_SIZE - 1); +/* + * Iterate through the directory folios. + */ +static int afs_dir_iterate_contents(struct inode *dir, struct dir_context *dir_ctx) +{ + struct afs_dir_iteration_ctx ctx = { .dir_ctx = dir_ctx }; + struct afs_vnode *dvnode = AFS_FS_I(dir); + struct iov_iter iter; + unsigned long long i_size = i_size_read(dir); - dbuf = page_address(page); + /* Round the file position up to the next entry boundary */ + dir_ctx->pos = round_up(dir_ctx->pos, sizeof(union afs_xdr_dirent)); - /* deal with the individual blocks stashed on this page */ - do { - dblock = &dbuf->blocks[(blkoff % PAGE_SIZE) / - sizeof(union afs_dir_block)]; - ret = afs_dir_iterate_block(ctx, dblock, blkoff); - if (ret != 1) { - afs_dir_put_page(page); - goto out; - } + if (i_size <= 0 || dir_ctx->pos >= i_size) + return 0; - blkoff += sizeof(union afs_dir_block); + iov_iter_folio_queue(&iter, ITER_SOURCE, dvnode->directory, 0, 0, i_size); + iov_iter_advance(&iter, round_down(dir_ctx->pos, AFS_DIR_BLOCK_SIZE)); - } while (ctx->pos < dir->i_size && blkoff < limit); + iterate_folioq(&iter, iov_iter_count(&iter), dvnode, &ctx, + afs_dir_iterate_step); - afs_dir_put_page(page); - ret = 0; - } + if (ctx.error == -ESTALE) + afs_invalidate_dir(dvnode, afs_dir_invalid_iter_stale); + return ctx.error; +} + +/* + * iterate through the data blob that lists the contents of an AFS directory + */ +static int afs_dir_iterate(struct inode *dir, struct dir_context *ctx, + struct file *file, afs_dataversion_t *_dir_version) +{ + struct afs_vnode *dvnode = AFS_FS_I(dir); + int retry_limit = 100; + int ret; + + _enter("{%lu},%llx,,", dir->i_ino, ctx->pos); + + do { + if (--retry_limit < 0) { + pr_warn("afs_read_dir(): Too many retries\n"); + ret = -ESTALE; + break; + } + ret = afs_read_dir(dvnode, file); + if (ret < 0) { + if (ret != -ESTALE) + break; + if (test_bit(AFS_VNODE_DELETED, &AFS_FS_I(dir)->flags)) { + ret = -ESTALE; + break; + } + continue; + } + *_dir_version = inode_peek_iversion_raw(dir); + + ret = afs_dir_iterate_contents(dir, ctx); + up_read(&dvnode->validate_lock); + } while (ret == -ESTALE); -out: _leave(" = %d", ret); return ret; } @@ -382,109 +556,408 @@ out: */ static int afs_readdir(struct file *file, struct dir_context *ctx) { - return afs_dir_iterate(file_inode(file), - ctx, file->private_data); + afs_dataversion_t dir_version; + + return afs_dir_iterate(file_inode(file), ctx, file, &dir_version); } /* - * search the directory for a name + * Search the directory for a single name * - if afs_dir_iterate_block() spots this function, it'll pass the FID * uniquifier through dtype */ -static int afs_lookup_filldir(void *_cookie, const char *name, int nlen, - loff_t fpos, u64 ino, unsigned dtype) +static bool afs_lookup_one_filldir(struct dir_context *ctx, const char *name, + int nlen, loff_t fpos, u64 ino, unsigned dtype) { - struct afs_lookup_cookie *cookie = _cookie; + struct afs_lookup_one_cookie *cookie = + container_of(ctx, struct afs_lookup_one_cookie, ctx); _enter("{%s,%u},%s,%u,,%llu,%u", cookie->name.name, cookie->name.len, name, nlen, (unsigned long long) ino, dtype); /* insanity checks first */ - BUILD_BUG_ON(sizeof(union afs_dir_block) != 2048); - BUILD_BUG_ON(sizeof(union afs_dirent) != 32); + BUILD_BUG_ON(sizeof(union afs_xdr_dir_block) != 2048); + BUILD_BUG_ON(sizeof(union afs_xdr_dirent) != 32); if (cookie->name.len != nlen || memcmp(cookie->name.name, name, nlen) != 0) { - _leave(" = 0 [no]"); - return 0; + _leave(" = true [keep looking]"); + return true; } cookie->fid.vnode = ino; cookie->fid.unique = dtype; cookie->found = 1; - _leave(" = -1 [found]"); - return -1; + _leave(" = false [found]"); + return false; } /* - * do a lookup in a directory + * Do a lookup of a single name in a directory * - just returns the FID the dentry name maps to if found */ -static int afs_do_lookup(struct inode *dir, struct dentry *dentry, - struct afs_fid *fid, struct key *key) +static int afs_do_lookup_one(struct inode *dir, const struct qstr *name, + struct afs_fid *fid, + afs_dataversion_t *_dir_version) { struct afs_super_info *as = dir->i_sb->s_fs_info; - struct afs_lookup_cookie cookie = { - .ctx.actor = afs_lookup_filldir, - .name = dentry->d_name, + struct afs_lookup_one_cookie cookie = { + .ctx.actor = afs_lookup_one_filldir, + .name = *name, .fid.vid = as->volume->vid }; int ret; - _enter("{%lu},%p{%s},", dir->i_ino, dentry, dentry->d_name.name); + _enter("{%lu},{%.*s},", dir->i_ino, name->len, name->name); /* search the directory */ - ret = afs_dir_iterate(dir, &cookie.ctx, key); + ret = afs_dir_iterate(dir, &cookie.ctx, NULL, _dir_version); if (ret < 0) { _leave(" = %d [iter]", ret); return ret; } - ret = -ENOENT; if (!cookie.found) { _leave(" = -ENOENT [not found]"); return -ENOENT; } *fid = cookie.fid; - _leave(" = 0 { vn=%u u=%u }", fid->vnode, fid->unique); + _leave(" = 0 { vn=%llu u=%u }", fid->vnode, fid->unique); return 0; } /* - * Try to auto mount the mountpoint with pseudo directory, if the autocell - * operation is setted. + * search the directory for a name + * - if afs_dir_iterate_block() spots this function, it'll pass the FID + * uniquifier through dtype + */ +static bool afs_lookup_filldir(struct dir_context *ctx, const char *name, + int nlen, loff_t fpos, u64 ino, unsigned dtype) +{ + struct afs_lookup_cookie *cookie = + container_of(ctx, struct afs_lookup_cookie, ctx); + + _enter("{%s,%u},%s,%u,,%llu,%u", + cookie->name.name, cookie->name.len, name, nlen, + (unsigned long long) ino, dtype); + + /* insanity checks first */ + BUILD_BUG_ON(sizeof(union afs_xdr_dir_block) != 2048); + BUILD_BUG_ON(sizeof(union afs_xdr_dirent) != 32); + + if (cookie->nr_fids < 50) { + cookie->fids[cookie->nr_fids].vnode = ino; + cookie->fids[cookie->nr_fids].unique = dtype; + cookie->nr_fids++; + } + + return cookie->nr_fids < 50; +} + +/* + * Deal with the result of a successful lookup operation. Turn all the files + * into inodes and save the first one - which is the one we actually want. */ -static struct inode *afs_try_auto_mntpt( - int ret, struct dentry *dentry, struct inode *dir, struct key *key, - struct afs_fid *fid) +static void afs_do_lookup_success(struct afs_operation *op) { - const char *devname = dentry->d_name.name; - struct afs_vnode *vnode = AFS_FS_I(dir); + struct afs_vnode_param *vp; + struct afs_vnode *vnode; struct inode *inode; + u32 abort_code; + int i; + + _enter(""); + + for (i = 0; i < op->nr_files; i++) { + switch (i) { + case 0: + vp = &op->file[0]; + abort_code = vp->scb.status.abort_code; + if (abort_code != 0) { + op->call_abort_code = abort_code; + afs_op_set_error(op, afs_abort_to_error(abort_code)); + op->cumul_error.abort_code = abort_code; + } + break; + + case 1: + vp = &op->file[1]; + break; + + default: + vp = &op->more_files[i - 2]; + break; + } + + if (vp->scb.status.abort_code) + trace_afs_bulkstat_error(op, &vp->fid, i, vp->scb.status.abort_code); + if (!vp->scb.have_status && !vp->scb.have_error) + continue; + + _debug("do [%u]", i); + if (vp->vnode) { + if (!test_bit(AFS_VNODE_UNSET, &vp->vnode->flags)) + afs_vnode_commit_status(op, vp); + } else if (vp->scb.status.abort_code == 0) { + inode = afs_iget(op, vp); + if (!IS_ERR(inode)) { + vnode = AFS_FS_I(inode); + afs_cache_permit(vnode, op->key, + 0 /* Assume vnode->cb_break is 0 */ + + op->cb_v_break, + &vp->scb); + vp->vnode = vnode; + vp->put_vnode = true; + } + } else { + _debug("- abort %d %llx:%llx.%x", + vp->scb.status.abort_code, + vp->fid.vid, vp->fid.vnode, vp->fid.unique); + } + } + + _leave(""); +} + +static const struct afs_operation_ops afs_inline_bulk_status_operation = { + .issue_afs_rpc = afs_fs_inline_bulk_status, + .issue_yfs_rpc = yfs_fs_inline_bulk_status, + .success = afs_do_lookup_success, +}; + +static const struct afs_operation_ops afs_lookup_fetch_status_operation = { + .issue_afs_rpc = afs_fs_fetch_status, + .issue_yfs_rpc = yfs_fs_fetch_status, + .success = afs_do_lookup_success, + .aborted = afs_check_for_remote_deletion, +}; + +/* + * See if we know that the server we expect to use doesn't support + * FS.InlineBulkStatus. + */ +static bool afs_server_supports_ibulk(struct afs_vnode *dvnode) +{ + struct afs_server_list *slist; + struct afs_volume *volume = dvnode->volume; + struct afs_server *server; + bool ret = true; + int i; + + if (!test_bit(AFS_VOLUME_MAYBE_NO_IBULK, &volume->flags)) + return true; + + rcu_read_lock(); + slist = rcu_dereference(volume->servers); - _enter("%d, %p{%s}, {%x:%u}, %p", - ret, dentry, devname, vnode->fid.vid, vnode->fid.vnode, key); + for (i = 0; i < slist->nr_servers; i++) { + server = slist->servers[i].server; + if (server == dvnode->cb_server) { + if (test_bit(AFS_SERVER_FL_NO_IBULK, &server->flags)) + ret = false; + break; + } + } + + rcu_read_unlock(); + return ret; +} - if (ret != -ENOENT || - !test_bit(AFS_VNODE_AUTOCELL, &vnode->flags)) +/* + * Do a lookup in a directory. We make use of bulk lookup to query a slew of + * files in one go and create inodes for them. The inode of the file we were + * asked for is returned. + */ +static struct inode *afs_do_lookup(struct inode *dir, struct dentry *dentry) +{ + struct afs_lookup_cookie *cookie; + struct afs_vnode_param *vp; + struct afs_operation *op; + struct afs_vnode *dvnode = AFS_FS_I(dir), *vnode; + struct inode *inode = NULL, *ti; + afs_dataversion_t data_version = READ_ONCE(dvnode->status.data_version); + bool supports_ibulk, isnew; + long ret; + int i; + + _enter("{%lu},%p{%pd},", dir->i_ino, dentry, dentry); + + cookie = kzalloc(sizeof(struct afs_lookup_cookie), GFP_KERNEL); + if (!cookie) + return ERR_PTR(-ENOMEM); + + for (i = 0; i < ARRAY_SIZE(cookie->fids); i++) + cookie->fids[i].vid = dvnode->fid.vid; + cookie->ctx.actor = afs_lookup_filldir; + cookie->name = dentry->d_name; + cookie->nr_fids = 2; /* slot 1 is saved for the fid we actually want + * and slot 0 for the directory */ + + /* Search the directory for the named entry using the hash table... */ + ret = afs_dir_search(dvnode, &dentry->d_name, &cookie->fids[1], &data_version); + if (ret < 0) goto out; - inode = afs_iget_autocell(dir, devname, strlen(devname), key); - if (IS_ERR(inode)) { - ret = PTR_ERR(inode); + supports_ibulk = afs_server_supports_ibulk(dvnode); + if (supports_ibulk) { + /* ...then scan linearly from that point for entries to lookup-ahead. */ + cookie->ctx.pos = (ret + 1) * AFS_DIR_DIRENT_SIZE; + afs_dir_iterate(dir, &cookie->ctx, NULL, &data_version); + } + + dentry->d_fsdata = (void *)(unsigned long)data_version; + + /* Check to see if we already have an inode for the primary fid. */ + inode = ilookup5(dir->i_sb, cookie->fids[1].vnode, + afs_ilookup5_test_by_fid, &cookie->fids[1]); + if (inode) + goto out; /* We do */ + + /* Okay, we didn't find it. We need to query the server - and whilst + * we're doing that, we're going to attempt to look up a bunch of other + * vnodes also. + */ + op = afs_alloc_operation(NULL, dvnode->volume); + if (IS_ERR(op)) { + ret = PTR_ERR(op); goto out; } - *fid = AFS_FS_I(inode)->fid; - _leave("= %p", inode); - return inode; + afs_op_set_vnode(op, 0, dvnode); + afs_op_set_fid(op, 1, &cookie->fids[1]); + + op->nr_files = cookie->nr_fids; + _debug("nr_files %u", op->nr_files); + + /* Need space for examining all the selected files */ + if (op->nr_files > 2) { + op->more_files = kvcalloc(op->nr_files - 2, + sizeof(struct afs_vnode_param), + GFP_KERNEL); + if (!op->more_files) { + afs_op_nomem(op); + goto out_op; + } + + for (i = 2; i < op->nr_files; i++) { + vp = &op->more_files[i - 2]; + vp->fid = cookie->fids[i]; + + /* Find any inodes that already exist and get their + * callback counters. + */ + ti = ilookup5_nowait(dir->i_sb, vp->fid.vnode, + afs_ilookup5_test_by_fid, &vp->fid, &isnew); + if (!IS_ERR_OR_NULL(ti)) { + vnode = AFS_FS_I(ti); + vp->dv_before = vnode->status.data_version; + vp->cb_break_before = afs_calc_vnode_cb_break(vnode); + vp->vnode = vnode; + vp->put_vnode = true; + vp->speculative = true; /* vnode not locked */ + } + } + } + + /* Try FS.InlineBulkStatus first. Abort codes for the individual + * lookups contained therein are stored in the reply without aborting + * the whole operation. + */ + afs_op_set_error(op, -ENOTSUPP); + if (supports_ibulk) { + op->ops = &afs_inline_bulk_status_operation; + afs_begin_vnode_operation(op); + afs_wait_for_operation(op); + } + if (afs_op_error(op) == -ENOTSUPP) { + /* We could try FS.BulkStatus next, but this aborts the entire + * op if any of the lookups fails - so, for the moment, revert + * to FS.FetchStatus for op->file[1]. + */ + op->fetch_status.which = 1; + op->ops = &afs_lookup_fetch_status_operation; + afs_begin_vnode_operation(op); + afs_wait_for_operation(op); + } + +out_op: + if (!afs_op_error(op)) { + if (op->file[1].scb.status.abort_code) { + afs_op_accumulate_error(op, -ECONNABORTED, + op->file[1].scb.status.abort_code); + } else { + inode = &op->file[1].vnode->netfs.inode; + op->file[1].vnode = NULL; + } + } + + if (op->file[0].scb.have_status) + dentry->d_fsdata = (void *)(unsigned long)op->file[0].scb.status.data_version; + else + dentry->d_fsdata = (void *)(unsigned long)op->file[0].dv_before; + ret = afs_put_operation(op); out: - _leave("= %d", ret); - return ERR_PTR(ret); + kfree(cookie); + _leave(""); + return inode ?: ERR_PTR(ret); +} + +/* + * Look up an entry in a directory with @sys substitution. + */ +static struct dentry *afs_lookup_atsys(struct inode *dir, struct dentry *dentry) +{ + struct afs_sysnames *subs; + struct afs_net *net = afs_i2net(dir); + struct dentry *ret; + char *buf, *p, *name; + int len, i; + + _enter(""); + + ret = ERR_PTR(-ENOMEM); + p = buf = kmalloc(AFSNAMEMAX, GFP_KERNEL); + if (!buf) + goto out_p; + if (dentry->d_name.len > 4) { + memcpy(p, dentry->d_name.name, dentry->d_name.len - 4); + p += dentry->d_name.len - 4; + } + + /* There is an ordered list of substitutes that we have to try. */ + read_lock(&net->sysnames_lock); + subs = net->sysnames; + refcount_inc(&subs->usage); + read_unlock(&net->sysnames_lock); + + for (i = 0; i < subs->nr; i++) { + name = subs->subs[i]; + len = dentry->d_name.len - 4 + strlen(name); + if (len >= AFSNAMEMAX) { + ret = ERR_PTR(-ENAMETOOLONG); + goto out_s; + } + + strcpy(p, name); + ret = lookup_noperm(&QSTR(buf), dentry->d_parent); + if (IS_ERR(ret) || d_is_positive(ret)) + goto out_s; + dput(ret); + } + + /* We don't want to d_add() the @sys dentry here as we don't want to + * the cached dentry to hide changes to the sysnames list. + */ + ret = NULL; +out_s: + afs_put_sysnames(subs); + kfree(buf); +out_p: + return ret; } /* @@ -493,79 +966,89 @@ out: static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { - struct afs_vnode *vnode; - struct afs_fid fid; + struct afs_vnode *dvnode = AFS_FS_I(dir); + struct afs_fid fid = {}; struct inode *inode; - struct key *key; + struct dentry *d; int ret; - vnode = AFS_FS_I(dir); - - _enter("{%x:%u},%p{%s},", - vnode->fid.vid, vnode->fid.vnode, dentry, dentry->d_name.name); + _enter("{%llx:%llu},%p{%pd},", + dvnode->fid.vid, dvnode->fid.vnode, dentry, dentry); - ASSERTCMP(dentry->d_inode, ==, NULL); + ASSERTCMP(d_inode(dentry), ==, NULL); if (dentry->d_name.len >= AFSNAMEMAX) { _leave(" = -ENAMETOOLONG"); return ERR_PTR(-ENAMETOOLONG); } - if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) { + if (test_bit(AFS_VNODE_DELETED, &dvnode->flags)) { _leave(" = -ESTALE"); return ERR_PTR(-ESTALE); } - key = afs_request_key(vnode->volume->cell); - if (IS_ERR(key)) { - _leave(" = %ld [key]", PTR_ERR(key)); - return ERR_CAST(key); - } - - ret = afs_validate(vnode, key); + ret = afs_validate(dvnode, NULL); if (ret < 0) { - key_put(key); + afs_dir_unuse_cookie(dvnode, ret); _leave(" = %d [val]", ret); return ERR_PTR(ret); } - ret = afs_do_lookup(dir, dentry, &fid, key); - if (ret < 0) { - inode = afs_try_auto_mntpt(ret, dentry, dir, key, &fid); - if (!IS_ERR(inode)) { - key_put(key); - goto success; - } - - ret = PTR_ERR(inode); - key_put(key); - if (ret == -ENOENT) { - d_add(dentry, NULL); - _leave(" = NULL [negative]"); - return NULL; - } - _leave(" = %d [do]", ret); - return ERR_PTR(ret); + if (dentry->d_name.len >= 4 && + dentry->d_name.name[dentry->d_name.len - 4] == '@' && + dentry->d_name.name[dentry->d_name.len - 3] == 's' && + dentry->d_name.name[dentry->d_name.len - 2] == 'y' && + dentry->d_name.name[dentry->d_name.len - 1] == 's') + return afs_lookup_atsys(dir, dentry); + + afs_stat_v(dvnode, n_lookup); + inode = afs_do_lookup(dir, dentry); + if (inode == ERR_PTR(-ENOENT)) + inode = NULL; + else if (!IS_ERR_OR_NULL(inode)) + fid = AFS_FS_I(inode)->fid; + + _debug("splice %p", dentry->d_inode); + d = d_splice_alias(inode, dentry); + if (!IS_ERR_OR_NULL(d)) { + d->d_fsdata = dentry->d_fsdata; + trace_afs_lookup(dvnode, &d->d_name, &fid); + } else { + trace_afs_lookup(dvnode, &dentry->d_name, &fid); } - dentry->d_fsdata = (void *)(unsigned long) vnode->status.data_version; + _leave(""); + return d; +} - /* instantiate the dentry */ - inode = afs_iget(dir->i_sb, key, &fid, NULL, NULL); - key_put(key); - if (IS_ERR(inode)) { - _leave(" = %ld", PTR_ERR(inode)); - return ERR_CAST(inode); - } +/* + * Check the validity of a dentry under RCU conditions. + */ +static int afs_d_revalidate_rcu(struct afs_vnode *dvnode, struct dentry *dentry) +{ + long dir_version, de_version; + + _enter("%p", dentry); + + if (test_bit(AFS_VNODE_DELETED, &dvnode->flags)) + return -ECHILD; -success: - d_add(dentry, inode); - _leave(" = 0 { vn=%u u=%u } -> { ino=%lu v=%u }", - fid.vnode, - fid.unique, - dentry->d_inode->i_ino, - dentry->d_inode->i_generation); + if (!afs_check_validity(dvnode)) + return -ECHILD; - return NULL; + /* We only need to invalidate a dentry if the server's copy changed + * behind our back. If we made the change, it's no problem. Note that + * on a 32-bit system, we only have 32 bits in the dentry to store the + * version. + */ + dir_version = (long)READ_ONCE(dvnode->status.data_version); + de_version = (long)READ_ONCE(dentry->d_fsdata); + if (de_version != dir_version) { + dir_version = (long)READ_ONCE(dvnode->invalid_before); + if (de_version - dir_version < 0) + return -ECHILD; + } + + return 1; /* Still valid */ } /* @@ -573,71 +1056,83 @@ success: * - NOTE! the hit can be a negative hit too, so we can't assume we have an * inode */ -static int afs_d_revalidate(struct dentry *dentry, unsigned int flags) +static int afs_d_revalidate(struct inode *parent_dir, const struct qstr *name, + struct dentry *dentry, unsigned int flags) { - struct afs_vnode *vnode, *dir; - struct afs_fid uninitialized_var(fid); - struct dentry *parent; + struct afs_vnode *vnode, *dir = AFS_FS_I(parent_dir); + struct afs_fid fid; + struct inode *inode; struct key *key; - void *dir_version; + afs_dataversion_t dir_version, invalid_before; + long de_version; int ret; if (flags & LOOKUP_RCU) - return -ECHILD; + return afs_d_revalidate_rcu(dir, dentry); - vnode = AFS_FS_I(dentry->d_inode); - - if (dentry->d_inode) - _enter("{v={%x:%u} n=%s fl=%lx},", - vnode->fid.vid, vnode->fid.vnode, dentry->d_name.name, + if (d_really_is_positive(dentry)) { + vnode = AFS_FS_I(d_inode(dentry)); + _enter("{v={%llx:%llu} n=%pd fl=%lx},", + vnode->fid.vid, vnode->fid.vnode, dentry, vnode->flags); - else - _enter("{neg n=%s}", dentry->d_name.name); + } else { + _enter("{neg n=%pd}", dentry); + } key = afs_request_key(AFS_FS_S(dentry->d_sb)->volume->cell); if (IS_ERR(key)) key = NULL; - /* lock down the parent dentry so we can peer at it */ - parent = dget_parent(dentry); - if (!parent->d_inode) - goto out_bad; - - dir = AFS_FS_I(parent->d_inode); - /* validate the parent directory */ - if (test_bit(AFS_VNODE_MODIFIED, &dir->flags)) - afs_validate(dir, key); + ret = afs_validate(dir, key); + if (ret == -ERESTARTSYS) { + key_put(key); + return ret; + } if (test_bit(AFS_VNODE_DELETED, &dir->flags)) { - _debug("%s: parent dir deleted", dentry->d_name.name); - goto out_bad; + _debug("%pd: parent dir deleted", dentry); + goto not_found; } - dir_version = (void *) (unsigned long) dir->status.data_version; - if (dentry->d_fsdata == dir_version) - goto out_valid; /* the dir contents are unchanged */ + /* We only need to invalidate a dentry if the server's copy changed + * behind our back. If we made the change, it's no problem. Note that + * on a 32-bit system, we only have 32 bits in the dentry to store the + * version. + */ + dir_version = dir->status.data_version; + de_version = (long)dentry->d_fsdata; + if (de_version == (long)dir_version) + goto out_valid_noupdate; + + invalid_before = dir->invalid_before; + if (de_version - (long)invalid_before >= 0) + goto out_valid; _debug("dir modified"); + afs_stat_v(dir, n_reval); /* search the directory for this vnode */ - ret = afs_do_lookup(&dir->vfs_inode, dentry, &fid, key); + ret = afs_do_lookup_one(&dir->netfs.inode, name, &fid, &dir_version); switch (ret) { case 0: /* the filename maps to something */ - if (!dentry->d_inode) - goto out_bad; - if (is_bad_inode(dentry->d_inode)) { - printk("kAFS: afs_d_revalidate: %s/%s has bad inode\n", - parent->d_name.name, dentry->d_name.name); - goto out_bad; + if (d_really_is_negative(dentry)) + goto not_found; + inode = d_inode(dentry); + if (is_bad_inode(inode)) { + printk("kAFS: afs_d_revalidate: %pd2 has bad inode\n", + dentry); + goto not_found; } + vnode = AFS_FS_I(inode); + /* if the vnode ID has changed, then the dirent points to a * different file */ if (fid.vnode != vnode->fid.vnode) { - _debug("%s: dirent changed [%u != %u]", - dentry->d_name.name, fid.vnode, + _debug("%pd: dirent changed [%llu != %llu]", + dentry, fid.vnode, vnode->fid.vnode); goto not_found; } @@ -646,56 +1141,35 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags) * been deleted and replaced, and the original vnode ID has * been reused */ if (fid.unique != vnode->fid.unique) { - _debug("%s: file deleted (uq %u -> %u I:%u)", - dentry->d_name.name, fid.unique, + _debug("%pd: file deleted (uq %u -> %u I:%u)", + dentry, fid.unique, vnode->fid.unique, - dentry->d_inode->i_generation); - spin_lock(&vnode->lock); - set_bit(AFS_VNODE_DELETED, &vnode->flags); - spin_unlock(&vnode->lock); + vnode->netfs.inode.i_generation); goto not_found; } goto out_valid; case -ENOENT: /* the filename is unknown */ - _debug("%s: dirent not found", dentry->d_name.name); - if (dentry->d_inode) + _debug("%pd: dirent not found", dentry); + if (d_really_is_positive(dentry)) goto not_found; goto out_valid; default: - _debug("failed to iterate dir %s: %d", - parent->d_name.name, ret); - goto out_bad; + _debug("failed to iterate parent %pd2: %d", dentry, ret); + goto not_found; } out_valid: - dentry->d_fsdata = dir_version; -out_skip: - dput(parent); + dentry->d_fsdata = (void *)(unsigned long)dir_version; +out_valid_noupdate: key_put(key); _leave(" = 1 [valid]"); return 1; - /* the dirent, if it exists, now points to a different vnode */ not_found: - spin_lock(&dentry->d_lock); - dentry->d_flags |= DCACHE_NFSFS_RENAMED; - spin_unlock(&dentry->d_lock); - -out_bad: - if (dentry->d_inode) { - /* don't unhash if we have submounts */ - if (have_submounts(dentry)) - goto out_skip; - } - - _debug("dropping dentry %s/%s", - parent->d_name.name, dentry->d_name.name); - shrink_dcache_parent(dentry); - d_drop(dentry); - dput(parent); + _debug("dropping dentry %pd2", dentry); key_put(key); _leave(" = 0 [bad]"); @@ -710,14 +1184,14 @@ out_bad: */ static int afs_d_delete(const struct dentry *dentry) { - _enter("%s", dentry->d_name.name); + _enter("%pd", dentry); if (dentry->d_flags & DCACHE_NFSFS_RENAMED) goto zap; - if (dentry->d_inode && - (test_bit(AFS_VNODE_DELETED, &AFS_FS_I(dentry->d_inode)->flags) || - test_bit(AFS_VNODE_PSEUDODIR, &AFS_FS_I(dentry->d_inode)->flags))) + if (d_really_is_positive(dentry) && + (test_bit(AFS_VNODE_DELETED, &AFS_FS_I(d_inode(dentry))->flags) || + test_bit(AFS_VNODE_PSEUDODIR, &AFS_FS_I(d_inode(dentry))->flags))) goto zap; _leave(" = 0 [keep]"); @@ -729,332 +1203,588 @@ zap: } /* - * handle dentry release + * Clean up sillyrename files on dentry removal. */ -static void afs_d_release(struct dentry *dentry) +static void afs_d_iput(struct dentry *dentry, struct inode *inode) { - _enter("%s", dentry->d_name.name); + if (dentry->d_flags & DCACHE_NFSFS_RENAMED) + afs_silly_iput(dentry, inode); + iput(inode); } /* - * create a directory on an AFS filesystem + * handle dentry release */ -static int afs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) +void afs_d_release(struct dentry *dentry) { - struct afs_file_status status; - struct afs_callback cb; - struct afs_server *server; - struct afs_vnode *dvnode, *vnode; - struct afs_fid fid; - struct inode *inode; - struct key *key; - int ret; + _enter("%pd", dentry); +} - dvnode = AFS_FS_I(dir); +void afs_check_for_remote_deletion(struct afs_operation *op) +{ + struct afs_vnode *vnode = op->file[0].vnode; - _enter("{%x:%u},{%s},%ho", - dvnode->fid.vid, dvnode->fid.vnode, dentry->d_name.name, mode); + switch (afs_op_abort_code(op)) { + case VNOVNODE: + set_bit(AFS_VNODE_DELETED, &vnode->flags); + clear_nlink(&vnode->netfs.inode); + afs_break_callback(vnode, afs_cb_break_for_deleted); + } +} - ret = -ENAMETOOLONG; - if (dentry->d_name.len >= AFSNAMEMAX) - goto error; +/* + * Create a new inode for create/mkdir/symlink + */ +static void afs_vnode_new_inode(struct afs_operation *op) +{ + struct afs_vnode_param *dvp = &op->file[0]; + struct afs_vnode_param *vp = &op->file[1]; + struct afs_vnode *vnode; + struct inode *inode; - key = afs_request_key(dvnode->volume->cell); - if (IS_ERR(key)) { - ret = PTR_ERR(key); - goto error; - } + _enter(""); - mode |= S_IFDIR; - ret = afs_vnode_create(dvnode, key, dentry->d_name.name, - mode, &fid, &status, &cb, &server); - if (ret < 0) - goto mkdir_error; + ASSERTCMP(afs_op_error(op), ==, 0); - inode = afs_iget(dir->i_sb, key, &fid, &status, &cb); + inode = afs_iget(op, vp); if (IS_ERR(inode)) { - /* ENOMEM at a really inconvenient time - just abandon the new - * directory on the server */ - ret = PTR_ERR(inode); - goto iget_error; + /* ENOMEM or EINTR at a really inconvenient time - just abandon + * the new directory on the server. + */ + afs_op_accumulate_error(op, PTR_ERR(inode), 0); + return; } - /* apply the status report we've got for the new vnode */ vnode = AFS_FS_I(inode); - spin_lock(&vnode->lock); - vnode->update_cnt++; - spin_unlock(&vnode->lock); - afs_vnode_finalise_status_update(vnode, server); - afs_put_server(server); - - d_instantiate(dentry, inode); - if (d_unhashed(dentry)) { - _debug("not hashed"); - d_rehash(dentry); + set_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags); + if (S_ISDIR(inode->i_mode)) + afs_mkdir_init_dir(vnode, dvp->vnode); + else if (S_ISLNK(inode->i_mode)) + afs_init_new_symlink(vnode, op); + if (!afs_op_error(op)) + afs_cache_permit(vnode, op->key, vnode->cb_break, &vp->scb); + d_instantiate(op->dentry, inode); +} + +static void afs_create_success(struct afs_operation *op) +{ + _enter("op=%08x", op->debug_id); + op->ctime = op->file[0].scb.status.mtime_client; + afs_vnode_commit_status(op, &op->file[0]); + afs_update_dentry_version(op, &op->file[0], op->dentry); + afs_vnode_new_inode(op); +} + +static void afs_create_edit_dir(struct afs_operation *op) +{ + struct netfs_cache_resources cres = {}; + struct afs_vnode_param *dvp = &op->file[0]; + struct afs_vnode_param *vp = &op->file[1]; + struct afs_vnode *dvnode = dvp->vnode; + + _enter("op=%08x", op->debug_id); + + fscache_begin_write_operation(&cres, afs_vnode_cache(dvnode)); + down_write(&dvnode->validate_lock); + if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags) && + dvnode->status.data_version == dvp->dv_before + dvp->dv_delta) + afs_edit_dir_add(dvnode, &op->dentry->d_name, &vp->fid, + op->create.reason); + up_write(&dvnode->validate_lock); + fscache_end_operation(&cres); +} + +static void afs_create_put(struct afs_operation *op) +{ + _enter("op=%08x", op->debug_id); + + if (afs_op_error(op)) + d_drop(op->dentry); +} + +static const struct afs_operation_ops afs_mkdir_operation = { + .issue_afs_rpc = afs_fs_make_dir, + .issue_yfs_rpc = yfs_fs_make_dir, + .success = afs_create_success, + .aborted = afs_check_for_remote_deletion, + .edit_dir = afs_create_edit_dir, + .put = afs_create_put, +}; + +/* + * create a directory on an AFS filesystem + */ +static struct dentry *afs_mkdir(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode) +{ + struct afs_operation *op; + struct afs_vnode *dvnode = AFS_FS_I(dir); + int ret; + + _enter("{%llx:%llu},{%pd},%ho", + dvnode->fid.vid, dvnode->fid.vnode, dentry, mode); + + op = afs_alloc_operation(NULL, dvnode->volume); + if (IS_ERR(op)) { + d_drop(dentry); + return ERR_CAST(op); } - key_put(key); - _leave(" = 0"); - return 0; -iget_error: - afs_put_server(server); -mkdir_error: - key_put(key); -error: - d_drop(dentry); - _leave(" = %d", ret); - return ret; + fscache_use_cookie(afs_vnode_cache(dvnode), true); + + afs_op_set_vnode(op, 0, dvnode); + op->file[0].dv_delta = 1; + op->file[0].modification = true; + op->file[0].update_ctime = true; + op->dentry = dentry; + op->create.mode = S_IFDIR | mode; + op->create.reason = afs_edit_dir_for_mkdir; + op->mtime = current_time(dir); + op->ops = &afs_mkdir_operation; + ret = afs_do_sync_operation(op); + afs_dir_unuse_cookie(dvnode, ret); + return ERR_PTR(ret); +} + +/* + * Remove a subdir from a directory. + */ +static void afs_dir_remove_subdir(struct dentry *dentry) +{ + if (d_really_is_positive(dentry)) { + struct afs_vnode *vnode = AFS_FS_I(d_inode(dentry)); + + clear_nlink(&vnode->netfs.inode); + set_bit(AFS_VNODE_DELETED, &vnode->flags); + afs_clear_cb_promise(vnode, afs_cb_promise_clear_rmdir); + afs_invalidate_dir(vnode, afs_dir_invalid_subdir_removed); + } } +static void afs_rmdir_success(struct afs_operation *op) +{ + _enter("op=%08x", op->debug_id); + op->ctime = op->file[0].scb.status.mtime_client; + afs_vnode_commit_status(op, &op->file[0]); + afs_update_dentry_version(op, &op->file[0], op->dentry); +} + +static void afs_rmdir_edit_dir(struct afs_operation *op) +{ + struct netfs_cache_resources cres = {}; + struct afs_vnode_param *dvp = &op->file[0]; + struct afs_vnode *dvnode = dvp->vnode; + + _enter("op=%08x", op->debug_id); + afs_dir_remove_subdir(op->dentry); + + fscache_begin_write_operation(&cres, afs_vnode_cache(dvnode)); + down_write(&dvnode->validate_lock); + if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags) && + dvnode->status.data_version == dvp->dv_before + dvp->dv_delta) + afs_edit_dir_remove(dvnode, &op->dentry->d_name, + afs_edit_dir_for_rmdir); + up_write(&dvnode->validate_lock); + fscache_end_operation(&cres); +} + +static void afs_rmdir_put(struct afs_operation *op) +{ + _enter("op=%08x", op->debug_id); + if (op->file[1].vnode) + up_write(&op->file[1].vnode->rmdir_lock); +} + +static const struct afs_operation_ops afs_rmdir_operation = { + .issue_afs_rpc = afs_fs_remove_dir, + .issue_yfs_rpc = yfs_fs_remove_dir, + .success = afs_rmdir_success, + .aborted = afs_check_for_remote_deletion, + .edit_dir = afs_rmdir_edit_dir, + .put = afs_rmdir_put, +}; + /* * remove a directory from an AFS filesystem */ static int afs_rmdir(struct inode *dir, struct dentry *dentry) { - struct afs_vnode *dvnode, *vnode; - struct key *key; + struct afs_operation *op; + struct afs_vnode *dvnode = AFS_FS_I(dir), *vnode = NULL; int ret; - dvnode = AFS_FS_I(dir); + _enter("{%llx:%llu},{%pd}", + dvnode->fid.vid, dvnode->fid.vnode, dentry); - _enter("{%x:%u},{%s}", - dvnode->fid.vid, dvnode->fid.vnode, dentry->d_name.name); + op = afs_alloc_operation(NULL, dvnode->volume); + if (IS_ERR(op)) + return PTR_ERR(op); - ret = -ENAMETOOLONG; - if (dentry->d_name.len >= AFSNAMEMAX) - goto error; + fscache_use_cookie(afs_vnode_cache(dvnode), true); - key = afs_request_key(dvnode->volume->cell); - if (IS_ERR(key)) { - ret = PTR_ERR(key); - goto error; - } + afs_op_set_vnode(op, 0, dvnode); + op->file[0].dv_delta = 1; + op->file[0].modification = true; + op->file[0].update_ctime = true; - ret = afs_vnode_remove(dvnode, key, dentry->d_name.name, true); - if (ret < 0) - goto rmdir_error; + op->dentry = dentry; + op->ops = &afs_rmdir_operation; - if (dentry->d_inode) { - vnode = AFS_FS_I(dentry->d_inode); - clear_nlink(&vnode->vfs_inode); - set_bit(AFS_VNODE_DELETED, &vnode->flags); - afs_discard_callback_on_delete(vnode); + /* Try to make sure we have a callback promise on the victim. */ + if (d_really_is_positive(dentry)) { + vnode = AFS_FS_I(d_inode(dentry)); + ret = afs_validate(vnode, op->key); + if (ret < 0) + goto error; } - key_put(key); - _leave(" = 0"); - return 0; + if (vnode) { + ret = down_write_killable(&vnode->rmdir_lock); + if (ret < 0) + goto error; + op->file[1].vnode = vnode; + } -rmdir_error: - key_put(key); -error: - _leave(" = %d", ret); + ret = afs_do_sync_operation(op); + + /* Not all systems that can host afs servers have ENOTEMPTY. */ + if (ret == -EEXIST) + ret = -ENOTEMPTY; +out: + afs_dir_unuse_cookie(dvnode, ret); return ret; + +error: + ret = afs_put_operation(op); + goto out; } /* - * remove a file from an AFS filesystem + * Remove a link to a file or symlink from a directory. + * + * If the file was not deleted due to excess hard links, the fileserver will + * break the callback promise on the file - if it had one - before it returns + * to us, and if it was deleted, it won't + * + * However, if we didn't have a callback promise outstanding, or it was + * outstanding on a different server, then it won't break it either... */ -static int afs_unlink(struct inode *dir, struct dentry *dentry) +static void afs_dir_remove_link(struct afs_operation *op) { - struct afs_vnode *dvnode, *vnode; - struct key *key; + struct afs_vnode *dvnode = op->file[0].vnode; + struct afs_vnode *vnode = op->file[1].vnode; + struct dentry *dentry = op->dentry; int ret; - dvnode = AFS_FS_I(dir); + if (afs_op_error(op) || + (op->file[1].scb.have_status && op->file[1].scb.have_error)) + return; + if (d_really_is_positive(dentry)) + return; - _enter("{%x:%u},{%s}", - dvnode->fid.vid, dvnode->fid.vnode, dentry->d_name.name); + if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) { + /* Already done */ + } else if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags)) { + write_seqlock(&vnode->cb_lock); + drop_nlink(&vnode->netfs.inode); + if (vnode->netfs.inode.i_nlink == 0) { + set_bit(AFS_VNODE_DELETED, &vnode->flags); + __afs_break_callback(vnode, afs_cb_break_for_unlink); + } + write_sequnlock(&vnode->cb_lock); + } else { + afs_break_callback(vnode, afs_cb_break_for_unlink); - ret = -ENAMETOOLONG; - if (dentry->d_name.len >= AFSNAMEMAX) - goto error; + if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) + _debug("AFS_VNODE_DELETED"); - key = afs_request_key(dvnode->volume->cell); - if (IS_ERR(key)) { - ret = PTR_ERR(key); - goto error; + ret = afs_validate(vnode, op->key); + if (ret != -ESTALE) + afs_op_set_error(op, ret); } - if (dentry->d_inode) { - vnode = AFS_FS_I(dentry->d_inode); + _debug("nlink %d [val %d]", vnode->netfs.inode.i_nlink, afs_op_error(op)); +} + +static void afs_unlink_success(struct afs_operation *op) +{ + _enter("op=%08x", op->debug_id); + op->ctime = op->file[0].scb.status.mtime_client; + afs_check_dir_conflict(op, &op->file[0]); + afs_vnode_commit_status(op, &op->file[0]); + afs_vnode_commit_status(op, &op->file[1]); + afs_update_dentry_version(op, &op->file[0], op->dentry); + afs_dir_remove_link(op); +} - /* make sure we have a callback promise on the victim */ - ret = afs_validate(vnode, key); - if (ret < 0) - goto error; +static void afs_unlink_edit_dir(struct afs_operation *op) +{ + struct netfs_cache_resources cres = {}; + struct afs_vnode_param *dvp = &op->file[0]; + struct afs_vnode *dvnode = dvp->vnode; + + _enter("op=%08x", op->debug_id); + fscache_begin_write_operation(&cres, afs_vnode_cache(dvnode)); + down_write(&dvnode->validate_lock); + if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags) && + dvnode->status.data_version == dvp->dv_before + dvp->dv_delta) + afs_edit_dir_remove(dvnode, &op->dentry->d_name, + afs_edit_dir_for_unlink); + up_write(&dvnode->validate_lock); + fscache_end_operation(&cres); +} + +static void afs_unlink_put(struct afs_operation *op) +{ + _enter("op=%08x", op->debug_id); + if (op->unlink.need_rehash && afs_op_error(op) < 0 && afs_op_error(op) != -ENOENT) + d_rehash(op->dentry); +} + +static const struct afs_operation_ops afs_unlink_operation = { + .issue_afs_rpc = afs_fs_remove_file, + .issue_yfs_rpc = yfs_fs_remove_file, + .success = afs_unlink_success, + .aborted = afs_check_for_remote_deletion, + .edit_dir = afs_unlink_edit_dir, + .put = afs_unlink_put, +}; + +/* + * Remove a file or symlink from an AFS filesystem. + */ +static int afs_unlink(struct inode *dir, struct dentry *dentry) +{ + struct afs_operation *op; + struct afs_vnode *dvnode = AFS_FS_I(dir); + struct afs_vnode *vnode = AFS_FS_I(d_inode(dentry)); + int ret; + + _enter("{%llx:%llu},{%pd}", + dvnode->fid.vid, dvnode->fid.vnode, dentry); + + if (dentry->d_name.len >= AFSNAMEMAX) + return -ENAMETOOLONG; + + op = afs_alloc_operation(NULL, dvnode->volume); + if (IS_ERR(op)) + return PTR_ERR(op); + + fscache_use_cookie(afs_vnode_cache(dvnode), true); + + afs_op_set_vnode(op, 0, dvnode); + op->file[0].dv_delta = 1; + op->file[0].modification = true; + op->file[0].update_ctime = true; + + /* Try to make sure we have a callback promise on the victim. */ + ret = afs_validate(vnode, op->key); + if (ret < 0) { + afs_op_set_error(op, ret); + goto error; } - ret = afs_vnode_remove(dvnode, key, dentry->d_name.name, false); - if (ret < 0) - goto remove_error; - - if (dentry->d_inode) { - /* if the file wasn't deleted due to excess hard links, the - * fileserver will break the callback promise on the file - if - * it had one - before it returns to us, and if it was deleted, - * it won't - * - * however, if we didn't have a callback promise outstanding, - * or it was outstanding on a different server, then it won't - * break it either... - */ - vnode = AFS_FS_I(dentry->d_inode); - if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) - _debug("AFS_VNODE_DELETED"); - if (test_bit(AFS_VNODE_CB_BROKEN, &vnode->flags)) - _debug("AFS_VNODE_CB_BROKEN"); - set_bit(AFS_VNODE_CB_BROKEN, &vnode->flags); - ret = afs_validate(vnode, key); - _debug("nlink %d [val %d]", vnode->vfs_inode.i_nlink, ret); + spin_lock(&dentry->d_lock); + if (d_count(dentry) > 1) { + spin_unlock(&dentry->d_lock); + /* Start asynchronous writeout of the inode */ + write_inode_now(d_inode(dentry), 0); + afs_op_set_error(op, afs_sillyrename(dvnode, vnode, dentry, op->key)); + goto error; + } + if (!d_unhashed(dentry)) { + /* Prevent a race with RCU lookup. */ + __d_drop(dentry); + op->unlink.need_rehash = true; } + spin_unlock(&dentry->d_lock); - key_put(key); - _leave(" = 0"); - return 0; + op->file[1].vnode = vnode; + op->file[1].update_ctime = true; + op->file[1].op_unlinked = true; + op->dentry = dentry; + op->ops = &afs_unlink_operation; + afs_begin_vnode_operation(op); + afs_wait_for_operation(op); + + /* If there was a conflict with a third party, check the status of the + * unlinked vnode. + */ + if (afs_op_error(op) == 0 && (op->flags & AFS_OPERATION_DIR_CONFLICT)) { + op->file[1].update_ctime = false; + op->fetch_status.which = 1; + op->ops = &afs_fetch_status_operation; + afs_begin_vnode_operation(op); + afs_wait_for_operation(op); + } -remove_error: - key_put(key); error: - _leave(" = %d", ret); + ret = afs_put_operation(op); + afs_dir_unuse_cookie(dvnode, ret); return ret; } +static const struct afs_operation_ops afs_create_operation = { + .issue_afs_rpc = afs_fs_create_file, + .issue_yfs_rpc = yfs_fs_create_file, + .success = afs_create_success, + .aborted = afs_check_for_remote_deletion, + .edit_dir = afs_create_edit_dir, + .put = afs_create_put, +}; + /* * create a regular file on an AFS filesystem */ -static int afs_create(struct inode *dir, struct dentry *dentry, umode_t mode, - bool excl) +static int afs_create(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode, bool excl) { - struct afs_file_status status; - struct afs_callback cb; - struct afs_server *server; - struct afs_vnode *dvnode, *vnode; - struct afs_fid fid; - struct inode *inode; - struct key *key; - int ret; + struct afs_operation *op; + struct afs_vnode *dvnode = AFS_FS_I(dir); + int ret = -ENAMETOOLONG; - dvnode = AFS_FS_I(dir); + _enter("{%llx:%llu},{%pd},%ho", + dvnode->fid.vid, dvnode->fid.vnode, dentry, mode); - _enter("{%x:%u},{%s},%ho,", - dvnode->fid.vid, dvnode->fid.vnode, dentry->d_name.name, mode); - - ret = -ENAMETOOLONG; if (dentry->d_name.len >= AFSNAMEMAX) goto error; - key = afs_request_key(dvnode->volume->cell); - if (IS_ERR(key)) { - ret = PTR_ERR(key); + op = afs_alloc_operation(NULL, dvnode->volume); + if (IS_ERR(op)) { + ret = PTR_ERR(op); goto error; } - mode |= S_IFREG; - ret = afs_vnode_create(dvnode, key, dentry->d_name.name, - mode, &fid, &status, &cb, &server); - if (ret < 0) - goto create_error; + fscache_use_cookie(afs_vnode_cache(dvnode), true); - inode = afs_iget(dir->i_sb, key, &fid, &status, &cb); - if (IS_ERR(inode)) { - /* ENOMEM at a really inconvenient time - just abandon the new - * directory on the server */ - ret = PTR_ERR(inode); - goto iget_error; - } + afs_op_set_vnode(op, 0, dvnode); + op->file[0].dv_delta = 1; + op->file[0].modification = true; + op->file[0].update_ctime = true; - /* apply the status report we've got for the new vnode */ - vnode = AFS_FS_I(inode); - spin_lock(&vnode->lock); - vnode->update_cnt++; - spin_unlock(&vnode->lock); - afs_vnode_finalise_status_update(vnode, server); - afs_put_server(server); - - d_instantiate(dentry, inode); - if (d_unhashed(dentry)) { - _debug("not hashed"); - d_rehash(dentry); - } - key_put(key); - _leave(" = 0"); - return 0; + op->dentry = dentry; + op->create.mode = S_IFREG | mode; + op->create.reason = afs_edit_dir_for_create; + op->mtime = current_time(dir); + op->ops = &afs_create_operation; + ret = afs_do_sync_operation(op); + afs_dir_unuse_cookie(dvnode, ret); + return ret; -iget_error: - afs_put_server(server); -create_error: - key_put(key); error: d_drop(dentry); _leave(" = %d", ret); return ret; } +static void afs_link_success(struct afs_operation *op) +{ + struct afs_vnode_param *dvp = &op->file[0]; + struct afs_vnode_param *vp = &op->file[1]; + + _enter("op=%08x", op->debug_id); + op->ctime = dvp->scb.status.mtime_client; + afs_vnode_commit_status(op, dvp); + afs_vnode_commit_status(op, vp); + afs_update_dentry_version(op, dvp, op->dentry); + if (op->dentry_2->d_parent == op->dentry->d_parent) + afs_update_dentry_version(op, dvp, op->dentry_2); + ihold(&vp->vnode->netfs.inode); + d_instantiate(op->dentry, &vp->vnode->netfs.inode); +} + +static void afs_link_put(struct afs_operation *op) +{ + _enter("op=%08x", op->debug_id); + if (afs_op_error(op)) + d_drop(op->dentry); +} + +static const struct afs_operation_ops afs_link_operation = { + .issue_afs_rpc = afs_fs_link, + .issue_yfs_rpc = yfs_fs_link, + .success = afs_link_success, + .aborted = afs_check_for_remote_deletion, + .edit_dir = afs_create_edit_dir, + .put = afs_link_put, +}; + /* * create a hard link between files in an AFS filesystem */ static int afs_link(struct dentry *from, struct inode *dir, struct dentry *dentry) { - struct afs_vnode *dvnode, *vnode; - struct key *key; - int ret; + struct afs_operation *op; + struct afs_vnode *dvnode = AFS_FS_I(dir); + struct afs_vnode *vnode = AFS_FS_I(d_inode(from)); + int ret = -ENAMETOOLONG; - vnode = AFS_FS_I(from->d_inode); - dvnode = AFS_FS_I(dir); - - _enter("{%x:%u},{%x:%u},{%s}", + _enter("{%llx:%llu},{%llx:%llu},{%pd}", vnode->fid.vid, vnode->fid.vnode, dvnode->fid.vid, dvnode->fid.vnode, - dentry->d_name.name); + dentry); - ret = -ENAMETOOLONG; if (dentry->d_name.len >= AFSNAMEMAX) goto error; - key = afs_request_key(dvnode->volume->cell); - if (IS_ERR(key)) { - ret = PTR_ERR(key); + op = afs_alloc_operation(NULL, dvnode->volume); + if (IS_ERR(op)) { + ret = PTR_ERR(op); goto error; } - ret = afs_vnode_link(dvnode, vnode, key, dentry->d_name.name); - if (ret < 0) - goto link_error; + fscache_use_cookie(afs_vnode_cache(dvnode), true); - ihold(&vnode->vfs_inode); - d_instantiate(dentry, &vnode->vfs_inode); - key_put(key); - _leave(" = 0"); - return 0; + ret = afs_validate(vnode, op->key); + if (ret < 0) + goto error_op; + + afs_op_set_vnode(op, 0, dvnode); + afs_op_set_vnode(op, 1, vnode); + op->file[0].dv_delta = 1; + op->file[0].modification = true; + op->file[0].update_ctime = true; + op->file[1].update_ctime = true; + + op->dentry = dentry; + op->dentry_2 = from; + op->ops = &afs_link_operation; + op->create.reason = afs_edit_dir_for_link; + ret = afs_do_sync_operation(op); + afs_dir_unuse_cookie(dvnode, ret); + return ret; -link_error: - key_put(key); +error_op: + afs_put_operation(op); + afs_dir_unuse_cookie(dvnode, ret); error: d_drop(dentry); _leave(" = %d", ret); return ret; } +static const struct afs_operation_ops afs_symlink_operation = { + .issue_afs_rpc = afs_fs_symlink, + .issue_yfs_rpc = yfs_fs_symlink, + .success = afs_create_success, + .aborted = afs_check_for_remote_deletion, + .edit_dir = afs_create_edit_dir, + .put = afs_create_put, +}; + /* * create a symlink in an AFS filesystem */ -static int afs_symlink(struct inode *dir, struct dentry *dentry, - const char *content) +static int afs_symlink(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, const char *content) { - struct afs_file_status status; - struct afs_server *server; - struct afs_vnode *dvnode, *vnode; - struct afs_fid fid; - struct inode *inode; - struct key *key; + struct afs_operation *op; + struct afs_vnode *dvnode = AFS_FS_I(dir); int ret; - dvnode = AFS_FS_I(dir); - - _enter("{%x:%u},{%s},%s", - dvnode->fid.vid, dvnode->fid.vnode, dentry->d_name.name, + _enter("{%llx:%llu},{%pd},%s", + dvnode->fid.vid, dvnode->fid.vnode, dentry, content); ret = -ENAMETOOLONG; @@ -1065,95 +1795,428 @@ static int afs_symlink(struct inode *dir, struct dentry *dentry, if (strlen(content) >= AFSPATHMAX) goto error; - key = afs_request_key(dvnode->volume->cell); - if (IS_ERR(key)) { - ret = PTR_ERR(key); + op = afs_alloc_operation(NULL, dvnode->volume); + if (IS_ERR(op)) { + ret = PTR_ERR(op); goto error; } - ret = afs_vnode_symlink(dvnode, key, dentry->d_name.name, content, - &fid, &status, &server); - if (ret < 0) - goto create_error; + fscache_use_cookie(afs_vnode_cache(dvnode), true); - inode = afs_iget(dir->i_sb, key, &fid, &status, NULL); - if (IS_ERR(inode)) { - /* ENOMEM at a really inconvenient time - just abandon the new - * directory on the server */ - ret = PTR_ERR(inode); - goto iget_error; - } + afs_op_set_vnode(op, 0, dvnode); + op->file[0].dv_delta = 1; - /* apply the status report we've got for the new vnode */ - vnode = AFS_FS_I(inode); - spin_lock(&vnode->lock); - vnode->update_cnt++; - spin_unlock(&vnode->lock); - afs_vnode_finalise_status_update(vnode, server); - afs_put_server(server); - - d_instantiate(dentry, inode); - if (d_unhashed(dentry)) { - _debug("not hashed"); - d_rehash(dentry); - } - key_put(key); - _leave(" = 0"); - return 0; + op->dentry = dentry; + op->ops = &afs_symlink_operation; + op->create.reason = afs_edit_dir_for_symlink; + op->create.symlink = content; + op->mtime = current_time(dir); + ret = afs_do_sync_operation(op); + afs_dir_unuse_cookie(dvnode, ret); + return ret; -iget_error: - afs_put_server(server); -create_error: - key_put(key); error: d_drop(dentry); _leave(" = %d", ret); return ret; } +static void afs_rename_success(struct afs_operation *op) +{ + struct afs_vnode *vnode = op->more_files[0].vnode; + struct afs_vnode *new_vnode = op->more_files[1].vnode; + + _enter("op=%08x", op->debug_id); + + op->ctime = op->file[0].scb.status.mtime_client; + afs_check_dir_conflict(op, &op->file[1]); + afs_vnode_commit_status(op, &op->file[0]); + if (op->file[1].vnode != op->file[0].vnode) { + op->ctime = op->file[1].scb.status.mtime_client; + afs_vnode_commit_status(op, &op->file[1]); + } + if (op->more_files[0].scb.have_status) + afs_vnode_commit_status(op, &op->more_files[0]); + if (op->more_files[1].scb.have_status) + afs_vnode_commit_status(op, &op->more_files[1]); + + /* If we're moving a subdir between dirs, we need to update + * its DV counter too as the ".." will be altered. + */ + if (op->file[0].vnode != op->file[1].vnode) { + if (S_ISDIR(vnode->netfs.inode.i_mode)) { + u64 new_dv; + + write_seqlock(&vnode->cb_lock); + + new_dv = vnode->status.data_version + 1; + trace_afs_set_dv(vnode, new_dv); + vnode->status.data_version = new_dv; + inode_set_iversion_raw(&vnode->netfs.inode, new_dv); + + write_sequnlock(&vnode->cb_lock); + } + + if ((op->rename.rename_flags & RENAME_EXCHANGE) && + S_ISDIR(new_vnode->netfs.inode.i_mode)) { + u64 new_dv; + + write_seqlock(&new_vnode->cb_lock); + + new_dv = new_vnode->status.data_version + 1; + new_vnode->status.data_version = new_dv; + inode_set_iversion_raw(&new_vnode->netfs.inode, new_dv); + + write_sequnlock(&new_vnode->cb_lock); + } + } +} + +static void afs_rename_edit_dir(struct afs_operation *op) +{ + struct netfs_cache_resources orig_cres = {}, new_cres = {}; + struct afs_vnode_param *orig_dvp = &op->file[0]; + struct afs_vnode_param *new_dvp = &op->file[1]; + struct afs_vnode *orig_dvnode = orig_dvp->vnode; + struct afs_vnode *new_dvnode = new_dvp->vnode; + struct afs_vnode *vnode = AFS_FS_I(d_inode(op->dentry)); + struct dentry *old_dentry = op->dentry; + struct dentry *new_dentry = op->dentry_2; + struct inode *new_inode; + + _enter("op=%08x", op->debug_id); + + if (op->rename.rehash) { + d_rehash(op->rename.rehash); + op->rename.rehash = NULL; + } + + fscache_begin_write_operation(&orig_cres, afs_vnode_cache(orig_dvnode)); + if (new_dvnode != orig_dvnode) + fscache_begin_write_operation(&new_cres, afs_vnode_cache(new_dvnode)); + + down_write(&orig_dvnode->validate_lock); + if (test_bit(AFS_VNODE_DIR_VALID, &orig_dvnode->flags) && + orig_dvnode->status.data_version == orig_dvp->dv_before + orig_dvp->dv_delta) + afs_edit_dir_remove(orig_dvnode, &old_dentry->d_name, + afs_edit_dir_for_rename_0); + + if (new_dvnode != orig_dvnode) { + up_write(&orig_dvnode->validate_lock); + down_write(&new_dvnode->validate_lock); + } + + if (test_bit(AFS_VNODE_DIR_VALID, &new_dvnode->flags) && + new_dvnode->status.data_version == new_dvp->dv_before + new_dvp->dv_delta) { + if (!op->rename.new_negative) + afs_edit_dir_remove(new_dvnode, &new_dentry->d_name, + afs_edit_dir_for_rename_1); + + afs_edit_dir_add(new_dvnode, &new_dentry->d_name, + &vnode->fid, afs_edit_dir_for_rename_2); + } + + if (S_ISDIR(vnode->netfs.inode.i_mode) && + new_dvnode != orig_dvnode && + test_bit(AFS_VNODE_DIR_VALID, &vnode->flags)) + afs_edit_dir_update(vnode, &dotdot_name, new_dvnode, + afs_edit_dir_for_rename_sub); + + new_inode = d_inode(new_dentry); + if (new_inode) { + spin_lock(&new_inode->i_lock); + if (S_ISDIR(new_inode->i_mode)) + clear_nlink(new_inode); + else if (new_inode->i_nlink > 0) + drop_nlink(new_inode); + spin_unlock(&new_inode->i_lock); + } + + /* Now we can update d_fsdata on the dentries to reflect their + * new parent's data_version. + */ + afs_update_dentry_version(op, new_dvp, op->dentry); + afs_update_dentry_version(op, new_dvp, op->dentry_2); + + d_move(old_dentry, new_dentry); + + up_write(&new_dvnode->validate_lock); + fscache_end_operation(&orig_cres); + if (new_dvnode != orig_dvnode) + fscache_end_operation(&new_cres); +} + +static void afs_rename_exchange_edit_dir(struct afs_operation *op) +{ + struct afs_vnode_param *orig_dvp = &op->file[0]; + struct afs_vnode_param *new_dvp = &op->file[1]; + struct afs_vnode *orig_dvnode = orig_dvp->vnode; + struct afs_vnode *new_dvnode = new_dvp->vnode; + struct afs_vnode *old_vnode = op->more_files[0].vnode; + struct afs_vnode *new_vnode = op->more_files[1].vnode; + struct dentry *old_dentry = op->dentry; + struct dentry *new_dentry = op->dentry_2; + + _enter("op=%08x", op->debug_id); + + if (new_dvnode == orig_dvnode) { + down_write(&orig_dvnode->validate_lock); + if (test_bit(AFS_VNODE_DIR_VALID, &orig_dvnode->flags) && + orig_dvnode->status.data_version == orig_dvp->dv_before + orig_dvp->dv_delta) { + afs_edit_dir_update(orig_dvnode, &old_dentry->d_name, + new_vnode, afs_edit_dir_for_rename_0); + afs_edit_dir_update(orig_dvnode, &new_dentry->d_name, + old_vnode, afs_edit_dir_for_rename_1); + } + + d_exchange(old_dentry, new_dentry); + up_write(&orig_dvnode->validate_lock); + } else { + down_write(&orig_dvnode->validate_lock); + if (test_bit(AFS_VNODE_DIR_VALID, &orig_dvnode->flags) && + orig_dvnode->status.data_version == orig_dvp->dv_before + orig_dvp->dv_delta) + afs_edit_dir_update(orig_dvnode, &old_dentry->d_name, + new_vnode, afs_edit_dir_for_rename_0); + + up_write(&orig_dvnode->validate_lock); + down_write(&new_dvnode->validate_lock); + + if (test_bit(AFS_VNODE_DIR_VALID, &new_dvnode->flags) && + new_dvnode->status.data_version == new_dvp->dv_before + new_dvp->dv_delta) + afs_edit_dir_update(new_dvnode, &new_dentry->d_name, + old_vnode, afs_edit_dir_for_rename_1); + + if (S_ISDIR(old_vnode->netfs.inode.i_mode) && + test_bit(AFS_VNODE_DIR_VALID, &old_vnode->flags)) + afs_edit_dir_update(old_vnode, &dotdot_name, new_dvnode, + afs_edit_dir_for_rename_sub); + + if (S_ISDIR(new_vnode->netfs.inode.i_mode) && + test_bit(AFS_VNODE_DIR_VALID, &new_vnode->flags)) + afs_edit_dir_update(new_vnode, &dotdot_name, orig_dvnode, + afs_edit_dir_for_rename_sub); + + /* Now we can update d_fsdata on the dentries to reflect their + * new parents' data_version. + */ + afs_update_dentry_version(op, new_dvp, old_dentry); + afs_update_dentry_version(op, orig_dvp, new_dentry); + + d_exchange(old_dentry, new_dentry); + up_write(&new_dvnode->validate_lock); + } +} + +static void afs_rename_put(struct afs_operation *op) +{ + _enter("op=%08x", op->debug_id); + if (op->rename.rehash) + d_rehash(op->rename.rehash); + dput(op->rename.tmp); + if (afs_op_error(op)) + d_rehash(op->dentry); +} + +static const struct afs_operation_ops afs_rename_operation = { + .issue_afs_rpc = afs_fs_rename, + .issue_yfs_rpc = yfs_fs_rename, + .success = afs_rename_success, + .edit_dir = afs_rename_edit_dir, + .put = afs_rename_put, +}; + +#if 0 /* Autoswitched in yfs_fs_rename_replace(). */ +static const struct afs_operation_ops afs_rename_replace_operation = { + .issue_afs_rpc = NULL, + .issue_yfs_rpc = yfs_fs_rename_replace, + .success = afs_rename_success, + .edit_dir = afs_rename_edit_dir, + .put = afs_rename_put, +}; +#endif + +static const struct afs_operation_ops afs_rename_noreplace_operation = { + .issue_afs_rpc = NULL, + .issue_yfs_rpc = yfs_fs_rename_noreplace, + .success = afs_rename_success, + .edit_dir = afs_rename_edit_dir, + .put = afs_rename_put, +}; + +static const struct afs_operation_ops afs_rename_exchange_operation = { + .issue_afs_rpc = NULL, + .issue_yfs_rpc = yfs_fs_rename_exchange, + .success = afs_rename_success, + .edit_dir = afs_rename_exchange_edit_dir, + .put = afs_rename_put, +}; + /* * rename a file in an AFS filesystem and/or move it between directories */ -static int afs_rename(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry) +static int afs_rename(struct mnt_idmap *idmap, struct inode *old_dir, + struct dentry *old_dentry, struct inode *new_dir, + struct dentry *new_dentry, unsigned int flags) { - struct afs_vnode *orig_dvnode, *new_dvnode, *vnode; - struct key *key; + struct afs_operation *op; + struct afs_vnode *orig_dvnode, *new_dvnode, *vnode, *new_vnode = NULL; int ret; - vnode = AFS_FS_I(old_dentry->d_inode); + if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE)) + return -EINVAL; + + /* Don't allow silly-rename files be moved around. */ + if (old_dentry->d_flags & DCACHE_NFSFS_RENAMED) + return -EINVAL; + + vnode = AFS_FS_I(d_inode(old_dentry)); orig_dvnode = AFS_FS_I(old_dir); new_dvnode = AFS_FS_I(new_dir); + if (d_is_positive(new_dentry)) + new_vnode = AFS_FS_I(d_inode(new_dentry)); - _enter("{%x:%u},{%x:%u},{%x:%u},{%s}", + _enter("{%llx:%llu},{%llx:%llu},{%llx:%llu},{%pd}", orig_dvnode->fid.vid, orig_dvnode->fid.vnode, vnode->fid.vid, vnode->fid.vnode, new_dvnode->fid.vid, new_dvnode->fid.vnode, - new_dentry->d_name.name); + new_dentry); - ret = -ENAMETOOLONG; - if (new_dentry->d_name.len >= AFSNAMEMAX) + op = afs_alloc_operation(NULL, orig_dvnode->volume); + if (IS_ERR(op)) + return PTR_ERR(op); + + fscache_use_cookie(afs_vnode_cache(orig_dvnode), true); + if (new_dvnode != orig_dvnode) + fscache_use_cookie(afs_vnode_cache(new_dvnode), true); + + ret = afs_validate(vnode, op->key); + afs_op_set_error(op, ret); + if (ret < 0) goto error; - key = afs_request_key(orig_dvnode->volume->cell); - if (IS_ERR(key)) { - ret = PTR_ERR(key); + ret = -ENOMEM; + op->more_files = kvcalloc(2, sizeof(struct afs_vnode_param), GFP_KERNEL); + if (!op->more_files) goto error; + + afs_op_set_vnode(op, 0, orig_dvnode); + afs_op_set_vnode(op, 1, new_dvnode); /* May be same as orig_dvnode */ + op->file[0].dv_delta = 1; + op->file[1].dv_delta = 1; + op->file[0].modification = true; + op->file[1].modification = true; + op->file[0].update_ctime = true; + op->file[1].update_ctime = true; + op->more_files[0].vnode = vnode; + op->more_files[0].speculative = true; + op->more_files[1].vnode = new_vnode; + op->more_files[1].speculative = true; + op->nr_files = 4; + + op->dentry = old_dentry; + op->dentry_2 = new_dentry; + op->rename.rename_flags = flags; + op->rename.new_negative = d_is_negative(new_dentry); + + if (flags & RENAME_NOREPLACE) { + op->ops = &afs_rename_noreplace_operation; + } else if (flags & RENAME_EXCHANGE) { + op->ops = &afs_rename_exchange_operation; + d_drop(new_dentry); + } else { + /* If we might displace the target, we might need to do silly + * rename. + */ + op->ops = &afs_rename_operation; + + /* For non-directories, check whether the target is busy and if + * so, make a copy of the dentry and then do a silly-rename. + * If the silly-rename succeeds, the copied dentry is hashed + * and becomes the new target. + */ + if (d_is_positive(new_dentry) && !d_is_dir(new_dentry)) { + /* To prevent any new references to the target during + * the rename, we unhash the dentry in advance. + */ + if (!d_unhashed(new_dentry)) { + d_drop(new_dentry); + op->rename.rehash = new_dentry; + } + + if (d_count(new_dentry) > 2) { + /* copy the target dentry's name */ + op->rename.tmp = d_alloc(new_dentry->d_parent, + &new_dentry->d_name); + if (!op->rename.tmp) { + afs_op_nomem(op); + goto error; + } + + ret = afs_sillyrename(new_dvnode, + AFS_FS_I(d_inode(new_dentry)), + new_dentry, op->key); + if (ret) { + afs_op_set_error(op, ret); + goto error; + } + + op->dentry_2 = op->rename.tmp; + op->rename.rehash = NULL; + op->rename.new_negative = true; + } + } } - ret = afs_vnode_rename(orig_dvnode, new_dvnode, key, - old_dentry->d_name.name, - new_dentry->d_name.name); - if (ret < 0) - goto rename_error; - key_put(key); - _leave(" = 0"); - return 0; + /* This bit is potentially nasty as there's a potential race with + * afs_d_revalidate{,_rcu}(). We have to change d_fsdata on the dentry + * to reflect it's new parent's new data_version after the op, but + * d_revalidate may see old_dentry between the op having taken place + * and the version being updated. + * + * So drop the old_dentry for now to make other threads go through + * lookup instead - which we hold a lock against. + */ + d_drop(old_dentry); + + ret = afs_do_sync_operation(op); + if (ret == -ENOTSUPP) + ret = -EINVAL; +out: + afs_dir_unuse_cookie(orig_dvnode, ret); + if (new_dvnode != orig_dvnode) + afs_dir_unuse_cookie(new_dvnode, ret); + return ret; -rename_error: - key_put(key); error: - d_drop(new_dentry); - _leave(" = %d", ret); + ret = afs_put_operation(op); + goto out; +} + +/* + * Write the file contents to the cache as a single blob. + */ +int afs_single_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + struct afs_vnode *dvnode = AFS_FS_I(mapping->host); + struct iov_iter iter; + bool is_dir = (S_ISDIR(dvnode->netfs.inode.i_mode) && + !test_bit(AFS_VNODE_MOUNTPOINT, &dvnode->flags)); + int ret = 0; + + /* Need to lock to prevent the folio queue and folios from being thrown + * away. + */ + down_read(&dvnode->validate_lock); + + if (is_dir ? + test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags) : + atomic64_read(&dvnode->cb_expires_at) != AFS_NO_CB_PROMISE) { + iov_iter_folio_queue(&iter, ITER_SOURCE, dvnode->directory, 0, 0, + i_size_read(&dvnode->netfs.inode)); + ret = netfs_writeback_single(mapping, wbc, &iter); + } + + up_read(&dvnode->validate_lock); return ret; } diff --git a/fs/afs/dir_edit.c b/fs/afs/dir_edit.c new file mode 100644 index 000000000000..fd3aa9f97ce6 --- /dev/null +++ b/fs/afs/dir_edit.c @@ -0,0 +1,648 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* AFS filesystem directory editing + * + * Copyright (C) 2018 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#include <linux/kernel.h> +#include <linux/fs.h> +#include <linux/namei.h> +#include <linux/pagemap.h> +#include <linux/iversion.h> +#include <linux/folio_queue.h> +#include "internal.h" +#include "xdr_fs.h" + +/* + * Find a number of contiguous clear bits in a directory block bitmask. + * + * There are 64 slots, which means we can load the entire bitmap into a + * variable. The first bit doesn't count as it corresponds to the block header + * slot. nr_slots is between 1 and 9. + */ +static int afs_find_contig_bits(union afs_xdr_dir_block *block, unsigned int nr_slots) +{ + u64 bitmap; + u32 mask; + int bit, n; + + bitmap = (u64)block->hdr.bitmap[0] << 0 * 8; + bitmap |= (u64)block->hdr.bitmap[1] << 1 * 8; + bitmap |= (u64)block->hdr.bitmap[2] << 2 * 8; + bitmap |= (u64)block->hdr.bitmap[3] << 3 * 8; + bitmap |= (u64)block->hdr.bitmap[4] << 4 * 8; + bitmap |= (u64)block->hdr.bitmap[5] << 5 * 8; + bitmap |= (u64)block->hdr.bitmap[6] << 6 * 8; + bitmap |= (u64)block->hdr.bitmap[7] << 7 * 8; + bitmap >>= 1; /* The first entry is metadata */ + bit = 1; + mask = (1 << nr_slots) - 1; + + do { + if (sizeof(unsigned long) == 8) + n = ffz(bitmap); + else + n = ((u32)bitmap) != 0 ? + ffz((u32)bitmap) : + ffz((u32)(bitmap >> 32)) + 32; + bitmap >>= n; + bit += n; + + if ((bitmap & mask) == 0) { + if (bit > 64 - nr_slots) + return -1; + return bit; + } + + n = __ffs(bitmap); + bitmap >>= n; + bit += n; + } while (bitmap); + + return -1; +} + +/* + * Set a number of contiguous bits in the directory block bitmap. + */ +static void afs_set_contig_bits(union afs_xdr_dir_block *block, + int bit, unsigned int nr_slots) +{ + u64 mask; + + mask = (1 << nr_slots) - 1; + mask <<= bit; + + block->hdr.bitmap[0] |= (u8)(mask >> 0 * 8); + block->hdr.bitmap[1] |= (u8)(mask >> 1 * 8); + block->hdr.bitmap[2] |= (u8)(mask >> 2 * 8); + block->hdr.bitmap[3] |= (u8)(mask >> 3 * 8); + block->hdr.bitmap[4] |= (u8)(mask >> 4 * 8); + block->hdr.bitmap[5] |= (u8)(mask >> 5 * 8); + block->hdr.bitmap[6] |= (u8)(mask >> 6 * 8); + block->hdr.bitmap[7] |= (u8)(mask >> 7 * 8); +} + +/* + * Clear a number of contiguous bits in the directory block bitmap. + */ +static void afs_clear_contig_bits(union afs_xdr_dir_block *block, + int bit, unsigned int nr_slots) +{ + u64 mask; + + mask = (1 << nr_slots) - 1; + mask <<= bit; + + block->hdr.bitmap[0] &= ~(u8)(mask >> 0 * 8); + block->hdr.bitmap[1] &= ~(u8)(mask >> 1 * 8); + block->hdr.bitmap[2] &= ~(u8)(mask >> 2 * 8); + block->hdr.bitmap[3] &= ~(u8)(mask >> 3 * 8); + block->hdr.bitmap[4] &= ~(u8)(mask >> 4 * 8); + block->hdr.bitmap[5] &= ~(u8)(mask >> 5 * 8); + block->hdr.bitmap[6] &= ~(u8)(mask >> 6 * 8); + block->hdr.bitmap[7] &= ~(u8)(mask >> 7 * 8); +} + +/* + * Get a specific block, extending the directory storage to cover it as needed. + */ +static union afs_xdr_dir_block *afs_dir_get_block(struct afs_dir_iter *iter, size_t block) +{ + struct folio_queue *fq; + struct afs_vnode *dvnode = iter->dvnode; + struct folio *folio; + size_t blpos = block * AFS_DIR_BLOCK_SIZE; + size_t blend = (block + 1) * AFS_DIR_BLOCK_SIZE, fpos = iter->fpos; + int ret; + + if (dvnode->directory_size < blend) { + size_t cur_size = dvnode->directory_size; + + ret = netfs_alloc_folioq_buffer( + NULL, &dvnode->directory, &cur_size, blend, + mapping_gfp_mask(dvnode->netfs.inode.i_mapping)); + dvnode->directory_size = cur_size; + if (ret < 0) + goto fail; + } + + fq = iter->fq; + if (!fq) + fq = dvnode->directory; + + /* Search the folio queue for the folio containing the block... */ + for (; fq; fq = fq->next) { + for (int s = iter->fq_slot; s < folioq_count(fq); s++) { + size_t fsize = folioq_folio_size(fq, s); + + if (blend <= fpos + fsize) { + /* ... and then return the mapped block. */ + folio = folioq_folio(fq, s); + if (WARN_ON_ONCE(folio_pos(folio) != fpos)) + goto fail; + iter->fq = fq; + iter->fq_slot = s; + iter->fpos = fpos; + return kmap_local_folio(folio, blpos - fpos); + } + fpos += fsize; + } + iter->fq_slot = 0; + } + +fail: + iter->fq = NULL; + iter->fq_slot = 0; + afs_invalidate_dir(dvnode, afs_dir_invalid_edit_get_block); + return NULL; +} + +/* + * Scan a directory block looking for a dirent of the right name. + */ +static int afs_dir_scan_block(const union afs_xdr_dir_block *block, const struct qstr *name, + unsigned int blocknum) +{ + const union afs_xdr_dirent *de; + u64 bitmap; + int d, len, n; + + _enter(""); + + bitmap = (u64)block->hdr.bitmap[0] << 0 * 8; + bitmap |= (u64)block->hdr.bitmap[1] << 1 * 8; + bitmap |= (u64)block->hdr.bitmap[2] << 2 * 8; + bitmap |= (u64)block->hdr.bitmap[3] << 3 * 8; + bitmap |= (u64)block->hdr.bitmap[4] << 4 * 8; + bitmap |= (u64)block->hdr.bitmap[5] << 5 * 8; + bitmap |= (u64)block->hdr.bitmap[6] << 6 * 8; + bitmap |= (u64)block->hdr.bitmap[7] << 7 * 8; + + for (d = (blocknum == 0 ? AFS_DIR_RESV_BLOCKS0 : AFS_DIR_RESV_BLOCKS); + d < AFS_DIR_SLOTS_PER_BLOCK; + d++) { + if (!((bitmap >> d) & 1)) + continue; + de = &block->dirents[d]; + if (de->u.valid != 1) + continue; + + /* The block was NUL-terminated by afs_dir_check_page(). */ + len = strlen(de->u.name); + if (len == name->len && + memcmp(de->u.name, name->name, name->len) == 0) + return d; + + n = round_up(12 + len + 1 + 4, AFS_DIR_DIRENT_SIZE); + n /= AFS_DIR_DIRENT_SIZE; + d += n - 1; + } + + return -1; +} + +/* + * Initialise a new directory block. Note that block 0 is special and contains + * some extra metadata. + */ +static void afs_edit_init_block(union afs_xdr_dir_block *meta, + union afs_xdr_dir_block *block, int block_num) +{ + memset(block, 0, sizeof(*block)); + block->hdr.npages = htons(1); + block->hdr.magic = AFS_DIR_MAGIC; + block->hdr.bitmap[0] = 1; + + if (block_num == 0) { + block->hdr.bitmap[0] = 0xff; + block->hdr.bitmap[1] = 0x1f; + memset(block->meta.alloc_ctrs, + AFS_DIR_SLOTS_PER_BLOCK, + sizeof(block->meta.alloc_ctrs)); + meta->meta.alloc_ctrs[0] = + AFS_DIR_SLOTS_PER_BLOCK - AFS_DIR_RESV_BLOCKS0; + } + + if (block_num < AFS_DIR_BLOCKS_WITH_CTR) + meta->meta.alloc_ctrs[block_num] = + AFS_DIR_SLOTS_PER_BLOCK - AFS_DIR_RESV_BLOCKS; +} + +/* + * Edit a directory's file data to add a new directory entry. Doing this after + * create, mkdir, symlink, link or rename if the data version number is + * incremented by exactly one avoids the need to re-download the entire + * directory contents. + * + * The caller must hold the inode locked. + */ +void afs_edit_dir_add(struct afs_vnode *vnode, + const struct qstr *name, struct afs_fid *new_fid, + enum afs_edit_dir_reason why) +{ + union afs_xdr_dir_block *meta, *block; + union afs_xdr_dirent *de; + struct afs_dir_iter iter = { .dvnode = vnode }; + unsigned int nr_blocks, b, entry; + loff_t i_size; + int slot; + + _enter(",,{%d,%s},", name->len, name->name); + + i_size = i_size_read(&vnode->netfs.inode); + if (i_size > AFS_DIR_BLOCK_SIZE * AFS_DIR_MAX_BLOCKS || + (i_size & (AFS_DIR_BLOCK_SIZE - 1))) { + afs_invalidate_dir(vnode, afs_dir_invalid_edit_add_bad_size); + return; + } + + meta = afs_dir_get_block(&iter, 0); + if (!meta) + return; + + /* Work out how many slots we're going to need. */ + iter.nr_slots = afs_dir_calc_slots(name->len); + + if (i_size == 0) + goto new_directory; + nr_blocks = i_size / AFS_DIR_BLOCK_SIZE; + + /* Find a block that has sufficient slots available. Each folio + * contains two or more directory blocks. + */ + for (b = 0; b < nr_blocks + 1; b++) { + /* If the directory extended into a new folio, then we need to + * tack a new folio on the end. + */ + if (nr_blocks >= AFS_DIR_MAX_BLOCKS) + goto error_too_many_blocks; + + /* Lower dir blocks have a counter in the header we can check. */ + if (b < AFS_DIR_BLOCKS_WITH_CTR && + meta->meta.alloc_ctrs[b] < iter.nr_slots) + continue; + + block = afs_dir_get_block(&iter, b); + if (!block) + goto error; + + /* Abandon the edit if we got a callback break. */ + if (!test_bit(AFS_VNODE_DIR_VALID, &vnode->flags)) + goto already_invalidated; + + _debug("block %u: %2u %3u %u", + b, + (b < AFS_DIR_BLOCKS_WITH_CTR) ? meta->meta.alloc_ctrs[b] : 99, + ntohs(block->hdr.npages), + ntohs(block->hdr.magic)); + + /* Initialise the block if necessary. */ + if (b == nr_blocks) { + _debug("init %u", b); + afs_edit_init_block(meta, block, b); + afs_set_i_size(vnode, (b + 1) * AFS_DIR_BLOCK_SIZE); + } + + /* We need to try and find one or more consecutive slots to + * hold the entry. + */ + slot = afs_find_contig_bits(block, iter.nr_slots); + if (slot >= 0) { + _debug("slot %u", slot); + goto found_space; + } + + kunmap_local(block); + } + + /* There are no spare slots of sufficient size, yet the operation + * succeeded. Download the directory again. + */ + trace_afs_edit_dir(vnode, why, afs_edit_dir_create_nospc, 0, 0, 0, 0, name->name); + afs_invalidate_dir(vnode, afs_dir_invalid_edit_add_no_slots); + goto out_unmap; + +new_directory: + afs_edit_init_block(meta, meta, 0); + i_size = AFS_DIR_BLOCK_SIZE; + afs_set_i_size(vnode, i_size); + slot = AFS_DIR_RESV_BLOCKS0; + block = afs_dir_get_block(&iter, 0); + nr_blocks = 1; + b = 0; + +found_space: + /* Set the dirent slot. */ + trace_afs_edit_dir(vnode, why, afs_edit_dir_create, b, slot, + new_fid->vnode, new_fid->unique, name->name); + de = &block->dirents[slot]; + de->u.valid = 1; + de->u.unused[0] = 0; + de->u.hash_next = 0; // TODO: Really need to maintain this + de->u.vnode = htonl(new_fid->vnode); + de->u.unique = htonl(new_fid->unique); + memcpy(de->u.name, name->name, name->len + 1); + de->u.name[name->len] = 0; + + /* Adjust the bitmap. */ + afs_set_contig_bits(block, slot, iter.nr_slots); + + /* Adjust the allocation counter. */ + if (b < AFS_DIR_BLOCKS_WITH_CTR) + meta->meta.alloc_ctrs[b] -= iter.nr_slots; + + /* Adjust the hash chain. */ + entry = b * AFS_DIR_SLOTS_PER_BLOCK + slot; + iter.bucket = afs_dir_hash_name(name); + de->u.hash_next = meta->meta.hashtable[iter.bucket]; + meta->meta.hashtable[iter.bucket] = htons(entry); + kunmap_local(block); + + inode_inc_iversion_raw(&vnode->netfs.inode); + afs_stat_v(vnode, n_dir_cr); + _debug("Insert %s in %u[%u]", name->name, b, slot); + + netfs_single_mark_inode_dirty(&vnode->netfs.inode); + +out_unmap: + kunmap_local(meta); + _leave(""); + return; + +already_invalidated: + trace_afs_edit_dir(vnode, why, afs_edit_dir_create_inval, 0, 0, 0, 0, name->name); + kunmap_local(block); + goto out_unmap; + +error_too_many_blocks: + afs_invalidate_dir(vnode, afs_dir_invalid_edit_add_too_many_blocks); +error: + trace_afs_edit_dir(vnode, why, afs_edit_dir_create_error, 0, 0, 0, 0, name->name); + goto out_unmap; +} + +/* + * Edit a directory's file data to remove a new directory entry. Doing this + * after unlink, rmdir or rename if the data version number is incremented by + * exactly one avoids the need to re-download the entire directory contents. + * + * The caller must hold the inode locked. + */ +void afs_edit_dir_remove(struct afs_vnode *vnode, + const struct qstr *name, enum afs_edit_dir_reason why) +{ + union afs_xdr_dir_block *meta, *block, *pblock; + union afs_xdr_dirent *de, *pde; + struct afs_dir_iter iter = { .dvnode = vnode }; + struct afs_fid fid; + unsigned int b, slot, entry; + loff_t i_size; + __be16 next; + int found; + + _enter(",,{%d,%s},", name->len, name->name); + + i_size = i_size_read(&vnode->netfs.inode); + if (i_size < AFS_DIR_BLOCK_SIZE || + i_size > AFS_DIR_BLOCK_SIZE * AFS_DIR_MAX_BLOCKS || + (i_size & (AFS_DIR_BLOCK_SIZE - 1))) { + afs_invalidate_dir(vnode, afs_dir_invalid_edit_rem_bad_size); + return; + } + + if (!afs_dir_init_iter(&iter, name)) + return; + + meta = afs_dir_find_block(&iter, 0); + if (!meta) + return; + + /* Find the entry in the blob. */ + found = afs_dir_search_bucket(&iter, name, &fid); + if (found < 0) { + /* Didn't find the dirent to clobber. Re-download. */ + trace_afs_edit_dir(vnode, why, afs_edit_dir_delete_noent, + 0, 0, 0, 0, name->name); + afs_invalidate_dir(vnode, afs_dir_invalid_edit_rem_wrong_name); + goto out_unmap; + } + + entry = found; + b = entry / AFS_DIR_SLOTS_PER_BLOCK; + slot = entry % AFS_DIR_SLOTS_PER_BLOCK; + + block = afs_dir_find_block(&iter, b); + if (!block) + goto error; + if (!test_bit(AFS_VNODE_DIR_VALID, &vnode->flags)) + goto already_invalidated; + + /* Check and clear the entry. */ + de = &block->dirents[slot]; + if (de->u.valid != 1) + goto error_unmap; + + trace_afs_edit_dir(vnode, why, afs_edit_dir_delete, b, slot, + ntohl(de->u.vnode), ntohl(de->u.unique), + name->name); + + /* Adjust the bitmap. */ + afs_clear_contig_bits(block, slot, iter.nr_slots); + + /* Adjust the allocation counter. */ + if (b < AFS_DIR_BLOCKS_WITH_CTR) + meta->meta.alloc_ctrs[b] += iter.nr_slots; + + /* Clear the constituent entries. */ + next = de->u.hash_next; + memset(de, 0, sizeof(*de) * iter.nr_slots); + kunmap_local(block); + + /* Adjust the hash chain: if iter->prev_entry is 0, the hashtable head + * index is previous; otherwise it's slot number of the previous entry. + */ + if (!iter.prev_entry) { + __be16 prev_next = meta->meta.hashtable[iter.bucket]; + + if (unlikely(prev_next != htons(entry))) { + pr_warn("%llx:%llx:%x: not head of chain b=%x p=%x,%x e=%x %*s", + vnode->fid.vid, vnode->fid.vnode, vnode->fid.unique, + iter.bucket, iter.prev_entry, prev_next, entry, + name->len, name->name); + goto error; + } + meta->meta.hashtable[iter.bucket] = next; + } else { + unsigned int pb = iter.prev_entry / AFS_DIR_SLOTS_PER_BLOCK; + unsigned int ps = iter.prev_entry % AFS_DIR_SLOTS_PER_BLOCK; + __be16 prev_next; + + pblock = afs_dir_find_block(&iter, pb); + if (!pblock) + goto error; + pde = &pblock->dirents[ps]; + prev_next = pde->u.hash_next; + if (prev_next != htons(entry)) { + kunmap_local(pblock); + pr_warn("%llx:%llx:%x: not prev in chain b=%x p=%x,%x e=%x %*s", + vnode->fid.vid, vnode->fid.vnode, vnode->fid.unique, + iter.bucket, iter.prev_entry, prev_next, entry, + name->len, name->name); + goto error; + } + pde->u.hash_next = next; + kunmap_local(pblock); + } + + netfs_single_mark_inode_dirty(&vnode->netfs.inode); + + inode_set_iversion_raw(&vnode->netfs.inode, vnode->status.data_version); + afs_stat_v(vnode, n_dir_rm); + _debug("Remove %s from %u[%u]", name->name, b, slot); + +out_unmap: + kunmap_local(meta); + _leave(""); + return; + +already_invalidated: + kunmap_local(block); + trace_afs_edit_dir(vnode, why, afs_edit_dir_delete_inval, + 0, 0, 0, 0, name->name); + goto out_unmap; + +error_unmap: + kunmap_local(block); +error: + trace_afs_edit_dir(vnode, why, afs_edit_dir_delete_error, + 0, 0, 0, 0, name->name); + goto out_unmap; +} + +/* + * Edit an entry in a directory to update the vnode it refers to. This is also + * used to update the ".." entry in a directory. + */ +void afs_edit_dir_update(struct afs_vnode *vnode, const struct qstr *name, + struct afs_vnode *new_dvnode, enum afs_edit_dir_reason why) +{ + union afs_xdr_dir_block *block; + union afs_xdr_dirent *de; + struct afs_dir_iter iter = { .dvnode = vnode }; + unsigned int nr_blocks, b; + loff_t i_size; + int slot; + + _enter(""); + + i_size = i_size_read(&vnode->netfs.inode); + if (i_size < AFS_DIR_BLOCK_SIZE) { + afs_invalidate_dir(vnode, afs_dir_invalid_edit_upd_bad_size); + return; + } + + nr_blocks = i_size / AFS_DIR_BLOCK_SIZE; + + /* Find a block that has sufficient slots available. Each folio + * contains two or more directory blocks. + */ + for (b = 0; b < nr_blocks; b++) { + block = afs_dir_get_block(&iter, b); + if (!block) + goto error; + + /* Abandon the edit if we got a callback break. */ + if (!test_bit(AFS_VNODE_DIR_VALID, &vnode->flags)) + goto already_invalidated; + + slot = afs_dir_scan_block(block, name, b); + if (slot >= 0) + goto found_dirent; + + kunmap_local(block); + } + + /* Didn't find the dirent to clobber. Download the directory again. */ + trace_afs_edit_dir(vnode, why, afs_edit_dir_update_nodd, + 0, 0, 0, 0, name->name); + afs_invalidate_dir(vnode, afs_dir_invalid_edit_upd_no_dd); + goto out; + +found_dirent: + de = &block->dirents[slot]; + de->u.vnode = htonl(new_dvnode->fid.vnode); + de->u.unique = htonl(new_dvnode->fid.unique); + + trace_afs_edit_dir(vnode, why, afs_edit_dir_update_dd, b, slot, + ntohl(de->u.vnode), ntohl(de->u.unique), name->name); + + kunmap_local(block); + netfs_single_mark_inode_dirty(&vnode->netfs.inode); + inode_set_iversion_raw(&vnode->netfs.inode, vnode->status.data_version); + +out: + _leave(""); + return; + +already_invalidated: + kunmap_local(block); + trace_afs_edit_dir(vnode, why, afs_edit_dir_update_inval, + 0, 0, 0, 0, name->name); + goto out; + +error: + trace_afs_edit_dir(vnode, why, afs_edit_dir_update_error, + 0, 0, 0, 0, name->name); + goto out; +} + +/* + * Initialise a new directory. We need to fill in the "." and ".." entries. + */ +void afs_mkdir_init_dir(struct afs_vnode *dvnode, struct afs_vnode *parent_dvnode) +{ + union afs_xdr_dir_block *meta; + struct afs_dir_iter iter = { .dvnode = dvnode }; + union afs_xdr_dirent *de; + unsigned int slot = AFS_DIR_RESV_BLOCKS0; + loff_t i_size; + + i_size = i_size_read(&dvnode->netfs.inode); + if (i_size != AFS_DIR_BLOCK_SIZE) { + afs_invalidate_dir(dvnode, afs_dir_invalid_edit_add_bad_size); + return; + } + + meta = afs_dir_get_block(&iter, 0); + if (!meta) + return; + + afs_edit_init_block(meta, meta, 0); + + de = &meta->dirents[slot]; + de->u.valid = 1; + de->u.vnode = htonl(dvnode->fid.vnode); + de->u.unique = htonl(dvnode->fid.unique); + memcpy(de->u.name, ".", 2); + trace_afs_edit_dir(dvnode, afs_edit_dir_for_mkdir, afs_edit_dir_mkdir, 0, slot, + dvnode->fid.vnode, dvnode->fid.unique, "."); + slot++; + + de = &meta->dirents[slot]; + de->u.valid = 1; + de->u.vnode = htonl(parent_dvnode->fid.vnode); + de->u.unique = htonl(parent_dvnode->fid.unique); + memcpy(de->u.name, "..", 3); + trace_afs_edit_dir(dvnode, afs_edit_dir_for_mkdir, afs_edit_dir_mkdir, 0, slot, + parent_dvnode->fid.vnode, parent_dvnode->fid.unique, ".."); + + afs_set_contig_bits(meta, AFS_DIR_RESV_BLOCKS0, 2); + meta->meta.alloc_ctrs[0] -= 2; + kunmap_local(meta); + + netfs_single_mark_inode_dirty(&dvnode->netfs.inode); + set_bit(AFS_VNODE_DIR_VALID, &dvnode->flags); + set_bit(AFS_VNODE_DIR_READ, &dvnode->flags); +} diff --git a/fs/afs/dir_search.c b/fs/afs/dir_search.c new file mode 100644 index 000000000000..d2516e55b5ed --- /dev/null +++ b/fs/afs/dir_search.c @@ -0,0 +1,227 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* Search a directory's hash table. + * + * Copyright (C) 2024 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * https://tools.ietf.org/html/draft-keiser-afs3-directory-object-00 + */ + +#include <linux/kernel.h> +#include <linux/fs.h> +#include <linux/namei.h> +#include <linux/iversion.h> +#include "internal.h" +#include "afs_fs.h" +#include "xdr_fs.h" + +/* + * Calculate the name hash. + */ +unsigned int afs_dir_hash_name(const struct qstr *name) +{ + const unsigned char *p = name->name; + unsigned int hash = 0, i; + int bucket; + + for (i = 0; i < name->len; i++) + hash = (hash * 173) + p[i]; + bucket = hash & (AFS_DIR_HASHTBL_SIZE - 1); + if (hash > INT_MAX) { + bucket = AFS_DIR_HASHTBL_SIZE - bucket; + bucket &= (AFS_DIR_HASHTBL_SIZE - 1); + } + return bucket; +} + +/* + * Reset a directory iterator. + */ +static bool afs_dir_reset_iter(struct afs_dir_iter *iter) +{ + unsigned long long i_size = i_size_read(&iter->dvnode->netfs.inode); + unsigned int nblocks; + + /* Work out the maximum number of steps we can take. */ + nblocks = umin(i_size / AFS_DIR_BLOCK_SIZE, AFS_DIR_MAX_BLOCKS); + if (!nblocks) + return false; + iter->loop_check = nblocks * (AFS_DIR_SLOTS_PER_BLOCK - AFS_DIR_RESV_BLOCKS); + iter->prev_entry = 0; /* Hash head is previous */ + return true; +} + +/* + * Initialise a directory iterator for looking up a name. + */ +bool afs_dir_init_iter(struct afs_dir_iter *iter, const struct qstr *name) +{ + iter->nr_slots = afs_dir_calc_slots(name->len); + iter->bucket = afs_dir_hash_name(name); + return afs_dir_reset_iter(iter); +} + +/* + * Get a specific block. + */ +union afs_xdr_dir_block *afs_dir_find_block(struct afs_dir_iter *iter, size_t block) +{ + struct folio_queue *fq = iter->fq; + struct afs_vnode *dvnode = iter->dvnode; + struct folio *folio; + size_t blpos = block * AFS_DIR_BLOCK_SIZE; + size_t blend = (block + 1) * AFS_DIR_BLOCK_SIZE, fpos = iter->fpos; + int slot = iter->fq_slot; + + _enter("%zx,%d", block, slot); + + if (iter->block) { + kunmap_local(iter->block); + iter->block = NULL; + } + + if (dvnode->directory_size < blend) + goto fail; + + if (!fq || blpos < fpos) { + fq = dvnode->directory; + slot = 0; + fpos = 0; + } + + /* Search the folio queue for the folio containing the block... */ + for (; fq; fq = fq->next) { + for (; slot < folioq_count(fq); slot++) { + size_t fsize = folioq_folio_size(fq, slot); + + if (blend <= fpos + fsize) { + /* ... and then return the mapped block. */ + folio = folioq_folio(fq, slot); + if (WARN_ON_ONCE(folio_pos(folio) != fpos)) + goto fail; + iter->fq = fq; + iter->fq_slot = slot; + iter->fpos = fpos; + iter->block = kmap_local_folio(folio, blpos - fpos); + return iter->block; + } + fpos += fsize; + } + slot = 0; + } + +fail: + iter->fq = NULL; + iter->fq_slot = 0; + afs_invalidate_dir(dvnode, afs_dir_invalid_edit_get_block); + return NULL; +} + +/* + * Search through a directory bucket. + */ +int afs_dir_search_bucket(struct afs_dir_iter *iter, const struct qstr *name, + struct afs_fid *_fid) +{ + const union afs_xdr_dir_block *meta; + unsigned int entry; + int ret = -ESTALE; + + meta = afs_dir_find_block(iter, 0); + if (!meta) + return -ESTALE; + + entry = ntohs(meta->meta.hashtable[iter->bucket & (AFS_DIR_HASHTBL_SIZE - 1)]); + _enter("%x,%x", iter->bucket, entry); + + while (entry) { + const union afs_xdr_dir_block *block; + const union afs_xdr_dirent *dire; + unsigned int blnum = entry / AFS_DIR_SLOTS_PER_BLOCK; + unsigned int slot = entry % AFS_DIR_SLOTS_PER_BLOCK; + unsigned int resv = (blnum == 0 ? AFS_DIR_RESV_BLOCKS0 : AFS_DIR_RESV_BLOCKS); + + _debug("search %x", entry); + + if (slot < resv) { + kdebug("slot out of range h=%x rs=%2x sl=%2x-%2x", + iter->bucket, resv, slot, slot + iter->nr_slots - 1); + goto bad; + } + + block = afs_dir_find_block(iter, blnum); + if (!block) + goto bad; + dire = &block->dirents[slot]; + + if (slot + iter->nr_slots <= AFS_DIR_SLOTS_PER_BLOCK && + memcmp(dire->u.name, name->name, name->len) == 0 && + dire->u.name[name->len] == '\0') { + _fid->vnode = ntohl(dire->u.vnode); + _fid->unique = ntohl(dire->u.unique); + ret = entry; + goto found; + } + + iter->prev_entry = entry; + entry = ntohs(dire->u.hash_next); + if (!--iter->loop_check) { + kdebug("dir chain loop h=%x", iter->bucket); + goto bad; + } + } + + ret = -ENOENT; +found: + if (iter->block) { + kunmap_local(iter->block); + iter->block = NULL; + } + +bad: + if (ret == -ESTALE) + afs_invalidate_dir(iter->dvnode, afs_dir_invalid_iter_stale); + _leave(" = %d", ret); + return ret; +} + +/* + * Search the appropriate hash chain in the contents of an AFS directory. + */ +int afs_dir_search(struct afs_vnode *dvnode, const struct qstr *name, + struct afs_fid *_fid, afs_dataversion_t *_dir_version) +{ + struct afs_dir_iter iter = { .dvnode = dvnode, }; + int ret, retry_limit = 3; + + _enter("{%lu},,,", dvnode->netfs.inode.i_ino); + + if (!afs_dir_init_iter(&iter, name)) + return -ENOENT; + do { + if (--retry_limit < 0) { + pr_warn("afs_read_dir(): Too many retries\n"); + ret = -ESTALE; + break; + } + ret = afs_read_dir(dvnode, NULL); + if (ret < 0) { + if (ret != -ESTALE) + break; + if (test_bit(AFS_VNODE_DELETED, &dvnode->flags)) { + ret = -ESTALE; + break; + } + continue; + } + *_dir_version = inode_peek_iversion_raw(&dvnode->netfs.inode); + + ret = afs_dir_search_bucket(&iter, name, _fid); + up_read(&dvnode->validate_lock); + if (ret == -ESTALE) + afs_dir_reset_iter(&iter); + } while (ret == -ESTALE); + + _leave(" = %d", ret); + return ret; +} diff --git a/fs/afs/dir_silly.c b/fs/afs/dir_silly.c new file mode 100644 index 000000000000..014495d4b868 --- /dev/null +++ b/fs/afs/dir_silly.c @@ -0,0 +1,291 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* AFS silly rename handling + * + * Copyright (C) 2019 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * - Derived from NFS's sillyrename. + */ + +#include <linux/kernel.h> +#include <linux/fs.h> +#include <linux/namei.h> +#include <linux/fsnotify.h> +#include "internal.h" + +static void afs_silly_rename_success(struct afs_operation *op) +{ + _enter("op=%08x", op->debug_id); + + afs_check_dir_conflict(op, &op->file[0]); + afs_vnode_commit_status(op, &op->file[0]); +} + +static void afs_silly_rename_edit_dir(struct afs_operation *op) +{ + struct afs_vnode_param *dvp = &op->file[0]; + struct afs_vnode *dvnode = dvp->vnode; + struct afs_vnode *vnode = AFS_FS_I(d_inode(op->dentry)); + struct dentry *old = op->dentry; + struct dentry *new = op->dentry_2; + + spin_lock(&old->d_lock); + old->d_flags |= DCACHE_NFSFS_RENAMED; + spin_unlock(&old->d_lock); + if (dvnode->silly_key != op->key) { + key_put(dvnode->silly_key); + dvnode->silly_key = key_get(op->key); + } + + down_write(&dvnode->validate_lock); + if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags) && + dvnode->status.data_version == dvp->dv_before + dvp->dv_delta) { + afs_edit_dir_remove(dvnode, &old->d_name, + afs_edit_dir_for_silly_0); + afs_edit_dir_add(dvnode, &new->d_name, + &vnode->fid, afs_edit_dir_for_silly_1); + } + up_write(&dvnode->validate_lock); +} + +static const struct afs_operation_ops afs_silly_rename_operation = { + .issue_afs_rpc = afs_fs_rename, + .issue_yfs_rpc = yfs_fs_rename, + .success = afs_silly_rename_success, + .edit_dir = afs_silly_rename_edit_dir, +}; + +/* + * Actually perform the silly rename step. + */ +static int afs_do_silly_rename(struct afs_vnode *dvnode, struct afs_vnode *vnode, + struct dentry *old, struct dentry *new, + struct key *key) +{ + struct afs_operation *op; + + _enter("%pd,%pd", old, new); + + op = afs_alloc_operation(key, dvnode->volume); + if (IS_ERR(op)) + return PTR_ERR(op); + + op->more_files = kvcalloc(2, sizeof(struct afs_vnode_param), GFP_KERNEL); + if (!op->more_files) { + afs_put_operation(op); + return -ENOMEM; + } + + afs_op_set_vnode(op, 0, dvnode); + afs_op_set_vnode(op, 1, dvnode); + op->file[0].dv_delta = 1; + op->file[1].dv_delta = 1; + op->file[0].modification = true; + op->file[1].modification = true; + op->file[0].update_ctime = true; + op->file[1].update_ctime = true; + op->more_files[0].vnode = AFS_FS_I(d_inode(old)); + op->more_files[0].speculative = true; + op->more_files[1].vnode = AFS_FS_I(d_inode(new)); + op->more_files[1].speculative = true; + op->nr_files = 4; + + op->dentry = old; + op->dentry_2 = new; + op->ops = &afs_silly_rename_operation; + + trace_afs_silly_rename(vnode, false); + return afs_do_sync_operation(op); +} + +/* + * Perform silly-rename of a dentry. + * + * AFS is stateless and the server doesn't know when the client is holding a + * file open. To prevent application problems when a file is unlinked while + * it's still open, the client performs a "silly-rename". That is, it renames + * the file to a hidden file in the same directory, and only performs the + * unlink once the last reference to it is put. + * + * The final cleanup is done during dentry_iput. + */ +int afs_sillyrename(struct afs_vnode *dvnode, struct afs_vnode *vnode, + struct dentry *dentry, struct key *key) +{ + static unsigned int sillycounter; + struct dentry *sdentry = NULL; + unsigned char silly[16]; + int ret = -EBUSY; + + _enter(""); + + /* We don't allow a dentry to be silly-renamed twice. */ + if (dentry->d_flags & DCACHE_NFSFS_RENAMED) + return -EBUSY; + + sdentry = NULL; + do { + dput(sdentry); + sillycounter++; + + /* Create a silly name. Note that the ".__afs" prefix is + * understood by the salvager and must not be changed. + */ + scnprintf(silly, sizeof(silly), ".__afs%04X", sillycounter); + sdentry = lookup_noperm(&QSTR(silly), dentry->d_parent); + + /* N.B. Better to return EBUSY here ... it could be dangerous + * to delete the file while it's in use. + */ + if (IS_ERR(sdentry)) + goto out; + } while (!d_is_negative(sdentry)); + + ihold(&vnode->netfs.inode); + + ret = afs_do_silly_rename(dvnode, vnode, dentry, sdentry, key); + switch (ret) { + case 0: + /* The rename succeeded. */ + set_bit(AFS_VNODE_SILLY_DELETED, &vnode->flags); + d_move(dentry, sdentry); + break; + case -ERESTARTSYS: + /* The result of the rename is unknown. Play it safe by forcing + * a new lookup. + */ + d_drop(dentry); + d_drop(sdentry); + } + + iput(&vnode->netfs.inode); + dput(sdentry); +out: + _leave(" = %d", ret); + return ret; +} + +static void afs_silly_unlink_success(struct afs_operation *op) +{ + _enter("op=%08x", op->debug_id); + afs_check_dir_conflict(op, &op->file[0]); + afs_vnode_commit_status(op, &op->file[0]); + afs_vnode_commit_status(op, &op->file[1]); + afs_update_dentry_version(op, &op->file[0], op->dentry); +} + +static void afs_silly_unlink_edit_dir(struct afs_operation *op) +{ + struct afs_vnode_param *dvp = &op->file[0]; + struct afs_vnode *dvnode = dvp->vnode; + + _enter("op=%08x", op->debug_id); + down_write(&dvnode->validate_lock); + if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags) && + dvnode->status.data_version == dvp->dv_before + dvp->dv_delta) + afs_edit_dir_remove(dvnode, &op->dentry->d_name, + afs_edit_dir_for_unlink); + up_write(&dvnode->validate_lock); +} + +static const struct afs_operation_ops afs_silly_unlink_operation = { + .issue_afs_rpc = afs_fs_remove_file, + .issue_yfs_rpc = yfs_fs_remove_file, + .success = afs_silly_unlink_success, + .aborted = afs_check_for_remote_deletion, + .edit_dir = afs_silly_unlink_edit_dir, +}; + +/* + * Tell the server to remove a sillyrename file. + */ +static int afs_do_silly_unlink(struct afs_vnode *dvnode, struct afs_vnode *vnode, + struct dentry *dentry, struct key *key) +{ + struct afs_operation *op; + + _enter(""); + + op = afs_alloc_operation(NULL, dvnode->volume); + if (IS_ERR(op)) + return PTR_ERR(op); + + afs_op_set_vnode(op, 0, dvnode); + afs_op_set_vnode(op, 1, vnode); + op->file[0].dv_delta = 1; + op->file[0].modification = true; + op->file[0].update_ctime = true; + op->file[1].op_unlinked = true; + op->file[1].update_ctime = true; + + op->dentry = dentry; + op->ops = &afs_silly_unlink_operation; + + trace_afs_silly_rename(vnode, true); + afs_begin_vnode_operation(op); + afs_wait_for_operation(op); + + /* If there was a conflict with a third party, check the status of the + * unlinked vnode. + */ + if (op->cumul_error.error == 0 && (op->flags & AFS_OPERATION_DIR_CONFLICT)) { + op->file[1].update_ctime = false; + op->fetch_status.which = 1; + op->ops = &afs_fetch_status_operation; + afs_begin_vnode_operation(op); + afs_wait_for_operation(op); + } + + return afs_put_operation(op); +} + +/* + * Remove sillyrename file on iput. + */ +int afs_silly_iput(struct dentry *dentry, struct inode *inode) +{ + struct afs_vnode *dvnode = AFS_FS_I(d_inode(dentry->d_parent)); + struct afs_vnode *vnode = AFS_FS_I(inode); + struct dentry *alias; + int ret; + + DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); + + _enter("%p{%pd},%llx", dentry, dentry, vnode->fid.vnode); + + down_read(&dvnode->rmdir_lock); + + alias = d_alloc_parallel(dentry->d_parent, &dentry->d_name, &wq); + if (IS_ERR(alias)) { + up_read(&dvnode->rmdir_lock); + return 0; + } + + if (!d_in_lookup(alias)) { + /* We raced with lookup... See if we need to transfer the + * sillyrename information to the aliased dentry. + */ + ret = 0; + spin_lock(&alias->d_lock); + if (d_really_is_positive(alias) && + !(alias->d_flags & DCACHE_NFSFS_RENAMED)) { + alias->d_flags |= DCACHE_NFSFS_RENAMED; + ret = 1; + } + spin_unlock(&alias->d_lock); + up_read(&dvnode->rmdir_lock); + dput(alias); + return ret; + } + + /* Stop lock-release from complaining. */ + spin_lock(&vnode->lock); + vnode->lock_state = AFS_VNODE_LOCK_DELETED; + trace_afs_flock_ev(vnode, NULL, afs_flock_silly_delete, 0); + spin_unlock(&vnode->lock); + + afs_do_silly_unlink(dvnode, vnode, dentry, dvnode->silly_key); + up_read(&dvnode->rmdir_lock); + d_lookup_done(alias); + dput(alias); + return 1; +} diff --git a/fs/afs/dynroot.c b/fs/afs/dynroot.c new file mode 100644 index 000000000000..aa56e8951e03 --- /dev/null +++ b/fs/afs/dynroot.c @@ -0,0 +1,406 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* AFS dynamic root handling + * + * Copyright (C) 2018 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#include <linux/fs.h> +#include <linux/namei.h> +#include <linux/dns_resolver.h> +#include "internal.h" + +#define AFS_MIN_DYNROOT_CELL_INO 4 /* Allow for ., .., @cell, .@cell */ +#define AFS_MAX_DYNROOT_CELL_INO ((unsigned int)INT_MAX) + +static struct dentry *afs_lookup_atcell(struct inode *dir, struct dentry *dentry, ino_t ino); + +/* + * iget5() comparator for inode created by autocell operations + */ +static int afs_iget5_pseudo_test(struct inode *inode, void *opaque) +{ + struct afs_fid *fid = opaque; + + return inode->i_ino == fid->vnode; +} + +/* + * iget5() inode initialiser + */ +static int afs_iget5_pseudo_set(struct inode *inode, void *opaque) +{ + struct afs_super_info *as = AFS_FS_S(inode->i_sb); + struct afs_vnode *vnode = AFS_FS_I(inode); + struct afs_fid *fid = opaque; + + vnode->volume = as->volume; + vnode->fid = *fid; + inode->i_ino = fid->vnode; + inode->i_generation = fid->unique; + return 0; +} + +/* + * Create an inode for an autocell dynamic automount dir. + */ +static struct inode *afs_iget_pseudo_dir(struct super_block *sb, ino_t ino) +{ + struct afs_vnode *vnode; + struct inode *inode; + struct afs_fid fid = { .vnode = ino, .unique = 1, }; + + _enter(""); + + inode = iget5_locked(sb, fid.vnode, + afs_iget5_pseudo_test, afs_iget5_pseudo_set, &fid); + if (!inode) { + _leave(" = -ENOMEM"); + return ERR_PTR(-ENOMEM); + } + + _debug("GOT INODE %p { ino=%lu, vl=%llx, vn=%llx, u=%x }", + inode, inode->i_ino, fid.vid, fid.vnode, fid.unique); + + vnode = AFS_FS_I(inode); + + if (inode_state_read_once(inode) & I_NEW) { + netfs_inode_init(&vnode->netfs, NULL, false); + simple_inode_init_ts(inode); + set_nlink(inode, 2); + inode->i_size = 0; + inode->i_mode = S_IFDIR | 0555; + inode->i_op = &afs_autocell_inode_operations; + inode->i_uid = GLOBAL_ROOT_UID; + inode->i_gid = GLOBAL_ROOT_GID; + inode->i_blocks = 0; + inode->i_generation = 0; + inode->i_flags |= S_AUTOMOUNT | S_NOATIME; + + set_bit(AFS_VNODE_PSEUDODIR, &vnode->flags); + set_bit(AFS_VNODE_MOUNTPOINT, &vnode->flags); + + unlock_new_inode(inode); + } + _leave(" = %p", inode); + return inode; +} + +/* + * Try to automount the mountpoint with pseudo directory, if the autocell + * option is set. + */ +static struct dentry *afs_dynroot_lookup_cell(struct inode *dir, struct dentry *dentry, + unsigned int flags) +{ + struct afs_cell *cell = NULL; + struct afs_net *net = afs_d2net(dentry); + struct inode *inode = NULL; + const char *name = dentry->d_name.name; + size_t len = dentry->d_name.len; + bool dotted = false; + int ret = -ENOENT; + + /* Names prefixed with a dot are R/W mounts. */ + if (name[0] == '.') { + name++; + len--; + dotted = true; + } + + cell = afs_lookup_cell(net, name, len, NULL, + AFS_LOOKUP_CELL_DYNROOT, + afs_cell_trace_use_lookup_dynroot); + if (IS_ERR(cell)) { + ret = PTR_ERR(cell); + goto out_no_cell; + } + + inode = afs_iget_pseudo_dir(dir->i_sb, cell->dynroot_ino * 2 + dotted); + if (IS_ERR(inode)) { + ret = PTR_ERR(inode); + goto out; + } + + dentry->d_fsdata = cell; + return d_splice_alias(inode, dentry); + +out: + afs_unuse_cell(cell, afs_cell_trace_unuse_lookup_dynroot); +out_no_cell: + if (!inode) + return d_splice_alias(inode, dentry); + return ret == -ENOENT ? NULL : ERR_PTR(ret); +} + +/* + * Look up an entry in a dynroot directory. + */ +static struct dentry *afs_dynroot_lookup(struct inode *dir, struct dentry *dentry, + unsigned int flags) +{ + _enter("%pd", dentry); + + if (flags & LOOKUP_CREATE) + return ERR_PTR(-EOPNOTSUPP); + + if (dentry->d_name.len >= AFSNAMEMAX) { + _leave(" = -ENAMETOOLONG"); + return ERR_PTR(-ENAMETOOLONG); + } + + if (dentry->d_name.len == 5 && + memcmp(dentry->d_name.name, "@cell", 5) == 0) + return afs_lookup_atcell(dir, dentry, 2); + + if (dentry->d_name.len == 6 && + memcmp(dentry->d_name.name, ".@cell", 6) == 0) + return afs_lookup_atcell(dir, dentry, 3); + + return afs_dynroot_lookup_cell(dir, dentry, flags); +} + +const struct inode_operations afs_dynroot_inode_operations = { + .lookup = afs_dynroot_lookup, +}; + +static void afs_dynroot_d_release(struct dentry *dentry) +{ + struct afs_cell *cell = dentry->d_fsdata; + + afs_unuse_cell(cell, afs_cell_trace_unuse_dynroot_mntpt); +} + +/* + * Keep @cell symlink dentries around, but only keep cell autodirs when they're + * being used. + */ +static int afs_dynroot_delete_dentry(const struct dentry *dentry) +{ + const struct qstr *name = &dentry->d_name; + + if (name->len == 5 && memcmp(name->name, "@cell", 5) == 0) + return 0; + if (name->len == 6 && memcmp(name->name, ".@cell", 6) == 0) + return 0; + return 1; +} + +const struct dentry_operations afs_dynroot_dentry_operations = { + .d_delete = afs_dynroot_delete_dentry, + .d_release = afs_dynroot_d_release, + .d_automount = afs_d_automount, +}; + +static void afs_atcell_delayed_put_cell(void *arg) +{ + struct afs_cell *cell = arg; + + afs_put_cell(cell, afs_cell_trace_put_atcell); +} + +/* + * Read @cell or .@cell symlinks. + */ +static const char *afs_atcell_get_link(struct dentry *dentry, struct inode *inode, + struct delayed_call *done) +{ + struct afs_vnode *vnode = AFS_FS_I(inode); + struct afs_cell *cell; + struct afs_net *net = afs_i2net(inode); + const char *name; + bool dotted = vnode->fid.vnode == 3; + + if (!rcu_access_pointer(net->ws_cell)) + return ERR_PTR(-ENOENT); + + if (!dentry) { + /* We're in RCU-pathwalk. */ + cell = rcu_dereference(net->ws_cell); + if (dotted) + name = cell->name - 1; + else + name = cell->name; + /* Shouldn't need to set a delayed call. */ + return name; + } + + down_read(&net->cells_lock); + + cell = rcu_dereference_protected(net->ws_cell, lockdep_is_held(&net->cells_lock)); + if (dotted) + name = cell->name - 1; + else + name = cell->name; + afs_get_cell(cell, afs_cell_trace_get_atcell); + set_delayed_call(done, afs_atcell_delayed_put_cell, cell); + + up_read(&net->cells_lock); + return name; +} + +static const struct inode_operations afs_atcell_inode_operations = { + .get_link = afs_atcell_get_link, +}; + +/* + * Create an inode for the @cell or .@cell symlinks. + */ +static struct dentry *afs_lookup_atcell(struct inode *dir, struct dentry *dentry, ino_t ino) +{ + struct afs_vnode *vnode; + struct inode *inode; + struct afs_fid fid = { .vnode = ino, .unique = 1, }; + + inode = iget5_locked(dir->i_sb, fid.vnode, + afs_iget5_pseudo_test, afs_iget5_pseudo_set, &fid); + if (!inode) + return ERR_PTR(-ENOMEM); + + vnode = AFS_FS_I(inode); + + if (inode_state_read_once(inode) & I_NEW) { + netfs_inode_init(&vnode->netfs, NULL, false); + simple_inode_init_ts(inode); + set_nlink(inode, 1); + inode->i_size = 0; + inode->i_mode = S_IFLNK | 0555; + inode->i_op = &afs_atcell_inode_operations; + inode->i_uid = GLOBAL_ROOT_UID; + inode->i_gid = GLOBAL_ROOT_GID; + inode->i_blocks = 0; + inode->i_generation = 0; + inode->i_flags |= S_NOATIME; + + unlock_new_inode(inode); + } + return d_splice_alias(inode, dentry); +} + +/* + * Transcribe the cell database into readdir content under the RCU read lock. + * Each cell produces two entries, one prefixed with a dot and one not. + */ +static int afs_dynroot_readdir_cells(struct afs_net *net, struct dir_context *ctx) +{ + const struct afs_cell *cell; + loff_t newpos; + + _enter("%llu", ctx->pos); + + for (;;) { + unsigned int ix = ctx->pos >> 1; + + cell = idr_get_next(&net->cells_dyn_ino, &ix); + if (!cell) + return 0; + if (READ_ONCE(cell->state) == AFS_CELL_REMOVING || + READ_ONCE(cell->state) == AFS_CELL_DEAD) { + ctx->pos += 2; + ctx->pos &= ~1; + continue; + } + + newpos = ix << 1; + if (newpos > ctx->pos) + ctx->pos = newpos; + + _debug("pos %llu -> cell %u", ctx->pos, cell->dynroot_ino); + + if ((ctx->pos & 1) == 0) { + if (!dir_emit(ctx, cell->name, cell->name_len, + cell->dynroot_ino, DT_DIR)) + return 0; + ctx->pos++; + } + if ((ctx->pos & 1) == 1) { + if (!dir_emit(ctx, cell->name - 1, cell->name_len + 1, + cell->dynroot_ino + 1, DT_DIR)) + return 0; + ctx->pos++; + } + } + return 0; +} + +/* + * Read the AFS dynamic root directory. This produces a list of cellnames, + * dotted and undotted, along with @cell and .@cell links if configured. + */ +static int afs_dynroot_readdir(struct file *file, struct dir_context *ctx) +{ + struct afs_net *net = afs_d2net(file->f_path.dentry); + int ret = 0; + + if (!dir_emit_dots(file, ctx)) + return 0; + + if (ctx->pos == 2) { + if (rcu_access_pointer(net->ws_cell) && + !dir_emit(ctx, "@cell", 5, 2, DT_LNK)) + return 0; + ctx->pos = 3; + } + if (ctx->pos == 3) { + if (rcu_access_pointer(net->ws_cell) && + !dir_emit(ctx, ".@cell", 6, 3, DT_LNK)) + return 0; + ctx->pos = 4; + } + + if ((unsigned long long)ctx->pos <= AFS_MAX_DYNROOT_CELL_INO) { + down_read(&net->cells_lock); + ret = afs_dynroot_readdir_cells(net, ctx); + up_read(&net->cells_lock); + } + return ret; +} + +static const struct file_operations afs_dynroot_file_operations = { + .llseek = generic_file_llseek, + .read = generic_read_dir, + .iterate_shared = afs_dynroot_readdir, + .fsync = noop_fsync, +}; + +/* + * Create an inode for a dynamic root directory. + */ +struct inode *afs_dynroot_iget_root(struct super_block *sb) +{ + struct afs_super_info *as = AFS_FS_S(sb); + struct afs_vnode *vnode; + struct inode *inode; + struct afs_fid fid = { .vid = 0, .vnode = 1, .unique = 1,}; + + if (as->volume) + fid.vid = as->volume->vid; + + inode = iget5_locked(sb, fid.vnode, + afs_iget5_pseudo_test, afs_iget5_pseudo_set, &fid); + if (!inode) + return ERR_PTR(-ENOMEM); + + vnode = AFS_FS_I(inode); + + /* there shouldn't be an existing inode */ + if (inode_state_read_once(inode) & I_NEW) { + netfs_inode_init(&vnode->netfs, NULL, false); + simple_inode_init_ts(inode); + set_nlink(inode, 2); + inode->i_size = 0; + inode->i_mode = S_IFDIR | 0555; + inode->i_op = &afs_dynroot_inode_operations; + inode->i_fop = &afs_dynroot_file_operations; + inode->i_uid = GLOBAL_ROOT_UID; + inode->i_gid = GLOBAL_ROOT_GID; + inode->i_blocks = 0; + inode->i_generation = 0; + inode->i_flags |= S_NOATIME; + + set_bit(AFS_VNODE_PSEUDODIR, &vnode->flags); + unlock_new_inode(inode); + } + _leave(" = %p", inode); + return inode; +} diff --git a/fs/afs/file.c b/fs/afs/file.c index 66d50fe2ee45..f66a92294284 100644 --- a/fs/afs/file.c +++ b/fs/afs/file.c @@ -1,12 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* AFS filesystem file handling * * Copyright (C) 2002, 2007 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. */ #include <linux/kernel.h> @@ -16,27 +12,32 @@ #include <linux/pagemap.h> #include <linux/writeback.h> #include <linux/gfp.h> +#include <linux/task_io_accounting_ops.h> +#include <linux/mm.h> +#include <linux/swap.h> +#include <linux/netfs.h> +#include <trace/events/netfs.h> #include "internal.h" -static int afs_readpage(struct file *file, struct page *page); -static void afs_invalidatepage(struct page *page, unsigned int offset, - unsigned int length); -static int afs_releasepage(struct page *page, gfp_t gfp_flags); -static int afs_launder_page(struct page *page); +static int afs_file_mmap_prepare(struct vm_area_desc *desc); -static int afs_readpages(struct file *filp, struct address_space *mapping, - struct list_head *pages, unsigned nr_pages); +static ssize_t afs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter); +static ssize_t afs_file_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, + size_t len, unsigned int flags); +static void afs_vm_open(struct vm_area_struct *area); +static void afs_vm_close(struct vm_area_struct *area); +static vm_fault_t afs_vm_map_pages(struct vm_fault *vmf, pgoff_t start_pgoff, pgoff_t end_pgoff); const struct file_operations afs_file_operations = { .open = afs_open, .release = afs_release, .llseek = generic_file_llseek, - .read = do_sync_read, - .write = do_sync_write, - .aio_read = generic_file_aio_read, - .aio_write = afs_file_write, - .mmap = generic_file_readonly_mmap, - .splice_read = generic_file_splice_read, + .read_iter = afs_file_read_iter, + .write_iter = netfs_file_write_iter, + .mmap_prepare = afs_file_mmap_prepare, + .splice_read = afs_file_splice_read, + .splice_write = iter_file_splice_write, .fsync = afs_fsync, .lock = afs_lock, .flock = afs_flock, @@ -48,45 +49,120 @@ const struct inode_operations afs_file_inode_operations = { .permission = afs_permission, }; -const struct address_space_operations afs_fs_aops = { - .readpage = afs_readpage, - .readpages = afs_readpages, - .set_page_dirty = afs_set_page_dirty, - .launder_page = afs_launder_page, - .releasepage = afs_releasepage, - .invalidatepage = afs_invalidatepage, - .write_begin = afs_write_begin, - .write_end = afs_write_end, - .writepage = afs_writepage, +const struct address_space_operations afs_file_aops = { + .direct_IO = noop_direct_IO, + .read_folio = netfs_read_folio, + .readahead = netfs_readahead, + .dirty_folio = netfs_dirty_folio, + .release_folio = netfs_release_folio, + .invalidate_folio = netfs_invalidate_folio, + .migrate_folio = filemap_migrate_folio, .writepages = afs_writepages, }; +static const struct vm_operations_struct afs_vm_ops = { + .open = afs_vm_open, + .close = afs_vm_close, + .fault = filemap_fault, + .map_pages = afs_vm_map_pages, + .page_mkwrite = afs_page_mkwrite, +}; + +/* + * Discard a pin on a writeback key. + */ +void afs_put_wb_key(struct afs_wb_key *wbk) +{ + if (wbk && refcount_dec_and_test(&wbk->usage)) { + key_put(wbk->key); + kfree(wbk); + } +} + +/* + * Cache key for writeback. + */ +int afs_cache_wb_key(struct afs_vnode *vnode, struct afs_file *af) +{ + struct afs_wb_key *wbk, *p; + + wbk = kzalloc(sizeof(struct afs_wb_key), GFP_KERNEL); + if (!wbk) + return -ENOMEM; + refcount_set(&wbk->usage, 2); + wbk->key = af->key; + + spin_lock(&vnode->wb_lock); + list_for_each_entry(p, &vnode->wb_keys, vnode_link) { + if (p->key == wbk->key) + goto found; + } + + key_get(wbk->key); + list_add_tail(&wbk->vnode_link, &vnode->wb_keys); + spin_unlock(&vnode->wb_lock); + af->wb = wbk; + return 0; + +found: + refcount_inc(&p->usage); + spin_unlock(&vnode->wb_lock); + af->wb = p; + kfree(wbk); + return 0; +} + /* * open an AFS file or directory and attach a key to it */ int afs_open(struct inode *inode, struct file *file) { struct afs_vnode *vnode = AFS_FS_I(inode); + struct afs_file *af; struct key *key; int ret; - _enter("{%x:%u},", vnode->fid.vid, vnode->fid.vnode); + _enter("{%llx:%llu},", vnode->fid.vid, vnode->fid.vnode); key = afs_request_key(vnode->volume->cell); if (IS_ERR(key)) { - _leave(" = %ld [key]", PTR_ERR(key)); - return PTR_ERR(key); + ret = PTR_ERR(key); + goto error; } + af = kzalloc(sizeof(*af), GFP_KERNEL); + if (!af) { + ret = -ENOMEM; + goto error_key; + } + af->key = key; + ret = afs_validate(vnode, key); - if (ret < 0) { - _leave(" = %d [val]", ret); - return ret; + if (ret < 0) + goto error_af; + + if (file->f_mode & FMODE_WRITE) { + ret = afs_cache_wb_key(vnode, af); + if (ret < 0) + goto error_af; } - file->private_data = key; + if (file->f_flags & O_TRUNC) + set_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags); + + fscache_use_cookie(afs_vnode_cache(vnode), file->f_mode & FMODE_WRITE); + + file->private_data = af; _leave(" = 0"); return 0; + +error_af: + kfree(af); +error_key: + key_put(key); +error: + _leave(" = %d", ret); + return ret; } /* @@ -94,287 +170,397 @@ int afs_open(struct inode *inode, struct file *file) */ int afs_release(struct inode *inode, struct file *file) { + struct afs_vnode_cache_aux aux; struct afs_vnode *vnode = AFS_FS_I(inode); + struct afs_file *af = file->private_data; + loff_t i_size; + int ret = 0; - _enter("{%x:%u},", vnode->fid.vid, vnode->fid.vnode); + _enter("{%llx:%llu},", vnode->fid.vid, vnode->fid.vnode); - key_put(file->private_data); - _leave(" = 0"); - return 0; + if ((file->f_mode & FMODE_WRITE)) + ret = vfs_fsync(file, 0); + + file->private_data = NULL; + if (af->wb) + afs_put_wb_key(af->wb); + + if ((file->f_mode & FMODE_WRITE)) { + i_size = i_size_read(&vnode->netfs.inode); + afs_set_cache_aux(vnode, &aux); + fscache_unuse_cookie(afs_vnode_cache(vnode), &aux, &i_size); + } else { + fscache_unuse_cookie(afs_vnode_cache(vnode), NULL, NULL); + } + + key_put(af->key); + kfree(af); + afs_prune_wb_keys(vnode); + _leave(" = %d", ret); + return ret; } -#ifdef CONFIG_AFS_FSCACHE -/* - * deal with notification that a page was read from the cache - */ -static void afs_file_readpage_read_complete(struct page *page, - void *data, - int error) +static void afs_fetch_data_notify(struct afs_operation *op) { - _enter("%p,%p,%d", page, data, error); + struct netfs_io_subrequest *subreq = op->fetch.subreq; - /* if the read completes with an error, we just unlock the page and let - * the VM reissue the readpage */ - if (!error) - SetPageUptodate(page); - unlock_page(page); + subreq->error = afs_op_error(op); + netfs_read_subreq_terminated(subreq); } -#endif -/* - * read page from file, directory or symlink, given a key to use - */ -int afs_page_filler(void *data, struct page *page) +static void afs_fetch_data_success(struct afs_operation *op) { - struct inode *inode = page->mapping->host; - struct afs_vnode *vnode = AFS_FS_I(inode); - struct key *key = data; - size_t len; - off_t offset; - int ret; + struct afs_vnode *vnode = op->file[0].vnode; - _enter("{%x},{%lu},{%lu}", key_serial(key), inode->i_ino, page->index); + _enter("op=%08x", op->debug_id); + afs_vnode_commit_status(op, &op->file[0]); + afs_stat_v(vnode, n_fetches); + atomic_long_add(op->fetch.subreq->transferred, &op->net->n_fetch_bytes); + afs_fetch_data_notify(op); +} - BUG_ON(!PageLocked(page)); +static void afs_fetch_data_aborted(struct afs_operation *op) +{ + afs_check_for_remote_deletion(op); + afs_fetch_data_notify(op); +} - ret = -ESTALE; - if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) - goto error; +const struct afs_operation_ops afs_fetch_data_operation = { + .issue_afs_rpc = afs_fs_fetch_data, + .issue_yfs_rpc = yfs_fs_fetch_data, + .success = afs_fetch_data_success, + .aborted = afs_fetch_data_aborted, + .failed = afs_fetch_data_notify, +}; - /* is it cached? */ -#ifdef CONFIG_AFS_FSCACHE - ret = fscache_read_or_alloc_page(vnode->cache, - page, - afs_file_readpage_read_complete, - NULL, - GFP_KERNEL); -#else - ret = -ENOBUFS; -#endif - switch (ret) { - /* read BIO submitted (page in cache) */ - case 0: - break; +static void afs_issue_read_call(struct afs_operation *op) +{ + op->call_responded = false; + op->call_error = 0; + op->call_abort_code = 0; + if (test_bit(AFS_SERVER_FL_IS_YFS, &op->server->flags)) + yfs_fs_fetch_data(op); + else + afs_fs_fetch_data(op); +} - /* page not yet cached */ - case -ENODATA: - _debug("cache said ENODATA"); - goto go_on; +static void afs_end_read(struct afs_operation *op) +{ + if (op->call_responded && op->server) + set_bit(AFS_SERVER_FL_RESPONDING, &op->server->flags); + + if (!afs_op_error(op)) + afs_fetch_data_success(op); + else if (op->cumul_error.aborted) + afs_fetch_data_aborted(op); + else + afs_fetch_data_notify(op); + + afs_end_vnode_operation(op); + afs_put_operation(op); +} - /* page will not be cached */ - case -ENOBUFS: - _debug("cache said ENOBUFS"); - default: - go_on: - offset = page->index << PAGE_CACHE_SHIFT; - len = min_t(size_t, i_size_read(inode) - offset, PAGE_SIZE); - - /* read the contents of the file from the server into the - * page */ - ret = afs_vnode_fetch_data(vnode, key, offset, len, page); - if (ret < 0) { - if (ret == -ENOENT) { - _debug("got NOENT from server" - " - marking file deleted and stale"); - set_bit(AFS_VNODE_DELETED, &vnode->flags); - ret = -ESTALE; - } - -#ifdef CONFIG_AFS_FSCACHE - fscache_uncache_page(vnode->cache, page); -#endif - BUG_ON(PageFsCache(page)); - goto error; - } +/* + * Perform I/O processing on an asynchronous call. The work item carries a ref + * to the call struct that we either need to release or to pass on. + */ +static void afs_read_receive(struct afs_call *call) +{ + struct afs_operation *op = call->op; + enum afs_call_state state; - SetPageUptodate(page); + _enter(""); - /* send the page to the cache */ -#ifdef CONFIG_AFS_FSCACHE - if (PageFsCache(page) && - fscache_write_page(vnode->cache, page, GFP_KERNEL) != 0) { - fscache_uncache_page(vnode->cache, page); - BUG_ON(PageFsCache(page)); - } -#endif - unlock_page(page); + state = READ_ONCE(call->state); + if (state == AFS_CALL_COMPLETE) + return; + trace_afs_read_recv(op, call); + + while (state < AFS_CALL_COMPLETE && READ_ONCE(call->need_attention)) { + WRITE_ONCE(call->need_attention, false); + afs_deliver_to_call(call); + state = READ_ONCE(call->state); } - _leave(" = 0"); - return 0; + if (state < AFS_CALL_COMPLETE) { + netfs_read_subreq_progress(op->fetch.subreq); + if (rxrpc_kernel_check_life(call->net->socket, call->rxcall)) + return; + /* rxrpc terminated the call. */ + afs_set_call_complete(call, call->error, call->abort_code); + } -error: - SetPageError(page); - unlock_page(page); - _leave(" = %d", ret); - return ret; + op->call_abort_code = call->abort_code; + op->call_error = call->error; + op->call_responded = call->responded; + op->call = NULL; + call->op = NULL; + afs_put_call(call); + + /* If the call failed, then we need to crank the server rotation + * handle and try the next. + */ + if (afs_select_fileserver(op)) { + afs_issue_read_call(op); + return; + } + + afs_end_read(op); } -/* - * read page from file, directory or symlink, given a file to nominate the key - * to be used - */ -static int afs_readpage(struct file *file, struct page *page) +void afs_fetch_data_async_rx(struct work_struct *work) { - struct key *key; - int ret; + struct afs_call *call = container_of(work, struct afs_call, async_work); - if (file) { - key = file->private_data; - ASSERT(key != NULL); - ret = afs_page_filler(key, page); - } else { - struct inode *inode = page->mapping->host; - key = afs_request_key(AFS_FS_S(inode->i_sb)->volume->cell); - if (IS_ERR(key)) { - ret = PTR_ERR(key); - } else { - ret = afs_page_filler(key, page); - key_put(key); - } + afs_read_receive(call); + afs_put_call(call); +} + +void afs_fetch_data_immediate_cancel(struct afs_call *call) +{ + if (call->async) { + afs_get_call(call, afs_call_trace_wake); + if (!queue_work(afs_async_calls, &call->async_work)) + afs_deferred_put_call(call); + flush_work(&call->async_work); } - return ret; } /* - * read a set of pages + * Fetch file data from the volume. */ -static int afs_readpages(struct file *file, struct address_space *mapping, - struct list_head *pages, unsigned nr_pages) +static void afs_issue_read(struct netfs_io_subrequest *subreq) { - struct key *key = file->private_data; - struct afs_vnode *vnode; - int ret = 0; + struct afs_operation *op; + struct afs_vnode *vnode = AFS_FS_I(subreq->rreq->inode); + struct key *key = subreq->rreq->netfs_priv; + + _enter("%s{%llx:%llu.%u},%x,,,", + vnode->volume->name, + vnode->fid.vid, + vnode->fid.vnode, + vnode->fid.unique, + key_serial(key)); + + op = afs_alloc_operation(key, vnode->volume); + if (IS_ERR(op)) { + subreq->error = PTR_ERR(op); + netfs_read_subreq_terminated(subreq); + return; + } - _enter("{%d},{%lu},,%d", - key_serial(key), mapping->host->i_ino, nr_pages); + afs_op_set_vnode(op, 0, vnode); + + op->fetch.subreq = subreq; + op->ops = &afs_fetch_data_operation; + + trace_netfs_sreq(subreq, netfs_sreq_trace_submit); + + if (subreq->rreq->origin == NETFS_READAHEAD || + subreq->rreq->iocb) { + op->flags |= AFS_OPERATION_ASYNC; + + if (!afs_begin_vnode_operation(op)) { + subreq->error = afs_put_operation(op); + netfs_read_subreq_terminated(subreq); + return; + } - ASSERT(key != NULL); + if (!afs_select_fileserver(op)) { + afs_end_read(op); + return; + } - vnode = AFS_FS_I(mapping->host); - if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) { - _leave(" = -ESTALE"); - return -ESTALE; + afs_issue_read_call(op); + } else { + afs_do_sync_operation(op); } +} - /* attempt to read as many of the pages as possible */ -#ifdef CONFIG_AFS_FSCACHE - ret = fscache_read_or_alloc_pages(vnode->cache, - mapping, - pages, - &nr_pages, - afs_file_readpage_read_complete, - NULL, - mapping_gfp_mask(mapping)); -#else - ret = -ENOBUFS; -#endif - - switch (ret) { - /* all pages are being read from the cache */ - case 0: - BUG_ON(!list_empty(pages)); - BUG_ON(nr_pages != 0); - _leave(" = 0 [reading all]"); - return 0; - - /* there were pages that couldn't be read from the cache */ - case -ENODATA: - case -ENOBUFS: - break; +static int afs_init_request(struct netfs_io_request *rreq, struct file *file) +{ + struct afs_vnode *vnode = AFS_FS_I(rreq->inode); - /* other error */ + if (file) + rreq->netfs_priv = key_get(afs_file_key(file)); + rreq->rsize = 256 * 1024; + rreq->wsize = 256 * 1024 * 1024; + + switch (rreq->origin) { + case NETFS_READ_SINGLE: + if (!file) { + struct key *key = afs_request_key(vnode->volume->cell); + + if (IS_ERR(key)) + return PTR_ERR(key); + rreq->netfs_priv = key; + } + break; + case NETFS_WRITEBACK: + case NETFS_WRITETHROUGH: + case NETFS_UNBUFFERED_WRITE: + case NETFS_DIO_WRITE: + if (S_ISREG(rreq->inode->i_mode)) + rreq->io_streams[0].avail = true; + break; + case NETFS_WRITEBACK_SINGLE: default: - _leave(" = %d", ret); - return ret; + break; } + return 0; +} - /* load the missing pages from the network */ - ret = read_cache_pages(mapping, pages, afs_page_filler, key); +static int afs_check_write_begin(struct file *file, loff_t pos, unsigned len, + struct folio **foliop, void **_fsdata) +{ + struct afs_vnode *vnode = AFS_FS_I(file_inode(file)); - _leave(" = %d [netting]", ret); - return ret; + return test_bit(AFS_VNODE_DELETED, &vnode->flags) ? -ESTALE : 0; } -/* - * write back a dirty page - */ -static int afs_launder_page(struct page *page) +static void afs_free_request(struct netfs_io_request *rreq) { - _enter("{%lu}", page->index); + key_put(rreq->netfs_priv); + afs_put_wb_key(rreq->netfs_priv2); +} - return 0; +static void afs_update_i_size(struct inode *inode, loff_t new_i_size) +{ + struct afs_vnode *vnode = AFS_FS_I(inode); + loff_t i_size; + + write_seqlock(&vnode->cb_lock); + i_size = i_size_read(&vnode->netfs.inode); + if (new_i_size > i_size) { + i_size_write(&vnode->netfs.inode, new_i_size); + inode_set_bytes(&vnode->netfs.inode, new_i_size); + } + write_sequnlock(&vnode->cb_lock); + fscache_update_cookie(afs_vnode_cache(vnode), NULL, &new_i_size); } -/* - * invalidate part or all of a page - * - release a page and clean up its private data if offset is 0 (indicating - * the entire page) - */ -static void afs_invalidatepage(struct page *page, unsigned int offset, - unsigned int length) +static void afs_netfs_invalidate_cache(struct netfs_io_request *wreq) { - struct afs_writeback *wb = (struct afs_writeback *) page_private(page); + struct afs_vnode *vnode = AFS_FS_I(wreq->inode); - _enter("{%lu},%u,%u", page->index, offset, length); + afs_invalidate_cache(vnode, 0); +} - BUG_ON(!PageLocked(page)); +const struct netfs_request_ops afs_req_ops = { + .init_request = afs_init_request, + .free_request = afs_free_request, + .check_write_begin = afs_check_write_begin, + .issue_read = afs_issue_read, + .update_i_size = afs_update_i_size, + .invalidate_cache = afs_netfs_invalidate_cache, + .begin_writeback = afs_begin_writeback, + .prepare_write = afs_prepare_write, + .issue_write = afs_issue_write, + .retry_request = afs_retry_request, +}; - /* we clean up only if the entire page is being invalidated */ - if (offset == 0 && length == PAGE_CACHE_SIZE) { -#ifdef CONFIG_AFS_FSCACHE - if (PageFsCache(page)) { - struct afs_vnode *vnode = AFS_FS_I(page->mapping->host); - fscache_wait_on_page_write(vnode->cache, page); - fscache_uncache_page(vnode->cache, page); - } -#endif +static void afs_add_open_mmap(struct afs_vnode *vnode) +{ + if (atomic_inc_return(&vnode->cb_nr_mmap) == 1) { + down_write(&vnode->volume->open_mmaps_lock); - if (PagePrivate(page)) { - if (wb && !PageWriteback(page)) { - set_page_private(page, 0); - afs_put_writeback(wb); - } + if (list_empty(&vnode->cb_mmap_link)) + list_add_tail(&vnode->cb_mmap_link, &vnode->volume->open_mmaps); - if (!page_private(page)) - ClearPagePrivate(page); - } + up_write(&vnode->volume->open_mmaps_lock); } +} + +static void afs_drop_open_mmap(struct afs_vnode *vnode) +{ + if (atomic_add_unless(&vnode->cb_nr_mmap, -1, 1)) + return; - _leave(""); + down_write(&vnode->volume->open_mmaps_lock); + + read_seqlock_excl(&vnode->cb_lock); + // the only place where ->cb_nr_mmap may hit 0 + // see __afs_break_callback() for the other side... + if (atomic_dec_and_test(&vnode->cb_nr_mmap)) + list_del_init(&vnode->cb_mmap_link); + read_sequnlock_excl(&vnode->cb_lock); + + up_write(&vnode->volume->open_mmaps_lock); + flush_work(&vnode->cb_work); } /* - * release a page and clean up its private state if it's not busy - * - return true if the page can now be released, false if not + * Handle setting up a memory mapping on an AFS file. */ -static int afs_releasepage(struct page *page, gfp_t gfp_flags) +static int afs_file_mmap_prepare(struct vm_area_desc *desc) { - struct afs_writeback *wb = (struct afs_writeback *) page_private(page); - struct afs_vnode *vnode = AFS_FS_I(page->mapping->host); - - _enter("{{%x:%u}[%lu],%lx},%x", - vnode->fid.vid, vnode->fid.vnode, page->index, page->flags, - gfp_flags); - - /* deny if page is being written to the cache and the caller hasn't - * elected to wait */ -#ifdef CONFIG_AFS_FSCACHE - if (!fscache_maybe_release_page(vnode->cache, page, gfp_flags)) { - _leave(" = F [cache busy]"); - return 0; - } -#endif + struct afs_vnode *vnode = AFS_FS_I(file_inode(desc->file)); + int ret; - if (PagePrivate(page)) { - if (wb) { - set_page_private(page, 0); - afs_put_writeback(wb); - } - ClearPagePrivate(page); - } + afs_add_open_mmap(vnode); + + ret = generic_file_mmap_prepare(desc); + if (ret == 0) + desc->vm_ops = &afs_vm_ops; + else + afs_drop_open_mmap(vnode); + return ret; +} + +static void afs_vm_open(struct vm_area_struct *vma) +{ + afs_add_open_mmap(AFS_FS_I(file_inode(vma->vm_file))); +} + +static void afs_vm_close(struct vm_area_struct *vma) +{ + afs_drop_open_mmap(AFS_FS_I(file_inode(vma->vm_file))); +} + +static vm_fault_t afs_vm_map_pages(struct vm_fault *vmf, pgoff_t start_pgoff, pgoff_t end_pgoff) +{ + struct afs_vnode *vnode = AFS_FS_I(file_inode(vmf->vma->vm_file)); + + if (afs_check_validity(vnode)) + return filemap_map_pages(vmf, start_pgoff, end_pgoff); + return 0; +} + +static ssize_t afs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) +{ + struct inode *inode = file_inode(iocb->ki_filp); + struct afs_vnode *vnode = AFS_FS_I(inode); + struct afs_file *af = iocb->ki_filp->private_data; + ssize_t ret; + + if (iocb->ki_flags & IOCB_DIRECT) + return netfs_unbuffered_read_iter(iocb, iter); - /* indicate that the page can be released */ - _leave(" = T"); - return 1; + ret = netfs_start_io_read(inode); + if (ret < 0) + return ret; + ret = afs_validate(vnode, af->key); + if (ret == 0) + ret = filemap_read(iocb, iter, 0); + netfs_end_io_read(inode); + return ret; +} + +static ssize_t afs_file_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, + size_t len, unsigned int flags) +{ + struct inode *inode = file_inode(in); + struct afs_vnode *vnode = AFS_FS_I(inode); + struct afs_file *af = in->private_data; + ssize_t ret; + + ret = netfs_start_io_read(inode); + if (ret < 0) + return ret; + ret = afs_validate(vnode, af->key); + if (ret == 0) + ret = filemap_splice_read(in, ppos, pipe, len, flags); + netfs_end_io_read(inode); + return ret; } diff --git a/fs/afs/flock.c b/fs/afs/flock.c index a8cf2cff836c..f0e96a35093f 100644 --- a/fs/afs/flock.c +++ b/fs/afs/flock.c @@ -1,68 +1,47 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* AFS file locking support * * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. */ #include "internal.h" #define AFS_LOCK_GRANTED 0 #define AFS_LOCK_PENDING 1 +#define AFS_LOCK_YOUR_TRY 2 + +struct workqueue_struct *afs_lock_manager; +static void afs_next_locker(struct afs_vnode *vnode, int error); static void afs_fl_copy_lock(struct file_lock *new, struct file_lock *fl); static void afs_fl_release_private(struct file_lock *fl); -static struct workqueue_struct *afs_lock_manager; -static DEFINE_MUTEX(afs_lock_manager_mutex); - static const struct file_lock_operations afs_lock_ops = { .fl_copy_lock = afs_fl_copy_lock, .fl_release_private = afs_fl_release_private, }; -/* - * initialise the lock manager thread if it isn't already running - */ -static int afs_init_lock_manager(void) +static inline void afs_set_lock_state(struct afs_vnode *vnode, enum afs_lock_state state) { - int ret; - - ret = 0; - if (!afs_lock_manager) { - mutex_lock(&afs_lock_manager_mutex); - if (!afs_lock_manager) { - afs_lock_manager = - create_singlethread_workqueue("kafs_lockd"); - if (!afs_lock_manager) - ret = -ENOMEM; - } - mutex_unlock(&afs_lock_manager_mutex); - } - return ret; + _debug("STATE %u -> %u", vnode->lock_state, state); + vnode->lock_state = state; } -/* - * destroy the lock manager thread if it's running - */ -void __exit afs_kill_lock_manager(void) -{ - if (afs_lock_manager) - destroy_workqueue(afs_lock_manager); -} +static atomic_t afs_file_lock_debug_id; /* * if the callback is broken on this vnode, then the lock may now be available */ void afs_lock_may_be_available(struct afs_vnode *vnode) { - _enter("{%x:%u}", vnode->fid.vid, vnode->fid.vnode); + _enter("{%llx:%llu}", vnode->fid.vid, vnode->fid.vnode); - queue_delayed_work(afs_lock_manager, &vnode->lock_work, 0); + spin_lock(&vnode->lock); + if (vnode->lock_state == AFS_VNODE_LOCK_WAITING_FOR_CB) + afs_next_locker(vnode, 0); + trace_afs_flock_ev(vnode, NULL, afs_flock_callback_break, 0); + spin_unlock(&vnode->lock); } /* @@ -71,8 +50,36 @@ void afs_lock_may_be_available(struct afs_vnode *vnode) */ static void afs_schedule_lock_extension(struct afs_vnode *vnode) { - queue_delayed_work(afs_lock_manager, &vnode->lock_work, - AFS_LOCKWAIT * HZ / 2); + ktime_t expires_at, now, duration; + u64 duration_j; + + expires_at = ktime_add_ms(vnode->locked_at, AFS_LOCKWAIT * 1000 / 2); + now = ktime_get_real(); + duration = ktime_sub(expires_at, now); + if (duration <= 0) + duration_j = 0; + else + duration_j = nsecs_to_jiffies(ktime_to_ns(duration)); + + queue_delayed_work(afs_lock_manager, &vnode->lock_work, duration_j); +} + +/* + * In the case of successful completion of a lock operation, record the time + * the reply appeared and start the lock extension timer. + */ +void afs_lock_op_done(struct afs_call *call) +{ + struct afs_operation *op = call->op; + struct afs_vnode *vnode = op->file[0].vnode; + + if (call->error == 0) { + spin_lock(&vnode->lock); + trace_afs_flock_ev(vnode, NULL, afs_flock_timestamp, 0); + vnode->locked_at = call->issue_time; + afs_schedule_lock_extension(vnode); + spin_unlock(&vnode->lock); + } } /* @@ -80,22 +87,190 @@ static void afs_schedule_lock_extension(struct afs_vnode *vnode) * first lock in the queue is itself a readlock) * - the caller must hold the vnode lock */ -static void afs_grant_locks(struct afs_vnode *vnode, struct file_lock *fl) +static void afs_grant_locks(struct afs_vnode *vnode) { struct file_lock *p, *_p; + bool exclusive = (vnode->lock_type == AFS_LOCK_WRITE); - list_move_tail(&fl->fl_u.afs.link, &vnode->granted_locks); - if (fl->fl_type == F_RDLCK) { - list_for_each_entry_safe(p, _p, &vnode->pending_locks, - fl_u.afs.link) { - if (p->fl_type == F_RDLCK) { - p->fl_u.afs.state = AFS_LOCK_GRANTED; - list_move_tail(&p->fl_u.afs.link, - &vnode->granted_locks); - wake_up(&p->fl_wait); - } + list_for_each_entry_safe(p, _p, &vnode->pending_locks, fl_u.afs.link) { + if (!exclusive && lock_is_write(p)) + continue; + + list_move_tail(&p->fl_u.afs.link, &vnode->granted_locks); + p->fl_u.afs.state = AFS_LOCK_GRANTED; + trace_afs_flock_op(vnode, p, afs_flock_op_grant); + locks_wake_up(p); + } +} + +/* + * If an error is specified, reject every pending lock that matches the + * authentication and type of the lock we failed to get. If there are any + * remaining lockers, try to wake up one of them to have a go. + */ +static void afs_next_locker(struct afs_vnode *vnode, int error) +{ + struct file_lock *p, *_p, *next = NULL; + struct key *key = vnode->lock_key; + unsigned int type = F_RDLCK; + + _enter(""); + + if (vnode->lock_type == AFS_LOCK_WRITE) + type = F_WRLCK; + + list_for_each_entry_safe(p, _p, &vnode->pending_locks, fl_u.afs.link) { + if (error && + p->c.flc_type == type && + afs_file_key(p->c.flc_file) == key) { + list_del_init(&p->fl_u.afs.link); + p->fl_u.afs.state = error; + locks_wake_up(p); } + + /* Select the next locker to hand off to. */ + if (next && (lock_is_write(next) || lock_is_read(p))) + continue; + next = p; + } + + vnode->lock_key = NULL; + key_put(key); + + if (next) { + afs_set_lock_state(vnode, AFS_VNODE_LOCK_SETTING); + next->fl_u.afs.state = AFS_LOCK_YOUR_TRY; + trace_afs_flock_op(vnode, next, afs_flock_op_wake); + locks_wake_up(next); + } else { + afs_set_lock_state(vnode, AFS_VNODE_LOCK_NONE); + trace_afs_flock_ev(vnode, NULL, afs_flock_no_lockers, 0); } + + _leave(""); +} + +/* + * Kill off all waiters in the the pending lock queue due to the vnode being + * deleted. + */ +static void afs_kill_lockers_enoent(struct afs_vnode *vnode) +{ + struct file_lock *p; + + afs_set_lock_state(vnode, AFS_VNODE_LOCK_DELETED); + + while (!list_empty(&vnode->pending_locks)) { + p = list_entry(vnode->pending_locks.next, + struct file_lock, fl_u.afs.link); + list_del_init(&p->fl_u.afs.link); + p->fl_u.afs.state = -ENOENT; + locks_wake_up(p); + } + + key_put(vnode->lock_key); + vnode->lock_key = NULL; +} + +static void afs_lock_success(struct afs_operation *op) +{ + _enter("op=%08x", op->debug_id); + afs_vnode_commit_status(op, &op->file[0]); +} + +static const struct afs_operation_ops afs_set_lock_operation = { + .issue_afs_rpc = afs_fs_set_lock, + .issue_yfs_rpc = yfs_fs_set_lock, + .success = afs_lock_success, + .aborted = afs_check_for_remote_deletion, +}; + +/* + * Get a lock on a file + */ +static int afs_set_lock(struct afs_vnode *vnode, struct key *key, + afs_lock_type_t type) +{ + struct afs_operation *op; + + _enter("%s{%llx:%llu.%u},%x,%u", + vnode->volume->name, + vnode->fid.vid, + vnode->fid.vnode, + vnode->fid.unique, + key_serial(key), type); + + op = afs_alloc_operation(key, vnode->volume); + if (IS_ERR(op)) + return PTR_ERR(op); + + afs_op_set_vnode(op, 0, vnode); + + op->lock.type = type; + op->ops = &afs_set_lock_operation; + return afs_do_sync_operation(op); +} + +static const struct afs_operation_ops afs_extend_lock_operation = { + .issue_afs_rpc = afs_fs_extend_lock, + .issue_yfs_rpc = yfs_fs_extend_lock, + .success = afs_lock_success, +}; + +/* + * Extend a lock on a file + */ +static int afs_extend_lock(struct afs_vnode *vnode, struct key *key) +{ + struct afs_operation *op; + + _enter("%s{%llx:%llu.%u},%x", + vnode->volume->name, + vnode->fid.vid, + vnode->fid.vnode, + vnode->fid.unique, + key_serial(key)); + + op = afs_alloc_operation(key, vnode->volume); + if (IS_ERR(op)) + return PTR_ERR(op); + + afs_op_set_vnode(op, 0, vnode); + + op->flags |= AFS_OPERATION_UNINTR; + op->ops = &afs_extend_lock_operation; + return afs_do_sync_operation(op); +} + +static const struct afs_operation_ops afs_release_lock_operation = { + .issue_afs_rpc = afs_fs_release_lock, + .issue_yfs_rpc = yfs_fs_release_lock, + .success = afs_lock_success, +}; + +/* + * Release a lock on a file + */ +static int afs_release_lock(struct afs_vnode *vnode, struct key *key) +{ + struct afs_operation *op; + + _enter("%s{%llx:%llu.%u},%x", + vnode->volume->name, + vnode->fid.vid, + vnode->fid.vnode, + vnode->fid.unique, + key_serial(key)); + + op = afs_alloc_operation(key, vnode->volume); + if (IS_ERR(op)) + return PTR_ERR(op); + + afs_op_set_vnode(op, 0, vnode); + + op->flags |= AFS_OPERATION_UNINTR; + op->ops = &afs_release_lock_operation; + return afs_do_sync_operation(op); } /* @@ -107,126 +282,105 @@ void afs_lock_work(struct work_struct *work) { struct afs_vnode *vnode = container_of(work, struct afs_vnode, lock_work.work); - struct file_lock *fl; - afs_lock_type_t type; struct key *key; int ret; - _enter("{%x:%u}", vnode->fid.vid, vnode->fid.vnode); + _enter("{%llx:%llu}", vnode->fid.vid, vnode->fid.vnode); spin_lock(&vnode->lock); - if (test_bit(AFS_VNODE_UNLOCKING, &vnode->flags)) { - _debug("unlock"); +again: + _debug("wstate %u for %p", vnode->lock_state, vnode); + switch (vnode->lock_state) { + case AFS_VNODE_LOCK_NEED_UNLOCK: + afs_set_lock_state(vnode, AFS_VNODE_LOCK_UNLOCKING); + trace_afs_flock_ev(vnode, NULL, afs_flock_work_unlocking, 0); spin_unlock(&vnode->lock); /* attempt to release the server lock; if it fails, we just - * wait 5 minutes and it'll time out anyway */ - ret = afs_vnode_release_lock(vnode, vnode->unlock_key); - if (ret < 0) + * wait 5 minutes and it'll expire anyway */ + ret = afs_release_lock(vnode, vnode->lock_key); + if (ret < 0 && vnode->lock_state != AFS_VNODE_LOCK_DELETED) { + trace_afs_flock_ev(vnode, NULL, afs_flock_release_fail, + ret); printk(KERN_WARNING "AFS:" - " Failed to release lock on {%x:%x} error %d\n", + " Failed to release lock on {%llx:%llx} error %d\n", vnode->fid.vid, vnode->fid.vnode, ret); + } spin_lock(&vnode->lock); - key_put(vnode->unlock_key); - vnode->unlock_key = NULL; - clear_bit(AFS_VNODE_UNLOCKING, &vnode->flags); - } + if (ret == -ENOENT) + afs_kill_lockers_enoent(vnode); + else + afs_next_locker(vnode, 0); + spin_unlock(&vnode->lock); + return; - /* if we've got a lock, then it must be time to extend that lock as AFS - * locks time out after 5 minutes */ - if (!list_empty(&vnode->granted_locks)) { + /* If we've already got a lock, then it must be time to extend that + * lock as AFS locks time out after 5 minutes. + */ + case AFS_VNODE_LOCK_GRANTED: _debug("extend"); - if (test_and_set_bit(AFS_VNODE_LOCKING, &vnode->flags)) - BUG(); - fl = list_entry(vnode->granted_locks.next, - struct file_lock, fl_u.afs.link); - key = key_get(fl->fl_file->private_data); + ASSERT(!list_empty(&vnode->granted_locks)); + + key = key_get(vnode->lock_key); + afs_set_lock_state(vnode, AFS_VNODE_LOCK_EXTENDING); + trace_afs_flock_ev(vnode, NULL, afs_flock_work_extending, 0); spin_unlock(&vnode->lock); - ret = afs_vnode_extend_lock(vnode, key); - clear_bit(AFS_VNODE_LOCKING, &vnode->flags); + ret = afs_extend_lock(vnode, key); /* RPC */ key_put(key); - switch (ret) { - case 0: - afs_schedule_lock_extension(vnode); - break; - default: - /* ummm... we failed to extend the lock - retry - * extension shortly */ - printk(KERN_WARNING "AFS:" - " Failed to extend lock on {%x:%x} error %d\n", - vnode->fid.vid, vnode->fid.vnode, ret); + + if (ret < 0) { + trace_afs_flock_ev(vnode, NULL, afs_flock_extend_fail, + ret); + pr_warn("AFS: Failed to extend lock on {%llx:%llx} error %d\n", + vnode->fid.vid, vnode->fid.vnode, ret); + } + + spin_lock(&vnode->lock); + + if (ret == -ENOENT) { + afs_kill_lockers_enoent(vnode); + spin_unlock(&vnode->lock); + return; + } + + if (vnode->lock_state != AFS_VNODE_LOCK_EXTENDING) + goto again; + afs_set_lock_state(vnode, AFS_VNODE_LOCK_GRANTED); + + if (ret != 0) queue_delayed_work(afs_lock_manager, &vnode->lock_work, HZ * 10); - break; - } - _leave(" [extend]"); + spin_unlock(&vnode->lock); + _leave(" [ext]"); return; - } - /* if we don't have a granted lock, then we must've been called back by - * the server, and so if might be possible to get a lock we're - * currently waiting for */ - if (!list_empty(&vnode->pending_locks)) { - _debug("get"); - - if (test_and_set_bit(AFS_VNODE_LOCKING, &vnode->flags)) - BUG(); - fl = list_entry(vnode->pending_locks.next, - struct file_lock, fl_u.afs.link); - key = key_get(fl->fl_file->private_data); - type = (fl->fl_type == F_RDLCK) ? - AFS_LOCK_READ : AFS_LOCK_WRITE; + /* If we're waiting for a callback to indicate lock release, we can't + * actually rely on this, so need to recheck at regular intervals. The + * problem is that the server might not notify us if the lock just + * expires (say because a client died) rather than being explicitly + * released. + */ + case AFS_VNODE_LOCK_WAITING_FOR_CB: + _debug("retry"); + afs_next_locker(vnode, 0); spin_unlock(&vnode->lock); + return; - ret = afs_vnode_set_lock(vnode, key, type); - clear_bit(AFS_VNODE_LOCKING, &vnode->flags); - switch (ret) { - case -EWOULDBLOCK: - _debug("blocked"); - break; - case 0: - _debug("acquired"); - if (type == AFS_LOCK_READ) - set_bit(AFS_VNODE_READLOCKED, &vnode->flags); - else - set_bit(AFS_VNODE_WRITELOCKED, &vnode->flags); - ret = AFS_LOCK_GRANTED; - default: - spin_lock(&vnode->lock); - /* the pending lock may have been withdrawn due to a - * signal */ - if (list_entry(vnode->pending_locks.next, - struct file_lock, fl_u.afs.link) == fl) { - fl->fl_u.afs.state = ret; - if (ret == AFS_LOCK_GRANTED) - afs_grant_locks(vnode, fl); - else - list_del_init(&fl->fl_u.afs.link); - wake_up(&fl->fl_wait); - spin_unlock(&vnode->lock); - } else { - _debug("withdrawn"); - clear_bit(AFS_VNODE_READLOCKED, &vnode->flags); - clear_bit(AFS_VNODE_WRITELOCKED, &vnode->flags); - spin_unlock(&vnode->lock); - afs_vnode_release_lock(vnode, key); - if (!list_empty(&vnode->pending_locks)) - afs_lock_may_be_available(vnode); - } - break; - } - key_put(key); - _leave(" [pend]"); + case AFS_VNODE_LOCK_DELETED: + afs_kill_lockers_enoent(vnode); + spin_unlock(&vnode->lock); return; - } - /* looks like the lock request was withdrawn on a signal */ - spin_unlock(&vnode->lock); - _leave(" [no locks]"); + default: + /* Looks like a lock request was withdrawn. */ + spin_unlock(&vnode->lock); + _leave(" [no]"); + return; + } } /* @@ -235,16 +389,60 @@ void afs_lock_work(struct work_struct *work) * AF_RXRPC * - the caller must hold the vnode lock */ -static void afs_defer_unlock(struct afs_vnode *vnode, struct key *key) +static void afs_defer_unlock(struct afs_vnode *vnode) { - cancel_delayed_work(&vnode->lock_work); - if (!test_and_clear_bit(AFS_VNODE_READLOCKED, &vnode->flags) && - !test_and_clear_bit(AFS_VNODE_WRITELOCKED, &vnode->flags)) - BUG(); - if (test_and_set_bit(AFS_VNODE_UNLOCKING, &vnode->flags)) - BUG(); - vnode->unlock_key = key_get(key); - afs_lock_may_be_available(vnode); + _enter("%u", vnode->lock_state); + + if (list_empty(&vnode->granted_locks) && + (vnode->lock_state == AFS_VNODE_LOCK_GRANTED || + vnode->lock_state == AFS_VNODE_LOCK_EXTENDING)) { + cancel_delayed_work(&vnode->lock_work); + + afs_set_lock_state(vnode, AFS_VNODE_LOCK_NEED_UNLOCK); + trace_afs_flock_ev(vnode, NULL, afs_flock_defer_unlock, 0); + queue_delayed_work(afs_lock_manager, &vnode->lock_work, 0); + } +} + +/* + * Check that our view of the file metadata is up to date and check to see + * whether we think that we have a locking permit. + */ +static int afs_do_setlk_check(struct afs_vnode *vnode, struct key *key, + enum afs_flock_mode mode, afs_lock_type_t type) +{ + afs_access_t access; + int ret; + + /* Make sure we've got a callback on this file and that our view of the + * data version is up to date. + */ + ret = afs_validate(vnode, key); + if (ret < 0) + return ret; + + /* Check the permission set to see if we're actually going to be + * allowed to get a lock on this file. + */ + ret = afs_check_permit(vnode, key, &access); + if (ret < 0) + return ret; + + /* At a rough estimation, you need LOCK, WRITE or INSERT perm to + * read-lock a file and WRITE or INSERT perm to write-lock a file. + * + * We can't rely on the server to do this for us since if we want to + * share a read lock that we already have, we won't go the server. + */ + if (type == AFS_LOCK_READ) { + if (!(access & (AFS_ACE_INSERT | AFS_ACE_WRITE | AFS_ACE_LOCK))) + return -EACCES; + } else { + if (!(access & (AFS_ACE_INSERT | AFS_ACE_WRITE))) + return -EACCES; + } + + return 0; } /* @@ -254,185 +452,247 @@ static int afs_do_setlk(struct file *file, struct file_lock *fl) { struct inode *inode = file_inode(file); struct afs_vnode *vnode = AFS_FS_I(inode); + enum afs_flock_mode mode = AFS_FS_S(inode->i_sb)->flock_mode; afs_lock_type_t type; - struct key *key = file->private_data; + struct key *key = afs_file_key(file); + bool partial, no_server_lock = false; int ret; - _enter("{%x:%u},%u", vnode->fid.vid, vnode->fid.vnode, fl->fl_type); + if (mode == afs_flock_mode_unset) + mode = afs_flock_mode_openafs; - /* only whole-file locks are supported */ - if (fl->fl_start != 0 || fl->fl_end != OFFSET_MAX) - return -EINVAL; - - ret = afs_init_lock_manager(); - if (ret < 0) - return ret; + _enter("{%llx:%llu},%llu-%llu,%u,%u", + vnode->fid.vid, vnode->fid.vnode, + fl->fl_start, fl->fl_end, fl->c.flc_type, mode); fl->fl_ops = &afs_lock_ops; INIT_LIST_HEAD(&fl->fl_u.afs.link); fl->fl_u.afs.state = AFS_LOCK_PENDING; - type = (fl->fl_type == F_RDLCK) ? AFS_LOCK_READ : AFS_LOCK_WRITE; - - spin_lock(&inode->i_lock); + partial = (fl->fl_start != 0 || fl->fl_end != OFFSET_MAX); + type = lock_is_read(fl) ? AFS_LOCK_READ : AFS_LOCK_WRITE; + if (mode == afs_flock_mode_write && partial) + type = AFS_LOCK_WRITE; - /* make sure we've got a callback on this file and that our view of the - * data version is up to date */ - ret = afs_vnode_fetch_status(vnode, NULL, key); + ret = afs_do_setlk_check(vnode, key, mode, type); if (ret < 0) - goto error; + return ret; - if (vnode->status.lock_count != 0 && !(fl->fl_flags & FL_SLEEP)) { - ret = -EAGAIN; - goto error; + trace_afs_flock_op(vnode, fl, afs_flock_op_set_lock); + + /* AFS3 protocol only supports full-file locks and doesn't provide any + * method of upgrade/downgrade, so we need to emulate for partial-file + * locks. + * + * The OpenAFS client only gets a server lock for a full-file lock and + * keeps partial-file locks local. Allow this behaviour to be emulated + * (as the default). + */ + if (mode == afs_flock_mode_local || + (partial && mode == afs_flock_mode_openafs)) { + no_server_lock = true; + goto skip_server_lock; } spin_lock(&vnode->lock); + list_add_tail(&fl->fl_u.afs.link, &vnode->pending_locks); - /* if we've already got a readlock on the server then we can instantly - * grant another readlock, irrespective of whether there are any - * pending writelocks */ - if (type == AFS_LOCK_READ && - vnode->flags & (1 << AFS_VNODE_READLOCKED)) { - _debug("instant readlock"); - ASSERTCMP(vnode->flags & - ((1 << AFS_VNODE_LOCKING) | - (1 << AFS_VNODE_WRITELOCKED)), ==, 0); - ASSERT(!list_empty(&vnode->granted_locks)); - goto sharing_existing_lock; - } + ret = -ENOENT; + if (vnode->lock_state == AFS_VNODE_LOCK_DELETED) + goto error_unlock; - /* if there's no-one else with a lock on this vnode, then we need to - * ask the server for a lock */ - if (list_empty(&vnode->pending_locks) && - list_empty(&vnode->granted_locks)) { - _debug("not locked"); - ASSERTCMP(vnode->flags & - ((1 << AFS_VNODE_LOCKING) | - (1 << AFS_VNODE_READLOCKED) | - (1 << AFS_VNODE_WRITELOCKED)), ==, 0); - list_add_tail(&fl->fl_u.afs.link, &vnode->pending_locks); - set_bit(AFS_VNODE_LOCKING, &vnode->flags); - spin_unlock(&vnode->lock); + /* If we've already got a lock on the server then try to move to having + * the VFS grant the requested lock. Note that this means that other + * clients may get starved out. + */ + _debug("try %u", vnode->lock_state); + if (vnode->lock_state == AFS_VNODE_LOCK_GRANTED) { + if (type == AFS_LOCK_READ) { + _debug("instant readlock"); + list_move_tail(&fl->fl_u.afs.link, &vnode->granted_locks); + fl->fl_u.afs.state = AFS_LOCK_GRANTED; + goto vnode_is_locked_u; + } - ret = afs_vnode_set_lock(vnode, key, type); - clear_bit(AFS_VNODE_LOCKING, &vnode->flags); - switch (ret) { - case 0: - _debug("acquired"); - goto acquired_server_lock; - case -EWOULDBLOCK: - _debug("would block"); - spin_lock(&vnode->lock); - ASSERT(list_empty(&vnode->granted_locks)); - ASSERTCMP(vnode->pending_locks.next, ==, - &fl->fl_u.afs.link); - goto wait; - default: - spin_lock(&vnode->lock); - list_del_init(&fl->fl_u.afs.link); - spin_unlock(&vnode->lock); - goto error; + if (vnode->lock_type == AFS_LOCK_WRITE) { + _debug("instant writelock"); + list_move_tail(&fl->fl_u.afs.link, &vnode->granted_locks); + fl->fl_u.afs.state = AFS_LOCK_GRANTED; + goto vnode_is_locked_u; } } - /* otherwise, we need to wait for a local lock to become available */ - _debug("wait local"); - list_add_tail(&fl->fl_u.afs.link, &vnode->pending_locks); -wait: - if (!(fl->fl_flags & FL_SLEEP)) { - _debug("noblock"); + if (vnode->lock_state == AFS_VNODE_LOCK_NONE && + !(fl->c.flc_flags & FL_SLEEP)) { ret = -EAGAIN; - goto abort_attempt; + if (type == AFS_LOCK_READ) { + if (vnode->status.lock_count == -1) + goto lock_is_contended; /* Write locked */ + } else { + if (vnode->status.lock_count != 0) + goto lock_is_contended; /* Locked */ + } } - spin_unlock(&vnode->lock); - /* now we need to sleep and wait for the lock manager thread to get the - * lock from the server */ - _debug("sleep"); - ret = wait_event_interruptible(fl->fl_wait, - fl->fl_u.afs.state <= AFS_LOCK_GRANTED); - if (fl->fl_u.afs.state <= AFS_LOCK_GRANTED) { - ret = fl->fl_u.afs.state; - if (ret < 0) - goto error; - spin_lock(&vnode->lock); - goto given_lock; - } + if (vnode->lock_state != AFS_VNODE_LOCK_NONE) + goto need_to_wait; + +try_to_lock: + /* We don't have a lock on this vnode and we aren't currently waiting + * for one either, so ask the server for a lock. + * + * Note that we need to be careful if we get interrupted by a signal + * after dispatching the request as we may still get the lock, even + * though we don't wait for the reply (it's not too bad a problem - the + * lock will expire in 5 mins anyway). + */ + trace_afs_flock_ev(vnode, fl, afs_flock_try_to_lock, 0); + vnode->lock_key = key_get(key); + vnode->lock_type = type; + afs_set_lock_state(vnode, AFS_VNODE_LOCK_SETTING); + spin_unlock(&vnode->lock); - /* we were interrupted, but someone may still be in the throes of - * giving us the lock */ - _debug("intr"); - ASSERTCMP(ret, ==, -ERESTARTSYS); + ret = afs_set_lock(vnode, key, type); /* RPC */ spin_lock(&vnode->lock); - if (fl->fl_u.afs.state <= AFS_LOCK_GRANTED) { - ret = fl->fl_u.afs.state; - if (ret < 0) { - spin_unlock(&vnode->lock); - goto error; - } - goto given_lock; - } + switch (ret) { + case -EKEYREJECTED: + case -EKEYEXPIRED: + case -EKEYREVOKED: + case -EPERM: + case -EACCES: + fl->fl_u.afs.state = ret; + trace_afs_flock_ev(vnode, fl, afs_flock_fail_perm, ret); + list_del_init(&fl->fl_u.afs.link); + afs_next_locker(vnode, ret); + goto error_unlock; -abort_attempt: - /* we aren't going to get the lock, either because we're unwilling to - * wait, or because some signal happened */ - _debug("abort"); - if (list_empty(&vnode->granted_locks) && - vnode->pending_locks.next == &fl->fl_u.afs.link) { - if (vnode->pending_locks.prev != &fl->fl_u.afs.link) { - /* kick the next pending lock into having a go */ - list_del_init(&fl->fl_u.afs.link); - afs_lock_may_be_available(vnode); - } - } else { + case -ENOENT: + fl->fl_u.afs.state = ret; + trace_afs_flock_ev(vnode, fl, afs_flock_fail_other, ret); + list_del_init(&fl->fl_u.afs.link); + afs_kill_lockers_enoent(vnode); + goto error_unlock; + + default: + fl->fl_u.afs.state = ret; + trace_afs_flock_ev(vnode, fl, afs_flock_fail_other, ret); list_del_init(&fl->fl_u.afs.link); + afs_next_locker(vnode, 0); + goto error_unlock; + + case -EWOULDBLOCK: + /* The server doesn't have a lock-waiting queue, so the client + * will have to retry. The server will break the outstanding + * callbacks on a file when a lock is released. + */ + ASSERT(list_empty(&vnode->granted_locks)); + ASSERTCMP(vnode->pending_locks.next, ==, &fl->fl_u.afs.link); + goto lock_is_contended; + + case 0: + afs_set_lock_state(vnode, AFS_VNODE_LOCK_GRANTED); + trace_afs_flock_ev(vnode, fl, afs_flock_acquired, type); + afs_grant_locks(vnode); + goto vnode_is_locked_u; } - spin_unlock(&vnode->lock); - goto error; -acquired_server_lock: - /* we've acquired a server lock, but it needs to be renewed after 5 - * mins */ - spin_lock(&vnode->lock); - afs_schedule_lock_extension(vnode); - if (type == AFS_LOCK_READ) - set_bit(AFS_VNODE_READLOCKED, &vnode->flags); - else - set_bit(AFS_VNODE_WRITELOCKED, &vnode->flags); -sharing_existing_lock: - /* the lock has been granted as far as we're concerned... */ - fl->fl_u.afs.state = AFS_LOCK_GRANTED; - list_move_tail(&fl->fl_u.afs.link, &vnode->granted_locks); -given_lock: - /* ... but we do still need to get the VFS's blessing */ - ASSERT(!(vnode->flags & (1 << AFS_VNODE_LOCKING))); - ASSERT((vnode->flags & ((1 << AFS_VNODE_READLOCKED) | - (1 << AFS_VNODE_WRITELOCKED))) != 0); - ret = posix_lock_file(file, fl, NULL); +vnode_is_locked_u: + spin_unlock(&vnode->lock); +vnode_is_locked: + /* the lock has been granted by the server... */ + ASSERTCMP(fl->fl_u.afs.state, ==, AFS_LOCK_GRANTED); + +skip_server_lock: + /* ... but the VFS still needs to distribute access on this client. */ + trace_afs_flock_ev(vnode, fl, afs_flock_vfs_locking, 0); + ret = locks_lock_file_wait(file, fl); + trace_afs_flock_ev(vnode, fl, afs_flock_vfs_lock, ret); if (ret < 0) goto vfs_rejected_lock; - spin_unlock(&vnode->lock); - /* again, make sure we've got a callback on this file and, again, make + /* Again, make sure we've got a callback on this file and, again, make * sure that our view of the data version is up to date (we ignore - * errors incurred here and deal with the consequences elsewhere) */ - afs_vnode_fetch_status(vnode, NULL, key); + * errors incurred here and deal with the consequences elsewhere). + */ + afs_validate(vnode, key); + _leave(" = 0"); + return 0; -error: - spin_unlock(&inode->i_lock); - _leave(" = %d", ret); - return ret; +lock_is_contended: + if (!(fl->c.flc_flags & FL_SLEEP)) { + list_del_init(&fl->fl_u.afs.link); + afs_next_locker(vnode, 0); + ret = -EAGAIN; + goto error_unlock; + } + + afs_set_lock_state(vnode, AFS_VNODE_LOCK_WAITING_FOR_CB); + trace_afs_flock_ev(vnode, fl, afs_flock_would_block, ret); + queue_delayed_work(afs_lock_manager, &vnode->lock_work, HZ * 5); + +need_to_wait: + /* We're going to have to wait. Either this client doesn't have a lock + * on the server yet and we need to wait for a callback to occur, or + * the client does have a lock on the server, but it's shared and we + * need an exclusive lock. + */ + spin_unlock(&vnode->lock); + + trace_afs_flock_ev(vnode, fl, afs_flock_waiting, 0); + ret = wait_event_interruptible(fl->c.flc_wait, + fl->fl_u.afs.state != AFS_LOCK_PENDING); + trace_afs_flock_ev(vnode, fl, afs_flock_waited, ret); + + if (fl->fl_u.afs.state >= 0 && fl->fl_u.afs.state != AFS_LOCK_GRANTED) { + spin_lock(&vnode->lock); + + switch (fl->fl_u.afs.state) { + case AFS_LOCK_YOUR_TRY: + fl->fl_u.afs.state = AFS_LOCK_PENDING; + goto try_to_lock; + case AFS_LOCK_PENDING: + if (ret > 0) { + /* We need to retry the lock. We may not be + * notified by the server if it just expired + * rather than being released. + */ + ASSERTCMP(vnode->lock_state, ==, AFS_VNODE_LOCK_WAITING_FOR_CB); + afs_set_lock_state(vnode, AFS_VNODE_LOCK_SETTING); + fl->fl_u.afs.state = AFS_LOCK_PENDING; + goto try_to_lock; + } + goto error_unlock; + case AFS_LOCK_GRANTED: + default: + break; + } + + spin_unlock(&vnode->lock); + } + + if (fl->fl_u.afs.state == AFS_LOCK_GRANTED) + goto vnode_is_locked; + ret = fl->fl_u.afs.state; + goto error; vfs_rejected_lock: - /* the VFS rejected the lock we just obtained, so we have to discard - * what we just got */ + /* The VFS rejected the lock we just obtained, so we have to discard + * what we just got. We defer this to the lock manager work item to + * deal with. + */ _debug("vfs refused %d", ret); + if (no_server_lock) + goto error; + spin_lock(&vnode->lock); list_del_init(&fl->fl_u.afs.link); - if (list_empty(&vnode->granted_locks)) - afs_defer_unlock(vnode, key); - goto abort_attempt; + afs_defer_unlock(vnode); + +error_unlock: + spin_unlock(&vnode->lock); +error: + _leave(" = %d", ret); + return ret; } /* @@ -440,34 +700,20 @@ vfs_rejected_lock: */ static int afs_do_unlk(struct file *file, struct file_lock *fl) { - struct afs_vnode *vnode = AFS_FS_I(file->f_mapping->host); - struct key *key = file->private_data; + struct afs_vnode *vnode = AFS_FS_I(file_inode(file)); int ret; - _enter("{%x:%u},%u", vnode->fid.vid, vnode->fid.vnode, fl->fl_type); + _enter("{%llx:%llu},%u", vnode->fid.vid, vnode->fid.vnode, + fl->c.flc_type); - /* only whole-file unlocks are supported */ - if (fl->fl_start != 0 || fl->fl_end != OFFSET_MAX) - return -EINVAL; + trace_afs_flock_op(vnode, fl, afs_flock_op_unlock); - fl->fl_ops = &afs_lock_ops; - INIT_LIST_HEAD(&fl->fl_u.afs.link); - fl->fl_u.afs.state = AFS_LOCK_PENDING; + /* Flush all pending writes before doing anything with locks. */ + vfs_fsync(file, 0); - spin_lock(&vnode->lock); - ret = posix_lock_file(file, fl, NULL); - if (ret < 0) { - spin_unlock(&vnode->lock); - _leave(" = %d [vfs]", ret); - return ret; - } - - /* discard the server lock only if all granted locks are gone */ - if (list_empty(&vnode->granted_locks)) - afs_defer_unlock(vnode, key); - spin_unlock(&vnode->lock); - _leave(" = 0"); - return 0; + ret = locks_lock_file_wait(file, fl); + _leave(" = %d [%u]", ret, vnode->lock_state); + return ret; } /* @@ -475,38 +721,40 @@ static int afs_do_unlk(struct file *file, struct file_lock *fl) */ static int afs_do_getlk(struct file *file, struct file_lock *fl) { - struct afs_vnode *vnode = AFS_FS_I(file->f_mapping->host); - struct key *key = file->private_data; + struct afs_vnode *vnode = AFS_FS_I(file_inode(file)); + struct key *key = afs_file_key(file); int ret, lock_count; _enter(""); - fl->fl_type = F_UNLCK; + if (vnode->lock_state == AFS_VNODE_LOCK_DELETED) + return -ENOENT; - mutex_lock(&vnode->vfs_inode.i_mutex); + fl->c.flc_type = F_UNLCK; /* check local lock records first */ - ret = 0; posix_test_lock(file, fl); - if (fl->fl_type == F_UNLCK) { + if (lock_is_unlock(fl)) { /* no local locks; consult the server */ - ret = afs_vnode_fetch_status(vnode, NULL, key); + ret = afs_fetch_status(vnode, key, false, NULL); if (ret < 0) goto error; - lock_count = vnode->status.lock_count; - if (lock_count) { + + lock_count = READ_ONCE(vnode->status.lock_count); + if (lock_count != 0) { if (lock_count > 0) - fl->fl_type = F_RDLCK; + fl->c.flc_type = F_RDLCK; else - fl->fl_type = F_WRLCK; + fl->c.flc_type = F_WRLCK; fl->fl_start = 0; fl->fl_end = OFFSET_MAX; + fl->c.flc_pid = 0; } } + ret = 0; error: - mutex_unlock(&vnode->vfs_inode.i_mutex); - _leave(" = %d [%hd]", ret, fl->fl_type); + _leave(" = %d [%hd]", ret, fl->c.flc_type); return ret; } @@ -516,21 +764,33 @@ error: int afs_lock(struct file *file, int cmd, struct file_lock *fl) { struct afs_vnode *vnode = AFS_FS_I(file_inode(file)); + enum afs_flock_operation op; + int ret; - _enter("{%x:%u},%d,{t=%x,fl=%x,r=%Ld:%Ld}", + _enter("{%llx:%llu},%d,{t=%x,fl=%x,r=%Ld:%Ld}", vnode->fid.vid, vnode->fid.vnode, cmd, - fl->fl_type, fl->fl_flags, + fl->c.flc_type, fl->c.flc_flags, (long long) fl->fl_start, (long long) fl->fl_end); - /* AFS doesn't support mandatory locks */ - if (__mandatory_lock(&vnode->vfs_inode) && fl->fl_type != F_UNLCK) - return -ENOLCK; - if (IS_GETLK(cmd)) return afs_do_getlk(file, fl); - if (fl->fl_type == F_UNLCK) - return afs_do_unlk(file, fl); - return afs_do_setlk(file, fl); + + fl->fl_u.afs.debug_id = atomic_inc_return(&afs_file_lock_debug_id); + trace_afs_flock_op(vnode, fl, afs_flock_op_lock); + + if (lock_is_unlock(fl)) + ret = afs_do_unlk(file, fl); + else + ret = afs_do_setlk(file, fl); + + switch (ret) { + case 0: op = afs_flock_op_return_ok; break; + case -EAGAIN: op = afs_flock_op_return_eagain; break; + case -EDEADLK: op = afs_flock_op_return_edeadlk; break; + default: op = afs_flock_op_return_error; break; + } + trace_afs_flock_op(vnode, fl, op); + return ret; } /* @@ -539,10 +799,12 @@ int afs_lock(struct file *file, int cmd, struct file_lock *fl) int afs_flock(struct file *file, int cmd, struct file_lock *fl) { struct afs_vnode *vnode = AFS_FS_I(file_inode(file)); + enum afs_flock_operation op; + int ret; - _enter("{%x:%u},%d,{t=%x,fl=%x}", + _enter("{%llx:%llu},%d,{t=%x,fl=%x}", vnode->fid.vid, vnode->fid.vnode, cmd, - fl->fl_type, fl->fl_flags); + fl->c.flc_type, fl->c.flc_flags); /* * No BSD flocks over NFS allowed. @@ -551,17 +813,26 @@ int afs_flock(struct file *file, int cmd, struct file_lock *fl) * Not sure whether that would be unique, though, or whether * that would break in other places. */ - if (!(fl->fl_flags & FL_FLOCK)) + if (!(fl->c.flc_flags & FL_FLOCK)) return -ENOLCK; + fl->fl_u.afs.debug_id = atomic_inc_return(&afs_file_lock_debug_id); + trace_afs_flock_op(vnode, fl, afs_flock_op_flock); + /* we're simulating flock() locks using posix locks on the server */ - fl->fl_owner = (fl_owner_t) file; - fl->fl_start = 0; - fl->fl_end = OFFSET_MAX; + if (lock_is_unlock(fl)) + ret = afs_do_unlk(file, fl); + else + ret = afs_do_setlk(file, fl); - if (fl->fl_type == F_UNLCK) - return afs_do_unlk(file, fl); - return afs_do_setlk(file, fl); + switch (ret) { + case 0: op = afs_flock_op_return_ok; break; + case -EAGAIN: op = afs_flock_op_return_eagain; break; + case -EDEADLK: op = afs_flock_op_return_edeadlk; break; + default: op = afs_flock_op_return_error; break; + } + trace_afs_flock_op(vnode, fl, op); + return ret; } /* @@ -572,9 +843,16 @@ int afs_flock(struct file *file, int cmd, struct file_lock *fl) */ static void afs_fl_copy_lock(struct file_lock *new, struct file_lock *fl) { + struct afs_vnode *vnode = AFS_FS_I(file_inode(fl->c.flc_file)); + _enter(""); + new->fl_u.afs.debug_id = atomic_inc_return(&afs_file_lock_debug_id); + + spin_lock(&vnode->lock); + trace_afs_flock_op(vnode, new, afs_flock_op_copy_lock); list_add(&new->fl_u.afs.link, &fl->fl_u.afs.link); + spin_unlock(&vnode->lock); } /* @@ -583,7 +861,17 @@ static void afs_fl_copy_lock(struct file_lock *new, struct file_lock *fl) */ static void afs_fl_release_private(struct file_lock *fl) { + struct afs_vnode *vnode = AFS_FS_I(file_inode(fl->c.flc_file)); + _enter(""); + spin_lock(&vnode->lock); + + trace_afs_flock_op(vnode, fl, afs_flock_op_release_lock); list_del_init(&fl->fl_u.afs.link); + if (list_empty(&vnode->granted_locks)) + afs_defer_unlock(vnode); + + _debug("state %u for %p", vnode->lock_state, vnode); + spin_unlock(&vnode->lock); } diff --git a/fs/afs/fs_operation.c b/fs/afs/fs_operation.c new file mode 100644 index 000000000000..8418813ee043 --- /dev/null +++ b/fs/afs/fs_operation.c @@ -0,0 +1,377 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* Fileserver-directed operation handling. + * + * Copyright (C) 2020 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#include <linux/kernel.h> +#include <linux/slab.h> +#include <linux/fs.h> +#include "internal.h" + +static atomic_t afs_operation_debug_counter; + +/* + * Create an operation against a volume. + */ +struct afs_operation *afs_alloc_operation(struct key *key, struct afs_volume *volume) +{ + struct afs_operation *op; + + _enter(""); + + op = kzalloc(sizeof(*op), GFP_KERNEL); + if (!op) + return ERR_PTR(-ENOMEM); + + if (!key) { + key = afs_request_key(volume->cell); + if (IS_ERR(key)) { + kfree(op); + return ERR_CAST(key); + } + } else { + key_get(key); + } + + op->key = key; + op->volume = afs_get_volume(volume, afs_volume_trace_get_new_op); + op->net = volume->cell->net; + op->cb_v_break = atomic_read(&volume->cb_v_break); + op->pre_volsync.creation = volume->creation_time; + op->pre_volsync.update = volume->update_time; + op->debug_id = atomic_inc_return(&afs_operation_debug_counter); + op->nr_iterations = -1; + afs_op_set_error(op, -EDESTADDRREQ); + + _leave(" = [op=%08x]", op->debug_id); + return op; +} + +struct afs_io_locker { + struct list_head link; + struct task_struct *task; + unsigned long have_lock; +}; + +/* + * Unlock the I/O lock on a vnode. + */ +static void afs_unlock_for_io(struct afs_vnode *vnode) +{ + struct afs_io_locker *locker; + + spin_lock(&vnode->lock); + locker = list_first_entry_or_null(&vnode->io_lock_waiters, + struct afs_io_locker, link); + if (locker) { + list_del(&locker->link); + smp_store_release(&locker->have_lock, 1); /* The unlock barrier. */ + smp_mb__after_atomic(); /* Store have_lock before task state */ + wake_up_process(locker->task); + } else { + clear_bit(AFS_VNODE_IO_LOCK, &vnode->flags); + } + spin_unlock(&vnode->lock); +} + +/* + * Lock the I/O lock on a vnode uninterruptibly. We can't use an ordinary + * mutex as lockdep will complain if we unlock it in the wrong thread. + */ +static void afs_lock_for_io(struct afs_vnode *vnode) +{ + struct afs_io_locker myself = { .task = current, }; + + spin_lock(&vnode->lock); + + if (!test_and_set_bit(AFS_VNODE_IO_LOCK, &vnode->flags)) { + spin_unlock(&vnode->lock); + return; + } + + list_add_tail(&myself.link, &vnode->io_lock_waiters); + spin_unlock(&vnode->lock); + + for (;;) { + set_current_state(TASK_UNINTERRUPTIBLE); + if (smp_load_acquire(&myself.have_lock)) /* The lock barrier */ + break; + schedule(); + } + __set_current_state(TASK_RUNNING); +} + +/* + * Lock the I/O lock on a vnode interruptibly. We can't use an ordinary mutex + * as lockdep will complain if we unlock it in the wrong thread. + */ +static int afs_lock_for_io_interruptible(struct afs_vnode *vnode) +{ + struct afs_io_locker myself = { .task = current, }; + int ret = 0; + + spin_lock(&vnode->lock); + + if (!test_and_set_bit(AFS_VNODE_IO_LOCK, &vnode->flags)) { + spin_unlock(&vnode->lock); + return 0; + } + + list_add_tail(&myself.link, &vnode->io_lock_waiters); + spin_unlock(&vnode->lock); + + for (;;) { + set_current_state(TASK_INTERRUPTIBLE); + if (smp_load_acquire(&myself.have_lock) || /* The lock barrier */ + signal_pending(current)) + break; + schedule(); + } + __set_current_state(TASK_RUNNING); + + /* If we got a signal, try to transfer the lock onto the next + * waiter. + */ + if (unlikely(signal_pending(current))) { + spin_lock(&vnode->lock); + if (myself.have_lock) { + spin_unlock(&vnode->lock); + afs_unlock_for_io(vnode); + } else { + list_del(&myself.link); + spin_unlock(&vnode->lock); + } + ret = -ERESTARTSYS; + } + return ret; +} + +/* + * Lock the vnode(s) being operated upon. + */ +static bool afs_get_io_locks(struct afs_operation *op) +{ + struct afs_vnode *vnode = op->file[0].vnode; + struct afs_vnode *vnode2 = op->file[1].vnode; + + _enter(""); + + if (op->flags & AFS_OPERATION_UNINTR) { + afs_lock_for_io(vnode); + op->flags |= AFS_OPERATION_LOCK_0; + _leave(" = t [1]"); + return true; + } + + if (!vnode2 || !op->file[1].need_io_lock || vnode == vnode2) + vnode2 = NULL; + + if (vnode2 > vnode) + swap(vnode, vnode2); + + if (afs_lock_for_io_interruptible(vnode) < 0) { + afs_op_set_error(op, -ERESTARTSYS); + op->flags |= AFS_OPERATION_STOP; + _leave(" = f [I 0]"); + return false; + } + op->flags |= AFS_OPERATION_LOCK_0; + + if (vnode2) { + if (afs_lock_for_io_interruptible(vnode2) < 0) { + afs_op_set_error(op, -ERESTARTSYS); + op->flags |= AFS_OPERATION_STOP; + afs_unlock_for_io(vnode); + op->flags &= ~AFS_OPERATION_LOCK_0; + _leave(" = f [I 1]"); + return false; + } + op->flags |= AFS_OPERATION_LOCK_1; + } + + _leave(" = t [2]"); + return true; +} + +static void afs_drop_io_locks(struct afs_operation *op) +{ + struct afs_vnode *vnode = op->file[0].vnode; + struct afs_vnode *vnode2 = op->file[1].vnode; + + _enter(""); + + if (op->flags & AFS_OPERATION_LOCK_1) + afs_unlock_for_io(vnode2); + if (op->flags & AFS_OPERATION_LOCK_0) + afs_unlock_for_io(vnode); +} + +static void afs_prepare_vnode(struct afs_operation *op, struct afs_vnode_param *vp, + unsigned int index) +{ + struct afs_vnode *vnode = vp->vnode; + + if (vnode) { + vp->fid = vnode->fid; + vp->dv_before = vnode->status.data_version; + vp->cb_break_before = afs_calc_vnode_cb_break(vnode); + if (vnode->lock_state != AFS_VNODE_LOCK_NONE) + op->flags |= AFS_OPERATION_CUR_ONLY; + if (vp->modification) + set_bit(AFS_VNODE_MODIFYING, &vnode->flags); + } + + if (vp->fid.vnode) + _debug("PREP[%u] {%llx:%llu.%u}", + index, vp->fid.vid, vp->fid.vnode, vp->fid.unique); +} + +/* + * Begin an operation on the fileserver. + * + * Fileserver operations are serialised on the server by vnode, so we serialise + * them here also using the io_lock. + */ +bool afs_begin_vnode_operation(struct afs_operation *op) +{ + struct afs_vnode *vnode = op->file[0].vnode; + + ASSERT(vnode); + + _enter(""); + + if (op->file[0].need_io_lock) + if (!afs_get_io_locks(op)) + return false; + + afs_prepare_vnode(op, &op->file[0], 0); + afs_prepare_vnode(op, &op->file[1], 1); + op->cb_v_break = atomic_read(&op->volume->cb_v_break); + _leave(" = true"); + return true; +} + +/* + * Tidy up a filesystem cursor and unlock the vnode. + */ +void afs_end_vnode_operation(struct afs_operation *op) +{ + _enter(""); + + switch (afs_op_error(op)) { + case -EDESTADDRREQ: + case -EADDRNOTAVAIL: + case -ENETUNREACH: + case -EHOSTUNREACH: + afs_dump_edestaddrreq(op); + break; + } + + afs_drop_io_locks(op); +} + +/* + * Wait for an in-progress operation to complete. + */ +void afs_wait_for_operation(struct afs_operation *op) +{ + _enter(""); + + while (afs_select_fileserver(op)) { + op->call_responded = false; + op->call_error = 0; + op->call_abort_code = 0; + if (test_bit(AFS_SERVER_FL_IS_YFS, &op->server->flags) && + op->ops->issue_yfs_rpc) + op->ops->issue_yfs_rpc(op); + else if (op->ops->issue_afs_rpc) + op->ops->issue_afs_rpc(op); + else + op->call_error = -ENOTSUPP; + + if (op->call) { + afs_wait_for_call_to_complete(op->call); + op->call_abort_code = op->call->abort_code; + op->call_error = op->call->error; + op->call_responded = op->call->responded; + afs_put_call(op->call); + } + } + + if (op->call_responded && op->server) + set_bit(AFS_SERVER_FL_RESPONDING, &op->server->flags); + + if (!afs_op_error(op)) { + _debug("success"); + op->ops->success(op); + } else if (op->cumul_error.aborted) { + if (op->ops->aborted) + op->ops->aborted(op); + } else { + if (op->ops->failed) + op->ops->failed(op); + } + + afs_end_vnode_operation(op); + + if (!afs_op_error(op) && op->ops->edit_dir) { + _debug("edit_dir"); + op->ops->edit_dir(op); + } + _leave(""); +} + +/* + * Dispose of an operation. + */ +int afs_put_operation(struct afs_operation *op) +{ + struct afs_addr_list *alist; + int i, ret = afs_op_error(op); + + _enter("op=%08x,%d", op->debug_id, ret); + + if (op->ops && op->ops->put) + op->ops->put(op); + if (op->file[0].modification) + clear_bit(AFS_VNODE_MODIFYING, &op->file[0].vnode->flags); + if (op->file[1].modification && op->file[1].vnode != op->file[0].vnode) + clear_bit(AFS_VNODE_MODIFYING, &op->file[1].vnode->flags); + if (op->file[0].put_vnode) + iput(&op->file[0].vnode->netfs.inode); + if (op->file[1].put_vnode) + iput(&op->file[1].vnode->netfs.inode); + + if (op->more_files) { + for (i = 0; i < op->nr_files - 2; i++) + if (op->more_files[i].put_vnode) + iput(&op->more_files[i].vnode->netfs.inode); + kfree(op->more_files); + } + + if (op->estate) { + alist = op->estate->addresses; + if (alist) { + if (op->call_responded && + op->addr_index != alist->preferred && + test_bit(alist->preferred, &op->addr_tried)) + WRITE_ONCE(alist->preferred, op->addr_index); + } + } + + afs_clear_server_states(op); + afs_put_serverlist(op->net, op->server_list); + afs_put_volume(op->volume, afs_volume_trace_put_put_op); + key_put(op->key); + kfree(op); + return ret; +} + +int afs_do_sync_operation(struct afs_operation *op) +{ + afs_begin_vnode_operation(op); + afs_wait_for_operation(op); + return afs_put_operation(op); +} diff --git a/fs/afs/fs_probe.c b/fs/afs/fs_probe.c new file mode 100644 index 000000000000..e0030ac74ea0 --- /dev/null +++ b/fs/afs/fs_probe.c @@ -0,0 +1,539 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* AFS fileserver probing + * + * Copyright (C) 2018, 2020 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#include <linux/sched.h> +#include <linux/slab.h> +#include "afs_fs.h" +#include "internal.h" +#include "protocol_afs.h" +#include "protocol_yfs.h" + +static unsigned int afs_fs_probe_fast_poll_interval = 30 * HZ; +static unsigned int afs_fs_probe_slow_poll_interval = 5 * 60 * HZ; + +struct afs_endpoint_state *afs_get_endpoint_state(struct afs_endpoint_state *estate, + enum afs_estate_trace where) +{ + if (estate) { + int r; + + __refcount_inc(&estate->ref, &r); + trace_afs_estate(estate->server_id, estate->probe_seq, r, where); + } + return estate; +} + +static void afs_endpoint_state_rcu(struct rcu_head *rcu) +{ + struct afs_endpoint_state *estate = container_of(rcu, struct afs_endpoint_state, rcu); + + trace_afs_estate(estate->server_id, estate->probe_seq, refcount_read(&estate->ref), + afs_estate_trace_free); + afs_put_addrlist(estate->addresses, afs_alist_trace_put_estate); + kfree(estate); +} + +void afs_put_endpoint_state(struct afs_endpoint_state *estate, enum afs_estate_trace where) +{ + if (estate) { + unsigned int server_id = estate->server_id, probe_seq = estate->probe_seq; + bool dead; + int r; + + dead = __refcount_dec_and_test(&estate->ref, &r); + trace_afs_estate(server_id, probe_seq, r, where); + if (dead) + call_rcu(&estate->rcu, afs_endpoint_state_rcu); + } +} + +/* + * Start the probe polling timer. We have to supply it with an inc on the + * outstanding server count. + */ +static void afs_schedule_fs_probe(struct afs_net *net, + struct afs_server *server, bool fast) +{ + unsigned long atj; + + if (!net->live) + return; + + atj = server->probed_at; + atj += fast ? afs_fs_probe_fast_poll_interval : afs_fs_probe_slow_poll_interval; + + afs_inc_servers_outstanding(net); + if (timer_reduce(&net->fs_probe_timer, atj)) + afs_dec_servers_outstanding(net); +} + +/* + * Handle the completion of a set of probes. + */ +static void afs_finished_fs_probe(struct afs_net *net, struct afs_server *server, + struct afs_endpoint_state *estate) +{ + bool responded = test_bit(AFS_ESTATE_RESPONDED, &estate->flags); + + write_seqlock(&net->fs_lock); + if (responded) { + list_add_tail(&server->probe_link, &net->fs_probe_slow); + } else { + server->rtt = UINT_MAX; + clear_bit(AFS_SERVER_FL_RESPONDING, &server->flags); + list_add_tail(&server->probe_link, &net->fs_probe_fast); + } + + write_sequnlock(&net->fs_lock); + + afs_schedule_fs_probe(net, server, !responded); +} + +/* + * Handle the completion of a probe. + */ +static void afs_done_one_fs_probe(struct afs_net *net, struct afs_server *server, + struct afs_endpoint_state *estate) +{ + _enter(""); + + if (atomic_dec_and_test(&estate->nr_probing)) + afs_finished_fs_probe(net, server, estate); + + wake_up_all(&server->probe_wq); +} + +/* + * Handle inability to send a probe due to ENOMEM when trying to allocate a + * call struct. + */ +static void afs_fs_probe_not_done(struct afs_net *net, + struct afs_server *server, + struct afs_endpoint_state *estate, + int index) +{ + _enter(""); + + trace_afs_io_error(0, -ENOMEM, afs_io_error_fs_probe_fail); + spin_lock(&server->probe_lock); + + set_bit(AFS_ESTATE_LOCAL_FAILURE, &estate->flags); + if (estate->error == 0) + estate->error = -ENOMEM; + + set_bit(index, &estate->failed_set); + + spin_unlock(&server->probe_lock); + return afs_done_one_fs_probe(net, server, estate); +} + +/* + * Process the result of probing a fileserver. This is called after successful + * or failed delivery of an FS.GetCapabilities operation. + */ +void afs_fileserver_probe_result(struct afs_call *call) +{ + struct afs_endpoint_state *estate = call->probe; + struct afs_addr_list *alist = estate->addresses; + struct afs_address *addr = &alist->addrs[call->probe_index]; + struct afs_server *server = call->server; + unsigned int index = call->probe_index; + unsigned int rtt_us = -1, cap0; + int ret = call->error; + + _enter("%pU,%u", &server->uuid, index); + + WRITE_ONCE(addr->last_error, ret); + + spin_lock(&server->probe_lock); + + switch (ret) { + case 0: + estate->error = 0; + goto responded; + case -ECONNABORTED: + if (!test_bit(AFS_ESTATE_RESPONDED, &estate->flags)) { + estate->abort_code = call->abort_code; + estate->error = ret; + } + goto responded; + case -ENOMEM: + case -ENONET: + clear_bit(index, &estate->responsive_set); + set_bit(AFS_ESTATE_LOCAL_FAILURE, &estate->flags); + trace_afs_io_error(call->debug_id, ret, afs_io_error_fs_probe_fail); + goto out; + case -ECONNRESET: /* Responded, but call expired. */ + case -ERFKILL: + case -EADDRNOTAVAIL: + case -ENETUNREACH: + case -EHOSTUNREACH: + case -EHOSTDOWN: + case -ECONNREFUSED: + case -ETIMEDOUT: + case -ETIME: + default: + clear_bit(index, &estate->responsive_set); + set_bit(index, &estate->failed_set); + if (!test_bit(AFS_ESTATE_RESPONDED, &estate->flags) && + (estate->error == 0 || + estate->error == -ETIMEDOUT || + estate->error == -ETIME)) + estate->error = ret; + trace_afs_io_error(call->debug_id, ret, afs_io_error_fs_probe_fail); + goto out; + } + +responded: + clear_bit(index, &estate->failed_set); + + if (call->service_id == YFS_FS_SERVICE) { + set_bit(AFS_ESTATE_IS_YFS, &estate->flags); + set_bit(AFS_SERVER_FL_IS_YFS, &server->flags); + server->service_id = call->service_id; + } else { + set_bit(AFS_ESTATE_NOT_YFS, &estate->flags); + if (!test_bit(AFS_ESTATE_IS_YFS, &estate->flags)) { + clear_bit(AFS_SERVER_FL_IS_YFS, &server->flags); + server->service_id = call->service_id; + } + cap0 = ntohl(call->tmp); + if (cap0 & AFS3_VICED_CAPABILITY_64BITFILES) + set_bit(AFS_SERVER_FL_HAS_FS64, &server->flags); + else + clear_bit(AFS_SERVER_FL_HAS_FS64, &server->flags); + } + + rtt_us = rxrpc_kernel_get_srtt(addr->peer); + if (rtt_us < estate->rtt) { + estate->rtt = rtt_us; + server->rtt = rtt_us; + alist->preferred = index; + } + + smp_wmb(); /* Set rtt before responded. */ + set_bit(AFS_ESTATE_RESPONDED, &estate->flags); + set_bit(index, &estate->responsive_set); + set_bit(AFS_SERVER_FL_RESPONDING, &server->flags); +out: + spin_unlock(&server->probe_lock); + + trace_afs_fs_probe(server, false, estate, index, call->error, call->abort_code, rtt_us); + _debug("probe[%x] %pU [%u] %pISpc rtt=%d ret=%d", + estate->probe_seq, &server->uuid, index, + rxrpc_kernel_remote_addr(alist->addrs[index].peer), + rtt_us, ret); + + return afs_done_one_fs_probe(call->net, server, estate); +} + +/* + * Probe all of a fileserver's addresses to find out the best route and to + * query its capabilities. + */ +int afs_fs_probe_fileserver(struct afs_net *net, struct afs_server *server, + struct afs_addr_list *new_alist, struct key *key) +{ + struct afs_endpoint_state *estate, *old; + struct afs_addr_list *old_alist = NULL, *alist; + unsigned long unprobed; + + _enter("%pU", &server->uuid); + + estate = kzalloc(sizeof(*estate), GFP_KERNEL); + if (!estate) + return -ENOMEM; + + refcount_set(&estate->ref, 2); + estate->server_id = server->debug_id; + estate->rtt = UINT_MAX; + + write_lock(&server->fs_lock); + + old = rcu_dereference_protected(server->endpoint_state, + lockdep_is_held(&server->fs_lock)); + if (old) { + estate->responsive_set = old->responsive_set; + if (!new_alist) + new_alist = old->addresses; + } + + if (old_alist != new_alist) + afs_set_peer_appdata(server, old_alist, new_alist); + + estate->addresses = afs_get_addrlist(new_alist, afs_alist_trace_get_estate); + alist = estate->addresses; + estate->probe_seq = ++server->probe_counter; + atomic_set(&estate->nr_probing, alist->nr_addrs); + + if (new_alist) + server->addr_version = new_alist->version; + rcu_assign_pointer(server->endpoint_state, estate); + write_unlock(&server->fs_lock); + if (old) + set_bit(AFS_ESTATE_SUPERSEDED, &old->flags); + + trace_afs_estate(estate->server_id, estate->probe_seq, refcount_read(&estate->ref), + afs_estate_trace_alloc_probe); + + afs_get_address_preferences(net, new_alist); + + server->probed_at = jiffies; + unprobed = (1UL << alist->nr_addrs) - 1; + while (unprobed) { + unsigned int index = 0, i; + int best_prio = -1; + + for (i = 0; i < alist->nr_addrs; i++) { + if (test_bit(i, &unprobed) && + alist->addrs[i].prio > best_prio) { + index = i; + best_prio = alist->addrs[i].prio; + } + } + __clear_bit(index, &unprobed); + + trace_afs_fs_probe(server, true, estate, index, 0, 0, 0); + if (!afs_fs_get_capabilities(net, server, estate, index, key)) + afs_fs_probe_not_done(net, server, estate, index); + } + + afs_put_endpoint_state(old, afs_estate_trace_put_probe); + afs_put_endpoint_state(estate, afs_estate_trace_put_probe); + return 0; +} + +/* + * Wait for the first as-yet untried fileserver to respond, for the probe state + * to be superseded or for all probes to finish. + */ +int afs_wait_for_fs_probes(struct afs_operation *op, struct afs_server_state *states, bool intr) +{ + struct afs_endpoint_state *estate; + struct afs_server_list *slist = op->server_list; + bool still_probing = true; + int ret = 0, i; + + _enter("%u", slist->nr_servers); + + for (i = 0; i < slist->nr_servers; i++) { + estate = states[i].endpoint_state; + if (test_bit(AFS_ESTATE_SUPERSEDED, &estate->flags)) + return 2; + if (atomic_read(&estate->nr_probing)) + still_probing = true; + if (estate->responsive_set & states[i].untried_addrs) + return 1; + } + if (!still_probing) + return 0; + + for (i = 0; i < slist->nr_servers; i++) + add_wait_queue(&slist->servers[i].server->probe_wq, &states[i].probe_waiter); + + for (;;) { + still_probing = false; + + set_current_state(intr ? TASK_INTERRUPTIBLE : TASK_UNINTERRUPTIBLE); + for (i = 0; i < slist->nr_servers; i++) { + estate = states[i].endpoint_state; + if (test_bit(AFS_ESTATE_SUPERSEDED, &estate->flags)) { + ret = 2; + goto stop; + } + if (atomic_read(&estate->nr_probing)) + still_probing = true; + if (estate->responsive_set & states[i].untried_addrs) { + ret = 1; + goto stop; + } + } + + if (!still_probing || signal_pending(current)) + goto stop; + schedule(); + } + +stop: + set_current_state(TASK_RUNNING); + + for (i = 0; i < slist->nr_servers; i++) + remove_wait_queue(&slist->servers[i].server->probe_wq, &states[i].probe_waiter); + + if (!ret && signal_pending(current)) + ret = -ERESTARTSYS; + return ret; +} + +/* + * Probe timer. We have an increment on fs_outstanding that we need to pass + * along to the work item. + */ +void afs_fs_probe_timer(struct timer_list *timer) +{ + struct afs_net *net = container_of(timer, struct afs_net, fs_probe_timer); + + if (!net->live || !queue_work(afs_wq, &net->fs_prober)) + afs_dec_servers_outstanding(net); +} + +/* + * Dispatch a probe to a server. + */ +static void afs_dispatch_fs_probe(struct afs_net *net, struct afs_server *server) + __releases(&net->fs_lock) +{ + struct key *key = NULL; + + /* We remove it from the queues here - it will be added back to + * one of the queues on the completion of the probe. + */ + list_del_init(&server->probe_link); + + afs_get_server(server, afs_server_trace_get_probe); + write_sequnlock(&net->fs_lock); + + afs_fs_probe_fileserver(net, server, NULL, key); + afs_put_server(net, server, afs_server_trace_put_probe); +} + +/* + * Probe a server immediately without waiting for its due time to come + * round. This is used when all of the addresses have been tried. + */ +void afs_probe_fileserver(struct afs_net *net, struct afs_server *server) +{ + write_seqlock(&net->fs_lock); + if (!list_empty(&server->probe_link)) + return afs_dispatch_fs_probe(net, server); + write_sequnlock(&net->fs_lock); +} + +/* + * Probe dispatcher to regularly dispatch probes to keep NAT alive. + */ +void afs_fs_probe_dispatcher(struct work_struct *work) +{ + struct afs_net *net = container_of(work, struct afs_net, fs_prober); + struct afs_server *fast, *slow, *server; + unsigned long nowj, timer_at, poll_at; + bool first_pass = true, set_timer = false; + + if (!net->live) { + afs_dec_servers_outstanding(net); + return; + } + + _enter(""); + + if (list_empty(&net->fs_probe_fast) && list_empty(&net->fs_probe_slow)) { + afs_dec_servers_outstanding(net); + _leave(" [none]"); + return; + } + +again: + write_seqlock(&net->fs_lock); + + fast = slow = server = NULL; + nowj = jiffies; + timer_at = nowj + MAX_JIFFY_OFFSET; + + if (!list_empty(&net->fs_probe_fast)) { + fast = list_first_entry(&net->fs_probe_fast, struct afs_server, probe_link); + poll_at = fast->probed_at + afs_fs_probe_fast_poll_interval; + if (time_before(nowj, poll_at)) { + timer_at = poll_at; + set_timer = true; + fast = NULL; + } + } + + if (!list_empty(&net->fs_probe_slow)) { + slow = list_first_entry(&net->fs_probe_slow, struct afs_server, probe_link); + poll_at = slow->probed_at + afs_fs_probe_slow_poll_interval; + if (time_before(nowj, poll_at)) { + if (time_before(poll_at, timer_at)) + timer_at = poll_at; + set_timer = true; + slow = NULL; + } + } + + server = fast ?: slow; + if (server) + _debug("probe %pU", &server->uuid); + + if (server && (first_pass || !need_resched())) { + afs_dispatch_fs_probe(net, server); + first_pass = false; + goto again; + } + + write_sequnlock(&net->fs_lock); + + if (server) { + if (!queue_work(afs_wq, &net->fs_prober)) + afs_dec_servers_outstanding(net); + _leave(" [requeue]"); + } else if (set_timer) { + if (timer_reduce(&net->fs_probe_timer, timer_at)) + afs_dec_servers_outstanding(net); + _leave(" [timer]"); + } else { + afs_dec_servers_outstanding(net); + _leave(" [quiesce]"); + } +} + +/* + * Wait for a probe on a particular fileserver to complete for 2s. + */ +int afs_wait_for_one_fs_probe(struct afs_server *server, struct afs_endpoint_state *estate, + unsigned long exclude, bool is_intr) +{ + struct wait_queue_entry wait; + unsigned long timo = 2 * HZ; + + if (atomic_read(&estate->nr_probing) == 0) + goto dont_wait; + + init_wait_entry(&wait, 0); + for (;;) { + prepare_to_wait_event(&server->probe_wq, &wait, + is_intr ? TASK_INTERRUPTIBLE : TASK_UNINTERRUPTIBLE); + if (timo == 0 || + test_bit(AFS_ESTATE_SUPERSEDED, &estate->flags) || + (estate->responsive_set & ~exclude) || + atomic_read(&estate->nr_probing) == 0 || + (is_intr && signal_pending(current))) + break; + timo = schedule_timeout(timo); + } + + finish_wait(&server->probe_wq, &wait); + +dont_wait: + if (test_bit(AFS_ESTATE_SUPERSEDED, &estate->flags)) + return 0; + if (estate->responsive_set & ~exclude) + return 1; + if (is_intr && signal_pending(current)) + return -ERESTARTSYS; + if (timo == 0) + return -ETIME; + return -EDESTADDRREQ; +} + +/* + * Clean up the probing when the namespace is killed off. + */ +void afs_fs_probe_cleanup(struct afs_net *net) +{ + if (timer_delete_sync(&net->fs_probe_timer)) + afs_dec_servers_outstanding(net); +} diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c index c2e930ec2888..bc9556991d7c 100644 --- a/fs/afs/fsclient.c +++ b/fs/afs/fsclient.c @@ -1,20 +1,19 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* AFS File Server client stubs * * Copyright (C) 2002, 2007 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. */ #include <linux/init.h> #include <linux/slab.h> #include <linux/sched.h> #include <linux/circ_buf.h> +#include <linux/iversion.h> +#include <linux/netfs.h> #include "internal.h" #include "afs_fs.h" +#include "xdr_fs.h" /* * decode an AFSFid block @@ -30,127 +29,122 @@ static void xdr_decode_AFSFid(const __be32 **_bp, struct afs_fid *fid) } /* + * Dump a bad file status record. + */ +static void xdr_dump_bad(const __be32 *bp) +{ + __be32 x[4]; + int i; + + pr_notice("AFS XDR: Bad status record\n"); + for (i = 0; i < 5 * 4 * 4; i += 16) { + memcpy(x, bp, 16); + bp += 4; + pr_notice("%03x: %08x %08x %08x %08x\n", + i, ntohl(x[0]), ntohl(x[1]), ntohl(x[2]), ntohl(x[3])); + } + + memcpy(x, bp, 4); + pr_notice("0x50: %08x\n", ntohl(x[0])); +} + +/* * decode an AFSFetchStatus block */ static void xdr_decode_AFSFetchStatus(const __be32 **_bp, - struct afs_file_status *status, - struct afs_vnode *vnode, - afs_dataversion_t *store_version) + struct afs_call *call, + struct afs_status_cb *scb) { - afs_dataversion_t expected_version; - const __be32 *bp = *_bp; - umode_t mode; + const struct afs_xdr_AFSFetchStatus *xdr = (const void *)*_bp; + struct afs_file_status *status = &scb->status; + bool inline_error = (call->operation_ID == afs_FS_InlineBulkStatus); u64 data_version, size; - u32 changed = 0; /* becomes non-zero if ctime-type changes seen */ - kuid_t owner; - kgid_t group; - -#define EXTRACT(DST) \ - do { \ - u32 x = ntohl(*bp++); \ - changed |= DST - x; \ - DST = x; \ - } while (0) - - status->if_version = ntohl(*bp++); - EXTRACT(status->type); - EXTRACT(status->nlink); - size = ntohl(*bp++); - data_version = ntohl(*bp++); - EXTRACT(status->author); - owner = make_kuid(&init_user_ns, ntohl(*bp++)); - changed |= !uid_eq(owner, status->owner); - status->owner = owner; - EXTRACT(status->caller_access); /* call ticket dependent */ - EXTRACT(status->anon_access); - EXTRACT(status->mode); - EXTRACT(status->parent.vnode); - EXTRACT(status->parent.unique); - bp++; /* seg size */ - status->mtime_client = ntohl(*bp++); - status->mtime_server = ntohl(*bp++); - group = make_kgid(&init_user_ns, ntohl(*bp++)); - changed |= !gid_eq(group, status->group); - status->group = group; - bp++; /* sync counter */ - data_version |= (u64) ntohl(*bp++) << 32; - EXTRACT(status->lock_count); - size |= (u64) ntohl(*bp++) << 32; - bp++; /* spare 4 */ - *_bp = bp; + u32 type, abort_code; + + abort_code = ntohl(xdr->abort_code); + + if (xdr->if_version != htonl(AFS_FSTATUS_VERSION)) { + if (xdr->if_version == htonl(0) && + abort_code != 0 && + inline_error) { + /* The OpenAFS fileserver has a bug in FS.InlineBulkStatus + * whereby it doesn't set the interface version in the error + * case. + */ + status->abort_code = abort_code; + scb->have_error = true; + goto advance; + } - if (size != status->size) { - status->size = size; - changed |= true; + pr_warn("Unknown AFSFetchStatus version %u\n", ntohl(xdr->if_version)); + goto bad; } - status->mode &= S_IALLUGO; - - _debug("vnode time %lx, %lx", - status->mtime_client, status->mtime_server); - - if (vnode) { - status->parent.vid = vnode->fid.vid; - if (changed && !test_bit(AFS_VNODE_UNSET, &vnode->flags)) { - _debug("vnode changed"); - i_size_write(&vnode->vfs_inode, size); - vnode->vfs_inode.i_uid = status->owner; - vnode->vfs_inode.i_gid = status->group; - vnode->vfs_inode.i_generation = vnode->fid.unique; - set_nlink(&vnode->vfs_inode, status->nlink); - - mode = vnode->vfs_inode.i_mode; - mode &= ~S_IALLUGO; - mode |= status->mode; - barrier(); - vnode->vfs_inode.i_mode = mode; - } - vnode->vfs_inode.i_ctime.tv_sec = status->mtime_server; - vnode->vfs_inode.i_mtime = vnode->vfs_inode.i_ctime; - vnode->vfs_inode.i_atime = vnode->vfs_inode.i_ctime; - vnode->vfs_inode.i_version = data_version; + if (abort_code != 0 && inline_error) { + status->abort_code = abort_code; + scb->have_error = true; + goto advance; } - expected_version = status->data_version; - if (store_version) - expected_version = *store_version; - - if (expected_version != data_version) { - status->data_version = data_version; - if (vnode && !test_bit(AFS_VNODE_UNSET, &vnode->flags)) { - _debug("vnode modified %llx on {%x:%u}", - (unsigned long long) data_version, - vnode->fid.vid, vnode->fid.vnode); - set_bit(AFS_VNODE_MODIFIED, &vnode->flags); - set_bit(AFS_VNODE_ZAP_DATA, &vnode->flags); - } - } else if (store_version) { - status->data_version = data_version; + type = ntohl(xdr->type); + switch (type) { + case AFS_FTYPE_FILE: + case AFS_FTYPE_DIR: + case AFS_FTYPE_SYMLINK: + status->type = type; + break; + default: + goto bad; } + + status->nlink = ntohl(xdr->nlink); + status->author = ntohl(xdr->author); + status->owner = ntohl(xdr->owner); + status->caller_access = ntohl(xdr->caller_access); /* Ticket dependent */ + status->anon_access = ntohl(xdr->anon_access); + status->mode = ntohl(xdr->mode) & S_IALLUGO; + status->group = ntohl(xdr->group); + status->lock_count = ntohl(xdr->lock_count); + + status->mtime_client.tv_sec = ntohl(xdr->mtime_client); + status->mtime_client.tv_nsec = 0; + status->mtime_server.tv_sec = ntohl(xdr->mtime_server); + status->mtime_server.tv_nsec = 0; + + size = (u64)ntohl(xdr->size_lo); + size |= (u64)ntohl(xdr->size_hi) << 32; + status->size = size; + + data_version = (u64)ntohl(xdr->data_version_lo); + data_version |= (u64)ntohl(xdr->data_version_hi) << 32; + status->data_version = data_version; + scb->have_status = true; +advance: + *_bp = (const void *)*_bp + sizeof(*xdr); + return; + +bad: + xdr_dump_bad(*_bp); + afs_protocol_error(call, afs_eproto_bad_status); + goto advance; } -/* - * decode an AFSCallBack block - */ -static void xdr_decode_AFSCallBack(const __be32 **_bp, struct afs_vnode *vnode) +static time64_t xdr_decode_expiry(struct afs_call *call, u32 expiry) { - const __be32 *bp = *_bp; - - vnode->cb_version = ntohl(*bp++); - vnode->cb_expiry = ntohl(*bp++); - vnode->cb_type = ntohl(*bp++); - vnode->cb_expires = vnode->cb_expiry + get_seconds(); - *_bp = bp; + return ktime_divns(call->issue_time, NSEC_PER_SEC) + expiry; } -static void xdr_decode_AFSCallBack_raw(const __be32 **_bp, - struct afs_callback *cb) +static void xdr_decode_AFSCallBack(const __be32 **_bp, + struct afs_call *call, + struct afs_status_cb *scb) { + struct afs_callback *cb = &scb->callback; const __be32 *bp = *_bp; - cb->version = ntohl(*bp++); - cb->expiry = ntohl(*bp++); - cb->type = ntohl(*bp++); + bp++; /* version */ + cb->expires_at = xdr_decode_expiry(call, ntohl(*bp++)); + bp++; /* type */ + scb->have_cb = true; *_bp = bp; } @@ -161,14 +155,18 @@ static void xdr_decode_AFSVolSync(const __be32 **_bp, struct afs_volsync *volsync) { const __be32 *bp = *_bp; + u32 creation; - volsync->creation = ntohl(*bp++); + creation = ntohl(*bp++); bp++; /* spare2 */ bp++; /* spare3 */ bp++; /* spare4 */ bp++; /* spare5 */ bp++; /* spare6 */ *_bp = bp; + + if (volsync) + volsync->creation = creation; } /* @@ -229,33 +227,30 @@ static void xdr_decode_AFSFetchVolumeStatus(const __be32 **_bp, vs->blocks_in_use = ntohl(*bp++); vs->part_blocks_avail = ntohl(*bp++); vs->part_max_blocks = ntohl(*bp++); + vs->vol_copy_date = 0; + vs->vol_backup_date = 0; *_bp = bp; } /* * deliver reply data to an FS.FetchStatus */ -static int afs_deliver_fs_fetch_status(struct afs_call *call, - struct sk_buff *skb, bool last) +static int afs_deliver_fs_fetch_status(struct afs_call *call) { - struct afs_vnode *vnode = call->reply; + struct afs_operation *op = call->op; + struct afs_vnode_param *vp = &op->file[op->fetch_status.which]; const __be32 *bp; + int ret; - _enter(",,%u", last); - - afs_transfer_reply(call, skb); - if (!last) - return 0; - - if (call->reply_size != call->reply_max) - return -EBADMSG; + ret = afs_transfer_reply(call); + if (ret < 0) + return ret; /* unmarshall the reply once we've received all of it */ bp = call->buffer; - xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode, NULL); - xdr_decode_AFSCallBack(&bp, vnode); - if (call->reply2) - xdr_decode_AFSVolSync(&bp, call->reply2); + xdr_decode_AFSFetchStatus(&bp, call, &vp->scb); + xdr_decode_AFSCallBack(&bp, call, &vp->scb); + xdr_decode_AFSVolSync(&bp, &op->volsync); _leave(" = 0 [done]"); return 0; @@ -266,162 +261,143 @@ static int afs_deliver_fs_fetch_status(struct afs_call *call, */ static const struct afs_call_type afs_RXFSFetchStatus = { .name = "FS.FetchStatus", + .op = afs_FS_FetchStatus, .deliver = afs_deliver_fs_fetch_status, - .abort_to_error = afs_abort_to_error, .destructor = afs_flat_call_destructor, }; /* * fetch the status information for a file */ -int afs_fs_fetch_file_status(struct afs_server *server, - struct key *key, - struct afs_vnode *vnode, - struct afs_volsync *volsync, - const struct afs_wait_mode *wait_mode) +void afs_fs_fetch_status(struct afs_operation *op) { + struct afs_vnode_param *vp = &op->file[op->fetch_status.which]; struct afs_call *call; __be32 *bp; - _enter(",%x,{%x:%u},,", - key_serial(key), vnode->fid.vid, vnode->fid.vnode); + _enter(",%x,{%llx:%llu},,", + key_serial(op->key), vp->fid.vid, vp->fid.vnode); - call = afs_alloc_flat_call(&afs_RXFSFetchStatus, 16, (21 + 3 + 6) * 4); + call = afs_alloc_flat_call(op->net, &afs_RXFSFetchStatus, + 16, (21 + 3 + 6) * 4); if (!call) - return -ENOMEM; - - call->key = key; - call->reply = vnode; - call->reply2 = volsync; - call->service_id = FS_SERVICE; - call->port = htons(AFS_FS_PORT); + return afs_op_nomem(op); /* marshall the parameters */ bp = call->request; bp[0] = htonl(FSFETCHSTATUS); - bp[1] = htonl(vnode->fid.vid); - bp[2] = htonl(vnode->fid.vnode); - bp[3] = htonl(vnode->fid.unique); + bp[1] = htonl(vp->fid.vid); + bp[2] = htonl(vp->fid.vnode); + bp[3] = htonl(vp->fid.unique); - return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode); + call->fid = vp->fid; + trace_afs_make_fs_call(call, &vp->fid); + afs_make_op_call(op, call, GFP_NOFS); } /* * deliver reply data to an FS.FetchData */ -static int afs_deliver_fs_fetch_data(struct afs_call *call, - struct sk_buff *skb, bool last) +static int afs_deliver_fs_fetch_data(struct afs_call *call) { - struct afs_vnode *vnode = call->reply; + struct afs_operation *op = call->op; + struct netfs_io_subrequest *subreq = op->fetch.subreq; + struct afs_vnode_param *vp = &op->file[0]; const __be32 *bp; - struct page *page; - void *buffer; + size_t count_before; int ret; - _enter("{%u},{%u},%d", call->unmarshall, skb->len, last); + _enter("{%u,%zu,%zu/%llu}", + call->unmarshall, call->iov_len, iov_iter_count(call->iter), + call->remaining); switch (call->unmarshall) { case 0: - call->offset = 0; + call->remaining = 0; call->unmarshall++; - if (call->operation_ID != FSFETCHDATA64) { - call->unmarshall++; - goto no_msw; + if (call->operation_ID == FSFETCHDATA64) { + afs_extract_to_tmp64(call); + } else { + call->tmp_u = htonl(0); + afs_extract_to_tmp(call); } + fallthrough; - /* extract the upper part of the returned data length of an - * FSFETCHDATA64 op (which should always be 0 using this - * client) */ + /* Extract the returned data length into ->remaining. + * This may indicate more or less data than was + * requested will be returned. + */ case 1: - _debug("extract data length (MSW)"); - ret = afs_extract_data(call, skb, last, &call->tmp, 4); - switch (ret) { - case 0: break; - case -EAGAIN: return 0; - default: return ret; - } + _debug("extract data length"); + ret = afs_extract_data(call, true); + if (ret < 0) + return ret; - call->count = ntohl(call->tmp); - _debug("DATA length MSW: %u", call->count); - if (call->count > 0) - return -EBADMSG; - call->offset = 0; - call->unmarshall++; + call->remaining = be64_to_cpu(call->tmp64); + _debug("DATA length: %llu", call->remaining); - no_msw: - /* extract the returned data length */ - case 2: - _debug("extract data length"); - ret = afs_extract_data(call, skb, last, &call->tmp, 4); - switch (ret) { - case 0: break; - case -EAGAIN: return 0; - default: return ret; - } + if (call->remaining == 0) + goto no_more_data; - call->count = ntohl(call->tmp); - _debug("DATA length: %u", call->count); - if (call->count > PAGE_SIZE) - return -EBADMSG; - call->offset = 0; + call->iter = &subreq->io_iter; + call->iov_len = umin(call->remaining, subreq->len - subreq->transferred); call->unmarshall++; + fallthrough; /* extract the returned data */ + case 2: + count_before = call->iov_len; + _debug("extract data %zu/%llu", count_before, call->remaining); + + ret = afs_extract_data(call, true); + subreq->transferred += count_before - call->iov_len; + call->remaining -= count_before - call->iov_len; + if (ret < 0) + return ret; + + call->iter = &call->def_iter; + if (call->remaining) + goto no_more_data; + + /* Discard any excess data the server gave us */ + afs_extract_discard(call, call->remaining); + call->unmarshall = 3; + fallthrough; + case 3: - _debug("extract data"); - if (call->count > 0) { - page = call->reply3; - buffer = kmap_atomic(page); - ret = afs_extract_data(call, skb, last, buffer, - call->count); - kunmap_atomic(buffer); - switch (ret) { - case 0: break; - case -EAGAIN: return 0; - default: return ret; - } - } + _debug("extract discard %zu/%llu", + iov_iter_count(call->iter), call->remaining); - call->offset = 0; - call->unmarshall++; + ret = afs_extract_data(call, true); + if (ret < 0) + return ret; + + no_more_data: + call->unmarshall = 4; + afs_extract_to_buf(call, (21 + 3 + 6) * 4); + fallthrough; /* extract the metadata */ case 4: - ret = afs_extract_data(call, skb, last, call->buffer, - (21 + 3 + 6) * 4); - switch (ret) { - case 0: break; - case -EAGAIN: return 0; - default: return ret; - } + ret = afs_extract_data(call, false); + if (ret < 0) + return ret; bp = call->buffer; - xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode, NULL); - xdr_decode_AFSCallBack(&bp, vnode); - if (call->reply2) - xdr_decode_AFSVolSync(&bp, call->reply2); + xdr_decode_AFSFetchStatus(&bp, call, &vp->scb); + xdr_decode_AFSCallBack(&bp, call, &vp->scb); + xdr_decode_AFSVolSync(&bp, &op->volsync); + + if (subreq->start + subreq->transferred >= vp->scb.status.size) + __set_bit(NETFS_SREQ_HIT_EOF, &subreq->flags); - call->offset = 0; call->unmarshall++; + fallthrough; case 5: - _debug("trailer"); - if (skb->len != 0) - return -EBADMSG; break; } - if (!last) - return 0; - - if (call->count < PAGE_SIZE) { - _debug("clear"); - page = call->reply3; - buffer = kmap_atomic(page); - memset(buffer + call->count, 0, PAGE_SIZE - call->count); - kunmap_atomic(buffer); - } - _leave(" = 0 [done]"); return 0; } @@ -431,394 +407,372 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call, */ static const struct afs_call_type afs_RXFSFetchData = { .name = "FS.FetchData", + .op = afs_FS_FetchData, + .async_rx = afs_fetch_data_async_rx, .deliver = afs_deliver_fs_fetch_data, - .abort_to_error = afs_abort_to_error, + .immediate_cancel = afs_fetch_data_immediate_cancel, .destructor = afs_flat_call_destructor, }; static const struct afs_call_type afs_RXFSFetchData64 = { .name = "FS.FetchData64", + .op = afs_FS_FetchData64, + .async_rx = afs_fetch_data_async_rx, .deliver = afs_deliver_fs_fetch_data, - .abort_to_error = afs_abort_to_error, + .immediate_cancel = afs_fetch_data_immediate_cancel, .destructor = afs_flat_call_destructor, }; /* * fetch data from a very large file */ -static int afs_fs_fetch_data64(struct afs_server *server, - struct key *key, - struct afs_vnode *vnode, - off_t offset, size_t length, - struct page *buffer, - const struct afs_wait_mode *wait_mode) +static void afs_fs_fetch_data64(struct afs_operation *op) { + struct netfs_io_subrequest *subreq = op->fetch.subreq; + struct afs_vnode_param *vp = &op->file[0]; struct afs_call *call; __be32 *bp; _enter(""); - ASSERTCMP(length, <, ULONG_MAX); - - call = afs_alloc_flat_call(&afs_RXFSFetchData64, 32, (21 + 3 + 6) * 4); + call = afs_alloc_flat_call(op->net, &afs_RXFSFetchData64, 32, (21 + 3 + 6) * 4); if (!call) - return -ENOMEM; + return afs_op_nomem(op); - call->key = key; - call->reply = vnode; - call->reply2 = NULL; /* volsync */ - call->reply3 = buffer; - call->service_id = FS_SERVICE; - call->port = htons(AFS_FS_PORT); - call->operation_ID = FSFETCHDATA64; + if (op->flags & AFS_OPERATION_ASYNC) + call->async = true; /* marshall the parameters */ bp = call->request; bp[0] = htonl(FSFETCHDATA64); - bp[1] = htonl(vnode->fid.vid); - bp[2] = htonl(vnode->fid.vnode); - bp[3] = htonl(vnode->fid.unique); - bp[4] = htonl(upper_32_bits(offset)); - bp[5] = htonl((u32) offset); + bp[1] = htonl(vp->fid.vid); + bp[2] = htonl(vp->fid.vnode); + bp[3] = htonl(vp->fid.unique); + bp[4] = htonl(upper_32_bits(subreq->start + subreq->transferred)); + bp[5] = htonl(lower_32_bits(subreq->start + subreq->transferred)); bp[6] = 0; - bp[7] = htonl((u32) length); + bp[7] = htonl(lower_32_bits(subreq->len - subreq->transferred)); - return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode); + call->fid = vp->fid; + trace_afs_make_fs_call(call, &vp->fid); + afs_make_op_call(op, call, GFP_NOFS); } /* * fetch data from a file */ -int afs_fs_fetch_data(struct afs_server *server, - struct key *key, - struct afs_vnode *vnode, - off_t offset, size_t length, - struct page *buffer, - const struct afs_wait_mode *wait_mode) +void afs_fs_fetch_data(struct afs_operation *op) { + struct netfs_io_subrequest *subreq = op->fetch.subreq; + struct afs_vnode_param *vp = &op->file[0]; struct afs_call *call; __be32 *bp; - if (upper_32_bits(offset) || upper_32_bits(offset + length)) - return afs_fs_fetch_data64(server, key, vnode, offset, length, - buffer, wait_mode); + if (test_bit(AFS_SERVER_FL_HAS_FS64, &op->server->flags)) + return afs_fs_fetch_data64(op); _enter(""); - call = afs_alloc_flat_call(&afs_RXFSFetchData, 24, (21 + 3 + 6) * 4); + call = afs_alloc_flat_call(op->net, &afs_RXFSFetchData, 24, (21 + 3 + 6) * 4); if (!call) - return -ENOMEM; - - call->key = key; - call->reply = vnode; - call->reply2 = NULL; /* volsync */ - call->reply3 = buffer; - call->service_id = FS_SERVICE; - call->port = htons(AFS_FS_PORT); - call->operation_ID = FSFETCHDATA; + return afs_op_nomem(op); /* marshall the parameters */ bp = call->request; bp[0] = htonl(FSFETCHDATA); - bp[1] = htonl(vnode->fid.vid); - bp[2] = htonl(vnode->fid.vnode); - bp[3] = htonl(vnode->fid.unique); - bp[4] = htonl(offset); - bp[5] = htonl(length); - - return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode); + bp[1] = htonl(vp->fid.vid); + bp[2] = htonl(vp->fid.vnode); + bp[3] = htonl(vp->fid.unique); + bp[4] = htonl(lower_32_bits(subreq->start + subreq->transferred)); + bp[5] = htonl(lower_32_bits(subreq->len + subreq->transferred)); + + call->fid = vp->fid; + trace_afs_make_fs_call(call, &vp->fid); + afs_make_op_call(op, call, GFP_NOFS); } /* - * deliver reply data to an FS.GiveUpCallBacks + * deliver reply data to an FS.CreateFile or an FS.MakeDir */ -static int afs_deliver_fs_give_up_callbacks(struct afs_call *call, - struct sk_buff *skb, bool last) +static int afs_deliver_fs_create_vnode(struct afs_call *call) { - _enter(",{%u},%d", skb->len, last); + struct afs_operation *op = call->op; + struct afs_vnode_param *dvp = &op->file[0]; + struct afs_vnode_param *vp = &op->file[1]; + const __be32 *bp; + int ret; - if (skb->len > 0) - return -EBADMSG; /* shouldn't be any reply data */ + ret = afs_transfer_reply(call); + if (ret < 0) + return ret; + + /* unmarshall the reply once we've received all of it */ + bp = call->buffer; + xdr_decode_AFSFid(&bp, &op->file[1].fid); + xdr_decode_AFSFetchStatus(&bp, call, &vp->scb); + xdr_decode_AFSFetchStatus(&bp, call, &dvp->scb); + xdr_decode_AFSCallBack(&bp, call, &vp->scb); + xdr_decode_AFSVolSync(&bp, &op->volsync); + + _leave(" = 0 [done]"); return 0; } /* - * FS.GiveUpCallBacks operation type + * FS.CreateFile and FS.MakeDir operation type */ -static const struct afs_call_type afs_RXFSGiveUpCallBacks = { - .name = "FS.GiveUpCallBacks", - .deliver = afs_deliver_fs_give_up_callbacks, - .abort_to_error = afs_abort_to_error, +static const struct afs_call_type afs_RXFSCreateFile = { + .name = "FS.CreateFile", + .op = afs_FS_CreateFile, + .deliver = afs_deliver_fs_create_vnode, .destructor = afs_flat_call_destructor, }; /* - * give up a set of callbacks - * - the callbacks are held in the server->cb_break ring + * Create a file. */ -int afs_fs_give_up_callbacks(struct afs_server *server, - const struct afs_wait_mode *wait_mode) +void afs_fs_create_file(struct afs_operation *op) { + const struct qstr *name = &op->dentry->d_name; + struct afs_vnode_param *dvp = &op->file[0]; struct afs_call *call; - size_t ncallbacks; - __be32 *bp, *tp; - int loop; - - ncallbacks = CIRC_CNT(server->cb_break_head, server->cb_break_tail, - ARRAY_SIZE(server->cb_break)); - - _enter("{%zu},", ncallbacks); + size_t namesz, reqsz, padsz; + __be32 *bp; - if (ncallbacks == 0) - return 0; - if (ncallbacks > AFSCBMAX) - ncallbacks = AFSCBMAX; + _enter(""); - _debug("break %zu callbacks", ncallbacks); + namesz = name->len; + padsz = (4 - (namesz & 3)) & 3; + reqsz = (5 * 4) + namesz + padsz + (6 * 4); - call = afs_alloc_flat_call(&afs_RXFSGiveUpCallBacks, - 12 + ncallbacks * 6 * 4, 0); + call = afs_alloc_flat_call(op->net, &afs_RXFSCreateFile, + reqsz, (3 + 21 + 21 + 3 + 6) * 4); if (!call) - return -ENOMEM; - - call->service_id = FS_SERVICE; - call->port = htons(AFS_FS_PORT); + return afs_op_nomem(op); /* marshall the parameters */ bp = call->request; - tp = bp + 2 + ncallbacks * 3; - *bp++ = htonl(FSGIVEUPCALLBACKS); - *bp++ = htonl(ncallbacks); - *tp++ = htonl(ncallbacks); - - atomic_sub(ncallbacks, &server->cb_break_n); - for (loop = ncallbacks; loop > 0; loop--) { - struct afs_callback *cb = - &server->cb_break[server->cb_break_tail]; - - *bp++ = htonl(cb->fid.vid); - *bp++ = htonl(cb->fid.vnode); - *bp++ = htonl(cb->fid.unique); - *tp++ = htonl(cb->version); - *tp++ = htonl(cb->expiry); - *tp++ = htonl(cb->type); - smp_mb(); - server->cb_break_tail = - (server->cb_break_tail + 1) & - (ARRAY_SIZE(server->cb_break) - 1); + *bp++ = htonl(FSCREATEFILE); + *bp++ = htonl(dvp->fid.vid); + *bp++ = htonl(dvp->fid.vnode); + *bp++ = htonl(dvp->fid.unique); + *bp++ = htonl(namesz); + memcpy(bp, name->name, namesz); + bp = (void *) bp + namesz; + if (padsz > 0) { + memset(bp, 0, padsz); + bp = (void *) bp + padsz; } + *bp++ = htonl(AFS_SET_MODE | AFS_SET_MTIME); + *bp++ = htonl(op->mtime.tv_sec); /* mtime */ + *bp++ = 0; /* owner */ + *bp++ = 0; /* group */ + *bp++ = htonl(op->create.mode & S_IALLUGO); /* unix mode */ + *bp++ = 0; /* segment size */ - ASSERT(ncallbacks > 0); - wake_up_nr(&server->cb_break_waitq, ncallbacks); - - return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode); -} - -/* - * deliver reply data to an FS.CreateFile or an FS.MakeDir - */ -static int afs_deliver_fs_create_vnode(struct afs_call *call, - struct sk_buff *skb, bool last) -{ - struct afs_vnode *vnode = call->reply; - const __be32 *bp; - - _enter("{%u},{%u},%d", call->unmarshall, skb->len, last); - - afs_transfer_reply(call, skb); - if (!last) - return 0; - - if (call->reply_size != call->reply_max) - return -EBADMSG; - - /* unmarshall the reply once we've received all of it */ - bp = call->buffer; - xdr_decode_AFSFid(&bp, call->reply2); - xdr_decode_AFSFetchStatus(&bp, call->reply3, NULL, NULL); - xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode, NULL); - xdr_decode_AFSCallBack_raw(&bp, call->reply4); - /* xdr_decode_AFSVolSync(&bp, call->replyX); */ - - _leave(" = 0 [done]"); - return 0; + call->fid = dvp->fid; + trace_afs_make_fs_call1(call, &dvp->fid, name); + afs_make_op_call(op, call, GFP_NOFS); } -/* - * FS.CreateFile and FS.MakeDir operation type - */ -static const struct afs_call_type afs_RXFSCreateXXXX = { - .name = "FS.CreateXXXX", +static const struct afs_call_type afs_RXFSMakeDir = { + .name = "FS.MakeDir", + .op = afs_FS_MakeDir, .deliver = afs_deliver_fs_create_vnode, - .abort_to_error = afs_abort_to_error, .destructor = afs_flat_call_destructor, }; /* - * create a file or make a directory + * Create a new directory */ -int afs_fs_create(struct afs_server *server, - struct key *key, - struct afs_vnode *vnode, - const char *name, - umode_t mode, - struct afs_fid *newfid, - struct afs_file_status *newstatus, - struct afs_callback *newcb, - const struct afs_wait_mode *wait_mode) +void afs_fs_make_dir(struct afs_operation *op) { + const struct qstr *name = &op->dentry->d_name; + struct afs_vnode_param *dvp = &op->file[0]; struct afs_call *call; size_t namesz, reqsz, padsz; __be32 *bp; _enter(""); - namesz = strlen(name); + namesz = name->len; padsz = (4 - (namesz & 3)) & 3; reqsz = (5 * 4) + namesz + padsz + (6 * 4); - call = afs_alloc_flat_call(&afs_RXFSCreateXXXX, reqsz, - (3 + 21 + 21 + 3 + 6) * 4); + call = afs_alloc_flat_call(op->net, &afs_RXFSMakeDir, + reqsz, (3 + 21 + 21 + 3 + 6) * 4); if (!call) - return -ENOMEM; - - call->key = key; - call->reply = vnode; - call->reply2 = newfid; - call->reply3 = newstatus; - call->reply4 = newcb; - call->service_id = FS_SERVICE; - call->port = htons(AFS_FS_PORT); + return afs_op_nomem(op); /* marshall the parameters */ bp = call->request; - *bp++ = htonl(S_ISDIR(mode) ? FSMAKEDIR : FSCREATEFILE); - *bp++ = htonl(vnode->fid.vid); - *bp++ = htonl(vnode->fid.vnode); - *bp++ = htonl(vnode->fid.unique); + *bp++ = htonl(FSMAKEDIR); + *bp++ = htonl(dvp->fid.vid); + *bp++ = htonl(dvp->fid.vnode); + *bp++ = htonl(dvp->fid.unique); *bp++ = htonl(namesz); - memcpy(bp, name, namesz); + memcpy(bp, name->name, namesz); bp = (void *) bp + namesz; if (padsz > 0) { memset(bp, 0, padsz); bp = (void *) bp + padsz; } - *bp++ = htonl(AFS_SET_MODE); - *bp++ = 0; /* mtime */ + *bp++ = htonl(AFS_SET_MODE | AFS_SET_MTIME); + *bp++ = htonl(op->mtime.tv_sec); /* mtime */ *bp++ = 0; /* owner */ *bp++ = 0; /* group */ - *bp++ = htonl(mode & S_IALLUGO); /* unix mode */ + *bp++ = htonl(op->create.mode & S_IALLUGO); /* unix mode */ *bp++ = 0; /* segment size */ - return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode); + call->fid = dvp->fid; + trace_afs_make_fs_call1(call, &dvp->fid, name); + afs_make_op_call(op, call, GFP_NOFS); } /* - * deliver reply data to an FS.RemoveFile or FS.RemoveDir + * Deliver reply data to any operation that returns status and volume sync. */ -static int afs_deliver_fs_remove(struct afs_call *call, - struct sk_buff *skb, bool last) +static int afs_deliver_fs_file_status_and_vol(struct afs_call *call) { - struct afs_vnode *vnode = call->reply; + struct afs_operation *op = call->op; + struct afs_vnode_param *vp = &op->file[0]; const __be32 *bp; + int ret; - _enter("{%u},{%u},%d", call->unmarshall, skb->len, last); - - afs_transfer_reply(call, skb); - if (!last) - return 0; - - if (call->reply_size != call->reply_max) - return -EBADMSG; + ret = afs_transfer_reply(call); + if (ret < 0) + return ret; /* unmarshall the reply once we've received all of it */ bp = call->buffer; - xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode, NULL); - /* xdr_decode_AFSVolSync(&bp, call->replyX); */ + xdr_decode_AFSFetchStatus(&bp, call, &vp->scb); + xdr_decode_AFSVolSync(&bp, &op->volsync); _leave(" = 0 [done]"); return 0; } /* - * FS.RemoveDir/FS.RemoveFile operation type + * FS.RemoveFile operation type */ -static const struct afs_call_type afs_RXFSRemoveXXXX = { - .name = "FS.RemoveXXXX", - .deliver = afs_deliver_fs_remove, - .abort_to_error = afs_abort_to_error, +static const struct afs_call_type afs_RXFSRemoveFile = { + .name = "FS.RemoveFile", + .op = afs_FS_RemoveFile, + .deliver = afs_deliver_fs_file_status_and_vol, .destructor = afs_flat_call_destructor, }; /* - * remove a file or directory + * Remove a file. */ -int afs_fs_remove(struct afs_server *server, - struct key *key, - struct afs_vnode *vnode, - const char *name, - bool isdir, - const struct afs_wait_mode *wait_mode) +void afs_fs_remove_file(struct afs_operation *op) { + const struct qstr *name = &op->dentry->d_name; + struct afs_vnode_param *dvp = &op->file[0]; struct afs_call *call; size_t namesz, reqsz, padsz; __be32 *bp; _enter(""); - namesz = strlen(name); + namesz = name->len; padsz = (4 - (namesz & 3)) & 3; reqsz = (5 * 4) + namesz + padsz; - call = afs_alloc_flat_call(&afs_RXFSRemoveXXXX, reqsz, (21 + 6) * 4); + call = afs_alloc_flat_call(op->net, &afs_RXFSRemoveFile, + reqsz, (21 + 6) * 4); if (!call) - return -ENOMEM; + return afs_op_nomem(op); + + /* marshall the parameters */ + bp = call->request; + *bp++ = htonl(FSREMOVEFILE); + *bp++ = htonl(dvp->fid.vid); + *bp++ = htonl(dvp->fid.vnode); + *bp++ = htonl(dvp->fid.unique); + *bp++ = htonl(namesz); + memcpy(bp, name->name, namesz); + bp = (void *) bp + namesz; + if (padsz > 0) { + memset(bp, 0, padsz); + bp = (void *) bp + padsz; + } + + call->fid = dvp->fid; + trace_afs_make_fs_call1(call, &dvp->fid, name); + afs_make_op_call(op, call, GFP_NOFS); +} - call->key = key; - call->reply = vnode; - call->service_id = FS_SERVICE; - call->port = htons(AFS_FS_PORT); +static const struct afs_call_type afs_RXFSRemoveDir = { + .name = "FS.RemoveDir", + .op = afs_FS_RemoveDir, + .deliver = afs_deliver_fs_file_status_and_vol, + .destructor = afs_flat_call_destructor, +}; + +/* + * Remove a directory. + */ +void afs_fs_remove_dir(struct afs_operation *op) +{ + const struct qstr *name = &op->dentry->d_name; + struct afs_vnode_param *dvp = &op->file[0]; + struct afs_call *call; + size_t namesz, reqsz, padsz; + __be32 *bp; + + _enter(""); + + namesz = name->len; + padsz = (4 - (namesz & 3)) & 3; + reqsz = (5 * 4) + namesz + padsz; + + call = afs_alloc_flat_call(op->net, &afs_RXFSRemoveDir, + reqsz, (21 + 6) * 4); + if (!call) + return afs_op_nomem(op); /* marshall the parameters */ bp = call->request; - *bp++ = htonl(isdir ? FSREMOVEDIR : FSREMOVEFILE); - *bp++ = htonl(vnode->fid.vid); - *bp++ = htonl(vnode->fid.vnode); - *bp++ = htonl(vnode->fid.unique); + *bp++ = htonl(FSREMOVEDIR); + *bp++ = htonl(dvp->fid.vid); + *bp++ = htonl(dvp->fid.vnode); + *bp++ = htonl(dvp->fid.unique); *bp++ = htonl(namesz); - memcpy(bp, name, namesz); + memcpy(bp, name->name, namesz); bp = (void *) bp + namesz; if (padsz > 0) { memset(bp, 0, padsz); bp = (void *) bp + padsz; } - return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode); + call->fid = dvp->fid; + trace_afs_make_fs_call1(call, &dvp->fid, name); + afs_make_op_call(op, call, GFP_NOFS); } /* * deliver reply data to an FS.Link */ -static int afs_deliver_fs_link(struct afs_call *call, - struct sk_buff *skb, bool last) +static int afs_deliver_fs_link(struct afs_call *call) { - struct afs_vnode *dvnode = call->reply, *vnode = call->reply2; + struct afs_operation *op = call->op; + struct afs_vnode_param *dvp = &op->file[0]; + struct afs_vnode_param *vp = &op->file[1]; const __be32 *bp; + int ret; - _enter("{%u},{%u},%d", call->unmarshall, skb->len, last); - - afs_transfer_reply(call, skb); - if (!last) - return 0; + _enter("{%u}", call->unmarshall); - if (call->reply_size != call->reply_max) - return -EBADMSG; + ret = afs_transfer_reply(call); + if (ret < 0) + return ret; /* unmarshall the reply once we've received all of it */ bp = call->buffer; - xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode, NULL); - xdr_decode_AFSFetchStatus(&bp, &dvnode->status, dvnode, NULL); - /* xdr_decode_AFSVolSync(&bp, call->replyX); */ + xdr_decode_AFSFetchStatus(&bp, call, &vp->scb); + xdr_decode_AFSFetchStatus(&bp, call, &dvp->scb); + xdr_decode_AFSVolSync(&bp, &op->volsync); _leave(" = 0 [done]"); return 0; @@ -829,85 +783,78 @@ static int afs_deliver_fs_link(struct afs_call *call, */ static const struct afs_call_type afs_RXFSLink = { .name = "FS.Link", + .op = afs_FS_Link, .deliver = afs_deliver_fs_link, - .abort_to_error = afs_abort_to_error, .destructor = afs_flat_call_destructor, }; /* * make a hard link */ -int afs_fs_link(struct afs_server *server, - struct key *key, - struct afs_vnode *dvnode, - struct afs_vnode *vnode, - const char *name, - const struct afs_wait_mode *wait_mode) +void afs_fs_link(struct afs_operation *op) { + const struct qstr *name = &op->dentry->d_name; + struct afs_vnode_param *dvp = &op->file[0]; + struct afs_vnode_param *vp = &op->file[1]; struct afs_call *call; size_t namesz, reqsz, padsz; __be32 *bp; _enter(""); - namesz = strlen(name); + namesz = name->len; padsz = (4 - (namesz & 3)) & 3; reqsz = (5 * 4) + namesz + padsz + (3 * 4); - call = afs_alloc_flat_call(&afs_RXFSLink, reqsz, (21 + 21 + 6) * 4); + call = afs_alloc_flat_call(op->net, &afs_RXFSLink, reqsz, (21 + 21 + 6) * 4); if (!call) - return -ENOMEM; - - call->key = key; - call->reply = dvnode; - call->reply2 = vnode; - call->service_id = FS_SERVICE; - call->port = htons(AFS_FS_PORT); + return afs_op_nomem(op); /* marshall the parameters */ bp = call->request; *bp++ = htonl(FSLINK); - *bp++ = htonl(dvnode->fid.vid); - *bp++ = htonl(dvnode->fid.vnode); - *bp++ = htonl(dvnode->fid.unique); + *bp++ = htonl(dvp->fid.vid); + *bp++ = htonl(dvp->fid.vnode); + *bp++ = htonl(dvp->fid.unique); *bp++ = htonl(namesz); - memcpy(bp, name, namesz); + memcpy(bp, name->name, namesz); bp = (void *) bp + namesz; if (padsz > 0) { memset(bp, 0, padsz); bp = (void *) bp + padsz; } - *bp++ = htonl(vnode->fid.vid); - *bp++ = htonl(vnode->fid.vnode); - *bp++ = htonl(vnode->fid.unique); + *bp++ = htonl(vp->fid.vid); + *bp++ = htonl(vp->fid.vnode); + *bp++ = htonl(vp->fid.unique); - return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode); + call->fid = vp->fid; + trace_afs_make_fs_call1(call, &vp->fid, name); + afs_make_op_call(op, call, GFP_NOFS); } /* * deliver reply data to an FS.Symlink */ -static int afs_deliver_fs_symlink(struct afs_call *call, - struct sk_buff *skb, bool last) +static int afs_deliver_fs_symlink(struct afs_call *call) { - struct afs_vnode *vnode = call->reply; + struct afs_operation *op = call->op; + struct afs_vnode_param *dvp = &op->file[0]; + struct afs_vnode_param *vp = &op->file[1]; const __be32 *bp; + int ret; - _enter("{%u},{%u},%d", call->unmarshall, skb->len, last); - - afs_transfer_reply(call, skb); - if (!last) - return 0; + _enter("{%u}", call->unmarshall); - if (call->reply_size != call->reply_max) - return -EBADMSG; + ret = afs_transfer_reply(call); + if (ret < 0) + return ret; /* unmarshall the reply once we've received all of it */ bp = call->buffer; - xdr_decode_AFSFid(&bp, call->reply2); - xdr_decode_AFSFetchStatus(&bp, call->reply3, NULL, NULL); - xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode, NULL); - /* xdr_decode_AFSVolSync(&bp, call->replyX); */ + xdr_decode_AFSFid(&bp, &vp->fid); + xdr_decode_AFSFetchStatus(&bp, call, &vp->scb); + xdr_decode_AFSFetchStatus(&bp, call, &dvp->scb); + xdr_decode_AFSVolSync(&bp, &op->volsync); _leave(" = 0 [done]"); return 0; @@ -918,104 +865,91 @@ static int afs_deliver_fs_symlink(struct afs_call *call, */ static const struct afs_call_type afs_RXFSSymlink = { .name = "FS.Symlink", + .op = afs_FS_Symlink, .deliver = afs_deliver_fs_symlink, - .abort_to_error = afs_abort_to_error, .destructor = afs_flat_call_destructor, }; /* * create a symbolic link */ -int afs_fs_symlink(struct afs_server *server, - struct key *key, - struct afs_vnode *vnode, - const char *name, - const char *contents, - struct afs_fid *newfid, - struct afs_file_status *newstatus, - const struct afs_wait_mode *wait_mode) +void afs_fs_symlink(struct afs_operation *op) { + const struct qstr *name = &op->dentry->d_name; + struct afs_vnode_param *dvp = &op->file[0]; struct afs_call *call; size_t namesz, reqsz, padsz, c_namesz, c_padsz; __be32 *bp; _enter(""); - namesz = strlen(name); + namesz = name->len; padsz = (4 - (namesz & 3)) & 3; - c_namesz = strlen(contents); + c_namesz = strlen(op->create.symlink); c_padsz = (4 - (c_namesz & 3)) & 3; reqsz = (6 * 4) + namesz + padsz + c_namesz + c_padsz + (6 * 4); - call = afs_alloc_flat_call(&afs_RXFSSymlink, reqsz, + call = afs_alloc_flat_call(op->net, &afs_RXFSSymlink, reqsz, (3 + 21 + 21 + 6) * 4); if (!call) - return -ENOMEM; - - call->key = key; - call->reply = vnode; - call->reply2 = newfid; - call->reply3 = newstatus; - call->service_id = FS_SERVICE; - call->port = htons(AFS_FS_PORT); + return afs_op_nomem(op); /* marshall the parameters */ bp = call->request; *bp++ = htonl(FSSYMLINK); - *bp++ = htonl(vnode->fid.vid); - *bp++ = htonl(vnode->fid.vnode); - *bp++ = htonl(vnode->fid.unique); + *bp++ = htonl(dvp->fid.vid); + *bp++ = htonl(dvp->fid.vnode); + *bp++ = htonl(dvp->fid.unique); *bp++ = htonl(namesz); - memcpy(bp, name, namesz); + memcpy(bp, name->name, namesz); bp = (void *) bp + namesz; if (padsz > 0) { memset(bp, 0, padsz); bp = (void *) bp + padsz; } *bp++ = htonl(c_namesz); - memcpy(bp, contents, c_namesz); + memcpy(bp, op->create.symlink, c_namesz); bp = (void *) bp + c_namesz; if (c_padsz > 0) { memset(bp, 0, c_padsz); bp = (void *) bp + c_padsz; } - *bp++ = htonl(AFS_SET_MODE); - *bp++ = 0; /* mtime */ + *bp++ = htonl(AFS_SET_MODE | AFS_SET_MTIME); + *bp++ = htonl(op->mtime.tv_sec); /* mtime */ *bp++ = 0; /* owner */ *bp++ = 0; /* group */ *bp++ = htonl(S_IRWXUGO); /* unix mode */ *bp++ = 0; /* segment size */ - return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode); + call->fid = dvp->fid; + trace_afs_make_fs_call1(call, &dvp->fid, name); + afs_make_op_call(op, call, GFP_NOFS); } /* * deliver reply data to an FS.Rename */ -static int afs_deliver_fs_rename(struct afs_call *call, - struct sk_buff *skb, bool last) +static int afs_deliver_fs_rename(struct afs_call *call) { - struct afs_vnode *orig_dvnode = call->reply, *new_dvnode = call->reply2; + struct afs_operation *op = call->op; + struct afs_vnode_param *orig_dvp = &op->file[0]; + struct afs_vnode_param *new_dvp = &op->file[1]; const __be32 *bp; + int ret; - _enter("{%u},{%u},%d", call->unmarshall, skb->len, last); - - afs_transfer_reply(call, skb); - if (!last) - return 0; - - if (call->reply_size != call->reply_max) - return -EBADMSG; + ret = afs_transfer_reply(call); + if (ret < 0) + return ret; - /* unmarshall the reply once we've received all of it */ bp = call->buffer; - xdr_decode_AFSFetchStatus(&bp, &orig_dvnode->status, orig_dvnode, NULL); - if (new_dvnode != orig_dvnode) - xdr_decode_AFSFetchStatus(&bp, &new_dvnode->status, new_dvnode, - NULL); - /* xdr_decode_AFSVolSync(&bp, call->replyX); */ + /* If the two dirs are the same, we have two copies of the same status + * report, so we just decode it twice. + */ + xdr_decode_AFSFetchStatus(&bp, call, &orig_dvp->scb); + xdr_decode_AFSFetchStatus(&bp, call, &new_dvp->scb); + xdr_decode_AFSVolSync(&bp, &op->volsync); _leave(" = 0 [done]"); return 0; @@ -1026,32 +960,30 @@ static int afs_deliver_fs_rename(struct afs_call *call, */ static const struct afs_call_type afs_RXFSRename = { .name = "FS.Rename", + .op = afs_FS_Rename, .deliver = afs_deliver_fs_rename, - .abort_to_error = afs_abort_to_error, .destructor = afs_flat_call_destructor, }; /* - * create a symbolic link + * Rename/move a file or directory. */ -int afs_fs_rename(struct afs_server *server, - struct key *key, - struct afs_vnode *orig_dvnode, - const char *orig_name, - struct afs_vnode *new_dvnode, - const char *new_name, - const struct afs_wait_mode *wait_mode) +void afs_fs_rename(struct afs_operation *op) { + struct afs_vnode_param *orig_dvp = &op->file[0]; + struct afs_vnode_param *new_dvp = &op->file[1]; + const struct qstr *orig_name = &op->dentry->d_name; + const struct qstr *new_name = &op->dentry_2->d_name; struct afs_call *call; size_t reqsz, o_namesz, o_padsz, n_namesz, n_padsz; __be32 *bp; _enter(""); - o_namesz = strlen(orig_name); + o_namesz = orig_name->len; o_padsz = (4 - (o_namesz & 3)) & 3; - n_namesz = strlen(new_name); + n_namesz = new_name->len; n_padsz = (4 - (n_namesz & 3)) & 3; reqsz = (4 * 4) + @@ -1059,74 +991,60 @@ int afs_fs_rename(struct afs_server *server, (3 * 4) + 4 + n_namesz + n_padsz; - call = afs_alloc_flat_call(&afs_RXFSRename, reqsz, (21 + 21 + 6) * 4); + call = afs_alloc_flat_call(op->net, &afs_RXFSRename, reqsz, (21 + 21 + 6) * 4); if (!call) - return -ENOMEM; - - call->key = key; - call->reply = orig_dvnode; - call->reply2 = new_dvnode; - call->service_id = FS_SERVICE; - call->port = htons(AFS_FS_PORT); + return afs_op_nomem(op); /* marshall the parameters */ bp = call->request; *bp++ = htonl(FSRENAME); - *bp++ = htonl(orig_dvnode->fid.vid); - *bp++ = htonl(orig_dvnode->fid.vnode); - *bp++ = htonl(orig_dvnode->fid.unique); + *bp++ = htonl(orig_dvp->fid.vid); + *bp++ = htonl(orig_dvp->fid.vnode); + *bp++ = htonl(orig_dvp->fid.unique); *bp++ = htonl(o_namesz); - memcpy(bp, orig_name, o_namesz); + memcpy(bp, orig_name->name, o_namesz); bp = (void *) bp + o_namesz; if (o_padsz > 0) { memset(bp, 0, o_padsz); bp = (void *) bp + o_padsz; } - *bp++ = htonl(new_dvnode->fid.vid); - *bp++ = htonl(new_dvnode->fid.vnode); - *bp++ = htonl(new_dvnode->fid.unique); + *bp++ = htonl(new_dvp->fid.vid); + *bp++ = htonl(new_dvp->fid.vnode); + *bp++ = htonl(new_dvp->fid.unique); *bp++ = htonl(n_namesz); - memcpy(bp, new_name, n_namesz); + memcpy(bp, new_name->name, n_namesz); bp = (void *) bp + n_namesz; if (n_padsz > 0) { memset(bp, 0, n_padsz); bp = (void *) bp + n_padsz; } - return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode); + call->fid = orig_dvp->fid; + trace_afs_make_fs_call2(call, &orig_dvp->fid, orig_name, new_name); + afs_make_op_call(op, call, GFP_NOFS); } /* - * deliver reply data to an FS.StoreData + * Deliver reply data to FS.StoreData or FS.StoreStatus */ -static int afs_deliver_fs_store_data(struct afs_call *call, - struct sk_buff *skb, bool last) +static int afs_deliver_fs_store_data(struct afs_call *call) { - struct afs_vnode *vnode = call->reply; + struct afs_operation *op = call->op; + struct afs_vnode_param *vp = &op->file[0]; const __be32 *bp; + int ret; - _enter(",,%u", last); - - afs_transfer_reply(call, skb); - if (!last) { - _leave(" = 0 [more]"); - return 0; - } + _enter(""); - if (call->reply_size != call->reply_max) { - _leave(" = -EBADMSG [%u != %u]", - call->reply_size, call->reply_max); - return -EBADMSG; - } + ret = afs_transfer_reply(call); + if (ret < 0) + return ret; /* unmarshall the reply once we've received all of it */ bp = call->buffer; - xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode, - &call->store_version); - /* xdr_decode_AFSVolSync(&bp, call->replyX); */ - - afs_pages_written_back(vnode, call); + xdr_decode_AFSFetchStatus(&bp, call, &vp->scb); + xdr_decode_AFSVolSync(&bp, &op->volsync); _leave(" = 0 [done]"); return 0; @@ -1137,187 +1055,113 @@ static int afs_deliver_fs_store_data(struct afs_call *call, */ static const struct afs_call_type afs_RXFSStoreData = { .name = "FS.StoreData", + .op = afs_FS_StoreData, .deliver = afs_deliver_fs_store_data, - .abort_to_error = afs_abort_to_error, .destructor = afs_flat_call_destructor, }; static const struct afs_call_type afs_RXFSStoreData64 = { .name = "FS.StoreData64", + .op = afs_FS_StoreData64, .deliver = afs_deliver_fs_store_data, - .abort_to_error = afs_abort_to_error, .destructor = afs_flat_call_destructor, }; /* * store a set of pages to a very large file */ -static int afs_fs_store_data64(struct afs_server *server, - struct afs_writeback *wb, - pgoff_t first, pgoff_t last, - unsigned offset, unsigned to, - loff_t size, loff_t pos, loff_t i_size, - const struct afs_wait_mode *wait_mode) +static void afs_fs_store_data64(struct afs_operation *op) { - struct afs_vnode *vnode = wb->vnode; + struct afs_vnode_param *vp = &op->file[0]; struct afs_call *call; __be32 *bp; - _enter(",%x,{%x:%u},,", - key_serial(wb->key), vnode->fid.vid, vnode->fid.vnode); + _enter(",%x,{%llx:%llu},,", + key_serial(op->key), vp->fid.vid, vp->fid.vnode); - call = afs_alloc_flat_call(&afs_RXFSStoreData64, + call = afs_alloc_flat_call(op->net, &afs_RXFSStoreData64, (4 + 6 + 3 * 2) * 4, (21 + 6) * 4); if (!call) - return -ENOMEM; + return afs_op_nomem(op); - call->wb = wb; - call->key = wb->key; - call->reply = vnode; - call->service_id = FS_SERVICE; - call->port = htons(AFS_FS_PORT); - call->mapping = vnode->vfs_inode.i_mapping; - call->first = first; - call->last = last; - call->first_offset = offset; - call->last_to = to; - call->send_pages = true; - call->store_version = vnode->status.data_version + 1; + call->write_iter = op->store.write_iter; /* marshall the parameters */ bp = call->request; *bp++ = htonl(FSSTOREDATA64); - *bp++ = htonl(vnode->fid.vid); - *bp++ = htonl(vnode->fid.vnode); - *bp++ = htonl(vnode->fid.unique); + *bp++ = htonl(vp->fid.vid); + *bp++ = htonl(vp->fid.vnode); + *bp++ = htonl(vp->fid.unique); - *bp++ = 0; /* mask */ - *bp++ = 0; /* mtime */ + *bp++ = htonl(AFS_SET_MTIME); /* mask */ + *bp++ = htonl(op->mtime.tv_sec); /* mtime */ *bp++ = 0; /* owner */ *bp++ = 0; /* group */ *bp++ = 0; /* unix mode */ *bp++ = 0; /* segment size */ - *bp++ = htonl(pos >> 32); - *bp++ = htonl((u32) pos); - *bp++ = htonl(size >> 32); - *bp++ = htonl((u32) size); - *bp++ = htonl(i_size >> 32); - *bp++ = htonl((u32) i_size); + *bp++ = htonl(upper_32_bits(op->store.pos)); + *bp++ = htonl(lower_32_bits(op->store.pos)); + *bp++ = htonl(upper_32_bits(op->store.size)); + *bp++ = htonl(lower_32_bits(op->store.size)); + *bp++ = htonl(upper_32_bits(op->store.i_size)); + *bp++ = htonl(lower_32_bits(op->store.i_size)); - return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode); + call->fid = vp->fid; + trace_afs_make_fs_call(call, &vp->fid); + afs_make_op_call(op, call, GFP_NOFS); } /* - * store a set of pages + * Write data to a file on the server. */ -int afs_fs_store_data(struct afs_server *server, struct afs_writeback *wb, - pgoff_t first, pgoff_t last, - unsigned offset, unsigned to, - const struct afs_wait_mode *wait_mode) +void afs_fs_store_data(struct afs_operation *op) { - struct afs_vnode *vnode = wb->vnode; + struct afs_vnode_param *vp = &op->file[0]; struct afs_call *call; - loff_t size, pos, i_size; __be32 *bp; - _enter(",%x,{%x:%u},,", - key_serial(wb->key), vnode->fid.vid, vnode->fid.vnode); - - size = to - offset; - if (first != last) - size += (loff_t)(last - first) << PAGE_SHIFT; - pos = (loff_t)first << PAGE_SHIFT; - pos += offset; - - i_size = i_size_read(&vnode->vfs_inode); - if (pos + size > i_size) - i_size = size + pos; + _enter(",%x,{%llx:%llu},,", + key_serial(op->key), vp->fid.vid, vp->fid.vnode); _debug("size %llx, at %llx, i_size %llx", - (unsigned long long) size, (unsigned long long) pos, - (unsigned long long) i_size); + (unsigned long long)op->store.size, + (unsigned long long)op->store.pos, + (unsigned long long)op->store.i_size); - if (pos >> 32 || i_size >> 32 || size >> 32 || (pos + size) >> 32) - return afs_fs_store_data64(server, wb, first, last, offset, to, - size, pos, i_size, wait_mode); + if (test_bit(AFS_SERVER_FL_HAS_FS64, &op->server->flags)) + return afs_fs_store_data64(op); - call = afs_alloc_flat_call(&afs_RXFSStoreData, + call = afs_alloc_flat_call(op->net, &afs_RXFSStoreData, (4 + 6 + 3) * 4, (21 + 6) * 4); if (!call) - return -ENOMEM; + return afs_op_nomem(op); - call->wb = wb; - call->key = wb->key; - call->reply = vnode; - call->service_id = FS_SERVICE; - call->port = htons(AFS_FS_PORT); - call->mapping = vnode->vfs_inode.i_mapping; - call->first = first; - call->last = last; - call->first_offset = offset; - call->last_to = to; - call->send_pages = true; - call->store_version = vnode->status.data_version + 1; + call->write_iter = op->store.write_iter; /* marshall the parameters */ bp = call->request; *bp++ = htonl(FSSTOREDATA); - *bp++ = htonl(vnode->fid.vid); - *bp++ = htonl(vnode->fid.vnode); - *bp++ = htonl(vnode->fid.unique); + *bp++ = htonl(vp->fid.vid); + *bp++ = htonl(vp->fid.vnode); + *bp++ = htonl(vp->fid.unique); - *bp++ = 0; /* mask */ - *bp++ = 0; /* mtime */ + *bp++ = htonl(AFS_SET_MTIME); /* mask */ + *bp++ = htonl(op->mtime.tv_sec); /* mtime */ *bp++ = 0; /* owner */ *bp++ = 0; /* group */ *bp++ = 0; /* unix mode */ *bp++ = 0; /* segment size */ - *bp++ = htonl(pos); - *bp++ = htonl(size); - *bp++ = htonl(i_size); + *bp++ = htonl(lower_32_bits(op->store.pos)); + *bp++ = htonl(lower_32_bits(op->store.size)); + *bp++ = htonl(lower_32_bits(op->store.i_size)); - return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode); -} - -/* - * deliver reply data to an FS.StoreStatus - */ -static int afs_deliver_fs_store_status(struct afs_call *call, - struct sk_buff *skb, bool last) -{ - afs_dataversion_t *store_version; - struct afs_vnode *vnode = call->reply; - const __be32 *bp; - - _enter(",,%u", last); - - afs_transfer_reply(call, skb); - if (!last) { - _leave(" = 0 [more]"); - return 0; - } - - if (call->reply_size != call->reply_max) { - _leave(" = -EBADMSG [%u != %u]", - call->reply_size, call->reply_max); - return -EBADMSG; - } - - /* unmarshall the reply once we've received all of it */ - store_version = NULL; - if (call->operation_ID == FSSTOREDATA) - store_version = &call->store_version; - - bp = call->buffer; - xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode, store_version); - /* xdr_decode_AFSVolSync(&bp, call->replyX); */ - - _leave(" = 0 [done]"); - return 0; + call->fid = vp->fid; + trace_afs_make_fs_call(call, &vp->fid); + afs_make_op_call(op, call, GFP_NOFS); } /* @@ -1325,22 +1169,22 @@ static int afs_deliver_fs_store_status(struct afs_call *call, */ static const struct afs_call_type afs_RXFSStoreStatus = { .name = "FS.StoreStatus", - .deliver = afs_deliver_fs_store_status, - .abort_to_error = afs_abort_to_error, + .op = afs_FS_StoreStatus, + .deliver = afs_deliver_fs_store_data, .destructor = afs_flat_call_destructor, }; static const struct afs_call_type afs_RXFSStoreData_as_Status = { .name = "FS.StoreData", - .deliver = afs_deliver_fs_store_status, - .abort_to_error = afs_abort_to_error, + .op = afs_FS_StoreData, + .deliver = afs_deliver_fs_store_data, .destructor = afs_flat_call_destructor, }; static const struct afs_call_type afs_RXFSStoreData64_as_Status = { .name = "FS.StoreData64", - .deliver = afs_deliver_fs_store_status, - .abort_to_error = afs_abort_to_error, + .op = afs_FS_StoreData64, + .deliver = afs_deliver_fs_store_data, .destructor = afs_flat_call_destructor, }; @@ -1348,435 +1192,306 @@ static const struct afs_call_type afs_RXFSStoreData64_as_Status = { * set the attributes on a very large file, using FS.StoreData rather than * FS.StoreStatus so as to alter the file size also */ -static int afs_fs_setattr_size64(struct afs_server *server, struct key *key, - struct afs_vnode *vnode, struct iattr *attr, - const struct afs_wait_mode *wait_mode) +static void afs_fs_setattr_size64(struct afs_operation *op) { + struct afs_vnode_param *vp = &op->file[0]; struct afs_call *call; + struct iattr *attr = op->setattr.attr; __be32 *bp; - _enter(",%x,{%x:%u},,", - key_serial(key), vnode->fid.vid, vnode->fid.vnode); + _enter(",%x,{%llx:%llu},,", + key_serial(op->key), vp->fid.vid, vp->fid.vnode); ASSERT(attr->ia_valid & ATTR_SIZE); - call = afs_alloc_flat_call(&afs_RXFSStoreData64_as_Status, + call = afs_alloc_flat_call(op->net, &afs_RXFSStoreData64_as_Status, (4 + 6 + 3 * 2) * 4, (21 + 6) * 4); if (!call) - return -ENOMEM; - - call->key = key; - call->reply = vnode; - call->service_id = FS_SERVICE; - call->port = htons(AFS_FS_PORT); - call->store_version = vnode->status.data_version + 1; - call->operation_ID = FSSTOREDATA; + return afs_op_nomem(op); /* marshall the parameters */ bp = call->request; *bp++ = htonl(FSSTOREDATA64); - *bp++ = htonl(vnode->fid.vid); - *bp++ = htonl(vnode->fid.vnode); - *bp++ = htonl(vnode->fid.unique); + *bp++ = htonl(vp->fid.vid); + *bp++ = htonl(vp->fid.vnode); + *bp++ = htonl(vp->fid.unique); xdr_encode_AFS_StoreStatus(&bp, attr); - *bp++ = 0; /* position of start of write */ - *bp++ = 0; - *bp++ = 0; /* size of write */ + *bp++ = htonl(upper_32_bits(attr->ia_size)); /* position of start of write */ + *bp++ = htonl(lower_32_bits(attr->ia_size)); + *bp++ = 0; /* size of write */ *bp++ = 0; - *bp++ = htonl(attr->ia_size >> 32); /* new file length */ - *bp++ = htonl((u32) attr->ia_size); + *bp++ = htonl(upper_32_bits(attr->ia_size)); /* new file length */ + *bp++ = htonl(lower_32_bits(attr->ia_size)); - return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode); + call->fid = vp->fid; + trace_afs_make_fs_call(call, &vp->fid); + afs_make_op_call(op, call, GFP_NOFS); } /* * set the attributes on a file, using FS.StoreData rather than FS.StoreStatus * so as to alter the file size also */ -static int afs_fs_setattr_size(struct afs_server *server, struct key *key, - struct afs_vnode *vnode, struct iattr *attr, - const struct afs_wait_mode *wait_mode) +static void afs_fs_setattr_size(struct afs_operation *op) { + struct afs_vnode_param *vp = &op->file[0]; struct afs_call *call; + struct iattr *attr = op->setattr.attr; __be32 *bp; - _enter(",%x,{%x:%u},,", - key_serial(key), vnode->fid.vid, vnode->fid.vnode); + _enter(",%x,{%llx:%llu},,", + key_serial(op->key), vp->fid.vid, vp->fid.vnode); ASSERT(attr->ia_valid & ATTR_SIZE); - if (attr->ia_size >> 32) - return afs_fs_setattr_size64(server, key, vnode, attr, - wait_mode); + if (test_bit(AFS_SERVER_FL_HAS_FS64, &op->server->flags)) + return afs_fs_setattr_size64(op); - call = afs_alloc_flat_call(&afs_RXFSStoreData_as_Status, + call = afs_alloc_flat_call(op->net, &afs_RXFSStoreData_as_Status, (4 + 6 + 3) * 4, (21 + 6) * 4); if (!call) - return -ENOMEM; - - call->key = key; - call->reply = vnode; - call->service_id = FS_SERVICE; - call->port = htons(AFS_FS_PORT); - call->store_version = vnode->status.data_version + 1; - call->operation_ID = FSSTOREDATA; + return afs_op_nomem(op); /* marshall the parameters */ bp = call->request; *bp++ = htonl(FSSTOREDATA); - *bp++ = htonl(vnode->fid.vid); - *bp++ = htonl(vnode->fid.vnode); - *bp++ = htonl(vnode->fid.unique); + *bp++ = htonl(vp->fid.vid); + *bp++ = htonl(vp->fid.vnode); + *bp++ = htonl(vp->fid.unique); xdr_encode_AFS_StoreStatus(&bp, attr); - *bp++ = 0; /* position of start of write */ + *bp++ = htonl(attr->ia_size); /* position of start of write */ *bp++ = 0; /* size of write */ *bp++ = htonl(attr->ia_size); /* new file length */ - return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode); + call->fid = vp->fid; + trace_afs_make_fs_call(call, &vp->fid); + afs_make_op_call(op, call, GFP_NOFS); } /* * set the attributes on a file, using FS.StoreData if there's a change in file * size, and FS.StoreStatus otherwise */ -int afs_fs_setattr(struct afs_server *server, struct key *key, - struct afs_vnode *vnode, struct iattr *attr, - const struct afs_wait_mode *wait_mode) +void afs_fs_setattr(struct afs_operation *op) { + struct afs_vnode_param *vp = &op->file[0]; struct afs_call *call; + struct iattr *attr = op->setattr.attr; __be32 *bp; if (attr->ia_valid & ATTR_SIZE) - return afs_fs_setattr_size(server, key, vnode, attr, - wait_mode); + return afs_fs_setattr_size(op); - _enter(",%x,{%x:%u},,", - key_serial(key), vnode->fid.vid, vnode->fid.vnode); + _enter(",%x,{%llx:%llu},,", + key_serial(op->key), vp->fid.vid, vp->fid.vnode); - call = afs_alloc_flat_call(&afs_RXFSStoreStatus, + call = afs_alloc_flat_call(op->net, &afs_RXFSStoreStatus, (4 + 6) * 4, (21 + 6) * 4); if (!call) - return -ENOMEM; - - call->key = key; - call->reply = vnode; - call->service_id = FS_SERVICE; - call->port = htons(AFS_FS_PORT); - call->operation_ID = FSSTORESTATUS; + return afs_op_nomem(op); /* marshall the parameters */ bp = call->request; *bp++ = htonl(FSSTORESTATUS); - *bp++ = htonl(vnode->fid.vid); - *bp++ = htonl(vnode->fid.vnode); - *bp++ = htonl(vnode->fid.unique); + *bp++ = htonl(vp->fid.vid); + *bp++ = htonl(vp->fid.vnode); + *bp++ = htonl(vp->fid.unique); - xdr_encode_AFS_StoreStatus(&bp, attr); + xdr_encode_AFS_StoreStatus(&bp, op->setattr.attr); - return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode); + call->fid = vp->fid; + trace_afs_make_fs_call(call, &vp->fid); + afs_make_op_call(op, call, GFP_NOFS); } /* * deliver reply data to an FS.GetVolumeStatus */ -static int afs_deliver_fs_get_volume_status(struct afs_call *call, - struct sk_buff *skb, bool last) +static int afs_deliver_fs_get_volume_status(struct afs_call *call) { + struct afs_operation *op = call->op; const __be32 *bp; char *p; + u32 size; int ret; - _enter("{%u},{%u},%d", call->unmarshall, skb->len, last); + _enter("{%u}", call->unmarshall); switch (call->unmarshall) { case 0: - call->offset = 0; call->unmarshall++; + afs_extract_to_buf(call, 12 * 4); + fallthrough; /* extract the returned status record */ case 1: _debug("extract status"); - ret = afs_extract_data(call, skb, last, call->buffer, - 12 * 4); - switch (ret) { - case 0: break; - case -EAGAIN: return 0; - default: return ret; - } + ret = afs_extract_data(call, true); + if (ret < 0) + return ret; bp = call->buffer; - xdr_decode_AFSFetchVolumeStatus(&bp, call->reply2); - call->offset = 0; + xdr_decode_AFSFetchVolumeStatus(&bp, &op->volstatus.vs); call->unmarshall++; + afs_extract_to_tmp(call); + fallthrough; /* extract the volume name length */ case 2: - ret = afs_extract_data(call, skb, last, &call->tmp, 4); - switch (ret) { - case 0: break; - case -EAGAIN: return 0; - default: return ret; - } + ret = afs_extract_data(call, true); + if (ret < 0) + return ret; call->count = ntohl(call->tmp); _debug("volname length: %u", call->count); if (call->count >= AFSNAMEMAX) - return -EBADMSG; - call->offset = 0; + return afs_protocol_error(call, afs_eproto_volname_len); + size = (call->count + 3) & ~3; /* It's padded */ + afs_extract_to_buf(call, size); call->unmarshall++; + fallthrough; /* extract the volume name */ case 3: _debug("extract volname"); - if (call->count > 0) { - ret = afs_extract_data(call, skb, last, call->reply3, - call->count); - switch (ret) { - case 0: break; - case -EAGAIN: return 0; - default: return ret; - } - } + ret = afs_extract_data(call, true); + if (ret < 0) + return ret; - p = call->reply3; + p = call->buffer; p[call->count] = 0; _debug("volname '%s'", p); - - call->offset = 0; - call->unmarshall++; - - /* extract the volume name padding */ - if ((call->count & 3) == 0) { - call->unmarshall++; - goto no_volname_padding; - } - call->count = 4 - (call->count & 3); - - case 4: - ret = afs_extract_data(call, skb, last, call->buffer, - call->count); - switch (ret) { - case 0: break; - case -EAGAIN: return 0; - default: return ret; - } - - call->offset = 0; + afs_extract_to_tmp(call); call->unmarshall++; - no_volname_padding: + fallthrough; /* extract the offline message length */ - case 5: - ret = afs_extract_data(call, skb, last, &call->tmp, 4); - switch (ret) { - case 0: break; - case -EAGAIN: return 0; - default: return ret; - } + case 4: + ret = afs_extract_data(call, true); + if (ret < 0) + return ret; call->count = ntohl(call->tmp); _debug("offline msg length: %u", call->count); if (call->count >= AFSNAMEMAX) - return -EBADMSG; - call->offset = 0; + return afs_protocol_error(call, afs_eproto_offline_msg_len); + size = (call->count + 3) & ~3; /* It's padded */ + afs_extract_to_buf(call, size); call->unmarshall++; + fallthrough; /* extract the offline message */ - case 6: + case 5: _debug("extract offline"); - if (call->count > 0) { - ret = afs_extract_data(call, skb, last, call->reply3, - call->count); - switch (ret) { - case 0: break; - case -EAGAIN: return 0; - default: return ret; - } - } + ret = afs_extract_data(call, true); + if (ret < 0) + return ret; - p = call->reply3; + p = call->buffer; p[call->count] = 0; _debug("offline '%s'", p); - call->offset = 0; + afs_extract_to_tmp(call); call->unmarshall++; - - /* extract the offline message padding */ - if ((call->count & 3) == 0) { - call->unmarshall++; - goto no_offline_padding; - } - call->count = 4 - (call->count & 3); - - case 7: - ret = afs_extract_data(call, skb, last, call->buffer, - call->count); - switch (ret) { - case 0: break; - case -EAGAIN: return 0; - default: return ret; - } - - call->offset = 0; - call->unmarshall++; - no_offline_padding: + fallthrough; /* extract the message of the day length */ - case 8: - ret = afs_extract_data(call, skb, last, &call->tmp, 4); - switch (ret) { - case 0: break; - case -EAGAIN: return 0; - default: return ret; - } + case 6: + ret = afs_extract_data(call, true); + if (ret < 0) + return ret; call->count = ntohl(call->tmp); _debug("motd length: %u", call->count); if (call->count >= AFSNAMEMAX) - return -EBADMSG; - call->offset = 0; + return afs_protocol_error(call, afs_eproto_motd_len); + size = (call->count + 3) & ~3; /* It's padded */ + afs_extract_to_buf(call, size); call->unmarshall++; + fallthrough; /* extract the message of the day */ - case 9: + case 7: _debug("extract motd"); - if (call->count > 0) { - ret = afs_extract_data(call, skb, last, call->reply3, - call->count); - switch (ret) { - case 0: break; - case -EAGAIN: return 0; - default: return ret; - } - } + ret = afs_extract_data(call, false); + if (ret < 0) + return ret; - p = call->reply3; + p = call->buffer; p[call->count] = 0; _debug("motd '%s'", p); - call->offset = 0; call->unmarshall++; + fallthrough; - /* extract the message of the day padding */ - if ((call->count & 3) == 0) { - call->unmarshall++; - goto no_motd_padding; - } - call->count = 4 - (call->count & 3); - - case 10: - ret = afs_extract_data(call, skb, last, call->buffer, - call->count); - switch (ret) { - case 0: break; - case -EAGAIN: return 0; - default: return ret; - } - - call->offset = 0; - call->unmarshall++; - no_motd_padding: - - case 11: - _debug("trailer %d", skb->len); - if (skb->len != 0) - return -EBADMSG; + case 8: break; } - if (!last) - return 0; - _leave(" = 0 [done]"); return 0; } /* - * destroy an FS.GetVolumeStatus call - */ -static void afs_get_volume_status_call_destructor(struct afs_call *call) -{ - kfree(call->reply3); - call->reply3 = NULL; - afs_flat_call_destructor(call); -} - -/* * FS.GetVolumeStatus operation type */ static const struct afs_call_type afs_RXFSGetVolumeStatus = { .name = "FS.GetVolumeStatus", + .op = afs_FS_GetVolumeStatus, .deliver = afs_deliver_fs_get_volume_status, - .abort_to_error = afs_abort_to_error, - .destructor = afs_get_volume_status_call_destructor, + .destructor = afs_flat_call_destructor, }; /* * fetch the status of a volume */ -int afs_fs_get_volume_status(struct afs_server *server, - struct key *key, - struct afs_vnode *vnode, - struct afs_volume_status *vs, - const struct afs_wait_mode *wait_mode) +void afs_fs_get_volume_status(struct afs_operation *op) { + struct afs_vnode_param *vp = &op->file[0]; struct afs_call *call; __be32 *bp; - void *tmpbuf; _enter(""); - tmpbuf = kmalloc(AFSOPAQUEMAX, GFP_KERNEL); - if (!tmpbuf) - return -ENOMEM; - - call = afs_alloc_flat_call(&afs_RXFSGetVolumeStatus, 2 * 4, 12 * 4); - if (!call) { - kfree(tmpbuf); - return -ENOMEM; - } - - call->key = key; - call->reply = vnode; - call->reply2 = vs; - call->reply3 = tmpbuf; - call->service_id = FS_SERVICE; - call->port = htons(AFS_FS_PORT); + call = afs_alloc_flat_call(op->net, &afs_RXFSGetVolumeStatus, 2 * 4, + max(12 * 4, AFSOPAQUEMAX + 1)); + if (!call) + return afs_op_nomem(op); /* marshall the parameters */ bp = call->request; bp[0] = htonl(FSGETVOLUMESTATUS); - bp[1] = htonl(vnode->fid.vid); + bp[1] = htonl(vp->fid.vid); - return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode); + call->fid = vp->fid; + trace_afs_make_fs_call(call, &vp->fid); + afs_make_op_call(op, call, GFP_NOFS); } /* * deliver reply data to an FS.SetLock, FS.ExtendLock or FS.ReleaseLock */ -static int afs_deliver_fs_xxxx_lock(struct afs_call *call, - struct sk_buff *skb, bool last) +static int afs_deliver_fs_xxxx_lock(struct afs_call *call) { + struct afs_operation *op = call->op; const __be32 *bp; + int ret; - _enter("{%u},{%u},%d", call->unmarshall, skb->len, last); - - afs_transfer_reply(call, skb); - if (!last) - return 0; + _enter("{%u}", call->unmarshall); - if (call->reply_size != call->reply_max) - return -EBADMSG; + ret = afs_transfer_reply(call); + if (ret < 0) + return ret; /* unmarshall the reply once we've received all of it */ bp = call->buffer; - /* xdr_decode_AFSVolSync(&bp, call->replyX); */ + xdr_decode_AFSVolSync(&bp, &op->volsync); _leave(" = 0 [done]"); return 0; @@ -1787,8 +1502,9 @@ static int afs_deliver_fs_xxxx_lock(struct afs_call *call, */ static const struct afs_call_type afs_RXFSSetLock = { .name = "FS.SetLock", + .op = afs_FS_SetLock, .deliver = afs_deliver_fs_xxxx_lock, - .abort_to_error = afs_abort_to_error, + .done = afs_lock_op_done, .destructor = afs_flat_call_destructor, }; @@ -1797,8 +1513,9 @@ static const struct afs_call_type afs_RXFSSetLock = { */ static const struct afs_call_type afs_RXFSExtendLock = { .name = "FS.ExtendLock", + .op = afs_FS_ExtendLock, .deliver = afs_deliver_fs_xxxx_lock, - .abort_to_error = afs_abort_to_error, + .done = afs_lock_op_done, .destructor = afs_flat_call_destructor, }; @@ -1807,105 +1524,611 @@ static const struct afs_call_type afs_RXFSExtendLock = { */ static const struct afs_call_type afs_RXFSReleaseLock = { .name = "FS.ReleaseLock", + .op = afs_FS_ReleaseLock, .deliver = afs_deliver_fs_xxxx_lock, - .abort_to_error = afs_abort_to_error, .destructor = afs_flat_call_destructor, }; /* - * get a lock on a file + * Set a lock on a file */ -int afs_fs_set_lock(struct afs_server *server, - struct key *key, - struct afs_vnode *vnode, - afs_lock_type_t type, - const struct afs_wait_mode *wait_mode) +void afs_fs_set_lock(struct afs_operation *op) { + struct afs_vnode_param *vp = &op->file[0]; struct afs_call *call; __be32 *bp; _enter(""); - call = afs_alloc_flat_call(&afs_RXFSSetLock, 5 * 4, 6 * 4); + call = afs_alloc_flat_call(op->net, &afs_RXFSSetLock, 5 * 4, 6 * 4); if (!call) - return -ENOMEM; - - call->key = key; - call->reply = vnode; - call->service_id = FS_SERVICE; - call->port = htons(AFS_FS_PORT); + return afs_op_nomem(op); /* marshall the parameters */ bp = call->request; *bp++ = htonl(FSSETLOCK); - *bp++ = htonl(vnode->fid.vid); - *bp++ = htonl(vnode->fid.vnode); - *bp++ = htonl(vnode->fid.unique); - *bp++ = htonl(type); - - return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode); + *bp++ = htonl(vp->fid.vid); + *bp++ = htonl(vp->fid.vnode); + *bp++ = htonl(vp->fid.unique); + *bp++ = htonl(op->lock.type); + + call->fid = vp->fid; + trace_afs_make_fs_calli(call, &vp->fid, op->lock.type); + afs_make_op_call(op, call, GFP_NOFS); } /* * extend a lock on a file */ -int afs_fs_extend_lock(struct afs_server *server, - struct key *key, - struct afs_vnode *vnode, - const struct afs_wait_mode *wait_mode) +void afs_fs_extend_lock(struct afs_operation *op) { + struct afs_vnode_param *vp = &op->file[0]; struct afs_call *call; __be32 *bp; _enter(""); - call = afs_alloc_flat_call(&afs_RXFSExtendLock, 4 * 4, 6 * 4); + call = afs_alloc_flat_call(op->net, &afs_RXFSExtendLock, 4 * 4, 6 * 4); if (!call) - return -ENOMEM; - - call->key = key; - call->reply = vnode; - call->service_id = FS_SERVICE; - call->port = htons(AFS_FS_PORT); + return afs_op_nomem(op); /* marshall the parameters */ bp = call->request; *bp++ = htonl(FSEXTENDLOCK); - *bp++ = htonl(vnode->fid.vid); - *bp++ = htonl(vnode->fid.vnode); - *bp++ = htonl(vnode->fid.unique); + *bp++ = htonl(vp->fid.vid); + *bp++ = htonl(vp->fid.vnode); + *bp++ = htonl(vp->fid.unique); - return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode); + call->fid = vp->fid; + trace_afs_make_fs_call(call, &vp->fid); + afs_make_op_call(op, call, GFP_NOFS); } /* * release a lock on a file */ -int afs_fs_release_lock(struct afs_server *server, - struct key *key, - struct afs_vnode *vnode, - const struct afs_wait_mode *wait_mode) +void afs_fs_release_lock(struct afs_operation *op) { + struct afs_vnode_param *vp = &op->file[0]; struct afs_call *call; __be32 *bp; _enter(""); - call = afs_alloc_flat_call(&afs_RXFSReleaseLock, 4 * 4, 6 * 4); + call = afs_alloc_flat_call(op->net, &afs_RXFSReleaseLock, 4 * 4, 6 * 4); + if (!call) + return afs_op_nomem(op); + + /* marshall the parameters */ + bp = call->request; + *bp++ = htonl(FSRELEASELOCK); + *bp++ = htonl(vp->fid.vid); + *bp++ = htonl(vp->fid.vnode); + *bp++ = htonl(vp->fid.unique); + + call->fid = vp->fid; + trace_afs_make_fs_call(call, &vp->fid); + afs_make_op_call(op, call, GFP_NOFS); +} + +/* + * Deliver reply data to an FS.GiveUpAllCallBacks operation. + */ +static int afs_deliver_fs_give_up_all_callbacks(struct afs_call *call) +{ + return afs_transfer_reply(call); +} + +/* + * FS.GiveUpAllCallBacks operation type + */ +static const struct afs_call_type afs_RXFSGiveUpAllCallBacks = { + .name = "FS.GiveUpAllCallBacks", + .op = afs_FS_GiveUpAllCallBacks, + .deliver = afs_deliver_fs_give_up_all_callbacks, + .destructor = afs_flat_call_destructor, +}; + +/* + * Flush all the callbacks we have on a server. + */ +int afs_fs_give_up_all_callbacks(struct afs_net *net, struct afs_server *server, + struct afs_address *addr, struct key *key) +{ + struct afs_call *call; + __be32 *bp; + int ret; + + _enter(""); + + call = afs_alloc_flat_call(net, &afs_RXFSGiveUpAllCallBacks, 1 * 4, 0); if (!call) return -ENOMEM; - call->key = key; - call->reply = vnode; - call->service_id = FS_SERVICE; - call->port = htons(AFS_FS_PORT); + call->key = key; + call->peer = rxrpc_kernel_get_peer(addr->peer); + call->service_id = server->service_id; /* marshall the parameters */ bp = call->request; - *bp++ = htonl(FSRELEASELOCK); - *bp++ = htonl(vnode->fid.vid); - *bp++ = htonl(vnode->fid.vnode); - *bp++ = htonl(vnode->fid.unique); + *bp++ = htonl(FSGIVEUPALLCALLBACKS); + + call->server = afs_use_server(server, false, afs_server_trace_use_give_up_cb); + afs_make_call(call, GFP_NOFS); + afs_wait_for_call_to_complete(call); + ret = call->error; + if (call->responded) + set_bit(AFS_SERVER_FL_RESPONDING, &server->flags); + afs_put_call(call); + return ret; +} + +/* + * Deliver reply data to an FS.GetCapabilities operation. + */ +static int afs_deliver_fs_get_capabilities(struct afs_call *call) +{ + u32 count; + int ret; + + _enter("{%u,%zu}", call->unmarshall, iov_iter_count(call->iter)); - return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode); + switch (call->unmarshall) { + case 0: + afs_extract_to_tmp(call); + call->unmarshall++; + fallthrough; + + /* Extract the capabilities word count */ + case 1: + ret = afs_extract_data(call, true); + if (ret < 0) + return ret; + + count = ntohl(call->tmp); + call->count = count; + call->count2 = count; + if (count == 0) { + call->unmarshall = 4; + call->tmp = 0; + break; + } + + /* Extract the first word of the capabilities to call->tmp */ + afs_extract_to_tmp(call); + call->unmarshall++; + fallthrough; + + case 2: + ret = afs_extract_data(call, false); + if (ret < 0) + return ret; + + afs_extract_discard(call, (count - 1) * sizeof(__be32)); + call->unmarshall++; + fallthrough; + + /* Extract remaining capabilities words */ + case 3: + ret = afs_extract_data(call, false); + if (ret < 0) + return ret; + + call->unmarshall++; + break; + } + + _leave(" = 0 [done]"); + return 0; +} + +static void afs_fs_get_capabilities_destructor(struct afs_call *call) +{ + afs_put_endpoint_state(call->probe, afs_estate_trace_put_getcaps); + afs_flat_call_destructor(call); +} + +/* + * FS.GetCapabilities operation type + */ +static const struct afs_call_type afs_RXFSGetCapabilities = { + .name = "FS.GetCapabilities", + .op = afs_FS_GetCapabilities, + .deliver = afs_deliver_fs_get_capabilities, + .done = afs_fileserver_probe_result, + .immediate_cancel = afs_fileserver_probe_result, + .destructor = afs_fs_get_capabilities_destructor, +}; + +/* + * Probe a fileserver for the capabilities that it supports. This RPC can + * reply with up to 196 words. The operation is asynchronous and if we managed + * to allocate a call, true is returned the result is delivered through the + * ->done() - otherwise we return false to indicate we didn't even try. + */ +bool afs_fs_get_capabilities(struct afs_net *net, struct afs_server *server, + struct afs_endpoint_state *estate, unsigned int addr_index, + struct key *key) +{ + struct afs_call *call; + __be32 *bp; + + _enter(""); + + call = afs_alloc_flat_call(net, &afs_RXFSGetCapabilities, 1 * 4, 16 * 4); + if (!call) + return false; + + call->key = key; + call->server = afs_use_server(server, false, afs_server_trace_use_get_caps); + call->peer = rxrpc_kernel_get_peer(estate->addresses->addrs[addr_index].peer); + call->probe = afs_get_endpoint_state(estate, afs_estate_trace_get_getcaps); + call->probe_index = addr_index; + call->service_id = server->service_id; + call->upgrade = true; + call->async = true; + call->max_lifespan = AFS_PROBE_MAX_LIFESPAN; + + /* marshall the parameters */ + bp = call->request; + *bp++ = htonl(FSGETCAPABILITIES); + + trace_afs_make_fs_call(call, NULL); + afs_make_call(call, GFP_NOFS); + afs_put_call(call); + return true; +} + +/* + * Deliver reply data to an FS.InlineBulkStatus call + */ +static int afs_deliver_fs_inline_bulk_status(struct afs_call *call) +{ + struct afs_operation *op = call->op; + struct afs_status_cb *scb; + const __be32 *bp; + u32 tmp; + int ret; + + _enter("{%u}", call->unmarshall); + + switch (call->unmarshall) { + case 0: + afs_extract_to_tmp(call); + call->unmarshall++; + fallthrough; + + /* Extract the file status count and array in two steps */ + case 1: + _debug("extract status count"); + ret = afs_extract_data(call, true); + if (ret < 0) + return ret; + + tmp = ntohl(call->tmp); + _debug("status count: %u/%u", tmp, op->nr_files); + if (tmp != op->nr_files) + return afs_protocol_error(call, afs_eproto_ibulkst_count); + + call->count = 0; + call->unmarshall++; + more_counts: + afs_extract_to_buf(call, 21 * sizeof(__be32)); + fallthrough; + + case 2: + _debug("extract status array %u", call->count); + ret = afs_extract_data(call, true); + if (ret < 0) + return ret; + + switch (call->count) { + case 0: + scb = &op->file[0].scb; + break; + case 1: + scb = &op->file[1].scb; + break; + default: + scb = &op->more_files[call->count - 2].scb; + break; + } + + bp = call->buffer; + xdr_decode_AFSFetchStatus(&bp, call, scb); + + call->count++; + if (call->count < op->nr_files) + goto more_counts; + + call->count = 0; + call->unmarshall++; + afs_extract_to_tmp(call); + fallthrough; + + /* Extract the callback count and array in two steps */ + case 3: + _debug("extract CB count"); + ret = afs_extract_data(call, true); + if (ret < 0) + return ret; + + tmp = ntohl(call->tmp); + _debug("CB count: %u", tmp); + if (tmp != op->nr_files) + return afs_protocol_error(call, afs_eproto_ibulkst_cb_count); + call->count = 0; + call->unmarshall++; + more_cbs: + afs_extract_to_buf(call, 3 * sizeof(__be32)); + fallthrough; + + case 4: + _debug("extract CB array"); + ret = afs_extract_data(call, true); + if (ret < 0) + return ret; + + _debug("unmarshall CB array"); + switch (call->count) { + case 0: + scb = &op->file[0].scb; + break; + case 1: + scb = &op->file[1].scb; + break; + default: + scb = &op->more_files[call->count - 2].scb; + break; + } + + bp = call->buffer; + xdr_decode_AFSCallBack(&bp, call, scb); + call->count++; + if (call->count < op->nr_files) + goto more_cbs; + + afs_extract_to_buf(call, 6 * sizeof(__be32)); + call->unmarshall++; + fallthrough; + + case 5: + ret = afs_extract_data(call, false); + if (ret < 0) + return ret; + + bp = call->buffer; + /* Unfortunately, prior to OpenAFS-1.6, volsync here is filled + * with rubbish. + */ + xdr_decode_AFSVolSync(&bp, NULL); + + call->unmarshall++; + fallthrough; + + case 6: + break; + } + + _leave(" = 0 [done]"); + return 0; +} + +static void afs_done_fs_inline_bulk_status(struct afs_call *call) +{ + if (call->error == -ECONNABORTED && + call->abort_code == RX_INVALID_OPERATION) { + set_bit(AFS_SERVER_FL_NO_IBULK, &call->server->flags); + if (call->op) + set_bit(AFS_VOLUME_MAYBE_NO_IBULK, &call->op->volume->flags); + } +} + +/* + * FS.InlineBulkStatus operation type + */ +static const struct afs_call_type afs_RXFSInlineBulkStatus = { + .name = "FS.InlineBulkStatus", + .op = afs_FS_InlineBulkStatus, + .deliver = afs_deliver_fs_inline_bulk_status, + .done = afs_done_fs_inline_bulk_status, + .destructor = afs_flat_call_destructor, +}; + +/* + * Fetch the status information for up to 50 files + */ +void afs_fs_inline_bulk_status(struct afs_operation *op) +{ + struct afs_vnode_param *dvp = &op->file[0]; + struct afs_vnode_param *vp = &op->file[1]; + struct afs_call *call; + __be32 *bp; + int i; + + if (test_bit(AFS_SERVER_FL_NO_IBULK, &op->server->flags)) { + afs_op_set_error(op, -ENOTSUPP); + return; + } + + _enter(",%x,{%llx:%llu},%u", + key_serial(op->key), vp->fid.vid, vp->fid.vnode, op->nr_files); + + call = afs_alloc_flat_call(op->net, &afs_RXFSInlineBulkStatus, + (2 + op->nr_files * 3) * 4, + 21 * 4); + if (!call) + return afs_op_nomem(op); + + /* marshall the parameters */ + bp = call->request; + *bp++ = htonl(FSINLINEBULKSTATUS); + *bp++ = htonl(op->nr_files); + *bp++ = htonl(dvp->fid.vid); + *bp++ = htonl(dvp->fid.vnode); + *bp++ = htonl(dvp->fid.unique); + *bp++ = htonl(vp->fid.vid); + *bp++ = htonl(vp->fid.vnode); + *bp++ = htonl(vp->fid.unique); + for (i = 0; i < op->nr_files - 2; i++) { + *bp++ = htonl(op->more_files[i].fid.vid); + *bp++ = htonl(op->more_files[i].fid.vnode); + *bp++ = htonl(op->more_files[i].fid.unique); + } + + call->fid = vp->fid; + trace_afs_make_fs_call(call, &vp->fid); + afs_make_op_call(op, call, GFP_NOFS); +} + +/* + * deliver reply data to an FS.FetchACL + */ +static int afs_deliver_fs_fetch_acl(struct afs_call *call) +{ + struct afs_operation *op = call->op; + struct afs_vnode_param *vp = &op->file[0]; + struct afs_acl *acl; + const __be32 *bp; + unsigned int size; + int ret; + + _enter("{%u}", call->unmarshall); + + switch (call->unmarshall) { + case 0: + afs_extract_to_tmp(call); + call->unmarshall++; + fallthrough; + + /* extract the returned data length */ + case 1: + ret = afs_extract_data(call, true); + if (ret < 0) + return ret; + + size = call->count2 = ntohl(call->tmp); + size = round_up(size, 4); + + acl = kmalloc(struct_size(acl, data, size), GFP_KERNEL); + if (!acl) + return -ENOMEM; + op->acl = acl; + acl->size = call->count2; + afs_extract_begin(call, acl->data, size); + call->unmarshall++; + fallthrough; + + /* extract the returned data */ + case 2: + ret = afs_extract_data(call, true); + if (ret < 0) + return ret; + + afs_extract_to_buf(call, (21 + 6) * 4); + call->unmarshall++; + fallthrough; + + /* extract the metadata */ + case 3: + ret = afs_extract_data(call, false); + if (ret < 0) + return ret; + + bp = call->buffer; + xdr_decode_AFSFetchStatus(&bp, call, &vp->scb); + xdr_decode_AFSVolSync(&bp, &op->volsync); + + call->unmarshall++; + fallthrough; + + case 4: + break; + } + + _leave(" = 0 [done]"); + return 0; +} + +/* + * FS.FetchACL operation type + */ +static const struct afs_call_type afs_RXFSFetchACL = { + .name = "FS.FetchACL", + .op = afs_FS_FetchACL, + .deliver = afs_deliver_fs_fetch_acl, +}; + +/* + * Fetch the ACL for a file. + */ +void afs_fs_fetch_acl(struct afs_operation *op) +{ + struct afs_vnode_param *vp = &op->file[0]; + struct afs_call *call; + __be32 *bp; + + _enter(",%x,{%llx:%llu},,", + key_serial(op->key), vp->fid.vid, vp->fid.vnode); + + call = afs_alloc_flat_call(op->net, &afs_RXFSFetchACL, 16, (21 + 6) * 4); + if (!call) + return afs_op_nomem(op); + + /* marshall the parameters */ + bp = call->request; + bp[0] = htonl(FSFETCHACL); + bp[1] = htonl(vp->fid.vid); + bp[2] = htonl(vp->fid.vnode); + bp[3] = htonl(vp->fid.unique); + + call->fid = vp->fid; + trace_afs_make_fs_call(call, &vp->fid); + afs_make_op_call(op, call, GFP_KERNEL); +} + +/* + * FS.StoreACL operation type + */ +static const struct afs_call_type afs_RXFSStoreACL = { + .name = "FS.StoreACL", + .op = afs_FS_StoreACL, + .deliver = afs_deliver_fs_file_status_and_vol, + .destructor = afs_flat_call_destructor, +}; + +/* + * Fetch the ACL for a file. + */ +void afs_fs_store_acl(struct afs_operation *op) +{ + struct afs_vnode_param *vp = &op->file[0]; + struct afs_call *call; + const struct afs_acl *acl = op->acl; + size_t size; + __be32 *bp; + + _enter(",%x,{%llx:%llu},,", + key_serial(op->key), vp->fid.vid, vp->fid.vnode); + + size = round_up(acl->size, 4); + call = afs_alloc_flat_call(op->net, &afs_RXFSStoreACL, + 5 * 4 + size, (21 + 6) * 4); + if (!call) + return afs_op_nomem(op); + + /* marshall the parameters */ + bp = call->request; + bp[0] = htonl(FSSTOREACL); + bp[1] = htonl(vp->fid.vid); + bp[2] = htonl(vp->fid.vnode); + bp[3] = htonl(vp->fid.unique); + bp[4] = htonl(acl->size); + memcpy(&bp[5], acl->data, acl->size); + if (acl->size != size) + memset((void *)&bp[5] + acl->size, 0, size - acl->size); + + call->fid = vp->fid; + trace_afs_make_fs_call(call, &vp->fid); + afs_make_op_call(op, call, GFP_KERNEL); } diff --git a/fs/afs/inode.c b/fs/afs/inode.c index 789bc253b5f6..dde1857fcabb 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -21,97 +21,480 @@ #include <linux/sched.h> #include <linux/mount.h> #include <linux/namei.h> +#include <linux/iversion.h> #include "internal.h" +#include "afs_fs.h" -struct afs_iget_data { - struct afs_fid fid; - struct afs_volume *volume; /* volume on which resides */ +void afs_init_new_symlink(struct afs_vnode *vnode, struct afs_operation *op) +{ + size_t size = strlen(op->create.symlink) + 1; + size_t dsize = 0; + char *p; + + if (netfs_alloc_folioq_buffer(NULL, &vnode->directory, &dsize, size, + mapping_gfp_mask(vnode->netfs.inode.i_mapping)) < 0) + return; + + vnode->directory_size = dsize; + p = kmap_local_folio(folioq_folio(vnode->directory, 0), 0); + memcpy(p, op->create.symlink, size); + kunmap_local(p); + set_bit(AFS_VNODE_DIR_READ, &vnode->flags); + netfs_single_mark_inode_dirty(&vnode->netfs.inode); +} + +static void afs_put_link(void *arg) +{ + struct folio *folio = virt_to_folio(arg); + + kunmap_local(arg); + folio_put(folio); +} + +const char *afs_get_link(struct dentry *dentry, struct inode *inode, + struct delayed_call *callback) +{ + struct afs_vnode *vnode = AFS_FS_I(inode); + struct folio *folio; + char *content; + ssize_t ret; + + if (!dentry) { + /* RCU pathwalk. */ + if (!test_bit(AFS_VNODE_DIR_READ, &vnode->flags) || !afs_check_validity(vnode)) + return ERR_PTR(-ECHILD); + goto good; + } + + if (test_bit(AFS_VNODE_DIR_READ, &vnode->flags)) + goto fetch; + + ret = afs_validate(vnode, NULL); + if (ret < 0) + return ERR_PTR(ret); + + if (!test_and_clear_bit(AFS_VNODE_ZAP_DATA, &vnode->flags) && + test_bit(AFS_VNODE_DIR_READ, &vnode->flags)) + goto good; + +fetch: + ret = afs_read_single(vnode, NULL); + if (ret < 0) + return ERR_PTR(ret); + set_bit(AFS_VNODE_DIR_READ, &vnode->flags); + +good: + folio = folioq_folio(vnode->directory, 0); + folio_get(folio); + content = kmap_local_folio(folio, 0); + set_delayed_call(callback, afs_put_link, content); + return content; +} + +int afs_readlink(struct dentry *dentry, char __user *buffer, int buflen) +{ + DEFINE_DELAYED_CALL(done); + const char *content; + int len; + + content = afs_get_link(dentry, d_inode(dentry), &done); + if (IS_ERR(content)) { + do_delayed_call(&done); + return PTR_ERR(content); + } + + len = umin(strlen(content), buflen); + if (copy_to_user(buffer, content, len)) + len = -EFAULT; + do_delayed_call(&done); + return len; +} + +static const struct inode_operations afs_symlink_inode_operations = { + .get_link = afs_get_link, + .readlink = afs_readlink, }; +static noinline void dump_vnode(struct afs_vnode *vnode, struct afs_vnode *parent_vnode) +{ + static unsigned long once_only; + + pr_warn("kAFS: AFS vnode with undefined type %u\n", vnode->status.type); + pr_warn("kAFS: A=%d m=%o s=%llx v=%llx\n", + vnode->status.abort_code, + vnode->status.mode, + vnode->status.size, + vnode->status.data_version); + pr_warn("kAFS: vnode %llx:%llx:%x\n", + vnode->fid.vid, + vnode->fid.vnode, + vnode->fid.unique); + if (parent_vnode) + pr_warn("kAFS: dir %llx:%llx:%x\n", + parent_vnode->fid.vid, + parent_vnode->fid.vnode, + parent_vnode->fid.unique); + + if (!test_and_set_bit(0, &once_only)) + dump_stack(); +} + +/* + * Set parameters for the netfs library + */ +static void afs_set_netfs_context(struct afs_vnode *vnode) +{ + netfs_inode_init(&vnode->netfs, &afs_req_ops, true); +} + /* - * map the AFS file status to the inode member variables + * Initialise an inode from the vnode status. */ -static int afs_inode_map_status(struct afs_vnode *vnode, struct key *key) +static int afs_inode_init_from_status(struct afs_operation *op, + struct afs_vnode_param *vp, + struct afs_vnode *vnode) { + struct afs_file_status *status = &vp->scb.status; struct inode *inode = AFS_VNODE_TO_I(vnode); + struct timespec64 t; + + _enter("{%llx:%llu.%u} %s", + vp->fid.vid, vp->fid.vnode, vp->fid.unique, + op->type ? op->type->name : "???"); _debug("FS: ft=%d lk=%d sz=%llu ver=%Lu mod=%hu", - vnode->status.type, - vnode->status.nlink, - (unsigned long long) vnode->status.size, - vnode->status.data_version, - vnode->status.mode); + status->type, + status->nlink, + (unsigned long long) status->size, + status->data_version, + status->mode); + + write_seqlock(&vnode->cb_lock); + + vnode->cb_v_check = op->cb_v_break; + vnode->status = *status; + + t = status->mtime_client; + inode_set_ctime_to_ts(inode, t); + inode_set_mtime_to_ts(inode, t); + inode_set_atime_to_ts(inode, t); + inode->i_flags |= S_NOATIME; + inode->i_uid = make_kuid(&init_user_ns, status->owner); + inode->i_gid = make_kgid(&init_user_ns, status->group); + set_nlink(&vnode->netfs.inode, status->nlink); - switch (vnode->status.type) { + switch (status->type) { case AFS_FTYPE_FILE: - inode->i_mode = S_IFREG | vnode->status.mode; + inode->i_mode = S_IFREG | (status->mode & S_IALLUGO); inode->i_op = &afs_file_inode_operations; inode->i_fop = &afs_file_operations; + inode->i_mapping->a_ops = &afs_file_aops; + mapping_set_large_folios(inode->i_mapping); break; case AFS_FTYPE_DIR: - inode->i_mode = S_IFDIR | vnode->status.mode; + inode->i_mode = S_IFDIR | (status->mode & S_IALLUGO); inode->i_op = &afs_dir_inode_operations; inode->i_fop = &afs_dir_file_operations; + inode->i_mapping->a_ops = &afs_dir_aops; + __set_bit(NETFS_ICTX_SINGLE_NO_UPLOAD, &vnode->netfs.flags); + /* Assume locally cached directory data will be valid. */ + __set_bit(AFS_VNODE_DIR_VALID, &vnode->flags); break; case AFS_FTYPE_SYMLINK: - inode->i_mode = S_IFLNK | vnode->status.mode; - inode->i_op = &page_symlink_inode_operations; + /* Symlinks with a mode of 0644 are actually mountpoints. */ + if ((status->mode & 0777) == 0644) { + inode->i_flags |= S_AUTOMOUNT; + + set_bit(AFS_VNODE_MOUNTPOINT, &vnode->flags); + + inode->i_mode = S_IFDIR | 0555; + inode->i_op = &afs_mntpt_inode_operations; + inode->i_fop = &afs_mntpt_file_operations; + } else { + inode->i_mode = S_IFLNK | status->mode; + inode->i_op = &afs_symlink_inode_operations; + } + inode->i_mapping->a_ops = &afs_dir_aops; + inode_nohighmem(inode); + mapping_set_release_always(inode->i_mapping); break; default: - printk("kAFS: AFS vnode with undefined type\n"); - return -EBADMSG; + dump_vnode(vnode, op->file[0].vnode != vnode ? op->file[0].vnode : NULL); + write_sequnlock(&vnode->cb_lock); + return afs_protocol_error(NULL, afs_eproto_file_type); } -#ifdef CONFIG_AFS_FSCACHE - if (vnode->status.size != inode->i_size) - fscache_attr_changed(vnode->cache); -#endif + afs_set_i_size(vnode, status->size); + afs_set_netfs_context(vnode); - set_nlink(inode, vnode->status.nlink); - inode->i_uid = vnode->status.owner; - inode->i_gid = GLOBAL_ROOT_GID; - inode->i_size = vnode->status.size; - inode->i_ctime.tv_sec = vnode->status.mtime_server; - inode->i_ctime.tv_nsec = 0; - inode->i_atime = inode->i_mtime = inode->i_ctime; - inode->i_blocks = 0; - inode->i_generation = vnode->fid.unique; - inode->i_version = vnode->status.data_version; - inode->i_mapping->a_ops = &afs_fs_aops; + vnode->invalid_before = status->data_version; + trace_afs_set_dv(vnode, status->data_version); + inode_set_iversion_raw(&vnode->netfs.inode, status->data_version); - /* check to see whether a symbolic link is really a mountpoint */ - if (vnode->status.type == AFS_FTYPE_SYMLINK) { - afs_mntpt_check_symlink(vnode, key); + if (!vp->scb.have_cb) { + /* it's a symlink we just created (the fileserver + * didn't give us a callback) */ + afs_clear_cb_promise(vnode, afs_cb_promise_set_new_symlink); + } else { + vnode->cb_server = op->server; + afs_set_cb_promise(vnode, vp->scb.callback.expires_at, + afs_cb_promise_set_new_inode); + } - if (test_bit(AFS_VNODE_MOUNTPOINT, &vnode->flags)) { - inode->i_mode = S_IFDIR | vnode->status.mode; - inode->i_op = &afs_mntpt_inode_operations; - inode->i_fop = &afs_mntpt_file_operations; + write_sequnlock(&vnode->cb_lock); + return 0; +} + +/* + * Update the core inode struct from a returned status record. + */ +static void afs_apply_status(struct afs_operation *op, + struct afs_vnode_param *vp) +{ + struct afs_file_status *status = &vp->scb.status; + struct afs_vnode *vnode = vp->vnode; + struct inode *inode = &vnode->netfs.inode; + struct timespec64 t; + umode_t mode; + bool unexpected_jump = false; + bool data_changed = false; + bool change_size = vp->set_size; + + _enter("{%llx:%llu.%u} %s", + vp->fid.vid, vp->fid.vnode, vp->fid.unique, + op->type ? op->type->name : "???"); + + BUG_ON(test_bit(AFS_VNODE_UNSET, &vnode->flags)); + + if (status->type != vnode->status.type) { + pr_warn("Vnode %llx:%llx:%x changed type %u to %u\n", + vnode->fid.vid, + vnode->fid.vnode, + vnode->fid.unique, + status->type, vnode->status.type); + afs_protocol_error(NULL, afs_eproto_bad_status); + return; + } + + if (status->nlink != vnode->status.nlink) + set_nlink(inode, status->nlink); + + if (status->owner != vnode->status.owner) + inode->i_uid = make_kuid(&init_user_ns, status->owner); + + if (status->group != vnode->status.group) + inode->i_gid = make_kgid(&init_user_ns, status->group); + + if (status->mode != vnode->status.mode) { + mode = inode->i_mode; + mode &= ~S_IALLUGO; + mode |= status->mode & S_IALLUGO; + WRITE_ONCE(inode->i_mode, mode); + } + + t = status->mtime_client; + inode_set_mtime_to_ts(inode, t); + if (vp->update_ctime) + inode_set_ctime_to_ts(inode, op->ctime); + + if (vnode->status.data_version != status->data_version) { + trace_afs_set_dv(vnode, status->data_version); + data_changed = true; + } + + vnode->status = *status; + + if (vp->dv_before + vp->dv_delta != status->data_version) { + trace_afs_dv_mismatch(vnode, vp->dv_before, vp->dv_delta, + status->data_version); + + if (vnode->cb_ro_snapshot == atomic_read(&vnode->volume->cb_ro_snapshot) && + atomic64_read(&vnode->cb_expires_at) != AFS_NO_CB_PROMISE) + pr_warn("kAFS: vnode modified {%llx:%llu} %llx->%llx %s (op=%x)\n", + vnode->fid.vid, vnode->fid.vnode, + (unsigned long long)vp->dv_before + vp->dv_delta, + (unsigned long long)status->data_version, + op->type ? op->type->name : "???", + op->debug_id); + + vnode->invalid_before = status->data_version; + if (vnode->status.type == AFS_FTYPE_DIR) + afs_invalidate_dir(vnode, afs_dir_invalid_dv_mismatch); + else + set_bit(AFS_VNODE_ZAP_DATA, &vnode->flags); + change_size = true; + data_changed = true; + unexpected_jump = true; + } else if (vnode->status.type == AFS_FTYPE_DIR) { + /* Expected directory change is handled elsewhere so + * that we can locally edit the directory and save on a + * download. + */ + if (test_bit(AFS_VNODE_DIR_VALID, &vnode->flags)) + data_changed = false; + change_size = true; + } + + if (data_changed) { + inode_set_iversion_raw(inode, status->data_version); + + /* Only update the size if the data version jumped. If the + * file is being modified locally, then we might have our own + * idea of what the size should be that's not the same as + * what's on the server. + */ + vnode->netfs.remote_i_size = status->size; + if (change_size || status->size > i_size_read(inode)) { + afs_set_i_size(vnode, status->size); + if (unexpected_jump) + vnode->netfs.zero_point = status->size; + inode_set_ctime_to_ts(inode, t); + inode_set_atime_to_ts(inode, t); } + if (op->ops == &afs_fetch_data_operation) + op->fetch.subreq->rreq->i_size = status->size; } +} - return 0; +/* + * Apply a callback to a vnode. + */ +static void afs_apply_callback(struct afs_operation *op, + struct afs_vnode_param *vp) +{ + struct afs_callback *cb = &vp->scb.callback; + struct afs_vnode *vnode = vp->vnode; + + if (!afs_cb_is_broken(vp->cb_break_before, vnode)) { + if (op->volume->type == AFSVL_RWVOL) + vnode->cb_server = op->server; + afs_set_cb_promise(vnode, cb->expires_at, afs_cb_promise_set_apply_cb); + } } /* - * iget5() comparator + * Apply the received status and callback to an inode all in the same critical + * section to avoid races with afs_validate(). */ -static int afs_iget5_test(struct inode *inode, void *opaque) +void afs_vnode_commit_status(struct afs_operation *op, struct afs_vnode_param *vp) { - struct afs_iget_data *data = opaque; + struct afs_vnode *vnode = vp->vnode; + + _enter(""); - return inode->i_ino == data->fid.vnode && - inode->i_generation == data->fid.unique; + write_seqlock(&vnode->cb_lock); + + if (vp->scb.have_error) { + /* A YFS server will return this from RemoveFile2 and AFS and + * YFS will return this from InlineBulkStatus. + */ + if (vp->scb.status.abort_code == VNOVNODE) { + set_bit(AFS_VNODE_DELETED, &vnode->flags); + clear_nlink(&vnode->netfs.inode); + __afs_break_callback(vnode, afs_cb_break_for_deleted); + op->flags &= ~AFS_OPERATION_DIR_CONFLICT; + } + } else if (vp->scb.have_status) { + if (vp->speculative && + (test_bit(AFS_VNODE_MODIFYING, &vnode->flags) || + vp->dv_before != vnode->status.data_version)) + /* Ignore the result of a speculative bulk status fetch + * if it splits around a modification op, thereby + * appearing to regress the data version. + */ + goto out; + afs_apply_status(op, vp); + if (vp->scb.have_cb) + afs_apply_callback(op, vp); + } else if (vp->op_unlinked && !(op->flags & AFS_OPERATION_DIR_CONFLICT)) { + drop_nlink(&vnode->netfs.inode); + if (vnode->netfs.inode.i_nlink == 0) { + set_bit(AFS_VNODE_DELETED, &vnode->flags); + __afs_break_callback(vnode, afs_cb_break_for_deleted); + } + } + +out: + write_sequnlock(&vnode->cb_lock); + + if (vp->scb.have_status) + afs_cache_permit(vnode, op->key, vp->cb_break_before, &vp->scb); } +static void afs_fetch_status_success(struct afs_operation *op) +{ + struct afs_vnode_param *vp = &op->file[op->fetch_status.which]; + struct afs_vnode *vnode = vp->vnode; + int ret; + + if (inode_state_read_once(&vnode->netfs.inode) & I_NEW) { + ret = afs_inode_init_from_status(op, vp, vnode); + afs_op_set_error(op, ret); + if (ret == 0) + afs_cache_permit(vnode, op->key, vp->cb_break_before, &vp->scb); + } else { + afs_vnode_commit_status(op, vp); + } +} + +const struct afs_operation_ops afs_fetch_status_operation = { + .issue_afs_rpc = afs_fs_fetch_status, + .issue_yfs_rpc = yfs_fs_fetch_status, + .success = afs_fetch_status_success, + .aborted = afs_check_for_remote_deletion, +}; + /* - * iget5() comparator for inode created by autocell operations - * - * These pseudo inodes don't match anything. + * Fetch file status from the volume. */ -static int afs_iget5_autocell_test(struct inode *inode, void *opaque) +int afs_fetch_status(struct afs_vnode *vnode, struct key *key, bool is_new, + afs_access_t *_caller_access) { - return 0; + struct afs_operation *op; + + _enter("%s,{%llx:%llu.%u,S=%lx}", + vnode->volume->name, + vnode->fid.vid, vnode->fid.vnode, vnode->fid.unique, + vnode->flags); + + op = afs_alloc_operation(key, vnode->volume); + if (IS_ERR(op)) + return PTR_ERR(op); + + afs_op_set_vnode(op, 0, vnode); + + op->nr_files = 1; + op->ops = &afs_fetch_status_operation; + afs_begin_vnode_operation(op); + afs_wait_for_operation(op); + + if (_caller_access) + *_caller_access = op->file[0].scb.status.caller_access; + return afs_put_operation(op); +} + +/* + * ilookup() comparator + */ +int afs_ilookup5_test_by_fid(struct inode *inode, void *opaque) +{ + struct afs_vnode *vnode = AFS_FS_I(inode); + struct afs_fid *fid = opaque; + + return (fid->vnode == vnode->fid.vnode && + fid->vnode_hi == vnode->fid.vnode_hi && + fid->unique == vnode->fid.unique); +} + +/* + * iget5() comparator + */ +static int afs_iget5_test(struct inode *inode, void *opaque) +{ + struct afs_vnode_param *vp = opaque; + //struct afs_vnode *vnode = AFS_FS_I(inode); + + return afs_ilookup5_test_by_fid(inode, &vp->fid); } /* @@ -119,271 +502,216 @@ static int afs_iget5_autocell_test(struct inode *inode, void *opaque) */ static int afs_iget5_set(struct inode *inode, void *opaque) { - struct afs_iget_data *data = opaque; + struct afs_vnode_param *vp = opaque; + struct afs_super_info *as = AFS_FS_S(inode->i_sb); struct afs_vnode *vnode = AFS_FS_I(inode); - inode->i_ino = data->fid.vnode; - inode->i_generation = data->fid.unique; - vnode->fid = data->fid; - vnode->volume = data->volume; + vnode->volume = as->volume; + vnode->fid = vp->fid; + /* YFS supports 96-bit vnode IDs, but Linux only supports + * 64-bit inode numbers. + */ + inode->i_ino = vnode->fid.vnode; + inode->i_generation = vnode->fid.unique; return 0; } /* - * inode retrieval for autocell + * Get a cache cookie for an inode. */ -struct inode *afs_iget_autocell(struct inode *dir, const char *dev_name, - int namesz, struct key *key) +static void afs_get_inode_cache(struct afs_vnode *vnode) { - struct afs_iget_data data; - struct afs_super_info *as; - struct afs_vnode *vnode; - struct super_block *sb; - struct inode *inode; - static atomic_t afs_autocell_ino; - - _enter("{%x:%u},%*.*s,", - AFS_FS_I(dir)->fid.vid, AFS_FS_I(dir)->fid.vnode, - namesz, namesz, dev_name ?: ""); - - sb = dir->i_sb; - as = sb->s_fs_info; - data.volume = as->volume; - data.fid.vid = as->volume->vid; - data.fid.unique = 0; - data.fid.vnode = 0; - - inode = iget5_locked(sb, atomic_inc_return(&afs_autocell_ino), - afs_iget5_autocell_test, afs_iget5_set, - &data); - if (!inode) { - _leave(" = -ENOMEM"); - return ERR_PTR(-ENOMEM); +#ifdef CONFIG_AFS_FSCACHE + struct { + __be32 vnode_id; + __be32 unique; + __be32 vnode_id_ext[2]; /* Allow for a 96-bit key */ + } __packed key; + struct afs_vnode_cache_aux aux; + + if (vnode->status.type != AFS_FTYPE_FILE && + vnode->status.type != AFS_FTYPE_DIR && + vnode->status.type != AFS_FTYPE_SYMLINK) { + vnode->netfs.cache = NULL; + return; } - _debug("GOT INODE %p { ino=%lu, vl=%x, vn=%x, u=%x }", - inode, inode->i_ino, data.fid.vid, data.fid.vnode, - data.fid.unique); - - vnode = AFS_FS_I(inode); - - /* there shouldn't be an existing inode */ - BUG_ON(!(inode->i_state & I_NEW)); - - inode->i_size = 0; - inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO; - inode->i_op = &afs_autocell_inode_operations; - set_nlink(inode, 2); - inode->i_uid = GLOBAL_ROOT_UID; - inode->i_gid = GLOBAL_ROOT_GID; - inode->i_ctime.tv_sec = get_seconds(); - inode->i_ctime.tv_nsec = 0; - inode->i_atime = inode->i_mtime = inode->i_ctime; - inode->i_blocks = 0; - inode->i_version = 0; - inode->i_generation = 0; - - set_bit(AFS_VNODE_PSEUDODIR, &vnode->flags); - set_bit(AFS_VNODE_MOUNTPOINT, &vnode->flags); - inode->i_flags |= S_AUTOMOUNT | S_NOATIME; - unlock_new_inode(inode); - _leave(" = %p", inode); - return inode; + key.vnode_id = htonl(vnode->fid.vnode); + key.unique = htonl(vnode->fid.unique); + key.vnode_id_ext[0] = htonl(vnode->fid.vnode >> 32); + key.vnode_id_ext[1] = htonl(vnode->fid.vnode_hi); + afs_set_cache_aux(vnode, &aux); + + afs_vnode_set_cache(vnode, + fscache_acquire_cookie( + vnode->volume->cache, + vnode->status.type == AFS_FTYPE_FILE ? + 0 : FSCACHE_ADV_SINGLE_CHUNK, + &key, sizeof(key), + &aux, sizeof(aux), + i_size_read(&vnode->netfs.inode))); +#endif } /* * inode retrieval */ -struct inode *afs_iget(struct super_block *sb, struct key *key, - struct afs_fid *fid, struct afs_file_status *status, - struct afs_callback *cb) +struct inode *afs_iget(struct afs_operation *op, struct afs_vnode_param *vp) { - struct afs_iget_data data = { .fid = *fid }; - struct afs_super_info *as; + struct afs_vnode_param *dvp = &op->file[0]; + struct super_block *sb = dvp->vnode->netfs.inode.i_sb; struct afs_vnode *vnode; struct inode *inode; int ret; - _enter(",{%x:%u.%u},,", fid->vid, fid->vnode, fid->unique); - - as = sb->s_fs_info; - data.volume = as->volume; + _enter(",{%llx:%llu.%u},,", vp->fid.vid, vp->fid.vnode, vp->fid.unique); - inode = iget5_locked(sb, fid->vnode, afs_iget5_test, afs_iget5_set, - &data); + inode = iget5_locked(sb, vp->fid.vnode, afs_iget5_test, afs_iget5_set, vp); if (!inode) { _leave(" = -ENOMEM"); return ERR_PTR(-ENOMEM); } - _debug("GOT INODE %p { vl=%x vn=%x, u=%x }", - inode, fid->vid, fid->vnode, fid->unique); - vnode = AFS_FS_I(inode); + _debug("GOT INODE %p { vl=%llx vn=%llx, u=%x }", + inode, vnode->fid.vid, vnode->fid.vnode, vnode->fid.unique); + /* deal with an existing inode */ - if (!(inode->i_state & I_NEW)) { + if (!(inode_state_read_once(inode) & I_NEW)) { _leave(" = %p", inode); return inode; } - if (!status) { - /* it's a remotely extant inode */ - set_bit(AFS_VNODE_CB_BROKEN, &vnode->flags); - ret = afs_vnode_fetch_status(vnode, NULL, key); - if (ret < 0) - goto bad_inode; - } else { - /* it's an inode we just created */ - memcpy(&vnode->status, status, sizeof(vnode->status)); - - if (!cb) { - /* it's a symlink we just created (the fileserver - * didn't give us a callback) */ - vnode->cb_version = 0; - vnode->cb_expiry = 0; - vnode->cb_type = 0; - vnode->cb_expires = get_seconds(); - } else { - vnode->cb_version = cb->version; - vnode->cb_expiry = cb->expiry; - vnode->cb_type = cb->type; - vnode->cb_expires = vnode->cb_expiry + get_seconds(); - } - } - - /* set up caching before mapping the status, as map-status reads the - * first page of symlinks to see if they're really mountpoints */ - inode->i_size = vnode->status.size; -#ifdef CONFIG_AFS_FSCACHE - vnode->cache = fscache_acquire_cookie(vnode->volume->cache, - &afs_vnode_cache_index_def, - vnode); -#endif - - ret = afs_inode_map_status(vnode, key); + ret = afs_inode_init_from_status(op, vp, vnode); if (ret < 0) goto bad_inode; + afs_get_inode_cache(vnode); + /* success */ clear_bit(AFS_VNODE_UNSET, &vnode->flags); - inode->i_flags |= S_NOATIME; unlock_new_inode(inode); - _leave(" = %p [CB { v=%u t=%u }]", inode, vnode->cb_version, vnode->cb_type); + _leave(" = %p", inode); return inode; /* failure */ bad_inode: -#ifdef CONFIG_AFS_FSCACHE - fscache_relinquish_cookie(vnode->cache, 0); - vnode->cache = NULL; -#endif iget_failed(inode); _leave(" = %d [bad]", ret); return ERR_PTR(ret); } -/* - * mark the data attached to an inode as obsolete due to a write on the server - * - might also want to ditch all the outstanding writes and dirty pages - */ -void afs_zap_data(struct afs_vnode *vnode) +static int afs_iget5_set_root(struct inode *inode, void *opaque) { - _enter("{%x:%u}", vnode->fid.vid, vnode->fid.vnode); + struct afs_super_info *as = AFS_FS_S(inode->i_sb); + struct afs_vnode *vnode = AFS_FS_I(inode); - /* nuke all the non-dirty pages that aren't locked, mapped or being - * written back in a regular file and completely discard the pages in a - * directory or symlink */ - if (S_ISREG(vnode->vfs_inode.i_mode)) - invalidate_remote_inode(&vnode->vfs_inode); - else - invalidate_inode_pages2(vnode->vfs_inode.i_mapping); + vnode->volume = as->volume; + vnode->fid.vid = as->volume->vid; + vnode->fid.vnode = 1; + vnode->fid.unique = 1; + inode->i_ino = 1; + inode->i_generation = 1; + return 0; } /* - * validate a vnode/inode - * - there are several things we need to check - * - parent dir data changes (rm, rmdir, rename, mkdir, create, link, - * symlink) - * - parent dir metadata changed (security changes) - * - dentry data changed (write, truncate) - * - dentry metadata changed (security changes) + * Set up the root inode for a volume. This is always vnode 1, unique 1 within + * the volume. */ -int afs_validate(struct afs_vnode *vnode, struct key *key) +struct inode *afs_root_iget(struct super_block *sb, struct key *key) { + struct afs_super_info *as = AFS_FS_S(sb); + struct afs_operation *op; + struct afs_vnode *vnode; + struct inode *inode; int ret; - _enter("{v={%x:%u} fl=%lx},%x", - vnode->fid.vid, vnode->fid.vnode, vnode->flags, - key_serial(key)); - - if (vnode->cb_promised && - !test_bit(AFS_VNODE_CB_BROKEN, &vnode->flags) && - !test_bit(AFS_VNODE_MODIFIED, &vnode->flags) && - !test_bit(AFS_VNODE_ZAP_DATA, &vnode->flags)) { - if (vnode->cb_expires < get_seconds() + 10) { - _debug("callback expired"); - set_bit(AFS_VNODE_CB_BROKEN, &vnode->flags); - } else { - goto valid; - } + _enter(",{%llx},,", as->volume->vid); + + inode = iget5_locked(sb, 1, NULL, afs_iget5_set_root, NULL); + if (!inode) { + _leave(" = -ENOMEM"); + return ERR_PTR(-ENOMEM); } - if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) - goto valid; + _debug("GOT ROOT INODE %p { vl=%llx }", inode, as->volume->vid); - mutex_lock(&vnode->validate_lock); + BUG_ON(!(inode_state_read_once(inode) & I_NEW)); - /* if the promise has expired, we need to check the server again to get - * a new promise - note that if the (parent) directory's metadata was - * changed then the security may be different and we may no longer have - * access */ - if (!vnode->cb_promised || - test_bit(AFS_VNODE_CB_BROKEN, &vnode->flags)) { - _debug("not promised"); - ret = afs_vnode_fetch_status(vnode, NULL, key); - if (ret < 0) - goto error_unlock; - _debug("new promise [fl=%lx]", vnode->flags); - } + vnode = AFS_FS_I(inode); + vnode->cb_v_check = atomic_read(&as->volume->cb_v_break); + afs_set_netfs_context(vnode); - if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) { - _debug("file already deleted"); - ret = -ESTALE; - goto error_unlock; + op = afs_alloc_operation(key, as->volume); + if (IS_ERR(op)) { + ret = PTR_ERR(op); + goto error; } - /* if the vnode's data version number changed then its contents are - * different */ - if (test_and_clear_bit(AFS_VNODE_ZAP_DATA, &vnode->flags)) - afs_zap_data(vnode); + afs_op_set_vnode(op, 0, vnode); - clear_bit(AFS_VNODE_MODIFIED, &vnode->flags); - mutex_unlock(&vnode->validate_lock); -valid: - _leave(" = 0"); - return 0; + op->nr_files = 1; + op->ops = &afs_fetch_status_operation; + ret = afs_do_sync_operation(op); + if (ret < 0) + goto error; -error_unlock: - mutex_unlock(&vnode->validate_lock); - _leave(" = %d", ret); - return ret; + afs_get_inode_cache(vnode); + + clear_bit(AFS_VNODE_UNSET, &vnode->flags); + unlock_new_inode(inode); + _leave(" = %p", inode); + return inode; + +error: + iget_failed(inode); + _leave(" = %d [bad]", ret); + return ERR_PTR(ret); } /* * read the attributes of an inode */ -int afs_getattr(struct vfsmount *mnt, struct dentry *dentry, - struct kstat *stat) +int afs_getattr(struct mnt_idmap *idmap, const struct path *path, + struct kstat *stat, u32 request_mask, unsigned int query_flags) { - struct inode *inode; - - inode = dentry->d_inode; + struct inode *inode = d_inode(path->dentry); + struct afs_vnode *vnode = AFS_FS_I(inode); + struct key *key; + int ret, seq; _enter("{ ino=%lu v=%u }", inode->i_ino, inode->i_generation); - generic_fillattr(inode, stat); + if (vnode->volume && + !(query_flags & AT_STATX_DONT_SYNC) && + atomic64_read(&vnode->cb_expires_at) == AFS_NO_CB_PROMISE) { + key = afs_request_key(vnode->volume->cell); + if (IS_ERR(key)) + return PTR_ERR(key); + ret = afs_validate(vnode, key); + key_put(key); + if (ret < 0) + return ret; + } + + do { + seq = read_seqbegin(&vnode->cb_lock); + generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); + if (test_bit(AFS_VNODE_SILLY_DELETED, &vnode->flags) && + stat->nlink > 0) + stat->nlink -= 1; + + /* Lie about the size of directories. We maintain a locally + * edited copy and may make different allocation decisions on + * it, but we need to give userspace the server's size. + */ + if (S_ISDIR(inode->i_mode)) + stat->size = vnode->netfs.remote_i_size; + } while (read_seqretry(&vnode->cb_lock, seq)); + return 0; } @@ -395,9 +723,9 @@ int afs_drop_inode(struct inode *inode) _enter(""); if (test_bit(AFS_VNODE_PSEUDODIR, &AFS_FS_I(inode)->flags)) - return generic_delete_inode(inode); + return inode_just_drop(inode); else - return generic_drop_inode(inode); + return inode_generic_drop(inode); } /* @@ -405,94 +733,197 @@ int afs_drop_inode(struct inode *inode) */ void afs_evict_inode(struct inode *inode) { - struct afs_permits *permits; - struct afs_vnode *vnode; - - vnode = AFS_FS_I(inode); + struct afs_vnode_cache_aux aux; + struct afs_super_info *sbi = AFS_FS_S(inode->i_sb); + struct afs_vnode *vnode = AFS_FS_I(inode); - _enter("{%x:%u.%d} v=%u x=%u t=%u }", + _enter("{%llx:%llu.%d}", vnode->fid.vid, vnode->fid.vnode, - vnode->fid.unique, - vnode->cb_version, - vnode->cb_expiry, - vnode->cb_type); + vnode->fid.unique); _debug("CLEAR INODE %p", inode); ASSERTCMP(inode->i_ino, ==, vnode->fid.vnode); - truncate_inode_pages(&inode->i_data, 0); - clear_inode(inode); + if ((S_ISDIR(inode->i_mode) || + S_ISLNK(inode->i_mode)) && + (inode_state_read_once(inode) & I_DIRTY) && + !sbi->dyn_root) { + struct writeback_control wbc = { + .sync_mode = WB_SYNC_ALL, + .for_sync = true, + .range_end = LLONG_MAX, + }; + + afs_single_writepages(inode->i_mapping, &wbc); + } - afs_give_up_callback(vnode); + netfs_wait_for_outstanding_io(inode); + truncate_inode_pages_final(&inode->i_data); + netfs_free_folioq_buffer(vnode->directory); - if (vnode->server) { - spin_lock(&vnode->server->fs_lock); - rb_erase(&vnode->server_rb, &vnode->server->fs_vnodes); - spin_unlock(&vnode->server->fs_lock); - afs_put_server(vnode->server); - vnode->server = NULL; + afs_set_cache_aux(vnode, &aux); + netfs_clear_inode_writeback(inode, &aux); + clear_inode(inode); + + while (!list_empty(&vnode->wb_keys)) { + struct afs_wb_key *wbk = list_entry(vnode->wb_keys.next, + struct afs_wb_key, vnode_link); + list_del(&wbk->vnode_link); + afs_put_wb_key(wbk); } - ASSERT(list_empty(&vnode->writebacks)); - ASSERT(!vnode->cb_promised); + fscache_relinquish_cookie(afs_vnode_cache(vnode), + test_bit(AFS_VNODE_DELETED, &vnode->flags)); -#ifdef CONFIG_AFS_FSCACHE - fscache_relinquish_cookie(vnode->cache, 0); - vnode->cache = NULL; -#endif + afs_prune_wb_keys(vnode); + afs_put_permits(rcu_access_pointer(vnode->permit_cache)); + key_put(vnode->silly_key); + vnode->silly_key = NULL; + key_put(vnode->lock_key); + vnode->lock_key = NULL; + _leave(""); +} - mutex_lock(&vnode->permits_lock); - permits = vnode->permits; - rcu_assign_pointer(vnode->permits, NULL); - mutex_unlock(&vnode->permits_lock); - if (permits) - call_rcu(&permits->rcu, afs_zap_permits); +static void afs_setattr_success(struct afs_operation *op) +{ + struct afs_vnode_param *vp = &op->file[0]; + struct inode *inode = &vp->vnode->netfs.inode; + loff_t old_i_size = i_size_read(inode); + + op->setattr.old_i_size = old_i_size; + afs_vnode_commit_status(op, vp); + /* inode->i_size has now been changed. */ + + if (op->setattr.attr->ia_valid & ATTR_SIZE) { + loff_t size = op->setattr.attr->ia_size; + if (size > old_i_size) + pagecache_isize_extended(inode, old_i_size, size); + } +} - _leave(""); +static void afs_setattr_edit_file(struct afs_operation *op) +{ + struct afs_vnode_param *vp = &op->file[0]; + struct afs_vnode *vnode = vp->vnode; + struct inode *inode = &vnode->netfs.inode; + + if (op->setattr.attr->ia_valid & ATTR_SIZE) { + loff_t size = op->setattr.attr->ia_size; + loff_t old = op->setattr.old_i_size; + + /* Note: inode->i_size was updated by afs_apply_status() inside + * the I/O and callback locks. + */ + + if (size != old) { + truncate_pagecache(inode, size); + netfs_resize_file(&vnode->netfs, size, true); + fscache_resize_cookie(afs_vnode_cache(vnode), size); + } + } } +static const struct afs_operation_ops afs_setattr_operation = { + .issue_afs_rpc = afs_fs_setattr, + .issue_yfs_rpc = yfs_fs_setattr, + .success = afs_setattr_success, + .edit_dir = afs_setattr_edit_file, +}; + /* * set the attributes of an inode */ -int afs_setattr(struct dentry *dentry, struct iattr *attr) +int afs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, + struct iattr *attr) { - struct afs_vnode *vnode = AFS_FS_I(dentry->d_inode); - struct key *key; + const unsigned int supported = + ATTR_SIZE | ATTR_MODE | ATTR_UID | ATTR_GID | + ATTR_MTIME | ATTR_MTIME_SET | ATTR_TIMES_SET | ATTR_TOUCH; + struct afs_operation *op; + struct afs_vnode *vnode = AFS_FS_I(d_inode(dentry)); + struct inode *inode = &vnode->netfs.inode; + loff_t i_size; int ret; - _enter("{%x:%u},{n=%s},%x", - vnode->fid.vid, vnode->fid.vnode, dentry->d_name.name, + _enter("{%llx:%llu},{n=%pd},%x", + vnode->fid.vid, vnode->fid.vnode, dentry, attr->ia_valid); - if (!(attr->ia_valid & (ATTR_SIZE | ATTR_MODE | ATTR_UID | ATTR_GID | - ATTR_MTIME))) { + if (!(attr->ia_valid & supported)) { _leave(" = 0 [unsupported]"); return 0; } - /* flush any dirty data outstanding on a regular file */ - if (S_ISREG(vnode->vfs_inode.i_mode)) { - filemap_write_and_wait(vnode->vfs_inode.i_mapping); - afs_writeback_all(vnode); + i_size = i_size_read(inode); + if (attr->ia_valid & ATTR_SIZE) { + if (!S_ISREG(inode->i_mode)) + return -EISDIR; + + ret = inode_newsize_ok(inode, attr->ia_size); + if (ret) + return ret; + + if (attr->ia_size == i_size) + attr->ia_valid &= ~ATTR_SIZE; } - if (attr->ia_valid & ATTR_FILE) { - key = attr->ia_file->private_data; - } else { - key = afs_request_key(vnode->volume->cell); - if (IS_ERR(key)) { - ret = PTR_ERR(key); - goto error; + fscache_use_cookie(afs_vnode_cache(vnode), true); + + /* Prevent any new writebacks from starting whilst we do this. */ + down_write(&vnode->validate_lock); + + if ((attr->ia_valid & ATTR_SIZE) && S_ISREG(inode->i_mode)) { + loff_t size = attr->ia_size; + + /* Wait for any outstanding writes to the server to complete */ + loff_t from = min(size, i_size); + loff_t to = max(size, i_size); + ret = filemap_fdatawait_range(inode->i_mapping, from, to); + if (ret < 0) + goto out_unlock; + + /* Don't talk to the server if we're just shortening in-memory + * writes that haven't gone to the server yet. + */ + if (!(attr->ia_valid & (supported & ~ATTR_SIZE & ~ATTR_MTIME)) && + attr->ia_size < i_size && + attr->ia_size > vnode->netfs.remote_i_size) { + truncate_setsize(inode, attr->ia_size); + netfs_resize_file(&vnode->netfs, size, false); + fscache_resize_cookie(afs_vnode_cache(vnode), + attr->ia_size); + ret = 0; + goto out_unlock; } } - ret = afs_vnode_setattr(vnode, key, attr); - if (!(attr->ia_valid & ATTR_FILE)) - key_put(key); + op = afs_alloc_operation(((attr->ia_valid & ATTR_FILE) ? + afs_file_key(attr->ia_file) : NULL), + vnode->volume); + if (IS_ERR(op)) { + ret = PTR_ERR(op); + goto out_unlock; + } -error: + afs_op_set_vnode(op, 0, vnode); + op->setattr.attr = attr; + + if (attr->ia_valid & ATTR_SIZE) { + op->file[0].dv_delta = 1; + op->file[0].set_size = true; + } + op->ctime = attr->ia_ctime; + op->file[0].update_ctime = 1; + op->file[0].modification = true; + + op->ops = &afs_setattr_operation; + ret = afs_do_sync_operation(op); + +out_unlock: + up_write(&vnode->validate_lock); + fscache_unuse_cookie(afs_vnode_cache(vnode), NULL, NULL); _leave(" = %d", ret); return ret; } diff --git a/fs/afs/internal.h b/fs/afs/internal.h index a306bb6d88d9..009064b8d661 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -1,25 +1,30 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ /* internal AFS stuff * * Copyright (C) 2002, 2007 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. */ #include <linux/compiler.h> #include <linux/kernel.h> +#include <linux/ktime.h> #include <linux/fs.h> +#include <linux/filelock.h> #include <linux/pagemap.h> -#include <linux/skbuff.h> #include <linux/rxrpc.h> #include <linux/key.h> #include <linux/workqueue.h> #include <linux/sched.h> #include <linux/fscache.h> #include <linux/backing-dev.h> +#include <linux/uuid.h> +#include <linux/mm_types.h> +#include <linux/dns_resolver.h> +#include <crypto/krb5.h> +#include <net/net_namespace.h> +#include <net/netns/generic.h> +#include <net/sock.h> +#include <net/af_rxrpc.h> #include "afs.h" #include "afs_vl.h" @@ -28,148 +33,225 @@ struct pagevec; struct afs_call; +struct afs_vnode; +struct afs_server_probe; + +/* + * Partial file-locking emulation mode. (The problem being that AFS3 only + * allows whole-file locks and no upgrading/downgrading). + */ +enum afs_flock_mode { + afs_flock_mode_unset, + afs_flock_mode_local, /* Local locking only */ + afs_flock_mode_openafs, /* Don't get server lock for a partial lock */ + afs_flock_mode_strict, /* Always get a server lock for a partial lock */ + afs_flock_mode_write, /* Get an exclusive server lock for a partial lock */ +}; -typedef enum { - AFS_VL_NEW, /* new, uninitialised record */ - AFS_VL_CREATING, /* creating record */ - AFS_VL_VALID, /* record is pending */ - AFS_VL_NO_VOLUME, /* no such volume available */ - AFS_VL_UPDATING, /* update in progress */ - AFS_VL_VOLUME_DELETED, /* volume was deleted */ - AFS_VL_UNCERTAIN, /* uncertain state (update failed) */ -} __attribute__((packed)) afs_vlocation_state_t; - -struct afs_mount_params { - bool rwpath; /* T if the parent should be considered R/W */ +struct afs_fs_context { bool force; /* T to force cell type */ bool autocell; /* T if set auto mount operation */ + bool dyn_root; /* T if dynamic root */ + bool no_cell; /* T if the source is "none" (for dynroot) */ + enum afs_flock_mode flock_mode; /* Partial file-locking emulation mode */ afs_voltype_t type; /* type of volume requested */ - int volnamesz; /* size of volume name */ + unsigned int volnamesz; /* size of volume name */ const char *volname; /* name of volume to mount */ + struct afs_net *net; /* the AFS net namespace stuff */ struct afs_cell *cell; /* cell in which to find volume */ struct afs_volume *volume; /* volume record */ struct key *key; /* key to use for secure mounting */ }; +enum afs_call_state { + AFS_CALL_CL_REQUESTING, /* Client: Request is being sent */ + AFS_CALL_CL_AWAIT_REPLY, /* Client: Awaiting reply */ + AFS_CALL_CL_PROC_REPLY, /* Client: rxrpc call complete; processing reply */ + AFS_CALL_SV_AWAIT_OP_ID, /* Server: Awaiting op ID */ + AFS_CALL_SV_AWAIT_REQUEST, /* Server: Awaiting request data */ + AFS_CALL_SV_REPLYING, /* Server: Replying */ + AFS_CALL_SV_AWAIT_ACK, /* Server: Awaiting final ACK */ + AFS_CALL_COMPLETE, /* Completed or failed */ +}; + /* - * definition of how to wait for the completion of an operation + * Address preferences. */ -struct afs_wait_mode { - /* RxRPC received message notification */ - void (*rx_wakeup)(struct afs_call *call); +struct afs_addr_preference { + union { + struct in_addr ipv4_addr; /* AF_INET address to compare against */ + struct in6_addr ipv6_addr; /* AF_INET6 address to compare against */ + }; + sa_family_t family; /* Which address to use */ + u16 prio; /* Priority */ + u8 subnet_mask; /* How many bits to compare */ +}; - /* synchronous call waiter and call dispatched notification */ - int (*wait)(struct afs_call *call); +struct afs_addr_preference_list { + struct rcu_head rcu; + u16 version; /* Incremented when prefs list changes */ + u8 ipv6_off; /* Offset of IPv6 addresses */ + u8 nr; /* Number of addresses in total */ + u8 max_prefs; /* Number of prefs allocated */ + struct afs_addr_preference prefs[] __counted_by(max_prefs); +}; - /* asynchronous call completion */ - void (*async_complete)(void *reply, int error); +struct afs_address { + struct rxrpc_peer *peer; + short last_error; /* Last error from this address */ + u16 prio; /* Address priority */ }; -extern const struct afs_wait_mode afs_sync_call; -extern const struct afs_wait_mode afs_async_call; +/* + * List of server addresses. + */ +struct afs_addr_list { + struct rcu_head rcu; + refcount_t usage; + u32 version; /* Version */ + unsigned int debug_id; + unsigned int addr_pref_version; /* Version of address preference list */ + unsigned char max_addrs; + unsigned char nr_addrs; + unsigned char preferred; /* Preferred address */ + unsigned char nr_ipv4; /* Number of IPv4 addresses */ + enum dns_record_source source:8; + enum dns_lookup_status status:8; + unsigned long probe_failed; /* Mask of addrs that failed locally/ICMP */ + unsigned long responded; /* Mask of addrs that responded */ + struct afs_address addrs[] __counted_by(max_addrs); +#define AFS_MAX_ADDRESSES ((unsigned int)(sizeof(unsigned long) * 8)) +}; /* * a record of an in-progress RxRPC call */ struct afs_call { const struct afs_call_type *type; /* type of call */ - const struct afs_wait_mode *wait_mode; /* completion wait mode */ wait_queue_head_t waitq; /* processes awaiting completion */ - struct work_struct async_work; /* asynchronous work processor */ + struct work_struct async_work; /* async I/O processor */ struct work_struct work; /* actual work processor */ - struct sk_buff_head rx_queue; /* received packets */ + struct work_struct free_work; /* Deferred free processor */ struct rxrpc_call *rxcall; /* RxRPC call handle */ + struct rxrpc_peer *peer; /* Remote endpoint */ struct key *key; /* security for this call */ - struct afs_server *server; /* server affected by incoming CM call */ + struct afs_net *net; /* The network namespace */ + struct afs_server *server; /* The fileserver record if fs op (pins ref) */ + struct afs_vlserver *vlserver; /* The vlserver record if vl op */ void *request; /* request data (first part) */ - struct address_space *mapping; /* page set */ - struct afs_writeback *wb; /* writeback being performed */ + size_t iov_len; /* Size of *iter to be used */ + struct iov_iter def_iter; /* Default buffer/data iterator */ + struct iov_iter *write_iter; /* Iterator defining write to be made */ + struct iov_iter *iter; /* Iterator currently in use */ + union { /* Convenience for ->def_iter */ + struct kvec kvec[1]; + struct bio_vec bvec[1]; + }; void *buffer; /* reply receive buffer */ - void *reply; /* reply buffer (first part) */ - void *reply2; /* reply buffer (second part) */ - void *reply3; /* reply buffer (third part) */ - void *reply4; /* reply buffer (fourth part) */ - pgoff_t first; /* first page in mapping to deal with */ - pgoff_t last; /* last page in mapping to deal with */ - enum { /* call state */ - AFS_CALL_REQUESTING, /* request is being sent for outgoing call */ - AFS_CALL_AWAIT_REPLY, /* awaiting reply to outgoing call */ - AFS_CALL_AWAIT_OP_ID, /* awaiting op ID on incoming call */ - AFS_CALL_AWAIT_REQUEST, /* awaiting request data on incoming call */ - AFS_CALL_REPLYING, /* replying to incoming call */ - AFS_CALL_AWAIT_ACK, /* awaiting final ACK of incoming call */ - AFS_CALL_COMPLETE, /* successfully completed */ - AFS_CALL_BUSY, /* server was busy */ - AFS_CALL_ABORTED, /* call was aborted */ - AFS_CALL_ERROR, /* call failed due to error */ - } state; + union { + struct afs_endpoint_state *probe; + struct afs_addr_list *vl_probe; + struct afs_addr_list *ret_alist; + struct afs_vldb_entry *ret_vldb; + char *ret_str; + }; + struct afs_fid fid; /* Primary vnode ID (or all zeroes) */ + unsigned char probe_index; /* Address in ->probe_alist */ + struct afs_operation *op; + unsigned int server_index; + refcount_t ref; + enum afs_call_state state; + spinlock_t state_lock; int error; /* error code */ + u32 abort_code; /* Remote abort ID or 0 */ + unsigned long long remaining; /* How much is left to receive */ + unsigned int max_lifespan; /* Maximum lifespan in secs to set if not 0 */ unsigned request_size; /* size of request data */ unsigned reply_max; /* maximum size of reply */ - unsigned reply_size; /* current size of reply */ - unsigned first_offset; /* offset into mapping[first] */ - unsigned last_to; /* amount of mapping[last] */ - unsigned offset; /* offset into received data store */ + unsigned count2; /* count used in unmarshalling */ unsigned char unmarshall; /* unmarshalling phase */ - bool incoming; /* T if incoming call */ - bool send_pages; /* T if data from mapping should be sent */ - u16 service_id; /* RxRPC service ID to call */ - __be16 port; /* target UDP port */ - __be32 operation_ID; /* operation ID for an incoming call */ + bool drop_ref; /* T if need to drop ref for incoming call */ + bool need_attention; /* T if RxRPC poked us */ + bool async; /* T if asynchronous */ + bool upgrade; /* T to request service upgrade */ + bool intr; /* T if interruptible */ + bool unmarshalling_error; /* T if an unmarshalling error occurred */ + bool responded; /* Got a response from the call (may be abort) */ + u8 security_ix; /* Security class */ + u16 service_id; /* Actual service ID (after upgrade) */ + unsigned int debug_id; /* Trace ID */ + u32 enctype; /* Security encoding type */ + u32 operation_ID; /* operation ID for an incoming call */ u32 count; /* count for use in unmarshalling */ - __be32 tmp; /* place to extract temporary data */ - afs_dataversion_t store_version; /* updated version expected from store */ + union { /* place to extract temporary data */ + struct { + __be32 tmp_u; + __be32 tmp; + } __attribute__((packed)); + __be64 tmp64; + }; + ktime_t issue_time; /* Time of issue of operation */ }; struct afs_call_type { const char *name; + unsigned int op; /* Really enum afs_fs_operation */ /* deliver request or reply data to an call * - returning an error will cause the call to be aborted */ - int (*deliver)(struct afs_call *call, struct sk_buff *skb, - bool last); - - /* map an abort code to an error number */ - int (*abort_to_error)(u32 abort_code); + int (*deliver)(struct afs_call *call); /* clean up a call */ void (*destructor)(struct afs_call *call); + + /* Async receive processing function */ + void (*async_rx)(struct work_struct *work); + + /* Work function */ + void (*work)(struct work_struct *work); + + /* Call done function (gets called immediately on success or failure) */ + void (*done)(struct afs_call *call); + + /* Handle a call being immediately cancelled. */ + void (*immediate_cancel)(struct afs_call *call); +}; + +/* + * Key available for writeback on a file. + */ +struct afs_wb_key { + refcount_t usage; + struct key *key; + struct list_head vnode_link; /* Link in vnode->wb_keys */ }; /* - * record of an outstanding writeback on a vnode + * AFS open file information record. Pointed to by file->private_data. */ -struct afs_writeback { - struct list_head link; /* link in vnode->writebacks */ - struct work_struct writer; /* work item to perform the writeback */ - struct afs_vnode *vnode; /* vnode to which this write applies */ - struct key *key; /* owner of this write */ - wait_queue_head_t waitq; /* completion and ready wait queue */ - pgoff_t first; /* first page in batch */ - pgoff_t point; /* last page in current store op */ - pgoff_t last; /* last page in batch (inclusive) */ - unsigned offset_first; /* offset into first page of start of write */ - unsigned to_last; /* offset into last page of end of write */ - int num_conflicts; /* count of conflicting writes in list */ - int usage; - bool conflicts; /* T if has dependent conflicts */ - enum { - AFS_WBACK_SYNCING, /* synchronisation being performed */ - AFS_WBACK_PENDING, /* write pending */ - AFS_WBACK_CONFLICTING, /* conflicting writes posted */ - AFS_WBACK_WRITING, /* writing back */ - AFS_WBACK_COMPLETE /* the writeback record has been unlinked */ - } state __attribute__((packed)); +struct afs_file { + struct key *key; /* The key this file was opened with */ + struct afs_wb_key *wb; /* Writeback key record for this file */ }; +static inline struct key *afs_file_key(struct file *file) +{ + struct afs_file *af = file->private_data; + + return af->key; +} + /* * AFS superblock private data * - there's one superblock per volume */ struct afs_super_info { + struct net *net_ns; /* Network namespace */ + struct afs_cell *cell; /* The cell in which the volume resides */ struct afs_volume *volume; /* volume record */ - char rwparent; /* T if parent is R/W AFS volume */ + enum afs_flock_mode flock_mode:8; /* File locking emulation mode */ + bool dyn_root; /* True if dynamic root */ }; static inline struct afs_super_info *AFS_FS_S(struct super_block *sb) @@ -180,302 +262,816 @@ static inline struct afs_super_info *AFS_FS_S(struct super_block *sb) extern struct file_system_type afs_fs_type; /* - * entry in the cached cell catalogue + * Set of substitutes for @sys. */ -struct afs_cache_cell { - char name[AFS_MAXCELLNAME]; /* cell name (padded with NULs) */ - struct in_addr vl_servers[15]; /* cached cell VL servers */ +struct afs_sysnames { +#define AFS_NR_SYSNAME 16 + char *subs[AFS_NR_SYSNAME]; + refcount_t usage; + unsigned short nr; + char blank[1]; }; /* - * AFS cell record + * AFS network namespace record. + */ +struct afs_net { + struct net *net; /* Backpointer to the owning net namespace */ + struct afs_uuid uuid; + bool live; /* F if this namespace is being removed */ + + /* AF_RXRPC I/O stuff */ + struct socket *socket; + struct afs_call *spare_incoming_call; + struct work_struct charge_preallocation_work; + struct work_struct rx_oob_work; + struct mutex socket_mutex; + atomic_t nr_outstanding_calls; + atomic_t nr_superblocks; + + /* Cell database */ + struct rb_root cells; + struct idr cells_dyn_ino; /* cell->dynroot_ino mapping */ + struct afs_cell __rcu *ws_cell; + atomic_t cells_outstanding; + struct rw_semaphore cells_lock; + struct mutex cells_alias_lock; + + struct mutex proc_cells_lock; + struct hlist_head proc_cells; + + /* Known servers. Theoretically each fileserver can only be in one + * cell, but in practice, people create aliases and subsets and there's + * no easy way to distinguish them. + */ + seqlock_t fs_lock; /* For fs_probe_*, fs_proc */ + struct list_head fs_probe_fast; /* List of afs_server to probe at 30s intervals */ + struct list_head fs_probe_slow; /* List of afs_server to probe at 5m intervals */ + struct hlist_head fs_proc; /* procfs servers list */ + + struct key *fs_cm_token_key; /* Key for creating CM tokens */ + struct work_struct fs_prober; + struct timer_list fs_probe_timer; + atomic_t servers_outstanding; + + /* File locking renewal management */ + struct mutex lock_manager_mutex; + + /* Misc */ + struct super_block *dynroot_sb; /* Dynamic root mount superblock */ + struct proc_dir_entry *proc_afs; /* /proc/net/afs directory */ + struct afs_sysnames *sysnames; + rwlock_t sysnames_lock; + struct afs_addr_preference_list __rcu *address_prefs; + u16 address_pref_version; + + /* Statistics counters */ + atomic_t n_lookup; /* Number of lookups done */ + atomic_t n_reval; /* Number of dentries needing revalidation */ + atomic_t n_inval; /* Number of invalidations by the server */ + atomic_t n_relpg; /* Number of invalidations by release_folio */ + atomic_t n_read_dir; /* Number of directory pages read */ + atomic_t n_dir_cr; /* Number of directory entry creation edits */ + atomic_t n_dir_rm; /* Number of directory entry removal edits */ + atomic_t n_stores; /* Number of store ops */ + atomic_long_t n_store_bytes; /* Number of bytes stored */ + atomic_long_t n_fetch_bytes; /* Number of bytes fetched */ + atomic_t n_fetches; /* Number of data fetch ops */ +}; + +extern const char afs_init_sysname[]; + +enum afs_cell_state { + AFS_CELL_SETTING_UP, + AFS_CELL_UNLOOKED, + AFS_CELL_ACTIVE, + AFS_CELL_REMOVING, + AFS_CELL_DEAD, +}; + +/* + * AFS cell record. + * + * This is a tricky concept to get right as it is possible to create aliases + * simply by pointing AFSDB/SRV records for two names at the same set of VL + * servers; it is also possible to do things like setting up two sets of VL + * servers, one of which provides a superset of the volumes provided by the + * other (for internal/external division, for example). + * + * Cells only exist in the sense that (a) a cell's name maps to a set of VL + * servers and (b) a cell's name is used by the client to select the key to use + * for authentication and encryption. The cell name is not typically used in + * the protocol. + * + * Two cells are determined to be aliases if they have an explicit alias (YFS + * only), share any VL servers in common or have at least one volume in common. + * "In common" means that the address list of the VL servers or the fileservers + * share at least one endpoint. */ struct afs_cell { - atomic_t usage; - struct list_head link; /* main cell list link */ + union { + struct rcu_head rcu; + struct rb_node net_node; /* Node in net->cells */ + }; + struct afs_net *net; + struct afs_cell *alias_of; /* The cell this is an alias of */ + struct afs_volume *root_volume; /* The root.cell volume if there is one */ struct key *anonymous_key; /* anonymous user key for this cell */ - struct list_head proc_link; /* /proc cell list link */ - struct proc_dir_entry *proc_dir; /* /proc dir for this cell */ -#ifdef CONFIG_AFS_FSCACHE - struct fscache_cookie *cache; /* caching cookie */ -#endif + struct work_struct destroyer; /* Destroyer for cell */ + struct work_struct manager; /* Manager for init/deinit/dns */ + struct timer_list management_timer; /* General management timer */ + struct hlist_node proc_link; /* /proc cell list link */ + time64_t dns_expiry; /* Time AFSDB/SRV record expires */ + time64_t last_inactive; /* Time of last drop of usage count */ + refcount_t ref; /* Struct refcount */ + atomic_t active; /* Active usage counter */ + unsigned long flags; +#define AFS_CELL_FL_NO_GC 0 /* The cell was added manually, don't auto-gc */ +#define AFS_CELL_FL_DO_LOOKUP 1 /* DNS lookup requested */ +#define AFS_CELL_FL_CHECK_ALIAS 2 /* Need to check for aliases */ + enum afs_cell_state state; + short error; + enum dns_record_source dns_source:8; /* Latest source of data from lookup */ + enum dns_lookup_status dns_status:8; /* Latest status of data from lookup */ + unsigned int dns_lookup_count; /* Counter of DNS lookups */ + unsigned int debug_id; + unsigned int dynroot_ino; /* Inode numbers for dynroot (a pair) */ + + /* The volumes belonging to this cell */ + struct rw_semaphore vs_lock; /* Lock for server->volumes */ + struct rb_root volumes; /* Tree of volumes on this server */ + struct hlist_head proc_volumes; /* procfs volume list */ + seqlock_t volume_lock; /* For volumes */ + + /* Active fileserver interaction state. */ + struct rb_root fs_servers; /* afs_server (by server UUID) */ + struct rw_semaphore fs_lock; /* For fs_servers */ + + /* VL server list. */ + rwlock_t vl_servers_lock; /* Lock on vl_servers */ + struct afs_vlserver_list __rcu *vl_servers; + + u8 name_len; /* Length of name */ + char *name; /* Cell name, case-flattened and NUL-padded */ + char *key_desc; /* Authentication key description */ +}; - /* server record management */ - rwlock_t servers_lock; /* active server list lock */ - struct list_head servers; /* active server list */ +/* + * Volume Location server record. + */ +struct afs_vlserver { + struct rcu_head rcu; + struct afs_addr_list __rcu *addresses; /* List of addresses for this VL server */ + unsigned long flags; +#define AFS_VLSERVER_FL_PROBED 0 /* The VL server has been probed */ +#define AFS_VLSERVER_FL_PROBING 1 /* VL server is being probed */ +#define AFS_VLSERVER_FL_IS_YFS 2 /* Server is YFS not AFS */ +#define AFS_VLSERVER_FL_RESPONDING 3 /* VL server is responding */ + rwlock_t lock; /* Lock on addresses */ + refcount_t ref; + unsigned int rtt; /* Server's current RTT in uS */ + unsigned int debug_id; + + /* Probe state */ + wait_queue_head_t probe_wq; + atomic_t probe_outstanding; + spinlock_t probe_lock; + struct { + unsigned int rtt; /* Best RTT in uS (or UINT_MAX) */ + u32 abort_code; + short error; + unsigned short flags; +#define AFS_VLSERVER_PROBE_RESPONDED 0x01 /* At least once response (may be abort) */ +#define AFS_VLSERVER_PROBE_IS_YFS 0x02 /* The peer appears to be YFS */ +#define AFS_VLSERVER_PROBE_NOT_YFS 0x04 /* The peer appears not to be YFS */ +#define AFS_VLSERVER_PROBE_LOCAL_FAILURE 0x08 /* A local failure prevented a probe */ + } probe; + + u16 service_id; /* Service ID we're using */ + u16 port; + u16 name_len; /* Length of name */ + char name[]; /* Server name, case-flattened */ +}; - /* volume location record management */ - struct rw_semaphore vl_sem; /* volume management serialisation semaphore */ - struct list_head vl_list; /* cell's active VL record list */ - spinlock_t vl_lock; /* vl_list lock */ - unsigned short vl_naddrs; /* number of VL servers in addr list */ - unsigned short vl_curr_svix; /* current server index */ - struct in_addr vl_addrs[AFS_CELL_MAX_ADDRS]; /* cell VL server addresses */ +/* + * Weighted list of Volume Location servers. + */ +struct afs_vlserver_entry { + u16 priority; /* Preference (as SRV) */ + u16 weight; /* Weight (as SRV) */ + enum dns_record_source source:8; + enum dns_lookup_status status:8; + struct afs_vlserver *server; +}; - char name[0]; /* cell name - must go last */ +struct afs_vlserver_list { + struct rcu_head rcu; + refcount_t ref; + u8 nr_servers; + u8 index; /* Server currently in use */ + u8 preferred; /* Preferred server */ + enum dns_record_source source:8; + enum dns_lookup_status status:8; + rwlock_t lock; + struct afs_vlserver_entry servers[]; }; /* - * entry in the cached volume location catalogue + * Cached VLDB entry. + * + * This is pointed to by cell->vldb_entries, indexed by name. */ -struct afs_cache_vlocation { - /* volume name (lowercase, padded with NULs) */ - uint8_t name[AFS_MAXVOLNAME + 1]; +struct afs_vldb_entry { + afs_volid_t vid[3]; /* Volume IDs for R/W, R/O and Bak volumes */ - uint8_t nservers; /* number of entries used in servers[] */ - uint8_t vidmask; /* voltype mask for vid[] */ - uint8_t srvtmask[8]; /* voltype masks for servers[] */ + unsigned long flags; +#define AFS_VLDB_HAS_RW 0 /* - R/W volume exists */ +#define AFS_VLDB_HAS_RO 1 /* - R/O volume exists */ +#define AFS_VLDB_HAS_BAK 2 /* - Backup volume exists */ +#define AFS_VLDB_QUERY_VALID 3 /* - Record is valid */ +#define AFS_VLDB_QUERY_ERROR 4 /* - VL server returned error */ + + uuid_t fs_server[AFS_NMAXNSERVERS]; + u32 addr_version[AFS_NMAXNSERVERS]; /* Registration change counters */ + u8 fs_mask[AFS_NMAXNSERVERS]; #define AFS_VOL_VTM_RW 0x01 /* R/W version of the volume is available (on this server) */ #define AFS_VOL_VTM_RO 0x02 /* R/O version of the volume is available (on this server) */ #define AFS_VOL_VTM_BAK 0x04 /* backup version of the volume is available (on this server) */ - - afs_volid_t vid[3]; /* volume IDs for R/W, R/O and Bak volumes */ - struct in_addr servers[8]; /* fileserver addresses */ - time_t rtime; /* last retrieval time */ + u8 vlsf_flags[AFS_NMAXNSERVERS]; + short error; + u8 nr_servers; /* Number of server records */ + u8 name_len; + u8 name[AFS_MAXVOLNAME + 1]; /* NUL-padded volume name */ }; /* - * volume -> vnode hash table entry - */ -struct afs_cache_vhash { - afs_voltype_t vtype; /* which volume variation */ - uint8_t hash_bucket; /* which hash bucket this represents */ -} __attribute__((packed)); - -/* - * AFS volume location record + * Fileserver endpoint state. The records the addresses of a fileserver's + * endpoints and the state and result of a round of probing on them. This + * allows the rotation algorithm to access those results without them being + * erased by a subsequent round of probing. */ -struct afs_vlocation { - atomic_t usage; - time_t time_of_death; /* time at which put reduced usage to 0 */ - struct list_head link; /* link in cell volume location list */ - struct list_head grave; /* link in master graveyard list */ - struct list_head update; /* link in master update list */ - struct afs_cell *cell; /* cell to which volume belongs */ -#ifdef CONFIG_AFS_FSCACHE - struct fscache_cookie *cache; /* caching cookie */ -#endif - struct afs_cache_vlocation vldb; /* volume information DB record */ - struct afs_volume *vols[3]; /* volume access record pointer (index by type) */ - wait_queue_head_t waitq; /* status change waitqueue */ - time_t update_at; /* time at which record should be updated */ - spinlock_t lock; /* access lock */ - afs_vlocation_state_t state; /* volume location state */ - unsigned short upd_rej_cnt; /* ENOMEDIUM count during update */ - unsigned short upd_busy_cnt; /* EBUSY count during update */ - bool valid; /* T if valid */ +struct afs_endpoint_state { + struct rcu_head rcu; + struct afs_addr_list *addresses; /* The addresses being probed */ + unsigned long responsive_set; /* Bitset of responsive endpoints */ + unsigned long failed_set; /* Bitset of endpoints we failed to probe */ + refcount_t ref; + unsigned int server_id; /* Debug ID of server */ + unsigned int probe_seq; /* Probe sequence (from server::probe_counter) */ + atomic_t nr_probing; /* Number of outstanding probes */ + unsigned int rtt; /* Best RTT in uS (or UINT_MAX) */ + s32 abort_code; + short error; + unsigned long flags; +#define AFS_ESTATE_RESPONDED 0 /* Set if the server responded */ +#define AFS_ESTATE_SUPERSEDED 1 /* Set if this record has been superseded */ +#define AFS_ESTATE_IS_YFS 2 /* Set if probe upgraded to YFS */ +#define AFS_ESTATE_NOT_YFS 3 /* Set if probe didn't upgrade to YFS */ +#define AFS_ESTATE_LOCAL_FAILURE 4 /* Set if there was a local failure (eg. ENOMEM) */ }; /* - * AFS fileserver record + * Record of fileserver with which we're actively communicating. */ struct afs_server { - atomic_t usage; - time_t time_of_death; /* time at which put reduced usage to 0 */ - struct in_addr addr; /* server address */ - struct afs_cell *cell; /* cell in which server resides */ - struct list_head link; /* link in cell's server list */ - struct list_head grave; /* link in master graveyard list */ - struct rb_node master_rb; /* link in master by-addr tree */ - struct rw_semaphore sem; /* access lock */ + struct rcu_head rcu; + union { + uuid_t uuid; /* Server ID */ + struct afs_uuid _uuid; + }; + + struct afs_cell *cell; /* Cell to which belongs (pins ref) */ + struct rb_node uuid_rb; /* Link in cell->fs_servers */ + struct list_head probe_link; /* Link in net->fs_probe_* */ + struct hlist_node proc_link; /* Link in net->fs_proc */ + struct list_head volumes; /* RCU list of afs_server_entry objects */ + struct work_struct destroyer; /* Work item to try and destroy a server */ + struct timer_list timer; /* Management timer */ + struct mutex cm_token_lock; /* Lock governing creation of appdata */ + struct krb5_buffer cm_rxgk_appdata; /* Appdata to be included in RESPONSE packet */ + time64_t unuse_time; /* Time at which last unused */ + unsigned long flags; +#define AFS_SERVER_FL_RESPONDING 0 /* The server is responding */ +#define AFS_SERVER_FL_UPDATING 1 +#define AFS_SERVER_FL_NEEDS_UPDATE 2 /* Fileserver address list is out of date */ +#define AFS_SERVER_FL_UNCREATED 3 /* The record needs creating */ +#define AFS_SERVER_FL_CREATING 4 /* The record is being created */ +#define AFS_SERVER_FL_EXPIRED 5 /* The record has expired */ +#define AFS_SERVER_FL_NOT_FOUND 6 /* VL server says no such server */ +#define AFS_SERVER_FL_VL_FAIL 7 /* Failed to access VL server */ +#define AFS_SERVER_FL_MAY_HAVE_CB 8 /* May have callbacks on this fileserver */ +#define AFS_SERVER_FL_IS_YFS 16 /* Server is YFS not AFS */ +#define AFS_SERVER_FL_NO_IBULK 17 /* Fileserver doesn't support FS.InlineBulkStatus */ +#define AFS_SERVER_FL_NO_RM2 18 /* Fileserver doesn't support YFS.RemoveFile2 */ +#define AFS_SERVER_FL_HAS_FS64 19 /* Fileserver supports FS.{Fetch,Store}Data64 */ +#define AFS_SERVER_FL_NO_RENAME2 20 /* YFS Fileserver doesn't support enhanced rename */ + refcount_t ref; /* Object refcount */ + atomic_t active; /* Active user count */ + u32 addr_version; /* Address list version */ + u16 service_id; /* Service ID we're using. */ + short create_error; /* Creation error */ + unsigned int rtt; /* Server's current RTT in uS */ + unsigned int debug_id; /* Debugging ID for traces */ /* file service access */ - struct rb_root fs_vnodes; /* vnodes backed by this server (ordered by FID) */ - unsigned long fs_act_jif; /* time at which last activity occurred */ - unsigned long fs_dead_jif; /* time at which no longer to be considered dead */ - spinlock_t fs_lock; /* access lock */ - int fs_state; /* 0 or reason FS currently marked dead (-errno) */ + rwlock_t fs_lock; /* access lock */ + + /* Probe state */ + struct afs_endpoint_state __rcu *endpoint_state; /* Latest endpoint/probe state */ + unsigned long probed_at; /* Time last probe was dispatched (jiffies) */ + wait_queue_head_t probe_wq; + unsigned int probe_counter; /* Number of probes issued */ + spinlock_t probe_lock; +}; - /* callback promise management */ - struct rb_root cb_promises; /* vnode expiration list (ordered earliest first) */ - struct delayed_work cb_updater; /* callback updater */ - struct delayed_work cb_break_work; /* collected break dispatcher */ - wait_queue_head_t cb_break_waitq; /* space available in cb_break waitqueue */ - spinlock_t cb_lock; /* access lock */ - struct afs_callback cb_break[64]; /* ring of callbacks awaiting breaking */ - atomic_t cb_break_n; /* number of pending breaks */ - u8 cb_break_head; /* head of callback breaking ring */ - u8 cb_break_tail; /* tail of callback breaking ring */ +enum afs_ro_replicating { + AFS_RO_NOT_REPLICATING, /* Not doing replication */ + AFS_RO_REPLICATING_USE_OLD, /* Replicating; use old version */ + AFS_RO_REPLICATING_USE_NEW, /* Replicating; switch to new version */ +} __mode(byte); + +/* + * Replaceable volume server list. + */ +struct afs_server_entry { + struct afs_server *server; + struct afs_volume *volume; + struct list_head slink; /* Link in server->volumes */ + time64_t cb_expires_at; /* Time at which volume-level callback expires */ + unsigned long flags; +#define AFS_SE_EXCLUDED 0 /* Set if server is to be excluded in rotation */ +#define AFS_SE_VOLUME_OFFLINE 1 /* Set if volume offline notice given */ +#define AFS_SE_VOLUME_BUSY 2 /* Set if volume busy notice given */ +}; + +struct afs_server_list { + struct rcu_head rcu; + refcount_t usage; + bool attached; /* T if attached to servers */ + enum afs_ro_replicating ro_replicating; /* RW->RO update (probably) in progress */ + unsigned char nr_servers; + unsigned short vnovol_mask; /* Servers to be skipped due to VNOVOL */ + unsigned int seq; /* Set to ->servers_seq when installed */ + rwlock_t lock; + struct afs_server_entry servers[]; }; /* - * AFS volume access record + * Live AFS volume management. */ struct afs_volume { - atomic_t usage; - struct afs_cell *cell; /* cell to which belongs (unrefd ptr) */ - struct afs_vlocation *vlocation; /* volume location */ + struct rcu_head rcu; + afs_volid_t vid; /* The volume ID of this volume */ + afs_volid_t vids[AFS_MAXTYPES]; /* All associated volume IDs */ + refcount_t ref; + unsigned int debug_id; /* Debugging ID for traces */ + time64_t update_at; /* Time at which to next update */ + struct afs_cell *cell; /* Cell to which belongs (pins ref) */ + struct rb_node cell_node; /* Link in cell->volumes */ + struct hlist_node proc_link; /* Link in cell->proc_volumes */ + struct super_block __rcu *sb; /* Superblock on which inodes reside */ + struct work_struct destructor; /* Deferred destructor */ + unsigned long flags; +#define AFS_VOLUME_NEEDS_UPDATE 0 /* - T if an update needs performing */ +#define AFS_VOLUME_UPDATING 1 /* - T if an update is in progress */ +#define AFS_VOLUME_WAIT 2 /* - T if users must wait for update */ +#define AFS_VOLUME_DELETED 3 /* - T if volume appears deleted */ +#define AFS_VOLUME_MAYBE_NO_IBULK 4 /* - T if some servers don't have InlineBulkStatus */ +#define AFS_VOLUME_RM_TREE 5 /* - Set if volume removed from cell->volumes */ #ifdef CONFIG_AFS_FSCACHE - struct fscache_cookie *cache; /* caching cookie */ + struct fscache_volume *cache; /* Caching cookie */ #endif - afs_volid_t vid; /* volume ID */ + struct afs_server_list __rcu *servers; /* List of servers on which volume resides */ + rwlock_t servers_lock; /* Lock for ->servers */ + unsigned int servers_seq; /* Incremented each time ->servers changes */ + + /* RO release tracking */ + struct mutex volsync_lock; /* Time/state evaluation lock */ + time64_t creation_time; /* Volume creation time (or TIME64_MIN) */ + time64_t update_time; /* Volume update time (or TIME64_MIN) */ + + /* Callback management */ + struct mutex cb_check_lock; /* Lock to control race to check after v_break */ + time64_t cb_expires_at; /* Earliest volume callback expiry time */ + atomic_t cb_ro_snapshot; /* RO volume update-from-snapshot counter */ + atomic_t cb_v_break; /* Volume-break event counter. */ + atomic_t cb_v_check; /* Volume-break has-been-checked counter. */ + atomic_t cb_scrub; /* Scrub-all-data event counter. */ + rwlock_t cb_v_break_lock; + struct rw_semaphore open_mmaps_lock; + struct list_head open_mmaps; /* List of vnodes that are mmapped */ + afs_voltype_t type; /* type of volume */ char type_force; /* force volume type (suppress R/O -> R/W) */ - unsigned short nservers; /* number of server slots filled */ - unsigned short rjservers; /* number of servers discarded due to -ENOMEDIUM */ - struct afs_server *servers[8]; /* servers on which volume resides (ordered) */ - struct rw_semaphore server_sem; /* lock for accessing current server */ - struct backing_dev_info bdi; + u8 name_len; + u8 name[AFS_MAXVOLNAME + 1]; /* NUL-padded volume name */ }; -/* - * vnode catalogue entry - */ -struct afs_cache_vnode { - afs_vnodeid_t vnode_id; /* vnode ID */ - unsigned vnode_unique; /* vnode ID uniquifier */ - afs_dataversion_t data_version; /* data version */ +enum afs_lock_state { + AFS_VNODE_LOCK_NONE, /* The vnode has no lock on the server */ + AFS_VNODE_LOCK_WAITING_FOR_CB, /* We're waiting for the server to break the callback */ + AFS_VNODE_LOCK_SETTING, /* We're asking the server for a lock */ + AFS_VNODE_LOCK_GRANTED, /* We have a lock on the server */ + AFS_VNODE_LOCK_EXTENDING, /* We're extending a lock on the server */ + AFS_VNODE_LOCK_NEED_UNLOCK, /* We need to unlock on the server */ + AFS_VNODE_LOCK_UNLOCKING, /* We're telling the server to unlock */ + AFS_VNODE_LOCK_DELETED, /* The vnode has been deleted whilst we have a lock */ }; /* - * AFS inode private data + * AFS inode private data. + * + * Note that afs_alloc_inode() *must* reset anything that could incorrectly + * leak from one inode to another. */ struct afs_vnode { - struct inode vfs_inode; /* the VFS's inode record */ - + struct netfs_inode netfs; /* Netfslib context and vfs inode */ struct afs_volume *volume; /* volume on which vnode resides */ - struct afs_server *server; /* server currently supplying this file */ struct afs_fid fid; /* the file identifier for this inode */ struct afs_file_status status; /* AFS status info for this file */ -#ifdef CONFIG_AFS_FSCACHE - struct fscache_cookie *cache; /* caching cookie */ -#endif - struct afs_permits *permits; /* cache of permits so far obtained */ - struct mutex permits_lock; /* lock for altering permits list */ - struct mutex validate_lock; /* lock for validating this vnode */ - wait_queue_head_t update_waitq; /* status fetch waitqueue */ - int update_cnt; /* number of outstanding ops that will update the - * status */ - spinlock_t writeback_lock; /* lock for writebacks */ + afs_dataversion_t invalid_before; /* Child dentries are invalid before this */ + struct afs_permits __rcu *permit_cache; /* cache of permits so far obtained */ + struct list_head io_lock_waiters; /* Threads waiting for the I/O lock */ + struct rw_semaphore validate_lock; /* lock for validating this vnode */ + struct rw_semaphore rmdir_lock; /* Lock for rmdir vs sillyrename */ + struct key *silly_key; /* Silly rename key */ + spinlock_t wb_lock; /* lock for wb_keys */ spinlock_t lock; /* waitqueue/flags lock */ unsigned long flags; -#define AFS_VNODE_CB_BROKEN 0 /* set if vnode's callback was broken */ +#define AFS_VNODE_IO_LOCK 0 /* Set if the I/O serialisation lock is held */ #define AFS_VNODE_UNSET 1 /* set if vnode attributes not yet set */ -#define AFS_VNODE_MODIFIED 2 /* set if vnode's data modified */ +#define AFS_VNODE_DIR_VALID 2 /* Set if dir contents are valid */ #define AFS_VNODE_ZAP_DATA 3 /* set if vnode's data should be invalidated */ #define AFS_VNODE_DELETED 4 /* set if vnode deleted on server */ #define AFS_VNODE_MOUNTPOINT 5 /* set if vnode is a mountpoint symlink */ -#define AFS_VNODE_LOCKING 6 /* set if waiting for lock on vnode */ -#define AFS_VNODE_READLOCKED 7 /* set if vnode is read-locked on the server */ -#define AFS_VNODE_WRITELOCKED 8 /* set if vnode is write-locked on the server */ -#define AFS_VNODE_UNLOCKING 9 /* set if vnode is being unlocked on the server */ -#define AFS_VNODE_AUTOCELL 10 /* set if Vnode is an auto mount point */ -#define AFS_VNODE_PSEUDODIR 11 /* set if Vnode is a pseudo directory */ - - long acl_order; /* ACL check count (callback break count) */ - - struct list_head writebacks; /* alterations in pagecache that need writing */ +#define AFS_VNODE_PSEUDODIR 7 /* set if Vnode is a pseudo directory */ +#define AFS_VNODE_NEW_CONTENT 8 /* Set if file has new content (create/trunc-0) */ +#define AFS_VNODE_SILLY_DELETED 9 /* Set if file has been silly-deleted */ +#define AFS_VNODE_MODIFYING 10 /* Set if we're performing a modification op */ +#define AFS_VNODE_DIR_READ 11 /* Set if we've read a dir's contents */ + + struct folio_queue *directory; /* Directory contents */ + struct list_head wb_keys; /* List of keys available for writeback */ struct list_head pending_locks; /* locks waiting to be granted */ struct list_head granted_locks; /* locks granted on this file */ struct delayed_work lock_work; /* work to be done in locking */ - struct key *unlock_key; /* key to be used in unlocking */ + struct key *lock_key; /* Key to be used in lock ops */ + ktime_t locked_at; /* Time at which lock obtained */ + enum afs_lock_state lock_state : 8; + afs_lock_type_t lock_type : 8; + unsigned int directory_size; /* Amount of space in ->directory */ /* outstanding callback notification on this file */ - struct rb_node server_rb; /* link in server->fs_vnodes */ - struct rb_node cb_promise; /* link in server->cb_promises */ - struct work_struct cb_broken_work; /* work to be done on callback break */ - time_t cb_expires; /* time at which callback expires */ - time_t cb_expires_at; /* time used to order cb_promise */ - unsigned cb_version; /* callback version */ - unsigned cb_expiry; /* callback expiry time */ - afs_callback_type_t cb_type; /* type of callback */ - bool cb_promised; /* true if promise still holds */ + struct work_struct cb_work; /* Work for mmap'd files */ + struct list_head cb_mmap_link; /* Link in cell->fs_open_mmaps */ + void *cb_server; /* Server with callback/filelock */ + atomic_t cb_nr_mmap; /* Number of mmaps */ + unsigned int cb_ro_snapshot; /* RO volume release counter on ->volume */ + unsigned int cb_scrub; /* Scrub counter on ->volume */ + unsigned int cb_break; /* Break counter on vnode */ + unsigned int cb_v_check; /* Break check counter on ->volume */ + seqlock_t cb_lock; /* Lock for ->cb_server, ->status, ->cb_*break */ + + atomic64_t cb_expires_at; /* time at which callback expires */ +#define AFS_NO_CB_PROMISE TIME64_MIN }; +static inline struct fscache_cookie *afs_vnode_cache(struct afs_vnode *vnode) +{ +#ifdef CONFIG_AFS_FSCACHE + return netfs_i_cookie(&vnode->netfs); +#else + return NULL; +#endif +} + +static inline void afs_vnode_set_cache(struct afs_vnode *vnode, + struct fscache_cookie *cookie) +{ +#ifdef CONFIG_AFS_FSCACHE + vnode->netfs.cache = cookie; + if (cookie) + mapping_set_release_always(vnode->netfs.inode.i_mapping); +#endif +} + /* * cached security record for one user's attempt to access a vnode */ struct afs_permit { struct key *key; /* RxRPC ticket holding a security context */ - afs_access_t access_mask; /* access mask for this key */ + afs_access_t access; /* CallerAccess value for this key */ }; /* - * cache of security records from attempts to access a vnode + * Immutable cache of CallerAccess records from attempts to access vnodes. + * These may be shared between multiple vnodes. */ struct afs_permits { - struct rcu_head rcu; /* disposal procedure */ - int count; /* number of records */ - struct afs_permit permits[0]; /* the permits so far examined */ + struct rcu_head rcu; + struct hlist_node hash_node; /* Link in hash */ + unsigned long h; /* Hash value for this permit list */ + refcount_t usage; + unsigned short nr_permits; /* Number of records */ + bool invalidated; /* Invalidated due to key change */ + struct afs_permit permits[] __counted_by(nr_permits); /* List of permits sorted by key pointer */ +}; + +/* + * Error prioritisation and accumulation. + */ +struct afs_error { + s32 abort_code; /* Cumulative abort code */ + short error; /* Cumulative error */ + bool responded; /* T if server responded */ + bool aborted; /* T if ->error is from an abort */ }; /* - * record of one of a system's set of network interfaces + * Cursor for iterating over a set of volume location servers. */ -struct afs_interface { - struct in_addr address; /* IPv4 address bound to interface */ - struct in_addr netmask; /* netmask applied to address */ - unsigned mtu; /* MTU of interface */ +struct afs_vl_cursor { + struct afs_cell *cell; /* The cell we're querying */ + struct afs_vlserver_list *server_list; /* Current server list (pins ref) */ + struct afs_vlserver *server; /* Server on which this resides */ + struct afs_addr_list *alist; /* Current address list (pins ref) */ + struct key *key; /* Key for the server */ + unsigned long untried_servers; /* Bitmask of untried servers */ + unsigned long addr_tried; /* Tried addresses */ + struct afs_error cumul_error; /* Cumulative error */ + unsigned int debug_id; + s32 call_abort_code; + short call_error; /* Error from single call */ + short server_index; /* Current server */ + signed char addr_index; /* Current address */ + unsigned short flags; +#define AFS_VL_CURSOR_STOP 0x0001 /* Set to cease iteration */ +#define AFS_VL_CURSOR_RETRY 0x0002 /* Set to do a retry */ +#define AFS_VL_CURSOR_RETRIED 0x0004 /* Set if started a retry */ + short nr_iterations; /* Number of server iterations */ + bool call_responded; /* T if the current address responded */ +}; + +/* + * Fileserver state tracking for an operation. An array of these is kept, + * indexed by server index. + */ +struct afs_server_state { + /* Tracking of fileserver probe state. Other operations may interfere + * by probing a fileserver when accessing other volumes. + */ + unsigned int probe_seq; + unsigned long untried_addrs; /* Addresses we haven't tried yet */ + struct wait_queue_entry probe_waiter; + struct afs_endpoint_state *endpoint_state; /* Endpoint state being monitored */ }; /* - * UUID definition [internet draft] - * - the timestamp is a 60-bit value, split 32/16/12, and goes in 100ns - * increments since midnight 15th October 1582 - * - add AFS_UUID_TO_UNIX_TIME to convert unix time in 100ns units to UUID - * time - * - the clock sequence is a 14-bit counter to avoid duplicate times + * Fileserver operation methods. */ -struct afs_uuid { - u32 time_low; /* low part of timestamp */ - u16 time_mid; /* mid part of timestamp */ - u16 time_hi_and_version; /* high part of timestamp and version */ -#define AFS_UUID_TO_UNIX_TIME 0x01b21dd213814000ULL -#define AFS_UUID_TIMEHI_MASK 0x0fff -#define AFS_UUID_VERSION_TIME 0x1000 /* time-based UUID */ -#define AFS_UUID_VERSION_NAME 0x3000 /* name-based UUID */ -#define AFS_UUID_VERSION_RANDOM 0x4000 /* (pseudo-)random generated UUID */ - u8 clock_seq_hi_and_reserved; /* clock seq hi and variant */ -#define AFS_UUID_CLOCKHI_MASK 0x3f -#define AFS_UUID_VARIANT_STD 0x80 - u8 clock_seq_low; /* clock seq low */ - u8 node[6]; /* spatially unique node ID (MAC addr) */ +struct afs_operation_ops { + void (*issue_afs_rpc)(struct afs_operation *op); + void (*issue_yfs_rpc)(struct afs_operation *op); + void (*success)(struct afs_operation *op); + void (*aborted)(struct afs_operation *op); + void (*failed)(struct afs_operation *op); + void (*edit_dir)(struct afs_operation *op); + void (*put)(struct afs_operation *op); }; +struct afs_vnode_param { + struct afs_vnode *vnode; + struct afs_fid fid; /* Fid to access */ + struct afs_status_cb scb; /* Returned status and callback promise */ + afs_dataversion_t dv_before; /* Data version before the call */ + unsigned int cb_break_before; /* cb_break before the call */ + u8 dv_delta; /* Expected change in data version */ + bool put_vnode:1; /* T if we have a ref on the vnode */ + bool need_io_lock:1; /* T if we need the I/O lock on this */ + bool update_ctime:1; /* Need to update the ctime */ + bool set_size:1; /* Must update i_size */ + bool op_unlinked:1; /* True if file was unlinked by op */ + bool speculative:1; /* T if speculative status fetch (no vnode lock) */ + bool modification:1; /* Set if the content gets modified */ +}; + +/* + * Fileserver operation wrapper, handling server and address rotation + * asynchronously. May make simultaneous calls to multiple servers. + */ +struct afs_operation { + struct afs_net *net; /* Network namespace */ + struct key *key; /* Key for the cell */ + const struct afs_call_type *type; /* Type of call done */ + const struct afs_operation_ops *ops; + + /* Parameters/results for the operation */ + struct afs_volume *volume; /* Volume being accessed */ + struct afs_vnode_param file[2]; + struct afs_vnode_param *more_files; + struct afs_volsync pre_volsync; /* Volsync before op */ + struct afs_volsync volsync; /* Volsync returned by op */ + struct dentry *dentry; /* Dentry to be altered */ + struct dentry *dentry_2; /* Second dentry to be altered */ + struct timespec64 mtime; /* Modification time to record */ + struct timespec64 ctime; /* Change time to set */ + struct afs_error cumul_error; /* Cumulative error */ + short nr_files; /* Number of entries in file[], more_files */ + unsigned int debug_id; + + unsigned int cb_v_break; /* Volume break counter before op */ + + union { + struct { + int which; /* Which ->file[] to fetch for */ + } fetch_status; + struct { + int reason; /* enum afs_edit_dir_reason */ + mode_t mode; + const char *symlink; + } create; + struct { + bool need_rehash; + } unlink; + struct { + struct dentry *rehash; + struct dentry *tmp; + unsigned int rename_flags; + bool new_negative; + } rename; + struct { + struct netfs_io_subrequest *subreq; + } fetch; + struct { + afs_lock_type_t type; + } lock; + struct { + struct iov_iter *write_iter; + loff_t pos; + loff_t size; + loff_t i_size; + } store; + struct { + struct iattr *attr; + loff_t old_i_size; + } setattr; + struct afs_acl *acl; + struct yfs_acl *yacl; + struct { + struct afs_volume_status vs; + struct kstatfs *buf; + } volstatus; + }; + + /* Fileserver iteration state */ + struct afs_server_list *server_list; /* Current server list (pins ref) */ + struct afs_server *server; /* Server we're using (ref pinned by server_list) */ + struct afs_endpoint_state *estate; /* Current endpoint state (doesn't pin ref) */ + struct afs_server_state *server_states; /* States of the servers involved */ + struct afs_call *call; + unsigned long untried_servers; /* Bitmask of untried servers */ + unsigned long addr_tried; /* Tried addresses */ + s32 call_abort_code; /* Abort code from single call */ + short call_error; /* Error from single call */ + short server_index; /* Current server */ + short nr_iterations; /* Number of server iterations */ + signed char addr_index; /* Current address */ + bool call_responded; /* T if the current address responded */ + + unsigned int flags; +#define AFS_OPERATION_STOP 0x0001 /* Set to cease iteration */ +#define AFS_OPERATION_VBUSY 0x0002 /* Set if seen VBUSY */ +#define AFS_OPERATION_VMOVED 0x0004 /* Set if seen VMOVED */ +#define AFS_OPERATION_VNOVOL 0x0008 /* Set if seen VNOVOL */ +#define AFS_OPERATION_CUR_ONLY 0x0010 /* Set if current server only (file lock held) */ +#define AFS_OPERATION_NO_VSLEEP 0x0020 /* Set to prevent sleep on VBUSY, VOFFLINE, ... */ +#define AFS_OPERATION_UNINTR 0x0040 /* Set if op is uninterruptible */ +#define AFS_OPERATION_DOWNGRADE 0x0080 /* Set to retry with downgraded opcode */ +#define AFS_OPERATION_LOCK_0 0x0100 /* Set if have io_lock on file[0] */ +#define AFS_OPERATION_LOCK_1 0x0200 /* Set if have io_lock on file[1] */ +#define AFS_OPERATION_TRIED_ALL 0x0400 /* Set if we've tried all the fileservers */ +#define AFS_OPERATION_RETRY_SERVER 0x0800 /* Set if we should retry the current server */ +#define AFS_OPERATION_DIR_CONFLICT 0x1000 /* Set if we detected a 3rd-party dir change */ +#define AFS_OPERATION_ASYNC 0x2000 /* Set if should run asynchronously */ +}; + +/* + * Cache auxiliary data. + */ +struct afs_vnode_cache_aux { + __be64 data_version; +} __packed; + +static inline void afs_set_cache_aux(struct afs_vnode *vnode, + struct afs_vnode_cache_aux *aux) +{ + aux->data_version = cpu_to_be64(vnode->status.data_version); +} + +static inline void afs_invalidate_cache(struct afs_vnode *vnode, unsigned int flags) +{ + struct afs_vnode_cache_aux aux; + + afs_set_cache_aux(vnode, &aux); + fscache_invalidate(afs_vnode_cache(vnode), &aux, + i_size_read(&vnode->netfs.inode), flags); +} + +/* + * Directory iteration management. + */ +struct afs_dir_iter { + struct afs_vnode *dvnode; + union afs_xdr_dir_block *block; + struct folio_queue *fq; + unsigned int fpos; + int fq_slot; + unsigned int loop_check; + u8 nr_slots; + u8 bucket; + unsigned int prev_entry; +}; + +#include <trace/events/afs.h> + /*****************************************************************************/ /* - * cache.c + * addr_list.c */ -#ifdef CONFIG_AFS_FSCACHE -extern struct fscache_netfs afs_cache_netfs; -extern struct fscache_cookie_def afs_cell_cache_index_def; -extern struct fscache_cookie_def afs_vlocation_cache_index_def; -extern struct fscache_cookie_def afs_volume_cache_index_def; -extern struct fscache_cookie_def afs_vnode_cache_index_def; -#else -#define afs_cell_cache_index_def (*(struct fscache_cookie_def *) NULL) -#define afs_vlocation_cache_index_def (*(struct fscache_cookie_def *) NULL) -#define afs_volume_cache_index_def (*(struct fscache_cookie_def *) NULL) -#define afs_vnode_cache_index_def (*(struct fscache_cookie_def *) NULL) -#endif +struct afs_addr_list *afs_get_addrlist(struct afs_addr_list *alist, enum afs_alist_trace reason); +extern struct afs_addr_list *afs_alloc_addrlist(unsigned int nr); +extern void afs_put_addrlist(struct afs_addr_list *alist, enum afs_alist_trace reason); +extern struct afs_vlserver_list *afs_parse_text_addrs(struct afs_net *, + const char *, size_t, char, + unsigned short, unsigned short); +bool afs_addr_list_same(const struct afs_addr_list *a, + const struct afs_addr_list *b); +extern struct afs_vlserver_list *afs_dns_query(struct afs_cell *, time64_t *); + +extern int afs_merge_fs_addr4(struct afs_net *net, struct afs_addr_list *addr, + __be32 xdr, u16 port); +extern int afs_merge_fs_addr6(struct afs_net *net, struct afs_addr_list *addr, + __be32 *xdr, u16 port); +void afs_set_peer_appdata(struct afs_server *server, + struct afs_addr_list *old_alist, + struct afs_addr_list *new_alist); + +/* + * addr_prefs.c + */ +int afs_proc_addr_prefs_write(struct file *file, char *buf, size_t size); +void afs_get_address_preferences_rcu(struct afs_net *net, struct afs_addr_list *alist); +void afs_get_address_preferences(struct afs_net *net, struct afs_addr_list *alist); /* * callback.c */ +extern void afs_invalidate_mmap_work(struct work_struct *); extern void afs_init_callback_state(struct afs_server *); -extern void afs_broken_callback_work(struct work_struct *); -extern void afs_break_callbacks(struct afs_server *, size_t, - struct afs_callback[]); -extern void afs_discard_callback_on_delete(struct afs_vnode *); -extern void afs_give_up_callback(struct afs_vnode *); -extern void afs_dispatch_give_up_callbacks(struct work_struct *); -extern void afs_flush_callback_breaks(struct afs_server *); -extern int __init afs_callback_update_init(void); -extern void afs_callback_update_kill(void); +extern void __afs_break_callback(struct afs_vnode *, enum afs_cb_break_reason); +extern void afs_break_callback(struct afs_vnode *, enum afs_cb_break_reason); +extern void afs_break_callbacks(struct afs_server *, size_t, struct afs_callback_break *); + +static inline unsigned int afs_calc_vnode_cb_break(struct afs_vnode *vnode) +{ + return vnode->cb_break + vnode->cb_ro_snapshot + vnode->cb_scrub; +} + +static inline bool afs_cb_is_broken(unsigned int cb_break, + const struct afs_vnode *vnode) +{ + return cb_break != (vnode->cb_break + + atomic_read(&vnode->volume->cb_ro_snapshot) + + atomic_read(&vnode->volume->cb_scrub)); +} /* * cell.c */ -extern struct rw_semaphore afs_proc_cells_sem; -extern struct list_head afs_proc_cells; - -#define afs_get_cell(C) do { atomic_inc(&(C)->usage); } while(0) -extern int afs_cell_init(char *); -extern struct afs_cell *afs_cell_create(const char *, unsigned, char *, bool); -extern struct afs_cell *afs_cell_lookup(const char *, unsigned, bool); -extern struct afs_cell *afs_grab_cell(struct afs_cell *); -extern void afs_put_cell(struct afs_cell *); -extern void afs_cell_purge(void); +extern int afs_cell_init(struct afs_net *, const char *); +extern struct afs_cell *afs_find_cell(struct afs_net *, const char *, unsigned, + enum afs_cell_trace); +enum afs_lookup_cell_for { + AFS_LOOKUP_CELL_DYNROOT, + AFS_LOOKUP_CELL_MOUNTPOINT, + AFS_LOOKUP_CELL_DIRECT_MOUNT, + AFS_LOOKUP_CELL_PRELOAD, + AFS_LOOKUP_CELL_ROOTCELL, + AFS_LOOKUP_CELL_ALIAS_CHECK, +}; +struct afs_cell *afs_lookup_cell(struct afs_net *net, + const char *name, unsigned int namesz, + const char *vllist, + enum afs_lookup_cell_for reason, + enum afs_cell_trace trace); +extern struct afs_cell *afs_use_cell(struct afs_cell *, enum afs_cell_trace); +void afs_unuse_cell(struct afs_cell *cell, enum afs_cell_trace reason); +extern struct afs_cell *afs_get_cell(struct afs_cell *, enum afs_cell_trace); +extern void afs_see_cell(struct afs_cell *, enum afs_cell_trace); +extern void afs_put_cell(struct afs_cell *, enum afs_cell_trace); +extern void afs_queue_cell(struct afs_cell *, enum afs_cell_trace); +void afs_set_cell_timer(struct afs_cell *cell, unsigned int delay_secs); +extern void __net_exit afs_cell_purge(struct afs_net *); /* * cmservice.c @@ -483,27 +1079,92 @@ extern void afs_cell_purge(void); extern bool afs_cm_incoming_call(struct afs_call *); /* + * cm_security.c + */ +void afs_process_oob_queue(struct work_struct *work); +#ifdef CONFIG_RXGK +int afs_create_token_key(struct afs_net *net, struct socket *socket); +#else +static inline int afs_create_token_key(struct afs_net *net, struct socket *socket) +{ + return 0; +} +#endif + +/* * dir.c */ +extern const struct file_operations afs_dir_file_operations; extern const struct inode_operations afs_dir_inode_operations; +extern const struct address_space_operations afs_dir_aops; extern const struct dentry_operations afs_fs_dentry_operations; -extern const struct file_operations afs_dir_file_operations; + +ssize_t afs_read_single(struct afs_vnode *dvnode, struct file *file); +ssize_t afs_read_dir(struct afs_vnode *dvnode, struct file *file) + __acquires(&dvnode->validate_lock); +extern void afs_d_release(struct dentry *); +extern void afs_check_for_remote_deletion(struct afs_operation *); +int afs_single_writepages(struct address_space *mapping, + struct writeback_control *wbc); + +/* + * dir_edit.c + */ +extern void afs_edit_dir_add(struct afs_vnode *, const struct qstr *, struct afs_fid *, + enum afs_edit_dir_reason); +extern void afs_edit_dir_remove(struct afs_vnode *, const struct qstr *, enum afs_edit_dir_reason); +void afs_edit_dir_update(struct afs_vnode *vnode, const struct qstr *name, + struct afs_vnode *new_dvnode, enum afs_edit_dir_reason why); +void afs_mkdir_init_dir(struct afs_vnode *dvnode, struct afs_vnode *parent_vnode); + +/* + * dir_search.c + */ +unsigned int afs_dir_hash_name(const struct qstr *name); +bool afs_dir_init_iter(struct afs_dir_iter *iter, const struct qstr *name); +union afs_xdr_dir_block *afs_dir_find_block(struct afs_dir_iter *iter, size_t block); +int afs_dir_search_bucket(struct afs_dir_iter *iter, const struct qstr *name, + struct afs_fid *_fid); +int afs_dir_search(struct afs_vnode *dvnode, const struct qstr *name, + struct afs_fid *_fid, afs_dataversion_t *_dir_version); + +/* + * dir_silly.c + */ +extern int afs_sillyrename(struct afs_vnode *, struct afs_vnode *, + struct dentry *, struct key *); +extern int afs_silly_iput(struct dentry *, struct inode *); + +/* + * dynroot.c + */ +extern const struct inode_operations afs_dynroot_inode_operations; +extern const struct dentry_operations afs_dynroot_dentry_operations; + +struct inode *afs_dynroot_iget_root(struct super_block *sb); /* * file.c */ -extern const struct address_space_operations afs_fs_aops; +extern const struct address_space_operations afs_file_aops; extern const struct inode_operations afs_file_inode_operations; extern const struct file_operations afs_file_operations; +extern const struct afs_operation_ops afs_fetch_data_operation; +extern const struct netfs_request_ops afs_req_ops; +extern int afs_cache_wb_key(struct afs_vnode *, struct afs_file *); +extern void afs_put_wb_key(struct afs_wb_key *); extern int afs_open(struct inode *, struct file *); extern int afs_release(struct inode *, struct file *); -extern int afs_page_filler(void *, struct page *); +void afs_fetch_data_async_rx(struct work_struct *work); +void afs_fetch_data_immediate_cancel(struct afs_call *call); /* * flock.c */ -extern void __exit afs_kill_lock_manager(void); +extern struct workqueue_struct *afs_lock_manager; + +extern void afs_lock_op_done(struct afs_call *); extern void afs_lock_work(struct work_struct *); extern void afs_lock_may_be_available(struct afs_vnode *); extern int afs_lock(struct file *, int, struct file_lock *); @@ -512,65 +1173,92 @@ extern int afs_flock(struct file *, int, struct file_lock *); /* * fsclient.c */ -extern int afs_fs_fetch_file_status(struct afs_server *, struct key *, - struct afs_vnode *, struct afs_volsync *, - const struct afs_wait_mode *); -extern int afs_fs_give_up_callbacks(struct afs_server *, - const struct afs_wait_mode *); -extern int afs_fs_fetch_data(struct afs_server *, struct key *, - struct afs_vnode *, off_t, size_t, struct page *, - const struct afs_wait_mode *); -extern int afs_fs_create(struct afs_server *, struct key *, - struct afs_vnode *, const char *, umode_t, - struct afs_fid *, struct afs_file_status *, - struct afs_callback *, - const struct afs_wait_mode *); -extern int afs_fs_remove(struct afs_server *, struct key *, - struct afs_vnode *, const char *, bool, - const struct afs_wait_mode *); -extern int afs_fs_link(struct afs_server *, struct key *, struct afs_vnode *, - struct afs_vnode *, const char *, - const struct afs_wait_mode *); -extern int afs_fs_symlink(struct afs_server *, struct key *, - struct afs_vnode *, const char *, const char *, - struct afs_fid *, struct afs_file_status *, - const struct afs_wait_mode *); -extern int afs_fs_rename(struct afs_server *, struct key *, - struct afs_vnode *, const char *, - struct afs_vnode *, const char *, - const struct afs_wait_mode *); -extern int afs_fs_store_data(struct afs_server *, struct afs_writeback *, - pgoff_t, pgoff_t, unsigned, unsigned, - const struct afs_wait_mode *); -extern int afs_fs_setattr(struct afs_server *, struct key *, - struct afs_vnode *, struct iattr *, - const struct afs_wait_mode *); -extern int afs_fs_get_volume_status(struct afs_server *, struct key *, - struct afs_vnode *, - struct afs_volume_status *, - const struct afs_wait_mode *); -extern int afs_fs_set_lock(struct afs_server *, struct key *, - struct afs_vnode *, afs_lock_type_t, - const struct afs_wait_mode *); -extern int afs_fs_extend_lock(struct afs_server *, struct key *, - struct afs_vnode *, - const struct afs_wait_mode *); -extern int afs_fs_release_lock(struct afs_server *, struct key *, - struct afs_vnode *, - const struct afs_wait_mode *); +extern void afs_fs_fetch_status(struct afs_operation *); +extern void afs_fs_fetch_data(struct afs_operation *); +extern void afs_fs_create_file(struct afs_operation *); +extern void afs_fs_make_dir(struct afs_operation *); +extern void afs_fs_remove_file(struct afs_operation *); +extern void afs_fs_remove_dir(struct afs_operation *); +extern void afs_fs_link(struct afs_operation *); +extern void afs_fs_symlink(struct afs_operation *); +extern void afs_fs_rename(struct afs_operation *); +extern void afs_fs_store_data(struct afs_operation *); +extern void afs_fs_setattr(struct afs_operation *); +extern void afs_fs_get_volume_status(struct afs_operation *); +extern void afs_fs_set_lock(struct afs_operation *); +extern void afs_fs_extend_lock(struct afs_operation *); +extern void afs_fs_release_lock(struct afs_operation *); +int afs_fs_give_up_all_callbacks(struct afs_net *net, struct afs_server *server, + struct afs_address *addr, struct key *key); +bool afs_fs_get_capabilities(struct afs_net *net, struct afs_server *server, + struct afs_endpoint_state *estate, unsigned int addr_index, + struct key *key); +extern void afs_fs_inline_bulk_status(struct afs_operation *); + +struct afs_acl { + u32 size; + u8 data[] __counted_by(size); +}; + +extern void afs_fs_fetch_acl(struct afs_operation *); +extern void afs_fs_store_acl(struct afs_operation *); + +/* + * fs_operation.c + */ +extern struct afs_operation *afs_alloc_operation(struct key *, struct afs_volume *); +extern int afs_put_operation(struct afs_operation *); +extern bool afs_begin_vnode_operation(struct afs_operation *); +extern void afs_end_vnode_operation(struct afs_operation *op); +extern void afs_wait_for_operation(struct afs_operation *); +extern int afs_do_sync_operation(struct afs_operation *); + +static inline void afs_op_set_vnode(struct afs_operation *op, unsigned int n, + struct afs_vnode *vnode) +{ + op->file[n].vnode = vnode; + op->file[n].need_io_lock = true; +} + +static inline void afs_op_set_fid(struct afs_operation *op, unsigned int n, + const struct afs_fid *fid) +{ + op->file[n].fid = *fid; +} + +/* + * fs_probe.c + */ +struct afs_endpoint_state *afs_get_endpoint_state(struct afs_endpoint_state *estate, + enum afs_estate_trace where); +void afs_put_endpoint_state(struct afs_endpoint_state *estate, enum afs_estate_trace where); +extern void afs_fileserver_probe_result(struct afs_call *); +int afs_fs_probe_fileserver(struct afs_net *net, struct afs_server *server, + struct afs_addr_list *new_alist, struct key *key); +int afs_wait_for_fs_probes(struct afs_operation *op, struct afs_server_state *states, bool intr); +extern void afs_probe_fileserver(struct afs_net *, struct afs_server *); +extern void afs_fs_probe_dispatcher(struct work_struct *); +int afs_wait_for_one_fs_probe(struct afs_server *server, struct afs_endpoint_state *estate, + unsigned long exclude, bool is_intr); +extern void afs_fs_probe_cleanup(struct afs_net *); /* * inode.c */ -extern struct inode *afs_iget_autocell(struct inode *, const char *, int, - struct key *); -extern struct inode *afs_iget(struct super_block *, struct key *, - struct afs_fid *, struct afs_file_status *, - struct afs_callback *); -extern void afs_zap_data(struct afs_vnode *); -extern int afs_validate(struct afs_vnode *, struct key *); -extern int afs_getattr(struct vfsmount *, struct dentry *, struct kstat *); -extern int afs_setattr(struct dentry *, struct iattr *); +extern const struct afs_operation_ops afs_fetch_status_operation; + +void afs_init_new_symlink(struct afs_vnode *vnode, struct afs_operation *op); +const char *afs_get_link(struct dentry *dentry, struct inode *inode, + struct delayed_call *callback); +int afs_readlink(struct dentry *dentry, char __user *buffer, int buflen); +extern void afs_vnode_commit_status(struct afs_operation *, struct afs_vnode_param *); +extern int afs_fetch_status(struct afs_vnode *, struct key *, bool, afs_access_t *); +extern int afs_ilookup5_test_by_fid(struct inode *, void *); +extern struct inode *afs_iget(struct afs_operation *, struct afs_vnode_param *); +extern struct inode *afs_root_iget(struct super_block *, struct key *); +extern int afs_getattr(struct mnt_idmap *idmap, const struct path *, + struct kstat *, u32, unsigned int); +extern int afs_setattr(struct mnt_idmap *idmap, struct dentry *, struct iattr *); extern void afs_evict_inode(struct inode *); extern int afs_drop_inode(struct inode *); @@ -578,12 +1266,75 @@ extern int afs_drop_inode(struct inode *); * main.c */ extern struct workqueue_struct *afs_wq; -extern struct afs_uuid afs_uuid; +extern int afs_net_id; + +static inline struct afs_net *afs_net(struct net *net) +{ + return net_generic(net, afs_net_id); +} + +static inline struct afs_net *afs_sb2net(struct super_block *sb) +{ + return afs_net(AFS_FS_S(sb)->net_ns); +} + +static inline struct afs_net *afs_d2net(struct dentry *dentry) +{ + return afs_sb2net(dentry->d_sb); +} + +static inline struct afs_net *afs_i2net(struct inode *inode) +{ + return afs_sb2net(inode->i_sb); +} + +static inline struct afs_net *afs_v2net(struct afs_vnode *vnode) +{ + return afs_i2net(&vnode->netfs.inode); +} + +static inline struct afs_net *afs_sock2net(struct sock *sk) +{ + return net_generic(sock_net(sk), afs_net_id); +} + +static inline void __afs_stat(atomic_t *s) +{ + atomic_inc(s); +} + +#define afs_stat_v(vnode, n) __afs_stat(&afs_v2net(vnode)->n) /* * misc.c */ extern int afs_abort_to_error(u32); +extern void afs_prioritise_error(struct afs_error *, int, u32); + +static inline void afs_op_nomem(struct afs_operation *op) +{ + op->cumul_error.error = -ENOMEM; +} + +static inline int afs_op_error(const struct afs_operation *op) +{ + return op->cumul_error.error; +} + +static inline s32 afs_op_abort_code(const struct afs_operation *op) +{ + return op->cumul_error.abort_code; +} + +static inline int afs_op_set_error(struct afs_operation *op, int error) +{ + return op->cumul_error.error = error; +} + +static inline void afs_op_accumulate_error(struct afs_operation *op, int error, s32 abort_code) +{ + afs_prioritise_error(&op->cumul_error, error, abort_code); +} /* * mntpt.c @@ -593,165 +1344,486 @@ extern const struct inode_operations afs_autocell_inode_operations; extern const struct file_operations afs_mntpt_file_operations; extern struct vfsmount *afs_d_automount(struct path *); -extern int afs_mntpt_check_symlink(struct afs_vnode *, struct key *); extern void afs_mntpt_kill_timer(void); /* * proc.c */ -extern int afs_proc_init(void); -extern void afs_proc_cleanup(void); +#ifdef CONFIG_PROC_FS +extern int __net_init afs_proc_init(struct afs_net *); +extern void __net_exit afs_proc_cleanup(struct afs_net *); extern int afs_proc_cell_setup(struct afs_cell *); extern void afs_proc_cell_remove(struct afs_cell *); +extern void afs_put_sysnames(struct afs_sysnames *); +#else +static inline int afs_proc_init(struct afs_net *net) { return 0; } +static inline void afs_proc_cleanup(struct afs_net *net) {} +static inline int afs_proc_cell_setup(struct afs_cell *cell) { return 0; } +static inline void afs_proc_cell_remove(struct afs_cell *cell) {} +static inline void afs_put_sysnames(struct afs_sysnames *sysnames) {} +#endif + +/* + * rotate.c + */ +void afs_clear_server_states(struct afs_operation *op); +extern bool afs_select_fileserver(struct afs_operation *); +extern void afs_dump_edestaddrreq(const struct afs_operation *); /* * rxrpc.c */ -extern int afs_open_socket(void); -extern void afs_close_socket(void); -extern int afs_make_call(struct in_addr *, struct afs_call *, gfp_t, - const struct afs_wait_mode *); -extern struct afs_call *afs_alloc_flat_call(const struct afs_call_type *, +extern struct workqueue_struct *afs_async_calls; + +extern int __net_init afs_open_socket(struct afs_net *); +extern void __net_exit afs_close_socket(struct afs_net *); +extern void afs_charge_preallocation(struct work_struct *); +extern void afs_put_call(struct afs_call *); +void afs_deferred_put_call(struct afs_call *call); +void afs_make_call(struct afs_call *call, gfp_t gfp); +void afs_deliver_to_call(struct afs_call *call); +void afs_wait_for_call_to_complete(struct afs_call *call); +extern struct afs_call *afs_alloc_flat_call(struct afs_net *, + const struct afs_call_type *, size_t, size_t); extern void afs_flat_call_destructor(struct afs_call *); -extern void afs_transfer_reply(struct afs_call *, struct sk_buff *); extern void afs_send_empty_reply(struct afs_call *); extern void afs_send_simple_reply(struct afs_call *, const void *, size_t); -extern int afs_extract_data(struct afs_call *, struct sk_buff *, bool, void *, - size_t); +extern int afs_extract_data(struct afs_call *, bool); +extern int afs_protocol_error(struct afs_call *, enum afs_eproto_cause); + +static inline struct afs_call *afs_get_call(struct afs_call *call, + enum afs_call_trace why) +{ + int r; + + __refcount_inc(&call->ref, &r); + + trace_afs_call(call->debug_id, why, r + 1, + atomic_read(&call->net->nr_outstanding_calls), + __builtin_return_address(0)); + return call; +} + +static inline void afs_see_call(struct afs_call *call, enum afs_call_trace why) +{ + int r = refcount_read(&call->ref); + + trace_afs_call(call->debug_id, why, r, + atomic_read(&call->net->nr_outstanding_calls), + __builtin_return_address(0)); +} + +static inline void afs_make_op_call(struct afs_operation *op, struct afs_call *call, + gfp_t gfp) +{ + struct afs_addr_list *alist = op->estate->addresses; + + op->call = call; + op->type = call->type; + call->op = op; + call->key = op->key; + call->intr = !(op->flags & AFS_OPERATION_UNINTR); + call->peer = rxrpc_kernel_get_peer(alist->addrs[op->addr_index].peer); + call->service_id = op->server->service_id; + afs_make_call(call, gfp); +} + +static inline void afs_extract_begin(struct afs_call *call, void *buf, size_t size) +{ + call->iov_len = size; + call->kvec[0].iov_base = buf; + call->kvec[0].iov_len = size; + iov_iter_kvec(&call->def_iter, ITER_DEST, call->kvec, 1, size); +} + +static inline void afs_extract_to_tmp(struct afs_call *call) +{ + call->iov_len = sizeof(call->tmp); + afs_extract_begin(call, &call->tmp, sizeof(call->tmp)); +} + +static inline void afs_extract_to_tmp64(struct afs_call *call) +{ + call->iov_len = sizeof(call->tmp64); + afs_extract_begin(call, &call->tmp64, sizeof(call->tmp64)); +} + +static inline void afs_extract_discard(struct afs_call *call, size_t size) +{ + call->iov_len = size; + iov_iter_discard(&call->def_iter, ITER_DEST, size); +} + +static inline void afs_extract_to_buf(struct afs_call *call, size_t size) +{ + call->iov_len = size; + afs_extract_begin(call, call->buffer, size); +} + +static inline int afs_transfer_reply(struct afs_call *call) +{ + return afs_extract_data(call, false); +} + +static inline bool afs_check_call_state(struct afs_call *call, + enum afs_call_state state) +{ + return READ_ONCE(call->state) == state; +} + +static inline bool afs_set_call_state(struct afs_call *call, + enum afs_call_state from, + enum afs_call_state to) +{ + bool ok = false; + + spin_lock_bh(&call->state_lock); + if (call->state == from) { + call->state = to; + trace_afs_call_state(call, from, to, 0, 0); + ok = true; + } + spin_unlock_bh(&call->state_lock); + return ok; +} + +static inline void afs_set_call_complete(struct afs_call *call, + int error, u32 remote_abort) +{ + enum afs_call_state state; + bool ok = false; + + spin_lock_bh(&call->state_lock); + state = call->state; + if (state != AFS_CALL_COMPLETE) { + call->abort_code = remote_abort; + call->error = error; + call->state = AFS_CALL_COMPLETE; + trace_afs_call_state(call, state, AFS_CALL_COMPLETE, + error, remote_abort); + ok = true; + } + spin_unlock_bh(&call->state_lock); + if (ok) { + trace_afs_call_done(call); + + /* Asynchronous calls have two refs to release - one from the alloc and + * one queued with the work item - and we can't just deallocate the + * call because the work item may be queued again. + */ + if (call->drop_ref) + afs_put_call(call); + } +} /* * security.c */ +extern void afs_put_permits(struct afs_permits *); extern void afs_clear_permits(struct afs_vnode *); -extern void afs_cache_permit(struct afs_vnode *, struct key *, long); -extern void afs_zap_permits(struct rcu_head *); +extern void afs_cache_permit(struct afs_vnode *, struct key *, unsigned int, + struct afs_status_cb *); extern struct key *afs_request_key(struct afs_cell *); -extern int afs_permission(struct inode *, int); +extern struct key *afs_request_key_rcu(struct afs_cell *); +extern int afs_check_permit(struct afs_vnode *, struct key *, afs_access_t *); +extern int afs_permission(struct mnt_idmap *, struct inode *, int); +extern void __exit afs_clean_up_permit_cache(void); /* * server.c */ extern spinlock_t afs_server_peer_lock; -#define afs_get_server(S) \ -do { \ - _debug("GET SERVER %d", atomic_read(&(S)->usage)); \ - atomic_inc(&(S)->usage); \ -} while(0) +struct afs_server *afs_find_server(const struct rxrpc_peer *peer); +extern struct afs_server *afs_lookup_server(struct afs_cell *, struct key *, const uuid_t *, u32); +extern struct afs_server *afs_get_server(struct afs_server *, enum afs_server_trace); +struct afs_server *afs_use_server(struct afs_server *server, bool activate, + enum afs_server_trace reason); +void afs_unuse_server(struct afs_net *net, struct afs_server *server, + enum afs_server_trace reason); +void afs_unuse_server_notime(struct afs_net *net, struct afs_server *server, + enum afs_server_trace reason); +extern void afs_put_server(struct afs_net *, struct afs_server *, enum afs_server_trace); +void afs_purge_servers(struct afs_cell *cell); +extern void afs_fs_probe_timer(struct timer_list *); +void __net_exit afs_wait_for_servers(struct afs_net *net); +bool afs_check_server_record(struct afs_operation *op, struct afs_server *server, struct key *key); + +static inline void afs_see_server(struct afs_server *server, enum afs_server_trace trace) +{ + int r = refcount_read(&server->ref); + int a = atomic_read(&server->active); + + trace_afs_server(server->debug_id, r, a, trace); -extern struct afs_server *afs_lookup_server(struct afs_cell *, - const struct in_addr *); -extern struct afs_server *afs_find_server(const struct in_addr *); -extern void afs_put_server(struct afs_server *); -extern void __exit afs_purge_servers(void); +} + +static inline void afs_inc_servers_outstanding(struct afs_net *net) +{ + atomic_inc(&net->servers_outstanding); +} + +static inline void afs_dec_servers_outstanding(struct afs_net *net) +{ + if (atomic_dec_and_test(&net->servers_outstanding)) + wake_up_var(&net->servers_outstanding); +} + +static inline bool afs_is_probing_server(struct afs_server *server) +{ + return list_empty(&server->probe_link); +} + +/* + * server_list.c + */ +static inline struct afs_server_list *afs_get_serverlist(struct afs_server_list *slist) +{ + refcount_inc(&slist->usage); + return slist; +} + +extern void afs_put_serverlist(struct afs_net *, struct afs_server_list *); +struct afs_server_list *afs_alloc_server_list(struct afs_volume *volume, + struct key *key, + struct afs_vldb_entry *vldb); +extern bool afs_annotate_server_list(struct afs_server_list *, struct afs_server_list *); +void afs_attach_volume_to_servers(struct afs_volume *volume, struct afs_server_list *slist); +void afs_reattach_volume_to_servers(struct afs_volume *volume, struct afs_server_list *slist, + struct afs_server_list *old); +void afs_detach_volume_from_servers(struct afs_volume *volume, struct afs_server_list *slist); /* * super.c */ -extern int afs_fs_init(void); +extern int __init afs_fs_init(void); extern void afs_fs_exit(void); /* - * use-rtnetlink.c + * validation.c */ -extern int afs_get_ipv4_interfaces(struct afs_interface *, size_t, bool); -extern int afs_get_MAC_address(u8 *, size_t); +bool afs_check_validity(const struct afs_vnode *vnode); +int afs_update_volume_state(struct afs_operation *op); +int afs_validate(struct afs_vnode *vnode, struct key *key); /* * vlclient.c */ -extern int afs_vl_get_entry_by_name(struct in_addr *, struct key *, - const char *, struct afs_cache_vlocation *, - const struct afs_wait_mode *); -extern int afs_vl_get_entry_by_id(struct in_addr *, struct key *, - afs_volid_t, afs_voltype_t, - struct afs_cache_vlocation *, - const struct afs_wait_mode *); +extern struct afs_vldb_entry *afs_vl_get_entry_by_name_u(struct afs_vl_cursor *, + const char *, int); +extern struct afs_addr_list *afs_vl_get_addrs_u(struct afs_vl_cursor *, const uuid_t *); +struct afs_call *afs_vl_get_capabilities(struct afs_net *net, + struct afs_addr_list *alist, + unsigned int addr_index, + struct key *key, + struct afs_vlserver *server, + unsigned int server_index); +extern struct afs_addr_list *afs_yfsvl_get_endpoints(struct afs_vl_cursor *, const uuid_t *); +extern char *afs_yfsvl_get_cell_name(struct afs_vl_cursor *); /* - * vlocation.c + * vl_alias.c */ -#define afs_get_vlocation(V) do { atomic_inc(&(V)->usage); } while(0) +extern int afs_cell_detect_alias(struct afs_cell *, struct key *); -extern int __init afs_vlocation_update_init(void); -extern struct afs_vlocation *afs_vlocation_lookup(struct afs_cell *, - struct key *, - const char *, size_t); -extern void afs_put_vlocation(struct afs_vlocation *); -extern void afs_vlocation_purge(void); +/* + * vl_probe.c + */ +extern void afs_vlserver_probe_result(struct afs_call *); +extern int afs_send_vl_probes(struct afs_net *, struct key *, struct afs_vlserver_list *); +extern int afs_wait_for_vl_probes(struct afs_vlserver_list *, unsigned long); /* - * vnode.c + * vl_rotate.c */ -static inline struct afs_vnode *AFS_FS_I(struct inode *inode) +extern bool afs_begin_vlserver_operation(struct afs_vl_cursor *, + struct afs_cell *, struct key *); +extern bool afs_select_vlserver(struct afs_vl_cursor *); +extern bool afs_select_current_vlserver(struct afs_vl_cursor *); +extern int afs_end_vlserver_operation(struct afs_vl_cursor *); + +/* + * vlserver_list.c + */ +static inline struct afs_vlserver *afs_get_vlserver(struct afs_vlserver *vlserver) { - return container_of(inode, struct afs_vnode, vfs_inode); + refcount_inc(&vlserver->ref); + return vlserver; } -static inline struct inode *AFS_VNODE_TO_I(struct afs_vnode *vnode) +static inline struct afs_vlserver_list *afs_get_vlserverlist(struct afs_vlserver_list *vllist) { - return &vnode->vfs_inode; -} - -extern void afs_vnode_finalise_status_update(struct afs_vnode *, - struct afs_server *); -extern int afs_vnode_fetch_status(struct afs_vnode *, struct afs_vnode *, - struct key *); -extern int afs_vnode_fetch_data(struct afs_vnode *, struct key *, - off_t, size_t, struct page *); -extern int afs_vnode_create(struct afs_vnode *, struct key *, const char *, - umode_t, struct afs_fid *, struct afs_file_status *, - struct afs_callback *, struct afs_server **); -extern int afs_vnode_remove(struct afs_vnode *, struct key *, const char *, - bool); -extern int afs_vnode_link(struct afs_vnode *, struct afs_vnode *, struct key *, - const char *); -extern int afs_vnode_symlink(struct afs_vnode *, struct key *, const char *, - const char *, struct afs_fid *, - struct afs_file_status *, struct afs_server **); -extern int afs_vnode_rename(struct afs_vnode *, struct afs_vnode *, - struct key *, const char *, const char *); -extern int afs_vnode_store_data(struct afs_writeback *, pgoff_t, pgoff_t, - unsigned, unsigned); -extern int afs_vnode_setattr(struct afs_vnode *, struct key *, struct iattr *); -extern int afs_vnode_get_volume_status(struct afs_vnode *, struct key *, - struct afs_volume_status *); -extern int afs_vnode_set_lock(struct afs_vnode *, struct key *, - afs_lock_type_t); -extern int afs_vnode_extend_lock(struct afs_vnode *, struct key *); -extern int afs_vnode_release_lock(struct afs_vnode *, struct key *); + if (vllist) + refcount_inc(&vllist->ref); + return vllist; +} + +extern struct afs_vlserver *afs_alloc_vlserver(const char *, size_t, unsigned short); +extern void afs_put_vlserver(struct afs_net *, struct afs_vlserver *); +extern struct afs_vlserver_list *afs_alloc_vlserver_list(unsigned int); +extern void afs_put_vlserverlist(struct afs_net *, struct afs_vlserver_list *); +extern struct afs_vlserver_list *afs_extract_vlserver_list(struct afs_cell *, + const void *, size_t); /* * volume.c */ -#define afs_get_volume(V) do { atomic_inc(&(V)->usage); } while(0) - -extern void afs_put_volume(struct afs_volume *); -extern struct afs_volume *afs_volume_lookup(struct afs_mount_params *); -extern struct afs_server *afs_volume_pick_fileserver(struct afs_vnode *); -extern int afs_volume_release_fileserver(struct afs_vnode *, - struct afs_server *, int); +extern struct afs_volume *afs_create_volume(struct afs_fs_context *); +extern int afs_activate_volume(struct afs_volume *); +extern void afs_deactivate_volume(struct afs_volume *); +bool afs_try_get_volume(struct afs_volume *volume, enum afs_volume_trace reason); +extern struct afs_volume *afs_get_volume(struct afs_volume *, enum afs_volume_trace); +void afs_put_volume(struct afs_volume *volume, enum afs_volume_trace reason); +extern int afs_check_volume_status(struct afs_volume *, struct afs_operation *); /* * write.c */ -extern int afs_set_page_dirty(struct page *); -extern void afs_put_writeback(struct afs_writeback *); -extern int afs_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned flags, - struct page **pagep, void **fsdata); -extern int afs_write_end(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata); -extern int afs_writepage(struct page *, struct writeback_control *); +void afs_prepare_write(struct netfs_io_subrequest *subreq); +void afs_issue_write(struct netfs_io_subrequest *subreq); +void afs_begin_writeback(struct netfs_io_request *wreq); +void afs_retry_request(struct netfs_io_request *wreq, struct netfs_io_stream *stream); extern int afs_writepages(struct address_space *, struct writeback_control *); -extern void afs_pages_written_back(struct afs_vnode *, struct afs_call *); -extern ssize_t afs_file_write(struct kiocb *, const struct iovec *, - unsigned long, loff_t); -extern int afs_writeback_all(struct afs_vnode *); extern int afs_fsync(struct file *, loff_t, loff_t, int); +extern vm_fault_t afs_page_mkwrite(struct vm_fault *vmf); +extern void afs_prune_wb_keys(struct afs_vnode *); + +/* + * xattr.c + */ +extern const struct xattr_handler * const afs_xattr_handlers[]; + +/* + * yfsclient.c + */ +extern void yfs_fs_fetch_data(struct afs_operation *); +extern void yfs_fs_create_file(struct afs_operation *); +extern void yfs_fs_make_dir(struct afs_operation *); +extern void yfs_fs_remove_file2(struct afs_operation *); +extern void yfs_fs_remove_file(struct afs_operation *); +extern void yfs_fs_remove_dir(struct afs_operation *); +extern void yfs_fs_link(struct afs_operation *); +extern void yfs_fs_symlink(struct afs_operation *); +extern void yfs_fs_rename(struct afs_operation *); +void yfs_fs_rename_replace(struct afs_operation *op); +void yfs_fs_rename_noreplace(struct afs_operation *op); +void yfs_fs_rename_exchange(struct afs_operation *op); +extern void yfs_fs_store_data(struct afs_operation *); +extern void yfs_fs_setattr(struct afs_operation *); +extern void yfs_fs_get_volume_status(struct afs_operation *); +extern void yfs_fs_set_lock(struct afs_operation *); +extern void yfs_fs_extend_lock(struct afs_operation *); +extern void yfs_fs_release_lock(struct afs_operation *); +extern void yfs_fs_fetch_status(struct afs_operation *); +extern void yfs_fs_inline_bulk_status(struct afs_operation *); + +struct yfs_acl { + struct afs_acl *acl; /* Dir/file/symlink ACL */ + struct afs_acl *vol_acl; /* Whole volume ACL */ + u32 inherit_flag; /* True if ACL is inherited from parent dir */ + u32 num_cleaned; /* Number of ACEs removed due to subject removal */ + unsigned int flags; +#define YFS_ACL_WANT_ACL 0x01 /* Set if caller wants ->acl */ +#define YFS_ACL_WANT_VOL_ACL 0x02 /* Set if caller wants ->vol_acl */ +}; + +extern void yfs_free_opaque_acl(struct yfs_acl *); +extern void yfs_fs_fetch_opaque_acl(struct afs_operation *); +extern void yfs_fs_store_opaque_acl2(struct afs_operation *); + +/* + * Miscellaneous inline functions. + */ +static inline struct afs_vnode *AFS_FS_I(struct inode *inode) +{ + return container_of(inode, struct afs_vnode, netfs.inode); +} + +static inline struct inode *AFS_VNODE_TO_I(struct afs_vnode *vnode) +{ + return &vnode->netfs.inode; +} + +/* + * Note that a dentry got changed. We need to set d_fsdata to the data version + * number derived from the result of the operation. It doesn't matter if + * d_fsdata goes backwards as we'll just revalidate. + */ +static inline void afs_update_dentry_version(struct afs_operation *op, + struct afs_vnode_param *dir_vp, + struct dentry *dentry) +{ + if (!op->cumul_error.error) + dentry->d_fsdata = + (void *)(unsigned long)dir_vp->scb.status.data_version; +} + +/* + * Set the file size and block count. Estimate the number of 512 bytes blocks + * used, rounded up to nearest 1K for consistency with other AFS clients. + */ +static inline void afs_set_i_size(struct afs_vnode *vnode, u64 size) +{ + i_size_write(&vnode->netfs.inode, size); + vnode->netfs.inode.i_blocks = ((size + 1023) >> 10) << 1; +} + +/* + * Check for a conflicting operation on a directory that we just unlinked from. + * If someone managed to sneak a link or an unlink in on the file we just + * unlinked, we won't be able to trust nlink on an AFS file (but not YFS). + */ +static inline void afs_check_dir_conflict(struct afs_operation *op, + struct afs_vnode_param *dvp) +{ + if (dvp->dv_before + dvp->dv_delta != dvp->scb.status.data_version) + op->flags |= AFS_OPERATION_DIR_CONFLICT; +} +static inline int afs_io_error(struct afs_call *call, enum afs_io_error where) +{ + trace_afs_io_error(call->debug_id, -EIO, where); + return -EIO; +} + +static inline int afs_bad(struct afs_vnode *vnode, enum afs_file_error where) +{ + trace_afs_file_error(vnode, -EIO, where); + return -EIO; +} + +/* + * Set the callback promise on a vnode. + */ +static inline void afs_set_cb_promise(struct afs_vnode *vnode, time64_t expires_at, + enum afs_cb_promise_trace trace) +{ + atomic64_set(&vnode->cb_expires_at, expires_at); + trace_afs_cb_promise(vnode, trace); +} + +/* + * Clear the callback promise on a vnode, returning true if it was promised. + */ +static inline bool afs_clear_cb_promise(struct afs_vnode *vnode, + enum afs_cb_promise_trace trace) +{ + trace_afs_cb_promise(vnode, trace); + return atomic64_xchg(&vnode->cb_expires_at, AFS_NO_CB_PROMISE) != AFS_NO_CB_PROMISE; +} + +/* + * Mark a directory as being invalid. + */ +static inline void afs_invalidate_dir(struct afs_vnode *dvnode, + enum afs_dir_invalid_trace trace) +{ + if (test_and_clear_bit(AFS_VNODE_DIR_VALID, &dvnode->flags)) { + trace_afs_dir_invalid(dvnode, trace); + afs_stat_v(dvnode, n_inval); + } +} /*****************************************************************************/ /* diff --git a/fs/afs/main.c b/fs/afs/main.c index 42dd2e499ed8..e6bb8237db98 100644 --- a/fs/afs/main.c +++ b/fs/afs/main.c @@ -1,12 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* AFS client file system * * Copyright (C) 2002,5 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. */ #include <linux/module.h> @@ -14,6 +10,9 @@ #include <linux/init.h> #include <linux/completion.h> #include <linux/sched.h> +#include <linux/random.h> +#include <linux/proc_fs.h> +#define CREATE_TRACE_POINTS #include "internal.h" MODULE_DESCRIPTION("AFS Client File System"); @@ -29,126 +28,185 @@ static char *rootcell; module_param(rootcell, charp, 0); MODULE_PARM_DESC(rootcell, "root AFS cell name and VL server IP addr list"); -struct afs_uuid afs_uuid; struct workqueue_struct *afs_wq; +static struct proc_dir_entry *afs_proc_symlink; + +#if defined(CONFIG_ALPHA) +const char afs_init_sysname[] = "alpha_linux26"; +#elif defined(CONFIG_X86_64) +const char afs_init_sysname[] = "amd64_linux26"; +#elif defined(CONFIG_ARM) +const char afs_init_sysname[] = "arm_linux26"; +#elif defined(CONFIG_ARM64) +const char afs_init_sysname[] = "aarch64_linux26"; +#elif defined(CONFIG_X86_32) +const char afs_init_sysname[] = "i386_linux26"; +#elif defined(CONFIG_PPC64) +const char afs_init_sysname[] = "ppc64_linux26"; +#elif defined(CONFIG_PPC32) +const char afs_init_sysname[] = "ppc_linux26"; +#elif defined(CONFIG_S390) +#ifdef CONFIG_64BIT +const char afs_init_sysname[] = "s390x_linux26"; +#else +const char afs_init_sysname[] = "s390_linux26"; +#endif +#elif defined(CONFIG_SPARC64) +const char afs_init_sysname[] = "sparc64_linux26"; +#elif defined(CONFIG_SPARC32) +const char afs_init_sysname[] = "sparc_linux26"; +#else +const char afs_init_sysname[] = "unknown_linux26"; +#endif /* - * get a client UUID + * Initialise an AFS network namespace record. */ -static int __init afs_get_client_UUID(void) +static int __net_init afs_net_init(struct net *net_ns) { - struct timespec ts; - u64 uuidtime; - u16 clockseq; + struct afs_sysnames *sysnames; + struct afs_net *net = afs_net(net_ns); int ret; - /* read the MAC address of one of the external interfaces and construct - * a UUID from it */ - ret = afs_get_MAC_address(afs_uuid.node, sizeof(afs_uuid.node)); - if (ret < 0) - return ret; - - getnstimeofday(&ts); - uuidtime = (u64) ts.tv_sec * 1000 * 1000 * 10; - uuidtime += ts.tv_nsec / 100; - uuidtime += AFS_UUID_TO_UNIX_TIME; - afs_uuid.time_low = uuidtime; - afs_uuid.time_mid = uuidtime >> 32; - afs_uuid.time_hi_and_version = (uuidtime >> 48) & AFS_UUID_TIMEHI_MASK; - afs_uuid.time_hi_and_version = AFS_UUID_VERSION_TIME; - - get_random_bytes(&clockseq, 2); - afs_uuid.clock_seq_low = clockseq; - afs_uuid.clock_seq_hi_and_reserved = - (clockseq >> 8) & AFS_UUID_CLOCKHI_MASK; - afs_uuid.clock_seq_hi_and_reserved = AFS_UUID_VARIANT_STD; - - _debug("AFS UUID: %08x-%04x-%04x-%02x%02x-%02x%02x%02x%02x%02x%02x", - afs_uuid.time_low, - afs_uuid.time_mid, - afs_uuid.time_hi_and_version, - afs_uuid.clock_seq_hi_and_reserved, - afs_uuid.clock_seq_low, - afs_uuid.node[0], afs_uuid.node[1], afs_uuid.node[2], - afs_uuid.node[3], afs_uuid.node[4], afs_uuid.node[5]); + net->net = net_ns; + net->live = true; + generate_random_uuid((unsigned char *)&net->uuid); - return 0; -} + INIT_WORK(&net->charge_preallocation_work, afs_charge_preallocation); + INIT_WORK(&net->rx_oob_work, afs_process_oob_queue); + mutex_init(&net->socket_mutex); -/* - * initialise the AFS client FS module - */ -static int __init afs_init(void) -{ - int ret; + net->cells = RB_ROOT; + idr_init(&net->cells_dyn_ino); + init_rwsem(&net->cells_lock); + mutex_init(&net->cells_alias_lock); + mutex_init(&net->proc_cells_lock); + INIT_HLIST_HEAD(&net->proc_cells); - printk(KERN_INFO "kAFS: Red Hat AFS client v0.1 registering.\n"); + seqlock_init(&net->fs_lock); + INIT_LIST_HEAD(&net->fs_probe_fast); + INIT_LIST_HEAD(&net->fs_probe_slow); + INIT_HLIST_HEAD(&net->fs_proc); - ret = afs_get_client_UUID(); - if (ret < 0) - return ret; + INIT_WORK(&net->fs_prober, afs_fs_probe_dispatcher); + timer_setup(&net->fs_probe_timer, afs_fs_probe_timer, 0); + atomic_set(&net->servers_outstanding, 1); - /* create workqueue */ ret = -ENOMEM; - afs_wq = alloc_workqueue("afs", 0, 0); - if (!afs_wq) - return ret; - - /* register the /proc stuff */ - ret = afs_proc_init(); + sysnames = kzalloc(sizeof(*sysnames), GFP_KERNEL); + if (!sysnames) + goto error_sysnames; + sysnames->subs[0] = (char *)&afs_init_sysname; + sysnames->nr = 1; + refcount_set(&sysnames->usage, 1); + net->sysnames = sysnames; + rwlock_init(&net->sysnames_lock); + + /* Register the /proc stuff */ + ret = afs_proc_init(net); if (ret < 0) goto error_proc; -#ifdef CONFIG_AFS_FSCACHE - /* we want to be able to cache */ - ret = fscache_register_netfs(&afs_cache_netfs); - if (ret < 0) - goto error_cache; -#endif - - /* initialise the cell DB */ - ret = afs_cell_init(rootcell); + /* Initialise the cell DB */ + ret = afs_cell_init(net, rootcell); if (ret < 0) goto error_cell_init; - /* initialise the VL update process */ - ret = afs_vlocation_update_init(); + /* Create the RxRPC transport */ + ret = afs_open_socket(net); if (ret < 0) - goto error_vl_update_init; + goto error_open_socket; - /* initialise the callback update process */ - ret = afs_callback_update_init(); - if (ret < 0) - goto error_callback_update_init; + return 0; - /* create the RxRPC transport */ - ret = afs_open_socket(); +error_open_socket: + net->live = false; + afs_fs_probe_cleanup(net); + afs_cell_purge(net); + afs_wait_for_servers(net); +error_cell_init: + net->live = false; + afs_proc_cleanup(net); +error_proc: + afs_put_sysnames(net->sysnames); +error_sysnames: + idr_destroy(&net->cells_dyn_ino); + net->live = false; + return ret; +} + +/* + * Clean up and destroy an AFS network namespace record. + */ +static void __net_exit afs_net_exit(struct net *net_ns) +{ + struct afs_net *net = afs_net(net_ns); + + net->live = false; + afs_fs_probe_cleanup(net); + afs_cell_purge(net); + afs_wait_for_servers(net); + afs_close_socket(net); + afs_proc_cleanup(net); + afs_put_sysnames(net->sysnames); + idr_destroy(&net->cells_dyn_ino); + kfree_rcu(rcu_access_pointer(net->address_prefs), rcu); +} + +static struct pernet_operations afs_net_ops = { + .init = afs_net_init, + .exit = afs_net_exit, + .id = &afs_net_id, + .size = sizeof(struct afs_net), +}; + +/* + * initialise the AFS client FS module + */ +static int __init afs_init(void) +{ + int ret = -ENOMEM; + + printk(KERN_INFO "kAFS: Red Hat AFS client v0.1 registering.\n"); + + afs_wq = alloc_workqueue("afs", WQ_PERCPU, 0); + if (!afs_wq) + goto error_afs_wq; + afs_async_calls = alloc_workqueue("kafsd", WQ_MEM_RECLAIM | WQ_UNBOUND, 0); + if (!afs_async_calls) + goto error_async; + afs_lock_manager = alloc_workqueue("kafs_lockd", WQ_MEM_RECLAIM | WQ_PERCPU, 0); + if (!afs_lock_manager) + goto error_lockmgr; + + ret = register_pernet_device(&afs_net_ops); if (ret < 0) - goto error_open_socket; + goto error_net; /* register the filesystems */ ret = afs_fs_init(); if (ret < 0) goto error_fs; + afs_proc_symlink = proc_symlink("fs/afs", NULL, "../self/net/afs"); + if (!afs_proc_symlink) { + ret = -ENOMEM; + goto error_proc; + } + return ret; -error_fs: - afs_close_socket(); -error_open_socket: - afs_callback_update_kill(); -error_callback_update_init: - afs_vlocation_purge(); -error_vl_update_init: - afs_cell_purge(); -error_cell_init: -#ifdef CONFIG_AFS_FSCACHE - fscache_unregister_netfs(&afs_cache_netfs); -error_cache: -#endif - afs_proc_cleanup(); error_proc: + afs_fs_exit(); +error_fs: + unregister_pernet_device(&afs_net_ops); +error_net: + destroy_workqueue(afs_lock_manager); +error_lockmgr: + destroy_workqueue(afs_async_calls); +error_async: destroy_workqueue(afs_wq); +error_afs_wq: rcu_barrier(); printk(KERN_ERR "kAFS: failed to register: %d\n", ret); return ret; @@ -166,18 +224,13 @@ static void __exit afs_exit(void) { printk(KERN_INFO "kAFS: Red Hat AFS client v0.1 unregistering.\n"); + proc_remove(afs_proc_symlink); afs_fs_exit(); - afs_kill_lock_manager(); - afs_close_socket(); - afs_purge_servers(); - afs_callback_update_kill(); - afs_vlocation_purge(); + unregister_pernet_device(&afs_net_ops); + destroy_workqueue(afs_lock_manager); + destroy_workqueue(afs_async_calls); destroy_workqueue(afs_wq); - afs_cell_purge(); -#ifdef CONFIG_AFS_FSCACHE - fscache_unregister_netfs(&afs_cache_netfs); -#endif - afs_proc_cleanup(); + afs_clean_up_permit_cache(); rcu_barrier(); } diff --git a/fs/afs/misc.c b/fs/afs/misc.c index 0dd4dafee10b..c8a7f266080d 100644 --- a/fs/afs/misc.c +++ b/fs/afs/misc.c @@ -1,20 +1,17 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* miscellaneous bits * * Copyright (C) 2002, 2007 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. */ #include <linux/kernel.h> #include <linux/module.h> #include <linux/errno.h> -#include <rxrpc/packet.h> +#include <crypto/krb5.h> #include "internal.h" #include "afs_fs.h" +#include "protocol_uae.h" /* * convert an AFS abort code to a Linux error number @@ -22,9 +19,12 @@ int afs_abort_to_error(u32 abort_code) { switch (abort_code) { + /* Low errno codes inserted into abort namespace */ case 13: return -EACCES; case 27: return -EFBIG; case 30: return -EROFS; + + /* VICE "special error" codes; 101 - 111 */ case VSALVAGE: return -EIO; case VNOVNODE: return -ENOENT; case VNOVOL: return -ENOMEDIUM; @@ -36,26 +36,60 @@ int afs_abort_to_error(u32 abort_code) case VOVERQUOTA: return -EDQUOT; case VBUSY: return -EBUSY; case VMOVED: return -ENXIO; - case 0x2f6df0a: return -EWOULDBLOCK; - case 0x2f6df0c: return -EACCES; - case 0x2f6df0f: return -EBUSY; - case 0x2f6df10: return -EEXIST; - case 0x2f6df11: return -EXDEV; - case 0x2f6df13: return -ENOTDIR; - case 0x2f6df14: return -EISDIR; - case 0x2f6df15: return -EINVAL; - case 0x2f6df1a: return -EFBIG; - case 0x2f6df1b: return -ENOSPC; - case 0x2f6df1d: return -EROFS; - case 0x2f6df1e: return -EMLINK; - case 0x2f6df20: return -EDOM; - case 0x2f6df21: return -ERANGE; - case 0x2f6df22: return -EDEADLK; - case 0x2f6df23: return -ENAMETOOLONG; - case 0x2f6df24: return -ENOLCK; - case 0x2f6df26: return -ENOTEMPTY; - case 0x2f6df78: return -EDQUOT; + /* Volume Location server errors */ + case AFSVL_IDEXIST: return -EEXIST; + case AFSVL_IO: return -EREMOTEIO; + case AFSVL_NAMEEXIST: return -EEXIST; + case AFSVL_CREATEFAIL: return -EREMOTEIO; + case AFSVL_NOENT: return -ENOMEDIUM; + case AFSVL_EMPTY: return -ENOMEDIUM; + case AFSVL_ENTDELETED: return -ENOMEDIUM; + case AFSVL_BADNAME: return -EINVAL; + case AFSVL_BADINDEX: return -EINVAL; + case AFSVL_BADVOLTYPE: return -EINVAL; + case AFSVL_BADSERVER: return -EINVAL; + case AFSVL_BADPARTITION: return -EINVAL; + case AFSVL_REPSFULL: return -EFBIG; + case AFSVL_NOREPSERVER: return -ENOENT; + case AFSVL_DUPREPSERVER: return -EEXIST; + case AFSVL_RWNOTFOUND: return -ENOENT; + case AFSVL_BADREFCOUNT: return -EINVAL; + case AFSVL_SIZEEXCEEDED: return -EINVAL; + case AFSVL_BADENTRY: return -EINVAL; + case AFSVL_BADVOLIDBUMP: return -EINVAL; + case AFSVL_IDALREADYHASHED: return -EINVAL; + case AFSVL_ENTRYLOCKED: return -EBUSY; + case AFSVL_BADVOLOPER: return -EBADRQC; + case AFSVL_BADRELLOCKTYPE: return -EINVAL; + case AFSVL_RERELEASE: return -EREMOTEIO; + case AFSVL_BADSERVERFLAG: return -EINVAL; + case AFSVL_PERM: return -EACCES; + case AFSVL_NOMEM: return -EREMOTEIO; + + /* Unified AFS error table */ + case UAEPERM: return -EPERM; + case UAENOENT: return -ENOENT; + case UAEAGAIN: return -EAGAIN; + case UAEACCES: return -EACCES; + case UAEBUSY: return -EBUSY; + case UAEEXIST: return -EEXIST; + case UAENOTDIR: return -ENOTDIR; + case UAEISDIR: return -EISDIR; + case UAEFBIG: return -EFBIG; + case UAENOSPC: return -ENOSPC; + case UAEROFS: return -EROFS; + case UAEMLINK: return -EMLINK; + case UAEDEADLK: return -EDEADLK; + case UAENAMETOOLONG: return -ENAMETOOLONG; + case UAENOLCK: return -ENOLCK; + case UAENOTEMPTY: return -ENOTEMPTY; + case UAELOOP: return -ELOOP; + case UAEOVERFLOW: return -EOVERFLOW; + case UAENOMEDIUM: return -ENOMEDIUM; + case UAEDQUOT: return -EDQUOT; + + /* RXKAD abort codes; from include/rxrpc/packet.h. ET "RXK" == 0x1260B00 */ case RXKADINCONSISTENCY: return -EPROTO; case RXKADPACKETSHORT: return -EPROTO; case RXKADLEVELFAIL: return -EKEYREJECTED; @@ -70,6 +104,105 @@ int afs_abort_to_error(u32 abort_code) case RXKADDATALEN: return -EKEYREJECTED; case RXKADILLEGALLEVEL: return -EKEYREJECTED; + case RXGK_INCONSISTENCY: return -EPROTO; + case RXGK_PACKETSHORT: return -EPROTO; + case RXGK_BADCHALLENGE: return -EPROTO; + case RXGK_SEALEDINCON: return -EKEYREJECTED; + case RXGK_NOTAUTH: return -EKEYREJECTED; + case RXGK_EXPIRED: return -EKEYEXPIRED; + case RXGK_BADLEVEL: return -EKEYREJECTED; + case RXGK_BADKEYNO: return -EKEYREJECTED; + case RXGK_NOTRXGK: return -EKEYREJECTED; + case RXGK_UNSUPPORTED: return -EKEYREJECTED; + case RXGK_GSSERROR: return -EKEYREJECTED; +#ifdef RXGK_BADETYPE + case RXGK_BADETYPE: return -ENOPKG; +#endif +#ifdef RXGK_BADTOKEN + case RXGK_BADTOKEN: return -EKEYREJECTED; +#endif +#ifdef RXGK_BADETYPE + case RXGK_DATALEN: return -EPROTO; +#endif +#ifdef RXGK_BADQOP + case RXGK_BADQOP: return -EKEYREJECTED; +#endif + + case KRB5_PROG_KEYTYPE_NOSUPP: return -ENOPKG; + + case RXGEN_OPCODE: return -ENOTSUPP; + case RX_INVALID_OPERATION: return -ENOTSUPP; + default: return -EREMOTEIO; } } + +/* + * Select the error to report from a set of errors. + */ +void afs_prioritise_error(struct afs_error *e, int error, u32 abort_code) +{ + switch (error) { + case 0: + e->aborted = false; + e->error = 0; + return; + default: + if (e->error == -ETIMEDOUT || + e->error == -ETIME) + return; + fallthrough; + case -ETIMEDOUT: + case -ETIME: + if (e->error == -ENOMEM || + e->error == -ENONET) + return; + fallthrough; + case -ENOMEM: + case -ENONET: + if (e->error == -ERFKILL) + return; + fallthrough; + case -ERFKILL: + if (e->error == -EADDRNOTAVAIL) + return; + fallthrough; + case -EADDRNOTAVAIL: + if (e->error == -ENETUNREACH) + return; + fallthrough; + case -ENETUNREACH: + if (e->error == -EHOSTUNREACH) + return; + fallthrough; + case -EHOSTUNREACH: + if (e->error == -EHOSTDOWN) + return; + fallthrough; + case -EHOSTDOWN: + if (e->error == -ECONNREFUSED) + return; + fallthrough; + case -ECONNREFUSED: + if (e->error == -ECONNRESET) + return; + fallthrough; + case -ECONNRESET: /* Responded, but call expired. */ + if (e->responded) + return; + e->error = error; + e->aborted = false; + return; + + case -ECONNABORTED: + e->error = afs_abort_to_error(abort_code); + e->aborted = true; + e->responded = true; + return; + case -ENETRESET: /* Responded, but we seem to have changed address */ + e->aborted = false; + e->responded = true; + e->error = error; + return; + } +} diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c index 9682c33d5daf..57c204a3c04e 100644 --- a/fs/afs/mntpt.c +++ b/fs/afs/mntpt.c @@ -1,12 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* mountpoint management * * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. */ #include <linux/kernel.h> @@ -17,6 +13,7 @@ #include <linux/mount.h> #include <linux/namei.h> #include <linux/gfp.h> +#include <linux/fs_context.h> #include "internal.h" @@ -33,7 +30,7 @@ const struct file_operations afs_mntpt_file_operations = { const struct inode_operations afs_mntpt_inode_operations = { .lookup = afs_mntpt_lookup, - .readlink = page_readlink, + .readlink = afs_readlink, .getattr = afs_getattr, }; @@ -46,58 +43,7 @@ static DECLARE_DELAYED_WORK(afs_mntpt_expiry_timer, afs_mntpt_expiry_timed_out); static unsigned long afs_mntpt_expiry_timeout = 10 * 60; -/* - * check a symbolic link to see whether it actually encodes a mountpoint - * - sets the AFS_VNODE_MOUNTPOINT flag on the vnode appropriately - */ -int afs_mntpt_check_symlink(struct afs_vnode *vnode, struct key *key) -{ - struct page *page; - size_t size; - char *buf; - int ret; - - _enter("{%x:%u,%u}", - vnode->fid.vid, vnode->fid.vnode, vnode->fid.unique); - - /* read the contents of the symlink into the pagecache */ - page = read_cache_page(AFS_VNODE_TO_I(vnode)->i_mapping, 0, - afs_page_filler, key); - if (IS_ERR(page)) { - ret = PTR_ERR(page); - goto out; - } - - ret = -EIO; - if (PageError(page)) - goto out_free; - - buf = kmap(page); - - /* examine the symlink's contents */ - size = vnode->status.size; - _debug("symlink to %*.*s", (int) size, (int) size, buf); - - if (size > 2 && - (buf[0] == '%' || buf[0] == '#') && - buf[size - 1] == '.' - ) { - _debug("symlink is a mountpoint"); - spin_lock(&vnode->lock); - set_bit(AFS_VNODE_MOUNTPOINT, &vnode->flags); - vnode->vfs_inode.i_flags |= S_AUTOMOUNT; - spin_unlock(&vnode->lock); - } - - ret = 0; - - kunmap(page); -out_free: - page_cache_release(page); -out: - _leave(" = %d", ret); - return ret; -} +static const char afs_root_volume[] = "root.cell"; /* * no valid lookup procedure on this sort of dir @@ -106,14 +52,7 @@ static struct dentry *afs_mntpt_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { - _enter("%p,%p{%p{%s},%s}", - dir, - dentry, - dentry->d_parent, - dentry->d_parent ? - dentry->d_parent->d_name.name : (const unsigned char *) "", - dentry->d_name.name); - + _enter("%p,%p{%pd2}", dir, dentry, dentry); return ERR_PTR(-EREMOTE); } @@ -122,117 +61,121 @@ static struct dentry *afs_mntpt_lookup(struct inode *dir, */ static int afs_mntpt_open(struct inode *inode, struct file *file) { - _enter("%p,%p{%p{%s},%s}", - inode, file, - file->f_path.dentry->d_parent, - file->f_path.dentry->d_parent ? - file->f_path.dentry->d_parent->d_name.name : - (const unsigned char *) "", - file->f_path.dentry->d_name.name); - + _enter("%p,%p{%pD2}", inode, file, file); return -EREMOTE; } /* - * create a vfsmount to be automounted + * Set the parameters for the proposed superblock. */ -static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt) +static int afs_mntpt_set_params(struct fs_context *fc, struct dentry *mntpt) { - struct afs_super_info *super; - struct vfsmount *mnt; - struct afs_vnode *vnode; - struct page *page; - char *devname, *options; - bool rwpath = false; + struct afs_fs_context *ctx = fc->fs_private; + struct afs_super_info *src_as = AFS_FS_S(mntpt->d_sb); + struct afs_vnode *vnode = AFS_FS_I(d_inode(mntpt)); + struct afs_cell *cell; + const char *p; int ret; - _enter("{%s}", mntpt->d_name.name); - - BUG_ON(!mntpt->d_inode); - - ret = -ENOMEM; - devname = (char *) get_zeroed_page(GFP_KERNEL); - if (!devname) - goto error_no_devname; - - options = (char *) get_zeroed_page(GFP_KERNEL); - if (!options) - goto error_no_options; + if (fc->net_ns != src_as->net_ns) { + put_net(fc->net_ns); + fc->net_ns = get_net(src_as->net_ns); + } - vnode = AFS_FS_I(mntpt->d_inode); + if (src_as->volume && src_as->volume->type == AFSVL_RWVOL) { + ctx->type = AFSVL_RWVOL; + ctx->force = true; + } + if (ctx->cell) { + afs_unuse_cell(ctx->cell, afs_cell_trace_unuse_mntpt); + ctx->cell = NULL; + } if (test_bit(AFS_VNODE_PSEUDODIR, &vnode->flags)) { /* if the directory is a pseudo directory, use the d_name */ - static const char afs_root_cell[] = ":root.cell."; unsigned size = mntpt->d_name.len; - ret = -ENOENT; - if (size < 2 || size > AFS_MAXCELLNAME) - goto error_no_page; + if (size < 2) + return -ENOENT; + p = mntpt->d_name.name; if (mntpt->d_name.name[0] == '.') { - devname[0] = '#'; - memcpy(devname + 1, mntpt->d_name.name, size - 1); - memcpy(devname + size, afs_root_cell, - sizeof(afs_root_cell)); - rwpath = true; - } else { - devname[0] = '%'; - memcpy(devname + 1, mntpt->d_name.name, size); - memcpy(devname + size + 1, afs_root_cell, - sizeof(afs_root_cell)); + size--; + p++; + ctx->type = AFSVL_RWVOL; + ctx->force = true; + } + if (size > AFS_MAXCELLNAME) + return -ENAMETOOLONG; + + cell = afs_lookup_cell(ctx->net, p, size, NULL, + AFS_LOOKUP_CELL_MOUNTPOINT, + afs_cell_trace_use_lookup_mntpt); + if (IS_ERR(cell)) { + pr_err("kAFS: unable to lookup cell '%pd'\n", mntpt); + return PTR_ERR(cell); } + ctx->cell = cell; + + ctx->volname = afs_root_volume; + ctx->volnamesz = sizeof(afs_root_volume) - 1; } else { /* read the contents of the AFS special symlink */ - loff_t size = i_size_read(mntpt->d_inode); - char *buf; + DEFINE_DELAYED_CALL(cleanup); + const char *content; + loff_t size = i_size_read(d_inode(mntpt)); - ret = -EINVAL; - if (size > PAGE_SIZE - 1) - goto error_no_page; + if (src_as->cell) + ctx->cell = afs_use_cell(src_as->cell, afs_cell_trace_use_mntpt); - page = read_mapping_page(mntpt->d_inode->i_mapping, 0, NULL); - if (IS_ERR(page)) { - ret = PTR_ERR(page); - goto error_no_page; - } + if (size < 2 || size > PAGE_SIZE - 1) + return -EINVAL; - ret = -EIO; - if (PageError(page)) - goto error; + content = afs_get_link(mntpt, d_inode(mntpt), &cleanup); + if (IS_ERR(content)) { + do_delayed_call(&cleanup); + return PTR_ERR(content); + } - buf = kmap_atomic(page); - memcpy(devname, buf, size); - kunmap_atomic(buf); - page_cache_release(page); - page = NULL; + ret = -EINVAL; + if (content[size - 1] == '.') + ret = vfs_parse_fs_qstr(fc, "source", + &QSTR_LEN(content, size - 1)); + do_delayed_call(&cleanup); + if (ret < 0) + return ret; + + /* Don't cross a backup volume mountpoint from a backup volume */ + if (src_as->volume && src_as->volume->type == AFSVL_BACKVOL && + ctx->type == AFSVL_BACKVOL) + return -ENODEV; } - /* work out what options we want */ - super = AFS_FS_S(mntpt->d_sb); - memcpy(options, "cell=", 5); - strcpy(options + 5, super->volume->cell->name); - if (super->volume->type == AFSVL_RWVOL || rwpath) - strcat(options, ",rwpath"); - - /* try and do the mount */ - _debug("--- attempting mount %s -o %s ---", devname, options); - mnt = vfs_kern_mount(&afs_fs_type, 0, devname, options); - _debug("--- mount result %p ---", mnt); - - free_page((unsigned long) devname); - free_page((unsigned long) options); - _leave(" = %p", mnt); - return mnt; + return 0; +} + +/* + * create a vfsmount to be automounted + */ +static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt) +{ + struct fs_context *fc; + struct vfsmount *mnt; + int ret; + + BUG_ON(!d_inode(mntpt)); + + fc = fs_context_for_submount(&afs_fs_type, mntpt); + if (IS_ERR(fc)) + return ERR_CAST(fc); -error: - page_cache_release(page); -error_no_page: - free_page((unsigned long) options); -error_no_options: - free_page((unsigned long) devname); -error_no_devname: - _leave(" = %d", ret); - return ERR_PTR(ret); + ret = afs_mntpt_set_params(fc, mntpt); + if (!ret) + mnt = fc_mount(fc); + else + mnt = ERR_PTR(ret); + + put_fs_context(fc); + return mnt; } /* @@ -242,13 +185,12 @@ struct vfsmount *afs_d_automount(struct path *path) { struct vfsmount *newmnt; - _enter("{%s}", path->dentry->d_name.name); + _enter("{%pd}", path->dentry); newmnt = afs_mntpt_do_automount(path->dentry); if (IS_ERR(newmnt)) return newmnt; - mntget(newmnt); /* prevent immediate expiration */ mnt_set_expiry(newmnt, &afs_vfsmounts); queue_delayed_work(afs_wq, &afs_mntpt_expiry_timer, afs_mntpt_expiry_timeout * HZ); diff --git a/fs/afs/netdevices.c b/fs/afs/netdevices.c deleted file mode 100644 index 7ad36506c256..000000000000 --- a/fs/afs/netdevices.c +++ /dev/null @@ -1,68 +0,0 @@ -/* AFS network device helpers - * - * Copyright (c) 2007 Patrick McHardy <kaber@trash.net> - */ - -#include <linux/string.h> -#include <linux/rtnetlink.h> -#include <linux/inetdevice.h> -#include <linux/netdevice.h> -#include <linux/if_arp.h> -#include <net/net_namespace.h> -#include "internal.h" - -/* - * get a MAC address from a random ethernet interface that has a real one - * - the buffer will normally be 6 bytes in size - */ -int afs_get_MAC_address(u8 *mac, size_t maclen) -{ - struct net_device *dev; - int ret = -ENODEV; - - BUG_ON(maclen != ETH_ALEN); - - rtnl_lock(); - dev = __dev_getfirstbyhwtype(&init_net, ARPHRD_ETHER); - if (dev) { - memcpy(mac, dev->dev_addr, maclen); - ret = 0; - } - rtnl_unlock(); - return ret; -} - -/* - * get a list of this system's interface IPv4 addresses, netmasks and MTUs - * - maxbufs must be at least 1 - * - returns the number of interface records in the buffer - */ -int afs_get_ipv4_interfaces(struct afs_interface *bufs, size_t maxbufs, - bool wantloopback) -{ - struct net_device *dev; - struct in_device *idev; - int n = 0; - - ASSERT(maxbufs > 0); - - rtnl_lock(); - for_each_netdev(&init_net, dev) { - if (dev->type == ARPHRD_LOOPBACK && !wantloopback) - continue; - idev = __in_dev_get_rtnl(dev); - if (!idev) - continue; - for_primary_ifa(idev) { - bufs[n].address.s_addr = ifa->ifa_address; - bufs[n].netmask.s_addr = ifa->ifa_mask; - bufs[n].mtu = dev->mtu; - n++; - if (n >= maxbufs) - goto out; - } endfor_ifa(idev); - } -out: - rtnl_unlock(); - return n; -} diff --git a/fs/afs/proc.c b/fs/afs/proc.c index 526e4bbbde59..44520549b509 100644 --- a/fs/afs/proc.c +++ b/fs/afs/proc.c @@ -1,12 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* /proc interface for AFS * * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. */ #include <linux/slab.h> @@ -14,263 +10,95 @@ #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/sched.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "internal.h" -static struct proc_dir_entry *proc_afs; - - -static int afs_proc_cells_open(struct inode *inode, struct file *file); -static void *afs_proc_cells_start(struct seq_file *p, loff_t *pos); -static void *afs_proc_cells_next(struct seq_file *p, void *v, loff_t *pos); -static void afs_proc_cells_stop(struct seq_file *p, void *v); -static int afs_proc_cells_show(struct seq_file *m, void *v); -static ssize_t afs_proc_cells_write(struct file *file, const char __user *buf, - size_t size, loff_t *_pos); - -static const struct seq_operations afs_proc_cells_ops = { - .start = afs_proc_cells_start, - .next = afs_proc_cells_next, - .stop = afs_proc_cells_stop, - .show = afs_proc_cells_show, -}; - -static const struct file_operations afs_proc_cells_fops = { - .open = afs_proc_cells_open, - .read = seq_read, - .write = afs_proc_cells_write, - .llseek = seq_lseek, - .release = seq_release, - .owner = THIS_MODULE, -}; - -static int afs_proc_rootcell_open(struct inode *inode, struct file *file); -static int afs_proc_rootcell_release(struct inode *inode, struct file *file); -static ssize_t afs_proc_rootcell_read(struct file *file, char __user *buf, - size_t size, loff_t *_pos); -static ssize_t afs_proc_rootcell_write(struct file *file, - const char __user *buf, - size_t size, loff_t *_pos); - -static const struct file_operations afs_proc_rootcell_fops = { - .open = afs_proc_rootcell_open, - .read = afs_proc_rootcell_read, - .write = afs_proc_rootcell_write, - .llseek = no_llseek, - .release = afs_proc_rootcell_release, - .owner = THIS_MODULE, -}; - -static int afs_proc_cell_volumes_open(struct inode *inode, struct file *file); -static int afs_proc_cell_volumes_release(struct inode *inode, - struct file *file); -static void *afs_proc_cell_volumes_start(struct seq_file *p, loff_t *pos); -static void *afs_proc_cell_volumes_next(struct seq_file *p, void *v, - loff_t *pos); -static void afs_proc_cell_volumes_stop(struct seq_file *p, void *v); -static int afs_proc_cell_volumes_show(struct seq_file *m, void *v); - -static const struct seq_operations afs_proc_cell_volumes_ops = { - .start = afs_proc_cell_volumes_start, - .next = afs_proc_cell_volumes_next, - .stop = afs_proc_cell_volumes_stop, - .show = afs_proc_cell_volumes_show, -}; - -static const struct file_operations afs_proc_cell_volumes_fops = { - .open = afs_proc_cell_volumes_open, - .read = seq_read, - .llseek = seq_lseek, - .release = afs_proc_cell_volumes_release, - .owner = THIS_MODULE, -}; - -static int afs_proc_cell_vlservers_open(struct inode *inode, - struct file *file); -static int afs_proc_cell_vlservers_release(struct inode *inode, - struct file *file); -static void *afs_proc_cell_vlservers_start(struct seq_file *p, loff_t *pos); -static void *afs_proc_cell_vlservers_next(struct seq_file *p, void *v, - loff_t *pos); -static void afs_proc_cell_vlservers_stop(struct seq_file *p, void *v); -static int afs_proc_cell_vlservers_show(struct seq_file *m, void *v); - -static const struct seq_operations afs_proc_cell_vlservers_ops = { - .start = afs_proc_cell_vlservers_start, - .next = afs_proc_cell_vlservers_next, - .stop = afs_proc_cell_vlservers_stop, - .show = afs_proc_cell_vlservers_show, -}; - -static const struct file_operations afs_proc_cell_vlservers_fops = { - .open = afs_proc_cell_vlservers_open, - .read = seq_read, - .llseek = seq_lseek, - .release = afs_proc_cell_vlservers_release, - .owner = THIS_MODULE, -}; - -static int afs_proc_cell_servers_open(struct inode *inode, struct file *file); -static int afs_proc_cell_servers_release(struct inode *inode, - struct file *file); -static void *afs_proc_cell_servers_start(struct seq_file *p, loff_t *pos); -static void *afs_proc_cell_servers_next(struct seq_file *p, void *v, - loff_t *pos); -static void afs_proc_cell_servers_stop(struct seq_file *p, void *v); -static int afs_proc_cell_servers_show(struct seq_file *m, void *v); - -static const struct seq_operations afs_proc_cell_servers_ops = { - .start = afs_proc_cell_servers_start, - .next = afs_proc_cell_servers_next, - .stop = afs_proc_cell_servers_stop, - .show = afs_proc_cell_servers_show, -}; - -static const struct file_operations afs_proc_cell_servers_fops = { - .open = afs_proc_cell_servers_open, - .read = seq_read, - .llseek = seq_lseek, - .release = afs_proc_cell_servers_release, - .owner = THIS_MODULE, +struct afs_vl_seq_net_private { + struct seq_net_private seq; /* Must be first */ + struct afs_vlserver_list *vllist; }; -/* - * initialise the /proc/fs/afs/ directory - */ -int afs_proc_init(void) +static inline struct afs_net *afs_seq2net(struct seq_file *m) { - struct proc_dir_entry *p; - - _enter(""); - - proc_afs = proc_mkdir("fs/afs", NULL); - if (!proc_afs) - goto error_dir; - - p = proc_create("cells", 0, proc_afs, &afs_proc_cells_fops); - if (!p) - goto error_cells; - - p = proc_create("rootcell", 0, proc_afs, &afs_proc_rootcell_fops); - if (!p) - goto error_rootcell; - - _leave(" = 0"); - return 0; - -error_rootcell: - remove_proc_entry("cells", proc_afs); -error_cells: - remove_proc_entry("fs/afs", NULL); -error_dir: - _leave(" = -ENOMEM"); - return -ENOMEM; + return afs_net(seq_file_net(m)); } -/* - * clean up the /proc/fs/afs/ directory - */ -void afs_proc_cleanup(void) +static inline struct afs_net *afs_seq2net_single(struct seq_file *m) { - remove_proc_entry("rootcell", proc_afs); - remove_proc_entry("cells", proc_afs); - remove_proc_entry("fs/afs", NULL); + return afs_net(seq_file_single_net(m)); } /* - * open "/proc/fs/afs/cells" which provides a summary of extant cells + * Display the list of cells known to the namespace. */ -static int afs_proc_cells_open(struct inode *inode, struct file *file) +static int afs_proc_cells_show(struct seq_file *m, void *v) { - struct seq_file *m; - int ret; + struct afs_vlserver_list *vllist; + struct afs_cell *cell; - ret = seq_open(file, &afs_proc_cells_ops); - if (ret < 0) - return ret; + if (v == SEQ_START_TOKEN) { + /* display header on line 1 */ + seq_puts(m, "USE ACT TTL SV ST NAME\n"); + return 0; + } - m = file->private_data; - m->private = PDE_DATA(inode); + cell = list_entry(v, struct afs_cell, proc_link); + vllist = rcu_dereference(cell->vl_servers); + /* display one cell per line on subsequent lines */ + seq_printf(m, "%3u %3u %6lld %2u %2u %s\n", + refcount_read(&cell->ref), + atomic_read(&cell->active), + cell->dns_expiry - ktime_get_real_seconds(), + vllist ? vllist->nr_servers : 0, + cell->state, + cell->name); return 0; } -/* - * set up the iterator to start reading from the cells list and return the - * first item - */ static void *afs_proc_cells_start(struct seq_file *m, loff_t *_pos) + __acquires(rcu) { - /* lock the list against modification */ - down_read(&afs_proc_cells_sem); - return seq_list_start_head(&afs_proc_cells, *_pos); + rcu_read_lock(); + return seq_hlist_start_head_rcu(&afs_seq2net(m)->proc_cells, *_pos); } -/* - * move to next cell in cells list - */ -static void *afs_proc_cells_next(struct seq_file *p, void *v, loff_t *pos) +static void *afs_proc_cells_next(struct seq_file *m, void *v, loff_t *pos) { - return seq_list_next(v, &afs_proc_cells, pos); + return seq_hlist_next_rcu(v, &afs_seq2net(m)->proc_cells, pos); } -/* - * clean up after reading from the cells list - */ -static void afs_proc_cells_stop(struct seq_file *p, void *v) +static void afs_proc_cells_stop(struct seq_file *m, void *v) + __releases(rcu) { - up_read(&afs_proc_cells_sem); + rcu_read_unlock(); } -/* - * display a header line followed by a load of cell lines - */ -static int afs_proc_cells_show(struct seq_file *m, void *v) -{ - struct afs_cell *cell = list_entry(v, struct afs_cell, proc_link); - - if (v == &afs_proc_cells) { - /* display header on line 1 */ - seq_puts(m, "USE NAME\n"); - return 0; - } - - /* display one cell per line on subsequent lines */ - seq_printf(m, "%3d %s\n", - atomic_read(&cell->usage), cell->name); - return 0; -} +static const struct seq_operations afs_proc_cells_ops = { + .start = afs_proc_cells_start, + .next = afs_proc_cells_next, + .stop = afs_proc_cells_stop, + .show = afs_proc_cells_show, +}; /* * handle writes to /proc/fs/afs/cells * - to add cells: echo "add <cellname> <IP>[:<IP>][:<IP>]" */ -static ssize_t afs_proc_cells_write(struct file *file, const char __user *buf, - size_t size, loff_t *_pos) +static int afs_proc_cells_write(struct file *file, char *buf, size_t size) { - char *kbuf, *name, *args; + struct seq_file *m = file->private_data; + struct afs_net *net = afs_seq2net(m); + char *name, *args; int ret; - /* start by dragging the command into memory */ - if (size <= 1 || size >= PAGE_SIZE) - return -EINVAL; - - kbuf = kmalloc(size + 1, GFP_KERNEL); - if (!kbuf) - return -ENOMEM; - - ret = -EFAULT; - if (copy_from_user(kbuf, buf, size) != 0) - goto done; - kbuf[size] = 0; - /* trim to first NL */ - name = memchr(kbuf, '\n', size); + name = memchr(buf, '\n', size); if (name) *name = 0; /* split into command, name and argslist */ - name = strchr(kbuf, ' '); + name = strchr(buf, ' '); if (!name) goto inval; do { @@ -280,36 +108,37 @@ static ssize_t afs_proc_cells_write(struct file *file, const char __user *buf, goto inval; args = strchr(name, ' '); - if (!args) - goto inval; - do { - *args++ = 0; - } while(*args == ' '); - if (!*args) - goto inval; + if (args) { + do { + *args++ = 0; + } while(*args == ' '); + if (!*args) + goto inval; + } /* determine command to perform */ - _debug("cmd=%s name=%s args=%s", kbuf, name, args); + _debug("cmd=%s name=%s args=%s", buf, name, args); - if (strcmp(kbuf, "add") == 0) { + if (strcmp(buf, "add") == 0) { struct afs_cell *cell; - cell = afs_cell_create(name, strlen(name), args, false); + cell = afs_lookup_cell(net, name, strlen(name), args, + AFS_LOOKUP_CELL_PRELOAD, + afs_cell_trace_use_lookup_add); if (IS_ERR(cell)) { ret = PTR_ERR(cell); goto done; } - afs_put_cell(cell); - printk("kAFS: Added new cell '%s'\n", name); + if (test_and_set_bit(AFS_CELL_FL_NO_GC, &cell->flags)) + afs_unuse_cell(cell, afs_cell_trace_unuse_no_pin); } else { goto inval; } - ret = size; + ret = 0; done: - kfree(kbuf); _leave(" = %d", ret); return ret; @@ -320,425 +149,638 @@ inval: } /* - * Stubs for /proc/fs/afs/rootcell + * Display the list of addr_prefs known to the namespace. */ -static int afs_proc_rootcell_open(struct inode *inode, struct file *file) -{ - return 0; -} +static int afs_proc_addr_prefs_show(struct seq_file *m, void *v) +{ + struct afs_addr_preference_list *preflist; + struct afs_addr_preference *pref; + struct afs_net *net = afs_seq2net_single(m); + union { + struct sockaddr_in sin; + struct sockaddr_in6 sin6; + } addr; + unsigned int i; + char buf[44]; /* Maximum ipv6 + max subnet is 43 */ + + rcu_read_lock(); + preflist = rcu_dereference(net->address_prefs); + + if (!preflist) { + seq_puts(m, "NO PREFS\n"); + goto out; + } -static int afs_proc_rootcell_release(struct inode *inode, struct file *file) -{ + seq_printf(m, "PROT SUBNET PRIOR (v=%u n=%u/%u/%u)\n", + preflist->version, preflist->ipv6_off, preflist->nr, preflist->max_prefs); + + memset(&addr, 0, sizeof(addr)); + + for (i = 0; i < preflist->nr; i++) { + pref = &preflist->prefs[i]; + + addr.sin.sin_family = pref->family; + if (pref->family == AF_INET) { + memcpy(&addr.sin.sin_addr, &pref->ipv4_addr, + sizeof(addr.sin.sin_addr)); + snprintf(buf, sizeof(buf), "%pISc/%u", &addr.sin, pref->subnet_mask); + seq_printf(m, "UDP %-43.43s %5u\n", buf, pref->prio); + } else { + memcpy(&addr.sin6.sin6_addr, &pref->ipv6_addr, + sizeof(addr.sin6.sin6_addr)); + snprintf(buf, sizeof(buf), "%pISc/%u", &addr.sin6, pref->subnet_mask); + seq_printf(m, "UDP %-43.43s %5u\n", buf, pref->prio); + } + } + +out: + rcu_read_unlock(); return 0; } -static ssize_t afs_proc_rootcell_read(struct file *file, char __user *buf, - size_t size, loff_t *_pos) +/* + * Display the name of the current workstation cell. + */ +static int afs_proc_rootcell_show(struct seq_file *m, void *v) { + struct afs_cell *cell; + struct afs_net *net; + + net = afs_seq2net_single(m); + down_read(&net->cells_lock); + cell = rcu_dereference_protected(net->ws_cell, lockdep_is_held(&net->cells_lock)); + if (cell) + seq_printf(m, "%s\n", cell->name); + up_read(&net->cells_lock); return 0; } /* - * handle writes to /proc/fs/afs/rootcell - * - to initialize rootcell: echo "cell.name:192.168.231.14" + * Set the current workstation cell and optionally supply its list of volume + * location servers. + * + * echo "cell.name:192.168.231.14" >/proc/fs/afs/rootcell */ -static ssize_t afs_proc_rootcell_write(struct file *file, - const char __user *buf, - size_t size, loff_t *_pos) +static int afs_proc_rootcell_write(struct file *file, char *buf, size_t size) { - char *kbuf, *s; + struct seq_file *m = file->private_data; + struct afs_net *net = afs_seq2net_single(m); + char *s; int ret; - /* start by dragging the command into memory */ - if (size <= 1 || size >= PAGE_SIZE) - return -EINVAL; - - ret = -ENOMEM; - kbuf = kmalloc(size + 1, GFP_KERNEL); - if (!kbuf) - goto nomem; - - ret = -EFAULT; - if (copy_from_user(kbuf, buf, size) != 0) - goto infault; - kbuf[size] = 0; + ret = -EINVAL; + if (buf[0] == '.') + goto out; + if (memchr(buf, '/', size)) + goto out; /* trim to first NL */ - s = memchr(kbuf, '\n', size); + s = memchr(buf, '\n', size); if (s) *s = 0; /* determine command to perform */ - _debug("rootcell=%s", kbuf); + _debug("rootcell=%s", buf); - ret = afs_cell_init(kbuf); - if (ret >= 0) - ret = size; /* consume everything, always */ + ret = -EEXIST; + inode_lock(file_inode(file)); + if (!rcu_access_pointer(net->ws_cell)) + ret = afs_cell_init(net, buf); + else + printk("busy\n"); + inode_unlock(file_inode(file)); -infault: - kfree(kbuf); -nomem: +out: _leave(" = %d", ret); return ret; } +static const char afs_vol_types[3][3] = { + [AFSVL_RWVOL] = "RW", + [AFSVL_ROVOL] = "RO", + [AFSVL_BACKVOL] = "BK", +}; + /* - * initialise /proc/fs/afs/<cell>/ + * Display the list of volumes known to a cell. */ -int afs_proc_cell_setup(struct afs_cell *cell) +static int afs_proc_cell_volumes_show(struct seq_file *m, void *v) { - struct proc_dir_entry *p; - - _enter("%p{%s}", cell, cell->name); - - cell->proc_dir = proc_mkdir(cell->name, proc_afs); - if (!cell->proc_dir) - goto error_dir; + struct afs_volume *vol = hlist_entry(v, struct afs_volume, proc_link); - p = proc_create_data("servers", 0, cell->proc_dir, - &afs_proc_cell_servers_fops, cell); - if (!p) - goto error_servers; - - p = proc_create_data("vlservers", 0, cell->proc_dir, - &afs_proc_cell_vlservers_fops, cell); - if (!p) - goto error_vlservers; + /* Display header on line 1 */ + if (v == SEQ_START_TOKEN) { + seq_puts(m, "USE VID TY NAME\n"); + return 0; + } - p = proc_create_data("volumes", 0, cell->proc_dir, - &afs_proc_cell_volumes_fops, cell); - if (!p) - goto error_volumes; + seq_printf(m, "%3d %08llx %s %s\n", + refcount_read(&vol->ref), vol->vid, + afs_vol_types[vol->type], + vol->name); - _leave(" = 0"); return 0; - -error_volumes: - remove_proc_entry("vlservers", cell->proc_dir); -error_vlservers: - remove_proc_entry("servers", cell->proc_dir); -error_servers: - remove_proc_entry(cell->name, proc_afs); -error_dir: - _leave(" = -ENOMEM"); - return -ENOMEM; } -/* - * remove /proc/fs/afs/<cell>/ - */ -void afs_proc_cell_remove(struct afs_cell *cell) +static void *afs_proc_cell_volumes_start(struct seq_file *m, loff_t *_pos) + __acquires(cell->proc_lock) { - _enter(""); + struct afs_cell *cell = pde_data(file_inode(m->file)); - remove_proc_entry("volumes", cell->proc_dir); - remove_proc_entry("vlservers", cell->proc_dir); - remove_proc_entry("servers", cell->proc_dir); - remove_proc_entry(cell->name, proc_afs); - - _leave(""); + rcu_read_lock(); + return seq_hlist_start_head_rcu(&cell->proc_volumes, *_pos); } -/* - * open "/proc/fs/afs/<cell>/volumes" which provides a summary of extant cells - */ -static int afs_proc_cell_volumes_open(struct inode *inode, struct file *file) +static void *afs_proc_cell_volumes_next(struct seq_file *m, void *v, + loff_t *_pos) { - struct afs_cell *cell; - struct seq_file *m; - int ret; + struct afs_cell *cell = pde_data(file_inode(m->file)); - cell = PDE_DATA(inode); - if (!cell) - return -ENOENT; + return seq_hlist_next_rcu(v, &cell->proc_volumes, _pos); +} - ret = seq_open(file, &afs_proc_cell_volumes_ops); - if (ret < 0) - return ret; +static void afs_proc_cell_volumes_stop(struct seq_file *m, void *v) + __releases(cell->proc_lock) +{ + rcu_read_unlock(); +} + +static const struct seq_operations afs_proc_cell_volumes_ops = { + .start = afs_proc_cell_volumes_start, + .next = afs_proc_cell_volumes_next, + .stop = afs_proc_cell_volumes_stop, + .show = afs_proc_cell_volumes_show, +}; - m = file->private_data; - m->private = cell; +static const char *const dns_record_sources[NR__dns_record_source + 1] = { + [DNS_RECORD_UNAVAILABLE] = "unav", + [DNS_RECORD_FROM_CONFIG] = "cfg", + [DNS_RECORD_FROM_DNS_A] = "A", + [DNS_RECORD_FROM_DNS_AFSDB] = "AFSDB", + [DNS_RECORD_FROM_DNS_SRV] = "SRV", + [DNS_RECORD_FROM_NSS] = "nss", + [NR__dns_record_source] = "[weird]" +}; - return 0; -} +static const char *const dns_lookup_statuses[NR__dns_lookup_status + 1] = { + [DNS_LOOKUP_NOT_DONE] = "no-lookup", + [DNS_LOOKUP_GOOD] = "good", + [DNS_LOOKUP_GOOD_WITH_BAD] = "good/bad", + [DNS_LOOKUP_BAD] = "bad", + [DNS_LOOKUP_GOT_NOT_FOUND] = "not-found", + [DNS_LOOKUP_GOT_LOCAL_FAILURE] = "local-failure", + [DNS_LOOKUP_GOT_TEMP_FAILURE] = "temp-failure", + [DNS_LOOKUP_GOT_NS_FAILURE] = "ns-failure", + [NR__dns_lookup_status] = "[weird]" +}; /* - * close the file and release the ref to the cell + * Display the list of Volume Location servers we're using for a cell. */ -static int afs_proc_cell_volumes_release(struct inode *inode, struct file *file) +static int afs_proc_cell_vlservers_show(struct seq_file *m, void *v) { - return seq_release(inode, file); + const struct afs_vl_seq_net_private *priv = m->private; + const struct afs_vlserver_list *vllist = priv->vllist; + const struct afs_vlserver_entry *entry; + const struct afs_vlserver *vlserver; + const struct afs_addr_list *alist; + int i; + + if (v == SEQ_START_TOKEN) { + seq_printf(m, "# source %s, status %s\n", + dns_record_sources[vllist ? vllist->source : 0], + dns_lookup_statuses[vllist ? vllist->status : 0]); + return 0; + } + + entry = v; + vlserver = entry->server; + alist = rcu_dereference(vlserver->addresses); + + seq_printf(m, "%s [p=%hu w=%hu s=%s,%s]:\n", + vlserver->name, entry->priority, entry->weight, + dns_record_sources[alist ? alist->source : entry->source], + dns_lookup_statuses[alist ? alist->status : entry->status]); + if (alist) { + for (i = 0; i < alist->nr_addrs; i++) + seq_printf(m, " %c %pISpc\n", + alist->preferred == i ? '>' : '-', + rxrpc_kernel_remote_addr(alist->addrs[i].peer)); + } + seq_printf(m, " info: fl=%lx rtt=%d\n", vlserver->flags, vlserver->rtt); + seq_printf(m, " probe: fl=%x e=%d ac=%d out=%d\n", + vlserver->probe.flags, vlserver->probe.error, + vlserver->probe.abort_code, + atomic_read(&vlserver->probe_outstanding)); + return 0; } -/* - * set up the iterator to start reading from the cells list and return the - * first item - */ -static void *afs_proc_cell_volumes_start(struct seq_file *m, loff_t *_pos) +static void *afs_proc_cell_vlservers_start(struct seq_file *m, loff_t *_pos) + __acquires(rcu) { - struct afs_cell *cell = m->private; + struct afs_vl_seq_net_private *priv = m->private; + struct afs_vlserver_list *vllist; + struct afs_cell *cell = pde_data(file_inode(m->file)); + loff_t pos = *_pos; + + rcu_read_lock(); + + vllist = rcu_dereference(cell->vl_servers); + priv->vllist = vllist; - _enter("cell=%p pos=%Ld", cell, *_pos); + if (pos < 0) + *_pos = pos = 0; + if (pos == 0) + return SEQ_START_TOKEN; + + if (pos - 1 >= vllist->nr_servers) + return NULL; - /* lock the list against modification */ - down_read(&cell->vl_sem); - return seq_list_start_head(&cell->vl_list, *_pos); + return &vllist->servers[pos - 1]; } -/* - * move to next cell in cells list - */ -static void *afs_proc_cell_volumes_next(struct seq_file *p, void *v, - loff_t *_pos) +static void *afs_proc_cell_vlservers_next(struct seq_file *m, void *v, + loff_t *_pos) { - struct afs_cell *cell = p->private; + struct afs_vl_seq_net_private *priv = m->private; + struct afs_vlserver_list *vllist = priv->vllist; + loff_t pos; - _enter("cell=%p pos=%Ld", cell, *_pos); - return seq_list_next(v, &cell->vl_list, _pos); + pos = *_pos; + pos++; + *_pos = pos; + if (!vllist || pos - 1 >= vllist->nr_servers) + return NULL; + + return &vllist->servers[pos - 1]; } -/* - * clean up after reading from the cells list - */ -static void afs_proc_cell_volumes_stop(struct seq_file *p, void *v) +static void afs_proc_cell_vlservers_stop(struct seq_file *m, void *v) + __releases(rcu) { - struct afs_cell *cell = p->private; - - up_read(&cell->vl_sem); + rcu_read_unlock(); } -static const char afs_vlocation_states[][4] = { - [AFS_VL_NEW] = "New", - [AFS_VL_CREATING] = "Crt", - [AFS_VL_VALID] = "Val", - [AFS_VL_NO_VOLUME] = "NoV", - [AFS_VL_UPDATING] = "Upd", - [AFS_VL_VOLUME_DELETED] = "Del", - [AFS_VL_UNCERTAIN] = "Unc", +static const struct seq_operations afs_proc_cell_vlservers_ops = { + .start = afs_proc_cell_vlservers_start, + .next = afs_proc_cell_vlservers_next, + .stop = afs_proc_cell_vlservers_stop, + .show = afs_proc_cell_vlservers_show, }; /* - * display a header line followed by a load of volume lines + * Display the list of fileservers we're using within a namespace. */ -static int afs_proc_cell_volumes_show(struct seq_file *m, void *v) +static int afs_proc_servers_show(struct seq_file *m, void *v) { - struct afs_cell *cell = m->private; - struct afs_vlocation *vlocation = - list_entry(v, struct afs_vlocation, link); + struct afs_endpoint_state *estate; + struct afs_addr_list *alist; + struct afs_server *server; + unsigned long failed; + int i; - /* display header on line 1 */ - if (v == &cell->vl_list) { - seq_puts(m, "USE STT VLID[0] VLID[1] VLID[2] NAME\n"); + if (v == SEQ_START_TOKEN) { + seq_puts(m, "UUID REF ACT CELL\n"); return 0; } - /* display one cell per line on subsequent lines */ - seq_printf(m, "%3d %s %08x %08x %08x %s\n", - atomic_read(&vlocation->usage), - afs_vlocation_states[vlocation->state], - vlocation->vldb.vid[0], - vlocation->vldb.vid[1], - vlocation->vldb.vid[2], - vlocation->vldb.name); + server = list_entry(v, struct afs_server, proc_link); + seq_printf(m, "%pU %3d %3d %s\n", + &server->uuid, + refcount_read(&server->ref), + atomic_read(&server->active), + server->cell->name); + seq_printf(m, " - info: fl=%lx rtt=%u\n", + server->flags, server->rtt); + seq_printf(m, " - probe: last=%d\n", + (int)(jiffies - server->probed_at) / HZ); + + estate = rcu_dereference(server->endpoint_state); + if (!estate) + goto out; + failed = estate->failed_set; + seq_printf(m, " - ESTATE pq=%x np=%u rsp=%lx f=%lx\n", + estate->probe_seq, atomic_read(&estate->nr_probing), + estate->responsive_set, estate->failed_set); + + alist = estate->addresses; + seq_printf(m, " - ALIST v=%u ap=%u\n", + alist->version, alist->addr_pref_version); + for (i = 0; i < alist->nr_addrs; i++) { + const struct afs_address *addr = &alist->addrs[i]; + + seq_printf(m, " [%x] %pISpc%s rtt=%d err=%d p=%u\n", + i, rxrpc_kernel_remote_addr(addr->peer), + alist->preferred == i ? "*" : + test_bit(i, &failed) ? "!" : "", + rxrpc_kernel_get_srtt(addr->peer), + addr->last_error, addr->prio); + } +out: return 0; } -/* - * open "/proc/fs/afs/<cell>/vlservers" which provides a list of volume - * location server - */ -static int afs_proc_cell_vlservers_open(struct inode *inode, struct file *file) +static void *afs_proc_servers_start(struct seq_file *m, loff_t *_pos) + __acquires(rcu) { - struct afs_cell *cell; - struct seq_file *m; - int ret; - - cell = PDE_DATA(inode); - if (!cell) - return -ENOENT; - - ret = seq_open(file, &afs_proc_cell_vlservers_ops); - if (ret<0) - return ret; - - m = file->private_data; - m->private = cell; + rcu_read_lock(); + return seq_hlist_start_head_rcu(&afs_seq2net(m)->fs_proc, *_pos); +} - return 0; +static void *afs_proc_servers_next(struct seq_file *m, void *v, loff_t *_pos) +{ + return seq_hlist_next_rcu(v, &afs_seq2net(m)->fs_proc, _pos); } -/* - * close the file and release the ref to the cell - */ -static int afs_proc_cell_vlservers_release(struct inode *inode, - struct file *file) +static void afs_proc_servers_stop(struct seq_file *m, void *v) + __releases(rcu) { - return seq_release(inode, file); + rcu_read_unlock(); } +static const struct seq_operations afs_proc_servers_ops = { + .start = afs_proc_servers_start, + .next = afs_proc_servers_next, + .stop = afs_proc_servers_stop, + .show = afs_proc_servers_show, +}; + /* - * set up the iterator to start reading from the cells list and return the - * first item + * Display the list of strings that may be substituted for the @sys pathname + * macro. */ -static void *afs_proc_cell_vlservers_start(struct seq_file *m, loff_t *_pos) +static int afs_proc_sysname_show(struct seq_file *m, void *v) { - struct afs_cell *cell = m->private; - loff_t pos = *_pos; + struct afs_net *net = afs_seq2net(m); + struct afs_sysnames *sysnames = net->sysnames; + unsigned int i = (unsigned long)v - 1; - _enter("cell=%p pos=%Ld", cell, *_pos); + if (i < sysnames->nr) + seq_printf(m, "%s\n", sysnames->subs[i]); + return 0; +} - /* lock the list against modification */ - down_read(&cell->vl_sem); +static void *afs_proc_sysname_start(struct seq_file *m, loff_t *pos) + __acquires(&net->sysnames_lock) +{ + struct afs_net *net = afs_seq2net(m); + struct afs_sysnames *names; - /* allow for the header line */ - if (!pos) - return (void *) 1; - pos--; + read_lock(&net->sysnames_lock); - if (pos >= cell->vl_naddrs) + names = net->sysnames; + if (*pos >= names->nr) return NULL; - - return &cell->vl_addrs[pos]; + return (void *)(unsigned long)(*pos + 1); } -/* - * move to next cell in cells list - */ -static void *afs_proc_cell_vlservers_next(struct seq_file *p, void *v, - loff_t *_pos) +static void *afs_proc_sysname_next(struct seq_file *m, void *v, loff_t *pos) { - struct afs_cell *cell = p->private; - loff_t pos; + struct afs_net *net = afs_seq2net(m); + struct afs_sysnames *names = net->sysnames; - _enter("cell=%p{nad=%u} pos=%Ld", cell, cell->vl_naddrs, *_pos); - - pos = *_pos; - (*_pos)++; - if (pos >= cell->vl_naddrs) + *pos += 1; + if (*pos >= names->nr) return NULL; - - return &cell->vl_addrs[pos]; + return (void *)(unsigned long)(*pos + 1); } -/* - * clean up after reading from the cells list - */ -static void afs_proc_cell_vlservers_stop(struct seq_file *p, void *v) +static void afs_proc_sysname_stop(struct seq_file *m, void *v) + __releases(&net->sysnames_lock) { - struct afs_cell *cell = p->private; + struct afs_net *net = afs_seq2net(m); - up_read(&cell->vl_sem); + read_unlock(&net->sysnames_lock); } +static const struct seq_operations afs_proc_sysname_ops = { + .start = afs_proc_sysname_start, + .next = afs_proc_sysname_next, + .stop = afs_proc_sysname_stop, + .show = afs_proc_sysname_show, +}; + /* - * display a header line followed by a load of volume lines + * Allow the @sys substitution to be configured. */ -static int afs_proc_cell_vlservers_show(struct seq_file *m, void *v) +static int afs_proc_sysname_write(struct file *file, char *buf, size_t size) { - struct in_addr *addr = v; + struct afs_sysnames *sysnames, *kill; + struct seq_file *m = file->private_data; + struct afs_net *net = afs_seq2net(m); + char *s, *p, *sub; + int ret, len; - /* display header on line 1 */ - if (v == (struct in_addr *) 1) { - seq_puts(m, "ADDRESS\n"); - return 0; + sysnames = kzalloc(sizeof(*sysnames), GFP_KERNEL); + if (!sysnames) + return -ENOMEM; + refcount_set(&sysnames->usage, 1); + kill = sysnames; + + p = buf; + while ((s = strsep(&p, " \t\n"))) { + len = strlen(s); + if (len == 0) + continue; + ret = -ENAMETOOLONG; + if (len >= AFSNAMEMAX) + goto error; + + if (len >= 4 && + s[len - 4] == '@' && + s[len - 3] == 's' && + s[len - 2] == 'y' && + s[len - 1] == 's') + /* Protect against recursion */ + goto invalid; + + if (s[0] == '.' && + (len < 2 || (len == 2 && s[1] == '.'))) + goto invalid; + + if (memchr(s, '/', len)) + goto invalid; + + ret = -EFBIG; + if (sysnames->nr >= AFS_NR_SYSNAME) + goto out; + + if (strcmp(s, afs_init_sysname) == 0) { + sub = (char *)afs_init_sysname; + } else { + ret = -ENOMEM; + sub = kmemdup(s, len + 1, GFP_KERNEL); + if (!sub) + goto out; + } + + sysnames->subs[sysnames->nr] = sub; + sysnames->nr++; } - /* display one cell per line on subsequent lines */ - seq_printf(m, "%pI4\n", &addr->s_addr); - return 0; + if (sysnames->nr == 0) { + sysnames->subs[0] = sysnames->blank; + sysnames->nr++; + } + + write_lock(&net->sysnames_lock); + kill = net->sysnames; + net->sysnames = sysnames; + write_unlock(&net->sysnames_lock); + ret = 0; +out: + afs_put_sysnames(kill); + return ret; + +invalid: + ret = -EINVAL; +error: + goto out; +} + +void afs_put_sysnames(struct afs_sysnames *sysnames) +{ + int i; + + if (sysnames && refcount_dec_and_test(&sysnames->usage)) { + for (i = 0; i < sysnames->nr; i++) + if (sysnames->subs[i] != afs_init_sysname && + sysnames->subs[i] != sysnames->blank) + kfree(sysnames->subs[i]); + kfree(sysnames); + } } /* - * open "/proc/fs/afs/<cell>/servers" which provides a summary of active - * servers + * Display general per-net namespace statistics */ -static int afs_proc_cell_servers_open(struct inode *inode, struct file *file) +static int afs_proc_stats_show(struct seq_file *m, void *v) { - struct afs_cell *cell; - struct seq_file *m; - int ret; + struct afs_net *net = afs_seq2net_single(m); + + seq_puts(m, "kAFS statistics\n"); - cell = PDE_DATA(inode); - if (!cell) - return -ENOENT; + seq_printf(m, "dir-mgmt: look=%u reval=%u inval=%u relpg=%u\n", + atomic_read(&net->n_lookup), + atomic_read(&net->n_reval), + atomic_read(&net->n_inval), + atomic_read(&net->n_relpg)); - ret = seq_open(file, &afs_proc_cell_servers_ops); - if (ret < 0) - return ret; + seq_printf(m, "dir-data: rdpg=%u\n", + atomic_read(&net->n_read_dir)); - m = file->private_data; - m->private = cell; + seq_printf(m, "dir-edit: cr=%u rm=%u\n", + atomic_read(&net->n_dir_cr), + atomic_read(&net->n_dir_rm)); + + seq_printf(m, "file-rd : n=%u nb=%lu\n", + atomic_read(&net->n_fetches), + atomic_long_read(&net->n_fetch_bytes)); + seq_printf(m, "file-wr : n=%u nb=%lu\n", + atomic_read(&net->n_stores), + atomic_long_read(&net->n_store_bytes)); return 0; } /* - * close the file and release the ref to the cell + * initialise /proc/fs/afs/<cell>/ */ -static int afs_proc_cell_servers_release(struct inode *inode, - struct file *file) +int afs_proc_cell_setup(struct afs_cell *cell) { - return seq_release(inode, file); -} + struct proc_dir_entry *dir; + struct afs_net *net = cell->net; -/* - * set up the iterator to start reading from the cells list and return the - * first item - */ -static void *afs_proc_cell_servers_start(struct seq_file *m, loff_t *_pos) - __acquires(m->private->servers_lock) -{ - struct afs_cell *cell = m->private; + _enter("%p{%s},%p", cell, cell->name, net->proc_afs); - _enter("cell=%p pos=%Ld", cell, *_pos); + dir = proc_net_mkdir(net->net, cell->name, net->proc_afs); + if (!dir) + goto error_dir; - /* lock the list against modification */ - read_lock(&cell->servers_lock); - return seq_list_start_head(&cell->servers, *_pos); -} + if (!proc_create_net_data("vlservers", 0444, dir, + &afs_proc_cell_vlservers_ops, + sizeof(struct afs_vl_seq_net_private), + cell) || + !proc_create_net_data("volumes", 0444, dir, + &afs_proc_cell_volumes_ops, + sizeof(struct seq_net_private), + cell)) + goto error_tree; -/* - * move to next cell in cells list - */ -static void *afs_proc_cell_servers_next(struct seq_file *p, void *v, - loff_t *_pos) -{ - struct afs_cell *cell = p->private; + _leave(" = 0"); + return 0; - _enter("cell=%p pos=%Ld", cell, *_pos); - return seq_list_next(v, &cell->servers, _pos); +error_tree: + remove_proc_subtree(cell->name, net->proc_afs); +error_dir: + _leave(" = -ENOMEM"); + return -ENOMEM; } /* - * clean up after reading from the cells list + * remove /proc/fs/afs/<cell>/ */ -static void afs_proc_cell_servers_stop(struct seq_file *p, void *v) - __releases(p->private->servers_lock) +void afs_proc_cell_remove(struct afs_cell *cell) { - struct afs_cell *cell = p->private; + struct afs_net *net = cell->net; - read_unlock(&cell->servers_lock); + _enter(""); + remove_proc_subtree(cell->name, net->proc_afs); + _leave(""); } /* - * display a header line followed by a load of volume lines + * initialise the /proc/fs/afs/ directory */ -static int afs_proc_cell_servers_show(struct seq_file *m, void *v) +int afs_proc_init(struct afs_net *net) { - struct afs_cell *cell = m->private; - struct afs_server *server = list_entry(v, struct afs_server, link); - char ipaddr[20]; + struct proc_dir_entry *p; - /* display header on line 1 */ - if (v == &cell->servers) { - seq_puts(m, "USE ADDR STATE\n"); - return 0; - } + _enter(""); - /* display one cell per line on subsequent lines */ - sprintf(ipaddr, "%pI4", &server->addr); - seq_printf(m, "%3d %-15.15s %5d\n", - atomic_read(&server->usage), ipaddr, server->fs_state); + p = proc_net_mkdir(net->net, "afs", net->net->proc_net); + if (!p) + goto error_dir; + if (!proc_create_net_data_write("cells", 0644, p, + &afs_proc_cells_ops, + afs_proc_cells_write, + sizeof(struct seq_net_private), + NULL) || + !proc_create_net_single_write("rootcell", 0644, p, + afs_proc_rootcell_show, + afs_proc_rootcell_write, + NULL) || + !proc_create_net("servers", 0444, p, &afs_proc_servers_ops, + sizeof(struct seq_net_private)) || + !proc_create_net_single("stats", 0444, p, afs_proc_stats_show, NULL) || + !proc_create_net_data_write("sysname", 0644, p, + &afs_proc_sysname_ops, + afs_proc_sysname_write, + sizeof(struct seq_net_private), + NULL) || + !proc_create_net_single_write("addr_prefs", 0644, p, + afs_proc_addr_prefs_show, + afs_proc_addr_prefs_write, + NULL)) + goto error_tree; + + net->proc_afs = p; + _leave(" = 0"); return 0; + +error_tree: + proc_remove(p); +error_dir: + _leave(" = -ENOMEM"); + return -ENOMEM; +} + +/* + * clean up the /proc/fs/afs/ directory + */ +void afs_proc_cleanup(struct afs_net *net) +{ + proc_remove(net->proc_afs); + net->proc_afs = NULL; } diff --git a/fs/afs/protocol_afs.h b/fs/afs/protocol_afs.h new file mode 100644 index 000000000000..0c39358c8b70 --- /dev/null +++ b/fs/afs/protocol_afs.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* AFS protocol bits + * + * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + + +#define AFSCAPABILITIESMAX 196 /* Maximum number of words in a capability set */ + +/* AFS3 Fileserver capabilities word 0 */ +#define AFS3_VICED_CAPABILITY_ERRORTRANS 0x0001 /* Uses UAE errors */ +#define AFS3_VICED_CAPABILITY_64BITFILES 0x0002 /* FetchData64 & StoreData64 supported */ +#define AFS3_VICED_CAPABILITY_WRITELOCKACL 0x0004 /* Can lock a file even without lock perm */ +#define AFS3_VICED_CAPABILITY_SANEACLS 0x0008 /* ACLs reviewed for sanity - don't use */ diff --git a/fs/afs/protocol_uae.h b/fs/afs/protocol_uae.h new file mode 100644 index 000000000000..1b3d1060bd34 --- /dev/null +++ b/fs/afs/protocol_uae.h @@ -0,0 +1,132 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Universal AFS Error codes (UAE). + * + * Copyright (C) 2003, Daria Phoebe Brashear + * Copyright (C) 2018 Red Hat, Inc. All Rights Reserved. + */ + +enum { + UAEPERM = 0x2f6df00, /* Operation not permitted */ + UAENOENT = 0x2f6df01, /* No such file or directory */ + UAESRCH = 0x2f6df02, /* No such process */ + UAEINTR = 0x2f6df03, /* Interrupted system call */ + UAEIO = 0x2f6df04, /* I/O error */ + UAENXIO = 0x2f6df05, /* No such device or address */ + UAE2BIG = 0x2f6df06, /* Arg list too long */ + UAENOEXEC = 0x2f6df07, /* Exec format error */ + UAEBADF = 0x2f6df08, /* Bad file number */ + UAECHILD = 0x2f6df09, /* No child processes */ + UAEAGAIN = 0x2f6df0a, /* Try again */ + UAENOMEM = 0x2f6df0b, /* Out of memory */ + UAEACCES = 0x2f6df0c, /* Permission denied */ + UAEFAULT = 0x2f6df0d, /* Bad address */ + UAENOTBLK = 0x2f6df0e, /* Block device required */ + UAEBUSY = 0x2f6df0f, /* Device or resource busy */ + UAEEXIST = 0x2f6df10, /* File exists */ + UAEXDEV = 0x2f6df11, /* Cross-device link */ + UAENODEV = 0x2f6df12, /* No such device */ + UAENOTDIR = 0x2f6df13, /* Not a directory */ + UAEISDIR = 0x2f6df14, /* Is a directory */ + UAEINVAL = 0x2f6df15, /* Invalid argument */ + UAENFILE = 0x2f6df16, /* File table overflow */ + UAEMFILE = 0x2f6df17, /* Too many open files */ + UAENOTTY = 0x2f6df18, /* Not a typewriter */ + UAETXTBSY = 0x2f6df19, /* Text file busy */ + UAEFBIG = 0x2f6df1a, /* File too large */ + UAENOSPC = 0x2f6df1b, /* No space left on device */ + UAESPIPE = 0x2f6df1c, /* Illegal seek */ + UAEROFS = 0x2f6df1d, /* Read-only file system */ + UAEMLINK = 0x2f6df1e, /* Too many links */ + UAEPIPE = 0x2f6df1f, /* Broken pipe */ + UAEDOM = 0x2f6df20, /* Math argument out of domain of func */ + UAERANGE = 0x2f6df21, /* Math result not representable */ + UAEDEADLK = 0x2f6df22, /* Resource deadlock would occur */ + UAENAMETOOLONG = 0x2f6df23, /* File name too long */ + UAENOLCK = 0x2f6df24, /* No record locks available */ + UAENOSYS = 0x2f6df25, /* Function not implemented */ + UAENOTEMPTY = 0x2f6df26, /* Directory not empty */ + UAELOOP = 0x2f6df27, /* Too many symbolic links encountered */ + UAEWOULDBLOCK = 0x2f6df28, /* Operation would block */ + UAENOMSG = 0x2f6df29, /* No message of desired type */ + UAEIDRM = 0x2f6df2a, /* Identifier removed */ + UAECHRNG = 0x2f6df2b, /* Channel number out of range */ + UAEL2NSYNC = 0x2f6df2c, /* Level 2 not synchronized */ + UAEL3HLT = 0x2f6df2d, /* Level 3 halted */ + UAEL3RST = 0x2f6df2e, /* Level 3 reset */ + UAELNRNG = 0x2f6df2f, /* Link number out of range */ + UAEUNATCH = 0x2f6df30, /* Protocol driver not attached */ + UAENOCSI = 0x2f6df31, /* No CSI structure available */ + UAEL2HLT = 0x2f6df32, /* Level 2 halted */ + UAEBADE = 0x2f6df33, /* Invalid exchange */ + UAEBADR = 0x2f6df34, /* Invalid request descriptor */ + UAEXFULL = 0x2f6df35, /* Exchange full */ + UAENOANO = 0x2f6df36, /* No anode */ + UAEBADRQC = 0x2f6df37, /* Invalid request code */ + UAEBADSLT = 0x2f6df38, /* Invalid slot */ + UAEBFONT = 0x2f6df39, /* Bad font file format */ + UAENOSTR = 0x2f6df3a, /* Device not a stream */ + UAENODATA = 0x2f6df3b, /* No data available */ + UAETIME = 0x2f6df3c, /* Timer expired */ + UAENOSR = 0x2f6df3d, /* Out of streams resources */ + UAENONET = 0x2f6df3e, /* Machine is not on the network */ + UAENOPKG = 0x2f6df3f, /* Package not installed */ + UAEREMOTE = 0x2f6df40, /* Object is remote */ + UAENOLINK = 0x2f6df41, /* Link has been severed */ + UAEADV = 0x2f6df42, /* Advertise error */ + UAESRMNT = 0x2f6df43, /* Srmount error */ + UAECOMM = 0x2f6df44, /* Communication error on send */ + UAEPROTO = 0x2f6df45, /* Protocol error */ + UAEMULTIHOP = 0x2f6df46, /* Multihop attempted */ + UAEDOTDOT = 0x2f6df47, /* RFS specific error */ + UAEBADMSG = 0x2f6df48, /* Not a data message */ + UAEOVERFLOW = 0x2f6df49, /* Value too large for defined data type */ + UAENOTUNIQ = 0x2f6df4a, /* Name not unique on network */ + UAEBADFD = 0x2f6df4b, /* File descriptor in bad state */ + UAEREMCHG = 0x2f6df4c, /* Remote address changed */ + UAELIBACC = 0x2f6df4d, /* Can not access a needed shared library */ + UAELIBBAD = 0x2f6df4e, /* Accessing a corrupted shared library */ + UAELIBSCN = 0x2f6df4f, /* .lib section in a.out corrupted */ + UAELIBMAX = 0x2f6df50, /* Attempting to link in too many shared libraries */ + UAELIBEXEC = 0x2f6df51, /* Cannot exec a shared library directly */ + UAEILSEQ = 0x2f6df52, /* Illegal byte sequence */ + UAERESTART = 0x2f6df53, /* Interrupted system call should be restarted */ + UAESTRPIPE = 0x2f6df54, /* Streams pipe error */ + UAEUSERS = 0x2f6df55, /* Too many users */ + UAENOTSOCK = 0x2f6df56, /* Socket operation on non-socket */ + UAEDESTADDRREQ = 0x2f6df57, /* Destination address required */ + UAEMSGSIZE = 0x2f6df58, /* Message too long */ + UAEPROTOTYPE = 0x2f6df59, /* Protocol wrong type for socket */ + UAENOPROTOOPT = 0x2f6df5a, /* Protocol not available */ + UAEPROTONOSUPPORT = 0x2f6df5b, /* Protocol not supported */ + UAESOCKTNOSUPPORT = 0x2f6df5c, /* Socket type not supported */ + UAEOPNOTSUPP = 0x2f6df5d, /* Operation not supported on transport endpoint */ + UAEPFNOSUPPORT = 0x2f6df5e, /* Protocol family not supported */ + UAEAFNOSUPPORT = 0x2f6df5f, /* Address family not supported by protocol */ + UAEADDRINUSE = 0x2f6df60, /* Address already in use */ + UAEADDRNOTAVAIL = 0x2f6df61, /* Cannot assign requested address */ + UAENETDOWN = 0x2f6df62, /* Network is down */ + UAENETUNREACH = 0x2f6df63, /* Network is unreachable */ + UAENETRESET = 0x2f6df64, /* Network dropped connection because of reset */ + UAECONNABORTED = 0x2f6df65, /* Software caused connection abort */ + UAECONNRESET = 0x2f6df66, /* Connection reset by peer */ + UAENOBUFS = 0x2f6df67, /* No buffer space available */ + UAEISCONN = 0x2f6df68, /* Transport endpoint is already connected */ + UAENOTCONN = 0x2f6df69, /* Transport endpoint is not connected */ + UAESHUTDOWN = 0x2f6df6a, /* Cannot send after transport endpoint shutdown */ + UAETOOMANYREFS = 0x2f6df6b, /* Too many references: cannot splice */ + UAETIMEDOUT = 0x2f6df6c, /* Connection timed out */ + UAECONNREFUSED = 0x2f6df6d, /* Connection refused */ + UAEHOSTDOWN = 0x2f6df6e, /* Host is down */ + UAEHOSTUNREACH = 0x2f6df6f, /* No route to host */ + UAEALREADY = 0x2f6df70, /* Operation already in progress */ + UAEINPROGRESS = 0x2f6df71, /* Operation now in progress */ + UAESTALE = 0x2f6df72, /* Stale NFS file handle */ + UAEUCLEAN = 0x2f6df73, /* Structure needs cleaning */ + UAENOTNAM = 0x2f6df74, /* Not a XENIX named type file */ + UAENAVAIL = 0x2f6df75, /* No XENIX semaphores available */ + UAEISNAM = 0x2f6df76, /* Is a named type file */ + UAEREMOTEIO = 0x2f6df77, /* Remote I/O error */ + UAEDQUOT = 0x2f6df78, /* Quota exceeded */ + UAENOMEDIUM = 0x2f6df79, /* No medium found */ + UAEMEDIUMTYPE = 0x2f6df7a, /* Wrong medium type */ +}; diff --git a/fs/afs/protocol_yfs.h b/fs/afs/protocol_yfs.h new file mode 100644 index 000000000000..b2f06c1917c2 --- /dev/null +++ b/fs/afs/protocol_yfs.h @@ -0,0 +1,179 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* YFS protocol bits + * + * Copyright (C) 2018 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#define YFS_FS_SERVICE 2500 +#define YFS_CM_SERVICE 2501 + +#define YFSCBMAX 1024 + +enum YFS_CM_Operations { + YFSCBProbe = 206, /* probe client */ + YFSCBGetLock = 207, /* get contents of CM lock table */ + YFSCBXStatsVersion = 209, /* get version of extended statistics */ + YFSCBGetXStats = 210, /* get contents of extended statistics data */ + YFSCBInitCallBackState3 = 213, /* initialise callback state, version 3 */ + YFSCBProbeUuid = 214, /* check the client hasn't rebooted */ + YFSCBGetServerPrefs = 215, + YFSCBGetCellServDV = 216, + YFSCBGetLocalCell = 217, + YFSCBGetCacheConfig = 218, + YFSCBGetCellByNum = 65537, + YFSCBTellMeAboutYourself = 65538, /* get client capabilities */ + YFSCBCallBack = 64204, +}; + +enum YFS_FS_Operations { + YFSFETCHACL = 64131, /* YFS Fetch file AFS3 ACL */ + YFSFETCHSTATUS = 64132, /* YFS Fetch file status */ + YFSSTOREACL = 64134, /* YFS Store file AFS3 ACL */ + YFSSTORESTATUS = 64135, /* YFS Store file status */ + YFSREMOVEFILE = 64136, /* YFS Remove a file */ + YFSCREATEFILE = 64137, /* YFS Create a file */ + YFSRENAME = 64138, /* YFS Rename or move a file or directory */ + YFSSYMLINK = 64139, /* YFS Create a symbolic link */ + YFSLINK = 64140, /* YFS Create a hard link */ + YFSMAKEDIR = 64141, /* YFS Create a directory */ + YFSREMOVEDIR = 64142, /* YFS Remove a directory */ + YFSGETVOLUMESTATUS = 64149, /* YFS Get volume status information */ + YFSSETVOLUMESTATUS = 64150, /* YFS Set volume status information */ + YFSSETLOCK = 64156, /* YFS Request a file lock */ + YFSEXTENDLOCK = 64157, /* YFS Extend a file lock */ + YFSRELEASELOCK = 64158, /* YFS Release a file lock */ + YFSLOOKUP = 64161, /* YFS lookup file in directory */ + YFSFLUSHCPS = 64165, + YFSFETCHOPAQUEACL = 64168, /* YFS Fetch file YFS ACL */ + YFSWHOAMI = 64170, + YFSREMOVEACL = 64171, + YFSREMOVEFILE2 = 64173, + YFSSTOREOPAQUEACL2 = 64174, + YFSRENAME_REPLACE = 64176, + YFSRENAME_NOREPLACE = 64177, + YFSRENAME_EXCHANGE = 64187, + YFSINLINEBULKSTATUS = 64536, /* YFS Fetch multiple file statuses with errors */ + YFSFETCHDATA64 = 64537, /* YFS Fetch file data */ + YFSSTOREDATA64 = 64538, /* YFS Store file data */ + YFSUPDATESYMLINK = 64540, +}; + +struct yfs_xdr_u64 { + __be32 msw; + __be32 lsw; +} __packed; + +static inline u64 xdr_to_u64(const struct yfs_xdr_u64 x) +{ + return ((u64)ntohl(x.msw) << 32) | ntohl(x.lsw); +} + +static inline struct yfs_xdr_u64 u64_to_xdr(const u64 x) +{ + return (struct yfs_xdr_u64){ .msw = htonl(x >> 32), .lsw = htonl(x) }; +} + +struct yfs_xdr_vnode { + struct yfs_xdr_u64 lo; + __be32 hi; + __be32 unique; +} __packed; + +struct yfs_xdr_YFSFid { + struct yfs_xdr_u64 volume; + struct yfs_xdr_vnode vnode; +} __packed; + + +struct yfs_xdr_YFSFetchStatus { + __be32 type; + __be32 nlink; + struct yfs_xdr_u64 size; + struct yfs_xdr_u64 data_version; + struct yfs_xdr_u64 author; + struct yfs_xdr_u64 owner; + struct yfs_xdr_u64 group; + __be32 mode; + __be32 caller_access; + __be32 anon_access; + struct yfs_xdr_vnode parent; + __be32 data_access_protocol; + struct yfs_xdr_u64 mtime_client; + struct yfs_xdr_u64 mtime_server; + __be32 lock_count; + __be32 abort_code; +} __packed; + +struct yfs_xdr_YFSCallBack { + __be32 version; + struct yfs_xdr_u64 expiration_time; + __be32 type; +} __packed; + +struct yfs_xdr_YFSStoreStatus { + __be32 mask; + __be32 mode; + struct yfs_xdr_u64 mtime_client; + struct yfs_xdr_u64 owner; + struct yfs_xdr_u64 group; +} __packed; + +struct yfs_xdr_RPCFlags { + __be32 rpc_flags; +} __packed; + +struct yfs_xdr_YFSVolSync { + struct yfs_xdr_u64 vol_creation_date; + struct yfs_xdr_u64 vol_update_date; + struct yfs_xdr_u64 max_quota; + struct yfs_xdr_u64 blocks_in_use; + struct yfs_xdr_u64 blocks_avail; +} __packed; + +enum yfs_volume_type { + yfs_volume_type_ro = 0, + yfs_volume_type_rw = 1, +}; + +#define yfs_FVSOnline 0x1 +#define yfs_FVSInservice 0x2 +#define yfs_FVSBlessed 0x4 +#define yfs_FVSNeedsSalvage 0x8 + +struct yfs_xdr_YFSFetchVolumeStatus { + struct yfs_xdr_u64 vid; + struct yfs_xdr_u64 parent_id; + __be32 flags; + __be32 type; + struct yfs_xdr_u64 max_quota; + struct yfs_xdr_u64 blocks_in_use; + struct yfs_xdr_u64 part_blocks_avail; + struct yfs_xdr_u64 part_max_blocks; + struct yfs_xdr_u64 vol_copy_date; + struct yfs_xdr_u64 vol_backup_date; +} __packed; + +struct yfs_xdr_YFSStoreVolumeStatus { + __be32 mask; + struct yfs_xdr_u64 min_quota; + struct yfs_xdr_u64 max_quota; + struct yfs_xdr_u64 file_quota; +} __packed; + +enum yfs_lock_type { + yfs_LockNone = -1, + yfs_LockRead = 0, + yfs_LockWrite = 1, + yfs_LockExtend = 2, + yfs_LockRelease = 3, + yfs_LockMandatoryRead = 0x100, + yfs_LockMandatoryWrite = 0x101, + yfs_LockMandatoryExtend = 0x102, +}; + +/* RXYFS Viced Capability Flags */ +#define YFS_VICED_CAPABILITY_ERRORTRANS 0x0001 /* Deprecated v0.195 */ +#define YFS_VICED_CAPABILITY_64BITFILES 0x0002 /* Deprecated v0.195 */ +#define YFS_VICED_CAPABILITY_WRITELOCKACL 0x0004 /* Can lock a file even without lock perm */ +#define YFS_VICED_CAPABILITY_SANEACLS 0x0008 /* Deprecated v0.195 */ diff --git a/fs/afs/rotate.c b/fs/afs/rotate.c new file mode 100644 index 000000000000..6a4e7da10fc4 --- /dev/null +++ b/fs/afs/rotate.c @@ -0,0 +1,763 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* Handle fileserver selection and rotation. + * + * Copyright (C) 2017 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#include <linux/kernel.h> +#include <linux/slab.h> +#include <linux/fs.h> +#include <linux/sched.h> +#include <linux/delay.h> +#include <linux/sched/signal.h> +#include "internal.h" +#include "afs_fs.h" +#include "protocol_uae.h" + +void afs_clear_server_states(struct afs_operation *op) +{ + unsigned int i; + + if (op->server_states) { + for (i = 0; i < op->server_list->nr_servers; i++) + afs_put_endpoint_state(op->server_states[i].endpoint_state, + afs_estate_trace_put_server_state); + kfree(op->server_states); + } +} + +/* + * Begin iteration through a server list, starting with the vnode's last used + * server if possible, or the last recorded good server if not. + */ +static bool afs_start_fs_iteration(struct afs_operation *op, + struct afs_vnode *vnode) +{ + struct afs_server *server; + void *cb_server; + int i; + + trace_afs_rotate(op, afs_rotate_trace_start, 0); + + read_lock(&op->volume->servers_lock); + op->server_list = afs_get_serverlist( + rcu_dereference_protected(op->volume->servers, + lockdep_is_held(&op->volume->servers_lock))); + read_unlock(&op->volume->servers_lock); + + op->server_states = kcalloc(op->server_list->nr_servers, sizeof(op->server_states[0]), + GFP_KERNEL); + if (!op->server_states) { + afs_op_nomem(op); + trace_afs_rotate(op, afs_rotate_trace_nomem, 0); + return false; + } + + rcu_read_lock(); + for (i = 0; i < op->server_list->nr_servers; i++) { + struct afs_endpoint_state *estate; + struct afs_server_state *s = &op->server_states[i]; + + server = op->server_list->servers[i].server; + estate = rcu_dereference(server->endpoint_state); + s->endpoint_state = afs_get_endpoint_state(estate, + afs_estate_trace_get_server_state); + s->probe_seq = estate->probe_seq; + s->untried_addrs = (1UL << estate->addresses->nr_addrs) - 1; + init_waitqueue_entry(&s->probe_waiter, current); + afs_get_address_preferences(op->net, estate->addresses); + } + rcu_read_unlock(); + + + op->untried_servers = (1UL << op->server_list->nr_servers) - 1; + op->server_index = -1; + + cb_server = vnode->cb_server; + if (cb_server) { + /* See if the vnode's preferred record is still available */ + for (i = 0; i < op->server_list->nr_servers; i++) { + server = op->server_list->servers[i].server; + if (server == cb_server) { + op->server_index = i; + goto found_interest; + } + } + + /* If we have a lock outstanding on a server that's no longer + * serving this vnode, then we can't switch to another server + * and have to return an error. + */ + if (op->flags & AFS_OPERATION_CUR_ONLY) { + afs_op_set_error(op, -ESTALE); + trace_afs_rotate(op, afs_rotate_trace_stale_lock, 0); + return false; + } + + /* Note that the callback promise is effectively broken */ + write_seqlock(&vnode->cb_lock); + ASSERTCMP(cb_server, ==, vnode->cb_server); + vnode->cb_server = NULL; + if (afs_clear_cb_promise(vnode, afs_cb_promise_clear_rotate_server)) + vnode->cb_break++; + write_sequnlock(&vnode->cb_lock); + } + +found_interest: + return true; +} + +/* + * Post volume busy note. + */ +static void afs_busy(struct afs_operation *op, u32 abort_code) +{ + const char *m; + + switch (abort_code) { + case VOFFLINE: m = "offline"; break; + case VRESTARTING: m = "restarting"; break; + case VSALVAGING: m = "being salvaged"; break; + default: m = "busy"; break; + } + + pr_notice("kAFS: Volume %llu '%s' on server %pU is %s\n", + op->volume->vid, op->volume->name, &op->server->uuid, m); +} + +/* + * Sleep and retry the operation to the same fileserver. + */ +static bool afs_sleep_and_retry(struct afs_operation *op) +{ + trace_afs_rotate(op, afs_rotate_trace_busy_sleep, 0); + if (!(op->flags & AFS_OPERATION_UNINTR)) { + msleep_interruptible(1000); + if (signal_pending(current)) { + afs_op_set_error(op, -ERESTARTSYS); + return false; + } + } else { + msleep(1000); + } + + return true; +} + +/* + * Select the fileserver to use. May be called multiple times to rotate + * through the fileservers. + */ +bool afs_select_fileserver(struct afs_operation *op) +{ + struct afs_addr_list *alist; + struct afs_server *server; + struct afs_vnode *vnode = op->file[0].vnode; + unsigned long set, failed; + s32 abort_code = op->call_abort_code; + int best_prio = 0; + int error = op->call_error, addr_index, i, j; + + op->nr_iterations++; + + _enter("OP=%x+%x,%llx,%u{%lx},%u{%lx},%d,%d", + op->debug_id, op->nr_iterations, op->volume->vid, + op->server_index, op->untried_servers, + op->addr_index, op->addr_tried, + error, abort_code); + + if (op->flags & AFS_OPERATION_STOP) { + trace_afs_rotate(op, afs_rotate_trace_stopped, 0); + _leave(" = f [stopped]"); + return false; + } + + if (op->nr_iterations == 0) + goto start; + + WRITE_ONCE(op->estate->addresses->addrs[op->addr_index].last_error, error); + trace_afs_rotate(op, afs_rotate_trace_iter, op->call_error); + + /* Evaluate the result of the previous operation, if there was one. */ + switch (op->call_error) { + case 0: + clear_bit(AFS_SE_VOLUME_OFFLINE, + &op->server_list->servers[op->server_index].flags); + clear_bit(AFS_SE_VOLUME_BUSY, + &op->server_list->servers[op->server_index].flags); + op->cumul_error.responded = true; + + /* We succeeded, but we may need to redo the op from another + * server if we're looking at a set of RO volumes where some of + * the servers have not yet been brought up to date lest we + * regress the data. We only switch to the new version once + * >=50% of the servers are updated. + */ + error = afs_update_volume_state(op); + if (error != 0) { + if (error == 1) { + afs_sleep_and_retry(op); + goto restart_from_beginning; + } + afs_op_set_error(op, error); + goto failed; + } + fallthrough; + default: + /* Success or local failure. Stop. */ + afs_op_set_error(op, error); + op->flags |= AFS_OPERATION_STOP; + trace_afs_rotate(op, afs_rotate_trace_stop, error); + _leave(" = f [okay/local %d]", error); + return false; + + case -ECONNABORTED: + /* The far side rejected the operation on some grounds. This + * might involve the server being busy or the volume having been moved. + * + * Note that various V* errors should not be sent to a cache manager + * by a fileserver as they should be translated to more modern UAE* + * errors instead. IBM AFS and OpenAFS fileservers, however, do leak + * these abort codes. + */ + trace_afs_rotate(op, afs_rotate_trace_aborted, abort_code); + op->cumul_error.responded = true; + switch (abort_code) { + case VNOVOL: + /* This fileserver doesn't know about the volume. + * - May indicate that the VL is wrong - retry once and compare + * the results. + * - May indicate that the fileserver couldn't attach to the vol. + * - The volume might have been temporarily removed so that it can + * be replaced by a volume restore. "vos" might have ended one + * transaction and has yet to create the next. + * - The volume might not be blessed or might not be in-service + * (administrative action). + */ + if (op->flags & AFS_OPERATION_VNOVOL) { + afs_op_accumulate_error(op, -EREMOTEIO, abort_code); + goto next_server; + } + + write_lock(&op->volume->servers_lock); + op->server_list->vnovol_mask |= 1 << op->server_index; + write_unlock(&op->volume->servers_lock); + + set_bit(AFS_VOLUME_NEEDS_UPDATE, &op->volume->flags); + error = afs_check_volume_status(op->volume, op); + if (error < 0) { + afs_op_set_error(op, error); + goto failed; + } + + if (test_bit(AFS_VOLUME_DELETED, &op->volume->flags)) { + afs_op_set_error(op, -ENOMEDIUM); + goto failed; + } + + /* If the server list didn't change, then assume that + * it's the fileserver having trouble. + */ + if (rcu_access_pointer(op->volume->servers) == op->server_list) { + afs_op_accumulate_error(op, -EREMOTEIO, abort_code); + goto next_server; + } + + /* Try again */ + op->flags |= AFS_OPERATION_VNOVOL; + _leave(" = t [vnovol]"); + return true; + + case VVOLEXISTS: + case VONLINE: + /* These should not be returned from the fileserver. */ + pr_warn("Fileserver returned unexpected abort %d\n", + abort_code); + afs_op_accumulate_error(op, -EREMOTEIO, abort_code); + goto next_server; + + case VNOSERVICE: + /* Prior to AFS 3.2 VNOSERVICE was returned from the fileserver + * if the volume was neither in-service nor administratively + * blessed. All usage was replaced by VNOVOL because AFS 3.1 and + * earlier cache managers did not handle VNOSERVICE and assumed + * it was the client OSes errno 105. + * + * Starting with OpenAFS 1.4.8 VNOSERVICE was repurposed as the + * fileserver idle dead time error which was sent in place of + * RX_CALL_TIMEOUT (-3). The error was intended to be sent if the + * fileserver took too long to send a reply to the client. + * RX_CALL_TIMEOUT would have caused the cache manager to mark the + * server down whereas VNOSERVICE since AFS 3.2 would cause cache + * manager to temporarily (up to 15 minutes) mark the volume + * instance as unusable. + * + * The idle dead logic resulted in cache inconsistency since a + * state changing call that the cache manager assumed was dead + * could still be processed to completion by the fileserver. This + * logic was removed in OpenAFS 1.8.0 and VNOSERVICE is no longer + * returned. However, many 1.4.8 through 1.6.24 fileservers are + * still in existence. + * + * AuriStorFS fileservers have never returned VNOSERVICE. + * + * VNOSERVICE should be treated as an alias for RX_CALL_TIMEOUT. + */ + case RX_CALL_TIMEOUT: + afs_op_accumulate_error(op, -ETIMEDOUT, abort_code); + goto next_server; + + case VSALVAGING: /* This error should not be leaked to cache managers + * but is from OpenAFS demand attach fileservers. + * It should be treated as an alias for VOFFLINE. + */ + case VSALVAGE: /* VSALVAGE should be treated as a synonym of VOFFLINE */ + case VOFFLINE: + /* The volume is in use by the volserver or another volume utility + * for an operation that might alter the contents. The volume is + * expected to come back but it might take a long time (could be + * days). + */ + if (!test_and_set_bit(AFS_SE_VOLUME_OFFLINE, + &op->server_list->servers[op->server_index].flags)) { + afs_busy(op, abort_code); + clear_bit(AFS_SE_VOLUME_BUSY, + &op->server_list->servers[op->server_index].flags); + } + if (op->flags & AFS_OPERATION_NO_VSLEEP) { + afs_op_set_error(op, -EADV); + goto failed; + } + goto busy; + + case VRESTARTING: /* The fileserver is either shutting down or starting up. */ + case VBUSY: + /* The volume is in use by the volserver or another volume + * utility for an operation that is not expected to alter the + * contents of the volume. VBUSY does not need to be returned + * for a ROVOL or BACKVOL bound to an ITBusy volserver + * transaction. The fileserver is permitted to continue serving + * content from ROVOLs and BACKVOLs during an ITBusy transaction + * because the content will not change. However, many fileserver + * releases do return VBUSY for ROVOL and BACKVOL instances under + * many circumstances. + * + * Retry after going round all the servers unless we have a file + * lock we need to maintain. + */ + if (op->flags & AFS_OPERATION_NO_VSLEEP) { + afs_op_set_error(op, -EBUSY); + goto failed; + } + if (!test_and_set_bit(AFS_SE_VOLUME_BUSY, + &op->server_list->servers[op->server_index].flags)) { + afs_busy(op, abort_code); + clear_bit(AFS_SE_VOLUME_OFFLINE, + &op->server_list->servers[op->server_index].flags); + } + busy: + if (op->flags & AFS_OPERATION_CUR_ONLY) { + if (!afs_sleep_and_retry(op)) + goto failed; + + /* Retry with same server & address */ + _leave(" = t [vbusy]"); + return true; + } + + op->flags |= AFS_OPERATION_VBUSY; + goto next_server; + + case VMOVED: + /* The volume migrated to another server. We consider + * consider all locks and callbacks broken and request + * an update from the VLDB. + * + * We also limit the number of VMOVED hops we will + * honour, just in case someone sets up a loop. + */ + if (op->flags & AFS_OPERATION_VMOVED) { + afs_op_set_error(op, -EREMOTEIO); + goto failed; + } + op->flags |= AFS_OPERATION_VMOVED; + + set_bit(AFS_VOLUME_WAIT, &op->volume->flags); + set_bit(AFS_VOLUME_NEEDS_UPDATE, &op->volume->flags); + error = afs_check_volume_status(op->volume, op); + if (error < 0) { + afs_op_set_error(op, error); + goto failed; + } + + /* If the server list didn't change, then the VLDB is + * out of sync with the fileservers. This is hopefully + * a temporary condition, however, so we don't want to + * permanently block access to the file. + * + * TODO: Try other fileservers if we can. + * + * TODO: Retry a few times with sleeps. + */ + if (rcu_access_pointer(op->volume->servers) == op->server_list) { + afs_op_accumulate_error(op, -ENOMEDIUM, abort_code); + goto failed; + } + + goto restart_from_beginning; + + case UAEIO: + case VIO: + afs_op_accumulate_error(op, -EREMOTEIO, abort_code); + if (op->volume->type != AFSVL_RWVOL) + goto next_server; + goto failed; + + case VDISKFULL: + case UAENOSPC: + /* The partition is full. Only applies to RWVOLs. + * Translate locally and return ENOSPC. + * No replicas to failover to. + */ + afs_op_set_error(op, -ENOSPC); + goto failed_but_online; + + case VOVERQUOTA: + case UAEDQUOT: + /* Volume is full. Only applies to RWVOLs. + * Translate locally and return EDQUOT. + * No replicas to failover to. + */ + afs_op_set_error(op, -EDQUOT); + goto failed_but_online; + + case RX_INVALID_OPERATION: + case RXGEN_OPCODE: + /* Handle downgrading to an older operation. */ + afs_op_set_error(op, -ENOTSUPP); + if (op->flags & AFS_OPERATION_DOWNGRADE) { + op->flags &= ~AFS_OPERATION_DOWNGRADE; + goto go_again; + } + goto failed_but_online; + + default: + afs_op_accumulate_error(op, error, abort_code); + failed_but_online: + clear_bit(AFS_SE_VOLUME_OFFLINE, + &op->server_list->servers[op->server_index].flags); + clear_bit(AFS_SE_VOLUME_BUSY, + &op->server_list->servers[op->server_index].flags); + goto failed; + } + + case -ETIMEDOUT: + case -ETIME: + if (afs_op_error(op) != -EDESTADDRREQ) + goto iterate_address; + fallthrough; + case -ERFKILL: + case -EADDRNOTAVAIL: + case -ENETUNREACH: + case -EHOSTUNREACH: + case -EHOSTDOWN: + case -ECONNREFUSED: + _debug("no conn"); + afs_op_accumulate_error(op, error, 0); + goto iterate_address; + + case -ENETRESET: + pr_warn("kAFS: Peer reset %s (op=%x)\n", + op->type ? op->type->name : "???", op->debug_id); + fallthrough; + case -ECONNRESET: + _debug("call reset"); + afs_op_set_error(op, error); + goto failed; + } + +restart_from_beginning: + trace_afs_rotate(op, afs_rotate_trace_restart, 0); + _debug("restart"); + op->estate = NULL; + op->server = NULL; + afs_clear_server_states(op); + op->server_states = NULL; + afs_put_serverlist(op->net, op->server_list); + op->server_list = NULL; +start: + _debug("start"); + ASSERTCMP(op->estate, ==, NULL); + /* See if we need to do an update of the volume record. Note that the + * volume may have moved or even have been deleted. + */ + error = afs_check_volume_status(op->volume, op); + trace_afs_rotate(op, afs_rotate_trace_check_vol_status, error); + if (error < 0) { + afs_op_set_error(op, error); + goto failed; + } + + if (!afs_start_fs_iteration(op, vnode)) + goto failed; + + _debug("__ VOL %llx __", op->volume->vid); + +pick_server: + _debug("pick [%lx]", op->untried_servers); + ASSERTCMP(op->estate, ==, NULL); + + error = afs_wait_for_fs_probes(op, op->server_states, + !(op->flags & AFS_OPERATION_UNINTR)); + switch (error) { + case 0: /* No untried responsive servers and no outstanding probes */ + trace_afs_rotate(op, afs_rotate_trace_probe_none, 0); + goto no_more_servers; + case 1: /* Got a response */ + trace_afs_rotate(op, afs_rotate_trace_probe_response, 0); + break; + case 2: /* Probe data superseded */ + trace_afs_rotate(op, afs_rotate_trace_probe_superseded, 0); + goto restart_from_beginning; + default: + trace_afs_rotate(op, afs_rotate_trace_probe_error, error); + afs_op_set_error(op, error); + goto failed; + } + + /* Pick the untried server with the highest priority untried endpoint. + * If we have outstanding callbacks, we stick with the server we're + * already using if we can. + */ + if (op->server) { + _debug("server %u", op->server_index); + if (test_bit(op->server_index, &op->untried_servers)) + goto selected_server; + op->server = NULL; + _debug("no server"); + } + + rcu_read_lock(); + op->server_index = -1; + best_prio = -1; + for (i = 0; i < op->server_list->nr_servers; i++) { + struct afs_endpoint_state *es; + struct afs_server_entry *se = &op->server_list->servers[i]; + struct afs_addr_list *sal; + struct afs_server *s = se->server; + + if (!test_bit(i, &op->untried_servers) || + test_bit(AFS_SE_EXCLUDED, &se->flags) || + !test_bit(AFS_SERVER_FL_RESPONDING, &s->flags)) + continue; + es = op->server_states[i].endpoint_state; + sal = es->addresses; + + afs_get_address_preferences_rcu(op->net, sal); + for (j = 0; j < sal->nr_addrs; j++) { + if (es->failed_set & (1 << j)) + continue; + if (!sal->addrs[j].peer) + continue; + if (sal->addrs[j].prio > best_prio) { + op->server_index = i; + best_prio = sal->addrs[j].prio; + } + } + } + rcu_read_unlock(); + + if (op->server_index == -1) + goto no_more_servers; + +selected_server: + trace_afs_rotate(op, afs_rotate_trace_selected_server, best_prio); + _debug("use %d prio %u", op->server_index, best_prio); + __clear_bit(op->server_index, &op->untried_servers); + + /* We're starting on a different fileserver from the list. We need to + * check it, create a callback intercept, find its address list and + * probe its capabilities before we use it. + */ + ASSERTCMP(op->estate, ==, NULL); + server = op->server_list->servers[op->server_index].server; + + if (!afs_check_server_record(op, server, op->key)) + goto failed; + + _debug("USING SERVER: %pU", &server->uuid); + + op->flags |= AFS_OPERATION_RETRY_SERVER; + op->server = server; + if (vnode->cb_server != server) { + vnode->cb_server = server; + vnode->cb_v_check = atomic_read(&vnode->volume->cb_v_break); + afs_clear_cb_promise(vnode, afs_cb_promise_clear_server_change); + } + +retry_server: + op->addr_tried = 0; + op->addr_index = -1; + +iterate_address: + /* Iterate over the current server's address list to try and find an + * address on which it will respond to us. + */ + op->estate = op->server_states[op->server_index].endpoint_state; + set = READ_ONCE(op->estate->responsive_set); + failed = READ_ONCE(op->estate->failed_set); + _debug("iterate ES=%x rs=%lx fs=%lx", op->estate->probe_seq, set, failed); + set &= ~(failed | op->addr_tried); + trace_afs_rotate(op, afs_rotate_trace_iterate_addr, set); + if (!set) + goto wait_for_more_probe_results; + + alist = op->estate->addresses; + best_prio = -1; + addr_index = 0; + for (i = 0; i < alist->nr_addrs; i++) { + if (!(set & (1 << i))) + continue; + if (alist->addrs[i].prio > best_prio) { + addr_index = i; + best_prio = alist->addrs[i].prio; + } + } + + alist->preferred = addr_index; + + op->addr_index = addr_index; + set_bit(addr_index, &op->addr_tried); + + _debug("address [%u] %u/%u %pISp", + op->server_index, addr_index, alist->nr_addrs, + rxrpc_kernel_remote_addr(alist->addrs[op->addr_index].peer)); +go_again: + op->volsync.creation = TIME64_MIN; + op->volsync.update = TIME64_MIN; + op->call_responded = false; + _leave(" = t"); + return true; + +wait_for_more_probe_results: + error = afs_wait_for_one_fs_probe(op->server, op->estate, op->addr_tried, + !(op->flags & AFS_OPERATION_UNINTR)); + if (error == 1) + goto iterate_address; + if (!error) + goto restart_from_beginning; + + /* We've now had a failure to respond on all of a server's addresses - + * immediately probe them again and consider retrying the server. + */ + trace_afs_rotate(op, afs_rotate_trace_probe_fileserver, 0); + afs_probe_fileserver(op->net, op->server); + if (op->flags & AFS_OPERATION_RETRY_SERVER) { + error = afs_wait_for_one_fs_probe(op->server, op->estate, op->addr_tried, + !(op->flags & AFS_OPERATION_UNINTR)); + switch (error) { + case 1: + op->flags &= ~AFS_OPERATION_RETRY_SERVER; + trace_afs_rotate(op, afs_rotate_trace_retry_server, 1); + goto retry_server; + case 0: + trace_afs_rotate(op, afs_rotate_trace_retry_server, 0); + goto restart_from_beginning; + case -ERESTARTSYS: + afs_op_set_error(op, error); + goto failed; + case -ETIME: + case -EDESTADDRREQ: + goto next_server; + } + } + +next_server: + trace_afs_rotate(op, afs_rotate_trace_next_server, 0); + _debug("next"); + op->estate = NULL; + goto pick_server; + +no_more_servers: + /* That's all the servers poked to no good effect. Try again if some + * of them were busy. + */ + trace_afs_rotate(op, afs_rotate_trace_no_more_servers, 0); + if (op->flags & AFS_OPERATION_VBUSY) { + afs_sleep_and_retry(op); + op->flags &= ~AFS_OPERATION_VBUSY; + goto restart_from_beginning; + } + + rcu_read_lock(); + for (i = 0; i < op->server_list->nr_servers; i++) { + struct afs_endpoint_state *estate; + + estate = op->server_states[i].endpoint_state; + error = READ_ONCE(estate->error); + if (error < 0) + afs_op_accumulate_error(op, error, estate->abort_code); + } + rcu_read_unlock(); + +failed: + trace_afs_rotate(op, afs_rotate_trace_failed, 0); + op->flags |= AFS_OPERATION_STOP; + op->estate = NULL; + _leave(" = f [failed %d]", afs_op_error(op)); + return false; +} + +/* + * Dump cursor state in the case of the error being EDESTADDRREQ. + */ +void afs_dump_edestaddrreq(const struct afs_operation *op) +{ + static int count; + int i; + + if (!IS_ENABLED(CONFIG_AFS_DEBUG_CURSOR) || count > 3) + return; + count++; + + rcu_read_lock(); + + pr_notice("EDESTADDR occurred\n"); + pr_notice("OP: cbb=%x cbb2=%x fl=%x err=%hd\n", + op->file[0].cb_break_before, + op->file[1].cb_break_before, op->flags, op->cumul_error.error); + pr_notice("OP: ut=%lx ix=%d ni=%u\n", + op->untried_servers, op->server_index, op->nr_iterations); + pr_notice("OP: call er=%d ac=%d r=%u\n", + op->call_error, op->call_abort_code, op->call_responded); + + if (op->server_list) { + const struct afs_server_list *sl = op->server_list; + + pr_notice("FC: SL nr=%u vnov=%hx\n", + sl->nr_servers, sl->vnovol_mask); + for (i = 0; i < sl->nr_servers; i++) { + const struct afs_server *s = sl->servers[i].server; + const struct afs_endpoint_state *e = + rcu_dereference(s->endpoint_state); + const struct afs_addr_list *a = e->addresses; + + pr_notice("FC: server fl=%lx av=%u %pU\n", + s->flags, s->addr_version, &s->uuid); + pr_notice("FC: - pq=%x R=%lx F=%lx\n", + e->probe_seq, e->responsive_set, e->failed_set); + if (a) { + pr_notice("FC: - av=%u nr=%u/%u/%u pr=%u\n", + a->version, + a->nr_ipv4, a->nr_addrs, a->max_addrs, + a->preferred); + if (a == e->addresses) + pr_notice("FC: - current\n"); + } + } + } + + pr_notice("AC: t=%lx ax=%d\n", op->addr_tried, op->addr_index); + rcu_read_unlock(); +} diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index 8ad8c2a0703a..bf0e4ea0aafd 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -1,68 +1,51 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* Maintain an RxRPC server socket to do AFS communications through * * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. */ #include <linux/slab.h> +#include <linux/sched/signal.h> + #include <net/sock.h> #include <net/af_rxrpc.h> -#include <rxrpc/packet.h> #include "internal.h" #include "afs_cm.h" +#include "protocol_yfs.h" +#define RXRPC_TRACE_ONLY_DEFINE_ENUMS +#include <trace/events/rxrpc.h> -static struct socket *afs_socket; /* my RxRPC socket */ -static struct workqueue_struct *afs_async_calls; -static atomic_t afs_outstanding_calls; -static atomic_t afs_outstanding_skbs; +struct workqueue_struct *afs_async_calls; -static void afs_wake_up_call_waiter(struct afs_call *); -static int afs_wait_for_call_to_complete(struct afs_call *); -static void afs_wake_up_async_call(struct afs_call *); -static int afs_dont_wait_for_call_to_complete(struct afs_call *); +static void afs_deferred_free_worker(struct work_struct *work); +static void afs_wake_up_call_waiter(struct sock *, struct rxrpc_call *, unsigned long); +static void afs_wake_up_async_call(struct sock *, struct rxrpc_call *, unsigned long); static void afs_process_async_call(struct work_struct *); -static void afs_rx_interceptor(struct sock *, unsigned long, struct sk_buff *); -static int afs_deliver_cm_op_id(struct afs_call *, struct sk_buff *, bool); - -/* synchronous call management */ -const struct afs_wait_mode afs_sync_call = { - .rx_wakeup = afs_wake_up_call_waiter, - .wait = afs_wait_for_call_to_complete, -}; - -/* asynchronous call management */ -const struct afs_wait_mode afs_async_call = { - .rx_wakeup = afs_wake_up_async_call, - .wait = afs_dont_wait_for_call_to_complete, -}; - -/* asynchronous incoming call management */ -static const struct afs_wait_mode afs_async_incoming_call = { - .rx_wakeup = afs_wake_up_async_call, +static void afs_rx_new_call(struct sock *, struct rxrpc_call *, unsigned long); +static void afs_rx_discard_new_call(struct rxrpc_call *, unsigned long); +static void afs_rx_attach(struct rxrpc_call *rxcall, unsigned long user_call_ID); +static void afs_rx_notify_oob(struct sock *sk, struct sk_buff *oob); +static int afs_deliver_cm_op_id(struct afs_call *); + +static const struct rxrpc_kernel_ops afs_rxrpc_callback_ops = { + .notify_new_call = afs_rx_new_call, + .discard_new_call = afs_rx_discard_new_call, + .user_attach_call = afs_rx_attach, + .notify_oob = afs_rx_notify_oob, }; /* asynchronous incoming call initial processing */ static const struct afs_call_type afs_RXCMxxxx = { .name = "CB.xxxx", .deliver = afs_deliver_cm_op_id, - .abort_to_error = afs_abort_to_error, }; -static void afs_collect_incoming_call(struct work_struct *); - -static struct sk_buff_head afs_incoming_calls; -static DECLARE_WORK(afs_collect_incoming_call_work, afs_collect_incoming_call); - /* * open an RxRPC socket and bind it to be a server for callback notifications * - the socket is left in blocking mode and non-blocking ops use MSG_DONTWAIT */ -int afs_open_socket(void) +int afs_open_socket(struct afs_net *net) { struct sockaddr_rxrpc srx; struct socket *socket; @@ -70,157 +53,254 @@ int afs_open_socket(void) _enter(""); - skb_queue_head_init(&afs_incoming_calls); - - afs_async_calls = create_singlethread_workqueue("kafsd"); - if (!afs_async_calls) { - _leave(" = -ENOMEM [wq]"); - return -ENOMEM; - } - - ret = sock_create_kern(AF_RXRPC, SOCK_DGRAM, PF_INET, &socket); - if (ret < 0) { - destroy_workqueue(afs_async_calls); - _leave(" = %d [socket]", ret); - return ret; - } + ret = sock_create_kern(net->net, AF_RXRPC, SOCK_DGRAM, PF_INET6, &socket); + if (ret < 0) + goto error_1; socket->sk->sk_allocation = GFP_NOFS; + socket->sk->sk_user_data = net; /* bind the callback manager's address to make this a server socket */ + memset(&srx, 0, sizeof(srx)); srx.srx_family = AF_RXRPC; srx.srx_service = CM_SERVICE; srx.transport_type = SOCK_DGRAM; - srx.transport_len = sizeof(srx.transport.sin); - srx.transport.sin.sin_family = AF_INET; - srx.transport.sin.sin_port = htons(AFS_CM_PORT); - memset(&srx.transport.sin.sin_addr, 0, - sizeof(srx.transport.sin.sin_addr)); - - ret = kernel_bind(socket, (struct sockaddr *) &srx, sizeof(srx)); - if (ret < 0) { - sock_release(socket); - destroy_workqueue(afs_async_calls); - _leave(" = %d [bind]", ret); - return ret; + srx.transport_len = sizeof(srx.transport.sin6); + srx.transport.sin6.sin6_family = AF_INET6; + srx.transport.sin6.sin6_port = htons(AFS_CM_PORT); + + ret = rxrpc_sock_set_min_security_level(socket->sk, + RXRPC_SECURITY_ENCRYPT); + if (ret < 0) + goto error_2; + + ret = rxrpc_sock_set_manage_response(socket->sk, true); + if (ret < 0) + goto error_2; + + ret = afs_create_token_key(net, socket); + if (ret < 0) + pr_err("Couldn't create RxGK CM key: %d\n", ret); + + ret = kernel_bind(socket, (struct sockaddr_unsized *) &srx, sizeof(srx)); + if (ret == -EADDRINUSE) { + srx.transport.sin6.sin6_port = 0; + ret = kernel_bind(socket, (struct sockaddr_unsized *) &srx, sizeof(srx)); } + if (ret < 0) + goto error_2; + + srx.srx_service = YFS_CM_SERVICE; + ret = kernel_bind(socket, (struct sockaddr_unsized *) &srx, sizeof(srx)); + if (ret < 0) + goto error_2; + + /* Ideally, we'd turn on service upgrade here, but we can't because + * OpenAFS is buggy and leaks the userStatus field from packet to + * packet and between FS packets and CB packets - so if we try to do an + * upgrade on an FS packet, OpenAFS will leak that into the CB packet + * it sends back to us. + */ - rxrpc_kernel_intercept_rx_messages(socket, afs_rx_interceptor); + rxrpc_kernel_set_notifications(socket, &afs_rxrpc_callback_ops); - afs_socket = socket; + ret = kernel_listen(socket, INT_MAX); + if (ret < 0) + goto error_2; + + net->socket = socket; + afs_charge_preallocation(&net->charge_preallocation_work); _leave(" = 0"); return 0; + +error_2: + sock_release(socket); +error_1: + _leave(" = %d", ret); + return ret; } /* * close the RxRPC socket AFS was using */ -void afs_close_socket(void) +void afs_close_socket(struct afs_net *net) { _enter(""); - sock_release(afs_socket); + kernel_listen(net->socket, 0); + flush_workqueue(afs_async_calls); - _debug("dework"); - destroy_workqueue(afs_async_calls); + if (net->spare_incoming_call) { + afs_put_call(net->spare_incoming_call); + net->spare_incoming_call = NULL; + } + + _debug("outstanding %u", atomic_read(&net->nr_outstanding_calls)); + wait_var_event(&net->nr_outstanding_calls, + !atomic_read(&net->nr_outstanding_calls)); + _debug("no outstanding calls"); - ASSERTCMP(atomic_read(&afs_outstanding_skbs), ==, 0); - ASSERTCMP(atomic_read(&afs_outstanding_calls), ==, 0); + kernel_sock_shutdown(net->socket, SHUT_RDWR); + flush_workqueue(afs_async_calls); + net->socket->sk->sk_user_data = NULL; + sock_release(net->socket); + key_put(net->fs_cm_token_key); + + _debug("dework"); _leave(""); } /* - * note that the data in a socket buffer is now delivered and that the buffer - * should be freed + * Allocate a call. */ -static void afs_data_delivered(struct sk_buff *skb) +static struct afs_call *afs_alloc_call(struct afs_net *net, + const struct afs_call_type *type, + gfp_t gfp) { - if (!skb) { - _debug("DLVR NULL [%d]", atomic_read(&afs_outstanding_skbs)); - dump_stack(); - } else { - _debug("DLVR %p{%u} [%d]", - skb, skb->mark, atomic_read(&afs_outstanding_skbs)); - if (atomic_dec_return(&afs_outstanding_skbs) == -1) - BUG(); - rxrpc_kernel_data_delivered(skb); + struct afs_call *call; + int o; + + call = kzalloc(sizeof(*call), gfp); + if (!call) + return NULL; + + call->type = type; + call->net = net; + call->debug_id = atomic_inc_return(&rxrpc_debug_id); + refcount_set(&call->ref, 1); + INIT_WORK(&call->async_work, type->async_rx ?: afs_process_async_call); + INIT_WORK(&call->work, call->type->work); + INIT_WORK(&call->free_work, afs_deferred_free_worker); + init_waitqueue_head(&call->waitq); + spin_lock_init(&call->state_lock); + call->iter = &call->def_iter; + + o = atomic_inc_return(&net->nr_outstanding_calls); + trace_afs_call(call->debug_id, afs_call_trace_alloc, 1, o, + __builtin_return_address(0)); + return call; +} + +static void afs_free_call(struct afs_call *call) +{ + struct afs_net *net = call->net; + int o; + + ASSERT(!work_pending(&call->async_work)); + + rxrpc_kernel_put_peer(call->peer); + + if (call->rxcall) { + rxrpc_kernel_shutdown_call(net->socket, call->rxcall); + rxrpc_kernel_put_call(net->socket, call->rxcall); + call->rxcall = NULL; } + if (call->type->destructor) + call->type->destructor(call); + + afs_unuse_server_notime(call->net, call->server, afs_server_trace_unuse_call); + kfree(call->request); + + o = atomic_read(&net->nr_outstanding_calls); + trace_afs_call(call->debug_id, afs_call_trace_free, 0, o, + __builtin_return_address(0)); + kfree(call); + + o = atomic_dec_return(&net->nr_outstanding_calls); + if (o == 0) + wake_up_var(&net->nr_outstanding_calls); } /* - * free a socket buffer + * Dispose of a reference on a call. */ -static void afs_free_skb(struct sk_buff *skb) +void afs_put_call(struct afs_call *call) { - if (!skb) { - _debug("FREE NULL [%d]", atomic_read(&afs_outstanding_skbs)); - dump_stack(); - } else { - _debug("FREE %p{%u} [%d]", - skb, skb->mark, atomic_read(&afs_outstanding_skbs)); - if (atomic_dec_return(&afs_outstanding_skbs) == -1) - BUG(); - rxrpc_kernel_free_skb(skb); - } + struct afs_net *net = call->net; + unsigned int debug_id = call->debug_id; + bool zero; + int r, o; + + zero = __refcount_dec_and_test(&call->ref, &r); + o = atomic_read(&net->nr_outstanding_calls); + trace_afs_call(debug_id, afs_call_trace_put, r - 1, o, + __builtin_return_address(0)); + if (zero) + afs_free_call(call); +} + +static void afs_deferred_free_worker(struct work_struct *work) +{ + struct afs_call *call = container_of(work, struct afs_call, free_work); + + afs_free_call(call); } /* - * free a call + * Dispose of a reference on a call, deferring the cleanup to a workqueue + * to avoid lock recursion. */ -static void afs_free_call(struct afs_call *call) +void afs_deferred_put_call(struct afs_call *call) { - _debug("DONE %p{%s} [%d]", - call, call->type->name, atomic_read(&afs_outstanding_calls)); - if (atomic_dec_return(&afs_outstanding_calls) == -1) - BUG(); - - ASSERTCMP(call->rxcall, ==, NULL); - ASSERT(!work_pending(&call->async_work)); - ASSERT(skb_queue_empty(&call->rx_queue)); - ASSERT(call->type->name != NULL); + struct afs_net *net = call->net; + unsigned int debug_id = call->debug_id; + bool zero; + int r, o; + + zero = __refcount_dec_and_test(&call->ref, &r); + o = atomic_read(&net->nr_outstanding_calls); + trace_afs_call(debug_id, afs_call_trace_put, r - 1, o, + __builtin_return_address(0)); + if (zero) + schedule_work(&call->free_work); +} - kfree(call->request); - kfree(call); +/* + * Queue the call for actual work. + */ +static void afs_queue_call_work(struct afs_call *call) +{ + if (call->type->work) { + afs_get_call(call, afs_call_trace_work); + if (!queue_work(afs_wq, &call->work)) + afs_put_call(call); + } } /* * allocate a call with flat request and reply buffers */ -struct afs_call *afs_alloc_flat_call(const struct afs_call_type *type, - size_t request_size, size_t reply_size) +struct afs_call *afs_alloc_flat_call(struct afs_net *net, + const struct afs_call_type *type, + size_t request_size, size_t reply_max) { struct afs_call *call; - call = kzalloc(sizeof(*call), GFP_NOFS); + call = afs_alloc_call(net, type, GFP_NOFS); if (!call) goto nomem_call; - _debug("CALL %p{%s} [%d]", - call, type->name, atomic_read(&afs_outstanding_calls)); - atomic_inc(&afs_outstanding_calls); - - call->type = type; - call->request_size = request_size; - call->reply_max = reply_size; - if (request_size) { + call->request_size = request_size; call->request = kmalloc(request_size, GFP_NOFS); if (!call->request) goto nomem_free; } - if (reply_size) { - call->buffer = kmalloc(reply_size, GFP_NOFS); + if (reply_max) { + call->reply_max = reply_max; + call->buffer = kmalloc(reply_max, GFP_NOFS); if (!call->buffer) goto nomem_free; } + afs_extract_to_buf(call, call->reply_max); + call->operation_ID = type->op; init_waitqueue_head(&call->waitq); - skb_queue_head_init(&call->rx_queue); return call; nomem_free: - afs_free_call(call); + afs_put_call(call); nomem_call: return NULL; } @@ -239,114 +319,79 @@ void afs_flat_call_destructor(struct afs_call *call) } /* - * attach the data from a bunch of pages on an inode to a call + * Advance the AFS call state when the RxRPC call ends the transmit phase. */ -static int afs_send_pages(struct afs_call *call, struct msghdr *msg, - struct kvec *iov) +static void afs_notify_end_request_tx(struct sock *sock, + struct rxrpc_call *rxcall, + unsigned long call_user_ID) { - struct page *pages[8]; - unsigned count, n, loop, offset, to; - pgoff_t first = call->first, last = call->last; - int ret; - - _enter(""); + struct afs_call *call = (struct afs_call *)call_user_ID; - offset = call->first_offset; - call->first_offset = 0; - - do { - _debug("attach %lx-%lx", first, last); - - count = last - first + 1; - if (count > ARRAY_SIZE(pages)) - count = ARRAY_SIZE(pages); - n = find_get_pages_contig(call->mapping, first, count, pages); - ASSERTCMP(n, ==, count); - - loop = 0; - do { - msg->msg_flags = 0; - to = PAGE_SIZE; - if (first + loop >= last) - to = call->last_to; - else - msg->msg_flags = MSG_MORE; - iov->iov_base = kmap(pages[loop]) + offset; - iov->iov_len = to - offset; - offset = 0; - - _debug("- range %u-%u%s", - offset, to, msg->msg_flags ? " [more]" : ""); - msg->msg_iov = (struct iovec *) iov; - msg->msg_iovlen = 1; - - /* have to change the state *before* sending the last - * packet as RxRPC might give us the reply before it - * returns from sending the request */ - if (first + loop >= last) - call->state = AFS_CALL_AWAIT_REPLY; - ret = rxrpc_kernel_send_data(call->rxcall, msg, - to - offset); - kunmap(pages[loop]); - if (ret < 0) - break; - } while (++loop < count); - first += count; - - for (loop = 0; loop < count; loop++) - put_page(pages[loop]); - if (ret < 0) - break; - } while (first <= last); - - _leave(" = %d", ret); - return ret; + afs_set_call_state(call, AFS_CALL_CL_REQUESTING, AFS_CALL_CL_AWAIT_REPLY); } /* - * initiate a call + * Initiate a call and synchronously queue up the parameters for dispatch. Any + * error is stored into the call struct, which the caller must check for. */ -int afs_make_call(struct in_addr *addr, struct afs_call *call, gfp_t gfp, - const struct afs_wait_mode *wait_mode) +void afs_make_call(struct afs_call *call, gfp_t gfp) { - struct sockaddr_rxrpc srx; struct rxrpc_call *rxcall; struct msghdr msg; struct kvec iov[1]; + size_t len; + s64 tx_total_len; int ret; - struct sk_buff *skb; - _enter("%x,{%d},", addr->s_addr, ntohs(call->port)); + _enter(",{%pISp+%u},", rxrpc_kernel_remote_addr(call->peer), call->service_id); ASSERT(call->type != NULL); ASSERT(call->type->name != NULL); _debug("____MAKE %p{%s,%x} [%d]____", call, call->type->name, key_serial(call->key), - atomic_read(&afs_outstanding_calls)); - - call->wait_mode = wait_mode; - INIT_WORK(&call->async_work, afs_process_async_call); - - memset(&srx, 0, sizeof(srx)); - srx.srx_family = AF_RXRPC; - srx.srx_service = call->service_id; - srx.transport_type = SOCK_DGRAM; - srx.transport_len = sizeof(srx.transport.sin); - srx.transport.sin.sin_family = AF_INET; - srx.transport.sin.sin_port = call->port; - memcpy(&srx.transport.sin.sin_addr, addr, 4); + atomic_read(&call->net->nr_outstanding_calls)); + + trace_afs_make_call(call); + + /* Work out the length we're going to transmit. This is awkward for + * calls such as FS.StoreData where there's an extra injection of data + * after the initial fixed part. + */ + tx_total_len = call->request_size; + if (call->write_iter) + tx_total_len += iov_iter_count(call->write_iter); + + /* If the call is going to be asynchronous, we need an extra ref for + * the call to hold itself so the caller need not hang on to its ref. + */ + if (call->async) { + afs_get_call(call, afs_call_trace_get); + call->drop_ref = true; + } /* create a call */ - rxcall = rxrpc_kernel_begin_call(afs_socket, &srx, call->key, - (unsigned long) call, gfp); - call->key = NULL; + rxcall = rxrpc_kernel_begin_call(call->net->socket, call->peer, call->key, + (unsigned long)call, + tx_total_len, + call->max_lifespan, + gfp, + (call->async ? + afs_wake_up_async_call : + afs_wake_up_call_waiter), + call->service_id, + call->upgrade, + (call->intr ? RXRPC_PREINTERRUPTIBLE : + RXRPC_UNINTERRUPTIBLE), + call->debug_id); if (IS_ERR(rxcall)) { ret = PTR_ERR(rxcall); + call->error = ret; goto error_kill_call; } call->rxcall = rxcall; + call->issue_time = ktime_get_real(); /* send the request */ iov[0].iov_base = call->request; @@ -354,402 +399,448 @@ int afs_make_call(struct in_addr *addr, struct afs_call *call, gfp_t gfp, msg.msg_name = NULL; msg.msg_namelen = 0; - msg.msg_iov = (struct iovec *) iov; - msg.msg_iovlen = 1; + iov_iter_kvec(&msg.msg_iter, ITER_SOURCE, iov, 1, call->request_size); msg.msg_control = NULL; msg.msg_controllen = 0; - msg.msg_flags = (call->send_pages ? MSG_MORE : 0); - - /* have to change the state *before* sending the last packet as RxRPC - * might give us the reply before it returns from sending the - * request */ - if (!call->send_pages) - call->state = AFS_CALL_AWAIT_REPLY; - ret = rxrpc_kernel_send_data(rxcall, &msg, call->request_size); + msg.msg_flags = MSG_WAITALL | (call->write_iter ? MSG_MORE : 0); + + ret = rxrpc_kernel_send_data(call->net->socket, rxcall, + &msg, call->request_size, + afs_notify_end_request_tx); if (ret < 0) goto error_do_abort; - if (call->send_pages) { - ret = afs_send_pages(call, &msg, iov); + if (call->write_iter) { + msg.msg_iter = *call->write_iter; + msg.msg_flags &= ~MSG_MORE; + trace_afs_send_data(call, &msg); + + ret = rxrpc_kernel_send_data(call->net->socket, + call->rxcall, &msg, + iov_iter_count(&msg.msg_iter), + afs_notify_end_request_tx); + *call->write_iter = msg.msg_iter; + + trace_afs_sent_data(call, &msg, ret); if (ret < 0) goto error_do_abort; } - /* at this point, an async call may no longer exist as it may have - * already completed */ - return wait_mode->wait(call); + /* Note that at this point, we may have received the reply or an abort + * - and an asynchronous call may already have completed. + * + * afs_wait_for_call_to_complete(call) + * must be called to synchronously clean up. + */ + return; error_do_abort: - rxrpc_kernel_abort_call(rxcall, RX_USER_ABORT); - while ((skb = skb_dequeue(&call->rx_queue))) - afs_free_skb(skb); - rxrpc_kernel_end_call(rxcall); - call->rxcall = NULL; + if (ret != -ECONNABORTED) + rxrpc_kernel_abort_call(call->net->socket, rxcall, + RX_USER_ABORT, ret, + afs_abort_send_data_error); + if (call->async) { + afs_see_call(call, afs_call_trace_async_abort); + return; + } + + if (ret == -ECONNABORTED) { + len = 0; + iov_iter_kvec(&msg.msg_iter, ITER_DEST, NULL, 0, 0); + rxrpc_kernel_recv_data(call->net->socket, rxcall, + &msg.msg_iter, &len, false, + &call->abort_code, &call->service_id); + call->responded = true; + } + call->error = ret; + trace_afs_call_done(call); error_kill_call: - call->type->destructor(call); - afs_free_call(call); + if (call->async) + afs_see_call(call, afs_call_trace_async_kill); + if (call->type->immediate_cancel) + call->type->immediate_cancel(call); + + /* We need to dispose of the extra ref we grabbed for an async call. + * The call, however, might be queued on afs_async_calls and we need to + * make sure we don't get any more notifications that might requeue it. + */ + if (call->rxcall) + rxrpc_kernel_shutdown_call(call->net->socket, call->rxcall); + if (call->async) { + if (cancel_work_sync(&call->async_work)) + afs_put_call(call); + afs_set_call_complete(call, ret, 0); + } + + call->error = ret; + call->state = AFS_CALL_COMPLETE; _leave(" = %d", ret); - return ret; } /* - * handles intercepted messages that were arriving in the socket's Rx queue - * - called with the socket receive queue lock held to ensure message ordering - * - called with softirqs disabled + * Log remote abort codes that indicate that we have a protocol disagreement + * with the server. */ -static void afs_rx_interceptor(struct sock *sk, unsigned long user_call_ID, - struct sk_buff *skb) +static void afs_log_error(struct afs_call *call, s32 remote_abort) { - struct afs_call *call = (struct afs_call *) user_call_ID; - - _enter("%p,,%u", call, skb->mark); - - _debug("ICPT %p{%u} [%d]", - skb, skb->mark, atomic_read(&afs_outstanding_skbs)); - - ASSERTCMP(sk, ==, afs_socket->sk); - atomic_inc(&afs_outstanding_skbs); - - if (!call) { - /* its an incoming call for our callback service */ - skb_queue_tail(&afs_incoming_calls, skb); - queue_work(afs_wq, &afs_collect_incoming_call_work); - } else { - /* route the messages directly to the appropriate call */ - skb_queue_tail(&call->rx_queue, skb); - call->wait_mode->rx_wakeup(call); + static int max = 0; + const char *msg; + int m; + + switch (remote_abort) { + case RX_EOF: msg = "unexpected EOF"; break; + case RXGEN_CC_MARSHAL: msg = "client marshalling"; break; + case RXGEN_CC_UNMARSHAL: msg = "client unmarshalling"; break; + case RXGEN_SS_MARSHAL: msg = "server marshalling"; break; + case RXGEN_SS_UNMARSHAL: msg = "server unmarshalling"; break; + case RXGEN_DECODE: msg = "opcode decode"; break; + case RXGEN_SS_XDRFREE: msg = "server XDR cleanup"; break; + case RXGEN_CC_XDRFREE: msg = "client XDR cleanup"; break; + case -32: msg = "insufficient data"; break; + default: + return; } - _leave(""); + m = max; + if (m < 3) { + max = m + 1; + pr_notice("kAFS: Peer reported %s failure on %s [%pISp]\n", + msg, call->type->name, + rxrpc_kernel_remote_addr(call->peer)); + } } /* * deliver messages to a call */ -static void afs_deliver_to_call(struct afs_call *call) +void afs_deliver_to_call(struct afs_call *call) { - struct sk_buff *skb; - bool last; - u32 abort_code; + enum afs_call_state state; + size_t len; + u32 abort_code, remote_abort = 0; int ret; - _enter(""); - - while ((call->state == AFS_CALL_AWAIT_REPLY || - call->state == AFS_CALL_AWAIT_OP_ID || - call->state == AFS_CALL_AWAIT_REQUEST || - call->state == AFS_CALL_AWAIT_ACK) && - (skb = skb_dequeue(&call->rx_queue))) { - switch (skb->mark) { - case RXRPC_SKB_MARK_DATA: - _debug("Rcv DATA"); - last = rxrpc_kernel_is_data_last(skb); - ret = call->type->deliver(call, skb, last); - switch (ret) { - case 0: - if (last && - call->state == AFS_CALL_AWAIT_REPLY) - call->state = AFS_CALL_COMPLETE; - break; - case -ENOTCONN: - abort_code = RX_CALL_DEAD; - goto do_abort; - case -ENOTSUPP: - abort_code = RX_INVALID_OPERATION; - goto do_abort; - default: - abort_code = RXGEN_CC_UNMARSHAL; - if (call->state != AFS_CALL_AWAIT_REPLY) - abort_code = RXGEN_SS_UNMARSHAL; - do_abort: - rxrpc_kernel_abort_call(call->rxcall, - abort_code); - call->error = ret; - call->state = AFS_CALL_ERROR; - break; + _enter("%s", call->type->name); + + while (state = READ_ONCE(call->state), + state == AFS_CALL_CL_AWAIT_REPLY || + state == AFS_CALL_SV_AWAIT_OP_ID || + state == AFS_CALL_SV_AWAIT_REQUEST || + state == AFS_CALL_SV_AWAIT_ACK + ) { + if (state == AFS_CALL_SV_AWAIT_ACK) { + len = 0; + iov_iter_kvec(&call->def_iter, ITER_DEST, NULL, 0, 0); + ret = rxrpc_kernel_recv_data(call->net->socket, + call->rxcall, &call->def_iter, + &len, false, &remote_abort, + &call->service_id); + trace_afs_receive_data(call, &call->def_iter, false, ret); + + if (ret == -EINPROGRESS || ret == -EAGAIN) + return; + if (ret < 0 || ret == 1) { + if (ret == 1) + ret = 0; + goto call_complete; } - afs_data_delivered(skb); - skb = NULL; - continue; - case RXRPC_SKB_MARK_FINAL_ACK: - _debug("Rcv ACK"); - call->state = AFS_CALL_COMPLETE; - break; - case RXRPC_SKB_MARK_BUSY: - _debug("Rcv BUSY"); - call->error = -EBUSY; - call->state = AFS_CALL_BUSY; - break; - case RXRPC_SKB_MARK_REMOTE_ABORT: - abort_code = rxrpc_kernel_get_abort_code(skb); - call->error = call->type->abort_to_error(abort_code); - call->state = AFS_CALL_ABORTED; - _debug("Rcv ABORT %u -> %d", abort_code, call->error); - break; - case RXRPC_SKB_MARK_NET_ERROR: - call->error = -rxrpc_kernel_get_error_number(skb); - call->state = AFS_CALL_ERROR; - _debug("Rcv NET ERROR %d", call->error); - break; - case RXRPC_SKB_MARK_LOCAL_ERROR: - call->error = -rxrpc_kernel_get_error_number(skb); - call->state = AFS_CALL_ERROR; - _debug("Rcv LOCAL ERROR %d", call->error); - break; - default: - BUG(); - break; + return; } - afs_free_skb(skb); - } - - /* make sure the queue is empty if the call is done with (we might have - * aborted the call early because of an unmarshalling error) */ - if (call->state >= AFS_CALL_COMPLETE) { - while ((skb = skb_dequeue(&call->rx_queue))) - afs_free_skb(skb); - if (call->incoming) { - rxrpc_kernel_end_call(call->rxcall); - call->rxcall = NULL; - call->type->destructor(call); - afs_free_call(call); + ret = call->type->deliver(call); + state = READ_ONCE(call->state); + if (ret == 0 && call->unmarshalling_error) + ret = -EBADMSG; + switch (ret) { + case 0: + call->responded = true; + afs_queue_call_work(call); + if (state == AFS_CALL_CL_PROC_REPLY) { + if (call->op) + set_bit(AFS_SERVER_FL_MAY_HAVE_CB, + &call->op->server->flags); + goto call_complete; + } + ASSERTCMP(state, >, AFS_CALL_CL_PROC_REPLY); + goto done; + case -EINPROGRESS: + case -EAGAIN: + goto out; + case -ECONNABORTED: + ASSERTCMP(state, ==, AFS_CALL_COMPLETE); + call->responded = true; + afs_log_error(call, call->abort_code); + goto done; + case -ENOTSUPP: + call->responded = true; + abort_code = RXGEN_OPCODE; + rxrpc_kernel_abort_call(call->net->socket, call->rxcall, + abort_code, ret, + afs_abort_op_not_supported); + goto local_abort; + case -EIO: + pr_err("kAFS: Call %u in bad state %u\n", + call->debug_id, state); + fallthrough; + case -ENODATA: + case -EBADMSG: + case -EMSGSIZE: + case -ENOMEM: + case -EFAULT: + abort_code = RXGEN_CC_UNMARSHAL; + if (state != AFS_CALL_CL_AWAIT_REPLY) + abort_code = RXGEN_SS_UNMARSHAL; + rxrpc_kernel_abort_call(call->net->socket, call->rxcall, + abort_code, ret, + afs_abort_unmarshal_error); + goto local_abort; + default: + abort_code = RX_CALL_DEAD; + rxrpc_kernel_abort_call(call->net->socket, call->rxcall, + abort_code, ret, + afs_abort_general_error); + goto local_abort; } } +done: + if (call->type->done) + call->type->done(call); +out: _leave(""); + return; + +local_abort: + abort_code = 0; +call_complete: + afs_set_call_complete(call, ret, remote_abort); + goto done; } /* - * wait synchronously for a call to complete + * Wait synchronously for a call to complete. */ -static int afs_wait_for_call_to_complete(struct afs_call *call) +void afs_wait_for_call_to_complete(struct afs_call *call) { - struct sk_buff *skb; - int ret; - - DECLARE_WAITQUEUE(myself, current); + bool rxrpc_complete = false; _enter(""); - add_wait_queue(&call->waitq, &myself); - for (;;) { - set_current_state(TASK_INTERRUPTIBLE); + if (!afs_check_call_state(call, AFS_CALL_COMPLETE)) { + DECLARE_WAITQUEUE(myself, current); - /* deliver any messages that are in the queue */ - if (!skb_queue_empty(&call->rx_queue)) { - __set_current_state(TASK_RUNNING); - afs_deliver_to_call(call); - continue; - } + add_wait_queue(&call->waitq, &myself); + for (;;) { + set_current_state(TASK_UNINTERRUPTIBLE); - ret = call->error; - if (call->state >= AFS_CALL_COMPLETE) - break; - ret = -EINTR; - if (signal_pending(current)) - break; - schedule(); - } + /* deliver any messages that are in the queue */ + if (!afs_check_call_state(call, AFS_CALL_COMPLETE) && + call->need_attention) { + call->need_attention = false; + __set_current_state(TASK_RUNNING); + afs_deliver_to_call(call); + continue; + } + + if (afs_check_call_state(call, AFS_CALL_COMPLETE)) + break; - remove_wait_queue(&call->waitq, &myself); - __set_current_state(TASK_RUNNING); + if (!rxrpc_kernel_check_life(call->net->socket, call->rxcall)) { + /* rxrpc terminated the call. */ + rxrpc_complete = true; + break; + } + + schedule(); + } - /* kill the call */ - if (call->state < AFS_CALL_COMPLETE) { - _debug("call incomplete"); - rxrpc_kernel_abort_call(call->rxcall, RX_CALL_DEAD); - while ((skb = skb_dequeue(&call->rx_queue))) - afs_free_skb(skb); + remove_wait_queue(&call->waitq, &myself); + __set_current_state(TASK_RUNNING); } - _debug("call complete"); - rxrpc_kernel_end_call(call->rxcall); - call->rxcall = NULL; - call->type->destructor(call); - afs_free_call(call); - _leave(" = %d", ret); - return ret; + if (!afs_check_call_state(call, AFS_CALL_COMPLETE)) { + if (rxrpc_complete) { + afs_set_call_complete(call, call->error, call->abort_code); + } else { + /* Kill off the call if it's still live. */ + _debug("call interrupted"); + if (rxrpc_kernel_abort_call(call->net->socket, call->rxcall, + RX_USER_ABORT, -EINTR, + afs_abort_interrupted)) + afs_set_call_complete(call, -EINTR, 0); + } + } } /* * wake up a waiting call */ -static void afs_wake_up_call_waiter(struct afs_call *call) -{ - wake_up(&call->waitq); -} - -/* - * wake up an asynchronous call - */ -static void afs_wake_up_async_call(struct afs_call *call) +static void afs_wake_up_call_waiter(struct sock *sk, struct rxrpc_call *rxcall, + unsigned long call_user_ID) { - _enter(""); - queue_work(afs_async_calls, &call->async_work); -} + struct afs_call *call = (struct afs_call *)call_user_ID; -/* - * put a call into asynchronous mode - * - mustn't touch the call descriptor as the call my have completed by the - * time we get here - */ -static int afs_dont_wait_for_call_to_complete(struct afs_call *call) -{ - _enter(""); - return -EINPROGRESS; + call->need_attention = true; + wake_up(&call->waitq); } /* - * delete an asynchronous call + * Wake up an asynchronous call. The caller is holding the call notify + * spinlock around this, so we can't call afs_put_call(). */ -static void afs_delete_async_call(struct work_struct *work) +static void afs_wake_up_async_call(struct sock *sk, struct rxrpc_call *rxcall, + unsigned long call_user_ID) { - struct afs_call *call = - container_of(work, struct afs_call, async_work); + struct afs_call *call = (struct afs_call *)call_user_ID; + int r; - _enter(""); + trace_afs_notify_call(rxcall, call); + call->need_attention = true; - afs_free_call(call); + if (__refcount_inc_not_zero(&call->ref, &r)) { + trace_afs_call(call->debug_id, afs_call_trace_wake, r + 1, + atomic_read(&call->net->nr_outstanding_calls), + __builtin_return_address(0)); - _leave(""); + if (!queue_work(afs_async_calls, &call->async_work)) + afs_deferred_put_call(call); + } } /* - * perform processing on an asynchronous call - * - on a multiple-thread workqueue this work item may try to run on several - * CPUs at the same time + * Perform I/O processing on an asynchronous call. The work item carries a ref + * to the call struct that we either need to release or to pass on. */ static void afs_process_async_call(struct work_struct *work) { - struct afs_call *call = - container_of(work, struct afs_call, async_work); + struct afs_call *call = container_of(work, struct afs_call, async_work); _enter(""); - if (!skb_queue_empty(&call->rx_queue)) + if (call->state < AFS_CALL_COMPLETE && call->need_attention) { + call->need_attention = false; afs_deliver_to_call(call); - - if (call->state >= AFS_CALL_COMPLETE && call->wait_mode) { - if (call->wait_mode->async_complete) - call->wait_mode->async_complete(call->reply, - call->error); - call->reply = NULL; - - /* kill the call */ - rxrpc_kernel_end_call(call->rxcall); - call->rxcall = NULL; - if (call->type->destructor) - call->type->destructor(call); - - /* we can't just delete the call because the work item may be - * queued */ - PREPARE_WORK(&call->async_work, afs_delete_async_call); - queue_work(afs_async_calls, &call->async_work); } + afs_put_call(call); _leave(""); } -/* - * empty a socket buffer into a flat reply buffer - */ -void afs_transfer_reply(struct afs_call *call, struct sk_buff *skb) +static void afs_rx_attach(struct rxrpc_call *rxcall, unsigned long user_call_ID) { - size_t len = skb->len; + struct afs_call *call = (struct afs_call *)user_call_ID; - if (skb_copy_bits(skb, 0, call->buffer + call->reply_size, len) < 0) - BUG(); - call->reply_size += len; + call->rxcall = rxcall; } /* - * accept the backlog of incoming calls + * Charge the incoming call preallocation. */ -static void afs_collect_incoming_call(struct work_struct *work) +void afs_charge_preallocation(struct work_struct *work) { - struct rxrpc_call *rxcall; - struct afs_call *call = NULL; - struct sk_buff *skb; - - while ((skb = skb_dequeue(&afs_incoming_calls))) { - _debug("new call"); - - /* don't need the notification */ - afs_free_skb(skb); + struct afs_net *net = + container_of(work, struct afs_net, charge_preallocation_work); + struct afs_call *call = net->spare_incoming_call; + for (;;) { if (!call) { - call = kzalloc(sizeof(struct afs_call), GFP_KERNEL); - if (!call) { - rxrpc_kernel_reject_call(afs_socket); - return; - } + call = afs_alloc_call(net, &afs_RXCMxxxx, GFP_KERNEL); + if (!call) + break; - INIT_WORK(&call->async_work, afs_process_async_call); - call->wait_mode = &afs_async_incoming_call; - call->type = &afs_RXCMxxxx; + call->drop_ref = true; + call->async = true; + call->state = AFS_CALL_SV_AWAIT_OP_ID; init_waitqueue_head(&call->waitq); - skb_queue_head_init(&call->rx_queue); - call->state = AFS_CALL_AWAIT_OP_ID; - - _debug("CALL %p{%s} [%d]", - call, call->type->name, - atomic_read(&afs_outstanding_calls)); - atomic_inc(&afs_outstanding_calls); + afs_extract_to_tmp(call); } - rxcall = rxrpc_kernel_accept_call(afs_socket, - (unsigned long) call); - if (!IS_ERR(rxcall)) { - call->rxcall = rxcall; - call = NULL; - } + if (rxrpc_kernel_charge_accept(net->socket, + afs_wake_up_async_call, + (unsigned long)call, + GFP_KERNEL, + call->debug_id) < 0) + break; + call = NULL; } + net->spare_incoming_call = call; +} - if (call) - afs_free_call(call); +/* + * Discard a preallocated call when a socket is shut down. + */ +static void afs_rx_discard_new_call(struct rxrpc_call *rxcall, + unsigned long user_call_ID) +{ + struct afs_call *call = (struct afs_call *)user_call_ID; + + call->rxcall = NULL; + afs_put_call(call); } /* - * grab the operation ID from an incoming cache manager call + * Notification of an incoming call. */ -static int afs_deliver_cm_op_id(struct afs_call *call, struct sk_buff *skb, - bool last) +static void afs_rx_new_call(struct sock *sk, struct rxrpc_call *rxcall, + unsigned long user_call_ID) { - size_t len = skb->len; - void *oibuf = (void *) &call->operation_ID; + struct afs_call *call = (struct afs_call *)user_call_ID; + struct afs_net *net = afs_sock2net(sk); - _enter("{%u},{%zu},%d", call->offset, len, last); + call->peer = rxrpc_kernel_get_call_peer(sk->sk_socket, call->rxcall); + call->server = afs_find_server(call->peer); + if (!call->server) + trace_afs_cm_no_server(call, rxrpc_kernel_remote_srx(call->peer)); + + queue_work(afs_wq, &net->charge_preallocation_work); +} - ASSERTCMP(call->offset, <, 4); +/* + * Grab the operation ID from an incoming cache manager call. The socket + * buffer is discarded on error or if we don't yet have sufficient data. + */ +static int afs_deliver_cm_op_id(struct afs_call *call) +{ + int ret; + + _enter("{%zu}", iov_iter_count(call->iter)); /* the operation ID forms the first four bytes of the request data */ - len = min_t(size_t, len, 4 - call->offset); - if (skb_copy_bits(skb, 0, oibuf + call->offset, len) < 0) - BUG(); - if (!pskb_pull(skb, len)) - BUG(); - call->offset += len; - - if (call->offset < 4) { - if (last) { - _leave(" = -EBADMSG [op ID short]"); - return -EBADMSG; - } - _leave(" = 0 [incomplete]"); - return 0; - } + ret = afs_extract_data(call, true); + if (ret < 0) + return ret; - call->state = AFS_CALL_AWAIT_REQUEST; + call->operation_ID = ntohl(call->tmp); + afs_set_call_state(call, AFS_CALL_SV_AWAIT_OP_ID, AFS_CALL_SV_AWAIT_REQUEST); /* ask the cache manager to route the call (it'll change the call type * if successful) */ if (!afs_cm_incoming_call(call)) return -ENOTSUPP; - /* pass responsibility for the remainer of this message off to the + call->security_ix = rxrpc_kernel_query_call_security(call->rxcall, + &call->service_id, + &call->enctype); + + trace_afs_cb_call(call); + call->work.func = call->type->work; + + /* pass responsibility for the remainder of this message off to the * cache manager op */ - return call->type->deliver(call, skb, last); + return call->type->deliver(call); +} + +/* + * Advance the AFS call state when an RxRPC service call ends the transmit + * phase. + */ +static void afs_notify_end_reply_tx(struct sock *sock, + struct rxrpc_call *rxcall, + unsigned long call_user_ID) +{ + struct afs_call *call = (struct afs_call *)call_user_ID; + + afs_set_call_state(call, AFS_CALL_SV_REPLYING, AFS_CALL_SV_AWAIT_ACK); } /* @@ -757,35 +848,33 @@ static int afs_deliver_cm_op_id(struct afs_call *call, struct sk_buff *skb, */ void afs_send_empty_reply(struct afs_call *call) { + struct afs_net *net = call->net; struct msghdr msg; - struct iovec iov[1]; _enter(""); - iov[0].iov_base = NULL; - iov[0].iov_len = 0; + rxrpc_kernel_set_tx_length(net->socket, call->rxcall, 0); + msg.msg_name = NULL; msg.msg_namelen = 0; - msg.msg_iov = iov; - msg.msg_iovlen = 0; + iov_iter_kvec(&msg.msg_iter, ITER_SOURCE, NULL, 0, 0); msg.msg_control = NULL; msg.msg_controllen = 0; msg.msg_flags = 0; - call->state = AFS_CALL_AWAIT_ACK; - switch (rxrpc_kernel_send_data(call->rxcall, &msg, 0)) { + switch (rxrpc_kernel_send_data(net->socket, call->rxcall, &msg, 0, + afs_notify_end_reply_tx)) { case 0: _leave(" [replied]"); return; case -ENOMEM: _debug("oom"); - rxrpc_kernel_abort_call(call->rxcall, RX_USER_ABORT); + rxrpc_kernel_abort_call(net->socket, call->rxcall, + RXGEN_SS_MARSHAL, -ENOMEM, + afs_abort_oom); + fallthrough; default: - rxrpc_kernel_end_call(call->rxcall); - call->rxcall = NULL; - call->type->destructor(call); - afs_free_call(call); _leave(" [error]"); return; } @@ -796,64 +885,102 @@ void afs_send_empty_reply(struct afs_call *call) */ void afs_send_simple_reply(struct afs_call *call, const void *buf, size_t len) { + struct afs_net *net = call->net; struct msghdr msg; - struct iovec iov[1]; + struct kvec iov[1]; int n; _enter(""); + rxrpc_kernel_set_tx_length(net->socket, call->rxcall, len); + iov[0].iov_base = (void *) buf; iov[0].iov_len = len; msg.msg_name = NULL; msg.msg_namelen = 0; - msg.msg_iov = iov; - msg.msg_iovlen = 1; + iov_iter_kvec(&msg.msg_iter, ITER_SOURCE, iov, 1, len); msg.msg_control = NULL; msg.msg_controllen = 0; msg.msg_flags = 0; - call->state = AFS_CALL_AWAIT_ACK; - n = rxrpc_kernel_send_data(call->rxcall, &msg, len); + n = rxrpc_kernel_send_data(net->socket, call->rxcall, &msg, len, + afs_notify_end_reply_tx); if (n >= 0) { + /* Success */ _leave(" [replied]"); return; } + if (n == -ENOMEM) { _debug("oom"); - rxrpc_kernel_abort_call(call->rxcall, RX_USER_ABORT); + rxrpc_kernel_abort_call(net->socket, call->rxcall, + RXGEN_SS_MARSHAL, -ENOMEM, + afs_abort_oom); } - rxrpc_kernel_end_call(call->rxcall); - call->rxcall = NULL; - call->type->destructor(call); - afs_free_call(call); _leave(" [error]"); } /* - * extract a piece of data from the received data socket buffers + * Extract a piece of data from the received data socket buffers. */ -int afs_extract_data(struct afs_call *call, struct sk_buff *skb, - bool last, void *buf, size_t count) +int afs_extract_data(struct afs_call *call, bool want_more) { - size_t len = skb->len; - - _enter("{%u},{%zu},%d,,%zu", call->offset, len, last, count); + struct afs_net *net = call->net; + struct iov_iter *iter = call->iter; + enum afs_call_state state; + u32 remote_abort = 0; + int ret; - ASSERTCMP(call->offset, <, count); + _enter("{%s,%zu,%zu},%d", + call->type->name, call->iov_len, iov_iter_count(iter), want_more); - len = min_t(size_t, len, count - call->offset); - if (skb_copy_bits(skb, 0, buf + call->offset, len) < 0 || - !pskb_pull(skb, len)) - BUG(); - call->offset += len; + ret = rxrpc_kernel_recv_data(net->socket, call->rxcall, iter, + &call->iov_len, want_more, &remote_abort, + &call->service_id); + trace_afs_receive_data(call, call->iter, want_more, ret); + if (ret == 0 || ret == -EAGAIN) + return ret; - if (call->offset < count) { - if (last) { - _leave(" = -EBADMSG [%d < %zu]", call->offset, count); - return -EBADMSG; + state = READ_ONCE(call->state); + if (ret == 1) { + switch (state) { + case AFS_CALL_CL_AWAIT_REPLY: + afs_set_call_state(call, state, AFS_CALL_CL_PROC_REPLY); + break; + case AFS_CALL_SV_AWAIT_REQUEST: + afs_set_call_state(call, state, AFS_CALL_SV_REPLYING); + break; + case AFS_CALL_COMPLETE: + kdebug("prem complete %d", call->error); + return afs_io_error(call, afs_io_error_extract); + default: + break; } - _leave(" = -EAGAIN"); - return -EAGAIN; + return 0; } - return 0; + + afs_set_call_complete(call, ret, remote_abort); + return ret; +} + +/* + * Log protocol error production. + */ +noinline int afs_protocol_error(struct afs_call *call, + enum afs_eproto_cause cause) +{ + trace_afs_protocol_error(call, cause); + if (call) + call->unmarshalling_error = true; + return -EBADMSG; +} + +/* + * Wake up OOB notification processing. + */ +static void afs_rx_notify_oob(struct sock *sk, struct sk_buff *oob) +{ + struct afs_net *net = sk->sk_user_data; + + schedule_work(&net->rx_oob_work); } diff --git a/fs/afs/security.c b/fs/afs/security.c index 8d010422dc89..55ddce94af03 100644 --- a/fs/afs/security.c +++ b/fs/afs/security.c @@ -1,12 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* AFS security handling * - * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Copyright (C) 2007, 2017 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. */ #include <linux/init.h> @@ -14,27 +10,63 @@ #include <linux/fs.h> #include <linux/ctype.h> #include <linux/sched.h> +#include <linux/hashtable.h> #include <keys/rxrpc-type.h> #include "internal.h" +static DEFINE_HASHTABLE(afs_permits_cache, 10); +static DEFINE_SPINLOCK(afs_permits_lock); +static DEFINE_MUTEX(afs_key_lock); + +/* + * Allocate a key to use as a placeholder for anonymous user security. + */ +static int afs_alloc_anon_key(struct afs_cell *cell) +{ + struct key *key; + + mutex_lock(&afs_key_lock); + key = cell->anonymous_key; + if (!key) { + key = rxrpc_get_null_key(cell->key_desc); + if (!IS_ERR(key)) + cell->anonymous_key = key; + } + mutex_unlock(&afs_key_lock); + + if (IS_ERR(key)) + return PTR_ERR(key); + + _debug("anon key %p{%x}", + cell->anonymous_key, key_serial(cell->anonymous_key)); + return 0; +} + /* * get a key */ struct key *afs_request_key(struct afs_cell *cell) { struct key *key; + int ret; - _enter("{%x}", key_serial(cell->anonymous_key)); + _enter("{%s}", cell->key_desc); - _debug("key %s", cell->anonymous_key->description); - key = request_key(&key_type_rxrpc, cell->anonymous_key->description, - NULL); + _debug("key %s", cell->key_desc); + key = request_key_net(&key_type_rxrpc, cell->key_desc, + cell->net->net, NULL); if (IS_ERR(key)) { if (PTR_ERR(key) != -ENOKEY) { _leave(" = %ld", PTR_ERR(key)); return key; } + if (!cell->anonymous_key) { + ret = afs_alloc_anon_key(cell); + if (ret < 0) + return ERR_PTR(ret); + } + /* act as anonymous user */ _leave(" = {%x} [anon]", key_serial(cell->anonymous_key)); return key_get(cell->anonymous_key); @@ -46,167 +78,293 @@ struct key *afs_request_key(struct afs_cell *cell) } /* - * dispose of a permits list + * Get a key when pathwalk is in rcuwalk mode. */ -void afs_zap_permits(struct rcu_head *rcu) +struct key *afs_request_key_rcu(struct afs_cell *cell) { - struct afs_permits *permits = - container_of(rcu, struct afs_permits, rcu); - int loop; + struct key *key; - _enter("{%d}", permits->count); + _enter("{%s}", cell->key_desc); - for (loop = permits->count - 1; loop >= 0; loop--) - key_put(permits->permits[loop].key); - kfree(permits); + _debug("key %s", cell->key_desc); + key = request_key_net_rcu(&key_type_rxrpc, cell->key_desc, + cell->net->net); + if (IS_ERR(key)) { + if (PTR_ERR(key) != -ENOKEY) { + _leave(" = %ld", PTR_ERR(key)); + return key; + } + + /* act as anonymous user */ + if (!cell->anonymous_key) + return NULL; /* Need to allocate */ + _leave(" = {%x} [anon]", key_serial(cell->anonymous_key)); + return key_get(cell->anonymous_key); + } else { + /* act as authorised user */ + _leave(" = {%x} [auth]", key_serial(key)); + return key; + } } /* - * dispose of a permits list in which all the key pointers have been copied + * Dispose of a list of permits. */ -static void afs_dispose_of_permits(struct rcu_head *rcu) +static void afs_permits_rcu(struct rcu_head *rcu) { struct afs_permits *permits = container_of(rcu, struct afs_permits, rcu); + int i; - _enter("{%d}", permits->count); - + for (i = 0; i < permits->nr_permits; i++) + key_put(permits->permits[i].key); kfree(permits); } /* - * get the authorising vnode - this is the specified inode itself if it's a - * directory or it's the parent directory if the specified inode is a file or - * symlink - * - the caller must release the ref on the inode + * Discard a permission cache. */ -static struct afs_vnode *afs_get_auth_inode(struct afs_vnode *vnode, - struct key *key) +void afs_put_permits(struct afs_permits *permits) { - struct afs_vnode *auth_vnode; - struct inode *auth_inode; - - _enter(""); - - if (S_ISDIR(vnode->vfs_inode.i_mode)) { - auth_inode = igrab(&vnode->vfs_inode); - ASSERT(auth_inode != NULL); - } else { - auth_inode = afs_iget(vnode->vfs_inode.i_sb, key, - &vnode->status.parent, NULL, NULL); - if (IS_ERR(auth_inode)) - return ERR_CAST(auth_inode); + if (permits && refcount_dec_and_test(&permits->usage)) { + spin_lock(&afs_permits_lock); + hash_del_rcu(&permits->hash_node); + spin_unlock(&afs_permits_lock); + call_rcu(&permits->rcu, afs_permits_rcu); } - - auth_vnode = AFS_FS_I(auth_inode); - _leave(" = {%x}", auth_vnode->fid.vnode); - return auth_vnode; } /* - * clear the permit cache on a directory vnode + * Clear a permit cache on callback break. */ void afs_clear_permits(struct afs_vnode *vnode) { struct afs_permits *permits; - _enter("{%x:%u}", vnode->fid.vid, vnode->fid.vnode); + spin_lock(&vnode->lock); + permits = rcu_dereference_protected(vnode->permit_cache, + lockdep_is_held(&vnode->lock)); + RCU_INIT_POINTER(vnode->permit_cache, NULL); + spin_unlock(&vnode->lock); + + afs_put_permits(permits); +} + +/* + * Hash a list of permits. Use simple addition to make it easy to add an extra + * one at an as-yet indeterminate position in the list. + */ +static void afs_hash_permits(struct afs_permits *permits) +{ + unsigned long h = permits->nr_permits; + int i; - mutex_lock(&vnode->permits_lock); - permits = vnode->permits; - rcu_assign_pointer(vnode->permits, NULL); - mutex_unlock(&vnode->permits_lock); + for (i = 0; i < permits->nr_permits; i++) { + h += (unsigned long)permits->permits[i].key / sizeof(void *); + h += permits->permits[i].access; + } - if (permits) - call_rcu(&permits->rcu, afs_zap_permits); - _leave(""); + permits->h = h; } /* - * add the result obtained for a vnode to its or its parent directory's cache - * for the key used to access it + * Cache the CallerAccess result obtained from doing a fileserver operation + * that returned a vnode status for a particular key. If a callback break + * occurs whilst the operation was in progress then we have to ditch the cache + * as the ACL *may* have changed. */ -void afs_cache_permit(struct afs_vnode *vnode, struct key *key, long acl_order) +void afs_cache_permit(struct afs_vnode *vnode, struct key *key, + unsigned int cb_break, struct afs_status_cb *scb) { - struct afs_permits *permits, *xpermits; - struct afs_permit *permit; - struct afs_vnode *auth_vnode; - int count, loop; - - _enter("{%x:%u},%x,%lx", - vnode->fid.vid, vnode->fid.vnode, key_serial(key), acl_order); - - auth_vnode = afs_get_auth_inode(vnode, key); - if (IS_ERR(auth_vnode)) { - _leave(" [get error %ld]", PTR_ERR(auth_vnode)); - return; + struct afs_permits *permits, *xpermits, *replacement, *zap, *new = NULL; + afs_access_t caller_access = scb->status.caller_access; + size_t size = 0; + bool changed = false; + int i, j; + + _enter("{%llx:%llu},%x,%x", + vnode->fid.vid, vnode->fid.vnode, key_serial(key), caller_access); + + rcu_read_lock(); + + /* Check for the common case first: We got back the same access as last + * time we tried and already have it recorded. + */ + permits = rcu_dereference(vnode->permit_cache); + if (permits) { + if (!permits->invalidated) { + for (i = 0; i < permits->nr_permits; i++) { + if (permits->permits[i].key < key) + continue; + if (permits->permits[i].key > key) + break; + if (permits->permits[i].access != caller_access) { + changed = true; + break; + } + + if (afs_cb_is_broken(cb_break, vnode)) { + changed = true; + break; + } + + /* The cache is still good. */ + rcu_read_unlock(); + return; + } + } + + changed |= permits->invalidated; + size = permits->nr_permits; + + /* If this set of permits is now wrong, clear the permits + * pointer so that no one tries to use the stale information. + */ + if (changed) { + spin_lock(&vnode->lock); + if (permits != rcu_access_pointer(vnode->permit_cache)) + goto someone_else_changed_it_unlock; + RCU_INIT_POINTER(vnode->permit_cache, NULL); + spin_unlock(&vnode->lock); + + afs_put_permits(permits); + permits = NULL; + size = 0; + } } - mutex_lock(&auth_vnode->permits_lock); + if (afs_cb_is_broken(cb_break, vnode)) + goto someone_else_changed_it; + + /* We need a ref on any permits list we want to copy as we'll have to + * drop the lock to do memory allocation. + */ + if (permits && !refcount_inc_not_zero(&permits->usage)) + goto someone_else_changed_it; + + rcu_read_unlock(); + + /* Speculatively create a new list with the revised permission set. We + * discard this if we find an extant match already in the hash, but + * it's easier to compare with memcmp this way. + * + * We fill in the key pointers at this time, but we don't get the refs + * yet. + */ + size++; + new = kzalloc(struct_size(new, permits, size), GFP_NOFS); + if (!new) + goto out_put; + + refcount_set(&new->usage, 1); + new->nr_permits = size; + i = j = 0; + if (permits) { + for (i = 0; i < permits->nr_permits; i++) { + if (j == i && permits->permits[i].key > key) { + new->permits[j].key = key; + new->permits[j].access = caller_access; + j++; + } + new->permits[j].key = permits->permits[i].key; + new->permits[j].access = permits->permits[i].access; + j++; + } + } - /* guard against a rename being detected whilst we waited for the - * lock */ - if (memcmp(&auth_vnode->fid, &vnode->status.parent, - sizeof(struct afs_fid)) != 0) { - _debug("renamed"); - goto out_unlock; + if (j == i) { + new->permits[j].key = key; + new->permits[j].access = caller_access; } - /* have to be careful as the directory's callback may be broken between - * us receiving the status we're trying to cache and us getting the - * lock to update the cache for the status */ - if (auth_vnode->acl_order - acl_order > 0) { - _debug("ACL changed?"); - goto out_unlock; + afs_hash_permits(new); + + /* Now see if the permit list we want is actually already available */ + spin_lock(&afs_permits_lock); + + hash_for_each_possible(afs_permits_cache, xpermits, hash_node, new->h) { + if (xpermits->h != new->h || + xpermits->invalidated || + xpermits->nr_permits != new->nr_permits || + memcmp(xpermits->permits, new->permits, + new->nr_permits * sizeof(struct afs_permit)) != 0) + continue; + + if (refcount_inc_not_zero(&xpermits->usage)) { + replacement = xpermits; + goto found; + } + + break; } - /* always update the anonymous mask */ - _debug("anon access %x", vnode->status.anon_access); - auth_vnode->status.anon_access = vnode->status.anon_access; - if (key == vnode->volume->cell->anonymous_key) - goto out_unlock; - - xpermits = auth_vnode->permits; - count = 0; - if (xpermits) { - /* see if the permit is already in the list - * - if it is then we just amend the list - */ - count = xpermits->count; - permit = xpermits->permits; - for (loop = count; loop > 0; loop--) { - if (permit->key == key) { - permit->access_mask = - vnode->status.caller_access; - goto out_unlock; - } - permit++; + for (i = 0; i < new->nr_permits; i++) + key_get(new->permits[i].key); + hash_add_rcu(afs_permits_cache, &new->hash_node, new->h); + replacement = new; + new = NULL; + +found: + spin_unlock(&afs_permits_lock); + + kfree(new); + + rcu_read_lock(); + spin_lock(&vnode->lock); + zap = rcu_access_pointer(vnode->permit_cache); + if (!afs_cb_is_broken(cb_break, vnode) && zap == permits) + rcu_assign_pointer(vnode->permit_cache, replacement); + else + zap = replacement; + spin_unlock(&vnode->lock); + rcu_read_unlock(); + afs_put_permits(zap); +out_put: + afs_put_permits(permits); + return; + +someone_else_changed_it_unlock: + spin_unlock(&vnode->lock); +someone_else_changed_it: + /* Someone else changed the cache under us - don't recheck at this + * time. + */ + rcu_read_unlock(); + return; +} + +static bool afs_check_permit_rcu(struct afs_vnode *vnode, struct key *key, + afs_access_t *_access) +{ + const struct afs_permits *permits; + int i; + + _enter("{%llx:%llu},%x", + vnode->fid.vid, vnode->fid.vnode, key_serial(key)); + + /* check the permits to see if we've got one yet */ + if (key == vnode->volume->cell->anonymous_key) { + *_access = vnode->status.anon_access; + _leave(" = t [anon %x]", *_access); + return true; + } + + permits = rcu_dereference(vnode->permit_cache); + if (permits) { + for (i = 0; i < permits->nr_permits; i++) { + if (permits->permits[i].key < key) + continue; + if (permits->permits[i].key > key) + break; + + *_access = permits->permits[i].access; + _leave(" = %u [perm %x]", !permits->invalidated, *_access); + return !permits->invalidated; } } - permits = kmalloc(sizeof(*permits) + sizeof(*permit) * (count + 1), - GFP_NOFS); - if (!permits) - goto out_unlock; - - if (xpermits) - memcpy(permits->permits, xpermits->permits, - count * sizeof(struct afs_permit)); - - _debug("key %x access %x", - key_serial(key), vnode->status.caller_access); - permits->permits[count].access_mask = vnode->status.caller_access; - permits->permits[count].key = key_get(key); - permits->count = count + 1; - - rcu_assign_pointer(auth_vnode->permits, permits); - if (xpermits) - call_rcu(&xpermits->rcu, afs_dispose_of_permits); - -out_unlock: - mutex_unlock(&auth_vnode->permits_lock); - iput(&auth_vnode->vfs_inode); - _leave(""); + _leave(" = f"); + return false; } /* @@ -214,68 +372,53 @@ out_unlock: * permitted to be accessed with this authorisation, and if so, what access it * is granted */ -static int afs_check_permit(struct afs_vnode *vnode, struct key *key, - afs_access_t *_access) +int afs_check_permit(struct afs_vnode *vnode, struct key *key, + afs_access_t *_access) { struct afs_permits *permits; - struct afs_permit *permit; - struct afs_vnode *auth_vnode; - bool valid; - int loop, ret; + bool valid = false; + int i, ret; - _enter("{%x:%u},%x", + _enter("{%llx:%llu},%x", vnode->fid.vid, vnode->fid.vnode, key_serial(key)); - auth_vnode = afs_get_auth_inode(vnode, key); - if (IS_ERR(auth_vnode)) { - *_access = 0; - _leave(" = %ld", PTR_ERR(auth_vnode)); - return PTR_ERR(auth_vnode); - } - - ASSERT(S_ISDIR(auth_vnode->vfs_inode.i_mode)); - /* check the permits to see if we've got one yet */ - if (key == auth_vnode->volume->cell->anonymous_key) { + if (key == vnode->volume->cell->anonymous_key) { _debug("anon"); - *_access = auth_vnode->status.anon_access; + *_access = vnode->status.anon_access; valid = true; } else { - valid = false; rcu_read_lock(); - permits = rcu_dereference(auth_vnode->permits); + permits = rcu_dereference(vnode->permit_cache); if (permits) { - permit = permits->permits; - for (loop = permits->count; loop > 0; loop--) { - if (permit->key == key) { - _debug("found in cache"); - *_access = permit->access_mask; - valid = true; + for (i = 0; i < permits->nr_permits; i++) { + if (permits->permits[i].key < key) + continue; + if (permits->permits[i].key > key) break; - } - permit++; + + *_access = permits->permits[i].access; + valid = !permits->invalidated; + break; } } rcu_read_unlock(); } if (!valid) { - /* check the status on the file we're actually interested in - * (the post-processing will cache the result on auth_vnode) */ + /* Check the status on the file we're actually interested in + * (the post-processing will cache the result). + */ _debug("no valid permit"); - set_bit(AFS_VNODE_CB_BROKEN, &vnode->flags); - ret = afs_vnode_fetch_status(vnode, auth_vnode, key); + ret = afs_fetch_status(vnode, key, false, _access); if (ret < 0) { - iput(&auth_vnode->vfs_inode); *_access = 0; _leave(" = %d", ret); return ret; } - *_access = vnode->status.caller_access; } - iput(&auth_vnode->vfs_inode); _leave(" = 0 [access %x]", *_access); return 0; } @@ -285,72 +428,77 @@ static int afs_check_permit(struct afs_vnode *vnode, struct key *key, * - AFS ACLs are attached to directories only, and a file is controlled by its * parent directory's ACL */ -int afs_permission(struct inode *inode, int mask) +int afs_permission(struct mnt_idmap *idmap, struct inode *inode, + int mask) { struct afs_vnode *vnode = AFS_FS_I(inode); - afs_access_t uninitialized_var(access); + afs_access_t access; struct key *key; - int ret; + int ret = 0; - if (mask & MAY_NOT_BLOCK) - return -ECHILD; - - _enter("{{%x:%u},%lx},%x,", + _enter("{{%llx:%llu},%lx},%x,", vnode->fid.vid, vnode->fid.vnode, vnode->flags, mask); - key = afs_request_key(vnode->volume->cell); - if (IS_ERR(key)) { - _leave(" = %ld [key]", PTR_ERR(key)); - return PTR_ERR(key); - } + if (mask & MAY_NOT_BLOCK) { + key = afs_request_key_rcu(vnode->volume->cell); + if (IS_ERR_OR_NULL(key)) + return -ECHILD; + + ret = -ECHILD; + if (!afs_check_validity(vnode) || + !afs_check_permit_rcu(vnode, key, &access)) + goto error; + } else { + key = afs_request_key(vnode->volume->cell); + if (IS_ERR(key)) { + _leave(" = %ld [key]", PTR_ERR(key)); + return PTR_ERR(key); + } - /* if the promise has expired, we need to check the server again */ - if (!vnode->cb_promised) { - _debug("not promised"); - ret = afs_vnode_fetch_status(vnode, NULL, key); + ret = afs_validate(vnode, key); if (ret < 0) goto error; - _debug("new promise [fl=%lx]", vnode->flags); - } - /* check the permits to see if we've got one yet */ - ret = afs_check_permit(vnode, key, &access); - if (ret < 0) - goto error; + /* check the permits to see if we've got one yet */ + ret = afs_check_permit(vnode, key, &access); + if (ret < 0) + goto error; + } /* interpret the access mask */ _debug("REQ %x ACC %x on %s", mask, access, S_ISDIR(inode->i_mode) ? "dir" : "file"); + ret = 0; if (S_ISDIR(inode->i_mode)) { - if (mask & MAY_EXEC) { + if (mask & (MAY_EXEC | MAY_READ | MAY_CHDIR)) { if (!(access & AFS_ACE_LOOKUP)) goto permission_denied; - } else if (mask & MAY_READ) { - if (!(access & AFS_ACE_READ)) - goto permission_denied; - } else if (mask & MAY_WRITE) { + } + if (mask & MAY_WRITE) { if (!(access & (AFS_ACE_DELETE | /* rmdir, unlink, rename from */ - AFS_ACE_INSERT | /* create, mkdir, symlink, rename to */ - AFS_ACE_WRITE))) /* chmod */ + AFS_ACE_INSERT))) /* create, mkdir, symlink, rename to */ goto permission_denied; - } else { - BUG(); } } else { if (!(access & AFS_ACE_LOOKUP)) goto permission_denied; + if ((mask & MAY_EXEC) && !(inode->i_mode & S_IXUSR)) + goto permission_denied; if (mask & (MAY_EXEC | MAY_READ)) { if (!(access & AFS_ACE_READ)) goto permission_denied; + if (!(inode->i_mode & S_IRUSR)) + goto permission_denied; } else if (mask & MAY_WRITE) { if (!(access & AFS_ACE_WRITE)) goto permission_denied; + if (!(inode->i_mode & S_IWUSR)) + goto permission_denied; } } key_put(key); - ret = generic_permission(inode, mask); _leave(" = %d", ret); return ret; @@ -361,3 +509,12 @@ error: _leave(" = %d", ret); return ret; } + +void __exit afs_clean_up_permit_cache(void) +{ + int i; + + for (i = 0; i < HASH_SIZE(afs_permits_cache); i++) + WARN_ON_ONCE(!hlist_empty(&afs_permits_cache[i])); + +} diff --git a/fs/afs/server.c b/fs/afs/server.c index f342acf3547d..c4428ebddb1d 100644 --- a/fs/afs/server.c +++ b/fs/afs/server.c @@ -1,322 +1,631 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* AFS server record management * * Copyright (C) 2002, 2007 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. */ #include <linux/sched.h> #include <linux/slab.h> +#include "afs_fs.h" #include "internal.h" +#include "protocol_yfs.h" -static unsigned afs_server_timeout = 10; /* server timeout in seconds */ +static unsigned afs_server_gc_delay = 10; /* Server record timeout in seconds */ +static atomic_t afs_server_debug_id; -static void afs_reap_server(struct work_struct *); +static void __afs_put_server(struct afs_net *, struct afs_server *); +static void afs_server_timer(struct timer_list *timer); +static void afs_server_destroyer(struct work_struct *work); -/* tree of all the servers, indexed by IP address */ -static struct rb_root afs_servers = RB_ROOT; -static DEFINE_RWLOCK(afs_servers_lock); +/* + * Find a server by one of its addresses. + */ +struct afs_server *afs_find_server(const struct rxrpc_peer *peer) +{ + struct afs_server *server = (struct afs_server *)rxrpc_kernel_get_peer_data(peer); -/* LRU list of all the servers not currently in use */ -static LIST_HEAD(afs_server_graveyard); -static DEFINE_SPINLOCK(afs_server_graveyard_lock); -static DECLARE_DELAYED_WORK(afs_server_reaper, afs_reap_server); + if (!server) + return NULL; + return afs_use_server(server, false, afs_server_trace_use_cm_call); +} /* - * install a server record in the master tree + * Look up a server by its UUID and mark it active. The caller must hold + * cell->fs_lock. */ -static int afs_install_server(struct afs_server *server) +static struct afs_server *afs_find_server_by_uuid(struct afs_cell *cell, const uuid_t *uuid) { - struct afs_server *xserver; - struct rb_node **pp, *p; - int ret; + struct afs_server *server; + struct rb_node *p; + int diff; - _enter("%p", server); + _enter("%pU", uuid); - write_lock(&afs_servers_lock); + p = cell->fs_servers.rb_node; + while (p) { + server = rb_entry(p, struct afs_server, uuid_rb); - ret = -EEXIST; - pp = &afs_servers.rb_node; + diff = memcmp(uuid, &server->uuid, sizeof(*uuid)); + if (diff < 0) { + p = p->rb_left; + } else if (diff > 0) { + p = p->rb_right; + } else { + if (test_bit(AFS_SERVER_FL_UNCREATED, &server->flags)) + return NULL; /* Need a write lock */ + afs_use_server(server, true, afs_server_trace_use_by_uuid); + return server; + } + } + + return NULL; +} + +/* + * Install a server record in the cell tree. The caller must hold an exclusive + * lock on cell->fs_lock. + */ +static struct afs_server *afs_install_server(struct afs_cell *cell, + struct afs_server **candidate) +{ + struct afs_server *server; + struct afs_net *net = cell->net; + struct rb_node **pp, *p; + int diff; + + _enter("%p", candidate); + + /* Firstly install the server in the UUID lookup tree */ + pp = &cell->fs_servers.rb_node; p = NULL; while (*pp) { p = *pp; _debug("- consider %p", p); - xserver = rb_entry(p, struct afs_server, master_rb); - if (server->addr.s_addr < xserver->addr.s_addr) + server = rb_entry(p, struct afs_server, uuid_rb); + diff = memcmp(&(*candidate)->uuid, &server->uuid, sizeof(uuid_t)); + if (diff < 0) pp = &(*pp)->rb_left; - else if (server->addr.s_addr > xserver->addr.s_addr) + else if (diff > 0) pp = &(*pp)->rb_right; else - goto error; + goto exists; } - rb_link_node(&server->master_rb, p, pp); - rb_insert_color(&server->master_rb, &afs_servers); - ret = 0; + server = *candidate; + *candidate = NULL; + rb_link_node(&server->uuid_rb, p, pp); + rb_insert_color(&server->uuid_rb, &cell->fs_servers); + write_seqlock(&net->fs_lock); + hlist_add_head_rcu(&server->proc_link, &net->fs_proc); + write_sequnlock(&net->fs_lock); + + afs_get_cell(cell, afs_cell_trace_get_server); -error: - write_unlock(&afs_servers_lock); - return ret; +exists: + afs_use_server(server, true, afs_server_trace_use_install); + return server; } /* - * allocate a new server record + * Allocate a new server record and mark it as active but uncreated. */ -static struct afs_server *afs_alloc_server(struct afs_cell *cell, - const struct in_addr *addr) +static struct afs_server *afs_alloc_server(struct afs_cell *cell, const uuid_t *uuid) { struct afs_server *server; + struct afs_net *net = cell->net; _enter(""); server = kzalloc(sizeof(struct afs_server), GFP_KERNEL); - if (server) { - atomic_set(&server->usage, 1); - server->cell = cell; - - INIT_LIST_HEAD(&server->link); - INIT_LIST_HEAD(&server->grave); - init_rwsem(&server->sem); - spin_lock_init(&server->fs_lock); - server->fs_vnodes = RB_ROOT; - server->cb_promises = RB_ROOT; - spin_lock_init(&server->cb_lock); - init_waitqueue_head(&server->cb_break_waitq); - INIT_DELAYED_WORK(&server->cb_break_work, - afs_dispatch_give_up_callbacks); - - memcpy(&server->addr, addr, sizeof(struct in_addr)); - server->addr.s_addr = addr->s_addr; - _leave(" = %p{%d}", server, atomic_read(&server->usage)); - } else { - _leave(" = NULL [nomem]"); - } + if (!server) + return NULL; + + refcount_set(&server->ref, 1); + atomic_set(&server->active, 0); + __set_bit(AFS_SERVER_FL_UNCREATED, &server->flags); + server->debug_id = atomic_inc_return(&afs_server_debug_id); + server->uuid = *uuid; + rwlock_init(&server->fs_lock); + INIT_WORK(&server->destroyer, &afs_server_destroyer); + timer_setup(&server->timer, afs_server_timer, 0); + INIT_LIST_HEAD(&server->volumes); + init_waitqueue_head(&server->probe_wq); + mutex_init(&server->cm_token_lock); + INIT_LIST_HEAD(&server->probe_link); + INIT_HLIST_NODE(&server->proc_link); + spin_lock_init(&server->probe_lock); + server->cell = cell; + server->rtt = UINT_MAX; + server->service_id = FS_SERVICE; + server->probe_counter = 1; + server->probed_at = jiffies - LONG_MAX / 2; + + afs_inc_servers_outstanding(net); + _leave(" = %p", server); return server; } /* - * get an FS-server record for a cell + * Look up an address record for a server */ -struct afs_server *afs_lookup_server(struct afs_cell *cell, - const struct in_addr *addr) +static struct afs_addr_list *afs_vl_lookup_addrs(struct afs_server *server, + struct key *key) { - struct afs_server *server, *candidate; + struct afs_vl_cursor vc; + struct afs_addr_list *alist = NULL; + int ret; + + ret = -ERESTARTSYS; + if (afs_begin_vlserver_operation(&vc, server->cell, key)) { + while (afs_select_vlserver(&vc)) { + if (test_bit(AFS_VLSERVER_FL_IS_YFS, &vc.server->flags)) + alist = afs_yfsvl_get_endpoints(&vc, &server->uuid); + else + alist = afs_vl_get_addrs_u(&vc, &server->uuid); + } - _enter("%p,%pI4", cell, &addr->s_addr); + ret = afs_end_vlserver_operation(&vc); + } - /* quick scan of the list to see if we already have the server */ - read_lock(&cell->servers_lock); + return ret < 0 ? ERR_PTR(ret) : alist; +} - list_for_each_entry(server, &cell->servers, link) { - if (server->addr.s_addr == addr->s_addr) - goto found_server_quickly; +/* + * Get or create a fileserver record and return it with an active-use count on + * it. + */ +struct afs_server *afs_lookup_server(struct afs_cell *cell, struct key *key, + const uuid_t *uuid, u32 addr_version) +{ + struct afs_addr_list *alist = NULL; + struct afs_server *server, *candidate = NULL; + bool creating = false; + int ret; + + _enter("%p,%pU", cell->net, uuid); + + down_read(&cell->fs_lock); + server = afs_find_server_by_uuid(cell, uuid); + /* Won't see servers marked uncreated. */ + up_read(&cell->fs_lock); + + if (server) { + timer_delete_sync(&server->timer); + if (test_bit(AFS_SERVER_FL_CREATING, &server->flags)) + goto wait_for_creation; + if (server->addr_version != addr_version) + set_bit(AFS_SERVER_FL_NEEDS_UPDATE, &server->flags); + return server; } - read_unlock(&cell->servers_lock); - candidate = afs_alloc_server(cell, addr); + candidate = afs_alloc_server(cell, uuid); if (!candidate) { - _leave(" = -ENOMEM"); + afs_put_addrlist(alist, afs_alist_trace_put_server_oom); return ERR_PTR(-ENOMEM); } - write_lock(&cell->servers_lock); + down_write(&cell->fs_lock); + server = afs_install_server(cell, &candidate); + if (test_bit(AFS_SERVER_FL_CREATING, &server->flags)) { + /* We need to wait for creation to complete. */ + up_write(&cell->fs_lock); + goto wait_for_creation; + } + if (test_bit(AFS_SERVER_FL_UNCREATED, &server->flags)) { + set_bit(AFS_SERVER_FL_CREATING, &server->flags); + clear_bit(AFS_SERVER_FL_UNCREATED, &server->flags); + creating = true; + } + up_write(&cell->fs_lock); + timer_delete_sync(&server->timer); + + /* If we get to create the server, we look up the addresses and then + * immediately dispatch an asynchronous probe to each interface on the + * fileserver. This will make sure the repeat-probing service is + * started. + */ + if (creating) { + alist = afs_vl_lookup_addrs(server, key); + if (IS_ERR(alist)) { + ret = PTR_ERR(alist); + goto create_failed; + } + + ret = afs_fs_probe_fileserver(cell->net, server, alist, key); + if (ret) + goto create_failed; + + clear_and_wake_up_bit(AFS_SERVER_FL_CREATING, &server->flags); + } - /* check the cell's server list again */ - list_for_each_entry(server, &cell->servers, link) { - if (server->addr.s_addr == addr->s_addr) - goto found_server; +out: + afs_put_addrlist(alist, afs_alist_trace_put_server_create); + if (candidate) { + kfree(rcu_access_pointer(server->endpoint_state)); + kfree(candidate); + afs_dec_servers_outstanding(cell->net); + } + return server ?: ERR_PTR(ret); + +wait_for_creation: + afs_see_server(server, afs_server_trace_wait_create); + wait_on_bit(&server->flags, AFS_SERVER_FL_CREATING, TASK_UNINTERRUPTIBLE); + if (test_bit_acquire(AFS_SERVER_FL_UNCREATED, &server->flags)) { + /* Barrier: read flag before error */ + ret = READ_ONCE(server->create_error); + afs_put_server(cell->net, server, afs_server_trace_unuse_create_fail); + server = NULL; + goto out; } - _debug("new"); - server = candidate; - if (afs_install_server(server) < 0) - goto server_in_two_cells; + ret = 0; + goto out; - afs_get_cell(cell); - list_add_tail(&server->link, &cell->servers); +create_failed: + down_write(&cell->fs_lock); - write_unlock(&cell->servers_lock); - _leave(" = %p{%d}", server, atomic_read(&server->usage)); - return server; + WRITE_ONCE(server->create_error, ret); + smp_wmb(); /* Barrier: set error before flag. */ + set_bit(AFS_SERVER_FL_UNCREATED, &server->flags); + + clear_and_wake_up_bit(AFS_SERVER_FL_CREATING, &server->flags); - /* found a matching server quickly */ -found_server_quickly: - _debug("found quickly"); - afs_get_server(server); - read_unlock(&cell->servers_lock); -no_longer_unused: - if (!list_empty(&server->grave)) { - spin_lock(&afs_server_graveyard_lock); - list_del_init(&server->grave); - spin_unlock(&afs_server_graveyard_lock); + if (test_bit(AFS_SERVER_FL_UNCREATED, &server->flags)) { + clear_bit(AFS_SERVER_FL_UNCREATED, &server->flags); + creating = true; } - _leave(" = %p{%d}", server, atomic_read(&server->usage)); - return server; + afs_unuse_server(cell->net, server, afs_server_trace_unuse_create_fail); + server = NULL; - /* found a matching server on the second pass */ -found_server: - _debug("found"); - afs_get_server(server); - write_unlock(&cell->servers_lock); - kfree(candidate); - goto no_longer_unused; - - /* found a server that seems to be in two cells */ -server_in_two_cells: - write_unlock(&cell->servers_lock); - kfree(candidate); - printk(KERN_NOTICE "kAFS: Server %pI4 appears to be in two cells\n", - addr); - _leave(" = -EEXIST"); - return ERR_PTR(-EEXIST); + up_write(&cell->fs_lock); + goto out; } /* - * look up a server by its IP address + * Set/reduce a server's timer. */ -struct afs_server *afs_find_server(const struct in_addr *_addr) +static void afs_set_server_timer(struct afs_server *server, unsigned int delay_secs) { - struct afs_server *server = NULL; - struct rb_node *p; - struct in_addr addr = *_addr; - - _enter("%pI4", &addr.s_addr); + mod_timer(&server->timer, jiffies + delay_secs * HZ); +} - read_lock(&afs_servers_lock); +/* + * Get a reference on a server object. + */ +struct afs_server *afs_get_server(struct afs_server *server, + enum afs_server_trace reason) +{ + unsigned int a; + int r; - p = afs_servers.rb_node; - while (p) { - server = rb_entry(p, struct afs_server, master_rb); + __refcount_inc(&server->ref, &r); + a = atomic_read(&server->active); + trace_afs_server(server->debug_id, r + 1, a, reason); + return server; +} - _debug("- consider %p", p); +/* + * Get an active count on a server object and maybe remove from the inactive + * list. + */ +struct afs_server *afs_use_server(struct afs_server *server, bool activate, + enum afs_server_trace reason) +{ + unsigned int a; + int r; - if (addr.s_addr < server->addr.s_addr) { - p = p->rb_left; - } else if (addr.s_addr > server->addr.s_addr) { - p = p->rb_right; - } else { - afs_get_server(server); - goto found; - } - } + __refcount_inc(&server->ref, &r); + a = atomic_inc_return(&server->active); + if (a == 1 && activate && + !test_bit(AFS_SERVER_FL_EXPIRED, &server->flags)) + timer_delete(&server->timer); - server = NULL; -found: - read_unlock(&afs_servers_lock); - ASSERTIFCMP(server, server->addr.s_addr, ==, addr.s_addr); - _leave(" = %p", server); + trace_afs_server(server->debug_id, r + 1, a, reason); return server; } /* - * destroy a server record - * - removes from the cell list + * Release a reference on a server record. */ -void afs_put_server(struct afs_server *server) +void afs_put_server(struct afs_net *net, struct afs_server *server, + enum afs_server_trace reason) { + unsigned int a, debug_id; + bool zero; + int r; + if (!server) return; - _enter("%p{%d}", server, atomic_read(&server->usage)); + debug_id = server->debug_id; + a = atomic_read(&server->active); + zero = __refcount_dec_and_test(&server->ref, &r); + trace_afs_server(debug_id, r - 1, a, reason); + if (unlikely(zero)) + __afs_put_server(net, server); +} - _debug("PUT SERVER %d", atomic_read(&server->usage)); +/* + * Drop an active count on a server object without updating the last-unused + * time. + */ +void afs_unuse_server_notime(struct afs_net *net, struct afs_server *server, + enum afs_server_trace reason) +{ + if (!server) + return; + + if (atomic_dec_and_test(&server->active)) { + if (test_bit(AFS_SERVER_FL_EXPIRED, &server->flags) || + READ_ONCE(server->cell->state) >= AFS_CELL_REMOVING) + schedule_work(&server->destroyer); + } - ASSERTCMP(atomic_read(&server->usage), >, 0); + afs_put_server(net, server, reason); +} - if (likely(!atomic_dec_and_test(&server->usage))) { - _leave(""); +/* + * Drop an active count on a server object. + */ +void afs_unuse_server(struct afs_net *net, struct afs_server *server, + enum afs_server_trace reason) +{ + if (!server) return; + + if (atomic_dec_and_test(&server->active)) { + if (!test_bit(AFS_SERVER_FL_EXPIRED, &server->flags) && + READ_ONCE(server->cell->state) < AFS_CELL_REMOVING) { + time64_t unuse_time = ktime_get_real_seconds(); + + server->unuse_time = unuse_time; + afs_set_server_timer(server, afs_server_gc_delay); + } else { + schedule_work(&server->destroyer); + } } - afs_flush_callback_breaks(server); + afs_put_server(net, server, reason); +} + +static void afs_server_rcu(struct rcu_head *rcu) +{ + struct afs_server *server = container_of(rcu, struct afs_server, rcu); + + trace_afs_server(server->debug_id, refcount_read(&server->ref), + atomic_read(&server->active), afs_server_trace_free); + afs_put_endpoint_state(rcu_access_pointer(server->endpoint_state), + afs_estate_trace_put_server); + afs_put_cell(server->cell, afs_cell_trace_put_server); + kfree(server->cm_rxgk_appdata.data); + kfree(server); +} + +static void __afs_put_server(struct afs_net *net, struct afs_server *server) +{ + call_rcu(&server->rcu, afs_server_rcu); + afs_dec_servers_outstanding(net); +} + +static void afs_give_up_callbacks(struct afs_net *net, struct afs_server *server) +{ + struct afs_endpoint_state *estate = rcu_access_pointer(server->endpoint_state); + struct afs_addr_list *alist = estate->addresses; + + afs_fs_give_up_all_callbacks(net, server, &alist->addrs[alist->preferred], NULL); +} + +/* + * Check to see if the server record has expired. + */ +static bool afs_has_server_expired(const struct afs_server *server) +{ + time64_t expires_at; + + if (atomic_read(&server->active)) + return false; - spin_lock(&afs_server_graveyard_lock); - if (atomic_read(&server->usage) == 0) { - list_move_tail(&server->grave, &afs_server_graveyard); - server->time_of_death = get_seconds(); - queue_delayed_work(afs_wq, &afs_server_reaper, - afs_server_timeout * HZ); + if (server->cell->net->live || + server->cell->state >= AFS_CELL_REMOVING) { + trace_afs_server(server->debug_id, refcount_read(&server->ref), + 0, afs_server_trace_purging); + return true; } - spin_unlock(&afs_server_graveyard_lock); - _leave(" [dead]"); + + expires_at = server->unuse_time; + if (!test_bit(AFS_SERVER_FL_VL_FAIL, &server->flags) && + !test_bit(AFS_SERVER_FL_NOT_FOUND, &server->flags)) + expires_at += afs_server_gc_delay; + + return ktime_get_real_seconds() > expires_at; } /* - * destroy a dead server + * Remove a server record from it's parent cell's database. */ -static void afs_destroy_server(struct afs_server *server) +static bool afs_remove_server_from_cell(struct afs_server *server) { - _enter("%p", server); + struct afs_cell *cell = server->cell; + + down_write(&cell->fs_lock); - ASSERTIF(server->cb_break_head != server->cb_break_tail, - delayed_work_pending(&server->cb_break_work)); + if (!afs_has_server_expired(server)) { + up_write(&cell->fs_lock); + return false; + } - ASSERTCMP(server->fs_vnodes.rb_node, ==, NULL); - ASSERTCMP(server->cb_promises.rb_node, ==, NULL); - ASSERTCMP(server->cb_break_head, ==, server->cb_break_tail); - ASSERTCMP(atomic_read(&server->cb_break_n), ==, 0); + set_bit(AFS_SERVER_FL_EXPIRED, &server->flags); + _debug("expire %pU %u", &server->uuid, atomic_read(&server->active)); + afs_see_server(server, afs_server_trace_see_expired); + rb_erase(&server->uuid_rb, &cell->fs_servers); + up_write(&cell->fs_lock); + return true; +} - afs_put_cell(server->cell); - kfree(server); +static void afs_server_destroyer(struct work_struct *work) +{ + struct afs_endpoint_state *estate; + struct afs_server *server = container_of(work, struct afs_server, destroyer); + struct afs_net *net = server->cell->net; + + afs_see_server(server, afs_server_trace_see_destroyer); + + if (test_bit(AFS_SERVER_FL_EXPIRED, &server->flags)) + return; + + if (!afs_remove_server_from_cell(server)) + return; + + timer_shutdown_sync(&server->timer); + cancel_work(&server->destroyer); + + if (test_bit(AFS_SERVER_FL_MAY_HAVE_CB, &server->flags)) + afs_give_up_callbacks(net, server); + + /* Unbind the rxrpc_peer records from the server. */ + estate = rcu_access_pointer(server->endpoint_state); + if (estate) + afs_set_peer_appdata(server, estate->addresses, NULL); + + write_seqlock(&net->fs_lock); + list_del_init(&server->probe_link); + if (!hlist_unhashed(&server->proc_link)) + hlist_del_rcu(&server->proc_link); + write_sequnlock(&net->fs_lock); + + afs_put_server(net, server, afs_server_trace_destroy); +} + +static void afs_server_timer(struct timer_list *timer) +{ + struct afs_server *server = container_of(timer, struct afs_server, timer); + + afs_see_server(server, afs_server_trace_see_timer); + if (!test_bit(AFS_SERVER_FL_EXPIRED, &server->flags)) + schedule_work(&server->destroyer); } /* - * reap dead server records + * Wake up all the servers in a cell so that they can purge themselves. */ -static void afs_reap_server(struct work_struct *work) +void afs_purge_servers(struct afs_cell *cell) { - LIST_HEAD(corpses); struct afs_server *server; - unsigned long delay, expiry; - time_t now; - - now = get_seconds(); - spin_lock(&afs_server_graveyard_lock); - - while (!list_empty(&afs_server_graveyard)) { - server = list_entry(afs_server_graveyard.next, - struct afs_server, grave); - - /* the queue is ordered most dead first */ - expiry = server->time_of_death + afs_server_timeout; - if (expiry > now) { - delay = (expiry - now) * HZ; - mod_delayed_work(afs_wq, &afs_server_reaper, delay); - break; - } + struct rb_node *rb; - write_lock(&server->cell->servers_lock); - write_lock(&afs_servers_lock); - if (atomic_read(&server->usage) > 0) { - list_del_init(&server->grave); - } else { - list_move_tail(&server->grave, &corpses); - list_del_init(&server->link); - rb_erase(&server->master_rb, &afs_servers); - } - write_unlock(&afs_servers_lock); - write_unlock(&server->cell->servers_lock); + down_read(&cell->fs_lock); + for (rb = rb_first(&cell->fs_servers); rb; rb = rb_next(rb)) { + server = rb_entry(rb, struct afs_server, uuid_rb); + afs_see_server(server, afs_server_trace_see_purge); + schedule_work(&server->destroyer); } + up_read(&cell->fs_lock); +} + +/* + * Wait for outstanding servers. + */ +void afs_wait_for_servers(struct afs_net *net) +{ + _enter(""); + + atomic_dec(&net->servers_outstanding); + wait_var_event(&net->servers_outstanding, + !atomic_read(&net->servers_outstanding)); + _leave(""); +} + +/* + * Get an update for a server's address list. + */ +static noinline bool afs_update_server_record(struct afs_operation *op, + struct afs_server *server, + struct key *key) +{ + struct afs_endpoint_state *estate; + struct afs_addr_list *alist; + bool has_addrs; - spin_unlock(&afs_server_graveyard_lock); + _enter(""); - /* now reap the corpses we've extracted */ - while (!list_empty(&corpses)) { - server = list_entry(corpses.next, struct afs_server, grave); - list_del(&server->grave); - afs_destroy_server(server); + trace_afs_server(server->debug_id, refcount_read(&server->ref), + atomic_read(&server->active), + afs_server_trace_update); + + alist = afs_vl_lookup_addrs(server, op->key); + if (IS_ERR(alist)) { + rcu_read_lock(); + estate = rcu_dereference(server->endpoint_state); + has_addrs = estate->addresses; + rcu_read_unlock(); + + if ((PTR_ERR(alist) == -ERESTARTSYS || + PTR_ERR(alist) == -EINTR) && + (op->flags & AFS_OPERATION_UNINTR) && + has_addrs) { + _leave(" = t [intr]"); + return true; + } + afs_op_set_error(op, PTR_ERR(alist)); + _leave(" = f [%d]", afs_op_error(op)); + return false; } + + if (server->addr_version != alist->version) + afs_fs_probe_fileserver(op->net, server, alist, key); + + afs_put_addrlist(alist, afs_alist_trace_put_server_update); + _leave(" = t"); + return true; } /* - * discard all the server records for rmmod + * See if a server's address list needs updating. */ -void __exit afs_purge_servers(void) +bool afs_check_server_record(struct afs_operation *op, struct afs_server *server, + struct key *key) { - afs_server_timeout = 0; - mod_delayed_work(afs_wq, &afs_server_reaper, 0); + bool success; + int ret, retries = 0; + + _enter(""); + + ASSERT(server); + +retry: + if (test_bit(AFS_SERVER_FL_UPDATING, &server->flags)) + goto wait; + if (test_bit(AFS_SERVER_FL_NEEDS_UPDATE, &server->flags)) + goto update; + _leave(" = t [good]"); + return true; + +update: + if (!test_and_set_bit_lock(AFS_SERVER_FL_UPDATING, &server->flags)) { + clear_bit(AFS_SERVER_FL_NEEDS_UPDATE, &server->flags); + success = afs_update_server_record(op, server, key); + clear_bit_unlock(AFS_SERVER_FL_UPDATING, &server->flags); + wake_up_bit(&server->flags, AFS_SERVER_FL_UPDATING); + _leave(" = %d", success); + return success; + } + +wait: + ret = wait_on_bit(&server->flags, AFS_SERVER_FL_UPDATING, + (op->flags & AFS_OPERATION_UNINTR) ? + TASK_UNINTERRUPTIBLE : TASK_INTERRUPTIBLE); + if (ret == -ERESTARTSYS) { + afs_op_set_error(op, ret); + _leave(" = f [intr]"); + return false; + } + + retries++; + if (retries == 4) { + _leave(" = f [stale]"); + ret = -ESTALE; + return false; + } + goto retry; } diff --git a/fs/afs/server_list.c b/fs/afs/server_list.c new file mode 100644 index 000000000000..20d5474837df --- /dev/null +++ b/fs/afs/server_list.c @@ -0,0 +1,249 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* AFS fileserver list management. + * + * Copyright (C) 2017 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#include <linux/kernel.h> +#include <linux/slab.h> +#include "internal.h" + +void afs_put_serverlist(struct afs_net *net, struct afs_server_list *slist) +{ + int i; + + if (slist && refcount_dec_and_test(&slist->usage)) { + for (i = 0; i < slist->nr_servers; i++) + afs_unuse_server(net, slist->servers[i].server, + afs_server_trace_unuse_slist); + kfree_rcu(slist, rcu); + } +} + +/* + * Build a server list from a VLDB record. + */ +struct afs_server_list *afs_alloc_server_list(struct afs_volume *volume, + struct key *key, + struct afs_vldb_entry *vldb) +{ + struct afs_server_list *slist; + struct afs_server *server; + unsigned int type_mask = 1 << volume->type; + bool use_newrepsites = false; + int ret = -ENOMEM, nr_servers = 0, newrep = 0, i, j, usable = 0; + + /* Work out if we're going to restrict to NEWREPSITE-marked servers or + * not. If at least one site is marked as NEWREPSITE, then it's likely + * that "vos release" is busy updating RO sites. We cut over from one + * to the other when >=50% of the sites have been updated. Sites that + * are in the process of being updated are marked DONTUSE. + */ + for (i = 0; i < vldb->nr_servers; i++) { + if (!(vldb->fs_mask[i] & type_mask)) + continue; + nr_servers++; + if (vldb->vlsf_flags[i] & AFS_VLSF_DONTUSE) + continue; + usable++; + if (vldb->vlsf_flags[i] & AFS_VLSF_NEWREPSITE) + newrep++; + } + + slist = kzalloc(struct_size(slist, servers, nr_servers), GFP_KERNEL); + if (!slist) + goto error; + + if (newrep) { + if (newrep < usable / 2) { + slist->ro_replicating = AFS_RO_REPLICATING_USE_OLD; + } else { + slist->ro_replicating = AFS_RO_REPLICATING_USE_NEW; + use_newrepsites = true; + } + } + + refcount_set(&slist->usage, 1); + rwlock_init(&slist->lock); + + /* Make sure a records exists for each server in the list. */ + for (i = 0; i < vldb->nr_servers; i++) { + unsigned long se_flags = 0; + bool newrepsite = vldb->vlsf_flags[i] & AFS_VLSF_NEWREPSITE; + + if (!(vldb->fs_mask[i] & type_mask)) + continue; + if (vldb->vlsf_flags[i] & AFS_VLSF_DONTUSE) + __set_bit(AFS_SE_EXCLUDED, &se_flags); + if (newrep && (newrepsite ^ use_newrepsites)) + __set_bit(AFS_SE_EXCLUDED, &se_flags); + + server = afs_lookup_server(volume->cell, key, &vldb->fs_server[i], + vldb->addr_version[i]); + if (IS_ERR(server)) { + ret = PTR_ERR(server); + if (ret == -ENOENT || + ret == -ENOMEDIUM) + continue; + goto error_2; + } + + /* Insertion-sort by UUID */ + for (j = 0; j < slist->nr_servers; j++) + if (memcmp(&slist->servers[j].server->uuid, + &server->uuid, + sizeof(server->uuid)) >= 0) + break; + if (j < slist->nr_servers) { + if (slist->servers[j].server == server) { + afs_unuse_server_notime(volume->cell->net, server, + afs_server_trace_unuse_slist_isort); + continue; + } + + memmove(slist->servers + j + 1, + slist->servers + j, + (slist->nr_servers - j) * sizeof(struct afs_server_entry)); + } + + slist->servers[j].server = server; + slist->servers[j].volume = volume; + slist->servers[j].flags = se_flags; + slist->servers[j].cb_expires_at = AFS_NO_CB_PROMISE; + slist->nr_servers++; + } + + if (slist->nr_servers == 0) { + ret = -EDESTADDRREQ; + goto error_2; + } + + return slist; + +error_2: + afs_put_serverlist(volume->cell->net, slist); +error: + return ERR_PTR(ret); +} + +/* + * Copy the annotations from an old server list to its potential replacement. + */ +bool afs_annotate_server_list(struct afs_server_list *new, + struct afs_server_list *old) +{ + unsigned long mask = 1UL << AFS_SE_EXCLUDED; + int i; + + if (old->nr_servers != new->nr_servers || + old->ro_replicating != new->ro_replicating) + goto changed; + + for (i = 0; i < old->nr_servers; i++) { + if (old->servers[i].server != new->servers[i].server) + goto changed; + if ((old->servers[i].flags & mask) != (new->servers[i].flags & mask)) + goto changed; + } + return false; +changed: + return true; +} + +/* + * Attach a volume to the servers it is going to use. + */ +void afs_attach_volume_to_servers(struct afs_volume *volume, struct afs_server_list *slist) +{ + struct afs_server_entry *se, *pe; + struct afs_server *server; + struct list_head *p; + unsigned int i; + + down_write(&volume->cell->vs_lock); + + for (i = 0; i < slist->nr_servers; i++) { + se = &slist->servers[i]; + server = se->server; + + list_for_each(p, &server->volumes) { + pe = list_entry(p, struct afs_server_entry, slink); + if (volume->vid <= pe->volume->vid) + break; + } + list_add_tail(&se->slink, p); + } + + slist->attached = true; + up_write(&volume->cell->vs_lock); +} + +/* + * Reattach a volume to the servers it is going to use when server list is + * replaced. We try to switch the attachment points to avoid rewalking the + * lists. + */ +void afs_reattach_volume_to_servers(struct afs_volume *volume, struct afs_server_list *new, + struct afs_server_list *old) +{ + unsigned int n = 0, o = 0; + + down_write(&volume->cell->vs_lock); + + while (n < new->nr_servers || o < old->nr_servers) { + struct afs_server_entry *pn = n < new->nr_servers ? &new->servers[n] : NULL; + struct afs_server_entry *po = o < old->nr_servers ? &old->servers[o] : NULL; + struct afs_server_entry *s; + struct list_head *p; + int diff; + + if (pn && po && pn->server == po->server) { + pn->cb_expires_at = po->cb_expires_at; + list_replace(&po->slink, &pn->slink); + n++; + o++; + continue; + } + + if (pn && po) + diff = memcmp(&pn->server->uuid, &po->server->uuid, + sizeof(pn->server->uuid)); + else + diff = pn ? -1 : 1; + + if (diff < 0) { + list_for_each(p, &pn->server->volumes) { + s = list_entry(p, struct afs_server_entry, slink); + if (volume->vid <= s->volume->vid) + break; + } + list_add_tail(&pn->slink, p); + n++; + } else { + list_del(&po->slink); + o++; + } + } + + up_write(&volume->cell->vs_lock); +} + +/* + * Detach a volume from the servers it has been using. + */ +void afs_detach_volume_from_servers(struct afs_volume *volume, struct afs_server_list *slist) +{ + unsigned int i; + + if (!slist->attached) + return; + + down_write(&volume->cell->vs_lock); + + for (i = 0; i < slist->nr_servers; i++) + list_del(&slist->servers[i].slink); + + slist->attached = false; + up_write(&volume->cell->vs_lock); +} diff --git a/fs/afs/super.c b/fs/afs/super.c index c4861557e385..d672b7ab57ae 100644 --- a/fs/afs/super.c +++ b/fs/afs/super.c @@ -1,6 +1,6 @@ /* AFS superblock handling * - * Copyright (c) 2002, 2007 Red Hat, Inc. All rights reserved. + * Copyright (c) 2002, 2007, 2018 Red Hat, Inc. All rights reserved. * * This software may be freely redistributed under the terms of the * GNU General Public License. @@ -21,58 +21,73 @@ #include <linux/slab.h> #include <linux/fs.h> #include <linux/pagemap.h> -#include <linux/parser.h> +#include <linux/fs_parser.h> #include <linux/statfs.h> #include <linux/sched.h> #include <linux/nsproxy.h> +#include <linux/magic.h> #include <net/net_namespace.h> #include "internal.h" -#define AFS_FS_MAGIC 0x6B414653 /* 'kAFS' */ - static void afs_i_init_once(void *foo); -static struct dentry *afs_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data); static void afs_kill_super(struct super_block *sb); static struct inode *afs_alloc_inode(struct super_block *sb); static void afs_destroy_inode(struct inode *inode); +static void afs_free_inode(struct inode *inode); static int afs_statfs(struct dentry *dentry, struct kstatfs *buf); +static int afs_show_devname(struct seq_file *m, struct dentry *root); +static int afs_show_options(struct seq_file *m, struct dentry *root); +static int afs_init_fs_context(struct fs_context *fc); +static const struct fs_parameter_spec afs_fs_parameters[]; struct file_system_type afs_fs_type = { - .owner = THIS_MODULE, - .name = "afs", - .mount = afs_mount, - .kill_sb = afs_kill_super, - .fs_flags = 0, + .owner = THIS_MODULE, + .name = "afs", + .init_fs_context = afs_init_fs_context, + .parameters = afs_fs_parameters, + .kill_sb = afs_kill_super, + .fs_flags = FS_RENAME_DOES_D_MOVE, }; MODULE_ALIAS_FS("afs"); +int afs_net_id; + static const struct super_operations afs_super_ops = { .statfs = afs_statfs, .alloc_inode = afs_alloc_inode, + .write_inode = netfs_unpin_writeback, .drop_inode = afs_drop_inode, .destroy_inode = afs_destroy_inode, + .free_inode = afs_free_inode, .evict_inode = afs_evict_inode, - .show_options = generic_show_options, + .show_devname = afs_show_devname, + .show_options = afs_show_options, }; static struct kmem_cache *afs_inode_cachep; static atomic_t afs_count_active_inodes; -enum { - afs_no_opt, - afs_opt_cell, - afs_opt_rwpath, - afs_opt_vol, - afs_opt_autocell, +enum afs_param { + Opt_autocell, + Opt_dyn, + Opt_flock, + Opt_source, }; -static const match_table_t afs_options_list = { - { afs_opt_cell, "cell=%s" }, - { afs_opt_rwpath, "rwpath" }, - { afs_opt_vol, "vol=%s" }, - { afs_opt_autocell, "autocell" }, - { afs_no_opt, NULL }, +static const struct constant_table afs_param_flock[] = { + {"local", afs_flock_mode_local }, + {"openafs", afs_flock_mode_openafs }, + {"strict", afs_flock_mode_strict }, + {"write", afs_flock_mode_write }, + {} +}; + +static const struct fs_parameter_spec afs_fs_parameters[] = { + fsparam_flag ("autocell", Opt_autocell), + fsparam_flag ("dyn", Opt_dyn), + fsparam_enum ("flock", Opt_flock, afs_param_flock), + fsparam_string("source", Opt_source), + {} }; /* @@ -91,7 +106,7 @@ int __init afs_fs_init(void) afs_inode_cachep = kmem_cache_create("afs_inode_cache", sizeof(struct afs_vnode), 0, - SLAB_HWCACHE_ALIGN, + SLAB_HWCACHE_ALIGN|SLAB_ACCOUNT, afs_i_init_once); if (!afs_inode_cachep) { printk(KERN_NOTICE "kAFS: Failed to allocate inode cache\n"); @@ -113,7 +128,7 @@ int __init afs_fs_init(void) /* * clean up the filesystem */ -void __exit afs_fs_exit(void) +void afs_fs_exit(void) { _enter(""); @@ -136,150 +151,265 @@ void __exit afs_fs_exit(void) } /* - * parse the mount options - * - this function has been shamelessly adapted from the ext3 fs which - * shamelessly adapted it from the msdos fs + * Display the mount device name in /proc/mounts. */ -static int afs_parse_options(struct afs_mount_params *params, - char *options, const char **devname) +static int afs_show_devname(struct seq_file *m, struct dentry *root) { - struct afs_cell *cell; - substring_t args[MAX_OPT_ARGS]; - char *p; - int token; - - _enter("%s", options); - - options[PAGE_SIZE - 1] = 0; - - while ((p = strsep(&options, ","))) { - if (!*p) - continue; - - token = match_token(p, afs_options_list, args); - switch (token) { - case afs_opt_cell: - cell = afs_cell_lookup(args[0].from, - args[0].to - args[0].from, - false); - if (IS_ERR(cell)) - return PTR_ERR(cell); - afs_put_cell(params->cell); - params->cell = cell; - break; - - case afs_opt_rwpath: - params->rwpath = 1; - break; - - case afs_opt_vol: - *devname = args[0].from; - break; - - case afs_opt_autocell: - params->autocell = 1; - break; - - default: - printk(KERN_ERR "kAFS:" - " Unknown or invalid mount option: '%s'\n", p); - return -EINVAL; - } + struct afs_super_info *as = AFS_FS_S(root->d_sb); + struct afs_volume *volume = as->volume; + struct afs_cell *cell = as->cell; + const char *suf = ""; + char pref = '%'; + + if (as->dyn_root) { + seq_puts(m, "none"); + return 0; } - _leave(" = 0"); + switch (volume->type) { + case AFSVL_RWVOL: + break; + case AFSVL_ROVOL: + pref = '#'; + if (volume->type_force) + suf = ".readonly"; + break; + case AFSVL_BACKVOL: + pref = '#'; + suf = ".backup"; + break; + } + + seq_printf(m, "%c%s:%s%s", pref, cell->name, volume->name, suf); return 0; } /* - * parse a device name to get cell name, volume name, volume type and R/W - * selector - * - this can be one of the following: + * Display the mount options in /proc/mounts. + */ +static int afs_show_options(struct seq_file *m, struct dentry *root) +{ + struct afs_super_info *as = AFS_FS_S(root->d_sb); + const char *p = NULL; + + if (as->dyn_root) + seq_puts(m, ",dyn"); + switch (as->flock_mode) { + case afs_flock_mode_unset: break; + case afs_flock_mode_local: p = "local"; break; + case afs_flock_mode_openafs: p = "openafs"; break; + case afs_flock_mode_strict: p = "strict"; break; + case afs_flock_mode_write: p = "write"; break; + } + if (p) + seq_printf(m, ",flock=%s", p); + + return 0; +} + +/* + * Parse the source name to get cell name, volume name, volume type and R/W + * selector. + * + * This can be one of the following: * "%[cell:]volume[.]" R/W volume - * "#[cell:]volume[.]" R/O or R/W volume (rwpath=0), - * or R/W (rwpath=1) volume + * "#[cell:]volume[.]" R/O or R/W volume (R/O parent), + * or R/W (R/W parent) volume * "%[cell:]volume.readonly" R/O volume * "#[cell:]volume.readonly" R/O volume * "%[cell:]volume.backup" Backup volume * "#[cell:]volume.backup" Backup volume */ -static int afs_parse_device_name(struct afs_mount_params *params, - const char *name) +static int afs_parse_source(struct fs_context *fc, struct fs_parameter *param) { + struct afs_fs_context *ctx = fc->fs_private; struct afs_cell *cell; - const char *cellname, *suffix; + const char *cellname, *suffix, *name = param->string; int cellnamesz; _enter(",%s", name); + if (fc->source) + return invalf(fc, "kAFS: Multiple sources not supported"); + if (!name) { printk(KERN_ERR "kAFS: no volume name specified\n"); return -EINVAL; } if ((name[0] != '%' && name[0] != '#') || !name[1]) { + /* To use dynroot, we don't want to have to provide a source */ + if (strcmp(name, "none") == 0) { + ctx->no_cell = true; + return 0; + } printk(KERN_ERR "kAFS: unparsable volume name\n"); return -EINVAL; } /* determine the type of volume we're looking for */ - params->type = AFSVL_ROVOL; - params->force = false; - if (params->rwpath || name[0] == '%') { - params->type = AFSVL_RWVOL; - params->force = true; + if (name[0] == '%') { + ctx->type = AFSVL_RWVOL; + ctx->force = true; } name++; /* split the cell name out if there is one */ - params->volname = strchr(name, ':'); - if (params->volname) { + ctx->volname = strchr(name, ':'); + if (ctx->volname) { cellname = name; - cellnamesz = params->volname - name; - params->volname++; + cellnamesz = ctx->volname - name; + ctx->volname++; } else { - params->volname = name; + ctx->volname = name; cellname = NULL; cellnamesz = 0; } /* the volume type is further affected by a possible suffix */ - suffix = strrchr(params->volname, '.'); + suffix = strrchr(ctx->volname, '.'); if (suffix) { if (strcmp(suffix, ".readonly") == 0) { - params->type = AFSVL_ROVOL; - params->force = true; + ctx->type = AFSVL_ROVOL; + ctx->force = true; } else if (strcmp(suffix, ".backup") == 0) { - params->type = AFSVL_BACKVOL; - params->force = true; + ctx->type = AFSVL_BACKVOL; + ctx->force = true; } else if (suffix[1] == 0) { } else { suffix = NULL; } } - params->volnamesz = suffix ? - suffix - params->volname : strlen(params->volname); + ctx->volnamesz = suffix ? + suffix - ctx->volname : strlen(ctx->volname); _debug("cell %*.*s [%p]", - cellnamesz, cellnamesz, cellname ?: "", params->cell); + cellnamesz, cellnamesz, cellname ?: "", ctx->cell); /* lookup the cell record */ - if (cellname || !params->cell) { - cell = afs_cell_lookup(cellname, cellnamesz, true); + if (cellname) { + cell = afs_lookup_cell(ctx->net, cellname, cellnamesz, + NULL, AFS_LOOKUP_CELL_DIRECT_MOUNT, + afs_cell_trace_use_lookup_mount); if (IS_ERR(cell)) { - printk(KERN_ERR "kAFS: unable to lookup cell '%*.*s'\n", + pr_err("kAFS: unable to lookup cell '%*.*s'\n", cellnamesz, cellnamesz, cellname ?: ""); return PTR_ERR(cell); } - afs_put_cell(params->cell); - params->cell = cell; + afs_unuse_cell(ctx->cell, afs_cell_trace_unuse_parse); + afs_see_cell(cell, afs_cell_trace_see_source); + ctx->cell = cell; } _debug("CELL:%s [%p] VOLUME:%*.*s SUFFIX:%s TYPE:%d%s", - params->cell->name, params->cell, - params->volnamesz, params->volnamesz, params->volname, - suffix ?: "-", params->type, params->force ? " FORCE" : ""); + ctx->cell->name, ctx->cell, + ctx->volnamesz, ctx->volnamesz, ctx->volname, + suffix ?: "-", ctx->type, ctx->force ? " FORCE" : ""); + + fc->source = param->string; + param->string = NULL; + return 0; +} + +/* + * Parse a single mount parameter. + */ +static int afs_parse_param(struct fs_context *fc, struct fs_parameter *param) +{ + struct fs_parse_result result; + struct afs_fs_context *ctx = fc->fs_private; + int opt; + + opt = fs_parse(fc, afs_fs_parameters, param, &result); + if (opt < 0) + return opt; + + switch (opt) { + case Opt_source: + return afs_parse_source(fc, param); + + case Opt_autocell: + ctx->autocell = true; + break; + + case Opt_dyn: + ctx->dyn_root = true; + break; + + case Opt_flock: + ctx->flock_mode = result.uint_32; + break; + + default: + return -EINVAL; + } + + _leave(" = 0"); + return 0; +} + +/* + * Validate the options, get the cell key and look up the volume. + */ +static int afs_validate_fc(struct fs_context *fc) +{ + struct afs_fs_context *ctx = fc->fs_private; + struct afs_volume *volume; + struct afs_cell *cell; + struct key *key; + int ret; + + if (!ctx->dyn_root) { + if (ctx->no_cell) { + pr_warn("kAFS: Can only specify source 'none' with -o dyn\n"); + return -EINVAL; + } + + if (!ctx->cell) { + pr_warn("kAFS: No cell specified\n"); + return -EDESTADDRREQ; + } + + reget_key: + /* We try to do the mount securely. */ + key = afs_request_key(ctx->cell); + if (IS_ERR(key)) + return PTR_ERR(key); + + ctx->key = key; + + if (ctx->volume) { + afs_put_volume(ctx->volume, afs_volume_trace_put_validate_fc); + ctx->volume = NULL; + } + + if (test_bit(AFS_CELL_FL_CHECK_ALIAS, &ctx->cell->flags)) { + ret = afs_cell_detect_alias(ctx->cell, key); + if (ret < 0) + return ret; + if (ret == 1) { + _debug("switch to alias"); + key_put(ctx->key); + ctx->key = NULL; + cell = afs_use_cell(ctx->cell->alias_of, + afs_cell_trace_use_fc_alias); + afs_unuse_cell(ctx->cell, afs_cell_trace_unuse_fc); + ctx->cell = cell; + goto reget_key; + } + } + + volume = afs_create_volume(ctx); + if (IS_ERR(volume)) + return PTR_ERR(volume); + + ctx->volume = volume; + if (volume->type != AFSVL_RWVOL) { + ctx->flock_mode = afs_flock_mode_local; + fc->sb_flags |= SB_RDONLY; + } + } return 0; } @@ -287,58 +417,77 @@ static int afs_parse_device_name(struct afs_mount_params *params, /* * check a superblock to see if it's the one we're looking for */ -static int afs_test_super(struct super_block *sb, void *data) +static int afs_test_super(struct super_block *sb, struct fs_context *fc) +{ + struct afs_fs_context *ctx = fc->fs_private; + struct afs_super_info *as = AFS_FS_S(sb); + + return (as->net_ns == fc->net_ns && + as->volume && + as->volume->vid == ctx->volume->vid && + as->cell == ctx->cell && + !as->dyn_root); +} + +static int afs_dynroot_test_super(struct super_block *sb, struct fs_context *fc) { - struct afs_super_info *as1 = data; - struct afs_super_info *as = sb->s_fs_info; + struct afs_super_info *as = AFS_FS_S(sb); - return as->volume == as1->volume; + return (as->net_ns == fc->net_ns && + as->dyn_root); } -static int afs_set_super(struct super_block *sb, void *data) +static int afs_set_super(struct super_block *sb, struct fs_context *fc) { - sb->s_fs_info = data; return set_anon_super(sb, NULL); } /* * fill in the superblock */ -static int afs_fill_super(struct super_block *sb, - struct afs_mount_params *params) +static int afs_fill_super(struct super_block *sb, struct afs_fs_context *ctx) { - struct afs_super_info *as = sb->s_fs_info; - struct afs_fid fid; + struct afs_super_info *as = AFS_FS_S(sb); struct inode *inode = NULL; int ret; _enter(""); /* fill in the superblock */ - sb->s_blocksize = PAGE_CACHE_SIZE; - sb->s_blocksize_bits = PAGE_CACHE_SHIFT; + sb->s_blocksize = PAGE_SIZE; + sb->s_blocksize_bits = PAGE_SHIFT; + sb->s_maxbytes = MAX_LFS_FILESIZE; sb->s_magic = AFS_FS_MAGIC; sb->s_op = &afs_super_ops; - sb->s_bdi = &as->volume->bdi; - strlcpy(sb->s_id, as->volume->vlocation->vldb.name, sizeof(sb->s_id)); + if (!as->dyn_root) + sb->s_xattr = afs_xattr_handlers; + ret = super_setup_bdi(sb); + if (ret) + return ret; /* allocate the root inode and dentry */ - fid.vid = as->volume->vid; - fid.vnode = 1; - fid.unique = 1; - inode = afs_iget(sb, params->key, &fid, NULL, NULL); + if (as->dyn_root) { + inode = afs_dynroot_iget_root(sb); + } else { + sprintf(sb->s_id, "%llu", as->volume->vid); + afs_activate_volume(as->volume); + inode = afs_root_iget(sb, ctx->key); + } + if (IS_ERR(inode)) return PTR_ERR(inode); - if (params->autocell) - set_bit(AFS_VNODE_AUTOCELL, &AFS_FS_I(inode)->flags); - ret = -ENOMEM; sb->s_root = d_make_root(inode); if (!sb->s_root) goto error; - sb->s_d_op = &afs_fs_dentry_operations; + if (as->dyn_root) { + set_default_d_op(sb, &afs_dynroot_dentry_operations); + } else { + set_default_d_op(sb, &afs_fs_dentry_operations); + rcu_assign_pointer(as->volume->sb, sb); + } _leave(" = 0"); return 0; @@ -348,130 +497,171 @@ error: return ret; } -/* - * get an AFS superblock - */ -static struct dentry *afs_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *options) +static struct afs_super_info *afs_alloc_sbi(struct fs_context *fc) { - struct afs_mount_params params; - struct super_block *sb; - struct afs_volume *vol; - struct key *key; - char *new_opts = kstrdup(options, GFP_KERNEL); + struct afs_fs_context *ctx = fc->fs_private; struct afs_super_info *as; - int ret; - _enter(",,%s,%p", dev_name, options); + as = kzalloc(sizeof(struct afs_super_info), GFP_KERNEL); + if (as) { + as->net_ns = get_net(fc->net_ns); + as->flock_mode = ctx->flock_mode; + if (ctx->dyn_root) { + as->dyn_root = true; + } else { + as->cell = afs_use_cell(ctx->cell, afs_cell_trace_use_sbi); + as->volume = afs_get_volume(ctx->volume, + afs_volume_trace_get_alloc_sbi); + } + } + return as; +} - memset(¶ms, 0, sizeof(params)); +static void afs_destroy_sbi(struct afs_super_info *as) +{ + if (as) { + afs_put_volume(as->volume, afs_volume_trace_put_destroy_sbi); + afs_unuse_cell(as->cell, afs_cell_trace_unuse_sbi); + put_net(as->net_ns); + kfree(as); + } +} - ret = -EINVAL; - if (current->nsproxy->net_ns != &init_net) - goto error; +static void afs_kill_super(struct super_block *sb) +{ + struct afs_super_info *as = AFS_FS_S(sb); - /* parse the options and device name */ - if (options) { - ret = afs_parse_options(¶ms, options, &dev_name); - if (ret < 0) - goto error; - } + /* Clear the callback interests (which will do ilookup5) before + * deactivating the superblock. + */ + if (as->volume) + rcu_assign_pointer(as->volume->sb, NULL); + kill_anon_super(sb); + if (as->volume) + afs_deactivate_volume(as->volume); + afs_destroy_sbi(as); +} - ret = afs_parse_device_name(¶ms, dev_name); - if (ret < 0) - goto error; +/* + * Get an AFS superblock and root directory. + */ +static int afs_get_tree(struct fs_context *fc) +{ + struct afs_fs_context *ctx = fc->fs_private; + struct super_block *sb; + struct afs_super_info *as; + int ret; - /* try and do the mount securely */ - key = afs_request_key(params.cell); - if (IS_ERR(key)) { - _leave(" = %ld [key]", PTR_ERR(key)); - ret = PTR_ERR(key); + ret = afs_validate_fc(fc); + if (ret) goto error; - } - params.key = key; - /* parse the device name */ - vol = afs_volume_lookup(¶ms); - if (IS_ERR(vol)) { - ret = PTR_ERR(vol); - goto error; - } + _enter(""); /* allocate a superblock info record */ - as = kzalloc(sizeof(struct afs_super_info), GFP_KERNEL); - if (!as) { - ret = -ENOMEM; - afs_put_volume(vol); + ret = -ENOMEM; + as = afs_alloc_sbi(fc); + if (!as) goto error; - } - as->volume = vol; + fc->s_fs_info = as; /* allocate a deviceless superblock */ - sb = sget(fs_type, afs_test_super, afs_set_super, flags, as); + sb = sget_fc(fc, + as->dyn_root ? afs_dynroot_test_super : afs_test_super, + afs_set_super); if (IS_ERR(sb)) { ret = PTR_ERR(sb); - afs_put_volume(vol); - kfree(as); goto error; } if (!sb->s_root) { /* initial superblock/root creation */ _debug("create"); - ret = afs_fill_super(sb, ¶ms); - if (ret < 0) { - deactivate_locked_super(sb); - goto error; - } - save_mount_options(sb, new_opts); - sb->s_flags |= MS_ACTIVE; + ret = afs_fill_super(sb, ctx); + if (ret < 0) + goto error_sb; + sb->s_flags |= SB_ACTIVE; } else { _debug("reuse"); - ASSERTCMP(sb->s_flags, &, MS_ACTIVE); - afs_put_volume(vol); - kfree(as); + ASSERTCMP(sb->s_flags, &, SB_ACTIVE); } - afs_put_cell(params.cell); - kfree(new_opts); + fc->root = dget(sb->s_root); + trace_afs_get_tree(as->cell, as->volume); _leave(" = 0 [%p]", sb); - return dget(sb->s_root); + return 0; +error_sb: + deactivate_locked_super(sb); error: - afs_put_cell(params.cell); - key_put(params.key); - kfree(new_opts); _leave(" = %d", ret); - return ERR_PTR(ret); + return ret; } -static void afs_kill_super(struct super_block *sb) +static void afs_free_fc(struct fs_context *fc) { - struct afs_super_info *as = sb->s_fs_info; - kill_anon_super(sb); - afs_put_volume(as->volume); - kfree(as); + struct afs_fs_context *ctx = fc->fs_private; + + afs_destroy_sbi(fc->s_fs_info); + afs_put_volume(ctx->volume, afs_volume_trace_put_free_fc); + afs_unuse_cell(ctx->cell, afs_cell_trace_unuse_fc); + key_put(ctx->key); + kfree(ctx); } +static const struct fs_context_operations afs_context_ops = { + .free = afs_free_fc, + .parse_param = afs_parse_param, + .get_tree = afs_get_tree, +}; + /* - * initialise an inode cache slab element prior to any use + * Set up the filesystem mount context. + */ +static int afs_init_fs_context(struct fs_context *fc) +{ + struct afs_fs_context *ctx; + struct afs_cell *cell; + + ctx = kzalloc(sizeof(struct afs_fs_context), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + ctx->type = AFSVL_ROVOL; + ctx->net = afs_net(fc->net_ns); + + /* Default to the workstation cell. */ + cell = afs_find_cell(ctx->net, NULL, 0, afs_cell_trace_use_fc); + if (IS_ERR(cell)) + cell = NULL; + ctx->cell = cell; + + fc->fs_private = ctx; + fc->ops = &afs_context_ops; + return 0; +} + +/* + * Initialise an inode cache slab element prior to any use. Note that + * afs_alloc_inode() *must* reset anything that could incorrectly leak from one + * inode to another. */ static void afs_i_init_once(void *_vnode) { struct afs_vnode *vnode = _vnode; memset(vnode, 0, sizeof(*vnode)); - inode_init_once(&vnode->vfs_inode); - init_waitqueue_head(&vnode->update_waitq); - mutex_init(&vnode->permits_lock); - mutex_init(&vnode->validate_lock); - spin_lock_init(&vnode->writeback_lock); + inode_init_once(&vnode->netfs.inode); + INIT_LIST_HEAD(&vnode->io_lock_waiters); + init_rwsem(&vnode->validate_lock); + spin_lock_init(&vnode->wb_lock); spin_lock_init(&vnode->lock); - INIT_LIST_HEAD(&vnode->writebacks); + INIT_LIST_HEAD(&vnode->wb_keys); INIT_LIST_HEAD(&vnode->pending_locks); INIT_LIST_HEAD(&vnode->granted_locks); INIT_DELAYED_WORK(&vnode->lock_work, afs_lock_work); - INIT_WORK(&vnode->cb_broken_work, afs_broken_callback_work); + INIT_LIST_HEAD(&vnode->cb_mmap_link); + seqlock_init(&vnode->cb_lock); } /* @@ -481,29 +671,36 @@ static struct inode *afs_alloc_inode(struct super_block *sb) { struct afs_vnode *vnode; - vnode = kmem_cache_alloc(afs_inode_cachep, GFP_KERNEL); + vnode = alloc_inode_sb(sb, afs_inode_cachep, GFP_KERNEL); if (!vnode) return NULL; atomic_inc(&afs_count_active_inodes); + /* Reset anything that shouldn't leak from one inode to the next. */ memset(&vnode->fid, 0, sizeof(vnode->fid)); memset(&vnode->status, 0, sizeof(vnode->status)); + afs_vnode_set_cache(vnode, NULL); vnode->volume = NULL; - vnode->update_cnt = 0; + vnode->lock_key = NULL; + vnode->permit_cache = NULL; + vnode->directory = NULL; + vnode->directory_size = 0; + vnode->flags = 1 << AFS_VNODE_UNSET; - vnode->cb_promised = false; + vnode->lock_state = AFS_VNODE_LOCK_NONE; + + init_rwsem(&vnode->rmdir_lock); + INIT_WORK(&vnode->cb_work, afs_invalidate_mmap_work); - _leave(" = %p", &vnode->vfs_inode); - return &vnode->vfs_inode; + _leave(" = %p", &vnode->netfs.inode); + return &vnode->netfs.inode; } -static void afs_i_callback(struct rcu_head *head) +static void afs_free_inode(struct inode *inode) { - struct inode *inode = container_of(head, struct inode, i_rcu); - struct afs_vnode *vnode = AFS_FS_I(inode); - kmem_cache_free(afs_inode_cachep, vnode); + kmem_cache_free(afs_inode_cachep, AFS_FS_I(inode)); } /* @@ -513,45 +710,61 @@ static void afs_destroy_inode(struct inode *inode) { struct afs_vnode *vnode = AFS_FS_I(inode); - _enter("%p{%x:%u}", inode, vnode->fid.vid, vnode->fid.vnode); + _enter("%p{%llx:%llu}", inode, vnode->fid.vid, vnode->fid.vnode); _debug("DESTROY INODE %p", inode); - ASSERTCMP(vnode->server, ==, NULL); - - call_rcu(&inode->i_rcu, afs_i_callback); atomic_dec(&afs_count_active_inodes); } +static void afs_get_volume_status_success(struct afs_operation *op) +{ + struct afs_volume_status *vs = &op->volstatus.vs; + struct kstatfs *buf = op->volstatus.buf; + + if (vs->max_quota == 0) + buf->f_blocks = vs->part_max_blocks; + else + buf->f_blocks = vs->max_quota; + + if (buf->f_blocks > vs->blocks_in_use) + buf->f_bavail = buf->f_bfree = + buf->f_blocks - vs->blocks_in_use; +} + +static const struct afs_operation_ops afs_get_volume_status_operation = { + .issue_afs_rpc = afs_fs_get_volume_status, + .issue_yfs_rpc = yfs_fs_get_volume_status, + .success = afs_get_volume_status_success, +}; + /* * return information about an AFS volume */ static int afs_statfs(struct dentry *dentry, struct kstatfs *buf) { - struct afs_volume_status vs; - struct afs_vnode *vnode = AFS_FS_I(dentry->d_inode); - struct key *key; - int ret; - - key = afs_request_key(vnode->volume->cell); - if (IS_ERR(key)) - return PTR_ERR(key); - - ret = afs_vnode_get_volume_status(vnode, key, &vs); - key_put(key); - if (ret < 0) { - _leave(" = %d", ret); - return ret; - } + struct afs_super_info *as = AFS_FS_S(dentry->d_sb); + struct afs_operation *op; + struct afs_vnode *vnode = AFS_FS_I(d_inode(dentry)); buf->f_type = dentry->d_sb->s_magic; buf->f_bsize = AFS_BLOCK_SIZE; buf->f_namelen = AFSNAMEMAX - 1; - if (vs.max_quota == 0) - buf->f_blocks = vs.part_max_blocks; - else - buf->f_blocks = vs.max_quota; - buf->f_bavail = buf->f_bfree = buf->f_blocks - vs.blocks_in_use; - return 0; + if (as->dyn_root) { + buf->f_blocks = 1; + buf->f_bavail = 0; + buf->f_bfree = 0; + return 0; + } + + op = afs_alloc_operation(NULL, as->volume); + if (IS_ERR(op)) + return PTR_ERR(op); + + afs_op_set_vnode(op, 0, vnode); + op->nr_files = 1; + op->volstatus.buf = buf; + op->ops = &afs_get_volume_status_operation; + return afs_do_sync_operation(op); } diff --git a/fs/afs/validation.c b/fs/afs/validation.c new file mode 100644 index 000000000000..0ba8336c9025 --- /dev/null +++ b/fs/afs/validation.c @@ -0,0 +1,484 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* vnode and volume validity verification. + * + * Copyright (C) 2023 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/sched.h> +#include "internal.h" + +/* + * Data validation is managed through a number of mechanisms from the server: + * + * (1) On first contact with a server (such as if it has just been rebooted), + * the server sends us a CB.InitCallBackState* request. + * + * (2) On a RW volume, in response to certain vnode (inode)-accessing RPC + * calls, the server maintains a time-limited per-vnode promise that it + * will send us a CB.CallBack request if a third party alters the vnodes + * accessed. + * + * Note that a vnode-level callbacks may also be sent for other reasons, + * such as filelock release. + * + * (3) On a RO (or Backup) volume, in response to certain vnode-accessing RPC + * calls, each server maintains a time-limited per-volume promise that it + * will send us a CB.CallBack request if the RO volume is updated to a + * snapshot of the RW volume ("vos release"). This is an atomic event + * that cuts over all instances of the RO volume across multiple servers + * simultaneously. + * + * Note that a volume-level callbacks may also be sent for other reasons, + * such as the volumeserver taking over control of the volume from the + * fileserver. + * + * Note also that each server maintains an independent time limit on an + * independent callback. + * + * (4) Certain RPC calls include a volume information record "VolSync" in + * their reply. This contains a creation date for the volume that should + * remain unchanged for a RW volume (but will be changed if the volume is + * restored from backup) or will be bumped to the time of snapshotting + * when a RO volume is released. + * + * In order to track this events, the following are provided: + * + * ->cb_v_break. A counter of events that might mean that the contents of + * a volume have been altered since we last checked a vnode. + * + * ->cb_v_check. A counter of the number of events that we've sent a + * query to the server for. Everything's up to date if this equals + * cb_v_break. + * + * ->cb_scrub. A counter of the number of regression events for which we + * have to completely wipe the cache. + * + * ->cb_ro_snapshot. A counter of the number of times that we've + * recognised that a RO volume has been updated. + * + * ->cb_break. A counter of events that might mean that the contents of a + * vnode have been altered. + * + * ->cb_expires_at. The time at which the callback promise expires or + * AFS_NO_CB_PROMISE if we have no promise. + * + * The way we manage things is: + * + * (1) When a volume-level CB.CallBack occurs, we increment ->cb_v_break on + * the volume and reset ->cb_expires_at (ie. set AFS_NO_CB_PROMISE) on the + * volume and volume's server record. + * + * (2) When a CB.InitCallBackState occurs, we treat this as a volume-level + * callback break on all the volumes that have been using that volume + * (ie. increment ->cb_v_break and reset ->cb_expires_at). + * + * (3) When a vnode-level CB.CallBack occurs, we increment ->cb_break on the + * vnode and reset its ->cb_expires_at. If the vnode is mmapped, we also + * dispatch a work item to unmap all PTEs to the vnode's pagecache to + * force reentry to the filesystem for revalidation. + * + * (4) When entering the filesystem, we call afs_validate() to check the + * validity of a vnode. This first checks to see if ->cb_v_check and + * ->cb_v_break match, and if they don't, we lock volume->cb_check_lock + * exclusively and perform an FS.FetchStatus on the vnode. + * + * After checking the volume, we check the vnode. If there's a mismatch + * between the volume counters and the vnode's mirrors of those counters, + * we lock vnode->validate_lock and issue an FS.FetchStatus on the vnode. + * + * (5) When the reply from FS.FetchStatus arrives, the VolSync record is + * parsed: + * + * (A) If the Creation timestamp has changed on a RW volume or regressed + * on a RO volume, we try to increment ->cb_scrub; if it advances on a + * RO volume, we assume "vos release" happened and try to increment + * ->cb_ro_snapshot. + * + * (B) If the Update timestamp has regressed, we try to increment + * ->cb_scrub. + * + * Note that in both of these cases, we only do the increment if we can + * cmpxchg the value of the timestamp from the value we noted before the + * op. This tries to prevent parallel ops from fighting one another. + * + * volume->cb_v_check is then set to ->cb_v_break. + * + * (6) The AFSCallBack record included in the FS.FetchStatus reply is also + * parsed and used to set the promise in ->cb_expires_at for the vnode, + * the volume and the volume's server record. + * + * (7) If ->cb_scrub is seen to have advanced, we invalidate the pagecache for + * the vnode. + */ + +/* + * Check the validity of a vnode/inode and its parent volume. + */ +bool afs_check_validity(const struct afs_vnode *vnode) +{ + const struct afs_volume *volume = vnode->volume; + enum afs_vnode_invalid_trace trace = afs_vnode_valid_trace; + time64_t cb_expires_at = atomic64_read(&vnode->cb_expires_at); + time64_t deadline = ktime_get_real_seconds() + 10; + + if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) + return true; + + if (atomic_read(&volume->cb_v_check) != atomic_read(&volume->cb_v_break)) + trace = afs_vnode_invalid_trace_cb_v_break; + else if (cb_expires_at == AFS_NO_CB_PROMISE) + trace = afs_vnode_invalid_trace_no_cb_promise; + else if (cb_expires_at <= deadline) + trace = afs_vnode_invalid_trace_expired; + else if (volume->cb_expires_at <= deadline) + trace = afs_vnode_invalid_trace_vol_expired; + else if (vnode->cb_ro_snapshot != atomic_read(&volume->cb_ro_snapshot)) + trace = afs_vnode_invalid_trace_cb_ro_snapshot; + else if (vnode->cb_scrub != atomic_read(&volume->cb_scrub)) + trace = afs_vnode_invalid_trace_cb_scrub; + else if (test_bit(AFS_VNODE_ZAP_DATA, &vnode->flags)) + trace = afs_vnode_invalid_trace_zap_data; + else + return true; + trace_afs_vnode_invalid(vnode, trace); + return false; +} + +/* + * See if the server we've just talked to is currently excluded. + */ +static bool __afs_is_server_excluded(struct afs_operation *op, struct afs_volume *volume) +{ + const struct afs_server_entry *se; + const struct afs_server_list *slist; + bool is_excluded = true; + int i; + + rcu_read_lock(); + + slist = rcu_dereference(volume->servers); + for (i = 0; i < slist->nr_servers; i++) { + se = &slist->servers[i]; + if (op->server == se->server) { + is_excluded = test_bit(AFS_SE_EXCLUDED, &se->flags); + break; + } + } + + rcu_read_unlock(); + return is_excluded; +} + +/* + * Update the volume's server list when the creation time changes and see if + * the server we've just talked to is currently excluded. + */ +static int afs_is_server_excluded(struct afs_operation *op, struct afs_volume *volume) +{ + int ret; + + if (__afs_is_server_excluded(op, volume)) + return 1; + + set_bit(AFS_VOLUME_NEEDS_UPDATE, &volume->flags); + ret = afs_check_volume_status(op->volume, op); + if (ret < 0) + return ret; + + return __afs_is_server_excluded(op, volume); +} + +/* + * Handle a change to the volume creation time in the VolSync record. + */ +static int afs_update_volume_creation_time(struct afs_operation *op, struct afs_volume *volume) +{ + unsigned int snap; + time64_t cur = volume->creation_time; + time64_t old = op->pre_volsync.creation; + time64_t new = op->volsync.creation; + int ret; + + _enter("%llx,%llx,%llx->%llx", volume->vid, cur, old, new); + + if (cur == TIME64_MIN) { + volume->creation_time = new; + return 0; + } + + if (new == cur) + return 0; + + /* Try to advance the creation timestamp from what we had before the + * operation to what we got back from the server. This should + * hopefully ensure that in a race between multiple operations only one + * of them will do this. + */ + if (cur != old) + return 0; + + /* If the creation time changes in an unexpected way, we need to scrub + * our caches. For a RW vol, this will only change if the volume is + * restored from a backup; for a RO/Backup vol, this will advance when + * the volume is updated to a new snapshot (eg. "vos release"). + */ + if (volume->type == AFSVL_RWVOL) + goto regressed; + if (volume->type == AFSVL_BACKVOL) { + if (new < old) + goto regressed; + goto advance; + } + + /* We have an RO volume, we need to query the VL server and look at the + * server flags to see if RW->RO replication is in progress. + */ + ret = afs_is_server_excluded(op, volume); + if (ret < 0) + return ret; + if (ret > 0) { + snap = atomic_read(&volume->cb_ro_snapshot); + trace_afs_cb_v_break(volume->vid, snap, afs_cb_break_volume_excluded); + return ret; + } + +advance: + snap = atomic_inc_return(&volume->cb_ro_snapshot); + trace_afs_cb_v_break(volume->vid, snap, afs_cb_break_for_vos_release); + volume->creation_time = new; + return 0; + +regressed: + atomic_inc(&volume->cb_scrub); + trace_afs_cb_v_break(volume->vid, 0, afs_cb_break_for_creation_regress); + volume->creation_time = new; + return 0; +} + +/* + * Handle a change to the volume update time in the VolSync record. + */ +static void afs_update_volume_update_time(struct afs_operation *op, struct afs_volume *volume) +{ + enum afs_cb_break_reason reason = afs_cb_break_no_break; + time64_t cur = volume->update_time; + time64_t old = op->pre_volsync.update; + time64_t new = op->volsync.update; + + _enter("%llx,%llx,%llx->%llx", volume->vid, cur, old, new); + + if (cur == TIME64_MIN) { + volume->update_time = new; + return; + } + + if (new == cur) + return; + + /* If the volume update time changes in an unexpected way, we need to + * scrub our caches. For a RW vol, this will advance on every + * modification op; for a RO/Backup vol, this will advance when the + * volume is updated to a new snapshot (eg. "vos release"). + */ + if (new < old) + reason = afs_cb_break_for_update_regress; + + /* Try to advance the update timestamp from what we had before the + * operation to what we got back from the server. This should + * hopefully ensure that in a race between multiple operations only one + * of them will do this. + */ + if (cur == old) { + if (reason == afs_cb_break_for_update_regress) { + atomic_inc(&volume->cb_scrub); + trace_afs_cb_v_break(volume->vid, 0, reason); + } + volume->update_time = new; + } +} + +static int afs_update_volume_times(struct afs_operation *op, struct afs_volume *volume) +{ + int ret = 0; + + if (likely(op->volsync.creation == volume->creation_time && + op->volsync.update == volume->update_time)) + return 0; + + mutex_lock(&volume->volsync_lock); + if (op->volsync.creation != volume->creation_time) { + ret = afs_update_volume_creation_time(op, volume); + if (ret < 0) + goto out; + } + if (op->volsync.update != volume->update_time) + afs_update_volume_update_time(op, volume); +out: + mutex_unlock(&volume->volsync_lock); + return ret; +} + +/* + * Update the state of a volume, including recording the expiration time of the + * callback promise. Returns 1 to redo the operation from the start. + */ +int afs_update_volume_state(struct afs_operation *op) +{ + struct afs_server_list *slist = op->server_list; + struct afs_server_entry *se = &slist->servers[op->server_index]; + struct afs_callback *cb = &op->file[0].scb.callback; + struct afs_volume *volume = op->volume; + unsigned int cb_v_break = atomic_read(&volume->cb_v_break); + unsigned int cb_v_check = atomic_read(&volume->cb_v_check); + int ret; + + _enter("%llx", op->volume->vid); + + if (op->volsync.creation != TIME64_MIN || op->volsync.update != TIME64_MIN) { + ret = afs_update_volume_times(op, volume); + if (ret != 0) { + _leave(" = %d", ret); + return ret; + } + } + + if (op->cb_v_break == cb_v_break && + (op->file[0].scb.have_cb || op->file[1].scb.have_cb)) { + time64_t expires_at = cb->expires_at; + + if (!op->file[0].scb.have_cb) + expires_at = op->file[1].scb.callback.expires_at; + + se->cb_expires_at = expires_at; + volume->cb_expires_at = expires_at; + } + if (cb_v_check < op->cb_v_break) + atomic_cmpxchg(&volume->cb_v_check, cb_v_check, op->cb_v_break); + return 0; +} + +/* + * mark the data attached to an inode as obsolete due to a write on the server + * - might also want to ditch all the outstanding writes and dirty pages + */ +static void afs_zap_data(struct afs_vnode *vnode) +{ + _enter("{%llx:%llu}", vnode->fid.vid, vnode->fid.vnode); + + afs_invalidate_cache(vnode, 0); + + /* nuke all the non-dirty pages that aren't locked, mapped or being + * written back in a regular file and completely discard the pages in a + * directory or symlink */ + if (S_ISREG(vnode->netfs.inode.i_mode)) + filemap_invalidate_inode(&vnode->netfs.inode, true, 0, LLONG_MAX); + else + filemap_invalidate_inode(&vnode->netfs.inode, false, 0, LLONG_MAX); +} + +/* + * validate a vnode/inode + * - there are several things we need to check + * - parent dir data changes (rm, rmdir, rename, mkdir, create, link, + * symlink) + * - parent dir metadata changed (security changes) + * - dentry data changed (write, truncate) + * - dentry metadata changed (security changes) + */ +int afs_validate(struct afs_vnode *vnode, struct key *key) +{ + struct afs_volume *volume = vnode->volume; + unsigned int cb_ro_snapshot, cb_scrub; + time64_t deadline = ktime_get_real_seconds() + 10; + bool zap = false, locked_vol = false; + int ret; + + _enter("{v={%llx:%llu} fl=%lx},%x", + vnode->fid.vid, vnode->fid.vnode, vnode->flags, + key_serial(key)); + + if (afs_check_validity(vnode)) + return test_bit(AFS_VNODE_DELETED, &vnode->flags) ? -ESTALE : 0; + + ret = down_write_killable(&vnode->validate_lock); + if (ret < 0) + goto error; + + if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) { + ret = -ESTALE; + goto error_unlock; + } + + /* Validate a volume after the v_break has changed or the volume + * callback expired. We only want to do this once per volume per + * v_break change. The actual work will be done when parsing the + * status fetch reply. + */ + if (volume->cb_expires_at <= deadline || + atomic_read(&volume->cb_v_check) != atomic_read(&volume->cb_v_break)) { + ret = mutex_lock_interruptible(&volume->cb_check_lock); + if (ret < 0) + goto error_unlock; + locked_vol = true; + } + + cb_ro_snapshot = atomic_read(&volume->cb_ro_snapshot); + cb_scrub = atomic_read(&volume->cb_scrub); + if (vnode->cb_ro_snapshot != cb_ro_snapshot || + vnode->cb_scrub != cb_scrub) + unmap_mapping_pages(vnode->netfs.inode.i_mapping, 0, 0, false); + + if (vnode->cb_ro_snapshot != cb_ro_snapshot || + vnode->cb_scrub != cb_scrub || + volume->cb_expires_at <= deadline || + atomic_read(&volume->cb_v_check) != atomic_read(&volume->cb_v_break) || + atomic64_read(&vnode->cb_expires_at) <= deadline + ) { + ret = afs_fetch_status(vnode, key, false, NULL); + if (ret < 0) { + if (ret == -ENOENT) { + set_bit(AFS_VNODE_DELETED, &vnode->flags); + ret = -ESTALE; + } + goto error_unlock; + } + + _debug("new promise [fl=%lx]", vnode->flags); + } + + /* We can drop the volume lock now as. */ + if (locked_vol) { + mutex_unlock(&volume->cb_check_lock); + locked_vol = false; + } + + cb_ro_snapshot = atomic_read(&volume->cb_ro_snapshot); + cb_scrub = atomic_read(&volume->cb_scrub); + _debug("vnode inval %x==%x %x==%x", + vnode->cb_ro_snapshot, cb_ro_snapshot, + vnode->cb_scrub, cb_scrub); + if (vnode->cb_scrub != cb_scrub) + zap = true; + vnode->cb_ro_snapshot = cb_ro_snapshot; + vnode->cb_scrub = cb_scrub; + + /* if the vnode's data version number changed then its contents are + * different */ + zap |= test_and_clear_bit(AFS_VNODE_ZAP_DATA, &vnode->flags); + if (zap) + afs_zap_data(vnode); + up_write(&vnode->validate_lock); + _leave(" = 0"); + return 0; + +error_unlock: + if (locked_vol) + mutex_unlock(&volume->cb_check_lock); + up_write(&vnode->validate_lock); +error: + _leave(" = %d", ret); + return ret; +} diff --git a/fs/afs/vl_alias.c b/fs/afs/vl_alias.c new file mode 100644 index 000000000000..fc9676abd252 --- /dev/null +++ b/fs/afs/vl_alias.c @@ -0,0 +1,340 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* AFS cell alias detection + * + * Copyright (C) 2020 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#include <linux/slab.h> +#include <linux/sched.h> +#include <linux/namei.h> +#include <keys/rxrpc-type.h> +#include "internal.h" + +/* + * Sample a volume. + */ +static struct afs_volume *afs_sample_volume(struct afs_cell *cell, struct key *key, + const char *name, unsigned int namelen) +{ + struct afs_volume *volume; + struct afs_fs_context fc = { + .type = 0, /* Explicitly leave it to the VLDB */ + .volnamesz = namelen, + .volname = name, + .net = cell->net, + .cell = cell, + .key = key, /* This might need to be something */ + }; + + volume = afs_create_volume(&fc); + _leave(" = %p", volume); + return volume; +} + +/* + * Compare the address lists of a pair of fileservers. + */ +static int afs_compare_fs_alists(const struct afs_server *server_a, + const struct afs_server *server_b) +{ + const struct afs_addr_list *la, *lb; + int a = 0, b = 0, addr_matches = 0; + + la = rcu_dereference(server_a->endpoint_state)->addresses; + lb = rcu_dereference(server_b->endpoint_state)->addresses; + + while (a < la->nr_addrs && b < lb->nr_addrs) { + unsigned long pa = (unsigned long)la->addrs[a].peer; + unsigned long pb = (unsigned long)lb->addrs[b].peer; + long diff = pa - pb; + + if (diff < 0) { + a++; + } else if (diff > 0) { + b++; + } else { + addr_matches++; + a++; + b++; + } + } + + return addr_matches; +} + +/* + * Compare the fileserver lists of two volumes. The server lists are sorted in + * order of ascending UUID. + */ +static int afs_compare_volume_slists(const struct afs_volume *vol_a, + const struct afs_volume *vol_b) +{ + const struct afs_server_list *la, *lb; + int i, a = 0, b = 0, uuid_matches = 0, addr_matches = 0; + + la = rcu_dereference(vol_a->servers); + lb = rcu_dereference(vol_b->servers); + + for (i = 0; i < AFS_MAXTYPES; i++) + if (vol_a->vids[i] != vol_b->vids[i]) + return 0; + + while (a < la->nr_servers && b < lb->nr_servers) { + const struct afs_server *server_a = la->servers[a].server; + const struct afs_server *server_b = lb->servers[b].server; + int diff = memcmp(&server_a->uuid, &server_b->uuid, sizeof(uuid_t)); + + if (diff < 0) { + a++; + } else if (diff > 0) { + b++; + } else { + uuid_matches++; + addr_matches += afs_compare_fs_alists(server_a, server_b); + a++; + b++; + } + } + + _leave(" = %d [um %d]", addr_matches, uuid_matches); + return addr_matches; +} + +/* + * Compare root.cell volumes. + */ +static int afs_compare_cell_roots(struct afs_cell *cell) +{ + struct afs_cell *p; + + _enter(""); + + rcu_read_lock(); + + hlist_for_each_entry_rcu(p, &cell->net->proc_cells, proc_link) { + if (p == cell || p->alias_of) + continue; + if (!p->root_volume) + continue; /* Ignore cells that don't have a root.cell volume. */ + + if (afs_compare_volume_slists(cell->root_volume, p->root_volume) != 0) + goto is_alias; + } + + rcu_read_unlock(); + _leave(" = 0"); + return 0; + +is_alias: + rcu_read_unlock(); + cell->alias_of = afs_use_cell(p, afs_cell_trace_use_alias); + return 1; +} + +/* + * Query the new cell for a volume from a cell we're already using. + */ +static int afs_query_for_alias_one(struct afs_cell *cell, struct key *key, + struct afs_cell *p) +{ + struct afs_volume *volume, *pvol = NULL; + int ret; + + /* Arbitrarily pick a volume from the list. */ + read_seqlock_excl(&p->volume_lock); + if (!RB_EMPTY_ROOT(&p->volumes)) + pvol = afs_get_volume(rb_entry(p->volumes.rb_node, + struct afs_volume, cell_node), + afs_volume_trace_get_query_alias); + read_sequnlock_excl(&p->volume_lock); + if (!pvol) + return 0; + + _enter("%s:%s", cell->name, pvol->name); + + /* And see if it's in the new cell. */ + volume = afs_sample_volume(cell, key, pvol->name, pvol->name_len); + if (IS_ERR(volume)) { + afs_put_volume(pvol, afs_volume_trace_put_query_alias); + if (PTR_ERR(volume) != -ENOMEDIUM) + return PTR_ERR(volume); + /* That volume is not in the new cell, so not an alias */ + return 0; + } + + /* The new cell has a like-named volume also - compare volume ID, + * server and address lists. + */ + ret = 0; + if (pvol->vid == volume->vid) { + rcu_read_lock(); + if (afs_compare_volume_slists(volume, pvol)) + ret = 1; + rcu_read_unlock(); + } + + afs_put_volume(volume, afs_volume_trace_put_query_alias); + afs_put_volume(pvol, afs_volume_trace_put_query_alias); + return ret; +} + +/* + * Query the new cell for volumes we know exist in cells we're already using. + */ +static int afs_query_for_alias(struct afs_cell *cell, struct key *key) +{ + struct afs_cell *p; + + _enter("%s", cell->name); + + if (mutex_lock_interruptible(&cell->net->proc_cells_lock) < 0) + return -ERESTARTSYS; + + hlist_for_each_entry(p, &cell->net->proc_cells, proc_link) { + if (p == cell || p->alias_of) + continue; + if (RB_EMPTY_ROOT(&p->volumes)) + continue; + if (p->root_volume) + continue; /* Ignore cells that have a root.cell volume. */ + afs_use_cell(p, afs_cell_trace_use_check_alias); + mutex_unlock(&cell->net->proc_cells_lock); + + if (afs_query_for_alias_one(cell, key, p) != 0) + goto is_alias; + + if (mutex_lock_interruptible(&cell->net->proc_cells_lock) < 0) { + afs_unuse_cell(p, afs_cell_trace_unuse_check_alias); + return -ERESTARTSYS; + } + + afs_unuse_cell(p, afs_cell_trace_unuse_check_alias); + } + + mutex_unlock(&cell->net->proc_cells_lock); + _leave(" = 0"); + return 0; + +is_alias: + cell->alias_of = p; /* Transfer our ref */ + return 1; +} + +/* + * Look up a VLDB record for a volume. + */ +static char *afs_vl_get_cell_name(struct afs_cell *cell, struct key *key) +{ + struct afs_vl_cursor vc; + char *cell_name = ERR_PTR(-EDESTADDRREQ); + bool skipped = false, not_skipped = false; + int ret; + + if (!afs_begin_vlserver_operation(&vc, cell, key)) + return ERR_PTR(-ERESTARTSYS); + + while (afs_select_vlserver(&vc)) { + if (!test_bit(AFS_VLSERVER_FL_IS_YFS, &vc.server->flags)) { + vc.call_error = -EOPNOTSUPP; + skipped = true; + continue; + } + not_skipped = true; + cell_name = afs_yfsvl_get_cell_name(&vc); + } + + ret = afs_end_vlserver_operation(&vc); + if (skipped && !not_skipped) + ret = -EOPNOTSUPP; + return ret < 0 ? ERR_PTR(ret) : cell_name; +} + +static int yfs_check_canonical_cell_name(struct afs_cell *cell, struct key *key) +{ + struct afs_cell *master; + size_t name_len; + char *cell_name; + + cell_name = afs_vl_get_cell_name(cell, key); + if (IS_ERR(cell_name)) + return PTR_ERR(cell_name); + + if (strcmp(cell_name, cell->name) == 0) { + kfree(cell_name); + return 0; + } + + name_len = strlen(cell_name); + if (!name_len || name_len > AFS_MAXCELLNAME) + master = ERR_PTR(-EOPNOTSUPP); + else + master = afs_lookup_cell(cell->net, cell_name, name_len, NULL, + AFS_LOOKUP_CELL_ALIAS_CHECK, + afs_cell_trace_use_lookup_canonical); + kfree(cell_name); + if (IS_ERR(master)) + return PTR_ERR(master); + + cell->alias_of = master; /* Transfer our ref */ + return 1; +} + +static int afs_do_cell_detect_alias(struct afs_cell *cell, struct key *key) +{ + struct afs_volume *root_volume; + int ret; + + _enter("%s", cell->name); + + ret = yfs_check_canonical_cell_name(cell, key); + if (ret != -EOPNOTSUPP) + return ret; + + /* Try and get the root.cell volume for comparison with other cells */ + root_volume = afs_sample_volume(cell, key, "root.cell", 9); + if (!IS_ERR(root_volume)) { + cell->root_volume = root_volume; + return afs_compare_cell_roots(cell); + } + + if (PTR_ERR(root_volume) != -ENOMEDIUM) + return PTR_ERR(root_volume); + + /* Okay, this cell doesn't have an root.cell volume. We need to + * locate some other random volume and use that to check. + */ + return afs_query_for_alias(cell, key); +} + +/* + * Check to see if a new cell is an alias of a cell we already have. At this + * point we have the cell's volume server list. + * + * Returns 0 if we didn't detect an alias, 1 if we found an alias and an error + * if we had problems gathering the data required. In the case the we did + * detect an alias, cell->alias_of is set to point to the assumed master. + */ +int afs_cell_detect_alias(struct afs_cell *cell, struct key *key) +{ + struct afs_net *net = cell->net; + int ret; + + if (mutex_lock_interruptible(&net->cells_alias_lock) < 0) + return -ERESTARTSYS; + + if (test_bit(AFS_CELL_FL_CHECK_ALIAS, &cell->flags)) { + ret = afs_do_cell_detect_alias(cell, key); + if (ret >= 0) + clear_bit_unlock(AFS_CELL_FL_CHECK_ALIAS, &cell->flags); + } else { + ret = cell->alias_of ? 1 : 0; + } + + mutex_unlock(&net->cells_alias_lock); + + if (ret == 1) + pr_notice("kAFS: Cell %s is an alias of %s\n", + cell->name, cell->alias_of->name); + return ret; +} diff --git a/fs/afs/vl_list.c b/fs/afs/vl_list.c new file mode 100644 index 000000000000..9b1c20daac53 --- /dev/null +++ b/fs/afs/vl_list.c @@ -0,0 +1,337 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* AFS vlserver list management. + * + * Copyright (C) 2018 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#include <linux/kernel.h> +#include <linux/slab.h> +#include "internal.h" + +struct afs_vlserver *afs_alloc_vlserver(const char *name, size_t name_len, + unsigned short port) +{ + struct afs_vlserver *vlserver; + static atomic_t debug_ids; + + vlserver = kzalloc(struct_size(vlserver, name, name_len + 1), + GFP_KERNEL); + if (vlserver) { + refcount_set(&vlserver->ref, 1); + rwlock_init(&vlserver->lock); + init_waitqueue_head(&vlserver->probe_wq); + spin_lock_init(&vlserver->probe_lock); + vlserver->debug_id = atomic_inc_return(&debug_ids); + vlserver->rtt = UINT_MAX; + vlserver->name_len = name_len; + vlserver->service_id = VL_SERVICE; + vlserver->port = port; + memcpy(vlserver->name, name, name_len); + } + return vlserver; +} + +static void afs_vlserver_rcu(struct rcu_head *rcu) +{ + struct afs_vlserver *vlserver = container_of(rcu, struct afs_vlserver, rcu); + + afs_put_addrlist(rcu_access_pointer(vlserver->addresses), + afs_alist_trace_put_vlserver); + kfree_rcu(vlserver, rcu); +} + +void afs_put_vlserver(struct afs_net *net, struct afs_vlserver *vlserver) +{ + if (vlserver && + refcount_dec_and_test(&vlserver->ref)) + call_rcu(&vlserver->rcu, afs_vlserver_rcu); +} + +struct afs_vlserver_list *afs_alloc_vlserver_list(unsigned int nr_servers) +{ + struct afs_vlserver_list *vllist; + + vllist = kzalloc(struct_size(vllist, servers, nr_servers), GFP_KERNEL); + if (vllist) { + refcount_set(&vllist->ref, 1); + rwlock_init(&vllist->lock); + } + + return vllist; +} + +void afs_put_vlserverlist(struct afs_net *net, struct afs_vlserver_list *vllist) +{ + if (vllist) { + if (refcount_dec_and_test(&vllist->ref)) { + int i; + + for (i = 0; i < vllist->nr_servers; i++) { + afs_put_vlserver(net, vllist->servers[i].server); + } + kfree_rcu(vllist, rcu); + } + } +} + +static u16 afs_extract_le16(const u8 **_b) +{ + u16 val; + + val = (u16)*(*_b)++ << 0; + val |= (u16)*(*_b)++ << 8; + return val; +} + +/* + * Build a VL server address list from a DNS queried server list. + */ +static struct afs_addr_list *afs_extract_vl_addrs(struct afs_net *net, + const u8 **_b, const u8 *end, + u8 nr_addrs, u16 port) +{ + struct afs_addr_list *alist; + const u8 *b = *_b; + int ret = -EINVAL; + + alist = afs_alloc_addrlist(nr_addrs); + if (!alist) + return ERR_PTR(-ENOMEM); + if (nr_addrs == 0) + return alist; + + for (; nr_addrs > 0 && end - b >= nr_addrs; nr_addrs--) { + struct dns_server_list_v1_address hdr; + __be32 x[4]; + + hdr.address_type = *b++; + + switch (hdr.address_type) { + case DNS_ADDRESS_IS_IPV4: + if (end - b < 4) { + _leave(" = -EINVAL [short inet]"); + goto error; + } + memcpy(x, b, 4); + ret = afs_merge_fs_addr4(net, alist, x[0], port); + if (ret < 0) + goto error; + b += 4; + break; + + case DNS_ADDRESS_IS_IPV6: + if (end - b < 16) { + _leave(" = -EINVAL [short inet6]"); + goto error; + } + memcpy(x, b, 16); + ret = afs_merge_fs_addr6(net, alist, x, port); + if (ret < 0) + goto error; + b += 16; + break; + + default: + _leave(" = -EADDRNOTAVAIL [unknown af %u]", + hdr.address_type); + ret = -EADDRNOTAVAIL; + goto error; + } + } + + /* Start with IPv6 if available. */ + if (alist->nr_ipv4 < alist->nr_addrs) + alist->preferred = alist->nr_ipv4; + + *_b = b; + return alist; + +error: + *_b = b; + afs_put_addrlist(alist, afs_alist_trace_put_parse_error); + return ERR_PTR(ret); +} + +/* + * Build a VL server list from a DNS queried server list. + */ +struct afs_vlserver_list *afs_extract_vlserver_list(struct afs_cell *cell, + const void *buffer, + size_t buffer_size) +{ + const struct dns_server_list_v1_header *hdr = buffer; + struct dns_server_list_v1_server bs; + struct afs_vlserver_list *vllist, *previous; + struct afs_addr_list *addrs; + struct afs_vlserver *server; + const u8 *b = buffer, *end = buffer + buffer_size; + int ret = -ENOMEM, nr_servers, i, j; + + _enter(""); + + /* Check that it's a server list, v1 */ + if (end - b < sizeof(*hdr) || + hdr->hdr.content != DNS_PAYLOAD_IS_SERVER_LIST || + hdr->hdr.version != 1) { + pr_notice("kAFS: Got DNS record [%u,%u] len %zu\n", + hdr->hdr.content, hdr->hdr.version, end - b); + ret = -EDESTADDRREQ; + goto dump; + } + + nr_servers = hdr->nr_servers; + + vllist = afs_alloc_vlserver_list(nr_servers); + if (!vllist) + return ERR_PTR(-ENOMEM); + + vllist->source = (hdr->source < NR__dns_record_source) ? + hdr->source : NR__dns_record_source; + vllist->status = (hdr->status < NR__dns_lookup_status) ? + hdr->status : NR__dns_lookup_status; + + read_lock(&cell->vl_servers_lock); + previous = afs_get_vlserverlist( + rcu_dereference_protected(cell->vl_servers, + lockdep_is_held(&cell->vl_servers_lock))); + read_unlock(&cell->vl_servers_lock); + + b += sizeof(*hdr); + while (end - b >= sizeof(bs)) { + bs.name_len = afs_extract_le16(&b); + bs.priority = afs_extract_le16(&b); + bs.weight = afs_extract_le16(&b); + bs.port = afs_extract_le16(&b); + bs.source = *b++; + bs.status = *b++; + bs.protocol = *b++; + bs.nr_addrs = *b++; + + _debug("extract %u %u %u %u %u %u %*.*s", + bs.name_len, bs.priority, bs.weight, + bs.port, bs.protocol, bs.nr_addrs, + bs.name_len, bs.name_len, b); + + if (end - b < bs.name_len) + break; + + ret = -EPROTONOSUPPORT; + if (bs.protocol == DNS_SERVER_PROTOCOL_UNSPECIFIED) { + bs.protocol = DNS_SERVER_PROTOCOL_UDP; + } else if (bs.protocol != DNS_SERVER_PROTOCOL_UDP) { + _leave(" = [proto %u]", bs.protocol); + goto error; + } + + if (bs.port == 0) + bs.port = AFS_VL_PORT; + if (bs.source > NR__dns_record_source) + bs.source = NR__dns_record_source; + if (bs.status > NR__dns_lookup_status) + bs.status = NR__dns_lookup_status; + + /* See if we can update an old server record */ + server = NULL; + for (i = 0; i < previous->nr_servers; i++) { + struct afs_vlserver *p = previous->servers[i].server; + + if (p->name_len == bs.name_len && + p->port == bs.port && + strncasecmp(b, p->name, bs.name_len) == 0) { + server = afs_get_vlserver(p); + break; + } + } + + if (!server) { + ret = -ENOMEM; + server = afs_alloc_vlserver(b, bs.name_len, bs.port); + if (!server) + goto error; + } + + b += bs.name_len; + + /* Extract the addresses - note that we can't skip this as we + * have to advance the payload pointer. + */ + addrs = afs_extract_vl_addrs(cell->net, &b, end, bs.nr_addrs, bs.port); + if (IS_ERR(addrs)) { + ret = PTR_ERR(addrs); + goto error_2; + } + + if (vllist->nr_servers >= nr_servers) { + _debug("skip %u >= %u", vllist->nr_servers, nr_servers); + afs_put_addrlist(addrs, afs_alist_trace_put_parse_empty); + afs_put_vlserver(cell->net, server); + continue; + } + + addrs->source = bs.source; + addrs->status = bs.status; + + if (addrs->nr_addrs == 0) { + afs_put_addrlist(addrs, afs_alist_trace_put_parse_empty); + if (!rcu_access_pointer(server->addresses)) { + afs_put_vlserver(cell->net, server); + continue; + } + } else { + struct afs_addr_list *old = addrs; + + write_lock(&server->lock); + old = rcu_replace_pointer(server->addresses, old, + lockdep_is_held(&server->lock)); + write_unlock(&server->lock); + afs_put_addrlist(old, afs_alist_trace_put_vlserver_old); + } + + + /* TODO: Might want to check for duplicates */ + + /* Insertion-sort by priority and weight */ + for (j = 0; j < vllist->nr_servers; j++) { + if (bs.priority < vllist->servers[j].priority) + break; /* Lower preferable */ + if (bs.priority == vllist->servers[j].priority && + bs.weight > vllist->servers[j].weight) + break; /* Higher preferable */ + } + + if (j < vllist->nr_servers) { + memmove(vllist->servers + j + 1, + vllist->servers + j, + (vllist->nr_servers - j) * sizeof(struct afs_vlserver_entry)); + } + + clear_bit(AFS_VLSERVER_FL_PROBED, &server->flags); + + vllist->servers[j].priority = bs.priority; + vllist->servers[j].weight = bs.weight; + vllist->servers[j].server = server; + vllist->nr_servers++; + } + + if (b != end) { + _debug("parse error %zd", b - end); + goto error; + } + + afs_put_vlserverlist(cell->net, previous); + _leave(" = ok [%u]", vllist->nr_servers); + return vllist; + +error_2: + afs_put_vlserver(cell->net, server); +error: + afs_put_vlserverlist(cell->net, vllist); + afs_put_vlserverlist(cell->net, previous); +dump: + if (ret != -ENOMEM) { + printk(KERN_DEBUG "DNS: at %zu\n", (const void *)b - buffer); + print_hex_dump_bytes("DNS: ", DUMP_PREFIX_NONE, buffer, buffer_size); + } + return ERR_PTR(ret); +} diff --git a/fs/afs/vl_probe.c b/fs/afs/vl_probe.c new file mode 100644 index 000000000000..3d2e0c925460 --- /dev/null +++ b/fs/afs/vl_probe.c @@ -0,0 +1,310 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* AFS vlserver probing + * + * Copyright (C) 2018 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#include <linux/sched.h> +#include <linux/slab.h> +#include "afs_fs.h" +#include "internal.h" +#include "protocol_yfs.h" + + +/* + * Handle the completion of a set of probes. + */ +static void afs_finished_vl_probe(struct afs_vlserver *server) +{ + if (!(server->probe.flags & AFS_VLSERVER_PROBE_RESPONDED)) { + server->rtt = UINT_MAX; + clear_bit(AFS_VLSERVER_FL_RESPONDING, &server->flags); + } + + clear_bit_unlock(AFS_VLSERVER_FL_PROBING, &server->flags); + wake_up_bit(&server->flags, AFS_VLSERVER_FL_PROBING); +} + +/* + * Handle the completion of a probe RPC call. + */ +static void afs_done_one_vl_probe(struct afs_vlserver *server, bool wake_up) +{ + if (atomic_dec_and_test(&server->probe_outstanding)) { + afs_finished_vl_probe(server); + wake_up = true; + } + + if (wake_up) + wake_up_all(&server->probe_wq); +} + +/* + * Process the result of probing a vlserver. This is called after successful + * or failed delivery of an VL.GetCapabilities operation. + */ +void afs_vlserver_probe_result(struct afs_call *call) +{ + struct afs_addr_list *alist = call->vl_probe; + struct afs_vlserver *server = call->vlserver; + struct afs_address *addr = &alist->addrs[call->probe_index]; + unsigned int server_index = call->server_index; + unsigned int rtt_us = 0; + unsigned int index = call->probe_index; + bool have_result = false; + int ret = call->error; + + _enter("%s,%u,%u,%d,%d", server->name, server_index, index, ret, call->abort_code); + + spin_lock(&server->probe_lock); + + switch (ret) { + case 0: + server->probe.error = 0; + goto responded; + case -ECONNABORTED: + if (!(server->probe.flags & AFS_VLSERVER_PROBE_RESPONDED)) { + server->probe.abort_code = call->abort_code; + server->probe.error = ret; + } + goto responded; + case -ENOMEM: + case -ENONET: + case -EKEYEXPIRED: + case -EKEYREVOKED: + case -EKEYREJECTED: + server->probe.flags |= AFS_VLSERVER_PROBE_LOCAL_FAILURE; + if (server->probe.error == 0) + server->probe.error = ret; + trace_afs_io_error(call->debug_id, ret, afs_io_error_vl_probe_fail); + goto out; + case -ECONNRESET: /* Responded, but call expired. */ + case -ERFKILL: + case -EADDRNOTAVAIL: + case -ENETUNREACH: + case -EHOSTUNREACH: + case -EHOSTDOWN: + case -ECONNREFUSED: + case -ETIMEDOUT: + case -ETIME: + default: + clear_bit(index, &alist->responded); + set_bit(index, &alist->probe_failed); + if (!(server->probe.flags & AFS_VLSERVER_PROBE_RESPONDED) && + (server->probe.error == 0 || + server->probe.error == -ETIMEDOUT || + server->probe.error == -ETIME)) + server->probe.error = ret; + trace_afs_io_error(call->debug_id, ret, afs_io_error_vl_probe_fail); + goto out; + } + +responded: + set_bit(index, &alist->responded); + clear_bit(index, &alist->probe_failed); + + if (call->service_id == YFS_VL_SERVICE) { + server->probe.flags |= AFS_VLSERVER_PROBE_IS_YFS; + set_bit(AFS_VLSERVER_FL_IS_YFS, &server->flags); + server->service_id = call->service_id; + } else { + server->probe.flags |= AFS_VLSERVER_PROBE_NOT_YFS; + if (!(server->probe.flags & AFS_VLSERVER_PROBE_IS_YFS)) { + clear_bit(AFS_VLSERVER_FL_IS_YFS, &server->flags); + server->service_id = call->service_id; + } + } + + rtt_us = rxrpc_kernel_get_srtt(addr->peer); + if (rtt_us < server->probe.rtt) { + server->probe.rtt = rtt_us; + server->rtt = rtt_us; + alist->preferred = index; + } + + smp_wmb(); /* Set rtt before responded. */ + server->probe.flags |= AFS_VLSERVER_PROBE_RESPONDED; + set_bit(AFS_VLSERVER_FL_PROBED, &server->flags); + set_bit(AFS_VLSERVER_FL_RESPONDING, &server->flags); + have_result = true; +out: + spin_unlock(&server->probe_lock); + + trace_afs_vl_probe(server, false, alist, index, call->error, call->abort_code, rtt_us); + _debug("probe [%u][%u] %pISpc rtt=%d ret=%d", + server_index, index, rxrpc_kernel_remote_addr(addr->peer), + rtt_us, ret); + + afs_done_one_vl_probe(server, have_result); +} + +/* + * Probe all of a vlserver's addresses to find out the best route and to + * query its capabilities. + */ +static bool afs_do_probe_vlserver(struct afs_net *net, + struct afs_vlserver *server, + struct key *key, + unsigned int server_index, + struct afs_error *_e) +{ + struct afs_addr_list *alist; + struct afs_call *call; + unsigned long unprobed; + unsigned int index, i; + bool in_progress = false; + int best_prio; + + _enter("%s", server->name); + + read_lock(&server->lock); + alist = rcu_dereference_protected(server->addresses, + lockdep_is_held(&server->lock)); + afs_get_addrlist(alist, afs_alist_trace_get_vlprobe); + read_unlock(&server->lock); + + atomic_set(&server->probe_outstanding, alist->nr_addrs); + memset(&server->probe, 0, sizeof(server->probe)); + server->probe.rtt = UINT_MAX; + + unprobed = (1UL << alist->nr_addrs) - 1; + while (unprobed) { + best_prio = -1; + index = 0; + for (i = 0; i < alist->nr_addrs; i++) { + if (test_bit(i, &unprobed) && + alist->addrs[i].prio > best_prio) { + index = i; + best_prio = alist->addrs[i].prio; + } + } + __clear_bit(index, &unprobed); + + trace_afs_vl_probe(server, true, alist, index, 0, 0, 0); + call = afs_vl_get_capabilities(net, alist, index, key, server, + server_index); + if (!IS_ERR(call)) { + afs_prioritise_error(_e, call->error, call->abort_code); + afs_put_call(call); + in_progress = true; + } else { + afs_prioritise_error(_e, PTR_ERR(call), 0); + afs_done_one_vl_probe(server, false); + } + } + + afs_put_addrlist(alist, afs_alist_trace_put_vlprobe); + return in_progress; +} + +/* + * Send off probes to all unprobed servers. + */ +int afs_send_vl_probes(struct afs_net *net, struct key *key, + struct afs_vlserver_list *vllist) +{ + struct afs_vlserver *server; + struct afs_error e = {}; + bool in_progress = false; + int i; + + for (i = 0; i < vllist->nr_servers; i++) { + server = vllist->servers[i].server; + if (test_bit(AFS_VLSERVER_FL_PROBED, &server->flags)) + continue; + + if (!test_and_set_bit_lock(AFS_VLSERVER_FL_PROBING, &server->flags) && + afs_do_probe_vlserver(net, server, key, i, &e)) + in_progress = true; + } + + return in_progress ? 0 : e.error; +} + +/* + * Wait for the first as-yet untried server to respond. + */ +int afs_wait_for_vl_probes(struct afs_vlserver_list *vllist, + unsigned long untried) +{ + struct wait_queue_entry *waits; + struct afs_vlserver *server; + unsigned int rtt = UINT_MAX, rtt_s; + bool have_responders = false; + int pref = -1, i; + + _enter("%u,%lx", vllist->nr_servers, untried); + + /* Only wait for servers that have a probe outstanding. */ + for (i = 0; i < vllist->nr_servers; i++) { + if (test_bit(i, &untried)) { + server = vllist->servers[i].server; + if (!test_bit(AFS_VLSERVER_FL_PROBING, &server->flags)) + __clear_bit(i, &untried); + if (server->probe.flags & AFS_VLSERVER_PROBE_RESPONDED) + have_responders = true; + } + } + if (have_responders || !untried) + return 0; + + waits = kmalloc(array_size(vllist->nr_servers, sizeof(*waits)), GFP_KERNEL); + if (!waits) + return -ENOMEM; + + for (i = 0; i < vllist->nr_servers; i++) { + if (test_bit(i, &untried)) { + server = vllist->servers[i].server; + init_waitqueue_entry(&waits[i], current); + add_wait_queue(&server->probe_wq, &waits[i]); + } + } + + for (;;) { + bool still_probing = false; + + set_current_state(TASK_INTERRUPTIBLE); + for (i = 0; i < vllist->nr_servers; i++) { + if (test_bit(i, &untried)) { + server = vllist->servers[i].server; + if (server->probe.flags & AFS_VLSERVER_PROBE_RESPONDED) + goto stop; + if (test_bit(AFS_VLSERVER_FL_PROBING, &server->flags)) + still_probing = true; + } + } + + if (!still_probing || signal_pending(current)) + goto stop; + schedule(); + } + +stop: + set_current_state(TASK_RUNNING); + + for (i = 0; i < vllist->nr_servers; i++) { + if (test_bit(i, &untried)) { + server = vllist->servers[i].server; + rtt_s = READ_ONCE(server->rtt); + if (test_bit(AFS_VLSERVER_FL_RESPONDING, &server->flags) && + rtt_s < rtt) { + pref = i; + rtt = rtt_s; + } + + remove_wait_queue(&server->probe_wq, &waits[i]); + } + } + + kfree(waits); + + if (pref == -1 && signal_pending(current)) + return -ERESTARTSYS; + + if (pref >= 0) + vllist->preferred = pref; + + _leave(" = 0 [%u]", pref); + return 0; +} diff --git a/fs/afs/vl_rotate.c b/fs/afs/vl_rotate.c new file mode 100644 index 000000000000..6ad9688d8f4b --- /dev/null +++ b/fs/afs/vl_rotate.c @@ -0,0 +1,399 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* Handle vlserver selection and rotation. + * + * Copyright (C) 2018 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/sched/signal.h> +#include "internal.h" +#include "afs_vl.h" + +/* + * Begin an operation on a volume location server. + */ +bool afs_begin_vlserver_operation(struct afs_vl_cursor *vc, struct afs_cell *cell, + struct key *key) +{ + static atomic_t debug_ids; + + memset(vc, 0, sizeof(*vc)); + vc->cell = cell; + vc->key = key; + vc->cumul_error.error = -EDESTADDRREQ; + vc->nr_iterations = -1; + + if (signal_pending(current)) { + vc->cumul_error.error = -EINTR; + vc->flags |= AFS_VL_CURSOR_STOP; + return false; + } + + vc->debug_id = atomic_inc_return(&debug_ids); + return true; +} + +/* + * Begin iteration through a server list, starting with the last used server if + * possible, or the last recorded good server if not. + */ +static bool afs_start_vl_iteration(struct afs_vl_cursor *vc) +{ + struct afs_cell *cell = vc->cell; + unsigned int dns_lookup_count; + + if (cell->dns_source == DNS_RECORD_UNAVAILABLE || + cell->dns_expiry <= ktime_get_real_seconds()) { + dns_lookup_count = smp_load_acquire(&cell->dns_lookup_count); + set_bit(AFS_CELL_FL_DO_LOOKUP, &cell->flags); + afs_queue_cell(cell, afs_cell_trace_queue_dns); + + if (cell->dns_source == DNS_RECORD_UNAVAILABLE) { + if (wait_var_event_interruptible( + &cell->dns_lookup_count, + smp_load_acquire(&cell->dns_lookup_count) + != dns_lookup_count) < 0) { + vc->cumul_error.error = -ERESTARTSYS; + return false; + } + } + + /* Status load is ordered after lookup counter load */ + if (cell->dns_status == DNS_LOOKUP_GOT_NOT_FOUND) { + pr_warn("No record of cell %s\n", cell->name); + vc->cumul_error.error = -ENOENT; + return false; + } + + if (cell->dns_source == DNS_RECORD_UNAVAILABLE) { + vc->cumul_error.error = -EDESTADDRREQ; + return false; + } + } + + read_lock(&cell->vl_servers_lock); + vc->server_list = afs_get_vlserverlist( + rcu_dereference_protected(cell->vl_servers, + lockdep_is_held(&cell->vl_servers_lock))); + read_unlock(&cell->vl_servers_lock); + if (!vc->server_list->nr_servers) + return false; + + vc->untried_servers = (1UL << vc->server_list->nr_servers) - 1; + vc->server_index = -1; + return true; +} + +/* + * Select the vlserver to use. May be called multiple times to rotate + * through the vlservers. + */ +bool afs_select_vlserver(struct afs_vl_cursor *vc) +{ + struct afs_addr_list *alist = vc->alist; + struct afs_vlserver *vlserver; + unsigned long set, failed; + unsigned int rtt; + s32 abort_code = vc->call_abort_code; + int error = vc->call_error, i; + + vc->nr_iterations++; + + _enter("VC=%x+%x,%d{%lx},%d{%lx},%d,%d", + vc->debug_id, vc->nr_iterations, vc->server_index, vc->untried_servers, + vc->addr_index, vc->addr_tried, + error, abort_code); + + if (vc->flags & AFS_VL_CURSOR_STOP) { + _leave(" = f [stopped]"); + return false; + } + + if (vc->nr_iterations == 0) + goto start; + + WRITE_ONCE(alist->addrs[vc->addr_index].last_error, error); + + /* Evaluate the result of the previous operation, if there was one. */ + switch (error) { + default: + case 0: + /* Success or local failure. Stop. */ + vc->cumul_error.error = error; + vc->flags |= AFS_VL_CURSOR_STOP; + _leave(" = f [okay/local %d]", vc->cumul_error.error); + return false; + + case -ECONNABORTED: + /* The far side rejected the operation on some grounds. This + * might involve the server being busy or the volume having been moved. + */ + switch (abort_code) { + case AFSVL_IO: + case AFSVL_BADVOLOPER: + case AFSVL_NOMEM: + /* The server went weird. */ + afs_prioritise_error(&vc->cumul_error, -EREMOTEIO, abort_code); + //write_lock(&vc->cell->vl_servers_lock); + //vc->server_list->weird_mask |= 1 << vc->server_index; + //write_unlock(&vc->cell->vl_servers_lock); + goto next_server; + + default: + afs_prioritise_error(&vc->cumul_error, error, abort_code); + goto failed; + } + + case -ERFKILL: + case -EADDRNOTAVAIL: + case -ENETUNREACH: + case -EHOSTUNREACH: + case -EHOSTDOWN: + case -ECONNREFUSED: + case -ETIMEDOUT: + case -ETIME: + _debug("no conn %d", error); + afs_prioritise_error(&vc->cumul_error, error, 0); + goto iterate_address; + + case -ECONNRESET: + _debug("call reset"); + afs_prioritise_error(&vc->cumul_error, error, 0); + vc->flags |= AFS_VL_CURSOR_RETRY; + goto next_server; + + case -EOPNOTSUPP: + _debug("notsupp"); + goto next_server; + } + +restart_from_beginning: + _debug("restart"); + if (vc->call_responded && + vc->addr_index != vc->alist->preferred && + test_bit(alist->preferred, &vc->addr_tried)) + WRITE_ONCE(alist->preferred, vc->addr_index); + afs_put_addrlist(alist, afs_alist_trace_put_vlrotate_restart); + alist = vc->alist = NULL; + + afs_put_vlserverlist(vc->cell->net, vc->server_list); + vc->server_list = NULL; + if (vc->flags & AFS_VL_CURSOR_RETRIED) + goto failed; + vc->flags |= AFS_VL_CURSOR_RETRIED; +start: + _debug("start"); + ASSERTCMP(alist, ==, NULL); + + if (!afs_start_vl_iteration(vc)) + goto failed; + + error = afs_send_vl_probes(vc->cell->net, vc->key, vc->server_list); + if (error < 0) { + afs_prioritise_error(&vc->cumul_error, error, 0); + goto failed; + } + +pick_server: + _debug("pick [%lx]", vc->untried_servers); + ASSERTCMP(alist, ==, NULL); + + error = afs_wait_for_vl_probes(vc->server_list, vc->untried_servers); + if (error < 0) { + afs_prioritise_error(&vc->cumul_error, error, 0); + goto failed; + } + + /* Pick the untried server with the lowest RTT. */ + vc->server_index = vc->server_list->preferred; + if (test_bit(vc->server_index, &vc->untried_servers)) + goto selected_server; + + vc->server_index = -1; + rtt = UINT_MAX; + for (i = 0; i < vc->server_list->nr_servers; i++) { + struct afs_vlserver *s = vc->server_list->servers[i].server; + + if (!test_bit(i, &vc->untried_servers) || + !test_bit(AFS_VLSERVER_FL_RESPONDING, &s->flags)) + continue; + if (s->probe.rtt <= rtt) { + vc->server_index = i; + rtt = s->probe.rtt; + } + } + + if (vc->server_index == -1) + goto no_more_servers; + +selected_server: + _debug("use %d", vc->server_index); + __clear_bit(vc->server_index, &vc->untried_servers); + + /* We're starting on a different vlserver from the list. We need to + * check it, find its address list and probe its capabilities before we + * use it. + */ + vlserver = vc->server_list->servers[vc->server_index].server; + vc->server = vlserver; + + _debug("USING VLSERVER: %s", vlserver->name); + + read_lock(&vlserver->lock); + alist = rcu_dereference_protected(vlserver->addresses, + lockdep_is_held(&vlserver->lock)); + vc->alist = afs_get_addrlist(alist, afs_alist_trace_get_vlrotate_set); + read_unlock(&vlserver->lock); + + vc->addr_tried = 0; + vc->addr_index = -1; + +iterate_address: + /* Iterate over the current server's address list to try and find an + * address on which it will respond to us. + */ + set = READ_ONCE(alist->responded); + failed = READ_ONCE(alist->probe_failed); + vc->addr_index = READ_ONCE(alist->preferred); + + _debug("%lx-%lx-%lx,%d", set, failed, vc->addr_tried, vc->addr_index); + + set &= ~(failed | vc->addr_tried); + + if (!set) + goto next_server; + + if (!test_bit(vc->addr_index, &set)) + vc->addr_index = __ffs(set); + + set_bit(vc->addr_index, &vc->addr_tried); + vc->alist = alist; + + _debug("VL address %d/%d", vc->addr_index, alist->nr_addrs); + + vc->call_responded = false; + _leave(" = t %pISpc", rxrpc_kernel_remote_addr(alist->addrs[vc->addr_index].peer)); + return true; + +next_server: + _debug("next"); + ASSERT(alist); + if (vc->call_responded && + vc->addr_index != alist->preferred && + test_bit(alist->preferred, &vc->addr_tried)) + WRITE_ONCE(alist->preferred, vc->addr_index); + afs_put_addrlist(alist, afs_alist_trace_put_vlrotate_next); + alist = vc->alist = NULL; + goto pick_server; + +no_more_servers: + /* That's all the servers poked to no good effect. Try again if some + * of them were busy. + */ + if (vc->flags & AFS_VL_CURSOR_RETRY) + goto restart_from_beginning; + + for (i = 0; i < vc->server_list->nr_servers; i++) { + struct afs_vlserver *s = vc->server_list->servers[i].server; + + if (test_bit(AFS_VLSERVER_FL_RESPONDING, &s->flags)) + vc->cumul_error.responded = true; + afs_prioritise_error(&vc->cumul_error, READ_ONCE(s->probe.error), + s->probe.abort_code); + } + +failed: + if (alist) { + if (vc->call_responded && + vc->addr_index != alist->preferred && + test_bit(alist->preferred, &vc->addr_tried)) + WRITE_ONCE(alist->preferred, vc->addr_index); + afs_put_addrlist(alist, afs_alist_trace_put_vlrotate_fail); + alist = vc->alist = NULL; + } + vc->flags |= AFS_VL_CURSOR_STOP; + _leave(" = f [failed %d]", vc->cumul_error.error); + return false; +} + +/* + * Dump cursor state in the case of the error being EDESTADDRREQ. + */ +static void afs_vl_dump_edestaddrreq(const struct afs_vl_cursor *vc) +{ + struct afs_cell *cell = vc->cell; + static int count; + int i; + + if (!IS_ENABLED(CONFIG_AFS_DEBUG_CURSOR) || count > 3) + return; + count++; + + rcu_read_lock(); + pr_notice("EDESTADDR occurred\n"); + pr_notice("CELL: %s err=%d\n", cell->name, cell->error); + pr_notice("DNS: src=%u st=%u lc=%x\n", + cell->dns_source, cell->dns_status, cell->dns_lookup_count); + pr_notice("VC: ut=%lx ix=%u ni=%hu fl=%hx err=%hd\n", + vc->untried_servers, vc->server_index, vc->nr_iterations, + vc->flags, vc->cumul_error.error); + pr_notice("VC: call er=%d ac=%d r=%u\n", + vc->call_error, vc->call_abort_code, vc->call_responded); + + if (vc->server_list) { + const struct afs_vlserver_list *sl = vc->server_list; + pr_notice("VC: SL nr=%u ix=%u\n", + sl->nr_servers, sl->index); + for (i = 0; i < sl->nr_servers; i++) { + const struct afs_vlserver *s = sl->servers[i].server; + pr_notice("VC: server %s+%hu fl=%lx E=%hd\n", + s->name, s->port, s->flags, s->probe.error); + if (s->addresses) { + const struct afs_addr_list *a = + rcu_dereference(s->addresses); + pr_notice("VC: - nr=%u/%u/%u pf=%u\n", + a->nr_ipv4, a->nr_addrs, a->max_addrs, + a->preferred); + pr_notice("VC: - R=%lx F=%lx\n", + a->responded, a->probe_failed); + if (a == vc->alist) + pr_notice("VC: - current\n"); + } + } + } + + pr_notice("AC: t=%lx ax=%u\n", vc->addr_tried, vc->addr_index); + rcu_read_unlock(); +} + +/* + * Tidy up a volume location server cursor and unlock the vnode. + */ +int afs_end_vlserver_operation(struct afs_vl_cursor *vc) +{ + struct afs_net *net = vc->cell->net; + + _enter("VC=%x+%x", vc->debug_id, vc->nr_iterations); + + switch (vc->cumul_error.error) { + case -EDESTADDRREQ: + case -EADDRNOTAVAIL: + case -ENETUNREACH: + case -EHOSTUNREACH: + afs_vl_dump_edestaddrreq(vc); + break; + } + + if (vc->alist) { + if (vc->call_responded && + vc->addr_index != vc->alist->preferred && + test_bit(vc->alist->preferred, &vc->addr_tried)) + WRITE_ONCE(vc->alist->preferred, vc->addr_index); + afs_put_addrlist(vc->alist, afs_alist_trace_put_vlrotate_end); + vc->alist = NULL; + } + afs_put_vlserverlist(net, vc->server_list); + return vc->cumul_error.error; +} diff --git a/fs/afs/vlclient.c b/fs/afs/vlclient.c index 340afd0cd182..3a23c0b08eb6 100644 --- a/fs/afs/vlclient.c +++ b/fs/afs/vlclient.c @@ -1,219 +1,793 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* AFS Volume Location Service client * * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. */ #include <linux/gfp.h> #include <linux/init.h> #include <linux/sched.h> +#include "afs_fs.h" #include "internal.h" /* - * map volume locator abort codes to error codes + * Deliver reply data to a VL.GetEntryByNameU call. */ -static int afs_vl_abort_to_error(u32 abort_code) +static int afs_deliver_vl_get_entry_by_name_u(struct afs_call *call) { - _enter("%u", abort_code); - - switch (abort_code) { - case AFSVL_IDEXIST: return -EEXIST; - case AFSVL_IO: return -EREMOTEIO; - case AFSVL_NAMEEXIST: return -EEXIST; - case AFSVL_CREATEFAIL: return -EREMOTEIO; - case AFSVL_NOENT: return -ENOMEDIUM; - case AFSVL_EMPTY: return -ENOMEDIUM; - case AFSVL_ENTDELETED: return -ENOMEDIUM; - case AFSVL_BADNAME: return -EINVAL; - case AFSVL_BADINDEX: return -EINVAL; - case AFSVL_BADVOLTYPE: return -EINVAL; - case AFSVL_BADSERVER: return -EINVAL; - case AFSVL_BADPARTITION: return -EINVAL; - case AFSVL_REPSFULL: return -EFBIG; - case AFSVL_NOREPSERVER: return -ENOENT; - case AFSVL_DUPREPSERVER: return -EEXIST; - case AFSVL_RWNOTFOUND: return -ENOENT; - case AFSVL_BADREFCOUNT: return -EINVAL; - case AFSVL_SIZEEXCEEDED: return -EINVAL; - case AFSVL_BADENTRY: return -EINVAL; - case AFSVL_BADVOLIDBUMP: return -EINVAL; - case AFSVL_IDALREADYHASHED: return -EINVAL; - case AFSVL_ENTRYLOCKED: return -EBUSY; - case AFSVL_BADVOLOPER: return -EBADRQC; - case AFSVL_BADRELLOCKTYPE: return -EINVAL; - case AFSVL_RERELEASE: return -EREMOTEIO; - case AFSVL_BADSERVERFLAG: return -EINVAL; - case AFSVL_PERM: return -EACCES; - case AFSVL_NOMEM: return -EREMOTEIO; - default: - return afs_abort_to_error(abort_code); + struct afs_uvldbentry__xdr *uvldb; + struct afs_vldb_entry *entry; + u32 nr_servers, vlflags; + int i, ret; + + _enter(""); + + ret = afs_transfer_reply(call); + if (ret < 0) + return ret; + + /* unmarshall the reply once we've received all of it */ + uvldb = call->buffer; + entry = call->ret_vldb; + + nr_servers = ntohl(uvldb->nServers); + if (nr_servers > AFS_NMAXNSERVERS) + nr_servers = AFS_NMAXNSERVERS; + + for (i = 0; i < ARRAY_SIZE(uvldb->name) - 1; i++) + entry->name[i] = (u8)ntohl(uvldb->name[i]); + entry->name[i] = 0; + entry->name_len = strlen(entry->name); + + vlflags = ntohl(uvldb->flags); + for (i = 0; i < nr_servers; i++) { + struct afs_uuid__xdr *xdr; + struct afs_uuid *uuid; + u32 tmp = ntohl(uvldb->serverFlags[i]); + int j; + int n = entry->nr_servers; + + if (tmp & AFS_VLSF_RWVOL) { + entry->fs_mask[n] |= AFS_VOL_VTM_RW; + if (vlflags & AFS_VLF_BACKEXISTS) + entry->fs_mask[n] |= AFS_VOL_VTM_BAK; + } + if (tmp & AFS_VLSF_ROVOL) + entry->fs_mask[n] |= AFS_VOL_VTM_RO; + if (!entry->fs_mask[n]) + continue; + + xdr = &uvldb->serverNumber[i]; + uuid = (struct afs_uuid *)&entry->fs_server[n]; + uuid->time_low = xdr->time_low; + uuid->time_mid = htons(ntohl(xdr->time_mid)); + uuid->time_hi_and_version = htons(ntohl(xdr->time_hi_and_version)); + uuid->clock_seq_hi_and_reserved = (u8)ntohl(xdr->clock_seq_hi_and_reserved); + uuid->clock_seq_low = (u8)ntohl(xdr->clock_seq_low); + for (j = 0; j < 6; j++) + uuid->node[j] = (u8)ntohl(xdr->node[j]); + + entry->vlsf_flags[n] = tmp; + entry->addr_version[n] = ntohl(uvldb->serverUnique[i]); + entry->nr_servers++; } + + for (i = 0; i < AFS_MAXTYPES; i++) + entry->vid[i] = ntohl(uvldb->volumeId[i]); + + if (vlflags & AFS_VLF_RWEXISTS) + __set_bit(AFS_VLDB_HAS_RW, &entry->flags); + if (vlflags & AFS_VLF_ROEXISTS) + __set_bit(AFS_VLDB_HAS_RO, &entry->flags); + if (vlflags & AFS_VLF_BACKEXISTS) + __set_bit(AFS_VLDB_HAS_BAK, &entry->flags); + + if (!(vlflags & (AFS_VLF_RWEXISTS | AFS_VLF_ROEXISTS | AFS_VLF_BACKEXISTS))) { + entry->error = -ENOMEDIUM; + __set_bit(AFS_VLDB_QUERY_ERROR, &entry->flags); + } + + __set_bit(AFS_VLDB_QUERY_VALID, &entry->flags); + _leave(" = 0 [done]"); + return 0; } /* - * deliver reply data to a VL.GetEntryByXXX call + * VL.GetEntryByNameU operation type. */ -static int afs_deliver_vl_get_entry_by_xxx(struct afs_call *call, - struct sk_buff *skb, bool last) +static const struct afs_call_type afs_RXVLGetEntryByNameU = { + .name = "VL.GetEntryByNameU", + .op = afs_VL_GetEntryByNameU, + .deliver = afs_deliver_vl_get_entry_by_name_u, + .destructor = afs_flat_call_destructor, +}; + +/* + * Dispatch a get volume entry by name or ID operation (uuid variant). If the + * volname is a decimal number then it's a volume ID not a volume name. + */ +struct afs_vldb_entry *afs_vl_get_entry_by_name_u(struct afs_vl_cursor *vc, + const char *volname, + int volnamesz) { - struct afs_cache_vlocation *entry; + struct afs_vldb_entry *entry; + struct afs_call *call; + struct afs_net *net = vc->cell->net; + size_t reqsz, padsz; __be32 *bp; - u32 tmp; - int loop; - _enter(",,%u", last); + _enter(""); - afs_transfer_reply(call, skb); - if (!last) - return 0; + padsz = (4 - (volnamesz & 3)) & 3; + reqsz = 8 + volnamesz + padsz; - if (call->reply_size != call->reply_max) - return -EBADMSG; + entry = kzalloc(sizeof(struct afs_vldb_entry), GFP_KERNEL); + if (!entry) + return ERR_PTR(-ENOMEM); - /* unmarshall the reply once we've received all of it */ - entry = call->reply; - bp = call->buffer; + call = afs_alloc_flat_call(net, &afs_RXVLGetEntryByNameU, reqsz, + sizeof(struct afs_uvldbentry__xdr)); + if (!call) { + kfree(entry); + return ERR_PTR(-ENOMEM); + } - for (loop = 0; loop < 64; loop++) - entry->name[loop] = ntohl(*bp++); - entry->name[loop] = 0; - bp++; /* final NUL */ + call->key = vc->key; + call->ret_vldb = entry; + call->max_lifespan = AFS_VL_MAX_LIFESPAN; + call->peer = rxrpc_kernel_get_peer(vc->alist->addrs[vc->addr_index].peer); + call->service_id = vc->server->service_id; - bp++; /* type */ - entry->nservers = ntohl(*bp++); + /* Marshall the parameters */ + bp = call->request; + *bp++ = htonl(VLGETENTRYBYNAMEU); + *bp++ = htonl(volnamesz); + memcpy(bp, volname, volnamesz); + if (padsz > 0) + memset((void *)bp + volnamesz, 0, padsz); + + trace_afs_make_vl_call(call); + afs_make_call(call, GFP_KERNEL); + afs_wait_for_call_to_complete(call); + vc->call_abort_code = call->abort_code; + vc->call_error = call->error; + vc->call_responded = call->responded; + afs_put_call(call); + if (vc->call_error) { + kfree(entry); + return ERR_PTR(vc->call_error); + } + return entry; +} - for (loop = 0; loop < 8; loop++) - entry->servers[loop].s_addr = *bp++; +/* + * Deliver reply data to a VL.GetAddrsU call. + * + * GetAddrsU(IN ListAddrByAttributes *inaddr, + * OUT afsUUID *uuidp1, + * OUT uint32_t *uniquifier, + * OUT uint32_t *nentries, + * OUT bulkaddrs *blkaddrs); + */ +static int afs_deliver_vl_get_addrs_u(struct afs_call *call) +{ + struct afs_addr_list *alist; + __be32 *bp; + u32 uniquifier, nentries, count; + int i, ret; + + _enter("{%u,%zu/%u}", + call->unmarshall, iov_iter_count(call->iter), call->count); + + switch (call->unmarshall) { + case 0: + afs_extract_to_buf(call, + sizeof(struct afs_uuid__xdr) + 3 * sizeof(__be32)); + call->unmarshall++; + + /* Extract the returned uuid, uniquifier, nentries and + * blkaddrs size */ + fallthrough; + case 1: + ret = afs_extract_data(call, true); + if (ret < 0) + return ret; + + bp = call->buffer + sizeof(struct afs_uuid__xdr); + uniquifier = ntohl(*bp++); + nentries = ntohl(*bp++); + count = ntohl(*bp); + + nentries = min(nentries, count); + alist = afs_alloc_addrlist(nentries); + if (!alist) + return -ENOMEM; + alist->version = uniquifier; + call->ret_alist = alist; + call->count = count; + call->count2 = nentries; + call->unmarshall++; + + more_entries: + count = min(call->count, 4U); + afs_extract_to_buf(call, count * sizeof(__be32)); + + fallthrough; /* and extract entries */ + case 2: + ret = afs_extract_data(call, call->count > 4); + if (ret < 0) + return ret; + + alist = call->ret_alist; + bp = call->buffer; + count = min(call->count, 4U); + for (i = 0; i < count; i++) { + if (alist->nr_addrs < call->count2) { + ret = afs_merge_fs_addr4(call->net, alist, *bp++, AFS_FS_PORT); + if (ret < 0) + return ret; + } + } + + call->count -= count; + if (call->count > 0) + goto more_entries; + call->unmarshall++; + break; + } - bp += 8; /* partition IDs */ + _leave(" = 0 [done]"); + return 0; +} - for (loop = 0; loop < 8; loop++) { - tmp = ntohl(*bp++); - entry->srvtmask[loop] = 0; - if (tmp & AFS_VLSF_RWVOL) - entry->srvtmask[loop] |= AFS_VOL_VTM_RW; - if (tmp & AFS_VLSF_ROVOL) - entry->srvtmask[loop] |= AFS_VOL_VTM_RO; - if (tmp & AFS_VLSF_BACKVOL) - entry->srvtmask[loop] |= AFS_VOL_VTM_BAK; +/* + * VL.GetAddrsU operation type. + */ +static const struct afs_call_type afs_RXVLGetAddrsU = { + .name = "VL.GetAddrsU", + .op = afs_VL_GetAddrsU, + .deliver = afs_deliver_vl_get_addrs_u, + .destructor = afs_flat_call_destructor, +}; + +/* + * Dispatch an operation to get the addresses for a server, where the server is + * nominated by UUID. + */ +struct afs_addr_list *afs_vl_get_addrs_u(struct afs_vl_cursor *vc, + const uuid_t *uuid) +{ + struct afs_ListAddrByAttributes__xdr *r; + struct afs_addr_list *alist; + const struct afs_uuid *u = (const struct afs_uuid *)uuid; + struct afs_call *call; + struct afs_net *net = vc->cell->net; + __be32 *bp; + int i; + + _enter(""); + + call = afs_alloc_flat_call(net, &afs_RXVLGetAddrsU, + sizeof(__be32) + sizeof(struct afs_ListAddrByAttributes__xdr), + sizeof(struct afs_uuid__xdr) + 3 * sizeof(__be32)); + if (!call) + return ERR_PTR(-ENOMEM); + + call->key = vc->key; + call->ret_alist = NULL; + call->max_lifespan = AFS_VL_MAX_LIFESPAN; + call->peer = rxrpc_kernel_get_peer(vc->alist->addrs[vc->addr_index].peer); + call->service_id = vc->server->service_id; + + /* Marshall the parameters */ + bp = call->request; + *bp++ = htonl(VLGETADDRSU); + r = (struct afs_ListAddrByAttributes__xdr *)bp; + r->Mask = htonl(AFS_VLADDR_UUID); + r->ipaddr = 0; + r->index = 0; + r->spare = 0; + r->uuid.time_low = u->time_low; + r->uuid.time_mid = htonl(ntohs(u->time_mid)); + r->uuid.time_hi_and_version = htonl(ntohs(u->time_hi_and_version)); + r->uuid.clock_seq_hi_and_reserved = htonl(u->clock_seq_hi_and_reserved); + r->uuid.clock_seq_low = htonl(u->clock_seq_low); + for (i = 0; i < 6; i++) + r->uuid.node[i] = htonl(u->node[i]); + + trace_afs_make_vl_call(call); + afs_make_call(call, GFP_KERNEL); + afs_wait_for_call_to_complete(call); + vc->call_abort_code = call->abort_code; + vc->call_error = call->error; + vc->call_responded = call->responded; + alist = call->ret_alist; + afs_put_call(call); + if (vc->call_error) { + afs_put_addrlist(alist, afs_alist_trace_put_getaddru); + return ERR_PTR(vc->call_error); } + return alist; +} + +/* + * Deliver reply data to an VL.GetCapabilities operation. + */ +static int afs_deliver_vl_get_capabilities(struct afs_call *call) +{ + u32 count; + int ret; + + _enter("{%u,%zu/%u}", + call->unmarshall, iov_iter_count(call->iter), call->count); + + switch (call->unmarshall) { + case 0: + afs_extract_to_tmp(call); + call->unmarshall++; - entry->vid[0] = ntohl(*bp++); - entry->vid[1] = ntohl(*bp++); - entry->vid[2] = ntohl(*bp++); + fallthrough; /* and extract the capabilities word count */ + case 1: + ret = afs_extract_data(call, true); + if (ret < 0) + return ret; - bp++; /* clone ID */ + count = ntohl(call->tmp); + call->count = count; + call->count2 = count; - tmp = ntohl(*bp++); /* flags */ - entry->vidmask = 0; - if (tmp & AFS_VLF_RWEXISTS) - entry->vidmask |= AFS_VOL_VTM_RW; - if (tmp & AFS_VLF_ROEXISTS) - entry->vidmask |= AFS_VOL_VTM_RO; - if (tmp & AFS_VLF_BACKEXISTS) - entry->vidmask |= AFS_VOL_VTM_BAK; - if (!entry->vidmask) - return -EBADMSG; + call->unmarshall++; + afs_extract_discard(call, count * sizeof(__be32)); + + fallthrough; /* and extract capabilities words */ + case 2: + ret = afs_extract_data(call, false); + if (ret < 0) + return ret; + + /* TODO: Examine capabilities */ + + call->unmarshall++; + break; + } _leave(" = 0 [done]"); return 0; } +static void afs_destroy_vl_get_capabilities(struct afs_call *call) +{ + afs_put_addrlist(call->vl_probe, afs_alist_trace_put_vlgetcaps); + afs_put_vlserver(call->net, call->vlserver); + afs_flat_call_destructor(call); +} + /* - * VL.GetEntryByName operation type + * VL.GetCapabilities operation type */ -static const struct afs_call_type afs_RXVLGetEntryByName = { - .name = "VL.GetEntryByName", - .deliver = afs_deliver_vl_get_entry_by_xxx, - .abort_to_error = afs_vl_abort_to_error, - .destructor = afs_flat_call_destructor, +static const struct afs_call_type afs_RXVLGetCapabilities = { + .name = "VL.GetCapabilities", + .op = afs_VL_GetCapabilities, + .deliver = afs_deliver_vl_get_capabilities, + .immediate_cancel = afs_vlserver_probe_result, + .done = afs_vlserver_probe_result, + .destructor = afs_destroy_vl_get_capabilities, }; /* - * VL.GetEntryById operation type + * Probe a volume server for the capabilities that it supports. This can + * return up to 196 words. + * + * We use this to probe for service upgrade to determine what the server at the + * other end supports. + */ +struct afs_call *afs_vl_get_capabilities(struct afs_net *net, + struct afs_addr_list *alist, + unsigned int addr_index, + struct key *key, + struct afs_vlserver *server, + unsigned int server_index) +{ + struct afs_call *call; + __be32 *bp; + + _enter(""); + + call = afs_alloc_flat_call(net, &afs_RXVLGetCapabilities, 1 * 4, 16 * 4); + if (!call) + return ERR_PTR(-ENOMEM); + + call->key = key; + call->vlserver = afs_get_vlserver(server); + call->server_index = server_index; + call->peer = rxrpc_kernel_get_peer(alist->addrs[addr_index].peer); + call->vl_probe = afs_get_addrlist(alist, afs_alist_trace_get_vlgetcaps); + call->probe_index = addr_index; + call->service_id = server->service_id; + call->upgrade = true; + call->async = true; + call->max_lifespan = AFS_PROBE_MAX_LIFESPAN; + + /* marshall the parameters */ + bp = call->request; + *bp++ = htonl(VLGETCAPABILITIES); + + /* Can't take a ref on server */ + trace_afs_make_vl_call(call); + afs_make_call(call, GFP_KERNEL); + return call; +} + +/* + * Deliver reply data to a YFSVL.GetEndpoints call. + * + * GetEndpoints(IN yfsServerAttributes *attr, + * OUT opr_uuid *uuid, + * OUT afs_int32 *uniquifier, + * OUT endpoints *fsEndpoints, + * OUT endpoints *volEndpoints) + */ +static int afs_deliver_yfsvl_get_endpoints(struct afs_call *call) +{ + struct afs_addr_list *alist; + __be32 *bp; + u32 uniquifier, size; + int ret; + + _enter("{%u,%zu,%u}", + call->unmarshall, iov_iter_count(call->iter), call->count2); + + switch (call->unmarshall) { + case 0: + afs_extract_to_buf(call, sizeof(uuid_t) + 3 * sizeof(__be32)); + call->unmarshall = 1; + + /* Extract the returned uuid, uniquifier, fsEndpoints count and + * either the first fsEndpoint type or the volEndpoints + * count if there are no fsEndpoints. */ + fallthrough; + case 1: + ret = afs_extract_data(call, true); + if (ret < 0) + return ret; + + bp = call->buffer + sizeof(uuid_t); + uniquifier = ntohl(*bp++); + call->count = ntohl(*bp++); + call->count2 = ntohl(*bp); /* Type or next count */ + + if (call->count > YFS_MAXENDPOINTS) + return afs_protocol_error(call, afs_eproto_yvl_fsendpt_num); + + alist = afs_alloc_addrlist(call->count); + if (!alist) + return -ENOMEM; + alist->version = uniquifier; + call->ret_alist = alist; + + if (call->count == 0) + goto extract_volendpoints; + + next_fsendpoint: + switch (call->count2) { + case YFS_ENDPOINT_IPV4: + size = sizeof(__be32) * (1 + 1 + 1); + break; + case YFS_ENDPOINT_IPV6: + size = sizeof(__be32) * (1 + 4 + 1); + break; + default: + return afs_protocol_error(call, afs_eproto_yvl_fsendpt_type); + } + + size += sizeof(__be32); + afs_extract_to_buf(call, size); + call->unmarshall = 2; + + fallthrough; /* and extract fsEndpoints[] entries */ + case 2: + ret = afs_extract_data(call, true); + if (ret < 0) + return ret; + + alist = call->ret_alist; + bp = call->buffer; + switch (call->count2) { + case YFS_ENDPOINT_IPV4: + if (ntohl(bp[0]) != sizeof(__be32) * 2) + return afs_protocol_error( + call, afs_eproto_yvl_fsendpt4_len); + ret = afs_merge_fs_addr4(call->net, alist, bp[1], ntohl(bp[2])); + if (ret < 0) + return ret; + bp += 3; + break; + case YFS_ENDPOINT_IPV6: + if (ntohl(bp[0]) != sizeof(__be32) * 5) + return afs_protocol_error( + call, afs_eproto_yvl_fsendpt6_len); + ret = afs_merge_fs_addr6(call->net, alist, bp + 1, ntohl(bp[5])); + if (ret < 0) + return ret; + bp += 6; + break; + default: + return afs_protocol_error(call, afs_eproto_yvl_fsendpt_type); + } + + /* Got either the type of the next entry or the count of + * volEndpoints if no more fsEndpoints. + */ + call->count2 = ntohl(*bp++); + + call->count--; + if (call->count > 0) + goto next_fsendpoint; + + extract_volendpoints: + /* Extract the list of volEndpoints. */ + call->count = call->count2; + if (!call->count) + goto end; + if (call->count > YFS_MAXENDPOINTS) + return afs_protocol_error(call, afs_eproto_yvl_vlendpt_type); + + afs_extract_to_buf(call, 1 * sizeof(__be32)); + call->unmarshall = 3; + + /* Extract the type of volEndpoints[0]. Normally we would + * extract the type of the next endpoint when we extract the + * data of the current one, but this is the first... + */ + fallthrough; + case 3: + ret = afs_extract_data(call, true); + if (ret < 0) + return ret; + + bp = call->buffer; + + next_volendpoint: + call->count2 = ntohl(*bp++); + switch (call->count2) { + case YFS_ENDPOINT_IPV4: + size = sizeof(__be32) * (1 + 1 + 1); + break; + case YFS_ENDPOINT_IPV6: + size = sizeof(__be32) * (1 + 4 + 1); + break; + default: + return afs_protocol_error(call, afs_eproto_yvl_vlendpt_type); + } + + if (call->count > 1) + size += sizeof(__be32); /* Get next type too */ + afs_extract_to_buf(call, size); + call->unmarshall = 4; + + fallthrough; /* and extract volEndpoints[] entries */ + case 4: + ret = afs_extract_data(call, true); + if (ret < 0) + return ret; + + bp = call->buffer; + switch (call->count2) { + case YFS_ENDPOINT_IPV4: + if (ntohl(bp[0]) != sizeof(__be32) * 2) + return afs_protocol_error( + call, afs_eproto_yvl_vlendpt4_len); + bp += 3; + break; + case YFS_ENDPOINT_IPV6: + if (ntohl(bp[0]) != sizeof(__be32) * 5) + return afs_protocol_error( + call, afs_eproto_yvl_vlendpt6_len); + bp += 6; + break; + default: + return afs_protocol_error(call, afs_eproto_yvl_vlendpt_type); + } + + /* Got either the type of the next entry or the count of + * volEndpoints if no more fsEndpoints. + */ + call->count--; + if (call->count > 0) + goto next_volendpoint; + + end: + afs_extract_discard(call, 0); + call->unmarshall = 5; + + fallthrough; /* Done */ + case 5: + ret = afs_extract_data(call, false); + if (ret < 0) + return ret; + call->unmarshall = 6; + fallthrough; + + case 6: + break; + } + + _leave(" = 0 [done]"); + return 0; +} + +/* + * YFSVL.GetEndpoints operation type. */ -static const struct afs_call_type afs_RXVLGetEntryById = { - .name = "VL.GetEntryById", - .deliver = afs_deliver_vl_get_entry_by_xxx, - .abort_to_error = afs_vl_abort_to_error, +static const struct afs_call_type afs_YFSVLGetEndpoints = { + .name = "YFSVL.GetEndpoints", + .op = afs_YFSVL_GetEndpoints, + .deliver = afs_deliver_yfsvl_get_endpoints, .destructor = afs_flat_call_destructor, }; /* - * dispatch a get volume entry by name operation + * Dispatch an operation to get the addresses for a server, where the server is + * nominated by UUID. */ -int afs_vl_get_entry_by_name(struct in_addr *addr, - struct key *key, - const char *volname, - struct afs_cache_vlocation *entry, - const struct afs_wait_mode *wait_mode) +struct afs_addr_list *afs_yfsvl_get_endpoints(struct afs_vl_cursor *vc, + const uuid_t *uuid) { + struct afs_addr_list *alist; struct afs_call *call; - size_t volnamesz, reqsz, padsz; + struct afs_net *net = vc->cell->net; __be32 *bp; _enter(""); - volnamesz = strlen(volname); - padsz = (4 - (volnamesz & 3)) & 3; - reqsz = 8 + volnamesz + padsz; - - call = afs_alloc_flat_call(&afs_RXVLGetEntryByName, reqsz, 384); + call = afs_alloc_flat_call(net, &afs_YFSVLGetEndpoints, + sizeof(__be32) * 2 + sizeof(*uuid), + sizeof(struct in6_addr) + sizeof(__be32) * 3); if (!call) - return -ENOMEM; + return ERR_PTR(-ENOMEM); - call->key = key; - call->reply = entry; - call->service_id = VL_SERVICE; - call->port = htons(AFS_VL_PORT); + call->key = vc->key; + call->ret_alist = NULL; + call->max_lifespan = AFS_VL_MAX_LIFESPAN; + call->peer = rxrpc_kernel_get_peer(vc->alist->addrs[vc->addr_index].peer); + call->service_id = vc->server->service_id; - /* marshall the parameters */ + /* Marshall the parameters */ bp = call->request; - *bp++ = htonl(VLGETENTRYBYNAME); - *bp++ = htonl(volnamesz); - memcpy(bp, volname, volnamesz); - if (padsz > 0) - memset((void *) bp + volnamesz, 0, padsz); + *bp++ = htonl(YVLGETENDPOINTS); + *bp++ = htonl(YFS_SERVER_UUID); + memcpy(bp, uuid, sizeof(*uuid)); /* Type opr_uuid */ + + trace_afs_make_vl_call(call); + afs_make_call(call, GFP_KERNEL); + afs_wait_for_call_to_complete(call); + vc->call_abort_code = call->abort_code; + vc->call_error = call->error; + vc->call_responded = call->responded; + alist = call->ret_alist; + afs_put_call(call); + if (vc->call_error) { + afs_put_addrlist(alist, afs_alist_trace_put_getaddru); + return ERR_PTR(vc->call_error); + } + return alist; +} + +/* + * Deliver reply data to a YFSVL.GetCellName operation. + */ +static int afs_deliver_yfsvl_get_cell_name(struct afs_call *call) +{ + char *cell_name; + u32 namesz, paddedsz; + int ret; + + _enter("{%u,%zu/%u}", + call->unmarshall, iov_iter_count(call->iter), call->count); + + switch (call->unmarshall) { + case 0: + afs_extract_to_tmp(call); + call->unmarshall++; + + fallthrough; /* and extract the cell name length */ + case 1: + ret = afs_extract_data(call, true); + if (ret < 0) + return ret; + + namesz = ntohl(call->tmp); + if (namesz > YFS_VL_MAXCELLNAME) + return afs_protocol_error(call, afs_eproto_cellname_len); + paddedsz = (namesz + 3) & ~3; + call->count = namesz; + call->count2 = paddedsz - namesz; + + cell_name = kmalloc(namesz + 1, GFP_KERNEL); + if (!cell_name) + return -ENOMEM; + cell_name[namesz] = 0; + call->ret_str = cell_name; + + afs_extract_begin(call, cell_name, namesz); + call->unmarshall++; + + fallthrough; /* and extract cell name */ + case 2: + ret = afs_extract_data(call, true); + if (ret < 0) + return ret; + + afs_extract_discard(call, call->count2); + call->unmarshall++; + + fallthrough; /* and extract padding */ + case 3: + ret = afs_extract_data(call, false); + if (ret < 0) + return ret; + + call->unmarshall++; + break; + } - /* initiate the call */ - return afs_make_call(addr, call, GFP_KERNEL, wait_mode); + _leave(" = 0 [done]"); + return 0; } /* - * dispatch a get volume entry by ID operation - */ -int afs_vl_get_entry_by_id(struct in_addr *addr, - struct key *key, - afs_volid_t volid, - afs_voltype_t voltype, - struct afs_cache_vlocation *entry, - const struct afs_wait_mode *wait_mode) + * VL.GetCapabilities operation type + */ +static const struct afs_call_type afs_YFSVLGetCellName = { + .name = "YFSVL.GetCellName", + .op = afs_YFSVL_GetCellName, + .deliver = afs_deliver_yfsvl_get_cell_name, + .destructor = afs_flat_call_destructor, +}; + +/* + * Probe a volume server for the capabilities that it supports. This can + * return up to 196 words. + * + * We use this to probe for service upgrade to determine what the server at the + * other end supports. + */ +char *afs_yfsvl_get_cell_name(struct afs_vl_cursor *vc) { struct afs_call *call; + struct afs_net *net = vc->cell->net; __be32 *bp; + char *cellname; _enter(""); - call = afs_alloc_flat_call(&afs_RXVLGetEntryById, 12, 384); + call = afs_alloc_flat_call(net, &afs_YFSVLGetCellName, 1 * 4, 0); if (!call) - return -ENOMEM; + return ERR_PTR(-ENOMEM); - call->key = key; - call->reply = entry; - call->service_id = VL_SERVICE; - call->port = htons(AFS_VL_PORT); + call->key = vc->key; + call->ret_str = NULL; + call->max_lifespan = AFS_VL_MAX_LIFESPAN; + call->peer = rxrpc_kernel_get_peer(vc->alist->addrs[vc->addr_index].peer); + call->service_id = vc->server->service_id; /* marshall the parameters */ bp = call->request; - *bp++ = htonl(VLGETENTRYBYID); - *bp++ = htonl(volid); - *bp = htonl(voltype); - - /* initiate the call */ - return afs_make_call(addr, call, GFP_KERNEL, wait_mode); + *bp++ = htonl(YVLGETCELLNAME); + + /* Can't take a ref on server */ + trace_afs_make_vl_call(call); + afs_make_call(call, GFP_KERNEL); + afs_wait_for_call_to_complete(call); + vc->call_abort_code = call->abort_code; + vc->call_error = call->error; + vc->call_responded = call->responded; + cellname = call->ret_str; + afs_put_call(call); + if (vc->call_error) { + kfree(cellname); + return ERR_PTR(vc->call_error); + } + return cellname; } diff --git a/fs/afs/vlocation.c b/fs/afs/vlocation.c deleted file mode 100644 index 57bcb1596530..000000000000 --- a/fs/afs/vlocation.c +++ /dev/null @@ -1,718 +0,0 @@ -/* AFS volume location management - * - * Copyright (C) 2002, 2007 Red Hat, Inc. All Rights Reserved. - * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/slab.h> -#include <linux/init.h> -#include <linux/sched.h> -#include "internal.h" - -static unsigned afs_vlocation_timeout = 10; /* volume location timeout in seconds */ -static unsigned afs_vlocation_update_timeout = 10 * 60; - -static void afs_vlocation_reaper(struct work_struct *); -static void afs_vlocation_updater(struct work_struct *); - -static LIST_HEAD(afs_vlocation_updates); -static LIST_HEAD(afs_vlocation_graveyard); -static DEFINE_SPINLOCK(afs_vlocation_updates_lock); -static DEFINE_SPINLOCK(afs_vlocation_graveyard_lock); -static DECLARE_DELAYED_WORK(afs_vlocation_reap, afs_vlocation_reaper); -static DECLARE_DELAYED_WORK(afs_vlocation_update, afs_vlocation_updater); -static struct workqueue_struct *afs_vlocation_update_worker; - -/* - * iterate through the VL servers in a cell until one of them admits knowing - * about the volume in question - */ -static int afs_vlocation_access_vl_by_name(struct afs_vlocation *vl, - struct key *key, - struct afs_cache_vlocation *vldb) -{ - struct afs_cell *cell = vl->cell; - struct in_addr addr; - int count, ret; - - _enter("%s,%s", cell->name, vl->vldb.name); - - down_write(&vl->cell->vl_sem); - ret = -ENOMEDIUM; - for (count = cell->vl_naddrs; count > 0; count--) { - addr = cell->vl_addrs[cell->vl_curr_svix]; - - _debug("CellServ[%hu]: %08x", cell->vl_curr_svix, addr.s_addr); - - /* attempt to access the VL server */ - ret = afs_vl_get_entry_by_name(&addr, key, vl->vldb.name, vldb, - &afs_sync_call); - switch (ret) { - case 0: - goto out; - case -ENOMEM: - case -ENONET: - case -ENETUNREACH: - case -EHOSTUNREACH: - case -ECONNREFUSED: - if (ret == -ENOMEM || ret == -ENONET) - goto out; - goto rotate; - case -ENOMEDIUM: - case -EKEYREJECTED: - case -EKEYEXPIRED: - goto out; - default: - ret = -EIO; - goto rotate; - } - - /* rotate the server records upon lookup failure */ - rotate: - cell->vl_curr_svix++; - cell->vl_curr_svix %= cell->vl_naddrs; - } - -out: - up_write(&vl->cell->vl_sem); - _leave(" = %d", ret); - return ret; -} - -/* - * iterate through the VL servers in a cell until one of them admits knowing - * about the volume in question - */ -static int afs_vlocation_access_vl_by_id(struct afs_vlocation *vl, - struct key *key, - afs_volid_t volid, - afs_voltype_t voltype, - struct afs_cache_vlocation *vldb) -{ - struct afs_cell *cell = vl->cell; - struct in_addr addr; - int count, ret; - - _enter("%s,%x,%d,", cell->name, volid, voltype); - - down_write(&vl->cell->vl_sem); - ret = -ENOMEDIUM; - for (count = cell->vl_naddrs; count > 0; count--) { - addr = cell->vl_addrs[cell->vl_curr_svix]; - - _debug("CellServ[%hu]: %08x", cell->vl_curr_svix, addr.s_addr); - - /* attempt to access the VL server */ - ret = afs_vl_get_entry_by_id(&addr, key, volid, voltype, vldb, - &afs_sync_call); - switch (ret) { - case 0: - goto out; - case -ENOMEM: - case -ENONET: - case -ENETUNREACH: - case -EHOSTUNREACH: - case -ECONNREFUSED: - if (ret == -ENOMEM || ret == -ENONET) - goto out; - goto rotate; - case -EBUSY: - vl->upd_busy_cnt++; - if (vl->upd_busy_cnt <= 3) { - if (vl->upd_busy_cnt > 1) { - /* second+ BUSY - sleep a little bit */ - set_current_state(TASK_UNINTERRUPTIBLE); - schedule_timeout(1); - __set_current_state(TASK_RUNNING); - } - continue; - } - break; - case -ENOMEDIUM: - vl->upd_rej_cnt++; - goto rotate; - default: - ret = -EIO; - goto rotate; - } - - /* rotate the server records upon lookup failure */ - rotate: - cell->vl_curr_svix++; - cell->vl_curr_svix %= cell->vl_naddrs; - vl->upd_busy_cnt = 0; - } - -out: - if (ret < 0 && vl->upd_rej_cnt > 0) { - printk(KERN_NOTICE "kAFS:" - " Active volume no longer valid '%s'\n", - vl->vldb.name); - vl->valid = 0; - ret = -ENOMEDIUM; - } - - up_write(&vl->cell->vl_sem); - _leave(" = %d", ret); - return ret; -} - -/* - * allocate a volume location record - */ -static struct afs_vlocation *afs_vlocation_alloc(struct afs_cell *cell, - const char *name, - size_t namesz) -{ - struct afs_vlocation *vl; - - vl = kzalloc(sizeof(struct afs_vlocation), GFP_KERNEL); - if (vl) { - vl->cell = cell; - vl->state = AFS_VL_NEW; - atomic_set(&vl->usage, 1); - INIT_LIST_HEAD(&vl->link); - INIT_LIST_HEAD(&vl->grave); - INIT_LIST_HEAD(&vl->update); - init_waitqueue_head(&vl->waitq); - spin_lock_init(&vl->lock); - memcpy(vl->vldb.name, name, namesz); - } - - _leave(" = %p", vl); - return vl; -} - -/* - * update record if we found it in the cache - */ -static int afs_vlocation_update_record(struct afs_vlocation *vl, - struct key *key, - struct afs_cache_vlocation *vldb) -{ - afs_voltype_t voltype; - afs_volid_t vid; - int ret; - - /* try to look up a cached volume in the cell VL databases by ID */ - _debug("Locally Cached: %s %02x { %08x(%x) %08x(%x) %08x(%x) }", - vl->vldb.name, - vl->vldb.vidmask, - ntohl(vl->vldb.servers[0].s_addr), - vl->vldb.srvtmask[0], - ntohl(vl->vldb.servers[1].s_addr), - vl->vldb.srvtmask[1], - ntohl(vl->vldb.servers[2].s_addr), - vl->vldb.srvtmask[2]); - - _debug("Vids: %08x %08x %08x", - vl->vldb.vid[0], - vl->vldb.vid[1], - vl->vldb.vid[2]); - - if (vl->vldb.vidmask & AFS_VOL_VTM_RW) { - vid = vl->vldb.vid[0]; - voltype = AFSVL_RWVOL; - } else if (vl->vldb.vidmask & AFS_VOL_VTM_RO) { - vid = vl->vldb.vid[1]; - voltype = AFSVL_ROVOL; - } else if (vl->vldb.vidmask & AFS_VOL_VTM_BAK) { - vid = vl->vldb.vid[2]; - voltype = AFSVL_BACKVOL; - } else { - BUG(); - vid = 0; - voltype = 0; - } - - /* contact the server to make sure the volume is still available - * - TODO: need to handle disconnected operation here - */ - ret = afs_vlocation_access_vl_by_id(vl, key, vid, voltype, vldb); - switch (ret) { - /* net error */ - default: - printk(KERN_WARNING "kAFS:" - " failed to update volume '%s' (%x) up in '%s': %d\n", - vl->vldb.name, vid, vl->cell->name, ret); - _leave(" = %d", ret); - return ret; - - /* pulled from local cache into memory */ - case 0: - _leave(" = 0"); - return 0; - - /* uh oh... looks like the volume got deleted */ - case -ENOMEDIUM: - printk(KERN_ERR "kAFS:" - " volume '%s' (%x) does not exist '%s'\n", - vl->vldb.name, vid, vl->cell->name); - - /* TODO: make existing record unavailable */ - _leave(" = %d", ret); - return ret; - } -} - -/* - * apply the update to a VL record - */ -static void afs_vlocation_apply_update(struct afs_vlocation *vl, - struct afs_cache_vlocation *vldb) -{ - _debug("Done VL Lookup: %s %02x { %08x(%x) %08x(%x) %08x(%x) }", - vldb->name, vldb->vidmask, - ntohl(vldb->servers[0].s_addr), vldb->srvtmask[0], - ntohl(vldb->servers[1].s_addr), vldb->srvtmask[1], - ntohl(vldb->servers[2].s_addr), vldb->srvtmask[2]); - - _debug("Vids: %08x %08x %08x", - vldb->vid[0], vldb->vid[1], vldb->vid[2]); - - if (strcmp(vldb->name, vl->vldb.name) != 0) - printk(KERN_NOTICE "kAFS:" - " name of volume '%s' changed to '%s' on server\n", - vl->vldb.name, vldb->name); - - vl->vldb = *vldb; - -#ifdef CONFIG_AFS_FSCACHE - fscache_update_cookie(vl->cache); -#endif -} - -/* - * fill in a volume location record, consulting the cache and the VL server - * both - */ -static int afs_vlocation_fill_in_record(struct afs_vlocation *vl, - struct key *key) -{ - struct afs_cache_vlocation vldb; - int ret; - - _enter(""); - - ASSERTCMP(vl->valid, ==, 0); - - memset(&vldb, 0, sizeof(vldb)); - - /* see if we have an in-cache copy (will set vl->valid if there is) */ -#ifdef CONFIG_AFS_FSCACHE - vl->cache = fscache_acquire_cookie(vl->cell->cache, - &afs_vlocation_cache_index_def, vl); -#endif - - if (vl->valid) { - /* try to update a known volume in the cell VL databases by - * ID as the name may have changed */ - _debug("found in cache"); - ret = afs_vlocation_update_record(vl, key, &vldb); - } else { - /* try to look up an unknown volume in the cell VL databases by - * name */ - ret = afs_vlocation_access_vl_by_name(vl, key, &vldb); - if (ret < 0) { - printk("kAFS: failed to locate '%s' in cell '%s'\n", - vl->vldb.name, vl->cell->name); - return ret; - } - } - - afs_vlocation_apply_update(vl, &vldb); - _leave(" = 0"); - return 0; -} - -/* - * queue a vlocation record for updates - */ -static void afs_vlocation_queue_for_updates(struct afs_vlocation *vl) -{ - struct afs_vlocation *xvl; - - /* wait at least 10 minutes before updating... */ - vl->update_at = get_seconds() + afs_vlocation_update_timeout; - - spin_lock(&afs_vlocation_updates_lock); - - if (!list_empty(&afs_vlocation_updates)) { - /* ... but wait at least 1 second more than the newest record - * already queued so that we don't spam the VL server suddenly - * with lots of requests - */ - xvl = list_entry(afs_vlocation_updates.prev, - struct afs_vlocation, update); - if (vl->update_at <= xvl->update_at) - vl->update_at = xvl->update_at + 1; - } else { - queue_delayed_work(afs_vlocation_update_worker, - &afs_vlocation_update, - afs_vlocation_update_timeout * HZ); - } - - list_add_tail(&vl->update, &afs_vlocation_updates); - spin_unlock(&afs_vlocation_updates_lock); -} - -/* - * lookup volume location - * - iterate through the VL servers in a cell until one of them admits knowing - * about the volume in question - * - lookup in the local cache if not able to find on the VL server - * - insert/update in the local cache if did get a VL response - */ -struct afs_vlocation *afs_vlocation_lookup(struct afs_cell *cell, - struct key *key, - const char *name, - size_t namesz) -{ - struct afs_vlocation *vl; - int ret; - - _enter("{%s},{%x},%*.*s,%zu", - cell->name, key_serial(key), - (int) namesz, (int) namesz, name, namesz); - - if (namesz >= sizeof(vl->vldb.name)) { - _leave(" = -ENAMETOOLONG"); - return ERR_PTR(-ENAMETOOLONG); - } - - /* see if we have an in-memory copy first */ - down_write(&cell->vl_sem); - spin_lock(&cell->vl_lock); - list_for_each_entry(vl, &cell->vl_list, link) { - if (vl->vldb.name[namesz] != '\0') - continue; - if (memcmp(vl->vldb.name, name, namesz) == 0) - goto found_in_memory; - } - spin_unlock(&cell->vl_lock); - - /* not in the cell's in-memory lists - create a new record */ - vl = afs_vlocation_alloc(cell, name, namesz); - if (!vl) { - up_write(&cell->vl_sem); - return ERR_PTR(-ENOMEM); - } - - afs_get_cell(cell); - - list_add_tail(&vl->link, &cell->vl_list); - vl->state = AFS_VL_CREATING; - up_write(&cell->vl_sem); - -fill_in_record: - ret = afs_vlocation_fill_in_record(vl, key); - if (ret < 0) - goto error_abandon; - spin_lock(&vl->lock); - vl->state = AFS_VL_VALID; - spin_unlock(&vl->lock); - wake_up(&vl->waitq); - - /* update volume entry in local cache */ -#ifdef CONFIG_AFS_FSCACHE - fscache_update_cookie(vl->cache); -#endif - - /* schedule for regular updates */ - afs_vlocation_queue_for_updates(vl); - goto success; - -found_in_memory: - /* found in memory */ - _debug("found in memory"); - atomic_inc(&vl->usage); - spin_unlock(&cell->vl_lock); - if (!list_empty(&vl->grave)) { - spin_lock(&afs_vlocation_graveyard_lock); - list_del_init(&vl->grave); - spin_unlock(&afs_vlocation_graveyard_lock); - } - up_write(&cell->vl_sem); - - /* see if it was an abandoned record that we might try filling in */ - spin_lock(&vl->lock); - while (vl->state != AFS_VL_VALID) { - afs_vlocation_state_t state = vl->state; - - _debug("invalid [state %d]", state); - - if (state == AFS_VL_NEW || state == AFS_VL_NO_VOLUME) { - vl->state = AFS_VL_CREATING; - spin_unlock(&vl->lock); - goto fill_in_record; - } - - /* must now wait for creation or update by someone else to - * complete */ - _debug("wait"); - - spin_unlock(&vl->lock); - ret = wait_event_interruptible(vl->waitq, - vl->state == AFS_VL_NEW || - vl->state == AFS_VL_VALID || - vl->state == AFS_VL_NO_VOLUME); - if (ret < 0) - goto error; - spin_lock(&vl->lock); - } - spin_unlock(&vl->lock); - -success: - _leave(" = %p", vl); - return vl; - -error_abandon: - spin_lock(&vl->lock); - vl->state = AFS_VL_NEW; - spin_unlock(&vl->lock); - wake_up(&vl->waitq); -error: - ASSERT(vl != NULL); - afs_put_vlocation(vl); - _leave(" = %d", ret); - return ERR_PTR(ret); -} - -/* - * finish using a volume location record - */ -void afs_put_vlocation(struct afs_vlocation *vl) -{ - if (!vl) - return; - - _enter("%s", vl->vldb.name); - - ASSERTCMP(atomic_read(&vl->usage), >, 0); - - if (likely(!atomic_dec_and_test(&vl->usage))) { - _leave(""); - return; - } - - spin_lock(&afs_vlocation_graveyard_lock); - if (atomic_read(&vl->usage) == 0) { - _debug("buried"); - list_move_tail(&vl->grave, &afs_vlocation_graveyard); - vl->time_of_death = get_seconds(); - queue_delayed_work(afs_wq, &afs_vlocation_reap, - afs_vlocation_timeout * HZ); - - /* suspend updates on this record */ - if (!list_empty(&vl->update)) { - spin_lock(&afs_vlocation_updates_lock); - list_del_init(&vl->update); - spin_unlock(&afs_vlocation_updates_lock); - } - } - spin_unlock(&afs_vlocation_graveyard_lock); - _leave(" [killed?]"); -} - -/* - * destroy a dead volume location record - */ -static void afs_vlocation_destroy(struct afs_vlocation *vl) -{ - _enter("%p", vl); - -#ifdef CONFIG_AFS_FSCACHE - fscache_relinquish_cookie(vl->cache, 0); -#endif - afs_put_cell(vl->cell); - kfree(vl); -} - -/* - * reap dead volume location records - */ -static void afs_vlocation_reaper(struct work_struct *work) -{ - LIST_HEAD(corpses); - struct afs_vlocation *vl; - unsigned long delay, expiry; - time_t now; - - _enter(""); - - now = get_seconds(); - spin_lock(&afs_vlocation_graveyard_lock); - - while (!list_empty(&afs_vlocation_graveyard)) { - vl = list_entry(afs_vlocation_graveyard.next, - struct afs_vlocation, grave); - - _debug("check %p", vl); - - /* the queue is ordered most dead first */ - expiry = vl->time_of_death + afs_vlocation_timeout; - if (expiry > now) { - delay = (expiry - now) * HZ; - _debug("delay %lu", delay); - mod_delayed_work(afs_wq, &afs_vlocation_reap, delay); - break; - } - - spin_lock(&vl->cell->vl_lock); - if (atomic_read(&vl->usage) > 0) { - _debug("no reap"); - list_del_init(&vl->grave); - } else { - _debug("reap"); - list_move_tail(&vl->grave, &corpses); - list_del_init(&vl->link); - } - spin_unlock(&vl->cell->vl_lock); - } - - spin_unlock(&afs_vlocation_graveyard_lock); - - /* now reap the corpses we've extracted */ - while (!list_empty(&corpses)) { - vl = list_entry(corpses.next, struct afs_vlocation, grave); - list_del(&vl->grave); - afs_vlocation_destroy(vl); - } - - _leave(""); -} - -/* - * initialise the VL update process - */ -int __init afs_vlocation_update_init(void) -{ - afs_vlocation_update_worker = - create_singlethread_workqueue("kafs_vlupdated"); - return afs_vlocation_update_worker ? 0 : -ENOMEM; -} - -/* - * discard all the volume location records for rmmod - */ -void afs_vlocation_purge(void) -{ - afs_vlocation_timeout = 0; - - spin_lock(&afs_vlocation_updates_lock); - list_del_init(&afs_vlocation_updates); - spin_unlock(&afs_vlocation_updates_lock); - mod_delayed_work(afs_vlocation_update_worker, &afs_vlocation_update, 0); - destroy_workqueue(afs_vlocation_update_worker); - - mod_delayed_work(afs_wq, &afs_vlocation_reap, 0); -} - -/* - * update a volume location - */ -static void afs_vlocation_updater(struct work_struct *work) -{ - struct afs_cache_vlocation vldb; - struct afs_vlocation *vl, *xvl; - time_t now; - long timeout; - int ret; - - _enter(""); - - now = get_seconds(); - - /* find a record to update */ - spin_lock(&afs_vlocation_updates_lock); - for (;;) { - if (list_empty(&afs_vlocation_updates)) { - spin_unlock(&afs_vlocation_updates_lock); - _leave(" [nothing]"); - return; - } - - vl = list_entry(afs_vlocation_updates.next, - struct afs_vlocation, update); - if (atomic_read(&vl->usage) > 0) - break; - list_del_init(&vl->update); - } - - timeout = vl->update_at - now; - if (timeout > 0) { - queue_delayed_work(afs_vlocation_update_worker, - &afs_vlocation_update, timeout * HZ); - spin_unlock(&afs_vlocation_updates_lock); - _leave(" [nothing]"); - return; - } - - list_del_init(&vl->update); - atomic_inc(&vl->usage); - spin_unlock(&afs_vlocation_updates_lock); - - /* we can now perform the update */ - _debug("update %s", vl->vldb.name); - vl->state = AFS_VL_UPDATING; - vl->upd_rej_cnt = 0; - vl->upd_busy_cnt = 0; - - ret = afs_vlocation_update_record(vl, NULL, &vldb); - spin_lock(&vl->lock); - switch (ret) { - case 0: - afs_vlocation_apply_update(vl, &vldb); - vl->state = AFS_VL_VALID; - break; - case -ENOMEDIUM: - vl->state = AFS_VL_VOLUME_DELETED; - break; - default: - vl->state = AFS_VL_UNCERTAIN; - break; - } - spin_unlock(&vl->lock); - wake_up(&vl->waitq); - - /* and then reschedule */ - _debug("reschedule"); - vl->update_at = get_seconds() + afs_vlocation_update_timeout; - - spin_lock(&afs_vlocation_updates_lock); - - if (!list_empty(&afs_vlocation_updates)) { - /* next update in 10 minutes, but wait at least 1 second more - * than the newest record already queued so that we don't spam - * the VL server suddenly with lots of requests - */ - xvl = list_entry(afs_vlocation_updates.prev, - struct afs_vlocation, update); - if (vl->update_at <= xvl->update_at) - vl->update_at = xvl->update_at + 1; - xvl = list_entry(afs_vlocation_updates.next, - struct afs_vlocation, update); - timeout = xvl->update_at - now; - if (timeout < 0) - timeout = 0; - } else { - timeout = afs_vlocation_update_timeout; - } - - ASSERT(list_empty(&vl->update)); - - list_add_tail(&vl->update, &afs_vlocation_updates); - - _debug("timeout %ld", timeout); - queue_delayed_work(afs_vlocation_update_worker, - &afs_vlocation_update, timeout * HZ); - spin_unlock(&afs_vlocation_updates_lock); - afs_put_vlocation(vl); -} diff --git a/fs/afs/vnode.c b/fs/afs/vnode.c deleted file mode 100644 index 25cf4c3f4ff7..000000000000 --- a/fs/afs/vnode.c +++ /dev/null @@ -1,1025 +0,0 @@ -/* AFS vnode management - * - * Copyright (C) 2002, 2007 Red Hat, Inc. All Rights Reserved. - * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/init.h> -#include <linux/fs.h> -#include <linux/sched.h> -#include "internal.h" - -#if 0 -static noinline bool dump_tree_aux(struct rb_node *node, struct rb_node *parent, - int depth, char lr) -{ - struct afs_vnode *vnode; - bool bad = false; - - if (!node) - return false; - - if (node->rb_left) - bad = dump_tree_aux(node->rb_left, node, depth + 2, '/'); - - vnode = rb_entry(node, struct afs_vnode, cb_promise); - _debug("%c %*.*s%c%p {%d}", - rb_is_red(node) ? 'R' : 'B', - depth, depth, "", lr, - vnode, vnode->cb_expires_at); - if (rb_parent(node) != parent) { - printk("BAD: %p != %p\n", rb_parent(node), parent); - bad = true; - } - - if (node->rb_right) - bad |= dump_tree_aux(node->rb_right, node, depth + 2, '\\'); - - return bad; -} - -static noinline void dump_tree(const char *name, struct afs_server *server) -{ - _enter("%s", name); - if (dump_tree_aux(server->cb_promises.rb_node, NULL, 0, '-')) - BUG(); -} -#endif - -/* - * insert a vnode into the backing server's vnode tree - */ -static void afs_install_vnode(struct afs_vnode *vnode, - struct afs_server *server) -{ - struct afs_server *old_server = vnode->server; - struct afs_vnode *xvnode; - struct rb_node *parent, **p; - - _enter("%p,%p", vnode, server); - - if (old_server) { - spin_lock(&old_server->fs_lock); - rb_erase(&vnode->server_rb, &old_server->fs_vnodes); - spin_unlock(&old_server->fs_lock); - } - - afs_get_server(server); - vnode->server = server; - afs_put_server(old_server); - - /* insert into the server's vnode tree in FID order */ - spin_lock(&server->fs_lock); - - parent = NULL; - p = &server->fs_vnodes.rb_node; - while (*p) { - parent = *p; - xvnode = rb_entry(parent, struct afs_vnode, server_rb); - if (vnode->fid.vid < xvnode->fid.vid) - p = &(*p)->rb_left; - else if (vnode->fid.vid > xvnode->fid.vid) - p = &(*p)->rb_right; - else if (vnode->fid.vnode < xvnode->fid.vnode) - p = &(*p)->rb_left; - else if (vnode->fid.vnode > xvnode->fid.vnode) - p = &(*p)->rb_right; - else if (vnode->fid.unique < xvnode->fid.unique) - p = &(*p)->rb_left; - else if (vnode->fid.unique > xvnode->fid.unique) - p = &(*p)->rb_right; - else - BUG(); /* can't happen unless afs_iget() malfunctions */ - } - - rb_link_node(&vnode->server_rb, parent, p); - rb_insert_color(&vnode->server_rb, &server->fs_vnodes); - - spin_unlock(&server->fs_lock); - _leave(""); -} - -/* - * insert a vnode into the promising server's update/expiration tree - * - caller must hold vnode->lock - */ -static void afs_vnode_note_promise(struct afs_vnode *vnode, - struct afs_server *server) -{ - struct afs_server *old_server; - struct afs_vnode *xvnode; - struct rb_node *parent, **p; - - _enter("%p,%p", vnode, server); - - ASSERT(server != NULL); - - old_server = vnode->server; - if (vnode->cb_promised) { - if (server == old_server && - vnode->cb_expires == vnode->cb_expires_at) { - _leave(" [no change]"); - return; - } - - spin_lock(&old_server->cb_lock); - if (vnode->cb_promised) { - _debug("delete"); - rb_erase(&vnode->cb_promise, &old_server->cb_promises); - vnode->cb_promised = false; - } - spin_unlock(&old_server->cb_lock); - } - - if (vnode->server != server) - afs_install_vnode(vnode, server); - - vnode->cb_expires_at = vnode->cb_expires; - _debug("PROMISE on %p {%lu}", - vnode, (unsigned long) vnode->cb_expires_at); - - /* abuse an RB-tree to hold the expiration order (we may have multiple - * items with the same expiration time) */ - spin_lock(&server->cb_lock); - - parent = NULL; - p = &server->cb_promises.rb_node; - while (*p) { - parent = *p; - xvnode = rb_entry(parent, struct afs_vnode, cb_promise); - if (vnode->cb_expires_at < xvnode->cb_expires_at) - p = &(*p)->rb_left; - else - p = &(*p)->rb_right; - } - - rb_link_node(&vnode->cb_promise, parent, p); - rb_insert_color(&vnode->cb_promise, &server->cb_promises); - vnode->cb_promised = true; - - spin_unlock(&server->cb_lock); - _leave(""); -} - -/* - * handle remote file deletion by discarding the callback promise - */ -static void afs_vnode_deleted_remotely(struct afs_vnode *vnode) -{ - struct afs_server *server; - - _enter("{%p}", vnode->server); - - set_bit(AFS_VNODE_DELETED, &vnode->flags); - - server = vnode->server; - if (server) { - if (vnode->cb_promised) { - spin_lock(&server->cb_lock); - if (vnode->cb_promised) { - rb_erase(&vnode->cb_promise, - &server->cb_promises); - vnode->cb_promised = false; - } - spin_unlock(&server->cb_lock); - } - - spin_lock(&server->fs_lock); - rb_erase(&vnode->server_rb, &server->fs_vnodes); - spin_unlock(&server->fs_lock); - - vnode->server = NULL; - afs_put_server(server); - } else { - ASSERT(!vnode->cb_promised); - } - - _leave(""); -} - -/* - * finish off updating the recorded status of a file after a successful - * operation completion - * - starts callback expiry timer - * - adds to server's callback list - */ -void afs_vnode_finalise_status_update(struct afs_vnode *vnode, - struct afs_server *server) -{ - struct afs_server *oldserver = NULL; - - _enter("%p,%p", vnode, server); - - spin_lock(&vnode->lock); - clear_bit(AFS_VNODE_CB_BROKEN, &vnode->flags); - afs_vnode_note_promise(vnode, server); - vnode->update_cnt--; - ASSERTCMP(vnode->update_cnt, >=, 0); - spin_unlock(&vnode->lock); - - wake_up_all(&vnode->update_waitq); - afs_put_server(oldserver); - _leave(""); -} - -/* - * finish off updating the recorded status of a file after an operation failed - */ -static void afs_vnode_status_update_failed(struct afs_vnode *vnode, int ret) -{ - _enter("{%x:%u},%d", vnode->fid.vid, vnode->fid.vnode, ret); - - spin_lock(&vnode->lock); - - clear_bit(AFS_VNODE_CB_BROKEN, &vnode->flags); - - if (ret == -ENOENT) { - /* the file was deleted on the server */ - _debug("got NOENT from server - marking file deleted"); - afs_vnode_deleted_remotely(vnode); - } - - vnode->update_cnt--; - ASSERTCMP(vnode->update_cnt, >=, 0); - spin_unlock(&vnode->lock); - - wake_up_all(&vnode->update_waitq); - _leave(""); -} - -/* - * fetch file status from the volume - * - don't issue a fetch if: - * - the changed bit is not set and there's a valid callback - * - there are any outstanding ops that will fetch the status - * - TODO implement local caching - */ -int afs_vnode_fetch_status(struct afs_vnode *vnode, - struct afs_vnode *auth_vnode, struct key *key) -{ - struct afs_server *server; - unsigned long acl_order; - int ret; - - DECLARE_WAITQUEUE(myself, current); - - _enter("%s,{%x:%u.%u}", - vnode->volume->vlocation->vldb.name, - vnode->fid.vid, vnode->fid.vnode, vnode->fid.unique); - - if (!test_bit(AFS_VNODE_CB_BROKEN, &vnode->flags) && - vnode->cb_promised) { - _leave(" [unchanged]"); - return 0; - } - - if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) { - _leave(" [deleted]"); - return -ENOENT; - } - - acl_order = 0; - if (auth_vnode) - acl_order = auth_vnode->acl_order; - - spin_lock(&vnode->lock); - - if (!test_bit(AFS_VNODE_CB_BROKEN, &vnode->flags) && - vnode->cb_promised) { - spin_unlock(&vnode->lock); - _leave(" [unchanged]"); - return 0; - } - - ASSERTCMP(vnode->update_cnt, >=, 0); - - if (vnode->update_cnt > 0) { - /* someone else started a fetch */ - _debug("wait on fetch %d", vnode->update_cnt); - - set_current_state(TASK_UNINTERRUPTIBLE); - ASSERT(myself.func != NULL); - add_wait_queue(&vnode->update_waitq, &myself); - - /* wait for the status to be updated */ - for (;;) { - if (!test_bit(AFS_VNODE_CB_BROKEN, &vnode->flags)) - break; - if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) - break; - - /* check to see if it got updated and invalidated all - * before we saw it */ - if (vnode->update_cnt == 0) { - remove_wait_queue(&vnode->update_waitq, - &myself); - set_current_state(TASK_RUNNING); - goto get_anyway; - } - - spin_unlock(&vnode->lock); - - schedule(); - set_current_state(TASK_UNINTERRUPTIBLE); - - spin_lock(&vnode->lock); - } - - remove_wait_queue(&vnode->update_waitq, &myself); - spin_unlock(&vnode->lock); - set_current_state(TASK_RUNNING); - - return test_bit(AFS_VNODE_DELETED, &vnode->flags) ? - -ENOENT : 0; - } - -get_anyway: - /* okay... we're going to have to initiate the op */ - vnode->update_cnt++; - - spin_unlock(&vnode->lock); - - /* merge AFS status fetches and clear outstanding callback on this - * vnode */ - do { - /* pick a server to query */ - server = afs_volume_pick_fileserver(vnode); - if (IS_ERR(server)) - goto no_server; - - _debug("USING SERVER: %p{%08x}", - server, ntohl(server->addr.s_addr)); - - ret = afs_fs_fetch_file_status(server, key, vnode, NULL, - &afs_sync_call); - - } while (!afs_volume_release_fileserver(vnode, server, ret)); - - /* adjust the flags */ - if (ret == 0) { - _debug("adjust"); - if (auth_vnode) - afs_cache_permit(vnode, key, acl_order); - afs_vnode_finalise_status_update(vnode, server); - afs_put_server(server); - } else { - _debug("failed [%d]", ret); - afs_vnode_status_update_failed(vnode, ret); - } - - ASSERTCMP(vnode->update_cnt, >=, 0); - - _leave(" = %d [cnt %d]", ret, vnode->update_cnt); - return ret; - -no_server: - spin_lock(&vnode->lock); - vnode->update_cnt--; - ASSERTCMP(vnode->update_cnt, >=, 0); - spin_unlock(&vnode->lock); - _leave(" = %ld [cnt %d]", PTR_ERR(server), vnode->update_cnt); - return PTR_ERR(server); -} - -/* - * fetch file data from the volume - * - TODO implement caching - */ -int afs_vnode_fetch_data(struct afs_vnode *vnode, struct key *key, - off_t offset, size_t length, struct page *page) -{ - struct afs_server *server; - int ret; - - _enter("%s{%x:%u.%u},%x,,,", - vnode->volume->vlocation->vldb.name, - vnode->fid.vid, - vnode->fid.vnode, - vnode->fid.unique, - key_serial(key)); - - /* this op will fetch the status */ - spin_lock(&vnode->lock); - vnode->update_cnt++; - spin_unlock(&vnode->lock); - - /* merge in AFS status fetches and clear outstanding callback on this - * vnode */ - do { - /* pick a server to query */ - server = afs_volume_pick_fileserver(vnode); - if (IS_ERR(server)) - goto no_server; - - _debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr)); - - ret = afs_fs_fetch_data(server, key, vnode, offset, length, - page, &afs_sync_call); - - } while (!afs_volume_release_fileserver(vnode, server, ret)); - - /* adjust the flags */ - if (ret == 0) { - afs_vnode_finalise_status_update(vnode, server); - afs_put_server(server); - } else { - afs_vnode_status_update_failed(vnode, ret); - } - - _leave(" = %d", ret); - return ret; - -no_server: - spin_lock(&vnode->lock); - vnode->update_cnt--; - ASSERTCMP(vnode->update_cnt, >=, 0); - spin_unlock(&vnode->lock); - return PTR_ERR(server); -} - -/* - * make a file or a directory - */ -int afs_vnode_create(struct afs_vnode *vnode, struct key *key, - const char *name, umode_t mode, struct afs_fid *newfid, - struct afs_file_status *newstatus, - struct afs_callback *newcb, struct afs_server **_server) -{ - struct afs_server *server; - int ret; - - _enter("%s{%x:%u.%u},%x,%s,,", - vnode->volume->vlocation->vldb.name, - vnode->fid.vid, - vnode->fid.vnode, - vnode->fid.unique, - key_serial(key), - name); - - /* this op will fetch the status on the directory we're creating in */ - spin_lock(&vnode->lock); - vnode->update_cnt++; - spin_unlock(&vnode->lock); - - do { - /* pick a server to query */ - server = afs_volume_pick_fileserver(vnode); - if (IS_ERR(server)) - goto no_server; - - _debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr)); - - ret = afs_fs_create(server, key, vnode, name, mode, newfid, - newstatus, newcb, &afs_sync_call); - - } while (!afs_volume_release_fileserver(vnode, server, ret)); - - /* adjust the flags */ - if (ret == 0) { - afs_vnode_finalise_status_update(vnode, server); - *_server = server; - } else { - afs_vnode_status_update_failed(vnode, ret); - *_server = NULL; - } - - _leave(" = %d [cnt %d]", ret, vnode->update_cnt); - return ret; - -no_server: - spin_lock(&vnode->lock); - vnode->update_cnt--; - ASSERTCMP(vnode->update_cnt, >=, 0); - spin_unlock(&vnode->lock); - _leave(" = %ld [cnt %d]", PTR_ERR(server), vnode->update_cnt); - return PTR_ERR(server); -} - -/* - * remove a file or directory - */ -int afs_vnode_remove(struct afs_vnode *vnode, struct key *key, const char *name, - bool isdir) -{ - struct afs_server *server; - int ret; - - _enter("%s{%x:%u.%u},%x,%s", - vnode->volume->vlocation->vldb.name, - vnode->fid.vid, - vnode->fid.vnode, - vnode->fid.unique, - key_serial(key), - name); - - /* this op will fetch the status on the directory we're removing from */ - spin_lock(&vnode->lock); - vnode->update_cnt++; - spin_unlock(&vnode->lock); - - do { - /* pick a server to query */ - server = afs_volume_pick_fileserver(vnode); - if (IS_ERR(server)) - goto no_server; - - _debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr)); - - ret = afs_fs_remove(server, key, vnode, name, isdir, - &afs_sync_call); - - } while (!afs_volume_release_fileserver(vnode, server, ret)); - - /* adjust the flags */ - if (ret == 0) { - afs_vnode_finalise_status_update(vnode, server); - afs_put_server(server); - } else { - afs_vnode_status_update_failed(vnode, ret); - } - - _leave(" = %d [cnt %d]", ret, vnode->update_cnt); - return ret; - -no_server: - spin_lock(&vnode->lock); - vnode->update_cnt--; - ASSERTCMP(vnode->update_cnt, >=, 0); - spin_unlock(&vnode->lock); - _leave(" = %ld [cnt %d]", PTR_ERR(server), vnode->update_cnt); - return PTR_ERR(server); -} - -/* - * create a hard link - */ -int afs_vnode_link(struct afs_vnode *dvnode, struct afs_vnode *vnode, - struct key *key, const char *name) -{ - struct afs_server *server; - int ret; - - _enter("%s{%x:%u.%u},%s{%x:%u.%u},%x,%s", - dvnode->volume->vlocation->vldb.name, - dvnode->fid.vid, - dvnode->fid.vnode, - dvnode->fid.unique, - vnode->volume->vlocation->vldb.name, - vnode->fid.vid, - vnode->fid.vnode, - vnode->fid.unique, - key_serial(key), - name); - - /* this op will fetch the status on the directory we're removing from */ - spin_lock(&vnode->lock); - vnode->update_cnt++; - spin_unlock(&vnode->lock); - spin_lock(&dvnode->lock); - dvnode->update_cnt++; - spin_unlock(&dvnode->lock); - - do { - /* pick a server to query */ - server = afs_volume_pick_fileserver(dvnode); - if (IS_ERR(server)) - goto no_server; - - _debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr)); - - ret = afs_fs_link(server, key, dvnode, vnode, name, - &afs_sync_call); - - } while (!afs_volume_release_fileserver(dvnode, server, ret)); - - /* adjust the flags */ - if (ret == 0) { - afs_vnode_finalise_status_update(vnode, server); - afs_vnode_finalise_status_update(dvnode, server); - afs_put_server(server); - } else { - afs_vnode_status_update_failed(vnode, ret); - afs_vnode_status_update_failed(dvnode, ret); - } - - _leave(" = %d [cnt %d]", ret, vnode->update_cnt); - return ret; - -no_server: - spin_lock(&vnode->lock); - vnode->update_cnt--; - ASSERTCMP(vnode->update_cnt, >=, 0); - spin_unlock(&vnode->lock); - spin_lock(&dvnode->lock); - dvnode->update_cnt--; - ASSERTCMP(dvnode->update_cnt, >=, 0); - spin_unlock(&dvnode->lock); - _leave(" = %ld [cnt %d]", PTR_ERR(server), vnode->update_cnt); - return PTR_ERR(server); -} - -/* - * create a symbolic link - */ -int afs_vnode_symlink(struct afs_vnode *vnode, struct key *key, - const char *name, const char *content, - struct afs_fid *newfid, - struct afs_file_status *newstatus, - struct afs_server **_server) -{ - struct afs_server *server; - int ret; - - _enter("%s{%x:%u.%u},%x,%s,%s,,,", - vnode->volume->vlocation->vldb.name, - vnode->fid.vid, - vnode->fid.vnode, - vnode->fid.unique, - key_serial(key), - name, content); - - /* this op will fetch the status on the directory we're creating in */ - spin_lock(&vnode->lock); - vnode->update_cnt++; - spin_unlock(&vnode->lock); - - do { - /* pick a server to query */ - server = afs_volume_pick_fileserver(vnode); - if (IS_ERR(server)) - goto no_server; - - _debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr)); - - ret = afs_fs_symlink(server, key, vnode, name, content, - newfid, newstatus, &afs_sync_call); - - } while (!afs_volume_release_fileserver(vnode, server, ret)); - - /* adjust the flags */ - if (ret == 0) { - afs_vnode_finalise_status_update(vnode, server); - *_server = server; - } else { - afs_vnode_status_update_failed(vnode, ret); - *_server = NULL; - } - - _leave(" = %d [cnt %d]", ret, vnode->update_cnt); - return ret; - -no_server: - spin_lock(&vnode->lock); - vnode->update_cnt--; - ASSERTCMP(vnode->update_cnt, >=, 0); - spin_unlock(&vnode->lock); - _leave(" = %ld [cnt %d]", PTR_ERR(server), vnode->update_cnt); - return PTR_ERR(server); -} - -/* - * rename a file - */ -int afs_vnode_rename(struct afs_vnode *orig_dvnode, - struct afs_vnode *new_dvnode, - struct key *key, - const char *orig_name, - const char *new_name) -{ - struct afs_server *server; - int ret; - - _enter("%s{%x:%u.%u},%s{%u,%u,%u},%x,%s,%s", - orig_dvnode->volume->vlocation->vldb.name, - orig_dvnode->fid.vid, - orig_dvnode->fid.vnode, - orig_dvnode->fid.unique, - new_dvnode->volume->vlocation->vldb.name, - new_dvnode->fid.vid, - new_dvnode->fid.vnode, - new_dvnode->fid.unique, - key_serial(key), - orig_name, - new_name); - - /* this op will fetch the status on both the directories we're dealing - * with */ - spin_lock(&orig_dvnode->lock); - orig_dvnode->update_cnt++; - spin_unlock(&orig_dvnode->lock); - if (new_dvnode != orig_dvnode) { - spin_lock(&new_dvnode->lock); - new_dvnode->update_cnt++; - spin_unlock(&new_dvnode->lock); - } - - do { - /* pick a server to query */ - server = afs_volume_pick_fileserver(orig_dvnode); - if (IS_ERR(server)) - goto no_server; - - _debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr)); - - ret = afs_fs_rename(server, key, orig_dvnode, orig_name, - new_dvnode, new_name, &afs_sync_call); - - } while (!afs_volume_release_fileserver(orig_dvnode, server, ret)); - - /* adjust the flags */ - if (ret == 0) { - afs_vnode_finalise_status_update(orig_dvnode, server); - if (new_dvnode != orig_dvnode) - afs_vnode_finalise_status_update(new_dvnode, server); - afs_put_server(server); - } else { - afs_vnode_status_update_failed(orig_dvnode, ret); - if (new_dvnode != orig_dvnode) - afs_vnode_status_update_failed(new_dvnode, ret); - } - - _leave(" = %d [cnt %d]", ret, orig_dvnode->update_cnt); - return ret; - -no_server: - spin_lock(&orig_dvnode->lock); - orig_dvnode->update_cnt--; - ASSERTCMP(orig_dvnode->update_cnt, >=, 0); - spin_unlock(&orig_dvnode->lock); - if (new_dvnode != orig_dvnode) { - spin_lock(&new_dvnode->lock); - new_dvnode->update_cnt--; - ASSERTCMP(new_dvnode->update_cnt, >=, 0); - spin_unlock(&new_dvnode->lock); - } - _leave(" = %ld [cnt %d]", PTR_ERR(server), orig_dvnode->update_cnt); - return PTR_ERR(server); -} - -/* - * write to a file - */ -int afs_vnode_store_data(struct afs_writeback *wb, pgoff_t first, pgoff_t last, - unsigned offset, unsigned to) -{ - struct afs_server *server; - struct afs_vnode *vnode = wb->vnode; - int ret; - - _enter("%s{%x:%u.%u},%x,%lx,%lx,%x,%x", - vnode->volume->vlocation->vldb.name, - vnode->fid.vid, - vnode->fid.vnode, - vnode->fid.unique, - key_serial(wb->key), - first, last, offset, to); - - /* this op will fetch the status */ - spin_lock(&vnode->lock); - vnode->update_cnt++; - spin_unlock(&vnode->lock); - - do { - /* pick a server to query */ - server = afs_volume_pick_fileserver(vnode); - if (IS_ERR(server)) - goto no_server; - - _debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr)); - - ret = afs_fs_store_data(server, wb, first, last, offset, to, - &afs_sync_call); - - } while (!afs_volume_release_fileserver(vnode, server, ret)); - - /* adjust the flags */ - if (ret == 0) { - afs_vnode_finalise_status_update(vnode, server); - afs_put_server(server); - } else { - afs_vnode_status_update_failed(vnode, ret); - } - - _leave(" = %d", ret); - return ret; - -no_server: - spin_lock(&vnode->lock); - vnode->update_cnt--; - ASSERTCMP(vnode->update_cnt, >=, 0); - spin_unlock(&vnode->lock); - return PTR_ERR(server); -} - -/* - * set the attributes on a file - */ -int afs_vnode_setattr(struct afs_vnode *vnode, struct key *key, - struct iattr *attr) -{ - struct afs_server *server; - int ret; - - _enter("%s{%x:%u.%u},%x", - vnode->volume->vlocation->vldb.name, - vnode->fid.vid, - vnode->fid.vnode, - vnode->fid.unique, - key_serial(key)); - - /* this op will fetch the status */ - spin_lock(&vnode->lock); - vnode->update_cnt++; - spin_unlock(&vnode->lock); - - do { - /* pick a server to query */ - server = afs_volume_pick_fileserver(vnode); - if (IS_ERR(server)) - goto no_server; - - _debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr)); - - ret = afs_fs_setattr(server, key, vnode, attr, &afs_sync_call); - - } while (!afs_volume_release_fileserver(vnode, server, ret)); - - /* adjust the flags */ - if (ret == 0) { - afs_vnode_finalise_status_update(vnode, server); - afs_put_server(server); - } else { - afs_vnode_status_update_failed(vnode, ret); - } - - _leave(" = %d", ret); - return ret; - -no_server: - spin_lock(&vnode->lock); - vnode->update_cnt--; - ASSERTCMP(vnode->update_cnt, >=, 0); - spin_unlock(&vnode->lock); - return PTR_ERR(server); -} - -/* - * get the status of a volume - */ -int afs_vnode_get_volume_status(struct afs_vnode *vnode, struct key *key, - struct afs_volume_status *vs) -{ - struct afs_server *server; - int ret; - - _enter("%s{%x:%u.%u},%x,", - vnode->volume->vlocation->vldb.name, - vnode->fid.vid, - vnode->fid.vnode, - vnode->fid.unique, - key_serial(key)); - - do { - /* pick a server to query */ - server = afs_volume_pick_fileserver(vnode); - if (IS_ERR(server)) - goto no_server; - - _debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr)); - - ret = afs_fs_get_volume_status(server, key, vnode, vs, &afs_sync_call); - - } while (!afs_volume_release_fileserver(vnode, server, ret)); - - /* adjust the flags */ - if (ret == 0) - afs_put_server(server); - - _leave(" = %d", ret); - return ret; - -no_server: - return PTR_ERR(server); -} - -/* - * get a lock on a file - */ -int afs_vnode_set_lock(struct afs_vnode *vnode, struct key *key, - afs_lock_type_t type) -{ - struct afs_server *server; - int ret; - - _enter("%s{%x:%u.%u},%x,%u", - vnode->volume->vlocation->vldb.name, - vnode->fid.vid, - vnode->fid.vnode, - vnode->fid.unique, - key_serial(key), type); - - do { - /* pick a server to query */ - server = afs_volume_pick_fileserver(vnode); - if (IS_ERR(server)) - goto no_server; - - _debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr)); - - ret = afs_fs_set_lock(server, key, vnode, type, &afs_sync_call); - - } while (!afs_volume_release_fileserver(vnode, server, ret)); - - /* adjust the flags */ - if (ret == 0) - afs_put_server(server); - - _leave(" = %d", ret); - return ret; - -no_server: - return PTR_ERR(server); -} - -/* - * extend a lock on a file - */ -int afs_vnode_extend_lock(struct afs_vnode *vnode, struct key *key) -{ - struct afs_server *server; - int ret; - - _enter("%s{%x:%u.%u},%x", - vnode->volume->vlocation->vldb.name, - vnode->fid.vid, - vnode->fid.vnode, - vnode->fid.unique, - key_serial(key)); - - do { - /* pick a server to query */ - server = afs_volume_pick_fileserver(vnode); - if (IS_ERR(server)) - goto no_server; - - _debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr)); - - ret = afs_fs_extend_lock(server, key, vnode, &afs_sync_call); - - } while (!afs_volume_release_fileserver(vnode, server, ret)); - - /* adjust the flags */ - if (ret == 0) - afs_put_server(server); - - _leave(" = %d", ret); - return ret; - -no_server: - return PTR_ERR(server); -} - -/* - * release a lock on a file - */ -int afs_vnode_release_lock(struct afs_vnode *vnode, struct key *key) -{ - struct afs_server *server; - int ret; - - _enter("%s{%x:%u.%u},%x", - vnode->volume->vlocation->vldb.name, - vnode->fid.vid, - vnode->fid.vnode, - vnode->fid.unique, - key_serial(key)); - - do { - /* pick a server to query */ - server = afs_volume_pick_fileserver(vnode); - if (IS_ERR(server)) - goto no_server; - - _debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr)); - - ret = afs_fs_release_lock(server, key, vnode, &afs_sync_call); - - } while (!afs_volume_release_fileserver(vnode, server, ret)); - - /* adjust the flags */ - if (ret == 0) - afs_put_server(server); - - _leave(" = %d", ret); - return ret; - -no_server: - return PTR_ERR(server); -} diff --git a/fs/afs/volume.c b/fs/afs/volume.c index 401eeb21869f..0efff3d25133 100644 --- a/fs/afs/volume.c +++ b/fs/afs/volume.c @@ -1,28 +1,181 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* AFS volume management * * Copyright (C) 2002, 2007 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. */ #include <linux/kernel.h> -#include <linux/module.h> -#include <linux/init.h> #include <linux/slab.h> -#include <linux/fs.h> -#include <linux/pagemap.h> -#include <linux/sched.h> #include "internal.h" -static const char *afs_voltypes[] = { "R/W", "R/O", "BAK" }; +static unsigned __read_mostly afs_volume_record_life = 60 * 60; +static atomic_t afs_volume_debug_id; + +static void afs_destroy_volume(struct work_struct *work); /* - * lookup a volume by name - * - this can be one of the following: + * Insert a volume into a cell. If there's an existing volume record, that is + * returned instead with a ref held. + */ +static struct afs_volume *afs_insert_volume_into_cell(struct afs_cell *cell, + struct afs_volume *volume) +{ + struct afs_volume *p; + struct rb_node *parent = NULL, **pp; + + write_seqlock(&cell->volume_lock); + + pp = &cell->volumes.rb_node; + while (*pp) { + parent = *pp; + p = rb_entry(parent, struct afs_volume, cell_node); + if (p->vid < volume->vid) { + pp = &(*pp)->rb_left; + } else if (p->vid > volume->vid) { + pp = &(*pp)->rb_right; + } else { + if (afs_try_get_volume(p, afs_volume_trace_get_cell_insert)) { + volume = p; + goto found; + } + + set_bit(AFS_VOLUME_RM_TREE, &volume->flags); + rb_replace_node_rcu(&p->cell_node, &volume->cell_node, &cell->volumes); + } + } + + rb_link_node_rcu(&volume->cell_node, parent, pp); + rb_insert_color(&volume->cell_node, &cell->volumes); + hlist_add_head_rcu(&volume->proc_link, &cell->proc_volumes); + +found: + write_sequnlock(&cell->volume_lock); + return volume; + +} + +static void afs_remove_volume_from_cell(struct afs_volume *volume) +{ + struct afs_cell *cell = volume->cell; + + if (!hlist_unhashed(&volume->proc_link)) { + trace_afs_volume(volume->debug_id, volume->vid, refcount_read(&volume->ref), + afs_volume_trace_remove); + write_seqlock(&cell->volume_lock); + hlist_del_rcu(&volume->proc_link); + if (!test_and_set_bit(AFS_VOLUME_RM_TREE, &volume->flags)) + rb_erase(&volume->cell_node, &cell->volumes); + write_sequnlock(&cell->volume_lock); + } +} + +/* + * Allocate a volume record and load it up from a vldb record. + */ +static struct afs_volume *afs_alloc_volume(struct afs_fs_context *params, + struct afs_vldb_entry *vldb, + struct afs_server_list **_slist) +{ + struct afs_server_list *slist; + struct afs_volume *volume; + int ret = -ENOMEM, i; + + volume = kzalloc(sizeof(struct afs_volume), GFP_KERNEL); + if (!volume) + goto error_0; + + volume->debug_id = atomic_inc_return(&afs_volume_debug_id); + volume->vid = vldb->vid[params->type]; + volume->update_at = ktime_get_real_seconds() + afs_volume_record_life; + volume->cell = afs_get_cell(params->cell, afs_cell_trace_get_vol); + volume->type = params->type; + volume->type_force = params->force; + volume->name_len = vldb->name_len; + volume->creation_time = TIME64_MIN; + volume->update_time = TIME64_MIN; + + refcount_set(&volume->ref, 1); + INIT_HLIST_NODE(&volume->proc_link); + INIT_WORK(&volume->destructor, afs_destroy_volume); + rwlock_init(&volume->servers_lock); + mutex_init(&volume->volsync_lock); + mutex_init(&volume->cb_check_lock); + rwlock_init(&volume->cb_v_break_lock); + INIT_LIST_HEAD(&volume->open_mmaps); + init_rwsem(&volume->open_mmaps_lock); + memcpy(volume->name, vldb->name, vldb->name_len + 1); + + for (i = 0; i < AFS_MAXTYPES; i++) + volume->vids[i] = vldb->vid[i]; + + slist = afs_alloc_server_list(volume, params->key, vldb); + if (IS_ERR(slist)) { + ret = PTR_ERR(slist); + goto error_1; + } + + *_slist = slist; + rcu_assign_pointer(volume->servers, slist); + trace_afs_volume(volume->debug_id, volume->vid, 1, afs_volume_trace_alloc); + return volume; + +error_1: + afs_put_cell(volume->cell, afs_cell_trace_put_vol); + kfree(volume); +error_0: + return ERR_PTR(ret); +} + +/* + * Look up or allocate a volume record. + */ +static struct afs_volume *afs_lookup_volume(struct afs_fs_context *params, + struct afs_vldb_entry *vldb) +{ + struct afs_server_list *slist; + struct afs_volume *candidate, *volume; + + candidate = afs_alloc_volume(params, vldb, &slist); + if (IS_ERR(candidate)) + return candidate; + + volume = afs_insert_volume_into_cell(params->cell, candidate); + if (volume == candidate) + afs_attach_volume_to_servers(volume, slist); + else + afs_put_volume(candidate, afs_volume_trace_put_cell_dup); + return volume; +} + +/* + * Look up a VLDB record for a volume. + */ +static struct afs_vldb_entry *afs_vl_lookup_vldb(struct afs_cell *cell, + struct key *key, + const char *volname, + size_t volnamesz) +{ + struct afs_vldb_entry *vldb = ERR_PTR(-EDESTADDRREQ); + struct afs_vl_cursor vc; + int ret; + + if (!afs_begin_vlserver_operation(&vc, cell, key)) + return ERR_PTR(-ERESTARTSYS); + + while (afs_select_vlserver(&vc)) { + vldb = afs_vl_get_entry_by_name_u(&vc, volname, volnamesz); + } + + ret = afs_end_vlserver_operation(&vc); + return ret < 0 ? ERR_PTR(ret) : vldb; +} + +/* + * Look up a volume in the VL server and create a candidate volume record for + * it. + * + * The volume name can be one of the following: * "%[cell:]volume[.]" R/W volume * "#[cell:]volume[.]" R/O or R/W volume (rwparent=0), * or R/W (rwparent=1) volume @@ -42,360 +195,278 @@ static const char *afs_voltypes[] = { "R/W", "R/O", "BAK" }; * - Rule 3: If parent volume is R/W, then only mount R/W volume unless * explicitly told otherwise */ -struct afs_volume *afs_volume_lookup(struct afs_mount_params *params) +struct afs_volume *afs_create_volume(struct afs_fs_context *params) { - struct afs_vlocation *vlocation = NULL; - struct afs_volume *volume = NULL; - struct afs_server *server = NULL; - char srvtmask; - int ret, loop; - - _enter("{%*.*s,%d}", - params->volnamesz, params->volnamesz, params->volname, params->rwpath); - - /* lookup the volume location record */ - vlocation = afs_vlocation_lookup(params->cell, params->key, - params->volname, params->volnamesz); - if (IS_ERR(vlocation)) { - ret = PTR_ERR(vlocation); - vlocation = NULL; - goto error; - } + struct afs_vldb_entry *vldb; + struct afs_volume *volume; + unsigned long type_mask = 1UL << params->type; - /* make the final decision on the type we want */ - ret = -ENOMEDIUM; - if (params->force && !(vlocation->vldb.vidmask & (1 << params->type))) - goto error; + vldb = afs_vl_lookup_vldb(params->cell, params->key, + params->volname, params->volnamesz); + if (IS_ERR(vldb)) + return ERR_CAST(vldb); - srvtmask = 0; - for (loop = 0; loop < vlocation->vldb.nservers; loop++) - srvtmask |= vlocation->vldb.srvtmask[loop]; + if (test_bit(AFS_VLDB_QUERY_ERROR, &vldb->flags)) { + volume = ERR_PTR(vldb->error); + goto error; + } + /* Make the final decision on the type we want */ + volume = ERR_PTR(-ENOMEDIUM); if (params->force) { - if (!(srvtmask & (1 << params->type))) + if (!(vldb->flags & type_mask)) goto error; - } else if (srvtmask & AFS_VOL_VTM_RO) { + } else if (test_bit(AFS_VLDB_HAS_RO, &vldb->flags)) { params->type = AFSVL_ROVOL; - } else if (srvtmask & AFS_VOL_VTM_RW) { + } else if (test_bit(AFS_VLDB_HAS_RW, &vldb->flags)) { params->type = AFSVL_RWVOL; } else { goto error; } - down_write(¶ms->cell->vl_sem); + volume = afs_lookup_volume(params, vldb); - /* is the volume already active? */ - if (vlocation->vols[params->type]) { - /* yes - re-use it */ - volume = vlocation->vols[params->type]; - afs_get_volume(volume); - goto success; - } - - /* create a new volume record */ - _debug("creating new volume record"); - - ret = -ENOMEM; - volume = kzalloc(sizeof(struct afs_volume), GFP_KERNEL); - if (!volume) - goto error_up; +error: + kfree(vldb); + return volume; +} - atomic_set(&volume->usage, 1); - volume->type = params->type; - volume->type_force = params->force; - volume->cell = params->cell; - volume->vid = vlocation->vldb.vid[params->type]; - - ret = bdi_setup_and_register(&volume->bdi, "afs", BDI_CAP_MAP_COPY); - if (ret) - goto error_bdi; - - init_rwsem(&volume->server_sem); - - /* look up all the applicable server records */ - for (loop = 0; loop < 8; loop++) { - if (vlocation->vldb.srvtmask[loop] & (1 << volume->type)) { - server = afs_lookup_server( - volume->cell, &vlocation->vldb.servers[loop]); - if (IS_ERR(server)) { - ret = PTR_ERR(server); - goto error_discard; - } +/* + * Destroy a volume record + */ +static void afs_destroy_volume(struct work_struct *work) +{ + struct afs_volume *volume = container_of(work, struct afs_volume, destructor); + struct afs_server_list *slist = rcu_access_pointer(volume->servers); - volume->servers[volume->nservers] = server; - volume->nservers++; - } - } + _enter("%p", volume); - /* attach the cache and volume location */ #ifdef CONFIG_AFS_FSCACHE - volume->cache = fscache_acquire_cookie(vlocation->cache, - &afs_volume_cache_index_def, - volume); + ASSERTCMP(volume->cache, ==, NULL); #endif - afs_get_vlocation(vlocation); - volume->vlocation = vlocation; - vlocation->vols[volume->type] = volume; + afs_detach_volume_from_servers(volume, slist); + afs_remove_volume_from_cell(volume); + afs_put_serverlist(volume->cell->net, slist); + afs_put_cell(volume->cell, afs_cell_trace_put_vol); + trace_afs_volume(volume->debug_id, volume->vid, refcount_read(&volume->ref), + afs_volume_trace_free); + kfree_rcu(volume, rcu); -success: - _debug("kAFS selected %s volume %08x", - afs_voltypes[volume->type], volume->vid); - up_write(¶ms->cell->vl_sem); - afs_put_vlocation(vlocation); - _leave(" = %p", volume); - return volume; - - /* clean up */ -error_up: - up_write(¶ms->cell->vl_sem); -error: - afs_put_vlocation(vlocation); - _leave(" = %d", ret); - return ERR_PTR(ret); - -error_discard: - bdi_destroy(&volume->bdi); -error_bdi: - up_write(¶ms->cell->vl_sem); - - for (loop = volume->nservers - 1; loop >= 0; loop--) - afs_put_server(volume->servers[loop]); - - kfree(volume); - goto error; + _leave(" [destroyed]"); } /* - * destroy a volume record + * Try to get a reference on a volume record. */ -void afs_put_volume(struct afs_volume *volume) +bool afs_try_get_volume(struct afs_volume *volume, enum afs_volume_trace reason) { - struct afs_vlocation *vlocation; - int loop; - - if (!volume) - return; + int r; - _enter("%p", volume); - - ASSERTCMP(atomic_read(&volume->usage), >, 0); - - vlocation = volume->vlocation; + if (__refcount_inc_not_zero(&volume->ref, &r)) { + trace_afs_volume(volume->debug_id, volume->vid, r + 1, reason); + return true; + } + return false; +} - /* to prevent a race, the decrement and the dequeue must be effectively - * atomic */ - down_write(&vlocation->cell->vl_sem); +/* + * Get a reference on a volume record. + */ +struct afs_volume *afs_get_volume(struct afs_volume *volume, + enum afs_volume_trace reason) +{ + if (volume) { + int r; - if (likely(!atomic_dec_and_test(&volume->usage))) { - up_write(&vlocation->cell->vl_sem); - _leave(""); - return; + __refcount_inc(&volume->ref, &r); + trace_afs_volume(volume->debug_id, volume->vid, r + 1, reason); } + return volume; +} - vlocation->vols[volume->type] = NULL; - up_write(&vlocation->cell->vl_sem); +/* + * Drop a reference on a volume record. + */ +void afs_put_volume(struct afs_volume *volume, enum afs_volume_trace reason) +{ + if (volume) { + unsigned int debug_id = volume->debug_id; + afs_volid_t vid = volume->vid; + bool zero; + int r; + + zero = __refcount_dec_and_test(&volume->ref, &r); + trace_afs_volume(debug_id, vid, r - 1, reason); + if (zero) + schedule_work(&volume->destructor); + } +} - /* finish cleaning up the volume */ +/* + * Activate a volume. + */ +int afs_activate_volume(struct afs_volume *volume) +{ #ifdef CONFIG_AFS_FSCACHE - fscache_relinquish_cookie(volume->cache, 0); + struct fscache_volume *vcookie; + char *name; + + name = kasprintf(GFP_KERNEL, "afs,%s,%llx", + volume->cell->name, volume->vid); + if (!name) + return -ENOMEM; + + vcookie = fscache_acquire_volume(name, NULL, NULL, 0); + if (IS_ERR(vcookie)) { + if (vcookie != ERR_PTR(-EBUSY)) { + kfree(name); + return PTR_ERR(vcookie); + } + pr_err("AFS: Cache volume key already in use (%s)\n", name); + vcookie = NULL; + } + volume->cache = vcookie; + kfree(name); #endif - afs_put_vlocation(vlocation); + return 0; +} - for (loop = volume->nservers - 1; loop >= 0; loop--) - afs_put_server(volume->servers[loop]); +/* + * Deactivate a volume. + */ +void afs_deactivate_volume(struct afs_volume *volume) +{ + _enter("%s", volume->name); - bdi_destroy(&volume->bdi); - kfree(volume); +#ifdef CONFIG_AFS_FSCACHE + fscache_relinquish_volume(volume->cache, NULL, + test_bit(AFS_VOLUME_DELETED, &volume->flags)); + volume->cache = NULL; +#endif - _leave(" [destroyed]"); + _leave(""); } /* - * pick a server to use to try accessing this volume - * - returns with an elevated usage count on the server chosen + * Query the VL service to update the volume status. */ -struct afs_server *afs_volume_pick_fileserver(struct afs_vnode *vnode) +static int afs_update_volume_status(struct afs_volume *volume, struct key *key) { - struct afs_volume *volume = vnode->volume; - struct afs_server *server; - int ret, state, loop; + struct afs_server_list *new, *old, *discard; + struct afs_vldb_entry *vldb; + char idbuf[24]; + int ret, idsz; + + _enter(""); - _enter("%s", volume->vlocation->vldb.name); + /* We look up an ID by passing it as a decimal string in the + * operation's name parameter. + */ + idsz = snprintf(idbuf, sizeof(idbuf), "%llu", volume->vid); - /* stick with the server we're already using if we can */ - if (vnode->server && vnode->server->fs_state == 0) { - afs_get_server(vnode->server); - _leave(" = %p [current]", vnode->server); - return vnode->server; + vldb = afs_vl_lookup_vldb(volume->cell, key, idbuf, idsz); + if (IS_ERR(vldb)) { + ret = PTR_ERR(vldb); + goto error; } - down_read(&volume->server_sem); + /* See if the volume got renamed. */ + if (vldb->name_len != volume->name_len || + memcmp(vldb->name, volume->name, vldb->name_len) != 0) { + /* TODO: Use RCU'd string. */ + memcpy(volume->name, vldb->name, AFS_MAXVOLNAME); + volume->name_len = vldb->name_len; + } - /* handle the no-server case */ - if (volume->nservers == 0) { - ret = volume->rjservers ? -ENOMEDIUM : -ESTALE; - up_read(&volume->server_sem); - _leave(" = %d [no servers]", ret); - return ERR_PTR(ret); + /* See if the volume's server list got updated. */ + new = afs_alloc_server_list(volume, key, vldb); + if (IS_ERR(new)) { + ret = PTR_ERR(new); + goto error_vldb; } - /* basically, just search the list for the first live server and use - * that */ - ret = 0; - for (loop = 0; loop < volume->nservers; loop++) { - server = volume->servers[loop]; - state = server->fs_state; - - _debug("consider %d [%d]", loop, state); - - switch (state) { - /* found an apparently healthy server */ - case 0: - afs_get_server(server); - up_read(&volume->server_sem); - _leave(" = %p (picked %08x)", - server, ntohl(server->addr.s_addr)); - return server; - - case -ENETUNREACH: - if (ret == 0) - ret = state; - break; - - case -EHOSTUNREACH: - if (ret == 0 || - ret == -ENETUNREACH) - ret = state; - break; - - case -ECONNREFUSED: - if (ret == 0 || - ret == -ENETUNREACH || - ret == -EHOSTUNREACH) - ret = state; - break; - - default: - case -EREMOTEIO: - if (ret == 0 || - ret == -ENETUNREACH || - ret == -EHOSTUNREACH || - ret == -ECONNREFUSED) - ret = state; - break; - } + write_lock(&volume->servers_lock); + + discard = new; + old = rcu_dereference_protected(volume->servers, + lockdep_is_held(&volume->servers_lock)); + if (afs_annotate_server_list(new, old)) { + new->seq = volume->servers_seq + 1; + rcu_assign_pointer(volume->servers, new); + smp_wmb(); + volume->servers_seq++; + discard = old; } - /* no available servers - * - TODO: handle the no active servers case better - */ - up_read(&volume->server_sem); + /* Check more often if replication is ongoing. */ + if (new->ro_replicating) + volume->update_at = ktime_get_real_seconds() + 10 * 60; + else + volume->update_at = ktime_get_real_seconds() + afs_volume_record_life; + write_unlock(&volume->servers_lock); + + if (discard == old) + afs_reattach_volume_to_servers(volume, new, old); + afs_put_serverlist(volume->cell->net, discard); + ret = 0; +error_vldb: + kfree(vldb); +error: _leave(" = %d", ret); - return ERR_PTR(ret); + return ret; } /* - * release a server after use - * - releases the ref on the server struct that was acquired by picking - * - records result of using a particular server to access a volume - * - return 0 to try again, 1 if okay or to issue error - * - the caller must release the server struct if result was 0 + * Make sure the volume record is up to date. */ -int afs_volume_release_fileserver(struct afs_vnode *vnode, - struct afs_server *server, - int result) +int afs_check_volume_status(struct afs_volume *volume, struct afs_operation *op) { - struct afs_volume *volume = vnode->volume; - unsigned loop; - - _enter("%s,%08x,%d", - volume->vlocation->vldb.name, ntohl(server->addr.s_addr), - result); - - switch (result) { - /* success */ - case 0: - server->fs_act_jif = jiffies; - server->fs_state = 0; - _leave(""); - return 1; - - /* the fileserver denied all knowledge of the volume */ - case -ENOMEDIUM: - server->fs_act_jif = jiffies; - down_write(&volume->server_sem); - - /* firstly, find where the server is in the active list (if it - * is) */ - for (loop = 0; loop < volume->nservers; loop++) - if (volume->servers[loop] == server) - goto present; - - /* no longer there - may have been discarded by another op */ - goto try_next_server_upw; - - present: - volume->nservers--; - memmove(&volume->servers[loop], - &volume->servers[loop + 1], - sizeof(volume->servers[loop]) * - (volume->nservers - loop)); - volume->servers[volume->nservers] = NULL; - afs_put_server(server); - volume->rjservers++; - - if (volume->nservers > 0) - /* another server might acknowledge its existence */ - goto try_next_server_upw; - - /* handle the case where all the fileservers have rejected the - * volume - * - TODO: try asking the fileservers for volume information - * - TODO: contact the VL server again to see if the volume is - * no longer registered - */ - up_write(&volume->server_sem); - afs_put_server(server); - _leave(" [completely rejected]"); - return 1; - - /* problem reaching the server */ - case -ENETUNREACH: - case -EHOSTUNREACH: - case -ECONNREFUSED: - case -ETIME: - case -ETIMEDOUT: - case -EREMOTEIO: - /* mark the server as dead - * TODO: vary dead timeout depending on error - */ - spin_lock(&server->fs_lock); - if (!server->fs_state) { - server->fs_dead_jif = jiffies + HZ * 10; - server->fs_state = result; - printk("kAFS: SERVER DEAD state=%d\n", result); - } - spin_unlock(&server->fs_lock); - goto try_next_server; - - /* miscellaneous error */ - default: - server->fs_act_jif = jiffies; - case -ENOMEM: - case -ENONET: - /* tell the caller to accept the result */ - afs_put_server(server); - _leave(" [local failure]"); - return 1; - } + int ret, retries = 0; + + _enter(""); - /* tell the caller to loop around and try the next server */ -try_next_server_upw: - up_write(&volume->server_sem); -try_next_server: - afs_put_server(server); - _leave(" [try next server]"); +retry: + if (test_bit(AFS_VOLUME_WAIT, &volume->flags)) + goto wait; + if (volume->update_at <= ktime_get_real_seconds() || + test_bit(AFS_VOLUME_NEEDS_UPDATE, &volume->flags)) + goto update; + _leave(" = 0"); return 0; + +update: + if (!test_and_set_bit_lock(AFS_VOLUME_UPDATING, &volume->flags)) { + clear_bit(AFS_VOLUME_NEEDS_UPDATE, &volume->flags); + ret = afs_update_volume_status(volume, op->key); + if (ret < 0) + set_bit(AFS_VOLUME_NEEDS_UPDATE, &volume->flags); + clear_bit_unlock(AFS_VOLUME_WAIT, &volume->flags); + clear_bit_unlock(AFS_VOLUME_UPDATING, &volume->flags); + wake_up_bit(&volume->flags, AFS_VOLUME_WAIT); + _leave(" = %d", ret); + return ret; + } + +wait: + if (!test_bit(AFS_VOLUME_WAIT, &volume->flags)) { + _leave(" = 0 [no wait]"); + return 0; + } + + ret = wait_on_bit(&volume->flags, AFS_VOLUME_WAIT, + (op->flags & AFS_OPERATION_UNINTR) ? + TASK_UNINTERRUPTIBLE : TASK_INTERRUPTIBLE); + if (ret == -ERESTARTSYS) { + _leave(" = %d", ret); + return ret; + } + + retries++; + if (retries == 4) { + _leave(" = -ESTALE"); + return -ESTALE; + } + goto retry; } diff --git a/fs/afs/write.c b/fs/afs/write.c index a890db4b9898..93ad86ff3345 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -1,678 +1,247 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* handling of writes to regular files and writing back to the server * * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. */ + #include <linux/backing-dev.h> #include <linux/slab.h> #include <linux/fs.h> #include <linux/pagemap.h> #include <linux/writeback.h> #include <linux/pagevec.h> -#include <linux/aio.h> +#include <linux/netfs.h> +#include <trace/events/netfs.h> #include "internal.h" -static int afs_write_back_from_locked_page(struct afs_writeback *wb, - struct page *page); - -/* - * mark a page as having been made dirty and thus needing writeback - */ -int afs_set_page_dirty(struct page *page) -{ - _enter(""); - return __set_page_dirty_nobuffers(page); -} - -/* - * unlink a writeback record because its usage has reached zero - * - must be called with the wb->vnode->writeback_lock held - */ -static void afs_unlink_writeback(struct afs_writeback *wb) -{ - struct afs_writeback *front; - struct afs_vnode *vnode = wb->vnode; - - list_del_init(&wb->link); - if (!list_empty(&vnode->writebacks)) { - /* if an fsync rises to the front of the queue then wake it - * up */ - front = list_entry(vnode->writebacks.next, - struct afs_writeback, link); - if (front->state == AFS_WBACK_SYNCING) { - _debug("wake up sync"); - front->state = AFS_WBACK_COMPLETE; - wake_up(&front->waitq); - } - } -} - -/* - * free a writeback record - */ -static void afs_free_writeback(struct afs_writeback *wb) -{ - _enter(""); - key_put(wb->key); - kfree(wb); -} - /* - * dispose of a reference to a writeback record + * completion of write to server */ -void afs_put_writeback(struct afs_writeback *wb) +static void afs_pages_written_back(struct afs_vnode *vnode, loff_t start, unsigned int len) { - struct afs_vnode *vnode = wb->vnode; - - _enter("{%d}", wb->usage); + _enter("{%llx:%llu},{%x @%llx}", + vnode->fid.vid, vnode->fid.vnode, len, start); - spin_lock(&vnode->writeback_lock); - if (--wb->usage == 0) - afs_unlink_writeback(wb); - else - wb = NULL; - spin_unlock(&vnode->writeback_lock); - if (wb) - afs_free_writeback(wb); + afs_prune_wb_keys(vnode); + _leave(""); } /* - * partly or wholly fill a page that's under preparation for writing + * Find a key to use for the writeback. We cached the keys used to author the + * writes on the vnode. wreq->netfs_priv2 will contain the last writeback key + * record used or NULL and we need to start from there if it's set. + * wreq->netfs_priv will be set to the key itself or NULL. */ -static int afs_fill_page(struct afs_vnode *vnode, struct key *key, - loff_t pos, struct page *page) +static void afs_get_writeback_key(struct netfs_io_request *wreq) { - loff_t i_size; - int ret; - int len; + struct afs_wb_key *wbk, *old = wreq->netfs_priv2; + struct afs_vnode *vnode = AFS_FS_I(wreq->inode); - _enter(",,%llu", (unsigned long long)pos); + key_put(wreq->netfs_priv); + wreq->netfs_priv = NULL; + wreq->netfs_priv2 = NULL; - i_size = i_size_read(&vnode->vfs_inode); - if (pos + PAGE_CACHE_SIZE > i_size) - len = i_size - pos; + spin_lock(&vnode->wb_lock); + if (old) + wbk = list_next_entry(old, vnode_link); else - len = PAGE_CACHE_SIZE; - - ret = afs_vnode_fetch_data(vnode, key, pos, len, page); - if (ret < 0) { - if (ret == -ENOENT) { - _debug("got NOENT from server" - " - marking file deleted and stale"); - set_bit(AFS_VNODE_DELETED, &vnode->flags); - ret = -ESTALE; - } - } - - _leave(" = %d", ret); - return ret; -} - -/* - * prepare to perform part of a write to a page - */ -int afs_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned flags, - struct page **pagep, void **fsdata) -{ - struct afs_writeback *candidate, *wb; - struct afs_vnode *vnode = AFS_FS_I(file_inode(file)); - struct page *page; - struct key *key = file->private_data; - unsigned from = pos & (PAGE_CACHE_SIZE - 1); - unsigned to = from + len; - pgoff_t index = pos >> PAGE_CACHE_SHIFT; - int ret; - - _enter("{%x:%u},{%lx},%u,%u", - vnode->fid.vid, vnode->fid.vnode, index, from, to); - - candidate = kzalloc(sizeof(*candidate), GFP_KERNEL); - if (!candidate) - return -ENOMEM; - candidate->vnode = vnode; - candidate->first = candidate->last = index; - candidate->offset_first = from; - candidate->to_last = to; - INIT_LIST_HEAD(&candidate->link); - candidate->usage = 1; - candidate->state = AFS_WBACK_PENDING; - init_waitqueue_head(&candidate->waitq); - - page = grab_cache_page_write_begin(mapping, index, flags); - if (!page) { - kfree(candidate); - return -ENOMEM; - } - *pagep = page; - /* page won't leak in error case: it eventually gets cleaned off LRU */ - - if (!PageUptodate(page) && len != PAGE_CACHE_SIZE) { - ret = afs_fill_page(vnode, key, index << PAGE_CACHE_SHIFT, page); - if (ret < 0) { - kfree(candidate); - _leave(" = %d [prep]", ret); - return ret; - } - SetPageUptodate(page); - } - -try_again: - spin_lock(&vnode->writeback_lock); - - /* see if this page is already pending a writeback under a suitable key - * - if so we can just join onto that one */ - wb = (struct afs_writeback *) page_private(page); - if (wb) { - if (wb->key == key && wb->state == AFS_WBACK_PENDING) - goto subsume_in_current_wb; - goto flush_conflicting_wb; - } - - if (index > 0) { - /* see if we can find an already pending writeback that we can - * append this page to */ - list_for_each_entry(wb, &vnode->writebacks, link) { - if (wb->last == index - 1 && wb->key == key && - wb->state == AFS_WBACK_PENDING) - goto append_to_previous_wb; + wbk = list_first_entry(&vnode->wb_keys, struct afs_wb_key, vnode_link); + + list_for_each_entry_from(wbk, &vnode->wb_keys, vnode_link) { + _debug("wbk %u", key_serial(wbk->key)); + if (key_validate(wbk->key) == 0) { + refcount_inc(&wbk->usage); + wreq->netfs_priv = key_get(wbk->key); + wreq->netfs_priv2 = wbk; + _debug("USE WB KEY %u", key_serial(wbk->key)); + break; } } - list_add_tail(&candidate->link, &vnode->writebacks); - candidate->key = key_get(key); - spin_unlock(&vnode->writeback_lock); - SetPagePrivate(page); - set_page_private(page, (unsigned long) candidate); - _leave(" = 0 [new]"); - return 0; - -subsume_in_current_wb: - _debug("subsume"); - ASSERTRANGE(wb->first, <=, index, <=, wb->last); - if (index == wb->first && from < wb->offset_first) - wb->offset_first = from; - if (index == wb->last && to > wb->to_last) - wb->to_last = to; - spin_unlock(&vnode->writeback_lock); - kfree(candidate); - _leave(" = 0 [sub]"); - return 0; - -append_to_previous_wb: - _debug("append into %lx-%lx", wb->first, wb->last); - wb->usage++; - wb->last++; - wb->to_last = to; - spin_unlock(&vnode->writeback_lock); - SetPagePrivate(page); - set_page_private(page, (unsigned long) wb); - kfree(candidate); - _leave(" = 0 [app]"); - return 0; - - /* the page is currently bound to another context, so if it's dirty we - * need to flush it before we can use the new context */ -flush_conflicting_wb: - _debug("flush conflict"); - if (wb->state == AFS_WBACK_PENDING) - wb->state = AFS_WBACK_CONFLICTING; - spin_unlock(&vnode->writeback_lock); - if (PageDirty(page)) { - ret = afs_write_back_from_locked_page(wb, page); - if (ret < 0) { - afs_put_writeback(candidate); - _leave(" = %d", ret); - return ret; - } - } + spin_unlock(&vnode->wb_lock); - /* the page holds a ref on the writeback record */ - afs_put_writeback(wb); - set_page_private(page, 0); - ClearPagePrivate(page); - goto try_again; + afs_put_wb_key(old); } -/* - * finalise part of a write to a page - */ -int afs_write_end(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) +static void afs_store_data_success(struct afs_operation *op) { - struct afs_vnode *vnode = AFS_FS_I(file_inode(file)); - loff_t i_size, maybe_i_size; - - _enter("{%x:%u},{%lx}", - vnode->fid.vid, vnode->fid.vnode, page->index); - - maybe_i_size = pos + copied; - - i_size = i_size_read(&vnode->vfs_inode); - if (maybe_i_size > i_size) { - spin_lock(&vnode->writeback_lock); - i_size = i_size_read(&vnode->vfs_inode); - if (maybe_i_size > i_size) - i_size_write(&vnode->vfs_inode, maybe_i_size); - spin_unlock(&vnode->writeback_lock); + struct afs_vnode *vnode = op->file[0].vnode; + + op->ctime = op->file[0].scb.status.mtime_client; + afs_vnode_commit_status(op, &op->file[0]); + if (!afs_op_error(op)) { + afs_pages_written_back(vnode, op->store.pos, op->store.size); + afs_stat_v(vnode, n_stores); + atomic_long_add(op->store.size, &afs_v2net(vnode)->n_store_bytes); } - - set_page_dirty(page); - if (PageDirty(page)) - _debug("dirtied"); - unlock_page(page); - page_cache_release(page); - - return copied; } -/* - * kill all the pages in the given range - */ -static void afs_kill_pages(struct afs_vnode *vnode, bool error, - pgoff_t first, pgoff_t last) -{ - struct pagevec pv; - unsigned count, loop; - - _enter("{%x:%u},%lx-%lx", - vnode->fid.vid, vnode->fid.vnode, first, last); - - pagevec_init(&pv, 0); - - do { - _debug("kill %lx-%lx", first, last); - - count = last - first + 1; - if (count > PAGEVEC_SIZE) - count = PAGEVEC_SIZE; - pv.nr = find_get_pages_contig(vnode->vfs_inode.i_mapping, - first, count, pv.pages); - ASSERTCMP(pv.nr, ==, count); - - for (loop = 0; loop < count; loop++) { - ClearPageUptodate(pv.pages[loop]); - if (error) - SetPageError(pv.pages[loop]); - end_page_writeback(pv.pages[loop]); - } - - __pagevec_release(&pv); - } while (first < last); - - _leave(""); -} +static const struct afs_operation_ops afs_store_data_operation = { + .issue_afs_rpc = afs_fs_store_data, + .issue_yfs_rpc = yfs_fs_store_data, + .success = afs_store_data_success, +}; /* - * synchronously write back the locked page and any subsequent non-locked dirty - * pages also covered by the same writeback record + * Prepare a subrequest to write to the server. This sets the max_len + * parameter. */ -static int afs_write_back_from_locked_page(struct afs_writeback *wb, - struct page *primary_page) +void afs_prepare_write(struct netfs_io_subrequest *subreq) { - struct page *pages[8], *page; - unsigned long count; - unsigned n, offset, to; - pgoff_t start, first, last; - int loop, ret; - - _enter(",%lx", primary_page->index); - - count = 1; - if (!clear_page_dirty_for_io(primary_page)) - BUG(); - if (test_set_page_writeback(primary_page)) - BUG(); - - /* find all consecutive lockable dirty pages, stopping when we find a - * page that is not immediately lockable, is not dirty or is missing, - * or we reach the end of the range */ - start = primary_page->index; - if (start >= wb->last) - goto no_more; - start++; - do { - _debug("more %lx [%lx]", start, count); - n = wb->last - start + 1; - if (n > ARRAY_SIZE(pages)) - n = ARRAY_SIZE(pages); - n = find_get_pages_contig(wb->vnode->vfs_inode.i_mapping, - start, n, pages); - _debug("fgpc %u", n); - if (n == 0) - goto no_more; - if (pages[0]->index != start) { - do { - put_page(pages[--n]); - } while (n > 0); - goto no_more; - } - - for (loop = 0; loop < n; loop++) { - page = pages[loop]; - if (page->index > wb->last) - break; - if (!trylock_page(page)) - break; - if (!PageDirty(page) || - page_private(page) != (unsigned long) wb) { - unlock_page(page); - break; - } - if (!clear_page_dirty_for_io(page)) - BUG(); - if (test_set_page_writeback(page)) - BUG(); - unlock_page(page); - put_page(page); - } - count += loop; - if (loop < n) { - for (; loop < n; loop++) - put_page(pages[loop]); - goto no_more; - } - - start += loop; - } while (start <= wb->last && count < 65536); - -no_more: - /* we now have a contiguous set of dirty pages, each with writeback set - * and the dirty mark cleared; the first page is locked and must remain - * so, all the rest are unlocked */ - first = primary_page->index; - last = first + count - 1; - - offset = (first == wb->first) ? wb->offset_first : 0; - to = (last == wb->last) ? wb->to_last : PAGE_SIZE; - - _debug("write back %lx[%u..] to %lx[..%u]", first, offset, last, to); - - ret = afs_vnode_store_data(wb, first, last, offset, to); - if (ret < 0) { - switch (ret) { - case -EDQUOT: - case -ENOSPC: - set_bit(AS_ENOSPC, - &wb->vnode->vfs_inode.i_mapping->flags); - break; - case -EROFS: - case -EIO: - case -EREMOTEIO: - case -EFBIG: - case -ENOENT: - case -ENOMEDIUM: - case -ENXIO: - afs_kill_pages(wb->vnode, true, first, last); - set_bit(AS_EIO, &wb->vnode->vfs_inode.i_mapping->flags); - break; - case -EACCES: - case -EPERM: - case -ENOKEY: - case -EKEYEXPIRED: - case -EKEYREJECTED: - case -EKEYREVOKED: - afs_kill_pages(wb->vnode, false, first, last); - break; - default: - break; - } - } else { - ret = count; - } + struct netfs_io_stream *stream = &subreq->rreq->io_streams[subreq->stream_nr]; - _leave(" = %d", ret); - return ret; + //if (test_bit(NETFS_SREQ_RETRYING, &subreq->flags)) + // subreq->max_len = 512 * 1024; + //else + stream->sreq_max_len = 256 * 1024 * 1024; } /* - * write a page back to the server - * - the caller locked the page for us + * Issue a subrequest to write to the server. */ -int afs_writepage(struct page *page, struct writeback_control *wbc) +static void afs_issue_write_worker(struct work_struct *work) { - struct afs_writeback *wb; - int ret; - - _enter("{%lx},", page->index); - - wb = (struct afs_writeback *) page_private(page); - ASSERT(wb != NULL); - - ret = afs_write_back_from_locked_page(wb, page); - unlock_page(page); - if (ret < 0) { - _leave(" = %d", ret); - return 0; + struct netfs_io_subrequest *subreq = container_of(work, struct netfs_io_subrequest, work); + struct netfs_io_request *wreq = subreq->rreq; + struct afs_operation *op; + struct afs_vnode *vnode = AFS_FS_I(wreq->inode); + unsigned long long pos = subreq->start + subreq->transferred; + size_t len = subreq->len - subreq->transferred; + int ret = -ENOKEY; + + _enter("R=%x[%x],%s{%llx:%llu.%u},%llx,%zx", + wreq->debug_id, subreq->debug_index, + vnode->volume->name, + vnode->fid.vid, + vnode->fid.vnode, + vnode->fid.unique, + pos, len); + +#if 0 // Error injection + if (subreq->debug_index == 3) + return netfs_write_subrequest_terminated(subreq, -ENOANO); + + if (!subreq->retry_count) { + set_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags); + return netfs_write_subrequest_terminated(subreq, -EAGAIN); } +#endif - wbc->nr_to_write -= ret; - - _leave(" = 0"); - return 0; -} - -/* - * write a region of pages back to the server - */ -static int afs_writepages_region(struct address_space *mapping, - struct writeback_control *wbc, - pgoff_t index, pgoff_t end, pgoff_t *_next) -{ - struct afs_writeback *wb; - struct page *page; - int ret, n; - - _enter(",,%lx,%lx,", index, end); - - do { - n = find_get_pages_tag(mapping, &index, PAGECACHE_TAG_DIRTY, - 1, &page); - if (!n) - break; - - _debug("wback %lx", page->index); - - if (page->index > end) { - *_next = index; - page_cache_release(page); - _leave(" = 0 [%lx]", *_next); - return 0; - } - - /* at this point we hold neither mapping->tree_lock nor lock on - * the page itself: the page may be truncated or invalidated - * (changing page->mapping to NULL), or even swizzled back from - * swapper_space to tmpfs file mapping + op = afs_alloc_operation(wreq->netfs_priv, vnode->volume); + if (IS_ERR(op)) + return netfs_write_subrequest_terminated(subreq, -EAGAIN); + + afs_op_set_vnode(op, 0, vnode); + op->file[0].dv_delta = 1; + op->file[0].modification = true; + op->store.pos = pos; + op->store.size = len; + op->flags |= AFS_OPERATION_UNINTR; + op->ops = &afs_store_data_operation; + + afs_begin_vnode_operation(op); + + op->store.write_iter = &subreq->io_iter; + op->store.i_size = umax(pos + len, vnode->netfs.remote_i_size); + op->mtime = inode_get_mtime(&vnode->netfs.inode); + + afs_wait_for_operation(op); + ret = afs_put_operation(op); + switch (ret) { + case 0: + __set_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags); + break; + case -EACCES: + case -EPERM: + case -ENOKEY: + case -EKEYEXPIRED: + case -EKEYREJECTED: + case -EKEYREVOKED: + /* If there are more keys we can try, use the retry algorithm + * to rotate the keys. */ - lock_page(page); - - if (page->mapping != mapping) { - unlock_page(page); - page_cache_release(page); - continue; - } - - if (wbc->sync_mode != WB_SYNC_NONE) - wait_on_page_writeback(page); - - if (PageWriteback(page) || !PageDirty(page)) { - unlock_page(page); - continue; - } - - wb = (struct afs_writeback *) page_private(page); - ASSERT(wb != NULL); - - spin_lock(&wb->vnode->writeback_lock); - wb->state = AFS_WBACK_WRITING; - spin_unlock(&wb->vnode->writeback_lock); - - ret = afs_write_back_from_locked_page(wb, page); - unlock_page(page); - page_cache_release(page); - if (ret < 0) { - _leave(" = %d", ret); - return ret; - } - - wbc->nr_to_write -= ret; - - cond_resched(); - } while (index < end && wbc->nr_to_write > 0); + if (wreq->netfs_priv2) + set_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags); + break; + } - *_next = index; - _leave(" = 0 [%lx]", *_next); - return 0; + netfs_write_subrequest_terminated(subreq, ret < 0 ? ret : subreq->len); } -/* - * write some of the pending data back to the server - */ -int afs_writepages(struct address_space *mapping, - struct writeback_control *wbc) +void afs_issue_write(struct netfs_io_subrequest *subreq) { - pgoff_t start, end, next; - int ret; - - _enter(""); - - if (wbc->range_cyclic) { - start = mapping->writeback_index; - end = -1; - ret = afs_writepages_region(mapping, wbc, start, end, &next); - if (start > 0 && wbc->nr_to_write > 0 && ret == 0) - ret = afs_writepages_region(mapping, wbc, 0, start, - &next); - mapping->writeback_index = next; - } else if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) { - end = (pgoff_t)(LLONG_MAX >> PAGE_CACHE_SHIFT); - ret = afs_writepages_region(mapping, wbc, 0, end, &next); - if (wbc->nr_to_write > 0) - mapping->writeback_index = next; - } else { - start = wbc->range_start >> PAGE_CACHE_SHIFT; - end = wbc->range_end >> PAGE_CACHE_SHIFT; - ret = afs_writepages_region(mapping, wbc, start, end, &next); - } - - _leave(" = %d", ret); - return ret; + subreq->work.func = afs_issue_write_worker; + if (!queue_work(system_dfl_wq, &subreq->work)) + WARN_ON_ONCE(1); } /* - * completion of write to server + * Writeback calls this when it finds a folio that needs uploading. This isn't + * called if writeback only has copy-to-cache to deal with. */ -void afs_pages_written_back(struct afs_vnode *vnode, struct afs_call *call) +void afs_begin_writeback(struct netfs_io_request *wreq) { - struct afs_writeback *wb = call->wb; - struct pagevec pv; - unsigned count, loop; - pgoff_t first = call->first, last = call->last; - bool free_wb; - - _enter("{%x:%u},{%lx-%lx}", - vnode->fid.vid, vnode->fid.vnode, first, last); - - ASSERT(wb != NULL); - - pagevec_init(&pv, 0); - - do { - _debug("done %lx-%lx", first, last); - - count = last - first + 1; - if (count > PAGEVEC_SIZE) - count = PAGEVEC_SIZE; - pv.nr = find_get_pages_contig(call->mapping, first, count, - pv.pages); - ASSERTCMP(pv.nr, ==, count); - - spin_lock(&vnode->writeback_lock); - for (loop = 0; loop < count; loop++) { - struct page *page = pv.pages[loop]; - end_page_writeback(page); - if (page_private(page) == (unsigned long) wb) { - set_page_private(page, 0); - ClearPagePrivate(page); - wb->usage--; - } - } - free_wb = false; - if (wb->usage == 0) { - afs_unlink_writeback(wb); - free_wb = true; - } - spin_unlock(&vnode->writeback_lock); - first += count; - if (free_wb) { - afs_free_writeback(wb); - wb = NULL; - } - - __pagevec_release(&pv); - } while (first <= last); - - _leave(""); + if (S_ISREG(wreq->inode->i_mode)) + afs_get_writeback_key(wreq); } /* - * write to an AFS file + * Prepare to retry the writes in request. Use this to try rotating the + * available writeback keys. */ -ssize_t afs_file_write(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos) +void afs_retry_request(struct netfs_io_request *wreq, struct netfs_io_stream *stream) { - struct afs_vnode *vnode = AFS_FS_I(file_inode(iocb->ki_filp)); - ssize_t result; - size_t count = iov_length(iov, nr_segs); - - _enter("{%x.%u},{%zu},%lu,", - vnode->fid.vid, vnode->fid.vnode, count, nr_segs); - - if (IS_SWAPFILE(&vnode->vfs_inode)) { - printk(KERN_INFO - "AFS: Attempt to write to active swap file!\n"); - return -EBUSY; + struct netfs_io_subrequest *subreq = + list_first_entry(&stream->subrequests, + struct netfs_io_subrequest, rreq_link); + + switch (wreq->origin) { + case NETFS_READAHEAD: + case NETFS_READPAGE: + case NETFS_READ_GAPS: + case NETFS_READ_SINGLE: + case NETFS_READ_FOR_WRITE: + case NETFS_UNBUFFERED_READ: + case NETFS_DIO_READ: + return; + default: + break; } - if (!count) - return 0; - - result = generic_file_aio_write(iocb, iov, nr_segs, pos); - if (IS_ERR_VALUE(result)) { - _leave(" = %zd", result); - return result; + switch (subreq->error) { + case -EACCES: + case -EPERM: + case -ENOKEY: + case -EKEYEXPIRED: + case -EKEYREJECTED: + case -EKEYREVOKED: + afs_get_writeback_key(wreq); + if (!wreq->netfs_priv) + stream->failed = true; + break; } - - _leave(" = %zd", result); - return result; } /* - * flush the vnode to the fileserver + * write some of the pending data back to the server */ -int afs_writeback_all(struct afs_vnode *vnode) +int afs_writepages(struct address_space *mapping, struct writeback_control *wbc) { - struct address_space *mapping = vnode->vfs_inode.i_mapping; - struct writeback_control wbc = { - .sync_mode = WB_SYNC_ALL, - .nr_to_write = LONG_MAX, - .range_cyclic = 1, - }; + struct afs_vnode *vnode = AFS_FS_I(mapping->host); int ret; - _enter(""); - - ret = mapping->a_ops->writepages(mapping, &wbc); - __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); + /* We have to be careful as we can end up racing with setattr() + * truncating the pagecache since the caller doesn't take a lock here + * to prevent it. + */ + if (wbc->sync_mode == WB_SYNC_ALL) + down_read(&vnode->validate_lock); + else if (!down_read_trylock(&vnode->validate_lock)) + return 0; - _leave(" = %d", ret); + ret = netfs_writepages(mapping, wbc); + up_read(&vnode->validate_lock); return ret; } @@ -683,82 +252,58 @@ int afs_writeback_all(struct afs_vnode *vnode) */ int afs_fsync(struct file *file, loff_t start, loff_t end, int datasync) { - struct dentry *dentry = file->f_path.dentry; - struct inode *inode = file->f_mapping->host; - struct afs_writeback *wb, *xwb; - struct afs_vnode *vnode = AFS_FS_I(dentry->d_inode); + struct afs_vnode *vnode = AFS_FS_I(file_inode(file)); + struct afs_file *af = file->private_data; int ret; - _enter("{%x:%u},{n=%s},%d", - vnode->fid.vid, vnode->fid.vnode, dentry->d_name.name, + _enter("{%llx:%llu},{n=%pD},%d", + vnode->fid.vid, vnode->fid.vnode, file, datasync); - ret = filemap_write_and_wait_range(inode->i_mapping, start, end); - if (ret) + ret = afs_validate(vnode, af->key); + if (ret < 0) return ret; - mutex_lock(&inode->i_mutex); - - /* use a writeback record as a marker in the queue - when this reaches - * the front of the queue, all the outstanding writes are either - * completed or rejected */ - wb = kzalloc(sizeof(*wb), GFP_KERNEL); - if (!wb) { - ret = -ENOMEM; - goto out; - } - wb->vnode = vnode; - wb->first = 0; - wb->last = -1; - wb->offset_first = 0; - wb->to_last = PAGE_SIZE; - wb->usage = 1; - wb->state = AFS_WBACK_SYNCING; - init_waitqueue_head(&wb->waitq); - - spin_lock(&vnode->writeback_lock); - list_for_each_entry(xwb, &vnode->writebacks, link) { - if (xwb->state == AFS_WBACK_PENDING) - xwb->state = AFS_WBACK_CONFLICTING; - } - list_add_tail(&wb->link, &vnode->writebacks); - spin_unlock(&vnode->writeback_lock); - - /* push all the outstanding writebacks to the server */ - ret = afs_writeback_all(vnode); - if (ret < 0) { - afs_put_writeback(wb); - _leave(" = %d [wb]", ret); - goto out; - } - /* wait for the preceding writes to actually complete */ - ret = wait_event_interruptible(wb->waitq, - wb->state == AFS_WBACK_COMPLETE || - vnode->writebacks.next == &wb->link); - afs_put_writeback(wb); - _leave(" = %d", ret); -out: - mutex_unlock(&inode->i_mutex); - return ret; + return file_write_and_wait_range(file, start, end); } /* * notification that a previously read-only page is about to become writable * - if it returns an error, the caller will deliver a bus error signal */ -int afs_page_mkwrite(struct vm_area_struct *vma, struct page *page) +vm_fault_t afs_page_mkwrite(struct vm_fault *vmf) { - struct afs_vnode *vnode = AFS_FS_I(vma->vm_file->f_mapping->host); + struct file *file = vmf->vma->vm_file; - _enter("{{%x:%u}},{%lx}", - vnode->fid.vid, vnode->fid.vnode, page->index); + if (afs_validate(AFS_FS_I(file_inode(file)), afs_file_key(file)) < 0) + return VM_FAULT_SIGBUS; + return netfs_page_mkwrite(vmf, NULL); +} - /* wait for the page to be written to the cache before we allow it to - * be modified */ -#ifdef CONFIG_AFS_FSCACHE - fscache_wait_on_page_write(vnode->cache, page); -#endif +/* + * Prune the keys cached for writeback. The caller must hold vnode->wb_lock. + */ +void afs_prune_wb_keys(struct afs_vnode *vnode) +{ + LIST_HEAD(graveyard); + struct afs_wb_key *wbk, *tmp; - _leave(" = 0"); - return 0; + /* Discard unused keys */ + spin_lock(&vnode->wb_lock); + + if (!mapping_tagged(&vnode->netfs.inode.i_data, PAGECACHE_TAG_WRITEBACK) && + !mapping_tagged(&vnode->netfs.inode.i_data, PAGECACHE_TAG_DIRTY)) { + list_for_each_entry_safe(wbk, tmp, &vnode->wb_keys, vnode_link) { + if (refcount_read(&wbk->usage) == 1) + list_move(&wbk->vnode_link, &graveyard); + } + } + + spin_unlock(&vnode->wb_lock); + + while (!list_empty(&graveyard)) { + wbk = list_entry(graveyard.next, struct afs_wb_key, vnode_link); + list_del(&wbk->vnode_link); + afs_put_wb_key(wbk); + } } diff --git a/fs/afs/xattr.c b/fs/afs/xattr.c new file mode 100644 index 000000000000..e19f396aa370 --- /dev/null +++ b/fs/afs/xattr.c @@ -0,0 +1,363 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* Extended attribute handling for AFS. We use xattrs to get and set metadata + * instead of providing pioctl(). + * + * Copyright (C) 2017 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#include <linux/slab.h> +#include <linux/fs.h> +#include <linux/xattr.h> +#include "internal.h" + +/* + * Deal with the result of a successful fetch ACL operation. + */ +static void afs_acl_success(struct afs_operation *op) +{ + afs_vnode_commit_status(op, &op->file[0]); +} + +static void afs_acl_put(struct afs_operation *op) +{ + kfree(op->acl); +} + +static const struct afs_operation_ops afs_fetch_acl_operation = { + .issue_afs_rpc = afs_fs_fetch_acl, + .success = afs_acl_success, + .put = afs_acl_put, +}; + +/* + * Get a file's ACL. + */ +static int afs_xattr_get_acl(const struct xattr_handler *handler, + struct dentry *dentry, + struct inode *inode, const char *name, + void *buffer, size_t size) +{ + struct afs_operation *op; + struct afs_vnode *vnode = AFS_FS_I(inode); + struct afs_acl *acl = NULL; + int ret; + + op = afs_alloc_operation(NULL, vnode->volume); + if (IS_ERR(op)) + return -ENOMEM; + + afs_op_set_vnode(op, 0, vnode); + op->ops = &afs_fetch_acl_operation; + + afs_begin_vnode_operation(op); + afs_wait_for_operation(op); + acl = op->acl; + op->acl = NULL; + ret = afs_put_operation(op); + + if (ret == 0) { + ret = acl->size; + if (size > 0) { + if (acl->size <= size) + memcpy(buffer, acl->data, acl->size); + else + ret = -ERANGE; + } + } + + kfree(acl); + return ret; +} + +static bool afs_make_acl(struct afs_operation *op, + const void *buffer, size_t size) +{ + struct afs_acl *acl; + + acl = kmalloc(struct_size(acl, data, size), GFP_KERNEL); + if (!acl) { + afs_op_nomem(op); + return false; + } + + acl->size = size; + memcpy(acl->data, buffer, size); + op->acl = acl; + return true; +} + +static const struct afs_operation_ops afs_store_acl_operation = { + .issue_afs_rpc = afs_fs_store_acl, + .success = afs_acl_success, + .put = afs_acl_put, +}; + +/* + * Set a file's AFS3 ACL. + */ +static int afs_xattr_set_acl(const struct xattr_handler *handler, + struct mnt_idmap *idmap, + struct dentry *dentry, + struct inode *inode, const char *name, + const void *buffer, size_t size, int flags) +{ + struct afs_operation *op; + struct afs_vnode *vnode = AFS_FS_I(inode); + + if (flags == XATTR_CREATE) + return -EINVAL; + + op = afs_alloc_operation(NULL, vnode->volume); + if (IS_ERR(op)) + return -ENOMEM; + + afs_op_set_vnode(op, 0, vnode); + if (!afs_make_acl(op, buffer, size)) + return afs_put_operation(op); + + op->ops = &afs_store_acl_operation; + return afs_do_sync_operation(op); +} + +static const struct xattr_handler afs_xattr_afs_acl_handler = { + .name = "afs.acl", + .get = afs_xattr_get_acl, + .set = afs_xattr_set_acl, +}; + +static const struct afs_operation_ops yfs_fetch_opaque_acl_operation = { + .issue_yfs_rpc = yfs_fs_fetch_opaque_acl, + .success = afs_acl_success, + /* Don't free op->yacl in .put here */ +}; + +/* + * Get a file's YFS ACL. + */ +static int afs_xattr_get_yfs(const struct xattr_handler *handler, + struct dentry *dentry, + struct inode *inode, const char *name, + void *buffer, size_t size) +{ + struct afs_operation *op; + struct afs_vnode *vnode = AFS_FS_I(inode); + struct yfs_acl *yacl = NULL; + char buf[16], *data; + int which = 0, dsize, ret = -ENOMEM; + + if (strcmp(name, "acl") == 0) + which = 0; + else if (strcmp(name, "acl_inherited") == 0) + which = 1; + else if (strcmp(name, "acl_num_cleaned") == 0) + which = 2; + else if (strcmp(name, "vol_acl") == 0) + which = 3; + else + return -EOPNOTSUPP; + + yacl = kzalloc(sizeof(struct yfs_acl), GFP_KERNEL); + if (!yacl) + goto error; + + if (which == 0) + yacl->flags |= YFS_ACL_WANT_ACL; + else if (which == 3) + yacl->flags |= YFS_ACL_WANT_VOL_ACL; + + op = afs_alloc_operation(NULL, vnode->volume); + if (IS_ERR(op)) + goto error_yacl; + + afs_op_set_vnode(op, 0, vnode); + op->yacl = yacl; + op->ops = &yfs_fetch_opaque_acl_operation; + + afs_begin_vnode_operation(op); + afs_wait_for_operation(op); + ret = afs_put_operation(op); + + if (ret == 0) { + switch (which) { + case 0: + data = yacl->acl->data; + dsize = yacl->acl->size; + break; + case 1: + data = buf; + dsize = scnprintf(buf, sizeof(buf), "%u", yacl->inherit_flag); + break; + case 2: + data = buf; + dsize = scnprintf(buf, sizeof(buf), "%u", yacl->num_cleaned); + break; + case 3: + data = yacl->vol_acl->data; + dsize = yacl->vol_acl->size; + break; + default: + ret = -EOPNOTSUPP; + goto error_yacl; + } + + ret = dsize; + if (size > 0) { + if (dsize <= size) + memcpy(buffer, data, dsize); + else + ret = -ERANGE; + } + } else if (ret == -ENOTSUPP) { + ret = -ENODATA; + } + +error_yacl: + yfs_free_opaque_acl(yacl); +error: + return ret; +} + +static const struct afs_operation_ops yfs_store_opaque_acl2_operation = { + .issue_yfs_rpc = yfs_fs_store_opaque_acl2, + .success = afs_acl_success, + .put = afs_acl_put, +}; + +/* + * Set a file's YFS ACL. + */ +static int afs_xattr_set_yfs(const struct xattr_handler *handler, + struct mnt_idmap *idmap, + struct dentry *dentry, + struct inode *inode, const char *name, + const void *buffer, size_t size, int flags) +{ + struct afs_operation *op; + struct afs_vnode *vnode = AFS_FS_I(inode); + int ret; + + if (flags == XATTR_CREATE || + strcmp(name, "acl") != 0) + return -EINVAL; + + op = afs_alloc_operation(NULL, vnode->volume); + if (IS_ERR(op)) + return -ENOMEM; + + afs_op_set_vnode(op, 0, vnode); + if (!afs_make_acl(op, buffer, size)) + return afs_put_operation(op); + + op->ops = &yfs_store_opaque_acl2_operation; + ret = afs_do_sync_operation(op); + if (ret == -ENOTSUPP) + ret = -ENODATA; + return ret; +} + +static const struct xattr_handler afs_xattr_yfs_handler = { + .prefix = "afs.yfs.", + .get = afs_xattr_get_yfs, + .set = afs_xattr_set_yfs, +}; + +/* + * Get the name of the cell on which a file resides. + */ +static int afs_xattr_get_cell(const struct xattr_handler *handler, + struct dentry *dentry, + struct inode *inode, const char *name, + void *buffer, size_t size) +{ + struct afs_vnode *vnode = AFS_FS_I(inode); + struct afs_cell *cell = vnode->volume->cell; + size_t namelen; + + namelen = cell->name_len; + if (size == 0) + return namelen; + if (namelen > size) + return -ERANGE; + memcpy(buffer, cell->name, namelen); + return namelen; +} + +static const struct xattr_handler afs_xattr_afs_cell_handler = { + .name = "afs.cell", + .get = afs_xattr_get_cell, +}; + +/* + * Get the volume ID, vnode ID and vnode uniquifier of a file as a sequence of + * hex numbers separated by colons. + */ +static int afs_xattr_get_fid(const struct xattr_handler *handler, + struct dentry *dentry, + struct inode *inode, const char *name, + void *buffer, size_t size) +{ + struct afs_vnode *vnode = AFS_FS_I(inode); + char text[16 + 1 + 24 + 1 + 8 + 1]; + size_t len; + + /* The volume ID is 64-bit, the vnode ID is 96-bit and the + * uniquifier is 32-bit. + */ + len = scnprintf(text, sizeof(text), "%llx:", vnode->fid.vid); + if (vnode->fid.vnode_hi) + len += scnprintf(text + len, sizeof(text) - len, "%x%016llx", + vnode->fid.vnode_hi, vnode->fid.vnode); + else + len += scnprintf(text + len, sizeof(text) - len, "%llx", + vnode->fid.vnode); + len += scnprintf(text + len, sizeof(text) - len, ":%x", + vnode->fid.unique); + + if (size == 0) + return len; + if (len > size) + return -ERANGE; + memcpy(buffer, text, len); + return len; +} + +static const struct xattr_handler afs_xattr_afs_fid_handler = { + .name = "afs.fid", + .get = afs_xattr_get_fid, +}; + +/* + * Get the name of the volume on which a file resides. + */ +static int afs_xattr_get_volume(const struct xattr_handler *handler, + struct dentry *dentry, + struct inode *inode, const char *name, + void *buffer, size_t size) +{ + struct afs_vnode *vnode = AFS_FS_I(inode); + const char *volname = vnode->volume->name; + size_t namelen; + + namelen = strlen(volname); + if (size == 0) + return namelen; + if (namelen > size) + return -ERANGE; + memcpy(buffer, volname, namelen); + return namelen; +} + +static const struct xattr_handler afs_xattr_afs_volume_handler = { + .name = "afs.volume", + .get = afs_xattr_get_volume, +}; + +const struct xattr_handler * const afs_xattr_handlers[] = { + &afs_xattr_afs_acl_handler, + &afs_xattr_afs_cell_handler, + &afs_xattr_afs_fid_handler, + &afs_xattr_afs_volume_handler, + &afs_xattr_yfs_handler, /* afs.yfs. prefix */ + NULL +}; diff --git a/fs/afs/xdr_fs.h b/fs/afs/xdr_fs.h new file mode 100644 index 000000000000..cc5f143d21a3 --- /dev/null +++ b/fs/afs/xdr_fs.h @@ -0,0 +1,116 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* AFS fileserver XDR types + * + * Copyright (C) 2018 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#ifndef XDR_FS_H +#define XDR_FS_H + +struct afs_xdr_AFSFetchStatus { + __be32 if_version; +#define AFS_FSTATUS_VERSION 1 + __be32 type; + __be32 nlink; + __be32 size_lo; + __be32 data_version_lo; + __be32 author; + __be32 owner; + __be32 caller_access; + __be32 anon_access; + __be32 mode; + __be32 parent_vnode; + __be32 parent_unique; + __be32 seg_size; + __be32 mtime_client; + __be32 mtime_server; + __be32 group; + __be32 sync_counter; + __be32 data_version_hi; + __be32 lock_count; + __be32 size_hi; + __be32 abort_code; +} __packed; + +#define AFS_DIR_HASHTBL_SIZE 128 +#define AFS_DIR_DIRENT_SIZE 32 +#define AFS_DIR_SLOTS_PER_BLOCK 64 +#define AFS_DIR_BLOCK_SIZE 2048 +#define AFS_DIR_BLOCKS_PER_PAGE (PAGE_SIZE / AFS_DIR_BLOCK_SIZE) +#define AFS_DIR_MAX_SLOTS 65536 +#define AFS_DIR_BLOCKS_WITH_CTR 128 +#define AFS_DIR_MAX_BLOCKS 1023 +#define AFS_DIR_RESV_BLOCKS 1 +#define AFS_DIR_RESV_BLOCKS0 13 + +/* + * Directory entry structure. + */ +union afs_xdr_dirent { + struct { + u8 valid; + u8 unused[1]; + __be16 hash_next; + __be32 vnode; + __be32 unique; + u8 name[]; + /* When determining the number of dirent slots needed to + * represent a directory entry, name should be assumed to be 16 + * bytes, due to a now-standardised (mis)calculation, but it is + * in fact 20 bytes in size. afs_dir_calc_slots() should be + * used for this. + * + * For names longer than (16 or) 20 bytes, extra slots should + * be annexed to this one using the extended_name format. + */ + } u; + u8 extended_name[32]; +} __packed; + +/* + * Directory block header (one at the beginning of every 2048-byte block). + */ +struct afs_xdr_dir_hdr { + __be16 npages; + __be16 magic; +#define AFS_DIR_MAGIC htons(1234) + u8 reserved; + u8 bitmap[8]; + u8 pad[19]; +} __packed; + +/* + * Directory block layout + */ +union afs_xdr_dir_block { + struct afs_xdr_dir_hdr hdr; + + struct { + struct afs_xdr_dir_hdr hdr; + u8 alloc_ctrs[AFS_DIR_BLOCKS_WITH_CTR]; + __be16 hashtable[AFS_DIR_HASHTBL_SIZE]; + } meta; + + union afs_xdr_dirent dirents[AFS_DIR_SLOTS_PER_BLOCK]; +} __packed; + +/* + * Directory layout on a linux VM page. + */ +struct afs_xdr_dir_page { + union afs_xdr_dir_block blocks[AFS_DIR_BLOCKS_PER_PAGE]; +}; + +/* + * Calculate the number of dirent slots required for any given name length. + * The calculation is made assuming the part of the name in the first slot is + * 16 bytes, rather than 20, but this miscalculation is now standardised. + */ +static inline unsigned int afs_dir_calc_slots(size_t name_len) +{ + name_len++; /* NUL-terminated */ + return 1 + ((name_len + 15) / AFS_DIR_DIRENT_SIZE); +} + +#endif /* XDR_FS_H */ diff --git a/fs/afs/yfsclient.c b/fs/afs/yfsclient.c new file mode 100644 index 000000000000..febf13a49f0b --- /dev/null +++ b/fs/afs/yfsclient.c @@ -0,0 +1,2232 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* YFS File Server client stubs + * + * Copyright (C) 2018 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#include <linux/init.h> +#include <linux/slab.h> +#include <linux/sched.h> +#include <linux/circ_buf.h> +#include <linux/iversion.h> +#include "internal.h" +#include "afs_fs.h" +#include "xdr_fs.h" +#include "protocol_yfs.h" + +#define xdr_size(x) (sizeof(*x) / sizeof(__be32)) + +static void xdr_decode_YFSFid(const __be32 **_bp, struct afs_fid *fid) +{ + const struct yfs_xdr_YFSFid *x = (const void *)*_bp; + + fid->vid = xdr_to_u64(x->volume); + fid->vnode = xdr_to_u64(x->vnode.lo); + fid->vnode_hi = ntohl(x->vnode.hi); + fid->unique = ntohl(x->vnode.unique); + *_bp += xdr_size(x); +} + +static __be32 *xdr_encode_u32(__be32 *bp, u32 n) +{ + *bp++ = htonl(n); + return bp; +} + +static __be32 *xdr_encode_u64(__be32 *bp, u64 n) +{ + struct yfs_xdr_u64 *x = (void *)bp; + + *x = u64_to_xdr(n); + return bp + xdr_size(x); +} + +static __be32 *xdr_encode_YFSFid(__be32 *bp, struct afs_fid *fid) +{ + struct yfs_xdr_YFSFid *x = (void *)bp; + + x->volume = u64_to_xdr(fid->vid); + x->vnode.lo = u64_to_xdr(fid->vnode); + x->vnode.hi = htonl(fid->vnode_hi); + x->vnode.unique = htonl(fid->unique); + return bp + xdr_size(x); +} + +static size_t xdr_strlen(unsigned int len) +{ + return sizeof(__be32) + round_up(len, sizeof(__be32)); +} + +static __be32 *xdr_encode_string(__be32 *bp, const char *p, unsigned int len) +{ + bp = xdr_encode_u32(bp, len); + bp = memcpy(bp, p, len); + if (len & 3) { + unsigned int pad = 4 - (len & 3); + + memset((u8 *)bp + len, 0, pad); + len += pad; + } + + return bp + len / sizeof(__be32); +} + +static __be32 *xdr_encode_name(__be32 *bp, const struct qstr *p) +{ + return xdr_encode_string(bp, p->name, p->len); +} + +static s64 linux_to_yfs_time(const struct timespec64 *t) +{ + /* Convert to 100ns intervals. */ + return (u64)t->tv_sec * 10000000 + t->tv_nsec/100; +} + +static __be32 *xdr_encode_YFSStoreStatus(__be32 *bp, mode_t *mode, + const struct timespec64 *t) +{ + struct yfs_xdr_YFSStoreStatus *x = (void *)bp; + mode_t masked_mode = mode ? *mode & S_IALLUGO : 0; + s64 mtime = linux_to_yfs_time(t); + u32 mask = AFS_SET_MTIME; + + mask |= mode ? AFS_SET_MODE : 0; + + x->mask = htonl(mask); + x->mode = htonl(masked_mode); + x->mtime_client = u64_to_xdr(mtime); + x->owner = u64_to_xdr(0); + x->group = u64_to_xdr(0); + return bp + xdr_size(x); +} + +/* + * Convert a signed 100ns-resolution 64-bit time into a timespec. + */ +static struct timespec64 yfs_time_to_linux(s64 t) +{ + struct timespec64 ts; + u64 abs_t; + + /* + * Unfortunately can not use normal 64 bit division on 32 bit arch, but + * the alternative, do_div, does not work with negative numbers so have + * to special case them + */ + if (t < 0) { + abs_t = -t; + ts.tv_nsec = (time64_t)(do_div(abs_t, 10000000) * 100); + ts.tv_nsec = -ts.tv_nsec; + ts.tv_sec = -abs_t; + } else { + abs_t = t; + ts.tv_nsec = (time64_t)do_div(abs_t, 10000000) * 100; + ts.tv_sec = abs_t; + } + + return ts; +} + +static struct timespec64 xdr_to_time(const struct yfs_xdr_u64 xdr) +{ + s64 t = xdr_to_u64(xdr); + + return yfs_time_to_linux(t); +} + +static void yfs_check_req(struct afs_call *call, __be32 *bp) +{ + size_t len = (void *)bp - call->request; + + if (len > call->request_size) + pr_err("kAFS: %s: Request buffer overflow (%zu>%u)\n", + call->type->name, len, call->request_size); + else if (len < call->request_size) + pr_warn("kAFS: %s: Request buffer underflow (%zu<%u)\n", + call->type->name, len, call->request_size); +} + +/* + * Dump a bad file status record. + */ +static void xdr_dump_bad(const __be32 *bp) +{ + __be32 x[4]; + int i; + + pr_notice("YFS XDR: Bad status record\n"); + for (i = 0; i < 6 * 4 * 4; i += 16) { + memcpy(x, bp, 16); + bp += 4; + pr_notice("%03x: %08x %08x %08x %08x\n", + i, ntohl(x[0]), ntohl(x[1]), ntohl(x[2]), ntohl(x[3])); + } + + memcpy(x, bp, 8); + pr_notice("0x60: %08x %08x\n", ntohl(x[0]), ntohl(x[1])); +} + +/* + * Decode a YFSFetchStatus block + */ +static void xdr_decode_YFSFetchStatus(const __be32 **_bp, + struct afs_call *call, + struct afs_status_cb *scb) +{ + const struct yfs_xdr_YFSFetchStatus *xdr = (const void *)*_bp; + struct afs_file_status *status = &scb->status; + u32 type; + + status->abort_code = ntohl(xdr->abort_code); + if (status->abort_code != 0) { + if (status->abort_code == VNOVNODE) + status->nlink = 0; + scb->have_error = true; + goto advance; + } + + type = ntohl(xdr->type); + switch (type) { + case AFS_FTYPE_FILE: + case AFS_FTYPE_DIR: + case AFS_FTYPE_SYMLINK: + status->type = type; + break; + default: + goto bad; + } + + status->nlink = ntohl(xdr->nlink); + status->author = xdr_to_u64(xdr->author); + status->owner = xdr_to_u64(xdr->owner); + status->caller_access = ntohl(xdr->caller_access); /* Ticket dependent */ + status->anon_access = ntohl(xdr->anon_access); + status->mode = ntohl(xdr->mode) & S_IALLUGO; + status->group = xdr_to_u64(xdr->group); + status->lock_count = ntohl(xdr->lock_count); + + status->mtime_client = xdr_to_time(xdr->mtime_client); + status->mtime_server = xdr_to_time(xdr->mtime_server); + status->size = xdr_to_u64(xdr->size); + status->data_version = xdr_to_u64(xdr->data_version); + scb->have_status = true; +advance: + *_bp += xdr_size(xdr); + return; + +bad: + xdr_dump_bad(*_bp); + afs_protocol_error(call, afs_eproto_bad_status); + goto advance; +} + +/* + * Decode a YFSCallBack block + */ +static void xdr_decode_YFSCallBack(const __be32 **_bp, + struct afs_call *call, + struct afs_status_cb *scb) +{ + struct yfs_xdr_YFSCallBack *x = (void *)*_bp; + struct afs_callback *cb = &scb->callback; + ktime_t cb_expiry; + + cb_expiry = ktime_add(call->issue_time, xdr_to_u64(x->expiration_time) * 100); + cb->expires_at = ktime_divns(cb_expiry, NSEC_PER_SEC); + scb->have_cb = true; + *_bp += xdr_size(x); +} + +/* + * Decode a YFSVolSync block + */ +static void xdr_decode_YFSVolSync(const __be32 **_bp, + struct afs_volsync *volsync) +{ + struct yfs_xdr_YFSVolSync *x = (void *)*_bp; + u64 creation, update; + + if (volsync) { + creation = xdr_to_u64(x->vol_creation_date); + do_div(creation, 10 * 1000 * 1000); + volsync->creation = creation; + update = xdr_to_u64(x->vol_update_date); + do_div(update, 10 * 1000 * 1000); + volsync->update = update; + } + + *_bp += xdr_size(x); +} + +/* + * Encode the requested attributes into a YFSStoreStatus block + */ +static __be32 *xdr_encode_YFS_StoreStatus(__be32 *bp, struct iattr *attr) +{ + struct yfs_xdr_YFSStoreStatus *x = (void *)bp; + s64 mtime = 0, owner = 0, group = 0; + u32 mask = 0, mode = 0; + + mask = 0; + if (attr->ia_valid & ATTR_MTIME) { + mask |= AFS_SET_MTIME; + mtime = linux_to_yfs_time(&attr->ia_mtime); + } + + if (attr->ia_valid & ATTR_UID) { + mask |= AFS_SET_OWNER; + owner = from_kuid(&init_user_ns, attr->ia_uid); + } + + if (attr->ia_valid & ATTR_GID) { + mask |= AFS_SET_GROUP; + group = from_kgid(&init_user_ns, attr->ia_gid); + } + + if (attr->ia_valid & ATTR_MODE) { + mask |= AFS_SET_MODE; + mode = attr->ia_mode & S_IALLUGO; + } + + x->mask = htonl(mask); + x->mode = htonl(mode); + x->mtime_client = u64_to_xdr(mtime); + x->owner = u64_to_xdr(owner); + x->group = u64_to_xdr(group); + return bp + xdr_size(x); +} + +/* + * Decode a YFSFetchVolumeStatus block. + */ +static void xdr_decode_YFSFetchVolumeStatus(const __be32 **_bp, + struct afs_volume_status *vs) +{ + const struct yfs_xdr_YFSFetchVolumeStatus *x = (const void *)*_bp; + u32 flags; + + vs->vid = xdr_to_u64(x->vid); + vs->parent_id = xdr_to_u64(x->parent_id); + flags = ntohl(x->flags); + vs->online = flags & yfs_FVSOnline; + vs->in_service = flags & yfs_FVSInservice; + vs->blessed = flags & yfs_FVSBlessed; + vs->needs_salvage = flags & yfs_FVSNeedsSalvage; + vs->type = ntohl(x->type); + vs->min_quota = 0; + vs->max_quota = xdr_to_u64(x->max_quota); + vs->blocks_in_use = xdr_to_u64(x->blocks_in_use); + vs->part_blocks_avail = xdr_to_u64(x->part_blocks_avail); + vs->part_max_blocks = xdr_to_u64(x->part_max_blocks); + vs->vol_copy_date = xdr_to_u64(x->vol_copy_date); + vs->vol_backup_date = xdr_to_u64(x->vol_backup_date); + *_bp += sizeof(*x) / sizeof(__be32); +} + +/* + * Deliver reply data to operations that just return a file status and a volume + * sync record. + */ +static int yfs_deliver_status_and_volsync(struct afs_call *call) +{ + struct afs_operation *op = call->op; + const __be32 *bp; + int ret; + + ret = afs_transfer_reply(call); + if (ret < 0) + return ret; + + bp = call->buffer; + xdr_decode_YFSFetchStatus(&bp, call, &op->file[0].scb); + xdr_decode_YFSVolSync(&bp, &op->volsync); + + _leave(" = 0 [done]"); + return 0; +} + +/* + * Deliver reply data to an YFS.FetchData64. + */ +static int yfs_deliver_fs_fetch_data64(struct afs_call *call) +{ + struct afs_operation *op = call->op; + struct netfs_io_subrequest *subreq = op->fetch.subreq; + struct afs_vnode_param *vp = &op->file[0]; + const __be32 *bp; + size_t count_before; + int ret; + + _enter("{%u,%zu, %zu/%llu}", + call->unmarshall, call->iov_len, iov_iter_count(call->iter), + call->remaining); + + switch (call->unmarshall) { + case 0: + call->remaining = 0; + afs_extract_to_tmp64(call); + call->unmarshall++; + fallthrough; + + /* Extract the returned data length into ->actual_len. This + * may indicate more or less data than was requested will be + * returned. + */ + case 1: + _debug("extract data length"); + ret = afs_extract_data(call, true); + if (ret < 0) + return ret; + + call->remaining = be64_to_cpu(call->tmp64); + _debug("DATA length: %llu", call->remaining); + + if (call->remaining == 0) + goto no_more_data; + + call->iter = &subreq->io_iter; + call->iov_len = min(call->remaining, subreq->len - subreq->transferred); + call->unmarshall++; + fallthrough; + + /* extract the returned data */ + case 2: + count_before = call->iov_len; + _debug("extract data %zu/%llu", count_before, call->remaining); + + ret = afs_extract_data(call, true); + subreq->transferred += count_before - call->iov_len; + if (ret < 0) + return ret; + + call->iter = &call->def_iter; + if (call->remaining) + goto no_more_data; + + /* Discard any excess data the server gave us */ + afs_extract_discard(call, call->remaining); + call->unmarshall = 3; + fallthrough; + + case 3: + _debug("extract discard %zu/%llu", + iov_iter_count(call->iter), call->remaining); + + ret = afs_extract_data(call, true); + if (ret < 0) + return ret; + + no_more_data: + call->unmarshall = 4; + afs_extract_to_buf(call, + sizeof(struct yfs_xdr_YFSFetchStatus) + + sizeof(struct yfs_xdr_YFSCallBack) + + sizeof(struct yfs_xdr_YFSVolSync)); + fallthrough; + + /* extract the metadata */ + case 4: + ret = afs_extract_data(call, false); + if (ret < 0) + return ret; + + bp = call->buffer; + xdr_decode_YFSFetchStatus(&bp, call, &vp->scb); + xdr_decode_YFSCallBack(&bp, call, &vp->scb); + xdr_decode_YFSVolSync(&bp, &op->volsync); + + if (subreq->start + subreq->transferred >= vp->scb.status.size) + __set_bit(NETFS_SREQ_HIT_EOF, &subreq->flags); + + call->unmarshall++; + fallthrough; + + case 5: + break; + } + + _leave(" = 0 [done]"); + return 0; +} + +/* + * YFS.FetchData64 operation type + */ +static const struct afs_call_type yfs_RXYFSFetchData64 = { + .name = "YFS.FetchData64", + .op = yfs_FS_FetchData64, + .async_rx = afs_fetch_data_async_rx, + .deliver = yfs_deliver_fs_fetch_data64, + .immediate_cancel = afs_fetch_data_immediate_cancel, + .destructor = afs_flat_call_destructor, +}; + +/* + * Fetch data from a file. + */ +void yfs_fs_fetch_data(struct afs_operation *op) +{ + struct netfs_io_subrequest *subreq = op->fetch.subreq; + struct afs_vnode_param *vp = &op->file[0]; + struct afs_call *call; + __be32 *bp; + + _enter(",%x,{%llx:%llu},%llx,%zx", + key_serial(op->key), vp->fid.vid, vp->fid.vnode, + subreq->start + subreq->transferred, + subreq->len - subreq->transferred); + + call = afs_alloc_flat_call(op->net, &yfs_RXYFSFetchData64, + sizeof(__be32) * 2 + + sizeof(struct yfs_xdr_YFSFid) + + sizeof(struct yfs_xdr_u64) * 2, + sizeof(struct yfs_xdr_YFSFetchStatus) + + sizeof(struct yfs_xdr_YFSCallBack) + + sizeof(struct yfs_xdr_YFSVolSync)); + if (!call) + return afs_op_nomem(op); + + if (op->flags & AFS_OPERATION_ASYNC) + call->async = true; + + /* marshall the parameters */ + bp = call->request; + bp = xdr_encode_u32(bp, YFSFETCHDATA64); + bp = xdr_encode_u32(bp, 0); /* RPC flags */ + bp = xdr_encode_YFSFid(bp, &vp->fid); + bp = xdr_encode_u64(bp, subreq->start + subreq->transferred); + bp = xdr_encode_u64(bp, subreq->len - subreq->transferred); + yfs_check_req(call, bp); + + call->fid = vp->fid; + trace_afs_make_fs_call(call, &vp->fid); + afs_make_op_call(op, call, GFP_NOFS); +} + +/* + * Deliver reply data for YFS.CreateFile or YFS.MakeDir. + */ +static int yfs_deliver_fs_create_vnode(struct afs_call *call) +{ + struct afs_operation *op = call->op; + struct afs_vnode_param *dvp = &op->file[0]; + struct afs_vnode_param *vp = &op->file[1]; + const __be32 *bp; + int ret; + + _enter("{%u}", call->unmarshall); + + ret = afs_transfer_reply(call); + if (ret < 0) + return ret; + + /* unmarshall the reply once we've received all of it */ + bp = call->buffer; + xdr_decode_YFSFid(&bp, &op->file[1].fid); + xdr_decode_YFSFetchStatus(&bp, call, &vp->scb); + xdr_decode_YFSFetchStatus(&bp, call, &dvp->scb); + xdr_decode_YFSCallBack(&bp, call, &vp->scb); + xdr_decode_YFSVolSync(&bp, &op->volsync); + + _leave(" = 0 [done]"); + return 0; +} + +/* + * FS.CreateFile and FS.MakeDir operation type + */ +static const struct afs_call_type afs_RXFSCreateFile = { + .name = "YFS.CreateFile", + .op = yfs_FS_CreateFile, + .deliver = yfs_deliver_fs_create_vnode, + .destructor = afs_flat_call_destructor, +}; + +/* + * Create a file. + */ +void yfs_fs_create_file(struct afs_operation *op) +{ + const struct qstr *name = &op->dentry->d_name; + struct afs_vnode_param *dvp = &op->file[0]; + struct afs_call *call; + size_t reqsz, rplsz; + __be32 *bp; + + _enter(""); + + reqsz = (sizeof(__be32) + + sizeof(__be32) + + sizeof(struct yfs_xdr_YFSFid) + + xdr_strlen(name->len) + + sizeof(struct yfs_xdr_YFSStoreStatus) + + sizeof(__be32)); + rplsz = (sizeof(struct yfs_xdr_YFSFid) + + sizeof(struct yfs_xdr_YFSFetchStatus) + + sizeof(struct yfs_xdr_YFSFetchStatus) + + sizeof(struct yfs_xdr_YFSCallBack) + + sizeof(struct yfs_xdr_YFSVolSync)); + + call = afs_alloc_flat_call(op->net, &afs_RXFSCreateFile, reqsz, rplsz); + if (!call) + return afs_op_nomem(op); + + /* marshall the parameters */ + bp = call->request; + bp = xdr_encode_u32(bp, YFSCREATEFILE); + bp = xdr_encode_u32(bp, 0); /* RPC flags */ + bp = xdr_encode_YFSFid(bp, &dvp->fid); + bp = xdr_encode_name(bp, name); + bp = xdr_encode_YFSStoreStatus(bp, &op->create.mode, &op->mtime); + bp = xdr_encode_u32(bp, yfs_LockNone); /* ViceLockType */ + yfs_check_req(call, bp); + + call->fid = dvp->fid; + trace_afs_make_fs_call1(call, &dvp->fid, name); + afs_make_op_call(op, call, GFP_NOFS); +} + +static const struct afs_call_type yfs_RXFSMakeDir = { + .name = "YFS.MakeDir", + .op = yfs_FS_MakeDir, + .deliver = yfs_deliver_fs_create_vnode, + .destructor = afs_flat_call_destructor, +}; + +/* + * Make a directory. + */ +void yfs_fs_make_dir(struct afs_operation *op) +{ + const struct qstr *name = &op->dentry->d_name; + struct afs_vnode_param *dvp = &op->file[0]; + struct afs_call *call; + size_t reqsz, rplsz; + __be32 *bp; + + _enter(""); + + reqsz = (sizeof(__be32) + + sizeof(struct yfs_xdr_RPCFlags) + + sizeof(struct yfs_xdr_YFSFid) + + xdr_strlen(name->len) + + sizeof(struct yfs_xdr_YFSStoreStatus)); + rplsz = (sizeof(struct yfs_xdr_YFSFid) + + sizeof(struct yfs_xdr_YFSFetchStatus) + + sizeof(struct yfs_xdr_YFSFetchStatus) + + sizeof(struct yfs_xdr_YFSCallBack) + + sizeof(struct yfs_xdr_YFSVolSync)); + + call = afs_alloc_flat_call(op->net, &yfs_RXFSMakeDir, reqsz, rplsz); + if (!call) + return afs_op_nomem(op); + + /* marshall the parameters */ + bp = call->request; + bp = xdr_encode_u32(bp, YFSMAKEDIR); + bp = xdr_encode_u32(bp, 0); /* RPC flags */ + bp = xdr_encode_YFSFid(bp, &dvp->fid); + bp = xdr_encode_name(bp, name); + bp = xdr_encode_YFSStoreStatus(bp, &op->create.mode, &op->mtime); + yfs_check_req(call, bp); + + call->fid = dvp->fid; + trace_afs_make_fs_call1(call, &dvp->fid, name); + afs_make_op_call(op, call, GFP_NOFS); +} + +/* + * Deliver reply data to a YFS.RemoveFile2 operation. + */ +static int yfs_deliver_fs_remove_file2(struct afs_call *call) +{ + struct afs_operation *op = call->op; + struct afs_vnode_param *dvp = &op->file[0]; + struct afs_vnode_param *vp = &op->file[1]; + struct afs_fid fid; + const __be32 *bp; + int ret; + + _enter("{%u}", call->unmarshall); + + ret = afs_transfer_reply(call); + if (ret < 0) + return ret; + + bp = call->buffer; + xdr_decode_YFSFetchStatus(&bp, call, &dvp->scb); + xdr_decode_YFSFid(&bp, &fid); + xdr_decode_YFSFetchStatus(&bp, call, &vp->scb); + /* Was deleted if vnode->status.abort_code == VNOVNODE. */ + + xdr_decode_YFSVolSync(&bp, &op->volsync); + return 0; +} + +static void yfs_done_fs_remove_file2(struct afs_call *call) +{ + if (call->error == -ECONNABORTED && + (call->abort_code == RX_INVALID_OPERATION || + call->abort_code == RXGEN_OPCODE)) { + set_bit(AFS_SERVER_FL_NO_RM2, &call->op->server->flags); + call->op->flags |= AFS_OPERATION_DOWNGRADE; + } +} + +/* + * YFS.RemoveFile2 operation type. + */ +static const struct afs_call_type yfs_RXYFSRemoveFile2 = { + .name = "YFS.RemoveFile2", + .op = yfs_FS_RemoveFile2, + .deliver = yfs_deliver_fs_remove_file2, + .done = yfs_done_fs_remove_file2, + .destructor = afs_flat_call_destructor, +}; + +/* + * Remove a file and retrieve new file status. + */ +void yfs_fs_remove_file2(struct afs_operation *op) +{ + struct afs_vnode_param *dvp = &op->file[0]; + const struct qstr *name = &op->dentry->d_name; + struct afs_call *call; + __be32 *bp; + + _enter(""); + + call = afs_alloc_flat_call(op->net, &yfs_RXYFSRemoveFile2, + sizeof(__be32) + + sizeof(struct yfs_xdr_RPCFlags) + + sizeof(struct yfs_xdr_YFSFid) + + xdr_strlen(name->len), + sizeof(struct yfs_xdr_YFSFetchStatus) + + sizeof(struct yfs_xdr_YFSFid) + + sizeof(struct yfs_xdr_YFSFetchStatus) + + sizeof(struct yfs_xdr_YFSVolSync)); + if (!call) + return afs_op_nomem(op); + + /* marshall the parameters */ + bp = call->request; + bp = xdr_encode_u32(bp, YFSREMOVEFILE2); + bp = xdr_encode_u32(bp, 0); /* RPC flags */ + bp = xdr_encode_YFSFid(bp, &dvp->fid); + bp = xdr_encode_name(bp, name); + yfs_check_req(call, bp); + + call->fid = dvp->fid; + trace_afs_make_fs_call1(call, &dvp->fid, name); + afs_make_op_call(op, call, GFP_NOFS); +} + +/* + * Deliver reply data to a YFS.RemoveFile or YFS.RemoveDir operation. + */ +static int yfs_deliver_fs_remove(struct afs_call *call) +{ + struct afs_operation *op = call->op; + struct afs_vnode_param *dvp = &op->file[0]; + const __be32 *bp; + int ret; + + _enter("{%u}", call->unmarshall); + + ret = afs_transfer_reply(call); + if (ret < 0) + return ret; + + bp = call->buffer; + xdr_decode_YFSFetchStatus(&bp, call, &dvp->scb); + xdr_decode_YFSVolSync(&bp, &op->volsync); + return 0; +} + +/* + * FS.RemoveDir and FS.RemoveFile operation types. + */ +static const struct afs_call_type yfs_RXYFSRemoveFile = { + .name = "YFS.RemoveFile", + .op = yfs_FS_RemoveFile, + .deliver = yfs_deliver_fs_remove, + .destructor = afs_flat_call_destructor, +}; + +/* + * Remove a file. + */ +void yfs_fs_remove_file(struct afs_operation *op) +{ + const struct qstr *name = &op->dentry->d_name; + struct afs_vnode_param *dvp = &op->file[0]; + struct afs_call *call; + __be32 *bp; + + _enter(""); + + if (!test_bit(AFS_SERVER_FL_NO_RM2, &op->server->flags)) + return yfs_fs_remove_file2(op); + + call = afs_alloc_flat_call(op->net, &yfs_RXYFSRemoveFile, + sizeof(__be32) + + sizeof(struct yfs_xdr_RPCFlags) + + sizeof(struct yfs_xdr_YFSFid) + + xdr_strlen(name->len), + sizeof(struct yfs_xdr_YFSFetchStatus) + + sizeof(struct yfs_xdr_YFSVolSync)); + if (!call) + return afs_op_nomem(op); + + /* marshall the parameters */ + bp = call->request; + bp = xdr_encode_u32(bp, YFSREMOVEFILE); + bp = xdr_encode_u32(bp, 0); /* RPC flags */ + bp = xdr_encode_YFSFid(bp, &dvp->fid); + bp = xdr_encode_name(bp, name); + yfs_check_req(call, bp); + + call->fid = dvp->fid; + trace_afs_make_fs_call1(call, &dvp->fid, name); + afs_make_op_call(op, call, GFP_NOFS); +} + +static const struct afs_call_type yfs_RXYFSRemoveDir = { + .name = "YFS.RemoveDir", + .op = yfs_FS_RemoveDir, + .deliver = yfs_deliver_fs_remove, + .destructor = afs_flat_call_destructor, +}; + +/* + * Remove a directory. + */ +void yfs_fs_remove_dir(struct afs_operation *op) +{ + const struct qstr *name = &op->dentry->d_name; + struct afs_vnode_param *dvp = &op->file[0]; + struct afs_call *call; + __be32 *bp; + + _enter(""); + + call = afs_alloc_flat_call(op->net, &yfs_RXYFSRemoveDir, + sizeof(__be32) + + sizeof(struct yfs_xdr_RPCFlags) + + sizeof(struct yfs_xdr_YFSFid) + + xdr_strlen(name->len), + sizeof(struct yfs_xdr_YFSFetchStatus) + + sizeof(struct yfs_xdr_YFSVolSync)); + if (!call) + return afs_op_nomem(op); + + /* marshall the parameters */ + bp = call->request; + bp = xdr_encode_u32(bp, YFSREMOVEDIR); + bp = xdr_encode_u32(bp, 0); /* RPC flags */ + bp = xdr_encode_YFSFid(bp, &dvp->fid); + bp = xdr_encode_name(bp, name); + yfs_check_req(call, bp); + + call->fid = dvp->fid; + trace_afs_make_fs_call1(call, &dvp->fid, name); + afs_make_op_call(op, call, GFP_NOFS); +} + +/* + * Deliver reply data to a YFS.Link operation. + */ +static int yfs_deliver_fs_link(struct afs_call *call) +{ + struct afs_operation *op = call->op; + struct afs_vnode_param *dvp = &op->file[0]; + struct afs_vnode_param *vp = &op->file[1]; + const __be32 *bp; + int ret; + + _enter("{%u}", call->unmarshall); + + ret = afs_transfer_reply(call); + if (ret < 0) + return ret; + + bp = call->buffer; + xdr_decode_YFSFetchStatus(&bp, call, &vp->scb); + xdr_decode_YFSFetchStatus(&bp, call, &dvp->scb); + xdr_decode_YFSVolSync(&bp, &op->volsync); + _leave(" = 0 [done]"); + return 0; +} + +/* + * YFS.Link operation type. + */ +static const struct afs_call_type yfs_RXYFSLink = { + .name = "YFS.Link", + .op = yfs_FS_Link, + .deliver = yfs_deliver_fs_link, + .destructor = afs_flat_call_destructor, +}; + +/* + * Make a hard link. + */ +void yfs_fs_link(struct afs_operation *op) +{ + const struct qstr *name = &op->dentry->d_name; + struct afs_vnode_param *dvp = &op->file[0]; + struct afs_vnode_param *vp = &op->file[1]; + struct afs_call *call; + __be32 *bp; + + _enter(""); + + call = afs_alloc_flat_call(op->net, &yfs_RXYFSLink, + sizeof(__be32) + + sizeof(struct yfs_xdr_RPCFlags) + + sizeof(struct yfs_xdr_YFSFid) + + xdr_strlen(name->len) + + sizeof(struct yfs_xdr_YFSFid), + sizeof(struct yfs_xdr_YFSFetchStatus) + + sizeof(struct yfs_xdr_YFSFetchStatus) + + sizeof(struct yfs_xdr_YFSVolSync)); + if (!call) + return afs_op_nomem(op); + + /* marshall the parameters */ + bp = call->request; + bp = xdr_encode_u32(bp, YFSLINK); + bp = xdr_encode_u32(bp, 0); /* RPC flags */ + bp = xdr_encode_YFSFid(bp, &dvp->fid); + bp = xdr_encode_name(bp, name); + bp = xdr_encode_YFSFid(bp, &vp->fid); + yfs_check_req(call, bp); + + call->fid = vp->fid; + trace_afs_make_fs_call1(call, &vp->fid, name); + afs_make_op_call(op, call, GFP_NOFS); +} + +/* + * Deliver reply data to a YFS.Symlink operation. + */ +static int yfs_deliver_fs_symlink(struct afs_call *call) +{ + struct afs_operation *op = call->op; + struct afs_vnode_param *dvp = &op->file[0]; + struct afs_vnode_param *vp = &op->file[1]; + const __be32 *bp; + int ret; + + _enter("{%u}", call->unmarshall); + + ret = afs_transfer_reply(call); + if (ret < 0) + return ret; + + /* unmarshall the reply once we've received all of it */ + bp = call->buffer; + xdr_decode_YFSFid(&bp, &vp->fid); + xdr_decode_YFSFetchStatus(&bp, call, &vp->scb); + xdr_decode_YFSFetchStatus(&bp, call, &dvp->scb); + xdr_decode_YFSVolSync(&bp, &op->volsync); + + _leave(" = 0 [done]"); + return 0; +} + +/* + * YFS.Symlink operation type + */ +static const struct afs_call_type yfs_RXYFSSymlink = { + .name = "YFS.Symlink", + .op = yfs_FS_Symlink, + .deliver = yfs_deliver_fs_symlink, + .destructor = afs_flat_call_destructor, +}; + +/* + * Create a symbolic link. + */ +void yfs_fs_symlink(struct afs_operation *op) +{ + const struct qstr *name = &op->dentry->d_name; + struct afs_vnode_param *dvp = &op->file[0]; + struct afs_call *call; + size_t contents_sz; + mode_t mode = 0777; + __be32 *bp; + + _enter(""); + + contents_sz = strlen(op->create.symlink); + call = afs_alloc_flat_call(op->net, &yfs_RXYFSSymlink, + sizeof(__be32) + + sizeof(struct yfs_xdr_RPCFlags) + + sizeof(struct yfs_xdr_YFSFid) + + xdr_strlen(name->len) + + xdr_strlen(contents_sz) + + sizeof(struct yfs_xdr_YFSStoreStatus), + sizeof(struct yfs_xdr_YFSFid) + + sizeof(struct yfs_xdr_YFSFetchStatus) + + sizeof(struct yfs_xdr_YFSFetchStatus) + + sizeof(struct yfs_xdr_YFSVolSync)); + if (!call) + return afs_op_nomem(op); + + /* marshall the parameters */ + bp = call->request; + bp = xdr_encode_u32(bp, YFSSYMLINK); + bp = xdr_encode_u32(bp, 0); /* RPC flags */ + bp = xdr_encode_YFSFid(bp, &dvp->fid); + bp = xdr_encode_name(bp, name); + bp = xdr_encode_string(bp, op->create.symlink, contents_sz); + bp = xdr_encode_YFSStoreStatus(bp, &mode, &op->mtime); + yfs_check_req(call, bp); + + call->fid = dvp->fid; + trace_afs_make_fs_call1(call, &dvp->fid, name); + afs_make_op_call(op, call, GFP_NOFS); +} + +/* + * Deliver reply data to a YFS.Rename operation. + */ +static int yfs_deliver_fs_rename(struct afs_call *call) +{ + struct afs_operation *op = call->op; + struct afs_vnode_param *orig_dvp = &op->file[0]; + struct afs_vnode_param *new_dvp = &op->file[1]; + const __be32 *bp; + int ret; + + _enter("{%u}", call->unmarshall); + + ret = afs_transfer_reply(call); + if (ret < 0) + return ret; + + bp = call->buffer; + /* If the two dirs are the same, we have two copies of the same status + * report, so we just decode it twice. + */ + xdr_decode_YFSFetchStatus(&bp, call, &orig_dvp->scb); + xdr_decode_YFSFetchStatus(&bp, call, &new_dvp->scb); + xdr_decode_YFSVolSync(&bp, &op->volsync); + _leave(" = 0 [done]"); + return 0; +} + +/* + * YFS.Rename operation type + */ +static const struct afs_call_type yfs_RXYFSRename = { + .name = "FS.Rename", + .op = yfs_FS_Rename, + .deliver = yfs_deliver_fs_rename, + .destructor = afs_flat_call_destructor, +}; + +/* + * Rename a file or directory. + */ +void yfs_fs_rename(struct afs_operation *op) +{ + struct afs_vnode_param *orig_dvp = &op->file[0]; + struct afs_vnode_param *new_dvp = &op->file[1]; + const struct qstr *orig_name = &op->dentry->d_name; + const struct qstr *new_name = &op->dentry_2->d_name; + struct afs_call *call; + __be32 *bp; + + _enter(""); + + if (!test_bit(AFS_SERVER_FL_NO_RENAME2, &op->server->flags)) + return yfs_fs_rename_replace(op); + + call = afs_alloc_flat_call(op->net, &yfs_RXYFSRename, + sizeof(__be32) + + sizeof(struct yfs_xdr_RPCFlags) + + sizeof(struct yfs_xdr_YFSFid) + + xdr_strlen(orig_name->len) + + sizeof(struct yfs_xdr_YFSFid) + + xdr_strlen(new_name->len), + sizeof(struct yfs_xdr_YFSFetchStatus) + + sizeof(struct yfs_xdr_YFSFetchStatus) + + sizeof(struct yfs_xdr_YFSVolSync)); + if (!call) + return afs_op_nomem(op); + + /* marshall the parameters */ + bp = call->request; + bp = xdr_encode_u32(bp, YFSRENAME); + bp = xdr_encode_u32(bp, 0); /* RPC flags */ + bp = xdr_encode_YFSFid(bp, &orig_dvp->fid); + bp = xdr_encode_name(bp, orig_name); + bp = xdr_encode_YFSFid(bp, &new_dvp->fid); + bp = xdr_encode_name(bp, new_name); + yfs_check_req(call, bp); + + call->fid = orig_dvp->fid; + trace_afs_make_fs_call2(call, &orig_dvp->fid, orig_name, new_name); + afs_make_op_call(op, call, GFP_NOFS); +} + +/* + * Deliver reply data to a YFS.Rename_NoReplace operation. This does not + * return the status of a displaced target inode as there cannot be one. + */ +static int yfs_deliver_fs_rename_1(struct afs_call *call) +{ + struct afs_operation *op = call->op; + struct afs_vnode_param *orig_dvp = &op->file[0]; + struct afs_vnode_param *new_dvp = &op->file[1]; + struct afs_vnode_param *old_vp = &op->more_files[0]; + const __be32 *bp; + int ret; + + _enter("{%u}", call->unmarshall); + + ret = afs_transfer_reply(call); + if (ret < 0) + return ret; + + bp = call->buffer; + /* If the two dirs are the same, we have two copies of the same status + * report, so we just decode it twice. + */ + xdr_decode_YFSFetchStatus(&bp, call, &orig_dvp->scb); + xdr_decode_YFSFid(&bp, &old_vp->fid); + xdr_decode_YFSFetchStatus(&bp, call, &old_vp->scb); + xdr_decode_YFSFetchStatus(&bp, call, &new_dvp->scb); + xdr_decode_YFSVolSync(&bp, &op->volsync); + _leave(" = 0 [done]"); + return 0; +} + +/* + * Deliver reply data to a YFS.Rename_Replace or a YFS.Rename_Exchange + * operation. These return the status of the displaced target inode if there + * was one. + */ +static int yfs_deliver_fs_rename_2(struct afs_call *call) +{ + struct afs_operation *op = call->op; + struct afs_vnode_param *orig_dvp = &op->file[0]; + struct afs_vnode_param *new_dvp = &op->file[1]; + struct afs_vnode_param *old_vp = &op->more_files[0]; + struct afs_vnode_param *new_vp = &op->more_files[1]; + const __be32 *bp; + int ret; + + _enter("{%u}", call->unmarshall); + + ret = afs_transfer_reply(call); + if (ret < 0) + return ret; + + bp = call->buffer; + /* If the two dirs are the same, we have two copies of the same status + * report, so we just decode it twice. + */ + xdr_decode_YFSFetchStatus(&bp, call, &orig_dvp->scb); + xdr_decode_YFSFid(&bp, &old_vp->fid); + xdr_decode_YFSFetchStatus(&bp, call, &old_vp->scb); + xdr_decode_YFSFetchStatus(&bp, call, &new_dvp->scb); + xdr_decode_YFSFid(&bp, &new_vp->fid); + xdr_decode_YFSFetchStatus(&bp, call, &new_vp->scb); + xdr_decode_YFSVolSync(&bp, &op->volsync); + _leave(" = 0 [done]"); + return 0; +} + +static void yfs_done_fs_rename_replace(struct afs_call *call) +{ + if (call->error == -ECONNABORTED && + (call->abort_code == RX_INVALID_OPERATION || + call->abort_code == RXGEN_OPCODE)) { + set_bit(AFS_SERVER_FL_NO_RENAME2, &call->op->server->flags); + call->op->flags |= AFS_OPERATION_DOWNGRADE; + } +} + +/* + * YFS.Rename_Replace operation type + */ +static const struct afs_call_type yfs_RXYFSRename_Replace = { + .name = "FS.Rename_Replace", + .op = yfs_FS_Rename_Replace, + .deliver = yfs_deliver_fs_rename_2, + .done = yfs_done_fs_rename_replace, + .destructor = afs_flat_call_destructor, +}; + +/* + * YFS.Rename_NoReplace operation type + */ +static const struct afs_call_type yfs_RXYFSRename_NoReplace = { + .name = "FS.Rename_NoReplace", + .op = yfs_FS_Rename_NoReplace, + .deliver = yfs_deliver_fs_rename_1, + .destructor = afs_flat_call_destructor, +}; + +/* + * YFS.Rename_Exchange operation type + */ +static const struct afs_call_type yfs_RXYFSRename_Exchange = { + .name = "FS.Rename_Exchange", + .op = yfs_FS_Rename_Exchange, + .deliver = yfs_deliver_fs_rename_2, + .destructor = afs_flat_call_destructor, +}; + +/* + * Rename a file or directory, replacing the target if it exists. The status + * of a displaced target is returned. + */ +void yfs_fs_rename_replace(struct afs_operation *op) +{ + struct afs_vnode_param *orig_dvp = &op->file[0]; + struct afs_vnode_param *new_dvp = &op->file[1]; + const struct qstr *orig_name = &op->dentry->d_name; + const struct qstr *new_name = &op->dentry_2->d_name; + struct afs_call *call; + __be32 *bp; + + _enter(""); + + call = afs_alloc_flat_call(op->net, &yfs_RXYFSRename_Replace, + sizeof(__be32) + + sizeof(struct yfs_xdr_RPCFlags) + + sizeof(struct yfs_xdr_YFSFid) + + xdr_strlen(orig_name->len) + + sizeof(struct yfs_xdr_YFSFid) + + xdr_strlen(new_name->len), + sizeof(struct yfs_xdr_YFSFetchStatus) + + sizeof(struct yfs_xdr_YFSFid) + + sizeof(struct yfs_xdr_YFSFetchStatus) + + sizeof(struct yfs_xdr_YFSFetchStatus) + + sizeof(struct yfs_xdr_YFSFid) + + sizeof(struct yfs_xdr_YFSFetchStatus) + + sizeof(struct yfs_xdr_YFSVolSync)); + if (!call) + return afs_op_nomem(op); + + /* Marshall the parameters. */ + bp = call->request; + bp = xdr_encode_u32(bp, YFSRENAME_REPLACE); + bp = xdr_encode_u32(bp, 0); /* RPC flags */ + bp = xdr_encode_YFSFid(bp, &orig_dvp->fid); + bp = xdr_encode_name(bp, orig_name); + bp = xdr_encode_YFSFid(bp, &new_dvp->fid); + bp = xdr_encode_name(bp, new_name); + yfs_check_req(call, bp); + + call->fid = orig_dvp->fid; + trace_afs_make_fs_call2(call, &orig_dvp->fid, orig_name, new_name); + afs_make_op_call(op, call, GFP_NOFS); +} + +/* + * Rename a file or directory, failing if the target dirent exists. + */ +void yfs_fs_rename_noreplace(struct afs_operation *op) +{ + struct afs_vnode_param *orig_dvp = &op->file[0]; + struct afs_vnode_param *new_dvp = &op->file[1]; + const struct qstr *orig_name = &op->dentry->d_name; + const struct qstr *new_name = &op->dentry_2->d_name; + struct afs_call *call; + __be32 *bp; + + _enter(""); + + call = afs_alloc_flat_call(op->net, &yfs_RXYFSRename_NoReplace, + sizeof(__be32) + + sizeof(struct yfs_xdr_RPCFlags) + + sizeof(struct yfs_xdr_YFSFid) + + xdr_strlen(orig_name->len) + + sizeof(struct yfs_xdr_YFSFid) + + xdr_strlen(new_name->len), + sizeof(struct yfs_xdr_YFSFetchStatus) + + sizeof(struct yfs_xdr_YFSFid) + + sizeof(struct yfs_xdr_YFSFetchStatus) + + sizeof(struct yfs_xdr_YFSFetchStatus) + + sizeof(struct yfs_xdr_YFSVolSync)); + if (!call) + return afs_op_nomem(op); + + /* Marshall the parameters. */ + bp = call->request; + bp = xdr_encode_u32(bp, YFSRENAME_NOREPLACE); + bp = xdr_encode_u32(bp, 0); /* RPC flags */ + bp = xdr_encode_YFSFid(bp, &orig_dvp->fid); + bp = xdr_encode_name(bp, orig_name); + bp = xdr_encode_YFSFid(bp, &new_dvp->fid); + bp = xdr_encode_name(bp, new_name); + yfs_check_req(call, bp); + + call->fid = orig_dvp->fid; + trace_afs_make_fs_call2(call, &orig_dvp->fid, orig_name, new_name); + afs_make_op_call(op, call, GFP_NOFS); +} + +/* + * Exchange a pair of files directories. + */ +void yfs_fs_rename_exchange(struct afs_operation *op) +{ + struct afs_vnode_param *orig_dvp = &op->file[0]; + struct afs_vnode_param *new_dvp = &op->file[1]; + const struct qstr *orig_name = &op->dentry->d_name; + const struct qstr *new_name = &op->dentry_2->d_name; + struct afs_call *call; + __be32 *bp; + + _enter(""); + + call = afs_alloc_flat_call(op->net, &yfs_RXYFSRename_Exchange, + sizeof(__be32) + + sizeof(struct yfs_xdr_RPCFlags) + + sizeof(struct yfs_xdr_YFSFid) + + xdr_strlen(orig_name->len) + + sizeof(struct yfs_xdr_YFSFid) + + xdr_strlen(new_name->len), + sizeof(struct yfs_xdr_YFSFetchStatus) + + sizeof(struct yfs_xdr_YFSFid) + + sizeof(struct yfs_xdr_YFSFetchStatus) + + sizeof(struct yfs_xdr_YFSFetchStatus) + + sizeof(struct yfs_xdr_YFSFid) + + sizeof(struct yfs_xdr_YFSFetchStatus) + + sizeof(struct yfs_xdr_YFSVolSync)); + if (!call) + return afs_op_nomem(op); + + /* Marshall the parameters. */ + bp = call->request; + bp = xdr_encode_u32(bp, YFSRENAME_EXCHANGE); + bp = xdr_encode_u32(bp, 0); /* RPC flags */ + bp = xdr_encode_YFSFid(bp, &orig_dvp->fid); + bp = xdr_encode_name(bp, orig_name); + bp = xdr_encode_YFSFid(bp, &new_dvp->fid); + bp = xdr_encode_name(bp, new_name); + yfs_check_req(call, bp); + + call->fid = orig_dvp->fid; + trace_afs_make_fs_call2(call, &orig_dvp->fid, orig_name, new_name); + afs_make_op_call(op, call, GFP_NOFS); +} + +/* + * YFS.StoreData64 operation type. + */ +static const struct afs_call_type yfs_RXYFSStoreData64 = { + .name = "YFS.StoreData64", + .op = yfs_FS_StoreData64, + .deliver = yfs_deliver_status_and_volsync, + .destructor = afs_flat_call_destructor, +}; + +/* + * Store a set of pages to a large file. + */ +void yfs_fs_store_data(struct afs_operation *op) +{ + struct afs_vnode_param *vp = &op->file[0]; + struct afs_call *call; + __be32 *bp; + + _enter(",%x,{%llx:%llu},,", + key_serial(op->key), vp->fid.vid, vp->fid.vnode); + + _debug("size %llx, at %llx, i_size %llx", + (unsigned long long)op->store.size, + (unsigned long long)op->store.pos, + (unsigned long long)op->store.i_size); + + call = afs_alloc_flat_call(op->net, &yfs_RXYFSStoreData64, + sizeof(__be32) + + sizeof(__be32) + + sizeof(struct yfs_xdr_YFSFid) + + sizeof(struct yfs_xdr_YFSStoreStatus) + + sizeof(struct yfs_xdr_u64) * 3, + sizeof(struct yfs_xdr_YFSFetchStatus) + + sizeof(struct yfs_xdr_YFSVolSync)); + if (!call) + return afs_op_nomem(op); + + call->write_iter = op->store.write_iter; + + /* marshall the parameters */ + bp = call->request; + bp = xdr_encode_u32(bp, YFSSTOREDATA64); + bp = xdr_encode_u32(bp, 0); /* RPC flags */ + bp = xdr_encode_YFSFid(bp, &vp->fid); + bp = xdr_encode_YFSStoreStatus(bp, NULL, &op->mtime); + bp = xdr_encode_u64(bp, op->store.pos); + bp = xdr_encode_u64(bp, op->store.size); + bp = xdr_encode_u64(bp, op->store.i_size); + yfs_check_req(call, bp); + + call->fid = vp->fid; + trace_afs_make_fs_call(call, &vp->fid); + afs_make_op_call(op, call, GFP_NOFS); +} + +/* + * YFS.StoreStatus operation type + */ +static const struct afs_call_type yfs_RXYFSStoreStatus = { + .name = "YFS.StoreStatus", + .op = yfs_FS_StoreStatus, + .deliver = yfs_deliver_status_and_volsync, + .destructor = afs_flat_call_destructor, +}; + +static const struct afs_call_type yfs_RXYFSStoreData64_as_Status = { + .name = "YFS.StoreData64", + .op = yfs_FS_StoreData64, + .deliver = yfs_deliver_status_and_volsync, + .destructor = afs_flat_call_destructor, +}; + +/* + * Set the attributes on a file, using YFS.StoreData64 rather than + * YFS.StoreStatus so as to alter the file size also. + */ +static void yfs_fs_setattr_size(struct afs_operation *op) +{ + struct afs_vnode_param *vp = &op->file[0]; + struct afs_call *call; + struct iattr *attr = op->setattr.attr; + __be32 *bp; + + _enter(",%x,{%llx:%llu},,", + key_serial(op->key), vp->fid.vid, vp->fid.vnode); + + call = afs_alloc_flat_call(op->net, &yfs_RXYFSStoreData64_as_Status, + sizeof(__be32) * 2 + + sizeof(struct yfs_xdr_YFSFid) + + sizeof(struct yfs_xdr_YFSStoreStatus) + + sizeof(struct yfs_xdr_u64) * 3, + sizeof(struct yfs_xdr_YFSFetchStatus) + + sizeof(struct yfs_xdr_YFSVolSync)); + if (!call) + return afs_op_nomem(op); + + /* marshall the parameters */ + bp = call->request; + bp = xdr_encode_u32(bp, YFSSTOREDATA64); + bp = xdr_encode_u32(bp, 0); /* RPC flags */ + bp = xdr_encode_YFSFid(bp, &vp->fid); + bp = xdr_encode_YFS_StoreStatus(bp, attr); + bp = xdr_encode_u64(bp, attr->ia_size); /* position of start of write */ + bp = xdr_encode_u64(bp, 0); /* size of write */ + bp = xdr_encode_u64(bp, attr->ia_size); /* new file length */ + yfs_check_req(call, bp); + + call->fid = vp->fid; + trace_afs_make_fs_call(call, &vp->fid); + afs_make_op_call(op, call, GFP_NOFS); +} + +/* + * Set the attributes on a file, using YFS.StoreData64 if there's a change in + * file size, and YFS.StoreStatus otherwise. + */ +void yfs_fs_setattr(struct afs_operation *op) +{ + struct afs_vnode_param *vp = &op->file[0]; + struct afs_call *call; + struct iattr *attr = op->setattr.attr; + __be32 *bp; + + if (attr->ia_valid & ATTR_SIZE) + return yfs_fs_setattr_size(op); + + _enter(",%x,{%llx:%llu},,", + key_serial(op->key), vp->fid.vid, vp->fid.vnode); + + call = afs_alloc_flat_call(op->net, &yfs_RXYFSStoreStatus, + sizeof(__be32) * 2 + + sizeof(struct yfs_xdr_YFSFid) + + sizeof(struct yfs_xdr_YFSStoreStatus), + sizeof(struct yfs_xdr_YFSFetchStatus) + + sizeof(struct yfs_xdr_YFSVolSync)); + if (!call) + return afs_op_nomem(op); + + /* marshall the parameters */ + bp = call->request; + bp = xdr_encode_u32(bp, YFSSTORESTATUS); + bp = xdr_encode_u32(bp, 0); /* RPC flags */ + bp = xdr_encode_YFSFid(bp, &vp->fid); + bp = xdr_encode_YFS_StoreStatus(bp, attr); + yfs_check_req(call, bp); + + call->fid = vp->fid; + trace_afs_make_fs_call(call, &vp->fid); + afs_make_op_call(op, call, GFP_NOFS); +} + +/* + * Deliver reply data to a YFS.GetVolumeStatus operation. + */ +static int yfs_deliver_fs_get_volume_status(struct afs_call *call) +{ + struct afs_operation *op = call->op; + const __be32 *bp; + char *p; + u32 size; + int ret; + + _enter("{%u}", call->unmarshall); + + switch (call->unmarshall) { + case 0: + call->unmarshall++; + afs_extract_to_buf(call, sizeof(struct yfs_xdr_YFSFetchVolumeStatus)); + fallthrough; + + /* extract the returned status record */ + case 1: + _debug("extract status"); + ret = afs_extract_data(call, true); + if (ret < 0) + return ret; + + bp = call->buffer; + xdr_decode_YFSFetchVolumeStatus(&bp, &op->volstatus.vs); + call->unmarshall++; + afs_extract_to_tmp(call); + fallthrough; + + /* extract the volume name length */ + case 2: + ret = afs_extract_data(call, true); + if (ret < 0) + return ret; + + call->count = ntohl(call->tmp); + _debug("volname length: %u", call->count); + if (call->count >= AFSNAMEMAX) + return afs_protocol_error(call, afs_eproto_volname_len); + size = (call->count + 3) & ~3; /* It's padded */ + afs_extract_to_buf(call, size); + call->unmarshall++; + fallthrough; + + /* extract the volume name */ + case 3: + _debug("extract volname"); + ret = afs_extract_data(call, true); + if (ret < 0) + return ret; + + p = call->buffer; + p[call->count] = 0; + _debug("volname '%s'", p); + afs_extract_to_tmp(call); + call->unmarshall++; + fallthrough; + + /* extract the offline message length */ + case 4: + ret = afs_extract_data(call, true); + if (ret < 0) + return ret; + + call->count = ntohl(call->tmp); + _debug("offline msg length: %u", call->count); + if (call->count >= AFSNAMEMAX) + return afs_protocol_error(call, afs_eproto_offline_msg_len); + size = (call->count + 3) & ~3; /* It's padded */ + afs_extract_to_buf(call, size); + call->unmarshall++; + fallthrough; + + /* extract the offline message */ + case 5: + _debug("extract offline"); + ret = afs_extract_data(call, true); + if (ret < 0) + return ret; + + p = call->buffer; + p[call->count] = 0; + _debug("offline '%s'", p); + + afs_extract_to_tmp(call); + call->unmarshall++; + fallthrough; + + /* extract the message of the day length */ + case 6: + ret = afs_extract_data(call, true); + if (ret < 0) + return ret; + + call->count = ntohl(call->tmp); + _debug("motd length: %u", call->count); + if (call->count >= AFSNAMEMAX) + return afs_protocol_error(call, afs_eproto_motd_len); + size = (call->count + 3) & ~3; /* It's padded */ + afs_extract_to_buf(call, size); + call->unmarshall++; + fallthrough; + + /* extract the message of the day */ + case 7: + _debug("extract motd"); + ret = afs_extract_data(call, false); + if (ret < 0) + return ret; + + p = call->buffer; + p[call->count] = 0; + _debug("motd '%s'", p); + + call->unmarshall++; + fallthrough; + + case 8: + break; + } + + _leave(" = 0 [done]"); + return 0; +} + +/* + * YFS.GetVolumeStatus operation type + */ +static const struct afs_call_type yfs_RXYFSGetVolumeStatus = { + .name = "YFS.GetVolumeStatus", + .op = yfs_FS_GetVolumeStatus, + .deliver = yfs_deliver_fs_get_volume_status, + .destructor = afs_flat_call_destructor, +}; + +/* + * fetch the status of a volume + */ +void yfs_fs_get_volume_status(struct afs_operation *op) +{ + struct afs_vnode_param *vp = &op->file[0]; + struct afs_call *call; + __be32 *bp; + + _enter(""); + + call = afs_alloc_flat_call(op->net, &yfs_RXYFSGetVolumeStatus, + sizeof(__be32) * 2 + + sizeof(struct yfs_xdr_u64), + max_t(size_t, + sizeof(struct yfs_xdr_YFSFetchVolumeStatus) + + sizeof(__be32), + AFSOPAQUEMAX + 1)); + if (!call) + return afs_op_nomem(op); + + /* marshall the parameters */ + bp = call->request; + bp = xdr_encode_u32(bp, YFSGETVOLUMESTATUS); + bp = xdr_encode_u32(bp, 0); /* RPC flags */ + bp = xdr_encode_u64(bp, vp->fid.vid); + yfs_check_req(call, bp); + + call->fid = vp->fid; + trace_afs_make_fs_call(call, &vp->fid); + afs_make_op_call(op, call, GFP_NOFS); +} + +/* + * YFS.SetLock operation type + */ +static const struct afs_call_type yfs_RXYFSSetLock = { + .name = "YFS.SetLock", + .op = yfs_FS_SetLock, + .deliver = yfs_deliver_status_and_volsync, + .done = afs_lock_op_done, + .destructor = afs_flat_call_destructor, +}; + +/* + * YFS.ExtendLock operation type + */ +static const struct afs_call_type yfs_RXYFSExtendLock = { + .name = "YFS.ExtendLock", + .op = yfs_FS_ExtendLock, + .deliver = yfs_deliver_status_and_volsync, + .done = afs_lock_op_done, + .destructor = afs_flat_call_destructor, +}; + +/* + * YFS.ReleaseLock operation type + */ +static const struct afs_call_type yfs_RXYFSReleaseLock = { + .name = "YFS.ReleaseLock", + .op = yfs_FS_ReleaseLock, + .deliver = yfs_deliver_status_and_volsync, + .destructor = afs_flat_call_destructor, +}; + +/* + * Set a lock on a file + */ +void yfs_fs_set_lock(struct afs_operation *op) +{ + struct afs_vnode_param *vp = &op->file[0]; + struct afs_call *call; + __be32 *bp; + + _enter(""); + + call = afs_alloc_flat_call(op->net, &yfs_RXYFSSetLock, + sizeof(__be32) * 2 + + sizeof(struct yfs_xdr_YFSFid) + + sizeof(__be32), + sizeof(struct yfs_xdr_YFSFetchStatus) + + sizeof(struct yfs_xdr_YFSVolSync)); + if (!call) + return afs_op_nomem(op); + + /* marshall the parameters */ + bp = call->request; + bp = xdr_encode_u32(bp, YFSSETLOCK); + bp = xdr_encode_u32(bp, 0); /* RPC flags */ + bp = xdr_encode_YFSFid(bp, &vp->fid); + bp = xdr_encode_u32(bp, op->lock.type); + yfs_check_req(call, bp); + + call->fid = vp->fid; + trace_afs_make_fs_calli(call, &vp->fid, op->lock.type); + afs_make_op_call(op, call, GFP_NOFS); +} + +/* + * extend a lock on a file + */ +void yfs_fs_extend_lock(struct afs_operation *op) +{ + struct afs_vnode_param *vp = &op->file[0]; + struct afs_call *call; + __be32 *bp; + + _enter(""); + + call = afs_alloc_flat_call(op->net, &yfs_RXYFSExtendLock, + sizeof(__be32) * 2 + + sizeof(struct yfs_xdr_YFSFid), + sizeof(struct yfs_xdr_YFSFetchStatus) + + sizeof(struct yfs_xdr_YFSVolSync)); + if (!call) + return afs_op_nomem(op); + + /* marshall the parameters */ + bp = call->request; + bp = xdr_encode_u32(bp, YFSEXTENDLOCK); + bp = xdr_encode_u32(bp, 0); /* RPC flags */ + bp = xdr_encode_YFSFid(bp, &vp->fid); + yfs_check_req(call, bp); + + call->fid = vp->fid; + trace_afs_make_fs_call(call, &vp->fid); + afs_make_op_call(op, call, GFP_NOFS); +} + +/* + * release a lock on a file + */ +void yfs_fs_release_lock(struct afs_operation *op) +{ + struct afs_vnode_param *vp = &op->file[0]; + struct afs_call *call; + __be32 *bp; + + _enter(""); + + call = afs_alloc_flat_call(op->net, &yfs_RXYFSReleaseLock, + sizeof(__be32) * 2 + + sizeof(struct yfs_xdr_YFSFid), + sizeof(struct yfs_xdr_YFSFetchStatus) + + sizeof(struct yfs_xdr_YFSVolSync)); + if (!call) + return afs_op_nomem(op); + + /* marshall the parameters */ + bp = call->request; + bp = xdr_encode_u32(bp, YFSRELEASELOCK); + bp = xdr_encode_u32(bp, 0); /* RPC flags */ + bp = xdr_encode_YFSFid(bp, &vp->fid); + yfs_check_req(call, bp); + + call->fid = vp->fid; + trace_afs_make_fs_call(call, &vp->fid); + afs_make_op_call(op, call, GFP_NOFS); +} + +/* + * Deliver a reply to YFS.FetchStatus + */ +static int yfs_deliver_fs_fetch_status(struct afs_call *call) +{ + struct afs_operation *op = call->op; + struct afs_vnode_param *vp = &op->file[op->fetch_status.which]; + const __be32 *bp; + int ret; + + ret = afs_transfer_reply(call); + if (ret < 0) + return ret; + + /* unmarshall the reply once we've received all of it */ + bp = call->buffer; + xdr_decode_YFSFetchStatus(&bp, call, &vp->scb); + xdr_decode_YFSCallBack(&bp, call, &vp->scb); + xdr_decode_YFSVolSync(&bp, &op->volsync); + + _leave(" = 0 [done]"); + return 0; +} + +/* + * YFS.FetchStatus operation type + */ +static const struct afs_call_type yfs_RXYFSFetchStatus = { + .name = "YFS.FetchStatus", + .op = yfs_FS_FetchStatus, + .deliver = yfs_deliver_fs_fetch_status, + .destructor = afs_flat_call_destructor, +}; + +/* + * Fetch the status information for a fid without needing a vnode handle. + */ +void yfs_fs_fetch_status(struct afs_operation *op) +{ + struct afs_vnode_param *vp = &op->file[op->fetch_status.which]; + struct afs_call *call; + __be32 *bp; + + _enter(",%x,{%llx:%llu},,", + key_serial(op->key), vp->fid.vid, vp->fid.vnode); + + call = afs_alloc_flat_call(op->net, &yfs_RXYFSFetchStatus, + sizeof(__be32) * 2 + + sizeof(struct yfs_xdr_YFSFid), + sizeof(struct yfs_xdr_YFSFetchStatus) + + sizeof(struct yfs_xdr_YFSCallBack) + + sizeof(struct yfs_xdr_YFSVolSync)); + if (!call) + return afs_op_nomem(op); + + /* marshall the parameters */ + bp = call->request; + bp = xdr_encode_u32(bp, YFSFETCHSTATUS); + bp = xdr_encode_u32(bp, 0); /* RPC flags */ + bp = xdr_encode_YFSFid(bp, &vp->fid); + yfs_check_req(call, bp); + + call->fid = vp->fid; + trace_afs_make_fs_call(call, &vp->fid); + afs_make_op_call(op, call, GFP_NOFS); +} + +/* + * Deliver reply data to an YFS.InlineBulkStatus call + */ +static int yfs_deliver_fs_inline_bulk_status(struct afs_call *call) +{ + struct afs_operation *op = call->op; + struct afs_status_cb *scb; + const __be32 *bp; + u32 tmp; + int ret; + + _enter("{%u}", call->unmarshall); + + switch (call->unmarshall) { + case 0: + afs_extract_to_tmp(call); + call->unmarshall++; + fallthrough; + + /* Extract the file status count and array in two steps */ + case 1: + _debug("extract status count"); + ret = afs_extract_data(call, true); + if (ret < 0) + return ret; + + tmp = ntohl(call->tmp); + _debug("status count: %u/%u", tmp, op->nr_files); + if (tmp != op->nr_files) + return afs_protocol_error(call, afs_eproto_ibulkst_count); + + call->count = 0; + call->unmarshall++; + more_counts: + afs_extract_to_buf(call, sizeof(struct yfs_xdr_YFSFetchStatus)); + fallthrough; + + case 2: + _debug("extract status array %u", call->count); + ret = afs_extract_data(call, true); + if (ret < 0) + return ret; + + switch (call->count) { + case 0: + scb = &op->file[0].scb; + break; + case 1: + scb = &op->file[1].scb; + break; + default: + scb = &op->more_files[call->count - 2].scb; + break; + } + + bp = call->buffer; + xdr_decode_YFSFetchStatus(&bp, call, scb); + + call->count++; + if (call->count < op->nr_files) + goto more_counts; + + call->count = 0; + call->unmarshall++; + afs_extract_to_tmp(call); + fallthrough; + + /* Extract the callback count and array in two steps */ + case 3: + _debug("extract CB count"); + ret = afs_extract_data(call, true); + if (ret < 0) + return ret; + + tmp = ntohl(call->tmp); + _debug("CB count: %u", tmp); + if (tmp != op->nr_files) + return afs_protocol_error(call, afs_eproto_ibulkst_cb_count); + call->count = 0; + call->unmarshall++; + more_cbs: + afs_extract_to_buf(call, sizeof(struct yfs_xdr_YFSCallBack)); + fallthrough; + + case 4: + _debug("extract CB array"); + ret = afs_extract_data(call, true); + if (ret < 0) + return ret; + + _debug("unmarshall CB array"); + switch (call->count) { + case 0: + scb = &op->file[0].scb; + break; + case 1: + scb = &op->file[1].scb; + break; + default: + scb = &op->more_files[call->count - 2].scb; + break; + } + + bp = call->buffer; + xdr_decode_YFSCallBack(&bp, call, scb); + call->count++; + if (call->count < op->nr_files) + goto more_cbs; + + afs_extract_to_buf(call, sizeof(struct yfs_xdr_YFSVolSync)); + call->unmarshall++; + fallthrough; + + case 5: + ret = afs_extract_data(call, false); + if (ret < 0) + return ret; + + bp = call->buffer; + xdr_decode_YFSVolSync(&bp, &op->volsync); + + call->unmarshall++; + fallthrough; + + case 6: + break; + } + + _leave(" = 0 [done]"); + return 0; +} + +/* + * FS.InlineBulkStatus operation type + */ +static const struct afs_call_type yfs_RXYFSInlineBulkStatus = { + .name = "YFS.InlineBulkStatus", + .op = yfs_FS_InlineBulkStatus, + .deliver = yfs_deliver_fs_inline_bulk_status, + .destructor = afs_flat_call_destructor, +}; + +/* + * Fetch the status information for up to 1024 files + */ +void yfs_fs_inline_bulk_status(struct afs_operation *op) +{ + struct afs_vnode_param *dvp = &op->file[0]; + struct afs_vnode_param *vp = &op->file[1]; + struct afs_call *call; + __be32 *bp; + int i; + + _enter(",%x,{%llx:%llu},%u", + key_serial(op->key), vp->fid.vid, vp->fid.vnode, op->nr_files); + + call = afs_alloc_flat_call(op->net, &yfs_RXYFSInlineBulkStatus, + sizeof(__be32) + + sizeof(__be32) + + sizeof(__be32) + + sizeof(struct yfs_xdr_YFSFid) * op->nr_files, + sizeof(struct yfs_xdr_YFSFetchStatus)); + if (!call) + return afs_op_nomem(op); + + /* marshall the parameters */ + bp = call->request; + bp = xdr_encode_u32(bp, YFSINLINEBULKSTATUS); + bp = xdr_encode_u32(bp, 0); /* RPCFlags */ + bp = xdr_encode_u32(bp, op->nr_files); + bp = xdr_encode_YFSFid(bp, &dvp->fid); + bp = xdr_encode_YFSFid(bp, &vp->fid); + for (i = 0; i < op->nr_files - 2; i++) + bp = xdr_encode_YFSFid(bp, &op->more_files[i].fid); + yfs_check_req(call, bp); + + call->fid = vp->fid; + trace_afs_make_fs_call(call, &vp->fid); + afs_make_op_call(op, call, GFP_NOFS); +} + +/* + * Deliver reply data to an YFS.FetchOpaqueACL. + */ +static int yfs_deliver_fs_fetch_opaque_acl(struct afs_call *call) +{ + struct afs_operation *op = call->op; + struct afs_vnode_param *vp = &op->file[0]; + struct yfs_acl *yacl = op->yacl; + struct afs_acl *acl; + const __be32 *bp; + unsigned int size; + int ret; + + _enter("{%u}", call->unmarshall); + + switch (call->unmarshall) { + case 0: + afs_extract_to_tmp(call); + call->unmarshall++; + fallthrough; + + /* Extract the file ACL length */ + case 1: + ret = afs_extract_data(call, true); + if (ret < 0) + return ret; + + size = call->count2 = ntohl(call->tmp); + size = round_up(size, 4); + + if (yacl->flags & YFS_ACL_WANT_ACL) { + acl = kmalloc(struct_size(acl, data, size), GFP_KERNEL); + if (!acl) + return -ENOMEM; + yacl->acl = acl; + acl->size = call->count2; + afs_extract_begin(call, acl->data, size); + } else { + afs_extract_discard(call, size); + } + call->unmarshall++; + fallthrough; + + /* Extract the file ACL */ + case 2: + ret = afs_extract_data(call, true); + if (ret < 0) + return ret; + + afs_extract_to_tmp(call); + call->unmarshall++; + fallthrough; + + /* Extract the volume ACL length */ + case 3: + ret = afs_extract_data(call, true); + if (ret < 0) + return ret; + + size = call->count2 = ntohl(call->tmp); + size = round_up(size, 4); + + if (yacl->flags & YFS_ACL_WANT_VOL_ACL) { + acl = kmalloc(struct_size(acl, data, size), GFP_KERNEL); + if (!acl) + return -ENOMEM; + yacl->vol_acl = acl; + acl->size = call->count2; + afs_extract_begin(call, acl->data, size); + } else { + afs_extract_discard(call, size); + } + call->unmarshall++; + fallthrough; + + /* Extract the volume ACL */ + case 4: + ret = afs_extract_data(call, true); + if (ret < 0) + return ret; + + afs_extract_to_buf(call, + sizeof(__be32) * 2 + + sizeof(struct yfs_xdr_YFSFetchStatus) + + sizeof(struct yfs_xdr_YFSVolSync)); + call->unmarshall++; + fallthrough; + + /* extract the metadata */ + case 5: + ret = afs_extract_data(call, false); + if (ret < 0) + return ret; + + bp = call->buffer; + yacl->inherit_flag = ntohl(*bp++); + yacl->num_cleaned = ntohl(*bp++); + xdr_decode_YFSFetchStatus(&bp, call, &vp->scb); + xdr_decode_YFSVolSync(&bp, &op->volsync); + + call->unmarshall++; + fallthrough; + + case 6: + break; + } + + _leave(" = 0 [done]"); + return 0; +} + +void yfs_free_opaque_acl(struct yfs_acl *yacl) +{ + if (yacl) { + kfree(yacl->acl); + kfree(yacl->vol_acl); + kfree(yacl); + } +} + +/* + * YFS.FetchOpaqueACL operation type + */ +static const struct afs_call_type yfs_RXYFSFetchOpaqueACL = { + .name = "YFS.FetchOpaqueACL", + .op = yfs_FS_FetchOpaqueACL, + .deliver = yfs_deliver_fs_fetch_opaque_acl, + .destructor = afs_flat_call_destructor, +}; + +/* + * Fetch the YFS advanced ACLs for a file. + */ +void yfs_fs_fetch_opaque_acl(struct afs_operation *op) +{ + struct afs_vnode_param *vp = &op->file[0]; + struct afs_call *call; + __be32 *bp; + + _enter(",%x,{%llx:%llu},,", + key_serial(op->key), vp->fid.vid, vp->fid.vnode); + + call = afs_alloc_flat_call(op->net, &yfs_RXYFSFetchOpaqueACL, + sizeof(__be32) * 2 + + sizeof(struct yfs_xdr_YFSFid), + sizeof(__be32) * 2 + + sizeof(struct yfs_xdr_YFSFetchStatus) + + sizeof(struct yfs_xdr_YFSVolSync)); + if (!call) + return afs_op_nomem(op); + + /* marshall the parameters */ + bp = call->request; + bp = xdr_encode_u32(bp, YFSFETCHOPAQUEACL); + bp = xdr_encode_u32(bp, 0); /* RPC flags */ + bp = xdr_encode_YFSFid(bp, &vp->fid); + yfs_check_req(call, bp); + + call->fid = vp->fid; + trace_afs_make_fs_call(call, &vp->fid); + afs_make_op_call(op, call, GFP_KERNEL); +} + +/* + * YFS.StoreOpaqueACL2 operation type + */ +static const struct afs_call_type yfs_RXYFSStoreOpaqueACL2 = { + .name = "YFS.StoreOpaqueACL2", + .op = yfs_FS_StoreOpaqueACL2, + .deliver = yfs_deliver_status_and_volsync, + .destructor = afs_flat_call_destructor, +}; + +/* + * Fetch the YFS ACL for a file. + */ +void yfs_fs_store_opaque_acl2(struct afs_operation *op) +{ + struct afs_vnode_param *vp = &op->file[0]; + struct afs_call *call; + struct afs_acl *acl = op->acl; + size_t size; + __be32 *bp; + + _enter(",%x,{%llx:%llu},,", + key_serial(op->key), vp->fid.vid, vp->fid.vnode); + + size = round_up(acl->size, 4); + call = afs_alloc_flat_call(op->net, &yfs_RXYFSStoreOpaqueACL2, + sizeof(__be32) * 2 + + sizeof(struct yfs_xdr_YFSFid) + + sizeof(__be32) + size, + sizeof(struct yfs_xdr_YFSFetchStatus) + + sizeof(struct yfs_xdr_YFSVolSync)); + if (!call) + return afs_op_nomem(op); + + /* marshall the parameters */ + bp = call->request; + bp = xdr_encode_u32(bp, YFSSTOREOPAQUEACL2); + bp = xdr_encode_u32(bp, 0); /* RPC flags */ + bp = xdr_encode_YFSFid(bp, &vp->fid); + bp = xdr_encode_u32(bp, acl->size); + memcpy(bp, acl->data, acl->size); + if (acl->size != size) + memset((void *)bp + acl->size, 0, size - acl->size); + bp += size / sizeof(__be32); + yfs_check_req(call, bp); + + call->fid = vp->fid; + trace_afs_make_fs_call(call, &vp->fid); + afs_make_op_call(op, call, GFP_KERNEL); +} |
