diff options
Diffstat (limited to 'fs/afs')
44 files changed, 6748 insertions, 4385 deletions
diff --git a/fs/afs/Kconfig b/fs/afs/Kconfig index fc8ba9142f2f..682bd8ec2c10 100644 --- a/fs/afs/Kconfig +++ b/fs/afs/Kconfig @@ -5,6 +5,7 @@ config AFS_FS select AF_RXRPC select DNS_RESOLVER select NETFS_SUPPORT + select CRYPTO_KRB5 help If you say Y here, you will get an experimental Andrew File System driver. It currently only supports unsecured read-only AFS access. diff --git a/fs/afs/Makefile b/fs/afs/Makefile index e8956b65d7ff..b49b8fe682f3 100644 --- a/fs/afs/Makefile +++ b/fs/afs/Makefile @@ -5,11 +5,14 @@ kafs-y := \ addr_list.o \ + addr_prefs.o \ callback.o \ cell.o \ + cm_security.o \ cmservice.o \ dir.o \ dir_edit.o \ + dir_search.o \ dir_silly.o \ dynroot.o \ file.o \ @@ -27,6 +30,7 @@ kafs-y := \ server.o \ server_list.o \ super.o \ + validation.o \ vlclient.o \ vl_alias.o \ vl_list.o \ diff --git a/fs/afs/addr_list.c b/fs/afs/addr_list.c index de1ae0bead3b..e941da5b6dd9 100644 --- a/fs/afs/addr_list.c +++ b/fs/afs/addr_list.c @@ -13,26 +13,55 @@ #include "internal.h" #include "afs_fs.h" +static void afs_free_addrlist(struct rcu_head *rcu) +{ + struct afs_addr_list *alist = container_of(rcu, struct afs_addr_list, rcu); + unsigned int i; + + for (i = 0; i < alist->nr_addrs; i++) + rxrpc_kernel_put_peer(alist->addrs[i].peer); + trace_afs_alist(alist->debug_id, refcount_read(&alist->usage), afs_alist_trace_free); + kfree(alist); +} + /* * Release an address list. */ -void afs_put_addrlist(struct afs_addr_list *alist) +void afs_put_addrlist(struct afs_addr_list *alist, enum afs_alist_trace reason) { - if (alist && refcount_dec_and_test(&alist->usage)) - kfree_rcu(alist, rcu); + unsigned int debug_id; + bool dead; + int r; + + if (!alist) + return; + debug_id = alist->debug_id; + dead = __refcount_dec_and_test(&alist->usage, &r); + trace_afs_alist(debug_id, r - 1, reason); + if (dead) + call_rcu(&alist->rcu, afs_free_addrlist); +} + +struct afs_addr_list *afs_get_addrlist(struct afs_addr_list *alist, enum afs_alist_trace reason) +{ + int r; + + if (alist) { + __refcount_inc(&alist->usage, &r); + trace_afs_alist(alist->debug_id, r + 1, reason); + } + return alist; } /* * Allocate an address list. */ -struct afs_addr_list *afs_alloc_addrlist(unsigned int nr, - unsigned short service, - unsigned short port) +struct afs_addr_list *afs_alloc_addrlist(unsigned int nr) { struct afs_addr_list *alist; - unsigned int i; + static atomic_t debug_id; - _enter("%u,%u,%u", nr, service, port); + _enter("%u", nr); if (nr > AFS_MAX_ADDRESSES) nr = AFS_MAX_ADDRESSES; @@ -43,17 +72,8 @@ struct afs_addr_list *afs_alloc_addrlist(unsigned int nr, refcount_set(&alist->usage, 1); alist->max_addrs = nr; - - for (i = 0; i < nr; i++) { - struct sockaddr_rxrpc *srx = &alist->addrs[i]; - srx->srx_family = AF_RXRPC; - srx->srx_service = service; - srx->transport_type = SOCK_DGRAM; - srx->transport_len = sizeof(srx->transport.sin6); - srx->transport.sin6.sin6_family = AF_INET6; - srx->transport.sin6.sin6_port = htons(port); - } - + alist->debug_id = atomic_inc_return(&debug_id); + trace_afs_alist(alist->debug_id, 1, afs_alist_trace_alloc); return alist; } @@ -126,7 +146,7 @@ struct afs_vlserver_list *afs_parse_text_addrs(struct afs_net *net, if (!vllist->servers[0].server) goto error_vl; - alist = afs_alloc_addrlist(nr, service, AFS_VL_PORT); + alist = afs_alloc_addrlist(nr); if (!alist) goto error; @@ -197,9 +217,11 @@ struct afs_vlserver_list *afs_parse_text_addrs(struct afs_net *net, } if (family == AF_INET) - afs_merge_fs_addr4(alist, x[0], xport); + ret = afs_merge_fs_addr4(net, alist, x[0], xport); else - afs_merge_fs_addr6(alist, x, xport); + ret = afs_merge_fs_addr6(net, alist, x, xport); + if (ret < 0) + goto error; } while (p < end); @@ -216,26 +238,13 @@ bad_address: problem, p - text, (int)len, (int)len, text); ret = -EINVAL; error: - afs_put_addrlist(alist); + afs_put_addrlist(alist, afs_alist_trace_put_parse_error); error_vl: afs_put_vlserverlist(net, vllist); return ERR_PTR(ret); } /* - * Compare old and new address lists to see if there's been any change. - * - How to do this in better than O(Nlog(N)) time? - * - We don't really want to sort the address list, but would rather take the - * list as we got it so as not to undo record rotation by the DNS server. - */ -#if 0 -static int afs_cmp_addr_list(const struct afs_addr_list *a1, - const struct afs_addr_list *a2) -{ -} -#endif - -/* * Perform a DNS query for VL servers and build a up an address list. */ struct afs_vlserver_list *afs_dns_query(struct afs_cell *cell, time64_t *_expiry) @@ -271,25 +280,33 @@ struct afs_vlserver_list *afs_dns_query(struct afs_cell *cell, time64_t *_expiry /* * Merge an IPv4 entry into a fileserver address list. */ -void afs_merge_fs_addr4(struct afs_addr_list *alist, __be32 xdr, u16 port) +int afs_merge_fs_addr4(struct afs_net *net, struct afs_addr_list *alist, + __be32 xdr, u16 port) { - struct sockaddr_rxrpc *srx; - u32 addr = ntohl(xdr); + struct sockaddr_rxrpc srx; + struct rxrpc_peer *peer; int i; if (alist->nr_addrs >= alist->max_addrs) - return; + return 0; - for (i = 0; i < alist->nr_ipv4; i++) { - struct sockaddr_in *a = &alist->addrs[i].transport.sin; - u32 a_addr = ntohl(a->sin_addr.s_addr); - u16 a_port = ntohs(a->sin_port); + srx.srx_family = AF_RXRPC; + srx.transport_type = SOCK_DGRAM; + srx.transport_len = sizeof(srx.transport.sin); + srx.transport.sin.sin_family = AF_INET; + srx.transport.sin.sin_port = htons(port); + srx.transport.sin.sin_addr.s_addr = xdr; - if (addr == a_addr && port == a_port) - return; - if (addr == a_addr && port < a_port) - break; - if (addr < a_addr) + peer = rxrpc_kernel_lookup_peer(net->socket, &srx, GFP_KERNEL); + if (!peer) + return -ENOMEM; + + for (i = 0; i < alist->nr_ipv4; i++) { + if (peer == alist->addrs[i].peer) { + rxrpc_kernel_put_peer(peer); + return 0; + } + if (peer <= alist->addrs[i].peer) break; } @@ -298,38 +315,42 @@ void afs_merge_fs_addr4(struct afs_addr_list *alist, __be32 xdr, u16 port) alist->addrs + i, sizeof(alist->addrs[0]) * (alist->nr_addrs - i)); - srx = &alist->addrs[i]; - srx->srx_family = AF_RXRPC; - srx->transport_type = SOCK_DGRAM; - srx->transport_len = sizeof(srx->transport.sin); - srx->transport.sin.sin_family = AF_INET; - srx->transport.sin.sin_port = htons(port); - srx->transport.sin.sin_addr.s_addr = xdr; + alist->addrs[i].peer = peer; alist->nr_ipv4++; alist->nr_addrs++; + return 0; } /* * Merge an IPv6 entry into a fileserver address list. */ -void afs_merge_fs_addr6(struct afs_addr_list *alist, __be32 *xdr, u16 port) +int afs_merge_fs_addr6(struct afs_net *net, struct afs_addr_list *alist, + __be32 *xdr, u16 port) { - struct sockaddr_rxrpc *srx; - int i, diff; + struct sockaddr_rxrpc srx; + struct rxrpc_peer *peer; + int i; if (alist->nr_addrs >= alist->max_addrs) - return; + return 0; - for (i = alist->nr_ipv4; i < alist->nr_addrs; i++) { - struct sockaddr_in6 *a = &alist->addrs[i].transport.sin6; - u16 a_port = ntohs(a->sin6_port); + srx.srx_family = AF_RXRPC; + srx.transport_type = SOCK_DGRAM; + srx.transport_len = sizeof(srx.transport.sin6); + srx.transport.sin6.sin6_family = AF_INET6; + srx.transport.sin6.sin6_port = htons(port); + memcpy(&srx.transport.sin6.sin6_addr, xdr, 16); - diff = memcmp(xdr, &a->sin6_addr, 16); - if (diff == 0 && port == a_port) - return; - if (diff == 0 && port < a_port) - break; - if (diff < 0) + peer = rxrpc_kernel_lookup_peer(net->socket, &srx, GFP_KERNEL); + if (!peer) + return -ENOMEM; + + for (i = alist->nr_ipv4; i < alist->nr_addrs; i++) { + if (peer == alist->addrs[i].peer) { + rxrpc_kernel_put_peer(peer); + return 0; + } + if (peer <= alist->addrs[i].peer) break; } @@ -337,68 +358,57 @@ void afs_merge_fs_addr6(struct afs_addr_list *alist, __be32 *xdr, u16 port) memmove(alist->addrs + i + 1, alist->addrs + i, sizeof(alist->addrs[0]) * (alist->nr_addrs - i)); - - srx = &alist->addrs[i]; - srx->srx_family = AF_RXRPC; - srx->transport_type = SOCK_DGRAM; - srx->transport_len = sizeof(srx->transport.sin6); - srx->transport.sin6.sin6_family = AF_INET6; - srx->transport.sin6.sin6_port = htons(port); - memcpy(&srx->transport.sin6.sin6_addr, xdr, 16); + alist->addrs[i].peer = peer; alist->nr_addrs++; + return 0; } /* - * Get an address to try. + * Set the app data on the rxrpc peers an address list points to */ -bool afs_iterate_addresses(struct afs_addr_cursor *ac) +void afs_set_peer_appdata(struct afs_server *server, + struct afs_addr_list *old_alist, + struct afs_addr_list *new_alist) { - unsigned long set, failed; - int index; - - if (!ac->alist) - return false; - - set = ac->alist->responded; - failed = ac->alist->failed; - _enter("%lx-%lx-%lx,%d", set, failed, ac->tried, ac->index); - - ac->nr_iterations++; - - set &= ~(failed | ac->tried); - - if (!set) - return false; - - index = READ_ONCE(ac->alist->preferred); - if (test_bit(index, &set)) - goto selected; + unsigned long data = (unsigned long)server; + int n = 0, o = 0; - index = __ffs(set); - -selected: - ac->index = index; - set_bit(index, &ac->tried); - ac->responded = false; - return true; -} + if (!old_alist) { + /* New server. Just set all. */ + for (; n < new_alist->nr_addrs; n++) + rxrpc_kernel_set_peer_data(new_alist->addrs[n].peer, data); + return; + } + if (!new_alist) { + /* Dead server. Just remove all. */ + for (; o < old_alist->nr_addrs; o++) + rxrpc_kernel_set_peer_data(old_alist->addrs[o].peer, 0); + return; + } -/* - * Release an address list cursor. - */ -int afs_end_cursor(struct afs_addr_cursor *ac) -{ - struct afs_addr_list *alist; + /* Walk through the two lists simultaneously, setting new peers and + * clearing old ones. The two lists are ordered by pointer to peer + * record. + */ + while (n < new_alist->nr_addrs && o < old_alist->nr_addrs) { + struct rxrpc_peer *pn = new_alist->addrs[n].peer; + struct rxrpc_peer *po = old_alist->addrs[o].peer; - alist = ac->alist; - if (alist) { - if (ac->responded && - ac->index != alist->preferred && - test_bit(ac->alist->preferred, &ac->tried)) - WRITE_ONCE(alist->preferred, ac->index); - afs_put_addrlist(alist); - ac->alist = NULL; + if (pn == po) + continue; + if (pn < po) { + rxrpc_kernel_set_peer_data(pn, data); + n++; + } else { + rxrpc_kernel_set_peer_data(po, 0); + o++; + } } - return ac->error; + if (n < new_alist->nr_addrs) + for (; n < new_alist->nr_addrs; n++) + rxrpc_kernel_set_peer_data(new_alist->addrs[n].peer, data); + if (o < old_alist->nr_addrs) + for (; o < old_alist->nr_addrs; o++) + rxrpc_kernel_set_peer_data(old_alist->addrs[o].peer, 0); } diff --git a/fs/afs/addr_prefs.c b/fs/afs/addr_prefs.c new file mode 100644 index 000000000000..133736412c3d --- /dev/null +++ b/fs/afs/addr_prefs.c @@ -0,0 +1,533 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* Address preferences management + * + * Copyright (C) 2023 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": addr_prefs: " fmt +#include <linux/slab.h> +#include <linux/ctype.h> +#include <linux/inet.h> +#include <linux/seq_file.h> +#include <keys/rxrpc-type.h> +#include "internal.h" + +static inline struct afs_net *afs_seq2net_single(struct seq_file *m) +{ + return afs_net(seq_file_single_net(m)); +} + +/* + * Split a NUL-terminated string up to the first newline around spaces. The + * source string will be modified to have NUL-terminations inserted. + */ +static int afs_split_string(char **pbuf, char *strv[], unsigned int maxstrv) +{ + unsigned int count = 0; + char *p = *pbuf; + + maxstrv--; /* Allow for terminal NULL */ + for (;;) { + /* Skip over spaces */ + while (isspace(*p)) { + if (*p == '\n') { + p++; + break; + } + p++; + } + if (!*p) + break; + + /* Mark start of word */ + if (count >= maxstrv) { + pr_warn("Too many elements in string\n"); + return -EINVAL; + } + strv[count++] = p; + + /* Skip over word */ + while (!isspace(*p) && *p) + p++; + if (!*p) + break; + + /* Mark end of word */ + if (*p == '\n') { + *p++ = 0; + break; + } + *p++ = 0; + } + + *pbuf = p; + strv[count] = NULL; + return count; +} + +/* + * Parse an address with an optional subnet mask. + */ +static int afs_parse_address(char *p, struct afs_addr_preference *pref) +{ + const char *stop; + unsigned long mask, tmp; + char *end = p + strlen(p); + bool bracket = false; + + if (*p == '[') { + p++; + bracket = true; + } + +#if 0 + if (*p == '[') { + p++; + q = memchr(p, ']', end - p); + if (!q) { + pr_warn("Can't find closing ']'\n"); + return -EINVAL; + } + } else { + for (q = p; q < end; q++) + if (*q == '/') + break; + } +#endif + + if (in4_pton(p, end - p, (u8 *)&pref->ipv4_addr, -1, &stop)) { + pref->family = AF_INET; + mask = 32; + } else if (in6_pton(p, end - p, (u8 *)&pref->ipv6_addr, -1, &stop)) { + pref->family = AF_INET6; + mask = 128; + } else { + pr_warn("Can't determine address family\n"); + return -EINVAL; + } + + p = (char *)stop; + if (bracket) { + if (*p != ']') { + pr_warn("Can't find closing ']'\n"); + return -EINVAL; + } + p++; + } + + if (*p == '/') { + p++; + tmp = simple_strtoul(p, &p, 10); + if (tmp > mask) { + pr_warn("Subnet mask too large\n"); + return -EINVAL; + } + if (tmp == 0) { + pr_warn("Subnet mask too small\n"); + return -EINVAL; + } + mask = tmp; + } + + if (*p) { + pr_warn("Invalid address\n"); + return -EINVAL; + } + + pref->subnet_mask = mask; + return 0; +} + +enum cmp_ret { + CONTINUE_SEARCH, + INSERT_HERE, + EXACT_MATCH, + SUBNET_MATCH, +}; + +/* + * See if a candidate address matches a listed address. + */ +static enum cmp_ret afs_cmp_address_pref(const struct afs_addr_preference *a, + const struct afs_addr_preference *b) +{ + int subnet = min(a->subnet_mask, b->subnet_mask); + const __be32 *pa, *pb; + u32 mask, na, nb; + int diff; + + if (a->family != b->family) + return INSERT_HERE; + + switch (a->family) { + case AF_INET6: + pa = a->ipv6_addr.s6_addr32; + pb = b->ipv6_addr.s6_addr32; + break; + case AF_INET: + pa = &a->ipv4_addr.s_addr; + pb = &b->ipv4_addr.s_addr; + break; + } + + while (subnet > 32) { + diff = ntohl(*pa++) - ntohl(*pb++); + if (diff < 0) + return INSERT_HERE; /* a<b */ + if (diff > 0) + return CONTINUE_SEARCH; /* a>b */ + subnet -= 32; + } + + if (subnet == 0) + return EXACT_MATCH; + + mask = 0xffffffffU << (32 - subnet); + na = ntohl(*pa); + nb = ntohl(*pb); + diff = (na & mask) - (nb & mask); + //kdebug("diff %08x %08x %08x %d", na, nb, mask, diff); + if (diff < 0) + return INSERT_HERE; /* a<b */ + if (diff > 0) + return CONTINUE_SEARCH; /* a>b */ + if (a->subnet_mask == b->subnet_mask) + return EXACT_MATCH; + if (a->subnet_mask > b->subnet_mask) + return SUBNET_MATCH; /* a binds tighter than b */ + return CONTINUE_SEARCH; /* b binds tighter than a */ +} + +/* + * Insert an address preference. + */ +static int afs_insert_address_pref(struct afs_addr_preference_list **_preflist, + struct afs_addr_preference *pref, + int index) +{ + struct afs_addr_preference_list *preflist = *_preflist, *old = preflist; + size_t size, max_prefs; + + _enter("{%u/%u/%u},%u", preflist->ipv6_off, preflist->nr, preflist->max_prefs, index); + + if (preflist->nr == 255) + return -ENOSPC; + if (preflist->nr >= preflist->max_prefs) { + max_prefs = preflist->max_prefs + 1; + size = struct_size(preflist, prefs, max_prefs); + size = roundup_pow_of_two(size); + max_prefs = min_t(size_t, (size - sizeof(*preflist)) / sizeof(*pref), 255); + preflist = kmalloc(size, GFP_KERNEL); + if (!preflist) + return -ENOMEM; + *preflist = **_preflist; + preflist->max_prefs = max_prefs; + *_preflist = preflist; + + if (index < preflist->nr) + memcpy(preflist->prefs + index + 1, old->prefs + index, + sizeof(*pref) * (preflist->nr - index)); + if (index > 0) + memcpy(preflist->prefs, old->prefs, sizeof(*pref) * index); + } else { + if (index < preflist->nr) + memmove(preflist->prefs + index + 1, preflist->prefs + index, + sizeof(*pref) * (preflist->nr - index)); + } + + preflist->prefs[index] = *pref; + preflist->nr++; + if (pref->family == AF_INET) + preflist->ipv6_off++; + return 0; +} + +/* + * Add an address preference. + * echo "add <proto> <IP>[/<mask>] <prior>" >/proc/fs/afs/addr_prefs + */ +static int afs_add_address_pref(struct afs_net *net, struct afs_addr_preference_list **_preflist, + int argc, char **argv) +{ + struct afs_addr_preference_list *preflist = *_preflist; + struct afs_addr_preference pref; + enum cmp_ret cmp; + int ret, i, stop; + + if (argc != 3) { + pr_warn("Wrong number of params\n"); + return -EINVAL; + } + + if (strcmp(argv[0], "udp") != 0) { + pr_warn("Unsupported protocol\n"); + return -EINVAL; + } + + ret = afs_parse_address(argv[1], &pref); + if (ret < 0) + return ret; + + ret = kstrtou16(argv[2], 10, &pref.prio); + if (ret < 0) { + pr_warn("Invalid priority\n"); + return ret; + } + + if (pref.family == AF_INET) { + i = 0; + stop = preflist->ipv6_off; + } else { + i = preflist->ipv6_off; + stop = preflist->nr; + } + + for (; i < stop; i++) { + cmp = afs_cmp_address_pref(&pref, &preflist->prefs[i]); + switch (cmp) { + case CONTINUE_SEARCH: + continue; + case INSERT_HERE: + case SUBNET_MATCH: + return afs_insert_address_pref(_preflist, &pref, i); + case EXACT_MATCH: + preflist->prefs[i].prio = pref.prio; + return 0; + } + } + + return afs_insert_address_pref(_preflist, &pref, i); +} + +/* + * Delete an address preference. + */ +static int afs_delete_address_pref(struct afs_addr_preference_list **_preflist, + int index) +{ + struct afs_addr_preference_list *preflist = *_preflist; + + _enter("{%u/%u/%u},%u", preflist->ipv6_off, preflist->nr, preflist->max_prefs, index); + + if (preflist->nr == 0) + return -ENOENT; + + if (index < preflist->nr - 1) + memmove(preflist->prefs + index, preflist->prefs + index + 1, + sizeof(preflist->prefs[0]) * (preflist->nr - index - 1)); + + if (index < preflist->ipv6_off) + preflist->ipv6_off--; + preflist->nr--; + return 0; +} + +/* + * Delete an address preference. + * echo "del <proto> <IP>[/<mask>]" >/proc/fs/afs/addr_prefs + */ +static int afs_del_address_pref(struct afs_net *net, struct afs_addr_preference_list **_preflist, + int argc, char **argv) +{ + struct afs_addr_preference_list *preflist = *_preflist; + struct afs_addr_preference pref; + enum cmp_ret cmp; + int ret, i, stop; + + if (argc != 2) { + pr_warn("Wrong number of params\n"); + return -EINVAL; + } + + if (strcmp(argv[0], "udp") != 0) { + pr_warn("Unsupported protocol\n"); + return -EINVAL; + } + + ret = afs_parse_address(argv[1], &pref); + if (ret < 0) + return ret; + + if (pref.family == AF_INET) { + i = 0; + stop = preflist->ipv6_off; + } else { + i = preflist->ipv6_off; + stop = preflist->nr; + } + + for (; i < stop; i++) { + cmp = afs_cmp_address_pref(&pref, &preflist->prefs[i]); + switch (cmp) { + case CONTINUE_SEARCH: + continue; + case INSERT_HERE: + case SUBNET_MATCH: + return 0; + case EXACT_MATCH: + return afs_delete_address_pref(_preflist, i); + } + } + + return -ENOANO; +} + +/* + * Handle writes to /proc/fs/afs/addr_prefs + */ +int afs_proc_addr_prefs_write(struct file *file, char *buf, size_t size) +{ + struct afs_addr_preference_list *preflist, *old; + struct seq_file *m = file->private_data; + struct afs_net *net = afs_seq2net_single(m); + size_t psize; + char *argv[5]; + int ret, argc, max_prefs; + + inode_lock(file_inode(file)); + + /* Allocate a candidate new list and initialise it from the old. */ + old = rcu_dereference_protected(net->address_prefs, + lockdep_is_held(&file_inode(file)->i_rwsem)); + + if (old) + max_prefs = old->nr + 1; + else + max_prefs = 1; + + psize = struct_size(old, prefs, max_prefs); + psize = roundup_pow_of_two(psize); + max_prefs = min_t(size_t, (psize - sizeof(*old)) / sizeof(old->prefs[0]), 255); + + ret = -ENOMEM; + preflist = kmalloc(struct_size(preflist, prefs, max_prefs), GFP_KERNEL); + if (!preflist) + goto done; + + if (old) + memcpy(preflist, old, struct_size(preflist, prefs, old->nr)); + else + memset(preflist, 0, sizeof(*preflist)); + preflist->max_prefs = max_prefs; + + do { + argc = afs_split_string(&buf, argv, ARRAY_SIZE(argv)); + if (argc < 0) { + ret = argc; + goto done; + } + if (argc < 2) + goto inval; + + if (strcmp(argv[0], "add") == 0) + ret = afs_add_address_pref(net, &preflist, argc - 1, argv + 1); + else if (strcmp(argv[0], "del") == 0) + ret = afs_del_address_pref(net, &preflist, argc - 1, argv + 1); + else + goto inval; + if (ret < 0) + goto done; + } while (*buf); + + preflist->version++; + rcu_assign_pointer(net->address_prefs, preflist); + /* Store prefs before version */ + smp_store_release(&net->address_pref_version, preflist->version); + kfree_rcu(old, rcu); + preflist = NULL; + ret = 0; + +done: + kfree(preflist); + inode_unlock(file_inode(file)); + _leave(" = %d", ret); + return ret; + +inval: + pr_warn("Invalid Command\n"); + ret = -EINVAL; + goto done; +} + +/* + * Mark the priorities on an address list if the address preferences table has + * changed. The caller must hold the RCU read lock. + */ +void afs_get_address_preferences_rcu(struct afs_net *net, struct afs_addr_list *alist) +{ + const struct afs_addr_preference_list *preflist = + rcu_dereference(net->address_prefs); + const struct sockaddr_in6 *sin6; + const struct sockaddr_in *sin; + const struct sockaddr *sa; + struct afs_addr_preference test; + enum cmp_ret cmp; + int i, j; + + if (!preflist || !preflist->nr || !alist->nr_addrs || + smp_load_acquire(&alist->addr_pref_version) == preflist->version) + return; + + test.family = AF_INET; + test.subnet_mask = 32; + test.prio = 0; + for (i = 0; i < alist->nr_ipv4; i++) { + sa = rxrpc_kernel_remote_addr(alist->addrs[i].peer); + sin = (const struct sockaddr_in *)sa; + test.ipv4_addr = sin->sin_addr; + for (j = 0; j < preflist->ipv6_off; j++) { + cmp = afs_cmp_address_pref(&test, &preflist->prefs[j]); + switch (cmp) { + case CONTINUE_SEARCH: + continue; + case INSERT_HERE: + break; + case EXACT_MATCH: + case SUBNET_MATCH: + WRITE_ONCE(alist->addrs[i].prio, preflist->prefs[j].prio); + break; + } + } + } + + test.family = AF_INET6; + test.subnet_mask = 128; + test.prio = 0; + for (; i < alist->nr_addrs; i++) { + sa = rxrpc_kernel_remote_addr(alist->addrs[i].peer); + sin6 = (const struct sockaddr_in6 *)sa; + test.ipv6_addr = sin6->sin6_addr; + for (j = preflist->ipv6_off; j < preflist->nr; j++) { + cmp = afs_cmp_address_pref(&test, &preflist->prefs[j]); + switch (cmp) { + case CONTINUE_SEARCH: + continue; + case INSERT_HERE: + break; + case EXACT_MATCH: + case SUBNET_MATCH: + WRITE_ONCE(alist->addrs[i].prio, preflist->prefs[j].prio); + break; + } + } + } + + smp_store_release(&alist->addr_pref_version, preflist->version); +} + +/* + * Mark the priorities on an address list if the address preferences table has + * changed. Avoid taking the RCU read lock if we can. + */ +void afs_get_address_preferences(struct afs_net *net, struct afs_addr_list *alist) +{ + if (!net->address_prefs || + /* Load version before prefs */ + smp_load_acquire(&net->address_pref_version) == alist->addr_pref_version) + return; + + rcu_read_lock(); + afs_get_address_preferences_rcu(net, alist); + rcu_read_unlock(); +} diff --git a/fs/afs/afs.h b/fs/afs/afs.h index 432cb4b23961..ec3db00bd081 100644 --- a/fs/afs/afs.h +++ b/fs/afs/afs.h @@ -10,7 +10,7 @@ #include <linux/in.h> -#define AFS_MAXCELLNAME 256 /* Maximum length of a cell name */ +#define AFS_MAXCELLNAME 253 /* Maximum length of a cell name (DNS limited) */ #define AFS_MAXVOLNAME 64 /* Maximum length of a volume name */ #define AFS_MAXNSERVERS 8 /* Maximum servers in a basic volume record */ #define AFS_NMAXNSERVERS 13 /* Maximum servers in a N/U-class volume record */ @@ -19,8 +19,8 @@ #define AFSPATHMAX 1024 /* Maximum length of a pathname plus NUL */ #define AFSOPAQUEMAX 1024 /* Maximum length of an opaque field */ -#define AFS_VL_MAX_LIFESPAN (120 * HZ) -#define AFS_PROBE_MAX_LIFESPAN (30 * HZ) +#define AFS_VL_MAX_LIFESPAN 120 +#define AFS_PROBE_MAX_LIFESPAN 30 typedef u64 afs_volid_t; typedef u64 afs_vnodeid_t; @@ -165,7 +165,8 @@ struct afs_status_cb { * AFS volume synchronisation information */ struct afs_volsync { - time64_t creation; /* volume creation time */ + time64_t creation; /* Volume creation time (or TIME64_MIN) */ + time64_t update; /* Volume update time (or TIME64_MIN) */ }; /* diff --git a/fs/afs/afs_vl.h b/fs/afs/afs_vl.h index 9c65ffb8a523..b835e25a2c02 100644 --- a/fs/afs/afs_vl.h +++ b/fs/afs/afs_vl.h @@ -13,6 +13,7 @@ #define AFS_VL_PORT 7003 /* volume location service port */ #define VL_SERVICE 52 /* RxRPC service ID for the Volume Location service */ #define YFS_VL_SERVICE 2503 /* Service ID for AuriStor upgraded VL service */ +#define YFS_VL_MAXCELLNAME 256 /* Maximum length of a cell name in YFS protocol */ enum AFSVL_Operations { VLGETENTRYBYID = 503, /* AFS Get VLDB entry by ID */ @@ -134,13 +135,4 @@ struct afs_uvldbentry__xdr { __be32 spares9; }; -struct afs_address_list { - refcount_t usage; - unsigned int version; - unsigned int nr_addrs; - struct sockaddr_rxrpc addrs[]; -}; - -extern void afs_put_address_list(struct afs_address_list *alist); - #endif /* AFS_VL_H */ diff --git a/fs/afs/callback.c b/fs/afs/callback.c index a484fa642808..894d2bad6b6c 100644 --- a/fs/afs/callback.c +++ b/fs/afs/callback.c @@ -33,22 +33,20 @@ void afs_invalidate_mmap_work(struct work_struct *work) unmap_mapping_pages(vnode->netfs.inode.i_mapping, 0, 0, false); } -void afs_server_init_callback_work(struct work_struct *work) +static void afs_volume_init_callback(struct afs_volume *volume) { - struct afs_server *server = container_of(work, struct afs_server, initcb_work); struct afs_vnode *vnode; - struct afs_cell *cell = server->cell; - down_read(&cell->fs_open_mmaps_lock); + down_read(&volume->open_mmaps_lock); - list_for_each_entry(vnode, &cell->fs_open_mmaps, cb_mmap_link) { - if (vnode->cb_server == server) { - clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags); - queue_work(system_unbound_wq, &vnode->cb_work); + list_for_each_entry(vnode, &volume->open_mmaps, cb_mmap_link) { + if (vnode->cb_v_check != atomic_read(&volume->cb_v_break)) { + afs_clear_cb_promise(vnode, afs_cb_promise_clear_vol_init_cb); + queue_work(system_dfl_wq, &vnode->cb_work); } } - up_read(&cell->fs_open_mmaps_lock); + up_read(&volume->open_mmaps_lock); } /* @@ -57,15 +55,20 @@ void afs_server_init_callback_work(struct work_struct *work) */ void afs_init_callback_state(struct afs_server *server) { - rcu_read_lock(); - do { - server->cb_s_break++; - atomic_inc(&server->cell->fs_s_break); - if (!list_empty(&server->cell->fs_open_mmaps)) - queue_work(system_unbound_wq, &server->initcb_work); + struct afs_server_entry *se; - } while ((server = rcu_dereference(server->uuid_next))); - rcu_read_unlock(); + down_read(&server->cell->vs_lock); + + list_for_each_entry(se, &server->volumes, slink) { + se->cb_expires_at = AFS_NO_CB_PROMISE; + se->volume->cb_expires_at = AFS_NO_CB_PROMISE; + trace_afs_cb_v_break(se->volume->vid, atomic_read(&se->volume->cb_v_break), + afs_cb_break_for_s_reinit); + if (!list_empty(&se->volume->open_mmaps)) + afs_volume_init_callback(se->volume); + } + + up_read(&server->cell->vs_lock); } /* @@ -76,9 +79,9 @@ void __afs_break_callback(struct afs_vnode *vnode, enum afs_cb_break_reason reas _enter(""); clear_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags); - if (test_and_clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags)) { + if (afs_clear_cb_promise(vnode, afs_cb_promise_clear_cb_break)) { vnode->cb_break++; - vnode->cb_v_break = vnode->volume->cb_v_break; + vnode->cb_v_check = atomic_read(&vnode->volume->cb_v_break); afs_clear_permits(vnode); if (vnode->lock_state == AFS_VNODE_LOCK_WAITING_FOR_CB) @@ -87,7 +90,7 @@ void __afs_break_callback(struct afs_vnode *vnode, enum afs_cb_break_reason reas if (reason != afs_cb_break_for_deleted && vnode->status.type == AFS_FTYPE_FILE && atomic_read(&vnode->cb_nr_mmap)) - queue_work(system_unbound_wq, &vnode->cb_work); + queue_work(system_dfl_wq, &vnode->cb_work); trace_afs_cb_break(&vnode->fid, vnode->cb_break, reason, true); } else { @@ -110,13 +113,14 @@ static struct afs_volume *afs_lookup_volume_rcu(struct afs_cell *cell, { struct afs_volume *volume = NULL; struct rb_node *p; - int seq = 0; + int seq = 1; - do { + for (;;) { /* Unfortunately, rbtree walking doesn't give reliable results * under just the RCU read lock, so we have to check for * changes. */ + seq++; /* 2 on the 1st/lockless path, otherwise odd */ read_seqbegin_or_lock(&cell->volume_lock, &seq); p = rcu_dereference_raw(cell->volumes.rb_node); @@ -132,35 +136,63 @@ static struct afs_volume *afs_lookup_volume_rcu(struct afs_cell *cell, volume = NULL; } - } while (need_seqretry(&cell->volume_lock, seq)); + if (volume && afs_try_get_volume(volume, afs_volume_trace_get_callback)) + break; + if (!need_seqretry(&cell->volume_lock, seq)) + break; + seq |= 1; /* Want a lock next time */ + } done_seqretry(&cell->volume_lock, seq); return volume; } /* + * Allow the fileserver to break callbacks at the volume-level. This is + * typically done when, for example, a R/W volume is snapshotted to a R/O + * volume (the only way to change an R/O volume). It may also, however, happen + * when a volserver takes control of a volume (offlining it, moving it, etc.). + * + * Every file in that volume will need to be reevaluated. + */ +static void afs_break_volume_callback(struct afs_server *server, + struct afs_volume *volume) + __releases(RCU) +{ + struct afs_server_list *slist = rcu_dereference(volume->servers); + unsigned int i, cb_v_break; + + write_lock(&volume->cb_v_break_lock); + + for (i = 0; i < slist->nr_servers; i++) + if (slist->servers[i].server == server) + slist->servers[i].cb_expires_at = AFS_NO_CB_PROMISE; + volume->cb_expires_at = AFS_NO_CB_PROMISE; + + cb_v_break = atomic_inc_return_release(&volume->cb_v_break); + trace_afs_cb_v_break(volume->vid, cb_v_break, afs_cb_break_for_volume_callback); + + write_unlock(&volume->cb_v_break_lock); + rcu_read_unlock(); + + if (!list_empty(&volume->open_mmaps)) + afs_volume_init_callback(volume); +} + +/* * allow the fileserver to explicitly break one callback * - happens when * - the backing file is changed * - a lock is released */ -static void afs_break_one_callback(struct afs_volume *volume, +static void afs_break_one_callback(struct afs_server *server, + struct afs_volume *volume, struct afs_fid *fid) { struct super_block *sb; struct afs_vnode *vnode; struct inode *inode; - if (fid->vnode == 0 && fid->unique == 0) { - /* The callback break applies to an entire volume. */ - write_lock(&volume->cb_v_break_lock); - volume->cb_v_break++; - trace_afs_cb_break(fid, volume->cb_v_break, - afs_cb_break_for_volume_callback, false); - write_unlock(&volume->cb_v_break_lock); - return; - } - /* See if we can find a matching inode - even an I_NEW inode needs to * be marked as it can have its callback broken before we finish * setting up the local inode. @@ -187,25 +219,35 @@ static void afs_break_some_callbacks(struct afs_server *server, afs_volid_t vid = cbb->fid.vid; size_t i; + rcu_read_lock(); volume = afs_lookup_volume_rcu(server->cell, vid); + if (cbb->fid.vnode == 0 && cbb->fid.unique == 0) { + afs_break_volume_callback(server, volume); + *_count -= 1; + if (*_count) + memmove(cbb, cbb + 1, sizeof(*cbb) * *_count); + } else { + /* TODO: Find all matching volumes if we couldn't match the server and + * break them anyway. + */ - /* TODO: Find all matching volumes if we couldn't match the server and - * break them anyway. - */ - - for (i = *_count; i > 0; cbb++, i--) { - if (cbb->fid.vid == vid) { - _debug("- Fid { vl=%08llx n=%llu u=%u }", - cbb->fid.vid, - cbb->fid.vnode, - cbb->fid.unique); - --*_count; - if (volume) - afs_break_one_callback(volume, &cbb->fid); - } else { - *residue++ = *cbb; + for (i = *_count; i > 0; cbb++, i--) { + if (cbb->fid.vid == vid) { + _debug("- Fid { vl=%08llx n=%llu u=%u }", + cbb->fid.vid, + cbb->fid.vnode, + cbb->fid.unique); + --*_count; + if (volume) + afs_break_one_callback(server, volume, &cbb->fid); + } else { + *residue++ = *cbb; + } } + rcu_read_unlock(); } + + afs_put_volume(volume, afs_volume_trace_put_callback); } /* @@ -218,11 +260,6 @@ void afs_break_callbacks(struct afs_server *server, size_t count, ASSERT(server != NULL); - rcu_read_lock(); - while (count > 0) afs_break_some_callbacks(server, callbacks, &count); - - rcu_read_unlock(); - return; } diff --git a/fs/afs/cell.c b/fs/afs/cell.c index 988c2ac7cece..71c10a05cebe 100644 --- a/fs/afs/cell.c +++ b/fs/afs/cell.c @@ -20,8 +20,9 @@ static unsigned __read_mostly afs_cell_min_ttl = 10 * 60; static unsigned __read_mostly afs_cell_max_ttl = 24 * 60 * 60; static atomic_t cell_debug_id; -static void afs_queue_cell_manager(struct afs_net *); -static void afs_manage_cell_work(struct work_struct *); +static void afs_cell_timer(struct timer_list *timer); +static void afs_destroy_cell_work(struct work_struct *work); +static void afs_manage_cell_work(struct work_struct *work); static void afs_dec_cells_outstanding(struct afs_net *net) { @@ -29,19 +30,11 @@ static void afs_dec_cells_outstanding(struct afs_net *net) wake_up_var(&net->cells_outstanding); } -/* - * Set the cell timer to fire after a given delay, assuming it's not already - * set for an earlier time. - */ -static void afs_set_cell_timer(struct afs_net *net, time64_t delay) +static void afs_set_cell_state(struct afs_cell *cell, enum afs_cell_state state) { - if (net->live) { - atomic_inc(&net->cells_outstanding); - if (timer_reduce(&net->cells_timer, jiffies + delay * HZ)) - afs_dec_cells_outstanding(net); - } else { - afs_queue_cell_manager(net); - } + smp_store_release(&cell->state, state); /* Commit cell changes before state */ + smp_wmb(); /* Set cell state before task state */ + wake_up_var(&cell->state); } /* @@ -64,7 +57,8 @@ static struct afs_cell *afs_find_cell_locked(struct afs_net *net, return ERR_PTR(-ENAMETOOLONG); if (!name) { - cell = net->ws_cell; + cell = rcu_dereference_protected(net->ws_cell, + lockdep_is_held(&net->cells_lock)); if (!cell) return ERR_PTR(-EDESTADDRREQ); goto found; @@ -115,7 +109,7 @@ static struct afs_cell *afs_alloc_cell(struct afs_net *net, const char *name, unsigned int namelen, const char *addresses) { - struct afs_vlserver_list *vllist; + struct afs_vlserver_list *vllist = NULL; struct afs_cell *cell; int i, ret; @@ -146,28 +140,37 @@ static struct afs_cell *afs_alloc_cell(struct afs_net *net, return ERR_PTR(-ENOMEM); } - cell->name = kmalloc(namelen + 1, GFP_KERNEL); + /* Allocate the cell name and the key name in one go. */ + cell->name = kmalloc(1 + namelen + 1 + + 4 + namelen + 1, GFP_KERNEL); if (!cell->name) { kfree(cell); return ERR_PTR(-ENOMEM); } - cell->net = net; + cell->name[0] = '.'; + cell->name++; cell->name_len = namelen; for (i = 0; i < namelen; i++) cell->name[i] = tolower(name[i]); - cell->name[i] = 0; + cell->name[i++] = 0; + + cell->key_desc = cell->name + i; + memcpy(cell->key_desc, "afs@", 4); + memcpy(cell->key_desc + 4, cell->name, cell->name_len + 1); + cell->net = net; refcount_set(&cell->ref, 1); atomic_set(&cell->active, 0); + INIT_WORK(&cell->destroyer, afs_destroy_cell_work); INIT_WORK(&cell->manager, afs_manage_cell_work); + timer_setup(&cell->management_timer, afs_cell_timer, 0); + init_rwsem(&cell->vs_lock); cell->volumes = RB_ROOT; INIT_HLIST_HEAD(&cell->proc_volumes); seqlock_init(&cell->volume_lock); cell->fs_servers = RB_ROOT; - seqlock_init(&cell->fs_lock); - INIT_LIST_HEAD(&cell->fs_open_mmaps); - init_rwsem(&cell->fs_open_mmaps_lock); + init_rwsem(&cell->fs_lock); rwlock_init(&cell->vl_servers_lock); cell->flags = (1 << AFS_CELL_FL_CHECK_ALIAS); @@ -180,6 +183,7 @@ static struct afs_cell *afs_alloc_cell(struct afs_net *net, VL_SERVICE, AFS_VL_PORT); if (IS_ERR(vllist)) { ret = PTR_ERR(vllist); + vllist = NULL; goto parse_failed; } @@ -202,7 +206,13 @@ static struct afs_cell *afs_alloc_cell(struct afs_net *net, cell->dns_status = vllist->status; smp_store_release(&cell->dns_lookup_count, 1); /* vs source/status */ atomic_inc(&net->cells_outstanding); + ret = idr_alloc_cyclic(&net->cells_dyn_ino, cell, + 2, INT_MAX / 2, GFP_KERNEL); + if (ret < 0) + goto error; + cell->dynroot_ino = ret; cell->debug_id = atomic_inc_return(&cell_debug_id); + trace_afs_cell(cell->debug_id, 1, 0, afs_cell_trace_alloc); _leave(" = %p", cell); @@ -212,7 +222,8 @@ parse_failed: if (ret == -EINVAL) printk(KERN_ERR "kAFS: bad VL server IP address\n"); error: - kfree(cell->name); + afs_put_vlserverlist(cell->net, vllist); + kfree(cell->name - 1); kfree(cell); _leave(" = %d", ret); return ERR_PTR(ret); @@ -224,7 +235,8 @@ error: * @name: The name of the cell. * @namesz: The strlen of the cell name. * @vllist: A colon/comma separated list of numeric IP addresses or NULL. - * @excl: T if an error should be given if the cell name already exists. + * @reason: The reason we're doing the lookup + * @trace: The reason to be logged if the lookup is successful. * * Look up a cell record by name and query the DNS for VL server addresses if * needed. Note that that actual DNS query is punted off to the manager thread @@ -233,19 +245,27 @@ error: */ struct afs_cell *afs_lookup_cell(struct afs_net *net, const char *name, unsigned int namesz, - const char *vllist, bool excl) + const char *vllist, + enum afs_lookup_cell_for reason, + enum afs_cell_trace trace) { struct afs_cell *cell, *candidate, *cursor; struct rb_node *parent, **pp; enum afs_cell_state state; int ret, n; - _enter("%s,%s", name, vllist); + _enter("%s,%s,%u", name, vllist, reason); - if (!excl) { - cell = afs_find_cell(net, name, namesz, afs_cell_trace_use_lookup); - if (!IS_ERR(cell)) + if (reason != AFS_LOOKUP_CELL_PRELOAD) { + cell = afs_find_cell(net, name, namesz, trace); + if (!IS_ERR(cell)) { + if (reason == AFS_LOOKUP_CELL_DYNROOT) + goto no_wait; + if (cell->state == AFS_CELL_SETTING_UP || + cell->state == AFS_CELL_UNLOOKED) + goto lookup_cell; goto wait_for_cell; + } } /* Assume we're probably going to create a cell and preallocate and @@ -286,29 +306,74 @@ struct afs_cell *afs_lookup_cell(struct afs_net *net, cell = candidate; candidate = NULL; - atomic_set(&cell->active, 2); - trace_afs_cell(cell->debug_id, refcount_read(&cell->ref), 2, afs_cell_trace_insert); + afs_use_cell(cell, trace); rb_link_node_rcu(&cell->net_node, parent, pp); rb_insert_color(&cell->net_node, &net->cells); up_write(&net->cells_lock); - afs_queue_cell(cell, afs_cell_trace_get_queue_new); +lookup_cell: + if (reason != AFS_LOOKUP_CELL_PRELOAD && + reason != AFS_LOOKUP_CELL_ROOTCELL) { + set_bit(AFS_CELL_FL_DO_LOOKUP, &cell->flags); + afs_queue_cell(cell, afs_cell_trace_queue_new); + } wait_for_cell: - trace_afs_cell(cell->debug_id, refcount_read(&cell->ref), atomic_read(&cell->active), - afs_cell_trace_wait); - _debug("wait_for_cell"); - wait_var_event(&cell->state, - ({ - state = smp_load_acquire(&cell->state); /* vs error */ - state == AFS_CELL_ACTIVE || state == AFS_CELL_REMOVED; - })); + state = smp_load_acquire(&cell->state); /* vs error */ + switch (state) { + case AFS_CELL_ACTIVE: + case AFS_CELL_DEAD: + break; + case AFS_CELL_UNLOOKED: + default: + if (reason == AFS_LOOKUP_CELL_PRELOAD || + reason == AFS_LOOKUP_CELL_ROOTCELL) + break; + _debug("wait_for_cell"); + afs_see_cell(cell, afs_cell_trace_wait); + wait_var_event(&cell->state, + ({ + state = smp_load_acquire(&cell->state); /* vs error */ + state == AFS_CELL_ACTIVE || state == AFS_CELL_DEAD; + })); + _debug("waited_for_cell %d %d", cell->state, cell->error); + } +no_wait: /* Check the state obtained from the wait check. */ - if (state == AFS_CELL_REMOVED) { + state = smp_load_acquire(&cell->state); /* vs error */ + if (state == AFS_CELL_DEAD) { ret = cell->error; goto error; } + if (state == AFS_CELL_ACTIVE) { + switch (cell->dns_status) { + case DNS_LOOKUP_NOT_DONE: + if (cell->dns_source == DNS_RECORD_FROM_CONFIG) { + ret = 0; + break; + } + fallthrough; + default: + ret = -EIO; + goto error; + case DNS_LOOKUP_GOOD: + case DNS_LOOKUP_GOOD_WITH_BAD: + ret = 0; + break; + case DNS_LOOKUP_GOT_NOT_FOUND: + ret = -ENOENT; + goto error; + case DNS_LOOKUP_BAD: + ret = -EREMOTEIO; + goto error; + case DNS_LOOKUP_GOT_LOCAL_FAILURE: + case DNS_LOOKUP_GOT_TEMP_FAILURE: + case DNS_LOOKUP_GOT_NS_FAILURE: + ret = -EDESTADDRREQ; + goto error; + } + } _leave(" = %p [cell]", cell); return cell; @@ -316,10 +381,10 @@ wait_for_cell: cell_already_exists: _debug("cell exists"); cell = cursor; - if (excl) { + if (reason == AFS_LOOKUP_CELL_PRELOAD) { ret = -EEXIST; } else { - afs_use_cell(cursor, afs_cell_trace_use_lookup); + afs_use_cell(cursor, trace); ret = 0; } up_write(&net->cells_lock); @@ -329,7 +394,7 @@ cell_already_exists: goto wait_for_cell; goto error_noput; error: - afs_unuse_cell(net, cell, afs_cell_trace_unuse_lookup); + afs_unuse_cell(cell, afs_cell_trace_unuse_lookup_error); error_noput: _leave(" = %d [error]", ret); return ERR_PTR(ret); @@ -366,8 +431,18 @@ int afs_cell_init(struct afs_net *net, const char *rootcell) len = cp - rootcell; } - /* allocate a cell record for the root cell */ - new_root = afs_lookup_cell(net, rootcell, len, vllist, false); + if (len == 0 || !rootcell[0] || rootcell[0] == '.' || rootcell[len - 1] == '.') + return -EINVAL; + if (memchr(rootcell, '/', len)) + return -EINVAL; + cp = strstr(rootcell, ".."); + if (cp && cp < rootcell + len) + return -EINVAL; + + /* allocate a cell record for the root/workstation cell */ + new_root = afs_lookup_cell(net, rootcell, len, vllist, + AFS_LOOKUP_CELL_ROOTCELL, + afs_cell_trace_use_lookup_ws); if (IS_ERR(new_root)) { _leave(" = %ld", PTR_ERR(new_root)); return PTR_ERR(new_root); @@ -378,12 +453,11 @@ int afs_cell_init(struct afs_net *net, const char *rootcell) /* install the new cell */ down_write(&net->cells_lock); - afs_see_cell(new_root, afs_cell_trace_see_ws); - old_root = net->ws_cell; - net->ws_cell = new_root; + old_root = rcu_replace_pointer(net->ws_cell, new_root, + lockdep_is_held(&net->cells_lock)); up_write(&net->cells_lock); - afs_unuse_cell(net, old_root, afs_cell_trace_unuse_ws); + afs_unuse_cell(old_root, afs_cell_trace_unuse_ws); _leave(" = 0"); return 0; } @@ -409,10 +483,12 @@ static int afs_update_cell(struct afs_cell *cell) if (ret == -ENOMEM) goto out_wake; - ret = -ENOMEM; vllist = afs_alloc_vlserver_list(0); - if (!vllist) + if (!vllist) { + if (ret >= 0) + ret = -ENOMEM; goto out_wake; + } switch (ret) { case -ENODATA: @@ -499,39 +575,24 @@ static void afs_cell_destroy(struct rcu_head *rcu) trace_afs_cell(cell->debug_id, r, atomic_read(&cell->active), afs_cell_trace_free); afs_put_vlserverlist(net, rcu_access_pointer(cell->vl_servers)); - afs_unuse_cell(net, cell->alias_of, afs_cell_trace_unuse_alias); + afs_unuse_cell(cell->alias_of, afs_cell_trace_unuse_alias); key_put(cell->anonymous_key); - kfree(cell->name); + idr_remove(&net->cells_dyn_ino, cell->dynroot_ino); + kfree(cell->name - 1); kfree(cell); afs_dec_cells_outstanding(net); _leave(" [destroyed]"); } -/* - * Queue the cell manager. - */ -static void afs_queue_cell_manager(struct afs_net *net) -{ - int outstanding = atomic_inc_return(&net->cells_outstanding); - - _enter("%d", outstanding); - - if (!queue_work(afs_wq, &net->cells_manager)) - afs_dec_cells_outstanding(net); -} - -/* - * Cell management timer. We have an increment on cells_outstanding that we - * need to pass along to the work item. - */ -void afs_cells_timer(struct timer_list *timer) +static void afs_destroy_cell_work(struct work_struct *work) { - struct afs_net *net = container_of(timer, struct afs_net, cells_timer); + struct afs_cell *cell = container_of(work, struct afs_cell, destroyer); - _enter(""); - if (!queue_work(afs_wq, &net->cells_manager)) - afs_dec_cells_outstanding(net); + afs_see_cell(cell, afs_cell_trace_destroy); + timer_delete_sync(&cell->management_timer); + cancel_work_sync(&cell->manager); + call_rcu(&cell->rcu, afs_cell_destroy); } /* @@ -563,7 +624,7 @@ void afs_put_cell(struct afs_cell *cell, enum afs_cell_trace reason) if (zero) { a = atomic_read(&cell->active); WARN(a != 0, "Cell active count %u > 0\n", a); - call_rcu(&cell->rcu, afs_cell_destroy); + WARN_ON(!queue_work(afs_wq, &cell->destroyer)); } } } @@ -575,10 +636,9 @@ struct afs_cell *afs_use_cell(struct afs_cell *cell, enum afs_cell_trace reason) { int r, a; - r = refcount_read(&cell->ref); - WARN_ON(r == 0); + __refcount_inc(&cell->ref, &r); a = atomic_inc_return(&cell->active); - trace_afs_cell(cell->debug_id, r, a, reason); + trace_afs_cell(cell->debug_id, r + 1, a, reason); return cell; } @@ -586,10 +646,11 @@ struct afs_cell *afs_use_cell(struct afs_cell *cell, enum afs_cell_trace reason) * Record a cell becoming less active. When the active counter reaches 1, it * is scheduled for destruction, but may get reactivated. */ -void afs_unuse_cell(struct afs_net *net, struct afs_cell *cell, enum afs_cell_trace reason) +void afs_unuse_cell(struct afs_cell *cell, enum afs_cell_trace reason) { unsigned int debug_id; time64_t now, expire_delay; + bool zero; int r, a; if (!cell) @@ -604,13 +665,15 @@ void afs_unuse_cell(struct afs_net *net, struct afs_cell *cell, enum afs_cell_tr expire_delay = afs_cell_gc_delay; debug_id = cell->debug_id; - r = refcount_read(&cell->ref); a = atomic_dec_return(&cell->active); - trace_afs_cell(debug_id, r, a, reason); - WARN_ON(a == 0); - if (a == 1) + if (!a) /* 'cell' may now be garbage collected. */ - afs_set_cell_timer(net, expire_delay); + afs_set_cell_timer(cell, expire_delay); + + zero = __refcount_dec_and_test(&cell->ref, &r); + trace_afs_cell(debug_id, r - 1, a, reason); + if (zero) + WARN_ON(!queue_work(afs_wq, &cell->destroyer)); } /* @@ -630,36 +693,27 @@ void afs_see_cell(struct afs_cell *cell, enum afs_cell_trace reason) */ void afs_queue_cell(struct afs_cell *cell, enum afs_cell_trace reason) { - afs_get_cell(cell, reason); - if (!queue_work(afs_wq, &cell->manager)) - afs_put_cell(cell, afs_cell_trace_put_queue_fail); + queue_work(afs_wq, &cell->manager); } /* - * Allocate a key to use as a placeholder for anonymous user security. + * Cell-specific management timer. */ -static int afs_alloc_anon_key(struct afs_cell *cell) +static void afs_cell_timer(struct timer_list *timer) { - struct key *key; - char keyname[4 + AFS_MAXCELLNAME + 1], *cp, *dp; - - /* Create a key to represent an anonymous user. */ - memcpy(keyname, "afs@", 4); - dp = keyname + 4; - cp = cell->name; - do { - *dp++ = tolower(*cp); - } while (*cp++); + struct afs_cell *cell = container_of(timer, struct afs_cell, management_timer); - key = rxrpc_get_null_key(keyname); - if (IS_ERR(key)) - return PTR_ERR(key); - - cell->anonymous_key = key; + afs_see_cell(cell, afs_cell_trace_see_mgmt_timer); + if (refcount_read(&cell->ref) > 0 && cell->net->live) + queue_work(afs_wq, &cell->manager); +} - _debug("anon key %p{%x}", - cell->anonymous_key, key_serial(cell->anonymous_key)); - return 0; +/* + * Set/reduce the cell timer. + */ +void afs_set_cell_timer(struct afs_cell *cell, unsigned int delay_secs) +{ + timer_reduce(&cell->management_timer, jiffies + delay_secs * HZ); } /* @@ -671,12 +725,6 @@ static int afs_activate_cell(struct afs_net *net, struct afs_cell *cell) struct afs_cell *pcell; int ret; - if (!cell->anonymous_key) { - ret = afs_alloc_anon_key(cell); - if (ret < 0) - return ret; - } - ret = afs_proc_cell_setup(cell); if (ret < 0) return ret; @@ -694,7 +742,6 @@ static int afs_activate_cell(struct afs_net *net, struct afs_cell *cell) if (cell->proc_link.next) cell->proc_link.next->pprev = &cell->proc_link.next; - afs_dynroot_mkdir(net, cell); mutex_unlock(&net->proc_cells_lock); return 0; } @@ -709,242 +756,167 @@ static void afs_deactivate_cell(struct afs_net *net, struct afs_cell *cell) afs_proc_cell_remove(cell); mutex_lock(&net->proc_cells_lock); - hlist_del_rcu(&cell->proc_link); - afs_dynroot_rmdir(net, cell); + if (!hlist_unhashed(&cell->proc_link)) + hlist_del_rcu(&cell->proc_link); mutex_unlock(&net->proc_cells_lock); _leave(""); } +static bool afs_has_cell_expired(struct afs_cell *cell, time64_t *_next_manage) +{ + const struct afs_vlserver_list *vllist; + time64_t expire_at = cell->last_inactive; + time64_t now = ktime_get_real_seconds(); + + if (atomic_read(&cell->active)) + return false; + if (!cell->net->live) + return true; + + vllist = rcu_dereference_protected(cell->vl_servers, true); + if (vllist && vllist->nr_servers > 0) + expire_at += afs_cell_gc_delay; + + if (expire_at <= now) + return true; + if (expire_at < *_next_manage) + *_next_manage = expire_at; + return false; +} + /* * Manage a cell record, initialising and destroying it, maintaining its DNS * records. */ -static void afs_manage_cell(struct afs_cell *cell) +static bool afs_manage_cell(struct afs_cell *cell) { struct afs_net *net = cell->net; - int ret, active; + time64_t next_manage = TIME64_MAX; + int ret; _enter("%s", cell->name); -again: _debug("state %u", cell->state); switch (cell->state) { - case AFS_CELL_INACTIVE: - case AFS_CELL_FAILED: - down_write(&net->cells_lock); - active = 1; - if (atomic_try_cmpxchg_relaxed(&cell->active, &active, 0)) { - rb_erase(&cell->net_node, &net->cells); - trace_afs_cell(cell->debug_id, refcount_read(&cell->ref), 0, - afs_cell_trace_unuse_delete); - smp_store_release(&cell->state, AFS_CELL_REMOVED); - } - up_write(&net->cells_lock); - if (cell->state == AFS_CELL_REMOVED) { - wake_up_var(&cell->state); - goto final_destruction; - } - if (cell->state == AFS_CELL_FAILED) - goto done; - smp_store_release(&cell->state, AFS_CELL_UNSET); - wake_up_var(&cell->state); - goto again; - - case AFS_CELL_UNSET: - smp_store_release(&cell->state, AFS_CELL_ACTIVATING); - wake_up_var(&cell->state); - goto again; - - case AFS_CELL_ACTIVATING: - ret = afs_activate_cell(net, cell); - if (ret < 0) - goto activation_failed; + case AFS_CELL_SETTING_UP: + goto set_up_cell; + case AFS_CELL_UNLOOKED: + case AFS_CELL_ACTIVE: + goto cell_is_active; + case AFS_CELL_REMOVING: + WARN_ON_ONCE(1); + return false; + case AFS_CELL_DEAD: + return false; + default: + _debug("bad state %u", cell->state); + WARN_ON_ONCE(1); /* Unhandled state */ + return false; + } - smp_store_release(&cell->state, AFS_CELL_ACTIVE); - wake_up_var(&cell->state); - goto again; +set_up_cell: + ret = afs_activate_cell(net, cell); + if (ret < 0) { + cell->error = ret; + goto remove_cell; + } - case AFS_CELL_ACTIVE: - if (atomic_read(&cell->active) > 1) { - if (test_and_clear_bit(AFS_CELL_FL_DO_LOOKUP, &cell->flags)) { - ret = afs_update_cell(cell); - if (ret < 0) - cell->error = ret; - } - goto done; - } - smp_store_release(&cell->state, AFS_CELL_DEACTIVATING); - wake_up_var(&cell->state); - goto again; + afs_set_cell_state(cell, AFS_CELL_UNLOOKED); - case AFS_CELL_DEACTIVATING: - if (atomic_read(&cell->active) > 1) - goto reverse_deactivation; - afs_deactivate_cell(net, cell); - smp_store_release(&cell->state, AFS_CELL_INACTIVE); - wake_up_var(&cell->state); - goto again; +cell_is_active: + if (afs_has_cell_expired(cell, &next_manage)) + goto remove_cell; - case AFS_CELL_REMOVED: - goto done; + if (test_and_clear_bit(AFS_CELL_FL_DO_LOOKUP, &cell->flags)) { + ret = afs_update_cell(cell); + if (ret < 0) + cell->error = ret; + if (cell->state == AFS_CELL_UNLOOKED) + afs_set_cell_state(cell, AFS_CELL_ACTIVE); + } - default: - break; + if (next_manage < TIME64_MAX && cell->net->live) { + time64_t now = ktime_get_real_seconds(); + + if (next_manage - now <= 0) + afs_queue_cell(cell, afs_cell_trace_queue_again); + else + afs_set_cell_timer(cell, next_manage - now); } - _debug("bad state %u", cell->state); - BUG(); /* Unhandled state */ + _leave(" [done %u]", cell->state); + return false; -activation_failed: - cell->error = ret; - afs_deactivate_cell(net, cell); +remove_cell: + down_write(&net->cells_lock); - smp_store_release(&cell->state, AFS_CELL_FAILED); /* vs error */ - wake_up_var(&cell->state); - goto again; + if (atomic_read(&cell->active)) { + up_write(&net->cells_lock); + goto cell_is_active; + } -reverse_deactivation: - smp_store_release(&cell->state, AFS_CELL_ACTIVE); - wake_up_var(&cell->state); - _leave(" [deact->act]"); - return; + /* Make sure that the expiring server records are going to see the fact + * that the cell is caput. + */ + afs_set_cell_state(cell, AFS_CELL_REMOVING); -done: - _leave(" [done %u]", cell->state); - return; + afs_deactivate_cell(net, cell); + afs_purge_servers(cell); + + rb_erase(&cell->net_node, &net->cells); + afs_see_cell(cell, afs_cell_trace_unuse_delete); + up_write(&net->cells_lock); -final_destruction: /* The root volume is pinning the cell */ - afs_put_volume(cell->net, cell->root_volume, afs_volume_trace_put_cell_root); + afs_put_volume(cell->root_volume, afs_volume_trace_put_cell_root); cell->root_volume = NULL; - afs_put_cell(cell, afs_cell_trace_put_destroy); + + afs_set_cell_state(cell, AFS_CELL_DEAD); + return true; } static void afs_manage_cell_work(struct work_struct *work) { struct afs_cell *cell = container_of(work, struct afs_cell, manager); + bool final_put; - afs_manage_cell(cell); - afs_put_cell(cell, afs_cell_trace_put_queue_work); + afs_see_cell(cell, afs_cell_trace_manage); + final_put = afs_manage_cell(cell); + afs_see_cell(cell, afs_cell_trace_managed); + if (final_put) + afs_put_cell(cell, afs_cell_trace_put_final); } /* - * Manage the records of cells known to a network namespace. This includes - * updating the DNS records and garbage collecting unused cells that were - * automatically added. - * - * Note that constructed cell records may only be removed from net->cells by - * this work item, so it is safe for this work item to stash a cursor pointing - * into the tree and then return to caller (provided it skips cells that are - * still under construction). - * - * Note also that we were given an increment on net->cells_outstanding by - * whoever queued us that we need to deal with before returning. + * Purge in-memory cell database. */ -void afs_manage_cells(struct work_struct *work) +void afs_cell_purge(struct afs_net *net) { - struct afs_net *net = container_of(work, struct afs_net, cells_manager); + struct afs_cell *ws; struct rb_node *cursor; - time64_t now = ktime_get_real_seconds(), next_manage = TIME64_MAX; - bool purging = !net->live; _enter(""); - /* Trawl the cell database looking for cells that have expired from - * lack of use and cells whose DNS results have expired and dispatch - * their managers. - */ - down_read(&net->cells_lock); + down_write(&net->cells_lock); + ws = rcu_replace_pointer(net->ws_cell, NULL, + lockdep_is_held(&net->cells_lock)); + up_write(&net->cells_lock); + afs_unuse_cell(ws, afs_cell_trace_unuse_ws); + _debug("kick cells"); + down_read(&net->cells_lock); for (cursor = rb_first(&net->cells); cursor; cursor = rb_next(cursor)) { - struct afs_cell *cell = - rb_entry(cursor, struct afs_cell, net_node); - unsigned active; - bool sched_cell = false; - - active = atomic_read(&cell->active); - trace_afs_cell(cell->debug_id, refcount_read(&cell->ref), - active, afs_cell_trace_manage); - - ASSERTCMP(active, >=, 1); - - if (purging) { - if (test_and_clear_bit(AFS_CELL_FL_NO_GC, &cell->flags)) { - active = atomic_dec_return(&cell->active); - trace_afs_cell(cell->debug_id, refcount_read(&cell->ref), - active, afs_cell_trace_unuse_pin); - } - } + struct afs_cell *cell = rb_entry(cursor, struct afs_cell, net_node); - if (active == 1) { - struct afs_vlserver_list *vllist; - time64_t expire_at = cell->last_inactive; - - read_lock(&cell->vl_servers_lock); - vllist = rcu_dereference_protected( - cell->vl_servers, - lockdep_is_held(&cell->vl_servers_lock)); - if (vllist->nr_servers > 0) - expire_at += afs_cell_gc_delay; - read_unlock(&cell->vl_servers_lock); - if (purging || expire_at <= now) - sched_cell = true; - else if (expire_at < next_manage) - next_manage = expire_at; - } + afs_see_cell(cell, afs_cell_trace_purge); - if (!purging) { - if (test_bit(AFS_CELL_FL_DO_LOOKUP, &cell->flags)) - sched_cell = true; - } + if (test_and_clear_bit(AFS_CELL_FL_NO_GC, &cell->flags)) + afs_unuse_cell(cell, afs_cell_trace_unuse_pin); - if (sched_cell) - afs_queue_cell(cell, afs_cell_trace_get_queue_manage); + afs_queue_cell(cell, afs_cell_trace_queue_purge); } - up_read(&net->cells_lock); - /* Update the timer on the way out. We have to pass an increment on - * cells_outstanding in the namespace that we are in to the timer or - * the work scheduler. - */ - if (!purging && next_manage < TIME64_MAX) { - now = ktime_get_real_seconds(); - - if (next_manage - now <= 0) { - if (queue_work(afs_wq, &net->cells_manager)) - atomic_inc(&net->cells_outstanding); - } else { - afs_set_cell_timer(net, next_manage - now); - } - } - - afs_dec_cells_outstanding(net); - _leave(" [%d]", atomic_read(&net->cells_outstanding)); -} - -/* - * Purge in-memory cell database. - */ -void afs_cell_purge(struct afs_net *net) -{ - struct afs_cell *ws; - - _enter(""); - - down_write(&net->cells_lock); - ws = net->ws_cell; - net->ws_cell = NULL; - up_write(&net->cells_lock); - afs_unuse_cell(net, ws, afs_cell_trace_unuse_ws); - - _debug("del timer"); - if (del_timer_sync(&net->cells_timer)) - atomic_dec(&net->cells_outstanding); - - _debug("kick mgr"); - afs_queue_cell_manager(net); - _debug("wait"); wait_var_event(&net->cells_outstanding, !atomic_read(&net->cells_outstanding)); diff --git a/fs/afs/cm_security.c b/fs/afs/cm_security.c new file mode 100644 index 000000000000..edcbd249d202 --- /dev/null +++ b/fs/afs/cm_security.c @@ -0,0 +1,340 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* Cache manager security. + * + * Copyright (C) 2025 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#include <linux/slab.h> +#include <crypto/krb5.h> +#include "internal.h" +#include "afs_cm.h" +#include "afs_fs.h" +#include "protocol_yfs.h" +#define RXRPC_TRACE_ONLY_DEFINE_ENUMS +#include <trace/events/rxrpc.h> + +#define RXGK_SERVER_ENC_TOKEN 1036U // 0x40c +#define xdr_round_up(x) (round_up((x), sizeof(__be32))) +#define xdr_len_object(x) (4 + round_up((x), sizeof(__be32))) + +#ifdef CONFIG_RXGK +static int afs_create_yfs_cm_token(struct sk_buff *challenge, + struct afs_server *server); +#endif + +/* + * Respond to an RxGK challenge, adding appdata. + */ +static int afs_respond_to_challenge(struct sk_buff *challenge) +{ +#ifdef CONFIG_RXGK + struct krb5_buffer appdata = {}; + struct afs_server *server; +#endif + struct rxrpc_peer *peer; + unsigned long peer_data; + u16 service_id; + u8 security_index; + + rxrpc_kernel_query_challenge(challenge, &peer, &peer_data, + &service_id, &security_index); + + _enter("%u,%u", service_id, security_index); + + switch (service_id) { + /* We don't send CM_SERVICE RPCs, so don't expect a challenge + * therefrom. + */ + case FS_SERVICE: + case VL_SERVICE: + case YFS_FS_SERVICE: + case YFS_VL_SERVICE: + break; + default: + pr_warn("Can't respond to unknown challenge %u:%u", + service_id, security_index); + return rxrpc_kernel_reject_challenge(challenge, RX_USER_ABORT, -EPROTO, + afs_abort_unsupported_sec_class); + } + + switch (security_index) { +#ifdef CONFIG_RXKAD + case RXRPC_SECURITY_RXKAD: + return rxkad_kernel_respond_to_challenge(challenge); +#endif + +#ifdef CONFIG_RXGK + case RXRPC_SECURITY_RXGK: + return rxgk_kernel_respond_to_challenge(challenge, &appdata); + + case RXRPC_SECURITY_YFS_RXGK: + switch (service_id) { + case FS_SERVICE: + case YFS_FS_SERVICE: + server = (struct afs_server *)peer_data; + if (!server->cm_rxgk_appdata.data) { + mutex_lock(&server->cm_token_lock); + if (!server->cm_rxgk_appdata.data) + afs_create_yfs_cm_token(challenge, server); + mutex_unlock(&server->cm_token_lock); + } + if (server->cm_rxgk_appdata.data) + appdata = server->cm_rxgk_appdata; + break; + } + return rxgk_kernel_respond_to_challenge(challenge, &appdata); +#endif + + default: + return rxrpc_kernel_reject_challenge(challenge, RX_USER_ABORT, -EPROTO, + afs_abort_unsupported_sec_class); + } +} + +/* + * Process the OOB message queue, processing challenge packets. + */ +void afs_process_oob_queue(struct work_struct *work) +{ + struct afs_net *net = container_of(work, struct afs_net, rx_oob_work); + struct sk_buff *oob; + enum rxrpc_oob_type type; + + while ((oob = rxrpc_kernel_dequeue_oob(net->socket, &type))) { + switch (type) { + case RXRPC_OOB_CHALLENGE: + afs_respond_to_challenge(oob); + break; + } + rxrpc_kernel_free_oob(oob); + } +} + +#ifdef CONFIG_RXGK +/* + * Create a securities keyring for the cache manager and attach a key to it for + * the RxGK tokens we want to use to secure the callback connection back from + * the fileserver. + */ +int afs_create_token_key(struct afs_net *net, struct socket *socket) +{ + const struct krb5_enctype *krb5; + struct key *ring; + key_ref_t key; + char K0[32], *desc; + int ret; + + ring = keyring_alloc("kafs", + GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, current_cred(), + KEY_POS_SEARCH | KEY_POS_WRITE | + KEY_USR_VIEW | KEY_USR_READ | KEY_USR_SEARCH, + KEY_ALLOC_NOT_IN_QUOTA, + NULL, NULL); + if (IS_ERR(ring)) + return PTR_ERR(ring); + + ret = rxrpc_sock_set_security_keyring(socket->sk, ring); + if (ret < 0) + goto out; + + ret = -ENOPKG; + krb5 = crypto_krb5_find_enctype(KRB5_ENCTYPE_AES128_CTS_HMAC_SHA1_96); + if (!krb5) + goto out; + + if (WARN_ON_ONCE(krb5->key_len > sizeof(K0))) + goto out; + + ret = -ENOMEM; + desc = kasprintf(GFP_KERNEL, "%u:%u:%u:%u", + YFS_CM_SERVICE, RXRPC_SECURITY_YFS_RXGK, 1, krb5->etype); + if (!desc) + goto out; + + wait_for_random_bytes(); + get_random_bytes(K0, krb5->key_len); + + key = key_create(make_key_ref(ring, true), + "rxrpc_s", desc, + K0, krb5->key_len, + KEY_POS_VIEW | KEY_POS_READ | KEY_POS_SEARCH | KEY_USR_VIEW, + KEY_ALLOC_NOT_IN_QUOTA); + kfree(desc); + if (IS_ERR(key)) { + ret = PTR_ERR(key); + goto out; + } + + net->fs_cm_token_key = key_ref_to_ptr(key); + ret = 0; +out: + key_put(ring); + return ret; +} + +/* + * Create an YFS RxGK GSS token to use as a ticket to the specified fileserver. + */ +static int afs_create_yfs_cm_token(struct sk_buff *challenge, + struct afs_server *server) +{ + const struct krb5_enctype *conn_krb5, *token_krb5; + const struct krb5_buffer *token_key; + struct crypto_aead *aead; + struct scatterlist sg; + struct afs_net *net = server->cell->net; + const struct key *key = net->fs_cm_token_key; + size_t keysize, uuidsize, authsize, toksize, encsize, contsize, adatasize, offset; + __be32 caps[1] = { + [0] = htonl(AFS_CAP_ERROR_TRANSLATION), + }; + __be32 *xdr; + void *appdata, *K0, *encbase; + u32 enctype; + int ret; + + if (!key) + return -ENOKEY; + + /* Assume that the fileserver is happy to use the same encoding type as + * we were told to use by the token obtained by the user. + */ + enctype = rxgk_kernel_query_challenge(challenge); + + conn_krb5 = crypto_krb5_find_enctype(enctype); + if (!conn_krb5) + return -ENOPKG; + token_krb5 = key->payload.data[0]; + token_key = (const struct krb5_buffer *)&key->payload.data[2]; + + /* struct rxgk_key { + * afs_uint32 enctype; + * opaque key<>; + * }; + */ + keysize = 4 + xdr_len_object(conn_krb5->key_len); + + /* struct RXGK_AuthName { + * afs_int32 kind; + * opaque data<AUTHDATAMAX>; + * opaque display<AUTHPRINTABLEMAX>; + * }; + */ + uuidsize = sizeof(server->uuid); + authsize = 4 + xdr_len_object(uuidsize) + xdr_len_object(0); + + /* struct RXGK_Token { + * rxgk_key K0; + * RXGK_Level level; + * rxgkTime starttime; + * afs_int32 lifetime; + * afs_int32 bytelife; + * rxgkTime expirationtime; + * struct RXGK_AuthName identities<>; + * }; + */ + toksize = keysize + 8 + 4 + 4 + 8 + xdr_len_object(authsize); + + offset = 0; + encsize = crypto_krb5_how_much_buffer(token_krb5, KRB5_ENCRYPT_MODE, toksize, &offset); + + /* struct RXGK_TokenContainer { + * afs_int32 kvno; + * afs_int32 enctype; + * opaque encrypted_token<>; + * }; + */ + contsize = 4 + 4 + xdr_len_object(encsize); + + /* struct YFSAppData { + * opr_uuid initiatorUuid; + * opr_uuid acceptorUuid; + * Capabilities caps; + * afs_int32 enctype; + * opaque callbackKey<>; + * opaque callbackToken<>; + * }; + */ + adatasize = 16 + 16 + + xdr_len_object(sizeof(caps)) + + 4 + + xdr_len_object(conn_krb5->key_len) + + xdr_len_object(contsize); + + ret = -ENOMEM; + appdata = kzalloc(adatasize, GFP_KERNEL); + if (!appdata) + goto out; + xdr = appdata; + + memcpy(xdr, &net->uuid, 16); /* appdata.initiatorUuid */ + xdr += 16 / 4; + memcpy(xdr, &server->uuid, 16); /* appdata.acceptorUuid */ + xdr += 16 / 4; + *xdr++ = htonl(ARRAY_SIZE(caps)); /* appdata.caps.len */ + memcpy(xdr, &caps, sizeof(caps)); /* appdata.caps */ + xdr += ARRAY_SIZE(caps); + *xdr++ = htonl(conn_krb5->etype); /* appdata.enctype */ + + *xdr++ = htonl(conn_krb5->key_len); /* appdata.callbackKey.len */ + K0 = xdr; + get_random_bytes(K0, conn_krb5->key_len); /* appdata.callbackKey.data */ + xdr += xdr_round_up(conn_krb5->key_len) / 4; + + *xdr++ = htonl(contsize); /* appdata.callbackToken.len */ + *xdr++ = htonl(1); /* cont.kvno */ + *xdr++ = htonl(token_krb5->etype); /* cont.enctype */ + *xdr++ = htonl(encsize); /* cont.encrypted_token.len */ + + encbase = xdr; + xdr += offset / 4; + *xdr++ = htonl(conn_krb5->etype); /* token.K0.enctype */ + *xdr++ = htonl(conn_krb5->key_len); /* token.K0.key.len */ + memcpy(xdr, K0, conn_krb5->key_len); /* token.K0.key.data */ + xdr += xdr_round_up(conn_krb5->key_len) / 4; + + *xdr++ = htonl(RXRPC_SECURITY_ENCRYPT); /* token.level */ + *xdr++ = htonl(0); /* token.starttime */ + *xdr++ = htonl(0); /* " */ + *xdr++ = htonl(0); /* token.lifetime */ + *xdr++ = htonl(0); /* token.bytelife */ + *xdr++ = htonl(0); /* token.expirationtime */ + *xdr++ = htonl(0); /* " */ + *xdr++ = htonl(1); /* token.identities.count */ + *xdr++ = htonl(0); /* token.identities[0].kind */ + *xdr++ = htonl(uuidsize); /* token.identities[0].data.len */ + memcpy(xdr, &server->uuid, uuidsize); + xdr += xdr_round_up(uuidsize) / 4; + *xdr++ = htonl(0); /* token.identities[0].display.len */ + + xdr = encbase + xdr_round_up(encsize); + + if ((unsigned long)xdr - (unsigned long)appdata != adatasize) + pr_err("Appdata size incorrect %lx != %zx\n", + (unsigned long)xdr - (unsigned long)appdata, adatasize); + + aead = crypto_krb5_prepare_encryption(token_krb5, token_key, RXGK_SERVER_ENC_TOKEN, + GFP_KERNEL); + if (IS_ERR(aead)) { + ret = PTR_ERR(aead); + goto out_token; + } + + sg_init_one(&sg, encbase, encsize); + ret = crypto_krb5_encrypt(token_krb5, aead, &sg, 1, encsize, offset, toksize, false); + if (ret < 0) + goto out_aead; + + server->cm_rxgk_appdata.len = adatasize; + server->cm_rxgk_appdata.data = appdata; + appdata = NULL; + +out_aead: + crypto_free_aead(aead); +out_token: + kfree(appdata); +out: + return ret; +} +#endif /* CONFIG_RXGK */ diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c index d4ddb20d6732..1a906805a9e3 100644 --- a/fs/afs/cmservice.c +++ b/fs/afs/cmservice.c @@ -139,48 +139,6 @@ bool afs_cm_incoming_call(struct afs_call *call) } /* - * Find the server record by peer address and record a probe to the cache - * manager from a server. - */ -static int afs_find_cm_server_by_peer(struct afs_call *call) -{ - struct sockaddr_rxrpc srx; - struct afs_server *server; - - rxrpc_kernel_get_peer(call->net->socket, call->rxcall, &srx); - - server = afs_find_server(call->net, &srx); - if (!server) { - trace_afs_cm_no_server(call, &srx); - return 0; - } - - call->server = server; - return 0; -} - -/* - * Find the server record by server UUID and record a probe to the cache - * manager from a server. - */ -static int afs_find_cm_server_by_uuid(struct afs_call *call, - struct afs_uuid *uuid) -{ - struct afs_server *server; - - rcu_read_lock(); - server = afs_find_server_by_uuid(call->net, call->request); - rcu_read_unlock(); - if (!server) { - trace_afs_cm_no_server_u(call, call->request); - return 0; - } - - call->server = server; - return 0; -} - -/* * Clean up a cache manager call. */ static void afs_cm_destructor(struct afs_call *call) @@ -321,10 +279,7 @@ static int afs_deliver_cb_callback(struct afs_call *call) if (!afs_check_call_state(call, AFS_CALL_SV_REPLYING)) return afs_io_error(call, afs_io_error_cm_reply); - - /* we'll need the file server record as that tells us which set of - * vnodes to operate upon */ - return afs_find_cm_server_by_peer(call); + return 0; } /* @@ -348,18 +303,10 @@ static void SRXAFSCB_InitCallBackState(struct work_struct *work) */ static int afs_deliver_cb_init_call_back_state(struct afs_call *call) { - int ret; - _enter(""); afs_extract_discard(call, 0); - ret = afs_extract_data(call, false); - if (ret < 0) - return ret; - - /* we'll need the file server record as that tells us which set of - * vnodes to operate upon */ - return afs_find_cm_server_by_peer(call); + return afs_extract_data(call, false); } /* @@ -372,8 +319,6 @@ static int afs_deliver_cb_init_call_back_state3(struct afs_call *call) __be32 *b; int ret; - _enter(""); - _enter("{%u}", call->unmarshall); switch (call->unmarshall) { @@ -420,9 +365,13 @@ static int afs_deliver_cb_init_call_back_state3(struct afs_call *call) if (!afs_check_call_state(call, AFS_CALL_SV_REPLYING)) return afs_io_error(call, afs_io_error_cm_reply); - /* we'll need the file server record as that tells us which set of - * vnodes to operate upon */ - return afs_find_cm_server_by_uuid(call, call->request); + if (memcmp(call->request, &call->server->_uuid, sizeof(call->server->_uuid)) != 0) { + pr_notice("Callback UUID does not match fileserver UUID\n"); + trace_afs_cm_no_server_u(call, call->request); + return 0; + } + + return 0; } /* @@ -454,7 +403,7 @@ static int afs_deliver_cb_probe(struct afs_call *call) if (!afs_check_call_state(call, AFS_CALL_SV_REPLYING)) return afs_io_error(call, afs_io_error_cm_reply); - return afs_find_cm_server_by_peer(call); + return 0; } /* @@ -532,7 +481,7 @@ static int afs_deliver_cb_probe_uuid(struct afs_call *call) if (!afs_check_call_state(call, AFS_CALL_SV_REPLYING)) return afs_io_error(call, afs_io_error_cm_reply); - return afs_find_cm_server_by_peer(call); + return 0; } /* @@ -592,7 +541,7 @@ static int afs_deliver_cb_tell_me_about_yourself(struct afs_call *call) if (!afs_check_call_state(call, AFS_CALL_SV_REPLYING)) return afs_io_error(call, afs_io_error_cm_reply); - return afs_find_cm_server_by_peer(call); + return 0; } /* @@ -666,9 +615,5 @@ static int afs_deliver_yfs_cb_callback(struct afs_call *call) if (!afs_check_call_state(call, AFS_CALL_SV_REPLYING)) return afs_io_error(call, afs_io_error_cm_reply); - - /* We'll need the file server record as that tells us which set of - * vnodes to operate upon. - */ - return afs_find_cm_server_by_peer(call); + return 0; } diff --git a/fs/afs/dir.c b/fs/afs/dir.c index b7c1f8c84b38..f4e9e12373ac 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -12,6 +12,8 @@ #include <linux/swap.h> #include <linux/ctype.h> #include <linux/sched.h> +#include <linux/iversion.h> +#include <linux/iov_iter.h> #include <linux/task_io_accounting_ops.h> #include "internal.h" #include "afs_fs.h" @@ -21,35 +23,27 @@ static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags); static int afs_dir_open(struct inode *inode, struct file *file); static int afs_readdir(struct file *file, struct dir_context *ctx); -static int afs_d_revalidate(struct dentry *dentry, unsigned int flags); +static int afs_d_revalidate(struct inode *dir, const struct qstr *name, + struct dentry *dentry, unsigned int flags); static int afs_d_delete(const struct dentry *dentry); static void afs_d_iput(struct dentry *dentry, struct inode *inode); static bool afs_lookup_one_filldir(struct dir_context *ctx, const char *name, int nlen, loff_t fpos, u64 ino, unsigned dtype); static bool afs_lookup_filldir(struct dir_context *ctx, const char *name, int nlen, loff_t fpos, u64 ino, unsigned dtype); -static int afs_create(struct user_namespace *mnt_userns, struct inode *dir, +static int afs_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl); -static int afs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, - struct dentry *dentry, umode_t mode); +static struct dentry *afs_mkdir(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode); static int afs_rmdir(struct inode *dir, struct dentry *dentry); static int afs_unlink(struct inode *dir, struct dentry *dentry); static int afs_link(struct dentry *from, struct inode *dir, struct dentry *dentry); -static int afs_symlink(struct user_namespace *mnt_userns, struct inode *dir, +static int afs_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *content); -static int afs_rename(struct user_namespace *mnt_userns, struct inode *old_dir, +static int afs_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags); -static bool afs_dir_release_folio(struct folio *folio, gfp_t gfp_flags); -static void afs_dir_invalidate_folio(struct folio *folio, size_t offset, - size_t length); - -static bool afs_dir_dirty_folio(struct address_space *mapping, - struct folio *folio) -{ - BUG(); /* This should never happen. */ -} const struct file_operations afs_dir_file_operations = { .open = afs_dir_open, @@ -74,10 +68,7 @@ const struct inode_operations afs_dir_inode_operations = { }; const struct address_space_operations afs_dir_aops = { - .dirty_folio = afs_dir_dirty_folio, - .release_folio = afs_dir_release_folio, - .invalidate_folio = afs_dir_invalidate_folio, - .migrate_folio = filemap_migrate_folio, + .writepages = afs_single_writepages, }; const struct dentry_operations afs_fs_dentry_operations = { @@ -98,152 +89,124 @@ struct afs_lookup_one_cookie { struct afs_lookup_cookie { struct dir_context ctx; struct qstr name; - bool found; - bool one_only; unsigned short nr_fids; struct afs_fid fids[50]; }; +static void afs_dir_unuse_cookie(struct afs_vnode *dvnode, int ret) +{ + if (ret == 0) { + struct afs_vnode_cache_aux aux; + loff_t i_size = i_size_read(&dvnode->netfs.inode); + + afs_set_cache_aux(dvnode, &aux); + fscache_unuse_cookie(afs_vnode_cache(dvnode), &aux, &i_size); + } else { + fscache_unuse_cookie(afs_vnode_cache(dvnode), NULL, NULL); + } +} + /* - * Drop the refs that we're holding on the folios we were reading into. We've - * got refs on the first nr_pages pages. + * Iterate through a kmapped directory segment, dumping a summary of + * the contents. */ -static void afs_dir_read_cleanup(struct afs_read *req) +static size_t afs_dir_dump_step(void *iter_base, size_t progress, size_t len, + void *priv, void *priv2) { - struct address_space *mapping = req->vnode->netfs.inode.i_mapping; - struct folio *folio; - pgoff_t last = req->nr_pages - 1; + do { + union afs_xdr_dir_block *block = iter_base; - XA_STATE(xas, &mapping->i_pages, 0); + pr_warn("[%05zx] %32phN\n", progress, block); + iter_base += AFS_DIR_BLOCK_SIZE; + progress += AFS_DIR_BLOCK_SIZE; + len -= AFS_DIR_BLOCK_SIZE; + } while (len > 0); - if (unlikely(!req->nr_pages)) - return; + return len; +} - rcu_read_lock(); - xas_for_each(&xas, folio, last) { - if (xas_retry(&xas, folio)) - continue; - BUG_ON(xa_is_value(folio)); - ASSERTCMP(folio_file_mapping(folio), ==, mapping); +/* + * Dump the contents of a directory. + */ +static void afs_dir_dump(struct afs_vnode *dvnode) +{ + struct iov_iter iter; + unsigned long long i_size = i_size_read(&dvnode->netfs.inode); - folio_put(folio); - } + pr_warn("DIR %llx:%llx is=%llx\n", + dvnode->fid.vid, dvnode->fid.vnode, i_size); - rcu_read_unlock(); + iov_iter_folio_queue(&iter, ITER_SOURCE, dvnode->directory, 0, 0, i_size); + iterate_folioq(&iter, iov_iter_count(&iter), NULL, NULL, + afs_dir_dump_step); } /* * check that a directory folio is valid */ -static bool afs_dir_check_folio(struct afs_vnode *dvnode, struct folio *folio, - loff_t i_size) +static bool afs_dir_check_block(struct afs_vnode *dvnode, size_t progress, + union afs_xdr_dir_block *block) { - union afs_xdr_dir_block *block; - size_t offset, size; - loff_t pos; + if (block->hdr.magic != AFS_DIR_MAGIC) { + pr_warn("%s(%lx): [%zx] bad magic %04x\n", + __func__, dvnode->netfs.inode.i_ino, + progress, ntohs(block->hdr.magic)); + trace_afs_dir_check_failed(dvnode, progress); + trace_afs_file_error(dvnode, -EIO, afs_file_error_dir_bad_magic); + return false; + } - /* Determine how many magic numbers there should be in this folio, but - * we must take care because the directory may change size under us. + /* Make sure each block is NUL terminated so we can reasonably + * use string functions on it. The filenames in the folio + * *should* be NUL-terminated anyway. */ - pos = folio_pos(folio); - if (i_size <= pos) - goto checked; - - size = min_t(loff_t, folio_size(folio), i_size - pos); - for (offset = 0; offset < size; offset += sizeof(*block)) { - block = kmap_local_folio(folio, offset); - if (block->hdr.magic != AFS_DIR_MAGIC) { - printk("kAFS: %s(%lx): [%llx] bad magic %zx/%zx is %04hx\n", - __func__, dvnode->netfs.inode.i_ino, - pos, offset, size, ntohs(block->hdr.magic)); - trace_afs_dir_check_failed(dvnode, pos + offset, i_size); - kunmap_local(block); - trace_afs_file_error(dvnode, -EIO, afs_file_error_dir_bad_magic); - goto error; - } - - /* Make sure each block is NUL terminated so we can reasonably - * use string functions on it. The filenames in the folio - * *should* be NUL-terminated anyway. - */ - ((u8 *)block)[AFS_DIR_BLOCK_SIZE - 1] = 0; - - kunmap_local(block); - } -checked: + ((u8 *)block)[AFS_DIR_BLOCK_SIZE - 1] = 0; afs_stat_v(dvnode, n_read_dir); return true; - -error: - return false; } /* - * Dump the contents of a directory. + * Iterate through a kmapped directory segment, checking the content. */ -static void afs_dir_dump(struct afs_vnode *dvnode, struct afs_read *req) +static size_t afs_dir_check_step(void *iter_base, size_t progress, size_t len, + void *priv, void *priv2) { - union afs_xdr_dir_block *block; - struct address_space *mapping = dvnode->netfs.inode.i_mapping; - struct folio *folio; - pgoff_t last = req->nr_pages - 1; - size_t offset, size; - - XA_STATE(xas, &mapping->i_pages, 0); - - pr_warn("DIR %llx:%llx f=%llx l=%llx al=%llx\n", - dvnode->fid.vid, dvnode->fid.vnode, - req->file_size, req->len, req->actual_len); - pr_warn("DIR %llx %x %zx %zx\n", - req->pos, req->nr_pages, - req->iter->iov_offset, iov_iter_count(req->iter)); - - xas_for_each(&xas, folio, last) { - if (xas_retry(&xas, folio)) - continue; + struct afs_vnode *dvnode = priv; - BUG_ON(folio_file_mapping(folio) != mapping); + if (WARN_ON_ONCE(progress % AFS_DIR_BLOCK_SIZE || + len % AFS_DIR_BLOCK_SIZE)) + return len; - size = min_t(loff_t, folio_size(folio), req->actual_len - folio_pos(folio)); - for (offset = 0; offset < size; offset += sizeof(*block)) { - block = kmap_local_folio(folio, offset); - pr_warn("[%02lx] %32phN\n", folio_index(folio) + offset, block); - kunmap_local(block); - } - } + do { + if (!afs_dir_check_block(dvnode, progress, iter_base)) + break; + iter_base += AFS_DIR_BLOCK_SIZE; + len -= AFS_DIR_BLOCK_SIZE; + } while (len > 0); + + return len; } /* - * Check all the blocks in a directory. All the folios are held pinned. + * Check all the blocks in a directory. */ -static int afs_dir_check(struct afs_vnode *dvnode, struct afs_read *req) +static int afs_dir_check(struct afs_vnode *dvnode) { - struct address_space *mapping = dvnode->netfs.inode.i_mapping; - struct folio *folio; - pgoff_t last = req->nr_pages - 1; - int ret = 0; + struct iov_iter iter; + unsigned long long i_size = i_size_read(&dvnode->netfs.inode); + size_t checked = 0; - XA_STATE(xas, &mapping->i_pages, 0); - - if (unlikely(!req->nr_pages)) + if (unlikely(!i_size)) return 0; - rcu_read_lock(); - xas_for_each(&xas, folio, last) { - if (xas_retry(&xas, folio)) - continue; - - BUG_ON(folio_file_mapping(folio) != mapping); - - if (!afs_dir_check_folio(dvnode, folio, req->actual_len)) { - afs_dir_dump(dvnode, req); - ret = -EIO; - break; - } + iov_iter_folio_queue(&iter, ITER_SOURCE, dvnode->directory, 0, 0, i_size); + checked = iterate_folioq(&iter, iov_iter_count(&iter), dvnode, NULL, + afs_dir_check_step); + if (checked != i_size) { + afs_dir_dump(dvnode); + return -EIO; } - - rcu_read_unlock(); - return ret; + return 0; } /* @@ -263,130 +226,140 @@ static int afs_dir_open(struct inode *inode, struct file *file) } /* - * Read the directory into the pagecache in one go, scrubbing the previous - * contents. The list of folios is returned, pinning them so that they don't - * get reclaimed during the iteration. + * Read a file in a single download. */ -static struct afs_read *afs_read_dir(struct afs_vnode *dvnode, struct key *key) - __acquires(&dvnode->validate_lock) +static ssize_t afs_do_read_single(struct afs_vnode *dvnode, struct file *file) { - struct address_space *mapping = dvnode->netfs.inode.i_mapping; - struct afs_read *req; + struct iov_iter iter; + ssize_t ret; loff_t i_size; - int nr_pages, i; - int ret; - - _enter(""); - - req = kzalloc(sizeof(*req), GFP_KERNEL); - if (!req) - return ERR_PTR(-ENOMEM); + bool is_dir = (S_ISDIR(dvnode->netfs.inode.i_mode) && + !test_bit(AFS_VNODE_MOUNTPOINT, &dvnode->flags)); - refcount_set(&req->usage, 1); - req->vnode = dvnode; - req->key = key_get(key); - req->cleanup = afs_dir_read_cleanup; - -expand: i_size = i_size_read(&dvnode->netfs.inode); - if (i_size < 2048) { - ret = afs_bad(dvnode, afs_file_error_dir_small); - goto error; - } - if (i_size > 2048 * 1024) { - trace_afs_file_error(dvnode, -EFBIG, afs_file_error_dir_big); - ret = -EFBIG; - goto error; + if (is_dir) { + if (i_size < AFS_DIR_BLOCK_SIZE) + return afs_bad(dvnode, afs_file_error_dir_small); + if (i_size > AFS_DIR_BLOCK_SIZE * 1024) { + trace_afs_file_error(dvnode, -EFBIG, afs_file_error_dir_big); + return -EFBIG; + } + } else { + if (i_size > AFSPATHMAX) { + trace_afs_file_error(dvnode, -EFBIG, afs_file_error_dir_big); + return -EFBIG; + } } - _enter("%llu", i_size); + /* Expand the storage. TODO: Shrink the storage too. */ + if (dvnode->directory_size < i_size) { + size_t cur_size = dvnode->directory_size; - nr_pages = (i_size + PAGE_SIZE - 1) / PAGE_SIZE; + ret = netfs_alloc_folioq_buffer(NULL, + &dvnode->directory, &cur_size, i_size, + mapping_gfp_mask(dvnode->netfs.inode.i_mapping)); + dvnode->directory_size = cur_size; + if (ret < 0) + return ret; + } - req->actual_len = i_size; /* May change */ - req->len = nr_pages * PAGE_SIZE; /* We can ask for more than there is */ - req->data_version = dvnode->status.data_version; /* May change */ - iov_iter_xarray(&req->def_iter, ITER_DEST, &dvnode->netfs.inode.i_mapping->i_pages, - 0, i_size); - req->iter = &req->def_iter; + iov_iter_folio_queue(&iter, ITER_DEST, dvnode->directory, 0, 0, dvnode->directory_size); - /* Fill in any gaps that we might find where the memory reclaimer has - * been at work and pin all the folios. If there are any gaps, we will - * need to reread the entire directory contents. + /* AFS requires us to perform the read of a directory synchronously as + * a single unit to avoid issues with the directory contents being + * changed between reads. */ - i = req->nr_pages; - while (i < nr_pages) { - struct folio *folio; - - folio = filemap_get_folio(mapping, i); - if (!folio) { - if (test_and_clear_bit(AFS_VNODE_DIR_VALID, &dvnode->flags)) - afs_stat_v(dvnode, n_inval); - - ret = -ENOMEM; - folio = __filemap_get_folio(mapping, - i, FGP_LOCK | FGP_CREAT, - mapping->gfp_mask); - if (!folio) - goto error; - folio_attach_private(folio, (void *)1); - folio_unlock(folio); + ret = netfs_read_single(&dvnode->netfs.inode, file, &iter); + if (ret >= 0) { + i_size = i_size_read(&dvnode->netfs.inode); + if (i_size > ret) { + /* The content has grown, so we need to expand the + * buffer. + */ + ret = -ESTALE; + } else if (is_dir) { + int ret2 = afs_dir_check(dvnode); + + if (ret2 < 0) + ret = ret2; + } else if (i_size < folioq_folio_size(dvnode->directory, 0)) { + /* NUL-terminate a symlink. */ + char *symlink = kmap_local_folio(folioq_folio(dvnode->directory, 0), 0); + + symlink[i_size] = 0; + kunmap_local(symlink); } - - req->nr_pages += folio_nr_pages(folio); - i += folio_nr_pages(folio); } - /* If we're going to reload, we need to lock all the pages to prevent - * races. - */ + return ret; +} + +ssize_t afs_read_single(struct afs_vnode *dvnode, struct file *file) +{ + ssize_t ret; + + fscache_use_cookie(afs_vnode_cache(dvnode), false); + ret = afs_do_read_single(dvnode, file); + fscache_unuse_cookie(afs_vnode_cache(dvnode), NULL, NULL); + return ret; +} + +/* + * Read the directory into a folio_queue buffer in one go, scrubbing the + * previous contents. We return -ESTALE if the caller needs to call us again. + */ +ssize_t afs_read_dir(struct afs_vnode *dvnode, struct file *file) + __acquires(&dvnode->validate_lock) +{ + ssize_t ret; + loff_t i_size; + + i_size = i_size_read(&dvnode->netfs.inode); + ret = -ERESTARTSYS; if (down_read_killable(&dvnode->validate_lock) < 0) goto error; - if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags)) - goto success; + /* We only need to reread the data if it became invalid - or if we + * haven't read it yet. + */ + if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags) && + test_bit(AFS_VNODE_DIR_READ, &dvnode->flags)) { + ret = i_size; + goto valid; + } up_read(&dvnode->validate_lock); if (down_write_killable(&dvnode->validate_lock) < 0) goto error; - if (!test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags)) { - trace_afs_reload_dir(dvnode); - ret = afs_fetch_data(dvnode, req); - if (ret < 0) - goto error_unlock; + if (!test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags)) + afs_invalidate_cache(dvnode, 0); - task_io_account_read(PAGE_SIZE * req->nr_pages); - - if (req->len < req->file_size) { - /* The content has grown, so we need to expand the - * buffer. - */ - up_write(&dvnode->validate_lock); - goto expand; - } - - /* Validate the data we just read. */ - ret = afs_dir_check(dvnode, req); + if (!test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags) || + !test_bit(AFS_VNODE_DIR_READ, &dvnode->flags)) { + trace_afs_reload_dir(dvnode); + ret = afs_read_single(dvnode, file); if (ret < 0) goto error_unlock; // TODO: Trim excess pages set_bit(AFS_VNODE_DIR_VALID, &dvnode->flags); + set_bit(AFS_VNODE_DIR_READ, &dvnode->flags); + } else { + ret = i_size; } downgrade_write(&dvnode->validate_lock); -success: - return req; +valid: + return ret; error_unlock: up_write(&dvnode->validate_lock); error: - afs_put_read(req); - _leave(" = %d", ret); - return ERR_PTR(ret); + _leave(" = %zd", ret); + return ret; } /* @@ -394,79 +367,69 @@ error: */ static int afs_dir_iterate_block(struct afs_vnode *dvnode, struct dir_context *ctx, - union afs_xdr_dir_block *block, - unsigned blkoff) + union afs_xdr_dir_block *block) { union afs_xdr_dirent *dire; - unsigned offset, next, curr, nr_slots; + unsigned int blknum, base, hdr, pos, next, nr_slots; size_t nlen; int tmp; - _enter("%llx,%x", ctx->pos, blkoff); + blknum = ctx->pos / AFS_DIR_BLOCK_SIZE; + base = blknum * AFS_DIR_SLOTS_PER_BLOCK; + hdr = (blknum == 0 ? AFS_DIR_RESV_BLOCKS0 : AFS_DIR_RESV_BLOCKS); + pos = DIV_ROUND_UP(ctx->pos, AFS_DIR_DIRENT_SIZE) - base; - curr = (ctx->pos - blkoff) / sizeof(union afs_xdr_dirent); + _enter("%llx,%x", ctx->pos, blknum); /* walk through the block, an entry at a time */ - for (offset = (blkoff == 0 ? AFS_DIR_RESV_BLOCKS0 : AFS_DIR_RESV_BLOCKS); - offset < AFS_DIR_SLOTS_PER_BLOCK; - offset = next - ) { + for (unsigned int slot = hdr; slot < AFS_DIR_SLOTS_PER_BLOCK; slot = next) { /* skip entries marked unused in the bitmap */ - if (!(block->hdr.bitmap[offset / 8] & - (1 << (offset % 8)))) { - _debug("ENT[%zu.%u]: unused", - blkoff / sizeof(union afs_xdr_dir_block), offset); - next = offset + 1; - if (offset >= curr) - ctx->pos = blkoff + - next * sizeof(union afs_xdr_dirent); + if (!(block->hdr.bitmap[slot / 8] & + (1 << (slot % 8)))) { + _debug("ENT[%x]: Unused", base + slot); + next = slot + 1; + if (next >= pos) + ctx->pos = (base + next) * sizeof(union afs_xdr_dirent); continue; } /* got a valid entry */ - dire = &block->dirents[offset]; + dire = &block->dirents[slot]; nlen = strnlen(dire->u.name, - sizeof(*block) - - offset * sizeof(union afs_xdr_dirent)); + (unsigned long)(block + 1) - (unsigned long)dire->u.name - 1); if (nlen > AFSNAMEMAX - 1) { - _debug("ENT[%zu]: name too long (len %u/%zu)", - blkoff / sizeof(union afs_xdr_dir_block), - offset, nlen); + _debug("ENT[%x]: Name too long (len %zx)", + base + slot, nlen); return afs_bad(dvnode, afs_file_error_dir_name_too_long); } - _debug("ENT[%zu.%u]: %s %zu \"%s\"", - blkoff / sizeof(union afs_xdr_dir_block), offset, - (offset < curr ? "skip" : "fill"), + _debug("ENT[%x]: %s %zx \"%s\"", + base + slot, (slot < pos ? "skip" : "fill"), nlen, dire->u.name); nr_slots = afs_dir_calc_slots(nlen); - next = offset + nr_slots; + next = slot + nr_slots; if (next > AFS_DIR_SLOTS_PER_BLOCK) { - _debug("ENT[%zu.%u]:" - " %u extends beyond end dir block" - " (len %zu)", - blkoff / sizeof(union afs_xdr_dir_block), - offset, next, nlen); + _debug("ENT[%x]: extends beyond end dir block (len %zx)", + base + slot, nlen); return afs_bad(dvnode, afs_file_error_dir_over_end); } /* Check that the name-extension dirents are all allocated */ for (tmp = 1; tmp < nr_slots; tmp++) { - unsigned int ix = offset + tmp; - if (!(block->hdr.bitmap[ix / 8] & (1 << (ix % 8)))) { - _debug("ENT[%zu.u]:" - " %u unmarked extension (%u/%u)", - blkoff / sizeof(union afs_xdr_dir_block), - offset, tmp, nr_slots); + unsigned int xslot = slot + tmp; + + if (!(block->hdr.bitmap[xslot / 8] & (1 << (xslot % 8)))) { + _debug("ENT[%x]: Unmarked extension (%x/%x)", + base + slot, tmp, nr_slots); return afs_bad(dvnode, afs_file_error_dir_unmarked_ext); } } /* skip if starts before the current position */ - if (offset < curr) { - if (next > curr) - ctx->pos = blkoff + next * sizeof(union afs_xdr_dirent); + if (slot < pos) { + if (next > pos) + ctx->pos = (base + next) * sizeof(union afs_xdr_dirent); continue; } @@ -480,75 +443,110 @@ static int afs_dir_iterate_block(struct afs_vnode *dvnode, return 0; } - ctx->pos = blkoff + next * sizeof(union afs_xdr_dirent); + ctx->pos = (base + next) * sizeof(union afs_xdr_dirent); } _leave(" = 1 [more]"); return 1; } +struct afs_dir_iteration_ctx { + struct dir_context *dir_ctx; + int error; +}; + /* - * iterate through the data blob that lists the contents of an AFS directory + * Iterate through a kmapped directory segment. */ -static int afs_dir_iterate(struct inode *dir, struct dir_context *ctx, - struct key *key, afs_dataversion_t *_dir_version) +static size_t afs_dir_iterate_step(void *iter_base, size_t progress, size_t len, + void *priv, void *priv2) { - struct afs_vnode *dvnode = AFS_FS_I(dir); - union afs_xdr_dir_block *dblock; - struct afs_read *req; - struct folio *folio; - unsigned offset, size; + struct afs_dir_iteration_ctx *ctx = priv2; + struct afs_vnode *dvnode = priv; int ret; - _enter("{%lu},%u,,", dir->i_ino, (unsigned)ctx->pos); - - if (test_bit(AFS_VNODE_DELETED, &AFS_FS_I(dir)->flags)) { - _leave(" = -ESTALE"); - return -ESTALE; + if (WARN_ON_ONCE(progress % AFS_DIR_BLOCK_SIZE || + len % AFS_DIR_BLOCK_SIZE)) { + pr_err("Mis-iteration prog=%zx len=%zx\n", + progress % AFS_DIR_BLOCK_SIZE, + len % AFS_DIR_BLOCK_SIZE); + return len; } - req = afs_read_dir(dvnode, key); - if (IS_ERR(req)) - return PTR_ERR(req); - *_dir_version = req->data_version; + do { + ret = afs_dir_iterate_block(dvnode, ctx->dir_ctx, iter_base); + if (ret != 1) + break; - /* round the file position up to the next entry boundary */ - ctx->pos += sizeof(union afs_xdr_dirent) - 1; - ctx->pos &= ~(sizeof(union afs_xdr_dirent) - 1); + ctx->dir_ctx->pos = round_up(ctx->dir_ctx->pos, AFS_DIR_BLOCK_SIZE); + iter_base += AFS_DIR_BLOCK_SIZE; + len -= AFS_DIR_BLOCK_SIZE; + } while (len > 0); - /* walk through the blocks in sequence */ - ret = 0; - while (ctx->pos < req->actual_len) { - /* Fetch the appropriate folio from the directory and re-add it - * to the LRU. We have all the pages pinned with an extra ref. - */ - folio = __filemap_get_folio(dir->i_mapping, ctx->pos / PAGE_SIZE, - FGP_ACCESSED, 0); - if (!folio) { - ret = afs_bad(dvnode, afs_file_error_dir_missing_page); - break; - } + return len; +} - offset = round_down(ctx->pos, sizeof(*dblock)) - folio_file_pos(folio); - size = min_t(loff_t, folio_size(folio), - req->actual_len - folio_file_pos(folio)); +/* + * Iterate through the directory folios. + */ +static int afs_dir_iterate_contents(struct inode *dir, struct dir_context *dir_ctx) +{ + struct afs_dir_iteration_ctx ctx = { .dir_ctx = dir_ctx }; + struct afs_vnode *dvnode = AFS_FS_I(dir); + struct iov_iter iter; + unsigned long long i_size = i_size_read(dir); - do { - dblock = kmap_local_folio(folio, offset); - ret = afs_dir_iterate_block(dvnode, ctx, dblock, - folio_file_pos(folio) + offset); - kunmap_local(dblock); - if (ret != 1) - goto out; + /* Round the file position up to the next entry boundary */ + dir_ctx->pos = round_up(dir_ctx->pos, sizeof(union afs_xdr_dirent)); - } while (offset += sizeof(*dblock), offset < size); + if (i_size <= 0 || dir_ctx->pos >= i_size) + return 0; - ret = 0; - } + iov_iter_folio_queue(&iter, ITER_SOURCE, dvnode->directory, 0, 0, i_size); + iov_iter_advance(&iter, round_down(dir_ctx->pos, AFS_DIR_BLOCK_SIZE)); + + iterate_folioq(&iter, iov_iter_count(&iter), dvnode, &ctx, + afs_dir_iterate_step); + + if (ctx.error == -ESTALE) + afs_invalidate_dir(dvnode, afs_dir_invalid_iter_stale); + return ctx.error; +} + +/* + * iterate through the data blob that lists the contents of an AFS directory + */ +static int afs_dir_iterate(struct inode *dir, struct dir_context *ctx, + struct file *file, afs_dataversion_t *_dir_version) +{ + struct afs_vnode *dvnode = AFS_FS_I(dir); + int retry_limit = 100; + int ret; + + _enter("{%lu},%llx,,", dir->i_ino, ctx->pos); + + do { + if (--retry_limit < 0) { + pr_warn("afs_read_dir(): Too many retries\n"); + ret = -ESTALE; + break; + } + ret = afs_read_dir(dvnode, file); + if (ret < 0) { + if (ret != -ESTALE) + break; + if (test_bit(AFS_VNODE_DELETED, &AFS_FS_I(dir)->flags)) { + ret = -ESTALE; + break; + } + continue; + } + *_dir_version = inode_peek_iversion_raw(dir); + + ret = afs_dir_iterate_contents(dir, ctx); + up_read(&dvnode->validate_lock); + } while (ret == -ESTALE); -out: - up_read(&dvnode->validate_lock); - afs_put_read(req); _leave(" = %d", ret); return ret; } @@ -560,8 +558,7 @@ static int afs_readdir(struct file *file, struct dir_context *ctx) { afs_dataversion_t dir_version; - return afs_dir_iterate(file_inode(file), ctx, afs_file_key(file), - &dir_version); + return afs_dir_iterate(file_inode(file), ctx, file, &dir_version); } /* @@ -601,22 +598,22 @@ static bool afs_lookup_one_filldir(struct dir_context *ctx, const char *name, * Do a lookup of a single name in a directory * - just returns the FID the dentry name maps to if found */ -static int afs_do_lookup_one(struct inode *dir, struct dentry *dentry, - struct afs_fid *fid, struct key *key, +static int afs_do_lookup_one(struct inode *dir, const struct qstr *name, + struct afs_fid *fid, afs_dataversion_t *_dir_version) { struct afs_super_info *as = dir->i_sb->s_fs_info; struct afs_lookup_one_cookie cookie = { .ctx.actor = afs_lookup_one_filldir, - .name = dentry->d_name, + .name = *name, .fid.vid = as->volume->vid }; int ret; - _enter("{%lu},%p{%pd},", dir->i_ino, dentry, dentry); + _enter("{%lu},{%.*s},", dir->i_ino, name->len, name->name); /* search the directory */ - ret = afs_dir_iterate(dir, &cookie.ctx, key, _dir_version); + ret = afs_dir_iterate(dir, &cookie.ctx, NULL, _dir_version); if (ret < 0) { _leave(" = %d [iter]", ret); return ret; @@ -651,19 +648,10 @@ static bool afs_lookup_filldir(struct dir_context *ctx, const char *name, BUILD_BUG_ON(sizeof(union afs_xdr_dir_block) != 2048); BUILD_BUG_ON(sizeof(union afs_xdr_dirent) != 32); - if (cookie->found) { - if (cookie->nr_fids < 50) { - cookie->fids[cookie->nr_fids].vnode = ino; - cookie->fids[cookie->nr_fids].unique = dtype; - cookie->nr_fids++; - } - } else if (cookie->name.len == nlen && - memcmp(cookie->name.name, name, nlen) == 0) { - cookie->fids[1].vnode = ino; - cookie->fids[1].unique = dtype; - cookie->found = 1; - if (cookie->one_only) - return false; + if (cookie->nr_fids < 50) { + cookie->fids[cookie->nr_fids].vnode = ino; + cookie->fids[cookie->nr_fids].unique = dtype; + cookie->nr_fids++; } return cookie->nr_fids < 50; @@ -689,8 +677,9 @@ static void afs_do_lookup_success(struct afs_operation *op) vp = &op->file[0]; abort_code = vp->scb.status.abort_code; if (abort_code != 0) { - op->ac.abort_code = abort_code; - op->error = afs_abort_to_error(abort_code); + op->call_abort_code = abort_code; + afs_op_set_error(op, afs_abort_to_error(abort_code)); + op->cumul_error.abort_code = abort_code; } break; @@ -703,6 +692,8 @@ static void afs_do_lookup_success(struct afs_operation *op) break; } + if (vp->scb.status.abort_code) + trace_afs_bulkstat_error(op, &vp->fid, i, vp->scb.status.abort_code); if (!vp->scb.have_status && !vp->scb.have_error) continue; @@ -780,8 +771,7 @@ static bool afs_server_supports_ibulk(struct afs_vnode *dvnode) * files in one go and create inodes for them. The inode of the file we were * asked for is returned. */ -static struct inode *afs_do_lookup(struct inode *dir, struct dentry *dentry, - struct key *key) +static struct inode *afs_do_lookup(struct inode *dir, struct dentry *dentry) { struct afs_lookup_cookie *cookie; struct afs_vnode_param *vp; @@ -789,6 +779,7 @@ static struct inode *afs_do_lookup(struct inode *dir, struct dentry *dentry, struct afs_vnode *dvnode = AFS_FS_I(dir), *vnode; struct inode *inode = NULL, *ti; afs_dataversion_t data_version = READ_ONCE(dvnode->status.data_version); + bool supports_ibulk, isnew; long ret; int i; @@ -802,22 +793,22 @@ static struct inode *afs_do_lookup(struct inode *dir, struct dentry *dentry, cookie->fids[i].vid = dvnode->fid.vid; cookie->ctx.actor = afs_lookup_filldir; cookie->name = dentry->d_name; - cookie->nr_fids = 2; /* slot 0 is saved for the fid we actually want - * and slot 1 for the directory */ + cookie->nr_fids = 2; /* slot 1 is saved for the fid we actually want + * and slot 0 for the directory */ - if (!afs_server_supports_ibulk(dvnode)) - cookie->one_only = true; - - /* search the directory */ - ret = afs_dir_iterate(dir, &cookie->ctx, key, &data_version); + /* Search the directory for the named entry using the hash table... */ + ret = afs_dir_search(dvnode, &dentry->d_name, &cookie->fids[1], &data_version); if (ret < 0) goto out; - dentry->d_fsdata = (void *)(unsigned long)data_version; + supports_ibulk = afs_server_supports_ibulk(dvnode); + if (supports_ibulk) { + /* ...then scan linearly from that point for entries to lookup-ahead. */ + cookie->ctx.pos = (ret + 1) * AFS_DIR_DIRENT_SIZE; + afs_dir_iterate(dir, &cookie->ctx, NULL, &data_version); + } - ret = -ENOENT; - if (!cookie->found) - goto out; + dentry->d_fsdata = (void *)(unsigned long)data_version; /* Check to see if we already have an inode for the primary fid. */ inode = ilookup5(dir->i_sb, cookie->fids[1].vnode, @@ -842,13 +833,14 @@ static struct inode *afs_do_lookup(struct inode *dir, struct dentry *dentry, _debug("nr_files %u", op->nr_files); /* Need space for examining all the selected files */ - op->error = -ENOMEM; if (op->nr_files > 2) { op->more_files = kvcalloc(op->nr_files - 2, sizeof(struct afs_vnode_param), GFP_KERNEL); - if (!op->more_files) + if (!op->more_files) { + afs_op_nomem(op); goto out_op; + } for (i = 2; i < op->nr_files; i++) { vp = &op->more_files[i - 2]; @@ -858,7 +850,7 @@ static struct inode *afs_do_lookup(struct inode *dir, struct dentry *dentry, * callback counters. */ ti = ilookup5_nowait(dir->i_sb, vp->fid.vnode, - afs_ilookup5_test_by_fid, &vp->fid); + afs_ilookup5_test_by_fid, &vp->fid, &isnew); if (!IS_ERR_OR_NULL(ti)) { vnode = AFS_FS_I(ti); vp->dv_before = vnode->status.data_version; @@ -874,14 +866,14 @@ static struct inode *afs_do_lookup(struct inode *dir, struct dentry *dentry, * lookups contained therein are stored in the reply without aborting * the whole operation. */ - op->error = -ENOTSUPP; - if (!cookie->one_only) { + afs_op_set_error(op, -ENOTSUPP); + if (supports_ibulk) { op->ops = &afs_inline_bulk_status_operation; afs_begin_vnode_operation(op); afs_wait_for_operation(op); } - if (op->error == -ENOTSUPP) { + if (afs_op_error(op) == -ENOTSUPP) { /* We could try FS.BulkStatus next, but this aborts the entire * op if any of the lookups fails - so, for the moment, revert * to FS.FetchStatus for op->file[1]. @@ -891,12 +883,16 @@ static struct inode *afs_do_lookup(struct inode *dir, struct dentry *dentry, afs_begin_vnode_operation(op); afs_wait_for_operation(op); } - inode = ERR_PTR(op->error); out_op: - if (op->error == 0) { - inode = &op->file[1].vnode->netfs.inode; - op->file[1].vnode = NULL; + if (!afs_op_error(op)) { + if (op->file[1].scb.status.abort_code) { + afs_op_accumulate_error(op, -ECONNABORTED, + op->file[1].scb.status.abort_code); + } else { + inode = &op->file[1].vnode->netfs.inode; + op->file[1].vnode = NULL; + } } if (op->file[0].scb.have_status) @@ -913,8 +909,7 @@ out: /* * Look up an entry in a directory with @sys substitution. */ -static struct dentry *afs_lookup_atsys(struct inode *dir, struct dentry *dentry, - struct key *key) +static struct dentry *afs_lookup_atsys(struct inode *dir, struct dentry *dentry) { struct afs_sysnames *subs; struct afs_net *net = afs_i2net(dir); @@ -948,7 +943,7 @@ static struct dentry *afs_lookup_atsys(struct inode *dir, struct dentry *dentry, } strcpy(p, name); - ret = lookup_one_len(buf, dentry->d_parent, len); + ret = lookup_noperm(&QSTR(buf), dentry->d_parent); if (IS_ERR(ret) || d_is_positive(ret)) goto out_s; dput(ret); @@ -962,7 +957,6 @@ out_s: afs_put_sysnames(subs); kfree(buf); out_p: - key_put(key); return ret; } @@ -976,7 +970,6 @@ static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry, struct afs_fid fid = {}; struct inode *inode; struct dentry *d; - struct key *key; int ret; _enter("{%llx:%llu},%p{%pd},", @@ -994,15 +987,9 @@ static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry, return ERR_PTR(-ESTALE); } - key = afs_request_key(dvnode->volume->cell); - if (IS_ERR(key)) { - _leave(" = %ld [key]", PTR_ERR(key)); - return ERR_CAST(key); - } - - ret = afs_validate(dvnode, key); + ret = afs_validate(dvnode, NULL); if (ret < 0) { - key_put(key); + afs_dir_unuse_cookie(dvnode, ret); _leave(" = %d [val]", ret); return ERR_PTR(ret); } @@ -1012,15 +999,13 @@ static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry, dentry->d_name.name[dentry->d_name.len - 3] == 's' && dentry->d_name.name[dentry->d_name.len - 2] == 'y' && dentry->d_name.name[dentry->d_name.len - 1] == 's') - return afs_lookup_atsys(dir, dentry, key); + return afs_lookup_atsys(dir, dentry); afs_stat_v(dvnode, n_lookup); - inode = afs_do_lookup(dir, dentry, key); - key_put(key); + inode = afs_do_lookup(dir, dentry); if (inode == ERR_PTR(-ENOENT)) - inode = afs_try_auto_mntpt(dentry, dir); - - if (!IS_ERR_OR_NULL(inode)) + inode = NULL; + else if (!IS_ERR_OR_NULL(inode)) fid = AFS_FS_I(inode)->fid; _debug("splice %p", dentry->d_inode); @@ -1038,21 +1023,12 @@ static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry, /* * Check the validity of a dentry under RCU conditions. */ -static int afs_d_revalidate_rcu(struct dentry *dentry) +static int afs_d_revalidate_rcu(struct afs_vnode *dvnode, struct dentry *dentry) { - struct afs_vnode *dvnode; - struct dentry *parent; - struct inode *dir; long dir_version, de_version; _enter("%p", dentry); - /* Check the parent directory is still valid first. */ - parent = READ_ONCE(dentry->d_parent); - dir = d_inode_rcu(parent); - if (!dir) - return -ECHILD; - dvnode = AFS_FS_I(dir); if (test_bit(AFS_VNODE_DELETED, &dvnode->flags)) return -ECHILD; @@ -1080,11 +1056,11 @@ static int afs_d_revalidate_rcu(struct dentry *dentry) * - NOTE! the hit can be a negative hit too, so we can't assume we have an * inode */ -static int afs_d_revalidate(struct dentry *dentry, unsigned int flags) +static int afs_d_revalidate(struct inode *parent_dir, const struct qstr *name, + struct dentry *dentry, unsigned int flags) { - struct afs_vnode *vnode, *dir; + struct afs_vnode *vnode, *dir = AFS_FS_I(parent_dir); struct afs_fid fid; - struct dentry *parent; struct inode *inode; struct key *key; afs_dataversion_t dir_version, invalid_before; @@ -1092,7 +1068,7 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags) int ret; if (flags & LOOKUP_RCU) - return afs_d_revalidate_rcu(dentry); + return afs_d_revalidate_rcu(dir, dentry); if (d_really_is_positive(dentry)) { vnode = AFS_FS_I(d_inode(dentry)); @@ -1107,12 +1083,12 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags) if (IS_ERR(key)) key = NULL; - /* Hold the parent dentry so we can peer at it */ - parent = dget_parent(dentry); - dir = AFS_FS_I(d_inode(parent)); - /* validate the parent directory */ - afs_validate(dir, key); + ret = afs_validate(dir, key); + if (ret == -ERESTARTSYS) { + key_put(key); + return ret; + } if (test_bit(AFS_VNODE_DELETED, &dir->flags)) { _debug("%pd: parent dir deleted", dentry); @@ -1137,7 +1113,7 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags) afs_stat_v(dir, n_reval); /* search the directory for this vnode */ - ret = afs_do_lookup_one(&dir->netfs.inode, dentry, &fid, key, &dir_version); + ret = afs_do_lookup_one(&dir->netfs.inode, name, &fid, &dir_version); switch (ret) { case 0: /* the filename maps to something */ @@ -1181,22 +1157,19 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags) goto out_valid; default: - _debug("failed to iterate dir %pd: %d", - parent, ret); + _debug("failed to iterate parent %pd2: %d", dentry, ret); goto not_found; } out_valid: dentry->d_fsdata = (void *)(unsigned long)dir_version; out_valid_noupdate: - dput(parent); key_put(key); _leave(" = 1 [valid]"); return 1; not_found: _debug("dropping dentry %pd2", dentry); - dput(parent); key_put(key); _leave(" = 0 [bad]"); @@ -1251,9 +1224,10 @@ void afs_check_for_remote_deletion(struct afs_operation *op) { struct afs_vnode *vnode = op->file[0].vnode; - switch (op->ac.abort_code) { + switch (afs_op_abort_code(op)) { case VNOVNODE: set_bit(AFS_VNODE_DELETED, &vnode->flags); + clear_nlink(&vnode->netfs.inode); afs_break_callback(vnode, afs_cb_break_for_deleted); } } @@ -1263,26 +1237,31 @@ void afs_check_for_remote_deletion(struct afs_operation *op) */ static void afs_vnode_new_inode(struct afs_operation *op) { + struct afs_vnode_param *dvp = &op->file[0]; struct afs_vnode_param *vp = &op->file[1]; struct afs_vnode *vnode; struct inode *inode; _enter(""); - ASSERTCMP(op->error, ==, 0); + ASSERTCMP(afs_op_error(op), ==, 0); inode = afs_iget(op, vp); if (IS_ERR(inode)) { /* ENOMEM or EINTR at a really inconvenient time - just abandon * the new directory on the server. */ - op->error = PTR_ERR(inode); + afs_op_accumulate_error(op, PTR_ERR(inode), 0); return; } vnode = AFS_FS_I(inode); set_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags); - if (!op->error) + if (S_ISDIR(inode->i_mode)) + afs_mkdir_init_dir(vnode, dvp->vnode); + else if (S_ISLNK(inode->i_mode)) + afs_init_new_symlink(vnode, op); + if (!afs_op_error(op)) afs_cache_permit(vnode, op->key, vnode->cb_break, &vp->scb); d_instantiate(op->dentry, inode); } @@ -1298,25 +1277,28 @@ static void afs_create_success(struct afs_operation *op) static void afs_create_edit_dir(struct afs_operation *op) { + struct netfs_cache_resources cres = {}; struct afs_vnode_param *dvp = &op->file[0]; struct afs_vnode_param *vp = &op->file[1]; struct afs_vnode *dvnode = dvp->vnode; _enter("op=%08x", op->debug_id); + fscache_begin_write_operation(&cres, afs_vnode_cache(dvnode)); down_write(&dvnode->validate_lock); if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags) && dvnode->status.data_version == dvp->dv_before + dvp->dv_delta) afs_edit_dir_add(dvnode, &op->dentry->d_name, &vp->fid, op->create.reason); up_write(&dvnode->validate_lock); + fscache_end_operation(&cres); } static void afs_create_put(struct afs_operation *op) { _enter("op=%08x", op->debug_id); - if (op->error) + if (afs_op_error(op)) d_drop(op->dentry); } @@ -1332,11 +1314,12 @@ static const struct afs_operation_ops afs_mkdir_operation = { /* * create a directory on an AFS filesystem */ -static int afs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, - struct dentry *dentry, umode_t mode) +static struct dentry *afs_mkdir(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode) { struct afs_operation *op; struct afs_vnode *dvnode = AFS_FS_I(dir); + int ret; _enter("{%llx:%llu},{%pd},%ho", dvnode->fid.vid, dvnode->fid.vnode, dentry, mode); @@ -1344,9 +1327,11 @@ static int afs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, op = afs_alloc_operation(NULL, dvnode->volume); if (IS_ERR(op)) { d_drop(dentry); - return PTR_ERR(op); + return ERR_CAST(op); } + fscache_use_cookie(afs_vnode_cache(dvnode), true); + afs_op_set_vnode(op, 0, dvnode); op->file[0].dv_delta = 1; op->file[0].modification = true; @@ -1354,8 +1339,11 @@ static int afs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, op->dentry = dentry; op->create.mode = S_IFDIR | mode; op->create.reason = afs_edit_dir_for_mkdir; + op->mtime = current_time(dir); op->ops = &afs_mkdir_operation; - return afs_do_sync_operation(op); + ret = afs_do_sync_operation(op); + afs_dir_unuse_cookie(dvnode, ret); + return ERR_PTR(ret); } /* @@ -1368,8 +1356,8 @@ static void afs_dir_remove_subdir(struct dentry *dentry) clear_nlink(&vnode->netfs.inode); set_bit(AFS_VNODE_DELETED, &vnode->flags); - clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags); - clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags); + afs_clear_cb_promise(vnode, afs_cb_promise_clear_rmdir); + afs_invalidate_dir(vnode, afs_dir_invalid_subdir_removed); } } @@ -1383,18 +1371,21 @@ static void afs_rmdir_success(struct afs_operation *op) static void afs_rmdir_edit_dir(struct afs_operation *op) { + struct netfs_cache_resources cres = {}; struct afs_vnode_param *dvp = &op->file[0]; struct afs_vnode *dvnode = dvp->vnode; _enter("op=%08x", op->debug_id); afs_dir_remove_subdir(op->dentry); + fscache_begin_write_operation(&cres, afs_vnode_cache(dvnode)); down_write(&dvnode->validate_lock); if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags) && dvnode->status.data_version == dvp->dv_before + dvp->dv_delta) afs_edit_dir_remove(dvnode, &op->dentry->d_name, afs_edit_dir_for_rmdir); up_write(&dvnode->validate_lock); + fscache_end_operation(&cres); } static void afs_rmdir_put(struct afs_operation *op) @@ -1429,6 +1420,8 @@ static int afs_rmdir(struct inode *dir, struct dentry *dentry) if (IS_ERR(op)) return PTR_ERR(op); + fscache_use_cookie(afs_vnode_cache(dvnode), true); + afs_op_set_vnode(op, 0, dvnode); op->file[0].dv_delta = 1; op->file[0].modification = true; @@ -1452,10 +1445,18 @@ static int afs_rmdir(struct inode *dir, struct dentry *dentry) op->file[1].vnode = vnode; } - return afs_do_sync_operation(op); + ret = afs_do_sync_operation(op); + + /* Not all systems that can host afs servers have ENOTEMPTY. */ + if (ret == -EEXIST) + ret = -ENOTEMPTY; +out: + afs_dir_unuse_cookie(dvnode, ret); + return ret; error: - return afs_put_operation(op); + ret = afs_put_operation(op); + goto out; } /* @@ -1475,7 +1476,7 @@ static void afs_dir_remove_link(struct afs_operation *op) struct dentry *dentry = op->dentry; int ret; - if (op->error != 0 || + if (afs_op_error(op) || (op->file[1].scb.have_status && op->file[1].scb.have_error)) return; if (d_really_is_positive(dentry)) @@ -1499,10 +1500,10 @@ static void afs_dir_remove_link(struct afs_operation *op) ret = afs_validate(vnode, op->key); if (ret != -ESTALE) - op->error = ret; + afs_op_set_error(op, ret); } - _debug("nlink %d [val %d]", vnode->netfs.inode.i_nlink, op->error); + _debug("nlink %d [val %d]", vnode->netfs.inode.i_nlink, afs_op_error(op)); } static void afs_unlink_success(struct afs_operation *op) @@ -1518,22 +1519,25 @@ static void afs_unlink_success(struct afs_operation *op) static void afs_unlink_edit_dir(struct afs_operation *op) { + struct netfs_cache_resources cres = {}; struct afs_vnode_param *dvp = &op->file[0]; struct afs_vnode *dvnode = dvp->vnode; _enter("op=%08x", op->debug_id); + fscache_begin_write_operation(&cres, afs_vnode_cache(dvnode)); down_write(&dvnode->validate_lock); if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags) && dvnode->status.data_version == dvp->dv_before + dvp->dv_delta) afs_edit_dir_remove(dvnode, &op->dentry->d_name, afs_edit_dir_for_unlink); up_write(&dvnode->validate_lock); + fscache_end_operation(&cres); } static void afs_unlink_put(struct afs_operation *op) { _enter("op=%08x", op->debug_id); - if (op->unlink.need_rehash && op->error < 0 && op->error != -ENOENT) + if (op->unlink.need_rehash && afs_op_error(op) < 0 && afs_op_error(op) != -ENOENT) d_rehash(op->dentry); } @@ -1566,6 +1570,8 @@ static int afs_unlink(struct inode *dir, struct dentry *dentry) if (IS_ERR(op)) return PTR_ERR(op); + fscache_use_cookie(afs_vnode_cache(dvnode), true); + afs_op_set_vnode(op, 0, dvnode); op->file[0].dv_delta = 1; op->file[0].modification = true; @@ -1574,7 +1580,7 @@ static int afs_unlink(struct inode *dir, struct dentry *dentry) /* Try to make sure we have a callback promise on the victim. */ ret = afs_validate(vnode, op->key); if (ret < 0) { - op->error = ret; + afs_op_set_error(op, ret); goto error; } @@ -1583,7 +1589,7 @@ static int afs_unlink(struct inode *dir, struct dentry *dentry) spin_unlock(&dentry->d_lock); /* Start asynchronous writeout of the inode */ write_inode_now(d_inode(dentry), 0); - op->error = afs_sillyrename(dvnode, vnode, dentry, op->key); + afs_op_set_error(op, afs_sillyrename(dvnode, vnode, dentry, op->key)); goto error; } if (!d_unhashed(dentry)) { @@ -1604,7 +1610,7 @@ static int afs_unlink(struct inode *dir, struct dentry *dentry) /* If there was a conflict with a third party, check the status of the * unlinked vnode. */ - if (op->error == 0 && (op->flags & AFS_OPERATION_DIR_CONFLICT)) { + if (afs_op_error(op) == 0 && (op->flags & AFS_OPERATION_DIR_CONFLICT)) { op->file[1].update_ctime = false; op->fetch_status.which = 1; op->ops = &afs_fetch_status_operation; @@ -1612,10 +1618,10 @@ static int afs_unlink(struct inode *dir, struct dentry *dentry) afs_wait_for_operation(op); } - return afs_put_operation(op); - error: - return afs_put_operation(op); + ret = afs_put_operation(op); + afs_dir_unuse_cookie(dvnode, ret); + return ret; } static const struct afs_operation_ops afs_create_operation = { @@ -1630,7 +1636,7 @@ static const struct afs_operation_ops afs_create_operation = { /* * create a regular file on an AFS filesystem */ -static int afs_create(struct user_namespace *mnt_userns, struct inode *dir, +static int afs_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { struct afs_operation *op; @@ -1649,6 +1655,8 @@ static int afs_create(struct user_namespace *mnt_userns, struct inode *dir, goto error; } + fscache_use_cookie(afs_vnode_cache(dvnode), true); + afs_op_set_vnode(op, 0, dvnode); op->file[0].dv_delta = 1; op->file[0].modification = true; @@ -1657,8 +1665,11 @@ static int afs_create(struct user_namespace *mnt_userns, struct inode *dir, op->dentry = dentry; op->create.mode = S_IFREG | mode; op->create.reason = afs_edit_dir_for_create; + op->mtime = current_time(dir); op->ops = &afs_create_operation; - return afs_do_sync_operation(op); + ret = afs_do_sync_operation(op); + afs_dir_unuse_cookie(dvnode, ret); + return ret; error: d_drop(dentry); @@ -1685,7 +1696,7 @@ static void afs_link_success(struct afs_operation *op) static void afs_link_put(struct afs_operation *op) { _enter("op=%08x", op->debug_id); - if (op->error) + if (afs_op_error(op)) d_drop(op->dentry); } @@ -1723,6 +1734,8 @@ static int afs_link(struct dentry *from, struct inode *dir, goto error; } + fscache_use_cookie(afs_vnode_cache(dvnode), true); + ret = afs_validate(vnode, op->key); if (ret < 0) goto error_op; @@ -1738,10 +1751,13 @@ static int afs_link(struct dentry *from, struct inode *dir, op->dentry_2 = from; op->ops = &afs_link_operation; op->create.reason = afs_edit_dir_for_link; - return afs_do_sync_operation(op); + ret = afs_do_sync_operation(op); + afs_dir_unuse_cookie(dvnode, ret); + return ret; error_op: afs_put_operation(op); + afs_dir_unuse_cookie(dvnode, ret); error: d_drop(dentry); _leave(" = %d", ret); @@ -1760,7 +1776,7 @@ static const struct afs_operation_ops afs_symlink_operation = { /* * create a symlink in an AFS filesystem */ -static int afs_symlink(struct user_namespace *mnt_userns, struct inode *dir, +static int afs_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *content) { struct afs_operation *op; @@ -1785,6 +1801,8 @@ static int afs_symlink(struct user_namespace *mnt_userns, struct inode *dir, goto error; } + fscache_use_cookie(afs_vnode_cache(dvnode), true); + afs_op_set_vnode(op, 0, dvnode); op->file[0].dv_delta = 1; @@ -1792,7 +1810,10 @@ static int afs_symlink(struct user_namespace *mnt_userns, struct inode *dir, op->ops = &afs_symlink_operation; op->create.reason = afs_edit_dir_for_symlink; op->create.symlink = content; - return afs_do_sync_operation(op); + op->mtime = current_time(dir); + ret = afs_do_sync_operation(op); + afs_dir_unuse_cookie(dvnode, ret); + return ret; error: d_drop(dentry); @@ -1802,6 +1823,9 @@ error: static void afs_rename_success(struct afs_operation *op) { + struct afs_vnode *vnode = op->more_files[0].vnode; + struct afs_vnode *new_vnode = op->more_files[1].vnode; + _enter("op=%08x", op->debug_id); op->ctime = op->file[0].scb.status.mtime_client; @@ -1811,10 +1835,46 @@ static void afs_rename_success(struct afs_operation *op) op->ctime = op->file[1].scb.status.mtime_client; afs_vnode_commit_status(op, &op->file[1]); } + if (op->more_files[0].scb.have_status) + afs_vnode_commit_status(op, &op->more_files[0]); + if (op->more_files[1].scb.have_status) + afs_vnode_commit_status(op, &op->more_files[1]); + + /* If we're moving a subdir between dirs, we need to update + * its DV counter too as the ".." will be altered. + */ + if (op->file[0].vnode != op->file[1].vnode) { + if (S_ISDIR(vnode->netfs.inode.i_mode)) { + u64 new_dv; + + write_seqlock(&vnode->cb_lock); + + new_dv = vnode->status.data_version + 1; + trace_afs_set_dv(vnode, new_dv); + vnode->status.data_version = new_dv; + inode_set_iversion_raw(&vnode->netfs.inode, new_dv); + + write_sequnlock(&vnode->cb_lock); + } + + if ((op->rename.rename_flags & RENAME_EXCHANGE) && + S_ISDIR(new_vnode->netfs.inode.i_mode)) { + u64 new_dv; + + write_seqlock(&new_vnode->cb_lock); + + new_dv = new_vnode->status.data_version + 1; + new_vnode->status.data_version = new_dv; + inode_set_iversion_raw(&new_vnode->netfs.inode, new_dv); + + write_sequnlock(&new_vnode->cb_lock); + } + } } static void afs_rename_edit_dir(struct afs_operation *op) { + struct netfs_cache_resources orig_cres = {}, new_cres = {}; struct afs_vnode_param *orig_dvp = &op->file[0]; struct afs_vnode_param *new_dvp = &op->file[1]; struct afs_vnode *orig_dvnode = orig_dvp->vnode; @@ -1831,6 +1891,10 @@ static void afs_rename_edit_dir(struct afs_operation *op) op->rename.rehash = NULL; } + fscache_begin_write_operation(&orig_cres, afs_vnode_cache(orig_dvnode)); + if (new_dvnode != orig_dvnode) + fscache_begin_write_operation(&new_cres, afs_vnode_cache(new_dvnode)); + down_write(&orig_dvnode->validate_lock); if (test_bit(AFS_VNODE_DIR_VALID, &orig_dvnode->flags) && orig_dvnode->status.data_version == orig_dvp->dv_before + orig_dvp->dv_delta) @@ -1852,6 +1916,12 @@ static void afs_rename_edit_dir(struct afs_operation *op) &vnode->fid, afs_edit_dir_for_rename_2); } + if (S_ISDIR(vnode->netfs.inode.i_mode) && + new_dvnode != orig_dvnode && + test_bit(AFS_VNODE_DIR_VALID, &vnode->flags)) + afs_edit_dir_update(vnode, &dotdot_name, new_dvnode, + afs_edit_dir_for_rename_sub); + new_inode = d_inode(new_dentry); if (new_inode) { spin_lock(&new_inode->i_lock); @@ -1864,9 +1934,6 @@ static void afs_rename_edit_dir(struct afs_operation *op) /* Now we can update d_fsdata on the dentries to reflect their * new parent's data_version. - * - * Note that if we ever implement RENAME_EXCHANGE, we'll have - * to update both dentries with opposing dir versions. */ afs_update_dentry_version(op, new_dvp, op->dentry); afs_update_dentry_version(op, new_dvp, op->dentry_2); @@ -1874,6 +1941,70 @@ static void afs_rename_edit_dir(struct afs_operation *op) d_move(old_dentry, new_dentry); up_write(&new_dvnode->validate_lock); + fscache_end_operation(&orig_cres); + if (new_dvnode != orig_dvnode) + fscache_end_operation(&new_cres); +} + +static void afs_rename_exchange_edit_dir(struct afs_operation *op) +{ + struct afs_vnode_param *orig_dvp = &op->file[0]; + struct afs_vnode_param *new_dvp = &op->file[1]; + struct afs_vnode *orig_dvnode = orig_dvp->vnode; + struct afs_vnode *new_dvnode = new_dvp->vnode; + struct afs_vnode *old_vnode = op->more_files[0].vnode; + struct afs_vnode *new_vnode = op->more_files[1].vnode; + struct dentry *old_dentry = op->dentry; + struct dentry *new_dentry = op->dentry_2; + + _enter("op=%08x", op->debug_id); + + if (new_dvnode == orig_dvnode) { + down_write(&orig_dvnode->validate_lock); + if (test_bit(AFS_VNODE_DIR_VALID, &orig_dvnode->flags) && + orig_dvnode->status.data_version == orig_dvp->dv_before + orig_dvp->dv_delta) { + afs_edit_dir_update(orig_dvnode, &old_dentry->d_name, + new_vnode, afs_edit_dir_for_rename_0); + afs_edit_dir_update(orig_dvnode, &new_dentry->d_name, + old_vnode, afs_edit_dir_for_rename_1); + } + + d_exchange(old_dentry, new_dentry); + up_write(&orig_dvnode->validate_lock); + } else { + down_write(&orig_dvnode->validate_lock); + if (test_bit(AFS_VNODE_DIR_VALID, &orig_dvnode->flags) && + orig_dvnode->status.data_version == orig_dvp->dv_before + orig_dvp->dv_delta) + afs_edit_dir_update(orig_dvnode, &old_dentry->d_name, + new_vnode, afs_edit_dir_for_rename_0); + + up_write(&orig_dvnode->validate_lock); + down_write(&new_dvnode->validate_lock); + + if (test_bit(AFS_VNODE_DIR_VALID, &new_dvnode->flags) && + new_dvnode->status.data_version == new_dvp->dv_before + new_dvp->dv_delta) + afs_edit_dir_update(new_dvnode, &new_dentry->d_name, + old_vnode, afs_edit_dir_for_rename_1); + + if (S_ISDIR(old_vnode->netfs.inode.i_mode) && + test_bit(AFS_VNODE_DIR_VALID, &old_vnode->flags)) + afs_edit_dir_update(old_vnode, &dotdot_name, new_dvnode, + afs_edit_dir_for_rename_sub); + + if (S_ISDIR(new_vnode->netfs.inode.i_mode) && + test_bit(AFS_VNODE_DIR_VALID, &new_vnode->flags)) + afs_edit_dir_update(new_vnode, &dotdot_name, orig_dvnode, + afs_edit_dir_for_rename_sub); + + /* Now we can update d_fsdata on the dentries to reflect their + * new parents' data_version. + */ + afs_update_dentry_version(op, new_dvp, old_dentry); + afs_update_dentry_version(op, orig_dvp, new_dentry); + + d_exchange(old_dentry, new_dentry); + up_write(&new_dvnode->validate_lock); + } } static void afs_rename_put(struct afs_operation *op) @@ -1882,7 +2013,7 @@ static void afs_rename_put(struct afs_operation *op) if (op->rename.rehash) d_rehash(op->rename.rehash); dput(op->rename.tmp); - if (op->error) + if (afs_op_error(op)) d_rehash(op->dentry); } @@ -1894,18 +2025,44 @@ static const struct afs_operation_ops afs_rename_operation = { .put = afs_rename_put, }; +#if 0 /* Autoswitched in yfs_fs_rename_replace(). */ +static const struct afs_operation_ops afs_rename_replace_operation = { + .issue_afs_rpc = NULL, + .issue_yfs_rpc = yfs_fs_rename_replace, + .success = afs_rename_success, + .edit_dir = afs_rename_edit_dir, + .put = afs_rename_put, +}; +#endif + +static const struct afs_operation_ops afs_rename_noreplace_operation = { + .issue_afs_rpc = NULL, + .issue_yfs_rpc = yfs_fs_rename_noreplace, + .success = afs_rename_success, + .edit_dir = afs_rename_edit_dir, + .put = afs_rename_put, +}; + +static const struct afs_operation_ops afs_rename_exchange_operation = { + .issue_afs_rpc = NULL, + .issue_yfs_rpc = yfs_fs_rename_exchange, + .success = afs_rename_success, + .edit_dir = afs_rename_exchange_edit_dir, + .put = afs_rename_put, +}; + /* * rename a file in an AFS filesystem and/or move it between directories */ -static int afs_rename(struct user_namespace *mnt_userns, struct inode *old_dir, +static int afs_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { struct afs_operation *op; - struct afs_vnode *orig_dvnode, *new_dvnode, *vnode; + struct afs_vnode *orig_dvnode, *new_dvnode, *vnode, *new_vnode = NULL; int ret; - if (flags) + if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE)) return -EINVAL; /* Don't allow silly-rename files be moved around. */ @@ -1915,6 +2072,8 @@ static int afs_rename(struct user_namespace *mnt_userns, struct inode *old_dir, vnode = AFS_FS_I(d_inode(old_dentry)); orig_dvnode = AFS_FS_I(old_dir); new_dvnode = AFS_FS_I(new_dir); + if (d_is_positive(new_dentry)) + new_vnode = AFS_FS_I(d_inode(new_dentry)); _enter("{%llx:%llu},{%llx:%llu},{%llx:%llu},{%pd}", orig_dvnode->fid.vid, orig_dvnode->fid.vnode, @@ -1926,11 +2085,20 @@ static int afs_rename(struct user_namespace *mnt_userns, struct inode *old_dir, if (IS_ERR(op)) return PTR_ERR(op); + fscache_use_cookie(afs_vnode_cache(orig_dvnode), true); + if (new_dvnode != orig_dvnode) + fscache_use_cookie(afs_vnode_cache(new_dvnode), true); + ret = afs_validate(vnode, op->key); - op->error = ret; + afs_op_set_error(op, ret); if (ret < 0) goto error; + ret = -ENOMEM; + op->more_files = kvcalloc(2, sizeof(struct afs_vnode_param), GFP_KERNEL); + if (!op->more_files) + goto error; + afs_op_set_vnode(op, 0, orig_dvnode); afs_op_set_vnode(op, 1, new_dvnode); /* May be same as orig_dvnode */ op->file[0].dv_delta = 1; @@ -1939,46 +2107,63 @@ static int afs_rename(struct user_namespace *mnt_userns, struct inode *old_dir, op->file[1].modification = true; op->file[0].update_ctime = true; op->file[1].update_ctime = true; + op->more_files[0].vnode = vnode; + op->more_files[0].speculative = true; + op->more_files[1].vnode = new_vnode; + op->more_files[1].speculative = true; + op->nr_files = 4; op->dentry = old_dentry; op->dentry_2 = new_dentry; + op->rename.rename_flags = flags; op->rename.new_negative = d_is_negative(new_dentry); - op->ops = &afs_rename_operation; - /* For non-directories, check whether the target is busy and if so, - * make a copy of the dentry and then do a silly-rename. If the - * silly-rename succeeds, the copied dentry is hashed and becomes the - * new target. - */ - if (d_is_positive(new_dentry) && !d_is_dir(new_dentry)) { - /* To prevent any new references to the target during the - * rename, we unhash the dentry in advance. + if (flags & RENAME_NOREPLACE) { + op->ops = &afs_rename_noreplace_operation; + } else if (flags & RENAME_EXCHANGE) { + op->ops = &afs_rename_exchange_operation; + d_drop(new_dentry); + } else { + /* If we might displace the target, we might need to do silly + * rename. */ - if (!d_unhashed(new_dentry)) { - d_drop(new_dentry); - op->rename.rehash = new_dentry; - } + op->ops = &afs_rename_operation; - if (d_count(new_dentry) > 2) { - /* copy the target dentry's name */ - op->rename.tmp = d_alloc(new_dentry->d_parent, - &new_dentry->d_name); - if (!op->rename.tmp) { - op->error = -ENOMEM; - goto error; + /* For non-directories, check whether the target is busy and if + * so, make a copy of the dentry and then do a silly-rename. + * If the silly-rename succeeds, the copied dentry is hashed + * and becomes the new target. + */ + if (d_is_positive(new_dentry) && !d_is_dir(new_dentry)) { + /* To prevent any new references to the target during + * the rename, we unhash the dentry in advance. + */ + if (!d_unhashed(new_dentry)) { + d_drop(new_dentry); + op->rename.rehash = new_dentry; } - ret = afs_sillyrename(new_dvnode, - AFS_FS_I(d_inode(new_dentry)), - new_dentry, op->key); - if (ret) { - op->error = ret; - goto error; + if (d_count(new_dentry) > 2) { + /* copy the target dentry's name */ + op->rename.tmp = d_alloc(new_dentry->d_parent, + &new_dentry->d_name); + if (!op->rename.tmp) { + afs_op_nomem(op); + goto error; + } + + ret = afs_sillyrename(new_dvnode, + AFS_FS_I(d_inode(new_dentry)), + new_dentry, op->key); + if (ret) { + afs_op_set_error(op, ret); + goto error; + } + + op->dentry_2 = op->rename.tmp; + op->rename.rehash = NULL; + op->rename.new_negative = true; } - - op->dentry_2 = op->rename.tmp; - op->rename.rehash = NULL; - op->rename.new_negative = true; } } @@ -1993,47 +2178,45 @@ static int afs_rename(struct user_namespace *mnt_userns, struct inode *old_dir, */ d_drop(old_dentry); - return afs_do_sync_operation(op); + ret = afs_do_sync_operation(op); + if (ret == -ENOTSUPP) + ret = -EINVAL; +out: + afs_dir_unuse_cookie(orig_dvnode, ret); + if (new_dvnode != orig_dvnode) + afs_dir_unuse_cookie(new_dvnode, ret); + return ret; error: - return afs_put_operation(op); -} - -/* - * Release a directory folio and clean up its private state if it's not busy - * - return true if the folio can now be released, false if not - */ -static bool afs_dir_release_folio(struct folio *folio, gfp_t gfp_flags) -{ - struct afs_vnode *dvnode = AFS_FS_I(folio_inode(folio)); - - _enter("{{%llx:%llu}[%lu]}", dvnode->fid.vid, dvnode->fid.vnode, folio_index(folio)); - - folio_detach_private(folio); - - /* The directory will need reloading. */ - if (test_and_clear_bit(AFS_VNODE_DIR_VALID, &dvnode->flags)) - afs_stat_v(dvnode, n_relpg); - return true; + ret = afs_put_operation(op); + goto out; } /* - * Invalidate part or all of a folio. + * Write the file contents to the cache as a single blob. */ -static void afs_dir_invalidate_folio(struct folio *folio, size_t offset, - size_t length) +int afs_single_writepages(struct address_space *mapping, + struct writeback_control *wbc) { - struct afs_vnode *dvnode = AFS_FS_I(folio_inode(folio)); - - _enter("{%lu},%zu,%zu", folio->index, offset, length); - - BUG_ON(!folio_test_locked(folio)); + struct afs_vnode *dvnode = AFS_FS_I(mapping->host); + struct iov_iter iter; + bool is_dir = (S_ISDIR(dvnode->netfs.inode.i_mode) && + !test_bit(AFS_VNODE_MOUNTPOINT, &dvnode->flags)); + int ret = 0; - /* The directory will need reloading. */ - if (test_and_clear_bit(AFS_VNODE_DIR_VALID, &dvnode->flags)) - afs_stat_v(dvnode, n_inval); + /* Need to lock to prevent the folio queue and folios from being thrown + * away. + */ + down_read(&dvnode->validate_lock); + + if (is_dir ? + test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags) : + atomic64_read(&dvnode->cb_expires_at) != AFS_NO_CB_PROMISE) { + iov_iter_folio_queue(&iter, ITER_SOURCE, dvnode->directory, 0, 0, + i_size_read(&dvnode->netfs.inode)); + ret = netfs_writeback_single(mapping, wbc, &iter); + } - /* we clean up only if the entire folio is being invalidated */ - if (offset == 0 && length == folio_size(folio)) - folio_detach_private(folio); + up_read(&dvnode->validate_lock); + return ret; } diff --git a/fs/afs/dir_edit.c b/fs/afs/dir_edit.c index 0ab7752d1b75..fd3aa9f97ce6 100644 --- a/fs/afs/dir_edit.c +++ b/fs/afs/dir_edit.c @@ -10,6 +10,7 @@ #include <linux/namei.h> #include <linux/pagemap.h> #include <linux/iversion.h> +#include <linux/folio_queue.h> #include "internal.h" #include "xdr_fs.h" @@ -105,31 +106,66 @@ static void afs_clear_contig_bits(union afs_xdr_dir_block *block, } /* - * Get a new directory folio. + * Get a specific block, extending the directory storage to cover it as needed. */ -static struct folio *afs_dir_get_folio(struct afs_vnode *vnode, pgoff_t index) +static union afs_xdr_dir_block *afs_dir_get_block(struct afs_dir_iter *iter, size_t block) { - struct address_space *mapping = vnode->netfs.inode.i_mapping; + struct folio_queue *fq; + struct afs_vnode *dvnode = iter->dvnode; struct folio *folio; + size_t blpos = block * AFS_DIR_BLOCK_SIZE; + size_t blend = (block + 1) * AFS_DIR_BLOCK_SIZE, fpos = iter->fpos; + int ret; + + if (dvnode->directory_size < blend) { + size_t cur_size = dvnode->directory_size; + + ret = netfs_alloc_folioq_buffer( + NULL, &dvnode->directory, &cur_size, blend, + mapping_gfp_mask(dvnode->netfs.inode.i_mapping)); + dvnode->directory_size = cur_size; + if (ret < 0) + goto fail; + } - folio = __filemap_get_folio(mapping, index, - FGP_LOCK | FGP_ACCESSED | FGP_CREAT, - mapping->gfp_mask); - if (!folio) - clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags); - else if (folio && !folio_test_private(folio)) - folio_attach_private(folio, (void *)1); + fq = iter->fq; + if (!fq) + fq = dvnode->directory; + + /* Search the folio queue for the folio containing the block... */ + for (; fq; fq = fq->next) { + for (int s = iter->fq_slot; s < folioq_count(fq); s++) { + size_t fsize = folioq_folio_size(fq, s); + + if (blend <= fpos + fsize) { + /* ... and then return the mapped block. */ + folio = folioq_folio(fq, s); + if (WARN_ON_ONCE(folio_pos(folio) != fpos)) + goto fail; + iter->fq = fq; + iter->fq_slot = s; + iter->fpos = fpos; + return kmap_local_folio(folio, blpos - fpos); + } + fpos += fsize; + } + iter->fq_slot = 0; + } - return folio; +fail: + iter->fq = NULL; + iter->fq_slot = 0; + afs_invalidate_dir(dvnode, afs_dir_invalid_edit_get_block); + return NULL; } /* * Scan a directory block looking for a dirent of the right name. */ -static int afs_dir_scan_block(union afs_xdr_dir_block *block, struct qstr *name, +static int afs_dir_scan_block(const union afs_xdr_dir_block *block, const struct qstr *name, unsigned int blocknum) { - union afs_xdr_dirent *de; + const union afs_xdr_dirent *de; u64 bitmap; int d, len, n; @@ -203,14 +239,13 @@ static void afs_edit_init_block(union afs_xdr_dir_block *meta, * The caller must hold the inode locked. */ void afs_edit_dir_add(struct afs_vnode *vnode, - struct qstr *name, struct afs_fid *new_fid, + const struct qstr *name, struct afs_fid *new_fid, enum afs_edit_dir_reason why) { union afs_xdr_dir_block *meta, *block; union afs_xdr_dirent *de; - struct folio *folio0, *folio; - unsigned int need_slots, nr_blocks, b; - pgoff_t index; + struct afs_dir_iter iter = { .dvnode = vnode }; + unsigned int nr_blocks, b, entry; loff_t i_size; int slot; @@ -219,20 +254,17 @@ void afs_edit_dir_add(struct afs_vnode *vnode, i_size = i_size_read(&vnode->netfs.inode); if (i_size > AFS_DIR_BLOCK_SIZE * AFS_DIR_MAX_BLOCKS || (i_size & (AFS_DIR_BLOCK_SIZE - 1))) { - clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags); + afs_invalidate_dir(vnode, afs_dir_invalid_edit_add_bad_size); return; } - folio0 = afs_dir_get_folio(vnode, 0); - if (!folio0) { - _leave(" [fgp]"); + meta = afs_dir_get_block(&iter, 0); + if (!meta) return; - } /* Work out how many slots we're going to need. */ - need_slots = afs_dir_calc_slots(name->len); + iter.nr_slots = afs_dir_calc_slots(name->len); - meta = kmap_local_folio(folio0, 0); if (i_size == 0) goto new_directory; nr_blocks = i_size / AFS_DIR_BLOCK_SIZE; @@ -244,22 +276,21 @@ void afs_edit_dir_add(struct afs_vnode *vnode, /* If the directory extended into a new folio, then we need to * tack a new folio on the end. */ - index = b / AFS_DIR_BLOCKS_PER_PAGE; if (nr_blocks >= AFS_DIR_MAX_BLOCKS) - goto error; - if (index >= folio_nr_pages(folio0)) { - folio = afs_dir_get_folio(vnode, index); - if (!folio) - goto error; - } else { - folio = folio0; - } + goto error_too_many_blocks; - block = kmap_local_folio(folio, b * AFS_DIR_BLOCK_SIZE - folio_file_pos(folio)); + /* Lower dir blocks have a counter in the header we can check. */ + if (b < AFS_DIR_BLOCKS_WITH_CTR && + meta->meta.alloc_ctrs[b] < iter.nr_slots) + continue; + + block = afs_dir_get_block(&iter, b); + if (!block) + goto error; /* Abandon the edit if we got a callback break. */ if (!test_bit(AFS_VNODE_DIR_VALID, &vnode->flags)) - goto invalidated; + goto already_invalidated; _debug("block %u: %2u %3u %u", b, @@ -274,31 +305,23 @@ void afs_edit_dir_add(struct afs_vnode *vnode, afs_set_i_size(vnode, (b + 1) * AFS_DIR_BLOCK_SIZE); } - /* Only lower dir blocks have a counter in the header. */ - if (b >= AFS_DIR_BLOCKS_WITH_CTR || - meta->meta.alloc_ctrs[b] >= need_slots) { - /* We need to try and find one or more consecutive - * slots to hold the entry. - */ - slot = afs_find_contig_bits(block, need_slots); - if (slot >= 0) { - _debug("slot %u", slot); - goto found_space; - } + /* We need to try and find one or more consecutive slots to + * hold the entry. + */ + slot = afs_find_contig_bits(block, iter.nr_slots); + if (slot >= 0) { + _debug("slot %u", slot); + goto found_space; } kunmap_local(block); - if (folio != folio0) { - folio_unlock(folio); - folio_put(folio); - } } /* There are no spare slots of sufficient size, yet the operation * succeeded. Download the directory again. */ trace_afs_edit_dir(vnode, why, afs_edit_dir_create_nospc, 0, 0, 0, 0, name->name); - clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags); + afs_invalidate_dir(vnode, afs_dir_invalid_edit_add_no_slots); goto out_unmap; new_directory: @@ -306,8 +329,7 @@ new_directory: i_size = AFS_DIR_BLOCK_SIZE; afs_set_i_size(vnode, i_size); slot = AFS_DIR_RESV_BLOCKS0; - folio = folio0; - block = kmap_local_folio(folio, 0); + block = afs_dir_get_block(&iter, 0); nr_blocks = 1; b = 0; @@ -325,41 +347,39 @@ found_space: de->u.name[name->len] = 0; /* Adjust the bitmap. */ - afs_set_contig_bits(block, slot, need_slots); - kunmap_local(block); - if (folio != folio0) { - folio_unlock(folio); - folio_put(folio); - } + afs_set_contig_bits(block, slot, iter.nr_slots); /* Adjust the allocation counter. */ if (b < AFS_DIR_BLOCKS_WITH_CTR) - meta->meta.alloc_ctrs[b] -= need_slots; + meta->meta.alloc_ctrs[b] -= iter.nr_slots; + + /* Adjust the hash chain. */ + entry = b * AFS_DIR_SLOTS_PER_BLOCK + slot; + iter.bucket = afs_dir_hash_name(name); + de->u.hash_next = meta->meta.hashtable[iter.bucket]; + meta->meta.hashtable[iter.bucket] = htons(entry); + kunmap_local(block); inode_inc_iversion_raw(&vnode->netfs.inode); afs_stat_v(vnode, n_dir_cr); _debug("Insert %s in %u[%u]", name->name, b, slot); + netfs_single_mark_inode_dirty(&vnode->netfs.inode); + out_unmap: kunmap_local(meta); - folio_unlock(folio0); - folio_put(folio0); _leave(""); return; -invalidated: +already_invalidated: trace_afs_edit_dir(vnode, why, afs_edit_dir_create_inval, 0, 0, 0, 0, name->name); - clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags); kunmap_local(block); - if (folio != folio0) { - folio_unlock(folio); - folio_put(folio); - } goto out_unmap; +error_too_many_blocks: + afs_invalidate_dir(vnode, afs_dir_invalid_edit_add_too_many_blocks); error: trace_afs_edit_dir(vnode, why, afs_edit_dir_create_error, 0, 0, 0, 0, name->name); - clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags); goto out_unmap; } @@ -371,15 +391,16 @@ error: * The caller must hold the inode locked. */ void afs_edit_dir_remove(struct afs_vnode *vnode, - struct qstr *name, enum afs_edit_dir_reason why) + const struct qstr *name, enum afs_edit_dir_reason why) { - union afs_xdr_dir_block *meta, *block; - union afs_xdr_dirent *de; - struct folio *folio0, *folio; - unsigned int need_slots, nr_blocks, b; - pgoff_t index; + union afs_xdr_dir_block *meta, *block, *pblock; + union afs_xdr_dirent *de, *pde; + struct afs_dir_iter iter = { .dvnode = vnode }; + struct afs_fid fid; + unsigned int b, slot, entry; loff_t i_size; - int slot; + __be16 next; + int found; _enter(",,{%d,%s},", name->len, name->name); @@ -387,81 +408,95 @@ void afs_edit_dir_remove(struct afs_vnode *vnode, if (i_size < AFS_DIR_BLOCK_SIZE || i_size > AFS_DIR_BLOCK_SIZE * AFS_DIR_MAX_BLOCKS || (i_size & (AFS_DIR_BLOCK_SIZE - 1))) { - clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags); + afs_invalidate_dir(vnode, afs_dir_invalid_edit_rem_bad_size); return; } - nr_blocks = i_size / AFS_DIR_BLOCK_SIZE; - folio0 = afs_dir_get_folio(vnode, 0); - if (!folio0) { - _leave(" [fgp]"); + if (!afs_dir_init_iter(&iter, name)) return; - } - - /* Work out how many slots we're going to discard. */ - need_slots = afs_dir_calc_slots(name->len); - - meta = kmap_local_folio(folio0, 0); - - /* Find a block that has sufficient slots available. Each folio - * contains two or more directory blocks. - */ - for (b = 0; b < nr_blocks; b++) { - index = b / AFS_DIR_BLOCKS_PER_PAGE; - if (index >= folio_nr_pages(folio0)) { - folio = afs_dir_get_folio(vnode, index); - if (!folio) - goto error; - } else { - folio = folio0; - } - block = kmap_local_folio(folio, b * AFS_DIR_BLOCK_SIZE - folio_file_pos(folio)); - - /* Abandon the edit if we got a callback break. */ - if (!test_bit(AFS_VNODE_DIR_VALID, &vnode->flags)) - goto invalidated; - - if (b > AFS_DIR_BLOCKS_WITH_CTR || - meta->meta.alloc_ctrs[b] <= AFS_DIR_SLOTS_PER_BLOCK - 1 - need_slots) { - slot = afs_dir_scan_block(block, name, b); - if (slot >= 0) - goto found_dirent; - } + meta = afs_dir_find_block(&iter, 0); + if (!meta) + return; - kunmap_local(block); - if (folio != folio0) { - folio_unlock(folio); - folio_put(folio); - } + /* Find the entry in the blob. */ + found = afs_dir_search_bucket(&iter, name, &fid); + if (found < 0) { + /* Didn't find the dirent to clobber. Re-download. */ + trace_afs_edit_dir(vnode, why, afs_edit_dir_delete_noent, + 0, 0, 0, 0, name->name); + afs_invalidate_dir(vnode, afs_dir_invalid_edit_rem_wrong_name); + goto out_unmap; } - /* Didn't find the dirent to clobber. Download the directory again. */ - trace_afs_edit_dir(vnode, why, afs_edit_dir_delete_noent, - 0, 0, 0, 0, name->name); - clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags); - goto out_unmap; + entry = found; + b = entry / AFS_DIR_SLOTS_PER_BLOCK; + slot = entry % AFS_DIR_SLOTS_PER_BLOCK; -found_dirent: + block = afs_dir_find_block(&iter, b); + if (!block) + goto error; + if (!test_bit(AFS_VNODE_DIR_VALID, &vnode->flags)) + goto already_invalidated; + + /* Check and clear the entry. */ de = &block->dirents[slot]; + if (de->u.valid != 1) + goto error_unmap; trace_afs_edit_dir(vnode, why, afs_edit_dir_delete, b, slot, ntohl(de->u.vnode), ntohl(de->u.unique), name->name); - memset(de, 0, sizeof(*de) * need_slots); - /* Adjust the bitmap. */ - afs_clear_contig_bits(block, slot, need_slots); - kunmap_local(block); - if (folio != folio0) { - folio_unlock(folio); - folio_put(folio); - } + afs_clear_contig_bits(block, slot, iter.nr_slots); /* Adjust the allocation counter. */ if (b < AFS_DIR_BLOCKS_WITH_CTR) - meta->meta.alloc_ctrs[b] += need_slots; + meta->meta.alloc_ctrs[b] += iter.nr_slots; + + /* Clear the constituent entries. */ + next = de->u.hash_next; + memset(de, 0, sizeof(*de) * iter.nr_slots); + kunmap_local(block); + + /* Adjust the hash chain: if iter->prev_entry is 0, the hashtable head + * index is previous; otherwise it's slot number of the previous entry. + */ + if (!iter.prev_entry) { + __be16 prev_next = meta->meta.hashtable[iter.bucket]; + + if (unlikely(prev_next != htons(entry))) { + pr_warn("%llx:%llx:%x: not head of chain b=%x p=%x,%x e=%x %*s", + vnode->fid.vid, vnode->fid.vnode, vnode->fid.unique, + iter.bucket, iter.prev_entry, prev_next, entry, + name->len, name->name); + goto error; + } + meta->meta.hashtable[iter.bucket] = next; + } else { + unsigned int pb = iter.prev_entry / AFS_DIR_SLOTS_PER_BLOCK; + unsigned int ps = iter.prev_entry % AFS_DIR_SLOTS_PER_BLOCK; + __be16 prev_next; + + pblock = afs_dir_find_block(&iter, pb); + if (!pblock) + goto error; + pde = &pblock->dirents[ps]; + prev_next = pde->u.hash_next; + if (prev_next != htons(entry)) { + kunmap_local(pblock); + pr_warn("%llx:%llx:%x: not prev in chain b=%x p=%x,%x e=%x %*s", + vnode->fid.vid, vnode->fid.vnode, vnode->fid.unique, + iter.bucket, iter.prev_entry, prev_next, entry, + name->len, name->name); + goto error; + } + pde->u.hash_next = next; + kunmap_local(pblock); + } + + netfs_single_mark_inode_dirty(&vnode->netfs.inode); inode_set_iversion_raw(&vnode->netfs.inode, vnode->status.data_version); afs_stat_v(vnode, n_dir_rm); @@ -469,25 +504,145 @@ found_dirent: out_unmap: kunmap_local(meta); - folio_unlock(folio0); - folio_put(folio0); _leave(""); return; -invalidated: +already_invalidated: + kunmap_local(block); trace_afs_edit_dir(vnode, why, afs_edit_dir_delete_inval, 0, 0, 0, 0, name->name); - clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags); - kunmap_local(block); - if (folio != folio0) { - folio_unlock(folio); - folio_put(folio); - } goto out_unmap; +error_unmap: + kunmap_local(block); error: trace_afs_edit_dir(vnode, why, afs_edit_dir_delete_error, 0, 0, 0, 0, name->name); - clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags); goto out_unmap; } + +/* + * Edit an entry in a directory to update the vnode it refers to. This is also + * used to update the ".." entry in a directory. + */ +void afs_edit_dir_update(struct afs_vnode *vnode, const struct qstr *name, + struct afs_vnode *new_dvnode, enum afs_edit_dir_reason why) +{ + union afs_xdr_dir_block *block; + union afs_xdr_dirent *de; + struct afs_dir_iter iter = { .dvnode = vnode }; + unsigned int nr_blocks, b; + loff_t i_size; + int slot; + + _enter(""); + + i_size = i_size_read(&vnode->netfs.inode); + if (i_size < AFS_DIR_BLOCK_SIZE) { + afs_invalidate_dir(vnode, afs_dir_invalid_edit_upd_bad_size); + return; + } + + nr_blocks = i_size / AFS_DIR_BLOCK_SIZE; + + /* Find a block that has sufficient slots available. Each folio + * contains two or more directory blocks. + */ + for (b = 0; b < nr_blocks; b++) { + block = afs_dir_get_block(&iter, b); + if (!block) + goto error; + + /* Abandon the edit if we got a callback break. */ + if (!test_bit(AFS_VNODE_DIR_VALID, &vnode->flags)) + goto already_invalidated; + + slot = afs_dir_scan_block(block, name, b); + if (slot >= 0) + goto found_dirent; + + kunmap_local(block); + } + + /* Didn't find the dirent to clobber. Download the directory again. */ + trace_afs_edit_dir(vnode, why, afs_edit_dir_update_nodd, + 0, 0, 0, 0, name->name); + afs_invalidate_dir(vnode, afs_dir_invalid_edit_upd_no_dd); + goto out; + +found_dirent: + de = &block->dirents[slot]; + de->u.vnode = htonl(new_dvnode->fid.vnode); + de->u.unique = htonl(new_dvnode->fid.unique); + + trace_afs_edit_dir(vnode, why, afs_edit_dir_update_dd, b, slot, + ntohl(de->u.vnode), ntohl(de->u.unique), name->name); + + kunmap_local(block); + netfs_single_mark_inode_dirty(&vnode->netfs.inode); + inode_set_iversion_raw(&vnode->netfs.inode, vnode->status.data_version); + +out: + _leave(""); + return; + +already_invalidated: + kunmap_local(block); + trace_afs_edit_dir(vnode, why, afs_edit_dir_update_inval, + 0, 0, 0, 0, name->name); + goto out; + +error: + trace_afs_edit_dir(vnode, why, afs_edit_dir_update_error, + 0, 0, 0, 0, name->name); + goto out; +} + +/* + * Initialise a new directory. We need to fill in the "." and ".." entries. + */ +void afs_mkdir_init_dir(struct afs_vnode *dvnode, struct afs_vnode *parent_dvnode) +{ + union afs_xdr_dir_block *meta; + struct afs_dir_iter iter = { .dvnode = dvnode }; + union afs_xdr_dirent *de; + unsigned int slot = AFS_DIR_RESV_BLOCKS0; + loff_t i_size; + + i_size = i_size_read(&dvnode->netfs.inode); + if (i_size != AFS_DIR_BLOCK_SIZE) { + afs_invalidate_dir(dvnode, afs_dir_invalid_edit_add_bad_size); + return; + } + + meta = afs_dir_get_block(&iter, 0); + if (!meta) + return; + + afs_edit_init_block(meta, meta, 0); + + de = &meta->dirents[slot]; + de->u.valid = 1; + de->u.vnode = htonl(dvnode->fid.vnode); + de->u.unique = htonl(dvnode->fid.unique); + memcpy(de->u.name, ".", 2); + trace_afs_edit_dir(dvnode, afs_edit_dir_for_mkdir, afs_edit_dir_mkdir, 0, slot, + dvnode->fid.vnode, dvnode->fid.unique, "."); + slot++; + + de = &meta->dirents[slot]; + de->u.valid = 1; + de->u.vnode = htonl(parent_dvnode->fid.vnode); + de->u.unique = htonl(parent_dvnode->fid.unique); + memcpy(de->u.name, "..", 3); + trace_afs_edit_dir(dvnode, afs_edit_dir_for_mkdir, afs_edit_dir_mkdir, 0, slot, + parent_dvnode->fid.vnode, parent_dvnode->fid.unique, ".."); + + afs_set_contig_bits(meta, AFS_DIR_RESV_BLOCKS0, 2); + meta->meta.alloc_ctrs[0] -= 2; + kunmap_local(meta); + + netfs_single_mark_inode_dirty(&dvnode->netfs.inode); + set_bit(AFS_VNODE_DIR_VALID, &dvnode->flags); + set_bit(AFS_VNODE_DIR_READ, &dvnode->flags); +} diff --git a/fs/afs/dir_search.c b/fs/afs/dir_search.c new file mode 100644 index 000000000000..d2516e55b5ed --- /dev/null +++ b/fs/afs/dir_search.c @@ -0,0 +1,227 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* Search a directory's hash table. + * + * Copyright (C) 2024 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * https://tools.ietf.org/html/draft-keiser-afs3-directory-object-00 + */ + +#include <linux/kernel.h> +#include <linux/fs.h> +#include <linux/namei.h> +#include <linux/iversion.h> +#include "internal.h" +#include "afs_fs.h" +#include "xdr_fs.h" + +/* + * Calculate the name hash. + */ +unsigned int afs_dir_hash_name(const struct qstr *name) +{ + const unsigned char *p = name->name; + unsigned int hash = 0, i; + int bucket; + + for (i = 0; i < name->len; i++) + hash = (hash * 173) + p[i]; + bucket = hash & (AFS_DIR_HASHTBL_SIZE - 1); + if (hash > INT_MAX) { + bucket = AFS_DIR_HASHTBL_SIZE - bucket; + bucket &= (AFS_DIR_HASHTBL_SIZE - 1); + } + return bucket; +} + +/* + * Reset a directory iterator. + */ +static bool afs_dir_reset_iter(struct afs_dir_iter *iter) +{ + unsigned long long i_size = i_size_read(&iter->dvnode->netfs.inode); + unsigned int nblocks; + + /* Work out the maximum number of steps we can take. */ + nblocks = umin(i_size / AFS_DIR_BLOCK_SIZE, AFS_DIR_MAX_BLOCKS); + if (!nblocks) + return false; + iter->loop_check = nblocks * (AFS_DIR_SLOTS_PER_BLOCK - AFS_DIR_RESV_BLOCKS); + iter->prev_entry = 0; /* Hash head is previous */ + return true; +} + +/* + * Initialise a directory iterator for looking up a name. + */ +bool afs_dir_init_iter(struct afs_dir_iter *iter, const struct qstr *name) +{ + iter->nr_slots = afs_dir_calc_slots(name->len); + iter->bucket = afs_dir_hash_name(name); + return afs_dir_reset_iter(iter); +} + +/* + * Get a specific block. + */ +union afs_xdr_dir_block *afs_dir_find_block(struct afs_dir_iter *iter, size_t block) +{ + struct folio_queue *fq = iter->fq; + struct afs_vnode *dvnode = iter->dvnode; + struct folio *folio; + size_t blpos = block * AFS_DIR_BLOCK_SIZE; + size_t blend = (block + 1) * AFS_DIR_BLOCK_SIZE, fpos = iter->fpos; + int slot = iter->fq_slot; + + _enter("%zx,%d", block, slot); + + if (iter->block) { + kunmap_local(iter->block); + iter->block = NULL; + } + + if (dvnode->directory_size < blend) + goto fail; + + if (!fq || blpos < fpos) { + fq = dvnode->directory; + slot = 0; + fpos = 0; + } + + /* Search the folio queue for the folio containing the block... */ + for (; fq; fq = fq->next) { + for (; slot < folioq_count(fq); slot++) { + size_t fsize = folioq_folio_size(fq, slot); + + if (blend <= fpos + fsize) { + /* ... and then return the mapped block. */ + folio = folioq_folio(fq, slot); + if (WARN_ON_ONCE(folio_pos(folio) != fpos)) + goto fail; + iter->fq = fq; + iter->fq_slot = slot; + iter->fpos = fpos; + iter->block = kmap_local_folio(folio, blpos - fpos); + return iter->block; + } + fpos += fsize; + } + slot = 0; + } + +fail: + iter->fq = NULL; + iter->fq_slot = 0; + afs_invalidate_dir(dvnode, afs_dir_invalid_edit_get_block); + return NULL; +} + +/* + * Search through a directory bucket. + */ +int afs_dir_search_bucket(struct afs_dir_iter *iter, const struct qstr *name, + struct afs_fid *_fid) +{ + const union afs_xdr_dir_block *meta; + unsigned int entry; + int ret = -ESTALE; + + meta = afs_dir_find_block(iter, 0); + if (!meta) + return -ESTALE; + + entry = ntohs(meta->meta.hashtable[iter->bucket & (AFS_DIR_HASHTBL_SIZE - 1)]); + _enter("%x,%x", iter->bucket, entry); + + while (entry) { + const union afs_xdr_dir_block *block; + const union afs_xdr_dirent *dire; + unsigned int blnum = entry / AFS_DIR_SLOTS_PER_BLOCK; + unsigned int slot = entry % AFS_DIR_SLOTS_PER_BLOCK; + unsigned int resv = (blnum == 0 ? AFS_DIR_RESV_BLOCKS0 : AFS_DIR_RESV_BLOCKS); + + _debug("search %x", entry); + + if (slot < resv) { + kdebug("slot out of range h=%x rs=%2x sl=%2x-%2x", + iter->bucket, resv, slot, slot + iter->nr_slots - 1); + goto bad; + } + + block = afs_dir_find_block(iter, blnum); + if (!block) + goto bad; + dire = &block->dirents[slot]; + + if (slot + iter->nr_slots <= AFS_DIR_SLOTS_PER_BLOCK && + memcmp(dire->u.name, name->name, name->len) == 0 && + dire->u.name[name->len] == '\0') { + _fid->vnode = ntohl(dire->u.vnode); + _fid->unique = ntohl(dire->u.unique); + ret = entry; + goto found; + } + + iter->prev_entry = entry; + entry = ntohs(dire->u.hash_next); + if (!--iter->loop_check) { + kdebug("dir chain loop h=%x", iter->bucket); + goto bad; + } + } + + ret = -ENOENT; +found: + if (iter->block) { + kunmap_local(iter->block); + iter->block = NULL; + } + +bad: + if (ret == -ESTALE) + afs_invalidate_dir(iter->dvnode, afs_dir_invalid_iter_stale); + _leave(" = %d", ret); + return ret; +} + +/* + * Search the appropriate hash chain in the contents of an AFS directory. + */ +int afs_dir_search(struct afs_vnode *dvnode, const struct qstr *name, + struct afs_fid *_fid, afs_dataversion_t *_dir_version) +{ + struct afs_dir_iter iter = { .dvnode = dvnode, }; + int ret, retry_limit = 3; + + _enter("{%lu},,,", dvnode->netfs.inode.i_ino); + + if (!afs_dir_init_iter(&iter, name)) + return -ENOENT; + do { + if (--retry_limit < 0) { + pr_warn("afs_read_dir(): Too many retries\n"); + ret = -ESTALE; + break; + } + ret = afs_read_dir(dvnode, NULL); + if (ret < 0) { + if (ret != -ESTALE) + break; + if (test_bit(AFS_VNODE_DELETED, &dvnode->flags)) { + ret = -ESTALE; + break; + } + continue; + } + *_dir_version = inode_peek_iversion_raw(&dvnode->netfs.inode); + + ret = afs_dir_search_bucket(&iter, name, _fid); + up_read(&dvnode->validate_lock); + if (ret == -ESTALE) + afs_dir_reset_iter(&iter); + } while (ret == -ESTALE); + + _leave(" = %d", ret); + return ret; +} diff --git a/fs/afs/dir_silly.c b/fs/afs/dir_silly.c index bb5807e87fa4..014495d4b868 100644 --- a/fs/afs/dir_silly.c +++ b/fs/afs/dir_silly.c @@ -69,6 +69,12 @@ static int afs_do_silly_rename(struct afs_vnode *dvnode, struct afs_vnode *vnode if (IS_ERR(op)) return PTR_ERR(op); + op->more_files = kvcalloc(2, sizeof(struct afs_vnode_param), GFP_KERNEL); + if (!op->more_files) { + afs_put_operation(op); + return -ENOMEM; + } + afs_op_set_vnode(op, 0, dvnode); afs_op_set_vnode(op, 1, dvnode); op->file[0].dv_delta = 1; @@ -77,6 +83,11 @@ static int afs_do_silly_rename(struct afs_vnode *dvnode, struct afs_vnode *vnode op->file[1].modification = true; op->file[0].update_ctime = true; op->file[1].update_ctime = true; + op->more_files[0].vnode = AFS_FS_I(d_inode(old)); + op->more_files[0].speculative = true; + op->more_files[1].vnode = AFS_FS_I(d_inode(new)); + op->more_files[1].speculative = true; + op->nr_files = 4; op->dentry = old; op->dentry_2 = new; @@ -113,16 +124,14 @@ int afs_sillyrename(struct afs_vnode *dvnode, struct afs_vnode *vnode, sdentry = NULL; do { - int slen; - dput(sdentry); sillycounter++; /* Create a silly name. Note that the ".__afs" prefix is * understood by the salvager and must not be changed. */ - slen = scnprintf(silly, sizeof(silly), ".__afs%04X", sillycounter); - sdentry = lookup_one_len(silly, dentry->d_parent, slen); + scnprintf(silly, sizeof(silly), ".__afs%04X", sillycounter); + sdentry = lookup_noperm(&QSTR(silly), dentry->d_parent); /* N.B. Better to return EBUSY here ... it could be dangerous * to delete the file while it's in use. @@ -218,7 +227,7 @@ static int afs_do_silly_unlink(struct afs_vnode *dvnode, struct afs_vnode *vnode /* If there was a conflict with a third party, check the status of the * unlinked vnode. */ - if (op->error == 0 && (op->flags & AFS_OPERATION_DIR_CONFLICT)) { + if (op->cumul_error.error == 0 && (op->flags & AFS_OPERATION_DIR_CONFLICT)) { op->file[1].update_ctime = false; op->fetch_status.which = 1; op->ops = &afs_fetch_status_operation; diff --git a/fs/afs/dynroot.c b/fs/afs/dynroot.c index d7d9402ff718..aa56e8951e03 100644 --- a/fs/afs/dynroot.c +++ b/fs/afs/dynroot.c @@ -10,16 +10,19 @@ #include <linux/dns_resolver.h> #include "internal.h" -static atomic_t afs_autocell_ino; +#define AFS_MIN_DYNROOT_CELL_INO 4 /* Allow for ., .., @cell, .@cell */ +#define AFS_MAX_DYNROOT_CELL_INO ((unsigned int)INT_MAX) + +static struct dentry *afs_lookup_atcell(struct inode *dir, struct dentry *dentry, ino_t ino); /* * iget5() comparator for inode created by autocell operations - * - * These pseudo inodes don't match anything. */ static int afs_iget5_pseudo_test(struct inode *inode, void *opaque) { - return 0; + struct afs_fid *fid = opaque; + + return inode->i_ino == fid->vnode; } /* @@ -39,28 +42,16 @@ static int afs_iget5_pseudo_set(struct inode *inode, void *opaque) } /* - * Create an inode for a dynamic root directory or an autocell dynamic - * automount dir. + * Create an inode for an autocell dynamic automount dir. */ -struct inode *afs_iget_pseudo_dir(struct super_block *sb, bool root) +static struct inode *afs_iget_pseudo_dir(struct super_block *sb, ino_t ino) { - struct afs_super_info *as = AFS_FS_S(sb); struct afs_vnode *vnode; struct inode *inode; - struct afs_fid fid = {}; + struct afs_fid fid = { .vnode = ino, .unique = 1, }; _enter(""); - if (as->volume) - fid.vid = as->volume->vid; - if (root) { - fid.vnode = 1; - fid.unique = 1; - } else { - fid.vnode = atomic_inc_return(&afs_autocell_ino); - fid.unique = 0; - } - inode = iget5_locked(sb, fid.vnode, afs_iget5_pseudo_test, afs_iget5_pseudo_set, &fid); if (!inode) { @@ -73,149 +64,76 @@ struct inode *afs_iget_pseudo_dir(struct super_block *sb, bool root) vnode = AFS_FS_I(inode); - /* there shouldn't be an existing inode */ - BUG_ON(!(inode->i_state & I_NEW)); - - netfs_inode_init(&vnode->netfs, NULL); - inode->i_size = 0; - inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO; - if (root) { - inode->i_op = &afs_dynroot_inode_operations; - inode->i_fop = &simple_dir_operations; - } else { - inode->i_op = &afs_autocell_inode_operations; - } - set_nlink(inode, 2); - inode->i_uid = GLOBAL_ROOT_UID; - inode->i_gid = GLOBAL_ROOT_GID; - inode->i_ctime = inode->i_atime = inode->i_mtime = current_time(inode); - inode->i_blocks = 0; - inode->i_generation = 0; - - set_bit(AFS_VNODE_PSEUDODIR, &vnode->flags); - if (!root) { + if (inode_state_read_once(inode) & I_NEW) { + netfs_inode_init(&vnode->netfs, NULL, false); + simple_inode_init_ts(inode); + set_nlink(inode, 2); + inode->i_size = 0; + inode->i_mode = S_IFDIR | 0555; + inode->i_op = &afs_autocell_inode_operations; + inode->i_uid = GLOBAL_ROOT_UID; + inode->i_gid = GLOBAL_ROOT_GID; + inode->i_blocks = 0; + inode->i_generation = 0; + inode->i_flags |= S_AUTOMOUNT | S_NOATIME; + + set_bit(AFS_VNODE_PSEUDODIR, &vnode->flags); set_bit(AFS_VNODE_MOUNTPOINT, &vnode->flags); - inode->i_flags |= S_AUTOMOUNT; - } - inode->i_flags |= S_NOATIME; - unlock_new_inode(inode); + unlock_new_inode(inode); + } _leave(" = %p", inode); return inode; } /* - * Probe to see if a cell may exist. This prevents positive dentries from - * being created unnecessarily. + * Try to automount the mountpoint with pseudo directory, if the autocell + * option is set. */ -static int afs_probe_cell_name(struct dentry *dentry) +static struct dentry *afs_dynroot_lookup_cell(struct inode *dir, struct dentry *dentry, + unsigned int flags) { - struct afs_cell *cell; + struct afs_cell *cell = NULL; struct afs_net *net = afs_d2net(dentry); + struct inode *inode = NULL; const char *name = dentry->d_name.name; size_t len = dentry->d_name.len; - int ret; + bool dotted = false; + int ret = -ENOENT; /* Names prefixed with a dot are R/W mounts. */ if (name[0] == '.') { - if (len == 1) - return -EINVAL; name++; len--; + dotted = true; } - cell = afs_find_cell(net, name, len, afs_cell_trace_use_probe); - if (!IS_ERR(cell)) { - afs_unuse_cell(net, cell, afs_cell_trace_unuse_probe); - return 0; + cell = afs_lookup_cell(net, name, len, NULL, + AFS_LOOKUP_CELL_DYNROOT, + afs_cell_trace_use_lookup_dynroot); + if (IS_ERR(cell)) { + ret = PTR_ERR(cell); + goto out_no_cell; } - ret = dns_query(net->net, "afsdb", name, len, "srv=1", - NULL, NULL, false); - if (ret == -ENODATA) - ret = -EDESTADDRREQ; - return ret; -} - -/* - * Try to auto mount the mountpoint with pseudo directory, if the autocell - * operation is setted. - */ -struct inode *afs_try_auto_mntpt(struct dentry *dentry, struct inode *dir) -{ - struct afs_vnode *vnode = AFS_FS_I(dir); - struct inode *inode; - int ret = -ENOENT; - - _enter("%p{%pd}, {%llx:%llu}", - dentry, dentry, vnode->fid.vid, vnode->fid.vnode); - - if (!test_bit(AFS_VNODE_AUTOCELL, &vnode->flags)) - goto out; - - ret = afs_probe_cell_name(dentry); - if (ret < 0) - goto out; - - inode = afs_iget_pseudo_dir(dir->i_sb, false); + inode = afs_iget_pseudo_dir(dir->i_sb, cell->dynroot_ino * 2 + dotted); if (IS_ERR(inode)) { ret = PTR_ERR(inode); goto out; } - _leave("= %p", inode); - return inode; + dentry->d_fsdata = cell; + return d_splice_alias(inode, dentry); out: - _leave("= %d", ret); + afs_unuse_cell(cell, afs_cell_trace_unuse_lookup_dynroot); +out_no_cell: + if (!inode) + return d_splice_alias(inode, dentry); return ret == -ENOENT ? NULL : ERR_PTR(ret); } /* - * Look up @cell in a dynroot directory. This is a substitution for the - * local cell name for the net namespace. - */ -static struct dentry *afs_lookup_atcell(struct dentry *dentry) -{ - struct afs_cell *cell; - struct afs_net *net = afs_d2net(dentry); - struct dentry *ret; - char *name; - int len; - - if (!net->ws_cell) - return ERR_PTR(-ENOENT); - - ret = ERR_PTR(-ENOMEM); - name = kmalloc(AFS_MAXCELLNAME + 1, GFP_KERNEL); - if (!name) - goto out_p; - - down_read(&net->cells_lock); - cell = net->ws_cell; - if (cell) { - len = cell->name_len; - memcpy(name, cell->name, len + 1); - } - up_read(&net->cells_lock); - - ret = ERR_PTR(-ENOENT); - if (!cell) - goto out_n; - - ret = lookup_one_len(name, dentry->d_parent, len); - - /* We don't want to d_add() the @cell dentry here as we don't want to - * the cached dentry to hide changes to the local cell name. - */ - -out_n: - kfree(name); -out_p: - return ret; -} - -/* * Look up an entry in a dynroot directory. */ static struct dentry *afs_dynroot_lookup(struct inode *dir, struct dentry *dentry, @@ -223,8 +141,6 @@ static struct dentry *afs_dynroot_lookup(struct inode *dir, struct dentry *dentr { _enter("%pd", dentry); - ASSERTCMP(d_inode(dentry), ==, NULL); - if (flags & LOOKUP_CREATE) return ERR_PTR(-EOPNOTSUPP); @@ -235,160 +151,256 @@ static struct dentry *afs_dynroot_lookup(struct inode *dir, struct dentry *dentr if (dentry->d_name.len == 5 && memcmp(dentry->d_name.name, "@cell", 5) == 0) - return afs_lookup_atcell(dentry); + return afs_lookup_atcell(dir, dentry, 2); + + if (dentry->d_name.len == 6 && + memcmp(dentry->d_name.name, ".@cell", 6) == 0) + return afs_lookup_atcell(dir, dentry, 3); - return d_splice_alias(afs_try_auto_mntpt(dentry, dir), dentry); + return afs_dynroot_lookup_cell(dir, dentry, flags); } const struct inode_operations afs_dynroot_inode_operations = { .lookup = afs_dynroot_lookup, }; -/* - * Dirs in the dynamic root don't need revalidation. - */ -static int afs_dynroot_d_revalidate(struct dentry *dentry, unsigned int flags) +static void afs_dynroot_d_release(struct dentry *dentry) { - return 1; + struct afs_cell *cell = dentry->d_fsdata; + + afs_unuse_cell(cell, afs_cell_trace_unuse_dynroot_mntpt); } /* - * Allow the VFS to enquire as to whether a dentry should be unhashed (mustn't - * sleep) - * - called from dput() when d_count is going to 0. - * - return 1 to request dentry be unhashed, 0 otherwise + * Keep @cell symlink dentries around, but only keep cell autodirs when they're + * being used. */ -static int afs_dynroot_d_delete(const struct dentry *dentry) +static int afs_dynroot_delete_dentry(const struct dentry *dentry) { - return d_really_is_positive(dentry); + const struct qstr *name = &dentry->d_name; + + if (name->len == 5 && memcmp(name->name, "@cell", 5) == 0) + return 0; + if (name->len == 6 && memcmp(name->name, ".@cell", 6) == 0) + return 0; + return 1; } const struct dentry_operations afs_dynroot_dentry_operations = { - .d_revalidate = afs_dynroot_d_revalidate, - .d_delete = afs_dynroot_d_delete, - .d_release = afs_d_release, + .d_delete = afs_dynroot_delete_dentry, + .d_release = afs_dynroot_d_release, .d_automount = afs_d_automount, }; +static void afs_atcell_delayed_put_cell(void *arg) +{ + struct afs_cell *cell = arg; + + afs_put_cell(cell, afs_cell_trace_put_atcell); +} + /* - * Create a manually added cell mount directory. - * - The caller must hold net->proc_cells_lock + * Read @cell or .@cell symlinks. */ -int afs_dynroot_mkdir(struct afs_net *net, struct afs_cell *cell) +static const char *afs_atcell_get_link(struct dentry *dentry, struct inode *inode, + struct delayed_call *done) { - struct super_block *sb = net->dynroot_sb; - struct dentry *root, *subdir; - int ret; + struct afs_vnode *vnode = AFS_FS_I(inode); + struct afs_cell *cell; + struct afs_net *net = afs_i2net(inode); + const char *name; + bool dotted = vnode->fid.vnode == 3; - if (!sb || atomic_read(&sb->s_active) == 0) - return 0; + if (!rcu_access_pointer(net->ws_cell)) + return ERR_PTR(-ENOENT); - /* Let the ->lookup op do the creation */ - root = sb->s_root; - inode_lock(root->d_inode); - subdir = lookup_one_len(cell->name, root, cell->name_len); - if (IS_ERR(subdir)) { - ret = PTR_ERR(subdir); - goto unlock; + if (!dentry) { + /* We're in RCU-pathwalk. */ + cell = rcu_dereference(net->ws_cell); + if (dotted) + name = cell->name - 1; + else + name = cell->name; + /* Shouldn't need to set a delayed call. */ + return name; } - /* Note that we're retaining an extra ref on the dentry */ - subdir->d_fsdata = (void *)1UL; - ret = 0; -unlock: - inode_unlock(root->d_inode); - return ret; + down_read(&net->cells_lock); + + cell = rcu_dereference_protected(net->ws_cell, lockdep_is_held(&net->cells_lock)); + if (dotted) + name = cell->name - 1; + else + name = cell->name; + afs_get_cell(cell, afs_cell_trace_get_atcell); + set_delayed_call(done, afs_atcell_delayed_put_cell, cell); + + up_read(&net->cells_lock); + return name; } +static const struct inode_operations afs_atcell_inode_operations = { + .get_link = afs_atcell_get_link, +}; + /* - * Remove a manually added cell mount directory. - * - The caller must hold net->proc_cells_lock + * Create an inode for the @cell or .@cell symlinks. */ -void afs_dynroot_rmdir(struct afs_net *net, struct afs_cell *cell) +static struct dentry *afs_lookup_atcell(struct inode *dir, struct dentry *dentry, ino_t ino) { - struct super_block *sb = net->dynroot_sb; - struct dentry *root, *subdir; + struct afs_vnode *vnode; + struct inode *inode; + struct afs_fid fid = { .vnode = ino, .unique = 1, }; - if (!sb || atomic_read(&sb->s_active) == 0) - return; + inode = iget5_locked(dir->i_sb, fid.vnode, + afs_iget5_pseudo_test, afs_iget5_pseudo_set, &fid); + if (!inode) + return ERR_PTR(-ENOMEM); - root = sb->s_root; - inode_lock(root->d_inode); + vnode = AFS_FS_I(inode); - /* Don't want to trigger a lookup call, which will re-add the cell */ - subdir = try_lookup_one_len(cell->name, root, cell->name_len); - if (IS_ERR_OR_NULL(subdir)) { - _debug("lookup %ld", PTR_ERR(subdir)); - goto no_dentry; + if (inode_state_read_once(inode) & I_NEW) { + netfs_inode_init(&vnode->netfs, NULL, false); + simple_inode_init_ts(inode); + set_nlink(inode, 1); + inode->i_size = 0; + inode->i_mode = S_IFLNK | 0555; + inode->i_op = &afs_atcell_inode_operations; + inode->i_uid = GLOBAL_ROOT_UID; + inode->i_gid = GLOBAL_ROOT_GID; + inode->i_blocks = 0; + inode->i_generation = 0; + inode->i_flags |= S_NOATIME; + + unlock_new_inode(inode); } + return d_splice_alias(inode, dentry); +} + +/* + * Transcribe the cell database into readdir content under the RCU read lock. + * Each cell produces two entries, one prefixed with a dot and one not. + */ +static int afs_dynroot_readdir_cells(struct afs_net *net, struct dir_context *ctx) +{ + const struct afs_cell *cell; + loff_t newpos; + + _enter("%llu", ctx->pos); + + for (;;) { + unsigned int ix = ctx->pos >> 1; + + cell = idr_get_next(&net->cells_dyn_ino, &ix); + if (!cell) + return 0; + if (READ_ONCE(cell->state) == AFS_CELL_REMOVING || + READ_ONCE(cell->state) == AFS_CELL_DEAD) { + ctx->pos += 2; + ctx->pos &= ~1; + continue; + } - _debug("rmdir %pd %u", subdir, d_count(subdir)); + newpos = ix << 1; + if (newpos > ctx->pos) + ctx->pos = newpos; - if (subdir->d_fsdata) { - _debug("unpin %u", d_count(subdir)); - subdir->d_fsdata = NULL; - dput(subdir); + _debug("pos %llu -> cell %u", ctx->pos, cell->dynroot_ino); + + if ((ctx->pos & 1) == 0) { + if (!dir_emit(ctx, cell->name, cell->name_len, + cell->dynroot_ino, DT_DIR)) + return 0; + ctx->pos++; + } + if ((ctx->pos & 1) == 1) { + if (!dir_emit(ctx, cell->name - 1, cell->name_len + 1, + cell->dynroot_ino + 1, DT_DIR)) + return 0; + ctx->pos++; + } } - dput(subdir); -no_dentry: - inode_unlock(root->d_inode); - _leave(""); + return 0; } /* - * Populate a newly created dynamic root with cell names. + * Read the AFS dynamic root directory. This produces a list of cellnames, + * dotted and undotted, along with @cell and .@cell links if configured. */ -int afs_dynroot_populate(struct super_block *sb) +static int afs_dynroot_readdir(struct file *file, struct dir_context *ctx) { - struct afs_cell *cell; - struct afs_net *net = afs_sb2net(sb); - int ret; + struct afs_net *net = afs_d2net(file->f_path.dentry); + int ret = 0; - mutex_lock(&net->proc_cells_lock); + if (!dir_emit_dots(file, ctx)) + return 0; - net->dynroot_sb = sb; - hlist_for_each_entry(cell, &net->proc_cells, proc_link) { - ret = afs_dynroot_mkdir(net, cell); - if (ret < 0) - goto error; + if (ctx->pos == 2) { + if (rcu_access_pointer(net->ws_cell) && + !dir_emit(ctx, "@cell", 5, 2, DT_LNK)) + return 0; + ctx->pos = 3; + } + if (ctx->pos == 3) { + if (rcu_access_pointer(net->ws_cell) && + !dir_emit(ctx, ".@cell", 6, 3, DT_LNK)) + return 0; + ctx->pos = 4; } - ret = 0; -out: - mutex_unlock(&net->proc_cells_lock); + if ((unsigned long long)ctx->pos <= AFS_MAX_DYNROOT_CELL_INO) { + down_read(&net->cells_lock); + ret = afs_dynroot_readdir_cells(net, ctx); + up_read(&net->cells_lock); + } return ret; - -error: - net->dynroot_sb = NULL; - goto out; } +static const struct file_operations afs_dynroot_file_operations = { + .llseek = generic_file_llseek, + .read = generic_read_dir, + .iterate_shared = afs_dynroot_readdir, + .fsync = noop_fsync, +}; + /* - * When a dynamic root that's in the process of being destroyed, depopulate it - * of pinned directories. + * Create an inode for a dynamic root directory. */ -void afs_dynroot_depopulate(struct super_block *sb) +struct inode *afs_dynroot_iget_root(struct super_block *sb) { - struct afs_net *net = afs_sb2net(sb); - struct dentry *root = sb->s_root, *subdir, *tmp; - - /* Prevent more subdirs from being created */ - mutex_lock(&net->proc_cells_lock); - if (net->dynroot_sb == sb) - net->dynroot_sb = NULL; - mutex_unlock(&net->proc_cells_lock); - - if (root) { - inode_lock(root->d_inode); - - /* Remove all the pins for dirs created for manually added cells */ - list_for_each_entry_safe(subdir, tmp, &root->d_subdirs, d_child) { - if (subdir->d_fsdata) { - subdir->d_fsdata = NULL; - dput(subdir); - } - } + struct afs_super_info *as = AFS_FS_S(sb); + struct afs_vnode *vnode; + struct inode *inode; + struct afs_fid fid = { .vid = 0, .vnode = 1, .unique = 1,}; + + if (as->volume) + fid.vid = as->volume->vid; + + inode = iget5_locked(sb, fid.vnode, + afs_iget5_pseudo_test, afs_iget5_pseudo_set, &fid); + if (!inode) + return ERR_PTR(-ENOMEM); + + vnode = AFS_FS_I(inode); - inode_unlock(root->d_inode); + /* there shouldn't be an existing inode */ + if (inode_state_read_once(inode) & I_NEW) { + netfs_inode_init(&vnode->netfs, NULL, false); + simple_inode_init_ts(inode); + set_nlink(inode, 2); + inode->i_size = 0; + inode->i_mode = S_IFDIR | 0555; + inode->i_op = &afs_dynroot_inode_operations; + inode->i_fop = &afs_dynroot_file_operations; + inode->i_uid = GLOBAL_ROOT_UID; + inode->i_gid = GLOBAL_ROOT_GID; + inode->i_blocks = 0; + inode->i_generation = 0; + inode->i_flags |= S_NOATIME; + + set_bit(AFS_VNODE_PSEUDODIR, &vnode->flags); + unlock_new_inode(inode); } + _leave(" = %p", inode); + return inode; } diff --git a/fs/afs/file.c b/fs/afs/file.c index 68d6d5dc608d..f66a92294284 100644 --- a/fs/afs/file.c +++ b/fs/afs/file.c @@ -16,15 +16,15 @@ #include <linux/mm.h> #include <linux/swap.h> #include <linux/netfs.h> +#include <trace/events/netfs.h> #include "internal.h" -static int afs_file_mmap(struct file *file, struct vm_area_struct *vma); -static int afs_symlink_read_folio(struct file *file, struct folio *folio); -static void afs_invalidate_folio(struct folio *folio, size_t offset, - size_t length); -static bool afs_release_folio(struct folio *folio, gfp_t gfp_flags); +static int afs_file_mmap_prepare(struct vm_area_desc *desc); static ssize_t afs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter); +static ssize_t afs_file_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, + size_t len, unsigned int flags); static void afs_vm_open(struct vm_area_struct *area); static void afs_vm_close(struct vm_area_struct *area); static vm_fault_t afs_vm_map_pages(struct vm_fault *vmf, pgoff_t start_pgoff, pgoff_t end_pgoff); @@ -34,9 +34,9 @@ const struct file_operations afs_file_operations = { .release = afs_release, .llseek = generic_file_llseek, .read_iter = afs_file_read_iter, - .write_iter = afs_file_write, - .mmap = afs_file_mmap, - .splice_read = generic_file_splice_read, + .write_iter = netfs_file_write_iter, + .mmap_prepare = afs_file_mmap_prepare, + .splice_read = afs_file_splice_read, .splice_write = iter_file_splice_write, .fsync = afs_fsync, .lock = afs_lock, @@ -50,23 +50,14 @@ const struct inode_operations afs_file_inode_operations = { }; const struct address_space_operations afs_file_aops = { + .direct_IO = noop_direct_IO, .read_folio = netfs_read_folio, .readahead = netfs_readahead, - .dirty_folio = afs_dirty_folio, - .launder_folio = afs_launder_folio, - .release_folio = afs_release_folio, - .invalidate_folio = afs_invalidate_folio, - .write_begin = afs_write_begin, - .write_end = afs_write_end, - .writepages = afs_writepages, - .migrate_folio = filemap_migrate_folio, -}; - -const struct address_space_operations afs_symlink_aops = { - .read_folio = afs_symlink_read_folio, - .release_folio = afs_release_folio, - .invalidate_folio = afs_invalidate_folio, + .dirty_folio = netfs_dirty_folio, + .release_folio = netfs_release_folio, + .invalidate_folio = netfs_invalidate_folio, .migrate_folio = filemap_migrate_folio, + .writepages = afs_writepages, }; static const struct vm_operations_struct afs_vm_ops = { @@ -209,50 +200,12 @@ int afs_release(struct inode *inode, struct file *file) return ret; } -/* - * Allocate a new read record. - */ -struct afs_read *afs_alloc_read(gfp_t gfp) -{ - struct afs_read *req; - - req = kzalloc(sizeof(struct afs_read), gfp); - if (req) - refcount_set(&req->usage, 1); - - return req; -} - -/* - * Dispose of a ref to a read record. - */ -void afs_put_read(struct afs_read *req) -{ - if (refcount_dec_and_test(&req->usage)) { - if (req->cleanup) - req->cleanup(req); - key_put(req->key); - kfree(req); - } -} - static void afs_fetch_data_notify(struct afs_operation *op) { - struct afs_read *req = op->fetch.req; - struct netfs_io_subrequest *subreq = req->subreq; - int error = op->error; - - if (error == -ECONNABORTED) - error = afs_abort_to_error(op->ac.abort_code); - req->error = error; - - if (subreq) { - __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags); - netfs_subreq_terminated(subreq, error ?: req->actual_len, false); - req->subreq = NULL; - } else if (req->done) { - req->done(req); - } + struct netfs_io_subrequest *subreq = op->fetch.subreq; + + subreq->error = afs_op_error(op); + netfs_read_subreq_terminated(subreq); } static void afs_fetch_data_success(struct afs_operation *op) @@ -262,117 +215,199 @@ static void afs_fetch_data_success(struct afs_operation *op) _enter("op=%08x", op->debug_id); afs_vnode_commit_status(op, &op->file[0]); afs_stat_v(vnode, n_fetches); - atomic_long_add(op->fetch.req->actual_len, &op->net->n_fetch_bytes); + atomic_long_add(op->fetch.subreq->transferred, &op->net->n_fetch_bytes); afs_fetch_data_notify(op); } -static void afs_fetch_data_put(struct afs_operation *op) +static void afs_fetch_data_aborted(struct afs_operation *op) { - op->fetch.req->error = op->error; - afs_put_read(op->fetch.req); + afs_check_for_remote_deletion(op); + afs_fetch_data_notify(op); } -static const struct afs_operation_ops afs_fetch_data_operation = { +const struct afs_operation_ops afs_fetch_data_operation = { .issue_afs_rpc = afs_fs_fetch_data, .issue_yfs_rpc = yfs_fs_fetch_data, .success = afs_fetch_data_success, - .aborted = afs_check_for_remote_deletion, + .aborted = afs_fetch_data_aborted, .failed = afs_fetch_data_notify, - .put = afs_fetch_data_put, }; +static void afs_issue_read_call(struct afs_operation *op) +{ + op->call_responded = false; + op->call_error = 0; + op->call_abort_code = 0; + if (test_bit(AFS_SERVER_FL_IS_YFS, &op->server->flags)) + yfs_fs_fetch_data(op); + else + afs_fs_fetch_data(op); +} + +static void afs_end_read(struct afs_operation *op) +{ + if (op->call_responded && op->server) + set_bit(AFS_SERVER_FL_RESPONDING, &op->server->flags); + + if (!afs_op_error(op)) + afs_fetch_data_success(op); + else if (op->cumul_error.aborted) + afs_fetch_data_aborted(op); + else + afs_fetch_data_notify(op); + + afs_end_vnode_operation(op); + afs_put_operation(op); +} + +/* + * Perform I/O processing on an asynchronous call. The work item carries a ref + * to the call struct that we either need to release or to pass on. + */ +static void afs_read_receive(struct afs_call *call) +{ + struct afs_operation *op = call->op; + enum afs_call_state state; + + _enter(""); + + state = READ_ONCE(call->state); + if (state == AFS_CALL_COMPLETE) + return; + trace_afs_read_recv(op, call); + + while (state < AFS_CALL_COMPLETE && READ_ONCE(call->need_attention)) { + WRITE_ONCE(call->need_attention, false); + afs_deliver_to_call(call); + state = READ_ONCE(call->state); + } + + if (state < AFS_CALL_COMPLETE) { + netfs_read_subreq_progress(op->fetch.subreq); + if (rxrpc_kernel_check_life(call->net->socket, call->rxcall)) + return; + /* rxrpc terminated the call. */ + afs_set_call_complete(call, call->error, call->abort_code); + } + + op->call_abort_code = call->abort_code; + op->call_error = call->error; + op->call_responded = call->responded; + op->call = NULL; + call->op = NULL; + afs_put_call(call); + + /* If the call failed, then we need to crank the server rotation + * handle and try the next. + */ + if (afs_select_fileserver(op)) { + afs_issue_read_call(op); + return; + } + + afs_end_read(op); +} + +void afs_fetch_data_async_rx(struct work_struct *work) +{ + struct afs_call *call = container_of(work, struct afs_call, async_work); + + afs_read_receive(call); + afs_put_call(call); +} + +void afs_fetch_data_immediate_cancel(struct afs_call *call) +{ + if (call->async) { + afs_get_call(call, afs_call_trace_wake); + if (!queue_work(afs_async_calls, &call->async_work)) + afs_deferred_put_call(call); + flush_work(&call->async_work); + } +} + /* * Fetch file data from the volume. */ -int afs_fetch_data(struct afs_vnode *vnode, struct afs_read *req) +static void afs_issue_read(struct netfs_io_subrequest *subreq) { struct afs_operation *op; + struct afs_vnode *vnode = AFS_FS_I(subreq->rreq->inode); + struct key *key = subreq->rreq->netfs_priv; _enter("%s{%llx:%llu.%u},%x,,,", vnode->volume->name, vnode->fid.vid, vnode->fid.vnode, vnode->fid.unique, - key_serial(req->key)); + key_serial(key)); - op = afs_alloc_operation(req->key, vnode->volume); + op = afs_alloc_operation(key, vnode->volume); if (IS_ERR(op)) { - if (req->subreq) - netfs_subreq_terminated(req->subreq, PTR_ERR(op), false); - return PTR_ERR(op); + subreq->error = PTR_ERR(op); + netfs_read_subreq_terminated(subreq); + return; } afs_op_set_vnode(op, 0, vnode); - op->fetch.req = afs_get_read(req); + op->fetch.subreq = subreq; op->ops = &afs_fetch_data_operation; - return afs_do_sync_operation(op); -} - -static void afs_issue_read(struct netfs_io_subrequest *subreq) -{ - struct afs_vnode *vnode = AFS_FS_I(subreq->rreq->inode); - struct afs_read *fsreq; - fsreq = afs_alloc_read(GFP_NOFS); - if (!fsreq) - return netfs_subreq_terminated(subreq, -ENOMEM, false); + trace_netfs_sreq(subreq, netfs_sreq_trace_submit); - fsreq->subreq = subreq; - fsreq->pos = subreq->start + subreq->transferred; - fsreq->len = subreq->len - subreq->transferred; - fsreq->key = key_get(subreq->rreq->netfs_priv); - fsreq->vnode = vnode; - fsreq->iter = &fsreq->def_iter; + if (subreq->rreq->origin == NETFS_READAHEAD || + subreq->rreq->iocb) { + op->flags |= AFS_OPERATION_ASYNC; - iov_iter_xarray(&fsreq->def_iter, ITER_DEST, - &fsreq->vnode->netfs.inode.i_mapping->i_pages, - fsreq->pos, fsreq->len); + if (!afs_begin_vnode_operation(op)) { + subreq->error = afs_put_operation(op); + netfs_read_subreq_terminated(subreq); + return; + } - afs_fetch_data(fsreq->vnode, fsreq); - afs_put_read(fsreq); -} + if (!afs_select_fileserver(op)) { + afs_end_read(op); + return; + } -static int afs_symlink_read_folio(struct file *file, struct folio *folio) -{ - struct afs_vnode *vnode = AFS_FS_I(folio->mapping->host); - struct afs_read *fsreq; - int ret; - - fsreq = afs_alloc_read(GFP_NOFS); - if (!fsreq) - return -ENOMEM; - - fsreq->pos = folio_pos(folio); - fsreq->len = folio_size(folio); - fsreq->vnode = vnode; - fsreq->iter = &fsreq->def_iter; - iov_iter_xarray(&fsreq->def_iter, ITER_DEST, &folio->mapping->i_pages, - fsreq->pos, fsreq->len); - - ret = afs_fetch_data(fsreq->vnode, fsreq); - if (ret == 0) - folio_mark_uptodate(folio); - folio_unlock(folio); - return ret; + afs_issue_read_call(op); + } else { + afs_do_sync_operation(op); + } } static int afs_init_request(struct netfs_io_request *rreq, struct file *file) { - rreq->netfs_priv = key_get(afs_file_key(file)); - return 0; -} - -static int afs_begin_cache_operation(struct netfs_io_request *rreq) -{ -#ifdef CONFIG_AFS_FSCACHE struct afs_vnode *vnode = AFS_FS_I(rreq->inode); - return fscache_begin_read_operation(&rreq->cache_resources, - afs_vnode_cache(vnode)); -#else - return -ENOBUFS; -#endif + if (file) + rreq->netfs_priv = key_get(afs_file_key(file)); + rreq->rsize = 256 * 1024; + rreq->wsize = 256 * 1024 * 1024; + + switch (rreq->origin) { + case NETFS_READ_SINGLE: + if (!file) { + struct key *key = afs_request_key(vnode->volume->cell); + + if (IS_ERR(key)) + return PTR_ERR(key); + rreq->netfs_priv = key; + } + break; + case NETFS_WRITEBACK: + case NETFS_WRITETHROUGH: + case NETFS_UNBUFFERED_WRITE: + case NETFS_DIO_WRITE: + if (S_ISREG(rreq->inode->i_mode)) + rreq->io_streams[0].avail = true; + break; + case NETFS_WRITEBACK_SINGLE: + default: + break; + } + return 0; } static int afs_check_write_begin(struct file *file, loff_t pos, unsigned len, @@ -386,171 +421,87 @@ static int afs_check_write_begin(struct file *file, loff_t pos, unsigned len, static void afs_free_request(struct netfs_io_request *rreq) { key_put(rreq->netfs_priv); + afs_put_wb_key(rreq->netfs_priv2); } -const struct netfs_request_ops afs_req_ops = { - .init_request = afs_init_request, - .free_request = afs_free_request, - .begin_cache_operation = afs_begin_cache_operation, - .check_write_begin = afs_check_write_begin, - .issue_read = afs_issue_read, -}; - -int afs_write_inode(struct inode *inode, struct writeback_control *wbc) -{ - fscache_unpin_writeback(wbc, afs_vnode_cache(AFS_FS_I(inode))); - return 0; -} - -/* - * Adjust the dirty region of the page on truncation or full invalidation, - * getting rid of the markers altogether if the region is entirely invalidated. - */ -static void afs_invalidate_dirty(struct folio *folio, size_t offset, - size_t length) +static void afs_update_i_size(struct inode *inode, loff_t new_i_size) { - struct afs_vnode *vnode = AFS_FS_I(folio_inode(folio)); - unsigned long priv; - unsigned int f, t, end = offset + length; - - priv = (unsigned long)folio_get_private(folio); - - /* we clean up only if the entire page is being invalidated */ - if (offset == 0 && length == folio_size(folio)) - goto full_invalidate; - - /* If the page was dirtied by page_mkwrite(), the PTE stays writable - * and we don't get another notification to tell us to expand it - * again. - */ - if (afs_is_folio_dirty_mmapped(priv)) - return; - - /* We may need to shorten the dirty region */ - f = afs_folio_dirty_from(folio, priv); - t = afs_folio_dirty_to(folio, priv); - - if (t <= offset || f >= end) - return; /* Doesn't overlap */ - - if (f < offset && t > end) - return; /* Splits the dirty region - just absorb it */ - - if (f >= offset && t <= end) - goto undirty; + struct afs_vnode *vnode = AFS_FS_I(inode); + loff_t i_size; - if (f < offset) - t = offset; - else - f = end; - if (f == t) - goto undirty; - - priv = afs_folio_dirty(folio, f, t); - folio_change_private(folio, (void *)priv); - trace_afs_folio_dirty(vnode, tracepoint_string("trunc"), folio); - return; - -undirty: - trace_afs_folio_dirty(vnode, tracepoint_string("undirty"), folio); - folio_clear_dirty_for_io(folio); -full_invalidate: - trace_afs_folio_dirty(vnode, tracepoint_string("inval"), folio); - folio_detach_private(folio); + write_seqlock(&vnode->cb_lock); + i_size = i_size_read(&vnode->netfs.inode); + if (new_i_size > i_size) { + i_size_write(&vnode->netfs.inode, new_i_size); + inode_set_bytes(&vnode->netfs.inode, new_i_size); + } + write_sequnlock(&vnode->cb_lock); + fscache_update_cookie(afs_vnode_cache(vnode), NULL, &new_i_size); } -/* - * invalidate part or all of a page - * - release a page and clean up its private data if offset is 0 (indicating - * the entire page) - */ -static void afs_invalidate_folio(struct folio *folio, size_t offset, - size_t length) +static void afs_netfs_invalidate_cache(struct netfs_io_request *wreq) { - _enter("{%lu},%zu,%zu", folio->index, offset, length); + struct afs_vnode *vnode = AFS_FS_I(wreq->inode); - BUG_ON(!folio_test_locked(folio)); - - if (folio_get_private(folio)) - afs_invalidate_dirty(folio, offset, length); - - folio_wait_fscache(folio); - _leave(""); + afs_invalidate_cache(vnode, 0); } -/* - * release a page and clean up its private state if it's not busy - * - return true if the page can now be released, false if not - */ -static bool afs_release_folio(struct folio *folio, gfp_t gfp) -{ - struct afs_vnode *vnode = AFS_FS_I(folio_inode(folio)); - - _enter("{{%llx:%llu}[%lu],%lx},%x", - vnode->fid.vid, vnode->fid.vnode, folio_index(folio), folio->flags, - gfp); - - /* deny if folio is being written to the cache and the caller hasn't - * elected to wait */ -#ifdef CONFIG_AFS_FSCACHE - if (folio_test_fscache(folio)) { - if (current_is_kswapd() || !(gfp & __GFP_FS)) - return false; - folio_wait_fscache(folio); - } - fscache_note_page_release(afs_vnode_cache(vnode)); -#endif - - if (folio_test_private(folio)) { - trace_afs_folio_dirty(vnode, tracepoint_string("rel"), folio); - folio_detach_private(folio); - } - - /* Indicate that the folio can be released */ - _leave(" = T"); - return true; -} +const struct netfs_request_ops afs_req_ops = { + .init_request = afs_init_request, + .free_request = afs_free_request, + .check_write_begin = afs_check_write_begin, + .issue_read = afs_issue_read, + .update_i_size = afs_update_i_size, + .invalidate_cache = afs_netfs_invalidate_cache, + .begin_writeback = afs_begin_writeback, + .prepare_write = afs_prepare_write, + .issue_write = afs_issue_write, + .retry_request = afs_retry_request, +}; static void afs_add_open_mmap(struct afs_vnode *vnode) { if (atomic_inc_return(&vnode->cb_nr_mmap) == 1) { - down_write(&vnode->volume->cell->fs_open_mmaps_lock); + down_write(&vnode->volume->open_mmaps_lock); if (list_empty(&vnode->cb_mmap_link)) - list_add_tail(&vnode->cb_mmap_link, - &vnode->volume->cell->fs_open_mmaps); + list_add_tail(&vnode->cb_mmap_link, &vnode->volume->open_mmaps); - up_write(&vnode->volume->cell->fs_open_mmaps_lock); + up_write(&vnode->volume->open_mmaps_lock); } } static void afs_drop_open_mmap(struct afs_vnode *vnode) { - if (!atomic_dec_and_test(&vnode->cb_nr_mmap)) + if (atomic_add_unless(&vnode->cb_nr_mmap, -1, 1)) return; - down_write(&vnode->volume->cell->fs_open_mmaps_lock); + down_write(&vnode->volume->open_mmaps_lock); - if (atomic_read(&vnode->cb_nr_mmap) == 0) + read_seqlock_excl(&vnode->cb_lock); + // the only place where ->cb_nr_mmap may hit 0 + // see __afs_break_callback() for the other side... + if (atomic_dec_and_test(&vnode->cb_nr_mmap)) list_del_init(&vnode->cb_mmap_link); + read_sequnlock_excl(&vnode->cb_lock); - up_write(&vnode->volume->cell->fs_open_mmaps_lock); + up_write(&vnode->volume->open_mmaps_lock); flush_work(&vnode->cb_work); } /* * Handle setting up a memory mapping on an AFS file. */ -static int afs_file_mmap(struct file *file, struct vm_area_struct *vma) +static int afs_file_mmap_prepare(struct vm_area_desc *desc) { - struct afs_vnode *vnode = AFS_FS_I(file_inode(file)); + struct afs_vnode *vnode = AFS_FS_I(file_inode(desc->file)); int ret; afs_add_open_mmap(vnode); - ret = generic_file_mmap(file, vma); + ret = generic_file_mmap_prepare(desc); if (ret == 0) - vma->vm_ops = &afs_vm_ops; + desc->vm_ops = &afs_vm_ops; else afs_drop_open_mmap(vnode); return ret; @@ -569,31 +520,47 @@ static void afs_vm_close(struct vm_area_struct *vma) static vm_fault_t afs_vm_map_pages(struct vm_fault *vmf, pgoff_t start_pgoff, pgoff_t end_pgoff) { struct afs_vnode *vnode = AFS_FS_I(file_inode(vmf->vma->vm_file)); - struct afs_file *af = vmf->vma->vm_file->private_data; - switch (afs_validate(vnode, af->key)) { - case 0: + if (afs_check_validity(vnode)) return filemap_map_pages(vmf, start_pgoff, end_pgoff); - case -ENOMEM: - return VM_FAULT_OOM; - case -EINTR: - case -ERESTARTSYS: - return VM_FAULT_RETRY; - case -ESTALE: - default: - return VM_FAULT_SIGBUS; - } + return 0; } static ssize_t afs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) { - struct afs_vnode *vnode = AFS_FS_I(file_inode(iocb->ki_filp)); + struct inode *inode = file_inode(iocb->ki_filp); + struct afs_vnode *vnode = AFS_FS_I(inode); struct afs_file *af = iocb->ki_filp->private_data; - int ret; + ssize_t ret; - ret = afs_validate(vnode, af->key); + if (iocb->ki_flags & IOCB_DIRECT) + return netfs_unbuffered_read_iter(iocb, iter); + + ret = netfs_start_io_read(inode); if (ret < 0) return ret; + ret = afs_validate(vnode, af->key); + if (ret == 0) + ret = filemap_read(iocb, iter, 0); + netfs_end_io_read(inode); + return ret; +} - return generic_file_read_iter(iocb, iter); +static ssize_t afs_file_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, + size_t len, unsigned int flags) +{ + struct inode *inode = file_inode(in); + struct afs_vnode *vnode = AFS_FS_I(inode); + struct afs_file *af = in->private_data; + ssize_t ret; + + ret = netfs_start_io_read(inode); + if (ret < 0) + return ret; + ret = afs_validate(vnode, af->key); + if (ret == 0) + ret = filemap_splice_read(in, ppos, pipe, len, flags); + netfs_end_io_read(inode); + return ret; } diff --git a/fs/afs/flock.c b/fs/afs/flock.c index bbcc5afd1576..f0e96a35093f 100644 --- a/fs/afs/flock.c +++ b/fs/afs/flock.c @@ -93,13 +93,13 @@ static void afs_grant_locks(struct afs_vnode *vnode) bool exclusive = (vnode->lock_type == AFS_LOCK_WRITE); list_for_each_entry_safe(p, _p, &vnode->pending_locks, fl_u.afs.link) { - if (!exclusive && p->fl_type == F_WRLCK) + if (!exclusive && lock_is_write(p)) continue; list_move_tail(&p->fl_u.afs.link, &vnode->granted_locks); p->fl_u.afs.state = AFS_LOCK_GRANTED; trace_afs_flock_op(vnode, p, afs_flock_op_grant); - wake_up(&p->fl_wait); + locks_wake_up(p); } } @@ -112,25 +112,24 @@ static void afs_next_locker(struct afs_vnode *vnode, int error) { struct file_lock *p, *_p, *next = NULL; struct key *key = vnode->lock_key; - unsigned int fl_type = F_RDLCK; + unsigned int type = F_RDLCK; _enter(""); if (vnode->lock_type == AFS_LOCK_WRITE) - fl_type = F_WRLCK; + type = F_WRLCK; list_for_each_entry_safe(p, _p, &vnode->pending_locks, fl_u.afs.link) { if (error && - p->fl_type == fl_type && - afs_file_key(p->fl_file) == key) { + p->c.flc_type == type && + afs_file_key(p->c.flc_file) == key) { list_del_init(&p->fl_u.afs.link); p->fl_u.afs.state = error; - wake_up(&p->fl_wait); + locks_wake_up(p); } /* Select the next locker to hand off to. */ - if (next && - (next->fl_type == F_WRLCK || p->fl_type == F_RDLCK)) + if (next && (lock_is_write(next) || lock_is_read(p))) continue; next = p; } @@ -142,7 +141,7 @@ static void afs_next_locker(struct afs_vnode *vnode, int error) afs_set_lock_state(vnode, AFS_VNODE_LOCK_SETTING); next->fl_u.afs.state = AFS_LOCK_YOUR_TRY; trace_afs_flock_op(vnode, next, afs_flock_op_wake); - wake_up(&next->fl_wait); + locks_wake_up(next); } else { afs_set_lock_state(vnode, AFS_VNODE_LOCK_NONE); trace_afs_flock_ev(vnode, NULL, afs_flock_no_lockers, 0); @@ -166,7 +165,7 @@ static void afs_kill_lockers_enoent(struct afs_vnode *vnode) struct file_lock, fl_u.afs.link); list_del_init(&p->fl_u.afs.link); p->fl_u.afs.state = -ENOENT; - wake_up(&p->fl_wait); + locks_wake_up(p); } key_put(vnode->lock_key); @@ -451,7 +450,7 @@ static int afs_do_setlk_check(struct afs_vnode *vnode, struct key *key, */ static int afs_do_setlk(struct file *file, struct file_lock *fl) { - struct inode *inode = locks_inode(file); + struct inode *inode = file_inode(file); struct afs_vnode *vnode = AFS_FS_I(inode); enum afs_flock_mode mode = AFS_FS_S(inode->i_sb)->flock_mode; afs_lock_type_t type; @@ -464,14 +463,14 @@ static int afs_do_setlk(struct file *file, struct file_lock *fl) _enter("{%llx:%llu},%llu-%llu,%u,%u", vnode->fid.vid, vnode->fid.vnode, - fl->fl_start, fl->fl_end, fl->fl_type, mode); + fl->fl_start, fl->fl_end, fl->c.flc_type, mode); fl->fl_ops = &afs_lock_ops; INIT_LIST_HEAD(&fl->fl_u.afs.link); fl->fl_u.afs.state = AFS_LOCK_PENDING; partial = (fl->fl_start != 0 || fl->fl_end != OFFSET_MAX); - type = (fl->fl_type == F_RDLCK) ? AFS_LOCK_READ : AFS_LOCK_WRITE; + type = lock_is_read(fl) ? AFS_LOCK_READ : AFS_LOCK_WRITE; if (mode == afs_flock_mode_write && partial) type = AFS_LOCK_WRITE; @@ -524,7 +523,7 @@ static int afs_do_setlk(struct file *file, struct file_lock *fl) } if (vnode->lock_state == AFS_VNODE_LOCK_NONE && - !(fl->fl_flags & FL_SLEEP)) { + !(fl->c.flc_flags & FL_SLEEP)) { ret = -EAGAIN; if (type == AFS_LOCK_READ) { if (vnode->status.lock_count == -1) @@ -621,7 +620,7 @@ skip_server_lock: return 0; lock_is_contended: - if (!(fl->fl_flags & FL_SLEEP)) { + if (!(fl->c.flc_flags & FL_SLEEP)) { list_del_init(&fl->fl_u.afs.link); afs_next_locker(vnode, 0); ret = -EAGAIN; @@ -641,7 +640,7 @@ need_to_wait: spin_unlock(&vnode->lock); trace_afs_flock_ev(vnode, fl, afs_flock_waiting, 0); - ret = wait_event_interruptible(fl->fl_wait, + ret = wait_event_interruptible(fl->c.flc_wait, fl->fl_u.afs.state != AFS_LOCK_PENDING); trace_afs_flock_ev(vnode, fl, afs_flock_waited, ret); @@ -701,10 +700,11 @@ error: */ static int afs_do_unlk(struct file *file, struct file_lock *fl) { - struct afs_vnode *vnode = AFS_FS_I(locks_inode(file)); + struct afs_vnode *vnode = AFS_FS_I(file_inode(file)); int ret; - _enter("{%llx:%llu},%u", vnode->fid.vid, vnode->fid.vnode, fl->fl_type); + _enter("{%llx:%llu},%u", vnode->fid.vid, vnode->fid.vnode, + fl->c.flc_type); trace_afs_flock_op(vnode, fl, afs_flock_op_unlock); @@ -721,7 +721,7 @@ static int afs_do_unlk(struct file *file, struct file_lock *fl) */ static int afs_do_getlk(struct file *file, struct file_lock *fl) { - struct afs_vnode *vnode = AFS_FS_I(locks_inode(file)); + struct afs_vnode *vnode = AFS_FS_I(file_inode(file)); struct key *key = afs_file_key(file); int ret, lock_count; @@ -730,11 +730,11 @@ static int afs_do_getlk(struct file *file, struct file_lock *fl) if (vnode->lock_state == AFS_VNODE_LOCK_DELETED) return -ENOENT; - fl->fl_type = F_UNLCK; + fl->c.flc_type = F_UNLCK; /* check local lock records first */ posix_test_lock(file, fl); - if (fl->fl_type == F_UNLCK) { + if (lock_is_unlock(fl)) { /* no local locks; consult the server */ ret = afs_fetch_status(vnode, key, false, NULL); if (ret < 0) @@ -743,18 +743,18 @@ static int afs_do_getlk(struct file *file, struct file_lock *fl) lock_count = READ_ONCE(vnode->status.lock_count); if (lock_count != 0) { if (lock_count > 0) - fl->fl_type = F_RDLCK; + fl->c.flc_type = F_RDLCK; else - fl->fl_type = F_WRLCK; + fl->c.flc_type = F_WRLCK; fl->fl_start = 0; fl->fl_end = OFFSET_MAX; - fl->fl_pid = 0; + fl->c.flc_pid = 0; } } ret = 0; error: - _leave(" = %d [%hd]", ret, fl->fl_type); + _leave(" = %d [%hd]", ret, fl->c.flc_type); return ret; } @@ -763,13 +763,13 @@ error: */ int afs_lock(struct file *file, int cmd, struct file_lock *fl) { - struct afs_vnode *vnode = AFS_FS_I(locks_inode(file)); + struct afs_vnode *vnode = AFS_FS_I(file_inode(file)); enum afs_flock_operation op; int ret; _enter("{%llx:%llu},%d,{t=%x,fl=%x,r=%Ld:%Ld}", vnode->fid.vid, vnode->fid.vnode, cmd, - fl->fl_type, fl->fl_flags, + fl->c.flc_type, fl->c.flc_flags, (long long) fl->fl_start, (long long) fl->fl_end); if (IS_GETLK(cmd)) @@ -778,7 +778,7 @@ int afs_lock(struct file *file, int cmd, struct file_lock *fl) fl->fl_u.afs.debug_id = atomic_inc_return(&afs_file_lock_debug_id); trace_afs_flock_op(vnode, fl, afs_flock_op_lock); - if (fl->fl_type == F_UNLCK) + if (lock_is_unlock(fl)) ret = afs_do_unlk(file, fl); else ret = afs_do_setlk(file, fl); @@ -798,13 +798,13 @@ int afs_lock(struct file *file, int cmd, struct file_lock *fl) */ int afs_flock(struct file *file, int cmd, struct file_lock *fl) { - struct afs_vnode *vnode = AFS_FS_I(locks_inode(file)); + struct afs_vnode *vnode = AFS_FS_I(file_inode(file)); enum afs_flock_operation op; int ret; _enter("{%llx:%llu},%d,{t=%x,fl=%x}", vnode->fid.vid, vnode->fid.vnode, cmd, - fl->fl_type, fl->fl_flags); + fl->c.flc_type, fl->c.flc_flags); /* * No BSD flocks over NFS allowed. @@ -813,14 +813,14 @@ int afs_flock(struct file *file, int cmd, struct file_lock *fl) * Not sure whether that would be unique, though, or whether * that would break in other places. */ - if (!(fl->fl_flags & FL_FLOCK)) + if (!(fl->c.flc_flags & FL_FLOCK)) return -ENOLCK; fl->fl_u.afs.debug_id = atomic_inc_return(&afs_file_lock_debug_id); trace_afs_flock_op(vnode, fl, afs_flock_op_flock); /* we're simulating flock() locks using posix locks on the server */ - if (fl->fl_type == F_UNLCK) + if (lock_is_unlock(fl)) ret = afs_do_unlk(file, fl); else ret = afs_do_setlk(file, fl); @@ -843,7 +843,7 @@ int afs_flock(struct file *file, int cmd, struct file_lock *fl) */ static void afs_fl_copy_lock(struct file_lock *new, struct file_lock *fl) { - struct afs_vnode *vnode = AFS_FS_I(locks_inode(fl->fl_file)); + struct afs_vnode *vnode = AFS_FS_I(file_inode(fl->c.flc_file)); _enter(""); @@ -861,7 +861,7 @@ static void afs_fl_copy_lock(struct file_lock *new, struct file_lock *fl) */ static void afs_fl_release_private(struct file_lock *fl) { - struct afs_vnode *vnode = AFS_FS_I(locks_inode(fl->fl_file)); + struct afs_vnode *vnode = AFS_FS_I(file_inode(fl->c.flc_file)); _enter(""); diff --git a/fs/afs/fs_operation.c b/fs/afs/fs_operation.c index 7a3803ce3a22..8418813ee043 100644 --- a/fs/afs/fs_operation.c +++ b/fs/afs/fs_operation.c @@ -35,18 +35,119 @@ struct afs_operation *afs_alloc_operation(struct key *key, struct afs_volume *vo key_get(key); } - op->key = key; - op->volume = afs_get_volume(volume, afs_volume_trace_get_new_op); - op->net = volume->cell->net; - op->cb_v_break = volume->cb_v_break; - op->debug_id = atomic_inc_return(&afs_operation_debug_counter); - op->error = -EDESTADDRREQ; - op->ac.error = SHRT_MAX; + op->key = key; + op->volume = afs_get_volume(volume, afs_volume_trace_get_new_op); + op->net = volume->cell->net; + op->cb_v_break = atomic_read(&volume->cb_v_break); + op->pre_volsync.creation = volume->creation_time; + op->pre_volsync.update = volume->update_time; + op->debug_id = atomic_inc_return(&afs_operation_debug_counter); + op->nr_iterations = -1; + afs_op_set_error(op, -EDESTADDRREQ); _leave(" = [op=%08x]", op->debug_id); return op; } +struct afs_io_locker { + struct list_head link; + struct task_struct *task; + unsigned long have_lock; +}; + +/* + * Unlock the I/O lock on a vnode. + */ +static void afs_unlock_for_io(struct afs_vnode *vnode) +{ + struct afs_io_locker *locker; + + spin_lock(&vnode->lock); + locker = list_first_entry_or_null(&vnode->io_lock_waiters, + struct afs_io_locker, link); + if (locker) { + list_del(&locker->link); + smp_store_release(&locker->have_lock, 1); /* The unlock barrier. */ + smp_mb__after_atomic(); /* Store have_lock before task state */ + wake_up_process(locker->task); + } else { + clear_bit(AFS_VNODE_IO_LOCK, &vnode->flags); + } + spin_unlock(&vnode->lock); +} + +/* + * Lock the I/O lock on a vnode uninterruptibly. We can't use an ordinary + * mutex as lockdep will complain if we unlock it in the wrong thread. + */ +static void afs_lock_for_io(struct afs_vnode *vnode) +{ + struct afs_io_locker myself = { .task = current, }; + + spin_lock(&vnode->lock); + + if (!test_and_set_bit(AFS_VNODE_IO_LOCK, &vnode->flags)) { + spin_unlock(&vnode->lock); + return; + } + + list_add_tail(&myself.link, &vnode->io_lock_waiters); + spin_unlock(&vnode->lock); + + for (;;) { + set_current_state(TASK_UNINTERRUPTIBLE); + if (smp_load_acquire(&myself.have_lock)) /* The lock barrier */ + break; + schedule(); + } + __set_current_state(TASK_RUNNING); +} + +/* + * Lock the I/O lock on a vnode interruptibly. We can't use an ordinary mutex + * as lockdep will complain if we unlock it in the wrong thread. + */ +static int afs_lock_for_io_interruptible(struct afs_vnode *vnode) +{ + struct afs_io_locker myself = { .task = current, }; + int ret = 0; + + spin_lock(&vnode->lock); + + if (!test_and_set_bit(AFS_VNODE_IO_LOCK, &vnode->flags)) { + spin_unlock(&vnode->lock); + return 0; + } + + list_add_tail(&myself.link, &vnode->io_lock_waiters); + spin_unlock(&vnode->lock); + + for (;;) { + set_current_state(TASK_INTERRUPTIBLE); + if (smp_load_acquire(&myself.have_lock) || /* The lock barrier */ + signal_pending(current)) + break; + schedule(); + } + __set_current_state(TASK_RUNNING); + + /* If we got a signal, try to transfer the lock onto the next + * waiter. + */ + if (unlikely(signal_pending(current))) { + spin_lock(&vnode->lock); + if (myself.have_lock) { + spin_unlock(&vnode->lock); + afs_unlock_for_io(vnode); + } else { + list_del(&myself.link); + spin_unlock(&vnode->lock); + } + ret = -ERESTARTSYS; + } + return ret; +} + /* * Lock the vnode(s) being operated upon. */ @@ -58,7 +159,7 @@ static bool afs_get_io_locks(struct afs_operation *op) _enter(""); if (op->flags & AFS_OPERATION_UNINTR) { - mutex_lock(&vnode->io_lock); + afs_lock_for_io(vnode); op->flags |= AFS_OPERATION_LOCK_0; _leave(" = t [1]"); return true; @@ -70,8 +171,8 @@ static bool afs_get_io_locks(struct afs_operation *op) if (vnode2 > vnode) swap(vnode, vnode2); - if (mutex_lock_interruptible(&vnode->io_lock) < 0) { - op->error = -ERESTARTSYS; + if (afs_lock_for_io_interruptible(vnode) < 0) { + afs_op_set_error(op, -ERESTARTSYS); op->flags |= AFS_OPERATION_STOP; _leave(" = f [I 0]"); return false; @@ -79,10 +180,10 @@ static bool afs_get_io_locks(struct afs_operation *op) op->flags |= AFS_OPERATION_LOCK_0; if (vnode2) { - if (mutex_lock_interruptible_nested(&vnode2->io_lock, 1) < 0) { - op->error = -ERESTARTSYS; + if (afs_lock_for_io_interruptible(vnode2) < 0) { + afs_op_set_error(op, -ERESTARTSYS); op->flags |= AFS_OPERATION_STOP; - mutex_unlock(&vnode->io_lock); + afs_unlock_for_io(vnode); op->flags &= ~AFS_OPERATION_LOCK_0; _leave(" = f [I 1]"); return false; @@ -102,9 +203,9 @@ static void afs_drop_io_locks(struct afs_operation *op) _enter(""); if (op->flags & AFS_OPERATION_LOCK_1) - mutex_unlock(&vnode2->io_lock); + afs_unlock_for_io(vnode2); if (op->flags & AFS_OPERATION_LOCK_0) - mutex_unlock(&vnode->io_lock); + afs_unlock_for_io(vnode); } static void afs_prepare_vnode(struct afs_operation *op, struct afs_vnode_param *vp, @@ -147,7 +248,7 @@ bool afs_begin_vnode_operation(struct afs_operation *op) afs_prepare_vnode(op, &op->file[0], 0); afs_prepare_vnode(op, &op->file[1], 1); - op->cb_v_break = op->volume->cb_v_break; + op->cb_v_break = atomic_read(&op->volume->cb_v_break); _leave(" = true"); return true; } @@ -155,20 +256,20 @@ bool afs_begin_vnode_operation(struct afs_operation *op) /* * Tidy up a filesystem cursor and unlock the vnode. */ -static void afs_end_vnode_operation(struct afs_operation *op) +void afs_end_vnode_operation(struct afs_operation *op) { _enter(""); - if (op->error == -EDESTADDRREQ || - op->error == -EADDRNOTAVAIL || - op->error == -ENETUNREACH || - op->error == -EHOSTUNREACH) + switch (afs_op_error(op)) { + case -EDESTADDRREQ: + case -EADDRNOTAVAIL: + case -ENETUNREACH: + case -EHOSTUNREACH: afs_dump_edestaddrreq(op); + break; + } afs_drop_io_locks(op); - - if (op->error == -ECONNABORTED) - op->error = afs_abort_to_error(op->ac.abort_code); } /* @@ -179,37 +280,43 @@ void afs_wait_for_operation(struct afs_operation *op) _enter(""); while (afs_select_fileserver(op)) { - op->cb_s_break = op->server->cb_s_break; + op->call_responded = false; + op->call_error = 0; + op->call_abort_code = 0; if (test_bit(AFS_SERVER_FL_IS_YFS, &op->server->flags) && op->ops->issue_yfs_rpc) op->ops->issue_yfs_rpc(op); else if (op->ops->issue_afs_rpc) op->ops->issue_afs_rpc(op); else - op->ac.error = -ENOTSUPP; - - if (op->call) - op->error = afs_wait_for_call_to_complete(op->call, &op->ac); + op->call_error = -ENOTSUPP; + + if (op->call) { + afs_wait_for_call_to_complete(op->call); + op->call_abort_code = op->call->abort_code; + op->call_error = op->call->error; + op->call_responded = op->call->responded; + afs_put_call(op->call); + } } - switch (op->error) { - case 0: + if (op->call_responded && op->server) + set_bit(AFS_SERVER_FL_RESPONDING, &op->server->flags); + + if (!afs_op_error(op)) { _debug("success"); op->ops->success(op); - break; - case -ECONNABORTED: + } else if (op->cumul_error.aborted) { if (op->ops->aborted) op->ops->aborted(op); - fallthrough; - default: + } else { if (op->ops->failed) op->ops->failed(op); - break; } afs_end_vnode_operation(op); - if (op->error == 0 && op->ops->edit_dir) { + if (!afs_op_error(op) && op->ops->edit_dir) { _debug("edit_dir"); op->ops->edit_dir(op); } @@ -221,7 +328,8 @@ void afs_wait_for_operation(struct afs_operation *op) */ int afs_put_operation(struct afs_operation *op) { - int i, ret = op->error; + struct afs_addr_list *alist; + int i, ret = afs_op_error(op); _enter("op=%08x,%d", op->debug_id, ret); @@ -243,9 +351,19 @@ int afs_put_operation(struct afs_operation *op) kfree(op->more_files); } - afs_end_cursor(&op->ac); + if (op->estate) { + alist = op->estate->addresses; + if (alist) { + if (op->call_responded && + op->addr_index != alist->preferred && + test_bit(alist->preferred, &op->addr_tried)) + WRITE_ONCE(alist->preferred, op->addr_index); + } + } + + afs_clear_server_states(op); afs_put_serverlist(op->net, op->server_list); - afs_put_volume(op->net, op->volume, afs_volume_trace_put_put_op); + afs_put_volume(op->volume, afs_volume_trace_put_put_op); key_put(op->key); kfree(op); return ret; diff --git a/fs/afs/fs_probe.c b/fs/afs/fs_probe.c index daaf3810cc92..e0030ac74ea0 100644 --- a/fs/afs/fs_probe.c +++ b/fs/afs/fs_probe.c @@ -15,6 +15,42 @@ static unsigned int afs_fs_probe_fast_poll_interval = 30 * HZ; static unsigned int afs_fs_probe_slow_poll_interval = 5 * 60 * HZ; +struct afs_endpoint_state *afs_get_endpoint_state(struct afs_endpoint_state *estate, + enum afs_estate_trace where) +{ + if (estate) { + int r; + + __refcount_inc(&estate->ref, &r); + trace_afs_estate(estate->server_id, estate->probe_seq, r, where); + } + return estate; +} + +static void afs_endpoint_state_rcu(struct rcu_head *rcu) +{ + struct afs_endpoint_state *estate = container_of(rcu, struct afs_endpoint_state, rcu); + + trace_afs_estate(estate->server_id, estate->probe_seq, refcount_read(&estate->ref), + afs_estate_trace_free); + afs_put_addrlist(estate->addresses, afs_alist_trace_put_estate); + kfree(estate); +} + +void afs_put_endpoint_state(struct afs_endpoint_state *estate, enum afs_estate_trace where) +{ + if (estate) { + unsigned int server_id = estate->server_id, probe_seq = estate->probe_seq; + bool dead; + int r; + + dead = __refcount_dec_and_test(&estate->ref, &r); + trace_afs_estate(server_id, probe_seq, r, where); + if (dead) + call_rcu(&estate->rcu, afs_endpoint_state_rcu); + } +} + /* * Start the probe polling timer. We have to supply it with an inc on the * outstanding server count. @@ -38,9 +74,10 @@ static void afs_schedule_fs_probe(struct afs_net *net, /* * Handle the completion of a set of probes. */ -static void afs_finished_fs_probe(struct afs_net *net, struct afs_server *server) +static void afs_finished_fs_probe(struct afs_net *net, struct afs_server *server, + struct afs_endpoint_state *estate) { - bool responded = server->probe.responded; + bool responded = test_bit(AFS_ESTATE_RESPONDED, &estate->flags); write_seqlock(&net->fs_lock); if (responded) { @@ -50,6 +87,7 @@ static void afs_finished_fs_probe(struct afs_net *net, struct afs_server *server clear_bit(AFS_SERVER_FL_RESPONDING, &server->flags); list_add_tail(&server->probe_link, &net->fs_probe_fast); } + write_sequnlock(&net->fs_lock); afs_schedule_fs_probe(net, server, !responded); @@ -58,12 +96,13 @@ static void afs_finished_fs_probe(struct afs_net *net, struct afs_server *server /* * Handle the completion of a probe. */ -static void afs_done_one_fs_probe(struct afs_net *net, struct afs_server *server) +static void afs_done_one_fs_probe(struct afs_net *net, struct afs_server *server, + struct afs_endpoint_state *estate) { _enter(""); - if (atomic_dec_and_test(&server->probe_outstanding)) - afs_finished_fs_probe(net, server); + if (atomic_dec_and_test(&estate->nr_probing)) + afs_finished_fs_probe(net, server, estate); wake_up_all(&server->probe_wq); } @@ -74,24 +113,22 @@ static void afs_done_one_fs_probe(struct afs_net *net, struct afs_server *server */ static void afs_fs_probe_not_done(struct afs_net *net, struct afs_server *server, - struct afs_addr_cursor *ac) + struct afs_endpoint_state *estate, + int index) { - struct afs_addr_list *alist = ac->alist; - unsigned int index = ac->index; - _enter(""); trace_afs_io_error(0, -ENOMEM, afs_io_error_fs_probe_fail); spin_lock(&server->probe_lock); - server->probe.local_failure = true; - if (server->probe.error == 0) - server->probe.error = -ENOMEM; + set_bit(AFS_ESTATE_LOCAL_FAILURE, &estate->flags); + if (estate->error == 0) + estate->error = -ENOMEM; - set_bit(index, &alist->failed); + set_bit(index, &estate->failed_set); spin_unlock(&server->probe_lock); - return afs_done_one_fs_probe(net, server); + return afs_done_one_fs_probe(net, server, estate); } /* @@ -100,30 +137,34 @@ static void afs_fs_probe_not_done(struct afs_net *net, */ void afs_fileserver_probe_result(struct afs_call *call) { - struct afs_addr_list *alist = call->alist; + struct afs_endpoint_state *estate = call->probe; + struct afs_addr_list *alist = estate->addresses; + struct afs_address *addr = &alist->addrs[call->probe_index]; struct afs_server *server = call->server; - unsigned int index = call->addr_ix; - unsigned int rtt_us = 0, cap0; + unsigned int index = call->probe_index; + unsigned int rtt_us = -1, cap0; int ret = call->error; _enter("%pU,%u", &server->uuid, index); + WRITE_ONCE(addr->last_error, ret); + spin_lock(&server->probe_lock); switch (ret) { case 0: - server->probe.error = 0; + estate->error = 0; goto responded; case -ECONNABORTED: - if (!server->probe.responded) { - server->probe.abort_code = call->abort_code; - server->probe.error = ret; + if (!test_bit(AFS_ESTATE_RESPONDED, &estate->flags)) { + estate->abort_code = call->abort_code; + estate->error = ret; } goto responded; case -ENOMEM: case -ENONET: - clear_bit(index, &alist->responded); - server->probe.local_failure = true; + clear_bit(index, &estate->responsive_set); + set_bit(AFS_ESTATE_LOCAL_FAILURE, &estate->flags); trace_afs_io_error(call->debug_id, ret, afs_io_error_fs_probe_fail); goto out; case -ECONNRESET: /* Responded, but call expired. */ @@ -136,29 +177,29 @@ void afs_fileserver_probe_result(struct afs_call *call) case -ETIMEDOUT: case -ETIME: default: - clear_bit(index, &alist->responded); - set_bit(index, &alist->failed); - if (!server->probe.responded && - (server->probe.error == 0 || - server->probe.error == -ETIMEDOUT || - server->probe.error == -ETIME)) - server->probe.error = ret; + clear_bit(index, &estate->responsive_set); + set_bit(index, &estate->failed_set); + if (!test_bit(AFS_ESTATE_RESPONDED, &estate->flags) && + (estate->error == 0 || + estate->error == -ETIMEDOUT || + estate->error == -ETIME)) + estate->error = ret; trace_afs_io_error(call->debug_id, ret, afs_io_error_fs_probe_fail); goto out; } responded: - clear_bit(index, &alist->failed); + clear_bit(index, &estate->failed_set); if (call->service_id == YFS_FS_SERVICE) { - server->probe.is_yfs = true; + set_bit(AFS_ESTATE_IS_YFS, &estate->flags); set_bit(AFS_SERVER_FL_IS_YFS, &server->flags); - alist->addrs[index].srx_service = call->service_id; + server->service_id = call->service_id; } else { - server->probe.not_yfs = true; - if (!server->probe.is_yfs) { + set_bit(AFS_ESTATE_NOT_YFS, &estate->flags); + if (!test_bit(AFS_ESTATE_IS_YFS, &estate->flags)) { clear_bit(AFS_SERVER_FL_IS_YFS, &server->flags); - alist->addrs[index].srx_service = call->service_id; + server->service_id = call->service_id; } cap0 = ntohl(call->tmp); if (cap0 & AFS3_VICED_CAPABILITY_64BITFILES) @@ -167,116 +208,148 @@ responded: clear_bit(AFS_SERVER_FL_HAS_FS64, &server->flags); } - rxrpc_kernel_get_srtt(call->net->socket, call->rxcall, &rtt_us); - if (rtt_us < server->probe.rtt) { - server->probe.rtt = rtt_us; + rtt_us = rxrpc_kernel_get_srtt(addr->peer); + if (rtt_us < estate->rtt) { + estate->rtt = rtt_us; server->rtt = rtt_us; alist->preferred = index; } smp_wmb(); /* Set rtt before responded. */ - server->probe.responded = true; - set_bit(index, &alist->responded); + set_bit(AFS_ESTATE_RESPONDED, &estate->flags); + set_bit(index, &estate->responsive_set); set_bit(AFS_SERVER_FL_RESPONDING, &server->flags); out: spin_unlock(&server->probe_lock); - _debug("probe %pU [%u] %pISpc rtt=%u ret=%d", - &server->uuid, index, &alist->addrs[index].transport, + trace_afs_fs_probe(server, false, estate, index, call->error, call->abort_code, rtt_us); + _debug("probe[%x] %pU [%u] %pISpc rtt=%d ret=%d", + estate->probe_seq, &server->uuid, index, + rxrpc_kernel_remote_addr(alist->addrs[index].peer), rtt_us, ret); - return afs_done_one_fs_probe(call->net, server); + return afs_done_one_fs_probe(call->net, server, estate); } /* - * Probe one or all of a fileserver's addresses to find out the best route and - * to query its capabilities. + * Probe all of a fileserver's addresses to find out the best route and to + * query its capabilities. */ -void afs_fs_probe_fileserver(struct afs_net *net, struct afs_server *server, - struct key *key, bool all) +int afs_fs_probe_fileserver(struct afs_net *net, struct afs_server *server, + struct afs_addr_list *new_alist, struct key *key) { - struct afs_addr_cursor ac = { - .index = 0, - }; + struct afs_endpoint_state *estate, *old; + struct afs_addr_list *old_alist = NULL, *alist; + unsigned long unprobed; _enter("%pU", &server->uuid); - read_lock(&server->fs_lock); - ac.alist = rcu_dereference_protected(server->addresses, - lockdep_is_held(&server->fs_lock)); - afs_get_addrlist(ac.alist); - read_unlock(&server->fs_lock); + estate = kzalloc(sizeof(*estate), GFP_KERNEL); + if (!estate) + return -ENOMEM; + + refcount_set(&estate->ref, 2); + estate->server_id = server->debug_id; + estate->rtt = UINT_MAX; + + write_lock(&server->fs_lock); + + old = rcu_dereference_protected(server->endpoint_state, + lockdep_is_held(&server->fs_lock)); + if (old) { + estate->responsive_set = old->responsive_set; + if (!new_alist) + new_alist = old->addresses; + } + + if (old_alist != new_alist) + afs_set_peer_appdata(server, old_alist, new_alist); + + estate->addresses = afs_get_addrlist(new_alist, afs_alist_trace_get_estate); + alist = estate->addresses; + estate->probe_seq = ++server->probe_counter; + atomic_set(&estate->nr_probing, alist->nr_addrs); + + if (new_alist) + server->addr_version = new_alist->version; + rcu_assign_pointer(server->endpoint_state, estate); + write_unlock(&server->fs_lock); + if (old) + set_bit(AFS_ESTATE_SUPERSEDED, &old->flags); + + trace_afs_estate(estate->server_id, estate->probe_seq, refcount_read(&estate->ref), + afs_estate_trace_alloc_probe); + + afs_get_address_preferences(net, new_alist); server->probed_at = jiffies; - atomic_set(&server->probe_outstanding, all ? ac.alist->nr_addrs : 1); - memset(&server->probe, 0, sizeof(server->probe)); - server->probe.rtt = UINT_MAX; - - ac.index = ac.alist->preferred; - if (ac.index < 0 || ac.index >= ac.alist->nr_addrs) - all = true; - - if (all) { - for (ac.index = 0; ac.index < ac.alist->nr_addrs; ac.index++) - if (!afs_fs_get_capabilities(net, server, &ac, key)) - afs_fs_probe_not_done(net, server, &ac); - } else { - if (!afs_fs_get_capabilities(net, server, &ac, key)) - afs_fs_probe_not_done(net, server, &ac); + unprobed = (1UL << alist->nr_addrs) - 1; + while (unprobed) { + unsigned int index = 0, i; + int best_prio = -1; + + for (i = 0; i < alist->nr_addrs; i++) { + if (test_bit(i, &unprobed) && + alist->addrs[i].prio > best_prio) { + index = i; + best_prio = alist->addrs[i].prio; + } + } + __clear_bit(index, &unprobed); + + trace_afs_fs_probe(server, true, estate, index, 0, 0, 0); + if (!afs_fs_get_capabilities(net, server, estate, index, key)) + afs_fs_probe_not_done(net, server, estate, index); } - afs_put_addrlist(ac.alist); + afs_put_endpoint_state(old, afs_estate_trace_put_probe); + afs_put_endpoint_state(estate, afs_estate_trace_put_probe); + return 0; } /* - * Wait for the first as-yet untried fileserver to respond. + * Wait for the first as-yet untried fileserver to respond, for the probe state + * to be superseded or for all probes to finish. */ -int afs_wait_for_fs_probes(struct afs_server_list *slist, unsigned long untried) +int afs_wait_for_fs_probes(struct afs_operation *op, struct afs_server_state *states, bool intr) { - struct wait_queue_entry *waits; - struct afs_server *server; - unsigned int rtt = UINT_MAX, rtt_s; - bool have_responders = false; - int pref = -1, i; + struct afs_endpoint_state *estate; + struct afs_server_list *slist = op->server_list; + bool still_probing = true; + int ret = 0, i; - _enter("%u,%lx", slist->nr_servers, untried); + _enter("%u", slist->nr_servers); - /* Only wait for servers that have a probe outstanding. */ for (i = 0; i < slist->nr_servers; i++) { - if (test_bit(i, &untried)) { - server = slist->servers[i].server; - if (!atomic_read(&server->probe_outstanding)) - __clear_bit(i, &untried); - if (server->probe.responded) - have_responders = true; - } + estate = states[i].endpoint_state; + if (test_bit(AFS_ESTATE_SUPERSEDED, &estate->flags)) + return 2; + if (atomic_read(&estate->nr_probing)) + still_probing = true; + if (estate->responsive_set & states[i].untried_addrs) + return 1; } - if (have_responders || !untried) + if (!still_probing) return 0; - waits = kmalloc(array_size(slist->nr_servers, sizeof(*waits)), GFP_KERNEL); - if (!waits) - return -ENOMEM; - - for (i = 0; i < slist->nr_servers; i++) { - if (test_bit(i, &untried)) { - server = slist->servers[i].server; - init_waitqueue_entry(&waits[i], current); - add_wait_queue(&server->probe_wq, &waits[i]); - } - } + for (i = 0; i < slist->nr_servers; i++) + add_wait_queue(&slist->servers[i].server->probe_wq, &states[i].probe_waiter); for (;;) { - bool still_probing = false; + still_probing = false; - set_current_state(TASK_INTERRUPTIBLE); + set_current_state(intr ? TASK_INTERRUPTIBLE : TASK_UNINTERRUPTIBLE); for (i = 0; i < slist->nr_servers; i++) { - if (test_bit(i, &untried)) { - server = slist->servers[i].server; - if (server->probe.responded) - goto stop; - if (atomic_read(&server->probe_outstanding)) - still_probing = true; + estate = states[i].endpoint_state; + if (test_bit(AFS_ESTATE_SUPERSEDED, &estate->flags)) { + ret = 2; + goto stop; + } + if (atomic_read(&estate->nr_probing)) + still_probing = true; + if (estate->responsive_set & states[i].untried_addrs) { + ret = 1; + goto stop; } } @@ -288,28 +361,12 @@ int afs_wait_for_fs_probes(struct afs_server_list *slist, unsigned long untried) stop: set_current_state(TASK_RUNNING); - for (i = 0; i < slist->nr_servers; i++) { - if (test_bit(i, &untried)) { - server = slist->servers[i].server; - rtt_s = READ_ONCE(server->rtt); - if (test_bit(AFS_SERVER_FL_RESPONDING, &server->flags) && - rtt_s < rtt) { - pref = i; - rtt = rtt_s; - } + for (i = 0; i < slist->nr_servers; i++) + remove_wait_queue(&slist->servers[i].server->probe_wq, &states[i].probe_waiter); - remove_wait_queue(&server->probe_wq, &waits[i]); - } - } - - kfree(waits); - - if (pref == -1 && signal_pending(current)) - return -ERESTARTSYS; - - if (pref >= 0) - slist->preferred = pref; - return 0; + if (!ret && signal_pending(current)) + ret = -ERESTARTSYS; + return ret; } /* @@ -327,7 +384,7 @@ void afs_fs_probe_timer(struct timer_list *timer) /* * Dispatch a probe to a server. */ -static void afs_dispatch_fs_probe(struct afs_net *net, struct afs_server *server, bool all) +static void afs_dispatch_fs_probe(struct afs_net *net, struct afs_server *server) __releases(&net->fs_lock) { struct key *key = NULL; @@ -340,7 +397,7 @@ static void afs_dispatch_fs_probe(struct afs_net *net, struct afs_server *server afs_get_server(server, afs_server_trace_get_probe); write_sequnlock(&net->fs_lock); - afs_fs_probe_fileserver(net, server, key, all); + afs_fs_probe_fileserver(net, server, NULL, key); afs_put_server(net, server, afs_server_trace_put_probe); } @@ -352,7 +409,7 @@ void afs_probe_fileserver(struct afs_net *net, struct afs_server *server) { write_seqlock(&net->fs_lock); if (!list_empty(&server->probe_link)) - return afs_dispatch_fs_probe(net, server, true); + return afs_dispatch_fs_probe(net, server); write_sequnlock(&net->fs_lock); } @@ -412,7 +469,7 @@ again: _debug("probe %pU", &server->uuid); if (server && (first_pass || !need_resched())) { - afs_dispatch_fs_probe(net, server, server == fast); + afs_dispatch_fs_probe(net, server); first_pass = false; goto again; } @@ -436,12 +493,13 @@ again: /* * Wait for a probe on a particular fileserver to complete for 2s. */ -int afs_wait_for_one_fs_probe(struct afs_server *server, bool is_intr) +int afs_wait_for_one_fs_probe(struct afs_server *server, struct afs_endpoint_state *estate, + unsigned long exclude, bool is_intr) { struct wait_queue_entry wait; unsigned long timo = 2 * HZ; - if (atomic_read(&server->probe_outstanding) == 0) + if (atomic_read(&estate->nr_probing) == 0) goto dont_wait; init_wait_entry(&wait, 0); @@ -449,8 +507,9 @@ int afs_wait_for_one_fs_probe(struct afs_server *server, bool is_intr) prepare_to_wait_event(&server->probe_wq, &wait, is_intr ? TASK_INTERRUPTIBLE : TASK_UNINTERRUPTIBLE); if (timo == 0 || - server->probe.responded || - atomic_read(&server->probe_outstanding) == 0 || + test_bit(AFS_ESTATE_SUPERSEDED, &estate->flags) || + (estate->responsive_set & ~exclude) || + atomic_read(&estate->nr_probing) == 0 || (is_intr && signal_pending(current))) break; timo = schedule_timeout(timo); @@ -459,8 +518,10 @@ int afs_wait_for_one_fs_probe(struct afs_server *server, bool is_intr) finish_wait(&server->probe_wq, &wait); dont_wait: - if (server->probe.responded) + if (test_bit(AFS_ESTATE_SUPERSEDED, &estate->flags)) return 0; + if (estate->responsive_set & ~exclude) + return 1; if (is_intr && signal_pending(current)) return -ERESTARTSYS; if (timo == 0) @@ -473,6 +534,6 @@ dont_wait: */ void afs_fs_probe_cleanup(struct afs_net *net) { - if (del_timer_sync(&net->fs_probe_timer)) + if (timer_delete_sync(&net->fs_probe_timer)) afs_dec_servers_outstanding(net); } diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c index 7d37f63ef0f0..bc9556991d7c 100644 --- a/fs/afs/fsclient.c +++ b/fs/afs/fsclient.c @@ -290,6 +290,7 @@ void afs_fs_fetch_status(struct afs_operation *op) bp[2] = htonl(vp->fid.vnode); bp[3] = htonl(vp->fid.unique); + call->fid = vp->fid; trace_afs_make_fs_call(call, &vp->fid); afs_make_op_call(op, call, GFP_NOFS); } @@ -300,18 +301,19 @@ void afs_fs_fetch_status(struct afs_operation *op) static int afs_deliver_fs_fetch_data(struct afs_call *call) { struct afs_operation *op = call->op; + struct netfs_io_subrequest *subreq = op->fetch.subreq; struct afs_vnode_param *vp = &op->file[0]; - struct afs_read *req = op->fetch.req; const __be32 *bp; + size_t count_before; int ret; _enter("{%u,%zu,%zu/%llu}", call->unmarshall, call->iov_len, iov_iter_count(call->iter), - req->actual_len); + call->remaining); switch (call->unmarshall) { case 0: - req->actual_len = 0; + call->remaining = 0; call->unmarshall++; if (call->operation_ID == FSFETCHDATA64) { afs_extract_to_tmp64(call); @@ -321,8 +323,8 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call) } fallthrough; - /* Extract the returned data length into - * ->actual_len. This may indicate more or less data than was + /* Extract the returned data length into ->remaining. + * This may indicate more or less data than was * requested will be returned. */ case 1: @@ -331,38 +333,40 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call) if (ret < 0) return ret; - req->actual_len = be64_to_cpu(call->tmp64); - _debug("DATA length: %llu", req->actual_len); + call->remaining = be64_to_cpu(call->tmp64); + _debug("DATA length: %llu", call->remaining); - if (req->actual_len == 0) + if (call->remaining == 0) goto no_more_data; - call->iter = req->iter; - call->iov_len = min(req->actual_len, req->len); + call->iter = &subreq->io_iter; + call->iov_len = umin(call->remaining, subreq->len - subreq->transferred); call->unmarshall++; fallthrough; /* extract the returned data */ case 2: - _debug("extract data %zu/%llu", - iov_iter_count(call->iter), req->actual_len); + count_before = call->iov_len; + _debug("extract data %zu/%llu", count_before, call->remaining); ret = afs_extract_data(call, true); + subreq->transferred += count_before - call->iov_len; + call->remaining -= count_before - call->iov_len; if (ret < 0) return ret; call->iter = &call->def_iter; - if (req->actual_len <= req->len) + if (call->remaining) goto no_more_data; /* Discard any excess data the server gave us */ - afs_extract_discard(call, req->actual_len - req->len); + afs_extract_discard(call, call->remaining); call->unmarshall = 3; fallthrough; case 3: _debug("extract discard %zu/%llu", - iov_iter_count(call->iter), req->actual_len - req->len); + iov_iter_count(call->iter), call->remaining); ret = afs_extract_data(call, true); if (ret < 0) @@ -384,8 +388,8 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call) xdr_decode_AFSCallBack(&bp, call, &vp->scb); xdr_decode_AFSVolSync(&bp, &op->volsync); - req->data_version = vp->scb.status.data_version; - req->file_size = vp->scb.status.size; + if (subreq->start + subreq->transferred >= vp->scb.status.size) + __set_bit(NETFS_SREQ_HIT_EOF, &subreq->flags); call->unmarshall++; fallthrough; @@ -404,14 +408,18 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call) static const struct afs_call_type afs_RXFSFetchData = { .name = "FS.FetchData", .op = afs_FS_FetchData, + .async_rx = afs_fetch_data_async_rx, .deliver = afs_deliver_fs_fetch_data, + .immediate_cancel = afs_fetch_data_immediate_cancel, .destructor = afs_flat_call_destructor, }; static const struct afs_call_type afs_RXFSFetchData64 = { .name = "FS.FetchData64", .op = afs_FS_FetchData64, + .async_rx = afs_fetch_data_async_rx, .deliver = afs_deliver_fs_fetch_data, + .immediate_cancel = afs_fetch_data_immediate_cancel, .destructor = afs_flat_call_destructor, }; @@ -420,8 +428,8 @@ static const struct afs_call_type afs_RXFSFetchData64 = { */ static void afs_fs_fetch_data64(struct afs_operation *op) { + struct netfs_io_subrequest *subreq = op->fetch.subreq; struct afs_vnode_param *vp = &op->file[0]; - struct afs_read *req = op->fetch.req; struct afs_call *call; __be32 *bp; @@ -431,17 +439,21 @@ static void afs_fs_fetch_data64(struct afs_operation *op) if (!call) return afs_op_nomem(op); + if (op->flags & AFS_OPERATION_ASYNC) + call->async = true; + /* marshall the parameters */ bp = call->request; bp[0] = htonl(FSFETCHDATA64); bp[1] = htonl(vp->fid.vid); bp[2] = htonl(vp->fid.vnode); bp[3] = htonl(vp->fid.unique); - bp[4] = htonl(upper_32_bits(req->pos)); - bp[5] = htonl(lower_32_bits(req->pos)); + bp[4] = htonl(upper_32_bits(subreq->start + subreq->transferred)); + bp[5] = htonl(lower_32_bits(subreq->start + subreq->transferred)); bp[6] = 0; - bp[7] = htonl(lower_32_bits(req->len)); + bp[7] = htonl(lower_32_bits(subreq->len - subreq->transferred)); + call->fid = vp->fid; trace_afs_make_fs_call(call, &vp->fid); afs_make_op_call(op, call, GFP_NOFS); } @@ -451,9 +463,9 @@ static void afs_fs_fetch_data64(struct afs_operation *op) */ void afs_fs_fetch_data(struct afs_operation *op) { + struct netfs_io_subrequest *subreq = op->fetch.subreq; struct afs_vnode_param *vp = &op->file[0]; struct afs_call *call; - struct afs_read *req = op->fetch.req; __be32 *bp; if (test_bit(AFS_SERVER_FL_HAS_FS64, &op->server->flags)) @@ -465,17 +477,16 @@ void afs_fs_fetch_data(struct afs_operation *op) if (!call) return afs_op_nomem(op); - req->call_debug_id = call->debug_id; - /* marshall the parameters */ bp = call->request; bp[0] = htonl(FSFETCHDATA); bp[1] = htonl(vp->fid.vid); bp[2] = htonl(vp->fid.vnode); bp[3] = htonl(vp->fid.unique); - bp[4] = htonl(lower_32_bits(req->pos)); - bp[5] = htonl(lower_32_bits(req->len)); + bp[4] = htonl(lower_32_bits(subreq->start + subreq->transferred)); + bp[5] = htonl(lower_32_bits(subreq->len + subreq->transferred)); + call->fid = vp->fid; trace_afs_make_fs_call(call, &vp->fid); afs_make_op_call(op, call, GFP_NOFS); } @@ -559,6 +570,7 @@ void afs_fs_create_file(struct afs_operation *op) *bp++ = htonl(op->create.mode & S_IALLUGO); /* unix mode */ *bp++ = 0; /* segment size */ + call->fid = dvp->fid; trace_afs_make_fs_call1(call, &dvp->fid, name); afs_make_op_call(op, call, GFP_NOFS); } @@ -612,6 +624,7 @@ void afs_fs_make_dir(struct afs_operation *op) *bp++ = htonl(op->create.mode & S_IALLUGO); /* unix mode */ *bp++ = 0; /* segment size */ + call->fid = dvp->fid; trace_afs_make_fs_call1(call, &dvp->fid, name); afs_make_op_call(op, call, GFP_NOFS); } @@ -685,6 +698,7 @@ void afs_fs_remove_file(struct afs_operation *op) bp = (void *) bp + padsz; } + call->fid = dvp->fid; trace_afs_make_fs_call1(call, &dvp->fid, name); afs_make_op_call(op, call, GFP_NOFS); } @@ -732,6 +746,7 @@ void afs_fs_remove_dir(struct afs_operation *op) bp = (void *) bp + padsz; } + call->fid = dvp->fid; trace_afs_make_fs_call1(call, &dvp->fid, name); afs_make_op_call(op, call, GFP_NOFS); } @@ -812,6 +827,7 @@ void afs_fs_link(struct afs_operation *op) *bp++ = htonl(vp->fid.vnode); *bp++ = htonl(vp->fid.unique); + call->fid = vp->fid; trace_afs_make_fs_call1(call, &vp->fid, name); afs_make_op_call(op, call, GFP_NOFS); } @@ -907,6 +923,7 @@ void afs_fs_symlink(struct afs_operation *op) *bp++ = htonl(S_IRWXUGO); /* unix mode */ *bp++ = 0; /* segment size */ + call->fid = dvp->fid; trace_afs_make_fs_call1(call, &dvp->fid, name); afs_make_op_call(op, call, GFP_NOFS); } @@ -1003,6 +1020,7 @@ void afs_fs_rename(struct afs_operation *op) bp = (void *) bp + n_padsz; } + call->fid = orig_dvp->fid; trace_afs_make_fs_call2(call, &orig_dvp->fid, orig_name, new_name); afs_make_op_call(op, call, GFP_NOFS); } @@ -1090,6 +1108,7 @@ static void afs_fs_store_data64(struct afs_operation *op) *bp++ = htonl(upper_32_bits(op->store.i_size)); *bp++ = htonl(lower_32_bits(op->store.i_size)); + call->fid = vp->fid; trace_afs_make_fs_call(call, &vp->fid); afs_make_op_call(op, call, GFP_NOFS); } @@ -1140,6 +1159,7 @@ void afs_fs_store_data(struct afs_operation *op) *bp++ = htonl(lower_32_bits(op->store.size)); *bp++ = htonl(lower_32_bits(op->store.i_size)); + call->fid = vp->fid; trace_afs_make_fs_call(call, &vp->fid); afs_make_op_call(op, call, GFP_NOFS); } @@ -1206,6 +1226,7 @@ static void afs_fs_setattr_size64(struct afs_operation *op) *bp++ = htonl(upper_32_bits(attr->ia_size)); /* new file length */ *bp++ = htonl(lower_32_bits(attr->ia_size)); + call->fid = vp->fid; trace_afs_make_fs_call(call, &vp->fid); afs_make_op_call(op, call, GFP_NOFS); } @@ -1247,6 +1268,7 @@ static void afs_fs_setattr_size(struct afs_operation *op) *bp++ = 0; /* size of write */ *bp++ = htonl(attr->ia_size); /* new file length */ + call->fid = vp->fid; trace_afs_make_fs_call(call, &vp->fid); afs_make_op_call(op, call, GFP_NOFS); } @@ -1283,6 +1305,7 @@ void afs_fs_setattr(struct afs_operation *op) xdr_encode_AFS_StoreStatus(&bp, op->setattr.attr); + call->fid = vp->fid; trace_afs_make_fs_call(call, &vp->fid); afs_make_op_call(op, call, GFP_NOFS); } @@ -1446,6 +1469,7 @@ void afs_fs_get_volume_status(struct afs_operation *op) bp[0] = htonl(FSGETVOLUMESTATUS); bp[1] = htonl(vp->fid.vid); + call->fid = vp->fid; trace_afs_make_fs_call(call, &vp->fid); afs_make_op_call(op, call, GFP_NOFS); } @@ -1528,6 +1552,7 @@ void afs_fs_set_lock(struct afs_operation *op) *bp++ = htonl(vp->fid.unique); *bp++ = htonl(op->lock.type); + call->fid = vp->fid; trace_afs_make_fs_calli(call, &vp->fid, op->lock.type); afs_make_op_call(op, call, GFP_NOFS); } @@ -1554,6 +1579,7 @@ void afs_fs_extend_lock(struct afs_operation *op) *bp++ = htonl(vp->fid.vnode); *bp++ = htonl(vp->fid.unique); + call->fid = vp->fid; trace_afs_make_fs_call(call, &vp->fid); afs_make_op_call(op, call, GFP_NOFS); } @@ -1580,6 +1606,7 @@ void afs_fs_release_lock(struct afs_operation *op) *bp++ = htonl(vp->fid.vnode); *bp++ = htonl(vp->fid.unique); + call->fid = vp->fid; trace_afs_make_fs_call(call, &vp->fid); afs_make_op_call(op, call, GFP_NOFS); } @@ -1605,13 +1632,12 @@ static const struct afs_call_type afs_RXFSGiveUpAllCallBacks = { /* * Flush all the callbacks we have on a server. */ -int afs_fs_give_up_all_callbacks(struct afs_net *net, - struct afs_server *server, - struct afs_addr_cursor *ac, - struct key *key) +int afs_fs_give_up_all_callbacks(struct afs_net *net, struct afs_server *server, + struct afs_address *addr, struct key *key) { struct afs_call *call; __be32 *bp; + int ret; _enter(""); @@ -1619,15 +1645,22 @@ int afs_fs_give_up_all_callbacks(struct afs_net *net, if (!call) return -ENOMEM; - call->key = key; + call->key = key; + call->peer = rxrpc_kernel_get_peer(addr->peer); + call->service_id = server->service_id; /* marshall the parameters */ bp = call->request; *bp++ = htonl(FSGIVEUPALLCALLBACKS); - call->server = afs_use_server(server, afs_server_trace_give_up_cb); - afs_make_call(ac, call, GFP_NOFS); - return afs_wait_for_call_to_complete(call, ac); + call->server = afs_use_server(server, false, afs_server_trace_use_give_up_cb); + afs_make_call(call, GFP_NOFS); + afs_wait_for_call_to_complete(call); + ret = call->error; + if (call->responded) + set_bit(AFS_SERVER_FL_RESPONDING, &server->flags); + afs_put_call(call); + return ret; } /* @@ -1689,6 +1722,12 @@ static int afs_deliver_fs_get_capabilities(struct afs_call *call) return 0; } +static void afs_fs_get_capabilities_destructor(struct afs_call *call) +{ + afs_put_endpoint_state(call->probe, afs_estate_trace_put_getcaps); + afs_flat_call_destructor(call); +} + /* * FS.GetCapabilities operation type */ @@ -1697,7 +1736,8 @@ static const struct afs_call_type afs_RXFSGetCapabilities = { .op = afs_FS_GetCapabilities, .deliver = afs_deliver_fs_get_capabilities, .done = afs_fileserver_probe_result, - .destructor = afs_flat_call_destructor, + .immediate_cancel = afs_fileserver_probe_result, + .destructor = afs_fs_get_capabilities_destructor, }; /* @@ -1707,7 +1747,8 @@ static const struct afs_call_type afs_RXFSGetCapabilities = { * ->done() - otherwise we return false to indicate we didn't even try. */ bool afs_fs_get_capabilities(struct afs_net *net, struct afs_server *server, - struct afs_addr_cursor *ac, struct key *key) + struct afs_endpoint_state *estate, unsigned int addr_index, + struct key *key) { struct afs_call *call; __be32 *bp; @@ -1718,10 +1759,14 @@ bool afs_fs_get_capabilities(struct afs_net *net, struct afs_server *server, if (!call) return false; - call->key = key; - call->server = afs_use_server(server, afs_server_trace_get_caps); - call->upgrade = true; - call->async = true; + call->key = key; + call->server = afs_use_server(server, false, afs_server_trace_use_get_caps); + call->peer = rxrpc_kernel_get_peer(estate->addresses->addrs[addr_index].peer); + call->probe = afs_get_endpoint_state(estate, afs_estate_trace_get_getcaps); + call->probe_index = addr_index; + call->service_id = server->service_id; + call->upgrade = true; + call->async = true; call->max_lifespan = AFS_PROBE_MAX_LIFESPAN; /* marshall the parameters */ @@ -1729,7 +1774,7 @@ bool afs_fs_get_capabilities(struct afs_net *net, struct afs_server *server, *bp++ = htonl(FSGETCAPABILITIES); trace_afs_make_fs_call(call, NULL); - afs_make_call(ac, call, GFP_NOFS); + afs_make_call(call, GFP_NOFS); afs_put_call(call); return true; } @@ -1853,7 +1898,10 @@ static int afs_deliver_fs_inline_bulk_status(struct afs_call *call) return ret; bp = call->buffer; - xdr_decode_AFSVolSync(&bp, &op->volsync); + /* Unfortunately, prior to OpenAFS-1.6, volsync here is filled + * with rubbish. + */ + xdr_decode_AFSVolSync(&bp, NULL); call->unmarshall++; fallthrough; @@ -1899,7 +1947,7 @@ void afs_fs_inline_bulk_status(struct afs_operation *op) int i; if (test_bit(AFS_SERVER_FL_NO_IBULK, &op->server->flags)) { - op->error = -ENOTSUPP; + afs_op_set_error(op, -ENOTSUPP); return; } @@ -1928,6 +1976,7 @@ void afs_fs_inline_bulk_status(struct afs_operation *op) *bp++ = htonl(op->more_files[i].fid.unique); } + call->fid = vp->fid; trace_afs_make_fs_call(call, &vp->fid); afs_make_op_call(op, call, GFP_NOFS); } @@ -2033,6 +2082,7 @@ void afs_fs_fetch_acl(struct afs_operation *op) bp[2] = htonl(vp->fid.vnode); bp[3] = htonl(vp->fid.unique); + call->fid = vp->fid; trace_afs_make_fs_call(call, &vp->fid); afs_make_op_call(op, call, GFP_KERNEL); } @@ -2078,6 +2128,7 @@ void afs_fs_store_acl(struct afs_operation *op) if (acl->size != size) memset((void *)&bp[5] + acl->size, 0, size - acl->size); + call->fid = vp->fid; trace_afs_make_fs_call(call, &vp->fid); afs_make_op_call(op, call, GFP_KERNEL); } diff --git a/fs/afs/inode.c b/fs/afs/inode.c index 6d3a3dbe4928..dde1857fcabb 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -25,8 +25,94 @@ #include "internal.h" #include "afs_fs.h" +void afs_init_new_symlink(struct afs_vnode *vnode, struct afs_operation *op) +{ + size_t size = strlen(op->create.symlink) + 1; + size_t dsize = 0; + char *p; + + if (netfs_alloc_folioq_buffer(NULL, &vnode->directory, &dsize, size, + mapping_gfp_mask(vnode->netfs.inode.i_mapping)) < 0) + return; + + vnode->directory_size = dsize; + p = kmap_local_folio(folioq_folio(vnode->directory, 0), 0); + memcpy(p, op->create.symlink, size); + kunmap_local(p); + set_bit(AFS_VNODE_DIR_READ, &vnode->flags); + netfs_single_mark_inode_dirty(&vnode->netfs.inode); +} + +static void afs_put_link(void *arg) +{ + struct folio *folio = virt_to_folio(arg); + + kunmap_local(arg); + folio_put(folio); +} + +const char *afs_get_link(struct dentry *dentry, struct inode *inode, + struct delayed_call *callback) +{ + struct afs_vnode *vnode = AFS_FS_I(inode); + struct folio *folio; + char *content; + ssize_t ret; + + if (!dentry) { + /* RCU pathwalk. */ + if (!test_bit(AFS_VNODE_DIR_READ, &vnode->flags) || !afs_check_validity(vnode)) + return ERR_PTR(-ECHILD); + goto good; + } + + if (test_bit(AFS_VNODE_DIR_READ, &vnode->flags)) + goto fetch; + + ret = afs_validate(vnode, NULL); + if (ret < 0) + return ERR_PTR(ret); + + if (!test_and_clear_bit(AFS_VNODE_ZAP_DATA, &vnode->flags) && + test_bit(AFS_VNODE_DIR_READ, &vnode->flags)) + goto good; + +fetch: + ret = afs_read_single(vnode, NULL); + if (ret < 0) + return ERR_PTR(ret); + set_bit(AFS_VNODE_DIR_READ, &vnode->flags); + +good: + folio = folioq_folio(vnode->directory, 0); + folio_get(folio); + content = kmap_local_folio(folio, 0); + set_delayed_call(callback, afs_put_link, content); + return content; +} + +int afs_readlink(struct dentry *dentry, char __user *buffer, int buflen) +{ + DEFINE_DELAYED_CALL(done); + const char *content; + int len; + + content = afs_get_link(dentry, d_inode(dentry), &done); + if (IS_ERR(content)) { + do_delayed_call(&done); + return PTR_ERR(content); + } + + len = umin(strlen(content), buflen); + if (copy_to_user(buffer, content, len)) + len = -EFAULT; + do_delayed_call(&done); + return len; +} + static const struct inode_operations afs_symlink_inode_operations = { - .get_link = page_get_link, + .get_link = afs_get_link, + .readlink = afs_readlink, }; static noinline void dump_vnode(struct afs_vnode *vnode, struct afs_vnode *parent_vnode) @@ -58,7 +144,7 @@ static noinline void dump_vnode(struct afs_vnode *vnode, struct afs_vnode *paren */ static void afs_set_netfs_context(struct afs_vnode *vnode) { - netfs_inode_init(&vnode->netfs, &afs_req_ops); + netfs_inode_init(&vnode->netfs, &afs_req_ops, true); } /* @@ -85,14 +171,13 @@ static int afs_inode_init_from_status(struct afs_operation *op, write_seqlock(&vnode->cb_lock); - vnode->cb_v_break = op->cb_v_break; - vnode->cb_s_break = op->cb_s_break; + vnode->cb_v_check = op->cb_v_break; vnode->status = *status; t = status->mtime_client; - inode->i_ctime = t; - inode->i_mtime = t; - inode->i_atime = t; + inode_set_ctime_to_ts(inode, t); + inode_set_mtime_to_ts(inode, t); + inode_set_atime_to_ts(inode, t); inode->i_flags |= S_NOATIME; inode->i_uid = make_kuid(&init_user_ns, status->owner); inode->i_gid = make_kgid(&init_user_ns, status->group); @@ -111,7 +196,9 @@ static int afs_inode_init_from_status(struct afs_operation *op, inode->i_op = &afs_dir_inode_operations; inode->i_fop = &afs_dir_file_operations; inode->i_mapping->a_ops = &afs_dir_aops; - mapping_set_large_folios(inode->i_mapping); + __set_bit(NETFS_ICTX_SINGLE_NO_UPLOAD, &vnode->netfs.flags); + /* Assume locally cached directory data will be valid. */ + __set_bit(AFS_VNODE_DIR_VALID, &vnode->flags); break; case AFS_FTYPE_SYMLINK: /* Symlinks with a mode of 0644 are actually mountpoints. */ @@ -123,13 +210,13 @@ static int afs_inode_init_from_status(struct afs_operation *op, inode->i_mode = S_IFDIR | 0555; inode->i_op = &afs_mntpt_inode_operations; inode->i_fop = &afs_mntpt_file_operations; - inode->i_mapping->a_ops = &afs_symlink_aops; } else { inode->i_mode = S_IFLNK | status->mode; inode->i_op = &afs_symlink_inode_operations; - inode->i_mapping->a_ops = &afs_symlink_aops; } + inode->i_mapping->a_ops = &afs_dir_aops; inode_nohighmem(inode); + mapping_set_release_always(inode->i_mapping); break; default: dump_vnode(vnode, op->file[0].vnode != vnode ? op->file[0].vnode : NULL); @@ -141,16 +228,17 @@ static int afs_inode_init_from_status(struct afs_operation *op, afs_set_netfs_context(vnode); vnode->invalid_before = status->data_version; + trace_afs_set_dv(vnode, status->data_version); inode_set_iversion_raw(&vnode->netfs.inode, status->data_version); if (!vp->scb.have_cb) { /* it's a symlink we just created (the fileserver * didn't give us a callback) */ - vnode->cb_expires_at = ktime_get_real_seconds(); + afs_clear_cb_promise(vnode, afs_cb_promise_set_new_symlink); } else { - vnode->cb_expires_at = vp->scb.callback.expires_at; vnode->cb_server = op->server; - set_bit(AFS_VNODE_CB_PROMISED, &vnode->flags); + afs_set_cb_promise(vnode, vp->scb.callback.expires_at, + afs_cb_promise_set_new_inode); } write_sequnlock(&vnode->cb_lock); @@ -168,6 +256,7 @@ static void afs_apply_status(struct afs_operation *op, struct inode *inode = &vnode->netfs.inode; struct timespec64 t; umode_t mode; + bool unexpected_jump = false; bool data_changed = false; bool change_size = vp->set_size; @@ -204,17 +293,23 @@ static void afs_apply_status(struct afs_operation *op, } t = status->mtime_client; - inode->i_mtime = t; + inode_set_mtime_to_ts(inode, t); if (vp->update_ctime) - inode->i_ctime = op->ctime; + inode_set_ctime_to_ts(inode, op->ctime); - if (vnode->status.data_version != status->data_version) + if (vnode->status.data_version != status->data_version) { + trace_afs_set_dv(vnode, status->data_version); data_changed = true; + } vnode->status = *status; if (vp->dv_before + vp->dv_delta != status->data_version) { - if (test_bit(AFS_VNODE_CB_PROMISED, &vnode->flags)) + trace_afs_dv_mismatch(vnode, vp->dv_before, vp->dv_delta, + status->data_version); + + if (vnode->cb_ro_snapshot == atomic_read(&vnode->volume->cb_ro_snapshot) && + atomic64_read(&vnode->cb_expires_at) != AFS_NO_CB_PROMISE) pr_warn("kAFS: vnode modified {%llx:%llu} %llx->%llx %s (op=%x)\n", vnode->fid.vid, vnode->fid.vnode, (unsigned long long)vp->dv_before + vp->dv_delta, @@ -223,13 +318,13 @@ static void afs_apply_status(struct afs_operation *op, op->debug_id); vnode->invalid_before = status->data_version; - if (vnode->status.type == AFS_FTYPE_DIR) { - if (test_and_clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags)) - afs_stat_v(vnode, n_inval); - } else { + if (vnode->status.type == AFS_FTYPE_DIR) + afs_invalidate_dir(vnode, afs_dir_invalid_dv_mismatch); + else set_bit(AFS_VNODE_ZAP_DATA, &vnode->flags); - } change_size = true; + data_changed = true; + unexpected_jump = true; } else if (vnode->status.type == AFS_FTYPE_DIR) { /* Expected directory change is handled elsewhere so * that we can locally edit the directory and save on a @@ -249,11 +344,15 @@ static void afs_apply_status(struct afs_operation *op, * what's on the server. */ vnode->netfs.remote_i_size = status->size; - if (change_size) { + if (change_size || status->size > i_size_read(inode)) { afs_set_i_size(vnode, status->size); - inode->i_ctime = t; - inode->i_atime = t; + if (unexpected_jump) + vnode->netfs.zero_point = status->size; + inode_set_ctime_to_ts(inode, t); + inode_set_atime_to_ts(inode, t); } + if (op->ops == &afs_fetch_data_operation) + op->fetch.subreq->rreq->i_size = status->size; } } @@ -267,9 +366,9 @@ static void afs_apply_callback(struct afs_operation *op, struct afs_vnode *vnode = vp->vnode; if (!afs_cb_is_broken(vp->cb_break_before, vnode)) { - vnode->cb_expires_at = cb->expires_at; - vnode->cb_server = op->server; - set_bit(AFS_VNODE_CB_PROMISED, &vnode->flags); + if (op->volume->type == AFSVL_RWVOL) + vnode->cb_server = op->server; + afs_set_cb_promise(vnode, cb->expires_at, afs_cb_promise_set_apply_cb); } } @@ -328,9 +427,9 @@ static void afs_fetch_status_success(struct afs_operation *op) struct afs_vnode *vnode = vp->vnode; int ret; - if (vnode->netfs.inode.i_state & I_NEW) { + if (inode_state_read_once(&vnode->netfs.inode) & I_NEW) { ret = afs_inode_init_from_status(op, vp, vnode); - op->error = ret; + afs_op_set_error(op, ret); if (ret == 0) afs_cache_permit(vnode, op->key, vp->cb_break_before, &vp->scb); } else { @@ -431,7 +530,9 @@ static void afs_get_inode_cache(struct afs_vnode *vnode) } __packed key; struct afs_vnode_cache_aux aux; - if (vnode->status.type != AFS_FTYPE_FILE) { + if (vnode->status.type != AFS_FTYPE_FILE && + vnode->status.type != AFS_FTYPE_DIR && + vnode->status.type != AFS_FTYPE_SYMLINK) { vnode->netfs.cache = NULL; return; } @@ -449,7 +550,7 @@ static void afs_get_inode_cache(struct afs_vnode *vnode) 0 : FSCACHE_ADV_SINGLE_CHUNK, &key, sizeof(key), &aux, sizeof(aux), - vnode->status.size)); + i_size_read(&vnode->netfs.inode))); #endif } @@ -478,7 +579,7 @@ struct inode *afs_iget(struct afs_operation *op, struct afs_vnode_param *vp) inode, vnode->fid.vid, vnode->fid.vnode, vnode->fid.unique); /* deal with an existing inode */ - if (!(inode->i_state & I_NEW)) { + if (!(inode_state_read_once(inode) & I_NEW)) { _leave(" = %p", inode); return inode; } @@ -508,7 +609,7 @@ static int afs_iget5_set_root(struct inode *inode, void *opaque) struct afs_vnode *vnode = AFS_FS_I(inode); vnode->volume = as->volume; - vnode->fid.vid = as->volume->vid, + vnode->fid.vid = as->volume->vid; vnode->fid.vnode = 1; vnode->fid.unique = 1; inode->i_ino = 1; @@ -538,10 +639,10 @@ struct inode *afs_root_iget(struct super_block *sb, struct key *key) _debug("GOT ROOT INODE %p { vl=%llx }", inode, as->volume->vid); - BUG_ON(!(inode->i_state & I_NEW)); + BUG_ON(!(inode_state_read_once(inode) & I_NEW)); vnode = AFS_FS_I(inode); - vnode->cb_v_break = as->volume->cb_v_break, + vnode->cb_v_check = atomic_read(&as->volume->cb_v_break); afs_set_netfs_context(vnode); op = afs_alloc_operation(key, as->volume); @@ -572,184 +673,21 @@ error: } /* - * mark the data attached to an inode as obsolete due to a write on the server - * - might also want to ditch all the outstanding writes and dirty pages - */ -static void afs_zap_data(struct afs_vnode *vnode) -{ - _enter("{%llx:%llu}", vnode->fid.vid, vnode->fid.vnode); - - afs_invalidate_cache(vnode, 0); - - /* nuke all the non-dirty pages that aren't locked, mapped or being - * written back in a regular file and completely discard the pages in a - * directory or symlink */ - if (S_ISREG(vnode->netfs.inode.i_mode)) - invalidate_remote_inode(&vnode->netfs.inode); - else - invalidate_inode_pages2(vnode->netfs.inode.i_mapping); -} - -/* - * Check to see if we have a server currently serving this volume and that it - * hasn't been reinitialised or dropped from the list. - */ -static bool afs_check_server_good(struct afs_vnode *vnode) -{ - struct afs_server_list *slist; - struct afs_server *server; - bool good; - int i; - - if (vnode->cb_fs_s_break == atomic_read(&vnode->volume->cell->fs_s_break)) - return true; - - rcu_read_lock(); - - slist = rcu_dereference(vnode->volume->servers); - for (i = 0; i < slist->nr_servers; i++) { - server = slist->servers[i].server; - if (server == vnode->cb_server) { - good = (vnode->cb_s_break == server->cb_s_break); - rcu_read_unlock(); - return good; - } - } - - rcu_read_unlock(); - return false; -} - -/* - * Check the validity of a vnode/inode. - */ -bool afs_check_validity(struct afs_vnode *vnode) -{ - enum afs_cb_break_reason need_clear = afs_cb_break_no_break; - time64_t now = ktime_get_real_seconds(); - unsigned int cb_break; - int seq = 0; - - do { - read_seqbegin_or_lock(&vnode->cb_lock, &seq); - cb_break = vnode->cb_break; - - if (test_bit(AFS_VNODE_CB_PROMISED, &vnode->flags)) { - if (vnode->cb_v_break != vnode->volume->cb_v_break) - need_clear = afs_cb_break_for_v_break; - else if (!afs_check_server_good(vnode)) - need_clear = afs_cb_break_for_s_reinit; - else if (test_bit(AFS_VNODE_ZAP_DATA, &vnode->flags)) - need_clear = afs_cb_break_for_zap; - else if (vnode->cb_expires_at - 10 <= now) - need_clear = afs_cb_break_for_lapsed; - } else if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) { - ; - } else { - need_clear = afs_cb_break_no_promise; - } - - } while (need_seqretry(&vnode->cb_lock, seq)); - - done_seqretry(&vnode->cb_lock, seq); - - if (need_clear == afs_cb_break_no_break) - return true; - - write_seqlock(&vnode->cb_lock); - if (need_clear == afs_cb_break_no_promise) - vnode->cb_v_break = vnode->volume->cb_v_break; - else if (cb_break == vnode->cb_break) - __afs_break_callback(vnode, need_clear); - else - trace_afs_cb_miss(&vnode->fid, need_clear); - write_sequnlock(&vnode->cb_lock); - return false; -} - -/* - * validate a vnode/inode - * - there are several things we need to check - * - parent dir data changes (rm, rmdir, rename, mkdir, create, link, - * symlink) - * - parent dir metadata changed (security changes) - * - dentry data changed (write, truncate) - * - dentry metadata changed (security changes) - */ -int afs_validate(struct afs_vnode *vnode, struct key *key) -{ - int ret; - - _enter("{v={%llx:%llu} fl=%lx},%x", - vnode->fid.vid, vnode->fid.vnode, vnode->flags, - key_serial(key)); - - if (unlikely(test_bit(AFS_VNODE_DELETED, &vnode->flags))) { - if (vnode->netfs.inode.i_nlink) - clear_nlink(&vnode->netfs.inode); - goto valid; - } - - if (test_bit(AFS_VNODE_CB_PROMISED, &vnode->flags) && - afs_check_validity(vnode)) - goto valid; - - down_write(&vnode->validate_lock); - - /* if the promise has expired, we need to check the server again to get - * a new promise - note that if the (parent) directory's metadata was - * changed then the security may be different and we may no longer have - * access */ - if (!test_bit(AFS_VNODE_CB_PROMISED, &vnode->flags)) { - _debug("not promised"); - ret = afs_fetch_status(vnode, key, false, NULL); - if (ret < 0) { - if (ret == -ENOENT) { - set_bit(AFS_VNODE_DELETED, &vnode->flags); - ret = -ESTALE; - } - goto error_unlock; - } - _debug("new promise [fl=%lx]", vnode->flags); - } - - if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) { - _debug("file already deleted"); - ret = -ESTALE; - goto error_unlock; - } - - /* if the vnode's data version number changed then its contents are - * different */ - if (test_and_clear_bit(AFS_VNODE_ZAP_DATA, &vnode->flags)) - afs_zap_data(vnode); - up_write(&vnode->validate_lock); -valid: - _leave(" = 0"); - return 0; - -error_unlock: - up_write(&vnode->validate_lock); - _leave(" = %d", ret); - return ret; -} - -/* * read the attributes of an inode */ -int afs_getattr(struct user_namespace *mnt_userns, const struct path *path, +int afs_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); struct afs_vnode *vnode = AFS_FS_I(inode); struct key *key; - int ret, seq = 0; + int ret, seq; _enter("{ ino=%lu v=%u }", inode->i_ino, inode->i_generation); if (vnode->volume && !(query_flags & AT_STATX_DONT_SYNC) && - !test_bit(AFS_VNODE_CB_PROMISED, &vnode->flags)) { + atomic64_read(&vnode->cb_expires_at) == AFS_NO_CB_PROMISE) { key = afs_request_key(vnode->volume->cell); if (IS_ERR(key)) return PTR_ERR(key); @@ -760,14 +698,20 @@ int afs_getattr(struct user_namespace *mnt_userns, const struct path *path, } do { - read_seqbegin_or_lock(&vnode->cb_lock, &seq); - generic_fillattr(&init_user_ns, inode, stat); + seq = read_seqbegin(&vnode->cb_lock); + generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); if (test_bit(AFS_VNODE_SILLY_DELETED, &vnode->flags) && stat->nlink > 0) stat->nlink -= 1; - } while (need_seqretry(&vnode->cb_lock, seq)); - done_seqretry(&vnode->cb_lock, seq); + /* Lie about the size of directories. We maintain a locally + * edited copy and may make different allocation decisions on + * it, but we need to give userspace the server's size. + */ + if (S_ISDIR(inode->i_mode)) + stat->size = vnode->netfs.remote_i_size; + } while (read_seqretry(&vnode->cb_lock, seq)); + return 0; } @@ -779,9 +723,9 @@ int afs_drop_inode(struct inode *inode) _enter(""); if (test_bit(AFS_VNODE_PSEUDODIR, &AFS_FS_I(inode)->flags)) - return generic_delete_inode(inode); + return inode_just_drop(inode); else - return generic_drop_inode(inode); + return inode_generic_drop(inode); } /* @@ -790,6 +734,7 @@ int afs_drop_inode(struct inode *inode) void afs_evict_inode(struct inode *inode) { struct afs_vnode_cache_aux aux; + struct afs_super_info *sbi = AFS_FS_S(inode->i_sb); struct afs_vnode *vnode = AFS_FS_I(inode); _enter("{%llx:%llu.%d}", @@ -801,10 +746,25 @@ void afs_evict_inode(struct inode *inode) ASSERTCMP(inode->i_ino, ==, vnode->fid.vnode); + if ((S_ISDIR(inode->i_mode) || + S_ISLNK(inode->i_mode)) && + (inode_state_read_once(inode) & I_DIRTY) && + !sbi->dyn_root) { + struct writeback_control wbc = { + .sync_mode = WB_SYNC_ALL, + .for_sync = true, + .range_end = LLONG_MAX, + }; + + afs_single_writepages(inode->i_mapping, &wbc); + } + + netfs_wait_for_outstanding_io(inode); truncate_inode_pages_final(&inode->i_data); + netfs_free_folioq_buffer(vnode->directory); afs_set_cache_aux(vnode, &aux); - fscache_clear_inode_writeback(afs_vnode_cache(vnode), inode, &aux); + netfs_clear_inode_writeback(inode, &aux); clear_inode(inode); while (!list_empty(&vnode->wb_keys)) { @@ -846,17 +806,22 @@ static void afs_setattr_success(struct afs_operation *op) static void afs_setattr_edit_file(struct afs_operation *op) { struct afs_vnode_param *vp = &op->file[0]; - struct inode *inode = &vp->vnode->netfs.inode; + struct afs_vnode *vnode = vp->vnode; + struct inode *inode = &vnode->netfs.inode; if (op->setattr.attr->ia_valid & ATTR_SIZE) { loff_t size = op->setattr.attr->ia_size; - loff_t i_size = op->setattr.old_i_size; + loff_t old = op->setattr.old_i_size; - if (size < i_size) + /* Note: inode->i_size was updated by afs_apply_status() inside + * the I/O and callback locks. + */ + + if (size != old) { truncate_pagecache(inode, size); - if (size != i_size) - fscache_resize_cookie(afs_vnode_cache(vp->vnode), - vp->scb.status.size); + netfs_resize_file(&vnode->netfs, size, true); + fscache_resize_cookie(afs_vnode_cache(vnode), size); + } } } @@ -870,7 +835,7 @@ static const struct afs_operation_ops afs_setattr_operation = { /* * set the attributes of an inode */ -int afs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, +int afs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { const unsigned int supported = @@ -924,11 +889,11 @@ int afs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, */ if (!(attr->ia_valid & (supported & ~ATTR_SIZE & ~ATTR_MTIME)) && attr->ia_size < i_size && - attr->ia_size > vnode->status.size) { - truncate_pagecache(inode, attr->ia_size); + attr->ia_size > vnode->netfs.remote_i_size) { + truncate_setsize(inode, attr->ia_size); + netfs_resize_file(&vnode->netfs, size, false); fscache_resize_cookie(afs_vnode_cache(vnode), attr->ia_size); - i_size_write(inode, attr->ia_size); ret = 0; goto out_unlock; } diff --git a/fs/afs/internal.h b/fs/afs/internal.h index fd8567b98e2b..009064b8d661 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -9,6 +9,7 @@ #include <linux/kernel.h> #include <linux/ktime.h> #include <linux/fs.h> +#include <linux/filelock.h> #include <linux/pagemap.h> #include <linux/rxrpc.h> #include <linux/key.h> @@ -19,6 +20,7 @@ #include <linux/uuid.h> #include <linux/mm_types.h> #include <linux/dns_resolver.h> +#include <crypto/krb5.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <net/sock.h> @@ -32,6 +34,7 @@ struct pagevec; struct afs_call; struct afs_vnode; +struct afs_server_probe; /* * Partial file-locking emulation mode. (The problem being that AFS3 only @@ -72,21 +75,51 @@ enum afs_call_state { }; /* + * Address preferences. + */ +struct afs_addr_preference { + union { + struct in_addr ipv4_addr; /* AF_INET address to compare against */ + struct in6_addr ipv6_addr; /* AF_INET6 address to compare against */ + }; + sa_family_t family; /* Which address to use */ + u16 prio; /* Priority */ + u8 subnet_mask; /* How many bits to compare */ +}; + +struct afs_addr_preference_list { + struct rcu_head rcu; + u16 version; /* Incremented when prefs list changes */ + u8 ipv6_off; /* Offset of IPv6 addresses */ + u8 nr; /* Number of addresses in total */ + u8 max_prefs; /* Number of prefs allocated */ + struct afs_addr_preference prefs[] __counted_by(max_prefs); +}; + +struct afs_address { + struct rxrpc_peer *peer; + short last_error; /* Last error from this address */ + u16 prio; /* Address priority */ +}; + +/* * List of server addresses. */ struct afs_addr_list { struct rcu_head rcu; refcount_t usage; u32 version; /* Version */ + unsigned int debug_id; + unsigned int addr_pref_version; /* Version of address preference list */ unsigned char max_addrs; unsigned char nr_addrs; unsigned char preferred; /* Preferred address */ unsigned char nr_ipv4; /* Number of IPv4 addresses */ enum dns_record_source source:8; enum dns_lookup_status status:8; - unsigned long failed; /* Mask of addrs that failed locally/ICMP */ + unsigned long probe_failed; /* Mask of addrs that failed locally/ICMP */ unsigned long responded; /* Mask of addrs that responded */ - struct sockaddr_rxrpc addrs[]; + struct afs_address addrs[] __counted_by(max_addrs); #define AFS_MAX_ADDRESSES ((unsigned int)(sizeof(unsigned long) * 8)) }; @@ -95,11 +128,12 @@ struct afs_addr_list { */ struct afs_call { const struct afs_call_type *type; /* type of call */ - struct afs_addr_list *alist; /* Address is alist[addr_ix] */ wait_queue_head_t waitq; /* processes awaiting completion */ struct work_struct async_work; /* async I/O processor */ struct work_struct work; /* actual work processor */ + struct work_struct free_work; /* Deferred free processor */ struct rxrpc_call *rxcall; /* RxRPC call handle */ + struct rxrpc_peer *peer; /* Remote endpoint */ struct key *key; /* security for this call */ struct afs_net *net; /* The network namespace */ struct afs_server *server; /* The fileserver record if fs op (pins ref) */ @@ -115,11 +149,14 @@ struct afs_call { }; void *buffer; /* reply receive buffer */ union { - long ret0; /* Value to reply with instead of 0 */ + struct afs_endpoint_state *probe; + struct afs_addr_list *vl_probe; struct afs_addr_list *ret_alist; struct afs_vldb_entry *ret_vldb; char *ret_str; }; + struct afs_fid fid; /* Primary vnode ID (or all zeroes) */ + unsigned char probe_index; /* Address in ->probe_alist */ struct afs_operation *op; unsigned int server_index; refcount_t ref; @@ -127,20 +164,23 @@ struct afs_call { spinlock_t state_lock; int error; /* error code */ u32 abort_code; /* Remote abort ID or 0 */ - unsigned int max_lifespan; /* Maximum lifespan to set if not 0 */ + unsigned long long remaining; /* How much is left to receive */ + unsigned int max_lifespan; /* Maximum lifespan in secs to set if not 0 */ unsigned request_size; /* size of request data */ unsigned reply_max; /* maximum size of reply */ unsigned count2; /* count used in unmarshalling */ unsigned char unmarshall; /* unmarshalling phase */ - unsigned char addr_ix; /* Address in ->alist */ bool drop_ref; /* T if need to drop ref for incoming call */ bool need_attention; /* T if RxRPC poked us */ bool async; /* T if asynchronous */ bool upgrade; /* T to request service upgrade */ bool intr; /* T if interruptible */ bool unmarshalling_error; /* T if an unmarshalling error occurred */ + bool responded; /* Got a response from the call (may be abort) */ + u8 security_ix; /* Security class */ u16 service_id; /* Actual service ID (after upgrade) */ unsigned int debug_id; /* Trace ID */ + u32 enctype; /* Security encoding type */ u32 operation_ID; /* operation ID for an incoming call */ u32 count; /* count for use in unmarshalling */ union { /* place to extract temporary data */ @@ -165,11 +205,17 @@ struct afs_call_type { /* clean up a call */ void (*destructor)(struct afs_call *call); + /* Async receive processing function */ + void (*async_rx)(struct work_struct *work); + /* Work function */ void (*work)(struct work_struct *work); /* Call done function (gets called immediately on success or failure) */ void (*done)(struct afs_call *call); + + /* Handle a call being immediately cancelled. */ + void (*immediate_cancel)(struct afs_call *call); }; /* @@ -197,28 +243,6 @@ static inline struct key *afs_file_key(struct file *file) } /* - * Record of an outstanding read operation on a vnode. - */ -struct afs_read { - loff_t pos; /* Where to start reading */ - loff_t len; /* How much we're asking for */ - loff_t actual_len; /* How much we're actually getting */ - loff_t file_size; /* File size returned by server */ - struct key *key; /* The key to use to reissue the read */ - struct afs_vnode *vnode; /* The file being read into. */ - struct netfs_io_subrequest *subreq; /* Fscache helper read request this belongs to */ - afs_dataversion_t data_version; /* Version number returned by server */ - refcount_t usage; - unsigned int call_debug_id; - unsigned int nr_pages; - int error; - void (*done)(struct afs_read *); - void (*cleanup)(struct afs_read *); - struct iov_iter *iter; /* Iterator representing the buffer */ - struct iov_iter def_iter; /* Default iterator */ -}; - -/* * AFS superblock private data * - there's one superblock per volume */ @@ -260,15 +284,15 @@ struct afs_net { struct socket *socket; struct afs_call *spare_incoming_call; struct work_struct charge_preallocation_work; + struct work_struct rx_oob_work; struct mutex socket_mutex; atomic_t nr_outstanding_calls; atomic_t nr_superblocks; /* Cell database */ struct rb_root cells; - struct afs_cell *ws_cell; - struct work_struct cells_manager; - struct timer_list cells_timer; + struct idr cells_dyn_ino; /* cell->dynroot_ino mapping */ + struct afs_cell __rcu *ws_cell; atomic_t cells_outstanding; struct rw_semaphore cells_lock; struct mutex cells_alias_lock; @@ -280,19 +304,12 @@ struct afs_net { * cell, but in practice, people create aliases and subsets and there's * no easy way to distinguish them. */ - seqlock_t fs_lock; /* For fs_servers, fs_probe_*, fs_proc */ - struct rb_root fs_servers; /* afs_server (by server UUID or address) */ + seqlock_t fs_lock; /* For fs_probe_*, fs_proc */ struct list_head fs_probe_fast; /* List of afs_server to probe at 30s intervals */ struct list_head fs_probe_slow; /* List of afs_server to probe at 5m intervals */ struct hlist_head fs_proc; /* procfs servers list */ - struct hlist_head fs_addresses4; /* afs_server (by lowest IPv4 addr) */ - struct hlist_head fs_addresses6; /* afs_server (by lowest IPv6 addr) */ - seqlock_t fs_addr_lock; /* For fs_addresses[46] */ - - struct work_struct fs_manager; - struct timer_list fs_timer; - + struct key *fs_cm_token_key; /* Key for creating CM tokens */ struct work_struct fs_prober; struct timer_list fs_probe_timer; atomic_t servers_outstanding; @@ -305,6 +322,8 @@ struct afs_net { struct proc_dir_entry *proc_afs; /* /proc/net/afs directory */ struct afs_sysnames *sysnames; rwlock_t sysnames_lock; + struct afs_addr_preference_list __rcu *address_prefs; + u16 address_pref_version; /* Statistics counters */ atomic_t n_lookup; /* Number of lookups done */ @@ -323,13 +342,11 @@ struct afs_net { extern const char afs_init_sysname[]; enum afs_cell_state { - AFS_CELL_UNSET, - AFS_CELL_ACTIVATING, + AFS_CELL_SETTING_UP, + AFS_CELL_UNLOOKED, AFS_CELL_ACTIVE, - AFS_CELL_DEACTIVATING, - AFS_CELL_INACTIVE, - AFS_CELL_FAILED, - AFS_CELL_REMOVED, + AFS_CELL_REMOVING, + AFS_CELL_DEAD, }; /* @@ -360,7 +377,9 @@ struct afs_cell { struct afs_cell *alias_of; /* The cell this is an alias of */ struct afs_volume *root_volume; /* The root.cell volume if there is one */ struct key *anonymous_key; /* anonymous user key for this cell */ + struct work_struct destroyer; /* Destroyer for cell */ struct work_struct manager; /* Manager for init/deinit/dns */ + struct timer_list management_timer; /* General management timer */ struct hlist_node proc_link; /* /proc cell list link */ time64_t dns_expiry; /* Time AFSDB/SRV record expires */ time64_t last_inactive; /* Time of last drop of usage count */ @@ -376,18 +395,17 @@ struct afs_cell { enum dns_lookup_status dns_status:8; /* Latest status of data from lookup */ unsigned int dns_lookup_count; /* Counter of DNS lookups */ unsigned int debug_id; + unsigned int dynroot_ino; /* Inode numbers for dynroot (a pair) */ /* The volumes belonging to this cell */ + struct rw_semaphore vs_lock; /* Lock for server->volumes */ struct rb_root volumes; /* Tree of volumes on this server */ struct hlist_head proc_volumes; /* procfs volume list */ seqlock_t volume_lock; /* For volumes */ /* Active fileserver interaction state. */ struct rb_root fs_servers; /* afs_server (by server UUID) */ - seqlock_t fs_lock; /* For fs_servers */ - struct rw_semaphore fs_open_mmaps_lock; - struct list_head fs_open_mmaps; /* List of vnodes that are mmapped */ - atomic_t fs_s_break; /* Counter of CB.InitCallBackState messages */ + struct rw_semaphore fs_lock; /* For fs_servers */ /* VL server list. */ rwlock_t vl_servers_lock; /* Lock on vl_servers */ @@ -395,6 +413,7 @@ struct afs_cell { u8 name_len; /* Length of name */ char *name; /* Cell name, case-flattened and NUL-padded */ + char *key_desc; /* Authentication key description */ }; /* @@ -411,13 +430,14 @@ struct afs_vlserver { rwlock_t lock; /* Lock on addresses */ refcount_t ref; unsigned int rtt; /* Server's current RTT in uS */ + unsigned int debug_id; /* Probe state */ wait_queue_head_t probe_wq; atomic_t probe_outstanding; spinlock_t probe_lock; struct { - unsigned int rtt; /* RTT in uS */ + unsigned int rtt; /* Best RTT in uS (or UINT_MAX) */ u32 abort_code; short error; unsigned short flags; @@ -427,6 +447,7 @@ struct afs_vlserver { #define AFS_VLSERVER_PROBE_LOCAL_FAILURE 0x08 /* A local failure prevented a probe */ } probe; + u16 service_id; /* Service ID we're using */ u16 port; u16 name_len; /* Length of name */ char name[]; /* Server name, case-flattened */ @@ -476,6 +497,7 @@ struct afs_vldb_entry { #define AFS_VOL_VTM_RW 0x01 /* R/W version of the volume is available (on this server) */ #define AFS_VOL_VTM_RO 0x02 /* R/O version of the volume is available (on this server) */ #define AFS_VOL_VTM_BAK 0x04 /* backup version of the volume is available (on this server) */ + u8 vlsf_flags[AFS_NMAXNSERVERS]; short error; u8 nr_servers; /* Number of server records */ u8 name_len; @@ -483,6 +505,32 @@ struct afs_vldb_entry { }; /* + * Fileserver endpoint state. The records the addresses of a fileserver's + * endpoints and the state and result of a round of probing on them. This + * allows the rotation algorithm to access those results without them being + * erased by a subsequent round of probing. + */ +struct afs_endpoint_state { + struct rcu_head rcu; + struct afs_addr_list *addresses; /* The addresses being probed */ + unsigned long responsive_set; /* Bitset of responsive endpoints */ + unsigned long failed_set; /* Bitset of endpoints we failed to probe */ + refcount_t ref; + unsigned int server_id; /* Debug ID of server */ + unsigned int probe_seq; /* Probe sequence (from server::probe_counter) */ + atomic_t nr_probing; /* Number of outstanding probes */ + unsigned int rtt; /* Best RTT in uS (or UINT_MAX) */ + s32 abort_code; + short error; + unsigned long flags; +#define AFS_ESTATE_RESPONDED 0 /* Set if the server responded */ +#define AFS_ESTATE_SUPERSEDED 1 /* Set if this record has been superseded */ +#define AFS_ESTATE_IS_YFS 2 /* Set if probe upgraded to YFS */ +#define AFS_ESTATE_NOT_YFS 3 /* Set if probe didn't upgrade to YFS */ +#define AFS_ESTATE_LOCAL_FAILURE 4 /* Set if there was a local failure (eg. ENOMEM) */ +}; + +/* * Record of fileserver with which we're actively communicating. */ struct afs_server { @@ -492,70 +540,76 @@ struct afs_server { struct afs_uuid _uuid; }; - struct afs_addr_list __rcu *addresses; struct afs_cell *cell; /* Cell to which belongs (pins ref) */ - struct rb_node uuid_rb; /* Link in net->fs_servers */ - struct afs_server __rcu *uuid_next; /* Next server with same UUID */ - struct afs_server *uuid_prev; /* Previous server with same UUID */ - struct list_head probe_link; /* Link in net->fs_probe_list */ - struct hlist_node addr4_link; /* Link in net->fs_addresses4 */ - struct hlist_node addr6_link; /* Link in net->fs_addresses6 */ + struct rb_node uuid_rb; /* Link in cell->fs_servers */ + struct list_head probe_link; /* Link in net->fs_probe_* */ struct hlist_node proc_link; /* Link in net->fs_proc */ - struct work_struct initcb_work; /* Work for CB.InitCallBackState* */ - struct afs_server *gc_next; /* Next server in manager's list */ + struct list_head volumes; /* RCU list of afs_server_entry objects */ + struct work_struct destroyer; /* Work item to try and destroy a server */ + struct timer_list timer; /* Management timer */ + struct mutex cm_token_lock; /* Lock governing creation of appdata */ + struct krb5_buffer cm_rxgk_appdata; /* Appdata to be included in RESPONSE packet */ time64_t unuse_time; /* Time at which last unused */ unsigned long flags; #define AFS_SERVER_FL_RESPONDING 0 /* The server is responding */ #define AFS_SERVER_FL_UPDATING 1 #define AFS_SERVER_FL_NEEDS_UPDATE 2 /* Fileserver address list is out of date */ -#define AFS_SERVER_FL_NOT_READY 4 /* The record is not ready for use */ -#define AFS_SERVER_FL_NOT_FOUND 5 /* VL server says no such server */ -#define AFS_SERVER_FL_VL_FAIL 6 /* Failed to access VL server */ +#define AFS_SERVER_FL_UNCREATED 3 /* The record needs creating */ +#define AFS_SERVER_FL_CREATING 4 /* The record is being created */ +#define AFS_SERVER_FL_EXPIRED 5 /* The record has expired */ +#define AFS_SERVER_FL_NOT_FOUND 6 /* VL server says no such server */ +#define AFS_SERVER_FL_VL_FAIL 7 /* Failed to access VL server */ #define AFS_SERVER_FL_MAY_HAVE_CB 8 /* May have callbacks on this fileserver */ #define AFS_SERVER_FL_IS_YFS 16 /* Server is YFS not AFS */ #define AFS_SERVER_FL_NO_IBULK 17 /* Fileserver doesn't support FS.InlineBulkStatus */ #define AFS_SERVER_FL_NO_RM2 18 /* Fileserver doesn't support YFS.RemoveFile2 */ #define AFS_SERVER_FL_HAS_FS64 19 /* Fileserver supports FS.{Fetch,Store}Data64 */ +#define AFS_SERVER_FL_NO_RENAME2 20 /* YFS Fileserver doesn't support enhanced rename */ refcount_t ref; /* Object refcount */ atomic_t active; /* Active user count */ u32 addr_version; /* Address list version */ + u16 service_id; /* Service ID we're using. */ + short create_error; /* Creation error */ unsigned int rtt; /* Server's current RTT in uS */ unsigned int debug_id; /* Debugging ID for traces */ /* file service access */ rwlock_t fs_lock; /* access lock */ - /* callback promise management */ - unsigned cb_s_break; /* Break-everything counter. */ - /* Probe state */ + struct afs_endpoint_state __rcu *endpoint_state; /* Latest endpoint/probe state */ unsigned long probed_at; /* Time last probe was dispatched (jiffies) */ wait_queue_head_t probe_wq; - atomic_t probe_outstanding; + unsigned int probe_counter; /* Number of probes issued */ spinlock_t probe_lock; - struct { - unsigned int rtt; /* RTT in uS */ - u32 abort_code; - short error; - bool responded:1; - bool is_yfs:1; - bool not_yfs:1; - bool local_failure:1; - } probe; }; +enum afs_ro_replicating { + AFS_RO_NOT_REPLICATING, /* Not doing replication */ + AFS_RO_REPLICATING_USE_OLD, /* Replicating; use old version */ + AFS_RO_REPLICATING_USE_NEW, /* Replicating; switch to new version */ +} __mode(byte); + /* * Replaceable volume server list. */ struct afs_server_entry { struct afs_server *server; + struct afs_volume *volume; + struct list_head slink; /* Link in server->volumes */ + time64_t cb_expires_at; /* Time at which volume-level callback expires */ + unsigned long flags; +#define AFS_SE_EXCLUDED 0 /* Set if server is to be excluded in rotation */ +#define AFS_SE_VOLUME_OFFLINE 1 /* Set if volume offline notice given */ +#define AFS_SE_VOLUME_BUSY 2 /* Set if volume busy notice given */ }; struct afs_server_list { - afs_volid_t vids[AFS_MAXTYPES]; /* Volume IDs */ + struct rcu_head rcu; refcount_t usage; + bool attached; /* T if attached to servers */ + enum afs_ro_replicating ro_replicating; /* RW->RO update (probably) in progress */ unsigned char nr_servers; - unsigned char preferred; /* Preferred server */ unsigned short vnovol_mask; /* Servers to be skipped due to VNOVOL */ unsigned int seq; /* Set to ->servers_seq when installed */ rwlock_t lock; @@ -566,24 +620,24 @@ struct afs_server_list { * Live AFS volume management. */ struct afs_volume { - union { - struct rcu_head rcu; - afs_volid_t vid; /* volume ID */ - }; + struct rcu_head rcu; + afs_volid_t vid; /* The volume ID of this volume */ + afs_volid_t vids[AFS_MAXTYPES]; /* All associated volume IDs */ refcount_t ref; + unsigned int debug_id; /* Debugging ID for traces */ time64_t update_at; /* Time at which to next update */ struct afs_cell *cell; /* Cell to which belongs (pins ref) */ struct rb_node cell_node; /* Link in cell->volumes */ struct hlist_node proc_link; /* Link in cell->proc_volumes */ struct super_block __rcu *sb; /* Superblock on which inodes reside */ + struct work_struct destructor; /* Deferred destructor */ unsigned long flags; #define AFS_VOLUME_NEEDS_UPDATE 0 /* - T if an update needs performing */ #define AFS_VOLUME_UPDATING 1 /* - T if an update is in progress */ #define AFS_VOLUME_WAIT 2 /* - T if users must wait for update */ #define AFS_VOLUME_DELETED 3 /* - T if volume appears deleted */ -#define AFS_VOLUME_OFFLINE 4 /* - T if volume offline notice given */ -#define AFS_VOLUME_BUSY 5 /* - T if volume busy notice given */ -#define AFS_VOLUME_MAYBE_NO_IBULK 6 /* - T if some servers don't have InlineBulkStatus */ +#define AFS_VOLUME_MAYBE_NO_IBULK 4 /* - T if some servers don't have InlineBulkStatus */ +#define AFS_VOLUME_RM_TREE 5 /* - Set if volume removed from cell->volumes */ #ifdef CONFIG_AFS_FSCACHE struct fscache_volume *cache; /* Caching cookie */ #endif @@ -591,8 +645,21 @@ struct afs_volume { rwlock_t servers_lock; /* Lock for ->servers */ unsigned int servers_seq; /* Incremented each time ->servers changes */ - unsigned cb_v_break; /* Break-everything counter. */ + /* RO release tracking */ + struct mutex volsync_lock; /* Time/state evaluation lock */ + time64_t creation_time; /* Volume creation time (or TIME64_MIN) */ + time64_t update_time; /* Volume update time (or TIME64_MIN) */ + + /* Callback management */ + struct mutex cb_check_lock; /* Lock to control race to check after v_break */ + time64_t cb_expires_at; /* Earliest volume callback expiry time */ + atomic_t cb_ro_snapshot; /* RO volume update-from-snapshot counter */ + atomic_t cb_v_break; /* Volume-break event counter. */ + atomic_t cb_v_check; /* Volume-break has-been-checked counter. */ + atomic_t cb_scrub; /* Scrub-all-data event counter. */ rwlock_t cb_v_break_lock; + struct rw_semaphore open_mmaps_lock; + struct list_head open_mmaps; /* List of vnodes that are mmapped */ afs_voltype_t type; /* type of volume */ char type_force; /* force volume type (suppress R/O -> R/W) */ @@ -624,25 +691,26 @@ struct afs_vnode { struct afs_file_status status; /* AFS status info for this file */ afs_dataversion_t invalid_before; /* Child dentries are invalid before this */ struct afs_permits __rcu *permit_cache; /* cache of permits so far obtained */ - struct mutex io_lock; /* Lock for serialising I/O on this mutex */ + struct list_head io_lock_waiters; /* Threads waiting for the I/O lock */ struct rw_semaphore validate_lock; /* lock for validating this vnode */ struct rw_semaphore rmdir_lock; /* Lock for rmdir vs sillyrename */ struct key *silly_key; /* Silly rename key */ spinlock_t wb_lock; /* lock for wb_keys */ spinlock_t lock; /* waitqueue/flags lock */ unsigned long flags; -#define AFS_VNODE_CB_PROMISED 0 /* Set if vnode has a callback promise */ +#define AFS_VNODE_IO_LOCK 0 /* Set if the I/O serialisation lock is held */ #define AFS_VNODE_UNSET 1 /* set if vnode attributes not yet set */ #define AFS_VNODE_DIR_VALID 2 /* Set if dir contents are valid */ #define AFS_VNODE_ZAP_DATA 3 /* set if vnode's data should be invalidated */ #define AFS_VNODE_DELETED 4 /* set if vnode deleted on server */ #define AFS_VNODE_MOUNTPOINT 5 /* set if vnode is a mountpoint symlink */ -#define AFS_VNODE_AUTOCELL 6 /* set if Vnode is an auto mount point */ #define AFS_VNODE_PSEUDODIR 7 /* set if Vnode is a pseudo directory */ #define AFS_VNODE_NEW_CONTENT 8 /* Set if file has new content (create/trunc-0) */ #define AFS_VNODE_SILLY_DELETED 9 /* Set if file has been silly-deleted */ #define AFS_VNODE_MODIFYING 10 /* Set if we're performing a modification op */ +#define AFS_VNODE_DIR_READ 11 /* Set if we've read a dir's contents */ + struct folio_queue *directory; /* Directory contents */ struct list_head wb_keys; /* List of keys available for writeback */ struct list_head pending_locks; /* locks waiting to be granted */ struct list_head granted_locks; /* locks granted on this file */ @@ -651,19 +719,21 @@ struct afs_vnode { ktime_t locked_at; /* Time at which lock obtained */ enum afs_lock_state lock_state : 8; afs_lock_type_t lock_type : 8; + unsigned int directory_size; /* Amount of space in ->directory */ /* outstanding callback notification on this file */ struct work_struct cb_work; /* Work for mmap'd files */ struct list_head cb_mmap_link; /* Link in cell->fs_open_mmaps */ void *cb_server; /* Server with callback/filelock */ atomic_t cb_nr_mmap; /* Number of mmaps */ - unsigned int cb_fs_s_break; /* Mass server break counter (cell->fs_s_break) */ - unsigned int cb_s_break; /* Mass break counter on ->server */ - unsigned int cb_v_break; /* Mass break counter on ->volume */ + unsigned int cb_ro_snapshot; /* RO volume release counter on ->volume */ + unsigned int cb_scrub; /* Scrub counter on ->volume */ unsigned int cb_break; /* Break counter on vnode */ + unsigned int cb_v_check; /* Break check counter on ->volume */ seqlock_t cb_lock; /* Lock for ->cb_server, ->status, ->cb_*break */ - time64_t cb_expires_at; /* time at which callback expires */ + atomic64_t cb_expires_at; /* time at which callback expires */ +#define AFS_NO_CB_PROMISE TIME64_MIN }; static inline struct fscache_cookie *afs_vnode_cache(struct afs_vnode *vnode) @@ -680,6 +750,8 @@ static inline void afs_vnode_set_cache(struct afs_vnode *vnode, { #ifdef CONFIG_AFS_FSCACHE vnode->netfs.cache = cookie; + if (cookie) + mapping_set_release_always(vnode->netfs.inode.i_mapping); #endif } @@ -702,47 +774,56 @@ struct afs_permits { refcount_t usage; unsigned short nr_permits; /* Number of records */ bool invalidated; /* Invalidated due to key change */ - struct afs_permit permits[]; /* List of permits sorted by key pointer */ + struct afs_permit permits[] __counted_by(nr_permits); /* List of permits sorted by key pointer */ }; /* * Error prioritisation and accumulation. */ struct afs_error { - short error; /* Accumulated error */ + s32 abort_code; /* Cumulative abort code */ + short error; /* Cumulative error */ bool responded; /* T if server responded */ -}; - -/* - * Cursor for iterating over a server's address list. - */ -struct afs_addr_cursor { - struct afs_addr_list *alist; /* Current address list (pins ref) */ - unsigned long tried; /* Tried addresses */ - signed char index; /* Current address */ - bool responded; /* T if the current address responded */ - unsigned short nr_iterations; /* Number of address iterations */ - short error; - u32 abort_code; + bool aborted; /* T if ->error is from an abort */ }; /* * Cursor for iterating over a set of volume location servers. */ struct afs_vl_cursor { - struct afs_addr_cursor ac; struct afs_cell *cell; /* The cell we're querying */ struct afs_vlserver_list *server_list; /* Current server list (pins ref) */ struct afs_vlserver *server; /* Server on which this resides */ + struct afs_addr_list *alist; /* Current address list (pins ref) */ struct key *key; /* Key for the server */ - unsigned long untried; /* Bitmask of untried servers */ - short index; /* Current server */ - short error; + unsigned long untried_servers; /* Bitmask of untried servers */ + unsigned long addr_tried; /* Tried addresses */ + struct afs_error cumul_error; /* Cumulative error */ + unsigned int debug_id; + s32 call_abort_code; + short call_error; /* Error from single call */ + short server_index; /* Current server */ + signed char addr_index; /* Current address */ unsigned short flags; #define AFS_VL_CURSOR_STOP 0x0001 /* Set to cease iteration */ #define AFS_VL_CURSOR_RETRY 0x0002 /* Set to do a retry */ #define AFS_VL_CURSOR_RETRIED 0x0004 /* Set if started a retry */ - unsigned short nr_iterations; /* Number of server iterations */ + short nr_iterations; /* Number of server iterations */ + bool call_responded; /* T if the current address responded */ +}; + +/* + * Fileserver state tracking for an operation. An array of these is kept, + * indexed by server index. + */ +struct afs_server_state { + /* Tracking of fileserver probe state. Other operations may interfere + * by probing a fileserver when accessing other volumes. + */ + unsigned int probe_seq; + unsigned long untried_addrs; /* Addresses we haven't tried yet */ + struct wait_queue_entry probe_waiter; + struct afs_endpoint_state *endpoint_state; /* Endpoint state being monitored */ }; /* @@ -763,7 +844,7 @@ struct afs_vnode_param { struct afs_fid fid; /* Fid to access */ struct afs_status_cb scb; /* Returned status and callback promise */ afs_dataversion_t dv_before; /* Data version before the call */ - unsigned int cb_break_before; /* cb_break + cb_s_break before the call */ + unsigned int cb_break_before; /* cb_break before the call */ u8 dv_delta; /* Expected change in data version */ bool put_vnode:1; /* T if we have a ref on the vnode */ bool need_io_lock:1; /* T if we need the I/O lock on this */ @@ -788,17 +869,17 @@ struct afs_operation { struct afs_volume *volume; /* Volume being accessed */ struct afs_vnode_param file[2]; struct afs_vnode_param *more_files; - struct afs_volsync volsync; + struct afs_volsync pre_volsync; /* Volsync before op */ + struct afs_volsync volsync; /* Volsync returned by op */ struct dentry *dentry; /* Dentry to be altered */ struct dentry *dentry_2; /* Second dentry to be altered */ struct timespec64 mtime; /* Modification time to record */ struct timespec64 ctime; /* Change time to set */ + struct afs_error cumul_error; /* Cumulative error */ short nr_files; /* Number of entries in file[], more_files */ - short error; unsigned int debug_id; unsigned int cb_v_break; /* Volume break counter before op */ - unsigned int cb_s_break; /* Server break counter before op */ union { struct { @@ -813,12 +894,13 @@ struct afs_operation { bool need_rehash; } unlink; struct { - struct dentry *rehash; - struct dentry *tmp; - bool new_negative; + struct dentry *rehash; + struct dentry *tmp; + unsigned int rename_flags; + bool new_negative; } rename; struct { - struct afs_read *req; + struct netfs_io_subrequest *subreq; } fetch; struct { afs_lock_type_t type; @@ -828,7 +910,6 @@ struct afs_operation { loff_t pos; loff_t size; loff_t i_size; - bool laundering; /* Laundering page, PG_writeback not set */ } store; struct { struct iattr *attr; @@ -843,13 +924,19 @@ struct afs_operation { }; /* Fileserver iteration state */ - struct afs_addr_cursor ac; struct afs_server_list *server_list; /* Current server list (pins ref) */ struct afs_server *server; /* Server we're using (ref pinned by server_list) */ + struct afs_endpoint_state *estate; /* Current endpoint state (doesn't pin ref) */ + struct afs_server_state *server_states; /* States of the servers involved */ struct afs_call *call; - unsigned long untried; /* Bitmask of untried servers */ - short index; /* Current server */ - unsigned short nr_iterations; /* Number of server iterations */ + unsigned long untried_servers; /* Bitmask of untried servers */ + unsigned long addr_tried; /* Tried addresses */ + s32 call_abort_code; /* Abort code from single call */ + short call_error; /* Error from single call */ + short server_index; /* Current server */ + short nr_iterations; /* Number of server iterations */ + signed char addr_index; /* Current address */ + bool call_responded; /* T if the current address responded */ unsigned int flags; #define AFS_OPERATION_STOP 0x0001 /* Set to cease iteration */ @@ -865,6 +952,7 @@ struct afs_operation { #define AFS_OPERATION_TRIED_ALL 0x0400 /* Set if we've tried all the fileservers */ #define AFS_OPERATION_RETRY_SERVER 0x0800 /* Set if we should retry the current server */ #define AFS_OPERATION_DIR_CONFLICT 0x1000 /* Set if we detected a 3rd-party dir change */ +#define AFS_OPERATION_ASYNC 0x2000 /* Set if should run asynchronously */ }; /* @@ -890,60 +978,19 @@ static inline void afs_invalidate_cache(struct afs_vnode *vnode, unsigned int fl } /* - * We use folio->private to hold the amount of the folio that we've written to, - * splitting the field into two parts. However, we need to represent a range - * 0...FOLIO_SIZE, so we reduce the resolution if the size of the folio - * exceeds what we can encode. + * Directory iteration management. */ -#ifdef CONFIG_64BIT -#define __AFS_FOLIO_PRIV_MASK 0x7fffffffUL -#define __AFS_FOLIO_PRIV_SHIFT 32 -#define __AFS_FOLIO_PRIV_MMAPPED 0x80000000UL -#else -#define __AFS_FOLIO_PRIV_MASK 0x7fffUL -#define __AFS_FOLIO_PRIV_SHIFT 16 -#define __AFS_FOLIO_PRIV_MMAPPED 0x8000UL -#endif - -static inline unsigned int afs_folio_dirty_resolution(struct folio *folio) -{ - int shift = folio_shift(folio) - (__AFS_FOLIO_PRIV_SHIFT - 1); - return (shift > 0) ? shift : 0; -} - -static inline size_t afs_folio_dirty_from(struct folio *folio, unsigned long priv) -{ - unsigned long x = priv & __AFS_FOLIO_PRIV_MASK; - - /* The lower bound is inclusive */ - return x << afs_folio_dirty_resolution(folio); -} - -static inline size_t afs_folio_dirty_to(struct folio *folio, unsigned long priv) -{ - unsigned long x = (priv >> __AFS_FOLIO_PRIV_SHIFT) & __AFS_FOLIO_PRIV_MASK; - - /* The upper bound is immediately beyond the region */ - return (x + 1) << afs_folio_dirty_resolution(folio); -} - -static inline unsigned long afs_folio_dirty(struct folio *folio, size_t from, size_t to) -{ - unsigned int res = afs_folio_dirty_resolution(folio); - from >>= res; - to = (to - 1) >> res; - return (to << __AFS_FOLIO_PRIV_SHIFT) | from; -} - -static inline unsigned long afs_folio_dirty_mmapped(unsigned long priv) -{ - return priv | __AFS_FOLIO_PRIV_MMAPPED; -} - -static inline bool afs_is_folio_dirty_mmapped(unsigned long priv) -{ - return priv & __AFS_FOLIO_PRIV_MMAPPED; -} +struct afs_dir_iter { + struct afs_vnode *dvnode; + union afs_xdr_dir_block *block; + struct folio_queue *fq; + unsigned int fpos; + int fq_slot; + unsigned int loop_check; + u8 nr_slots; + u8 bucket; + unsigned int prev_entry; +}; #include <trace/events/afs.h> @@ -951,31 +998,35 @@ static inline bool afs_is_folio_dirty_mmapped(unsigned long priv) /* * addr_list.c */ -static inline struct afs_addr_list *afs_get_addrlist(struct afs_addr_list *alist) -{ - if (alist) - refcount_inc(&alist->usage); - return alist; -} -extern struct afs_addr_list *afs_alloc_addrlist(unsigned int, - unsigned short, - unsigned short); -extern void afs_put_addrlist(struct afs_addr_list *); +struct afs_addr_list *afs_get_addrlist(struct afs_addr_list *alist, enum afs_alist_trace reason); +extern struct afs_addr_list *afs_alloc_addrlist(unsigned int nr); +extern void afs_put_addrlist(struct afs_addr_list *alist, enum afs_alist_trace reason); extern struct afs_vlserver_list *afs_parse_text_addrs(struct afs_net *, const char *, size_t, char, unsigned short, unsigned short); +bool afs_addr_list_same(const struct afs_addr_list *a, + const struct afs_addr_list *b); extern struct afs_vlserver_list *afs_dns_query(struct afs_cell *, time64_t *); -extern bool afs_iterate_addresses(struct afs_addr_cursor *); -extern int afs_end_cursor(struct afs_addr_cursor *); -extern void afs_merge_fs_addr4(struct afs_addr_list *, __be32, u16); -extern void afs_merge_fs_addr6(struct afs_addr_list *, __be32 *, u16); +extern int afs_merge_fs_addr4(struct afs_net *net, struct afs_addr_list *addr, + __be32 xdr, u16 port); +extern int afs_merge_fs_addr6(struct afs_net *net, struct afs_addr_list *addr, + __be32 *xdr, u16 port); +void afs_set_peer_appdata(struct afs_server *server, + struct afs_addr_list *old_alist, + struct afs_addr_list *new_alist); + +/* + * addr_prefs.c + */ +int afs_proc_addr_prefs_write(struct file *file, char *buf, size_t size); +void afs_get_address_preferences_rcu(struct afs_net *net, struct afs_addr_list *alist); +void afs_get_address_preferences(struct afs_net *net, struct afs_addr_list *alist); /* * callback.c */ extern void afs_invalidate_mmap_work(struct work_struct *); -extern void afs_server_init_callback_work(struct work_struct *work); extern void afs_init_callback_state(struct afs_server *); extern void __afs_break_callback(struct afs_vnode *, enum afs_cb_break_reason); extern void afs_break_callback(struct afs_vnode *, enum afs_cb_break_reason); @@ -983,13 +1034,15 @@ extern void afs_break_callbacks(struct afs_server *, size_t, struct afs_callback static inline unsigned int afs_calc_vnode_cb_break(struct afs_vnode *vnode) { - return vnode->cb_break + vnode->cb_v_break; + return vnode->cb_break + vnode->cb_ro_snapshot + vnode->cb_scrub; } static inline bool afs_cb_is_broken(unsigned int cb_break, const struct afs_vnode *vnode) { - return cb_break != (vnode->cb_break + vnode->volume->cb_v_break); + return cb_break != (vnode->cb_break + + atomic_read(&vnode->volume->cb_ro_snapshot) + + atomic_read(&vnode->volume->cb_scrub)); } /* @@ -998,16 +1051,26 @@ static inline bool afs_cb_is_broken(unsigned int cb_break, extern int afs_cell_init(struct afs_net *, const char *); extern struct afs_cell *afs_find_cell(struct afs_net *, const char *, unsigned, enum afs_cell_trace); -extern struct afs_cell *afs_lookup_cell(struct afs_net *, const char *, unsigned, - const char *, bool); +enum afs_lookup_cell_for { + AFS_LOOKUP_CELL_DYNROOT, + AFS_LOOKUP_CELL_MOUNTPOINT, + AFS_LOOKUP_CELL_DIRECT_MOUNT, + AFS_LOOKUP_CELL_PRELOAD, + AFS_LOOKUP_CELL_ROOTCELL, + AFS_LOOKUP_CELL_ALIAS_CHECK, +}; +struct afs_cell *afs_lookup_cell(struct afs_net *net, + const char *name, unsigned int namesz, + const char *vllist, + enum afs_lookup_cell_for reason, + enum afs_cell_trace trace); extern struct afs_cell *afs_use_cell(struct afs_cell *, enum afs_cell_trace); -extern void afs_unuse_cell(struct afs_net *, struct afs_cell *, enum afs_cell_trace); +void afs_unuse_cell(struct afs_cell *cell, enum afs_cell_trace reason); extern struct afs_cell *afs_get_cell(struct afs_cell *, enum afs_cell_trace); extern void afs_see_cell(struct afs_cell *, enum afs_cell_trace); extern void afs_put_cell(struct afs_cell *, enum afs_cell_trace); extern void afs_queue_cell(struct afs_cell *, enum afs_cell_trace); -extern void afs_manage_cells(struct work_struct *); -extern void afs_cells_timer(struct timer_list *); +void afs_set_cell_timer(struct afs_cell *cell, unsigned int delay_secs); extern void __net_exit afs_cell_purge(struct afs_net *); /* @@ -1016,6 +1079,19 @@ extern void __net_exit afs_cell_purge(struct afs_net *); extern bool afs_cm_incoming_call(struct afs_call *); /* + * cm_security.c + */ +void afs_process_oob_queue(struct work_struct *work); +#ifdef CONFIG_RXGK +int afs_create_token_key(struct afs_net *net, struct socket *socket); +#else +static inline int afs_create_token_key(struct afs_net *net, struct socket *socket) +{ + return 0; +} +#endif + +/* * dir.c */ extern const struct file_operations afs_dir_file_operations; @@ -1023,15 +1099,34 @@ extern const struct inode_operations afs_dir_inode_operations; extern const struct address_space_operations afs_dir_aops; extern const struct dentry_operations afs_fs_dentry_operations; +ssize_t afs_read_single(struct afs_vnode *dvnode, struct file *file); +ssize_t afs_read_dir(struct afs_vnode *dvnode, struct file *file) + __acquires(&dvnode->validate_lock); extern void afs_d_release(struct dentry *); extern void afs_check_for_remote_deletion(struct afs_operation *); +int afs_single_writepages(struct address_space *mapping, + struct writeback_control *wbc); /* * dir_edit.c */ -extern void afs_edit_dir_add(struct afs_vnode *, struct qstr *, struct afs_fid *, +extern void afs_edit_dir_add(struct afs_vnode *, const struct qstr *, struct afs_fid *, enum afs_edit_dir_reason); -extern void afs_edit_dir_remove(struct afs_vnode *, struct qstr *, enum afs_edit_dir_reason); +extern void afs_edit_dir_remove(struct afs_vnode *, const struct qstr *, enum afs_edit_dir_reason); +void afs_edit_dir_update(struct afs_vnode *vnode, const struct qstr *name, + struct afs_vnode *new_dvnode, enum afs_edit_dir_reason why); +void afs_mkdir_init_dir(struct afs_vnode *dvnode, struct afs_vnode *parent_vnode); + +/* + * dir_search.c + */ +unsigned int afs_dir_hash_name(const struct qstr *name); +bool afs_dir_init_iter(struct afs_dir_iter *iter, const struct qstr *name); +union afs_xdr_dir_block *afs_dir_find_block(struct afs_dir_iter *iter, size_t block); +int afs_dir_search_bucket(struct afs_dir_iter *iter, const struct qstr *name, + struct afs_fid *_fid); +int afs_dir_search(struct afs_vnode *dvnode, const struct qstr *name, + struct afs_fid *_fid, afs_dataversion_t *_dir_version); /* * dir_silly.c @@ -1046,35 +1141,23 @@ extern int afs_silly_iput(struct dentry *, struct inode *); extern const struct inode_operations afs_dynroot_inode_operations; extern const struct dentry_operations afs_dynroot_dentry_operations; -extern struct inode *afs_try_auto_mntpt(struct dentry *, struct inode *); -extern int afs_dynroot_mkdir(struct afs_net *, struct afs_cell *); -extern void afs_dynroot_rmdir(struct afs_net *, struct afs_cell *); -extern int afs_dynroot_populate(struct super_block *); -extern void afs_dynroot_depopulate(struct super_block *); +struct inode *afs_dynroot_iget_root(struct super_block *sb); /* * file.c */ extern const struct address_space_operations afs_file_aops; -extern const struct address_space_operations afs_symlink_aops; extern const struct inode_operations afs_file_inode_operations; extern const struct file_operations afs_file_operations; +extern const struct afs_operation_ops afs_fetch_data_operation; extern const struct netfs_request_ops afs_req_ops; extern int afs_cache_wb_key(struct afs_vnode *, struct afs_file *); extern void afs_put_wb_key(struct afs_wb_key *); extern int afs_open(struct inode *, struct file *); extern int afs_release(struct inode *, struct file *); -extern int afs_fetch_data(struct afs_vnode *, struct afs_read *); -extern struct afs_read *afs_alloc_read(gfp_t); -extern void afs_put_read(struct afs_read *); -extern int afs_write_inode(struct inode *, struct writeback_control *); - -static inline struct afs_read *afs_get_read(struct afs_read *req) -{ - refcount_inc(&req->usage); - return req; -} +void afs_fetch_data_async_rx(struct work_struct *work); +void afs_fetch_data_immediate_cancel(struct afs_call *call); /* * flock.c @@ -1105,15 +1188,16 @@ extern void afs_fs_get_volume_status(struct afs_operation *); extern void afs_fs_set_lock(struct afs_operation *); extern void afs_fs_extend_lock(struct afs_operation *); extern void afs_fs_release_lock(struct afs_operation *); -extern int afs_fs_give_up_all_callbacks(struct afs_net *, struct afs_server *, - struct afs_addr_cursor *, struct key *); -extern bool afs_fs_get_capabilities(struct afs_net *, struct afs_server *, - struct afs_addr_cursor *, struct key *); +int afs_fs_give_up_all_callbacks(struct afs_net *net, struct afs_server *server, + struct afs_address *addr, struct key *key); +bool afs_fs_get_capabilities(struct afs_net *net, struct afs_server *server, + struct afs_endpoint_state *estate, unsigned int addr_index, + struct key *key); extern void afs_fs_inline_bulk_status(struct afs_operation *); struct afs_acl { u32 size; - u8 data[]; + u8 data[] __counted_by(size); }; extern void afs_fs_fetch_acl(struct afs_operation *); @@ -1125,14 +1209,10 @@ extern void afs_fs_store_acl(struct afs_operation *); extern struct afs_operation *afs_alloc_operation(struct key *, struct afs_volume *); extern int afs_put_operation(struct afs_operation *); extern bool afs_begin_vnode_operation(struct afs_operation *); +extern void afs_end_vnode_operation(struct afs_operation *op); extern void afs_wait_for_operation(struct afs_operation *); extern int afs_do_sync_operation(struct afs_operation *); -static inline void afs_op_nomem(struct afs_operation *op) -{ - op->error = -ENOMEM; -} - static inline void afs_op_set_vnode(struct afs_operation *op, unsigned int n, struct afs_vnode *vnode) { @@ -1149,12 +1229,17 @@ static inline void afs_op_set_fid(struct afs_operation *op, unsigned int n, /* * fs_probe.c */ +struct afs_endpoint_state *afs_get_endpoint_state(struct afs_endpoint_state *estate, + enum afs_estate_trace where); +void afs_put_endpoint_state(struct afs_endpoint_state *estate, enum afs_estate_trace where); extern void afs_fileserver_probe_result(struct afs_call *); -extern void afs_fs_probe_fileserver(struct afs_net *, struct afs_server *, struct key *, bool); -extern int afs_wait_for_fs_probes(struct afs_server_list *, unsigned long); +int afs_fs_probe_fileserver(struct afs_net *net, struct afs_server *server, + struct afs_addr_list *new_alist, struct key *key); +int afs_wait_for_fs_probes(struct afs_operation *op, struct afs_server_state *states, bool intr); extern void afs_probe_fileserver(struct afs_net *, struct afs_server *); extern void afs_fs_probe_dispatcher(struct work_struct *); -extern int afs_wait_for_one_fs_probe(struct afs_server *, bool); +int afs_wait_for_one_fs_probe(struct afs_server *server, struct afs_endpoint_state *estate, + unsigned long exclude, bool is_intr); extern void afs_fs_probe_cleanup(struct afs_net *); /* @@ -1162,17 +1247,18 @@ extern void afs_fs_probe_cleanup(struct afs_net *); */ extern const struct afs_operation_ops afs_fetch_status_operation; +void afs_init_new_symlink(struct afs_vnode *vnode, struct afs_operation *op); +const char *afs_get_link(struct dentry *dentry, struct inode *inode, + struct delayed_call *callback); +int afs_readlink(struct dentry *dentry, char __user *buffer, int buflen); extern void afs_vnode_commit_status(struct afs_operation *, struct afs_vnode_param *); extern int afs_fetch_status(struct afs_vnode *, struct key *, bool, afs_access_t *); extern int afs_ilookup5_test_by_fid(struct inode *, void *); -extern struct inode *afs_iget_pseudo_dir(struct super_block *, bool); extern struct inode *afs_iget(struct afs_operation *, struct afs_vnode_param *); extern struct inode *afs_root_iget(struct super_block *, struct key *); -extern bool afs_check_validity(struct afs_vnode *); -extern int afs_validate(struct afs_vnode *, struct key *); -extern int afs_getattr(struct user_namespace *mnt_userns, const struct path *, +extern int afs_getattr(struct mnt_idmap *idmap, const struct path *, struct kstat *, u32, unsigned int); -extern int afs_setattr(struct user_namespace *mnt_userns, struct dentry *, struct iattr *); +extern int afs_setattr(struct mnt_idmap *idmap, struct dentry *, struct iattr *); extern void afs_evict_inode(struct inode *); extern int afs_drop_inode(struct inode *); @@ -1225,6 +1311,31 @@ static inline void __afs_stat(atomic_t *s) extern int afs_abort_to_error(u32); extern void afs_prioritise_error(struct afs_error *, int, u32); +static inline void afs_op_nomem(struct afs_operation *op) +{ + op->cumul_error.error = -ENOMEM; +} + +static inline int afs_op_error(const struct afs_operation *op) +{ + return op->cumul_error.error; +} + +static inline s32 afs_op_abort_code(const struct afs_operation *op) +{ + return op->cumul_error.abort_code; +} + +static inline int afs_op_set_error(struct afs_operation *op, int error) +{ + return op->cumul_error.error = error; +} + +static inline void afs_op_accumulate_error(struct afs_operation *op, int error, s32 abort_code) +{ + afs_prioritise_error(&op->cumul_error, error, abort_code); +} + /* * mntpt.c */ @@ -1255,6 +1366,7 @@ static inline void afs_put_sysnames(struct afs_sysnames *sysnames) {} /* * rotate.c */ +void afs_clear_server_states(struct afs_operation *op); extern bool afs_select_fileserver(struct afs_operation *); extern void afs_dump_edestaddrreq(const struct afs_operation *); @@ -1267,8 +1379,10 @@ extern int __net_init afs_open_socket(struct afs_net *); extern void __net_exit afs_close_socket(struct afs_net *); extern void afs_charge_preallocation(struct work_struct *); extern void afs_put_call(struct afs_call *); -extern void afs_make_call(struct afs_addr_cursor *, struct afs_call *, gfp_t); -extern long afs_wait_for_call_to_complete(struct afs_call *, struct afs_addr_cursor *); +void afs_deferred_put_call(struct afs_call *call); +void afs_make_call(struct afs_call *call, gfp_t gfp); +void afs_deliver_to_call(struct afs_call *call); +void afs_wait_for_call_to_complete(struct afs_call *call); extern struct afs_call *afs_alloc_flat_call(struct afs_net *, const struct afs_call_type *, size_t, size_t); @@ -1278,15 +1392,41 @@ extern void afs_send_simple_reply(struct afs_call *, const void *, size_t); extern int afs_extract_data(struct afs_call *, bool); extern int afs_protocol_error(struct afs_call *, enum afs_eproto_cause); +static inline struct afs_call *afs_get_call(struct afs_call *call, + enum afs_call_trace why) +{ + int r; + + __refcount_inc(&call->ref, &r); + + trace_afs_call(call->debug_id, why, r + 1, + atomic_read(&call->net->nr_outstanding_calls), + __builtin_return_address(0)); + return call; +} + +static inline void afs_see_call(struct afs_call *call, enum afs_call_trace why) +{ + int r = refcount_read(&call->ref); + + trace_afs_call(call->debug_id, why, r, + atomic_read(&call->net->nr_outstanding_calls), + __builtin_return_address(0)); +} + static inline void afs_make_op_call(struct afs_operation *op, struct afs_call *call, gfp_t gfp) { - op->call = call; - op->type = call->type; - call->op = op; - call->key = op->key; - call->intr = !(op->flags & AFS_OPERATION_UNINTR); - afs_make_call(&op->ac, call, gfp); + struct afs_addr_list *alist = op->estate->addresses; + + op->call = call; + op->type = call->type; + call->op = op; + call->key = op->key; + call->intr = !(op->flags & AFS_OPERATION_UNINTR); + call->peer = rxrpc_kernel_get_peer(alist->addrs[op->addr_index].peer); + call->service_id = op->server->service_id; + afs_make_call(call, gfp); } static inline void afs_extract_begin(struct afs_call *call, void *buf, size_t size) @@ -1387,7 +1527,7 @@ extern void afs_cache_permit(struct afs_vnode *, struct key *, unsigned int, extern struct key *afs_request_key(struct afs_cell *); extern struct key *afs_request_key_rcu(struct afs_cell *); extern int afs_check_permit(struct afs_vnode *, struct key *, afs_access_t *); -extern int afs_permission(struct user_namespace *, struct inode *, int); +extern int afs_permission(struct mnt_idmap *, struct inode *, int); extern void __exit afs_clean_up_permit_cache(void); /* @@ -1395,20 +1535,29 @@ extern void __exit afs_clean_up_permit_cache(void); */ extern spinlock_t afs_server_peer_lock; -extern struct afs_server *afs_find_server(struct afs_net *, - const struct sockaddr_rxrpc *); -extern struct afs_server *afs_find_server_by_uuid(struct afs_net *, const uuid_t *); +struct afs_server *afs_find_server(const struct rxrpc_peer *peer); extern struct afs_server *afs_lookup_server(struct afs_cell *, struct key *, const uuid_t *, u32); extern struct afs_server *afs_get_server(struct afs_server *, enum afs_server_trace); -extern struct afs_server *afs_use_server(struct afs_server *, enum afs_server_trace); -extern void afs_unuse_server(struct afs_net *, struct afs_server *, enum afs_server_trace); -extern void afs_unuse_server_notime(struct afs_net *, struct afs_server *, enum afs_server_trace); +struct afs_server *afs_use_server(struct afs_server *server, bool activate, + enum afs_server_trace reason); +void afs_unuse_server(struct afs_net *net, struct afs_server *server, + enum afs_server_trace reason); +void afs_unuse_server_notime(struct afs_net *net, struct afs_server *server, + enum afs_server_trace reason); extern void afs_put_server(struct afs_net *, struct afs_server *, enum afs_server_trace); -extern void afs_manage_servers(struct work_struct *); -extern void afs_servers_timer(struct timer_list *); +void afs_purge_servers(struct afs_cell *cell); extern void afs_fs_probe_timer(struct timer_list *); -extern void __net_exit afs_purge_servers(struct afs_net *); -extern bool afs_check_server_record(struct afs_operation *, struct afs_server *); +void __net_exit afs_wait_for_servers(struct afs_net *net); +bool afs_check_server_record(struct afs_operation *op, struct afs_server *server, struct key *key); + +static inline void afs_see_server(struct afs_server *server, enum afs_server_trace trace) +{ + int r = refcount_read(&server->ref); + int a = atomic_read(&server->active); + + trace_afs_server(server->debug_id, r, a, trace); + +} static inline void afs_inc_servers_outstanding(struct afs_net *net) { @@ -1436,10 +1585,14 @@ static inline struct afs_server_list *afs_get_serverlist(struct afs_server_list } extern void afs_put_serverlist(struct afs_net *, struct afs_server_list *); -extern struct afs_server_list *afs_alloc_server_list(struct afs_cell *, struct key *, - struct afs_vldb_entry *, - u8); +struct afs_server_list *afs_alloc_server_list(struct afs_volume *volume, + struct key *key, + struct afs_vldb_entry *vldb); extern bool afs_annotate_server_list(struct afs_server_list *, struct afs_server_list *); +void afs_attach_volume_to_servers(struct afs_volume *volume, struct afs_server_list *slist); +void afs_reattach_volume_to_servers(struct afs_volume *volume, struct afs_server_list *slist, + struct afs_server_list *old); +void afs_detach_volume_from_servers(struct afs_volume *volume, struct afs_server_list *slist); /* * super.c @@ -1448,13 +1601,24 @@ extern int __init afs_fs_init(void); extern void afs_fs_exit(void); /* + * validation.c + */ +bool afs_check_validity(const struct afs_vnode *vnode); +int afs_update_volume_state(struct afs_operation *op); +int afs_validate(struct afs_vnode *vnode, struct key *key); + +/* * vlclient.c */ extern struct afs_vldb_entry *afs_vl_get_entry_by_name_u(struct afs_vl_cursor *, const char *, int); extern struct afs_addr_list *afs_vl_get_addrs_u(struct afs_vl_cursor *, const uuid_t *); -extern struct afs_call *afs_vl_get_capabilities(struct afs_net *, struct afs_addr_cursor *, - struct key *, struct afs_vlserver *, unsigned int); +struct afs_call *afs_vl_get_capabilities(struct afs_net *net, + struct afs_addr_list *alist, + unsigned int addr_index, + struct key *key, + struct afs_vlserver *server, + unsigned int server_index); extern struct afs_addr_list *afs_yfsvl_get_endpoints(struct afs_vl_cursor *, const uuid_t *); extern char *afs_yfsvl_get_cell_name(struct afs_vl_cursor *); @@ -1508,36 +1672,27 @@ extern struct afs_vlserver_list *afs_extract_vlserver_list(struct afs_cell *, extern struct afs_volume *afs_create_volume(struct afs_fs_context *); extern int afs_activate_volume(struct afs_volume *); extern void afs_deactivate_volume(struct afs_volume *); +bool afs_try_get_volume(struct afs_volume *volume, enum afs_volume_trace reason); extern struct afs_volume *afs_get_volume(struct afs_volume *, enum afs_volume_trace); -extern void afs_put_volume(struct afs_net *, struct afs_volume *, enum afs_volume_trace); +void afs_put_volume(struct afs_volume *volume, enum afs_volume_trace reason); extern int afs_check_volume_status(struct afs_volume *, struct afs_operation *); /* * write.c */ -#ifdef CONFIG_AFS_FSCACHE -bool afs_dirty_folio(struct address_space *, struct folio *); -#else -#define afs_dirty_folio filemap_dirty_folio -#endif -extern int afs_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, - struct page **pagep, void **fsdata); -extern int afs_write_end(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata); -extern int afs_writepage(struct page *, struct writeback_control *); +void afs_prepare_write(struct netfs_io_subrequest *subreq); +void afs_issue_write(struct netfs_io_subrequest *subreq); +void afs_begin_writeback(struct netfs_io_request *wreq); +void afs_retry_request(struct netfs_io_request *wreq, struct netfs_io_stream *stream); extern int afs_writepages(struct address_space *, struct writeback_control *); -extern ssize_t afs_file_write(struct kiocb *, struct iov_iter *); extern int afs_fsync(struct file *, loff_t, loff_t, int); extern vm_fault_t afs_page_mkwrite(struct vm_fault *vmf); extern void afs_prune_wb_keys(struct afs_vnode *); -int afs_launder_folio(struct folio *); /* * xattr.c */ -extern const struct xattr_handler *afs_xattr_handlers[]; +extern const struct xattr_handler * const afs_xattr_handlers[]; /* * yfsclient.c @@ -1551,6 +1706,9 @@ extern void yfs_fs_remove_dir(struct afs_operation *); extern void yfs_fs_link(struct afs_operation *); extern void yfs_fs_symlink(struct afs_operation *); extern void yfs_fs_rename(struct afs_operation *); +void yfs_fs_rename_replace(struct afs_operation *op); +void yfs_fs_rename_noreplace(struct afs_operation *op); +void yfs_fs_rename_exchange(struct afs_operation *op); extern void yfs_fs_store_data(struct afs_operation *); extern void yfs_fs_setattr(struct afs_operation *); extern void yfs_fs_get_volume_status(struct afs_operation *); @@ -1596,7 +1754,7 @@ static inline void afs_update_dentry_version(struct afs_operation *op, struct afs_vnode_param *dir_vp, struct dentry *dentry) { - if (!op->error) + if (!op->cumul_error.error) dentry->d_fsdata = (void *)(unsigned long)dir_vp->scb.status.data_version; } @@ -1635,6 +1793,38 @@ static inline int afs_bad(struct afs_vnode *vnode, enum afs_file_error where) return -EIO; } +/* + * Set the callback promise on a vnode. + */ +static inline void afs_set_cb_promise(struct afs_vnode *vnode, time64_t expires_at, + enum afs_cb_promise_trace trace) +{ + atomic64_set(&vnode->cb_expires_at, expires_at); + trace_afs_cb_promise(vnode, trace); +} + +/* + * Clear the callback promise on a vnode, returning true if it was promised. + */ +static inline bool afs_clear_cb_promise(struct afs_vnode *vnode, + enum afs_cb_promise_trace trace) +{ + trace_afs_cb_promise(vnode, trace); + return atomic64_xchg(&vnode->cb_expires_at, AFS_NO_CB_PROMISE) != AFS_NO_CB_PROMISE; +} + +/* + * Mark a directory as being invalid. + */ +static inline void afs_invalidate_dir(struct afs_vnode *dvnode, + enum afs_dir_invalid_trace trace) +{ + if (test_and_clear_bit(AFS_VNODE_DIR_VALID, &dvnode->flags)) { + trace_afs_dir_invalid(dvnode, trace); + afs_stat_v(dvnode, n_inval); + } +} + /*****************************************************************************/ /* * debug tracing diff --git a/fs/afs/main.c b/fs/afs/main.c index eae288c8d40a..e6bb8237db98 100644 --- a/fs/afs/main.c +++ b/fs/afs/main.c @@ -41,8 +41,6 @@ const char afs_init_sysname[] = "arm_linux26"; const char afs_init_sysname[] = "aarch64_linux26"; #elif defined(CONFIG_X86_32) const char afs_init_sysname[] = "i386_linux26"; -#elif defined(CONFIG_IA64) -const char afs_init_sysname[] = "ia64_linux26"; #elif defined(CONFIG_PPC64) const char afs_init_sysname[] = "ppc64_linux26"; #elif defined(CONFIG_PPC32) @@ -75,29 +73,21 @@ static int __net_init afs_net_init(struct net *net_ns) generate_random_uuid((unsigned char *)&net->uuid); INIT_WORK(&net->charge_preallocation_work, afs_charge_preallocation); + INIT_WORK(&net->rx_oob_work, afs_process_oob_queue); mutex_init(&net->socket_mutex); net->cells = RB_ROOT; + idr_init(&net->cells_dyn_ino); init_rwsem(&net->cells_lock); - INIT_WORK(&net->cells_manager, afs_manage_cells); - timer_setup(&net->cells_timer, afs_cells_timer, 0); - mutex_init(&net->cells_alias_lock); mutex_init(&net->proc_cells_lock); INIT_HLIST_HEAD(&net->proc_cells); seqlock_init(&net->fs_lock); - net->fs_servers = RB_ROOT; INIT_LIST_HEAD(&net->fs_probe_fast); INIT_LIST_HEAD(&net->fs_probe_slow); INIT_HLIST_HEAD(&net->fs_proc); - INIT_HLIST_HEAD(&net->fs_addresses4); - INIT_HLIST_HEAD(&net->fs_addresses6); - seqlock_init(&net->fs_addr_lock); - - INIT_WORK(&net->fs_manager, afs_manage_servers); - timer_setup(&net->fs_timer, afs_servers_timer, 0); INIT_WORK(&net->fs_prober, afs_fs_probe_dispatcher); timer_setup(&net->fs_probe_timer, afs_fs_probe_timer, 0); atomic_set(&net->servers_outstanding, 1); @@ -133,13 +123,14 @@ error_open_socket: net->live = false; afs_fs_probe_cleanup(net); afs_cell_purge(net); - afs_purge_servers(net); + afs_wait_for_servers(net); error_cell_init: net->live = false; afs_proc_cleanup(net); error_proc: afs_put_sysnames(net->sysnames); error_sysnames: + idr_destroy(&net->cells_dyn_ino); net->live = false; return ret; } @@ -154,10 +145,12 @@ static void __net_exit afs_net_exit(struct net *net_ns) net->live = false; afs_fs_probe_cleanup(net); afs_cell_purge(net); - afs_purge_servers(net); + afs_wait_for_servers(net); afs_close_socket(net); afs_proc_cleanup(net); afs_put_sysnames(net->sysnames); + idr_destroy(&net->cells_dyn_ino); + kfree_rcu(rcu_access_pointer(net->address_prefs), rcu); } static struct pernet_operations afs_net_ops = { @@ -176,13 +169,13 @@ static int __init afs_init(void) printk(KERN_INFO "kAFS: Red Hat AFS client v0.1 registering.\n"); - afs_wq = alloc_workqueue("afs", 0, 0); + afs_wq = alloc_workqueue("afs", WQ_PERCPU, 0); if (!afs_wq) goto error_afs_wq; - afs_async_calls = alloc_workqueue("kafsd", WQ_MEM_RECLAIM, 0); + afs_async_calls = alloc_workqueue("kafsd", WQ_MEM_RECLAIM | WQ_UNBOUND, 0); if (!afs_async_calls) goto error_async; - afs_lock_manager = alloc_workqueue("kafs_lockd", WQ_MEM_RECLAIM, 0); + afs_lock_manager = alloc_workqueue("kafs_lockd", WQ_MEM_RECLAIM | WQ_PERCPU, 0); if (!afs_lock_manager) goto error_lockmgr; diff --git a/fs/afs/misc.c b/fs/afs/misc.c index 805328ca5428..c8a7f266080d 100644 --- a/fs/afs/misc.c +++ b/fs/afs/misc.c @@ -8,6 +8,7 @@ #include <linux/kernel.h> #include <linux/module.h> #include <linux/errno.h> +#include <crypto/krb5.h> #include "internal.h" #include "afs_fs.h" #include "protocol_uae.h" @@ -103,7 +104,34 @@ int afs_abort_to_error(u32 abort_code) case RXKADDATALEN: return -EKEYREJECTED; case RXKADILLEGALLEVEL: return -EKEYREJECTED; + case RXGK_INCONSISTENCY: return -EPROTO; + case RXGK_PACKETSHORT: return -EPROTO; + case RXGK_BADCHALLENGE: return -EPROTO; + case RXGK_SEALEDINCON: return -EKEYREJECTED; + case RXGK_NOTAUTH: return -EKEYREJECTED; + case RXGK_EXPIRED: return -EKEYEXPIRED; + case RXGK_BADLEVEL: return -EKEYREJECTED; + case RXGK_BADKEYNO: return -EKEYREJECTED; + case RXGK_NOTRXGK: return -EKEYREJECTED; + case RXGK_UNSUPPORTED: return -EKEYREJECTED; + case RXGK_GSSERROR: return -EKEYREJECTED; +#ifdef RXGK_BADETYPE + case RXGK_BADETYPE: return -ENOPKG; +#endif +#ifdef RXGK_BADTOKEN + case RXGK_BADTOKEN: return -EKEYREJECTED; +#endif +#ifdef RXGK_BADETYPE + case RXGK_DATALEN: return -EPROTO; +#endif +#ifdef RXGK_BADQOP + case RXGK_BADQOP: return -EKEYREJECTED; +#endif + + case KRB5_PROG_KEYTYPE_NOSUPP: return -ENOPKG; + case RXGEN_OPCODE: return -ENOTSUPP; + case RX_INVALID_OPERATION: return -ENOTSUPP; default: return -EREMOTEIO; } @@ -116,6 +144,8 @@ void afs_prioritise_error(struct afs_error *e, int error, u32 abort_code) { switch (error) { case 0: + e->aborted = false; + e->error = 0; return; default: if (e->error == -ETIMEDOUT || @@ -161,12 +191,16 @@ void afs_prioritise_error(struct afs_error *e, int error, u32 abort_code) if (e->responded) return; e->error = error; + e->aborted = false; return; case -ECONNABORTED: - error = afs_abort_to_error(abort_code); - fallthrough; + e->error = afs_abort_to_error(abort_code); + e->aborted = true; + e->responded = true; + return; case -ENETRESET: /* Responded, but we seem to have changed address */ + e->aborted = false; e->responded = true; e->error = error; return; diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c index 97f50e9fd9eb..57c204a3c04e 100644 --- a/fs/afs/mntpt.c +++ b/fs/afs/mntpt.c @@ -30,7 +30,7 @@ const struct file_operations afs_mntpt_file_operations = { const struct inode_operations afs_mntpt_inode_operations = { .lookup = afs_mntpt_lookup, - .readlink = page_readlink, + .readlink = afs_readlink, .getattr = afs_getattr, }; @@ -87,7 +87,7 @@ static int afs_mntpt_set_params(struct fs_context *fc, struct dentry *mntpt) ctx->force = true; } if (ctx->cell) { - afs_unuse_cell(ctx->net, ctx->cell, afs_cell_trace_unuse_mntpt); + afs_unuse_cell(ctx->cell, afs_cell_trace_unuse_mntpt); ctx->cell = NULL; } if (test_bit(AFS_VNODE_PSEUDODIR, &vnode->flags)) { @@ -107,7 +107,9 @@ static int afs_mntpt_set_params(struct fs_context *fc, struct dentry *mntpt) if (size > AFS_MAXCELLNAME) return -ENAMETOOLONG; - cell = afs_lookup_cell(ctx->net, p, size, NULL, false); + cell = afs_lookup_cell(ctx->net, p, size, NULL, + AFS_LOOKUP_CELL_MOUNTPOINT, + afs_cell_trace_use_lookup_mntpt); if (IS_ERR(cell)) { pr_err("kAFS: unable to lookup cell '%pd'\n", mntpt); return PTR_ERR(cell); @@ -118,9 +120,9 @@ static int afs_mntpt_set_params(struct fs_context *fc, struct dentry *mntpt) ctx->volnamesz = sizeof(afs_root_volume) - 1; } else { /* read the contents of the AFS special symlink */ - struct page *page; + DEFINE_DELAYED_CALL(cleanup); + const char *content; loff_t size = i_size_read(d_inode(mntpt)); - char *buf; if (src_as->cell) ctx->cell = afs_use_cell(src_as->cell, afs_cell_trace_use_mntpt); @@ -128,18 +130,24 @@ static int afs_mntpt_set_params(struct fs_context *fc, struct dentry *mntpt) if (size < 2 || size > PAGE_SIZE - 1) return -EINVAL; - page = read_mapping_page(d_inode(mntpt)->i_mapping, 0, NULL); - if (IS_ERR(page)) - return PTR_ERR(page); + content = afs_get_link(mntpt, d_inode(mntpt), &cleanup); + if (IS_ERR(content)) { + do_delayed_call(&cleanup); + return PTR_ERR(content); + } - buf = kmap(page); ret = -EINVAL; - if (buf[size - 1] == '.') - ret = vfs_parse_fs_string(fc, "source", buf, size - 1); - kunmap(page); - put_page(page); + if (content[size - 1] == '.') + ret = vfs_parse_fs_qstr(fc, "source", + &QSTR_LEN(content, size - 1)); + do_delayed_call(&cleanup); if (ret < 0) return ret; + + /* Don't cross a backup volume mountpoint from a backup volume */ + if (src_as->volume && src_as->volume->type == AFSVL_BACKVOL && + ctx->type == AFSVL_BACKVOL) + return -ENODEV; } return 0; @@ -183,7 +191,6 @@ struct vfsmount *afs_d_automount(struct path *path) if (IS_ERR(newmnt)) return newmnt; - mntget(newmnt); /* prevent immediate expiration */ mnt_set_expiry(newmnt, &afs_vfsmounts); queue_delayed_work(afs_wq, &afs_mntpt_expiry_timer, afs_mntpt_expiry_timeout * HZ); diff --git a/fs/afs/proc.c b/fs/afs/proc.c index 2a0c83d71565..44520549b509 100644 --- a/fs/afs/proc.c +++ b/fs/afs/proc.c @@ -122,14 +122,16 @@ static int afs_proc_cells_write(struct file *file, char *buf, size_t size) if (strcmp(buf, "add") == 0) { struct afs_cell *cell; - cell = afs_lookup_cell(net, name, strlen(name), args, true); + cell = afs_lookup_cell(net, name, strlen(name), args, + AFS_LOOKUP_CELL_PRELOAD, + afs_cell_trace_use_lookup_add); if (IS_ERR(cell)) { ret = PTR_ERR(cell); goto done; } if (test_and_set_bit(AFS_CELL_FL_NO_GC, &cell->flags)) - afs_unuse_cell(net, cell, afs_cell_trace_unuse_no_pin); + afs_unuse_cell(cell, afs_cell_trace_unuse_no_pin); } else { goto inval; } @@ -147,6 +149,56 @@ inval: } /* + * Display the list of addr_prefs known to the namespace. + */ +static int afs_proc_addr_prefs_show(struct seq_file *m, void *v) +{ + struct afs_addr_preference_list *preflist; + struct afs_addr_preference *pref; + struct afs_net *net = afs_seq2net_single(m); + union { + struct sockaddr_in sin; + struct sockaddr_in6 sin6; + } addr; + unsigned int i; + char buf[44]; /* Maximum ipv6 + max subnet is 43 */ + + rcu_read_lock(); + preflist = rcu_dereference(net->address_prefs); + + if (!preflist) { + seq_puts(m, "NO PREFS\n"); + goto out; + } + + seq_printf(m, "PROT SUBNET PRIOR (v=%u n=%u/%u/%u)\n", + preflist->version, preflist->ipv6_off, preflist->nr, preflist->max_prefs); + + memset(&addr, 0, sizeof(addr)); + + for (i = 0; i < preflist->nr; i++) { + pref = &preflist->prefs[i]; + + addr.sin.sin_family = pref->family; + if (pref->family == AF_INET) { + memcpy(&addr.sin.sin_addr, &pref->ipv4_addr, + sizeof(addr.sin.sin_addr)); + snprintf(buf, sizeof(buf), "%pISc/%u", &addr.sin, pref->subnet_mask); + seq_printf(m, "UDP %-43.43s %5u\n", buf, pref->prio); + } else { + memcpy(&addr.sin6.sin6_addr, &pref->ipv6_addr, + sizeof(addr.sin6.sin6_addr)); + snprintf(buf, sizeof(buf), "%pISc/%u", &addr.sin6, pref->subnet_mask); + seq_printf(m, "UDP %-43.43s %5u\n", buf, pref->prio); + } + } + +out: + rcu_read_unlock(); + return 0; +} + +/* * Display the name of the current workstation cell. */ static int afs_proc_rootcell_show(struct seq_file *m, void *v) @@ -156,7 +208,7 @@ static int afs_proc_rootcell_show(struct seq_file *m, void *v) net = afs_seq2net_single(m); down_read(&net->cells_lock); - cell = net->ws_cell; + cell = rcu_dereference_protected(net->ws_cell, lockdep_is_held(&net->cells_lock)); if (cell) seq_printf(m, "%s\n", cell->name); up_read(&net->cells_lock); @@ -190,7 +242,13 @@ static int afs_proc_rootcell_write(struct file *file, char *buf, size_t size) /* determine command to perform */ _debug("rootcell=%s", buf); - ret = afs_cell_init(net, buf); + ret = -EEXIST; + inode_lock(file_inode(file)); + if (!rcu_access_pointer(net->ws_cell)) + ret = afs_cell_init(net, buf); + else + printk("busy\n"); + inode_unlock(file_inode(file)); out: _leave(" = %d", ret); @@ -307,7 +365,7 @@ static int afs_proc_cell_vlservers_show(struct seq_file *m, void *v) for (i = 0; i < alist->nr_addrs; i++) seq_printf(m, " %c %pISpc\n", alist->preferred == i ? '>' : '-', - &alist->addrs[i].transport); + rxrpc_kernel_remote_addr(alist->addrs[i].peer)); } seq_printf(m, " info: fl=%lx rtt=%d\n", vlserver->flags, vlserver->rtt); seq_printf(m, " probe: fl=%x e=%d ac=%d out=%d\n", @@ -375,32 +433,51 @@ static const struct seq_operations afs_proc_cell_vlservers_ops = { */ static int afs_proc_servers_show(struct seq_file *m, void *v) { - struct afs_server *server; + struct afs_endpoint_state *estate; struct afs_addr_list *alist; + struct afs_server *server; + unsigned long failed; int i; if (v == SEQ_START_TOKEN) { - seq_puts(m, "UUID REF ACT\n"); + seq_puts(m, "UUID REF ACT CELL\n"); return 0; } server = list_entry(v, struct afs_server, proc_link); - alist = rcu_dereference(server->addresses); - seq_printf(m, "%pU %3d %3d\n", + seq_printf(m, "%pU %3d %3d %s\n", &server->uuid, refcount_read(&server->ref), - atomic_read(&server->active)); - seq_printf(m, " - info: fl=%lx rtt=%u brk=%x\n", - server->flags, server->rtt, server->cb_s_break); - seq_printf(m, " - probe: last=%d out=%d\n", - (int)(jiffies - server->probed_at) / HZ, - atomic_read(&server->probe_outstanding)); - seq_printf(m, " - ALIST v=%u rsp=%lx f=%lx\n", - alist->version, alist->responded, alist->failed); - for (i = 0; i < alist->nr_addrs; i++) - seq_printf(m, " [%x] %pISpc%s\n", - i, &alist->addrs[i].transport, - alist->preferred == i ? "*" : ""); + atomic_read(&server->active), + server->cell->name); + seq_printf(m, " - info: fl=%lx rtt=%u\n", + server->flags, server->rtt); + seq_printf(m, " - probe: last=%d\n", + (int)(jiffies - server->probed_at) / HZ); + + estate = rcu_dereference(server->endpoint_state); + if (!estate) + goto out; + failed = estate->failed_set; + seq_printf(m, " - ESTATE pq=%x np=%u rsp=%lx f=%lx\n", + estate->probe_seq, atomic_read(&estate->nr_probing), + estate->responsive_set, estate->failed_set); + + alist = estate->addresses; + seq_printf(m, " - ALIST v=%u ap=%u\n", + alist->version, alist->addr_pref_version); + for (i = 0; i < alist->nr_addrs; i++) { + const struct afs_address *addr = &alist->addrs[i]; + + seq_printf(m, " [%x] %pISpc%s rtt=%d err=%d p=%u\n", + i, rxrpc_kernel_remote_addr(addr->peer), + alist->preferred == i ? "*" : + test_bit(i, &failed) ? "!" : "", + rxrpc_kernel_get_srtt(addr->peer), + addr->last_error, addr->prio); + } + +out: return 0; } @@ -681,7 +758,11 @@ int afs_proc_init(struct afs_net *net) &afs_proc_sysname_ops, afs_proc_sysname_write, sizeof(struct seq_net_private), - NULL)) + NULL) || + !proc_create_net_single_write("addr_prefs", 0644, p, + afs_proc_addr_prefs_show, + afs_proc_addr_prefs_write, + NULL)) goto error_tree; net->proc_afs = p; diff --git a/fs/afs/protocol_yfs.h b/fs/afs/protocol_yfs.h index e4cd89c44c46..b2f06c1917c2 100644 --- a/fs/afs/protocol_yfs.h +++ b/fs/afs/protocol_yfs.h @@ -50,6 +50,9 @@ enum YFS_FS_Operations { YFSREMOVEACL = 64171, YFSREMOVEFILE2 = 64173, YFSSTOREOPAQUEACL2 = 64174, + YFSRENAME_REPLACE = 64176, + YFSRENAME_NOREPLACE = 64177, + YFSRENAME_EXCHANGE = 64187, YFSINLINEBULKSTATUS = 64536, /* YFS Fetch multiple file statuses with errors */ YFSFETCHDATA64 = 64537, /* YFS Fetch file data */ YFSSTOREDATA64 = 64538, /* YFS Store file data */ diff --git a/fs/afs/rotate.c b/fs/afs/rotate.c index a840c3588ebb..6a4e7da10fc4 100644 --- a/fs/afs/rotate.c +++ b/fs/afs/rotate.c @@ -13,6 +13,19 @@ #include <linux/sched/signal.h> #include "internal.h" #include "afs_fs.h" +#include "protocol_uae.h" + +void afs_clear_server_states(struct afs_operation *op) +{ + unsigned int i; + + if (op->server_states) { + for (i = 0; i < op->server_list->nr_servers; i++) + afs_put_endpoint_state(op->server_states[i].endpoint_state, + afs_estate_trace_put_server_state); + kfree(op->server_states); + } +} /* * Begin iteration through a server list, starting with the vnode's last used @@ -25,14 +38,41 @@ static bool afs_start_fs_iteration(struct afs_operation *op, void *cb_server; int i; + trace_afs_rotate(op, afs_rotate_trace_start, 0); + read_lock(&op->volume->servers_lock); op->server_list = afs_get_serverlist( rcu_dereference_protected(op->volume->servers, lockdep_is_held(&op->volume->servers_lock))); read_unlock(&op->volume->servers_lock); - op->untried = (1UL << op->server_list->nr_servers) - 1; - op->index = READ_ONCE(op->server_list->preferred); + op->server_states = kcalloc(op->server_list->nr_servers, sizeof(op->server_states[0]), + GFP_KERNEL); + if (!op->server_states) { + afs_op_nomem(op); + trace_afs_rotate(op, afs_rotate_trace_nomem, 0); + return false; + } + + rcu_read_lock(); + for (i = 0; i < op->server_list->nr_servers; i++) { + struct afs_endpoint_state *estate; + struct afs_server_state *s = &op->server_states[i]; + + server = op->server_list->servers[i].server; + estate = rcu_dereference(server->endpoint_state); + s->endpoint_state = afs_get_endpoint_state(estate, + afs_estate_trace_get_server_state); + s->probe_seq = estate->probe_seq; + s->untried_addrs = (1UL << estate->addresses->nr_addrs) - 1; + init_waitqueue_entry(&s->probe_waiter, current); + afs_get_address_preferences(op->net, estate->addresses); + } + rcu_read_unlock(); + + + op->untried_servers = (1UL << op->server_list->nr_servers) - 1; + op->server_index = -1; cb_server = vnode->cb_server; if (cb_server) { @@ -40,7 +80,7 @@ static bool afs_start_fs_iteration(struct afs_operation *op, for (i = 0; i < op->server_list->nr_servers; i++) { server = op->server_list->servers[i].server; if (server == cb_server) { - op->index = i; + op->server_index = i; goto found_interest; } } @@ -50,7 +90,8 @@ static bool afs_start_fs_iteration(struct afs_operation *op, * and have to return an error. */ if (op->flags & AFS_OPERATION_CUR_ONLY) { - op->error = -ESTALE; + afs_op_set_error(op, -ESTALE); + trace_afs_rotate(op, afs_rotate_trace_stale_lock, 0); return false; } @@ -58,7 +99,7 @@ static bool afs_start_fs_iteration(struct afs_operation *op, write_seqlock(&vnode->cb_lock); ASSERTCMP(cb_server, ==, vnode->cb_server); vnode->cb_server = NULL; - if (test_and_clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags)) + if (afs_clear_cb_promise(vnode, afs_cb_promise_clear_rotate_server)) vnode->cb_break++; write_sequnlock(&vnode->cb_lock); } @@ -70,7 +111,7 @@ found_interest: /* * Post volume busy note. */ -static void afs_busy(struct afs_volume *volume, u32 abort_code) +static void afs_busy(struct afs_operation *op, u32 abort_code) { const char *m; @@ -81,7 +122,8 @@ static void afs_busy(struct afs_volume *volume, u32 abort_code) default: m = "busy"; break; } - pr_notice("kAFS: Volume %llu '%s' is %s\n", volume->vid, volume->name, m); + pr_notice("kAFS: Volume %llu '%s' on server %pU is %s\n", + op->volume->vid, op->volume->name, &op->server->uuid, m); } /* @@ -89,10 +131,11 @@ static void afs_busy(struct afs_volume *volume, u32 abort_code) */ static bool afs_sleep_and_retry(struct afs_operation *op) { + trace_afs_rotate(op, afs_rotate_trace_busy_sleep, 0); if (!(op->flags & AFS_OPERATION_UNINTR)) { msleep_interruptible(1000); if (signal_pending(current)) { - op->error = -ERESTARTSYS; + afs_op_set_error(op, -ERESTARTSYS); return false; } } else { @@ -111,62 +154,105 @@ bool afs_select_fileserver(struct afs_operation *op) struct afs_addr_list *alist; struct afs_server *server; struct afs_vnode *vnode = op->file[0].vnode; - struct afs_error e; - u32 rtt; - int error = op->ac.error, i; + unsigned long set, failed; + s32 abort_code = op->call_abort_code; + int best_prio = 0; + int error = op->call_error, addr_index, i, j; - _enter("%lx[%d],%lx[%d],%d,%d", - op->untried, op->index, - op->ac.tried, op->ac.index, - error, op->ac.abort_code); + op->nr_iterations++; + + _enter("OP=%x+%x,%llx,%u{%lx},%u{%lx},%d,%d", + op->debug_id, op->nr_iterations, op->volume->vid, + op->server_index, op->untried_servers, + op->addr_index, op->addr_tried, + error, abort_code); if (op->flags & AFS_OPERATION_STOP) { + trace_afs_rotate(op, afs_rotate_trace_stopped, 0); _leave(" = f [stopped]"); return false; } - op->nr_iterations++; - - /* Evaluate the result of the previous operation, if there was one. */ - switch (error) { - case SHRT_MAX: + if (op->nr_iterations == 0) goto start; + WRITE_ONCE(op->estate->addresses->addrs[op->addr_index].last_error, error); + trace_afs_rotate(op, afs_rotate_trace_iter, op->call_error); + + /* Evaluate the result of the previous operation, if there was one. */ + switch (op->call_error) { case 0: + clear_bit(AFS_SE_VOLUME_OFFLINE, + &op->server_list->servers[op->server_index].flags); + clear_bit(AFS_SE_VOLUME_BUSY, + &op->server_list->servers[op->server_index].flags); + op->cumul_error.responded = true; + + /* We succeeded, but we may need to redo the op from another + * server if we're looking at a set of RO volumes where some of + * the servers have not yet been brought up to date lest we + * regress the data. We only switch to the new version once + * >=50% of the servers are updated. + */ + error = afs_update_volume_state(op); + if (error != 0) { + if (error == 1) { + afs_sleep_and_retry(op); + goto restart_from_beginning; + } + afs_op_set_error(op, error); + goto failed; + } + fallthrough; default: /* Success or local failure. Stop. */ - op->error = error; + afs_op_set_error(op, error); op->flags |= AFS_OPERATION_STOP; + trace_afs_rotate(op, afs_rotate_trace_stop, error); _leave(" = f [okay/local %d]", error); return false; case -ECONNABORTED: /* The far side rejected the operation on some grounds. This * might involve the server being busy or the volume having been moved. + * + * Note that various V* errors should not be sent to a cache manager + * by a fileserver as they should be translated to more modern UAE* + * errors instead. IBM AFS and OpenAFS fileservers, however, do leak + * these abort codes. */ - switch (op->ac.abort_code) { + trace_afs_rotate(op, afs_rotate_trace_aborted, abort_code); + op->cumul_error.responded = true; + switch (abort_code) { case VNOVOL: /* This fileserver doesn't know about the volume. * - May indicate that the VL is wrong - retry once and compare * the results. * - May indicate that the fileserver couldn't attach to the vol. + * - The volume might have been temporarily removed so that it can + * be replaced by a volume restore. "vos" might have ended one + * transaction and has yet to create the next. + * - The volume might not be blessed or might not be in-service + * (administrative action). */ if (op->flags & AFS_OPERATION_VNOVOL) { - op->error = -EREMOTEIO; + afs_op_accumulate_error(op, -EREMOTEIO, abort_code); goto next_server; } write_lock(&op->volume->servers_lock); - op->server_list->vnovol_mask |= 1 << op->index; + op->server_list->vnovol_mask |= 1 << op->server_index; write_unlock(&op->volume->servers_lock); set_bit(AFS_VOLUME_NEEDS_UPDATE, &op->volume->flags); error = afs_check_volume_status(op->volume, op); - if (error < 0) - goto failed_set_error; + if (error < 0) { + afs_op_set_error(op, error); + goto failed; + } if (test_bit(AFS_VOLUME_DELETED, &op->volume->flags)) { - op->error = -ENOMEDIUM; + afs_op_set_error(op, -ENOMEDIUM); goto failed; } @@ -174,7 +260,7 @@ bool afs_select_fileserver(struct afs_operation *op) * it's the fileserver having trouble. */ if (rcu_access_pointer(op->volume->servers) == op->server_list) { - op->error = -EREMOTEIO; + afs_op_accumulate_error(op, -EREMOTEIO, abort_code); goto next_server; } @@ -183,50 +269,99 @@ bool afs_select_fileserver(struct afs_operation *op) _leave(" = t [vnovol]"); return true; - case VSALVAGE: /* TODO: Should this return an error or iterate? */ case VVOLEXISTS: - case VNOSERVICE: case VONLINE: - case VDISKFULL: - case VOVERQUOTA: - op->error = afs_abort_to_error(op->ac.abort_code); + /* These should not be returned from the fileserver. */ + pr_warn("Fileserver returned unexpected abort %d\n", + abort_code); + afs_op_accumulate_error(op, -EREMOTEIO, abort_code); goto next_server; + case VNOSERVICE: + /* Prior to AFS 3.2 VNOSERVICE was returned from the fileserver + * if the volume was neither in-service nor administratively + * blessed. All usage was replaced by VNOVOL because AFS 3.1 and + * earlier cache managers did not handle VNOSERVICE and assumed + * it was the client OSes errno 105. + * + * Starting with OpenAFS 1.4.8 VNOSERVICE was repurposed as the + * fileserver idle dead time error which was sent in place of + * RX_CALL_TIMEOUT (-3). The error was intended to be sent if the + * fileserver took too long to send a reply to the client. + * RX_CALL_TIMEOUT would have caused the cache manager to mark the + * server down whereas VNOSERVICE since AFS 3.2 would cause cache + * manager to temporarily (up to 15 minutes) mark the volume + * instance as unusable. + * + * The idle dead logic resulted in cache inconsistency since a + * state changing call that the cache manager assumed was dead + * could still be processed to completion by the fileserver. This + * logic was removed in OpenAFS 1.8.0 and VNOSERVICE is no longer + * returned. However, many 1.4.8 through 1.6.24 fileservers are + * still in existence. + * + * AuriStorFS fileservers have never returned VNOSERVICE. + * + * VNOSERVICE should be treated as an alias for RX_CALL_TIMEOUT. + */ + case RX_CALL_TIMEOUT: + afs_op_accumulate_error(op, -ETIMEDOUT, abort_code); + goto next_server; + + case VSALVAGING: /* This error should not be leaked to cache managers + * but is from OpenAFS demand attach fileservers. + * It should be treated as an alias for VOFFLINE. + */ + case VSALVAGE: /* VSALVAGE should be treated as a synonym of VOFFLINE */ case VOFFLINE: - if (!test_and_set_bit(AFS_VOLUME_OFFLINE, &op->volume->flags)) { - afs_busy(op->volume, op->ac.abort_code); - clear_bit(AFS_VOLUME_BUSY, &op->volume->flags); + /* The volume is in use by the volserver or another volume utility + * for an operation that might alter the contents. The volume is + * expected to come back but it might take a long time (could be + * days). + */ + if (!test_and_set_bit(AFS_SE_VOLUME_OFFLINE, + &op->server_list->servers[op->server_index].flags)) { + afs_busy(op, abort_code); + clear_bit(AFS_SE_VOLUME_BUSY, + &op->server_list->servers[op->server_index].flags); } if (op->flags & AFS_OPERATION_NO_VSLEEP) { - op->error = -EADV; - goto failed; - } - if (op->flags & AFS_OPERATION_CUR_ONLY) { - op->error = -ESTALE; + afs_op_set_error(op, -EADV); goto failed; } goto busy; - case VSALVAGING: - case VRESTARTING: + case VRESTARTING: /* The fileserver is either shutting down or starting up. */ case VBUSY: - /* Retry after going round all the servers unless we - * have a file lock we need to maintain. + /* The volume is in use by the volserver or another volume + * utility for an operation that is not expected to alter the + * contents of the volume. VBUSY does not need to be returned + * for a ROVOL or BACKVOL bound to an ITBusy volserver + * transaction. The fileserver is permitted to continue serving + * content from ROVOLs and BACKVOLs during an ITBusy transaction + * because the content will not change. However, many fileserver + * releases do return VBUSY for ROVOL and BACKVOL instances under + * many circumstances. + * + * Retry after going round all the servers unless we have a file + * lock we need to maintain. */ if (op->flags & AFS_OPERATION_NO_VSLEEP) { - op->error = -EBUSY; + afs_op_set_error(op, -EBUSY); goto failed; } - if (!test_and_set_bit(AFS_VOLUME_BUSY, &op->volume->flags)) { - afs_busy(op->volume, op->ac.abort_code); - clear_bit(AFS_VOLUME_OFFLINE, &op->volume->flags); + if (!test_and_set_bit(AFS_SE_VOLUME_BUSY, + &op->server_list->servers[op->server_index].flags)) { + afs_busy(op, abort_code); + clear_bit(AFS_SE_VOLUME_OFFLINE, + &op->server_list->servers[op->server_index].flags); } busy: if (op->flags & AFS_OPERATION_CUR_ONLY) { if (!afs_sleep_and_retry(op)) goto failed; - /* Retry with same server & address */ + /* Retry with same server & address */ _leave(" = t [vbusy]"); return true; } @@ -243,7 +378,7 @@ bool afs_select_fileserver(struct afs_operation *op) * honour, just in case someone sets up a loop. */ if (op->flags & AFS_OPERATION_VMOVED) { - op->error = -EREMOTEIO; + afs_op_set_error(op, -EREMOTEIO); goto failed; } op->flags |= AFS_OPERATION_VMOVED; @@ -251,8 +386,10 @@ bool afs_select_fileserver(struct afs_operation *op) set_bit(AFS_VOLUME_WAIT, &op->volume->flags); set_bit(AFS_VOLUME_NEEDS_UPDATE, &op->volume->flags); error = afs_check_volume_status(op->volume, op); - if (error < 0) - goto failed_set_error; + if (error < 0) { + afs_op_set_error(op, error); + goto failed; + } /* If the server list didn't change, then the VLDB is * out of sync with the fileservers. This is hopefully @@ -264,22 +401,60 @@ bool afs_select_fileserver(struct afs_operation *op) * TODO: Retry a few times with sleeps. */ if (rcu_access_pointer(op->volume->servers) == op->server_list) { - op->error = -ENOMEDIUM; + afs_op_accumulate_error(op, -ENOMEDIUM, abort_code); goto failed; } goto restart_from_beginning; + case UAEIO: + case VIO: + afs_op_accumulate_error(op, -EREMOTEIO, abort_code); + if (op->volume->type != AFSVL_RWVOL) + goto next_server; + goto failed; + + case VDISKFULL: + case UAENOSPC: + /* The partition is full. Only applies to RWVOLs. + * Translate locally and return ENOSPC. + * No replicas to failover to. + */ + afs_op_set_error(op, -ENOSPC); + goto failed_but_online; + + case VOVERQUOTA: + case UAEDQUOT: + /* Volume is full. Only applies to RWVOLs. + * Translate locally and return EDQUOT. + * No replicas to failover to. + */ + afs_op_set_error(op, -EDQUOT); + goto failed_but_online; + + case RX_INVALID_OPERATION: + case RXGEN_OPCODE: + /* Handle downgrading to an older operation. */ + afs_op_set_error(op, -ENOTSUPP); + if (op->flags & AFS_OPERATION_DOWNGRADE) { + op->flags &= ~AFS_OPERATION_DOWNGRADE; + goto go_again; + } + goto failed_but_online; + default: - clear_bit(AFS_VOLUME_OFFLINE, &op->volume->flags); - clear_bit(AFS_VOLUME_BUSY, &op->volume->flags); - op->error = afs_abort_to_error(op->ac.abort_code); + afs_op_accumulate_error(op, error, abort_code); + failed_but_online: + clear_bit(AFS_SE_VOLUME_OFFLINE, + &op->server_list->servers[op->server_index].flags); + clear_bit(AFS_SE_VOLUME_BUSY, + &op->server_list->servers[op->server_index].flags); goto failed; } case -ETIMEDOUT: case -ETIME: - if (op->error != -EDESTADDRREQ) + if (afs_op_error(op) != -EDESTADDRREQ) goto iterate_address; fallthrough; case -ERFKILL: @@ -289,7 +464,7 @@ bool afs_select_fileserver(struct afs_operation *op) case -EHOSTDOWN: case -ECONNREFUSED: _debug("no conn"); - op->error = error; + afs_op_accumulate_error(op, error, 0); goto iterate_address; case -ENETRESET: @@ -298,24 +473,31 @@ bool afs_select_fileserver(struct afs_operation *op) fallthrough; case -ECONNRESET: _debug("call reset"); - op->error = error; + afs_op_set_error(op, error); goto failed; } restart_from_beginning: + trace_afs_rotate(op, afs_rotate_trace_restart, 0); _debug("restart"); - afs_end_cursor(&op->ac); + op->estate = NULL; op->server = NULL; + afs_clear_server_states(op); + op->server_states = NULL; afs_put_serverlist(op->net, op->server_list); op->server_list = NULL; start: _debug("start"); + ASSERTCMP(op->estate, ==, NULL); /* See if we need to do an update of the volume record. Note that the * volume may have moved or even have been deleted. */ error = afs_check_volume_status(op->volume, op); - if (error < 0) - goto failed_set_error; + trace_afs_rotate(op, afs_rotate_trace_check_vol_status, error); + if (error < 0) { + afs_op_set_error(op, error); + goto failed; + } if (!afs_start_fs_iteration(op, vnode)) goto failed; @@ -323,52 +505,85 @@ start: _debug("__ VOL %llx __", op->volume->vid); pick_server: - _debug("pick [%lx]", op->untried); + _debug("pick [%lx]", op->untried_servers); + ASSERTCMP(op->estate, ==, NULL); - error = afs_wait_for_fs_probes(op->server_list, op->untried); - if (error < 0) - goto failed_set_error; + error = afs_wait_for_fs_probes(op, op->server_states, + !(op->flags & AFS_OPERATION_UNINTR)); + switch (error) { + case 0: /* No untried responsive servers and no outstanding probes */ + trace_afs_rotate(op, afs_rotate_trace_probe_none, 0); + goto no_more_servers; + case 1: /* Got a response */ + trace_afs_rotate(op, afs_rotate_trace_probe_response, 0); + break; + case 2: /* Probe data superseded */ + trace_afs_rotate(op, afs_rotate_trace_probe_superseded, 0); + goto restart_from_beginning; + default: + trace_afs_rotate(op, afs_rotate_trace_probe_error, error); + afs_op_set_error(op, error); + goto failed; + } - /* Pick the untried server with the lowest RTT. If we have outstanding - * callbacks, we stick with the server we're already using if we can. + /* Pick the untried server with the highest priority untried endpoint. + * If we have outstanding callbacks, we stick with the server we're + * already using if we can. */ if (op->server) { - _debug("server %u", op->index); - if (test_bit(op->index, &op->untried)) + _debug("server %u", op->server_index); + if (test_bit(op->server_index, &op->untried_servers)) goto selected_server; op->server = NULL; _debug("no server"); } - op->index = -1; - rtt = U32_MAX; + rcu_read_lock(); + op->server_index = -1; + best_prio = -1; for (i = 0; i < op->server_list->nr_servers; i++) { - struct afs_server *s = op->server_list->servers[i].server; + struct afs_endpoint_state *es; + struct afs_server_entry *se = &op->server_list->servers[i]; + struct afs_addr_list *sal; + struct afs_server *s = se->server; - if (!test_bit(i, &op->untried) || + if (!test_bit(i, &op->untried_servers) || + test_bit(AFS_SE_EXCLUDED, &se->flags) || !test_bit(AFS_SERVER_FL_RESPONDING, &s->flags)) continue; - if (s->probe.rtt < rtt) { - op->index = i; - rtt = s->probe.rtt; + es = op->server_states[i].endpoint_state; + sal = es->addresses; + + afs_get_address_preferences_rcu(op->net, sal); + for (j = 0; j < sal->nr_addrs; j++) { + if (es->failed_set & (1 << j)) + continue; + if (!sal->addrs[j].peer) + continue; + if (sal->addrs[j].prio > best_prio) { + op->server_index = i; + best_prio = sal->addrs[j].prio; + } } } + rcu_read_unlock(); - if (op->index == -1) + if (op->server_index == -1) goto no_more_servers; selected_server: - _debug("use %d", op->index); - __clear_bit(op->index, &op->untried); + trace_afs_rotate(op, afs_rotate_trace_selected_server, best_prio); + _debug("use %d prio %u", op->server_index, best_prio); + __clear_bit(op->server_index, &op->untried_servers); /* We're starting on a different fileserver from the list. We need to * check it, create a callback intercept, find its address list and * probe its capabilities before we use it. */ - ASSERTCMP(op->ac.alist, ==, NULL); - server = op->server_list->servers[op->index].server; + ASSERTCMP(op->estate, ==, NULL); + server = op->server_list->servers[op->server_index].server; - if (!afs_check_server_record(op, server)) + if (!afs_check_server_record(op, server, op->key)) goto failed; _debug("USING SERVER: %pU", &server->uuid); @@ -377,58 +592,81 @@ selected_server: op->server = server; if (vnode->cb_server != server) { vnode->cb_server = server; - vnode->cb_s_break = server->cb_s_break; - vnode->cb_fs_s_break = atomic_read(&server->cell->fs_s_break); - vnode->cb_v_break = vnode->volume->cb_v_break; - clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags); + vnode->cb_v_check = atomic_read(&vnode->volume->cb_v_break); + afs_clear_cb_promise(vnode, afs_cb_promise_clear_server_change); } - read_lock(&server->fs_lock); - alist = rcu_dereference_protected(server->addresses, - lockdep_is_held(&server->fs_lock)); - afs_get_addrlist(alist); - read_unlock(&server->fs_lock); - retry_server: - memset(&op->ac, 0, sizeof(op->ac)); - - if (!op->ac.alist) - op->ac.alist = alist; - else - afs_put_addrlist(alist); - - op->ac.index = -1; + op->addr_tried = 0; + op->addr_index = -1; iterate_address: - ASSERT(op->ac.alist); /* Iterate over the current server's address list to try and find an * address on which it will respond to us. */ - if (!afs_iterate_addresses(&op->ac)) - goto out_of_addresses; + op->estate = op->server_states[op->server_index].endpoint_state; + set = READ_ONCE(op->estate->responsive_set); + failed = READ_ONCE(op->estate->failed_set); + _debug("iterate ES=%x rs=%lx fs=%lx", op->estate->probe_seq, set, failed); + set &= ~(failed | op->addr_tried); + trace_afs_rotate(op, afs_rotate_trace_iterate_addr, set); + if (!set) + goto wait_for_more_probe_results; + + alist = op->estate->addresses; + best_prio = -1; + addr_index = 0; + for (i = 0; i < alist->nr_addrs; i++) { + if (!(set & (1 << i))) + continue; + if (alist->addrs[i].prio > best_prio) { + addr_index = i; + best_prio = alist->addrs[i].prio; + } + } - _debug("address [%u] %u/%u %pISp", - op->index, op->ac.index, op->ac.alist->nr_addrs, - &op->ac.alist->addrs[op->ac.index].transport); + alist->preferred = addr_index; + + op->addr_index = addr_index; + set_bit(addr_index, &op->addr_tried); + _debug("address [%u] %u/%u %pISp", + op->server_index, addr_index, alist->nr_addrs, + rxrpc_kernel_remote_addr(alist->addrs[op->addr_index].peer)); +go_again: + op->volsync.creation = TIME64_MIN; + op->volsync.update = TIME64_MIN; + op->call_responded = false; _leave(" = t"); return true; -out_of_addresses: +wait_for_more_probe_results: + error = afs_wait_for_one_fs_probe(op->server, op->estate, op->addr_tried, + !(op->flags & AFS_OPERATION_UNINTR)); + if (error == 1) + goto iterate_address; + if (!error) + goto restart_from_beginning; + /* We've now had a failure to respond on all of a server's addresses - * immediately probe them again and consider retrying the server. */ + trace_afs_rotate(op, afs_rotate_trace_probe_fileserver, 0); afs_probe_fileserver(op->net, op->server); if (op->flags & AFS_OPERATION_RETRY_SERVER) { - alist = op->ac.alist; - error = afs_wait_for_one_fs_probe( - op->server, !(op->flags & AFS_OPERATION_UNINTR)); + error = afs_wait_for_one_fs_probe(op->server, op->estate, op->addr_tried, + !(op->flags & AFS_OPERATION_UNINTR)); switch (error) { - case 0: + case 1: op->flags &= ~AFS_OPERATION_RETRY_SERVER; + trace_afs_rotate(op, afs_rotate_trace_retry_server, 1); goto retry_server; + case 0: + trace_afs_rotate(op, afs_rotate_trace_retry_server, 0); + goto restart_from_beginning; case -ERESTARTSYS: - goto failed_set_error; + afs_op_set_error(op, error); + goto failed; case -ETIME: case -EDESTADDRREQ: goto next_server; @@ -436,34 +674,38 @@ out_of_addresses: } next_server: + trace_afs_rotate(op, afs_rotate_trace_next_server, 0); _debug("next"); - afs_end_cursor(&op->ac); + op->estate = NULL; goto pick_server; no_more_servers: /* That's all the servers poked to no good effect. Try again if some * of them were busy. */ - if (op->flags & AFS_OPERATION_VBUSY) + trace_afs_rotate(op, afs_rotate_trace_no_more_servers, 0); + if (op->flags & AFS_OPERATION_VBUSY) { + afs_sleep_and_retry(op); + op->flags &= ~AFS_OPERATION_VBUSY; goto restart_from_beginning; + } - e.error = -EDESTADDRREQ; - e.responded = false; + rcu_read_lock(); for (i = 0; i < op->server_list->nr_servers; i++) { - struct afs_server *s = op->server_list->servers[i].server; + struct afs_endpoint_state *estate; - afs_prioritise_error(&e, READ_ONCE(s->probe.error), - s->probe.abort_code); + estate = op->server_states[i].endpoint_state; + error = READ_ONCE(estate->error); + if (error < 0) + afs_op_accumulate_error(op, error, estate->abort_code); } + rcu_read_unlock(); - error = e.error; - -failed_set_error: - op->error = error; failed: + trace_afs_rotate(op, afs_rotate_trace_failed, 0); op->flags |= AFS_OPERATION_STOP; - afs_end_cursor(&op->ac); - _leave(" = f [failed %d]", op->error); + op->estate = NULL; + _leave(" = f [failed %d]", afs_op_error(op)); return false; } @@ -482,37 +724,40 @@ void afs_dump_edestaddrreq(const struct afs_operation *op) rcu_read_lock(); pr_notice("EDESTADDR occurred\n"); - pr_notice("FC: cbb=%x cbb2=%x fl=%x err=%hd\n", + pr_notice("OP: cbb=%x cbb2=%x fl=%x err=%hd\n", op->file[0].cb_break_before, - op->file[1].cb_break_before, op->flags, op->error); - pr_notice("FC: ut=%lx ix=%d ni=%u\n", - op->untried, op->index, op->nr_iterations); + op->file[1].cb_break_before, op->flags, op->cumul_error.error); + pr_notice("OP: ut=%lx ix=%d ni=%u\n", + op->untried_servers, op->server_index, op->nr_iterations); + pr_notice("OP: call er=%d ac=%d r=%u\n", + op->call_error, op->call_abort_code, op->call_responded); if (op->server_list) { const struct afs_server_list *sl = op->server_list; - pr_notice("FC: SL nr=%u pr=%u vnov=%hx\n", - sl->nr_servers, sl->preferred, sl->vnovol_mask); + + pr_notice("FC: SL nr=%u vnov=%hx\n", + sl->nr_servers, sl->vnovol_mask); for (i = 0; i < sl->nr_servers; i++) { const struct afs_server *s = sl->servers[i].server; + const struct afs_endpoint_state *e = + rcu_dereference(s->endpoint_state); + const struct afs_addr_list *a = e->addresses; + pr_notice("FC: server fl=%lx av=%u %pU\n", s->flags, s->addr_version, &s->uuid); - if (s->addresses) { - const struct afs_addr_list *a = - rcu_dereference(s->addresses); + pr_notice("FC: - pq=%x R=%lx F=%lx\n", + e->probe_seq, e->responsive_set, e->failed_set); + if (a) { pr_notice("FC: - av=%u nr=%u/%u/%u pr=%u\n", a->version, a->nr_ipv4, a->nr_addrs, a->max_addrs, a->preferred); - pr_notice("FC: - R=%lx F=%lx\n", - a->responded, a->failed); - if (a == op->ac.alist) + if (a == e->addresses) pr_notice("FC: - current\n"); } } } - pr_notice("AC: t=%lx ax=%u ac=%d er=%d r=%u ni=%u\n", - op->ac.tried, op->ac.index, op->ac.abort_code, op->ac.error, - op->ac.responded, op->ac.nr_iterations); + pr_notice("AC: t=%lx ax=%d\n", op->addr_tried, op->addr_index); rcu_read_unlock(); } diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index 7817e2b860e5..bf0e4ea0aafd 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -18,13 +18,23 @@ struct workqueue_struct *afs_async_calls; +static void afs_deferred_free_worker(struct work_struct *work); static void afs_wake_up_call_waiter(struct sock *, struct rxrpc_call *, unsigned long); static void afs_wake_up_async_call(struct sock *, struct rxrpc_call *, unsigned long); static void afs_process_async_call(struct work_struct *); static void afs_rx_new_call(struct sock *, struct rxrpc_call *, unsigned long); static void afs_rx_discard_new_call(struct rxrpc_call *, unsigned long); +static void afs_rx_attach(struct rxrpc_call *rxcall, unsigned long user_call_ID); +static void afs_rx_notify_oob(struct sock *sk, struct sk_buff *oob); static int afs_deliver_cm_op_id(struct afs_call *); +static const struct rxrpc_kernel_ops afs_rxrpc_callback_ops = { + .notify_new_call = afs_rx_new_call, + .discard_new_call = afs_rx_discard_new_call, + .user_attach_call = afs_rx_attach, + .notify_oob = afs_rx_notify_oob, +}; + /* asynchronous incoming call initial processing */ static const struct afs_call_type afs_RXCMxxxx = { .name = "CB.xxxx", @@ -48,6 +58,7 @@ int afs_open_socket(struct afs_net *net) goto error_1; socket->sk->sk_allocation = GFP_NOFS; + socket->sk->sk_user_data = net; /* bind the callback manager's address to make this a server socket */ memset(&srx, 0, sizeof(srx)); @@ -63,16 +74,24 @@ int afs_open_socket(struct afs_net *net) if (ret < 0) goto error_2; - ret = kernel_bind(socket, (struct sockaddr *) &srx, sizeof(srx)); + ret = rxrpc_sock_set_manage_response(socket->sk, true); + if (ret < 0) + goto error_2; + + ret = afs_create_token_key(net, socket); + if (ret < 0) + pr_err("Couldn't create RxGK CM key: %d\n", ret); + + ret = kernel_bind(socket, (struct sockaddr_unsized *) &srx, sizeof(srx)); if (ret == -EADDRINUSE) { srx.transport.sin6.sin6_port = 0; - ret = kernel_bind(socket, (struct sockaddr *) &srx, sizeof(srx)); + ret = kernel_bind(socket, (struct sockaddr_unsized *) &srx, sizeof(srx)); } if (ret < 0) goto error_2; srx.srx_service = YFS_CM_SERVICE; - ret = kernel_bind(socket, (struct sockaddr *) &srx, sizeof(srx)); + ret = kernel_bind(socket, (struct sockaddr_unsized *) &srx, sizeof(srx)); if (ret < 0) goto error_2; @@ -83,8 +102,7 @@ int afs_open_socket(struct afs_net *net) * it sends back to us. */ - rxrpc_kernel_new_call_notification(socket, afs_rx_new_call, - afs_rx_discard_new_call); + rxrpc_kernel_set_notifications(socket, &afs_rxrpc_callback_ops); ret = kernel_listen(socket, INT_MAX); if (ret < 0) @@ -124,7 +142,9 @@ void afs_close_socket(struct afs_net *net) kernel_sock_shutdown(net->socket, SHUT_RDWR); flush_workqueue(afs_async_calls); + net->socket->sk->sk_user_data = NULL; sock_release(net->socket); + key_put(net->fs_cm_token_key); _debug("dework"); _leave(""); @@ -148,7 +168,9 @@ static struct afs_call *afs_alloc_call(struct afs_net *net, call->net = net; call->debug_id = atomic_inc_return(&rxrpc_debug_id); refcount_set(&call->ref, 1); - INIT_WORK(&call->async_work, afs_process_async_call); + INIT_WORK(&call->async_work, type->async_rx ?: afs_process_async_call); + INIT_WORK(&call->work, call->type->work); + INIT_WORK(&call->free_work, afs_deferred_free_worker); init_waitqueue_head(&call->waitq); spin_lock_init(&call->state_lock); call->iter = &call->def_iter; @@ -159,6 +181,36 @@ static struct afs_call *afs_alloc_call(struct afs_net *net, return call; } +static void afs_free_call(struct afs_call *call) +{ + struct afs_net *net = call->net; + int o; + + ASSERT(!work_pending(&call->async_work)); + + rxrpc_kernel_put_peer(call->peer); + + if (call->rxcall) { + rxrpc_kernel_shutdown_call(net->socket, call->rxcall); + rxrpc_kernel_put_call(net->socket, call->rxcall); + call->rxcall = NULL; + } + if (call->type->destructor) + call->type->destructor(call); + + afs_unuse_server_notime(call->net, call->server, afs_server_trace_unuse_call); + kfree(call->request); + + o = atomic_read(&net->nr_outstanding_calls); + trace_afs_call(call->debug_id, afs_call_trace_free, 0, o, + __builtin_return_address(0)); + kfree(call); + + o = atomic_dec_return(&net->nr_outstanding_calls); + if (o == 0) + wake_up_var(&net->nr_outstanding_calls); +} + /* * Dispose of a reference on a call. */ @@ -173,43 +225,34 @@ void afs_put_call(struct afs_call *call) o = atomic_read(&net->nr_outstanding_calls); trace_afs_call(debug_id, afs_call_trace_put, r - 1, o, __builtin_return_address(0)); + if (zero) + afs_free_call(call); +} - if (zero) { - ASSERT(!work_pending(&call->async_work)); - ASSERT(call->type->name != NULL); - - if (call->rxcall) { - rxrpc_kernel_end_call(net->socket, call->rxcall); - call->rxcall = NULL; - } - if (call->type->destructor) - call->type->destructor(call); - - afs_unuse_server_notime(call->net, call->server, afs_server_trace_put_call); - afs_put_addrlist(call->alist); - kfree(call->request); - - trace_afs_call(call->debug_id, afs_call_trace_free, 0, o, - __builtin_return_address(0)); - kfree(call); +static void afs_deferred_free_worker(struct work_struct *work) +{ + struct afs_call *call = container_of(work, struct afs_call, free_work); - o = atomic_dec_return(&net->nr_outstanding_calls); - if (o == 0) - wake_up_var(&net->nr_outstanding_calls); - } + afs_free_call(call); } -static struct afs_call *afs_get_call(struct afs_call *call, - enum afs_call_trace why) +/* + * Dispose of a reference on a call, deferring the cleanup to a workqueue + * to avoid lock recursion. + */ +void afs_deferred_put_call(struct afs_call *call) { - int r; - - __refcount_inc(&call->ref, &r); + struct afs_net *net = call->net; + unsigned int debug_id = call->debug_id; + bool zero; + int r, o; - trace_afs_call(call->debug_id, why, r + 1, - atomic_read(&call->net->nr_outstanding_calls), + zero = __refcount_dec_and_test(&call->ref, &r); + o = atomic_read(&net->nr_outstanding_calls); + trace_afs_call(debug_id, afs_call_trace_put, r - 1, o, __builtin_return_address(0)); - return call; + if (zero) + schedule_work(&call->free_work); } /* @@ -218,8 +261,6 @@ static struct afs_call *afs_get_call(struct afs_call *call, static void afs_queue_call_work(struct afs_call *call) { if (call->type->work) { - INIT_WORK(&call->work, call->type->work); - afs_get_call(call, afs_call_trace_work); if (!queue_work(afs_wq, &call->work)) afs_put_call(call); @@ -293,9 +334,8 @@ static void afs_notify_end_request_tx(struct sock *sock, * Initiate a call and synchronously queue up the parameters for dispatch. Any * error is stored into the call struct, which the caller must check for. */ -void afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call, gfp_t gfp) +void afs_make_call(struct afs_call *call, gfp_t gfp) { - struct sockaddr_rxrpc *srx = &ac->alist->addrs[ac->index]; struct rxrpc_call *rxcall; struct msghdr msg; struct kvec iov[1]; @@ -303,7 +343,7 @@ void afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call, gfp_t gfp) s64 tx_total_len; int ret; - _enter(",{%pISp},", &srx->transport); + _enter(",{%pISp+%u},", rxrpc_kernel_remote_addr(call->peer), call->service_id); ASSERT(call->type != NULL); ASSERT(call->type->name != NULL); @@ -312,8 +352,7 @@ void afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call, gfp_t gfp) call, call->type->name, key_serial(call->key), atomic_read(&call->net->nr_outstanding_calls)); - call->addr_ix = ac->index; - call->alist = afs_get_addrlist(ac->alist); + trace_afs_make_call(call); /* Work out the length we're going to transmit. This is awkward for * calls such as FS.StoreData where there's an extra injection of data @@ -332,12 +371,15 @@ void afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call, gfp_t gfp) } /* create a call */ - rxcall = rxrpc_kernel_begin_call(call->net->socket, srx, call->key, + rxcall = rxrpc_kernel_begin_call(call->net->socket, call->peer, call->key, (unsigned long)call, - tx_total_len, gfp, + tx_total_len, + call->max_lifespan, + gfp, (call->async ? afs_wake_up_async_call : afs_wake_up_call_waiter), + call->service_id, call->upgrade, (call->intr ? RXRPC_PREINTERRUPTIBLE : RXRPC_UNINTERRUPTIBLE), @@ -349,10 +391,6 @@ void afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call, gfp_t gfp) } call->rxcall = rxcall; - - if (call->max_lifespan) - rxrpc_kernel_set_max_life(call->net->socket, rxcall, - call->max_lifespan); call->issue_time = ktime_get_real(); /* send the request */ @@ -391,46 +429,50 @@ void afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call, gfp_t gfp) /* Note that at this point, we may have received the reply or an abort * - and an asynchronous call may already have completed. * - * afs_wait_for_call_to_complete(call, ac) + * afs_wait_for_call_to_complete(call) * must be called to synchronously clean up. */ return; error_do_abort: - if (ret != -ECONNABORTED) { + if (ret != -ECONNABORTED) rxrpc_kernel_abort_call(call->net->socket, rxcall, RX_USER_ABORT, ret, afs_abort_send_data_error); - } else { + if (call->async) { + afs_see_call(call, afs_call_trace_async_abort); + return; + } + + if (ret == -ECONNABORTED) { len = 0; iov_iter_kvec(&msg.msg_iter, ITER_DEST, NULL, 0, 0); rxrpc_kernel_recv_data(call->net->socket, rxcall, &msg.msg_iter, &len, false, &call->abort_code, &call->service_id); - ac->abort_code = call->abort_code; - ac->responded = true; + call->responded = true; } call->error = ret; trace_afs_call_done(call); error_kill_call: - if (call->type->done) - call->type->done(call); + if (call->async) + afs_see_call(call, afs_call_trace_async_kill); + if (call->type->immediate_cancel) + call->type->immediate_cancel(call); /* We need to dispose of the extra ref we grabbed for an async call. * The call, however, might be queued on afs_async_calls and we need to * make sure we don't get any more notifications that might requeue it. */ - if (call->rxcall) { - rxrpc_kernel_end_call(call->net->socket, call->rxcall); - call->rxcall = NULL; - } + if (call->rxcall) + rxrpc_kernel_shutdown_call(call->net->socket, call->rxcall); if (call->async) { if (cancel_work_sync(&call->async_work)) afs_put_call(call); - afs_put_call(call); + afs_set_call_complete(call, ret, 0); } - ac->error = ret; + call->error = ret; call->state = AFS_CALL_COMPLETE; _leave(" = %d", ret); } @@ -464,14 +506,14 @@ static void afs_log_error(struct afs_call *call, s32 remote_abort) max = m + 1; pr_notice("kAFS: Peer reported %s failure on %s [%pISp]\n", msg, call->type->name, - &call->alist->addrs[call->addr_ix].transport); + rxrpc_kernel_remote_addr(call->peer)); } } /* * deliver messages to a call */ -static void afs_deliver_to_call(struct afs_call *call) +void afs_deliver_to_call(struct afs_call *call) { enum afs_call_state state; size_t len; @@ -511,6 +553,7 @@ static void afs_deliver_to_call(struct afs_call *call) ret = -EBADMSG; switch (ret) { case 0: + call->responded = true; afs_queue_call_work(call); if (state == AFS_CALL_CL_PROC_REPLY) { if (call->op) @@ -525,9 +568,11 @@ static void afs_deliver_to_call(struct afs_call *call) goto out; case -ECONNABORTED: ASSERTCMP(state, ==, AFS_CALL_COMPLETE); + call->responded = true; afs_log_error(call, call->abort_code); goto done; case -ENOTSUPP: + call->responded = true; abort_code = RXGEN_OPCODE; rxrpc_kernel_abort_call(call->net->socket, call->rxcall, abort_code, ret, @@ -569,55 +614,50 @@ local_abort: abort_code = 0; call_complete: afs_set_call_complete(call, ret, remote_abort); - state = AFS_CALL_COMPLETE; goto done; } /* - * Wait synchronously for a call to complete and clean up the call struct. + * Wait synchronously for a call to complete. */ -long afs_wait_for_call_to_complete(struct afs_call *call, - struct afs_addr_cursor *ac) +void afs_wait_for_call_to_complete(struct afs_call *call) { - long ret; bool rxrpc_complete = false; - DECLARE_WAITQUEUE(myself, current); - _enter(""); - ret = call->error; - if (ret < 0) - goto out; + if (!afs_check_call_state(call, AFS_CALL_COMPLETE)) { + DECLARE_WAITQUEUE(myself, current); + + add_wait_queue(&call->waitq, &myself); + for (;;) { + set_current_state(TASK_UNINTERRUPTIBLE); + + /* deliver any messages that are in the queue */ + if (!afs_check_call_state(call, AFS_CALL_COMPLETE) && + call->need_attention) { + call->need_attention = false; + __set_current_state(TASK_RUNNING); + afs_deliver_to_call(call); + continue; + } - add_wait_queue(&call->waitq, &myself); - for (;;) { - set_current_state(TASK_UNINTERRUPTIBLE); - - /* deliver any messages that are in the queue */ - if (!afs_check_call_state(call, AFS_CALL_COMPLETE) && - call->need_attention) { - call->need_attention = false; - __set_current_state(TASK_RUNNING); - afs_deliver_to_call(call); - continue; - } + if (afs_check_call_state(call, AFS_CALL_COMPLETE)) + break; - if (afs_check_call_state(call, AFS_CALL_COMPLETE)) - break; + if (!rxrpc_kernel_check_life(call->net->socket, call->rxcall)) { + /* rxrpc terminated the call. */ + rxrpc_complete = true; + break; + } - if (!rxrpc_kernel_check_life(call->net->socket, call->rxcall)) { - /* rxrpc terminated the call. */ - rxrpc_complete = true; - break; + schedule(); } - schedule(); + remove_wait_queue(&call->waitq, &myself); + __set_current_state(TASK_RUNNING); } - remove_wait_queue(&call->waitq, &myself); - __set_current_state(TASK_RUNNING); - if (!afs_check_call_state(call, AFS_CALL_COMPLETE)) { if (rxrpc_complete) { afs_set_call_complete(call, call->error, call->abort_code); @@ -630,29 +670,6 @@ long afs_wait_for_call_to_complete(struct afs_call *call, afs_set_call_complete(call, -EINTR, 0); } } - - spin_lock_bh(&call->state_lock); - ac->abort_code = call->abort_code; - ac->error = call->error; - spin_unlock_bh(&call->state_lock); - - ret = ac->error; - switch (ret) { - case 0: - ret = call->ret0; - call->ret0 = 0; - - fallthrough; - case -ECONNABORTED: - ac->responded = true; - break; - } - -out: - _debug("call complete"); - afs_put_call(call); - _leave(" = %p", (void *)ret); - return ret; } /* @@ -668,7 +685,8 @@ static void afs_wake_up_call_waiter(struct sock *sk, struct rxrpc_call *rxcall, } /* - * wake up an asynchronous call + * Wake up an asynchronous call. The caller is holding the call notify + * spinlock around this, so we can't call afs_put_call(). */ static void afs_wake_up_async_call(struct sock *sk, struct rxrpc_call *rxcall, unsigned long call_user_ID) @@ -685,7 +703,7 @@ static void afs_wake_up_async_call(struct sock *sk, struct rxrpc_call *rxcall, __builtin_return_address(0)); if (!queue_work(afs_async_calls, &call->async_work)) - afs_put_call(call); + afs_deferred_put_call(call); } } @@ -739,7 +757,6 @@ void afs_charge_preallocation(struct work_struct *work) if (rxrpc_kernel_charge_accept(net->socket, afs_wake_up_async_call, - afs_rx_attach, (unsigned long)call, GFP_KERNEL, call->debug_id) < 0) @@ -767,8 +784,14 @@ static void afs_rx_discard_new_call(struct rxrpc_call *rxcall, static void afs_rx_new_call(struct sock *sk, struct rxrpc_call *rxcall, unsigned long user_call_ID) { + struct afs_call *call = (struct afs_call *)user_call_ID; struct afs_net *net = afs_sock2net(sk); + call->peer = rxrpc_kernel_get_call_peer(sk->sk_socket, call->rxcall); + call->server = afs_find_server(call->peer); + if (!call->server) + trace_afs_cm_no_server(call, rxrpc_kernel_remote_srx(call->peer)); + queue_work(afs_wq, &net->charge_preallocation_work); } @@ -795,9 +818,14 @@ static int afs_deliver_cm_op_id(struct afs_call *call) if (!afs_cm_incoming_call(call)) return -ENOTSUPP; + call->security_ix = rxrpc_kernel_query_call_security(call->rxcall, + &call->service_id, + &call->enctype); + trace_afs_cb_call(call); + call->work.func = call->type->work; - /* pass responsibility for the remainer of this message off to the + /* pass responsibility for the remainder of this message off to the * cache manager op */ return call->type->deliver(call); } @@ -946,3 +974,13 @@ noinline int afs_protocol_error(struct afs_call *call, call->unmarshalling_error = true; return -EBADMSG; } + +/* + * Wake up OOB notification processing. + */ +static void afs_rx_notify_oob(struct sock *sk, struct sk_buff *oob) +{ + struct afs_net *net = sk->sk_user_data; + + schedule_work(&net->rx_oob_work); +} diff --git a/fs/afs/security.c b/fs/afs/security.c index 7c6a63a30394..55ddce94af03 100644 --- a/fs/afs/security.c +++ b/fs/afs/security.c @@ -16,6 +16,31 @@ static DEFINE_HASHTABLE(afs_permits_cache, 10); static DEFINE_SPINLOCK(afs_permits_lock); +static DEFINE_MUTEX(afs_key_lock); + +/* + * Allocate a key to use as a placeholder for anonymous user security. + */ +static int afs_alloc_anon_key(struct afs_cell *cell) +{ + struct key *key; + + mutex_lock(&afs_key_lock); + key = cell->anonymous_key; + if (!key) { + key = rxrpc_get_null_key(cell->key_desc); + if (!IS_ERR(key)) + cell->anonymous_key = key; + } + mutex_unlock(&afs_key_lock); + + if (IS_ERR(key)) + return PTR_ERR(key); + + _debug("anon key %p{%x}", + cell->anonymous_key, key_serial(cell->anonymous_key)); + return 0; +} /* * get a key @@ -23,11 +48,12 @@ static DEFINE_SPINLOCK(afs_permits_lock); struct key *afs_request_key(struct afs_cell *cell) { struct key *key; + int ret; - _enter("{%x}", key_serial(cell->anonymous_key)); + _enter("{%s}", cell->key_desc); - _debug("key %s", cell->anonymous_key->description); - key = request_key_net(&key_type_rxrpc, cell->anonymous_key->description, + _debug("key %s", cell->key_desc); + key = request_key_net(&key_type_rxrpc, cell->key_desc, cell->net->net, NULL); if (IS_ERR(key)) { if (PTR_ERR(key) != -ENOKEY) { @@ -35,6 +61,12 @@ struct key *afs_request_key(struct afs_cell *cell) return key; } + if (!cell->anonymous_key) { + ret = afs_alloc_anon_key(cell); + if (ret < 0) + return ERR_PTR(ret); + } + /* act as anonymous user */ _leave(" = {%x} [anon]", key_serial(cell->anonymous_key)); return key_get(cell->anonymous_key); @@ -52,11 +84,10 @@ struct key *afs_request_key_rcu(struct afs_cell *cell) { struct key *key; - _enter("{%x}", key_serial(cell->anonymous_key)); + _enter("{%s}", cell->key_desc); - _debug("key %s", cell->anonymous_key->description); - key = request_key_net_rcu(&key_type_rxrpc, - cell->anonymous_key->description, + _debug("key %s", cell->key_desc); + key = request_key_net_rcu(&key_type_rxrpc, cell->key_desc, cell->net->net); if (IS_ERR(key)) { if (PTR_ERR(key) != -ENOKEY) { @@ -65,6 +96,8 @@ struct key *afs_request_key_rcu(struct afs_cell *cell) } /* act as anonymous user */ + if (!cell->anonymous_key) + return NULL; /* Need to allocate */ _leave(" = {%x} [anon]", key_serial(cell->anonymous_key)); return key_get(cell->anonymous_key); } else { @@ -395,7 +428,7 @@ int afs_check_permit(struct afs_vnode *vnode, struct key *key, * - AFS ACLs are attached to directories only, and a file is controlled by its * parent directory's ACL */ -int afs_permission(struct user_namespace *mnt_userns, struct inode *inode, +int afs_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { struct afs_vnode *vnode = AFS_FS_I(inode); @@ -408,7 +441,7 @@ int afs_permission(struct user_namespace *mnt_userns, struct inode *inode, if (mask & MAY_NOT_BLOCK) { key = afs_request_key_rcu(vnode->volume->cell); - if (IS_ERR(key)) + if (IS_ERR_OR_NULL(key)) return -ECHILD; ret = -ECHILD; diff --git a/fs/afs/server.c b/fs/afs/server.c index b5237206eac3..c4428ebddb1d 100644 --- a/fs/afs/server.c +++ b/fs/afs/server.c @@ -14,210 +14,103 @@ static unsigned afs_server_gc_delay = 10; /* Server record timeout in seconds */ static atomic_t afs_server_debug_id; -static struct afs_server *afs_maybe_use_server(struct afs_server *, - enum afs_server_trace); static void __afs_put_server(struct afs_net *, struct afs_server *); +static void afs_server_timer(struct timer_list *timer); +static void afs_server_destroyer(struct work_struct *work); /* * Find a server by one of its addresses. */ -struct afs_server *afs_find_server(struct afs_net *net, - const struct sockaddr_rxrpc *srx) +struct afs_server *afs_find_server(const struct rxrpc_peer *peer) { - const struct afs_addr_list *alist; - struct afs_server *server = NULL; - unsigned int i; - int seq = 0, diff; + struct afs_server *server = (struct afs_server *)rxrpc_kernel_get_peer_data(peer); - rcu_read_lock(); - - do { - if (server) - afs_unuse_server_notime(net, server, afs_server_trace_put_find_rsq); - server = NULL; - read_seqbegin_or_lock(&net->fs_addr_lock, &seq); - - if (srx->transport.family == AF_INET6) { - const struct sockaddr_in6 *a = &srx->transport.sin6, *b; - hlist_for_each_entry_rcu(server, &net->fs_addresses6, addr6_link) { - alist = rcu_dereference(server->addresses); - for (i = alist->nr_ipv4; i < alist->nr_addrs; i++) { - b = &alist->addrs[i].transport.sin6; - diff = ((u16 __force)a->sin6_port - - (u16 __force)b->sin6_port); - if (diff == 0) - diff = memcmp(&a->sin6_addr, - &b->sin6_addr, - sizeof(struct in6_addr)); - if (diff == 0) - goto found; - } - } - } else { - const struct sockaddr_in *a = &srx->transport.sin, *b; - hlist_for_each_entry_rcu(server, &net->fs_addresses4, addr4_link) { - alist = rcu_dereference(server->addresses); - for (i = 0; i < alist->nr_ipv4; i++) { - b = &alist->addrs[i].transport.sin; - diff = ((u16 __force)a->sin_port - - (u16 __force)b->sin_port); - if (diff == 0) - diff = ((u32 __force)a->sin_addr.s_addr - - (u32 __force)b->sin_addr.s_addr); - if (diff == 0) - goto found; - } - } - } - - server = NULL; - continue; - found: - server = afs_maybe_use_server(server, afs_server_trace_get_by_addr); - - } while (need_seqretry(&net->fs_addr_lock, seq)); - - done_seqretry(&net->fs_addr_lock, seq); - - rcu_read_unlock(); - return server; + if (!server) + return NULL; + return afs_use_server(server, false, afs_server_trace_use_cm_call); } /* - * Look up a server by its UUID and mark it active. + * Look up a server by its UUID and mark it active. The caller must hold + * cell->fs_lock. */ -struct afs_server *afs_find_server_by_uuid(struct afs_net *net, const uuid_t *uuid) +static struct afs_server *afs_find_server_by_uuid(struct afs_cell *cell, const uuid_t *uuid) { - struct afs_server *server = NULL; + struct afs_server *server; struct rb_node *p; - int diff, seq = 0; + int diff; _enter("%pU", uuid); - do { - /* Unfortunately, rbtree walking doesn't give reliable results - * under just the RCU read lock, so we have to check for - * changes. - */ - if (server) - afs_unuse_server(net, server, afs_server_trace_put_uuid_rsq); - server = NULL; - - read_seqbegin_or_lock(&net->fs_lock, &seq); - - p = net->fs_servers.rb_node; - while (p) { - server = rb_entry(p, struct afs_server, uuid_rb); - - diff = memcmp(uuid, &server->uuid, sizeof(*uuid)); - if (diff < 0) { - p = p->rb_left; - } else if (diff > 0) { - p = p->rb_right; - } else { - afs_use_server(server, afs_server_trace_get_by_uuid); - break; - } + p = cell->fs_servers.rb_node; + while (p) { + server = rb_entry(p, struct afs_server, uuid_rb); - server = NULL; + diff = memcmp(uuid, &server->uuid, sizeof(*uuid)); + if (diff < 0) { + p = p->rb_left; + } else if (diff > 0) { + p = p->rb_right; + } else { + if (test_bit(AFS_SERVER_FL_UNCREATED, &server->flags)) + return NULL; /* Need a write lock */ + afs_use_server(server, true, afs_server_trace_use_by_uuid); + return server; } - } while (need_seqretry(&net->fs_lock, seq)); - - done_seqretry(&net->fs_lock, seq); + } - _leave(" = %p", server); - return server; + return NULL; } /* - * Install a server record in the namespace tree. If there's a clash, we stick - * it into a list anchored on whichever afs_server struct is actually in the - * tree. + * Install a server record in the cell tree. The caller must hold an exclusive + * lock on cell->fs_lock. */ static struct afs_server *afs_install_server(struct afs_cell *cell, - struct afs_server *candidate) + struct afs_server **candidate) { - const struct afs_addr_list *alist; - struct afs_server *server, *next; + struct afs_server *server; struct afs_net *net = cell->net; struct rb_node **pp, *p; int diff; _enter("%p", candidate); - write_seqlock(&net->fs_lock); - /* Firstly install the server in the UUID lookup tree */ - pp = &net->fs_servers.rb_node; + pp = &cell->fs_servers.rb_node; p = NULL; while (*pp) { p = *pp; _debug("- consider %p", p); server = rb_entry(p, struct afs_server, uuid_rb); - diff = memcmp(&candidate->uuid, &server->uuid, sizeof(uuid_t)); - if (diff < 0) { + diff = memcmp(&(*candidate)->uuid, &server->uuid, sizeof(uuid_t)); + if (diff < 0) pp = &(*pp)->rb_left; - } else if (diff > 0) { + else if (diff > 0) pp = &(*pp)->rb_right; - } else { - if (server->cell == cell) - goto exists; - - /* We have the same UUID representing servers in - * different cells. Append the new server to the list. - */ - for (;;) { - next = rcu_dereference_protected( - server->uuid_next, - lockdep_is_held(&net->fs_lock.lock)); - if (!next) - break; - server = next; - } - rcu_assign_pointer(server->uuid_next, candidate); - candidate->uuid_prev = server; - server = candidate; - goto added_dup; - } + else + goto exists; } - server = candidate; + server = *candidate; + *candidate = NULL; rb_link_node(&server->uuid_rb, p, pp); - rb_insert_color(&server->uuid_rb, &net->fs_servers); + rb_insert_color(&server->uuid_rb, &cell->fs_servers); + write_seqlock(&net->fs_lock); hlist_add_head_rcu(&server->proc_link, &net->fs_proc); + write_sequnlock(&net->fs_lock); -added_dup: - write_seqlock(&net->fs_addr_lock); - alist = rcu_dereference_protected(server->addresses, - lockdep_is_held(&net->fs_addr_lock.lock)); - - /* Secondly, if the server has any IPv4 and/or IPv6 addresses, install - * it in the IPv4 and/or IPv6 reverse-map lists. - * - * TODO: For speed we want to use something other than a flat list - * here; even sorting the list in terms of lowest address would help a - * bit, but anything we might want to do gets messy and memory - * intensive. - */ - if (alist->nr_ipv4 > 0) - hlist_add_head_rcu(&server->addr4_link, &net->fs_addresses4); - if (alist->nr_addrs > alist->nr_ipv4) - hlist_add_head_rcu(&server->addr6_link, &net->fs_addresses6); - - write_sequnlock(&net->fs_addr_lock); + afs_get_cell(cell, afs_cell_trace_get_server); exists: - afs_get_server(server, afs_server_trace_get_install); - write_sequnlock(&net->fs_lock); + afs_use_server(server, true, afs_server_trace_use_install); return server; } /* - * Allocate a new server record and mark it active. + * Allocate a new server record and mark it as active but uncreated. */ -static struct afs_server *afs_alloc_server(struct afs_cell *cell, - const uuid_t *uuid, - struct afs_addr_list *alist) +static struct afs_server *afs_alloc_server(struct afs_cell *cell, const uuid_t *uuid) { struct afs_server *server; struct afs_net *net = cell->net; @@ -226,49 +119,50 @@ static struct afs_server *afs_alloc_server(struct afs_cell *cell, server = kzalloc(sizeof(struct afs_server), GFP_KERNEL); if (!server) - goto enomem; + return NULL; refcount_set(&server->ref, 1); - atomic_set(&server->active, 1); + atomic_set(&server->active, 0); + __set_bit(AFS_SERVER_FL_UNCREATED, &server->flags); server->debug_id = atomic_inc_return(&afs_server_debug_id); - RCU_INIT_POINTER(server->addresses, alist); - server->addr_version = alist->version; server->uuid = *uuid; rwlock_init(&server->fs_lock); - INIT_WORK(&server->initcb_work, afs_server_init_callback_work); + INIT_WORK(&server->destroyer, &afs_server_destroyer); + timer_setup(&server->timer, afs_server_timer, 0); + INIT_LIST_HEAD(&server->volumes); init_waitqueue_head(&server->probe_wq); + mutex_init(&server->cm_token_lock); INIT_LIST_HEAD(&server->probe_link); + INIT_HLIST_NODE(&server->proc_link); spin_lock_init(&server->probe_lock); server->cell = cell; server->rtt = UINT_MAX; + server->service_id = FS_SERVICE; + server->probe_counter = 1; + server->probed_at = jiffies - LONG_MAX / 2; afs_inc_servers_outstanding(net); - trace_afs_server(server->debug_id, 1, 1, afs_server_trace_alloc); _leave(" = %p", server); return server; - -enomem: - _leave(" = NULL [nomem]"); - return NULL; } /* * Look up an address record for a server */ -static struct afs_addr_list *afs_vl_lookup_addrs(struct afs_cell *cell, - struct key *key, const uuid_t *uuid) +static struct afs_addr_list *afs_vl_lookup_addrs(struct afs_server *server, + struct key *key) { struct afs_vl_cursor vc; struct afs_addr_list *alist = NULL; int ret; ret = -ERESTARTSYS; - if (afs_begin_vlserver_operation(&vc, cell, key)) { + if (afs_begin_vlserver_operation(&vc, server->cell, key)) { while (afs_select_vlserver(&vc)) { if (test_bit(AFS_VLSERVER_FL_IS_YFS, &vc.server->flags)) - alist = afs_yfsvl_get_endpoints(&vc, uuid); + alist = afs_yfsvl_get_endpoints(&vc, &server->uuid); else - alist = afs_vl_get_addrs_u(&vc, uuid); + alist = afs_vl_get_addrs_u(&vc, &server->uuid); } ret = afs_end_vlserver_operation(&vc); @@ -278,72 +172,122 @@ static struct afs_addr_list *afs_vl_lookup_addrs(struct afs_cell *cell, } /* - * Get or create a fileserver record. + * Get or create a fileserver record and return it with an active-use count on + * it. */ struct afs_server *afs_lookup_server(struct afs_cell *cell, struct key *key, const uuid_t *uuid, u32 addr_version) { - struct afs_addr_list *alist; - struct afs_server *server, *candidate; + struct afs_addr_list *alist = NULL; + struct afs_server *server, *candidate = NULL; + bool creating = false; + int ret; _enter("%p,%pU", cell->net, uuid); - server = afs_find_server_by_uuid(cell->net, uuid); + down_read(&cell->fs_lock); + server = afs_find_server_by_uuid(cell, uuid); + /* Won't see servers marked uncreated. */ + up_read(&cell->fs_lock); + if (server) { + timer_delete_sync(&server->timer); + if (test_bit(AFS_SERVER_FL_CREATING, &server->flags)) + goto wait_for_creation; if (server->addr_version != addr_version) set_bit(AFS_SERVER_FL_NEEDS_UPDATE, &server->flags); return server; } - alist = afs_vl_lookup_addrs(cell, key, uuid); - if (IS_ERR(alist)) - return ERR_CAST(alist); - - candidate = afs_alloc_server(cell, uuid, alist); + candidate = afs_alloc_server(cell, uuid); if (!candidate) { - afs_put_addrlist(alist); + afs_put_addrlist(alist, afs_alist_trace_put_server_oom); return ERR_PTR(-ENOMEM); } - server = afs_install_server(cell, candidate); - if (server != candidate) { - afs_put_addrlist(alist); + down_write(&cell->fs_lock); + server = afs_install_server(cell, &candidate); + if (test_bit(AFS_SERVER_FL_CREATING, &server->flags)) { + /* We need to wait for creation to complete. */ + up_write(&cell->fs_lock); + goto wait_for_creation; + } + if (test_bit(AFS_SERVER_FL_UNCREATED, &server->flags)) { + set_bit(AFS_SERVER_FL_CREATING, &server->flags); + clear_bit(AFS_SERVER_FL_UNCREATED, &server->flags); + creating = true; + } + up_write(&cell->fs_lock); + timer_delete_sync(&server->timer); + + /* If we get to create the server, we look up the addresses and then + * immediately dispatch an asynchronous probe to each interface on the + * fileserver. This will make sure the repeat-probing service is + * started. + */ + if (creating) { + alist = afs_vl_lookup_addrs(server, key); + if (IS_ERR(alist)) { + ret = PTR_ERR(alist); + goto create_failed; + } + + ret = afs_fs_probe_fileserver(cell->net, server, alist, key); + if (ret) + goto create_failed; + + clear_and_wake_up_bit(AFS_SERVER_FL_CREATING, &server->flags); + } + +out: + afs_put_addrlist(alist, afs_alist_trace_put_server_create); + if (candidate) { + kfree(rcu_access_pointer(server->endpoint_state)); kfree(candidate); - } else { - /* Immediately dispatch an asynchronous probe to each interface - * on the fileserver. This will make sure the repeat-probing - * service is started. - */ - afs_fs_probe_fileserver(cell->net, server, key, true); + afs_dec_servers_outstanding(cell->net); + } + return server ?: ERR_PTR(ret); + +wait_for_creation: + afs_see_server(server, afs_server_trace_wait_create); + wait_on_bit(&server->flags, AFS_SERVER_FL_CREATING, TASK_UNINTERRUPTIBLE); + if (test_bit_acquire(AFS_SERVER_FL_UNCREATED, &server->flags)) { + /* Barrier: read flag before error */ + ret = READ_ONCE(server->create_error); + afs_put_server(cell->net, server, afs_server_trace_unuse_create_fail); + server = NULL; + goto out; } - return server; -} + ret = 0; + goto out; -/* - * Set the server timer to fire after a given delay, assuming it's not already - * set for an earlier time. - */ -static void afs_set_server_timer(struct afs_net *net, time64_t delay) -{ - if (net->live) { - afs_inc_servers_outstanding(net); - if (timer_reduce(&net->fs_timer, jiffies + delay * HZ)) - afs_dec_servers_outstanding(net); +create_failed: + down_write(&cell->fs_lock); + + WRITE_ONCE(server->create_error, ret); + smp_wmb(); /* Barrier: set error before flag. */ + set_bit(AFS_SERVER_FL_UNCREATED, &server->flags); + + clear_and_wake_up_bit(AFS_SERVER_FL_CREATING, &server->flags); + + if (test_bit(AFS_SERVER_FL_UNCREATED, &server->flags)) { + clear_bit(AFS_SERVER_FL_UNCREATED, &server->flags); + creating = true; } + afs_unuse_server(cell->net, server, afs_server_trace_unuse_create_fail); + server = NULL; + + up_write(&cell->fs_lock); + goto out; } /* - * Server management timer. We have an increment on fs_outstanding that we - * need to pass along to the work item. + * Set/reduce a server's timer. */ -void afs_servers_timer(struct timer_list *timer) +static void afs_set_server_timer(struct afs_server *server, unsigned int delay_secs) { - struct afs_net *net = container_of(timer, struct afs_net, fs_timer); - - _enter(""); - if (!queue_work(afs_wq, &net->fs_manager)) - afs_dec_servers_outstanding(net); + mod_timer(&server->timer, jiffies + delay_secs * HZ); } /* @@ -362,32 +306,20 @@ struct afs_server *afs_get_server(struct afs_server *server, } /* - * Try to get a reference on a server object. + * Get an active count on a server object and maybe remove from the inactive + * list. */ -static struct afs_server *afs_maybe_use_server(struct afs_server *server, - enum afs_server_trace reason) -{ - unsigned int a; - int r; - - if (!__refcount_inc_not_zero(&server->ref, &r)) - return NULL; - - a = atomic_inc_return(&server->active); - trace_afs_server(server->debug_id, r + 1, a, reason); - return server; -} - -/* - * Get an active count on a server object. - */ -struct afs_server *afs_use_server(struct afs_server *server, enum afs_server_trace reason) +struct afs_server *afs_use_server(struct afs_server *server, bool activate, + enum afs_server_trace reason) { unsigned int a; int r; __refcount_inc(&server->ref, &r); a = atomic_inc_return(&server->active); + if (a == 1 && activate && + !test_bit(AFS_SERVER_FL_EXPIRED, &server->flags)) + timer_delete(&server->timer); trace_afs_server(server->debug_id, r + 1, a, reason); return server; @@ -399,13 +331,14 @@ struct afs_server *afs_use_server(struct afs_server *server, enum afs_server_tra void afs_put_server(struct afs_net *net, struct afs_server *server, enum afs_server_trace reason) { - unsigned int a, debug_id = server->debug_id; + unsigned int a, debug_id; bool zero; int r; if (!server) return; + debug_id = server->debug_id; a = atomic_read(&server->active); zero = __refcount_dec_and_test(&server->ref, &r); trace_afs_server(debug_id, r - 1, a, reason); @@ -420,13 +353,16 @@ void afs_put_server(struct afs_net *net, struct afs_server *server, void afs_unuse_server_notime(struct afs_net *net, struct afs_server *server, enum afs_server_trace reason) { - if (server) { - unsigned int active = atomic_dec_return(&server->active); + if (!server) + return; - if (active == 0) - afs_set_server_timer(net, afs_server_gc_delay); - afs_put_server(net, server, reason); + if (atomic_dec_and_test(&server->active)) { + if (test_bit(AFS_SERVER_FL_EXPIRED, &server->flags) || + READ_ONCE(server->cell->state) >= AFS_CELL_REMOVING) + schedule_work(&server->destroyer); } + + afs_put_server(net, server, reason); } /* @@ -435,10 +371,22 @@ void afs_unuse_server_notime(struct afs_net *net, struct afs_server *server, void afs_unuse_server(struct afs_net *net, struct afs_server *server, enum afs_server_trace reason) { - if (server) { - server->unuse_time = ktime_get_real_seconds(); - afs_unuse_server_notime(net, server, reason); + if (!server) + return; + + if (atomic_dec_and_test(&server->active)) { + if (!test_bit(AFS_SERVER_FL_EXPIRED, &server->flags) && + READ_ONCE(server->cell->state) < AFS_CELL_REMOVING) { + time64_t unuse_time = ktime_get_real_seconds(); + + server->unuse_time = unuse_time; + afs_set_server_timer(server, afs_server_gc_delay); + } else { + schedule_work(&server->destroyer); + } } + + afs_put_server(net, server, reason); } static void afs_server_rcu(struct rcu_head *rcu) @@ -447,7 +395,10 @@ static void afs_server_rcu(struct rcu_head *rcu) trace_afs_server(server->debug_id, refcount_read(&server->ref), atomic_read(&server->active), afs_server_trace_free); - afs_put_addrlist(rcu_access_pointer(server->addresses)); + afs_put_endpoint_state(rcu_access_pointer(server->endpoint_state), + afs_estate_trace_put_server); + afs_put_cell(server->cell, afs_cell_trace_put_server); + kfree(server->cm_rxgk_appdata.data); kfree(server); } @@ -459,173 +410,126 @@ static void __afs_put_server(struct afs_net *net, struct afs_server *server) static void afs_give_up_callbacks(struct afs_net *net, struct afs_server *server) { - struct afs_addr_list *alist = rcu_access_pointer(server->addresses); - struct afs_addr_cursor ac = { - .alist = alist, - .index = alist->preferred, - .error = 0, - }; - - afs_fs_give_up_all_callbacks(net, server, &ac, NULL); + struct afs_endpoint_state *estate = rcu_access_pointer(server->endpoint_state); + struct afs_addr_list *alist = estate->addresses; + + afs_fs_give_up_all_callbacks(net, server, &alist->addrs[alist->preferred], NULL); } /* - * destroy a dead server + * Check to see if the server record has expired. */ -static void afs_destroy_server(struct afs_net *net, struct afs_server *server) +static bool afs_has_server_expired(const struct afs_server *server) { - if (test_bit(AFS_SERVER_FL_MAY_HAVE_CB, &server->flags)) - afs_give_up_callbacks(net, server); + time64_t expires_at; - flush_work(&server->initcb_work); - afs_put_server(net, server, afs_server_trace_destroy); + if (atomic_read(&server->active)) + return false; + + if (server->cell->net->live || + server->cell->state >= AFS_CELL_REMOVING) { + trace_afs_server(server->debug_id, refcount_read(&server->ref), + 0, afs_server_trace_purging); + return true; + } + + expires_at = server->unuse_time; + if (!test_bit(AFS_SERVER_FL_VL_FAIL, &server->flags) && + !test_bit(AFS_SERVER_FL_NOT_FOUND, &server->flags)) + expires_at += afs_server_gc_delay; + + return ktime_get_real_seconds() > expires_at; } /* - * Garbage collect any expired servers. + * Remove a server record from it's parent cell's database. */ -static void afs_gc_servers(struct afs_net *net, struct afs_server *gc_list) +static bool afs_remove_server_from_cell(struct afs_server *server) { - struct afs_server *server, *next, *prev; - int active; - - while ((server = gc_list)) { - gc_list = server->gc_next; - - write_seqlock(&net->fs_lock); - - active = atomic_read(&server->active); - if (active == 0) { - trace_afs_server(server->debug_id, refcount_read(&server->ref), - active, afs_server_trace_gc); - next = rcu_dereference_protected( - server->uuid_next, lockdep_is_held(&net->fs_lock.lock)); - prev = server->uuid_prev; - if (!prev) { - /* The one at the front is in the tree */ - if (!next) { - rb_erase(&server->uuid_rb, &net->fs_servers); - } else { - rb_replace_node_rcu(&server->uuid_rb, - &next->uuid_rb, - &net->fs_servers); - next->uuid_prev = NULL; - } - } else { - /* This server is not at the front */ - rcu_assign_pointer(prev->uuid_next, next); - if (next) - next->uuid_prev = prev; - } - - list_del(&server->probe_link); - hlist_del_rcu(&server->proc_link); - if (!hlist_unhashed(&server->addr4_link)) - hlist_del_rcu(&server->addr4_link); - if (!hlist_unhashed(&server->addr6_link)) - hlist_del_rcu(&server->addr6_link); - } - write_sequnlock(&net->fs_lock); + struct afs_cell *cell = server->cell; + + down_write(&cell->fs_lock); - if (active == 0) - afs_destroy_server(net, server); + if (!afs_has_server_expired(server)) { + up_write(&cell->fs_lock); + return false; } + + set_bit(AFS_SERVER_FL_EXPIRED, &server->flags); + _debug("expire %pU %u", &server->uuid, atomic_read(&server->active)); + afs_see_server(server, afs_server_trace_see_expired); + rb_erase(&server->uuid_rb, &cell->fs_servers); + up_write(&cell->fs_lock); + return true; } -/* - * Manage the records of servers known to be within a network namespace. This - * includes garbage collecting unused servers. - * - * Note also that we were given an increment on net->servers_outstanding by - * whoever queued us that we need to deal with before returning. - */ -void afs_manage_servers(struct work_struct *work) +static void afs_server_destroyer(struct work_struct *work) { - struct afs_net *net = container_of(work, struct afs_net, fs_manager); - struct afs_server *gc_list = NULL; - struct rb_node *cursor; - time64_t now = ktime_get_real_seconds(), next_manage = TIME64_MAX; - bool purging = !net->live; + struct afs_endpoint_state *estate; + struct afs_server *server = container_of(work, struct afs_server, destroyer); + struct afs_net *net = server->cell->net; - _enter(""); + afs_see_server(server, afs_server_trace_see_destroyer); - /* Trawl the server list looking for servers that have expired from - * lack of use. - */ - read_seqlock_excl(&net->fs_lock); - - for (cursor = rb_first(&net->fs_servers); cursor; cursor = rb_next(cursor)) { - struct afs_server *server = - rb_entry(cursor, struct afs_server, uuid_rb); - int active = atomic_read(&server->active); + if (test_bit(AFS_SERVER_FL_EXPIRED, &server->flags)) + return; - _debug("manage %pU %u", &server->uuid, active); + if (!afs_remove_server_from_cell(server)) + return; - if (purging) { - trace_afs_server(server->debug_id, refcount_read(&server->ref), - active, afs_server_trace_purging); - if (active != 0) - pr_notice("Can't purge s=%08x\n", server->debug_id); - } + timer_shutdown_sync(&server->timer); + cancel_work(&server->destroyer); - if (active == 0) { - time64_t expire_at = server->unuse_time; - - if (!test_bit(AFS_SERVER_FL_VL_FAIL, &server->flags) && - !test_bit(AFS_SERVER_FL_NOT_FOUND, &server->flags)) - expire_at += afs_server_gc_delay; - if (purging || expire_at <= now) { - server->gc_next = gc_list; - gc_list = server; - } else if (expire_at < next_manage) { - next_manage = expire_at; - } - } - } + if (test_bit(AFS_SERVER_FL_MAY_HAVE_CB, &server->flags)) + afs_give_up_callbacks(net, server); - read_sequnlock_excl(&net->fs_lock); + /* Unbind the rxrpc_peer records from the server. */ + estate = rcu_access_pointer(server->endpoint_state); + if (estate) + afs_set_peer_appdata(server, estate->addresses, NULL); - /* Update the timer on the way out. We have to pass an increment on - * servers_outstanding in the namespace that we are in to the timer or - * the work scheduler. - */ - if (!purging && next_manage < TIME64_MAX) { - now = ktime_get_real_seconds(); + write_seqlock(&net->fs_lock); + list_del_init(&server->probe_link); + if (!hlist_unhashed(&server->proc_link)) + hlist_del_rcu(&server->proc_link); + write_sequnlock(&net->fs_lock); - if (next_manage - now <= 0) { - if (queue_work(afs_wq, &net->fs_manager)) - afs_inc_servers_outstanding(net); - } else { - afs_set_server_timer(net, next_manage - now); - } - } + afs_put_server(net, server, afs_server_trace_destroy); +} - afs_gc_servers(net, gc_list); +static void afs_server_timer(struct timer_list *timer) +{ + struct afs_server *server = container_of(timer, struct afs_server, timer); - afs_dec_servers_outstanding(net); - _leave(" [%d]", atomic_read(&net->servers_outstanding)); + afs_see_server(server, afs_server_trace_see_timer); + if (!test_bit(AFS_SERVER_FL_EXPIRED, &server->flags)) + schedule_work(&server->destroyer); } -static void afs_queue_server_manager(struct afs_net *net) +/* + * Wake up all the servers in a cell so that they can purge themselves. + */ +void afs_purge_servers(struct afs_cell *cell) { - afs_inc_servers_outstanding(net); - if (!queue_work(afs_wq, &net->fs_manager)) - afs_dec_servers_outstanding(net); + struct afs_server *server; + struct rb_node *rb; + + down_read(&cell->fs_lock); + for (rb = rb_first(&cell->fs_servers); rb; rb = rb_next(rb)) { + server = rb_entry(rb, struct afs_server, uuid_rb); + afs_see_server(server, afs_server_trace_see_purge); + schedule_work(&server->destroyer); + } + up_read(&cell->fs_lock); } /* - * Purge list of servers. + * Wait for outstanding servers. */ -void afs_purge_servers(struct afs_net *net) +void afs_wait_for_servers(struct afs_net *net) { _enter(""); - if (del_timer_sync(&net->fs_timer)) - afs_dec_servers_outstanding(net); - - afs_queue_server_manager(net); - - _debug("wait"); atomic_dec(&net->servers_outstanding); wait_var_event(&net->servers_outstanding, !atomic_read(&net->servers_outstanding)); @@ -636,9 +540,12 @@ void afs_purge_servers(struct afs_net *net) * Get an update for a server's address list. */ static noinline bool afs_update_server_record(struct afs_operation *op, - struct afs_server *server) + struct afs_server *server, + struct key *key) { - struct afs_addr_list *alist, *discard; + struct afs_endpoint_state *estate; + struct afs_addr_list *alist; + bool has_addrs; _enter(""); @@ -646,31 +553,29 @@ static noinline bool afs_update_server_record(struct afs_operation *op, atomic_read(&server->active), afs_server_trace_update); - alist = afs_vl_lookup_addrs(op->volume->cell, op->key, &server->uuid); + alist = afs_vl_lookup_addrs(server, op->key); if (IS_ERR(alist)) { + rcu_read_lock(); + estate = rcu_dereference(server->endpoint_state); + has_addrs = estate->addresses; + rcu_read_unlock(); + if ((PTR_ERR(alist) == -ERESTARTSYS || PTR_ERR(alist) == -EINTR) && (op->flags & AFS_OPERATION_UNINTR) && - server->addresses) { + has_addrs) { _leave(" = t [intr]"); return true; } - op->error = PTR_ERR(alist); - _leave(" = f [%d]", op->error); + afs_op_set_error(op, PTR_ERR(alist)); + _leave(" = f [%d]", afs_op_error(op)); return false; } - discard = alist; - if (server->addr_version != alist->version) { - write_lock(&server->fs_lock); - discard = rcu_dereference_protected(server->addresses, - lockdep_is_held(&server->fs_lock)); - rcu_assign_pointer(server->addresses, alist); - server->addr_version = alist->version; - write_unlock(&server->fs_lock); - } + if (server->addr_version != alist->version) + afs_fs_probe_fileserver(op->net, server, alist, key); - afs_put_addrlist(discard); + afs_put_addrlist(alist, afs_alist_trace_put_server_update); _leave(" = t"); return true; } @@ -678,7 +583,8 @@ static noinline bool afs_update_server_record(struct afs_operation *op, /* * See if a server's address list needs updating. */ -bool afs_check_server_record(struct afs_operation *op, struct afs_server *server) +bool afs_check_server_record(struct afs_operation *op, struct afs_server *server, + struct key *key) { bool success; int ret, retries = 0; @@ -698,7 +604,7 @@ retry: update: if (!test_and_set_bit_lock(AFS_SERVER_FL_UPDATING, &server->flags)) { clear_bit(AFS_SERVER_FL_NEEDS_UPDATE, &server->flags); - success = afs_update_server_record(op, server); + success = afs_update_server_record(op, server, key); clear_bit_unlock(AFS_SERVER_FL_UPDATING, &server->flags); wake_up_bit(&server->flags, AFS_SERVER_FL_UPDATING); _leave(" = %d", success); @@ -710,7 +616,7 @@ wait: (op->flags & AFS_OPERATION_UNINTR) ? TASK_UNINTERRUPTIBLE : TASK_INTERRUPTIBLE); if (ret == -ERESTARTSYS) { - op->error = ret; + afs_op_set_error(op, ret); _leave(" = f [intr]"); return false; } diff --git a/fs/afs/server_list.c b/fs/afs/server_list.c index ed9056703505..20d5474837df 100644 --- a/fs/afs/server_list.c +++ b/fs/afs/server_list.c @@ -16,43 +16,70 @@ void afs_put_serverlist(struct afs_net *net, struct afs_server_list *slist) if (slist && refcount_dec_and_test(&slist->usage)) { for (i = 0; i < slist->nr_servers; i++) afs_unuse_server(net, slist->servers[i].server, - afs_server_trace_put_slist); - kfree(slist); + afs_server_trace_unuse_slist); + kfree_rcu(slist, rcu); } } /* * Build a server list from a VLDB record. */ -struct afs_server_list *afs_alloc_server_list(struct afs_cell *cell, +struct afs_server_list *afs_alloc_server_list(struct afs_volume *volume, struct key *key, - struct afs_vldb_entry *vldb, - u8 type_mask) + struct afs_vldb_entry *vldb) { struct afs_server_list *slist; struct afs_server *server; - int ret = -ENOMEM, nr_servers = 0, i, j; - - for (i = 0; i < vldb->nr_servers; i++) - if (vldb->fs_mask[i] & type_mask) - nr_servers++; + unsigned int type_mask = 1 << volume->type; + bool use_newrepsites = false; + int ret = -ENOMEM, nr_servers = 0, newrep = 0, i, j, usable = 0; + + /* Work out if we're going to restrict to NEWREPSITE-marked servers or + * not. If at least one site is marked as NEWREPSITE, then it's likely + * that "vos release" is busy updating RO sites. We cut over from one + * to the other when >=50% of the sites have been updated. Sites that + * are in the process of being updated are marked DONTUSE. + */ + for (i = 0; i < vldb->nr_servers; i++) { + if (!(vldb->fs_mask[i] & type_mask)) + continue; + nr_servers++; + if (vldb->vlsf_flags[i] & AFS_VLSF_DONTUSE) + continue; + usable++; + if (vldb->vlsf_flags[i] & AFS_VLSF_NEWREPSITE) + newrep++; + } slist = kzalloc(struct_size(slist, servers, nr_servers), GFP_KERNEL); if (!slist) goto error; + if (newrep) { + if (newrep < usable / 2) { + slist->ro_replicating = AFS_RO_REPLICATING_USE_OLD; + } else { + slist->ro_replicating = AFS_RO_REPLICATING_USE_NEW; + use_newrepsites = true; + } + } + refcount_set(&slist->usage, 1); rwlock_init(&slist->lock); - for (i = 0; i < AFS_MAXTYPES; i++) - slist->vids[i] = vldb->vid[i]; - /* Make sure a records exists for each server in the list. */ for (i = 0; i < vldb->nr_servers; i++) { + unsigned long se_flags = 0; + bool newrepsite = vldb->vlsf_flags[i] & AFS_VLSF_NEWREPSITE; + if (!(vldb->fs_mask[i] & type_mask)) continue; + if (vldb->vlsf_flags[i] & AFS_VLSF_DONTUSE) + __set_bit(AFS_SE_EXCLUDED, &se_flags); + if (newrep && (newrepsite ^ use_newrepsites)) + __set_bit(AFS_SE_EXCLUDED, &se_flags); - server = afs_lookup_server(cell, key, &vldb->fs_server[i], + server = afs_lookup_server(volume->cell, key, &vldb->fs_server[i], vldb->addr_version[i]); if (IS_ERR(server)) { ret = PTR_ERR(server); @@ -70,8 +97,8 @@ struct afs_server_list *afs_alloc_server_list(struct afs_cell *cell, break; if (j < slist->nr_servers) { if (slist->servers[j].server == server) { - afs_put_server(cell->net, server, - afs_server_trace_put_slist_isort); + afs_unuse_server_notime(volume->cell->net, server, + afs_server_trace_unuse_slist_isort); continue; } @@ -81,6 +108,9 @@ struct afs_server_list *afs_alloc_server_list(struct afs_cell *cell, } slist->servers[j].server = server; + slist->servers[j].volume = volume; + slist->servers[j].flags = se_flags; + slist->servers[j].cb_expires_at = AFS_NO_CB_PROMISE; slist->nr_servers++; } @@ -92,7 +122,7 @@ struct afs_server_list *afs_alloc_server_list(struct afs_cell *cell, return slist; error_2: - afs_put_serverlist(cell->net, slist); + afs_put_serverlist(volume->cell->net, slist); error: return ERR_PTR(ret); } @@ -103,27 +133,117 @@ error: bool afs_annotate_server_list(struct afs_server_list *new, struct afs_server_list *old) { - struct afs_server *cur; - int i, j; + unsigned long mask = 1UL << AFS_SE_EXCLUDED; + int i; - if (old->nr_servers != new->nr_servers) + if (old->nr_servers != new->nr_servers || + old->ro_replicating != new->ro_replicating) goto changed; - for (i = 0; i < old->nr_servers; i++) + for (i = 0; i < old->nr_servers; i++) { if (old->servers[i].server != new->servers[i].server) goto changed; - + if ((old->servers[i].flags & mask) != (new->servers[i].flags & mask)) + goto changed; + } return false; - changed: - /* Maintain the same preferred server as before if possible. */ - cur = old->servers[old->preferred].server; - for (j = 0; j < new->nr_servers; j++) { - if (new->servers[j].server == cur) { - new->preferred = j; - break; + return true; +} + +/* + * Attach a volume to the servers it is going to use. + */ +void afs_attach_volume_to_servers(struct afs_volume *volume, struct afs_server_list *slist) +{ + struct afs_server_entry *se, *pe; + struct afs_server *server; + struct list_head *p; + unsigned int i; + + down_write(&volume->cell->vs_lock); + + for (i = 0; i < slist->nr_servers; i++) { + se = &slist->servers[i]; + server = se->server; + + list_for_each(p, &server->volumes) { + pe = list_entry(p, struct afs_server_entry, slink); + if (volume->vid <= pe->volume->vid) + break; } + list_add_tail(&se->slink, p); } - return true; + slist->attached = true; + up_write(&volume->cell->vs_lock); +} + +/* + * Reattach a volume to the servers it is going to use when server list is + * replaced. We try to switch the attachment points to avoid rewalking the + * lists. + */ +void afs_reattach_volume_to_servers(struct afs_volume *volume, struct afs_server_list *new, + struct afs_server_list *old) +{ + unsigned int n = 0, o = 0; + + down_write(&volume->cell->vs_lock); + + while (n < new->nr_servers || o < old->nr_servers) { + struct afs_server_entry *pn = n < new->nr_servers ? &new->servers[n] : NULL; + struct afs_server_entry *po = o < old->nr_servers ? &old->servers[o] : NULL; + struct afs_server_entry *s; + struct list_head *p; + int diff; + + if (pn && po && pn->server == po->server) { + pn->cb_expires_at = po->cb_expires_at; + list_replace(&po->slink, &pn->slink); + n++; + o++; + continue; + } + + if (pn && po) + diff = memcmp(&pn->server->uuid, &po->server->uuid, + sizeof(pn->server->uuid)); + else + diff = pn ? -1 : 1; + + if (diff < 0) { + list_for_each(p, &pn->server->volumes) { + s = list_entry(p, struct afs_server_entry, slink); + if (volume->vid <= s->volume->vid) + break; + } + list_add_tail(&pn->slink, p); + n++; + } else { + list_del(&po->slink); + o++; + } + } + + up_write(&volume->cell->vs_lock); +} + +/* + * Detach a volume from the servers it has been using. + */ +void afs_detach_volume_from_servers(struct afs_volume *volume, struct afs_server_list *slist) +{ + unsigned int i; + + if (!slist->attached) + return; + + down_write(&volume->cell->vs_lock); + + for (i = 0; i < slist->nr_servers; i++) + list_del(&slist->servers[i].slink); + + slist->attached = false; + up_write(&volume->cell->vs_lock); } diff --git a/fs/afs/super.c b/fs/afs/super.c index 95d713074dc8..d672b7ab57ae 100644 --- a/fs/afs/super.c +++ b/fs/afs/super.c @@ -55,7 +55,7 @@ int afs_net_id; static const struct super_operations afs_super_ops = { .statfs = afs_statfs, .alloc_inode = afs_alloc_inode, - .write_inode = afs_write_inode, + .write_inode = netfs_unpin_writeback, .drop_inode = afs_drop_inode, .destroy_inode = afs_destroy_inode, .free_inode = afs_free_inode, @@ -194,8 +194,6 @@ static int afs_show_options(struct seq_file *m, struct dentry *root) if (as->dyn_root) seq_puts(m, ",dyn"); - if (test_bit(AFS_VNODE_AUTOCELL, &AFS_FS_I(d_inode(root))->flags)) - seq_puts(m, ",autocell"); switch (as->flock_mode) { case afs_flock_mode_unset: break; case afs_flock_mode_local: p = "local"; break; @@ -292,13 +290,14 @@ static int afs_parse_source(struct fs_context *fc, struct fs_parameter *param) /* lookup the cell record */ if (cellname) { cell = afs_lookup_cell(ctx->net, cellname, cellnamesz, - NULL, false); + NULL, AFS_LOOKUP_CELL_DIRECT_MOUNT, + afs_cell_trace_use_lookup_mount); if (IS_ERR(cell)) { pr_err("kAFS: unable to lookup cell '%*.*s'\n", cellnamesz, cellnamesz, cellname ?: ""); return PTR_ERR(cell); } - afs_unuse_cell(ctx->net, ctx->cell, afs_cell_trace_unuse_parse); + afs_unuse_cell(ctx->cell, afs_cell_trace_unuse_parse); afs_see_cell(cell, afs_cell_trace_see_source); ctx->cell = cell; } @@ -381,8 +380,7 @@ static int afs_validate_fc(struct fs_context *fc) ctx->key = key; if (ctx->volume) { - afs_put_volume(ctx->net, ctx->volume, - afs_volume_trace_put_validate_fc); + afs_put_volume(ctx->volume, afs_volume_trace_put_validate_fc); ctx->volume = NULL; } @@ -396,7 +394,7 @@ static int afs_validate_fc(struct fs_context *fc) ctx->key = NULL; cell = afs_use_cell(ctx->cell->alias_of, afs_cell_trace_use_fc_alias); - afs_unuse_cell(ctx->net, ctx->cell, afs_cell_trace_unuse_fc); + afs_unuse_cell(ctx->cell, afs_cell_trace_unuse_fc); ctx->cell = cell; goto reget_key; } @@ -407,6 +405,10 @@ static int afs_validate_fc(struct fs_context *fc) return PTR_ERR(volume); ctx->volume = volume; + if (volume->type != AFSVL_RWVOL) { + ctx->flock_mode = afs_flock_mode_local; + fc->sb_flags |= SB_RDONLY; + } } return 0; @@ -465,7 +467,7 @@ static int afs_fill_super(struct super_block *sb, struct afs_fs_context *ctx) /* allocate the root inode and dentry */ if (as->dyn_root) { - inode = afs_iget_pseudo_dir(sb, true); + inode = afs_dynroot_iget_root(sb); } else { sprintf(sb->s_id, "%llu", as->volume->vid); afs_activate_volume(as->volume); @@ -475,21 +477,15 @@ static int afs_fill_super(struct super_block *sb, struct afs_fs_context *ctx) if (IS_ERR(inode)) return PTR_ERR(inode); - if (ctx->autocell || as->dyn_root) - set_bit(AFS_VNODE_AUTOCELL, &AFS_FS_I(inode)->flags); - ret = -ENOMEM; sb->s_root = d_make_root(inode); if (!sb->s_root) goto error; if (as->dyn_root) { - sb->s_d_op = &afs_dynroot_dentry_operations; - ret = afs_dynroot_populate(sb); - if (ret < 0) - goto error; + set_default_d_op(sb, &afs_dynroot_dentry_operations); } else { - sb->s_d_op = &afs_fs_dentry_operations; + set_default_d_op(sb, &afs_fs_dentry_operations); rcu_assign_pointer(as->volume->sb, sb); } @@ -524,9 +520,8 @@ static struct afs_super_info *afs_alloc_sbi(struct fs_context *fc) static void afs_destroy_sbi(struct afs_super_info *as) { if (as) { - struct afs_net *net = afs_net(as->net_ns); - afs_put_volume(net, as->volume, afs_volume_trace_put_destroy_sbi); - afs_unuse_cell(net, as->cell, afs_cell_trace_unuse_sbi); + afs_put_volume(as->volume, afs_volume_trace_put_destroy_sbi); + afs_unuse_cell(as->cell, afs_cell_trace_unuse_sbi); put_net(as->net_ns); kfree(as); } @@ -536,9 +531,6 @@ static void afs_kill_super(struct super_block *sb) { struct afs_super_info *as = AFS_FS_S(sb); - if (as->dyn_root) - afs_dynroot_depopulate(sb); - /* Clear the callback interests (which will do ilookup5) before * deactivating the superblock. */ @@ -611,8 +603,8 @@ static void afs_free_fc(struct fs_context *fc) struct afs_fs_context *ctx = fc->fs_private; afs_destroy_sbi(fc->s_fs_info); - afs_put_volume(ctx->net, ctx->volume, afs_volume_trace_put_free_fc); - afs_unuse_cell(ctx->net, ctx->cell, afs_cell_trace_unuse_fc); + afs_put_volume(ctx->volume, afs_volume_trace_put_free_fc); + afs_unuse_cell(ctx->cell, afs_cell_trace_unuse_fc); key_put(ctx->key); kfree(ctx); } @@ -660,7 +652,7 @@ static void afs_i_init_once(void *_vnode) memset(vnode, 0, sizeof(*vnode)); inode_init_once(&vnode->netfs.inode); - mutex_init(&vnode->io_lock); + INIT_LIST_HEAD(&vnode->io_lock_waiters); init_rwsem(&vnode->validate_lock); spin_lock_init(&vnode->wb_lock); spin_lock_init(&vnode->lock); @@ -693,6 +685,8 @@ static struct inode *afs_alloc_inode(struct super_block *sb) vnode->volume = NULL; vnode->lock_key = NULL; vnode->permit_cache = NULL; + vnode->directory = NULL; + vnode->directory_size = 0; vnode->flags = 1 << AFS_VNODE_UNSET; vnode->lock_state = AFS_VNODE_LOCK_NONE; diff --git a/fs/afs/validation.c b/fs/afs/validation.c new file mode 100644 index 000000000000..0ba8336c9025 --- /dev/null +++ b/fs/afs/validation.c @@ -0,0 +1,484 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* vnode and volume validity verification. + * + * Copyright (C) 2023 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/sched.h> +#include "internal.h" + +/* + * Data validation is managed through a number of mechanisms from the server: + * + * (1) On first contact with a server (such as if it has just been rebooted), + * the server sends us a CB.InitCallBackState* request. + * + * (2) On a RW volume, in response to certain vnode (inode)-accessing RPC + * calls, the server maintains a time-limited per-vnode promise that it + * will send us a CB.CallBack request if a third party alters the vnodes + * accessed. + * + * Note that a vnode-level callbacks may also be sent for other reasons, + * such as filelock release. + * + * (3) On a RO (or Backup) volume, in response to certain vnode-accessing RPC + * calls, each server maintains a time-limited per-volume promise that it + * will send us a CB.CallBack request if the RO volume is updated to a + * snapshot of the RW volume ("vos release"). This is an atomic event + * that cuts over all instances of the RO volume across multiple servers + * simultaneously. + * + * Note that a volume-level callbacks may also be sent for other reasons, + * such as the volumeserver taking over control of the volume from the + * fileserver. + * + * Note also that each server maintains an independent time limit on an + * independent callback. + * + * (4) Certain RPC calls include a volume information record "VolSync" in + * their reply. This contains a creation date for the volume that should + * remain unchanged for a RW volume (but will be changed if the volume is + * restored from backup) or will be bumped to the time of snapshotting + * when a RO volume is released. + * + * In order to track this events, the following are provided: + * + * ->cb_v_break. A counter of events that might mean that the contents of + * a volume have been altered since we last checked a vnode. + * + * ->cb_v_check. A counter of the number of events that we've sent a + * query to the server for. Everything's up to date if this equals + * cb_v_break. + * + * ->cb_scrub. A counter of the number of regression events for which we + * have to completely wipe the cache. + * + * ->cb_ro_snapshot. A counter of the number of times that we've + * recognised that a RO volume has been updated. + * + * ->cb_break. A counter of events that might mean that the contents of a + * vnode have been altered. + * + * ->cb_expires_at. The time at which the callback promise expires or + * AFS_NO_CB_PROMISE if we have no promise. + * + * The way we manage things is: + * + * (1) When a volume-level CB.CallBack occurs, we increment ->cb_v_break on + * the volume and reset ->cb_expires_at (ie. set AFS_NO_CB_PROMISE) on the + * volume and volume's server record. + * + * (2) When a CB.InitCallBackState occurs, we treat this as a volume-level + * callback break on all the volumes that have been using that volume + * (ie. increment ->cb_v_break and reset ->cb_expires_at). + * + * (3) When a vnode-level CB.CallBack occurs, we increment ->cb_break on the + * vnode and reset its ->cb_expires_at. If the vnode is mmapped, we also + * dispatch a work item to unmap all PTEs to the vnode's pagecache to + * force reentry to the filesystem for revalidation. + * + * (4) When entering the filesystem, we call afs_validate() to check the + * validity of a vnode. This first checks to see if ->cb_v_check and + * ->cb_v_break match, and if they don't, we lock volume->cb_check_lock + * exclusively and perform an FS.FetchStatus on the vnode. + * + * After checking the volume, we check the vnode. If there's a mismatch + * between the volume counters and the vnode's mirrors of those counters, + * we lock vnode->validate_lock and issue an FS.FetchStatus on the vnode. + * + * (5) When the reply from FS.FetchStatus arrives, the VolSync record is + * parsed: + * + * (A) If the Creation timestamp has changed on a RW volume or regressed + * on a RO volume, we try to increment ->cb_scrub; if it advances on a + * RO volume, we assume "vos release" happened and try to increment + * ->cb_ro_snapshot. + * + * (B) If the Update timestamp has regressed, we try to increment + * ->cb_scrub. + * + * Note that in both of these cases, we only do the increment if we can + * cmpxchg the value of the timestamp from the value we noted before the + * op. This tries to prevent parallel ops from fighting one another. + * + * volume->cb_v_check is then set to ->cb_v_break. + * + * (6) The AFSCallBack record included in the FS.FetchStatus reply is also + * parsed and used to set the promise in ->cb_expires_at for the vnode, + * the volume and the volume's server record. + * + * (7) If ->cb_scrub is seen to have advanced, we invalidate the pagecache for + * the vnode. + */ + +/* + * Check the validity of a vnode/inode and its parent volume. + */ +bool afs_check_validity(const struct afs_vnode *vnode) +{ + const struct afs_volume *volume = vnode->volume; + enum afs_vnode_invalid_trace trace = afs_vnode_valid_trace; + time64_t cb_expires_at = atomic64_read(&vnode->cb_expires_at); + time64_t deadline = ktime_get_real_seconds() + 10; + + if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) + return true; + + if (atomic_read(&volume->cb_v_check) != atomic_read(&volume->cb_v_break)) + trace = afs_vnode_invalid_trace_cb_v_break; + else if (cb_expires_at == AFS_NO_CB_PROMISE) + trace = afs_vnode_invalid_trace_no_cb_promise; + else if (cb_expires_at <= deadline) + trace = afs_vnode_invalid_trace_expired; + else if (volume->cb_expires_at <= deadline) + trace = afs_vnode_invalid_trace_vol_expired; + else if (vnode->cb_ro_snapshot != atomic_read(&volume->cb_ro_snapshot)) + trace = afs_vnode_invalid_trace_cb_ro_snapshot; + else if (vnode->cb_scrub != atomic_read(&volume->cb_scrub)) + trace = afs_vnode_invalid_trace_cb_scrub; + else if (test_bit(AFS_VNODE_ZAP_DATA, &vnode->flags)) + trace = afs_vnode_invalid_trace_zap_data; + else + return true; + trace_afs_vnode_invalid(vnode, trace); + return false; +} + +/* + * See if the server we've just talked to is currently excluded. + */ +static bool __afs_is_server_excluded(struct afs_operation *op, struct afs_volume *volume) +{ + const struct afs_server_entry *se; + const struct afs_server_list *slist; + bool is_excluded = true; + int i; + + rcu_read_lock(); + + slist = rcu_dereference(volume->servers); + for (i = 0; i < slist->nr_servers; i++) { + se = &slist->servers[i]; + if (op->server == se->server) { + is_excluded = test_bit(AFS_SE_EXCLUDED, &se->flags); + break; + } + } + + rcu_read_unlock(); + return is_excluded; +} + +/* + * Update the volume's server list when the creation time changes and see if + * the server we've just talked to is currently excluded. + */ +static int afs_is_server_excluded(struct afs_operation *op, struct afs_volume *volume) +{ + int ret; + + if (__afs_is_server_excluded(op, volume)) + return 1; + + set_bit(AFS_VOLUME_NEEDS_UPDATE, &volume->flags); + ret = afs_check_volume_status(op->volume, op); + if (ret < 0) + return ret; + + return __afs_is_server_excluded(op, volume); +} + +/* + * Handle a change to the volume creation time in the VolSync record. + */ +static int afs_update_volume_creation_time(struct afs_operation *op, struct afs_volume *volume) +{ + unsigned int snap; + time64_t cur = volume->creation_time; + time64_t old = op->pre_volsync.creation; + time64_t new = op->volsync.creation; + int ret; + + _enter("%llx,%llx,%llx->%llx", volume->vid, cur, old, new); + + if (cur == TIME64_MIN) { + volume->creation_time = new; + return 0; + } + + if (new == cur) + return 0; + + /* Try to advance the creation timestamp from what we had before the + * operation to what we got back from the server. This should + * hopefully ensure that in a race between multiple operations only one + * of them will do this. + */ + if (cur != old) + return 0; + + /* If the creation time changes in an unexpected way, we need to scrub + * our caches. For a RW vol, this will only change if the volume is + * restored from a backup; for a RO/Backup vol, this will advance when + * the volume is updated to a new snapshot (eg. "vos release"). + */ + if (volume->type == AFSVL_RWVOL) + goto regressed; + if (volume->type == AFSVL_BACKVOL) { + if (new < old) + goto regressed; + goto advance; + } + + /* We have an RO volume, we need to query the VL server and look at the + * server flags to see if RW->RO replication is in progress. + */ + ret = afs_is_server_excluded(op, volume); + if (ret < 0) + return ret; + if (ret > 0) { + snap = atomic_read(&volume->cb_ro_snapshot); + trace_afs_cb_v_break(volume->vid, snap, afs_cb_break_volume_excluded); + return ret; + } + +advance: + snap = atomic_inc_return(&volume->cb_ro_snapshot); + trace_afs_cb_v_break(volume->vid, snap, afs_cb_break_for_vos_release); + volume->creation_time = new; + return 0; + +regressed: + atomic_inc(&volume->cb_scrub); + trace_afs_cb_v_break(volume->vid, 0, afs_cb_break_for_creation_regress); + volume->creation_time = new; + return 0; +} + +/* + * Handle a change to the volume update time in the VolSync record. + */ +static void afs_update_volume_update_time(struct afs_operation *op, struct afs_volume *volume) +{ + enum afs_cb_break_reason reason = afs_cb_break_no_break; + time64_t cur = volume->update_time; + time64_t old = op->pre_volsync.update; + time64_t new = op->volsync.update; + + _enter("%llx,%llx,%llx->%llx", volume->vid, cur, old, new); + + if (cur == TIME64_MIN) { + volume->update_time = new; + return; + } + + if (new == cur) + return; + + /* If the volume update time changes in an unexpected way, we need to + * scrub our caches. For a RW vol, this will advance on every + * modification op; for a RO/Backup vol, this will advance when the + * volume is updated to a new snapshot (eg. "vos release"). + */ + if (new < old) + reason = afs_cb_break_for_update_regress; + + /* Try to advance the update timestamp from what we had before the + * operation to what we got back from the server. This should + * hopefully ensure that in a race between multiple operations only one + * of them will do this. + */ + if (cur == old) { + if (reason == afs_cb_break_for_update_regress) { + atomic_inc(&volume->cb_scrub); + trace_afs_cb_v_break(volume->vid, 0, reason); + } + volume->update_time = new; + } +} + +static int afs_update_volume_times(struct afs_operation *op, struct afs_volume *volume) +{ + int ret = 0; + + if (likely(op->volsync.creation == volume->creation_time && + op->volsync.update == volume->update_time)) + return 0; + + mutex_lock(&volume->volsync_lock); + if (op->volsync.creation != volume->creation_time) { + ret = afs_update_volume_creation_time(op, volume); + if (ret < 0) + goto out; + } + if (op->volsync.update != volume->update_time) + afs_update_volume_update_time(op, volume); +out: + mutex_unlock(&volume->volsync_lock); + return ret; +} + +/* + * Update the state of a volume, including recording the expiration time of the + * callback promise. Returns 1 to redo the operation from the start. + */ +int afs_update_volume_state(struct afs_operation *op) +{ + struct afs_server_list *slist = op->server_list; + struct afs_server_entry *se = &slist->servers[op->server_index]; + struct afs_callback *cb = &op->file[0].scb.callback; + struct afs_volume *volume = op->volume; + unsigned int cb_v_break = atomic_read(&volume->cb_v_break); + unsigned int cb_v_check = atomic_read(&volume->cb_v_check); + int ret; + + _enter("%llx", op->volume->vid); + + if (op->volsync.creation != TIME64_MIN || op->volsync.update != TIME64_MIN) { + ret = afs_update_volume_times(op, volume); + if (ret != 0) { + _leave(" = %d", ret); + return ret; + } + } + + if (op->cb_v_break == cb_v_break && + (op->file[0].scb.have_cb || op->file[1].scb.have_cb)) { + time64_t expires_at = cb->expires_at; + + if (!op->file[0].scb.have_cb) + expires_at = op->file[1].scb.callback.expires_at; + + se->cb_expires_at = expires_at; + volume->cb_expires_at = expires_at; + } + if (cb_v_check < op->cb_v_break) + atomic_cmpxchg(&volume->cb_v_check, cb_v_check, op->cb_v_break); + return 0; +} + +/* + * mark the data attached to an inode as obsolete due to a write on the server + * - might also want to ditch all the outstanding writes and dirty pages + */ +static void afs_zap_data(struct afs_vnode *vnode) +{ + _enter("{%llx:%llu}", vnode->fid.vid, vnode->fid.vnode); + + afs_invalidate_cache(vnode, 0); + + /* nuke all the non-dirty pages that aren't locked, mapped or being + * written back in a regular file and completely discard the pages in a + * directory or symlink */ + if (S_ISREG(vnode->netfs.inode.i_mode)) + filemap_invalidate_inode(&vnode->netfs.inode, true, 0, LLONG_MAX); + else + filemap_invalidate_inode(&vnode->netfs.inode, false, 0, LLONG_MAX); +} + +/* + * validate a vnode/inode + * - there are several things we need to check + * - parent dir data changes (rm, rmdir, rename, mkdir, create, link, + * symlink) + * - parent dir metadata changed (security changes) + * - dentry data changed (write, truncate) + * - dentry metadata changed (security changes) + */ +int afs_validate(struct afs_vnode *vnode, struct key *key) +{ + struct afs_volume *volume = vnode->volume; + unsigned int cb_ro_snapshot, cb_scrub; + time64_t deadline = ktime_get_real_seconds() + 10; + bool zap = false, locked_vol = false; + int ret; + + _enter("{v={%llx:%llu} fl=%lx},%x", + vnode->fid.vid, vnode->fid.vnode, vnode->flags, + key_serial(key)); + + if (afs_check_validity(vnode)) + return test_bit(AFS_VNODE_DELETED, &vnode->flags) ? -ESTALE : 0; + + ret = down_write_killable(&vnode->validate_lock); + if (ret < 0) + goto error; + + if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) { + ret = -ESTALE; + goto error_unlock; + } + + /* Validate a volume after the v_break has changed or the volume + * callback expired. We only want to do this once per volume per + * v_break change. The actual work will be done when parsing the + * status fetch reply. + */ + if (volume->cb_expires_at <= deadline || + atomic_read(&volume->cb_v_check) != atomic_read(&volume->cb_v_break)) { + ret = mutex_lock_interruptible(&volume->cb_check_lock); + if (ret < 0) + goto error_unlock; + locked_vol = true; + } + + cb_ro_snapshot = atomic_read(&volume->cb_ro_snapshot); + cb_scrub = atomic_read(&volume->cb_scrub); + if (vnode->cb_ro_snapshot != cb_ro_snapshot || + vnode->cb_scrub != cb_scrub) + unmap_mapping_pages(vnode->netfs.inode.i_mapping, 0, 0, false); + + if (vnode->cb_ro_snapshot != cb_ro_snapshot || + vnode->cb_scrub != cb_scrub || + volume->cb_expires_at <= deadline || + atomic_read(&volume->cb_v_check) != atomic_read(&volume->cb_v_break) || + atomic64_read(&vnode->cb_expires_at) <= deadline + ) { + ret = afs_fetch_status(vnode, key, false, NULL); + if (ret < 0) { + if (ret == -ENOENT) { + set_bit(AFS_VNODE_DELETED, &vnode->flags); + ret = -ESTALE; + } + goto error_unlock; + } + + _debug("new promise [fl=%lx]", vnode->flags); + } + + /* We can drop the volume lock now as. */ + if (locked_vol) { + mutex_unlock(&volume->cb_check_lock); + locked_vol = false; + } + + cb_ro_snapshot = atomic_read(&volume->cb_ro_snapshot); + cb_scrub = atomic_read(&volume->cb_scrub); + _debug("vnode inval %x==%x %x==%x", + vnode->cb_ro_snapshot, cb_ro_snapshot, + vnode->cb_scrub, cb_scrub); + if (vnode->cb_scrub != cb_scrub) + zap = true; + vnode->cb_ro_snapshot = cb_ro_snapshot; + vnode->cb_scrub = cb_scrub; + + /* if the vnode's data version number changed then its contents are + * different */ + zap |= test_and_clear_bit(AFS_VNODE_ZAP_DATA, &vnode->flags); + if (zap) + afs_zap_data(vnode); + up_write(&vnode->validate_lock); + _leave(" = 0"); + return 0; + +error_unlock: + if (locked_vol) + mutex_unlock(&volume->cb_check_lock); + up_write(&vnode->validate_lock); +error: + _leave(" = %d", ret); + return ret; +} diff --git a/fs/afs/vl_alias.c b/fs/afs/vl_alias.c index f04a80e4f5c3..fc9676abd252 100644 --- a/fs/afs/vl_alias.c +++ b/fs/afs/vl_alias.c @@ -33,55 +33,6 @@ static struct afs_volume *afs_sample_volume(struct afs_cell *cell, struct key *k } /* - * Compare two addresses. - */ -static int afs_compare_addrs(const struct sockaddr_rxrpc *srx_a, - const struct sockaddr_rxrpc *srx_b) -{ - short port_a, port_b; - int addr_a, addr_b, diff; - - diff = (short)srx_a->transport_type - (short)srx_b->transport_type; - if (diff) - goto out; - - switch (srx_a->transport_type) { - case AF_INET: { - const struct sockaddr_in *a = &srx_a->transport.sin; - const struct sockaddr_in *b = &srx_b->transport.sin; - addr_a = ntohl(a->sin_addr.s_addr); - addr_b = ntohl(b->sin_addr.s_addr); - diff = addr_a - addr_b; - if (diff == 0) { - port_a = ntohs(a->sin_port); - port_b = ntohs(b->sin_port); - diff = port_a - port_b; - } - break; - } - - case AF_INET6: { - const struct sockaddr_in6 *a = &srx_a->transport.sin6; - const struct sockaddr_in6 *b = &srx_b->transport.sin6; - diff = memcmp(&a->sin6_addr, &b->sin6_addr, 16); - if (diff == 0) { - port_a = ntohs(a->sin6_port); - port_b = ntohs(b->sin6_port); - diff = port_a - port_b; - } - break; - } - - default: - WARN_ON(1); - diff = 1; - } - -out: - return diff; -} - -/* * Compare the address lists of a pair of fileservers. */ static int afs_compare_fs_alists(const struct afs_server *server_a, @@ -90,13 +41,13 @@ static int afs_compare_fs_alists(const struct afs_server *server_a, const struct afs_addr_list *la, *lb; int a = 0, b = 0, addr_matches = 0; - la = rcu_dereference(server_a->addresses); - lb = rcu_dereference(server_b->addresses); + la = rcu_dereference(server_a->endpoint_state)->addresses; + lb = rcu_dereference(server_b->endpoint_state)->addresses; while (a < la->nr_addrs && b < lb->nr_addrs) { - const struct sockaddr_rxrpc *srx_a = &la->addrs[a]; - const struct sockaddr_rxrpc *srx_b = &lb->addrs[b]; - int diff = afs_compare_addrs(srx_a, srx_b); + unsigned long pa = (unsigned long)la->addrs[a].peer; + unsigned long pb = (unsigned long)lb->addrs[b].peer; + long diff = pa - pb; if (diff < 0) { a++; @@ -126,7 +77,7 @@ static int afs_compare_volume_slists(const struct afs_volume *vol_a, lb = rcu_dereference(vol_b->servers); for (i = 0; i < AFS_MAXTYPES; i++) - if (la->vids[i] != lb->vids[i]) + if (vol_a->vids[i] != vol_b->vids[i]) return 0; while (a < la->nr_servers && b < lb->nr_servers) { @@ -205,7 +156,7 @@ static int afs_query_for_alias_one(struct afs_cell *cell, struct key *key, /* And see if it's in the new cell. */ volume = afs_sample_volume(cell, key, pvol->name, pvol->name_len); if (IS_ERR(volume)) { - afs_put_volume(cell->net, pvol, afs_volume_trace_put_query_alias); + afs_put_volume(pvol, afs_volume_trace_put_query_alias); if (PTR_ERR(volume) != -ENOMEDIUM) return PTR_ERR(volume); /* That volume is not in the new cell, so not an alias */ @@ -223,8 +174,8 @@ static int afs_query_for_alias_one(struct afs_cell *cell, struct key *key, rcu_read_unlock(); } - afs_put_volume(cell->net, volume, afs_volume_trace_put_query_alias); - afs_put_volume(cell->net, pvol, afs_volume_trace_put_query_alias); + afs_put_volume(volume, afs_volume_trace_put_query_alias); + afs_put_volume(pvol, afs_volume_trace_put_query_alias); return ret; } @@ -254,11 +205,11 @@ static int afs_query_for_alias(struct afs_cell *cell, struct key *key) goto is_alias; if (mutex_lock_interruptible(&cell->net->proc_cells_lock) < 0) { - afs_unuse_cell(cell->net, p, afs_cell_trace_unuse_check_alias); + afs_unuse_cell(p, afs_cell_trace_unuse_check_alias); return -ERESTARTSYS; } - afs_unuse_cell(cell->net, p, afs_cell_trace_unuse_check_alias); + afs_unuse_cell(p, afs_cell_trace_unuse_check_alias); } mutex_unlock(&cell->net->proc_cells_lock); @@ -285,7 +236,7 @@ static char *afs_vl_get_cell_name(struct afs_cell *cell, struct key *key) while (afs_select_vlserver(&vc)) { if (!test_bit(AFS_VLSERVER_FL_IS_YFS, &vc.server->flags)) { - vc.ac.error = -EOPNOTSUPP; + vc.call_error = -EOPNOTSUPP; skipped = true; continue; } @@ -302,6 +253,7 @@ static char *afs_vl_get_cell_name(struct afs_cell *cell, struct key *key) static int yfs_check_canonical_cell_name(struct afs_cell *cell, struct key *key) { struct afs_cell *master; + size_t name_len; char *cell_name; cell_name = afs_vl_get_cell_name(cell, key); @@ -313,8 +265,13 @@ static int yfs_check_canonical_cell_name(struct afs_cell *cell, struct key *key) return 0; } - master = afs_lookup_cell(cell->net, cell_name, strlen(cell_name), - NULL, false); + name_len = strlen(cell_name); + if (!name_len || name_len > AFS_MAXCELLNAME) + master = ERR_PTR(-EOPNOTSUPP); + else + master = afs_lookup_cell(cell->net, cell_name, name_len, NULL, + AFS_LOOKUP_CELL_ALIAS_CHECK, + afs_cell_trace_use_lookup_canonical); kfree(cell_name); if (IS_ERR(master)) return PTR_ERR(master); diff --git a/fs/afs/vl_list.c b/fs/afs/vl_list.c index acc48216136a..9b1c20daac53 100644 --- a/fs/afs/vl_list.c +++ b/fs/afs/vl_list.c @@ -13,6 +13,7 @@ struct afs_vlserver *afs_alloc_vlserver(const char *name, size_t name_len, unsigned short port) { struct afs_vlserver *vlserver; + static atomic_t debug_ids; vlserver = kzalloc(struct_size(vlserver, name, name_len + 1), GFP_KERNEL); @@ -21,8 +22,10 @@ struct afs_vlserver *afs_alloc_vlserver(const char *name, size_t name_len, rwlock_init(&vlserver->lock); init_waitqueue_head(&vlserver->probe_wq); spin_lock_init(&vlserver->probe_lock); + vlserver->debug_id = atomic_inc_return(&debug_ids); vlserver->rtt = UINT_MAX; vlserver->name_len = name_len; + vlserver->service_id = VL_SERVICE; vlserver->port = port; memcpy(vlserver->name, name, name_len); } @@ -33,7 +36,8 @@ static void afs_vlserver_rcu(struct rcu_head *rcu) { struct afs_vlserver *vlserver = container_of(rcu, struct afs_vlserver, rcu); - afs_put_addrlist(rcu_access_pointer(vlserver->addresses)); + afs_put_addrlist(rcu_access_pointer(vlserver->addresses), + afs_alist_trace_put_vlserver); kfree_rcu(vlserver, rcu); } @@ -83,14 +87,15 @@ static u16 afs_extract_le16(const u8 **_b) /* * Build a VL server address list from a DNS queried server list. */ -static struct afs_addr_list *afs_extract_vl_addrs(const u8 **_b, const u8 *end, +static struct afs_addr_list *afs_extract_vl_addrs(struct afs_net *net, + const u8 **_b, const u8 *end, u8 nr_addrs, u16 port) { struct afs_addr_list *alist; const u8 *b = *_b; int ret = -EINVAL; - alist = afs_alloc_addrlist(nr_addrs, VL_SERVICE, port); + alist = afs_alloc_addrlist(nr_addrs); if (!alist) return ERR_PTR(-ENOMEM); if (nr_addrs == 0) @@ -109,7 +114,9 @@ static struct afs_addr_list *afs_extract_vl_addrs(const u8 **_b, const u8 *end, goto error; } memcpy(x, b, 4); - afs_merge_fs_addr4(alist, x[0], port); + ret = afs_merge_fs_addr4(net, alist, x[0], port); + if (ret < 0) + goto error; b += 4; break; @@ -119,7 +126,9 @@ static struct afs_addr_list *afs_extract_vl_addrs(const u8 **_b, const u8 *end, goto error; } memcpy(x, b, 16); - afs_merge_fs_addr6(alist, x, port); + ret = afs_merge_fs_addr6(net, alist, x, port); + if (ret < 0) + goto error; b += 16; break; @@ -140,7 +149,7 @@ static struct afs_addr_list *afs_extract_vl_addrs(const u8 **_b, const u8 *end, error: *_b = b; - afs_put_addrlist(alist); + afs_put_addrlist(alist, afs_alist_trace_put_parse_error); return ERR_PTR(ret); } @@ -247,7 +256,7 @@ struct afs_vlserver_list *afs_extract_vlserver_list(struct afs_cell *cell, /* Extract the addresses - note that we can't skip this as we * have to advance the payload pointer. */ - addrs = afs_extract_vl_addrs(&b, end, bs.nr_addrs, bs.port); + addrs = afs_extract_vl_addrs(cell->net, &b, end, bs.nr_addrs, bs.port); if (IS_ERR(addrs)) { ret = PTR_ERR(addrs); goto error_2; @@ -255,7 +264,7 @@ struct afs_vlserver_list *afs_extract_vlserver_list(struct afs_cell *cell, if (vllist->nr_servers >= nr_servers) { _debug("skip %u >= %u", vllist->nr_servers, nr_servers); - afs_put_addrlist(addrs); + afs_put_addrlist(addrs, afs_alist_trace_put_parse_empty); afs_put_vlserver(cell->net, server); continue; } @@ -264,7 +273,7 @@ struct afs_vlserver_list *afs_extract_vlserver_list(struct afs_cell *cell, addrs->status = bs.status; if (addrs->nr_addrs == 0) { - afs_put_addrlist(addrs); + afs_put_addrlist(addrs, afs_alist_trace_put_parse_empty); if (!rcu_access_pointer(server->addresses)) { afs_put_vlserver(cell->net, server); continue; @@ -276,7 +285,7 @@ struct afs_vlserver_list *afs_extract_vlserver_list(struct afs_cell *cell, old = rcu_replace_pointer(server->addresses, old, lockdep_is_held(&server->lock)); write_unlock(&server->lock); - afs_put_addrlist(old); + afs_put_addrlist(old, afs_alist_trace_put_vlserver_old); } diff --git a/fs/afs/vl_probe.c b/fs/afs/vl_probe.c index d1c7068b4346..3d2e0c925460 100644 --- a/fs/afs/vl_probe.c +++ b/fs/afs/vl_probe.c @@ -46,11 +46,12 @@ static void afs_done_one_vl_probe(struct afs_vlserver *server, bool wake_up) */ void afs_vlserver_probe_result(struct afs_call *call) { - struct afs_addr_list *alist = call->alist; + struct afs_addr_list *alist = call->vl_probe; struct afs_vlserver *server = call->vlserver; + struct afs_address *addr = &alist->addrs[call->probe_index]; unsigned int server_index = call->server_index; unsigned int rtt_us = 0; - unsigned int index = call->addr_ix; + unsigned int index = call->probe_index; bool have_result = false; int ret = call->error; @@ -89,7 +90,7 @@ void afs_vlserver_probe_result(struct afs_call *call) case -ETIME: default: clear_bit(index, &alist->responded); - set_bit(index, &alist->failed); + set_bit(index, &alist->probe_failed); if (!(server->probe.flags & AFS_VLSERVER_PROBE_RESPONDED) && (server->probe.error == 0 || server->probe.error == -ETIMEDOUT || @@ -101,22 +102,22 @@ void afs_vlserver_probe_result(struct afs_call *call) responded: set_bit(index, &alist->responded); - clear_bit(index, &alist->failed); + clear_bit(index, &alist->probe_failed); if (call->service_id == YFS_VL_SERVICE) { server->probe.flags |= AFS_VLSERVER_PROBE_IS_YFS; set_bit(AFS_VLSERVER_FL_IS_YFS, &server->flags); - alist->addrs[index].srx_service = call->service_id; + server->service_id = call->service_id; } else { server->probe.flags |= AFS_VLSERVER_PROBE_NOT_YFS; if (!(server->probe.flags & AFS_VLSERVER_PROBE_IS_YFS)) { clear_bit(AFS_VLSERVER_FL_IS_YFS, &server->flags); - alist->addrs[index].srx_service = call->service_id; + server->service_id = call->service_id; } } - if (rxrpc_kernel_get_srtt(call->net->socket, call->rxcall, &rtt_us) && - rtt_us < server->probe.rtt) { + rtt_us = rxrpc_kernel_get_srtt(addr->peer); + if (rtt_us < server->probe.rtt) { server->probe.rtt = rtt_us; server->rtt = rtt_us; alist->preferred = index; @@ -130,8 +131,10 @@ responded: out: spin_unlock(&server->probe_lock); - _debug("probe [%u][%u] %pISpc rtt=%u ret=%d", - server_index, index, &alist->addrs[index].transport, rtt_us, ret); + trace_afs_vl_probe(server, false, alist, index, call->error, call->abort_code, rtt_us); + _debug("probe [%u][%u] %pISpc rtt=%d ret=%d", + server_index, index, rxrpc_kernel_remote_addr(addr->peer), + rtt_us, ret); afs_done_one_vl_probe(server, have_result); } @@ -146,35 +149,52 @@ static bool afs_do_probe_vlserver(struct afs_net *net, unsigned int server_index, struct afs_error *_e) { - struct afs_addr_cursor ac = { - .index = 0, - }; + struct afs_addr_list *alist; struct afs_call *call; + unsigned long unprobed; + unsigned int index, i; bool in_progress = false; + int best_prio; _enter("%s", server->name); read_lock(&server->lock); - ac.alist = rcu_dereference_protected(server->addresses, - lockdep_is_held(&server->lock)); + alist = rcu_dereference_protected(server->addresses, + lockdep_is_held(&server->lock)); + afs_get_addrlist(alist, afs_alist_trace_get_vlprobe); read_unlock(&server->lock); - atomic_set(&server->probe_outstanding, ac.alist->nr_addrs); + atomic_set(&server->probe_outstanding, alist->nr_addrs); memset(&server->probe, 0, sizeof(server->probe)); server->probe.rtt = UINT_MAX; - for (ac.index = 0; ac.index < ac.alist->nr_addrs; ac.index++) { - call = afs_vl_get_capabilities(net, &ac, key, server, + unprobed = (1UL << alist->nr_addrs) - 1; + while (unprobed) { + best_prio = -1; + index = 0; + for (i = 0; i < alist->nr_addrs; i++) { + if (test_bit(i, &unprobed) && + alist->addrs[i].prio > best_prio) { + index = i; + best_prio = alist->addrs[i].prio; + } + } + __clear_bit(index, &unprobed); + + trace_afs_vl_probe(server, true, alist, index, 0, 0, 0); + call = afs_vl_get_capabilities(net, alist, index, key, server, server_index); if (!IS_ERR(call)) { + afs_prioritise_error(_e, call->error, call->abort_code); afs_put_call(call); in_progress = true; } else { - afs_prioritise_error(_e, PTR_ERR(call), ac.abort_code); + afs_prioritise_error(_e, PTR_ERR(call), 0); afs_done_one_vl_probe(server, false); } } + afs_put_addrlist(alist, afs_alist_trace_put_vlprobe); return in_progress; } @@ -185,12 +205,10 @@ int afs_send_vl_probes(struct afs_net *net, struct key *key, struct afs_vlserver_list *vllist) { struct afs_vlserver *server; - struct afs_error e; + struct afs_error e = {}; bool in_progress = false; int i; - e.error = 0; - e.responded = false; for (i = 0; i < vllist->nr_servers; i++) { server = vllist->servers[i].server; if (test_bit(AFS_VLSERVER_FL_PROBED, &server->flags)) diff --git a/fs/afs/vl_rotate.c b/fs/afs/vl_rotate.c index 488e58490b16..6ad9688d8f4b 100644 --- a/fs/afs/vl_rotate.c +++ b/fs/afs/vl_rotate.c @@ -17,18 +17,21 @@ bool afs_begin_vlserver_operation(struct afs_vl_cursor *vc, struct afs_cell *cell, struct key *key) { + static atomic_t debug_ids; + memset(vc, 0, sizeof(*vc)); vc->cell = cell; vc->key = key; - vc->error = -EDESTADDRREQ; - vc->ac.error = SHRT_MAX; + vc->cumul_error.error = -EDESTADDRREQ; + vc->nr_iterations = -1; if (signal_pending(current)) { - vc->error = -EINTR; + vc->cumul_error.error = -EINTR; vc->flags |= AFS_VL_CURSOR_STOP; return false; } + vc->debug_id = atomic_inc_return(&debug_ids); return true; } @@ -45,21 +48,27 @@ static bool afs_start_vl_iteration(struct afs_vl_cursor *vc) cell->dns_expiry <= ktime_get_real_seconds()) { dns_lookup_count = smp_load_acquire(&cell->dns_lookup_count); set_bit(AFS_CELL_FL_DO_LOOKUP, &cell->flags); - afs_queue_cell(cell, afs_cell_trace_get_queue_dns); + afs_queue_cell(cell, afs_cell_trace_queue_dns); if (cell->dns_source == DNS_RECORD_UNAVAILABLE) { if (wait_var_event_interruptible( &cell->dns_lookup_count, smp_load_acquire(&cell->dns_lookup_count) != dns_lookup_count) < 0) { - vc->error = -ERESTARTSYS; + vc->cumul_error.error = -ERESTARTSYS; return false; } } /* Status load is ordered after lookup counter load */ + if (cell->dns_status == DNS_LOOKUP_GOT_NOT_FOUND) { + pr_warn("No record of cell %s\n", cell->name); + vc->cumul_error.error = -ENOENT; + return false; + } + if (cell->dns_source == DNS_RECORD_UNAVAILABLE) { - vc->error = -EDESTADDRREQ; + vc->cumul_error.error = -EDESTADDRREQ; return false; } } @@ -72,8 +81,8 @@ static bool afs_start_vl_iteration(struct afs_vl_cursor *vc) if (!vc->server_list->nr_servers) return false; - vc->untried = (1UL << vc->server_list->nr_servers) - 1; - vc->index = -1; + vc->untried_servers = (1UL << vc->server_list->nr_servers) - 1; + vc->server_index = -1; return true; } @@ -83,54 +92,57 @@ static bool afs_start_vl_iteration(struct afs_vl_cursor *vc) */ bool afs_select_vlserver(struct afs_vl_cursor *vc) { - struct afs_addr_list *alist; + struct afs_addr_list *alist = vc->alist; struct afs_vlserver *vlserver; - struct afs_error e; - u32 rtt; - int error = vc->ac.error, i; + unsigned long set, failed; + unsigned int rtt; + s32 abort_code = vc->call_abort_code; + int error = vc->call_error, i; - _enter("%lx[%d],%lx[%d],%d,%d", - vc->untried, vc->index, - vc->ac.tried, vc->ac.index, - error, vc->ac.abort_code); + vc->nr_iterations++; + + _enter("VC=%x+%x,%d{%lx},%d{%lx},%d,%d", + vc->debug_id, vc->nr_iterations, vc->server_index, vc->untried_servers, + vc->addr_index, vc->addr_tried, + error, abort_code); if (vc->flags & AFS_VL_CURSOR_STOP) { _leave(" = f [stopped]"); return false; } - vc->nr_iterations++; + if (vc->nr_iterations == 0) + goto start; + + WRITE_ONCE(alist->addrs[vc->addr_index].last_error, error); /* Evaluate the result of the previous operation, if there was one. */ switch (error) { - case SHRT_MAX: - goto start; - default: case 0: /* Success or local failure. Stop. */ - vc->error = error; + vc->cumul_error.error = error; vc->flags |= AFS_VL_CURSOR_STOP; - _leave(" = f [okay/local %d]", vc->ac.error); + _leave(" = f [okay/local %d]", vc->cumul_error.error); return false; case -ECONNABORTED: /* The far side rejected the operation on some grounds. This * might involve the server being busy or the volume having been moved. */ - switch (vc->ac.abort_code) { + switch (abort_code) { case AFSVL_IO: case AFSVL_BADVOLOPER: case AFSVL_NOMEM: /* The server went weird. */ - vc->error = -EREMOTEIO; + afs_prioritise_error(&vc->cumul_error, -EREMOTEIO, abort_code); //write_lock(&vc->cell->vl_servers_lock); - //vc->server_list->weird_mask |= 1 << vc->index; + //vc->server_list->weird_mask |= 1 << vc->server_index; //write_unlock(&vc->cell->vl_servers_lock); goto next_server; default: - vc->error = afs_abort_to_error(vc->ac.abort_code); + afs_prioritise_error(&vc->cumul_error, error, abort_code); goto failed; } @@ -143,12 +155,12 @@ bool afs_select_vlserver(struct afs_vl_cursor *vc) case -ETIMEDOUT: case -ETIME: _debug("no conn %d", error); - vc->error = error; + afs_prioritise_error(&vc->cumul_error, error, 0); goto iterate_address; case -ECONNRESET: _debug("call reset"); - vc->error = error; + afs_prioritise_error(&vc->cumul_error, error, 0); vc->flags |= AFS_VL_CURSOR_RETRY; goto next_server; @@ -159,7 +171,13 @@ bool afs_select_vlserver(struct afs_vl_cursor *vc) restart_from_beginning: _debug("restart"); - afs_end_cursor(&vc->ac); + if (vc->call_responded && + vc->addr_index != vc->alist->preferred && + test_bit(alist->preferred, &vc->addr_tried)) + WRITE_ONCE(alist->preferred, vc->addr_index); + afs_put_addrlist(alist, afs_alist_trace_put_vlrotate_restart); + alist = vc->alist = NULL; + afs_put_vlserverlist(vc->cell->net, vc->server_list); vc->server_list = NULL; if (vc->flags & AFS_VL_CURSOR_RETRIED) @@ -167,53 +185,58 @@ restart_from_beginning: vc->flags |= AFS_VL_CURSOR_RETRIED; start: _debug("start"); + ASSERTCMP(alist, ==, NULL); if (!afs_start_vl_iteration(vc)) goto failed; error = afs_send_vl_probes(vc->cell->net, vc->key, vc->server_list); - if (error < 0) - goto failed_set_error; + if (error < 0) { + afs_prioritise_error(&vc->cumul_error, error, 0); + goto failed; + } pick_server: - _debug("pick [%lx]", vc->untried); + _debug("pick [%lx]", vc->untried_servers); + ASSERTCMP(alist, ==, NULL); - error = afs_wait_for_vl_probes(vc->server_list, vc->untried); - if (error < 0) - goto failed_set_error; + error = afs_wait_for_vl_probes(vc->server_list, vc->untried_servers); + if (error < 0) { + afs_prioritise_error(&vc->cumul_error, error, 0); + goto failed; + } /* Pick the untried server with the lowest RTT. */ - vc->index = vc->server_list->preferred; - if (test_bit(vc->index, &vc->untried)) + vc->server_index = vc->server_list->preferred; + if (test_bit(vc->server_index, &vc->untried_servers)) goto selected_server; - vc->index = -1; - rtt = U32_MAX; + vc->server_index = -1; + rtt = UINT_MAX; for (i = 0; i < vc->server_list->nr_servers; i++) { struct afs_vlserver *s = vc->server_list->servers[i].server; - if (!test_bit(i, &vc->untried) || + if (!test_bit(i, &vc->untried_servers) || !test_bit(AFS_VLSERVER_FL_RESPONDING, &s->flags)) continue; - if (s->probe.rtt < rtt) { - vc->index = i; + if (s->probe.rtt <= rtt) { + vc->server_index = i; rtt = s->probe.rtt; } } - if (vc->index == -1) + if (vc->server_index == -1) goto no_more_servers; selected_server: - _debug("use %d", vc->index); - __clear_bit(vc->index, &vc->untried); + _debug("use %d", vc->server_index); + __clear_bit(vc->server_index, &vc->untried_servers); /* We're starting on a different vlserver from the list. We need to * check it, find its address list and probe its capabilities before we * use it. */ - ASSERTCMP(vc->ac.alist, ==, NULL); - vlserver = vc->server_list->servers[vc->index].server; + vlserver = vc->server_list->servers[vc->server_index].server; vc->server = vlserver; _debug("USING VLSERVER: %s", vlserver->name); @@ -221,34 +244,48 @@ selected_server: read_lock(&vlserver->lock); alist = rcu_dereference_protected(vlserver->addresses, lockdep_is_held(&vlserver->lock)); - afs_get_addrlist(alist); + vc->alist = afs_get_addrlist(alist, afs_alist_trace_get_vlrotate_set); read_unlock(&vlserver->lock); - memset(&vc->ac, 0, sizeof(vc->ac)); - - if (!vc->ac.alist) - vc->ac.alist = alist; - else - afs_put_addrlist(alist); - - vc->ac.index = -1; + vc->addr_tried = 0; + vc->addr_index = -1; iterate_address: - ASSERT(vc->ac.alist); /* Iterate over the current server's address list to try and find an * address on which it will respond to us. */ - if (!afs_iterate_addresses(&vc->ac)) + set = READ_ONCE(alist->responded); + failed = READ_ONCE(alist->probe_failed); + vc->addr_index = READ_ONCE(alist->preferred); + + _debug("%lx-%lx-%lx,%d", set, failed, vc->addr_tried, vc->addr_index); + + set &= ~(failed | vc->addr_tried); + + if (!set) goto next_server; - _debug("VL address %d/%d", vc->ac.index, vc->ac.alist->nr_addrs); + if (!test_bit(vc->addr_index, &set)) + vc->addr_index = __ffs(set); + + set_bit(vc->addr_index, &vc->addr_tried); + vc->alist = alist; - _leave(" = t %pISpc", &vc->ac.alist->addrs[vc->ac.index].transport); + _debug("VL address %d/%d", vc->addr_index, alist->nr_addrs); + + vc->call_responded = false; + _leave(" = t %pISpc", rxrpc_kernel_remote_addr(alist->addrs[vc->addr_index].peer)); return true; next_server: _debug("next"); - afs_end_cursor(&vc->ac); + ASSERT(alist); + if (vc->call_responded && + vc->addr_index != alist->preferred && + test_bit(alist->preferred, &vc->addr_tried)) + WRITE_ONCE(alist->preferred, vc->addr_index); + afs_put_addrlist(alist, afs_alist_trace_put_vlrotate_next); + alist = vc->alist = NULL; goto pick_server; no_more_servers: @@ -258,25 +295,26 @@ no_more_servers: if (vc->flags & AFS_VL_CURSOR_RETRY) goto restart_from_beginning; - e.error = -EDESTADDRREQ; - e.responded = false; for (i = 0; i < vc->server_list->nr_servers; i++) { struct afs_vlserver *s = vc->server_list->servers[i].server; if (test_bit(AFS_VLSERVER_FL_RESPONDING, &s->flags)) - e.responded = true; - afs_prioritise_error(&e, READ_ONCE(s->probe.error), + vc->cumul_error.responded = true; + afs_prioritise_error(&vc->cumul_error, READ_ONCE(s->probe.error), s->probe.abort_code); } - error = e.error; - -failed_set_error: - vc->error = error; failed: + if (alist) { + if (vc->call_responded && + vc->addr_index != alist->preferred && + test_bit(alist->preferred, &vc->addr_tried)) + WRITE_ONCE(alist->preferred, vc->addr_index); + afs_put_addrlist(alist, afs_alist_trace_put_vlrotate_fail); + alist = vc->alist = NULL; + } vc->flags |= AFS_VL_CURSOR_STOP; - afs_end_cursor(&vc->ac); - _leave(" = f [failed %d]", vc->error); + _leave(" = f [failed %d]", vc->cumul_error.error); return false; } @@ -285,6 +323,7 @@ failed: */ static void afs_vl_dump_edestaddrreq(const struct afs_vl_cursor *vc) { + struct afs_cell *cell = vc->cell; static int count; int i; @@ -294,8 +333,14 @@ static void afs_vl_dump_edestaddrreq(const struct afs_vl_cursor *vc) rcu_read_lock(); pr_notice("EDESTADDR occurred\n"); + pr_notice("CELL: %s err=%d\n", cell->name, cell->error); + pr_notice("DNS: src=%u st=%u lc=%x\n", + cell->dns_source, cell->dns_status, cell->dns_lookup_count); pr_notice("VC: ut=%lx ix=%u ni=%hu fl=%hx err=%hd\n", - vc->untried, vc->index, vc->nr_iterations, vc->flags, vc->error); + vc->untried_servers, vc->server_index, vc->nr_iterations, + vc->flags, vc->cumul_error.error); + pr_notice("VC: call er=%d ac=%d r=%u\n", + vc->call_error, vc->call_abort_code, vc->call_responded); if (vc->server_list) { const struct afs_vlserver_list *sl = vc->server_list; @@ -312,16 +357,14 @@ static void afs_vl_dump_edestaddrreq(const struct afs_vl_cursor *vc) a->nr_ipv4, a->nr_addrs, a->max_addrs, a->preferred); pr_notice("VC: - R=%lx F=%lx\n", - a->responded, a->failed); - if (a == vc->ac.alist) + a->responded, a->probe_failed); + if (a == vc->alist) pr_notice("VC: - current\n"); } } } - pr_notice("AC: t=%lx ax=%u ac=%d er=%d r=%u ni=%u\n", - vc->ac.tried, vc->ac.index, vc->ac.abort_code, vc->ac.error, - vc->ac.responded, vc->ac.nr_iterations); + pr_notice("AC: t=%lx ax=%u\n", vc->addr_tried, vc->addr_index); rcu_read_unlock(); } @@ -332,17 +375,25 @@ int afs_end_vlserver_operation(struct afs_vl_cursor *vc) { struct afs_net *net = vc->cell->net; - if (vc->error == -EDESTADDRREQ || - vc->error == -EADDRNOTAVAIL || - vc->error == -ENETUNREACH || - vc->error == -EHOSTUNREACH) + _enter("VC=%x+%x", vc->debug_id, vc->nr_iterations); + + switch (vc->cumul_error.error) { + case -EDESTADDRREQ: + case -EADDRNOTAVAIL: + case -ENETUNREACH: + case -EHOSTUNREACH: afs_vl_dump_edestaddrreq(vc); + break; + } - afs_end_cursor(&vc->ac); + if (vc->alist) { + if (vc->call_responded && + vc->addr_index != vc->alist->preferred && + test_bit(vc->alist->preferred, &vc->addr_tried)) + WRITE_ONCE(vc->alist->preferred, vc->addr_index); + afs_put_addrlist(vc->alist, afs_alist_trace_put_vlrotate_end); + vc->alist = NULL; + } afs_put_vlserverlist(net, vc->server_list); - - if (vc->error == -ECONNABORTED) - vc->error = afs_abort_to_error(vc->ac.abort_code); - - return vc->error; + return vc->cumul_error.error; } diff --git a/fs/afs/vlclient.c b/fs/afs/vlclient.c index 00fca3c66ba6..3a23c0b08eb6 100644 --- a/fs/afs/vlclient.c +++ b/fs/afs/vlclient.c @@ -18,8 +18,7 @@ static int afs_deliver_vl_get_entry_by_name_u(struct afs_call *call) { struct afs_uvldbentry__xdr *uvldb; struct afs_vldb_entry *entry; - bool new_only = false; - u32 tmp, nr_servers, vlflags; + u32 nr_servers, vlflags; int i, ret; _enter(""); @@ -41,27 +40,14 @@ static int afs_deliver_vl_get_entry_by_name_u(struct afs_call *call) entry->name[i] = 0; entry->name_len = strlen(entry->name); - /* If there is a new replication site that we can use, ignore all the - * sites that aren't marked as new. - */ - for (i = 0; i < nr_servers; i++) { - tmp = ntohl(uvldb->serverFlags[i]); - if (!(tmp & AFS_VLSF_DONTUSE) && - (tmp & AFS_VLSF_NEWREPSITE)) - new_only = true; - } - vlflags = ntohl(uvldb->flags); for (i = 0; i < nr_servers; i++) { struct afs_uuid__xdr *xdr; struct afs_uuid *uuid; + u32 tmp = ntohl(uvldb->serverFlags[i]); int j; int n = entry->nr_servers; - tmp = ntohl(uvldb->serverFlags[i]); - if (tmp & AFS_VLSF_DONTUSE || - (new_only && !(tmp & AFS_VLSF_NEWREPSITE))) - continue; if (tmp & AFS_VLSF_RWVOL) { entry->fs_mask[n] |= AFS_VOL_VTM_RW; if (vlflags & AFS_VLF_BACKEXISTS) @@ -82,6 +68,7 @@ static int afs_deliver_vl_get_entry_by_name_u(struct afs_call *call) for (j = 0; j < 6; j++) uuid->node[j] = (u8)ntohl(xdr->node[j]); + entry->vlsf_flags[n] = tmp; entry->addr_version[n] = ntohl(uvldb->serverUnique[i]); entry->nr_servers++; } @@ -106,12 +93,6 @@ static int afs_deliver_vl_get_entry_by_name_u(struct afs_call *call) return 0; } -static void afs_destroy_vl_get_entry_by_name_u(struct afs_call *call) -{ - kfree(call->ret_vldb); - afs_flat_call_destructor(call); -} - /* * VL.GetEntryByNameU operation type. */ @@ -119,7 +100,7 @@ static const struct afs_call_type afs_RXVLGetEntryByNameU = { .name = "VL.GetEntryByNameU", .op = afs_VL_GetEntryByNameU, .deliver = afs_deliver_vl_get_entry_by_name_u, - .destructor = afs_destroy_vl_get_entry_by_name_u, + .destructor = afs_flat_call_destructor, }; /* @@ -155,6 +136,8 @@ struct afs_vldb_entry *afs_vl_get_entry_by_name_u(struct afs_vl_cursor *vc, call->key = vc->key; call->ret_vldb = entry; call->max_lifespan = AFS_VL_MAX_LIFESPAN; + call->peer = rxrpc_kernel_get_peer(vc->alist->addrs[vc->addr_index].peer); + call->service_id = vc->server->service_id; /* Marshall the parameters */ bp = call->request; @@ -165,8 +148,17 @@ struct afs_vldb_entry *afs_vl_get_entry_by_name_u(struct afs_vl_cursor *vc, memset((void *)bp + volnamesz, 0, padsz); trace_afs_make_vl_call(call); - afs_make_call(&vc->ac, call, GFP_KERNEL); - return (struct afs_vldb_entry *)afs_wait_for_call_to_complete(call, &vc->ac); + afs_make_call(call, GFP_KERNEL); + afs_wait_for_call_to_complete(call); + vc->call_abort_code = call->abort_code; + vc->call_error = call->error; + vc->call_responded = call->responded; + afs_put_call(call); + if (vc->call_error) { + kfree(entry); + return ERR_PTR(vc->call_error); + } + return entry; } /* @@ -208,7 +200,7 @@ static int afs_deliver_vl_get_addrs_u(struct afs_call *call) count = ntohl(*bp); nentries = min(nentries, count); - alist = afs_alloc_addrlist(nentries, FS_SERVICE, AFS_FS_PORT); + alist = afs_alloc_addrlist(nentries); if (!alist) return -ENOMEM; alist->version = uniquifier; @@ -230,9 +222,13 @@ static int afs_deliver_vl_get_addrs_u(struct afs_call *call) alist = call->ret_alist; bp = call->buffer; count = min(call->count, 4U); - for (i = 0; i < count; i++) - if (alist->nr_addrs < call->count2) - afs_merge_fs_addr4(alist, *bp++, AFS_FS_PORT); + for (i = 0; i < count; i++) { + if (alist->nr_addrs < call->count2) { + ret = afs_merge_fs_addr4(call->net, alist, *bp++, AFS_FS_PORT); + if (ret < 0) + return ret; + } + } call->count -= count; if (call->count > 0) @@ -245,12 +241,6 @@ static int afs_deliver_vl_get_addrs_u(struct afs_call *call) return 0; } -static void afs_vl_get_addrs_u_destructor(struct afs_call *call) -{ - afs_put_addrlist(call->ret_alist); - return afs_flat_call_destructor(call); -} - /* * VL.GetAddrsU operation type. */ @@ -258,7 +248,7 @@ static const struct afs_call_type afs_RXVLGetAddrsU = { .name = "VL.GetAddrsU", .op = afs_VL_GetAddrsU, .deliver = afs_deliver_vl_get_addrs_u, - .destructor = afs_vl_get_addrs_u_destructor, + .destructor = afs_flat_call_destructor, }; /* @@ -269,6 +259,7 @@ struct afs_addr_list *afs_vl_get_addrs_u(struct afs_vl_cursor *vc, const uuid_t *uuid) { struct afs_ListAddrByAttributes__xdr *r; + struct afs_addr_list *alist; const struct afs_uuid *u = (const struct afs_uuid *)uuid; struct afs_call *call; struct afs_net *net = vc->cell->net; @@ -286,6 +277,8 @@ struct afs_addr_list *afs_vl_get_addrs_u(struct afs_vl_cursor *vc, call->key = vc->key; call->ret_alist = NULL; call->max_lifespan = AFS_VL_MAX_LIFESPAN; + call->peer = rxrpc_kernel_get_peer(vc->alist->addrs[vc->addr_index].peer); + call->service_id = vc->server->service_id; /* Marshall the parameters */ bp = call->request; @@ -304,8 +297,18 @@ struct afs_addr_list *afs_vl_get_addrs_u(struct afs_vl_cursor *vc, r->uuid.node[i] = htonl(u->node[i]); trace_afs_make_vl_call(call); - afs_make_call(&vc->ac, call, GFP_KERNEL); - return (struct afs_addr_list *)afs_wait_for_call_to_complete(call, &vc->ac); + afs_make_call(call, GFP_KERNEL); + afs_wait_for_call_to_complete(call); + vc->call_abort_code = call->abort_code; + vc->call_error = call->error; + vc->call_responded = call->responded; + alist = call->ret_alist; + afs_put_call(call); + if (vc->call_error) { + afs_put_addrlist(alist, afs_alist_trace_put_getaddru); + return ERR_PTR(vc->call_error); + } + return alist; } /* @@ -355,6 +358,7 @@ static int afs_deliver_vl_get_capabilities(struct afs_call *call) static void afs_destroy_vl_get_capabilities(struct afs_call *call) { + afs_put_addrlist(call->vl_probe, afs_alist_trace_put_vlgetcaps); afs_put_vlserver(call->net, call->vlserver); afs_flat_call_destructor(call); } @@ -366,6 +370,7 @@ static const struct afs_call_type afs_RXVLGetCapabilities = { .name = "VL.GetCapabilities", .op = afs_VL_GetCapabilities, .deliver = afs_deliver_vl_get_capabilities, + .immediate_cancel = afs_vlserver_probe_result, .done = afs_vlserver_probe_result, .destructor = afs_destroy_vl_get_capabilities, }; @@ -378,7 +383,8 @@ static const struct afs_call_type afs_RXVLGetCapabilities = { * other end supports. */ struct afs_call *afs_vl_get_capabilities(struct afs_net *net, - struct afs_addr_cursor *ac, + struct afs_addr_list *alist, + unsigned int addr_index, struct key *key, struct afs_vlserver *server, unsigned int server_index) @@ -395,6 +401,10 @@ struct afs_call *afs_vl_get_capabilities(struct afs_net *net, call->key = key; call->vlserver = afs_get_vlserver(server); call->server_index = server_index; + call->peer = rxrpc_kernel_get_peer(alist->addrs[addr_index].peer); + call->vl_probe = afs_get_addrlist(alist, afs_alist_trace_get_vlgetcaps); + call->probe_index = addr_index; + call->service_id = server->service_id; call->upgrade = true; call->async = true; call->max_lifespan = AFS_PROBE_MAX_LIFESPAN; @@ -405,7 +415,7 @@ struct afs_call *afs_vl_get_capabilities(struct afs_net *net, /* Can't take a ref on server */ trace_afs_make_vl_call(call); - afs_make_call(ac, call, GFP_KERNEL); + afs_make_call(call, GFP_KERNEL); return call; } @@ -450,7 +460,7 @@ static int afs_deliver_yfsvl_get_endpoints(struct afs_call *call) if (call->count > YFS_MAXENDPOINTS) return afs_protocol_error(call, afs_eproto_yvl_fsendpt_num); - alist = afs_alloc_addrlist(call->count, FS_SERVICE, AFS_FS_PORT); + alist = afs_alloc_addrlist(call->count); if (!alist) return -ENOMEM; alist->version = uniquifier; @@ -488,14 +498,18 @@ static int afs_deliver_yfsvl_get_endpoints(struct afs_call *call) if (ntohl(bp[0]) != sizeof(__be32) * 2) return afs_protocol_error( call, afs_eproto_yvl_fsendpt4_len); - afs_merge_fs_addr4(alist, bp[1], ntohl(bp[2])); + ret = afs_merge_fs_addr4(call->net, alist, bp[1], ntohl(bp[2])); + if (ret < 0) + return ret; bp += 3; break; case YFS_ENDPOINT_IPV6: if (ntohl(bp[0]) != sizeof(__be32) * 5) return afs_protocol_error( call, afs_eproto_yvl_fsendpt6_len); - afs_merge_fs_addr6(alist, bp + 1, ntohl(bp[5])); + ret = afs_merge_fs_addr6(call->net, alist, bp + 1, ntohl(bp[5])); + if (ret < 0) + return ret; bp += 6; break; default: @@ -610,7 +624,7 @@ static const struct afs_call_type afs_YFSVLGetEndpoints = { .name = "YFSVL.GetEndpoints", .op = afs_YFSVL_GetEndpoints, .deliver = afs_deliver_yfsvl_get_endpoints, - .destructor = afs_vl_get_addrs_u_destructor, + .destructor = afs_flat_call_destructor, }; /* @@ -620,6 +634,7 @@ static const struct afs_call_type afs_YFSVLGetEndpoints = { struct afs_addr_list *afs_yfsvl_get_endpoints(struct afs_vl_cursor *vc, const uuid_t *uuid) { + struct afs_addr_list *alist; struct afs_call *call; struct afs_net *net = vc->cell->net; __be32 *bp; @@ -635,6 +650,8 @@ struct afs_addr_list *afs_yfsvl_get_endpoints(struct afs_vl_cursor *vc, call->key = vc->key; call->ret_alist = NULL; call->max_lifespan = AFS_VL_MAX_LIFESPAN; + call->peer = rxrpc_kernel_get_peer(vc->alist->addrs[vc->addr_index].peer); + call->service_id = vc->server->service_id; /* Marshall the parameters */ bp = call->request; @@ -643,8 +660,18 @@ struct afs_addr_list *afs_yfsvl_get_endpoints(struct afs_vl_cursor *vc, memcpy(bp, uuid, sizeof(*uuid)); /* Type opr_uuid */ trace_afs_make_vl_call(call); - afs_make_call(&vc->ac, call, GFP_KERNEL); - return (struct afs_addr_list *)afs_wait_for_call_to_complete(call, &vc->ac); + afs_make_call(call, GFP_KERNEL); + afs_wait_for_call_to_complete(call); + vc->call_abort_code = call->abort_code; + vc->call_error = call->error; + vc->call_responded = call->responded; + alist = call->ret_alist; + afs_put_call(call); + if (vc->call_error) { + afs_put_addrlist(alist, afs_alist_trace_put_getaddru); + return ERR_PTR(vc->call_error); + } + return alist; } /* @@ -671,7 +698,7 @@ static int afs_deliver_yfsvl_get_cell_name(struct afs_call *call) return ret; namesz = ntohl(call->tmp); - if (namesz > AFS_MAXCELLNAME) + if (namesz > YFS_VL_MAXCELLNAME) return afs_protocol_error(call, afs_eproto_cellname_len); paddedsz = (namesz + 3) & ~3; call->count = namesz; @@ -709,12 +736,6 @@ static int afs_deliver_yfsvl_get_cell_name(struct afs_call *call) return 0; } -static void afs_destroy_yfsvl_get_cell_name(struct afs_call *call) -{ - kfree(call->ret_str); - afs_flat_call_destructor(call); -} - /* * VL.GetCapabilities operation type */ @@ -722,7 +743,7 @@ static const struct afs_call_type afs_YFSVLGetCellName = { .name = "YFSVL.GetCellName", .op = afs_YFSVL_GetCellName, .deliver = afs_deliver_yfsvl_get_cell_name, - .destructor = afs_destroy_yfsvl_get_cell_name, + .destructor = afs_flat_call_destructor, }; /* @@ -737,6 +758,7 @@ char *afs_yfsvl_get_cell_name(struct afs_vl_cursor *vc) struct afs_call *call; struct afs_net *net = vc->cell->net; __be32 *bp; + char *cellname; _enter(""); @@ -747,6 +769,8 @@ char *afs_yfsvl_get_cell_name(struct afs_vl_cursor *vc) call->key = vc->key; call->ret_str = NULL; call->max_lifespan = AFS_VL_MAX_LIFESPAN; + call->peer = rxrpc_kernel_get_peer(vc->alist->addrs[vc->addr_index].peer); + call->service_id = vc->server->service_id; /* marshall the parameters */ bp = call->request; @@ -754,6 +778,16 @@ char *afs_yfsvl_get_cell_name(struct afs_vl_cursor *vc) /* Can't take a ref on server */ trace_afs_make_vl_call(call); - afs_make_call(&vc->ac, call, GFP_KERNEL); - return (char *)afs_wait_for_call_to_complete(call, &vc->ac); + afs_make_call(call, GFP_KERNEL); + afs_wait_for_call_to_complete(call); + vc->call_abort_code = call->abort_code; + vc->call_error = call->error; + vc->call_responded = call->responded; + cellname = call->ret_str; + afs_put_call(call); + if (vc->call_error) { + kfree(cellname); + return ERR_PTR(vc->call_error); + } + return cellname; } diff --git a/fs/afs/volume.c b/fs/afs/volume.c index 29d483c80281..0efff3d25133 100644 --- a/fs/afs/volume.c +++ b/fs/afs/volume.c @@ -10,6 +10,9 @@ #include "internal.h" static unsigned __read_mostly afs_volume_record_life = 60 * 60; +static atomic_t afs_volume_debug_id; + +static void afs_destroy_volume(struct work_struct *work); /* * Insert a volume into a cell. If there's an existing volume record, that is @@ -32,8 +35,13 @@ static struct afs_volume *afs_insert_volume_into_cell(struct afs_cell *cell, } else if (p->vid > volume->vid) { pp = &(*pp)->rb_right; } else { - volume = afs_get_volume(p, afs_volume_trace_get_cell_insert); - goto found; + if (afs_try_get_volume(p, afs_volume_trace_get_cell_insert)) { + volume = p; + goto found; + } + + set_bit(AFS_VOLUME_RM_TREE, &volume->flags); + rb_replace_node_rcu(&p->cell_node, &volume->cell_node, &cell->volumes); } } @@ -52,11 +60,12 @@ static void afs_remove_volume_from_cell(struct afs_volume *volume) struct afs_cell *cell = volume->cell; if (!hlist_unhashed(&volume->proc_link)) { - trace_afs_volume(volume->vid, refcount_read(&cell->ref), + trace_afs_volume(volume->debug_id, volume->vid, refcount_read(&volume->ref), afs_volume_trace_remove); write_seqlock(&cell->volume_lock); hlist_del_rcu(&volume->proc_link); - rb_erase(&volume->cell_node, &cell->volumes); + if (!test_and_set_bit(AFS_VOLUME_RM_TREE, &volume->flags)) + rb_erase(&volume->cell_node, &cell->volumes); write_sequnlock(&cell->volume_lock); } } @@ -66,38 +75,49 @@ static void afs_remove_volume_from_cell(struct afs_volume *volume) */ static struct afs_volume *afs_alloc_volume(struct afs_fs_context *params, struct afs_vldb_entry *vldb, - unsigned long type_mask) + struct afs_server_list **_slist) { struct afs_server_list *slist; struct afs_volume *volume; - int ret = -ENOMEM; + int ret = -ENOMEM, i; volume = kzalloc(sizeof(struct afs_volume), GFP_KERNEL); if (!volume) goto error_0; + volume->debug_id = atomic_inc_return(&afs_volume_debug_id); volume->vid = vldb->vid[params->type]; volume->update_at = ktime_get_real_seconds() + afs_volume_record_life; volume->cell = afs_get_cell(params->cell, afs_cell_trace_get_vol); volume->type = params->type; volume->type_force = params->force; volume->name_len = vldb->name_len; + volume->creation_time = TIME64_MIN; + volume->update_time = TIME64_MIN; refcount_set(&volume->ref, 1); INIT_HLIST_NODE(&volume->proc_link); + INIT_WORK(&volume->destructor, afs_destroy_volume); rwlock_init(&volume->servers_lock); + mutex_init(&volume->volsync_lock); + mutex_init(&volume->cb_check_lock); rwlock_init(&volume->cb_v_break_lock); + INIT_LIST_HEAD(&volume->open_mmaps); + init_rwsem(&volume->open_mmaps_lock); memcpy(volume->name, vldb->name, vldb->name_len + 1); - slist = afs_alloc_server_list(params->cell, params->key, vldb, type_mask); + for (i = 0; i < AFS_MAXTYPES; i++) + volume->vids[i] = vldb->vid[i]; + + slist = afs_alloc_server_list(volume, params->key, vldb); if (IS_ERR(slist)) { ret = PTR_ERR(slist); goto error_1; } - refcount_set(&slist->usage, 1); + *_slist = slist; rcu_assign_pointer(volume->servers, slist); - trace_afs_volume(volume->vid, 1, afs_volume_trace_alloc); + trace_afs_volume(volume->debug_id, volume->vid, 1, afs_volume_trace_alloc); return volume; error_1: @@ -111,18 +131,20 @@ error_0: * Look up or allocate a volume record. */ static struct afs_volume *afs_lookup_volume(struct afs_fs_context *params, - struct afs_vldb_entry *vldb, - unsigned long type_mask) + struct afs_vldb_entry *vldb) { + struct afs_server_list *slist; struct afs_volume *candidate, *volume; - candidate = afs_alloc_volume(params, vldb, type_mask); + candidate = afs_alloc_volume(params, vldb, &slist); if (IS_ERR(candidate)) return candidate; volume = afs_insert_volume_into_cell(params->cell, candidate); - if (volume != candidate) - afs_put_volume(params->net, candidate, afs_volume_trace_put_cell_dup); + if (volume == candidate) + afs_attach_volume_to_servers(volume, slist); + else + afs_put_volume(candidate, afs_volume_trace_put_cell_dup); return volume; } @@ -202,8 +224,7 @@ struct afs_volume *afs_create_volume(struct afs_fs_context *params) goto error; } - type_mask = 1UL << params->type; - volume = afs_lookup_volume(params, vldb, type_mask); + volume = afs_lookup_volume(params, vldb); error: kfree(vldb); @@ -213,18 +234,22 @@ error: /* * Destroy a volume record */ -static void afs_destroy_volume(struct afs_net *net, struct afs_volume *volume) +static void afs_destroy_volume(struct work_struct *work) { + struct afs_volume *volume = container_of(work, struct afs_volume, destructor); + struct afs_server_list *slist = rcu_access_pointer(volume->servers); + _enter("%p", volume); #ifdef CONFIG_AFS_FSCACHE ASSERTCMP(volume->cache, ==, NULL); #endif + afs_detach_volume_from_servers(volume, slist); afs_remove_volume_from_cell(volume); - afs_put_serverlist(net, rcu_access_pointer(volume->servers)); + afs_put_serverlist(volume->cell->net, slist); afs_put_cell(volume->cell, afs_cell_trace_put_vol); - trace_afs_volume(volume->vid, refcount_read(&volume->ref), + trace_afs_volume(volume->debug_id, volume->vid, refcount_read(&volume->ref), afs_volume_trace_free); kfree_rcu(volume, rcu); @@ -232,6 +257,20 @@ static void afs_destroy_volume(struct afs_net *net, struct afs_volume *volume) } /* + * Try to get a reference on a volume record. + */ +bool afs_try_get_volume(struct afs_volume *volume, enum afs_volume_trace reason) +{ + int r; + + if (__refcount_inc_not_zero(&volume->ref, &r)) { + trace_afs_volume(volume->debug_id, volume->vid, r + 1, reason); + return true; + } + return false; +} + +/* * Get a reference on a volume record. */ struct afs_volume *afs_get_volume(struct afs_volume *volume, @@ -241,7 +280,7 @@ struct afs_volume *afs_get_volume(struct afs_volume *volume, int r; __refcount_inc(&volume->ref, &r); - trace_afs_volume(volume->vid, r + 1, reason); + trace_afs_volume(volume->debug_id, volume->vid, r + 1, reason); } return volume; } @@ -250,18 +289,18 @@ struct afs_volume *afs_get_volume(struct afs_volume *volume, /* * Drop a reference on a volume record. */ -void afs_put_volume(struct afs_net *net, struct afs_volume *volume, - enum afs_volume_trace reason) +void afs_put_volume(struct afs_volume *volume, enum afs_volume_trace reason) { if (volume) { + unsigned int debug_id = volume->debug_id; afs_volid_t vid = volume->vid; bool zero; int r; zero = __refcount_dec_and_test(&volume->ref, &r); - trace_afs_volume(vid, r - 1, reason); + trace_afs_volume(debug_id, vid, r - 1, reason); if (zero) - afs_destroy_volume(net, volume); + schedule_work(&volume->destructor); } } @@ -317,7 +356,7 @@ static int afs_update_volume_status(struct afs_volume *volume, struct key *key) { struct afs_server_list *new, *old, *discard; struct afs_vldb_entry *vldb; - char idbuf[16]; + char idbuf[24]; int ret, idsz; _enter(""); @@ -325,7 +364,7 @@ static int afs_update_volume_status(struct afs_volume *volume, struct key *key) /* We look up an ID by passing it as a decimal string in the * operation's name parameter. */ - idsz = sprintf(idbuf, "%llu", volume->vid); + idsz = snprintf(idbuf, sizeof(idbuf), "%llu", volume->vid); vldb = afs_vl_lookup_vldb(volume->cell, key, idbuf, idsz); if (IS_ERR(vldb)) { @@ -342,8 +381,7 @@ static int afs_update_volume_status(struct afs_volume *volume, struct key *key) } /* See if the volume's server list got updated. */ - new = afs_alloc_server_list(volume->cell, key, - vldb, (1 << volume->type)); + new = afs_alloc_server_list(volume, key, vldb); if (IS_ERR(new)) { ret = PTR_ERR(new); goto error_vldb; @@ -362,11 +400,17 @@ static int afs_update_volume_status(struct afs_volume *volume, struct key *key) discard = old; } - volume->update_at = ktime_get_real_seconds() + afs_volume_record_life; + /* Check more often if replication is ongoing. */ + if (new->ro_replicating) + volume->update_at = ktime_get_real_seconds() + 10 * 60; + else + volume->update_at = ktime_get_real_seconds() + afs_volume_record_life; write_unlock(&volume->servers_lock); - ret = 0; + if (discard == old) + afs_reattach_volume_to_servers(volume, new, old); afs_put_serverlist(volume->cell->net, discard); + ret = 0; error_vldb: kfree(vldb); error: diff --git a/fs/afs/write.c b/fs/afs/write.c index 19df10d63323..93ad86ff3345 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -12,352 +12,56 @@ #include <linux/writeback.h> #include <linux/pagevec.h> #include <linux/netfs.h> +#include <trace/events/netfs.h> #include "internal.h" -static int afs_writepages_region(struct address_space *mapping, - struct writeback_control *wbc, - loff_t start, loff_t end, loff_t *_next, - bool max_one_loop); - -static void afs_write_to_cache(struct afs_vnode *vnode, loff_t start, size_t len, - loff_t i_size, bool caching); - -#ifdef CONFIG_AFS_FSCACHE -/* - * Mark a page as having been made dirty and thus needing writeback. We also - * need to pin the cache object to write back to. - */ -bool afs_dirty_folio(struct address_space *mapping, struct folio *folio) -{ - return fscache_dirty_folio(mapping, folio, - afs_vnode_cache(AFS_FS_I(mapping->host))); -} -static void afs_folio_start_fscache(bool caching, struct folio *folio) -{ - if (caching) - folio_start_fscache(folio); -} -#else -static void afs_folio_start_fscache(bool caching, struct folio *folio) -{ -} -#endif - -/* - * Flush out a conflicting write. This may extend the write to the surrounding - * pages if also dirty and contiguous to the conflicting region.. - */ -static int afs_flush_conflicting_write(struct address_space *mapping, - struct folio *folio) -{ - struct writeback_control wbc = { - .sync_mode = WB_SYNC_ALL, - .nr_to_write = LONG_MAX, - .range_start = folio_pos(folio), - .range_end = LLONG_MAX, - }; - loff_t next; - - return afs_writepages_region(mapping, &wbc, folio_pos(folio), LLONG_MAX, - &next, true); -} - -/* - * prepare to perform part of a write to a page - */ -int afs_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, - struct page **_page, void **fsdata) -{ - struct afs_vnode *vnode = AFS_FS_I(file_inode(file)); - struct folio *folio; - unsigned long priv; - unsigned f, from; - unsigned t, to; - pgoff_t index; - int ret; - - _enter("{%llx:%llu},%llx,%x", - vnode->fid.vid, vnode->fid.vnode, pos, len); - - /* Prefetch area to be written into the cache if we're caching this - * file. We need to do this before we get a lock on the page in case - * there's more than one writer competing for the same cache block. - */ - ret = netfs_write_begin(&vnode->netfs, file, mapping, pos, len, &folio, fsdata); - if (ret < 0) - return ret; - - index = folio_index(folio); - from = pos - index * PAGE_SIZE; - to = from + len; - -try_again: - /* See if this page is already partially written in a way that we can - * merge the new write with. - */ - if (folio_test_private(folio)) { - priv = (unsigned long)folio_get_private(folio); - f = afs_folio_dirty_from(folio, priv); - t = afs_folio_dirty_to(folio, priv); - ASSERTCMP(f, <=, t); - - if (folio_test_writeback(folio)) { - trace_afs_folio_dirty(vnode, tracepoint_string("alrdy"), folio); - folio_unlock(folio); - goto wait_for_writeback; - } - /* If the file is being filled locally, allow inter-write - * spaces to be merged into writes. If it's not, only write - * back what the user gives us. - */ - if (!test_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags) && - (to < f || from > t)) - goto flush_conflicting_write; - } - - *_page = folio_file_page(folio, pos / PAGE_SIZE); - _leave(" = 0"); - return 0; - - /* The previous write and this write aren't adjacent or overlapping, so - * flush the page out. - */ -flush_conflicting_write: - trace_afs_folio_dirty(vnode, tracepoint_string("confl"), folio); - folio_unlock(folio); - - ret = afs_flush_conflicting_write(mapping, folio); - if (ret < 0) - goto error; - -wait_for_writeback: - ret = folio_wait_writeback_killable(folio); - if (ret < 0) - goto error; - - ret = folio_lock_killable(folio); - if (ret < 0) - goto error; - goto try_again; - -error: - folio_put(folio); - _leave(" = %d", ret); - return ret; -} - -/* - * finalise part of a write to a page - */ -int afs_write_end(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct page *subpage, void *fsdata) -{ - struct folio *folio = page_folio(subpage); - struct afs_vnode *vnode = AFS_FS_I(file_inode(file)); - unsigned long priv; - unsigned int f, from = offset_in_folio(folio, pos); - unsigned int t, to = from + copied; - loff_t i_size, write_end_pos; - - _enter("{%llx:%llu},{%lx}", - vnode->fid.vid, vnode->fid.vnode, folio_index(folio)); - - if (!folio_test_uptodate(folio)) { - if (copied < len) { - copied = 0; - goto out; - } - - folio_mark_uptodate(folio); - } - - if (copied == 0) - goto out; - - write_end_pos = pos + copied; - - i_size = i_size_read(&vnode->netfs.inode); - if (write_end_pos > i_size) { - write_seqlock(&vnode->cb_lock); - i_size = i_size_read(&vnode->netfs.inode); - if (write_end_pos > i_size) - afs_set_i_size(vnode, write_end_pos); - write_sequnlock(&vnode->cb_lock); - fscache_update_cookie(afs_vnode_cache(vnode), NULL, &write_end_pos); - } - - if (folio_test_private(folio)) { - priv = (unsigned long)folio_get_private(folio); - f = afs_folio_dirty_from(folio, priv); - t = afs_folio_dirty_to(folio, priv); - if (from < f) - f = from; - if (to > t) - t = to; - priv = afs_folio_dirty(folio, f, t); - folio_change_private(folio, (void *)priv); - trace_afs_folio_dirty(vnode, tracepoint_string("dirty+"), folio); - } else { - priv = afs_folio_dirty(folio, from, to); - folio_attach_private(folio, (void *)priv); - trace_afs_folio_dirty(vnode, tracepoint_string("dirty"), folio); - } - - if (folio_mark_dirty(folio)) - _debug("dirtied %lx", folio_index(folio)); - -out: - folio_unlock(folio); - folio_put(folio); - return copied; -} - -/* - * kill all the pages in the given range - */ -static void afs_kill_pages(struct address_space *mapping, - loff_t start, loff_t len) -{ - struct afs_vnode *vnode = AFS_FS_I(mapping->host); - struct folio *folio; - pgoff_t index = start / PAGE_SIZE; - pgoff_t last = (start + len - 1) / PAGE_SIZE, next; - - _enter("{%llx:%llu},%llx @%llx", - vnode->fid.vid, vnode->fid.vnode, len, start); - - do { - _debug("kill %lx (to %lx)", index, last); - - folio = filemap_get_folio(mapping, index); - if (!folio) { - next = index + 1; - continue; - } - - next = folio_next_index(folio); - - folio_clear_uptodate(folio); - folio_end_writeback(folio); - folio_lock(folio); - generic_error_remove_page(mapping, &folio->page); - folio_unlock(folio); - folio_put(folio); - - } while (index = next, index <= last); - - _leave(""); -} - -/* - * Redirty all the pages in a given range. - */ -static void afs_redirty_pages(struct writeback_control *wbc, - struct address_space *mapping, - loff_t start, loff_t len) -{ - struct afs_vnode *vnode = AFS_FS_I(mapping->host); - struct folio *folio; - pgoff_t index = start / PAGE_SIZE; - pgoff_t last = (start + len - 1) / PAGE_SIZE, next; - - _enter("{%llx:%llu},%llx @%llx", - vnode->fid.vid, vnode->fid.vnode, len, start); - - do { - _debug("redirty %llx @%llx", len, start); - - folio = filemap_get_folio(mapping, index); - if (!folio) { - next = index + 1; - continue; - } - - next = index + folio_nr_pages(folio); - folio_redirty_for_writepage(wbc, folio); - folio_end_writeback(folio); - folio_put(folio); - } while (index = next, index <= last); - - _leave(""); -} - /* * completion of write to server */ static void afs_pages_written_back(struct afs_vnode *vnode, loff_t start, unsigned int len) { - struct address_space *mapping = vnode->netfs.inode.i_mapping; - struct folio *folio; - pgoff_t end; - - XA_STATE(xas, &mapping->i_pages, start / PAGE_SIZE); - _enter("{%llx:%llu},{%x @%llx}", vnode->fid.vid, vnode->fid.vnode, len, start); - rcu_read_lock(); - - end = (start + len - 1) / PAGE_SIZE; - xas_for_each(&xas, folio, end) { - if (!folio_test_writeback(folio)) { - kdebug("bad %x @%llx page %lx %lx", - len, start, folio_index(folio), end); - ASSERT(folio_test_writeback(folio)); - } - - trace_afs_folio_dirty(vnode, tracepoint_string("clear"), folio); - folio_detach_private(folio); - folio_end_writeback(folio); - } - - rcu_read_unlock(); - afs_prune_wb_keys(vnode); _leave(""); } /* * Find a key to use for the writeback. We cached the keys used to author the - * writes on the vnode. *_wbk will contain the last writeback key used or NULL - * and we need to start from there if it's set. + * writes on the vnode. wreq->netfs_priv2 will contain the last writeback key + * record used or NULL and we need to start from there if it's set. + * wreq->netfs_priv will be set to the key itself or NULL. */ -static int afs_get_writeback_key(struct afs_vnode *vnode, - struct afs_wb_key **_wbk) +static void afs_get_writeback_key(struct netfs_io_request *wreq) { - struct afs_wb_key *wbk = NULL; - struct list_head *p; - int ret = -ENOKEY, ret2; + struct afs_wb_key *wbk, *old = wreq->netfs_priv2; + struct afs_vnode *vnode = AFS_FS_I(wreq->inode); + + key_put(wreq->netfs_priv); + wreq->netfs_priv = NULL; + wreq->netfs_priv2 = NULL; spin_lock(&vnode->wb_lock); - if (*_wbk) - p = (*_wbk)->vnode_link.next; + if (old) + wbk = list_next_entry(old, vnode_link); else - p = vnode->wb_keys.next; + wbk = list_first_entry(&vnode->wb_keys, struct afs_wb_key, vnode_link); - while (p != &vnode->wb_keys) { - wbk = list_entry(p, struct afs_wb_key, vnode_link); + list_for_each_entry_from(wbk, &vnode->wb_keys, vnode_link) { _debug("wbk %u", key_serial(wbk->key)); - ret2 = key_validate(wbk->key); - if (ret2 == 0) { + if (key_validate(wbk->key) == 0) { refcount_inc(&wbk->usage); + wreq->netfs_priv = key_get(wbk->key); + wreq->netfs_priv2 = wbk; _debug("USE WB KEY %u", key_serial(wbk->key)); break; } - - wbk = NULL; - if (ret == -ENOKEY) - ret = ret2; - p = p->next; } spin_unlock(&vnode->wb_lock); - if (*_wbk) - afs_put_wb_key(*_wbk); - *_wbk = wbk; - return 0; + + afs_put_wb_key(old); } static void afs_store_data_success(struct afs_operation *op) @@ -366,9 +70,8 @@ static void afs_store_data_success(struct afs_operation *op) op->ctime = op->file[0].scb.status.mtime_client; afs_vnode_commit_status(op, &op->file[0]); - if (op->error == 0) { - if (!op->store.laundering) - afs_pages_written_back(vnode, op->store.pos, op->store.size); + if (!afs_op_error(op)) { + afs_pages_written_back(vnode, op->store.pos, op->store.size); afs_stat_v(vnode, n_stores); atomic_long_add(op->store.size, &afs_v2net(vnode)->n_store_bytes); } @@ -381,428 +84,153 @@ static const struct afs_operation_ops afs_store_data_operation = { }; /* - * write to a file + * Prepare a subrequest to write to the server. This sets the max_len + * parameter. */ -static int afs_store_data(struct afs_vnode *vnode, struct iov_iter *iter, loff_t pos, - bool laundering) +void afs_prepare_write(struct netfs_io_subrequest *subreq) { + struct netfs_io_stream *stream = &subreq->rreq->io_streams[subreq->stream_nr]; + + //if (test_bit(NETFS_SREQ_RETRYING, &subreq->flags)) + // subreq->max_len = 512 * 1024; + //else + stream->sreq_max_len = 256 * 1024 * 1024; +} + +/* + * Issue a subrequest to write to the server. + */ +static void afs_issue_write_worker(struct work_struct *work) +{ + struct netfs_io_subrequest *subreq = container_of(work, struct netfs_io_subrequest, work); + struct netfs_io_request *wreq = subreq->rreq; struct afs_operation *op; - struct afs_wb_key *wbk = NULL; - loff_t size = iov_iter_count(iter); + struct afs_vnode *vnode = AFS_FS_I(wreq->inode); + unsigned long long pos = subreq->start + subreq->transferred; + size_t len = subreq->len - subreq->transferred; int ret = -ENOKEY; - _enter("%s{%llx:%llu.%u},%llx,%llx", + _enter("R=%x[%x],%s{%llx:%llu.%u},%llx,%zx", + wreq->debug_id, subreq->debug_index, vnode->volume->name, vnode->fid.vid, vnode->fid.vnode, vnode->fid.unique, - size, pos); + pos, len); - ret = afs_get_writeback_key(vnode, &wbk); - if (ret) { - _leave(" = %d [no keys]", ret); - return ret; - } +#if 0 // Error injection + if (subreq->debug_index == 3) + return netfs_write_subrequest_terminated(subreq, -ENOANO); - op = afs_alloc_operation(wbk->key, vnode->volume); - if (IS_ERR(op)) { - afs_put_wb_key(wbk); - return -ENOMEM; + if (!subreq->retry_count) { + set_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags); + return netfs_write_subrequest_terminated(subreq, -EAGAIN); } +#endif + + op = afs_alloc_operation(wreq->netfs_priv, vnode->volume); + if (IS_ERR(op)) + return netfs_write_subrequest_terminated(subreq, -EAGAIN); afs_op_set_vnode(op, 0, vnode); - op->file[0].dv_delta = 1; + op->file[0].dv_delta = 1; op->file[0].modification = true; - op->store.write_iter = iter; - op->store.pos = pos; - op->store.size = size; - op->store.i_size = max(pos + size, vnode->netfs.remote_i_size); - op->store.laundering = laundering; - op->mtime = vnode->netfs.inode.i_mtime; - op->flags |= AFS_OPERATION_UNINTR; - op->ops = &afs_store_data_operation; + op->store.pos = pos; + op->store.size = len; + op->flags |= AFS_OPERATION_UNINTR; + op->ops = &afs_store_data_operation; -try_next_key: afs_begin_vnode_operation(op); - afs_wait_for_operation(op); - switch (op->error) { + op->store.write_iter = &subreq->io_iter; + op->store.i_size = umax(pos + len, vnode->netfs.remote_i_size); + op->mtime = inode_get_mtime(&vnode->netfs.inode); + + afs_wait_for_operation(op); + ret = afs_put_operation(op); + switch (ret) { + case 0: + __set_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags); + break; case -EACCES: case -EPERM: case -ENOKEY: case -EKEYEXPIRED: case -EKEYREJECTED: case -EKEYREVOKED: - _debug("next"); - - ret = afs_get_writeback_key(vnode, &wbk); - if (ret == 0) { - key_put(op->key); - op->key = key_get(wbk->key); - goto try_next_key; - } + /* If there are more keys we can try, use the retry algorithm + * to rotate the keys. + */ + if (wreq->netfs_priv2) + set_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags); break; } - afs_put_wb_key(wbk); - _leave(" = %d", op->error); - return afs_put_operation(op); + netfs_write_subrequest_terminated(subreq, ret < 0 ? ret : subreq->len); +} + +void afs_issue_write(struct netfs_io_subrequest *subreq) +{ + subreq->work.func = afs_issue_write_worker; + if (!queue_work(system_dfl_wq, &subreq->work)) + WARN_ON_ONCE(1); } /* - * Extend the region to be written back to include subsequent contiguously - * dirty pages if possible, but don't sleep while doing so. - * - * If this page holds new content, then we can include filler zeros in the - * writeback. + * Writeback calls this when it finds a folio that needs uploading. This isn't + * called if writeback only has copy-to-cache to deal with. */ -static void afs_extend_writeback(struct address_space *mapping, - struct afs_vnode *vnode, - long *_count, - loff_t start, - loff_t max_len, - bool new_content, - bool caching, - unsigned int *_len) +void afs_begin_writeback(struct netfs_io_request *wreq) { - struct pagevec pvec; - struct folio *folio; - unsigned long priv; - unsigned int psize, filler = 0; - unsigned int f, t; - loff_t len = *_len; - pgoff_t index = (start + len) / PAGE_SIZE; - bool stop = true; - unsigned int i; - - XA_STATE(xas, &mapping->i_pages, index); - pagevec_init(&pvec); - - do { - /* Firstly, we gather up a batch of contiguous dirty pages - * under the RCU read lock - but we can't clear the dirty flags - * there if any of those pages are mapped. - */ - rcu_read_lock(); - - xas_for_each(&xas, folio, ULONG_MAX) { - stop = true; - if (xas_retry(&xas, folio)) - continue; - if (xa_is_value(folio)) - break; - if (folio_index(folio) != index) - break; - - if (!folio_try_get_rcu(folio)) { - xas_reset(&xas); - continue; - } - - /* Has the page moved or been split? */ - if (unlikely(folio != xas_reload(&xas))) { - folio_put(folio); - break; - } - - if (!folio_trylock(folio)) { - folio_put(folio); - break; - } - if (!folio_test_dirty(folio) || - folio_test_writeback(folio) || - folio_test_fscache(folio)) { - folio_unlock(folio); - folio_put(folio); - break; - } - - psize = folio_size(folio); - priv = (unsigned long)folio_get_private(folio); - f = afs_folio_dirty_from(folio, priv); - t = afs_folio_dirty_to(folio, priv); - if (f != 0 && !new_content) { - folio_unlock(folio); - folio_put(folio); - break; - } - - len += filler + t; - filler = psize - t; - if (len >= max_len || *_count <= 0) - stop = true; - else if (t == psize || new_content) - stop = false; - - index += folio_nr_pages(folio); - if (!pagevec_add(&pvec, &folio->page)) - break; - if (stop) - break; - } - - if (!stop) - xas_pause(&xas); - rcu_read_unlock(); - - /* Now, if we obtained any pages, we can shift them to being - * writable and mark them for caching. - */ - if (!pagevec_count(&pvec)) - break; - - for (i = 0; i < pagevec_count(&pvec); i++) { - folio = page_folio(pvec.pages[i]); - trace_afs_folio_dirty(vnode, tracepoint_string("store+"), folio); - - if (!folio_clear_dirty_for_io(folio)) - BUG(); - if (folio_start_writeback(folio)) - BUG(); - afs_folio_start_fscache(caching, folio); - - *_count -= folio_nr_pages(folio); - folio_unlock(folio); - } - - pagevec_release(&pvec); - cond_resched(); - } while (!stop); - - *_len = len; + if (S_ISREG(wreq->inode->i_mode)) + afs_get_writeback_key(wreq); } /* - * Synchronously write back the locked page and any subsequent non-locked dirty - * pages. + * Prepare to retry the writes in request. Use this to try rotating the + * available writeback keys. */ -static ssize_t afs_write_back_from_locked_folio(struct address_space *mapping, - struct writeback_control *wbc, - struct folio *folio, - loff_t start, loff_t end) +void afs_retry_request(struct netfs_io_request *wreq, struct netfs_io_stream *stream) { - struct afs_vnode *vnode = AFS_FS_I(mapping->host); - struct iov_iter iter; - unsigned long priv; - unsigned int offset, to, len, max_len; - loff_t i_size = i_size_read(&vnode->netfs.inode); - bool new_content = test_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags); - bool caching = fscache_cookie_enabled(afs_vnode_cache(vnode)); - long count = wbc->nr_to_write; - int ret; - - _enter(",%lx,%llx-%llx", folio_index(folio), start, end); - - if (folio_start_writeback(folio)) - BUG(); - afs_folio_start_fscache(caching, folio); - - count -= folio_nr_pages(folio); - - /* Find all consecutive lockable dirty pages that have contiguous - * written regions, stopping when we find a page that is not - * immediately lockable, is not dirty or is missing, or we reach the - * end of the range. - */ - priv = (unsigned long)folio_get_private(folio); - offset = afs_folio_dirty_from(folio, priv); - to = afs_folio_dirty_to(folio, priv); - trace_afs_folio_dirty(vnode, tracepoint_string("store"), folio); - - len = to - offset; - start += offset; - if (start < i_size) { - /* Trim the write to the EOF; the extra data is ignored. Also - * put an upper limit on the size of a single storedata op. - */ - max_len = 65536 * 4096; - max_len = min_t(unsigned long long, max_len, end - start + 1); - max_len = min_t(unsigned long long, max_len, i_size - start); - - if (len < max_len && - (to == folio_size(folio) || new_content)) - afs_extend_writeback(mapping, vnode, &count, - start, max_len, new_content, - caching, &len); - len = min_t(loff_t, len, max_len); - } - - /* We now have a contiguous set of dirty pages, each with writeback - * set; the first page is still locked at this point, but all the rest - * have been unlocked. - */ - folio_unlock(folio); - - if (start < i_size) { - _debug("write back %x @%llx [%llx]", len, start, i_size); - - /* Speculatively write to the cache. We have to fix this up - * later if the store fails. - */ - afs_write_to_cache(vnode, start, len, i_size, caching); - - iov_iter_xarray(&iter, ITER_SOURCE, &mapping->i_pages, start, len); - ret = afs_store_data(vnode, &iter, start, false); - } else { - _debug("write discard %x @%llx [%llx]", len, start, i_size); - - /* The dirty region was entirely beyond the EOF. */ - fscache_clear_page_bits(mapping, start, len, caching); - afs_pages_written_back(vnode, start, len); - ret = 0; - } - - switch (ret) { - case 0: - wbc->nr_to_write = count; - ret = len; + struct netfs_io_subrequest *subreq = + list_first_entry(&stream->subrequests, + struct netfs_io_subrequest, rreq_link); + + switch (wreq->origin) { + case NETFS_READAHEAD: + case NETFS_READPAGE: + case NETFS_READ_GAPS: + case NETFS_READ_SINGLE: + case NETFS_READ_FOR_WRITE: + case NETFS_UNBUFFERED_READ: + case NETFS_DIO_READ: + return; + default: break; + } - default: - pr_notice("kAFS: Unexpected error from FS.StoreData %d\n", ret); - fallthrough; + switch (subreq->error) { case -EACCES: case -EPERM: case -ENOKEY: case -EKEYEXPIRED: case -EKEYREJECTED: case -EKEYREVOKED: - case -ENETRESET: - afs_redirty_pages(wbc, mapping, start, len); - mapping_set_error(mapping, ret); - break; - - case -EDQUOT: - case -ENOSPC: - afs_redirty_pages(wbc, mapping, start, len); - mapping_set_error(mapping, -ENOSPC); - break; - - case -EROFS: - case -EIO: - case -EREMOTEIO: - case -EFBIG: - case -ENOENT: - case -ENOMEDIUM: - case -ENXIO: - trace_afs_file_error(vnode, ret, afs_file_error_writeback_fail); - afs_kill_pages(mapping, start, len); - mapping_set_error(mapping, ret); + afs_get_writeback_key(wreq); + if (!wreq->netfs_priv) + stream->failed = true; break; } - - _leave(" = %d", ret); - return ret; -} - -/* - * write a region of pages back to the server - */ -static int afs_writepages_region(struct address_space *mapping, - struct writeback_control *wbc, - loff_t start, loff_t end, loff_t *_next, - bool max_one_loop) -{ - struct folio *folio; - struct page *head_page; - ssize_t ret; - int n, skips = 0; - - _enter("%llx,%llx,", start, end); - - do { - pgoff_t index = start / PAGE_SIZE; - - n = find_get_pages_range_tag(mapping, &index, end / PAGE_SIZE, - PAGECACHE_TAG_DIRTY, 1, &head_page); - if (!n) - break; - - folio = page_folio(head_page); - start = folio_pos(folio); /* May regress with THPs */ - - _debug("wback %lx", folio_index(folio)); - - /* At this point we hold neither the i_pages lock nor the - * page lock: the page may be truncated or invalidated - * (changing page->mapping to NULL), or even swizzled - * back from swapper_space to tmpfs file mapping - */ - if (wbc->sync_mode != WB_SYNC_NONE) { - ret = folio_lock_killable(folio); - if (ret < 0) { - folio_put(folio); - return ret; - } - } else { - if (!folio_trylock(folio)) { - folio_put(folio); - return 0; - } - } - - if (folio_mapping(folio) != mapping || - !folio_test_dirty(folio)) { - start += folio_size(folio); - folio_unlock(folio); - folio_put(folio); - continue; - } - - if (folio_test_writeback(folio) || - folio_test_fscache(folio)) { - folio_unlock(folio); - if (wbc->sync_mode != WB_SYNC_NONE) { - folio_wait_writeback(folio); -#ifdef CONFIG_AFS_FSCACHE - folio_wait_fscache(folio); -#endif - } else { - start += folio_size(folio); - } - folio_put(folio); - if (wbc->sync_mode == WB_SYNC_NONE) { - if (skips >= 5 || need_resched()) - break; - skips++; - } - continue; - } - - if (!folio_clear_dirty_for_io(folio)) - BUG(); - ret = afs_write_back_from_locked_folio(mapping, wbc, folio, start, end); - folio_put(folio); - if (ret < 0) { - _leave(" = %zd", ret); - return ret; - } - - start += ret; - - if (max_one_loop) - break; - - cond_resched(); - } while (wbc->nr_to_write > 0); - - *_next = start; - _leave(" = 0 [%llx]", *_next); - return 0; } /* * write some of the pending data back to the server */ -int afs_writepages(struct address_space *mapping, - struct writeback_control *wbc) +int afs_writepages(struct address_space *mapping, struct writeback_control *wbc) { struct afs_vnode *vnode = AFS_FS_I(mapping->host); - loff_t start, next; int ret; - _enter(""); - /* We have to be careful as we can end up racing with setattr() * truncating the pagecache since the caller doesn't take a lock here * to prevent it. @@ -812,69 +240,12 @@ int afs_writepages(struct address_space *mapping, else if (!down_read_trylock(&vnode->validate_lock)) return 0; - if (wbc->range_cyclic) { - start = mapping->writeback_index * PAGE_SIZE; - ret = afs_writepages_region(mapping, wbc, start, LLONG_MAX, - &next, false); - if (ret == 0) { - mapping->writeback_index = next / PAGE_SIZE; - if (start > 0 && wbc->nr_to_write > 0) { - ret = afs_writepages_region(mapping, wbc, 0, - start, &next, false); - if (ret == 0) - mapping->writeback_index = - next / PAGE_SIZE; - } - } - } else if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) { - ret = afs_writepages_region(mapping, wbc, 0, LLONG_MAX, - &next, false); - if (wbc->nr_to_write > 0 && ret == 0) - mapping->writeback_index = next / PAGE_SIZE; - } else { - ret = afs_writepages_region(mapping, wbc, - wbc->range_start, wbc->range_end, - &next, false); - } - + ret = netfs_writepages(mapping, wbc); up_read(&vnode->validate_lock); - _leave(" = %d", ret); return ret; } /* - * write to an AFS file - */ -ssize_t afs_file_write(struct kiocb *iocb, struct iov_iter *from) -{ - struct afs_vnode *vnode = AFS_FS_I(file_inode(iocb->ki_filp)); - struct afs_file *af = iocb->ki_filp->private_data; - ssize_t result; - size_t count = iov_iter_count(from); - - _enter("{%llx:%llu},{%zu},", - vnode->fid.vid, vnode->fid.vnode, count); - - if (IS_SWAPFILE(&vnode->netfs.inode)) { - printk(KERN_INFO - "AFS: Attempt to write to active swap file!\n"); - return -EBUSY; - } - - if (!count) - return 0; - - result = afs_validate(vnode, af->key); - if (result < 0) - return result; - - result = generic_file_write_iter(iocb, from); - - _leave(" = %zd", result); - return result; -} - -/* * flush any dirty pages for this process, and check for write errors. * - the return status from this call provides a reliable indication of * whether any write errors occurred for this process. @@ -902,59 +273,11 @@ int afs_fsync(struct file *file, loff_t start, loff_t end, int datasync) */ vm_fault_t afs_page_mkwrite(struct vm_fault *vmf) { - struct folio *folio = page_folio(vmf->page); struct file *file = vmf->vma->vm_file; - struct inode *inode = file_inode(file); - struct afs_vnode *vnode = AFS_FS_I(inode); - struct afs_file *af = file->private_data; - unsigned long priv; - vm_fault_t ret = VM_FAULT_RETRY; - - _enter("{{%llx:%llu}},{%lx}", vnode->fid.vid, vnode->fid.vnode, folio_index(folio)); - - afs_validate(vnode, af->key); - - sb_start_pagefault(inode->i_sb); - - /* Wait for the page to be written to the cache before we allow it to - * be modified. We then assume the entire page will need writing back. - */ -#ifdef CONFIG_AFS_FSCACHE - if (folio_test_fscache(folio) && - folio_wait_fscache_killable(folio) < 0) - goto out; -#endif - - if (folio_wait_writeback_killable(folio)) - goto out; - - if (folio_lock_killable(folio) < 0) - goto out; - /* We mustn't change folio->private until writeback is complete as that - * details the portion of the page we need to write back and we might - * need to redirty the page if there's a problem. - */ - if (folio_wait_writeback_killable(folio) < 0) { - folio_unlock(folio); - goto out; - } - - priv = afs_folio_dirty(folio, 0, folio_size(folio)); - priv = afs_folio_dirty_mmapped(priv); - if (folio_test_private(folio)) { - folio_change_private(folio, (void *)priv); - trace_afs_folio_dirty(vnode, tracepoint_string("mkwrite+"), folio); - } else { - folio_attach_private(folio, (void *)priv); - trace_afs_folio_dirty(vnode, tracepoint_string("mkwrite"), folio); - } - file_update_time(file); - - ret = VM_FAULT_LOCKED; -out: - sb_end_pagefault(inode->i_sb); - return ret; + if (afs_validate(AFS_FS_I(file_inode(file)), afs_file_key(file)) < 0) + return VM_FAULT_SIGBUS; + return netfs_page_mkwrite(vmf, NULL); } /* @@ -984,66 +307,3 @@ void afs_prune_wb_keys(struct afs_vnode *vnode) afs_put_wb_key(wbk); } } - -/* - * Clean up a page during invalidation. - */ -int afs_launder_folio(struct folio *folio) -{ - struct afs_vnode *vnode = AFS_FS_I(folio_inode(folio)); - struct iov_iter iter; - struct bio_vec bv[1]; - unsigned long priv; - unsigned int f, t; - int ret = 0; - - _enter("{%lx}", folio->index); - - priv = (unsigned long)folio_get_private(folio); - if (folio_clear_dirty_for_io(folio)) { - f = 0; - t = folio_size(folio); - if (folio_test_private(folio)) { - f = afs_folio_dirty_from(folio, priv); - t = afs_folio_dirty_to(folio, priv); - } - - bv[0].bv_page = &folio->page; - bv[0].bv_offset = f; - bv[0].bv_len = t - f; - iov_iter_bvec(&iter, ITER_SOURCE, bv, 1, bv[0].bv_len); - - trace_afs_folio_dirty(vnode, tracepoint_string("launder"), folio); - ret = afs_store_data(vnode, &iter, folio_pos(folio) + f, true); - } - - trace_afs_folio_dirty(vnode, tracepoint_string("laundered"), folio); - folio_detach_private(folio); - folio_wait_fscache(folio); - return ret; -} - -/* - * Deal with the completion of writing the data to the cache. - */ -static void afs_write_to_cache_done(void *priv, ssize_t transferred_or_error, - bool was_async) -{ - struct afs_vnode *vnode = priv; - - if (IS_ERR_VALUE(transferred_or_error) && - transferred_or_error != -ENOBUFS) - afs_invalidate_cache(vnode, 0); -} - -/* - * Save the write to the cache also. - */ -static void afs_write_to_cache(struct afs_vnode *vnode, - loff_t start, size_t len, loff_t i_size, - bool caching) -{ - fscache_write_to_cache(afs_vnode_cache(vnode), - vnode->netfs.inode.i_mapping, start, len, i_size, - afs_write_to_cache_done, vnode, caching); -} diff --git a/fs/afs/xattr.c b/fs/afs/xattr.c index 7751b0b3f81d..e19f396aa370 100644 --- a/fs/afs/xattr.c +++ b/fs/afs/xattr.c @@ -75,7 +75,7 @@ static bool afs_make_acl(struct afs_operation *op, { struct afs_acl *acl; - acl = kmalloc(sizeof(*acl) + size, GFP_KERNEL); + acl = kmalloc(struct_size(acl, data, size), GFP_KERNEL); if (!acl) { afs_op_nomem(op); return false; @@ -97,7 +97,7 @@ static const struct afs_operation_ops afs_store_acl_operation = { * Set a file's AFS3 ACL. */ static int afs_xattr_set_acl(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *dentry, struct inode *inode, const char *name, const void *buffer, size_t size, int flags) @@ -228,7 +228,7 @@ static const struct afs_operation_ops yfs_store_opaque_acl2_operation = { * Set a file's YFS ACL. */ static int afs_xattr_set_yfs(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *dentry, struct inode *inode, const char *name, const void *buffer, size_t size, int flags) @@ -353,7 +353,7 @@ static const struct xattr_handler afs_xattr_afs_volume_handler = { .get = afs_xattr_get_volume, }; -const struct xattr_handler *afs_xattr_handlers[] = { +const struct xattr_handler * const afs_xattr_handlers[] = { &afs_xattr_afs_acl_handler, &afs_xattr_afs_cell_handler, &afs_xattr_afs_fid_handler, diff --git a/fs/afs/xdr_fs.h b/fs/afs/xdr_fs.h index 8ca868164507..cc5f143d21a3 100644 --- a/fs/afs/xdr_fs.h +++ b/fs/afs/xdr_fs.h @@ -88,7 +88,7 @@ union afs_xdr_dir_block { struct { struct afs_xdr_dir_hdr hdr; - u8 alloc_ctrs[AFS_DIR_MAX_BLOCKS]; + u8 alloc_ctrs[AFS_DIR_BLOCKS_WITH_CTR]; __be16 hashtable[AFS_DIR_HASHTBL_SIZE]; } meta; diff --git a/fs/afs/yfsclient.c b/fs/afs/yfsclient.c index 11571cca86c1..febf13a49f0b 100644 --- a/fs/afs/yfsclient.c +++ b/fs/afs/yfsclient.c @@ -245,12 +245,15 @@ static void xdr_decode_YFSVolSync(const __be32 **_bp, struct afs_volsync *volsync) { struct yfs_xdr_YFSVolSync *x = (void *)*_bp; - u64 creation; + u64 creation, update; if (volsync) { creation = xdr_to_u64(x->vol_creation_date); do_div(creation, 10 * 1000 * 1000); volsync->creation = creation; + update = xdr_to_u64(x->vol_update_date); + do_div(update, 10 * 1000 * 1000); + volsync->update = update; } *_bp += xdr_size(x); @@ -349,18 +352,19 @@ static int yfs_deliver_status_and_volsync(struct afs_call *call) static int yfs_deliver_fs_fetch_data64(struct afs_call *call) { struct afs_operation *op = call->op; + struct netfs_io_subrequest *subreq = op->fetch.subreq; struct afs_vnode_param *vp = &op->file[0]; - struct afs_read *req = op->fetch.req; const __be32 *bp; + size_t count_before; int ret; _enter("{%u,%zu, %zu/%llu}", call->unmarshall, call->iov_len, iov_iter_count(call->iter), - req->actual_len); + call->remaining); switch (call->unmarshall) { case 0: - req->actual_len = 0; + call->remaining = 0; afs_extract_to_tmp64(call); call->unmarshall++; fallthrough; @@ -375,38 +379,39 @@ static int yfs_deliver_fs_fetch_data64(struct afs_call *call) if (ret < 0) return ret; - req->actual_len = be64_to_cpu(call->tmp64); - _debug("DATA length: %llu", req->actual_len); + call->remaining = be64_to_cpu(call->tmp64); + _debug("DATA length: %llu", call->remaining); - if (req->actual_len == 0) + if (call->remaining == 0) goto no_more_data; - call->iter = req->iter; - call->iov_len = min(req->actual_len, req->len); + call->iter = &subreq->io_iter; + call->iov_len = min(call->remaining, subreq->len - subreq->transferred); call->unmarshall++; fallthrough; /* extract the returned data */ case 2: - _debug("extract data %zu/%llu", - iov_iter_count(call->iter), req->actual_len); + count_before = call->iov_len; + _debug("extract data %zu/%llu", count_before, call->remaining); ret = afs_extract_data(call, true); + subreq->transferred += count_before - call->iov_len; if (ret < 0) return ret; call->iter = &call->def_iter; - if (req->actual_len <= req->len) + if (call->remaining) goto no_more_data; /* Discard any excess data the server gave us */ - afs_extract_discard(call, req->actual_len - req->len); + afs_extract_discard(call, call->remaining); call->unmarshall = 3; fallthrough; case 3: _debug("extract discard %zu/%llu", - iov_iter_count(call->iter), req->actual_len - req->len); + iov_iter_count(call->iter), call->remaining); ret = afs_extract_data(call, true); if (ret < 0) @@ -431,8 +436,8 @@ static int yfs_deliver_fs_fetch_data64(struct afs_call *call) xdr_decode_YFSCallBack(&bp, call, &vp->scb); xdr_decode_YFSVolSync(&bp, &op->volsync); - req->data_version = vp->scb.status.data_version; - req->file_size = vp->scb.status.size; + if (subreq->start + subreq->transferred >= vp->scb.status.size) + __set_bit(NETFS_SREQ_HIT_EOF, &subreq->flags); call->unmarshall++; fallthrough; @@ -451,7 +456,9 @@ static int yfs_deliver_fs_fetch_data64(struct afs_call *call) static const struct afs_call_type yfs_RXYFSFetchData64 = { .name = "YFS.FetchData64", .op = yfs_FS_FetchData64, + .async_rx = afs_fetch_data_async_rx, .deliver = yfs_deliver_fs_fetch_data64, + .immediate_cancel = afs_fetch_data_immediate_cancel, .destructor = afs_flat_call_destructor, }; @@ -460,14 +467,15 @@ static const struct afs_call_type yfs_RXYFSFetchData64 = { */ void yfs_fs_fetch_data(struct afs_operation *op) { + struct netfs_io_subrequest *subreq = op->fetch.subreq; struct afs_vnode_param *vp = &op->file[0]; - struct afs_read *req = op->fetch.req; struct afs_call *call; __be32 *bp; - _enter(",%x,{%llx:%llu},%llx,%llx", + _enter(",%x,{%llx:%llu},%llx,%zx", key_serial(op->key), vp->fid.vid, vp->fid.vnode, - req->pos, req->len); + subreq->start + subreq->transferred, + subreq->len - subreq->transferred); call = afs_alloc_flat_call(op->net, &yfs_RXYFSFetchData64, sizeof(__be32) * 2 + @@ -479,17 +487,19 @@ void yfs_fs_fetch_data(struct afs_operation *op) if (!call) return afs_op_nomem(op); - req->call_debug_id = call->debug_id; + if (op->flags & AFS_OPERATION_ASYNC) + call->async = true; /* marshall the parameters */ bp = call->request; bp = xdr_encode_u32(bp, YFSFETCHDATA64); bp = xdr_encode_u32(bp, 0); /* RPC flags */ bp = xdr_encode_YFSFid(bp, &vp->fid); - bp = xdr_encode_u64(bp, req->pos); - bp = xdr_encode_u64(bp, req->len); + bp = xdr_encode_u64(bp, subreq->start + subreq->transferred); + bp = xdr_encode_u64(bp, subreq->len - subreq->transferred); yfs_check_req(call, bp); + call->fid = vp->fid; trace_afs_make_fs_call(call, &vp->fid); afs_make_op_call(op, call, GFP_NOFS); } @@ -572,6 +582,7 @@ void yfs_fs_create_file(struct afs_operation *op) bp = xdr_encode_u32(bp, yfs_LockNone); /* ViceLockType */ yfs_check_req(call, bp); + call->fid = dvp->fid; trace_afs_make_fs_call1(call, &dvp->fid, name); afs_make_op_call(op, call, GFP_NOFS); } @@ -620,6 +631,7 @@ void yfs_fs_make_dir(struct afs_operation *op) bp = xdr_encode_YFSStoreStatus(bp, &op->create.mode, &op->mtime); yfs_check_req(call, bp); + call->fid = dvp->fid; trace_afs_make_fs_call1(call, &dvp->fid, name); afs_make_op_call(op, call, GFP_NOFS); } @@ -655,8 +667,9 @@ static int yfs_deliver_fs_remove_file2(struct afs_call *call) static void yfs_done_fs_remove_file2(struct afs_call *call) { if (call->error == -ECONNABORTED && - call->abort_code == RX_INVALID_OPERATION) { - set_bit(AFS_SERVER_FL_NO_RM2, &call->server->flags); + (call->abort_code == RX_INVALID_OPERATION || + call->abort_code == RXGEN_OPCODE)) { + set_bit(AFS_SERVER_FL_NO_RM2, &call->op->server->flags); call->op->flags |= AFS_OPERATION_DOWNGRADE; } } @@ -704,6 +717,7 @@ void yfs_fs_remove_file2(struct afs_operation *op) bp = xdr_encode_name(bp, name); yfs_check_req(call, bp); + call->fid = dvp->fid; trace_afs_make_fs_call1(call, &dvp->fid, name); afs_make_op_call(op, call, GFP_NOFS); } @@ -773,6 +787,7 @@ void yfs_fs_remove_file(struct afs_operation *op) bp = xdr_encode_name(bp, name); yfs_check_req(call, bp); + call->fid = dvp->fid; trace_afs_make_fs_call1(call, &dvp->fid, name); afs_make_op_call(op, call, GFP_NOFS); } @@ -814,6 +829,7 @@ void yfs_fs_remove_dir(struct afs_operation *op) bp = xdr_encode_name(bp, name); yfs_check_req(call, bp); + call->fid = dvp->fid; trace_afs_make_fs_call1(call, &dvp->fid, name); afs_make_op_call(op, call, GFP_NOFS); } @@ -887,6 +903,7 @@ void yfs_fs_link(struct afs_operation *op) bp = xdr_encode_YFSFid(bp, &vp->fid); yfs_check_req(call, bp); + call->fid = vp->fid; trace_afs_make_fs_call1(call, &vp->fid, name); afs_make_op_call(op, call, GFP_NOFS); } @@ -968,6 +985,7 @@ void yfs_fs_symlink(struct afs_operation *op) bp = xdr_encode_YFSStoreStatus(bp, &mode, &op->mtime); yfs_check_req(call, bp); + call->fid = dvp->fid; trace_afs_make_fs_call1(call, &dvp->fid, name); afs_make_op_call(op, call, GFP_NOFS); } @@ -1024,6 +1042,9 @@ void yfs_fs_rename(struct afs_operation *op) _enter(""); + if (!test_bit(AFS_SERVER_FL_NO_RENAME2, &op->server->flags)) + return yfs_fs_rename_replace(op); + call = afs_alloc_flat_call(op->net, &yfs_RXYFSRename, sizeof(__be32) + sizeof(struct yfs_xdr_RPCFlags) + @@ -1047,6 +1068,253 @@ void yfs_fs_rename(struct afs_operation *op) bp = xdr_encode_name(bp, new_name); yfs_check_req(call, bp); + call->fid = orig_dvp->fid; + trace_afs_make_fs_call2(call, &orig_dvp->fid, orig_name, new_name); + afs_make_op_call(op, call, GFP_NOFS); +} + +/* + * Deliver reply data to a YFS.Rename_NoReplace operation. This does not + * return the status of a displaced target inode as there cannot be one. + */ +static int yfs_deliver_fs_rename_1(struct afs_call *call) +{ + struct afs_operation *op = call->op; + struct afs_vnode_param *orig_dvp = &op->file[0]; + struct afs_vnode_param *new_dvp = &op->file[1]; + struct afs_vnode_param *old_vp = &op->more_files[0]; + const __be32 *bp; + int ret; + + _enter("{%u}", call->unmarshall); + + ret = afs_transfer_reply(call); + if (ret < 0) + return ret; + + bp = call->buffer; + /* If the two dirs are the same, we have two copies of the same status + * report, so we just decode it twice. + */ + xdr_decode_YFSFetchStatus(&bp, call, &orig_dvp->scb); + xdr_decode_YFSFid(&bp, &old_vp->fid); + xdr_decode_YFSFetchStatus(&bp, call, &old_vp->scb); + xdr_decode_YFSFetchStatus(&bp, call, &new_dvp->scb); + xdr_decode_YFSVolSync(&bp, &op->volsync); + _leave(" = 0 [done]"); + return 0; +} + +/* + * Deliver reply data to a YFS.Rename_Replace or a YFS.Rename_Exchange + * operation. These return the status of the displaced target inode if there + * was one. + */ +static int yfs_deliver_fs_rename_2(struct afs_call *call) +{ + struct afs_operation *op = call->op; + struct afs_vnode_param *orig_dvp = &op->file[0]; + struct afs_vnode_param *new_dvp = &op->file[1]; + struct afs_vnode_param *old_vp = &op->more_files[0]; + struct afs_vnode_param *new_vp = &op->more_files[1]; + const __be32 *bp; + int ret; + + _enter("{%u}", call->unmarshall); + + ret = afs_transfer_reply(call); + if (ret < 0) + return ret; + + bp = call->buffer; + /* If the two dirs are the same, we have two copies of the same status + * report, so we just decode it twice. + */ + xdr_decode_YFSFetchStatus(&bp, call, &orig_dvp->scb); + xdr_decode_YFSFid(&bp, &old_vp->fid); + xdr_decode_YFSFetchStatus(&bp, call, &old_vp->scb); + xdr_decode_YFSFetchStatus(&bp, call, &new_dvp->scb); + xdr_decode_YFSFid(&bp, &new_vp->fid); + xdr_decode_YFSFetchStatus(&bp, call, &new_vp->scb); + xdr_decode_YFSVolSync(&bp, &op->volsync); + _leave(" = 0 [done]"); + return 0; +} + +static void yfs_done_fs_rename_replace(struct afs_call *call) +{ + if (call->error == -ECONNABORTED && + (call->abort_code == RX_INVALID_OPERATION || + call->abort_code == RXGEN_OPCODE)) { + set_bit(AFS_SERVER_FL_NO_RENAME2, &call->op->server->flags); + call->op->flags |= AFS_OPERATION_DOWNGRADE; + } +} + +/* + * YFS.Rename_Replace operation type + */ +static const struct afs_call_type yfs_RXYFSRename_Replace = { + .name = "FS.Rename_Replace", + .op = yfs_FS_Rename_Replace, + .deliver = yfs_deliver_fs_rename_2, + .done = yfs_done_fs_rename_replace, + .destructor = afs_flat_call_destructor, +}; + +/* + * YFS.Rename_NoReplace operation type + */ +static const struct afs_call_type yfs_RXYFSRename_NoReplace = { + .name = "FS.Rename_NoReplace", + .op = yfs_FS_Rename_NoReplace, + .deliver = yfs_deliver_fs_rename_1, + .destructor = afs_flat_call_destructor, +}; + +/* + * YFS.Rename_Exchange operation type + */ +static const struct afs_call_type yfs_RXYFSRename_Exchange = { + .name = "FS.Rename_Exchange", + .op = yfs_FS_Rename_Exchange, + .deliver = yfs_deliver_fs_rename_2, + .destructor = afs_flat_call_destructor, +}; + +/* + * Rename a file or directory, replacing the target if it exists. The status + * of a displaced target is returned. + */ +void yfs_fs_rename_replace(struct afs_operation *op) +{ + struct afs_vnode_param *orig_dvp = &op->file[0]; + struct afs_vnode_param *new_dvp = &op->file[1]; + const struct qstr *orig_name = &op->dentry->d_name; + const struct qstr *new_name = &op->dentry_2->d_name; + struct afs_call *call; + __be32 *bp; + + _enter(""); + + call = afs_alloc_flat_call(op->net, &yfs_RXYFSRename_Replace, + sizeof(__be32) + + sizeof(struct yfs_xdr_RPCFlags) + + sizeof(struct yfs_xdr_YFSFid) + + xdr_strlen(orig_name->len) + + sizeof(struct yfs_xdr_YFSFid) + + xdr_strlen(new_name->len), + sizeof(struct yfs_xdr_YFSFetchStatus) + + sizeof(struct yfs_xdr_YFSFid) + + sizeof(struct yfs_xdr_YFSFetchStatus) + + sizeof(struct yfs_xdr_YFSFetchStatus) + + sizeof(struct yfs_xdr_YFSFid) + + sizeof(struct yfs_xdr_YFSFetchStatus) + + sizeof(struct yfs_xdr_YFSVolSync)); + if (!call) + return afs_op_nomem(op); + + /* Marshall the parameters. */ + bp = call->request; + bp = xdr_encode_u32(bp, YFSRENAME_REPLACE); + bp = xdr_encode_u32(bp, 0); /* RPC flags */ + bp = xdr_encode_YFSFid(bp, &orig_dvp->fid); + bp = xdr_encode_name(bp, orig_name); + bp = xdr_encode_YFSFid(bp, &new_dvp->fid); + bp = xdr_encode_name(bp, new_name); + yfs_check_req(call, bp); + + call->fid = orig_dvp->fid; + trace_afs_make_fs_call2(call, &orig_dvp->fid, orig_name, new_name); + afs_make_op_call(op, call, GFP_NOFS); +} + +/* + * Rename a file or directory, failing if the target dirent exists. + */ +void yfs_fs_rename_noreplace(struct afs_operation *op) +{ + struct afs_vnode_param *orig_dvp = &op->file[0]; + struct afs_vnode_param *new_dvp = &op->file[1]; + const struct qstr *orig_name = &op->dentry->d_name; + const struct qstr *new_name = &op->dentry_2->d_name; + struct afs_call *call; + __be32 *bp; + + _enter(""); + + call = afs_alloc_flat_call(op->net, &yfs_RXYFSRename_NoReplace, + sizeof(__be32) + + sizeof(struct yfs_xdr_RPCFlags) + + sizeof(struct yfs_xdr_YFSFid) + + xdr_strlen(orig_name->len) + + sizeof(struct yfs_xdr_YFSFid) + + xdr_strlen(new_name->len), + sizeof(struct yfs_xdr_YFSFetchStatus) + + sizeof(struct yfs_xdr_YFSFid) + + sizeof(struct yfs_xdr_YFSFetchStatus) + + sizeof(struct yfs_xdr_YFSFetchStatus) + + sizeof(struct yfs_xdr_YFSVolSync)); + if (!call) + return afs_op_nomem(op); + + /* Marshall the parameters. */ + bp = call->request; + bp = xdr_encode_u32(bp, YFSRENAME_NOREPLACE); + bp = xdr_encode_u32(bp, 0); /* RPC flags */ + bp = xdr_encode_YFSFid(bp, &orig_dvp->fid); + bp = xdr_encode_name(bp, orig_name); + bp = xdr_encode_YFSFid(bp, &new_dvp->fid); + bp = xdr_encode_name(bp, new_name); + yfs_check_req(call, bp); + + call->fid = orig_dvp->fid; + trace_afs_make_fs_call2(call, &orig_dvp->fid, orig_name, new_name); + afs_make_op_call(op, call, GFP_NOFS); +} + +/* + * Exchange a pair of files directories. + */ +void yfs_fs_rename_exchange(struct afs_operation *op) +{ + struct afs_vnode_param *orig_dvp = &op->file[0]; + struct afs_vnode_param *new_dvp = &op->file[1]; + const struct qstr *orig_name = &op->dentry->d_name; + const struct qstr *new_name = &op->dentry_2->d_name; + struct afs_call *call; + __be32 *bp; + + _enter(""); + + call = afs_alloc_flat_call(op->net, &yfs_RXYFSRename_Exchange, + sizeof(__be32) + + sizeof(struct yfs_xdr_RPCFlags) + + sizeof(struct yfs_xdr_YFSFid) + + xdr_strlen(orig_name->len) + + sizeof(struct yfs_xdr_YFSFid) + + xdr_strlen(new_name->len), + sizeof(struct yfs_xdr_YFSFetchStatus) + + sizeof(struct yfs_xdr_YFSFid) + + sizeof(struct yfs_xdr_YFSFetchStatus) + + sizeof(struct yfs_xdr_YFSFetchStatus) + + sizeof(struct yfs_xdr_YFSFid) + + sizeof(struct yfs_xdr_YFSFetchStatus) + + sizeof(struct yfs_xdr_YFSVolSync)); + if (!call) + return afs_op_nomem(op); + + /* Marshall the parameters. */ + bp = call->request; + bp = xdr_encode_u32(bp, YFSRENAME_EXCHANGE); + bp = xdr_encode_u32(bp, 0); /* RPC flags */ + bp = xdr_encode_YFSFid(bp, &orig_dvp->fid); + bp = xdr_encode_name(bp, orig_name); + bp = xdr_encode_YFSFid(bp, &new_dvp->fid); + bp = xdr_encode_name(bp, new_name); + yfs_check_req(call, bp); + + call->fid = orig_dvp->fid; trace_afs_make_fs_call2(call, &orig_dvp->fid, orig_name, new_name); afs_make_op_call(op, call, GFP_NOFS); } @@ -1102,6 +1370,7 @@ void yfs_fs_store_data(struct afs_operation *op) bp = xdr_encode_u64(bp, op->store.i_size); yfs_check_req(call, bp); + call->fid = vp->fid; trace_afs_make_fs_call(call, &vp->fid); afs_make_op_call(op, call, GFP_NOFS); } @@ -1158,6 +1427,7 @@ static void yfs_fs_setattr_size(struct afs_operation *op) bp = xdr_encode_u64(bp, attr->ia_size); /* new file length */ yfs_check_req(call, bp); + call->fid = vp->fid; trace_afs_make_fs_call(call, &vp->fid); afs_make_op_call(op, call, GFP_NOFS); } @@ -1196,6 +1466,7 @@ void yfs_fs_setattr(struct afs_operation *op) bp = xdr_encode_YFS_StoreStatus(bp, attr); yfs_check_req(call, bp); + call->fid = vp->fid; trace_afs_make_fs_call(call, &vp->fid); afs_make_op_call(op, call, GFP_NOFS); } @@ -1366,6 +1637,7 @@ void yfs_fs_get_volume_status(struct afs_operation *op) bp = xdr_encode_u64(bp, vp->fid.vid); yfs_check_req(call, bp); + call->fid = vp->fid; trace_afs_make_fs_call(call, &vp->fid); afs_make_op_call(op, call, GFP_NOFS); } @@ -1430,6 +1702,7 @@ void yfs_fs_set_lock(struct afs_operation *op) bp = xdr_encode_u32(bp, op->lock.type); yfs_check_req(call, bp); + call->fid = vp->fid; trace_afs_make_fs_calli(call, &vp->fid, op->lock.type); afs_make_op_call(op, call, GFP_NOFS); } @@ -1460,6 +1733,7 @@ void yfs_fs_extend_lock(struct afs_operation *op) bp = xdr_encode_YFSFid(bp, &vp->fid); yfs_check_req(call, bp); + call->fid = vp->fid; trace_afs_make_fs_call(call, &vp->fid); afs_make_op_call(op, call, GFP_NOFS); } @@ -1490,6 +1764,7 @@ void yfs_fs_release_lock(struct afs_operation *op) bp = xdr_encode_YFSFid(bp, &vp->fid); yfs_check_req(call, bp); + call->fid = vp->fid; trace_afs_make_fs_call(call, &vp->fid); afs_make_op_call(op, call, GFP_NOFS); } @@ -1556,6 +1831,7 @@ void yfs_fs_fetch_status(struct afs_operation *op) bp = xdr_encode_YFSFid(bp, &vp->fid); yfs_check_req(call, bp); + call->fid = vp->fid; trace_afs_make_fs_call(call, &vp->fid); afs_make_op_call(op, call, GFP_NOFS); } @@ -1736,6 +2012,7 @@ void yfs_fs_inline_bulk_status(struct afs_operation *op) bp = xdr_encode_YFSFid(bp, &op->more_files[i].fid); yfs_check_req(call, bp); + call->fid = vp->fid; trace_afs_make_fs_call(call, &vp->fid); afs_make_op_call(op, call, GFP_NOFS); } @@ -1898,6 +2175,7 @@ void yfs_fs_fetch_opaque_acl(struct afs_operation *op) bp = xdr_encode_YFSFid(bp, &vp->fid); yfs_check_req(call, bp); + call->fid = vp->fid; trace_afs_make_fs_call(call, &vp->fid); afs_make_op_call(op, call, GFP_KERNEL); } @@ -1948,6 +2226,7 @@ void yfs_fs_store_opaque_acl2(struct afs_operation *op) bp += size / sizeof(__be32); yfs_check_req(call, bp); + call->fid = vp->fid; trace_afs_make_fs_call(call, &vp->fid); afs_make_op_call(op, call, GFP_KERNEL); } |
