142 files changed, 10046 insertions, 7336 deletions
diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt
index b2f82cf6bf86..58c2bbd385ad 100644
--- a/fs/Kconfig.binfmt
+++ b/fs/Kconfig.binfmt
@@ -34,8 +34,8 @@ config ARCH_BINFMT_ELF_STATE
 
 config BINFMT_ELF_FDPIC
 	bool "Kernel support for FDPIC ELF binaries"
-	default y
-	depends on (FRV || BLACKFIN || (SUPERH32 && !MMU) || C6X)
+	default y if !BINFMT_ELF
+	depends on (ARM || FRV || BLACKFIN || (SUPERH32 && !MMU) || C6X)
 	select ELFCORE
 	help
 	  ELF FDPIC binaries are based on ELF, but allow the individual load
diff --git a/fs/afs/Makefile b/fs/afs/Makefile
index 641148208e90..45b7fc405fa6 100644
--- a/fs/afs/Makefile
+++ b/fs/afs/Makefile
@@ -7,6 +7,7 @@ afs-cache-$(CONFIG_AFS_FSCACHE) := cache.o
 
 kafs-objs := \
 	$(afs-cache-y) \
+	addr_list.o \
 	callback.o \
 	cell.o \
 	cmservice.o \
@@ -19,14 +20,14 @@ kafs-objs := \
 	misc.o \
 	mntpt.o \
 	proc.o \
+	rotate.o \
 	rxrpc.o \
 	security.o \
 	server.o \
+	server_list.o \
 	super.o \
 	netdevices.o \
 	vlclient.o \
-	vlocation.o \
-	vnode.o \
 	volume.o \
 	write.o \
 	xattr.o
diff --git a/fs/afs/addr_list.c b/fs/afs/addr_list.c
new file mode 100644
index 000000000000..a537368ba0db
--- /dev/null
+++ b/fs/afs/addr_list.c
@@ -0,0 +1,381 @@
+/* Server address list management
+ *
+ * Copyright (C) 2017 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#include <linux/slab.h>
+#include <linux/ctype.h>
+#include <linux/dns_resolver.h>
+#include <linux/inet.h>
+#include <keys/rxrpc-type.h>
+#include "internal.h"
+#include "afs_fs.h"
+
+//#define AFS_MAX_ADDRESSES
+//	((unsigned int)((PAGE_SIZE - sizeof(struct afs_addr_list)) /
+//			sizeof(struct sockaddr_rxrpc)))
+#define AFS_MAX_ADDRESSES ((unsigned int)(sizeof(unsigned long) * 8))
+
+/*
+ * Release an address list.
+ */
+void afs_put_addrlist(struct afs_addr_list *alist)
+{
+	if (alist && refcount_dec_and_test(&alist->usage))
+		call_rcu(&alist->rcu, (rcu_callback_t)kfree);
+}
+
+/*
+ * Allocate an address list.
+ */
+struct afs_addr_list *afs_alloc_addrlist(unsigned int nr,
+					 unsigned short service,
+					 unsigned short port)
+{
+	struct afs_addr_list *alist;
+	unsigned int i;
+
+	_enter("%u,%u,%u", nr, service, port);
+
+	alist = kzalloc(sizeof(*alist) + sizeof(alist->addrs[0]) * nr,
+			GFP_KERNEL);
+	if (!alist)
+		return NULL;
+
+	refcount_set(&alist->usage, 1);
+
+	for (i = 0; i < nr; i++) {
+		struct sockaddr_rxrpc *srx = &alist->addrs[i];
+		srx->srx_family			= AF_RXRPC;
+		srx->srx_service		= service;
+		srx->transport_type		= SOCK_DGRAM;
+		srx->transport_len		= sizeof(srx->transport.sin6);
+		srx->transport.sin6.sin6_family	= AF_INET6;
+		srx->transport.sin6.sin6_port	= htons(port);
+	}
+
+	return alist;
+}
+
+/*
+ * Parse a text string consisting of delimited addresses.
+ */
+struct afs_addr_list *afs_parse_text_addrs(const char *text, size_t len,
+					   char delim,
+					   unsigned short service,
+					   unsigned short port)
+{
+	struct afs_addr_list *alist;
+	const char *p, *end = text + len;
+	unsigned int nr = 0;
+
+	_enter("%*.*s,%c", (int)len, (int)len, text, delim);
+
+	if (!len)
+		return ERR_PTR(-EDESTADDRREQ);
+
+	if (delim == ':' && (memchr(text, ',', len) || !memchr(text, '.', len)))
+		delim = ',';
+
+	/* Count the addresses */
+	p = text;
+	do {
+		if (!*p)
+			return ERR_PTR(-EINVAL);
+		if (*p == delim)
+			continue;
+		nr++;
+		if (*p == '[') {
+			p++;
+			if (p == end)
+				return ERR_PTR(-EINVAL);
+			p = memchr(p, ']', end - p);
+			if (!p)
+				return ERR_PTR(-EINVAL);
+			p++;
+			if (p >= end)
+				break;
+		}
+
+		p = memchr(p, delim, end - p);
+		if (!p)
+			break;
+		p++;
+	} while (p < end);
+
+	_debug("%u/%u addresses", nr, AFS_MAX_ADDRESSES);
+	if (nr > AFS_MAX_ADDRESSES)
+		nr = AFS_MAX_ADDRESSES;
+
+	alist = afs_alloc_addrlist(nr, service, port);
+	if (!alist)
+		return ERR_PTR(-ENOMEM);
+
+	/* Extract the addresses */
+	p = text;
+	do {
+		struct sockaddr_rxrpc *srx = &alist->addrs[alist->nr_addrs];
+		char tdelim = delim;
+
+		if (*p == delim) {
+			p++;
+			continue;
+		}
+
+		if (*p == '[') {
+			p++;
+			tdelim = ']';
+		}
+
+		if (in4_pton(p, end - p,
+			     (u8 *)&srx->transport.sin6.sin6_addr.s6_addr32[3],
+			     tdelim, &p)) {
+			srx->transport.sin6.sin6_addr.s6_addr32[0] = 0;
+			srx->transport.sin6.sin6_addr.s6_addr32[1] = 0;
+			srx->transport.sin6.sin6_addr.s6_addr32[2] = htonl(0xffff);
+		} else if (in6_pton(p, end - p,
+				    srx->transport.sin6.sin6_addr.s6_addr,
+				    tdelim, &p)) {
+			/* Nothing to do */
+		} else {
+			goto bad_address;
+		}
+
+		if (tdelim == ']') {
+			if (p == end || *p != ']')
+				goto bad_address;
+			p++;
+		}
+
+		if (p < end) {
+			if (*p == '+') {
+				/* Port number specification "+1234" */
+				unsigned int xport = 0;
+				p++;
+				if (p >= end || !isdigit(*p))
+					goto bad_address;
+				do {
+					xport *= 10;
+					xport += *p - '0';
+					if (xport > 65535)
+						goto bad_address;
+					p++;
+				} while (p < end && isdigit(*p));
+				srx->transport.sin6.sin6_port = htons(xport);
+			} else if (*p == delim) {
+				p++;
+			} else {
+				goto bad_address;
+			}
+		}
+
+		alist->nr_addrs++;
+	} while (p < end && alist->nr_addrs < AFS_MAX_ADDRESSES);
+
+	_leave(" = [nr %u]", alist->nr_addrs);
+	return alist;
+
+bad_address:
+	kfree(alist);
+	return ERR_PTR(-EINVAL);
+}
+
+/*
+ * Compare old and new address lists to see if there's been any change.
+ * - How to do this in better than O(Nlog(N)) time?
+ *   - We don't really want to sort the address list, but would rather take the
+ *     list as we got it so as not to undo record rotation by the DNS server.
+ */
+#if 0
+static int afs_cmp_addr_list(const struct afs_addr_list *a1,
+			     const struct afs_addr_list *a2)
+{
+}
+#endif
+
+/*
+ * Perform a DNS query for VL servers and build a up an address list.
+ */
+struct afs_addr_list *afs_dns_query(struct afs_cell *cell, time64_t *_expiry)
+{
+	struct afs_addr_list *alist;
+	char *vllist = NULL;
+	int ret;
+
+	_enter("%s", cell->name);
+
+	ret = dns_query("afsdb", cell->name, cell->name_len,
+			"ipv4", &vllist, _expiry);
+	if (ret < 0)
+		return ERR_PTR(ret);
+
+	alist = afs_parse_text_addrs(vllist, strlen(vllist), ',',
+				     VL_SERVICE, AFS_VL_PORT);
+	if (IS_ERR(alist)) {
+		kfree(vllist);
+		if (alist != ERR_PTR(-ENOMEM))
+			pr_err("Failed to parse DNS data\n");
+		return alist;
+	}
+
+	kfree(vllist);
+	return alist;
+}
+
+/*
+ * Merge an IPv4 entry into a fileserver address list.
+ */
+void afs_merge_fs_addr4(struct afs_addr_list *alist, __be32 xdr, u16 port)
+{
+	struct sockaddr_in6 *a;
+	__be16 xport = htons(port);
+	int i;
+
+	for (i = 0; i < alist->nr_ipv4; i++) {
+		a = &alist->addrs[i].transport.sin6;
+		if (xdr == a->sin6_addr.s6_addr32[3] &&
+		    xport == a->sin6_port)
+			return;
+		if (xdr == a->sin6_addr.s6_addr32[3] &&
+		    xport < a->sin6_port)
+			break;
+		if (xdr < a->sin6_addr.s6_addr32[3])
+			break;
+	}
+
+	if (i < alist->nr_addrs)
+		memmove(alist->addrs + i + 1,
+			alist->addrs + i,
+			sizeof(alist->addrs[0]) * (alist->nr_addrs - i));
+
+	a = &alist->addrs[i].transport.sin6;
+	a->sin6_port		  = xport;
+	a->sin6_addr.s6_addr32[0] = 0;
+	a->sin6_addr.s6_addr32[1] = 0;
+	a->sin6_addr.s6_addr32[2] = htonl(0xffff);
+	a->sin6_addr.s6_addr32[3] = xdr;
+	alist->nr_ipv4++;
+	alist->nr_addrs++;
+}
+
+/*
+ * Merge an IPv6 entry into a fileserver address list.
+ */
+void afs_merge_fs_addr6(struct afs_addr_list *alist, __be32 *xdr, u16 port)
+{
+	struct sockaddr_in6 *a;
+	__be16 xport = htons(port);
+	int i, diff;
+
+	for (i = alist->nr_ipv4; i < alist->nr_addrs; i++) {
+		a = &alist->addrs[i].transport.sin6;
+		diff = memcmp(xdr, &a->sin6_addr, 16);
+		if (diff == 0 &&
+		    xport == a->sin6_port)
+			return;
+		if (diff == 0 &&
+		    xport < a->sin6_port)
+			break;
+		if (diff < 0)
+			break;
+	}
+
+	if (i < alist->nr_addrs)
+		memmove(alist->addrs + i + 1,
+			alist->addrs + i,
+			sizeof(alist->addrs[0]) * (alist->nr_addrs - i));
+
+	a = &alist->addrs[i].transport.sin6;
+	a->sin6_port		  = xport;
+	a->sin6_addr.s6_addr32[0] = xdr[0];
+	a->sin6_addr.s6_addr32[1] = xdr[1];
+	a->sin6_addr.s6_addr32[2] = xdr[2];
+	a->sin6_addr.s6_addr32[3] = xdr[3];
+	alist->nr_addrs++;
+}
+
+/*
+ * Get an address to try.
+ */
+bool afs_iterate_addresses(struct afs_addr_cursor *ac)
+{
+	_enter("%hu+%hd", ac->start, (short)ac->index);
+
+	if (!ac->alist)
+		return false;
+
+	if (ac->begun) {
+		ac->index++;
+		if (ac->index == ac->alist->nr_addrs)
+			ac->index = 0;
+
+		if (ac->index == ac->start) {
+			ac->error = -EDESTADDRREQ;
+			return false;
+		}
+	}
+
+	ac->begun = true;
+	ac->responded = false;
+	ac->addr = &ac->alist->addrs[ac->index];
+	return true;
+}
+
+/*
+ * Release an address list cursor.
+ */
+int afs_end_cursor(struct afs_addr_cursor *ac)
+{
+	if (ac->responded && ac->index != ac->start)
+		WRITE_ONCE(ac->alist->index, ac->index);
+
+	afs_put_addrlist(ac->alist);
+	ac->alist = NULL;
+	return ac->error;
+}
+
+/*
+ * Set the address cursor for iterating over VL servers.
+ */
+int afs_set_vl_cursor(struct afs_addr_cursor *ac, struct afs_cell *cell)
+{
+	struct afs_addr_list *alist;
+	int ret;
+
+	if (!rcu_access_pointer(cell->vl_addrs)) {
+		ret = wait_on_bit(&cell->flags, AFS_CELL_FL_NO_LOOKUP_YET,
+				  TASK_INTERRUPTIBLE);
+		if (ret < 0)
+			return ret;
+
+		if (!rcu_access_pointer(cell->vl_addrs) &&
+		    ktime_get_real_seconds() < cell->dns_expiry)
+			return cell->error;
+	}
+
+	read_lock(&cell->vl_addrs_lock);
+	alist = rcu_dereference_protected(cell->vl_addrs,
+					  lockdep_is_held(&cell->vl_addrs_lock));
+	if (alist->nr_addrs > 0)
+		afs_get_addrlist(alist);
+	else
+		alist = NULL;
+	read_unlock(&cell->vl_addrs_lock);
+
+	if (!alist)
+		return -EDESTADDRREQ;
+
+	ac->alist = alist;
+	ac->addr = NULL;
+	ac->start = READ_ONCE(alist->index);
+	ac->index = ac->start;
+	ac->error = 0;
+	ac->begun = false;
+	return 0;
+}
diff --git a/fs/afs/afs.h b/fs/afs/afs.h
index 3c462ff6db63..b94d0edc2b78 100644
--- a/fs/afs/afs.h
+++ b/fs/afs/afs.h
@@ -14,11 +14,14 @@
 
 #include <linux/in.h>
 
-#define AFS_MAXCELLNAME	64		/* maximum length of a cell name */
-#define AFS_MAXVOLNAME	64		/* maximum length of a volume name */
-#define AFSNAMEMAX	256		/* maximum length of a filename plus NUL */
-#define AFSPATHMAX	1024		/* maximum length of a pathname plus NUL */
-#define AFSOPAQUEMAX	1024		/* maximum length of an opaque field */
+#define AFS_MAXCELLNAME		64  	/* Maximum length of a cell name */
+#define AFS_MAXVOLNAME		64  	/* Maximum length of a volume name */
+#define AFS_MAXNSERVERS		8   	/* Maximum servers in a basic volume record */
+#define AFS_NMAXNSERVERS	13  	/* Maximum servers in a N/U-class volume record */
+#define AFS_MAXTYPES		3	/* Maximum number of volume types */
+#define AFSNAMEMAX		256 	/* Maximum length of a filename plus NUL */
+#define AFSPATHMAX		1024	/* Maximum length of a pathname plus NUL */
+#define AFSOPAQUEMAX		1024	/* Maximum length of an opaque field */
 
 typedef unsigned			afs_volid_t;
 typedef unsigned			afs_vnodeid_t;
@@ -72,6 +75,15 @@ struct afs_callback {
 
 #define AFSCBMAX 50	/* maximum callbacks transferred per bulk op */
 
+struct afs_uuid {
+	__be32		time_low;			/* low part of timestamp */
+	__be16		time_mid;			/* mid part of timestamp */
+	__be16		time_hi_and_version;		/* high part of timestamp and version  */
+	__s8		clock_seq_hi_and_reserved;	/* clock seq hi and variant */
+	__s8		clock_seq_low;			/* clock seq low */
+	__s8		node[6];			/* spatially unique node ID (MAC addr) */
+};
+
 /*
  * AFS volume information
  */
@@ -124,7 +136,6 @@ struct afs_file_status {
 	afs_access_t		caller_access;	/* access rights for authenticated caller */
 	afs_access_t		anon_access;	/* access rights for unauthenticated caller */
 	umode_t			mode;		/* UNIX mode */
-	struct afs_fid		parent;		/* parent dir ID for non-dirs only */
 	time_t			mtime_client;	/* last time client changed data */
 	time_t			mtime_server;	/* last time server changed data */
 	s32			lock_count;	/* file lock count (0=UNLK -1=WRLCK +ve=#RDLCK */
@@ -167,4 +178,16 @@ struct afs_volume_status {
 
 #define AFS_BLOCK_SIZE	1024
 
+/*
+ * XDR encoding of UUID in AFS.
+ */
+struct afs_uuid__xdr {
+	__be32		time_low;
+	__be32		time_mid;
+	__be32		time_hi_and_version;
+	__be32		clock_seq_hi_and_reserved;
+	__be32		clock_seq_low;
+	__be32		node[6];
+};
+
 #endif /* AFS_H */
diff --git a/fs/afs/afs_fs.h b/fs/afs/afs_fs.h
index eb647323d8f0..d47b6d01e4c0 100644
--- a/fs/afs/afs_fs.h
+++ b/fs/afs/afs_fs.h
@@ -37,9 +37,12 @@ enum AFS_FS_Operations {
 	FSLOOKUP		= 161,	/* AFS lookup file in directory */
 	FSFETCHDATA64		= 65537, /* AFS Fetch file data */
 	FSSTOREDATA64		= 65538, /* AFS Store file data */
+	FSGIVEUPALLCALLBACKS	= 65539, /* AFS Give up all outstanding callbacks on a server */
+	FSGETCAPABILITIES	= 65540, /* Probe and get the capabilities of a fileserver */
 };
 
 enum AFS_FS_Errors {
+	VRESTARTING	= -100,	/* Server is restarting */
 	VSALVAGE	= 101,	/* volume needs salvaging */
 	VNOVNODE	= 102,	/* no such file/dir (vnode) */
 	VNOVOL		= 103,	/* no such volume or volume unavailable */
@@ -51,6 +54,9 @@ enum AFS_FS_Errors {
 	VOVERQUOTA	= 109,	/* volume's maximum quota exceeded */
 	VBUSY		= 110,	/* volume is temporarily unavailable */
 	VMOVED		= 111,	/* volume moved to new server - ask this FS where */
+	VIO		= 112,	/* I/O error in volume */
+	VSALVAGING	= 113,	/* Volume is being salvaged */
+	VRESTRICTED	= 120,	/* Volume is restricted from using  */
 };
 
 #endif /* AFS_FS_H */
diff --git a/fs/afs/afs_vl.h b/fs/afs/afs_vl.h
index 800f607ffaf5..e3c4688f573b 100644
--- a/fs/afs/afs_vl.h
+++ b/fs/afs/afs_vl.h
@@ -16,11 +16,17 @@
 
 #define AFS_VL_PORT		7003	/* volume location service port */
 #define VL_SERVICE		52	/* RxRPC service ID for the Volume Location service */
+#define YFS_VL_SERVICE		2503	/* Service ID for AuriStor upgraded VL service */
 
 enum AFSVL_Operations {
-	VLGETENTRYBYID		= 503,	/* AFS Get Cache Entry By ID operation ID */
-	VLGETENTRYBYNAME	= 504,	/* AFS Get Cache Entry By Name operation ID */
-	VLPROBE			= 514,	/* AFS Probe Volume Location Service operation ID */
+	VLGETENTRYBYID		= 503,	/* AFS Get VLDB entry by ID */
+	VLGETENTRYBYNAME	= 504,	/* AFS Get VLDB entry by name */
+	VLPROBE			= 514,	/* AFS probe VL service */
+	VLGETENTRYBYIDU		= 526,	/* AFS Get VLDB entry by ID (UUID-variant) */
+	VLGETENTRYBYNAMEU	= 527,	/* AFS Get VLDB entry by name (UUID-variant) */
+	VLGETADDRSU		= 533,	/* AFS Get addrs for fileserver */
+	YVLGETENDPOINTS		= 64002, /* YFS Get endpoints for file/volume server */
+	VLGETCAPABILITIES	= 65537, /* AFS Get server capabilities */
 };
 
 enum AFSVL_Errors {
@@ -54,6 +60,19 @@ enum AFSVL_Errors {
 	AFSVL_NOMEM 		= 363547,	/* malloc/realloc failed to alloc enough memory */
 };
 
+enum {
+	YFS_SERVER_INDEX	= 0,
+	YFS_SERVER_UUID		= 1,
+	YFS_SERVER_ENDPOINT	= 2,
+};
+
+enum {
+	YFS_ENDPOINT_IPV4	= 0,
+	YFS_ENDPOINT_IPV6	= 1,
+};
+
+#define YFS_MAXENDPOINTS	16
+
 /*
  * maps to "struct vldbentry" in vvl-spec.pdf
  */
@@ -74,11 +93,57 @@ struct afs_vldbentry {
 		struct in_addr	addr;		/* server address */
 		unsigned	partition;	/* partition ID on this server */
 		unsigned	flags;		/* server specific flags */
-#define AFS_VLSF_NEWREPSITE	0x0001	/* unused */
+#define AFS_VLSF_NEWREPSITE	0x0001	/* Ignore all 'non-new' servers */
 #define AFS_VLSF_ROVOL		0x0002	/* this server holds a R/O instance of the volume */
 #define AFS_VLSF_RWVOL		0x0004	/* this server holds a R/W instance of the volume */
 #define AFS_VLSF_BACKVOL	0x0008	/* this server holds a backup instance of the volume */
+#define AFS_VLSF_UUID		0x0010	/* This server is referred to by its UUID */
+#define AFS_VLSF_DONTUSE	0x0020	/* This server ref should be ignored */
 	} servers[8];
 };
 
+#define AFS_VLDB_MAXNAMELEN 65
+
+
+struct afs_ListAddrByAttributes__xdr {
+	__be32			Mask;
+#define AFS_VLADDR_IPADDR	0x1	/* Match by ->ipaddr */
+#define AFS_VLADDR_INDEX	0x2	/* Match by ->index */
+#define AFS_VLADDR_UUID		0x4	/* Match by ->uuid */
+	__be32			ipaddr;
+	__be32			index;
+	__be32			spare;
+	struct afs_uuid__xdr	uuid;
+};
+
+struct afs_uvldbentry__xdr {
+	__be32			name[AFS_VLDB_MAXNAMELEN];
+	__be32			nServers;
+	struct afs_uuid__xdr	serverNumber[AFS_NMAXNSERVERS];
+	__be32			serverUnique[AFS_NMAXNSERVERS];
+	__be32			serverPartition[AFS_NMAXNSERVERS];
+	__be32			serverFlags[AFS_NMAXNSERVERS];
+	__be32			volumeId[AFS_MAXTYPES];
+	__be32			cloneId;
+	__be32			flags;
+	__be32			spares1;
+	__be32			spares2;
+	__be32			spares3;
+	__be32			spares4;
+	__be32			spares5;
+	__be32			spares6;
+	__be32			spares7;
+	__be32			spares8;
+	__be32			spares9;
+};
+
+struct afs_address_list {
+	refcount_t		usage;
+	unsigned int		version;
+	unsigned int		nr_addrs;
+	struct sockaddr_rxrpc	addrs[];
+};
+
+extern void afs_put_address_list(struct afs_address_list *alist);
+
 #endif /* AFS_VL_H */
diff --git a/fs/afs/cache.c b/fs/afs/cache.c
index 1fe855191261..f62ff71d28c9 100644
--- a/fs/afs/cache.c
+++ b/fs/afs/cache.c
@@ -14,19 +14,6 @@
 
 static uint16_t afs_cell_cache_get_key(const void *cookie_netfs_data,
 				       void *buffer, uint16_t buflen);
-static uint16_t afs_cell_cache_get_aux(const void *cookie_netfs_data,
-				       void *buffer, uint16_t buflen);
-static enum fscache_checkaux afs_cell_cache_check_aux(void *cookie_netfs_data,
-						      const void *buffer,
-						      uint16_t buflen);
-
-static uint16_t afs_vlocation_cache_get_key(const void *cookie_netfs_data,
-					    void *buffer, uint16_t buflen);
-static uint16_t afs_vlocation_cache_get_aux(const void *cookie_netfs_data,
-					    void *buffer, uint16_t buflen);
-static enum fscache_checkaux afs_vlocation_cache_check_aux(
-	void *cookie_netfs_data, const void *buffer, uint16_t buflen);
-
 static uint16_t afs_volume_cache_get_key(const void *cookie_netfs_data,
 					 void *buffer, uint16_t buflen);
 
@@ -42,23 +29,13 @@ static enum fscache_checkaux afs_vnode_cache_check_aux(void *cookie_netfs_data,
 
 struct fscache_netfs afs_cache_netfs = {
 	.name			= "afs",
-	.version		= 0,
+	.version		= 1,
 };
 
 struct fscache_cookie_def afs_cell_cache_index_def = {
 	.name		= "AFS.cell",
 	.type		= FSCACHE_COOKIE_TYPE_INDEX,
 	.get_key	= afs_cell_cache_get_key,
-	.get_aux	= afs_cell_cache_get_aux,
-	.check_aux	= afs_cell_cache_check_aux,
-};
-
-struct fscache_cookie_def afs_vlocation_cache_index_def = {
-	.name			= "AFS.vldb",
-	.type			= FSCACHE_COOKIE_TYPE_INDEX,
-	.get_key		= afs_vlocation_cache_get_key,
-	.get_aux		= afs_vlocation_cache_get_aux,
-	.check_aux		= afs_vlocation_cache_check_aux,
 };
 
 struct fscache_cookie_def afs_volume_cache_index_def = {
@@ -95,150 +72,26 @@ static uint16_t afs_cell_cache_get_key(const void *cookie_netfs_data,
 	return klen;
 }
 
-/*
- * provide new auxiliary cache data
- */
-static uint16_t afs_cell_cache_get_aux(const void *cookie_netfs_data,
-				       void *buffer, uint16_t bufmax)
-{
-	const struct afs_cell *cell = cookie_netfs_data;
-	uint16_t dlen;
-
-	_enter("%p,%p,%u", cell, buffer, bufmax);
-
-	dlen = cell->vl_naddrs * sizeof(cell->vl_addrs[0]);
-	dlen = min(dlen, bufmax);
-	dlen &= ~(sizeof(cell->vl_addrs[0]) - 1);
-
-	memcpy(buffer, cell->vl_addrs, dlen);
-	return dlen;
-}
-
-/*
- * check that the auxiliary data indicates that the entry is still valid
- */
-static enum fscache_checkaux afs_cell_cache_check_aux(void *cookie_netfs_data,
-						      const void *buffer,
-						      uint16_t buflen)
-{
-	_leave(" = OKAY");
-	return FSCACHE_CHECKAUX_OKAY;
-}
-
-/*****************************************************************************/
-/*
- * set the key for the index entry
- */
-static uint16_t afs_vlocation_cache_get_key(const void *cookie_netfs_data,
-					    void *buffer, uint16_t bufmax)
-{
-	const struct afs_vlocation *vlocation = cookie_netfs_data;
-	uint16_t klen;
-
-	_enter("{%s},%p,%u", vlocation->vldb.name, buffer, bufmax);
-
-	klen = strnlen(vlocation->vldb.name, sizeof(vlocation->vldb.name));
-	if (klen > bufmax)
-		return 0;
-
-	memcpy(buffer, vlocation->vldb.name, klen);
-
-	_leave(" = %u", klen);
-	return klen;
-}
-
-/*
- * provide new auxiliary cache data
- */
-static uint16_t afs_vlocation_cache_get_aux(const void *cookie_netfs_data,
-					    void *buffer, uint16_t bufmax)
-{
-	const struct afs_vlocation *vlocation = cookie_netfs_data;
-	uint16_t dlen;
-
-	_enter("{%s},%p,%u", vlocation->vldb.name, buffer, bufmax);
-
-	dlen = sizeof(struct afs_cache_vlocation);
-	dlen -= offsetof(struct afs_cache_vlocation, nservers);
-	if (dlen > bufmax)
-		return 0;
-
-	memcpy(buffer, (uint8_t *)&vlocation->vldb.nservers, dlen);
-
-	_leave(" = %u", dlen);
-	return dlen;
-}
-
-/*
- * check that the auxiliary data indicates that the entry is still valid
- */
-static
-enum fscache_checkaux afs_vlocation_cache_check_aux(void *cookie_netfs_data,
-						    const void *buffer,
-						    uint16_t buflen)
-{
-	const struct afs_cache_vlocation *cvldb;
-	struct afs_vlocation *vlocation = cookie_netfs_data;
-	uint16_t dlen;
-
-	_enter("{%s},%p,%u", vlocation->vldb.name, buffer, buflen);
-
-	/* check the size of the data is what we're expecting */
-	dlen = sizeof(struct afs_cache_vlocation);
-	dlen -= offsetof(struct afs_cache_vlocation, nservers);
-	if (dlen != buflen)
-		return FSCACHE_CHECKAUX_OBSOLETE;
-
-	cvldb = container_of(buffer, struct afs_cache_vlocation, nservers);
-
-	/* if what's on disk is more valid than what's in memory, then use the
-	 * VL record from the cache */
-	if (!vlocation->valid || vlocation->vldb.rtime == cvldb->rtime) {
-		memcpy((uint8_t *)&vlocation->vldb.nservers, buffer, dlen);
-		vlocation->valid = 1;
-		_leave(" = SUCCESS [c->m]");
-		return FSCACHE_CHECKAUX_OKAY;
-	}
-
-	/* need to update the cache if the cached info differs */
-	if (memcmp(&vlocation->vldb, buffer, dlen) != 0) {
-		/* delete if the volume IDs for this name differ */
-		if (memcmp(&vlocation->vldb.vid, &cvldb->vid,
-			   sizeof(cvldb->vid)) != 0
-		    ) {
-			_leave(" = OBSOLETE");
-			return FSCACHE_CHECKAUX_OBSOLETE;
-		}
-
-		_leave(" = UPDATE");
-		return FSCACHE_CHECKAUX_NEEDS_UPDATE;
-	}
-
-	_leave(" = OKAY");
-	return FSCACHE_CHECKAUX_OKAY;
-}
-
 /*****************************************************************************/
 /*
  * set the key for the volume index entry
  */
 static uint16_t afs_volume_cache_get_key(const void *cookie_netfs_data,
-					void *buffer, uint16_t bufmax)
+					 void *buffer, uint16_t bufmax)
 {
 	const struct afs_volume *volume = cookie_netfs_data;
-	uint16_t klen;
+	struct {
+		u64 volid;
+	} __packed key;
 
 	_enter("{%u},%p,%u", volume->type, buffer, bufmax);
 
-	klen = sizeof(volume->type);
-	if (klen > bufmax)
+	if (bufmax < sizeof(key))
 		return 0;
 
-	memcpy(buffer, &volume->type, sizeof(volume->type));
-
-	_leave(" = %u", klen);
-	return klen;
-
+	key.volid = volume->vid;
+	memcpy(buffer, &key, sizeof(key));
+	return sizeof(key);
 }
 
 /*****************************************************************************/
@@ -249,20 +102,25 @@ static uint16_t afs_vnode_cache_get_key(const void *cookie_netfs_data,
 					void *buffer, uint16_t bufmax)
 {
 	const struct afs_vnode *vnode = cookie_netfs_data;
-	uint16_t klen;
+	struct {
+		u32 vnode_id[3];
+	} __packed key;
 
 	_enter("{%x,%x,%llx},%p,%u",
 	       vnode->fid.vnode, vnode->fid.unique, vnode->status.data_version,
 	       buffer, bufmax);
 
-	klen = sizeof(vnode->fid.vnode);
-	if (klen > bufmax)
-		return 0;
+	/* Allow for a 96-bit key */
+	memset(&key, 0, sizeof(key));
+	key.vnode_id[0] = vnode->fid.vnode;
+	key.vnode_id[1] = 0;
+	key.vnode_id[2] = 0;
 
-	memcpy(buffer, &vnode->fid.vnode, sizeof(vnode->fid.vnode));
+	if (sizeof(key) > bufmax)
+		return 0;
 
-	_leave(" = %u", klen);
-	return klen;
+	memcpy(buffer, &key, sizeof(key));
+	return sizeof(key);
 }
 
 /*
@@ -280,6 +138,11 @@ static void afs_vnode_cache_get_attr(const void *cookie_netfs_data,
 	*size = vnode->status.size;
 }
 
+struct afs_vnode_cache_aux {
+	u64 data_version;
+	u32 fid_unique;
+} __packed;
+
 /*
  * provide new auxiliary cache data
  */
@@ -287,23 +150,21 @@ static uint16_t afs_vnode_cache_get_aux(const void *cookie_netfs_data,
 					void *buffer, uint16_t bufmax)
 {
 	const struct afs_vnode *vnode = cookie_netfs_data;
-	uint16_t dlen;
+	struct afs_vnode_cache_aux aux;
 
 	_enter("{%x,%x,%Lx},%p,%u",
 	       vnode->fid.vnode, vnode->fid.unique, vnode->status.data_version,
 	       buffer, bufmax);
 
-	dlen = sizeof(vnode->fid.unique) + sizeof(vnode->status.data_version);
-	if (dlen > bufmax)
-		return 0;
+	memset(&aux, 0, sizeof(aux));
+	aux.data_version = vnode->status.data_version;
+	aux.fid_unique = vnode->fid.unique;
 
-	memcpy(buffer, &vnode->fid.unique, sizeof(vnode->fid.unique));
-	buffer += sizeof(vnode->fid.unique);
-	memcpy(buffer, &vnode->status.data_version,
-	       sizeof(vnode->status.data_version));
+	if (bufmax < sizeof(aux))
+		return 0;
 
-	_leave(" = %u", dlen);
-	return dlen;
+	memcpy(buffer, &aux, sizeof(aux));
+	return sizeof(aux);
 }
 
 /*
@@ -314,43 +175,29 @@ static enum fscache_checkaux afs_vnode_cache_check_aux(void *cookie_netfs_data,
 						       uint16_t buflen)
 {
 	struct afs_vnode *vnode = cookie_netfs_data;
-	uint16_t dlen;
+	struct afs_vnode_cache_aux aux;
 
 	_enter("{%x,%x,%llx},%p,%u",
 	       vnode->fid.vnode, vnode->fid.unique, vnode->status.data_version,
 	       buffer, buflen);
 
+	memcpy(&aux, buffer, sizeof(aux));
+
 	/* check the size of the data is what we're expecting */
-	dlen = sizeof(vnode->fid.unique) + sizeof(vnode->status.data_version);
-	if (dlen != buflen) {
-		_leave(" = OBSOLETE [len %hx != %hx]", dlen, buflen);
+	if (buflen != sizeof(aux)) {
+		_leave(" = OBSOLETE [len %hx != %zx]", buflen, sizeof(aux));
 		return FSCACHE_CHECKAUX_OBSOLETE;
 	}
 
-	if (memcmp(buffer,
-		   &vnode->fid.unique,
-		   sizeof(vnode->fid.unique)
-		   ) != 0) {
-		unsigned unique;
-
-		memcpy(&unique, buffer, sizeof(unique));
-
+	if (vnode->fid.unique != aux.fid_unique) {
 		_leave(" = OBSOLETE [uniq %x != %x]",
-		       unique, vnode->fid.unique);
+		       aux.fid_unique, vnode->fid.unique);
 		return FSCACHE_CHECKAUX_OBSOLETE;
 	}
 
-	if (memcmp(buffer + sizeof(vnode->fid.unique),
-		   &vnode->status.data_version,
-		   sizeof(vnode->status.data_version)
-		   ) != 0) {
-		afs_dataversion_t version;
-
-		memcpy(&version, buffer + sizeof(vnode->fid.unique),
-		       sizeof(version));
-
+	if (vnode->status.data_version != aux.data_version) {
 		_leave(" = OBSOLETE [vers %llx != %llx]",
-		       version, vnode->status.data_version);
+		       aux.data_version, vnode->status.data_version);
 		return FSCACHE_CHECKAUX_OBSOLETE;
 	}
 
diff --git a/fs/afs/callback.c b/fs/afs/callback.c
index 25d404d22cae..f4291b576054 100644
--- a/fs/afs/callback.c
+++ b/fs/afs/callback.c
@@ -20,118 +20,151 @@
 #include <linux/sched.h>
 #include "internal.h"
 
-#if 0
-unsigned afs_vnode_update_timeout = 10;
-#endif  /*  0  */
-
-#define afs_breakring_space(server) \
-	CIRC_SPACE((server)->cb_break_head, (server)->cb_break_tail,	\
-		   ARRAY_SIZE((server)->cb_break))
-
-//static void afs_callback_updater(struct work_struct *);
-
-static struct workqueue_struct *afs_callback_update_worker;
-
 /*
- * allow the fileserver to request callback state (re-)initialisation
+ * Set up an interest-in-callbacks record for a volume on a server and
+ * register it with the server.
+ * - Called with volume->server_sem held.
  */
-void afs_init_callback_state(struct afs_server *server)
+int afs_register_server_cb_interest(struct afs_vnode *vnode,
+				    struct afs_server_entry *entry)
 {
-	struct afs_vnode *vnode;
-
-	_enter("{%p}", server);
+	struct afs_cb_interest *cbi = entry->cb_interest, *vcbi, *new, *x;
+	struct afs_server *server = entry->server;
+
+again:
+	vcbi = vnode->cb_interest;
+	if (vcbi) {
+		if (vcbi == cbi)
+			return 0;
+
+		if (cbi && vcbi->server == cbi->server) {
+			write_seqlock(&vnode->cb_lock);
+			vnode->cb_interest = afs_get_cb_interest(cbi);
+			write_sequnlock(&vnode->cb_lock);
+			afs_put_cb_interest(afs_v2net(vnode), cbi);
+			return 0;
+		}
 
-	spin_lock(&server->cb_lock);
+		if (!cbi && vcbi->server == server) {
+			afs_get_cb_interest(vcbi);
+			x = cmpxchg(&entry->cb_interest, cbi, vcbi);
+			if (x != cbi) {
+				cbi = x;
+				afs_put_cb_interest(afs_v2net(vnode), vcbi);
+				goto again;
+			}
+			return 0;
+		}
+	}
 
-	/* kill all the promises on record from this server */
-	while (!RB_EMPTY_ROOT(&server->cb_promises)) {
-		vnode = rb_entry(server->cb_promises.rb_node,
-				 struct afs_vnode, cb_promise);
-		_debug("UNPROMISE { vid=%x:%u uq=%u}",
-		       vnode->fid.vid, vnode->fid.vnode, vnode->fid.unique);
-		rb_erase(&vnode->cb_promise, &server->cb_promises);
-		vnode->cb_promised = false;
+	if (!cbi) {
+		new = kzalloc(sizeof(struct afs_cb_interest), GFP_KERNEL);
+		if (!new)
+			return -ENOMEM;
+
+		refcount_set(&new->usage, 1);
+		new->sb = vnode->vfs_inode.i_sb;
+		new->vid = vnode->volume->vid;
+		new->server = afs_get_server(server);
+		INIT_LIST_HEAD(&new->cb_link);
+
+		write_lock(&server->cb_break_lock);
+		list_add_tail(&new->cb_link, &server->cb_interests);
+		write_unlock(&server->cb_break_lock);
+
+		x = cmpxchg(&entry->cb_interest, cbi, new);
+		if (x == cbi) {
+			cbi = new;
+		} else {
+			cbi = x;
+			afs_put_cb_interest(afs_v2net(vnode), new);
+		}
 	}
 
-	spin_unlock(&server->cb_lock);
-	_leave("");
+	ASSERT(cbi);
+
+	/* Change the server the vnode is using.  This entails scrubbing any
+	 * interest the vnode had in the previous server it was using.
+	 */
+	write_seqlock(&vnode->cb_lock);
+
+	vnode->cb_interest = afs_get_cb_interest(cbi);
+	vnode->cb_s_break = cbi->server->cb_s_break;
+	clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags);
+
+	write_sequnlock(&vnode->cb_lock);
+	return 0;
 }
 
 /*
- * handle the data invalidation side of a callback being broken
+ * Set a vnode's interest on a server.
  */
-void afs_broken_callback_work(struct work_struct *work)
+void afs_set_cb_interest(struct afs_vnode *vnode, struct afs_cb_interest *cbi)
 {
-	struct afs_vnode *vnode =
-		container_of(work, struct afs_vnode, cb_broken_work);
+	struct afs_cb_interest *old_cbi = NULL;
 
-	_enter("");
-
-	if (test_bit(AFS_VNODE_DELETED, &vnode->flags))
+	if (vnode->cb_interest == cbi)
 		return;
 
-	/* we're only interested in dealing with a broken callback on *this*
-	 * vnode and only if no-one else has dealt with it yet */
-	if (!mutex_trylock(&vnode->validate_lock))
-		return; /* someone else is dealing with it */
-
-	if (test_bit(AFS_VNODE_CB_BROKEN, &vnode->flags)) {
-		if (S_ISDIR(vnode->vfs_inode.i_mode))
-			afs_clear_permits(vnode);
-
-		if (afs_vnode_fetch_status(vnode, NULL, NULL) < 0)
-			goto out;
-
-		if (test_bit(AFS_VNODE_DELETED, &vnode->flags))
-			goto out;
-
-		/* if the vnode's data version number changed then its contents
-		 * are different */
-		if (test_and_clear_bit(AFS_VNODE_ZAP_DATA, &vnode->flags))
-			afs_zap_data(vnode);
+	write_seqlock(&vnode->cb_lock);
+	if (vnode->cb_interest != cbi) {
+		afs_get_cb_interest(cbi);
+		old_cbi = vnode->cb_interest;
+		vnode->cb_interest = cbi;
 	}
+	write_sequnlock(&vnode->cb_lock);
+	afs_put_cb_interest(afs_v2net(vnode), cbi);
+}
 
-out:
-	mutex_unlock(&vnode->validate_lock);
-
-	/* avoid the potential race whereby the mutex_trylock() in this
-	 * function happens again between the clear_bit() and the
-	 * mutex_unlock() */
-	if (test_bit(AFS_VNODE_CB_BROKEN, &vnode->flags)) {
-		_debug("requeue");
-		queue_work(afs_callback_update_worker, &vnode->cb_broken_work);
+/*
+ * Remove an interest on a server.
+ */
+void afs_put_cb_interest(struct afs_net *net, struct afs_cb_interest *cbi)
+{
+	if (cbi && refcount_dec_and_test(&cbi->usage)) {
+		if (!list_empty(&cbi->cb_link)) {
+			write_lock(&cbi->server->cb_break_lock);
+			list_del_init(&cbi->cb_link);
+			write_unlock(&cbi->server->cb_break_lock);
+			afs_put_server(net, cbi->server);
+		}
+		kfree(cbi);
 	}
-	_leave("");
+}
+
+/*
+ * allow the fileserver to request callback state (re-)initialisation
+ */
+void afs_init_callback_state(struct afs_server *server)
+{
+	if (!test_and_clear_bit(AFS_SERVER_FL_NEW, &server->flags))
+		server->cb_s_break++;
 }
 
 /*
  * actually break a callback
  */
-static void afs_break_callback(struct afs_server *server,
-			       struct afs_vnode *vnode)
+void afs_break_callback(struct afs_vnode *vnode)
 {
 	_enter("");
 
-	set_bit(AFS_VNODE_CB_BROKEN, &vnode->flags);
+	write_seqlock(&vnode->cb_lock);
+
+	if (test_and_clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags)) {
+		vnode->cb_break++;
+		afs_clear_permits(vnode);
 
-	if (vnode->cb_promised) {
 		spin_lock(&vnode->lock);
 
 		_debug("break callback");
 
-		spin_lock(&server->cb_lock);
-		if (vnode->cb_promised) {
-			rb_erase(&vnode->cb_promise, &server->cb_promises);
-			vnode->cb_promised = false;
-		}
-		spin_unlock(&server->cb_lock);
-
-		queue_work(afs_callback_update_worker, &vnode->cb_broken_work);
 		if (list_empty(&vnode->granted_locks) &&
 		    !list_empty(&vnode->pending_locks))
 			afs_lock_may_be_available(vnode);
 		spin_unlock(&vnode->lock);
 	}
+
+	write_sequnlock(&vnode->cb_lock);
 }
 
 /*
@@ -143,49 +176,31 @@ static void afs_break_callback(struct afs_server *server,
 static void afs_break_one_callback(struct afs_server *server,
 				   struct afs_fid *fid)
 {
+	struct afs_cb_interest *cbi;
+	struct afs_iget_data data;
 	struct afs_vnode *vnode;
-	struct rb_node *p;
-
-	_debug("find");
-	spin_lock(&server->fs_lock);
-	p = server->fs_vnodes.rb_node;
-	while (p) {
-		vnode = rb_entry(p, struct afs_vnode, server_rb);
-		if (fid->vid < vnode->fid.vid)
-			p = p->rb_left;
-		else if (fid->vid > vnode->fid.vid)
-			p = p->rb_right;
-		else if (fid->vnode < vnode->fid.vnode)
-			p = p->rb_left;
-		else if (fid->vnode > vnode->fid.vnode)
-			p = p->rb_right;
-		else if (fid->unique < vnode->fid.unique)
-			p = p->rb_left;
-		else if (fid->unique > vnode->fid.unique)
-			p = p->rb_right;
-		else
-			goto found;
-	}
-
-	/* not found so we just ignore it (it may have moved to another
-	 * server) */
-not_available:
-	_debug("not avail");
-	spin_unlock(&server->fs_lock);
-	_leave("");
-	return;
+	struct inode *inode;
 
-found:
-	_debug("found");
-	ASSERTCMP(server, ==, vnode->server);
+	read_lock(&server->cb_break_lock);
 
-	if (!igrab(AFS_VNODE_TO_I(vnode)))
-		goto not_available;
-	spin_unlock(&server->fs_lock);
+	/* Step through all interested superblocks.  There may be more than one
+	 * because of cell aliasing.
+	 */
+	list_for_each_entry(cbi, &server->cb_interests, cb_link) {
+		if (cbi->vid != fid->vid)
+			continue;
+
+		data.volume = NULL;
+		data.fid = *fid;
+		inode = ilookup5_nowait(cbi->sb, fid->vnode, afs_iget5_test, &data);
+		if (inode) {
+			vnode = AFS_FS_I(inode);
+			afs_break_callback(vnode);
+			iput(inode);
+		}
+	}
 
-	afs_break_callback(server, vnode);
-	iput(&vnode->vfs_inode);
-	_leave("");
+	read_unlock(&server->cb_break_lock);
 }
 
 /*
@@ -216,261 +231,14 @@ void afs_break_callbacks(struct afs_server *server, size_t count,
 }
 
 /*
- * record the callback for breaking
- * - the caller must hold server->cb_lock
+ * Clear the callback interests in a server list.
  */
-static void afs_do_give_up_callback(struct afs_server *server,
-				    struct afs_vnode *vnode)
+void afs_clear_callback_interests(struct afs_net *net, struct afs_server_list *slist)
 {
-	struct afs_callback *cb;
-
-	_enter("%p,%p", server, vnode);
-
-	cb = &server->cb_break[server->cb_break_head];
-	cb->fid		= vnode->fid;
-	cb->version	= vnode->cb_version;
-	cb->expiry	= vnode->cb_expiry;
-	cb->type	= vnode->cb_type;
-	smp_wmb();
-	server->cb_break_head =
-		(server->cb_break_head + 1) &
-		(ARRAY_SIZE(server->cb_break) - 1);
-
-	/* defer the breaking of callbacks to try and collect as many as
-	 * possible to ship in one operation */
-	switch (atomic_inc_return(&server->cb_break_n)) {
-	case 1 ... AFSCBMAX - 1:
-		queue_delayed_work(afs_callback_update_worker,
-				   &server->cb_break_work, HZ * 2);
-		break;
-	case AFSCBMAX:
-		afs_flush_callback_breaks(server);
-		break;
-	default:
-		break;
-	}
-
-	ASSERT(server->cb_promises.rb_node != NULL);
-	rb_erase(&vnode->cb_promise, &server->cb_promises);
-	vnode->cb_promised = false;
-	_leave("");
-}
-
-/*
- * discard the callback on a deleted item
- */
-void afs_discard_callback_on_delete(struct afs_vnode *vnode)
-{
-	struct afs_server *server = vnode->server;
+	int i;
 
-	_enter("%d", vnode->cb_promised);
-
-	if (!vnode->cb_promised) {
-		_leave(" [not promised]");
-		return;
-	}
-
-	ASSERT(server != NULL);
-
-	spin_lock(&server->cb_lock);
-	if (vnode->cb_promised) {
-		ASSERT(server->cb_promises.rb_node != NULL);
-		rb_erase(&vnode->cb_promise, &server->cb_promises);
-		vnode->cb_promised = false;
+	for (i = 0; i < slist->nr_servers; i++) {
+		afs_put_cb_interest(net, slist->servers[i].cb_interest);
+		slist->servers[i].cb_interest = NULL;
 	}
-	spin_unlock(&server->cb_lock);
-	_leave("");
-}
-
-/*
- * give up the callback registered for a vnode on the file server when the
- * inode is being cleared
- */
-void afs_give_up_callback(struct afs_vnode *vnode)
-{
-	struct afs_server *server = vnode->server;
-
-	DECLARE_WAITQUEUE(myself, current);
-
-	_enter("%d", vnode->cb_promised);
-
-	_debug("GIVE UP INODE %p", &vnode->vfs_inode);
-
-	if (!vnode->cb_promised) {
-		_leave(" [not promised]");
-		return;
-	}
-
-	ASSERT(server != NULL);
-
-	spin_lock(&server->cb_lock);
-	if (vnode->cb_promised && afs_breakring_space(server) == 0) {
-		add_wait_queue(&server->cb_break_waitq, &myself);
-		for (;;) {
-			set_current_state(TASK_UNINTERRUPTIBLE);
-			if (!vnode->cb_promised ||
-			    afs_breakring_space(server) != 0)
-				break;
-			spin_unlock(&server->cb_lock);
-			schedule();
-			spin_lock(&server->cb_lock);
-		}
-		remove_wait_queue(&server->cb_break_waitq, &myself);
-		__set_current_state(TASK_RUNNING);
-	}
-
-	/* of course, it's always possible for the server to break this vnode's
-	 * callback first... */
-	if (vnode->cb_promised)
-		afs_do_give_up_callback(server, vnode);
-
-	spin_unlock(&server->cb_lock);
-	_leave("");
-}
-
-/*
- * dispatch a deferred give up callbacks operation
- */
-void afs_dispatch_give_up_callbacks(struct work_struct *work)
-{
-	struct afs_server *server =
-		container_of(work, struct afs_server, cb_break_work.work);
-
-	_enter("");
-
-	/* tell the fileserver to discard the callback promises it has
-	 * - in the event of ENOMEM or some other error, we just forget that we
-	 *   had callbacks entirely, and the server will call us later to break
-	 *   them
-	 */
-	afs_fs_give_up_callbacks(server, true);
-}
-
-/*
- * flush the outstanding callback breaks on a server
- */
-void afs_flush_callback_breaks(struct afs_server *server)
-{
-	mod_delayed_work(afs_callback_update_worker, &server->cb_break_work, 0);
-}
-
-#if 0
-/*
- * update a bunch of callbacks
- */
-static void afs_callback_updater(struct work_struct *work)
-{
-	struct afs_server *server;
-	struct afs_vnode *vnode, *xvnode;
-	time64_t now;
-	long timeout;
-	int ret;
-
-	server = container_of(work, struct afs_server, updater);
-
-	_enter("");
-
-	now = ktime_get_real_seconds();
-
-	/* find the first vnode to update */
-	spin_lock(&server->cb_lock);
-	for (;;) {
-		if (RB_EMPTY_ROOT(&server->cb_promises)) {
-			spin_unlock(&server->cb_lock);
-			_leave(" [nothing]");
-			return;
-		}
-
-		vnode = rb_entry(rb_first(&server->cb_promises),
-				 struct afs_vnode, cb_promise);
-		if (atomic_read(&vnode->usage) > 0)
-			break;
-		rb_erase(&vnode->cb_promise, &server->cb_promises);
-		vnode->cb_promised = false;
-	}
-
-	timeout = vnode->update_at - now;
-	if (timeout > 0) {
-		queue_delayed_work(afs_vnode_update_worker,
-				   &afs_vnode_update, timeout * HZ);
-		spin_unlock(&server->cb_lock);
-		_leave(" [nothing]");
-		return;
-	}
-
-	list_del_init(&vnode->update);
-	atomic_inc(&vnode->usage);
-	spin_unlock(&server->cb_lock);
-
-	/* we can now perform the update */
-	_debug("update %s", vnode->vldb.name);
-	vnode->state = AFS_VL_UPDATING;
-	vnode->upd_rej_cnt = 0;
-	vnode->upd_busy_cnt = 0;
-
-	ret = afs_vnode_update_record(vl, &vldb);
-	switch (ret) {
-	case 0:
-		afs_vnode_apply_update(vl, &vldb);
-		vnode->state = AFS_VL_UPDATING;
-		break;
-	case -ENOMEDIUM:
-		vnode->state = AFS_VL_VOLUME_DELETED;
-		break;
-	default:
-		vnode->state = AFS_VL_UNCERTAIN;
-		break;
-	}
-
-	/* and then reschedule */
-	_debug("reschedule");
-	vnode->update_at = ktime_get_real_seconds() +
-			afs_vnode_update_timeout;
-
-	spin_lock(&server->cb_lock);
-
-	if (!list_empty(&server->cb_promises)) {
-		/* next update in 10 minutes, but wait at least 1 second more
-		 * than the newest record already queued so that we don't spam
-		 * the VL server suddenly with lots of requests
-		 */
-		xvnode = list_entry(server->cb_promises.prev,
-				    struct afs_vnode, update);
-		if (vnode->update_at <= xvnode->update_at)
-			vnode->update_at = xvnode->update_at + 1;
-		xvnode = list_entry(server->cb_promises.next,
-				    struct afs_vnode, update);
-		timeout = xvnode->update_at - now;
-		if (timeout < 0)
-			timeout = 0;
-	} else {
-		timeout = afs_vnode_update_timeout;
-	}
-
-	list_add_tail(&vnode->update, &server->cb_promises);
-
-	_debug("timeout %ld", timeout);
-	queue_delayed_work(afs_vnode_update_worker,
-			   &afs_vnode_update, timeout * HZ);
-	spin_unlock(&server->cb_lock);
-	afs_put_vnode(vl);
-}
-#endif
-
-/*
- * initialise the callback update process
- */
-int __init afs_callback_update_init(void)
-{
-	afs_callback_update_worker = alloc_ordered_workqueue("kafs_callbackd",
-							     WQ_MEM_RECLAIM);
-	return afs_callback_update_worker ? 0 : -ENOMEM;
-}
-
-/*
- * shut down the callback update process
- */
-void afs_callback_update_kill(void)
-{
-	destroy_workqueue(afs_callback_update_worker);
 }
diff --git a/fs/afs/cell.c b/fs/afs/cell.c
index ca0a3cf93791..1858c91169e4 100644
--- a/fs/afs/cell.c
+++ b/fs/afs/cell.c
@@ -1,6 +1,6 @@
 /* AFS cell and server record management
  *
- * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
+ * Copyright (C) 2002, 2017 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
  *
  * This program is free software; you can redistribute it and/or
@@ -9,213 +9,296 @@
  * 2 of the License, or (at your option) any later version.
  */
 
-#include <linux/module.h>
 #include <linux/slab.h>
 #include <linux/key.h>
 #include <linux/ctype.h>
 #include <linux/dns_resolver.h>
 #include <linux/sched.h>
+#include <linux/inet.h>
 #include <keys/rxrpc-type.h>
 #include "internal.h"
 
-DECLARE_RWSEM(afs_proc_cells_sem);
-LIST_HEAD(afs_proc_cells);
+unsigned __read_mostly afs_cell_gc_delay = 10;
 
-static LIST_HEAD(afs_cells);
-static DEFINE_RWLOCK(afs_cells_lock);
-static DECLARE_RWSEM(afs_cells_sem); /* add/remove serialisation */
-static DECLARE_WAIT_QUEUE_HEAD(afs_cells_freeable_wq);
-static struct afs_cell *afs_cell_root;
+static void afs_manage_cell(struct work_struct *);
+
+static void afs_dec_cells_outstanding(struct afs_net *net)
+{
+	if (atomic_dec_and_test(&net->cells_outstanding))
+		wake_up_atomic_t(&net->cells_outstanding);
+}
 
 /*
- * allocate a cell record and fill in its name, VL server address list and
- * allocate an anonymous key
+ * Set the cell timer to fire after a given delay, assuming it's not already
+ * set for an earlier time.
  */
-static struct afs_cell *afs_cell_alloc(const char *name, unsigned namelen,
-				       char *vllist)
+static void afs_set_cell_timer(struct afs_net *net, time64_t delay)
 {
-	struct afs_cell *cell;
-	struct key *key;
-	char keyname[4 + AFS_MAXCELLNAME + 1], *cp, *dp, *next;
-	char  *dvllist = NULL, *_vllist = NULL;
-	char  delimiter = ':';
-	int ret;
+	if (net->live) {
+		atomic_inc(&net->cells_outstanding);
+		if (timer_reduce(&net->cells_timer, jiffies + delay * HZ))
+			afs_dec_cells_outstanding(net);
+	}
+}
 
-	_enter("%*.*s,%s", namelen, namelen, name ?: "", vllist);
+/*
+ * Look up and get an activation reference on a cell record under RCU
+ * conditions.  The caller must hold the RCU read lock.
+ */
+struct afs_cell *afs_lookup_cell_rcu(struct afs_net *net,
+				     const char *name, unsigned int namesz)
+{
+	struct afs_cell *cell = NULL;
+	struct rb_node *p;
+	int n, seq = 0, ret = 0;
 
-	BUG_ON(!name); /* TODO: want to look up "this cell" in the cache */
+	_enter("%*.*s", namesz, namesz, name);
 
-	if (namelen > AFS_MAXCELLNAME) {
-		_leave(" = -ENAMETOOLONG");
+	if (name && namesz == 0)
+		return ERR_PTR(-EINVAL);
+	if (namesz > AFS_MAXCELLNAME)
 		return ERR_PTR(-ENAMETOOLONG);
-	}
 
-	/* allocate and initialise a cell record */
-	cell = kzalloc(sizeof(struct afs_cell) + namelen + 1, GFP_KERNEL);
-	if (!cell) {
-		_leave(" = -ENOMEM");
-		return ERR_PTR(-ENOMEM);
-	}
+	do {
+		/* Unfortunately, rbtree walking doesn't give reliable results
+		 * under just the RCU read lock, so we have to check for
+		 * changes.
+		 */
+		if (cell)
+			afs_put_cell(net, cell);
+		cell = NULL;
+		ret = -ENOENT;
 
-	memcpy(cell->name, name, namelen);
-	cell->name[namelen] = 0;
-
-	atomic_set(&cell->usage, 1);
-	INIT_LIST_HEAD(&cell->link);
-	rwlock_init(&cell->servers_lock);
-	INIT_LIST_HEAD(&cell->servers);
-	init_rwsem(&cell->vl_sem);
-	INIT_LIST_HEAD(&cell->vl_list);
-	spin_lock_init(&cell->vl_lock);
-
-	/* if the ip address is invalid, try dns query */
-	if (!vllist || strlen(vllist) < 7) {
-		ret = dns_query("afsdb", name, namelen, "ipv4", &dvllist, NULL);
-		if (ret < 0) {
-			if (ret == -ENODATA || ret == -EAGAIN || ret == -ENOKEY)
-				/* translate these errors into something
-				 * userspace might understand */
-				ret = -EDESTADDRREQ;
-			_leave(" = %d", ret);
-			return ERR_PTR(ret);
+		read_seqbegin_or_lock(&net->cells_lock, &seq);
+
+		if (!name) {
+			cell = rcu_dereference_raw(net->ws_cell);
+			if (cell) {
+				afs_get_cell(cell);
+				continue;
+			}
+			ret = -EDESTADDRREQ;
+			continue;
 		}
-		_vllist = dvllist;
 
-		/* change the delimiter for user-space reply */
-		delimiter = ',';
+		p = rcu_dereference_raw(net->cells.rb_node);
+		while (p) {
+			cell = rb_entry(p, struct afs_cell, net_node);
+
+			n = strncasecmp(cell->name, name,
+					min_t(size_t, cell->name_len, namesz));
+			if (n == 0)
+				n = cell->name_len - namesz;
+			if (n < 0) {
+				p = rcu_dereference_raw(p->rb_left);
+			} else if (n > 0) {
+				p = rcu_dereference_raw(p->rb_right);
+			} else {
+				if (atomic_inc_not_zero(&cell->usage)) {
+					ret = 0;
+					break;
+				}
+				/* We want to repeat the search, this time with
+				 * the lock properly locked.
+				 */
+			}
+			cell = NULL;
+		}
 
-	} else {
-		_vllist = vllist;
-	}
+	} while (need_seqretry(&net->cells_lock, seq));
 
-	/* fill in the VL server list from the rest of the string */
-	do {
-		unsigned a, b, c, d;
+	done_seqretry(&net->cells_lock, seq);
 
-		next = strchr(_vllist, delimiter);
-		if (next)
-			*next++ = 0;
+	return ret == 0 ? cell : ERR_PTR(ret);
+}
 
-		if (sscanf(_vllist, "%u.%u.%u.%u", &a, &b, &c, &d) != 4)
-			goto bad_address;
+/*
+ * Set up a cell record and fill in its name, VL server address list and
+ * allocate an anonymous key
+ */
+static struct afs_cell *afs_alloc_cell(struct afs_net *net,
+				       const char *name, unsigned int namelen,
+				       const char *vllist)
+{
+	struct afs_cell *cell;
+	int i, ret;
 
-		if (a > 255 || b > 255 || c > 255 || d > 255)
-			goto bad_address;
+	ASSERT(name);
+	if (namelen == 0)
+		return ERR_PTR(-EINVAL);
+	if (namelen > AFS_MAXCELLNAME) {
+		_leave(" = -ENAMETOOLONG");
+		return ERR_PTR(-ENAMETOOLONG);
+	}
 
-		cell->vl_addrs[cell->vl_naddrs++].s_addr =
-			htonl((a << 24) | (b << 16) | (c << 8) | d);
+	_enter("%*.*s,%s", namelen, namelen, name, vllist);
 
-	} while (cell->vl_naddrs < AFS_CELL_MAX_ADDRS && (_vllist = next));
+	cell = kzalloc(sizeof(struct afs_cell), GFP_KERNEL);
+	if (!cell) {
+		_leave(" = -ENOMEM");
+		return ERR_PTR(-ENOMEM);
+	}
 
-	/* create a key to represent an anonymous user */
-	memcpy(keyname, "afs@", 4);
-	dp = keyname + 4;
-	cp = cell->name;
-	do {
-		*dp++ = toupper(*cp);
-	} while (*cp++);
+	cell->net = net;
+	cell->name_len = namelen;
+	for (i = 0; i < namelen; i++)
+		cell->name[i] = tolower(name[i]);
+
+	atomic_set(&cell->usage, 2);
+	INIT_WORK(&cell->manager, afs_manage_cell);
+	cell->flags = ((1 << AFS_CELL_FL_NOT_READY) |
+		       (1 << AFS_CELL_FL_NO_LOOKUP_YET));
+	INIT_LIST_HEAD(&cell->proc_volumes);
+	rwlock_init(&cell->proc_lock);
+	rwlock_init(&cell->vl_addrs_lock);
+
+	/* Fill in the VL server list if we were given a list of addresses to
+	 * use.
+	 */
+	if (vllist) {
+		struct afs_addr_list *alist;
+
+		alist = afs_parse_text_addrs(vllist, strlen(vllist), ':',
+					     VL_SERVICE, AFS_VL_PORT);
+		if (IS_ERR(alist)) {
+			ret = PTR_ERR(alist);
+			goto parse_failed;
+		}
 
-	key = rxrpc_get_null_key(keyname);
-	if (IS_ERR(key)) {
-		_debug("no key");
-		ret = PTR_ERR(key);
-		goto error;
+		rcu_assign_pointer(cell->vl_addrs, alist);
+		cell->dns_expiry = TIME64_MAX;
 	}
-	cell->anonymous_key = key;
-
-	_debug("anon key %p{%x}",
-	       cell->anonymous_key, key_serial(cell->anonymous_key));
 
 	_leave(" = %p", cell);
 	return cell;
 
-bad_address:
-	printk(KERN_ERR "kAFS: bad VL server IP address\n");
-	ret = -EINVAL;
-error:
-	key_put(cell->anonymous_key);
-	kfree(dvllist);
+parse_failed:
+	if (ret == -EINVAL)
+		printk(KERN_ERR "kAFS: bad VL server IP address\n");
 	kfree(cell);
 	_leave(" = %d", ret);
 	return ERR_PTR(ret);
 }
 
 /*
- * afs_cell_crate() - create a cell record
- * @name:	is the name of the cell.
- * @namsesz:	is the strlen of the cell name.
- * @vllist:	is a colon separated list of IP addresses in "a.b.c.d" format.
- * @retref:	is T to return the cell reference when the cell exists.
+ * afs_lookup_cell - Look up or create a cell record.
+ * @net:	The network namespace
+ * @name:	The name of the cell.
+ * @namesz:	The strlen of the cell name.
+ * @vllist:	A colon/comma separated list of numeric IP addresses or NULL.
+ * @excl:	T if an error should be given if the cell name already exists.
+ *
+ * Look up a cell record by name and query the DNS for VL server addresses if
+ * needed.  Note that that actual DNS query is punted off to the manager thread
+ * so that this function can return immediately if interrupted whilst allowing
+ * cell records to be shared even if not yet fully constructed.
  */
-struct afs_cell *afs_cell_create(const char *name, unsigned namesz,
-				 char *vllist, bool retref)
+struct afs_cell *afs_lookup_cell(struct afs_net *net,
+				 const char *name, unsigned int namesz,
+				 const char *vllist, bool excl)
 {
-	struct afs_cell *cell;
-	int ret;
-
-	_enter("%*.*s,%s", namesz, namesz, name ?: "", vllist);
+	struct afs_cell *cell, *candidate, *cursor;
+	struct rb_node *parent, **pp;
+	int ret, n;
+
+	_enter("%s,%s", name, vllist);
+
+	if (!excl) {
+		rcu_read_lock();
+		cell = afs_lookup_cell_rcu(net, name, namesz);
+		rcu_read_unlock();
+		if (!IS_ERR(cell)) {
+			if (excl) {
+				afs_put_cell(net, cell);
+				return ERR_PTR(-EEXIST);
+			}
+			goto wait_for_cell;
+		}
+	}
 
-	down_write(&afs_cells_sem);
-	read_lock(&afs_cells_lock);
-	list_for_each_entry(cell, &afs_cells, link) {
-		if (strncasecmp(cell->name, name, namesz) == 0)
-			goto duplicate_name;
+	/* Assume we're probably going to create a cell and preallocate and
+	 * mostly set up a candidate record.  We can then use this to stash the
+	 * name, the net namespace and VL server addresses.
+	 *
+	 * We also want to do this before we hold any locks as it may involve
+	 * upcalling to userspace to make DNS queries.
+	 */
+	candidate = afs_alloc_cell(net, name, namesz, vllist);
+	if (IS_ERR(candidate)) {
+		_leave(" = %ld", PTR_ERR(candidate));
+		return candidate;
 	}
-	read_unlock(&afs_cells_lock);
 
-	cell = afs_cell_alloc(name, namesz, vllist);
-	if (IS_ERR(cell)) {
-		_leave(" = %ld", PTR_ERR(cell));
-		up_write(&afs_cells_sem);
-		return cell;
+	/* Find the insertion point and check to see if someone else added a
+	 * cell whilst we were allocating.
+	 */
+	write_seqlock(&net->cells_lock);
+
+	pp = &net->cells.rb_node;
+	parent = NULL;
+	while (*pp) {
+		parent = *pp;
+		cursor = rb_entry(parent, struct afs_cell, net_node);
+
+		n = strncasecmp(cursor->name, name,
+				min_t(size_t, cursor->name_len, namesz));
+		if (n == 0)
+			n = cursor->name_len - namesz;
+		if (n < 0)
+			pp = &(*pp)->rb_left;
+		else if (n > 0)
+			pp = &(*pp)->rb_right;
+		else
+			goto cell_already_exists;
 	}
 
-	/* add a proc directory for this cell */
-	ret = afs_proc_cell_setup(cell);
-	if (ret < 0)
-		goto error;
+	cell = candidate;
+	candidate = NULL;
+	rb_link_node_rcu(&cell->net_node, parent, pp);
+	rb_insert_color(&cell->net_node, &net->cells);
+	atomic_inc(&net->cells_outstanding);
+	write_sequnlock(&net->cells_lock);
 
-#ifdef CONFIG_AFS_FSCACHE
-	/* put it up for caching (this never returns an error) */
-	cell->cache = fscache_acquire_cookie(afs_cache_netfs.primary_index,
-					     &afs_cell_cache_index_def,
-					     cell, true);
-#endif
+	queue_work(afs_wq, &cell->manager);
 
-	/* add to the cell lists */
-	write_lock(&afs_cells_lock);
-	list_add_tail(&cell->link, &afs_cells);
-	write_unlock(&afs_cells_lock);
+wait_for_cell:
+	_debug("wait_for_cell");
+	ret = wait_on_bit(&cell->flags, AFS_CELL_FL_NOT_READY, TASK_INTERRUPTIBLE);
+	smp_rmb();
 
-	down_write(&afs_proc_cells_sem);
-	list_add_tail(&cell->proc_link, &afs_proc_cells);
-	up_write(&afs_proc_cells_sem);
-	up_write(&afs_cells_sem);
+	switch (READ_ONCE(cell->state)) {
+	case AFS_CELL_FAILED:
+		ret = cell->error;
+		goto error;
+	default:
+		_debug("weird %u %d", cell->state, cell->error);
+		goto error;
+	case AFS_CELL_ACTIVE:
+		break;
+	}
 
-	_leave(" = %p", cell);
+	_leave(" = %p [cell]", cell);
 	return cell;
 
+cell_already_exists:
+	_debug("cell exists");
+	cell = cursor;
+	if (excl) {
+		ret = -EEXIST;
+	} else {
+		afs_get_cell(cursor);
+		ret = 0;
+	}
+	write_sequnlock(&net->cells_lock);
+	kfree(candidate);
+	if (ret == 0)
+		goto wait_for_cell;
+	goto error_noput;
 error:
-	up_write(&afs_cells_sem);
-	key_put(cell->anonymous_key);
-	kfree(cell);
-	_leave(" = %d", ret);
+	afs_put_cell(net, cell);
+error_noput:
+	_leave(" = %d [error]", ret);
 	return ERR_PTR(ret);
-
-duplicate_name:
-	if (retref && !IS_ERR(cell))
-		afs_get_cell(cell);
-
-	read_unlock(&afs_cells_lock);
-	up_write(&afs_cells_sem);
-
-	if (retref) {
-		_leave(" = %p", cell);
-		return cell;
-	}
-
-	_leave(" = -EEXIST");
-	return ERR_PTR(-EEXIST);
 }
 
 /*
@@ -223,10 +306,11 @@ duplicate_name:
  * - can be called with a module parameter string
  * - can be called from a write to /proc/fs/afs/rootcell
  */
-int afs_cell_init(char *rootcell)
+int afs_cell_init(struct afs_net *net, const char *rootcell)
 {
 	struct afs_cell *old_root, *new_root;
-	char *cp;
+	const char *cp, *vllist;
+	size_t len;
 
 	_enter("");
 
@@ -239,222 +323,453 @@ int afs_cell_init(char *rootcell)
 	}
 
 	cp = strchr(rootcell, ':');
-	if (!cp)
+	if (!cp) {
 		_debug("kAFS: no VL server IP addresses specified");
-	else
-		*cp++ = 0;
+		vllist = NULL;
+		len = strlen(rootcell);
+	} else {
+		vllist = cp + 1;
+		len = cp - rootcell;
+	}
 
 	/* allocate a cell record for the root cell */
-	new_root = afs_cell_create(rootcell, strlen(rootcell), cp, false);
+	new_root = afs_lookup_cell(net, rootcell, len, vllist, false);
 	if (IS_ERR(new_root)) {
 		_leave(" = %ld", PTR_ERR(new_root));
 		return PTR_ERR(new_root);
 	}
 
+	set_bit(AFS_CELL_FL_NO_GC, &new_root->flags);
+	afs_get_cell(new_root);
+
 	/* install the new cell */
-	write_lock(&afs_cells_lock);
-	old_root = afs_cell_root;
-	afs_cell_root = new_root;
-	write_unlock(&afs_cells_lock);
-	afs_put_cell(old_root);
+	write_seqlock(&net->cells_lock);
+	old_root = net->ws_cell;
+	net->ws_cell = new_root;
+	write_sequnlock(&net->cells_lock);
 
+	afs_put_cell(net, old_root);
 	_leave(" = 0");
 	return 0;
 }
 
 /*
- * lookup a cell record
+ * Update a cell's VL server address list from the DNS.
  */
-struct afs_cell *afs_cell_lookup(const char *name, unsigned namesz,
-				 bool dns_cell)
+static void afs_update_cell(struct afs_cell *cell)
 {
-	struct afs_cell *cell;
-
-	_enter("\"%*.*s\",", namesz, namesz, name ?: "");
-
-	down_read(&afs_cells_sem);
-	read_lock(&afs_cells_lock);
-
-	if (name) {
-		/* if the cell was named, look for it in the cell record list */
-		list_for_each_entry(cell, &afs_cells, link) {
-			if (strncmp(cell->name, name, namesz) == 0) {
-				afs_get_cell(cell);
-				goto found;
-			}
+	struct afs_addr_list *alist, *old;
+	time64_t now, expiry;
+
+	_enter("%s", cell->name);
+
+	alist = afs_dns_query(cell, &expiry);
+	if (IS_ERR(alist)) {
+		switch (PTR_ERR(alist)) {
+		case -ENODATA:
+			/* The DNS said that the cell does not exist */
+			set_bit(AFS_CELL_FL_NOT_FOUND, &cell->flags);
+			clear_bit(AFS_CELL_FL_DNS_FAIL, &cell->flags);
+			cell->dns_expiry = ktime_get_real_seconds() + 61;
+			break;
+
+		case -EAGAIN:
+		case -ECONNREFUSED:
+		default:
+			set_bit(AFS_CELL_FL_DNS_FAIL, &cell->flags);
+			cell->dns_expiry = ktime_get_real_seconds() + 10;
+			break;
 		}
-		cell = ERR_PTR(-ENOENT);
-		if (dns_cell)
-			goto create_cell;
-	found:
-		;
+
+		cell->error = -EDESTADDRREQ;
 	} else {
-		cell = afs_cell_root;
-		if (!cell) {
-			/* this should not happen unless user tries to mount
-			 * when root cell is not set. Return an impossibly
-			 * bizarre errno to alert the user. Things like
-			 * ENOENT might be "more appropriate" but they happen
-			 * for other reasons.
-			 */
-			cell = ERR_PTR(-EDESTADDRREQ);
-		} else {
-			afs_get_cell(cell);
-		}
+		clear_bit(AFS_CELL_FL_DNS_FAIL, &cell->flags);
+		clear_bit(AFS_CELL_FL_NOT_FOUND, &cell->flags);
+
+		/* Exclusion on changing vl_addrs is achieved by a
+		 * non-reentrant work item.
+		 */
+		old = rcu_dereference_protected(cell->vl_addrs, true);
+		rcu_assign_pointer(cell->vl_addrs, alist);
+		cell->dns_expiry = expiry;
 
+		if (old)
+			afs_put_addrlist(old);
 	}
 
-	read_unlock(&afs_cells_lock);
-	up_read(&afs_cells_sem);
-	_leave(" = %p", cell);
-	return cell;
+	if (test_and_clear_bit(AFS_CELL_FL_NO_LOOKUP_YET, &cell->flags))
+		wake_up_bit(&cell->flags, AFS_CELL_FL_NO_LOOKUP_YET);
 
-create_cell:
-	read_unlock(&afs_cells_lock);
-	up_read(&afs_cells_sem);
+	now = ktime_get_real_seconds();
+	afs_set_cell_timer(cell->net, cell->dns_expiry - now);
+	_leave("");
+}
 
-	cell = afs_cell_create(name, namesz, NULL, true);
+/*
+ * Destroy a cell record
+ */
+static void afs_cell_destroy(struct rcu_head *rcu)
+{
+	struct afs_cell *cell = container_of(rcu, struct afs_cell, rcu);
 
-	_leave(" = %p", cell);
-	return cell;
+	_enter("%p{%s}", cell, cell->name);
+
+	ASSERTCMP(atomic_read(&cell->usage), ==, 0);
+
+	afs_put_addrlist(cell->vl_addrs);
+	key_put(cell->anonymous_key);
+	kfree(cell);
+
+	_leave(" [destroyed]");
 }
 
-#if 0
 /*
- * try and get a cell record
+ * Queue the cell manager.
  */
-struct afs_cell *afs_get_cell_maybe(struct afs_cell *cell)
+static void afs_queue_cell_manager(struct afs_net *net)
 {
-	write_lock(&afs_cells_lock);
+	int outstanding = atomic_inc_return(&net->cells_outstanding);
 
-	if (cell && !list_empty(&cell->link))
-		afs_get_cell(cell);
-	else
-		cell = NULL;
+	_enter("%d", outstanding);
 
-	write_unlock(&afs_cells_lock);
-	return cell;
+	if (!queue_work(afs_wq, &net->cells_manager))
+		afs_dec_cells_outstanding(net);
 }
-#endif  /*  0  */
 
 /*
- * destroy a cell record
+ * Cell management timer.  We have an increment on cells_outstanding that we
+ * need to pass along to the work item.
  */
-void afs_put_cell(struct afs_cell *cell)
+void afs_cells_timer(struct timer_list *timer)
 {
-	if (!cell)
-		return;
+	struct afs_net *net = container_of(timer, struct afs_net, cells_timer);
 
-	_enter("%p{%d,%s}", cell, atomic_read(&cell->usage), cell->name);
+	_enter("");
+	if (!queue_work(afs_wq, &net->cells_manager))
+		afs_dec_cells_outstanding(net);
+}
 
-	ASSERTCMP(atomic_read(&cell->usage), >, 0);
+/*
+ * Get a reference on a cell record.
+ */
+struct afs_cell *afs_get_cell(struct afs_cell *cell)
+{
+	atomic_inc(&cell->usage);
+	return cell;
+}
 
-	/* to prevent a race, the decrement and the dequeue must be effectively
-	 * atomic */
-	write_lock(&afs_cells_lock);
+/*
+ * Drop a reference on a cell record.
+ */
+void afs_put_cell(struct afs_net *net, struct afs_cell *cell)
+{
+	time64_t now, expire_delay;
 
-	if (likely(!atomic_dec_and_test(&cell->usage))) {
-		write_unlock(&afs_cells_lock);
-		_leave("");
+	if (!cell)
 		return;
-	}
 
-	ASSERT(list_empty(&cell->servers));
-	ASSERT(list_empty(&cell->vl_list));
+	_enter("%s", cell->name);
 
-	write_unlock(&afs_cells_lock);
+	now = ktime_get_real_seconds();
+	cell->last_inactive = now;
+	expire_delay = 0;
+	if (!test_bit(AFS_CELL_FL_DNS_FAIL, &cell->flags) &&
+	    !test_bit(AFS_CELL_FL_NOT_FOUND, &cell->flags))
+		expire_delay = afs_cell_gc_delay;
 
-	wake_up(&afs_cells_freeable_wq);
+	if (atomic_dec_return(&cell->usage) > 1)
+		return;
 
-	_leave(" [unused]");
+	/* 'cell' may now be garbage collected. */
+	afs_set_cell_timer(net, expire_delay);
 }
 
 /*
- * destroy a cell record
- * - must be called with the afs_cells_sem write-locked
- * - cell->link should have been broken by the caller
+ * Allocate a key to use as a placeholder for anonymous user security.
  */
-static void afs_cell_destroy(struct afs_cell *cell)
+static int afs_alloc_anon_key(struct afs_cell *cell)
 {
-	_enter("%p{%d,%s}", cell, atomic_read(&cell->usage), cell->name);
+	struct key *key;
+	char keyname[4 + AFS_MAXCELLNAME + 1], *cp, *dp;
 
-	ASSERTCMP(atomic_read(&cell->usage), >=, 0);
-	ASSERT(list_empty(&cell->link));
+	/* Create a key to represent an anonymous user. */
+	memcpy(keyname, "afs@", 4);
+	dp = keyname + 4;
+	cp = cell->name;
+	do {
+		*dp++ = tolower(*cp);
+	} while (*cp++);
 
-	/* wait for everyone to stop using the cell */
-	if (atomic_read(&cell->usage) > 0) {
-		DECLARE_WAITQUEUE(myself, current);
+	key = rxrpc_get_null_key(keyname);
+	if (IS_ERR(key))
+		return PTR_ERR(key);
 
-		_debug("wait for cell %s", cell->name);
-		set_current_state(TASK_UNINTERRUPTIBLE);
-		add_wait_queue(&afs_cells_freeable_wq, &myself);
+	cell->anonymous_key = key;
 
-		while (atomic_read(&cell->usage) > 0) {
-			schedule();
-			set_current_state(TASK_UNINTERRUPTIBLE);
-		}
+	_debug("anon key %p{%x}",
+	       cell->anonymous_key, key_serial(cell->anonymous_key));
+	return 0;
+}
 
-		remove_wait_queue(&afs_cells_freeable_wq, &myself);
-		set_current_state(TASK_RUNNING);
+/*
+ * Activate a cell.
+ */
+static int afs_activate_cell(struct afs_net *net, struct afs_cell *cell)
+{
+	int ret;
+
+	if (!cell->anonymous_key) {
+		ret = afs_alloc_anon_key(cell);
+		if (ret < 0)
+			return ret;
 	}
 
-	_debug("cell dead");
-	ASSERTCMP(atomic_read(&cell->usage), ==, 0);
-	ASSERT(list_empty(&cell->servers));
-	ASSERT(list_empty(&cell->vl_list));
+#ifdef CONFIG_AFS_FSCACHE
+	cell->cache = fscache_acquire_cookie(afs_cache_netfs.primary_index,
+					     &afs_cell_cache_index_def,
+					     cell, true);
+#endif
+	ret = afs_proc_cell_setup(net, cell);
+	if (ret < 0)
+		return ret;
+	spin_lock(&net->proc_cells_lock);
+	list_add_tail(&cell->proc_link, &net->proc_cells);
+	spin_unlock(&net->proc_cells_lock);
+	return 0;
+}
 
-	afs_proc_cell_remove(cell);
+/*
+ * Deactivate a cell.
+ */
+static void afs_deactivate_cell(struct afs_net *net, struct afs_cell *cell)
+{
+	_enter("%s", cell->name);
+
+	afs_proc_cell_remove(net, cell);
 
-	down_write(&afs_proc_cells_sem);
+	spin_lock(&net->proc_cells_lock);
 	list_del_init(&cell->proc_link);
-	up_write(&afs_proc_cells_sem);
+	spin_unlock(&net->proc_cells_lock);
 
 #ifdef CONFIG_AFS_FSCACHE
 	fscache_relinquish_cookie(cell->cache, 0);
+	cell->cache = NULL;
 #endif
-	key_put(cell->anonymous_key);
-	kfree(cell);
 
-	_leave(" [destroyed]");
+	_leave("");
 }
 
 /*
- * purge in-memory cell database on module unload or afs_init() failure
- * - the timeout daemon is stopped before calling this
+ * Manage a cell record, initialising and destroying it, maintaining its DNS
+ * records.
  */
-void afs_cell_purge(void)
+static void afs_manage_cell(struct work_struct *work)
 {
-	struct afs_cell *cell;
+	struct afs_cell *cell = container_of(work, struct afs_cell, manager);
+	struct afs_net *net = cell->net;
+	bool deleted;
+	int ret, usage;
+
+	_enter("%s", cell->name);
+
+again:
+	_debug("state %u", cell->state);
+	switch (cell->state) {
+	case AFS_CELL_INACTIVE:
+	case AFS_CELL_FAILED:
+		write_seqlock(&net->cells_lock);
+		usage = 1;
+		deleted = atomic_try_cmpxchg_relaxed(&cell->usage, &usage, 0);
+		if (deleted)
+			rb_erase(&cell->net_node, &net->cells);
+		write_sequnlock(&net->cells_lock);
+		if (deleted)
+			goto final_destruction;
+		if (cell->state == AFS_CELL_FAILED)
+			goto done;
+		cell->state = AFS_CELL_UNSET;
+		goto again;
+
+	case AFS_CELL_UNSET:
+		cell->state = AFS_CELL_ACTIVATING;
+		goto again;
+
+	case AFS_CELL_ACTIVATING:
+		ret = afs_activate_cell(net, cell);
+		if (ret < 0)
+			goto activation_failed;
+
+		cell->state = AFS_CELL_ACTIVE;
+		smp_wmb();
+		clear_bit(AFS_CELL_FL_NOT_READY, &cell->flags);
+		wake_up_bit(&cell->flags, AFS_CELL_FL_NOT_READY);
+		goto again;
+
+	case AFS_CELL_ACTIVE:
+		if (atomic_read(&cell->usage) > 1) {
+			time64_t now = ktime_get_real_seconds();
+			if (cell->dns_expiry <= now && net->live)
+				afs_update_cell(cell);
+			goto done;
+		}
+		cell->state = AFS_CELL_DEACTIVATING;
+		goto again;
+
+	case AFS_CELL_DEACTIVATING:
+		set_bit(AFS_CELL_FL_NOT_READY, &cell->flags);
+		if (atomic_read(&cell->usage) > 1)
+			goto reverse_deactivation;
+		afs_deactivate_cell(net, cell);
+		cell->state = AFS_CELL_INACTIVE;
+		goto again;
+
+	default:
+		break;
+	}
+	_debug("bad state %u", cell->state);
+	BUG(); /* Unhandled state */
+
+activation_failed:
+	cell->error = ret;
+	afs_deactivate_cell(net, cell);
+
+	cell->state = AFS_CELL_FAILED;
+	smp_wmb();
+	if (test_and_clear_bit(AFS_CELL_FL_NOT_READY, &cell->flags))
+		wake_up_bit(&cell->flags, AFS_CELL_FL_NOT_READY);
+	goto again;
+
+reverse_deactivation:
+	cell->state = AFS_CELL_ACTIVE;
+	smp_wmb();
+	clear_bit(AFS_CELL_FL_NOT_READY, &cell->flags);
+	wake_up_bit(&cell->flags, AFS_CELL_FL_NOT_READY);
+	_leave(" [deact->act]");
+	return;
+
+done:
+	_leave(" [done %u]", cell->state);
+	return;
+
+final_destruction:
+	call_rcu(&cell->rcu, afs_cell_destroy);
+	afs_dec_cells_outstanding(net);
+	_leave(" [destruct %d]", atomic_read(&net->cells_outstanding));
+}
+
+/*
+ * Manage the records of cells known to a network namespace.  This includes
+ * updating the DNS records and garbage collecting unused cells that were
+ * automatically added.
+ *
+ * Note that constructed cell records may only be removed from net->cells by
+ * this work item, so it is safe for this work item to stash a cursor pointing
+ * into the tree and then return to caller (provided it skips cells that are
+ * still under construction).
+ *
+ * Note also that we were given an increment on net->cells_outstanding by
+ * whoever queued us that we need to deal with before returning.
+ */
+void afs_manage_cells(struct work_struct *work)
+{
+	struct afs_net *net = container_of(work, struct afs_net, cells_manager);
+	struct rb_node *cursor;
+	time64_t now = ktime_get_real_seconds(), next_manage = TIME64_MAX;
+	bool purging = !net->live;
 
 	_enter("");
 
-	afs_put_cell(afs_cell_root);
+	/* Trawl the cell database looking for cells that have expired from
+	 * lack of use and cells whose DNS results have expired and dispatch
+	 * their managers.
+	 */
+	read_seqlock_excl(&net->cells_lock);
 
-	down_write(&afs_cells_sem);
+	for (cursor = rb_first(&net->cells); cursor; cursor = rb_next(cursor)) {
+		struct afs_cell *cell =
+			rb_entry(cursor, struct afs_cell, net_node);
+		unsigned usage;
+		bool sched_cell = false;
 
-	while (!list_empty(&afs_cells)) {
-		cell = NULL;
+		usage = atomic_read(&cell->usage);
+		_debug("manage %s %u", cell->name, usage);
+
+		ASSERTCMP(usage, >=, 1);
 
-		/* remove the next cell from the front of the list */
-		write_lock(&afs_cells_lock);
+		if (purging) {
+			if (test_and_clear_bit(AFS_CELL_FL_NO_GC, &cell->flags))
+				usage = atomic_dec_return(&cell->usage);
+			ASSERTCMP(usage, ==, 1);
+		}
+
+		if (usage == 1) {
+			time64_t expire_at = cell->last_inactive;
+
+			if (!test_bit(AFS_CELL_FL_DNS_FAIL, &cell->flags) &&
+			    !test_bit(AFS_CELL_FL_NOT_FOUND, &cell->flags))
+				expire_at += afs_cell_gc_delay;
+			if (purging || expire_at <= now)
+				sched_cell = true;
+			else if (expire_at < next_manage)
+				next_manage = expire_at;
+		}
 
-		if (!list_empty(&afs_cells)) {
-			cell = list_entry(afs_cells.next,
-					  struct afs_cell, link);
-			list_del_init(&cell->link);
+		if (!purging) {
+			if (cell->dns_expiry <= now)
+				sched_cell = true;
+			else if (cell->dns_expiry <= next_manage)
+				next_manage = cell->dns_expiry;
 		}
 
-		write_unlock(&afs_cells_lock);
+		if (sched_cell)
+			queue_work(afs_wq, &cell->manager);
+	}
+
+	read_sequnlock_excl(&net->cells_lock);
 
-		if (cell) {
-			_debug("PURGING CELL %s (%d)",
-			       cell->name, atomic_read(&cell->usage));
+	/* Update the timer on the way out.  We have to pass an increment on
+	 * cells_outstanding in the namespace that we are in to the timer or
+	 * the work scheduler.
+	 */
+	if (!purging && next_manage < TIME64_MAX) {
+		now = ktime_get_real_seconds();
 
-			/* now the cell should be left with no references */
-			afs_cell_destroy(cell);
+		if (next_manage - now <= 0) {
+			if (queue_work(afs_wq, &net->cells_manager))
+				atomic_inc(&net->cells_outstanding);
+		} else {
+			afs_set_cell_timer(net, next_manage - now);
 		}
 	}
 
-	up_write(&afs_cells_sem);
+	afs_dec_cells_outstanding(net);
+	_leave(" [%d]", atomic_read(&net->cells_outstanding));
+}
+
+/*
+ * Purge in-memory cell database.
+ */
+void afs_cell_purge(struct afs_net *net)
+{
+	struct afs_cell *ws;
+
+	_enter("");
+
+	write_seqlock(&net->cells_lock);
+	ws = net->ws_cell;
+	net->ws_cell = NULL;
+	write_sequnlock(&net->cells_lock);
+	afs_put_cell(net, ws);
+
+	_debug("del timer");
+	if (del_timer_sync(&net->cells_timer))
+		atomic_dec(&net->cells_outstanding);
+
+	_debug("kick mgr");
+	afs_queue_cell_manager(net);
+
+	_debug("wait");
+	wait_on_atomic_t(&net->cells_outstanding, atomic_t_wait,
+			 TASK_UNINTERRUPTIBLE);
 	_leave("");
 }
diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c
index 782d4d05a53b..41e277f57b20 100644
--- a/fs/afs/cmservice.c
+++ b/fs/afs/cmservice.c
@@ -41,7 +41,6 @@ static CM_NAME(CallBack);
 static const struct afs_call_type afs_SRXCBCallBack = {
 	.name		= afs_SRXCBCallBack_name,
 	.deliver	= afs_deliver_cb_callback,
-	.abort_to_error	= afs_abort_to_error,
 	.destructor	= afs_cm_destructor,
 	.work		= SRXAFSCB_CallBack,
 };
@@ -53,7 +52,6 @@ static CM_NAME(InitCallBackState);
 static const struct afs_call_type afs_SRXCBInitCallBackState = {
 	.name		= afs_SRXCBInitCallBackState_name,
 	.deliver	= afs_deliver_cb_init_call_back_state,
-	.abort_to_error	= afs_abort_to_error,
 	.destructor	= afs_cm_destructor,
 	.work		= SRXAFSCB_InitCallBackState,
 };
@@ -65,7 +63,6 @@ static CM_NAME(InitCallBackState3);
 static const struct afs_call_type afs_SRXCBInitCallBackState3 = {
 	.name		= afs_SRXCBInitCallBackState3_name,
 	.deliver	= afs_deliver_cb_init_call_back_state3,
-	.abort_to_error	= afs_abort_to_error,
 	.destructor	= afs_cm_destructor,
 	.work		= SRXAFSCB_InitCallBackState,
 };
@@ -77,7 +74,6 @@ static CM_NAME(Probe);
 static const struct afs_call_type afs_SRXCBProbe = {
 	.name		= afs_SRXCBProbe_name,
 	.deliver	= afs_deliver_cb_probe,
-	.abort_to_error	= afs_abort_to_error,
 	.destructor	= afs_cm_destructor,
 	.work		= SRXAFSCB_Probe,
 };
@@ -89,7 +85,6 @@ static CM_NAME(ProbeUuid);
 static const struct afs_call_type afs_SRXCBProbeUuid = {
 	.name		= afs_SRXCBProbeUuid_name,
 	.deliver	= afs_deliver_cb_probe_uuid,
-	.abort_to_error	= afs_abort_to_error,
 	.destructor	= afs_cm_destructor,
 	.work		= SRXAFSCB_ProbeUuid,
 };
@@ -101,7 +96,6 @@ static CM_NAME(TellMeAboutYourself);
 static const struct afs_call_type afs_SRXCBTellMeAboutYourself = {
 	.name		= afs_SRXCBTellMeAboutYourself_name,
 	.deliver	= afs_deliver_cb_tell_me_about_yourself,
-	.abort_to_error	= afs_abort_to_error,
 	.destructor	= afs_cm_destructor,
 	.work		= SRXAFSCB_TellMeAboutYourself,
 };
@@ -127,6 +121,9 @@ bool afs_cm_incoming_call(struct afs_call *call)
 	case CBProbe:
 		call->type = &afs_SRXCBProbe;
 		return true;
+	case CBProbeUuid:
+		call->type = &afs_SRXCBProbeUuid;
+		return true;
 	case CBTellMeAboutYourself:
 		call->type = &afs_SRXCBTellMeAboutYourself;
 		return true;
@@ -147,18 +144,16 @@ static void afs_cm_destructor(struct afs_call *call)
 	 * afs_deliver_cb_callback().
 	 */
 	if (call->unmarshall == 5) {
-		ASSERT(call->server && call->count && call->request);
-		afs_break_callbacks(call->server, call->count, call->request);
+		ASSERT(call->cm_server && call->count && call->request);
+		afs_break_callbacks(call->cm_server, call->count, call->request);
 	}
 
-	afs_put_server(call->server);
-	call->server = NULL;
 	kfree(call->buffer);
 	call->buffer = NULL;
 }
 
 /*
- * allow the fileserver to see if the cache manager is still alive
+ * The server supplied a list of callbacks that it wanted to break.
  */
 static void SRXAFSCB_CallBack(struct work_struct *work)
 {
@@ -173,7 +168,7 @@ static void SRXAFSCB_CallBack(struct work_struct *work)
 	 * yet */
 	afs_send_empty_reply(call);
 
-	afs_break_callbacks(call->server, call->count, call->request);
+	afs_break_callbacks(call->cm_server, call->count, call->request);
 	afs_put_call(call);
 	_leave("");
 }
@@ -193,7 +188,6 @@ static int afs_deliver_cb_callback(struct afs_call *call)
 
 	switch (call->unmarshall) {
 	case 0:
-		rxrpc_kernel_get_peer(afs_socket, call->rxcall, &srx);
 		call->offset = 0;
 		call->unmarshall++;
 
@@ -286,14 +280,16 @@ static int afs_deliver_cb_callback(struct afs_call *call)
 		break;
 	}
 
-	call->state = AFS_CALL_REPLYING;
+	if (!afs_check_call_state(call, AFS_CALL_SV_REPLYING))
+		return -EIO;
 
 	/* we'll need the file server record as that tells us which set of
 	 * vnodes to operate upon */
-	server = afs_find_server(&srx);
+	rxrpc_kernel_get_peer(call->net->socket, call->rxcall, &srx);
+	server = afs_find_server(call->net, &srx);
 	if (!server)
 		return -ENOTCONN;
-	call->server = server;
+	call->cm_server = server;
 
 	return afs_queue_call_work(call);
 }
@@ -305,9 +301,9 @@ static void SRXAFSCB_InitCallBackState(struct work_struct *work)
 {
 	struct afs_call *call = container_of(work, struct afs_call, work);
 
-	_enter("{%p}", call->server);
+	_enter("{%p}", call->cm_server);
 
-	afs_init_callback_state(call->server);
+	afs_init_callback_state(call->cm_server);
 	afs_send_empty_reply(call);
 	afs_put_call(call);
 	_leave("");
@@ -324,21 +320,18 @@ static int afs_deliver_cb_init_call_back_state(struct afs_call *call)
 
 	_enter("");
 
-	rxrpc_kernel_get_peer(afs_socket, call->rxcall, &srx);
+	rxrpc_kernel_get_peer(call->net->socket, call->rxcall, &srx);
 
 	ret = afs_extract_data(call, NULL, 0, false);
 	if (ret < 0)
 		return ret;
 
-	/* no unmarshalling required */
-	call->state = AFS_CALL_REPLYING;
-
 	/* we'll need the file server record as that tells us which set of
 	 * vnodes to operate upon */
-	server = afs_find_server(&srx);
+	server = afs_find_server(call->net, &srx);
 	if (!server)
 		return -ENOTCONN;
-	call->server = server;
+	call->cm_server = server;
 
 	return afs_queue_call_work(call);
 }
@@ -357,8 +350,6 @@ static int afs_deliver_cb_init_call_back_state3(struct afs_call *call)
 
 	_enter("");
 
-	rxrpc_kernel_get_peer(afs_socket, call->rxcall, &srx);
-
 	_enter("{%u}", call->unmarshall);
 
 	switch (call->unmarshall) {
@@ -402,15 +393,16 @@ static int afs_deliver_cb_init_call_back_state3(struct afs_call *call)
 		break;
 	}
 
-	/* no unmarshalling required */
-	call->state = AFS_CALL_REPLYING;
+	if (!afs_check_call_state(call, AFS_CALL_SV_REPLYING))
+		return -EIO;
 
 	/* we'll need the file server record as that tells us which set of
 	 * vnodes to operate upon */
-	server = afs_find_server(&srx);
+	rxrpc_kernel_get_peer(call->net->socket, call->rxcall, &srx);
+	server = afs_find_server(call->net, &srx);
 	if (!server)
 		return -ENOTCONN;
-	call->server = server;
+	call->cm_server = server;
 
 	return afs_queue_call_work(call);
 }
@@ -441,8 +433,8 @@ static int afs_deliver_cb_probe(struct afs_call *call)
 	if (ret < 0)
 		return ret;
 
-	/* no unmarshalling required */
-	call->state = AFS_CALL_REPLYING;
+	if (!afs_check_call_state(call, AFS_CALL_SV_REPLYING))
+		return -EIO;
 
 	return afs_queue_call_work(call);
 }
@@ -461,7 +453,7 @@ static void SRXAFSCB_ProbeUuid(struct work_struct *work)
 
 	_enter("");
 
-	if (memcmp(r, &afs_uuid, sizeof(afs_uuid)) == 0)
+	if (memcmp(r, &call->net->uuid, sizeof(call->net->uuid)) == 0)
 		reply.match = htonl(0);
 	else
 		reply.match = htonl(1);
@@ -524,7 +516,8 @@ static int afs_deliver_cb_probe_uuid(struct afs_call *call)
 		break;
 	}
 
-	call->state = AFS_CALL_REPLYING;
+	if (!afs_check_call_state(call, AFS_CALL_SV_REPLYING))
+		return -EIO;
 
 	return afs_queue_call_work(call);
 }
@@ -568,13 +561,13 @@ static void SRXAFSCB_TellMeAboutYourself(struct work_struct *work)
 	memset(&reply, 0, sizeof(reply));
 	reply.ia.nifs = htonl(nifs);
 
-	reply.ia.uuid[0] = afs_uuid.time_low;
-	reply.ia.uuid[1] = htonl(ntohs(afs_uuid.time_mid));
-	reply.ia.uuid[2] = htonl(ntohs(afs_uuid.time_hi_and_version));
-	reply.ia.uuid[3] = htonl((s8) afs_uuid.clock_seq_hi_and_reserved);
-	reply.ia.uuid[4] = htonl((s8) afs_uuid.clock_seq_low);
+	reply.ia.uuid[0] = call->net->uuid.time_low;
+	reply.ia.uuid[1] = htonl(ntohs(call->net->uuid.time_mid));
+	reply.ia.uuid[2] = htonl(ntohs(call->net->uuid.time_hi_and_version));
+	reply.ia.uuid[3] = htonl((s8) call->net->uuid.clock_seq_hi_and_reserved);
+	reply.ia.uuid[4] = htonl((s8) call->net->uuid.clock_seq_low);
 	for (loop = 0; loop < 6; loop++)
-		reply.ia.uuid[loop + 5] = htonl((s8) afs_uuid.node[loop]);
+		reply.ia.uuid[loop + 5] = htonl((s8) call->net->uuid.node[loop]);
 
 	if (ifs) {
 		for (loop = 0; loop < nifs; loop++) {
@@ -605,8 +598,8 @@ static int afs_deliver_cb_tell_me_about_yourself(struct afs_call *call)
 	if (ret < 0)
 		return ret;
 
-	/* no unmarshalling required */
-	call->state = AFS_CALL_REPLYING;
+	if (!afs_check_call_state(call, AFS_CALL_SV_REPLYING))
+		return -EIO;
 
 	return afs_queue_call_work(call);
 }
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 613a77058263..ab618d32554c 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -130,10 +130,11 @@ struct afs_lookup_cookie {
 /*
  * check that a directory page is valid
  */
-static inline bool afs_dir_check_page(struct inode *dir, struct page *page)
+bool afs_dir_check_page(struct inode *dir, struct page *page)
 {
 	struct afs_dir_page *dbuf;
-	loff_t latter;
+	struct afs_vnode *vnode = AFS_FS_I(dir);
+	loff_t latter, i_size, off;
 	int tmp, qty;
 
 #if 0
@@ -150,8 +151,15 @@ static inline bool afs_dir_check_page(struct inode *dir, struct page *page)
 	}
 #endif
 
-	/* determine how many magic numbers there should be in this page */
-	latter = dir->i_size - page_offset(page);
+	/* Determine how many magic numbers there should be in this page, but
+	 * we must take care because the directory may change size under us.
+	 */
+	off = page_offset(page);
+	i_size = i_size_read(dir);
+	if (i_size <= off)
+		goto checked;
+
+	latter = i_size - off;
 	if (latter >= PAGE_SIZE)
 		qty = PAGE_SIZE;
 	else
@@ -162,13 +170,15 @@ static inline bool afs_dir_check_page(struct inode *dir, struct page *page)
 	dbuf = page_address(page);
 	for (tmp = 0; tmp < qty; tmp++) {
 		if (dbuf->blocks[tmp].pagehdr.magic != AFS_DIR_MAGIC) {
-			printk("kAFS: %s(%lu): bad magic %d/%d is %04hx\n",
+			printk("kAFS: %s(%lx): bad magic %d/%d is %04hx\n",
 			       __func__, dir->i_ino, tmp, qty,
 			       ntohs(dbuf->blocks[tmp].pagehdr.magic));
+			trace_afs_dir_check_failed(vnode, off, i_size);
 			goto error;
 		}
 	}
 
+checked:
 	SetPageChecked(page);
 	return true;
 
@@ -183,6 +193,7 @@ error:
 static inline void afs_dir_put_page(struct page *page)
 {
 	kunmap(page);
+	unlock_page(page);
 	put_page(page);
 }
 
@@ -197,9 +208,10 @@ static struct page *afs_dir_get_page(struct inode *dir, unsigned long index,
 
 	page = read_cache_page(dir->i_mapping, index, afs_page_filler, key);
 	if (!IS_ERR(page)) {
+		lock_page(page);
 		kmap(page);
 		if (unlikely(!PageChecked(page))) {
-			if (PageError(page) || !afs_dir_check_page(dir, page))
+			if (PageError(page))
 				goto fail;
 		}
 	}
@@ -384,8 +396,7 @@ out:
  */
 static int afs_readdir(struct file *file, struct dir_context *ctx)
 {
-	return afs_dir_iterate(file_inode(file), 
-			      ctx, file->private_data);
+	return afs_dir_iterate(file_inode(file), ctx, afs_file_key(file));
 }
 
 /*
@@ -553,7 +564,7 @@ static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry,
 	dentry->d_fsdata = (void *)(unsigned long) vnode->status.data_version;
 
 	/* instantiate the dentry */
-	inode = afs_iget(dir->i_sb, key, &fid, NULL, NULL);
+	inode = afs_iget(dir->i_sb, key, &fid, NULL, NULL, NULL);
 	key_put(key);
 	if (IS_ERR(inode)) {
 		_leave(" = %ld", PTR_ERR(inode));
@@ -581,6 +592,7 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags)
 	struct afs_vnode *vnode, *dir;
 	struct afs_fid uninitialized_var(fid);
 	struct dentry *parent;
+	struct inode *inode;
 	struct key *key;
 	void *dir_version;
 	int ret;
@@ -588,30 +600,39 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags)
 	if (flags & LOOKUP_RCU)
 		return -ECHILD;
 
-	vnode = AFS_FS_I(d_inode(dentry));
-
-	if (d_really_is_positive(dentry))
+	if (d_really_is_positive(dentry)) {
+		vnode = AFS_FS_I(d_inode(dentry));
 		_enter("{v={%x:%u} n=%pd fl=%lx},",
 		       vnode->fid.vid, vnode->fid.vnode, dentry,
 		       vnode->flags);
-	else
+	} else {
 		_enter("{neg n=%pd}", dentry);
+	}
 
 	key = afs_request_key(AFS_FS_S(dentry->d_sb)->volume->cell);
 	if (IS_ERR(key))
 		key = NULL;
 
+	if (d_really_is_positive(dentry)) {
+		inode = d_inode(dentry);
+		if (inode) {
+			vnode = AFS_FS_I(inode);
+			afs_validate(vnode, key);
+			if (test_bit(AFS_VNODE_DELETED, &vnode->flags))
+				goto out_bad;
+		}
+	}
+
 	/* lock down the parent dentry so we can peer at it */
 	parent = dget_parent(dentry);
 	dir = AFS_FS_I(d_inode(parent));
 
 	/* validate the parent directory */
-	if (test_bit(AFS_VNODE_MODIFIED, &dir->flags))
-		afs_validate(dir, key);
+	afs_validate(dir, key);
 
 	if (test_bit(AFS_VNODE_DELETED, &dir->flags)) {
 		_debug("%pd: parent dir deleted", dentry);
-		goto out_bad;
+		goto out_bad_parent;
 	}
 
 	dir_version = (void *) (unsigned long) dir->status.data_version;
@@ -626,13 +647,16 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags)
 	case 0:
 		/* the filename maps to something */
 		if (d_really_is_negative(dentry))
-			goto out_bad;
-		if (is_bad_inode(d_inode(dentry))) {
+			goto out_bad_parent;
+		inode = d_inode(dentry);
+		if (is_bad_inode(inode)) {
 			printk("kAFS: afs_d_revalidate: %pd2 has bad inode\n",
 			       dentry);
-			goto out_bad;
+			goto out_bad_parent;
 		}
 
+		vnode = AFS_FS_I(inode);
+
 		/* if the vnode ID has changed, then the dirent points to a
 		 * different file */
 		if (fid.vnode != vnode->fid.vnode) {
@@ -649,10 +673,10 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags)
 			_debug("%pd: file deleted (uq %u -> %u I:%u)",
 			       dentry, fid.unique,
 			       vnode->fid.unique,
-			       d_inode(dentry)->i_generation);
-			spin_lock(&vnode->lock);
+			       vnode->vfs_inode.i_generation);
+			write_seqlock(&vnode->cb_lock);
 			set_bit(AFS_VNODE_DELETED, &vnode->flags);
-			spin_unlock(&vnode->lock);
+			write_sequnlock(&vnode->cb_lock);
 			goto not_found;
 		}
 		goto out_valid;
@@ -667,7 +691,7 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags)
 	default:
 		_debug("failed to iterate dir %pd: %d",
 		       parent, ret);
-		goto out_bad;
+		goto out_bad_parent;
 	}
 
 out_valid:
@@ -683,9 +707,10 @@ not_found:
 	dentry->d_flags |= DCACHE_NFSFS_RENAMED;
 	spin_unlock(&dentry->d_lock);
 
-out_bad:
+out_bad_parent:
 	_debug("dropping dentry %pd2", dentry);
 	dput(parent);
+out_bad:
 	key_put(key);
 
 	_leave(" = 0 [bad]");
@@ -727,20 +752,48 @@ static void afs_d_release(struct dentry *dentry)
 }
 
 /*
+ * Create a new inode for create/mkdir/symlink
+ */
+static void afs_vnode_new_inode(struct afs_fs_cursor *fc,
+				struct dentry *new_dentry,
+				struct afs_fid *newfid,
+				struct afs_file_status *newstatus,
+				struct afs_callback *newcb)
+{
+	struct inode *inode;
+
+	if (fc->ac.error < 0)
+		return;
+
+	inode = afs_iget(fc->vnode->vfs_inode.i_sb, fc->key,
+			 newfid, newstatus, newcb, fc->cbi);
+	if (IS_ERR(inode)) {
+		/* ENOMEM or EINTR at a really inconvenient time - just abandon
+		 * the new directory on the server.
+		 */
+		fc->ac.error = PTR_ERR(inode);
+		return;
+	}
+
+	d_instantiate(new_dentry, inode);
+	if (d_unhashed(new_dentry))
+		d_rehash(new_dentry);
+}
+
+/*
  * create a directory on an AFS filesystem
  */
 static int afs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 {
-	struct afs_file_status status;
-	struct afs_callback cb;
-	struct afs_server *server;
-	struct afs_vnode *dvnode, *vnode;
-	struct afs_fid fid;
-	struct inode *inode;
+	struct afs_file_status newstatus;
+	struct afs_fs_cursor fc;
+	struct afs_callback newcb;
+	struct afs_vnode *dvnode = AFS_FS_I(dir);
+	struct afs_fid newfid;
 	struct key *key;
 	int ret;
 
-	dvnode = AFS_FS_I(dir);
+	mode |= S_IFDIR;
 
 	_enter("{%x:%u},{%pd},%ho",
 	       dvnode->fid.vid, dvnode->fid.vnode, dentry, mode);
@@ -751,40 +804,27 @@ static int afs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 		goto error;
 	}
 
-	mode |= S_IFDIR;
-	ret = afs_vnode_create(dvnode, key, dentry->d_name.name,
-			       mode, &fid, &status, &cb, &server);
-	if (ret < 0)
-		goto mkdir_error;
+	ret = -ERESTARTSYS;
+	if (afs_begin_vnode_operation(&fc, dvnode, key)) {
+		while (afs_select_fileserver(&fc)) {
+			fc.cb_break = dvnode->cb_break + dvnode->cb_s_break;
+			afs_fs_create(&fc, dentry->d_name.name, mode,
+				      &newfid, &newstatus, &newcb);
+		}
 
-	inode = afs_iget(dir->i_sb, key, &fid, &status, &cb);
-	if (IS_ERR(inode)) {
-		/* ENOMEM at a really inconvenient time - just abandon the new
-		 * directory on the server */
-		ret = PTR_ERR(inode);
-		goto iget_error;
+		afs_check_for_remote_deletion(&fc, fc.vnode);
+		afs_vnode_commit_status(&fc, dvnode, fc.cb_break);
+		afs_vnode_new_inode(&fc, dentry, &newfid, &newstatus, &newcb);
+		ret = afs_end_vnode_operation(&fc);
+		if (ret < 0)
+			goto error_key;
 	}
 
-	/* apply the status report we've got for the new vnode */
-	vnode = AFS_FS_I(inode);
-	spin_lock(&vnode->lock);
-	vnode->update_cnt++;
-	spin_unlock(&vnode->lock);
-	afs_vnode_finalise_status_update(vnode, server);
-	afs_put_server(server);
-
-	d_instantiate(dentry, inode);
-	if (d_unhashed(dentry)) {
-		_debug("not hashed");
-		d_rehash(dentry);
-	}
 	key_put(key);
 	_leave(" = 0");
 	return 0;
 
-iget_error:
-	afs_put_server(server);
-mkdir_error:
+error_key:
 	key_put(key);
 error:
 	d_drop(dentry);
@@ -793,16 +833,29 @@ error:
 }
 
 /*
+ * Remove a subdir from a directory.
+ */
+static void afs_dir_remove_subdir(struct dentry *dentry)
+{
+	if (d_really_is_positive(dentry)) {
+		struct afs_vnode *vnode = AFS_FS_I(d_inode(dentry));
+
+		clear_nlink(&vnode->vfs_inode);
+		set_bit(AFS_VNODE_DELETED, &vnode->flags);
+		clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags);
+	}
+}
+
+/*
  * remove a directory from an AFS filesystem
  */
 static int afs_rmdir(struct inode *dir, struct dentry *dentry)
 {
-	struct afs_vnode *dvnode, *vnode;
+	struct afs_fs_cursor fc;
+	struct afs_vnode *dvnode = AFS_FS_I(dir);
 	struct key *key;
 	int ret;
 
-	dvnode = AFS_FS_I(dir);
-
 	_enter("{%x:%u},{%pd}",
 	       dvnode->fid.vid, dvnode->fid.vnode, dentry);
 
@@ -812,45 +865,69 @@ static int afs_rmdir(struct inode *dir, struct dentry *dentry)
 		goto error;
 	}
 
-	ret = afs_vnode_remove(dvnode, key, dentry->d_name.name, true);
-	if (ret < 0)
-		goto rmdir_error;
+	ret = -ERESTARTSYS;
+	if (afs_begin_vnode_operation(&fc, dvnode, key)) {
+		while (afs_select_fileserver(&fc)) {
+			fc.cb_break = dvnode->cb_break + dvnode->cb_s_break;
+			afs_fs_remove(&fc, dentry->d_name.name, true);
+		}
 
-	if (d_really_is_positive(dentry)) {
-		vnode = AFS_FS_I(d_inode(dentry));
-		clear_nlink(&vnode->vfs_inode);
-		set_bit(AFS_VNODE_DELETED, &vnode->flags);
-		afs_discard_callback_on_delete(vnode);
+		afs_vnode_commit_status(&fc, dvnode, fc.cb_break);
+		ret = afs_end_vnode_operation(&fc);
+		if (ret == 0)
+			afs_dir_remove_subdir(dentry);
 	}
 
 	key_put(key);
-	_leave(" = 0");
-	return 0;
-
-rmdir_error:
-	key_put(key);
 error:
-	_leave(" = %d", ret);
 	return ret;
 }
 
 /*
- * remove a file from an AFS filesystem
+ * Remove a link to a file or symlink from a directory.
+ *
+ * If the file was not deleted due to excess hard links, the fileserver will
+ * break the callback promise on the file - if it had one - before it returns
+ * to us, and if it was deleted, it won't
+ *
+ * However, if we didn't have a callback promise outstanding, or it was
+ * outstanding on a different server, then it won't break it either...
+ */
+static int afs_dir_remove_link(struct dentry *dentry, struct key *key)
+{
+	int ret = 0;
+
+	if (d_really_is_positive(dentry)) {
+		struct afs_vnode *vnode = AFS_FS_I(d_inode(dentry));
+
+		if (test_bit(AFS_VNODE_DELETED, &vnode->flags))
+			kdebug("AFS_VNODE_DELETED");
+		clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags);
+
+		ret = afs_validate(vnode, key);
+		if (ret == -ESTALE)
+			ret = 0;
+		_debug("nlink %d [val %d]", vnode->vfs_inode.i_nlink, ret);
+	}
+
+	return ret;
+}
+
+/*
+ * Remove a file or symlink from an AFS filesystem.
  */
 static int afs_unlink(struct inode *dir, struct dentry *dentry)
 {
-	struct afs_vnode *dvnode, *vnode;
+	struct afs_fs_cursor fc;
+	struct afs_vnode *dvnode = AFS_FS_I(dir), *vnode;
 	struct key *key;
 	int ret;
 
-	dvnode = AFS_FS_I(dir);
-
 	_enter("{%x:%u},{%pd}",
 	       dvnode->fid.vid, dvnode->fid.vnode, dentry);
 
-	ret = -ENAMETOOLONG;
 	if (dentry->d_name.len >= AFSNAMEMAX)
-		goto error;
+		return -ENAMETOOLONG;
 
 	key = afs_request_key(dvnode->volume->cell);
 	if (IS_ERR(key)) {
@@ -858,44 +935,28 @@ static int afs_unlink(struct inode *dir, struct dentry *dentry)
 		goto error;
 	}
 
+	/* Try to make sure we have a callback promise on the victim. */
 	if (d_really_is_positive(dentry)) {
 		vnode = AFS_FS_I(d_inode(dentry));
-
-		/* make sure we have a callback promise on the victim */
 		ret = afs_validate(vnode, key);
 		if (ret < 0)
-			goto error;
+			goto error_key;
 	}
 
-	ret = afs_vnode_remove(dvnode, key, dentry->d_name.name, false);
-	if (ret < 0)
-		goto remove_error;
+	ret = -ERESTARTSYS;
+	if (afs_begin_vnode_operation(&fc, dvnode, key)) {
+		while (afs_select_fileserver(&fc)) {
+			fc.cb_break = dvnode->cb_break + dvnode->cb_s_break;
+			afs_fs_remove(&fc, dentry->d_name.name, false);
+		}
 
-	if (d_really_is_positive(dentry)) {
-		/* if the file wasn't deleted due to excess hard links, the
-		 * fileserver will break the callback promise on the file - if
-		 * it had one - before it returns to us, and if it was deleted,
-		 * it won't
-		 *
-		 * however, if we didn't have a callback promise outstanding,
-		 * or it was outstanding on a different server, then it won't
-		 * break it either...
-		 */
-		vnode = AFS_FS_I(d_inode(dentry));
-		if (test_bit(AFS_VNODE_DELETED, &vnode->flags))
-			_debug("AFS_VNODE_DELETED");
-		if (test_bit(AFS_VNODE_CB_BROKEN, &vnode->flags))
-			_debug("AFS_VNODE_CB_BROKEN");
-		set_bit(AFS_VNODE_CB_BROKEN, &vnode->flags);
-		ret = afs_validate(vnode, key);
-		_debug("nlink %d [val %d]", vnode->vfs_inode.i_nlink, ret);
+		afs_vnode_commit_status(&fc, dvnode, fc.cb_break);
+		ret = afs_end_vnode_operation(&fc);
+		if (ret == 0)
+			ret = afs_dir_remove_link(dentry, key);
 	}
 
-	key_put(key);
-	_leave(" = 0");
-	return 0;
-
-remove_error:
+error_key:
 	key_put(key);
 error:
 	_leave(" = %d", ret);
@@ -908,60 +969,50 @@ error:
 static int afs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
 		      bool excl)
 {
-	struct afs_file_status status;
-	struct afs_callback cb;
-	struct afs_server *server;
-	struct afs_vnode *dvnode, *vnode;
-	struct afs_fid fid;
-	struct inode *inode;
+	struct afs_fs_cursor fc;
+	struct afs_file_status newstatus;
+	struct afs_callback newcb;
+	struct afs_vnode *dvnode = dvnode = AFS_FS_I(dir);
+	struct afs_fid newfid;
 	struct key *key;
 	int ret;
 
-	dvnode = AFS_FS_I(dir);
+	mode |= S_IFREG;
 
 	_enter("{%x:%u},{%pd},%ho,",
 	       dvnode->fid.vid, dvnode->fid.vnode, dentry, mode);
 
+	ret = -ENAMETOOLONG;
+	if (dentry->d_name.len >= AFSNAMEMAX)
+		goto error;
+
 	key = afs_request_key(dvnode->volume->cell);
 	if (IS_ERR(key)) {
 		ret = PTR_ERR(key);
 		goto error;
 	}
 
-	mode |= S_IFREG;
-	ret = afs_vnode_create(dvnode, key, dentry->d_name.name,
-			       mode, &fid, &status, &cb, &server);
-	if (ret < 0)
-		goto create_error;
+	ret = -ERESTARTSYS;
+	if (afs_begin_vnode_operation(&fc, dvnode, key)) {
+		while (afs_select_fileserver(&fc)) {
+			fc.cb_break = dvnode->cb_break + dvnode->cb_s_break;
+			afs_fs_create(&fc, dentry->d_name.name, mode,
+				      &newfid, &newstatus, &newcb);
+		}
 
-	inode = afs_iget(dir->i_sb, key, &fid, &status, &cb);
-	if (IS_ERR(inode)) {
-		/* ENOMEM at a really inconvenient time - just abandon the new
-		 * directory on the server */
-		ret = PTR_ERR(inode);
-		goto iget_error;
+		afs_check_for_remote_deletion(&fc, fc.vnode);
+		afs_vnode_commit_status(&fc, dvnode, fc.cb_break);
+		afs_vnode_new_inode(&fc, dentry, &newfid, &newstatus, &newcb);
+		ret = afs_end_vnode_operation(&fc);
+		if (ret < 0)
+			goto error_key;
 	}
 
-	/* apply the status report we've got for the new vnode */
-	vnode = AFS_FS_I(inode);
-	spin_lock(&vnode->lock);
-	vnode->update_cnt++;
-	spin_unlock(&vnode->lock);
-	afs_vnode_finalise_status_update(vnode, server);
-	afs_put_server(server);
-
-	d_instantiate(dentry, inode);
-	if (d_unhashed(dentry)) {
-		_debug("not hashed");
-		d_rehash(dentry);
-	}
 	key_put(key);
 	_leave(" = 0");
 	return 0;
 
-iget_error:
-	afs_put_server(server);
-create_error:
+error_key:
 	key_put(key);
 error:
 	d_drop(dentry);
@@ -975,6 +1026,7 @@ error:
 static int afs_link(struct dentry *from, struct inode *dir,
 		    struct dentry *dentry)
 {
+	struct afs_fs_cursor fc;
 	struct afs_vnode *dvnode, *vnode;
 	struct key *key;
 	int ret;
@@ -987,23 +1039,45 @@ static int afs_link(struct dentry *from, struct inode *dir,
 	       dvnode->fid.vid, dvnode->fid.vnode,
 	       dentry);
 
+	ret = -ENAMETOOLONG;
+	if (dentry->d_name.len >= AFSNAMEMAX)
+		goto error;
+
 	key = afs_request_key(dvnode->volume->cell);
 	if (IS_ERR(key)) {
 		ret = PTR_ERR(key);
 		goto error;
 	}
 
-	ret = afs_vnode_link(dvnode, vnode, key, dentry->d_name.name);
-	if (ret < 0)
-		goto link_error;
+	ret = -ERESTARTSYS;
+	if (afs_begin_vnode_operation(&fc, dvnode, key)) {
+		if (mutex_lock_interruptible_nested(&vnode->io_lock, 1) < 0) {
+			afs_end_vnode_operation(&fc);
+			return -ERESTARTSYS;
+		}
+
+		while (afs_select_fileserver(&fc)) {
+			fc.cb_break = dvnode->cb_break + dvnode->cb_s_break;
+			fc.cb_break_2 = vnode->cb_break + vnode->cb_s_break;
+			afs_fs_link(&fc, vnode, dentry->d_name.name);
+		}
+
+		afs_vnode_commit_status(&fc, dvnode, fc.cb_break);
+		afs_vnode_commit_status(&fc, vnode, fc.cb_break_2);
+		ihold(&vnode->vfs_inode);
+		d_instantiate(dentry, &vnode->vfs_inode);
+
+		mutex_unlock(&vnode->io_lock);
+		ret = afs_end_vnode_operation(&fc);
+		if (ret < 0)
+			goto error_key;
+	}
 
-	ihold(&vnode->vfs_inode);
-	d_instantiate(dentry, &vnode->vfs_inode);
 	key_put(key);
 	_leave(" = 0");
 	return 0;
 
-link_error:
+error_key:
 	key_put(key);
 error:
 	d_drop(dentry);
@@ -1017,20 +1091,21 @@ error:
 static int afs_symlink(struct inode *dir, struct dentry *dentry,
 		       const char *content)
 {
-	struct afs_file_status status;
-	struct afs_server *server;
-	struct afs_vnode *dvnode, *vnode;
-	struct afs_fid fid;
-	struct inode *inode;
+	struct afs_fs_cursor fc;
+	struct afs_file_status newstatus;
+	struct afs_vnode *dvnode = AFS_FS_I(dir);
+	struct afs_fid newfid;
 	struct key *key;
 	int ret;
 
-	dvnode = AFS_FS_I(dir);
-
 	_enter("{%x:%u},{%pd},%s",
 	       dvnode->fid.vid, dvnode->fid.vnode, dentry,
 	       content);
 
+	ret = -ENAMETOOLONG;
+	if (dentry->d_name.len >= AFSNAMEMAX)
+		goto error;
+
 	ret = -EINVAL;
 	if (strlen(content) >= AFSPATHMAX)
 		goto error;
@@ -1041,39 +1116,27 @@ static int afs_symlink(struct inode *dir, struct dentry *dentry,
 		goto error;
 	}
 
-	ret = afs_vnode_symlink(dvnode, key, dentry->d_name.name, content,
-				&fid, &status, &server);
-	if (ret < 0)
-		goto create_error;
+	ret = -ERESTARTSYS;
+	if (afs_begin_vnode_operation(&fc, dvnode, key)) {
+		while (afs_select_fileserver(&fc)) {
+			fc.cb_break = dvnode->cb_break + dvnode->cb_s_break;
+			afs_fs_symlink(&fc, dentry->d_name.name, content,
+				       &newfid, &newstatus);
+		}
 
-	inode = afs_iget(dir->i_sb, key, &fid, &status, NULL);
-	if (IS_ERR(inode)) {
-		/* ENOMEM at a really inconvenient time - just abandon the new
-		 * directory on the server */
-		ret = PTR_ERR(inode);
-		goto iget_error;
+		afs_check_for_remote_deletion(&fc, fc.vnode);
+		afs_vnode_commit_status(&fc, dvnode, fc.cb_break);
+		afs_vnode_new_inode(&fc, dentry, &newfid, &newstatus, NULL);
+		ret = afs_end_vnode_operation(&fc);
+		if (ret < 0)
+			goto error_key;
 	}
 
-	/* apply the status report we've got for the new vnode */
-	vnode = AFS_FS_I(inode);
-	spin_lock(&vnode->lock);
-	vnode->update_cnt++;
-	spin_unlock(&vnode->lock);
-	afs_vnode_finalise_status_update(vnode, server);
-	afs_put_server(server);
-
-	d_instantiate(dentry, inode);
-	if (d_unhashed(dentry)) {
-		_debug("not hashed");
-		d_rehash(dentry);
-	}
 	key_put(key);
 	_leave(" = 0");
 	return 0;
 
-iget_error:
-	afs_put_server(server);
-create_error:
+error_key:
 	key_put(key);
 error:
 	d_drop(dentry);
@@ -1088,6 +1151,7 @@ static int afs_rename(struct inode *old_dir, struct dentry *old_dentry,
 		      struct inode *new_dir, struct dentry *new_dentry,
 		      unsigned int flags)
 {
+	struct afs_fs_cursor fc;
 	struct afs_vnode *orig_dvnode, *new_dvnode, *vnode;
 	struct key *key;
 	int ret;
@@ -1111,16 +1175,35 @@ static int afs_rename(struct inode *old_dir, struct dentry *old_dentry,
 		goto error;
 	}
 
-	ret = afs_vnode_rename(orig_dvnode, new_dvnode, key,
-			       old_dentry->d_name.name,
-			       new_dentry->d_name.name);
-	if (ret < 0)
-		goto rename_error;
+	ret = -ERESTARTSYS;
+	if (afs_begin_vnode_operation(&fc, orig_dvnode, key)) {
+		if (orig_dvnode != new_dvnode) {
+			if (mutex_lock_interruptible_nested(&new_dvnode->io_lock, 1) < 0) {
+				afs_end_vnode_operation(&fc);
+				return -ERESTARTSYS;
+			}
+		}
+		while (afs_select_fileserver(&fc)) {
+			fc.cb_break = orig_dvnode->cb_break + orig_dvnode->cb_s_break;
+			fc.cb_break_2 = new_dvnode->cb_break + new_dvnode->cb_s_break;
+			afs_fs_rename(&fc, old_dentry->d_name.name,
+				      new_dvnode, new_dentry->d_name.name);
+		}
+
+		afs_vnode_commit_status(&fc, orig_dvnode, fc.cb_break);
+		afs_vnode_commit_status(&fc, new_dvnode, fc.cb_break_2);
+		if (orig_dvnode != new_dvnode)
+			mutex_unlock(&new_dvnode->io_lock);
+		ret = afs_end_vnode_operation(&fc);
+		if (ret < 0)
+			goto error_key;
+	}
+
 	key_put(key);
 	_leave(" = 0");
 	return 0;
 
-rename_error:
+error_key:
 	key_put(key);
 error:
 	d_drop(new_dentry);
diff --git a/fs/afs/file.c b/fs/afs/file.c
index 510cba15fa56..a39192ced99e 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -19,11 +19,11 @@
 #include <linux/task_io_accounting_ops.h>
 #include "internal.h"
 
+static int afs_file_mmap(struct file *file, struct vm_area_struct *vma);
 static int afs_readpage(struct file *file, struct page *page);
 static void afs_invalidatepage(struct page *page, unsigned int offset,
 			       unsigned int length);
 static int afs_releasepage(struct page *page, gfp_t gfp_flags);
-static int afs_launder_page(struct page *page);
 
 static int afs_readpages(struct file *filp, struct address_space *mapping,
 			 struct list_head *pages, unsigned nr_pages);
@@ -35,7 +35,7 @@ const struct file_operations afs_file_operations = {
 	.llseek		= generic_file_llseek,
 	.read_iter	= generic_file_read_iter,
 	.write_iter	= afs_file_write,
-	.mmap		= generic_file_readonly_mmap,
+	.mmap		= afs_file_mmap,
 	.splice_read	= generic_file_splice_read,
 	.fsync		= afs_fsync,
 	.lock		= afs_lock,
@@ -62,12 +62,63 @@ const struct address_space_operations afs_fs_aops = {
 	.writepages	= afs_writepages,
 };
 
+static const struct vm_operations_struct afs_vm_ops = {
+	.fault		= filemap_fault,
+	.map_pages	= filemap_map_pages,
+	.page_mkwrite	= afs_page_mkwrite,
+};
+
+/*
+ * Discard a pin on a writeback key.
+ */
+void afs_put_wb_key(struct afs_wb_key *wbk)
+{
+	if (refcount_dec_and_test(&wbk->usage)) {
+		key_put(wbk->key);
+		kfree(wbk);
+	}
+}
+
+/*
+ * Cache key for writeback.
+ */
+int afs_cache_wb_key(struct afs_vnode *vnode, struct afs_file *af)
+{
+	struct afs_wb_key *wbk, *p;
+
+	wbk = kzalloc(sizeof(struct afs_wb_key), GFP_KERNEL);
+	if (!wbk)
+		return -ENOMEM;
+	refcount_set(&wbk->usage, 2);
+	wbk->key = af->key;
+
+	spin_lock(&vnode->wb_lock);
+	list_for_each_entry(p, &vnode->wb_keys, vnode_link) {
+		if (p->key == wbk->key)
+			goto found;
+	}
+
+	key_get(wbk->key);
+	list_add_tail(&wbk->vnode_link, &vnode->wb_keys);
+	spin_unlock(&vnode->wb_lock);
+	af->wb = wbk;
+	return 0;
+
+found:
+	refcount_inc(&p->usage);
+	spin_unlock(&vnode->wb_lock);
+	af->wb = p;
+	kfree(wbk);
+	return 0;
+}
+
 /*
  * open an AFS file or directory and attach a key to it
  */
 int afs_open(struct inode *inode, struct file *file)
 {
 	struct afs_vnode *vnode = AFS_FS_I(inode);
+	struct afs_file *af;
 	struct key *key;
 	int ret;
 
@@ -75,19 +126,38 @@ int afs_open(struct inode *inode, struct file *file)
 
 	key = afs_request_key(vnode->volume->cell);
 	if (IS_ERR(key)) {
-		_leave(" = %ld [key]", PTR_ERR(key));
-		return PTR_ERR(key);
+		ret = PTR_ERR(key);
+		goto error;
 	}
 
-	ret = afs_validate(vnode, key);
-	if (ret < 0) {
-		_leave(" = %d [val]", ret);
-		return ret;
+	af = kzalloc(sizeof(*af), GFP_KERNEL);
+	if (!af) {
+		ret = -ENOMEM;
+		goto error_key;
 	}
+	af->key = key;
+
+	ret = afs_validate(vnode, key);
+	if (ret < 0)
+		goto error_af;
 
-	file->private_data = key;
+	if (file->f_mode & FMODE_WRITE) {
+		ret = afs_cache_wb_key(vnode, af);
+		if (ret < 0)
+			goto error_af;
+	}
+	
+	file->private_data = af;
 	_leave(" = 0");
 	return 0;
+
+error_af:
+	kfree(af);
+error_key:
+	key_put(key);
+error:
+	_leave(" = %d", ret);
+	return ret;
 }
 
 /*
@@ -96,10 +166,16 @@ int afs_open(struct inode *inode, struct file *file)
 int afs_release(struct inode *inode, struct file *file)
 {
 	struct afs_vnode *vnode = AFS_FS_I(inode);
+	struct afs_file *af = file->private_data;
 
 	_enter("{%x:%u},", vnode->fid.vid, vnode->fid.vnode);
 
-	key_put(file->private_data);
+	file->private_data = NULL;
+	if (af->wb)
+		afs_put_wb_key(af->wb);
+	key_put(af->key);
+	kfree(af);
+	afs_prune_wb_keys(vnode);
 	_leave(" = 0");
 	return 0;
 }
@@ -138,6 +214,37 @@ static void afs_file_readpage_read_complete(struct page *page,
 #endif
 
 /*
+ * Fetch file data from the volume.
+ */
+int afs_fetch_data(struct afs_vnode *vnode, struct key *key, struct afs_read *desc)
+{
+	struct afs_fs_cursor fc;
+	int ret;
+
+	_enter("%s{%x:%u.%u},%x,,,",
+	       vnode->volume->name,
+	       vnode->fid.vid,
+	       vnode->fid.vnode,
+	       vnode->fid.unique,
+	       key_serial(key));
+
+	ret = -ERESTARTSYS;
+	if (afs_begin_vnode_operation(&fc, vnode, key)) {
+		while (afs_select_fileserver(&fc)) {
+			fc.cb_break = vnode->cb_break + vnode->cb_s_break;
+			afs_fs_fetch_data(&fc, desc);
+		}
+
+		afs_check_for_remote_deletion(&fc, fc.vnode);
+		afs_vnode_commit_status(&fc, vnode, fc.cb_break);
+		ret = afs_end_vnode_operation(&fc);
+	}
+
+	_leave(" = %d", ret);
+	return ret;
+}
+
+/*
  * read page from file, directory or symlink, given a key to use
  */
 int afs_page_filler(void *data, struct page *page)
@@ -199,8 +306,13 @@ int afs_page_filler(void *data, struct page *page)
 
 		/* read the contents of the file from the server into the
 		 * page */
-		ret = afs_vnode_fetch_data(vnode, key, req);
+		ret = afs_fetch_data(vnode, key, req);
 		afs_put_read(req);
+
+		if (ret >= 0 && S_ISDIR(inode->i_mode) &&
+		    !afs_dir_check_page(inode, page))
+			ret = -EIO;
+
 		if (ret < 0) {
 			if (ret == -ENOENT) {
 				_debug("got NOENT from server"
@@ -259,12 +371,12 @@ static int afs_readpage(struct file *file, struct page *page)
 	int ret;
 
 	if (file) {
-		key = file->private_data;
+		key = afs_file_key(file);
 		ASSERT(key != NULL);
 		ret = afs_page_filler(key, page);
 	} else {
 		struct inode *inode = page->mapping->host;
-		key = afs_request_key(AFS_FS_S(inode->i_sb)->volume->cell);
+		key = afs_request_key(AFS_FS_S(inode->i_sb)->cell);
 		if (IS_ERR(key)) {
 			ret = PTR_ERR(key);
 		} else {
@@ -281,7 +393,7 @@ static int afs_readpage(struct file *file, struct page *page)
 static void afs_readpages_page_done(struct afs_call *call, struct afs_read *req)
 {
 #ifdef CONFIG_AFS_FSCACHE
-	struct afs_vnode *vnode = call->reply;
+	struct afs_vnode *vnode = call->reply[0];
 #endif
 	struct page *page = req->pages[req->index];
 
@@ -310,7 +422,7 @@ static int afs_readpages_one(struct file *file, struct address_space *mapping,
 	struct afs_read *req;
 	struct list_head *p;
 	struct page *first, *page;
-	struct key *key = file->private_data;
+	struct key *key = afs_file_key(file);
 	pgoff_t index;
 	int ret, n, i;
 
@@ -369,7 +481,7 @@ static int afs_readpages_one(struct file *file, struct address_space *mapping,
 		return 0;
 	}
 
-	ret = afs_vnode_fetch_data(vnode, key, req);
+	ret = afs_fetch_data(vnode, key, req);
 	if (ret < 0)
 		goto error;
 
@@ -406,7 +518,7 @@ error:
 static int afs_readpages(struct file *file, struct address_space *mapping,
 			 struct list_head *pages, unsigned nr_pages)
 {
-	struct key *key = file->private_data;
+	struct key *key = afs_file_key(file);
 	struct afs_vnode *vnode;
 	int ret = 0;
 
@@ -464,16 +576,6 @@ static int afs_readpages(struct file *file, struct address_space *mapping,
 }
 
 /*
- * write back a dirty page
- */
-static int afs_launder_page(struct page *page)
-{
-	_enter("{%lu}", page->index);
-
-	return 0;
-}
-
-/*
  * invalidate part or all of a page
  * - release a page and clean up its private data if offset is 0 (indicating
  *   the entire page)
@@ -481,7 +583,8 @@ static int afs_launder_page(struct page *page)
 static void afs_invalidatepage(struct page *page, unsigned int offset,
 			       unsigned int length)
 {
-	struct afs_writeback *wb = (struct afs_writeback *) page_private(page);
+	struct afs_vnode *vnode = AFS_FS_I(page->mapping->host);
+	unsigned long priv;
 
 	_enter("{%lu},%u,%u", page->index, offset, length);
 
@@ -498,13 +601,11 @@ static void afs_invalidatepage(struct page *page, unsigned int offset,
 #endif
 
 		if (PagePrivate(page)) {
-			if (wb && !PageWriteback(page)) {
-				set_page_private(page, 0);
-				afs_put_writeback(wb);
-			}
-
-			if (!page_private(page))
-				ClearPagePrivate(page);
+			priv = page_private(page);
+			trace_afs_page_dirty(vnode, tracepoint_string("inval"),
+					     page->index, priv);
+			set_page_private(page, 0);
+			ClearPagePrivate(page);
 		}
 	}
 
@@ -517,8 +618,8 @@ static void afs_invalidatepage(struct page *page, unsigned int offset,
  */
 static int afs_releasepage(struct page *page, gfp_t gfp_flags)
 {
-	struct afs_writeback *wb = (struct afs_writeback *) page_private(page);
 	struct afs_vnode *vnode = AFS_FS_I(page->mapping->host);
+	unsigned long priv;
 
 	_enter("{{%x:%u}[%lu],%lx},%x",
 	       vnode->fid.vid, vnode->fid.vnode, page->index, page->flags,
@@ -534,10 +635,10 @@ static int afs_releasepage(struct page *page, gfp_t gfp_flags)
 #endif
 
 	if (PagePrivate(page)) {
-		if (wb) {
-			set_page_private(page, 0);
-			afs_put_writeback(wb);
-		}
+		priv = page_private(page);
+		trace_afs_page_dirty(vnode, tracepoint_string("rel"),
+				     page->index, priv);
+		set_page_private(page, 0);
 		ClearPagePrivate(page);
 	}
 
@@ -545,3 +646,16 @@ static int afs_releasepage(struct page *page, gfp_t gfp_flags)
 	_leave(" = T");
 	return 1;
 }
+
+/*
+ * Handle setting up a memory mapping on an AFS file.
+ */
+static int afs_file_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	int ret;
+
+	ret = generic_file_mmap(file, vma);
+	if (ret == 0)
+		vma->vm_ops = &afs_vm_ops;
+	return ret;
+}
diff --git a/fs/afs/flock.c b/fs/afs/flock.c
index 3191dff2c156..7571a5dfd5a3 100644
--- a/fs/afs/flock.c
+++ b/fs/afs/flock.c
@@ -14,48 +14,17 @@
 #define AFS_LOCK_GRANTED	0
 #define AFS_LOCK_PENDING	1
 
+struct workqueue_struct *afs_lock_manager;
+
 static void afs_fl_copy_lock(struct file_lock *new, struct file_lock *fl);
 static void afs_fl_release_private(struct file_lock *fl);
 
-static struct workqueue_struct *afs_lock_manager;
-static DEFINE_MUTEX(afs_lock_manager_mutex);
-
 static const struct file_lock_operations afs_lock_ops = {
 	.fl_copy_lock		= afs_fl_copy_lock,
 	.fl_release_private	= afs_fl_release_private,
 };
 
 /*
- * initialise the lock manager thread if it isn't already running
- */
-static int afs_init_lock_manager(void)
-{
-	int ret;
-
-	ret = 0;
-	if (!afs_lock_manager) {
-		mutex_lock(&afs_lock_manager_mutex);
-		if (!afs_lock_manager) {
-			afs_lock_manager = alloc_workqueue("kafs_lockd",
-							   WQ_MEM_RECLAIM, 0);
-			if (!afs_lock_manager)
-				ret = -ENOMEM;
-		}
-		mutex_unlock(&afs_lock_manager_mutex);
-	}
-	return ret;
-}
-
-/*
- * destroy the lock manager thread if it's running
- */
-void __exit afs_kill_lock_manager(void)
-{
-	if (afs_lock_manager)
-		destroy_workqueue(afs_lock_manager);
-}
-
-/*
  * if the callback is broken on this vnode, then the lock may now be available
  */
 void afs_lock_may_be_available(struct afs_vnode *vnode)
@@ -99,6 +68,100 @@ static void afs_grant_locks(struct afs_vnode *vnode, struct file_lock *fl)
 }
 
 /*
+ * Get a lock on a file
+ */
+static int afs_set_lock(struct afs_vnode *vnode, struct key *key,
+			afs_lock_type_t type)
+{
+	struct afs_fs_cursor fc;
+	int ret;
+
+	_enter("%s{%x:%u.%u},%x,%u",
+	       vnode->volume->name,
+	       vnode->fid.vid,
+	       vnode->fid.vnode,
+	       vnode->fid.unique,
+	       key_serial(key), type);
+
+	ret = -ERESTARTSYS;
+	if (afs_begin_vnode_operation(&fc, vnode, key)) {
+		while (afs_select_fileserver(&fc)) {
+			fc.cb_break = vnode->cb_break + vnode->cb_s_break;
+			afs_fs_set_lock(&fc, type);
+		}
+
+		afs_check_for_remote_deletion(&fc, fc.vnode);
+		afs_vnode_commit_status(&fc, vnode, fc.cb_break);
+		ret = afs_end_vnode_operation(&fc);
+	}
+
+	_leave(" = %d", ret);
+	return ret;
+}
+
+/*
+ * Extend a lock on a file
+ */
+static int afs_extend_lock(struct afs_vnode *vnode, struct key *key)
+{
+	struct afs_fs_cursor fc;
+	int ret;
+
+	_enter("%s{%x:%u.%u},%x",
+	       vnode->volume->name,
+	       vnode->fid.vid,
+	       vnode->fid.vnode,
+	       vnode->fid.unique,
+	       key_serial(key));
+
+	ret = -ERESTARTSYS;
+	if (afs_begin_vnode_operation(&fc, vnode, key)) {
+		while (afs_select_current_fileserver(&fc)) {
+			fc.cb_break = vnode->cb_break + vnode->cb_s_break;
+			afs_fs_extend_lock(&fc);
+		}
+
+		afs_check_for_remote_deletion(&fc, fc.vnode);
+		afs_vnode_commit_status(&fc, vnode, fc.cb_break);
+		ret = afs_end_vnode_operation(&fc);
+	}
+
+	_leave(" = %d", ret);
+	return ret;
+}
+
+/*
+ * Release a lock on a file
+ */
+static int afs_release_lock(struct afs_vnode *vnode, struct key *key)
+{
+	struct afs_fs_cursor fc;
+	int ret;
+
+	_enter("%s{%x:%u.%u},%x",
+	       vnode->volume->name,
+	       vnode->fid.vid,
+	       vnode->fid.vnode,
+	       vnode->fid.unique,
+	       key_serial(key));
+
+	ret = -ERESTARTSYS;
+	if (afs_begin_vnode_operation(&fc, vnode, key)) {
+		while (afs_select_current_fileserver(&fc)) {
+			fc.cb_break = vnode->cb_break + vnode->cb_s_break;
+			afs_fs_release_lock(&fc);
+		}
+
+		afs_check_for_remote_deletion(&fc, fc.vnode);
+		afs_vnode_commit_status(&fc, vnode, fc.cb_break);
+		ret = afs_end_vnode_operation(&fc);
+	}
+
+	_leave(" = %d", ret);
+	return ret;
+}
+
+/*
  * do work for a lock, including:
  * - probing for a lock we're waiting on but didn't get immediately
  * - extending a lock that's close to timing out
@@ -122,7 +185,7 @@ void afs_lock_work(struct work_struct *work)
 
 		/* attempt to release the server lock; if it fails, we just
 		 * wait 5 minutes and it'll time out anyway */
-		ret = afs_vnode_release_lock(vnode, vnode->unlock_key);
+		ret = afs_release_lock(vnode, vnode->unlock_key);
 		if (ret < 0)
 			printk(KERN_WARNING "AFS:"
 			       " Failed to release lock on {%x:%x} error %d\n",
@@ -143,10 +206,10 @@ void afs_lock_work(struct work_struct *work)
 			BUG();
 		fl = list_entry(vnode->granted_locks.next,
 				struct file_lock, fl_u.afs.link);
-		key = key_get(fl->fl_file->private_data);
+		key = key_get(afs_file_key(fl->fl_file));
 		spin_unlock(&vnode->lock);
 
-		ret = afs_vnode_extend_lock(vnode, key);
+		ret = afs_extend_lock(vnode, key);
 		clear_bit(AFS_VNODE_LOCKING, &vnode->flags);
 		key_put(key);
 		switch (ret) {
@@ -177,12 +240,12 @@ void afs_lock_work(struct work_struct *work)
 			BUG();
 		fl = list_entry(vnode->pending_locks.next,
 				struct file_lock, fl_u.afs.link);
-		key = key_get(fl->fl_file->private_data);
+		key = key_get(afs_file_key(fl->fl_file));
 		type = (fl->fl_type == F_RDLCK) ?
 			AFS_LOCK_READ : AFS_LOCK_WRITE;
 		spin_unlock(&vnode->lock);
 
-		ret = afs_vnode_set_lock(vnode, key, type);
+		ret = afs_set_lock(vnode, key, type);
 		clear_bit(AFS_VNODE_LOCKING, &vnode->flags);
 		switch (ret) {
 		case -EWOULDBLOCK:
@@ -213,7 +276,7 @@ void afs_lock_work(struct work_struct *work)
 				clear_bit(AFS_VNODE_READLOCKED, &vnode->flags);
 				clear_bit(AFS_VNODE_WRITELOCKED, &vnode->flags);
 				spin_unlock(&vnode->lock);
-				afs_vnode_release_lock(vnode, key);
+				afs_release_lock(vnode, key);
 				if (!list_empty(&vnode->pending_locks))
 					afs_lock_may_be_available(vnode);
 			}
@@ -255,7 +318,7 @@ static int afs_do_setlk(struct file *file, struct file_lock *fl)
 	struct inode *inode = file_inode(file);
 	struct afs_vnode *vnode = AFS_FS_I(inode);
 	afs_lock_type_t type;
-	struct key *key = file->private_data;
+	struct key *key = afs_file_key(file);
 	int ret;
 
 	_enter("{%x:%u},%u", vnode->fid.vid, vnode->fid.vnode, fl->fl_type);
@@ -264,10 +327,6 @@ static int afs_do_setlk(struct file *file, struct file_lock *fl)
 	if (fl->fl_start != 0 || fl->fl_end != OFFSET_MAX)
 		return -EINVAL;
 
-	ret = afs_init_lock_manager();
-	if (ret < 0)
-		return ret;
-
 	fl->fl_ops = &afs_lock_ops;
 	INIT_LIST_HEAD(&fl->fl_u.afs.link);
 	fl->fl_u.afs.state = AFS_LOCK_PENDING;
@@ -278,7 +337,7 @@ static int afs_do_setlk(struct file *file, struct file_lock *fl)
 
 	/* make sure we've got a callback on this file and that our view of the
 	 * data version is up to date */
-	ret = afs_vnode_fetch_status(vnode, NULL, key);
+	ret = afs_validate(vnode, key);
 	if (ret < 0)
 		goto error;
 
@@ -315,7 +374,7 @@ static int afs_do_setlk(struct file *file, struct file_lock *fl)
 		set_bit(AFS_VNODE_LOCKING, &vnode->flags);
 		spin_unlock(&vnode->lock);
 
-		ret = afs_vnode_set_lock(vnode, key, type);
+		ret = afs_set_lock(vnode, key, type);
 		clear_bit(AFS_VNODE_LOCKING, &vnode->flags);
 		switch (ret) {
 		case 0:
@@ -418,7 +477,7 @@ given_lock:
 	/* again, make sure we've got a callback on this file and, again, make
 	 * sure that our view of the data version is up to date (we ignore
 	 * errors incurred here and deal with the consequences elsewhere) */
-	afs_vnode_fetch_status(vnode, NULL, key);
+	afs_validate(vnode, key);
 
 error:
 	spin_unlock(&inode->i_lock);
@@ -441,7 +500,7 @@ vfs_rejected_lock:
 static int afs_do_unlk(struct file *file, struct file_lock *fl)
 {
 	struct afs_vnode *vnode = AFS_FS_I(file->f_mapping->host);
-	struct key *key = file->private_data;
+	struct key *key = afs_file_key(file);
 	int ret;
 
 	_enter("{%x:%u},%u", vnode->fid.vid, vnode->fid.vnode, fl->fl_type);
@@ -476,7 +535,7 @@ static int afs_do_unlk(struct file *file, struct file_lock *fl)
 static int afs_do_getlk(struct file *file, struct file_lock *fl)
 {
 	struct afs_vnode *vnode = AFS_FS_I(file->f_mapping->host);
-	struct key *key = file->private_data;
+	struct key *key = afs_file_key(file);
 	int ret, lock_count;
 
 	_enter("");
@@ -490,7 +549,7 @@ static int afs_do_getlk(struct file *file, struct file_lock *fl)
 	posix_test_lock(file, fl);
 	if (fl->fl_type == F_UNLCK) {
 		/* no local locks; consult the server */
-		ret = afs_vnode_fetch_status(vnode, NULL, key);
+		ret = afs_fetch_status(vnode, key);
 		if (ret < 0)
 			goto error;
 		lock_count = vnode->status.lock_count;
diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c
index 19f76ae36982..b90ef39ae914 100644
--- a/fs/afs/fsclient.c
+++ b/fs/afs/fsclient.c
@@ -16,12 +16,19 @@
 #include "internal.h"
 #include "afs_fs.h"
 
+static const struct afs_fid afs_zero_fid;
+
 /*
  * We need somewhere to discard into in case the server helpfully returns more
  * than we asked for in FS.FetchData{,64}.
  */
 static u8 afs_discard_buffer[64];
 
+static inline void afs_use_fs_server(struct afs_call *call, struct afs_cb_interest *cbi)
+{
+	call->cbi = afs_get_cb_interest(cbi);
+}
+
 /*
  * decode an AFSFid block
  */
@@ -47,14 +54,18 @@ static void xdr_decode_AFSFetchStatus(const __be32 **_bp,
 	const __be32 *bp = *_bp;
 	umode_t mode;
 	u64 data_version, size;
-	u32 changed = 0; /* becomes non-zero if ctime-type changes seen */
+	bool changed = false;
 	kuid_t owner;
 	kgid_t group;
 
+	if (vnode)
+		write_seqlock(&vnode->cb_lock);
+
 #define EXTRACT(DST)				\
 	do {					\
 		u32 x = ntohl(*bp++);		\
-		changed |= DST - x;		\
+		if (DST != x)			\
+			changed |= true;	\
 		DST = x;			\
 	} while (0)
 
@@ -70,8 +81,8 @@ static void xdr_decode_AFSFetchStatus(const __be32 **_bp,
 	EXTRACT(status->caller_access); /* call ticket dependent */
 	EXTRACT(status->anon_access);
 	EXTRACT(status->mode);
-	EXTRACT(status->parent.vnode);
-	EXTRACT(status->parent.unique);
+	bp++; /* parent.vnode */
+	bp++; /* parent.unique */
 	bp++; /* seg size */
 	status->mtime_client = ntohl(*bp++);
 	status->mtime_server = ntohl(*bp++);
@@ -95,7 +106,6 @@ static void xdr_decode_AFSFetchStatus(const __be32 **_bp,
 	       status->mtime_client, status->mtime_server);
 
 	if (vnode) {
-		status->parent.vid = vnode->fid.vid;
 		if (changed && !test_bit(AFS_VNODE_UNSET, &vnode->flags)) {
 			_debug("vnode changed");
 			i_size_write(&vnode->vfs_inode, size);
@@ -127,25 +137,47 @@ static void xdr_decode_AFSFetchStatus(const __be32 **_bp,
 			_debug("vnode modified %llx on {%x:%u}",
 			       (unsigned long long) data_version,
 			       vnode->fid.vid, vnode->fid.vnode);
-			set_bit(AFS_VNODE_MODIFIED, &vnode->flags);
+			set_bit(AFS_VNODE_DIR_MODIFIED, &vnode->flags);
 			set_bit(AFS_VNODE_ZAP_DATA, &vnode->flags);
 		}
 	} else if (store_version) {
 		status->data_version = data_version;
 	}
+
+	if (vnode)
+		write_sequnlock(&vnode->cb_lock);
 }
 
 /*
  * decode an AFSCallBack block
  */
-static void xdr_decode_AFSCallBack(const __be32 **_bp, struct afs_vnode *vnode)
+static void xdr_decode_AFSCallBack(struct afs_call *call,
+				   struct afs_vnode *vnode,
+				   const __be32 **_bp)
 {
+	struct afs_cb_interest *old, *cbi = call->cbi;
 	const __be32 *bp = *_bp;
+	u32 cb_expiry;
+
+	write_seqlock(&vnode->cb_lock);
+
+	if (call->cb_break == (vnode->cb_break + cbi->server->cb_s_break)) {
+		vnode->cb_version	= ntohl(*bp++);
+		cb_expiry		= ntohl(*bp++);
+		vnode->cb_type		= ntohl(*bp++);
+		vnode->cb_expires_at	= cb_expiry + ktime_get_real_seconds();
+		old = vnode->cb_interest;
+		if (old != call->cbi) {
+			vnode->cb_interest = cbi;
+			cbi = old;
+		}
+		set_bit(AFS_VNODE_CB_PROMISED, &vnode->flags);
+	} else {
+		bp += 3;
+	}
 
-	vnode->cb_version	= ntohl(*bp++);
-	vnode->cb_expiry	= ntohl(*bp++);
-	vnode->cb_type		= ntohl(*bp++);
-	vnode->cb_expires	= vnode->cb_expiry + ktime_get_real_seconds();
+	write_sequnlock(&vnode->cb_lock);
+	call->cbi = cbi;
 	*_bp = bp;
 }
 
@@ -243,22 +275,22 @@ static void xdr_decode_AFSFetchVolumeStatus(const __be32 **_bp,
  */
 static int afs_deliver_fs_fetch_status(struct afs_call *call)
 {
-	struct afs_vnode *vnode = call->reply;
+	struct afs_vnode *vnode = call->reply[0];
 	const __be32 *bp;
 	int ret;
 
-	_enter("");
-
 	ret = afs_transfer_reply(call);
 	if (ret < 0)
 		return ret;
 
+	_enter("{%x:%u}", vnode->fid.vid, vnode->fid.vnode);
+
 	/* unmarshall the reply once we've received all of it */
 	bp = call->buffer;
 	xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode, NULL);
-	xdr_decode_AFSCallBack(&bp, vnode);
-	if (call->reply2)
-		xdr_decode_AFSVolSync(&bp, call->reply2);
+	xdr_decode_AFSCallBack(call, vnode, &bp);
+	if (call->reply[1])
+		xdr_decode_AFSVolSync(&bp, call->reply[1]);
 
 	_leave(" = 0 [done]");
 	return 0;
@@ -269,35 +301,33 @@ static int afs_deliver_fs_fetch_status(struct afs_call *call)
  */
 static const struct afs_call_type afs_RXFSFetchStatus = {
 	.name		= "FS.FetchStatus",
+	.op		= afs_FS_FetchStatus,
 	.deliver	= afs_deliver_fs_fetch_status,
-	.abort_to_error	= afs_abort_to_error,
 	.destructor	= afs_flat_call_destructor,
 };
 
 /*
  * fetch the status information for a file
  */
-int afs_fs_fetch_file_status(struct afs_server *server,
-			     struct key *key,
-			     struct afs_vnode *vnode,
-			     struct afs_volsync *volsync,
-			     bool async)
+int afs_fs_fetch_file_status(struct afs_fs_cursor *fc, struct afs_volsync *volsync)
 {
+	struct afs_vnode *vnode = fc->vnode;
 	struct afs_call *call;
+	struct afs_net *net = afs_v2net(vnode);
 	__be32 *bp;
 
 	_enter(",%x,{%x:%u},,",
-	       key_serial(key), vnode->fid.vid, vnode->fid.vnode);
+	       key_serial(fc->key), vnode->fid.vid, vnode->fid.vnode);
 
-	call = afs_alloc_flat_call(&afs_RXFSFetchStatus, 16, (21 + 3 + 6) * 4);
-	if (!call)
+	call = afs_alloc_flat_call(net, &afs_RXFSFetchStatus, 16, (21 + 3 + 6) * 4);
+	if (!call) {
+		fc->ac.error = -ENOMEM;
 		return -ENOMEM;
+	}
 
-	call->key = key;
-	call->reply = vnode;
-	call->reply2 = volsync;
-	call->service_id = FS_SERVICE;
-	call->port = htons(AFS_FS_PORT);
+	call->key = fc->key;
+	call->reply[0] = vnode;
+	call->reply[1] = volsync;
 
 	/* marshall the parameters */
 	bp = call->request;
@@ -306,7 +336,10 @@ int afs_fs_fetch_file_status(struct afs_server *server,
 	bp[2] = htonl(vnode->fid.vnode);
 	bp[3] = htonl(vnode->fid.unique);
 
-	return afs_make_call(&server->addr, call, GFP_NOFS, async);
+	call->cb_break = fc->cb_break;
+	afs_use_fs_server(call, fc->cbi);
+	trace_afs_make_fs_call(call, &vnode->fid);
+	return afs_make_call(&fc->ac, call, GFP_NOFS, false);
 }
 
 /*
@@ -314,8 +347,8 @@ int afs_fs_fetch_file_status(struct afs_server *server,
  */
 static int afs_deliver_fs_fetch_data(struct afs_call *call)
 {
-	struct afs_vnode *vnode = call->reply;
-	struct afs_read *req = call->reply3;
+	struct afs_vnode *vnode = call->reply[0];
+	struct afs_read *req = call->reply[2];
 	const __be32 *bp;
 	unsigned int size;
 	void *buffer;
@@ -431,9 +464,9 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call)
 
 		bp = call->buffer;
 		xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode, NULL);
-		xdr_decode_AFSCallBack(&bp, vnode);
-		if (call->reply2)
-			xdr_decode_AFSVolSync(&bp, call->reply2);
+		xdr_decode_AFSCallBack(call, vnode, &bp);
+		if (call->reply[1])
+			xdr_decode_AFSVolSync(&bp, call->reply[1]);
 
 		call->offset = 0;
 		call->unmarshall++;
@@ -457,7 +490,7 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call)
 
 static void afs_fetch_data_destructor(struct afs_call *call)
 {
-	struct afs_read *req = call->reply3;
+	struct afs_read *req = call->reply[2];
 
 	afs_put_read(req);
 	afs_flat_call_destructor(call);
@@ -468,43 +501,38 @@ static void afs_fetch_data_destructor(struct afs_call *call)
  */
 static const struct afs_call_type afs_RXFSFetchData = {
 	.name		= "FS.FetchData",
+	.op		= afs_FS_FetchData,
 	.deliver	= afs_deliver_fs_fetch_data,
-	.abort_to_error	= afs_abort_to_error,
 	.destructor	= afs_fetch_data_destructor,
 };
 
 static const struct afs_call_type afs_RXFSFetchData64 = {
 	.name		= "FS.FetchData64",
+	.op		= afs_FS_FetchData64,
 	.deliver	= afs_deliver_fs_fetch_data,
-	.abort_to_error	= afs_abort_to_error,
 	.destructor	= afs_fetch_data_destructor,
 };
 
 /*
  * fetch data from a very large file
  */
-static int afs_fs_fetch_data64(struct afs_server *server,
-			       struct key *key,
-			       struct afs_vnode *vnode,
-			       struct afs_read *req,
-			       bool async)
+static int afs_fs_fetch_data64(struct afs_fs_cursor *fc, struct afs_read *req)
 {
+	struct afs_vnode *vnode = fc->vnode;
 	struct afs_call *call;
+	struct afs_net *net = afs_v2net(vnode);
 	__be32 *bp;
 
 	_enter("");
 
-	call = afs_alloc_flat_call(&afs_RXFSFetchData64, 32, (21 + 3 + 6) * 4);
+	call = afs_alloc_flat_call(net, &afs_RXFSFetchData64, 32, (21 + 3 + 6) * 4);
 	if (!call)
 		return -ENOMEM;
 
-	call->key = key;
-	call->reply = vnode;
-	call->reply2 = NULL; /* volsync */
-	call->reply3 = req;
-	call->service_id = FS_SERVICE;
-	call->port = htons(AFS_FS_PORT);
-	call->operation_ID = FSFETCHDATA64;
+	call->key = fc->key;
+	call->reply[0] = vnode;
+	call->reply[1] = NULL; /* volsync */
+	call->reply[2] = req;
 
 	/* marshall the parameters */
 	bp = call->request;
@@ -518,39 +546,37 @@ static int afs_fs_fetch_data64(struct afs_server *server,
 	bp[7] = htonl(lower_32_bits(req->len));
 
 	atomic_inc(&req->usage);
-	return afs_make_call(&server->addr, call, GFP_NOFS, async);
+	call->cb_break = fc->cb_break;
+	afs_use_fs_server(call, fc->cbi);
+	trace_afs_make_fs_call(call, &vnode->fid);
+	return afs_make_call(&fc->ac, call, GFP_NOFS, false);
 }
 
 /*
  * fetch data from a file
  */
-int afs_fs_fetch_data(struct afs_server *server,
-		      struct key *key,
-		      struct afs_vnode *vnode,
-		      struct afs_read *req,
-		      bool async)
+int afs_fs_fetch_data(struct afs_fs_cursor *fc, struct afs_read *req)
 {
+	struct afs_vnode *vnode = fc->vnode;
 	struct afs_call *call;
+	struct afs_net *net = afs_v2net(vnode);
 	__be32 *bp;
 
 	if (upper_32_bits(req->pos) ||
 	    upper_32_bits(req->len) ||
 	    upper_32_bits(req->pos + req->len))
-		return afs_fs_fetch_data64(server, key, vnode, req, async);
+		return afs_fs_fetch_data64(fc, req);
 
 	_enter("");
 
-	call = afs_alloc_flat_call(&afs_RXFSFetchData, 24, (21 + 3 + 6) * 4);
+	call = afs_alloc_flat_call(net, &afs_RXFSFetchData, 24, (21 + 3 + 6) * 4);
 	if (!call)
 		return -ENOMEM;
 
-	call->key = key;
-	call->reply = vnode;
-	call->reply2 = NULL; /* volsync */
-	call->reply3 = req;
-	call->service_id = FS_SERVICE;
-	call->port = htons(AFS_FS_PORT);
-	call->operation_ID = FSFETCHDATA;
+	call->key = fc->key;
+	call->reply[0] = vnode;
+	call->reply[1] = NULL; /* volsync */
+	call->reply[2] = req;
 
 	/* marshall the parameters */
 	bp = call->request;
@@ -562,90 +588,10 @@ int afs_fs_fetch_data(struct afs_server *server,
 	bp[5] = htonl(lower_32_bits(req->len));
 
 	atomic_inc(&req->usage);
-	return afs_make_call(&server->addr, call, GFP_NOFS, async);
-}
-
-/*
- * deliver reply data to an FS.GiveUpCallBacks
- */
-static int afs_deliver_fs_give_up_callbacks(struct afs_call *call)
-{
-	_enter("");
-
-	/* shouldn't be any reply data */
-	return afs_extract_data(call, NULL, 0, false);
-}
-
-/*
- * FS.GiveUpCallBacks operation type
- */
-static const struct afs_call_type afs_RXFSGiveUpCallBacks = {
-	.name		= "FS.GiveUpCallBacks",
-	.deliver	= afs_deliver_fs_give_up_callbacks,
-	.abort_to_error	= afs_abort_to_error,
-	.destructor	= afs_flat_call_destructor,
-};
-
-/*
- * give up a set of callbacks
- * - the callbacks are held in the server->cb_break ring
- */
-int afs_fs_give_up_callbacks(struct afs_server *server,
-			     bool async)
-{
-	struct afs_call *call;
-	size_t ncallbacks;
-	__be32 *bp, *tp;
-	int loop;
-
-	ncallbacks = CIRC_CNT(server->cb_break_head, server->cb_break_tail,
-			      ARRAY_SIZE(server->cb_break));
-
-	_enter("{%zu},", ncallbacks);
-
-	if (ncallbacks == 0)
-		return 0;
-	if (ncallbacks > AFSCBMAX)
-		ncallbacks = AFSCBMAX;
-
-	_debug("break %zu callbacks", ncallbacks);
-
-	call = afs_alloc_flat_call(&afs_RXFSGiveUpCallBacks,
-				   12 + ncallbacks * 6 * 4, 0);
-	if (!call)
-		return -ENOMEM;
-
-	call->service_id = FS_SERVICE;
-	call->port = htons(AFS_FS_PORT);
-
-	/* marshall the parameters */
-	bp = call->request;
-	tp = bp + 2 + ncallbacks * 3;
-	*bp++ = htonl(FSGIVEUPCALLBACKS);
-	*bp++ = htonl(ncallbacks);
-	*tp++ = htonl(ncallbacks);
-
-	atomic_sub(ncallbacks, &server->cb_break_n);
-	for (loop = ncallbacks; loop > 0; loop--) {
-		struct afs_callback *cb =
-			&server->cb_break[server->cb_break_tail];
-
-		*bp++ = htonl(cb->fid.vid);
-		*bp++ = htonl(cb->fid.vnode);
-		*bp++ = htonl(cb->fid.unique);
-		*tp++ = htonl(cb->version);
-		*tp++ = htonl(cb->expiry);
-		*tp++ = htonl(cb->type);
-		smp_mb();
-		server->cb_break_tail =
-			(server->cb_break_tail + 1) &
-			(ARRAY_SIZE(server->cb_break) - 1);
-	}
-
-	ASSERT(ncallbacks > 0);
-	wake_up_nr(&server->cb_break_waitq, ncallbacks);
-
-	return afs_make_call(&server->addr, call, GFP_NOFS, async);
+	call->cb_break = fc->cb_break;
+	afs_use_fs_server(call, fc->cbi);
+	trace_afs_make_fs_call(call, &vnode->fid);
+	return afs_make_call(&fc->ac, call, GFP_NOFS, false);
 }
 
 /*
@@ -653,7 +599,7 @@ int afs_fs_give_up_callbacks(struct afs_server *server,
  */
 static int afs_deliver_fs_create_vnode(struct afs_call *call)
 {
-	struct afs_vnode *vnode = call->reply;
+	struct afs_vnode *vnode = call->reply[0];
 	const __be32 *bp;
 	int ret;
 
@@ -665,11 +611,11 @@ static int afs_deliver_fs_create_vnode(struct afs_call *call)
 
 	/* unmarshall the reply once we've received all of it */
 	bp = call->buffer;
-	xdr_decode_AFSFid(&bp, call->reply2);
-	xdr_decode_AFSFetchStatus(&bp, call->reply3, NULL, NULL);
+	xdr_decode_AFSFid(&bp, call->reply[1]);
+	xdr_decode_AFSFetchStatus(&bp, call->reply[2], NULL, NULL);
 	xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode, NULL);
-	xdr_decode_AFSCallBack_raw(&bp, call->reply4);
-	/* xdr_decode_AFSVolSync(&bp, call->replyX); */
+	xdr_decode_AFSCallBack_raw(&bp, call->reply[3]);
+	/* xdr_decode_AFSVolSync(&bp, call->reply[X]); */
 
 	_leave(" = 0 [done]");
 	return 0;
@@ -678,27 +624,33 @@ static int afs_deliver_fs_create_vnode(struct afs_call *call)
 /*
  * FS.CreateFile and FS.MakeDir operation type
  */
-static const struct afs_call_type afs_RXFSCreateXXXX = {
-	.name		= "FS.CreateXXXX",
+static const struct afs_call_type afs_RXFSCreateFile = {
+	.name		= "FS.CreateFile",
+	.op		= afs_FS_CreateFile,
+	.deliver	= afs_deliver_fs_create_vnode,
+	.destructor	= afs_flat_call_destructor,
+};
+
+static const struct afs_call_type afs_RXFSMakeDir = {
+	.name		= "FS.MakeDir",
+	.op		= afs_FS_MakeDir,
 	.deliver	= afs_deliver_fs_create_vnode,
-	.abort_to_error	= afs_abort_to_error,
 	.destructor	= afs_flat_call_destructor,
 };
 
 /*
  * create a file or make a directory
  */
-int afs_fs_create(struct afs_server *server,
-		  struct key *key,
-		  struct afs_vnode *vnode,
+int afs_fs_create(struct afs_fs_cursor *fc,
 		  const char *name,
 		  umode_t mode,
 		  struct afs_fid *newfid,
 		  struct afs_file_status *newstatus,
-		  struct afs_callback *newcb,
-		  bool async)
+		  struct afs_callback *newcb)
 {
+	struct afs_vnode *vnode = fc->vnode;
 	struct afs_call *call;
+	struct afs_net *net = afs_v2net(vnode);
 	size_t namesz, reqsz, padsz;
 	__be32 *bp;
 
@@ -708,18 +660,17 @@ int afs_fs_create(struct afs_server *server,
 	padsz = (4 - (namesz & 3)) & 3;
 	reqsz = (5 * 4) + namesz + padsz + (6 * 4);
 
-	call = afs_alloc_flat_call(&afs_RXFSCreateXXXX, reqsz,
-				   (3 + 21 + 21 + 3 + 6) * 4);
+	call = afs_alloc_flat_call(
+		net, S_ISDIR(mode) ? &afs_RXFSMakeDir : &afs_RXFSCreateFile,
+		reqsz, (3 + 21 + 21 + 3 + 6) * 4);
 	if (!call)
 		return -ENOMEM;
 
-	call->key = key;
-	call->reply = vnode;
-	call->reply2 = newfid;
-	call->reply3 = newstatus;
-	call->reply4 = newcb;
-	call->service_id = FS_SERVICE;
-	call->port = htons(AFS_FS_PORT);
+	call->key = fc->key;
+	call->reply[0] = vnode;
+	call->reply[1] = newfid;
+	call->reply[2] = newstatus;
+	call->reply[3] = newcb;
 
 	/* marshall the parameters */
 	bp = call->request;
@@ -741,7 +692,9 @@ int afs_fs_create(struct afs_server *server,
 	*bp++ = htonl(mode & S_IALLUGO); /* unix mode */
 	*bp++ = 0; /* segment size */
 
-	return afs_make_call(&server->addr, call, GFP_NOFS, async);
+	afs_use_fs_server(call, fc->cbi);
+	trace_afs_make_fs_call(call, &vnode->fid);
+	return afs_make_call(&fc->ac, call, GFP_NOFS, false);
 }
 
 /*
@@ -749,7 +702,7 @@ int afs_fs_create(struct afs_server *server,
  */
 static int afs_deliver_fs_remove(struct afs_call *call)
 {
-	struct afs_vnode *vnode = call->reply;
+	struct afs_vnode *vnode = call->reply[0];
 	const __be32 *bp;
 	int ret;
 
@@ -762,7 +715,7 @@ static int afs_deliver_fs_remove(struct afs_call *call)
 	/* unmarshall the reply once we've received all of it */
 	bp = call->buffer;
 	xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode, NULL);
-	/* xdr_decode_AFSVolSync(&bp, call->replyX); */
+	/* xdr_decode_AFSVolSync(&bp, call->reply[X]); */
 
 	_leave(" = 0 [done]");
 	return 0;
@@ -771,24 +724,28 @@ static int afs_deliver_fs_remove(struct afs_call *call)
 /*
  * FS.RemoveDir/FS.RemoveFile operation type
  */
-static const struct afs_call_type afs_RXFSRemoveXXXX = {
-	.name		= "FS.RemoveXXXX",
+static const struct afs_call_type afs_RXFSRemoveFile = {
+	.name		= "FS.RemoveFile",
+	.op		= afs_FS_RemoveFile,
+	.deliver	= afs_deliver_fs_remove,
+	.destructor	= afs_flat_call_destructor,
+};
+
+static const struct afs_call_type afs_RXFSRemoveDir = {
+	.name		= "FS.RemoveDir",
+	.op		= afs_FS_RemoveDir,
 	.deliver	= afs_deliver_fs_remove,
-	.abort_to_error	= afs_abort_to_error,
 	.destructor	= afs_flat_call_destructor,
 };
 
 /*
  * remove a file or directory
  */
-int afs_fs_remove(struct afs_server *server,
-		  struct key *key,
-		  struct afs_vnode *vnode,
-		  const char *name,
-		  bool isdir,
-		  bool async)
+int afs_fs_remove(struct afs_fs_cursor *fc, const char *name, bool isdir)
 {
+	struct afs_vnode *vnode = fc->vnode;
 	struct afs_call *call;
+	struct afs_net *net = afs_v2net(vnode);
 	size_t namesz, reqsz, padsz;
 	__be32 *bp;
 
@@ -798,14 +755,14 @@ int afs_fs_remove(struct afs_server *server,
 	padsz = (4 - (namesz & 3)) & 3;
 	reqsz = (5 * 4) + namesz + padsz;
 
-	call = afs_alloc_flat_call(&afs_RXFSRemoveXXXX, reqsz, (21 + 6) * 4);
+	call = afs_alloc_flat_call(
+		net, isdir ? &afs_RXFSRemoveDir : &afs_RXFSRemoveFile,
+		reqsz, (21 + 6) * 4);
 	if (!call)
 		return -ENOMEM;
 
-	call->key = key;
-	call->reply = vnode;
-	call->service_id = FS_SERVICE;
-	call->port = htons(AFS_FS_PORT);
+	call->key = fc->key;
+	call->reply[0] = vnode;
 
 	/* marshall the parameters */
 	bp = call->request;
@@ -821,7 +778,9 @@ int afs_fs_remove(struct afs_server *server,
 		bp = (void *) bp + padsz;
 	}
 
-	return afs_make_call(&server->addr, call, GFP_NOFS, async);
+	afs_use_fs_server(call, fc->cbi);
+	trace_afs_make_fs_call(call, &vnode->fid);
+	return afs_make_call(&fc->ac, call, GFP_NOFS, false);
 }
 
 /*
@@ -829,7 +788,7 @@ int afs_fs_remove(struct afs_server *server,
  */
 static int afs_deliver_fs_link(struct afs_call *call)
 {
-	struct afs_vnode *dvnode = call->reply, *vnode = call->reply2;
+	struct afs_vnode *dvnode = call->reply[0], *vnode = call->reply[1];
 	const __be32 *bp;
 	int ret;
 
@@ -843,7 +802,7 @@ static int afs_deliver_fs_link(struct afs_call *call)
 	bp = call->buffer;
 	xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode, NULL);
 	xdr_decode_AFSFetchStatus(&bp, &dvnode->status, dvnode, NULL);
-	/* xdr_decode_AFSVolSync(&bp, call->replyX); */
+	/* xdr_decode_AFSVolSync(&bp, call->reply[X]); */
 
 	_leave(" = 0 [done]");
 	return 0;
@@ -854,22 +813,20 @@ static int afs_deliver_fs_link(struct afs_call *call)
  */
 static const struct afs_call_type afs_RXFSLink = {
 	.name		= "FS.Link",
+	.op		= afs_FS_Link,
 	.deliver	= afs_deliver_fs_link,
-	.abort_to_error	= afs_abort_to_error,
 	.destructor	= afs_flat_call_destructor,
 };
 
 /*
  * make a hard link
  */
-int afs_fs_link(struct afs_server *server,
-		struct key *key,
-		struct afs_vnode *dvnode,
-		struct afs_vnode *vnode,
-		const char *name,
-		bool async)
+int afs_fs_link(struct afs_fs_cursor *fc, struct afs_vnode *vnode,
+		const char *name)
 {
+	struct afs_vnode *dvnode = fc->vnode;
 	struct afs_call *call;
+	struct afs_net *net = afs_v2net(vnode);
 	size_t namesz, reqsz, padsz;
 	__be32 *bp;
 
@@ -879,15 +836,13 @@ int afs_fs_link(struct afs_server *server,
 	padsz = (4 - (namesz & 3)) & 3;
 	reqsz = (5 * 4) + namesz + padsz + (3 * 4);
 
-	call = afs_alloc_flat_call(&afs_RXFSLink, reqsz, (21 + 21 + 6) * 4);
+	call = afs_alloc_flat_call(net, &afs_RXFSLink, reqsz, (21 + 21 + 6) * 4);
 	if (!call)
 		return -ENOMEM;
 
-	call->key = key;
-	call->reply = dvnode;
-	call->reply2 = vnode;
-	call->service_id = FS_SERVICE;
-	call->port = htons(AFS_FS_PORT);
+	call->key = fc->key;
+	call->reply[0] = dvnode;
+	call->reply[1] = vnode;
 
 	/* marshall the parameters */
 	bp = call->request;
@@ -906,7 +861,9 @@ int afs_fs_link(struct afs_server *server,
 	*bp++ = htonl(vnode->fid.vnode);
 	*bp++ = htonl(vnode->fid.unique);
 
-	return afs_make_call(&server->addr, call, GFP_NOFS, async);
+	afs_use_fs_server(call, fc->cbi);
+	trace_afs_make_fs_call(call, &vnode->fid);
+	return afs_make_call(&fc->ac, call, GFP_NOFS, false);
 }
 
 /*
@@ -914,7 +871,7 @@ int afs_fs_link(struct afs_server *server,
  */
 static int afs_deliver_fs_symlink(struct afs_call *call)
 {
-	struct afs_vnode *vnode = call->reply;
+	struct afs_vnode *vnode = call->reply[0];
 	const __be32 *bp;
 	int ret;
 
@@ -926,10 +883,10 @@ static int afs_deliver_fs_symlink(struct afs_call *call)
 
 	/* unmarshall the reply once we've received all of it */
 	bp = call->buffer;
-	xdr_decode_AFSFid(&bp, call->reply2);
-	xdr_decode_AFSFetchStatus(&bp, call->reply3, NULL, NULL);
+	xdr_decode_AFSFid(&bp, call->reply[1]);
+	xdr_decode_AFSFetchStatus(&bp, call->reply[2], NULL, NULL);
 	xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode, NULL);
-	/* xdr_decode_AFSVolSync(&bp, call->replyX); */
+	/* xdr_decode_AFSVolSync(&bp, call->reply[X]); */
 
 	_leave(" = 0 [done]");
 	return 0;
@@ -940,24 +897,23 @@ static int afs_deliver_fs_symlink(struct afs_call *call)
  */
 static const struct afs_call_type afs_RXFSSymlink = {
 	.name		= "FS.Symlink",
+	.op		= afs_FS_Symlink,
 	.deliver	= afs_deliver_fs_symlink,
-	.abort_to_error	= afs_abort_to_error,
 	.destructor	= afs_flat_call_destructor,
 };
 
 /*
  * create a symbolic link
  */
-int afs_fs_symlink(struct afs_server *server,
-		   struct key *key,
-		   struct afs_vnode *vnode,
+int afs_fs_symlink(struct afs_fs_cursor *fc,
 		   const char *name,
 		   const char *contents,
 		   struct afs_fid *newfid,
-		   struct afs_file_status *newstatus,
-		   bool async)
+		   struct afs_file_status *newstatus)
 {
+	struct afs_vnode *vnode = fc->vnode;
 	struct afs_call *call;
+	struct afs_net *net = afs_v2net(vnode);
 	size_t namesz, reqsz, padsz, c_namesz, c_padsz;
 	__be32 *bp;
 
@@ -971,17 +927,15 @@ int afs_fs_symlink(struct afs_server *server,
 
 	reqsz = (6 * 4) + namesz + padsz + c_namesz + c_padsz + (6 * 4);
 
-	call = afs_alloc_flat_call(&afs_RXFSSymlink, reqsz,
+	call = afs_alloc_flat_call(net, &afs_RXFSSymlink, reqsz,
 				   (3 + 21 + 21 + 6) * 4);
 	if (!call)
 		return -ENOMEM;
 
-	call->key = key;
-	call->reply = vnode;
-	call->reply2 = newfid;
-	call->reply3 = newstatus;
-	call->service_id = FS_SERVICE;
-	call->port = htons(AFS_FS_PORT);
+	call->key = fc->key;
+	call->reply[0] = vnode;
+	call->reply[1] = newfid;
+	call->reply[2] = newstatus;
 
 	/* marshall the parameters */
 	bp = call->request;
@@ -1010,7 +964,9 @@ int afs_fs_symlink(struct afs_server *server,
 	*bp++ = htonl(S_IRWXUGO); /* unix mode */
 	*bp++ = 0; /* segment size */
 
-	return afs_make_call(&server->addr, call, GFP_NOFS, async);
+	afs_use_fs_server(call, fc->cbi);
+	trace_afs_make_fs_call(call, &vnode->fid);
+	return afs_make_call(&fc->ac, call, GFP_NOFS, false);
 }
 
 /*
@@ -1018,7 +974,7 @@ int afs_fs_symlink(struct afs_server *server,
  */
 static int afs_deliver_fs_rename(struct afs_call *call)
 {
-	struct afs_vnode *orig_dvnode = call->reply, *new_dvnode = call->reply2;
+	struct afs_vnode *orig_dvnode = call->reply[0], *new_dvnode = call->reply[1];
 	const __be32 *bp;
 	int ret;
 
@@ -1034,7 +990,7 @@ static int afs_deliver_fs_rename(struct afs_call *call)
 	if (new_dvnode != orig_dvnode)
 		xdr_decode_AFSFetchStatus(&bp, &new_dvnode->status, new_dvnode,
 					  NULL);
-	/* xdr_decode_AFSVolSync(&bp, call->replyX); */
+	/* xdr_decode_AFSVolSync(&bp, call->reply[X]); */
 
 	_leave(" = 0 [done]");
 	return 0;
@@ -1045,23 +1001,22 @@ static int afs_deliver_fs_rename(struct afs_call *call)
  */
 static const struct afs_call_type afs_RXFSRename = {
 	.name		= "FS.Rename",
+	.op		= afs_FS_Rename,
 	.deliver	= afs_deliver_fs_rename,
-	.abort_to_error	= afs_abort_to_error,
 	.destructor	= afs_flat_call_destructor,
 };
 
 /*
  * create a symbolic link
  */
-int afs_fs_rename(struct afs_server *server,
-		  struct key *key,
-		  struct afs_vnode *orig_dvnode,
+int afs_fs_rename(struct afs_fs_cursor *fc,
 		  const char *orig_name,
 		  struct afs_vnode *new_dvnode,
-		  const char *new_name,
-		  bool async)
+		  const char *new_name)
 {
+	struct afs_vnode *orig_dvnode = fc->vnode;
 	struct afs_call *call;
+	struct afs_net *net = afs_v2net(orig_dvnode);
 	size_t reqsz, o_namesz, o_padsz, n_namesz, n_padsz;
 	__be32 *bp;
 
@@ -1078,15 +1033,13 @@ int afs_fs_rename(struct afs_server *server,
 		(3 * 4) +
 		4 + n_namesz + n_padsz;
 
-	call = afs_alloc_flat_call(&afs_RXFSRename, reqsz, (21 + 21 + 6) * 4);
+	call = afs_alloc_flat_call(net, &afs_RXFSRename, reqsz, (21 + 21 + 6) * 4);
 	if (!call)
 		return -ENOMEM;
 
-	call->key = key;
-	call->reply = orig_dvnode;
-	call->reply2 = new_dvnode;
-	call->service_id = FS_SERVICE;
-	call->port = htons(AFS_FS_PORT);
+	call->key = fc->key;
+	call->reply[0] = orig_dvnode;
+	call->reply[1] = new_dvnode;
 
 	/* marshall the parameters */
 	bp = call->request;
@@ -1113,7 +1066,9 @@ int afs_fs_rename(struct afs_server *server,
 		bp = (void *) bp + n_padsz;
 	}
 
-	return afs_make_call(&server->addr, call, GFP_NOFS, async);
+	afs_use_fs_server(call, fc->cbi);
+	trace_afs_make_fs_call(call, &orig_dvnode->fid);
+	return afs_make_call(&fc->ac, call, GFP_NOFS, false);
 }
 
 /*
@@ -1121,7 +1076,7 @@ int afs_fs_rename(struct afs_server *server,
  */
 static int afs_deliver_fs_store_data(struct afs_call *call)
 {
-	struct afs_vnode *vnode = call->reply;
+	struct afs_vnode *vnode = call->reply[0];
 	const __be32 *bp;
 	int ret;
 
@@ -1135,7 +1090,7 @@ static int afs_deliver_fs_store_data(struct afs_call *call)
 	bp = call->buffer;
 	xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode,
 				  &call->store_version);
-	/* xdr_decode_AFSVolSync(&bp, call->replyX); */
+	/* xdr_decode_AFSVolSync(&bp, call->reply[X]); */
 
 	afs_pages_written_back(vnode, call);
 
@@ -1148,47 +1103,44 @@ static int afs_deliver_fs_store_data(struct afs_call *call)
  */
 static const struct afs_call_type afs_RXFSStoreData = {
 	.name		= "FS.StoreData",
+	.op		= afs_FS_StoreData,
 	.deliver	= afs_deliver_fs_store_data,
-	.abort_to_error	= afs_abort_to_error,
 	.destructor	= afs_flat_call_destructor,
 };
 
 static const struct afs_call_type afs_RXFSStoreData64 = {
 	.name		= "FS.StoreData64",
+	.op		= afs_FS_StoreData64,
 	.deliver	= afs_deliver_fs_store_data,
-	.abort_to_error	= afs_abort_to_error,
 	.destructor	= afs_flat_call_destructor,
 };
 
 /*
  * store a set of pages to a very large file
  */
-static int afs_fs_store_data64(struct afs_server *server,
-			       struct afs_writeback *wb,
+static int afs_fs_store_data64(struct afs_fs_cursor *fc,
+			       struct address_space *mapping,
 			       pgoff_t first, pgoff_t last,
 			       unsigned offset, unsigned to,
-			       loff_t size, loff_t pos, loff_t i_size,
-			       bool async)
+			       loff_t size, loff_t pos, loff_t i_size)
 {
-	struct afs_vnode *vnode = wb->vnode;
+	struct afs_vnode *vnode = fc->vnode;
 	struct afs_call *call;
+	struct afs_net *net = afs_v2net(vnode);
 	__be32 *bp;
 
 	_enter(",%x,{%x:%u},,",
-	       key_serial(wb->key), vnode->fid.vid, vnode->fid.vnode);
+	       key_serial(fc->key), vnode->fid.vid, vnode->fid.vnode);
 
-	call = afs_alloc_flat_call(&afs_RXFSStoreData64,
+	call = afs_alloc_flat_call(net, &afs_RXFSStoreData64,
 				   (4 + 6 + 3 * 2) * 4,
 				   (21 + 6) * 4);
 	if (!call)
 		return -ENOMEM;
 
-	call->wb = wb;
-	call->key = wb->key;
-	call->reply = vnode;
-	call->service_id = FS_SERVICE;
-	call->port = htons(AFS_FS_PORT);
-	call->mapping = vnode->vfs_inode.i_mapping;
+	call->key = fc->key;
+	call->mapping = mapping;
+	call->reply[0] = vnode;
 	call->first = first;
 	call->last = last;
 	call->first_offset = offset;
@@ -1217,24 +1169,25 @@ static int afs_fs_store_data64(struct afs_server *server,
 	*bp++ = htonl(i_size >> 32);
 	*bp++ = htonl((u32) i_size);
 
-	return afs_make_call(&server->addr, call, GFP_NOFS, async);
+	trace_afs_make_fs_call(call, &vnode->fid);
+	return afs_make_call(&fc->ac, call, GFP_NOFS, false);
 }
 
 /*
  * store a set of pages
  */
-int afs_fs_store_data(struct afs_server *server, struct afs_writeback *wb,
+int afs_fs_store_data(struct afs_fs_cursor *fc, struct address_space *mapping,
 		      pgoff_t first, pgoff_t last,
-		      unsigned offset, unsigned to,
-		      bool async)
+		      unsigned offset, unsigned to)
 {
-	struct afs_vnode *vnode = wb->vnode;
+	struct afs_vnode *vnode = fc->vnode;
 	struct afs_call *call;
+	struct afs_net *net = afs_v2net(vnode);
 	loff_t size, pos, i_size;
 	__be32 *bp;
 
 	_enter(",%x,{%x:%u},,",
-	       key_serial(wb->key), vnode->fid.vid, vnode->fid.vnode);
+	       key_serial(fc->key), vnode->fid.vid, vnode->fid.vnode);
 
 	size = (loff_t)to - (loff_t)offset;
 	if (first != last)
@@ -1251,21 +1204,18 @@ int afs_fs_store_data(struct afs_server *server, struct afs_writeback *wb,
 	       (unsigned long long) i_size);
 
 	if (pos >> 32 || i_size >> 32 || size >> 32 || (pos + size) >> 32)
-		return afs_fs_store_data64(server, wb, first, last, offset, to,
-					   size, pos, i_size, async);
+		return afs_fs_store_data64(fc, mapping, first, last, offset, to,
+					   size, pos, i_size);
 
-	call = afs_alloc_flat_call(&afs_RXFSStoreData,
+	call = afs_alloc_flat_call(net, &afs_RXFSStoreData,
 				   (4 + 6 + 3) * 4,
 				   (21 + 6) * 4);
 	if (!call)
 		return -ENOMEM;
 
-	call->wb = wb;
-	call->key = wb->key;
-	call->reply = vnode;
-	call->service_id = FS_SERVICE;
-	call->port = htons(AFS_FS_PORT);
-	call->mapping = vnode->vfs_inode.i_mapping;
+	call->key = fc->key;
+	call->mapping = mapping;
+	call->reply[0] = vnode;
 	call->first = first;
 	call->last = last;
 	call->first_offset = offset;
@@ -1291,7 +1241,9 @@ int afs_fs_store_data(struct afs_server *server, struct afs_writeback *wb,
 	*bp++ = htonl(size);
 	*bp++ = htonl(i_size);
 
-	return afs_make_call(&server->addr, call, GFP_NOFS, async);
+	afs_use_fs_server(call, fc->cbi);
+	trace_afs_make_fs_call(call, &vnode->fid);
+	return afs_make_call(&fc->ac, call, GFP_NOFS, false);
 }
 
 /*
@@ -1300,7 +1252,7 @@ int afs_fs_store_data(struct afs_server *server, struct afs_writeback *wb,
 static int afs_deliver_fs_store_status(struct afs_call *call)
 {
 	afs_dataversion_t *store_version;
-	struct afs_vnode *vnode = call->reply;
+	struct afs_vnode *vnode = call->reply[0];
 	const __be32 *bp;
 	int ret;
 
@@ -1317,7 +1269,7 @@ static int afs_deliver_fs_store_status(struct afs_call *call)
 
 	bp = call->buffer;
 	xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode, store_version);
-	/* xdr_decode_AFSVolSync(&bp, call->replyX); */
+	/* xdr_decode_AFSVolSync(&bp, call->reply[X]); */
 
 	_leave(" = 0 [done]");
 	return 0;
@@ -1328,22 +1280,22 @@ static int afs_deliver_fs_store_status(struct afs_call *call)
  */
 static const struct afs_call_type afs_RXFSStoreStatus = {
 	.name		= "FS.StoreStatus",
+	.op		= afs_FS_StoreStatus,
 	.deliver	= afs_deliver_fs_store_status,
-	.abort_to_error	= afs_abort_to_error,
 	.destructor	= afs_flat_call_destructor,
 };
 
 static const struct afs_call_type afs_RXFSStoreData_as_Status = {
 	.name		= "FS.StoreData",
+	.op		= afs_FS_StoreData,
 	.deliver	= afs_deliver_fs_store_status,
-	.abort_to_error	= afs_abort_to_error,
 	.destructor	= afs_flat_call_destructor,
 };
 
 static const struct afs_call_type afs_RXFSStoreData64_as_Status = {
 	.name		= "FS.StoreData64",
+	.op		= afs_FS_StoreData64,
 	.deliver	= afs_deliver_fs_store_status,
-	.abort_to_error	= afs_abort_to_error,
 	.destructor	= afs_flat_call_destructor,
 };
 
@@ -1351,30 +1303,27 @@ static const struct afs_call_type afs_RXFSStoreData64_as_Status = {
  * set the attributes on a very large file, using FS.StoreData rather than
  * FS.StoreStatus so as to alter the file size also
  */
-static int afs_fs_setattr_size64(struct afs_server *server, struct key *key,
-				 struct afs_vnode *vnode, struct iattr *attr,
-				 bool async)
+static int afs_fs_setattr_size64(struct afs_fs_cursor *fc, struct iattr *attr)
 {
+	struct afs_vnode *vnode = fc->vnode;
 	struct afs_call *call;
+	struct afs_net *net = afs_v2net(vnode);
 	__be32 *bp;
 
 	_enter(",%x,{%x:%u},,",
-	       key_serial(key), vnode->fid.vid, vnode->fid.vnode);
+	       key_serial(fc->key), vnode->fid.vid, vnode->fid.vnode);
 
 	ASSERT(attr->ia_valid & ATTR_SIZE);
 
-	call = afs_alloc_flat_call(&afs_RXFSStoreData64_as_Status,
+	call = afs_alloc_flat_call(net, &afs_RXFSStoreData64_as_Status,
 				   (4 + 6 + 3 * 2) * 4,
 				   (21 + 6) * 4);
 	if (!call)
 		return -ENOMEM;
 
-	call->key = key;
-	call->reply = vnode;
-	call->service_id = FS_SERVICE;
-	call->port = htons(AFS_FS_PORT);
+	call->key = fc->key;
+	call->reply[0] = vnode;
 	call->store_version = vnode->status.data_version + 1;
-	call->operation_ID = FSSTOREDATA;
 
 	/* marshall the parameters */
 	bp = call->request;
@@ -1392,40 +1341,38 @@ static int afs_fs_setattr_size64(struct afs_server *server, struct key *key,
 	*bp++ = htonl(attr->ia_size >> 32);	/* new file length */
 	*bp++ = htonl((u32) attr->ia_size);
 
-	return afs_make_call(&server->addr, call, GFP_NOFS, async);
+	afs_use_fs_server(call, fc->cbi);
+	trace_afs_make_fs_call(call, &vnode->fid);
+	return afs_make_call(&fc->ac, call, GFP_NOFS, false);
 }
 
 /*
  * set the attributes on a file, using FS.StoreData rather than FS.StoreStatus
  * so as to alter the file size also
  */
-static int afs_fs_setattr_size(struct afs_server *server, struct key *key,
-			       struct afs_vnode *vnode, struct iattr *attr,
-			       bool async)
+static int afs_fs_setattr_size(struct afs_fs_cursor *fc, struct iattr *attr)
 {
+	struct afs_vnode *vnode = fc->vnode;
 	struct afs_call *call;
+	struct afs_net *net = afs_v2net(vnode);
 	__be32 *bp;
 
 	_enter(",%x,{%x:%u},,",
-	       key_serial(key), vnode->fid.vid, vnode->fid.vnode);
+	       key_serial(fc->key), vnode->fid.vid, vnode->fid.vnode);
 
 	ASSERT(attr->ia_valid & ATTR_SIZE);
 	if (attr->ia_size >> 32)
-		return afs_fs_setattr_size64(server, key, vnode, attr,
-					     async);
+		return afs_fs_setattr_size64(fc, attr);
 
-	call = afs_alloc_flat_call(&afs_RXFSStoreData_as_Status,
+	call = afs_alloc_flat_call(net, &afs_RXFSStoreData_as_Status,
 				   (4 + 6 + 3) * 4,
 				   (21 + 6) * 4);
 	if (!call)
 		return -ENOMEM;
 
-	call->key = key;
-	call->reply = vnode;
-	call->service_id = FS_SERVICE;
-	call->port = htons(AFS_FS_PORT);
+	call->key = fc->key;
+	call->reply[0] = vnode;
 	call->store_version = vnode->status.data_version + 1;
-	call->operation_ID = FSSTOREDATA;
 
 	/* marshall the parameters */
 	bp = call->request;
@@ -1440,38 +1387,36 @@ static int afs_fs_setattr_size(struct afs_server *server, struct key *key,
 	*bp++ = 0;				/* size of write */
 	*bp++ = htonl(attr->ia_size);		/* new file length */
 
-	return afs_make_call(&server->addr, call, GFP_NOFS, async);
+	afs_use_fs_server(call, fc->cbi);
+	trace_afs_make_fs_call(call, &vnode->fid);
+	return afs_make_call(&fc->ac, call, GFP_NOFS, false);
 }
 
 /*
  * set the attributes on a file, using FS.StoreData if there's a change in file
  * size, and FS.StoreStatus otherwise
  */
-int afs_fs_setattr(struct afs_server *server, struct key *key,
-		   struct afs_vnode *vnode, struct iattr *attr,
-		   bool async)
+int afs_fs_setattr(struct afs_fs_cursor *fc, struct iattr *attr)
 {
+	struct afs_vnode *vnode = fc->vnode;
 	struct afs_call *call;
+	struct afs_net *net = afs_v2net(vnode);
 	__be32 *bp;
 
 	if (attr->ia_valid & ATTR_SIZE)
-		return afs_fs_setattr_size(server, key, vnode, attr,
-					   async);
+		return afs_fs_setattr_size(fc, attr);
 
 	_enter(",%x,{%x:%u},,",
-	       key_serial(key), vnode->fid.vid, vnode->fid.vnode);
+	       key_serial(fc->key), vnode->fid.vid, vnode->fid.vnode);
 
-	call = afs_alloc_flat_call(&afs_RXFSStoreStatus,
+	call = afs_alloc_flat_call(net, &afs_RXFSStoreStatus,
 				   (4 + 6) * 4,
 				   (21 + 6) * 4);
 	if (!call)
 		return -ENOMEM;
 
-	call->key = key;
-	call->reply = vnode;
-	call->service_id = FS_SERVICE;
-	call->port = htons(AFS_FS_PORT);
-	call->operation_ID = FSSTORESTATUS;
+	call->key = fc->key;
+	call->reply[0] = vnode;
 
 	/* marshall the parameters */
 	bp = call->request;
@@ -1482,7 +1427,9 @@ int afs_fs_setattr(struct afs_server *server, struct key *key,
 
 	xdr_encode_AFS_StoreStatus(&bp, attr);
 
-	return afs_make_call(&server->addr, call, GFP_NOFS, async);
+	afs_use_fs_server(call, fc->cbi);
+	trace_afs_make_fs_call(call, &vnode->fid);
+	return afs_make_call(&fc->ac, call, GFP_NOFS, false);
 }
 
 /*
@@ -1510,7 +1457,7 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call)
 			return ret;
 
 		bp = call->buffer;
-		xdr_decode_AFSFetchVolumeStatus(&bp, call->reply2);
+		xdr_decode_AFSFetchVolumeStatus(&bp, call->reply[1]);
 		call->offset = 0;
 		call->unmarshall++;
 
@@ -1531,13 +1478,13 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call)
 	case 3:
 		_debug("extract volname");
 		if (call->count > 0) {
-			ret = afs_extract_data(call, call->reply3,
+			ret = afs_extract_data(call, call->reply[2],
 					       call->count, true);
 			if (ret < 0)
 				return ret;
 		}
 
-		p = call->reply3;
+		p = call->reply[2];
 		p[call->count] = 0;
 		_debug("volname '%s'", p);
 
@@ -1578,13 +1525,13 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call)
 	case 6:
 		_debug("extract offline");
 		if (call->count > 0) {
-			ret = afs_extract_data(call, call->reply3,
+			ret = afs_extract_data(call, call->reply[2],
 					       call->count, true);
 			if (ret < 0)
 				return ret;
 		}
 
-		p = call->reply3;
+		p = call->reply[2];
 		p[call->count] = 0;
 		_debug("offline '%s'", p);
 
@@ -1625,13 +1572,13 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call)
 	case 9:
 		_debug("extract motd");
 		if (call->count > 0) {
-			ret = afs_extract_data(call, call->reply3,
+			ret = afs_extract_data(call, call->reply[2],
 					       call->count, true);
 			if (ret < 0)
 				return ret;
 		}
 
-		p = call->reply3;
+		p = call->reply[2];
 		p[call->count] = 0;
 		_debug("motd '%s'", p);
 
@@ -1662,8 +1609,8 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call)
  */
 static void afs_get_volume_status_call_destructor(struct afs_call *call)
 {
-	kfree(call->reply3);
-	call->reply3 = NULL;
+	kfree(call->reply[2]);
+	call->reply[2] = NULL;
 	afs_flat_call_destructor(call);
 }
 
@@ -1672,21 +1619,20 @@ static void afs_get_volume_status_call_destructor(struct afs_call *call)
  */
 static const struct afs_call_type afs_RXFSGetVolumeStatus = {
 	.name		= "FS.GetVolumeStatus",
+	.op		= afs_FS_GetVolumeStatus,
 	.deliver	= afs_deliver_fs_get_volume_status,
-	.abort_to_error	= afs_abort_to_error,
 	.destructor	= afs_get_volume_status_call_destructor,
 };
 
 /*
  * fetch the status of a volume
  */
-int afs_fs_get_volume_status(struct afs_server *server,
-			     struct key *key,
-			     struct afs_vnode *vnode,
-			     struct afs_volume_status *vs,
-			     bool async)
+int afs_fs_get_volume_status(struct afs_fs_cursor *fc,
+			     struct afs_volume_status *vs)
 {
+	struct afs_vnode *vnode = fc->vnode;
 	struct afs_call *call;
+	struct afs_net *net = afs_v2net(vnode);
 	__be32 *bp;
 	void *tmpbuf;
 
@@ -1696,25 +1642,25 @@ int afs_fs_get_volume_status(struct afs_server *server,
 	if (!tmpbuf)
 		return -ENOMEM;
 
-	call = afs_alloc_flat_call(&afs_RXFSGetVolumeStatus, 2 * 4, 12 * 4);
+	call = afs_alloc_flat_call(net, &afs_RXFSGetVolumeStatus, 2 * 4, 12 * 4);
 	if (!call) {
 		kfree(tmpbuf);
 		return -ENOMEM;
 	}
 
-	call->key = key;
-	call->reply = vnode;
-	call->reply2 = vs;
-	call->reply3 = tmpbuf;
-	call->service_id = FS_SERVICE;
-	call->port = htons(AFS_FS_PORT);
+	call->key = fc->key;
+	call->reply[0] = vnode;
+	call->reply[1] = vs;
+	call->reply[2] = tmpbuf;
 
 	/* marshall the parameters */
 	bp = call->request;
 	bp[0] = htonl(FSGETVOLUMESTATUS);
 	bp[1] = htonl(vnode->fid.vid);
 
-	return afs_make_call(&server->addr, call, GFP_NOFS, async);
+	afs_use_fs_server(call, fc->cbi);
+	trace_afs_make_fs_call(call, &vnode->fid);
+	return afs_make_call(&fc->ac, call, GFP_NOFS, false);
 }
 
 /*
@@ -1733,7 +1679,7 @@ static int afs_deliver_fs_xxxx_lock(struct afs_call *call)
 
 	/* unmarshall the reply once we've received all of it */
 	bp = call->buffer;
-	/* xdr_decode_AFSVolSync(&bp, call->replyX); */
+	/* xdr_decode_AFSVolSync(&bp, call->reply[X]); */
 
 	_leave(" = 0 [done]");
 	return 0;
@@ -1744,8 +1690,8 @@ static int afs_deliver_fs_xxxx_lock(struct afs_call *call)
  */
 static const struct afs_call_type afs_RXFSSetLock = {
 	.name		= "FS.SetLock",
+	.op		= afs_FS_SetLock,
 	.deliver	= afs_deliver_fs_xxxx_lock,
-	.abort_to_error	= afs_abort_to_error,
 	.destructor	= afs_flat_call_destructor,
 };
 
@@ -1754,8 +1700,8 @@ static const struct afs_call_type afs_RXFSSetLock = {
  */
 static const struct afs_call_type afs_RXFSExtendLock = {
 	.name		= "FS.ExtendLock",
+	.op		= afs_FS_ExtendLock,
 	.deliver	= afs_deliver_fs_xxxx_lock,
-	.abort_to_error	= afs_abort_to_error,
 	.destructor	= afs_flat_call_destructor,
 };
 
@@ -1764,33 +1710,29 @@ static const struct afs_call_type afs_RXFSExtendLock = {
  */
 static const struct afs_call_type afs_RXFSReleaseLock = {
 	.name		= "FS.ReleaseLock",
+	.op		= afs_FS_ReleaseLock,
 	.deliver	= afs_deliver_fs_xxxx_lock,
-	.abort_to_error	= afs_abort_to_error,
 	.destructor	= afs_flat_call_destructor,
 };
 
 /*
- * get a lock on a file
+ * Set a lock on a file
  */
-int afs_fs_set_lock(struct afs_server *server,
-		    struct key *key,
-		    struct afs_vnode *vnode,
-		    afs_lock_type_t type,
-		    bool async)
+int afs_fs_set_lock(struct afs_fs_cursor *fc, afs_lock_type_t type)
 {
+	struct afs_vnode *vnode = fc->vnode;
 	struct afs_call *call;
+	struct afs_net *net = afs_v2net(vnode);
 	__be32 *bp;
 
 	_enter("");
 
-	call = afs_alloc_flat_call(&afs_RXFSSetLock, 5 * 4, 6 * 4);
+	call = afs_alloc_flat_call(net, &afs_RXFSSetLock, 5 * 4, 6 * 4);
 	if (!call)
 		return -ENOMEM;
 
-	call->key = key;
-	call->reply = vnode;
-	call->service_id = FS_SERVICE;
-	call->port = htons(AFS_FS_PORT);
+	call->key = fc->key;
+	call->reply[0] = vnode;
 
 	/* marshall the parameters */
 	bp = call->request;
@@ -1800,30 +1742,29 @@ int afs_fs_set_lock(struct afs_server *server,
 	*bp++ = htonl(vnode->fid.unique);
 	*bp++ = htonl(type);
 
-	return afs_make_call(&server->addr, call, GFP_NOFS, async);
+	afs_use_fs_server(call, fc->cbi);
+	trace_afs_make_fs_call(call, &vnode->fid);
+	return afs_make_call(&fc->ac, call, GFP_NOFS, false);
 }
 
 /*
  * extend a lock on a file
  */
-int afs_fs_extend_lock(struct afs_server *server,
-		       struct key *key,
-		       struct afs_vnode *vnode,
-		       bool async)
+int afs_fs_extend_lock(struct afs_fs_cursor *fc)
 {
+	struct afs_vnode *vnode = fc->vnode;
 	struct afs_call *call;
+	struct afs_net *net = afs_v2net(vnode);
 	__be32 *bp;
 
 	_enter("");
 
-	call = afs_alloc_flat_call(&afs_RXFSExtendLock, 4 * 4, 6 * 4);
+	call = afs_alloc_flat_call(net, &afs_RXFSExtendLock, 4 * 4, 6 * 4);
 	if (!call)
 		return -ENOMEM;
 
-	call->key = key;
-	call->reply = vnode;
-	call->service_id = FS_SERVICE;
-	call->port = htons(AFS_FS_PORT);
+	call->key = fc->key;
+	call->reply[0] = vnode;
 
 	/* marshall the parameters */
 	bp = call->request;
@@ -1832,30 +1773,29 @@ int afs_fs_extend_lock(struct afs_server *server,
 	*bp++ = htonl(vnode->fid.vnode);
 	*bp++ = htonl(vnode->fid.unique);
 
-	return afs_make_call(&server->addr, call, GFP_NOFS, async);
+	afs_use_fs_server(call, fc->cbi);
+	trace_afs_make_fs_call(call, &vnode->fid);
+	return afs_make_call(&fc->ac, call, GFP_NOFS, false);
 }
 
 /*
  * release a lock on a file
  */
-int afs_fs_release_lock(struct afs_server *server,
-			struct key *key,
-			struct afs_vnode *vnode,
-			bool async)
+int afs_fs_release_lock(struct afs_fs_cursor *fc)
 {
+	struct afs_vnode *vnode = fc->vnode;
 	struct afs_call *call;
+	struct afs_net *net = afs_v2net(vnode);
 	__be32 *bp;
 
 	_enter("");
 
-	call = afs_alloc_flat_call(&afs_RXFSReleaseLock, 4 * 4, 6 * 4);
+	call = afs_alloc_flat_call(net, &afs_RXFSReleaseLock, 4 * 4, 6 * 4);
 	if (!call)
 		return -ENOMEM;
 
-	call->key = key;
-	call->reply = vnode;
-	call->service_id = FS_SERVICE;
-	call->port = htons(AFS_FS_PORT);
+	call->key = fc->key;
+	call->reply[0] = vnode;
 
 	/* marshall the parameters */
 	bp = call->request;
@@ -1864,5 +1804,145 @@ int afs_fs_release_lock(struct afs_server *server,
 	*bp++ = htonl(vnode->fid.vnode);
 	*bp++ = htonl(vnode->fid.unique);
 
-	return afs_make_call(&server->addr, call, GFP_NOFS, async);
+	afs_use_fs_server(call, fc->cbi);
+	trace_afs_make_fs_call(call, &vnode->fid);
+	return afs_make_call(&fc->ac, call, GFP_NOFS, false);
+}
+
+/*
+ * Deliver reply data to an FS.GiveUpAllCallBacks operation.
+ */
+static int afs_deliver_fs_give_up_all_callbacks(struct afs_call *call)
+{
+	return afs_transfer_reply(call);
+}
+
+/*
+ * FS.GiveUpAllCallBacks operation type
+ */
+static const struct afs_call_type afs_RXFSGiveUpAllCallBacks = {
+	.name		= "FS.GiveUpAllCallBacks",
+	.op		= afs_FS_GiveUpAllCallBacks,
+	.deliver	= afs_deliver_fs_give_up_all_callbacks,
+	.destructor	= afs_flat_call_destructor,
+};
+
+/*
+ * Flush all the callbacks we have on a server.
+ */
+int afs_fs_give_up_all_callbacks(struct afs_net *net,
+				 struct afs_server *server,
+				 struct afs_addr_cursor *ac,
+				 struct key *key)
+{
+	struct afs_call *call;
+	__be32 *bp;
+
+	_enter("");
+
+	call = afs_alloc_flat_call(net, &afs_RXFSGiveUpAllCallBacks, 1 * 4, 0);
+	if (!call)
+		return -ENOMEM;
+
+	call->key = key;
+
+	/* marshall the parameters */
+	bp = call->request;
+	*bp++ = htonl(FSGIVEUPALLCALLBACKS);
+
+	/* Can't take a ref on server */
+	return afs_make_call(ac, call, GFP_NOFS, false);
+}
+
+/*
+ * Deliver reply data to an FS.GetCapabilities operation.
+ */
+static int afs_deliver_fs_get_capabilities(struct afs_call *call)
+{
+	u32 count;
+	int ret;
+
+	_enter("{%u,%zu/%u}", call->unmarshall, call->offset, call->count);
+
+again:
+	switch (call->unmarshall) {
+	case 0:
+		call->offset = 0;
+		call->unmarshall++;
+
+		/* Extract the capabilities word count */
+	case 1:
+		ret = afs_extract_data(call, &call->tmp,
+				       1 * sizeof(__be32),
+				       true);
+		if (ret < 0)
+			return ret;
+
+		count = ntohl(call->tmp);
+
+		call->count = count;
+		call->count2 = count;
+		call->offset = 0;
+		call->unmarshall++;
+
+		/* Extract capabilities words */
+	case 2:
+		count = min(call->count, 16U);
+		ret = afs_extract_data(call, call->buffer,
+				       count * sizeof(__be32),
+				       call->count > 16);
+		if (ret < 0)
+			return ret;
+
+		/* TODO: Examine capabilities */
+
+		call->count -= count;
+		if (call->count > 0)
+			goto again;
+		call->offset = 0;
+		call->unmarshall++;
+		break;
+	}
+
+	_leave(" = 0 [done]");
+	return 0;
+}
+
+/*
+ * FS.GetCapabilities operation type
+ */
+static const struct afs_call_type afs_RXFSGetCapabilities = {
+	.name		= "FS.GetCapabilities",
+	.op		= afs_FS_GetCapabilities,
+	.deliver	= afs_deliver_fs_get_capabilities,
+	.destructor	= afs_flat_call_destructor,
+};
+
+/*
+ * Probe a fileserver for the capabilities that it supports.  This can
+ * return up to 196 words.
+ */
+int afs_fs_get_capabilities(struct afs_net *net,
+			    struct afs_server *server,
+			    struct afs_addr_cursor *ac,
+			    struct key *key)
+{
+	struct afs_call *call;
+	__be32 *bp;
+
+	_enter("");
+
+	call = afs_alloc_flat_call(net, &afs_RXFSGetCapabilities, 1 * 4, 16 * 4);
+	if (!call)
+		return -ENOMEM;
+
+	call->key = key;
+
+	/* marshall the parameters */
+	bp = call->request;
+	*bp++ = htonl(FSGETCAPABILITIES);
+
+	/* Can't take a ref on server */
+	trace_afs_make_fs_call(call, NULL);
+	return afs_make_call(ac, call, GFP_NOFS, false);
 }
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index 342316a9e3e0..3415eb7484f6 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -23,11 +23,6 @@
 #include <linux/namei.h>
 #include "internal.h"
 
-struct afs_iget_data {
-	struct afs_fid		fid;
-	struct afs_volume	*volume;	/* volume on which resides */
-};
-
 static const struct inode_operations afs_symlink_inode_operations = {
 	.get_link	= page_get_link,
 	.listxattr	= afs_listxattr,
@@ -39,6 +34,7 @@ static const struct inode_operations afs_symlink_inode_operations = {
 static int afs_inode_map_status(struct afs_vnode *vnode, struct key *key)
 {
 	struct inode *inode = AFS_VNODE_TO_I(vnode);
+	bool changed;
 
 	_debug("FS: ft=%d lk=%d sz=%llu ver=%Lu mod=%hu",
 	       vnode->status.type,
@@ -47,6 +43,8 @@ static int afs_inode_map_status(struct afs_vnode *vnode, struct key *key)
 	       vnode->status.data_version,
 	       vnode->status.mode);
 
+	read_seqlock_excl(&vnode->cb_lock);
+
 	switch (vnode->status.type) {
 	case AFS_FTYPE_FILE:
 		inode->i_mode	= S_IFREG | vnode->status.mode;
@@ -63,9 +61,7 @@ static int afs_inode_map_status(struct afs_vnode *vnode, struct key *key)
 		if ((vnode->status.mode & 0777) == 0644) {
 			inode->i_flags |= S_AUTOMOUNT;
 
-			spin_lock(&vnode->lock);
 			set_bit(AFS_VNODE_MOUNTPOINT, &vnode->flags);
-			spin_unlock(&vnode->lock);
 
 			inode->i_mode	= S_IFDIR | 0555;
 			inode->i_op	= &afs_mntpt_inode_operations;
@@ -78,13 +74,11 @@ static int afs_inode_map_status(struct afs_vnode *vnode, struct key *key)
 		break;
 	default:
 		printk("kAFS: AFS vnode with undefined type\n");
+		read_sequnlock_excl(&vnode->cb_lock);
 		return -EBADMSG;
 	}
 
-#ifdef CONFIG_AFS_FSCACHE
-	if (vnode->status.size != inode->i_size)
-		fscache_attr_changed(vnode->cache);
-#endif
+	changed = (vnode->status.size != inode->i_size);
 
 	set_nlink(inode, vnode->status.nlink);
 	inode->i_uid		= vnode->status.owner;
@@ -97,13 +91,49 @@ static int afs_inode_map_status(struct afs_vnode *vnode, struct key *key)
 	inode->i_generation	= vnode->fid.unique;
 	inode->i_version	= vnode->status.data_version;
 	inode->i_mapping->a_ops	= &afs_fs_aops;
+
+	read_sequnlock_excl(&vnode->cb_lock);
+
+#ifdef CONFIG_AFS_FSCACHE
+	if (changed)
+		fscache_attr_changed(vnode->cache);
+#endif
 	return 0;
 }
 
 /*
+ * Fetch file status from the volume.
+ */
+int afs_fetch_status(struct afs_vnode *vnode, struct key *key)
+{
+	struct afs_fs_cursor fc;
+	int ret;
+
+	_enter("%s,{%x:%u.%u,S=%lx}",
+	       vnode->volume->name,
+	       vnode->fid.vid, vnode->fid.vnode, vnode->fid.unique,
+	       vnode->flags);
+
+	ret = -ERESTARTSYS;
+	if (afs_begin_vnode_operation(&fc, vnode, key)) {
+		while (afs_select_fileserver(&fc)) {
+			fc.cb_break = vnode->cb_break + vnode->cb_s_break;
+			afs_fs_fetch_file_status(&fc, NULL);
+		}
+
+		afs_check_for_remote_deletion(&fc, fc.vnode);
+		afs_vnode_commit_status(&fc, vnode, fc.cb_break);
+		ret = afs_end_vnode_operation(&fc);
+	}
+
+	_leave(" = %d", ret);
+	return ret;
+}
+
+/*
  * iget5() comparator
  */
-static int afs_iget5_test(struct inode *inode, void *opaque)
+int afs_iget5_test(struct inode *inode, void *opaque)
 {
 	struct afs_iget_data *data = opaque;
 
@@ -204,7 +234,7 @@ struct inode *afs_iget_autocell(struct inode *dir, const char *dev_name,
  */
 struct inode *afs_iget(struct super_block *sb, struct key *key,
 		       struct afs_fid *fid, struct afs_file_status *status,
-		       struct afs_callback *cb)
+		       struct afs_callback *cb, struct afs_cb_interest *cbi)
 {
 	struct afs_iget_data data = { .fid = *fid };
 	struct afs_super_info *as;
@@ -237,8 +267,7 @@ struct inode *afs_iget(struct super_block *sb, struct key *key,
 
 	if (!status) {
 		/* it's a remotely extant inode */
-		set_bit(AFS_VNODE_CB_BROKEN, &vnode->flags);
-		ret = afs_vnode_fetch_status(vnode, NULL, key);
+		ret = afs_fetch_status(vnode, key);
 		if (ret < 0)
 			goto bad_inode;
 	} else {
@@ -249,16 +278,17 @@ struct inode *afs_iget(struct super_block *sb, struct key *key,
 			/* it's a symlink we just created (the fileserver
 			 * didn't give us a callback) */
 			vnode->cb_version = 0;
-			vnode->cb_expiry = 0;
 			vnode->cb_type = 0;
-			vnode->cb_expires = ktime_get_real_seconds();
+			vnode->cb_expires_at = 0;
 		} else {
 			vnode->cb_version = cb->version;
-			vnode->cb_expiry = cb->expiry;
 			vnode->cb_type = cb->type;
-			vnode->cb_expires = vnode->cb_expiry +
-				ktime_get_real_seconds();
+			vnode->cb_expires_at = cb->expiry;
+			vnode->cb_interest = afs_get_cb_interest(cbi);
+			set_bit(AFS_VNODE_CB_PROMISED, &vnode->flags);
 		}
+
+		vnode->cb_expires_at += ktime_get_real_seconds();
 	}
 
 	/* set up caching before mapping the status, as map-status reads the
@@ -320,25 +350,34 @@ void afs_zap_data(struct afs_vnode *vnode)
  */
 int afs_validate(struct afs_vnode *vnode, struct key *key)
 {
+	time64_t now = ktime_get_real_seconds();
+	bool valid = false;
 	int ret;
 
 	_enter("{v={%x:%u} fl=%lx},%x",
 	       vnode->fid.vid, vnode->fid.vnode, vnode->flags,
 	       key_serial(key));
 
-	if (vnode->cb_promised &&
-	    !test_bit(AFS_VNODE_CB_BROKEN, &vnode->flags) &&
-	    !test_bit(AFS_VNODE_MODIFIED, &vnode->flags) &&
-	    !test_bit(AFS_VNODE_ZAP_DATA, &vnode->flags)) {
-		if (vnode->cb_expires < ktime_get_real_seconds() + 10) {
-			_debug("callback expired");
-			set_bit(AFS_VNODE_CB_BROKEN, &vnode->flags);
-		} else {
-			goto valid;
+	/* Quickly check the callback state.  Ideally, we'd use read_seqbegin
+	 * here, but we have no way to pass the net namespace to the RCU
+	 * cleanup for the server record.
+	 */
+	read_seqlock_excl(&vnode->cb_lock);
+
+	if (test_bit(AFS_VNODE_CB_PROMISED, &vnode->flags)) {
+		if (vnode->cb_s_break != vnode->cb_interest->server->cb_s_break) {
+			vnode->cb_s_break = vnode->cb_interest->server->cb_s_break;
+		} else if (!test_bit(AFS_VNODE_DIR_MODIFIED, &vnode->flags) &&
+			   !test_bit(AFS_VNODE_ZAP_DATA, &vnode->flags) &&
+			   vnode->cb_expires_at - 10 > now) {
+				valid = true;
 		}
+	} else if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) {
+		valid = true;
 	}
 
-	if (test_bit(AFS_VNODE_DELETED, &vnode->flags))
+	read_sequnlock_excl(&vnode->cb_lock);
+	if (valid)
 		goto valid;
 
 	mutex_lock(&vnode->validate_lock);
@@ -347,12 +386,16 @@ int afs_validate(struct afs_vnode *vnode, struct key *key)
 	 * a new promise - note that if the (parent) directory's metadata was
 	 * changed then the security may be different and we may no longer have
 	 * access */
-	if (!vnode->cb_promised ||
-	    test_bit(AFS_VNODE_CB_BROKEN, &vnode->flags)) {
+	if (!test_bit(AFS_VNODE_CB_PROMISED, &vnode->flags)) {
 		_debug("not promised");
-		ret = afs_vnode_fetch_status(vnode, NULL, key);
-		if (ret < 0)
+		ret = afs_fetch_status(vnode, key);
+		if (ret < 0) {
+			if (ret == -ENOENT) {
+				set_bit(AFS_VNODE_DELETED, &vnode->flags);
+				ret = -ESTALE;
+			}
 			goto error_unlock;
+		}
 		_debug("new promise [fl=%lx]", vnode->flags);
 	}
 
@@ -367,7 +410,7 @@ int afs_validate(struct afs_vnode *vnode, struct key *key)
 	if (test_and_clear_bit(AFS_VNODE_ZAP_DATA, &vnode->flags))
 		afs_zap_data(vnode);
 
-	clear_bit(AFS_VNODE_MODIFIED, &vnode->flags);
+	clear_bit(AFS_VNODE_DIR_MODIFIED, &vnode->flags);
 	mutex_unlock(&vnode->validate_lock);
 valid:
 	_leave(" = 0");
@@ -386,10 +429,17 @@ int afs_getattr(const struct path *path, struct kstat *stat,
 		u32 request_mask, unsigned int query_flags)
 {
 	struct inode *inode = d_inode(path->dentry);
+	struct afs_vnode *vnode = AFS_FS_I(inode);
+	int seq = 0;
 
 	_enter("{ ino=%lu v=%u }", inode->i_ino, inode->i_generation);
 
-	generic_fillattr(inode, stat);
+	do {
+		read_seqbegin_or_lock(&vnode->cb_lock, &seq);
+		generic_fillattr(inode, stat);
+	} while (need_seqretry(&vnode->cb_lock, seq));
+
+	done_seqretry(&vnode->cb_lock, seq);
 	return 0;
 }
 
@@ -411,18 +461,14 @@ int afs_drop_inode(struct inode *inode)
  */
 void afs_evict_inode(struct inode *inode)
 {
-	struct afs_permits *permits;
 	struct afs_vnode *vnode;
 
 	vnode = AFS_FS_I(inode);
 
-	_enter("{%x:%u.%d} v=%u x=%u t=%u }",
+	_enter("{%x:%u.%d}",
 	       vnode->fid.vid,
 	       vnode->fid.vnode,
-	       vnode->fid.unique,
-	       vnode->cb_version,
-	       vnode->cb_expiry,
-	       vnode->cb_type);
+	       vnode->fid.unique);
 
 	_debug("CLEAR INODE %p", inode);
 
@@ -431,31 +477,24 @@ void afs_evict_inode(struct inode *inode)
 	truncate_inode_pages_final(&inode->i_data);
 	clear_inode(inode);
 
-	afs_give_up_callback(vnode);
-
-	if (vnode->server) {
-		spin_lock(&vnode->server->fs_lock);
-		rb_erase(&vnode->server_rb, &vnode->server->fs_vnodes);
-		spin_unlock(&vnode->server->fs_lock);
-		afs_put_server(vnode->server);
-		vnode->server = NULL;
+	if (vnode->cb_interest) {
+		afs_put_cb_interest(afs_i2net(inode), vnode->cb_interest);
+		vnode->cb_interest = NULL;
 	}
 
-	ASSERT(list_empty(&vnode->writebacks));
-	ASSERT(!vnode->cb_promised);
+	while (!list_empty(&vnode->wb_keys)) {
+		struct afs_wb_key *wbk = list_entry(vnode->wb_keys.next,
+						    struct afs_wb_key, vnode_link);
+		list_del(&wbk->vnode_link);
+		afs_put_wb_key(wbk);
+	}
 
 #ifdef CONFIG_AFS_FSCACHE
 	fscache_relinquish_cookie(vnode->cache, 0);
 	vnode->cache = NULL;
 #endif
 
-	mutex_lock(&vnode->permits_lock);
-	permits = vnode->permits;
-	RCU_INIT_POINTER(vnode->permits, NULL);
-	mutex_unlock(&vnode->permits_lock);
-	if (permits)
-		call_rcu(&permits->rcu, afs_zap_permits);
-
+	afs_put_permits(vnode->permit_cache);
 	_leave("");
 }
 
@@ -464,6 +503,7 @@ void afs_evict_inode(struct inode *inode)
  */
 int afs_setattr(struct dentry *dentry, struct iattr *attr)
 {
+	struct afs_fs_cursor fc;
 	struct afs_vnode *vnode = AFS_FS_I(d_inode(dentry));
 	struct key *key;
 	int ret;
@@ -479,13 +519,11 @@ int afs_setattr(struct dentry *dentry, struct iattr *attr)
 	}
 
 	/* flush any dirty data outstanding on a regular file */
-	if (S_ISREG(vnode->vfs_inode.i_mode)) {
+	if (S_ISREG(vnode->vfs_inode.i_mode))
 		filemap_write_and_wait(vnode->vfs_inode.i_mapping);
-		afs_writeback_all(vnode);
-	}
 
 	if (attr->ia_valid & ATTR_FILE) {
-		key = attr->ia_file->private_data;
+		key = afs_file_key(attr->ia_file);
 	} else {
 		key = afs_request_key(vnode->volume->cell);
 		if (IS_ERR(key)) {
@@ -494,7 +532,18 @@ int afs_setattr(struct dentry *dentry, struct iattr *attr)
 		}
 	}
 
-	ret = afs_vnode_setattr(vnode, key, attr);
+	ret = -ERESTARTSYS;
+	if (afs_begin_vnode_operation(&fc, vnode, key)) {
+		while (afs_select_fileserver(&fc)) {
+			fc.cb_break = vnode->cb_break + vnode->cb_s_break;
+			afs_fs_setattr(&fc, attr);
+		}
+
+		afs_check_for_remote_deletion(&fc, fc.vnode);
+		afs_vnode_commit_status(&fc, vnode, fc.cb_break);
+		ret = afs_end_vnode_operation(&fc);
+	}
+
 	if (!(attr->ia_valid & ATTR_FILE))
 		key_put(key);
 
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 82e16556afea..bd8dcee7e066 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -21,6 +21,7 @@
 #include <linux/fscache.h>
 #include <linux/backing-dev.h>
 #include <linux/uuid.h>
+#include <net/net_namespace.h>
 #include <net/af_rxrpc.h>
 
 #include "afs.h"
@@ -31,16 +32,6 @@
 struct pagevec;
 struct afs_call;
 
-typedef enum {
-	AFS_VL_NEW,			/* new, uninitialised record */
-	AFS_VL_CREATING,		/* creating record */
-	AFS_VL_VALID,			/* record is pending */
-	AFS_VL_NO_VOLUME,		/* no such volume available */
-	AFS_VL_UPDATING,		/* update in progress */
-	AFS_VL_VOLUME_DELETED,		/* volume was deleted */
-	AFS_VL_UNCERTAIN,		/* uncertain state (update failed) */
-} __attribute__((packed)) afs_vlocation_state_t;
-
 struct afs_mount_params {
 	bool			rwpath;		/* T if the parent should be considered R/W */
 	bool			force;		/* T to force cell type */
@@ -48,20 +39,43 @@ struct afs_mount_params {
 	afs_voltype_t		type;		/* type of volume requested */
 	int			volnamesz;	/* size of volume name */
 	const char		*volname;	/* name of volume to mount */
+	struct afs_net		*net;		/* Network namespace in effect */
 	struct afs_cell		*cell;		/* cell in which to find volume */
 	struct afs_volume	*volume;	/* volume record */
 	struct key		*key;		/* key to use for secure mounting */
 };
 
+struct afs_iget_data {
+	struct afs_fid		fid;
+	struct afs_volume	*volume;	/* volume on which resides */
+};
+
 enum afs_call_state {
-	AFS_CALL_REQUESTING,	/* request is being sent for outgoing call */
-	AFS_CALL_AWAIT_REPLY,	/* awaiting reply to outgoing call */
-	AFS_CALL_AWAIT_OP_ID,	/* awaiting op ID on incoming call */
-	AFS_CALL_AWAIT_REQUEST,	/* awaiting request data on incoming call */
-	AFS_CALL_REPLYING,	/* replying to incoming call */
-	AFS_CALL_AWAIT_ACK,	/* awaiting final ACK of incoming call */
-	AFS_CALL_COMPLETE,	/* Completed or failed */
+	AFS_CALL_CL_REQUESTING,		/* Client: Request is being sent */
+	AFS_CALL_CL_AWAIT_REPLY,	/* Client: Awaiting reply */
+	AFS_CALL_CL_PROC_REPLY,		/* Client: rxrpc call complete; processing reply */
+	AFS_CALL_SV_AWAIT_OP_ID,	/* Server: Awaiting op ID */
+	AFS_CALL_SV_AWAIT_REQUEST,	/* Server: Awaiting request data */
+	AFS_CALL_SV_REPLYING,		/* Server: Replying */
+	AFS_CALL_SV_AWAIT_ACK,		/* Server: Awaiting final ACK */
+	AFS_CALL_COMPLETE,		/* Completed or failed */
 };
+
+/*
+ * List of server addresses.
+ */
+struct afs_addr_list {
+	struct rcu_head		rcu;		/* Must be first */
+	refcount_t		usage;
+	u32			version;	/* Version */
+	unsigned short		nr_addrs;
+	unsigned short		index;		/* Address currently in use */
+	unsigned short		nr_ipv4;	/* Number of IPv4 addresses */
+	unsigned long		probed;		/* Mask of servers that have been probed */
+	unsigned long		yfs;		/* Mask of servers that are YFS */
+	struct sockaddr_rxrpc	addrs[];
+};
+
 /*
  * a record of an in-progress RxRPC call
  */
@@ -72,25 +86,25 @@ struct afs_call {
 	struct work_struct	work;		/* actual work processor */
 	struct rxrpc_call	*rxcall;	/* RxRPC call handle */
 	struct key		*key;		/* security for this call */
-	struct afs_server	*server;	/* server affected by incoming CM call */
+	struct afs_net		*net;		/* The network namespace */
+	struct afs_server	*cm_server;	/* Server affected by incoming CM call */
+	struct afs_cb_interest	*cbi;		/* Callback interest for server used */
 	void			*request;	/* request data (first part) */
-	struct address_space	*mapping;	/* page set */
-	struct afs_writeback	*wb;		/* writeback being performed */
+	struct address_space	*mapping;	/* Pages being written from */
 	void			*buffer;	/* reply receive buffer */
-	void			*reply;		/* reply buffer (first part) */
-	void			*reply2;	/* reply buffer (second part) */
-	void			*reply3;	/* reply buffer (third part) */
-	void			*reply4;	/* reply buffer (fourth part) */
+	void			*reply[4];	/* Where to put the reply */
 	pgoff_t			first;		/* first page in mapping to deal with */
 	pgoff_t			last;		/* last page in mapping to deal with */
 	size_t			offset;		/* offset into received data store */
 	atomic_t		usage;
 	enum afs_call_state	state;
+	spinlock_t		state_lock;
 	int			error;		/* error code */
 	u32			abort_code;	/* Remote abort ID or 0 */
 	unsigned		request_size;	/* size of request data */
 	unsigned		reply_max;	/* maximum size of reply */
 	unsigned		first_offset;	/* offset into mapping[first] */
+	unsigned int		cb_break;	/* cb_break + cb_s_break before the call */
 	union {
 		unsigned	last_to;	/* amount of mapping[last] */
 		unsigned	count2;		/* count used in unmarshalling */
@@ -100,8 +114,9 @@ struct afs_call {
 	bool			send_pages;	/* T if data from mapping should be sent */
 	bool			need_attention;	/* T if RxRPC poked us */
 	bool			async;		/* T if asynchronous */
-	u16			service_id;	/* RxRPC service ID to call */
-	__be16			port;		/* target UDP port */
+	bool			ret_reply0;	/* T if should return reply[0] on success */
+	bool			upgrade;	/* T to request service upgrade */
+	u16			service_id;	/* Actual service ID (after upgrade) */
 	u32			operation_ID;	/* operation ID for an incoming call */
 	u32			count;		/* count for use in unmarshalling */
 	__be32			tmp;		/* place to extract temporary data */
@@ -110,15 +125,13 @@ struct afs_call {
 
 struct afs_call_type {
 	const char *name;
+	unsigned int op; /* Really enum afs_fs_operation */
 
 	/* deliver request or reply data to an call
 	 * - returning an error will cause the call to be aborted
 	 */
 	int (*deliver)(struct afs_call *call);
 
-	/* map an abort code to an error number */
-	int (*abort_to_error)(u32 abort_code);
-
 	/* clean up a call */
 	void (*destructor)(struct afs_call *call);
 
@@ -127,6 +140,30 @@ struct afs_call_type {
 };
 
 /*
+ * Key available for writeback on a file.
+ */
+struct afs_wb_key {
+	refcount_t		usage;
+	struct key		*key;
+	struct list_head	vnode_link;	/* Link in vnode->wb_keys */
+};
+
+/*
+ * AFS open file information record.  Pointed to by file->private_data.
+ */
+struct afs_file {
+	struct key		*key;		/* The key this file was opened with */
+	struct afs_wb_key	*wb;		/* Writeback key record for this file */
+};
+
+static inline struct key *afs_file_key(struct file *file)
+{
+	struct afs_file *af = file->private_data;
+
+	return af->key;
+}
+
+/*
  * Record of an outstanding read operation on a vnode.
  */
 struct afs_read {
@@ -142,38 +179,13 @@ struct afs_read {
 };
 
 /*
- * record of an outstanding writeback on a vnode
- */
-struct afs_writeback {
-	struct list_head	link;		/* link in vnode->writebacks */
-	struct work_struct	writer;		/* work item to perform the writeback */
-	struct afs_vnode	*vnode;		/* vnode to which this write applies */
-	struct key		*key;		/* owner of this write */
-	wait_queue_head_t	waitq;		/* completion and ready wait queue */
-	pgoff_t			first;		/* first page in batch */
-	pgoff_t			point;		/* last page in current store op */
-	pgoff_t			last;		/* last page in batch (inclusive) */
-	unsigned		offset_first;	/* offset into first page of start of write */
-	unsigned		to_last;	/* offset into last page of end of write */
-	int			num_conflicts;	/* count of conflicting writes in list */
-	int			usage;
-	bool			conflicts;	/* T if has dependent conflicts */
-	enum {
-		AFS_WBACK_SYNCING,		/* synchronisation being performed */
-		AFS_WBACK_PENDING,		/* write pending */
-		AFS_WBACK_CONFLICTING,		/* conflicting writes posted */
-		AFS_WBACK_WRITING,		/* writing back */
-		AFS_WBACK_COMPLETE		/* the writeback record has been unlinked */
-	} state __attribute__((packed));
-};
-
-/*
  * AFS superblock private data
  * - there's one superblock per volume
  */
 struct afs_super_info {
+	struct afs_net		*net;		/* Network namespace */
+	struct afs_cell		*cell;		/* The cell in which the volume resides */
 	struct afs_volume	*volume;	/* volume record */
-	char			rwparent;	/* T if parent is R/W AFS volume */
 };
 
 static inline struct afs_super_info *AFS_FS_S(struct super_block *sb)
@@ -184,149 +196,238 @@ static inline struct afs_super_info *AFS_FS_S(struct super_block *sb)
 extern struct file_system_type afs_fs_type;
 
 /*
- * entry in the cached cell catalogue
+ * AFS network namespace record.
  */
-struct afs_cache_cell {
-	char		name[AFS_MAXCELLNAME];	/* cell name (padded with NULs) */
-	struct in_addr	vl_servers[15];		/* cached cell VL servers */
+struct afs_net {
+	struct afs_uuid		uuid;
+	bool			live;		/* F if this namespace is being removed */
+
+	/* AF_RXRPC I/O stuff */
+	struct socket		*socket;
+	struct afs_call		*spare_incoming_call;
+	struct work_struct	charge_preallocation_work;
+	struct mutex		socket_mutex;
+	atomic_t		nr_outstanding_calls;
+	atomic_t		nr_superblocks;
+
+	/* Cell database */
+	struct rb_root		cells;
+	struct afs_cell		*ws_cell;
+	struct work_struct	cells_manager;
+	struct timer_list	cells_timer;
+	atomic_t		cells_outstanding;
+	seqlock_t		cells_lock;
+
+	spinlock_t		proc_cells_lock;
+	struct list_head	proc_cells;
+
+	/* Known servers.  Theoretically each fileserver can only be in one
+	 * cell, but in practice, people create aliases and subsets and there's
+	 * no easy way to distinguish them.
+	 */
+	seqlock_t		fs_lock;	/* For fs_servers */
+	struct rb_root		fs_servers;	/* afs_server (by server UUID or address) */
+	struct list_head	fs_updates;	/* afs_server (by update_at) */
+	struct hlist_head	fs_proc;	/* procfs servers list */
+
+	struct hlist_head	fs_addresses4;	/* afs_server (by lowest IPv4 addr) */
+	struct hlist_head	fs_addresses6;	/* afs_server (by lowest IPv6 addr) */
+	seqlock_t		fs_addr_lock;	/* For fs_addresses[46] */
+
+	struct work_struct	fs_manager;
+	struct timer_list	fs_timer;
+	atomic_t		servers_outstanding;
+
+	/* File locking renewal management */
+	struct mutex		lock_manager_mutex;
+
+	/* Misc */
+	struct proc_dir_entry	*proc_afs;		/* /proc/net/afs directory */
+};
+
+extern struct afs_net __afs_net;// Dummy AFS network namespace; TODO: replace with real netns
+
+enum afs_cell_state {
+	AFS_CELL_UNSET,
+	AFS_CELL_ACTIVATING,
+	AFS_CELL_ACTIVE,
+	AFS_CELL_DEACTIVATING,
+	AFS_CELL_INACTIVE,
+	AFS_CELL_FAILED,
 };
 
 /*
- * AFS cell record
+ * AFS cell record.
+ *
+ * This is a tricky concept to get right as it is possible to create aliases
+ * simply by pointing AFSDB/SRV records for two names at the same set of VL
+ * servers; it is also possible to do things like setting up two sets of VL
+ * servers, one of which provides a superset of the volumes provided by the
+ * other (for internal/external division, for example).
+ *
+ * Cells only exist in the sense that (a) a cell's name maps to a set of VL
+ * servers and (b) a cell's name is used by the client to select the key to use
+ * for authentication and encryption.  The cell name is not typically used in
+ * the protocol.
+ *
+ * There is no easy way to determine if two cells are aliases or one is a
+ * subset of another.
  */
 struct afs_cell {
-	atomic_t		usage;
-	struct list_head	link;		/* main cell list link */
+	union {
+		struct rcu_head	rcu;
+		struct rb_node	net_node;	/* Node in net->cells */
+	};
+	struct afs_net		*net;
 	struct key		*anonymous_key;	/* anonymous user key for this cell */
+	struct work_struct	manager;	/* Manager for init/deinit/dns */
 	struct list_head	proc_link;	/* /proc cell list link */
 #ifdef CONFIG_AFS_FSCACHE
 	struct fscache_cookie	*cache;		/* caching cookie */
 #endif
-
-	/* server record management */
-	rwlock_t		servers_lock;	/* active server list lock */
-	struct list_head	servers;	/* active server list */
-
-	/* volume location record management */
-	struct rw_semaphore	vl_sem;		/* volume management serialisation semaphore */
-	struct list_head	vl_list;	/* cell's active VL record list */
-	spinlock_t		vl_lock;	/* vl_list lock */
-	unsigned short		vl_naddrs;	/* number of VL servers in addr list */
-	unsigned short		vl_curr_svix;	/* current server index */
-	struct in_addr		vl_addrs[AFS_CELL_MAX_ADDRS];	/* cell VL server addresses */
-
-	char			name[0];	/* cell name - must go last */
+	time64_t		dns_expiry;	/* Time AFSDB/SRV record expires */
+	time64_t		last_inactive;	/* Time of last drop of usage count */
+	atomic_t		usage;
+	unsigned long		flags;
+#define AFS_CELL_FL_NOT_READY	0		/* The cell record is not ready for use */
+#define AFS_CELL_FL_NO_GC	1		/* The cell was added manually, don't auto-gc */
+#define AFS_CELL_FL_NOT_FOUND	2		/* Permanent DNS error */
+#define AFS_CELL_FL_DNS_FAIL	3		/* Failed to access DNS */
+#define AFS_CELL_FL_NO_LOOKUP_YET 4		/* Not completed first DNS lookup yet */
+	enum afs_cell_state	state;
+	short			error;
+
+	/* Active fileserver interaction state. */
+	struct list_head	proc_volumes;	/* procfs volume list */
+	rwlock_t		proc_lock;
+
+	/* VL server list. */
+	rwlock_t		vl_addrs_lock;	/* Lock on vl_addrs */
+	struct afs_addr_list	__rcu *vl_addrs; /* List of VL servers */
+	u8			name_len;	/* Length of name */
+	char			name[64 + 1];	/* Cell name, case-flattened and NUL-padded */
 };
 
 /*
- * entry in the cached volume location catalogue
+ * Cached VLDB entry.
+ *
+ * This is pointed to by cell->vldb_entries, indexed by name.
  */
-struct afs_cache_vlocation {
-	/* volume name (lowercase, padded with NULs) */
-	uint8_t			name[AFS_MAXVOLNAME + 1];
+struct afs_vldb_entry {
+	afs_volid_t		vid[3];		/* Volume IDs for R/W, R/O and Bak volumes */
 
-	uint8_t			nservers;	/* number of entries used in servers[] */
-	uint8_t			vidmask;	/* voltype mask for vid[] */
-	uint8_t			srvtmask[8];	/* voltype masks for servers[] */
+	unsigned long		flags;
+#define AFS_VLDB_HAS_RW		0		/* - R/W volume exists */
+#define AFS_VLDB_HAS_RO		1		/* - R/O volume exists */
+#define AFS_VLDB_HAS_BAK	2		/* - Backup volume exists */
+#define AFS_VLDB_QUERY_VALID	3		/* - Record is valid */
+#define AFS_VLDB_QUERY_ERROR	4		/* - VL server returned error */
+
+	uuid_t			fs_server[AFS_NMAXNSERVERS];
+	u8			fs_mask[AFS_NMAXNSERVERS];
 #define AFS_VOL_VTM_RW	0x01 /* R/W version of the volume is available (on this server) */
 #define AFS_VOL_VTM_RO	0x02 /* R/O version of the volume is available (on this server) */
 #define AFS_VOL_VTM_BAK	0x04 /* backup version of the volume is available (on this server) */
-
-	afs_volid_t		vid[3];		/* volume IDs for R/W, R/O and Bak volumes */
-	struct in_addr		servers[8];	/* fileserver addresses */
-	time_t			rtime;		/* last retrieval time */
+	short			error;
+	u8			nr_servers;	/* Number of server records */
+	u8			name_len;
+	u8			name[AFS_MAXVOLNAME + 1]; /* NUL-padded volume name */
 };
 
 /*
- * volume -> vnode hash table entry
+ * Record of fileserver with which we're actively communicating.
  */
-struct afs_cache_vhash {
-	afs_voltype_t		vtype;		/* which volume variation */
-	uint8_t			hash_bucket;	/* which hash bucket this represents */
-} __attribute__((packed));
+struct afs_server {
+	struct rcu_head		rcu;
+	union {
+		uuid_t		uuid;		/* Server ID */
+		struct afs_uuid	_uuid;
+	};
 
-/*
- * AFS volume location record
- */
-struct afs_vlocation {
+	struct afs_addr_list	__rcu *addresses;
+	struct rb_node		uuid_rb;	/* Link in net->servers */
+	struct hlist_node	addr4_link;	/* Link in net->fs_addresses4 */
+	struct hlist_node	addr6_link;	/* Link in net->fs_addresses6 */
+	struct hlist_node	proc_link;	/* Link in net->fs_proc */
+	struct afs_server	*gc_next;	/* Next server in manager's list */
+	time64_t		put_time;	/* Time at which last put */
+	time64_t		update_at;	/* Time at which to next update the record */
+	unsigned long		flags;
+#define AFS_SERVER_FL_NEW	0		/* New server, don't inc cb_s_break */
+#define AFS_SERVER_FL_NOT_READY	1		/* The record is not ready for use */
+#define AFS_SERVER_FL_NOT_FOUND	2		/* VL server says no such server */
+#define AFS_SERVER_FL_VL_FAIL	3		/* Failed to access VL server */
+#define AFS_SERVER_FL_UPDATING	4
+#define AFS_SERVER_FL_PROBED	5		/* The fileserver has been probed */
+#define AFS_SERVER_FL_PROBING	6		/* Fileserver is being probed */
 	atomic_t		usage;
-	time64_t		time_of_death;	/* time at which put reduced usage to 0 */
-	struct list_head	link;		/* link in cell volume location list */
-	struct list_head	grave;		/* link in master graveyard list */
-	struct list_head	update;		/* link in master update list */
-	struct afs_cell		*cell;		/* cell to which volume belongs */
-#ifdef CONFIG_AFS_FSCACHE
-	struct fscache_cookie	*cache;		/* caching cookie */
-#endif
-	struct afs_cache_vlocation vldb;	/* volume information DB record */
-	struct afs_volume	*vols[3];	/* volume access record pointer (index by type) */
-	wait_queue_head_t	waitq;		/* status change waitqueue */
-	time64_t		update_at;	/* time at which record should be updated */
-	spinlock_t		lock;		/* access lock */
-	afs_vlocation_state_t	state;		/* volume location state */
-	unsigned short		upd_rej_cnt;	/* ENOMEDIUM count during update */
-	unsigned short		upd_busy_cnt;	/* EBUSY count during update */
-	bool			valid;		/* T if valid */
+	u32			addr_version;	/* Address list version */
+
+	/* file service access */
+	rwlock_t		fs_lock;	/* access lock */
+
+	/* callback promise management */
+	struct list_head	cb_interests;	/* List of superblocks using this server */
+	unsigned		cb_s_break;	/* Break-everything counter. */
+	rwlock_t		cb_break_lock;	/* Volume finding lock */
 };
 
 /*
- * AFS fileserver record
+ * Interest by a superblock on a server.
  */
-struct afs_server {
-	atomic_t		usage;
-	time64_t		time_of_death;	/* time at which put reduced usage to 0 */
-	struct in_addr		addr;		/* server address */
-	struct afs_cell		*cell;		/* cell in which server resides */
-	struct list_head	link;		/* link in cell's server list */
-	struct list_head	grave;		/* link in master graveyard list */
-	struct rb_node		master_rb;	/* link in master by-addr tree */
-	struct rw_semaphore	sem;		/* access lock */
+struct afs_cb_interest {
+	struct list_head	cb_link;	/* Link in server->cb_interests */
+	struct afs_server	*server;	/* Server on which this interest resides */
+	struct super_block	*sb;		/* Superblock on which inodes reside */
+	afs_volid_t		vid;		/* Volume ID to match */
+	refcount_t		usage;
+};
 
-	/* file service access */
-	struct rb_root		fs_vnodes;	/* vnodes backed by this server (ordered by FID) */
-	unsigned long		fs_act_jif;	/* time at which last activity occurred */
-	unsigned long		fs_dead_jif;	/* time at which no longer to be considered dead */
-	spinlock_t		fs_lock;	/* access lock */
-	int			fs_state;      	/* 0 or reason FS currently marked dead (-errno) */
+/*
+ * Replaceable server list.
+ */
+struct afs_server_entry {
+	struct afs_server	*server;
+	struct afs_cb_interest	*cb_interest;
+};
 
-	/* callback promise management */
-	struct rb_root		cb_promises;	/* vnode expiration list (ordered earliest first) */
-	struct delayed_work	cb_updater;	/* callback updater */
-	struct delayed_work	cb_break_work;	/* collected break dispatcher */
-	wait_queue_head_t	cb_break_waitq;	/* space available in cb_break waitqueue */
-	spinlock_t		cb_lock;	/* access lock */
-	struct afs_callback	cb_break[64];	/* ring of callbacks awaiting breaking */
-	atomic_t		cb_break_n;	/* number of pending breaks */
-	u8			cb_break_head;	/* head of callback breaking ring */
-	u8			cb_break_tail;	/* tail of callback breaking ring */
+struct afs_server_list {
+	refcount_t		usage;
+	unsigned short		nr_servers;
+	unsigned short		index;		/* Server currently in use */
+	unsigned short		vnovol_mask;	/* Servers to be skipped due to VNOVOL */
+	unsigned int		seq;		/* Set to ->servers_seq when installed */
+	struct afs_server_entry	servers[];
 };
 
 /*
- * AFS volume access record
+ * Live AFS volume management.
  */
 struct afs_volume {
+	afs_volid_t		vid;		/* volume ID */
 	atomic_t		usage;
-	struct afs_cell		*cell;		/* cell to which belongs (unrefd ptr) */
-	struct afs_vlocation	*vlocation;	/* volume location */
+	time64_t		update_at;	/* Time at which to next update */
+	struct afs_cell		*cell;		/* Cell to which belongs (pins ref) */
+	struct list_head	proc_link;	/* Link in cell->vl_proc */
+	unsigned long		flags;
+#define AFS_VOLUME_NEEDS_UPDATE	0	/* - T if an update needs performing */
+#define AFS_VOLUME_UPDATING	1	/* - T if an update is in progress */
+#define AFS_VOLUME_WAIT		2	/* - T if users must wait for update */
+#define AFS_VOLUME_DELETED	3	/* - T if volume appears deleted */
+#define AFS_VOLUME_OFFLINE	4	/* - T if volume offline notice given */
+#define AFS_VOLUME_BUSY		5	/* - T if volume busy notice given */
 #ifdef CONFIG_AFS_FSCACHE
 	struct fscache_cookie	*cache;		/* caching cookie */
 #endif
-	afs_volid_t		vid;		/* volume ID */
+	struct afs_server_list	*servers;	/* List of servers on which volume resides */
+	rwlock_t		servers_lock;	/* Lock for ->servers */
+	unsigned int		servers_seq;	/* Incremented each time ->servers changes */
+
 	afs_voltype_t		type;		/* type of volume */
+	short			error;
 	char			type_force;	/* force volume type (suppress R/O -> R/W) */
-	unsigned short		nservers;	/* number of server slots filled */
-	unsigned short		rjservers;	/* number of servers discarded due to -ENOMEDIUM */
-	struct afs_server	*servers[8];	/* servers on which volume resides (ordered) */
-	struct rw_semaphore	server_sem;	/* lock for accessing current server */
-};
-
-/*
- * vnode catalogue entry
- */
-struct afs_cache_vnode {
-	afs_vnodeid_t		vnode_id;	/* vnode ID */
-	unsigned		vnode_unique;	/* vnode ID uniquifier */
-	afs_dataversion_t	data_version;	/* data version */
+	u8			name_len;
+	u8			name[AFS_MAXVOLNAME + 1]; /* NUL-padded volume name */
 };
 
 /*
@@ -336,24 +437,20 @@ struct afs_vnode {
 	struct inode		vfs_inode;	/* the VFS's inode record */
 
 	struct afs_volume	*volume;	/* volume on which vnode resides */
-	struct afs_server	*server;	/* server currently supplying this file */
 	struct afs_fid		fid;		/* the file identifier for this inode */
 	struct afs_file_status	status;		/* AFS status info for this file */
 #ifdef CONFIG_AFS_FSCACHE
 	struct fscache_cookie	*cache;		/* caching cookie */
 #endif
-	struct afs_permits	*permits;	/* cache of permits so far obtained */
-	struct mutex		permits_lock;	/* lock for altering permits list */
+	struct afs_permits	*permit_cache;	/* cache of permits so far obtained */
+	struct mutex		io_lock;	/* Lock for serialising I/O on this mutex */
 	struct mutex		validate_lock;	/* lock for validating this vnode */
-	wait_queue_head_t	update_waitq;	/* status fetch waitqueue */
-	int			update_cnt;	/* number of outstanding ops that will update the
-						 * status */
-	spinlock_t		writeback_lock;	/* lock for writebacks */
+	spinlock_t		wb_lock;	/* lock for wb_keys */
 	spinlock_t		lock;		/* waitqueue/flags lock */
 	unsigned long		flags;
-#define AFS_VNODE_CB_BROKEN	0		/* set if vnode's callback was broken */
+#define AFS_VNODE_CB_PROMISED	0		/* Set if vnode has a callback promise */
 #define AFS_VNODE_UNSET		1		/* set if vnode attributes not yet set */
-#define AFS_VNODE_MODIFIED	2		/* set if vnode's data modified */
+#define AFS_VNODE_DIR_MODIFIED	2		/* set if dir vnode's data modified */
 #define AFS_VNODE_ZAP_DATA	3		/* set if vnode's data should be invalidated */
 #define AFS_VNODE_DELETED	4		/* set if vnode deleted on server */
 #define AFS_VNODE_MOUNTPOINT	5		/* set if vnode is a mountpoint symlink */
@@ -364,24 +461,21 @@ struct afs_vnode {
 #define AFS_VNODE_AUTOCELL	10		/* set if Vnode is an auto mount point */
 #define AFS_VNODE_PSEUDODIR	11		/* set if Vnode is a pseudo directory */
 
-	long			acl_order;	/* ACL check count (callback break count) */
-
-	struct list_head	writebacks;	/* alterations in pagecache that need writing */
+	struct list_head	wb_keys;	/* List of keys available for writeback */
 	struct list_head	pending_locks;	/* locks waiting to be granted */
 	struct list_head	granted_locks;	/* locks granted on this file */
 	struct delayed_work	lock_work;	/* work to be done in locking */
 	struct key		*unlock_key;	/* key to be used in unlocking */
 
 	/* outstanding callback notification on this file */
-	struct rb_node		server_rb;	/* link in server->fs_vnodes */
-	struct rb_node		cb_promise;	/* link in server->cb_promises */
-	struct work_struct	cb_broken_work;	/* work to be done on callback break */
-	time64_t		cb_expires;	/* time at which callback expires */
-	time64_t		cb_expires_at;	/* time used to order cb_promise */
+	struct afs_cb_interest	*cb_interest;	/* Server on which this resides */
+	unsigned int		cb_s_break;	/* Mass break counter on ->server */
+	unsigned int		cb_break;	/* Break counter on vnode */
+	seqlock_t		cb_lock;	/* Lock for ->cb_interest, ->status, ->cb_*break */
+
+	time64_t		cb_expires_at;	/* time at which callback expires */
 	unsigned		cb_version;	/* callback version */
-	unsigned		cb_expiry;	/* callback expiry time */
 	afs_callback_type_t	cb_type;	/* type of callback */
-	bool			cb_promised;	/* true if promise still holds */
 };
 
 /*
@@ -389,16 +483,21 @@ struct afs_vnode {
  */
 struct afs_permit {
 	struct key		*key;		/* RxRPC ticket holding a security context */
-	afs_access_t		access_mask;	/* access mask for this key */
+	afs_access_t		access;		/* CallerAccess value for this key */
 };
 
 /*
- * cache of security records from attempts to access a vnode
+ * Immutable cache of CallerAccess records from attempts to access vnodes.
+ * These may be shared between multiple vnodes.
  */
 struct afs_permits {
-	struct rcu_head		rcu;		/* disposal procedure */
-	int			count;		/* number of records */
-	struct afs_permit	permits[0];	/* the permits so far examined */
+	struct rcu_head		rcu;
+	struct hlist_node	hash_node;	/* Link in hash */
+	unsigned long		h;		/* Hash value for this permit list */
+	refcount_t		usage;
+	unsigned short		nr_permits;	/* Number of records */
+	bool			invalidated;	/* Invalidated due to key change */
+	struct afs_permit	permits[];	/* List of permits sorted by key pointer */
 };
 
 /*
@@ -410,28 +509,78 @@ struct afs_interface {
 	unsigned	mtu;		/* MTU of interface */
 };
 
-struct afs_uuid {
-	__be32		time_low;			/* low part of timestamp */
-	__be16		time_mid;			/* mid part of timestamp */
-	__be16		time_hi_and_version;		/* high part of timestamp and version  */
-	__u8		clock_seq_hi_and_reserved;	/* clock seq hi and variant */
-	__u8		clock_seq_low;			/* clock seq low */
-	__u8		node[6];			/* spatially unique node ID (MAC addr) */
+/*
+ * Cursor for iterating over a server's address list.
+ */
+struct afs_addr_cursor {
+	struct afs_addr_list	*alist;		/* Current address list (pins ref) */
+	struct sockaddr_rxrpc	*addr;
+	u32			abort_code;
+	unsigned short		start;		/* Starting point in alist->addrs[] */
+	unsigned short		index;		/* Wrapping offset from start to current addr */
+	short			error;
+	bool			begun;		/* T if we've begun iteration */
+	bool			responded;	/* T if the current address responded */
+};
+
+/*
+ * Cursor for iterating over a set of fileservers.
+ */
+struct afs_fs_cursor {
+	struct afs_addr_cursor	ac;
+	struct afs_vnode	*vnode;
+	struct afs_server_list	*server_list;	/* Current server list (pins ref) */
+	struct afs_cb_interest	*cbi;		/* Server on which this resides (pins ref) */
+	struct key		*key;		/* Key for the server */
+	unsigned int		cb_break;	/* cb_break + cb_s_break before the call */
+	unsigned int		cb_break_2;	/* cb_break + cb_s_break (2nd vnode) */
+	unsigned char		start;		/* Initial index in server list */
+	unsigned char		index;		/* Number of servers tried beyond start */
+	unsigned short		flags;
+#define AFS_FS_CURSOR_STOP	0x0001		/* Set to cease iteration */
+#define AFS_FS_CURSOR_VBUSY	0x0002		/* Set if seen VBUSY */
+#define AFS_FS_CURSOR_VMOVED	0x0004		/* Set if seen VMOVED */
+#define AFS_FS_CURSOR_VNOVOL	0x0008		/* Set if seen VNOVOL */
+#define AFS_FS_CURSOR_CUR_ONLY	0x0010		/* Set if current server only (file lock held) */
+#define AFS_FS_CURSOR_NO_VSLEEP	0x0020		/* Set to prevent sleep on VBUSY, VOFFLINE, ... */
 };
 
+#include <trace/events/afs.h>
+
 /*****************************************************************************/
 /*
+ * addr_list.c
+ */
+static inline struct afs_addr_list *afs_get_addrlist(struct afs_addr_list *alist)
+{
+	if (alist)
+		refcount_inc(&alist->usage);
+	return alist;
+}
+extern struct afs_addr_list *afs_alloc_addrlist(unsigned int,
+						unsigned short,
+						unsigned short);
+extern void afs_put_addrlist(struct afs_addr_list *);
+extern struct afs_addr_list *afs_parse_text_addrs(const char *, size_t, char,
+						  unsigned short, unsigned short);
+extern struct afs_addr_list *afs_dns_query(struct afs_cell *, time64_t *);
+extern bool afs_iterate_addresses(struct afs_addr_cursor *);
+extern int afs_end_cursor(struct afs_addr_cursor *);
+extern int afs_set_vl_cursor(struct afs_addr_cursor *, struct afs_cell *);
+
+extern void afs_merge_fs_addr4(struct afs_addr_list *, __be32, u16);
+extern void afs_merge_fs_addr6(struct afs_addr_list *, __be32 *, u16);
+
+/*
  * cache.c
  */
 #ifdef CONFIG_AFS_FSCACHE
 extern struct fscache_netfs afs_cache_netfs;
 extern struct fscache_cookie_def afs_cell_cache_index_def;
-extern struct fscache_cookie_def afs_vlocation_cache_index_def;
 extern struct fscache_cookie_def afs_volume_cache_index_def;
 extern struct fscache_cookie_def afs_vnode_cache_index_def;
 #else
 #define afs_cell_cache_index_def	(*(struct fscache_cookie_def *) NULL)
-#define afs_vlocation_cache_index_def	(*(struct fscache_cookie_def *) NULL)
 #define afs_volume_cache_index_def	(*(struct fscache_cookie_def *) NULL)
 #define afs_vnode_cache_index_def	(*(struct fscache_cookie_def *) NULL)
 #endif
@@ -440,29 +589,31 @@ extern struct fscache_cookie_def afs_vnode_cache_index_def;
  * callback.c
  */
 extern void afs_init_callback_state(struct afs_server *);
-extern void afs_broken_callback_work(struct work_struct *);
-extern void afs_break_callbacks(struct afs_server *, size_t,
-				struct afs_callback[]);
-extern void afs_discard_callback_on_delete(struct afs_vnode *);
-extern void afs_give_up_callback(struct afs_vnode *);
-extern void afs_dispatch_give_up_callbacks(struct work_struct *);
-extern void afs_flush_callback_breaks(struct afs_server *);
-extern int __init afs_callback_update_init(void);
-extern void afs_callback_update_kill(void);
+extern void afs_break_callback(struct afs_vnode *);
+extern void afs_break_callbacks(struct afs_server *, size_t,struct afs_callback[]);
+
+extern int afs_register_server_cb_interest(struct afs_vnode *, struct afs_server_entry *);
+extern void afs_put_cb_interest(struct afs_net *, struct afs_cb_interest *);
+extern void afs_clear_callback_interests(struct afs_net *, struct afs_server_list *);
+
+static inline struct afs_cb_interest *afs_get_cb_interest(struct afs_cb_interest *cbi)
+{
+	refcount_inc(&cbi->usage);
+	return cbi;
+}
 
 /*
  * cell.c
  */
-extern struct rw_semaphore afs_proc_cells_sem;
-extern struct list_head afs_proc_cells;
-
-#define afs_get_cell(C) do { atomic_inc(&(C)->usage); } while(0)
-extern int afs_cell_init(char *);
-extern struct afs_cell *afs_cell_create(const char *, unsigned, char *, bool);
-extern struct afs_cell *afs_cell_lookup(const char *, unsigned, bool);
-extern struct afs_cell *afs_grab_cell(struct afs_cell *);
-extern void afs_put_cell(struct afs_cell *);
-extern void afs_cell_purge(void);
+extern int afs_cell_init(struct afs_net *, const char *);
+extern struct afs_cell *afs_lookup_cell_rcu(struct afs_net *, const char *, unsigned);
+extern struct afs_cell *afs_lookup_cell(struct afs_net *, const char *, unsigned,
+					const char *, bool);
+extern struct afs_cell *afs_get_cell(struct afs_cell *);
+extern void afs_put_cell(struct afs_net *, struct afs_cell *);
+extern void afs_manage_cells(struct work_struct *);
+extern void afs_cells_timer(struct timer_list *);
+extern void __net_exit afs_cell_purge(struct afs_net *);
 
 /*
  * cmservice.c
@@ -472,6 +623,7 @@ extern bool afs_cm_incoming_call(struct afs_call *);
 /*
  * dir.c
  */
+extern bool afs_dir_check_page(struct inode *, struct page *);
 extern const struct inode_operations afs_dir_inode_operations;
 extern const struct dentry_operations afs_fs_dentry_operations;
 extern const struct file_operations afs_dir_file_operations;
@@ -483,15 +635,19 @@ extern const struct address_space_operations afs_fs_aops;
 extern const struct inode_operations afs_file_inode_operations;
 extern const struct file_operations afs_file_operations;
 
+extern int afs_cache_wb_key(struct afs_vnode *, struct afs_file *);
+extern void afs_put_wb_key(struct afs_wb_key *);
 extern int afs_open(struct inode *, struct file *);
 extern int afs_release(struct inode *, struct file *);
+extern int afs_fetch_data(struct afs_vnode *, struct key *, struct afs_read *);
 extern int afs_page_filler(void *, struct page *);
 extern void afs_put_read(struct afs_read *);
 
 /*
  * flock.c
  */
-extern void __exit afs_kill_lock_manager(void);
+extern struct workqueue_struct *afs_lock_manager;
+
 extern void afs_lock_work(struct work_struct *);
 extern void afs_lock_may_be_available(struct afs_vnode *);
 extern int afs_lock(struct file *, int, struct file_lock *);
@@ -500,48 +656,40 @@ extern int afs_flock(struct file *, int, struct file_lock *);
 /*
  * fsclient.c
  */
-extern int afs_fs_fetch_file_status(struct afs_server *, struct key *,
-				    struct afs_vnode *, struct afs_volsync *,
-				    bool);
-extern int afs_fs_give_up_callbacks(struct afs_server *, bool);
-extern int afs_fs_fetch_data(struct afs_server *, struct key *,
-			     struct afs_vnode *, struct afs_read *, bool);
-extern int afs_fs_create(struct afs_server *, struct key *,
-			 struct afs_vnode *, const char *, umode_t,
-			 struct afs_fid *, struct afs_file_status *,
-			 struct afs_callback *, bool);
-extern int afs_fs_remove(struct afs_server *, struct key *,
-			 struct afs_vnode *, const char *, bool, bool);
-extern int afs_fs_link(struct afs_server *, struct key *, struct afs_vnode *,
-		       struct afs_vnode *, const char *, bool);
-extern int afs_fs_symlink(struct afs_server *, struct key *,
-			  struct afs_vnode *, const char *, const char *,
-			  struct afs_fid *, struct afs_file_status *, bool);
-extern int afs_fs_rename(struct afs_server *, struct key *,
-			 struct afs_vnode *, const char *,
-			 struct afs_vnode *, const char *, bool);
-extern int afs_fs_store_data(struct afs_server *, struct afs_writeback *,
-			     pgoff_t, pgoff_t, unsigned, unsigned, bool);
-extern int afs_fs_setattr(struct afs_server *, struct key *,
-			  struct afs_vnode *, struct iattr *, bool);
-extern int afs_fs_get_volume_status(struct afs_server *, struct key *,
-				    struct afs_vnode *,
-				    struct afs_volume_status *, bool);
-extern int afs_fs_set_lock(struct afs_server *, struct key *,
-			   struct afs_vnode *, afs_lock_type_t, bool);
-extern int afs_fs_extend_lock(struct afs_server *, struct key *,
-			      struct afs_vnode *, bool);
-extern int afs_fs_release_lock(struct afs_server *, struct key *,
-			       struct afs_vnode *, bool);
+extern int afs_fs_fetch_file_status(struct afs_fs_cursor *, struct afs_volsync *);
+extern int afs_fs_give_up_callbacks(struct afs_net *, struct afs_server *);
+extern int afs_fs_fetch_data(struct afs_fs_cursor *, struct afs_read *);
+extern int afs_fs_create(struct afs_fs_cursor *, const char *, umode_t,
+			 struct afs_fid *, struct afs_file_status *, struct afs_callback *);
+extern int afs_fs_remove(struct afs_fs_cursor *, const char *, bool);
+extern int afs_fs_link(struct afs_fs_cursor *, struct afs_vnode *, const char *);
+extern int afs_fs_symlink(struct afs_fs_cursor *, const char *, const char *,
+			  struct afs_fid *, struct afs_file_status *);
+extern int afs_fs_rename(struct afs_fs_cursor *, const char *,
+			 struct afs_vnode *, const char *);
+extern int afs_fs_store_data(struct afs_fs_cursor *, struct address_space *,
+			     pgoff_t, pgoff_t, unsigned, unsigned);
+extern int afs_fs_setattr(struct afs_fs_cursor *, struct iattr *);
+extern int afs_fs_get_volume_status(struct afs_fs_cursor *, struct afs_volume_status *);
+extern int afs_fs_set_lock(struct afs_fs_cursor *, afs_lock_type_t);
+extern int afs_fs_extend_lock(struct afs_fs_cursor *);
+extern int afs_fs_release_lock(struct afs_fs_cursor *);
+extern int afs_fs_give_up_all_callbacks(struct afs_net *, struct afs_server *,
+					struct afs_addr_cursor *, struct key *);
+extern int afs_fs_get_capabilities(struct afs_net *, struct afs_server *,
+				   struct afs_addr_cursor *, struct key *);
 
 /*
  * inode.c
  */
+extern int afs_fetch_status(struct afs_vnode *, struct key *);
+extern int afs_iget5_test(struct inode *, void *);
 extern struct inode *afs_iget_autocell(struct inode *, const char *, int,
 				       struct key *);
 extern struct inode *afs_iget(struct super_block *, struct key *,
 			      struct afs_fid *, struct afs_file_status *,
-			      struct afs_callback *);
+			      struct afs_callback *,
+			      struct afs_cb_interest *);
 extern void afs_zap_data(struct afs_vnode *);
 extern int afs_validate(struct afs_vnode *, struct key *);
 extern int afs_getattr(const struct path *, struct kstat *, u32, unsigned int);
@@ -553,7 +701,35 @@ extern int afs_drop_inode(struct inode *);
  * main.c
  */
 extern struct workqueue_struct *afs_wq;
-extern struct afs_uuid afs_uuid;
+
+static inline struct afs_net *afs_d2net(struct dentry *dentry)
+{
+	return &__afs_net;
+}
+
+static inline struct afs_net *afs_i2net(struct inode *inode)
+{
+	return &__afs_net;
+}
+
+static inline struct afs_net *afs_v2net(struct afs_vnode *vnode)
+{
+	return &__afs_net;
+}
+
+static inline struct afs_net *afs_sock2net(struct sock *sk)
+{
+	return &__afs_net;
+}
+
+static inline struct afs_net *afs_get_net(struct afs_net *net)
+{
+	return net;
+}
+
+static inline void afs_put_net(struct afs_net *net)
+{
+}
 
 /*
  * misc.c
@@ -578,23 +754,33 @@ extern int afs_get_ipv4_interfaces(struct afs_interface *, size_t, bool);
 /*
  * proc.c
  */
-extern int afs_proc_init(void);
-extern void afs_proc_cleanup(void);
-extern int afs_proc_cell_setup(struct afs_cell *);
-extern void afs_proc_cell_remove(struct afs_cell *);
+extern int __net_init afs_proc_init(struct afs_net *);
+extern void __net_exit afs_proc_cleanup(struct afs_net *);
+extern int afs_proc_cell_setup(struct afs_net *, struct afs_cell *);
+extern void afs_proc_cell_remove(struct afs_net *, struct afs_cell *);
+
+/*
+ * rotate.c
+ */
+extern bool afs_begin_vnode_operation(struct afs_fs_cursor *, struct afs_vnode *,
+				      struct key *);
+extern bool afs_select_fileserver(struct afs_fs_cursor *);
+extern bool afs_select_current_fileserver(struct afs_fs_cursor *);
+extern int afs_end_vnode_operation(struct afs_fs_cursor *);
 
 /*
  * rxrpc.c
  */
-extern struct socket *afs_socket;
-extern atomic_t afs_outstanding_calls;
+extern struct workqueue_struct *afs_async_calls;
 
-extern int afs_open_socket(void);
-extern void afs_close_socket(void);
+extern int __net_init afs_open_socket(struct afs_net *);
+extern void __net_exit afs_close_socket(struct afs_net *);
+extern void afs_charge_preallocation(struct work_struct *);
 extern void afs_put_call(struct afs_call *);
 extern int afs_queue_call_work(struct afs_call *);
-extern int afs_make_call(struct in_addr *, struct afs_call *, gfp_t, bool);
-extern struct afs_call *afs_alloc_flat_call(const struct afs_call_type *,
+extern long afs_make_call(struct afs_addr_cursor *, struct afs_call *, gfp_t, bool);
+extern struct afs_call *afs_alloc_flat_call(struct afs_net *,
+					    const struct afs_call_type *,
 					    size_t, size_t);
 extern void afs_flat_call_destructor(struct afs_call *);
 extern void afs_send_empty_reply(struct afs_call *);
@@ -606,117 +792,135 @@ static inline int afs_transfer_reply(struct afs_call *call)
 	return afs_extract_data(call, call->buffer, call->reply_max, false);
 }
 
+static inline bool afs_check_call_state(struct afs_call *call,
+					enum afs_call_state state)
+{
+	return READ_ONCE(call->state) == state;
+}
+
+static inline bool afs_set_call_state(struct afs_call *call,
+				      enum afs_call_state from,
+				      enum afs_call_state to)
+{
+	bool ok = false;
+
+	spin_lock_bh(&call->state_lock);
+	if (call->state == from) {
+		call->state = to;
+		trace_afs_call_state(call, from, to, 0, 0);
+		ok = true;
+	}
+	spin_unlock_bh(&call->state_lock);
+	return ok;
+}
+
+static inline void afs_set_call_complete(struct afs_call *call,
+					 int error, u32 remote_abort)
+{
+	enum afs_call_state state;
+	bool ok = false;
+
+	spin_lock_bh(&call->state_lock);
+	state = call->state;
+	if (state != AFS_CALL_COMPLETE) {
+		call->abort_code = remote_abort;
+		call->error = error;
+		call->state = AFS_CALL_COMPLETE;
+		trace_afs_call_state(call, state, AFS_CALL_COMPLETE,
+				     error, remote_abort);
+		ok = true;
+	}
+	spin_unlock_bh(&call->state_lock);
+	if (ok)
+		trace_afs_call_done(call);
+}
+
 /*
  * security.c
  */
+extern void afs_put_permits(struct afs_permits *);
 extern void afs_clear_permits(struct afs_vnode *);
-extern void afs_cache_permit(struct afs_vnode *, struct key *, long);
+extern void afs_cache_permit(struct afs_vnode *, struct key *, unsigned int);
 extern void afs_zap_permits(struct rcu_head *);
 extern struct key *afs_request_key(struct afs_cell *);
 extern int afs_permission(struct inode *, int);
+extern void __exit afs_clean_up_permit_cache(void);
 
 /*
  * server.c
  */
 extern spinlock_t afs_server_peer_lock;
 
-#define afs_get_server(S)					\
-do {								\
-	_debug("GET SERVER %d", atomic_read(&(S)->usage));	\
-	atomic_inc(&(S)->usage);				\
-} while(0)
+static inline struct afs_server *afs_get_server(struct afs_server *server)
+{
+	atomic_inc(&server->usage);
+	return server;
+}
 
-extern struct afs_server *afs_lookup_server(struct afs_cell *,
-					    const struct in_addr *);
-extern struct afs_server *afs_find_server(const struct sockaddr_rxrpc *);
-extern void afs_put_server(struct afs_server *);
-extern void __exit afs_purge_servers(void);
+extern struct afs_server *afs_find_server(struct afs_net *,
+					  const struct sockaddr_rxrpc *);
+extern struct afs_server *afs_find_server_by_uuid(struct afs_net *, const uuid_t *);
+extern struct afs_server *afs_lookup_server(struct afs_cell *, struct key *, const uuid_t *);
+extern void afs_put_server(struct afs_net *, struct afs_server *);
+extern void afs_manage_servers(struct work_struct *);
+extern void afs_servers_timer(struct timer_list *);
+extern void __net_exit afs_purge_servers(struct afs_net *);
+extern bool afs_probe_fileserver(struct afs_fs_cursor *);
+extern bool afs_check_server_record(struct afs_fs_cursor *, struct afs_server *);
 
 /*
- * super.c
+ * server_list.c
  */
-extern int afs_fs_init(void);
-extern void afs_fs_exit(void);
+static inline struct afs_server_list *afs_get_serverlist(struct afs_server_list *slist)
+{
+	refcount_inc(&slist->usage);
+	return slist;
+}
 
-/*
- * vlclient.c
- */
-extern int afs_vl_get_entry_by_name(struct in_addr *, struct key *,
-				    const char *, struct afs_cache_vlocation *,
-				    bool);
-extern int afs_vl_get_entry_by_id(struct in_addr *, struct key *,
-				  afs_volid_t, afs_voltype_t,
-				  struct afs_cache_vlocation *, bool);
+extern void afs_put_serverlist(struct afs_net *, struct afs_server_list *);
+extern struct afs_server_list *afs_alloc_server_list(struct afs_cell *, struct key *,
+						     struct afs_vldb_entry *,
+						     u8);
+extern bool afs_annotate_server_list(struct afs_server_list *, struct afs_server_list *);
 
 /*
- * vlocation.c
+ * super.c
  */
-#define afs_get_vlocation(V) do { atomic_inc(&(V)->usage); } while(0)
-
-extern int __init afs_vlocation_update_init(void);
-extern struct afs_vlocation *afs_vlocation_lookup(struct afs_cell *,
-						  struct key *,
-						  const char *, size_t);
-extern void afs_put_vlocation(struct afs_vlocation *);
-extern void afs_vlocation_purge(void);
+extern int __init afs_fs_init(void);
+extern void __exit afs_fs_exit(void);
 
 /*
- * vnode.c
+ * vlclient.c
  */
-static inline struct afs_vnode *AFS_FS_I(struct inode *inode)
-{
-	return container_of(inode, struct afs_vnode, vfs_inode);
-}
-
-static inline struct inode *AFS_VNODE_TO_I(struct afs_vnode *vnode)
-{
-	return &vnode->vfs_inode;
-}
-
-extern void afs_vnode_finalise_status_update(struct afs_vnode *,
-					     struct afs_server *);
-extern int afs_vnode_fetch_status(struct afs_vnode *, struct afs_vnode *,
-				  struct key *);
-extern int afs_vnode_fetch_data(struct afs_vnode *, struct key *,
-				struct afs_read *);
-extern int afs_vnode_create(struct afs_vnode *, struct key *, const char *,
-			    umode_t, struct afs_fid *, struct afs_file_status *,
-			    struct afs_callback *, struct afs_server **);
-extern int afs_vnode_remove(struct afs_vnode *, struct key *, const char *,
-			    bool);
-extern int afs_vnode_link(struct afs_vnode *, struct afs_vnode *, struct key *,
-			  const char *);
-extern int afs_vnode_symlink(struct afs_vnode *, struct key *, const char *,
-			     const char *, struct afs_fid *,
-			     struct afs_file_status *, struct afs_server **);
-extern int afs_vnode_rename(struct afs_vnode *, struct afs_vnode *,
-			    struct key *, const char *, const char *);
-extern int afs_vnode_store_data(struct afs_writeback *, pgoff_t, pgoff_t,
-				unsigned, unsigned);
-extern int afs_vnode_setattr(struct afs_vnode *, struct key *, struct iattr *);
-extern int afs_vnode_get_volume_status(struct afs_vnode *, struct key *,
-				       struct afs_volume_status *);
-extern int afs_vnode_set_lock(struct afs_vnode *, struct key *,
-			      afs_lock_type_t);
-extern int afs_vnode_extend_lock(struct afs_vnode *, struct key *);
-extern int afs_vnode_release_lock(struct afs_vnode *, struct key *);
+extern struct afs_vldb_entry *afs_vl_get_entry_by_name_u(struct afs_net *,
+							 struct afs_addr_cursor *,
+							 struct key *, const char *, int);
+extern struct afs_addr_list *afs_vl_get_addrs_u(struct afs_net *, struct afs_addr_cursor *,
+						struct key *, const uuid_t *);
+extern int afs_vl_get_capabilities(struct afs_net *, struct afs_addr_cursor *, struct key *);
+extern struct afs_addr_list *afs_yfsvl_get_endpoints(struct afs_net *, struct afs_addr_cursor *,
+						     struct key *, const uuid_t *);
 
 /*
  * volume.c
  */
-#define afs_get_volume(V) do { atomic_inc(&(V)->usage); } while(0)
+static inline struct afs_volume *__afs_get_volume(struct afs_volume *volume)
+{
+	if (volume)
+		atomic_inc(&volume->usage);
+	return volume;
+}
 
-extern void afs_put_volume(struct afs_volume *);
-extern struct afs_volume *afs_volume_lookup(struct afs_mount_params *);
-extern struct afs_server *afs_volume_pick_fileserver(struct afs_vnode *);
-extern int afs_volume_release_fileserver(struct afs_vnode *,
-					 struct afs_server *, int);
+extern struct afs_volume *afs_create_volume(struct afs_mount_params *);
+extern void afs_activate_volume(struct afs_volume *);
+extern void afs_deactivate_volume(struct afs_volume *);
+extern void afs_put_volume(struct afs_cell *, struct afs_volume *);
+extern int afs_check_volume_status(struct afs_volume *, struct key *);
 
 /*
  * write.c
  */
 extern int afs_set_page_dirty(struct page *);
-extern void afs_put_writeback(struct afs_writeback *);
 extern int afs_write_begin(struct file *file, struct address_space *mapping,
 			loff_t pos, unsigned len, unsigned flags,
 			struct page **pagep, void **fsdata);
@@ -727,9 +931,11 @@ extern int afs_writepage(struct page *, struct writeback_control *);
 extern int afs_writepages(struct address_space *, struct writeback_control *);
 extern void afs_pages_written_back(struct afs_vnode *, struct afs_call *);
 extern ssize_t afs_file_write(struct kiocb *, struct iov_iter *);
-extern int afs_writeback_all(struct afs_vnode *);
 extern int afs_flush(struct file *, fl_owner_t);
 extern int afs_fsync(struct file *, loff_t, loff_t, int);
+extern int afs_page_mkwrite(struct vm_fault *);
+extern void afs_prune_wb_keys(struct afs_vnode *);
+extern int afs_launder_page(struct page *);
 
 /*
  * xattr.c
@@ -737,12 +943,42 @@ extern int afs_fsync(struct file *, loff_t, loff_t, int);
 extern const struct xattr_handler *afs_xattr_handlers[];
 extern ssize_t afs_listxattr(struct dentry *, char *, size_t);
 
+
+/*
+ * Miscellaneous inline functions.
+ */
+static inline struct afs_vnode *AFS_FS_I(struct inode *inode)
+{
+	return container_of(inode, struct afs_vnode, vfs_inode);
+}
+
+static inline struct inode *AFS_VNODE_TO_I(struct afs_vnode *vnode)
+{
+	return &vnode->vfs_inode;
+}
+
+static inline void afs_vnode_commit_status(struct afs_fs_cursor *fc,
+					   struct afs_vnode *vnode,
+					   unsigned int cb_break)
+{
+	if (fc->ac.error == 0)
+		afs_cache_permit(vnode, fc->key, cb_break);
+}
+
+static inline void afs_check_for_remote_deletion(struct afs_fs_cursor *fc,
+						 struct afs_vnode *vnode)
+{
+	if (fc->ac.error == -ENOENT) {
+		set_bit(AFS_VNODE_DELETED, &vnode->flags);
+		afs_break_callback(vnode);
+	}
+}
+
+
 /*****************************************************************************/
 /*
  * debug tracing
  */
-#include <trace/events/afs.h>
-
 extern unsigned afs_debug;
 
 #define dbgprintk(FMT,...) \
diff --git a/fs/afs/main.c b/fs/afs/main.c
index 9944770849da..15a02a05ff40 100644
--- a/fs/afs/main.c
+++ b/fs/afs/main.c
@@ -31,57 +31,112 @@ static char *rootcell;
 module_param(rootcell, charp, 0);
 MODULE_PARM_DESC(rootcell, "root AFS cell name and VL server IP addr list");
 
-struct afs_uuid afs_uuid;
 struct workqueue_struct *afs_wq;
+struct afs_net __afs_net;
 
 /*
- * initialise the AFS client FS module
+ * Initialise an AFS network namespace record.
  */
-static int __init afs_init(void)
+static int __net_init afs_net_init(struct afs_net *net)
 {
 	int ret;
 
-	printk(KERN_INFO "kAFS: Red Hat AFS client v0.1 registering.\n");
+	net->live = true;
+	generate_random_uuid((unsigned char *)&net->uuid);
 
-	generate_random_uuid((unsigned char *)&afs_uuid);
+	INIT_WORK(&net->charge_preallocation_work, afs_charge_preallocation);
+	mutex_init(&net->socket_mutex);
 
-	/* create workqueue */
-	ret = -ENOMEM;
-	afs_wq = alloc_workqueue("afs", 0, 0);
-	if (!afs_wq)
-		return ret;
+	net->cells = RB_ROOT;
+	seqlock_init(&net->cells_lock);
+	INIT_WORK(&net->cells_manager, afs_manage_cells);
+	timer_setup(&net->cells_timer, afs_cells_timer, 0);
 
-	/* register the /proc stuff */
-	ret = afs_proc_init();
-	if (ret < 0)
-		goto error_proc;
+	spin_lock_init(&net->proc_cells_lock);
+	INIT_LIST_HEAD(&net->proc_cells);
 
-#ifdef CONFIG_AFS_FSCACHE
-	/* we want to be able to cache */
-	ret = fscache_register_netfs(&afs_cache_netfs);
+	seqlock_init(&net->fs_lock);
+	net->fs_servers = RB_ROOT;
+	INIT_LIST_HEAD(&net->fs_updates);
+	INIT_HLIST_HEAD(&net->fs_proc);
+
+	INIT_HLIST_HEAD(&net->fs_addresses4);
+	INIT_HLIST_HEAD(&net->fs_addresses6);
+	seqlock_init(&net->fs_addr_lock);
+
+	INIT_WORK(&net->fs_manager, afs_manage_servers);
+	timer_setup(&net->fs_timer, afs_servers_timer, 0);
+
+	/* Register the /proc stuff */
+	ret = afs_proc_init(net);
 	if (ret < 0)
-		goto error_cache;
-#endif
+		goto error_proc;
 
-	/* initialise the cell DB */
-	ret = afs_cell_init(rootcell);
+	/* Initialise the cell DB */
+	ret = afs_cell_init(net, rootcell);
 	if (ret < 0)
 		goto error_cell_init;
 
-	/* initialise the VL update process */
-	ret = afs_vlocation_update_init();
+	/* Create the RxRPC transport */
+	ret = afs_open_socket(net);
 	if (ret < 0)
-		goto error_vl_update_init;
+		goto error_open_socket;
 
-	/* initialise the callback update process */
-	ret = afs_callback_update_init();
+	return 0;
+
+error_open_socket:
+	net->live = false;
+	afs_cell_purge(net);
+	afs_purge_servers(net);
+error_cell_init:
+	net->live = false;
+	afs_proc_cleanup(net);
+error_proc:
+	net->live = false;
+	return ret;
+}
+
+/*
+ * Clean up and destroy an AFS network namespace record.
+ */
+static void __net_exit afs_net_exit(struct afs_net *net)
+{
+	net->live = false;
+	afs_cell_purge(net);
+	afs_purge_servers(net);
+	afs_close_socket(net);
+	afs_proc_cleanup(net);
+}
+
+/*
+ * initialise the AFS client FS module
+ */
+static int __init afs_init(void)
+{
+	int ret = -ENOMEM;
+
+	printk(KERN_INFO "kAFS: Red Hat AFS client v0.1 registering.\n");
+
+	afs_wq = alloc_workqueue("afs", 0, 0);
+	if (!afs_wq)
+		goto error_afs_wq;
+	afs_async_calls = alloc_workqueue("kafsd", WQ_MEM_RECLAIM, 0);
+	if (!afs_async_calls)
+		goto error_async;
+	afs_lock_manager = alloc_workqueue("kafs_lockd", WQ_MEM_RECLAIM, 0);
+	if (!afs_lock_manager)
+		goto error_lockmgr;
+
+#ifdef CONFIG_AFS_FSCACHE
+	/* we want to be able to cache */
+	ret = fscache_register_netfs(&afs_cache_netfs);
 	if (ret < 0)
-		goto error_callback_update_init;
+		goto error_cache;
+#endif
 
-	/* create the RxRPC transport */
-	ret = afs_open_socket();
+	ret = afs_net_init(&__afs_net);
 	if (ret < 0)
-		goto error_open_socket;
+		goto error_net;
 
 	/* register the filesystems */
 	ret = afs_fs_init();
@@ -91,21 +146,18 @@ static int __init afs_init(void)
 	return ret;
 
 error_fs:
-	afs_close_socket();
-error_open_socket:
-	afs_callback_update_kill();
-error_callback_update_init:
-	afs_vlocation_purge();
-error_vl_update_init:
-	afs_cell_purge();
-error_cell_init:
+	afs_net_exit(&__afs_net);
+error_net:
 #ifdef CONFIG_AFS_FSCACHE
 	fscache_unregister_netfs(&afs_cache_netfs);
 error_cache:
 #endif
-	afs_proc_cleanup();
-error_proc:
+	destroy_workqueue(afs_lock_manager);
+error_lockmgr:
+	destroy_workqueue(afs_async_calls);
+error_async:
 	destroy_workqueue(afs_wq);
+error_afs_wq:
 	rcu_barrier();
 	printk(KERN_ERR "kAFS: failed to register: %d\n", ret);
 	return ret;
@@ -124,17 +176,14 @@ static void __exit afs_exit(void)
 	printk(KERN_INFO "kAFS: Red Hat AFS client v0.1 unregistering.\n");
 
 	afs_fs_exit();
-	afs_kill_lock_manager();
-	afs_close_socket();
-	afs_purge_servers();
-	afs_callback_update_kill();
-	afs_vlocation_purge();
-	destroy_workqueue(afs_wq);
-	afs_cell_purge();
+	afs_net_exit(&__afs_net);
 #ifdef CONFIG_AFS_FSCACHE
 	fscache_unregister_netfs(&afs_cache_netfs);
 #endif
-	afs_proc_cleanup();
+	destroy_workqueue(afs_lock_manager);
+	destroy_workqueue(afs_async_calls);
+	destroy_workqueue(afs_wq);
+	afs_clean_up_permit_cache();
 	rcu_barrier();
 }
 
diff --git a/fs/afs/misc.c b/fs/afs/misc.c
index c05f1f1c0d41..700a5fa7f4ec 100644
--- a/fs/afs/misc.c
+++ b/fs/afs/misc.c
@@ -21,12 +21,12 @@
 int afs_abort_to_error(u32 abort_code)
 {
 	switch (abort_code) {
-	/* low errno codes inserted into abort namespace */
+		/* Low errno codes inserted into abort namespace */
 	case 13:		return -EACCES;
 	case 27:		return -EFBIG;
 	case 30:		return -EROFS;
 
-	/* VICE "special error" codes; 101 - 111 */
+		/* VICE "special error" codes; 101 - 111 */
 	case VSALVAGE:		return -EIO;
 	case VNOVNODE:		return -ENOENT;
 	case VNOVOL:		return -ENOMEDIUM;
@@ -39,7 +39,37 @@ int afs_abort_to_error(u32 abort_code)
 	case VBUSY:		return -EBUSY;
 	case VMOVED:		return -ENXIO;
 
-	/* Unified AFS error table; ET "uae" == 0x2f6df00 */
+		/* Volume Location server errors */
+	case AFSVL_IDEXIST:		return -EEXIST;
+	case AFSVL_IO:			return -EREMOTEIO;
+	case AFSVL_NAMEEXIST:		return -EEXIST;
+	case AFSVL_CREATEFAIL:		return -EREMOTEIO;
+	case AFSVL_NOENT:		return -ENOMEDIUM;
+	case AFSVL_EMPTY:		return -ENOMEDIUM;
+	case AFSVL_ENTDELETED:		return -ENOMEDIUM;
+	case AFSVL_BADNAME:		return -EINVAL;
+	case AFSVL_BADINDEX:		return -EINVAL;
+	case AFSVL_BADVOLTYPE:		return -EINVAL;
+	case AFSVL_BADSERVER:		return -EINVAL;
+	case AFSVL_BADPARTITION:	return -EINVAL;
+	case AFSVL_REPSFULL:		return -EFBIG;
+	case AFSVL_NOREPSERVER:		return -ENOENT;
+	case AFSVL_DUPREPSERVER:	return -EEXIST;
+	case AFSVL_RWNOTFOUND:		return -ENOENT;
+	case AFSVL_BADREFCOUNT:		return -EINVAL;
+	case AFSVL_SIZEEXCEEDED:	return -EINVAL;
+	case AFSVL_BADENTRY:		return -EINVAL;
+	case AFSVL_BADVOLIDBUMP:	return -EINVAL;
+	case AFSVL_IDALREADYHASHED:	return -EINVAL;
+	case AFSVL_ENTRYLOCKED:		return -EBUSY;
+	case AFSVL_BADVOLOPER:		return -EBADRQC;
+	case AFSVL_BADRELLOCKTYPE:	return -EINVAL;
+	case AFSVL_RERELEASE:		return -EREMOTEIO;
+	case AFSVL_BADSERVERFLAG:	return -EINVAL;
+	case AFSVL_PERM:		return -EACCES;
+	case AFSVL_NOMEM:		return -EREMOTEIO;
+
+		/* Unified AFS error table; ET "uae" == 0x2f6df00 */
 	case 0x2f6df00:		return -EPERM;
 	case 0x2f6df01:		return -ENOENT;
 	case 0x2f6df04:		return -EIO;
@@ -68,7 +98,7 @@ int afs_abort_to_error(u32 abort_code)
 	case 0x2f6df6c:		return -ETIMEDOUT;
 	case 0x2f6df78:		return -EDQUOT;
 
-	/* RXKAD abort codes; from include/rxrpc/packet.h.  ET "RXK" == 0x1260B00 */
+		/* RXKAD abort codes; from include/rxrpc/packet.h.  ET "RXK" == 0x1260B00 */
 	case RXKADINCONSISTENCY: return -EPROTO;
 	case RXKADPACKETSHORT:	return -EPROTO;
 	case RXKADLEVELFAIL:	return -EKEYREJECTED;
diff --git a/fs/afs/proc.c b/fs/afs/proc.c
index 35efb9a31dd7..4508dd54f789 100644
--- a/fs/afs/proc.c
+++ b/fs/afs/proc.c
@@ -17,8 +17,15 @@
 #include <linux/uaccess.h>
 #include "internal.h"
 
-static struct proc_dir_entry *proc_afs;
+static inline struct afs_net *afs_proc2net(struct file *f)
+{
+	return &__afs_net;
+}
 
+static inline struct afs_net *afs_seq2net(struct seq_file *m)
+{
+	return &__afs_net; // TODO: use seq_file_net(m)
+}
 
 static int afs_proc_cells_open(struct inode *inode, struct file *file);
 static void *afs_proc_cells_start(struct seq_file *p, loff_t *pos);
@@ -98,22 +105,22 @@ static const struct file_operations afs_proc_cell_vlservers_fops = {
 	.release	= seq_release,
 };
 
-static int afs_proc_cell_servers_open(struct inode *inode, struct file *file);
-static void *afs_proc_cell_servers_start(struct seq_file *p, loff_t *pos);
-static void *afs_proc_cell_servers_next(struct seq_file *p, void *v,
+static int afs_proc_servers_open(struct inode *inode, struct file *file);
+static void *afs_proc_servers_start(struct seq_file *p, loff_t *pos);
+static void *afs_proc_servers_next(struct seq_file *p, void *v,
 					loff_t *pos);
-static void afs_proc_cell_servers_stop(struct seq_file *p, void *v);
-static int afs_proc_cell_servers_show(struct seq_file *m, void *v);
-
-static const struct seq_operations afs_proc_cell_servers_ops = {
-	.start	= afs_proc_cell_servers_start,
-	.next	= afs_proc_cell_servers_next,
-	.stop	= afs_proc_cell_servers_stop,
-	.show	= afs_proc_cell_servers_show,
+static void afs_proc_servers_stop(struct seq_file *p, void *v);
+static int afs_proc_servers_show(struct seq_file *m, void *v);
+
+static const struct seq_operations afs_proc_servers_ops = {
+	.start	= afs_proc_servers_start,
+	.next	= afs_proc_servers_next,
+	.stop	= afs_proc_servers_stop,
+	.show	= afs_proc_servers_show,
 };
 
-static const struct file_operations afs_proc_cell_servers_fops = {
-	.open		= afs_proc_cell_servers_open,
+static const struct file_operations afs_proc_servers_fops = {
+	.open		= afs_proc_servers_open,
 	.read		= seq_read,
 	.llseek		= seq_lseek,
 	.release	= seq_release,
@@ -122,23 +129,24 @@ static const struct file_operations afs_proc_cell_servers_fops = {
 /*
  * initialise the /proc/fs/afs/ directory
  */
-int afs_proc_init(void)
+int afs_proc_init(struct afs_net *net)
 {
 	_enter("");
 
-	proc_afs = proc_mkdir("fs/afs", NULL);
-	if (!proc_afs)
+	net->proc_afs = proc_mkdir("fs/afs", NULL);
+	if (!net->proc_afs)
 		goto error_dir;
 
-	if (!proc_create("cells", 0644, proc_afs, &afs_proc_cells_fops) ||
-	    !proc_create("rootcell", 0644, proc_afs, &afs_proc_rootcell_fops))
+	if (!proc_create("cells", 0644, net->proc_afs, &afs_proc_cells_fops) ||
+	    !proc_create("rootcell", 0644, net->proc_afs, &afs_proc_rootcell_fops) ||
+	    !proc_create("servers", 0644, net->proc_afs, &afs_proc_servers_fops))
 		goto error_tree;
 
 	_leave(" = 0");
 	return 0;
 
 error_tree:
-	remove_proc_subtree("fs/afs", NULL);
+	proc_remove(net->proc_afs);
 error_dir:
 	_leave(" = -ENOMEM");
 	return -ENOMEM;
@@ -147,9 +155,10 @@ error_dir:
 /*
  * clean up the /proc/fs/afs/ directory
  */
-void afs_proc_cleanup(void)
+void afs_proc_cleanup(struct afs_net *net)
 {
-	remove_proc_subtree("fs/afs", NULL);
+	proc_remove(net->proc_afs);
+	net->proc_afs = NULL;
 }
 
 /*
@@ -166,7 +175,6 @@ static int afs_proc_cells_open(struct inode *inode, struct file *file)
 
 	m = file->private_data;
 	m->private = PDE_DATA(inode);
-
 	return 0;
 }
 
@@ -176,25 +184,28 @@ static int afs_proc_cells_open(struct inode *inode, struct file *file)
  */
 static void *afs_proc_cells_start(struct seq_file *m, loff_t *_pos)
 {
-	/* lock the list against modification */
-	down_read(&afs_proc_cells_sem);
-	return seq_list_start_head(&afs_proc_cells, *_pos);
+	struct afs_net *net = afs_seq2net(m);
+
+	rcu_read_lock();
+	return seq_list_start_head(&net->proc_cells, *_pos);
 }
 
 /*
  * move to next cell in cells list
  */
-static void *afs_proc_cells_next(struct seq_file *p, void *v, loff_t *pos)
+static void *afs_proc_cells_next(struct seq_file *m, void *v, loff_t *pos)
 {
-	return seq_list_next(v, &afs_proc_cells, pos);
+	struct afs_net *net = afs_seq2net(m);
+
+	return seq_list_next(v, &net->proc_cells, pos);
 }
 
 /*
  * clean up after reading from the cells list
  */
-static void afs_proc_cells_stop(struct seq_file *p, void *v)
+static void afs_proc_cells_stop(struct seq_file *m, void *v)
 {
-	up_read(&afs_proc_cells_sem);
+	rcu_read_unlock();
 }
 
 /*
@@ -203,16 +214,16 @@ static void afs_proc_cells_stop(struct seq_file *p, void *v)
 static int afs_proc_cells_show(struct seq_file *m, void *v)
 {
 	struct afs_cell *cell = list_entry(v, struct afs_cell, proc_link);
+	struct afs_net *net = afs_seq2net(m);
 
-	if (v == &afs_proc_cells) {
+	if (v == &net->proc_cells) {
 		/* display header on line 1 */
 		seq_puts(m, "USE NAME\n");
 		return 0;
 	}
 
 	/* display one cell per line on subsequent lines */
-	seq_printf(m, "%3d %s\n",
-		   atomic_read(&cell->usage), cell->name);
+	seq_printf(m, "%3u %s\n", atomic_read(&cell->usage), cell->name);
 	return 0;
 }
 
@@ -223,6 +234,7 @@ static int afs_proc_cells_show(struct seq_file *m, void *v)
 static ssize_t afs_proc_cells_write(struct file *file, const char __user *buf,
 				    size_t size, loff_t *_pos)
 {
+	struct afs_net *net = afs_proc2net(file);
 	char *kbuf, *name, *args;
 	int ret;
 
@@ -264,13 +276,13 @@ static ssize_t afs_proc_cells_write(struct file *file, const char __user *buf,
 	if (strcmp(kbuf, "add") == 0) {
 		struct afs_cell *cell;
 
-		cell = afs_cell_create(name, strlen(name), args, false);
+		cell = afs_lookup_cell(net, name, strlen(name), args, true);
 		if (IS_ERR(cell)) {
 			ret = PTR_ERR(cell);
 			goto done;
 		}
 
-		afs_put_cell(cell);
+		set_bit(AFS_CELL_FL_NO_GC, &cell->flags);
 		printk("kAFS: Added new cell '%s'\n", name);
 	} else {
 		goto inval;
@@ -303,6 +315,7 @@ static ssize_t afs_proc_rootcell_write(struct file *file,
 				       const char __user *buf,
 				       size_t size, loff_t *_pos)
 {
+	struct afs_net *net = afs_proc2net(file);
 	char *kbuf, *s;
 	int ret;
 
@@ -322,7 +335,7 @@ static ssize_t afs_proc_rootcell_write(struct file *file,
 	/* determine command to perform */
 	_debug("rootcell=%s", kbuf);
 
-	ret = afs_cell_init(kbuf);
+	ret = afs_cell_init(net, kbuf);
 	if (ret >= 0)
 		ret = size;	/* consume everything, always */
 
@@ -334,29 +347,27 @@ static ssize_t afs_proc_rootcell_write(struct file *file,
 /*
  * initialise /proc/fs/afs/<cell>/
  */
-int afs_proc_cell_setup(struct afs_cell *cell)
+int afs_proc_cell_setup(struct afs_net *net, struct afs_cell *cell)
 {
 	struct proc_dir_entry *dir;
 
-	_enter("%p{%s}", cell, cell->name);
+	_enter("%p{%s},%p", cell, cell->name, net->proc_afs);
 
-	dir = proc_mkdir(cell->name, proc_afs);
+	dir = proc_mkdir(cell->name, net->proc_afs);
 	if (!dir)
 		goto error_dir;
 
-	if (!proc_create_data("servers", 0, dir,
-			     &afs_proc_cell_servers_fops, cell) ||
-	    !proc_create_data("vlservers", 0, dir,
-			     &afs_proc_cell_vlservers_fops, cell) ||
+	if (!proc_create_data("vlservers", 0, dir,
+			      &afs_proc_cell_vlservers_fops, cell) ||
 	    !proc_create_data("volumes", 0, dir,
-			     &afs_proc_cell_volumes_fops, cell))
+			      &afs_proc_cell_volumes_fops, cell))
 		goto error_tree;
 
 	_leave(" = 0");
 	return 0;
 
 error_tree:
-	remove_proc_subtree(cell->name, proc_afs);
+	remove_proc_subtree(cell->name, net->proc_afs);
 error_dir:
 	_leave(" = -ENOMEM");
 	return -ENOMEM;
@@ -365,11 +376,11 @@ error_dir:
 /*
  * remove /proc/fs/afs/<cell>/
  */
-void afs_proc_cell_remove(struct afs_cell *cell)
+void afs_proc_cell_remove(struct afs_net *net, struct afs_cell *cell)
 {
 	_enter("");
 
-	remove_proc_subtree(cell->name, proc_afs);
+	remove_proc_subtree(cell->name, net->proc_afs);
 
 	_leave("");
 }
@@ -407,9 +418,8 @@ static void *afs_proc_cell_volumes_start(struct seq_file *m, loff_t *_pos)
 
 	_enter("cell=%p pos=%Ld", cell, *_pos);
 
-	/* lock the list against modification */
-	down_read(&cell->vl_sem);
-	return seq_list_start_head(&cell->vl_list, *_pos);
+	read_lock(&cell->proc_lock);
+	return seq_list_start_head(&cell->proc_volumes, *_pos);
 }
 
 /*
@@ -421,7 +431,7 @@ static void *afs_proc_cell_volumes_next(struct seq_file *p, void *v,
 	struct afs_cell *cell = p->private;
 
 	_enter("cell=%p pos=%Ld", cell, *_pos);
-	return seq_list_next(v, &cell->vl_list, _pos);
+	return seq_list_next(v, &cell->proc_volumes, _pos);
 }
 
 /*
@@ -431,17 +441,13 @@ static void afs_proc_cell_volumes_stop(struct seq_file *p, void *v)
 {
 	struct afs_cell *cell = p->private;
 
-	up_read(&cell->vl_sem);
+	read_unlock(&cell->proc_lock);
 }
 
-static const char afs_vlocation_states[][4] = {
-	[AFS_VL_NEW]			= "New",
-	[AFS_VL_CREATING]		= "Crt",
-	[AFS_VL_VALID]			= "Val",
-	[AFS_VL_NO_VOLUME]		= "NoV",
-	[AFS_VL_UPDATING]		= "Upd",
-	[AFS_VL_VOLUME_DELETED]		= "Del",
-	[AFS_VL_UNCERTAIN]		= "Unc",
+static const char afs_vol_types[3][3] = {
+	[AFSVL_RWVOL]	= "RW",
+	[AFSVL_ROVOL]	= "RO",
+	[AFSVL_BACKVOL]	= "BK",
 };
 
 /*
@@ -450,23 +456,17 @@ static const char afs_vlocation_states[][4] = {
 static int afs_proc_cell_volumes_show(struct seq_file *m, void *v)
 {
 	struct afs_cell *cell = m->private;
-	struct afs_vlocation *vlocation =
-		list_entry(v, struct afs_vlocation, link);
+	struct afs_volume *vol = list_entry(v, struct afs_volume, proc_link);
 
-	/* display header on line 1 */
-	if (v == &cell->vl_list) {
-		seq_puts(m, "USE STT VLID[0]  VLID[1]  VLID[2]  NAME\n");
+	/* Display header on line 1 */
+	if (v == &cell->proc_volumes) {
+		seq_puts(m, "USE VID      TY\n");
 		return 0;
 	}
 
-	/* display one cell per line on subsequent lines */
-	seq_printf(m, "%3d %s %08x %08x %08x %s\n",
-		   atomic_read(&vlocation->usage),
-		   afs_vlocation_states[vlocation->state],
-		   vlocation->vldb.vid[0],
-		   vlocation->vldb.vid[1],
-		   vlocation->vldb.vid[2],
-		   vlocation->vldb.name);
+	seq_printf(m, "%3d %08x %s\n",
+		   atomic_read(&vol->usage), vol->vid,
+		   afs_vol_types[vol->type]);
 
 	return 0;
 }
@@ -501,23 +501,23 @@ static int afs_proc_cell_vlservers_open(struct inode *inode, struct file *file)
  */
 static void *afs_proc_cell_vlservers_start(struct seq_file *m, loff_t *_pos)
 {
+	struct afs_addr_list *alist;
 	struct afs_cell *cell = m->private;
 	loff_t pos = *_pos;
 
-	_enter("cell=%p pos=%Ld", cell, *_pos);
+	rcu_read_lock();
 
-	/* lock the list against modification */
-	down_read(&cell->vl_sem);
+	alist = rcu_dereference(cell->vl_addrs);
 
 	/* allow for the header line */
 	if (!pos)
 		return (void *) 1;
 	pos--;
 
-	if (pos >= cell->vl_naddrs)
+	if (!alist || pos >= alist->nr_addrs)
 		return NULL;
 
-	return &cell->vl_addrs[pos];
+	return alist->addrs + pos;
 }
 
 /*
@@ -526,17 +526,18 @@ static void *afs_proc_cell_vlservers_start(struct seq_file *m, loff_t *_pos)
 static void *afs_proc_cell_vlservers_next(struct seq_file *p, void *v,
 					  loff_t *_pos)
 {
+	struct afs_addr_list *alist;
 	struct afs_cell *cell = p->private;
 	loff_t pos;
 
-	_enter("cell=%p{nad=%u} pos=%Ld", cell, cell->vl_naddrs, *_pos);
+	alist = rcu_dereference(cell->vl_addrs);
 
 	pos = *_pos;
 	(*_pos)++;
-	if (pos >= cell->vl_naddrs)
+	if (!alist || pos >= alist->nr_addrs)
 		return NULL;
 
-	return &cell->vl_addrs[pos];
+	return alist->addrs + pos;
 }
 
 /*
@@ -544,9 +545,7 @@ static void *afs_proc_cell_vlservers_next(struct seq_file *p, void *v,
  */
 static void afs_proc_cell_vlservers_stop(struct seq_file *p, void *v)
 {
-	struct afs_cell *cell = p->private;
-
-	up_read(&cell->vl_sem);
+	rcu_read_unlock();
 }
 
 /*
@@ -554,100 +553,76 @@ static void afs_proc_cell_vlservers_stop(struct seq_file *p, void *v)
  */
 static int afs_proc_cell_vlservers_show(struct seq_file *m, void *v)
 {
-	struct in_addr *addr = v;
+	struct sockaddr_rxrpc *addr = v;
 
 	/* display header on line 1 */
-	if (v == (struct in_addr *) 1) {
+	if (v == (void *)1) {
 		seq_puts(m, "ADDRESS\n");
 		return 0;
 	}
 
 	/* display one cell per line on subsequent lines */
-	seq_printf(m, "%pI4\n", &addr->s_addr);
+	seq_printf(m, "%pISp\n", &addr->transport);
 	return 0;
 }
 
 /*
- * open "/proc/fs/afs/<cell>/servers" which provides a summary of active
+ * open "/proc/fs/afs/servers" which provides a summary of active
  * servers
  */
-static int afs_proc_cell_servers_open(struct inode *inode, struct file *file)
+static int afs_proc_servers_open(struct inode *inode, struct file *file)
 {
-	struct afs_cell *cell;
-	struct seq_file *m;
-	int ret;
-
-	cell = PDE_DATA(inode);
-	if (!cell)
-		return -ENOENT;
-
-	ret = seq_open(file, &afs_proc_cell_servers_ops);
-	if (ret < 0)
-		return ret;
-
-	m = file->private_data;
-	m->private = cell;
-	return 0;
+	return seq_open(file, &afs_proc_servers_ops);
 }
 
 /*
- * set up the iterator to start reading from the cells list and return the
- * first item
+ * Set up the iterator to start reading from the server list and return the
+ * first item.
  */
-static void *afs_proc_cell_servers_start(struct seq_file *m, loff_t *_pos)
-	__acquires(m->private->servers_lock)
+static void *afs_proc_servers_start(struct seq_file *m, loff_t *_pos)
 {
-	struct afs_cell *cell = m->private;
-
-	_enter("cell=%p pos=%Ld", cell, *_pos);
+	struct afs_net *net = afs_seq2net(m);
 
-	/* lock the list against modification */
-	read_lock(&cell->servers_lock);
-	return seq_list_start_head(&cell->servers, *_pos);
+	rcu_read_lock();
+	return seq_hlist_start_head_rcu(&net->fs_proc, *_pos);
 }
 
 /*
  * move to next cell in cells list
  */
-static void *afs_proc_cell_servers_next(struct seq_file *p, void *v,
-					loff_t *_pos)
+static void *afs_proc_servers_next(struct seq_file *m, void *v, loff_t *_pos)
 {
-	struct afs_cell *cell = p->private;
+	struct afs_net *net = afs_seq2net(m);
 
-	_enter("cell=%p pos=%Ld", cell, *_pos);
-	return seq_list_next(v, &cell->servers, _pos);
+	return seq_hlist_next_rcu(v, &net->fs_proc, _pos);
 }
 
 /*
  * clean up after reading from the cells list
  */
-static void afs_proc_cell_servers_stop(struct seq_file *p, void *v)
-	__releases(p->private->servers_lock)
+static void afs_proc_servers_stop(struct seq_file *p, void *v)
 {
-	struct afs_cell *cell = p->private;
-
-	read_unlock(&cell->servers_lock);
+	rcu_read_unlock();
 }
 
 /*
  * display a header line followed by a load of volume lines
  */
-static int afs_proc_cell_servers_show(struct seq_file *m, void *v)
+static int afs_proc_servers_show(struct seq_file *m, void *v)
 {
-	struct afs_cell *cell = m->private;
-	struct afs_server *server = list_entry(v, struct afs_server, link);
-	char ipaddr[20];
+	struct afs_server *server;
+	struct afs_addr_list *alist;
 
-	/* display header on line 1 */
-	if (v == &cell->servers) {
-		seq_puts(m, "USE ADDR            STATE\n");
+	if (v == SEQ_START_TOKEN) {
+		seq_puts(m, "UUID                                 USE ADDR\n");
 		return 0;
 	}
 
-	/* display one cell per line on subsequent lines */
-	sprintf(ipaddr, "%pI4", &server->addr);
-	seq_printf(m, "%3d %-15.15s %5d\n",
-		   atomic_read(&server->usage), ipaddr, server->fs_state);
-
+	server = list_entry(v, struct afs_server, proc_link);
+	alist = rcu_dereference(server->addresses);
+	seq_printf(m, "%pU %3d %pISp\n",
+		   &server->uuid,
+		   atomic_read(&server->usage),
+		   &alist->addrs[alist->index].transport);
 	return 0;
 }
diff --git a/fs/afs/rotate.c b/fs/afs/rotate.c
new file mode 100644
index 000000000000..e728ca1776c9
--- /dev/null
+++ b/fs/afs/rotate.c
@@ -0,0 +1,715 @@
+/* Handle fileserver selection and rotation.
+ *
+ * Copyright (C) 2017 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/sched.h>
+#include <linux/delay.h>
+#include <linux/sched/signal.h>
+#include "internal.h"
+#include "afs_fs.h"
+
+/*
+ * Initialise a filesystem server cursor for iterating over FS servers.
+ */
+void afs_init_fs_cursor(struct afs_fs_cursor *fc, struct afs_vnode *vnode)
+{
+	memset(fc, 0, sizeof(*fc));
+}
+
+/*
+ * Begin an operation on the fileserver.
+ *
+ * Fileserver operations are serialised on the server by vnode, so we serialise
+ * them here also using the io_lock.
+ */
+bool afs_begin_vnode_operation(struct afs_fs_cursor *fc, struct afs_vnode *vnode,
+			       struct key *key)
+{
+	afs_init_fs_cursor(fc, vnode);
+	fc->vnode = vnode;
+	fc->key = key;
+	fc->ac.error = SHRT_MAX;
+
+	if (mutex_lock_interruptible(&vnode->io_lock) < 0) {
+		fc->ac.error = -EINTR;
+		fc->flags |= AFS_FS_CURSOR_STOP;
+		return false;
+	}
+
+	if (test_bit(AFS_VNODE_READLOCKED, &vnode->flags) ||
+	    test_bit(AFS_VNODE_WRITELOCKED, &vnode->flags))
+		fc->flags |= AFS_FS_CURSOR_CUR_ONLY;
+	return true;
+}
+
+/*
+ * Begin iteration through a server list, starting with the vnode's last used
+ * server if possible, or the last recorded good server if not.
+ */
+static bool afs_start_fs_iteration(struct afs_fs_cursor *fc,
+				   struct afs_vnode *vnode)
+{
+	struct afs_cb_interest *cbi;
+	int i;
+
+	read_lock(&vnode->volume->servers_lock);
+	fc->server_list = afs_get_serverlist(vnode->volume->servers);
+	read_unlock(&vnode->volume->servers_lock);
+
+	cbi = vnode->cb_interest;
+	if (cbi) {
+		/* See if the vnode's preferred record is still available */
+		for (i = 0; i < fc->server_list->nr_servers; i++) {
+			if (fc->server_list->servers[i].cb_interest == cbi) {
+				fc->start = i;
+				goto found_interest;
+			}
+		}
+
+		/* If we have a lock outstanding on a server that's no longer
+		 * serving this vnode, then we can't switch to another server
+		 * and have to return an error.
+		 */
+		if (fc->flags & AFS_FS_CURSOR_CUR_ONLY) {
+			fc->ac.error = -ESTALE;
+			return false;
+		}
+
+		/* Note that the callback promise is effectively broken */
+		write_seqlock(&vnode->cb_lock);
+		ASSERTCMP(cbi, ==, vnode->cb_interest);
+		vnode->cb_interest = NULL;
+		if (test_and_clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags))
+			vnode->cb_break++;
+		write_sequnlock(&vnode->cb_lock);
+
+		afs_put_cb_interest(afs_v2net(vnode), cbi);
+		cbi = NULL;
+	} else {
+		fc->start = READ_ONCE(fc->server_list->index);
+	}
+
+found_interest:
+	fc->index = fc->start;
+	return true;
+}
+
+/*
+ * Post volume busy note.
+ */
+static void afs_busy(struct afs_volume *volume, u32 abort_code)
+{
+	const char *m;
+
+	switch (abort_code) {
+	case VOFFLINE:		m = "offline";		break;
+	case VRESTARTING:	m = "restarting";	break;
+	case VSALVAGING:	m = "being salvaged";	break;
+	default:		m = "busy";		break;
+	}
+	
+	pr_notice("kAFS: Volume %u '%s' is %s\n", volume->vid, volume->name, m);
+}
+
+/*
+ * Sleep and retry the operation to the same fileserver.
+ */
+static bool afs_sleep_and_retry(struct afs_fs_cursor *fc)
+{
+	msleep_interruptible(1000);
+	if (signal_pending(current)) {
+		fc->ac.error = -ERESTARTSYS;
+		return false;
+	}
+
+	return true;
+}
+
+/*
+ * Select the fileserver to use.  May be called multiple times to rotate
+ * through the fileservers.
+ */
+bool afs_select_fileserver(struct afs_fs_cursor *fc)
+{
+	struct afs_addr_list *alist;
+	struct afs_server *server;
+	struct afs_vnode *vnode = fc->vnode;
+
+	_enter("%u/%u,%u/%u,%d,%d",
+	       fc->index, fc->start,
+	       fc->ac.index, fc->ac.start,
+	       fc->ac.error, fc->ac.abort_code);
+
+	if (fc->flags & AFS_FS_CURSOR_STOP) {
+		_leave(" = f [stopped]");
+		return false;
+	}
+
+	/* Evaluate the result of the previous operation, if there was one. */
+	switch (fc->ac.error) {
+	case SHRT_MAX:
+		goto start;
+
+	case 0:
+	default:
+		/* Success or local failure.  Stop. */
+		fc->flags |= AFS_FS_CURSOR_STOP;
+		_leave(" = f [okay/local %d]", fc->ac.error);
+		return false;
+
+	case -ECONNABORTED:
+		/* The far side rejected the operation on some grounds.  This
+		 * might involve the server being busy or the volume having been moved.
+		 */
+		switch (fc->ac.abort_code) {
+		case VNOVOL:
+			/* This fileserver doesn't know about the volume.
+			 * - May indicate that the VL is wrong - retry once and compare
+			 *   the results.
+			 * - May indicate that the fileserver couldn't attach to the vol.
+			 */
+			if (fc->flags & AFS_FS_CURSOR_VNOVOL) {
+				fc->ac.error = -EREMOTEIO;
+				goto failed;
+			}
+
+			write_lock(&vnode->volume->servers_lock);
+			fc->server_list->vnovol_mask |= 1 << fc->index;
+			write_unlock(&vnode->volume->servers_lock);
+
+			set_bit(AFS_VOLUME_NEEDS_UPDATE, &vnode->volume->flags);
+			fc->ac.error = afs_check_volume_status(vnode->volume, fc->key);
+			if (fc->ac.error < 0)
+				goto failed;
+
+			if (test_bit(AFS_VOLUME_DELETED, &vnode->volume->flags)) {
+				fc->ac.error = -ENOMEDIUM;
+				goto failed;
+			}
+
+			/* If the server list didn't change, then assume that
+			 * it's the fileserver having trouble.
+			 */
+			if (vnode->volume->servers == fc->server_list) {
+				fc->ac.error = -EREMOTEIO;
+				goto failed;
+			}
+
+			/* Try again */
+			fc->flags |= AFS_FS_CURSOR_VNOVOL;
+			_leave(" = t [vnovol]");
+			return true;
+
+		case VSALVAGE: /* TODO: Should this return an error or iterate? */
+		case VVOLEXISTS:
+		case VNOSERVICE:
+		case VONLINE:
+		case VDISKFULL:
+		case VOVERQUOTA:
+			fc->ac.error = afs_abort_to_error(fc->ac.abort_code);
+			goto next_server;
+
+		case VOFFLINE:
+			if (!test_and_set_bit(AFS_VOLUME_OFFLINE, &vnode->volume->flags)) {
+				afs_busy(vnode->volume, fc->ac.abort_code);
+				clear_bit(AFS_VOLUME_BUSY, &vnode->volume->flags);
+			}
+			if (fc->flags & AFS_FS_CURSOR_NO_VSLEEP) {
+				fc->ac.error = -EADV;
+				goto failed;
+			}
+			if (fc->flags & AFS_FS_CURSOR_CUR_ONLY) {
+				fc->ac.error = -ESTALE;
+				goto failed;
+			}
+			goto busy;
+
+		case VSALVAGING:
+		case VRESTARTING:
+		case VBUSY:
+			/* Retry after going round all the servers unless we
+			 * have a file lock we need to maintain.
+			 */
+			if (fc->flags & AFS_FS_CURSOR_NO_VSLEEP) {
+				fc->ac.error = -EBUSY;
+				goto failed;
+			}
+			if (!test_and_set_bit(AFS_VOLUME_BUSY, &vnode->volume->flags)) {
+				afs_busy(vnode->volume, fc->ac.abort_code);
+				clear_bit(AFS_VOLUME_OFFLINE, &vnode->volume->flags);
+			}
+		busy:
+			if (fc->flags & AFS_FS_CURSOR_CUR_ONLY) {
+				if (!afs_sleep_and_retry(fc))
+					goto failed;
+
+				 /* Retry with same server & address */
+				_leave(" = t [vbusy]");
+				return true;
+			}
+
+			fc->flags |= AFS_FS_CURSOR_VBUSY;
+			goto next_server;
+
+		case VMOVED:
+			/* The volume migrated to another server.  We consider
+			 * consider all locks and callbacks broken and request
+			 * an update from the VLDB.
+			 *
+			 * We also limit the number of VMOVED hops we will
+			 * honour, just in case someone sets up a loop.
+			 */
+			if (fc->flags & AFS_FS_CURSOR_VMOVED) {
+				fc->ac.error = -EREMOTEIO;
+				goto failed;
+			}
+			fc->flags |= AFS_FS_CURSOR_VMOVED;
+
+			set_bit(AFS_VOLUME_WAIT, &vnode->volume->flags);
+			set_bit(AFS_VOLUME_NEEDS_UPDATE, &vnode->volume->flags);
+			fc->ac.error = afs_check_volume_status(vnode->volume, fc->key);
+			if (fc->ac.error < 0)
+				goto failed;
+
+			/* If the server list didn't change, then the VLDB is
+			 * out of sync with the fileservers.  This is hopefully
+			 * a temporary condition, however, so we don't want to
+			 * permanently block access to the file.
+			 *
+			 * TODO: Try other fileservers if we can.
+			 *
+			 * TODO: Retry a few times with sleeps.
+			 */
+			if (vnode->volume->servers == fc->server_list) {
+				fc->ac.error = -ENOMEDIUM;
+				goto failed;
+			}
+
+			goto restart_from_beginning;
+
+		default:
+			clear_bit(AFS_VOLUME_OFFLINE, &vnode->volume->flags);
+			clear_bit(AFS_VOLUME_BUSY, &vnode->volume->flags);
+			fc->ac.error = afs_abort_to_error(fc->ac.abort_code);
+			goto failed;
+		}
+
+	case -ENETUNREACH:
+	case -EHOSTUNREACH:
+	case -ECONNREFUSED:
+	case -ETIMEDOUT:
+	case -ETIME:
+		_debug("no conn");
+		goto iterate_address;
+	}
+
+restart_from_beginning:
+	_debug("restart");
+	afs_end_cursor(&fc->ac);
+	afs_put_cb_interest(afs_v2net(vnode), fc->cbi);
+	fc->cbi = NULL;
+	afs_put_serverlist(afs_v2net(vnode), fc->server_list);
+	fc->server_list = NULL;
+start:
+	_debug("start");
+	/* See if we need to do an update of the volume record.  Note that the
+	 * volume may have moved or even have been deleted.
+	 */
+	fc->ac.error = afs_check_volume_status(vnode->volume, fc->key);
+	if (fc->ac.error < 0)
+		goto failed;
+
+	if (!afs_start_fs_iteration(fc, vnode))
+		goto failed;
+	goto use_server;
+
+next_server:
+	_debug("next");
+	afs_put_cb_interest(afs_v2net(vnode), fc->cbi);
+	fc->cbi = NULL;
+	fc->index++;
+	if (fc->index >= fc->server_list->nr_servers)
+		fc->index = 0;
+	if (fc->index != fc->start)
+		goto use_server;
+
+	/* That's all the servers poked to no good effect.  Try again if some
+	 * of them were busy.
+	 */
+	if (fc->flags & AFS_FS_CURSOR_VBUSY)
+		goto restart_from_beginning;
+
+	fc->ac.error = -EDESTADDRREQ;
+	goto failed;
+
+use_server:
+	_debug("use");
+	/* We're starting on a different fileserver from the list.  We need to
+	 * check it, create a callback intercept, find its address list and
+	 * probe its capabilities before we use it.
+	 */
+	ASSERTCMP(fc->ac.alist, ==, NULL);
+	server = fc->server_list->servers[fc->index].server;
+
+	if (!afs_check_server_record(fc, server))
+		goto failed;
+
+	_debug("USING SERVER: %pU", &server->uuid);
+
+	/* Make sure we've got a callback interest record for this server.  We
+	 * have to link it in before we send the request as we can be sent a
+	 * break request before we've finished decoding the reply and
+	 * installing the vnode.
+	 */
+	fc->ac.error = afs_register_server_cb_interest(
+		vnode, &fc->server_list->servers[fc->index]);
+	if (fc->ac.error < 0)
+		goto failed;
+
+	fc->cbi = afs_get_cb_interest(vnode->cb_interest);
+
+	read_lock(&server->fs_lock);
+	alist = rcu_dereference_protected(server->addresses,
+					  lockdep_is_held(&server->fs_lock));
+	afs_get_addrlist(alist);
+	read_unlock(&server->fs_lock);
+
+
+	/* Probe the current fileserver if we haven't done so yet. */
+	if (!test_bit(AFS_SERVER_FL_PROBED, &server->flags)) {
+		fc->ac.alist = afs_get_addrlist(alist);
+
+		if (!afs_probe_fileserver(fc))
+			goto failed;
+	}
+
+	if (!fc->ac.alist)
+		fc->ac.alist = alist;
+	else
+		afs_put_addrlist(alist);
+
+	fc->ac.addr  = NULL;
+	fc->ac.start = READ_ONCE(alist->index);
+	fc->ac.index = fc->ac.start;
+	fc->ac.error = 0;
+	fc->ac.begun = false;
+	goto iterate_address;
+
+iterate_address:
+	ASSERT(fc->ac.alist);
+	_debug("iterate %d/%d", fc->ac.index, fc->ac.alist->nr_addrs);
+	/* Iterate over the current server's address list to try and find an
+	 * address on which it will respond to us.
+	 */
+	if (afs_iterate_addresses(&fc->ac)) {
+		_leave(" = t");
+		return true;
+	}
+
+	afs_end_cursor(&fc->ac);
+	goto next_server;
+
+failed:
+	fc->flags |= AFS_FS_CURSOR_STOP;
+	_leave(" = f [failed %d]", fc->ac.error);
+	return false;
+}
+
+/*
+ * Select the same fileserver we used for a vnode before and only that
+ * fileserver.  We use this when we have a lock on that file, which is backed
+ * only by the fileserver we obtained it from.
+ */
+bool afs_select_current_fileserver(struct afs_fs_cursor *fc)
+{
+	struct afs_vnode *vnode = fc->vnode;
+	struct afs_cb_interest *cbi = vnode->cb_interest;
+	struct afs_addr_list *alist;
+
+	_enter("");
+
+	if (!cbi) {
+		fc->ac.error = -ESTALE;
+		fc->flags |= AFS_FS_CURSOR_STOP;
+		return false;
+	}
+
+	read_lock(&cbi->server->fs_lock);
+	alist = afs_get_addrlist(cbi->server->addresses);
+	read_unlock(&cbi->server->fs_lock);
+	if (!alist) {
+		fc->ac.error = -ESTALE;
+		fc->flags |= AFS_FS_CURSOR_STOP;
+		return false;
+	}
+
+	fc->ac.alist = alist;
+	fc->ac.error = 0;
+	return true;
+}
+
+/*
+ * Tidy up a filesystem cursor and unlock the vnode.
+ */
+int afs_end_vnode_operation(struct afs_fs_cursor *fc)
+{
+	struct afs_net *net = afs_v2net(fc->vnode);
+	int ret;
+
+	mutex_unlock(&fc->vnode->io_lock);
+
+	afs_end_cursor(&fc->ac);
+	afs_put_cb_interest(net, fc->cbi);
+	afs_put_serverlist(net, fc->server_list);
+
+	ret = fc->ac.error;
+	if (ret == -ECONNABORTED)
+		afs_abort_to_error(fc->ac.abort_code);
+
+	return fc->ac.error;
+}
+
+#if 0
+/*
+ * Set a filesystem server cursor for using a specific FS server.
+ */
+int afs_set_fs_cursor(struct afs_fs_cursor *fc, struct afs_vnode *vnode)
+{
+	afs_init_fs_cursor(fc, vnode);
+
+	read_seqlock_excl(&vnode->cb_lock);
+	if (vnode->cb_interest) {
+		if (vnode->cb_interest->server->fs_state == 0)
+			fc->server = afs_get_server(vnode->cb_interest->server);
+		else
+			fc->ac.error = vnode->cb_interest->server->fs_state;
+	} else {
+		fc->ac.error = -ESTALE;
+	}
+	read_sequnlock_excl(&vnode->cb_lock);
+
+	return fc->ac.error;
+}
+
+/*
+ * pick a server to use to try accessing this volume
+ * - returns with an elevated usage count on the server chosen
+ */
+bool afs_volume_pick_fileserver(struct afs_fs_cursor *fc, struct afs_vnode *vnode)
+{
+	struct afs_volume *volume = vnode->volume;
+	struct afs_server *server;
+	int ret, state, loop;
+
+	_enter("%s", volume->vlocation->vldb.name);
+
+	/* stick with the server we're already using if we can */
+	if (vnode->cb_interest && vnode->cb_interest->server->fs_state == 0) {
+		fc->server = afs_get_server(vnode->cb_interest->server);
+		goto set_server;
+	}
+
+	down_read(&volume->server_sem);
+
+	/* handle the no-server case */
+	if (volume->nservers == 0) {
+		fc->ac.error = volume->rjservers ? -ENOMEDIUM : -ESTALE;
+		up_read(&volume->server_sem);
+		_leave(" = f [no servers %d]", fc->ac.error);
+		return false;
+	}
+
+	/* basically, just search the list for the first live server and use
+	 * that */
+	ret = 0;
+	for (loop = 0; loop < volume->nservers; loop++) {
+		server = volume->servers[loop];
+		state = server->fs_state;
+
+		_debug("consider %d [%d]", loop, state);
+
+		switch (state) {
+		case 0:
+			goto picked_server;
+
+		case -ENETUNREACH:
+			if (ret == 0)
+				ret = state;
+			break;
+
+		case -EHOSTUNREACH:
+			if (ret == 0 ||
+			    ret == -ENETUNREACH)
+				ret = state;
+			break;
+
+		case -ECONNREFUSED:
+			if (ret == 0 ||
+			    ret == -ENETUNREACH ||
+			    ret == -EHOSTUNREACH)
+				ret = state;
+			break;
+
+		default:
+		case -EREMOTEIO:
+			if (ret == 0 ||
+			    ret == -ENETUNREACH ||
+			    ret == -EHOSTUNREACH ||
+			    ret == -ECONNREFUSED)
+				ret = state;
+			break;
+		}
+	}
+
+error:
+	fc->ac.error = ret;
+
+	/* no available servers
+	 * - TODO: handle the no active servers case better
+	 */
+	up_read(&volume->server_sem);
+	_leave(" = f [%d]", fc->ac.error);
+	return false;
+
+picked_server:
+	/* Found an apparently healthy server.  We need to register an interest
+	 * in receiving callbacks before we talk to it.
+	 */
+	ret = afs_register_server_cb_interest(vnode,
+					      &volume->cb_interests[loop], server);
+	if (ret < 0)
+		goto error;
+
+	fc->server = afs_get_server(server);
+	up_read(&volume->server_sem);
+set_server:
+	fc->ac.alist = afs_get_addrlist(fc->server->addrs);
+	fc->ac.addr = &fc->ac.alist->addrs[0];
+	_debug("USING SERVER: %pIS\n", &fc->ac.addr->transport);
+	_leave(" = t (picked %pIS)", &fc->ac.addr->transport);
+	return true;
+}
+
+/*
+ * release a server after use
+ * - releases the ref on the server struct that was acquired by picking
+ * - records result of using a particular server to access a volume
+ * - return true to try again, false if okay or to issue error
+ * - the caller must release the server struct if result was false
+ */
+bool afs_iterate_fs_cursor(struct afs_fs_cursor *fc,
+			   struct afs_vnode *vnode)
+{
+	struct afs_volume *volume = vnode->volume;
+	struct afs_server *server = fc->server;
+	unsigned loop;
+
+	_enter("%s,%pIS,%d",
+	       volume->vlocation->vldb.name, &fc->ac.addr->transport,
+	       fc->ac.error);
+
+	switch (fc->ac.error) {
+		/* success */
+	case 0:
+		server->fs_state = 0;
+		_leave(" = f");
+		return false;
+
+		/* the fileserver denied all knowledge of the volume */
+	case -ENOMEDIUM:
+		down_write(&volume->server_sem);
+
+		/* firstly, find where the server is in the active list (if it
+		 * is) */
+		for (loop = 0; loop < volume->nservers; loop++)
+			if (volume->servers[loop] == server)
+				goto present;
+
+		/* no longer there - may have been discarded by another op */
+		goto try_next_server_upw;
+
+	present:
+		volume->nservers--;
+		memmove(&volume->servers[loop],
+			&volume->servers[loop + 1],
+			sizeof(volume->servers[loop]) *
+			(volume->nservers - loop));
+		volume->servers[volume->nservers] = NULL;
+		afs_put_server(afs_v2net(vnode), server);
+		volume->rjservers++;
+
+		if (volume->nservers > 0)
+			/* another server might acknowledge its existence */
+			goto try_next_server_upw;
+
+		/* handle the case where all the fileservers have rejected the
+		 * volume
+		 * - TODO: try asking the fileservers for volume information
+		 * - TODO: contact the VL server again to see if the volume is
+		 *         no longer registered
+		 */
+		up_write(&volume->server_sem);
+		afs_put_server(afs_v2net(vnode), server);
+		fc->server = NULL;
+		_leave(" = f [completely rejected]");
+		return false;
+
+		/* problem reaching the server */
+	case -ENETUNREACH:
+	case -EHOSTUNREACH:
+	case -ECONNREFUSED:
+	case -ETIME:
+	case -ETIMEDOUT:
+	case -EREMOTEIO:
+		/* mark the server as dead
+		 * TODO: vary dead timeout depending on error
+		 */
+		spin_lock(&server->fs_lock);
+		if (!server->fs_state) {
+			server->fs_state = fc->ac.error;
+			printk("kAFS: SERVER DEAD state=%d\n", fc->ac.error);
+		}
+		spin_unlock(&server->fs_lock);
+		goto try_next_server;
+
+		/* miscellaneous error */
+	default:
+	case -ENOMEM:
+	case -ENONET:
+		/* tell the caller to accept the result */
+		afs_put_server(afs_v2net(vnode), server);
+		fc->server = NULL;
+		_leave(" = f [local failure]");
+		return false;
+	}
+
+	/* tell the caller to loop around and try the next server */
+try_next_server_upw:
+	up_write(&volume->server_sem);
+try_next_server:
+	afs_put_server(afs_v2net(vnode), server);
+	_leave(" = t [try next server]");
+	return true;
+}
+
+/*
+ * Clean up a fileserver cursor.
+ */
+int afs_end_fs_cursor(struct afs_fs_cursor *fc, struct afs_net *net)
+{
+	afs_end_cursor(&fc->ac);
+	afs_put_server(net, fc->server);
+	return fc->ac.error;
+}
+
+#endif
diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
index 0bf191f0dbaf..ea1460b9b71a 100644
--- a/fs/afs/rxrpc.c
+++ b/fs/afs/rxrpc.c
@@ -17,13 +17,10 @@
 #include "internal.h"
 #include "afs_cm.h"
 
-struct socket *afs_socket; /* my RxRPC socket */
-static struct workqueue_struct *afs_async_calls;
-static struct afs_call *afs_spare_incoming_call;
-atomic_t afs_outstanding_calls;
+struct workqueue_struct *afs_async_calls;
 
 static void afs_wake_up_call_waiter(struct sock *, struct rxrpc_call *, unsigned long);
-static int afs_wait_for_call_to_complete(struct afs_call *);
+static long afs_wait_for_call_to_complete(struct afs_call *, struct afs_addr_cursor *);
 static void afs_wake_up_async_call(struct sock *, struct rxrpc_call *, unsigned long);
 static void afs_process_async_call(struct work_struct *);
 static void afs_rx_new_call(struct sock *, struct rxrpc_call *, unsigned long);
@@ -34,24 +31,13 @@ static int afs_deliver_cm_op_id(struct afs_call *);
 static const struct afs_call_type afs_RXCMxxxx = {
 	.name		= "CB.xxxx",
 	.deliver	= afs_deliver_cm_op_id,
-	.abort_to_error	= afs_abort_to_error,
 };
 
-static void afs_charge_preallocation(struct work_struct *);
-
-static DECLARE_WORK(afs_charge_preallocation_work, afs_charge_preallocation);
-
-static int afs_wait_atomic_t(atomic_t *p)
-{
-	schedule();
-	return 0;
-}
-
 /*
  * open an RxRPC socket and bind it to be a server for callback notifications
  * - the socket is left in blocking mode and non-blocking ops use MSG_DONTWAIT
  */
-int afs_open_socket(void)
+int afs_open_socket(struct afs_net *net)
 {
 	struct sockaddr_rxrpc srx;
 	struct socket *socket;
@@ -59,28 +45,26 @@ int afs_open_socket(void)
 
 	_enter("");
 
-	ret = -ENOMEM;
-	afs_async_calls = alloc_workqueue("kafsd", WQ_MEM_RECLAIM, 0);
-	if (!afs_async_calls)
-		goto error_0;
-
-	ret = sock_create_kern(&init_net, AF_RXRPC, SOCK_DGRAM, PF_INET, &socket);
+	ret = sock_create_kern(&init_net, AF_RXRPC, SOCK_DGRAM, PF_INET6, &socket);
 	if (ret < 0)
 		goto error_1;
 
 	socket->sk->sk_allocation = GFP_NOFS;
 
 	/* bind the callback manager's address to make this a server socket */
+	memset(&srx, 0, sizeof(srx));
 	srx.srx_family			= AF_RXRPC;
 	srx.srx_service			= CM_SERVICE;
 	srx.transport_type		= SOCK_DGRAM;
-	srx.transport_len		= sizeof(srx.transport.sin);
-	srx.transport.sin.sin_family	= AF_INET;
-	srx.transport.sin.sin_port	= htons(AFS_CM_PORT);
-	memset(&srx.transport.sin.sin_addr, 0,
-	       sizeof(srx.transport.sin.sin_addr));
+	srx.transport_len		= sizeof(srx.transport.sin6);
+	srx.transport.sin6.sin6_family	= AF_INET6;
+	srx.transport.sin6.sin6_port	= htons(AFS_CM_PORT);
 
 	ret = kernel_bind(socket, (struct sockaddr *) &srx, sizeof(srx));
+	if (ret == -EADDRINUSE) {
+		srx.transport.sin6.sin6_port = 0;
+		ret = kernel_bind(socket, (struct sockaddr *) &srx, sizeof(srx));
+	}
 	if (ret < 0)
 		goto error_2;
 
@@ -91,16 +75,14 @@ int afs_open_socket(void)
 	if (ret < 0)
 		goto error_2;
 
-	afs_socket = socket;
-	afs_charge_preallocation(NULL);
+	net->socket = socket;
+	afs_charge_preallocation(&net->charge_preallocation_work);
 	_leave(" = 0");
 	return 0;
 
 error_2:
 	sock_release(socket);
 error_1:
-	destroy_workqueue(afs_async_calls);
-error_0:
 	_leave(" = %d", ret);
 	return ret;
 }
@@ -108,36 +90,36 @@ error_0:
 /*
  * close the RxRPC socket AFS was using
  */
-void afs_close_socket(void)
+void afs_close_socket(struct afs_net *net)
 {
 	_enter("");
 
-	kernel_listen(afs_socket, 0);
+	kernel_listen(net->socket, 0);
 	flush_workqueue(afs_async_calls);
 
-	if (afs_spare_incoming_call) {
-		afs_put_call(afs_spare_incoming_call);
-		afs_spare_incoming_call = NULL;
+	if (net->spare_incoming_call) {
+		afs_put_call(net->spare_incoming_call);
+		net->spare_incoming_call = NULL;
 	}
 
-	_debug("outstanding %u", atomic_read(&afs_outstanding_calls));
-	wait_on_atomic_t(&afs_outstanding_calls, afs_wait_atomic_t,
+	_debug("outstanding %u", atomic_read(&net->nr_outstanding_calls));
+	wait_on_atomic_t(&net->nr_outstanding_calls, atomic_t_wait,
 			 TASK_UNINTERRUPTIBLE);
 	_debug("no outstanding calls");
 
-	kernel_sock_shutdown(afs_socket, SHUT_RDWR);
+	kernel_sock_shutdown(net->socket, SHUT_RDWR);
 	flush_workqueue(afs_async_calls);
-	sock_release(afs_socket);
+	sock_release(net->socket);
 
 	_debug("dework");
-	destroy_workqueue(afs_async_calls);
 	_leave("");
 }
 
 /*
  * Allocate a call.
  */
-static struct afs_call *afs_alloc_call(const struct afs_call_type *type,
+static struct afs_call *afs_alloc_call(struct afs_net *net,
+				       const struct afs_call_type *type,
 				       gfp_t gfp)
 {
 	struct afs_call *call;
@@ -148,11 +130,13 @@ static struct afs_call *afs_alloc_call(const struct afs_call_type *type,
 		return NULL;
 
 	call->type = type;
+	call->net = net;
 	atomic_set(&call->usage, 1);
 	INIT_WORK(&call->async_work, afs_process_async_call);
 	init_waitqueue_head(&call->waitq);
+	spin_lock_init(&call->state_lock);
 
-	o = atomic_inc_return(&afs_outstanding_calls);
+	o = atomic_inc_return(&net->nr_outstanding_calls);
 	trace_afs_call(call, afs_call_trace_alloc, 1, o,
 		       __builtin_return_address(0));
 	return call;
@@ -163,8 +147,9 @@ static struct afs_call *afs_alloc_call(const struct afs_call_type *type,
  */
 void afs_put_call(struct afs_call *call)
 {
+	struct afs_net *net = call->net;
 	int n = atomic_dec_return(&call->usage);
-	int o = atomic_read(&afs_outstanding_calls);
+	int o = atomic_read(&net->nr_outstanding_calls);
 
 	trace_afs_call(call, afs_call_trace_put, n + 1, o,
 		       __builtin_return_address(0));
@@ -175,20 +160,22 @@ void afs_put_call(struct afs_call *call)
 		ASSERT(call->type->name != NULL);
 
 		if (call->rxcall) {
-			rxrpc_kernel_end_call(afs_socket, call->rxcall);
+			rxrpc_kernel_end_call(net->socket, call->rxcall);
 			call->rxcall = NULL;
 		}
 		if (call->type->destructor)
 			call->type->destructor(call);
 
+		afs_put_server(call->net, call->cm_server);
+		afs_put_cb_interest(call->net, call->cbi);
 		kfree(call->request);
 		kfree(call);
 
-		o = atomic_dec_return(&afs_outstanding_calls);
+		o = atomic_dec_return(&net->nr_outstanding_calls);
 		trace_afs_call(call, afs_call_trace_free, 0, o,
 			       __builtin_return_address(0));
 		if (o == 0)
-			wake_up_atomic_t(&afs_outstanding_calls);
+			wake_up_atomic_t(&net->nr_outstanding_calls);
 	}
 }
 
@@ -200,7 +187,7 @@ int afs_queue_call_work(struct afs_call *call)
 	int u = atomic_inc_return(&call->usage);
 
 	trace_afs_call(call, afs_call_trace_work, u,
-		       atomic_read(&afs_outstanding_calls),
+		       atomic_read(&call->net->nr_outstanding_calls),
 		       __builtin_return_address(0));
 
 	INIT_WORK(&call->work, call->type->work);
@@ -213,12 +200,13 @@ int afs_queue_call_work(struct afs_call *call)
 /*
  * allocate a call with flat request and reply buffers
  */
-struct afs_call *afs_alloc_flat_call(const struct afs_call_type *type,
+struct afs_call *afs_alloc_flat_call(struct afs_net *net,
+				     const struct afs_call_type *type,
 				     size_t request_size, size_t reply_max)
 {
 	struct afs_call *call;
 
-	call = afs_alloc_call(type, GFP_NOFS);
+	call = afs_alloc_call(net, type, GFP_NOFS);
 	if (!call)
 		goto nomem_call;
 
@@ -236,6 +224,7 @@ struct afs_call *afs_alloc_flat_call(const struct afs_call_type *type,
 			goto nomem_free;
 	}
 
+	call->operation_ID = type->op;
 	init_waitqueue_head(&call->waitq);
 	return call;
 
@@ -300,8 +289,7 @@ static void afs_notify_end_request_tx(struct sock *sock,
 {
 	struct afs_call *call = (struct afs_call *)call_user_ID;
 
-	if (call->state == AFS_CALL_REQUESTING)
-		call->state = AFS_CALL_AWAIT_REPLY;
+	afs_set_call_state(call, AFS_CALL_CL_REQUESTING, AFS_CALL_CL_AWAIT_REPLY);
 }
 
 /*
@@ -319,11 +307,13 @@ static int afs_send_pages(struct afs_call *call, struct msghdr *msg)
 
 	do {
 		afs_load_bvec(call, msg, bv, first, last, offset);
+		trace_afs_send_pages(call, msg, first, last, offset);
+
 		offset = 0;
 		bytes = msg->msg_iter.count;
 		nr = msg->msg_iter.nr_segs;
 
-		ret = rxrpc_kernel_send_data(afs_socket, call->rxcall, msg,
+		ret = rxrpc_kernel_send_data(call->net->socket, call->rxcall, msg,
 					     bytes, afs_notify_end_request_tx);
 		for (loop = 0; loop < nr; loop++)
 			put_page(bv[loop].bv_page);
@@ -333,62 +323,62 @@ static int afs_send_pages(struct afs_call *call, struct msghdr *msg)
 		first += nr;
 	} while (first <= last);
 
+	trace_afs_sent_pages(call, call->first, last, first, ret);
 	return ret;
 }
 
 /*
  * initiate a call
  */
-int afs_make_call(struct in_addr *addr, struct afs_call *call, gfp_t gfp,
-		  bool async)
+long afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call,
+		   gfp_t gfp, bool async)
 {
-	struct sockaddr_rxrpc srx;
+	struct sockaddr_rxrpc *srx = ac->addr;
 	struct rxrpc_call *rxcall;
 	struct msghdr msg;
 	struct kvec iov[1];
 	size_t offset;
 	s64 tx_total_len;
-	u32 abort_code;
 	int ret;
 
-	_enter("%x,{%d},", addr->s_addr, ntohs(call->port));
+	_enter(",{%pISp},", &srx->transport);
 
 	ASSERT(call->type != NULL);
 	ASSERT(call->type->name != NULL);
 
 	_debug("____MAKE %p{%s,%x} [%d]____",
 	       call, call->type->name, key_serial(call->key),
-	       atomic_read(&afs_outstanding_calls));
+	       atomic_read(&call->net->nr_outstanding_calls));
 
 	call->async = async;
 
-	memset(&srx, 0, sizeof(srx));
-	srx.srx_family = AF_RXRPC;
-	srx.srx_service = call->service_id;
-	srx.transport_type = SOCK_DGRAM;
-	srx.transport_len = sizeof(srx.transport.sin);
-	srx.transport.sin.sin_family = AF_INET;
-	srx.transport.sin.sin_port = call->port;
-	memcpy(&srx.transport.sin.sin_addr, addr, 4);
-
 	/* Work out the length we're going to transmit.  This is awkward for
 	 * calls such as FS.StoreData where there's an extra injection of data
 	 * after the initial fixed part.
 	 */
 	tx_total_len = call->request_size;
 	if (call->send_pages) {
-		tx_total_len += call->last_to - call->first_offset;
-		tx_total_len += (call->last - call->first) * PAGE_SIZE;
+		if (call->last == call->first) {
+			tx_total_len += call->last_to - call->first_offset;
+		} else {
+			/* It looks mathematically like you should be able to
+			 * combine the following lines with the ones above, but
+			 * unsigned arithmetic is fun when it wraps...
+			 */
+			tx_total_len += PAGE_SIZE - call->first_offset;
+			tx_total_len += call->last_to;
+			tx_total_len += (call->last - call->first - 1) * PAGE_SIZE;
+		}
 	}
 
 	/* create a call */
-	rxcall = rxrpc_kernel_begin_call(afs_socket, &srx, call->key,
+	rxcall = rxrpc_kernel_begin_call(call->net->socket, srx, call->key,
 					 (unsigned long)call,
 					 tx_total_len, gfp,
 					 (async ?
 					  afs_wake_up_async_call :
-					  afs_wake_up_call_waiter));
-	call->key = NULL;
+					  afs_wake_up_call_waiter),
+					 call->upgrade);
 	if (IS_ERR(rxcall)) {
 		ret = PTR_ERR(rxcall);
 		goto error_kill_call;
@@ -406,16 +396,9 @@ int afs_make_call(struct in_addr *addr, struct afs_call *call, gfp_t gfp,
 		      call->request_size);
 	msg.msg_control		= NULL;
 	msg.msg_controllen	= 0;
-	msg.msg_flags		= (call->send_pages ? MSG_MORE : 0);
+	msg.msg_flags		= MSG_WAITALL | (call->send_pages ? MSG_MORE : 0);
 
-	/* We have to change the state *before* sending the last packet as
-	 * rxrpc might give us the reply before it returns from sending the
-	 * request.  Further, if the send fails, we may already have been given
-	 * a notification and may have collected it.
-	 */
-	if (!call->send_pages)
-		call->state = AFS_CALL_AWAIT_REPLY;
-	ret = rxrpc_kernel_send_data(afs_socket, rxcall,
+	ret = rxrpc_kernel_send_data(call->net->socket, rxcall,
 				     &msg, call->request_size,
 				     afs_notify_end_request_tx);
 	if (ret < 0)
@@ -432,22 +415,26 @@ int afs_make_call(struct in_addr *addr, struct afs_call *call, gfp_t gfp,
 	if (call->async)
 		return -EINPROGRESS;
 
-	return afs_wait_for_call_to_complete(call);
+	return afs_wait_for_call_to_complete(call, ac);
 
 error_do_abort:
 	call->state = AFS_CALL_COMPLETE;
 	if (ret != -ECONNABORTED) {
-		rxrpc_kernel_abort_call(afs_socket, rxcall, RX_USER_ABORT,
-					ret, "KSD");
+		rxrpc_kernel_abort_call(call->net->socket, rxcall,
+					RX_USER_ABORT, ret, "KSD");
 	} else {
-		abort_code = 0;
 		offset = 0;
-		rxrpc_kernel_recv_data(afs_socket, rxcall, NULL, 0, &offset,
-				       false, &abort_code);
-		ret = call->type->abort_to_error(abort_code);
+		rxrpc_kernel_recv_data(call->net->socket, rxcall, NULL,
+				       0, &offset, false, &call->abort_code,
+				       &call->service_id);
+		ac->abort_code = call->abort_code;
+		ac->responded = true;
 	}
+	call->error = ret;
+	trace_afs_call_done(call);
 error_kill_call:
 	afs_put_call(call);
+	ac->error = ret;
 	_leave(" = %d", ret);
 	return ret;
 }
@@ -457,123 +444,174 @@ error_kill_call:
  */
 static void afs_deliver_to_call(struct afs_call *call)
 {
-	u32 abort_code;
+	enum afs_call_state state;
+	u32 abort_code, remote_abort = 0;
 	int ret;
 
 	_enter("%s", call->type->name);
 
-	while (call->state == AFS_CALL_AWAIT_REPLY ||
-	       call->state == AFS_CALL_AWAIT_OP_ID ||
-	       call->state == AFS_CALL_AWAIT_REQUEST ||
-	       call->state == AFS_CALL_AWAIT_ACK
+	while (state = READ_ONCE(call->state),
+	       state == AFS_CALL_CL_AWAIT_REPLY ||
+	       state == AFS_CALL_SV_AWAIT_OP_ID ||
+	       state == AFS_CALL_SV_AWAIT_REQUEST ||
+	       state == AFS_CALL_SV_AWAIT_ACK
 	       ) {
-		if (call->state == AFS_CALL_AWAIT_ACK) {
+		if (state == AFS_CALL_SV_AWAIT_ACK) {
 			size_t offset = 0;
-			ret = rxrpc_kernel_recv_data(afs_socket, call->rxcall,
+			ret = rxrpc_kernel_recv_data(call->net->socket,
+						     call->rxcall,
 						     NULL, 0, &offset, false,
-						     &call->abort_code);
+						     &remote_abort,
+						     &call->service_id);
 			trace_afs_recv_data(call, 0, offset, false, ret);
 
 			if (ret == -EINPROGRESS || ret == -EAGAIN)
 				return;
-			if (ret == 1 || ret < 0) {
-				call->state = AFS_CALL_COMPLETE;
-				goto done;
+			if (ret < 0 || ret == 1) {
+				if (ret == 1)
+					ret = 0;
+				goto call_complete;
 			}
 			return;
 		}
 
 		ret = call->type->deliver(call);
+		state = READ_ONCE(call->state);
 		switch (ret) {
 		case 0:
-			if (call->state == AFS_CALL_AWAIT_REPLY)
-				call->state = AFS_CALL_COMPLETE;
+			if (state == AFS_CALL_CL_PROC_REPLY)
+				goto call_complete;
+			ASSERTCMP(state, >, AFS_CALL_CL_PROC_REPLY);
 			goto done;
 		case -EINPROGRESS:
 		case -EAGAIN:
 			goto out;
+		case -EIO:
 		case -ECONNABORTED:
-			goto call_complete;
+			ASSERTCMP(state, ==, AFS_CALL_COMPLETE);
+			goto done;
 		case -ENOTCONN:
 			abort_code = RX_CALL_DEAD;
-			rxrpc_kernel_abort_call(afs_socket, call->rxcall,
+			rxrpc_kernel_abort_call(call->net->socket, call->rxcall,
 						abort_code, ret, "KNC");
-			goto save_error;
+			goto local_abort;
 		case -ENOTSUPP:
 			abort_code = RXGEN_OPCODE;
-			rxrpc_kernel_abort_call(afs_socket, call->rxcall,
+			rxrpc_kernel_abort_call(call->net->socket, call->rxcall,
 						abort_code, ret, "KIV");
-			goto save_error;
+			goto local_abort;
 		case -ENODATA:
 		case -EBADMSG:
 		case -EMSGSIZE:
 		default:
 			abort_code = RXGEN_CC_UNMARSHAL;
-			if (call->state != AFS_CALL_AWAIT_REPLY)
+			if (state != AFS_CALL_CL_AWAIT_REPLY)
 				abort_code = RXGEN_SS_UNMARSHAL;
-			rxrpc_kernel_abort_call(afs_socket, call->rxcall,
+			rxrpc_kernel_abort_call(call->net->socket, call->rxcall,
 						abort_code, -EBADMSG, "KUM");
-			goto save_error;
+			goto local_abort;
 		}
 	}
 
 done:
-	if (call->state == AFS_CALL_COMPLETE && call->incoming)
+	if (state == AFS_CALL_COMPLETE && call->incoming)
 		afs_put_call(call);
 out:
 	_leave("");
 	return;
 
-save_error:
-	call->error = ret;
+local_abort:
+	abort_code = 0;
 call_complete:
-	call->state = AFS_CALL_COMPLETE;
+	afs_set_call_complete(call, ret, remote_abort);
+	state = AFS_CALL_COMPLETE;
 	goto done;
 }
 
 /*
  * wait synchronously for a call to complete
  */
-static int afs_wait_for_call_to_complete(struct afs_call *call)
+static long afs_wait_for_call_to_complete(struct afs_call *call,
+					  struct afs_addr_cursor *ac)
 {
-	int ret;
+	signed long rtt2, timeout;
+	long ret;
+	u64 rtt;
+	u32 life, last_life;
 
 	DECLARE_WAITQUEUE(myself, current);
 
 	_enter("");
 
+	rtt = rxrpc_kernel_get_rtt(call->net->socket, call->rxcall);
+	rtt2 = nsecs_to_jiffies64(rtt) * 2;
+	if (rtt2 < 2)
+		rtt2 = 2;
+
+	timeout = rtt2;
+	last_life = rxrpc_kernel_check_life(call->net->socket, call->rxcall);
+
 	add_wait_queue(&call->waitq, &myself);
 	for (;;) {
-		set_current_state(TASK_INTERRUPTIBLE);
+		set_current_state(TASK_UNINTERRUPTIBLE);
 
 		/* deliver any messages that are in the queue */
-		if (call->state < AFS_CALL_COMPLETE && call->need_attention) {
+		if (!afs_check_call_state(call, AFS_CALL_COMPLETE) &&
+		    call->need_attention) {
 			call->need_attention = false;
 			__set_current_state(TASK_RUNNING);
 			afs_deliver_to_call(call);
 			continue;
 		}
 
-		if (call->state == AFS_CALL_COMPLETE ||
-		    signal_pending(current))
+		if (afs_check_call_state(call, AFS_CALL_COMPLETE))
 			break;
-		schedule();
+
+		life = rxrpc_kernel_check_life(call->net->socket, call->rxcall);
+		if (timeout == 0 &&
+		    life == last_life && signal_pending(current))
+				break;
+
+		if (life != last_life) {
+			timeout = rtt2;
+			last_life = life;
+		}
+
+		timeout = schedule_timeout(timeout);
 	}
 
 	remove_wait_queue(&call->waitq, &myself);
 	__set_current_state(TASK_RUNNING);
 
 	/* Kill off the call if it's still live. */
-	if (call->state < AFS_CALL_COMPLETE) {
+	if (!afs_check_call_state(call, AFS_CALL_COMPLETE)) {
 		_debug("call interrupted");
-		rxrpc_kernel_abort_call(afs_socket, call->rxcall,
-					RX_USER_ABORT, -EINTR, "KWI");
+		if (rxrpc_kernel_abort_call(call->net->socket, call->rxcall,
+					    RX_USER_ABORT, -EINTR, "KWI"))
+			afs_set_call_complete(call, -EINTR, 0);
+	}
+
+	spin_lock_bh(&call->state_lock);
+	ac->abort_code = call->abort_code;
+	ac->error = call->error;
+	spin_unlock_bh(&call->state_lock);
+
+	ret = ac->error;
+	switch (ret) {
+	case 0:
+		if (call->ret_reply0) {
+			ret = (long)call->reply[0];
+			call->reply[0] = NULL;
+		}
+		/* Fall through */
+	case -ECONNABORTED:
+		ac->responded = true;
+		break;
 	}
 
-	ret = call->error;
 	_debug("call complete");
 	afs_put_call(call);
-	_leave(" = %d", ret);
+	_leave(" = %p", (void *)ret);
 	return ret;
 }
 
@@ -604,7 +642,7 @@ static void afs_wake_up_async_call(struct sock *sk, struct rxrpc_call *rxcall,
 	u = __atomic_add_unless(&call->usage, 1, 0);
 	if (u != 0) {
 		trace_afs_call(call, afs_call_trace_wake, u,
-			       atomic_read(&afs_outstanding_calls),
+			       atomic_read(&call->net->nr_outstanding_calls),
 			       __builtin_return_address(0));
 
 		if (!queue_work(afs_async_calls, &call->async_work))
@@ -643,7 +681,7 @@ static void afs_process_async_call(struct work_struct *work)
 	}
 
 	if (call->state == AFS_CALL_COMPLETE) {
-		call->reply = NULL;
+		call->reply[0] = NULL;
 
 		/* We have two refs to release - one from the alloc and one
 		 * queued with the work item - and we can't just deallocate the
@@ -668,22 +706,24 @@ static void afs_rx_attach(struct rxrpc_call *rxcall, unsigned long user_call_ID)
 /*
  * Charge the incoming call preallocation.
  */
-static void afs_charge_preallocation(struct work_struct *work)
+void afs_charge_preallocation(struct work_struct *work)
 {
-	struct afs_call *call = afs_spare_incoming_call;
+	struct afs_net *net =
+		container_of(work, struct afs_net, charge_preallocation_work);
+	struct afs_call *call = net->spare_incoming_call;
 
 	for (;;) {
 		if (!call) {
-			call = afs_alloc_call(&afs_RXCMxxxx, GFP_KERNEL);
+			call = afs_alloc_call(net, &afs_RXCMxxxx, GFP_KERNEL);
 			if (!call)
 				break;
 
 			call->async = true;
-			call->state = AFS_CALL_AWAIT_OP_ID;
+			call->state = AFS_CALL_SV_AWAIT_OP_ID;
 			init_waitqueue_head(&call->waitq);
 		}
 
-		if (rxrpc_kernel_charge_accept(afs_socket,
+		if (rxrpc_kernel_charge_accept(net->socket,
 					       afs_wake_up_async_call,
 					       afs_rx_attach,
 					       (unsigned long)call,
@@ -691,7 +731,7 @@ static void afs_charge_preallocation(struct work_struct *work)
 			break;
 		call = NULL;
 	}
-	afs_spare_incoming_call = call;
+	net->spare_incoming_call = call;
 }
 
 /*
@@ -712,7 +752,9 @@ static void afs_rx_discard_new_call(struct rxrpc_call *rxcall,
 static void afs_rx_new_call(struct sock *sk, struct rxrpc_call *rxcall,
 			    unsigned long user_call_ID)
 {
-	queue_work(afs_wq, &afs_charge_preallocation_work);
+	struct afs_net *net = afs_sock2net(sk);
+
+	queue_work(afs_wq, &net->charge_preallocation_work);
 }
 
 /*
@@ -733,7 +775,7 @@ static int afs_deliver_cm_op_id(struct afs_call *call)
 		return ret;
 
 	call->operation_ID = ntohl(call->tmp);
-	call->state = AFS_CALL_AWAIT_REQUEST;
+	afs_set_call_state(call, AFS_CALL_SV_AWAIT_OP_ID, AFS_CALL_SV_AWAIT_REQUEST);
 	call->offset = 0;
 
 	/* ask the cache manager to route the call (it'll change the call type
@@ -758,8 +800,7 @@ static void afs_notify_end_reply_tx(struct sock *sock,
 {
 	struct afs_call *call = (struct afs_call *)call_user_ID;
 
-	if (call->state == AFS_CALL_REPLYING)
-		call->state = AFS_CALL_AWAIT_ACK;
+	afs_set_call_state(call, AFS_CALL_SV_REPLYING, AFS_CALL_SV_AWAIT_ACK);
 }
 
 /*
@@ -767,11 +808,12 @@ static void afs_notify_end_reply_tx(struct sock *sock,
  */
 void afs_send_empty_reply(struct afs_call *call)
 {
+	struct afs_net *net = call->net;
 	struct msghdr msg;
 
 	_enter("");
 
-	rxrpc_kernel_set_tx_length(afs_socket, call->rxcall, 0);
+	rxrpc_kernel_set_tx_length(net->socket, call->rxcall, 0);
 
 	msg.msg_name		= NULL;
 	msg.msg_namelen		= 0;
@@ -780,8 +822,7 @@ void afs_send_empty_reply(struct afs_call *call)
 	msg.msg_controllen	= 0;
 	msg.msg_flags		= 0;
 
-	call->state = AFS_CALL_AWAIT_ACK;
-	switch (rxrpc_kernel_send_data(afs_socket, call->rxcall, &msg, 0,
+	switch (rxrpc_kernel_send_data(net->socket, call->rxcall, &msg, 0,
 				       afs_notify_end_reply_tx)) {
 	case 0:
 		_leave(" [replied]");
@@ -789,7 +830,7 @@ void afs_send_empty_reply(struct afs_call *call)
 
 	case -ENOMEM:
 		_debug("oom");
-		rxrpc_kernel_abort_call(afs_socket, call->rxcall,
+		rxrpc_kernel_abort_call(net->socket, call->rxcall,
 					RX_USER_ABORT, -ENOMEM, "KOO");
 	default:
 		_leave(" [error]");
@@ -802,13 +843,14 @@ void afs_send_empty_reply(struct afs_call *call)
  */
 void afs_send_simple_reply(struct afs_call *call, const void *buf, size_t len)
 {
+	struct afs_net *net = call->net;
 	struct msghdr msg;
 	struct kvec iov[1];
 	int n;
 
 	_enter("");
 
-	rxrpc_kernel_set_tx_length(afs_socket, call->rxcall, len);
+	rxrpc_kernel_set_tx_length(net->socket, call->rxcall, len);
 
 	iov[0].iov_base		= (void *) buf;
 	iov[0].iov_len		= len;
@@ -819,8 +861,7 @@ void afs_send_simple_reply(struct afs_call *call, const void *buf, size_t len)
 	msg.msg_controllen	= 0;
 	msg.msg_flags		= 0;
 
-	call->state = AFS_CALL_AWAIT_ACK;
-	n = rxrpc_kernel_send_data(afs_socket, call->rxcall, &msg, len,
+	n = rxrpc_kernel_send_data(net->socket, call->rxcall, &msg, len,
 				   afs_notify_end_reply_tx);
 	if (n >= 0) {
 		/* Success */
@@ -830,7 +871,7 @@ void afs_send_simple_reply(struct afs_call *call, const void *buf, size_t len)
 
 	if (n == -ENOMEM) {
 		_debug("oom");
-		rxrpc_kernel_abort_call(afs_socket, call->rxcall,
+		rxrpc_kernel_abort_call(net->socket, call->rxcall,
 					RX_USER_ABORT, -ENOMEM, "KOO");
 	}
 	_leave(" [error]");
@@ -842,6 +883,9 @@ void afs_send_simple_reply(struct afs_call *call, const void *buf, size_t len)
 int afs_extract_data(struct afs_call *call, void *buf, size_t count,
 		     bool want_more)
 {
+	struct afs_net *net = call->net;
+	enum afs_call_state state;
+	u32 remote_abort;
 	int ret;
 
 	_enter("{%s,%zu},,%zu,%d",
@@ -849,31 +893,32 @@ int afs_extract_data(struct afs_call *call, void *buf, size_t count,
 
 	ASSERTCMP(call->offset, <=, count);
 
-	ret = rxrpc_kernel_recv_data(afs_socket, call->rxcall,
+	ret = rxrpc_kernel_recv_data(net->socket, call->rxcall,
 				     buf, count, &call->offset,
-				     want_more, &call->abort_code);
+				     want_more, &remote_abort,
+				     &call->service_id);
 	trace_afs_recv_data(call, count, call->offset, want_more, ret);
 	if (ret == 0 || ret == -EAGAIN)
 		return ret;
 
+	state = READ_ONCE(call->state);
 	if (ret == 1) {
-		switch (call->state) {
-		case AFS_CALL_AWAIT_REPLY:
-			call->state = AFS_CALL_COMPLETE;
+		switch (state) {
+		case AFS_CALL_CL_AWAIT_REPLY:
+			afs_set_call_state(call, state, AFS_CALL_CL_PROC_REPLY);
 			break;
-		case AFS_CALL_AWAIT_REQUEST:
-			call->state = AFS_CALL_REPLYING;
+		case AFS_CALL_SV_AWAIT_REQUEST:
+			afs_set_call_state(call, state, AFS_CALL_SV_REPLYING);
 			break;
+		case AFS_CALL_COMPLETE:
+			kdebug("prem complete %d", call->error);
+			return -EIO;
 		default:
 			break;
 		}
 		return 0;
 	}
 
-	if (ret == -ECONNABORTED)
-		call->error = call->type->abort_to_error(call->abort_code);
-	else
-		call->error = ret;
-	call->state = AFS_CALL_COMPLETE;
+	afs_set_call_complete(call, ret, remote_abort);
 	return ret;
 }
diff --git a/fs/afs/security.c b/fs/afs/security.c
index faca66227ecf..46a881a4d08f 100644
--- a/fs/afs/security.c
+++ b/fs/afs/security.c
@@ -1,6 +1,6 @@
 /* AFS security handling
  *
- * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Copyright (C) 2007, 2017 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
  *
  * This program is free software; you can redistribute it and/or
@@ -14,9 +14,13 @@
 #include <linux/fs.h>
 #include <linux/ctype.h>
 #include <linux/sched.h>
+#include <linux/hashtable.h>
 #include <keys/rxrpc-type.h>
 #include "internal.h"
 
+static DEFINE_HASHTABLE(afs_permits_cache, 10);
+static DEFINE_SPINLOCK(afs_permits_lock);
+
 /*
  * get a key
  */
@@ -46,167 +50,233 @@ struct key *afs_request_key(struct afs_cell *cell)
 }
 
 /*
- * dispose of a permits list
+ * Dispose of a list of permits.
  */
-void afs_zap_permits(struct rcu_head *rcu)
+static void afs_permits_rcu(struct rcu_head *rcu)
 {
 	struct afs_permits *permits =
 		container_of(rcu, struct afs_permits, rcu);
-	int loop;
-
-	_enter("{%d}", permits->count);
+	int i;
 
-	for (loop = permits->count - 1; loop >= 0; loop--)
-		key_put(permits->permits[loop].key);
+	for (i = 0; i < permits->nr_permits; i++)
+		key_put(permits->permits[i].key);
 	kfree(permits);
 }
 
 /*
- * dispose of a permits list in which all the key pointers have been copied
+ * Discard a permission cache.
  */
-static void afs_dispose_of_permits(struct rcu_head *rcu)
+void afs_put_permits(struct afs_permits *permits)
 {
-	struct afs_permits *permits =
-		container_of(rcu, struct afs_permits, rcu);
-
-	_enter("{%d}", permits->count);
-
-	kfree(permits);
+	if (permits && refcount_dec_and_test(&permits->usage)) {
+		spin_lock(&afs_permits_lock);
+		hash_del_rcu(&permits->hash_node);
+		spin_unlock(&afs_permits_lock);
+		call_rcu(&permits->rcu, afs_permits_rcu);
+	}
 }
 
 /*
- * get the authorising vnode - this is the specified inode itself if it's a
- * directory or it's the parent directory if the specified inode is a file or
- * symlink
- * - the caller must release the ref on the inode
+ * Clear a permit cache on callback break.
  */
-static struct afs_vnode *afs_get_auth_inode(struct afs_vnode *vnode,
-					    struct key *key)
+void afs_clear_permits(struct afs_vnode *vnode)
 {
-	struct afs_vnode *auth_vnode;
-	struct inode *auth_inode;
+	struct afs_permits *permits;
 
-	_enter("");
+	spin_lock(&vnode->lock);
+	permits = rcu_dereference_protected(vnode->permit_cache,
+					    lockdep_is_held(&vnode->lock));
+	RCU_INIT_POINTER(vnode->permit_cache, NULL);
+	vnode->cb_break++;
+	spin_unlock(&vnode->lock);
 
-	if (S_ISDIR(vnode->vfs_inode.i_mode)) {
-		auth_inode = igrab(&vnode->vfs_inode);
-		ASSERT(auth_inode != NULL);
-	} else {
-		auth_inode = afs_iget(vnode->vfs_inode.i_sb, key,
-				      &vnode->status.parent, NULL, NULL);
-		if (IS_ERR(auth_inode))
-			return ERR_CAST(auth_inode);
-	}
-
-	auth_vnode = AFS_FS_I(auth_inode);
-	_leave(" = {%x}", auth_vnode->fid.vnode);
-	return auth_vnode;
+	if (permits)
+		afs_put_permits(permits);
 }
 
 /*
- * clear the permit cache on a directory vnode
+ * Hash a list of permits.  Use simple addition to make it easy to add an extra
+ * one at an as-yet indeterminate position in the list.
  */
-void afs_clear_permits(struct afs_vnode *vnode)
+static void afs_hash_permits(struct afs_permits *permits)
 {
-	struct afs_permits *permits;
-
-	_enter("{%x:%u}", vnode->fid.vid, vnode->fid.vnode);
+	unsigned long h = permits->nr_permits;
+	int i;
 
-	mutex_lock(&vnode->permits_lock);
-	permits = vnode->permits;
-	RCU_INIT_POINTER(vnode->permits, NULL);
-	mutex_unlock(&vnode->permits_lock);
+	for (i = 0; i < permits->nr_permits; i++) {
+		h += (unsigned long)permits->permits[i].key / sizeof(void *);
+		h += permits->permits[i].access;
+	}
 
-	if (permits)
-		call_rcu(&permits->rcu, afs_zap_permits);
-	_leave("");
+	permits->h = h;
 }
 
 /*
- * add the result obtained for a vnode to its or its parent directory's cache
- * for the key used to access it
+ * Cache the CallerAccess result obtained from doing a fileserver operation
+ * that returned a vnode status for a particular key.  If a callback break
+ * occurs whilst the operation was in progress then we have to ditch the cache
+ * as the ACL *may* have changed.
  */
-void afs_cache_permit(struct afs_vnode *vnode, struct key *key, long acl_order)
+void afs_cache_permit(struct afs_vnode *vnode, struct key *key,
+		      unsigned int cb_break)
 {
-	struct afs_permits *permits, *xpermits;
-	struct afs_permit *permit;
-	struct afs_vnode *auth_vnode;
-	int count, loop;
+	struct afs_permits *permits, *xpermits, *replacement, *new = NULL;
+	afs_access_t caller_access = READ_ONCE(vnode->status.caller_access);
+	size_t size = 0;
+	bool changed = false;
+	int i, j;
+
+	_enter("{%x:%u},%x,%x",
+	       vnode->fid.vid, vnode->fid.vnode, key_serial(key), caller_access);
+
+	rcu_read_lock();
+
+	/* Check for the common case first: We got back the same access as last
+	 * time we tried and already have it recorded.
+	 */
+	permits = rcu_dereference(vnode->permit_cache);
+	if (permits) {
+		if (!permits->invalidated) {
+			for (i = 0; i < permits->nr_permits; i++) {
+				if (permits->permits[i].key < key)
+					continue;
+				if (permits->permits[i].key > key)
+					break;
+				if (permits->permits[i].access != caller_access) {
+					changed = true;
+					break;
+				}
 
-	_enter("{%x:%u},%x,%lx",
-	       vnode->fid.vid, vnode->fid.vnode, key_serial(key), acl_order);
+				if (cb_break != (vnode->cb_break +
+						 vnode->cb_interest->server->cb_s_break)) {
+					changed = true;
+					break;
+				}
 
-	auth_vnode = afs_get_auth_inode(vnode, key);
-	if (IS_ERR(auth_vnode)) {
-		_leave(" [get error %ld]", PTR_ERR(auth_vnode));
-		return;
-	}
+				/* The cache is still good. */
+				rcu_read_unlock();
+				return;
+			}
+		}
+
+		changed |= permits->invalidated;
+		size = permits->nr_permits;
 
-	mutex_lock(&auth_vnode->permits_lock);
+		/* If this set of permits is now wrong, clear the permits
+		 * pointer so that no one tries to use the stale information.
+		 */
+		if (changed) {
+			spin_lock(&vnode->lock);
+			if (permits != rcu_access_pointer(vnode->permit_cache))
+				goto someone_else_changed_it_unlock;
+			RCU_INIT_POINTER(vnode->permit_cache, NULL);
+			spin_unlock(&vnode->lock);
+
+			afs_put_permits(permits);
+			permits = NULL;
+			size = 0;
+		}
+	}
 
-	/* guard against a rename being detected whilst we waited for the
-	 * lock */
-	if (memcmp(&auth_vnode->fid, &vnode->status.parent,
-		   sizeof(struct afs_fid)) != 0) {
-		_debug("renamed");
-		goto out_unlock;
+	if (cb_break != (vnode->cb_break + vnode->cb_interest->server->cb_s_break)) {
+		rcu_read_unlock();
+		goto someone_else_changed_it;
 	}
 
-	/* have to be careful as the directory's callback may be broken between
-	 * us receiving the status we're trying to cache and us getting the
-	 * lock to update the cache for the status */
-	if (auth_vnode->acl_order - acl_order > 0) {
-		_debug("ACL changed?");
-		goto out_unlock;
+	/* We need a ref on any permits list we want to copy as we'll have to
+	 * drop the lock to do memory allocation.
+	 */
+	if (permits && !refcount_inc_not_zero(&permits->usage)) {
+		rcu_read_unlock();
+		goto someone_else_changed_it;
 	}
 
-	/* always update the anonymous mask */
-	_debug("anon access %x", vnode->status.anon_access);
-	auth_vnode->status.anon_access = vnode->status.anon_access;
-	if (key == vnode->volume->cell->anonymous_key)
-		goto out_unlock;
-
-	xpermits = auth_vnode->permits;
-	count = 0;
-	if (xpermits) {
-		/* see if the permit is already in the list
-		 * - if it is then we just amend the list
-		 */
-		count = xpermits->count;
-		permit = xpermits->permits;
-		for (loop = count; loop > 0; loop--) {
-			if (permit->key == key) {
-				permit->access_mask =
-					vnode->status.caller_access;
-				goto out_unlock;
+	rcu_read_unlock();
+
+	/* Speculatively create a new list with the revised permission set.  We
+	 * discard this if we find an extant match already in the hash, but
+	 * it's easier to compare with memcmp this way.
+	 *
+	 * We fill in the key pointers at this time, but we don't get the refs
+	 * yet.
+	 */
+	size++;
+	new = kzalloc(sizeof(struct afs_permits) +
+		      sizeof(struct afs_permit) * size, GFP_NOFS);
+	if (!new)
+		return;
+
+	refcount_set(&new->usage, 1);
+	new->nr_permits = size;
+	i = j = 0;
+	if (permits) {
+		for (i = 0; i < permits->nr_permits; i++) {
+			if (j == i && permits->permits[i].key > key) {
+				new->permits[j].key = key;
+				new->permits[j].access = caller_access;
+				j++;
 			}
-			permit++;
+			new->permits[j].key = permits->permits[i].key;
+			new->permits[j].access = permits->permits[i].access;
+			j++;
+		}
+	}
+
+	if (j == i) {
+		new->permits[j].key = key;
+		new->permits[j].access = caller_access;
+	}
+
+	afs_hash_permits(new);
+
+	afs_put_permits(permits);
+
+	/* Now see if the permit list we want is actually already available */
+	spin_lock(&afs_permits_lock);
+
+	hash_for_each_possible(afs_permits_cache, xpermits, hash_node, new->h) {
+		if (xpermits->h != new->h ||
+		    xpermits->invalidated ||
+		    xpermits->nr_permits != new->nr_permits ||
+		    memcmp(xpermits->permits, new->permits,
+			   new->nr_permits * sizeof(struct afs_permit)) != 0)
+			continue;
+
+		if (refcount_inc_not_zero(&xpermits->usage)) {
+			replacement = xpermits;
+			goto found;
 		}
+
+		break;
 	}
 
-	permits = kmalloc(sizeof(*permits) + sizeof(*permit) * (count + 1),
-			  GFP_NOFS);
-	if (!permits)
-		goto out_unlock;
-
-	if (xpermits)
-		memcpy(permits->permits, xpermits->permits,
-			count * sizeof(struct afs_permit));
-
-	_debug("key %x access %x",
-	       key_serial(key), vnode->status.caller_access);
-	permits->permits[count].access_mask = vnode->status.caller_access;
-	permits->permits[count].key = key_get(key);
-	permits->count = count + 1;
-
-	rcu_assign_pointer(auth_vnode->permits, permits);
-	if (xpermits)
-		call_rcu(&xpermits->rcu, afs_dispose_of_permits);
-
-out_unlock:
-	mutex_unlock(&auth_vnode->permits_lock);
-	iput(&auth_vnode->vfs_inode);
-	_leave("");
+	for (i = 0; i < new->nr_permits; i++)
+		key_get(new->permits[i].key);
+	hash_add_rcu(afs_permits_cache, &new->hash_node, new->h);
+	replacement = new;
+	new = NULL;
+
+found:
+	spin_unlock(&afs_permits_lock);
+
+	kfree(new);
+
+	spin_lock(&vnode->lock);
+	if (cb_break != (vnode->cb_break + vnode->cb_interest->server->cb_s_break) ||
+	    permits != rcu_access_pointer(vnode->permit_cache))
+		goto someone_else_changed_it_unlock;
+	rcu_assign_pointer(vnode->permit_cache, replacement);
+	spin_unlock(&vnode->lock);
+	afs_put_permits(permits);
+	return;
+
+someone_else_changed_it_unlock:
+	spin_unlock(&vnode->lock);
+someone_else_changed_it:
+	/* Someone else changed the cache under us - don't recheck at this
+	 * time.
+	 */
+	return;
 }
 
 /*
@@ -218,56 +288,45 @@ static int afs_check_permit(struct afs_vnode *vnode, struct key *key,
 			    afs_access_t *_access)
 {
 	struct afs_permits *permits;
-	struct afs_permit *permit;
-	struct afs_vnode *auth_vnode;
-	bool valid;
-	int loop, ret;
+	bool valid = false;
+	int i, ret;
 
 	_enter("{%x:%u},%x",
 	       vnode->fid.vid, vnode->fid.vnode, key_serial(key));
 
-	auth_vnode = afs_get_auth_inode(vnode, key);
-	if (IS_ERR(auth_vnode)) {
-		*_access = 0;
-		_leave(" = %ld", PTR_ERR(auth_vnode));
-		return PTR_ERR(auth_vnode);
-	}
-
-	ASSERT(S_ISDIR(auth_vnode->vfs_inode.i_mode));
+	permits = vnode->permit_cache;
 
 	/* check the permits to see if we've got one yet */
-	if (key == auth_vnode->volume->cell->anonymous_key) {
+	if (key == vnode->volume->cell->anonymous_key) {
 		_debug("anon");
-		*_access = auth_vnode->status.anon_access;
+		*_access = vnode->status.anon_access;
 		valid = true;
 	} else {
-		valid = false;
 		rcu_read_lock();
-		permits = rcu_dereference(auth_vnode->permits);
+		permits = rcu_dereference(vnode->permit_cache);
 		if (permits) {
-			permit = permits->permits;
-			for (loop = permits->count; loop > 0; loop--) {
-				if (permit->key == key) {
-					_debug("found in cache");
-					*_access = permit->access_mask;
-					valid = true;
+			for (i = 0; i < permits->nr_permits; i++) {
+				if (permits->permits[i].key < key)
+					continue;
+				if (permits->permits[i].key > key)
 					break;
-				}
-				permit++;
+
+				*_access = permits->permits[i].access;
+				valid = !permits->invalidated;
+				break;
 			}
 		}
 		rcu_read_unlock();
 	}
 
 	if (!valid) {
-		/* check the status on the file we're actually interested in
-		 * (the post-processing will cache the result on auth_vnode) */
+		/* Check the status on the file we're actually interested in
+		 * (the post-processing will cache the result).
+		 */
 		_debug("no valid permit");
 
-		set_bit(AFS_VNODE_CB_BROKEN, &vnode->flags);
-		ret = afs_vnode_fetch_status(vnode, auth_vnode, key);
+		ret = afs_fetch_status(vnode, key);
 		if (ret < 0) {
-			iput(&auth_vnode->vfs_inode);
 			*_access = 0;
 			_leave(" = %d", ret);
 			return ret;
@@ -275,7 +334,6 @@ static int afs_check_permit(struct afs_vnode *vnode, struct key *key,
 		*_access = vnode->status.caller_access;
 	}
 
-	iput(&auth_vnode->vfs_inode);
 	_leave(" = 0 [access %x]", *_access);
 	return 0;
 }
@@ -304,14 +362,9 @@ int afs_permission(struct inode *inode, int mask)
 		return PTR_ERR(key);
 	}
 
-	/* if the promise has expired, we need to check the server again */
-	if (!vnode->cb_promised) {
-		_debug("not promised");
-		ret = afs_vnode_fetch_status(vnode, NULL, key);
-		if (ret < 0)
-			goto error;
-		_debug("new promise [fl=%lx]", vnode->flags);
-	}
+	ret = afs_validate(vnode, key);
+	if (ret < 0)
+		goto error;
 
 	/* check the permits to see if we've got one yet */
 	ret = afs_check_permit(vnode, key, &access);
@@ -365,3 +418,12 @@ error:
 	_leave(" = %d", ret);
 	return ret;
 }
+
+void __exit afs_clean_up_permit_cache(void)
+{
+	int i;
+
+	for (i = 0; i < HASH_SIZE(afs_permits_cache); i++)
+		WARN_ON_ONCE(!hlist_empty(&afs_permits_cache[i]));
+
+}
diff --git a/fs/afs/server.c b/fs/afs/server.c
index c001b1f2455f..1880f1b6a9f1 100644
--- a/fs/afs/server.c
+++ b/fs/afs/server.c
@@ -11,317 +11,689 @@
 
 #include <linux/sched.h>
 #include <linux/slab.h>
+#include "afs_fs.h"
 #include "internal.h"
 
-static unsigned afs_server_timeout = 10;	/* server timeout in seconds */
+static unsigned afs_server_gc_delay = 10;	/* Server record timeout in seconds */
+static unsigned afs_server_update_delay = 30;	/* Time till VLDB recheck in secs */
 
-static void afs_reap_server(struct work_struct *);
+static void afs_inc_servers_outstanding(struct afs_net *net)
+{
+	atomic_inc(&net->servers_outstanding);
+}
+
+static void afs_dec_servers_outstanding(struct afs_net *net)
+{
+	if (atomic_dec_and_test(&net->servers_outstanding))
+		wake_up_atomic_t(&net->servers_outstanding);
+}
+
+/*
+ * Find a server by one of its addresses.
+ */
+struct afs_server *afs_find_server(struct afs_net *net,
+				   const struct sockaddr_rxrpc *srx)
+{
+	const struct sockaddr_in6 *a = &srx->transport.sin6, *b;
+	const struct afs_addr_list *alist;
+	struct afs_server *server = NULL;
+	unsigned int i;
+	bool ipv6 = true;
+	int seq = 0, diff;
+
+	if (srx->transport.sin6.sin6_addr.s6_addr32[0] == 0 ||
+	    srx->transport.sin6.sin6_addr.s6_addr32[1] == 0 ||
+	    srx->transport.sin6.sin6_addr.s6_addr32[2] == htonl(0xffff))
+		ipv6 = false;
+
+	rcu_read_lock();
+
+	do {
+		if (server)
+			afs_put_server(net, server);
+		server = NULL;
+		read_seqbegin_or_lock(&net->fs_addr_lock, &seq);
+
+		if (ipv6) {
+			hlist_for_each_entry_rcu(server, &net->fs_addresses6, addr6_link) {
+				alist = rcu_dereference(server->addresses);
+				for (i = alist->nr_ipv4; i < alist->nr_addrs; i++) {
+					b = &alist->addrs[i].transport.sin6;
+					diff = (u16)a->sin6_port - (u16)b->sin6_port;
+					if (diff == 0)
+						diff = memcmp(&a->sin6_addr,
+							      &b->sin6_addr,
+							      sizeof(struct in6_addr));
+					if (diff == 0)
+						goto found;
+					if (diff < 0) {
+						// TODO: Sort the list
+						//if (i == alist->nr_ipv4)
+						//	goto not_found;
+						break;
+					}
+				}
+			}
+		} else {
+			hlist_for_each_entry_rcu(server, &net->fs_addresses4, addr4_link) {
+				alist = rcu_dereference(server->addresses);
+				for (i = 0; i < alist->nr_ipv4; i++) {
+					b = &alist->addrs[i].transport.sin6;
+					diff = (u16)a->sin6_port - (u16)b->sin6_port;
+					if (diff == 0)
+						diff = ((u32)a->sin6_addr.s6_addr32[3] -
+							(u32)b->sin6_addr.s6_addr32[3]);
+					if (diff == 0)
+						goto found;
+					if (diff < 0) {
+						// TODO: Sort the list
+						//if (i == 0)
+						//	goto not_found;
+						break;
+					}
+				}
+			}
+		}
+
+	//not_found:
+		server = NULL;
+	found:
+		if (server && !atomic_inc_not_zero(&server->usage))
+			server = NULL;
+
+	} while (need_seqretry(&net->fs_addr_lock, seq));
 
-/* tree of all the servers, indexed by IP address */
-static struct rb_root afs_servers = RB_ROOT;
-static DEFINE_RWLOCK(afs_servers_lock);
+	done_seqretry(&net->fs_addr_lock, seq);
 
-/* LRU list of all the servers not currently in use */
-static LIST_HEAD(afs_server_graveyard);
-static DEFINE_SPINLOCK(afs_server_graveyard_lock);
-static DECLARE_DELAYED_WORK(afs_server_reaper, afs_reap_server);
+	rcu_read_unlock();
+	return server;
+}
 
 /*
- * install a server record in the master tree
+ * Look up a server by its UUID
  */
-static int afs_install_server(struct afs_server *server)
+struct afs_server *afs_find_server_by_uuid(struct afs_net *net, const uuid_t *uuid)
 {
-	struct afs_server *xserver;
+	struct afs_server *server = NULL;
+	struct rb_node *p;
+	int diff, seq = 0;
+
+	_enter("%pU", uuid);
+
+	do {
+		/* Unfortunately, rbtree walking doesn't give reliable results
+		 * under just the RCU read lock, so we have to check for
+		 * changes.
+		 */
+		if (server)
+			afs_put_server(net, server);
+		server = NULL;
+
+		read_seqbegin_or_lock(&net->fs_lock, &seq);
+
+		p = net->fs_servers.rb_node;
+		while (p) {
+			server = rb_entry(p, struct afs_server, uuid_rb);
+
+			diff = memcmp(uuid, &server->uuid, sizeof(*uuid));
+			if (diff < 0) {
+				p = p->rb_left;
+			} else if (diff > 0) {
+				p = p->rb_right;
+			} else {
+				afs_get_server(server);
+				break;
+			}
+
+			server = NULL;
+		}
+	} while (need_seqretry(&net->fs_lock, seq));
+
+	done_seqretry(&net->fs_lock, seq);
+
+	_leave(" = %p", server);
+	return server;
+}
+
+/*
+ * Install a server record in the namespace tree
+ */
+static struct afs_server *afs_install_server(struct afs_net *net,
+					     struct afs_server *candidate)
+{
+	const struct afs_addr_list *alist;
+	struct afs_server *server;
 	struct rb_node **pp, *p;
-	int ret;
+	int ret = -EEXIST, diff;
 
-	_enter("%p", server);
+	_enter("%p", candidate);
 
-	write_lock(&afs_servers_lock);
+	write_seqlock(&net->fs_lock);
 
-	ret = -EEXIST;
-	pp = &afs_servers.rb_node;
+	/* Firstly install the server in the UUID lookup tree */
+	pp = &net->fs_servers.rb_node;
 	p = NULL;
 	while (*pp) {
 		p = *pp;
 		_debug("- consider %p", p);
-		xserver = rb_entry(p, struct afs_server, master_rb);
-		if (server->addr.s_addr < xserver->addr.s_addr)
+		server = rb_entry(p, struct afs_server, uuid_rb);
+		diff = memcmp(&candidate->uuid, &server->uuid, sizeof(uuid_t));
+		if (diff < 0)
 			pp = &(*pp)->rb_left;
-		else if (server->addr.s_addr > xserver->addr.s_addr)
+		else if (diff > 0)
 			pp = &(*pp)->rb_right;
 		else
-			goto error;
+			goto exists;
 	}
 
-	rb_link_node(&server->master_rb, p, pp);
-	rb_insert_color(&server->master_rb, &afs_servers);
+	server = candidate;
+	rb_link_node(&server->uuid_rb, p, pp);
+	rb_insert_color(&server->uuid_rb, &net->fs_servers);
+	hlist_add_head_rcu(&server->proc_link, &net->fs_proc);
+
+	write_seqlock(&net->fs_addr_lock);
+	alist = rcu_dereference_protected(server->addresses,
+					  lockdep_is_held(&net->fs_addr_lock.lock));
+
+	/* Secondly, if the server has any IPv4 and/or IPv6 addresses, install
+	 * it in the IPv4 and/or IPv6 reverse-map lists.
+	 *
+	 * TODO: For speed we want to use something other than a flat list
+	 * here; even sorting the list in terms of lowest address would help a
+	 * bit, but anything we might want to do gets messy and memory
+	 * intensive.
+	 */
+	if (alist->nr_ipv4 > 0)
+		hlist_add_head_rcu(&server->addr4_link, &net->fs_addresses4);
+	if (alist->nr_addrs > alist->nr_ipv4)
+		hlist_add_head_rcu(&server->addr6_link, &net->fs_addresses6);
+
+	write_sequnlock(&net->fs_addr_lock);
 	ret = 0;
 
-error:
-	write_unlock(&afs_servers_lock);
-	return ret;
+exists:
+	afs_get_server(server);
+	write_sequnlock(&net->fs_lock);
+	return server;
 }
 
 /*
  * allocate a new server record
  */
-static struct afs_server *afs_alloc_server(struct afs_cell *cell,
-					   const struct in_addr *addr)
+static struct afs_server *afs_alloc_server(struct afs_net *net,
+					   const uuid_t *uuid,
+					   struct afs_addr_list *alist)
 {
 	struct afs_server *server;
 
 	_enter("");
 
 	server = kzalloc(sizeof(struct afs_server), GFP_KERNEL);
-	if (server) {
-		atomic_set(&server->usage, 1);
-		server->cell = cell;
-
-		INIT_LIST_HEAD(&server->link);
-		INIT_LIST_HEAD(&server->grave);
-		init_rwsem(&server->sem);
-		spin_lock_init(&server->fs_lock);
-		server->fs_vnodes = RB_ROOT;
-		server->cb_promises = RB_ROOT;
-		spin_lock_init(&server->cb_lock);
-		init_waitqueue_head(&server->cb_break_waitq);
-		INIT_DELAYED_WORK(&server->cb_break_work,
-				  afs_dispatch_give_up_callbacks);
-
-		memcpy(&server->addr, addr, sizeof(struct in_addr));
-		server->addr.s_addr = addr->s_addr;
-		_leave(" = %p{%d}", server, atomic_read(&server->usage));
-	} else {
-		_leave(" = NULL [nomem]");
-	}
+	if (!server)
+		goto enomem;
+
+	atomic_set(&server->usage, 1);
+	RCU_INIT_POINTER(server->addresses, alist);
+	server->addr_version = alist->version;
+	server->uuid = *uuid;
+	server->flags = (1UL << AFS_SERVER_FL_NEW);
+	server->update_at = ktime_get_real_seconds() + afs_server_update_delay;
+	rwlock_init(&server->fs_lock);
+	INIT_LIST_HEAD(&server->cb_interests);
+	rwlock_init(&server->cb_break_lock);
+
+	afs_inc_servers_outstanding(net);
+	_leave(" = %p", server);
 	return server;
+
+enomem:
+	_leave(" = NULL [nomem]");
+	return NULL;
+}
+
+/*
+ * Look up an address record for a server
+ */
+static struct afs_addr_list *afs_vl_lookup_addrs(struct afs_cell *cell,
+						 struct key *key, const uuid_t *uuid)
+{
+	struct afs_addr_cursor ac;
+	struct afs_addr_list *alist;
+	int ret;
+
+	ret = afs_set_vl_cursor(&ac, cell);
+	if (ret < 0)
+		return ERR_PTR(ret);
+
+	while (afs_iterate_addresses(&ac)) {
+		if (test_bit(ac.index, &ac.alist->yfs))
+			alist = afs_yfsvl_get_endpoints(cell->net, &ac, key, uuid);
+		else
+			alist = afs_vl_get_addrs_u(cell->net, &ac, key, uuid);
+		switch (ac.error) {
+		case 0:
+			afs_end_cursor(&ac);
+			return alist;
+		case -ECONNABORTED:
+			ac.error = afs_abort_to_error(ac.abort_code);
+			goto error;
+		case -ENOMEM:
+		case -ENONET:
+			goto error;
+		case -ENETUNREACH:
+		case -EHOSTUNREACH:
+		case -ECONNREFUSED:
+			break;
+		default:
+			ac.error = -EIO;
+			goto error;
+		}
+	}
+
+error:
+	return ERR_PTR(afs_end_cursor(&ac));
 }
 
 /*
- * get an FS-server record for a cell
+ * Get or create a fileserver record.
  */
-struct afs_server *afs_lookup_server(struct afs_cell *cell,
-				     const struct in_addr *addr)
+struct afs_server *afs_lookup_server(struct afs_cell *cell, struct key *key,
+				     const uuid_t *uuid)
 {
+	struct afs_addr_list *alist;
 	struct afs_server *server, *candidate;
 
-	_enter("%p,%pI4", cell, &addr->s_addr);
+	_enter("%p,%pU", cell->net, uuid);
 
-	/* quick scan of the list to see if we already have the server */
-	read_lock(&cell->servers_lock);
+	server = afs_find_server_by_uuid(cell->net, uuid);
+	if (server)
+		return server;
 
-	list_for_each_entry(server, &cell->servers, link) {
-		if (server->addr.s_addr == addr->s_addr)
-			goto found_server_quickly;
-	}
-	read_unlock(&cell->servers_lock);
+	alist = afs_vl_lookup_addrs(cell, key, uuid);
+	if (IS_ERR(alist))
+		return ERR_CAST(alist);
 
-	candidate = afs_alloc_server(cell, addr);
+	candidate = afs_alloc_server(cell->net, uuid, alist);
 	if (!candidate) {
-		_leave(" = -ENOMEM");
+		afs_put_addrlist(alist);
 		return ERR_PTR(-ENOMEM);
 	}
 
-	write_lock(&cell->servers_lock);
-
-	/* check the cell's server list again */
-	list_for_each_entry(server, &cell->servers, link) {
-		if (server->addr.s_addr == addr->s_addr)
-			goto found_server;
+	server = afs_install_server(cell->net, candidate);
+	if (server != candidate) {
+		afs_put_addrlist(alist);
+		kfree(candidate);
 	}
 
-	_debug("new");
-	server = candidate;
-	if (afs_install_server(server) < 0)
-		goto server_in_two_cells;
-
-	afs_get_cell(cell);
-	list_add_tail(&server->link, &cell->servers);
-
-	write_unlock(&cell->servers_lock);
 	_leave(" = %p{%d}", server, atomic_read(&server->usage));
 	return server;
+}
 
-	/* found a matching server quickly */
-found_server_quickly:
-	_debug("found quickly");
-	afs_get_server(server);
-	read_unlock(&cell->servers_lock);
-no_longer_unused:
-	if (!list_empty(&server->grave)) {
-		spin_lock(&afs_server_graveyard_lock);
-		list_del_init(&server->grave);
-		spin_unlock(&afs_server_graveyard_lock);
+/*
+ * Set the server timer to fire after a given delay, assuming it's not already
+ * set for an earlier time.
+ */
+static void afs_set_server_timer(struct afs_net *net, time64_t delay)
+{
+	if (net->live) {
+		afs_inc_servers_outstanding(net);
+		if (timer_reduce(&net->fs_timer, jiffies + delay * HZ))
+			afs_dec_servers_outstanding(net);
 	}
-	_leave(" = %p{%d}", server, atomic_read(&server->usage));
-	return server;
+}
 
-	/* found a matching server on the second pass */
-found_server:
-	_debug("found");
-	afs_get_server(server);
-	write_unlock(&cell->servers_lock);
-	kfree(candidate);
-	goto no_longer_unused;
-
-	/* found a server that seems to be in two cells */
-server_in_two_cells:
-	write_unlock(&cell->servers_lock);
-	kfree(candidate);
-	printk(KERN_NOTICE "kAFS: Server %pI4 appears to be in two cells\n",
-	       addr);
-	_leave(" = -EEXIST");
-	return ERR_PTR(-EEXIST);
+/*
+ * Server management timer.  We have an increment on fs_outstanding that we
+ * need to pass along to the work item.
+ */
+void afs_servers_timer(struct timer_list *timer)
+{
+	struct afs_net *net = container_of(timer, struct afs_net, fs_timer);
+
+	_enter("");
+	if (!queue_work(afs_wq, &net->fs_manager))
+		afs_dec_servers_outstanding(net);
 }
 
 /*
- * look up a server by its IP address
+ * Release a reference on a server record.
  */
-struct afs_server *afs_find_server(const struct sockaddr_rxrpc *srx)
+void afs_put_server(struct afs_net *net, struct afs_server *server)
 {
-	struct afs_server *server = NULL;
-	struct rb_node *p;
-	struct in_addr addr = srx->transport.sin.sin_addr;
+	unsigned int usage;
 
-	_enter("{%d,%pI4}", srx->transport.family, &addr.s_addr);
+	if (!server)
+		return;
 
-	if (srx->transport.family != AF_INET) {
-		WARN(true, "AFS does not yes support non-IPv4 addresses\n");
-		return NULL;
-	}
+	server->put_time = ktime_get_real_seconds();
 
-	read_lock(&afs_servers_lock);
+	usage = atomic_dec_return(&server->usage);
 
-	p = afs_servers.rb_node;
-	while (p) {
-		server = rb_entry(p, struct afs_server, master_rb);
+	_enter("{%u}", usage);
 
-		_debug("- consider %p", p);
+	if (likely(usage > 0))
+		return;
 
-		if (addr.s_addr < server->addr.s_addr) {
-			p = p->rb_left;
-		} else if (addr.s_addr > server->addr.s_addr) {
-			p = p->rb_right;
-		} else {
-			afs_get_server(server);
-			goto found;
-		}
-	}
+	afs_set_server_timer(net, afs_server_gc_delay);
+}
 
-	server = NULL;
-found:
-	read_unlock(&afs_servers_lock);
-	ASSERTIFCMP(server, server->addr.s_addr, ==, addr.s_addr);
-	_leave(" = %p", server);
-	return server;
+static void afs_server_rcu(struct rcu_head *rcu)
+{
+	struct afs_server *server = container_of(rcu, struct afs_server, rcu);
+
+	afs_put_addrlist(server->addresses);
+	kfree(server);
 }
 
 /*
- * destroy a server record
- * - removes from the cell list
+ * destroy a dead server
  */
-void afs_put_server(struct afs_server *server)
+static void afs_destroy_server(struct afs_net *net, struct afs_server *server)
 {
-	if (!server)
-		return;
+	struct afs_addr_list *alist = server->addresses;
+	struct afs_addr_cursor ac = {
+		.alist	= alist,
+		.addr	= &alist->addrs[0],
+		.start	= alist->index,
+		.index	= alist->index,
+		.error	= 0,
+	};
+	_enter("%p", server);
 
-	_enter("%p{%d}", server, atomic_read(&server->usage));
+	afs_fs_give_up_all_callbacks(net, server, &ac, NULL);
+	call_rcu(&server->rcu, afs_server_rcu);
+	afs_dec_servers_outstanding(net);
+}
 
-	_debug("PUT SERVER %d", atomic_read(&server->usage));
+/*
+ * Garbage collect any expired servers.
+ */
+static void afs_gc_servers(struct afs_net *net, struct afs_server *gc_list)
+{
+	struct afs_server *server;
+	bool deleted;
+	int usage;
+
+	while ((server = gc_list)) {
+		gc_list = server->gc_next;
+
+		write_seqlock(&net->fs_lock);
+		usage = 1;
+		deleted = atomic_try_cmpxchg(&server->usage, &usage, 0);
+		if (deleted) {
+			rb_erase(&server->uuid_rb, &net->fs_servers);
+			hlist_del_rcu(&server->proc_link);
+		}
+		write_sequnlock(&net->fs_lock);
 
-	ASSERTCMP(atomic_read(&server->usage), >, 0);
+		if (deleted)
+			afs_destroy_server(net, server);
+	}
+}
 
-	if (likely(!atomic_dec_and_test(&server->usage))) {
-		_leave("");
-		return;
+/*
+ * Manage the records of servers known to be within a network namespace.  This
+ * includes garbage collecting unused servers.
+ *
+ * Note also that we were given an increment on net->servers_outstanding by
+ * whoever queued us that we need to deal with before returning.
+ */
+void afs_manage_servers(struct work_struct *work)
+{
+	struct afs_net *net = container_of(work, struct afs_net, fs_manager);
+	struct afs_server *gc_list = NULL;
+	struct rb_node *cursor;
+	time64_t now = ktime_get_real_seconds(), next_manage = TIME64_MAX;
+	bool purging = !net->live;
+
+	_enter("");
+
+	/* Trawl the server list looking for servers that have expired from
+	 * lack of use.
+	 */
+	read_seqlock_excl(&net->fs_lock);
+
+	for (cursor = rb_first(&net->fs_servers); cursor; cursor = rb_next(cursor)) {
+		struct afs_server *server =
+			rb_entry(cursor, struct afs_server, uuid_rb);
+		int usage = atomic_read(&server->usage);
+
+		_debug("manage %pU %u", &server->uuid, usage);
+
+		ASSERTCMP(usage, >=, 1);
+		ASSERTIFCMP(purging, usage, ==, 1);
+
+		if (usage == 1) {
+			time64_t expire_at = server->put_time;
+
+			if (!test_bit(AFS_SERVER_FL_VL_FAIL, &server->flags) &&
+			    !test_bit(AFS_SERVER_FL_NOT_FOUND, &server->flags))
+				expire_at += afs_server_gc_delay;
+			if (purging || expire_at <= now) {
+				server->gc_next = gc_list;
+				gc_list = server;
+			} else if (expire_at < next_manage) {
+				next_manage = expire_at;
+			}
+		}
 	}
 
-	afs_flush_callback_breaks(server);
+	read_sequnlock_excl(&net->fs_lock);
+
+	/* Update the timer on the way out.  We have to pass an increment on
+	 * servers_outstanding in the namespace that we are in to the timer or
+	 * the work scheduler.
+	 */
+	if (!purging && next_manage < TIME64_MAX) {
+		now = ktime_get_real_seconds();
 
-	spin_lock(&afs_server_graveyard_lock);
-	if (atomic_read(&server->usage) == 0) {
-		list_move_tail(&server->grave, &afs_server_graveyard);
-		server->time_of_death = ktime_get_real_seconds();
-		queue_delayed_work(afs_wq, &afs_server_reaper,
-				   afs_server_timeout * HZ);
+		if (next_manage - now <= 0) {
+			if (queue_work(afs_wq, &net->fs_manager))
+				afs_inc_servers_outstanding(net);
+		} else {
+			afs_set_server_timer(net, next_manage - now);
+		}
 	}
-	spin_unlock(&afs_server_graveyard_lock);
-	_leave(" [dead]");
+
+	afs_gc_servers(net, gc_list);
+
+	afs_dec_servers_outstanding(net);
+	_leave(" [%d]", atomic_read(&net->servers_outstanding));
+}
+
+static void afs_queue_server_manager(struct afs_net *net)
+{
+	afs_inc_servers_outstanding(net);
+	if (!queue_work(afs_wq, &net->fs_manager))
+		afs_dec_servers_outstanding(net);
 }
 
 /*
- * destroy a dead server
+ * Purge list of servers.
  */
-static void afs_destroy_server(struct afs_server *server)
+void afs_purge_servers(struct afs_net *net)
 {
-	_enter("%p", server);
+	_enter("");
 
-	ASSERTIF(server->cb_break_head != server->cb_break_tail,
-		 delayed_work_pending(&server->cb_break_work));
+	if (del_timer_sync(&net->fs_timer))
+		atomic_dec(&net->servers_outstanding);
 
-	ASSERTCMP(server->fs_vnodes.rb_node, ==, NULL);
-	ASSERTCMP(server->cb_promises.rb_node, ==, NULL);
-	ASSERTCMP(server->cb_break_head, ==, server->cb_break_tail);
-	ASSERTCMP(atomic_read(&server->cb_break_n), ==, 0);
+	afs_queue_server_manager(net);
 
-	afs_put_cell(server->cell);
-	kfree(server);
+	_debug("wait");
+	wait_on_atomic_t(&net->servers_outstanding, atomic_t_wait,
+			 TASK_UNINTERRUPTIBLE);
+	_leave("");
 }
 
 /*
- * reap dead server records
+ * Probe a fileserver to find its capabilities.
+ *
+ * TODO: Try service upgrade.
  */
-static void afs_reap_server(struct work_struct *work)
+static bool afs_do_probe_fileserver(struct afs_fs_cursor *fc)
 {
-	LIST_HEAD(corpses);
-	struct afs_server *server;
-	unsigned long delay, expiry;
-	time64_t now;
-
-	now = ktime_get_real_seconds();
-	spin_lock(&afs_server_graveyard_lock);
-
-	while (!list_empty(&afs_server_graveyard)) {
-		server = list_entry(afs_server_graveyard.next,
-				    struct afs_server, grave);
+	_enter("");
 
-		/* the queue is ordered most dead first */
-		expiry = server->time_of_death + afs_server_timeout;
-		if (expiry > now) {
-			delay = (expiry - now) * HZ;
-			mod_delayed_work(afs_wq, &afs_server_reaper, delay);
+	fc->ac.addr = NULL;
+	fc->ac.start = READ_ONCE(fc->ac.alist->index);
+	fc->ac.index = fc->ac.start;
+	fc->ac.error = 0;
+	fc->ac.begun = false;
+
+	while (afs_iterate_addresses(&fc->ac)) {
+		afs_fs_get_capabilities(afs_v2net(fc->vnode), fc->cbi->server,
+					&fc->ac, fc->key);
+		switch (fc->ac.error) {
+		case 0:
+			afs_end_cursor(&fc->ac);
+			set_bit(AFS_SERVER_FL_PROBED, &fc->cbi->server->flags);
+			return true;
+		case -ECONNABORTED:
+			fc->ac.error = afs_abort_to_error(fc->ac.abort_code);
+			goto error;
+		case -ENOMEM:
+		case -ENONET:
+			goto error;
+		case -ENETUNREACH:
+		case -EHOSTUNREACH:
+		case -ECONNREFUSED:
+		case -ETIMEDOUT:
+		case -ETIME:
 			break;
+		default:
+			fc->ac.error = -EIO;
+			goto error;
 		}
+	}
 
-		write_lock(&server->cell->servers_lock);
-		write_lock(&afs_servers_lock);
-		if (atomic_read(&server->usage) > 0) {
-			list_del_init(&server->grave);
-		} else {
-			list_move_tail(&server->grave, &corpses);
-			list_del_init(&server->link);
-			rb_erase(&server->master_rb, &afs_servers);
-		}
-		write_unlock(&afs_servers_lock);
-		write_unlock(&server->cell->servers_lock);
+error:
+	afs_end_cursor(&fc->ac);
+	return false;
+}
+
+/*
+ * If we haven't already, try probing the fileserver to get its capabilities.
+ * We try not to instigate parallel probes, but it's possible that the parallel
+ * probes will fail due to authentication failure when ours would succeed.
+ *
+ * TODO: Try sending an anonymous probe if an authenticated probe fails.
+ */
+bool afs_probe_fileserver(struct afs_fs_cursor *fc)
+{
+	bool success;
+	int ret, retries = 0;
+
+	_enter("");
+
+retry:
+	if (test_bit(AFS_SERVER_FL_PROBED, &fc->cbi->server->flags)) {
+		_leave(" = t");
+		return true;
 	}
 
-	spin_unlock(&afs_server_graveyard_lock);
+	if (!test_and_set_bit_lock(AFS_SERVER_FL_PROBING, &fc->cbi->server->flags)) {
+		success = afs_do_probe_fileserver(fc);
+		clear_bit_unlock(AFS_SERVER_FL_PROBING, &fc->cbi->server->flags);
+		wake_up_bit(&fc->cbi->server->flags, AFS_SERVER_FL_PROBING);
+		_leave(" = t");
+		return success;
+	}
+
+	_debug("wait");
+	ret = wait_on_bit(&fc->cbi->server->flags, AFS_SERVER_FL_PROBING,
+			  TASK_INTERRUPTIBLE);
+	if (ret == -ERESTARTSYS) {
+		fc->ac.error = ret;
+		_leave(" = f [%d]", ret);
+		return false;
+	}
 
-	/* now reap the corpses we've extracted */
-	while (!list_empty(&corpses)) {
-		server = list_entry(corpses.next, struct afs_server, grave);
-		list_del(&server->grave);
-		afs_destroy_server(server);
+	retries++;
+	if (retries == 4) {
+		fc->ac.error = -ESTALE;
+		_leave(" = f [stale]");
+		return false;
 	}
+	_debug("retry");
+	goto retry;
 }
 
 /*
- * discard all the server records for rmmod
+ * Get an update for a server's address list.
  */
-void __exit afs_purge_servers(void)
+static noinline bool afs_update_server_record(struct afs_fs_cursor *fc, struct afs_server *server)
 {
-	afs_server_timeout = 0;
-	mod_delayed_work(afs_wq, &afs_server_reaper, 0);
+	struct afs_addr_list *alist, *discard;
+
+	_enter("");
+
+	alist = afs_vl_lookup_addrs(fc->vnode->volume->cell, fc->key,
+				    &server->uuid);
+	if (IS_ERR(alist)) {
+		fc->ac.error = PTR_ERR(alist);
+		_leave(" = f [%d]", fc->ac.error);
+		return false;
+	}
+
+	discard = alist;
+	if (server->addr_version != alist->version) {
+		write_lock(&server->fs_lock);
+		discard = rcu_dereference_protected(server->addresses,
+						    lockdep_is_held(&server->fs_lock));
+		rcu_assign_pointer(server->addresses, alist);
+		server->addr_version = alist->version;
+		write_unlock(&server->fs_lock);
+	}
+
+	server->update_at = ktime_get_real_seconds() + afs_server_update_delay;
+	afs_put_addrlist(discard);
+	_leave(" = t");
+	return true;
+}
+
+/*
+ * See if a server's address list needs updating.
+ */
+bool afs_check_server_record(struct afs_fs_cursor *fc, struct afs_server *server)
+{
+	time64_t now = ktime_get_real_seconds();
+	long diff;
+	bool success;
+	int ret, retries = 0;
+
+	_enter("");
+
+	ASSERT(server);
+
+retry:
+	diff = READ_ONCE(server->update_at) - now;
+	if (diff > 0) {
+		_leave(" = t [not now %ld]", diff);
+		return true;
+	}
+
+	if (!test_and_set_bit_lock(AFS_SERVER_FL_UPDATING, &server->flags)) {
+		success = afs_update_server_record(fc, server);
+		clear_bit_unlock(AFS_SERVER_FL_UPDATING, &server->flags);
+		wake_up_bit(&server->flags, AFS_SERVER_FL_UPDATING);
+		_leave(" = %d", success);
+		return success;
+	}
+
+	ret = wait_on_bit(&server->flags, AFS_SERVER_FL_UPDATING,
+			  TASK_INTERRUPTIBLE);
+	if (ret == -ERESTARTSYS) {
+		fc->ac.error = ret;
+		_leave(" = f [intr]");
+		return false;
+	}
+
+	retries++;
+	if (retries == 4) {
+		_leave(" = f [stale]");
+		ret = -ESTALE;
+		return false;
+	}
+	goto retry;
 }
diff --git a/fs/afs/server_list.c b/fs/afs/server_list.c
new file mode 100644
index 000000000000..26bad7032bba
--- /dev/null
+++ b/fs/afs/server_list.c
@@ -0,0 +1,153 @@
+/* AFS fileserver list management.
+ *
+ * Copyright (C) 2017 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include "internal.h"
+
+void afs_put_serverlist(struct afs_net *net, struct afs_server_list *slist)
+{
+	int i;
+
+	if (refcount_dec_and_test(&slist->usage)) {
+		for (i = 0; i < slist->nr_servers; i++) {
+			afs_put_cb_interest(net, slist->servers[i].cb_interest);
+			afs_put_server(net, slist->servers[i].server);
+		}
+		kfree(slist);
+	}
+}
+
+/*
+ * Build a server list from a VLDB record.
+ */
+struct afs_server_list *afs_alloc_server_list(struct afs_cell *cell,
+					      struct key *key,
+					      struct afs_vldb_entry *vldb,
+					      u8 type_mask)
+{
+	struct afs_server_list *slist;
+	struct afs_server *server;
+	int ret = -ENOMEM, nr_servers = 0, i, j;
+
+	for (i = 0; i < vldb->nr_servers; i++)
+		if (vldb->fs_mask[i] & type_mask)
+			nr_servers++;
+
+	slist = kzalloc(sizeof(struct afs_server_list) +
+			sizeof(struct afs_server_entry) * nr_servers,
+			GFP_KERNEL);
+	if (!slist)
+		goto error;
+
+	refcount_set(&slist->usage, 1);
+
+	/* Make sure a records exists for each server in the list. */
+	for (i = 0; i < vldb->nr_servers; i++) {
+		if (!(vldb->fs_mask[i] & type_mask))
+			continue;
+
+		server = afs_lookup_server(cell, key, &vldb->fs_server[i]);
+		if (IS_ERR(server)) {
+			ret = PTR_ERR(server);
+			if (ret == -ENOENT)
+				continue;
+			goto error_2;
+		}
+
+		/* Insertion-sort by server pointer */
+		for (j = 0; j < slist->nr_servers; j++)
+			if (slist->servers[j].server >= server)
+				break;
+		if (j < slist->nr_servers) {
+			if (slist->servers[j].server == server) {
+				afs_put_server(cell->net, server);
+				continue;
+			}
+
+			memmove(slist->servers + j + 1,
+				slist->servers + j,
+				(slist->nr_servers - j) * sizeof(struct afs_server_entry));
+		}
+
+		slist->servers[j].server = server;
+		slist->nr_servers++;
+	}
+
+	if (slist->nr_servers == 0) {
+		ret = -EDESTADDRREQ;
+		goto error_2;
+	}
+
+	return slist;
+
+error_2:
+	afs_put_serverlist(cell->net, slist);
+error:
+	return ERR_PTR(ret);
+}
+
+/*
+ * Copy the annotations from an old server list to its potential replacement.
+ */
+bool afs_annotate_server_list(struct afs_server_list *new,
+			      struct afs_server_list *old)
+{
+	struct afs_server *cur;
+	int i, j;
+
+	if (old->nr_servers != new->nr_servers)
+		goto changed;
+
+	for (i = 0; i < old->nr_servers; i++)
+		if (old->servers[i].server != new->servers[i].server)
+			goto changed;
+
+	return false;
+
+changed:
+	/* Maintain the same current server as before if possible. */
+	cur = old->servers[old->index].server;
+	for (j = 0; j < new->nr_servers; j++) {
+		if (new->servers[j].server == cur) {
+			new->index = j;
+			break;
+		}
+	}
+
+	/* Keep the old callback interest records where possible so that we
+	 * maintain callback interception.
+	 */
+	i = 0;
+	j = 0;
+	while (i < old->nr_servers && j < new->nr_servers) {
+		if (new->servers[j].server == old->servers[i].server) {
+			struct afs_cb_interest *cbi = old->servers[i].cb_interest;
+			if (cbi) {
+				new->servers[j].cb_interest = cbi;
+				refcount_inc(&cbi->usage);
+			}
+			i++;
+			j++;
+			continue;
+		}
+
+		if (new->servers[j].server < old->servers[i].server) {
+			j++;
+			continue;
+		}
+
+		i++;
+		continue;
+	}
+
+	return true;
+}
diff --git a/fs/afs/super.c b/fs/afs/super.c
index 689173c0a682..875b5eb02242 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -25,11 +25,10 @@
 #include <linux/statfs.h>
 #include <linux/sched.h>
 #include <linux/nsproxy.h>
+#include <linux/magic.h>
 #include <net/net_namespace.h>
 #include "internal.h"
 
-#define AFS_FS_MAGIC 0x6B414653 /* 'kAFS' */
-
 static void afs_i_init_once(void *foo);
 static struct dentry *afs_mount(struct file_system_type *fs_type,
 		      int flags, const char *dev_name, void *data);
@@ -143,9 +142,9 @@ void __exit afs_fs_exit(void)
  */
 static int afs_show_devname(struct seq_file *m, struct dentry *root)
 {
-	struct afs_super_info *as = root->d_sb->s_fs_info;
+	struct afs_super_info *as = AFS_FS_S(root->d_sb);
 	struct afs_volume *volume = as->volume;
-	struct afs_cell *cell = volume->cell;
+	struct afs_cell *cell = as->cell;
 	const char *suf = "";
 	char pref = '%';
 
@@ -163,7 +162,7 @@ static int afs_show_devname(struct seq_file *m, struct dentry *root)
 		break;
 	}
 
-	seq_printf(m, "%c%s:%s%s", pref, cell->name, volume->vlocation->vldb.name, suf);
+	seq_printf(m, "%c%s:%s%s", pref, cell->name, volume->name, suf);
 	return 0;
 }
 
@@ -201,12 +200,14 @@ static int afs_parse_options(struct afs_mount_params *params,
 		token = match_token(p, afs_options_list, args);
 		switch (token) {
 		case afs_opt_cell:
-			cell = afs_cell_lookup(args[0].from,
-					       args[0].to - args[0].from,
-					       false);
+			rcu_read_lock();
+			cell = afs_lookup_cell_rcu(params->net,
+						   args[0].from,
+						   args[0].to - args[0].from);
+			rcu_read_unlock();
 			if (IS_ERR(cell))
 				return PTR_ERR(cell);
-			afs_put_cell(params->cell);
+			afs_put_cell(params->net, params->cell);
 			params->cell = cell;
 			break;
 
@@ -308,13 +309,14 @@ static int afs_parse_device_name(struct afs_mount_params *params,
 
 	/* lookup the cell record */
 	if (cellname || !params->cell) {
-		cell = afs_cell_lookup(cellname, cellnamesz, true);
+		cell = afs_lookup_cell(params->net, cellname, cellnamesz,
+				       NULL, false);
 		if (IS_ERR(cell)) {
 			printk(KERN_ERR "kAFS: unable to lookup cell '%*.*s'\n",
 			       cellnamesz, cellnamesz, cellname ?: "");
 			return PTR_ERR(cell);
 		}
-		afs_put_cell(params->cell);
+		afs_put_cell(params->net, params->cell);
 		params->cell = cell;
 	}
 
@@ -332,14 +334,16 @@ static int afs_parse_device_name(struct afs_mount_params *params,
 static int afs_test_super(struct super_block *sb, void *data)
 {
 	struct afs_super_info *as1 = data;
-	struct afs_super_info *as = sb->s_fs_info;
+	struct afs_super_info *as = AFS_FS_S(sb);
 
-	return as->volume == as1->volume;
+	return as->net == as1->net && as->volume->vid == as1->volume->vid;
 }
 
 static int afs_set_super(struct super_block *sb, void *data)
 {
-	sb->s_fs_info = data;
+	struct afs_super_info *as = data;
+
+	sb->s_fs_info = as;
 	return set_anon_super(sb, NULL);
 }
 
@@ -349,7 +353,7 @@ static int afs_set_super(struct super_block *sb, void *data)
 static int afs_fill_super(struct super_block *sb,
 			  struct afs_mount_params *params)
 {
-	struct afs_super_info *as = sb->s_fs_info;
+	struct afs_super_info *as = AFS_FS_S(sb);
 	struct afs_fid fid;
 	struct inode *inode = NULL;
 	int ret;
@@ -366,13 +370,15 @@ static int afs_fill_super(struct super_block *sb,
 	if (ret)
 		return ret;
 	sb->s_bdi->ra_pages	= VM_MAX_READAHEAD * 1024 / PAGE_SIZE;
-	strlcpy(sb->s_id, as->volume->vlocation->vldb.name, sizeof(sb->s_id));
+	sprintf(sb->s_id, "%u", as->volume->vid);
+
+	afs_activate_volume(as->volume);
 
 	/* allocate the root inode and dentry */
 	fid.vid		= as->volume->vid;
 	fid.vnode	= 1;
 	fid.unique	= 1;
-	inode = afs_iget(sb, params->key, &fid, NULL, NULL);
+	inode = afs_iget(sb, params->key, &fid, NULL, NULL, NULL);
 	if (IS_ERR(inode))
 		return PTR_ERR(inode);
 
@@ -394,23 +400,45 @@ error:
 	return ret;
 }
 
+static struct afs_super_info *afs_alloc_sbi(struct afs_mount_params *params)
+{
+	struct afs_super_info *as;
+
+	as = kzalloc(sizeof(struct afs_super_info), GFP_KERNEL);
+	if (as) {
+		as->net = afs_get_net(params->net);
+		as->cell = afs_get_cell(params->cell);
+	}
+	return as;
+}
+
+static void afs_destroy_sbi(struct afs_super_info *as)
+{
+	if (as) {
+		afs_put_volume(as->cell, as->volume);
+		afs_put_cell(as->net, as->cell);
+		afs_put_net(as->net);
+		kfree(as);
+	}
+}
+
 /*
  * get an AFS superblock
  */
 static struct dentry *afs_mount(struct file_system_type *fs_type,
-		      int flags, const char *dev_name, void *options)
+				int flags, const char *dev_name, void *options)
 {
 	struct afs_mount_params params;
 	struct super_block *sb;
-	struct afs_volume *vol;
+	struct afs_volume *candidate;
 	struct key *key;
-	char *new_opts = kstrdup(options, GFP_KERNEL);
 	struct afs_super_info *as;
 	int ret;
 
 	_enter(",,%s,%p", dev_name, options);
 
 	memset(&params, 0, sizeof(params));
+	params.net = &__afs_net;
 
 	ret = -EINVAL;
 	if (current->nsproxy->net_ns != &init_net)
@@ -436,66 +464,75 @@ static struct dentry *afs_mount(struct file_system_type *fs_type,
 	}
 	params.key = key;
 
-	/* parse the device name */
-	vol = afs_volume_lookup(&params);
-	if (IS_ERR(vol)) {
-		ret = PTR_ERR(vol);
-		goto error;
-	}
-
 	/* allocate a superblock info record */
-	as = kzalloc(sizeof(struct afs_super_info), GFP_KERNEL);
-	if (!as) {
-		ret = -ENOMEM;
-		afs_put_volume(vol);
-		goto error;
+	ret = -ENOMEM;
+	as = afs_alloc_sbi(&params);
+	if (!as)
+		goto error_key;
+
+	/* Assume we're going to need a volume record; at the very least we can
+	 * use it to update the volume record if we have one already.  This
+	 * checks that the volume exists within the cell.
+	 */
+	candidate = afs_create_volume(&params);
+	if (IS_ERR(candidate)) {
+		ret = PTR_ERR(candidate);
+		goto error_as;
 	}
-	as->volume = vol;
+
+	as->volume = candidate;
 
 	/* allocate a deviceless superblock */
 	sb = sget(fs_type, afs_test_super, afs_set_super, flags, as);
 	if (IS_ERR(sb)) {
 		ret = PTR_ERR(sb);
-		afs_put_volume(vol);
-		kfree(as);
-		goto error;
+		goto error_as;
 	}
 
 	if (!sb->s_root) {
 		/* initial superblock/root creation */
 		_debug("create");
 		ret = afs_fill_super(sb, &params);
-		if (ret < 0) {
-			deactivate_locked_super(sb);
-			goto error;
-		}
+		if (ret < 0)
+			goto error_sb;
+		as = NULL;
 		sb->s_flags |= MS_ACTIVE;
 	} else {
 		_debug("reuse");
 		ASSERTCMP(sb->s_flags, &, MS_ACTIVE);
-		afs_put_volume(vol);
-		kfree(as);
+		afs_destroy_sbi(as);
+		as = NULL;
 	}
 
-	afs_put_cell(params.cell);
-	kfree(new_opts);
+	afs_put_cell(params.net, params.cell);
+	key_put(params.key);
 	_leave(" = 0 [%p]", sb);
 	return dget(sb->s_root);
 
-error:
-	afs_put_cell(params.cell);
+error_sb:
+	deactivate_locked_super(sb);
+	goto error_key;
+error_as:
+	afs_destroy_sbi(as);
+error_key:
 	key_put(params.key);
-	kfree(new_opts);
+error:
+	afs_put_cell(params.net, params.cell);
 	_leave(" = %d", ret);
 	return ERR_PTR(ret);
 }
 
 static void afs_kill_super(struct super_block *sb)
 {
-	struct afs_super_info *as = sb->s_fs_info;
+	struct afs_super_info *as = AFS_FS_S(sb);
+
+	/* Clear the callback interests (which will do ilookup5) before
+	 * deactivating the superblock.
+	 */
+	afs_clear_callback_interests(as->net, as->volume->servers);
 	kill_anon_super(sb);
-	afs_put_volume(as->volume);
-	kfree(as);
+	afs_deactivate_volume(as->volume);
+	afs_destroy_sbi(as);
 }
 
 /*
@@ -507,16 +544,15 @@ static void afs_i_init_once(void *_vnode)
 
 	memset(vnode, 0, sizeof(*vnode));
 	inode_init_once(&vnode->vfs_inode);
-	init_waitqueue_head(&vnode->update_waitq);
-	mutex_init(&vnode->permits_lock);
+	mutex_init(&vnode->io_lock);
 	mutex_init(&vnode->validate_lock);
-	spin_lock_init(&vnode->writeback_lock);
+	spin_lock_init(&vnode->wb_lock);
 	spin_lock_init(&vnode->lock);
-	INIT_LIST_HEAD(&vnode->writebacks);
+	INIT_LIST_HEAD(&vnode->wb_keys);
 	INIT_LIST_HEAD(&vnode->pending_locks);
 	INIT_LIST_HEAD(&vnode->granted_locks);
 	INIT_DELAYED_WORK(&vnode->lock_work, afs_lock_work);
-	INIT_WORK(&vnode->cb_broken_work, afs_broken_callback_work);
+	seqlock_init(&vnode->cb_lock);
 }
 
 /*
@@ -536,9 +572,7 @@ static struct inode *afs_alloc_inode(struct super_block *sb)
 	memset(&vnode->status, 0, sizeof(vnode->status));
 
 	vnode->volume		= NULL;
-	vnode->update_cnt	= 0;
 	vnode->flags		= 1 << AFS_VNODE_UNSET;
-	vnode->cb_promised	= false;
 
 	_leave(" = %p", &vnode->vfs_inode);
 	return &vnode->vfs_inode;
@@ -562,7 +596,7 @@ static void afs_destroy_inode(struct inode *inode)
 
 	_debug("DESTROY INODE %p", inode);
 
-	ASSERTCMP(vnode->server, ==, NULL);
+	ASSERTCMP(vnode->cb_interest, ==, NULL);
 
 	call_rcu(&inode->i_rcu, afs_i_callback);
 	atomic_dec(&afs_count_active_inodes);
@@ -573,6 +607,7 @@ static void afs_destroy_inode(struct inode *inode)
  */
 static int afs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
+	struct afs_fs_cursor fc;
 	struct afs_volume_status vs;
 	struct afs_vnode *vnode = AFS_FS_I(d_inode(dentry));
 	struct key *key;
@@ -582,21 +617,32 @@ static int afs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	if (IS_ERR(key))
 		return PTR_ERR(key);
 
-	ret = afs_vnode_get_volume_status(vnode, key, &vs);
-	key_put(key);
-	if (ret < 0) {
-		_leave(" = %d", ret);
-		return ret;
+	ret = -ERESTARTSYS;
+	if (afs_begin_vnode_operation(&fc, vnode, key)) {
+		fc.flags |= AFS_FS_CURSOR_NO_VSLEEP;
+		while (afs_select_fileserver(&fc)) {
+			fc.cb_break = vnode->cb_break + vnode->cb_s_break;
+			afs_fs_get_volume_status(&fc, &vs);
+		}
+
+		afs_check_for_remote_deletion(&fc, fc.vnode);
+		afs_vnode_commit_status(&fc, vnode, fc.cb_break);
+		ret = afs_end_vnode_operation(&fc);
 	}
 
-	buf->f_type	= dentry->d_sb->s_magic;
-	buf->f_bsize	= AFS_BLOCK_SIZE;
-	buf->f_namelen	= AFSNAMEMAX - 1;
+	key_put(key);
 
-	if (vs.max_quota == 0)
-		buf->f_blocks = vs.part_max_blocks;
-	else
-		buf->f_blocks = vs.max_quota;
-	buf->f_bavail = buf->f_bfree = buf->f_blocks - vs.blocks_in_use;
-	return 0;
+	if (ret == 0) {
+		buf->f_type	= dentry->d_sb->s_magic;
+		buf->f_bsize	= AFS_BLOCK_SIZE;
+		buf->f_namelen	= AFSNAMEMAX - 1;
+
+		if (vs.max_quota == 0)
+			buf->f_blocks = vs.part_max_blocks;
+		else
+			buf->f_blocks = vs.max_quota;
+		buf->f_bavail = buf->f_bfree = buf->f_blocks - vs.blocks_in_use;
+	}
+
+	return ret;
 }
diff --git a/fs/afs/vlclient.c b/fs/afs/vlclient.c
index a5e4cc561b6c..e372f89fd36a 100644
--- a/fs/afs/vlclient.c
+++ b/fs/afs/vlclient.c
@@ -12,58 +12,19 @@
 #include <linux/gfp.h>
 #include <linux/init.h>
 #include <linux/sched.h>
+#include "afs_fs.h"
 #include "internal.h"
 
 /*
- * map volume locator abort codes to error codes
+ * Deliver reply data to a VL.GetEntryByNameU call.
  */
-static int afs_vl_abort_to_error(u32 abort_code)
+static int afs_deliver_vl_get_entry_by_name_u(struct afs_call *call)
 {
-	_enter("%u", abort_code);
-
-	switch (abort_code) {
-	case AFSVL_IDEXIST:		return -EEXIST;
-	case AFSVL_IO:			return -EREMOTEIO;
-	case AFSVL_NAMEEXIST:		return -EEXIST;
-	case AFSVL_CREATEFAIL:		return -EREMOTEIO;
-	case AFSVL_NOENT:		return -ENOMEDIUM;
-	case AFSVL_EMPTY:		return -ENOMEDIUM;
-	case AFSVL_ENTDELETED:		return -ENOMEDIUM;
-	case AFSVL_BADNAME:		return -EINVAL;
-	case AFSVL_BADINDEX:		return -EINVAL;
-	case AFSVL_BADVOLTYPE:		return -EINVAL;
-	case AFSVL_BADSERVER:		return -EINVAL;
-	case AFSVL_BADPARTITION:	return -EINVAL;
-	case AFSVL_REPSFULL:		return -EFBIG;
-	case AFSVL_NOREPSERVER:		return -ENOENT;
-	case AFSVL_DUPREPSERVER:	return -EEXIST;
-	case AFSVL_RWNOTFOUND:		return -ENOENT;
-	case AFSVL_BADREFCOUNT:		return -EINVAL;
-	case AFSVL_SIZEEXCEEDED:	return -EINVAL;
-	case AFSVL_BADENTRY:		return -EINVAL;
-	case AFSVL_BADVOLIDBUMP:	return -EINVAL;
-	case AFSVL_IDALREADYHASHED:	return -EINVAL;
-	case AFSVL_ENTRYLOCKED:		return -EBUSY;
-	case AFSVL_BADVOLOPER:		return -EBADRQC;
-	case AFSVL_BADRELLOCKTYPE:	return -EINVAL;
-	case AFSVL_RERELEASE:		return -EREMOTEIO;
-	case AFSVL_BADSERVERFLAG:	return -EINVAL;
-	case AFSVL_PERM:		return -EACCES;
-	case AFSVL_NOMEM:		return -EREMOTEIO;
-	default:
-		return afs_abort_to_error(abort_code);
-	}
-}
-
-/*
- * deliver reply data to a VL.GetEntryByXXX call
- */
-static int afs_deliver_vl_get_entry_by_xxx(struct afs_call *call)
-{
-	struct afs_cache_vlocation *entry;
-	__be32 *bp;
+	struct afs_uvldbentry__xdr *uvldb;
+	struct afs_vldb_entry *entry;
+	bool new_only = false;
 	u32 tmp;
-	int loop, ret;
+	int i, ret;
 
 	_enter("");
 
@@ -72,144 +33,613 @@ static int afs_deliver_vl_get_entry_by_xxx(struct afs_call *call)
 		return ret;
 
 	/* unmarshall the reply once we've received all of it */
-	entry = call->reply;
-	bp = call->buffer;
-
-	for (loop = 0; loop < 64; loop++)
-		entry->name[loop] = ntohl(*bp++);
-	entry->name[loop] = 0;
-	bp++; /* final NUL */
+	uvldb = call->buffer;
+	entry = call->reply[0];
 
-	bp++; /* type */
-	entry->nservers = ntohl(*bp++);
+	for (i = 0; i < ARRAY_SIZE(uvldb->name) - 1; i++)
+		entry->name[i] = (u8)ntohl(uvldb->name[i]);
+	entry->name[i] = 0;
+	entry->name_len = strlen(entry->name);
 
-	for (loop = 0; loop < 8; loop++)
-		entry->servers[loop].s_addr = *bp++;
+	/* If there is a new replication site that we can use, ignore all the
+	 * sites that aren't marked as new.
+	 */
+	for (i = 0; i < AFS_NMAXNSERVERS; i++) {
+		tmp = ntohl(uvldb->serverFlags[i]);
+		if (!(tmp & AFS_VLSF_DONTUSE) &&
+		    (tmp & AFS_VLSF_NEWREPSITE))
+			new_only = true;
+	}
 
-	bp += 8; /* partition IDs */
+	for (i = 0; i < AFS_NMAXNSERVERS; i++) {
+		struct afs_uuid__xdr *xdr;
+		struct afs_uuid *uuid;
+		int j;
 
-	for (loop = 0; loop < 8; loop++) {
-		tmp = ntohl(*bp++);
-		entry->srvtmask[loop] = 0;
+		tmp = ntohl(uvldb->serverFlags[i]);
+		if (tmp & AFS_VLSF_DONTUSE ||
+		    (new_only && !(tmp & AFS_VLSF_NEWREPSITE)))
+			continue;
 		if (tmp & AFS_VLSF_RWVOL)
-			entry->srvtmask[loop] |= AFS_VOL_VTM_RW;
+			entry->fs_mask[i] |= AFS_VOL_VTM_RW;
 		if (tmp & AFS_VLSF_ROVOL)
-			entry->srvtmask[loop] |= AFS_VOL_VTM_RO;
+			entry->fs_mask[i] |= AFS_VOL_VTM_RO;
 		if (tmp & AFS_VLSF_BACKVOL)
-			entry->srvtmask[loop] |= AFS_VOL_VTM_BAK;
-	}
+			entry->fs_mask[i] |= AFS_VOL_VTM_BAK;
+		if (!entry->fs_mask[i])
+			continue;
 
-	entry->vid[0] = ntohl(*bp++);
-	entry->vid[1] = ntohl(*bp++);
-	entry->vid[2] = ntohl(*bp++);
+		xdr = &uvldb->serverNumber[i];
+		uuid = (struct afs_uuid *)&entry->fs_server[i];
+		uuid->time_low			= xdr->time_low;
+		uuid->time_mid			= htons(ntohl(xdr->time_mid));
+		uuid->time_hi_and_version	= htons(ntohl(xdr->time_hi_and_version));
+		uuid->clock_seq_hi_and_reserved	= (u8)ntohl(xdr->clock_seq_hi_and_reserved);
+		uuid->clock_seq_low		= (u8)ntohl(xdr->clock_seq_low);
+		for (j = 0; j < 6; j++)
+			uuid->node[j] = (u8)ntohl(xdr->node[j]);
 
-	bp++; /* clone ID */
+		entry->nr_servers++;
+	}
+
+	for (i = 0; i < AFS_MAXTYPES; i++)
+		entry->vid[i] = ntohl(uvldb->volumeId[i]);
 
-	tmp = ntohl(*bp++); /* flags */
-	entry->vidmask = 0;
+	tmp = ntohl(uvldb->flags);
 	if (tmp & AFS_VLF_RWEXISTS)
-		entry->vidmask |= AFS_VOL_VTM_RW;
+		__set_bit(AFS_VLDB_HAS_RW, &entry->flags);
 	if (tmp & AFS_VLF_ROEXISTS)
-		entry->vidmask |= AFS_VOL_VTM_RO;
+		__set_bit(AFS_VLDB_HAS_RO, &entry->flags);
 	if (tmp & AFS_VLF_BACKEXISTS)
-		entry->vidmask |= AFS_VOL_VTM_BAK;
-	if (!entry->vidmask)
-		return -EBADMSG;
+		__set_bit(AFS_VLDB_HAS_BAK, &entry->flags);
 
+	if (!(tmp & (AFS_VLF_RWEXISTS | AFS_VLF_ROEXISTS | AFS_VLF_BACKEXISTS))) {
+		entry->error = -ENOMEDIUM;
+		__set_bit(AFS_VLDB_QUERY_ERROR, &entry->flags);
+	}
+
+	__set_bit(AFS_VLDB_QUERY_VALID, &entry->flags);
 	_leave(" = 0 [done]");
 	return 0;
 }
 
-/*
- * VL.GetEntryByName operation type
- */
-static const struct afs_call_type afs_RXVLGetEntryByName = {
-	.name		= "VL.GetEntryByName",
-	.deliver	= afs_deliver_vl_get_entry_by_xxx,
-	.abort_to_error	= afs_vl_abort_to_error,
-	.destructor	= afs_flat_call_destructor,
-};
+static void afs_destroy_vl_get_entry_by_name_u(struct afs_call *call)
+{
+	kfree(call->reply[0]);
+	afs_flat_call_destructor(call);
+}
 
 /*
- * VL.GetEntryById operation type
+ * VL.GetEntryByNameU operation type.
  */
-static const struct afs_call_type afs_RXVLGetEntryById = {
-	.name		= "VL.GetEntryById",
-	.deliver	= afs_deliver_vl_get_entry_by_xxx,
-	.abort_to_error	= afs_vl_abort_to_error,
-	.destructor	= afs_flat_call_destructor,
+static const struct afs_call_type afs_RXVLGetEntryByNameU = {
+	.name		= "VL.GetEntryByNameU",
+	.op		= afs_VL_GetEntryByNameU,
+	.deliver	= afs_deliver_vl_get_entry_by_name_u,
+	.destructor	= afs_destroy_vl_get_entry_by_name_u,
 };
 
 /*
- * dispatch a get volume entry by name operation
+ * Dispatch a get volume entry by name or ID operation (uuid variant).  If the
+ * volname is a decimal number then it's a volume ID not a volume name.
  */
-int afs_vl_get_entry_by_name(struct in_addr *addr,
-			     struct key *key,
-			     const char *volname,
-			     struct afs_cache_vlocation *entry,
-			     bool async)
+struct afs_vldb_entry *afs_vl_get_entry_by_name_u(struct afs_net *net,
+						  struct afs_addr_cursor *ac,
+						  struct key *key,
+						  const char *volname,
+						  int volnamesz)
 {
+	struct afs_vldb_entry *entry;
 	struct afs_call *call;
-	size_t volnamesz, reqsz, padsz;
+	size_t reqsz, padsz;
 	__be32 *bp;
 
 	_enter("");
 
-	volnamesz = strlen(volname);
 	padsz = (4 - (volnamesz & 3)) & 3;
 	reqsz = 8 + volnamesz + padsz;
 
-	call = afs_alloc_flat_call(&afs_RXVLGetEntryByName, reqsz, 384);
-	if (!call)
-		return -ENOMEM;
+	entry = kzalloc(sizeof(struct afs_vldb_entry), GFP_KERNEL);
+	if (!entry)
+		return ERR_PTR(-ENOMEM);
+
+	call = afs_alloc_flat_call(net, &afs_RXVLGetEntryByNameU, reqsz,
+				   sizeof(struct afs_uvldbentry__xdr));
+	if (!call) {
+		kfree(entry);
+		return ERR_PTR(-ENOMEM);
+	}
 
 	call->key = key;
-	call->reply = entry;
-	call->service_id = VL_SERVICE;
-	call->port = htons(AFS_VL_PORT);
+	call->reply[0] = entry;
+	call->ret_reply0 = true;
 
-	/* marshall the parameters */
+	/* Marshall the parameters */
 	bp = call->request;
-	*bp++ = htonl(VLGETENTRYBYNAME);
+	*bp++ = htonl(VLGETENTRYBYNAMEU);
 	*bp++ = htonl(volnamesz);
 	memcpy(bp, volname, volnamesz);
 	if (padsz > 0)
-		memset((void *) bp + volnamesz, 0, padsz);
+		memset((void *)bp + volnamesz, 0, padsz);
 
-	/* initiate the call */
-	return afs_make_call(addr, call, GFP_KERNEL, async);
+	trace_afs_make_vl_call(call);
+	return (struct afs_vldb_entry *)afs_make_call(ac, call, GFP_KERNEL, false);
 }
 
 /*
- * dispatch a get volume entry by ID operation
+ * Deliver reply data to a VL.GetAddrsU call.
+ *
+ *	GetAddrsU(IN ListAddrByAttributes *inaddr,
+ *		  OUT afsUUID *uuidp1,
+ *		  OUT uint32_t *uniquifier,
+ *		  OUT uint32_t *nentries,
+ *		  OUT bulkaddrs *blkaddrs);
  */
-int afs_vl_get_entry_by_id(struct in_addr *addr,
-			   struct key *key,
-			   afs_volid_t volid,
-			   afs_voltype_t voltype,
-			   struct afs_cache_vlocation *entry,
-			   bool async)
+static int afs_deliver_vl_get_addrs_u(struct afs_call *call)
 {
+	struct afs_addr_list *alist;
+	__be32 *bp;
+	u32 uniquifier, nentries, count;
+	int i, ret;
+
+	_enter("{%u,%zu/%u}", call->unmarshall, call->offset, call->count);
+
+again:
+	switch (call->unmarshall) {
+	case 0:
+		call->offset = 0;
+		call->unmarshall++;
+
+		/* Extract the returned uuid, uniquifier, nentries and blkaddrs size */
+	case 1:
+		ret = afs_extract_data(call, call->buffer,
+				       sizeof(struct afs_uuid__xdr) + 3 * sizeof(__be32),
+				       true);
+		if (ret < 0)
+			return ret;
+
+		bp = call->buffer + sizeof(struct afs_uuid__xdr);
+		uniquifier	= ntohl(*bp++);
+		nentries	= ntohl(*bp++);
+		count		= ntohl(*bp);
+
+		nentries = min(nentries, count);
+		alist = afs_alloc_addrlist(nentries, FS_SERVICE, AFS_FS_PORT);
+		if (!alist)
+			return -ENOMEM;
+		alist->version = uniquifier;
+		call->reply[0] = alist;
+		call->count = count;
+		call->count2 = nentries;
+		call->offset = 0;
+		call->unmarshall++;
+
+		/* Extract entries */
+	case 2:
+		count = min(call->count, 4U);
+		ret = afs_extract_data(call, call->buffer,
+				       count * sizeof(__be32),
+				       call->count > 4);
+		if (ret < 0)
+			return ret;
+
+		alist = call->reply[0];
+		bp = call->buffer;
+		for (i = 0; i < count; i++)
+			if (alist->nr_addrs < call->count2)
+				afs_merge_fs_addr4(alist, *bp++, AFS_FS_PORT);
+
+		call->count -= count;
+		if (call->count > 0)
+			goto again;
+		call->offset = 0;
+		call->unmarshall++;
+		break;
+	}
+
+	_leave(" = 0 [done]");
+	return 0;
+}
+
+static void afs_vl_get_addrs_u_destructor(struct afs_call *call)
+{
+	afs_put_server(call->net, (struct afs_server *)call->reply[0]);
+	kfree(call->reply[1]);
+	return afs_flat_call_destructor(call);
+}
+
+/*
+ * VL.GetAddrsU operation type.
+ */
+static const struct afs_call_type afs_RXVLGetAddrsU = {
+	.name		= "VL.GetAddrsU",
+	.op		= afs_VL_GetAddrsU,
+	.deliver	= afs_deliver_vl_get_addrs_u,
+	.destructor	= afs_vl_get_addrs_u_destructor,
+};
+
+/*
+ * Dispatch an operation to get the addresses for a server, where the server is
+ * nominated by UUID.
+ */
+struct afs_addr_list *afs_vl_get_addrs_u(struct afs_net *net,
+					 struct afs_addr_cursor *ac,
+					 struct key *key,
+					 const uuid_t *uuid)
+{
+	struct afs_ListAddrByAttributes__xdr *r;
+	const struct afs_uuid *u = (const struct afs_uuid *)uuid;
 	struct afs_call *call;
 	__be32 *bp;
+	int i;
 
 	_enter("");
 
-	call = afs_alloc_flat_call(&afs_RXVLGetEntryById, 12, 384);
+	call = afs_alloc_flat_call(net, &afs_RXVLGetAddrsU,
+				   sizeof(__be32) + sizeof(struct afs_ListAddrByAttributes__xdr),
+				   sizeof(struct afs_uuid__xdr) + 3 * sizeof(__be32));
+	if (!call)
+		return ERR_PTR(-ENOMEM);
+
+	call->key = key;
+	call->reply[0] = NULL;
+	call->ret_reply0 = true;
+
+	/* Marshall the parameters */
+	bp = call->request;
+	*bp++ = htonl(VLGETADDRSU);
+	r = (struct afs_ListAddrByAttributes__xdr *)bp;
+	r->Mask		= htonl(AFS_VLADDR_UUID);
+	r->ipaddr	= 0;
+	r->index	= 0;
+	r->spare	= 0;
+	r->uuid.time_low			= u->time_low;
+	r->uuid.time_mid			= htonl(ntohs(u->time_mid));
+	r->uuid.time_hi_and_version		= htonl(ntohs(u->time_hi_and_version));
+	r->uuid.clock_seq_hi_and_reserved 	= htonl(u->clock_seq_hi_and_reserved);
+	r->uuid.clock_seq_low			= htonl(u->clock_seq_low);
+	for (i = 0; i < 6; i++)
+		r->uuid.node[i] = ntohl(u->node[i]);
+
+	trace_afs_make_vl_call(call);
+	return (struct afs_addr_list *)afs_make_call(ac, call, GFP_KERNEL, false);
+}
+
+/*
+ * Deliver reply data to an VL.GetCapabilities operation.
+ */
+static int afs_deliver_vl_get_capabilities(struct afs_call *call)
+{
+	u32 count;
+	int ret;
+
+	_enter("{%u,%zu/%u}", call->unmarshall, call->offset, call->count);
+
+again:
+	switch (call->unmarshall) {
+	case 0:
+		call->offset = 0;
+		call->unmarshall++;
+
+		/* Extract the capabilities word count */
+	case 1:
+		ret = afs_extract_data(call, &call->tmp,
+				       1 * sizeof(__be32),
+				       true);
+		if (ret < 0)
+			return ret;
+
+		count = ntohl(call->tmp);
+
+		call->count = count;
+		call->count2 = count;
+		call->offset = 0;
+		call->unmarshall++;
+
+		/* Extract capabilities words */
+	case 2:
+		count = min(call->count, 16U);
+		ret = afs_extract_data(call, call->buffer,
+				       count * sizeof(__be32),
+				       call->count > 16);
+		if (ret < 0)
+			return ret;
+
+		/* TODO: Examine capabilities */
+
+		call->count -= count;
+		if (call->count > 0)
+			goto again;
+		call->offset = 0;
+		call->unmarshall++;
+		break;
+	}
+
+	call->reply[0] = (void *)(unsigned long)call->service_id;
+
+	_leave(" = 0 [done]");
+	return 0;
+}
+
+/*
+ * VL.GetCapabilities operation type
+ */
+static const struct afs_call_type afs_RXVLGetCapabilities = {
+	.name		= "VL.GetCapabilities",
+	.op		= afs_VL_GetCapabilities,
+	.deliver	= afs_deliver_vl_get_capabilities,
+	.destructor	= afs_flat_call_destructor,
+};
+
+/*
+ * Probe a fileserver for the capabilities that it supports.  This can
+ * return up to 196 words.
+ *
+ * We use this to probe for service upgrade to determine what the server at the
+ * other end supports.
+ */
+int afs_vl_get_capabilities(struct afs_net *net,
+			    struct afs_addr_cursor *ac,
+			    struct key *key)
+{
+	struct afs_call *call;
+	__be32 *bp;
+
+	_enter("");
+
+	call = afs_alloc_flat_call(net, &afs_RXVLGetCapabilities, 1 * 4, 16 * 4);
 	if (!call)
 		return -ENOMEM;
 
 	call->key = key;
-	call->reply = entry;
-	call->service_id = VL_SERVICE;
-	call->port = htons(AFS_VL_PORT);
+	call->upgrade = true; /* Let's see if this is a YFS server */
+	call->reply[0] = (void *)VLGETCAPABILITIES;
+	call->ret_reply0 = true;
 
 	/* marshall the parameters */
 	bp = call->request;
-	*bp++ = htonl(VLGETENTRYBYID);
-	*bp++ = htonl(volid);
-	*bp   = htonl(voltype);
+	*bp++ = htonl(VLGETCAPABILITIES);
+
+	/* Can't take a ref on server */
+	trace_afs_make_vl_call(call);
+	return afs_make_call(ac, call, GFP_KERNEL, false);
+}
+
+/*
+ * Deliver reply data to a YFSVL.GetEndpoints call.
+ *
+ *	GetEndpoints(IN yfsServerAttributes *attr,
+ *		     OUT opr_uuid *uuid,
+ *		     OUT afs_int32 *uniquifier,
+ *		     OUT endpoints *fsEndpoints,
+ *		     OUT endpoints *volEndpoints)
+ */
+static int afs_deliver_yfsvl_get_endpoints(struct afs_call *call)
+{
+	struct afs_addr_list *alist;
+	__be32 *bp;
+	u32 uniquifier, size;
+	int ret;
+
+	_enter("{%u,%zu/%u,%u}", call->unmarshall, call->offset, call->count, call->count2);
+
+again:
+	switch (call->unmarshall) {
+	case 0:
+		call->offset = 0;
+		call->unmarshall = 1;
+
+		/* Extract the returned uuid, uniquifier, fsEndpoints count and
+		 * either the first fsEndpoint type or the volEndpoints
+		 * count if there are no fsEndpoints. */
+	case 1:
+		ret = afs_extract_data(call, call->buffer,
+				       sizeof(uuid_t) +
+				       3 * sizeof(__be32),
+				       true);
+		if (ret < 0)
+			return ret;
+
+		bp = call->buffer + sizeof(uuid_t);
+		uniquifier	= ntohl(*bp++);
+		call->count	= ntohl(*bp++);
+		call->count2	= ntohl(*bp); /* Type or next count */
+
+		if (call->count > YFS_MAXENDPOINTS)
+			return -EBADMSG;
+
+		alist = afs_alloc_addrlist(call->count, FS_SERVICE, AFS_FS_PORT);
+		if (!alist)
+			return -ENOMEM;
+		alist->version = uniquifier;
+		call->reply[0] = alist;
+		call->offset = 0;
+
+		if (call->count == 0)
+			goto extract_volendpoints;
+
+		call->unmarshall = 2;
+
+		/* Extract fsEndpoints[] entries */
+	case 2:
+		switch (call->count2) {
+		case YFS_ENDPOINT_IPV4:
+			size = sizeof(__be32) * (1 + 1 + 1);
+			break;
+		case YFS_ENDPOINT_IPV6:
+			size = sizeof(__be32) * (1 + 4 + 1);
+			break;
+		default:
+			return -EBADMSG;
+		}
+
+		size += sizeof(__be32);
+		ret = afs_extract_data(call, call->buffer, size, true);
+		if (ret < 0)
+			return ret;
+
+		alist = call->reply[0];
+		bp = call->buffer;
+		switch (call->count2) {
+		case YFS_ENDPOINT_IPV4:
+			if (ntohl(bp[0]) != sizeof(__be32) * 2)
+				return -EBADMSG;
+			afs_merge_fs_addr4(alist, bp[1], ntohl(bp[2]));
+			bp += 3;
+			break;
+		case YFS_ENDPOINT_IPV6:
+			if (ntohl(bp[0]) != sizeof(__be32) * 5)
+				return -EBADMSG;
+			afs_merge_fs_addr6(alist, bp + 1, ntohl(bp[5]));
+			bp += 6;
+			break;
+		default:
+			return -EBADMSG;
+		}
+
+		/* Got either the type of the next entry or the count of
+		 * volEndpoints if no more fsEndpoints.
+		 */
+		call->count2 = htonl(*bp++);
+
+		call->offset = 0;
+		call->count--;
+		if (call->count > 0)
+			goto again;
+
+	extract_volendpoints:
+		/* Extract the list of volEndpoints. */
+		call->count = call->count2;
+		if (!call->count)
+			goto end;
+		if (call->count > YFS_MAXENDPOINTS)
+			return -EBADMSG;
+
+		call->unmarshall = 3;
+
+		/* Extract the type of volEndpoints[0].  Normally we would
+		 * extract the type of the next endpoint when we extract the
+		 * data of the current one, but this is the first...
+		 */
+	case 3:
+		ret = afs_extract_data(call, call->buffer, sizeof(__be32), true);
+		if (ret < 0)
+			return ret;
+
+		bp = call->buffer;
+		call->count2 = htonl(*bp++);
+		call->offset = 0;
+		call->unmarshall = 4;
+
+		/* Extract volEndpoints[] entries */
+	case 4:
+		switch (call->count2) {
+		case YFS_ENDPOINT_IPV4:
+			size = sizeof(__be32) * (1 + 1 + 1);
+			break;
+		case YFS_ENDPOINT_IPV6:
+			size = sizeof(__be32) * (1 + 4 + 1);
+			break;
+		default:
+			return -EBADMSG;
+		}
+
+		if (call->count > 1)
+			size += sizeof(__be32);
+		ret = afs_extract_data(call, call->buffer, size, true);
+		if (ret < 0)
+			return ret;
+
+		bp = call->buffer;
+		switch (call->count2) {
+		case YFS_ENDPOINT_IPV4:
+			if (ntohl(bp[0]) != sizeof(__be32) * 2)
+				return -EBADMSG;
+			bp += 3;
+			break;
+		case YFS_ENDPOINT_IPV6:
+			if (ntohl(bp[0]) != sizeof(__be32) * 5)
+				return -EBADMSG;
+			bp += 6;
+			break;
+		default:
+			return -EBADMSG;
+		}
+
+		/* Got either the type of the next entry or the count of
+		 * volEndpoints if no more fsEndpoints.
+		 */
+		call->offset = 0;
+		call->count--;
+		if (call->count > 0) {
+			call->count2 = htonl(*bp++);
+			goto again;
+		}
+
+	end:
+		call->unmarshall = 5;
+
+		/* Done */
+	case 5:
+		ret = afs_extract_data(call, call->buffer, 0, false);
+		if (ret < 0)
+			return ret;
+		call->unmarshall = 6;
+
+	case 6:
+		break;
+	}
+
+	alist = call->reply[0];
+
+	/* Start with IPv6 if available. */
+	if (alist->nr_ipv4 < alist->nr_addrs)
+		alist->index = alist->nr_ipv4;
+
+	_leave(" = 0 [done]");
+	return 0;
+}
+
+/*
+ * YFSVL.GetEndpoints operation type.
+ */
+static const struct afs_call_type afs_YFSVLGetEndpoints = {
+	.name		= "YFSVL.GetEndpoints",
+	.op		= afs_YFSVL_GetEndpoints,
+	.deliver	= afs_deliver_yfsvl_get_endpoints,
+	.destructor	= afs_vl_get_addrs_u_destructor,
+};
+
+/*
+ * Dispatch an operation to get the addresses for a server, where the server is
+ * nominated by UUID.
+ */
+struct afs_addr_list *afs_yfsvl_get_endpoints(struct afs_net *net,
+					      struct afs_addr_cursor *ac,
+					      struct key *key,
+					      const uuid_t *uuid)
+{
+	struct afs_call *call;
+	__be32 *bp;
+
+	_enter("");
+
+	call = afs_alloc_flat_call(net, &afs_YFSVLGetEndpoints,
+				   sizeof(__be32) * 2 + sizeof(*uuid),
+				   sizeof(struct in6_addr) + sizeof(__be32) * 3);
+	if (!call)
+		return ERR_PTR(-ENOMEM);
+
+	call->key = key;
+	call->reply[0] = NULL;
+	call->ret_reply0 = true;
+
+	/* Marshall the parameters */
+	bp = call->request;
+	*bp++ = htonl(YVLGETENDPOINTS);
+	*bp++ = htonl(YFS_SERVER_UUID);
+	memcpy(bp, uuid, sizeof(*uuid)); /* Type opr_uuid */
 
-	/* initiate the call */
-	return afs_make_call(addr, call, GFP_KERNEL, async);
+	trace_afs_make_vl_call(call);
+	return (struct afs_addr_list *)afs_make_call(ac, call, GFP_KERNEL, false);
 }
diff --git a/fs/afs/vlocation.c b/fs/afs/vlocation.c
deleted file mode 100644
index 37b7c3b342a6..000000000000
--- a/fs/afs/vlocation.c
+++ /dev/null
@@ -1,720 +0,0 @@
-/* AFS volume location management
- *
- * Copyright (C) 2002, 2007 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/slab.h>
-#include <linux/init.h>
-#include <linux/sched.h>
-#include "internal.h"
-
-static unsigned afs_vlocation_timeout = 10;	/* volume location timeout in seconds */
-static unsigned afs_vlocation_update_timeout = 10 * 60;
-
-static void afs_vlocation_reaper(struct work_struct *);
-static void afs_vlocation_updater(struct work_struct *);
-
-static LIST_HEAD(afs_vlocation_updates);
-static LIST_HEAD(afs_vlocation_graveyard);
-static DEFINE_SPINLOCK(afs_vlocation_updates_lock);
-static DEFINE_SPINLOCK(afs_vlocation_graveyard_lock);
-static DECLARE_DELAYED_WORK(afs_vlocation_reap, afs_vlocation_reaper);
-static DECLARE_DELAYED_WORK(afs_vlocation_update, afs_vlocation_updater);
-static struct workqueue_struct *afs_vlocation_update_worker;
-
-/*
- * iterate through the VL servers in a cell until one of them admits knowing
- * about the volume in question
- */
-static int afs_vlocation_access_vl_by_name(struct afs_vlocation *vl,
-					   struct key *key,
-					   struct afs_cache_vlocation *vldb)
-{
-	struct afs_cell *cell = vl->cell;
-	struct in_addr addr;
-	int count, ret;
-
-	_enter("%s,%s", cell->name, vl->vldb.name);
-
-	down_write(&vl->cell->vl_sem);
-	ret = -ENOMEDIUM;
-	for (count = cell->vl_naddrs; count > 0; count--) {
-		addr = cell->vl_addrs[cell->vl_curr_svix];
-
-		_debug("CellServ[%hu]: %08x", cell->vl_curr_svix, addr.s_addr);
-
-		/* attempt to access the VL server */
-		ret = afs_vl_get_entry_by_name(&addr, key, vl->vldb.name, vldb,
-					       false);
-		switch (ret) {
-		case 0:
-			goto out;
-		case -ENOMEM:
-		case -ENONET:
-		case -ENETUNREACH:
-		case -EHOSTUNREACH:
-		case -ECONNREFUSED:
-			if (ret == -ENOMEM || ret == -ENONET)
-				goto out;
-			goto rotate;
-		case -ENOMEDIUM:
-		case -EKEYREJECTED:
-		case -EKEYEXPIRED:
-			goto out;
-		default:
-			ret = -EIO;
-			goto rotate;
-		}
-
-		/* rotate the server records upon lookup failure */
-	rotate:
-		cell->vl_curr_svix++;
-		cell->vl_curr_svix %= cell->vl_naddrs;
-	}
-
-out:
-	up_write(&vl->cell->vl_sem);
-	_leave(" = %d", ret);
-	return ret;
-}
-
-/*
- * iterate through the VL servers in a cell until one of them admits knowing
- * about the volume in question
- */
-static int afs_vlocation_access_vl_by_id(struct afs_vlocation *vl,
-					 struct key *key,
-					 afs_volid_t volid,
-					 afs_voltype_t voltype,
-					 struct afs_cache_vlocation *vldb)
-{
-	struct afs_cell *cell = vl->cell;
-	struct in_addr addr;
-	int count, ret;
-
-	_enter("%s,%x,%d,", cell->name, volid, voltype);
-
-	down_write(&vl->cell->vl_sem);
-	ret = -ENOMEDIUM;
-	for (count = cell->vl_naddrs; count > 0; count--) {
-		addr = cell->vl_addrs[cell->vl_curr_svix];
-
-		_debug("CellServ[%hu]: %08x", cell->vl_curr_svix, addr.s_addr);
-
-		/* attempt to access the VL server */
-		ret = afs_vl_get_entry_by_id(&addr, key, volid, voltype, vldb,
-					     false);
-		switch (ret) {
-		case 0:
-			goto out;
-		case -ENOMEM:
-		case -ENONET:
-		case -ENETUNREACH:
-		case -EHOSTUNREACH:
-		case -ECONNREFUSED:
-			if (ret == -ENOMEM || ret == -ENONET)
-				goto out;
-			goto rotate;
-		case -EBUSY:
-			vl->upd_busy_cnt++;
-			if (vl->upd_busy_cnt <= 3) {
-				if (vl->upd_busy_cnt > 1) {
-					/* second+ BUSY - sleep a little bit */
-					set_current_state(TASK_UNINTERRUPTIBLE);
-					schedule_timeout(1);
-				}
-				continue;
-			}
-			break;
-		case -ENOMEDIUM:
-			vl->upd_rej_cnt++;
-			goto rotate;
-		default:
-			ret = -EIO;
-			goto rotate;
-		}
-
-		/* rotate the server records upon lookup failure */
-	rotate:
-		cell->vl_curr_svix++;
-		cell->vl_curr_svix %= cell->vl_naddrs;
-		vl->upd_busy_cnt = 0;
-	}
-
-out:
-	if (ret < 0 && vl->upd_rej_cnt > 0) {
-		printk(KERN_NOTICE "kAFS:"
-		       " Active volume no longer valid '%s'\n",
-		       vl->vldb.name);
-		vl->valid = 0;
-		ret = -ENOMEDIUM;
-	}
-
-	up_write(&vl->cell->vl_sem);
-	_leave(" = %d", ret);
-	return ret;
-}
-
-/*
- * allocate a volume location record
- */
-static struct afs_vlocation *afs_vlocation_alloc(struct afs_cell *cell,
-						 const char *name,
-						 size_t namesz)
-{
-	struct afs_vlocation *vl;
-
-	vl = kzalloc(sizeof(struct afs_vlocation), GFP_KERNEL);
-	if (vl) {
-		vl->cell = cell;
-		vl->state = AFS_VL_NEW;
-		atomic_set(&vl->usage, 1);
-		INIT_LIST_HEAD(&vl->link);
-		INIT_LIST_HEAD(&vl->grave);
-		INIT_LIST_HEAD(&vl->update);
-		init_waitqueue_head(&vl->waitq);
-		spin_lock_init(&vl->lock);
-		memcpy(vl->vldb.name, name, namesz);
-	}
-
-	_leave(" = %p", vl);
-	return vl;
-}
-
-/*
- * update record if we found it in the cache
- */
-static int afs_vlocation_update_record(struct afs_vlocation *vl,
-				       struct key *key,
-				       struct afs_cache_vlocation *vldb)
-{
-	afs_voltype_t voltype;
-	afs_volid_t vid;
-	int ret;
-
-	/* try to look up a cached volume in the cell VL databases by ID */
-	_debug("Locally Cached: %s %02x { %08x(%x) %08x(%x) %08x(%x) }",
-	       vl->vldb.name,
-	       vl->vldb.vidmask,
-	       ntohl(vl->vldb.servers[0].s_addr),
-	       vl->vldb.srvtmask[0],
-	       ntohl(vl->vldb.servers[1].s_addr),
-	       vl->vldb.srvtmask[1],
-	       ntohl(vl->vldb.servers[2].s_addr),
-	       vl->vldb.srvtmask[2]);
-
-	_debug("Vids: %08x %08x %08x",
-	       vl->vldb.vid[0],
-	       vl->vldb.vid[1],
-	       vl->vldb.vid[2]);
-
-	if (vl->vldb.vidmask & AFS_VOL_VTM_RW) {
-		vid = vl->vldb.vid[0];
-		voltype = AFSVL_RWVOL;
-	} else if (vl->vldb.vidmask & AFS_VOL_VTM_RO) {
-		vid = vl->vldb.vid[1];
-		voltype = AFSVL_ROVOL;
-	} else if (vl->vldb.vidmask & AFS_VOL_VTM_BAK) {
-		vid = vl->vldb.vid[2];
-		voltype = AFSVL_BACKVOL;
-	} else {
-		BUG();
-		vid = 0;
-		voltype = 0;
-	}
-
-	/* contact the server to make sure the volume is still available
-	 * - TODO: need to handle disconnected operation here
-	 */
-	ret = afs_vlocation_access_vl_by_id(vl, key, vid, voltype, vldb);
-	switch (ret) {
-		/* net error */
-	default:
-		printk(KERN_WARNING "kAFS:"
-		       " failed to update volume '%s' (%x) up in '%s': %d\n",
-		       vl->vldb.name, vid, vl->cell->name, ret);
-		_leave(" = %d", ret);
-		return ret;
-
-		/* pulled from local cache into memory */
-	case 0:
-		_leave(" = 0");
-		return 0;
-
-		/* uh oh... looks like the volume got deleted */
-	case -ENOMEDIUM:
-		printk(KERN_ERR "kAFS:"
-		       " volume '%s' (%x) does not exist '%s'\n",
-		       vl->vldb.name, vid, vl->cell->name);
-
-		/* TODO: make existing record unavailable */
-		_leave(" = %d", ret);
-		return ret;
-	}
-}
-
-/*
- * apply the update to a VL record
- */
-static void afs_vlocation_apply_update(struct afs_vlocation *vl,
-				       struct afs_cache_vlocation *vldb)
-{
-	_debug("Done VL Lookup: %s %02x { %08x(%x) %08x(%x) %08x(%x) }",
-	       vldb->name, vldb->vidmask,
-	       ntohl(vldb->servers[0].s_addr), vldb->srvtmask[0],
-	       ntohl(vldb->servers[1].s_addr), vldb->srvtmask[1],
-	       ntohl(vldb->servers[2].s_addr), vldb->srvtmask[2]);
-
-	_debug("Vids: %08x %08x %08x",
-	       vldb->vid[0], vldb->vid[1], vldb->vid[2]);
-
-	if (strcmp(vldb->name, vl->vldb.name) != 0)
-		printk(KERN_NOTICE "kAFS:"
-		       " name of volume '%s' changed to '%s' on server\n",
-		       vl->vldb.name, vldb->name);
-
-	vl->vldb = *vldb;
-
-#ifdef CONFIG_AFS_FSCACHE
-	fscache_update_cookie(vl->cache);
-#endif
-}
-
-/*
- * fill in a volume location record, consulting the cache and the VL server
- * both
- */
-static int afs_vlocation_fill_in_record(struct afs_vlocation *vl,
-					struct key *key)
-{
-	struct afs_cache_vlocation vldb;
-	int ret;
-
-	_enter("");
-
-	ASSERTCMP(vl->valid, ==, 0);
-
-	memset(&vldb, 0, sizeof(vldb));
-
-	/* see if we have an in-cache copy (will set vl->valid if there is) */
-#ifdef CONFIG_AFS_FSCACHE
-	vl->cache = fscache_acquire_cookie(vl->cell->cache,
-					   &afs_vlocation_cache_index_def, vl,
-					   true);
-#endif
-
-	if (vl->valid) {
-		/* try to update a known volume in the cell VL databases by
-		 * ID as the name may have changed */
-		_debug("found in cache");
-		ret = afs_vlocation_update_record(vl, key, &vldb);
-	} else {
-		/* try to look up an unknown volume in the cell VL databases by
-		 * name */
-		ret = afs_vlocation_access_vl_by_name(vl, key, &vldb);
-		if (ret < 0) {
-			printk("kAFS: failed to locate '%s' in cell '%s'\n",
-			       vl->vldb.name, vl->cell->name);
-			return ret;
-		}
-	}
-
-	afs_vlocation_apply_update(vl, &vldb);
-	_leave(" = 0");
-	return 0;
-}
-
-/*
- * queue a vlocation record for updates
- */
-static void afs_vlocation_queue_for_updates(struct afs_vlocation *vl)
-{
-	struct afs_vlocation *xvl;
-
-	/* wait at least 10 minutes before updating... */
-	vl->update_at = ktime_get_real_seconds() +
-			afs_vlocation_update_timeout;
-
-	spin_lock(&afs_vlocation_updates_lock);
-
-	if (!list_empty(&afs_vlocation_updates)) {
-		/* ... but wait at least 1 second more than the newest record
-		 * already queued so that we don't spam the VL server suddenly
-		 * with lots of requests
-		 */
-		xvl = list_entry(afs_vlocation_updates.prev,
-				 struct afs_vlocation, update);
-		if (vl->update_at <= xvl->update_at)
-			vl->update_at = xvl->update_at + 1;
-	} else {
-		queue_delayed_work(afs_vlocation_update_worker,
-				   &afs_vlocation_update,
-				   afs_vlocation_update_timeout * HZ);
-	}
-
-	list_add_tail(&vl->update, &afs_vlocation_updates);
-	spin_unlock(&afs_vlocation_updates_lock);
-}
-
-/*
- * lookup volume location
- * - iterate through the VL servers in a cell until one of them admits knowing
- *   about the volume in question
- * - lookup in the local cache if not able to find on the VL server
- * - insert/update in the local cache if did get a VL response
- */
-struct afs_vlocation *afs_vlocation_lookup(struct afs_cell *cell,
-					   struct key *key,
-					   const char *name,
-					   size_t namesz)
-{
-	struct afs_vlocation *vl;
-	int ret;
-
-	_enter("{%s},{%x},%*.*s,%zu",
-	       cell->name, key_serial(key),
-	       (int) namesz, (int) namesz, name, namesz);
-
-	if (namesz >= sizeof(vl->vldb.name)) {
-		_leave(" = -ENAMETOOLONG");
-		return ERR_PTR(-ENAMETOOLONG);
-	}
-
-	/* see if we have an in-memory copy first */
-	down_write(&cell->vl_sem);
-	spin_lock(&cell->vl_lock);
-	list_for_each_entry(vl, &cell->vl_list, link) {
-		if (vl->vldb.name[namesz] != '\0')
-			continue;
-		if (memcmp(vl->vldb.name, name, namesz) == 0)
-			goto found_in_memory;
-	}
-	spin_unlock(&cell->vl_lock);
-
-	/* not in the cell's in-memory lists - create a new record */
-	vl = afs_vlocation_alloc(cell, name, namesz);
-	if (!vl) {
-		up_write(&cell->vl_sem);
-		return ERR_PTR(-ENOMEM);
-	}
-
-	afs_get_cell(cell);
-
-	list_add_tail(&vl->link, &cell->vl_list);
-	vl->state = AFS_VL_CREATING;
-	up_write(&cell->vl_sem);
-
-fill_in_record:
-	ret = afs_vlocation_fill_in_record(vl, key);
-	if (ret < 0)
-		goto error_abandon;
-	spin_lock(&vl->lock);
-	vl->state = AFS_VL_VALID;
-	spin_unlock(&vl->lock);
-	wake_up(&vl->waitq);
-
-	/* update volume entry in local cache */
-#ifdef CONFIG_AFS_FSCACHE
-	fscache_update_cookie(vl->cache);
-#endif
-
-	/* schedule for regular updates */
-	afs_vlocation_queue_for_updates(vl);
-	goto success;
-
-found_in_memory:
-	/* found in memory */
-	_debug("found in memory");
-	atomic_inc(&vl->usage);
-	spin_unlock(&cell->vl_lock);
-	if (!list_empty(&vl->grave)) {
-		spin_lock(&afs_vlocation_graveyard_lock);
-		list_del_init(&vl->grave);
-		spin_unlock(&afs_vlocation_graveyard_lock);
-	}
-	up_write(&cell->vl_sem);
-
-	/* see if it was an abandoned record that we might try filling in */
-	spin_lock(&vl->lock);
-	while (vl->state != AFS_VL_VALID) {
-		afs_vlocation_state_t state = vl->state;
-
-		_debug("invalid [state %d]", state);
-
-		if (state == AFS_VL_NEW || state == AFS_VL_NO_VOLUME) {
-			vl->state = AFS_VL_CREATING;
-			spin_unlock(&vl->lock);
-			goto fill_in_record;
-		}
-
-		/* must now wait for creation or update by someone else to
-		 * complete */
-		_debug("wait");
-
-		spin_unlock(&vl->lock);
-		ret = wait_event_interruptible(vl->waitq,
-					       vl->state == AFS_VL_NEW ||
-					       vl->state == AFS_VL_VALID ||
-					       vl->state == AFS_VL_NO_VOLUME);
-		if (ret < 0)
-			goto error;
-		spin_lock(&vl->lock);
-	}
-	spin_unlock(&vl->lock);
-
-success:
-	_leave(" = %p", vl);
-	return vl;
-
-error_abandon:
-	spin_lock(&vl->lock);
-	vl->state = AFS_VL_NEW;
-	spin_unlock(&vl->lock);
-	wake_up(&vl->waitq);
-error:
-	ASSERT(vl != NULL);
-	afs_put_vlocation(vl);
-	_leave(" = %d", ret);
-	return ERR_PTR(ret);
-}
-
-/*
- * finish using a volume location record
- */
-void afs_put_vlocation(struct afs_vlocation *vl)
-{
-	if (!vl)
-		return;
-
-	_enter("%s", vl->vldb.name);
-
-	ASSERTCMP(atomic_read(&vl->usage), >, 0);
-
-	if (likely(!atomic_dec_and_test(&vl->usage))) {
-		_leave("");
-		return;
-	}
-
-	spin_lock(&afs_vlocation_graveyard_lock);
-	if (atomic_read(&vl->usage) == 0) {
-		_debug("buried");
-		list_move_tail(&vl->grave, &afs_vlocation_graveyard);
-		vl->time_of_death = ktime_get_real_seconds();
-		queue_delayed_work(afs_wq, &afs_vlocation_reap,
-				   afs_vlocation_timeout * HZ);
-
-		/* suspend updates on this record */
-		if (!list_empty(&vl->update)) {
-			spin_lock(&afs_vlocation_updates_lock);
-			list_del_init(&vl->update);
-			spin_unlock(&afs_vlocation_updates_lock);
-		}
-	}
-	spin_unlock(&afs_vlocation_graveyard_lock);
-	_leave(" [killed?]");
-}
-
-/*
- * destroy a dead volume location record
- */
-static void afs_vlocation_destroy(struct afs_vlocation *vl)
-{
-	_enter("%p", vl);
-
-#ifdef CONFIG_AFS_FSCACHE
-	fscache_relinquish_cookie(vl->cache, 0);
-#endif
-	afs_put_cell(vl->cell);
-	kfree(vl);
-}
-
-/*
- * reap dead volume location records
- */
-static void afs_vlocation_reaper(struct work_struct *work)
-{
-	LIST_HEAD(corpses);
-	struct afs_vlocation *vl;
-	unsigned long delay, expiry;
-	time64_t now;
-
-	_enter("");
-
-	now = ktime_get_real_seconds();
-	spin_lock(&afs_vlocation_graveyard_lock);
-
-	while (!list_empty(&afs_vlocation_graveyard)) {
-		vl = list_entry(afs_vlocation_graveyard.next,
-				struct afs_vlocation, grave);
-
-		_debug("check %p", vl);
-
-		/* the queue is ordered most dead first */
-		expiry = vl->time_of_death + afs_vlocation_timeout;
-		if (expiry > now) {
-			delay = (expiry - now) * HZ;
-			_debug("delay %lu", delay);
-			mod_delayed_work(afs_wq, &afs_vlocation_reap, delay);
-			break;
-		}
-
-		spin_lock(&vl->cell->vl_lock);
-		if (atomic_read(&vl->usage) > 0) {
-			_debug("no reap");
-			list_del_init(&vl->grave);
-		} else {
-			_debug("reap");
-			list_move_tail(&vl->grave, &corpses);
-			list_del_init(&vl->link);
-		}
-		spin_unlock(&vl->cell->vl_lock);
-	}
-
-	spin_unlock(&afs_vlocation_graveyard_lock);
-
-	/* now reap the corpses we've extracted */
-	while (!list_empty(&corpses)) {
-		vl = list_entry(corpses.next, struct afs_vlocation, grave);
-		list_del(&vl->grave);
-		afs_vlocation_destroy(vl);
-	}
-
-	_leave("");
-}
-
-/*
- * initialise the VL update process
- */
-int __init afs_vlocation_update_init(void)
-{
-	afs_vlocation_update_worker = alloc_workqueue("kafs_vlupdated",
-						      WQ_MEM_RECLAIM, 0);
-	return afs_vlocation_update_worker ? 0 : -ENOMEM;
-}
-
-/*
- * discard all the volume location records for rmmod
- */
-void afs_vlocation_purge(void)
-{
-	afs_vlocation_timeout = 0;
-
-	spin_lock(&afs_vlocation_updates_lock);
-	list_del_init(&afs_vlocation_updates);
-	spin_unlock(&afs_vlocation_updates_lock);
-	mod_delayed_work(afs_vlocation_update_worker, &afs_vlocation_update, 0);
-	destroy_workqueue(afs_vlocation_update_worker);
-
-	mod_delayed_work(afs_wq, &afs_vlocation_reap, 0);
-}
-
-/*
- * update a volume location
- */
-static void afs_vlocation_updater(struct work_struct *work)
-{
-	struct afs_cache_vlocation vldb;
-	struct afs_vlocation *vl, *xvl;
-	time64_t now;
-	long timeout;
-	int ret;
-
-	_enter("");
-
-	now = ktime_get_real_seconds();
-
-	/* find a record to update */
-	spin_lock(&afs_vlocation_updates_lock);
-	for (;;) {
-		if (list_empty(&afs_vlocation_updates)) {
-			spin_unlock(&afs_vlocation_updates_lock);
-			_leave(" [nothing]");
-			return;
-		}
-
-		vl = list_entry(afs_vlocation_updates.next,
-				struct afs_vlocation, update);
-		if (atomic_read(&vl->usage) > 0)
-			break;
-		list_del_init(&vl->update);
-	}
-
-	timeout = vl->update_at - now;
-	if (timeout > 0) {
-		queue_delayed_work(afs_vlocation_update_worker,
-				   &afs_vlocation_update, timeout * HZ);
-		spin_unlock(&afs_vlocation_updates_lock);
-		_leave(" [nothing]");
-		return;
-	}
-
-	list_del_init(&vl->update);
-	atomic_inc(&vl->usage);
-	spin_unlock(&afs_vlocation_updates_lock);
-
-	/* we can now perform the update */
-	_debug("update %s", vl->vldb.name);
-	vl->state = AFS_VL_UPDATING;
-	vl->upd_rej_cnt = 0;
-	vl->upd_busy_cnt = 0;
-
-	ret = afs_vlocation_update_record(vl, NULL, &vldb);
-	spin_lock(&vl->lock);
-	switch (ret) {
-	case 0:
-		afs_vlocation_apply_update(vl, &vldb);
-		vl->state = AFS_VL_VALID;
-		break;
-	case -ENOMEDIUM:
-		vl->state = AFS_VL_VOLUME_DELETED;
-		break;
-	default:
-		vl->state = AFS_VL_UNCERTAIN;
-		break;
-	}
-	spin_unlock(&vl->lock);
-	wake_up(&vl->waitq);
-
-	/* and then reschedule */
-	_debug("reschedule");
-	vl->update_at = ktime_get_real_seconds() +
-			afs_vlocation_update_timeout;
-
-	spin_lock(&afs_vlocation_updates_lock);
-
-	if (!list_empty(&afs_vlocation_updates)) {
-		/* next update in 10 minutes, but wait at least 1 second more
-		 * than the newest record already queued so that we don't spam
-		 * the VL server suddenly with lots of requests
-		 */
-		xvl = list_entry(afs_vlocation_updates.prev,
-				 struct afs_vlocation, update);
-		if (vl->update_at <= xvl->update_at)
-			vl->update_at = xvl->update_at + 1;
-		xvl = list_entry(afs_vlocation_updates.next,
-				 struct afs_vlocation, update);
-		timeout = xvl->update_at - now;
-		if (timeout < 0)
-			timeout = 0;
-	} else {
-		timeout = afs_vlocation_update_timeout;
-	}
-
-	ASSERT(list_empty(&vl->update));
-
-	list_add_tail(&vl->update, &afs_vlocation_updates);
-
-	_debug("timeout %ld", timeout);
-	queue_delayed_work(afs_vlocation_update_worker,
-			   &afs_vlocation_update, timeout * HZ);
-	spin_unlock(&afs_vlocation_updates_lock);
-	afs_put_vlocation(vl);
-}
diff --git a/fs/afs/vnode.c b/fs/afs/vnode.c
deleted file mode 100644
index dcb956143c86..000000000000
--- a/fs/afs/vnode.c
+++ /dev/null
@@ -1,1025 +0,0 @@
-/* AFS vnode management
- *
- * Copyright (C) 2002, 2007 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/fs.h>
-#include <linux/sched.h>
-#include "internal.h"
-
-#if 0
-static noinline bool dump_tree_aux(struct rb_node *node, struct rb_node *parent,
-				   int depth, char lr)
-{
-	struct afs_vnode *vnode;
-	bool bad = false;
-
-	if (!node)
-		return false;
-
-	if (node->rb_left)
-		bad = dump_tree_aux(node->rb_left, node, depth + 2, '/');
-
-	vnode = rb_entry(node, struct afs_vnode, cb_promise);
-	_debug("%c %*.*s%c%p {%d}",
-	       rb_is_red(node) ? 'R' : 'B',
-	       depth, depth, "", lr,
-	       vnode, vnode->cb_expires_at);
-	if (rb_parent(node) != parent) {
-		printk("BAD: %p != %p\n", rb_parent(node), parent);
-		bad = true;
-	}
-
-	if (node->rb_right)
-		bad |= dump_tree_aux(node->rb_right, node, depth + 2, '\\');
-
-	return bad;
-}
-
-static noinline void dump_tree(const char *name, struct afs_server *server)
-{
-	_enter("%s", name);
-	if (dump_tree_aux(server->cb_promises.rb_node, NULL, 0, '-'))
-		BUG();
-}
-#endif
-
-/*
- * insert a vnode into the backing server's vnode tree
- */
-static void afs_install_vnode(struct afs_vnode *vnode,
-			      struct afs_server *server)
-{
-	struct afs_server *old_server = vnode->server;
-	struct afs_vnode *xvnode;
-	struct rb_node *parent, **p;
-
-	_enter("%p,%p", vnode, server);
-
-	if (old_server) {
-		spin_lock(&old_server->fs_lock);
-		rb_erase(&vnode->server_rb, &old_server->fs_vnodes);
-		spin_unlock(&old_server->fs_lock);
-	}
-
-	afs_get_server(server);
-	vnode->server = server;
-	afs_put_server(old_server);
-
-	/* insert into the server's vnode tree in FID order */
-	spin_lock(&server->fs_lock);
-
-	parent = NULL;
-	p = &server->fs_vnodes.rb_node;
-	while (*p) {
-		parent = *p;
-		xvnode = rb_entry(parent, struct afs_vnode, server_rb);
-		if (vnode->fid.vid < xvnode->fid.vid)
-			p = &(*p)->rb_left;
-		else if (vnode->fid.vid > xvnode->fid.vid)
-			p = &(*p)->rb_right;
-		else if (vnode->fid.vnode < xvnode->fid.vnode)
-			p = &(*p)->rb_left;
-		else if (vnode->fid.vnode > xvnode->fid.vnode)
-			p = &(*p)->rb_right;
-		else if (vnode->fid.unique < xvnode->fid.unique)
-			p = &(*p)->rb_left;
-		else if (vnode->fid.unique > xvnode->fid.unique)
-			p = &(*p)->rb_right;
-		else
-			BUG(); /* can't happen unless afs_iget() malfunctions */
-	}
-
-	rb_link_node(&vnode->server_rb, parent, p);
-	rb_insert_color(&vnode->server_rb, &server->fs_vnodes);
-
-	spin_unlock(&server->fs_lock);
-	_leave("");
-}
-
-/*
- * insert a vnode into the promising server's update/expiration tree
- * - caller must hold vnode->lock
- */
-static void afs_vnode_note_promise(struct afs_vnode *vnode,
-				   struct afs_server *server)
-{
-	struct afs_server *old_server;
-	struct afs_vnode *xvnode;
-	struct rb_node *parent, **p;
-
-	_enter("%p,%p", vnode, server);
-
-	ASSERT(server != NULL);
-
-	old_server = vnode->server;
-	if (vnode->cb_promised) {
-		if (server == old_server &&
-		    vnode->cb_expires == vnode->cb_expires_at) {
-			_leave(" [no change]");
-			return;
-		}
-
-		spin_lock(&old_server->cb_lock);
-		if (vnode->cb_promised) {
-			_debug("delete");
-			rb_erase(&vnode->cb_promise, &old_server->cb_promises);
-			vnode->cb_promised = false;
-		}
-		spin_unlock(&old_server->cb_lock);
-	}
-
-	if (vnode->server != server)
-		afs_install_vnode(vnode, server);
-
-	vnode->cb_expires_at = vnode->cb_expires;
-	_debug("PROMISE on %p {%lu}",
-	       vnode, (unsigned long) vnode->cb_expires_at);
-
-	/* abuse an RB-tree to hold the expiration order (we may have multiple
-	 * items with the same expiration time) */
-	spin_lock(&server->cb_lock);
-
-	parent = NULL;
-	p = &server->cb_promises.rb_node;
-	while (*p) {
-		parent = *p;
-		xvnode = rb_entry(parent, struct afs_vnode, cb_promise);
-		if (vnode->cb_expires_at < xvnode->cb_expires_at)
-			p = &(*p)->rb_left;
-		else
-			p = &(*p)->rb_right;
-	}
-
-	rb_link_node(&vnode->cb_promise, parent, p);
-	rb_insert_color(&vnode->cb_promise, &server->cb_promises);
-	vnode->cb_promised = true;
-
-	spin_unlock(&server->cb_lock);
-	_leave("");
-}
-
-/*
- * handle remote file deletion by discarding the callback promise
- */
-static void afs_vnode_deleted_remotely(struct afs_vnode *vnode)
-{
-	struct afs_server *server;
-
-	_enter("{%p}", vnode->server);
-
-	set_bit(AFS_VNODE_DELETED, &vnode->flags);
-
-	server = vnode->server;
-	if (server) {
-		if (vnode->cb_promised) {
-			spin_lock(&server->cb_lock);
-			if (vnode->cb_promised) {
-				rb_erase(&vnode->cb_promise,
-					 &server->cb_promises);
-				vnode->cb_promised = false;
-			}
-			spin_unlock(&server->cb_lock);
-		}
-
-		spin_lock(&server->fs_lock);
-		rb_erase(&vnode->server_rb, &server->fs_vnodes);
-		spin_unlock(&server->fs_lock);
-
-		vnode->server = NULL;
-		afs_put_server(server);
-	} else {
-		ASSERT(!vnode->cb_promised);
-	}
-
-	_leave("");
-}
-
-/*
- * finish off updating the recorded status of a file after a successful
- * operation completion
- * - starts callback expiry timer
- * - adds to server's callback list
- */
-void afs_vnode_finalise_status_update(struct afs_vnode *vnode,
-				      struct afs_server *server)
-{
-	struct afs_server *oldserver = NULL;
-
-	_enter("%p,%p", vnode, server);
-
-	spin_lock(&vnode->lock);
-	clear_bit(AFS_VNODE_CB_BROKEN, &vnode->flags);
-	afs_vnode_note_promise(vnode, server);
-	vnode->update_cnt--;
-	ASSERTCMP(vnode->update_cnt, >=, 0);
-	spin_unlock(&vnode->lock);
-
-	wake_up_all(&vnode->update_waitq);
-	afs_put_server(oldserver);
-	_leave("");
-}
-
-/*
- * finish off updating the recorded status of a file after an operation failed
- */
-static void afs_vnode_status_update_failed(struct afs_vnode *vnode, int ret)
-{
-	_enter("{%x:%u},%d", vnode->fid.vid, vnode->fid.vnode, ret);
-
-	spin_lock(&vnode->lock);
-
-	clear_bit(AFS_VNODE_CB_BROKEN, &vnode->flags);
-
-	if (ret == -ENOENT) {
-		/* the file was deleted on the server */
-		_debug("got NOENT from server - marking file deleted");
-		afs_vnode_deleted_remotely(vnode);
-	}
-
-	vnode->update_cnt--;
-	ASSERTCMP(vnode->update_cnt, >=, 0);
-	spin_unlock(&vnode->lock);
-
-	wake_up_all(&vnode->update_waitq);
-	_leave("");
-}
-
-/*
- * fetch file status from the volume
- * - don't issue a fetch if:
- *   - the changed bit is not set and there's a valid callback
- *   - there are any outstanding ops that will fetch the status
- * - TODO implement local caching
- */
-int afs_vnode_fetch_status(struct afs_vnode *vnode,
-			   struct afs_vnode *auth_vnode, struct key *key)
-{
-	struct afs_server *server;
-	unsigned long acl_order;
-	int ret;
-
-	DECLARE_WAITQUEUE(myself, current);
-
-	_enter("%s,{%x:%u.%u}",
-	       vnode->volume->vlocation->vldb.name,
-	       vnode->fid.vid, vnode->fid.vnode, vnode->fid.unique);
-
-	if (!test_bit(AFS_VNODE_CB_BROKEN, &vnode->flags) &&
-	    vnode->cb_promised) {
-		_leave(" [unchanged]");
-		return 0;
-	}
-
-	if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) {
-		_leave(" [deleted]");
-		return -ENOENT;
-	}
-
-	acl_order = 0;
-	if (auth_vnode)
-		acl_order = auth_vnode->acl_order;
-
-	spin_lock(&vnode->lock);
-
-	if (!test_bit(AFS_VNODE_CB_BROKEN, &vnode->flags) &&
-	    vnode->cb_promised) {
-		spin_unlock(&vnode->lock);
-		_leave(" [unchanged]");
-		return 0;
-	}
-
-	ASSERTCMP(vnode->update_cnt, >=, 0);
-
-	if (vnode->update_cnt > 0) {
-		/* someone else started a fetch */
-		_debug("wait on fetch %d", vnode->update_cnt);
-
-		set_current_state(TASK_UNINTERRUPTIBLE);
-		ASSERT(myself.func != NULL);
-		add_wait_queue(&vnode->update_waitq, &myself);
-
-		/* wait for the status to be updated */
-		for (;;) {
-			if (!test_bit(AFS_VNODE_CB_BROKEN, &vnode->flags))
-				break;
-			if (test_bit(AFS_VNODE_DELETED, &vnode->flags))
-				break;
-
-			/* check to see if it got updated and invalidated all
-			 * before we saw it */
-			if (vnode->update_cnt == 0) {
-				remove_wait_queue(&vnode->update_waitq,
-						  &myself);
-				set_current_state(TASK_RUNNING);
-				goto get_anyway;
-			}
-
-			spin_unlock(&vnode->lock);
-
-			schedule();
-			set_current_state(TASK_UNINTERRUPTIBLE);
-
-			spin_lock(&vnode->lock);
-		}
-
-		remove_wait_queue(&vnode->update_waitq, &myself);
-		spin_unlock(&vnode->lock);
-		set_current_state(TASK_RUNNING);
-
-		return test_bit(AFS_VNODE_DELETED, &vnode->flags) ?
-			-ENOENT : 0;
-	}
-
-get_anyway:
-	/* okay... we're going to have to initiate the op */
-	vnode->update_cnt++;
-
-	spin_unlock(&vnode->lock);
-
-	/* merge AFS status fetches and clear outstanding callback on this
-	 * vnode */
-	do {
-		/* pick a server to query */
-		server = afs_volume_pick_fileserver(vnode);
-		if (IS_ERR(server))
-			goto no_server;
-
-		_debug("USING SERVER: %p{%08x}",
-		       server, ntohl(server->addr.s_addr));
-
-		ret = afs_fs_fetch_file_status(server, key, vnode, NULL,
-					       false);
-
-	} while (!afs_volume_release_fileserver(vnode, server, ret));
-
-	/* adjust the flags */
-	if (ret == 0) {
-		_debug("adjust");
-		if (auth_vnode)
-			afs_cache_permit(vnode, key, acl_order);
-		afs_vnode_finalise_status_update(vnode, server);
-		afs_put_server(server);
-	} else {
-		_debug("failed [%d]", ret);
-		afs_vnode_status_update_failed(vnode, ret);
-	}
-
-	ASSERTCMP(vnode->update_cnt, >=, 0);
-
-	_leave(" = %d [cnt %d]", ret, vnode->update_cnt);
-	return ret;
-
-no_server:
-	spin_lock(&vnode->lock);
-	vnode->update_cnt--;
-	ASSERTCMP(vnode->update_cnt, >=, 0);
-	spin_unlock(&vnode->lock);
-	_leave(" = %ld [cnt %d]", PTR_ERR(server), vnode->update_cnt);
-	return PTR_ERR(server);
-}
-
-/*
- * fetch file data from the volume
- * - TODO implement caching
- */
-int afs_vnode_fetch_data(struct afs_vnode *vnode, struct key *key,
-			 struct afs_read *desc)
-{
-	struct afs_server *server;
-	int ret;
-
-	_enter("%s{%x:%u.%u},%x,,,",
-	       vnode->volume->vlocation->vldb.name,
-	       vnode->fid.vid,
-	       vnode->fid.vnode,
-	       vnode->fid.unique,
-	       key_serial(key));
-
-	/* this op will fetch the status */
-	spin_lock(&vnode->lock);
-	vnode->update_cnt++;
-	spin_unlock(&vnode->lock);
-
-	/* merge in AFS status fetches and clear outstanding callback on this
-	 * vnode */
-	do {
-		/* pick a server to query */
-		server = afs_volume_pick_fileserver(vnode);
-		if (IS_ERR(server))
-			goto no_server;
-
-		_debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr));
-
-		ret = afs_fs_fetch_data(server, key, vnode, desc,
-					false);
-
-	} while (!afs_volume_release_fileserver(vnode, server, ret));
-
-	/* adjust the flags */
-	if (ret == 0) {
-		afs_vnode_finalise_status_update(vnode, server);
-		afs_put_server(server);
-	} else {
-		afs_vnode_status_update_failed(vnode, ret);
-	}
-
-	_leave(" = %d", ret);
-	return ret;
-
-no_server:
-	spin_lock(&vnode->lock);
-	vnode->update_cnt--;
-	ASSERTCMP(vnode->update_cnt, >=, 0);
-	spin_unlock(&vnode->lock);
-	return PTR_ERR(server);
-}
-
-/*
- * make a file or a directory
- */
-int afs_vnode_create(struct afs_vnode *vnode, struct key *key,
-		     const char *name, umode_t mode, struct afs_fid *newfid,
-		     struct afs_file_status *newstatus,
-		     struct afs_callback *newcb, struct afs_server **_server)
-{
-	struct afs_server *server;
-	int ret;
-
-	_enter("%s{%x:%u.%u},%x,%s,,",
-	       vnode->volume->vlocation->vldb.name,
-	       vnode->fid.vid,
-	       vnode->fid.vnode,
-	       vnode->fid.unique,
-	       key_serial(key),
-	       name);
-
-	/* this op will fetch the status on the directory we're creating in */
-	spin_lock(&vnode->lock);
-	vnode->update_cnt++;
-	spin_unlock(&vnode->lock);
-
-	do {
-		/* pick a server to query */
-		server = afs_volume_pick_fileserver(vnode);
-		if (IS_ERR(server))
-			goto no_server;
-
-		_debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr));
-
-		ret = afs_fs_create(server, key, vnode, name, mode, newfid,
-				    newstatus, newcb, false);
-
-	} while (!afs_volume_release_fileserver(vnode, server, ret));
-
-	/* adjust the flags */
-	if (ret == 0) {
-		afs_vnode_finalise_status_update(vnode, server);
-		*_server = server;
-	} else {
-		afs_vnode_status_update_failed(vnode, ret);
-		*_server = NULL;
-	}
-
-	_leave(" = %d [cnt %d]", ret, vnode->update_cnt);
-	return ret;
-
-no_server:
-	spin_lock(&vnode->lock);
-	vnode->update_cnt--;
-	ASSERTCMP(vnode->update_cnt, >=, 0);
-	spin_unlock(&vnode->lock);
-	_leave(" = %ld [cnt %d]", PTR_ERR(server), vnode->update_cnt);
-	return PTR_ERR(server);
-}
-
-/*
- * remove a file or directory
- */
-int afs_vnode_remove(struct afs_vnode *vnode, struct key *key, const char *name,
-		     bool isdir)
-{
-	struct afs_server *server;
-	int ret;
-
-	_enter("%s{%x:%u.%u},%x,%s",
-	       vnode->volume->vlocation->vldb.name,
-	       vnode->fid.vid,
-	       vnode->fid.vnode,
-	       vnode->fid.unique,
-	       key_serial(key),
-	       name);
-
-	/* this op will fetch the status on the directory we're removing from */
-	spin_lock(&vnode->lock);
-	vnode->update_cnt++;
-	spin_unlock(&vnode->lock);
-
-	do {
-		/* pick a server to query */
-		server = afs_volume_pick_fileserver(vnode);
-		if (IS_ERR(server))
-			goto no_server;
-
-		_debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr));
-
-		ret = afs_fs_remove(server, key, vnode, name, isdir,
-				    false);
-
-	} while (!afs_volume_release_fileserver(vnode, server, ret));
-
-	/* adjust the flags */
-	if (ret == 0) {
-		afs_vnode_finalise_status_update(vnode, server);
-		afs_put_server(server);
-	} else {
-		afs_vnode_status_update_failed(vnode, ret);
-	}
-
-	_leave(" = %d [cnt %d]", ret, vnode->update_cnt);
-	return ret;
-
-no_server:
-	spin_lock(&vnode->lock);
-	vnode->update_cnt--;
-	ASSERTCMP(vnode->update_cnt, >=, 0);
-	spin_unlock(&vnode->lock);
-	_leave(" = %ld [cnt %d]", PTR_ERR(server), vnode->update_cnt);
-	return PTR_ERR(server);
-}
-
-/*
- * create a hard link
- */
-int afs_vnode_link(struct afs_vnode *dvnode, struct afs_vnode *vnode,
-			  struct key *key, const char *name)
-{
-	struct afs_server *server;
-	int ret;
-
-	_enter("%s{%x:%u.%u},%s{%x:%u.%u},%x,%s",
-	       dvnode->volume->vlocation->vldb.name,
-	       dvnode->fid.vid,
-	       dvnode->fid.vnode,
-	       dvnode->fid.unique,
-	       vnode->volume->vlocation->vldb.name,
-	       vnode->fid.vid,
-	       vnode->fid.vnode,
-	       vnode->fid.unique,
-	       key_serial(key),
-	       name);
-
-	/* this op will fetch the status on the directory we're removing from */
-	spin_lock(&vnode->lock);
-	vnode->update_cnt++;
-	spin_unlock(&vnode->lock);
-	spin_lock(&dvnode->lock);
-	dvnode->update_cnt++;
-	spin_unlock(&dvnode->lock);
-
-	do {
-		/* pick a server to query */
-		server = afs_volume_pick_fileserver(dvnode);
-		if (IS_ERR(server))
-			goto no_server;
-
-		_debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr));
-
-		ret = afs_fs_link(server, key, dvnode, vnode, name,
-				  false);
-
-	} while (!afs_volume_release_fileserver(dvnode, server, ret));
-
-	/* adjust the flags */
-	if (ret == 0) {
-		afs_vnode_finalise_status_update(vnode, server);
-		afs_vnode_finalise_status_update(dvnode, server);
-		afs_put_server(server);
-	} else {
-		afs_vnode_status_update_failed(vnode, ret);
-		afs_vnode_status_update_failed(dvnode, ret);
-	}
-
-	_leave(" = %d [cnt %d]", ret, vnode->update_cnt);
-	return ret;
-
-no_server:
-	spin_lock(&vnode->lock);
-	vnode->update_cnt--;
-	ASSERTCMP(vnode->update_cnt, >=, 0);
-	spin_unlock(&vnode->lock);
-	spin_lock(&dvnode->lock);
-	dvnode->update_cnt--;
-	ASSERTCMP(dvnode->update_cnt, >=, 0);
-	spin_unlock(&dvnode->lock);
-	_leave(" = %ld [cnt %d]", PTR_ERR(server), vnode->update_cnt);
-	return PTR_ERR(server);
-}
-
-/*
- * create a symbolic link
- */
-int afs_vnode_symlink(struct afs_vnode *vnode, struct key *key,
-		      const char *name, const char *content,
-		      struct afs_fid *newfid,
-		      struct afs_file_status *newstatus,
-		      struct afs_server **_server)
-{
-	struct afs_server *server;
-	int ret;
-
-	_enter("%s{%x:%u.%u},%x,%s,%s,,,",
-	       vnode->volume->vlocation->vldb.name,
-	       vnode->fid.vid,
-	       vnode->fid.vnode,
-	       vnode->fid.unique,
-	       key_serial(key),
-	       name, content);
-
-	/* this op will fetch the status on the directory we're creating in */
-	spin_lock(&vnode->lock);
-	vnode->update_cnt++;
-	spin_unlock(&vnode->lock);
-
-	do {
-		/* pick a server to query */
-		server = afs_volume_pick_fileserver(vnode);
-		if (IS_ERR(server))
-			goto no_server;
-
-		_debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr));
-
-		ret = afs_fs_symlink(server, key, vnode, name, content,
-				     newfid, newstatus, false);
-
-	} while (!afs_volume_release_fileserver(vnode, server, ret));
-
-	/* adjust the flags */
-	if (ret == 0) {
-		afs_vnode_finalise_status_update(vnode, server);
-		*_server = server;
-	} else {
-		afs_vnode_status_update_failed(vnode, ret);
-		*_server = NULL;
-	}
-
-	_leave(" = %d [cnt %d]", ret, vnode->update_cnt);
-	return ret;
-
-no_server:
-	spin_lock(&vnode->lock);
-	vnode->update_cnt--;
-	ASSERTCMP(vnode->update_cnt, >=, 0);
-	spin_unlock(&vnode->lock);
-	_leave(" = %ld [cnt %d]", PTR_ERR(server), vnode->update_cnt);
-	return PTR_ERR(server);
-}
-
-/*
- * rename a file
- */
-int afs_vnode_rename(struct afs_vnode *orig_dvnode,
-		     struct afs_vnode *new_dvnode,
-		     struct key *key,
-		     const char *orig_name,
-		     const char *new_name)
-{
-	struct afs_server *server;
-	int ret;
-
-	_enter("%s{%x:%u.%u},%s{%u,%u,%u},%x,%s,%s",
-	       orig_dvnode->volume->vlocation->vldb.name,
-	       orig_dvnode->fid.vid,
-	       orig_dvnode->fid.vnode,
-	       orig_dvnode->fid.unique,
-	       new_dvnode->volume->vlocation->vldb.name,
-	       new_dvnode->fid.vid,
-	       new_dvnode->fid.vnode,
-	       new_dvnode->fid.unique,
-	       key_serial(key),
-	       orig_name,
-	       new_name);
-
-	/* this op will fetch the status on both the directories we're dealing
-	 * with */
-	spin_lock(&orig_dvnode->lock);
-	orig_dvnode->update_cnt++;
-	spin_unlock(&orig_dvnode->lock);
-	if (new_dvnode != orig_dvnode) {
-		spin_lock(&new_dvnode->lock);
-		new_dvnode->update_cnt++;
-		spin_unlock(&new_dvnode->lock);
-	}
-
-	do {
-		/* pick a server to query */
-		server = afs_volume_pick_fileserver(orig_dvnode);
-		if (IS_ERR(server))
-			goto no_server;
-
-		_debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr));
-
-		ret = afs_fs_rename(server, key, orig_dvnode, orig_name,
-				    new_dvnode, new_name, false);
-
-	} while (!afs_volume_release_fileserver(orig_dvnode, server, ret));
-
-	/* adjust the flags */
-	if (ret == 0) {
-		afs_vnode_finalise_status_update(orig_dvnode, server);
-		if (new_dvnode != orig_dvnode)
-			afs_vnode_finalise_status_update(new_dvnode, server);
-		afs_put_server(server);
-	} else {
-		afs_vnode_status_update_failed(orig_dvnode, ret);
-		if (new_dvnode != orig_dvnode)
-			afs_vnode_status_update_failed(new_dvnode, ret);
-	}
-
-	_leave(" = %d [cnt %d]", ret, orig_dvnode->update_cnt);
-	return ret;
-
-no_server:
-	spin_lock(&orig_dvnode->lock);
-	orig_dvnode->update_cnt--;
-	ASSERTCMP(orig_dvnode->update_cnt, >=, 0);
-	spin_unlock(&orig_dvnode->lock);
-	if (new_dvnode != orig_dvnode) {
-		spin_lock(&new_dvnode->lock);
-		new_dvnode->update_cnt--;
-		ASSERTCMP(new_dvnode->update_cnt, >=, 0);
-		spin_unlock(&new_dvnode->lock);
-	}
-	_leave(" = %ld [cnt %d]", PTR_ERR(server), orig_dvnode->update_cnt);
-	return PTR_ERR(server);
-}
-
-/*
- * write to a file
- */
-int afs_vnode_store_data(struct afs_writeback *wb, pgoff_t first, pgoff_t last,
-			 unsigned offset, unsigned to)
-{
-	struct afs_server *server;
-	struct afs_vnode *vnode = wb->vnode;
-	int ret;
-
-	_enter("%s{%x:%u.%u},%x,%lx,%lx,%x,%x",
-	       vnode->volume->vlocation->vldb.name,
-	       vnode->fid.vid,
-	       vnode->fid.vnode,
-	       vnode->fid.unique,
-	       key_serial(wb->key),
-	       first, last, offset, to);
-
-	/* this op will fetch the status */
-	spin_lock(&vnode->lock);
-	vnode->update_cnt++;
-	spin_unlock(&vnode->lock);
-
-	do {
-		/* pick a server to query */
-		server = afs_volume_pick_fileserver(vnode);
-		if (IS_ERR(server))
-			goto no_server;
-
-		_debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr));
-
-		ret = afs_fs_store_data(server, wb, first, last, offset, to,
-					false);
-
-	} while (!afs_volume_release_fileserver(vnode, server, ret));
-
-	/* adjust the flags */
-	if (ret == 0) {
-		afs_vnode_finalise_status_update(vnode, server);
-		afs_put_server(server);
-	} else {
-		afs_vnode_status_update_failed(vnode, ret);
-	}
-
-	_leave(" = %d", ret);
-	return ret;
-
-no_server:
-	spin_lock(&vnode->lock);
-	vnode->update_cnt--;
-	ASSERTCMP(vnode->update_cnt, >=, 0);
-	spin_unlock(&vnode->lock);
-	return PTR_ERR(server);
-}
-
-/*
- * set the attributes on a file
- */
-int afs_vnode_setattr(struct afs_vnode *vnode, struct key *key,
-		      struct iattr *attr)
-{
-	struct afs_server *server;
-	int ret;
-
-	_enter("%s{%x:%u.%u},%x",
-	       vnode->volume->vlocation->vldb.name,
-	       vnode->fid.vid,
-	       vnode->fid.vnode,
-	       vnode->fid.unique,
-	       key_serial(key));
-
-	/* this op will fetch the status */
-	spin_lock(&vnode->lock);
-	vnode->update_cnt++;
-	spin_unlock(&vnode->lock);
-
-	do {
-		/* pick a server to query */
-		server = afs_volume_pick_fileserver(vnode);
-		if (IS_ERR(server))
-			goto no_server;
-
-		_debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr));
-
-		ret = afs_fs_setattr(server, key, vnode, attr, false);
-
-	} while (!afs_volume_release_fileserver(vnode, server, ret));
-
-	/* adjust the flags */
-	if (ret == 0) {
-		afs_vnode_finalise_status_update(vnode, server);
-		afs_put_server(server);
-	} else {
-		afs_vnode_status_update_failed(vnode, ret);
-	}
-
-	_leave(" = %d", ret);
-	return ret;
-
-no_server:
-	spin_lock(&vnode->lock);
-	vnode->update_cnt--;
-	ASSERTCMP(vnode->update_cnt, >=, 0);
-	spin_unlock(&vnode->lock);
-	return PTR_ERR(server);
-}
-
-/*
- * get the status of a volume
- */
-int afs_vnode_get_volume_status(struct afs_vnode *vnode, struct key *key,
-				struct afs_volume_status *vs)
-{
-	struct afs_server *server;
-	int ret;
-
-	_enter("%s{%x:%u.%u},%x,",
-	       vnode->volume->vlocation->vldb.name,
-	       vnode->fid.vid,
-	       vnode->fid.vnode,
-	       vnode->fid.unique,
-	       key_serial(key));
-
-	do {
-		/* pick a server to query */
-		server = afs_volume_pick_fileserver(vnode);
-		if (IS_ERR(server))
-			goto no_server;
-
-		_debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr));
-
-		ret = afs_fs_get_volume_status(server, key, vnode, vs, false);
-
-	} while (!afs_volume_release_fileserver(vnode, server, ret));
-
-	/* adjust the flags */
-	if (ret == 0)
-		afs_put_server(server);
-
-	_leave(" = %d", ret);
-	return ret;
-
-no_server:
-	return PTR_ERR(server);
-}
-
-/*
- * get a lock on a file
- */
-int afs_vnode_set_lock(struct afs_vnode *vnode, struct key *key,
-		       afs_lock_type_t type)
-{
-	struct afs_server *server;
-	int ret;
-
-	_enter("%s{%x:%u.%u},%x,%u",
-	       vnode->volume->vlocation->vldb.name,
-	       vnode->fid.vid,
-	       vnode->fid.vnode,
-	       vnode->fid.unique,
-	       key_serial(key), type);
-
-	do {
-		/* pick a server to query */
-		server = afs_volume_pick_fileserver(vnode);
-		if (IS_ERR(server))
-			goto no_server;
-
-		_debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr));
-
-		ret = afs_fs_set_lock(server, key, vnode, type, false);
-
-	} while (!afs_volume_release_fileserver(vnode, server, ret));
-
-	/* adjust the flags */
-	if (ret == 0)
-		afs_put_server(server);
-
-	_leave(" = %d", ret);
-	return ret;
-
-no_server:
-	return PTR_ERR(server);
-}
-
-/*
- * extend a lock on a file
- */
-int afs_vnode_extend_lock(struct afs_vnode *vnode, struct key *key)
-{
-	struct afs_server *server;
-	int ret;
-
-	_enter("%s{%x:%u.%u},%x",
-	       vnode->volume->vlocation->vldb.name,
-	       vnode->fid.vid,
-	       vnode->fid.vnode,
-	       vnode->fid.unique,
-	       key_serial(key));
-
-	do {
-		/* pick a server to query */
-		server = afs_volume_pick_fileserver(vnode);
-		if (IS_ERR(server))
-			goto no_server;
-
-		_debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr));
-
-		ret = afs_fs_extend_lock(server, key, vnode, false);
-
-	} while (!afs_volume_release_fileserver(vnode, server, ret));
-
-	/* adjust the flags */
-	if (ret == 0)
-		afs_put_server(server);
-
-	_leave(" = %d", ret);
-	return ret;
-
-no_server:
-	return PTR_ERR(server);
-}
-
-/*
- * release a lock on a file
- */
-int afs_vnode_release_lock(struct afs_vnode *vnode, struct key *key)
-{
-	struct afs_server *server;
-	int ret;
-
-	_enter("%s{%x:%u.%u},%x",
-	       vnode->volume->vlocation->vldb.name,
-	       vnode->fid.vid,
-	       vnode->fid.vnode,
-	       vnode->fid.unique,
-	       key_serial(key));
-
-	do {
-		/* pick a server to query */
-		server = afs_volume_pick_fileserver(vnode);
-		if (IS_ERR(server))
-			goto no_server;
-
-		_debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr));
-
-		ret = afs_fs_release_lock(server, key, vnode, false);
-
-	} while (!afs_volume_release_fileserver(vnode, server, ret));
-
-	/* adjust the flags */
-	if (ret == 0)
-		afs_put_server(server);
-
-	_leave(" = %d", ret);
-	return ret;
-
-no_server:
-	return PTR_ERR(server);
-}
diff --git a/fs/afs/volume.c b/fs/afs/volume.c
index db73d6dad02b..684c48293353 100644
--- a/fs/afs/volume.c
+++ b/fs/afs/volume.c
@@ -10,19 +10,167 @@
  */
 
 #include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/init.h>
 #include <linux/slab.h>
-#include <linux/fs.h>
-#include <linux/pagemap.h>
-#include <linux/sched.h>
 #include "internal.h"
 
-static const char *afs_voltypes[] = { "R/W", "R/O", "BAK" };
+unsigned __read_mostly afs_volume_gc_delay = 10;
+unsigned __read_mostly afs_volume_record_life = 60 * 60;
+
+static const char *const afs_voltypes[] = { "R/W", "R/O", "BAK" };
+
+/*
+ * Allocate a volume record and load it up from a vldb record.
+ */
+static struct afs_volume *afs_alloc_volume(struct afs_mount_params *params,
+					   struct afs_vldb_entry *vldb,
+					   unsigned long type_mask)
+{
+	struct afs_server_list *slist;
+	struct afs_server *server;
+	struct afs_volume *volume;
+	int ret = -ENOMEM, nr_servers = 0, i, j;
+
+	for (i = 0; i < vldb->nr_servers; i++)
+		if (vldb->fs_mask[i] & type_mask)
+			nr_servers++;
+
+	volume = kzalloc(sizeof(struct afs_volume), GFP_KERNEL);
+	if (!volume)
+		goto error_0;
+
+	volume->vid		= vldb->vid[params->type];
+	volume->update_at	= ktime_get_real_seconds() + afs_volume_record_life;
+	volume->cell		= afs_get_cell(params->cell);
+	volume->type		= params->type;
+	volume->type_force	= params->force;
+	volume->name_len	= vldb->name_len;
+
+	atomic_set(&volume->usage, 1);
+	INIT_LIST_HEAD(&volume->proc_link);
+	rwlock_init(&volume->servers_lock);
+	memcpy(volume->name, vldb->name, vldb->name_len + 1);
+
+	slist = afs_alloc_server_list(params->cell, params->key, vldb, type_mask);
+	if (IS_ERR(slist)) {
+		ret = PTR_ERR(slist);
+		goto error_1;
+	}
+
+	refcount_set(&slist->usage, 1);
+	volume->servers = slist;
+
+	/* Make sure a records exists for each server this volume occupies. */
+	for (i = 0; i < nr_servers; i++) {
+		if (!(vldb->fs_mask[i] & type_mask))
+			continue;
+
+		server = afs_lookup_server(params->cell, params->key,
+					   &vldb->fs_server[i]);
+		if (IS_ERR(server)) {
+			ret = PTR_ERR(server);
+			if (ret == -ENOENT)
+				continue;
+			goto error_2;
+		}
+
+		/* Insertion-sort by server pointer */
+		for (j = 0; j < slist->nr_servers; j++)
+			if (slist->servers[j].server >= server)
+				break;
+		if (j < slist->nr_servers) {
+			if (slist->servers[j].server == server) {
+				afs_put_server(params->net, server);
+				continue;
+			}
+
+			memmove(slist->servers + j + 1,
+				slist->servers + j,
+				(slist->nr_servers - j) * sizeof(struct afs_server_entry));
+		}
+
+		slist->servers[j].server = server;
+		slist->nr_servers++;
+	}
+
+	if (slist->nr_servers == 0) {
+		ret = -EDESTADDRREQ;
+		goto error_2;
+	}
+
+	return volume;
+
+error_2:
+	afs_put_serverlist(params->net, slist);
+error_1:
+	kfree(volume);
+error_0:
+	return ERR_PTR(ret);
+}
 
 /*
- * lookup a volume by name
- * - this can be one of the following:
+ * Look up a VLDB record for a volume.
+ */
+static struct afs_vldb_entry *afs_vl_lookup_vldb(struct afs_cell *cell,
+						 struct key *key,
+						 const char *volname,
+						 size_t volnamesz)
+{
+	struct afs_addr_cursor ac;
+	struct afs_vldb_entry *vldb;
+	int ret;
+
+	ret = afs_set_vl_cursor(&ac, cell);
+	if (ret < 0)
+		return ERR_PTR(ret);
+
+	while (afs_iterate_addresses(&ac)) {
+		if (!test_bit(ac.index, &ac.alist->probed)) {
+			ret = afs_vl_get_capabilities(cell->net, &ac, key);
+			switch (ret) {
+			case VL_SERVICE:
+				clear_bit(ac.index, &ac.alist->yfs);
+				set_bit(ac.index, &ac.alist->probed);
+				ac.addr->srx_service = ret;
+				break;
+			case YFS_VL_SERVICE:
+				set_bit(ac.index, &ac.alist->yfs);
+				set_bit(ac.index, &ac.alist->probed);
+				ac.addr->srx_service = ret;
+				break;
+			}
+		}
+		
+		vldb = afs_vl_get_entry_by_name_u(cell->net, &ac, key,
+						  volname, volnamesz);
+		switch (ac.error) {
+		case 0:
+			afs_end_cursor(&ac);
+			return vldb;
+		case -ECONNABORTED:
+			ac.error = afs_abort_to_error(ac.abort_code);
+			goto error;
+		case -ENOMEM:
+		case -ENONET:
+			goto error;
+		case -ENETUNREACH:
+		case -EHOSTUNREACH:
+		case -ECONNREFUSED:
+			break;
+		default:
+			ac.error = -EIO;
+			goto error;
+		}
+	}
+
+error:
+	return ERR_PTR(afs_end_cursor(&ac));
+}
+
+/*
+ * Look up a volume in the VL server and create a candidate volume record for
+ * it.
+ *
+ * The volume name can be one of the following:
  *	"%[cell:]volume[.]"		R/W volume
  *	"#[cell:]volume[.]"		R/O or R/W volume (rwparent=0),
  *					 or R/W (rwparent=1) volume
@@ -42,353 +190,218 @@ static const char *afs_voltypes[] = { "R/W", "R/O", "BAK" };
  * - Rule 3: If parent volume is R/W, then only mount R/W volume unless
  *           explicitly told otherwise
  */
-struct afs_volume *afs_volume_lookup(struct afs_mount_params *params)
+struct afs_volume *afs_create_volume(struct afs_mount_params *params)
 {
-	struct afs_vlocation *vlocation = NULL;
-	struct afs_volume *volume = NULL;
-	struct afs_server *server = NULL;
-	char srvtmask;
-	int ret, loop;
-
-	_enter("{%*.*s,%d}",
-	       params->volnamesz, params->volnamesz, params->volname, params->rwpath);
-
-	/* lookup the volume location record */
-	vlocation = afs_vlocation_lookup(params->cell, params->key,
-					 params->volname, params->volnamesz);
-	if (IS_ERR(vlocation)) {
-		ret = PTR_ERR(vlocation);
-		vlocation = NULL;
-		goto error;
-	}
+	struct afs_vldb_entry *vldb;
+	struct afs_volume *volume;
+	unsigned long type_mask = 1UL << params->type;
 
-	/* make the final decision on the type we want */
-	ret = -ENOMEDIUM;
-	if (params->force && !(vlocation->vldb.vidmask & (1 << params->type)))
-		goto error;
+	vldb = afs_vl_lookup_vldb(params->cell, params->key,
+				  params->volname, params->volnamesz);
+	if (IS_ERR(vldb))
+		return ERR_CAST(vldb);
 
-	srvtmask = 0;
-	for (loop = 0; loop < vlocation->vldb.nservers; loop++)
-		srvtmask |= vlocation->vldb.srvtmask[loop];
+	if (test_bit(AFS_VLDB_QUERY_ERROR, &vldb->flags)) {
+		volume = ERR_PTR(vldb->error);
+		goto error;
+	}
 
+	/* Make the final decision on the type we want */
+	volume = ERR_PTR(-ENOMEDIUM);
 	if (params->force) {
-		if (!(srvtmask & (1 << params->type)))
+		if (!(vldb->flags & type_mask))
 			goto error;
-	} else if (srvtmask & AFS_VOL_VTM_RO) {
+	} else if (test_bit(AFS_VLDB_HAS_RO, &vldb->flags)) {
 		params->type = AFSVL_ROVOL;
-	} else if (srvtmask & AFS_VOL_VTM_RW) {
+	} else if (test_bit(AFS_VLDB_HAS_RW, &vldb->flags)) {
 		params->type = AFSVL_RWVOL;
 	} else {
 		goto error;
 	}
 
-	down_write(&params->cell->vl_sem);
+	type_mask = 1UL << params->type;
+	volume = afs_alloc_volume(params, vldb, type_mask);
 
-	/* is the volume already active? */
-	if (vlocation->vols[params->type]) {
-		/* yes - re-use it */
-		volume = vlocation->vols[params->type];
-		afs_get_volume(volume);
-		goto success;
-	}
+error:
+	kfree(vldb);
+	return volume;
+}
 
-	/* create a new volume record */
-	_debug("creating new volume record");
+/*
+ * Destroy a volume record
+ */
+static void afs_destroy_volume(struct afs_net *net, struct afs_volume *volume)
+{
+	_enter("%p", volume);
 
-	ret = -ENOMEM;
-	volume = kzalloc(sizeof(struct afs_volume), GFP_KERNEL);
-	if (!volume)
-		goto error_up;
+#ifdef CONFIG_AFS_FSCACHE
+	ASSERTCMP(volume->cache, ==, NULL);
+#endif
 
-	atomic_set(&volume->usage, 1);
-	volume->type		= params->type;
-	volume->type_force	= params->force;
-	volume->cell		= params->cell;
-	volume->vid		= vlocation->vldb.vid[params->type];
-
-	init_rwsem(&volume->server_sem);
-
-	/* look up all the applicable server records */
-	for (loop = 0; loop < 8; loop++) {
-		if (vlocation->vldb.srvtmask[loop] & (1 << volume->type)) {
-			server = afs_lookup_server(
-			       volume->cell, &vlocation->vldb.servers[loop]);
-			if (IS_ERR(server)) {
-				ret = PTR_ERR(server);
-				goto error_discard;
-			}
+	afs_put_serverlist(net, volume->servers);
+	afs_put_cell(net, volume->cell);
+	kfree(volume);
 
-			volume->servers[volume->nservers] = server;
-			volume->nservers++;
-		}
+	_leave(" [destroyed]");
+}
+
+/*
+ * Drop a reference on a volume record.
+ */
+void afs_put_volume(struct afs_cell *cell, struct afs_volume *volume)
+{
+	if (volume) {
+		_enter("%s", volume->name);
+
+		if (atomic_dec_and_test(&volume->usage))
+			afs_destroy_volume(cell->net, volume);
 	}
+}
 
-	/* attach the cache and volume location */
+/*
+ * Activate a volume.
+ */
+void afs_activate_volume(struct afs_volume *volume)
+{
 #ifdef CONFIG_AFS_FSCACHE
-	volume->cache = fscache_acquire_cookie(vlocation->cache,
+	volume->cache = fscache_acquire_cookie(volume->cell->cache,
 					       &afs_volume_cache_index_def,
 					       volume, true);
 #endif
-	afs_get_vlocation(vlocation);
-	volume->vlocation = vlocation;
-
-	vlocation->vols[volume->type] = volume;
-
-success:
-	_debug("kAFS selected %s volume %08x",
-	       afs_voltypes[volume->type], volume->vid);
-	up_write(&params->cell->vl_sem);
-	afs_put_vlocation(vlocation);
-	_leave(" = %p", volume);
-	return volume;
-
-	/* clean up */
-error_up:
-	up_write(&params->cell->vl_sem);
-error:
-	afs_put_vlocation(vlocation);
-	_leave(" = %d", ret);
-	return ERR_PTR(ret);
-
-error_discard:
-	up_write(&params->cell->vl_sem);
-
-	for (loop = volume->nservers - 1; loop >= 0; loop--)
-		afs_put_server(volume->servers[loop]);
 
-	kfree(volume);
-	goto error;
+	write_lock(&volume->cell->proc_lock);
+	list_add_tail(&volume->proc_link, &volume->cell->proc_volumes);
+	write_unlock(&volume->cell->proc_lock);
 }
 
 /*
- * destroy a volume record
+ * Deactivate a volume.
  */
-void afs_put_volume(struct afs_volume *volume)
+void afs_deactivate_volume(struct afs_volume *volume)
 {
-	struct afs_vlocation *vlocation;
-	int loop;
-
-	if (!volume)
-		return;
-
-	_enter("%p", volume);
+	_enter("%s", volume->name);
 
-	ASSERTCMP(atomic_read(&volume->usage), >, 0);
+	write_lock(&volume->cell->proc_lock);
+	list_del_init(&volume->proc_link);
+	write_unlock(&volume->cell->proc_lock);
 
-	vlocation = volume->vlocation;
-
-	/* to prevent a race, the decrement and the dequeue must be effectively
-	 * atomic */
-	down_write(&vlocation->cell->vl_sem);
-
-	if (likely(!atomic_dec_and_test(&volume->usage))) {
-		up_write(&vlocation->cell->vl_sem);
-		_leave("");
-		return;
-	}
-
-	vlocation->vols[volume->type] = NULL;
-
-	up_write(&vlocation->cell->vl_sem);
-
-	/* finish cleaning up the volume */
 #ifdef CONFIG_AFS_FSCACHE
-	fscache_relinquish_cookie(volume->cache, 0);
+	fscache_relinquish_cookie(volume->cache,
+				  test_bit(AFS_VOLUME_DELETED, &volume->flags));
+	volume->cache = NULL;
 #endif
-	afs_put_vlocation(vlocation);
-
-	for (loop = volume->nservers - 1; loop >= 0; loop--)
-		afs_put_server(volume->servers[loop]);
-
-	kfree(volume);
 
-	_leave(" [destroyed]");
+	_leave("");
 }
 
 /*
- * pick a server to use to try accessing this volume
- * - returns with an elevated usage count on the server chosen
+ * Query the VL service to update the volume status.
  */
-struct afs_server *afs_volume_pick_fileserver(struct afs_vnode *vnode)
+static int afs_update_volume_status(struct afs_volume *volume, struct key *key)
 {
-	struct afs_volume *volume = vnode->volume;
-	struct afs_server *server;
-	int ret, state, loop;
+	struct afs_server_list *new, *old, *discard;
+	struct afs_vldb_entry *vldb;
+	char idbuf[16];
+	int ret, idsz;
+
+	_enter("");
+
+	/* We look up an ID by passing it as a decimal string in the
+	 * operation's name parameter.
+	 */
+	idsz = sprintf(idbuf, "%u", volume->vid);
 
-	_enter("%s", volume->vlocation->vldb.name);
+	vldb = afs_vl_lookup_vldb(volume->cell, key, idbuf, idsz);
+	if (IS_ERR(vldb)) {
+		ret = PTR_ERR(vldb);
+		goto error;
+	}
 
-	/* stick with the server we're already using if we can */
-	if (vnode->server && vnode->server->fs_state == 0) {
-		afs_get_server(vnode->server);
-		_leave(" = %p [current]", vnode->server);
-		return vnode->server;
+	/* See if the volume got renamed. */
+	if (vldb->name_len != volume->name_len ||
+	    memcmp(vldb->name, volume->name, vldb->name_len) != 0) {
+		/* TODO: Use RCU'd string. */
+		memcpy(volume->name, vldb->name, AFS_MAXVOLNAME);
+		volume->name_len = vldb->name_len;
 	}
 
-	down_read(&volume->server_sem);
+	/* See if the volume's server list got updated. */
+	new = afs_alloc_server_list(volume->cell, key,
+				      vldb, (1 << volume->type));
+	if (IS_ERR(new)) {
+		ret = PTR_ERR(new);
+		goto error_vldb;
+	}
 
-	/* handle the no-server case */
-	if (volume->nservers == 0) {
-		ret = volume->rjservers ? -ENOMEDIUM : -ESTALE;
-		up_read(&volume->server_sem);
-		_leave(" = %d [no servers]", ret);
-		return ERR_PTR(ret);
+	write_lock(&volume->servers_lock);
+
+	discard = new;
+	old = volume->servers;
+	if (afs_annotate_server_list(new, old)) {
+		new->seq = volume->servers_seq + 1;
+		volume->servers = new;
+		smp_wmb();
+		volume->servers_seq++;
+		discard = old;
 	}
 
-	/* basically, just search the list for the first live server and use
-	 * that */
+	volume->update_at = ktime_get_real_seconds() + afs_volume_record_life;
+	clear_bit(AFS_VOLUME_NEEDS_UPDATE, &volume->flags);
+	write_unlock(&volume->servers_lock);
 	ret = 0;
-	for (loop = 0; loop < volume->nservers; loop++) {
-		server = volume->servers[loop];
-		state = server->fs_state;
 
-		_debug("consider %d [%d]", loop, state);
+	afs_put_serverlist(volume->cell->net, discard);
+error_vldb:
+	kfree(vldb);
+error:
+	_leave(" = %d", ret);
+	return ret;
+}
 
-		switch (state) {
-			/* found an apparently healthy server */
-		case 0:
-			afs_get_server(server);
-			up_read(&volume->server_sem);
-			_leave(" = %p (picked %08x)",
-			       server, ntohl(server->addr.s_addr));
-			return server;
+/*
+ * Make sure the volume record is up to date.
+ */
+int afs_check_volume_status(struct afs_volume *volume, struct key *key)
+{
+	time64_t now = ktime_get_real_seconds();
+	int ret, retries = 0;
 
-		case -ENETUNREACH:
-			if (ret == 0)
-				ret = state;
-			break;
+	_enter("");
 
-		case -EHOSTUNREACH:
-			if (ret == 0 ||
-			    ret == -ENETUNREACH)
-				ret = state;
-			break;
+	if (volume->update_at <= now)
+		set_bit(AFS_VOLUME_NEEDS_UPDATE, &volume->flags);
 
-		case -ECONNREFUSED:
-			if (ret == 0 ||
-			    ret == -ENETUNREACH ||
-			    ret == -EHOSTUNREACH)
-				ret = state;
-			break;
+retry:
+	if (!test_bit(AFS_VOLUME_NEEDS_UPDATE, &volume->flags) &&
+	    !test_bit(AFS_VOLUME_WAIT, &volume->flags)) {
+		_leave(" = 0");
+		return 0;
+	}
 
-		default:
-		case -EREMOTEIO:
-			if (ret == 0 ||
-			    ret == -ENETUNREACH ||
-			    ret == -EHOSTUNREACH ||
-			    ret == -ECONNREFUSED)
-				ret = state;
-			break;
-		}
+	if (!test_and_set_bit_lock(AFS_VOLUME_UPDATING, &volume->flags)) {
+		ret = afs_update_volume_status(volume, key);
+		clear_bit_unlock(AFS_VOLUME_WAIT, &volume->flags);
+		clear_bit_unlock(AFS_VOLUME_UPDATING, &volume->flags);
+		wake_up_bit(&volume->flags, AFS_VOLUME_WAIT);
+		_leave(" = %d", ret);
+		return ret;
 	}
 
-	/* no available servers
-	 * - TODO: handle the no active servers case better
-	 */
-	up_read(&volume->server_sem);
-	_leave(" = %d", ret);
-	return ERR_PTR(ret);
-}
+	if (!test_bit(AFS_VOLUME_WAIT, &volume->flags)) {
+		_leave(" = 0 [no wait]");
+		return 0;
+	}
 
-/*
- * release a server after use
- * - releases the ref on the server struct that was acquired by picking
- * - records result of using a particular server to access a volume
- * - return 0 to try again, 1 if okay or to issue error
- * - the caller must release the server struct if result was 0
- */
-int afs_volume_release_fileserver(struct afs_vnode *vnode,
-				  struct afs_server *server,
-				  int result)
-{
-	struct afs_volume *volume = vnode->volume;
-	unsigned loop;
-
-	_enter("%s,%08x,%d",
-	       volume->vlocation->vldb.name, ntohl(server->addr.s_addr),
-	       result);
-
-	switch (result) {
-		/* success */
-	case 0:
-		server->fs_act_jif = jiffies;
-		server->fs_state = 0;
-		_leave("");
-		return 1;
-
-		/* the fileserver denied all knowledge of the volume */
-	case -ENOMEDIUM:
-		server->fs_act_jif = jiffies;
-		down_write(&volume->server_sem);
-
-		/* firstly, find where the server is in the active list (if it
-		 * is) */
-		for (loop = 0; loop < volume->nservers; loop++)
-			if (volume->servers[loop] == server)
-				goto present;
-
-		/* no longer there - may have been discarded by another op */
-		goto try_next_server_upw;
-
-	present:
-		volume->nservers--;
-		memmove(&volume->servers[loop],
-			&volume->servers[loop + 1],
-			sizeof(volume->servers[loop]) *
-			(volume->nservers - loop));
-		volume->servers[volume->nservers] = NULL;
-		afs_put_server(server);
-		volume->rjservers++;
-
-		if (volume->nservers > 0)
-			/* another server might acknowledge its existence */
-			goto try_next_server_upw;
-
-		/* handle the case where all the fileservers have rejected the
-		 * volume
-		 * - TODO: try asking the fileservers for volume information
-		 * - TODO: contact the VL server again to see if the volume is
-		 *         no longer registered
-		 */
-		up_write(&volume->server_sem);
-		afs_put_server(server);
-		_leave(" [completely rejected]");
-		return 1;
-
-		/* problem reaching the server */
-	case -ENETUNREACH:
-	case -EHOSTUNREACH:
-	case -ECONNREFUSED:
-	case -ETIME:
-	case -ETIMEDOUT:
-	case -EREMOTEIO:
-		/* mark the server as dead
-		 * TODO: vary dead timeout depending on error
-		 */
-		spin_lock(&server->fs_lock);
-		if (!server->fs_state) {
-			server->fs_dead_jif = jiffies + HZ * 10;
-			server->fs_state = result;
-			printk("kAFS: SERVER DEAD state=%d\n", result);
-		}
-		spin_unlock(&server->fs_lock);
-		goto try_next_server;
-
-		/* miscellaneous error */
-	default:
-		server->fs_act_jif = jiffies;
-	case -ENOMEM:
-	case -ENONET:
-		/* tell the caller to accept the result */
-		afs_put_server(server);
-		_leave(" [local failure]");
-		return 1;
+	ret = wait_on_bit(&volume->flags, AFS_VOLUME_WAIT, TASK_INTERRUPTIBLE);
+	if (ret == -ERESTARTSYS) {
+		_leave(" = %d", ret);
+		return ret;
 	}
 
-	/* tell the caller to loop around and try the next server */
-try_next_server_upw:
-	up_write(&volume->server_sem);
-try_next_server:
-	afs_put_server(server);
-	_leave(" [try next server]");
-	return 0;
+	retries++;
+	if (retries == 4) {
+		_leave(" = -ESTALE");
+		return -ESTALE;
+	}
+	goto retry;
 }
diff --git a/fs/afs/write.c b/fs/afs/write.c
index 106e43db1115..18e46e31523c 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -8,6 +8,7 @@
  * as published by the Free Software Foundation; either version
  * 2 of the License, or (at your option) any later version.
  */
+
 #include <linux/backing-dev.h>
 #include <linux/slab.h>
 #include <linux/fs.h>
@@ -16,9 +17,6 @@
 #include <linux/pagevec.h>
 #include "internal.h"
 
-static int afs_write_back_from_locked_page(struct afs_writeback *wb,
-					   struct page *page);
-
 /*
  * mark a page as having been made dirty and thus needing writeback
  */
@@ -29,58 +27,6 @@ int afs_set_page_dirty(struct page *page)
 }
 
 /*
- * unlink a writeback record because its usage has reached zero
- * - must be called with the wb->vnode->writeback_lock held
- */
-static void afs_unlink_writeback(struct afs_writeback *wb)
-{
-	struct afs_writeback *front;
-	struct afs_vnode *vnode = wb->vnode;
-
-	list_del_init(&wb->link);
-	if (!list_empty(&vnode->writebacks)) {
-		/* if an fsync rises to the front of the queue then wake it
-		 * up */
-		front = list_entry(vnode->writebacks.next,
-				   struct afs_writeback, link);
-		if (front->state == AFS_WBACK_SYNCING) {
-			_debug("wake up sync");
-			front->state = AFS_WBACK_COMPLETE;
-			wake_up(&front->waitq);
-		}
-	}
-}
-
-/*
- * free a writeback record
- */
-static void afs_free_writeback(struct afs_writeback *wb)
-{
-	_enter("");
-	key_put(wb->key);
-	kfree(wb);
-}
-
-/*
- * dispose of a reference to a writeback record
- */
-void afs_put_writeback(struct afs_writeback *wb)
-{
-	struct afs_vnode *vnode = wb->vnode;
-
-	_enter("{%d}", wb->usage);
-
-	spin_lock(&vnode->writeback_lock);
-	if (--wb->usage == 0)
-		afs_unlink_writeback(wb);
-	else
-		wb = NULL;
-	spin_unlock(&vnode->writeback_lock);
-	if (wb)
-		afs_free_writeback(wb);
-}
-
-/*
  * partly or wholly fill a page that's under preparation for writing
  */
 static int afs_fill_page(struct afs_vnode *vnode, struct key *key,
@@ -103,7 +49,7 @@ static int afs_fill_page(struct afs_vnode *vnode, struct key *key,
 	req->pages[0] = page;
 	get_page(page);
 
-	ret = afs_vnode_fetch_data(vnode, key, req);
+	ret = afs_fetch_data(vnode, key, req);
 	afs_put_read(req);
 	if (ret < 0) {
 		if (ret == -ENOENT) {
@@ -125,42 +71,32 @@ int afs_write_begin(struct file *file, struct address_space *mapping,
 		    loff_t pos, unsigned len, unsigned flags,
 		    struct page **pagep, void **fsdata)
 {
-	struct afs_writeback *candidate, *wb;
 	struct afs_vnode *vnode = AFS_FS_I(file_inode(file));
 	struct page *page;
-	struct key *key = file->private_data;
-	unsigned from = pos & (PAGE_SIZE - 1);
-	unsigned to = from + len;
+	struct key *key = afs_file_key(file);
+	unsigned long priv;
+	unsigned f, from = pos & (PAGE_SIZE - 1);
+	unsigned t, to = from + len;
 	pgoff_t index = pos >> PAGE_SHIFT;
 	int ret;
 
 	_enter("{%x:%u},{%lx},%u,%u",
 	       vnode->fid.vid, vnode->fid.vnode, index, from, to);
 
-	candidate = kzalloc(sizeof(*candidate), GFP_KERNEL);
-	if (!candidate)
-		return -ENOMEM;
-	candidate->vnode = vnode;
-	candidate->first = candidate->last = index;
-	candidate->offset_first = from;
-	candidate->to_last = to;
-	INIT_LIST_HEAD(&candidate->link);
-	candidate->usage = 1;
-	candidate->state = AFS_WBACK_PENDING;
-	init_waitqueue_head(&candidate->waitq);
+	/* We want to store information about how much of a page is altered in
+	 * page->private.
+	 */
+	BUILD_BUG_ON(PAGE_SIZE > 32768 && sizeof(page->private) < 8);
 
 	page = grab_cache_page_write_begin(mapping, index, flags);
-	if (!page) {
-		kfree(candidate);
+	if (!page)
 		return -ENOMEM;
-	}
 
 	if (!PageUptodate(page) && len != PAGE_SIZE) {
 		ret = afs_fill_page(vnode, key, pos & PAGE_MASK, PAGE_SIZE, page);
 		if (ret < 0) {
 			unlock_page(page);
 			put_page(page);
-			kfree(candidate);
 			_leave(" = %d [prep]", ret);
 			return ret;
 		}
@@ -171,79 +107,54 @@ int afs_write_begin(struct file *file, struct address_space *mapping,
 	*pagep = page;
 
 try_again:
-	spin_lock(&vnode->writeback_lock);
-
-	/* see if this page is already pending a writeback under a suitable key
-	 * - if so we can just join onto that one */
-	wb = (struct afs_writeback *) page_private(page);
-	if (wb) {
-		if (wb->key == key && wb->state == AFS_WBACK_PENDING)
-			goto subsume_in_current_wb;
-		goto flush_conflicting_wb;
+	/* See if this page is already partially written in a way that we can
+	 * merge the new write with.
+	 */
+	t = f = 0;
+	if (PagePrivate(page)) {
+		priv = page_private(page);
+		f = priv & AFS_PRIV_MAX;
+		t = priv >> AFS_PRIV_SHIFT;
+		ASSERTCMP(f, <=, t);
 	}
 
-	if (index > 0) {
-		/* see if we can find an already pending writeback that we can
-		 * append this page to */
-		list_for_each_entry(wb, &vnode->writebacks, link) {
-			if (wb->last == index - 1 && wb->key == key &&
-			    wb->state == AFS_WBACK_PENDING)
-				goto append_to_previous_wb;
-		}
+	if (f != t) {
+		if (to < f || from > t)
+			goto flush_conflicting_write;
+		if (from < f)
+			f = from;
+		if (to > t)
+			t = to;
+	} else {
+		f = from;
+		t = to;
 	}
 
-	list_add_tail(&candidate->link, &vnode->writebacks);
-	candidate->key = key_get(key);
-	spin_unlock(&vnode->writeback_lock);
-	SetPagePrivate(page);
-	set_page_private(page, (unsigned long) candidate);
-	_leave(" = 0 [new]");
-	return 0;
-
-subsume_in_current_wb:
-	_debug("subsume");
-	ASSERTRANGE(wb->first, <=, index, <=, wb->last);
-	if (index == wb->first && from < wb->offset_first)
-		wb->offset_first = from;
-	if (index == wb->last && to > wb->to_last)
-		wb->to_last = to;
-	spin_unlock(&vnode->writeback_lock);
-	kfree(candidate);
-	_leave(" = 0 [sub]");
-	return 0;
-
-append_to_previous_wb:
-	_debug("append into %lx-%lx", wb->first, wb->last);
-	wb->usage++;
-	wb->last++;
-	wb->to_last = to;
-	spin_unlock(&vnode->writeback_lock);
+	priv = (unsigned long)t << AFS_PRIV_SHIFT;
+	priv |= f;
+	trace_afs_page_dirty(vnode, tracepoint_string("begin"),
+			     page->index, priv);
 	SetPagePrivate(page);
-	set_page_private(page, (unsigned long) wb);
-	kfree(candidate);
-	_leave(" = 0 [app]");
+	set_page_private(page, priv);
+	_leave(" = 0");
 	return 0;
 
-	/* the page is currently bound to another context, so if it's dirty we
-	 * need to flush it before we can use the new context */
-flush_conflicting_wb:
+	/* The previous write and this write aren't adjacent or overlapping, so
+	 * flush the page out.
+	 */
+flush_conflicting_write:
 	_debug("flush conflict");
-	if (wb->state == AFS_WBACK_PENDING)
-		wb->state = AFS_WBACK_CONFLICTING;
-	spin_unlock(&vnode->writeback_lock);
-	if (clear_page_dirty_for_io(page)) {
-		ret = afs_write_back_from_locked_page(wb, page);
-		if (ret < 0) {
-			afs_put_writeback(candidate);
-			_leave(" = %d", ret);
-			return ret;
-		}
+	ret = write_one_page(page);
+	if (ret < 0) {
+		_leave(" = %d", ret);
+		return ret;
 	}
 
-	/* the page holds a ref on the writeback record */
-	afs_put_writeback(wb);
-	set_page_private(page, 0);
-	ClearPagePrivate(page);
+	ret = lock_page_killable(page);
+	if (ret < 0) {
+		_leave(" = %d", ret);
+		return ret;
+	}
 	goto try_again;
 }
 
@@ -255,7 +166,7 @@ int afs_write_end(struct file *file, struct address_space *mapping,
 		  struct page *page, void *fsdata)
 {
 	struct afs_vnode *vnode = AFS_FS_I(file_inode(file));
-	struct key *key = file->private_data;
+	struct key *key = afs_file_key(file);
 	loff_t i_size, maybe_i_size;
 	int ret;
 
@@ -266,11 +177,11 @@ int afs_write_end(struct file *file, struct address_space *mapping,
 
 	i_size = i_size_read(&vnode->vfs_inode);
 	if (maybe_i_size > i_size) {
-		spin_lock(&vnode->writeback_lock);
+		spin_lock(&vnode->wb_lock);
 		i_size = i_size_read(&vnode->vfs_inode);
 		if (maybe_i_size > i_size)
 			i_size_write(&vnode->vfs_inode, maybe_i_size);
-		spin_unlock(&vnode->writeback_lock);
+		spin_unlock(&vnode->wb_lock);
 	}
 
 	if (!PageUptodate(page)) {
@@ -299,16 +210,17 @@ int afs_write_end(struct file *file, struct address_space *mapping,
 /*
  * kill all the pages in the given range
  */
-static void afs_kill_pages(struct afs_vnode *vnode, bool error,
+static void afs_kill_pages(struct address_space *mapping,
 			   pgoff_t first, pgoff_t last)
 {
+	struct afs_vnode *vnode = AFS_FS_I(mapping->host);
 	struct pagevec pv;
 	unsigned count, loop;
 
 	_enter("{%x:%u},%lx-%lx",
 	       vnode->fid.vid, vnode->fid.vnode, first, last);
 
-	pagevec_init(&pv, 0);
+	pagevec_init(&pv);
 
 	do {
 		_debug("kill %lx-%lx", first, last);
@@ -316,37 +228,157 @@ static void afs_kill_pages(struct afs_vnode *vnode, bool error,
 		count = last - first + 1;
 		if (count > PAGEVEC_SIZE)
 			count = PAGEVEC_SIZE;
-		pv.nr = find_get_pages_contig(vnode->vfs_inode.i_mapping,
-					      first, count, pv.pages);
+		pv.nr = find_get_pages_contig(mapping, first, count, pv.pages);
 		ASSERTCMP(pv.nr, ==, count);
 
 		for (loop = 0; loop < count; loop++) {
 			struct page *page = pv.pages[loop];
 			ClearPageUptodate(page);
-			if (error)
-				SetPageError(page);
-			if (PageWriteback(page))
-				end_page_writeback(page);
+			SetPageError(page);
+			end_page_writeback(page);
+			if (page->index >= first)
+				first = page->index + 1;
+			lock_page(page);
+			generic_error_remove_page(mapping, page);
+		}
+
+		__pagevec_release(&pv);
+	} while (first <= last);
+
+	_leave("");
+}
+
+/*
+ * Redirty all the pages in a given range.
+ */
+static void afs_redirty_pages(struct writeback_control *wbc,
+			      struct address_space *mapping,
+			      pgoff_t first, pgoff_t last)
+{
+	struct afs_vnode *vnode = AFS_FS_I(mapping->host);
+	struct pagevec pv;
+	unsigned count, loop;
+
+	_enter("{%x:%u},%lx-%lx",
+	       vnode->fid.vid, vnode->fid.vnode, first, last);
+
+	pagevec_init(&pv);
+
+	do {
+		_debug("redirty %lx-%lx", first, last);
+
+		count = last - first + 1;
+		if (count > PAGEVEC_SIZE)
+			count = PAGEVEC_SIZE;
+		pv.nr = find_get_pages_contig(mapping, first, count, pv.pages);
+		ASSERTCMP(pv.nr, ==, count);
+
+		for (loop = 0; loop < count; loop++) {
+			struct page *page = pv.pages[loop];
+
+			redirty_page_for_writepage(wbc, page);
+			end_page_writeback(page);
 			if (page->index >= first)
 				first = page->index + 1;
 		}
 
 		__pagevec_release(&pv);
-	} while (first < last);
+	} while (first <= last);
 
 	_leave("");
 }
 
 /*
- * synchronously write back the locked page and any subsequent non-locked dirty
- * pages also covered by the same writeback record
+ * write to a file
  */
-static int afs_write_back_from_locked_page(struct afs_writeback *wb,
-					   struct page *primary_page)
+static int afs_store_data(struct address_space *mapping,
+			  pgoff_t first, pgoff_t last,
+			  unsigned offset, unsigned to)
 {
+	struct afs_vnode *vnode = AFS_FS_I(mapping->host);
+	struct afs_fs_cursor fc;
+	struct afs_wb_key *wbk = NULL;
+	struct list_head *p;
+	int ret = -ENOKEY, ret2;
+
+	_enter("%s{%x:%u.%u},%lx,%lx,%x,%x",
+	       vnode->volume->name,
+	       vnode->fid.vid,
+	       vnode->fid.vnode,
+	       vnode->fid.unique,
+	       first, last, offset, to);
+
+	spin_lock(&vnode->wb_lock);
+	p = vnode->wb_keys.next;
+
+	/* Iterate through the list looking for a valid key to use. */
+try_next_key:
+	while (p != &vnode->wb_keys) {
+		wbk = list_entry(p, struct afs_wb_key, vnode_link);
+		_debug("wbk %u", key_serial(wbk->key));
+		ret2 = key_validate(wbk->key);
+		if (ret2 == 0)
+			goto found_key;
+		if (ret == -ENOKEY)
+			ret = ret2;
+		p = p->next;
+	}
+
+	spin_unlock(&vnode->wb_lock);
+	afs_put_wb_key(wbk);
+	_leave(" = %d [no keys]", ret);
+	return ret;
+
+found_key:
+	refcount_inc(&wbk->usage);
+	spin_unlock(&vnode->wb_lock);
+
+	_debug("USE WB KEY %u", key_serial(wbk->key));
+
+	ret = -ERESTARTSYS;
+	if (afs_begin_vnode_operation(&fc, vnode, wbk->key)) {
+		while (afs_select_fileserver(&fc)) {
+			fc.cb_break = vnode->cb_break + vnode->cb_s_break;
+			afs_fs_store_data(&fc, mapping, first, last, offset, to);
+		}
+
+		afs_check_for_remote_deletion(&fc, fc.vnode);
+		afs_vnode_commit_status(&fc, vnode, fc.cb_break);
+		ret = afs_end_vnode_operation(&fc);
+	}
+
+	switch (ret) {
+	case -EACCES:
+	case -EPERM:
+	case -ENOKEY:
+	case -EKEYEXPIRED:
+	case -EKEYREJECTED:
+	case -EKEYREVOKED:
+		_debug("next");
+		spin_lock(&vnode->wb_lock);
+		p = wbk->vnode_link.next;
+		afs_put_wb_key(wbk);
+		goto try_next_key;
+	}
+
+	afs_put_wb_key(wbk);
+	_leave(" = %d", ret);
+	return ret;
+}
+
+/*
+ * Synchronously write back the locked page and any subsequent non-locked dirty
+ * pages.
+ */
+static int afs_write_back_from_locked_page(struct address_space *mapping,
+					   struct writeback_control *wbc,
+					   struct page *primary_page,
+					   pgoff_t final_page)
+{
+	struct afs_vnode *vnode = AFS_FS_I(mapping->host);
 	struct page *pages[8], *page;
-	unsigned long count;
-	unsigned n, offset, to;
+	unsigned long count, priv;
+	unsigned n, offset, to, f, t;
 	pgoff_t start, first, last;
 	int loop, ret;
 
@@ -356,20 +388,33 @@ static int afs_write_back_from_locked_page(struct afs_writeback *wb,
 	if (test_set_page_writeback(primary_page))
 		BUG();
 
-	/* find all consecutive lockable dirty pages, stopping when we find a
-	 * page that is not immediately lockable, is not dirty or is missing,
-	 * or we reach the end of the range */
+	/* Find all consecutive lockable dirty pages that have contiguous
+	 * written regions, stopping when we find a page that is not
+	 * immediately lockable, is not dirty or is missing, or we reach the
+	 * end of the range.
+	 */
 	start = primary_page->index;
-	if (start >= wb->last)
+	priv = page_private(primary_page);
+	offset = priv & AFS_PRIV_MAX;
+	to = priv >> AFS_PRIV_SHIFT;
+	trace_afs_page_dirty(vnode, tracepoint_string("store"),
+			     primary_page->index, priv);
+
+	WARN_ON(offset == to);
+	if (offset == to)
+		trace_afs_page_dirty(vnode, tracepoint_string("WARN"),
+				     primary_page->index, priv);
+
+	if (start >= final_page || to < PAGE_SIZE)
 		goto no_more;
+
 	start++;
 	do {
 		_debug("more %lx [%lx]", start, count);
-		n = wb->last - start + 1;
+		n = final_page - start + 1;
 		if (n > ARRAY_SIZE(pages))
 			n = ARRAY_SIZE(pages);
-		n = find_get_pages_contig(wb->vnode->vfs_inode.i_mapping,
-					  start, n, pages);
+		n = find_get_pages_contig(mapping, start, ARRAY_SIZE(pages), pages);
 		_debug("fgpc %u", n);
 		if (n == 0)
 			goto no_more;
@@ -381,16 +426,30 @@ static int afs_write_back_from_locked_page(struct afs_writeback *wb,
 		}
 
 		for (loop = 0; loop < n; loop++) {
+			if (to != PAGE_SIZE)
+				break;
 			page = pages[loop];
-			if (page->index > wb->last)
+			if (page->index > final_page)
 				break;
 			if (!trylock_page(page))
 				break;
-			if (!PageDirty(page) ||
-			    page_private(page) != (unsigned long) wb) {
+			if (!PageDirty(page) || PageWriteback(page)) {
 				unlock_page(page);
 				break;
 			}
+
+			priv = page_private(page);
+			f = priv & AFS_PRIV_MAX;
+			t = priv >> AFS_PRIV_SHIFT;
+			if (f != 0) {
+				unlock_page(page);
+				break;
+			}
+			to = t;
+
+			trace_afs_page_dirty(vnode, tracepoint_string("store+"),
+					     page->index, priv);
+
 			if (!clear_page_dirty_for_io(page))
 				BUG();
 			if (test_set_page_writeback(page))
@@ -406,50 +465,55 @@ static int afs_write_back_from_locked_page(struct afs_writeback *wb,
 		}
 
 		start += loop;
-	} while (start <= wb->last && count < 65536);
+	} while (start <= final_page && count < 65536);
 
 no_more:
-	/* we now have a contiguous set of dirty pages, each with writeback set
-	 * and the dirty mark cleared; the first page is locked and must remain
-	 * so, all the rest are unlocked */
+	/* We now have a contiguous set of dirty pages, each with writeback
+	 * set; the first page is still locked at this point, but all the rest
+	 * have been unlocked.
+	 */
+	unlock_page(primary_page);
+
 	first = primary_page->index;
 	last = first + count - 1;
 
-	offset = (first == wb->first) ? wb->offset_first : 0;
-	to = (last == wb->last) ? wb->to_last : PAGE_SIZE;
-
 	_debug("write back %lx[%u..] to %lx[..%u]", first, offset, last, to);
 
-	ret = afs_vnode_store_data(wb, first, last, offset, to);
-	if (ret < 0) {
-		switch (ret) {
-		case -EDQUOT:
-		case -ENOSPC:
-			mapping_set_error(wb->vnode->vfs_inode.i_mapping, -ENOSPC);
-			break;
-		case -EROFS:
-		case -EIO:
-		case -EREMOTEIO:
-		case -EFBIG:
-		case -ENOENT:
-		case -ENOMEDIUM:
-		case -ENXIO:
-			afs_kill_pages(wb->vnode, true, first, last);
-			mapping_set_error(wb->vnode->vfs_inode.i_mapping, -EIO);
-			break;
-		case -EACCES:
-		case -EPERM:
-		case -ENOKEY:
-		case -EKEYEXPIRED:
-		case -EKEYREJECTED:
-		case -EKEYREVOKED:
-			afs_kill_pages(wb->vnode, false, first, last);
-			break;
-		default:
-			break;
-		}
-	} else {
+	ret = afs_store_data(mapping, first, last, offset, to);
+	switch (ret) {
+	case 0:
 		ret = count;
+		break;
+
+	default:
+		pr_notice("kAFS: Unexpected error from FS.StoreData %d\n", ret);
+		/* Fall through */
+	case -EACCES:
+	case -EPERM:
+	case -ENOKEY:
+	case -EKEYEXPIRED:
+	case -EKEYREJECTED:
+	case -EKEYREVOKED:
+		afs_redirty_pages(wbc, mapping, first, last);
+		mapping_set_error(mapping, ret);
+		break;
+
+	case -EDQUOT:
+	case -ENOSPC:
+		afs_redirty_pages(wbc, mapping, first, last);
+		mapping_set_error(mapping, -ENOSPC);
+		break;
+
+	case -EROFS:
+	case -EIO:
+	case -EREMOTEIO:
+	case -EFBIG:
+	case -ENOENT:
+	case -ENOMEDIUM:
+	case -ENXIO:
+		afs_kill_pages(mapping, first, last);
+		mapping_set_error(mapping, ret);
+		break;
 	}
 
 	_leave(" = %d", ret);
@@ -462,16 +526,12 @@ no_more:
  */
 int afs_writepage(struct page *page, struct writeback_control *wbc)
 {
-	struct afs_writeback *wb;
 	int ret;
 
 	_enter("{%lx},", page->index);
 
-	wb = (struct afs_writeback *) page_private(page);
-	ASSERT(wb != NULL);
-
-	ret = afs_write_back_from_locked_page(wb, page);
-	unlock_page(page);
+	ret = afs_write_back_from_locked_page(page->mapping, wbc, page,
+					      wbc->range_end >> PAGE_SHIFT);
 	if (ret < 0) {
 		_leave(" = %d", ret);
 		return 0;
@@ -490,33 +550,30 @@ static int afs_writepages_region(struct address_space *mapping,
 				 struct writeback_control *wbc,
 				 pgoff_t index, pgoff_t end, pgoff_t *_next)
 {
-	struct afs_writeback *wb;
 	struct page *page;
 	int ret, n;
 
 	_enter(",,%lx,%lx,", index, end);
 
 	do {
-		n = find_get_pages_tag(mapping, &index, PAGECACHE_TAG_DIRTY,
-				       1, &page);
+		n = find_get_pages_range_tag(mapping, &index, end,
+					PAGECACHE_TAG_DIRTY, 1, &page);
 		if (!n)
 			break;
 
 		_debug("wback %lx", page->index);
 
-		if (page->index > end) {
-			*_next = index;
-			put_page(page);
-			_leave(" = 0 [%lx]", *_next);
-			return 0;
-		}
-
 		/* at this point we hold neither mapping->tree_lock nor lock on
 		 * the page itself: the page may be truncated or invalidated
 		 * (changing page->mapping to NULL), or even swizzled back from
 		 * swapper_space to tmpfs file mapping
 		 */
-		lock_page(page);
+		ret = lock_page_killable(page);
+		if (ret < 0) {
+			put_page(page);
+			_leave(" = %d", ret);
+			return ret;
+		}
 
 		if (page->mapping != mapping || !PageDirty(page)) {
 			unlock_page(page);
@@ -532,17 +589,9 @@ static int afs_writepages_region(struct address_space *mapping,
 			continue;
 		}
 
-		wb = (struct afs_writeback *) page_private(page);
-		ASSERT(wb != NULL);
-
-		spin_lock(&wb->vnode->writeback_lock);
-		wb->state = AFS_WBACK_WRITING;
-		spin_unlock(&wb->vnode->writeback_lock);
-
 		if (!clear_page_dirty_for_io(page))
 			BUG();
-		ret = afs_write_back_from_locked_page(wb, page);
-		unlock_page(page);
+		ret = afs_write_back_from_locked_page(mapping, wbc, page, end);
 		put_page(page);
 		if (ret < 0) {
 			_leave(" = %d", ret);
@@ -598,18 +647,15 @@ int afs_writepages(struct address_space *mapping,
  */
 void afs_pages_written_back(struct afs_vnode *vnode, struct afs_call *call)
 {
-	struct afs_writeback *wb = call->wb;
 	struct pagevec pv;
+	unsigned long priv;
 	unsigned count, loop;
 	pgoff_t first = call->first, last = call->last;
-	bool free_wb;
 
 	_enter("{%x:%u},{%lx-%lx}",
 	       vnode->fid.vid, vnode->fid.vnode, first, last);
 
-	ASSERT(wb != NULL);
-
-	pagevec_init(&pv, 0);
+	pagevec_init(&pv);
 
 	do {
 		_debug("done %lx-%lx", first, last);
@@ -617,35 +663,22 @@ void afs_pages_written_back(struct afs_vnode *vnode, struct afs_call *call)
 		count = last - first + 1;
 		if (count > PAGEVEC_SIZE)
 			count = PAGEVEC_SIZE;
-		pv.nr = find_get_pages_contig(call->mapping, first, count,
-					      pv.pages);
+		pv.nr = find_get_pages_contig(vnode->vfs_inode.i_mapping,
+					      first, count, pv.pages);
 		ASSERTCMP(pv.nr, ==, count);
 
-		spin_lock(&vnode->writeback_lock);
 		for (loop = 0; loop < count; loop++) {
-			struct page *page = pv.pages[loop];
-			end_page_writeback(page);
-			if (page_private(page) == (unsigned long) wb) {
-				set_page_private(page, 0);
-				ClearPagePrivate(page);
-				wb->usage--;
-			}
-		}
-		free_wb = false;
-		if (wb->usage == 0) {
-			afs_unlink_writeback(wb);
-			free_wb = true;
+			priv = page_private(pv.pages[loop]);
+			trace_afs_page_dirty(vnode, tracepoint_string("clear"),
+					     pv.pages[loop]->index, priv);
+			set_page_private(pv.pages[loop], 0);
+			end_page_writeback(pv.pages[loop]);
 		}
-		spin_unlock(&vnode->writeback_lock);
 		first += count;
-		if (free_wb) {
-			afs_free_writeback(wb);
-			wb = NULL;
-		}
-
 		__pagevec_release(&pv);
 	} while (first <= last);
 
+	afs_prune_wb_keys(vnode);
 	_leave("");
 }
 
@@ -677,28 +710,6 @@ ssize_t afs_file_write(struct kiocb *iocb, struct iov_iter *from)
 }
 
 /*
- * flush the vnode to the fileserver
- */
-int afs_writeback_all(struct afs_vnode *vnode)
-{
-	struct address_space *mapping = vnode->vfs_inode.i_mapping;
-	struct writeback_control wbc = {
-		.sync_mode	= WB_SYNC_ALL,
-		.nr_to_write	= LONG_MAX,
-		.range_cyclic	= 1,
-	};
-	int ret;
-
-	_enter("");
-
-	ret = mapping->a_ops->writepages(mapping, &wbc);
-	__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
-
-	_leave(" = %d", ret);
-	return ret;
-}
-
-/*
  * flush any dirty pages for this process, and check for write errors.
  * - the return status from this call provides a reliable indication of
  *   whether any write errors occurred for this process.
@@ -706,61 +717,13 @@ int afs_writeback_all(struct afs_vnode *vnode)
 int afs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
 {
 	struct inode *inode = file_inode(file);
-	struct afs_writeback *wb, *xwb;
 	struct afs_vnode *vnode = AFS_FS_I(inode);
-	int ret;
 
 	_enter("{%x:%u},{n=%pD},%d",
 	       vnode->fid.vid, vnode->fid.vnode, file,
 	       datasync);
 
-	ret = file_write_and_wait_range(file, start, end);
-	if (ret)
-		return ret;
-	inode_lock(inode);
-
-	/* use a writeback record as a marker in the queue - when this reaches
-	 * the front of the queue, all the outstanding writes are either
-	 * completed or rejected */
-	wb = kzalloc(sizeof(*wb), GFP_KERNEL);
-	if (!wb) {
-		ret = -ENOMEM;
-		goto out;
-	}
-	wb->vnode = vnode;
-	wb->first = 0;
-	wb->last = -1;
-	wb->offset_first = 0;
-	wb->to_last = PAGE_SIZE;
-	wb->usage = 1;
-	wb->state = AFS_WBACK_SYNCING;
-	init_waitqueue_head(&wb->waitq);
-
-	spin_lock(&vnode->writeback_lock);
-	list_for_each_entry(xwb, &vnode->writebacks, link) {
-		if (xwb->state == AFS_WBACK_PENDING)
-			xwb->state = AFS_WBACK_CONFLICTING;
-	}
-	list_add_tail(&wb->link, &vnode->writebacks);
-	spin_unlock(&vnode->writeback_lock);
-
-	/* push all the outstanding writebacks to the server */
-	ret = afs_writeback_all(vnode);
-	if (ret < 0) {
-		afs_put_writeback(wb);
-		_leave(" = %d [wb]", ret);
-		goto out;
-	}
-
-	/* wait for the preceding writes to actually complete */
-	ret = wait_event_interruptible(wb->waitq,
-				       wb->state == AFS_WBACK_COMPLETE ||
-				       vnode->writebacks.next == &wb->link);
-	afs_put_writeback(wb);
-	_leave(" = %d", ret);
-out:
-	inode_unlock(inode);
-	return ret;
+	return file_write_and_wait_range(file, start, end);
 }
 
 /*
@@ -781,19 +744,114 @@ int afs_flush(struct file *file, fl_owner_t id)
  * notification that a previously read-only page is about to become writable
  * - if it returns an error, the caller will deliver a bus error signal
  */
-int afs_page_mkwrite(struct vm_area_struct *vma, struct page *page)
+int afs_page_mkwrite(struct vm_fault *vmf)
 {
-	struct afs_vnode *vnode = AFS_FS_I(vma->vm_file->f_mapping->host);
+	struct file *file = vmf->vma->vm_file;
+	struct inode *inode = file_inode(file);
+	struct afs_vnode *vnode = AFS_FS_I(inode);
+	unsigned long priv;
 
 	_enter("{{%x:%u}},{%lx}",
-	       vnode->fid.vid, vnode->fid.vnode, page->index);
+	       vnode->fid.vid, vnode->fid.vnode, vmf->page->index);
+
+	sb_start_pagefault(inode->i_sb);
 
-	/* wait for the page to be written to the cache before we allow it to
-	 * be modified */
+	/* Wait for the page to be written to the cache before we allow it to
+	 * be modified.  We then assume the entire page will need writing back.
+	 */
 #ifdef CONFIG_AFS_FSCACHE
-	fscache_wait_on_page_write(vnode->cache, page);
+	fscache_wait_on_page_write(vnode->cache, vmf->page);
 #endif
 
-	_leave(" = 0");
-	return 0;
+	if (PageWriteback(vmf->page) &&
+	    wait_on_page_bit_killable(vmf->page, PG_writeback) < 0)
+		return VM_FAULT_RETRY;
+
+	if (lock_page_killable(vmf->page) < 0)
+		return VM_FAULT_RETRY;
+
+	/* We mustn't change page->private until writeback is complete as that
+	 * details the portion of the page we need to write back and we might
+	 * need to redirty the page if there's a problem.
+	 */
+	wait_on_page_writeback(vmf->page);
+
+	priv = (unsigned long)PAGE_SIZE << AFS_PRIV_SHIFT; /* To */
+	priv |= 0; /* From */
+	trace_afs_page_dirty(vnode, tracepoint_string("mkwrite"),
+			     vmf->page->index, priv);
+	SetPagePrivate(vmf->page);
+	set_page_private(vmf->page, priv);
+
+	sb_end_pagefault(inode->i_sb);
+	return VM_FAULT_LOCKED;
+}
+
+/*
+ * Prune the keys cached for writeback.  The caller must hold vnode->wb_lock.
+ */
+void afs_prune_wb_keys(struct afs_vnode *vnode)
+{
+	LIST_HEAD(graveyard);
+	struct afs_wb_key *wbk, *tmp;
+
+	/* Discard unused keys */
+	spin_lock(&vnode->wb_lock);
+
+	if (!mapping_tagged(&vnode->vfs_inode.i_data, PAGECACHE_TAG_WRITEBACK) &&
+	    !mapping_tagged(&vnode->vfs_inode.i_data, PAGECACHE_TAG_DIRTY)) {
+		list_for_each_entry_safe(wbk, tmp, &vnode->wb_keys, vnode_link) {
+			if (refcount_read(&wbk->usage) == 1)
+				list_move(&wbk->vnode_link, &graveyard);
+		}
+	}
+
+	spin_unlock(&vnode->wb_lock);
+
+	while (!list_empty(&graveyard)) {
+		wbk = list_entry(graveyard.next, struct afs_wb_key, vnode_link);
+		list_del(&wbk->vnode_link);
+		afs_put_wb_key(wbk);
+	}
+}
+
+/*
+ * Clean up a page during invalidation.
+ */
+int afs_launder_page(struct page *page)
+{
+	struct address_space *mapping = page->mapping;
+	struct afs_vnode *vnode = AFS_FS_I(mapping->host);
+	unsigned long priv;
+	unsigned int f, t;
+	int ret = 0;
+
+	_enter("{%lx}", page->index);
+
+	priv = page_private(page);
+	if (clear_page_dirty_for_io(page)) {
+		f = 0;
+		t = PAGE_SIZE;
+		if (PagePrivate(page)) {
+			f = priv & AFS_PRIV_MAX;
+			t = priv >> AFS_PRIV_SHIFT;
+		}
+
+		trace_afs_page_dirty(vnode, tracepoint_string("launder"),
+				     page->index, priv);
+		ret = afs_store_data(mapping, page->index, page->index, t, f);
+	}
+
+	trace_afs_page_dirty(vnode, tracepoint_string("laundered"),
+			     page->index, priv);
+	set_page_private(page, 0);
+	ClearPagePrivate(page);
+
+#ifdef CONFIG_AFS_FSCACHE
+	if (PageFsCache(page)) {
+		fscache_wait_on_page_write(vnode->cache, page);
+		fscache_uncache_page(vnode->cache, page);
+	}
+#endif
+	return ret;
 }
diff --git a/fs/afs/xattr.c b/fs/afs/xattr.c
index 2830e4f48d85..cfcc674e64a5 100644
--- a/fs/afs/xattr.c
+++ b/fs/afs/xattr.c
@@ -45,7 +45,7 @@ static int afs_xattr_get_cell(const struct xattr_handler *handler,
 	struct afs_cell *cell = vnode->volume->cell;
 	size_t namelen;
 
-	namelen = strlen(cell->name);
+	namelen = cell->name_len;
 	if (size == 0)
 		return namelen;
 	if (namelen > size)
@@ -96,7 +96,7 @@ static int afs_xattr_get_volume(const struct xattr_handler *handler,
 			      void *buffer, size_t size)
 {
 	struct afs_vnode *vnode = AFS_FS_I(inode);
-	const char *volname = vnode->volume->vlocation->vldb.name;
+	const char *volname = vnode->volume->name;
 	size_t namelen;
 
 	namelen = strlen(volname);
diff --git a/fs/aio.c b/fs/aio.c
index e6de7715228c..a062d75109cb 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1297,20 +1297,10 @@ static bool aio_read_events(struct kioctx *ctx, long min_nr, long nr,
 
 static long read_events(struct kioctx *ctx, long min_nr, long nr,
 			struct io_event __user *event,
-			struct timespec __user *timeout)
+			ktime_t until)
 {
-	ktime_t until = KTIME_MAX;
 	long ret = 0;
 
-	if (timeout) {
-		struct timespec	ts;
-
-		if (unlikely(copy_from_user(&ts, timeout, sizeof(ts))))
-			return -EFAULT;
-
-		until = timespec_to_ktime(ts);
-	}
-
 	/*
 	 * Note that aio_read_events() is being called as the conditional - i.e.
 	 * we're calling it after prepare_to_wait() has set task state to
@@ -1826,6 +1816,25 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb,
 	return ret;
 }
 
+static long do_io_getevents(aio_context_t ctx_id,
+		long min_nr,
+		long nr,
+		struct io_event __user *events,
+		struct timespec64 *ts)
+{
+	ktime_t until = ts ? timespec64_to_ktime(*ts) : KTIME_MAX;
+	struct kioctx *ioctx = lookup_ioctx(ctx_id);
+	long ret = -EINVAL;
+
+	if (likely(ioctx)) {
+		if (likely(min_nr <= nr && min_nr >= 0))
+			ret = read_events(ioctx, min_nr, nr, events, until);
+		percpu_ref_put(&ioctx->users);
+	}
+
+	return ret;
+}
+
 /* io_getevents:
  *	Attempts to read at least min_nr events and up to nr events from
  *	the completion queue for the aio_context specified by ctx_id. If
@@ -1844,15 +1853,14 @@ SYSCALL_DEFINE5(io_getevents, aio_context_t, ctx_id,
 		struct io_event __user *, events,
 		struct timespec __user *, timeout)
 {
-	struct kioctx *ioctx = lookup_ioctx(ctx_id);
-	long ret = -EINVAL;
+	struct timespec64	ts;
 
-	if (likely(ioctx)) {
-		if (likely(min_nr <= nr && min_nr >= 0))
-			ret = read_events(ioctx, min_nr, nr, events, timeout);
-		percpu_ref_put(&ioctx->users);
+	if (timeout) {
+		if (unlikely(get_timespec64(&ts, timeout)))
+			return -EFAULT;
 	}
-	return ret;
+
+	return do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &ts : NULL);
 }
 
 #ifdef CONFIG_COMPAT
@@ -1862,17 +1870,14 @@ COMPAT_SYSCALL_DEFINE5(io_getevents, compat_aio_context_t, ctx_id,
 		       struct io_event __user *, events,
 		       struct compat_timespec __user *, timeout)
 {
-	struct timespec t;
-	struct timespec __user *ut = NULL;
+	struct timespec64 t;
 
 	if (timeout) {
-		if (compat_get_timespec(&t, timeout))
+		if (compat_get_timespec64(&t, timeout))
 			return -EFAULT;
 
-		ut = compat_alloc_user_space(sizeof(*ut));
-		if (copy_to_user(ut, &t, sizeof(t)))
-			return -EFAULT;
 	}
-	return sys_io_getevents(ctx_id, min_nr, nr, events, ut);
+
+	return do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &t : NULL);
 }
 #endif
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 73b01e474fdc..83732fef510d 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -51,6 +51,11 @@
 #define user_siginfo_t siginfo_t
 #endif
 
+/* That's for binfmt_elf_fdpic to deal with */
+#ifndef elf_check_fdpic
+#define elf_check_fdpic(ex) false
+#endif
+
 static int load_elf_binary(struct linux_binprm *bprm);
 static unsigned long elf_map(struct file *, unsigned long, struct elf_phdr *,
 				int, int, unsigned long);
@@ -541,7 +546,8 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
 	if (interp_elf_ex->e_type != ET_EXEC &&
 	    interp_elf_ex->e_type != ET_DYN)
 		goto out;
-	if (!elf_check_arch(interp_elf_ex))
+	if (!elf_check_arch(interp_elf_ex) ||
+	    elf_check_fdpic(interp_elf_ex))
 		goto out;
 	if (!interpreter->f_op->mmap)
 		goto out;
@@ -718,6 +724,8 @@ static int load_elf_binary(struct linux_binprm *bprm)
 		goto out;
 	if (!elf_check_arch(&loc->elf_ex))
 		goto out;
+	if (elf_check_fdpic(&loc->elf_ex))
+		goto out;
 	if (!bprm->file->f_op->mmap)
 		goto out;
 
@@ -817,7 +825,8 @@ static int load_elf_binary(struct linux_binprm *bprm)
 		if (memcmp(loc->interp_elf_ex.e_ident, ELFMAG, SELFMAG) != 0)
 			goto out_free_dentry;
 		/* Verify the interpreter has a valid arch */
-		if (!elf_check_arch(&loc->interp_elf_ex))
+		if (!elf_check_arch(&loc->interp_elf_ex) ||
+		    elf_check_fdpic(&loc->interp_elf_ex))
 			goto out_free_dentry;
 
 		/* Load the interpreter program headers */
@@ -1190,6 +1199,8 @@ static int load_elf_library(struct file *file)
 	if (elf_ex.e_type != ET_EXEC || elf_ex.e_phnum > 2 ||
 	    !elf_check_arch(&elf_ex) || !file->f_op->mmap)
 		goto out;
+	if (elf_check_fdpic(&elf_ex))
+		goto out;
 
 	/* Now read in all of the header information */
 
@@ -1699,7 +1710,7 @@ static int fill_thread_core_info(struct elf_thread_core_info *t,
 				 long signr, size_t *total)
 {
 	unsigned int i;
-	unsigned int regset_size = view->regsets[0].n * view->regsets[0].size;
+	unsigned int regset0_size = regset_size(t->task, &view->regsets[0]);
 
 	/*
 	 * NT_PRSTATUS is the one special case, because the regset data
@@ -1708,11 +1719,11 @@ static int fill_thread_core_info(struct elf_thread_core_info *t,
 	 * We assume that regset 0 is NT_PRSTATUS.
 	 */
 	fill_prstatus(&t->prstatus, t->task, signr);
-	(void) view->regsets[0].get(t->task, &view->regsets[0], 0, regset_size,
+	(void) view->regsets[0].get(t->task, &view->regsets[0], 0, regset0_size,
 				    &t->prstatus.pr_reg, NULL);
 
 	fill_note(&t->notes[0], "CORE", NT_PRSTATUS,
-		  PRSTATUS_SIZE(t->prstatus, regset_size), &t->prstatus);
+		  PRSTATUS_SIZE(t->prstatus, regset0_size), &t->prstatus);
 	*total += notesize(&t->notes[0]);
 
 	do_thread_regset_writeback(t->task, &view->regsets[0]);
@@ -1728,7 +1739,7 @@ static int fill_thread_core_info(struct elf_thread_core_info *t,
 		if (regset->core_note_type && regset->get &&
 		    (!regset->active || regset->active(t->task, regset))) {
 			int ret;
-			size_t size = regset->n * regset->size;
+			size_t size = regset_size(t->task, regset);
 			void *data = kmalloc(size, GFP_KERNEL);
 			if (unlikely(!data))
 				return 0;
@@ -1743,7 +1754,7 @@ static int fill_thread_core_info(struct elf_thread_core_info *t,
 						  size, data);
 				else {
 					SET_PR_FPVALID(&t->prstatus,
-							1, regset_size);
+							1, regset0_size);
 					fill_note(&t->notes[i], "CORE",
 						  NT_PRFPREG, size, data);
 				}
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index e70c039ac190..429326b6e2e7 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -378,6 +378,11 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm)
 				 executable_stack);
 	if (retval < 0)
 		goto error;
+#ifdef ARCH_HAS_SETUP_ADDITIONAL_PAGES
+	retval = arch_setup_additional_pages(bprm, !!interpreter_name);
+	if (retval < 0)
+		goto error;
+#endif
 #endif
 
 	/* load the executable and interpreter into memory */
@@ -831,6 +836,9 @@ static int elf_fdpic_map_file(struct elf_fdpic_params *params,
 			if (phdr->p_vaddr >= seg->p_vaddr &&
 			    phdr->p_vaddr + phdr->p_memsz <=
 			    seg->p_vaddr + seg->p_memsz) {
+				Elf32_Dyn __user *dyn;
+				Elf32_Sword d_tag;
+
 				params->dynamic_addr =
 					(phdr->p_vaddr - seg->p_vaddr) +
 					seg->addr;
@@ -843,8 +851,9 @@ static int elf_fdpic_map_file(struct elf_fdpic_params *params,
 					goto dynamic_error;
 
 				tmp = phdr->p_memsz / sizeof(Elf32_Dyn);
-				if (((Elf32_Dyn *)
-				     params->dynamic_addr)[tmp - 1].d_tag != 0)
+				dyn = (Elf32_Dyn __user *)params->dynamic_addr;
+				__get_user(d_tag, &dyn[tmp - 1].d_tag);
+				if (d_tag != 0)
 					goto dynamic_error;
 				break;
 			}
@@ -1489,7 +1498,9 @@ static bool elf_fdpic_dump_segments(struct coredump_params *cprm)
 	struct vm_area_struct *vma;
 
 	for (vma = current->mm->mmap; vma; vma = vma->vm_next) {
+#ifdef CONFIG_MMU
 		unsigned long addr;
+#endif
 
 		if (!maydump(vma, cprm->mm_flags))
 			continue;
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 789f55e851ae..4a181fcb5175 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -54,18 +54,6 @@ struct block_device *I_BDEV(struct inode *inode)
 }
 EXPORT_SYMBOL(I_BDEV);
 
-void __vfs_msg(struct super_block *sb, const char *prefix, const char *fmt, ...)
-{
-	struct va_format vaf;
-	va_list args;
-
-	va_start(args, fmt);
-	vaf.fmt = fmt;
-	vaf.va = &args;
-	printk_ratelimited("%sVFS (%s): %pV\n", prefix, sb->s_id, &vaf);
-	va_end(args);
-}
-
 static void bdev_write_inode(struct block_device *bdev)
 {
 	struct inode *inode = bdev->bd_inode;
@@ -249,7 +237,7 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter,
 		if (!READ_ONCE(bio.bi_private))
 			break;
 		if (!(iocb->ki_flags & IOCB_HIPRI) ||
-		    !blk_mq_poll(bdev_get_queue(bdev), qc))
+		    !blk_poll(bdev_get_queue(bdev), qc))
 			io_schedule();
 	}
 	__set_current_state(TASK_RUNNING);
@@ -414,7 +402,7 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
 			break;
 
 		if (!(iocb->ki_flags & IOCB_HIPRI) ||
-		    !blk_mq_poll(bdev_get_queue(bdev), qc))
+		    !blk_poll(bdev_get_queue(bdev), qc))
 			io_schedule();
 	}
 	__set_current_state(TASK_RUNNING);
@@ -674,7 +662,7 @@ int bdev_read_page(struct block_device *bdev, sector_t sector,
 	if (!ops->rw_page || bdev_get_integrity(bdev))
 		return result;
 
-	result = blk_queue_enter(bdev->bd_queue, false);
+	result = blk_queue_enter(bdev->bd_queue, 0);
 	if (result)
 		return result;
 	result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, false);
@@ -710,7 +698,7 @@ int bdev_write_page(struct block_device *bdev, sector_t sector,
 
 	if (!ops->rw_page || bdev_get_integrity(bdev))
 		return -EOPNOTSUPP;
-	result = blk_queue_enter(bdev->bd_queue, false);
+	result = blk_queue_enter(bdev->bd_queue, 0);
 	if (result)
 		return result;
 
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 673ac4e01dd0..7208ecef7088 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3992,16 +3992,9 @@ void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr)
 	btrfs_put_block_group(bg);
 }
 
-static int btrfs_wait_nocow_writers_atomic_t(atomic_t *a)
-{
-	schedule();
-	return 0;
-}
-
 void btrfs_wait_nocow_writers(struct btrfs_block_group_cache *bg)
 {
-	wait_on_atomic_t(&bg->nocow_writers,
-			 btrfs_wait_nocow_writers_atomic_t,
+	wait_on_atomic_t(&bg->nocow_writers, atomic_t_wait,
 			 TASK_UNINTERRUPTIBLE);
 }
 
@@ -6530,12 +6523,6 @@ void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info,
 	btrfs_put_block_group(bg);
 }
 
-static int btrfs_wait_bg_reservations_atomic_t(atomic_t *a)
-{
-	schedule();
-	return 0;
-}
-
 void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg)
 {
 	struct btrfs_space_info *space_info = bg->space_info;
@@ -6558,8 +6545,7 @@ void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg)
 	down_write(&space_info->groups_sem);
 	up_write(&space_info->groups_sem);
 
-	wait_on_atomic_t(&bg->reservations,
-			 btrfs_wait_bg_reservations_atomic_t,
+	wait_on_atomic_t(&bg->reservations, atomic_t_wait,
 			 TASK_UNINTERRUPTIBLE);
 }
 
@@ -11059,12 +11045,6 @@ int btrfs_start_write_no_snapshotting(struct btrfs_root *root)
 	return 1;
 }
 
-static int wait_snapshotting_atomic_t(atomic_t *a)
-{
-	schedule();
-	return 0;
-}
-
 void btrfs_wait_for_snapshot_creation(struct btrfs_root *root)
 {
 	while (true) {
@@ -11073,8 +11053,7 @@ void btrfs_wait_for_snapshot_creation(struct btrfs_root *root)
 		ret = btrfs_start_write_no_snapshotting(root);
 		if (ret)
 			break;
-		wait_on_atomic_t(&root->will_be_snapshotted,
-				 wait_snapshotting_atomic_t,
+		wait_on_atomic_t(&root->will_be_snapshotted, atomic_t_wait,
 				 TASK_UNINTERRUPTIBLE);
 	}
 }
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index adbbc017191c..16045ea86fc1 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -3797,7 +3797,7 @@ int btree_write_cache_pages(struct address_space *mapping,
 	int scanned = 0;
 	int tag;
 
-	pagevec_init(&pvec, 0);
+	pagevec_init(&pvec);
 	if (wbc->range_cyclic) {
 		index = mapping->writeback_index; /* Start from prev offset */
 		end = -1;
@@ -3814,8 +3814,8 @@ retry:
 	if (wbc->sync_mode == WB_SYNC_ALL)
 		tag_pages_for_writeback(mapping, index, end);
 	while (!done && !nr_to_write_done && (index <= end) &&
-	       (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
-			min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
+	       (nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end,
+			tag))) {
 		unsigned i;
 
 		scanned = 1;
@@ -3825,11 +3825,6 @@ retry:
 			if (!PagePrivate(page))
 				continue;
 
-			if (!wbc->range_cyclic && page->index > end) {
-				done = 1;
-				break;
-			}
-
 			spin_lock(&mapping->private_lock);
 			if (!PagePrivate(page)) {
 				spin_unlock(&mapping->private_lock);
@@ -3941,7 +3936,7 @@ static int extent_write_cache_pages(struct address_space *mapping,
 	if (!igrab(inode))
 		return 0;
 
-	pagevec_init(&pvec, 0);
+	pagevec_init(&pvec);
 	if (wbc->range_cyclic) {
 		index = mapping->writeback_index; /* Start from prev offset */
 		end = -1;
@@ -3961,8 +3956,8 @@ retry:
 		tag_pages_for_writeback(mapping, index, end);
 	done_index = index;
 	while (!done && !nr_to_write_done && (index <= end) &&
-	       (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
-			min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
+			(nr_pages = pagevec_lookup_range_tag(&pvec, mapping,
+						&index, end, tag))) {
 		unsigned i;
 
 		scanned = 1;
@@ -3987,12 +3982,6 @@ retry:
 				continue;
 			}
 
-			if (!wbc->range_cyclic && page->index > end) {
-				done = 1;
-				unlock_page(page);
-				continue;
-			}
-
 			if (wbc->sync_mode != WB_SYNC_NONE) {
 				if (PageWriteback(page))
 					flush_fn(data);
diff --git a/fs/buffer.c b/fs/buffer.c
index 49b7e9bdcd1d..0736a6a2e2f0 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -253,27 +253,6 @@ out:
 }
 
 /*
- * Kick the writeback threads then try to free up some ZONE_NORMAL memory.
- */
-static void free_more_memory(void)
-{
-	struct zoneref *z;
-	int nid;
-
-	wakeup_flusher_threads(1024, WB_REASON_FREE_MORE_MEM);
-	yield();
-
-	for_each_online_node(nid) {
-
-		z = first_zones_zonelist(node_zonelist(nid, GFP_NOFS),
-						gfp_zone(GFP_NOFS), NULL);
-		if (z->zone)
-			try_to_free_pages(node_zonelist(nid, GFP_NOFS), 0,
-						GFP_NOFS, NULL);
-	}
-}
-
-/*
  * I/O completion handler for block_read_full_page() - pages
  * which come unlocked at the end of I/O.
  */
@@ -861,16 +840,19 @@ int remove_inode_buffers(struct inode *inode)
  * which may not fail from ordinary buffer allocations.
  */
 struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
-		int retry)
+		bool retry)
 {
 	struct buffer_head *bh, *head;
+	gfp_t gfp = GFP_NOFS;
 	long offset;
 
-try_again:
+	if (retry)
+		gfp |= __GFP_NOFAIL;
+
 	head = NULL;
 	offset = PAGE_SIZE;
 	while ((offset -= size) >= 0) {
-		bh = alloc_buffer_head(GFP_NOFS);
+		bh = alloc_buffer_head(gfp);
 		if (!bh)
 			goto no_grow;
 
@@ -896,23 +878,7 @@ no_grow:
 		} while (head);
 	}
 
-	/*
-	 * Return failure for non-async IO requests.  Async IO requests
-	 * are not allowed to fail, so we have to wait until buffer heads
-	 * become available.  But we don't want tasks sleeping with 
-	 * partially complete buffers, so all were released above.
-	 */
-	if (!retry)
-		return NULL;
-
-	/* We're _really_ low on memory. Now we just
-	 * wait for old buffer heads to become free due to
-	 * finishing IO.  Since this is an async request and
-	 * the reserve list is empty, we're sure there are 
-	 * async buffer heads in use.
-	 */
-	free_more_memory();
-	goto try_again;
+	return NULL;
 }
 EXPORT_SYMBOL_GPL(alloc_page_buffers);
 
@@ -1001,8 +967,6 @@ grow_dev_page(struct block_device *bdev, sector_t block,
 	gfp_mask |= __GFP_NOFAIL;
 
 	page = find_or_create_page(inode->i_mapping, index, gfp_mask);
-	if (!page)
-		return ret;
 
 	BUG_ON(!PageLocked(page));
 
@@ -1021,9 +985,7 @@ grow_dev_page(struct block_device *bdev, sector_t block,
 	/*
 	 * Allocate some buffers for this page
 	 */
-	bh = alloc_page_buffers(page, size, 0);
-	if (!bh)
-		goto failed;
+	bh = alloc_page_buffers(page, size, true);
 
 	/*
 	 * Link the page to the buffers and initialise them.  Take the
@@ -1103,8 +1065,6 @@ __getblk_slow(struct block_device *bdev, sector_t block,
 		ret = grow_buffers(bdev, block, size, gfp);
 		if (ret < 0)
 			return NULL;
-		if (ret == 0)
-			free_more_memory();
 	}
 }
 
@@ -1575,7 +1535,7 @@ void create_empty_buffers(struct page *page,
 {
 	struct buffer_head *bh, *head, *tail;
 
-	head = alloc_page_buffers(page, blocksize, 1);
+	head = alloc_page_buffers(page, blocksize, true);
 	bh = head;
 	do {
 		bh->b_state |= b_state;
@@ -1632,7 +1592,7 @@ void clean_bdev_aliases(struct block_device *bdev, sector_t block, sector_t len)
 	struct buffer_head *head;
 
 	end = (block + len - 1) >> (PAGE_SHIFT - bd_inode->i_blkbits);
-	pagevec_init(&pvec, 0);
+	pagevec_init(&pvec);
 	while (pagevec_lookup_range(&pvec, bd_mapping, &index, end)) {
 		count = pagevec_count(&pvec);
 		for (i = 0; i < count; i++) {
@@ -2639,7 +2599,7 @@ int nobh_write_begin(struct address_space *mapping,
 	 * Be careful: the buffer linked list is a NULL terminated one, rather
 	 * than the circular one we're used to.
 	 */
-	head = alloc_page_buffers(page, blocksize, 0);
+	head = alloc_page_buffers(page, blocksize, false);
 	if (!head) {
 		ret = -ENOMEM;
 		goto out_release;
@@ -3056,8 +3016,16 @@ void guard_bio_eod(int op, struct bio *bio)
 	sector_t maxsector;
 	struct bio_vec *bvec = &bio->bi_io_vec[bio->bi_vcnt - 1];
 	unsigned truncated_bytes;
+	struct hd_struct *part;
+
+	rcu_read_lock();
+	part = __disk_get_part(bio->bi_disk, bio->bi_partno);
+	if (part)
+		maxsector = part_nr_sects_read(part);
+	else
+		maxsector = get_capacity(bio->bi_disk);
+	rcu_read_unlock();
 
-	maxsector = get_capacity(bio->bi_disk);
 	if (!maxsector)
 		return;
 
@@ -3546,7 +3514,7 @@ page_cache_seek_hole_data(struct inode *inode, loff_t offset, loff_t length,
 	if (length <= 0)
 		return -ENOENT;
 
-	pagevec_init(&pvec, 0);
+	pagevec_init(&pvec);
 
 	do {
 		unsigned nr_pages, i;
diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c
index 18d7aa61ef0f..883bc7bb12c5 100644
--- a/fs/cachefiles/rdwr.c
+++ b/fs/cachefiles/rdwr.c
@@ -256,8 +256,7 @@ static int cachefiles_read_backing_file_one(struct cachefiles_object *object,
 			goto backing_page_already_present;
 
 		if (!newpage) {
-			newpage = __page_cache_alloc(cachefiles_gfp |
-						     __GFP_COLD);
+			newpage = __page_cache_alloc(cachefiles_gfp);
 			if (!newpage)
 				goto nomem_monitor;
 		}
@@ -493,8 +492,7 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object,
 				goto backing_page_already_present;
 
 			if (!newpage) {
-				newpage = __page_cache_alloc(cachefiles_gfp |
-							     __GFP_COLD);
+				newpage = __page_cache_alloc(cachefiles_gfp);
 				if (!newpage)
 					goto nomem;
 			}
@@ -710,7 +708,7 @@ int cachefiles_read_or_alloc_pages(struct fscache_retrieval *op,
 	/* calculate the shift required to use bmap */
 	shift = PAGE_SHIFT - inode->i_sb->s_blocksize_bits;
 
-	pagevec_init(&pagevec, 0);
+	pagevec_init(&pagevec);
 
 	op->op.flags &= FSCACHE_OP_KEEP_FLAGS;
 	op->op.flags |= FSCACHE_OP_ASYNC;
@@ -844,7 +842,7 @@ int cachefiles_allocate_pages(struct fscache_retrieval *op,
 
 	ret = cachefiles_has_space(cache, 0, *nr_pages);
 	if (ret == 0) {
-		pagevec_init(&pagevec, 0);
+		pagevec_init(&pagevec);
 
 		list_for_each_entry(page, pages, lru) {
 			if (pagevec_add(&pagevec, page) == 0)
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 4d622654bfbc..dbf07051aacd 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -680,7 +680,7 @@ static void ceph_release_pages(struct page **pages, int num)
 	struct pagevec pvec;
 	int i;
 
-	pagevec_init(&pvec, 0);
+	pagevec_init(&pvec);
 	for (i = 0; i < num; i++) {
 		if (pagevec_add(&pvec, pages[i]) == 0)
 			pagevec_release(&pvec);
@@ -811,7 +811,7 @@ static int ceph_writepages_start(struct address_space *mapping,
 	if (fsc->mount_options->wsize < wsize)
 		wsize = fsc->mount_options->wsize;
 
-	pagevec_init(&pvec, 0);
+	pagevec_init(&pvec);
 
 	start_index = wbc->range_cyclic ? mapping->writeback_index : 0;
 	index = start_index;
@@ -870,15 +870,10 @@ retry:
 		max_pages = wsize >> PAGE_SHIFT;
 
 get_more_pages:
-		pvec_pages = min_t(unsigned, PAGEVEC_SIZE,
-				   max_pages - locked_pages);
-		if (end - index < (u64)(pvec_pages - 1))
-			pvec_pages = (unsigned)(end - index) + 1;
-
-		pvec_pages = pagevec_lookup_tag(&pvec, mapping, &index,
-						PAGECACHE_TAG_DIRTY,
-						pvec_pages);
-		dout("pagevec_lookup_tag got %d\n", pvec_pages);
+		pvec_pages = pagevec_lookup_range_nr_tag(&pvec, mapping, &index,
+						end, PAGECACHE_TAG_DIRTY,
+						max_pages - locked_pages);
+		dout("pagevec_lookup_range_tag got %d\n", pvec_pages);
 		if (!pvec_pages && !locked_pages)
 			break;
 		for (i = 0; i < pvec_pages && locked_pages < max_pages; i++) {
@@ -896,16 +891,6 @@ get_more_pages:
 				unlock_page(page);
 				continue;
 			}
-			if (page->index > end) {
-				dout("end of range %p\n", page);
-				/* can't be range_cyclic (1st pass) because
-				 * end == -1 in that case. */
-				stop = true;
-				if (ceph_wbc.head_snapc)
-					done = true;
-				unlock_page(page);
-				break;
-			}
 			if (strip_unit_end && (page->index > strip_unit_end)) {
 				dout("end of strip unit %p\n", page);
 				unlock_page(page);
@@ -1177,8 +1162,7 @@ release_pvec_pages:
 			index = 0;
 			while ((index <= end) &&
 			       (nr = pagevec_lookup_tag(&pvec, mapping, &index,
-							PAGECACHE_TAG_WRITEBACK,
-							PAGEVEC_SIZE))) {
+						PAGECACHE_TAG_WRITEBACK))) {
 				for (i = 0; i < nr; i++) {
 					page = pvec.pages[i];
 					if (page_snap_context(page) != snapc)
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 92fdf9c35de2..df9f682708c6 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -1963,8 +1963,6 @@ wdata_alloc_and_fillpages(pgoff_t tofind, struct address_space *mapping,
 			  pgoff_t end, pgoff_t *index,
 			  unsigned int *found_pages)
 {
-	unsigned int nr_pages;
-	struct page **pages;
 	struct cifs_writedata *wdata;
 
 	wdata = cifs_writedata_alloc((unsigned int)tofind,
@@ -1972,23 +1970,8 @@ wdata_alloc_and_fillpages(pgoff_t tofind, struct address_space *mapping,
 	if (!wdata)
 		return NULL;
 
-	/*
-	 * find_get_pages_tag seems to return a max of 256 on each
-	 * iteration, so we must call it several times in order to
-	 * fill the array or the wsize is effectively limited to
-	 * 256 * PAGE_SIZE.
-	 */
-	*found_pages = 0;
-	pages = wdata->pages;
-	do {
-		nr_pages = find_get_pages_tag(mapping, index,
-					      PAGECACHE_TAG_DIRTY, tofind,
-					      pages);
-		*found_pages += nr_pages;
-		tofind -= nr_pages;
-		pages += nr_pages;
-	} while (nr_pages && tofind && *index <= end);
-
+	*found_pages = find_get_pages_range_tag(mapping, index, end,
+				PAGECACHE_TAG_DIRTY, tofind, wdata->pages);
 	return wdata;
 }
 
diff --git a/fs/coda/upcall.c b/fs/coda/upcall.c
index a37f003530d7..1175a1722411 100644
--- a/fs/coda/upcall.c
+++ b/fs/coda/upcall.c
@@ -447,8 +447,7 @@ int venus_fsync(struct super_block *sb, struct CodaFid *fid)
 	UPARG(CODA_FSYNC);
 
 	inp->coda_fsync.VFid = *fid;
-	error = coda_upcall(coda_vcp(sb), sizeof(union inputArgs),
-			    &outsize, inp);
+	error = coda_upcall(coda_vcp(sb), insize, &outsize, inp);
 
 	CODA_FREE(inp, insize);
 	return error;
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index bd5d91e119ca..5fc5dc660600 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -54,8 +54,6 @@
 #include <linux/if_tun.h>
 #include <linux/ctype.h>
 #include <linux/syscalls.h>
-#include <linux/i2c.h>
-#include <linux/i2c-dev.h>
 #include <linux/atalk.h>
 #include <linux/gfp.h>
 #include <linux/cec.h>
@@ -137,22 +135,6 @@ static int do_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 	return vfs_ioctl(file, cmd, arg);
 }
 
-static int w_long(struct file *file,
-		unsigned int cmd, compat_ulong_t __user *argp)
-{
-	int err;
-	unsigned long __user *valp = compat_alloc_user_space(sizeof(*valp));
-
-	if (valp == NULL)
-		return -EFAULT;
-	err = do_ioctl(file, cmd, (unsigned long)valp);
-	if (err)
-		return err;
-	if (convert_in_user(valp, argp))
-		return -EFAULT;
-	return 0;
-}
-
 struct compat_video_event {
 	int32_t		type;
 	compat_time_t	timestamp;
@@ -671,96 +653,6 @@ static int serial_struct_ioctl(struct file *file,
         return err;
 }
 
-/*
- * I2C layer ioctls
- */
-
-struct i2c_msg32 {
-	u16 addr;
-	u16 flags;
-	u16 len;
-	compat_caddr_t buf;
-};
-
-struct i2c_rdwr_ioctl_data32 {
-	compat_caddr_t msgs; /* struct i2c_msg __user *msgs */
-	u32 nmsgs;
-};
-
-struct i2c_smbus_ioctl_data32 {
-	u8 read_write;
-	u8 command;
-	u32 size;
-	compat_caddr_t data; /* union i2c_smbus_data *data */
-};
-
-struct i2c_rdwr_aligned {
-	struct i2c_rdwr_ioctl_data cmd;
-	struct i2c_msg msgs[0];
-};
-
-static int do_i2c_rdwr_ioctl(struct file *file,
-	unsigned int cmd, struct i2c_rdwr_ioctl_data32 __user *udata)
-{
-	struct i2c_rdwr_aligned		__user *tdata;
-	struct i2c_msg			__user *tmsgs;
-	struct i2c_msg32		__user *umsgs;
-	compat_caddr_t			datap;
-	u32				nmsgs;
-	int				i;
-
-	if (get_user(nmsgs, &udata->nmsgs))
-		return -EFAULT;
-	if (nmsgs > I2C_RDWR_IOCTL_MAX_MSGS)
-		return -EINVAL;
-
-	if (get_user(datap, &udata->msgs))
-		return -EFAULT;
-	umsgs = compat_ptr(datap);
-
-	tdata = compat_alloc_user_space(sizeof(*tdata) +
-				      nmsgs * sizeof(struct i2c_msg));
-	tmsgs = &tdata->msgs[0];
-
-	if (put_user(nmsgs, &tdata->cmd.nmsgs) ||
-	    put_user(tmsgs, &tdata->cmd.msgs))
-		return -EFAULT;
-
-	for (i = 0; i < nmsgs; i++) {
-		if (copy_in_user(&tmsgs[i].addr, &umsgs[i].addr, 3*sizeof(u16)))
-			return -EFAULT;
-		if (get_user(datap, &umsgs[i].buf) ||
-		    put_user(compat_ptr(datap), &tmsgs[i].buf))
-			return -EFAULT;
-	}
-	return do_ioctl(file, cmd, (unsigned long)tdata);
-}
-
-static int do_i2c_smbus_ioctl(struct file *file,
-		unsigned int cmd, struct i2c_smbus_ioctl_data32   __user *udata)
-{
-	struct i2c_smbus_ioctl_data	__user *tdata;
-	union {
-		/* beginnings of those have identical layouts */
-		struct i2c_smbus_ioctl_data32	data32;
-		struct i2c_smbus_ioctl_data	data;
-	} v;
-
-	tdata = compat_alloc_user_space(sizeof(*tdata));
-	if (tdata == NULL)
-		return -ENOMEM;
-
-	memset(&v, 0, sizeof(v));
-	if (copy_from_user(&v.data32, udata, sizeof(v.data32)))
-		return -EFAULT;
-	v.data.data = compat_ptr(v.data32.data);
-
-	if (copy_to_user(tdata, &v.data, sizeof(v.data)))
-		return -EFAULT;
-
-	return do_ioctl(file, cmd, (unsigned long)tdata);
-}
-
 #define RTC_IRQP_READ32		_IOR('p', 0x0b, compat_ulong_t)
 #define RTC_IRQP_SET32		_IOW('p', 0x0c, compat_ulong_t)
 #define RTC_EPOCH_READ32	_IOR('p', 0x0d, compat_ulong_t)
@@ -1283,13 +1175,6 @@ COMPATIBLE_IOCTL(PCIIOC_CONTROLLER)
 COMPATIBLE_IOCTL(PCIIOC_MMAP_IS_IO)
 COMPATIBLE_IOCTL(PCIIOC_MMAP_IS_MEM)
 COMPATIBLE_IOCTL(PCIIOC_WRITE_COMBINE)
-/* i2c */
-COMPATIBLE_IOCTL(I2C_SLAVE)
-COMPATIBLE_IOCTL(I2C_SLAVE_FORCE)
-COMPATIBLE_IOCTL(I2C_TENBIT)
-COMPATIBLE_IOCTL(I2C_PEC)
-COMPATIBLE_IOCTL(I2C_RETRIES)
-COMPATIBLE_IOCTL(I2C_TIMEOUT)
 /* hiddev */
 COMPATIBLE_IOCTL(HIDIOCGVERSION)
 COMPATIBLE_IOCTL(HIDIOCAPPLICATION)
@@ -1464,13 +1349,6 @@ static long do_ioctl_trans(unsigned int cmd,
 	case TIOCGSERIAL:
 	case TIOCSSERIAL:
 		return serial_struct_ioctl(file, cmd, argp);
-	/* i2c */
-	case I2C_FUNCS:
-		return w_long(file, cmd, argp);
-	case I2C_RDWR:
-		return do_i2c_rdwr_ioctl(file, cmd, argp);
-	case I2C_SMBUS:
-		return do_i2c_smbus_ioctl(file, cmd, argp);
 	/* Not implemented in the native kernel */
 	case RTC_IRQP_READ32:
 	case RTC_IRQP_SET32:
@@ -1580,6 +1458,7 @@ COMPAT_SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd,
 	case FICLONE:
 	case FICLONERANGE:
 	case FIDEDUPERANGE:
+	case FS_IOC_FIEMAP:
 		goto do_ioctl;
 
 	case FIBMAP:
diff --git a/fs/coredump.c b/fs/coredump.c
index 52c63d6c9143..1e2c87acac9b 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -680,16 +680,11 @@ void do_coredump(const siginfo_t *siginfo)
 		 * privs and don't want to unlink another user's coredump.
 		 */
 		if (!need_suid_safe) {
-			mm_segment_t old_fs;
-
-			old_fs = get_fs();
-			set_fs(KERNEL_DS);
 			/*
 			 * If it doesn't exist, that's fine. If there's some
 			 * other problem, we'll catch it at the filp_open().
 			 */
-			(void) sys_unlink((const char __user *)cn.corename);
-			set_fs(old_fs);
+			do_unlinkat(AT_FDCWD, getname_kernel(cn.corename));
 		}
 
 		/*
diff --git a/fs/cramfs/Kconfig b/fs/cramfs/Kconfig
index 11b29d491b7c..f937082f3244 100644
--- a/fs/cramfs/Kconfig
+++ b/fs/cramfs/Kconfig
@@ -1,6 +1,5 @@
 config CRAMFS
-	tristate "Compressed ROM file system support (cramfs) (OBSOLETE)"
-	depends on BLOCK
+	tristate "Compressed ROM file system support (cramfs)"
 	select ZLIB_INFLATE
 	help
 	  Saying Y here includes support for CramFs (Compressed ROM File
@@ -16,7 +15,39 @@ config CRAMFS
 	  cramfs.  Note that the root file system (the one containing the
 	  directory /) cannot be compiled as a module.
 
-	  This filesystem is obsoleted by SquashFS, which is much better
-	  in terms of performance and features.
+	  This filesystem is limited in capabilities and performance on
+	  purpose to remain small and low on RAM usage. It is most suitable
+	  for small embedded systems. If you have ample RAM to spare, you may
+	  consider a more capable compressed filesystem such as SquashFS
+	  which is much better in terms of performance and features.
+
+	  If unsure, say N.
+
+config CRAMFS_BLOCKDEV
+	bool "Support CramFs image over a regular block device" if EXPERT
+	depends on CRAMFS && BLOCK
+	default y
+	help
+	  This option allows the CramFs driver to load data from a regular
+	  block device such a disk partition or a ramdisk.
+
+config CRAMFS_MTD
+	bool "Support CramFs image directly mapped in physical memory"
+	depends on CRAMFS && MTD
+	default y if !CRAMFS_BLOCKDEV
+	help
+	  This option allows the CramFs driver to load data directly from
+	  a linear adressed memory range (usually non volatile memory
+	  like flash) instead of going through the block device layer.
+	  This saves some memory since no intermediate buffering is
+	  necessary.
+
+	  The location of the CramFs image is determined by a
+	  MTD device capable of direct memory mapping e.g. from
+	  the 'physmap' map driver or a resulting MTD partition.
+	  For example, this would mount the cramfs image stored in
+	  the MTD partition named "xip_fs" on the /mnt mountpoint:
+
+	  mount -t cramfs mtd:xip_fs /mnt
 
 	  If unsure, say N.
diff --git a/fs/cramfs/README b/fs/cramfs/README
index 9d4e7ea311f4..d71b27e0ff15 100644
--- a/fs/cramfs/README
+++ b/fs/cramfs/README
@@ -49,17 +49,46 @@ same as the start of the (i+1)'th <block> if there is one).  The first
 <block> immediately follows the last <block_pointer> for the file.
 <block_pointer>s are each 32 bits long.
 
+When the CRAMFS_FLAG_EXT_BLOCK_POINTERS capability bit is set, each
+<block_pointer>'s top bits may contain special flags as follows:
+
+CRAMFS_BLK_FLAG_UNCOMPRESSED (bit 31):
+	The block data is not compressed and should be copied verbatim.
+
+CRAMFS_BLK_FLAG_DIRECT_PTR (bit 30):
+	The <block_pointer> stores the actual block start offset and not
+	its end, shifted right by 2 bits. The block must therefore be
+	aligned to a 4-byte boundary. The block size is either blksize
+	if CRAMFS_BLK_FLAG_UNCOMPRESSED is also specified, otherwise
+	the compressed data length is included in the first 2 bytes of
+	the block data. This is used to allow discontiguous data layout
+	and specific data block alignments e.g. for XIP applications.
+
+
 The order of <file_data>'s is a depth-first descent of the directory
 tree, i.e. the same order as `find -size +0 \( -type f -o -type l \)
 -print'.
 
 
 <block>: The i'th <block> is the output of zlib's compress function
-applied to the i'th blksize-sized chunk of the input data.
+applied to the i'th blksize-sized chunk of the input data if the
+corresponding CRAMFS_BLK_FLAG_UNCOMPRESSED <block_ptr> bit is not set,
+otherwise it is the input data directly.
 (For the last <block> of the file, the input may of course be smaller.)
 Each <block> may be a different size.  (See <block_pointer> above.)
+
 <block>s are merely byte-aligned, not generally u32-aligned.
 
+When CRAMFS_BLK_FLAG_DIRECT_PTR is specified then the corresponding
+<block> may be located anywhere and not necessarily contiguous with
+the previous/next blocks. In that case it is minimally u32-aligned.
+If CRAMFS_BLK_FLAG_UNCOMPRESSED is also specified then the size is always
+blksize except for the last block which is limited by the file length.
+If CRAMFS_BLK_FLAG_DIRECT_PTR is set and CRAMFS_BLK_FLAG_UNCOMPRESSED
+is not set then the first 2 bytes of the block contains the size of the
+remaining block data as this cannot be determined from the placement of
+logically adjacent blocks.
+
 
 Holes
 -----
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index 7919967488cb..9a2ab419ba62 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -15,10 +15,15 @@
 
 #include <linux/module.h>
 #include <linux/fs.h>
+#include <linux/file.h>
 #include <linux/pagemap.h>
+#include <linux/pfn_t.h>
+#include <linux/ramfs.h>
 #include <linux/init.h>
 #include <linux/string.h>
 #include <linux/blkdev.h>
+#include <linux/mtd/mtd.h>
+#include <linux/mtd/super.h>
 #include <linux/slab.h>
 #include <linux/vfs.h>
 #include <linux/mutex.h>
@@ -36,6 +41,9 @@ struct cramfs_sb_info {
 	unsigned long blocks;
 	unsigned long files;
 	unsigned long flags;
+	void *linear_virt_addr;
+	resource_size_t linear_phys_addr;
+	size_t mtd_point_size;
 };
 
 static inline struct cramfs_sb_info *CRAMFS_SB(struct super_block *sb)
@@ -46,6 +54,7 @@ static inline struct cramfs_sb_info *CRAMFS_SB(struct super_block *sb)
 static const struct super_operations cramfs_ops;
 static const struct inode_operations cramfs_dir_inode_operations;
 static const struct file_operations cramfs_directory_operations;
+static const struct file_operations cramfs_physmem_fops;
 static const struct address_space_operations cramfs_aops;
 
 static DEFINE_MUTEX(read_mutex);
@@ -93,6 +102,10 @@ static struct inode *get_cramfs_inode(struct super_block *sb,
 	case S_IFREG:
 		inode->i_fop = &generic_ro_fops;
 		inode->i_data.a_ops = &cramfs_aops;
+		if (IS_ENABLED(CONFIG_CRAMFS_MTD) &&
+		    CRAMFS_SB(sb)->flags & CRAMFS_FLAG_EXT_BLOCK_POINTERS &&
+		    CRAMFS_SB(sb)->linear_phys_addr)
+			inode->i_fop = &cramfs_physmem_fops;
 		break;
 	case S_IFDIR:
 		inode->i_op = &cramfs_dir_inode_operations;
@@ -140,6 +153,9 @@ static struct inode *get_cramfs_inode(struct super_block *sb,
  * BLKS_PER_BUF*PAGE_SIZE, so that the caller doesn't need to
  * worry about end-of-buffer issues even when decompressing a full
  * page cache.
+ *
+ * Note: This is all optimized away at compile time when
+ *       CONFIG_CRAMFS_BLOCKDEV=n.
  */
 #define READ_BUFFERS (2)
 /* NEXT_BUFFER(): Loop over [0..(READ_BUFFERS-1)]. */
@@ -160,10 +176,10 @@ static struct super_block *buffer_dev[READ_BUFFERS];
 static int next_buffer;
 
 /*
- * Returns a pointer to a buffer containing at least LEN bytes of
- * filesystem starting at byte offset OFFSET into the filesystem.
+ * Populate our block cache and return a pointer to it.
  */
-static void *cramfs_read(struct super_block *sb, unsigned int offset, unsigned int len)
+static void *cramfs_blkdev_read(struct super_block *sb, unsigned int offset,
+				unsigned int len)
 {
 	struct address_space *mapping = sb->s_bdev->bd_inode->i_mapping;
 	struct page *pages[BLKS_PER_BUF];
@@ -239,11 +255,250 @@ static void *cramfs_read(struct super_block *sb, unsigned int offset, unsigned i
 	return read_buffers[buffer] + offset;
 }
 
+/*
+ * Return a pointer to the linearly addressed cramfs image in memory.
+ */
+static void *cramfs_direct_read(struct super_block *sb, unsigned int offset,
+				unsigned int len)
+{
+	struct cramfs_sb_info *sbi = CRAMFS_SB(sb);
+
+	if (!len)
+		return NULL;
+	if (len > sbi->size || offset > sbi->size - len)
+		return page_address(ZERO_PAGE(0));
+	return sbi->linear_virt_addr + offset;
+}
+
+/*
+ * Returns a pointer to a buffer containing at least LEN bytes of
+ * filesystem starting at byte offset OFFSET into the filesystem.
+ */
+static void *cramfs_read(struct super_block *sb, unsigned int offset,
+			 unsigned int len)
+{
+	struct cramfs_sb_info *sbi = CRAMFS_SB(sb);
+
+	if (IS_ENABLED(CONFIG_CRAMFS_MTD) && sbi->linear_virt_addr)
+		return cramfs_direct_read(sb, offset, len);
+	else if (IS_ENABLED(CONFIG_CRAMFS_BLOCKDEV))
+		return cramfs_blkdev_read(sb, offset, len);
+	else
+		return NULL;
+}
+
+/*
+ * For a mapping to be possible, we need a range of uncompressed and
+ * contiguous blocks. Return the offset for the first block and number of
+ * valid blocks for which that is true, or zero otherwise.
+ */
+static u32 cramfs_get_block_range(struct inode *inode, u32 pgoff, u32 *pages)
+{
+	struct cramfs_sb_info *sbi = CRAMFS_SB(inode->i_sb);
+	int i;
+	u32 *blockptrs, first_block_addr;
+
+	/*
+	 * We can dereference memory directly here as this code may be
+	 * reached only when there is a direct filesystem image mapping
+	 * available in memory.
+	 */
+	blockptrs = (u32 *)(sbi->linear_virt_addr + OFFSET(inode) + pgoff * 4);
+	first_block_addr = blockptrs[0] & ~CRAMFS_BLK_FLAGS;
+	i = 0;
+	do {
+		u32 block_off = i * (PAGE_SIZE >> CRAMFS_BLK_DIRECT_PTR_SHIFT);
+		u32 expect = (first_block_addr + block_off) |
+			     CRAMFS_BLK_FLAG_DIRECT_PTR |
+			     CRAMFS_BLK_FLAG_UNCOMPRESSED;
+		if (blockptrs[i] != expect) {
+			pr_debug("range: block %d/%d got %#x expects %#x\n",
+				 pgoff+i, pgoff + *pages - 1,
+				 blockptrs[i], expect);
+			if (i == 0)
+				return 0;
+			break;
+		}
+	} while (++i < *pages);
+
+	*pages = i;
+	return first_block_addr << CRAMFS_BLK_DIRECT_PTR_SHIFT;
+}
+
+#ifdef CONFIG_MMU
+
+/*
+ * Return true if the last page of a file in the filesystem image contains
+ * some other data that doesn't belong to that file. It is assumed that the
+ * last block is CRAMFS_BLK_FLAG_DIRECT_PTR | CRAMFS_BLK_FLAG_UNCOMPRESSED
+ * (verified by cramfs_get_block_range() and directly accessible in memory.
+ */
+static bool cramfs_last_page_is_shared(struct inode *inode)
+{
+	struct cramfs_sb_info *sbi = CRAMFS_SB(inode->i_sb);
+	u32 partial, last_page, blockaddr, *blockptrs;
+	char *tail_data;
+
+	partial = offset_in_page(inode->i_size);
+	if (!partial)
+		return false;
+	last_page = inode->i_size >> PAGE_SHIFT;
+	blockptrs = (u32 *)(sbi->linear_virt_addr + OFFSET(inode));
+	blockaddr = blockptrs[last_page] & ~CRAMFS_BLK_FLAGS;
+	blockaddr <<= CRAMFS_BLK_DIRECT_PTR_SHIFT;
+	tail_data = sbi->linear_virt_addr + blockaddr + partial;
+	return memchr_inv(tail_data, 0, PAGE_SIZE - partial) ? true : false;
+}
+
+static int cramfs_physmem_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	struct inode *inode = file_inode(file);
+	struct cramfs_sb_info *sbi = CRAMFS_SB(inode->i_sb);
+	unsigned int pages, max_pages, offset;
+	unsigned long address, pgoff = vma->vm_pgoff;
+	char *bailout_reason;
+	int ret;
+
+	ret = generic_file_readonly_mmap(file, vma);
+	if (ret)
+		return ret;
+
+	/*
+	 * Now try to pre-populate ptes for this vma with a direct
+	 * mapping avoiding memory allocation when possible.
+	 */
+
+	/* Could COW work here? */
+	bailout_reason = "vma is writable";
+	if (vma->vm_flags & VM_WRITE)
+		goto bailout;
+
+	max_pages = (inode->i_size + PAGE_SIZE - 1) >> PAGE_SHIFT;
+	bailout_reason = "beyond file limit";
+	if (pgoff >= max_pages)
+		goto bailout;
+	pages = min(vma_pages(vma), max_pages - pgoff);
+
+	offset = cramfs_get_block_range(inode, pgoff, &pages);
+	bailout_reason = "unsuitable block layout";
+	if (!offset)
+		goto bailout;
+	address = sbi->linear_phys_addr + offset;
+	bailout_reason = "data is not page aligned";
+	if (!PAGE_ALIGNED(address))
+		goto bailout;
+
+	/* Don't map the last page if it contains some other data */
+	if (pgoff + pages == max_pages && cramfs_last_page_is_shared(inode)) {
+		pr_debug("mmap: %s: last page is shared\n",
+			 file_dentry(file)->d_name.name);
+		pages--;
+	}
+
+	if (!pages) {
+		bailout_reason = "no suitable block remaining";
+		goto bailout;
+	}
+
+	if (pages == vma_pages(vma)) {
+		/*
+		 * The entire vma is mappable. remap_pfn_range() will
+		 * make it distinguishable from a non-direct mapping
+		 * in /proc/<pid>/maps by substituting the file offset
+		 * with the actual physical address.
+		 */
+		ret = remap_pfn_range(vma, vma->vm_start, address >> PAGE_SHIFT,
+				      pages * PAGE_SIZE, vma->vm_page_prot);
+	} else {
+		/*
+		 * Let's create a mixed map if we can't map it all.
+		 * The normal paging machinery will take care of the
+		 * unpopulated ptes via cramfs_readpage().
+		 */
+		int i;
+		vma->vm_flags |= VM_MIXEDMAP;
+		for (i = 0; i < pages && !ret; i++) {
+			unsigned long off = i * PAGE_SIZE;
+			pfn_t pfn = phys_to_pfn_t(address + off, PFN_DEV);
+			ret = vm_insert_mixed(vma, vma->vm_start + off, pfn);
+		}
+	}
+
+	if (!ret)
+		pr_debug("mapped %s[%lu] at 0x%08lx (%u/%lu pages) "
+			 "to vma 0x%08lx, page_prot 0x%llx\n",
+			 file_dentry(file)->d_name.name, pgoff,
+			 address, pages, vma_pages(vma), vma->vm_start,
+			 (unsigned long long)pgprot_val(vma->vm_page_prot));
+	return ret;
+
+bailout:
+	pr_debug("%s[%lu]: direct mmap impossible: %s\n",
+		 file_dentry(file)->d_name.name, pgoff, bailout_reason);
+	/* Didn't manage any direct map, but normal paging is still possible */
+	return 0;
+}
+
+#else /* CONFIG_MMU */
+
+static int cramfs_physmem_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	return vma->vm_flags & (VM_SHARED | VM_MAYSHARE) ? 0 : -ENOSYS;
+}
+
+static unsigned long cramfs_physmem_get_unmapped_area(struct file *file,
+			unsigned long addr, unsigned long len,
+			unsigned long pgoff, unsigned long flags)
+{
+	struct inode *inode = file_inode(file);
+	struct super_block *sb = inode->i_sb;
+	struct cramfs_sb_info *sbi = CRAMFS_SB(sb);
+	unsigned int pages, block_pages, max_pages, offset;
+
+	pages = (len + PAGE_SIZE - 1) >> PAGE_SHIFT;
+	max_pages = (inode->i_size + PAGE_SIZE - 1) >> PAGE_SHIFT;
+	if (pgoff >= max_pages || pages > max_pages - pgoff)
+		return -EINVAL;
+	block_pages = pages;
+	offset = cramfs_get_block_range(inode, pgoff, &block_pages);
+	if (!offset || block_pages != pages)
+		return -ENOSYS;
+	addr = sbi->linear_phys_addr + offset;
+	pr_debug("get_unmapped for %s ofs %#lx siz %lu at 0x%08lx\n",
+		 file_dentry(file)->d_name.name, pgoff*PAGE_SIZE, len, addr);
+	return addr;
+}
+
+static unsigned int cramfs_physmem_mmap_capabilities(struct file *file)
+{
+	return NOMMU_MAP_COPY | NOMMU_MAP_DIRECT |
+	       NOMMU_MAP_READ | NOMMU_MAP_EXEC;
+}
+
+#endif /* CONFIG_MMU */
+
+static const struct file_operations cramfs_physmem_fops = {
+	.llseek			= generic_file_llseek,
+	.read_iter		= generic_file_read_iter,
+	.splice_read		= generic_file_splice_read,
+	.mmap			= cramfs_physmem_mmap,
+#ifndef CONFIG_MMU
+	.get_unmapped_area	= cramfs_physmem_get_unmapped_area,
+	.mmap_capabilities	= cramfs_physmem_mmap_capabilities,
+#endif
+};
+
 static void cramfs_kill_sb(struct super_block *sb)
 {
 	struct cramfs_sb_info *sbi = CRAMFS_SB(sb);
 
-	kill_block_super(sb);
+	if (IS_ENABLED(CCONFIG_CRAMFS_MTD) && sb->s_mtd) {
+		if (sbi && sbi->mtd_point_size)
+			mtd_unpoint(sb->s_mtd, 0, sbi->mtd_point_size);
+		kill_mtd_super(sb);
+	} else if (IS_ENABLED(CONFIG_CRAMFS_BLOCKDEV) && sb->s_bdev) {
+		kill_block_super(sb);
+	}
 	kfree(sbi);
 }
 
@@ -254,34 +509,24 @@ static int cramfs_remount(struct super_block *sb, int *flags, char *data)
 	return 0;
 }
 
-static int cramfs_fill_super(struct super_block *sb, void *data, int silent)
+static int cramfs_read_super(struct super_block *sb,
+			     struct cramfs_super *super, int silent)
 {
-	int i;
-	struct cramfs_super super;
+	struct cramfs_sb_info *sbi = CRAMFS_SB(sb);
 	unsigned long root_offset;
-	struct cramfs_sb_info *sbi;
-	struct inode *root;
 
-	sb->s_flags |= MS_RDONLY;
-
-	sbi = kzalloc(sizeof(struct cramfs_sb_info), GFP_KERNEL);
-	if (!sbi)
-		return -ENOMEM;
-	sb->s_fs_info = sbi;
-
-	/* Invalidate the read buffers on mount: think disk change.. */
-	mutex_lock(&read_mutex);
-	for (i = 0; i < READ_BUFFERS; i++)
-		buffer_blocknr[i] = -1;
+	/* We don't know the real size yet */
+	sbi->size = PAGE_SIZE;
 
 	/* Read the first block and get the superblock from it */
-	memcpy(&super, cramfs_read(sb, 0, sizeof(super)), sizeof(super));
+	mutex_lock(&read_mutex);
+	memcpy(super, cramfs_read(sb, 0, sizeof(*super)), sizeof(*super));
 	mutex_unlock(&read_mutex);
 
 	/* Do sanity checks on the superblock */
-	if (super.magic != CRAMFS_MAGIC) {
+	if (super->magic != CRAMFS_MAGIC) {
 		/* check for wrong endianness */
-		if (super.magic == CRAMFS_MAGIC_WEND) {
+		if (super->magic == CRAMFS_MAGIC_WEND) {
 			if (!silent)
 				pr_err("wrong endianness\n");
 			return -EINVAL;
@@ -289,10 +534,12 @@ static int cramfs_fill_super(struct super_block *sb, void *data, int silent)
 
 		/* check at 512 byte offset */
 		mutex_lock(&read_mutex);
-		memcpy(&super, cramfs_read(sb, 512, sizeof(super)), sizeof(super));
+		memcpy(super,
+		       cramfs_read(sb, 512, sizeof(*super)),
+		       sizeof(*super));
 		mutex_unlock(&read_mutex);
-		if (super.magic != CRAMFS_MAGIC) {
-			if (super.magic == CRAMFS_MAGIC_WEND && !silent)
+		if (super->magic != CRAMFS_MAGIC) {
+			if (super->magic == CRAMFS_MAGIC_WEND && !silent)
 				pr_err("wrong endianness\n");
 			else if (!silent)
 				pr_err("wrong magic\n");
@@ -301,34 +548,34 @@ static int cramfs_fill_super(struct super_block *sb, void *data, int silent)
 	}
 
 	/* get feature flags first */
-	if (super.flags & ~CRAMFS_SUPPORTED_FLAGS) {
+	if (super->flags & ~CRAMFS_SUPPORTED_FLAGS) {
 		pr_err("unsupported filesystem features\n");
 		return -EINVAL;
 	}
 
 	/* Check that the root inode is in a sane state */
-	if (!S_ISDIR(super.root.mode)) {
+	if (!S_ISDIR(super->root.mode)) {
 		pr_err("root is not a directory\n");
 		return -EINVAL;
 	}
 	/* correct strange, hard-coded permissions of mkcramfs */
-	super.root.mode |= (S_IRUSR | S_IXUSR | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH);
+	super->root.mode |= 0555;
 
-	root_offset = super.root.offset << 2;
-	if (super.flags & CRAMFS_FLAG_FSID_VERSION_2) {
-		sbi->size = super.size;
-		sbi->blocks = super.fsid.blocks;
-		sbi->files = super.fsid.files;
+	root_offset = super->root.offset << 2;
+	if (super->flags & CRAMFS_FLAG_FSID_VERSION_2) {
+		sbi->size = super->size;
+		sbi->blocks = super->fsid.blocks;
+		sbi->files = super->fsid.files;
 	} else {
 		sbi->size = 1<<28;
 		sbi->blocks = 0;
 		sbi->files = 0;
 	}
-	sbi->magic = super.magic;
-	sbi->flags = super.flags;
+	sbi->magic = super->magic;
+	sbi->flags = super->flags;
 	if (root_offset == 0)
 		pr_info("empty filesystem");
-	else if (!(super.flags & CRAMFS_FLAG_SHIFTED_ROOT_OFFSET) &&
+	else if (!(super->flags & CRAMFS_FLAG_SHIFTED_ROOT_OFFSET) &&
 		 ((root_offset != sizeof(struct cramfs_super)) &&
 		  (root_offset != 512 + sizeof(struct cramfs_super))))
 	{
@@ -336,9 +583,18 @@ static int cramfs_fill_super(struct super_block *sb, void *data, int silent)
 		return -EINVAL;
 	}
 
+	return 0;
+}
+
+static int cramfs_finalize_super(struct super_block *sb,
+				 struct cramfs_inode *cramfs_root)
+{
+	struct inode *root;
+
 	/* Set it all up.. */
+	sb->s_flags |= MS_RDONLY;
 	sb->s_op = &cramfs_ops;
-	root = get_cramfs_inode(sb, &super.root, 0);
+	root = get_cramfs_inode(sb, cramfs_root, 0);
 	if (IS_ERR(root))
 		return PTR_ERR(root);
 	sb->s_root = d_make_root(root);
@@ -347,10 +603,79 @@ static int cramfs_fill_super(struct super_block *sb, void *data, int silent)
 	return 0;
 }
 
+static int cramfs_blkdev_fill_super(struct super_block *sb, void *data,
+				    int silent)
+{
+	struct cramfs_sb_info *sbi;
+	struct cramfs_super super;
+	int i, err;
+
+	sbi = kzalloc(sizeof(struct cramfs_sb_info), GFP_KERNEL);
+	if (!sbi)
+		return -ENOMEM;
+	sb->s_fs_info = sbi;
+
+	/* Invalidate the read buffers on mount: think disk change.. */
+	for (i = 0; i < READ_BUFFERS; i++)
+		buffer_blocknr[i] = -1;
+
+	err = cramfs_read_super(sb, &super, silent);
+	if (err)
+		return err;
+	return cramfs_finalize_super(sb, &super.root);
+}
+
+static int cramfs_mtd_fill_super(struct super_block *sb, void *data,
+				 int silent)
+{
+	struct cramfs_sb_info *sbi;
+	struct cramfs_super super;
+	int err;
+
+	sbi = kzalloc(sizeof(struct cramfs_sb_info), GFP_KERNEL);
+	if (!sbi)
+		return -ENOMEM;
+	sb->s_fs_info = sbi;
+
+	/* Map only one page for now.  Will remap it when fs size is known. */
+	err = mtd_point(sb->s_mtd, 0, PAGE_SIZE, &sbi->mtd_point_size,
+			&sbi->linear_virt_addr, &sbi->linear_phys_addr);
+	if (err || sbi->mtd_point_size != PAGE_SIZE) {
+		pr_err("unable to get direct memory access to mtd:%s\n",
+		       sb->s_mtd->name);
+		return err ? : -ENODATA;
+	}
+
+	pr_info("checking physical address %pap for linear cramfs image\n",
+		&sbi->linear_phys_addr);
+	err = cramfs_read_super(sb, &super, silent);
+	if (err)
+		return err;
+
+	/* Remap the whole filesystem now */
+	pr_info("linear cramfs image on mtd:%s appears to be %lu KB in size\n",
+		sb->s_mtd->name, sbi->size/1024);
+	mtd_unpoint(sb->s_mtd, 0, PAGE_SIZE);
+	err = mtd_point(sb->s_mtd, 0, sbi->size, &sbi->mtd_point_size,
+			&sbi->linear_virt_addr, &sbi->linear_phys_addr);
+	if (err || sbi->mtd_point_size != sbi->size) {
+		pr_err("unable to get direct memory access to mtd:%s\n",
+		       sb->s_mtd->name);
+		return err ? : -ENODATA;
+	}
+
+	return cramfs_finalize_super(sb, &super.root);
+}
+
 static int cramfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
 	struct super_block *sb = dentry->d_sb;
-	u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
+	u64 id = 0;
+
+	if (sb->s_bdev)
+		id = huge_encode_dev(sb->s_bdev->bd_dev);
+	else if (sb->s_dev)
+		id = huge_encode_dev(sb->s_dev);
 
 	buf->f_type = CRAMFS_MAGIC;
 	buf->f_bsize = PAGE_SIZE;
@@ -502,34 +827,86 @@ static int cramfs_readpage(struct file *file, struct page *page)
 
 	if (page->index < maxblock) {
 		struct super_block *sb = inode->i_sb;
-		u32 blkptr_offset = OFFSET(inode) + page->index*4;
-		u32 start_offset, compr_len;
+		u32 blkptr_offset = OFFSET(inode) + page->index * 4;
+		u32 block_ptr, block_start, block_len;
+		bool uncompressed, direct;
 
-		start_offset = OFFSET(inode) + maxblock*4;
 		mutex_lock(&read_mutex);
-		if (page->index)
-			start_offset = *(u32 *) cramfs_read(sb, blkptr_offset-4,
-				4);
-		compr_len = (*(u32 *) cramfs_read(sb, blkptr_offset, 4) -
-			start_offset);
-		mutex_unlock(&read_mutex);
+		block_ptr = *(u32 *) cramfs_read(sb, blkptr_offset, 4);
+		uncompressed = (block_ptr & CRAMFS_BLK_FLAG_UNCOMPRESSED);
+		direct = (block_ptr & CRAMFS_BLK_FLAG_DIRECT_PTR);
+		block_ptr &= ~CRAMFS_BLK_FLAGS;
+
+		if (direct) {
+			/*
+			 * The block pointer is an absolute start pointer,
+			 * shifted by 2 bits. The size is included in the
+			 * first 2 bytes of the data block when compressed,
+			 * or PAGE_SIZE otherwise.
+			 */
+			block_start = block_ptr << CRAMFS_BLK_DIRECT_PTR_SHIFT;
+			if (uncompressed) {
+				block_len = PAGE_SIZE;
+				/* if last block: cap to file length */
+				if (page->index == maxblock - 1)
+					block_len =
+						offset_in_page(inode->i_size);
+			} else {
+				block_len = *(u16 *)
+					cramfs_read(sb, block_start, 2);
+				block_start += 2;
+			}
+		} else {
+			/*
+			 * The block pointer indicates one past the end of
+			 * the current block (start of next block). If this
+			 * is the first block then it starts where the block
+			 * pointer table ends, otherwise its start comes
+			 * from the previous block's pointer.
+			 */
+			block_start = OFFSET(inode) + maxblock * 4;
+			if (page->index)
+				block_start = *(u32 *)
+					cramfs_read(sb, blkptr_offset - 4, 4);
+			/* Beware... previous ptr might be a direct ptr */
+			if (unlikely(block_start & CRAMFS_BLK_FLAG_DIRECT_PTR)) {
+				/* See comments on earlier code. */
+				u32 prev_start = block_start;
+			       block_start = prev_start & ~CRAMFS_BLK_FLAGS;
+			       block_start <<= CRAMFS_BLK_DIRECT_PTR_SHIFT;
+				if (prev_start & CRAMFS_BLK_FLAG_UNCOMPRESSED) {
+					block_start += PAGE_SIZE;
+				} else {
+					block_len = *(u16 *)
+						cramfs_read(sb, block_start, 2);
+					block_start += 2 + block_len;
+				}
+			}
+			block_start &= ~CRAMFS_BLK_FLAGS;
+			block_len = block_ptr - block_start;
+		}
 
-		if (compr_len == 0)
+		if (block_len == 0)
 			; /* hole */
-		else if (unlikely(compr_len > (PAGE_SIZE << 1))) {
-			pr_err("bad compressed blocksize %u\n",
-				compr_len);
+		else if (unlikely(block_len > 2*PAGE_SIZE ||
+				  (uncompressed && block_len > PAGE_SIZE))) {
+			mutex_unlock(&read_mutex);
+			pr_err("bad data blocksize %u\n", block_len);
 			goto err;
+		} else if (uncompressed) {
+			memcpy(pgdata,
+			       cramfs_read(sb, block_start, block_len),
+			       block_len);
+			bytes_filled = block_len;
 		} else {
-			mutex_lock(&read_mutex);
 			bytes_filled = cramfs_uncompress_block(pgdata,
 				 PAGE_SIZE,
-				 cramfs_read(sb, start_offset, compr_len),
-				 compr_len);
-			mutex_unlock(&read_mutex);
-			if (unlikely(bytes_filled < 0))
-				goto err;
+				 cramfs_read(sb, block_start, block_len),
+				 block_len);
 		}
+		mutex_unlock(&read_mutex);
+		if (unlikely(bytes_filled < 0))
+			goto err;
 	}
 
 	memset(pgdata + bytes_filled, 0, PAGE_SIZE - bytes_filled);
@@ -573,10 +950,22 @@ static const struct super_operations cramfs_ops = {
 	.statfs		= cramfs_statfs,
 };
 
-static struct dentry *cramfs_mount(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static struct dentry *cramfs_mount(struct file_system_type *fs_type, int flags,
+				   const char *dev_name, void *data)
 {
-	return mount_bdev(fs_type, flags, dev_name, data, cramfs_fill_super);
+	struct dentry *ret = ERR_PTR(-ENOPROTOOPT);
+
+	if (IS_ENABLED(CONFIG_CRAMFS_MTD)) {
+		ret = mount_mtd(fs_type, flags, dev_name, data,
+				cramfs_mtd_fill_super);
+		if (!IS_ERR(ret))
+			return ret;
+	}
+	if (IS_ENABLED(CONFIG_CRAMFS_BLOCKDEV)) {
+		ret = mount_bdev(fs_type, flags, dev_name, data,
+				 cramfs_blkdev_fill_super);
+	}
+	return ret;
 }
 
 static struct file_system_type cramfs_fs_type = {
diff --git a/fs/dax.c b/fs/dax.c
index f3a44a7c14b3..95981591977a 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -526,13 +526,13 @@ static int copy_user_dax(struct block_device *bdev, struct dax_device *dax_dev,
 static void *dax_insert_mapping_entry(struct address_space *mapping,
 				      struct vm_fault *vmf,
 				      void *entry, sector_t sector,
-				      unsigned long flags)
+				      unsigned long flags, bool dirty)
 {
 	struct radix_tree_root *page_tree = &mapping->page_tree;
 	void *new_entry;
 	pgoff_t index = vmf->pgoff;
 
-	if (vmf->flags & FAULT_FLAG_WRITE)
+	if (dirty)
 		__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
 
 	if (dax_is_zero_entry(entry) && !(flags & RADIX_DAX_ZERO_PAGE)) {
@@ -565,11 +565,11 @@ static void *dax_insert_mapping_entry(struct address_space *mapping,
 		ret = __radix_tree_lookup(page_tree, index, &node, &slot);
 		WARN_ON_ONCE(ret != entry);
 		__radix_tree_replace(page_tree, node, slot,
-				     new_entry, NULL, NULL);
+				     new_entry, NULL);
 		entry = new_entry;
 	}
 
-	if (vmf->flags & FAULT_FLAG_WRITE)
+	if (dirty)
 		radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY);
 
 	spin_unlock_irq(&mapping->tree_lock);
@@ -614,6 +614,13 @@ static void dax_mapping_entry_mkclean(struct address_space *mapping,
 		if (follow_pte_pmd(vma->vm_mm, address, &start, &end, &ptep, &pmdp, &ptl))
 			continue;
 
+		/*
+		 * No need to call mmu_notifier_invalidate_range() as we are
+		 * downgrading page table protection not changing it to point
+		 * to a new page.
+		 *
+		 * See Documentation/vm/mmu_notifier.txt
+		 */
 		if (pmdp) {
 #ifdef CONFIG_FS_DAX_PMD
 			pmd_t pmd;
@@ -628,7 +635,6 @@ static void dax_mapping_entry_mkclean(struct address_space *mapping,
 			pmd = pmd_wrprotect(pmd);
 			pmd = pmd_mkclean(pmd);
 			set_pmd_at(vma->vm_mm, address, pmdp, pmd);
-			mmu_notifier_invalidate_range(vma->vm_mm, start, end);
 unlock_pmd:
 			spin_unlock(ptl);
 #endif
@@ -643,7 +649,6 @@ unlock_pmd:
 			pte = pte_wrprotect(pte);
 			pte = pte_mkclean(pte);
 			set_pte_at(vma->vm_mm, address, ptep, pte);
-			mmu_notifier_invalidate_range(vma->vm_mm, start, end);
 unlock_pte:
 			pte_unmap_unlock(ptep, ptl);
 		}
@@ -789,7 +794,7 @@ int dax_writeback_mapping_range(struct address_space *mapping,
 
 	tag_pages_for_writeback(mapping, start_index, end_index);
 
-	pagevec_init(&pvec, 0);
+	pagevec_init(&pvec);
 	while (!done) {
 		pvec.nr = find_get_entries_tag(mapping, start_index,
 				PAGECACHE_TAG_TOWRITE, PAGEVEC_SIZE,
@@ -820,38 +825,42 @@ out:
 }
 EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);
 
-static int dax_insert_mapping(struct address_space *mapping,
-		struct block_device *bdev, struct dax_device *dax_dev,
-		sector_t sector, size_t size, void *entry,
-		struct vm_area_struct *vma, struct vm_fault *vmf)
+static sector_t dax_iomap_sector(struct iomap *iomap, loff_t pos)
 {
-	unsigned long vaddr = vmf->address;
-	void *ret, *kaddr;
+	return (iomap->addr + (pos & PAGE_MASK) - iomap->offset) >> 9;
+}
+
+static int dax_iomap_pfn(struct iomap *iomap, loff_t pos, size_t size,
+			 pfn_t *pfnp)
+{
+	const sector_t sector = dax_iomap_sector(iomap, pos);
 	pgoff_t pgoff;
+	void *kaddr;
 	int id, rc;
-	pfn_t pfn;
+	long length;
 
-	rc = bdev_dax_pgoff(bdev, sector, size, &pgoff);
+	rc = bdev_dax_pgoff(iomap->bdev, sector, size, &pgoff);
 	if (rc)
 		return rc;
-
 	id = dax_read_lock();
-	rc = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), &kaddr, &pfn);
-	if (rc < 0) {
-		dax_read_unlock(id);
-		return rc;
+	length = dax_direct_access(iomap->dax_dev, pgoff, PHYS_PFN(size),
+				   &kaddr, pfnp);
+	if (length < 0) {
+		rc = length;
+		goto out;
 	}
+	rc = -EINVAL;
+	if (PFN_PHYS(length) < size)
+		goto out;
+	if (pfn_t_to_pfn(*pfnp) & (PHYS_PFN(size)-1))
+		goto out;
+	/* For larger pages we need devmap */
+	if (length > 1 && !pfn_t_devmap(*pfnp))
+		goto out;
+	rc = 0;
+out:
 	dax_read_unlock(id);
-
-	ret = dax_insert_mapping_entry(mapping, vmf, entry, sector, 0);
-	if (IS_ERR(ret))
-		return PTR_ERR(ret);
-
-	trace_dax_insert_mapping(mapping->host, vmf, ret);
-	if (vmf->flags & FAULT_FLAG_WRITE)
-		return vm_insert_mixed_mkwrite(vma, vaddr, pfn);
-	else
-		return vm_insert_mixed(vma, vaddr, pfn);
+	return rc;
 }
 
 /*
@@ -877,7 +886,7 @@ static int dax_load_hole(struct address_space *mapping, void *entry,
 	}
 
 	entry2 = dax_insert_mapping_entry(mapping, vmf, entry, 0,
-			RADIX_DAX_ZERO_PAGE);
+			RADIX_DAX_ZERO_PAGE, false);
 	if (IS_ERR(entry2)) {
 		ret = VM_FAULT_SIGBUS;
 		goto out;
@@ -936,11 +945,6 @@ int __dax_zero_page_range(struct block_device *bdev,
 }
 EXPORT_SYMBOL_GPL(__dax_zero_page_range);
 
-static sector_t dax_iomap_sector(struct iomap *iomap, loff_t pos)
-{
-	return (iomap->addr + (pos & PAGE_MASK) - iomap->offset) >> 9;
-}
-
 static loff_t
 dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
 		struct iomap *iomap)
@@ -1080,19 +1084,33 @@ static int dax_fault_return(int error)
 	return VM_FAULT_SIGBUS;
 }
 
-static int dax_iomap_pte_fault(struct vm_fault *vmf,
+/*
+ * MAP_SYNC on a dax mapping guarantees dirty metadata is
+ * flushed on write-faults (non-cow), but not read-faults.
+ */
+static bool dax_fault_is_synchronous(unsigned long flags,
+		struct vm_area_struct *vma, struct iomap *iomap)
+{
+	return (flags & IOMAP_WRITE) && (vma->vm_flags & VM_SYNC)
+		&& (iomap->flags & IOMAP_F_DIRTY);
+}
+
+static int dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
 			       const struct iomap_ops *ops)
 {
-	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
+	struct vm_area_struct *vma = vmf->vma;
+	struct address_space *mapping = vma->vm_file->f_mapping;
 	struct inode *inode = mapping->host;
 	unsigned long vaddr = vmf->address;
 	loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT;
-	sector_t sector;
 	struct iomap iomap = { 0 };
 	unsigned flags = IOMAP_FAULT;
 	int error, major = 0;
+	bool write = vmf->flags & FAULT_FLAG_WRITE;
+	bool sync;
 	int vmf_ret = 0;
 	void *entry;
+	pfn_t pfn;
 
 	trace_dax_pte_fault(inode, vmf, vmf_ret);
 	/*
@@ -1105,7 +1123,7 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
 		goto out;
 	}
 
-	if ((vmf->flags & FAULT_FLAG_WRITE) && !vmf->cow_page)
+	if (write && !vmf->cow_page)
 		flags |= IOMAP_WRITE;
 
 	entry = grab_mapping_entry(mapping, vmf->pgoff, 0);
@@ -1140,9 +1158,9 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
 		goto error_finish_iomap;
 	}
 
-	sector = dax_iomap_sector(&iomap, pos);
-
 	if (vmf->cow_page) {
+		sector_t sector = dax_iomap_sector(&iomap, pos);
+
 		switch (iomap.type) {
 		case IOMAP_HOLE:
 		case IOMAP_UNWRITTEN:
@@ -1168,22 +1186,55 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
 		goto finish_iomap;
 	}
 
+	sync = dax_fault_is_synchronous(flags, vma, &iomap);
+
 	switch (iomap.type) {
 	case IOMAP_MAPPED:
 		if (iomap.flags & IOMAP_F_NEW) {
 			count_vm_event(PGMAJFAULT);
-			count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT);
+			count_memcg_event_mm(vma->vm_mm, PGMAJFAULT);
 			major = VM_FAULT_MAJOR;
 		}
-		error = dax_insert_mapping(mapping, iomap.bdev, iomap.dax_dev,
-				sector, PAGE_SIZE, entry, vmf->vma, vmf);
+		error = dax_iomap_pfn(&iomap, pos, PAGE_SIZE, &pfn);
+		if (error < 0)
+			goto error_finish_iomap;
+
+		entry = dax_insert_mapping_entry(mapping, vmf, entry,
+						 dax_iomap_sector(&iomap, pos),
+						 0, write && !sync);
+		if (IS_ERR(entry)) {
+			error = PTR_ERR(entry);
+			goto error_finish_iomap;
+		}
+
+		/*
+		 * If we are doing synchronous page fault and inode needs fsync,
+		 * we can insert PTE into page tables only after that happens.
+		 * Skip insertion for now and return the pfn so that caller can
+		 * insert it after fsync is done.
+		 */
+		if (sync) {
+			if (WARN_ON_ONCE(!pfnp)) {
+				error = -EIO;
+				goto error_finish_iomap;
+			}
+			*pfnp = pfn;
+			vmf_ret = VM_FAULT_NEEDDSYNC | major;
+			goto finish_iomap;
+		}
+		trace_dax_insert_mapping(inode, vmf, entry);
+		if (write)
+			error = vm_insert_mixed_mkwrite(vma, vaddr, pfn);
+		else
+			error = vm_insert_mixed(vma, vaddr, pfn);
+
 		/* -EBUSY is fine, somebody else faulted on the same PTE */
 		if (error == -EBUSY)
 			error = 0;
 		break;
 	case IOMAP_UNWRITTEN:
 	case IOMAP_HOLE:
-		if (!(vmf->flags & FAULT_FLAG_WRITE)) {
+		if (!write) {
 			vmf_ret = dax_load_hole(mapping, entry, vmf);
 			goto finish_iomap;
 		}
@@ -1218,53 +1269,11 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
 }
 
 #ifdef CONFIG_FS_DAX_PMD
-static int dax_pmd_insert_mapping(struct vm_fault *vmf, struct iomap *iomap,
-		loff_t pos, void *entry)
-{
-	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
-	const sector_t sector = dax_iomap_sector(iomap, pos);
-	struct dax_device *dax_dev = iomap->dax_dev;
-	struct block_device *bdev = iomap->bdev;
-	struct inode *inode = mapping->host;
-	const size_t size = PMD_SIZE;
-	void *ret = NULL, *kaddr;
-	long length = 0;
-	pgoff_t pgoff;
-	pfn_t pfn = {};
-	int id;
-
-	if (bdev_dax_pgoff(bdev, sector, size, &pgoff) != 0)
-		goto fallback;
-
-	id = dax_read_lock();
-	length = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), &kaddr, &pfn);
-	if (length < 0)
-		goto unlock_fallback;
-	length = PFN_PHYS(length);
-
-	if (length < size)
-		goto unlock_fallback;
-	if (pfn_t_to_pfn(pfn) & PG_PMD_COLOUR)
-		goto unlock_fallback;
-	if (!pfn_t_devmap(pfn))
-		goto unlock_fallback;
-	dax_read_unlock(id);
-
-	ret = dax_insert_mapping_entry(mapping, vmf, entry, sector,
-			RADIX_DAX_PMD);
-	if (IS_ERR(ret))
-		goto fallback;
-
-	trace_dax_pmd_insert_mapping(inode, vmf, length, pfn, ret);
-	return vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd,
-			pfn, vmf->flags & FAULT_FLAG_WRITE);
-
-unlock_fallback:
-	dax_read_unlock(id);
-fallback:
-	trace_dax_pmd_insert_mapping_fallback(inode, vmf, length, pfn, ret);
-	return VM_FAULT_FALLBACK;
-}
+/*
+ * The 'colour' (ie low bits) within a PMD of a page offset.  This comes up
+ * more often than one might expect in the below functions.
+ */
+#define PG_PMD_COLOUR	((PMD_SIZE >> PAGE_SHIFT) - 1)
 
 static int dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap,
 		void *entry)
@@ -1283,7 +1292,7 @@ static int dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap,
 		goto fallback;
 
 	ret = dax_insert_mapping_entry(mapping, vmf, entry, 0,
-			RADIX_DAX_PMD | RADIX_DAX_ZERO_PAGE);
+			RADIX_DAX_PMD | RADIX_DAX_ZERO_PAGE, false);
 	if (IS_ERR(ret))
 		goto fallback;
 
@@ -1305,13 +1314,14 @@ fallback:
 	return VM_FAULT_FALLBACK;
 }
 
-static int dax_iomap_pmd_fault(struct vm_fault *vmf,
+static int dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
 			       const struct iomap_ops *ops)
 {
 	struct vm_area_struct *vma = vmf->vma;
 	struct address_space *mapping = vma->vm_file->f_mapping;
 	unsigned long pmd_addr = vmf->address & PMD_MASK;
 	bool write = vmf->flags & FAULT_FLAG_WRITE;
+	bool sync;
 	unsigned int iomap_flags = (write ? IOMAP_WRITE : 0) | IOMAP_FAULT;
 	struct inode *inode = mapping->host;
 	int result = VM_FAULT_FALLBACK;
@@ -1320,6 +1330,7 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf,
 	void *entry;
 	loff_t pos;
 	int error;
+	pfn_t pfn;
 
 	/*
 	 * Check whether offset isn't beyond end of file now. Caller is
@@ -1327,7 +1338,7 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf,
 	 * this is a reliable test.
 	 */
 	pgoff = linear_page_index(vma, pmd_addr);
-	max_pgoff = (i_size_read(inode) - 1) >> PAGE_SHIFT;
+	max_pgoff = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
 
 	trace_dax_pmd_fault(inode, vmf, max_pgoff, 0);
 
@@ -1351,13 +1362,13 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf,
 	if ((pmd_addr + PMD_SIZE) > vma->vm_end)
 		goto fallback;
 
-	if (pgoff > max_pgoff) {
+	if (pgoff >= max_pgoff) {
 		result = VM_FAULT_SIGBUS;
 		goto out;
 	}
 
 	/* If the PMD would extend beyond the file size */
-	if ((pgoff | PG_PMD_COLOUR) > max_pgoff)
+	if ((pgoff | PG_PMD_COLOUR) >= max_pgoff)
 		goto fallback;
 
 	/*
@@ -1395,9 +1406,37 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf,
 	if (iomap.offset + iomap.length < pos + PMD_SIZE)
 		goto finish_iomap;
 
+	sync = dax_fault_is_synchronous(iomap_flags, vma, &iomap);
+
 	switch (iomap.type) {
 	case IOMAP_MAPPED:
-		result = dax_pmd_insert_mapping(vmf, &iomap, pos, entry);
+		error = dax_iomap_pfn(&iomap, pos, PMD_SIZE, &pfn);
+		if (error < 0)
+			goto finish_iomap;
+
+		entry = dax_insert_mapping_entry(mapping, vmf, entry,
+						dax_iomap_sector(&iomap, pos),
+						RADIX_DAX_PMD, write && !sync);
+		if (IS_ERR(entry))
+			goto finish_iomap;
+
+		/*
+		 * If we are doing synchronous page fault and inode needs fsync,
+		 * we can insert PMD into page tables only after that happens.
+		 * Skip insertion for now and return the pfn so that caller can
+		 * insert it after fsync is done.
+		 */
+		if (sync) {
+			if (WARN_ON_ONCE(!pfnp))
+				goto finish_iomap;
+			*pfnp = pfn;
+			result = VM_FAULT_NEEDDSYNC;
+			goto finish_iomap;
+		}
+
+		trace_dax_pmd_insert_mapping(inode, vmf, PMD_SIZE, pfn, entry);
+		result = vmf_insert_pfn_pmd(vma, vmf->address, vmf->pmd, pfn,
+					    write);
 		break;
 	case IOMAP_UNWRITTEN:
 	case IOMAP_HOLE:
@@ -1437,7 +1476,7 @@ out:
 	return result;
 }
 #else
-static int dax_iomap_pmd_fault(struct vm_fault *vmf,
+static int dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
 			       const struct iomap_ops *ops)
 {
 	return VM_FAULT_FALLBACK;
@@ -1447,7 +1486,9 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf,
 /**
  * dax_iomap_fault - handle a page fault on a DAX file
  * @vmf: The description of the fault
- * @ops: iomap ops passed from the file system
+ * @pe_size: Size of the page to fault in
+ * @pfnp: PFN to insert for synchronous faults if fsync is required
+ * @ops: Iomap ops passed from the file system
  *
  * When a page fault occurs, filesystems may call this helper in
  * their fault handler for DAX files. dax_iomap_fault() assumes the caller
@@ -1455,15 +1496,98 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf,
  * successfully.
  */
 int dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size,
-		    const struct iomap_ops *ops)
+		    pfn_t *pfnp, const struct iomap_ops *ops)
 {
 	switch (pe_size) {
 	case PE_SIZE_PTE:
-		return dax_iomap_pte_fault(vmf, ops);
+		return dax_iomap_pte_fault(vmf, pfnp, ops);
 	case PE_SIZE_PMD:
-		return dax_iomap_pmd_fault(vmf, ops);
+		return dax_iomap_pmd_fault(vmf, pfnp, ops);
 	default:
 		return VM_FAULT_FALLBACK;
 	}
 }
 EXPORT_SYMBOL_GPL(dax_iomap_fault);
+
+/**
+ * dax_insert_pfn_mkwrite - insert PTE or PMD entry into page tables
+ * @vmf: The description of the fault
+ * @pe_size: Size of entry to be inserted
+ * @pfn: PFN to insert
+ *
+ * This function inserts writeable PTE or PMD entry into page tables for mmaped
+ * DAX file.  It takes care of marking corresponding radix tree entry as dirty
+ * as well.
+ */
+static int dax_insert_pfn_mkwrite(struct vm_fault *vmf,
+				  enum page_entry_size pe_size,
+				  pfn_t pfn)
+{
+	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
+	void *entry, **slot;
+	pgoff_t index = vmf->pgoff;
+	int vmf_ret, error;
+
+	spin_lock_irq(&mapping->tree_lock);
+	entry = get_unlocked_mapping_entry(mapping, index, &slot);
+	/* Did we race with someone splitting entry or so? */
+	if (!entry ||
+	    (pe_size == PE_SIZE_PTE && !dax_is_pte_entry(entry)) ||
+	    (pe_size == PE_SIZE_PMD && !dax_is_pmd_entry(entry))) {
+		put_unlocked_mapping_entry(mapping, index, entry);
+		spin_unlock_irq(&mapping->tree_lock);
+		trace_dax_insert_pfn_mkwrite_no_entry(mapping->host, vmf,
+						      VM_FAULT_NOPAGE);
+		return VM_FAULT_NOPAGE;
+	}
+	radix_tree_tag_set(&mapping->page_tree, index, PAGECACHE_TAG_DIRTY);
+	entry = lock_slot(mapping, slot);
+	spin_unlock_irq(&mapping->tree_lock);
+	switch (pe_size) {
+	case PE_SIZE_PTE:
+		error = vm_insert_mixed_mkwrite(vmf->vma, vmf->address, pfn);
+		vmf_ret = dax_fault_return(error);
+		break;
+#ifdef CONFIG_FS_DAX_PMD
+	case PE_SIZE_PMD:
+		vmf_ret = vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd,
+			pfn, true);
+		break;
+#endif
+	default:
+		vmf_ret = VM_FAULT_FALLBACK;
+	}
+	put_locked_mapping_entry(mapping, index);
+	trace_dax_insert_pfn_mkwrite(mapping->host, vmf, vmf_ret);
+	return vmf_ret;
+}
+
+/**
+ * dax_finish_sync_fault - finish synchronous page fault
+ * @vmf: The description of the fault
+ * @pe_size: Size of entry to be inserted
+ * @pfn: PFN to insert
+ *
+ * This function ensures that the file range touched by the page fault is
+ * stored persistently on the media and handles inserting of appropriate page
+ * table entry.
+ */
+int dax_finish_sync_fault(struct vm_fault *vmf, enum page_entry_size pe_size,
+			  pfn_t pfn)
+{
+	int err;
+	loff_t start = ((loff_t)vmf->pgoff) << PAGE_SHIFT;
+	size_t len = 0;
+
+	if (pe_size == PE_SIZE_PTE)
+		len = PAGE_SIZE;
+	else if (pe_size == PE_SIZE_PMD)
+		len = PMD_SIZE;
+	else
+		WARN_ON_ONCE(1);
+	err = vfs_fsync_range(vmf->vma->vm_file, start, start + len - 1, 1);
+	if (err)
+		return VM_FAULT_SIGBUS;
+	return dax_insert_pfn_mkwrite(vmf, pe_size, pfn);
+}
+EXPORT_SYMBOL_GPL(dax_finish_sync_fault);
diff --git a/fs/dcache.c b/fs/dcache.c
index bcc9f6981569..5c7df1df81ff 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -2705,8 +2705,6 @@ static void swap_names(struct dentry *dentry, struct dentry *target)
 			 */
 			unsigned int i;
 			BUILD_BUG_ON(!IS_ALIGNED(DNAME_INLINE_LEN, sizeof(long)));
-			kmemcheck_mark_initialized(dentry->d_iname, DNAME_INLINE_LEN);
-			kmemcheck_mark_initialized(target->d_iname, DNAME_INLINE_LEN);
 			for (i = 0; i < DNAME_INLINE_LEN / sizeof(long); i++) {
 				swap(((long *) &dentry->d_iname)[i],
 				     ((long *) &target->d_iname)[i]);
diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index 6dabc4a10396..cd12e6576b48 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -1,16 +1,12 @@
+// SPDX-License-Identifier: GPL-2.0
 /*
  *  file.c - part of debugfs, a tiny little debug file system
  *
  *  Copyright (C) 2004 Greg Kroah-Hartman <greg@kroah.com>
  *  Copyright (C) 2004 IBM Inc.
  *
- *	This program is free software; you can redistribute it and/or
- *	modify it under the terms of the GNU General Public License version
- *	2 as published by the Free Software Foundation.
- *
  *  debugfs is for people to use instead of /proc or /sys.
  *  See Documentation/filesystems/ for more details.
- *
  */
 
 #include <linux/module.h>
@@ -22,7 +18,6 @@
 #include <linux/slab.h>
 #include <linux/atomic.h>
 #include <linux/device.h>
-#include <linux/srcu.h>
 #include <asm/poll.h>
 
 #include "internal.h"
@@ -48,66 +43,108 @@ const struct file_operations debugfs_noop_file_operations = {
 	.llseek =	noop_llseek,
 };
 
+#define F_DENTRY(filp) ((filp)->f_path.dentry)
+
+const struct file_operations *debugfs_real_fops(const struct file *filp)
+{
+	struct debugfs_fsdata *fsd = F_DENTRY(filp)->d_fsdata;
+
+	if ((unsigned long)fsd & DEBUGFS_FSDATA_IS_REAL_FOPS_BIT) {
+		/*
+		 * Urgh, we've been called w/o a protecting
+		 * debugfs_file_get().
+		 */
+		WARN_ON(1);
+		return NULL;
+	}
+
+	return fsd->real_fops;
+}
+EXPORT_SYMBOL_GPL(debugfs_real_fops);
+
 /**
- * debugfs_use_file_start - mark the beginning of file data access
+ * debugfs_file_get - mark the beginning of file data access
  * @dentry: the dentry object whose data is being accessed.
- * @srcu_idx: a pointer to some memory to store a SRCU index in.
  *
- * Up to a matching call to debugfs_use_file_finish(), any
- * successive call into the file removing functions debugfs_remove()
- * and debugfs_remove_recursive() will block. Since associated private
+ * Up to a matching call to debugfs_file_put(), any successive call
+ * into the file removing functions debugfs_remove() and
+ * debugfs_remove_recursive() will block. Since associated private
  * file data may only get freed after a successful return of any of
  * the removal functions, you may safely access it after a successful
- * call to debugfs_use_file_start() without worrying about
- * lifetime issues.
+ * call to debugfs_file_get() without worrying about lifetime issues.
  *
  * If -%EIO is returned, the file has already been removed and thus,
  * it is not safe to access any of its data. If, on the other hand,
  * it is allowed to access the file data, zero is returned.
- *
- * Regardless of the return code, any call to
- * debugfs_use_file_start() must be followed by a matching call
- * to debugfs_use_file_finish().
  */
-int debugfs_use_file_start(const struct dentry *dentry, int *srcu_idx)
-	__acquires(&debugfs_srcu)
+int debugfs_file_get(struct dentry *dentry)
 {
-	*srcu_idx = srcu_read_lock(&debugfs_srcu);
-	barrier();
+	struct debugfs_fsdata *fsd;
+	void *d_fsd;
+
+	d_fsd = READ_ONCE(dentry->d_fsdata);
+	if (!((unsigned long)d_fsd & DEBUGFS_FSDATA_IS_REAL_FOPS_BIT)) {
+		fsd = d_fsd;
+	} else {
+		fsd = kmalloc(sizeof(*fsd), GFP_KERNEL);
+		if (!fsd)
+			return -ENOMEM;
+
+		fsd->real_fops = (void *)((unsigned long)d_fsd &
+					~DEBUGFS_FSDATA_IS_REAL_FOPS_BIT);
+		refcount_set(&fsd->active_users, 1);
+		init_completion(&fsd->active_users_drained);
+		if (cmpxchg(&dentry->d_fsdata, d_fsd, fsd) != d_fsd) {
+			kfree(fsd);
+			fsd = READ_ONCE(dentry->d_fsdata);
+		}
+	}
+
+	/*
+	 * In case of a successful cmpxchg() above, this check is
+	 * strictly necessary and must follow it, see the comment in
+	 * __debugfs_remove_file().
+	 * OTOH, if the cmpxchg() hasn't been executed or wasn't
+	 * successful, this serves the purpose of not starving
+	 * removers.
+	 */
 	if (d_unlinked(dentry))
 		return -EIO;
+
+	if (!refcount_inc_not_zero(&fsd->active_users))
+		return -EIO;
+
 	return 0;
 }
-EXPORT_SYMBOL_GPL(debugfs_use_file_start);
+EXPORT_SYMBOL_GPL(debugfs_file_get);
 
 /**
- * debugfs_use_file_finish - mark the end of file data access
- * @srcu_idx: the SRCU index "created" by a former call to
- *            debugfs_use_file_start().
+ * debugfs_file_put - mark the end of file data access
+ * @dentry: the dentry object formerly passed to
+ *          debugfs_file_get().
  *
  * Allow any ongoing concurrent call into debugfs_remove() or
  * debugfs_remove_recursive() blocked by a former call to
- * debugfs_use_file_start() to proceed and return to its caller.
+ * debugfs_file_get() to proceed and return to its caller.
  */
-void debugfs_use_file_finish(int srcu_idx) __releases(&debugfs_srcu)
+void debugfs_file_put(struct dentry *dentry)
 {
-	srcu_read_unlock(&debugfs_srcu, srcu_idx);
-}
-EXPORT_SYMBOL_GPL(debugfs_use_file_finish);
+	struct debugfs_fsdata *fsd = READ_ONCE(dentry->d_fsdata);
 
-#define F_DENTRY(filp) ((filp)->f_path.dentry)
+	if (refcount_dec_and_test(&fsd->active_users))
+		complete(&fsd->active_users_drained);
+}
+EXPORT_SYMBOL_GPL(debugfs_file_put);
 
 static int open_proxy_open(struct inode *inode, struct file *filp)
 {
-	const struct dentry *dentry = F_DENTRY(filp);
+	struct dentry *dentry = F_DENTRY(filp);
 	const struct file_operations *real_fops = NULL;
-	int srcu_idx, r;
+	int r;
 
-	r = debugfs_use_file_start(dentry, &srcu_idx);
-	if (r) {
-		r = -ENOENT;
-		goto out;
-	}
+	r = debugfs_file_get(dentry);
+	if (r)
+		return r == -EIO ? -ENOENT : r;
 
 	real_fops = debugfs_real_fops(filp);
 	real_fops = fops_get(real_fops);
@@ -124,7 +161,7 @@ static int open_proxy_open(struct inode *inode, struct file *filp)
 		r = real_fops->open(inode, filp);
 
 out:
-	debugfs_use_file_finish(srcu_idx);
+	debugfs_file_put(dentry);
 	return r;
 }
 
@@ -138,16 +175,16 @@ const struct file_operations debugfs_open_proxy_file_operations = {
 #define FULL_PROXY_FUNC(name, ret_type, filp, proto, args)		\
 static ret_type full_proxy_ ## name(proto)				\
 {									\
-	const struct dentry *dentry = F_DENTRY(filp);			\
-	const struct file_operations *real_fops =			\
-		debugfs_real_fops(filp);				\
-	int srcu_idx;							\
+	struct dentry *dentry = F_DENTRY(filp);			\
+	const struct file_operations *real_fops;			\
 	ret_type r;							\
 									\
-	r = debugfs_use_file_start(dentry, &srcu_idx);			\
-	if (likely(!r))						\
-		r = real_fops->name(args);				\
-	debugfs_use_file_finish(srcu_idx);				\
+	r = debugfs_file_get(dentry);					\
+	if (unlikely(r))						\
+		return r;						\
+	real_fops = debugfs_real_fops(filp);				\
+	r = real_fops->name(args);					\
+	debugfs_file_put(dentry);					\
 	return r;							\
 }
 
@@ -172,18 +209,16 @@ FULL_PROXY_FUNC(unlocked_ioctl, long, filp,
 static unsigned int full_proxy_poll(struct file *filp,
 				struct poll_table_struct *wait)
 {
-	const struct dentry *dentry = F_DENTRY(filp);
-	const struct file_operations *real_fops = debugfs_real_fops(filp);
-	int srcu_idx;
+	struct dentry *dentry = F_DENTRY(filp);
 	unsigned int r = 0;
+	const struct file_operations *real_fops;
 
-	if (debugfs_use_file_start(dentry, &srcu_idx)) {
-		debugfs_use_file_finish(srcu_idx);
+	if (debugfs_file_get(dentry))
 		return POLLHUP;
-	}
 
+	real_fops = debugfs_real_fops(filp);
 	r = real_fops->poll(filp, wait);
-	debugfs_use_file_finish(srcu_idx);
+	debugfs_file_put(dentry);
 	return r;
 }
 
@@ -227,16 +262,14 @@ static void __full_proxy_fops_init(struct file_operations *proxy_fops,
 
 static int full_proxy_open(struct inode *inode, struct file *filp)
 {
-	const struct dentry *dentry = F_DENTRY(filp);
+	struct dentry *dentry = F_DENTRY(filp);
 	const struct file_operations *real_fops = NULL;
 	struct file_operations *proxy_fops = NULL;
-	int srcu_idx, r;
+	int r;
 
-	r = debugfs_use_file_start(dentry, &srcu_idx);
-	if (r) {
-		r = -ENOENT;
-		goto out;
-	}
+	r = debugfs_file_get(dentry);
+	if (r)
+		return r == -EIO ? -ENOENT : r;
 
 	real_fops = debugfs_real_fops(filp);
 	real_fops = fops_get(real_fops);
@@ -274,7 +307,7 @@ free_proxy:
 	kfree(proxy_fops);
 	fops_put(real_fops);
 out:
-	debugfs_use_file_finish(srcu_idx);
+	debugfs_file_put(dentry);
 	return r;
 }
 
@@ -285,13 +318,14 @@ const struct file_operations debugfs_full_proxy_file_operations = {
 ssize_t debugfs_attr_read(struct file *file, char __user *buf,
 			size_t len, loff_t *ppos)
 {
+	struct dentry *dentry = F_DENTRY(file);
 	ssize_t ret;
-	int srcu_idx;
 
-	ret = debugfs_use_file_start(F_DENTRY(file), &srcu_idx);
-	if (likely(!ret))
-		ret = simple_attr_read(file, buf, len, ppos);
-	debugfs_use_file_finish(srcu_idx);
+	ret = debugfs_file_get(dentry);
+	if (unlikely(ret))
+		return ret;
+	ret = simple_attr_read(file, buf, len, ppos);
+	debugfs_file_put(dentry);
 	return ret;
 }
 EXPORT_SYMBOL_GPL(debugfs_attr_read);
@@ -299,13 +333,14 @@ EXPORT_SYMBOL_GPL(debugfs_attr_read);
 ssize_t debugfs_attr_write(struct file *file, const char __user *buf,
 			 size_t len, loff_t *ppos)
 {
+	struct dentry *dentry = F_DENTRY(file);
 	ssize_t ret;
-	int srcu_idx;
 
-	ret = debugfs_use_file_start(F_DENTRY(file), &srcu_idx);
-	if (likely(!ret))
-		ret = simple_attr_write(file, buf, len, ppos);
-	debugfs_use_file_finish(srcu_idx);
+	ret = debugfs_file_get(dentry);
+	if (unlikely(ret))
+		return ret;
+	ret = simple_attr_write(file, buf, len, ppos);
+	debugfs_file_put(dentry);
 	return ret;
 }
 EXPORT_SYMBOL_GPL(debugfs_attr_write);
@@ -739,14 +774,14 @@ ssize_t debugfs_read_file_bool(struct file *file, char __user *user_buf,
 {
 	char buf[3];
 	bool val;
-	int r, srcu_idx;
+	int r;
+	struct dentry *dentry = F_DENTRY(file);
 
-	r = debugfs_use_file_start(F_DENTRY(file), &srcu_idx);
-	if (likely(!r))
-		val = *(bool *)file->private_data;
-	debugfs_use_file_finish(srcu_idx);
-	if (r)
+	r = debugfs_file_get(dentry);
+	if (unlikely(r))
 		return r;
+	val = *(bool *)file->private_data;
+	debugfs_file_put(dentry);
 
 	if (val)
 		buf[0] = 'Y';
@@ -764,8 +799,9 @@ ssize_t debugfs_write_file_bool(struct file *file, const char __user *user_buf,
 	char buf[32];
 	size_t buf_size;
 	bool bv;
-	int r, srcu_idx;
+	int r;
 	bool *val = file->private_data;
+	struct dentry *dentry = F_DENTRY(file);
 
 	buf_size = min(count, (sizeof(buf)-1));
 	if (copy_from_user(buf, user_buf, buf_size))
@@ -773,12 +809,11 @@ ssize_t debugfs_write_file_bool(struct file *file, const char __user *user_buf,
 
 	buf[buf_size] = '\0';
 	if (strtobool(buf, &bv) == 0) {
-		r = debugfs_use_file_start(F_DENTRY(file), &srcu_idx);
-		if (likely(!r))
-			*val = bv;
-		debugfs_use_file_finish(srcu_idx);
-		if (r)
+		r = debugfs_file_get(dentry);
+		if (unlikely(r))
 			return r;
+		*val = bv;
+		debugfs_file_put(dentry);
 	}
 
 	return count;
@@ -840,14 +875,15 @@ static ssize_t read_file_blob(struct file *file, char __user *user_buf,
 			      size_t count, loff_t *ppos)
 {
 	struct debugfs_blob_wrapper *blob = file->private_data;
+	struct dentry *dentry = F_DENTRY(file);
 	ssize_t r;
-	int srcu_idx;
 
-	r = debugfs_use_file_start(F_DENTRY(file), &srcu_idx);
-	if (likely(!r))
-		r = simple_read_from_buffer(user_buf, count, ppos, blob->data,
-					blob->size);
-	debugfs_use_file_finish(srcu_idx);
+	r = debugfs_file_get(dentry);
+	if (unlikely(r))
+		return r;
+	r = simple_read_from_buffer(user_buf, count, ppos, blob->data,
+				blob->size);
+	debugfs_file_put(dentry);
 	return r;
 }
 
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index c59f015f386e..63a998c3f252 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -1,16 +1,12 @@
+// SPDX-License-Identifier: GPL-2.0
 /*
  *  inode.c - part of debugfs, a tiny little debug file system
  *
  *  Copyright (C) 2004 Greg Kroah-Hartman <greg@kroah.com>
  *  Copyright (C) 2004 IBM Inc.
  *
- *	This program is free software; you can redistribute it and/or
- *	modify it under the terms of the GNU General Public License version
- *	2 as published by the Free Software Foundation.
- *
  *  debugfs is for people to use instead of /proc or /sys.
  *  See ./Documentation/core-api/kernel-api.rst for more details.
- *
  */
 
 #include <linux/module.h>
@@ -27,14 +23,11 @@
 #include <linux/parser.h>
 #include <linux/magic.h>
 #include <linux/slab.h>
-#include <linux/srcu.h>
 
 #include "internal.h"
 
 #define DEBUGFS_DEFAULT_MODE	0700
 
-DEFINE_SRCU(debugfs_srcu);
-
 static struct vfsmount *debugfs_mount;
 static int debugfs_mount_count;
 static bool debugfs_registered;
@@ -185,6 +178,14 @@ static const struct super_operations debugfs_super_operations = {
 	.evict_inode	= debugfs_evict_inode,
 };
 
+static void debugfs_release_dentry(struct dentry *dentry)
+{
+	void *fsd = dentry->d_fsdata;
+
+	if (!((unsigned long)fsd & DEBUGFS_FSDATA_IS_REAL_FOPS_BIT))
+		kfree(dentry->d_fsdata);
+}
+
 static struct vfsmount *debugfs_automount(struct path *path)
 {
 	debugfs_automount_t f;
@@ -194,6 +195,7 @@ static struct vfsmount *debugfs_automount(struct path *path)
 
 static const struct dentry_operations debugfs_dops = {
 	.d_delete = always_delete_dentry,
+	.d_release = debugfs_release_dentry,
 	.d_automount = debugfs_automount,
 };
 
@@ -358,7 +360,8 @@ static struct dentry *__debugfs_create_file(const char *name, umode_t mode,
 	inode->i_private = data;
 
 	inode->i_fop = proxy_fops;
-	dentry->d_fsdata = (void *)real_fops;
+	dentry->d_fsdata = (void *)((unsigned long)real_fops |
+				DEBUGFS_FSDATA_IS_REAL_FOPS_BIT);
 
 	d_instantiate(dentry, inode);
 	fsnotify_create(d_inode(dentry->d_parent), dentry);
@@ -615,18 +618,43 @@ struct dentry *debugfs_create_symlink(const char *name, struct dentry *parent,
 }
 EXPORT_SYMBOL_GPL(debugfs_create_symlink);
 
+static void __debugfs_remove_file(struct dentry *dentry, struct dentry *parent)
+{
+	struct debugfs_fsdata *fsd;
+
+	simple_unlink(d_inode(parent), dentry);
+	d_delete(dentry);
+
+	/*
+	 * Paired with the closing smp_mb() implied by a successful
+	 * cmpxchg() in debugfs_file_get(): either
+	 * debugfs_file_get() must see a dead dentry or we must see a
+	 * debugfs_fsdata instance at ->d_fsdata here (or both).
+	 */
+	smp_mb();
+	fsd = READ_ONCE(dentry->d_fsdata);
+	if ((unsigned long)fsd & DEBUGFS_FSDATA_IS_REAL_FOPS_BIT)
+		return;
+	if (!refcount_dec_and_test(&fsd->active_users))
+		wait_for_completion(&fsd->active_users_drained);
+}
+
 static int __debugfs_remove(struct dentry *dentry, struct dentry *parent)
 {
 	int ret = 0;
 
 	if (simple_positive(dentry)) {
 		dget(dentry);
-		if (d_is_dir(dentry))
-			ret = simple_rmdir(d_inode(parent), dentry);
-		else
-			simple_unlink(d_inode(parent), dentry);
-		if (!ret)
-			d_delete(dentry);
+		if (!d_is_reg(dentry)) {
+			if (d_is_dir(dentry))
+				ret = simple_rmdir(d_inode(parent), dentry);
+			else
+				simple_unlink(d_inode(parent), dentry);
+			if (!ret)
+				d_delete(dentry);
+		} else {
+			__debugfs_remove_file(dentry, parent);
+		}
 		dput(dentry);
 	}
 	return ret;
@@ -660,8 +688,6 @@ void debugfs_remove(struct dentry *dentry)
 	inode_unlock(d_inode(parent));
 	if (!ret)
 		simple_release_fs(&debugfs_mount, &debugfs_mount_count);
-
-	synchronize_srcu(&debugfs_srcu);
 }
 EXPORT_SYMBOL_GPL(debugfs_remove);
 
@@ -735,8 +761,6 @@ void debugfs_remove_recursive(struct dentry *dentry)
 	if (!__debugfs_remove(child, parent))
 		simple_release_fs(&debugfs_mount, &debugfs_mount_count);
 	inode_unlock(d_inode(parent));
-
-	synchronize_srcu(&debugfs_srcu);
 }
 EXPORT_SYMBOL_GPL(debugfs_remove_recursive);
 
diff --git a/fs/debugfs/internal.h b/fs/debugfs/internal.h
index b3e8443a1f47..f0d73d86cc1a 100644
--- a/fs/debugfs/internal.h
+++ b/fs/debugfs/internal.h
@@ -1,12 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0
 /*
  *  internal.h - declarations internal to debugfs
  *
  *  Copyright (C) 2016 Nicolai Stange <nicstange@gmail.com>
- *
- *	This program is free software; you can redistribute it and/or
- *	modify it under the terms of the GNU General Public License version
- *	2 as published by the Free Software Foundation.
- *
  */
 
 #ifndef _DEBUGFS_INTERNAL_H_
@@ -19,4 +15,18 @@ extern const struct file_operations debugfs_noop_file_operations;
 extern const struct file_operations debugfs_open_proxy_file_operations;
 extern const struct file_operations debugfs_full_proxy_file_operations;
 
+struct debugfs_fsdata {
+	const struct file_operations *real_fops;
+	refcount_t active_users;
+	struct completion active_users_drained;
+};
+
+/*
+ * A dentry's ->d_fsdata either points to the real fops or to a
+ * dynamically allocated debugfs_fsdata instance.
+ * In order to distinguish between these two cases, a real fops
+ * pointer gets its lowest bit set.
+ */
+#define DEBUGFS_FSDATA_IS_REAL_FOPS_BIT BIT(0)
+
 #endif /* _DEBUGFS_INTERNAL_H_ */
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 98fe1325da9d..3aafb3343a65 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -497,7 +497,7 @@ static struct bio *dio_await_one(struct dio *dio)
 		dio->waiter = current;
 		spin_unlock_irqrestore(&dio->bio_lock, flags);
 		if (!(dio->iocb->ki_flags & IOCB_HIPRI) ||
-		    !blk_mq_poll(dio->bio_disk->queue, dio->bio_cookie))
+		    !blk_poll(dio->bio_disk->queue, dio->bio_cookie))
 			io_schedule();
 		/* wake up sets us TASK_RUNNING */
 		spin_lock_irqsave(&dio->bio_lock, flags);
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index 6b801186baa5..25aeaa7328ba 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -660,7 +660,7 @@ static struct ecryptfs_cache_info {
 	struct kmem_cache **cache;
 	const char *name;
 	size_t size;
-	unsigned long flags;
+	slab_flags_t flags;
 	void (*ctor)(void *obj);
 } ecryptfs_cache_infos[] = {
 	{
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 2fabd19cdeea..396a3c075fd4 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -2259,7 +2259,6 @@ COMPAT_SYSCALL_DEFINE6(epoll_pwait, int, epfd,
 			compat_size_t, sigsetsize)
 {
 	long err;
-	compat_sigset_t csigmask;
 	sigset_t ksigmask, sigsaved;
 
 	/*
@@ -2269,9 +2268,8 @@ COMPAT_SYSCALL_DEFINE6(epoll_pwait, int, epfd,
 	if (sigmask) {
 		if (sigsetsize != sizeof(compat_sigset_t))
 			return -EINVAL;
-		if (copy_from_user(&csigmask, sigmask, sizeof(csigmask)))
+		if (get_compat_sigset(&ksigmask, sigmask))
 			return -EFAULT;
-		sigset_from_compat(&ksigmask, &csigmask);
 		sigsaved = current->blocked;
 		set_current_blocked(&ksigmask);
 	}
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index c67b486488fd..2da67699dc33 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -100,7 +100,7 @@ static int ext2_dax_fault(struct vm_fault *vmf)
 	}
 	down_read(&ei->dax_sem);
 
-	ret = dax_iomap_fault(vmf, PE_SIZE_PTE, &ext2_iomap_ops);
+	ret = dax_iomap_fault(vmf, PE_SIZE_PTE, NULL, &ext2_iomap_ops);
 
 	up_read(&ei->dax_sem);
 	if (vmf->flags & FAULT_FLAG_WRITE)
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index ad204d2724ac..a0ae27b1bc66 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -28,6 +28,7 @@
 #include <linux/quotaops.h>
 #include <linux/pagevec.h>
 #include <linux/uio.h>
+#include <linux/mman.h>
 #include "ext4.h"
 #include "ext4_jbd2.h"
 #include "xattr.h"
@@ -297,6 +298,7 @@ static int ext4_dax_huge_fault(struct vm_fault *vmf,
 	 */
 	bool write = (vmf->flags & FAULT_FLAG_WRITE) &&
 		(vmf->vma->vm_flags & VM_SHARED);
+	pfn_t pfn;
 
 	if (write) {
 		sb_start_pagefault(sb);
@@ -304,16 +306,20 @@ static int ext4_dax_huge_fault(struct vm_fault *vmf,
 		down_read(&EXT4_I(inode)->i_mmap_sem);
 		handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE,
 					       EXT4_DATA_TRANS_BLOCKS(sb));
+		if (IS_ERR(handle)) {
+			up_read(&EXT4_I(inode)->i_mmap_sem);
+			sb_end_pagefault(sb);
+			return VM_FAULT_SIGBUS;
+		}
 	} else {
 		down_read(&EXT4_I(inode)->i_mmap_sem);
 	}
-	if (!IS_ERR(handle))
-		result = dax_iomap_fault(vmf, pe_size, &ext4_iomap_ops);
-	else
-		result = VM_FAULT_SIGBUS;
+	result = dax_iomap_fault(vmf, pe_size, &pfn, &ext4_iomap_ops);
 	if (write) {
-		if (!IS_ERR(handle))
-			ext4_journal_stop(handle);
+		ext4_journal_stop(handle);
+		/* Handling synchronous page fault? */
+		if (result & VM_FAULT_NEEDDSYNC)
+			result = dax_finish_sync_fault(vmf, pe_size, pfn);
 		up_read(&EXT4_I(inode)->i_mmap_sem);
 		sb_end_pagefault(sb);
 	} else {
@@ -351,6 +357,13 @@ static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
 	if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
 		return -EIO;
 
+	/*
+	 * We don't support synchronous mappings for non-DAX files. At least
+	 * until someone comes with a sensible use case.
+	 */
+	if (!IS_DAX(file_inode(file)) && (vma->vm_flags & VM_SYNC))
+		return -EOPNOTSUPP;
+
 	file_accessed(file);
 	if (IS_DAX(file_inode(file))) {
 		vma->vm_ops = &ext4_dax_vm_ops;
@@ -469,6 +482,7 @@ const struct file_operations ext4_file_operations = {
 	.compat_ioctl	= ext4_compat_ioctl,
 #endif
 	.mmap		= ext4_file_mmap,
+	.mmap_supported_flags = MAP_SYNC,
 	.open		= ext4_file_open,
 	.release	= ext4_release_file,
 	.fsync		= ext4_sync_file,
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 2633150e41b9..0992d76f7ab1 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1719,7 +1719,7 @@ static void mpage_release_unused_pages(struct mpage_da_data *mpd,
 		ext4_es_remove_extent(inode, start, last - start + 1);
 	}
 
-	pagevec_init(&pvec, 0);
+	pagevec_init(&pvec);
 	while (index <= end) {
 		nr_pages = pagevec_lookup_range(&pvec, mapping, &index, end);
 		if (nr_pages == 0)
@@ -2345,7 +2345,7 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd)
 	lblk = start << bpp_bits;
 	pblock = mpd->map.m_pblk;
 
-	pagevec_init(&pvec, 0);
+	pagevec_init(&pvec);
 	while (start <= end) {
 		nr_pages = pagevec_lookup_range(&pvec, inode->i_mapping,
 						&start, end);
@@ -2616,12 +2616,12 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)
 	else
 		tag = PAGECACHE_TAG_DIRTY;
 
-	pagevec_init(&pvec, 0);
+	pagevec_init(&pvec);
 	mpd->map.m_len = 0;
 	mpd->next_page = index;
 	while (index <= end) {
-		nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
-			      min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
+		nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end,
+				tag);
 		if (nr_pages == 0)
 			goto out;
 
@@ -2629,16 +2629,6 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)
 			struct page *page = pvec.pages[i];
 
 			/*
-			 * At this point, the page may be truncated or
-			 * invalidated (changing page->mapping to NULL), or
-			 * even swizzled back from swapper_space to tmpfs file
-			 * mapping. However, page->index will not change
-			 * because we have a reference on the page.
-			 */
-			if (page->index > end)
-				goto out;
-
-			/*
 			 * Accumulated enough dirty pages? This doesn't apply
 			 * to WB_SYNC_ALL mode. For integrity sync we have to
 			 * keep going because someone may be concurrently
@@ -3394,6 +3384,19 @@ static int ext4_releasepage(struct page *page, gfp_t wait)
 		return try_to_free_buffers(page);
 }
 
+static bool ext4_inode_datasync_dirty(struct inode *inode)
+{
+	journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
+
+	if (journal)
+		return !jbd2_transaction_committed(journal,
+					EXT4_I(inode)->i_datasync_tid);
+	/* Any metadata buffers to write? */
+	if (!list_empty(&inode->i_mapping->private_list))
+		return true;
+	return inode->i_state & I_DIRTY_DATASYNC;
+}
+
 static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
 			    unsigned flags, struct iomap *iomap)
 {
@@ -3507,6 +3510,8 @@ retry:
 	}
 
 	iomap->flags = 0;
+	if (ext4_inode_datasync_dirty(inode))
+		iomap->flags |= IOMAP_F_DIRTY;
 	iomap->bdev = inode->i_sb->s_bdev;
 	iomap->dax_dev = sbi->s_daxdev;
 	iomap->offset = first_block << blkbits;
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index b7558f292420..1eec25014f62 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -592,6 +592,44 @@ static int ext4_ioc_getfsmap(struct super_block *sb,
 	return 0;
 }
 
+static long ext4_ioctl_group_add(struct file *file,
+				 struct ext4_new_group_data *input)
+{
+	struct super_block *sb = file_inode(file)->i_sb;
+	int err, err2=0;
+
+	err = ext4_resize_begin(sb);
+	if (err)
+		return err;
+
+	if (ext4_has_feature_bigalloc(sb)) {
+		ext4_msg(sb, KERN_ERR,
+			 "Online resizing not supported with bigalloc");
+		err = -EOPNOTSUPP;
+		goto group_add_out;
+	}
+
+	err = mnt_want_write_file(file);
+	if (err)
+		goto group_add_out;
+
+	err = ext4_group_add(sb, input);
+	if (EXT4_SB(sb)->s_journal) {
+		jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
+		err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal);
+		jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
+	}
+	if (err == 0)
+		err = err2;
+	mnt_drop_write_file(file);
+	if (!err && ext4_has_group_desc_csum(sb) &&
+	    test_opt(sb, INIT_INODE_TABLE))
+		err = ext4_register_li_request(sb, input->group);
+group_add_out:
+	ext4_resize_end(sb);
+	return err;
+}
+
 long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 {
 	struct inode *inode = file_inode(filp);
@@ -776,44 +814,12 @@ mext_out:
 
 	case EXT4_IOC_GROUP_ADD: {
 		struct ext4_new_group_data input;
-		int err, err2=0;
-
-		err = ext4_resize_begin(sb);
-		if (err)
-			return err;
 
 		if (copy_from_user(&input, (struct ext4_new_group_input __user *)arg,
-				sizeof(input))) {
-			err = -EFAULT;
-			goto group_add_out;
-		}
-
-		if (ext4_has_feature_bigalloc(sb)) {
-			ext4_msg(sb, KERN_ERR,
-				 "Online resizing not supported with bigalloc");
-			err = -EOPNOTSUPP;
-			goto group_add_out;
-		}
-
-		err = mnt_want_write_file(filp);
-		if (err)
-			goto group_add_out;
+				sizeof(input)))
+			return -EFAULT;
 
-		err = ext4_group_add(sb, &input);
-		if (EXT4_SB(sb)->s_journal) {
-			jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
-			err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal);
-			jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
-		}
-		if (err == 0)
-			err = err2;
-		mnt_drop_write_file(filp);
-		if (!err && ext4_has_group_desc_csum(sb) &&
-		    test_opt(sb, INIT_INODE_TABLE))
-			err = ext4_register_li_request(sb, input.group);
-group_add_out:
-		ext4_resize_end(sb);
-		return err;
+		return ext4_ioctl_group_add(filp, &input);
 	}
 
 	case EXT4_IOC_MIGRATE:
@@ -1078,8 +1084,7 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 		break;
 	case EXT4_IOC32_GROUP_ADD: {
 		struct compat_ext4_new_group_input __user *uinput;
-		struct ext4_new_group_input input;
-		mm_segment_t old_fs;
+		struct ext4_new_group_data input;
 		int err;
 
 		uinput = compat_ptr(arg);
@@ -1092,12 +1097,7 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 				&uinput->reserved_blocks);
 		if (err)
 			return -EFAULT;
-		old_fs = get_fs();
-		set_fs(KERNEL_DS);
-		err = ext4_ioctl(file, EXT4_IOC_GROUP_ADD,
-				 (unsigned long) &input);
-		set_fs(old_fs);
-		return err;
+		return ext4_ioctl_group_add(file, &input);
 	}
 	case EXT4_IOC_MOVE_EXT:
 	case EXT4_IOC_RESIZE_FS:
diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c
index 436b3a1464d9..2bb7c9fc5144 100644
--- a/fs/f2fs/acl.c
+++ b/fs/f2fs/acl.c
@@ -250,6 +250,9 @@ static int __f2fs_set_acl(struct inode *inode, int type,
 
 int f2fs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
 {
+	if (unlikely(f2fs_cp_error(F2FS_I_SB(inode))))
+		return -EIO;
+
 	return __f2fs_set_acl(inode, type, acl, NULL);
 }
 
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index 04fe1df052b2..dd2e73e10857 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -29,7 +29,6 @@ struct kmem_cache *inode_entry_slab;
 void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi, bool end_io)
 {
 	set_ckpt_flags(sbi, CP_ERROR_FLAG);
-	sbi->sb->s_flags |= MS_RDONLY;
 	if (!end_io)
 		f2fs_flush_merged_writes(sbi);
 }
@@ -305,25 +304,22 @@ long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type,
 				long nr_to_write, enum iostat_type io_type)
 {
 	struct address_space *mapping = META_MAPPING(sbi);
-	pgoff_t index = 0, end = ULONG_MAX, prev = ULONG_MAX;
+	pgoff_t index = 0, prev = ULONG_MAX;
 	struct pagevec pvec;
 	long nwritten = 0;
+	int nr_pages;
 	struct writeback_control wbc = {
 		.for_reclaim = 0,
 	};
 	struct blk_plug plug;
 
-	pagevec_init(&pvec, 0);
+	pagevec_init(&pvec);
 
 	blk_start_plug(&plug);
 
-	while (index <= end) {
-		int i, nr_pages;
-		nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
-				PAGECACHE_TAG_DIRTY,
-				min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
-		if (unlikely(nr_pages == 0))
-			break;
+	while ((nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
+				PAGECACHE_TAG_DIRTY))) {
+		int i;
 
 		for (i = 0; i < nr_pages; i++) {
 			struct page *page = pvec.pages[i];
@@ -401,24 +397,23 @@ const struct address_space_operations f2fs_meta_aops = {
 #endif
 };
 
-static void __add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type)
+static void __add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino,
+						unsigned int devidx, int type)
 {
 	struct inode_management *im = &sbi->im[type];
 	struct ino_entry *e, *tmp;
 
 	tmp = f2fs_kmem_cache_alloc(ino_entry_slab, GFP_NOFS);
-retry:
+
 	radix_tree_preload(GFP_NOFS | __GFP_NOFAIL);
 
 	spin_lock(&im->ino_lock);
 	e = radix_tree_lookup(&im->ino_root, ino);
 	if (!e) {
 		e = tmp;
-		if (radix_tree_insert(&im->ino_root, ino, e)) {
-			spin_unlock(&im->ino_lock);
-			radix_tree_preload_end();
-			goto retry;
-		}
+		if (unlikely(radix_tree_insert(&im->ino_root, ino, e)))
+			f2fs_bug_on(sbi, 1);
+
 		memset(e, 0, sizeof(struct ino_entry));
 		e->ino = ino;
 
@@ -426,6 +421,10 @@ retry:
 		if (type != ORPHAN_INO)
 			im->ino_num++;
 	}
+
+	if (type == FLUSH_INO)
+		f2fs_set_bit(devidx, (char *)&e->dirty_device);
+
 	spin_unlock(&im->ino_lock);
 	radix_tree_preload_end();
 
@@ -454,7 +453,7 @@ static void __remove_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type)
 void add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type)
 {
 	/* add new dirty ino entry into list */
-	__add_ino_entry(sbi, ino, type);
+	__add_ino_entry(sbi, ino, 0, type);
 }
 
 void remove_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type)
@@ -480,7 +479,7 @@ void release_ino_entry(struct f2fs_sb_info *sbi, bool all)
 	struct ino_entry *e, *tmp;
 	int i;
 
-	for (i = all ? ORPHAN_INO: APPEND_INO; i <= UPDATE_INO; i++) {
+	for (i = all ? ORPHAN_INO : APPEND_INO; i < MAX_INO_ENTRY; i++) {
 		struct inode_management *im = &sbi->im[i];
 
 		spin_lock(&im->ino_lock);
@@ -494,6 +493,27 @@ void release_ino_entry(struct f2fs_sb_info *sbi, bool all)
 	}
 }
 
+void set_dirty_device(struct f2fs_sb_info *sbi, nid_t ino,
+					unsigned int devidx, int type)
+{
+	__add_ino_entry(sbi, ino, devidx, type);
+}
+
+bool is_dirty_device(struct f2fs_sb_info *sbi, nid_t ino,
+					unsigned int devidx, int type)
+{
+	struct inode_management *im = &sbi->im[type];
+	struct ino_entry *e;
+	bool is_dirty = false;
+
+	spin_lock(&im->ino_lock);
+	e = radix_tree_lookup(&im->ino_root, ino);
+	if (e && f2fs_test_bit(devidx, (char *)&e->dirty_device))
+		is_dirty = true;
+	spin_unlock(&im->ino_lock);
+	return is_dirty;
+}
+
 int acquire_orphan_inode(struct f2fs_sb_info *sbi)
 {
 	struct inode_management *im = &sbi->im[ORPHAN_INO];
@@ -530,7 +550,7 @@ void release_orphan_inode(struct f2fs_sb_info *sbi)
 void add_orphan_inode(struct inode *inode)
 {
 	/* add new orphan ino entry into list */
-	__add_ino_entry(F2FS_I_SB(inode), inode->i_ino, ORPHAN_INO);
+	__add_ino_entry(F2FS_I_SB(inode), inode->i_ino, 0, ORPHAN_INO);
 	update_inode_page(inode);
 }
 
@@ -554,7 +574,7 @@ static int recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
 		return err;
 	}
 
-	__add_ino_entry(sbi, ino, ORPHAN_INO);
+	__add_ino_entry(sbi, ino, 0, ORPHAN_INO);
 
 	inode = f2fs_iget_retry(sbi->sb, ino);
 	if (IS_ERR(inode)) {
@@ -590,6 +610,9 @@ int recover_orphan_inodes(struct f2fs_sb_info *sbi)
 	block_t start_blk, orphan_blocks, i, j;
 	unsigned int s_flags = sbi->sb->s_flags;
 	int err = 0;
+#ifdef CONFIG_QUOTA
+	int quota_enabled;
+#endif
 
 	if (!is_set_ckpt_flags(sbi, CP_ORPHAN_PRESENT_FLAG))
 		return 0;
@@ -602,8 +625,9 @@ int recover_orphan_inodes(struct f2fs_sb_info *sbi)
 #ifdef CONFIG_QUOTA
 	/* Needed for iput() to work correctly and not trash data */
 	sbi->sb->s_flags |= MS_ACTIVE;
+
 	/* Turn on quotas so that they are updated correctly */
-	f2fs_enable_quota_files(sbi);
+	quota_enabled = f2fs_enable_quota_files(sbi, s_flags & MS_RDONLY);
 #endif
 
 	start_blk = __start_cp_addr(sbi) + 1 + __cp_payload(sbi);
@@ -631,7 +655,8 @@ int recover_orphan_inodes(struct f2fs_sb_info *sbi)
 out:
 #ifdef CONFIG_QUOTA
 	/* Turn quotas off */
-	f2fs_quota_off_umount(sbi->sb);
+	if (quota_enabled)
+		f2fs_quota_off_umount(sbi->sb);
 #endif
 	sbi->sb->s_flags = s_flags; /* Restore MS_RDONLY status */
 
@@ -986,7 +1011,7 @@ int f2fs_sync_inode_meta(struct f2fs_sb_info *sbi)
 				update_inode_page(inode);
 			iput(inode);
 		}
-	};
+	}
 	return 0;
 }
 
@@ -1146,6 +1171,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
 	struct super_block *sb = sbi->sb;
 	struct curseg_info *seg_i = CURSEG_I(sbi, CURSEG_HOT_NODE);
 	u64 kbytes_written;
+	int err;
 
 	/* Flush all the NAT/SIT pages */
 	while (get_pages(sbi, F2FS_DIRTY_META)) {
@@ -1239,6 +1265,11 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
 	if (unlikely(f2fs_cp_error(sbi)))
 		return -EIO;
 
+	/* flush all device cache */
+	err = f2fs_flush_device_cache(sbi);
+	if (err)
+		return err;
+
 	/* write out checkpoint buffer at block 0 */
 	update_meta_page(sbi, ckpt, start_blk++);
 
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 36b535207c88..516fa0d3ff9c 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -173,7 +173,7 @@ static struct bio *__bio_alloc(struct f2fs_sb_info *sbi, block_t blk_addr,
 {
 	struct bio *bio;
 
-	bio = f2fs_bio_alloc(npages);
+	bio = f2fs_bio_alloc(sbi, npages, true);
 
 	f2fs_target_device(sbi, blk_addr, bio);
 	bio->bi_end_io = is_read ? f2fs_read_end_io : f2fs_write_end_io;
@@ -418,8 +418,8 @@ next:
 
 	bio_page = fio->encrypted_page ? fio->encrypted_page : fio->page;
 
-	/* set submitted = 1 as a return value */
-	fio->submitted = 1;
+	/* set submitted = true as a return value */
+	fio->submitted = true;
 
 	inc_page_count(sbi, WB_DATA_TYPE(bio_page));
 
@@ -473,7 +473,7 @@ static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr,
 		f2fs_wait_on_block_writeback(sbi, blkaddr);
 	}
 
-	bio = bio_alloc(GFP_KERNEL, min_t(int, nr_pages, BIO_MAX_PAGES));
+	bio = f2fs_bio_alloc(sbi, min_t(int, nr_pages, BIO_MAX_PAGES), false);
 	if (!bio) {
 		if (ctx)
 			fscrypt_release_ctx(ctx);
@@ -833,6 +833,13 @@ int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from)
 	struct f2fs_map_blocks map;
 	int err = 0;
 
+	/* convert inline data for Direct I/O*/
+	if (iocb->ki_flags & IOCB_DIRECT) {
+		err = f2fs_convert_inline_inode(inode);
+		if (err)
+			return err;
+	}
+
 	if (is_inode_flag_set(inode, FI_NO_PREALLOC))
 		return 0;
 
@@ -845,15 +852,11 @@ int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from)
 
 	map.m_next_pgofs = NULL;
 
-	if (iocb->ki_flags & IOCB_DIRECT) {
-		err = f2fs_convert_inline_inode(inode);
-		if (err)
-			return err;
+	if (iocb->ki_flags & IOCB_DIRECT)
 		return f2fs_map_blocks(inode, &map, 1,
 			__force_buffered_io(inode, WRITE) ?
 				F2FS_GET_BLOCK_PRE_AIO :
 				F2FS_GET_BLOCK_PRE_DIO);
-	}
 	if (iocb->ki_pos + iov_iter_count(from) > MAX_INLINE_DATA(inode)) {
 		err = f2fs_convert_inline_inode(inode);
 		if (err)
@@ -1334,7 +1337,7 @@ static int f2fs_read_data_pages(struct file *file,
 			struct address_space *mapping,
 			struct list_head *pages, unsigned nr_pages)
 {
-	struct inode *inode = file->f_mapping->host;
+	struct inode *inode = mapping->host;
 	struct page *page = list_last_entry(pages, struct page, lru);
 
 	trace_f2fs_readpages(inode, page, nr_pages);
@@ -1495,6 +1498,7 @@ static int __write_data_page(struct page *page, bool *submitted,
 	int err = 0;
 	struct f2fs_io_info fio = {
 		.sbi = sbi,
+		.ino = inode->i_ino,
 		.type = DATA,
 		.op = REQ_OP_WRITE,
 		.op_flags = wbc_to_write_flags(wbc),
@@ -1566,8 +1570,11 @@ write:
 			err = do_write_data_page(&fio);
 		}
 	}
+
+	down_write(&F2FS_I(inode)->i_sem);
 	if (F2FS_I(inode)->last_disk_size < psize)
 		F2FS_I(inode)->last_disk_size = psize;
+	up_write(&F2FS_I(inode)->i_sem);
 
 done:
 	if (err && err != -ENOENT)
@@ -1635,7 +1642,7 @@ static int f2fs_write_cache_pages(struct address_space *mapping,
 	int range_whole = 0;
 	int tag;
 
-	pagevec_init(&pvec, 0);
+	pagevec_init(&pvec);
 
 	if (get_dirty_pages(mapping->host) <=
 				SM_I(F2FS_M_SB(mapping))->min_hot_blocks)
@@ -1669,8 +1676,8 @@ retry:
 	while (!done && (index <= end)) {
 		int i;
 
-		nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
-			      min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1);
+		nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end,
+				tag);
 		if (nr_pages == 0)
 			break;
 
@@ -1678,11 +1685,6 @@ retry:
 			struct page *page = pvec.pages[i];
 			bool submitted = false;
 
-			if (page->index > end) {
-				done = 1;
-				break;
-			}
-
 			done_index = page->index;
 retry_write:
 			lock_page(page);
@@ -1937,6 +1939,12 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping,
 
 	trace_f2fs_write_begin(inode, pos, len, flags);
 
+	if (f2fs_is_atomic_file(inode) &&
+			!available_free_memory(sbi, INMEM_PAGES)) {
+		err = -ENOMEM;
+		goto fail;
+	}
+
 	/*
 	 * We should check this at this moment to avoid deadlock on inode page
 	 * and #0 page. The locking rule for inline_data conversion should be:
@@ -1952,7 +1960,7 @@ repeat:
 	 * Do not use grab_cache_page_write_begin() to avoid deadlock due to
 	 * wait_for_stable_page. Will wait that below with our IO control.
 	 */
-	page = pagecache_get_page(mapping, index,
+	page = f2fs_pagecache_get_page(mapping, index,
 				FGP_LOCK | FGP_WRITE | FGP_CREAT, GFP_NOFS);
 	if (!page) {
 		err = -ENOMEM;
@@ -2014,6 +2022,8 @@ repeat:
 fail:
 	f2fs_put_page(page, 1);
 	f2fs_write_failed(mapping, pos + len);
+	if (f2fs_is_atomic_file(inode))
+		drop_inmem_pages_all(sbi);
 	return err;
 }
 
diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c
index 87f449845f5f..ecada8425268 100644
--- a/fs/f2fs/debug.c
+++ b/fs/f2fs/debug.c
@@ -45,9 +45,18 @@ static void update_general_status(struct f2fs_sb_info *sbi)
 	si->ndirty_dent = get_pages(sbi, F2FS_DIRTY_DENTS);
 	si->ndirty_meta = get_pages(sbi, F2FS_DIRTY_META);
 	si->ndirty_data = get_pages(sbi, F2FS_DIRTY_DATA);
+	si->ndirty_qdata = get_pages(sbi, F2FS_DIRTY_QDATA);
 	si->ndirty_imeta = get_pages(sbi, F2FS_DIRTY_IMETA);
 	si->ndirty_dirs = sbi->ndirty_inode[DIR_INODE];
 	si->ndirty_files = sbi->ndirty_inode[FILE_INODE];
+
+	si->nquota_files = 0;
+	if (f2fs_sb_has_quota_ino(sbi->sb)) {
+		for (i = 0; i < MAXQUOTAS; i++) {
+			if (f2fs_qf_ino(sbi->sb, i))
+				si->nquota_files++;
+		}
+	}
 	si->ndirty_all = sbi->ndirty_inode[DIRTY_META];
 	si->inmem_pages = get_pages(sbi, F2FS_INMEM_PAGES);
 	si->aw_cnt = atomic_read(&sbi->aw_cnt);
@@ -61,6 +70,8 @@ static void update_general_status(struct f2fs_sb_info *sbi)
 			atomic_read(&SM_I(sbi)->fcc_info->issued_flush);
 		si->nr_flushing =
 			atomic_read(&SM_I(sbi)->fcc_info->issing_flush);
+		si->flush_list_empty =
+			llist_empty(&SM_I(sbi)->fcc_info->issue_list);
 	}
 	if (SM_I(sbi) && SM_I(sbi)->dcc_info) {
 		si->nr_discarded =
@@ -96,9 +107,9 @@ static void update_general_status(struct f2fs_sb_info *sbi)
 	si->dirty_nats = NM_I(sbi)->dirty_nat_cnt;
 	si->sits = MAIN_SEGS(sbi);
 	si->dirty_sits = SIT_I(sbi)->dirty_sentries;
-	si->free_nids = NM_I(sbi)->nid_cnt[FREE_NID_LIST];
+	si->free_nids = NM_I(sbi)->nid_cnt[FREE_NID];
 	si->avail_nids = NM_I(sbi)->available_nids;
-	si->alloc_nids = NM_I(sbi)->nid_cnt[ALLOC_NID_LIST];
+	si->alloc_nids = NM_I(sbi)->nid_cnt[PREALLOC_NID];
 	si->bg_gc = sbi->bg_gc;
 	si->util_free = (int)(free_user_blocks(sbi) >> sbi->log_blocks_per_seg)
 		* 100 / (int)(sbi->user_block_count >> sbi->log_blocks_per_seg)
@@ -231,14 +242,14 @@ get_cache:
 	}
 
 	/* free nids */
-	si->cache_mem += (NM_I(sbi)->nid_cnt[FREE_NID_LIST] +
-				NM_I(sbi)->nid_cnt[ALLOC_NID_LIST]) *
+	si->cache_mem += (NM_I(sbi)->nid_cnt[FREE_NID] +
+				NM_I(sbi)->nid_cnt[PREALLOC_NID]) *
 				sizeof(struct free_nid);
 	si->cache_mem += NM_I(sbi)->nat_cnt * sizeof(struct nat_entry);
 	si->cache_mem += NM_I(sbi)->dirty_nat_cnt *
 					sizeof(struct nat_entry_set);
 	si->cache_mem += si->inmem_pages * sizeof(struct inmem_pages);
-	for (i = 0; i <= ORPHAN_INO; i++)
+	for (i = 0; i < MAX_INO_ENTRY; i++)
 		si->cache_mem += sbi->im[i].ino_num * sizeof(struct ino_entry);
 	si->cache_mem += atomic_read(&sbi->total_ext_tree) *
 						sizeof(struct extent_tree);
@@ -262,9 +273,10 @@ static int stat_show(struct seq_file *s, void *v)
 	list_for_each_entry(si, &f2fs_stat_list, stat_list) {
 		update_general_status(si->sbi);
 
-		seq_printf(s, "\n=====[ partition info(%pg). #%d, %s]=====\n",
+		seq_printf(s, "\n=====[ partition info(%pg). #%d, %s, CP: %s]=====\n",
 			si->sbi->sb->s_bdev, i++,
-			f2fs_readonly(si->sbi->sb) ? "RO": "RW");
+			f2fs_readonly(si->sbi->sb) ? "RO": "RW",
+			f2fs_cp_error(si->sbi) ? "Error": "Good");
 		seq_printf(s, "[SB: 1] [CP: 2] [SIT: %d] [NAT: %d] ",
 			   si->sit_area_segs, si->nat_area_segs);
 		seq_printf(s, "[SSA: %d] [MAIN: %d",
@@ -349,10 +361,11 @@ static int stat_show(struct seq_file *s, void *v)
 		seq_printf(s, "  - Inner Struct Count: tree: %d(%d), node: %d\n",
 				si->ext_tree, si->zombie_tree, si->ext_node);
 		seq_puts(s, "\nBalancing F2FS Async:\n");
-		seq_printf(s, "  - IO (CP: %4d, Data: %4d, Flush: (%4d %4d), "
+		seq_printf(s, "  - IO (CP: %4d, Data: %4d, Flush: (%4d %4d %4d), "
 			"Discard: (%4d %4d)) cmd: %4d undiscard:%4u\n",
 			   si->nr_wb_cp_data, si->nr_wb_data,
 			   si->nr_flushing, si->nr_flushed,
+			   si->flush_list_empty,
 			   si->nr_discarding, si->nr_discarded,
 			   si->nr_discard_cmd, si->undiscard_blks);
 		seq_printf(s, "  - inmem: %4d, atomic IO: %4d (Max. %4d), "
@@ -365,6 +378,8 @@ static int stat_show(struct seq_file *s, void *v)
 			   si->ndirty_dent, si->ndirty_dirs, si->ndirty_all);
 		seq_printf(s, "  - datas: %4d in files:%4d\n",
 			   si->ndirty_data, si->ndirty_files);
+		seq_printf(s, "  - quota datas: %4d in quota files:%4d\n",
+			   si->ndirty_qdata, si->nquota_files);
 		seq_printf(s, "  - meta: %4d in %4d\n",
 			   si->ndirty_meta, si->meta_pages);
 		seq_printf(s, "  - imeta: %4d\n",
diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
index c0c933ad43c8..2d98d877c09d 100644
--- a/fs/f2fs/dir.c
+++ b/fs/f2fs/dir.c
@@ -10,10 +10,12 @@
  */
 #include <linux/fs.h>
 #include <linux/f2fs_fs.h>
+#include <linux/sched/signal.h>
 #include "f2fs.h"
 #include "node.h"
 #include "acl.h"
 #include "xattr.h"
+#include <trace/events/f2fs.h>
 
 static unsigned long dir_blocks(struct inode *inode)
 {
@@ -847,6 +849,7 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx)
 	struct f2fs_dentry_block *dentry_blk = NULL;
 	struct page *dentry_page = NULL;
 	struct file_ra_state *ra = &file->f_ra;
+	loff_t start_pos = ctx->pos;
 	unsigned int n = ((unsigned long)ctx->pos / NR_DENTRY_IN_BLOCK);
 	struct f2fs_dentry_ptr d;
 	struct fscrypt_str fstr = FSTR_INIT(NULL, 0);
@@ -855,24 +858,32 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx)
 	if (f2fs_encrypted_inode(inode)) {
 		err = fscrypt_get_encryption_info(inode);
 		if (err && err != -ENOKEY)
-			return err;
+			goto out;
 
 		err = fscrypt_fname_alloc_buffer(inode, F2FS_NAME_LEN, &fstr);
 		if (err < 0)
-			return err;
+			goto out;
 	}
 
 	if (f2fs_has_inline_dentry(inode)) {
 		err = f2fs_read_inline_dir(file, ctx, &fstr);
-		goto out;
+		goto out_free;
 	}
 
-	/* readahead for multi pages of dir */
-	if (npages - n > 1 && !ra_has_index(ra, n))
-		page_cache_sync_readahead(inode->i_mapping, ra, file, n,
+	for (; n < npages; n++, ctx->pos = n * NR_DENTRY_IN_BLOCK) {
+
+		/* allow readdir() to be interrupted */
+		if (fatal_signal_pending(current)) {
+			err = -ERESTARTSYS;
+			goto out_free;
+		}
+		cond_resched();
+
+		/* readahead for multi pages of dir */
+		if (npages - n > 1 && !ra_has_index(ra, n))
+			page_cache_sync_readahead(inode->i_mapping, ra, file, n,
 				min(npages - n, (pgoff_t)MAX_DIR_RA_PAGES));
 
-	for (; n < npages; n++) {
 		dentry_page = get_lock_data_page(inode, n, false);
 		if (IS_ERR(dentry_page)) {
 			err = PTR_ERR(dentry_page);
@@ -880,7 +891,7 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx)
 				err = 0;
 				continue;
 			} else {
-				goto out;
+				goto out_free;
 			}
 		}
 
@@ -896,12 +907,13 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx)
 			break;
 		}
 
-		ctx->pos = (n + 1) * NR_DENTRY_IN_BLOCK;
 		kunmap(dentry_page);
 		f2fs_put_page(dentry_page, 1);
 	}
-out:
+out_free:
 	fscrypt_fname_free_buffer(&fstr);
+out:
+	trace_f2fs_readdir(inode, start_pos, ctx->pos, err);
 	return err < 0 ? err : 0;
 }
 
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 115204fdefcc..f4e094e816c6 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -44,6 +44,8 @@
 enum {
 	FAULT_KMALLOC,
 	FAULT_PAGE_ALLOC,
+	FAULT_PAGE_GET,
+	FAULT_ALLOC_BIO,
 	FAULT_ALLOC_NID,
 	FAULT_ORPHAN,
 	FAULT_BLOCK,
@@ -91,6 +93,7 @@ extern char *fault_name[FAULT_MAX];
 #define F2FS_MOUNT_GRPQUOTA		0x00100000
 #define F2FS_MOUNT_PRJQUOTA		0x00200000
 #define F2FS_MOUNT_QUOTA		0x00400000
+#define F2FS_MOUNT_INLINE_XATTR_SIZE	0x00800000
 
 #define clear_opt(sbi, option)	((sbi)->mount_opt.opt &= ~F2FS_MOUNT_##option)
 #define set_opt(sbi, option)	((sbi)->mount_opt.opt |= F2FS_MOUNT_##option)
@@ -116,6 +119,8 @@ struct f2fs_mount_info {
 #define F2FS_FEATURE_EXTRA_ATTR		0x0008
 #define F2FS_FEATURE_PRJQUOTA		0x0010
 #define F2FS_FEATURE_INODE_CHKSUM	0x0020
+#define F2FS_FEATURE_FLEXIBLE_INLINE_XATTR	0x0040
+#define F2FS_FEATURE_QUOTA_INO		0x0080
 
 #define F2FS_HAS_FEATURE(sb, mask)					\
 	((F2FS_SB(sb)->raw_super->feature & cpu_to_le32(mask)) != 0)
@@ -145,7 +150,7 @@ enum {
 #define BATCHED_TRIM_BLOCKS(sbi)	\
 		(BATCHED_TRIM_SEGMENTS(sbi) << (sbi)->log_blocks_per_seg)
 #define MAX_DISCARD_BLOCKS(sbi)		BLKS_PER_SEC(sbi)
-#define DISCARD_ISSUE_RATE		8
+#define DEF_MAX_DISCARD_REQUEST		8	/* issue 8 discards per round */
 #define DEF_MIN_DISCARD_ISSUE_TIME	50	/* 50 ms, if exists */
 #define DEF_MAX_DISCARD_ISSUE_TIME	60000	/* 60 s, if no candidates */
 #define DEF_CP_INTERVAL			60	/* 60 secs */
@@ -156,7 +161,6 @@ struct cp_control {
 	__u64 trim_start;
 	__u64 trim_end;
 	__u64 trim_minlen;
-	__u64 trimmed;
 };
 
 /*
@@ -175,12 +179,14 @@ enum {
 	ORPHAN_INO,		/* for orphan ino list */
 	APPEND_INO,		/* for append ino list */
 	UPDATE_INO,		/* for update ino list */
+	FLUSH_INO,		/* for multiple device flushing */
 	MAX_INO_ENTRY,		/* max. list */
 };
 
 struct ino_entry {
-	struct list_head list;	/* list head */
-	nid_t ino;		/* inode number */
+	struct list_head list;		/* list head */
+	nid_t ino;			/* inode number */
+	unsigned int dirty_device;	/* dirty device bitmap */
 };
 
 /* for the list of inodes to be GCed */
@@ -204,10 +210,6 @@ struct discard_entry {
 #define plist_idx(blk_num)	((blk_num) >= MAX_PLIST_NUM ?		\
 					(MAX_PLIST_NUM - 1) : (blk_num - 1))
 
-#define P_ACTIVE	0x01
-#define P_TRIM		0x02
-#define plist_issue(tag)	(((tag) & P_ACTIVE) || ((tag) & P_TRIM))
-
 enum {
 	D_PREP,
 	D_SUBMIT,
@@ -239,12 +241,32 @@ struct discard_cmd {
 	int error;			/* bio error */
 };
 
+enum {
+	DPOLICY_BG,
+	DPOLICY_FORCE,
+	DPOLICY_FSTRIM,
+	DPOLICY_UMOUNT,
+	MAX_DPOLICY,
+};
+
+struct discard_policy {
+	int type;			/* type of discard */
+	unsigned int min_interval;	/* used for candidates exist */
+	unsigned int max_interval;	/* used for candidates not exist */
+	unsigned int max_requests;	/* # of discards issued per round */
+	unsigned int io_aware_gran;	/* minimum granularity discard not be aware of I/O */
+	bool io_aware;			/* issue discard in idle time */
+	bool sync;			/* submit discard with REQ_SYNC flag */
+	unsigned int granularity;	/* discard granularity */
+};
+
 struct discard_cmd_control {
 	struct task_struct *f2fs_issue_discard;	/* discard thread */
 	struct list_head entry_list;		/* 4KB discard entry list */
 	struct list_head pend_list[MAX_PLIST_NUM];/* store pending entries */
 	unsigned char pend_list_tag[MAX_PLIST_NUM];/* tag for pending entries */
 	struct list_head wait_list;		/* store on-flushing entries */
+	struct list_head fstrim_list;		/* in-flight discard from fstrim */
 	wait_queue_head_t discard_wait_queue;	/* waiting queue for wake-up */
 	unsigned int discard_wake;		/* to wake up discard thread */
 	struct mutex cmd_lock;
@@ -377,11 +399,14 @@ struct f2fs_flush_device {
 
 /* for inline stuff */
 #define DEF_INLINE_RESERVED_SIZE	1
+#define DEF_MIN_INLINE_SIZE		1
 static inline int get_extra_isize(struct inode *inode);
-#define MAX_INLINE_DATA(inode)	(sizeof(__le32) * \
-				(CUR_ADDRS_PER_INODE(inode) - \
-				DEF_INLINE_RESERVED_SIZE - \
-				F2FS_INLINE_XATTR_ADDRS))
+static inline int get_inline_xattr_addrs(struct inode *inode);
+#define F2FS_INLINE_XATTR_ADDRS(inode)	get_inline_xattr_addrs(inode)
+#define MAX_INLINE_DATA(inode)	(sizeof(__le32) *			\
+				(CUR_ADDRS_PER_INODE(inode) -		\
+				F2FS_INLINE_XATTR_ADDRS(inode) -	\
+				DEF_INLINE_RESERVED_SIZE))
 
 /* for inline dir */
 #define NR_INLINE_DENTRY(inode)	(MAX_INLINE_DATA(inode) * BITS_PER_BYTE / \
@@ -581,6 +606,7 @@ struct f2fs_inode_info {
 #endif
 	struct list_head dirty_list;	/* dirty list for dirs and files */
 	struct list_head gdirty_list;	/* linked in global dirty list */
+	struct list_head inmem_ilist;	/* list for inmem inodes */
 	struct list_head inmem_pages;	/* inmemory pages managed by f2fs */
 	struct task_struct *inmem_task;	/* store inmemory task */
 	struct mutex inmem_lock;	/* lock for inmemory pages */
@@ -591,6 +617,7 @@ struct f2fs_inode_info {
 
 	int i_extra_isize;		/* size of extra space located in i_addr */
 	kprojid_t i_projid;		/* id for project quota */
+	int i_inline_xattr_size;	/* inline xattr size */
 };
 
 static inline void get_extent_info(struct extent_info *ext,
@@ -664,10 +691,13 @@ static inline void __try_update_largest_extent(struct inode *inode,
 	}
 }
 
-enum nid_list {
-	FREE_NID_LIST,
-	ALLOC_NID_LIST,
-	MAX_NID_LIST,
+/*
+ * For free nid management
+ */
+enum nid_state {
+	FREE_NID,		/* newly added to free nid list */
+	PREALLOC_NID,		/* it is preallocated */
+	MAX_NID_STATE,
 };
 
 struct f2fs_nm_info {
@@ -690,8 +720,8 @@ struct f2fs_nm_info {
 
 	/* free node ids management */
 	struct radix_tree_root free_nid_root;/* root of the free_nid cache */
-	struct list_head nid_list[MAX_NID_LIST];/* lists for free nids */
-	unsigned int nid_cnt[MAX_NID_LIST];	/* the number of free node id */
+	struct list_head free_nid_list;		/* list for free nids excluding preallocated nids */
+	unsigned int nid_cnt[MAX_NID_STATE];	/* the number of free node id */
 	spinlock_t nid_list_lock;	/* protect nid lists ops */
 	struct mutex build_lock;	/* lock for build free nids */
 	unsigned char (*free_nid_bitmap)[NAT_ENTRY_BITMAP_SIZE];
@@ -769,6 +799,7 @@ enum {
 struct flush_cmd {
 	struct completion wait;
 	struct llist_node llnode;
+	nid_t ino;
 	int ret;
 };
 
@@ -787,6 +818,8 @@ struct f2fs_sm_info {
 	struct dirty_seglist_info *dirty_info;	/* dirty segment information */
 	struct curseg_info *curseg_array;	/* active segment information */
 
+	struct rw_semaphore curseg_lock;	/* for preventing curseg change */
+
 	block_t seg0_blkaddr;		/* block address of 0'th segment */
 	block_t main_blkaddr;		/* start block address of main area */
 	block_t ssa_blkaddr;		/* start block address of SSA area */
@@ -808,6 +841,7 @@ struct f2fs_sm_info {
 	unsigned int min_ipu_util;	/* in-place-update threshold */
 	unsigned int min_fsync_blocks;	/* threshold for fsync */
 	unsigned int min_hot_blocks;	/* threshold for hot block allocation */
+	unsigned int min_ssr_sections;	/* threshold to trigger SSR allocation */
 
 	/* for flush command control */
 	struct flush_cmd_control *fcc_info;
@@ -829,6 +863,7 @@ struct f2fs_sm_info {
 enum count_type {
 	F2FS_DIRTY_DENTS,
 	F2FS_DIRTY_DATA,
+	F2FS_DIRTY_QDATA,
 	F2FS_DIRTY_NODES,
 	F2FS_DIRTY_META,
 	F2FS_INMEM_PAGES,
@@ -877,6 +912,18 @@ enum need_lock_type {
 	LOCK_RETRY,
 };
 
+enum cp_reason_type {
+	CP_NO_NEEDED,
+	CP_NON_REGULAR,
+	CP_HARDLINK,
+	CP_SB_NEED_CP,
+	CP_WRONG_PINO,
+	CP_NO_SPC_ROLL,
+	CP_NODE_NEED_CP,
+	CP_FASTBOOT_MODE,
+	CP_SPEC_LOG_NUM,
+};
+
 enum iostat_type {
 	APP_DIRECT_IO,			/* app direct IOs */
 	APP_BUFFERED_IO,		/* app buffered IOs */
@@ -896,6 +943,7 @@ enum iostat_type {
 
 struct f2fs_io_info {
 	struct f2fs_sb_info *sbi;	/* f2fs_sb_info pointer */
+	nid_t ino;		/* inode number */
 	enum page_type type;	/* contains DATA/NODE/META/META_FLUSH */
 	enum temp_type temp;	/* contains HOT/WARM/COLD */
 	int op;			/* contains REQ_OP_ */
@@ -940,6 +988,7 @@ enum inode_type {
 	DIR_INODE,			/* for dirty dir inode */
 	FILE_INODE,			/* for dirty regular/symlink inode */
 	DIRTY_META,			/* for all dirtied inode metadata */
+	ATOMIC_FILE,			/* for all atomic files */
 	NR_INODE_TYPE,
 };
 
@@ -1042,12 +1091,15 @@ struct f2fs_sb_info {
 	loff_t max_file_blocks;			/* max block index of file */
 	int active_logs;			/* # of active logs */
 	int dir_level;				/* directory level */
+	int inline_xattr_size;			/* inline xattr size */
+	unsigned int trigger_ssr_threshold;	/* threshold to trigger ssr */
 
 	block_t user_block_count;		/* # of user blocks */
 	block_t total_valid_block_count;	/* # of valid blocks */
 	block_t discard_blks;			/* discard command candidats */
 	block_t last_valid_block_count;		/* for recovery */
 	block_t reserved_blocks;		/* configurable reserved blocks */
+	block_t current_reserved_blocks;	/* current reserved blocks */
 
 	u32 s_next_generation;			/* for NFS support */
 
@@ -1113,6 +1165,8 @@ struct f2fs_sb_info {
 	struct list_head s_list;
 	int s_ndevs;				/* number of devices */
 	struct f2fs_dev_info *devs;		/* for device list */
+	unsigned int dirty_device;		/* for checkpoint data flush */
+	spinlock_t dev_lock;			/* protect dirty_device */
 	struct mutex umount_mutex;
 	unsigned int shrinker_run_no;
 
@@ -1176,8 +1230,7 @@ static inline void f2fs_update_time(struct f2fs_sb_info *sbi, int type)
 
 static inline bool f2fs_time_over(struct f2fs_sb_info *sbi, int type)
 {
-	struct timespec ts = {sbi->interval_time[type], 0};
-	unsigned long interval = timespec_to_jiffies(&ts);
+	unsigned long interval = sbi->interval_time[type] * HZ;
 
 	return time_after(jiffies, sbi->last_time[type] + interval);
 }
@@ -1344,6 +1397,13 @@ static inline unsigned long long cur_cp_version(struct f2fs_checkpoint *cp)
 	return le64_to_cpu(cp->checkpoint_ver);
 }
 
+static inline unsigned long f2fs_qf_ino(struct super_block *sb, int type)
+{
+	if (type < F2FS_MAX_QUOTAS)
+		return le32_to_cpu(F2FS_SB(sb)->raw_super->qf_ino[type]);
+	return 0;
+}
+
 static inline __u64 cur_cp_crc(struct f2fs_checkpoint *cp)
 {
 	size_t crc_offset = le32_to_cpu(cp->checksum_offset);
@@ -1522,7 +1582,8 @@ static inline int inc_valid_block_count(struct f2fs_sb_info *sbi,
 
 	spin_lock(&sbi->stat_lock);
 	sbi->total_valid_block_count += (block_t)(*count);
-	avail_user_block_count = sbi->user_block_count - sbi->reserved_blocks;
+	avail_user_block_count = sbi->user_block_count -
+					sbi->current_reserved_blocks;
 	if (unlikely(sbi->total_valid_block_count > avail_user_block_count)) {
 		diff = sbi->total_valid_block_count - avail_user_block_count;
 		*count -= diff;
@@ -1556,6 +1617,10 @@ static inline void dec_valid_block_count(struct f2fs_sb_info *sbi,
 	f2fs_bug_on(sbi, sbi->total_valid_block_count < (block_t) count);
 	f2fs_bug_on(sbi, inode->i_blocks < sectors);
 	sbi->total_valid_block_count -= (block_t)count;
+	if (sbi->reserved_blocks &&
+		sbi->current_reserved_blocks < sbi->reserved_blocks)
+		sbi->current_reserved_blocks = min(sbi->reserved_blocks,
+					sbi->current_reserved_blocks + count);
 	spin_unlock(&sbi->stat_lock);
 	f2fs_i_blocks_write(inode, count, false, true);
 }
@@ -1576,6 +1641,8 @@ static inline void inode_inc_dirty_pages(struct inode *inode)
 	atomic_inc(&F2FS_I(inode)->dirty_pages);
 	inc_page_count(F2FS_I_SB(inode), S_ISDIR(inode->i_mode) ?
 				F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA);
+	if (IS_NOQUOTA(inode))
+		inc_page_count(F2FS_I_SB(inode), F2FS_DIRTY_QDATA);
 }
 
 static inline void dec_page_count(struct f2fs_sb_info *sbi, int count_type)
@@ -1592,6 +1659,8 @@ static inline void inode_dec_dirty_pages(struct inode *inode)
 	atomic_dec(&F2FS_I(inode)->dirty_pages);
 	dec_page_count(F2FS_I_SB(inode), S_ISDIR(inode->i_mode) ?
 				F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA);
+	if (IS_NOQUOTA(inode))
+		dec_page_count(F2FS_I_SB(inode), F2FS_DIRTY_QDATA);
 }
 
 static inline s64 get_pages(struct f2fs_sb_info *sbi, int count_type)
@@ -1699,10 +1768,17 @@ static inline int inc_valid_node_count(struct f2fs_sb_info *sbi,
 			return ret;
 	}
 
+#ifdef CONFIG_F2FS_FAULT_INJECTION
+	if (time_to_inject(sbi, FAULT_BLOCK)) {
+		f2fs_show_injection_info(FAULT_BLOCK);
+		goto enospc;
+	}
+#endif
+
 	spin_lock(&sbi->stat_lock);
 
 	valid_block_count = sbi->total_valid_block_count + 1;
-	if (unlikely(valid_block_count + sbi->reserved_blocks >
+	if (unlikely(valid_block_count + sbi->current_reserved_blocks >
 						sbi->user_block_count)) {
 		spin_unlock(&sbi->stat_lock);
 		goto enospc;
@@ -1745,6 +1821,9 @@ static inline void dec_valid_node_count(struct f2fs_sb_info *sbi,
 
 	sbi->total_valid_node_count--;
 	sbi->total_valid_block_count--;
+	if (sbi->reserved_blocks &&
+		sbi->current_reserved_blocks < sbi->reserved_blocks)
+		sbi->current_reserved_blocks++;
 
 	spin_unlock(&sbi->stat_lock);
 
@@ -1791,6 +1870,19 @@ static inline struct page *f2fs_grab_cache_page(struct address_space *mapping,
 	return grab_cache_page_write_begin(mapping, index, AOP_FLAG_NOFS);
 }
 
+static inline struct page *f2fs_pagecache_get_page(
+				struct address_space *mapping, pgoff_t index,
+				int fgp_flags, gfp_t gfp_mask)
+{
+#ifdef CONFIG_F2FS_FAULT_INJECTION
+	if (time_to_inject(F2FS_M_SB(mapping), FAULT_PAGE_GET)) {
+		f2fs_show_injection_info(FAULT_PAGE_GET);
+		return NULL;
+	}
+#endif
+	return pagecache_get_page(mapping, index, fgp_flags, gfp_mask);
+}
+
 static inline void f2fs_copy_page(struct page *src, struct page *dst)
 {
 	char *src_kaddr = kmap(src);
@@ -1840,15 +1932,25 @@ static inline void *f2fs_kmem_cache_alloc(struct kmem_cache *cachep,
 	return entry;
 }
 
-static inline struct bio *f2fs_bio_alloc(int npages)
+static inline struct bio *f2fs_bio_alloc(struct f2fs_sb_info *sbi,
+						int npages, bool no_fail)
 {
 	struct bio *bio;
 
-	/* No failure on bio allocation */
-	bio = bio_alloc(GFP_NOIO, npages);
-	if (!bio)
-		bio = bio_alloc(GFP_NOIO | __GFP_NOFAIL, npages);
-	return bio;
+	if (no_fail) {
+		/* No failure on bio allocation */
+		bio = bio_alloc(GFP_NOIO, npages);
+		if (!bio)
+			bio = bio_alloc(GFP_NOIO | __GFP_NOFAIL, npages);
+		return bio;
+	}
+#ifdef CONFIG_F2FS_FAULT_INJECTION
+	if (time_to_inject(sbi, FAULT_ALLOC_BIO)) {
+		f2fs_show_injection_info(FAULT_ALLOC_BIO);
+		return NULL;
+	}
+#endif
+	return bio_alloc(GFP_KERNEL, npages);
 }
 
 static inline void f2fs_radix_tree_insert(struct radix_tree_root *root,
@@ -2158,25 +2260,20 @@ static inline int f2fs_has_inline_xattr(struct inode *inode)
 
 static inline unsigned int addrs_per_inode(struct inode *inode)
 {
-	if (f2fs_has_inline_xattr(inode))
-		return CUR_ADDRS_PER_INODE(inode) - F2FS_INLINE_XATTR_ADDRS;
-	return CUR_ADDRS_PER_INODE(inode);
+	return CUR_ADDRS_PER_INODE(inode) - F2FS_INLINE_XATTR_ADDRS(inode);
 }
 
-static inline void *inline_xattr_addr(struct page *page)
+static inline void *inline_xattr_addr(struct inode *inode, struct page *page)
 {
 	struct f2fs_inode *ri = F2FS_INODE(page);
 
 	return (void *)&(ri->i_addr[DEF_ADDRS_PER_INODE -
-					F2FS_INLINE_XATTR_ADDRS]);
+					F2FS_INLINE_XATTR_ADDRS(inode)]);
 }
 
 static inline int inline_xattr_size(struct inode *inode)
 {
-	if (f2fs_has_inline_xattr(inode))
-		return F2FS_INLINE_XATTR_ADDRS << 2;
-	else
-		return 0;
+	return get_inline_xattr_addrs(inode) * sizeof(__le32);
 }
 
 static inline int f2fs_has_inline_data(struct inode *inode)
@@ -2257,9 +2354,10 @@ static inline void clear_file(struct inode *inode, int type)
 
 static inline bool f2fs_skip_inode_update(struct inode *inode, int dsync)
 {
+	bool ret;
+
 	if (dsync) {
 		struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
-		bool ret;
 
 		spin_lock(&sbi->inode_lock[DIRTY_META]);
 		ret = list_empty(&F2FS_I(inode)->gdirty_list);
@@ -2270,7 +2368,12 @@ static inline bool f2fs_skip_inode_update(struct inode *inode, int dsync)
 			file_keep_isize(inode) ||
 			i_size_read(inode) & PAGE_MASK)
 		return false;
-	return F2FS_I(inode)->last_disk_size == i_size_read(inode);
+
+	down_read(&F2FS_I(inode)->i_sem);
+	ret = F2FS_I(inode)->last_disk_size == i_size_read(inode);
+	up_read(&F2FS_I(inode)->i_sem);
+
+	return ret;
 }
 
 static inline int f2fs_readonly(struct super_block *sb)
@@ -2320,6 +2423,12 @@ static inline int get_extra_isize(struct inode *inode)
 	return F2FS_I(inode)->i_extra_isize / sizeof(__le32);
 }
 
+static inline int f2fs_sb_has_flexible_inline_xattr(struct super_block *sb);
+static inline int get_inline_xattr_addrs(struct inode *inode)
+{
+	return F2FS_I(inode)->i_inline_xattr_size;
+}
+
 #define get_inode_mode(i) \
 	((is_inode_flag_set(i, FI_ACL_MODE)) ? \
 	 (F2FS_I(i)->i_acl_mode) : ((i)->i_mode))
@@ -2448,7 +2557,7 @@ static inline int f2fs_add_link(struct dentry *dentry, struct inode *inode)
  */
 int f2fs_inode_dirtied(struct inode *inode, bool sync);
 void f2fs_inode_synced(struct inode *inode);
-void f2fs_enable_quota_files(struct f2fs_sb_info *sbi);
+int f2fs_enable_quota_files(struct f2fs_sb_info *sbi, bool rdonly);
 void f2fs_quota_off_umount(struct super_block *sb);
 int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover);
 int f2fs_sync_fs(struct super_block *sb, int sync);
@@ -2476,7 +2585,7 @@ void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni);
 pgoff_t get_next_page_offset(struct dnode_of_data *dn, pgoff_t pgofs);
 int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode);
 int truncate_inode_blocks(struct inode *inode, pgoff_t from);
-int truncate_xattr_node(struct inode *inode, struct page *page);
+int truncate_xattr_node(struct inode *inode);
 int wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino);
 int remove_inode_page(struct inode *inode);
 struct page *new_inode_page(struct inode *inode);
@@ -2511,19 +2620,22 @@ void destroy_node_manager_caches(void);
  */
 bool need_SSR(struct f2fs_sb_info *sbi);
 void register_inmem_page(struct inode *inode, struct page *page);
+void drop_inmem_pages_all(struct f2fs_sb_info *sbi);
 void drop_inmem_pages(struct inode *inode);
 void drop_inmem_page(struct inode *inode, struct page *page);
 int commit_inmem_pages(struct inode *inode);
 void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need);
 void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi);
-int f2fs_issue_flush(struct f2fs_sb_info *sbi);
+int f2fs_issue_flush(struct f2fs_sb_info *sbi, nid_t ino);
 int create_flush_cmd_control(struct f2fs_sb_info *sbi);
+int f2fs_flush_device_cache(struct f2fs_sb_info *sbi);
 void destroy_flush_cmd_control(struct f2fs_sb_info *sbi, bool free);
 void invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr);
 bool is_checkpointed_data(struct f2fs_sb_info *sbi, block_t blkaddr);
-void refresh_sit_entry(struct f2fs_sb_info *sbi, block_t old, block_t new);
+void init_discard_policy(struct discard_policy *dpolicy, int discard_type,
+						unsigned int granularity);
 void stop_discard_thread(struct f2fs_sb_info *sbi);
-void f2fs_wait_discard_bios(struct f2fs_sb_info *sbi, bool umount);
+bool f2fs_wait_discard_bios(struct f2fs_sb_info *sbi);
 void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc);
 void release_discard_addrs(struct f2fs_sb_info *sbi);
 int npages_for_summary_flush(struct f2fs_sb_info *sbi, bool for_ra);
@@ -2578,6 +2690,10 @@ void add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type);
 void remove_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type);
 void release_ino_entry(struct f2fs_sb_info *sbi, bool all);
 bool exist_written_data(struct f2fs_sb_info *sbi, nid_t ino, int mode);
+void set_dirty_device(struct f2fs_sb_info *sbi, nid_t ino,
+					unsigned int devidx, int type);
+bool is_dirty_device(struct f2fs_sb_info *sbi, nid_t ino,
+					unsigned int devidx, int type);
 int f2fs_sync_inode_meta(struct f2fs_sb_info *sbi);
 int acquire_orphan_inode(struct f2fs_sb_info *sbi);
 void release_orphan_inode(struct f2fs_sb_info *sbi);
@@ -2665,14 +2781,16 @@ struct f2fs_stat_info {
 	unsigned long long hit_largest, hit_cached, hit_rbtree;
 	unsigned long long hit_total, total_ext;
 	int ext_tree, zombie_tree, ext_node;
-	int ndirty_node, ndirty_dent, ndirty_meta, ndirty_data, ndirty_imeta;
+	int ndirty_node, ndirty_dent, ndirty_meta, ndirty_imeta;
+	int ndirty_data, ndirty_qdata;
 	int inmem_pages;
-	unsigned int ndirty_dirs, ndirty_files, ndirty_all;
+	unsigned int ndirty_dirs, ndirty_files, nquota_files, ndirty_all;
 	int nats, dirty_nats, sits, dirty_sits;
 	int free_nids, avail_nids, alloc_nids;
 	int total_count, utilization;
 	int bg_gc, nr_wb_cp_data, nr_wb_data;
-	int nr_flushing, nr_flushed, nr_discarding, nr_discarded;
+	int nr_flushing, nr_flushed, flush_list_empty;
+	int nr_discarding, nr_discarded;
 	int nr_discard_cmd;
 	unsigned int undiscard_blks;
 	int inline_xattr, inline_inode, inline_dir, append, update, orphans;
@@ -2981,6 +3099,16 @@ static inline int f2fs_sb_has_inode_chksum(struct super_block *sb)
 	return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_INODE_CHKSUM);
 }
 
+static inline int f2fs_sb_has_flexible_inline_xattr(struct super_block *sb)
+{
+	return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_FLEXIBLE_INLINE_XATTR);
+}
+
+static inline int f2fs_sb_has_quota_ino(struct super_block *sb)
+{
+	return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_QUOTA_INO);
+}
+
 #ifdef CONFIG_BLK_DEV_ZONED
 static inline int get_blkz_type(struct f2fs_sb_info *sbi,
 			struct block_device *bdev, block_t blkaddr)
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 517e112c8a9a..7874bbd7311d 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -53,6 +53,11 @@ static int f2fs_vm_page_mkwrite(struct vm_fault *vmf)
 	struct dnode_of_data dn;
 	int err;
 
+	if (unlikely(f2fs_cp_error(sbi))) {
+		err = -EIO;
+		goto err;
+	}
+
 	sb_start_pagefault(inode->i_sb);
 
 	f2fs_bug_on(sbi, f2fs_has_inline_data(inode));
@@ -114,6 +119,7 @@ out_sem:
 out:
 	sb_end_pagefault(inode->i_sb);
 	f2fs_update_time(sbi, REQ_TIME);
+err:
 	return block_page_mkwrite_return(err);
 }
 
@@ -138,27 +144,29 @@ static int get_parent_ino(struct inode *inode, nid_t *pino)
 	return 1;
 }
 
-static inline bool need_do_checkpoint(struct inode *inode)
+static inline enum cp_reason_type need_do_checkpoint(struct inode *inode)
 {
 	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
-	bool need_cp = false;
+	enum cp_reason_type cp_reason = CP_NO_NEEDED;
 
-	if (!S_ISREG(inode->i_mode) || inode->i_nlink != 1)
-		need_cp = true;
+	if (!S_ISREG(inode->i_mode))
+		cp_reason = CP_NON_REGULAR;
+	else if (inode->i_nlink != 1)
+		cp_reason = CP_HARDLINK;
 	else if (is_sbi_flag_set(sbi, SBI_NEED_CP))
-		need_cp = true;
+		cp_reason = CP_SB_NEED_CP;
 	else if (file_wrong_pino(inode))
-		need_cp = true;
+		cp_reason = CP_WRONG_PINO;
 	else if (!space_for_roll_forward(sbi))
-		need_cp = true;
+		cp_reason = CP_NO_SPC_ROLL;
 	else if (!is_checkpointed_node(sbi, F2FS_I(inode)->i_pino))
-		need_cp = true;
+		cp_reason = CP_NODE_NEED_CP;
 	else if (test_opt(sbi, FASTBOOT))
-		need_cp = true;
+		cp_reason = CP_FASTBOOT_MODE;
 	else if (sbi->active_logs == 2)
-		need_cp = true;
+		cp_reason = CP_SPEC_LOG_NUM;
 
-	return need_cp;
+	return cp_reason;
 }
 
 static bool need_inode_page_update(struct f2fs_sb_info *sbi, nid_t ino)
@@ -193,7 +201,7 @@ static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end,
 	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
 	nid_t ino = inode->i_ino;
 	int ret = 0;
-	bool need_cp = false;
+	enum cp_reason_type cp_reason = 0;
 	struct writeback_control wbc = {
 		.sync_mode = WB_SYNC_ALL,
 		.nr_to_write = LONG_MAX,
@@ -212,7 +220,7 @@ static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end,
 	clear_inode_flag(inode, FI_NEED_IPU);
 
 	if (ret) {
-		trace_f2fs_sync_file_exit(inode, need_cp, datasync, ret);
+		trace_f2fs_sync_file_exit(inode, cp_reason, datasync, ret);
 		return ret;
 	}
 
@@ -243,10 +251,10 @@ go_write:
 	 * sudden-power-off.
 	 */
 	down_read(&F2FS_I(inode)->i_sem);
-	need_cp = need_do_checkpoint(inode);
+	cp_reason = need_do_checkpoint(inode);
 	up_read(&F2FS_I(inode)->i_sem);
 
-	if (need_cp) {
+	if (cp_reason) {
 		/* all the dirty node pages should be flushed for POR */
 		ret = f2fs_sync_fs(inode->i_sb, 1);
 
@@ -294,37 +302,43 @@ sync_nodes:
 	remove_ino_entry(sbi, ino, APPEND_INO);
 	clear_inode_flag(inode, FI_APPEND_WRITE);
 flush_out:
-	remove_ino_entry(sbi, ino, UPDATE_INO);
-	clear_inode_flag(inode, FI_UPDATE_WRITE);
 	if (!atomic)
-		ret = f2fs_issue_flush(sbi);
+		ret = f2fs_issue_flush(sbi, inode->i_ino);
+	if (!ret) {
+		remove_ino_entry(sbi, ino, UPDATE_INO);
+		clear_inode_flag(inode, FI_UPDATE_WRITE);
+		remove_ino_entry(sbi, ino, FLUSH_INO);
+	}
 	f2fs_update_time(sbi, REQ_TIME);
 out:
-	trace_f2fs_sync_file_exit(inode, need_cp, datasync, ret);
+	trace_f2fs_sync_file_exit(inode, cp_reason, datasync, ret);
 	f2fs_trace_ios(NULL, 1);
 	return ret;
 }
 
 int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 {
+	if (unlikely(f2fs_cp_error(F2FS_I_SB(file_inode(file)))))
+		return -EIO;
 	return f2fs_do_sync_file(file, start, end, datasync, false);
 }
 
 static pgoff_t __get_first_dirty_index(struct address_space *mapping,
 						pgoff_t pgofs, int whence)
 {
-	struct pagevec pvec;
+	struct page *page;
 	int nr_pages;
 
 	if (whence != SEEK_DATA)
 		return 0;
 
 	/* find first dirty page index */
-	pagevec_init(&pvec, 0);
-	nr_pages = pagevec_lookup_tag(&pvec, mapping, &pgofs,
-					PAGECACHE_TAG_DIRTY, 1);
-	pgofs = nr_pages ? pvec.pages[0]->index : ULONG_MAX;
-	pagevec_release(&pvec);
+	nr_pages = find_get_pages_tag(mapping, &pgofs, PAGECACHE_TAG_DIRTY,
+				      1, &page);
+	if (!nr_pages)
+		return ULONG_MAX;
+	pgofs = page->index;
+	put_page(page);
 	return pgofs;
 }
 
@@ -443,6 +457,9 @@ static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma)
 	struct inode *inode = file_inode(file);
 	int err;
 
+	if (unlikely(f2fs_cp_error(F2FS_I_SB(inode))))
+		return -EIO;
+
 	/* we don't need to use inline_data strictly */
 	err = f2fs_convert_inline_inode(inode);
 	if (err)
@@ -629,6 +646,9 @@ int f2fs_truncate(struct inode *inode)
 {
 	int err;
 
+	if (unlikely(f2fs_cp_error(F2FS_I_SB(inode))))
+		return -EIO;
+
 	if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
 				S_ISLNK(inode->i_mode)))
 		return 0;
@@ -683,6 +703,12 @@ int f2fs_getattr(const struct path *path, struct kstat *stat,
 				  STATX_ATTR_NODUMP);
 
 	generic_fillattr(inode, stat);
+
+	/* we need to show initial sectors used for inline_data/dentries */
+	if ((S_ISREG(inode->i_mode) && f2fs_has_inline_data(inode)) ||
+					f2fs_has_inline_dentry(inode))
+		stat->blocks += (stat->size + 511) >> 9;
+
 	return 0;
 }
 
@@ -722,6 +748,9 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr)
 	int err;
 	bool size_changed = false;
 
+	if (unlikely(f2fs_cp_error(F2FS_I_SB(inode))))
+		return -EIO;
+
 	err = setattr_prepare(dentry, attr);
 	if (err)
 		return err;
@@ -774,6 +803,10 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr)
 			inode->i_mtime = inode->i_ctime = current_time(inode);
 		}
 
+		down_write(&F2FS_I(inode)->i_sem);
+		F2FS_I(inode)->last_disk_size = i_size_read(inode);
+		up_write(&F2FS_I(inode)->i_sem);
+
 		size_changed = true;
 	}
 
@@ -844,7 +877,7 @@ int truncate_hole(struct inode *inode, pgoff_t pg_start, pgoff_t pg_end)
 		err = get_dnode_of_data(&dn, pg_start, LOOKUP_NODE);
 		if (err) {
 			if (err == -ENOENT) {
-				pg_start++;
+				pg_start = get_next_page_offset(&dn, pg_start);
 				continue;
 			}
 			return err;
@@ -1159,11 +1192,14 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len)
 	if (ret)
 		goto out;
 
+	/* avoid gc operation during block exchange */
+	down_write(&F2FS_I(inode)->dio_rwsem[WRITE]);
+
 	truncate_pagecache(inode, offset);
 
 	ret = f2fs_do_collapse(inode, pg_start, pg_end);
 	if (ret)
-		goto out;
+		goto out_unlock;
 
 	/* write out all moved pages, if possible */
 	filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX);
@@ -1175,7 +1211,8 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len)
 	ret = truncate_blocks(inode, new_size, true);
 	if (!ret)
 		f2fs_i_size_write(inode, new_size);
-
+out_unlock:
+	up_write(&F2FS_I(inode)->dio_rwsem[WRITE]);
 out:
 	up_write(&F2FS_I(inode)->i_mmap_sem);
 	return ret;
@@ -1358,6 +1395,9 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len)
 	if (ret)
 		goto out;
 
+	/* avoid gc operation during block exchange */
+	down_write(&F2FS_I(inode)->dio_rwsem[WRITE]);
+
 	truncate_pagecache(inode, offset);
 
 	pg_start = offset >> PAGE_SHIFT;
@@ -1385,6 +1425,8 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len)
 
 	if (!ret)
 		f2fs_i_size_write(inode, new_size);
+
+	up_write(&F2FS_I(inode)->dio_rwsem[WRITE]);
 out:
 	up_write(&F2FS_I(inode)->i_mmap_sem);
 	return ret;
@@ -1434,8 +1476,12 @@ static int expand_inode_data(struct inode *inode, loff_t offset,
 		new_size = ((loff_t)pg_end << PAGE_SHIFT) + off_end;
 	}
 
-	if (!(mode & FALLOC_FL_KEEP_SIZE) && i_size_read(inode) < new_size)
-		f2fs_i_size_write(inode, new_size);
+	if (new_size > i_size_read(inode)) {
+		if (mode & FALLOC_FL_KEEP_SIZE)
+			file_set_keep_isize(inode);
+		else
+			f2fs_i_size_write(inode, new_size);
+	}
 
 	return err;
 }
@@ -1446,6 +1492,9 @@ static long f2fs_fallocate(struct file *file, int mode,
 	struct inode *inode = file_inode(file);
 	long ret = 0;
 
+	if (unlikely(f2fs_cp_error(F2FS_I_SB(inode))))
+		return -EIO;
+
 	/* f2fs only support ->fallocate for regular file */
 	if (!S_ISREG(inode->i_mode))
 		return -EINVAL;
@@ -1479,8 +1528,6 @@ static long f2fs_fallocate(struct file *file, int mode,
 	if (!ret) {
 		inode->i_mtime = inode->i_ctime = current_time(inode);
 		f2fs_mark_inode_dirty_sync(inode, false);
-		if (mode & FALLOC_FL_KEEP_SIZE)
-			file_set_keep_isize(inode);
 		f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
 	}
 
@@ -1882,6 +1929,9 @@ static int f2fs_ioc_set_encryption_policy(struct file *filp, unsigned long arg)
 {
 	struct inode *inode = file_inode(filp);
 
+	if (!f2fs_sb_has_crypto(inode->i_sb))
+		return -EOPNOTSUPP;
+
 	f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
 
 	return fscrypt_ioctl_set_policy(filp, (const void __user *)arg);
@@ -1889,6 +1939,8 @@ static int f2fs_ioc_set_encryption_policy(struct file *filp, unsigned long arg)
 
 static int f2fs_ioc_get_encryption_policy(struct file *filp, unsigned long arg)
 {
+	if (!f2fs_sb_has_crypto(file_inode(filp)->i_sb))
+		return -EOPNOTSUPP;
 	return fscrypt_ioctl_get_policy(filp, (void __user *)arg);
 }
 
@@ -2244,9 +2296,13 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in,
 	}
 
 	inode_lock(src);
+	down_write(&F2FS_I(src)->dio_rwsem[WRITE]);
 	if (src != dst) {
-		if (!inode_trylock(dst)) {
-			ret = -EBUSY;
+		ret = -EBUSY;
+		if (!inode_trylock(dst))
+			goto out;
+		if (!down_write_trylock(&F2FS_I(dst)->dio_rwsem[WRITE])) {
+			inode_unlock(dst);
 			goto out;
 		}
 	}
@@ -2306,9 +2362,12 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in,
 	}
 	f2fs_unlock_op(sbi);
 out_unlock:
-	if (src != dst)
+	if (src != dst) {
+		up_write(&F2FS_I(dst)->dio_rwsem[WRITE]);
 		inode_unlock(dst);
+	}
 out:
+	up_write(&F2FS_I(src)->dio_rwsem[WRITE]);
 	inode_unlock(src);
 	return ret;
 }
@@ -2624,6 +2683,9 @@ static int f2fs_ioc_fssetxattr(struct file *filp, unsigned long arg)
 
 long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 {
+	if (unlikely(f2fs_cp_error(F2FS_I_SB(file_inode(filp)))))
+		return -EIO;
+
 	switch (cmd) {
 	case F2FS_IOC_GETFLAGS:
 		return f2fs_ioc_getflags(filp, arg);
@@ -2681,6 +2743,9 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 	struct blk_plug plug;
 	ssize_t ret;
 
+	if (unlikely(f2fs_cp_error(F2FS_I_SB(inode))))
+		return -EIO;
+
 	inode_lock(inode);
 	ret = generic_write_checks(iocb, from);
 	if (ret > 0) {
@@ -2691,6 +2756,7 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 
 		err = f2fs_preallocate_blocks(iocb, from);
 		if (err) {
+			clear_inode_flag(inode, FI_NO_PREALLOC);
 			inode_unlock(inode);
 			return err;
 		}
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index bfe6a8ccc3a0..5d5bba462f26 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -267,16 +267,6 @@ static unsigned int get_cb_cost(struct f2fs_sb_info *sbi, unsigned int segno)
 	return UINT_MAX - ((100 * (100 - u) * age) / (100 + u));
 }
 
-static unsigned int get_greedy_cost(struct f2fs_sb_info *sbi,
-						unsigned int segno)
-{
-	unsigned int valid_blocks =
-			get_valid_blocks(sbi, segno, true);
-
-	return IS_DATASEG(get_seg_entry(sbi, segno)->type) ?
-				valid_blocks * 2 : valid_blocks;
-}
-
 static inline unsigned int get_gc_cost(struct f2fs_sb_info *sbi,
 			unsigned int segno, struct victim_sel_policy *p)
 {
@@ -285,7 +275,7 @@ static inline unsigned int get_gc_cost(struct f2fs_sb_info *sbi,
 
 	/* alloc_mode == LFS */
 	if (p->gc_mode == GC_GREEDY)
-		return get_greedy_cost(sbi, segno);
+		return get_valid_blocks(sbi, segno, true);
 	else
 		return get_cb_cost(sbi, segno);
 }
@@ -466,10 +456,10 @@ static int check_valid_map(struct f2fs_sb_info *sbi,
 	struct seg_entry *sentry;
 	int ret;
 
-	mutex_lock(&sit_i->sentry_lock);
+	down_read(&sit_i->sentry_lock);
 	sentry = get_seg_entry(sbi, segno);
 	ret = f2fs_test_bit(offset, sentry->cur_valid_map);
-	mutex_unlock(&sit_i->sentry_lock);
+	up_read(&sit_i->sentry_lock);
 	return ret;
 }
 
@@ -608,6 +598,7 @@ static void move_data_block(struct inode *inode, block_t bidx,
 {
 	struct f2fs_io_info fio = {
 		.sbi = F2FS_I_SB(inode),
+		.ino = inode->i_ino,
 		.type = DATA,
 		.temp = COLD,
 		.op = REQ_OP_READ,
@@ -659,8 +650,8 @@ static void move_data_block(struct inode *inode, block_t bidx,
 	allocate_data_block(fio.sbi, NULL, fio.old_blkaddr, &newaddr,
 					&sum, CURSEG_COLD_DATA, NULL, false);
 
-	fio.encrypted_page = pagecache_get_page(META_MAPPING(fio.sbi), newaddr,
-					FGP_LOCK | FGP_CREAT, GFP_NOFS);
+	fio.encrypted_page = f2fs_pagecache_get_page(META_MAPPING(fio.sbi),
+				newaddr, FGP_LOCK | FGP_CREAT, GFP_NOFS);
 	if (!fio.encrypted_page) {
 		err = -ENOMEM;
 		goto recover_block;
@@ -738,6 +729,7 @@ static void move_data_page(struct inode *inode, block_t bidx, int gc_type,
 	} else {
 		struct f2fs_io_info fio = {
 			.sbi = F2FS_I_SB(inode),
+			.ino = inode->i_ino,
 			.type = DATA,
 			.temp = COLD,
 			.op = REQ_OP_WRITE,
@@ -840,10 +832,17 @@ next_step:
 				continue;
 			}
 
+			if (!down_write_trylock(
+				&F2FS_I(inode)->dio_rwsem[WRITE])) {
+				iput(inode);
+				continue;
+			}
+
 			start_bidx = start_bidx_of_node(nofs, inode);
 			data_page = get_read_data_page(inode,
 					start_bidx + ofs_in_node, REQ_RAHEAD,
 					true);
+			up_write(&F2FS_I(inode)->dio_rwsem[WRITE]);
 			if (IS_ERR(data_page)) {
 				iput(inode);
 				continue;
@@ -901,10 +900,10 @@ static int __get_victim(struct f2fs_sb_info *sbi, unsigned int *victim,
 	struct sit_info *sit_i = SIT_I(sbi);
 	int ret;
 
-	mutex_lock(&sit_i->sentry_lock);
+	down_write(&sit_i->sentry_lock);
 	ret = DIRTY_I(sbi)->v_ops->get_victim(sbi, victim, gc_type,
 					      NO_CHECK_TYPE, LFS);
-	mutex_unlock(&sit_i->sentry_lock);
+	up_write(&sit_i->sentry_lock);
 	return ret;
 }
 
@@ -952,8 +951,8 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi,
 		/*
 		 * this is to avoid deadlock:
 		 * - lock_page(sum_page)         - f2fs_replace_block
-		 *  - check_valid_map()            - mutex_lock(sentry_lock)
-		 *   - mutex_lock(sentry_lock)     - change_curseg()
+		 *  - check_valid_map()            - down_write(sentry_lock)
+		 *   - down_read(sentry_lock)     - change_curseg()
 		 *                                  - lock_page(sum_page)
 		 */
 		if (type == SUM_TYPE_NODE)
diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c
index 8322e4e7bb3f..90e38d8ea688 100644
--- a/fs/f2fs/inline.c
+++ b/fs/f2fs/inline.c
@@ -112,6 +112,7 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page)
 {
 	struct f2fs_io_info fio = {
 		.sbi = F2FS_I_SB(dn->inode),
+		.ino = dn->inode->i_ino,
 		.type = DATA,
 		.op = REQ_OP_WRITE,
 		.op_flags = REQ_SYNC | REQ_PRIO,
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index 53fb08810ee9..b4c4f2b25304 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -235,6 +235,23 @@ static int do_read_inode(struct inode *inode)
 	fi->i_extra_isize = f2fs_has_extra_attr(inode) ?
 					le16_to_cpu(ri->i_extra_isize) : 0;
 
+	if (f2fs_sb_has_flexible_inline_xattr(sbi->sb)) {
+		f2fs_bug_on(sbi, !f2fs_has_extra_attr(inode));
+		fi->i_inline_xattr_size = le16_to_cpu(ri->i_inline_xattr_size);
+	} else if (f2fs_has_inline_xattr(inode) ||
+				f2fs_has_inline_dentry(inode)) {
+		fi->i_inline_xattr_size = DEFAULT_INLINE_XATTR_ADDRS;
+	} else {
+
+		/*
+		 * Previous inline data or directory always reserved 200 bytes
+		 * in inode layout, even if inline_xattr is disabled. In order
+		 * to keep inline_dentry's structure for backward compatibility,
+		 * we get the space back only from inline_data.
+		 */
+		fi->i_inline_xattr_size = 0;
+	}
+
 	/* check data exist */
 	if (f2fs_has_inline_data(inode) && !f2fs_exist_data(inode))
 		__recover_inline_status(inode, node_page);
@@ -387,6 +404,10 @@ int update_inode(struct inode *inode, struct page *node_page)
 	if (f2fs_has_extra_attr(inode)) {
 		ri->i_extra_isize = cpu_to_le16(F2FS_I(inode)->i_extra_isize);
 
+		if (f2fs_sb_has_flexible_inline_xattr(F2FS_I_SB(inode)->sb))
+			ri->i_inline_xattr_size =
+				cpu_to_le16(F2FS_I(inode)->i_inline_xattr_size);
+
 		if (f2fs_sb_has_project_quota(F2FS_I_SB(inode)->sb) &&
 			F2FS_FITS_IN_INODE(ri, F2FS_I(inode)->i_extra_isize,
 								i_projid)) {
@@ -483,6 +504,7 @@ void f2fs_evict_inode(struct inode *inode)
 
 	remove_ino_entry(sbi, inode->i_ino, APPEND_INO);
 	remove_ino_entry(sbi, inode->i_ino, UPDATE_INO);
+	remove_ino_entry(sbi, inode->i_ino, FLUSH_INO);
 
 	sb_start_intwrite(inode->i_sb);
 	set_inode_flag(inode, FI_NO_ALLOC);
@@ -522,8 +544,10 @@ no_delete:
 	stat_dec_inline_dir(inode);
 	stat_dec_inline_inode(inode);
 
-	if (!is_set_ckpt_flags(sbi, CP_ERROR_FLAG))
+	if (likely(!is_set_ckpt_flags(sbi, CP_ERROR_FLAG)))
 		f2fs_bug_on(sbi, is_inode_flag_set(inode, FI_DIRTY_INODE));
+	else
+		f2fs_inode_synced(inode);
 
 	/* ino == 0, if f2fs_new_inode() was failed t*/
 	if (inode->i_ino)
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index a4dab98c4b7b..28bdf8828e73 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -29,6 +29,7 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode)
 	nid_t ino;
 	struct inode *inode;
 	bool nid_free = false;
+	int xattr_size = 0;
 	int err;
 
 	inode = new_inode(dir->i_sb);
@@ -86,11 +87,23 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode)
 
 	if (test_opt(sbi, INLINE_XATTR))
 		set_inode_flag(inode, FI_INLINE_XATTR);
+
 	if (test_opt(sbi, INLINE_DATA) && f2fs_may_inline_data(inode))
 		set_inode_flag(inode, FI_INLINE_DATA);
 	if (f2fs_may_inline_dentry(inode))
 		set_inode_flag(inode, FI_INLINE_DENTRY);
 
+	if (f2fs_sb_has_flexible_inline_xattr(sbi->sb)) {
+		f2fs_bug_on(sbi, !f2fs_has_extra_attr(inode));
+		if (f2fs_has_inline_xattr(inode))
+			xattr_size = sbi->inline_xattr_size;
+		/* Otherwise, will be 0 */
+	} else if (f2fs_has_inline_xattr(inode) ||
+				f2fs_has_inline_dentry(inode)) {
+		xattr_size = DEFAULT_INLINE_XATTR_ADDRS;
+	}
+	F2FS_I(inode)->i_inline_xattr_size = xattr_size;
+
 	f2fs_init_extent_tree(inode, NULL);
 
 	stat_inc_inline_xattr(inode);
@@ -177,6 +190,9 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
 	nid_t ino = 0;
 	int err;
 
+	if (unlikely(f2fs_cp_error(sbi)))
+		return -EIO;
+
 	err = dquot_initialize(dir);
 	if (err)
 		return err;
@@ -221,6 +237,9 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir,
 	struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
 	int err;
 
+	if (unlikely(f2fs_cp_error(sbi)))
+		return -EIO;
+
 	if (f2fs_encrypted_inode(dir) &&
 			!fscrypt_has_permitted_context(dir, inode))
 		return -EPERM;
@@ -331,12 +350,15 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry,
 	struct inode *inode = NULL;
 	struct f2fs_dir_entry *de;
 	struct page *page;
-	nid_t ino;
+	struct dentry *new;
+	nid_t ino = -1;
 	int err = 0;
 	unsigned int root_ino = F2FS_ROOT_INO(F2FS_I_SB(dir));
 
+	trace_f2fs_lookup_start(dir, dentry, flags);
+
 	if (f2fs_encrypted_inode(dir)) {
-		int res = fscrypt_get_encryption_info(dir);
+		err = fscrypt_get_encryption_info(dir);
 
 		/*
 		 * DCACHE_ENCRYPTED_WITH_KEY is set if the dentry is
@@ -346,18 +368,22 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry,
 		if (fscrypt_has_encryption_key(dir))
 			fscrypt_set_encrypted_dentry(dentry);
 		fscrypt_set_d_op(dentry);
-		if (res && res != -ENOKEY)
-			return ERR_PTR(res);
+		if (err && err != -ENOKEY)
+			goto out;
 	}
 
-	if (dentry->d_name.len > F2FS_NAME_LEN)
-		return ERR_PTR(-ENAMETOOLONG);
+	if (dentry->d_name.len > F2FS_NAME_LEN) {
+		err = -ENAMETOOLONG;
+		goto out;
+	}
 
 	de = f2fs_find_entry(dir, &dentry->d_name, &page);
 	if (!de) {
-		if (IS_ERR(page))
-			return (struct dentry *)page;
-		return d_splice_alias(inode, dentry);
+		if (IS_ERR(page)) {
+			err = PTR_ERR(page);
+			goto out;
+		}
+		goto out_splice;
 	}
 
 	ino = le32_to_cpu(de->ino);
@@ -365,19 +391,21 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry,
 	f2fs_put_page(page, 0);
 
 	inode = f2fs_iget(dir->i_sb, ino);
-	if (IS_ERR(inode))
-		return ERR_CAST(inode);
+	if (IS_ERR(inode)) {
+		err = PTR_ERR(inode);
+		goto out;
+	}
 
 	if ((dir->i_ino == root_ino) && f2fs_has_inline_dots(dir)) {
 		err = __recover_dot_dentries(dir, root_ino);
 		if (err)
-			goto err_out;
+			goto out_iput;
 	}
 
 	if (f2fs_has_inline_dots(inode)) {
 		err = __recover_dot_dentries(inode, dir->i_ino);
 		if (err)
-			goto err_out;
+			goto out_iput;
 	}
 	if (f2fs_encrypted_inode(dir) &&
 	    (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) &&
@@ -386,12 +414,18 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry,
 			 "Inconsistent encryption contexts: %lu/%lu",
 			 dir->i_ino, inode->i_ino);
 		err = -EPERM;
-		goto err_out;
+		goto out_iput;
 	}
-	return d_splice_alias(inode, dentry);
-
-err_out:
+out_splice:
+	new = d_splice_alias(inode, dentry);
+	if (IS_ERR(new))
+		err = PTR_ERR(new);
+	trace_f2fs_lookup_end(dir, dentry, ino, err);
+	return new;
+out_iput:
 	iput(inode);
+out:
+	trace_f2fs_lookup_end(dir, dentry, ino, err);
 	return ERR_PTR(err);
 }
 
@@ -405,9 +439,15 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry)
 
 	trace_f2fs_unlink_enter(dir, dentry);
 
+	if (unlikely(f2fs_cp_error(sbi)))
+		return -EIO;
+
 	err = dquot_initialize(dir);
 	if (err)
 		return err;
+	err = dquot_initialize(inode);
+	if (err)
+		return err;
 
 	de = f2fs_find_entry(dir, &dentry->d_name, &page);
 	if (!de) {
@@ -460,6 +500,9 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry,
 	struct fscrypt_symlink_data *sd = NULL;
 	int err;
 
+	if (unlikely(f2fs_cp_error(sbi)))
+		return -EIO;
+
 	if (f2fs_encrypted_inode(dir)) {
 		err = fscrypt_get_encryption_info(dir);
 		if (err)
@@ -566,6 +609,9 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 	struct inode *inode;
 	int err;
 
+	if (unlikely(f2fs_cp_error(sbi)))
+		return -EIO;
+
 	err = dquot_initialize(dir);
 	if (err)
 		return err;
@@ -618,6 +664,9 @@ static int f2fs_mknod(struct inode *dir, struct dentry *dentry,
 	struct inode *inode;
 	int err = 0;
 
+	if (unlikely(f2fs_cp_error(sbi)))
+		return -EIO;
+
 	err = dquot_initialize(dir);
 	if (err)
 		return err;
@@ -712,6 +761,9 @@ out:
 
 static int f2fs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
 {
+	if (unlikely(f2fs_cp_error(F2FS_I_SB(dir))))
+		return -EIO;
+
 	if (f2fs_encrypted_inode(dir)) {
 		int err = fscrypt_get_encryption_info(dir);
 		if (err)
@@ -723,6 +775,9 @@ static int f2fs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
 
 static int f2fs_create_whiteout(struct inode *dir, struct inode **whiteout)
 {
+	if (unlikely(f2fs_cp_error(F2FS_I_SB(dir))))
+		return -EIO;
+
 	return __f2fs_tmpfile(dir, NULL, S_IFCHR | WHITEOUT_MODE, whiteout);
 }
 
@@ -742,6 +797,9 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	bool is_old_inline = f2fs_has_inline_dentry(old_dir);
 	int err = -ENOENT;
 
+	if (unlikely(f2fs_cp_error(sbi)))
+		return -EIO;
+
 	if ((f2fs_encrypted_inode(old_dir) &&
 			!fscrypt_has_encryption_key(old_dir)) ||
 			(f2fs_encrypted_inode(new_dir) &&
@@ -767,6 +825,12 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	if (err)
 		goto out;
 
+	if (new_inode) {
+		err = dquot_initialize(new_inode);
+		if (err)
+			goto out;
+	}
+
 	old_entry = f2fs_find_entry(old_dir, &old_dentry->d_name, &old_page);
 	if (!old_entry) {
 		if (IS_ERR(old_page))
@@ -935,6 +999,9 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
 	int old_nlink = 0, new_nlink = 0;
 	int err = -ENOENT;
 
+	if (unlikely(f2fs_cp_error(sbi)))
+		return -EIO;
+
 	if ((f2fs_encrypted_inode(old_dir) &&
 			!fscrypt_has_encryption_key(old_dir)) ||
 			(f2fs_encrypted_inode(new_dir) &&
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index fca87835a1da..d3322752426f 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -46,7 +46,7 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type)
 	 * give 25%, 25%, 50%, 50%, 50% memory for each components respectively
 	 */
 	if (type == FREE_NIDS) {
-		mem_size = (nm_i->nid_cnt[FREE_NID_LIST] *
+		mem_size = (nm_i->nid_cnt[FREE_NID] *
 				sizeof(struct free_nid)) >> PAGE_SHIFT;
 		res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 2);
 	} else if (type == NAT_ENTRIES) {
@@ -63,7 +63,7 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type)
 	} else if (type == INO_ENTRIES) {
 		int i;
 
-		for (i = 0; i <= UPDATE_INO; i++)
+		for (i = 0; i < MAX_INO_ENTRY; i++)
 			mem_size += sbi->im[i].ino_num *
 						sizeof(struct ino_entry);
 		mem_size >>= PAGE_SHIFT;
@@ -74,6 +74,10 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type)
 				atomic_read(&sbi->total_ext_node) *
 				sizeof(struct extent_node)) >> PAGE_SHIFT;
 		res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1);
+	} else if (type == INMEM_PAGES) {
+		/* it allows 20% / total_ram for inmemory pages */
+		mem_size = get_pages(sbi, F2FS_INMEM_PAGES);
+		res = mem_size < (val.totalram / 5);
 	} else {
 		if (!sbi->sb->s_bdi->wb.dirty_exceeded)
 			return true;
@@ -134,6 +138,44 @@ static struct page *get_next_nat_page(struct f2fs_sb_info *sbi, nid_t nid)
 	return dst_page;
 }
 
+static struct nat_entry *__alloc_nat_entry(nid_t nid, bool no_fail)
+{
+	struct nat_entry *new;
+
+	if (no_fail)
+		new = f2fs_kmem_cache_alloc(nat_entry_slab,
+						GFP_NOFS | __GFP_ZERO);
+	else
+		new = kmem_cache_alloc(nat_entry_slab,
+						GFP_NOFS | __GFP_ZERO);
+	if (new) {
+		nat_set_nid(new, nid);
+		nat_reset_flag(new);
+	}
+	return new;
+}
+
+static void __free_nat_entry(struct nat_entry *e)
+{
+	kmem_cache_free(nat_entry_slab, e);
+}
+
+/* must be locked by nat_tree_lock */
+static struct nat_entry *__init_nat_entry(struct f2fs_nm_info *nm_i,
+	struct nat_entry *ne, struct f2fs_nat_entry *raw_ne, bool no_fail)
+{
+	if (no_fail)
+		f2fs_radix_tree_insert(&nm_i->nat_root, nat_get_nid(ne), ne);
+	else if (radix_tree_insert(&nm_i->nat_root, nat_get_nid(ne), ne))
+		return NULL;
+
+	if (raw_ne)
+		node_info_from_raw_nat(&ne->ni, raw_ne);
+	list_add_tail(&ne->list, &nm_i->nat_entries);
+	nm_i->nat_cnt++;
+	return ne;
+}
+
 static struct nat_entry *__lookup_nat_cache(struct f2fs_nm_info *nm_i, nid_t n)
 {
 	return radix_tree_lookup(&nm_i->nat_root, n);
@@ -150,7 +192,7 @@ static void __del_from_nat_cache(struct f2fs_nm_info *nm_i, struct nat_entry *e)
 	list_del(&e->list);
 	radix_tree_delete(&nm_i->nat_root, nat_get_nid(e));
 	nm_i->nat_cnt--;
-	kmem_cache_free(nat_entry_slab, e);
+	__free_nat_entry(e);
 }
 
 static void __set_nat_cache_dirty(struct f2fs_nm_info *nm_i,
@@ -246,49 +288,29 @@ bool need_inode_block_update(struct f2fs_sb_info *sbi, nid_t ino)
 	return need_update;
 }
 
-static struct nat_entry *grab_nat_entry(struct f2fs_nm_info *nm_i, nid_t nid,
-								bool no_fail)
-{
-	struct nat_entry *new;
-
-	if (no_fail) {
-		new = f2fs_kmem_cache_alloc(nat_entry_slab, GFP_NOFS);
-		f2fs_radix_tree_insert(&nm_i->nat_root, nid, new);
-	} else {
-		new = kmem_cache_alloc(nat_entry_slab, GFP_NOFS);
-		if (!new)
-			return NULL;
-		if (radix_tree_insert(&nm_i->nat_root, nid, new)) {
-			kmem_cache_free(nat_entry_slab, new);
-			return NULL;
-		}
-	}
-
-	memset(new, 0, sizeof(struct nat_entry));
-	nat_set_nid(new, nid);
-	nat_reset_flag(new);
-	list_add_tail(&new->list, &nm_i->nat_entries);
-	nm_i->nat_cnt++;
-	return new;
-}
-
+/* must be locked by nat_tree_lock */
 static void cache_nat_entry(struct f2fs_sb_info *sbi, nid_t nid,
 						struct f2fs_nat_entry *ne)
 {
 	struct f2fs_nm_info *nm_i = NM_I(sbi);
-	struct nat_entry *e;
+	struct nat_entry *new, *e;
 
+	new = __alloc_nat_entry(nid, false);
+	if (!new)
+		return;
+
+	down_write(&nm_i->nat_tree_lock);
 	e = __lookup_nat_cache(nm_i, nid);
-	if (!e) {
-		e = grab_nat_entry(nm_i, nid, false);
-		if (e)
-			node_info_from_raw_nat(&e->ni, ne);
-	} else {
+	if (!e)
+		e = __init_nat_entry(nm_i, new, ne, false);
+	else
 		f2fs_bug_on(sbi, nat_get_ino(e) != le32_to_cpu(ne->ino) ||
 				nat_get_blkaddr(e) !=
 					le32_to_cpu(ne->block_addr) ||
 				nat_get_version(e) != ne->version);
-	}
+	up_write(&nm_i->nat_tree_lock);
+	if (e != new)
+		__free_nat_entry(new);
 }
 
 static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni,
@@ -296,11 +318,12 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni,
 {
 	struct f2fs_nm_info *nm_i = NM_I(sbi);
 	struct nat_entry *e;
+	struct nat_entry *new = __alloc_nat_entry(ni->nid, true);
 
 	down_write(&nm_i->nat_tree_lock);
 	e = __lookup_nat_cache(nm_i, ni->nid);
 	if (!e) {
-		e = grab_nat_entry(nm_i, ni->nid, true);
+		e = __init_nat_entry(nm_i, new, NULL, true);
 		copy_node_info(&e->ni, ni);
 		f2fs_bug_on(sbi, ni->blk_addr == NEW_ADDR);
 	} else if (new_blkaddr == NEW_ADDR) {
@@ -312,6 +335,9 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni,
 		copy_node_info(&e->ni, ni);
 		f2fs_bug_on(sbi, ni->blk_addr != NULL_ADDR);
 	}
+	/* let's free early to reduce memory consumption */
+	if (e != new)
+		__free_nat_entry(new);
 
 	/* sanity check */
 	f2fs_bug_on(sbi, nat_get_blkaddr(e) != ni->blk_addr);
@@ -327,10 +353,6 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni,
 	if (nat_get_blkaddr(e) != NEW_ADDR && new_blkaddr == NULL_ADDR) {
 		unsigned char version = nat_get_version(e);
 		nat_set_version(e, inc_node_version(version));
-
-		/* in order to reuse the nid */
-		if (nm_i->next_scan_nid > ni->nid)
-			nm_i->next_scan_nid = ni->nid;
 	}
 
 	/* change address */
@@ -424,9 +446,7 @@ void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni)
 	f2fs_put_page(page, 1);
 cache:
 	/* cache nat entry */
-	down_write(&nm_i->nat_tree_lock);
 	cache_nat_entry(sbi, nid, &ne);
-	up_write(&nm_i->nat_tree_lock);
 }
 
 /*
@@ -962,7 +982,8 @@ fail:
 	return err > 0 ? 0 : err;
 }
 
-int truncate_xattr_node(struct inode *inode, struct page *page)
+/* caller must lock inode page */
+int truncate_xattr_node(struct inode *inode)
 {
 	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
 	nid_t nid = F2FS_I(inode)->i_xattr_nid;
@@ -978,10 +999,7 @@ int truncate_xattr_node(struct inode *inode, struct page *page)
 
 	f2fs_i_xnid_write(inode, 0);
 
-	set_new_dnode(&dn, inode, page, npage, nid);
-
-	if (page)
-		dn.inode_page_locked = true;
+	set_new_dnode(&dn, inode, NULL, npage, nid);
 	truncate_node(&dn);
 	return 0;
 }
@@ -1000,7 +1018,7 @@ int remove_inode_page(struct inode *inode)
 	if (err)
 		return err;
 
-	err = truncate_xattr_node(inode, dn.inode_page);
+	err = truncate_xattr_node(inode);
 	if (err) {
 		f2fs_put_dnode(&dn);
 		return err;
@@ -1220,7 +1238,8 @@ static void flush_inline_data(struct f2fs_sb_info *sbi, nid_t ino)
 	if (!inode)
 		return;
 
-	page = pagecache_get_page(inode->i_mapping, 0, FGP_LOCK|FGP_NOWAIT, 0);
+	page = f2fs_pagecache_get_page(inode->i_mapping, 0,
+					FGP_LOCK|FGP_NOWAIT, 0);
 	if (!page)
 		goto iput_out;
 
@@ -1244,54 +1263,19 @@ iput_out:
 	iput(inode);
 }
 
-void move_node_page(struct page *node_page, int gc_type)
-{
-	if (gc_type == FG_GC) {
-		struct f2fs_sb_info *sbi = F2FS_P_SB(node_page);
-		struct writeback_control wbc = {
-			.sync_mode = WB_SYNC_ALL,
-			.nr_to_write = 1,
-			.for_reclaim = 0,
-		};
-
-		set_page_dirty(node_page);
-		f2fs_wait_on_page_writeback(node_page, NODE, true);
-
-		f2fs_bug_on(sbi, PageWriteback(node_page));
-		if (!clear_page_dirty_for_io(node_page))
-			goto out_page;
-
-		if (NODE_MAPPING(sbi)->a_ops->writepage(node_page, &wbc))
-			unlock_page(node_page);
-		goto release_page;
-	} else {
-		/* set page dirty and write it */
-		if (!PageWriteback(node_page))
-			set_page_dirty(node_page);
-	}
-out_page:
-	unlock_page(node_page);
-release_page:
-	f2fs_put_page(node_page, 0);
-}
-
 static struct page *last_fsync_dnode(struct f2fs_sb_info *sbi, nid_t ino)
 {
-	pgoff_t index, end;
+	pgoff_t index;
 	struct pagevec pvec;
 	struct page *last_page = NULL;
+	int nr_pages;
 
-	pagevec_init(&pvec, 0);
+	pagevec_init(&pvec);
 	index = 0;
-	end = ULONG_MAX;
-
-	while (index <= end) {
-		int i, nr_pages;
-		nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index,
-				PAGECACHE_TAG_DIRTY,
-				min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
-		if (nr_pages == 0)
-			break;
+
+	while ((nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index,
+				PAGECACHE_TAG_DIRTY))) {
+		int i;
 
 		for (i = 0; i < nr_pages; i++) {
 			struct page *page = pvec.pages[i];
@@ -1344,6 +1328,7 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted,
 	struct node_info ni;
 	struct f2fs_io_info fio = {
 		.sbi = sbi,
+		.ino = ino_of_node(page),
 		.type = NODE,
 		.op = REQ_OP_WRITE,
 		.op_flags = wbc_to_write_flags(wbc),
@@ -1416,6 +1401,37 @@ redirty_out:
 	return AOP_WRITEPAGE_ACTIVATE;
 }
 
+void move_node_page(struct page *node_page, int gc_type)
+{
+	if (gc_type == FG_GC) {
+		struct writeback_control wbc = {
+			.sync_mode = WB_SYNC_ALL,
+			.nr_to_write = 1,
+			.for_reclaim = 0,
+		};
+
+		set_page_dirty(node_page);
+		f2fs_wait_on_page_writeback(node_page, NODE, true);
+
+		f2fs_bug_on(F2FS_P_SB(node_page), PageWriteback(node_page));
+		if (!clear_page_dirty_for_io(node_page))
+			goto out_page;
+
+		if (__write_node_page(node_page, false, NULL,
+					&wbc, false, FS_GC_NODE_IO))
+			unlock_page(node_page);
+		goto release_page;
+	} else {
+		/* set page dirty and write it */
+		if (!PageWriteback(node_page))
+			set_page_dirty(node_page);
+	}
+out_page:
+	unlock_page(node_page);
+release_page:
+	f2fs_put_page(node_page, 0);
+}
+
 static int f2fs_write_node_page(struct page *page,
 				struct writeback_control *wbc)
 {
@@ -1425,13 +1441,14 @@ static int f2fs_write_node_page(struct page *page,
 int fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode,
 			struct writeback_control *wbc, bool atomic)
 {
-	pgoff_t index, end;
+	pgoff_t index;
 	pgoff_t last_idx = ULONG_MAX;
 	struct pagevec pvec;
 	int ret = 0;
 	struct page *last_page = NULL;
 	bool marked = false;
 	nid_t ino = inode->i_ino;
+	int nr_pages;
 
 	if (atomic) {
 		last_page = last_fsync_dnode(sbi, ino);
@@ -1439,17 +1456,12 @@ int fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode,
 			return PTR_ERR_OR_ZERO(last_page);
 	}
 retry:
-	pagevec_init(&pvec, 0);
+	pagevec_init(&pvec);
 	index = 0;
-	end = ULONG_MAX;
-
-	while (index <= end) {
-		int i, nr_pages;
-		nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index,
-				PAGECACHE_TAG_DIRTY,
-				min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
-		if (nr_pages == 0)
-			break;
+
+	while ((nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index,
+				PAGECACHE_TAG_DIRTY))) {
+		int i;
 
 		for (i = 0; i < nr_pages; i++) {
 			struct page *page = pvec.pages[i];
@@ -1548,25 +1560,21 @@ out:
 int sync_node_pages(struct f2fs_sb_info *sbi, struct writeback_control *wbc,
 				bool do_balance, enum iostat_type io_type)
 {
-	pgoff_t index, end;
+	pgoff_t index;
 	struct pagevec pvec;
 	int step = 0;
 	int nwritten = 0;
 	int ret = 0;
+	int nr_pages;
 
-	pagevec_init(&pvec, 0);
+	pagevec_init(&pvec);
 
 next_step:
 	index = 0;
-	end = ULONG_MAX;
-
-	while (index <= end) {
-		int i, nr_pages;
-		nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index,
-				PAGECACHE_TAG_DIRTY,
-				min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
-		if (nr_pages == 0)
-			break;
+
+	while ((nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index,
+				PAGECACHE_TAG_DIRTY))) {
+		int i;
 
 		for (i = 0; i < nr_pages; i++) {
 			struct page *page = pvec.pages[i];
@@ -1655,27 +1663,20 @@ out:
 
 int wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino)
 {
-	pgoff_t index = 0, end = ULONG_MAX;
+	pgoff_t index = 0;
 	struct pagevec pvec;
 	int ret2, ret = 0;
+	int nr_pages;
 
-	pagevec_init(&pvec, 0);
+	pagevec_init(&pvec);
 
-	while (index <= end) {
-		int i, nr_pages;
-		nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index,
-				PAGECACHE_TAG_WRITEBACK,
-				min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
-		if (nr_pages == 0)
-			break;
+	while ((nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index,
+				PAGECACHE_TAG_WRITEBACK))) {
+		int i;
 
 		for (i = 0; i < nr_pages; i++) {
 			struct page *page = pvec.pages[i];
 
-			/* until radix tree lookup accepts end_index */
-			if (unlikely(page->index > end))
-				continue;
-
 			if (ino && ino_of_node(page) == ino) {
 				f2fs_wait_on_page_writeback(page, NODE, true);
 				if (TestClearPageError(page))
@@ -1761,35 +1762,54 @@ static struct free_nid *__lookup_free_nid_list(struct f2fs_nm_info *nm_i,
 	return radix_tree_lookup(&nm_i->free_nid_root, n);
 }
 
-static int __insert_nid_to_list(struct f2fs_sb_info *sbi,
-			struct free_nid *i, enum nid_list list, bool new)
+static int __insert_free_nid(struct f2fs_sb_info *sbi,
+			struct free_nid *i, enum nid_state state)
 {
 	struct f2fs_nm_info *nm_i = NM_I(sbi);
 
-	if (new) {
-		int err = radix_tree_insert(&nm_i->free_nid_root, i->nid, i);
-		if (err)
-			return err;
-	}
+	int err = radix_tree_insert(&nm_i->free_nid_root, i->nid, i);
+	if (err)
+		return err;
 
-	f2fs_bug_on(sbi, list == FREE_NID_LIST ? i->state != NID_NEW :
-						i->state != NID_ALLOC);
-	nm_i->nid_cnt[list]++;
-	list_add_tail(&i->list, &nm_i->nid_list[list]);
+	f2fs_bug_on(sbi, state != i->state);
+	nm_i->nid_cnt[state]++;
+	if (state == FREE_NID)
+		list_add_tail(&i->list, &nm_i->free_nid_list);
 	return 0;
 }
 
-static void __remove_nid_from_list(struct f2fs_sb_info *sbi,
-			struct free_nid *i, enum nid_list list, bool reuse)
+static void __remove_free_nid(struct f2fs_sb_info *sbi,
+			struct free_nid *i, enum nid_state state)
 {
 	struct f2fs_nm_info *nm_i = NM_I(sbi);
 
-	f2fs_bug_on(sbi, list == FREE_NID_LIST ? i->state != NID_NEW :
-						i->state != NID_ALLOC);
-	nm_i->nid_cnt[list]--;
-	list_del(&i->list);
-	if (!reuse)
-		radix_tree_delete(&nm_i->free_nid_root, i->nid);
+	f2fs_bug_on(sbi, state != i->state);
+	nm_i->nid_cnt[state]--;
+	if (state == FREE_NID)
+		list_del(&i->list);
+	radix_tree_delete(&nm_i->free_nid_root, i->nid);
+}
+
+static void __move_free_nid(struct f2fs_sb_info *sbi, struct free_nid *i,
+			enum nid_state org_state, enum nid_state dst_state)
+{
+	struct f2fs_nm_info *nm_i = NM_I(sbi);
+
+	f2fs_bug_on(sbi, org_state != i->state);
+	i->state = dst_state;
+	nm_i->nid_cnt[org_state]--;
+	nm_i->nid_cnt[dst_state]++;
+
+	switch (dst_state) {
+	case PREALLOC_NID:
+		list_del(&i->list);
+		break;
+	case FREE_NID:
+		list_add_tail(&i->list, &nm_i->free_nid_list);
+		break;
+	default:
+		BUG_ON(1);
+	}
 }
 
 /* return if the nid is recognized as free */
@@ -1807,7 +1827,7 @@ static bool add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build)
 
 	i = f2fs_kmem_cache_alloc(free_nid_slab, GFP_NOFS);
 	i->nid = nid;
-	i->state = NID_NEW;
+	i->state = FREE_NID;
 
 	if (radix_tree_preload(GFP_NOFS))
 		goto err;
@@ -1820,7 +1840,7 @@ static bool add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build)
 		 *  - f2fs_create
 		 *   - f2fs_new_inode
 		 *    - alloc_nid
-		 *     - __insert_nid_to_list(ALLOC_NID_LIST)
+		 *     - __insert_nid_to_list(PREALLOC_NID)
 		 *                     - f2fs_balance_fs_bg
 		 *                      - build_free_nids
 		 *                       - __build_free_nids
@@ -1833,8 +1853,8 @@ static bool add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build)
 		 *     - new_node_page
 		 *      - set_node_addr
 		 *  - alloc_nid_done
-		 *   - __remove_nid_from_list(ALLOC_NID_LIST)
-		 *                         - __insert_nid_to_list(FREE_NID_LIST)
+		 *   - __remove_nid_from_list(PREALLOC_NID)
+		 *                         - __insert_nid_to_list(FREE_NID)
 		 */
 		ne = __lookup_nat_cache(nm_i, nid);
 		if (ne && (!get_nat_flag(ne, IS_CHECKPOINTED) ||
@@ -1843,13 +1863,13 @@ static bool add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build)
 
 		e = __lookup_free_nid_list(nm_i, nid);
 		if (e) {
-			if (e->state == NID_NEW)
+			if (e->state == FREE_NID)
 				ret = true;
 			goto err_out;
 		}
 	}
 	ret = true;
-	err = __insert_nid_to_list(sbi, i, FREE_NID_LIST, true);
+	err = __insert_free_nid(sbi, i, FREE_NID);
 err_out:
 	spin_unlock(&nm_i->nid_list_lock);
 	radix_tree_preload_end();
@@ -1867,8 +1887,8 @@ static void remove_free_nid(struct f2fs_sb_info *sbi, nid_t nid)
 
 	spin_lock(&nm_i->nid_list_lock);
 	i = __lookup_free_nid_list(nm_i, nid);
-	if (i && i->state == NID_NEW) {
-		__remove_nid_from_list(sbi, i, FREE_NID_LIST, false);
+	if (i && i->state == FREE_NID) {
+		__remove_free_nid(sbi, i, FREE_NID);
 		need_free = true;
 	}
 	spin_unlock(&nm_i->nid_list_lock);
@@ -1887,15 +1907,18 @@ static void update_free_nid_bitmap(struct f2fs_sb_info *sbi, nid_t nid,
 	if (!test_bit_le(nat_ofs, nm_i->nat_block_bitmap))
 		return;
 
-	if (set)
+	if (set) {
+		if (test_bit_le(nid_ofs, nm_i->free_nid_bitmap[nat_ofs]))
+			return;
 		__set_bit_le(nid_ofs, nm_i->free_nid_bitmap[nat_ofs]);
-	else
-		__clear_bit_le(nid_ofs, nm_i->free_nid_bitmap[nat_ofs]);
-
-	if (set)
 		nm_i->free_nid_count[nat_ofs]++;
-	else if (!build)
-		nm_i->free_nid_count[nat_ofs]--;
+	} else {
+		if (!test_bit_le(nid_ofs, nm_i->free_nid_bitmap[nat_ofs]))
+			return;
+		__clear_bit_le(nid_ofs, nm_i->free_nid_bitmap[nat_ofs]);
+		if (!build)
+			nm_i->free_nid_count[nat_ofs]--;
+	}
 }
 
 static void scan_nat_page(struct f2fs_sb_info *sbi,
@@ -1930,12 +1953,32 @@ static void scan_nat_page(struct f2fs_sb_info *sbi,
 	}
 }
 
-static void scan_free_nid_bits(struct f2fs_sb_info *sbi)
+static void scan_curseg_cache(struct f2fs_sb_info *sbi)
 {
-	struct f2fs_nm_info *nm_i = NM_I(sbi);
 	struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
 	struct f2fs_journal *journal = curseg->journal;
+	int i;
+
+	down_read(&curseg->journal_rwsem);
+	for (i = 0; i < nats_in_cursum(journal); i++) {
+		block_t addr;
+		nid_t nid;
+
+		addr = le32_to_cpu(nat_in_journal(journal, i).block_addr);
+		nid = le32_to_cpu(nid_in_journal(journal, i));
+		if (addr == NULL_ADDR)
+			add_free_nid(sbi, nid, true);
+		else
+			remove_free_nid(sbi, nid);
+	}
+	up_read(&curseg->journal_rwsem);
+}
+
+static void scan_free_nid_bits(struct f2fs_sb_info *sbi)
+{
+	struct f2fs_nm_info *nm_i = NM_I(sbi);
 	unsigned int i, idx;
+	nid_t nid;
 
 	down_read(&nm_i->nat_tree_lock);
 
@@ -1945,40 +1988,27 @@ static void scan_free_nid_bits(struct f2fs_sb_info *sbi)
 		if (!nm_i->free_nid_count[i])
 			continue;
 		for (idx = 0; idx < NAT_ENTRY_PER_BLOCK; idx++) {
-			nid_t nid;
-
-			if (!test_bit_le(idx, nm_i->free_nid_bitmap[i]))
-				continue;
+			idx = find_next_bit_le(nm_i->free_nid_bitmap[i],
+						NAT_ENTRY_PER_BLOCK, idx);
+			if (idx >= NAT_ENTRY_PER_BLOCK)
+				break;
 
 			nid = i * NAT_ENTRY_PER_BLOCK + idx;
 			add_free_nid(sbi, nid, true);
 
-			if (nm_i->nid_cnt[FREE_NID_LIST] >= MAX_FREE_NIDS)
+			if (nm_i->nid_cnt[FREE_NID] >= MAX_FREE_NIDS)
 				goto out;
 		}
 	}
 out:
-	down_read(&curseg->journal_rwsem);
-	for (i = 0; i < nats_in_cursum(journal); i++) {
-		block_t addr;
-		nid_t nid;
+	scan_curseg_cache(sbi);
 
-		addr = le32_to_cpu(nat_in_journal(journal, i).block_addr);
-		nid = le32_to_cpu(nid_in_journal(journal, i));
-		if (addr == NULL_ADDR)
-			add_free_nid(sbi, nid, true);
-		else
-			remove_free_nid(sbi, nid);
-	}
-	up_read(&curseg->journal_rwsem);
 	up_read(&nm_i->nat_tree_lock);
 }
 
 static void __build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount)
 {
 	struct f2fs_nm_info *nm_i = NM_I(sbi);
-	struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
-	struct f2fs_journal *journal = curseg->journal;
 	int i = 0;
 	nid_t nid = nm_i->next_scan_nid;
 
@@ -1986,7 +2016,7 @@ static void __build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount)
 		nid = 0;
 
 	/* Enough entries */
-	if (nm_i->nid_cnt[FREE_NID_LIST] >= NAT_ENTRY_PER_BLOCK)
+	if (nm_i->nid_cnt[FREE_NID] >= NAT_ENTRY_PER_BLOCK)
 		return;
 
 	if (!sync && !available_free_memory(sbi, FREE_NIDS))
@@ -1996,7 +2026,7 @@ static void __build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount)
 		/* try to find free nids in free_nid_bitmap */
 		scan_free_nid_bits(sbi);
 
-		if (nm_i->nid_cnt[FREE_NID_LIST])
+		if (nm_i->nid_cnt[FREE_NID] >= NAT_ENTRY_PER_BLOCK)
 			return;
 	}
 
@@ -2024,18 +2054,8 @@ static void __build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount)
 	nm_i->next_scan_nid = nid;
 
 	/* find free nids from current sum_pages */
-	down_read(&curseg->journal_rwsem);
-	for (i = 0; i < nats_in_cursum(journal); i++) {
-		block_t addr;
+	scan_curseg_cache(sbi);
 
-		addr = le32_to_cpu(nat_in_journal(journal, i).block_addr);
-		nid = le32_to_cpu(nid_in_journal(journal, i));
-		if (addr == NULL_ADDR)
-			add_free_nid(sbi, nid, true);
-		else
-			remove_free_nid(sbi, nid);
-	}
-	up_read(&curseg->journal_rwsem);
 	up_read(&nm_i->nat_tree_lock);
 
 	ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nm_i->next_scan_nid),
@@ -2073,15 +2093,13 @@ retry:
 	}
 
 	/* We should not use stale free nids created by build_free_nids */
-	if (nm_i->nid_cnt[FREE_NID_LIST] && !on_build_free_nids(nm_i)) {
-		f2fs_bug_on(sbi, list_empty(&nm_i->nid_list[FREE_NID_LIST]));
-		i = list_first_entry(&nm_i->nid_list[FREE_NID_LIST],
+	if (nm_i->nid_cnt[FREE_NID] && !on_build_free_nids(nm_i)) {
+		f2fs_bug_on(sbi, list_empty(&nm_i->free_nid_list));
+		i = list_first_entry(&nm_i->free_nid_list,
 					struct free_nid, list);
 		*nid = i->nid;
 
-		__remove_nid_from_list(sbi, i, FREE_NID_LIST, true);
-		i->state = NID_ALLOC;
-		__insert_nid_to_list(sbi, i, ALLOC_NID_LIST, false);
+		__move_free_nid(sbi, i, FREE_NID, PREALLOC_NID);
 		nm_i->available_nids--;
 
 		update_free_nid_bitmap(sbi, *nid, false, false);
@@ -2107,7 +2125,7 @@ void alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid)
 	spin_lock(&nm_i->nid_list_lock);
 	i = __lookup_free_nid_list(nm_i, nid);
 	f2fs_bug_on(sbi, !i);
-	__remove_nid_from_list(sbi, i, ALLOC_NID_LIST, false);
+	__remove_free_nid(sbi, i, PREALLOC_NID);
 	spin_unlock(&nm_i->nid_list_lock);
 
 	kmem_cache_free(free_nid_slab, i);
@@ -2130,12 +2148,10 @@ void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid)
 	f2fs_bug_on(sbi, !i);
 
 	if (!available_free_memory(sbi, FREE_NIDS)) {
-		__remove_nid_from_list(sbi, i, ALLOC_NID_LIST, false);
+		__remove_free_nid(sbi, i, PREALLOC_NID);
 		need_free = true;
 	} else {
-		__remove_nid_from_list(sbi, i, ALLOC_NID_LIST, true);
-		i->state = NID_NEW;
-		__insert_nid_to_list(sbi, i, FREE_NID_LIST, false);
+		__move_free_nid(sbi, i, PREALLOC_NID, FREE_NID);
 	}
 
 	nm_i->available_nids++;
@@ -2154,20 +2170,19 @@ int try_to_free_nids(struct f2fs_sb_info *sbi, int nr_shrink)
 	struct free_nid *i, *next;
 	int nr = nr_shrink;
 
-	if (nm_i->nid_cnt[FREE_NID_LIST] <= MAX_FREE_NIDS)
+	if (nm_i->nid_cnt[FREE_NID] <= MAX_FREE_NIDS)
 		return 0;
 
 	if (!mutex_trylock(&nm_i->build_lock))
 		return 0;
 
 	spin_lock(&nm_i->nid_list_lock);
-	list_for_each_entry_safe(i, next, &nm_i->nid_list[FREE_NID_LIST],
-									list) {
+	list_for_each_entry_safe(i, next, &nm_i->free_nid_list, list) {
 		if (nr_shrink <= 0 ||
-				nm_i->nid_cnt[FREE_NID_LIST] <= MAX_FREE_NIDS)
+				nm_i->nid_cnt[FREE_NID] <= MAX_FREE_NIDS)
 			break;
 
-		__remove_nid_from_list(sbi, i, FREE_NID_LIST, false);
+		__remove_free_nid(sbi, i, FREE_NID);
 		kmem_cache_free(free_nid_slab, i);
 		nr_shrink--;
 	}
@@ -2193,8 +2208,8 @@ void recover_inline_xattr(struct inode *inode, struct page *page)
 		goto update_inode;
 	}
 
-	dst_addr = inline_xattr_addr(ipage);
-	src_addr = inline_xattr_addr(page);
+	dst_addr = inline_xattr_addr(inode, ipage);
+	src_addr = inline_xattr_addr(inode, page);
 	inline_size = inline_xattr_size(inode);
 
 	f2fs_wait_on_page_writeback(ipage, NODE, true);
@@ -2283,6 +2298,12 @@ retry:
 	dst->i_inline = src->i_inline & (F2FS_INLINE_XATTR | F2FS_EXTRA_ATTR);
 	if (dst->i_inline & F2FS_EXTRA_ATTR) {
 		dst->i_extra_isize = src->i_extra_isize;
+
+		if (f2fs_sb_has_flexible_inline_xattr(sbi->sb) &&
+			F2FS_FITS_IN_INODE(src, le16_to_cpu(src->i_extra_isize),
+							i_inline_xattr_size))
+			dst->i_inline_xattr_size = src->i_inline_xattr_size;
+
 		if (f2fs_sb_has_project_quota(sbi->sb) &&
 			F2FS_FITS_IN_INODE(src, le16_to_cpu(src->i_extra_isize),
 								i_projid))
@@ -2354,8 +2375,8 @@ static void remove_nats_in_journal(struct f2fs_sb_info *sbi)
 
 		ne = __lookup_nat_cache(nm_i, nid);
 		if (!ne) {
-			ne = grab_nat_entry(nm_i, nid, true);
-			node_info_from_raw_nat(&ne->ni, &raw_ne);
+			ne = __alloc_nat_entry(nid, true);
+			__init_nat_entry(nm_i, ne, &raw_ne, true);
 		}
 
 		/*
@@ -2401,15 +2422,17 @@ static void __update_nat_bits(struct f2fs_sb_info *sbi, nid_t start_nid,
 	unsigned int nat_index = start_nid / NAT_ENTRY_PER_BLOCK;
 	struct f2fs_nat_block *nat_blk = page_address(page);
 	int valid = 0;
-	int i;
+	int i = 0;
 
 	if (!enabled_nat_bits(sbi, NULL))
 		return;
 
-	for (i = 0; i < NAT_ENTRY_PER_BLOCK; i++) {
-		if (start_nid == 0 && i == 0)
-			valid++;
-		if (nat_blk->entries[i].block_addr)
+	if (nat_index == 0) {
+		valid = 1;
+		i = 1;
+	}
+	for (; i < NAT_ENTRY_PER_BLOCK; i++) {
+		if (nat_blk->entries[i].block_addr != NULL_ADDR)
 			valid++;
 	}
 	if (valid == 0) {
@@ -2604,7 +2627,7 @@ static inline void load_free_nid_bitmap(struct f2fs_sb_info *sbi)
 		__set_bit_le(i, nm_i->nat_block_bitmap);
 
 		nid = i * NAT_ENTRY_PER_BLOCK;
-		last_nid = (i + 1) * NAT_ENTRY_PER_BLOCK;
+		last_nid = nid + NAT_ENTRY_PER_BLOCK;
 
 		spin_lock(&NM_I(sbi)->nid_list_lock);
 		for (; nid < last_nid; nid++)
@@ -2639,16 +2662,15 @@ static int init_node_manager(struct f2fs_sb_info *sbi)
 	/* not used nids: 0, node, meta, (and root counted as valid node) */
 	nm_i->available_nids = nm_i->max_nid - sbi->total_valid_node_count -
 							F2FS_RESERVED_NODE_NUM;
-	nm_i->nid_cnt[FREE_NID_LIST] = 0;
-	nm_i->nid_cnt[ALLOC_NID_LIST] = 0;
+	nm_i->nid_cnt[FREE_NID] = 0;
+	nm_i->nid_cnt[PREALLOC_NID] = 0;
 	nm_i->nat_cnt = 0;
 	nm_i->ram_thresh = DEF_RAM_THRESHOLD;
 	nm_i->ra_nid_pages = DEF_RA_NID_PAGES;
 	nm_i->dirty_nats_ratio = DEF_DIRTY_NAT_RATIO_THRESHOLD;
 
 	INIT_RADIX_TREE(&nm_i->free_nid_root, GFP_ATOMIC);
-	INIT_LIST_HEAD(&nm_i->nid_list[FREE_NID_LIST]);
-	INIT_LIST_HEAD(&nm_i->nid_list[ALLOC_NID_LIST]);
+	INIT_LIST_HEAD(&nm_i->free_nid_list);
 	INIT_RADIX_TREE(&nm_i->nat_root, GFP_NOIO);
 	INIT_RADIX_TREE(&nm_i->nat_set_root, GFP_NOIO);
 	INIT_LIST_HEAD(&nm_i->nat_entries);
@@ -2740,16 +2762,15 @@ void destroy_node_manager(struct f2fs_sb_info *sbi)
 
 	/* destroy free nid list */
 	spin_lock(&nm_i->nid_list_lock);
-	list_for_each_entry_safe(i, next_i, &nm_i->nid_list[FREE_NID_LIST],
-									list) {
-		__remove_nid_from_list(sbi, i, FREE_NID_LIST, false);
+	list_for_each_entry_safe(i, next_i, &nm_i->free_nid_list, list) {
+		__remove_free_nid(sbi, i, FREE_NID);
 		spin_unlock(&nm_i->nid_list_lock);
 		kmem_cache_free(free_nid_slab, i);
 		spin_lock(&nm_i->nid_list_lock);
 	}
-	f2fs_bug_on(sbi, nm_i->nid_cnt[FREE_NID_LIST]);
-	f2fs_bug_on(sbi, nm_i->nid_cnt[ALLOC_NID_LIST]);
-	f2fs_bug_on(sbi, !list_empty(&nm_i->nid_list[ALLOC_NID_LIST]));
+	f2fs_bug_on(sbi, nm_i->nid_cnt[FREE_NID]);
+	f2fs_bug_on(sbi, nm_i->nid_cnt[PREALLOC_NID]);
+	f2fs_bug_on(sbi, !list_empty(&nm_i->free_nid_list));
 	spin_unlock(&nm_i->nid_list_lock);
 
 	/* destroy nat cache */
diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h
index bb53e9955ff2..0ee3e5ff49a3 100644
--- a/fs/f2fs/node.h
+++ b/fs/f2fs/node.h
@@ -140,6 +140,7 @@ enum mem_type {
 	DIRTY_DENTS,	/* indicates dirty dentry pages */
 	INO_ENTRIES,	/* indicates inode entries */
 	EXTENT_CACHE,	/* indicates extent cache */
+	INMEM_PAGES,	/* indicates inmemory pages */
 	BASE_CHECK,	/* check kernel status */
 };
 
@@ -150,18 +151,10 @@ struct nat_entry_set {
 	unsigned int entry_cnt;		/* the # of nat entries in set */
 };
 
-/*
- * For free nid mangement
- */
-enum nid_state {
-	NID_NEW,	/* newly added to free nid list */
-	NID_ALLOC	/* it is allocated */
-};
-
 struct free_nid {
 	struct list_head list;	/* for free node id list */
 	nid_t nid;		/* node id */
-	int state;		/* in use or not: NID_NEW or NID_ALLOC */
+	int state;		/* in use or not: FREE_NID or PREALLOC_NID */
 };
 
 static inline void next_free_nid(struct f2fs_sb_info *sbi, nid_t *nid)
@@ -170,12 +163,11 @@ static inline void next_free_nid(struct f2fs_sb_info *sbi, nid_t *nid)
 	struct free_nid *fnid;
 
 	spin_lock(&nm_i->nid_list_lock);
-	if (nm_i->nid_cnt[FREE_NID_LIST] <= 0) {
+	if (nm_i->nid_cnt[FREE_NID] <= 0) {
 		spin_unlock(&nm_i->nid_list_lock);
 		return;
 	}
-	fnid = list_first_entry(&nm_i->nid_list[FREE_NID_LIST],
-						struct free_nid, list);
+	fnid = list_first_entry(&nm_i->free_nid_list, struct free_nid, list);
 	*nid = fnid->nid;
 	spin_unlock(&nm_i->nid_list_lock);
 }
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index 9626758bc762..92c57ace1939 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -594,6 +594,9 @@ int recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only)
 	int ret = 0;
 	unsigned long s_flags = sbi->sb->s_flags;
 	bool need_writecp = false;
+#ifdef CONFIG_QUOTA
+	int quota_enabled;
+#endif
 
 	if (s_flags & MS_RDONLY) {
 		f2fs_msg(sbi->sb, KERN_INFO, "orphan cleanup on readonly fs");
@@ -604,7 +607,7 @@ int recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only)
 	/* Needed for iput() to work correctly and not trash data */
 	sbi->sb->s_flags |= MS_ACTIVE;
 	/* Turn on quotas so that they are updated correctly */
-	f2fs_enable_quota_files(sbi);
+	quota_enabled = f2fs_enable_quota_files(sbi, s_flags & MS_RDONLY);
 #endif
 
 	fsync_entry_slab = f2fs_kmem_cache_create("f2fs_fsync_inode_entry",
@@ -665,7 +668,8 @@ skip:
 out:
 #ifdef CONFIG_QUOTA
 	/* Turn quotas off */
-	f2fs_quota_off_umount(sbi->sb);
+	if (quota_enabled)
+		f2fs_quota_off_umount(sbi->sb);
 #endif
 	sbi->sb->s_flags = s_flags; /* Restore MS_RDONLY status */
 
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index c695ff462ee6..c117e0913f2a 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -181,11 +181,12 @@ bool need_SSR(struct f2fs_sb_info *sbi)
 		return true;
 
 	return free_sections(sbi) <= (node_secs + 2 * dent_secs + imeta_secs +
-						2 * reserved_sections(sbi));
+			SM_I(sbi)->min_ssr_sections + reserved_sections(sbi));
 }
 
 void register_inmem_page(struct inode *inode, struct page *page)
 {
+	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
 	struct f2fs_inode_info *fi = F2FS_I(inode);
 	struct inmem_pages *new;
 
@@ -204,6 +205,10 @@ void register_inmem_page(struct inode *inode, struct page *page)
 	mutex_lock(&fi->inmem_lock);
 	get_page(page);
 	list_add_tail(&new->list, &fi->inmem_pages);
+	spin_lock(&sbi->inode_lock[ATOMIC_FILE]);
+	if (list_empty(&fi->inmem_ilist))
+		list_add_tail(&fi->inmem_ilist, &sbi->inode_list[ATOMIC_FILE]);
+	spin_unlock(&sbi->inode_lock[ATOMIC_FILE]);
 	inc_page_count(F2FS_I_SB(inode), F2FS_INMEM_PAGES);
 	mutex_unlock(&fi->inmem_lock);
 
@@ -262,12 +267,41 @@ next:
 	return err;
 }
 
+void drop_inmem_pages_all(struct f2fs_sb_info *sbi)
+{
+	struct list_head *head = &sbi->inode_list[ATOMIC_FILE];
+	struct inode *inode;
+	struct f2fs_inode_info *fi;
+next:
+	spin_lock(&sbi->inode_lock[ATOMIC_FILE]);
+	if (list_empty(head)) {
+		spin_unlock(&sbi->inode_lock[ATOMIC_FILE]);
+		return;
+	}
+	fi = list_first_entry(head, struct f2fs_inode_info, inmem_ilist);
+	inode = igrab(&fi->vfs_inode);
+	spin_unlock(&sbi->inode_lock[ATOMIC_FILE]);
+
+	if (inode) {
+		drop_inmem_pages(inode);
+		iput(inode);
+	}
+	congestion_wait(BLK_RW_ASYNC, HZ/50);
+	cond_resched();
+	goto next;
+}
+
 void drop_inmem_pages(struct inode *inode)
 {
+	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
 	struct f2fs_inode_info *fi = F2FS_I(inode);
 
 	mutex_lock(&fi->inmem_lock);
 	__revoke_inmem_pages(inode, &fi->inmem_pages, true, false);
+	spin_lock(&sbi->inode_lock[ATOMIC_FILE]);
+	if (!list_empty(&fi->inmem_ilist))
+		list_del_init(&fi->inmem_ilist);
+	spin_unlock(&sbi->inode_lock[ATOMIC_FILE]);
 	mutex_unlock(&fi->inmem_lock);
 
 	clear_inode_flag(inode, FI_ATOMIC_FILE);
@@ -313,6 +347,7 @@ static int __commit_inmem_pages(struct inode *inode,
 	struct inmem_pages *cur, *tmp;
 	struct f2fs_io_info fio = {
 		.sbi = sbi,
+		.ino = inode->i_ino,
 		.type = DATA,
 		.op = REQ_OP_WRITE,
 		.op_flags = REQ_SYNC | REQ_PRIO,
@@ -398,6 +433,10 @@ int commit_inmem_pages(struct inode *inode)
 		/* drop all uncommitted pages */
 		__revoke_inmem_pages(inode, &fi->inmem_pages, true, false);
 	}
+	spin_lock(&sbi->inode_lock[ATOMIC_FILE]);
+	if (!list_empty(&fi->inmem_ilist))
+		list_del_init(&fi->inmem_ilist);
+	spin_unlock(&sbi->inode_lock[ATOMIC_FILE]);
 	mutex_unlock(&fi->inmem_lock);
 
 	clear_inode_flag(inode, FI_ATOMIC_COMMIT);
@@ -472,7 +511,7 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi)
 static int __submit_flush_wait(struct f2fs_sb_info *sbi,
 				struct block_device *bdev)
 {
-	struct bio *bio = f2fs_bio_alloc(0);
+	struct bio *bio = f2fs_bio_alloc(sbi, 0, true);
 	int ret;
 
 	bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH;
@@ -485,15 +524,17 @@ static int __submit_flush_wait(struct f2fs_sb_info *sbi,
 	return ret;
 }
 
-static int submit_flush_wait(struct f2fs_sb_info *sbi)
+static int submit_flush_wait(struct f2fs_sb_info *sbi, nid_t ino)
 {
-	int ret = __submit_flush_wait(sbi, sbi->sb->s_bdev);
+	int ret = 0;
 	int i;
 
-	if (!sbi->s_ndevs || ret)
-		return ret;
+	if (!sbi->s_ndevs)
+		return __submit_flush_wait(sbi, sbi->sb->s_bdev);
 
-	for (i = 1; i < sbi->s_ndevs; i++) {
+	for (i = 0; i < sbi->s_ndevs; i++) {
+		if (!is_dirty_device(sbi, ino, i, FLUSH_INO))
+			continue;
 		ret = __submit_flush_wait(sbi, FDEV(i).bdev);
 		if (ret)
 			break;
@@ -519,7 +560,9 @@ repeat:
 		fcc->dispatch_list = llist_del_all(&fcc->issue_list);
 		fcc->dispatch_list = llist_reverse_order(fcc->dispatch_list);
 
-		ret = submit_flush_wait(sbi);
+		cmd = llist_entry(fcc->dispatch_list, struct flush_cmd, llnode);
+
+		ret = submit_flush_wait(sbi, cmd->ino);
 		atomic_inc(&fcc->issued_flush);
 
 		llist_for_each_entry_safe(cmd, next,
@@ -537,7 +580,7 @@ repeat:
 	goto repeat;
 }
 
-int f2fs_issue_flush(struct f2fs_sb_info *sbi)
+int f2fs_issue_flush(struct f2fs_sb_info *sbi, nid_t ino)
 {
 	struct flush_cmd_control *fcc = SM_I(sbi)->fcc_info;
 	struct flush_cmd cmd;
@@ -547,19 +590,20 @@ int f2fs_issue_flush(struct f2fs_sb_info *sbi)
 		return 0;
 
 	if (!test_opt(sbi, FLUSH_MERGE)) {
-		ret = submit_flush_wait(sbi);
+		ret = submit_flush_wait(sbi, ino);
 		atomic_inc(&fcc->issued_flush);
 		return ret;
 	}
 
-	if (atomic_inc_return(&fcc->issing_flush) == 1) {
-		ret = submit_flush_wait(sbi);
+	if (atomic_inc_return(&fcc->issing_flush) == 1 || sbi->s_ndevs > 1) {
+		ret = submit_flush_wait(sbi, ino);
 		atomic_dec(&fcc->issing_flush);
 
 		atomic_inc(&fcc->issued_flush);
 		return ret;
 	}
 
+	cmd.ino = ino;
 	init_completion(&cmd.wait);
 
 	llist_add(&cmd.llnode, &fcc->issue_list);
@@ -583,7 +627,7 @@ int f2fs_issue_flush(struct f2fs_sb_info *sbi)
 		} else {
 			struct flush_cmd *tmp, *next;
 
-			ret = submit_flush_wait(sbi);
+			ret = submit_flush_wait(sbi, ino);
 
 			llist_for_each_entry_safe(tmp, next, list, llnode) {
 				if (tmp == &cmd) {
@@ -653,6 +697,28 @@ void destroy_flush_cmd_control(struct f2fs_sb_info *sbi, bool free)
 	}
 }
 
+int f2fs_flush_device_cache(struct f2fs_sb_info *sbi)
+{
+	int ret = 0, i;
+
+	if (!sbi->s_ndevs)
+		return 0;
+
+	for (i = 1; i < sbi->s_ndevs; i++) {
+		if (!f2fs_test_bit(i, (char *)&sbi->dirty_device))
+			continue;
+		ret = __submit_flush_wait(sbi, FDEV(i).bdev);
+		if (ret)
+			break;
+
+		spin_lock(&sbi->dev_lock);
+		f2fs_clear_bit(i, (char *)&sbi->dirty_device);
+		spin_unlock(&sbi->dev_lock);
+	}
+
+	return ret;
+}
+
 static void __locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno,
 		enum dirty_type dirty_type)
 {
@@ -794,6 +860,8 @@ static void __remove_discard_cmd(struct f2fs_sb_info *sbi,
 {
 	struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
 
+	trace_f2fs_remove_discard(dc->bdev, dc->start, dc->len);
+
 	f2fs_bug_on(sbi, dc->ref);
 
 	if (dc->error == -EOPNOTSUPP)
@@ -845,10 +913,14 @@ void __check_sit_bitmap(struct f2fs_sb_info *sbi,
 
 /* this function is copied from blkdev_issue_discard from block/blk-lib.c */
 static void __submit_discard_cmd(struct f2fs_sb_info *sbi,
-				struct discard_cmd *dc)
+						struct discard_policy *dpolicy,
+						struct discard_cmd *dc)
 {
 	struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
+	struct list_head *wait_list = (dpolicy->type == DPOLICY_FSTRIM) ?
+					&(dcc->fstrim_list) : &(dcc->wait_list);
 	struct bio *bio = NULL;
+	int flag = dpolicy->sync ? REQ_SYNC : 0;
 
 	if (dc->state != D_PREP)
 		return;
@@ -867,9 +939,9 @@ static void __submit_discard_cmd(struct f2fs_sb_info *sbi,
 		if (bio) {
 			bio->bi_private = dc;
 			bio->bi_end_io = f2fs_submit_discard_endio;
-			bio->bi_opf |= REQ_SYNC;
+			bio->bi_opf |= flag;
 			submit_bio(bio);
-			list_move_tail(&dc->list, &dcc->wait_list);
+			list_move_tail(&dc->list, wait_list);
 			__check_sit_bitmap(sbi, dc->start, dc->start + dc->len);
 
 			f2fs_update_iostat(sbi, FS_DISCARD, 1);
@@ -886,7 +958,7 @@ static struct discard_cmd *__insert_discard_tree(struct f2fs_sb_info *sbi,
 				struct rb_node *insert_parent)
 {
 	struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
-	struct rb_node **p = &dcc->root.rb_node;
+	struct rb_node **p;
 	struct rb_node *parent = NULL;
 	struct discard_cmd *dc = NULL;
 
@@ -1054,58 +1126,107 @@ static int __queue_discard_cmd(struct f2fs_sb_info *sbi,
 	return 0;
 }
 
-static int __issue_discard_cmd(struct f2fs_sb_info *sbi, bool issue_cond)
+static void __issue_discard_cmd_range(struct f2fs_sb_info *sbi,
+					struct discard_policy *dpolicy,
+					unsigned int start, unsigned int end)
+{
+	struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
+	struct discard_cmd *prev_dc = NULL, *next_dc = NULL;
+	struct rb_node **insert_p = NULL, *insert_parent = NULL;
+	struct discard_cmd *dc;
+	struct blk_plug plug;
+	int issued;
+
+next:
+	issued = 0;
+
+	mutex_lock(&dcc->cmd_lock);
+	f2fs_bug_on(sbi, !__check_rb_tree_consistence(sbi, &dcc->root));
+
+	dc = (struct discard_cmd *)__lookup_rb_tree_ret(&dcc->root,
+					NULL, start,
+					(struct rb_entry **)&prev_dc,
+					(struct rb_entry **)&next_dc,
+					&insert_p, &insert_parent, true);
+	if (!dc)
+		dc = next_dc;
+
+	blk_start_plug(&plug);
+
+	while (dc && dc->lstart <= end) {
+		struct rb_node *node;
+
+		if (dc->len < dpolicy->granularity)
+			goto skip;
+
+		if (dc->state != D_PREP) {
+			list_move_tail(&dc->list, &dcc->fstrim_list);
+			goto skip;
+		}
+
+		__submit_discard_cmd(sbi, dpolicy, dc);
+
+		if (++issued >= dpolicy->max_requests) {
+			start = dc->lstart + dc->len;
+
+			blk_finish_plug(&plug);
+			mutex_unlock(&dcc->cmd_lock);
+
+			schedule();
+
+			goto next;
+		}
+skip:
+		node = rb_next(&dc->rb_node);
+		dc = rb_entry_safe(node, struct discard_cmd, rb_node);
+
+		if (fatal_signal_pending(current))
+			break;
+	}
+
+	blk_finish_plug(&plug);
+	mutex_unlock(&dcc->cmd_lock);
+}
+
+static int __issue_discard_cmd(struct f2fs_sb_info *sbi,
+					struct discard_policy *dpolicy)
 {
 	struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
 	struct list_head *pend_list;
 	struct discard_cmd *dc, *tmp;
 	struct blk_plug plug;
-	int iter = 0, issued = 0;
-	int i;
+	int i, iter = 0, issued = 0;
 	bool io_interrupted = false;
 
-	mutex_lock(&dcc->cmd_lock);
-	f2fs_bug_on(sbi,
-		!__check_rb_tree_consistence(sbi, &dcc->root));
-	blk_start_plug(&plug);
-	for (i = MAX_PLIST_NUM - 1;
-			i >= 0 && plist_issue(dcc->pend_list_tag[i]); i--) {
+	for (i = MAX_PLIST_NUM - 1; i >= 0; i--) {
+		if (i + 1 < dpolicy->granularity)
+			break;
 		pend_list = &dcc->pend_list[i];
+
+		mutex_lock(&dcc->cmd_lock);
+		f2fs_bug_on(sbi, !__check_rb_tree_consistence(sbi, &dcc->root));
+		blk_start_plug(&plug);
 		list_for_each_entry_safe(dc, tmp, pend_list, list) {
 			f2fs_bug_on(sbi, dc->state != D_PREP);
 
-			/* Hurry up to finish fstrim */
-			if (dcc->pend_list_tag[i] & P_TRIM) {
-				__submit_discard_cmd(sbi, dc);
-				issued++;
-
-				if (fatal_signal_pending(current))
-					break;
-				continue;
-			}
-
-			if (!issue_cond) {
-				__submit_discard_cmd(sbi, dc);
-				issued++;
-				continue;
-			}
-
-			if (is_idle(sbi)) {
-				__submit_discard_cmd(sbi, dc);
-				issued++;
-			} else {
+			if (dpolicy->io_aware && i < dpolicy->io_aware_gran &&
+								!is_idle(sbi)) {
 				io_interrupted = true;
+				goto skip;
 			}
 
-			if (++iter >= DISCARD_ISSUE_RATE)
-				goto out;
+			__submit_discard_cmd(sbi, dpolicy, dc);
+			issued++;
+skip:
+			if (++iter >= dpolicy->max_requests)
+				break;
 		}
-		if (list_empty(pend_list) && dcc->pend_list_tag[i] & P_TRIM)
-			dcc->pend_list_tag[i] &= (~P_TRIM);
+		blk_finish_plug(&plug);
+		mutex_unlock(&dcc->cmd_lock);
+
+		if (iter >= dpolicy->max_requests)
+			break;
 	}
-out:
-	blk_finish_plug(&plug);
-	mutex_unlock(&dcc->cmd_lock);
 
 	if (!issued && io_interrupted)
 		issued = -1;
@@ -1113,12 +1234,13 @@ out:
 	return issued;
 }
 
-static void __drop_discard_cmd(struct f2fs_sb_info *sbi)
+static bool __drop_discard_cmd(struct f2fs_sb_info *sbi)
 {
 	struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
 	struct list_head *pend_list;
 	struct discard_cmd *dc, *tmp;
 	int i;
+	bool dropped = false;
 
 	mutex_lock(&dcc->cmd_lock);
 	for (i = MAX_PLIST_NUM - 1; i >= 0; i--) {
@@ -1126,39 +1248,58 @@ static void __drop_discard_cmd(struct f2fs_sb_info *sbi)
 		list_for_each_entry_safe(dc, tmp, pend_list, list) {
 			f2fs_bug_on(sbi, dc->state != D_PREP);
 			__remove_discard_cmd(sbi, dc);
+			dropped = true;
 		}
 	}
 	mutex_unlock(&dcc->cmd_lock);
+
+	return dropped;
 }
 
-static void __wait_one_discard_bio(struct f2fs_sb_info *sbi,
+static unsigned int __wait_one_discard_bio(struct f2fs_sb_info *sbi,
 							struct discard_cmd *dc)
 {
 	struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
+	unsigned int len = 0;
 
 	wait_for_completion_io(&dc->wait);
 	mutex_lock(&dcc->cmd_lock);
 	f2fs_bug_on(sbi, dc->state != D_DONE);
 	dc->ref--;
-	if (!dc->ref)
+	if (!dc->ref) {
+		if (!dc->error)
+			len = dc->len;
 		__remove_discard_cmd(sbi, dc);
+	}
 	mutex_unlock(&dcc->cmd_lock);
+
+	return len;
 }
 
-static void __wait_discard_cmd(struct f2fs_sb_info *sbi, bool wait_cond)
+static unsigned int __wait_discard_cmd_range(struct f2fs_sb_info *sbi,
+						struct discard_policy *dpolicy,
+						block_t start, block_t end)
 {
 	struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
-	struct list_head *wait_list = &(dcc->wait_list);
+	struct list_head *wait_list = (dpolicy->type == DPOLICY_FSTRIM) ?
+					&(dcc->fstrim_list) : &(dcc->wait_list);
 	struct discard_cmd *dc, *tmp;
 	bool need_wait;
+	unsigned int trimmed = 0;
 
 next:
 	need_wait = false;
 
 	mutex_lock(&dcc->cmd_lock);
 	list_for_each_entry_safe(dc, tmp, wait_list, list) {
-		if (!wait_cond || (dc->state == D_DONE && !dc->ref)) {
+		if (dc->lstart + dc->len <= start || end <= dc->lstart)
+			continue;
+		if (dc->len < dpolicy->granularity)
+			continue;
+		if (dc->state == D_DONE && !dc->ref) {
 			wait_for_completion_io(&dc->wait);
+			if (!dc->error)
+				trimmed += dc->len;
 			__remove_discard_cmd(sbi, dc);
 		} else {
 			dc->ref++;
@@ -1169,9 +1310,17 @@ next:
 	mutex_unlock(&dcc->cmd_lock);
 
 	if (need_wait) {
-		__wait_one_discard_bio(sbi, dc);
+		trimmed += __wait_one_discard_bio(sbi, dc);
 		goto next;
 	}
+
+	return trimmed;
+}
+
+static void __wait_all_discard_cmd(struct f2fs_sb_info *sbi,
+						struct discard_policy *dpolicy)
+{
+	__wait_discard_cmd_range(sbi, dpolicy, 0, UINT_MAX);
 }
 
 /* This should be covered by global mutex, &sit_i->sentry_lock */
@@ -1209,23 +1358,19 @@ void stop_discard_thread(struct f2fs_sb_info *sbi)
 	}
 }
 
-/* This comes from f2fs_put_super and f2fs_trim_fs */
-void f2fs_wait_discard_bios(struct f2fs_sb_info *sbi, bool umount)
-{
-	__issue_discard_cmd(sbi, false);
-	__drop_discard_cmd(sbi);
-	__wait_discard_cmd(sbi, !umount);
-}
-
-static void mark_discard_range_all(struct f2fs_sb_info *sbi)
+/* This comes from f2fs_put_super */
+bool f2fs_wait_discard_bios(struct f2fs_sb_info *sbi)
 {
 	struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
-	int i;
+	struct discard_policy dpolicy;
+	bool dropped;
 
-	mutex_lock(&dcc->cmd_lock);
-	for (i = 0; i < MAX_PLIST_NUM; i++)
-		dcc->pend_list_tag[i] |= P_TRIM;
-	mutex_unlock(&dcc->cmd_lock);
+	init_discard_policy(&dpolicy, DPOLICY_UMOUNT, dcc->discard_granularity);
+	__issue_discard_cmd(sbi, &dpolicy);
+	dropped = __drop_discard_cmd(sbi);
+	__wait_all_discard_cmd(sbi, &dpolicy);
+
+	return dropped;
 }
 
 static int issue_discard_thread(void *data)
@@ -1233,12 +1378,16 @@ static int issue_discard_thread(void *data)
 	struct f2fs_sb_info *sbi = data;
 	struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
 	wait_queue_head_t *q = &dcc->discard_wait_queue;
+	struct discard_policy dpolicy;
 	unsigned int wait_ms = DEF_MIN_DISCARD_ISSUE_TIME;
 	int issued;
 
 	set_freezable();
 
 	do {
+		init_discard_policy(&dpolicy, DPOLICY_BG,
+					dcc->discard_granularity);
+
 		wait_event_interruptible_timeout(*q,
 				kthread_should_stop() || freezing(current) ||
 				dcc->discard_wake,
@@ -1251,17 +1400,18 @@ static int issue_discard_thread(void *data)
 		if (dcc->discard_wake) {
 			dcc->discard_wake = 0;
 			if (sbi->gc_thread && sbi->gc_thread->gc_urgent)
-				mark_discard_range_all(sbi);
+				init_discard_policy(&dpolicy,
+							DPOLICY_FORCE, 1);
 		}
 
 		sb_start_intwrite(sbi->sb);
 
-		issued = __issue_discard_cmd(sbi, true);
+		issued = __issue_discard_cmd(sbi, &dpolicy);
 		if (issued) {
-			__wait_discard_cmd(sbi, true);
-			wait_ms = DEF_MIN_DISCARD_ISSUE_TIME;
+			__wait_all_discard_cmd(sbi, &dpolicy);
+			wait_ms = dpolicy.min_interval;
 		} else {
-			wait_ms = DEF_MAX_DISCARD_ISSUE_TIME;
+			wait_ms = dpolicy.max_interval;
 		}
 
 		sb_end_intwrite(sbi->sb);
@@ -1525,7 +1675,6 @@ find_next:
 
 			f2fs_issue_discard(sbi, entry->start_blkaddr + cur_pos,
 									len);
-			cpc->trimmed += len;
 			total_len += len;
 		} else {
 			next_pos = find_next_bit_le(entry->discard_map,
@@ -1546,6 +1695,37 @@ skip:
 	wake_up_discard_thread(sbi, false);
 }
 
+void init_discard_policy(struct discard_policy *dpolicy,
+				int discard_type, unsigned int granularity)
+{
+	/* common policy */
+	dpolicy->type = discard_type;
+	dpolicy->sync = true;
+	dpolicy->granularity = granularity;
+
+	if (discard_type == DPOLICY_BG) {
+		dpolicy->min_interval = DEF_MIN_DISCARD_ISSUE_TIME;
+		dpolicy->max_interval = DEF_MAX_DISCARD_ISSUE_TIME;
+		dpolicy->max_requests = DEF_MAX_DISCARD_REQUEST;
+		dpolicy->io_aware_gran = MAX_PLIST_NUM;
+		dpolicy->io_aware = true;
+	} else if (discard_type == DPOLICY_FORCE) {
+		dpolicy->min_interval = DEF_MIN_DISCARD_ISSUE_TIME;
+		dpolicy->max_interval = DEF_MAX_DISCARD_ISSUE_TIME;
+		dpolicy->max_requests = DEF_MAX_DISCARD_REQUEST;
+		dpolicy->io_aware_gran = MAX_PLIST_NUM;
+		dpolicy->io_aware = true;
+	} else if (discard_type == DPOLICY_FSTRIM) {
+		dpolicy->max_requests = DEF_MAX_DISCARD_REQUEST;
+		dpolicy->io_aware_gran = MAX_PLIST_NUM;
+		dpolicy->io_aware = false;
+	} else if (discard_type == DPOLICY_UMOUNT) {
+		dpolicy->max_requests = DEF_MAX_DISCARD_REQUEST;
+		dpolicy->io_aware_gran = MAX_PLIST_NUM;
+		dpolicy->io_aware = false;
+	}
+}
+
 static int create_discard_cmd_control(struct f2fs_sb_info *sbi)
 {
 	dev_t dev = sbi->sb->s_bdev->bd_dev;
@@ -1563,12 +1743,10 @@ static int create_discard_cmd_control(struct f2fs_sb_info *sbi)
 
 	dcc->discard_granularity = DEFAULT_DISCARD_GRANULARITY;
 	INIT_LIST_HEAD(&dcc->entry_list);
-	for (i = 0; i < MAX_PLIST_NUM; i++) {
+	for (i = 0; i < MAX_PLIST_NUM; i++)
 		INIT_LIST_HEAD(&dcc->pend_list[i]);
-		if (i >= dcc->discard_granularity - 1)
-			dcc->pend_list_tag[i] |= P_ACTIVE;
-	}
 	INIT_LIST_HEAD(&dcc->wait_list);
+	INIT_LIST_HEAD(&dcc->fstrim_list);
 	mutex_init(&dcc->cmd_lock);
 	atomic_set(&dcc->issued_discard, 0);
 	atomic_set(&dcc->issing_discard, 0);
@@ -1716,16 +1894,6 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del)
 		get_sec_entry(sbi, segno)->valid_blocks += del;
 }
 
-void refresh_sit_entry(struct f2fs_sb_info *sbi, block_t old, block_t new)
-{
-	update_sit_entry(sbi, new, 1);
-	if (GET_SEGNO(sbi, old) != NULL_SEGNO)
-		update_sit_entry(sbi, old, -1);
-
-	locate_dirty_segment(sbi, GET_SEGNO(sbi, old));
-	locate_dirty_segment(sbi, GET_SEGNO(sbi, new));
-}
-
 void invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr)
 {
 	unsigned int segno = GET_SEGNO(sbi, addr);
@@ -1736,14 +1904,14 @@ void invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr)
 		return;
 
 	/* add it into sit main buffer */
-	mutex_lock(&sit_i->sentry_lock);
+	down_write(&sit_i->sentry_lock);
 
 	update_sit_entry(sbi, addr, -1);
 
 	/* add it into dirty seglist */
 	locate_dirty_segment(sbi, segno);
 
-	mutex_unlock(&sit_i->sentry_lock);
+	up_write(&sit_i->sentry_lock);
 }
 
 bool is_checkpointed_data(struct f2fs_sb_info *sbi, block_t blkaddr)
@@ -1756,7 +1924,7 @@ bool is_checkpointed_data(struct f2fs_sb_info *sbi, block_t blkaddr)
 	if (blkaddr == NEW_ADDR || blkaddr == NULL_ADDR)
 		return true;
 
-	mutex_lock(&sit_i->sentry_lock);
+	down_read(&sit_i->sentry_lock);
 
 	segno = GET_SEGNO(sbi, blkaddr);
 	se = get_seg_entry(sbi, segno);
@@ -1765,7 +1933,7 @@ bool is_checkpointed_data(struct f2fs_sb_info *sbi, block_t blkaddr)
 	if (f2fs_test_bit(offset, se->ckpt_valid_map))
 		is_cp = true;
 
-	mutex_unlock(&sit_i->sentry_lock);
+	up_read(&sit_i->sentry_lock);
 
 	return is_cp;
 }
@@ -1823,12 +1991,8 @@ struct page *get_sum_page(struct f2fs_sb_info *sbi, unsigned int segno)
 void update_meta_page(struct f2fs_sb_info *sbi, void *src, block_t blk_addr)
 {
 	struct page *page = grab_meta_page(sbi, blk_addr);
-	void *dst = page_address(page);
 
-	if (src)
-		memcpy(dst, src, PAGE_SIZE);
-	else
-		memset(dst, 0, PAGE_SIZE);
+	memcpy(page_address(page), src, PAGE_SIZE);
 	set_page_dirty(page);
 	f2fs_put_page(page, 1);
 }
@@ -1927,7 +2091,6 @@ find_other_zone:
 	}
 	secno = left_start;
 skip_left:
-	hint = secno;
 	segno = GET_SEG_FROM_SEC(sbi, secno);
 	zoneno = GET_ZONE_FROM_SEC(sbi, secno);
 
@@ -2162,12 +2325,16 @@ void allocate_new_segments(struct f2fs_sb_info *sbi)
 	unsigned int old_segno;
 	int i;
 
+	down_write(&SIT_I(sbi)->sentry_lock);
+
 	for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) {
 		curseg = CURSEG_I(sbi, i);
 		old_segno = curseg->segno;
 		SIT_I(sbi)->s_ops->allocate_segment(sbi, i, true);
 		locate_dirty_segment(sbi, old_segno);
 	}
+
+	up_write(&SIT_I(sbi)->sentry_lock);
 }
 
 static const struct segment_allocation default_salloc_ops = {
@@ -2179,14 +2346,14 @@ bool exist_trim_candidates(struct f2fs_sb_info *sbi, struct cp_control *cpc)
 	__u64 trim_start = cpc->trim_start;
 	bool has_candidate = false;
 
-	mutex_lock(&SIT_I(sbi)->sentry_lock);
+	down_write(&SIT_I(sbi)->sentry_lock);
 	for (; cpc->trim_start <= cpc->trim_end; cpc->trim_start++) {
 		if (add_discard_addrs(sbi, cpc, true)) {
 			has_candidate = true;
 			break;
 		}
 	}
-	mutex_unlock(&SIT_I(sbi)->sentry_lock);
+	up_write(&SIT_I(sbi)->sentry_lock);
 
 	cpc->trim_start = trim_start;
 	return has_candidate;
@@ -2196,14 +2363,16 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range)
 {
 	__u64 start = F2FS_BYTES_TO_BLK(range->start);
 	__u64 end = start + F2FS_BYTES_TO_BLK(range->len) - 1;
-	unsigned int start_segno, end_segno;
+	unsigned int start_segno, end_segno, cur_segno;
+	block_t start_block, end_block;
 	struct cp_control cpc;
+	struct discard_policy dpolicy;
+	unsigned long long trimmed = 0;
 	int err = 0;
 
 	if (start >= MAX_BLKADDR(sbi) || range->len < sbi->blocksize)
 		return -EINVAL;
 
-	cpc.trimmed = 0;
 	if (end <= MAIN_BLKADDR(sbi))
 		goto out;
 
@@ -2217,12 +2386,14 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range)
 	start_segno = (start <= MAIN_BLKADDR(sbi)) ? 0 : GET_SEGNO(sbi, start);
 	end_segno = (end >= MAX_BLKADDR(sbi)) ? MAIN_SEGS(sbi) - 1 :
 						GET_SEGNO(sbi, end);
+
 	cpc.reason = CP_DISCARD;
 	cpc.trim_minlen = max_t(__u64, 1, F2FS_BYTES_TO_BLK(range->minlen));
 
 	/* do checkpoint to issue discard commands safely */
-	for (; start_segno <= end_segno; start_segno = cpc.trim_end + 1) {
-		cpc.trim_start = start_segno;
+	for (cur_segno = start_segno; cur_segno <= end_segno;
+					cur_segno = cpc.trim_end + 1) {
+		cpc.trim_start = cur_segno;
 
 		if (sbi->discard_blks == 0)
 			break;
@@ -2230,7 +2401,7 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range)
 			cpc.trim_end = end_segno;
 		else
 			cpc.trim_end = min_t(unsigned int,
-				rounddown(start_segno +
+				rounddown(cur_segno +
 				BATCHED_TRIM_SEGMENTS(sbi),
 				sbi->segs_per_sec) - 1, end_segno);
 
@@ -2242,11 +2413,16 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range)
 
 		schedule();
 	}
-	/* It's time to issue all the filed discards */
-	mark_discard_range_all(sbi);
-	f2fs_wait_discard_bios(sbi, false);
+
+	start_block = START_BLOCK(sbi, start_segno);
+	end_block = START_BLOCK(sbi, min(cur_segno, end_segno) + 1);
+
+	init_discard_policy(&dpolicy, DPOLICY_FSTRIM, cpc.trim_minlen);
+	__issue_discard_cmd_range(sbi, &dpolicy, start_block, end_block);
+	trimmed = __wait_discard_cmd_range(sbi, &dpolicy,
+					start_block, end_block);
 out:
-	range->len = F2FS_BLK_TO_BYTES(cpc.trimmed);
+	range->len = F2FS_BLK_TO_BYTES(trimmed);
 	return err;
 }
 
@@ -2258,6 +2434,18 @@ static bool __has_curseg_space(struct f2fs_sb_info *sbi, int type)
 	return false;
 }
 
+int rw_hint_to_seg_type(enum rw_hint hint)
+{
+	switch (hint) {
+	case WRITE_LIFE_SHORT:
+		return CURSEG_HOT_DATA;
+	case WRITE_LIFE_EXTREME:
+		return CURSEG_COLD_DATA;
+	default:
+		return CURSEG_WARM_DATA;
+	}
+}
+
 static int __get_segment_type_2(struct f2fs_io_info *fio)
 {
 	if (fio->type == DATA)
@@ -2292,7 +2480,7 @@ static int __get_segment_type_6(struct f2fs_io_info *fio)
 			return CURSEG_COLD_DATA;
 		if (is_inode_flag_set(inode, FI_HOT_DATA))
 			return CURSEG_HOT_DATA;
-		return CURSEG_WARM_DATA;
+		return rw_hint_to_seg_type(inode->i_write_hint);
 	} else {
 		if (IS_DNODE(fio->page))
 			return is_cold_node(fio->page) ? CURSEG_WARM_NODE :
@@ -2336,8 +2524,10 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
 	struct sit_info *sit_i = SIT_I(sbi);
 	struct curseg_info *curseg = CURSEG_I(sbi, type);
 
+	down_read(&SM_I(sbi)->curseg_lock);
+
 	mutex_lock(&curseg->curseg_mutex);
-	mutex_lock(&sit_i->sentry_lock);
+	down_write(&sit_i->sentry_lock);
 
 	*new_blkaddr = NEXT_FREE_BLKADDR(sbi, curseg);
 
@@ -2354,15 +2544,26 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
 
 	stat_inc_block_count(sbi, curseg);
 
+	/*
+	 * SIT information should be updated before segment allocation,
+	 * since SSR needs latest valid block information.
+	 */
+	update_sit_entry(sbi, *new_blkaddr, 1);
+	if (GET_SEGNO(sbi, old_blkaddr) != NULL_SEGNO)
+		update_sit_entry(sbi, old_blkaddr, -1);
+
 	if (!__has_curseg_space(sbi, type))
 		sit_i->s_ops->allocate_segment(sbi, type, false);
+
 	/*
-	 * SIT information should be updated after segment allocation,
-	 * since we need to keep dirty segments precisely under SSR.
+	 * segment dirty status should be updated after segment allocation,
+	 * so we just need to update status only one time after previous
+	 * segment being closed.
 	 */
-	refresh_sit_entry(sbi, old_blkaddr, *new_blkaddr);
+	locate_dirty_segment(sbi, GET_SEGNO(sbi, old_blkaddr));
+	locate_dirty_segment(sbi, GET_SEGNO(sbi, *new_blkaddr));
 
-	mutex_unlock(&sit_i->sentry_lock);
+	up_write(&sit_i->sentry_lock);
 
 	if (page && IS_NODESEG(type)) {
 		fill_node_footer_blkaddr(page, NEXT_FREE_BLKADDR(sbi, curseg));
@@ -2382,6 +2583,29 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
 	}
 
 	mutex_unlock(&curseg->curseg_mutex);
+
+	up_read(&SM_I(sbi)->curseg_lock);
+}
+
+static void update_device_state(struct f2fs_io_info *fio)
+{
+	struct f2fs_sb_info *sbi = fio->sbi;
+	unsigned int devidx;
+
+	if (!sbi->s_ndevs)
+		return;
+
+	devidx = f2fs_target_device_index(sbi, fio->new_blkaddr);
+
+	/* update device state for fsync */
+	set_dirty_device(sbi, fio->ino, devidx, FLUSH_INO);
+
+	/* update device state for checkpoint */
+	if (!f2fs_test_bit(devidx, (char *)&sbi->dirty_device)) {
+		spin_lock(&sbi->dev_lock);
+		f2fs_set_bit(devidx, (char *)&sbi->dirty_device);
+		spin_unlock(&sbi->dev_lock);
+	}
 }
 
 static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio)
@@ -2398,6 +2622,8 @@ reallocate:
 	if (err == -EAGAIN) {
 		fio->old_blkaddr = fio->new_blkaddr;
 		goto reallocate;
+	} else if (!err) {
+		update_device_state(fio);
 	}
 }
 
@@ -2458,12 +2684,26 @@ int rewrite_data_page(struct f2fs_io_info *fio)
 	stat_inc_inplace_blocks(fio->sbi);
 
 	err = f2fs_submit_page_bio(fio);
+	if (!err)
+		update_device_state(fio);
 
 	f2fs_update_iostat(fio->sbi, fio->io_type, F2FS_BLKSIZE);
 
 	return err;
 }
 
+static inline int __f2fs_get_curseg(struct f2fs_sb_info *sbi,
+						unsigned int segno)
+{
+	int i;
+
+	for (i = CURSEG_HOT_DATA; i < NO_CHECK_TYPE; i++) {
+		if (CURSEG_I(sbi, i)->segno == segno)
+			break;
+	}
+	return i;
+}
+
 void __f2fs_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
 				block_t old_blkaddr, block_t new_blkaddr,
 				bool recover_curseg, bool recover_newaddr)
@@ -2479,6 +2719,8 @@ void __f2fs_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
 	se = get_seg_entry(sbi, segno);
 	type = se->type;
 
+	down_write(&SM_I(sbi)->curseg_lock);
+
 	if (!recover_curseg) {
 		/* for recovery flow */
 		if (se->valid_blocks == 0 && !IS_CURSEG(sbi, segno)) {
@@ -2488,14 +2730,19 @@ void __f2fs_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
 				type = CURSEG_WARM_DATA;
 		}
 	} else {
-		if (!IS_CURSEG(sbi, segno))
+		if (IS_CURSEG(sbi, segno)) {
+			/* se->type is volatile as SSR allocation */
+			type = __f2fs_get_curseg(sbi, segno);
+			f2fs_bug_on(sbi, type == NO_CHECK_TYPE);
+		} else {
 			type = CURSEG_WARM_DATA;
+		}
 	}
 
 	curseg = CURSEG_I(sbi, type);
 
 	mutex_lock(&curseg->curseg_mutex);
-	mutex_lock(&sit_i->sentry_lock);
+	down_write(&sit_i->sentry_lock);
 
 	old_cursegno = curseg->segno;
 	old_blkoff = curseg->next_blkoff;
@@ -2527,8 +2774,9 @@ void __f2fs_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
 		curseg->next_blkoff = old_blkoff;
 	}
 
-	mutex_unlock(&sit_i->sentry_lock);
+	up_write(&sit_i->sentry_lock);
 	mutex_unlock(&curseg->curseg_mutex);
+	up_write(&SM_I(sbi)->curseg_lock);
 }
 
 void f2fs_replace_block(struct f2fs_sb_info *sbi, struct dnode_of_data *dn,
@@ -2982,7 +3230,7 @@ void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
 	bool to_journal = true;
 	struct seg_entry *se;
 
-	mutex_lock(&sit_i->sentry_lock);
+	down_write(&sit_i->sentry_lock);
 
 	if (!sit_i->dirty_sentries)
 		goto out;
@@ -3076,7 +3324,7 @@ out:
 
 		cpc->trim_start = trim_start;
 	}
-	mutex_unlock(&sit_i->sentry_lock);
+	up_write(&sit_i->sentry_lock);
 
 	set_prefree_as_free_segments(sbi);
 }
@@ -3169,7 +3417,7 @@ static int build_sit_info(struct f2fs_sb_info *sbi)
 	sit_i->sents_per_block = SIT_ENTRY_PER_BLOCK;
 	sit_i->elapsed_time = le64_to_cpu(sbi->ckpt->elapsed_time);
 	sit_i->mounted_time = ktime_get_real_seconds();
-	mutex_init(&sit_i->sentry_lock);
+	init_rwsem(&sit_i->sentry_lock);
 	return 0;
 }
 
@@ -3410,7 +3658,7 @@ static void init_min_max_mtime(struct f2fs_sb_info *sbi)
 	struct sit_info *sit_i = SIT_I(sbi);
 	unsigned int segno;
 
-	mutex_lock(&sit_i->sentry_lock);
+	down_write(&sit_i->sentry_lock);
 
 	sit_i->min_mtime = LLONG_MAX;
 
@@ -3427,7 +3675,7 @@ static void init_min_max_mtime(struct f2fs_sb_info *sbi)
 			sit_i->min_mtime = mtime;
 	}
 	sit_i->max_mtime = get_mtime(sbi);
-	mutex_unlock(&sit_i->sentry_lock);
+	up_write(&sit_i->sentry_lock);
 }
 
 int build_segment_manager(struct f2fs_sb_info *sbi)
@@ -3460,11 +3708,14 @@ int build_segment_manager(struct f2fs_sb_info *sbi)
 	sm_info->min_ipu_util = DEF_MIN_IPU_UTIL;
 	sm_info->min_fsync_blocks = DEF_MIN_FSYNC_BLOCKS;
 	sm_info->min_hot_blocks = DEF_MIN_HOT_BLOCKS;
+	sm_info->min_ssr_sections = reserved_sections(sbi);
 
 	sm_info->trim_sections = DEF_BATCHED_TRIM_SECTIONS;
 
 	INIT_LIST_HEAD(&sm_info->sit_entry_set);
 
+	init_rwsem(&sm_info->curseg_lock);
+
 	if (!f2fs_readonly(sbi->sb)) {
 		err = create_flush_cmd_control(sbi);
 		if (err)
diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
index e0a6cc23ace3..d1d394cdf61d 100644
--- a/fs/f2fs/segment.h
+++ b/fs/f2fs/segment.h
@@ -231,7 +231,7 @@ struct sit_info {
 	unsigned long *dirty_sentries_bitmap;	/* bitmap for dirty sentries */
 	unsigned int dirty_sentries;		/* # of dirty sentries */
 	unsigned int sents_per_block;		/* # of SIT entries per block */
-	struct mutex sentry_lock;		/* to protect SIT cache */
+	struct rw_semaphore sentry_lock;	/* to protect SIT cache */
 	struct seg_entry *sentries;		/* SIT segment-level cache */
 	struct sec_entry *sec_entries;		/* SIT section-level cache */
 
@@ -497,6 +497,33 @@ static inline int reserved_sections(struct f2fs_sb_info *sbi)
 	return GET_SEC_FROM_SEG(sbi, (unsigned int)reserved_segments(sbi));
 }
 
+static inline bool has_curseg_enough_space(struct f2fs_sb_info *sbi)
+{
+	unsigned int node_blocks = get_pages(sbi, F2FS_DIRTY_NODES) +
+					get_pages(sbi, F2FS_DIRTY_DENTS);
+	unsigned int dent_blocks = get_pages(sbi, F2FS_DIRTY_DENTS);
+	unsigned int segno, left_blocks;
+	int i;
+
+	/* check current node segment */
+	for (i = CURSEG_HOT_NODE; i <= CURSEG_COLD_NODE; i++) {
+		segno = CURSEG_I(sbi, i)->segno;
+		left_blocks = sbi->blocks_per_seg -
+			get_seg_entry(sbi, segno)->ckpt_valid_blocks;
+
+		if (node_blocks > left_blocks)
+			return false;
+	}
+
+	/* check current data segment */
+	segno = CURSEG_I(sbi, CURSEG_HOT_DATA)->segno;
+	left_blocks = sbi->blocks_per_seg -
+			get_seg_entry(sbi, segno)->ckpt_valid_blocks;
+	if (dent_blocks > left_blocks)
+		return false;
+	return true;
+}
+
 static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi,
 					int freed, int needed)
 {
@@ -507,6 +534,9 @@ static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi,
 	if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
 		return false;
 
+	if (free_sections(sbi) + freed == reserved_sections(sbi) + needed &&
+			has_curseg_enough_space(sbi))
+		return false;
 	return (free_sections(sbi) + freed) <=
 		(node_secs + 2 * dent_secs + imeta_secs +
 		reserved_sections(sbi) + needed);
@@ -731,7 +761,7 @@ static inline block_t sum_blk_addr(struct f2fs_sb_info *sbi, int base, int type)
 static inline bool no_fggc_candidate(struct f2fs_sb_info *sbi,
 						unsigned int secno)
 {
-	if (get_valid_blocks(sbi, GET_SEG_FROM_SEC(sbi, secno), true) >=
+	if (get_valid_blocks(sbi, GET_SEG_FROM_SEC(sbi, secno), true) >
 						sbi->fggc_threshold)
 		return true;
 	return false;
@@ -796,8 +826,9 @@ static inline void wake_up_discard_thread(struct f2fs_sb_info *sbi, bool force)
 		goto wake_up;
 
 	mutex_lock(&dcc->cmd_lock);
-	for (i = MAX_PLIST_NUM - 1;
-			i >= 0 && plist_issue(dcc->pend_list_tag[i]); i--) {
+	for (i = MAX_PLIST_NUM - 1; i >= 0; i--) {
+		if (i + 1 < dcc->discard_granularity)
+			break;
 		if (!list_empty(&dcc->pend_list[i])) {
 			wakeup = true;
 			break;
diff --git a/fs/f2fs/shrinker.c b/fs/f2fs/shrinker.c
index 5c60fc28ec75..0b5664a1a6cc 100644
--- a/fs/f2fs/shrinker.c
+++ b/fs/f2fs/shrinker.c
@@ -28,7 +28,7 @@ static unsigned long __count_nat_entries(struct f2fs_sb_info *sbi)
 
 static unsigned long __count_free_nids(struct f2fs_sb_info *sbi)
 {
-	long count = NM_I(sbi)->nid_cnt[FREE_NID_LIST] - MAX_FREE_NIDS;
+	long count = NM_I(sbi)->nid_cnt[FREE_NID] - MAX_FREE_NIDS;
 
 	return count > 0 ? count : 0;
 }
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 97e03c637e90..a6c5dd450002 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -44,6 +44,8 @@ static struct kmem_cache *f2fs_inode_cachep;
 char *fault_name[FAULT_MAX] = {
 	[FAULT_KMALLOC]		= "kmalloc",
 	[FAULT_PAGE_ALLOC]	= "page alloc",
+	[FAULT_PAGE_GET]	= "page get",
+	[FAULT_ALLOC_BIO]	= "alloc bio",
 	[FAULT_ALLOC_NID]	= "alloc nid",
 	[FAULT_ORPHAN]		= "orphan",
 	[FAULT_BLOCK]		= "no more block",
@@ -92,6 +94,7 @@ enum {
 	Opt_disable_ext_identify,
 	Opt_inline_xattr,
 	Opt_noinline_xattr,
+	Opt_inline_xattr_size,
 	Opt_inline_data,
 	Opt_inline_dentry,
 	Opt_noinline_dentry,
@@ -141,6 +144,7 @@ static match_table_t f2fs_tokens = {
 	{Opt_disable_ext_identify, "disable_ext_identify"},
 	{Opt_inline_xattr, "inline_xattr"},
 	{Opt_noinline_xattr, "noinline_xattr"},
+	{Opt_inline_xattr_size, "inline_xattr_size=%u"},
 	{Opt_inline_data, "inline_data"},
 	{Opt_inline_dentry, "inline_dentry"},
 	{Opt_noinline_dentry, "noinline_dentry"},
@@ -209,6 +213,12 @@ static int f2fs_set_qf_name(struct super_block *sb, int qtype,
 			"quota options when quota turned on");
 		return -EINVAL;
 	}
+	if (f2fs_sb_has_quota_ino(sb)) {
+		f2fs_msg(sb, KERN_INFO,
+			"QUOTA feature is enabled, so ignore qf_name");
+		return 0;
+	}
+
 	qname = match_strdup(args);
 	if (!qname) {
 		f2fs_msg(sb, KERN_ERR,
@@ -287,6 +297,18 @@ static int f2fs_check_quota_options(struct f2fs_sb_info *sbi)
 			return -1;
 		}
 	}
+
+	if (f2fs_sb_has_quota_ino(sbi->sb) && sbi->s_jquota_fmt) {
+		f2fs_msg(sbi->sb, KERN_INFO,
+			"QUOTA feature is enabled, so ignore jquota_fmt");
+		sbi->s_jquota_fmt = 0;
+	}
+	if (f2fs_sb_has_quota_ino(sbi->sb) && sb_rdonly(sbi->sb)) {
+		f2fs_msg(sbi->sb, KERN_INFO,
+			 "Filesystem with quota feature cannot be mounted RDWR "
+			 "without CONFIG_QUOTA");
+		return -1;
+	}
 	return 0;
 }
 #endif
@@ -383,6 +405,12 @@ static int parse_options(struct super_block *sb, char *options)
 		case Opt_noinline_xattr:
 			clear_opt(sbi, INLINE_XATTR);
 			break;
+		case Opt_inline_xattr_size:
+			if (args->from && match_int(args, &arg))
+				return -EINVAL;
+			set_opt(sbi, INLINE_XATTR_SIZE);
+			sbi->inline_xattr_size = arg;
+			break;
 #else
 		case Opt_user_xattr:
 			f2fs_msg(sb, KERN_INFO,
@@ -604,6 +632,24 @@ static int parse_options(struct super_block *sb, char *options)
 				F2FS_IO_SIZE_KB(sbi));
 		return -EINVAL;
 	}
+
+	if (test_opt(sbi, INLINE_XATTR_SIZE)) {
+		if (!test_opt(sbi, INLINE_XATTR)) {
+			f2fs_msg(sb, KERN_ERR,
+					"inline_xattr_size option should be "
+					"set with inline_xattr option");
+			return -EINVAL;
+		}
+		if (!sbi->inline_xattr_size ||
+			sbi->inline_xattr_size >= DEF_ADDRS_PER_INODE -
+					F2FS_TOTAL_EXTRA_ATTR_SIZE -
+					DEF_INLINE_RESERVED_SIZE -
+					DEF_MIN_INLINE_SIZE) {
+			f2fs_msg(sb, KERN_ERR,
+					"inline xattr size is out of range");
+			return -EINVAL;
+		}
+	}
 	return 0;
 }
 
@@ -618,13 +664,13 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb)
 	init_once((void *) fi);
 
 	/* Initialize f2fs-specific inode info */
-	fi->vfs_inode.i_version = 1;
 	atomic_set(&fi->dirty_pages, 0);
 	fi->i_current_depth = 1;
 	fi->i_advise = 0;
 	init_rwsem(&fi->i_sem);
 	INIT_LIST_HEAD(&fi->dirty_list);
 	INIT_LIST_HEAD(&fi->gdirty_list);
+	INIT_LIST_HEAD(&fi->inmem_ilist);
 	INIT_LIST_HEAD(&fi->inmem_pages);
 	mutex_init(&fi->inmem_lock);
 	init_rwsem(&fi->dio_rwsem[READ]);
@@ -673,7 +719,6 @@ static int f2fs_drop_inode(struct inode *inode)
 
 			sb_end_intwrite(inode->i_sb);
 
-			fscrypt_put_encryption_info(inode, NULL);
 			spin_lock(&inode->i_lock);
 			atomic_dec(&inode->i_count);
 		}
@@ -781,6 +826,7 @@ static void f2fs_put_super(struct super_block *sb)
 {
 	struct f2fs_sb_info *sbi = F2FS_SB(sb);
 	int i;
+	bool dropped;
 
 	f2fs_quota_off_umount(sb);
 
@@ -801,9 +847,9 @@ static void f2fs_put_super(struct super_block *sb)
 	}
 
 	/* be sure to wait for any on-going discard commands */
-	f2fs_wait_discard_bios(sbi, true);
+	dropped = f2fs_wait_discard_bios(sbi);
 
-	if (f2fs_discard_en(sbi) && !sbi->discard_blks) {
+	if (f2fs_discard_en(sbi) && !sbi->discard_blks && !dropped) {
 		struct cp_control cpc = {
 			.reason = CP_UMOUNT | CP_TRIMMED,
 		};
@@ -858,6 +904,9 @@ int f2fs_sync_fs(struct super_block *sb, int sync)
 	struct f2fs_sb_info *sbi = F2FS_SB(sb);
 	int err = 0;
 
+	if (unlikely(f2fs_cp_error(sbi)))
+		return 0;
+
 	trace_f2fs_sync_fs(sb, sync);
 
 	if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
@@ -957,7 +1006,7 @@ static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	buf->f_blocks = total_count - start_count;
 	buf->f_bfree = user_block_count - valid_user_blocks(sbi) + ovp_count;
 	buf->f_bavail = user_block_count - valid_user_blocks(sbi) -
-						sbi->reserved_blocks;
+						sbi->current_reserved_blocks;
 
 	avail_node_count = sbi->total_node_count - F2FS_RESERVED_NODE_NUM;
 
@@ -1046,6 +1095,9 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root)
 		seq_puts(seq, ",inline_xattr");
 	else
 		seq_puts(seq, ",noinline_xattr");
+	if (test_opt(sbi, INLINE_XATTR_SIZE))
+		seq_printf(seq, ",inline_xattr_size=%u",
+					sbi->inline_xattr_size);
 #endif
 #ifdef CONFIG_F2FS_FS_POSIX_ACL
 	if (test_opt(sbi, POSIX_ACL))
@@ -1108,6 +1160,7 @@ static void default_options(struct f2fs_sb_info *sbi)
 {
 	/* init some FS parameters */
 	sbi->active_logs = NR_CURSEG_TYPE;
+	sbi->inline_xattr_size = DEFAULT_INLINE_XATTR_ADDRS;
 
 	set_opt(sbi, BG_GC);
 	set_opt(sbi, INLINE_XATTR);
@@ -1136,6 +1189,9 @@ static void default_options(struct f2fs_sb_info *sbi)
 #endif
 }
 
+#ifdef CONFIG_QUOTA
+static int f2fs_enable_quotas(struct super_block *sb);
+#endif
 static int f2fs_remount(struct super_block *sb, int *flags, char *data)
 {
 	struct f2fs_sb_info *sbi = F2FS_SB(sb);
@@ -1202,6 +1258,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
 	if (f2fs_readonly(sb) && (*flags & MS_RDONLY))
 		goto skip;
 
+#ifdef CONFIG_QUOTA
 	if (!f2fs_readonly(sb) && (*flags & MS_RDONLY)) {
 		err = dquot_suspend(sb, -1);
 		if (err < 0)
@@ -1209,9 +1266,15 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
 	} else {
 		/* dquot_resume needs RW */
 		sb->s_flags &= ~MS_RDONLY;
-		dquot_resume(sb, -1);
+		if (sb_any_quota_suspended(sb)) {
+			dquot_resume(sb, -1);
+		} else if (f2fs_sb_has_quota_ino(sb)) {
+			err = f2fs_enable_quotas(sb);
+			if (err)
+				goto restore_opts;
+		}
 	}
-
+#endif
 	/* disallow enable/disable extent_cache dynamically */
 	if (no_extent_cache == !!test_opt(sbi, EXTENT_CACHE)) {
 		err = -EINVAL;
@@ -1320,8 +1383,13 @@ static ssize_t f2fs_quota_read(struct super_block *sb, int type, char *data,
 		tocopy = min_t(unsigned long, sb->s_blocksize - offset, toread);
 repeat:
 		page = read_mapping_page(mapping, blkidx, NULL);
-		if (IS_ERR(page))
+		if (IS_ERR(page)) {
+			if (PTR_ERR(page) == -ENOMEM) {
+				congestion_wait(BLK_RW_ASYNC, HZ/50);
+				goto repeat;
+			}
 			return PTR_ERR(page);
+		}
 
 		lock_page(page);
 
@@ -1364,11 +1432,16 @@ static ssize_t f2fs_quota_write(struct super_block *sb, int type,
 	while (towrite > 0) {
 		tocopy = min_t(unsigned long, sb->s_blocksize - offset,
 								towrite);
-
+retry:
 		err = a_ops->write_begin(NULL, mapping, off, tocopy, 0,
 							&page, NULL);
-		if (unlikely(err))
+		if (unlikely(err)) {
+			if (err == -ENOMEM) {
+				congestion_wait(BLK_RW_ASYNC, HZ/50);
+				goto retry;
+			}
 			break;
+		}
 
 		kaddr = kmap_atomic(page);
 		memcpy(kaddr + offset, data, tocopy);
@@ -1385,8 +1458,7 @@ static ssize_t f2fs_quota_write(struct super_block *sb, int type,
 	}
 
 	if (len == towrite)
-		return 0;
-	inode->i_version++;
+		return err;
 	inode->i_mtime = inode->i_ctime = current_time(inode);
 	f2fs_mark_inode_dirty_sync(inode, false);
 	return len - towrite;
@@ -1408,19 +1480,91 @@ static int f2fs_quota_on_mount(struct f2fs_sb_info *sbi, int type)
 						sbi->s_jquota_fmt, type);
 }
 
-void f2fs_enable_quota_files(struct f2fs_sb_info *sbi)
+int f2fs_enable_quota_files(struct f2fs_sb_info *sbi, bool rdonly)
 {
-	int i, ret;
+	int enabled = 0;
+	int i, err;
+
+	if (f2fs_sb_has_quota_ino(sbi->sb) && rdonly) {
+		err = f2fs_enable_quotas(sbi->sb);
+		if (err) {
+			f2fs_msg(sbi->sb, KERN_ERR,
+					"Cannot turn on quota_ino: %d", err);
+			return 0;
+		}
+		return 1;
+	}
 
 	for (i = 0; i < MAXQUOTAS; i++) {
 		if (sbi->s_qf_names[i]) {
-			ret = f2fs_quota_on_mount(sbi, i);
-			if (ret < 0)
-				f2fs_msg(sbi->sb, KERN_ERR,
-					"Cannot turn on journaled "
-					"quota: error %d", ret);
+			err = f2fs_quota_on_mount(sbi, i);
+			if (!err) {
+				enabled = 1;
+				continue;
+			}
+			f2fs_msg(sbi->sb, KERN_ERR,
+				"Cannot turn on quotas: %d on %d", err, i);
+		}
+	}
+	return enabled;
+}
+
+static int f2fs_quota_enable(struct super_block *sb, int type, int format_id,
+			     unsigned int flags)
+{
+	struct inode *qf_inode;
+	unsigned long qf_inum;
+	int err;
+
+	BUG_ON(!f2fs_sb_has_quota_ino(sb));
+
+	qf_inum = f2fs_qf_ino(sb, type);
+	if (!qf_inum)
+		return -EPERM;
+
+	qf_inode = f2fs_iget(sb, qf_inum);
+	if (IS_ERR(qf_inode)) {
+		f2fs_msg(sb, KERN_ERR,
+			"Bad quota inode %u:%lu", type, qf_inum);
+		return PTR_ERR(qf_inode);
+	}
+
+	/* Don't account quota for quota files to avoid recursion */
+	qf_inode->i_flags |= S_NOQUOTA;
+	err = dquot_enable(qf_inode, type, format_id, flags);
+	iput(qf_inode);
+	return err;
+}
+
+static int f2fs_enable_quotas(struct super_block *sb)
+{
+	int type, err = 0;
+	unsigned long qf_inum;
+	bool quota_mopt[MAXQUOTAS] = {
+		test_opt(F2FS_SB(sb), USRQUOTA),
+		test_opt(F2FS_SB(sb), GRPQUOTA),
+		test_opt(F2FS_SB(sb), PRJQUOTA),
+	};
+
+	sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE | DQUOT_NOLIST_DIRTY;
+	for (type = 0; type < MAXQUOTAS; type++) {
+		qf_inum = f2fs_qf_ino(sb, type);
+		if (qf_inum) {
+			err = f2fs_quota_enable(sb, type, QFMT_VFS_V1,
+				DQUOT_USAGE_ENABLED |
+				(quota_mopt[type] ? DQUOT_LIMITS_ENABLED : 0));
+			if (err) {
+				f2fs_msg(sb, KERN_ERR,
+					"Failed to enable quota tracking "
+					"(type=%d, err=%d). Please run "
+					"fsck to fix.", type, err);
+				for (type--; type >= 0; type--)
+					dquot_quota_off(sb, type);
+				return err;
+			}
 		}
 	}
+	return 0;
 }
 
 static int f2fs_quota_sync(struct super_block *sb, int type)
@@ -1491,7 +1635,7 @@ static int f2fs_quota_off(struct super_block *sb, int type)
 	f2fs_quota_sync(sb, type);
 
 	err = dquot_quota_off(sb, type);
-	if (err)
+	if (err || f2fs_sb_has_quota_ino(sb))
 		goto out_put;
 
 	inode_lock(inode);
@@ -1651,7 +1795,7 @@ static loff_t max_file_blocks(void)
 
 	/*
 	 * note: previously, result is equal to (DEF_ADDRS_PER_INODE -
-	 * F2FS_INLINE_XATTR_ADDRS), but now f2fs try to reserve more
+	 * DEFAULT_INLINE_XATTR_ADDRS), but now f2fs try to reserve more
 	 * space in inode.i_addr, it will be more safe to reassign
 	 * result as zero.
 	 */
@@ -1960,6 +2104,9 @@ static void init_sb_info(struct f2fs_sb_info *sbi)
 		for (j = HOT; j < NR_TEMP_TYPE; j++)
 			mutex_init(&sbi->wio_mutex[i][j]);
 	spin_lock_init(&sbi->cp_lock);
+
+	sbi->dirty_device = 0;
+	spin_lock_init(&sbi->dev_lock);
 }
 
 static int init_percpu_info(struct f2fs_sb_info *sbi)
@@ -2310,7 +2457,10 @@ try_onemore:
 
 #ifdef CONFIG_QUOTA
 	sb->dq_op = &f2fs_quota_operations;
-	sb->s_qcop = &f2fs_quotactl_ops;
+	if (f2fs_sb_has_quota_ino(sb))
+		sb->s_qcop = &dquot_quotactl_sysfile_ops;
+	else
+		sb->s_qcop = &f2fs_quotactl_ops;
 	sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP | QTYPE_MASK_PRJ;
 #endif
 
@@ -2408,6 +2558,7 @@ try_onemore:
 				le64_to_cpu(sbi->ckpt->valid_block_count);
 	sbi->last_valid_block_count = sbi->total_valid_block_count;
 	sbi->reserved_blocks = 0;
+	sbi->current_reserved_blocks = 0;
 
 	for (i = 0; i < NR_INODE_TYPE; i++) {
 		INIT_LIST_HEAD(&sbi->inode_list[i]);
@@ -2482,10 +2633,24 @@ try_onemore:
 	if (err)
 		goto free_root_inode;
 
+#ifdef CONFIG_QUOTA
+	/*
+	 * Turn on quotas which were not enabled for read-only mounts if
+	 * filesystem has quota feature, so that they are updated correctly.
+	 */
+	if (f2fs_sb_has_quota_ino(sb) && !sb_rdonly(sb)) {
+		err = f2fs_enable_quotas(sb);
+		if (err) {
+			f2fs_msg(sb, KERN_ERR,
+				"Cannot turn on quotas: error %d", err);
+			goto free_sysfs;
+		}
+	}
+#endif
 	/* if there are nt orphan nodes free them */
 	err = recover_orphan_inodes(sbi);
 	if (err)
-		goto free_sysfs;
+		goto free_meta;
 
 	/* recover fsynced data */
 	if (!test_opt(sbi, DISABLE_ROLL_FORWARD)) {
@@ -2519,7 +2684,7 @@ try_onemore:
 			err = -EINVAL;
 			f2fs_msg(sb, KERN_ERR,
 				"Need to recover fsync data");
-			goto free_sysfs;
+			goto free_meta;
 		}
 	}
 skip_recovery:
@@ -2553,6 +2718,10 @@ skip_recovery:
 	return 0;
 
 free_meta:
+#ifdef CONFIG_QUOTA
+	if (f2fs_sb_has_quota_ino(sb) && !sb_rdonly(sb))
+		f2fs_quota_off_umount(sbi->sb);
+#endif
 	f2fs_sync_inode_meta(sbi);
 	/*
 	 * Some dirty meta pages can be produced by recover_orphan_inodes()
@@ -2561,7 +2730,9 @@ free_meta:
 	 * falls into an infinite loop in sync_meta_pages().
 	 */
 	truncate_inode_pages_final(META_MAPPING(sbi));
+#ifdef CONFIG_QUOTA
 free_sysfs:
+#endif
 	f2fs_unregister_sysfs(sbi);
 free_root_inode:
 	dput(sb->s_root);
diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c
index e2c258f717cd..9835348b6e5d 100644
--- a/fs/f2fs/sysfs.c
+++ b/fs/f2fs/sysfs.c
@@ -30,7 +30,7 @@ enum {
 	FAULT_INFO_RATE,	/* struct f2fs_fault_info */
 	FAULT_INFO_TYPE,	/* struct f2fs_fault_info */
 #endif
-	RESERVED_BLOCKS,
+	RESERVED_BLOCKS,	/* struct f2fs_sb_info */
 };
 
 struct f2fs_attr {
@@ -63,6 +63,13 @@ static unsigned char *__struct_ptr(struct f2fs_sb_info *sbi, int struct_type)
 	return NULL;
 }
 
+static ssize_t dirty_segments_show(struct f2fs_attr *a,
+		struct f2fs_sb_info *sbi, char *buf)
+{
+	return snprintf(buf, PAGE_SIZE, "%llu\n",
+		(unsigned long long)(dirty_segments(sbi)));
+}
+
 static ssize_t lifetime_write_kbytes_show(struct f2fs_attr *a,
 		struct f2fs_sb_info *sbi, char *buf)
 {
@@ -100,10 +107,22 @@ static ssize_t features_show(struct f2fs_attr *a,
 	if (f2fs_sb_has_inode_chksum(sb))
 		len += snprintf(buf + len, PAGE_SIZE - len, "%s%s",
 				len ? ", " : "", "inode_checksum");
+	if (f2fs_sb_has_flexible_inline_xattr(sb))
+		len += snprintf(buf + len, PAGE_SIZE - len, "%s%s",
+				len ? ", " : "", "flexible_inline_xattr");
+	if (f2fs_sb_has_quota_ino(sb))
+		len += snprintf(buf + len, PAGE_SIZE - len, "%s%s",
+				len ? ", " : "", "quota_ino");
 	len += snprintf(buf + len, PAGE_SIZE - len, "\n");
 	return len;
 }
 
+static ssize_t current_reserved_blocks_show(struct f2fs_attr *a,
+					struct f2fs_sb_info *sbi, char *buf)
+{
+	return snprintf(buf, PAGE_SIZE, "%u\n", sbi->current_reserved_blocks);
+}
+
 static ssize_t f2fs_sbi_show(struct f2fs_attr *a,
 			struct f2fs_sb_info *sbi, char *buf)
 {
@@ -143,34 +162,22 @@ static ssize_t f2fs_sbi_store(struct f2fs_attr *a,
 #endif
 	if (a->struct_type == RESERVED_BLOCKS) {
 		spin_lock(&sbi->stat_lock);
-		if ((unsigned long)sbi->total_valid_block_count + t >
-				(unsigned long)sbi->user_block_count) {
+		if (t > (unsigned long)sbi->user_block_count) {
 			spin_unlock(&sbi->stat_lock);
 			return -EINVAL;
 		}
 		*ui = t;
+		sbi->current_reserved_blocks = min(sbi->reserved_blocks,
+				sbi->user_block_count - valid_user_blocks(sbi));
 		spin_unlock(&sbi->stat_lock);
 		return count;
 	}
 
 	if (!strcmp(a->attr.name, "discard_granularity")) {
-		struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
-		int i;
-
 		if (t == 0 || t > MAX_PLIST_NUM)
 			return -EINVAL;
 		if (t == *ui)
 			return count;
-
-		mutex_lock(&dcc->cmd_lock);
-		for (i = 0; i < MAX_PLIST_NUM; i++) {
-			if (i >= t - 1)
-				dcc->pend_list_tag[i] |= P_ACTIVE;
-			else
-				dcc->pend_list_tag[i] &= (~P_ACTIVE);
-		}
-		mutex_unlock(&dcc->cmd_lock);
-
 		*ui = t;
 		return count;
 	}
@@ -222,6 +229,8 @@ enum feat_id {
 	FEAT_EXTRA_ATTR,
 	FEAT_PROJECT_QUOTA,
 	FEAT_INODE_CHECKSUM,
+	FEAT_FLEXIBLE_INLINE_XATTR,
+	FEAT_QUOTA_INO,
 };
 
 static ssize_t f2fs_feature_show(struct f2fs_attr *a,
@@ -234,6 +243,8 @@ static ssize_t f2fs_feature_show(struct f2fs_attr *a,
 	case FEAT_EXTRA_ATTR:
 	case FEAT_PROJECT_QUOTA:
 	case FEAT_INODE_CHECKSUM:
+	case FEAT_FLEXIBLE_INLINE_XATTR:
+	case FEAT_QUOTA_INO:
 		return snprintf(buf, PAGE_SIZE, "supported\n");
 	}
 	return 0;
@@ -279,6 +290,7 @@ F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, ipu_policy, ipu_policy);
 F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ipu_util, min_ipu_util);
 F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_fsync_blocks, min_fsync_blocks);
 F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_hot_blocks, min_hot_blocks);
+F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ssr_sections, min_ssr_sections);
 F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ram_thresh, ram_thresh);
 F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ra_nid_pages, ra_nid_pages);
 F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, dirty_nats_ratio, dirty_nats_ratio);
@@ -291,8 +303,10 @@ F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, iostat_enable, iostat_enable);
 F2FS_RW_ATTR(FAULT_INFO_RATE, f2fs_fault_info, inject_rate, inject_rate);
 F2FS_RW_ATTR(FAULT_INFO_TYPE, f2fs_fault_info, inject_type, inject_type);
 #endif
+F2FS_GENERAL_RO_ATTR(dirty_segments);
 F2FS_GENERAL_RO_ATTR(lifetime_write_kbytes);
 F2FS_GENERAL_RO_ATTR(features);
+F2FS_GENERAL_RO_ATTR(current_reserved_blocks);
 
 #ifdef CONFIG_F2FS_FS_ENCRYPTION
 F2FS_FEATURE_RO_ATTR(encryption, FEAT_CRYPTO);
@@ -304,6 +318,8 @@ F2FS_FEATURE_RO_ATTR(atomic_write, FEAT_ATOMIC_WRITE);
 F2FS_FEATURE_RO_ATTR(extra_attr, FEAT_EXTRA_ATTR);
 F2FS_FEATURE_RO_ATTR(project_quota, FEAT_PROJECT_QUOTA);
 F2FS_FEATURE_RO_ATTR(inode_checksum, FEAT_INODE_CHECKSUM);
+F2FS_FEATURE_RO_ATTR(flexible_inline_xattr, FEAT_FLEXIBLE_INLINE_XATTR);
+F2FS_FEATURE_RO_ATTR(quota_ino, FEAT_QUOTA_INO);
 
 #define ATTR_LIST(name) (&f2fs_attr_##name.attr)
 static struct attribute *f2fs_attrs[] = {
@@ -321,6 +337,7 @@ static struct attribute *f2fs_attrs[] = {
 	ATTR_LIST(min_ipu_util),
 	ATTR_LIST(min_fsync_blocks),
 	ATTR_LIST(min_hot_blocks),
+	ATTR_LIST(min_ssr_sections),
 	ATTR_LIST(max_victim_search),
 	ATTR_LIST(dir_level),
 	ATTR_LIST(ram_thresh),
@@ -333,9 +350,11 @@ static struct attribute *f2fs_attrs[] = {
 	ATTR_LIST(inject_rate),
 	ATTR_LIST(inject_type),
 #endif
+	ATTR_LIST(dirty_segments),
 	ATTR_LIST(lifetime_write_kbytes),
 	ATTR_LIST(features),
 	ATTR_LIST(reserved_blocks),
+	ATTR_LIST(current_reserved_blocks),
 	NULL,
 };
 
@@ -350,6 +369,8 @@ static struct attribute *f2fs_feat_attrs[] = {
 	ATTR_LIST(extra_attr),
 	ATTR_LIST(project_quota),
 	ATTR_LIST(inode_checksum),
+	ATTR_LIST(flexible_inline_xattr),
+	ATTR_LIST(quota_ino),
 	NULL,
 };
 
diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c
index 7c65540148f8..ec8961ef8cac 100644
--- a/fs/f2fs/xattr.c
+++ b/fs/f2fs/xattr.c
@@ -217,12 +217,12 @@ static struct f2fs_xattr_entry *__find_xattr(void *base_addr, int index,
 	return entry;
 }
 
-static struct f2fs_xattr_entry *__find_inline_xattr(void *base_addr,
-					void **last_addr, int index,
-					size_t len, const char *name)
+static struct f2fs_xattr_entry *__find_inline_xattr(struct inode *inode,
+				void *base_addr, void **last_addr, int index,
+				size_t len, const char *name)
 {
 	struct f2fs_xattr_entry *entry;
-	unsigned int inline_size = F2FS_INLINE_XATTR_ADDRS << 2;
+	unsigned int inline_size = inline_xattr_size(inode);
 
 	list_for_each_xattr(entry, base_addr) {
 		if ((void *)entry + sizeof(__u32) > base_addr + inline_size ||
@@ -241,12 +241,54 @@ static struct f2fs_xattr_entry *__find_inline_xattr(void *base_addr,
 	return entry;
 }
 
+static int read_inline_xattr(struct inode *inode, struct page *ipage,
+							void *txattr_addr)
+{
+	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+	unsigned int inline_size = inline_xattr_size(inode);
+	struct page *page = NULL;
+	void *inline_addr;
+
+	if (ipage) {
+		inline_addr = inline_xattr_addr(inode, ipage);
+	} else {
+		page = get_node_page(sbi, inode->i_ino);
+		if (IS_ERR(page))
+			return PTR_ERR(page);
+
+		inline_addr = inline_xattr_addr(inode, page);
+	}
+	memcpy(txattr_addr, inline_addr, inline_size);
+	f2fs_put_page(page, 1);
+
+	return 0;
+}
+
+static int read_xattr_block(struct inode *inode, void *txattr_addr)
+{
+	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+	nid_t xnid = F2FS_I(inode)->i_xattr_nid;
+	unsigned int inline_size = inline_xattr_size(inode);
+	struct page *xpage;
+	void *xattr_addr;
+
+	/* The inode already has an extended attribute block. */
+	xpage = get_node_page(sbi, xnid);
+	if (IS_ERR(xpage))
+		return PTR_ERR(xpage);
+
+	xattr_addr = page_address(xpage);
+	memcpy(txattr_addr + inline_size, xattr_addr, VALID_XATTR_BLOCK_SIZE);
+	f2fs_put_page(xpage, 1);
+
+	return 0;
+}
+
 static int lookup_all_xattrs(struct inode *inode, struct page *ipage,
 				unsigned int index, unsigned int len,
 				const char *name, struct f2fs_xattr_entry **xe,
 				void **base_addr)
 {
-	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
 	void *cur_addr, *txattr_addr, *last_addr = NULL;
 	nid_t xnid = F2FS_I(inode)->i_xattr_nid;
 	unsigned int size = xnid ? VALID_XATTR_BLOCK_SIZE : 0;
@@ -263,23 +305,11 @@ static int lookup_all_xattrs(struct inode *inode, struct page *ipage,
 
 	/* read from inline xattr */
 	if (inline_size) {
-		struct page *page = NULL;
-		void *inline_addr;
-
-		if (ipage) {
-			inline_addr = inline_xattr_addr(ipage);
-		} else {
-			page = get_node_page(sbi, inode->i_ino);
-			if (IS_ERR(page)) {
-				err = PTR_ERR(page);
-				goto out;
-			}
-			inline_addr = inline_xattr_addr(page);
-		}
-		memcpy(txattr_addr, inline_addr, inline_size);
-		f2fs_put_page(page, 1);
+		err = read_inline_xattr(inode, ipage, txattr_addr);
+		if (err)
+			goto out;
 
-		*xe = __find_inline_xattr(txattr_addr, &last_addr,
+		*xe = __find_inline_xattr(inode, txattr_addr, &last_addr,
 						index, len, name);
 		if (*xe)
 			goto check;
@@ -287,19 +317,9 @@ static int lookup_all_xattrs(struct inode *inode, struct page *ipage,
 
 	/* read from xattr node block */
 	if (xnid) {
-		struct page *xpage;
-		void *xattr_addr;
-
-		/* The inode already has an extended attribute block. */
-		xpage = get_node_page(sbi, xnid);
-		if (IS_ERR(xpage)) {
-			err = PTR_ERR(xpage);
+		err = read_xattr_block(inode, txattr_addr);
+		if (err)
 			goto out;
-		}
-
-		xattr_addr = page_address(xpage);
-		memcpy(txattr_addr + inline_size, xattr_addr, size);
-		f2fs_put_page(xpage, 1);
 	}
 
 	if (last_addr)
@@ -324,7 +344,6 @@ out:
 static int read_all_xattrs(struct inode *inode, struct page *ipage,
 							void **base_addr)
 {
-	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
 	struct f2fs_xattr_header *header;
 	nid_t xnid = F2FS_I(inode)->i_xattr_nid;
 	unsigned int size = VALID_XATTR_BLOCK_SIZE;
@@ -339,38 +358,16 @@ static int read_all_xattrs(struct inode *inode, struct page *ipage,
 
 	/* read from inline xattr */
 	if (inline_size) {
-		struct page *page = NULL;
-		void *inline_addr;
-
-		if (ipage) {
-			inline_addr = inline_xattr_addr(ipage);
-		} else {
-			page = get_node_page(sbi, inode->i_ino);
-			if (IS_ERR(page)) {
-				err = PTR_ERR(page);
-				goto fail;
-			}
-			inline_addr = inline_xattr_addr(page);
-		}
-		memcpy(txattr_addr, inline_addr, inline_size);
-		f2fs_put_page(page, 1);
+		err = read_inline_xattr(inode, ipage, txattr_addr);
+		if (err)
+			goto fail;
 	}
 
 	/* read from xattr node block */
 	if (xnid) {
-		struct page *xpage;
-		void *xattr_addr;
-
-		/* The inode already has an extended attribute block. */
-		xpage = get_node_page(sbi, xnid);
-		if (IS_ERR(xpage)) {
-			err = PTR_ERR(xpage);
+		err = read_xattr_block(inode, txattr_addr);
+		if (err)
 			goto fail;
-		}
-
-		xattr_addr = page_address(xpage);
-		memcpy(txattr_addr + inline_size, xattr_addr, size);
-		f2fs_put_page(xpage, 1);
 	}
 
 	header = XATTR_HDR(txattr_addr);
@@ -392,10 +389,12 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize,
 {
 	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
 	size_t inline_size = inline_xattr_size(inode);
+	struct page *in_page = NULL;
 	void *xattr_addr;
+	void *inline_addr = NULL;
 	struct page *xpage;
 	nid_t new_nid = 0;
-	int err;
+	int err = 0;
 
 	if (hsize > inline_size && !F2FS_I(inode)->i_xattr_nid)
 		if (!alloc_nid(sbi, &new_nid))
@@ -403,30 +402,30 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize,
 
 	/* write to inline xattr */
 	if (inline_size) {
-		struct page *page = NULL;
-		void *inline_addr;
-
 		if (ipage) {
-			inline_addr = inline_xattr_addr(ipage);
-			f2fs_wait_on_page_writeback(ipage, NODE, true);
-			set_page_dirty(ipage);
+			inline_addr = inline_xattr_addr(inode, ipage);
 		} else {
-			page = get_node_page(sbi, inode->i_ino);
-			if (IS_ERR(page)) {
+			in_page = get_node_page(sbi, inode->i_ino);
+			if (IS_ERR(in_page)) {
 				alloc_nid_failed(sbi, new_nid);
-				return PTR_ERR(page);
+				return PTR_ERR(in_page);
 			}
-			inline_addr = inline_xattr_addr(page);
-			f2fs_wait_on_page_writeback(page, NODE, true);
+			inline_addr = inline_xattr_addr(inode, in_page);
 		}
-		memcpy(inline_addr, txattr_addr, inline_size);
-		f2fs_put_page(page, 1);
 
+		f2fs_wait_on_page_writeback(ipage ? ipage : in_page,
+							NODE, true);
 		/* no need to use xattr node block */
 		if (hsize <= inline_size) {
-			err = truncate_xattr_node(inode, ipage);
+			err = truncate_xattr_node(inode);
 			alloc_nid_failed(sbi, new_nid);
-			return err;
+			if (err) {
+				f2fs_put_page(in_page, 1);
+				return err;
+			}
+			memcpy(inline_addr, txattr_addr, inline_size);
+			set_page_dirty(ipage ? ipage : in_page);
+			goto in_page_out;
 		}
 	}
 
@@ -435,7 +434,7 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize,
 		xpage = get_node_page(sbi, F2FS_I(inode)->i_xattr_nid);
 		if (IS_ERR(xpage)) {
 			alloc_nid_failed(sbi, new_nid);
-			return PTR_ERR(xpage);
+			goto in_page_out;
 		}
 		f2fs_bug_on(sbi, new_nid);
 		f2fs_wait_on_page_writeback(xpage, NODE, true);
@@ -445,17 +444,24 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize,
 		xpage = new_node_page(&dn, XATTR_NODE_OFFSET);
 		if (IS_ERR(xpage)) {
 			alloc_nid_failed(sbi, new_nid);
-			return PTR_ERR(xpage);
+			goto in_page_out;
 		}
 		alloc_nid_done(sbi, new_nid);
 	}
-
 	xattr_addr = page_address(xpage);
+
+	if (inline_size)
+		memcpy(inline_addr, txattr_addr, inline_size);
 	memcpy(xattr_addr, txattr_addr + inline_size, VALID_XATTR_BLOCK_SIZE);
+
+	if (inline_size)
+		set_page_dirty(ipage ? ipage : in_page);
 	set_page_dirty(xpage);
-	f2fs_put_page(xpage, 1);
 
-	return 0;
+	f2fs_put_page(xpage, 1);
+in_page_out:
+	f2fs_put_page(in_page, 1);
+	return err;
 }
 
 int f2fs_getxattr(struct inode *inode, int index, const char *name,
@@ -681,6 +687,10 @@ int f2fs_setxattr(struct inode *inode, int index, const char *name,
 	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
 	int err;
 
+	err = dquot_initialize(inode);
+	if (err)
+		return err;
+
 	/* this case is only from init_inode_metadata */
 	if (ipage)
 		return __f2fs_setxattr(inode, index, name, value,
diff --git a/fs/fcntl.c b/fs/fcntl.c
index 30f47d0f74a0..0522e283a4f4 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -563,6 +563,9 @@ static int put_compat_flock64(const struct flock *kfl, struct compat_flock64 __u
 {
 	struct compat_flock64 fl;
 
+	BUILD_BUG_ON(sizeof(kfl->l_start) > sizeof(ufl->l_start));
+	BUILD_BUG_ON(sizeof(kfl->l_len) > sizeof(ufl->l_len));
+
 	memset(&fl, 0, sizeof(struct compat_flock64));
 	copy_flock_fields(&fl, kfl);
 	if (copy_to_user(ufl, &fl, sizeof(struct compat_flock64)))
@@ -632,9 +635,8 @@ COMPAT_SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd,
 		if (err)
 			break;
 		err = fixup_compat_flock(&flock);
-		if (err)
-			return err;
-		err = put_compat_flock(&flock, compat_ptr(arg));
+		if (!err)
+			err = put_compat_flock(&flock, compat_ptr(arg));
 		break;
 	case F_GETLK64:
 	case F_OFD_GETLK:
@@ -642,12 +644,8 @@ COMPAT_SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd,
 		if (err)
 			break;
 		err = fcntl_getlk(f.file, convert_fcntl_cmd(cmd), &flock);
-		if (err)
-			break;
-		err = fixup_compat_flock(&flock);
-		if (err)
-			return err;
-		err = put_compat_flock64(&flock, compat_ptr(arg));
+		if (!err)
+			err = put_compat_flock64(&flock, compat_ptr(arg));
 		break;
 	case F_SETLK:
 	case F_SETLKW:
diff --git a/fs/fhandle.c b/fs/fhandle.c
index 474adc8d2a3a..0ace128f5d23 100644
--- a/fs/fhandle.c
+++ b/fs/fhandle.c
@@ -213,8 +213,8 @@ out_err:
 	return retval;
 }
 
-long do_handle_open(int mountdirfd,
-		    struct file_handle __user *ufh, int open_flag)
+static long do_handle_open(int mountdirfd, struct file_handle __user *ufh,
+			   int open_flag)
 {
 	long retval = 0;
 	struct path path;
diff --git a/fs/file.c b/fs/file.c
index 4eecbf4244a5..3b080834b870 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -593,13 +593,16 @@ void __fd_install(struct files_struct *files, unsigned int fd,
 {
 	struct fdtable *fdt;
 
-	might_sleep();
 	rcu_read_lock_sched();
 
-	while (unlikely(files->resize_in_progress)) {
+	if (unlikely(files->resize_in_progress)) {
 		rcu_read_unlock_sched();
-		wait_event(files->resize_wait, !files->resize_in_progress);
-		rcu_read_lock_sched();
+		spin_lock(&files->file_lock);
+		fdt = files_fdtable(files);
+		BUG_ON(fdt->fd[fd] != NULL);
+		rcu_assign_pointer(fdt->fd[fd], file);
+		spin_unlock(&files->file_lock);
+		return;
 	}
 	/* coupled with smp_wmb() in expand_fdtable() */
 	smp_rmb();
@@ -632,7 +635,6 @@ int __close_fd(struct files_struct *files, unsigned fd)
 	if (!file)
 		goto out_unlock;
 	rcu_assign_pointer(fdt->fd[fd], NULL);
-	__clear_close_on_exec(fd, fdt);
 	__put_unused_fd(files, fd);
 	spin_unlock(&files->file_lock);
 	return filp_close(file, files);
diff --git a/fs/file_table.c b/fs/file_table.c
index 49e1f2f1a4cb..2dc9f38bd195 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -312,7 +312,7 @@ void put_filp(struct file *file)
 void __init files_init(void)
 {
 	filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0,
-			SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
+			SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT, NULL);
 	percpu_counter_init(&nr_files, 0, GFP_KERNEL);
 }
 
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 245c430a2e41..08f5debd07d1 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -933,33 +933,36 @@ static void bdi_split_work_to_wbs(struct backing_dev_info *bdi,
 
 #endif	/* CONFIG_CGROUP_WRITEBACK */
 
-void wb_start_writeback(struct bdi_writeback *wb, long nr_pages,
-			bool range_cyclic, enum wb_reason reason)
+/*
+ * Add in the number of potentially dirty inodes, because each inode
+ * write can dirty pagecache in the underlying blockdev.
+ */
+static unsigned long get_nr_dirty_pages(void)
 {
-	struct wb_writeback_work *work;
+	return global_node_page_state(NR_FILE_DIRTY) +
+		global_node_page_state(NR_UNSTABLE_NFS) +
+		get_nr_dirty_inodes();
+}
 
+static void wb_start_writeback(struct bdi_writeback *wb, enum wb_reason reason)
+{
 	if (!wb_has_dirty_io(wb))
 		return;
 
 	/*
-	 * This is WB_SYNC_NONE writeback, so if allocation fails just
-	 * wakeup the thread for old dirty data writeback
+	 * All callers of this function want to start writeback of all
+	 * dirty pages. Places like vmscan can call this at a very
+	 * high frequency, causing pointless allocations of tons of
+	 * work items and keeping the flusher threads busy retrieving
+	 * that work. Ensure that we only allow one of them pending and
+	 * inflight at the time.
 	 */
-	work = kzalloc(sizeof(*work),
-		       GFP_NOWAIT | __GFP_NOMEMALLOC | __GFP_NOWARN);
-	if (!work) {
-		trace_writeback_nowork(wb);
-		wb_wakeup(wb);
+	if (test_bit(WB_start_all, &wb->state) ||
+	    test_and_set_bit(WB_start_all, &wb->state))
 		return;
-	}
-
-	work->sync_mode	= WB_SYNC_NONE;
-	work->nr_pages	= nr_pages;
-	work->range_cyclic = range_cyclic;
-	work->reason	= reason;
-	work->auto_free	= 1;
 
-	wb_queue_work(wb, work);
+	wb->start_all_reason = reason;
+	wb_wakeup(wb);
 }
 
 /**
@@ -1814,17 +1817,6 @@ static struct wb_writeback_work *get_next_work_item(struct bdi_writeback *wb)
 	return work;
 }
 
-/*
- * Add in the number of potentially dirty inodes, because each inode
- * write can dirty pagecache in the underlying blockdev.
- */
-static unsigned long get_nr_dirty_pages(void)
-{
-	return global_node_page_state(NR_FILE_DIRTY) +
-		global_node_page_state(NR_UNSTABLE_NFS) +
-		get_nr_dirty_inodes();
-}
-
 static long wb_check_background_flush(struct bdi_writeback *wb)
 {
 	if (wb_over_bg_thresh(wb)) {
@@ -1877,6 +1869,30 @@ static long wb_check_old_data_flush(struct bdi_writeback *wb)
 	return 0;
 }
 
+static long wb_check_start_all(struct bdi_writeback *wb)
+{
+	long nr_pages;
+
+	if (!test_bit(WB_start_all, &wb->state))
+		return 0;
+
+	nr_pages = get_nr_dirty_pages();
+	if (nr_pages) {
+		struct wb_writeback_work work = {
+			.nr_pages	= wb_split_bdi_pages(wb, nr_pages),
+			.sync_mode	= WB_SYNC_NONE,
+			.range_cyclic	= 1,
+			.reason		= wb->start_all_reason,
+		};
+
+		nr_pages = wb_writeback(wb, &work);
+	}
+
+	clear_bit(WB_start_all, &wb->state);
+	return nr_pages;
+}
+
+
 /*
  * Retrieve work items and do the writeback they describe
  */
@@ -1893,6 +1909,11 @@ static long wb_do_writeback(struct bdi_writeback *wb)
 	}
 
 	/*
+	 * Check for a flush-everything request
+	 */
+	wrote += wb_check_start_all(wb);
+
+	/*
 	 * Check for periodic writeback, kupdated() style
 	 */
 	wrote += wb_check_old_data_flush(wb);
@@ -1947,10 +1968,33 @@ void wb_workfn(struct work_struct *work)
 }
 
 /*
- * Start writeback of `nr_pages' pages.  If `nr_pages' is zero, write back
- * the whole world.
+ * Start writeback of `nr_pages' pages on this bdi. If `nr_pages' is zero,
+ * write back the whole world.
  */
-void wakeup_flusher_threads(long nr_pages, enum wb_reason reason)
+static void __wakeup_flusher_threads_bdi(struct backing_dev_info *bdi,
+					 enum wb_reason reason)
+{
+	struct bdi_writeback *wb;
+
+	if (!bdi_has_dirty_io(bdi))
+		return;
+
+	list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node)
+		wb_start_writeback(wb, reason);
+}
+
+void wakeup_flusher_threads_bdi(struct backing_dev_info *bdi,
+				enum wb_reason reason)
+{
+	rcu_read_lock();
+	__wakeup_flusher_threads_bdi(bdi, reason);
+	rcu_read_unlock();
+}
+
+/*
+ * Wakeup the flusher threads to start writeback of all currently dirty pages
+ */
+void wakeup_flusher_threads(enum wb_reason reason)
 {
 	struct backing_dev_info *bdi;
 
@@ -1960,20 +2004,9 @@ void wakeup_flusher_threads(long nr_pages, enum wb_reason reason)
 	if (blk_needs_flush_plug(current))
 		blk_schedule_flush_plug(current);
 
-	if (!nr_pages)
-		nr_pages = get_nr_dirty_pages();
-
 	rcu_read_lock();
-	list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
-		struct bdi_writeback *wb;
-
-		if (!bdi_has_dirty_io(bdi))
-			continue;
-
-		list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node)
-			wb_start_writeback(wb, wb_split_bdi_pages(wb, nr_pages),
-					   false, reason);
-	}
+	list_for_each_entry_rcu(bdi, &bdi_list, bdi_list)
+		__wakeup_flusher_threads_bdi(bdi, reason);
 	rcu_read_unlock();
 }
 
@@ -2343,37 +2376,19 @@ void writeback_inodes_sb(struct super_block *sb, enum wb_reason reason)
 EXPORT_SYMBOL(writeback_inodes_sb);
 
 /**
- * try_to_writeback_inodes_sb_nr - try to start writeback if none underway
+ * try_to_writeback_inodes_sb - try to start writeback if none underway
  * @sb: the superblock
- * @nr: the number of pages to write
- * @reason: the reason of writeback
+ * @reason: reason why some writeback work was initiated
  *
- * Invoke writeback_inodes_sb_nr if no writeback is currently underway.
- * Returns 1 if writeback was started, 0 if not.
+ * Invoke __writeback_inodes_sb_nr if no writeback is currently underway.
  */
-bool try_to_writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr,
-				   enum wb_reason reason)
+void try_to_writeback_inodes_sb(struct super_block *sb, enum wb_reason reason)
 {
 	if (!down_read_trylock(&sb->s_umount))
-		return false;
+		return;
 
-	__writeback_inodes_sb_nr(sb, nr, reason, true);
+	__writeback_inodes_sb_nr(sb, get_nr_dirty_pages(), reason, true);
 	up_read(&sb->s_umount);
-	return true;
-}
-EXPORT_SYMBOL(try_to_writeback_inodes_sb_nr);
-
-/**
- * try_to_writeback_inodes_sb - try to start writeback if none underway
- * @sb: the superblock
- * @reason: reason why some writeback work was initiated
- *
- * Implement by try_to_writeback_inodes_sb_nr()
- * Returns 1 if writeback was started, 0 if not.
- */
-bool try_to_writeback_inodes_sb(struct super_block *sb, enum wb_reason reason)
-{
-	return try_to_writeback_inodes_sb_nr(sb, get_nr_dirty_pages(), reason);
 }
 EXPORT_SYMBOL(try_to_writeback_inodes_sb);
 
diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c
index 40d61077bead..ff84258132bb 100644
--- a/fs/fscache/cookie.c
+++ b/fs/fscache/cookie.c
@@ -558,7 +558,7 @@ void __fscache_disable_cookie(struct fscache_cookie *cookie, bool invalidate)
 	 * have completed.
 	 */
 	if (!atomic_dec_and_test(&cookie->n_active))
-		wait_on_atomic_t(&cookie->n_active, fscache_wait_atomic_t,
+		wait_on_atomic_t(&cookie->n_active, atomic_t_wait,
 				 TASK_UNINTERRUPTIBLE);
 
 	/* Make sure any pending writes are cancelled. */
diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h
index 97ec45110957..0ff4b49a0037 100644
--- a/fs/fscache/internal.h
+++ b/fs/fscache/internal.h
@@ -97,8 +97,6 @@ static inline bool fscache_object_congested(void)
 	return workqueue_congested(WORK_CPU_UNBOUND, fscache_object_wq);
 }
 
-extern int fscache_wait_atomic_t(atomic_t *);
-
 /*
  * object.c
  */
diff --git a/fs/fscache/main.c b/fs/fscache/main.c
index b39d487ccfb0..249968dcbf5c 100644
--- a/fs/fscache/main.c
+++ b/fs/fscache/main.c
@@ -195,12 +195,3 @@ static void __exit fscache_exit(void)
 }
 
 module_exit(fscache_exit);
-
-/*
- * wait_on_atomic_t() sleep function for uninterruptible waiting
- */
-int fscache_wait_atomic_t(atomic_t *p)
-{
-	schedule();
-	return 0;
-}
diff --git a/fs/fscache/page.c b/fs/fscache/page.c
index 0ad3fd3ad0b4..961029e04027 100644
--- a/fs/fscache/page.c
+++ b/fs/fscache/page.c
@@ -1175,7 +1175,7 @@ void __fscache_uncache_all_inode_pages(struct fscache_cookie *cookie,
 		return;
 	}
 
-	pagevec_init(&pvec, 0);
+	pagevec_init(&pvec);
 	next = 0;
 	do {
 		if (!pagevec_lookup(&pvec, mapping, &next))
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index a42d89371748..17f0d05bfd4c 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -1636,7 +1636,7 @@ out_finish:
 
 static void fuse_retrieve_end(struct fuse_conn *fc, struct fuse_req *req)
 {
-	release_pages(req->pages, req->num_pages, false);
+	release_pages(req->pages, req->num_pages);
 }
 
 static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode,
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 94a745acaef8..2f504d615d92 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -31,7 +31,7 @@ static struct kmem_cache *fuse_inode_cachep;
 struct list_head fuse_conn_list;
 DEFINE_MUTEX(fuse_mutex);
 
-static int set_global_limit(const char *val, struct kernel_param *kp);
+static int set_global_limit(const char *val, const struct kernel_param *kp);
 
 unsigned max_user_bgreq;
 module_param_call(max_user_bgreq, set_global_limit, param_get_uint,
@@ -823,7 +823,7 @@ static void sanitize_global_limit(unsigned *limit)
 		*limit = (1 << 16) - 1;
 }
 
-static int set_global_limit(const char *val, struct kernel_param *kp)
+static int set_global_limit(const char *val, const struct kernel_param *kp)
 {
 	int rv;
 
@@ -1273,9 +1273,9 @@ static int __init fuse_fs_init(void)
 	int err;
 
 	fuse_inode_cachep = kmem_cache_create("fuse_inode",
-					      sizeof(struct fuse_inode), 0,
-					      SLAB_HWCACHE_ALIGN|SLAB_ACCOUNT,
-					      fuse_inode_init_once);
+			sizeof(struct fuse_inode), 0,
+			SLAB_HWCACHE_ALIGN|SLAB_ACCOUNT|SLAB_RECLAIM_ACCOUNT,
+			fuse_inode_init_once);
 	err = -ENOMEM;
 	if (!fuse_inode_cachep)
 		goto out;
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 68ed06962537..1daf15a1f00c 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -280,22 +280,6 @@ static int gfs2_write_jdata_pagevec(struct address_space *mapping,
 	for(i = 0; i < nr_pages; i++) {
 		struct page *page = pvec->pages[i];
 
-		/*
-		 * At this point, the page may be truncated or
-		 * invalidated (changing page->mapping to NULL), or
-		 * even swizzled back from swapper_space to tmpfs file
-		 * mapping. However, page->index will not change
-		 * because we have a reference on the page.
-		 */
-		if (page->index > end) {
-			/*
-			 * can't be range_cyclic (1st pass) because
-			 * end == -1 in that case.
-			 */
-			ret = 1;
-			break;
-		}
-
 		*done_index = page->index;
 
 		lock_page(page);
@@ -387,7 +371,7 @@ static int gfs2_write_cache_jdata(struct address_space *mapping,
 	int range_whole = 0;
 	int tag;
 
-	pagevec_init(&pvec, 0);
+	pagevec_init(&pvec);
 	if (wbc->range_cyclic) {
 		writeback_index = mapping->writeback_index; /* prev offset */
 		index = writeback_index;
@@ -413,8 +397,8 @@ retry:
 		tag_pages_for_writeback(mapping, index, end);
 	done_index = index;
 	while (!done && (index <= end)) {
-		nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
-			      min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
+		nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end,
+				tag);
 		if (nr_pages == 0)
 			break;
 
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index ed113ea17aff..1e76730aac0d 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -407,7 +407,7 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
 
 	memset(&pseudo_vma, 0, sizeof(struct vm_area_struct));
 	pseudo_vma.vm_flags = (VM_HUGETLB | VM_MAYSHARE | VM_SHARED);
-	pagevec_init(&pvec, 0);
+	pagevec_init(&pvec);
 	next = start;
 	while (next < end) {
 		/*
@@ -668,7 +668,6 @@ static int hugetlbfs_setattr(struct dentry *dentry, struct iattr *attr)
 		return error;
 
 	if (ia_valid & ATTR_SIZE) {
-		error = -EINVAL;
 		if (attr->ia_size & ~huge_page_mask(h))
 			return -EINVAL;
 		error = hugetlb_vmtruncate(inode, attr->ia_size);
diff --git a/fs/internal.h b/fs/internal.h
index 48cee21b4f14..df262f41a0ef 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -55,6 +55,7 @@ extern void __init chrdev_init(void);
 extern int user_path_mountpoint_at(int, const char __user *, unsigned int, struct path *);
 extern int vfs_path_lookup(struct dentry *, struct vfsmount *,
 			   const char *, unsigned int, struct path *);
+long do_unlinkat(int dfd, struct filename *name);
 
 /*
  * namespace.c
diff --git a/fs/iomap.c b/fs/iomap.c
index 5011a964a550..47d29ccffaef 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -856,6 +856,7 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length,
 	struct bio *bio;
 	bool need_zeroout = false;
 	int nr_pages, ret;
+	size_t copied = 0;
 
 	if ((pos | length | align) & ((1 << blkbits) - 1))
 		return -EINVAL;
@@ -867,7 +868,7 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length,
 		/*FALLTHRU*/
 	case IOMAP_UNWRITTEN:
 		if (!(dio->flags & IOMAP_DIO_WRITE)) {
-			iov_iter_zero(length, dio->submit.iter);
+			length = iov_iter_zero(length, dio->submit.iter);
 			dio->size += length;
 			return length;
 		}
@@ -904,8 +905,11 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length,
 	}
 
 	do {
-		if (dio->error)
+		size_t n;
+		if (dio->error) {
+			iov_iter_revert(dio->submit.iter, copied);
 			return 0;
+		}
 
 		bio = bio_alloc(GFP_KERNEL, nr_pages);
 		bio_set_dev(bio, iomap->bdev);
@@ -918,20 +922,24 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length,
 		ret = bio_iov_iter_get_pages(bio, &iter);
 		if (unlikely(ret)) {
 			bio_put(bio);
-			return ret;
+			return copied ? copied : ret;
 		}
 
+		n = bio->bi_iter.bi_size;
 		if (dio->flags & IOMAP_DIO_WRITE) {
 			bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_SYNC | REQ_IDLE);
-			task_io_account_write(bio->bi_iter.bi_size);
+			task_io_account_write(n);
 		} else {
 			bio_set_op_attrs(bio, REQ_OP_READ, 0);
 			if (dio->flags & IOMAP_DIO_DIRTY)
 				bio_set_pages_dirty(bio);
 		}
 
-		dio->size += bio->bi_iter.bi_size;
-		pos += bio->bi_iter.bi_size;
+		iov_iter_advance(dio->submit.iter, n);
+
+		dio->size += n;
+		pos += n;
+		copied += n;
 
 		nr_pages = iov_iter_npages(&iter, BIO_MAX_PAGES);
 
@@ -947,9 +955,7 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length,
 		if (pad)
 			iomap_dio_zero(dio, iomap, pos, fs_block_size - pad);
 	}
-
-	iov_iter_advance(dio->submit.iter, length);
-	return length;
+	return copied;
 }
 
 ssize_t
@@ -1057,7 +1063,7 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
 
 			if (!(iocb->ki_flags & IOCB_HIPRI) ||
 			    !dio->submit.last_queue ||
-			    !blk_mq_poll(dio->submit.last_queue,
+			    !blk_poll(dio->submit.last_queue,
 					 dio->submit.cookie))
 				io_schedule();
 		}
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index d2a85c9720e9..67546c7ad473 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -737,6 +737,23 @@ int jbd2_log_wait_commit(journal_t *journal, tid_t tid)
 	return err;
 }
 
+/* Return 1 when transaction with given tid has already committed. */
+int jbd2_transaction_committed(journal_t *journal, tid_t tid)
+{
+	int ret = 1;
+
+	read_lock(&journal->j_state_lock);
+	if (journal->j_running_transaction &&
+	    journal->j_running_transaction->t_tid == tid)
+		ret = 0;
+	if (journal->j_committing_transaction &&
+	    journal->j_committing_transaction->t_tid == tid)
+		ret = 0;
+	read_unlock(&journal->j_state_lock);
+	return ret;
+}
+EXPORT_SYMBOL(jbd2_transaction_committed);
+
 /*
  * When this function returns the transaction corresponding to tid
  * will be completed.  If the transaction has currently running, start
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index b995bdc13976..b837fb7e290a 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -602,7 +602,7 @@ static struct ctl_table nlm_sysctl_root[] = {
  */
 
 #define param_set_min_max(name, type, which_strtol, min, max)		\
-static int param_set_##name(const char *val, struct kernel_param *kp)	\
+static int param_set_##name(const char *val, const struct kernel_param *kp) \
 {									\
 	char *endp;							\
 	__typeof__(type) num = which_strtol(val, &endp, 0);		\
diff --git a/fs/namei.c b/fs/namei.c
index 5424b10cfdc4..f0c7a7b9b6ca 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -3459,7 +3459,7 @@ static int do_tmpfile(struct nameidata *nd, unsigned flags,
 		goto out;
 	child = vfs_tmpfile(path.dentry, op->mode, op->open_flag);
 	error = PTR_ERR(child);
-	if (unlikely(IS_ERR(child)))
+	if (IS_ERR(child))
 		goto out2;
 	dput(path.dentry);
 	path.dentry = child;
@@ -4010,10 +4010,9 @@ EXPORT_SYMBOL(vfs_unlink);
  * writeout happening, and we don't want to prevent access to the directory
  * while waiting on the I/O.
  */
-static long do_unlinkat(int dfd, const char __user *pathname)
+long do_unlinkat(int dfd, struct filename *name)
 {
 	int error;
-	struct filename *name;
 	struct dentry *dentry;
 	struct path path;
 	struct qstr last;
@@ -4022,8 +4021,7 @@ static long do_unlinkat(int dfd, const char __user *pathname)
 	struct inode *delegated_inode = NULL;
 	unsigned int lookup_flags = 0;
 retry:
-	name = filename_parentat(dfd, getname(pathname), lookup_flags,
-				&path, &last, &type);
+	name = filename_parentat(dfd, name, lookup_flags, &path, &last, &type);
 	if (IS_ERR(name))
 		return PTR_ERR(name);
 
@@ -4065,12 +4063,12 @@ exit2:
 	mnt_drop_write(path.mnt);
 exit1:
 	path_put(&path);
-	putname(name);
 	if (retry_estale(error, lookup_flags)) {
 		lookup_flags |= LOOKUP_REVAL;
 		inode = NULL;
 		goto retry;
 	}
+	putname(name);
 	return error;
 
 slashes:
@@ -4091,12 +4089,12 @@ SYSCALL_DEFINE3(unlinkat, int, dfd, const char __user *, pathname, int, flag)
 	if (flag & AT_REMOVEDIR)
 		return do_rmdir(dfd, pathname);
 
-	return do_unlinkat(dfd, pathname);
+	return do_unlinkat(dfd, getname(pathname));
 }
 
 SYSCALL_DEFINE1(unlink, const char __user *, pathname)
 {
-	return do_unlinkat(AT_FDCWD, pathname);
+	return do_unlinkat(AT_FDCWD, getname(pathname));
 }
 
 int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname)
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 134d9f560240..1629056aa2c9 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -85,9 +85,9 @@ int nfs_wait_bit_killable(struct wait_bit_key *key, int mode)
 }
 EXPORT_SYMBOL_GPL(nfs_wait_bit_killable);
 
-int nfs_wait_atomic_killable(atomic_t *p)
+int nfs_wait_atomic_killable(atomic_t *p, unsigned int mode)
 {
-	return nfs_wait_killable(TASK_KILLABLE);
+	return nfs_wait_killable(mode);
 }
 
 /**
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index f9a4a5524bd5..5ab17fd4700a 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -388,7 +388,7 @@ extern void nfs_evict_inode(struct inode *);
 void nfs_zap_acl_cache(struct inode *inode);
 extern bool nfs_check_cache_invalid(struct inode *, unsigned long);
 extern int nfs_wait_bit_killable(struct wait_bit_key *key, int mode);
-extern int nfs_wait_atomic_killable(atomic_t *p);
+extern int nfs_wait_atomic_killable(atomic_t *p, unsigned int mode);
 
 /* super.c */
 extern const struct super_operations nfs_sops;
diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c
index 06ffa135dfa6..16a7a67a11c9 100644
--- a/fs/nilfs2/btree.c
+++ b/fs/nilfs2/btree.c
@@ -2156,10 +2156,10 @@ static void nilfs_btree_lookup_dirty_buffers(struct nilfs_bmap *btree,
 	     level++)
 		INIT_LIST_HEAD(&lists[level]);
 
-	pagevec_init(&pvec, 0);
+	pagevec_init(&pvec);
 
-	while (pagevec_lookup_tag(&pvec, btcache, &index, PAGECACHE_TAG_DIRTY,
-				  PAGEVEC_SIZE)) {
+	while (pagevec_lookup_tag(&pvec, btcache, &index,
+					PAGECACHE_TAG_DIRTY)) {
 		for (i = 0; i < pagevec_count(&pvec); i++) {
 			bh = head = page_buffers(pvec.pages[i]);
 			do {
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index 8616c46d33da..68241512d7c1 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -255,10 +255,9 @@ int nilfs_copy_dirty_pages(struct address_space *dmap,
 	pgoff_t index = 0;
 	int err = 0;
 
-	pagevec_init(&pvec, 0);
+	pagevec_init(&pvec);
 repeat:
-	if (!pagevec_lookup_tag(&pvec, smap, &index, PAGECACHE_TAG_DIRTY,
-				PAGEVEC_SIZE))
+	if (!pagevec_lookup_tag(&pvec, smap, &index, PAGECACHE_TAG_DIRTY))
 		return 0;
 
 	for (i = 0; i < pagevec_count(&pvec); i++) {
@@ -310,7 +309,7 @@ void nilfs_copy_back_pages(struct address_space *dmap,
 	pgoff_t index = 0;
 	int err;
 
-	pagevec_init(&pvec, 0);
+	pagevec_init(&pvec);
 repeat:
 	n = pagevec_lookup(&pvec, smap, &index);
 	if (!n)
@@ -374,10 +373,10 @@ void nilfs_clear_dirty_pages(struct address_space *mapping, bool silent)
 	unsigned int i;
 	pgoff_t index = 0;
 
-	pagevec_init(&pvec, 0);
+	pagevec_init(&pvec);
 
-	while (pagevec_lookup_tag(&pvec, mapping, &index, PAGECACHE_TAG_DIRTY,
-				  PAGEVEC_SIZE)) {
+	while (pagevec_lookup_tag(&pvec, mapping, &index,
+					PAGECACHE_TAG_DIRTY)) {
 		for (i = 0; i < pagevec_count(&pvec); i++) {
 			struct page *page = pvec.pages[i];
 
@@ -519,7 +518,7 @@ unsigned long nilfs_find_uncommitted_extent(struct inode *inode,
 	index = start_blk >> (PAGE_SHIFT - inode->i_blkbits);
 	nblocks_in_page = 1U << (PAGE_SHIFT - inode->i_blkbits);
 
-	pagevec_init(&pvec, 0);
+	pagevec_init(&pvec);
 
 repeat:
 	pvec.nr = find_get_pages_contig(inode->i_mapping, index, PAGEVEC_SIZE,
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 70ded52dc1dd..f65392fecb5c 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -708,21 +708,17 @@ static size_t nilfs_lookup_dirty_data_buffers(struct inode *inode,
 		index = start >> PAGE_SHIFT;
 		last = end >> PAGE_SHIFT;
 	}
-	pagevec_init(&pvec, 0);
+	pagevec_init(&pvec);
  repeat:
 	if (unlikely(index > last) ||
-	    !pagevec_lookup_tag(&pvec, mapping, &index, PAGECACHE_TAG_DIRTY,
-				min_t(pgoff_t, last - index,
-				      PAGEVEC_SIZE - 1) + 1))
+	    !pagevec_lookup_range_tag(&pvec, mapping, &index, last,
+				PAGECACHE_TAG_DIRTY))
 		return ndirties;
 
 	for (i = 0; i < pagevec_count(&pvec); i++) {
 		struct buffer_head *bh, *head;
 		struct page *page = pvec.pages[i];
 
-		if (unlikely(page->index > last))
-			break;
-
 		lock_page(page);
 		if (!page_has_buffers(page))
 			create_empty_buffers(page, i_blocksize(inode), 0);
@@ -757,10 +753,10 @@ static void nilfs_lookup_dirty_node_buffers(struct inode *inode,
 	unsigned int i;
 	pgoff_t index = 0;
 
-	pagevec_init(&pvec, 0);
+	pagevec_init(&pvec);
 
-	while (pagevec_lookup_tag(&pvec, mapping, &index, PAGECACHE_TAG_DIRTY,
-				  PAGEVEC_SIZE)) {
+	while (pagevec_lookup_tag(&pvec, mapping, &index,
+					PAGECACHE_TAG_DIRTY)) {
 		for (i = 0; i < pagevec_count(&pvec); i++) {
 			bh = head = page_buffers(pvec.pages[i]);
 			do {
diff --git a/fs/notify/fanotify/Kconfig b/fs/notify/fanotify/Kconfig
index e5f911bd80d2..41355ce74ac0 100644
--- a/fs/notify/fanotify/Kconfig
+++ b/fs/notify/fanotify/Kconfig
@@ -21,6 +21,6 @@ config FANOTIFY_ACCESS_PERMISSIONS
 	   decisions concerning filesystem events.  This is used by some fanotify
 	   listeners which need to scan files before allowing the system access to
 	   use those files.  This is used by some anti-malware vendors and by some
-	   hierarchical storage managent systems.
+	   hierarchical storage management systems.
 
 	   If unsure, say N.
diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c
index cc91856b5e2d..3a2e509c77c5 100644
--- a/fs/ntfs/aops.c
+++ b/fs/ntfs/aops.c
@@ -1739,7 +1739,7 @@ void mark_ntfs_record_dirty(struct page *page, const unsigned int ofs) {
 	spin_lock(&mapping->private_lock);
 	if (unlikely(!page_has_buffers(page))) {
 		spin_unlock(&mapping->private_lock);
-		bh = head = alloc_page_buffers(page, bh_size, 1);
+		bh = head = alloc_page_buffers(page, bh_size, true);
 		spin_lock(&mapping->private_lock);
 		if (likely(!page_has_buffers(page))) {
 			struct buffer_head *tail;
diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c
index b6f402194f02..ee8392aee9f6 100644
--- a/fs/ntfs/mft.c
+++ b/fs/ntfs/mft.c
@@ -507,7 +507,7 @@ int ntfs_sync_mft_mirror(ntfs_volume *vol, const unsigned long mft_no,
 	if (unlikely(!page_has_buffers(page))) {
 		struct buffer_head *tail;
 
-		bh = head = alloc_page_buffers(page, blocksize, 1);
+		bh = head = alloc_page_buffers(page, blocksize, true);
 		do {
 			set_buffer_uptodate(bh);
 			tail = bh;
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index addd7c5f2d3e..ab5105f9767e 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -3585,8 +3585,6 @@ static int ocfs2_merge_rec_left(struct ocfs2_path *right_path,
 		 * The easy case - we can just plop the record right in.
 		 */
 		*left_rec = *split_rec;
-
-		has_empty_extent = 0;
 	} else
 		le16_add_cpu(&left_rec->e_leaf_clusters, split_clusters);
 
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 88a31e9340a0..d1516327b787 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -134,6 +134,19 @@ bail:
 	return err;
 }
 
+static int ocfs2_lock_get_block(struct inode *inode, sector_t iblock,
+		    struct buffer_head *bh_result, int create)
+{
+	int ret = 0;
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+
+	down_read(&oi->ip_alloc_sem);
+	ret = ocfs2_get_block(inode, iblock, bh_result, create);
+	up_read(&oi->ip_alloc_sem);
+
+	return ret;
+}
+
 int ocfs2_get_block(struct inode *inode, sector_t iblock,
 		    struct buffer_head *bh_result, int create)
 {
@@ -2128,7 +2141,7 @@ static void ocfs2_dio_free_write_ctx(struct inode *inode,
  * called like this: dio->get_blocks(dio->inode, fs_startblk,
  * 					fs_count, map_bh, dio->rw == WRITE);
  */
-static int ocfs2_dio_get_block(struct inode *inode, sector_t iblock,
+static int ocfs2_dio_wr_get_block(struct inode *inode, sector_t iblock,
 			       struct buffer_head *bh_result, int create)
 {
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
@@ -2154,12 +2167,9 @@ static int ocfs2_dio_get_block(struct inode *inode, sector_t iblock,
 	 * while file size will be changed.
 	 */
 	if (pos + total_len <= i_size_read(inode)) {
-		down_read(&oi->ip_alloc_sem);
-		/* This is the fast path for re-write. */
-		ret = ocfs2_get_block(inode, iblock, bh_result, create);
-
-		up_read(&oi->ip_alloc_sem);
 
+		/* This is the fast path for re-write. */
+		ret = ocfs2_lock_get_block(inode, iblock, bh_result, create);
 		if (buffer_mapped(bh_result) &&
 		    !buffer_new(bh_result) &&
 		    ret == 0)
@@ -2424,9 +2434,9 @@ static ssize_t ocfs2_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 		return 0;
 
 	if (iov_iter_rw(iter) == READ)
-		get_block = ocfs2_get_block;
+		get_block = ocfs2_lock_get_block;
 	else
-		get_block = ocfs2_dio_get_block;
+		get_block = ocfs2_dio_wr_get_block;
 
 	return __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev,
 				    iter, get_block,
diff --git a/fs/ocfs2/buffer_head_io.h b/fs/ocfs2/buffer_head_io.h
index b97bcc6dde7c..b1bb70c8ca4d 100644
--- a/fs/ocfs2/buffer_head_io.h
+++ b/fs/ocfs2/buffer_head_io.h
@@ -28,9 +28,6 @@
 
 #include <linux/buffer_head.h>
 
-void ocfs2_end_buffer_io_sync(struct buffer_head *bh,
-			     int uptodate);
-
 int ocfs2_write_block(struct ocfs2_super          *osb,
 		      struct buffer_head  *bh,
 		      struct ocfs2_caching_info   *ci);
diff --git a/fs/ocfs2/cluster/heartbeat.h b/fs/ocfs2/cluster/heartbeat.h
index 3ef5137dc362..a9e67efc0004 100644
--- a/fs/ocfs2/cluster/heartbeat.h
+++ b/fs/ocfs2/cluster/heartbeat.h
@@ -79,10 +79,8 @@ void o2hb_fill_node_map(unsigned long *map,
 			unsigned bytes);
 void o2hb_exit(void);
 int o2hb_init(void);
-int o2hb_check_node_heartbeating(u8 node_num);
 int o2hb_check_node_heartbeating_no_sem(u8 node_num);
 int o2hb_check_node_heartbeating_from_callback(u8 node_num);
-int o2hb_check_local_node_heartbeating(void);
 void o2hb_stop_all_regions(void);
 int o2hb_get_all_regions(char *region_uuids, u8 numregions);
 int o2hb_global_heartbeat_active(void);
diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c
index a51200ece93d..da64c3a20eeb 100644
--- a/fs/ocfs2/cluster/nodemanager.c
+++ b/fs/ocfs2/cluster/nodemanager.c
@@ -40,6 +40,9 @@ char *o2nm_fence_method_desc[O2NM_FENCE_METHODS] = {
 		"panic",	/* O2NM_FENCE_PANIC */
 };
 
+static inline void o2nm_lock_subsystem(void);
+static inline void o2nm_unlock_subsystem(void);
+
 struct o2nm_node *o2nm_get_node_by_num(u8 node_num)
 {
 	struct o2nm_node *node = NULL;
@@ -181,7 +184,10 @@ static struct o2nm_cluster *to_o2nm_cluster_from_node(struct o2nm_node *node)
 {
 	/* through the first node_set .parent
 	 * mycluster/nodes/mynode == o2nm_cluster->o2nm_node_group->o2nm_node */
-	return to_o2nm_cluster(node->nd_item.ci_parent->ci_parent);
+	if (node->nd_item.ci_parent)
+		return to_o2nm_cluster(node->nd_item.ci_parent->ci_parent);
+	else
+		return NULL;
 }
 
 enum {
@@ -194,7 +200,7 @@ static ssize_t o2nm_node_num_store(struct config_item *item, const char *page,
 				   size_t count)
 {
 	struct o2nm_node *node = to_o2nm_node(item);
-	struct o2nm_cluster *cluster = to_o2nm_cluster_from_node(node);
+	struct o2nm_cluster *cluster;
 	unsigned long tmp;
 	char *p = (char *)page;
 	int ret = 0;
@@ -214,6 +220,13 @@ static ssize_t o2nm_node_num_store(struct config_item *item, const char *page,
 	    !test_bit(O2NM_NODE_ATTR_PORT, &node->nd_set_attributes))
 		return -EINVAL; /* XXX */
 
+	o2nm_lock_subsystem();
+	cluster = to_o2nm_cluster_from_node(node);
+	if (!cluster) {
+		o2nm_unlock_subsystem();
+		return -EINVAL;
+	}
+
 	write_lock(&cluster->cl_nodes_lock);
 	if (cluster->cl_nodes[tmp])
 		ret = -EEXIST;
@@ -226,6 +239,8 @@ static ssize_t o2nm_node_num_store(struct config_item *item, const char *page,
 		set_bit(tmp, cluster->cl_nodes_bitmap);
 	}
 	write_unlock(&cluster->cl_nodes_lock);
+	o2nm_unlock_subsystem();
+
 	if (ret)
 		return ret;
 
@@ -269,7 +284,7 @@ static ssize_t o2nm_node_ipv4_address_store(struct config_item *item,
 					    size_t count)
 {
 	struct o2nm_node *node = to_o2nm_node(item);
-	struct o2nm_cluster *cluster = to_o2nm_cluster_from_node(node);
+	struct o2nm_cluster *cluster;
 	int ret, i;
 	struct rb_node **p, *parent;
 	unsigned int octets[4];
@@ -286,6 +301,13 @@ static ssize_t o2nm_node_ipv4_address_store(struct config_item *item,
 		be32_add_cpu(&ipv4_addr, octets[i] << (i * 8));
 	}
 
+	o2nm_lock_subsystem();
+	cluster = to_o2nm_cluster_from_node(node);
+	if (!cluster) {
+		o2nm_unlock_subsystem();
+		return -EINVAL;
+	}
+
 	ret = 0;
 	write_lock(&cluster->cl_nodes_lock);
 	if (o2nm_node_ip_tree_lookup(cluster, ipv4_addr, &p, &parent))
@@ -298,6 +320,8 @@ static ssize_t o2nm_node_ipv4_address_store(struct config_item *item,
 		rb_insert_color(&node->nd_ip_node, &cluster->cl_node_ip_tree);
 	}
 	write_unlock(&cluster->cl_nodes_lock);
+	o2nm_unlock_subsystem();
+
 	if (ret)
 		return ret;
 
@@ -315,7 +339,7 @@ static ssize_t o2nm_node_local_store(struct config_item *item, const char *page,
 				     size_t count)
 {
 	struct o2nm_node *node = to_o2nm_node(item);
-	struct o2nm_cluster *cluster = to_o2nm_cluster_from_node(node);
+	struct o2nm_cluster *cluster;
 	unsigned long tmp;
 	char *p = (char *)page;
 	ssize_t ret;
@@ -333,17 +357,26 @@ static ssize_t o2nm_node_local_store(struct config_item *item, const char *page,
 	    !test_bit(O2NM_NODE_ATTR_PORT, &node->nd_set_attributes))
 		return -EINVAL; /* XXX */
 
+	o2nm_lock_subsystem();
+	cluster = to_o2nm_cluster_from_node(node);
+	if (!cluster) {
+		ret = -EINVAL;
+		goto out;
+	}
+
 	/* the only failure case is trying to set a new local node
 	 * when a different one is already set */
 	if (tmp && tmp == cluster->cl_has_local &&
-	    cluster->cl_local_node != node->nd_num)
-		return -EBUSY;
+	    cluster->cl_local_node != node->nd_num) {
+		ret = -EBUSY;
+		goto out;
+	}
 
 	/* bring up the rx thread if we're setting the new local node. */
 	if (tmp && !cluster->cl_has_local) {
 		ret = o2net_start_listening(node);
 		if (ret)
-			return ret;
+			goto out;
 	}
 
 	if (!tmp && cluster->cl_has_local &&
@@ -358,7 +391,11 @@ static ssize_t o2nm_node_local_store(struct config_item *item, const char *page,
 		cluster->cl_local_node = node->nd_num;
 	}
 
-	return count;
+	ret = count;
+
+out:
+	o2nm_unlock_subsystem();
+	return ret;
 }
 
 CONFIGFS_ATTR(o2nm_node_, num);
@@ -738,6 +775,16 @@ static struct o2nm_cluster_group o2nm_cluster_group = {
 	},
 };
 
+static inline void o2nm_lock_subsystem(void)
+{
+	mutex_lock(&o2nm_cluster_group.cs_subsys.su_mutex);
+}
+
+static inline void o2nm_unlock_subsystem(void)
+{
+	mutex_unlock(&o2nm_cluster_group.cs_subsys.su_mutex);
+}
+
 int o2nm_depend_item(struct config_item *item)
 {
 	return configfs_depend_item(&o2nm_cluster_group.cs_subsys, item);
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index a2b19fbdcf46..e1fea149f50b 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -394,7 +394,6 @@ int dlm_domain_fully_joined(struct dlm_ctxt *dlm)
 static void dlm_destroy_dlm_worker(struct dlm_ctxt *dlm)
 {
 	if (dlm->dlm_worker) {
-		flush_workqueue(dlm->dlm_worker);
 		destroy_workqueue(dlm->dlm_worker);
 		dlm->dlm_worker = NULL;
 	}
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 3e04279446e8..9c3e0f13ca87 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -2616,7 +2616,9 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm,
 	 * otherwise the assert_master from the new
 	 * master will destroy this.
 	 */
-	dlm_get_mle_inuse(mle);
+	if (ret != -EEXIST)
+		dlm_get_mle_inuse(mle);
+
 	spin_unlock(&dlm->master_lock);
 	spin_unlock(&dlm->spinlock);
 
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 74407c6dd592..ec8f75813beb 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -2419,6 +2419,7 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node)
 					dlm_lockres_put(res);
 					continue;
 				}
+				dlm_move_lockres_to_recovery_list(dlm, res);
 			} else if (res->owner == dlm->node_num) {
 				dlm_free_dead_locks(dlm, res, dead_node);
 				__dlm_lockres_calc_usage(dlm, res);
diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c
index 9ab9e1892b5f..9c7c18c0e129 100644
--- a/fs/ocfs2/dlmfs/dlmfs.c
+++ b/fs/ocfs2/dlmfs/dlmfs.c
@@ -88,13 +88,13 @@ struct workqueue_struct *user_dlm_worker;
  */
 #define DLMFS_CAPABILITIES "bast stackglue"
 static int param_set_dlmfs_capabilities(const char *val,
-					struct kernel_param *kp)
+					const struct kernel_param *kp)
 {
 	printk(KERN_ERR "%s: readonly parameter\n", kp->name);
 	return -EINVAL;
 }
 static int param_get_dlmfs_capabilities(char *buffer,
-					struct kernel_param *kp)
+					const struct kernel_param *kp)
 {
 	return strlcpy(buffer, DLMFS_CAPABILITIES,
 		       strlen(DLMFS_CAPABILITIES) + 1);
@@ -670,7 +670,6 @@ static void __exit exit_dlmfs_fs(void)
 {
 	unregister_filesystem(&dlmfs_fs_type);
 
-	flush_workqueue(user_dlm_worker);
 	destroy_workqueue(user_dlm_worker);
 
 	/*
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 6e41fc8fabbe..dc455d45a66a 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1161,6 +1161,13 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
 	}
 	size_change = S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_SIZE;
 	if (size_change) {
+		/*
+		 * Here we should wait dio to finish before inode lock
+		 * to avoid a deadlock between ocfs2_setattr() and
+		 * ocfs2_dio_end_io_write()
+		 */
+		inode_dio_wait(inode);
+
 		status = ocfs2_rw_lock(inode, 1);
 		if (status < 0) {
 			mlog_errno(status);
@@ -1200,8 +1207,6 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
 		if (status)
 			goto bail_unlock;
 
-		inode_dio_wait(inode);
-
 		if (i_size_read(inode) >= attr->ia_size) {
 			if (ocfs2_should_order_data(inode)) {
 				status = ocfs2_begin_ordered_truncate(inode,
diff --git a/fs/ocfs2/filecheck.c b/fs/ocfs2/filecheck.c
index 2cabbcf2f28e..e87279e49ba3 100644
--- a/fs/ocfs2/filecheck.c
+++ b/fs/ocfs2/filecheck.c
@@ -129,19 +129,13 @@ static struct kobj_attribute ocfs2_attr_filecheck_set =
 					ocfs2_filecheck_show,
 					ocfs2_filecheck_store);
 
-static int ocfs2_filecheck_sysfs_wait(atomic_t *p)
-{
-	schedule();
-	return 0;
-}
-
 static void
 ocfs2_filecheck_sysfs_free(struct ocfs2_filecheck_sysfs_entry *entry)
 {
 	struct ocfs2_filecheck_entry *p;
 
 	if (!atomic_dec_and_test(&entry->fs_count))
-		wait_on_atomic_t(&entry->fs_count, ocfs2_filecheck_sysfs_wait,
+		wait_on_atomic_t(&entry->fs_count, atomic_t_wait,
 				 TASK_UNINTERRUPTIBLE);
 
 	spin_lock(&entry->fs_fcheck->fc_lock);
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index 71f22c8fbffd..9f0b95abc09f 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -1147,12 +1147,9 @@ int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb,
 					     GLOBAL_BITMAP_SYSTEM_INODE,
 					     OCFS2_INVALID_SLOT, NULL,
 					     ALLOC_NEW_GROUP);
-	if (status < 0 && status != -ENOSPC) {
+	if (status < 0 && status != -ENOSPC)
 		mlog_errno(status);
-		goto bail;
-	}
 
-bail:
 	return status;
 }
 
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 80733496b22a..040bbb6a6e4b 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -2521,10 +2521,8 @@ static void ocfs2_delete_osb(struct ocfs2_super *osb)
 	/* This function assumes that the caller has the main osb resource */
 
 	/* ocfs2_initializer_super have already created this workqueue */
-	if (osb->ocfs2_wq) {
-		flush_workqueue(osb->ocfs2_wq);
+	if (osb->ocfs2_wq)
 		destroy_workqueue(osb->ocfs2_wq);
-	}
 
 	ocfs2_free_slot_info(osb);
 
diff --git a/fs/ocfs2/super.h b/fs/ocfs2/super.h
index b023e4f3d740..d4550c8bbc41 100644
--- a/fs/ocfs2/super.h
+++ b/fs/ocfs2/super.h
@@ -26,9 +26,6 @@
 #ifndef OCFS2_SUPER_H
 #define OCFS2_SUPER_H
 
-int ocfs2_publish_get_mount_state(struct ocfs2_super *osb,
-				  int node_num);
-
 __printf(3, 4)
 int __ocfs2_error(struct super_block *sb, const char *function,
 		   const char *fmt, ...);
diff --git a/fs/orangefs/orangefs-kernel.h b/fs/orangefs/orangefs-kernel.h
index 004af348fb80..f44d5eb74fcc 100644
--- a/fs/orangefs/orangefs-kernel.h
+++ b/fs/orangefs/orangefs-kernel.h
@@ -275,12 +275,6 @@ struct orangefs_kiocb_s {
 	/* orangefs kernel operation type */
 	struct orangefs_kernel_op_s *op;
 
-	/* The user space buffers from/to which I/O is being staged */
-	struct iovec *iov;
-
-	/* number of elements in the iovector */
-	unsigned long nr_segs;
-
 	/* set to indicate the type of the operation */
 	int rw;
 
diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c
index c441f9387a1b..eb3b8d39fb61 100644
--- a/fs/overlayfs/copy_up.c
+++ b/fs/overlayfs/copy_up.c
@@ -22,7 +22,6 @@
 #include <linux/ratelimit.h>
 #include <linux/exportfs.h>
 #include "overlayfs.h"
-#include "ovl_entry.h"
 
 #define OVL_COPY_UP_CHUNK_SIZE (1 << 20)
 
@@ -486,6 +485,7 @@ static int ovl_copy_up_inode(struct ovl_copy_up_ctx *c, struct dentry *temp)
 static int ovl_copy_up_locked(struct ovl_copy_up_ctx *c)
 {
 	struct inode *udir = c->destdir->d_inode;
+	struct inode *inode;
 	struct dentry *newdentry = NULL;
 	struct dentry *temp = NULL;
 	int err;
@@ -508,7 +508,11 @@ static int ovl_copy_up_locked(struct ovl_copy_up_ctx *c)
 	if (err)
 		goto out_cleanup;
 
-	ovl_inode_update(d_inode(c->dentry), newdentry);
+	inode = d_inode(c->dentry);
+	ovl_inode_update(inode, newdentry);
+	if (S_ISDIR(inode->i_mode))
+		ovl_set_flag(OVL_WHITEOUTS, inode);
+
 out:
 	dput(temp);
 	return err;
diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c
index cc961a3bd3bd..e13921824c70 100644
--- a/fs/overlayfs/dir.c
+++ b/fs/overlayfs/dir.c
@@ -181,6 +181,11 @@ static bool ovl_type_origin(struct dentry *dentry)
 	return OVL_TYPE_ORIGIN(ovl_path_type(dentry));
 }
 
+static bool ovl_may_have_whiteouts(struct dentry *dentry)
+{
+	return ovl_test_flag(OVL_WHITEOUTS, d_inode(dentry));
+}
+
 static int ovl_create_upper(struct dentry *dentry, struct inode *inode,
 			    struct cattr *attr, struct dentry *hardlink)
 {
@@ -300,7 +305,6 @@ static struct dentry *ovl_check_empty_and_clear(struct dentry *dentry)
 {
 	int err;
 	struct dentry *ret = NULL;
-	enum ovl_path_type type = ovl_path_type(dentry);
 	LIST_HEAD(list);
 
 	err = ovl_check_empty_dir(dentry, &list);
@@ -313,13 +317,13 @@ static struct dentry *ovl_check_empty_and_clear(struct dentry *dentry)
 	 * When removing an empty opaque directory, then it makes no sense to
 	 * replace it with an exact replica of itself.
 	 *
-	 * If no upperdentry then skip clearing whiteouts.
+	 * If upperdentry has whiteouts, clear them.
 	 *
 	 * Can race with copy-up, since we don't hold the upperdir mutex.
 	 * Doesn't matter, since copy-up can't create a non-empty directory
 	 * from an empty one.
 	 */
-	if (OVL_TYPE_UPPER(type) && OVL_TYPE_MERGE(type))
+	if (!list_empty(&list))
 		ret = ovl_clear_empty(dentry, &list);
 
 out_free:
@@ -698,8 +702,9 @@ static int ovl_remove_upper(struct dentry *dentry, bool is_dir)
 	struct dentry *opaquedir = NULL;
 	int err;
 
-	/* Redirect dir can be !ovl_lower_positive && OVL_TYPE_MERGE */
-	if (is_dir && ovl_dentry_get_redirect(dentry)) {
+	/* Redirect/origin dir can be !ovl_lower_positive && not clean */
+	if (is_dir && (ovl_dentry_get_redirect(dentry) ||
+		       ovl_may_have_whiteouts(dentry))) {
 		opaquedir = ovl_check_empty_and_clear(dentry);
 		err = PTR_ERR(opaquedir);
 		if (IS_ERR(opaquedir))
@@ -946,7 +951,8 @@ static int ovl_rename(struct inode *olddir, struct dentry *old,
 
 	old_cred = ovl_override_creds(old->d_sb);
 
-	if (overwrite && new_is_dir && ovl_type_merge_or_lower(new)) {
+	if (overwrite && new_is_dir && (ovl_type_merge_or_lower(new) ||
+					ovl_may_have_whiteouts(new))) {
 		opaquedir = ovl_check_empty_and_clear(new);
 		err = PTR_ERR(opaquedir);
 		if (IS_ERR(opaquedir)) {
@@ -1069,9 +1075,10 @@ static int ovl_rename(struct inode *olddir, struct dentry *old,
 			drop_nlink(d_inode(new));
 	}
 
-	ovl_dentry_version_inc(old->d_parent,
-			       !overwrite && ovl_type_origin(new));
-	ovl_dentry_version_inc(new->d_parent, ovl_type_origin(old));
+	ovl_dentry_version_inc(old->d_parent, ovl_type_origin(old) ||
+			       (!overwrite && ovl_type_origin(new)));
+	ovl_dentry_version_inc(new->d_parent, ovl_type_origin(old) ||
+			       (d_inode(new) && ovl_type_origin(new)));
 
 out_dput:
 	dput(newdentry);
diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c
index 321511ed8c42..00b6b294272a 100644
--- a/fs/overlayfs/inode.c
+++ b/fs/overlayfs/inode.c
@@ -15,6 +15,14 @@
 #include <linux/ratelimit.h>
 #include "overlayfs.h"
 
+
+static dev_t ovl_get_pseudo_dev(struct dentry *dentry)
+{
+	struct ovl_entry *oe = dentry->d_fsdata;
+
+	return oe->lowerstack[0].layer->pseudo_dev;
+}
+
 int ovl_setattr(struct dentry *dentry, struct iattr *attr)
 {
 	int err;
@@ -66,6 +74,7 @@ int ovl_getattr(const struct path *path, struct kstat *stat,
 	struct path realpath;
 	const struct cred *old_cred;
 	bool is_dir = S_ISDIR(dentry->d_inode->i_mode);
+	bool samefs = ovl_same_sb(dentry->d_sb);
 	int err;
 
 	type = ovl_path_real(dentry, &realpath);
@@ -75,16 +84,13 @@ int ovl_getattr(const struct path *path, struct kstat *stat,
 		goto out;
 
 	/*
-	 * When all layers are on the same fs, all real inode number are
-	 * unique, so we use the overlay st_dev, which is friendly to du -x.
-	 *
-	 * We also use st_ino of the copy up origin, if we know it.
-	 * This guaranties constant st_dev/st_ino across copy up.
+	 * For non-dir or same fs, we use st_ino of the copy up origin, if we
+	 * know it. This guaranties constant st_dev/st_ino across copy up.
 	 *
 	 * If filesystem supports NFS export ops, this also guaranties
 	 * persistent st_ino across mount cycle.
 	 */
-	if (ovl_same_sb(dentry->d_sb)) {
+	if (!is_dir || samefs) {
 		if (OVL_TYPE_ORIGIN(type)) {
 			struct kstat lowerstat;
 			u32 lowermask = STATX_INO | (!is_dir ? STATX_NLINK : 0);
@@ -95,7 +101,6 @@ int ovl_getattr(const struct path *path, struct kstat *stat,
 			if (err)
 				goto out;
 
-			WARN_ON_ONCE(stat->dev != lowerstat.dev);
 			/*
 			 * Lower hardlinks may be broken on copy up to different
 			 * upper files, so we cannot use the lower origin st_ino
@@ -107,17 +112,36 @@ int ovl_getattr(const struct path *path, struct kstat *stat,
 			if (is_dir || lowerstat.nlink == 1 ||
 			    ovl_test_flag(OVL_INDEX, d_inode(dentry)))
 				stat->ino = lowerstat.ino;
+
+			if (samefs)
+				WARN_ON_ONCE(stat->dev != lowerstat.dev);
+			else
+				stat->dev = ovl_get_pseudo_dev(dentry);
 		}
-		stat->dev = dentry->d_sb->s_dev;
-	} else if (is_dir) {
+		if (samefs) {
+			/*
+			 * When all layers are on the same fs, all real inode
+			 * number are unique, so we use the overlay st_dev,
+			 * which is friendly to du -x.
+			 */
+			stat->dev = dentry->d_sb->s_dev;
+		} else if (!OVL_TYPE_UPPER(type)) {
+			/*
+			 * For non-samefs setup, to make sure that st_dev/st_ino
+			 * pair is unique across the system, we use a unique
+			 * anonymous st_dev for lower layer inode.
+			 */
+			stat->dev = ovl_get_pseudo_dev(dentry);
+		}
+	} else {
 		/*
-		 * If not all layers are on the same fs the pair {real st_ino;
-		 * overlay st_dev} is not unique, so use the non persistent
-		 * overlay st_ino.
-		 *
 		 * Always use the overlay st_dev for directories, so 'find
 		 * -xdev' will scan the entire overlay mount and won't cross the
 		 * overlay mount boundaries.
+		 *
+		 * If not all layers are on the same fs the pair {real st_ino;
+		 * overlay st_dev} is not unique, so use the non persistent
+		 * overlay st_ino for directories.
 		 */
 		stat->dev = dentry->d_sb->s_dev;
 		stat->ino = dentry->d_inode->i_ino;
@@ -409,6 +433,7 @@ static inline void ovl_lockdep_annotate_inode_mutex_key(struct inode *inode)
 #ifdef CONFIG_LOCKDEP
 	static struct lock_class_key ovl_i_mutex_key[OVL_MAX_NESTING];
 	static struct lock_class_key ovl_i_mutex_dir_key[OVL_MAX_NESTING];
+	static struct lock_class_key ovl_i_lock_key[OVL_MAX_NESTING];
 
 	int depth = inode->i_sb->s_stack_depth - 1;
 
@@ -419,6 +444,8 @@ static inline void ovl_lockdep_annotate_inode_mutex_key(struct inode *inode)
 		lockdep_set_class(&inode->i_rwsem, &ovl_i_mutex_dir_key[depth]);
 	else
 		lockdep_set_class(&inode->i_rwsem, &ovl_i_mutex_key[depth]);
+
+	lockdep_set_class(&OVL_I(inode)->lock, &ovl_i_lock_key[depth]);
 #endif
 }
 
@@ -657,6 +684,16 @@ struct inode *ovl_get_inode(struct dentry *dentry, struct dentry *upperdentry,
 	if (upperdentry && ovl_is_impuredir(upperdentry))
 		ovl_set_flag(OVL_IMPURE, inode);
 
+	/* Check for non-merge dir that may have whiteouts */
+	if (S_ISDIR(realinode->i_mode)) {
+		struct ovl_entry *oe = dentry->d_fsdata;
+
+		if (((upperdentry && lowerdentry) || oe->numlower > 1) ||
+		    ovl_check_origin_xattr(upperdentry ?: lowerdentry)) {
+			ovl_set_flag(OVL_WHITEOUTS, inode);
+		}
+	}
+
 	if (inode->i_state & I_NEW)
 		unlock_new_inode(inode);
 out:
diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c
index a12dc10bf726..625ed8066570 100644
--- a/fs/overlayfs/namei.c
+++ b/fs/overlayfs/namei.c
@@ -15,7 +15,6 @@
 #include <linux/mount.h>
 #include <linux/exportfs.h>
 #include "overlayfs.h"
-#include "ovl_entry.h"
 
 struct ovl_lookup_data {
 	struct qstr name;
@@ -286,16 +285,15 @@ static int ovl_lookup_layer(struct dentry *base, struct ovl_lookup_data *d,
 
 
 static int ovl_check_origin(struct dentry *upperdentry,
-			    struct path *lowerstack, unsigned int numlower,
-			    struct path **stackp, unsigned int *ctrp)
+			    struct ovl_path *lower, unsigned int numlower,
+			    struct ovl_path **stackp, unsigned int *ctrp)
 {
 	struct vfsmount *mnt;
 	struct dentry *origin = NULL;
 	int i;
 
-
 	for (i = 0; i < numlower; i++) {
-		mnt = lowerstack[i].mnt;
+		mnt = lower[i].layer->mnt;
 		origin = ovl_get_origin(upperdentry, mnt);
 		if (IS_ERR(origin))
 			return PTR_ERR(origin);
@@ -309,12 +307,12 @@ static int ovl_check_origin(struct dentry *upperdentry,
 
 	BUG_ON(*ctrp);
 	if (!*stackp)
-		*stackp = kmalloc(sizeof(struct path), GFP_KERNEL);
+		*stackp = kmalloc(sizeof(struct ovl_path), GFP_KERNEL);
 	if (!*stackp) {
 		dput(origin);
 		return -ENOMEM;
 	}
-	**stackp = (struct path) { .dentry = origin, .mnt = mnt };
+	**stackp = (struct ovl_path){.dentry = origin, .layer = lower[i].layer};
 	*ctrp = 1;
 
 	return 0;
@@ -350,8 +348,8 @@ static int ovl_verify_origin_fh(struct dentry *dentry, const struct ovl_fh *fh)
  *
  * Return 0 on match, -ESTALE on mismatch, < 0 on error.
  */
-int ovl_verify_origin(struct dentry *dentry, struct vfsmount *mnt,
-		      struct dentry *origin, bool is_upper, bool set)
+int ovl_verify_origin(struct dentry *dentry, struct dentry *origin,
+		      bool is_upper, bool set)
 {
 	struct inode *inode;
 	struct ovl_fh *fh;
@@ -384,13 +382,13 @@ fail:
  * OVL_XATTR_ORIGIN and that origin file handle can be decoded to lower path.
  * Return 0 on match, -ESTALE on mismatch or stale origin, < 0 on error.
  */
-int ovl_verify_index(struct dentry *index, struct path *lowerstack,
+int ovl_verify_index(struct dentry *index, struct ovl_path *lower,
 		     unsigned int numlower)
 {
 	struct ovl_fh *fh = NULL;
 	size_t len;
-	struct path origin = { };
-	struct path *stack = &origin;
+	struct ovl_path origin = { };
+	struct ovl_path *stack = &origin;
 	unsigned int ctr = 0;
 	int err;
 
@@ -429,7 +427,7 @@ int ovl_verify_index(struct dentry *index, struct path *lowerstack,
 	if (err)
 		goto fail;
 
-	err = ovl_check_origin(index, lowerstack, numlower, &stack, &ctr);
+	err = ovl_check_origin(index, lower, numlower, &stack, &ctr);
 	if (!err && !ctr)
 		err = -ESTALE;
 	if (err)
@@ -568,11 +566,24 @@ int ovl_path_next(int idx, struct dentry *dentry, struct path *path)
 		idx++;
 	}
 	BUG_ON(idx > oe->numlower);
-	*path = oe->lowerstack[idx - 1];
+	path->dentry = oe->lowerstack[idx - 1].dentry;
+	path->mnt = oe->lowerstack[idx - 1].layer->mnt;
 
 	return (idx < oe->numlower) ? idx + 1 : -1;
 }
 
+static int ovl_find_layer(struct ovl_fs *ofs, struct ovl_path *path)
+{
+	int i;
+
+	for (i = 0; i < ofs->numlower; i++) {
+		if (ofs->lower_layers[i].mnt == path->layer->mnt)
+			break;
+	}
+
+	return i;
+}
+
 struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
 			  unsigned int flags)
 {
@@ -581,7 +592,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
 	struct ovl_fs *ofs = dentry->d_sb->s_fs_info;
 	struct ovl_entry *poe = dentry->d_parent->d_fsdata;
 	struct ovl_entry *roe = dentry->d_sb->s_root->d_fsdata;
-	struct path *stack = NULL;
+	struct ovl_path *stack = NULL;
 	struct dentry *upperdir, *upperdentry = NULL;
 	struct dentry *index = NULL;
 	unsigned int ctr = 0;
@@ -630,7 +641,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
 			err = ovl_check_origin(upperdentry, roe->lowerstack,
 					       roe->numlower, &stack, &ctr);
 			if (err)
-				goto out;
+				goto out_put_upper;
 		}
 
 		if (d.redirect) {
@@ -646,17 +657,17 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
 
 	if (!d.stop && poe->numlower) {
 		err = -ENOMEM;
-		stack = kcalloc(ofs->numlower, sizeof(struct path),
+		stack = kcalloc(ofs->numlower, sizeof(struct ovl_path),
 				GFP_KERNEL);
 		if (!stack)
 			goto out_put_upper;
 	}
 
 	for (i = 0; !d.stop && i < poe->numlower; i++) {
-		struct path lowerpath = poe->lowerstack[i];
+		struct ovl_path lower = poe->lowerstack[i];
 
 		d.last = i == poe->numlower - 1;
-		err = ovl_lookup_layer(lowerpath.dentry, &d, &this);
+		err = ovl_lookup_layer(lower.dentry, &d, &this);
 		if (err)
 			goto out_put;
 
@@ -664,7 +675,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
 			continue;
 
 		stack[ctr].dentry = this;
-		stack[ctr].mnt = lowerpath.mnt;
+		stack[ctr].layer = lower.layer;
 		ctr++;
 
 		if (d.stop)
@@ -674,10 +685,8 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
 			poe = roe;
 
 			/* Find the current layer on the root dentry */
-			for (i = 0; i < poe->numlower; i++)
-				if (poe->lowerstack[i].mnt == lowerpath.mnt)
-					break;
-			if (WARN_ON(i == poe->numlower))
+			i = ovl_find_layer(ofs, &lower);
+			if (WARN_ON(i == ofs->numlower))
 				break;
 		}
 	}
@@ -700,7 +709,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
 		goto out_put;
 
 	oe->opaque = upperopaque;
-	memcpy(oe->lowerstack, stack, sizeof(struct path) * ctr);
+	memcpy(oe->lowerstack, stack, sizeof(struct ovl_path) * ctr);
 	dentry->d_fsdata = oe;
 
 	if (upperdentry)
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index d9a0edd4e57e..13eab09a6b6f 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -9,6 +9,7 @@
 
 #include <linux/kernel.h>
 #include <linux/uuid.h>
+#include "ovl_entry.h"
 
 enum ovl_path_type {
 	__OVL_PATH_UPPER	= (1 << 0),
@@ -28,7 +29,10 @@ enum ovl_path_type {
 #define OVL_XATTR_NLINK OVL_XATTR_PREFIX "nlink"
 
 enum ovl_flag {
+	/* Pure upper dir that may contain non pure upper entries */
 	OVL_IMPURE,
+	/* Non-merge dir that may contain whiteout entries */
+	OVL_WHITEOUTS,
 	OVL_INDEX,
 };
 
@@ -223,6 +227,7 @@ bool ovl_is_whiteout(struct dentry *dentry);
 struct file *ovl_path_open(struct path *path, int flags);
 int ovl_copy_up_start(struct dentry *dentry);
 void ovl_copy_up_end(struct dentry *dentry);
+bool ovl_check_origin_xattr(struct dentry *dentry);
 bool ovl_check_dir_xattr(struct dentry *dentry, const char *name);
 int ovl_check_setxattr(struct dentry *dentry, struct dentry *upperdentry,
 		       const char *name, const void *value, size_t size,
@@ -244,9 +249,9 @@ static inline bool ovl_is_impuredir(struct dentry *dentry)
 
 
 /* namei.c */
-int ovl_verify_origin(struct dentry *dentry, struct vfsmount *mnt,
-		      struct dentry *origin, bool is_upper, bool set);
-int ovl_verify_index(struct dentry *index, struct path *lowerstack,
+int ovl_verify_origin(struct dentry *dentry, struct dentry *origin,
+		      bool is_upper, bool set);
+int ovl_verify_index(struct dentry *index, struct ovl_path *lower,
 		     unsigned int numlower);
 int ovl_get_index_name(struct dentry *origin, struct qstr *name);
 int ovl_path_next(int idx, struct dentry *dentry, struct path *path);
@@ -263,7 +268,7 @@ int ovl_check_d_type_supported(struct path *realpath);
 void ovl_workdir_cleanup(struct inode *dir, struct vfsmount *mnt,
 			 struct dentry *dentry, int level);
 int ovl_indexdir_cleanup(struct dentry *dentry, struct vfsmount *mnt,
-			 struct path *lowerstack, unsigned int numlower);
+			 struct ovl_path *lower, unsigned int numlower);
 
 /* inode.c */
 int ovl_set_nlink_upper(struct dentry *dentry);
diff --git a/fs/overlayfs/ovl_entry.h b/fs/overlayfs/ovl_entry.h
index 36b49bd09264..752bab645879 100644
--- a/fs/overlayfs/ovl_entry.h
+++ b/fs/overlayfs/ovl_entry.h
@@ -17,11 +17,21 @@ struct ovl_config {
 	bool index;
 };
 
+struct ovl_layer {
+	struct vfsmount *mnt;
+	dev_t pseudo_dev;
+};
+
+struct ovl_path {
+	struct ovl_layer *layer;
+	struct dentry *dentry;
+};
+
 /* private information held for overlayfs's superblock */
 struct ovl_fs {
 	struct vfsmount *upper_mnt;
 	unsigned numlower;
-	struct vfsmount **lower_mnt;
+	struct ovl_layer *lower_layers;
 	/* workbasedir is the path at workdir= mount option */
 	struct dentry *workbasedir;
 	/* workdir is the 'work' directory under workbasedir */
@@ -52,7 +62,7 @@ struct ovl_entry {
 		struct rcu_head rcu;
 	};
 	unsigned numlower;
-	struct path lowerstack[];
+	struct ovl_path lowerstack[];
 };
 
 struct ovl_entry *ovl_alloc_entry(unsigned int numlower);
diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c
index c310e3ff7f3f..0daa4354fec4 100644
--- a/fs/overlayfs/readdir.c
+++ b/fs/overlayfs/readdir.c
@@ -26,6 +26,7 @@ struct ovl_cache_entry {
 	struct list_head l_node;
 	struct rb_node node;
 	struct ovl_cache_entry *next_maybe_whiteout;
+	bool is_upper;
 	bool is_whiteout;
 	char name[];
 };
@@ -158,6 +159,7 @@ static struct ovl_cache_entry *ovl_cache_entry_new(struct ovl_readdir_data *rdd,
 	/* Defer setting d_ino for upper entry to ovl_iterate() */
 	if (ovl_calc_d_ino(rdd, p))
 		p->ino = 0;
+	p->is_upper = rdd->is_upper;
 	p->is_whiteout = false;
 
 	if (d_type == DT_CHR) {
@@ -316,21 +318,37 @@ static inline int ovl_dir_read(struct path *realpath,
 	return err;
 }
 
+/*
+ * Can we iterate real dir directly?
+ *
+ * Non-merge dir may contain whiteouts from a time it was a merge upper, before
+ * lower dir was removed under it and possibly before it was rotated from upper
+ * to lower layer.
+ */
+static bool ovl_dir_is_real(struct dentry *dir)
+{
+	return !ovl_test_flag(OVL_WHITEOUTS, d_inode(dir));
+}
+
 static void ovl_dir_reset(struct file *file)
 {
 	struct ovl_dir_file *od = file->private_data;
 	struct ovl_dir_cache *cache = od->cache;
 	struct dentry *dentry = file->f_path.dentry;
-	enum ovl_path_type type = ovl_path_type(dentry);
+	bool is_real;
 
 	if (cache && ovl_dentry_version_get(dentry) != cache->version) {
 		ovl_cache_put(od, dentry);
 		od->cache = NULL;
 		od->cursor = NULL;
 	}
-	WARN_ON(!od->is_real && !OVL_TYPE_MERGE(type));
-	if (od->is_real && OVL_TYPE_MERGE(type))
+	is_real = ovl_dir_is_real(dentry);
+	if (od->is_real != is_real) {
+		/* is_real can only become false when dir is copied up */
+		if (WARN_ON(is_real))
+			return;
 		od->is_real = false;
+	}
 }
 
 static int ovl_dir_read_merged(struct dentry *dentry, struct list_head *list,
@@ -816,7 +834,7 @@ static int ovl_dir_open(struct inode *inode, struct file *file)
 		return PTR_ERR(realfile);
 	}
 	od->realfile = realfile;
-	od->is_real = !OVL_TYPE_MERGE(type);
+	od->is_real = ovl_dir_is_real(file->f_path.dentry);
 	od->is_upper = OVL_TYPE_UPPER(type);
 	file->private_data = od;
 
@@ -835,7 +853,7 @@ const struct file_operations ovl_dir_operations = {
 int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list)
 {
 	int err;
-	struct ovl_cache_entry *p;
+	struct ovl_cache_entry *p, *n;
 	struct rb_root root = RB_ROOT;
 
 	err = ovl_dir_read_merged(dentry, list, &root);
@@ -844,18 +862,29 @@ int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list)
 
 	err = 0;
 
-	list_for_each_entry(p, list, l_node) {
-		if (p->is_whiteout)
-			continue;
+	list_for_each_entry_safe(p, n, list, l_node) {
+		/*
+		 * Select whiteouts in upperdir, they should
+		 * be cleared when deleting this directory.
+		 */
+		if (p->is_whiteout) {
+			if (p->is_upper)
+				continue;
+			goto del_entry;
+		}
 
 		if (p->name[0] == '.') {
 			if (p->len == 1)
-				continue;
+				goto del_entry;
 			if (p->len == 2 && p->name[1] == '.')
-				continue;
+				goto del_entry;
 		}
 		err = -ENOTEMPTY;
 		break;
+
+del_entry:
+		list_del(&p->l_node);
+		kfree(p);
 	}
 
 	return err;
@@ -869,7 +898,7 @@ void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list)
 	list_for_each_entry(p, list, l_node) {
 		struct dentry *dentry;
 
-		if (!p->is_whiteout)
+		if (WARN_ON(!p->is_whiteout || !p->is_upper))
 			continue;
 
 		dentry = lookup_one_len(p->name, upper, p->len);
@@ -985,7 +1014,7 @@ void ovl_workdir_cleanup(struct inode *dir, struct vfsmount *mnt,
 }
 
 int ovl_indexdir_cleanup(struct dentry *dentry, struct vfsmount *mnt,
-			 struct path *lowerstack, unsigned int numlower)
+			 struct ovl_path *lower, unsigned int numlower)
 {
 	int err;
 	struct dentry *index = NULL;
@@ -1020,7 +1049,7 @@ int ovl_indexdir_cleanup(struct dentry *dentry, struct vfsmount *mnt,
 			index = NULL;
 			break;
 		}
-		err = ovl_verify_index(index, lowerstack, numlower);
+		err = ovl_verify_index(index, lower, numlower);
 		/* Cleanup stale and orphan index entries */
 		if (err && (err == -ESTALE || err == -ENOENT))
 			err = ovl_cleanup(dir, index);
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index f5738e96a052..be03578181d2 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -18,7 +18,6 @@
 #include <linux/seq_file.h>
 #include <linux/posix_acl_xattr.h>
 #include "overlayfs.h"
-#include "ovl_entry.h"
 
 MODULE_AUTHOR("Miklos Szeredi <miklos@szeredi.hu>");
 MODULE_DESCRIPTION("Overlay filesystem");
@@ -39,15 +38,20 @@ module_param_named(index, ovl_index_def, bool, 0644);
 MODULE_PARM_DESC(ovl_index_def,
 		 "Default to on or off for the inodes index feature");
 
+static void ovl_entry_stack_free(struct ovl_entry *oe)
+{
+	unsigned int i;
+
+	for (i = 0; i < oe->numlower; i++)
+		dput(oe->lowerstack[i].dentry);
+}
+
 static void ovl_dentry_release(struct dentry *dentry)
 {
 	struct ovl_entry *oe = dentry->d_fsdata;
 
 	if (oe) {
-		unsigned int i;
-
-		for (i = 0; i < oe->numlower; i++)
-			dput(oe->lowerstack[i].dentry);
+		ovl_entry_stack_free(oe);
 		kfree_rcu(oe, rcu);
 	}
 }
@@ -207,39 +211,48 @@ static void ovl_destroy_inode(struct inode *inode)
 	call_rcu(&inode->i_rcu, ovl_i_callback);
 }
 
-static void ovl_put_super(struct super_block *sb)
+static void ovl_free_fs(struct ovl_fs *ofs)
 {
-	struct ovl_fs *ufs = sb->s_fs_info;
 	unsigned i;
 
-	dput(ufs->indexdir);
-	dput(ufs->workdir);
-	if (ufs->workdir_locked)
-		ovl_inuse_unlock(ufs->workbasedir);
-	dput(ufs->workbasedir);
-	if (ufs->upper_mnt && ufs->upperdir_locked)
-		ovl_inuse_unlock(ufs->upper_mnt->mnt_root);
-	mntput(ufs->upper_mnt);
-	for (i = 0; i < ufs->numlower; i++)
-		mntput(ufs->lower_mnt[i]);
-	kfree(ufs->lower_mnt);
-
-	kfree(ufs->config.lowerdir);
-	kfree(ufs->config.upperdir);
-	kfree(ufs->config.workdir);
-	put_cred(ufs->creator_cred);
-	kfree(ufs);
+	dput(ofs->indexdir);
+	dput(ofs->workdir);
+	if (ofs->workdir_locked)
+		ovl_inuse_unlock(ofs->workbasedir);
+	dput(ofs->workbasedir);
+	if (ofs->upperdir_locked)
+		ovl_inuse_unlock(ofs->upper_mnt->mnt_root);
+	mntput(ofs->upper_mnt);
+	for (i = 0; i < ofs->numlower; i++) {
+		mntput(ofs->lower_layers[i].mnt);
+		free_anon_bdev(ofs->lower_layers[i].pseudo_dev);
+	}
+	kfree(ofs->lower_layers);
+
+	kfree(ofs->config.lowerdir);
+	kfree(ofs->config.upperdir);
+	kfree(ofs->config.workdir);
+	if (ofs->creator_cred)
+		put_cred(ofs->creator_cred);
+	kfree(ofs);
+}
+
+static void ovl_put_super(struct super_block *sb)
+{
+	struct ovl_fs *ofs = sb->s_fs_info;
+
+	ovl_free_fs(ofs);
 }
 
 static int ovl_sync_fs(struct super_block *sb, int wait)
 {
-	struct ovl_fs *ufs = sb->s_fs_info;
+	struct ovl_fs *ofs = sb->s_fs_info;
 	struct super_block *upper_sb;
 	int ret;
 
-	if (!ufs->upper_mnt)
+	if (!ofs->upper_mnt)
 		return 0;
-	upper_sb = ufs->upper_mnt->mnt_sb;
+	upper_sb = ofs->upper_mnt->mnt_sb;
 	if (!upper_sb->s_op->sync_fs)
 		return 0;
 
@@ -277,9 +290,9 @@ static int ovl_statfs(struct dentry *dentry, struct kstatfs *buf)
 }
 
 /* Will this overlay be forced to mount/remount ro? */
-static bool ovl_force_readonly(struct ovl_fs *ufs)
+static bool ovl_force_readonly(struct ovl_fs *ofs)
 {
-	return (!ufs->upper_mnt || !ufs->workdir);
+	return (!ofs->upper_mnt || !ofs->workdir);
 }
 
 /**
@@ -291,29 +304,29 @@ static bool ovl_force_readonly(struct ovl_fs *ufs)
 static int ovl_show_options(struct seq_file *m, struct dentry *dentry)
 {
 	struct super_block *sb = dentry->d_sb;
-	struct ovl_fs *ufs = sb->s_fs_info;
+	struct ovl_fs *ofs = sb->s_fs_info;
 
-	seq_show_option(m, "lowerdir", ufs->config.lowerdir);
-	if (ufs->config.upperdir) {
-		seq_show_option(m, "upperdir", ufs->config.upperdir);
-		seq_show_option(m, "workdir", ufs->config.workdir);
+	seq_show_option(m, "lowerdir", ofs->config.lowerdir);
+	if (ofs->config.upperdir) {
+		seq_show_option(m, "upperdir", ofs->config.upperdir);
+		seq_show_option(m, "workdir", ofs->config.workdir);
 	}
-	if (ufs->config.default_permissions)
+	if (ofs->config.default_permissions)
 		seq_puts(m, ",default_permissions");
-	if (ufs->config.redirect_dir != ovl_redirect_dir_def)
+	if (ofs->config.redirect_dir != ovl_redirect_dir_def)
 		seq_printf(m, ",redirect_dir=%s",
-			   ufs->config.redirect_dir ? "on" : "off");
-	if (ufs->config.index != ovl_index_def)
+			   ofs->config.redirect_dir ? "on" : "off");
+	if (ofs->config.index != ovl_index_def)
 		seq_printf(m, ",index=%s",
-			   ufs->config.index ? "on" : "off");
+			   ofs->config.index ? "on" : "off");
 	return 0;
 }
 
 static int ovl_remount(struct super_block *sb, int *flags, char *data)
 {
-	struct ovl_fs *ufs = sb->s_fs_info;
+	struct ovl_fs *ofs = sb->s_fs_info;
 
-	if (!(*flags & MS_RDONLY) && ovl_force_readonly(ufs))
+	if (!(*flags & MS_RDONLY) && ovl_force_readonly(ofs))
 		return -EROFS;
 
 	return 0;
@@ -451,13 +464,11 @@ static int ovl_parse_opt(char *opt, struct ovl_config *config)
 #define OVL_WORKDIR_NAME "work"
 #define OVL_INDEXDIR_NAME "index"
 
-static struct dentry *ovl_workdir_create(struct super_block *sb,
-					 struct ovl_fs *ufs,
-					 struct dentry *dentry,
+static struct dentry *ovl_workdir_create(struct ovl_fs *ofs,
 					 const char *name, bool persist)
 {
-	struct inode *dir = dentry->d_inode;
-	struct vfsmount *mnt = ufs->upper_mnt;
+	struct inode *dir =  ofs->workbasedir->d_inode;
+	struct vfsmount *mnt = ofs->upper_mnt;
 	struct dentry *work;
 	int err;
 	bool retried = false;
@@ -471,7 +482,7 @@ static struct dentry *ovl_workdir_create(struct super_block *sb,
 	locked = true;
 
 retry:
-	work = lookup_one_len(name, dentry, strlen(name));
+	work = lookup_one_len(name, ofs->workbasedir, strlen(name));
 
 	if (!IS_ERR(work)) {
 		struct iattr attr = {
@@ -541,8 +552,7 @@ out_dput:
 	dput(work);
 out_err:
 	pr_warn("overlayfs: failed to create directory %s/%s (errno: %i); mounting read-only\n",
-		ufs->config.workdir, name, -err);
-	sb->s_flags |= MS_RDONLY;
+		ofs->config.workdir, name, -err);
 	work = NULL;
 	goto out_unlock;
 }
@@ -585,7 +595,7 @@ static int ovl_mount_dir_noesc(const char *name, struct path *path)
 	return 0;
 
 out_put:
-	path_put(path);
+	path_put_init(path);
 out:
 	return err;
 }
@@ -603,7 +613,7 @@ static int ovl_mount_dir(const char *name, struct path *path)
 			if (ovl_dentry_remote(path->dentry)) {
 				pr_err("overlayfs: filesystem on '%s' not supported as upperdir\n",
 				       tmp);
-				path_put(path);
+				path_put_init(path);
 				err = -EINVAL;
 			}
 		kfree(tmp);
@@ -655,7 +665,7 @@ static int ovl_lower_dir(const char *name, struct path *path,
 	return 0;
 
 out_put:
-	path_put(path);
+	path_put_init(path);
 out:
 	return err;
 }
@@ -826,129 +836,269 @@ static const struct xattr_handler *ovl_xattr_handlers[] = {
 	NULL
 };
 
-static int ovl_fill_super(struct super_block *sb, void *data, int silent)
+static int ovl_get_upper(struct ovl_fs *ofs, struct path *upperpath)
 {
-	struct path upperpath = { };
-	struct path workpath = { };
-	struct dentry *root_dentry;
-	struct ovl_entry *oe;
-	struct ovl_fs *ufs;
-	struct path *stack = NULL;
-	char *lowertmp;
-	char *lower;
-	unsigned int numlower;
-	unsigned int stacklen = 0;
-	unsigned int i;
-	bool remote = false;
-	struct cred *cred;
+	struct vfsmount *upper_mnt;
 	int err;
 
-	err = -ENOMEM;
-	ufs = kzalloc(sizeof(struct ovl_fs), GFP_KERNEL);
-	if (!ufs)
+	err = ovl_mount_dir(ofs->config.upperdir, upperpath);
+	if (err)
 		goto out;
 
-	ufs->config.redirect_dir = ovl_redirect_dir_def;
-	ufs->config.index = ovl_index_def;
-	err = ovl_parse_opt((char *) data, &ufs->config);
+	/* Upper fs should not be r/o */
+	if (sb_rdonly(upperpath->mnt->mnt_sb)) {
+		pr_err("overlayfs: upper fs is r/o, try multi-lower layers mount\n");
+		err = -EINVAL;
+		goto out;
+	}
+
+	err = ovl_check_namelen(upperpath, ofs, ofs->config.upperdir);
 	if (err)
-		goto out_free_config;
+		goto out;
+
+	err = -EBUSY;
+	if (ovl_inuse_trylock(upperpath->dentry)) {
+		ofs->upperdir_locked = true;
+	} else if (ofs->config.index) {
+		pr_err("overlayfs: upperdir is in-use by another mount, mount with '-o index=off' to override exclusive upperdir protection.\n");
+		goto out;
+	} else {
+		pr_warn("overlayfs: upperdir is in-use by another mount, accessing files from both mounts will result in undefined behavior.\n");
+	}
+
+	upper_mnt = clone_private_mount(upperpath);
+	err = PTR_ERR(upper_mnt);
+	if (IS_ERR(upper_mnt)) {
+		pr_err("overlayfs: failed to clone upperpath\n");
+		goto out;
+	}
+
+	/* Don't inherit atime flags */
+	upper_mnt->mnt_flags &= ~(MNT_NOATIME | MNT_NODIRATIME | MNT_RELATIME);
+	ofs->upper_mnt = upper_mnt;
+	err = 0;
+out:
+	return err;
+}
+
+static int ovl_make_workdir(struct ovl_fs *ofs, struct path *workpath)
+{
+	struct dentry *temp;
+	int err;
+
+	ofs->workdir = ovl_workdir_create(ofs, OVL_WORKDIR_NAME, false);
+	if (!ofs->workdir)
+		return 0;
+
+	/*
+	 * Upper should support d_type, else whiteouts are visible.  Given
+	 * workdir and upper are on same fs, we can do iterate_dir() on
+	 * workdir. This check requires successful creation of workdir in
+	 * previous step.
+	 */
+	err = ovl_check_d_type_supported(workpath);
+	if (err < 0)
+		return err;
+
+	/*
+	 * We allowed this configuration and don't want to break users over
+	 * kernel upgrade. So warn instead of erroring out.
+	 */
+	if (!err)
+		pr_warn("overlayfs: upper fs needs to support d_type.\n");
+
+	/* Check if upper/work fs supports O_TMPFILE */
+	temp = ovl_do_tmpfile(ofs->workdir, S_IFREG | 0);
+	ofs->tmpfile = !IS_ERR(temp);
+	if (ofs->tmpfile)
+		dput(temp);
+	else
+		pr_warn("overlayfs: upper fs does not support tmpfile.\n");
+
+	/*
+	 * Check if upper/work fs supports trusted.overlay.* xattr
+	 */
+	err = ovl_do_setxattr(ofs->workdir, OVL_XATTR_OPAQUE, "0", 1, 0);
+	if (err) {
+		ofs->noxattr = true;
+		pr_warn("overlayfs: upper fs does not support xattr.\n");
+	} else {
+		vfs_removexattr(ofs->workdir, OVL_XATTR_OPAQUE);
+	}
+
+	/* Check if upper/work fs supports file handles */
+	if (ofs->config.index &&
+	    !ovl_can_decode_fh(ofs->workdir->d_sb)) {
+		ofs->config.index = false;
+		pr_warn("overlayfs: upper fs does not support file handles, falling back to index=off.\n");
+	}
+
+	return 0;
+}
+
+static int ovl_get_workdir(struct ovl_fs *ofs, struct path *upperpath)
+{
+	int err;
+	struct path workpath = { };
+
+	err = ovl_mount_dir(ofs->config.workdir, &workpath);
+	if (err)
+		goto out;
 
 	err = -EINVAL;
-	if (!ufs->config.lowerdir) {
-		if (!silent)
-			pr_err("overlayfs: missing 'lowerdir'\n");
-		goto out_free_config;
+	if (upperpath->mnt != workpath.mnt) {
+		pr_err("overlayfs: workdir and upperdir must reside under the same mount\n");
+		goto out;
+	}
+	if (!ovl_workdir_ok(workpath.dentry, upperpath->dentry)) {
+		pr_err("overlayfs: workdir and upperdir must be separate subtrees\n");
+		goto out;
 	}
 
-	sb->s_stack_depth = 0;
-	sb->s_maxbytes = MAX_LFS_FILESIZE;
-	if (ufs->config.upperdir) {
-		if (!ufs->config.workdir) {
-			pr_err("overlayfs: missing 'workdir'\n");
-			goto out_free_config;
-		}
+	err = -EBUSY;
+	if (ovl_inuse_trylock(workpath.dentry)) {
+		ofs->workdir_locked = true;
+	} else if (ofs->config.index) {
+		pr_err("overlayfs: workdir is in-use by another mount, mount with '-o index=off' to override exclusive workdir protection.\n");
+		goto out;
+	} else {
+		pr_warn("overlayfs: workdir is in-use by another mount, accessing files from both mounts will result in undefined behavior.\n");
+	}
 
-		err = ovl_mount_dir(ufs->config.upperdir, &upperpath);
-		if (err)
-			goto out_free_config;
+	ofs->workbasedir = dget(workpath.dentry);
+	err = ovl_make_workdir(ofs, &workpath);
+	if (err)
+		goto out;
 
-		/* Upper fs should not be r/o */
-		if (sb_rdonly(upperpath.mnt->mnt_sb)) {
-			pr_err("overlayfs: upper fs is r/o, try multi-lower layers mount\n");
-			err = -EINVAL;
-			goto out_put_upperpath;
-		}
+	err = 0;
+out:
+	path_put(&workpath);
 
-		err = ovl_check_namelen(&upperpath, ufs, ufs->config.upperdir);
-		if (err)
-			goto out_put_upperpath;
-
-		err = -EBUSY;
-		if (ovl_inuse_trylock(upperpath.dentry)) {
-			ufs->upperdir_locked = true;
-		} else if (ufs->config.index) {
-			pr_err("overlayfs: upperdir is in-use by another mount, mount with '-o index=off' to override exclusive upperdir protection.\n");
-			goto out_put_upperpath;
-		} else {
-			pr_warn("overlayfs: upperdir is in-use by another mount, accessing files from both mounts will result in undefined behavior.\n");
-		}
+	return err;
+}
+
+static int ovl_get_indexdir(struct ovl_fs *ofs, struct ovl_entry *oe,
+			    struct path *upperpath)
+{
+	int err;
 
-		err = ovl_mount_dir(ufs->config.workdir, &workpath);
+	/* Verify lower root is upper root origin */
+	err = ovl_verify_origin(upperpath->dentry, oe->lowerstack[0].dentry,
+				false, true);
+	if (err) {
+		pr_err("overlayfs: failed to verify upper root origin\n");
+		goto out;
+	}
+
+	ofs->indexdir = ovl_workdir_create(ofs, OVL_INDEXDIR_NAME, true);
+	if (ofs->indexdir) {
+		/* Verify upper root is index dir origin */
+		err = ovl_verify_origin(ofs->indexdir, upperpath->dentry,
+					true, true);
 		if (err)
-			goto out_unlock_upperdentry;
+			pr_err("overlayfs: failed to verify index dir origin\n");
 
-		err = -EINVAL;
-		if (upperpath.mnt != workpath.mnt) {
-			pr_err("overlayfs: workdir and upperdir must reside under the same mount\n");
-			goto out_put_workpath;
-		}
-		if (!ovl_workdir_ok(workpath.dentry, upperpath.dentry)) {
-			pr_err("overlayfs: workdir and upperdir must be separate subtrees\n");
-			goto out_put_workpath;
+		/* Cleanup bad/stale/orphan index entries */
+		if (!err)
+			err = ovl_indexdir_cleanup(ofs->indexdir,
+						   ofs->upper_mnt,
+						   oe->lowerstack,
+						   oe->numlower);
+	}
+	if (err || !ofs->indexdir)
+		pr_warn("overlayfs: try deleting index dir or mounting with '-o index=off' to disable inodes index.\n");
+
+out:
+	return err;
+}
+
+static int ovl_get_lower_layers(struct ovl_fs *ofs, struct path *stack,
+				unsigned int numlower)
+{
+	int err;
+	unsigned int i;
+
+	err = -ENOMEM;
+	ofs->lower_layers = kcalloc(numlower, sizeof(struct ovl_layer),
+				    GFP_KERNEL);
+	if (ofs->lower_layers == NULL)
+		goto out;
+	for (i = 0; i < numlower; i++) {
+		struct vfsmount *mnt;
+		dev_t dev;
+
+		err = get_anon_bdev(&dev);
+		if (err) {
+			pr_err("overlayfs: failed to get anonymous bdev for lowerpath\n");
+			goto out;
 		}
 
-		err = -EBUSY;
-		if (ovl_inuse_trylock(workpath.dentry)) {
-			ufs->workdir_locked = true;
-		} else if (ufs->config.index) {
-			pr_err("overlayfs: workdir is in-use by another mount, mount with '-o index=off' to override exclusive workdir protection.\n");
-			goto out_put_workpath;
-		} else {
-			pr_warn("overlayfs: workdir is in-use by another mount, accessing files from both mounts will result in undefined behavior.\n");
+		mnt = clone_private_mount(&stack[i]);
+		err = PTR_ERR(mnt);
+		if (IS_ERR(mnt)) {
+			pr_err("overlayfs: failed to clone lowerpath\n");
+			free_anon_bdev(dev);
+			goto out;
 		}
+		/*
+		 * Make lower layers R/O.  That way fchmod/fchown on lower file
+		 * will fail instead of modifying lower fs.
+		 */
+		mnt->mnt_flags |= MNT_READONLY | MNT_NOATIME;
 
-		ufs->workbasedir = workpath.dentry;
-		sb->s_stack_depth = upperpath.mnt->mnt_sb->s_stack_depth;
+		ofs->lower_layers[ofs->numlower].mnt = mnt;
+		ofs->lower_layers[ofs->numlower].pseudo_dev = dev;
+		ofs->numlower++;
+
+		/* Check if all lower layers are on same sb */
+		if (i == 0)
+			ofs->same_sb = mnt->mnt_sb;
+		else if (ofs->same_sb != mnt->mnt_sb)
+			ofs->same_sb = NULL;
 	}
+	err = 0;
+out:
+	return err;
+}
+
+static struct ovl_entry *ovl_get_lowerstack(struct super_block *sb,
+					    struct ovl_fs *ofs)
+{
+	int err;
+	char *lowertmp, *lower;
+	struct path *stack = NULL;
+	unsigned int stacklen, numlower = 0, i;
+	bool remote = false;
+	struct ovl_entry *oe;
+
 	err = -ENOMEM;
-	lowertmp = kstrdup(ufs->config.lowerdir, GFP_KERNEL);
+	lowertmp = kstrdup(ofs->config.lowerdir, GFP_KERNEL);
 	if (!lowertmp)
-		goto out_unlock_workdentry;
+		goto out_err;
 
 	err = -EINVAL;
 	stacklen = ovl_split_lowerdirs(lowertmp);
 	if (stacklen > OVL_MAX_STACK) {
 		pr_err("overlayfs: too many lower directories, limit is %d\n",
 		       OVL_MAX_STACK);
-		goto out_free_lowertmp;
-	} else if (!ufs->config.upperdir && stacklen == 1) {
+		goto out_err;
+	} else if (!ofs->config.upperdir && stacklen == 1) {
 		pr_err("overlayfs: at least 2 lowerdir are needed while upperdir nonexistent\n");
-		goto out_free_lowertmp;
+		goto out_err;
 	}
 
 	err = -ENOMEM;
 	stack = kcalloc(stacklen, sizeof(struct path), GFP_KERNEL);
 	if (!stack)
-		goto out_free_lowertmp;
+		goto out_err;
 
 	err = -EINVAL;
 	lower = lowertmp;
 	for (numlower = 0; numlower < stacklen; numlower++) {
-		err = ovl_lower_dir(lower, &stack[numlower], ufs,
+		err = ovl_lower_dir(lower, &stack[numlower], ofs,
 				    &sb->s_stack_depth, &remote);
 		if (err)
-			goto out_put_lowerpath;
+			goto out_err;
 
 		lower = strchr(lower, '\0') + 1;
 	}
@@ -957,190 +1107,144 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
 	sb->s_stack_depth++;
 	if (sb->s_stack_depth > FILESYSTEM_MAX_STACK_DEPTH) {
 		pr_err("overlayfs: maximum fs stacking depth exceeded\n");
-		goto out_put_lowerpath;
+		goto out_err;
 	}
 
-	if (ufs->config.upperdir) {
-		ufs->upper_mnt = clone_private_mount(&upperpath);
-		err = PTR_ERR(ufs->upper_mnt);
-		if (IS_ERR(ufs->upper_mnt)) {
-			pr_err("overlayfs: failed to clone upperpath\n");
-			goto out_put_lowerpath;
-		}
+	err = ovl_get_lower_layers(ofs, stack, numlower);
+	if (err)
+		goto out_err;
+
+	err = -ENOMEM;
+	oe = ovl_alloc_entry(numlower);
+	if (!oe)
+		goto out_err;
+
+	for (i = 0; i < numlower; i++) {
+		oe->lowerstack[i].dentry = dget(stack[i].dentry);
+		oe->lowerstack[i].layer = &ofs->lower_layers[i];
+	}
 
-		/* Don't inherit atime flags */
-		ufs->upper_mnt->mnt_flags &= ~(MNT_NOATIME | MNT_NODIRATIME | MNT_RELATIME);
+	if (remote)
+		sb->s_d_op = &ovl_reval_dentry_operations;
+	else
+		sb->s_d_op = &ovl_dentry_operations;
 
-		sb->s_time_gran = ufs->upper_mnt->mnt_sb->s_time_gran;
+out:
+	for (i = 0; i < numlower; i++)
+		path_put(&stack[i]);
+	kfree(stack);
+	kfree(lowertmp);
 
-		ufs->workdir = ovl_workdir_create(sb, ufs, workpath.dentry,
-						  OVL_WORKDIR_NAME, false);
-		/*
-		 * Upper should support d_type, else whiteouts are visible.
-		 * Given workdir and upper are on same fs, we can do
-		 * iterate_dir() on workdir. This check requires successful
-		 * creation of workdir in previous step.
-		 */
-		if (ufs->workdir) {
-			struct dentry *temp;
-
-			err = ovl_check_d_type_supported(&workpath);
-			if (err < 0)
-				goto out_put_workdir;
-
-			/*
-			 * We allowed this configuration and don't want to
-			 * break users over kernel upgrade. So warn instead
-			 * of erroring out.
-			 */
-			if (!err)
-				pr_warn("overlayfs: upper fs needs to support d_type.\n");
-
-			/* Check if upper/work fs supports O_TMPFILE */
-			temp = ovl_do_tmpfile(ufs->workdir, S_IFREG | 0);
-			ufs->tmpfile = !IS_ERR(temp);
-			if (ufs->tmpfile)
-				dput(temp);
-			else
-				pr_warn("overlayfs: upper fs does not support tmpfile.\n");
-
-			/*
-			 * Check if upper/work fs supports trusted.overlay.*
-			 * xattr
-			 */
-			err = ovl_do_setxattr(ufs->workdir, OVL_XATTR_OPAQUE,
-					      "0", 1, 0);
-			if (err) {
-				ufs->noxattr = true;
-				pr_warn("overlayfs: upper fs does not support xattr.\n");
-			} else {
-				vfs_removexattr(ufs->workdir, OVL_XATTR_OPAQUE);
-			}
+	return oe;
 
-			/* Check if upper/work fs supports file handles */
-			if (ufs->config.index &&
-			    !ovl_can_decode_fh(ufs->workdir->d_sb)) {
-				ufs->config.index = false;
-				pr_warn("overlayfs: upper fs does not support file handles, falling back to index=off.\n");
-			}
-		}
-	}
+out_err:
+	oe = ERR_PTR(err);
+	goto out;
+}
+
+static int ovl_fill_super(struct super_block *sb, void *data, int silent)
+{
+	struct path upperpath = { };
+	struct dentry *root_dentry;
+	struct ovl_entry *oe;
+	struct ovl_fs *ofs;
+	struct cred *cred;
+	int err;
 
 	err = -ENOMEM;
-	ufs->lower_mnt = kcalloc(numlower, sizeof(struct vfsmount *), GFP_KERNEL);
-	if (ufs->lower_mnt == NULL)
-		goto out_put_workdir;
-	for (i = 0; i < numlower; i++) {
-		struct vfsmount *mnt = clone_private_mount(&stack[i]);
+	ofs = kzalloc(sizeof(struct ovl_fs), GFP_KERNEL);
+	if (!ofs)
+		goto out;
 
-		err = PTR_ERR(mnt);
-		if (IS_ERR(mnt)) {
-			pr_err("overlayfs: failed to clone lowerpath\n");
-			goto out_put_lower_mnt;
-		}
-		/*
-		 * Make lower_mnt R/O.  That way fchmod/fchown on lower file
-		 * will fail instead of modifying lower fs.
-		 */
-		mnt->mnt_flags |= MNT_READONLY | MNT_NOATIME;
+	ofs->creator_cred = cred = prepare_creds();
+	if (!cred)
+		goto out_err;
 
-		ufs->lower_mnt[ufs->numlower] = mnt;
-		ufs->numlower++;
+	ofs->config.redirect_dir = ovl_redirect_dir_def;
+	ofs->config.index = ovl_index_def;
+	err = ovl_parse_opt((char *) data, &ofs->config);
+	if (err)
+		goto out_err;
 
-		/* Check if all lower layers are on same sb */
-		if (i == 0)
-			ufs->same_sb = mnt->mnt_sb;
-		else if (ufs->same_sb != mnt->mnt_sb)
-			ufs->same_sb = NULL;
+	err = -EINVAL;
+	if (!ofs->config.lowerdir) {
+		if (!silent)
+			pr_err("overlayfs: missing 'lowerdir'\n");
+		goto out_err;
 	}
 
-	/* If the upper fs is nonexistent, we mark overlayfs r/o too */
-	if (!ufs->upper_mnt)
-		sb->s_flags |= MS_RDONLY;
-	else if (ufs->upper_mnt->mnt_sb != ufs->same_sb)
-		ufs->same_sb = NULL;
-
-	if (!(ovl_force_readonly(ufs)) && ufs->config.index) {
-		/* Verify lower root is upper root origin */
-		err = ovl_verify_origin(upperpath.dentry, ufs->lower_mnt[0],
-					stack[0].dentry, false, true);
-		if (err) {
-			pr_err("overlayfs: failed to verify upper root origin\n");
-			goto out_put_lower_mnt;
+	sb->s_stack_depth = 0;
+	sb->s_maxbytes = MAX_LFS_FILESIZE;
+	if (ofs->config.upperdir) {
+		if (!ofs->config.workdir) {
+			pr_err("overlayfs: missing 'workdir'\n");
+			goto out_err;
 		}
 
-		ufs->indexdir = ovl_workdir_create(sb, ufs, workpath.dentry,
-						   OVL_INDEXDIR_NAME, true);
-		if (ufs->indexdir) {
-			/* Verify upper root is index dir origin */
-			err = ovl_verify_origin(ufs->indexdir, ufs->upper_mnt,
-						upperpath.dentry, true, true);
-			if (err)
-				pr_err("overlayfs: failed to verify index dir origin\n");
+		err = ovl_get_upper(ofs, &upperpath);
+		if (err)
+			goto out_err;
 
-			/* Cleanup bad/stale/orphan index entries */
-			if (!err)
-				err = ovl_indexdir_cleanup(ufs->indexdir,
-							   ufs->upper_mnt,
-							   stack, numlower);
-		}
-		if (err || !ufs->indexdir)
-			pr_warn("overlayfs: try deleting index dir or mounting with '-o index=off' to disable inodes index.\n");
+		err = ovl_get_workdir(ofs, &upperpath);
 		if (err)
-			goto out_put_indexdir;
+			goto out_err;
+
+		if (!ofs->workdir)
+			sb->s_flags |= MS_RDONLY;
+
+		sb->s_stack_depth = ofs->upper_mnt->mnt_sb->s_stack_depth;
+		sb->s_time_gran = ofs->upper_mnt->mnt_sb->s_time_gran;
+
 	}
+	oe = ovl_get_lowerstack(sb, ofs);
+	err = PTR_ERR(oe);
+	if (IS_ERR(oe))
+		goto out_err;
 
-	/* Show index=off/on in /proc/mounts for any of the reasons above */
-	if (!ufs->indexdir)
-		ufs->config.index = false;
+	/* If the upper fs is nonexistent, we mark overlayfs r/o too */
+	if (!ofs->upper_mnt)
+		sb->s_flags |= MS_RDONLY;
+	else if (ofs->upper_mnt->mnt_sb != ofs->same_sb)
+		ofs->same_sb = NULL;
 
-	if (remote)
-		sb->s_d_op = &ovl_reval_dentry_operations;
-	else
-		sb->s_d_op = &ovl_dentry_operations;
+	if (!(ovl_force_readonly(ofs)) && ofs->config.index) {
+		err = ovl_get_indexdir(ofs, oe, &upperpath);
+		if (err)
+			goto out_free_oe;
 
-	err = -ENOMEM;
-	ufs->creator_cred = cred = prepare_creds();
-	if (!cred)
-		goto out_put_indexdir;
+		if (!ofs->indexdir)
+			sb->s_flags |= MS_RDONLY;
+	}
+
+	/* Show index=off/on in /proc/mounts for any of the reasons above */
+	if (!ofs->indexdir)
+		ofs->config.index = false;
 
 	/* Never override disk quota limits or use reserved space */
 	cap_lower(cred->cap_effective, CAP_SYS_RESOURCE);
 
-	err = -ENOMEM;
-	oe = ovl_alloc_entry(numlower);
-	if (!oe)
-		goto out_put_cred;
-
 	sb->s_magic = OVERLAYFS_SUPER_MAGIC;
 	sb->s_op = &ovl_super_operations;
 	sb->s_xattr = ovl_xattr_handlers;
-	sb->s_fs_info = ufs;
+	sb->s_fs_info = ofs;
 	sb->s_flags |= MS_POSIXACL | MS_NOREMOTELOCK;
 
+	err = -ENOMEM;
 	root_dentry = d_make_root(ovl_new_inode(sb, S_IFDIR, 0));
 	if (!root_dentry)
 		goto out_free_oe;
 
 	mntput(upperpath.mnt);
-	for (i = 0; i < numlower; i++)
-		mntput(stack[i].mnt);
-	mntput(workpath.mnt);
-	kfree(lowertmp);
-
 	if (upperpath.dentry) {
 		oe->has_upper = true;
 		if (ovl_is_impuredir(upperpath.dentry))
 			ovl_set_flag(OVL_IMPURE, d_inode(root_dentry));
 	}
-	for (i = 0; i < numlower; i++) {
-		oe->lowerstack[i].dentry = stack[i].dentry;
-		oe->lowerstack[i].mnt = ufs->lower_mnt[i];
-	}
-	kfree(stack);
 
 	root_dentry->d_fsdata = oe;
 
+	/* Root is always merge -> can have whiteouts */
+	ovl_set_flag(OVL_WHITEOUTS, d_inode(root_dentry));
 	ovl_inode_init(d_inode(root_dentry), upperpath.dentry,
 		       ovl_dentry_lower(root_dentry));
 
@@ -1149,39 +1253,11 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
 	return 0;
 
 out_free_oe:
+	ovl_entry_stack_free(oe);
 	kfree(oe);
-out_put_cred:
-	put_cred(ufs->creator_cred);
-out_put_indexdir:
-	dput(ufs->indexdir);
-out_put_lower_mnt:
-	for (i = 0; i < ufs->numlower; i++)
-		mntput(ufs->lower_mnt[i]);
-	kfree(ufs->lower_mnt);
-out_put_workdir:
-	dput(ufs->workdir);
-	mntput(ufs->upper_mnt);
-out_put_lowerpath:
-	for (i = 0; i < numlower; i++)
-		path_put(&stack[i]);
-	kfree(stack);
-out_free_lowertmp:
-	kfree(lowertmp);
-out_unlock_workdentry:
-	if (ufs->workdir_locked)
-		ovl_inuse_unlock(workpath.dentry);
-out_put_workpath:
-	path_put(&workpath);
-out_unlock_upperdentry:
-	if (ufs->upperdir_locked)
-		ovl_inuse_unlock(upperpath.dentry);
-out_put_upperpath:
+out_err:
 	path_put(&upperpath);
-out_free_config:
-	kfree(ufs->config.lowerdir);
-	kfree(ufs->config.upperdir);
-	kfree(ufs->config.workdir);
-	kfree(ufs);
+	ovl_free_fs(ofs);
 out:
 	return err;
 }
diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c
index b9b239fa5cfd..d6bb1c9f5e7a 100644
--- a/fs/overlayfs/util.c
+++ b/fs/overlayfs/util.c
@@ -17,7 +17,6 @@
 #include <linux/namei.h>
 #include <linux/ratelimit.h>
 #include "overlayfs.h"
-#include "ovl_entry.h"
 
 int ovl_want_write(struct dentry *dentry)
 {
@@ -125,7 +124,12 @@ void ovl_path_lower(struct dentry *dentry, struct path *path)
 {
 	struct ovl_entry *oe = dentry->d_fsdata;
 
-	*path = oe->numlower ? oe->lowerstack[0] : (struct path) { };
+	if (oe->numlower) {
+		path->mnt = oe->lowerstack[0].layer->mnt;
+		path->dentry = oe->lowerstack[0].dentry;
+	} else {
+		*path = (struct path) { };
+	}
 }
 
 enum ovl_path_type ovl_path_real(struct dentry *dentry, struct path *path)
@@ -329,6 +333,19 @@ void ovl_copy_up_end(struct dentry *dentry)
 	mutex_unlock(&OVL_I(d_inode(dentry))->lock);
 }
 
+bool ovl_check_origin_xattr(struct dentry *dentry)
+{
+	int res;
+
+	res = vfs_getxattr(dentry, OVL_XATTR_ORIGIN, NULL, 0);
+
+	/* Zero size value means "copied up but origin unknown" */
+	if (res >= 0)
+		return true;
+
+	return false;
+}
+
 bool ovl_check_dir_xattr(struct dentry *dentry, const char *name)
 {
 	int res;
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 6744bd706ecf..339e4c1c044d 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -26,7 +26,7 @@
 
 void task_mem(struct seq_file *m, struct mm_struct *mm)
 {
-	unsigned long text, lib, swap, ptes, pmds, anon, file, shmem;
+	unsigned long text, lib, swap, anon, file, shmem;
 	unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss;
 
 	anon = get_mm_counter(mm, MM_ANONPAGES);
@@ -50,8 +50,6 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
 	text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10;
 	lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text;
 	swap = get_mm_counter(mm, MM_SWAPENTS);
-	ptes = PTRS_PER_PTE * sizeof(pte_t) * atomic_long_read(&mm->nr_ptes);
-	pmds = PTRS_PER_PMD * sizeof(pmd_t) * mm_nr_pmds(mm);
 	seq_printf(m,
 		"VmPeak:\t%8lu kB\n"
 		"VmSize:\t%8lu kB\n"
@@ -67,7 +65,6 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
 		"VmExe:\t%8lu kB\n"
 		"VmLib:\t%8lu kB\n"
 		"VmPTE:\t%8lu kB\n"
-		"VmPMD:\t%8lu kB\n"
 		"VmSwap:\t%8lu kB\n",
 		hiwater_vm << (PAGE_SHIFT-10),
 		total_vm << (PAGE_SHIFT-10),
@@ -80,8 +77,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
 		shmem << (PAGE_SHIFT-10),
 		mm->data_vm << (PAGE_SHIFT-10),
 		mm->stack_vm << (PAGE_SHIFT-10), text, lib,
-		ptes >> 10,
-		pmds >> 10,
+		mm_pgtables_bytes(mm) >> 10,
 		swap << (PAGE_SHIFT-10));
 	hugetlb_report_usage(m, mm);
 }
@@ -665,6 +661,7 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
 		[ilog2(VM_ACCOUNT)]	= "ac",
 		[ilog2(VM_NORESERVE)]	= "nr",
 		[ilog2(VM_HUGETLB)]	= "ht",
+		[ilog2(VM_SYNC)]	= "sf",
 		[ilog2(VM_ARCH_1)]	= "ar",
 		[ilog2(VM_WIPEONFORK)]	= "wf",
 		[ilog2(VM_DONTDUMP)]	= "dd",
diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
index 086e491faf04..423159abd501 100644
--- a/fs/pstore/platform.c
+++ b/fs/pstore/platform.c
@@ -651,7 +651,7 @@ static int pstore_write_user_compat(struct pstore_record *record,
 		return -EINVAL;
 
 	record->buf = memdup_user(buf, record->size);
-	if (unlikely(IS_ERR(record->buf))) {
+	if (IS_ERR(record->buf)) {
 		ret = PTR_ERR(record->buf);
 		goto out;
 	}
diff --git a/fs/read_write.c b/fs/read_write.c
index 0046d72efe94..f8547b82dfb3 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -635,27 +635,6 @@ SYSCALL_DEFINE4(pwrite64, unsigned int, fd, const char __user *, buf,
 	return ret;
 }
 
-/*
- * Reduce an iovec's length in-place.  Return the resulting number of segments
- */
-unsigned long iov_shorten(struct iovec *iov, unsigned long nr_segs, size_t to)
-{
-	unsigned long seg = 0;
-	size_t len = 0;
-
-	while (seg < nr_segs) {
-		seg++;
-		if (len + iov->iov_len >= to) {
-			iov->iov_len = to - len;
-			break;
-		}
-		len += iov->iov_len;
-		iov++;
-	}
-	return seg;
-}
-EXPORT_SYMBOL(iov_shorten);
-
 static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter,
 		loff_t *ppos, int type, rwf_t flags)
 {
diff --git a/fs/select.c b/fs/select.c
index 063067e606ca..6de493bb42a4 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -292,8 +292,7 @@ static int poll_select_copy_remaining(struct timespec64 *end_time,
 				      void __user *p,
 				      int timeval, int ret)
 {
-	struct timespec64 rts64;
-	struct timespec rts;
+	struct timespec64 rts;
 	struct timeval rtv;
 
 	if (!p)
@@ -306,23 +305,22 @@ static int poll_select_copy_remaining(struct timespec64 *end_time,
 	if (!end_time->tv_sec && !end_time->tv_nsec)
 		return ret;
 
-	ktime_get_ts64(&rts64);
-	rts64 = timespec64_sub(*end_time, rts64);
-	if (rts64.tv_sec < 0)
-		rts64.tv_sec = rts64.tv_nsec = 0;
+	ktime_get_ts64(&rts);
+	rts = timespec64_sub(*end_time, rts);
+	if (rts.tv_sec < 0)
+		rts.tv_sec = rts.tv_nsec = 0;
 
-	rts = timespec64_to_timespec(rts64);
 
 	if (timeval) {
 		if (sizeof(rtv) > sizeof(rtv.tv_sec) + sizeof(rtv.tv_usec))
 			memset(&rtv, 0, sizeof(rtv));
-		rtv.tv_sec = rts64.tv_sec;
-		rtv.tv_usec = rts64.tv_nsec / NSEC_PER_USEC;
+		rtv.tv_sec = rts.tv_sec;
+		rtv.tv_usec = rts.tv_nsec / NSEC_PER_USEC;
 
 		if (!copy_to_user(p, &rtv, sizeof(rtv)))
 			return ret;
 
-	} else if (!copy_to_user(p, &rts, sizeof(rts)))
+	} else if (!put_timespec64(&rts, p))
 		return ret;
 
 	/*
@@ -705,17 +703,15 @@ static long do_pselect(int n, fd_set __user *inp, fd_set __user *outp,
 		       const sigset_t __user *sigmask, size_t sigsetsize)
 {
 	sigset_t ksigmask, sigsaved;
-	struct timespec ts;
-	struct timespec64 ts64, end_time, *to = NULL;
+	struct timespec64 ts, end_time, *to = NULL;
 	int ret;
 
 	if (tsp) {
-		if (copy_from_user(&ts, tsp, sizeof(ts)))
+		if (get_timespec64(&ts, tsp))
 			return -EFAULT;
-		ts64 = timespec_to_timespec64(ts);
 
 		to = &end_time;
-		if (poll_select_set_timeout(to, ts64.tv_sec, ts64.tv_nsec))
+		if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec))
 			return -EINVAL;
 	}
 
@@ -1052,12 +1048,11 @@ SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds, unsigned int, nfds,
 		size_t, sigsetsize)
 {
 	sigset_t ksigmask, sigsaved;
-	struct timespec ts;
-	struct timespec64 end_time, *to = NULL;
+	struct timespec64 ts, end_time, *to = NULL;
 	int ret;
 
 	if (tsp) {
-		if (copy_from_user(&ts, tsp, sizeof(ts)))
+		if (get_timespec64(&ts, tsp))
 			return -EFAULT;
 
 		to = &end_time;
@@ -1103,10 +1098,10 @@ SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds, unsigned int, nfds,
 #define __COMPAT_NFDBITS       (8 * sizeof(compat_ulong_t))
 
 static
-int compat_poll_select_copy_remaining(struct timespec *end_time, void __user *p,
+int compat_poll_select_copy_remaining(struct timespec64 *end_time, void __user *p,
 				      int timeval, int ret)
 {
-	struct timespec ts;
+	struct timespec64 ts;
 
 	if (!p)
 		return ret;
@@ -1118,8 +1113,8 @@ int compat_poll_select_copy_remaining(struct timespec *end_time, void __user *p,
 	if (!end_time->tv_sec && !end_time->tv_nsec)
 		return ret;
 
-	ktime_get_ts(&ts);
-	ts = timespec_sub(*end_time, ts);
+	ktime_get_ts64(&ts);
+	ts = timespec64_sub(*end_time, ts);
 	if (ts.tv_sec < 0)
 		ts.tv_sec = ts.tv_nsec = 0;
 
@@ -1132,12 +1127,7 @@ int compat_poll_select_copy_remaining(struct timespec *end_time, void __user *p,
 		if (!copy_to_user(p, &rtv, sizeof(rtv)))
 			return ret;
 	} else {
-		struct compat_timespec rts;
-
-		rts.tv_sec = ts.tv_sec;
-		rts.tv_nsec = ts.tv_nsec;
-
-		if (!copy_to_user(p, &rts, sizeof(rts)))
+		if (!compat_put_timespec64(&ts, p))
 			return ret;
 	}
 	/*
@@ -1195,7 +1185,7 @@ int compat_set_fd_set(unsigned long nr, compat_ulong_t __user *ufdset,
  */
 static int compat_core_sys_select(int n, compat_ulong_t __user *inp,
 	compat_ulong_t __user *outp, compat_ulong_t __user *exp,
-	struct timespec *end_time)
+	struct timespec64 *end_time)
 {
 	fd_set_bits fds;
 	void *bits;
@@ -1268,7 +1258,7 @@ COMPAT_SYSCALL_DEFINE5(select, int, n, compat_ulong_t __user *, inp,
 	compat_ulong_t __user *, outp, compat_ulong_t __user *, exp,
 	struct compat_timeval __user *, tvp)
 {
-	struct timespec end_time, *to = NULL;
+	struct timespec64 end_time, *to = NULL;
 	struct compat_timeval tv;
 	int ret;
 
@@ -1312,14 +1302,12 @@ static long do_compat_pselect(int n, compat_ulong_t __user *inp,
 	struct compat_timespec __user *tsp, compat_sigset_t __user *sigmask,
 	compat_size_t sigsetsize)
 {
-	compat_sigset_t ss32;
 	sigset_t ksigmask, sigsaved;
-	struct compat_timespec ts;
-	struct timespec end_time, *to = NULL;
+	struct timespec64 ts, end_time, *to = NULL;
 	int ret;
 
 	if (tsp) {
-		if (copy_from_user(&ts, tsp, sizeof(ts)))
+		if (compat_get_timespec64(&ts, tsp))
 			return -EFAULT;
 
 		to = &end_time;
@@ -1330,9 +1318,8 @@ static long do_compat_pselect(int n, compat_ulong_t __user *inp,
 	if (sigmask) {
 		if (sigsetsize != sizeof(compat_sigset_t))
 			return -EINVAL;
-		if (copy_from_user(&ss32, sigmask, sizeof(ss32)))
+		if (get_compat_sigset(&ksigmask, sigmask))
 			return -EFAULT;
-		sigset_from_compat(&ksigmask, &ss32);
 
 		sigdelsetmask(&ksigmask, sigmask(SIGKILL)|sigmask(SIGSTOP));
 		sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
@@ -1381,14 +1368,12 @@ COMPAT_SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds,
 	unsigned int,  nfds, struct compat_timespec __user *, tsp,
 	const compat_sigset_t __user *, sigmask, compat_size_t, sigsetsize)
 {
-	compat_sigset_t ss32;
 	sigset_t ksigmask, sigsaved;
-	struct compat_timespec ts;
-	struct timespec end_time, *to = NULL;
+	struct timespec64 ts, end_time, *to = NULL;
 	int ret;
 
 	if (tsp) {
-		if (copy_from_user(&ts, tsp, sizeof(ts)))
+		if (compat_get_timespec64(&ts, tsp))
 			return -EFAULT;
 
 		to = &end_time;
@@ -1399,9 +1384,8 @@ COMPAT_SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds,
 	if (sigmask) {
 		if (sigsetsize != sizeof(compat_sigset_t))
 			return -EINVAL;
-		if (copy_from_user(&ss32, sigmask, sizeof(ss32)))
+		if (get_compat_sigset(&ksigmask, sigmask))
 			return -EFAULT;
-		sigset_from_compat(&ksigmask, &ss32);
 
 		sigdelsetmask(&ksigmask, sigmask(SIGKILL)|sigmask(SIGSTOP));
 		sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
diff --git a/fs/signalfd.c b/fs/signalfd.c
index 1c667af86da5..5f1ff8756595 100644
--- a/fs/signalfd.c
+++ b/fs/signalfd.c
@@ -313,15 +313,13 @@ COMPAT_SYSCALL_DEFINE4(signalfd4, int, ufd,
 		     compat_size_t, sigsetsize,
 		     int, flags)
 {
-	compat_sigset_t ss32;
 	sigset_t tmp;
 	sigset_t __user *ksigmask;
 
 	if (sigsetsize != sizeof(compat_sigset_t))
 		return -EINVAL;
-	if (copy_from_user(&ss32, sigmask, sizeof(ss32)))
+	if (get_compat_sigset(&tmp, sigmask))
 		return -EFAULT;
-	sigset_from_compat(&tmp, &ss32);
 	ksigmask = compat_alloc_user_space(sizeof(sigset_t));
 	if (copy_to_user(ksigmask, &tmp, sizeof(sigset_t)))
 		return -EFAULT;
diff --git a/fs/statfs.c b/fs/statfs.c
index c25dd9a26cc1..b072a8bab71a 100644
--- a/fs/statfs.c
+++ b/fs/statfs.c
@@ -217,7 +217,7 @@ SYSCALL_DEFINE3(fstatfs64, unsigned int, fd, size_t, sz, struct statfs64 __user
 	return error;
 }
 
-int vfs_ustat(dev_t dev, struct kstatfs *sbuf)
+static int vfs_ustat(dev_t dev, struct kstatfs *sbuf)
 {
 	struct super_block *s = user_get_super(dev);
 	int err;
diff --git a/fs/super.c b/fs/super.c
index 994db21f59bf..d4e33e8f1e6f 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -155,21 +155,19 @@ static void destroy_super_rcu(struct rcu_head *head)
 	schedule_work(&s->destroy_work);
 }
 
-/**
- *	destroy_super	-	frees a superblock
- *	@s: superblock to free
- *
- *	Frees a superblock.
- */
-static void destroy_super(struct super_block *s)
+/* Free a superblock that has never been seen by anyone */
+static void destroy_unused_super(struct super_block *s)
 {
+	if (!s)
+		return;
+	up_write(&s->s_umount);
 	list_lru_destroy(&s->s_dentry_lru);
 	list_lru_destroy(&s->s_inode_lru);
 	security_sb_free(s);
-	WARN_ON(!list_empty(&s->s_mounts));
 	put_user_ns(s->s_user_ns);
 	kfree(s->s_subtype);
-	call_rcu(&s->rcu, destroy_super_rcu);
+	/* no delays needed */
+	destroy_super_work(&s->destroy_work);
 }
 
 /**
@@ -257,7 +255,7 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags,
 	return s;
 
 fail:
-	destroy_super(s);
+	destroy_unused_super(s);
 	return NULL;
 }
 
@@ -266,11 +264,17 @@ fail:
 /*
  * Drop a superblock's refcount.  The caller must hold sb_lock.
  */
-static void __put_super(struct super_block *sb)
+static void __put_super(struct super_block *s)
 {
-	if (!--sb->s_count) {
-		list_del_init(&sb->s_list);
-		destroy_super(sb);
+	if (!--s->s_count) {
+		list_del_init(&s->s_list);
+		WARN_ON(s->s_dentry_lru.node);
+		WARN_ON(s->s_inode_lru.node);
+		WARN_ON(!list_empty(&s->s_mounts));
+		security_sb_free(s);
+		put_user_ns(s->s_user_ns);
+		kfree(s->s_subtype);
+		call_rcu(&s->rcu, destroy_super_rcu);
 	}
 }
 
@@ -485,19 +489,12 @@ retry:
 				continue;
 			if (user_ns != old->s_user_ns) {
 				spin_unlock(&sb_lock);
-				if (s) {
-					up_write(&s->s_umount);
-					destroy_super(s);
-				}
+				destroy_unused_super(s);
 				return ERR_PTR(-EBUSY);
 			}
 			if (!grab_super(old))
 				goto retry;
-			if (s) {
-				up_write(&s->s_umount);
-				destroy_super(s);
-				s = NULL;
-			}
+			destroy_unused_super(s);
 			return old;
 		}
 	}
@@ -512,8 +509,7 @@ retry:
 	err = set(s, data);
 	if (err) {
 		spin_unlock(&sb_lock);
-		up_write(&s->s_umount);
-		destroy_super(s);
+		destroy_unused_super(s);
 		return ERR_PTR(err);
 	}
 	s->s_type = type;
diff --git a/fs/sync.c b/fs/sync.c
index 83ac79a960dd..6e0a2cbaf6de 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -109,7 +109,7 @@ SYSCALL_DEFINE0(sync)
 {
 	int nowait = 0, wait = 1;
 
-	wakeup_flusher_threads(0, WB_REASON_SYNC);
+	wakeup_flusher_threads(WB_REASON_SYNC);
 	iterate_supers(sync_inodes_one_sb, NULL);
 	iterate_supers(sync_fs_one_sb, &nowait);
 	iterate_supers(sync_fs_one_sb, &wait);
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index f46d133c0949..ac9a4e65ca49 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -668,7 +668,7 @@ int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs)
 		ctx->features = octx->features;
 		ctx->released = false;
 		ctx->mm = vma->vm_mm;
-		atomic_inc(&ctx->mm->mm_count);
+		mmgrab(ctx->mm);
 
 		userfaultfd_ctx_get(octx);
 		fctx->orig = octx;
diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h
index 758f37ac5ad3..4b87472f35bc 100644
--- a/fs/xfs/kmem.h
+++ b/fs/xfs/kmem.h
@@ -104,7 +104,7 @@ kmem_zone_init(int size, char *zone_name)
 }
 
 static inline kmem_zone_t *
-kmem_zone_init_flags(int size, char *zone_name, unsigned long flags,
+kmem_zone_init_flags(int size, char *zone_name, slab_flags_t flags,
 		     void (*construct)(void *))
 {
 	return kmem_cache_create(zone_name, size, 0, flags, construct);
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 18146873a8b3..8601275cc5e6 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -44,6 +44,7 @@
 #include <linux/falloc.h>
 #include <linux/pagevec.h>
 #include <linux/backing-dev.h>
+#include <linux/mman.h>
 
 static const struct vm_operations_struct xfs_file_vm_ops;
 
@@ -1045,7 +1046,11 @@ __xfs_filemap_fault(
 
 	xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
 	if (IS_DAX(inode)) {
-		ret = dax_iomap_fault(vmf, pe_size, &xfs_iomap_ops);
+		pfn_t pfn;
+
+		ret = dax_iomap_fault(vmf, pe_size, &pfn, &xfs_iomap_ops);
+		if (ret & VM_FAULT_NEEDDSYNC)
+			ret = dax_finish_sync_fault(vmf, pe_size, pfn);
 	} else {
 		if (write_fault)
 			ret = iomap_page_mkwrite(vmf, &xfs_iomap_ops);
@@ -1090,37 +1095,16 @@ xfs_filemap_page_mkwrite(
 }
 
 /*
- * pfn_mkwrite was originally inteneded to ensure we capture time stamp
- * updates on write faults. In reality, it's need to serialise against
- * truncate similar to page_mkwrite. Hence we cycle the XFS_MMAPLOCK_SHARED
- * to ensure we serialise the fault barrier in place.
+ * pfn_mkwrite was originally intended to ensure we capture time stamp updates
+ * on write faults. In reality, it needs to serialise against truncate and
+ * prepare memory for writing so handle is as standard write fault.
  */
 static int
 xfs_filemap_pfn_mkwrite(
 	struct vm_fault		*vmf)
 {
 
-	struct inode		*inode = file_inode(vmf->vma->vm_file);
-	struct xfs_inode	*ip = XFS_I(inode);
-	int			ret = VM_FAULT_NOPAGE;
-	loff_t			size;
-
-	trace_xfs_filemap_pfn_mkwrite(ip);
-
-	sb_start_pagefault(inode->i_sb);
-	file_update_time(vmf->vma->vm_file);
-
-	/* check if the faulting page hasn't raced with truncate */
-	xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
-	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
-	if (vmf->pgoff >= size)
-		ret = VM_FAULT_SIGBUS;
-	else if (IS_DAX(inode))
-		ret = dax_iomap_fault(vmf, PE_SIZE_PTE, &xfs_iomap_ops);
-	xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
-	sb_end_pagefault(inode->i_sb);
-	return ret;
-
+	return __xfs_filemap_fault(vmf, PE_SIZE_PTE, true);
 }
 
 static const struct vm_operations_struct xfs_file_vm_ops = {
@@ -1136,6 +1120,13 @@ xfs_file_mmap(
 	struct file	*filp,
 	struct vm_area_struct *vma)
 {
+	/*
+	 * We don't support synchronous mappings for non-DAX files. At least
+	 * until someone comes with a sensible use case.
+	 */
+	if (!IS_DAX(file_inode(filp)) && (vma->vm_flags & VM_SYNC))
+		return -EOPNOTSUPP;
+
 	file_accessed(filp);
 	vma->vm_ops = &xfs_file_vm_ops;
 	if (IS_DAX(file_inode(filp)))
@@ -1154,6 +1145,7 @@ const struct file_operations xfs_file_operations = {
 	.compat_ioctl	= xfs_file_compat_ioctl,
 #endif
 	.mmap		= xfs_file_mmap,
+	.mmap_supported_flags = MAP_SYNC,
 	.open		= xfs_file_open,
 	.release	= xfs_file_release,
 	.fsync		= xfs_file_fsync,
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 18077e2189a9..33eb4fb2e3fd 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -34,6 +34,7 @@
 #include "xfs_error.h"
 #include "xfs_trans.h"
 #include "xfs_trans_space.h"
+#include "xfs_inode_item.h"
 #include "xfs_iomap.h"
 #include "xfs_trace.h"
 #include "xfs_icache.h"
@@ -1089,6 +1090,10 @@ xfs_file_iomap_begin(
 		trace_xfs_iomap_found(ip, offset, length, 0, &imap);
 	}
 
+	if (xfs_ipincount(ip) && (ip->i_itemp->ili_fsync_fields
+				& ~XFS_ILOG_TIMESTAMP))
+		iomap->flags |= IOMAP_F_DIRTY;
+
 	xfs_bmbt_to_iomap(ip, iomap, &imap);
 
 	if (shared)
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 515ba042d75c..d718a10c2271 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -654,8 +654,6 @@ DEFINE_INODE_EVENT(xfs_inode_set_cowblocks_tag);
 DEFINE_INODE_EVENT(xfs_inode_clear_cowblocks_tag);
 DEFINE_INODE_EVENT(xfs_inode_free_cowblocks_invalid);
 
-DEFINE_INODE_EVENT(xfs_filemap_pfn_mkwrite);
-
 TRACE_EVENT(xfs_filemap_fault,
 	TP_PROTO(struct xfs_inode *ip, enum page_entry_size pe_size,
 		 bool write_fault),