summaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/afs/addr_list.c2
-rw-r--r--fs/afs/cmservice.c14
-rw-r--r--fs/afs/fs_probe.c2
-rw-r--r--fs/afs/internal.h14
-rw-r--r--fs/afs/rxrpc.c74
-rw-r--r--fs/block_dev.c20
-rw-r--r--fs/btrfs/Makefile2
-rw-r--r--fs/btrfs/async-thread.c8
-rw-r--r--fs/btrfs/async-thread.h1
-rw-r--r--fs/btrfs/backref.c185
-rw-r--r--fs/btrfs/backref.h4
-rw-r--r--fs/btrfs/block-group.c91
-rw-r--r--fs/btrfs/block-rsv.c105
-rw-r--r--fs/btrfs/block-rsv.h12
-rw-r--r--fs/btrfs/btrfs_inode.h6
-rw-r--r--fs/btrfs/check-integrity.c200
-rw-r--r--fs/btrfs/check-integrity.h2
-rw-r--r--fs/btrfs/ctree.c74
-rw-r--r--fs/btrfs/ctree.h82
-rw-r--r--fs/btrfs/delalloc-space.c106
-rw-r--r--fs/btrfs/delayed-inode.c24
-rw-r--r--fs/btrfs/delayed-inode.h2
-rw-r--r--fs/btrfs/delayed-ref.c3
-rw-r--r--fs/btrfs/dev-replace.c44
-rw-r--r--fs/btrfs/disk-io.c918
-rw-r--r--fs/btrfs/disk-io.h34
-rw-r--r--fs/btrfs/export.c32
-rw-r--r--fs/btrfs/export.h5
-rw-r--r--fs/btrfs/extent-io-tree.h7
-rw-r--r--fs/btrfs/extent-tree.c493
-rw-r--r--fs/btrfs/extent_io.c204
-rw-r--r--fs/btrfs/extent_io.h11
-rw-r--r--fs/btrfs/extent_map.c11
-rw-r--r--fs/btrfs/file-item.c131
-rw-r--r--fs/btrfs/file.c80
-rw-r--r--fs/btrfs/free-space-cache.c43
-rw-r--r--fs/btrfs/free-space-tree.c4
-rw-r--r--fs/btrfs/inode-map.c2
-rw-r--r--fs/btrfs/inode.c188
-rw-r--r--fs/btrfs/ioctl.c1050
-rw-r--r--fs/btrfs/locking.c135
-rw-r--r--fs/btrfs/locking.h20
-rw-r--r--fs/btrfs/ordered-data.c147
-rw-r--r--fs/btrfs/ordered-data.h10
-rw-r--r--fs/btrfs/props.c2
-rw-r--r--fs/btrfs/qgroup.c41
-rw-r--r--fs/btrfs/qgroup.h1
-rw-r--r--fs/btrfs/raid56.c41
-rw-r--r--fs/btrfs/rcu-string.h2
-rw-r--r--fs/btrfs/ref-verify.c14
-rw-r--r--fs/btrfs/reflink.c804
-rw-r--r--fs/btrfs/reflink.h12
-rw-r--r--fs/btrfs/relocation.c661
-rw-r--r--fs/btrfs/root-tree.c43
-rw-r--r--fs/btrfs/scrub.c7
-rw-r--r--fs/btrfs/send.c79
-rw-r--r--fs/btrfs/space-info.c202
-rw-r--r--fs/btrfs/space-info.h7
-rw-r--r--fs/btrfs/super.c37
-rw-r--r--fs/btrfs/sysfs.c90
-rw-r--r--fs/btrfs/sysfs.h4
-rw-r--r--fs/btrfs/tests/btrfs-tests.c44
-rw-r--r--fs/btrfs/tests/qgroup-tests.c2
-rw-r--r--fs/btrfs/transaction.c115
-rw-r--r--fs/btrfs/transaction.h13
-rw-r--r--fs/btrfs/tree-log.c483
-rw-r--r--fs/btrfs/uuid-tree.c57
-rw-r--r--fs/btrfs/volumes.c774
-rw-r--r--fs/btrfs/volumes.h13
-rw-r--r--fs/buffer.c62
-rw-r--r--fs/ceph/file.c31
-rw-r--r--fs/ceph/snap.c1
-rw-r--r--fs/ceph/super.c129
-rw-r--r--fs/ceph/super.h2
-rw-r--r--fs/cifs/cifs_dfs_ref.c2
-rw-r--r--fs/cifs/cifsacl.c9
-rw-r--r--fs/cifs/cifsfs.c12
-rw-r--r--fs/cifs/cifsfs.h2
-rw-r--r--fs/cifs/cifsglob.h7
-rw-r--r--fs/cifs/cifspdu.h19
-rw-r--r--fs/cifs/cifsproto.h10
-rw-r--r--fs/cifs/cifssmb.c25
-rw-r--r--fs/cifs/connect.c91
-rw-r--r--fs/cifs/dfs_cache.c38
-rw-r--r--fs/cifs/dfs_cache.h4
-rw-r--r--fs/cifs/dir.c1
-rw-r--r--fs/cifs/file.c24
-rw-r--r--fs/cifs/inode.c67
-rw-r--r--fs/cifs/link.c4
-rw-r--r--fs/cifs/misc.c80
-rw-r--r--fs/cifs/readdir.c82
-rw-r--r--fs/cifs/smb1ops.c2
-rw-r--r--fs/cifs/smb2file.c9
-rw-r--r--fs/cifs/smb2inode.c4
-rw-r--r--fs/cifs/smb2ops.c111
-rw-r--r--fs/cifs/smb2pdu.c203
-rw-r--r--fs/cifs/smb2pdu.h138
-rw-r--r--fs/cifs/smb2proto.h7
-rw-r--r--fs/cifs/smb2transport.c8
-rw-r--r--fs/cifs/smbdirect.c41
-rw-r--r--fs/cifs/smbdirect.h1
-rw-r--r--fs/cifs/transport.c28
-rw-r--r--fs/crypto/fscrypt_private.h20
-rw-r--r--fs/crypto/keysetup.c25
-rw-r--r--fs/crypto/policy.c21
-rw-r--r--fs/dax.c11
-rw-r--r--fs/debugfs/file.c43
-rw-r--r--fs/debugfs/inode.c18
-rw-r--r--fs/ecryptfs/crypto.c6
-rw-r--r--fs/ecryptfs/ecryptfs_kernel.h2
-rw-r--r--fs/ecryptfs/keystore.c2
-rw-r--r--fs/ecryptfs/main.c2
-rw-r--r--fs/ecryptfs/messaging.c3
-rw-r--r--fs/efivarfs/super.c2
-rw-r--r--fs/erofs/decompressor.c22
-rw-r--r--fs/erofs/internal.h8
-rw-r--r--fs/erofs/super.c2
-rw-r--r--fs/erofs/utils.c90
-rw-r--r--fs/erofs/zdata.c76
-rw-r--r--fs/eventpoll.c8
-rw-r--r--fs/exec.c106
-rw-r--r--fs/ext2/inode.c5
-rw-r--r--fs/ext4/balloc.c14
-rw-r--r--fs/ext4/block_validity.c1
-rw-r--r--fs/ext4/dir.c14
-rw-r--r--fs/ext4/ext4.h44
-rw-r--r--fs/ext4/ialloc.c23
-rw-r--r--fs/ext4/inode.c30
-rw-r--r--fs/ext4/ioctl.c6
-rw-r--r--fs/ext4/mballoc.c61
-rw-r--r--fs/ext4/migrate.c27
-rw-r--r--fs/ext4/mmp.c12
-rw-r--r--fs/ext4/namei.c8
-rw-r--r--fs/ext4/page-io.c8
-rw-r--r--fs/ext4/resize.c62
-rw-r--r--fs/ext4/super.c161
-rw-r--r--fs/ext4/sysfs.c1
-rw-r--r--fs/f2fs/f2fs.h1
-rw-r--r--fs/f2fs/file.c11
-rw-r--r--fs/f2fs/super.c1
-rw-r--r--fs/fat/inode.c19
-rw-r--r--fs/fcntl.c6
-rw-r--r--fs/file.c7
-rw-r--r--fs/fuse/dev.c6
-rw-r--r--fs/fuse/fuse_i.h2
-rw-r--r--fs/gfs2/acl.c7
-rw-r--r--fs/gfs2/aops.c11
-rw-r--r--fs/gfs2/bmap.c9
-rw-r--r--fs/gfs2/dir.c3
-rw-r--r--fs/gfs2/file.c43
-rw-r--r--fs/gfs2/glock.c137
-rw-r--r--fs/gfs2/glops.c157
-rw-r--r--fs/gfs2/incore.h27
-rw-r--r--fs/gfs2/inode.c55
-rw-r--r--fs/gfs2/lock_dlm.c52
-rw-r--r--fs/gfs2/log.c288
-rw-r--r--fs/gfs2/log.h1
-rw-r--r--fs/gfs2/lops.c14
-rw-r--r--fs/gfs2/meta_io.c3
-rw-r--r--fs/gfs2/ops_fstype.c59
-rw-r--r--fs/gfs2/quota.c76
-rw-r--r--fs/gfs2/quota.h4
-rw-r--r--fs/gfs2/recovery.c12
-rw-r--r--fs/gfs2/rgrp.c88
-rw-r--r--fs/gfs2/rgrp.h4
-rw-r--r--fs/gfs2/super.c112
-rw-r--r--fs/gfs2/super.h1
-rw-r--r--fs/gfs2/sys.c5
-rw-r--r--fs/gfs2/trans.c4
-rw-r--r--fs/gfs2/util.c419
-rw-r--r--fs/gfs2/util.h76
-rw-r--r--fs/gfs2/xattr.c12
-rw-r--r--fs/inode.c1
-rw-r--r--fs/internal.h1
-rw-r--r--fs/io-wq.c429
-rw-r--r--fs/io-wq.h83
-rw-r--r--fs/io_uring.c2473
-rw-r--r--fs/jbd2/commit.c46
-rw-r--r--fs/jbd2/transaction.c26
-rw-r--r--fs/libfs.c8
-rw-r--r--fs/locks.c60
-rw-r--r--fs/nfs/Kconfig2
-rw-r--r--fs/nfs/client.c1
-rw-r--r--fs/nfs/delegation.c50
-rw-r--r--fs/nfs/delegation.h1
-rw-r--r--fs/nfs/dir.c128
-rw-r--r--fs/nfs/fs_context.c9
-rw-r--r--fs/nfs/fscache.c2
-rw-r--r--fs/nfs/getroot.c39
-rw-r--r--fs/nfs/inode.c1
-rw-r--r--fs/nfs/namespace.c2
-rw-r--r--fs/nfs/nfs4client.c1
-rw-r--r--fs/nfs/nfs4file.c1
-rw-r--r--fs/nfs/nfs4proc.c32
-rw-r--r--fs/nfs/super.c25
-rw-r--r--fs/nsfs.c14
-rw-r--r--fs/ntfs/aops.c9
-rw-r--r--fs/open.c3
-rw-r--r--fs/overlayfs/Kconfig1
-rw-r--r--fs/overlayfs/file.c6
-rw-r--r--fs/overlayfs/overlayfs.h7
-rw-r--r--fs/overlayfs/super.c9
-rw-r--r--fs/overlayfs/util.c4
-rw-r--r--fs/pipe.c18
-rw-r--r--fs/proc/base.c121
-rw-r--r--fs/proc/inode.c73
-rw-r--r--fs/proc/internal.h4
-rw-r--r--fs/proc/proc_sysctl.c45
-rw-r--r--fs/proc/root.c36
-rw-r--r--fs/pstore/inode.c5
-rw-r--r--fs/pstore/platform.c4
-rw-r--r--fs/pstore/ram.c1
-rw-r--r--fs/pstore/ram_core.c2
-rw-r--r--fs/reiserfs/journal.c5
-rw-r--r--fs/splice.c6
-rw-r--r--fs/sysfs/file.c148
-rw-r--r--fs/sysfs/group.c115
-rw-r--r--fs/ubifs/ioctl.c4
-rw-r--r--fs/xfs/xfs_aops.c2
-rw-r--r--fs/zonefs/Kconfig1
-rw-r--r--fs/zonefs/super.c36
221 files changed, 9921 insertions, 6259 deletions
diff --git a/fs/afs/addr_list.c b/fs/afs/addr_list.c
index df415c05939e..de1ae0bead3b 100644
--- a/fs/afs/addr_list.c
+++ b/fs/afs/addr_list.c
@@ -19,7 +19,7 @@
void afs_put_addrlist(struct afs_addr_list *alist)
{
if (alist && refcount_dec_and_test(&alist->usage))
- call_rcu(&alist->rcu, (rcu_callback_t)kfree);
+ kfree_rcu(alist, rcu);
}
/*
diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c
index ff3994a6be23..6765949b3aab 100644
--- a/fs/afs/cmservice.c
+++ b/fs/afs/cmservice.c
@@ -244,6 +244,17 @@ static void afs_cm_destructor(struct afs_call *call)
}
/*
+ * Abort a service call from within an action function.
+ */
+static void afs_abort_service_call(struct afs_call *call, u32 abort_code, int error,
+ const char *why)
+{
+ rxrpc_kernel_abort_call(call->net->socket, call->rxcall,
+ abort_code, error, why);
+ afs_set_call_complete(call, error, 0);
+}
+
+/*
* The server supplied a list of callbacks that it wanted to break.
*/
static void SRXAFSCB_CallBack(struct work_struct *work)
@@ -510,8 +521,7 @@ static void SRXAFSCB_ProbeUuid(struct work_struct *work)
if (memcmp(r, &call->net->uuid, sizeof(call->net->uuid)) == 0)
afs_send_empty_reply(call);
else
- rxrpc_kernel_abort_call(call->net->socket, call->rxcall,
- 1, 1, "K-1");
+ afs_abort_service_call(call, 1, 1, "K-1");
afs_put_call(call);
_leave("");
diff --git a/fs/afs/fs_probe.c b/fs/afs/fs_probe.c
index cfe62b154f68..e1b9ed679045 100644
--- a/fs/afs/fs_probe.c
+++ b/fs/afs/fs_probe.c
@@ -145,6 +145,7 @@ static int afs_do_probe_fileserver(struct afs_net *net,
read_lock(&server->fs_lock);
ac.alist = rcu_dereference_protected(server->addresses,
lockdep_is_held(&server->fs_lock));
+ afs_get_addrlist(ac.alist);
read_unlock(&server->fs_lock);
atomic_set(&server->probe_outstanding, ac.alist->nr_addrs);
@@ -163,6 +164,7 @@ static int afs_do_probe_fileserver(struct afs_net *net,
if (!in_progress)
afs_fs_probe_done(server);
+ afs_put_addrlist(ac.alist);
return in_progress;
}
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 1d81fc4c3058..ef732dd4e7ef 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -81,7 +81,7 @@ enum afs_call_state {
* List of server addresses.
*/
struct afs_addr_list {
- struct rcu_head rcu; /* Must be first */
+ struct rcu_head rcu;
refcount_t usage;
u32 version; /* Version */
unsigned char max_addrs;
@@ -154,7 +154,7 @@ struct afs_call {
};
unsigned char unmarshall; /* unmarshalling phase */
unsigned char addr_ix; /* Address in ->alist */
- bool incoming; /* T if incoming call */
+ bool drop_ref; /* T if need to drop ref for incoming call */
bool send_pages; /* T if data from mapping should be sent */
bool need_attention; /* T if RxRPC poked us */
bool async; /* T if asynchronous */
@@ -1209,8 +1209,16 @@ static inline void afs_set_call_complete(struct afs_call *call,
ok = true;
}
spin_unlock_bh(&call->state_lock);
- if (ok)
+ if (ok) {
trace_afs_call_done(call);
+
+ /* Asynchronous calls have two refs to release - one from the alloc and
+ * one queued with the work item - and we can't just deallocate the
+ * call because the work item may be queued again.
+ */
+ if (call->drop_ref)
+ afs_put_call(call);
+ }
}
/*
diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
index 58d396592250..1ecc67da6c1a 100644
--- a/fs/afs/rxrpc.c
+++ b/fs/afs/rxrpc.c
@@ -18,7 +18,6 @@ struct workqueue_struct *afs_async_calls;
static void afs_wake_up_call_waiter(struct sock *, struct rxrpc_call *, unsigned long);
static void afs_wake_up_async_call(struct sock *, struct rxrpc_call *, unsigned long);
-static void afs_delete_async_call(struct work_struct *);
static void afs_process_async_call(struct work_struct *);
static void afs_rx_new_call(struct sock *, struct rxrpc_call *, unsigned long);
static void afs_rx_discard_new_call(struct rxrpc_call *, unsigned long);
@@ -169,7 +168,7 @@ void afs_put_call(struct afs_call *call)
int n = atomic_dec_return(&call->usage);
int o = atomic_read(&net->nr_outstanding_calls);
- trace_afs_call(call, afs_call_trace_put, n + 1, o,
+ trace_afs_call(call, afs_call_trace_put, n, o,
__builtin_return_address(0));
ASSERTCMP(n, >=, 0);
@@ -402,8 +401,10 @@ void afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call, gfp_t gfp)
/* If the call is going to be asynchronous, we need an extra ref for
* the call to hold itself so the caller need not hang on to its ref.
*/
- if (call->async)
+ if (call->async) {
afs_get_call(call, afs_call_trace_get);
+ call->drop_ref = true;
+ }
/* create a call */
rxcall = rxrpc_kernel_begin_call(call->net->socket, srx, call->key,
@@ -413,7 +414,8 @@ void afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call, gfp_t gfp)
afs_wake_up_async_call :
afs_wake_up_call_waiter),
call->upgrade,
- call->intr,
+ (call->intr ? RXRPC_PREINTERRUPTIBLE :
+ RXRPC_UNINTERRUPTIBLE),
call->debug_id);
if (IS_ERR(rxcall)) {
ret = PTR_ERR(rxcall);
@@ -584,8 +586,6 @@ static void afs_deliver_to_call(struct afs_call *call)
done:
if (call->type->done)
call->type->done(call);
- if (state == AFS_CALL_COMPLETE && call->incoming)
- afs_put_call(call);
out:
_leave("");
return;
@@ -604,11 +604,7 @@ call_complete:
long afs_wait_for_call_to_complete(struct afs_call *call,
struct afs_addr_cursor *ac)
{
- signed long rtt2, timeout;
long ret;
- bool stalled = false;
- u64 rtt;
- u32 life, last_life;
bool rxrpc_complete = false;
DECLARE_WAITQUEUE(myself, current);
@@ -619,14 +615,6 @@ long afs_wait_for_call_to_complete(struct afs_call *call,
if (ret < 0)
goto out;
- rtt = rxrpc_kernel_get_rtt(call->net->socket, call->rxcall);
- rtt2 = nsecs_to_jiffies64(rtt) * 2;
- if (rtt2 < 2)
- rtt2 = 2;
-
- timeout = rtt2;
- rxrpc_kernel_check_life(call->net->socket, call->rxcall, &last_life);
-
add_wait_queue(&call->waitq, &myself);
for (;;) {
set_current_state(TASK_UNINTERRUPTIBLE);
@@ -637,37 +625,19 @@ long afs_wait_for_call_to_complete(struct afs_call *call,
call->need_attention = false;
__set_current_state(TASK_RUNNING);
afs_deliver_to_call(call);
- timeout = rtt2;
continue;
}
if (afs_check_call_state(call, AFS_CALL_COMPLETE))
break;
- if (!rxrpc_kernel_check_life(call->net->socket, call->rxcall, &life)) {
+ if (!rxrpc_kernel_check_life(call->net->socket, call->rxcall)) {
/* rxrpc terminated the call. */
rxrpc_complete = true;
break;
}
- if (call->intr && timeout == 0 &&
- life == last_life && signal_pending(current)) {
- if (stalled)
- break;
- __set_current_state(TASK_RUNNING);
- rxrpc_kernel_probe_life(call->net->socket, call->rxcall);
- timeout = rtt2;
- stalled = true;
- continue;
- }
-
- if (life != last_life) {
- timeout = rtt2;
- last_life = life;
- stalled = false;
- }
-
- timeout = schedule_timeout(timeout);
+ schedule();
}
remove_wait_queue(&call->waitq, &myself);
@@ -735,7 +705,7 @@ static void afs_wake_up_async_call(struct sock *sk, struct rxrpc_call *rxcall,
u = atomic_fetch_add_unless(&call->usage, 1, 0);
if (u != 0) {
- trace_afs_call(call, afs_call_trace_wake, u,
+ trace_afs_call(call, afs_call_trace_wake, u + 1,
atomic_read(&call->net->nr_outstanding_calls),
__builtin_return_address(0));
@@ -745,21 +715,6 @@ static void afs_wake_up_async_call(struct sock *sk, struct rxrpc_call *rxcall,
}
/*
- * Delete an asynchronous call. The work item carries a ref to the call struct
- * that we need to release.
- */
-static void afs_delete_async_call(struct work_struct *work)
-{
- struct afs_call *call = container_of(work, struct afs_call, async_work);
-
- _enter("");
-
- afs_put_call(call);
-
- _leave("");
-}
-
-/*
* Perform I/O processing on an asynchronous call. The work item carries a ref
* to the call struct that we either need to release or to pass on.
*/
@@ -774,16 +729,6 @@ static void afs_process_async_call(struct work_struct *work)
afs_deliver_to_call(call);
}
- if (call->state == AFS_CALL_COMPLETE) {
- /* We have two refs to release - one from the alloc and one
- * queued with the work item - and we can't just deallocate the
- * call because the work item may be queued again.
- */
- call->async_work.func = afs_delete_async_call;
- if (!queue_work(afs_async_calls, &call->async_work))
- afs_put_call(call);
- }
-
afs_put_call(call);
_leave("");
}
@@ -810,6 +755,7 @@ void afs_charge_preallocation(struct work_struct *work)
if (!call)
break;
+ call->drop_ref = true;
call->async = true;
call->state = AFS_CALL_SV_AWAIT_OP_ID;
init_waitqueue_head(&call->waitq);
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 69bf2fb6f7cd..9501880dff5e 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1520,10 +1520,22 @@ rescan:
if (ret)
return ret;
- if (invalidate)
- set_capacity(disk, 0);
- else if (disk->fops->revalidate_disk)
- disk->fops->revalidate_disk(disk);
+ /*
+ * Historically we only set the capacity to zero for devices that
+ * support partitions (independent of actually having partitions created).
+ * Doing that is rather inconsistent, but changing it broke legacy
+ * udisks polling for legacy ide-cdrom devices. Use the crude check
+ * below to get the sane behavior for most devices while not breaking
+ * userspace for this particular setup.
+ */
+ if (invalidate) {
+ if (disk_part_scan_enabled(disk) ||
+ !(disk->flags & GENHD_FL_REMOVABLE))
+ set_capacity(disk, 0);
+ } else {
+ if (disk->fops->revalidate_disk)
+ disk->fops->revalidate_disk(disk);
+ }
check_disk_size_change(disk, bdev, !invalidate);
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 9a0ff3384381..e738f6206ea5 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -11,7 +11,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \
reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \
uuid-tree.o props.o free-space-tree.o tree-checker.o space-info.o \
- block-rsv.o delalloc-space.o block-group.o discard.o
+ block-rsv.o delalloc-space.o block-group.o discard.o reflink.o
btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 1d32a07bb2d1..309516e6a968 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -395,3 +395,11 @@ void btrfs_set_work_high_priority(struct btrfs_work *work)
{
set_bit(WORK_HIGH_PRIO_BIT, &work->flags);
}
+
+void btrfs_flush_workqueue(struct btrfs_workqueue *wq)
+{
+ if (wq->high)
+ flush_workqueue(wq->high->normal_wq);
+
+ flush_workqueue(wq->normal->normal_wq);
+}
diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h
index a4434301d84d..3204daa51b95 100644
--- a/fs/btrfs/async-thread.h
+++ b/fs/btrfs/async-thread.h
@@ -44,5 +44,6 @@ void btrfs_set_work_high_priority(struct btrfs_work *work);
struct btrfs_fs_info * __pure btrfs_work_owner(const struct btrfs_work *work);
struct btrfs_fs_info * __pure btrfs_workqueue_owner(const struct __btrfs_workqueue *wq);
bool btrfs_workqueue_normal_congested(const struct btrfs_workqueue *wq);
+void btrfs_flush_workqueue(struct btrfs_workqueue *wq);
#endif
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index e5d85311d5d5..9c380e7edf62 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -347,33 +347,10 @@ static int add_prelim_ref(const struct btrfs_fs_info *fs_info,
return -ENOMEM;
ref->root_id = root_id;
- if (key) {
+ if (key)
ref->key_for_search = *key;
- /*
- * We can often find data backrefs with an offset that is too
- * large (>= LLONG_MAX, maximum allowed file offset) due to
- * underflows when subtracting a file's offset with the data
- * offset of its corresponding extent data item. This can
- * happen for example in the clone ioctl.
- * So if we detect such case we set the search key's offset to
- * zero to make sure we will find the matching file extent item
- * at add_all_parents(), otherwise we will miss it because the
- * offset taken form the backref is much larger then the offset
- * of the file extent item. This can make us scan a very large
- * number of file extent items, but at least it will not make
- * us miss any.
- * This is an ugly workaround for a behaviour that should have
- * never existed, but it does and a fix for the clone ioctl
- * would touch a lot of places, cause backwards incompatibility
- * and would not fix the problem for extents cloned with older
- * kernels.
- */
- if (ref->key_for_search.type == BTRFS_EXTENT_DATA_KEY &&
- ref->key_for_search.offset >= LLONG_MAX)
- ref->key_for_search.offset = 0;
- } else {
+ else
memset(&ref->key_for_search, 0, sizeof(ref->key_for_search));
- }
ref->inode_list = NULL;
ref->level = level;
@@ -409,10 +386,36 @@ static int add_indirect_ref(const struct btrfs_fs_info *fs_info,
wanted_disk_byte, count, sc, gfp_mask);
}
+static int is_shared_data_backref(struct preftrees *preftrees, u64 bytenr)
+{
+ struct rb_node **p = &preftrees->direct.root.rb_root.rb_node;
+ struct rb_node *parent = NULL;
+ struct prelim_ref *ref = NULL;
+ struct prelim_ref target = {0};
+ int result;
+
+ target.parent = bytenr;
+
+ while (*p) {
+ parent = *p;
+ ref = rb_entry(parent, struct prelim_ref, rbnode);
+ result = prelim_ref_compare(ref, &target);
+
+ if (result < 0)
+ p = &(*p)->rb_left;
+ else if (result > 0)
+ p = &(*p)->rb_right;
+ else
+ return 1;
+ }
+ return 0;
+}
+
static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
- struct ulist *parents, struct prelim_ref *ref,
+ struct ulist *parents,
+ struct preftrees *preftrees, struct prelim_ref *ref,
int level, u64 time_seq, const u64 *extent_item_pos,
- u64 total_refs, bool ignore_offset)
+ bool ignore_offset)
{
int ret = 0;
int slot;
@@ -424,6 +427,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
u64 disk_byte;
u64 wanted_disk_byte = ref->wanted_disk_byte;
u64 count = 0;
+ u64 data_offset;
if (level != 0) {
eb = path->nodes[level];
@@ -434,18 +438,26 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
}
/*
- * We normally enter this function with the path already pointing to
- * the first item to check. But sometimes, we may enter it with
- * slot==nritems. In that case, go to the next leaf before we continue.
+ * 1. We normally enter this function with the path already pointing to
+ * the first item to check. But sometimes, we may enter it with
+ * slot == nritems.
+ * 2. We are searching for normal backref but bytenr of this leaf
+ * matches shared data backref
+ * 3. The leaf owner is not equal to the root we are searching
+ *
+ * For these cases, go to the next leaf before we continue.
*/
- if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
+ eb = path->nodes[0];
+ if (path->slots[0] >= btrfs_header_nritems(eb) ||
+ is_shared_data_backref(preftrees, eb->start) ||
+ ref->root_id != btrfs_header_owner(eb)) {
if (time_seq == SEQ_LAST)
ret = btrfs_next_leaf(root, path);
else
ret = btrfs_next_old_leaf(root, path, time_seq);
}
- while (!ret && count < total_refs) {
+ while (!ret && count < ref->count) {
eb = path->nodes[0];
slot = path->slots[0];
@@ -455,13 +467,31 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
key.type != BTRFS_EXTENT_DATA_KEY)
break;
+ /*
+ * We are searching for normal backref but bytenr of this leaf
+ * matches shared data backref, OR
+ * the leaf owner is not equal to the root we are searching for
+ */
+ if (slot == 0 &&
+ (is_shared_data_backref(preftrees, eb->start) ||
+ ref->root_id != btrfs_header_owner(eb))) {
+ if (time_seq == SEQ_LAST)
+ ret = btrfs_next_leaf(root, path);
+ else
+ ret = btrfs_next_old_leaf(root, path, time_seq);
+ continue;
+ }
fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
disk_byte = btrfs_file_extent_disk_bytenr(eb, fi);
+ data_offset = btrfs_file_extent_offset(eb, fi);
if (disk_byte == wanted_disk_byte) {
eie = NULL;
old = NULL;
- count++;
+ if (ref->key_for_search.offset == key.offset - data_offset)
+ count++;
+ else
+ goto next;
if (extent_item_pos) {
ret = check_extent_in_eb(&key, eb, fi,
*extent_item_pos,
@@ -502,9 +532,9 @@ next:
*/
static int resolve_indirect_ref(struct btrfs_fs_info *fs_info,
struct btrfs_path *path, u64 time_seq,
+ struct preftrees *preftrees,
struct prelim_ref *ref, struct ulist *parents,
- const u64 *extent_item_pos, u64 total_refs,
- bool ignore_offset)
+ const u64 *extent_item_pos, bool ignore_offset)
{
struct btrfs_root *root;
struct btrfs_key root_key;
@@ -512,23 +542,25 @@ static int resolve_indirect_ref(struct btrfs_fs_info *fs_info,
int ret = 0;
int root_level;
int level = ref->level;
- int index;
+ struct btrfs_key search_key = ref->key_for_search;
root_key.objectid = ref->root_id;
root_key.type = BTRFS_ROOT_ITEM_KEY;
root_key.offset = (u64)-1;
- index = srcu_read_lock(&fs_info->subvol_srcu);
-
root = btrfs_get_fs_root(fs_info, &root_key, false);
if (IS_ERR(root)) {
- srcu_read_unlock(&fs_info->subvol_srcu, index);
ret = PTR_ERR(root);
+ goto out_free;
+ }
+
+ if (!path->search_commit_root &&
+ test_bit(BTRFS_ROOT_DELETING, &root->state)) {
+ ret = -ENOENT;
goto out;
}
if (btrfs_is_testing(fs_info)) {
- srcu_read_unlock(&fs_info->subvol_srcu, index);
ret = -ENOENT;
goto out;
}
@@ -540,21 +572,36 @@ static int resolve_indirect_ref(struct btrfs_fs_info *fs_info,
else
root_level = btrfs_old_root_level(root, time_seq);
- if (root_level + 1 == level) {
- srcu_read_unlock(&fs_info->subvol_srcu, index);
+ if (root_level + 1 == level)
goto out;
- }
+ /*
+ * We can often find data backrefs with an offset that is too large
+ * (>= LLONG_MAX, maximum allowed file offset) due to underflows when
+ * subtracting a file's offset with the data offset of its
+ * corresponding extent data item. This can happen for example in the
+ * clone ioctl.
+ *
+ * So if we detect such case we set the search key's offset to zero to
+ * make sure we will find the matching file extent item at
+ * add_all_parents(), otherwise we will miss it because the offset
+ * taken from the backref is much larger than the offset of the file
+ * extent item. This can make us scan a very large number of file
+ * extent items, but at least it will not make us miss any.
+ *
+ * This is an ugly workaround for a behaviour that should have never
+ * existed, but it does and a fix for the clone ioctl would touch a lot
+ * of places, cause backwards incompatibility and would not fix the
+ * problem for extents cloned with older kernels.
+ */
+ if (search_key.type == BTRFS_EXTENT_DATA_KEY &&
+ search_key.offset >= LLONG_MAX)
+ search_key.offset = 0;
path->lowest_level = level;
if (time_seq == SEQ_LAST)
- ret = btrfs_search_slot(NULL, root, &ref->key_for_search, path,
- 0, 0);
+ ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
else
- ret = btrfs_search_old_slot(root, &ref->key_for_search, path,
- time_seq);
-
- /* root node has been locked, we can release @subvol_srcu safely here */
- srcu_read_unlock(&fs_info->subvol_srcu, index);
+ ret = btrfs_search_old_slot(root, &search_key, path, time_seq);
btrfs_debug(fs_info,
"search slot in root %llu (level %d, ref count %d) returned %d for key (%llu %u %llu)",
@@ -574,9 +621,11 @@ static int resolve_indirect_ref(struct btrfs_fs_info *fs_info,
eb = path->nodes[level];
}
- ret = add_all_parents(root, path, parents, ref, level, time_seq,
- extent_item_pos, total_refs, ignore_offset);
+ ret = add_all_parents(root, path, parents, preftrees, ref, level,
+ time_seq, extent_item_pos, ignore_offset);
out:
+ btrfs_put_root(root);
+out_free:
path->lowest_level = 0;
btrfs_release_path(path);
return ret;
@@ -609,7 +658,7 @@ unode_aux_to_inode_list(struct ulist_node *node)
static int resolve_indirect_refs(struct btrfs_fs_info *fs_info,
struct btrfs_path *path, u64 time_seq,
struct preftrees *preftrees,
- const u64 *extent_item_pos, u64 total_refs,
+ const u64 *extent_item_pos,
struct share_check *sc, bool ignore_offset)
{
int err;
@@ -653,9 +702,9 @@ static int resolve_indirect_refs(struct btrfs_fs_info *fs_info,
ret = BACKREF_FOUND_SHARED;
goto out;
}
- err = resolve_indirect_ref(fs_info, path, time_seq, ref,
- parents, extent_item_pos,
- total_refs, ignore_offset);
+ err = resolve_indirect_ref(fs_info, path, time_seq, preftrees,
+ ref, parents, extent_item_pos,
+ ignore_offset);
/*
* we can only tolerate ENOENT,otherwise,we should catch error
* and return directly.
@@ -758,8 +807,7 @@ static int add_missing_keys(struct btrfs_fs_info *fs_info,
*/
static int add_delayed_refs(const struct btrfs_fs_info *fs_info,
struct btrfs_delayed_ref_head *head, u64 seq,
- struct preftrees *preftrees, u64 *total_refs,
- struct share_check *sc)
+ struct preftrees *preftrees, struct share_check *sc)
{
struct btrfs_delayed_ref_node *node;
struct btrfs_delayed_extent_op *extent_op = head->extent_op;
@@ -793,7 +841,6 @@ static int add_delayed_refs(const struct btrfs_fs_info *fs_info,
default:
BUG();
}
- *total_refs += count;
switch (node->type) {
case BTRFS_TREE_BLOCK_REF_KEY: {
/* NORMAL INDIRECT METADATA backref */
@@ -876,7 +923,7 @@ out:
static int add_inline_refs(const struct btrfs_fs_info *fs_info,
struct btrfs_path *path, u64 bytenr,
int *info_level, struct preftrees *preftrees,
- u64 *total_refs, struct share_check *sc)
+ struct share_check *sc)
{
int ret = 0;
int slot;
@@ -900,7 +947,6 @@ static int add_inline_refs(const struct btrfs_fs_info *fs_info,
ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item);
flags = btrfs_extent_flags(leaf, ei);
- *total_refs += btrfs_extent_refs(leaf, ei);
btrfs_item_key_to_cpu(leaf, &found_key, slot);
ptr = (unsigned long)(ei + 1);
@@ -1125,8 +1171,6 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans,
struct prelim_ref *ref;
struct rb_node *node;
struct extent_inode_elem *eie = NULL;
- /* total of both direct AND indirect refs! */
- u64 total_refs = 0;
struct preftrees preftrees = {
.direct = PREFTREE_INIT,
.indirect = PREFTREE_INIT,
@@ -1195,7 +1239,7 @@ again:
}
spin_unlock(&delayed_refs->lock);
ret = add_delayed_refs(fs_info, head, time_seq,
- &preftrees, &total_refs, sc);
+ &preftrees, sc);
mutex_unlock(&head->mutex);
if (ret)
goto out;
@@ -1216,8 +1260,7 @@ again:
(key.type == BTRFS_EXTENT_ITEM_KEY ||
key.type == BTRFS_METADATA_ITEM_KEY)) {
ret = add_inline_refs(fs_info, path, bytenr,
- &info_level, &preftrees,
- &total_refs, sc);
+ &info_level, &preftrees, sc);
if (ret)
goto out;
ret = add_keyed_refs(fs_info, path, bytenr, info_level,
@@ -1236,7 +1279,7 @@ again:
WARN_ON(!RB_EMPTY_ROOT(&preftrees.indirect_missing_keys.root.rb_root));
ret = resolve_indirect_refs(fs_info, path, time_seq, &preftrees,
- extent_item_pos, total_refs, sc, ignore_offset);
+ extent_item_pos, sc, ignore_offset);
if (ret)
goto out;
@@ -1362,10 +1405,10 @@ static void free_leaf_list(struct ulist *blocks)
*
* returns 0 on success, <0 on error
*/
-static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info, u64 bytenr,
- u64 time_seq, struct ulist **leafs,
- const u64 *extent_item_pos, bool ignore_offset)
+int btrfs_find_all_leafs(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info, u64 bytenr,
+ u64 time_seq, struct ulist **leafs,
+ const u64 *extent_item_pos, bool ignore_offset)
{
int ret;
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
index 777f61dc081e..723d6da99114 100644
--- a/fs/btrfs/backref.h
+++ b/fs/btrfs/backref.h
@@ -40,6 +40,10 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
int paths_from_inode(u64 inum, struct inode_fs_paths *ipath);
+int btrfs_find_all_leafs(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info, u64 bytenr,
+ u64 time_seq, struct ulist **leafs,
+ const u64 *extent_item_pos, bool ignore_offset);
int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info, u64 bytenr,
u64 time_seq, struct ulist **roots, bool ignore_offset);
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index 404e050ce8ee..786849fcc319 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -460,7 +460,7 @@ u64 add_new_free_space(struct btrfs_block_group *block_group, u64 start, u64 end
int ret;
while (start < end) {
- ret = find_first_extent_bit(info->pinned_extents, start,
+ ret = find_first_extent_bit(&info->excluded_extents, start,
&extent_start, &extent_end,
EXTENT_DIRTY | EXTENT_UPTODATE,
NULL);
@@ -856,9 +856,9 @@ static void clear_incompat_bg_bits(struct btrfs_fs_info *fs_info, u64 flags)
found_raid1c34 = true;
up_read(&sinfo->groups_sem);
}
- if (found_raid56)
+ if (!found_raid56)
btrfs_clear_fs_incompat(fs_info, RAID56);
- if (found_raid1c34)
+ if (!found_raid1c34)
btrfs_clear_fs_incompat(fs_info, RAID1C34);
}
}
@@ -1248,6 +1248,55 @@ out:
return ret;
}
+/*
+ * Clear the range covered by block group @bg from the pinned_extents tree of
+ * the current transaction (@trans->transaction) and, when the current
+ * transaction is not the first entry of fs_info->trans_list, from the
+ * pinned_extents tree of the transaction that precedes it in that list.
+ *
+ * Returns true if every clear_extent_bits() call succeeded. On failure
+ * btrfs_dec_block_group_ro() is called on @bg and false is returned.
+ */
+static bool clean_pinned_extents(struct btrfs_trans_handle *trans,
+				 struct btrfs_block_group *bg)
+{
+	struct btrfs_fs_info *fs_info = bg->fs_info;
+	struct btrfs_transaction *prev_trans = NULL;
+	const u64 start = bg->start;
+	const u64 end = start + bg->length - 1;
+	int ret;
+
+	spin_lock(&fs_info->trans_lock);
+	if (trans->transaction->list.prev != &fs_info->trans_list) {
+		prev_trans = list_last_entry(&trans->transaction->list,
+					     struct btrfs_transaction, list);
+		/* Take a reference on prev_trans before trans_lock is dropped. */
+		refcount_inc(&prev_trans->use_count);
+	}
+	spin_unlock(&fs_info->trans_lock);
+
+	/*
+	 * Hold the unused_bg_unpin_mutex lock to avoid racing with
+	 * btrfs_finish_extent_commit(). If we are at transaction N, another
+	 * task might be running finish_extent_commit() for the previous
+	 * transaction N - 1, and have seen a range belonging to the block
+	 * group in pinned_extents before we were able to clear the whole block
+	 * group range from pinned_extents. This means that task can look up
+	 * the block group after we unpinned it from pinned_extents and removed
+	 * it, leading to a BUG_ON() at unpin_extent_range().
+	 */
+	mutex_lock(&fs_info->unused_bg_unpin_mutex);
+	if (prev_trans) {
+		ret = clear_extent_bits(&prev_trans->pinned_extents, start, end,
+					EXTENT_DIRTY);
+		if (ret)
+			goto out;
+	}
+
+	ret = clear_extent_bits(&trans->transaction->pinned_extents, start, end,
+				EXTENT_DIRTY);
+out:
+	mutex_unlock(&fs_info->unused_bg_unpin_mutex);
+	/*
+	 * Release the use_count reference taken under trans_lock above. This
+	 * must happen on both the success and the failure path, otherwise the
+	 * previous transaction's reference count never drops back.
+	 */
+	if (prev_trans)
+		btrfs_put_transaction(prev_trans);
+	if (ret)
+		btrfs_dec_block_group_ro(bg);
+
+	return ret == 0;
+}
+
/*
* Process the unused_bgs list and remove any that don't have any allocated
* space inside of them.
@@ -1265,7 +1314,6 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
spin_lock(&fs_info->unused_bgs_lock);
while (!list_empty(&fs_info->unused_bgs)) {
- u64 start, end;
int trimming;
block_group = list_first_entry(&fs_info->unused_bgs,
@@ -1344,35 +1392,8 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
* We could have pending pinned extents for this block group,
* just delete them, we don't care about them anymore.
*/
- start = block_group->start;
- end = start + block_group->length - 1;
- /*
- * Hold the unused_bg_unpin_mutex lock to avoid racing with
- * btrfs_finish_extent_commit(). If we are at transaction N,
- * another task might be running finish_extent_commit() for the
- * previous transaction N - 1, and have seen a range belonging
- * to the block group in freed_extents[] before we were able to
- * clear the whole block group range from freed_extents[]. This
- * means that task can lookup for the block group after we
- * unpinned it from freed_extents[] and removed it, leading to
- * a BUG_ON() at btrfs_unpin_extent_range().
- */
- mutex_lock(&fs_info->unused_bg_unpin_mutex);
- ret = clear_extent_bits(&fs_info->freed_extents[0], start, end,
- EXTENT_DIRTY);
- if (ret) {
- mutex_unlock(&fs_info->unused_bg_unpin_mutex);
- btrfs_dec_block_group_ro(block_group);
+ if (!clean_pinned_extents(trans, block_group))
goto end_trans;
- }
- ret = clear_extent_bits(&fs_info->freed_extents[1], start, end,
- EXTENT_DIRTY);
- if (ret) {
- mutex_unlock(&fs_info->unused_bg_unpin_mutex);
- btrfs_dec_block_group_ro(block_group);
- goto end_trans;
- }
- mutex_unlock(&fs_info->unused_bg_unpin_mutex);
/*
* At this point, the block_group is read only and should fail
@@ -1987,6 +2008,7 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
btrfs_release_path(path);
}
+ rcu_read_lock();
list_for_each_entry_rcu(space_info, &info->space_info, list) {
if (!(btrfs_get_alloc_profile(info, space_info->flags) &
(BTRFS_BLOCK_GROUP_RAID10 |
@@ -2007,6 +2029,7 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
list)
inc_block_group_ro(cache, 1);
}
+ rcu_read_unlock();
btrfs_init_global_block_rsv(info);
ret = check_chunk_block_group_mappings(info);
@@ -2345,7 +2368,7 @@ static int cache_save_setup(struct btrfs_block_group *block_group,
return 0;
}
- if (trans->aborted)
+ if (TRANS_ABORTED(trans))
return 0;
again:
inode = lookup_free_space_inode(block_group, path);
@@ -2881,7 +2904,7 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans,
&cache->space_info->total_bytes_pinned,
num_bytes,
BTRFS_TOTAL_BYTES_PINNED_BATCH);
- set_extent_dirty(info->pinned_extents,
+ set_extent_dirty(&trans->transaction->pinned_extents,
bytenr, bytenr + num_bytes - 1,
GFP_NOFS | __GFP_NOFAIL);
}
diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c
index d07bd41a7c1e..27efec8f7c5b 100644
--- a/fs/btrfs/block-rsv.c
+++ b/fs/btrfs/block-rsv.c
@@ -6,6 +6,98 @@
#include "space-info.h"
#include "transaction.h"
+/*
+ * HOW DO BLOCK RESERVES WORK
+ *
+ * Think of block_rsv's as buckets for logically grouped metadata
+ * reservations. Each block_rsv has a ->size and a ->reserved. ->size is
+ * how large we want our block rsv to be, ->reserved is how much space is
+ * currently reserved for this block reserve.
+ *
+ * ->failfast exists for the truncate case, and is described below.
+ *
+ * NORMAL OPERATION
+ *
+ * -> Reserve
+ * Entrance: btrfs_block_rsv_add, btrfs_block_rsv_refill
+ *
+ * We call into btrfs_reserve_metadata_bytes() with our bytes, which is
+ * accounted for in space_info->bytes_may_use, and then add the bytes to
+ * ->reserved, and ->size in the case of btrfs_block_rsv_add.
+ *
+ * ->size is an over-estimation of how much we may use for a particular
+ * operation.
+ *
+ * -> Use
+ * Entrance: btrfs_use_block_rsv
+ *
+ * When we do a btrfs_alloc_tree_block() we call into btrfs_use_block_rsv()
+ * to determine the appropriate block_rsv to use, and then verify that
+ * ->reserved has enough space for our tree block allocation. Once
+ * successful we subtract fs_info->nodesize from ->reserved.
+ *
+ * -> Finish
+ * Entrance: btrfs_block_rsv_release
+ *
+ * We are finished with our operation, subtract our individual reservation
+ * from ->size, and then subtract ->size from ->reserved and free up the
+ * excess if there is any.
+ *
+ * There is some logic here to refill the delayed refs rsv or the global rsv
+ * as needed, otherwise the excess is subtracted from
+ * space_info->bytes_may_use.
+ *
+ * TYPES OF BLOCK RESERVES
+ *
+ * BLOCK_RSV_TRANS, BLOCK_RSV_DELOPS, BLOCK_RSV_CHUNK
+ * These behave normally, as described above, just within the confines of the
+ * lifetime of their particular operation (transaction for the whole trans
+ * handle lifetime, for example).
+ *
+ * BLOCK_RSV_GLOBAL
+ * It is impossible to properly account for all the space that may be required
+ * to make our extent tree updates. This block reserve acts as an overflow
+ * buffer in case our delayed refs reserve does not reserve enough space to
+ * update the extent tree.
+ *
+ * We can steal from this in some cases as well, notably on evict() or
+ * truncate() in order to help users recover from ENOSPC conditions.
+ *
+ * BLOCK_RSV_DELALLOC
+ * The individual item sizes are determined by the per-inode size
+ * calculations, which are described with the delalloc code. This is pretty
+ * straightforward, it's just that the calculation of ->size encodes a lot of
+ * different items, and thus it gets used when updating inodes, inserting file
+ * extents, and inserting checksums.
+ *
+ * BLOCK_RSV_DELREFS
+ * We keep a running tally of how many delayed refs we have on the system.
+ * We assume each one of these delayed refs is going to use a full
+ * reservation. We use the transaction items and pre-reserve space for every
+ * operation, and use this reservation to refill any gap between ->size and
+ * ->reserved that may exist.
+ *
+ * From there it's straightforward, removing a delayed ref means we remove its
+ * count from ->size and free up reservations as necessary. Since this is
+ * the most dynamic block reserve in the system, we will try to refill this
+ * block reserve first with any excess returned by any other block reserve.
+ *
+ * BLOCK_RSV_EMPTY
+ * This is the fallback block reserve to make us try to reserve space if we
+ * don't have a specific bucket for this allocation. It is mostly used for
+ * updating the device tree and such, since that is a separate pool we're
+ * content to just reserve space from the space_info on demand.
+ *
+ * BLOCK_RSV_TEMP
+ * This is used by things like truncate and iput. We will temporarily
+ * allocate a block reserve, set it to some size, and then truncate bytes
+ * until we have no space left. With ->failfast set we'll simply return
+ * ENOSPC from btrfs_use_block_rsv() to signal that we need to unwind and try
+ * to make a new reservation. This is because these operations are
+ * unbounded, so we want to do as much work as we can, and then back off and
+ * re-reserve.
+ */
+
static u64 block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *block_rsv,
struct btrfs_block_rsv *dest, u64 num_bytes,
@@ -111,7 +203,7 @@ void btrfs_free_block_rsv(struct btrfs_fs_info *fs_info,
{
if (!rsv)
return;
- btrfs_block_rsv_release(fs_info, rsv, (u64)-1);
+ btrfs_block_rsv_release(fs_info, rsv, (u64)-1, NULL);
kfree(rsv);
}
@@ -178,9 +270,9 @@ int btrfs_block_rsv_refill(struct btrfs_root *root,
return ret;
}
-u64 __btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
- struct btrfs_block_rsv *block_rsv,
- u64 num_bytes, u64 *qgroup_to_release)
+u64 btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
+ struct btrfs_block_rsv *block_rsv, u64 num_bytes,
+ u64 *qgroup_to_release)
{
struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv;
@@ -297,9 +389,9 @@ void btrfs_update_global_block_rsv(struct btrfs_fs_info *fs_info)
if (block_rsv->reserved < block_rsv->size) {
num_bytes = block_rsv->size - block_rsv->reserved;
- block_rsv->reserved += num_bytes;
btrfs_space_info_update_bytes_may_use(fs_info, sinfo,
num_bytes);
+ block_rsv->reserved = block_rsv->size;
} else if (block_rsv->reserved > block_rsv->size) {
num_bytes = block_rsv->reserved - block_rsv->size;
btrfs_space_info_update_bytes_may_use(fs_info, sinfo,
@@ -344,7 +436,8 @@ void btrfs_init_global_block_rsv(struct btrfs_fs_info *fs_info)
void btrfs_release_global_block_rsv(struct btrfs_fs_info *fs_info)
{
- btrfs_block_rsv_release(fs_info, &fs_info->global_block_rsv, (u64)-1);
+ btrfs_block_rsv_release(fs_info, &fs_info->global_block_rsv, (u64)-1,
+ NULL);
WARN_ON(fs_info->trans_block_rsv.size > 0);
WARN_ON(fs_info->trans_block_rsv.reserved > 0);
WARN_ON(fs_info->chunk_block_rsv.size > 0);
diff --git a/fs/btrfs/block-rsv.h b/fs/btrfs/block-rsv.h
index d1428bb73fc5..0b6ae5302837 100644
--- a/fs/btrfs/block-rsv.h
+++ b/fs/btrfs/block-rsv.h
@@ -73,7 +73,7 @@ int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,
int min_factor);
void btrfs_block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
u64 num_bytes, bool update_size);
-u64 __btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
+u64 btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *block_rsv,
u64 num_bytes, u64 *qgroup_to_release);
void btrfs_update_global_block_rsv(struct btrfs_fs_info *fs_info);
@@ -82,20 +82,12 @@ void btrfs_release_global_block_rsv(struct btrfs_fs_info *fs_info);
struct btrfs_block_rsv *btrfs_use_block_rsv(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u32 blocksize);
-
-static inline void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
- struct btrfs_block_rsv *block_rsv,
- u64 num_bytes)
-{
- __btrfs_block_rsv_release(fs_info, block_rsv, num_bytes, NULL);
-}
-
static inline void btrfs_unuse_block_rsv(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *block_rsv,
u32 blocksize)
{
btrfs_block_rsv_add_bytes(block_rsv, blocksize, false);
- btrfs_block_rsv_release(fs_info, block_rsv, 0);
+ btrfs_block_rsv_release(fs_info, block_rsv, 0, NULL);
}
#endif /* BTRFS_BLOCK_RSV_H */
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 4e12a477d32e..27a1fefce508 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -60,6 +60,12 @@ struct btrfs_inode {
*/
struct extent_io_tree io_failure_tree;
+ /*
+ * Keep track of where the inode has extent items mapped in order to
+ * make sure the i_size adjustments are accurate
+ */
+ struct extent_io_tree file_extent_tree;
+
/* held while logging the inode in tree-log.c */
struct mutex log_mutex;
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index a0ce69f2d27c..32e11a23b47f 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -77,7 +77,6 @@
#include <linux/sched.h>
#include <linux/slab.h>
-#include <linux/buffer_head.h>
#include <linux/mutex.h>
#include <linux/genhd.h>
#include <linux/blkdev.h>
@@ -152,11 +151,8 @@ struct btrfsic_block {
struct list_head ref_to_list; /* list */
struct list_head ref_from_list; /* list */
struct btrfsic_block *next_in_same_bio;
- void *orig_bio_bh_private;
- union {
- bio_end_io_t *bio;
- bh_end_io_t *bh;
- } orig_bio_bh_end_io;
+ void *orig_bio_private;
+ bio_end_io_t *orig_bio_end_io;
int submit_bio_bh_rw;
u64 flush_gen; /* only valid if !never_written */
};
@@ -325,14 +321,12 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
u64 dev_bytenr, char **mapped_datav,
unsigned int num_pages,
struct bio *bio, int *bio_is_patched,
- struct buffer_head *bh,
int submit_bio_bh_rw);
static int btrfsic_process_written_superblock(
struct btrfsic_state *state,
struct btrfsic_block *const block,
struct btrfs_super_block *const super_hdr);
static void btrfsic_bio_end_io(struct bio *bp);
-static void btrfsic_bh_end_io(struct buffer_head *bh, int uptodate);
static int btrfsic_is_block_ref_by_superblock(const struct btrfsic_state *state,
const struct btrfsic_block *block,
int recursion_level);
@@ -399,8 +393,8 @@ static void btrfsic_block_init(struct btrfsic_block *b)
b->never_written = 0;
b->mirror_num = 0;
b->next_in_same_bio = NULL;
- b->orig_bio_bh_private = NULL;
- b->orig_bio_bh_end_io.bio = NULL;
+ b->orig_bio_private = NULL;
+ b->orig_bio_end_io = NULL;
INIT_LIST_HEAD(&b->collision_resolving_node);
INIT_LIST_HEAD(&b->all_blocks_node);
INIT_LIST_HEAD(&b->ref_to_list);
@@ -767,29 +761,31 @@ static int btrfsic_process_superblock_dev_mirror(
struct btrfs_fs_info *fs_info = state->fs_info;
struct btrfs_super_block *super_tmp;
u64 dev_bytenr;
- struct buffer_head *bh;
struct btrfsic_block *superblock_tmp;
int pass;
struct block_device *const superblock_bdev = device->bdev;
+ struct page *page;
+ struct address_space *mapping = superblock_bdev->bd_inode->i_mapping;
+ int ret = 0;
/* super block bytenr is always the unmapped device bytenr */
dev_bytenr = btrfs_sb_offset(superblock_mirror_num);
if (dev_bytenr + BTRFS_SUPER_INFO_SIZE > device->commit_total_bytes)
return -1;
- bh = __bread(superblock_bdev, dev_bytenr / BTRFS_BDEV_BLOCKSIZE,
- BTRFS_SUPER_INFO_SIZE);
- if (NULL == bh)
+
+ page = read_cache_page_gfp(mapping, dev_bytenr >> PAGE_SHIFT, GFP_NOFS);
+ if (IS_ERR(page))
return -1;
- super_tmp = (struct btrfs_super_block *)
- (bh->b_data + (dev_bytenr & (BTRFS_BDEV_BLOCKSIZE - 1)));
+
+ super_tmp = page_address(page);
if (btrfs_super_bytenr(super_tmp) != dev_bytenr ||
btrfs_super_magic(super_tmp) != BTRFS_MAGIC ||
memcmp(device->uuid, super_tmp->dev_item.uuid, BTRFS_UUID_SIZE) ||
btrfs_super_nodesize(super_tmp) != state->metablock_size ||
btrfs_super_sectorsize(super_tmp) != state->datablock_size) {
- brelse(bh);
- return 0;
+ ret = 0;
+ goto out;
}
superblock_tmp =
@@ -800,8 +796,8 @@ static int btrfsic_process_superblock_dev_mirror(
superblock_tmp = btrfsic_block_alloc();
if (NULL == superblock_tmp) {
pr_info("btrfsic: error, kmalloc failed!\n");
- brelse(bh);
- return -1;
+ ret = -1;
+ goto out;
}
/* for superblock, only the dev_bytenr makes sense */
superblock_tmp->dev_bytenr = dev_bytenr;
@@ -885,8 +881,8 @@ static int btrfsic_process_superblock_dev_mirror(
mirror_num)) {
pr_info("btrfsic: btrfsic_map_block(bytenr @%llu, mirror %d) failed!\n",
next_bytenr, mirror_num);
- brelse(bh);
- return -1;
+ ret = -1;
+ goto out;
}
next_block = btrfsic_block_lookup_or_add(
@@ -895,8 +891,8 @@ static int btrfsic_process_superblock_dev_mirror(
mirror_num, NULL);
if (NULL == next_block) {
btrfsic_release_block_ctx(&tmp_next_block_ctx);
- brelse(bh);
- return -1;
+ ret = -1;
+ goto out;
}
next_block->disk_key = tmp_disk_key;
@@ -907,16 +903,17 @@ static int btrfsic_process_superblock_dev_mirror(
BTRFSIC_GENERATION_UNKNOWN);
btrfsic_release_block_ctx(&tmp_next_block_ctx);
if (NULL == l) {
- brelse(bh);
- return -1;
+ ret = -1;
+ goto out;
}
}
}
if (state->print_mask & BTRFSIC_PRINT_MASK_INITIAL_ALL_TREES)
btrfsic_dump_tree_sub(state, superblock_tmp, 0);
- brelse(bh);
- return 0;
+out:
+ put_page(page);
+ return ret;
}
static struct btrfsic_stack_frame *btrfsic_stack_frame_alloc(void)
@@ -1743,7 +1740,6 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
u64 dev_bytenr, char **mapped_datav,
unsigned int num_pages,
struct bio *bio, int *bio_is_patched,
- struct buffer_head *bh,
int submit_bio_bh_rw)
{
int is_metadata;
@@ -1902,9 +1898,9 @@ again:
block->is_iodone = 0;
BUG_ON(NULL == bio_is_patched);
if (!*bio_is_patched) {
- block->orig_bio_bh_private =
+ block->orig_bio_private =
bio->bi_private;
- block->orig_bio_bh_end_io.bio =
+ block->orig_bio_end_io =
bio->bi_end_io;
block->next_in_same_bio = NULL;
bio->bi_private = block;
@@ -1916,25 +1912,17 @@ again:
bio->bi_private;
BUG_ON(NULL == chained_block);
- block->orig_bio_bh_private =
- chained_block->orig_bio_bh_private;
- block->orig_bio_bh_end_io.bio =
- chained_block->orig_bio_bh_end_io.
- bio;
+ block->orig_bio_private =
+ chained_block->orig_bio_private;
+ block->orig_bio_end_io =
+ chained_block->orig_bio_end_io;
block->next_in_same_bio = chained_block;
bio->bi_private = block;
}
- } else if (NULL != bh) {
- block->is_iodone = 0;
- block->orig_bio_bh_private = bh->b_private;
- block->orig_bio_bh_end_io.bh = bh->b_end_io;
- block->next_in_same_bio = NULL;
- bh->b_private = block;
- bh->b_end_io = btrfsic_bh_end_io;
} else {
block->is_iodone = 1;
- block->orig_bio_bh_private = NULL;
- block->orig_bio_bh_end_io.bio = NULL;
+ block->orig_bio_private = NULL;
+ block->orig_bio_end_io = NULL;
block->next_in_same_bio = NULL;
}
}
@@ -2042,8 +2030,8 @@ again:
block->is_iodone = 0;
BUG_ON(NULL == bio_is_patched);
if (!*bio_is_patched) {
- block->orig_bio_bh_private = bio->bi_private;
- block->orig_bio_bh_end_io.bio = bio->bi_end_io;
+ block->orig_bio_private = bio->bi_private;
+ block->orig_bio_end_io = bio->bi_end_io;
block->next_in_same_bio = NULL;
bio->bi_private = block;
bio->bi_end_io = btrfsic_bio_end_io;
@@ -2054,24 +2042,17 @@ again:
bio->bi_private;
BUG_ON(NULL == chained_block);
- block->orig_bio_bh_private =
- chained_block->orig_bio_bh_private;
- block->orig_bio_bh_end_io.bio =
- chained_block->orig_bio_bh_end_io.bio;
+ block->orig_bio_private =
+ chained_block->orig_bio_private;
+ block->orig_bio_end_io =
+ chained_block->orig_bio_end_io;
block->next_in_same_bio = chained_block;
bio->bi_private = block;
}
- } else if (NULL != bh) {
- block->is_iodone = 0;
- block->orig_bio_bh_private = bh->b_private;
- block->orig_bio_bh_end_io.bh = bh->b_end_io;
- block->next_in_same_bio = NULL;
- bh->b_private = block;
- bh->b_end_io = btrfsic_bh_end_io;
} else {
block->is_iodone = 1;
- block->orig_bio_bh_private = NULL;
- block->orig_bio_bh_end_io.bio = NULL;
+ block->orig_bio_private = NULL;
+ block->orig_bio_end_io = NULL;
block->next_in_same_bio = NULL;
}
if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
@@ -2112,8 +2093,8 @@ static void btrfsic_bio_end_io(struct bio *bp)
iodone_w_error = 1;
BUG_ON(NULL == block);
- bp->bi_private = block->orig_bio_bh_private;
- bp->bi_end_io = block->orig_bio_bh_end_io.bio;
+ bp->bi_private = block->orig_bio_private;
+ bp->bi_end_io = block->orig_bio_end_io;
do {
struct btrfsic_block *next_block;
@@ -2146,38 +2127,6 @@ static void btrfsic_bio_end_io(struct bio *bp)
bp->bi_end_io(bp);
}
-static void btrfsic_bh_end_io(struct buffer_head *bh, int uptodate)
-{
- struct btrfsic_block *block = (struct btrfsic_block *)bh->b_private;
- int iodone_w_error = !uptodate;
- struct btrfsic_dev_state *dev_state;
-
- BUG_ON(NULL == block);
- dev_state = block->dev_state;
- if ((dev_state->state->print_mask & BTRFSIC_PRINT_MASK_END_IO_BIO_BH))
- pr_info("bh_end_io(error=%d) for %c @%llu (%s/%llu/%d)\n",
- iodone_w_error,
- btrfsic_get_block_type(dev_state->state, block),
- block->logical_bytenr, block->dev_state->name,
- block->dev_bytenr, block->mirror_num);
-
- block->iodone_w_error = iodone_w_error;
- if (block->submit_bio_bh_rw & REQ_PREFLUSH) {
- dev_state->last_flush_gen++;
- if ((dev_state->state->print_mask &
- BTRFSIC_PRINT_MASK_END_IO_BIO_BH))
- pr_info("bh_end_io() new %s flush_gen=%llu\n",
- dev_state->name, dev_state->last_flush_gen);
- }
- if (block->submit_bio_bh_rw & REQ_FUA)
- block->flush_gen = 0; /* FUA completed means block is on disk */
-
- bh->b_private = block->orig_bio_bh_private;
- bh->b_end_io = block->orig_bio_bh_end_io.bh;
- block->is_iodone = 1; /* for FLUSH, this releases the block */
- bh->b_end_io(bh, uptodate);
-}
-
static int btrfsic_process_written_superblock(
struct btrfsic_state *state,
struct btrfsic_block *const superblock,
@@ -2730,63 +2679,6 @@ static struct btrfsic_dev_state *btrfsic_dev_state_lookup(dev_t dev)
&btrfsic_dev_state_hashtable);
}
-int btrfsic_submit_bh(int op, int op_flags, struct buffer_head *bh)
-{
- struct btrfsic_dev_state *dev_state;
-
- if (!btrfsic_is_initialized)
- return submit_bh(op, op_flags, bh);
-
- mutex_lock(&btrfsic_mutex);
- /* since btrfsic_submit_bh() might also be called before
- * btrfsic_mount(), this might return NULL */
- dev_state = btrfsic_dev_state_lookup(bh->b_bdev->bd_dev);
-
- /* Only called to write the superblock (incl. FLUSH/FUA) */
- if (NULL != dev_state &&
- (op == REQ_OP_WRITE) && bh->b_size > 0) {
- u64 dev_bytenr;
-
- dev_bytenr = BTRFS_BDEV_BLOCKSIZE * bh->b_blocknr;
- if (dev_state->state->print_mask &
- BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
- pr_info("submit_bh(op=0x%x,0x%x, blocknr=%llu (bytenr %llu), size=%zu, data=%p, bdev=%p)\n",
- op, op_flags, (unsigned long long)bh->b_blocknr,
- dev_bytenr, bh->b_size, bh->b_data, bh->b_bdev);
- btrfsic_process_written_block(dev_state, dev_bytenr,
- &bh->b_data, 1, NULL,
- NULL, bh, op_flags);
- } else if (NULL != dev_state && (op_flags & REQ_PREFLUSH)) {
- if (dev_state->state->print_mask &
- BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
- pr_info("submit_bh(op=0x%x,0x%x FLUSH, bdev=%p)\n",
- op, op_flags, bh->b_bdev);
- if (!dev_state->dummy_block_for_bio_bh_flush.is_iodone) {
- if ((dev_state->state->print_mask &
- (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH |
- BTRFSIC_PRINT_MASK_VERBOSE)))
- pr_info("btrfsic_submit_bh(%s) with FLUSH but dummy block already in use (ignored)!\n",
- dev_state->name);
- } else {
- struct btrfsic_block *const block =
- &dev_state->dummy_block_for_bio_bh_flush;
-
- block->is_iodone = 0;
- block->never_written = 0;
- block->iodone_w_error = 0;
- block->flush_gen = dev_state->last_flush_gen + 1;
- block->submit_bio_bh_rw = op_flags;
- block->orig_bio_bh_private = bh->b_private;
- block->orig_bio_bh_end_io.bh = bh->b_end_io;
- block->next_in_same_bio = NULL;
- bh->b_private = block;
- bh->b_end_io = btrfsic_bh_end_io;
- }
- }
- mutex_unlock(&btrfsic_mutex);
- return submit_bh(op, op_flags, bh);
-}
-
static void __btrfsic_submit_bio(struct bio *bio)
{
struct btrfsic_dev_state *dev_state;
@@ -2838,7 +2730,7 @@ static void __btrfsic_submit_bio(struct bio *bio)
btrfsic_process_written_block(dev_state, dev_bytenr,
mapped_datav, segs,
bio, &bio_is_patched,
- NULL, bio->bi_opf);
+ bio->bi_opf);
bio_for_each_segment(bvec, bio, iter)
kunmap(bvec.bv_page);
kfree(mapped_datav);
@@ -2862,8 +2754,8 @@ static void __btrfsic_submit_bio(struct bio *bio)
block->iodone_w_error = 0;
block->flush_gen = dev_state->last_flush_gen + 1;
block->submit_bio_bh_rw = bio->bi_opf;
- block->orig_bio_bh_private = bio->bi_private;
- block->orig_bio_bh_end_io.bio = bio->bi_end_io;
+ block->orig_bio_private = bio->bi_private;
+ block->orig_bio_end_io = bio->bi_end_io;
block->next_in_same_bio = NULL;
bio->bi_private = block;
bio->bi_end_io = btrfsic_bio_end_io;
diff --git a/fs/btrfs/check-integrity.h b/fs/btrfs/check-integrity.h
index 9bf4359cc44c..bcc730a06cb5 100644
--- a/fs/btrfs/check-integrity.h
+++ b/fs/btrfs/check-integrity.h
@@ -7,11 +7,9 @@
#define BTRFS_CHECK_INTEGRITY_H
#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
-int btrfsic_submit_bh(int op, int op_flags, struct buffer_head *bh);
void btrfsic_submit_bio(struct bio *bio);
int btrfsic_submit_bio_wait(struct bio *bio);
#else
-#define btrfsic_submit_bh submit_bh
#define btrfsic_submit_bio submit_bio
#define btrfsic_submit_bio_wait submit_bio_wait
#endif
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index f2ec1a9bae28..bfedbbe2311f 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -31,8 +31,8 @@ static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
static const struct btrfs_csums {
u16 size;
- const char *name;
- const char *driver;
+ const char name[10];
+ const char driver[12];
} btrfs_csums[] = {
[BTRFS_CSUM_TYPE_CRC32] = { .size = 4, .name = "crc32c" },
[BTRFS_CSUM_TYPE_XXHASH] = { .size = 8, .name = "xxhash64" },
@@ -63,7 +63,8 @@ const char *btrfs_super_csum_name(u16 csum_type)
const char *btrfs_super_csum_driver(u16 csum_type)
{
/* csum type is validated at mount time */
- return btrfs_csums[csum_type].driver ?:
+ return btrfs_csums[csum_type].driver[0] ?
+ btrfs_csums[csum_type].driver :
btrfs_csums[csum_type].name;
}
@@ -143,44 +144,6 @@ struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
return eb;
}
-/* loop around taking references on and locking the root node of the
- * tree until you end up with a lock on the root. A locked buffer
- * is returned, with a reference held.
- */
-struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
-{
- struct extent_buffer *eb;
-
- while (1) {
- eb = btrfs_root_node(root);
- btrfs_tree_lock(eb);
- if (eb == root->node)
- break;
- btrfs_tree_unlock(eb);
- free_extent_buffer(eb);
- }
- return eb;
-}
-
-/* loop around taking references on and locking the root node of the
- * tree until you end up with a lock on the root. A locked buffer
- * is returned, with a reference held.
- */
-struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root)
-{
- struct extent_buffer *eb;
-
- while (1) {
- eb = btrfs_root_node(root);
- btrfs_tree_read_lock(eb);
- if (eb == root->node)
- break;
- btrfs_tree_read_unlock(eb);
- free_extent_buffer(eb);
- }
- return eb;
-}
-
/* cowonly root (everything not a reference counted cow subvolume), just get
* put onto a simple dirty list. transaction.c walks this to make sure they
* get properly updated on disk.
@@ -341,7 +304,6 @@ void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
struct rb_root *tm_root;
struct rb_node *node;
struct rb_node *next;
- struct seq_list *cur_elem;
struct tree_mod_elem *tm;
u64 min_seq = (u64)-1;
u64 seq_putting = elem->seq;
@@ -353,18 +315,20 @@ void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
list_del(&elem->list);
elem->seq = 0;
- list_for_each_entry(cur_elem, &fs_info->tree_mod_seq_list, list) {
- if (cur_elem->seq < min_seq) {
- if (seq_putting > cur_elem->seq) {
- /*
- * blocker with lower sequence number exists, we
- * cannot remove anything from the log
- */
- write_unlock(&fs_info->tree_mod_log_lock);
- return;
- }
- min_seq = cur_elem->seq;
+ if (!list_empty(&fs_info->tree_mod_seq_list)) {
+ struct seq_list *first;
+
+ first = list_first_entry(&fs_info->tree_mod_seq_list,
+ struct seq_list, list);
+ if (seq_putting > first->seq) {
+ /*
+ * Blocker with lower sequence number exists, we
+ * cannot remove anything from the log.
+ */
+ write_unlock(&fs_info->tree_mod_log_lock);
+ return;
}
+ min_seq = first->seq;
}
/*
@@ -962,9 +926,7 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
if (new_flags != 0) {
int level = btrfs_header_level(buf);
- ret = btrfs_set_disk_extent_flags(trans,
- buf->start,
- buf->len,
+ ret = btrfs_set_disk_extent_flags(trans, buf,
new_flags, level, 0);
if (ret)
return ret;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 36df977b64d9..8aa7b9dac405 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -33,6 +33,7 @@
#include "extent_map.h"
#include "async-thread.h"
#include "block-rsv.h"
+#include "locking.h"
struct btrfs_trans_handle;
struct btrfs_transaction;
@@ -596,8 +597,8 @@ struct btrfs_fs_info {
/* keep track of unallocated space */
atomic64_t free_chunk_space;
- struct extent_io_tree freed_extents[2];
- struct extent_io_tree *pinned_extents;
+ /* Track ranges which are used by log trees blocks/logged data extents */
+ struct extent_io_tree excluded_extents;
/* logical->physical extent mapping */
struct extent_map_tree mapping_tree;
@@ -696,7 +697,6 @@ struct btrfs_fs_info {
struct rw_semaphore cleanup_work_sem;
struct rw_semaphore subvol_sem;
- struct srcu_struct subvol_srcu;
spinlock_t trans_lock;
/*
@@ -947,6 +947,10 @@ struct btrfs_fs_info {
#ifdef CONFIG_BTRFS_DEBUG
struct kobject *debug_kobj;
struct kobject *discard_debug_kobj;
+ struct list_head allocated_roots;
+
+ spinlock_t eb_leak_lock;
+ struct list_head allocated_ebs;
#endif
};
@@ -955,11 +959,6 @@ static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb)
return sb->s_fs_info;
}
-struct btrfs_subvolume_writers {
- struct percpu_counter counter;
- wait_queue_head_t wait;
-};
-
/*
* The state of btrfs root
*/
@@ -1131,8 +1130,9 @@ struct btrfs_root {
* root_item_lock.
*/
int dedupe_in_progress;
- struct btrfs_subvolume_writers *subv_writers;
- atomic_t will_be_snapshotted;
+ /* For exclusion of snapshot creation and nocow writes */
+ struct btrfs_drew_lock snapshot_lock;
+
atomic_t snapshot_force_cow;
/* For qgroup metadata reserved space */
@@ -1149,6 +1149,10 @@ struct btrfs_root {
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
u64 alloc_bytenr;
#endif
+
+#ifdef CONFIG_BTRFS_DEBUG
+ struct list_head leak_list;
+#endif
};
struct btrfs_clone_extent_info {
@@ -1971,16 +1975,6 @@ static inline void btrfs_set_header_backref_rev(struct extent_buffer *eb,
btrfs_set_header_flags(eb, flags);
}
-static inline unsigned long btrfs_header_fsid(void)
-{
- return offsetof(struct btrfs_header, fsid);
-}
-
-static inline unsigned long btrfs_header_chunk_tree_uuid(const struct extent_buffer *eb)
-{
- return offsetof(struct btrfs_header, chunk_tree_uuid);
-}
-
static inline int btrfs_is_leaf(const struct extent_buffer *eb)
{
return btrfs_header_level(eb) == 0;
@@ -2458,9 +2452,9 @@ int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len);
int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info, u64 bytenr,
u64 offset, int metadata, u64 *refs, u64 *flags);
-int btrfs_pin_extent(struct btrfs_fs_info *fs_info,
- u64 bytenr, u64 num, int reserved);
-int btrfs_pin_extent_for_log_replay(struct btrfs_fs_info *fs_info,
+int btrfs_pin_extent(struct btrfs_trans_handle *trans, u64 bytenr, u64 num,
+ int reserved);
+int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes);
int btrfs_exclude_logged_extents(struct extent_buffer *eb);
int btrfs_cross_ref_exist(struct btrfs_root *root,
@@ -2490,13 +2484,13 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
struct extent_buffer *buf, int full_backref);
int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
- u64 bytenr, u64 num_bytes, u64 flags,
+ struct extent_buffer *eb, u64 flags,
int level, int is_data);
int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref);
int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
u64 start, u64 len, int delalloc);
-int btrfs_pin_reserved_extent(struct btrfs_fs_info *fs_info, u64 start,
+int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans, u64 start,
u64 len);
void btrfs_prepare_extent_commit(struct btrfs_fs_info *fs_info);
int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans);
@@ -2665,9 +2659,8 @@ static inline int btrfs_next_item(struct btrfs_root *root, struct btrfs_path *p)
return btrfs_next_old_item(root, p, 0);
}
int btrfs_leaf_free_space(struct extent_buffer *leaf);
-int __must_check btrfs_drop_snapshot(struct btrfs_root *root,
- struct btrfs_block_rsv *block_rsv,
- int update_ref, int for_reloc);
+int __must_check btrfs_drop_snapshot(struct btrfs_root *root, int update_ref,
+ int for_reloc);
int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct extent_buffer *node,
@@ -2695,23 +2688,6 @@ static inline int btrfs_need_cleaner_sleep(struct btrfs_fs_info *fs_info)
return fs_info->sb->s_flags & SB_RDONLY || btrfs_fs_closing(fs_info);
}
-static inline void free_fs_info(struct btrfs_fs_info *fs_info)
-{
- kfree(fs_info->balance_ctl);
- kfree(fs_info->delayed_root);
- kfree(fs_info->extent_root);
- kfree(fs_info->tree_root);
- kfree(fs_info->chunk_root);
- kfree(fs_info->dev_root);
- kfree(fs_info->csum_root);
- kfree(fs_info->quota_root);
- kfree(fs_info->uuid_root);
- kfree(fs_info->free_space_root);
- kfree(fs_info->super_copy);
- kfree(fs_info->super_for_commit);
- kvfree(fs_info);
-}
-
/* tree mod log functions from ctree.c */
u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
struct seq_list *elem);
@@ -2750,9 +2726,7 @@ int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, u8 *uuid, u8 type,
u64 subid);
int btrfs_uuid_tree_remove(struct btrfs_trans_handle *trans, u8 *uuid, u8 type,
u64 subid);
-int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info,
- int (*check_func)(struct btrfs_fs_info *, u8 *, u8,
- u64));
+int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info);
/* dir-item.c */
int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir,
@@ -2859,6 +2833,12 @@ void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode,
struct btrfs_file_extent_item *fi,
const bool new_inline,
struct extent_map *em);
+int btrfs_inode_clear_file_extent_range(struct btrfs_inode *inode, u64 start,
+ u64 len);
+int btrfs_inode_set_file_extent_range(struct btrfs_inode *inode, u64 start,
+ u64 len);
+void btrfs_inode_safe_disk_i_size_write(struct inode *inode, u64 new_i_size);
+u64 btrfs_file_extent_end(const struct btrfs_path *path);
/* inode.c */
struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode,
@@ -2996,9 +2976,6 @@ int btrfs_dirty_pages(struct inode *inode, struct page **pages,
size_t num_pages, loff_t pos, size_t write_bytes,
struct extent_state **cached);
int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end);
-loff_t btrfs_remap_file_range(struct file *file_in, loff_t pos_in,
- struct file *file_out, loff_t pos_out,
- loff_t len, unsigned int remap_flags);
/* tree-defrag.c */
int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
@@ -3008,6 +2985,8 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
unsigned long new_flags);
int btrfs_sync_fs(struct super_block *sb, int wait);
+char *btrfs_get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info,
+ u64 subvol_objectid);
static inline __printf(2, 3) __cold
void btrfs_no_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...)
@@ -3401,6 +3380,7 @@ void btrfs_reloc_pre_snapshot(struct btrfs_pending_snapshot *pending,
u64 *bytes_to_reserve);
int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
struct btrfs_pending_snapshot *pending);
+int btrfs_should_cancel_balance(struct btrfs_fs_info *fs_info);
/* scrub.c */
int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c
index 4cdac4d834f5..1245739a3a6e 100644
--- a/fs/btrfs/delalloc-space.c
+++ b/fs/btrfs/delalloc-space.c
@@ -9,6 +9,108 @@
#include "qgroup.h"
#include "block-group.h"
+/*
+ * HOW DOES THIS WORK
+ *
+ * There are two stages to data reservations, one for data and one for metadata
+ * to handle the new extents and checksums generated by writing data.
+ *
+ *
+ * DATA RESERVATION
+ * The general flow of the data reservation is as follows
+ *
+ * -> Reserve
+ * We call into btrfs_reserve_data_bytes() for the user request bytes that
+ * they wish to write. We make this reservation and add it to
+ * space_info->bytes_may_use. We set EXTENT_DELALLOC on the inode io_tree
+ * for the range and carry on if this is buffered, or follow up trying to
+ * make a real allocation if we are pre-allocating or doing O_DIRECT.
+ *
+ * -> Use
+ * At writepages()/prealloc/O_DIRECT time we will call into
+ * btrfs_reserve_extent() for some part or all of this range of bytes. We
+ * will make the allocation and subtract space_info->bytes_may_use by the
+ * original requested length and increase the space_info->bytes_reserved by
+ * the allocated length. This distinction is important because compression
+ * may allocate a smaller on disk extent than we previously reserved.
+ *
+ * -> Allocation
+ * finish_ordered_io() will insert the new file extent item for this range,
+ * and then add a delayed ref update for the extent tree. Once that delayed
+ * ref is written the extent size is subtracted from
+ * space_info->bytes_reserved and added to space_info->bytes_used.
+ *
+ * Error handling
+ *
+ * -> By the reservation maker
+ * This is the simplest case, we haven't completed our operation and we know
+ * how much we reserved, we can simply call
+ * btrfs_free_reserved_data_space*() and it will be removed from
+ * space_info->bytes_may_use.
+ *
+ * -> After the reservation has been made, but before cow_file_range()
+ * This is specifically for the delalloc case. You must clear
+ * EXTENT_DELALLOC with the EXTENT_CLEAR_DATA_RESV bit, and the range will
+ * be subtracted from space_info->bytes_may_use.
+ *
+ * METADATA RESERVATION
+ * The general metadata reservation lifetimes are discussed elsewhere, this
+ * will just focus on how it is used for delalloc space.
+ *
+ * We keep track of two things on a per inode basis
+ *
+ * ->outstanding_extents
+ * This is the number of file extent items we'll need to handle all of the
+ * outstanding DELALLOC space we have in this inode. We limit the maximum
+ * size of an extent, so a large contiguous dirty area may require more than
+ * one outstanding_extent, which is why count_max_extents() is used to
+ * determine how many outstanding_extents get added.
+ *
+ * ->csum_bytes
+ * This is essentially how many dirty bytes we have for this inode, so we
+ * can calculate the number of checksum items we would have to add in order
+ * to checksum our outstanding data.
+ *
+ * We keep a per-inode block_rsv in order to make it easier to keep track of
+ * our reservation. We use btrfs_calculate_inode_block_rsv_size() to
+ * calculate the current theoretical maximum reservation we would need for the
+ * metadata for this inode. We call this and then adjust our reservation as
+ * necessary, either by attempting to reserve more space, or freeing up excess
+ * space.
+ *
+ * OUTSTANDING_EXTENTS HANDLING
+ *
+ * ->outstanding_extents is used for keeping track of how many extents we will
+ * need to use for this inode, and it will fluctuate depending on where you are
+ * in the life cycle of the dirty data. Consider the following normal case for
+ * a completely clean inode, with a num_bytes < our maximum allowed extent size
+ *
+ * -> reserve
+ * ->outstanding_extents += 1 (current value is 1)
+ *
+ * -> set_delalloc
+ * ->outstanding_extents += 1 (current value is 2)
+ *
+ * -> btrfs_delalloc_release_extents()
+ * ->outstanding_extents -= 1 (current value is 1)
+ *
+ * We must call this once we are done, as we hold our reservation for the
+ * duration of our operation, and then assume set_delalloc will update the
+ * counter appropriately.
+ *
+ * -> add ordered extent
+ * ->outstanding_extents += 1 (current value is 2)
+ *
+ * -> btrfs_clear_delalloc_extent
+ * ->outstanding_extents -= 1 (current value is 1)
+ *
+ * -> finish_ordered_io/btrfs_remove_ordered_extent
+ * ->outstanding_extents -= 1 (current value is 0)
+ *
+ * Each stage is responsible for their own accounting of the extent, thus
+ * making error handling and cleanup easier.
+ */
+
int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes)
{
struct btrfs_root *root = inode->root;
@@ -228,8 +330,8 @@ static void btrfs_inode_rsv_release(struct btrfs_inode *inode, bool qgroup_free)
* are releasing 0 bytes, and then we'll just get the reservation over
* the size free'd.
*/
- released = __btrfs_block_rsv_release(fs_info, block_rsv, 0,
- &qgroup_to_release);
+ released = btrfs_block_rsv_release(fs_info, block_rsv, 0,
+ &qgroup_to_release);
if (released > 0)
trace_btrfs_space_reservation(fs_info, "delalloc",
btrfs_ino(inode), released, 0);
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index d3e15e1d4a91..bf1595a42a98 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -6,6 +6,7 @@
#include <linux/slab.h>
#include <linux/iversion.h>
+#include <linux/sched/mm.h>
#include "misc.h"
#include "delayed-inode.h"
#include "disk-io.h"
@@ -595,8 +596,7 @@ static void btrfs_delayed_item_release_metadata(struct btrfs_root *root,
trace_btrfs_space_reservation(fs_info, "delayed_item",
item->key.objectid, item->bytes_reserved,
0);
- btrfs_block_rsv_release(fs_info, rsv,
- item->bytes_reserved);
+ btrfs_block_rsv_release(fs_info, rsv, item->bytes_reserved, NULL);
}
static int btrfs_delayed_inode_reserve_metadata(
@@ -677,8 +677,7 @@ static void btrfs_delayed_inode_release_metadata(struct btrfs_fs_info *fs_info,
rsv = &fs_info->delayed_block_rsv;
trace_btrfs_space_reservation(fs_info, "delayed_inode",
node->inode_id, node->bytes_reserved, 0);
- btrfs_block_rsv_release(fs_info, rsv,
- node->bytes_reserved);
+ btrfs_block_rsv_release(fs_info, rsv, node->bytes_reserved, NULL);
if (qgroup_free)
btrfs_qgroup_free_meta_prealloc(node->root,
node->bytes_reserved);
@@ -805,11 +804,14 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans,
struct btrfs_delayed_item *delayed_item)
{
struct extent_buffer *leaf;
+ unsigned int nofs_flag;
char *ptr;
int ret;
+ nofs_flag = memalloc_nofs_save();
ret = btrfs_insert_empty_item(trans, root, path, &delayed_item->key,
delayed_item->data_len);
+ memalloc_nofs_restore(nofs_flag);
if (ret < 0 && ret != -EEXIST)
return ret;
@@ -937,6 +939,7 @@ static int btrfs_delete_delayed_items(struct btrfs_trans_handle *trans,
struct btrfs_delayed_node *node)
{
struct btrfs_delayed_item *curr, *prev;
+ unsigned int nofs_flag;
int ret = 0;
do_again:
@@ -945,7 +948,9 @@ do_again:
if (!curr)
goto delete_fail;
+ nofs_flag = memalloc_nofs_save();
ret = btrfs_search_slot(trans, root, &curr->key, path, -1, 1);
+ memalloc_nofs_restore(nofs_flag);
if (ret < 0)
goto delete_fail;
else if (ret > 0) {
@@ -1012,6 +1017,7 @@ static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
struct btrfs_key key;
struct btrfs_inode_item *inode_item;
struct extent_buffer *leaf;
+ unsigned int nofs_flag;
int mod;
int ret;
@@ -1024,7 +1030,9 @@ static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
else
mod = 1;
+ nofs_flag = memalloc_nofs_save();
ret = btrfs_lookup_inode(trans, root, path, &key, mod);
+ memalloc_nofs_restore(nofs_flag);
if (ret > 0) {
btrfs_release_path(path);
return -ENOENT;
@@ -1075,7 +1083,10 @@ search:
key.type = BTRFS_INODE_EXTREF_KEY;
key.offset = -1;
+
+ nofs_flag = memalloc_nofs_save();
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+ memalloc_nofs_restore(nofs_flag);
if (ret < 0)
goto err_out;
ASSERT(ret);
@@ -1139,7 +1150,7 @@ static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans, int nr)
int ret = 0;
bool count = (nr > 0);
- if (trans->aborted)
+ if (TRANS_ABORTED(trans))
return -EIO;
path = btrfs_alloc_path();
@@ -1760,6 +1771,7 @@ static void fill_stack_inode_item(struct btrfs_trans_handle *trans,
int btrfs_fill_inode(struct inode *inode, u32 *rdev)
{
+ struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
struct btrfs_delayed_node *delayed_node;
struct btrfs_inode_item *inode_item;
@@ -1779,6 +1791,8 @@ int btrfs_fill_inode(struct inode *inode, u32 *rdev)
i_uid_write(inode, btrfs_stack_inode_uid(inode_item));
i_gid_write(inode, btrfs_stack_inode_gid(inode_item));
btrfs_i_size_write(BTRFS_I(inode), btrfs_stack_inode_size(inode_item));
+ btrfs_inode_set_file_extent_range(BTRFS_I(inode), 0,
+ round_up(i_size_read(inode), fs_info->sectorsize));
inode->i_mode = btrfs_stack_inode_mode(inode_item);
set_nlink(inode, btrfs_stack_inode_nlink(inode_item));
inode_set_bytes(inode, btrfs_stack_inode_nbytes(inode_item));
diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h
index 74ae226ffaf0..ca96ef007d8f 100644
--- a/fs/btrfs/delayed-inode.h
+++ b/fs/btrfs/delayed-inode.h
@@ -70,7 +70,7 @@ struct btrfs_delayed_item {
refcount_t refs;
int ins_or_del;
u32 data_len;
- char data[0];
+ char data[];
};
static inline void btrfs_init_delayed_root(
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index dfdb7d4f8406..353cc2994d10 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -82,8 +82,7 @@ void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr)
u64 num_bytes = btrfs_calc_insert_metadata_size(fs_info, nr);
u64 released = 0;
- released = __btrfs_block_rsv_release(fs_info, block_rsv, num_bytes,
- NULL);
+ released = btrfs_block_rsv_release(fs_info, block_rsv, num_bytes, NULL);
if (released)
trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv",
0, released, 0);
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 2ca2a09d0e23..db93909b25e0 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -22,6 +22,46 @@
#include "dev-replace.h"
#include "sysfs.h"
+/*
+ * Device replace overview
+ *
+ * [Objective]
+ * To copy all extents (both new and on-disk) from source device to target
+ * device, while still keeping the filesystem read-write.
+ *
+ * [Method]
+ * There are two main methods involved:
+ *
+ * - Write duplication
+ *
+ * All new writes will be written to both target and source devices, so even
+ * if replace gets canceled, source device still contains up-to-date data.
+ *
+ * Location: handle_ops_on_dev_replace() from __btrfs_map_block()
+ * Start: btrfs_dev_replace_start()
+ * End: btrfs_dev_replace_finishing()
+ * Content: Latest data/metadata
+ *
+ * - Copy existing extents
+ *
+ * This happens by re-using scrub facility, as scrub also iterates through
+ * existing extents from commit root.
+ *
+ * Location: scrub_write_block_to_dev_replace() from
+ * scrub_block_complete()
+ * Content: Data/meta from commit root.
+ *
+ * Due to the content difference, we need to avoid nocow write when dev-replace
+ * is happening. This is done by marking the block group read-only and waiting
+ * for NOCOW writes.
+ *
+ * After replace is done, the finishing part is done by swapping the target and
+ * source devices.
+ *
+ * Location: btrfs_dev_replace_update_device_in_mapping_tree() from
+ * btrfs_dev_replace_finishing()
+ */
+
static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
int scrub_ret);
static void btrfs_dev_replace_update_device_in_mapping_tree(
@@ -472,7 +512,7 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
atomic64_set(&dev_replace->num_uncorrectable_read_errors, 0);
up_write(&dev_replace->rwsem);
- ret = btrfs_sysfs_add_device_link(tgt_device->fs_devices, tgt_device);
+ ret = btrfs_sysfs_add_devices_dir(tgt_device->fs_devices, tgt_device);
if (ret)
btrfs_err(fs_info, "kobj add dev failed %d", ret);
@@ -703,7 +743,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
/* replace the sysfs entry */
- btrfs_sysfs_rm_device_link(fs_info->fs_devices, src_device);
+ btrfs_sysfs_remove_devices_dir(fs_info->fs_devices, src_device);
btrfs_sysfs_update_devid(tgt_device);
btrfs_rm_dev_replace_free_srcdev(src_device);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 7fa9bb79ad08..a6cb5cbbdb9f 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -7,7 +7,6 @@
#include <linux/blkdev.h>
#include <linux/radix-tree.h>
#include <linux/writeback.h>
-#include <linux/buffer_head.h>
#include <linux/workqueue.h>
#include <linux/kthread.h>
#include <linux/slab.h>
@@ -42,6 +41,7 @@
#include "ref-verify.h"
#include "block-group.h"
#include "discard.h"
+#include "space-info.h"
#define BTRFS_SUPER_FLAG_SUPP (BTRFS_HEADER_FLAG_WRITTEN |\
BTRFS_HEADER_FLAG_RELOC |\
@@ -98,6 +98,12 @@ void __cold btrfs_end_io_wq_exit(void)
kmem_cache_destroy(btrfs_end_io_wq_cache);
}
+static void btrfs_free_csum_hash(struct btrfs_fs_info *fs_info)
+{
+ if (fs_info->csum_shash)
+ crypto_free_shash(fs_info->csum_shash);
+}
+
/*
* async submit bios are used to offload expensive checksumming
* onto the worker threads. They checksum file and metadata bios
@@ -247,47 +253,27 @@ out:
/*
* Compute the csum of a btree block and store the result to provided buffer.
- *
- * Returns error if the extent buffer cannot be mapped.
*/
-static int csum_tree_block(struct extent_buffer *buf, u8 *result)
+static void csum_tree_block(struct extent_buffer *buf, u8 *result)
{
struct btrfs_fs_info *fs_info = buf->fs_info;
+ const int num_pages = fs_info->nodesize >> PAGE_SHIFT;
SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
- unsigned long len;
- unsigned long cur_len;
- unsigned long offset = BTRFS_CSUM_SIZE;
char *kaddr;
- unsigned long map_start;
- unsigned long map_len;
- int err;
+ int i;
shash->tfm = fs_info->csum_shash;
crypto_shash_init(shash);
+ kaddr = page_address(buf->pages[0]);
+ crypto_shash_update(shash, kaddr + BTRFS_CSUM_SIZE,
+ PAGE_SIZE - BTRFS_CSUM_SIZE);
- len = buf->len - offset;
-
- while (len > 0) {
- /*
- * Note: we don't need to check for the err == 1 case here, as
- * with the given combination of 'start = BTRFS_CSUM_SIZE (32)'
- * and 'min_len = 32' and the currently implemented mapping
- * algorithm we cannot cross a page boundary.
- */
- err = map_private_extent_buffer(buf, offset, 32,
- &kaddr, &map_start, &map_len);
- if (WARN_ON(err))
- return err;
- cur_len = min(len, map_len - (offset - map_start));
- crypto_shash_update(shash, kaddr + offset - map_start, cur_len);
- len -= cur_len;
- offset += cur_len;
+ for (i = 1; i < num_pages; i++) {
+ kaddr = page_address(buf->pages[i]);
+ crypto_shash_update(shash, kaddr, PAGE_SIZE);
}
memset(result, 0, BTRFS_CSUM_SIZE);
-
crypto_shash_final(shash, result);
-
- return 0;
}
/*
@@ -535,10 +521,10 @@ static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct page *page)
return -EUCLEAN;
ASSERT(memcmp_extent_buffer(eb, fs_info->fs_devices->metadata_uuid,
- btrfs_header_fsid(), BTRFS_FSID_SIZE) == 0);
+ offsetof(struct btrfs_header, fsid),
+ BTRFS_FSID_SIZE) == 0);
- if (csum_tree_block(eb, result))
- return -EINVAL;
+ csum_tree_block(eb, result);
if (btrfs_header_level(eb))
ret = btrfs_check_node(eb);
@@ -565,7 +551,8 @@ static int check_tree_block_fsid(struct extent_buffer *eb)
u8 fsid[BTRFS_FSID_SIZE];
int ret = 1;
- read_extent_buffer(eb, fsid, btrfs_header_fsid(), BTRFS_FSID_SIZE);
+ read_extent_buffer(eb, fsid, offsetof(struct btrfs_header, fsid),
+ BTRFS_FSID_SIZE);
while (fs_devices) {
u8 *metadata_uuid;
@@ -596,9 +583,8 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
u64 found_start;
int found_level;
struct extent_buffer *eb;
- struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
- struct btrfs_fs_info *fs_info = root->fs_info;
- u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
+ struct btrfs_fs_info *fs_info;
+ u16 csum_size;
int ret = 0;
u8 result[BTRFS_CSUM_SIZE];
int reads_done;
@@ -607,6 +593,8 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
goto out;
eb = (struct extent_buffer *)page->private;
+ fs_info = eb->fs_info;
+ csum_size = btrfs_super_csum_size(fs_info->super_copy);
/* the pending IO might have been the only thing that kept this buffer
* in memory. Make sure we have a ref for all this other checks
@@ -647,9 +635,7 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb),
eb, found_level);
- ret = csum_tree_block(eb, result);
- if (ret)
- goto err;
+ csum_tree_block(eb, result);
if (memcmp_extent_buffer(eb, result, 0, csum_size)) {
u32 val;
@@ -972,9 +958,7 @@ static int btree_writepages(struct address_space *mapping,
static int btree_readpage(struct file *file, struct page *page)
{
- struct extent_io_tree *tree;
- tree = &BTRFS_I(page->mapping->host)->io_tree;
- return extent_read_full_page(tree, page, btree_get_extent, 0);
+ return extent_read_full_page(page, btree_get_extent, 0);
}
static int btree_releasepage(struct page *page, gfp_t gfp_flags)
@@ -1100,36 +1084,11 @@ void btrfs_clean_tree_block(struct extent_buffer *buf)
}
}
-static struct btrfs_subvolume_writers *btrfs_alloc_subvolume_writers(void)
-{
- struct btrfs_subvolume_writers *writers;
- int ret;
-
- writers = kmalloc(sizeof(*writers), GFP_NOFS);
- if (!writers)
- return ERR_PTR(-ENOMEM);
-
- ret = percpu_counter_init(&writers->counter, 0, GFP_NOFS);
- if (ret < 0) {
- kfree(writers);
- return ERR_PTR(ret);
- }
-
- init_waitqueue_head(&writers->wait);
- return writers;
-}
-
-static void
-btrfs_free_subvolume_writers(struct btrfs_subvolume_writers *writers)
-{
- percpu_counter_destroy(&writers->counter);
- kfree(writers);
-}
-
static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
u64 objectid)
{
bool dummy = test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state);
+ root->fs_info = fs_info;
root->node = NULL;
root->commit_root = NULL;
root->state = 0;
@@ -1173,7 +1132,6 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
atomic_set(&root->log_writers, 0);
atomic_set(&root->log_batch, 0);
refcount_set(&root->refs, 1);
- atomic_set(&root->will_be_snapshotted, 0);
atomic_set(&root->snapshot_force_cow, 0);
atomic_set(&root->nr_swapfiles, 0);
root->log_transid = 0;
@@ -1195,14 +1153,20 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
spin_lock_init(&root->root_item_lock);
btrfs_qgroup_init_swapped_blocks(&root->swapped_blocks);
+#ifdef CONFIG_BTRFS_DEBUG
+ INIT_LIST_HEAD(&root->leak_list);
+ spin_lock(&fs_info->fs_roots_radix_lock);
+ list_add_tail(&root->leak_list, &fs_info->allocated_roots);
+ spin_unlock(&fs_info->fs_roots_radix_lock);
+#endif
}
static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info,
- gfp_t flags)
+ u64 objectid, gfp_t flags)
{
struct btrfs_root *root = kzalloc(sizeof(*root), flags);
if (root)
- root->fs_info = fs_info;
+ __setup_root(root, fs_info, objectid);
return root;
}
@@ -1215,12 +1179,11 @@ struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info)
if (!fs_info)
return ERR_PTR(-EINVAL);
- root = btrfs_alloc_root(fs_info, GFP_KERNEL);
+ root = btrfs_alloc_root(fs_info, BTRFS_ROOT_TREE_OBJECTID, GFP_KERNEL);
if (!root)
return ERR_PTR(-ENOMEM);
/* We don't use the stripesize in selftest, set it as sectorsize */
- __setup_root(root, fs_info, BTRFS_ROOT_TREE_OBJECTID);
root->alloc_bytenr = 0;
return root;
@@ -1237,19 +1200,17 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
struct btrfs_key key;
unsigned int nofs_flag;
int ret = 0;
- uuid_le uuid = NULL_UUID_LE;
/*
* We're holding a transaction handle, so use a NOFS memory allocation
* context to avoid deadlock if reclaim happens.
*/
nofs_flag = memalloc_nofs_save();
- root = btrfs_alloc_root(fs_info, GFP_KERNEL);
+ root = btrfs_alloc_root(fs_info, objectid, GFP_KERNEL);
memalloc_nofs_restore(nofs_flag);
if (!root)
return ERR_PTR(-ENOMEM);
- __setup_root(root, fs_info, objectid);
root->root_key.objectid = objectid;
root->root_key.type = BTRFS_ROOT_ITEM_KEY;
root->root_key.offset = 0;
@@ -1277,8 +1238,9 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
btrfs_set_root_last_snapshot(&root->root_item, 0);
btrfs_set_root_dirid(&root->root_item, 0);
if (is_fstree(objectid))
- uuid_le_gen(&uuid);
- memcpy(root->root_item.uuid, uuid.b, BTRFS_UUID_SIZE);
+ generate_random_guid(root->root_item.uuid);
+ else
+ export_guid(root->root_item.uuid, &guid_null);
root->root_item.drop_level = 0;
key.objectid = objectid;
@@ -1293,12 +1255,9 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
return root;
fail:
- if (leaf) {
+ if (leaf)
btrfs_tree_unlock(leaf);
- free_extent_buffer(root->commit_root);
- free_extent_buffer(leaf);
- }
- kfree(root);
+ btrfs_put_root(root);
return ERR_PTR(ret);
}
@@ -1309,12 +1268,10 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
struct btrfs_root *root;
struct extent_buffer *leaf;
- root = btrfs_alloc_root(fs_info, GFP_NOFS);
+ root = btrfs_alloc_root(fs_info, BTRFS_TREE_LOG_OBJECTID, GFP_NOFS);
if (!root)
return ERR_PTR(-ENOMEM);
- __setup_root(root, fs_info, BTRFS_TREE_LOG_OBJECTID);
-
root->root_key.objectid = BTRFS_TREE_LOG_OBJECTID;
root->root_key.type = BTRFS_ROOT_ITEM_KEY;
root->root_key.offset = BTRFS_TREE_LOG_OBJECTID;
@@ -1331,7 +1288,7 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
leaf = btrfs_alloc_tree_block(trans, root, 0, BTRFS_TREE_LOG_OBJECTID,
NULL, 0, 0, 0);
if (IS_ERR(leaf)) {
- kfree(root);
+ btrfs_put_root(root);
return ERR_CAST(leaf);
}
@@ -1387,8 +1344,8 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
return 0;
}
-static struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
- struct btrfs_key *key)
+struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
+ struct btrfs_key *key)
{
struct btrfs_root *root;
struct btrfs_fs_info *fs_info = tree_root->fs_info;
@@ -1401,14 +1358,12 @@ static struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
if (!path)
return ERR_PTR(-ENOMEM);
- root = btrfs_alloc_root(fs_info, GFP_NOFS);
+ root = btrfs_alloc_root(fs_info, key->objectid, GFP_NOFS);
if (!root) {
ret = -ENOMEM;
goto alloc_fail;
}
- __setup_root(root, fs_info, key->objectid);
-
ret = btrfs_find_root(tree_root, key, path,
&root->root_item, &root->root_key);
if (ret) {
@@ -1424,10 +1379,10 @@ static struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
generation, level, NULL);
if (IS_ERR(root->node)) {
ret = PTR_ERR(root->node);
+ root->node = NULL;
goto find_fail;
} else if (!btrfs_buffer_uptodate(root->node, generation, 0)) {
ret = -EIO;
- free_extent_buffer(root->node);
goto find_fail;
}
root->commit_root = btrfs_root_node(root);
@@ -1436,33 +1391,16 @@ out:
return root;
find_fail:
- kfree(root);
+ btrfs_put_root(root);
alloc_fail:
root = ERR_PTR(ret);
goto out;
}
-struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root,
- struct btrfs_key *location)
-{
- struct btrfs_root *root;
-
- root = btrfs_read_tree_root(tree_root, location);
- if (IS_ERR(root))
- return root;
-
- if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
- set_bit(BTRFS_ROOT_REF_COWS, &root->state);
- btrfs_check_and_init_root_item(&root->root_item);
- }
-
- return root;
-}
-
-int btrfs_init_fs_root(struct btrfs_root *root)
+static int btrfs_init_fs_root(struct btrfs_root *root)
{
int ret;
- struct btrfs_subvolume_writers *writers;
+ unsigned int nofs_flag;
root->free_ino_ctl = kzalloc(sizeof(*root->free_ino_ctl), GFP_NOFS);
root->free_ino_pinned = kzalloc(sizeof(*root->free_ino_pinned),
@@ -1472,12 +1410,20 @@ int btrfs_init_fs_root(struct btrfs_root *root)
goto fail;
}
- writers = btrfs_alloc_subvolume_writers();
- if (IS_ERR(writers)) {
- ret = PTR_ERR(writers);
+ /*
+ * We might be called under a transaction (e.g. indirect backref
+ * resolution) which could deadlock if it triggers memory reclaim
+ */
+ nofs_flag = memalloc_nofs_save();
+ ret = btrfs_drew_lock_init(&root->snapshot_lock);
+ memalloc_nofs_restore(nofs_flag);
+ if (ret)
goto fail;
+
+ if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
+ set_bit(BTRFS_ROOT_REF_COWS, &root->state);
+ btrfs_check_and_init_root_item(&root->root_item);
}
- root->subv_writers = writers;
btrfs_init_free_ino_ctl(root);
spin_lock_init(&root->ino_cache_lock);
@@ -1505,14 +1451,16 @@ fail:
return ret;
}
-struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
- u64 root_id)
+static struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
+ u64 root_id)
{
struct btrfs_root *root;
spin_lock(&fs_info->fs_roots_radix_lock);
root = radix_tree_lookup(&fs_info->fs_roots_radix,
(unsigned long)root_id);
+ if (root)
+ root = btrfs_grab_root(root);
spin_unlock(&fs_info->fs_roots_radix_lock);
return root;
}
@@ -1530,14 +1478,62 @@ int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info,
ret = radix_tree_insert(&fs_info->fs_roots_radix,
(unsigned long)root->root_key.objectid,
root);
- if (ret == 0)
+ if (ret == 0) {
+ btrfs_grab_root(root);
set_bit(BTRFS_ROOT_IN_RADIX, &root->state);
+ }
spin_unlock(&fs_info->fs_roots_radix_lock);
radix_tree_preload_end();
return ret;
}
+void btrfs_check_leaked_roots(struct btrfs_fs_info *fs_info)
+{
+#ifdef CONFIG_BTRFS_DEBUG
+ struct btrfs_root *root;
+
+ while (!list_empty(&fs_info->allocated_roots)) {
+ root = list_first_entry(&fs_info->allocated_roots,
+ struct btrfs_root, leak_list);
+ btrfs_err(fs_info, "leaked root %llu-%llu refcount %d",
+ root->root_key.objectid, root->root_key.offset,
+ refcount_read(&root->refs));
+ while (refcount_read(&root->refs) > 1)
+ btrfs_put_root(root);
+ btrfs_put_root(root);
+ }
+#endif
+}
+
+void btrfs_free_fs_info(struct btrfs_fs_info *fs_info)
+{
+ percpu_counter_destroy(&fs_info->dirty_metadata_bytes);
+ percpu_counter_destroy(&fs_info->delalloc_bytes);
+ percpu_counter_destroy(&fs_info->dio_bytes);
+ percpu_counter_destroy(&fs_info->dev_replace.bio_counter);
+ btrfs_free_csum_hash(fs_info);
+ btrfs_free_stripe_hash_table(fs_info);
+ btrfs_free_ref_cache(fs_info);
+ kfree(fs_info->balance_ctl);
+ kfree(fs_info->delayed_root);
+ btrfs_put_root(fs_info->extent_root);
+ btrfs_put_root(fs_info->tree_root);
+ btrfs_put_root(fs_info->chunk_root);
+ btrfs_put_root(fs_info->dev_root);
+ btrfs_put_root(fs_info->csum_root);
+ btrfs_put_root(fs_info->quota_root);
+ btrfs_put_root(fs_info->uuid_root);
+ btrfs_put_root(fs_info->free_space_root);
+ btrfs_put_root(fs_info->fs_root);
+ btrfs_check_leaked_roots(fs_info);
+ btrfs_extent_buffer_leak_debug_check(fs_info);
+ kfree(fs_info->super_copy);
+ kfree(fs_info->super_for_commit);
+ kvfree(fs_info);
+}
+
+
+/*
+ * Return the root identified by @location.  The well-known trees are handed
+ * out through btrfs_grab_root(); the optional quota, uuid and free space
+ * roots yield ERR_PTR(-ENOENT) when btrfs_grab_root() returns NULL for them.
+ * With @check_ref set, a cached root whose root item has zero refs is put
+ * again and ERR_PTR(-ENOENT) is returned.  A root that is not cached is read
+ * from the tree root and inserted; a lost insertion race (-EEXIST) retries
+ * the lookup.
+ */
struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info,
struct btrfs_key *location,
bool check_ref)
@@ -1548,33 +1544,35 @@ struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info,
int ret;
if (location->objectid == BTRFS_ROOT_TREE_OBJECTID)
- return fs_info->tree_root;
+ return btrfs_grab_root(fs_info->tree_root);
if (location->objectid == BTRFS_EXTENT_TREE_OBJECTID)
- return fs_info->extent_root;
+ return btrfs_grab_root(fs_info->extent_root);
if (location->objectid == BTRFS_CHUNK_TREE_OBJECTID)
- return fs_info->chunk_root;
+ return btrfs_grab_root(fs_info->chunk_root);
if (location->objectid == BTRFS_DEV_TREE_OBJECTID)
- return fs_info->dev_root;
+ return btrfs_grab_root(fs_info->dev_root);
if (location->objectid == BTRFS_CSUM_TREE_OBJECTID)
- return fs_info->csum_root;
+ return btrfs_grab_root(fs_info->csum_root);
if (location->objectid == BTRFS_QUOTA_TREE_OBJECTID)
- return fs_info->quota_root ? fs_info->quota_root :
- ERR_PTR(-ENOENT);
+ return btrfs_grab_root(fs_info->quota_root) ?
+ fs_info->quota_root : ERR_PTR(-ENOENT);
if (location->objectid == BTRFS_UUID_TREE_OBJECTID)
- return fs_info->uuid_root ? fs_info->uuid_root :
- ERR_PTR(-ENOENT);
+ return btrfs_grab_root(fs_info->uuid_root) ?
+ fs_info->uuid_root : ERR_PTR(-ENOENT);
if (location->objectid == BTRFS_FREE_SPACE_TREE_OBJECTID)
- return fs_info->free_space_root ? fs_info->free_space_root :
- ERR_PTR(-ENOENT);
+ return btrfs_grab_root(fs_info->free_space_root) ?
+ fs_info->free_space_root : ERR_PTR(-ENOENT);
again:
root = btrfs_lookup_fs_root(fs_info, location->objectid);
if (root) {
- if (check_ref && btrfs_root_refs(&root->root_item) == 0)
+ if (check_ref && btrfs_root_refs(&root->root_item) == 0) {
+ btrfs_put_root(root);
return ERR_PTR(-ENOENT);
+ }
return root;
}
- root = btrfs_read_fs_root(fs_info->tree_root, location);
+ root = btrfs_read_tree_root(fs_info->tree_root, location);
if (IS_ERR(root))
return root;
@@ -1605,15 +1603,14 @@ again:
ret = btrfs_insert_fs_root(fs_info, root);
if (ret) {
+ /*
+ * Put the root here only when retrying the lookup.  Every other
+ * error goes through the fail label, which does the (single) put;
+ * putting in both places would drop the reference twice.
+ */
if (ret == -EEXIST) {
- btrfs_free_fs_root(root);
+ btrfs_put_root(root);
goto again;
}
goto fail;
}
return root;
fail:
- btrfs_free_fs_root(root);
+ btrfs_put_root(root);
return ERR_PTR(ret);
}
@@ -1985,11 +1982,35 @@ static void free_root_pointers(struct btrfs_fs_info *info, bool free_chunk_root)
free_root_extent_buffers(info->csum_root);
free_root_extent_buffers(info->quota_root);
free_root_extent_buffers(info->uuid_root);
+ free_root_extent_buffers(info->fs_root);
if (free_chunk_root)
free_root_extent_buffers(info->chunk_root);
free_root_extent_buffers(info->free_space_root);
}
+/*
+ * Drop one reference on @root; a NULL @root is ignored.
+ *
+ * When the reference count reaches zero the root is torn down: the anonymous
+ * device (if one is set), the snapshot lock, the node and commit_root extent
+ * buffers and the free_ino_ctl/free_ino_pinned allocations are released, and
+ * finally the root itself is freed.
+ */
+void btrfs_put_root(struct btrfs_root *root)
+{
+	/* Nothing to tear down unless this was the final reference. */
+	if (!root || !refcount_dec_and_test(&root->refs))
+		return;
+
+	WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree));
+	if (root->anon_dev)
+		free_anon_bdev(root->anon_dev);
+	btrfs_drew_lock_destroy(&root->snapshot_lock);
+	free_extent_buffer(root->node);
+	free_extent_buffer(root->commit_root);
+	kfree(root->free_ino_ctl);
+	kfree(root->free_ino_pinned);
+#ifdef CONFIG_BTRFS_DEBUG
+	/* Unlink from the debug leak list while holding fs_roots_radix_lock. */
+	spin_lock(&root->fs_info->fs_roots_radix_lock);
+	list_del_init(&root->leak_list);
+	spin_unlock(&root->fs_info->fs_roots_radix_lock);
+#endif
+	kfree(root);
+}
+
void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info)
{
int ret;
@@ -2001,13 +2022,9 @@ void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info)
struct btrfs_root, root_list);
list_del(&gang[0]->root_list);
- if (test_bit(BTRFS_ROOT_IN_RADIX, &gang[0]->state)) {
+ if (test_bit(BTRFS_ROOT_IN_RADIX, &gang[0]->state))
btrfs_drop_and_free_fs_root(fs_info, gang[0]);
- } else {
- free_extent_buffer(gang[0]->node);
- free_extent_buffer(gang[0]->commit_root);
- btrfs_put_fs_root(gang[0]);
- }
+ btrfs_put_root(gang[0]);
}
while (1) {
@@ -2020,10 +2037,8 @@ void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info)
btrfs_drop_and_free_fs_root(fs_info, gang[i]);
}
- if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
+ if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
btrfs_free_log_root_tree(NULL, fs_info);
- btrfs_destroy_pinned_extent(fs_info, fs_info->pinned_extents);
- }
}
static void btrfs_init_scrub(struct btrfs_fs_info *fs_info)
@@ -2069,7 +2084,7 @@ static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info)
BTRFS_I(inode)->io_tree.ops = &btree_extent_io_ops;
- BTRFS_I(inode)->root = fs_info->tree_root;
+ BTRFS_I(inode)->root = btrfs_grab_root(fs_info->tree_root);
memset(&BTRFS_I(inode)->location, 0, sizeof(struct btrfs_key));
set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags);
btrfs_insert_inode_hash(inode);
@@ -2189,11 +2204,6 @@ static int btrfs_init_csum_hash(struct btrfs_fs_info *fs_info, u16 csum_type)
return 0;
}
-static void btrfs_free_csum_hash(struct btrfs_fs_info *fs_info)
-{
- crypto_free_shash(fs_info->csum_shash);
-}
-
static int btrfs_replay_log(struct btrfs_fs_info *fs_info,
struct btrfs_fs_devices *fs_devices)
{
@@ -2208,24 +2218,23 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info,
return -EIO;
}
- log_tree_root = btrfs_alloc_root(fs_info, GFP_KERNEL);
+ log_tree_root = btrfs_alloc_root(fs_info, BTRFS_TREE_LOG_OBJECTID,
+ GFP_KERNEL);
if (!log_tree_root)
return -ENOMEM;
- __setup_root(log_tree_root, fs_info, BTRFS_TREE_LOG_OBJECTID);
-
log_tree_root->node = read_tree_block(fs_info, bytenr,
fs_info->generation + 1,
level, NULL);
if (IS_ERR(log_tree_root->node)) {
btrfs_warn(fs_info, "failed to read log tree");
ret = PTR_ERR(log_tree_root->node);
- kfree(log_tree_root);
+ log_tree_root->node = NULL;
+ btrfs_put_root(log_tree_root);
return ret;
} else if (!extent_buffer_uptodate(log_tree_root->node)) {
btrfs_err(fs_info, "failed to read log tree");
- free_extent_buffer(log_tree_root->node);
- kfree(log_tree_root);
+ btrfs_put_root(log_tree_root);
return -EIO;
}
/* returns with log_tree_root freed on success */
@@ -2233,8 +2242,7 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info,
if (ret) {
btrfs_handle_fs_error(fs_info, ret,
"Failed to recover log tree");
- free_extent_buffer(log_tree_root->node);
- kfree(log_tree_root);
+ btrfs_put_root(log_tree_root);
return ret;
}
@@ -2624,67 +2632,8 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info)
return ret;
}
-int __cold open_ctree(struct super_block *sb,
- struct btrfs_fs_devices *fs_devices,
- char *options)
+void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
{
- u32 sectorsize;
- u32 nodesize;
- u32 stripesize;
- u64 generation;
- u64 features;
- u16 csum_type;
- struct btrfs_key location;
- struct buffer_head *bh;
- struct btrfs_super_block *disk_super;
- struct btrfs_fs_info *fs_info = btrfs_sb(sb);
- struct btrfs_root *tree_root;
- struct btrfs_root *chunk_root;
- int ret;
- int err = -EINVAL;
- int clear_free_space_tree = 0;
- int level;
-
- tree_root = fs_info->tree_root = btrfs_alloc_root(fs_info, GFP_KERNEL);
- chunk_root = fs_info->chunk_root = btrfs_alloc_root(fs_info, GFP_KERNEL);
- if (!tree_root || !chunk_root) {
- err = -ENOMEM;
- goto fail;
- }
-
- ret = init_srcu_struct(&fs_info->subvol_srcu);
- if (ret) {
- err = ret;
- goto fail;
- }
-
- ret = percpu_counter_init(&fs_info->dio_bytes, 0, GFP_KERNEL);
- if (ret) {
- err = ret;
- goto fail_srcu;
- }
-
- ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0, GFP_KERNEL);
- if (ret) {
- err = ret;
- goto fail_dio_bytes;
- }
- fs_info->dirty_metadata_batch = PAGE_SIZE *
- (1 + ilog2(nr_cpu_ids));
-
- ret = percpu_counter_init(&fs_info->delalloc_bytes, 0, GFP_KERNEL);
- if (ret) {
- err = ret;
- goto fail_dirty_metadata_bytes;
- }
-
- ret = percpu_counter_init(&fs_info->dev_replace.bio_counter, 0,
- GFP_KERNEL);
- if (ret) {
- err = ret;
- goto fail_delalloc_bytes;
- }
-
INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC);
INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC);
INIT_LIST_HEAD(&fs_info->trans_list);
@@ -2711,6 +2660,11 @@ int __cold open_ctree(struct super_block *sb,
INIT_LIST_HEAD(&fs_info->space_info);
INIT_LIST_HEAD(&fs_info->tree_mod_seq_list);
INIT_LIST_HEAD(&fs_info->unused_bgs);
+#ifdef CONFIG_BTRFS_DEBUG
+ INIT_LIST_HEAD(&fs_info->allocated_roots);
+ INIT_LIST_HEAD(&fs_info->allocated_ebs);
+ spin_lock_init(&fs_info->eb_leak_lock);
+#endif
extent_map_tree_init(&fs_info->mapping_tree);
btrfs_init_block_rsv(&fs_info->global_block_rsv,
BTRFS_BLOCK_RSV_GLOBAL);
@@ -2727,7 +2681,6 @@ int __cold open_ctree(struct super_block *sb,
atomic_set(&fs_info->reada_works_cnt, 0);
atomic_set(&fs_info->nr_delayed_iputs, 0);
atomic64_set(&fs_info->tree_mod_seq, 0);
- fs_info->sb = sb;
fs_info->max_inline = BTRFS_DEFAULT_MAX_INLINE;
fs_info->metadata_ratio = 0;
fs_info->defrag_inodes = RB_ROOT;
@@ -2746,21 +2699,6 @@ int __cold open_ctree(struct super_block *sb,
INIT_LIST_HEAD(&fs_info->ordered_roots);
spin_lock_init(&fs_info->ordered_root_lock);
- fs_info->btree_inode = new_inode(sb);
- if (!fs_info->btree_inode) {
- err = -ENOMEM;
- goto fail_bio_counter;
- }
- mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS);
-
- fs_info->delayed_root = kmalloc(sizeof(struct btrfs_delayed_root),
- GFP_KERNEL);
- if (!fs_info->delayed_root) {
- err = -ENOMEM;
- goto fail_iput;
- }
- btrfs_init_delayed_root(fs_info->delayed_root);
-
btrfs_init_scrub(fs_info);
#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
fs_info->check_integrity_print_mask = 0;
@@ -2768,20 +2706,12 @@ int __cold open_ctree(struct super_block *sb,
btrfs_init_balance(fs_info);
btrfs_init_async_reclaim_work(&fs_info->async_reclaim_work);
- sb->s_blocksize = BTRFS_BDEV_BLOCKSIZE;
- sb->s_blocksize_bits = blksize_bits(BTRFS_BDEV_BLOCKSIZE);
-
- btrfs_init_btree_inode(fs_info);
-
spin_lock_init(&fs_info->block_group_cache_lock);
fs_info->block_group_cache_tree = RB_ROOT;
fs_info->first_logical_byte = (u64)-1;
- extent_io_tree_init(fs_info, &fs_info->freed_extents[0],
- IO_TREE_FS_INFO_FREED_EXTENTS0, NULL);
- extent_io_tree_init(fs_info, &fs_info->freed_extents[1],
- IO_TREE_FS_INFO_FREED_EXTENTS1, NULL);
- fs_info->pinned_extents = &fs_info->freed_extents[0];
+ extent_io_tree_init(fs_info, &fs_info->excluded_extents,
+ IO_TREE_FS_EXCLUDED_EXTENTS, NULL);
set_bit(BTRFS_FS_BARRIER, &fs_info->flags);
mutex_init(&fs_info->ordered_operations_mutex);
@@ -2817,23 +2747,135 @@ int __cold open_ctree(struct super_block *sb,
fs_info->swapfile_pins = RB_ROOT;
fs_info->send_in_progress = 0;
+}
+
+/*
+ * Mount-time part of the fs_info setup: attach @sb, set its block size,
+ * initialise the per-cpu counters, allocate and initialise the delayed root
+ * and allocate the stripe hash table.
+ *
+ * Returns 0 on success or a negative errno from the first step that failed.
+ * NOTE(review): nothing already set up is undone here on failure; presumably
+ * the caller's fs_info teardown takes care of that - confirm.
+ */
+static int init_mount_fs_info(struct btrfs_fs_info *fs_info, struct super_block *sb)
+{
+	struct btrfs_delayed_root *delayed_root;
+	int err;
+
+	sb->s_blocksize = BTRFS_BDEV_BLOCKSIZE;
+	sb->s_blocksize_bits = blksize_bits(BTRFS_BDEV_BLOCKSIZE);
+	fs_info->sb = sb;
+
+	err = percpu_counter_init(&fs_info->dio_bytes, 0, GFP_KERNEL);
+	if (err)
+		return err;
+
+	err = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0, GFP_KERNEL);
+	if (err)
+		return err;
+
+	fs_info->dirty_metadata_batch = PAGE_SIZE * (1 + ilog2(nr_cpu_ids));
+
+	err = percpu_counter_init(&fs_info->delalloc_bytes, 0, GFP_KERNEL);
+	if (err)
+		return err;
+
+	err = percpu_counter_init(&fs_info->dev_replace.bio_counter, 0,
+				  GFP_KERNEL);
+	if (err)
+		return err;
+
+	/* fs_info->delayed_root is assigned even when the allocation failed. */
+	delayed_root = kmalloc(sizeof(*delayed_root), GFP_KERNEL);
+	fs_info->delayed_root = delayed_root;
+	if (!delayed_root)
+		return -ENOMEM;
+	btrfs_init_delayed_root(delayed_root);
+
+	return btrfs_alloc_stripe_hash_table(fs_info);
+}
+
+/*
+ * Thread function started by btrfs_check_uuid_tree(); @data is the fs_info.
+ * Returns the btrfs_uuid_tree_iterate() error if the first step fails,
+ * otherwise whatever btrfs_uuid_scan_kthread() returns.
+ */
+static int btrfs_uuid_rescan_kthread(void *data)
+{
+	struct btrfs_fs_info *fs_info = data;
+	int ret;
- ret = btrfs_alloc_stripe_hash_table(fs_info);
+
+	/*
+	 * 1st step is to iterate through the existing UUID tree and
+	 * to delete all entries that contain outdated data.
+	 * 2nd step is to add all missing entries to the UUID tree.
+	 */
+	ret = btrfs_uuid_tree_iterate(fs_info);
+	if (ret >= 0)
+		return btrfs_uuid_scan_kthread(data);
+
+	/* Step one failed: warn unless interrupted, then release the semaphore. */
+	if (ret != -EINTR)
+		btrfs_warn(fs_info, "iterating uuid_tree failed %d",
+			   ret);
+	up(&fs_info->uuid_tree_rescan_sem);
+	return ret;
+}
+
+/*
+ * Take uuid_tree_rescan_sem and start the "btrfs-uuid" kthread running
+ * btrfs_uuid_rescan_kthread().
+ *
+ * Returns 0 once the thread has been started.  If kthread_run() fails the
+ * semaphore is released again and the kthread_run() error is returned.
+ */
+static int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info)
+{
+	struct task_struct *rescan_task;
+
+	down(&fs_info->uuid_tree_rescan_sem);
+	rescan_task = kthread_run(btrfs_uuid_rescan_kthread, fs_info, "btrfs-uuid");
+	if (!IS_ERR(rescan_task))
+		return 0;
+
+	/* fs_info->update_uuid_tree_gen remains 0 in all error cases */
+	btrfs_warn(fs_info, "failed to start uuid_rescan task");
+	up(&fs_info->uuid_tree_rescan_sem);
+	return PTR_ERR(rescan_task);
+}
+
+int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices,
+ char *options)
+{
+ u32 sectorsize;
+ u32 nodesize;
+ u32 stripesize;
+ u64 generation;
+ u64 features;
+ u16 csum_type;
+ struct btrfs_key location;
+ struct btrfs_super_block *disk_super;
+ struct btrfs_fs_info *fs_info = btrfs_sb(sb);
+ struct btrfs_root *tree_root;
+ struct btrfs_root *chunk_root;
+ int ret;
+ int err = -EINVAL;
+ int clear_free_space_tree = 0;
+ int level;
+
+ ret = init_mount_fs_info(fs_info, sb);
if (ret) {
err = ret;
- goto fail_alloc;
+ goto fail;
}
- __setup_root(tree_root, fs_info, BTRFS_ROOT_TREE_OBJECTID);
+ /* These need to be init'ed before we start creating inodes and such. */
+ tree_root = btrfs_alloc_root(fs_info, BTRFS_ROOT_TREE_OBJECTID,
+ GFP_KERNEL);
+ fs_info->tree_root = tree_root;
+ chunk_root = btrfs_alloc_root(fs_info, BTRFS_CHUNK_TREE_OBJECTID,
+ GFP_KERNEL);
+ fs_info->chunk_root = chunk_root;
+ if (!tree_root || !chunk_root) {
+ err = -ENOMEM;
+ goto fail;
+ }
+
+ fs_info->btree_inode = new_inode(sb);
+ if (!fs_info->btree_inode) {
+ err = -ENOMEM;
+ goto fail;
+ }
+ mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS);
+ btrfs_init_btree_inode(fs_info);
invalidate_bdev(fs_devices->latest_bdev);
/*
* Read super block and check the signature bytes only
*/
- bh = btrfs_read_dev_super(fs_devices->latest_bdev);
- if (IS_ERR(bh)) {
- err = PTR_ERR(bh);
+ disk_super = btrfs_read_dev_super(fs_devices->latest_bdev);
+ if (IS_ERR(disk_super)) {
+ err = PTR_ERR(disk_super);
goto fail_alloc;
}
@@ -2841,18 +2883,19 @@ int __cold open_ctree(struct super_block *sb,
* Verify the type first, if that or the checksum value are
* corrupted, we'll find out
*/
- csum_type = btrfs_super_csum_type((struct btrfs_super_block *)bh->b_data);
+ csum_type = btrfs_super_csum_type(disk_super);
if (!btrfs_supported_super_csum(csum_type)) {
btrfs_err(fs_info, "unsupported checksum algorithm: %u",
csum_type);
err = -EINVAL;
- brelse(bh);
+ btrfs_release_disk_super(disk_super);
goto fail_alloc;
}
ret = btrfs_init_csum_hash(fs_info, csum_type);
if (ret) {
err = ret;
+ btrfs_release_disk_super(disk_super);
goto fail_alloc;
}
@@ -2860,11 +2903,11 @@ int __cold open_ctree(struct super_block *sb,
* We want to check superblock checksum, the type is stored inside.
* Pass the whole disk block of size BTRFS_SUPER_INFO_SIZE (4k).
*/
- if (btrfs_check_super_csum(fs_info, bh->b_data)) {
+ if (btrfs_check_super_csum(fs_info, (u8 *)disk_super)) {
btrfs_err(fs_info, "superblock checksum mismatch");
err = -EINVAL;
- brelse(bh);
- goto fail_csum;
+ btrfs_release_disk_super(disk_super);
+ goto fail_alloc;
}
/*
@@ -2872,8 +2915,8 @@ int __cold open_ctree(struct super_block *sb,
* following bytes up to INFO_SIZE, the checksum is calculated from
* the whole block of INFO_SIZE
*/
- memcpy(fs_info->super_copy, bh->b_data, sizeof(*fs_info->super_copy));
- brelse(bh);
+ memcpy(fs_info->super_copy, disk_super, sizeof(*fs_info->super_copy));
+ btrfs_release_disk_super(disk_super);
disk_super = fs_info->super_copy;
@@ -2901,11 +2944,11 @@ int __cold open_ctree(struct super_block *sb,
if (ret) {
btrfs_err(fs_info, "superblock contains fatal errors");
err = -EINVAL;
- goto fail_csum;
+ goto fail_alloc;
}
if (!btrfs_super_root(disk_super))
- goto fail_csum;
+ goto fail_alloc;
/* check FS state, whether FS is broken. */
if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_ERROR)
@@ -2920,7 +2963,7 @@ int __cold open_ctree(struct super_block *sb,
ret = btrfs_parse_options(fs_info, options, sb->s_flags);
if (ret) {
err = ret;
- goto fail_csum;
+ goto fail_alloc;
}
features = btrfs_super_incompat_flags(disk_super) &
@@ -2930,7 +2973,7 @@ int __cold open_ctree(struct super_block *sb,
"cannot mount because of unsupported optional features (%llx)",
features);
err = -EINVAL;
- goto fail_csum;
+ goto fail_alloc;
}
features = btrfs_super_incompat_flags(disk_super);
@@ -2974,7 +3017,7 @@ int __cold open_ctree(struct super_block *sb,
btrfs_err(fs_info,
"unequal nodesize/sectorsize (%u != %u) are not allowed for mixed block groups",
nodesize, sectorsize);
- goto fail_csum;
+ goto fail_alloc;
}
/*
@@ -2990,7 +3033,7 @@ int __cold open_ctree(struct super_block *sb,
"cannot mount read-write because of unsupported optional features (%llx)",
features);
err = -EINVAL;
- goto fail_csum;
+ goto fail_alloc;
}
ret = btrfs_init_workqueues(fs_info, fs_devices);
@@ -3021,8 +3064,6 @@ int __cold open_ctree(struct super_block *sb,
generation = btrfs_super_chunk_root_generation(disk_super);
level = btrfs_super_chunk_root_level(disk_super);
- __setup_root(chunk_root, fs_info, BTRFS_CHUNK_TREE_OBJECTID);
-
chunk_root->node = read_tree_block(fs_info,
btrfs_super_chunk_root(disk_super),
generation, level, NULL);
@@ -3038,7 +3079,8 @@ int __cold open_ctree(struct super_block *sb,
chunk_root->commit_root = btrfs_root_node(chunk_root);
read_extent_buffer(chunk_root->node, fs_info->chunk_tree_uuid,
- btrfs_header_chunk_tree_uuid(chunk_root->node), BTRFS_UUID_SIZE);
+ offsetof(struct btrfs_header, chunk_tree_uuid),
+ BTRFS_UUID_SIZE);
ret = btrfs_read_chunk_tree(fs_info);
if (ret) {
@@ -3061,6 +3103,18 @@ int __cold open_ctree(struct super_block *sb,
if (ret)
goto fail_tree_roots;
+ /*
+ * If we have a uuid root and we're not being told to rescan we need to
+ * check the generation here so we can set the
+ * BTRFS_FS_UPDATE_UUID_TREE_GEN bit. Otherwise we could commit the
+ * transaction during a balance or the log replay without updating the
+ * uuid generation, and then if we crash we would rescan the uuid tree,
+ * even though it was perfectly fine.
+ */
+ if (fs_info->uuid_root && !btrfs_test_opt(fs_info, RESCAN_UUID_TREE) &&
+ fs_info->generation == btrfs_super_uuid_tree_generation(disk_super))
+ set_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags);
+
ret = btrfs_verify_dev_extents(fs_info);
if (ret) {
btrfs_err(fs_info,
@@ -3164,6 +3218,7 @@ int __cold open_ctree(struct super_block *sb,
/* do not make disk changes in broken FS or nologreplay is given */
if (btrfs_super_log_root(disk_super) != 0 &&
!btrfs_test_opt(fs_info, NOLOGREPLAY)) {
+ btrfs_info(fs_info, "start tree-log replay");
ret = btrfs_replay_log(fs_info, fs_devices);
if (ret) {
err = ret;
@@ -3195,10 +3250,11 @@ int __cold open_ctree(struct super_block *sb,
location.type = BTRFS_ROOT_ITEM_KEY;
location.offset = 0;
- fs_info->fs_root = btrfs_read_fs_root_no_name(fs_info, &location);
+ fs_info->fs_root = btrfs_get_fs_root(fs_info, &location, true);
if (IS_ERR(fs_info->fs_root)) {
err = PTR_ERR(fs_info->fs_root);
btrfs_warn(fs_info, "failed to read fs tree: %d", err);
+ fs_info->fs_root = NULL;
goto fail_qgroup;
}
@@ -3283,8 +3339,6 @@ int __cold open_ctree(struct super_block *sb,
close_ctree(fs_info);
return ret;
}
- } else {
- set_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags);
}
set_bit(BTRFS_FS_OPEN, &fs_info->flags);
@@ -3327,90 +3381,78 @@ fail_tree_roots:
fail_sb_buffer:
btrfs_stop_all_workers(fs_info);
btrfs_free_block_groups(fs_info);
-fail_csum:
- btrfs_free_csum_hash(fs_info);
fail_alloc:
-fail_iput:
btrfs_mapping_tree_free(&fs_info->mapping_tree);
iput(fs_info->btree_inode);
-fail_bio_counter:
- percpu_counter_destroy(&fs_info->dev_replace.bio_counter);
-fail_delalloc_bytes:
- percpu_counter_destroy(&fs_info->delalloc_bytes);
-fail_dirty_metadata_bytes:
- percpu_counter_destroy(&fs_info->dirty_metadata_bytes);
-fail_dio_bytes:
- percpu_counter_destroy(&fs_info->dio_bytes);
-fail_srcu:
- cleanup_srcu_struct(&fs_info->subvol_srcu);
fail:
- btrfs_free_stripe_hash_table(fs_info);
btrfs_close_devices(fs_info->fs_devices);
return err;
}
ALLOW_ERROR_INJECTION(open_ctree, ERRNO);
-static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate)
+/*
+ * Completion callback for the super block bios built in write_dev_supers();
+ * bio->bi_private carries the device the bio was written to.
+ *
+ * For every page in the bio: on success the page is marked uptodate; on
+ * failure a rate-limited warning is logged, the page is flagged
+ * !uptodate + error and the device's write error statistic is bumped.
+ * Each page then has one reference dropped and is unlocked, and finally
+ * the bio itself is put.
+ */
+static void btrfs_end_super_write(struct bio *bio)
{
- if (uptodate) {
- set_buffer_uptodate(bh);
- } else {
- struct btrfs_device *device = (struct btrfs_device *)
- bh->b_private;
-
- btrfs_warn_rl_in_rcu(device->fs_info,
- "lost page write due to IO error on %s",
- rcu_str_deref(device->name));
- /* note, we don't set_buffer_write_io_error because we have
- * our own ways of dealing with the IO errors
- */
- clear_buffer_uptodate(bh);
- btrfs_dev_stat_inc_and_print(device, BTRFS_DEV_STAT_WRITE_ERRS);
+ struct btrfs_device *device = bio->bi_private;
+ struct bvec_iter_all iter_all;
+ struct bio_vec *bvec;
+
+ bio_for_each_segment_all(bvec, bio, iter_all) {
+ struct page *page = bvec->bv_page;
+
+ if (!bio->bi_status) {
+ SetPageUptodate(page);
+ } else {
+ btrfs_warn_rl_in_rcu(device->fs_info,
+ "lost page write due to IO error on %s (%d)",
+ rcu_str_deref(device->name),
+ blk_status_to_errno(bio->bi_status));
+ ClearPageUptodate(page);
+ SetPageError(page);
+ btrfs_dev_stat_inc_and_print(device,
+ BTRFS_DEV_STAT_WRITE_ERRS);
+ }
+
+ /* Unlocking the page is what wait_dev_supers() waits for. */
+ put_page(page);
+ unlock_page(page);
}
- unlock_buffer(bh);
- put_bh(bh);
+
+ bio_put(bio);
}
-int btrfs_read_dev_one_super(struct block_device *bdev, int copy_num,
- struct buffer_head **bh_ret)
+/*
+ * Read super block copy @copy_num of @bdev through the block device's page
+ * cache mapping.
+ *
+ * Returns the super block on success.  Returns ERR_PTR(-EINVAL) when the
+ * copy does not lie inside the device or when its recorded bytenr or magic
+ * do not match, and passes on the error from read_cache_page_gfp() when the
+ * page cannot be read.
+ */
+struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev,
+ int copy_num)
{
- struct buffer_head *bh;
struct btrfs_super_block *super;
+ struct page *sb_page;
u64 bytenr;
bytenr = btrfs_sb_offset(copy_num);
if (bytenr + BTRFS_SUPER_INFO_SIZE >= i_size_read(bdev->bd_inode))
- return -EINVAL;
+ return ERR_PTR(-EINVAL);
- bh = __bread(bdev, bytenr / BTRFS_BDEV_BLOCKSIZE, BTRFS_SUPER_INFO_SIZE);
- /*
- * If we fail to read from the underlying devices, as of now
- * the best option we have is to mark it EIO.
- */
- if (!bh)
- return -EIO;
+ sb_page = read_cache_page_gfp(bdev->bd_inode->i_mapping,
+ bytenr >> PAGE_SHIFT, GFP_NOFS);
+ if (IS_ERR(sb_page))
+ return ERR_CAST(sb_page);
- super = (struct btrfs_super_block *)bh->b_data;
+ /* The super block is taken to start at the beginning of the page. */
+ super = page_address(sb_page);
if (btrfs_super_bytenr(super) != bytenr ||
btrfs_super_magic(super) != BTRFS_MAGIC) {
- brelse(bh);
- return -EINVAL;
+ btrfs_release_disk_super(super);
+ return ERR_PTR(-EINVAL);
}
- *bh_ret = bh;
- return 0;
+ return super;
}
-struct buffer_head *btrfs_read_dev_super(struct block_device *bdev)
+/*
+ * Return the readable super block copy of @bdev with the highest generation,
+ * or the ERR_PTR() of the last read attempt when no copy could be read.
+ * Only the primary copy is examined at the moment (see the loop bound).
+ */
+struct btrfs_super_block *btrfs_read_dev_super(struct block_device *bdev)
{
- struct buffer_head *bh;
- struct buffer_head *latest = NULL;
- struct btrfs_super_block *super;
+ struct btrfs_super_block *super, *latest = NULL;
int i;
u64 transid = 0;
- int ret = -EINVAL;
/* we would like to check all the supers, but that would make
* a btrfs mount succeed after a mkfs from a different FS.
@@ -3418,48 +3460,41 @@ struct buffer_head *btrfs_read_dev_super(struct block_device *bdev)
* later supers, using BTRFS_SUPER_MIRROR_MAX instead
*/
for (i = 0; i < 1; i++) {
- ret = btrfs_read_dev_one_super(bdev, i, &bh);
- if (ret)
+ super = btrfs_read_dev_one_super(bdev, i);
+ if (IS_ERR(super))
continue;
- super = (struct btrfs_super_block *)bh->b_data;
-
if (!latest || btrfs_super_generation(super) > transid) {
- brelse(latest);
- latest = bh;
+ /* Release the copy being superseded, not the one we keep. */
+ if (latest)
+ btrfs_release_disk_super(latest);
+
+ latest = super;
transid = btrfs_super_generation(super);
} else {
- brelse(bh);
+ /* A copy that is not newer is of no use; drop it right away. */
+ btrfs_release_disk_super(super);
}
}
+ /* Every read failed: super still holds the ERR_PTR() of the last try. */
if (!latest)
- return ERR_PTR(ret);
-
+ return super;
+
return latest;
}
/*
* Write superblock @sb to the @device. Do not wait for completion, all the
- * buffer heads we write are pinned.
+ * pages we use for writing are locked.
*
* Write @max_mirrors copies of the superblock, where 0 means default that fit
* the expected device size at commit time. Note that max_mirrors must be
* same for write and wait phases.
*
- * Return number of errors when buffer head is not found or submission fails.
+ * Return number of errors when page is not found or submission fails.
*/
static int write_dev_supers(struct btrfs_device *device,
struct btrfs_super_block *sb, int max_mirrors)
{
struct btrfs_fs_info *fs_info = device->fs_info;
+ struct address_space *mapping = device->bdev->bd_inode->i_mapping;
SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
- struct buffer_head *bh;
int i;
- int ret;
int errors = 0;
u64 bytenr;
- int op_flags;
if (max_mirrors == 0)
max_mirrors = BTRFS_SUPER_MIRROR_MAX;
@@ -3467,6 +3502,10 @@ static int write_dev_supers(struct btrfs_device *device,
shash->tfm = fs_info->csum_shash;
for (i = 0; i < max_mirrors; i++) {
+ struct page *page;
+ struct bio *bio;
+ struct btrfs_super_block *disk_super;
+
bytenr = btrfs_sb_offset(i);
if (bytenr + BTRFS_SUPER_INFO_SIZE >=
device->commit_total_bytes)
@@ -3479,37 +3518,45 @@ static int write_dev_supers(struct btrfs_device *device,
BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE);
crypto_shash_final(shash, sb->csum);
- /* One reference for us, and we leave it for the caller */
- bh = __getblk(device->bdev, bytenr / BTRFS_BDEV_BLOCKSIZE,
- BTRFS_SUPER_INFO_SIZE);
- if (!bh) {
+ page = find_or_create_page(mapping, bytenr >> PAGE_SHIFT,
+ GFP_NOFS);
+ if (!page) {
btrfs_err(device->fs_info,
- "couldn't get super buffer head for bytenr %llu",
+ "couldn't get super block page for bytenr %llu",
bytenr);
errors++;
continue;
}
- memcpy(bh->b_data, sb, BTRFS_SUPER_INFO_SIZE);
+ /* Bump the refcount for wait_dev_supers() */
+ get_page(page);
- /* one reference for submit_bh */
- get_bh(bh);
+ disk_super = page_address(page);
+ memcpy(disk_super, sb, BTRFS_SUPER_INFO_SIZE);
- set_buffer_uptodate(bh);
- lock_buffer(bh);
- bh->b_end_io = btrfs_end_buffer_write_sync;
- bh->b_private = device;
+ /*
+ * Directly use bios here instead of relying on the page cache
+ * to do I/O, so we don't lose the ability to do integrity
+ * checking.
+ */
+ bio = bio_alloc(GFP_NOFS, 1);
+ bio_set_dev(bio, device->bdev);
+ bio->bi_iter.bi_sector = bytenr >> SECTOR_SHIFT;
+ bio->bi_private = device;
+ bio->bi_end_io = btrfs_end_super_write;
+ __bio_add_page(bio, page, BTRFS_SUPER_INFO_SIZE,
+ offset_in_page(bytenr));
/*
- * we fua the first super. The others we allow
- * to go down lazy.
+ * We FUA only the first super block. The others we allow to
+ * go down lazy and there's a short window where the on-disk
+ * copies might still contain the older version.
*/
- op_flags = REQ_SYNC | REQ_META | REQ_PRIO;
+ bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_META | REQ_PRIO;
if (i == 0 && !btrfs_test_opt(device->fs_info, NOBARRIER))
- op_flags |= REQ_FUA;
- ret = btrfsic_submit_bh(REQ_OP_WRITE, op_flags, bh);
- if (ret)
- errors++;
+ bio->bi_opf |= REQ_FUA;
+
+ btrfsic_submit_bio(bio);
}
return errors < i ? 0 : -1;
}
@@ -3518,12 +3565,11 @@ static int write_dev_supers(struct btrfs_device *device,
* Wait for write completion of superblocks done by write_dev_supers,
* @max_mirrors same for write and wait phases.
*
- * Return number of errors when buffer head is not found or not marked up to
+ * Return number of errors when page is not found or not marked up to
* date.
*/
static int wait_dev_supers(struct btrfs_device *device, int max_mirrors)
{
- struct buffer_head *bh;
int i;
int errors = 0;
bool primary_failed = false;
@@ -3533,32 +3579,34 @@ static int wait_dev_supers(struct btrfs_device *device, int max_mirrors)
max_mirrors = BTRFS_SUPER_MIRROR_MAX;
for (i = 0; i < max_mirrors; i++) {
+ struct page *page;
+
bytenr = btrfs_sb_offset(i);
if (bytenr + BTRFS_SUPER_INFO_SIZE >=
device->commit_total_bytes)
break;
- bh = __find_get_block(device->bdev,
- bytenr / BTRFS_BDEV_BLOCKSIZE,
- BTRFS_SUPER_INFO_SIZE);
- if (!bh) {
+ page = find_get_page(device->bdev->bd_inode->i_mapping,
+ bytenr >> PAGE_SHIFT);
+ if (!page) {
errors++;
if (i == 0)
primary_failed = true;
continue;
}
- wait_on_buffer(bh);
- if (!buffer_uptodate(bh)) {
+ /* Page is submitted locked and unlocked once the IO completes */
+ wait_on_page_locked(page);
+ if (PageError(page)) {
errors++;
if (i == 0)
primary_failed = true;
}
- /* drop our reference */
- brelse(bh);
+ /* Drop our reference */
+ put_page(page);
- /* drop the reference from the writing run */
- brelse(bh);
+ /* Drop the reference from the writing run */
+ put_page(page);
}
/* log error, force error return */
@@ -3830,20 +3878,19 @@ int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors)
void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
struct btrfs_root *root)
{
+ bool drop_ref = false;
+
spin_lock(&fs_info->fs_roots_radix_lock);
radix_tree_delete(&fs_info->fs_roots_radix,
(unsigned long)root->root_key.objectid);
+ if (test_and_clear_bit(BTRFS_ROOT_IN_RADIX, &root->state))
+ drop_ref = true;
spin_unlock(&fs_info->fs_roots_radix_lock);
- if (btrfs_root_refs(&root->root_item) == 0)
- synchronize_srcu(&fs_info->subvol_srcu);
-
if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
btrfs_free_log(NULL, root);
if (root->reloc_root) {
- free_extent_buffer(root->reloc_root->node);
- free_extent_buffer(root->reloc_root->commit_root);
- btrfs_put_fs_root(root->reloc_root);
+ btrfs_put_root(root->reloc_root);
root->reloc_root = NULL;
}
}
@@ -3852,22 +3899,12 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
__btrfs_remove_free_space_cache(root->free_ino_pinned);
if (root->free_ino_ctl)
__btrfs_remove_free_space_cache(root->free_ino_ctl);
- btrfs_free_fs_root(root);
-}
-
-void btrfs_free_fs_root(struct btrfs_root *root)
-{
- iput(root->ino_cache_inode);
- WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree));
- if (root->anon_dev)
- free_anon_bdev(root->anon_dev);
- if (root->subv_writers)
- btrfs_free_subvolume_writers(root->subv_writers);
- free_extent_buffer(root->node);
- free_extent_buffer(root->commit_root);
- kfree(root->free_ino_ctl);
- kfree(root->free_ino_pinned);
- btrfs_put_fs_root(root);
+ if (root->ino_cache_inode) {
+ iput(root->ino_cache_inode);
+ root->ino_cache_inode = NULL;
+ }
+ if (drop_ref)
+ btrfs_put_root(root);
}
int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info)
@@ -3877,15 +3914,14 @@ int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info)
int i = 0;
int err = 0;
unsigned int ret = 0;
- int index;
while (1) {
- index = srcu_read_lock(&fs_info->subvol_srcu);
+ spin_lock(&fs_info->fs_roots_radix_lock);
ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
(void **)gang, root_objectid,
ARRAY_SIZE(gang));
if (!ret) {
- srcu_read_unlock(&fs_info->subvol_srcu, index);
+ spin_unlock(&fs_info->fs_roots_radix_lock);
break;
}
root_objectid = gang[ret - 1]->root_key.objectid + 1;
@@ -3897,9 +3933,9 @@ int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info)
continue;
}
/* grab all the search result for later use */
- gang[i] = btrfs_grab_fs_root(gang[i]);
+ gang[i] = btrfs_grab_root(gang[i]);
}
- srcu_read_unlock(&fs_info->subvol_srcu, index);
+ spin_unlock(&fs_info->fs_roots_radix_lock);
for (i = 0; i < ret; i++) {
if (!gang[i])
@@ -3908,7 +3944,7 @@ int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info)
err = btrfs_orphan_cleanup(gang[i]);
if (err)
break;
- btrfs_put_fs_root(gang[i]);
+ btrfs_put_root(gang[i]);
}
root_objectid++;
}
@@ -3916,7 +3952,7 @@ int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info)
/* release the uncleaned roots due to error */
for (; i < ret; i++) {
if (gang[i])
- btrfs_put_fs_root(gang[i]);
+ btrfs_put_root(gang[i]);
}
return err;
}
@@ -3988,6 +4024,19 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
*/
btrfs_delete_unused_bgs(fs_info);
+ /*
+ * There might be existing delayed inode workers still running
+ * and holding an empty delayed inode item. We must wait for
+ * them to complete first because they can create a transaction.
+ * This happens when someone calls btrfs_balance_delayed_items()
+ * and then a transaction commit runs the same delayed nodes
+ * before any delayed worker has done something with the nodes.
+ * We must wait for any worker here and not at transaction
+ * commit time since that could cause a deadlock.
+ * This is a very rare case.
+ */
+ btrfs_flush_workqueue(fs_info->delayed_workers);
+
ret = btrfs_commit_super(fs_info);
if (ret)
btrfs_err(fs_info, "commit super ret %d", ret);
@@ -4018,8 +4067,6 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
btrfs_sysfs_remove_mounted(fs_info);
btrfs_sysfs_remove_fsid(fs_info->fs_devices);
- btrfs_free_fs_roots(fs_info);
-
btrfs_put_block_group_cache(fs_info);
/*
@@ -4031,6 +4078,7 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
clear_bit(BTRFS_FS_OPEN, &fs_info->flags);
free_root_pointers(fs_info, true);
+ btrfs_free_fs_roots(fs_info);
/*
* We must free the block groups after dropping the fs_roots as we could
@@ -4050,16 +4098,6 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
btrfs_mapping_tree_free(&fs_info->mapping_tree);
btrfs_close_devices(fs_info->fs_devices);
-
- percpu_counter_destroy(&fs_info->dirty_metadata_bytes);
- percpu_counter_destroy(&fs_info->delalloc_bytes);
- percpu_counter_destroy(&fs_info->dio_bytes);
- percpu_counter_destroy(&fs_info->dev_replace.bio_counter);
- cleanup_srcu_struct(&fs_info->subvol_srcu);
-
- btrfs_free_csum_hash(fs_info);
- btrfs_free_stripe_hash_table(fs_info);
- btrfs_free_ref_cache(fs_info);
}
int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
@@ -4233,7 +4271,7 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
spin_lock(&delayed_refs->lock);
if (atomic_read(&delayed_refs->num_entries) == 0) {
spin_unlock(&delayed_refs->lock);
- btrfs_info(fs_info, "delayed_refs has NO entry");
+ btrfs_debug(fs_info, "delayed_refs has NO entry");
return ret;
}
@@ -4267,14 +4305,36 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
spin_unlock(&delayed_refs->lock);
mutex_unlock(&head->mutex);
- if (pin_bytes)
- btrfs_pin_extent(fs_info, head->bytenr,
- head->num_bytes, 1);
+ if (pin_bytes) {
+ struct btrfs_block_group *cache;
+
+ cache = btrfs_lookup_block_group(fs_info, head->bytenr);
+ BUG_ON(!cache);
+
+ spin_lock(&cache->space_info->lock);
+ spin_lock(&cache->lock);
+ cache->pinned += head->num_bytes;
+ btrfs_space_info_update_bytes_pinned(fs_info,
+ cache->space_info, head->num_bytes);
+ cache->reserved -= head->num_bytes;
+ cache->space_info->bytes_reserved -= head->num_bytes;
+ spin_unlock(&cache->lock);
+ spin_unlock(&cache->space_info->lock);
+ percpu_counter_add_batch(
+ &cache->space_info->total_bytes_pinned,
+ head->num_bytes, BTRFS_TOTAL_BYTES_PINNED_BATCH);
+
+ btrfs_put_block_group(cache);
+
+ btrfs_error_unpin_extent_range(fs_info, head->bytenr,
+ head->bytenr + head->num_bytes - 1);
+ }
btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head);
btrfs_put_delayed_ref_head(head);
cond_resched();
spin_lock(&delayed_refs->lock);
}
+ btrfs_qgroup_destroy_extent_records(trans);
spin_unlock(&delayed_refs->lock);
@@ -4324,12 +4384,12 @@ static void btrfs_destroy_all_delalloc_inodes(struct btrfs_fs_info *fs_info)
while (!list_empty(&splice)) {
root = list_first_entry(&splice, struct btrfs_root,
delalloc_root);
- root = btrfs_grab_fs_root(root);
+ root = btrfs_grab_root(root);
BUG_ON(!root);
spin_unlock(&fs_info->delalloc_root_lock);
btrfs_destroy_delalloc_inodes(root);
- btrfs_put_fs_root(root);
+ btrfs_put_root(root);
spin_lock(&fs_info->delalloc_root_lock);
}
@@ -4370,16 +4430,12 @@ static int btrfs_destroy_marked_extents(struct btrfs_fs_info *fs_info,
}
static int btrfs_destroy_pinned_extent(struct btrfs_fs_info *fs_info,
- struct extent_io_tree *pinned_extents)
+ struct extent_io_tree *unpin)
{
- struct extent_io_tree *unpin;
u64 start;
u64 end;
int ret;
- bool loop = true;
- unpin = pinned_extents;
-again:
while (1) {
struct extent_state *cached_state = NULL;
@@ -4404,15 +4460,6 @@ again:
cond_resched();
}
- if (loop) {
- if (unpin == &fs_info->freed_extents[0])
- unpin = &fs_info->freed_extents[1];
- else
- unpin = &fs_info->freed_extents[0];
- loop = false;
- goto again;
- }
-
return 0;
}
@@ -4500,12 +4547,10 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
wake_up(&fs_info->transaction_wait);
btrfs_destroy_delayed_inodes(fs_info);
- btrfs_assert_delayed_root_empty(fs_info);
btrfs_destroy_marked_extents(fs_info, &cur_trans->dirty_pages,
EXTENT_DIRTY);
- btrfs_destroy_pinned_extent(fs_info,
- fs_info->pinned_extents);
+ btrfs_destroy_pinned_extent(fs_info, &cur_trans->pinned_extents);
cur_trans->state =TRANS_STATE_COMPLETED;
wake_up(&cur_trans->commit_wait);
@@ -4557,7 +4602,6 @@ static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info)
btrfs_destroy_all_ordered_extents(fs_info);
btrfs_destroy_delayed_inodes(fs_info);
btrfs_assert_delayed_root_empty(fs_info);
- btrfs_destroy_pinned_extent(fs_info, fs_info->pinned_extents);
btrfs_destroy_all_delalloc_inodes(fs_info);
mutex_unlock(&fs_info->transaction_kthread_mutex);
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 8c2d6cf1ce59..cd629113f61c 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -39,6 +39,8 @@ static inline u64 btrfs_sb_offset(int mirror)
struct btrfs_device;
struct btrfs_fs_devices;
+void btrfs_check_leaked_roots(struct btrfs_fs_info *fs_info);
+void btrfs_init_fs_info(struct btrfs_fs_info *fs_info);
int btrfs_verify_level_key(struct extent_buffer *eb, int level,
struct btrfs_key *first_key, u64 parent_transid);
struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr,
@@ -54,15 +56,12 @@ int __cold open_ctree(struct super_block *sb,
char *options);
void __cold close_ctree(struct btrfs_fs_info *fs_info);
int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors);
-struct buffer_head *btrfs_read_dev_super(struct block_device *bdev);
-int btrfs_read_dev_one_super(struct block_device *bdev, int copy_num,
- struct buffer_head **bh_ret);
+struct btrfs_super_block *btrfs_read_dev_super(struct block_device *bdev);
+struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev,
+ int copy_num);
int btrfs_commit_super(struct btrfs_fs_info *fs_info);
-struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root,
- struct btrfs_key *location);
-int btrfs_init_fs_root(struct btrfs_root *root);
-struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
- u64 root_id);
+struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
+ struct btrfs_key *key);
int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info,
struct btrfs_root *root);
void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info);
@@ -70,19 +69,13 @@ void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info);
struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info,
struct btrfs_key *key,
bool check_ref);
-static inline struct btrfs_root *
-btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
- struct btrfs_key *location)
-{
- return btrfs_get_fs_root(fs_info, location, true);
-}
+void btrfs_free_fs_info(struct btrfs_fs_info *fs_info);
int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info);
void btrfs_btree_balance_dirty(struct btrfs_fs_info *fs_info);
void btrfs_btree_balance_dirty_nodelay(struct btrfs_fs_info *fs_info);
void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
struct btrfs_root *root);
-void btrfs_free_fs_root(struct btrfs_root *root);
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info);
@@ -95,19 +88,16 @@ struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info);
* If you want to ensure the whole tree is safe, you should use
* fs_info->subvol_srcu
*/
-static inline struct btrfs_root *btrfs_grab_fs_root(struct btrfs_root *root)
+static inline struct btrfs_root *btrfs_grab_root(struct btrfs_root *root)
{
+ if (!root)
+ return NULL;
if (refcount_inc_not_zero(&root->refs))
return root;
return NULL;
}
-static inline void btrfs_put_fs_root(struct btrfs_root *root)
-{
- if (refcount_dec_and_test(&root->refs))
- kfree(root);
-}
-
+void btrfs_put_root(struct btrfs_root *root);
void btrfs_mark_buffer_dirty(struct extent_buffer *buf);
int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
int atomic);
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index 72e312cae69d..2bb25d2dc44b 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -57,16 +57,14 @@ static int btrfs_encode_fh(struct inode *inode, u32 *fh, int *max_len,
return type;
}
-static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
- u64 root_objectid, u32 generation,
- int check_generation)
+struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
+ u64 root_objectid, u32 generation,
+ int check_generation)
{
struct btrfs_fs_info *fs_info = btrfs_sb(sb);
struct btrfs_root *root;
struct inode *inode;
struct btrfs_key key;
- int index;
- int err = 0;
if (objectid < BTRFS_FIRST_FREE_OBJECTID)
return ERR_PTR(-ESTALE);
@@ -75,25 +73,18 @@ static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
key.type = BTRFS_ROOT_ITEM_KEY;
key.offset = (u64)-1;
- index = srcu_read_lock(&fs_info->subvol_srcu);
-
- root = btrfs_read_fs_root_no_name(fs_info, &key);
- if (IS_ERR(root)) {
- err = PTR_ERR(root);
- goto fail;
- }
+ root = btrfs_get_fs_root(fs_info, &key, true);
+ if (IS_ERR(root))
+ return ERR_CAST(root);
key.objectid = objectid;
key.type = BTRFS_INODE_ITEM_KEY;
key.offset = 0;
inode = btrfs_iget(sb, &key, root);
- if (IS_ERR(inode)) {
- err = PTR_ERR(inode);
- goto fail;
- }
-
- srcu_read_unlock(&fs_info->subvol_srcu, index);
+ btrfs_put_root(root);
+ if (IS_ERR(inode))
+ return ERR_CAST(inode);
if (check_generation && generation != inode->i_generation) {
iput(inode);
@@ -101,9 +92,6 @@ static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
}
return d_obtain_alias(inode);
-fail:
- srcu_read_unlock(&fs_info->subvol_srcu, index);
- return ERR_PTR(err);
}
static struct dentry *btrfs_fh_to_parent(struct super_block *sb, struct fid *fh,
@@ -152,7 +140,7 @@ static struct dentry *btrfs_fh_to_dentry(struct super_block *sb, struct fid *fh,
return btrfs_get_dentry(sb, objectid, root_objectid, generation, 1);
}
-static struct dentry *btrfs_get_parent(struct dentry *child)
+struct dentry *btrfs_get_parent(struct dentry *child)
{
struct inode *dir = d_inode(child);
struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
diff --git a/fs/btrfs/export.h b/fs/btrfs/export.h
index 57488ecd7d4e..f32f4113c976 100644
--- a/fs/btrfs/export.h
+++ b/fs/btrfs/export.h
@@ -18,4 +18,9 @@ struct btrfs_fid {
u64 parent_root_objectid;
} __attribute__ ((packed));
+struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
+ u64 root_objectid, u32 generation,
+ int check_generation);
+struct dentry *btrfs_get_parent(struct dentry *child);
+
#endif
diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h
index a3febe746c79..b4a7bad3e82e 100644
--- a/fs/btrfs/extent-io-tree.h
+++ b/fs/btrfs/extent-io-tree.h
@@ -36,13 +36,14 @@ struct io_failure_record;
#define CHUNK_TRIMMED EXTENT_DEFRAG
enum {
- IO_TREE_FS_INFO_FREED_EXTENTS0,
- IO_TREE_FS_INFO_FREED_EXTENTS1,
+ IO_TREE_FS_PINNED_EXTENTS,
+ IO_TREE_FS_EXCLUDED_EXTENTS,
IO_TREE_INODE_IO,
IO_TREE_INODE_IO_FAILURE,
IO_TREE_RELOC_BLOCKS,
IO_TREE_TRANS_DIRTY_PAGES,
IO_TREE_ROOT_DIRTY_LOG_PAGES,
+ IO_TREE_INODE_FILE_EXTENT,
IO_TREE_SELFTEST,
};
@@ -222,6 +223,8 @@ int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
struct extent_state **cached_state);
void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start,
u64 *start_ret, u64 *end_ret, unsigned bits);
+int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start,
+ u64 *start_ret, u64 *end_ret, unsigned bits);
int extent_invalidatepage(struct extent_io_tree *tree,
struct page *page, unsigned long offset);
bool btrfs_find_delalloc_range(struct extent_io_tree *tree, u64 *start,
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 0163fdd59f8f..54a64d1e18c6 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -64,10 +64,8 @@ int btrfs_add_excluded_extent(struct btrfs_fs_info *fs_info,
u64 start, u64 num_bytes)
{
u64 end = start + num_bytes - 1;
- set_extent_bits(&fs_info->freed_extents[0],
- start, end, EXTENT_UPTODATE);
- set_extent_bits(&fs_info->freed_extents[1],
- start, end, EXTENT_UPTODATE);
+ set_extent_bits(&fs_info->excluded_extents, start, end,
+ EXTENT_UPTODATE);
return 0;
}
@@ -79,10 +77,8 @@ void btrfs_free_excluded_extents(struct btrfs_block_group *cache)
start = cache->start;
end = start + cache->length - 1;
- clear_extent_bits(&fs_info->freed_extents[0],
- start, end, EXTENT_UPTODATE);
- clear_extent_bits(&fs_info->freed_extents[1],
- start, end, EXTENT_UPTODATE);
+ clear_extent_bits(&fs_info->excluded_extents, start, end,
+ EXTENT_UPTODATE);
}
static u64 generic_ref_to_space_flags(struct btrfs_ref *ref)
@@ -1193,24 +1189,6 @@ int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
return ret;
}
-static int insert_extent_backref(struct btrfs_trans_handle *trans,
- struct btrfs_path *path,
- u64 bytenr, u64 parent, u64 root_objectid,
- u64 owner, u64 offset, int refs_to_add)
-{
- int ret;
- if (owner < BTRFS_FIRST_FREE_OBJECTID) {
- BUG_ON(refs_to_add != 1);
- ret = insert_tree_block_ref(trans, path, bytenr, parent,
- root_objectid);
- } else {
- ret = insert_extent_data_ref(trans, path, bytenr, parent,
- root_objectid, owner, offset,
- refs_to_add);
- }
- return ret;
-}
-
static int remove_extent_backref(struct btrfs_trans_handle *trans,
struct btrfs_path *path,
struct btrfs_extent_inline_ref *iref,
@@ -1469,7 +1447,6 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
if (!path)
return -ENOMEM;
- path->reada = READA_FORWARD;
path->leave_spinning = 1;
/* this will setup the path even if it fails to insert the back ref */
ret = insert_inline_extent_backref(trans, path, bytenr, num_bytes,
@@ -1494,11 +1471,17 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(leaf);
btrfs_release_path(path);
- path->reada = READA_FORWARD;
path->leave_spinning = 1;
/* now insert the actual backref */
- ret = insert_extent_backref(trans, path, bytenr, parent, root_objectid,
- owner, offset, refs_to_add);
+ if (owner < BTRFS_FIRST_FREE_OBJECTID) {
+ BUG_ON(refs_to_add != 1);
+ ret = insert_tree_block_ref(trans, path, bytenr, parent,
+ root_objectid);
+ } else {
+ ret = insert_extent_data_ref(trans, path, bytenr, parent,
+ root_objectid, owner, offset,
+ refs_to_add);
+ }
if (ret)
btrfs_abort_transaction(trans, ret);
out:
@@ -1583,7 +1566,7 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
int err = 0;
int metadata = !extent_op->is_data;
- if (trans->aborted)
+ if (TRANS_ABORTED(trans))
return 0;
if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA))
@@ -1604,7 +1587,6 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
}
again:
- path->reada = READA_FORWARD;
path->leave_spinning = 1;
ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 1);
if (ret < 0) {
@@ -1703,10 +1685,9 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
{
int ret = 0;
- if (trans->aborted) {
+ if (TRANS_ABORTED(trans)) {
if (insert_reserved)
- btrfs_pin_extent(trans->fs_info, node->bytenr,
- node->num_bytes, 1);
+ btrfs_pin_extent(trans, node->bytenr, node->num_bytes, 1);
return 0;
}
@@ -1721,8 +1702,7 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
else
BUG();
if (ret && insert_reserved)
- btrfs_pin_extent(trans->fs_info, node->bytenr,
- node->num_bytes, 1);
+ btrfs_pin_extent(trans, node->bytenr, node->num_bytes, 1);
return ret;
}
@@ -1867,8 +1847,7 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans,
spin_unlock(&delayed_refs->lock);
if (head->must_insert_reserved) {
- btrfs_pin_extent(fs_info, head->bytenr,
- head->num_bytes, 1);
+ btrfs_pin_extent(trans, head->bytenr, head->num_bytes, 1);
if (head->is_data) {
ret = btrfs_del_csums(trans, fs_info->csum_root,
head->bytenr, head->num_bytes);
@@ -2191,7 +2170,7 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
int run_all = count == (unsigned long)-1;
/* We'll clean this up in btrfs_cleanup_transaction */
- if (trans->aborted)
+ if (TRANS_ABORTED(trans))
return 0;
if (test_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags))
@@ -2238,7 +2217,7 @@ out:
}
int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
- u64 bytenr, u64 num_bytes, u64 flags,
+ struct extent_buffer *eb, u64 flags,
int level, int is_data)
{
struct btrfs_delayed_extent_op *extent_op;
@@ -2254,7 +2233,7 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
extent_op->is_data = is_data ? true : false;
extent_op->level = level;
- ret = btrfs_add_delayed_extent_op(trans, bytenr, num_bytes, extent_op);
+ ret = btrfs_add_delayed_extent_op(trans, eb->start, eb->len, extent_op);
if (ret)
btrfs_free_delayed_extent_op(extent_op);
return ret;
@@ -2588,7 +2567,8 @@ static u64 first_logical_byte(struct btrfs_fs_info *fs_info, u64 search_start)
return bytenr;
}
-static int pin_down_extent(struct btrfs_block_group *cache,
+static int pin_down_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_block_group *cache,
u64 bytenr, u64 num_bytes, int reserved)
{
struct btrfs_fs_info *fs_info = cache->fs_info;
@@ -2607,22 +2587,20 @@ static int pin_down_extent(struct btrfs_block_group *cache,
percpu_counter_add_batch(&cache->space_info->total_bytes_pinned,
num_bytes, BTRFS_TOTAL_BYTES_PINNED_BATCH);
- set_extent_dirty(fs_info->pinned_extents, bytenr,
+ set_extent_dirty(&trans->transaction->pinned_extents, bytenr,
bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL);
return 0;
}
-int btrfs_pin_extent(struct btrfs_fs_info *fs_info,
+int btrfs_pin_extent(struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes, int reserved)
{
struct btrfs_block_group *cache;
- ASSERT(fs_info->running_transaction);
-
- cache = btrfs_lookup_block_group(fs_info, bytenr);
+ cache = btrfs_lookup_block_group(trans->fs_info, bytenr);
BUG_ON(!cache); /* Logic error */
- pin_down_extent(cache, bytenr, num_bytes, reserved);
+ pin_down_extent(trans, cache, bytenr, num_bytes, reserved);
btrfs_put_block_group(cache);
return 0;
@@ -2631,13 +2609,15 @@ int btrfs_pin_extent(struct btrfs_fs_info *fs_info,
/*
* this function must be called within transaction
*/
-int btrfs_pin_extent_for_log_replay(struct btrfs_fs_info *fs_info,
+int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes)
{
struct btrfs_block_group *cache;
int ret;
- cache = btrfs_lookup_block_group(fs_info, bytenr);
+ btrfs_add_excluded_extent(trans->fs_info, bytenr, num_bytes);
+
+ cache = btrfs_lookup_block_group(trans->fs_info, bytenr);
if (!cache)
return -EINVAL;
@@ -2649,7 +2629,7 @@ int btrfs_pin_extent_for_log_replay(struct btrfs_fs_info *fs_info,
*/
btrfs_cache_block_group(cache, 1);
- pin_down_extent(cache, bytenr, num_bytes, 0);
+ pin_down_extent(trans, cache, bytenr, num_bytes, 0);
/* remove us from the free space cache (if we're there at all) */
ret = btrfs_remove_free_space(cache, bytenr, num_bytes);
@@ -2763,11 +2743,6 @@ void btrfs_prepare_extent_commit(struct btrfs_fs_info *fs_info)
}
}
- if (fs_info->pinned_extents == &fs_info->freed_extents[0])
- fs_info->pinned_extents = &fs_info->freed_extents[1];
- else
- fs_info->pinned_extents = &fs_info->freed_extents[0];
-
up_write(&fs_info->commit_root_sem);
btrfs_update_global_block_rsv(fs_info);
@@ -2908,12 +2883,9 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans)
u64 end;
int ret;
- if (fs_info->pinned_extents == &fs_info->freed_extents[0])
- unpin = &fs_info->freed_extents[1];
- else
- unpin = &fs_info->freed_extents[0];
+ unpin = &trans->transaction->pinned_extents;
- while (!trans->aborted) {
+ while (!TRANS_ABORTED(trans)) {
struct extent_state *cached_state = NULL;
mutex_lock(&fs_info->unused_bg_unpin_mutex);
@@ -2923,6 +2895,9 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans)
mutex_unlock(&fs_info->unused_bg_unpin_mutex);
break;
}
+ if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags))
+ clear_extent_bits(&fs_info->excluded_extents, start,
+ end, EXTENT_UPTODATE);
if (btrfs_test_opt(fs_info, DISCARD_SYNC))
ret = btrfs_discard_extent(fs_info, start,
@@ -2950,7 +2925,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans)
u64 trimmed = 0;
ret = -EROFS;
- if (!trans->aborted)
+ if (!TRANS_ABORTED(trans))
ret = btrfs_discard_extent(fs_info,
block_group->start,
block_group->length,
@@ -3000,7 +2975,6 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
if (!path)
return -ENOMEM;
- path->reada = READA_FORWARD;
path->leave_spinning = 1;
is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID;
@@ -3301,7 +3275,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
cache = btrfs_lookup_block_group(fs_info, buf->start);
if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
- pin_down_extent(cache, buf->start, buf->len, 1);
+ pin_down_extent(trans, cache, buf->start, buf->len, 1);
btrfs_put_block_group(cache);
goto out;
}
@@ -3345,7 +3319,7 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref)
(ref->type == BTRFS_REF_DATA &&
ref->data_ref.ref_root == BTRFS_TREE_LOG_OBJECTID)) {
/* unlocks the pinned mutex */
- btrfs_pin_extent(fs_info, ref->bytenr, ref->len, 1);
+ btrfs_pin_extent(trans, ref->bytenr, ref->len, 1);
old_ref_mod = new_ref_mod = 0;
ret = 0;
} else if (ref->type == BTRFS_REF_METADATA) {
@@ -3438,6 +3412,10 @@ btrfs_release_block_group(struct btrfs_block_group *cache,
btrfs_put_block_group(cache);
}
+enum btrfs_extent_allocation_policy {
+ BTRFS_EXTENT_ALLOC_CLUSTERED,
+};
+
/*
* Structure used internally for find_free_extent() function. Wraps needed
* parameters.
@@ -3454,6 +3432,8 @@ struct find_free_extent_ctl {
/* For clustered allocation */
u64 empty_cluster;
+ struct btrfs_free_cluster *last_ptr;
+ bool use_cluster;
bool have_caching_bg;
bool orig_have_caching_bg;
@@ -3489,6 +3469,12 @@ struct find_free_extent_ctl {
/* Found result */
u64 found_offset;
+
+ /* Hint where to start looking for an empty space */
+ u64 hint_byte;
+
+ /* Allocation policy */
+ enum btrfs_extent_allocation_policy policy;
};
@@ -3501,11 +3487,11 @@ struct find_free_extent_ctl {
* Return 0 means we have found a location and set ffe_ctl->found_offset.
*/
static int find_free_extent_clustered(struct btrfs_block_group *bg,
- struct btrfs_free_cluster *last_ptr,
- struct find_free_extent_ctl *ffe_ctl,
- struct btrfs_block_group **cluster_bg_ret)
+ struct find_free_extent_ctl *ffe_ctl,
+ struct btrfs_block_group **cluster_bg_ret)
{
struct btrfs_block_group *cluster_bg;
+ struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr;
u64 aligned_cluster;
u64 offset;
int ret;
@@ -3605,9 +3591,9 @@ refill_cluster:
* Return -EAGAIN to inform caller that we need to re-search this block group
*/
static int find_free_extent_unclustered(struct btrfs_block_group *bg,
- struct btrfs_free_cluster *last_ptr,
- struct find_free_extent_ctl *ffe_ctl)
+ struct find_free_extent_ctl *ffe_ctl)
{
+ struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr;
u64 offset;
/*
@@ -3663,16 +3649,101 @@ static int find_free_extent_unclustered(struct btrfs_block_group *bg,
return 0;
}
+static int do_allocation_clustered(struct btrfs_block_group *block_group,
+ struct find_free_extent_ctl *ffe_ctl,
+ struct btrfs_block_group **bg_ret)
+{
+ int ret;
+
+ /* We want to try and use the cluster allocator, so lets look there */
+ if (ffe_ctl->last_ptr && ffe_ctl->use_cluster) {
+ ret = find_free_extent_clustered(block_group, ffe_ctl, bg_ret);
+ if (ret >= 0 || ret == -EAGAIN)
+ return ret;
+ /* ret == -ENOENT case falls through */
+ }
+
+ return find_free_extent_unclustered(block_group, ffe_ctl);
+}
+
+static int do_allocation(struct btrfs_block_group *block_group,
+ struct find_free_extent_ctl *ffe_ctl,
+ struct btrfs_block_group **bg_ret)
+{
+ switch (ffe_ctl->policy) {
+ case BTRFS_EXTENT_ALLOC_CLUSTERED:
+ return do_allocation_clustered(block_group, ffe_ctl, bg_ret);
+ default:
+ BUG();
+ }
+}
+
+static void release_block_group(struct btrfs_block_group *block_group,
+ struct find_free_extent_ctl *ffe_ctl,
+ int delalloc)
+{
+ switch (ffe_ctl->policy) {
+ case BTRFS_EXTENT_ALLOC_CLUSTERED:
+ ffe_ctl->retry_clustered = false;
+ ffe_ctl->retry_unclustered = false;
+ break;
+ default:
+ BUG();
+ }
+
+ BUG_ON(btrfs_bg_flags_to_raid_index(block_group->flags) !=
+ ffe_ctl->index);
+ btrfs_release_block_group(block_group, delalloc);
+}
+
+static void found_extent_clustered(struct find_free_extent_ctl *ffe_ctl,
+ struct btrfs_key *ins)
+{
+ struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr;
+
+ if (!ffe_ctl->use_cluster && last_ptr) {
+ spin_lock(&last_ptr->lock);
+ last_ptr->window_start = ins->objectid;
+ spin_unlock(&last_ptr->lock);
+ }
+}
+
+static void found_extent(struct find_free_extent_ctl *ffe_ctl,
+ struct btrfs_key *ins)
+{
+ switch (ffe_ctl->policy) {
+ case BTRFS_EXTENT_ALLOC_CLUSTERED:
+ found_extent_clustered(ffe_ctl, ins);
+ break;
+ default:
+ BUG();
+ }
+}
+
+static int chunk_allocation_failed(struct find_free_extent_ctl *ffe_ctl)
+{
+ switch (ffe_ctl->policy) {
+ case BTRFS_EXTENT_ALLOC_CLUSTERED:
+ /*
+ * If we can't allocate a new chunk we've already looped through
+ * at least once, move on to the NO_EMPTY_SIZE case.
+ */
+ ffe_ctl->loop = LOOP_NO_EMPTY_SIZE;
+ return 0;
+ default:
+ BUG();
+ }
+}
+
/*
* Return >0 means caller needs to re-search for free extent
* Return 0 means we have the needed free extent.
* Return <0 means we failed to locate any free extent.
*/
static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info,
- struct btrfs_free_cluster *last_ptr,
struct btrfs_key *ins,
struct find_free_extent_ctl *ffe_ctl,
- int full_search, bool use_cluster)
+ bool full_search)
{
struct btrfs_root *root = fs_info->extent_root;
int ret;
@@ -3689,11 +3760,7 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info,
return 1;
if (ins->objectid) {
- if (!use_cluster && last_ptr) {
- spin_lock(&last_ptr->lock);
- last_ptr->window_start = ins->objectid;
- spin_unlock(&last_ptr->lock);
- }
+ found_extent(ffe_ctl, ins);
return 0;
}
@@ -3739,16 +3806,10 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info,
ret = btrfs_chunk_alloc(trans, ffe_ctl->flags,
CHUNK_ALLOC_FORCE);
- /*
- * If we can't allocate a new chunk we've already looped
- * through at least once, move on to the NO_EMPTY_SIZE
- * case.
- */
- if (ret == -ENOSPC)
- ffe_ctl->loop = LOOP_NO_EMPTY_SIZE;
-
/* Do not bail out on ENOSPC since we can do more. */
- if (ret < 0 && ret != -ENOSPC)
+ if (ret == -ENOSPC)
+ ret = chunk_allocation_failed(ffe_ctl);
+ else if (ret < 0)
btrfs_abort_transaction(trans, ret);
else
ret = 0;
@@ -3759,6 +3820,9 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info,
}
if (ffe_ctl->loop == LOOP_NO_EMPTY_SIZE) {
+ if (ffe_ctl->policy != BTRFS_EXTENT_ALLOC_CLUSTERED)
+ return -ENOSPC;
+
/*
* Don't loop again if we already have no empty_size and
* no empty_cluster.
@@ -3774,6 +3838,71 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info,
return -ENOSPC;
}
+static int prepare_allocation_clustered(struct btrfs_fs_info *fs_info,
+ struct find_free_extent_ctl *ffe_ctl,
+ struct btrfs_space_info *space_info,
+ struct btrfs_key *ins)
+{
+ /*
+ * If our free space is heavily fragmented we may not be able to make
+ * big contiguous allocations, so instead of doing the expensive search
+ * for free space, simply return ENOSPC with our max_extent_size so we
+ * can go ahead and search for a more manageable chunk.
+ *
+ * If our max_extent_size is large enough for our allocation simply
+ * disable clustering since we will likely not be able to find enough
+ * space to create a cluster and induce latency trying.
+ */
+ if (space_info->max_extent_size) {
+ spin_lock(&space_info->lock);
+ if (space_info->max_extent_size &&
+ ffe_ctl->num_bytes > space_info->max_extent_size) {
+ ins->offset = space_info->max_extent_size;
+ spin_unlock(&space_info->lock);
+ return -ENOSPC;
+ } else if (space_info->max_extent_size) {
+ ffe_ctl->use_cluster = false;
+ }
+ spin_unlock(&space_info->lock);
+ }
+
+ ffe_ctl->last_ptr = fetch_cluster_info(fs_info, space_info,
+ &ffe_ctl->empty_cluster);
+ if (ffe_ctl->last_ptr) {
+ struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr;
+
+ spin_lock(&last_ptr->lock);
+ if (last_ptr->block_group)
+ ffe_ctl->hint_byte = last_ptr->window_start;
+ if (last_ptr->fragmented) {
+ /*
+ * We still set window_start so we can keep track of the
+ * last place we found an allocation to try and save
+ * some time.
+ */
+ ffe_ctl->hint_byte = last_ptr->window_start;
+ ffe_ctl->use_cluster = false;
+ }
+ spin_unlock(&last_ptr->lock);
+ }
+
+ return 0;
+}
+
+static int prepare_allocation(struct btrfs_fs_info *fs_info,
+ struct find_free_extent_ctl *ffe_ctl,
+ struct btrfs_space_info *space_info,
+ struct btrfs_key *ins)
+{
+ switch (ffe_ctl->policy) {
+ case BTRFS_EXTENT_ALLOC_CLUSTERED:
+ return prepare_allocation_clustered(fs_info, ffe_ctl,
+ space_info, ins);
+ default:
+ BUG();
+ }
+}
+
/*
* walks the btree of allocated extents and find a hole of a given size.
* The key ins is changed to record the hole:
@@ -3801,16 +3930,14 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info,
*/
static noinline int find_free_extent(struct btrfs_fs_info *fs_info,
u64 ram_bytes, u64 num_bytes, u64 empty_size,
- u64 hint_byte, struct btrfs_key *ins,
+ u64 hint_byte_orig, struct btrfs_key *ins,
u64 flags, int delalloc)
{
int ret = 0;
int cache_block_group_error = 0;
- struct btrfs_free_cluster *last_ptr = NULL;
struct btrfs_block_group *block_group = NULL;
struct find_free_extent_ctl ffe_ctl = {0};
struct btrfs_space_info *space_info;
- bool use_cluster = true;
bool full_search = false;
WARN_ON(num_bytes < fs_info->sectorsize);
@@ -3819,13 +3946,19 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info,
ffe_ctl.empty_size = empty_size;
ffe_ctl.flags = flags;
ffe_ctl.search_start = 0;
- ffe_ctl.retry_clustered = false;
- ffe_ctl.retry_unclustered = false;
ffe_ctl.delalloc = delalloc;
ffe_ctl.index = btrfs_bg_flags_to_raid_index(flags);
ffe_ctl.have_caching_bg = false;
ffe_ctl.orig_have_caching_bg = false;
ffe_ctl.found_offset = 0;
+ ffe_ctl.hint_byte = hint_byte_orig;
+ ffe_ctl.policy = BTRFS_EXTENT_ALLOC_CLUSTERED;
+
+ /* For clustered allocation */
+ ffe_ctl.retry_clustered = false;
+ ffe_ctl.retry_unclustered = false;
+ ffe_ctl.last_ptr = NULL;
+ ffe_ctl.use_cluster = true;
ins->type = BTRFS_EXTENT_ITEM_KEY;
ins->objectid = 0;
@@ -3839,51 +3972,14 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info,
return -ENOSPC;
}
- /*
- * If our free space is heavily fragmented we may not be able to make
- * big contiguous allocations, so instead of doing the expensive search
- * for free space, simply return ENOSPC with our max_extent_size so we
- * can go ahead and search for a more manageable chunk.
- *
- * If our max_extent_size is large enough for our allocation simply
- * disable clustering since we will likely not be able to find enough
- * space to create a cluster and induce latency trying.
- */
- if (unlikely(space_info->max_extent_size)) {
- spin_lock(&space_info->lock);
- if (space_info->max_extent_size &&
- num_bytes > space_info->max_extent_size) {
- ins->offset = space_info->max_extent_size;
- spin_unlock(&space_info->lock);
- return -ENOSPC;
- } else if (space_info->max_extent_size) {
- use_cluster = false;
- }
- spin_unlock(&space_info->lock);
- }
-
- last_ptr = fetch_cluster_info(fs_info, space_info,
- &ffe_ctl.empty_cluster);
- if (last_ptr) {
- spin_lock(&last_ptr->lock);
- if (last_ptr->block_group)
- hint_byte = last_ptr->window_start;
- if (last_ptr->fragmented) {
- /*
- * We still set window_start so we can keep track of the
- * last place we found an allocation to try and save
- * some time.
- */
- hint_byte = last_ptr->window_start;
- use_cluster = false;
- }
- spin_unlock(&last_ptr->lock);
- }
+ ret = prepare_allocation(fs_info, &ffe_ctl, space_info, ins);
+ if (ret < 0)
+ return ret;
ffe_ctl.search_start = max(ffe_ctl.search_start,
first_logical_byte(fs_info, 0));
- ffe_ctl.search_start = max(ffe_ctl.search_start, hint_byte);
- if (ffe_ctl.search_start == hint_byte) {
+ ffe_ctl.search_start = max(ffe_ctl.search_start, ffe_ctl.hint_byte);
+ if (ffe_ctl.search_start == ffe_ctl.hint_byte) {
block_group = btrfs_lookup_block_group(fs_info,
ffe_ctl.search_start);
/*
@@ -3924,6 +4020,8 @@ search:
down_read(&space_info->groups_sem);
list_for_each_entry(block_group,
&space_info->block_groups[ffe_ctl.index], list) {
+ struct btrfs_block_group *bg_ret;
+
/* If the block group is read-only, we can skip it entirely. */
if (unlikely(block_group->ro))
continue;
@@ -3984,39 +4082,20 @@ have_block_group:
if (unlikely(block_group->cached == BTRFS_CACHE_ERROR))
goto loop;
- /*
- * Ok we want to try and use the cluster allocator, so
- * lets look there
- */
- if (last_ptr && use_cluster) {
- struct btrfs_block_group *cluster_bg = NULL;
-
- ret = find_free_extent_clustered(block_group, last_ptr,
- &ffe_ctl, &cluster_bg);
-
- if (ret == 0) {
- if (cluster_bg && cluster_bg != block_group) {
- btrfs_release_block_group(block_group,
- delalloc);
- block_group = cluster_bg;
- }
- goto checks;
- } else if (ret == -EAGAIN) {
- goto have_block_group;
- } else if (ret > 0) {
- goto loop;
+ bg_ret = NULL;
+ ret = do_allocation(block_group, &ffe_ctl, &bg_ret);
+ if (ret == 0) {
+ if (bg_ret && bg_ret != block_group) {
+ btrfs_release_block_group(block_group, delalloc);
+ block_group = bg_ret;
}
- /* ret == -ENOENT case falls through */
- }
-
- ret = find_free_extent_unclustered(block_group, last_ptr,
- &ffe_ctl);
- if (ret == -EAGAIN)
+ } else if (ret == -EAGAIN) {
goto have_block_group;
- else if (ret > 0)
+ } else if (ret > 0) {
goto loop;
- /* ret == 0 case falls through */
-checks:
+ }
+
+ /* Checks */
ffe_ctl.search_start = round_up(ffe_ctl.found_offset,
fs_info->stripesize);
@@ -4050,17 +4129,12 @@ checks:
btrfs_release_block_group(block_group, delalloc);
break;
loop:
- ffe_ctl.retry_clustered = false;
- ffe_ctl.retry_unclustered = false;
- BUG_ON(btrfs_bg_flags_to_raid_index(block_group->flags) !=
- ffe_ctl.index);
- btrfs_release_block_group(block_group, delalloc);
+ release_block_group(block_group, &ffe_ctl, delalloc);
cond_resched();
}
up_read(&space_info->groups_sem);
- ret = find_free_extent_update_loop(fs_info, last_ptr, ins, &ffe_ctl,
- full_search, use_cluster);
+ ret = find_free_extent_update_loop(fs_info, ins, &ffe_ctl, full_search);
if (ret > 0)
goto search;
@@ -4189,18 +4263,20 @@ int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
return 0;
}
-int btrfs_pin_reserved_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len)
+int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans, u64 start,
+ u64 len)
{
struct btrfs_block_group *cache;
int ret = 0;
- cache = btrfs_lookup_block_group(fs_info, start);
+ cache = btrfs_lookup_block_group(trans->fs_info, start);
if (!cache) {
- btrfs_err(fs_info, "unable to find block group for %llu", start);
+ btrfs_err(trans->fs_info, "unable to find block group for %llu",
+ start);
return -ENOSPC;
}
- ret = pin_down_extent(cache, start, len, 1);
+ ret = pin_down_extent(trans, cache, start, len, 1);
btrfs_put_block_group(cache);
return ret;
}
@@ -4430,6 +4506,8 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
ret = alloc_reserved_file_extent(trans, 0, root_objectid, 0, owner,
offset, ins, 1);
+ if (ret)
+ btrfs_pin_extent(trans, ins->objectid, ins->offset, 1);
btrfs_put_block_group(block_group);
return ret;
}
@@ -4748,8 +4826,7 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
BUG_ON(ret); /* -ENOMEM */
ret = btrfs_dec_ref(trans, root, eb, 0);
BUG_ON(ret); /* -ENOMEM */
- ret = btrfs_set_disk_extent_flags(trans, eb->start,
- eb->len, flag,
+ ret = btrfs_set_disk_extent_flags(trans, eb, flag,
btrfs_header_level(eb), 0);
BUG_ON(ret); /* -ENOMEM */
wc->flags[level] |= flag;
@@ -5207,9 +5284,7 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
*
* If called with for_reloc == 0, may exit early with -EAGAIN
*/
-int btrfs_drop_snapshot(struct btrfs_root *root,
- struct btrfs_block_rsv *block_rsv, int update_ref,
- int for_reloc)
+int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_path *path;
@@ -5248,9 +5323,6 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
if (err)
goto out_end_trans;
- if (block_rsv)
- trans->block_rsv = block_rsv;
-
/*
* This will help us catch people modifying the fs tree while we're
* dropping it. It is unsafe to mess with the fs tree while it's being
@@ -5378,8 +5450,6 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
err = PTR_ERR(trans);
goto out_free;
}
- if (block_rsv)
- trans->block_rsv = block_rsv;
}
}
btrfs_release_path(path);
@@ -5411,13 +5481,10 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
}
}
- if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state)) {
+ if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state))
btrfs_add_dropped_root(trans, root);
- } else {
- free_extent_buffer(root->node);
- free_extent_buffer(root->commit_root);
- btrfs_put_fs_root(root);
- }
+ else
+ btrfs_put_root(root);
root_dropped = true;
out_end_trans:
btrfs_end_transaction_throttle(trans);
@@ -5747,47 +5814,3 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
return bg_ret;
return dev_ret;
}
-
-/*
- * btrfs_{start,end}_write_no_snapshotting() are similar to
- * mnt_{want,drop}_write(), they are used to prevent some tasks from writing
- * data into the page cache through nocow before the subvolume is snapshoted,
- * but flush the data into disk after the snapshot creation, or to prevent
- * operations while snapshotting is ongoing and that cause the snapshot to be
- * inconsistent (writes followed by expanding truncates for example).
- */
-void btrfs_end_write_no_snapshotting(struct btrfs_root *root)
-{
- percpu_counter_dec(&root->subv_writers->counter);
- cond_wake_up(&root->subv_writers->wait);
-}
-
-int btrfs_start_write_no_snapshotting(struct btrfs_root *root)
-{
- if (atomic_read(&root->will_be_snapshotted))
- return 0;
-
- percpu_counter_inc(&root->subv_writers->counter);
- /*
- * Make sure counter is updated before we check for snapshot creation.
- */
- smp_mb();
- if (atomic_read(&root->will_be_snapshotted)) {
- btrfs_end_write_no_snapshotting(root);
- return 0;
- }
- return 1;
-}
-
-void btrfs_wait_for_snapshot_creation(struct btrfs_root *root)
-{
- while (true) {
- int ret;
-
- ret = btrfs_start_write_no_snapshotting(root);
- if (ret)
- break;
- wait_var_event(&root->will_be_snapshotted,
- !atomic_read(&root->will_be_snapshotted));
- }
-}
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index c0f202741e09..39e45b8a5031 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -35,42 +35,54 @@ static inline bool extent_state_in_tree(const struct extent_state *state)
}
#ifdef CONFIG_BTRFS_DEBUG
-static LIST_HEAD(buffers);
static LIST_HEAD(states);
-
static DEFINE_SPINLOCK(leak_lock);
-static inline
-void btrfs_leak_debug_add(struct list_head *new, struct list_head *head)
+static inline void btrfs_leak_debug_add(spinlock_t *lock,
+ struct list_head *new,
+ struct list_head *head)
{
unsigned long flags;
- spin_lock_irqsave(&leak_lock, flags);
+ spin_lock_irqsave(lock, flags);
list_add(new, head);
- spin_unlock_irqrestore(&leak_lock, flags);
+ spin_unlock_irqrestore(lock, flags);
}
-static inline
-void btrfs_leak_debug_del(struct list_head *entry)
+static inline void btrfs_leak_debug_del(spinlock_t *lock,
+ struct list_head *entry)
{
unsigned long flags;
- spin_lock_irqsave(&leak_lock, flags);
+ spin_lock_irqsave(lock, flags);
list_del(entry);
- spin_unlock_irqrestore(&leak_lock, flags);
+ spin_unlock_irqrestore(lock, flags);
}
-static inline void btrfs_extent_buffer_leak_debug_check(void)
+void btrfs_extent_buffer_leak_debug_check(struct btrfs_fs_info *fs_info)
{
struct extent_buffer *eb;
+ unsigned long flags;
- while (!list_empty(&buffers)) {
- eb = list_entry(buffers.next, struct extent_buffer, leak_list);
- pr_err("BTRFS: buffer leak start %llu len %lu refs %d bflags %lu\n",
- eb->start, eb->len, atomic_read(&eb->refs), eb->bflags);
+ /*
+ * If we didn't get into open_ctree our allocated_ebs will not be
+ * initialized, so just skip this.
+ */
+ if (!fs_info->allocated_ebs.next)
+ return;
+
+ spin_lock_irqsave(&fs_info->eb_leak_lock, flags);
+ while (!list_empty(&fs_info->allocated_ebs)) {
+ eb = list_first_entry(&fs_info->allocated_ebs,
+ struct extent_buffer, leak_list);
+ pr_err(
+ "BTRFS: buffer leak start %llu len %lu refs %d bflags %lu owner %llu\n",
+ eb->start, eb->len, atomic_read(&eb->refs), eb->bflags,
+ btrfs_header_owner(eb));
list_del(&eb->leak_list);
kmem_cache_free(extent_buffer_cache, eb);
}
+ spin_unlock_irqrestore(&fs_info->eb_leak_lock, flags);
}
static inline void btrfs_extent_state_leak_debug_check(void)
@@ -107,9 +119,8 @@ static inline void __btrfs_debug_check_extent_io_range(const char *caller,
}
}
#else
-#define btrfs_leak_debug_add(new, head) do {} while (0)
-#define btrfs_leak_debug_del(entry) do {} while (0)
-#define btrfs_extent_buffer_leak_debug_check() do {} while (0)
+#define btrfs_leak_debug_add(lock, new, head) do {} while (0)
+#define btrfs_leak_debug_del(lock, entry) do {} while (0)
#define btrfs_extent_state_leak_debug_check() do {} while (0)
#define btrfs_debug_check_extent_io_range(c, s, e) do {} while (0)
#endif
@@ -122,7 +133,6 @@ struct tree_entry {
struct extent_page_data {
struct bio *bio;
- struct extent_io_tree *tree;
/* tells writepage not to lock the state bits for this range
* it still does the unlocking
*/
@@ -246,8 +256,6 @@ void __cold extent_state_cache_exit(void)
void __cold extent_io_exit(void)
{
- btrfs_extent_buffer_leak_debug_check();
-
/*
* Make sure all delayed rcu free are flushed before we
* destroy caches.
@@ -257,6 +265,15 @@ void __cold extent_io_exit(void)
bioset_exit(&btrfs_bioset);
}
+/*
+ * For the file_extent_tree, we want to hold the inode lock when we look up and
+ * update the disk_i_size, but lockdep will complain because for our io_tree we
+ * hold the tree lock and get the inode lock when setting delalloc. These two
+ * things are unrelated, so make a class for the file_extent_tree so we don't
+ * get the two locking patterns mixed up.
+ */
+static struct lock_class_key file_extent_tree_class;
+
void extent_io_tree_init(struct btrfs_fs_info *fs_info,
struct extent_io_tree *tree, unsigned int owner,
void *private_data)
@@ -268,6 +285,8 @@ void extent_io_tree_init(struct btrfs_fs_info *fs_info,
spin_lock_init(&tree->lock);
tree->private_data = private_data;
tree->owner = owner;
+ if (owner == IO_TREE_INODE_FILE_EXTENT)
+ lockdep_set_class(&tree->lock, &file_extent_tree_class);
}
void extent_io_tree_release(struct extent_io_tree *tree)
@@ -314,7 +333,7 @@ static struct extent_state *alloc_extent_state(gfp_t mask)
state->state = 0;
state->failrec = NULL;
RB_CLEAR_NODE(&state->rb_node);
- btrfs_leak_debug_add(&state->leak_list, &states);
+ btrfs_leak_debug_add(&leak_lock, &state->leak_list, &states);
refcount_set(&state->refs, 1);
init_waitqueue_head(&state->wq);
trace_alloc_extent_state(state, mask, _RET_IP_);
@@ -327,7 +346,7 @@ void free_extent_state(struct extent_state *state)
return;
if (refcount_dec_and_test(&state->refs)) {
WARN_ON(extent_state_in_tree(state));
- btrfs_leak_debug_del(&state->leak_list);
+ btrfs_leak_debug_del(&leak_lock, &state->leak_list);
trace_free_extent_state(state, _RET_IP_);
kmem_cache_free(extent_state_cache, state);
}
@@ -1053,6 +1072,16 @@ hit_next:
goto out;
}
+ /*
+ * If this extent already has all the bits we want set, then
+ * skip it, not necessary to split it or do anything with it.
+ */
+ if ((state->state & bits) == bits) {
+ start = state->end + 1;
+ cache_state(state, cached_state);
+ goto search_again;
+ }
+
prealloc = alloc_extent_state_atomic(prealloc);
BUG_ON(!prealloc);
err = split_state(tree, state, prealloc, start);
@@ -1568,6 +1597,43 @@ out:
}
/**
+ * find_contiguous_extent_bit: find a contiguous area of bits
+ * @tree - io tree to check
+ * @start - offset to start the search from
+ * @start_ret - the first offset we found with the bits set
+ * @end_ret - the last offset of the contiguous range we found
+ * @bits - bits to look for
+ *
+ * set_extent_bit and clear_extent_bit can temporarily split contiguous ranges
+ * to set bits appropriately, and then merge them again. During this time it
+ * will drop the tree->lock, so use this helper if you want to find the actual
+ * contiguous area for given bits. We will search to the first bit we find, and
+ * then walk down the tree until we find a non-contiguous area. The area
+ * returned will be the full contiguous area with the bits set.
+ */
+int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start,
+ u64 *start_ret, u64 *end_ret, unsigned bits)
+{
+ struct extent_state *state;
+ int ret = 1;
+
+ spin_lock(&tree->lock);
+ state = find_first_extent_bit_state(tree, start, bits);
+ if (state) {
+ *start_ret = state->start;
+ *end_ret = state->end;
+ while ((state = next_state(state)) != NULL) {
+ if (state->start > (*end_ret + 1))
+ break;
+ *end_ret = state->end;
+ }
+ ret = 0;
+ }
+ spin_unlock(&tree->lock);
+ return ret;
+}
+
+/**
* find_first_clear_extent_bit - find the first range that has @bits not set.
* This range could start before @start.
*
@@ -2926,7 +2992,6 @@ struct bio *btrfs_bio_clone_partial(struct bio *orig, int offset, int size)
/*
* @opf: bio REQ_OP_* and REQ_* flags as one value
- * @tree: tree so we can call our merge_bio hook
* @wbc: optional writeback control for io accounting
* @page: page to add to the bio
* @pg_offset: offset of the new bio or to check whether we are adding
@@ -2939,7 +3004,7 @@ struct bio *btrfs_bio_clone_partial(struct bio *orig, int offset, int size)
* @prev_bio_flags: flags of previous bio to see if we can merge the current one
* @bio_flags: flags of the current bio to see if we can merge them
*/
-static int submit_extent_page(unsigned int opf, struct extent_io_tree *tree,
+static int submit_extent_page(unsigned int opf,
struct writeback_control *wbc,
struct page *page, u64 offset,
size_t size, unsigned long pg_offset,
@@ -2954,6 +3019,7 @@ static int submit_extent_page(unsigned int opf, struct extent_io_tree *tree,
struct bio *bio;
size_t page_size = min_t(size_t, size, PAGE_SIZE);
sector_t sector = offset >> 9;
+ struct extent_io_tree *tree = &BTRFS_I(page->mapping->host)->io_tree;
ASSERT(bio_ret);
@@ -3062,8 +3128,7 @@ __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset,
* XXX JDM: This needs looking at to ensure proper page locking
* return 0 on success, otherwise return error
*/
-static int __do_readpage(struct extent_io_tree *tree,
- struct page *page,
+static int __do_readpage(struct page *page,
get_extent_t *get_extent,
struct extent_map **em_cached,
struct bio **bio, int mirror_num,
@@ -3086,6 +3151,7 @@ static int __do_readpage(struct extent_io_tree *tree,
size_t disk_io_size;
size_t blocksize = inode->i_sb->s_blocksize;
unsigned long this_bio_flag = 0;
+ struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
set_page_extent_mapped(page);
@@ -3242,7 +3308,7 @@ static int __do_readpage(struct extent_io_tree *tree,
continue;
}
- ret = submit_extent_page(REQ_OP_READ | read_flags, tree, NULL,
+ ret = submit_extent_page(REQ_OP_READ | read_flags, NULL,
page, offset, disk_io_size,
pg_offset, bio,
end_bio_extent_readpage, mirror_num,
@@ -3269,8 +3335,7 @@ out:
return ret;
}
-static inline void contiguous_readpages(struct extent_io_tree *tree,
- struct page *pages[], int nr_pages,
+static inline void contiguous_readpages(struct page *pages[], int nr_pages,
u64 start, u64 end,
struct extent_map **em_cached,
struct bio **bio,
@@ -3280,17 +3345,16 @@ static inline void contiguous_readpages(struct extent_io_tree *tree,
struct btrfs_inode *inode = BTRFS_I(pages[0]->mapping->host);
int index;
- btrfs_lock_and_flush_ordered_range(tree, inode, start, end, NULL);
+ btrfs_lock_and_flush_ordered_range(inode, start, end, NULL);
for (index = 0; index < nr_pages; index++) {
- __do_readpage(tree, pages[index], btrfs_get_extent, em_cached,
+ __do_readpage(pages[index], btrfs_get_extent, em_cached,
bio, 0, bio_flags, REQ_RAHEAD, prev_em_start);
put_page(pages[index]);
}
}
-static int __extent_read_full_page(struct extent_io_tree *tree,
- struct page *page,
+static int __extent_read_full_page(struct page *page,
get_extent_t *get_extent,
struct bio **bio, int mirror_num,
unsigned long *bio_flags,
@@ -3301,21 +3365,21 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
u64 end = start + PAGE_SIZE - 1;
int ret;
- btrfs_lock_and_flush_ordered_range(tree, inode, start, end, NULL);
+ btrfs_lock_and_flush_ordered_range(inode, start, end, NULL);
- ret = __do_readpage(tree, page, get_extent, NULL, bio, mirror_num,
+ ret = __do_readpage(page, get_extent, NULL, bio, mirror_num,
bio_flags, read_flags, NULL);
return ret;
}
-int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
- get_extent_t *get_extent, int mirror_num)
+int extent_read_full_page(struct page *page, get_extent_t *get_extent,
+ int mirror_num)
{
struct bio *bio = NULL;
unsigned long bio_flags = 0;
int ret;
- ret = __extent_read_full_page(tree, page, get_extent, &bio, mirror_num,
+ ret = __extent_read_full_page(page, get_extent, &bio, mirror_num,
&bio_flags, 0);
if (bio)
ret = submit_one_bio(bio, mirror_num, bio_flags);
@@ -3423,7 +3487,7 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode,
unsigned long nr_written,
int *nr_ret)
{
- struct extent_io_tree *tree = epd->tree;
+ struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
u64 start = page_offset(page);
u64 page_end = start + PAGE_SIZE - 1;
u64 end;
@@ -3509,7 +3573,7 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode,
page->index, cur, end);
}
- ret = submit_extent_page(REQ_OP_WRITE | write_flags, tree, wbc,
+ ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc,
page, offset, iosize, pg_offset,
&epd->bio,
end_bio_extent_writepage,
@@ -3830,8 +3894,6 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
struct writeback_control *wbc,
struct extent_page_data *epd)
{
- struct btrfs_fs_info *fs_info = eb->fs_info;
- struct extent_io_tree *tree = &BTRFS_I(fs_info->btree_inode)->io_tree;
u64 offset = eb->start;
u32 nritems;
int i, num_pages;
@@ -3864,7 +3926,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
clear_page_dirty_for_io(p);
set_page_writeback(p);
- ret = submit_extent_page(REQ_OP_WRITE | write_flags, tree, wbc,
+ ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc,
p, offset, PAGE_SIZE, 0,
&epd->bio,
end_bio_extent_buffer_writepage,
@@ -3897,14 +3959,13 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
int btree_write_cache_pages(struct address_space *mapping,
struct writeback_control *wbc)
{
- struct extent_io_tree *tree = &BTRFS_I(mapping->host)->io_tree;
struct extent_buffer *eb, *prev_eb = NULL;
struct extent_page_data epd = {
.bio = NULL,
- .tree = tree,
.extent_locked = 0,
.sync_io = wbc->sync_mode == WB_SYNC_ALL,
};
+ struct btrfs_fs_info *fs_info = BTRFS_I(mapping->host)->root->fs_info;
int ret = 0;
int done = 0;
int nr_to_write_done = 0;
@@ -4018,7 +4079,39 @@ retry:
end_write_bio(&epd, ret);
return ret;
}
- ret = flush_write_bio(&epd);
+ /*
+ * If something went wrong, don't allow any metadata write bio to be
+ * submitted.
+ *
+ * This would prevent use-after-free if we had dirty pages not
+ * cleaned up, which can still happen with fuzzed images.
+ *
+ * - Bad extent tree
+ * Allowing existing tree block to be allocated for other trees.
+ *
+ * - Log tree operations
+ * Existing tree blocks get allocated to log tree, have their
+ * generation bumped, then get cleaned in tree re-balance.
+ * Such tree block will not be written back, since it's clean,
+ * thus no WRITTEN flag set.
+ * And after log writes back, this tree block is not traced by
+ * any dirty extent_io_tree.
+ *
+ * - Offending tree block gets re-dirtied from its original owner
+ * Since it has bumped generation, no WRITTEN flag, it can be
+ * reused without COWing. This tree block will not be traced
+ * by btrfs_transaction::dirty_pages.
+ *
+ * Now such dirty tree block will not be cleaned by any dirty
+ * extent io tree. Thus we don't want to submit such wild eb
+ * if the fs already has error.
+ */
+ if (!test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
+ ret = flush_write_bio(&epd);
+ } else {
+ ret = -EUCLEAN;
+ end_write_bio(&epd, ret);
+ }
return ret;
}
@@ -4190,7 +4283,6 @@ int extent_write_full_page(struct page *page, struct writeback_control *wbc)
int ret;
struct extent_page_data epd = {
.bio = NULL,
- .tree = &BTRFS_I(page->mapping->host)->io_tree,
.extent_locked = 0,
.sync_io = wbc->sync_mode == WB_SYNC_ALL,
};
@@ -4212,14 +4304,12 @@ int extent_write_locked_range(struct inode *inode, u64 start, u64 end,
{
int ret = 0;
struct address_space *mapping = inode->i_mapping;
- struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
struct page *page;
unsigned long nr_pages = (end - start + PAGE_SIZE) >>
PAGE_SHIFT;
struct extent_page_data epd = {
.bio = NULL,
- .tree = tree,
.extent_locked = 1,
.sync_io = mode == WB_SYNC_ALL,
};
@@ -4263,7 +4353,6 @@ int extent_writepages(struct address_space *mapping,
int ret = 0;
struct extent_page_data epd = {
.bio = NULL,
- .tree = &BTRFS_I(mapping->host)->io_tree,
.extent_locked = 0,
.sync_io = wbc->sync_mode == WB_SYNC_ALL,
};
@@ -4285,7 +4374,6 @@ int extent_readpages(struct address_space *mapping, struct list_head *pages,
unsigned long bio_flags = 0;
struct page *pagepool[16];
struct extent_map *em_cached = NULL;
- struct extent_io_tree *tree = &BTRFS_I(mapping->host)->io_tree;
int nr = 0;
u64 prev_em_start = (u64)-1;
@@ -4312,7 +4400,7 @@ int extent_readpages(struct address_space *mapping, struct list_head *pages,
ASSERT(contig_start + nr * PAGE_SIZE - 1 == contig_end);
- contiguous_readpages(tree, pagepool, nr, contig_start,
+ contiguous_readpages(pagepool, nr, contig_start,
contig_end, &em_cached, &bio, &bio_flags,
&prev_em_start);
}
@@ -4796,7 +4884,6 @@ out_free_ulist:
static void __free_extent_buffer(struct extent_buffer *eb)
{
- btrfs_leak_debug_del(&eb->leak_list);
kmem_cache_free(extent_buffer_cache, eb);
}
@@ -4862,6 +4949,7 @@ static void btrfs_release_extent_buffer_pages(struct extent_buffer *eb)
static inline void btrfs_release_extent_buffer(struct extent_buffer *eb)
{
btrfs_release_extent_buffer_pages(eb);
+ btrfs_leak_debug_del(&eb->fs_info->eb_leak_lock, &eb->leak_list);
__free_extent_buffer(eb);
}
@@ -4883,7 +4971,8 @@ __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start,
init_waitqueue_head(&eb->write_lock_wq);
init_waitqueue_head(&eb->read_lock_wq);
- btrfs_leak_debug_add(&eb->leak_list, &buffers);
+ btrfs_leak_debug_add(&fs_info->eb_leak_lock, &eb->leak_list,
+ &fs_info->allocated_ebs);
spin_lock_init(&eb->refs_lock);
atomic_set(&eb->refs, 1);
@@ -5230,6 +5319,7 @@ static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head)
}
static int release_extent_buffer(struct extent_buffer *eb)
+ __releases(&eb->refs_lock)
{
lockdep_assert_held(&eb->refs_lock);
@@ -5248,6 +5338,7 @@ static int release_extent_buffer(struct extent_buffer *eb)
spin_unlock(&eb->refs_lock);
}
+ btrfs_leak_debug_del(&eb->fs_info->eb_leak_lock, &eb->leak_list);
/* Should be safe to release our pages at this point */
btrfs_release_extent_buffer_pages(eb);
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
@@ -5405,7 +5496,6 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num)
unsigned long num_reads = 0;
struct bio *bio = NULL;
unsigned long bio_flags = 0;
- struct extent_io_tree *tree = &BTRFS_I(eb->fs_info->btree_inode)->io_tree;
if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
return 0;
@@ -5453,7 +5543,7 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num)
}
ClearPageError(page);
- err = __extent_read_full_page(tree, page,
+ err = __extent_read_full_page(page,
btree_get_extent, &bio,
mirror_num, &bio_flags,
REQ_META);
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 5d205bbaafdc..2ed65bd0760e 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -189,8 +189,8 @@ typedef struct extent_map *(get_extent_t)(struct btrfs_inode *inode,
int try_release_extent_mapping(struct page *page, gfp_t mask);
int try_release_extent_buffer(struct page *page);
-int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
- get_extent_t *get_extent, int mirror_num);
+int extent_read_full_page(struct page *page, get_extent_t *get_extent,
+ int mirror_num);
int extent_write_full_page(struct page *page, struct writeback_control *wbc);
int extent_write_locked_range(struct inode *inode, u64 start, u64 end,
int mode);
@@ -325,4 +325,11 @@ bool find_lock_delalloc_range(struct inode *inode,
#endif
struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
u64 start);
+
+#ifdef CONFIG_BTRFS_DEBUG
+void btrfs_extent_buffer_leak_debug_check(struct btrfs_fs_info *fs_info);
+#else
+#define btrfs_extent_buffer_leak_debug_check(fs_info) do {} while (0)
+#endif
+
#endif
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 6f417ff68980..bd6229fb2b6f 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -237,6 +237,17 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
struct extent_map *merge = NULL;
struct rb_node *rb;
+ /*
+ * We can't modify an extent map that is in the tree and that is being
+ * used by another task, as it can cause that other task to see it in
+ * inconsistent state during the merging. We always have 1 reference for
+ * the tree and 1 for this task (which is unpinning the extent map or
+ * clearing the logging flag), so anything > 2 means it's being used by
+ * other tasks too.
+ */
+ if (refcount_read(&em->refs) > 2)
+ return;
+
if (em->start != 0) {
rb = rb_prev(&em->rb_node);
if (rb)
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index c2f365662d55..b618ad5339ba 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -23,6 +23,97 @@
#define MAX_CSUM_ITEMS(r, size) (min_t(u32, __MAX_CSUM_ITEMS(r, size), \
PAGE_SIZE))
+/**
+ * @inode - the inode we want to update the disk_i_size for
+ * @new_i_size - the i_size we want to set to, 0 if we use i_size
+ *
+ * With NO_HOLES set this simply sets the disk_i_size to whatever i_size_read()
+ * returns as it is perfectly fine with a file that has holes without hole file
+ * extent items.
+ *
+ * However without NO_HOLES we need to only use the area that is contiguous
+ * from the 0 offset of the file. Otherwise we could end up adjusting i_size up
+ * to an extent that has a gap in between.
+ *
+ * Finally new_i_size should only be set in the case of truncate where we're not
+ * ready to use i_size_read() as the limiter yet.
+ */
+void btrfs_inode_safe_disk_i_size_write(struct inode *inode, u64 new_i_size)
+{
+ struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
+ u64 start, end, i_size;
+ int ret;
+
+ i_size = new_i_size ?: i_size_read(inode);
+ if (btrfs_fs_incompat(fs_info, NO_HOLES)) {
+ BTRFS_I(inode)->disk_i_size = i_size;
+ return;
+ }
+
+ spin_lock(&BTRFS_I(inode)->lock);
+ ret = find_contiguous_extent_bit(&BTRFS_I(inode)->file_extent_tree, 0,
+ &start, &end, EXTENT_DIRTY);
+ if (!ret && start == 0)
+ i_size = min(i_size, end + 1);
+ else
+ i_size = 0;
+ BTRFS_I(inode)->disk_i_size = i_size;
+ spin_unlock(&BTRFS_I(inode)->lock);
+}
+
+/**
+ * @inode - the inode we're modifying
+ * @start - the start file offset of the file extent we've inserted
+ * @len - the logical length of the file extent item
+ *
+ * Call when we are inserting a new file extent where there was none before.
+ * Does not need to call this in the case where we're replacing an existing file
+ * extent, however if not sure it's fine to call this multiple times.
+ *
+ * The start and len must match the file extent item, so they must be sectorsize
+ * aligned.
+ */
+int btrfs_inode_set_file_extent_range(struct btrfs_inode *inode, u64 start,
+ u64 len)
+{
+ if (len == 0)
+ return 0;
+
+ ASSERT(IS_ALIGNED(start + len, inode->root->fs_info->sectorsize));
+
+ if (btrfs_fs_incompat(inode->root->fs_info, NO_HOLES))
+ return 0;
+ return set_extent_bits(&inode->file_extent_tree, start, start + len - 1,
+ EXTENT_DIRTY);
+}
+
+/**
+ * @inode - the inode we're modifying
+ * @start - the start file offset of the file extent we're dropping
+ * @len - the logical length of the file extent item
+ *
+ * Called when we drop a file extent, for example when we truncate. Doesn't
+ * need to be called for cases where we're replacing a file extent, like when
+ * we've COWed a file extent.
+ *
+ * The start and len must match the file extent item, so they must be sectorsize
+ * aligned.
+ */
+int btrfs_inode_clear_file_extent_range(struct btrfs_inode *inode, u64 start,
+ u64 len)
+{
+ if (len == 0)
+ return 0;
+
+ ASSERT(IS_ALIGNED(start + len, inode->root->fs_info->sectorsize) ||
+ len == (u64)-1);
+
+ if (btrfs_fs_incompat(inode->root->fs_info, NO_HOLES))
+ return 0;
+ return clear_extent_bit(&inode->file_extent_tree, start,
+ start + len - 1, EXTENT_DIRTY, 0, 0, NULL);
+}
+
static inline u32 max_ordered_sum_bytes(struct btrfs_fs_info *fs_info,
u16 csum_size)
{
@@ -949,18 +1040,7 @@ void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode,
btrfs_item_key_to_cpu(leaf, &key, slot);
extent_start = key.offset;
-
- if (type == BTRFS_FILE_EXTENT_REG ||
- type == BTRFS_FILE_EXTENT_PREALLOC) {
- extent_end = extent_start +
- btrfs_file_extent_num_bytes(leaf, fi);
- } else if (type == BTRFS_FILE_EXTENT_INLINE) {
- size_t size;
- size = btrfs_file_extent_ram_bytes(leaf, fi);
- extent_end = ALIGN(extent_start + size,
- fs_info->sectorsize);
- }
-
+ extent_end = btrfs_file_extent_end(path);
em->ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
if (type == BTRFS_FILE_EXTENT_REG ||
type == BTRFS_FILE_EXTENT_PREALLOC) {
@@ -1007,3 +1087,30 @@ void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode,
root->root_key.objectid);
}
}
+
+/*
+ * Returns the end offset (non inclusive) of the file extent item the given path
+ * points to. If it points to an inline extent, the returned offset is rounded
+ * up to the sector size.
+ */
+u64 btrfs_file_extent_end(const struct btrfs_path *path)
+{
+ const struct extent_buffer *leaf = path->nodes[0];
+ const int slot = path->slots[0];
+ struct btrfs_file_extent_item *fi;
+ struct btrfs_key key;
+ u64 end;
+
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+ ASSERT(key.type == BTRFS_EXTENT_DATA_KEY);
+ fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
+
+ if (btrfs_file_extent_type(leaf, fi) == BTRFS_FILE_EXTENT_INLINE) {
+ end = btrfs_file_extent_ram_bytes(leaf, fi);
+ end = ALIGN(key.offset + end, leaf->fs_info->sectorsize);
+ } else {
+ end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
+ }
+
+ return end;
+}
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index a16da274c9aa..8a144f9cb7ac 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -27,6 +27,7 @@
#include "qgroup.h"
#include "compression.h"
#include "delalloc-space.h"
+#include "reflink.h"
static struct kmem_cache *btrfs_inode_defrag_cachep;
/*
@@ -277,7 +278,6 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
struct btrfs_key key;
struct btrfs_ioctl_defrag_range_args range;
int num_defrag;
- int index;
int ret;
/* get the inode */
@@ -285,9 +285,7 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
key.type = BTRFS_ROOT_ITEM_KEY;
key.offset = (u64)-1;
- index = srcu_read_lock(&fs_info->subvol_srcu);
-
- inode_root = btrfs_read_fs_root_no_name(fs_info, &key);
+ inode_root = btrfs_get_fs_root(fs_info, &key, true);
if (IS_ERR(inode_root)) {
ret = PTR_ERR(inode_root);
goto cleanup;
@@ -297,11 +295,11 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
key.type = BTRFS_INODE_ITEM_KEY;
key.offset = 0;
inode = btrfs_iget(fs_info->sb, &key, inode_root);
+ btrfs_put_root(inode_root);
if (IS_ERR(inode)) {
ret = PTR_ERR(inode);
goto cleanup;
}
- srcu_read_unlock(&fs_info->subvol_srcu, index);
/* do a chunk of defrag */
clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
@@ -337,7 +335,6 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
iput(inode);
return 0;
cleanup:
- srcu_read_unlock(&fs_info->subvol_srcu, index);
kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
return ret;
}
@@ -1552,15 +1549,14 @@ static noinline int check_can_nocow(struct btrfs_inode *inode, loff_t pos,
u64 num_bytes;
int ret;
- ret = btrfs_start_write_no_snapshotting(root);
- if (!ret)
+ if (!btrfs_drew_try_write_lock(&root->snapshot_lock))
return -EAGAIN;
lockstart = round_down(pos, fs_info->sectorsize);
lockend = round_up(pos + *write_bytes,
fs_info->sectorsize) - 1;
- btrfs_lock_and_flush_ordered_range(&inode->io_tree, inode, lockstart,
+ btrfs_lock_and_flush_ordered_range(inode, lockstart,
lockend, NULL);
num_bytes = lockend - lockstart + 1;
@@ -1568,7 +1564,7 @@ static noinline int check_can_nocow(struct btrfs_inode *inode, loff_t pos,
NULL, NULL, NULL);
if (ret <= 0) {
ret = 0;
- btrfs_end_write_no_snapshotting(root);
+ btrfs_drew_write_unlock(&root->snapshot_lock);
} else {
*write_bytes = min_t(size_t, *write_bytes ,
num_bytes - pos + lockstart);
@@ -1674,7 +1670,7 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb,
data_reserved, pos,
write_bytes);
else
- btrfs_end_write_no_snapshotting(root);
+ btrfs_drew_write_unlock(&root->snapshot_lock);
break;
}
@@ -1778,7 +1774,7 @@ again:
release_bytes = 0;
if (only_release_metadata)
- btrfs_end_write_no_snapshotting(root);
+ btrfs_drew_write_unlock(&root->snapshot_lock);
if (only_release_metadata && copied > 0) {
lockstart = round_down(pos,
@@ -1807,7 +1803,7 @@ again:
if (release_bytes) {
if (only_release_metadata) {
- btrfs_end_write_no_snapshotting(root);
+ btrfs_drew_write_unlock(&root->snapshot_lock);
btrfs_delalloc_release_metadata(BTRFS_I(inode),
release_bytes, true);
} else {
@@ -2071,6 +2067,16 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
btrfs_init_log_ctx(&ctx, inode);
/*
+ * Set the range to full if the NO_HOLES feature is not enabled.
+ * This is to avoid missing file extent items representing holes after
+ * replaying the log.
+ */
+ if (!btrfs_fs_incompat(fs_info, NO_HOLES)) {
+ start = 0;
+ end = LLONG_MAX;
+ }
+
+ /*
* We write the dirty pages in the range and wait until they complete
* out of the ->i_mutex. If so, we can flush the dirty pages by
* multi-task, and make the performance up. See
@@ -2092,19 +2098,6 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
atomic_inc(&root->log_batch);
/*
- * If the inode needs a full sync, make sure we use a full range to
- * avoid log tree corruption, due to hole detection racing with ordered
- * extent completion for adjacent ranges, and assertion failures during
- * hole detection. Do this while holding the inode lock, to avoid races
- * with other tasks.
- */
- if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
- &BTRFS_I(inode)->runtime_flags)) {
- start = 0;
- end = LLONG_MAX;
- }
-
- /*
* Before we acquired the inode's lock, someone may have dirtied more
* pages in the target range. We need to make sure that writeback for
* any such pages does not start while we are logging the inode, because
@@ -2124,6 +2117,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
*/
ret = start_ordered_ops(inode, start, end);
if (ret) {
+ up_write(&BTRFS_I(inode)->dio_sem);
inode_unlock(inode);
goto out;
}
@@ -2486,6 +2480,11 @@ static int btrfs_insert_clone_extent(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(leaf);
btrfs_release_path(path);
+ ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode),
+ clone_info->file_offset, clone_len);
+ if (ret)
+ return ret;
+
/* If it's a hole, nothing more needs to be done. */
if (clone_info->disk_offset == 0)
return 0;
@@ -2596,6 +2595,24 @@ int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path,
btrfs_abort_transaction(trans, ret);
break;
}
+ } else if (!clone_info && cur_offset < drop_end) {
+ /*
+ * We are past the i_size here, but since we didn't
+ * insert holes we need to clear the mapped area so we
+ * know to not set disk_i_size in this area until a new
+ * file extent is inserted here.
+ */
+ ret = btrfs_inode_clear_file_extent_range(BTRFS_I(inode),
+ cur_offset, drop_end - cur_offset);
+ if (ret) {
+ /*
+ * We couldn't clear our area, so we could
+ * presumably adjust up and corrupt the fs, so
+ * we need to abort.
+ */
+ btrfs_abort_transaction(trans, ret);
+ break;
+ }
}
if (clone_info && drop_end > clone_info->file_offset) {
@@ -2686,6 +2703,15 @@ int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path,
btrfs_abort_transaction(trans, ret);
goto out_trans;
}
+ } else if (!clone_info && cur_offset < drop_end) {
+ /* See the comment in the loop above for the reasoning here. */
+ ret = btrfs_inode_clear_file_extent_range(BTRFS_I(inode),
+ cur_offset, drop_end - cur_offset);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto out_trans;
+ }
+
}
if (clone_info) {
ret = btrfs_insert_clone_extent(trans, inode, path, clone_info,
@@ -2935,7 +2961,7 @@ static int btrfs_fallocate_update_isize(struct inode *inode,
inode->i_ctime = current_time(inode);
i_size_write(inode, end);
- btrfs_ordered_update_i_size(inode, end, NULL);
+ btrfs_inode_safe_disk_i_size_write(inode, 0);
ret = btrfs_update_inode(trans, root, inode);
ret2 = btrfs_end_transaction(trans);
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 0598fd3c6e3f..3613da065a73 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -371,10 +371,10 @@ static void io_ctl_drop_pages(struct btrfs_io_ctl *io_ctl)
}
}
-static int io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, struct inode *inode,
- int uptodate)
+static int io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, bool uptodate)
{
struct page *page;
+ struct inode *inode = io_ctl->inode;
gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
int i;
@@ -732,7 +732,7 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
readahead_cache(inode);
- ret = io_ctl_prepare_pages(&io_ctl, inode, 1);
+ ret = io_ctl_prepare_pages(&io_ctl, true);
if (ret)
goto out;
@@ -1067,6 +1067,7 @@ fail:
}
static noinline_for_stack int write_pinned_extent_entries(
+ struct btrfs_trans_handle *trans,
struct btrfs_block_group *block_group,
struct btrfs_io_ctl *io_ctl,
int *entries)
@@ -1085,7 +1086,7 @@ static noinline_for_stack int write_pinned_extent_entries(
* We shouldn't have switched the pinned extents yet so this is the
* right one
*/
- unpin = block_group->fs_info->pinned_extents;
+ unpin = &trans->transaction->pinned_extents;
start = block_group->start;
@@ -1190,7 +1191,7 @@ out:
invalidate_inode_pages2(inode->i_mapping);
BTRFS_I(inode)->generation = 0;
if (block_group) {
-#ifdef DEBUG
+#ifdef CONFIG_BTRFS_DEBUG
btrfs_err(root->fs_info,
"failed to write free space cache for block group %llu",
block_group->start);
@@ -1291,7 +1292,7 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
}
/* Lock all pages first so we can lock the extent safely. */
- ret = io_ctl_prepare_pages(io_ctl, inode, 0);
+ ret = io_ctl_prepare_pages(io_ctl, false);
if (ret)
goto out_unlock;
@@ -1317,7 +1318,7 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
* If this changes while we are working we'll get added back to
* the dirty list and redo it. No locking needed
*/
- ret = write_pinned_extent_entries(block_group, io_ctl, &entries);
+ ret = write_pinned_extent_entries(trans, block_group, io_ctl, &entries);
if (ret)
goto out_nospc_locked;
@@ -1366,18 +1367,6 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
return 0;
-out:
- io_ctl->inode = NULL;
- io_ctl_free(io_ctl);
- if (ret) {
- invalidate_inode_pages2(inode->i_mapping);
- BTRFS_I(inode)->generation = 0;
- }
- btrfs_update_inode(trans, root, inode);
- if (must_iput)
- iput(inode);
- return ret;
-
out_nospc_locked:
cleanup_bitmap_list(&bitmap_list);
spin_unlock(&ctl->tree_lock);
@@ -1390,7 +1379,17 @@ out_unlock:
if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA))
up_write(&block_group->data_rwsem);
- goto out;
+out:
+ io_ctl->inode = NULL;
+ io_ctl_free(io_ctl);
+ if (ret) {
+ invalidate_inode_pages2(inode->i_mapping);
+ BTRFS_I(inode)->generation = 0;
+ }
+ btrfs_update_inode(trans, root, inode);
+ if (must_iput)
+ iput(inode);
+ return ret;
}
int btrfs_write_out_cache(struct btrfs_trans_handle *trans,
@@ -1416,7 +1415,7 @@ int btrfs_write_out_cache(struct btrfs_trans_handle *trans,
ret = __btrfs_write_out_cache(fs_info->tree_root, inode, ctl,
block_group, &block_group->io_ctl, trans);
if (ret) {
-#ifdef DEBUG
+#ifdef CONFIG_BTRFS_DEBUG
btrfs_err(fs_info,
"failed to write free space cache for block group %llu",
block_group->start);
@@ -4036,7 +4035,7 @@ int btrfs_write_out_ino_cache(struct btrfs_root *root,
if (release_metadata)
btrfs_delalloc_release_metadata(BTRFS_I(inode),
inode->i_size, true);
-#ifdef DEBUG
+#ifdef CONFIG_BTRFS_DEBUG
btrfs_err(fs_info,
"failed to write free ino cache for root %llu",
root->root_key.objectid);
diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c
index 258cb3fae17a..8b1f5c8897b7 100644
--- a/fs/btrfs/free-space-tree.c
+++ b/fs/btrfs/free-space-tree.c
@@ -1251,9 +1251,7 @@ int btrfs_clear_free_space_tree(struct btrfs_fs_info *fs_info)
btrfs_free_tree_block(trans, free_space_root, free_space_root->node,
0, 1);
- free_extent_buffer(free_space_root->node);
- free_extent_buffer(free_space_root->commit_root);
- kfree(free_space_root);
+ btrfs_put_root(free_space_root);
return btrfs_commit_transaction(trans);
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index d5c9c69d8263..6009e0e939b5 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -515,7 +515,7 @@ out_release:
trace_btrfs_space_reservation(fs_info, "ino_cache", trans->transid,
trans->bytes_reserved, 0);
btrfs_block_rsv_release(fs_info, trans->block_rsv,
- trans->bytes_reserved);
+ trans->bytes_reserved, NULL);
out:
trans->block_rsv = rsv;
trans->bytes_reserved = num_bytes;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 5b3ec93ff911..320d1062068d 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -28,6 +28,7 @@
#include <linux/magic.h>
#include <linux/iversion.h>
#include <linux/swap.h>
+#include <linux/migrate.h>
#include <linux/sched/mm.h>
#include <asm/unaligned.h>
#include "misc.h"
@@ -242,6 +243,15 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
btrfs_release_path(path);
/*
+ * We align size to sectorsize for inline extents just for simplicity
+ * sake.
+ */
+ size = ALIGN(size, root->fs_info->sectorsize);
+ ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), start, size);
+ if (ret)
+ goto fail;
+
+ /*
* we're an inline extent, so nobody can
* extend the file past i_size without locking
* a page we already have locked.
@@ -2446,6 +2456,11 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
ins.offset = disk_num_bytes;
ins.type = BTRFS_EXTENT_ITEM_KEY;
+ ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), file_pos,
+ ram_bytes);
+ if (ret)
+ goto out;
+
/*
* Release the reserved range from inode dirty range map, as it is
* already moved into delayed_ref_head
@@ -2536,7 +2551,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
*/
btrfs_qgroup_free_data(inode, NULL, start,
ordered_extent->num_bytes);
- btrfs_ordered_update_i_size(inode, 0, ordered_extent);
+ btrfs_inode_safe_disk_i_size_write(inode, 0);
if (freespace_inode)
trans = btrfs_join_transaction_spacecache(root);
else
@@ -2607,7 +2622,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
goto out;
}
- btrfs_ordered_update_i_size(inode, 0, ordered_extent);
+ btrfs_inode_safe_disk_i_size_write(inode, 0);
ret = btrfs_update_inode_fallback(trans, root, inode);
if (ret) { /* -ENOMEM or corruption */
btrfs_abort_transaction(trans, ret);
@@ -3187,6 +3202,8 @@ static int btrfs_read_locked_inode(struct inode *inode,
i_uid_write(inode, btrfs_inode_uid(leaf, inode_item));
i_gid_write(inode, btrfs_inode_gid(leaf, inode_item));
btrfs_i_size_write(BTRFS_I(inode), btrfs_inode_size(leaf, inode_item));
+ btrfs_inode_set_file_extent_range(BTRFS_I(inode), 0,
+ round_up(i_size_read(inode), fs_info->sectorsize));
inode->i_atime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->atime);
inode->i_atime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->atime);
@@ -4085,6 +4102,8 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
u64 bytes_deleted = 0;
bool be_nice = false;
bool should_throttle = false;
+ const u64 lock_start = ALIGN_DOWN(new_size, fs_info->sectorsize);
+ struct extent_state *cached_state = NULL;
BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY);
@@ -4101,6 +4120,10 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
return -ENOMEM;
path->reada = READA_BACK;
+ if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
+ lock_extent_bits(&BTRFS_I(inode)->io_tree, lock_start, (u64)-1,
+ &cached_state);
+
/*
* We want to drop from the next block forward in case this new size is
* not block aligned since we will be keeping the last block of the
@@ -4137,7 +4160,6 @@ search_again:
goto out;
}
- path->leave_spinning = 1;
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
if (ret < 0)
goto out;
@@ -4153,6 +4175,8 @@ search_again:
}
while (1) {
+ u64 clear_start = 0, clear_len = 0;
+
fi = NULL;
leaf = path->nodes[0];
btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
@@ -4203,6 +4227,8 @@ search_again:
if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
u64 num_dec;
+
+ clear_start = found_key.offset;
extent_start = btrfs_file_extent_disk_bytenr(leaf, fi);
if (!del_item) {
u64 orig_num_bytes =
@@ -4210,6 +4236,7 @@ search_again:
extent_num_bytes = ALIGN(new_size -
found_key.offset,
fs_info->sectorsize);
+ clear_start = ALIGN(new_size, fs_info->sectorsize);
btrfs_set_file_extent_num_bytes(leaf, fi,
extent_num_bytes);
num_dec = (orig_num_bytes -
@@ -4235,6 +4262,7 @@ search_again:
inode_sub_bytes(inode, num_dec);
}
}
+ clear_len = num_dec;
} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
/*
* we can't truncate inline items that have had
@@ -4256,12 +4284,33 @@ search_again:
*/
ret = NEED_TRUNCATE_BLOCK;
break;
+ } else {
+ /*
+ * Inline extents are special, we just treat
+ * them as a full sector worth in the file
+ * extent tree just for simplicity sake.
+ */
+ clear_len = fs_info->sectorsize;
}
if (test_bit(BTRFS_ROOT_REF_COWS, &root->state))
inode_sub_bytes(inode, item_end + 1 - new_size);
}
delete:
+ /*
+ * We use btrfs_truncate_inode_items() to clean up log trees for
+ * multiple fsyncs, and in this case we don't want to clear the
+ * file extent range because it's just the log.
+ */
+ if (root == BTRFS_I(inode)->root) {
+ ret = btrfs_inode_clear_file_extent_range(BTRFS_I(inode),
+ clear_start, clear_len);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ break;
+ }
+ }
+
if (del_item)
last_size = found_key.offset;
else
@@ -4289,7 +4338,6 @@ delete:
root == fs_info->tree_root)) {
struct btrfs_ref ref = { 0 };
- btrfs_set_path_blocking(path);
bytes_deleted += extent_num_bytes;
btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF,
@@ -4364,7 +4412,9 @@ out:
ASSERT(last_size >= new_size);
if (!ret && last_size > new_size)
last_size = new_size;
- btrfs_ordered_update_i_size(inode, last_size, NULL);
+ btrfs_inode_safe_disk_i_size_write(inode, last_size);
+ unlock_extent_cached(&BTRFS_I(inode)->io_tree, lock_start,
+ (u64)-1, &cached_state);
}
btrfs_free_path(path);
@@ -4570,7 +4620,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
if (size <= hole_start)
return 0;
- btrfs_lock_and_flush_ordered_range(io_tree, BTRFS_I(inode), hole_start,
+ btrfs_lock_and_flush_ordered_range(BTRFS_I(inode), hole_start,
block_end - 1, &cached_state);
cur_offset = hole_start;
while (1) {
@@ -4583,14 +4633,21 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
}
last_byte = min(extent_map_end(em), block_end);
last_byte = ALIGN(last_byte, fs_info->sectorsize);
+ hole_size = last_byte - cur_offset;
+
if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
struct extent_map *hole_em;
- hole_size = last_byte - cur_offset;
err = maybe_insert_hole(root, inode, cur_offset,
hole_size);
if (err)
break;
+
+ err = btrfs_inode_set_file_extent_range(BTRFS_I(inode),
+ cur_offset, hole_size);
+ if (err)
+ break;
+
btrfs_drop_extent_cache(BTRFS_I(inode), cur_offset,
cur_offset + hole_size - 1, 0);
hole_em = alloc_extent_map();
@@ -4622,6 +4679,11 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
hole_size - 1, 0);
}
free_extent_map(hole_em);
+ } else {
+ err = btrfs_inode_set_file_extent_range(BTRFS_I(inode),
+ cur_offset, hole_size);
+ if (err)
+ break;
}
next:
free_extent_map(em);
@@ -4665,24 +4727,24 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
* truncation, it must capture all writes that happened before
* this truncation.
*/
- btrfs_wait_for_snapshot_creation(root);
+ btrfs_drew_write_lock(&root->snapshot_lock);
ret = btrfs_cont_expand(inode, oldsize, newsize);
if (ret) {
- btrfs_end_write_no_snapshotting(root);
+ btrfs_drew_write_unlock(&root->snapshot_lock);
return ret;
}
trans = btrfs_start_transaction(root, 1);
if (IS_ERR(trans)) {
- btrfs_end_write_no_snapshotting(root);
+ btrfs_drew_write_unlock(&root->snapshot_lock);
return PTR_ERR(trans);
}
i_size_write(inode, newsize);
- btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL);
+ btrfs_inode_safe_disk_i_size_write(inode, 0);
pagecache_isize_extended(inode, oldsize, newsize);
ret = btrfs_update_inode(trans, root, inode);
- btrfs_end_write_no_snapshotting(root);
+ btrfs_drew_write_unlock(&root->snapshot_lock);
btrfs_end_transaction(trans);
} else {
@@ -5092,7 +5154,7 @@ static int fixup_tree_root_location(struct btrfs_fs_info *fs_info,
btrfs_release_path(path);
- new_root = btrfs_read_fs_root_no_name(fs_info, location);
+ new_root = btrfs_get_fs_root(fs_info, location, true);
if (IS_ERR(new_root)) {
err = PTR_ERR(new_root);
goto out;
@@ -5173,7 +5235,8 @@ static int btrfs_init_locked_inode(struct inode *inode, void *p)
inode->i_ino = args->location->objectid;
memcpy(&BTRFS_I(inode)->location, args->location,
sizeof(*args->location));
- BTRFS_I(inode)->root = args->root;
+ BTRFS_I(inode)->root = btrfs_grab_root(args->root);
+ BUG_ON(args->root && !BTRFS_I(inode)->root);
return 0;
}
@@ -5254,7 +5317,7 @@ static struct inode *new_simple_dir(struct super_block *s,
if (!inode)
return ERR_PTR(-ENOMEM);
- BTRFS_I(inode)->root = root;
+ BTRFS_I(inode)->root = btrfs_grab_root(root);
memcpy(&BTRFS_I(inode)->location, key, sizeof(*key));
set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags);
@@ -5301,7 +5364,6 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
struct btrfs_root *sub_root = root;
struct btrfs_key location;
u8 di_type = 0;
- int index;
int ret = 0;
if (dentry->d_name.len > BTRFS_NAME_LEN)
@@ -5328,7 +5390,6 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
return inode;
}
- index = srcu_read_lock(&fs_info->subvol_srcu);
ret = fixup_tree_root_location(fs_info, dir, dentry,
&location, &sub_root);
if (ret < 0) {
@@ -5339,7 +5400,8 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
} else {
inode = btrfs_iget(dir->i_sb, &location, sub_root);
}
- srcu_read_unlock(&fs_info->subvol_srcu, index);
+ if (root != sub_root)
+ btrfs_put_root(sub_root);
if (!IS_ERR(inode) && root != sub_root) {
down_read(&fs_info->cleanup_work_sem);
@@ -5820,7 +5882,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
*/
BTRFS_I(inode)->index_cnt = 2;
BTRFS_I(inode)->dir_index = *index;
- BTRFS_I(inode)->root = root;
+ BTRFS_I(inode)->root = btrfs_grab_root(root);
BTRFS_I(inode)->generation = trans->transid;
inode->i_generation = BTRFS_I(inode)->generation;
@@ -6457,6 +6519,7 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
extent_type = btrfs_file_extent_type(leaf, item);
extent_start = found_key.offset;
+ extent_end = btrfs_file_extent_end(path);
if (extent_type == BTRFS_FILE_EXTENT_REG ||
extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
/* Only regular file could have regular/prealloc extent */
@@ -6467,18 +6530,9 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
btrfs_ino(inode));
goto out;
}
- extent_end = extent_start +
- btrfs_file_extent_num_bytes(leaf, item);
-
trace_btrfs_get_extent_show_fi_regular(inode, leaf, item,
extent_start);
} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
- size_t size;
-
- size = btrfs_file_extent_ram_bytes(leaf, item);
- extent_end = ALIGN(extent_start + size,
- fs_info->sectorsize);
-
trace_btrfs_get_extent_show_fi_inline(inode, leaf, item,
path->slots[0],
extent_start);
@@ -7777,6 +7831,7 @@ static inline blk_status_t btrfs_lookup_and_bind_dio_csum(struct inode *inode,
{
struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
struct btrfs_io_bio *orig_io_bio = btrfs_io_bio(dip->orig_bio);
+ u16 csum_size;
blk_status_t ret;
/*
@@ -7796,7 +7851,8 @@ static inline blk_status_t btrfs_lookup_and_bind_dio_csum(struct inode *inode,
file_offset -= dip->logical_offset;
file_offset >>= inode->i_sb->s_blocksize_bits;
- io_bio->csum = (u8 *)(((u32 *)orig_io_bio->csum) + file_offset);
+ csum_size = btrfs_super_csum_size(btrfs_sb(inode->i_sb)->super_copy);
+ io_bio->csum = orig_io_bio->csum + csum_size * file_offset;
return 0;
}
@@ -8203,9 +8259,7 @@ static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
int btrfs_readpage(struct file *file, struct page *page)
{
- struct extent_io_tree *tree;
- tree = &BTRFS_I(page->mapping->host)->io_tree;
- return extent_read_full_page(tree, page, btrfs_get_extent, 0);
+ return extent_read_full_page(page, btrfs_get_extent, 0);
}
static int btrfs_writepage(struct page *page, struct writeback_control *wbc)
@@ -8264,6 +8318,39 @@ static int btrfs_releasepage(struct page *page, gfp_t gfp_flags)
return __btrfs_releasepage(page, gfp_flags);
}
+#ifdef CONFIG_MIGRATION
+static int btrfs_migratepage(struct address_space *mapping,
+ struct page *newpage, struct page *page,
+ enum migrate_mode mode)
+{
+ int ret;
+
+ ret = migrate_page_move_mapping(mapping, newpage, page, 0);
+ if (ret != MIGRATEPAGE_SUCCESS)
+ return ret;
+
+ if (page_has_private(page)) {
+ ClearPagePrivate(page);
+ get_page(newpage);
+ set_page_private(newpage, page_private(page));
+ set_page_private(page, 0);
+ put_page(page);
+ SetPagePrivate(newpage);
+ }
+
+ if (PagePrivate2(page)) {
+ ClearPagePrivate2(page);
+ SetPagePrivate2(newpage);
+ }
+
+ if (mode != MIGRATE_SYNC_NO_COPY)
+ migrate_page_copy(newpage, page);
+ else
+ migrate_page_states(newpage, page);
+ return MIGRATEPAGE_SUCCESS;
+}
+#endif
+
static void btrfs_invalidatepage(struct page *page, unsigned int offset,
unsigned int length)
{
@@ -8639,7 +8726,7 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback)
break;
}
- btrfs_block_rsv_release(fs_info, rsv, -1);
+ btrfs_block_rsv_release(fs_info, rsv, -1, NULL);
ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv,
rsv, min_size, false);
BUG_ON(ret); /* shouldn't happen */
@@ -8664,7 +8751,7 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback)
ret = PTR_ERR(trans);
goto out;
}
- btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
+ btrfs_inode_safe_disk_i_size_write(inode, 0);
}
if (trans) {
@@ -8768,6 +8855,8 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
extent_io_tree_init(fs_info, &ei->io_tree, IO_TREE_INODE_IO, inode);
extent_io_tree_init(fs_info, &ei->io_failure_tree,
IO_TREE_INODE_IO_FAILURE, inode);
+ extent_io_tree_init(fs_info, &ei->file_extent_tree,
+ IO_TREE_INODE_FILE_EXTENT, inode);
ei->io_tree.track_uptodate = true;
ei->io_failure_tree.track_uptodate = true;
atomic_set(&ei->sync_writers, 0);
@@ -8834,6 +8923,8 @@ void btrfs_destroy_inode(struct inode *inode)
btrfs_qgroup_check_reserved_leak(inode);
inode_tree_del(inode);
btrfs_drop_extent_cache(BTRFS_I(inode), 0, (u64)-1, 0);
+ btrfs_inode_clear_file_extent_range(BTRFS_I(inode), 0, (u64)-1);
+ btrfs_put_root(BTRFS_I(inode)->root);
}
int btrfs_drop_inode(struct inode *inode)
@@ -9488,6 +9579,10 @@ out_fail:
ret = btrfs_sync_log(trans, BTRFS_I(old_inode)->root, &ctx);
if (ret)
commit_transaction = true;
+ } else if (sync_log) {
+ mutex_lock(&root->log_mutex);
+ list_del(&ctx.list);
+ mutex_unlock(&root->log_mutex);
}
if (commit_transaction) {
ret = btrfs_commit_transaction(trans);
@@ -9657,14 +9752,14 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int nr)
while (!list_empty(&splice) && nr) {
root = list_first_entry(&splice, struct btrfs_root,
delalloc_root);
- root = btrfs_grab_fs_root(root);
+ root = btrfs_grab_root(root);
BUG_ON(!root);
list_move_tail(&root->delalloc_root,
&fs_info->delalloc_roots);
spin_unlock(&fs_info->delalloc_root_lock);
ret = start_delalloc_inodes(root, nr, false);
- btrfs_put_fs_root(root);
+ btrfs_put_root(root);
if (ret < 0)
goto out;
@@ -9818,6 +9913,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_key ins;
u64 cur_offset = start;
+ u64 clear_offset = start;
u64 i_size;
u64 cur_bytes;
u64 last_alloc = (u64)-1;
@@ -9852,6 +9948,15 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
btrfs_end_transaction(trans);
break;
}
+
+ /*
+ * We've reserved this space, and thus converted it from
+ * ->bytes_may_use to ->bytes_reserved. Any error that happens
+ * from here on out we will only need to clear our reservation
+ * for the remaining unreserved area, so advance our
+ * clear_offset by our extent size.
+ */
+ clear_offset += ins.offset;
btrfs_dec_block_group_reservations(fs_info, ins.objectid);
last_alloc = ins.offset;
@@ -9916,7 +10021,7 @@ next:
else
i_size = cur_offset;
i_size_write(inode, i_size);
- btrfs_ordered_update_i_size(inode, i_size, NULL);
+ btrfs_inode_safe_disk_i_size_write(inode, 0);
}
ret = btrfs_update_inode(trans, root, inode);
@@ -9931,9 +10036,9 @@ next:
if (own_trans)
btrfs_end_transaction(trans);
}
- if (cur_offset < end)
- btrfs_free_reserved_data_space(inode, NULL, cur_offset,
- end - cur_offset + 1);
+ if (clear_offset < end)
+ btrfs_free_reserved_data_space(inode, NULL, clear_offset,
+ end - clear_offset + 1);
return ret;
}
@@ -10452,6 +10557,9 @@ static const struct address_space_operations btrfs_aops = {
.direct_IO = btrfs_direct_IO,
.invalidatepage = btrfs_invalidatepage,
.releasepage = btrfs_releasepage,
+#ifdef CONFIG_MIGRATION
+ .migratepage = btrfs_migratepage,
+#endif
.set_page_dirty = btrfs_set_page_dirty,
.error_remove_page = generic_error_remove_page,
.swap_activate = btrfs_swap_activate,
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 4f4b13830b25..40b729dce91c 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -28,6 +28,7 @@
#include <linux/iversion.h>
#include "ctree.h"
#include "disk-io.h"
+#include "export.h"
#include "transaction.h"
#include "btrfs_inode.h"
#include "print-tree.h"
@@ -86,10 +87,6 @@ struct btrfs_ioctl_send_args_32 {
struct btrfs_ioctl_send_args_32)
#endif
-static int btrfs_clone(struct inode *src, struct inode *inode,
- u64 off, u64 olen, u64 olen_aligned, u64 destoff,
- int no_time_update);
-
/* Mask out flags that are inappropriate for the given type of inode. */
static unsigned int btrfs_mask_fsflags_for_type(struct inode *inode,
unsigned int flags)
@@ -554,7 +551,6 @@ int __pure btrfs_is_empty_uuid(u8 *uuid)
static noinline int create_subvol(struct inode *dir,
struct dentry *dentry,
const char *name, int namelen,
- u64 *async_transid,
struct btrfs_qgroup_inherit *inherit)
{
struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
@@ -573,7 +569,6 @@ static noinline int create_subvol(struct inode *dir,
u64 objectid;
u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID;
u64 index = 0;
- uuid_le new_uuid;
root_item = kzalloc(sizeof(*root_item), GFP_KERNEL);
if (!root_item)
@@ -643,8 +638,7 @@ static noinline int create_subvol(struct inode *dir,
btrfs_set_root_generation_v2(root_item,
btrfs_root_generation(root_item));
- uuid_le_gen(&new_uuid);
- memcpy(root_item->uuid, new_uuid.b, BTRFS_UUID_SIZE);
+ generate_random_guid(root_item->uuid);
btrfs_set_stack_timespec_sec(&root_item->otime, cur_time.tv_sec);
btrfs_set_stack_timespec_nsec(&root_item->otime, cur_time.tv_nsec);
root_item->ctime = root_item->otime;
@@ -666,7 +660,7 @@ static noinline int create_subvol(struct inode *dir,
goto fail;
key.offset = (u64)-1;
- new_root = btrfs_read_fs_root_no_name(fs_info, &key);
+ new_root = btrfs_get_fs_root(fs_info, &key, true);
if (IS_ERR(new_root)) {
ret = PTR_ERR(new_root);
btrfs_abort_transaction(trans, ret);
@@ -676,6 +670,7 @@ static noinline int create_subvol(struct inode *dir,
btrfs_record_root_in_trans(trans, new_root);
ret = btrfs_create_subvol_root(trans, new_root, root, new_dirid);
+ btrfs_put_root(new_root);
if (ret) {
/* We potentially lose an unused inode item here */
btrfs_abort_transaction(trans, ret);
@@ -727,14 +722,7 @@ fail:
trans->bytes_reserved = 0;
btrfs_subvolume_release_metadata(fs_info, &block_rsv);
- if (async_transid) {
- *async_transid = trans->transid;
- err = btrfs_commit_transaction_async(trans, 1);
- if (err)
- err = btrfs_commit_transaction(trans);
- } else {
- err = btrfs_commit_transaction(trans);
- }
+ err = btrfs_commit_transaction(trans);
if (err && !ret)
ret = err;
@@ -752,8 +740,7 @@ fail_free:
}
static int create_snapshot(struct btrfs_root *root, struct inode *dir,
- struct dentry *dentry,
- u64 *async_transid, bool readonly,
+ struct dentry *dentry, bool readonly,
struct btrfs_qgroup_inherit *inherit)
{
struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
@@ -789,11 +776,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
* possible. This is to avoid later writeback (running dealloc) to
* fallback to COW mode and unexpectedly fail with ENOSPC.
*/
- atomic_inc(&root->will_be_snapshotted);
- smp_mb__after_atomic();
- /* wait for no snapshot writes */
- wait_event(root->subv_writers->wait,
- percpu_counter_sum(&root->subv_writers->counter) == 0);
+ btrfs_drew_read_lock(&root->snapshot_lock);
ret = btrfs_start_delalloc_snapshot(root);
if (ret)
@@ -841,14 +824,8 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
list_add(&pending_snapshot->list,
&trans->transaction->pending_snapshots);
spin_unlock(&fs_info->trans_lock);
- if (async_transid) {
- *async_transid = trans->transid;
- ret = btrfs_commit_transaction_async(trans, 1);
- if (ret)
- ret = btrfs_commit_transaction(trans);
- } else {
- ret = btrfs_commit_transaction(trans);
- }
+
+ ret = btrfs_commit_transaction(trans);
if (ret)
goto fail;
@@ -869,12 +846,13 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
d_instantiate(dentry, inode);
ret = 0;
fail:
+ btrfs_put_root(pending_snapshot->snap);
btrfs_subvolume_release_metadata(fs_info, &pending_snapshot->block_rsv);
dec_and_free:
if (snapshot_force_cow)
atomic_dec(&root->snapshot_force_cow);
- if (atomic_dec_and_test(&root->will_be_snapshotted))
- wake_up_var(&root->will_be_snapshotted);
+ btrfs_drew_read_unlock(&root->snapshot_lock);
+
free_pending:
kfree(pending_snapshot->root_item);
btrfs_free_path(pending_snapshot->path);
@@ -953,7 +931,7 @@ static inline int btrfs_may_create(struct inode *dir, struct dentry *child)
static noinline int btrfs_mksubvol(const struct path *parent,
const char *name, int namelen,
struct btrfs_root *snap_src,
- u64 *async_transid, bool readonly,
+ bool readonly,
struct btrfs_qgroup_inherit *inherit)
{
struct inode *dir = d_inode(parent->dentry);
@@ -989,13 +967,11 @@ static noinline int btrfs_mksubvol(const struct path *parent,
if (btrfs_root_refs(&BTRFS_I(dir)->root->root_item) == 0)
goto out_up_read;
- if (snap_src) {
- error = create_snapshot(snap_src, dir, dentry,
- async_transid, readonly, inherit);
- } else {
- error = create_subvol(dir, dentry, name, namelen,
- async_transid, inherit);
- }
+ if (snap_src)
+ error = create_snapshot(snap_src, dir, dentry, readonly, inherit);
+ else
+ error = create_subvol(dir, dentry, name, namelen, inherit);
+
if (!error)
fsnotify_mkdir(dir, dentry);
out_up_read:
@@ -1711,9 +1687,6 @@ static noinline int btrfs_ioctl_resize(struct file *file,
new_size = round_down(new_size, fs_info->sectorsize);
- btrfs_info_in_rcu(fs_info, "new size for %s is %llu",
- rcu_str_deref(device->name), new_size);
-
if (new_size > old_size) {
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
@@ -1726,6 +1699,11 @@ static noinline int btrfs_ioctl_resize(struct file *file,
ret = btrfs_shrink_device(device, new_size);
} /* equal, nothing need to do */
+ if (ret == 0 && new_size != old_size)
+ btrfs_info_in_rcu(fs_info,
+ "resize device %s (devid %llu) from %llu to %llu",
+ rcu_str_deref(device->name), device->devid,
+ old_size, new_size);
out_free:
kfree(vol_args);
out:
@@ -1734,9 +1712,9 @@ out:
return ret;
}
-static noinline int btrfs_ioctl_snap_create_transid(struct file *file,
+static noinline int __btrfs_ioctl_snap_create(struct file *file,
const char *name, unsigned long fd, int subvol,
- u64 *transid, bool readonly,
+ bool readonly,
struct btrfs_qgroup_inherit *inherit)
{
int namelen;
@@ -1763,7 +1741,7 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file,
if (subvol) {
ret = btrfs_mksubvol(&file->f_path, name, namelen,
- NULL, transid, readonly, inherit);
+ NULL, readonly, inherit);
} else {
struct fd src = fdget(fd);
struct inode *src_inode;
@@ -1786,7 +1764,7 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file,
} else {
ret = btrfs_mksubvol(&file->f_path, name, namelen,
BTRFS_I(src_inode)->root,
- transid, readonly, inherit);
+ readonly, inherit);
}
fdput(src);
}
@@ -1810,9 +1788,8 @@ static noinline int btrfs_ioctl_snap_create(struct file *file,
return PTR_ERR(vol_args);
vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
- ret = btrfs_ioctl_snap_create_transid(file, vol_args->name,
- vol_args->fd, subvol,
- NULL, false, NULL);
+ ret = __btrfs_ioctl_snap_create(file, vol_args->name, vol_args->fd,
+ subvol, false, NULL);
kfree(vol_args);
return ret;
@@ -1823,8 +1800,6 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file,
{
struct btrfs_ioctl_vol_args_v2 *vol_args;
int ret;
- u64 transid = 0;
- u64 *ptr = NULL;
bool readonly = false;
struct btrfs_qgroup_inherit *inherit = NULL;
@@ -1836,22 +1811,11 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file,
return PTR_ERR(vol_args);
vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0';
- if (vol_args->flags &
- ~(BTRFS_SUBVOL_CREATE_ASYNC | BTRFS_SUBVOL_RDONLY |
- BTRFS_SUBVOL_QGROUP_INHERIT)) {
+ if (vol_args->flags & ~BTRFS_SUBVOL_CREATE_ARGS_MASK) {
ret = -EOPNOTSUPP;
goto free_args;
}
- if (vol_args->flags & BTRFS_SUBVOL_CREATE_ASYNC) {
- struct inode *inode = file_inode(file);
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
-
- btrfs_warn(fs_info,
-"SNAP_CREATE_V2 ioctl with CREATE_ASYNC is deprecated and will be removed in kernel 5.7");
-
- ptr = &transid;
- }
if (vol_args->flags & BTRFS_SUBVOL_RDONLY)
readonly = true;
if (vol_args->flags & BTRFS_SUBVOL_QGROUP_INHERIT) {
@@ -1866,18 +1830,10 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file,
}
}
- ret = btrfs_ioctl_snap_create_transid(file, vol_args->name,
- vol_args->fd, subvol, ptr,
- readonly, inherit);
+ ret = __btrfs_ioctl_snap_create(file, vol_args->name, vol_args->fd,
+ subvol, readonly, inherit);
if (ret)
goto free_inherit;
-
- if (ptr && copy_to_user(arg +
- offsetof(struct btrfs_ioctl_vol_args_v2,
- transid),
- ptr, sizeof(*ptr)))
- ret = -EFAULT;
-
free_inherit:
kfree(inherit);
free_args:
@@ -1936,11 +1892,6 @@ static noinline int btrfs_ioctl_subvol_setflags(struct file *file,
goto out_drop_write;
}
- if (flags & BTRFS_SUBVOL_CREATE_ASYNC) {
- ret = -EINVAL;
- goto out_drop_write;
- }
-
if (flags & ~BTRFS_SUBVOL_RDONLY) {
ret = -EOPNOTSUPP;
goto out_drop_write;
@@ -2174,12 +2125,12 @@ static noinline int search_ioctl(struct inode *inode,
if (sk->tree_id == 0) {
/* search the root of the inode that was passed */
- root = BTRFS_I(inode)->root;
+ root = btrfs_grab_root(BTRFS_I(inode)->root);
} else {
key.objectid = sk->tree_id;
key.type = BTRFS_ROOT_ITEM_KEY;
key.offset = (u64)-1;
- root = btrfs_read_fs_root_no_name(info, &key);
+ root = btrfs_get_fs_root(info, &key, true);
if (IS_ERR(root)) {
btrfs_free_path(path);
return PTR_ERR(root);
@@ -2208,6 +2159,7 @@ static noinline int search_ioctl(struct inode *inode,
ret = 0;
err:
sk->nr_items = num_found;
+ btrfs_put_root(root);
btrfs_free_path(path);
return ret;
}
@@ -2314,9 +2266,10 @@ static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info,
key.objectid = tree_id;
key.type = BTRFS_ROOT_ITEM_KEY;
key.offset = (u64)-1;
- root = btrfs_read_fs_root_no_name(info, &key);
+ root = btrfs_get_fs_root(info, &key, true);
if (IS_ERR(root)) {
ret = PTR_ERR(root);
+ root = NULL;
goto out;
}
@@ -2367,6 +2320,7 @@ static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info,
name[total_len] = '\0';
ret = 0;
out:
+ btrfs_put_root(root);
btrfs_free_path(path);
return ret;
}
@@ -2383,7 +2337,7 @@ static int btrfs_search_path_in_tree_user(struct inode *inode,
unsigned long item_len;
struct btrfs_inode_ref *iref;
struct btrfs_root_ref *rref;
- struct btrfs_root *root;
+ struct btrfs_root *root = NULL;
struct btrfs_path *path;
struct btrfs_key key, key2;
struct extent_buffer *leaf;
@@ -2408,7 +2362,7 @@ static int btrfs_search_path_in_tree_user(struct inode *inode,
key.objectid = treeid;
key.type = BTRFS_ROOT_ITEM_KEY;
key.offset = (u64)-1;
- root = btrfs_read_fs_root_no_name(fs_info, &key);
+ root = btrfs_get_fs_root(fs_info, &key, true);
if (IS_ERR(root)) {
ret = PTR_ERR(root);
goto out;
@@ -2420,15 +2374,15 @@ static int btrfs_search_path_in_tree_user(struct inode *inode,
while (1) {
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0) {
- goto out;
+ goto out_put;
} else if (ret > 0) {
ret = btrfs_previous_item(root, path, dirid,
BTRFS_INODE_REF_KEY);
if (ret < 0) {
- goto out;
+ goto out_put;
} else if (ret > 0) {
ret = -ENOENT;
- goto out;
+ goto out_put;
}
}
@@ -2442,7 +2396,7 @@ static int btrfs_search_path_in_tree_user(struct inode *inode,
total_len += len + 1;
if (ptr < args->path) {
ret = -ENAMETOOLONG;
- goto out;
+ goto out_put;
}
*(ptr + len) = '/';
@@ -2453,10 +2407,10 @@ static int btrfs_search_path_in_tree_user(struct inode *inode,
ret = btrfs_previous_item(root, path, dirid,
BTRFS_INODE_ITEM_KEY);
if (ret < 0) {
- goto out;
+ goto out_put;
} else if (ret > 0) {
ret = -ENOENT;
- goto out;
+ goto out_put;
}
leaf = path->nodes[0];
@@ -2464,26 +2418,26 @@ static int btrfs_search_path_in_tree_user(struct inode *inode,
btrfs_item_key_to_cpu(leaf, &key2, slot);
if (key2.objectid != dirid) {
ret = -ENOENT;
- goto out;
+ goto out_put;
}
temp_inode = btrfs_iget(sb, &key2, root);
if (IS_ERR(temp_inode)) {
ret = PTR_ERR(temp_inode);
- goto out;
+ goto out_put;
}
ret = inode_permission(temp_inode, MAY_READ | MAY_EXEC);
iput(temp_inode);
if (ret) {
ret = -EACCES;
- goto out;
+ goto out_put;
}
if (key.offset == upper_limit.objectid)
break;
if (key.objectid == BTRFS_FIRST_FREE_OBJECTID) {
ret = -EACCES;
- goto out;
+ goto out_put;
}
btrfs_release_path(path);
@@ -2494,15 +2448,16 @@ static int btrfs_search_path_in_tree_user(struct inode *inode,
memmove(args->path, ptr, total_len);
args->path[total_len] = '\0';
+ btrfs_put_root(root);
+ root = NULL;
btrfs_release_path(path);
}
/* Get the bottom subvolume's name from ROOT_REF */
- root = fs_info->tree_root;
key.objectid = treeid;
key.type = BTRFS_ROOT_REF_KEY;
key.offset = args->treeid;
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
if (ret < 0) {
goto out;
} else if (ret > 0) {
@@ -2529,6 +2484,8 @@ static int btrfs_search_path_in_tree_user(struct inode *inode,
read_extent_buffer(leaf, args->name, item_off, item_len);
args->name[item_len] = 0;
+out_put:
+ btrfs_put_root(root);
out:
btrfs_free_path(path);
return ret;
@@ -2653,10 +2610,10 @@ static int btrfs_ioctl_get_subvol_info(struct file *file, void __user *argp)
key.objectid = BTRFS_I(inode)->root->root_key.objectid;
key.type = BTRFS_ROOT_ITEM_KEY;
key.offset = (u64)-1;
- root = btrfs_read_fs_root_no_name(fs_info, &key);
+ root = btrfs_get_fs_root(fs_info, &key, true);
if (IS_ERR(root)) {
ret = PTR_ERR(root);
- goto out;
+ goto out_free;
}
root_item = &root->root_item;
@@ -2689,16 +2646,14 @@ static int btrfs_ioctl_get_subvol_info(struct file *file, void __user *argp)
if (key.objectid != BTRFS_FS_TREE_OBJECTID) {
/* Search root tree for ROOT_BACKREF of this subvolume */
- root = fs_info->tree_root;
-
key.type = BTRFS_ROOT_BACKREF_KEY;
key.offset = 0;
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
if (ret < 0) {
goto out;
} else if (path->slots[0] >=
btrfs_header_nritems(path->nodes[0])) {
- ret = btrfs_next_leaf(root, path);
+ ret = btrfs_next_leaf(fs_info->tree_root, path);
if (ret < 0) {
goto out;
} else if (ret > 0) {
@@ -2733,6 +2688,8 @@ static int btrfs_ioctl_get_subvol_info(struct file *file, void __user *argp)
ret = -EFAULT;
out:
+ btrfs_put_root(root);
+out_free:
btrfs_free_path(path);
kzfree(subvol_info);
return ret;
@@ -2836,7 +2793,8 @@ out:
}
static noinline int btrfs_ioctl_snap_destroy(struct file *file,
- void __user *arg)
+ void __user *arg,
+ bool destroy_v2)
{
struct dentry *parent = file->f_path.dentry;
struct btrfs_fs_info *fs_info = btrfs_sb(parent->d_sb);
@@ -2845,34 +2803,120 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
struct inode *inode;
struct btrfs_root *root = BTRFS_I(dir)->root;
struct btrfs_root *dest = NULL;
- struct btrfs_ioctl_vol_args *vol_args;
- int namelen;
+ struct btrfs_ioctl_vol_args *vol_args = NULL;
+ struct btrfs_ioctl_vol_args_v2 *vol_args2 = NULL;
+ char *subvol_name, *subvol_name_ptr = NULL;
+ int subvol_namelen;
int err = 0;
+ bool destroy_parent = false;
- if (!S_ISDIR(dir->i_mode))
- return -ENOTDIR;
+ if (destroy_v2) {
+ vol_args2 = memdup_user(arg, sizeof(*vol_args2));
+ if (IS_ERR(vol_args2))
+ return PTR_ERR(vol_args2);
- vol_args = memdup_user(arg, sizeof(*vol_args));
- if (IS_ERR(vol_args))
- return PTR_ERR(vol_args);
+ if (vol_args2->flags & ~BTRFS_SUBVOL_DELETE_ARGS_MASK) {
+ err = -EOPNOTSUPP;
+ goto out;
+ }
- vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
- namelen = strlen(vol_args->name);
- if (strchr(vol_args->name, '/') ||
- strncmp(vol_args->name, "..", namelen) == 0) {
- err = -EINVAL;
- goto out;
+ /*
+ * If SPEC_BY_ID is not set, we are looking for the subvolume by
+ * name, same as v1 currently does.
+ */
+ if (!(vol_args2->flags & BTRFS_SUBVOL_SPEC_BY_ID)) {
+ vol_args2->name[BTRFS_SUBVOL_NAME_MAX] = 0;
+ subvol_name = vol_args2->name;
+
+ err = mnt_want_write_file(file);
+ if (err)
+ goto out;
+ } else {
+ if (vol_args2->subvolid < BTRFS_FIRST_FREE_OBJECTID) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ err = mnt_want_write_file(file);
+ if (err)
+ goto out;
+
+ dentry = btrfs_get_dentry(fs_info->sb,
+ BTRFS_FIRST_FREE_OBJECTID,
+ vol_args2->subvolid, 0, 0);
+ if (IS_ERR(dentry)) {
+ err = PTR_ERR(dentry);
+ goto out_drop_write;
+ }
+
+ /*
+ * Change the default parent since the subvolume being
+ * deleted can be outside of the current mount point.
+ */
+ parent = btrfs_get_parent(dentry);
+
+ /*
+ * At this point dentry->d_name can point to '/' if the
+ * subvolume we want to destroy is outside of the
+ * current mount point, so we need to release the
+ * current dentry and execute the lookup to return a new
+ * one with ->d_name pointing to the
+ * <mount point>/subvol_name.
+ */
+ dput(dentry);
+ if (IS_ERR(parent)) {
+ err = PTR_ERR(parent);
+ goto out_drop_write;
+ }
+ dir = d_inode(parent);
+
+ /*
+ * If v2 was used with SPEC_BY_ID, a new parent was
+ * allocated since the subvolume can be outside of the
+ * current mount point. Later on we need to release this
+ * new parent dentry.
+ */
+ destroy_parent = true;
+
+ subvol_name_ptr = btrfs_get_subvol_name_from_objectid(
+ fs_info, vol_args2->subvolid);
+ if (IS_ERR(subvol_name_ptr)) {
+ err = PTR_ERR(subvol_name_ptr);
+ goto free_parent;
+ }
+ /* subvol_name_ptr is already NUL terminated */
+ subvol_name = (char *)kbasename(subvol_name_ptr);
+ }
+ } else {
+ vol_args = memdup_user(arg, sizeof(*vol_args));
+ if (IS_ERR(vol_args))
+ return PTR_ERR(vol_args);
+
+ vol_args->name[BTRFS_PATH_NAME_MAX] = 0;
+ subvol_name = vol_args->name;
+
+ err = mnt_want_write_file(file);
+ if (err)
+ goto out;
}
- err = mnt_want_write_file(file);
- if (err)
- goto out;
+ subvol_namelen = strlen(subvol_name);
+
+ if (strchr(subvol_name, '/') ||
+ strncmp(subvol_name, "..", subvol_namelen) == 0) {
+ err = -EINVAL;
+ goto free_subvol_name;
+ }
+ if (!S_ISDIR(dir->i_mode)) {
+ err = -ENOTDIR;
+ goto free_subvol_name;
+ }
err = down_write_killable_nested(&dir->i_rwsem, I_MUTEX_PARENT);
if (err == -EINTR)
- goto out_drop_write;
- dentry = lookup_one_len(vol_args->name, parent, namelen);
+ goto free_subvol_name;
+ dentry = lookup_one_len(subvol_name, parent, subvol_namelen);
if (IS_ERR(dentry)) {
err = PTR_ERR(dentry);
goto out_unlock_dir;
@@ -2941,9 +2985,15 @@ out_dput:
dput(dentry);
out_unlock_dir:
inode_unlock(dir);
+free_subvol_name:
+ kfree(subvol_name_ptr);
+free_parent:
+ if (destroy_parent)
+ dput(parent);
out_drop_write:
mnt_drop_write_file(file);
out:
+ kfree(vol_args2);
kfree(vol_args);
return err;
}
@@ -3069,8 +3119,7 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg)
goto err_drop;
}
- /* Check for compatibility reject unknown flags */
- if (vol_args->flags & ~BTRFS_VOL_ARG_V2_FLAGS_SUPPORTED) {
+ if (vol_args->flags & ~BTRFS_DEVICE_REMOVE_ARGS_MASK) {
ret = -EOPNOTSUPP;
goto out;
}
@@ -3220,733 +3269,6 @@ out:
return ret;
}
-static void btrfs_double_extent_unlock(struct inode *inode1, u64 loff1,
- struct inode *inode2, u64 loff2, u64 len)
-{
- unlock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1);
- unlock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1);
-}
-
-static void btrfs_double_extent_lock(struct inode *inode1, u64 loff1,
- struct inode *inode2, u64 loff2, u64 len)
-{
- if (inode1 < inode2) {
- swap(inode1, inode2);
- swap(loff1, loff2);
- } else if (inode1 == inode2 && loff2 < loff1) {
- swap(loff1, loff2);
- }
- lock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1);
- lock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1);
-}
-
-static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 len,
- struct inode *dst, u64 dst_loff)
-{
- const u64 bs = BTRFS_I(src)->root->fs_info->sb->s_blocksize;
- int ret;
-
- /*
- * Lock destination range to serialize with concurrent readpages() and
- * source range to serialize with relocation.
- */
- btrfs_double_extent_lock(src, loff, dst, dst_loff, len);
- ret = btrfs_clone(src, dst, loff, len, ALIGN(len, bs), dst_loff, 1);
- btrfs_double_extent_unlock(src, loff, dst, dst_loff, len);
-
- return ret;
-}
-
-#define BTRFS_MAX_DEDUPE_LEN SZ_16M
-
-static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen,
- struct inode *dst, u64 dst_loff)
-{
- int ret;
- u64 i, tail_len, chunk_count;
- struct btrfs_root *root_dst = BTRFS_I(dst)->root;
-
- spin_lock(&root_dst->root_item_lock);
- if (root_dst->send_in_progress) {
- btrfs_warn_rl(root_dst->fs_info,
-"cannot deduplicate to root %llu while send operations are using it (%d in progress)",
- root_dst->root_key.objectid,
- root_dst->send_in_progress);
- spin_unlock(&root_dst->root_item_lock);
- return -EAGAIN;
- }
- root_dst->dedupe_in_progress++;
- spin_unlock(&root_dst->root_item_lock);
-
- tail_len = olen % BTRFS_MAX_DEDUPE_LEN;
- chunk_count = div_u64(olen, BTRFS_MAX_DEDUPE_LEN);
-
- for (i = 0; i < chunk_count; i++) {
- ret = btrfs_extent_same_range(src, loff, BTRFS_MAX_DEDUPE_LEN,
- dst, dst_loff);
- if (ret)
- goto out;
-
- loff += BTRFS_MAX_DEDUPE_LEN;
- dst_loff += BTRFS_MAX_DEDUPE_LEN;
- }
-
- if (tail_len > 0)
- ret = btrfs_extent_same_range(src, loff, tail_len, dst,
- dst_loff);
-out:
- spin_lock(&root_dst->root_item_lock);
- root_dst->dedupe_in_progress--;
- spin_unlock(&root_dst->root_item_lock);
-
- return ret;
-}
-
-static int clone_finish_inode_update(struct btrfs_trans_handle *trans,
- struct inode *inode,
- u64 endoff,
- const u64 destoff,
- const u64 olen,
- int no_time_update)
-{
- struct btrfs_root *root = BTRFS_I(inode)->root;
- int ret;
-
- inode_inc_iversion(inode);
- if (!no_time_update)
- inode->i_mtime = inode->i_ctime = current_time(inode);
- /*
- * We round up to the block size at eof when determining which
- * extents to clone above, but shouldn't round up the file size.
- */
- if (endoff > destoff + olen)
- endoff = destoff + olen;
- if (endoff > inode->i_size)
- btrfs_i_size_write(BTRFS_I(inode), endoff);
-
- ret = btrfs_update_inode(trans, root, inode);
- if (ret) {
- btrfs_abort_transaction(trans, ret);
- btrfs_end_transaction(trans);
- goto out;
- }
- ret = btrfs_end_transaction(trans);
-out:
- return ret;
-}
-
-/*
- * Make sure we do not end up inserting an inline extent into a file that has
- * already other (non-inline) extents. If a file has an inline extent it can
- * not have any other extents and the (single) inline extent must start at the
- * file offset 0. Failing to respect these rules will lead to file corruption,
- * resulting in EIO errors on read/write operations, hitting BUG_ON's in mm, etc
- *
- * We can have extents that have been already written to disk or we can have
- * dirty ranges still in delalloc, in which case the extent maps and items are
- * created only when we run delalloc, and the delalloc ranges might fall outside
- * the range we are currently locking in the inode's io tree. So we check the
- * inode's i_size because of that (i_size updates are done while holding the
- * i_mutex, which we are holding here).
- * We also check to see if the inode has a size not greater than "datal" but has
- * extents beyond it, due to an fallocate with FALLOC_FL_KEEP_SIZE (and we are
- * protected against such concurrent fallocate calls by the i_mutex).
- *
- * If the file has no extents but a size greater than datal, do not allow the
- * copy because we would need turn the inline extent into a non-inline one (even
- * with NO_HOLES enabled). If we find our destination inode only has one inline
- * extent, just overwrite it with the source inline extent if its size is less
- * than the source extent's size, or we could copy the source inline extent's
- * data into the destination inode's inline extent if the later is greater then
- * the former.
- */
-static int clone_copy_inline_extent(struct inode *dst,
- struct btrfs_trans_handle *trans,
- struct btrfs_path *path,
- struct btrfs_key *new_key,
- const u64 drop_start,
- const u64 datal,
- const u64 skip,
- const u64 size,
- char *inline_data)
-{
- struct btrfs_fs_info *fs_info = btrfs_sb(dst->i_sb);
- struct btrfs_root *root = BTRFS_I(dst)->root;
- const u64 aligned_end = ALIGN(new_key->offset + datal,
- fs_info->sectorsize);
- int ret;
- struct btrfs_key key;
-
- if (new_key->offset > 0)
- return -EOPNOTSUPP;
-
- key.objectid = btrfs_ino(BTRFS_I(dst));
- key.type = BTRFS_EXTENT_DATA_KEY;
- key.offset = 0;
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
- if (ret < 0) {
- return ret;
- } else if (ret > 0) {
- if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
- ret = btrfs_next_leaf(root, path);
- if (ret < 0)
- return ret;
- else if (ret > 0)
- goto copy_inline_extent;
- }
- btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
- if (key.objectid == btrfs_ino(BTRFS_I(dst)) &&
- key.type == BTRFS_EXTENT_DATA_KEY) {
- ASSERT(key.offset > 0);
- return -EOPNOTSUPP;
- }
- } else if (i_size_read(dst) <= datal) {
- struct btrfs_file_extent_item *ei;
- u64 ext_len;
-
- /*
- * If the file size is <= datal, make sure there are no other
- * extents following (can happen do to an fallocate call with
- * the flag FALLOC_FL_KEEP_SIZE).
- */
- ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
- struct btrfs_file_extent_item);
- /*
- * If it's an inline extent, it can not have other extents
- * following it.
- */
- if (btrfs_file_extent_type(path->nodes[0], ei) ==
- BTRFS_FILE_EXTENT_INLINE)
- goto copy_inline_extent;
-
- ext_len = btrfs_file_extent_num_bytes(path->nodes[0], ei);
- if (ext_len > aligned_end)
- return -EOPNOTSUPP;
-
- ret = btrfs_next_item(root, path);
- if (ret < 0) {
- return ret;
- } else if (ret == 0) {
- btrfs_item_key_to_cpu(path->nodes[0], &key,
- path->slots[0]);
- if (key.objectid == btrfs_ino(BTRFS_I(dst)) &&
- key.type == BTRFS_EXTENT_DATA_KEY)
- return -EOPNOTSUPP;
- }
- }
-
-copy_inline_extent:
- /*
- * We have no extent items, or we have an extent at offset 0 which may
- * or may not be inlined. All these cases are dealt the same way.
- */
- if (i_size_read(dst) > datal) {
- /*
- * If the destination inode has an inline extent...
- * This would require copying the data from the source inline
- * extent into the beginning of the destination's inline extent.
- * But this is really complex, both extents can be compressed
- * or just one of them, which would require decompressing and
- * re-compressing data (which could increase the new compressed
- * size, not allowing the compressed data to fit anymore in an
- * inline extent).
- * So just don't support this case for now (it should be rare,
- * we are not really saving space when cloning inline extents).
- */
- return -EOPNOTSUPP;
- }
-
- btrfs_release_path(path);
- ret = btrfs_drop_extents(trans, root, dst, drop_start, aligned_end, 1);
- if (ret)
- return ret;
- ret = btrfs_insert_empty_item(trans, root, path, new_key, size);
- if (ret)
- return ret;
-
- if (skip) {
- const u32 start = btrfs_file_extent_calc_inline_size(0);
-
- memmove(inline_data + start, inline_data + start + skip, datal);
- }
-
- write_extent_buffer(path->nodes[0], inline_data,
- btrfs_item_ptr_offset(path->nodes[0],
- path->slots[0]),
- size);
- inode_add_bytes(dst, datal);
- set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(dst)->runtime_flags);
-
- return 0;
-}
-
-/**
- * btrfs_clone() - clone a range from inode file to another
- *
- * @src: Inode to clone from
- * @inode: Inode to clone to
- * @off: Offset within source to start clone from
- * @olen: Original length, passed by user, of range to clone
- * @olen_aligned: Block-aligned value of olen
- * @destoff: Offset within @inode to start clone
- * @no_time_update: Whether to update mtime/ctime on the target inode
- */
-static int btrfs_clone(struct inode *src, struct inode *inode,
- const u64 off, const u64 olen, const u64 olen_aligned,
- const u64 destoff, int no_time_update)
-{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- struct btrfs_root *root = BTRFS_I(inode)->root;
- struct btrfs_path *path = NULL;
- struct extent_buffer *leaf;
- struct btrfs_trans_handle *trans;
- char *buf = NULL;
- struct btrfs_key key;
- u32 nritems;
- int slot;
- int ret;
- const u64 len = olen_aligned;
- u64 last_dest_end = destoff;
-
- ret = -ENOMEM;
- buf = kvmalloc(fs_info->nodesize, GFP_KERNEL);
- if (!buf)
- return ret;
-
- path = btrfs_alloc_path();
- if (!path) {
- kvfree(buf);
- return ret;
- }
-
- path->reada = READA_FORWARD;
- /* clone data */
- key.objectid = btrfs_ino(BTRFS_I(src));
- key.type = BTRFS_EXTENT_DATA_KEY;
- key.offset = off;
-
- while (1) {
- u64 next_key_min_offset = key.offset + 1;
- struct btrfs_file_extent_item *extent;
- int type;
- u32 size;
- struct btrfs_key new_key;
- u64 disko = 0, diskl = 0;
- u64 datao = 0, datal = 0;
- u8 comp;
- u64 drop_start;
-
- /*
- * note the key will change type as we walk through the
- * tree.
- */
- path->leave_spinning = 1;
- ret = btrfs_search_slot(NULL, BTRFS_I(src)->root, &key, path,
- 0, 0);
- if (ret < 0)
- goto out;
- /*
- * First search, if no extent item that starts at offset off was
- * found but the previous item is an extent item, it's possible
- * it might overlap our target range, therefore process it.
- */
- if (key.offset == off && ret > 0 && path->slots[0] > 0) {
- btrfs_item_key_to_cpu(path->nodes[0], &key,
- path->slots[0] - 1);
- if (key.type == BTRFS_EXTENT_DATA_KEY)
- path->slots[0]--;
- }
-
- nritems = btrfs_header_nritems(path->nodes[0]);
-process_slot:
- if (path->slots[0] >= nritems) {
- ret = btrfs_next_leaf(BTRFS_I(src)->root, path);
- if (ret < 0)
- goto out;
- if (ret > 0)
- break;
- nritems = btrfs_header_nritems(path->nodes[0]);
- }
- leaf = path->nodes[0];
- slot = path->slots[0];
-
- btrfs_item_key_to_cpu(leaf, &key, slot);
- if (key.type > BTRFS_EXTENT_DATA_KEY ||
- key.objectid != btrfs_ino(BTRFS_I(src)))
- break;
-
- ASSERT(key.type == BTRFS_EXTENT_DATA_KEY);
-
- extent = btrfs_item_ptr(leaf, slot,
- struct btrfs_file_extent_item);
- comp = btrfs_file_extent_compression(leaf, extent);
- type = btrfs_file_extent_type(leaf, extent);
- if (type == BTRFS_FILE_EXTENT_REG ||
- type == BTRFS_FILE_EXTENT_PREALLOC) {
- disko = btrfs_file_extent_disk_bytenr(leaf, extent);
- diskl = btrfs_file_extent_disk_num_bytes(leaf, extent);
- datao = btrfs_file_extent_offset(leaf, extent);
- datal = btrfs_file_extent_num_bytes(leaf, extent);
- } else if (type == BTRFS_FILE_EXTENT_INLINE) {
- /* Take upper bound, may be compressed */
- datal = btrfs_file_extent_ram_bytes(leaf, extent);
- }
-
- /*
- * The first search might have left us at an extent item that
- * ends before our target range's start, can happen if we have
- * holes and NO_HOLES feature enabled.
- */
- if (key.offset + datal <= off) {
- path->slots[0]++;
- goto process_slot;
- } else if (key.offset >= off + len) {
- break;
- }
- next_key_min_offset = key.offset + datal;
- size = btrfs_item_size_nr(leaf, slot);
- read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf, slot),
- size);
-
- btrfs_release_path(path);
- path->leave_spinning = 0;
-
- memcpy(&new_key, &key, sizeof(new_key));
- new_key.objectid = btrfs_ino(BTRFS_I(inode));
- if (off <= key.offset)
- new_key.offset = key.offset + destoff - off;
- else
- new_key.offset = destoff;
-
- /*
- * Deal with a hole that doesn't have an extent item that
- * represents it (NO_HOLES feature enabled).
- * This hole is either in the middle of the cloning range or at
- * the beginning (fully overlaps it or partially overlaps it).
- */
- if (new_key.offset != last_dest_end)
- drop_start = last_dest_end;
- else
- drop_start = new_key.offset;
-
- if (type == BTRFS_FILE_EXTENT_REG ||
- type == BTRFS_FILE_EXTENT_PREALLOC) {
- struct btrfs_clone_extent_info clone_info;
-
- /*
- * a | --- range to clone ---| b
- * | ------------- extent ------------- |
- */
-
- /* Subtract range b */
- if (key.offset + datal > off + len)
- datal = off + len - key.offset;
-
- /* Subtract range a */
- if (off > key.offset) {
- datao += off - key.offset;
- datal -= off - key.offset;
- }
-
- clone_info.disk_offset = disko;
- clone_info.disk_len = diskl;
- clone_info.data_offset = datao;
- clone_info.data_len = datal;
- clone_info.file_offset = new_key.offset;
- clone_info.extent_buf = buf;
- clone_info.item_size = size;
- ret = btrfs_punch_hole_range(inode, path,
- drop_start,
- new_key.offset + datal - 1,
- &clone_info, &trans);
- if (ret)
- goto out;
- } else if (type == BTRFS_FILE_EXTENT_INLINE) {
- u64 skip = 0;
- u64 trim = 0;
-
- if (off > key.offset) {
- skip = off - key.offset;
- new_key.offset += skip;
- }
-
- if (key.offset + datal > off + len)
- trim = key.offset + datal - (off + len);
-
- if (comp && (skip || trim)) {
- ret = -EINVAL;
- goto out;
- }
- size -= skip + trim;
- datal -= skip + trim;
-
- /*
- * If our extent is inline, we know we will drop or
- * adjust at most 1 extent item in the destination root.
- *
- * 1 - adjusting old extent (we may have to split it)
- * 1 - add new extent
- * 1 - inode update
- */
- trans = btrfs_start_transaction(root, 3);
- if (IS_ERR(trans)) {
- ret = PTR_ERR(trans);
- goto out;
- }
-
- ret = clone_copy_inline_extent(inode, trans, path,
- &new_key, drop_start,
- datal, skip, size, buf);
- if (ret) {
- if (ret != -EOPNOTSUPP)
- btrfs_abort_transaction(trans, ret);
- btrfs_end_transaction(trans);
- goto out;
- }
- }
-
- btrfs_release_path(path);
-
- last_dest_end = ALIGN(new_key.offset + datal,
- fs_info->sectorsize);
- ret = clone_finish_inode_update(trans, inode, last_dest_end,
- destoff, olen, no_time_update);
- if (ret)
- goto out;
- if (new_key.offset + datal >= destoff + len)
- break;
-
- btrfs_release_path(path);
- key.offset = next_key_min_offset;
-
- if (fatal_signal_pending(current)) {
- ret = -EINTR;
- goto out;
- }
- }
- ret = 0;
-
- if (last_dest_end < destoff + len) {
- /*
- * We have an implicit hole that fully or partially overlaps our
- * cloning range at its end. This means that we either have the
- * NO_HOLES feature enabled or the implicit hole happened due to
- * mixing buffered and direct IO writes against this file.
- */
- btrfs_release_path(path);
- path->leave_spinning = 0;
-
- ret = btrfs_punch_hole_range(inode, path,
- last_dest_end, destoff + len - 1,
- NULL, &trans);
- if (ret)
- goto out;
-
- ret = clone_finish_inode_update(trans, inode, destoff + len,
- destoff, olen, no_time_update);
- }
-
-out:
- btrfs_free_path(path);
- kvfree(buf);
- return ret;
-}
-
-static noinline int btrfs_clone_files(struct file *file, struct file *file_src,
- u64 off, u64 olen, u64 destoff)
-{
- struct inode *inode = file_inode(file);
- struct inode *src = file_inode(file_src);
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- int ret;
- u64 len = olen;
- u64 bs = fs_info->sb->s_blocksize;
-
- /*
- * TODO:
- * - split compressed inline extents. annoying: we need to
- * decompress into destination's address_space (the file offset
- * may change, so source mapping won't do), then recompress (or
- * otherwise reinsert) a subrange.
- *
- * - split destination inode's inline extents. The inline extents can
- * be either compressed or non-compressed.
- */
-
- /*
- * VFS's generic_remap_file_range_prep() protects us from cloning the
- * eof block into the middle of a file, which would result in corruption
- * if the file size is not blocksize aligned. So we don't need to check
- * for that case here.
- */
- if (off + len == src->i_size)
- len = ALIGN(src->i_size, bs) - off;
-
- if (destoff > inode->i_size) {
- const u64 wb_start = ALIGN_DOWN(inode->i_size, bs);
-
- ret = btrfs_cont_expand(inode, inode->i_size, destoff);
- if (ret)
- return ret;
- /*
- * We may have truncated the last block if the inode's size is
- * not sector size aligned, so we need to wait for writeback to
- * complete before proceeding further, otherwise we can race
- * with cloning and attempt to increment a reference to an
- * extent that no longer exists (writeback completed right after
- * we found the previous extent covering eof and before we
- * attempted to increment its reference count).
- */
- ret = btrfs_wait_ordered_range(inode, wb_start,
- destoff - wb_start);
- if (ret)
- return ret;
- }
-
- /*
- * Lock destination range to serialize with concurrent readpages() and
- * source range to serialize with relocation.
- */
- btrfs_double_extent_lock(src, off, inode, destoff, len);
- ret = btrfs_clone(src, inode, off, olen, len, destoff, 0);
- btrfs_double_extent_unlock(src, off, inode, destoff, len);
- /*
- * Truncate page cache pages so that future reads will see the cloned
- * data immediately and not the previous data.
- */
- truncate_inode_pages_range(&inode->i_data,
- round_down(destoff, PAGE_SIZE),
- round_up(destoff + len, PAGE_SIZE) - 1);
-
- return ret;
-}
-
-static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in,
- struct file *file_out, loff_t pos_out,
- loff_t *len, unsigned int remap_flags)
-{
- struct inode *inode_in = file_inode(file_in);
- struct inode *inode_out = file_inode(file_out);
- u64 bs = BTRFS_I(inode_out)->root->fs_info->sb->s_blocksize;
- bool same_inode = inode_out == inode_in;
- u64 wb_len;
- int ret;
-
- if (!(remap_flags & REMAP_FILE_DEDUP)) {
- struct btrfs_root *root_out = BTRFS_I(inode_out)->root;
-
- if (btrfs_root_readonly(root_out))
- return -EROFS;
-
- if (file_in->f_path.mnt != file_out->f_path.mnt ||
- inode_in->i_sb != inode_out->i_sb)
- return -EXDEV;
- }
-
- /* don't make the dst file partly checksummed */
- if ((BTRFS_I(inode_in)->flags & BTRFS_INODE_NODATASUM) !=
- (BTRFS_I(inode_out)->flags & BTRFS_INODE_NODATASUM)) {
- return -EINVAL;
- }
-
- /*
- * Now that the inodes are locked, we need to start writeback ourselves
- * and can not rely on the writeback from the VFS's generic helper
- * generic_remap_file_range_prep() because:
- *
- * 1) For compression we must call filemap_fdatawrite_range() range
- * twice (btrfs_fdatawrite_range() does it for us), and the generic
- * helper only calls it once;
- *
- * 2) filemap_fdatawrite_range(), called by the generic helper only
- * waits for the writeback to complete, i.e. for IO to be done, and
- * not for the ordered extents to complete. We need to wait for them
- * to complete so that new file extent items are in the fs tree.
- */
- if (*len == 0 && !(remap_flags & REMAP_FILE_DEDUP))
- wb_len = ALIGN(inode_in->i_size, bs) - ALIGN_DOWN(pos_in, bs);
- else
- wb_len = ALIGN(*len, bs);
-
- /*
- * Since we don't lock ranges, wait for ongoing lockless dio writes (as
- * any in progress could create its ordered extents after we wait for
- * existing ordered extents below).
- */
- inode_dio_wait(inode_in);
- if (!same_inode)
- inode_dio_wait(inode_out);
-
- /*
- * Workaround to make sure NOCOW buffered write reach disk as NOCOW.
- *
- * Btrfs' back references do not have a block level granularity, they
- * work at the whole extent level.
- * NOCOW buffered write without data space reserved may not be able
- * to fall back to CoW due to lack of data space, thus could cause
- * data loss.
- *
- * Here we take a shortcut by flushing the whole inode, so that all
- * nocow write should reach disk as nocow before we increase the
- * reference of the extent. We could do better by only flushing NOCOW
- * data, but that needs extra accounting.
- *
- * Also we don't need to check ASYNC_EXTENT, as async extent will be
- * CoWed anyway, not affecting nocow part.
- */
- ret = filemap_flush(inode_in->i_mapping);
- if (ret < 0)
- return ret;
-
- ret = btrfs_wait_ordered_range(inode_in, ALIGN_DOWN(pos_in, bs),
- wb_len);
- if (ret < 0)
- return ret;
- ret = btrfs_wait_ordered_range(inode_out, ALIGN_DOWN(pos_out, bs),
- wb_len);
- if (ret < 0)
- return ret;
-
- return generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out,
- len, remap_flags);
-}
-
-loff_t btrfs_remap_file_range(struct file *src_file, loff_t off,
- struct file *dst_file, loff_t destoff, loff_t len,
- unsigned int remap_flags)
-{
- struct inode *src_inode = file_inode(src_file);
- struct inode *dst_inode = file_inode(dst_file);
- bool same_inode = dst_inode == src_inode;
- int ret;
-
- if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY))
- return -EINVAL;
-
- if (same_inode)
- inode_lock(src_inode);
- else
- lock_two_nondirectories(src_inode, dst_inode);
-
- ret = btrfs_remap_file_range_prep(src_file, off, dst_file, destoff,
- &len, remap_flags);
- if (ret < 0 || len == 0)
- goto out_unlock;
-
- if (remap_flags & REMAP_FILE_DEDUP)
- ret = btrfs_extent_same(src_inode, off, len, dst_inode, destoff);
- else
- ret = btrfs_clone_files(dst_file, src_file, off, len, destoff);
-
-out_unlock:
- if (same_inode)
- inode_unlock(src_inode);
- else
- unlock_two_nondirectories(src_inode, dst_inode);
-
- return ret < 0 ? ret : len;
-}
-
static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
{
struct inode *inode = file_inode(file);
@@ -3955,7 +3277,7 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
struct btrfs_root *new_root;
struct btrfs_dir_item *di;
struct btrfs_trans_handle *trans;
- struct btrfs_path *path;
+ struct btrfs_path *path = NULL;
struct btrfs_key location;
struct btrfs_disk_key disk_key;
u64 objectid = 0;
@@ -3981,49 +3303,51 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
location.type = BTRFS_ROOT_ITEM_KEY;
location.offset = (u64)-1;
- new_root = btrfs_read_fs_root_no_name(fs_info, &location);
+ new_root = btrfs_get_fs_root(fs_info, &location, true);
if (IS_ERR(new_root)) {
ret = PTR_ERR(new_root);
goto out;
}
if (!is_fstree(new_root->root_key.objectid)) {
ret = -ENOENT;
- goto out;
+ goto out_free;
}
path = btrfs_alloc_path();
if (!path) {
ret = -ENOMEM;
- goto out;
+ goto out_free;
}
path->leave_spinning = 1;
trans = btrfs_start_transaction(root, 1);
if (IS_ERR(trans)) {
- btrfs_free_path(path);
ret = PTR_ERR(trans);
- goto out;
+ goto out_free;
}
dir_id = btrfs_super_root_dir(fs_info->super_copy);
di = btrfs_lookup_dir_item(trans, fs_info->tree_root, path,
dir_id, "default", 7, 1);
if (IS_ERR_OR_NULL(di)) {
- btrfs_free_path(path);
+ btrfs_release_path(path);
btrfs_end_transaction(trans);
btrfs_err(fs_info,
"Umm, you don't have the default diritem, this isn't going to work");
ret = -ENOENT;
- goto out;
+ goto out_free;
}
btrfs_cpu_key_to_disk(&disk_key, &new_root->root_key);
btrfs_set_dir_item_key(path->nodes[0], di, &disk_key);
btrfs_mark_buffer_dirty(path->nodes[0]);
- btrfs_free_path(path);
+ btrfs_release_path(path);
btrfs_set_fs_incompat(fs_info, DEFAULT_SUBVOL);
btrfs_end_transaction(trans);
+out_free:
+ btrfs_put_root(new_root);
+ btrfs_free_path(path);
out:
mnt_drop_write_file(file);
return ret;
@@ -5465,7 +4789,9 @@ long btrfs_ioctl(struct file *file, unsigned int
case BTRFS_IOC_SUBVOL_CREATE_V2:
return btrfs_ioctl_snap_create_v2(file, argp, 1);
case BTRFS_IOC_SNAP_DESTROY:
- return btrfs_ioctl_snap_destroy(file, argp);
+ return btrfs_ioctl_snap_destroy(file, argp, false);
+ case BTRFS_IOC_SNAP_DESTROY_V2:
+ return btrfs_ioctl_snap_destroy(file, argp, true);
case BTRFS_IOC_SUBVOL_GETFLAGS:
return btrfs_ioctl_subvol_getflags(file, argp);
case BTRFS_IOC_SUBVOL_SETFLAGS:
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 571c4826c428..fb647d8cf527 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -523,3 +523,138 @@ void btrfs_unlock_up_safe(struct btrfs_path *path, int level)
path->locks[i] = 0;
}
}
+
+/*
+ * Loop around taking references on and locking the root node of the tree until
+ * we end up with a lock on the root node.
+ *
+ * Return: root extent buffer with write lock held
+ */
+struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
+{
+ struct extent_buffer *eb;
+
+ while (1) {
+ eb = btrfs_root_node(root);
+ btrfs_tree_lock(eb);
+ if (eb == root->node)
+ break;
+ btrfs_tree_unlock(eb);
+ free_extent_buffer(eb);
+ }
+ return eb;
+}
+
+/*
+ * Loop around taking references on and locking the root node of the tree until
+ * we end up with a lock on the root node.
+ *
+ * Return: root extent buffer with read lock held
+ */
+struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root)
+{
+ struct extent_buffer *eb;
+
+ while (1) {
+ eb = btrfs_root_node(root);
+ btrfs_tree_read_lock(eb);
+ if (eb == root->node)
+ break;
+ btrfs_tree_read_unlock(eb);
+ free_extent_buffer(eb);
+ }
+ return eb;
+}
+
+/*
+ * DREW locks
+ * ==========
+ *
+ * DREW stands for double-reader-writer-exclusion lock. It's used in situations
+ * where you want to provide A-B exclusion but not AA or BB.
+ *
+ * The current implementation gives priority to readers. If a reader and a
+ * writer both race to acquire their respective sides of the lock, the writer
+ * yields its lock as soon as it detects a concurrent reader. Additionally,
+ * if there are pending readers, no new writers are allowed to come in and
+ * acquire the lock.
+ */
+
+int btrfs_drew_lock_init(struct btrfs_drew_lock *lock)
+{
+ int ret;
+
+ ret = percpu_counter_init(&lock->writers, 0, GFP_KERNEL);
+ if (ret)
+ return ret;
+
+ atomic_set(&lock->readers, 0);
+ init_waitqueue_head(&lock->pending_readers);
+ init_waitqueue_head(&lock->pending_writers);
+
+ return 0;
+}
+
+void btrfs_drew_lock_destroy(struct btrfs_drew_lock *lock)
+{
+ percpu_counter_destroy(&lock->writers);
+}
+
+/* Return true if acquisition is successful, false otherwise */
+bool btrfs_drew_try_write_lock(struct btrfs_drew_lock *lock)
+{
+ if (atomic_read(&lock->readers))
+ return false;
+
+ percpu_counter_inc(&lock->writers);
+
+ /* Ensure writers count is updated before we check for pending readers */
+ smp_mb();
+ if (atomic_read(&lock->readers)) {
+ btrfs_drew_write_unlock(lock);
+ return false;
+ }
+
+ return true;
+}
+
+void btrfs_drew_write_lock(struct btrfs_drew_lock *lock)
+{
+ while (true) {
+ if (btrfs_drew_try_write_lock(lock))
+ return;
+ wait_event(lock->pending_writers, !atomic_read(&lock->readers));
+ }
+}
+
+void btrfs_drew_write_unlock(struct btrfs_drew_lock *lock)
+{
+ percpu_counter_dec(&lock->writers);
+ cond_wake_up(&lock->pending_readers);
+}
+
+void btrfs_drew_read_lock(struct btrfs_drew_lock *lock)
+{
+ atomic_inc(&lock->readers);
+
+ /*
+ * Ensure the pending reader count is perceived BEFORE this reader
+ * goes to sleep in case of active writers. This guarantees new writers
+ * won't be allowed and that the current reader will be woken up when
+ * the last active writer finishes its jobs.
+ */
+ smp_mb__after_atomic();
+
+ wait_event(lock->pending_readers,
+ percpu_counter_sum(&lock->writers) == 0);
+}
+
+void btrfs_drew_read_unlock(struct btrfs_drew_lock *lock)
+{
+ /*
+ * atomic_dec_and_test implies a full barrier, so woken up writers
+ * are guaranteed to see the decrement
+ */
+ if (atomic_dec_and_test(&lock->readers))
+ wake_up(&lock->pending_writers);
+}
diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h
index 21a285883e89..d715846c10b8 100644
--- a/fs/btrfs/locking.h
+++ b/fs/btrfs/locking.h
@@ -6,6 +6,9 @@
#ifndef BTRFS_LOCKING_H
#define BTRFS_LOCKING_H
+#include <linux/atomic.h>
+#include <linux/wait.h>
+#include <linux/percpu_counter.h>
#include "extent_io.h"
#define BTRFS_WRITE_LOCK 1
@@ -13,6 +16,8 @@
#define BTRFS_WRITE_LOCK_BLOCKING 3
#define BTRFS_READ_LOCK_BLOCKING 4
+struct btrfs_path;
+
void btrfs_tree_lock(struct extent_buffer *eb);
void btrfs_tree_unlock(struct extent_buffer *eb);
@@ -48,4 +53,19 @@ static inline void btrfs_tree_unlock_rw(struct extent_buffer *eb, int rw)
BUG();
}
+struct btrfs_drew_lock {
+ atomic_t readers;
+ struct percpu_counter writers;
+ wait_queue_head_t pending_writers;
+ wait_queue_head_t pending_readers;
+};
+
+int btrfs_drew_lock_init(struct btrfs_drew_lock *lock);
+void btrfs_drew_lock_destroy(struct btrfs_drew_lock *lock);
+void btrfs_drew_write_lock(struct btrfs_drew_lock *lock);
+bool btrfs_drew_try_write_lock(struct btrfs_drew_lock *lock);
+void btrfs_drew_write_unlock(struct btrfs_drew_lock *lock);
+void btrfs_drew_read_lock(struct btrfs_drew_lock *lock);
+void btrfs_drew_read_unlock(struct btrfs_drew_lock *lock);
+
#endif
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index ecb9fb6a6fe0..e13b3d28c063 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -580,7 +580,7 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr,
while (!list_empty(&splice) && nr) {
root = list_first_entry(&splice, struct btrfs_root,
ordered_root);
- root = btrfs_grab_fs_root(root);
+ root = btrfs_grab_root(root);
BUG_ON(!root);
list_move_tail(&root->ordered_root,
&fs_info->ordered_roots);
@@ -588,7 +588,7 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr,
done = btrfs_wait_ordered_extents(root, nr,
range_start, range_len);
- btrfs_put_fs_root(root);
+ btrfs_put_root(root);
spin_lock(&fs_info->ordered_root_lock);
if (nr != U64_MAX) {
@@ -679,10 +679,15 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
}
btrfs_start_ordered_extent(inode, ordered, 1);
end = ordered->file_offset;
+ /*
+ * If the ordered extent had an error save the error but don't
+ * exit without waiting first for all other ordered extents in
+ * the range to complete.
+ */
if (test_bit(BTRFS_ORDERED_IOERR, &ordered->flags))
ret = -EIO;
btrfs_put_ordered_extent(ordered);
- if (ret || end == 0 || end == start)
+ if (end == 0 || end == start)
break;
end--;
}
@@ -781,134 +786,6 @@ out:
}
/*
- * After an extent is done, call this to conditionally update the on disk
- * i_size. i_size is updated to cover any fully written part of the file.
- */
-int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
- struct btrfs_ordered_extent *ordered)
-{
- struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
- u64 disk_i_size;
- u64 new_i_size;
- u64 i_size = i_size_read(inode);
- struct rb_node *node;
- struct rb_node *prev = NULL;
- struct btrfs_ordered_extent *test;
- int ret = 1;
- u64 orig_offset = offset;
-
- spin_lock_irq(&tree->lock);
- if (ordered) {
- offset = entry_end(ordered);
- if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags))
- offset = min(offset,
- ordered->file_offset +
- ordered->truncated_len);
- } else {
- offset = ALIGN(offset, btrfs_inode_sectorsize(inode));
- }
- disk_i_size = BTRFS_I(inode)->disk_i_size;
-
- /*
- * truncate file.
- * If ordered is not NULL, then this is called from endio and
- * disk_i_size will be updated by either truncate itself or any
- * in-flight IOs which are inside the disk_i_size.
- *
- * Because btrfs_setsize() may set i_size with disk_i_size if truncate
- * fails somehow, we need to make sure we have a precise disk_i_size by
- * updating it as usual.
- *
- */
- if (!ordered && disk_i_size > i_size) {
- BTRFS_I(inode)->disk_i_size = orig_offset;
- ret = 0;
- goto out;
- }
-
- /*
- * if the disk i_size is already at the inode->i_size, or
- * this ordered extent is inside the disk i_size, we're done
- */
- if (disk_i_size == i_size)
- goto out;
-
- /*
- * We still need to update disk_i_size if outstanding_isize is greater
- * than disk_i_size.
- */
- if (offset <= disk_i_size &&
- (!ordered || ordered->outstanding_isize <= disk_i_size))
- goto out;
-
- /*
- * walk backward from this ordered extent to disk_i_size.
- * if we find an ordered extent then we can't update disk i_size
- * yet
- */
- if (ordered) {
- node = rb_prev(&ordered->rb_node);
- } else {
- prev = tree_search(tree, offset);
- /*
- * we insert file extents without involving ordered struct,
- * so there should be no ordered struct cover this offset
- */
- if (prev) {
- test = rb_entry(prev, struct btrfs_ordered_extent,
- rb_node);
- BUG_ON(offset_in_entry(test, offset));
- }
- node = prev;
- }
- for (; node; node = rb_prev(node)) {
- test = rb_entry(node, struct btrfs_ordered_extent, rb_node);
-
- /* We treat this entry as if it doesn't exist */
- if (test_bit(BTRFS_ORDERED_UPDATED_ISIZE, &test->flags))
- continue;
-
- if (entry_end(test) <= disk_i_size)
- break;
- if (test->file_offset >= i_size)
- break;
-
- /*
- * We don't update disk_i_size now, so record this undealt
- * i_size. Or we will not know the real i_size.
- */
- if (test->outstanding_isize < offset)
- test->outstanding_isize = offset;
- if (ordered &&
- ordered->outstanding_isize > test->outstanding_isize)
- test->outstanding_isize = ordered->outstanding_isize;
- goto out;
- }
- new_i_size = min_t(u64, offset, i_size);
-
- /*
- * Some ordered extents may completed before the current one, and
- * we hold the real i_size in ->outstanding_isize.
- */
- if (ordered && ordered->outstanding_isize > new_i_size)
- new_i_size = min_t(u64, ordered->outstanding_isize, i_size);
- BTRFS_I(inode)->disk_i_size = new_i_size;
- ret = 0;
-out:
- /*
- * We need to do this because we can't remove ordered extents until
- * after the i_disk_size has been updated and then the inode has been
- * updated to reflect the change, so we need to tell anybody who finds
- * this ordered extent that we've already done all the real work, we
- * just haven't completed all the other work.
- */
- if (ordered)
- set_bit(BTRFS_ORDERED_UPDATED_ISIZE, &ordered->flags);
- spin_unlock_irq(&tree->lock);
- return ret;
-}
-
-/*
* search the ordered extents for one corresponding to 'offset' and
* try to find a checksum. This is used because we allow pages to
* be reclaimed before their checksum is actually put into the btree
@@ -958,7 +835,6 @@ out:
* btrfs_flush_ordered_range - Lock the passed range and ensures all pending
* ordered extents in it are run to completion.
*
- * @tree: IO tree used for locking out other users of the range
* @inode: Inode whose ordered tree is to be searched
* @start: Beginning of range to flush
* @end: Last byte of range to lock
@@ -968,8 +844,7 @@ out:
* This function always returns with the given range locked, ensuring after it's
* called no order extent can be pending.
*/
-void btrfs_lock_and_flush_ordered_range(struct extent_io_tree *tree,
- struct btrfs_inode *inode, u64 start,
+void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start,
u64 end,
struct extent_state **cached_state)
{
@@ -981,7 +856,7 @@ void btrfs_lock_and_flush_ordered_range(struct extent_io_tree *tree,
cachedp = cached_state;
while (1) {
- lock_extent_bits(tree, start, end, cachedp);
+ lock_extent_bits(&inode->io_tree, start, end, cachedp);
ordered = btrfs_lookup_ordered_range(inode, start,
end - start + 1);
if (!ordered) {
@@ -994,7 +869,7 @@ void btrfs_lock_and_flush_ordered_range(struct extent_io_tree *tree,
refcount_dec(&cache->refs);
break;
}
- unlock_extent_cached(tree, start, end, cachedp);
+ unlock_extent_cached(&inode->io_tree, start, end, cachedp);
btrfs_start_ordered_extent(&inode->vfs_inode, ordered, 1);
btrfs_put_ordered_extent(ordered);
}
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 3beb4da4ab41..c01c9698250b 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -52,11 +52,6 @@ enum {
BTRFS_ORDERED_DIRECT,
/* We had an io error when writing this out */
BTRFS_ORDERED_IOERR,
- /*
- * indicates whether this ordered extent has done its due diligence in
- * updating the isize
- */
- BTRFS_ORDERED_UPDATED_ISIZE,
/* Set when we have to truncate an extent */
BTRFS_ORDERED_TRUNCATED,
/* Regular IO for COW */
@@ -182,16 +177,13 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_range(
struct btrfs_inode *inode,
u64 file_offset,
u64 len);
-int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
- struct btrfs_ordered_extent *ordered);
int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
u8 *sum, int len);
u64 btrfs_wait_ordered_extents(struct btrfs_root *root, u64 nr,
const u64 range_start, const u64 range_len);
void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr,
const u64 range_start, const u64 range_len);
-void btrfs_lock_and_flush_ordered_range(struct extent_io_tree *tree,
- struct btrfs_inode *inode, u64 start,
+void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start,
u64 end,
struct extent_state **cached_state);
int __init ordered_data_init(void);
diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c
index deb59e7cfcac..ff1ff90e48b1 100644
--- a/fs/btrfs/props.c
+++ b/fs/btrfs/props.c
@@ -383,7 +383,7 @@ static int inherit_props(struct btrfs_trans_handle *trans,
if (need_reserve) {
btrfs_block_rsv_release(fs_info, trans->block_rsv,
- num_bytes);
+ num_bytes, NULL);
if (ret)
return ret;
}
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 98d9a50352d6..c3888fb367e7 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1030,6 +1030,7 @@ out_add_root:
ret = qgroup_rescan_init(fs_info, 0, 1);
if (!ret) {
qgroup_rescan_zero_tracking(fs_info);
+ fs_info->qgroup_rescan_running = true;
btrfs_queue_work(fs_info->qgroup_rescan_workers,
&fs_info->qgroup_rescan_work);
}
@@ -1037,11 +1038,8 @@ out_add_root:
out_free_path:
btrfs_free_path(path);
out_free_root:
- if (ret) {
- free_extent_buffer(quota_root->node);
- free_extent_buffer(quota_root->commit_root);
- kfree(quota_root);
- }
+ if (ret)
+ btrfs_put_root(quota_root);
out:
if (ret) {
ulist_free(fs_info->qgroup_ulist);
@@ -1104,9 +1102,7 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info)
btrfs_tree_unlock(quota_root->node);
btrfs_free_tree_block(trans, quota_root, quota_root->node, 0, 1);
- free_extent_buffer(quota_root->node);
- free_extent_buffer(quota_root->commit_root);
- kfree(quota_root);
+ btrfs_put_root(quota_root);
end_trans:
ret = btrfs_end_transaction(trans);
@@ -3237,7 +3233,6 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
}
mutex_lock(&fs_info->qgroup_rescan_lock);
- spin_lock(&fs_info->qgroup_lock);
if (init_flags) {
if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
@@ -3252,7 +3247,6 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
}
if (ret) {
- spin_unlock(&fs_info->qgroup_lock);
mutex_unlock(&fs_info->qgroup_rescan_lock);
return ret;
}
@@ -3263,9 +3257,6 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
sizeof(fs_info->qgroup_rescan_progress));
fs_info->qgroup_rescan_progress.objectid = progress_objectid;
init_completion(&fs_info->qgroup_rescan_completion);
- fs_info->qgroup_rescan_running = true;
-
- spin_unlock(&fs_info->qgroup_lock);
mutex_unlock(&fs_info->qgroup_rescan_lock);
btrfs_init_work(&fs_info->qgroup_rescan_work,
@@ -3326,8 +3317,11 @@ btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info)
qgroup_rescan_zero_tracking(fs_info);
+ mutex_lock(&fs_info->qgroup_rescan_lock);
+ fs_info->qgroup_rescan_running = true;
btrfs_queue_work(fs_info->qgroup_rescan_workers,
&fs_info->qgroup_rescan_work);
+ mutex_unlock(&fs_info->qgroup_rescan_lock);
return 0;
}
@@ -3339,9 +3333,7 @@ int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info,
int ret = 0;
mutex_lock(&fs_info->qgroup_rescan_lock);
- spin_lock(&fs_info->qgroup_lock);
running = fs_info->qgroup_rescan_running;
- spin_unlock(&fs_info->qgroup_lock);
mutex_unlock(&fs_info->qgroup_rescan_lock);
if (!running)
@@ -3363,9 +3355,13 @@ int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info,
void
btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info)
{
- if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN)
+ if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
+ mutex_lock(&fs_info->qgroup_rescan_lock);
+ fs_info->qgroup_rescan_running = true;
btrfs_queue_work(fs_info->qgroup_rescan_workers,
&fs_info->qgroup_rescan_work);
+ mutex_unlock(&fs_info->qgroup_rescan_lock);
+ }
}
/*
@@ -4002,3 +3998,16 @@ out:
}
return ret;
}
+
+void btrfs_qgroup_destroy_extent_records(struct btrfs_transaction *trans)
+{
+ struct btrfs_qgroup_extent_record *entry;
+ struct btrfs_qgroup_extent_record *next;
+ struct rb_root *root;
+
+ root = &trans->delayed_refs.dirty_extent_root;
+ rbtree_postorder_for_each_entry_safe(entry, next, root, node) {
+ ulist_free(entry->old_roots);
+ kfree(entry);
+ }
+}
diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
index 236f12224d52..1bc654459469 100644
--- a/fs/btrfs/qgroup.h
+++ b/fs/btrfs/qgroup.h
@@ -414,5 +414,6 @@ int btrfs_qgroup_add_swapped_blocks(struct btrfs_trans_handle *trans,
u64 last_snapshot);
int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct extent_buffer *eb);
+void btrfs_qgroup_destroy_extent_records(struct btrfs_transaction *trans);
#endif
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index a8e53c8e7b01..c870ef70f817 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -206,7 +206,6 @@ int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
struct btrfs_stripe_hash *h;
int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS;
int i;
- int table_size;
if (info->stripe_hash_table)
return 0;
@@ -218,8 +217,7 @@ int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
* Try harder to allocate and fallback to vmalloc to lower the chance
* of a failing mount.
*/
- table_size = sizeof(*table) + sizeof(*h) * num_entries;
- table = kvzalloc(table_size, GFP_KERNEL);
+ table = kvzalloc(struct_size(table, table, num_entries), GFP_KERNEL);
if (!table)
return -ENOMEM;
@@ -1196,22 +1194,19 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
int nr_data = rbio->nr_data;
int stripe;
int pagenr;
- int p_stripe = -1;
- int q_stripe = -1;
+ bool has_qstripe;
struct bio_list bio_list;
struct bio *bio;
int ret;
bio_list_init(&bio_list);
- if (rbio->real_stripes - rbio->nr_data == 1) {
- p_stripe = rbio->real_stripes - 1;
- } else if (rbio->real_stripes - rbio->nr_data == 2) {
- p_stripe = rbio->real_stripes - 2;
- q_stripe = rbio->real_stripes - 1;
- } else {
+ if (rbio->real_stripes - rbio->nr_data == 1)
+ has_qstripe = false;
+ else if (rbio->real_stripes - rbio->nr_data == 2)
+ has_qstripe = true;
+ else
BUG();
- }
/* at this point we either have a full stripe,
* or we've read the full stripe from the drive.
@@ -1255,7 +1250,7 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
SetPageUptodate(p);
pointers[stripe++] = kmap(p);
- if (q_stripe != -1) {
+ if (has_qstripe) {
/*
* raid6, add the qstripe and call the
@@ -2353,8 +2348,7 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
int nr_data = rbio->nr_data;
int stripe;
int pagenr;
- int p_stripe = -1;
- int q_stripe = -1;
+ bool has_qstripe;
struct page *p_page = NULL;
struct page *q_page = NULL;
struct bio_list bio_list;
@@ -2364,14 +2358,12 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
bio_list_init(&bio_list);
- if (rbio->real_stripes - rbio->nr_data == 1) {
- p_stripe = rbio->real_stripes - 1;
- } else if (rbio->real_stripes - rbio->nr_data == 2) {
- p_stripe = rbio->real_stripes - 2;
- q_stripe = rbio->real_stripes - 1;
- } else {
+ if (rbio->real_stripes - rbio->nr_data == 1)
+ has_qstripe = false;
+ else if (rbio->real_stripes - rbio->nr_data == 2)
+ has_qstripe = true;
+ else
BUG();
- }
if (bbio->num_tgtdevs && bbio->tgtdev_map[rbio->scrubp]) {
is_replace = 1;
@@ -2393,7 +2385,7 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
goto cleanup;
SetPageUptodate(p_page);
- if (q_stripe != -1) {
+ if (has_qstripe) {
q_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
if (!q_page) {
__free_page(p_page);
@@ -2416,8 +2408,7 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
/* then add the parity stripe */
pointers[stripe++] = kmap(p_page);
- if (q_stripe != -1) {
-
+ if (has_qstripe) {
/*
* raid6, add the qstripe and call the
* library function to fill in our p/q
diff --git a/fs/btrfs/rcu-string.h b/fs/btrfs/rcu-string.h
index a97dc74a4d3d..5c1a617eb25d 100644
--- a/fs/btrfs/rcu-string.h
+++ b/fs/btrfs/rcu-string.h
@@ -8,7 +8,7 @@
struct rcu_string {
struct rcu_head rcu;
- char str[0];
+ char str[];
};
static inline struct rcu_string *rcu_string_strdup(const char *src, gfp_t mask)
diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c
index b57f3618e58e..7887317033c9 100644
--- a/fs/btrfs/ref-verify.c
+++ b/fs/btrfs/ref-verify.c
@@ -744,6 +744,7 @@ int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info,
*/
be = add_block_entry(fs_info, bytenr, num_bytes, ref_root);
if (IS_ERR(be)) {
+ kfree(ref);
kfree(ra);
ret = PTR_ERR(be);
goto out;
@@ -757,6 +758,8 @@ int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info,
"re-allocated a block that still has references to it!");
dump_block_entry(fs_info, be);
dump_ref_action(fs_info, ra);
+ kfree(ref);
+ kfree(ra);
goto out_unlock;
}
@@ -800,6 +803,15 @@ int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info,
kfree(ref);
kfree(ra);
goto out_unlock;
+ } else if (be->num_refs == 0) {
+ btrfs_err(fs_info,
+ "trying to do action %d for a bytenr that has 0 total references",
+ action);
+ dump_block_entry(fs_info, be);
+ dump_ref_action(fs_info, ra);
+ kfree(ref);
+ kfree(ra);
+ goto out_unlock;
}
if (!parent) {
@@ -819,6 +831,7 @@ int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info,
"dropping a ref for a existing root that doesn't have a ref on the block");
dump_block_entry(fs_info, be);
dump_ref_action(fs_info, ra);
+ kfree(ref);
kfree(ra);
goto out_unlock;
}
@@ -834,6 +847,7 @@ int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info,
"attempting to add another ref for an existing ref on a tree block");
dump_block_entry(fs_info, be);
dump_ref_action(fs_info, ra);
+ kfree(ref);
kfree(ra);
goto out_unlock;
}
diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c
new file mode 100644
index 000000000000..d1973141d3bb
--- /dev/null
+++ b/fs/btrfs/reflink.c
@@ -0,0 +1,804 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/blkdev.h>
+#include <linux/iversion.h>
+#include "compression.h"
+#include "ctree.h"
+#include "delalloc-space.h"
+#include "reflink.h"
+#include "transaction.h"
+
+#define BTRFS_MAX_DEDUPE_LEN SZ_16M
+
+static int clone_finish_inode_update(struct btrfs_trans_handle *trans,
+ struct inode *inode,
+ u64 endoff,
+ const u64 destoff,
+ const u64 olen,
+ int no_time_update)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ int ret;
+
+ inode_inc_iversion(inode);
+ if (!no_time_update)
+ inode->i_mtime = inode->i_ctime = current_time(inode);
+ /*
+ * We round up to the block size at eof when determining which
+ * extents to clone above, but shouldn't round up the file size.
+ */
+ if (endoff > destoff + olen)
+ endoff = destoff + olen;
+ if (endoff > inode->i_size) {
+ i_size_write(inode, endoff);
+ btrfs_inode_safe_disk_i_size_write(inode, 0);
+ }
+
+ ret = btrfs_update_inode(trans, root, inode);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ btrfs_end_transaction(trans);
+ goto out;
+ }
+ ret = btrfs_end_transaction(trans);
+out:
+ return ret;
+}
+
+static int copy_inline_to_page(struct inode *inode,
+ const u64 file_offset,
+ char *inline_data,
+ const u64 size,
+ const u64 datal,
+ const u8 comp_type)
+{
+ const u64 block_size = btrfs_inode_sectorsize(inode);
+ const u64 range_end = file_offset + block_size - 1;
+ const size_t inline_size = size - btrfs_file_extent_calc_inline_size(0);
+ char *data_start = inline_data + btrfs_file_extent_calc_inline_size(0);
+ struct extent_changeset *data_reserved = NULL;
+ struct page *page = NULL;
+ int ret;
+
+ ASSERT(IS_ALIGNED(file_offset, block_size));
+
+ /*
+ * We have flushed and locked the ranges of the source and destination
+ * inodes, we also have locked the inodes, so we are safe to do a
+ * reservation here. Also we must not do the reservation while holding
+ * a transaction open, otherwise we would deadlock.
+ */
+ ret = btrfs_delalloc_reserve_space(inode, &data_reserved, file_offset,
+ block_size);
+ if (ret)
+ goto out;
+
+ page = find_or_create_page(inode->i_mapping, file_offset >> PAGE_SHIFT,
+ btrfs_alloc_write_mask(inode->i_mapping));
+ if (!page) {
+ ret = -ENOMEM;
+ goto out_unlock;
+ }
+
+ set_page_extent_mapped(page);
+ clear_extent_bit(&BTRFS_I(inode)->io_tree, file_offset, range_end,
+ EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
+ 0, 0, NULL);
+ ret = btrfs_set_extent_delalloc(inode, file_offset, range_end, 0, NULL);
+ if (ret)
+ goto out_unlock;
+
+ if (comp_type == BTRFS_COMPRESS_NONE) {
+ char *map;
+
+ map = kmap(page);
+ memcpy(map, data_start, datal);
+ flush_dcache_page(page);
+ kunmap(page);
+ } else {
+ ret = btrfs_decompress(comp_type, data_start, page, 0,
+ inline_size, datal);
+ if (ret)
+ goto out_unlock;
+ flush_dcache_page(page);
+ }
+
+ /*
+ * If our inline data is smaller than the block/page size, then the
+ * remainder of the block/page is equivalent to zeroes. We had something
+ * like the following done:
+ *
+ * $ xfs_io -f -c "pwrite -S 0xab 0 500" file
+ * $ sync # (or fsync)
+ * $ xfs_io -c "falloc 0 4K" file
+ * $ xfs_io -c "pwrite -S 0xcd 4K 4K"
+ *
+ * So what's in the range [500, 4095] corresponds to zeroes.
+ */
+ if (datal < block_size) {
+ char *map;
+
+ map = kmap(page);
+ memset(map + datal, 0, block_size - datal);
+ flush_dcache_page(page);
+ kunmap(page);
+ }
+
+ SetPageUptodate(page);
+ ClearPageChecked(page);
+ set_page_dirty(page);
+out_unlock:
+ if (page) {
+ unlock_page(page);
+ put_page(page);
+ }
+ if (ret)
+ btrfs_delalloc_release_space(inode, data_reserved, file_offset,
+ block_size, true);
+ btrfs_delalloc_release_extents(BTRFS_I(inode), block_size);
+out:
+ extent_changeset_free(data_reserved);
+
+ return ret;
+}
+
+/*
+ * Deal with cloning of inline extents. We try to copy the inline extent from
+ * the source inode to destination inode when possible. When not possible we
+ * copy the inline extent's data into the respective page of the inode.
+ */
+static int clone_copy_inline_extent(struct inode *dst,
+ struct btrfs_path *path,
+ struct btrfs_key *new_key,
+ const u64 drop_start,
+ const u64 datal,
+ const u64 size,
+ const u8 comp_type,
+ char *inline_data,
+ struct btrfs_trans_handle **trans_out)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(dst->i_sb);
+ struct btrfs_root *root = BTRFS_I(dst)->root;
+ const u64 aligned_end = ALIGN(new_key->offset + datal,
+ fs_info->sectorsize);
+ struct btrfs_trans_handle *trans = NULL;
+ int ret;
+ struct btrfs_key key;
+
+ if (new_key->offset > 0) {
+ ret = copy_inline_to_page(dst, new_key->offset, inline_data,
+ size, datal, comp_type);
+ goto out;
+ }
+
+ key.objectid = btrfs_ino(BTRFS_I(dst));
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = 0;
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0) {
+ return ret;
+ } else if (ret > 0) {
+ if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0)
+ return ret;
+ else if (ret > 0)
+ goto copy_inline_extent;
+ }
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+ if (key.objectid == btrfs_ino(BTRFS_I(dst)) &&
+ key.type == BTRFS_EXTENT_DATA_KEY) {
+ /*
+ * There's an implicit hole at file offset 0, copy the
+ * inline extent's data to the page.
+ */
+ ASSERT(key.offset > 0);
+ ret = copy_inline_to_page(dst, new_key->offset,
+ inline_data, size, datal,
+ comp_type);
+ goto out;
+ }
+ } else if (i_size_read(dst) <= datal) {
+ struct btrfs_file_extent_item *ei;
+
+ ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_file_extent_item);
+ /*
+ * If it's an inline extent replace it with the source inline
+ * extent, otherwise copy the source inline extent data into
+ * the respective page at the destination inode.
+ */
+ if (btrfs_file_extent_type(path->nodes[0], ei) ==
+ BTRFS_FILE_EXTENT_INLINE)
+ goto copy_inline_extent;
+
+ ret = copy_inline_to_page(dst, new_key->offset, inline_data,
+ size, datal, comp_type);
+ goto out;
+ }
+
+copy_inline_extent:
+ ret = 0;
+ /*
+ * We have no extent items, or we have an extent at offset 0 which may
+ * or may not be inlined. All these cases are dealt with the same way.
+ */
+ if (i_size_read(dst) > datal) {
+ /*
+ * At the destination offset 0 we have either a hole, a regular
+ * extent or an inline extent larger than the one we want to
+ * clone. Deal with all these cases by copying the inline extent
+ * data into the respective page at the destination inode.
+ */
+ ret = copy_inline_to_page(dst, new_key->offset, inline_data,
+ size, datal, comp_type);
+ goto out;
+ }
+
+ btrfs_release_path(path);
+ /*
+ * If we end up here it means we copy the inline extent into a leaf
+ * of the destination inode. We know we will drop or adjust at most one
+ * extent item in the destination root.
+ *
+ * 1 unit - adjusting old extent (we may have to split it)
+ * 1 unit - add new extent
+ * 1 unit - inode update
+ */
+ trans = btrfs_start_transaction(root, 3);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ trans = NULL;
+ goto out;
+ }
+ ret = btrfs_drop_extents(trans, root, dst, drop_start, aligned_end, 1);
+ if (ret)
+ goto out;
+ ret = btrfs_insert_empty_item(trans, root, path, new_key, size);
+ if (ret)
+ goto out;
+
+ write_extent_buffer(path->nodes[0], inline_data,
+ btrfs_item_ptr_offset(path->nodes[0],
+ path->slots[0]),
+ size);
+ inode_add_bytes(dst, datal);
+ set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(dst)->runtime_flags);
+out:
+ if (!ret && !trans) {
+ /*
+ * No transaction here means we copied the inline extent into a
+ * page of the destination inode.
+ *
+ * 1 unit to update inode item
+ */
+ trans = btrfs_start_transaction(root, 1);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ trans = NULL;
+ }
+ }
+ if (ret && trans) {
+ btrfs_abort_transaction(trans, ret);
+ btrfs_end_transaction(trans);
+ }
+ if (!ret)
+ *trans_out = trans;
+
+ return ret;
+}
+
+/**
+ * btrfs_clone() - clone a range from inode file to another
+ *
+ * @src: Inode to clone from
+ * @inode: Inode to clone to
+ * @off: Offset within source to start clone from
+ * @olen: Original length, passed by user, of range to clone
+ * @olen_aligned: Block-aligned value of olen
+ * @destoff: Offset within @inode to start clone
+ * @no_time_update: Whether to update mtime/ctime on the target inode
+ *
+ * Walks the EXTENT_DATA items of @src that overlap the range
+ * [@off, @off + @olen_aligned) and, for each one, either punches the
+ * matching destination range while inserting a clone of the extent
+ * (regular and prealloc extents) or copies the inline data over (inline
+ * extents). The inode is updated after every processed extent through
+ * clone_finish_inode_update(). A trailing implicit hole in the source is
+ * reproduced in the destination by punching a hole without clone info.
+ *
+ * Return: 0 on success, otherwise the negative error code of the step that
+ * failed (-ENOMEM, -EINTR on a pending fatal signal, -EUCLEAN on an
+ * unexpected inline extent, or an error from one of the helpers called).
+ */
+static int btrfs_clone(struct inode *src, struct inode *inode,
+		       const u64 off, const u64 olen, const u64 olen_aligned,
+		       const u64 destoff, int no_time_update)
+{
+	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+	struct btrfs_path *path = NULL;
+	struct extent_buffer *leaf;
+	struct btrfs_trans_handle *trans;
+	char *buf = NULL;
+	struct btrfs_key key;
+	u32 nritems;
+	int slot;
+	int ret;
+	const u64 len = olen_aligned;
+	u64 last_dest_end = destoff;
+
+	/* Scratch buffer large enough to hold a copy of any single item. */
+	ret = -ENOMEM;
+	buf = kvmalloc(fs_info->nodesize, GFP_KERNEL);
+	if (!buf)
+		return ret;
+
+	path = btrfs_alloc_path();
+	if (!path) {
+		kvfree(buf);
+		return ret;
+	}
+
+	path->reada = READA_FORWARD;
+	/* Clone data */
+	key.objectid = btrfs_ino(BTRFS_I(src));
+	key.type = BTRFS_EXTENT_DATA_KEY;
+	key.offset = off;
+
+	while (1) {
+		u64 next_key_min_offset = key.offset + 1;
+		struct btrfs_file_extent_item *extent;
+		int type;
+		u32 size;
+		struct btrfs_key new_key;
+		u64 disko = 0, diskl = 0;
+		u64 datao = 0, datal = 0;
+		u8 comp;
+		u64 drop_start;
+
+		/* Note the key will change type as we walk through the tree */
+		path->leave_spinning = 1;
+		ret = btrfs_search_slot(NULL, BTRFS_I(src)->root, &key, path,
+				0, 0);
+		if (ret < 0)
+			goto out;
+		/*
+		 * First search, if no extent item that starts at offset off was
+		 * found but the previous item is an extent item, it's possible
+		 * it might overlap our target range, therefore process it.
+		 */
+		if (key.offset == off && ret > 0 && path->slots[0] > 0) {
+			btrfs_item_key_to_cpu(path->nodes[0], &key,
+					      path->slots[0] - 1);
+			if (key.type == BTRFS_EXTENT_DATA_KEY)
+				path->slots[0]--;
+		}
+
+		nritems = btrfs_header_nritems(path->nodes[0]);
+process_slot:
+		if (path->slots[0] >= nritems) {
+			ret = btrfs_next_leaf(BTRFS_I(src)->root, path);
+			if (ret < 0)
+				goto out;
+			if (ret > 0)
+				break;
+			nritems = btrfs_header_nritems(path->nodes[0]);
+		}
+		leaf = path->nodes[0];
+		slot = path->slots[0];
+
+		btrfs_item_key_to_cpu(leaf, &key, slot);
+		if (key.type > BTRFS_EXTENT_DATA_KEY ||
+		    key.objectid != btrfs_ino(BTRFS_I(src)))
+			break;
+
+		ASSERT(key.type == BTRFS_EXTENT_DATA_KEY);
+
+		extent = btrfs_item_ptr(leaf, slot,
+					struct btrfs_file_extent_item);
+		comp = btrfs_file_extent_compression(leaf, extent);
+		type = btrfs_file_extent_type(leaf, extent);
+		if (type == BTRFS_FILE_EXTENT_REG ||
+		    type == BTRFS_FILE_EXTENT_PREALLOC) {
+			disko = btrfs_file_extent_disk_bytenr(leaf, extent);
+			diskl = btrfs_file_extent_disk_num_bytes(leaf, extent);
+			datao = btrfs_file_extent_offset(leaf, extent);
+			datal = btrfs_file_extent_num_bytes(leaf, extent);
+		} else if (type == BTRFS_FILE_EXTENT_INLINE) {
+			/* Take upper bound, may be compressed */
+			datal = btrfs_file_extent_ram_bytes(leaf, extent);
+		}
+
+		/*
+		 * The first search might have left us at an extent item that
+		 * ends before our target range's start, can happen if we have
+		 * holes and NO_HOLES feature enabled.
+		 */
+		if (key.offset + datal <= off) {
+			path->slots[0]++;
+			goto process_slot;
+		} else if (key.offset >= off + len) {
+			break;
+		}
+		next_key_min_offset = key.offset + datal;
+		size = btrfs_item_size_nr(leaf, slot);
+		/* Keep a private copy of the item, the path is released next. */
+		read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf, slot),
+				   size);
+
+		btrfs_release_path(path);
+		path->leave_spinning = 0;
+
+		memcpy(&new_key, &key, sizeof(new_key));
+		new_key.objectid = btrfs_ino(BTRFS_I(inode));
+		if (off <= key.offset)
+			new_key.offset = key.offset + destoff - off;
+		else
+			new_key.offset = destoff;
+
+		/*
+		 * Deal with a hole that doesn't have an extent item that
+		 * represents it (NO_HOLES feature enabled).
+		 * This hole is either in the middle of the cloning range or at
+		 * the beginning (fully overlaps it or partially overlaps it).
+		 */
+		if (new_key.offset != last_dest_end)
+			drop_start = last_dest_end;
+		else
+			drop_start = new_key.offset;
+
+		if (type == BTRFS_FILE_EXTENT_REG ||
+		    type == BTRFS_FILE_EXTENT_PREALLOC) {
+			struct btrfs_clone_extent_info clone_info;
+
+			/*
+			 *    a  | --- range to clone ---|  b
+			 * | ------------- extent ------------- |
+			 */
+
+			/* Subtract range b */
+			if (key.offset + datal > off + len)
+				datal = off + len - key.offset;
+
+			/* Subtract range a */
+			if (off > key.offset) {
+				datao += off - key.offset;
+				datal -= off - key.offset;
+			}
+
+			clone_info.disk_offset = disko;
+			clone_info.disk_len = diskl;
+			clone_info.data_offset = datao;
+			clone_info.data_len = datal;
+			clone_info.file_offset = new_key.offset;
+			clone_info.extent_buf = buf;
+			clone_info.item_size = size;
+			ret = btrfs_punch_hole_range(inode, path, drop_start,
+					new_key.offset + datal - 1, &clone_info,
+					&trans);
+			if (ret)
+				goto out;
+		} else if (type == BTRFS_FILE_EXTENT_INLINE) {
+			/*
+			 * Inline extents always have to start at file offset 0
+			 * and can never be bigger than the sector size. We can
+			 * never clone only parts of an inline extent, since all
+			 * reflink operations must start at a sector size aligned
+			 * offset, and the length must be aligned too or end at
+			 * the i_size (which implies the whole inlined data).
+			 */
+			ASSERT(key.offset == 0);
+			ASSERT(datal <= fs_info->sectorsize);
+			if (key.offset != 0 || datal > fs_info->sectorsize) {
+				/*
+				 * Leave through the common exit so that the
+				 * path and the item buffer are freed.
+				 */
+				ret = -EUCLEAN;
+				goto out;
+			}
+
+			ret = clone_copy_inline_extent(inode, path, &new_key,
+						       drop_start, datal, size,
+						       comp, buf, &trans);
+			if (ret)
+				goto out;
+		}
+
+		btrfs_release_path(path);
+
+		last_dest_end = ALIGN(new_key.offset + datal,
+				      fs_info->sectorsize);
+		ret = clone_finish_inode_update(trans, inode, last_dest_end,
+						destoff, olen, no_time_update);
+		if (ret)
+			goto out;
+		/* Stop once the destination range has been fully covered. */
+		if (new_key.offset + datal >= destoff + len)
+			break;
+
+		btrfs_release_path(path);
+		key.offset = next_key_min_offset;
+
+		if (fatal_signal_pending(current)) {
+			ret = -EINTR;
+			goto out;
+		}
+	}
+	ret = 0;
+
+	if (last_dest_end < destoff + len) {
+		/*
+		 * We have an implicit hole that fully or partially overlaps our
+		 * cloning range at its end. This means that we either have the
+		 * NO_HOLES feature enabled or the implicit hole happened due to
+		 * mixing buffered and direct IO writes against this file.
+		 */
+		btrfs_release_path(path);
+		path->leave_spinning = 0;
+
+		ret = btrfs_punch_hole_range(inode, path, last_dest_end,
+				destoff + len - 1, NULL, &trans);
+		if (ret)
+			goto out;
+
+		ret = clone_finish_inode_update(trans, inode, destoff + len,
+						destoff, olen, no_time_update);
+	}
+
+out:
+	btrfs_free_path(path);
+	kvfree(buf);
+	return ret;
+}
+
+/*
+ * Unlock the byte range of length @len starting at @loff1 in the io tree of
+ * @inode1, then the range of the same length starting at @loff2 in the io
+ * tree of @inode2.
+ */
+static void btrfs_double_extent_unlock(struct inode *inode1, u64 loff1,
+				       struct inode *inode2, u64 loff2, u64 len)
+{
+	/* Both ranges are inclusive, hence the "- 1". */
+	const u64 end1 = loff1 + len - 1;
+	const u64 end2 = loff2 + len - 1;
+
+	unlock_extent(&BTRFS_I(inode1)->io_tree, loff1, end1);
+	unlock_extent(&BTRFS_I(inode2)->io_tree, loff2, end2);
+}
+
+/*
+ * Lock a range of @len bytes in the io trees of two inodes (which may be the
+ * same inode).
+ *
+ * The locks are always taken in the same order: the inode with the higher
+ * address first and, when both ranges belong to one inode, the lower offset
+ * first. Presumably this keeps concurrent callers from deadlocking against
+ * each other.
+ */
+static void btrfs_double_extent_lock(struct inode *inode1, u64 loff1,
+				     struct inode *inode2, u64 loff2, u64 len)
+{
+	/*
+	 * When both pointers refer to the same inode, swapping them is a
+	 * no-op, so a single branch covers both reordering cases.
+	 */
+	if (inode1 < inode2 || (inode1 == inode2 && loff2 < loff1)) {
+		swap(inode1, inode2);
+		swap(loff1, loff2);
+	}
+
+	lock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1);
+	lock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1);
+}
+
+/*
+ * Deduplicate one chunk: clone @len bytes starting at @loff in @src over the
+ * range starting at @dst_loff in @dst, with both ranges locked for the
+ * duration of the clone.
+ *
+ * Returns whatever btrfs_clone() returns.
+ */
+static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 len,
+				   struct inode *dst, u64 dst_loff)
+{
+	const u64 blocksize = BTRFS_I(src)->root->fs_info->sb->s_blocksize;
+	const u64 aligned_len = ALIGN(len, blocksize);
+	int err;
+
+	/*
+	 * The destination range is locked to serialize with concurrent
+	 * readpages() and the source range to serialize with relocation.
+	 */
+	btrfs_double_extent_lock(src, loff, dst, dst_loff, len);
+	err = btrfs_clone(src, dst, loff, len, aligned_len, dst_loff, 1);
+	btrfs_double_extent_unlock(src, loff, dst, dst_loff, len);
+
+	return err;
+}
+
+/*
+ * Deduplicate @olen bytes from @src at offset @loff into @dst at offset
+ * @dst_loff.
+ *
+ * The work is split into chunks of at most BTRFS_MAX_DEDUPE_LEN bytes, each
+ * handled by btrfs_extent_same_range(). While the operation runs, the
+ * destination root's dedupe_in_progress counter is raised.
+ *
+ * Returns 0 on success (including the degenerate case of @olen == 0),
+ * -EAGAIN if a send operation is using the destination root, or the error
+ * of the first chunk that failed.
+ */
+static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen,
+			     struct inode *dst, u64 dst_loff)
+{
+	/*
+	 * Start from success: with a zero @olen neither the loop nor the tail
+	 * call below runs, and ret must not be returned uninitialized.
+	 */
+	int ret = 0;
+	u64 i, tail_len, chunk_count;
+	struct btrfs_root *root_dst = BTRFS_I(dst)->root;
+
+	spin_lock(&root_dst->root_item_lock);
+	if (root_dst->send_in_progress) {
+		btrfs_warn_rl(root_dst->fs_info,
+"cannot deduplicate to root %llu while send operations are using it (%d in progress)",
+			      root_dst->root_key.objectid,
+			      root_dst->send_in_progress);
+		spin_unlock(&root_dst->root_item_lock);
+		return -EAGAIN;
+	}
+	root_dst->dedupe_in_progress++;
+	spin_unlock(&root_dst->root_item_lock);
+
+	tail_len = olen % BTRFS_MAX_DEDUPE_LEN;
+	chunk_count = div_u64(olen, BTRFS_MAX_DEDUPE_LEN);
+
+	/* Full-sized chunks first, stopping at the first failure. */
+	for (i = 0; i < chunk_count; i++) {
+		ret = btrfs_extent_same_range(src, loff, BTRFS_MAX_DEDUPE_LEN,
+					      dst, dst_loff);
+		if (ret)
+			goto out;
+
+		loff += BTRFS_MAX_DEDUPE_LEN;
+		dst_loff += BTRFS_MAX_DEDUPE_LEN;
+	}
+
+	/* Then whatever is left over. */
+	if (tail_len > 0)
+		ret = btrfs_extent_same_range(src, loff, tail_len, dst, dst_loff);
+out:
+	spin_lock(&root_dst->root_item_lock);
+	root_dst->dedupe_in_progress--;
+	spin_unlock(&root_dst->root_item_lock);
+
+	return ret;
+}
+
+/*
+ * Clone @olen bytes starting at @off in @file_src into @file starting at
+ * @destoff.
+ *
+ * If the destination offset lies beyond the destination's current size, the
+ * destination is expanded first. After the clone, ordered extents in the
+ * destination range are waited for and the matching page cache range is
+ * truncated.
+ *
+ * Returns 0 on success or a negative error code. A clone error takes
+ * precedence over an error from the final wait.
+ */
+static noinline int btrfs_clone_files(struct file *file, struct file *file_src,
+					u64 off, u64 olen, u64 destoff)
+{
+	struct inode *src = file_inode(file_src);
+	struct inode *inode = file_inode(file);
+	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+	const u64 blocksize = fs_info->sb->s_blocksize;
+	u64 len = olen;
+	int wait_err;
+	int ret;
+
+	/*
+	 * Cloning the eof block into the middle of a file would corrupt it
+	 * when the file size is not blocksize aligned, but the VFS helper
+	 * generic_remap_file_range_prep() already rules that out, so there is
+	 * nothing to check here. A range ending at the source's i_size is
+	 * simply extended to the end of its block.
+	 */
+	if (off + len == src->i_size)
+		len = ALIGN(src->i_size, blocksize) - off;
+
+	if (destoff > inode->i_size) {
+		const u64 wb_start = ALIGN_DOWN(inode->i_size, blocksize);
+
+		ret = btrfs_cont_expand(inode, inode->i_size, destoff);
+		if (ret)
+			return ret;
+		/*
+		 * The expansion may have truncated the last block when the
+		 * inode's size is not sector size aligned. Wait for writeback
+		 * to finish before going on, otherwise the clone could race
+		 * with it and try to add a reference to an extent that no
+		 * longer exists (writeback completing right after the extent
+		 * covering eof was found and before its reference count was
+		 * incremented).
+		 */
+		ret = btrfs_wait_ordered_range(inode, wb_start,
+					       destoff - wb_start);
+		if (ret)
+			return ret;
+	}
+
+	/*
+	 * The destination range is locked to serialize with concurrent
+	 * readpages() and the source range to serialize with relocation.
+	 */
+	btrfs_double_extent_lock(src, off, inode, destoff, len);
+	ret = btrfs_clone(src, inode, off, olen, len, destoff, 0);
+	btrfs_double_extent_unlock(src, off, inode, destoff, len);
+
+	/*
+	 * An inline extent may have been copied into a page of the
+	 * destination range, so writeback has to complete before pages are
+	 * dropped from the page cache. This is a rare case.
+	 */
+	wait_err = btrfs_wait_ordered_range(inode, destoff, len);
+	if (!ret)
+		ret = wait_err;
+
+	/*
+	 * Drop the page cache for the range so that later reads see the
+	 * cloned data right away instead of the previous contents.
+	 */
+	truncate_inode_pages_range(&inode->i_data,
+				   round_down(destoff, PAGE_SIZE),
+				   round_up(destoff + len, PAGE_SIZE) - 1);
+
+	return ret;
+}
+
+/*
+ * Validate a remap (clone or dedupe) request and flush everything that has
+ * to be on disk before it can run, then hand over to the VFS helper
+ * generic_remap_file_range_prep().
+ *
+ * Returns -EROFS or -EXDEV for clone requests that target a read-only root
+ * or cross mounts/super blocks, -EINVAL when the two inodes disagree on
+ * NODATASUM, an error from flushing/waiting, or the result of the generic
+ * helper.
+ */
+static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in,
+				       struct file *file_out, loff_t pos_out,
+				       loff_t *len, unsigned int remap_flags)
+{
+	struct inode *inode_in = file_inode(file_in);
+	struct inode *inode_out = file_inode(file_out);
+	const u64 bs = BTRFS_I(inode_out)->root->fs_info->sb->s_blocksize;
+	const bool is_dedupe = remap_flags & REMAP_FILE_DEDUP;
+	const bool same_inode = (inode_in == inode_out);
+	const u64 in_start = ALIGN_DOWN(pos_in, bs);
+	const u64 out_start = ALIGN_DOWN(pos_out, bs);
+	u64 wb_len;
+	int ret;
+
+	if (!is_dedupe) {
+		struct btrfs_root *root_out = BTRFS_I(inode_out)->root;
+
+		if (btrfs_root_readonly(root_out))
+			return -EROFS;
+
+		if (file_in->f_path.mnt != file_out->f_path.mnt ||
+		    inode_in->i_sb != inode_out->i_sb)
+			return -EXDEV;
+	}
+
+	/* Don't make the dst file partly checksummed */
+	if ((BTRFS_I(inode_in)->flags & BTRFS_INODE_NODATASUM) !=
+	    (BTRFS_I(inode_out)->flags & BTRFS_INODE_NODATASUM))
+		return -EINVAL;
+
+	/*
+	 * With the inodes locked, writeback has to be started here rather
+	 * than left to the VFS helper generic_remap_file_range_prep():
+	 *
+	 * 1) For compression filemap_fdatawrite_range() must be called twice
+	 *    (btrfs_fdatawrite_range() does that), while the generic helper
+	 *    calls it only once;
+	 *
+	 * 2) The generic helper only waits for the writeback IO to be done,
+	 *    not for the ordered extents to complete. The ordered extents
+	 *    must complete so that the new file extent items are in the fs
+	 *    tree.
+	 *
+	 * A clone with a zero length covers everything from the (block
+	 * aligned) start position up to the block aligned end of the source.
+	 */
+	if (is_dedupe || *len != 0)
+		wb_len = ALIGN(*len, bs);
+	else
+		wb_len = ALIGN(inode_in->i_size, bs) - in_start;
+
+	/*
+	 * Ranges are not locked here, so wait for ongoing lockless dio writes
+	 * (any of them could create its ordered extents after the wait for
+	 * existing ordered extents below).
+	 */
+	inode_dio_wait(inode_in);
+	if (!same_inode)
+		inode_dio_wait(inode_out);
+
+	/*
+	 * Workaround to make sure NOCOW buffered write reach disk as NOCOW.
+	 *
+	 * Btrfs' back references do not have a block level granularity, they
+	 * work at the whole extent level.
+	 * NOCOW buffered write without data space reserved may not be able
+	 * to fall back to CoW due to lack of data space, thus could cause
+	 * data loss.
+	 *
+	 * Here we take a shortcut by flushing the whole inode, so that all
+	 * nocow write should reach disk as nocow before we increase the
+	 * reference of the extent. We could do better by only flushing NOCOW
+	 * data, but that needs extra accounting.
+	 *
+	 * Also we don't need to check ASYNC_EXTENT, as async extent will be
+	 * CoWed anyway, not affecting nocow part.
+	 */
+	ret = filemap_flush(inode_in->i_mapping);
+	if (ret < 0)
+		return ret;
+
+	ret = btrfs_wait_ordered_range(inode_in, in_start, wb_len);
+	if (ret < 0)
+		return ret;
+
+	ret = btrfs_wait_ordered_range(inode_out, out_start, wb_len);
+	if (ret < 0)
+		return ret;
+
+	return generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out,
+					     len, remap_flags);
+}
+
+/*
+ * Clone, or deduplicate when REMAP_FILE_DEDUP is set in @remap_flags, @len
+ * bytes from @src_file at @off into @dst_file at @destoff.
+ *
+ * Both inodes are locked for the whole operation. Flags other than
+ * REMAP_FILE_DEDUP and REMAP_FILE_ADVISORY are rejected with -EINVAL.
+ *
+ * Returns a negative error code on failure, otherwise the length as adjusted
+ * by the preparation step (which may be 0, in which case nothing is done).
+ */
+loff_t btrfs_remap_file_range(struct file *src_file, loff_t off,
+		struct file *dst_file, loff_t destoff, loff_t len,
+		unsigned int remap_flags)
+{
+	struct inode *src_inode = file_inode(src_file);
+	struct inode *dst_inode = file_inode(dst_file);
+	const bool same_inode = (src_inode == dst_inode);
+	int ret;
+
+	if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY))
+		return -EINVAL;
+
+	if (same_inode)
+		inode_lock(src_inode);
+	else
+		lock_two_nondirectories(src_inode, dst_inode);
+
+	ret = btrfs_remap_file_range_prep(src_file, off, dst_file, destoff,
+					  &len, remap_flags);
+	/* Only do the actual work if preparation succeeded and left a range. */
+	if (ret >= 0 && len != 0) {
+		if (remap_flags & REMAP_FILE_DEDUP)
+			ret = btrfs_extent_same(src_inode, off, len,
+						dst_inode, destoff);
+		else
+			ret = btrfs_clone_files(dst_file, src_file, off, len,
+						destoff);
+	}
+
+	if (same_inode)
+		inode_unlock(src_inode);
+	else
+		unlock_two_nondirectories(src_inode, dst_inode);
+
+	if (ret < 0)
+		return ret;
+	return len;
+}
diff --git a/fs/btrfs/reflink.h b/fs/btrfs/reflink.h
new file mode 100644
index 000000000000..ecb309b4dad0
--- /dev/null
+++ b/fs/btrfs/reflink.h
@@ -0,0 +1,12 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef BTRFS_REFLINK_H
+#define BTRFS_REFLINK_H
+
+#include <linux/fs.h>
+
+/*
+ * Clone, or deduplicate when REMAP_FILE_DEDUP is set in remap_flags, len
+ * bytes from file_in at pos_in into file_out at pos_out.
+ *
+ * Returns a negative error code on failure, otherwise the number of bytes
+ * the operation covered.
+ */
+loff_t btrfs_remap_file_range(struct file *file_in, loff_t pos_in,
+ struct file *file_out, loff_t pos_out,
+ loff_t len, unsigned int remap_flags);
+
+#endif /* BTRFS_REFLINK_H */
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 995d4b8b1cfd..f65595602aa8 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -9,6 +9,7 @@
#include <linux/blkdev.h>
#include <linux/rbtree.h>
#include <linux/slab.h>
+#include <linux/error-injection.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
@@ -22,6 +23,54 @@
#include "print-tree.h"
#include "delalloc-space.h"
#include "block-group.h"
+#include "backref.h"
+
+/*
+ * Relocation overview
+ *
+ * [What does relocation do]
+ *
+ * The objective of relocation is to relocate all extents of the target block
+ * group to other block groups.
+ * This is utilized by resize (shrink only), profile converting, compacting
+ * space, or balance routine to spread chunks over devices.
+ *
+ * Before | After
+ * ------------------------------------------------------------------
+ * BG A: 10 data extents | BG A: deleted
+ * BG B: 2 data extents | BG B: 10 data extents (2 old + 8 relocated)
+ * BG C: 1 data extent | BG C: 3 data extents (1 old + 2 relocated)
+ *
+ * [How does relocation work]
+ *
+ * 1. Mark the target block group read-only
+ * New extents won't be allocated from the target block group.
+ *
+ * 2.1 Record each extent in the target block group
+ * To build a proper map of extents to be relocated.
+ *
+ * 2.2 Build data reloc tree and reloc trees
+ * Data reloc tree will contain an inode, recording all newly relocated
+ * data extents.
+ * There will be only one data reloc tree for one data block group.
+ *
+ * Reloc tree will be a special snapshot of its source tree, containing
+ * relocated tree blocks.
+ * Each tree referring to a tree block in target block group will get its
+ * reloc tree built.
+ *
+ * 2.3 Swap source tree with its corresponding reloc tree
+ * Each involved tree only refers to new extents after swap.
+ *
+ * 3. Cleanup reloc trees and data reloc tree.
+ * As old extents in the target block group are still referenced by reloc
+ * trees, we need to clean them up before really freeing the target block
+ * group.
+ *
+ * The main complexity is in steps 2.2 and 2.3.
+ *
+ * The entry point of relocation is relocate_block_group() function.
+ */
/*
* backref_node, mapping_node and tree_block start with this
@@ -256,6 +305,7 @@ static void free_backref_node(struct backref_cache *cache,
{
if (node) {
cache->nr_nodes--;
+ btrfs_put_root(node->root);
kfree(node);
}
}
@@ -589,22 +639,7 @@ static struct btrfs_root *find_reloc_root(struct reloc_control *rc,
root = (struct btrfs_root *)node->data;
}
spin_unlock(&rc->reloc_root_tree.lock);
- return root;
-}
-
-static int is_cowonly_root(u64 root_objectid)
-{
- if (root_objectid == BTRFS_ROOT_TREE_OBJECTID ||
- root_objectid == BTRFS_EXTENT_TREE_OBJECTID ||
- root_objectid == BTRFS_CHUNK_TREE_OBJECTID ||
- root_objectid == BTRFS_DEV_TREE_OBJECTID ||
- root_objectid == BTRFS_TREE_LOG_OBJECTID ||
- root_objectid == BTRFS_CSUM_TREE_OBJECTID ||
- root_objectid == BTRFS_UUID_TREE_OBJECTID ||
- root_objectid == BTRFS_QUOTA_TREE_OBJECTID ||
- root_objectid == BTRFS_FREE_SPACE_TREE_OBJECTID)
- return 1;
- return 0;
+ return btrfs_grab_root(root);
}
static struct btrfs_root *read_fs_root(struct btrfs_fs_info *fs_info,
@@ -614,10 +649,7 @@ static struct btrfs_root *read_fs_root(struct btrfs_fs_info *fs_info,
key.objectid = root_objectid;
key.type = BTRFS_ROOT_ITEM_KEY;
- if (is_cowonly_root(root_objectid))
- key.offset = 0;
- else
- key.offset = (u64)-1;
+ key.offset = (u64)-1;
return btrfs_get_fs_root(fs_info, &key, false);
}
@@ -711,8 +743,6 @@ struct backref_node *build_backref_tree(struct reloc_control *rc,
err = -ENOMEM;
goto out;
}
- path1->reada = READA_FORWARD;
- path2->reada = READA_FORWARD;
node = alloc_backref_node(cache);
if (!node) {
@@ -899,10 +929,12 @@ again:
/* tree root */
ASSERT(btrfs_root_bytenr(&root->root_item) ==
cur->bytenr);
- if (should_ignore_root(root))
+ if (should_ignore_root(root)) {
+ btrfs_put_root(root);
list_add(&cur->list, &useless);
- else
+ } else {
cur->root = root;
+ }
break;
}
@@ -915,6 +947,7 @@ again:
ret = btrfs_search_slot(NULL, root, node_key, path2, 0, 0);
path2->lowest_level = 0;
if (ret < 0) {
+ btrfs_put_root(root);
err = ret;
goto out;
}
@@ -930,6 +963,7 @@ again:
root->root_key.objectid,
node_key->objectid, node_key->type,
node_key->offset);
+ btrfs_put_root(root);
err = -ENOENT;
goto out;
}
@@ -941,15 +975,18 @@ again:
if (!path2->nodes[level]) {
ASSERT(btrfs_root_bytenr(&root->root_item) ==
lower->bytenr);
- if (should_ignore_root(root))
+ if (should_ignore_root(root)) {
+ btrfs_put_root(root);
list_add(&lower->list, &useless);
- else
+ } else {
lower->root = root;
+ }
break;
}
edge = alloc_backref_edge(cache);
if (!edge) {
+ btrfs_put_root(root);
err = -ENOMEM;
goto out;
}
@@ -959,6 +996,7 @@ again:
if (!rb_node) {
upper = alloc_backref_node(cache);
if (!upper) {
+ btrfs_put_root(root);
free_backref_edge(cache, edge);
err = -ENOMEM;
goto out;
@@ -1006,8 +1044,10 @@ again:
edge->node[LOWER] = lower;
edge->node[UPPER] = upper;
- if (rb_node)
+ if (rb_node) {
+ btrfs_put_root(root);
break;
+ }
lower = upper;
upper = NULL;
}
@@ -1186,7 +1226,7 @@ out:
free_backref_node(cache, lower);
}
- free_backref_node(cache, node);
+ remove_backref_node(cache, node);
return ERR_PTR(err);
}
ASSERT(!node || !node->detached);
@@ -1244,7 +1284,8 @@ static int clone_backref_node(struct btrfs_trans_handle *trans,
new_node->level = node->level;
new_node->lowest = node->lowest;
new_node->checked = 1;
- new_node->root = dest;
+ new_node->root = btrfs_grab_root(dest);
+ ASSERT(new_node->root);
if (!node->lowest) {
list_for_each_entry(edge, &node->lower, list[UPPER]) {
@@ -1298,7 +1339,7 @@ static int __must_check __add_reloc_root(struct btrfs_root *root)
if (!node)
return -ENOMEM;
- node->bytenr = root->node->start;
+ node->bytenr = root->commit_root->start;
node->data = root;
spin_lock(&rc->reloc_root_tree.lock);
@@ -1325,14 +1366,16 @@ static void __del_reloc_root(struct btrfs_root *root)
struct rb_node *rb_node;
struct mapping_node *node = NULL;
struct reloc_control *rc = fs_info->reloc_ctl;
+ bool put_ref = false;
if (rc && root->node) {
spin_lock(&rc->reloc_root_tree.lock);
rb_node = tree_search(&rc->reloc_root_tree.rb_root,
- root->node->start);
+ root->commit_root->start);
if (rb_node) {
node = rb_entry(rb_node, struct mapping_node, rb_node);
rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root);
+ RB_CLEAR_NODE(&node->rb_node);
}
spin_unlock(&rc->reloc_root_tree.lock);
if (!node)
@@ -1340,9 +1383,22 @@ static void __del_reloc_root(struct btrfs_root *root)
BUG_ON((struct btrfs_root *)node->data != root);
}
+ /*
+ * We only put the reloc root here if it's on the list. There's a lot
+ * of places where the pattern is to splice the rc->reloc_roots, process
+ * the reloc roots, and then add the reloc root back onto
+ * rc->reloc_roots. If we call __del_reloc_root while it's off of the
+ * list we don't want the reference being dropped, because the guy
+ * messing with the list is in charge of the reference.
+ */
spin_lock(&fs_info->trans_lock);
- list_del_init(&root->root_list);
+ if (!list_empty(&root->root_list)) {
+ put_ref = true;
+ list_del_init(&root->root_list);
+ }
spin_unlock(&fs_info->trans_lock);
+ if (put_ref)
+ btrfs_put_root(root);
kfree(node);
}
@@ -1350,7 +1406,7 @@ static void __del_reloc_root(struct btrfs_root *root)
* helper to update the 'address of tree root -> reloc tree'
* mapping
*/
-static int __update_reloc_root(struct btrfs_root *root, u64 new_bytenr)
+static int __update_reloc_root(struct btrfs_root *root)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct rb_node *rb_node;
@@ -1359,7 +1415,7 @@ static int __update_reloc_root(struct btrfs_root *root, u64 new_bytenr)
spin_lock(&rc->reloc_root_tree.lock);
rb_node = tree_search(&rc->reloc_root_tree.rb_root,
- root->node->start);
+ root->commit_root->start);
if (rb_node) {
node = rb_entry(rb_node, struct mapping_node, rb_node);
rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root);
@@ -1371,7 +1427,7 @@ static int __update_reloc_root(struct btrfs_root *root, u64 new_bytenr)
BUG_ON((struct btrfs_root *)node->data != root);
spin_lock(&rc->reloc_root_tree.lock);
- node->bytenr = new_bytenr;
+ node->bytenr = root->node->start;
rb_node = tree_insert(&rc->reloc_root_tree.rb_root,
node->bytenr, &node->rb_node);
spin_unlock(&rc->reloc_root_tree.lock);
@@ -1447,8 +1503,9 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
BUG_ON(ret);
kfree(root_item);
- reloc_root = btrfs_read_fs_root(fs_info->tree_root, &root_key);
+ reloc_root = btrfs_read_tree_root(fs_info->tree_root, &root_key);
BUG_ON(IS_ERR(reloc_root));
+ set_bit(BTRFS_ROOT_REF_COWS, &reloc_root->state);
reloc_root->last_trans = trans->transid;
return reloc_root;
}
@@ -1456,6 +1513,9 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
/*
* create reloc tree for a given fs tree. reloc tree is just a
* snapshot of the fs tree with special root objectid.
+ *
+ * The reloc_root comes out of here with two references, one for
+ * root->reloc_root, and another for being on the rc->reloc_roots list.
*/
int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
@@ -1467,6 +1527,10 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
int clear_rsv = 0;
int ret;
+ if (!rc || !rc->create_reloc_tree ||
+ root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
+ return 0;
+
/*
* The subvolume has reloc tree but the swap is finished, no need to
* create/update the dead reloc tree
@@ -1480,10 +1544,6 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
return 0;
}
- if (!rc || !rc->create_reloc_tree ||
- root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
- return 0;
-
if (!trans->reloc_reserved) {
rsv = trans->block_rsv;
trans->block_rsv = rc->block_rsv;
@@ -1495,7 +1555,7 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
ret = __add_reloc_root(reloc_root);
BUG_ON(ret < 0);
- root->reloc_root = reloc_root;
+ root->reloc_root = btrfs_grab_root(reloc_root);
return 0;
}
@@ -1516,6 +1576,13 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
reloc_root = root->reloc_root;
root_item = &reloc_root->root_item;
+ /*
+ * We are probably ok here, but __del_reloc_root() will drop its ref of
+ * the root. We have the ref for root->reloc_root, but just in case
+ * hold it while we update the reloc root.
+ */
+ btrfs_grab_root(reloc_root);
+
/* root->reloc_root will stay until current relocation finished */
if (fs_info->reloc_ctl->merge_reloc_tree &&
btrfs_root_refs(root_item) == 0) {
@@ -1529,6 +1596,7 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
}
if (reloc_root->commit_root != reloc_root->node) {
+ __update_reloc_root(reloc_root);
btrfs_set_root_node(root_item, reloc_root->node);
free_extent_buffer(reloc_root->commit_root);
reloc_root->commit_root = btrfs_root_node(reloc_root);
@@ -1537,7 +1605,7 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
ret = btrfs_update_root(trans, fs_info->tree_root,
&reloc_root->root_key, root_item);
BUG_ON(ret);
-
+ btrfs_put_root(reloc_root);
out:
return 0;
}
@@ -2211,7 +2279,7 @@ static void insert_dirty_subvol(struct btrfs_trans_handle *trans,
btrfs_update_reloc_root(trans, root);
if (list_empty(&root->reloc_dirty_list)) {
- btrfs_grab_fs_root(root);
+ btrfs_grab_root(root);
list_add_tail(&root->reloc_dirty_list, &rc->dirty_subvol_roots);
}
}
@@ -2231,24 +2299,34 @@ static int clean_dirty_subvols(struct reloc_control *rc)
list_del_init(&root->reloc_dirty_list);
root->reloc_root = NULL;
- if (reloc_root) {
-
- ret2 = btrfs_drop_snapshot(reloc_root, NULL, 0, 1);
- if (ret2 < 0 && !ret)
- ret = ret2;
- }
/*
* Need barrier to ensure clear_bit() only happens after
* root->reloc_root = NULL. Pairs with have_reloc_root.
*/
smp_wmb();
clear_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state);
- btrfs_put_fs_root(root);
+ if (reloc_root) {
+ /*
+ * btrfs_drop_snapshot drops our ref we hold for
+ * ->reloc_root. If it fails however we must
+ * drop the ref ourselves.
+ */
+ ret2 = btrfs_drop_snapshot(reloc_root, 0, 1);
+ if (ret2 < 0) {
+ btrfs_put_root(reloc_root);
+ if (!ret)
+ ret = ret2;
+ }
+ }
+ btrfs_put_root(root);
} else {
/* Orphan reloc tree, just clean it up */
- ret2 = btrfs_drop_snapshot(root, NULL, 0, 1);
- if (ret2 < 0 && !ret)
- ret = ret2;
+ ret2 = btrfs_drop_snapshot(root, 0, 1);
+ if (ret2 < 0) {
+ btrfs_put_root(root);
+ if (!ret)
+ ret = ret2;
+ }
}
}
return ret;
@@ -2325,6 +2403,18 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
trans = NULL;
goto out;
}
+
+ /*
+ * At this point we no longer have a reloc_control, so we can't
+ * depend on btrfs_init_reloc_root to update our last_trans.
+ *
+ * But that's ok, we started the trans handle on our
+ * corresponding fs_root, which means it's been added to the
+ * dirty list. At commit time we'll still call
+ * btrfs_update_reloc_root() and update our root item
+ * appropriately.
+ */
+ reloc_root->last_trans = trans->transid;
trans->block_rsv = rc->block_rsv;
replaced = 0;
@@ -2435,7 +2525,7 @@ again:
if (IS_ERR(trans)) {
if (!err)
btrfs_block_rsv_release(fs_info, rc->block_rsv,
- num_bytes);
+ num_bytes, NULL);
return PTR_ERR(trans);
}
@@ -2443,7 +2533,7 @@ again:
if (num_bytes != rc->merging_rsv_size) {
btrfs_end_transaction(trans);
btrfs_block_rsv_release(fs_info, rc->block_rsv,
- num_bytes);
+ num_bytes, NULL);
goto again;
}
}
@@ -2468,6 +2558,7 @@ again:
btrfs_update_reloc_root(trans, root);
list_add(&reloc_root->root_list, &reloc_roots);
+ btrfs_put_root(root);
}
list_splice(&reloc_roots, &rc->reloc_roots);
@@ -2488,10 +2579,6 @@ void free_reloc_roots(struct list_head *list)
reloc_root = list_entry(list->next, struct btrfs_root,
root_list);
__del_reloc_root(reloc_root);
- free_extent_buffer(reloc_root->node);
- free_extent_buffer(reloc_root->commit_root);
- reloc_root->node = NULL;
- reloc_root->commit_root = NULL;
}
}
@@ -2529,6 +2616,7 @@ again:
BUG_ON(root->reloc_root != reloc_root);
ret = merge_reloc_root(rc, root);
+ btrfs_put_root(root);
if (ret) {
if (list_empty(&reloc_root->root_list))
list_add_tail(&reloc_root->root_list,
@@ -2561,7 +2649,21 @@ out:
free_reloc_roots(&reloc_roots);
}
- BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root));
+ /*
+ * We used to have
+ *
+ * BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root));
+ *
+ * here, but it's wrong. If we fail to start the transaction in
+ * prepare_to_merge() we will have only 0 ref reloc roots, none of which
+ * have actually been removed from the reloc_root_tree rb tree. This is
+ * fine because we're bailing here, and we hold a reference on the root
+ * for the list that holds it, so these roots will be cleaned up when we
+ * do the reloc_dirty_list afterwards. Meanwhile the root->reloc_root
+ * will be cleaned up on unmount.
+ *
+ * The remaining nodes will be cleaned up by free_reloc_control.
+ */
}
static void free_block_list(struct rb_root *blocks)
@@ -2580,6 +2682,7 @@ static int record_reloc_root_in_trans(struct btrfs_trans_handle *trans,
{
struct btrfs_fs_info *fs_info = reloc_root->fs_info;
struct btrfs_root *root;
+ int ret;
if (reloc_root->last_trans == trans->transid)
return 0;
@@ -2587,8 +2690,10 @@ static int record_reloc_root_in_trans(struct btrfs_trans_handle *trans,
root = read_fs_root(fs_info, reloc_root->root_key.offset);
BUG_ON(IS_ERR(root));
BUG_ON(root->reloc_root != reloc_root);
+ ret = btrfs_record_root_in_trans(trans, root);
+ btrfs_put_root(root);
- return btrfs_record_root_in_trans(trans, root);
+ return ret;
}
static noinline_for_stack
@@ -2621,7 +2726,9 @@ struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans,
BUG_ON(next->new_bytenr);
BUG_ON(!list_empty(&next->list));
next->new_bytenr = root->node->start;
- next->root = root;
+ btrfs_put_root(next->root);
+ next->root = btrfs_grab_root(root);
+ ASSERT(next->root);
list_add_tail(&next->list,
&rc->backref_cache.changed);
__mark_block_processed(rc, next);
@@ -3040,7 +3147,6 @@ static int get_tree_block_key(struct btrfs_fs_info *fs_info,
{
struct extent_buffer *eb;
- BUG_ON(block->key_ready);
eb = read_tree_block(fs_info, block->bytenr, block->key.offset,
block->level, NULL);
if (IS_ERR(eb)) {
@@ -3073,6 +3179,14 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans,
if (!node)
return 0;
+ /*
+ * If we fail here we want to drop our backref_node because we are going
+ * to start over and regenerate the tree for it.
+ */
+ ret = reserve_metadata_space(trans, rc, node);
+ if (ret)
+ goto out;
+
BUG_ON(node->processed);
root = select_one_root(node);
if (root == ERR_PTR(-ENOENT)) {
@@ -3080,12 +3194,6 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans,
goto out;
}
- if (!root || test_bit(BTRFS_ROOT_REF_COWS, &root->state)) {
- ret = reserve_metadata_space(trans, rc, node);
- if (ret)
- goto out;
- }
-
if (root) {
if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) {
BUG_ON(node->new_bytenr);
@@ -3093,7 +3201,9 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans,
btrfs_record_root_in_trans(trans, root);
root = root->reloc_root;
node->new_bytenr = root->node->start;
- node->root = root;
+ btrfs_put_root(node->root);
+ node->root = btrfs_grab_root(root);
+ ASSERT(node->root);
list_add_tail(&node->list, &rc->backref_cache.changed);
} else {
path->lowest_level = node->level;
@@ -3161,9 +3271,8 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans,
ret = relocate_tree_block(trans, rc, node, &block->key,
path);
if (ret < 0) {
- if (ret != -EAGAIN || &block->rb_node == rb_first(blocks))
- err = ret;
- goto out;
+ err = ret;
+ break;
}
}
out:
@@ -3264,6 +3373,15 @@ int setup_extent_mapping(struct inode *inode, u64 start, u64 end,
return ret;
}
+/*
+ * Allow error injection to test balance cancellation
+ */
+int btrfs_should_cancel_balance(struct btrfs_fs_info *fs_info)
+{
+ return atomic_read(&fs_info->balance_cancel_req);
+}
+ALLOW_ERROR_INJECTION(btrfs_should_cancel_balance, TRUE);
+
static int relocate_file_extent_cluster(struct inode *inode,
struct file_extent_cluster *cluster)
{
@@ -3385,6 +3503,10 @@ static int relocate_file_extent_cluster(struct inode *inode,
btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
balance_dirty_pages_ratelimited(inode->i_mapping);
btrfs_throttle(fs_info);
+ if (btrfs_should_cancel_balance(fs_info)) {
+ ret = -ECANCELED;
+ goto out;
+ }
}
WARN_ON(nr != cluster->nr);
out:
@@ -3556,31 +3678,6 @@ out:
return ret;
}
-/*
- * helper to check if the block use full backrefs for pointers in it
- */
-static int block_use_full_backref(struct reloc_control *rc,
- struct extent_buffer *eb)
-{
- u64 flags;
- int ret;
-
- if (btrfs_header_flag(eb, BTRFS_HEADER_FLAG_RELOC) ||
- btrfs_header_backref_rev(eb) < BTRFS_MIXED_BACKREF_REV)
- return 1;
-
- ret = btrfs_lookup_extent_info(NULL, rc->extent_root->fs_info,
- eb->start, btrfs_header_level(eb), 1,
- NULL, &flags);
- BUG_ON(ret);
-
- if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)
- ret = 1;
- else
- ret = 0;
- return ret;
-}
-
static int delete_block_group_cache(struct btrfs_fs_info *fs_info,
struct btrfs_block_group *block_group,
struct inode *inode,
@@ -3624,172 +3721,40 @@ out:
}
/*
- * helper to add tree blocks for backref of type BTRFS_EXTENT_DATA_REF_KEY
- * this function scans fs tree to find blocks reference the data extent
+ * Locate the free space cache EXTENT_DATA in root tree leaf and delete the
+ * cache inode, to avoid free space cache data extent blocking data relocation.
*/
-static int find_data_references(struct reloc_control *rc,
- struct btrfs_key *extent_key,
- struct extent_buffer *leaf,
- struct btrfs_extent_data_ref *ref,
- struct rb_root *blocks)
+static int delete_v1_space_cache(struct extent_buffer *leaf,
+ struct btrfs_block_group *block_group,
+ u64 data_bytenr)
{
- struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
- struct btrfs_path *path;
- struct tree_block *block;
- struct btrfs_root *root;
- struct btrfs_file_extent_item *fi;
- struct rb_node *rb_node;
+ u64 space_cache_ino;
+ struct btrfs_file_extent_item *ei;
struct btrfs_key key;
- u64 ref_root;
- u64 ref_objectid;
- u64 ref_offset;
- u32 ref_count;
- u32 nritems;
- int err = 0;
- int added = 0;
- int counted;
+ bool found = false;
+ int i;
int ret;
- ref_root = btrfs_extent_data_ref_root(leaf, ref);
- ref_objectid = btrfs_extent_data_ref_objectid(leaf, ref);
- ref_offset = btrfs_extent_data_ref_offset(leaf, ref);
- ref_count = btrfs_extent_data_ref_count(leaf, ref);
-
- /*
- * This is an extent belonging to the free space cache, lets just delete
- * it and redo the search.
- */
- if (ref_root == BTRFS_ROOT_TREE_OBJECTID) {
- ret = delete_block_group_cache(fs_info, rc->block_group,
- NULL, ref_objectid);
- if (ret != -ENOENT)
- return ret;
- ret = 0;
- }
-
- path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
- path->reada = READA_FORWARD;
-
- root = read_fs_root(fs_info, ref_root);
- if (IS_ERR(root)) {
- err = PTR_ERR(root);
- goto out;
- }
-
- key.objectid = ref_objectid;
- key.type = BTRFS_EXTENT_DATA_KEY;
- if (ref_offset > ((u64)-1 << 32))
- key.offset = 0;
- else
- key.offset = ref_offset;
-
- path->search_commit_root = 1;
- path->skip_locking = 1;
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
- if (ret < 0) {
- err = ret;
- goto out;
- }
-
- leaf = path->nodes[0];
- nritems = btrfs_header_nritems(leaf);
- /*
- * the references in tree blocks that use full backrefs
- * are not counted in
- */
- if (block_use_full_backref(rc, leaf))
- counted = 0;
- else
- counted = 1;
- rb_node = tree_search(blocks, leaf->start);
- if (rb_node) {
- if (counted)
- added = 1;
- else
- path->slots[0] = nritems;
- }
-
- while (ref_count > 0) {
- while (path->slots[0] >= nritems) {
- ret = btrfs_next_leaf(root, path);
- if (ret < 0) {
- err = ret;
- goto out;
- }
- if (WARN_ON(ret > 0))
- goto out;
-
- leaf = path->nodes[0];
- nritems = btrfs_header_nritems(leaf);
- added = 0;
-
- if (block_use_full_backref(rc, leaf))
- counted = 0;
- else
- counted = 1;
- rb_node = tree_search(blocks, leaf->start);
- if (rb_node) {
- if (counted)
- added = 1;
- else
- path->slots[0] = nritems;
- }
- }
+ if (btrfs_header_owner(leaf) != BTRFS_ROOT_TREE_OBJECTID)
+ return 0;
- btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
- if (WARN_ON(key.objectid != ref_objectid ||
- key.type != BTRFS_EXTENT_DATA_KEY))
+ for (i = 0; i < btrfs_header_nritems(leaf); i++) {
+ btrfs_item_key_to_cpu(leaf, &key, i);
+ if (key.type != BTRFS_EXTENT_DATA_KEY)
+ continue;
+ ei = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
+ if (btrfs_file_extent_type(leaf, ei) == BTRFS_FILE_EXTENT_REG &&
+ btrfs_file_extent_disk_bytenr(leaf, ei) == data_bytenr) {
+ found = true;
+ space_cache_ino = key.objectid;
break;
-
- fi = btrfs_item_ptr(leaf, path->slots[0],
- struct btrfs_file_extent_item);
-
- if (btrfs_file_extent_type(leaf, fi) ==
- BTRFS_FILE_EXTENT_INLINE)
- goto next;
-
- if (btrfs_file_extent_disk_bytenr(leaf, fi) !=
- extent_key->objectid)
- goto next;
-
- key.offset -= btrfs_file_extent_offset(leaf, fi);
- if (key.offset != ref_offset)
- goto next;
-
- if (counted)
- ref_count--;
- if (added)
- goto next;
-
- if (!tree_block_processed(leaf->start, rc)) {
- block = kmalloc(sizeof(*block), GFP_NOFS);
- if (!block) {
- err = -ENOMEM;
- break;
- }
- block->bytenr = leaf->start;
- btrfs_item_key_to_cpu(leaf, &block->key, 0);
- block->level = 0;
- block->key_ready = 1;
- rb_node = tree_insert(blocks, block->bytenr,
- &block->rb_node);
- if (rb_node)
- backref_tree_panic(rb_node, -EEXIST,
- block->bytenr);
}
- if (counted)
- added = 1;
- else
- path->slots[0] = nritems;
-next:
- path->slots[0]++;
-
}
-out:
- btrfs_free_path(path);
- return err;
+ if (!found)
+ return -ENOENT;
+ ret = delete_block_group_cache(leaf->fs_info, block_group, NULL,
+ space_cache_ino);
+ return ret;
}
/*
@@ -3801,91 +3766,41 @@ int add_data_references(struct reloc_control *rc,
struct btrfs_path *path,
struct rb_root *blocks)
{
- struct btrfs_key key;
- struct extent_buffer *eb;
- struct btrfs_extent_data_ref *dref;
- struct btrfs_extent_inline_ref *iref;
- unsigned long ptr;
- unsigned long end;
- u32 blocksize = rc->extent_root->fs_info->nodesize;
+ struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
+ struct ulist *leaves = NULL;
+ struct ulist_iterator leaf_uiter;
+ struct ulist_node *ref_node = NULL;
+ const u32 blocksize = fs_info->nodesize;
int ret = 0;
- int err = 0;
-
- eb = path->nodes[0];
- ptr = btrfs_item_ptr_offset(eb, path->slots[0]);
- end = ptr + btrfs_item_size_nr(eb, path->slots[0]);
- ptr += sizeof(struct btrfs_extent_item);
- while (ptr < end) {
- iref = (struct btrfs_extent_inline_ref *)ptr;
- key.type = btrfs_get_extent_inline_ref_type(eb, iref,
- BTRFS_REF_TYPE_DATA);
- if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
- key.offset = btrfs_extent_inline_ref_offset(eb, iref);
- ret = __add_tree_block(rc, key.offset, blocksize,
- blocks);
- } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
- dref = (struct btrfs_extent_data_ref *)(&iref->offset);
- ret = find_data_references(rc, extent_key,
- eb, dref, blocks);
- } else {
- ret = -EUCLEAN;
- btrfs_err(rc->extent_root->fs_info,
- "extent %llu slot %d has an invalid inline ref type",
- eb->start, path->slots[0]);
- }
- if (ret) {
- err = ret;
- goto out;
- }
- ptr += btrfs_extent_inline_ref_size(key.type);
- }
- WARN_ON(ptr > end);
+ btrfs_release_path(path);
+ ret = btrfs_find_all_leafs(NULL, fs_info, extent_key->objectid,
+ 0, &leaves, NULL, true);
+ if (ret < 0)
+ return ret;
- while (1) {
- cond_resched();
- eb = path->nodes[0];
- if (path->slots[0] >= btrfs_header_nritems(eb)) {
- ret = btrfs_next_leaf(rc->extent_root, path);
- if (ret < 0) {
- err = ret;
- break;
- }
- if (ret > 0)
- break;
- eb = path->nodes[0];
- }
+ ULIST_ITER_INIT(&leaf_uiter);
+ while ((ref_node = ulist_next(leaves, &leaf_uiter))) {
+ struct extent_buffer *eb;
- btrfs_item_key_to_cpu(eb, &key, path->slots[0]);
- if (key.objectid != extent_key->objectid)
+ eb = read_tree_block(fs_info, ref_node->val, 0, 0, NULL);
+ if (IS_ERR(eb)) {
+ ret = PTR_ERR(eb);
break;
-
- if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
- ret = __add_tree_block(rc, key.offset, blocksize,
- blocks);
- } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
- dref = btrfs_item_ptr(eb, path->slots[0],
- struct btrfs_extent_data_ref);
- ret = find_data_references(rc, extent_key,
- eb, dref, blocks);
- } else if (unlikely(key.type == BTRFS_EXTENT_REF_V0_KEY)) {
- btrfs_print_v0_err(eb->fs_info);
- btrfs_handle_fs_error(eb->fs_info, -EINVAL, NULL);
- ret = -EINVAL;
- } else {
- ret = 0;
}
- if (ret) {
- err = ret;
+ ret = delete_v1_space_cache(eb, rc->block_group,
+ extent_key->objectid);
+ free_extent_buffer(eb);
+ if (ret < 0)
+ break;
+ ret = __add_tree_block(rc, ref_node->val, blocksize, blocks);
+ if (ret < 0)
break;
- }
- path->slots[0]++;
}
-out:
- btrfs_release_path(path);
- if (err)
+ if (ret < 0)
free_block_list(blocks);
- return err;
+ ulist_free(leaves);
+ return ret;
}
/*
@@ -4137,12 +4052,6 @@ restart:
if (!RB_EMPTY_ROOT(&blocks)) {
ret = relocate_tree_blocks(trans, rc, &blocks);
if (ret < 0) {
- /*
- * if we fail to relocate tree blocks, force to update
- * backref cache when committing transaction.
- */
- rc->backref_cache.last_trans = trans->transid - 1;
-
if (ret != -EAGAIN) {
err = ret;
break;
@@ -4166,6 +4075,10 @@ restart:
break;
}
}
+ if (btrfs_should_cancel_balance(fs_info)) {
+ err = -ECANCELED;
+ break;
+ }
}
if (trans && progress && err == -ENOSPC) {
ret = btrfs_force_chunk_alloc(trans, rc->block_group->flags);
@@ -4195,15 +4108,23 @@ restart:
set_reloc_control(rc);
backref_cache_cleanup(&rc->backref_cache);
- btrfs_block_rsv_release(fs_info, rc->block_rsv, (u64)-1);
+ btrfs_block_rsv_release(fs_info, rc->block_rsv, (u64)-1, NULL);
+ /*
+ * Even in the case when the relocation is cancelled, we should still go
+ * through prepare_to_merge() and merge_reloc_roots().
+ *
+ * For error (including cancelled balance), prepare_to_merge() will
+ * mark all reloc trees orphan, then queue them for cleanup in
+ * merge_reloc_roots()
+ */
err = prepare_to_merge(rc, err);
merge_reloc_roots(rc);
rc->merge_reloc_tree = 0;
unset_reloc_control(rc);
- btrfs_block_rsv_release(fs_info, rc->block_rsv, (u64)-1);
+ btrfs_block_rsv_release(fs_info, rc->block_rsv, (u64)-1, NULL);
/* get rid of pinned extents */
trans = btrfs_join_transaction(rc->extent_root);
@@ -4212,10 +4133,10 @@ restart:
goto out_free;
}
btrfs_commit_transaction(trans);
+out_free:
ret = clean_dirty_subvols(rc);
if (ret < 0 && !err)
err = ret;
-out_free:
btrfs_free_block_rsv(fs_info, rc->block_rsv);
btrfs_free_path(path);
return err;
@@ -4271,8 +4192,10 @@ struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
return ERR_CAST(root);
trans = btrfs_start_transaction(root, 6);
- if (IS_ERR(trans))
+ if (IS_ERR(trans)) {
+ btrfs_put_root(root);
return ERR_CAST(trans);
+ }
err = btrfs_find_free_objectid(root, &objectid);
if (err)
@@ -4290,6 +4213,7 @@ struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
err = btrfs_orphan_add(trans, BTRFS_I(inode));
out:
+ btrfs_put_root(root);
btrfs_end_transaction(trans);
btrfs_btree_balance_dirty(fs_info);
if (err) {
@@ -4317,6 +4241,18 @@ static struct reloc_control *alloc_reloc_control(struct btrfs_fs_info *fs_info)
return rc;
}
+static void free_reloc_control(struct reloc_control *rc)
+{
+ struct mapping_node *node, *tmp;
+
+ free_reloc_roots(&rc->reloc_roots);
+ rbtree_postorder_for_each_entry_safe(node, tmp,
+ &rc->reloc_root_tree.rb_root, rb_node)
+ kfree(node);
+
+ kfree(rc);
+}
+
/*
* Print the block group being relocated
*/
@@ -4461,7 +4397,7 @@ out:
btrfs_dec_block_group_ro(rc->block_group);
iput(rc->data_inode);
btrfs_put_block_group(rc->block_group);
- kfree(rc);
+ free_reloc_control(rc);
return err;
}
@@ -4537,12 +4473,13 @@ int btrfs_recover_relocation(struct btrfs_root *root)
key.type != BTRFS_ROOT_ITEM_KEY)
break;
- reloc_root = btrfs_read_fs_root(root, &key);
+ reloc_root = btrfs_read_tree_root(root, &key);
if (IS_ERR(reloc_root)) {
err = PTR_ERR(reloc_root);
goto out;
}
+ set_bit(BTRFS_ROOT_REF_COWS, &reloc_root->state);
list_add(&reloc_root->root_list, &reloc_roots);
if (btrfs_root_refs(&reloc_root->root_item) > 0) {
@@ -4559,6 +4496,8 @@ int btrfs_recover_relocation(struct btrfs_root *root)
err = ret;
goto out;
}
+ } else {
+ btrfs_put_root(fs_root);
}
}
@@ -4584,9 +4523,8 @@ int btrfs_recover_relocation(struct btrfs_root *root)
trans = btrfs_join_transaction(rc->extent_root);
if (IS_ERR(trans)) {
- unset_reloc_control(rc);
err = PTR_ERR(trans);
- goto out_free;
+ goto out_unset;
}
rc->merge_reloc_tree = 1;
@@ -4606,17 +4544,18 @@ int btrfs_recover_relocation(struct btrfs_root *root)
if (IS_ERR(fs_root)) {
err = PTR_ERR(fs_root);
list_add_tail(&reloc_root->root_list, &reloc_roots);
- goto out_free;
+ goto out_unset;
}
err = __add_reloc_root(reloc_root);
BUG_ON(err < 0); /* -ENOMEM or logic error */
- fs_root->reloc_root = reloc_root;
+ fs_root->reloc_root = btrfs_grab_root(reloc_root);
+ btrfs_put_root(fs_root);
}
err = btrfs_commit_transaction(trans);
if (err)
- goto out_free;
+ goto out_unset;
merge_reloc_roots(rc);
@@ -4625,15 +4564,16 @@ int btrfs_recover_relocation(struct btrfs_root *root)
trans = btrfs_join_transaction(rc->extent_root);
if (IS_ERR(trans)) {
err = PTR_ERR(trans);
- goto out_free;
+ goto out_clean;
}
err = btrfs_commit_transaction(trans);
-
+out_clean:
ret = clean_dirty_subvols(rc);
if (ret < 0 && !err)
err = ret;
-out_free:
- kfree(rc);
+out_unset:
+ unset_reloc_control(rc);
+ free_reloc_control(rc);
out:
if (!list_empty(&reloc_roots))
free_reloc_roots(&reloc_roots);
@@ -4643,10 +4583,12 @@ out:
if (err == 0) {
/* cleanup orphan inode in data relocation tree */
fs_root = read_fs_root(fs_info, BTRFS_DATA_RELOC_TREE_OBJECTID);
- if (IS_ERR(fs_root))
+ if (IS_ERR(fs_root)) {
err = PTR_ERR(fs_root);
- else
+ } else {
err = btrfs_orphan_cleanup(fs_root);
+ btrfs_put_root(fs_root);
+ }
}
return err;
}
@@ -4720,11 +4662,6 @@ int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
BUG_ON(rc->stage == UPDATE_DATA_PTRS &&
root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID);
- if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
- if (buf == root->node)
- __update_reloc_root(root, cow->start);
- }
-
level = btrfs_header_level(buf);
if (btrfs_header_generation(buf) <=
btrfs_root_last_snapshot(&root->root_item))
@@ -4795,6 +4732,10 @@ void btrfs_reloc_pre_snapshot(struct btrfs_pending_snapshot *pending,
/*
* called after snapshot is created. migrate block reservation
* and create reloc root for the newly created snapshot
+ *
+ * This is similar to btrfs_init_reloc_root(), we come out of here with two
+ * references held on the reloc_root, one for root->reloc_root and one for
+ * rc->reloc_roots.
*/
int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
struct btrfs_pending_snapshot *pending)
@@ -4827,7 +4768,7 @@ int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
ret = __add_reloc_root(reloc_root);
BUG_ON(ret < 0);
- new_root->reloc_root = reloc_root;
+ new_root->reloc_root = btrfs_grab_root(reloc_root);
if (rc->create_reloc_tree)
ret = clone_backref_node(trans, rc, root, reloc_root);
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 612411c74550..668f22844017 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -22,7 +22,6 @@
static void btrfs_read_root_item(struct extent_buffer *eb, int slot,
struct btrfs_root_item *item)
{
- uuid_le uuid;
u32 len;
int need_reset = 0;
@@ -44,8 +43,7 @@ static void btrfs_read_root_item(struct extent_buffer *eb, int slot,
sizeof(*item) - offsetof(struct btrfs_root_item,
generation_v2));
- uuid_le_gen(&uuid);
- memcpy(item->uuid, uuid.b, BTRFS_UUID_SIZE);
+ generate_random_guid(item->uuid);
}
}
@@ -255,25 +253,7 @@ int btrfs_find_orphan_roots(struct btrfs_fs_info *fs_info)
root_key.objectid = key.offset;
key.offset++;
- /*
- * The root might have been inserted already, as before we look
- * for orphan roots, log replay might have happened, which
- * triggers a transaction commit and qgroup accounting, which
- * in turn reads and inserts fs roots while doing backref
- * walking.
- */
- root = btrfs_lookup_fs_root(fs_info, root_key.objectid);
- if (root) {
- WARN_ON(!test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED,
- &root->state));
- if (btrfs_root_refs(&root->root_item) == 0) {
- set_bit(BTRFS_ROOT_DEAD_TREE, &root->state);
- btrfs_add_dead_root(root);
- }
- continue;
- }
-
- root = btrfs_read_fs_root(tree_root, &root_key);
+ root = btrfs_get_fs_root(fs_info, &root_key, false);
err = PTR_ERR_OR_ZERO(root);
if (err && err != -ENOENT) {
break;
@@ -300,25 +280,12 @@ int btrfs_find_orphan_roots(struct btrfs_fs_info *fs_info)
continue;
}
- err = btrfs_init_fs_root(root);
- if (err) {
- btrfs_free_fs_root(root);
- break;
- }
-
- set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state);
-
- err = btrfs_insert_fs_root(fs_info, root);
- if (err) {
- BUG_ON(err == -EEXIST);
- btrfs_free_fs_root(root);
- break;
- }
-
+ WARN_ON(!test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state));
if (btrfs_root_refs(&root->root_item) == 0) {
set_bit(BTRFS_ROOT_DEAD_TREE, &root->state);
btrfs_add_dead_root(root);
}
+ btrfs_put_root(root);
}
btrfs_free_path(path);
@@ -553,5 +520,5 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *rsv)
{
- btrfs_block_rsv_release(fs_info, rsv, (u64)-1);
+ btrfs_block_rsv_release(fs_info, rsv, (u64)-1, NULL);
}
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 61b37c56a7fb..adaf8ab694d5 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -149,7 +149,7 @@ struct scrub_parity {
*/
unsigned long *ebitmap;
- unsigned long bitmap[0];
+ unsigned long bitmap[];
};
struct scrub_ctx {
@@ -653,7 +653,7 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
root_key.objectid = root;
root_key.type = BTRFS_ROOT_ITEM_KEY;
root_key.offset = (u64)-1;
- local_root = btrfs_read_fs_root_no_name(fs_info, &root_key);
+ local_root = btrfs_get_fs_root(fs_info, &root_key, true);
if (IS_ERR(local_root)) {
ret = PTR_ERR(local_root);
goto err;
@@ -668,6 +668,7 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
ret = btrfs_search_slot(NULL, local_root, &key, swarn->path, 0, 0);
if (ret) {
+ btrfs_put_root(local_root);
btrfs_release_path(swarn->path);
goto err;
}
@@ -688,6 +689,7 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
ipath = init_ipath(4096, local_root, swarn->path);
memalloc_nofs_restore(nofs_flag);
if (IS_ERR(ipath)) {
+ btrfs_put_root(local_root);
ret = PTR_ERR(ipath);
ipath = NULL;
goto err;
@@ -711,6 +713,7 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
min(isize - offset, (u64)PAGE_SIZE), nlink,
(char *)(unsigned long)ipath->fspath->val[i]);
+ btrfs_put_root(local_root);
free_ipath(ipath);
return 0;
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index a055b657cb85..c5f41bd86765 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -5586,10 +5586,7 @@ static int get_last_extent(struct send_ctx *sctx, u64 offset)
{
struct btrfs_path *path;
struct btrfs_root *root = sctx->send_root;
- struct btrfs_file_extent_item *fi;
struct btrfs_key key;
- u64 extent_end;
- u8 type;
int ret;
path = alloc_path_for_send();
@@ -5609,18 +5606,7 @@ static int get_last_extent(struct send_ctx *sctx, u64 offset)
if (key.objectid != sctx->cur_ino || key.type != BTRFS_EXTENT_DATA_KEY)
goto out;
- fi = btrfs_item_ptr(path->nodes[0], path->slots[0],
- struct btrfs_file_extent_item);
- type = btrfs_file_extent_type(path->nodes[0], fi);
- if (type == BTRFS_FILE_EXTENT_INLINE) {
- u64 size = btrfs_file_extent_ram_bytes(path->nodes[0], fi);
- extent_end = ALIGN(key.offset + size,
- sctx->send_root->fs_info->sectorsize);
- } else {
- extent_end = key.offset +
- btrfs_file_extent_num_bytes(path->nodes[0], fi);
- }
- sctx->cur_inode_last_extent = extent_end;
+ sctx->cur_inode_last_extent = btrfs_file_extent_end(path);
out:
btrfs_free_path(path);
return ret;
@@ -5674,16 +5660,7 @@ static int range_is_hole_in_parent(struct send_ctx *sctx,
break;
fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
- if (btrfs_file_extent_type(leaf, fi) ==
- BTRFS_FILE_EXTENT_INLINE) {
- u64 size = btrfs_file_extent_ram_bytes(leaf, fi);
-
- extent_end = ALIGN(key.offset + size,
- root->fs_info->sectorsize);
- } else {
- extent_end = key.offset +
- btrfs_file_extent_num_bytes(leaf, fi);
- }
+ extent_end = btrfs_file_extent_end(path);
if (extent_end <= start)
goto next;
if (btrfs_file_extent_disk_bytenr(leaf, fi) == 0) {
@@ -5704,9 +5681,6 @@ out:
static int maybe_send_hole(struct send_ctx *sctx, struct btrfs_path *path,
struct btrfs_key *key)
{
- struct btrfs_file_extent_item *fi;
- u64 extent_end;
- u8 type;
int ret = 0;
if (sctx->cur_ino != key->objectid || !need_send_hole(sctx))
@@ -5718,18 +5692,6 @@ static int maybe_send_hole(struct send_ctx *sctx, struct btrfs_path *path,
return ret;
}
- fi = btrfs_item_ptr(path->nodes[0], path->slots[0],
- struct btrfs_file_extent_item);
- type = btrfs_file_extent_type(path->nodes[0], fi);
- if (type == BTRFS_FILE_EXTENT_INLINE) {
- u64 size = btrfs_file_extent_ram_bytes(path->nodes[0], fi);
- extent_end = ALIGN(key->offset + size,
- sctx->send_root->fs_info->sectorsize);
- } else {
- extent_end = key->offset +
- btrfs_file_extent_num_bytes(path->nodes[0], fi);
- }
-
if (path->slots[0] == 0 &&
sctx->cur_inode_last_extent < key->offset) {
/*
@@ -5755,7 +5717,7 @@ static int maybe_send_hole(struct send_ctx *sctx, struct btrfs_path *path,
else
ret = 0;
}
- sctx->cur_inode_last_extent = extent_end;
+ sctx->cur_inode_last_extent = btrfs_file_extent_end(path);
return ret;
}
@@ -7066,7 +7028,6 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg)
int clone_sources_to_rollback = 0;
unsigned alloc_size;
int sort_clone_roots = 0;
- int index;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
@@ -7193,11 +7154,8 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg)
key.type = BTRFS_ROOT_ITEM_KEY;
key.offset = (u64)-1;
- index = srcu_read_lock(&fs_info->subvol_srcu);
-
- clone_root = btrfs_read_fs_root_no_name(fs_info, &key);
+ clone_root = btrfs_get_fs_root(fs_info, &key, true);
if (IS_ERR(clone_root)) {
- srcu_read_unlock(&fs_info->subvol_srcu, index);
ret = PTR_ERR(clone_root);
goto out;
}
@@ -7205,20 +7163,19 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg)
if (!btrfs_root_readonly(clone_root) ||
btrfs_root_dead(clone_root)) {
spin_unlock(&clone_root->root_item_lock);
- srcu_read_unlock(&fs_info->subvol_srcu, index);
+ btrfs_put_root(clone_root);
ret = -EPERM;
goto out;
}
if (clone_root->dedupe_in_progress) {
dedupe_in_progress_warn(clone_root);
spin_unlock(&clone_root->root_item_lock);
- srcu_read_unlock(&fs_info->subvol_srcu, index);
+ btrfs_put_root(clone_root);
ret = -EAGAIN;
goto out;
}
clone_root->send_in_progress++;
spin_unlock(&clone_root->root_item_lock);
- srcu_read_unlock(&fs_info->subvol_srcu, index);
sctx->clone_roots[i].root = clone_root;
clone_sources_to_rollback = i + 1;
@@ -7232,11 +7189,8 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg)
key.type = BTRFS_ROOT_ITEM_KEY;
key.offset = (u64)-1;
- index = srcu_read_lock(&fs_info->subvol_srcu);
-
- sctx->parent_root = btrfs_read_fs_root_no_name(fs_info, &key);
+ sctx->parent_root = btrfs_get_fs_root(fs_info, &key, true);
if (IS_ERR(sctx->parent_root)) {
- srcu_read_unlock(&fs_info->subvol_srcu, index);
ret = PTR_ERR(sctx->parent_root);
goto out;
}
@@ -7246,20 +7200,16 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg)
if (!btrfs_root_readonly(sctx->parent_root) ||
btrfs_root_dead(sctx->parent_root)) {
spin_unlock(&sctx->parent_root->root_item_lock);
- srcu_read_unlock(&fs_info->subvol_srcu, index);
ret = -EPERM;
goto out;
}
if (sctx->parent_root->dedupe_in_progress) {
dedupe_in_progress_warn(sctx->parent_root);
spin_unlock(&sctx->parent_root->root_item_lock);
- srcu_read_unlock(&fs_info->subvol_srcu, index);
ret = -EAGAIN;
goto out;
}
spin_unlock(&sctx->parent_root->root_item_lock);
-
- srcu_read_unlock(&fs_info->subvol_srcu, index);
}
/*
@@ -7267,7 +7217,8 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg)
* is behind the current send position. This is checked while searching
* for possible clone sources.
*/
- sctx->clone_roots[sctx->clone_roots_cnt++].root = sctx->send_root;
+ sctx->clone_roots[sctx->clone_roots_cnt++].root =
+ btrfs_grab_root(sctx->send_root);
/* We do a bsearch later */
sort(sctx->clone_roots, sctx->clone_roots_cnt,
@@ -7352,18 +7303,24 @@ out:
}
if (sort_clone_roots) {
- for (i = 0; i < sctx->clone_roots_cnt; i++)
+ for (i = 0; i < sctx->clone_roots_cnt; i++) {
btrfs_root_dec_send_in_progress(
sctx->clone_roots[i].root);
+ btrfs_put_root(sctx->clone_roots[i].root);
+ }
} else {
- for (i = 0; sctx && i < clone_sources_to_rollback; i++)
+ for (i = 0; sctx && i < clone_sources_to_rollback; i++) {
btrfs_root_dec_send_in_progress(
sctx->clone_roots[i].root);
+ btrfs_put_root(sctx->clone_roots[i].root);
+ }
btrfs_root_dec_send_in_progress(send_root);
}
- if (sctx && !IS_ERR_OR_NULL(sctx->parent_root))
+ if (sctx && !IS_ERR_OR_NULL(sctx->parent_root)) {
btrfs_root_dec_send_in_progress(sctx->parent_root);
+ btrfs_put_root(sctx->parent_root);
+ }
kvfree(clone_sources_tmp);
diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
index 01297c5b2666..8b0fe053a25d 100644
--- a/fs/btrfs/space-info.c
+++ b/fs/btrfs/space-info.c
@@ -10,6 +10,153 @@
#include "transaction.h"
#include "block-group.h"
+/*
+ * HOW DOES SPACE RESERVATION WORK
+ *
+ * If you want to know about delalloc specifically, there is a separate comment
+ * for that with the delalloc code. This comment is about how the whole system
+ * works generally.
+ *
+ * BASIC CONCEPTS
+ *
+ * 1) space_info. This is the ultimate arbiter of how much space we can use.
+ * There's a description of the bytes_ fields with the struct declaration,
+ * refer to that for specifics on each field. Suffice it to say that for
+ * reservations we care about total_bytes - SUM(space_info->bytes_) when
+ * determining if there is space to make an allocation. There is a space_info
+ * for METADATA, SYSTEM, and DATA areas.
+ *
+ * 2) block_rsv's. These are basically buckets for every different type of
+ * metadata reservation we have. You can see the comment in the block_rsv
+ * code on the rules for each type, but generally block_rsv->reserved is how
+ * much space is accounted for in space_info->bytes_may_use.
+ *
+ * 3) btrfs_calc*_size. These are the worst case calculations we use based
+ * on the number of items we will want to modify. We have one for changing
+ * items, and one for inserting new items. Generally we use these helpers to
+ * determine the size of the block reserves, and then use the actual bytes
+ * values to adjust the space_info counters.
+ *
+ * MAKING RESERVATIONS, THE NORMAL CASE
+ *
+ * We call into either btrfs_reserve_data_bytes() or
+ * btrfs_reserve_metadata_bytes(), depending on which we're looking for, with
+ * num_bytes we want to reserve.
+ *
+ * ->reserve
+ * space_info->bytes_may_use += num_bytes
+ *
+ * ->extent allocation
+ * Call btrfs_add_reserved_bytes() which does
+ * space_info->bytes_may_use -= num_bytes
+ * space_info->bytes_reserved += extent_bytes
+ *
+ * ->insert reference
+ * Call btrfs_update_block_group() which does
+ * space_info->bytes_reserved -= extent_bytes
+ * space_info->bytes_used += extent_bytes
+ *
+ * MAKING RESERVATIONS, FLUSHING NORMALLY (non-priority)
+ *
+ * Assume we are unable to simply make the reservation because we do not have
+ * enough space
+ *
+ * -> __reserve_bytes
+ * create a reserve_ticket with ->bytes set to our reservation, add it to
+ * the tail of space_info->tickets, kick async flush thread
+ *
+ * ->handle_reserve_ticket
+ * wait on ticket->wait for ->bytes to be reduced to 0, or ->error to be set
+ * on the ticket.
+ *
+ * -> btrfs_async_reclaim_metadata_space/btrfs_async_reclaim_data_space
+ * Flushes various things attempting to free up space.
+ *
+ * -> btrfs_try_granting_tickets()
+ * This is called by anything that either subtracts space from
+ * space_info->bytes_may_use, ->bytes_pinned, etc, or adds to the
+ * space_info->total_bytes. This loops through the ->priority_tickets and
+ * then the ->tickets list checking to see if the reservation can be
+ * completed. If it can the space is added to space_info->bytes_may_use and
+ * the ticket is woken up.
+ *
+ * -> ticket wakeup
+ * Check if ->bytes == 0, if so we got our reservation and we can carry
+ * on, if not return the appropriate error (ENOSPC, but can be EINTR if we
+ * were interrupted.)
+ *
+ * MAKING RESERVATIONS, FLUSHING HIGH PRIORITY
+ *
+ * Same as the above, except we add ourselves to the
+ * space_info->priority_tickets, and we do not use ticket->wait, we simply
+ * call flush_space() ourselves for the states that are safe for us to call
+ * without deadlocking and hope for the best.
+ *
+ * THE FLUSHING STATES
+ *
+ * Generally speaking we will have two cases for each state, a "nice" state
+ * and an "ALL THE THINGS" state. In btrfs we delay a lot of work in order to
+ * reduce the locking overhead on the various trees, and even to keep from
+ * doing any work at all in the case of delayed refs. Each of these delayed
+ * things however holds reservations, and so letting them run allows us to
+ * reclaim space so we can make new reservations.
+ *
+ * FLUSH_DELAYED_ITEMS
+ * Every inode has a delayed item to update the inode. Take a simple write
+ * for example, we would update the inode item at write time to update the
+ * mtime, and then again at finish_ordered_io() time in order to update the
+ * isize or bytes. We keep these delayed items to coalesce these operations
+ * into a single operation done on demand. These are an easy way to reclaim
+ * metadata space.
+ *
+ * FLUSH_DELALLOC
+ * Look at the delalloc comment to get an idea of how much space is reserved
+ * for delayed allocation. We can reclaim some of this space simply by
+ * running delalloc, but usually we need to wait for ordered extents to
+ * reclaim the bulk of this space.
+ *
+ * FLUSH_DELAYED_REFS
+ * We have a block reserve for the outstanding delayed refs space, and every
+ * delayed ref operation holds a reservation. Running these is a quick way
+ * to reclaim space, but we want to hold this until the end because COW can
+ * churn a lot and we can avoid making some extent tree modifications if we
+ * are able to delay for as long as possible.
+ *
+ * ALLOC_CHUNK
+ * We will skip this the first time through space reservation, because of
+ * overcommit and we don't want to have a lot of useless metadata space when
+ * our worst case reservations will likely never come true.
+ *
+ * RUN_DELAYED_IPUTS
+ * If we're freeing inodes we're likely freeing checksums, file extent
+ * items, and extent tree items. Loads of space could be freed up by these
+ * operations, however they won't be usable until the transaction commits.
+ *
+ * COMMIT_TRANS
+ * may_commit_transaction() is the ultimate arbiter on whether we commit the
+ * transaction or not. In order to avoid constantly churning we do all the
+ * above flushing first and then commit the transaction as the last resort.
+ * However we need to take into account things like pinned space that would
+ * be freed, plus any delayed work we may not have gotten rid of in the case
+ * of metadata.
+ *
+ * OVERCOMMIT
+ *
+ * Because we hold so many reservations for metadata we will allow you to
+ * reserve more space than is currently free in the currently allocated
+ * metadata space. This only happens with metadata, data does not allow
+ * overcommitting.
+ *
+ * You can see the current logic for when we allow overcommit in
+ * btrfs_can_overcommit(), but it only applies to unallocated space. If there
+ * is no unallocated space to be had, all reservations are kept within the
+ * free space in the allocated metadata chunks.
+ *
+ * Because of overcommitting, you generally want to use the
+ * btrfs_can_overcommit() logic for metadata allocations, as it does the right
+ * thing with or without extra unallocated space.
+ */
+
u64 __pure btrfs_space_info_used(struct btrfs_space_info *s_info,
bool may_use_included)
{
@@ -159,25 +306,19 @@ static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global)
return (global->size << 1);
}
-int btrfs_can_overcommit(struct btrfs_fs_info *fs_info,
- struct btrfs_space_info *space_info, u64 bytes,
- enum btrfs_reserve_flush_enum flush)
+static u64 calc_available_free_space(struct btrfs_fs_info *fs_info,
+ struct btrfs_space_info *space_info,
+ enum btrfs_reserve_flush_enum flush)
{
u64 profile;
u64 avail;
- u64 used;
int factor;
- /* Don't overcommit when in mixed mode. */
- if (space_info->flags & BTRFS_BLOCK_GROUP_DATA)
- return 0;
-
if (space_info->flags & BTRFS_BLOCK_GROUP_SYSTEM)
profile = btrfs_system_alloc_profile(fs_info);
else
profile = btrfs_metadata_alloc_profile(fs_info);
- used = btrfs_space_info_used(space_info, true);
avail = atomic64_read(&fs_info->free_chunk_space);
/*
@@ -198,6 +339,22 @@ int btrfs_can_overcommit(struct btrfs_fs_info *fs_info,
avail >>= 3;
else
avail >>= 1;
+ return avail;
+}
+
+int btrfs_can_overcommit(struct btrfs_fs_info *fs_info,
+ struct btrfs_space_info *space_info, u64 bytes,
+ enum btrfs_reserve_flush_enum flush)
+{
+ u64 avail;
+ u64 used;
+
+ /* Don't overcommit when in mixed mode */
+ if (space_info->flags & BTRFS_BLOCK_GROUP_DATA)
+ return 0;
+
+ used = btrfs_space_info_used(space_info, true);
+ avail = calc_available_free_space(fs_info, space_info, flush);
if (used + bytes < space_info->total_bytes + avail)
return 1;
@@ -232,6 +389,8 @@ again:
space_info,
ticket->bytes);
list_del_init(&ticket->list);
+ ASSERT(space_info->reclaim_size >= ticket->bytes);
+ space_info->reclaim_size -= ticket->bytes;
ticket->bytes = 0;
space_info->tickets_id++;
wake_up(&ticket->wait);
@@ -627,15 +786,26 @@ static inline u64
btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *space_info)
{
- struct reserve_ticket *ticket;
u64 used;
+ u64 avail;
u64 expected;
- u64 to_reclaim = 0;
+ u64 to_reclaim = space_info->reclaim_size;
+
+ lockdep_assert_held(&space_info->lock);
+
+ avail = calc_available_free_space(fs_info, space_info,
+ BTRFS_RESERVE_FLUSH_ALL);
+ used = btrfs_space_info_used(space_info, true);
+
+ /*
+ * We may be flushing because suddenly we have less space than we had
+ * before, and now we're well over-committed based on our current free
+ * space. If that's the case add in our overage so we make sure to put
+ * appropriate pressure on the flushing state machine.
+ */
+ if (space_info->total_bytes + avail < used)
+ to_reclaim += used - (space_info->total_bytes + avail);
- list_for_each_entry(ticket, &space_info->tickets, list)
- to_reclaim += ticket->bytes;
- list_for_each_entry(ticket, &space_info->priority_tickets, list)
- to_reclaim += ticket->bytes;
if (to_reclaim)
return to_reclaim;
@@ -1020,8 +1190,10 @@ static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
* the list and we will do our own flushing further down.
*/
if (ret && flush != BTRFS_RESERVE_NO_FLUSH) {
+ ASSERT(space_info->reclaim_size >= 0);
ticket.bytes = orig_bytes;
ticket.error = 0;
+ space_info->reclaim_size += ticket.bytes;
init_waitqueue_head(&ticket.wait);
if (flush == BTRFS_RESERVE_FLUSH_ALL) {
list_add_tail(&ticket.list, &space_info->tickets);
diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h
index 24514cd2c6c1..0a5001ef1481 100644
--- a/fs/btrfs/space-info.h
+++ b/fs/btrfs/space-info.h
@@ -54,6 +54,13 @@ struct btrfs_space_info {
struct list_head ro_bgs;
struct list_head priority_tickets;
struct list_head tickets;
+
+ /*
+ * Size of space that needs to be reclaimed in order to satisfy pending
+ * tickets
+ */
+ u64 reclaim_size;
+
/*
* tickets_id just indicates the next ticket will be handled, so note
* it's not stored per ticket.
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 0616a5434793..7932d8d07cff 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -244,7 +244,7 @@ void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
{
struct btrfs_fs_info *fs_info = trans->fs_info;
- trans->aborted = errno;
+ WRITE_ONCE(trans->aborted, errno);
/* Nothing used. The other threads that have joined this
* transaction may be able to continue. */
if (!trans->dirty && list_empty(&trans->new_bgs)) {
@@ -873,7 +873,7 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
break;
#endif
case Opt_err:
- btrfs_info(info, "unrecognized mount option '%s'", p);
+ btrfs_err(info, "unrecognized mount option '%s'", p);
ret = -EINVAL;
goto out;
default:
@@ -1024,11 +1024,11 @@ out:
return error;
}
-static char *get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info,
- u64 subvol_objectid)
+char *btrfs_get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info,
+ u64 subvol_objectid)
{
struct btrfs_root *root = fs_info->tree_root;
- struct btrfs_root *fs_root;
+ struct btrfs_root *fs_root = NULL;
struct btrfs_root_ref *root_ref;
struct btrfs_inode_ref *inode_ref;
struct btrfs_key key;
@@ -1096,9 +1096,10 @@ static char *get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info,
key.objectid = subvol_objectid;
key.type = BTRFS_ROOT_ITEM_KEY;
key.offset = (u64)-1;
- fs_root = btrfs_read_fs_root_no_name(fs_info, &key);
+ fs_root = btrfs_get_fs_root(fs_info, &key, true);
if (IS_ERR(fs_root)) {
ret = PTR_ERR(fs_root);
+ fs_root = NULL;
goto err;
}
@@ -1143,6 +1144,8 @@ static char *get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info,
ptr[0] = '/';
btrfs_release_path(path);
}
+ btrfs_put_root(fs_root);
+ fs_root = NULL;
}
btrfs_free_path(path);
@@ -1155,6 +1158,7 @@ static char *get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info,
return name;
err:
+ btrfs_put_root(fs_root);
btrfs_free_path(path);
kfree(name);
return ERR_PTR(ret);
@@ -1438,8 +1442,8 @@ static struct dentry *mount_subvol(const char *subvol_name, u64 subvol_objectid,
goto out;
}
}
- subvol_name = get_subvol_name_from_objectid(btrfs_sb(mnt->mnt_sb),
- subvol_objectid);
+ subvol_name = btrfs_get_subvol_name_from_objectid(
+ btrfs_sb(mnt->mnt_sb), subvol_objectid);
if (IS_ERR(subvol_name)) {
root = ERR_CAST(subvol_name);
subvol_name = NULL;
@@ -1518,14 +1522,17 @@ static struct dentry *btrfs_mount_root(struct file_system_type *fs_type,
/*
* Setup a dummy root and fs_info for test/set super. This is because
* we don't actually fill this stuff out until open_ctree, but we need
- * it for searching for existing supers, so this lets us do that and
- * then open_ctree will properly initialize everything later.
+ * it to find existing supers; open_ctree initializes the fs-specific
+ * settings later. btrfs_init_fs_info initializes the static elements
+ * of the fs_info (locks and such) to make cleanup easier if we find a
+ * superblock with our given fs_devices later on at sget() time.
*/
fs_info = kvzalloc(sizeof(struct btrfs_fs_info), GFP_KERNEL);
if (!fs_info) {
error = -ENOMEM;
goto error_sec_opts;
}
+ btrfs_init_fs_info(fs_info);
fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_KERNEL);
fs_info->super_for_commit = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_KERNEL);
@@ -1571,7 +1578,7 @@ static struct dentry *btrfs_mount_root(struct file_system_type *fs_type,
if (s->s_root) {
btrfs_close_devices(fs_devices);
- free_fs_info(fs_info);
+ btrfs_free_fs_info(fs_info);
if ((flags ^ s->s_flags) & SB_RDONLY)
error = -EBUSY;
} else {
@@ -1594,7 +1601,7 @@ static struct dentry *btrfs_mount_root(struct file_system_type *fs_type,
error_close_devices:
btrfs_close_devices(fs_devices);
error_fs_info:
- free_fs_info(fs_info);
+ btrfs_free_fs_info(fs_info);
error_sec_opts:
security_free_mnt_opts(&new_sec_opts);
return ERR_PTR(error);
@@ -1834,6 +1841,8 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
}
if (btrfs_super_log_root(fs_info->super_copy) != 0) {
+ btrfs_warn(fs_info,
+ "mount required to replay tree-log, cannot remount read-write");
ret = -EINVAL;
goto restore;
}
@@ -2168,7 +2177,7 @@ static void btrfs_kill_super(struct super_block *sb)
{
struct btrfs_fs_info *fs_info = btrfs_sb(sb);
kill_anon_super(sb);
- free_fs_info(fs_info);
+ btrfs_free_fs_info(fs_info);
}
static struct file_system_type btrfs_fs_type = {
@@ -2201,7 +2210,7 @@ static int btrfs_control_open(struct inode *inode, struct file *file)
}
/*
- * used by btrfsctl to scan devices when no FS is mounted
+ * Used by /dev/btrfs-control for device ioctls.
*/
static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
unsigned long arg)
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 7436422194da..a39bff64ff24 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -155,7 +155,7 @@ static ssize_t btrfs_feature_attr_show(struct kobject *kobj,
} else
val = can_modify_feature(fa);
- return snprintf(buf, PAGE_SIZE, "%d\n", val);
+ return scnprintf(buf, PAGE_SIZE, "%d\n", val);
}
static ssize_t btrfs_feature_attr_store(struct kobject *kobj,
@@ -295,7 +295,7 @@ static const struct attribute_group btrfs_feature_attr_group = {
static ssize_t rmdir_subvol_show(struct kobject *kobj,
struct kobj_attribute *ka, char *buf)
{
- return snprintf(buf, PAGE_SIZE, "0\n");
+ return scnprintf(buf, PAGE_SIZE, "0\n");
}
BTRFS_ATTR(static_feature, rmdir_subvol, rmdir_subvol_show);
@@ -310,12 +310,12 @@ static ssize_t supported_checksums_show(struct kobject *kobj,
* This "trick" only works as long as 'enum btrfs_csum_type' has
* no holes in it
*/
- ret += snprintf(buf + ret, PAGE_SIZE - ret, "%s%s",
+ ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%s%s",
(i == 0 ? "" : " "), btrfs_super_csum_name(i));
}
- ret += snprintf(buf + ret, PAGE_SIZE - ret, "\n");
+ ret += scnprintf(buf + ret, PAGE_SIZE - ret, "\n");
return ret;
}
BTRFS_ATTR(static_feature, supported_checksums, supported_checksums_show);
@@ -350,7 +350,7 @@ static ssize_t btrfs_discardable_bytes_show(struct kobject *kobj,
{
struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj);
- return snprintf(buf, PAGE_SIZE, "%lld\n",
+ return scnprintf(buf, PAGE_SIZE, "%lld\n",
atomic64_read(&fs_info->discard_ctl.discardable_bytes));
}
BTRFS_ATTR(discard, discardable_bytes, btrfs_discardable_bytes_show);
@@ -361,7 +361,7 @@ static ssize_t btrfs_discardable_extents_show(struct kobject *kobj,
{
struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj);
- return snprintf(buf, PAGE_SIZE, "%d\n",
+ return scnprintf(buf, PAGE_SIZE, "%d\n",
atomic_read(&fs_info->discard_ctl.discardable_extents));
}
BTRFS_ATTR(discard, discardable_extents, btrfs_discardable_extents_show);
@@ -372,7 +372,7 @@ static ssize_t btrfs_discard_bitmap_bytes_show(struct kobject *kobj,
{
struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj);
- return snprintf(buf, PAGE_SIZE, "%lld\n",
+ return scnprintf(buf, PAGE_SIZE, "%lld\n",
fs_info->discard_ctl.discard_bitmap_bytes);
}
BTRFS_ATTR(discard, discard_bitmap_bytes, btrfs_discard_bitmap_bytes_show);
@@ -383,7 +383,7 @@ static ssize_t btrfs_discard_bytes_saved_show(struct kobject *kobj,
{
struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj);
- return snprintf(buf, PAGE_SIZE, "%lld\n",
+ return scnprintf(buf, PAGE_SIZE, "%lld\n",
atomic64_read(&fs_info->discard_ctl.discard_bytes_saved));
}
BTRFS_ATTR(discard, discard_bytes_saved, btrfs_discard_bytes_saved_show);
@@ -394,7 +394,7 @@ static ssize_t btrfs_discard_extent_bytes_show(struct kobject *kobj,
{
struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj);
- return snprintf(buf, PAGE_SIZE, "%lld\n",
+ return scnprintf(buf, PAGE_SIZE, "%lld\n",
fs_info->discard_ctl.discard_extent_bytes);
}
BTRFS_ATTR(discard, discard_extent_bytes, btrfs_discard_extent_bytes_show);
@@ -405,7 +405,7 @@ static ssize_t btrfs_discard_iops_limit_show(struct kobject *kobj,
{
struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj);
- return snprintf(buf, PAGE_SIZE, "%u\n",
+ return scnprintf(buf, PAGE_SIZE, "%u\n",
READ_ONCE(fs_info->discard_ctl.iops_limit));
}
@@ -435,7 +435,7 @@ static ssize_t btrfs_discard_kbps_limit_show(struct kobject *kobj,
{
struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj);
- return snprintf(buf, PAGE_SIZE, "%u\n",
+ return scnprintf(buf, PAGE_SIZE, "%u\n",
READ_ONCE(fs_info->discard_ctl.kbps_limit));
}
@@ -465,7 +465,7 @@ static ssize_t btrfs_discard_max_discard_size_show(struct kobject *kobj,
{
struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj);
- return snprintf(buf, PAGE_SIZE, "%llu\n",
+ return scnprintf(buf, PAGE_SIZE, "%llu\n",
READ_ONCE(fs_info->discard_ctl.max_discard_size));
}
@@ -530,7 +530,7 @@ static ssize_t btrfs_show_u64(u64 *value_ptr, spinlock_t *lock, char *buf)
val = *value_ptr;
if (lock)
spin_unlock(lock);
- return snprintf(buf, PAGE_SIZE, "%llu\n", val);
+ return scnprintf(buf, PAGE_SIZE, "%llu\n", val);
}
static ssize_t global_rsv_size_show(struct kobject *kobj,
@@ -576,7 +576,7 @@ static ssize_t raid_bytes_show(struct kobject *kobj,
val += block_group->used;
}
up_read(&sinfo->groups_sem);
- return snprintf(buf, PAGE_SIZE, "%llu\n", val);
+ return scnprintf(buf, PAGE_SIZE, "%llu\n", val);
}
static struct attribute *raid_attrs[] = {
@@ -613,7 +613,7 @@ static ssize_t btrfs_space_info_show_total_bytes_pinned(struct kobject *kobj,
{
struct btrfs_space_info *sinfo = to_space_info(kobj);
s64 val = percpu_counter_sum(&sinfo->total_bytes_pinned);
- return snprintf(buf, PAGE_SIZE, "%lld\n", val);
+ return scnprintf(buf, PAGE_SIZE, "%lld\n", val);
}
SPACE_INFO_ATTR(flags);
@@ -670,7 +670,7 @@ static ssize_t btrfs_label_show(struct kobject *kobj,
ssize_t ret;
spin_lock(&fs_info->super_lock);
- ret = snprintf(buf, PAGE_SIZE, label[0] ? "%s\n" : "%s", label);
+ ret = scnprintf(buf, PAGE_SIZE, label[0] ? "%s\n" : "%s", label);
spin_unlock(&fs_info->super_lock);
return ret;
@@ -718,7 +718,7 @@ static ssize_t btrfs_nodesize_show(struct kobject *kobj,
{
struct btrfs_fs_info *fs_info = to_fs_info(kobj);
- return snprintf(buf, PAGE_SIZE, "%u\n", fs_info->super_copy->nodesize);
+ return scnprintf(buf, PAGE_SIZE, "%u\n", fs_info->super_copy->nodesize);
}
BTRFS_ATTR(, nodesize, btrfs_nodesize_show);
@@ -728,8 +728,8 @@ static ssize_t btrfs_sectorsize_show(struct kobject *kobj,
{
struct btrfs_fs_info *fs_info = to_fs_info(kobj);
- return snprintf(buf, PAGE_SIZE, "%u\n",
- fs_info->super_copy->sectorsize);
+ return scnprintf(buf, PAGE_SIZE, "%u\n",
+ fs_info->super_copy->sectorsize);
}
BTRFS_ATTR(, sectorsize, btrfs_sectorsize_show);
@@ -739,8 +739,7 @@ static ssize_t btrfs_clone_alignment_show(struct kobject *kobj,
{
struct btrfs_fs_info *fs_info = to_fs_info(kobj);
- return snprintf(buf, PAGE_SIZE, "%u\n",
- fs_info->super_copy->sectorsize);
+ return scnprintf(buf, PAGE_SIZE, "%u\n", fs_info->super_copy->sectorsize);
}
BTRFS_ATTR(, clone_alignment, btrfs_clone_alignment_show);
@@ -752,7 +751,7 @@ static ssize_t quota_override_show(struct kobject *kobj,
int quota_override;
quota_override = test_bit(BTRFS_FS_QUOTA_OVERRIDE, &fs_info->flags);
- return snprintf(buf, PAGE_SIZE, "%d\n", quota_override);
+ return scnprintf(buf, PAGE_SIZE, "%d\n", quota_override);
}
static ssize_t quota_override_store(struct kobject *kobj,
@@ -790,7 +789,7 @@ static ssize_t btrfs_metadata_uuid_show(struct kobject *kobj,
{
struct btrfs_fs_info *fs_info = to_fs_info(kobj);
- return snprintf(buf, PAGE_SIZE, "%pU\n",
+ return scnprintf(buf, PAGE_SIZE, "%pU\n",
fs_info->fs_devices->metadata_uuid);
}
@@ -802,7 +801,7 @@ static ssize_t btrfs_checksum_show(struct kobject *kobj,
struct btrfs_fs_info *fs_info = to_fs_info(kobj);
u16 csum_type = btrfs_super_csum_type(fs_info->super_copy);
- return snprintf(buf, PAGE_SIZE, "%s (%s)\n",
+ return scnprintf(buf, PAGE_SIZE, "%s (%s)\n",
btrfs_super_csum_name(csum_type),
crypto_shash_driver_name(fs_info->csum_shash));
}
@@ -901,6 +900,12 @@ static int addrm_unknown_feature_attrs(struct btrfs_fs_info *fs_info, bool add)
static void __btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs)
{
+ if (fs_devs->devinfo_kobj) {
+ kobject_del(fs_devs->devinfo_kobj);
+ kobject_put(fs_devs->devinfo_kobj);
+ fs_devs->devinfo_kobj = NULL;
+ }
+
if (fs_devs->devices_kobj) {
kobject_del(fs_devs->devices_kobj);
kobject_put(fs_devs->devices_kobj);
@@ -954,7 +959,7 @@ void btrfs_sysfs_remove_mounted(struct btrfs_fs_info *fs_info)
addrm_unknown_feature_attrs(fs_info, false);
sysfs_remove_group(&fs_info->fs_devices->fsid_kobj, &btrfs_feature_attr_group);
sysfs_remove_files(&fs_info->fs_devices->fsid_kobj, btrfs_attrs);
- btrfs_sysfs_rm_device_link(fs_info->fs_devices, NULL);
+ btrfs_sysfs_remove_devices_dir(fs_info->fs_devices, NULL);
}
static const char * const btrfs_feature_set_names[FEAT_MAX] = {
@@ -986,7 +991,7 @@ char *btrfs_printable_features(enum btrfs_feature_set set, u64 flags)
continue;
name = btrfs_feature_attrs[set][i].kobj_attr.attr.name;
- len += snprintf(str + len, bufsize - len, "%s%s",
+ len += scnprintf(str + len, bufsize - len, "%s%s",
len ? "," : "", name);
}
@@ -1143,7 +1148,7 @@ int btrfs_sysfs_add_space_info_type(struct btrfs_fs_info *fs_info,
/* when one_device is NULL, it removes all device links */
-int btrfs_sysfs_rm_device_link(struct btrfs_fs_devices *fs_devices,
+int btrfs_sysfs_remove_devices_dir(struct btrfs_fs_devices *fs_devices,
struct btrfs_device *one_device)
{
struct hd_struct *disk;
@@ -1195,11 +1200,11 @@ static ssize_t btrfs_devinfo_in_fs_metadata_show(struct kobject *kobj,
val = !!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
- return snprintf(buf, PAGE_SIZE, "%d\n", val);
+ return scnprintf(buf, PAGE_SIZE, "%d\n", val);
}
BTRFS_ATTR(devid, in_fs_metadata, btrfs_devinfo_in_fs_metadata_show);
-static ssize_t btrfs_sysfs_missing_show(struct kobject *kobj,
+static ssize_t btrfs_devinfo_missing_show(struct kobject *kobj,
struct kobj_attribute *a, char *buf)
{
int val;
@@ -1208,9 +1213,9 @@ static ssize_t btrfs_sysfs_missing_show(struct kobject *kobj,
val = !!test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
- return snprintf(buf, PAGE_SIZE, "%d\n", val);
+ return scnprintf(buf, PAGE_SIZE, "%d\n", val);
}
-BTRFS_ATTR(devid, missing, btrfs_sysfs_missing_show);
+BTRFS_ATTR(devid, missing, btrfs_devinfo_missing_show);
static ssize_t btrfs_devinfo_replace_target_show(struct kobject *kobj,
struct kobj_attribute *a,
@@ -1222,7 +1227,7 @@ static ssize_t btrfs_devinfo_replace_target_show(struct kobject *kobj,
val = !!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
- return snprintf(buf, PAGE_SIZE, "%d\n", val);
+ return scnprintf(buf, PAGE_SIZE, "%d\n", val);
}
BTRFS_ATTR(devid, replace_target, btrfs_devinfo_replace_target_show);
@@ -1235,7 +1240,7 @@ static ssize_t btrfs_devinfo_writeable_show(struct kobject *kobj,
val = !!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
- return snprintf(buf, PAGE_SIZE, "%d\n", val);
+ return scnprintf(buf, PAGE_SIZE, "%d\n", val);
}
BTRFS_ATTR(devid, writeable, btrfs_devinfo_writeable_show);
@@ -1263,7 +1268,7 @@ static struct kobj_type devid_ktype = {
.release = btrfs_release_devid_kobj,
};
-int btrfs_sysfs_add_device_link(struct btrfs_fs_devices *fs_devices,
+int btrfs_sysfs_add_devices_dir(struct btrfs_fs_devices *fs_devices,
struct btrfs_device *one_device)
{
int error = 0;
@@ -1289,7 +1294,7 @@ int btrfs_sysfs_add_device_link(struct btrfs_fs_devices *fs_devices,
init_completion(&dev->kobj_unregister);
error = kobject_init_and_add(&dev->devid_kobj, &devid_ktype,
- fs_devices->devices_kobj, "%llu",
+ fs_devices->devinfo_kobj, "%llu",
dev->devid);
if (error) {
kobject_put(&dev->devid_kobj);
@@ -1365,7 +1370,16 @@ int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs)
if (!fs_devs->devices_kobj) {
btrfs_err(fs_devs->fs_info,
"failed to init sysfs device interface");
- kobject_put(&fs_devs->fsid_kobj);
+ btrfs_sysfs_remove_fsid(fs_devs);
+ return -ENOMEM;
+ }
+
+ fs_devs->devinfo_kobj = kobject_create_and_add("devinfo",
+ &fs_devs->fsid_kobj);
+ if (!fs_devs->devinfo_kobj) {
+ btrfs_err(fs_devs->fs_info,
+ "failed to init sysfs devinfo kobject");
+ btrfs_sysfs_remove_fsid(fs_devs);
return -ENOMEM;
}
@@ -1380,13 +1394,13 @@ int btrfs_sysfs_add_mounted(struct btrfs_fs_info *fs_info)
btrfs_set_fs_info_ptr(fs_info);
- error = btrfs_sysfs_add_device_link(fs_devs, NULL);
+ error = btrfs_sysfs_add_devices_dir(fs_devs, NULL);
if (error)
return error;
error = sysfs_create_files(fsid_kobj, btrfs_attrs);
if (error) {
- btrfs_sysfs_rm_device_link(fs_devs, NULL);
+ btrfs_sysfs_remove_devices_dir(fs_devs, NULL);
return error;
}
diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h
index c68582add92e..718a26c97833 100644
--- a/fs/btrfs/sysfs.h
+++ b/fs/btrfs/sysfs.h
@@ -14,9 +14,9 @@ enum btrfs_feature_set {
char *btrfs_printable_features(enum btrfs_feature_set set, u64 flags);
const char * const btrfs_feature_set_name(enum btrfs_feature_set set);
-int btrfs_sysfs_add_device_link(struct btrfs_fs_devices *fs_devices,
+int btrfs_sysfs_add_devices_dir(struct btrfs_fs_devices *fs_devices,
struct btrfs_device *one_device);
-int btrfs_sysfs_rm_device_link(struct btrfs_fs_devices *fs_devices,
+int btrfs_sysfs_remove_devices_dir(struct btrfs_fs_devices *fs_devices,
struct btrfs_device *one_device);
int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs);
void btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs);
diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c
index 84fb3fa940a6..999c14e5d0bd 100644
--- a/fs/btrfs/tests/btrfs-tests.c
+++ b/fs/btrfs/tests/btrfs-tests.c
@@ -120,6 +120,8 @@ struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(u32 nodesize, u32 sectorsize)
kfree(fs_info);
return NULL;
}
+ INIT_LIST_HEAD(&fs_info->fs_devices->devices);
+
fs_info->super_copy = kzalloc(sizeof(struct btrfs_super_block),
GFP_KERNEL);
if (!fs_info->super_copy) {
@@ -128,39 +130,10 @@ struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(u32 nodesize, u32 sectorsize)
return NULL;
}
+ btrfs_init_fs_info(fs_info);
+
fs_info->nodesize = nodesize;
fs_info->sectorsize = sectorsize;
-
- if (init_srcu_struct(&fs_info->subvol_srcu)) {
- kfree(fs_info->fs_devices);
- kfree(fs_info->super_copy);
- kfree(fs_info);
- return NULL;
- }
-
- spin_lock_init(&fs_info->buffer_lock);
- spin_lock_init(&fs_info->qgroup_lock);
- spin_lock_init(&fs_info->super_lock);
- spin_lock_init(&fs_info->fs_roots_radix_lock);
- mutex_init(&fs_info->qgroup_ioctl_lock);
- mutex_init(&fs_info->qgroup_rescan_lock);
- rwlock_init(&fs_info->tree_mod_log_lock);
- fs_info->running_transaction = NULL;
- fs_info->qgroup_tree = RB_ROOT;
- fs_info->qgroup_ulist = NULL;
- atomic64_set(&fs_info->tree_mod_seq, 0);
- INIT_LIST_HEAD(&fs_info->dirty_qgroups);
- INIT_LIST_HEAD(&fs_info->dead_roots);
- INIT_LIST_HEAD(&fs_info->tree_mod_seq_list);
- INIT_LIST_HEAD(&fs_info->fs_devices->devices);
- INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC);
- INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC);
- extent_io_tree_init(fs_info, &fs_info->freed_extents[0],
- IO_TREE_FS_INFO_FREED_EXTENTS0, NULL);
- extent_io_tree_init(fs_info, &fs_info->freed_extents[1],
- IO_TREE_FS_INFO_FREED_EXTENTS1, NULL);
- extent_map_tree_init(&fs_info->mapping_tree);
- fs_info->pinned_extents = &fs_info->freed_extents[0];
set_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state);
test_mnt->mnt_sb->s_fs_info = fs_info;
@@ -210,8 +183,9 @@ void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info)
}
btrfs_free_qgroup_config(fs_info);
btrfs_free_fs_roots(fs_info);
- cleanup_srcu_struct(&fs_info->subvol_srcu);
kfree(fs_info->super_copy);
+ btrfs_check_leaked_roots(fs_info);
+ btrfs_extent_buffer_leak_debug_check(fs_info);
kfree(fs_info->fs_devices);
kfree(fs_info);
}
@@ -223,11 +197,7 @@ void btrfs_free_dummy_root(struct btrfs_root *root)
/* Will be freed by btrfs_free_fs_roots */
if (WARN_ON(test_bit(BTRFS_ROOT_IN_RADIX, &root->state)))
return;
- if (root->node) {
- /* One for allocate_extent_buffer */
- free_extent_buffer(root->node);
- }
- kfree(root);
+ btrfs_put_root(root);
}
struct btrfs_block_group *
diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c
index ac035a6fa003..ce1ca8e73c2d 100644
--- a/fs/btrfs/tests/qgroup-tests.c
+++ b/fs/btrfs/tests/qgroup-tests.c
@@ -507,6 +507,7 @@ int btrfs_test_qgroups(u32 sectorsize, u32 nodesize)
test_err("couldn't insert fs root %d", ret);
goto out;
}
+ btrfs_put_root(tmp_root);
tmp_root = btrfs_alloc_dummy_root(fs_info);
if (IS_ERR(tmp_root)) {
@@ -521,6 +522,7 @@ int btrfs_test_qgroups(u32 sectorsize, u32 nodesize)
test_err("couldn't insert fs root %d", ret);
goto out;
}
+ btrfs_put_root(tmp_root);
test_msg("running qgroup tests");
ret = test_no_shared_qgroup(root, sectorsize, nodesize);
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 33dcc88b428a..8cede6eb9843 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -121,6 +121,8 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction)
BUG_ON(!list_empty(&transaction->list));
WARN_ON(!RB_EMPTY_ROOT(
&transaction->delayed_refs.href_root.rb_root));
+ WARN_ON(!RB_EMPTY_ROOT(
+ &transaction->delayed_refs.dirty_extent_root));
if (transaction->delayed_refs.pending_csums)
btrfs_err(transaction->fs_info,
"pending csums is %llu",
@@ -219,7 +221,7 @@ void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans)
WARN_ON_ONCE(!list_empty(&trans->new_bgs));
btrfs_block_rsv_release(fs_info, &fs_info->chunk_block_rsv,
- trans->chunk_bytes_reserved);
+ trans->chunk_bytes_reserved, NULL);
trans->chunk_bytes_reserved = 0;
}
@@ -241,7 +243,7 @@ loop:
cur_trans = fs_info->running_transaction;
if (cur_trans) {
- if (cur_trans->aborted) {
+ if (TRANS_ABORTED(cur_trans)) {
spin_unlock(&fs_info->trans_lock);
return cur_trans->aborted;
}
@@ -334,6 +336,8 @@ loop:
list_add_tail(&cur_trans->list, &fs_info->trans_list);
extent_io_tree_init(fs_info, &cur_trans->dirty_pages,
IO_TREE_TRANS_DIRTY_PAGES, fs_info->btree_inode);
+ extent_io_tree_init(fs_info, &cur_trans->pinned_extents,
+ IO_TREE_FS_PINNED_EXTENTS, NULL);
fs_info->generation++;
cur_trans->transid = fs_info->generation;
fs_info->running_transaction = cur_trans;
@@ -457,7 +461,7 @@ static inline int is_transaction_blocked(struct btrfs_transaction *trans)
{
return (trans->state >= TRANS_STATE_COMMIT_START &&
trans->state < TRANS_STATE_UNBLOCKED &&
- !trans->aborted);
+ !TRANS_ABORTED(trans));
}
/* wait for commit against the current transaction to become unblocked
@@ -476,7 +480,7 @@ static void wait_current_trans(struct btrfs_fs_info *fs_info)
wait_event(fs_info->transaction_wait,
cur_trans->state >= TRANS_STATE_UNBLOCKED ||
- cur_trans->aborted);
+ TRANS_ABORTED(cur_trans));
btrfs_put_transaction(cur_trans);
} else {
spin_unlock(&fs_info->trans_lock);
@@ -671,7 +675,7 @@ join_fail:
alloc_fail:
if (num_bytes)
btrfs_block_rsv_release(fs_info, &fs_info->trans_block_rsv,
- num_bytes);
+ num_bytes, NULL);
reserve_fail:
btrfs_qgroup_free_meta_pertrans(root, qgroup_reserved);
return ERR_PTR(ret);
@@ -894,7 +898,7 @@ static void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans)
trace_btrfs_space_reservation(fs_info, "transaction",
trans->transid, trans->bytes_reserved, 0);
btrfs_block_rsv_release(fs_info, trans->block_rsv,
- trans->bytes_reserved);
+ trans->bytes_reserved, NULL);
trans->bytes_reserved = 0;
}
@@ -935,7 +939,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
if (throttle)
btrfs_run_delayed_iputs(info);
- if (trans->aborted ||
+ if (TRANS_ABORTED(trans) ||
test_bit(BTRFS_FS_STATE_ERROR, &info->fs_state)) {
wake_up_process(info->transaction_kthread);
err = -EIO;
@@ -1260,8 +1264,10 @@ void btrfs_add_dead_root(struct btrfs_root *root)
struct btrfs_fs_info *fs_info = root->fs_info;
spin_lock(&fs_info->trans_lock);
- if (list_empty(&root->root_list))
+ if (list_empty(&root->root_list)) {
+ btrfs_grab_root(root);
list_add_tail(&root->root_list, &fs_info->dead_roots);
+ }
spin_unlock(&fs_info->trans_lock);
}
@@ -1475,7 +1481,6 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
u64 index = 0;
u64 objectid;
u64 root_flags;
- uuid_le new_uuid;
ASSERT(pending->path);
path = pending->path;
@@ -1568,8 +1573,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
btrfs_set_root_generation_v2(new_root_item,
trans->transid);
- uuid_le_gen(&new_uuid);
- memcpy(new_root_item->uuid, new_uuid.b, BTRFS_UUID_SIZE);
+ generate_random_guid(new_root_item->uuid);
memcpy(new_root_item->parent_uuid, root->root_item.uuid,
BTRFS_UUID_SIZE);
if (!(root_flags & BTRFS_ROOT_SUBVOL_RDONLY)) {
@@ -1631,7 +1635,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
}
key.offset = (u64)-1;
- pending->snap = btrfs_read_fs_root_no_name(fs_info, &key);
+ pending->snap = btrfs_get_fs_root(fs_info, &key, true);
if (IS_ERR(pending->snap)) {
ret = PTR_ERR(pending->snap);
btrfs_abort_transaction(trans, ret);
@@ -1680,7 +1684,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
btrfs_abort_transaction(trans, ret);
goto fail;
}
- ret = btrfs_uuid_tree_add(trans, new_uuid.b, BTRFS_UUID_KEY_SUBVOL,
+ ret = btrfs_uuid_tree_add(trans, new_root_item->uuid,
+ BTRFS_UUID_KEY_SUBVOL,
objectid);
if (ret) {
btrfs_abort_transaction(trans, ret);
@@ -1792,7 +1797,8 @@ static void wait_current_trans_commit_start(struct btrfs_fs_info *fs_info,
struct btrfs_transaction *trans)
{
wait_event(fs_info->transaction_blocked_wait,
- trans->state >= TRANS_STATE_COMMIT_START || trans->aborted);
+ trans->state >= TRANS_STATE_COMMIT_START ||
+ TRANS_ABORTED(trans));
}
/*
@@ -1804,7 +1810,8 @@ static void wait_current_trans_commit_start_and_unblock(
struct btrfs_transaction *trans)
{
wait_event(fs_info->transaction_wait,
- trans->state >= TRANS_STATE_UNBLOCKED || trans->aborted);
+ trans->state >= TRANS_STATE_UNBLOCKED ||
+ TRANS_ABORTED(trans));
}
/*
@@ -2024,7 +2031,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
trans->dirty = true;
/* Stop the commit early if ->aborted is set */
- if (unlikely(READ_ONCE(cur_trans->aborted))) {
+ if (TRANS_ABORTED(cur_trans)) {
ret = cur_trans->aborted;
btrfs_end_transaction(trans);
return ret;
@@ -2098,7 +2105,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
wait_for_commit(cur_trans);
- if (unlikely(cur_trans->aborted))
+ if (TRANS_ABORTED(cur_trans))
ret = cur_trans->aborted;
btrfs_put_transaction(cur_trans);
@@ -2117,7 +2124,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
spin_unlock(&fs_info->trans_lock);
wait_for_commit(prev_trans);
- ret = prev_trans->aborted;
+ ret = READ_ONCE(prev_trans->aborted);
btrfs_put_transaction(prev_trans);
if (ret)
@@ -2171,8 +2178,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
wait_event(cur_trans->writer_wait,
atomic_read(&cur_trans->num_writers) == 1);
- /* ->aborted might be set after the previous check, so check it */
- if (unlikely(READ_ONCE(cur_trans->aborted))) {
+ if (TRANS_ABORTED(cur_trans)) {
ret = cur_trans->aborted;
goto scrub_continue;
}
@@ -2189,10 +2195,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
* core function of the snapshot creation.
*/
ret = create_pending_snapshots(trans);
- if (ret) {
- mutex_unlock(&fs_info->reloc_mutex);
- goto scrub_continue;
- }
+ if (ret)
+ goto unlock_reloc;
/*
* We insert the dir indexes of the snapshots and update the inode
@@ -2205,16 +2209,12 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
* the nodes and leaves.
*/
ret = btrfs_run_delayed_items(trans);
- if (ret) {
- mutex_unlock(&fs_info->reloc_mutex);
- goto scrub_continue;
- }
+ if (ret)
+ goto unlock_reloc;
ret = btrfs_run_delayed_refs(trans, (unsigned long)-1);
- if (ret) {
- mutex_unlock(&fs_info->reloc_mutex);
- goto scrub_continue;
- }
+ if (ret)
+ goto unlock_reloc;
/*
* make sure none of the code above managed to slip in a
@@ -2240,11 +2240,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
mutex_lock(&fs_info->tree_log_mutex);
ret = commit_fs_roots(trans);
- if (ret) {
- mutex_unlock(&fs_info->tree_log_mutex);
- mutex_unlock(&fs_info->reloc_mutex);
- goto scrub_continue;
- }
+ if (ret)
+ goto unlock_tree_log;
/*
* Since the transaction is done, we can apply the pending changes
@@ -2262,39 +2259,28 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
* new delayed refs. Must handle them or qgroup can be wrong.
*/
ret = btrfs_run_delayed_refs(trans, (unsigned long)-1);
- if (ret) {
- mutex_unlock(&fs_info->tree_log_mutex);
- mutex_unlock(&fs_info->reloc_mutex);
- goto scrub_continue;
- }
+ if (ret)
+ goto unlock_tree_log;
/*
* Since fs roots are all committed, we can get a quite accurate
* new_roots. So let's do quota accounting.
*/
ret = btrfs_qgroup_account_extents(trans);
- if (ret < 0) {
- mutex_unlock(&fs_info->tree_log_mutex);
- mutex_unlock(&fs_info->reloc_mutex);
- goto scrub_continue;
- }
+ if (ret < 0)
+ goto unlock_tree_log;
ret = commit_cowonly_roots(trans);
- if (ret) {
- mutex_unlock(&fs_info->tree_log_mutex);
- mutex_unlock(&fs_info->reloc_mutex);
- goto scrub_continue;
- }
+ if (ret)
+ goto unlock_tree_log;
/*
* The tasks which save the space cache and inode cache may also
* update ->aborted, check it.
*/
- if (unlikely(READ_ONCE(cur_trans->aborted))) {
+ if (TRANS_ABORTED(cur_trans)) {
ret = cur_trans->aborted;
- mutex_unlock(&fs_info->tree_log_mutex);
- mutex_unlock(&fs_info->reloc_mutex);
- goto scrub_continue;
+ goto unlock_tree_log;
}
btrfs_prepare_extent_commit(fs_info);
@@ -2341,6 +2327,10 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
if (ret) {
btrfs_handle_fs_error(fs_info, ret,
"Error while writing out transaction");
+ /*
+ * reloc_mutex has been unlocked, tree_log_mutex is still held,
+ * but jumping to unlock_tree_log would unlock reloc_mutex again
+ */
mutex_unlock(&fs_info->tree_log_mutex);
goto scrub_continue;
}
@@ -2389,6 +2379,10 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
return ret;
+unlock_tree_log:
+ mutex_unlock(&fs_info->tree_log_mutex);
+unlock_reloc:
+ mutex_unlock(&fs_info->reloc_mutex);
scrub_continue:
btrfs_scrub_continue(fs_info);
cleanup_transaction:
@@ -2432,13 +2426,18 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root)
btrfs_debug(fs_info, "cleaner removing %llu", root->root_key.objectid);
btrfs_kill_all_delayed_nodes(root);
+ if (root->ino_cache_inode) {
+ iput(root->ino_cache_inode);
+ root->ino_cache_inode = NULL;
+ }
if (btrfs_header_backref_rev(root->node) <
BTRFS_MIXED_BACKREF_REV)
- ret = btrfs_drop_snapshot(root, NULL, 0, 0);
+ ret = btrfs_drop_snapshot(root, 0, 0);
else
- ret = btrfs_drop_snapshot(root, NULL, 1, 0);
+ ret = btrfs_drop_snapshot(root, 1, 0);
+ btrfs_put_root(root);
return (ret < 0) ? 0 : 1;
}
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 49f7196368f5..31ae8d273065 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -71,6 +71,7 @@ struct btrfs_transaction {
*/
struct list_head io_bgs;
struct list_head dropped_roots;
+ struct extent_io_tree pinned_extents;
/*
* we need to make sure block group deletion doesn't race with
@@ -115,6 +116,10 @@ struct btrfs_trans_handle {
struct btrfs_block_rsv *orig_rsv;
refcount_t use_count;
unsigned int type;
+ /*
+ * Error code of transaction abort, set outside of locks and must use
+ * the READ_ONCE/WRITE_ONCE access
+ */
short aborted;
bool adding_csums;
bool allocating_chunk;
@@ -126,6 +131,14 @@ struct btrfs_trans_handle {
struct list_head new_bgs;
};
+/*
+ * The abort status can be changed between calls and is not protected by locks.
+ * This accepts btrfs_transaction and btrfs_trans_handle as types. Once it's
+ * set to a non-zero value it does not change, so the macro should be used
+ * in checks but is not necessary for further reads of the value.
+ */
+#define TRANS_ABORTED(trans) (unlikely(READ_ONCE((trans)->aborted)))
+
struct btrfs_pending_snapshot {
struct dentry *dentry;
struct inode *dir;
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 7dd7552f53a4..58c111474ba5 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -18,6 +18,8 @@
#include "compression.h"
#include "qgroup.h"
#include "inode-map.h"
+#include "block-group.h"
+#include "space-info.h"
/* magic values for the inode_only field in btrfs_log_inode:
*
@@ -94,8 +96,8 @@ enum {
static int btrfs_log_inode(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct btrfs_inode *inode,
int inode_only,
- const loff_t start,
- const loff_t end,
+ u64 start,
+ u64 end,
struct btrfs_log_ctx *ctx);
static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
@@ -311,7 +313,7 @@ static int process_one_buffer(struct btrfs_root *log,
}
if (wc->pin)
- ret = btrfs_pin_extent_for_log_replay(fs_info, eb->start,
+ ret = btrfs_pin_extent_for_log_replay(wc->trans, eb->start,
eb->len);
if (!ret && btrfs_buffer_uptodate(eb, gen, 0)) {
@@ -830,6 +832,11 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
goto out;
}
+ ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), start,
+ extent_end - start);
+ if (ret)
+ goto out;
+
inode_add_bytes(inode, nbytes);
update_inode:
ret = btrfs_update_inode(trans, root, inode);
@@ -2659,18 +2666,39 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
return ret;
}
+/*
+ * Correctly adjust the reserved bytes occupied by a log tree extent buffer
+ */
+static void unaccount_log_buffer(struct btrfs_fs_info *fs_info, u64 start)
+{
+ struct btrfs_block_group *cache;
+
+ cache = btrfs_lookup_block_group(fs_info, start);
+ if (!cache) {
+ btrfs_err(fs_info, "unable to find block group for %llu", start);
+ return;
+ }
+
+ spin_lock(&cache->space_info->lock);
+ spin_lock(&cache->lock);
+ cache->reserved -= fs_info->nodesize;
+ cache->space_info->bytes_reserved -= fs_info->nodesize;
+ spin_unlock(&cache->lock);
+ spin_unlock(&cache->space_info->lock);
+
+ btrfs_put_block_group(cache);
+}
+
static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path, int *level,
struct walk_control *wc)
{
struct btrfs_fs_info *fs_info = root->fs_info;
- u64 root_owner;
u64 bytenr;
u64 ptr_gen;
struct extent_buffer *next;
struct extent_buffer *cur;
- struct extent_buffer *parent;
u32 blocksize;
int ret = 0;
@@ -2690,9 +2718,6 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
btrfs_node_key_to_cpu(cur, &first_key, path->slots[*level]);
blocksize = fs_info->nodesize;
- parent = path->nodes[*level];
- root_owner = btrfs_header_owner(parent);
-
next = btrfs_find_create_tree_block(fs_info, bytenr);
if (IS_ERR(next))
return PTR_ERR(next);
@@ -2720,18 +2745,16 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
btrfs_clean_tree_block(next);
btrfs_wait_tree_block_writeback(next);
btrfs_tree_unlock(next);
+ ret = btrfs_pin_reserved_extent(trans,
+ bytenr, blocksize);
+ if (ret) {
+ free_extent_buffer(next);
+ return ret;
+ }
} else {
if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags))
clear_extent_buffer_dirty(next);
- }
-
- WARN_ON(root_owner !=
- BTRFS_TREE_LOG_OBJECTID);
- ret = btrfs_pin_reserved_extent(fs_info,
- bytenr, blocksize);
- if (ret) {
- free_extent_buffer(next);
- return ret;
+ unaccount_log_buffer(fs_info, bytenr);
}
}
free_extent_buffer(next);
@@ -2762,7 +2785,6 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
struct walk_control *wc)
{
struct btrfs_fs_info *fs_info = root->fs_info;
- u64 root_owner;
int i;
int slot;
int ret;
@@ -2775,13 +2797,6 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
WARN_ON(*level == 0);
return 0;
} else {
- struct extent_buffer *parent;
- if (path->nodes[*level] == root->node)
- parent = path->nodes[*level];
- else
- parent = path->nodes[*level + 1];
-
- root_owner = btrfs_header_owner(parent);
ret = wc->process_func(root, path->nodes[*level], wc,
btrfs_header_generation(path->nodes[*level]),
*level);
@@ -2799,17 +2814,18 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
btrfs_clean_tree_block(next);
btrfs_wait_tree_block_writeback(next);
btrfs_tree_unlock(next);
+ ret = btrfs_pin_reserved_extent(trans,
+ path->nodes[*level]->start,
+ path->nodes[*level]->len);
+ if (ret)
+ return ret;
} else {
if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags))
clear_extent_buffer_dirty(next);
- }
- WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
- ret = btrfs_pin_reserved_extent(fs_info,
- path->nodes[*level]->start,
- path->nodes[*level]->len);
- if (ret)
- return ret;
+ unaccount_log_buffer(fs_info,
+ path->nodes[*level]->start);
+ }
}
free_extent_buffer(path->nodes[*level]);
path->nodes[*level] = NULL;
@@ -2880,15 +2896,15 @@ static int walk_log_tree(struct btrfs_trans_handle *trans,
btrfs_clean_tree_block(next);
btrfs_wait_tree_block_writeback(next);
btrfs_tree_unlock(next);
+ ret = btrfs_pin_reserved_extent(trans,
+ next->start, next->len);
+ if (ret)
+ goto out;
} else {
if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags))
clear_extent_buffer_dirty(next);
+ unaccount_log_buffer(fs_info, next->start);
}
-
- ret = btrfs_pin_reserved_extent(fs_info, next->start,
- next->len);
- if (ret)
- goto out;
}
}
@@ -3283,8 +3299,7 @@ static void free_log_tree(struct btrfs_trans_handle *trans,
clear_extent_bits(&log->dirty_log_pages, 0, (u64)-1,
EXTENT_DIRTY | EXTENT_NEW | EXTENT_NEED_WAIT);
- free_extent_buffer(log->node);
- kfree(log);
+ btrfs_put_root(log);
}
/*
@@ -4518,13 +4533,15 @@ static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans,
static int btrfs_log_holes(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_inode *inode,
- struct btrfs_path *path)
+ struct btrfs_path *path,
+ const u64 start,
+ const u64 end)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_key key;
const u64 ino = btrfs_ino(inode);
const u64 i_size = i_size_read(&inode->vfs_inode);
- u64 prev_extent_end = 0;
+ u64 prev_extent_end = start;
int ret;
if (!btrfs_fs_incompat(fs_info, NO_HOLES) || i_size == 0)
@@ -4532,16 +4549,21 @@ static int btrfs_log_holes(struct btrfs_trans_handle *trans,
key.objectid = ino;
key.type = BTRFS_EXTENT_DATA_KEY;
- key.offset = 0;
+ key.offset = start;
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
return ret;
+ if (ret > 0 && path->slots[0] > 0) {
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1);
+ if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY)
+ path->slots[0]--;
+ }
+
while (true) {
- struct btrfs_file_extent_item *extent;
struct extent_buffer *leaf = path->nodes[0];
- u64 len;
+ u64 extent_end;
if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
ret = btrfs_next_leaf(root, path);
@@ -4558,9 +4580,18 @@ static int btrfs_log_holes(struct btrfs_trans_handle *trans,
if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY)
break;
+ extent_end = btrfs_file_extent_end(path);
+ if (extent_end <= start)
+ goto next_slot;
+
/* We have a hole, log it. */
if (prev_extent_end < key.offset) {
- const u64 hole_len = key.offset - prev_extent_end;
+ u64 hole_len;
+
+ if (key.offset >= end)
+ hole_len = end - prev_extent_end;
+ else
+ hole_len = key.offset - prev_extent_end;
/*
* Release the path to avoid deadlocks with other code
@@ -4590,27 +4621,20 @@ static int btrfs_log_holes(struct btrfs_trans_handle *trans,
leaf = path->nodes[0];
}
- extent = btrfs_item_ptr(leaf, path->slots[0],
- struct btrfs_file_extent_item);
- if (btrfs_file_extent_type(leaf, extent) ==
- BTRFS_FILE_EXTENT_INLINE) {
- len = btrfs_file_extent_ram_bytes(leaf, extent);
- prev_extent_end = ALIGN(key.offset + len,
- fs_info->sectorsize);
- } else {
- len = btrfs_file_extent_num_bytes(leaf, extent);
- prev_extent_end = key.offset + len;
- }
-
+ prev_extent_end = min(extent_end, end);
+ if (extent_end >= end)
+ break;
+next_slot:
path->slots[0]++;
cond_resched();
}
- if (prev_extent_end < i_size) {
+ if (prev_extent_end < end && prev_extent_end < i_size) {
u64 hole_len;
btrfs_release_path(path);
- hole_len = ALIGN(i_size - prev_extent_end, fs_info->sectorsize);
+ hole_len = min(ALIGN(i_size, fs_info->sectorsize), end);
+ hole_len -= prev_extent_end;
ret = btrfs_insert_file_extent(trans, root->log_root,
ino, prev_extent_end, 0, 0,
hole_len, 0, hole_len,
@@ -4938,6 +4962,178 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
return ret;
}
+static int copy_inode_items_to_log(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode,
+ struct btrfs_key *min_key,
+ const struct btrfs_key *max_key,
+ struct btrfs_path *path,
+ struct btrfs_path *dst_path,
+ const u64 logged_isize,
+ const bool recursive_logging,
+ const int inode_only,
+ const u64 start,
+ const u64 end,
+ struct btrfs_log_ctx *ctx,
+ bool *need_log_inode_item)
+{
+ struct btrfs_root *root = inode->root;
+ int ins_start_slot = 0;
+ int ins_nr = 0;
+ int ret;
+
+ /*
+ * We must make sure we don't copy extent items that are entirely out of
+ * the range [start, end - 1]. This is not just an optimization to avoid
+ * copying but also needed to avoid a corruption where we end up with
+ * file extent items in the log tree that have overlapping ranges - this
+ * can happen if we race with ordered extent completion for ranges that
+ * are outside our target range. For example we copy an extent item and
+ * when we move to the next leaf, that extent was trimmed and a new one
+ * covering a subrange of it, but with a higher key, was inserted - we
+ * would then copy this other extent too, resulting in a log tree with
+ * 2 extent items that represent overlapping ranges.
+ *
+ * We can copy the entire extents at the range boundaries however, even
+ * if they cover an area outside the target range. That's ok.
+ */
+ while (1) {
+ ret = btrfs_search_forward(root, min_key, path, trans->transid);
+ if (ret < 0)
+ return ret;
+ if (ret > 0) {
+ ret = 0;
+ break;
+ }
+again:
+ /* Note, ins_nr might be > 0 here, cleanup outside the loop */
+ if (min_key->objectid != max_key->objectid)
+ break;
+ if (min_key->type > max_key->type)
+ break;
+
+ if (min_key->type == BTRFS_INODE_ITEM_KEY)
+ *need_log_inode_item = false;
+
+ if ((min_key->type == BTRFS_INODE_REF_KEY ||
+ min_key->type == BTRFS_INODE_EXTREF_KEY) &&
+ inode->generation == trans->transid &&
+ !recursive_logging) {
+ u64 other_ino = 0;
+ u64 other_parent = 0;
+
+ ret = btrfs_check_ref_name_override(path->nodes[0],
+ path->slots[0], min_key, inode,
+ &other_ino, &other_parent);
+ if (ret < 0) {
+ return ret;
+ } else if (ret > 0 && ctx &&
+ other_ino != btrfs_ino(BTRFS_I(ctx->inode))) {
+ if (ins_nr > 0) {
+ ins_nr++;
+ } else {
+ ins_nr = 1;
+ ins_start_slot = path->slots[0];
+ }
+ ret = copy_items(trans, inode, dst_path, path,
+ ins_start_slot, ins_nr,
+ inode_only, logged_isize);
+ if (ret < 0)
+ return ret;
+ ins_nr = 0;
+
+ ret = log_conflicting_inodes(trans, root, path,
+ ctx, other_ino, other_parent);
+ if (ret)
+ return ret;
+ btrfs_release_path(path);
+ goto next_key;
+ }
+ }
+
+ /* Skip xattrs, we log them later with btrfs_log_all_xattrs() */
+ if (min_key->type == BTRFS_XATTR_ITEM_KEY) {
+ if (ins_nr == 0)
+ goto next_slot;
+ ret = copy_items(trans, inode, dst_path, path,
+ ins_start_slot,
+ ins_nr, inode_only, logged_isize);
+ if (ret < 0)
+ return ret;
+ ins_nr = 0;
+ goto next_slot;
+ }
+
+ if (min_key->type == BTRFS_EXTENT_DATA_KEY) {
+ const u64 extent_end = btrfs_file_extent_end(path);
+
+ if (extent_end <= start) {
+ if (ins_nr > 0) {
+ ret = copy_items(trans, inode, dst_path,
+ path, ins_start_slot,
+ ins_nr, inode_only,
+ logged_isize);
+ if (ret < 0)
+ return ret;
+ ins_nr = 0;
+ }
+ goto next_slot;
+ }
+ if (extent_end >= end) {
+ ins_nr++;
+ if (ins_nr == 1)
+ ins_start_slot = path->slots[0];
+ break;
+ }
+ }
+
+ if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) {
+ ins_nr++;
+ goto next_slot;
+ } else if (!ins_nr) {
+ ins_start_slot = path->slots[0];
+ ins_nr = 1;
+ goto next_slot;
+ }
+
+ ret = copy_items(trans, inode, dst_path, path, ins_start_slot,
+ ins_nr, inode_only, logged_isize);
+ if (ret < 0)
+ return ret;
+ ins_nr = 1;
+ ins_start_slot = path->slots[0];
+next_slot:
+ path->slots[0]++;
+ if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) {
+ btrfs_item_key_to_cpu(path->nodes[0], min_key,
+ path->slots[0]);
+ goto again;
+ }
+ if (ins_nr) {
+ ret = copy_items(trans, inode, dst_path, path,
+ ins_start_slot, ins_nr, inode_only,
+ logged_isize);
+ if (ret < 0)
+ return ret;
+ ins_nr = 0;
+ }
+ btrfs_release_path(path);
+next_key:
+ if (min_key->offset < (u64)-1) {
+ min_key->offset++;
+ } else if (min_key->type < max_key->type) {
+ min_key->type++;
+ min_key->offset = 0;
+ } else {
+ break;
+ }
+ }
+ if (ins_nr)
+ ret = copy_items(trans, inode, dst_path, path, ins_start_slot,
+ ins_nr, inode_only, logged_isize);
+
+ return ret;
+}
+
/* log a single inode in the tree log.
* At least one parent directory for this inode must exist in the tree
* or be logged already.
@@ -4955,8 +5151,8 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
static int btrfs_log_inode(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct btrfs_inode *inode,
int inode_only,
- const loff_t start,
- const loff_t end,
+ u64 start,
+ u64 end,
struct btrfs_log_ctx *ctx)
{
struct btrfs_fs_info *fs_info = root->fs_info;
@@ -4967,9 +5163,6 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
struct btrfs_root *log = root->log_root;
int err = 0;
int ret;
- int nritems;
- int ins_start_slot = 0;
- int ins_nr;
bool fast_search = false;
u64 ino = btrfs_ino(inode);
struct extent_map_tree *em_tree = &inode->extent_tree;
@@ -4987,6 +5180,9 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
return -ENOMEM;
}
+ start = ALIGN_DOWN(start, fs_info->sectorsize);
+ end = ALIGN(end, fs_info->sectorsize);
+
min_key.objectid = ino;
min_key.type = BTRFS_INODE_ITEM_KEY;
min_key.offset = 0;
@@ -5100,139 +5296,12 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
goto out_unlock;
}
- while (1) {
- ins_nr = 0;
- ret = btrfs_search_forward(root, &min_key,
- path, trans->transid);
- if (ret < 0) {
- err = ret;
- goto out_unlock;
- }
- if (ret != 0)
- break;
-again:
- /* note, ins_nr might be > 0 here, cleanup outside the loop */
- if (min_key.objectid != ino)
- break;
- if (min_key.type > max_key.type)
- break;
-
- if (min_key.type == BTRFS_INODE_ITEM_KEY)
- need_log_inode_item = false;
-
- if ((min_key.type == BTRFS_INODE_REF_KEY ||
- min_key.type == BTRFS_INODE_EXTREF_KEY) &&
- inode->generation == trans->transid &&
- !recursive_logging) {
- u64 other_ino = 0;
- u64 other_parent = 0;
-
- ret = btrfs_check_ref_name_override(path->nodes[0],
- path->slots[0], &min_key, inode,
- &other_ino, &other_parent);
- if (ret < 0) {
- err = ret;
- goto out_unlock;
- } else if (ret > 0 && ctx &&
- other_ino != btrfs_ino(BTRFS_I(ctx->inode))) {
- if (ins_nr > 0) {
- ins_nr++;
- } else {
- ins_nr = 1;
- ins_start_slot = path->slots[0];
- }
- ret = copy_items(trans, inode, dst_path, path,
- ins_start_slot,
- ins_nr, inode_only,
- logged_isize);
- if (ret < 0) {
- err = ret;
- goto out_unlock;
- }
- ins_nr = 0;
-
- err = log_conflicting_inodes(trans, root, path,
- ctx, other_ino, other_parent);
- if (err)
- goto out_unlock;
- btrfs_release_path(path);
- goto next_key;
- }
- }
-
- /* Skip xattrs, we log them later with btrfs_log_all_xattrs() */
- if (min_key.type == BTRFS_XATTR_ITEM_KEY) {
- if (ins_nr == 0)
- goto next_slot;
- ret = copy_items(trans, inode, dst_path, path,
- ins_start_slot,
- ins_nr, inode_only, logged_isize);
- if (ret < 0) {
- err = ret;
- goto out_unlock;
- }
- ins_nr = 0;
- goto next_slot;
- }
-
- if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) {
- ins_nr++;
- goto next_slot;
- } else if (!ins_nr) {
- ins_start_slot = path->slots[0];
- ins_nr = 1;
- goto next_slot;
- }
-
- ret = copy_items(trans, inode, dst_path, path,
- ins_start_slot, ins_nr, inode_only,
- logged_isize);
- if (ret < 0) {
- err = ret;
- goto out_unlock;
- }
- ins_nr = 1;
- ins_start_slot = path->slots[0];
-next_slot:
-
- nritems = btrfs_header_nritems(path->nodes[0]);
- path->slots[0]++;
- if (path->slots[0] < nritems) {
- btrfs_item_key_to_cpu(path->nodes[0], &min_key,
- path->slots[0]);
- goto again;
- }
- if (ins_nr) {
- ret = copy_items(trans, inode, dst_path, path,
- ins_start_slot,
- ins_nr, inode_only, logged_isize);
- if (ret < 0) {
- err = ret;
- goto out_unlock;
- }
- ins_nr = 0;
- }
- btrfs_release_path(path);
-next_key:
- if (min_key.offset < (u64)-1) {
- min_key.offset++;
- } else if (min_key.type < max_key.type) {
- min_key.type++;
- min_key.offset = 0;
- } else {
- break;
- }
- }
- if (ins_nr) {
- ret = copy_items(trans, inode, dst_path, path,
- ins_start_slot, ins_nr, inode_only,
- logged_isize);
- if (ret < 0) {
- err = ret;
- goto out_unlock;
- }
- ins_nr = 0;
- }
+ err = copy_inode_items_to_log(trans, inode, &min_key, &max_key,
+ path, dst_path, logged_isize,
+ recursive_logging, inode_only,
+ start, end, ctx, &need_log_inode_item);
+ if (err)
+ goto out_unlock;
btrfs_release_path(path);
btrfs_release_path(dst_path);
@@ -5243,7 +5312,7 @@ next_key:
if (max_key.type >= BTRFS_EXTENT_DATA_KEY && !fast_search) {
btrfs_release_path(path);
btrfs_release_path(dst_path);
- err = btrfs_log_holes(trans, root, inode, path);
+ err = btrfs_log_holes(trans, root, inode, path, start, end);
if (err)
goto out_unlock;
}
@@ -6145,7 +6214,7 @@ again:
if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID)
break;
- log = btrfs_read_fs_root(log_root_tree, &found_key);
+ log = btrfs_read_tree_root(log_root_tree, &found_key);
if (IS_ERR(log)) {
ret = PTR_ERR(log);
btrfs_handle_fs_error(fs_info, ret,
@@ -6157,7 +6226,7 @@ again:
tmp_key.type = BTRFS_ROOT_ITEM_KEY;
tmp_key.offset = (u64)-1;
- wc.replay_dest = btrfs_read_fs_root_no_name(fs_info, &tmp_key);
+ wc.replay_dest = btrfs_get_fs_root(fs_info, &tmp_key, true);
if (IS_ERR(wc.replay_dest)) {
ret = PTR_ERR(wc.replay_dest);
@@ -6173,12 +6242,10 @@ again:
* each subsequent pass.
*/
if (ret == -ENOENT)
- ret = btrfs_pin_extent_for_log_replay(fs_info,
+ ret = btrfs_pin_extent_for_log_replay(trans,
log->node->start,
log->node->len);
- free_extent_buffer(log->node);
- free_extent_buffer(log->commit_root);
- kfree(log);
+ btrfs_put_root(log);
if (!ret)
goto next;
@@ -6214,9 +6281,8 @@ again:
}
wc.replay_dest->log_root = NULL;
- free_extent_buffer(log->node);
- free_extent_buffer(log->commit_root);
- kfree(log);
+ btrfs_put_root(wc.replay_dest);
+ btrfs_put_root(log);
if (ret)
goto error;
@@ -6247,10 +6313,9 @@ next:
if (ret)
return ret;
- free_extent_buffer(log_root_tree->node);
log_root_tree->log_root = NULL;
clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags);
- kfree(log_root_tree);
+ btrfs_put_root(log_root_tree);
return 0;
error:
diff --git a/fs/btrfs/uuid-tree.c b/fs/btrfs/uuid-tree.c
index 76b84f2397b1..76671a6bcb61 100644
--- a/fs/btrfs/uuid-tree.c
+++ b/fs/btrfs/uuid-tree.c
@@ -246,9 +246,53 @@ out:
return ret;
}
-int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info,
- int (*check_func)(struct btrfs_fs_info *, u8 *, u8,
- u64))
+/*
+ * Check if there's a matching subvolume for the given UUID
+ *
+ * Return:
+ * 0 check succeeded, the entry is not outdated
+ * > 0 if the check failed, the caller should remove the entry
+ * < 0 if an error occurred
+ */
+static int btrfs_check_uuid_tree_entry(struct btrfs_fs_info *fs_info,
+ u8 *uuid, u8 type, u64 subvolid)
+{
+ struct btrfs_key key;
+ int ret = 0;
+ struct btrfs_root *subvol_root;
+
+ if (type != BTRFS_UUID_KEY_SUBVOL &&
+ type != BTRFS_UUID_KEY_RECEIVED_SUBVOL)
+ goto out;
+
+ key.objectid = subvolid;
+ key.type = BTRFS_ROOT_ITEM_KEY;
+ key.offset = (u64)-1;
+ subvol_root = btrfs_get_fs_root(fs_info, &key, true);
+ if (IS_ERR(subvol_root)) {
+ ret = PTR_ERR(subvol_root);
+ if (ret == -ENOENT)
+ ret = 1;
+ goto out;
+ }
+
+ switch (type) {
+ case BTRFS_UUID_KEY_SUBVOL:
+ if (memcmp(uuid, subvol_root->root_item.uuid, BTRFS_UUID_SIZE))
+ ret = 1;
+ break;
+ case BTRFS_UUID_KEY_RECEIVED_SUBVOL:
+ if (memcmp(uuid, subvol_root->root_item.received_uuid,
+ BTRFS_UUID_SIZE))
+ ret = 1;
+ break;
+ }
+ btrfs_put_root(subvol_root);
+out:
+ return ret;
+}
+
+int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info)
{
struct btrfs_root *root = fs_info->uuid_root;
struct btrfs_key key;
@@ -278,6 +322,10 @@ again_search_slot:
}
while (1) {
+ if (btrfs_fs_closing(fs_info)) {
+ ret = -EINTR;
+ goto out;
+ }
cond_resched();
leaf = path->nodes[0];
slot = path->slots[0];
@@ -305,7 +353,8 @@ again_search_slot:
read_extent_buffer(leaf, &subid_le, offset,
sizeof(subid_le));
subid_cpu = le64_to_cpu(subid_le);
- ret = check_func(fs_info, uuid, key.type, subid_cpu);
+ ret = btrfs_check_uuid_tree_entry(fs_info, uuid,
+ key.type, subid_cpu);
if (ret < 0)
goto out;
if (ret > 0) {
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 9cfc668f91f4..c1909e5f4506 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -6,7 +6,6 @@
#include <linux/sched.h>
#include <linux/bio.h>
#include <linux/slab.h>
-#include <linux/buffer_head.h>
#include <linux/blkdev.h>
#include <linux/ratelimit.h>
#include <linux/kthread.h>
@@ -500,7 +499,7 @@ static struct btrfs_fs_devices *find_fsid_with_metadata_uuid(
static int
btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder,
int flush, struct block_device **bdev,
- struct buffer_head **bh)
+ struct btrfs_super_block **disk_super)
{
int ret;
@@ -519,9 +518,9 @@ btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder,
goto error;
}
invalidate_bdev(*bdev);
- *bh = btrfs_read_dev_super(*bdev);
- if (IS_ERR(*bh)) {
- ret = PTR_ERR(*bh);
+ *disk_super = btrfs_read_dev_super(*bdev);
+ if (IS_ERR(*disk_super)) {
+ ret = PTR_ERR(*disk_super);
blkdev_put(*bdev, flags);
goto error;
}
@@ -530,7 +529,6 @@ btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder,
error:
*bdev = NULL;
- *bh = NULL;
return ret;
}
@@ -611,7 +609,6 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
{
struct request_queue *q;
struct block_device *bdev;
- struct buffer_head *bh;
struct btrfs_super_block *disk_super;
u64 devid;
int ret;
@@ -622,17 +619,16 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
return -EINVAL;
ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1,
- &bdev, &bh);
+ &bdev, &disk_super);
if (ret)
return ret;
- disk_super = (struct btrfs_super_block *)bh->b_data;
devid = btrfs_stack_device_id(&disk_super->dev_item);
if (devid != device->devid)
- goto error_brelse;
+ goto error_free_page;
if (memcmp(device->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE))
- goto error_brelse;
+ goto error_free_page;
device->generation = btrfs_super_generation(disk_super);
@@ -641,7 +637,7 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
BTRFS_FEATURE_INCOMPAT_METADATA_UUID) {
pr_err(
"BTRFS: Invalid seeding and uuid-changed device detected\n");
- goto error_brelse;
+ goto error_free_page;
}
clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
@@ -667,12 +663,12 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
fs_devices->rw_devices++;
list_add_tail(&device->dev_alloc_list, &fs_devices->alloc_list);
}
- brelse(bh);
+ btrfs_release_disk_super(disk_super);
return 0;
-error_brelse:
- brelse(bh);
+error_free_page:
+ btrfs_release_disk_super(disk_super);
blkdev_put(bdev, flags);
return -EINVAL;
@@ -1209,6 +1205,7 @@ static int open_fs_devices(struct btrfs_fs_devices *fs_devices,
fs_devices->opened = 1;
fs_devices->latest_bdev = latest_dev->bdev;
fs_devices->total_rw_bytes = 0;
+ fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_REGULAR;
out:
return ret;
}
@@ -1247,9 +1244,10 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
return ret;
}
-static void btrfs_release_disk_super(struct page *page)
+void btrfs_release_disk_super(struct btrfs_super_block *super)
{
- kunmap(page);
+ struct page *page = virt_to_page(super);
+
put_page(page);
}
@@ -1277,17 +1275,17 @@ static int btrfs_read_disk_super(struct block_device *bdev, u64 bytenr,
*page = read_cache_page_gfp(bdev->bd_inode->i_mapping,
index, GFP_KERNEL);
- if (IS_ERR_OR_NULL(*page))
+ if (IS_ERR(*page))
return 1;
- p = kmap(*page);
+ p = page_address(*page);
/* align our pointer to the offset of the super block */
*disk_super = p + offset_in_page(bytenr);
if (btrfs_super_bytenr(*disk_super) != bytenr ||
btrfs_super_magic(*disk_super) != BTRFS_MAGIC) {
- btrfs_release_disk_super(*page);
+ btrfs_release_disk_super(p);
return 1;
}
@@ -1350,7 +1348,7 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, fmode_t flags,
btrfs_free_stale_devices(path, device);
}
- btrfs_release_disk_super(page);
+ btrfs_release_disk_super(disk_super);
error_bdev_put:
blkdev_put(bdev, flags);
@@ -1383,6 +1381,59 @@ static bool contains_pending_extent(struct btrfs_device *device, u64 *start,
return false;
}
+static u64 dev_extent_search_start(struct btrfs_device *device, u64 start)
+{
+ switch (device->fs_devices->chunk_alloc_policy) {
+ case BTRFS_CHUNK_ALLOC_REGULAR:
+ /*
+ * We don't want to overwrite the superblock on the drive nor
+ * any area used by the boot loader (grub for example), so we
+ * make sure to start at an offset of at least 1MB.
+ */
+ return max_t(u64, start, SZ_1M);
+ default:
+ BUG();
+ }
+}
+
+/**
+ * dev_extent_hole_check - check if specified hole is suitable for allocation
+ * @device: the device on which the hole is located
+ * @hole_start: starting position of the hole
+ * @hole_size: the size of the hole
+ * @num_bytes: the size of the free space that we need
+ *
+ * May modify @hole_start and @hole_size to reflect the suitable position for
+ * allocation. Returns true if the hole position is updated, false otherwise.
+ */
+static bool dev_extent_hole_check(struct btrfs_device *device, u64 *hole_start,
+ u64 *hole_size, u64 num_bytes)
+{
+ bool changed = false;
+ u64 hole_end = *hole_start + *hole_size;
+
+ /*
+ * Check before we set max_hole_start, otherwise we could end up
+ * sending back this offset anyway.
+ */
+ if (contains_pending_extent(device, hole_start, *hole_size)) {
+ if (hole_end >= *hole_start)
+ *hole_size = hole_end - *hole_start;
+ else
+ *hole_size = 0;
+ changed = true;
+ }
+
+ switch (device->fs_devices->chunk_alloc_policy) {
+ case BTRFS_CHUNK_ALLOC_REGULAR:
+ /* No extra check */
+ break;
+ default:
+ BUG();
+ }
+
+ return changed;
+}
/*
* find_free_dev_extent_start - find free space in the specified device
@@ -1429,12 +1480,7 @@ static int find_free_dev_extent_start(struct btrfs_device *device,
int slot;
struct extent_buffer *l;
- /*
- * We don't want to overwrite the superblock on the drive nor any area
- * used by the boot loader (grub for example), so we make sure to start
- * at an offset of at least 1MB.
- */
- search_start = max_t(u64, search_start, SZ_1M);
+ search_start = dev_extent_search_start(device, search_start);
path = btrfs_alloc_path();
if (!path)
@@ -1492,18 +1538,8 @@ again:
if (key.offset > search_start) {
hole_size = key.offset - search_start;
-
- /*
- * Have to check before we set max_hole_start, otherwise
- * we could end up sending back this offset anyway.
- */
- if (contains_pending_extent(device, &search_start,
- hole_size)) {
- if (key.offset >= search_start)
- hole_size = key.offset - search_start;
- else
- hole_size = 0;
- }
+ dev_extent_hole_check(device, &search_start, &hole_size,
+ num_bytes);
if (hole_size > max_hole_size) {
max_hole_start = search_start;
@@ -1542,8 +1578,8 @@ next:
*/
if (search_end > search_start) {
hole_size = search_end - search_start;
-
- if (contains_pending_extent(device, &search_start, hole_size)) {
+ if (dev_extent_hole_check(device, &search_start, &hole_size,
+ num_bytes)) {
btrfs_release_path(path);
goto again;
}
@@ -1949,6 +1985,46 @@ static u64 btrfs_num_devices(struct btrfs_fs_info *fs_info)
return num_devices;
}
+static void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info,
+ struct block_device *bdev,
+ const char *device_path)
+{
+ struct btrfs_super_block *disk_super;
+ int copy_num;
+
+ if (!bdev)
+ return;
+
+ for (copy_num = 0; copy_num < BTRFS_SUPER_MIRROR_MAX; copy_num++) {
+ struct page *page;
+ int ret;
+
+ disk_super = btrfs_read_dev_one_super(bdev, copy_num);
+ if (IS_ERR(disk_super))
+ continue;
+
+ memset(&disk_super->magic, 0, sizeof(disk_super->magic));
+
+ page = virt_to_page(disk_super);
+ set_page_dirty(page);
+ lock_page(page);
+ /* write_one_page() unlocks the page */
+ ret = write_one_page(page);
+ if (ret)
+ btrfs_warn(fs_info,
+ "error clearing superblock number %d (%d)",
+ copy_num, ret);
+ btrfs_release_disk_super(disk_super);
+
+ }
+
+ /* Notify udev that device has changed */
+ btrfs_kobject_uevent(bdev, KOBJ_CHANGE);
+
+ /* Update ctime/mtime for device path for libblkid */
+ update_dev_time(device_path);
+}
+
int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
u64 devid)
{
@@ -2054,7 +2130,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
if (device->bdev) {
cur_devices->open_devices--;
/* remove sysfs entry */
- btrfs_sysfs_rm_device_link(fs_devices, device);
+ btrfs_sysfs_remove_devices_dir(fs_devices, device);
}
num_devices = btrfs_super_num_devices(fs_info->super_copy) - 1;
@@ -2067,7 +2143,8 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
* supers and free the device.
*/
if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
- btrfs_scratch_superblocks(device->bdev, device->name->str);
+ btrfs_scratch_superblocks(fs_info, device->bdev,
+ device->name->str);
btrfs_close_bdev(device);
synchronize_rcu();
@@ -2135,7 +2212,8 @@ void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev)
if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state)) {
/* zero out the old super if it is writable */
- btrfs_scratch_superblocks(srcdev->bdev, srcdev->name->str);
+ btrfs_scratch_superblocks(fs_info, srcdev->bdev,
+ srcdev->name->str);
}
btrfs_close_bdev(srcdev);
@@ -2174,7 +2252,7 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev)
mutex_lock(&fs_devices->device_list_mutex);
- btrfs_sysfs_rm_device_link(fs_devices, tgtdev);
+ btrfs_sysfs_remove_devices_dir(fs_devices, tgtdev);
if (tgtdev->bdev)
fs_devices->open_devices--;
@@ -2194,7 +2272,8 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev)
* is already out of device list, so we don't have to hold
* the device_list_mutex lock.
*/
- btrfs_scratch_superblocks(tgtdev->bdev, tgtdev->name->str);
+ btrfs_scratch_superblocks(tgtdev->fs_info, tgtdev->bdev,
+ tgtdev->name->str);
btrfs_close_bdev(tgtdev);
synchronize_rcu();
@@ -2209,14 +2288,13 @@ static struct btrfs_device *btrfs_find_device_by_path(
u64 devid;
u8 *dev_uuid;
struct block_device *bdev;
- struct buffer_head *bh;
struct btrfs_device *device;
ret = btrfs_get_bdev_and_sb(device_path, FMODE_READ,
- fs_info->bdev_holder, 0, &bdev, &bh);
+ fs_info->bdev_holder, 0, &bdev, &disk_super);
if (ret)
return ERR_PTR(ret);
- disk_super = (struct btrfs_super_block *)bh->b_data;
+
devid = btrfs_stack_device_id(&disk_super->dev_item);
dev_uuid = disk_super->dev_item.uuid;
if (btrfs_fs_incompat(fs_info, METADATA_UUID))
@@ -2226,7 +2304,7 @@ static struct btrfs_device *btrfs_find_device_by_path(
device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
disk_super->fsid, true);
- brelse(bh);
+ btrfs_release_disk_super(disk_super);
if (!device)
device = ERR_PTR(-ENOENT);
blkdev_put(bdev, FMODE_READ);
@@ -2522,7 +2600,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
orig_super_num_devices + 1);
/* add sysfs device entry */
- btrfs_sysfs_add_device_link(fs_devices, device);
+ btrfs_sysfs_add_devices_dir(fs_devices, device);
/*
* we've got more storage, clear any full flags on the space
@@ -2590,7 +2668,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
return ret;
error_sysfs:
- btrfs_sysfs_rm_device_link(fs_devices, device);
+ btrfs_sysfs_remove_devices_dir(fs_devices, device);
mutex_lock(&fs_info->fs_devices->device_list_mutex);
mutex_lock(&fs_info->chunk_mutex);
list_del_rcu(&device->dev_list);
@@ -3723,13 +3801,25 @@ static inline int balance_need_close(struct btrfs_fs_info *fs_info)
atomic_read(&fs_info->balance_cancel_req) == 0);
}
-/* Non-zero return value signifies invalidity */
-static inline int validate_convert_profile(struct btrfs_balance_args *bctl_arg,
- u64 allowed)
+/*
+ * Validate target profile against allowed profiles and return true if it's OK.
+ * Otherwise print the error message and return false.
+ */
+static inline int validate_convert_profile(struct btrfs_fs_info *fs_info,
+ const struct btrfs_balance_args *bargs,
+ u64 allowed, const char *type)
{
- return ((bctl_arg->flags & BTRFS_BALANCE_ARGS_CONVERT) &&
- (!alloc_profile_is_valid(bctl_arg->target, 1) ||
- (bctl_arg->target & ~allowed)));
+ if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT))
+ return true;
+
+ /* Profile is valid and does not have bits outside of the allowed set */
+ if (alloc_profile_is_valid(bargs->target, 1) &&
+ (bargs->target & ~allowed) == 0)
+ return true;
+
+ btrfs_err(fs_info, "balance: invalid convert %s profile %s",
+ type, btrfs_bg_type_to_raid_name(bargs->target));
+ return false;
}
/*
@@ -3904,7 +3994,7 @@ int btrfs_balance(struct btrfs_fs_info *fs_info,
if (btrfs_fs_closing(fs_info) ||
atomic_read(&fs_info->balance_pause_req) ||
- atomic_read(&fs_info->balance_cancel_req)) {
+ btrfs_should_cancel_balance(fs_info)) {
ret = -EINVAL;
goto out;
}
@@ -3945,24 +4035,9 @@ int btrfs_balance(struct btrfs_fs_info *fs_info,
if (num_devices >= btrfs_raid_array[i].devs_min)
allowed |= btrfs_raid_array[i].bg_flag;
- if (validate_convert_profile(&bctl->data, allowed)) {
- btrfs_err(fs_info,
- "balance: invalid convert data profile %s",
- btrfs_bg_type_to_raid_name(bctl->data.target));
- ret = -EINVAL;
- goto out;
- }
- if (validate_convert_profile(&bctl->meta, allowed)) {
- btrfs_err(fs_info,
- "balance: invalid convert metadata profile %s",
- btrfs_bg_type_to_raid_name(bctl->meta.target));
- ret = -EINVAL;
- goto out;
- }
- if (validate_convert_profile(&bctl->sys, allowed)) {
- btrfs_err(fs_info,
- "balance: invalid convert system profile %s",
- btrfs_bg_type_to_raid_name(bctl->sys.target));
+ if (!validate_convert_profile(fs_info, &bctl->data, allowed, "data") ||
+ !validate_convert_profile(fs_info, &bctl->meta, allowed, "metadata") ||
+ !validate_convert_profile(fs_info, &bctl->sys, allowed, "system")) {
ret = -EINVAL;
goto out;
}
@@ -4274,7 +4349,7 @@ int btrfs_cancel_balance(struct btrfs_fs_info *fs_info)
return 0;
}
-static int btrfs_uuid_scan_kthread(void *data)
+int btrfs_uuid_scan_kthread(void *data)
{
struct btrfs_fs_info *fs_info = data;
struct btrfs_root *root = fs_info->tree_root;
@@ -4286,6 +4361,7 @@ static int btrfs_uuid_scan_kthread(void *data)
struct btrfs_root_item root_item;
u32 item_size;
struct btrfs_trans_handle *trans = NULL;
+ bool closing = false;
path = btrfs_alloc_path();
if (!path) {
@@ -4298,6 +4374,10 @@ static int btrfs_uuid_scan_kthread(void *data)
key.offset = 0;
while (1) {
+ if (btrfs_fs_closing(fs_info)) {
+ closing = true;
+ break;
+ }
ret = btrfs_search_forward(root, &key, path,
BTRFS_OLDEST_GENERATION);
if (ret) {
@@ -4397,76 +4477,12 @@ out:
btrfs_end_transaction(trans);
if (ret)
btrfs_warn(fs_info, "btrfs_uuid_scan_kthread failed %d", ret);
- else
+ else if (!closing)
set_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags);
up(&fs_info->uuid_tree_rescan_sem);
return 0;
}
-/*
- * Callback for btrfs_uuid_tree_iterate().
- * returns:
- * 0 check succeeded, the entry is not outdated.
- * < 0 if an error occurred.
- * > 0 if the check failed, which means the caller shall remove the entry.
- */
-static int btrfs_check_uuid_tree_entry(struct btrfs_fs_info *fs_info,
- u8 *uuid, u8 type, u64 subid)
-{
- struct btrfs_key key;
- int ret = 0;
- struct btrfs_root *subvol_root;
-
- if (type != BTRFS_UUID_KEY_SUBVOL &&
- type != BTRFS_UUID_KEY_RECEIVED_SUBVOL)
- goto out;
-
- key.objectid = subid;
- key.type = BTRFS_ROOT_ITEM_KEY;
- key.offset = (u64)-1;
- subvol_root = btrfs_read_fs_root_no_name(fs_info, &key);
- if (IS_ERR(subvol_root)) {
- ret = PTR_ERR(subvol_root);
- if (ret == -ENOENT)
- ret = 1;
- goto out;
- }
-
- switch (type) {
- case BTRFS_UUID_KEY_SUBVOL:
- if (memcmp(uuid, subvol_root->root_item.uuid, BTRFS_UUID_SIZE))
- ret = 1;
- break;
- case BTRFS_UUID_KEY_RECEIVED_SUBVOL:
- if (memcmp(uuid, subvol_root->root_item.received_uuid,
- BTRFS_UUID_SIZE))
- ret = 1;
- break;
- }
-
-out:
- return ret;
-}
-
-static int btrfs_uuid_rescan_kthread(void *data)
-{
- struct btrfs_fs_info *fs_info = (struct btrfs_fs_info *)data;
- int ret;
-
- /*
- * 1st step is to iterate through the existing UUID tree and
- * to delete all entries that contain outdated data.
- * 2nd step is to add all missing entries to the UUID tree.
- */
- ret = btrfs_uuid_tree_iterate(fs_info, btrfs_check_uuid_tree_entry);
- if (ret < 0) {
- btrfs_warn(fs_info, "iterating uuid_tree failed %d", ret);
- up(&fs_info->uuid_tree_rescan_sem);
- return ret;
- }
- return btrfs_uuid_scan_kthread(data);
-}
-
int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info)
{
struct btrfs_trans_handle *trans;
@@ -4509,22 +4525,6 @@ int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info)
return 0;
}
-int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info)
-{
- struct task_struct *task;
-
- down(&fs_info->uuid_tree_rescan_sem);
- task = kthread_run(btrfs_uuid_rescan_kthread, fs_info, "btrfs-uuid");
- if (IS_ERR(task)) {
- /* fs_info->update_uuid_tree_gen remains 0 in all error case */
- btrfs_warn(fs_info, "failed to start uuid_rescan task");
- up(&fs_info->uuid_tree_rescan_sem);
- return PTR_ERR(task);
- }
-
- return 0;
-}
-
/*
* shrinking a device means finding all of the device extents past
* the new size, and then following the back refs to the chunks.
@@ -4777,96 +4777,111 @@ static void check_raid1c34_incompat_flag(struct btrfs_fs_info *info, u64 type)
btrfs_set_fs_incompat(info, RAID1C34);
}
-static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
- u64 start, u64 type)
-{
- struct btrfs_fs_info *info = trans->fs_info;
- struct btrfs_fs_devices *fs_devices = info->fs_devices;
- struct btrfs_device *device;
- struct map_lookup *map = NULL;
- struct extent_map_tree *em_tree;
- struct extent_map *em;
- struct btrfs_device_info *devices_info = NULL;
- u64 total_avail;
- int num_stripes; /* total number of stripes to allocate */
- int data_stripes; /* number of stripes that count for
- block group size */
- int sub_stripes; /* sub_stripes info for map */
- int dev_stripes; /* stripes per dev */
- int devs_max; /* max devs to use */
- int devs_min; /* min devs needed */
- int devs_increment; /* ndevs has to be a multiple of this */
- int ncopies; /* how many copies to data has */
- int nparity; /* number of stripes worth of bytes to
- store parity information */
- int ret;
+/*
+ * Structure used internally for btrfs_alloc_chunk() function.
+ * Wraps needed parameters.
+ */
+struct alloc_chunk_ctl {
+ u64 start;
+ u64 type;
+ /* Total number of stripes to allocate */
+ int num_stripes;
+ /* sub_stripes info for map */
+ int sub_stripes;
+ /* Stripes per device */
+ int dev_stripes;
+ /* Maximum number of devices to use */
+ int devs_max;
+ /* Minimum number of devices to use */
+ int devs_min;
+ /* ndevs has to be a multiple of this */
+ int devs_increment;
+ /* Number of copies */
+ int ncopies;
+ /* Number of stripes worth of bytes to store parity information */
+ int nparity;
u64 max_stripe_size;
u64 max_chunk_size;
+ u64 dev_extent_min;
u64 stripe_size;
u64 chunk_size;
int ndevs;
- int i;
- int j;
- int index;
-
- BUG_ON(!alloc_profile_is_valid(type, 0));
-
- if (list_empty(&fs_devices->alloc_list)) {
- if (btrfs_test_opt(info, ENOSPC_DEBUG))
- btrfs_debug(info, "%s: no writable device", __func__);
- return -ENOSPC;
- }
-
- index = btrfs_bg_flags_to_raid_index(type);
+};
- sub_stripes = btrfs_raid_array[index].sub_stripes;
- dev_stripes = btrfs_raid_array[index].dev_stripes;
- devs_max = btrfs_raid_array[index].devs_max;
- if (!devs_max)
- devs_max = BTRFS_MAX_DEVS(info);
- devs_min = btrfs_raid_array[index].devs_min;
- devs_increment = btrfs_raid_array[index].devs_increment;
- ncopies = btrfs_raid_array[index].ncopies;
- nparity = btrfs_raid_array[index].nparity;
+static void init_alloc_chunk_ctl_policy_regular(
+ struct btrfs_fs_devices *fs_devices,
+ struct alloc_chunk_ctl *ctl)
+{
+ u64 type = ctl->type;
if (type & BTRFS_BLOCK_GROUP_DATA) {
- max_stripe_size = SZ_1G;
- max_chunk_size = BTRFS_MAX_DATA_CHUNK_SIZE;
+ ctl->max_stripe_size = SZ_1G;
+ ctl->max_chunk_size = BTRFS_MAX_DATA_CHUNK_SIZE;
} else if (type & BTRFS_BLOCK_GROUP_METADATA) {
- /* for larger filesystems, use larger metadata chunks */
+ /* For larger filesystems, use larger metadata chunks */
if (fs_devices->total_rw_bytes > 50ULL * SZ_1G)
- max_stripe_size = SZ_1G;
+ ctl->max_stripe_size = SZ_1G;
else
- max_stripe_size = SZ_256M;
- max_chunk_size = max_stripe_size;
+ ctl->max_stripe_size = SZ_256M;
+ ctl->max_chunk_size = ctl->max_stripe_size;
} else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
- max_stripe_size = SZ_32M;
- max_chunk_size = 2 * max_stripe_size;
- devs_max = min_t(int, devs_max, BTRFS_MAX_DEVS_SYS_CHUNK);
+ ctl->max_stripe_size = SZ_32M;
+ ctl->max_chunk_size = 2 * ctl->max_stripe_size;
+ ctl->devs_max = min_t(int, ctl->devs_max,
+ BTRFS_MAX_DEVS_SYS_CHUNK);
} else {
- btrfs_err(info, "invalid chunk type 0x%llx requested",
- type);
BUG();
}
/* We don't want a chunk larger than 10% of writable space */
- max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1),
- max_chunk_size);
+ ctl->max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1),
+ ctl->max_chunk_size);
+ ctl->dev_extent_min = BTRFS_STRIPE_LEN * ctl->dev_stripes;
+}
+
+static void init_alloc_chunk_ctl(struct btrfs_fs_devices *fs_devices,
+ struct alloc_chunk_ctl *ctl)
+{
+ int index = btrfs_bg_flags_to_raid_index(ctl->type);
+
+ ctl->sub_stripes = btrfs_raid_array[index].sub_stripes;
+ ctl->dev_stripes = btrfs_raid_array[index].dev_stripes;
+ ctl->devs_max = btrfs_raid_array[index].devs_max;
+ if (!ctl->devs_max)
+ ctl->devs_max = BTRFS_MAX_DEVS(fs_devices->fs_info);
+ ctl->devs_min = btrfs_raid_array[index].devs_min;
+ ctl->devs_increment = btrfs_raid_array[index].devs_increment;
+ ctl->ncopies = btrfs_raid_array[index].ncopies;
+ ctl->nparity = btrfs_raid_array[index].nparity;
+ ctl->ndevs = 0;
+
+ switch (fs_devices->chunk_alloc_policy) {
+ case BTRFS_CHUNK_ALLOC_REGULAR:
+ init_alloc_chunk_ctl_policy_regular(fs_devices, ctl);
+ break;
+ default:
+ BUG();
+ }
+}
- devices_info = kcalloc(fs_devices->rw_devices, sizeof(*devices_info),
- GFP_NOFS);
- if (!devices_info)
- return -ENOMEM;
+static int gather_device_info(struct btrfs_fs_devices *fs_devices,
+ struct alloc_chunk_ctl *ctl,
+ struct btrfs_device_info *devices_info)
+{
+ struct btrfs_fs_info *info = fs_devices->fs_info;
+ struct btrfs_device *device;
+ u64 total_avail;
+ u64 dev_extent_want = ctl->max_stripe_size * ctl->dev_stripes;
+ int ret;
+ int ndevs = 0;
+ u64 max_avail;
+ u64 dev_offset;
/*
* in the first pass through the devices list, we gather information
* about the available holes on each device.
*/
- ndevs = 0;
list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
- u64 max_avail;
- u64 dev_offset;
-
if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
WARN(1, KERN_ERR
"BTRFS: read-only device in alloc_list\n");
@@ -4884,24 +4899,23 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
total_avail = 0;
/* If there is no space on this device, skip it. */
- if (total_avail == 0)
+ if (total_avail < ctl->dev_extent_min)
continue;
- ret = find_free_dev_extent(device,
- max_stripe_size * dev_stripes,
- &dev_offset, &max_avail);
+ ret = find_free_dev_extent(device, dev_extent_want, &dev_offset,
+ &max_avail);
if (ret && ret != -ENOSPC)
- goto error;
+ return ret;
if (ret == 0)
- max_avail = max_stripe_size * dev_stripes;
+ max_avail = dev_extent_want;
- if (max_avail < BTRFS_STRIPE_LEN * dev_stripes) {
+ if (max_avail < ctl->dev_extent_min) {
if (btrfs_test_opt(info, ENOSPC_DEBUG))
btrfs_debug(info,
- "%s: devid %llu has no free space, have=%llu want=%u",
+ "%s: devid %llu has no free space, have=%llu want=%llu",
__func__, device->devid, max_avail,
- BTRFS_STRIPE_LEN * dev_stripes);
+ ctl->dev_extent_min);
continue;
}
@@ -4916,6 +4930,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
devices_info[ndevs].dev = device;
++ndevs;
}
+ ctl->ndevs = ndevs;
/*
* now sort the devices by hole size / available space
@@ -4923,23 +4938,14 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
sort(devices_info, ndevs, sizeof(struct btrfs_device_info),
btrfs_cmp_device_info, NULL);
- /*
- * Round down to number of usable stripes, devs_increment can be any
- * number so we can't use round_down()
- */
- ndevs -= ndevs % devs_increment;
-
- if (ndevs < devs_min) {
- ret = -ENOSPC;
- if (btrfs_test_opt(info, ENOSPC_DEBUG)) {
- btrfs_debug(info,
- "%s: not enough devices with free space: have=%d minimum required=%d",
- __func__, ndevs, devs_min);
- }
- goto error;
- }
+ return 0;
+}
- ndevs = min(ndevs, devs_max);
+static int decide_stripe_size_regular(struct alloc_chunk_ctl *ctl,
+ struct btrfs_device_info *devices_info)
+{
+ /* Number of stripes that count for block group size */
+ int data_stripes;
/*
* The primary goal is to maximize the number of stripes, so use as
@@ -4948,73 +4954,116 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
* The DUP profile stores more than one stripe per device, the
* max_avail is the total size so we have to adjust.
*/
- stripe_size = div_u64(devices_info[ndevs - 1].max_avail, dev_stripes);
- num_stripes = ndevs * dev_stripes;
+ ctl->stripe_size = div_u64(devices_info[ctl->ndevs - 1].max_avail,
+ ctl->dev_stripes);
+ ctl->num_stripes = ctl->ndevs * ctl->dev_stripes;
- /*
- * this will have to be fixed for RAID1 and RAID10 over
- * more drives
- */
- data_stripes = (num_stripes - nparity) / ncopies;
+ /* This will have to be fixed for RAID1 and RAID10 over more drives */
+ data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies;
/*
- * Use the number of data stripes to figure out how big this chunk
- * is really going to be in terms of logical address space,
- * and compare that answer with the max chunk size. If it's higher,
- * we try to reduce stripe_size.
+ * Use the number of data stripes to figure out how big this chunk is
+ * really going to be in terms of logical address space, and compare
+ * that answer with the max chunk size. If it's higher, we try to
+ * reduce stripe_size.
*/
- if (stripe_size * data_stripes > max_chunk_size) {
+ if (ctl->stripe_size * data_stripes > ctl->max_chunk_size) {
/*
* Reduce stripe_size, round it up to a 16MB boundary again and
* then use it, unless it ends up being even bigger than the
* previous value we had already.
*/
- stripe_size = min(round_up(div_u64(max_chunk_size,
- data_stripes), SZ_16M),
- stripe_size);
+ ctl->stripe_size = min(round_up(div_u64(ctl->max_chunk_size,
+ data_stripes), SZ_16M),
+ ctl->stripe_size);
}
- /* align to BTRFS_STRIPE_LEN */
- stripe_size = round_down(stripe_size, BTRFS_STRIPE_LEN);
+ /* Align to BTRFS_STRIPE_LEN */
+ ctl->stripe_size = round_down(ctl->stripe_size, BTRFS_STRIPE_LEN);
+ ctl->chunk_size = ctl->stripe_size * data_stripes;
- map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
- if (!map) {
- ret = -ENOMEM;
- goto error;
+ return 0;
+}
+
+static int decide_stripe_size(struct btrfs_fs_devices *fs_devices,
+ struct alloc_chunk_ctl *ctl,
+ struct btrfs_device_info *devices_info)
+{
+ struct btrfs_fs_info *info = fs_devices->fs_info;
+
+ /*
+ * Round down to number of usable stripes, devs_increment can be any
+ * number so we can't use round_down() that requires power of 2, while
+ * rounddown is safe.
+ */
+ ctl->ndevs = rounddown(ctl->ndevs, ctl->devs_increment);
+
+ if (ctl->ndevs < ctl->devs_min) {
+ if (btrfs_test_opt(info, ENOSPC_DEBUG)) {
+ btrfs_debug(info,
+ "%s: not enough devices with free space: have=%d minimum required=%d",
+ __func__, ctl->ndevs, ctl->devs_min);
+ }
+ return -ENOSPC;
}
- map->num_stripes = num_stripes;
- for (i = 0; i < ndevs; ++i) {
- for (j = 0; j < dev_stripes; ++j) {
- int s = i * dev_stripes + j;
+ ctl->ndevs = min(ctl->ndevs, ctl->devs_max);
+
+ switch (fs_devices->chunk_alloc_policy) {
+ case BTRFS_CHUNK_ALLOC_REGULAR:
+ return decide_stripe_size_regular(ctl, devices_info);
+ default:
+ BUG();
+ }
+}
+
+static int create_chunk(struct btrfs_trans_handle *trans,
+ struct alloc_chunk_ctl *ctl,
+ struct btrfs_device_info *devices_info)
+{
+ struct btrfs_fs_info *info = trans->fs_info;
+ struct map_lookup *map = NULL;
+ struct extent_map_tree *em_tree;
+ struct extent_map *em;
+ u64 start = ctl->start;
+ u64 type = ctl->type;
+ int ret;
+ int i;
+ int j;
+
+ map = kmalloc(map_lookup_size(ctl->num_stripes), GFP_NOFS);
+ if (!map)
+ return -ENOMEM;
+ map->num_stripes = ctl->num_stripes;
+
+ for (i = 0; i < ctl->ndevs; ++i) {
+ for (j = 0; j < ctl->dev_stripes; ++j) {
+ int s = i * ctl->dev_stripes + j;
map->stripes[s].dev = devices_info[i].dev;
map->stripes[s].physical = devices_info[i].dev_offset +
- j * stripe_size;
+ j * ctl->stripe_size;
}
}
map->stripe_len = BTRFS_STRIPE_LEN;
map->io_align = BTRFS_STRIPE_LEN;
map->io_width = BTRFS_STRIPE_LEN;
map->type = type;
- map->sub_stripes = sub_stripes;
-
- chunk_size = stripe_size * data_stripes;
+ map->sub_stripes = ctl->sub_stripes;
- trace_btrfs_chunk_alloc(info, map, start, chunk_size);
+ trace_btrfs_chunk_alloc(info, map, start, ctl->chunk_size);
em = alloc_extent_map();
if (!em) {
kfree(map);
- ret = -ENOMEM;
- goto error;
+ return -ENOMEM;
}
set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
em->map_lookup = map;
em->start = start;
- em->len = chunk_size;
+ em->len = ctl->chunk_size;
em->block_start = 0;
em->block_len = em->len;
- em->orig_block_len = stripe_size;
+ em->orig_block_len = ctl->stripe_size;
em_tree = &info->mapping_tree;
write_lock(&em_tree->lock);
@@ -5022,30 +5071,31 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
if (ret) {
write_unlock(&em_tree->lock);
free_extent_map(em);
- goto error;
+ return ret;
}
write_unlock(&em_tree->lock);
- ret = btrfs_make_block_group(trans, 0, type, start, chunk_size);
+ ret = btrfs_make_block_group(trans, 0, type, start, ctl->chunk_size);
if (ret)
goto error_del_extent;
for (i = 0; i < map->num_stripes; i++) {
struct btrfs_device *dev = map->stripes[i].dev;
- btrfs_device_set_bytes_used(dev, dev->bytes_used + stripe_size);
+ btrfs_device_set_bytes_used(dev,
+ dev->bytes_used + ctl->stripe_size);
if (list_empty(&dev->post_commit_list))
list_add_tail(&dev->post_commit_list,
&trans->transaction->dev_update_list);
}
- atomic64_sub(stripe_size * map->num_stripes, &info->free_chunk_space);
+ atomic64_sub(ctl->stripe_size * map->num_stripes,
+ &info->free_chunk_space);
free_extent_map(em);
check_raid56_incompat_flag(info, type);
check_raid1c34_incompat_flag(info, type);
- kfree(devices_info);
return 0;
error_del_extent:
@@ -5057,11 +5107,68 @@ error_del_extent:
free_extent_map(em);
/* One for the tree reference */
free_extent_map(em);
-error:
+
+ return ret;
+}
+
+int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, u64 type)
+{
+ struct btrfs_fs_info *info = trans->fs_info;
+ struct btrfs_fs_devices *fs_devices = info->fs_devices;
+ struct btrfs_device_info *devices_info = NULL;
+ struct alloc_chunk_ctl ctl;
+ int ret;
+
+ lockdep_assert_held(&info->chunk_mutex);
+
+ if (!alloc_profile_is_valid(type, 0)) {
+ ASSERT(0);
+ return -EINVAL;
+ }
+
+ if (list_empty(&fs_devices->alloc_list)) {
+ if (btrfs_test_opt(info, ENOSPC_DEBUG))
+ btrfs_debug(info, "%s: no writable device", __func__);
+ return -ENOSPC;
+ }
+
+ if (!(type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
+ btrfs_err(info, "invalid chunk type 0x%llx requested", type);
+ ASSERT(0);
+ return -EINVAL;
+ }
+
+ ctl.start = find_next_chunk(info);
+ ctl.type = type;
+ init_alloc_chunk_ctl(fs_devices, &ctl);
+
+ devices_info = kcalloc(fs_devices->rw_devices, sizeof(*devices_info),
+ GFP_NOFS);
+ if (!devices_info)
+ return -ENOMEM;
+
+ ret = gather_device_info(fs_devices, &ctl, devices_info);
+ if (ret < 0)
+ goto out;
+
+ ret = decide_stripe_size(fs_devices, &ctl, devices_info);
+ if (ret < 0)
+ goto out;
+
+ ret = create_chunk(trans, &ctl, devices_info);
+
+out:
kfree(devices_info);
return ret;
}
+/*
+ * Chunk allocation falls into two parts. The first part does work
+ * that makes the newly allocated chunk usable, but does not do any operation
+ * that modifies the chunk tree. The second part does the work that
+ * requires modifying the chunk tree. This division is important for the
+ * bootstrap process of adding storage to a seed btrfs.
+ */
int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
u64 chunk_offset, u64 chunk_size)
{
@@ -5160,39 +5267,19 @@ out:
return ret;
}
-/*
- * Chunk allocation falls into two parts. The first part does work
- * that makes the new allocated chunk usable, but does not do any operation
- * that modifies the chunk tree. The second part does the work that
- * requires modifying the chunk tree. This division is important for the
- * bootstrap process of adding storage to a seed btrfs.
- */
-int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, u64 type)
-{
- u64 chunk_offset;
-
- lockdep_assert_held(&trans->fs_info->chunk_mutex);
- chunk_offset = find_next_chunk(trans->fs_info);
- return __btrfs_alloc_chunk(trans, chunk_offset, type);
-}
-
static noinline int init_first_rw_device(struct btrfs_trans_handle *trans)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
- u64 chunk_offset;
- u64 sys_chunk_offset;
u64 alloc_profile;
int ret;
- chunk_offset = find_next_chunk(fs_info);
alloc_profile = btrfs_metadata_alloc_profile(fs_info);
- ret = __btrfs_alloc_chunk(trans, chunk_offset, alloc_profile);
+ ret = btrfs_alloc_chunk(trans, alloc_profile);
if (ret)
return ret;
- sys_chunk_offset = find_next_chunk(fs_info);
alloc_profile = btrfs_system_alloc_profile(fs_info);
- ret = __btrfs_alloc_chunk(trans, sys_chunk_offset, alloc_profile);
+ ret = btrfs_alloc_chunk(trans, alloc_profile);
return ret;
}
@@ -5389,31 +5476,19 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info,
return preferred_mirror;
}
-static inline int parity_smaller(u64 a, u64 b)
-{
- return a > b;
-}
-
/* Bubble-sort the stripe set to put the parity/syndrome stripes last */
static void sort_parity_stripes(struct btrfs_bio *bbio, int num_stripes)
{
- struct btrfs_bio_stripe s;
int i;
- u64 l;
int again = 1;
while (again) {
again = 0;
for (i = 0; i < num_stripes - 1; i++) {
- if (parity_smaller(bbio->raid_map[i],
- bbio->raid_map[i+1])) {
- s = bbio->stripes[i];
- l = bbio->raid_map[i];
- bbio->stripes[i] = bbio->stripes[i+1];
- bbio->raid_map[i] = bbio->raid_map[i+1];
- bbio->stripes[i+1] = s;
- bbio->raid_map[i+1] = l;
-
+ /* Swap if parity is on a smaller index */
+ if (bbio->raid_map[i] > bbio->raid_map[i + 1]) {
+ swap(bbio->stripes[i], bbio->stripes[i + 1]);
+ swap(bbio->raid_map[i], bbio->raid_map[i + 1]);
again = 1;
}
}
@@ -5914,10 +5989,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
struct btrfs_io_geometry geom;
ASSERT(bbio_ret);
-
- if (op == BTRFS_MAP_DISCARD)
- return __btrfs_map_block_for_discard(fs_info, logical,
- length, bbio_ret);
+ ASSERT(op != BTRFS_MAP_DISCARD);
ret = btrfs_get_io_geometry(fs_info, op, logical, *length, &geom);
if (ret < 0)
@@ -6147,6 +6219,10 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
u64 logical, u64 *length,
struct btrfs_bio **bbio_ret, int mirror_num)
{
+ if (op == BTRFS_MAP_DISCARD)
+ return __btrfs_map_block_for_discard(fs_info, logical,
+ length, bbio_ret);
+
return __btrfs_map_block(fs_info, op, logical, length, bbio_ret,
mirror_num, 0);
}
@@ -6241,8 +6317,8 @@ static void submit_stripe_bio(struct btrfs_bio *bbio, struct bio *bio,
btrfs_debug_in_rcu(fs_info,
"btrfs_map_bio: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u",
bio_op(bio), bio->bi_opf, (u64)bio->bi_iter.bi_sector,
- (u_long)dev->bdev->bd_dev, rcu_str_deref(dev->name), dev->devid,
- bio->bi_iter.bi_size);
+ (unsigned long)dev->bdev->bd_dev, rcu_str_deref(dev->name),
+ dev->devid, bio->bi_iter.bi_size);
bio_set_dev(bio, dev->bdev);
btrfs_bio_counter_inc_noblocked(fs_info);
@@ -7317,36 +7393,6 @@ int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info,
return 0;
}
-void btrfs_scratch_superblocks(struct block_device *bdev, const char *device_path)
-{
- struct buffer_head *bh;
- struct btrfs_super_block *disk_super;
- int copy_num;
-
- if (!bdev)
- return;
-
- for (copy_num = 0; copy_num < BTRFS_SUPER_MIRROR_MAX;
- copy_num++) {
-
- if (btrfs_read_dev_one_super(bdev, copy_num, &bh))
- continue;
-
- disk_super = (struct btrfs_super_block *)bh->b_data;
-
- memset(&disk_super->magic, 0, sizeof(disk_super->magic));
- set_buffer_dirty(bh);
- sync_dirty_buffer(bh);
- brelse(bh);
- }
-
- /* Notify udev that device has changed */
- btrfs_kobject_uevent(bdev, KOBJ_CHANGE);
-
- /* Update ctime/mtime for device path for libblkid */
- update_dev_time(device_path);
-}
-
/*
* Update the size and bytes used for each device where it changed. This is
* delayed since we would otherwise get errors while writing out the
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 409f4816fb89..f067b5934c46 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -17,8 +17,6 @@ extern struct mutex uuid_mutex;
#define BTRFS_STRIPE_LEN SZ_64K
-struct buffer_head;
-
struct btrfs_io_geometry {
/* remaining bytes before crossing a stripe */
u64 len;
@@ -209,6 +207,10 @@ BTRFS_DEVICE_GETSET_FUNCS(total_bytes);
BTRFS_DEVICE_GETSET_FUNCS(disk_total_bytes);
BTRFS_DEVICE_GETSET_FUNCS(bytes_used);
+enum btrfs_chunk_allocation_policy {
+ BTRFS_CHUNK_ALLOC_REGULAR,
+};
+
struct btrfs_fs_devices {
u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */
u8 metadata_uuid[BTRFS_FSID_SIZE];
@@ -258,7 +260,10 @@ struct btrfs_fs_devices {
/* sysfs kobjects */
struct kobject fsid_kobj;
struct kobject *devices_kobj;
+ struct kobject *devinfo_kobj;
struct completion kobj_unregister;
+
+ enum btrfs_chunk_allocation_policy chunk_alloc_policy;
};
#define BTRFS_BIO_INLINE_CSUM_SIZE 64
@@ -460,7 +465,7 @@ int btrfs_recover_balance(struct btrfs_fs_info *fs_info);
int btrfs_pause_balance(struct btrfs_fs_info *fs_info);
int btrfs_cancel_balance(struct btrfs_fs_info *fs_info);
int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info);
-int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info);
+int btrfs_uuid_scan_kthread(void *data);
int btrfs_chunk_readonly(struct btrfs_fs_info *fs_info, u64 chunk_offset);
int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
u64 *start, u64 *max_avail);
@@ -473,7 +478,6 @@ int btrfs_run_dev_stats(struct btrfs_trans_handle *trans);
void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_device *srcdev);
void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev);
void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev);
-void btrfs_scratch_superblocks(struct block_device *bdev, const char *device_path);
int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info,
u64 logical, u64 len);
unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info,
@@ -483,6 +487,7 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset);
struct extent_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info,
u64 logical, u64 length);
+void btrfs_release_disk_super(struct btrfs_super_block *super);
static inline void btrfs_dev_stat_inc(struct btrfs_device *dev,
int index)
diff --git a/fs/buffer.c b/fs/buffer.c
index b8d28370cfd7..f73276d746bb 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -274,8 +274,7 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
* decide that the page is now completely done.
*/
first = page_buffers(page);
- local_irq_save(flags);
- bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
+ spin_lock_irqsave(&first->b_uptodate_lock, flags);
clear_buffer_async_read(bh);
unlock_buffer(bh);
tmp = bh;
@@ -288,8 +287,7 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
}
tmp = tmp->b_this_page;
} while (tmp != bh);
- bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
- local_irq_restore(flags);
+ spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
/*
* If none of the buffers had errors and they are all
@@ -301,8 +299,7 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
return;
still_busy:
- bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
- local_irq_restore(flags);
+ spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
return;
}
@@ -371,8 +368,7 @@ void end_buffer_async_write(struct buffer_head *bh, int uptodate)
}
first = page_buffers(page);
- local_irq_save(flags);
- bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
+ spin_lock_irqsave(&first->b_uptodate_lock, flags);
clear_buffer_async_write(bh);
unlock_buffer(bh);
@@ -384,14 +380,12 @@ void end_buffer_async_write(struct buffer_head *bh, int uptodate)
}
tmp = tmp->b_this_page;
}
- bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
- local_irq_restore(flags);
+ spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
end_page_writeback(page);
return;
still_busy:
- bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
- local_irq_restore(flags);
+ spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
return;
}
EXPORT_SYMBOL(end_buffer_async_write);
@@ -3019,49 +3013,6 @@ static void end_bio_bh_io_sync(struct bio *bio)
bio_put(bio);
}
-/*
- * This allows us to do IO even on the odd last sectors
- * of a device, even if the block size is some multiple
- * of the physical sector size.
- *
- * We'll just truncate the bio to the size of the device,
- * and clear the end of the buffer head manually.
- *
- * Truly out-of-range accesses will turn into actual IO
- * errors, this only handles the "we need to be able to
- * do IO at the final sector" case.
- */
-void guard_bio_eod(struct bio *bio)
-{
- sector_t maxsector;
- struct hd_struct *part;
-
- rcu_read_lock();
- part = __disk_get_part(bio->bi_disk, bio->bi_partno);
- if (part)
- maxsector = part_nr_sects_read(part);
- else
- maxsector = get_capacity(bio->bi_disk);
- rcu_read_unlock();
-
- if (!maxsector)
- return;
-
- /*
- * If the *whole* IO is past the end of the device,
- * let it through, and the IO layer will turn it into
- * an EIO.
- */
- if (unlikely(bio->bi_iter.bi_sector >= maxsector))
- return;
-
- maxsector -= bio->bi_iter.bi_sector;
- if (likely((bio->bi_iter.bi_size >> 9) <= maxsector))
- return;
-
- bio_truncate(bio, maxsector << 9);
-}
-
static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh,
enum rw_hint write_hint, struct writeback_control *wbc)
{
@@ -3385,6 +3336,7 @@ struct buffer_head *alloc_buffer_head(gfp_t gfp_flags)
struct buffer_head *ret = kmem_cache_zalloc(bh_cachep, gfp_flags);
if (ret) {
INIT_LIST_HEAD(&ret->b_assoc_buffers);
+ spin_lock_init(&ret->b_uptodate_lock);
preempt_disable();
__this_cpu_inc(bh_accounting.nr);
recalc_bh_state();
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index c3b8e8e0bf17..5a478cd06e11 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -1415,9 +1415,13 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
struct inode *inode = file_inode(file);
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+ struct ceph_osd_client *osdc = &fsc->client->osdc;
struct ceph_cap_flush *prealloc_cf;
ssize_t count, written = 0;
int err, want, got;
+ bool direct_lock = false;
+ u32 map_flags;
+ u64 pool_flags;
loff_t pos;
loff_t limit = max(i_size_read(inode), fsc->max_file_size);
@@ -1428,8 +1432,11 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
if (!prealloc_cf)
return -ENOMEM;
+ if ((iocb->ki_flags & (IOCB_DIRECT | IOCB_APPEND)) == IOCB_DIRECT)
+ direct_lock = true;
+
retry_snap:
- if (iocb->ki_flags & IOCB_DIRECT)
+ if (direct_lock)
ceph_start_io_direct(inode);
else
ceph_start_io_write(inode);
@@ -1477,8 +1484,12 @@ retry_snap:
goto out;
}
- /* FIXME: not complete since it doesn't account for being at quota */
- if (ceph_osdmap_flag(&fsc->client->osdc, CEPH_OSDMAP_FULL)) {
+ down_read(&osdc->lock);
+ map_flags = osdc->osdmap->flags;
+ pool_flags = ceph_pg_pool_flags(osdc->osdmap, ci->i_layout.pool_id);
+ up_read(&osdc->lock);
+ if ((map_flags & CEPH_OSDMAP_FULL) ||
+ (pool_flags & CEPH_POOL_FLAG_FULL)) {
err = -ENOSPC;
goto out;
}
@@ -1519,14 +1530,15 @@ retry_snap:
/* we might need to revert back to that point */
data = *from;
- if (iocb->ki_flags & IOCB_DIRECT) {
+ if (iocb->ki_flags & IOCB_DIRECT)
written = ceph_direct_read_write(iocb, &data, snapc,
&prealloc_cf);
- ceph_end_io_direct(inode);
- } else {
+ else
written = ceph_sync_write(iocb, &data, pos, snapc);
+ if (direct_lock)
+ ceph_end_io_direct(inode);
+ else
ceph_end_io_write(inode);
- }
if (written > 0)
iov_iter_advance(from, written);
ceph_put_snap_context(snapc);
@@ -1570,14 +1582,15 @@ retry_snap:
}
if (written >= 0) {
- if (ceph_osdmap_flag(&fsc->client->osdc, CEPH_OSDMAP_NEARFULL))
+ if ((map_flags & CEPH_OSDMAP_NEARFULL) ||
+ (pool_flags & CEPH_POOL_FLAG_NEARFULL))
iocb->ki_flags |= IOCB_DSYNC;
written = generic_write_sync(iocb, written);
}
goto out_unlocked;
out:
- if (iocb->ki_flags & IOCB_DIRECT)
+ if (direct_lock)
ceph_end_io_direct(inode);
else
ceph_end_io_write(inode);
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index ccfcc66aaf44..923be9399b21 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -1155,5 +1155,6 @@ void ceph_cleanup_snapid_map(struct ceph_mds_client *mdsc)
pr_err("snapid map %llx -> %x still in use\n",
sm->snap, sm->dev);
}
+ kfree(sm);
}
}
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 1d9f083b8a11..c7f150686a53 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -203,6 +203,26 @@ struct ceph_parse_opts_ctx {
};
/*
+ * Remove adjacent slashes and then the trailing slash, unless it is
+ * the only remaining character.
+ *
+ * E.g. "//dir1////dir2///" --> "/dir1/dir2", "///" --> "/".
+ */
+static void canonicalize_path(char *path)
+{
+ int i, j = 0; /* i is the read index, j the write index; j never passes i, so @path is compacted in place */
+
+ for (i = 0; path[i] != '\0'; i++) {
+ if (path[i] != '/' || j < 1 || path[j - 1] != '/') /* keep the char unless it is a '/' right after an already kept '/' */
+ path[j++] = path[i];
+ }
+
+ if (j > 1 && path[j - 1] == '/') /* drop one trailing '/', but leave a lone "/" untouched */
+ j--;
+ path[j] = '\0'; /* terminate at the new (same or shorter) length */
+}
+
+/*
* Parse the source parameter. Distinguish the server list from the path.
*
* The source will look like:
@@ -224,15 +244,16 @@ static int ceph_parse_source(struct fs_parameter *param, struct fs_context *fc)
dev_name_end = strchr(dev_name, '/');
if (dev_name_end) {
- kfree(fsopt->server_path);
-
/*
* The server_path will include the whole chars from userland
* including the leading '/'.
*/
+ kfree(fsopt->server_path);
fsopt->server_path = kstrdup(dev_name_end, GFP_KERNEL);
if (!fsopt->server_path)
return -ENOMEM;
+
+ canonicalize_path(fsopt->server_path);
} else {
dev_name_end = dev_name + strlen(dev_name);
}
@@ -456,73 +477,6 @@ static int strcmp_null(const char *s1, const char *s2)
return strcmp(s1, s2);
}
-/**
- * path_remove_extra_slash - Remove the extra slashes in the server path
- * @server_path: the server path and could be NULL
- *
- * Return NULL if the path is NULL or only consists of "/", or a string
- * without any extra slashes including the leading slash(es) and the
- * slash(es) at the end of the server path, such as:
- * "//dir1////dir2///" --> "dir1/dir2"
- */
-static char *path_remove_extra_slash(const char *server_path)
-{
- const char *path = server_path;
- const char *cur, *end;
- char *buf, *p;
- int len;
-
- /* if the server path is omitted */
- if (!path)
- return NULL;
-
- /* remove all the leading slashes */
- while (*path == '/')
- path++;
-
- /* if the server path only consists of slashes */
- if (*path == '\0')
- return NULL;
-
- len = strlen(path);
-
- buf = kmalloc(len + 1, GFP_KERNEL);
- if (!buf)
- return ERR_PTR(-ENOMEM);
-
- end = path + len;
- p = buf;
- do {
- cur = strchr(path, '/');
- if (!cur)
- cur = end;
-
- len = cur - path;
-
- /* including one '/' */
- if (cur != end)
- len += 1;
-
- memcpy(p, path, len);
- p += len;
-
- while (cur <= end && *cur == '/')
- cur++;
- path = cur;
- } while (path < end);
-
- *p = '\0';
-
- /*
- * remove the last slash if there has and just to make sure that
- * we will get something like "dir1/dir2"
- */
- if (*(--p) == '/')
- *p = '\0';
-
- return buf;
-}
-
static int compare_mount_options(struct ceph_mount_options *new_fsopt,
struct ceph_options *new_opt,
struct ceph_fs_client *fsc)
@@ -530,7 +484,6 @@ static int compare_mount_options(struct ceph_mount_options *new_fsopt,
struct ceph_mount_options *fsopt1 = new_fsopt;
struct ceph_mount_options *fsopt2 = fsc->mount_options;
int ofs = offsetof(struct ceph_mount_options, snapdir_name);
- char *p1, *p2;
int ret;
ret = memcmp(fsopt1, fsopt2, ofs);
@@ -540,21 +493,12 @@ static int compare_mount_options(struct ceph_mount_options *new_fsopt,
ret = strcmp_null(fsopt1->snapdir_name, fsopt2->snapdir_name);
if (ret)
return ret;
+
ret = strcmp_null(fsopt1->mds_namespace, fsopt2->mds_namespace);
if (ret)
return ret;
- p1 = path_remove_extra_slash(fsopt1->server_path);
- if (IS_ERR(p1))
- return PTR_ERR(p1);
- p2 = path_remove_extra_slash(fsopt2->server_path);
- if (IS_ERR(p2)) {
- kfree(p1);
- return PTR_ERR(p2);
- }
- ret = strcmp_null(p1, p2);
- kfree(p1);
- kfree(p2);
+ ret = strcmp_null(fsopt1->server_path, fsopt2->server_path);
if (ret)
return ret;
@@ -957,7 +901,9 @@ static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc,
mutex_lock(&fsc->client->mount_mutex);
if (!fsc->sb->s_root) {
- const char *path, *p;
+ const char *path = fsc->mount_options->server_path ?
+ fsc->mount_options->server_path + 1 : "";
+
err = __ceph_open_session(fsc->client, started);
if (err < 0)
goto out;
@@ -969,22 +915,11 @@ static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc,
goto out;
}
- p = path_remove_extra_slash(fsc->mount_options->server_path);
- if (IS_ERR(p)) {
- err = PTR_ERR(p);
- goto out;
- }
- /* if the server path is omitted or just consists of '/' */
- if (!p)
- path = "";
- else
- path = p;
dout("mount opening path '%s'\n", path);
ceph_fs_debugfs_init(fsc);
root = open_root_dentry(fsc, path, started);
- kfree(p);
if (IS_ERR(root)) {
err = PTR_ERR(root);
goto out;
@@ -1097,10 +1032,6 @@ static int ceph_get_tree(struct fs_context *fc)
if (!fc->source)
return invalfc(fc, "No source");
-#ifdef CONFIG_CEPH_FS_POSIX_ACL
- fc->sb_flags |= SB_POSIXACL;
-#endif
-
/* create client (which we may/may not use) */
fsc = create_fs_client(pctx->opts, pctx->copts);
pctx->opts = NULL;
@@ -1223,6 +1154,10 @@ static int ceph_init_fs_context(struct fs_context *fc)
fsopt->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT;
fsopt->congestion_kb = default_congestion_kb();
+#ifdef CONFIG_CEPH_FS_POSIX_ACL
+ fc->sb_flags |= SB_POSIXACL;
+#endif
+
fc->fs_private = pctx;
fc->ops = &ceph_context_ops;
return 0;
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 1e456a9011bb..037cdfb2ad4f 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -91,7 +91,7 @@ struct ceph_mount_options {
char *snapdir_name; /* default ".snap" */
char *mds_namespace; /* default NULL */
- char *server_path; /* default "/" */
+ char *server_path; /* default NULL (means "/") */
char *fscache_uniq; /* default NULL */
};
diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c
index 606f26d862dc..cc3ada12848d 100644
--- a/fs/cifs/cifs_dfs_ref.c
+++ b/fs/cifs/cifs_dfs_ref.c
@@ -324,6 +324,8 @@ static struct vfsmount *cifs_dfs_do_automount(struct dentry *mntpt)
if (full_path == NULL)
goto cdda_exit;
+ convert_delimiter(full_path, '\\');
+
cifs_dbg(FYI, "%s: full_path: %s\n", __func__, full_path);
if (!cifs_sb_master_tlink(cifs_sb)) {
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index 440828afcdde..ae421634aa42 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -342,7 +342,7 @@ static int
sid_to_id(struct cifs_sb_info *cifs_sb, struct cifs_sid *psid,
struct cifs_fattr *fattr, uint sidtype)
{
- int rc;
+ int rc = 0;
struct key *sidkey;
char *sidstr;
const struct cred *saved_cred;
@@ -450,11 +450,12 @@ out_revert_creds:
* fails then we just fall back to using the mnt_uid/mnt_gid.
*/
got_valid_id:
+ rc = 0;
if (sidtype == SIDOWNER)
fattr->cf_uid = fuid;
else
fattr->cf_gid = fgid;
- return 0;
+ return rc;
}
int
@@ -601,7 +602,7 @@ static void access_flags_to_mode(__le32 ace_flags, int type, umode_t *pmode,
((flags & FILE_EXEC_RIGHTS) == FILE_EXEC_RIGHTS))
*pmode |= (S_IXUGO & (*pbits_to_set));
- cifs_dbg(NOISY, "access flags 0x%x mode now 0x%x\n", flags, *pmode);
+ cifs_dbg(NOISY, "access flags 0x%x mode now %04o\n", flags, *pmode);
return;
}
@@ -630,7 +631,7 @@ static void mode_to_access_flags(umode_t mode, umode_t bits_to_use,
if (mode & S_IXUGO)
*pace_flags |= SET_FILE_EXEC_RIGHTS;
- cifs_dbg(NOISY, "mode: 0x%x, access flags now 0x%x\n",
+ cifs_dbg(NOISY, "mode: %04o, access flags now 0x%x\n",
mode, *pace_flags);
return;
}
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index febab27cd838..94e3ed4850b5 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -414,7 +414,7 @@ cifs_show_security(struct seq_file *s, struct cifs_ses *ses)
seq_puts(s, "ntlm");
break;
case Kerberos:
- seq_printf(s, "krb5,cruid=%u", from_kuid_munged(&init_user_ns,ses->cred_uid));
+ seq_puts(s, "krb5");
break;
case RawNTLMSSP:
seq_puts(s, "ntlmssp");
@@ -427,6 +427,10 @@ cifs_show_security(struct seq_file *s, struct cifs_ses *ses)
if (ses->sign)
seq_puts(s, "i");
+
+ if (ses->sectype == Kerberos)
+ seq_printf(s, ",cruid=%u",
+ from_kuid_munged(&init_user_ns, ses->cred_uid));
}
static void
@@ -526,6 +530,8 @@ cifs_show_options(struct seq_file *s, struct dentry *root)
if (tcon->seal)
seq_puts(s, ",seal");
+ else if (tcon->ses->server->ignore_signature)
+ seq_puts(s, ",signloosely");
if (tcon->nocase)
seq_puts(s, ",nocase");
if (tcon->local_lease)
@@ -1012,7 +1018,7 @@ struct file_system_type cifs_fs_type = {
.name = "cifs",
.mount = cifs_do_mount,
.kill_sb = cifs_kill_sb,
- /* .fs_flags */
+ .fs_flags = FS_RENAME_DOES_D_MOVE,
};
MODULE_ALIAS_FS("cifs");
@@ -1021,7 +1027,7 @@ static struct file_system_type smb3_fs_type = {
.name = "smb3",
.mount = smb3_do_mount,
.kill_sb = cifs_kill_sb,
- /* .fs_flags */
+ .fs_flags = FS_RENAME_DOES_D_MOVE,
};
MODULE_ALIAS_FS("smb3");
MODULE_ALIAS("smb3");
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index b87456bae1a1..c9e2e6bbca13 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -156,5 +156,5 @@ extern int cifs_truncate_page(struct address_space *mapping, loff_t from);
extern const struct export_operations cifs_export_ops;
#endif /* CONFIG_CIFS_NFSD_EXPORT */
-#define CIFS_VERSION "2.25"
+#define CIFS_VERSION "2.26"
#endif /* _CIFSFS_H */
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index de82cfa44b1a..0d956360e984 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -1281,6 +1281,7 @@ struct cifs_fid {
__u64 volatile_fid; /* volatile file id for smb2 */
__u8 lease_key[SMB2_LEASE_KEY_SIZE]; /* lease key for smb2 */
__u8 create_guid[16];
+ __u32 access;
struct cifs_pending_open *pending_open;
unsigned int epoch;
#ifdef CONFIG_CIFS_DEBUG2
@@ -1741,6 +1742,12 @@ static inline bool is_retryable_error(int error)
return false;
}
+
+/* cifs_get_writable_file() flags; the non-zero values are tested there with '&' */
+#define FIND_WR_ANY 0 /* no restriction bits set */
+#define FIND_WR_FSUID_ONLY 1 /* skip open files whose uid differs from current_fsuid() */
+#define FIND_WR_WITH_DELETE 2 /* skip open files whose fid.access lacks DELETE */
+
#define MID_FREE 0
#define MID_REQUEST_ALLOCATED 1
#define MID_REQUEST_SUBMITTED 2
diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h
index 79d842e7240c..593d826820c3 100644
--- a/fs/cifs/cifspdu.h
+++ b/fs/cifs/cifspdu.h
@@ -1021,7 +1021,7 @@ typedef struct smb_com_writex_req {
__le16 ByteCount;
__u8 Pad; /* BB check for whether padded to DWORD
boundary and optimum performance here */
- char Data[0];
+ char Data[];
} __attribute__((packed)) WRITEX_REQ;
typedef struct smb_com_write_req {
@@ -1041,7 +1041,7 @@ typedef struct smb_com_write_req {
__le16 ByteCount;
__u8 Pad; /* BB check for whether padded to DWORD
boundary and optimum performance here */
- char Data[0];
+ char Data[];
} __attribute__((packed)) WRITE_REQ;
typedef struct smb_com_write_rsp {
@@ -1306,7 +1306,7 @@ typedef struct smb_com_ntransact_req {
/* SetupCount words follow then */
__le16 ByteCount;
__u8 Pad[3];
- __u8 Parms[0];
+ __u8 Parms[];
} __attribute__((packed)) NTRANSACT_REQ;
typedef struct smb_com_ntransact_rsp {
@@ -1523,7 +1523,7 @@ struct file_notify_information {
__le32 NextEntryOffset;
__le32 Action;
__le32 FileNameLength;
- __u8 FileName[0];
+ __u8 FileName[];
} __attribute__((packed));
/* For IO_REPARSE_TAG_SYMLINK */
@@ -1536,7 +1536,7 @@ struct reparse_symlink_data {
__le16 PrintNameOffset;
__le16 PrintNameLength;
__le32 Flags;
- char PathBuffer[0];
+ char PathBuffer[];
} __attribute__((packed));
/* Flag above */
@@ -1553,7 +1553,7 @@ struct reparse_posix_data {
__le16 ReparseDataLength;
__u16 Reserved;
__le64 InodeType; /* LNK, FIFO, CHR etc. */
- char PathBuffer[0];
+ char PathBuffer[];
} __attribute__((packed));
struct cifs_quota_data {
@@ -1691,6 +1691,7 @@ struct smb_t2_rsp {
#define SMB_FIND_FILE_ID_FULL_DIR_INFO 0x105
#define SMB_FIND_FILE_ID_BOTH_DIR_INFO 0x106
#define SMB_FIND_FILE_UNIX 0x202
+#define SMB_FIND_FILE_POSIX_INFO 0x064
typedef struct smb_com_transaction2_qpi_req {
struct smb_hdr hdr; /* wct = 14+ */
@@ -1761,7 +1762,7 @@ struct set_file_rename {
__le32 overwrite; /* 1 = overwrite dest */
__u32 root_fid; /* zero */
__le32 target_name_len;
- char target_name[0]; /* Must be unicode */
+ char target_name[]; /* Must be unicode */
} __attribute__((packed));
struct smb_com_transaction2_sfi_req {
@@ -2450,7 +2451,7 @@ struct cifs_posix_acl { /* access conrol list (ACL) */
__le16 version;
__le16 access_entry_count; /* access ACL - count of entries */
__le16 default_entry_count; /* default ACL - count of entries */
- struct cifs_posix_ace ace_array[0];
+ struct cifs_posix_ace ace_array[];
/* followed by
struct cifs_posix_ace default_ace_arraay[] */
} __attribute__((packed)); /* level 0x204 */
@@ -2756,7 +2757,7 @@ typedef struct file_xattr_info {
/* BB do we need another field for flags? BB */
__u32 xattr_name_len;
__u32 xattr_value_len;
- char xattr_name[0];
+ char xattr_name[];
/* followed by xattr_value[xattr_value_len], no pad */
} __attribute__((packed)) FILE_XATTR_INFO; /* extended attribute info
level 0x205 */
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 89eaaf46d1ca..12a895e02db4 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -134,11 +134,12 @@ extern bool backup_cred(struct cifs_sb_info *);
extern bool is_size_safe_to_change(struct cifsInodeInfo *, __u64 eof);
extern void cifs_update_eof(struct cifsInodeInfo *cifsi, loff_t offset,
unsigned int bytes_written);
-extern struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *, bool);
+extern struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *, int);
extern int cifs_get_writable_file(struct cifsInodeInfo *cifs_inode,
- bool fsuid_only,
+ int flags,
struct cifsFileInfo **ret_file);
extern int cifs_get_writable_path(struct cifs_tcon *tcon, const char *name,
+ int flags,
struct cifsFileInfo **ret_file);
extern struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *, bool);
extern int cifs_get_readable_path(struct cifs_tcon *tcon, const char *name,
@@ -601,6 +602,11 @@ int smb2_parse_query_directory(struct cifs_tcon *tcon, struct kvec *rsp_iov,
int resp_buftype,
struct cifs_search_info *srch_inf);
+struct super_block *cifs_get_tcp_super(struct TCP_Server_Info *server);
+void cifs_put_tcp_super(struct super_block *sb);
+int update_super_prepath(struct cifs_tcon *tcon, const char *prefix,
+ size_t prefix_len);
+
#ifdef CONFIG_CIFS_DFS_UPCALL
static inline int get_dfs_path(const unsigned int xid, struct cifs_ses *ses,
const char *old_path,
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 3c89569e7210..140efc1a9374 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -162,9 +162,18 @@ static int __cifs_reconnect_tcon(const struct nls_table *nlsc,
for (it = dfs_cache_get_tgt_iterator(&tl); it;
it = dfs_cache_get_next_tgt(&tl, it)) {
- const char *tgt = dfs_cache_get_tgt_name(it);
+ const char *share, *prefix;
+ size_t share_len, prefix_len;
- extract_unc_hostname(tgt, &dfs_host, &dfs_host_len);
+ rc = dfs_cache_get_tgt_share(it, &share, &share_len, &prefix,
+ &prefix_len);
+ if (rc) {
+ cifs_dbg(VFS, "%s: failed to parse target share %d\n",
+ __func__, rc);
+ continue;
+ }
+
+ extract_unc_hostname(share, &dfs_host, &dfs_host_len);
if (dfs_host_len != tcp_host_len
|| strncasecmp(dfs_host, tcp_host, dfs_host_len) != 0) {
@@ -175,11 +184,13 @@ static int __cifs_reconnect_tcon(const struct nls_table *nlsc,
continue;
}
- scnprintf(tree, MAX_TREE_SIZE, "\\%s", tgt);
+ scnprintf(tree, MAX_TREE_SIZE, "\\%.*s", (int)share_len, share);
rc = CIFSTCon(0, tcon->ses, tree, tcon, nlsc);
- if (!rc)
+ if (!rc) {
+ rc = update_super_prepath(tcon, prefix, prefix_len);
break;
+ }
if (rc == -EREMOTE)
break;
}
@@ -320,7 +331,7 @@ cifs_reconnect_tcon(struct cifs_tcon *tcon, int smb_command)
atomic_inc(&tconInfoReconnectCount);
/* tell server Unix caps we support */
- if (ses->capabilities & CAP_UNIX)
+ if (cap_unix(ses))
reset_cifs_unix_caps(0, tcon, NULL, NULL);
/*
@@ -1492,6 +1503,7 @@ openRetry:
*oplock = rsp->OplockLevel;
/* cifs fid stays in le */
oparms->fid->netfid = rsp->Fid;
+ oparms->fid->access = desired_access;
/* Let caller know file was created so we can set the mode. */
/* Do we care about the CreateAction in any other cases? */
@@ -1590,7 +1602,6 @@ cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid)
if (server->ops->is_session_expired &&
server->ops->is_session_expired(buf)) {
cifs_reconnect(server);
- wake_up(&server->response_q);
return -1;
}
@@ -2115,7 +2126,7 @@ cifs_writev_requeue(struct cifs_writedata *wdata)
wdata2->tailsz = tailsz;
wdata2->bytes = cur_len;
- rc = cifs_get_writable_file(CIFS_I(inode), false,
+ rc = cifs_get_writable_file(CIFS_I(inode), FIND_WR_ANY,
&wdata2->cfile);
if (!wdata2->cfile) {
cifs_dbg(VFS, "No writable handle to retry writepages rc=%d\n",
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index a941ac7a659d..95b3ab0ca8c0 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -21,6 +21,7 @@
#include <linux/fs.h>
#include <linux/net.h>
#include <linux/string.h>
+#include <linux/sched/mm.h>
#include <linux/sched/signal.h>
#include <linux/list.h>
#include <linux/wait.h>
@@ -57,7 +58,6 @@
#include "smb2proto.h"
#include "smbdirect.h"
#include "dns_resolve.h"
-#include "cifsfs.h"
#ifdef CONFIG_CIFS_DFS_UPCALL
#include "dfs_cache.h"
#endif
@@ -389,54 +389,7 @@ static inline int reconn_set_ipaddr(struct TCP_Server_Info *server)
#endif
#ifdef CONFIG_CIFS_DFS_UPCALL
-struct super_cb_data {
- struct TCP_Server_Info *server;
- struct super_block *sb;
-};
-
/* These functions must be called with server->srv_mutex held */
-
-static void super_cb(struct super_block *sb, void *arg)
-{
- struct super_cb_data *d = arg;
- struct cifs_sb_info *cifs_sb;
- struct cifs_tcon *tcon;
-
- if (d->sb)
- return;
-
- cifs_sb = CIFS_SB(sb);
- tcon = cifs_sb_master_tcon(cifs_sb);
- if (tcon->ses->server == d->server)
- d->sb = sb;
-}
-
-static struct super_block *get_tcp_super(struct TCP_Server_Info *server)
-{
- struct super_cb_data d = {
- .server = server,
- .sb = NULL,
- };
-
- iterate_supers_type(&cifs_fs_type, super_cb, &d);
-
- if (unlikely(!d.sb))
- return ERR_PTR(-ENOENT);
- /*
- * Grab an active reference in order to prevent automounts (DFS links)
- * of expiring and then freeing up our cifs superblock pointer while
- * we're doing failover.
- */
- cifs_sb_active(d.sb);
- return d.sb;
-}
-
-static inline void put_tcp_super(struct super_block *sb)
-{
- if (!IS_ERR_OR_NULL(sb))
- cifs_sb_deactive(sb);
-}
-
static void reconn_inval_dfs_target(struct TCP_Server_Info *server,
struct cifs_sb_info *cifs_sb,
struct dfs_cache_tgt_list *tgt_list,
@@ -508,7 +461,7 @@ cifs_reconnect(struct TCP_Server_Info *server)
server->nr_targets = 1;
#ifdef CONFIG_CIFS_DFS_UPCALL
spin_unlock(&GlobalMid_Lock);
- sb = get_tcp_super(server);
+ sb = cifs_get_tcp_super(server);
if (IS_ERR(sb)) {
rc = PTR_ERR(sb);
cifs_dbg(FYI, "%s: will not do DFS failover: rc = %d\n",
@@ -535,8 +488,9 @@ cifs_reconnect(struct TCP_Server_Info *server)
spin_unlock(&GlobalMid_Lock);
#ifdef CONFIG_CIFS_DFS_UPCALL
dfs_cache_free_tgts(&tgt_list);
- put_tcp_super(sb);
+ cifs_put_tcp_super(sb);
#endif
+ wake_up(&server->response_q);
return rc;
} else
server->tcpStatus = CifsNeedReconnect;
@@ -666,11 +620,12 @@ cifs_reconnect(struct TCP_Server_Info *server)
}
- put_tcp_super(sb);
+ cifs_put_tcp_super(sb);
#endif
if (server->tcpStatus == CifsNeedNegotiate)
mod_delayed_work(cifsiod_wq, &server->echo, 0);
+ wake_up(&server->response_q);
return rc;
}
@@ -765,7 +720,6 @@ server_unresponsive(struct TCP_Server_Info *server)
cifs_server_dbg(VFS, "has not responded in %lu seconds. Reconnecting...\n",
(3 * server->echo_interval) / HZ);
cifs_reconnect(server);
- wake_up(&server->response_q);
return true;
}
@@ -898,7 +852,6 @@ is_smb_response(struct TCP_Server_Info *server, unsigned char type)
*/
cifs_set_port((struct sockaddr *)&server->dstaddr, CIFS_PORT);
cifs_reconnect(server);
- wake_up(&server->response_q);
break;
default:
cifs_server_dbg(VFS, "RFC 1002 unknown response type 0x%x\n", type);
@@ -1070,7 +1023,6 @@ standard_receive3(struct TCP_Server_Info *server, struct mid_q_entry *mid)
server->vals->header_preamble_size) {
cifs_server_dbg(VFS, "SMB response too long (%u bytes)\n", pdu_length);
cifs_reconnect(server);
- wake_up(&server->response_q);
return -ECONNABORTED;
}
@@ -1118,7 +1070,6 @@ cifs_handle_standard(struct TCP_Server_Info *server, struct mid_q_entry *mid)
if (server->ops->is_session_expired &&
server->ops->is_session_expired(buf)) {
cifs_reconnect(server);
- wake_up(&server->response_q);
return -1;
}
@@ -1164,8 +1115,9 @@ cifs_demultiplex_thread(void *p)
struct task_struct *task_to_wake = NULL;
struct mid_q_entry *mids[MAX_COMPOUND];
char *bufs[MAX_COMPOUND];
+ unsigned int noreclaim_flag;
- current->flags |= PF_MEMALLOC;
+ noreclaim_flag = memalloc_noreclaim_save();
cifs_dbg(FYI, "Demultiplex PID: %d\n", task_pid_nr(current));
length = atomic_inc_return(&tcpSesAllocCount);
@@ -1212,7 +1164,6 @@ next_pdu:
cifs_server_dbg(VFS, "SMB response too short (%u bytes)\n",
server->pdu_size);
cifs_reconnect(server);
- wake_up(&server->response_q);
continue;
}
@@ -1320,6 +1271,7 @@ next_pdu:
set_current_state(TASK_RUNNING);
}
+ memalloc_noreclaim_restore(noreclaim_flag);
module_put_and_exit(0);
}
@@ -1522,6 +1474,9 @@ cifs_parse_smb_version(char *value, struct smb_vol *vol, bool is_smb3)
cifs_dbg(VFS, "vers=1.0 (cifs) not permitted when mounting with smb3\n");
return 1;
}
+ cifs_dbg(VFS, "Use of the less secure dialect vers=1.0 "
+ "is not recommended unless required for "
+ "access to very old servers\n");
vol->ops = &smb1_operations;
vol->vals = &smb1_values;
break;
@@ -2517,11 +2472,12 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
pr_notice("CIFS: ignoring forcegid mount option specified with no gid= option.\n");
if (got_version == false)
- pr_warn("No dialect specified on mount. Default has changed to "
- "a more secure dialect, SMB2.1 or later (e.g. SMB3), from CIFS "
- "(SMB1). To use the less secure SMB1 dialect to access "
- "old servers which do not support SMB3 (or SMB2.1) specify vers=1.0"
- " on mount.\n");
+ pr_warn_once("No dialect specified on mount. Default has changed"
+ " to a more secure dialect, SMB2.1 or later (e.g. "
+ "SMB3.1.1), from CIFS (SMB1). To use the less secure "
+ "SMB1 dialect to access old servers which do not "
+ "support SMB3.1.1 (or even SMB3 or SMB2.1) specify "
+ "vers=1.0 on mount.\n");
kfree(mountdata_copy);
return 0;
@@ -4151,7 +4107,7 @@ int cifs_setup_cifs_sb(struct smb_vol *pvolume_info,
cifs_sb->mnt_gid = pvolume_info->linux_gid;
cifs_sb->mnt_file_mode = pvolume_info->file_mode;
cifs_sb->mnt_dir_mode = pvolume_info->dir_mode;
- cifs_dbg(FYI, "file mode: 0x%hx dir mode: 0x%hx\n",
+ cifs_dbg(FYI, "file mode: %04ho dir mode: %04ho\n",
cifs_sb->mnt_file_mode, cifs_sb->mnt_dir_mode);
cifs_sb->actimeo = pvolume_info->actimeo;
@@ -4999,6 +4955,15 @@ int cifs_mount(struct cifs_sb_info *cifs_sb, struct smb_vol *vol)
* dentry revalidation to think the dentry are stale (ESTALE).
*/
cifs_autodisable_serverino(cifs_sb);
+ /*
+ * Force the use of prefix path to support failover on DFS paths that
+ * resolve to targets that have different prefix paths.
+ */
+ cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_USE_PREFIX_PATH;
+ kfree(cifs_sb->prepath);
+ cifs_sb->prepath = vol->prepath;
+ vol->prepath = NULL;
+
out:
free_xid(xid);
cifs_try_adding_channels(ses);
diff --git a/fs/cifs/dfs_cache.c b/fs/cifs/dfs_cache.c
index 43c1b43a07ec..a67f88bf7ae1 100644
--- a/fs/cifs/dfs_cache.c
+++ b/fs/cifs/dfs_cache.c
@@ -1260,6 +1260,44 @@ void dfs_cache_del_vol(const char *fullpath)
kref_put(&vi->refcnt, vol_release);
}
+/**
+ * dfs_cache_get_tgt_share - parse a DFS target
+ *
+ * @it: DFS target iterator.
+ * @share: tree name.
+ * @share_len: length of tree name.
+ * @prefix: prefix path.
+ * @prefix_len: length of prefix path.
+ *
+ * The target name (@it->it_name) must begin with a backslash or a forward
+ * slash; that first character is used as the separator for the whole name.
+ * The tree name runs from the start of the target name up to, but not
+ * including, the third separator, or to the end of the string when there is
+ * no third separator. The prefix path is whatever follows that separator and
+ * is empty (length 0) when nothing follows.
+ *
+ * Nothing is copied: @share and @prefix are set to point into @it->it_name.
+ * The tree name is therefore not necessarily NUL-terminated at @share_len and
+ * must always be used together with its length.
+ *
+ * Return zero if target was parsed correctly, otherwise non-zero.
+ * -EINVAL is returned when any argument is NULL, when the target name does
+ * not start with one of the two separators, or when it contains no second
+ * separator.
+ */
+int dfs_cache_get_tgt_share(const struct dfs_cache_tgt_iterator *it,
+ const char **share, size_t *share_len,
+ const char **prefix, size_t *prefix_len)
+{
+ char *s, sep;
+
+ if (!it || !share || !share_len || !prefix || !prefix_len)
+ return -EINVAL;
+
+ sep = it->it_name[0]; /* the first character decides which separator this name uses */
+ if (sep != '\\' && sep != '/')
+ return -EINVAL;
+
+ s = strchr(it->it_name + 1, sep); /* second separator */
+ if (!s)
+ return -EINVAL;
+
+ s = strchrnul(s + 1, sep); /* third separator, or the terminating NUL if there is none */
+
+ *share = it->it_name;
+ *share_len = s - it->it_name;
+ *prefix = *s ? s + 1 : s; /* step over the separator; stay on the NUL when nothing follows */
+ *prefix_len = &it->it_name[strlen(it->it_name)] - *prefix; /* bytes from the prefix start to the NUL */
+
+ return 0;
+}
+
/* Get all tcons that are within a DFS namespace and can be refreshed */
static void get_tcons(struct TCP_Server_Info *server, struct list_head *head)
{
diff --git a/fs/cifs/dfs_cache.h b/fs/cifs/dfs_cache.h
index 99ee44f8ad07..bf94d08cfb5a 100644
--- a/fs/cifs/dfs_cache.h
+++ b/fs/cifs/dfs_cache.h
@@ -49,6 +49,10 @@ extern int dfs_cache_update_vol(const char *fullpath,
struct TCP_Server_Info *server);
extern void dfs_cache_del_vol(const char *fullpath);
+extern int dfs_cache_get_tgt_share(const struct dfs_cache_tgt_iterator *it,
+ const char **share, size_t *share_len,
+ const char **prefix, size_t *prefix_len);
+
static inline struct dfs_cache_tgt_iterator *
dfs_cache_get_next_tgt(struct dfs_cache_tgt_list *tl,
struct dfs_cache_tgt_iterator *it)
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 0ef099442f20..36e7b2fd2190 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -555,7 +555,6 @@ cifs_atomic_open(struct inode *inode, struct dentry *direntry,
if (server->ops->close)
server->ops->close(xid, tcon, &fid);
cifs_del_pending_open(&open);
- fput(file);
rc = -ENOMEM;
}
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index bc9516ab4b34..5920820bfbd0 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -1169,7 +1169,8 @@ try_again:
rc = posix_lock_file(file, flock, NULL);
up_write(&cinode->lock_sem);
if (rc == FILE_LOCK_DEFERRED) {
- rc = wait_event_interruptible(flock->fl_wait, !flock->fl_blocker);
+ rc = wait_event_interruptible(flock->fl_wait,
+ list_empty(&flock->fl_blocked_member));
if (!rc)
goto try_again;
locks_delete_block(flock);
@@ -1958,7 +1959,7 @@ struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode,
/* Return -EBADF if no handle is found and general rc otherwise */
int
-cifs_get_writable_file(struct cifsInodeInfo *cifs_inode, bool fsuid_only,
+cifs_get_writable_file(struct cifsInodeInfo *cifs_inode, int flags,
struct cifsFileInfo **ret_file)
{
struct cifsFileInfo *open_file, *inv_file = NULL;
@@ -1966,7 +1967,8 @@ cifs_get_writable_file(struct cifsInodeInfo *cifs_inode, bool fsuid_only,
bool any_available = false;
int rc = -EBADF;
unsigned int refind = 0;
-
+ bool fsuid_only = flags & FIND_WR_FSUID_ONLY;
+ bool with_delete = flags & FIND_WR_WITH_DELETE;
*ret_file = NULL;
/*
@@ -1998,6 +2000,8 @@ refind_writable:
continue;
if (fsuid_only && !uid_eq(open_file->uid, current_fsuid()))
continue;
+ if (with_delete && !(open_file->fid.access & DELETE))
+ continue;
if (OPEN_FMODE(open_file->f_flags) & FMODE_WRITE) {
if (!open_file->invalidHandle) {
/* found a good writable file */
@@ -2045,12 +2049,12 @@ refind_writable:
}
struct cifsFileInfo *
-find_writable_file(struct cifsInodeInfo *cifs_inode, bool fsuid_only)
+find_writable_file(struct cifsInodeInfo *cifs_inode, int flags)
{
struct cifsFileInfo *cfile;
int rc;
- rc = cifs_get_writable_file(cifs_inode, fsuid_only, &cfile);
+ rc = cifs_get_writable_file(cifs_inode, flags, &cfile);
if (rc)
cifs_dbg(FYI, "couldn't find writable handle rc=%d", rc);
@@ -2059,6 +2063,7 @@ find_writable_file(struct cifsInodeInfo *cifs_inode, bool fsuid_only)
int
cifs_get_writable_path(struct cifs_tcon *tcon, const char *name,
+ int flags,
struct cifsFileInfo **ret_file)
{
struct list_head *tmp;
@@ -2085,7 +2090,7 @@ cifs_get_writable_path(struct cifs_tcon *tcon, const char *name,
kfree(full_path);
cinode = CIFS_I(d_inode(cfile->dentry));
spin_unlock(&tcon->open_file_lock);
- return cifs_get_writable_file(cinode, 0, ret_file);
+ return cifs_get_writable_file(cinode, flags, ret_file);
}
spin_unlock(&tcon->open_file_lock);
@@ -2162,7 +2167,8 @@ static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to)
if (mapping->host->i_size - offset < (loff_t)to)
to = (unsigned)(mapping->host->i_size - offset);
- rc = cifs_get_writable_file(CIFS_I(mapping->host), false, &open_file);
+ rc = cifs_get_writable_file(CIFS_I(mapping->host), FIND_WR_ANY,
+ &open_file);
if (!rc) {
bytes_written = cifs_write(open_file, open_file->pid,
write_data, to - from, &offset);
@@ -2355,7 +2361,7 @@ retry:
if (cfile)
cifsFileInfo_put(cfile);
- rc = cifs_get_writable_file(CIFS_I(inode), false, &cfile);
+ rc = cifs_get_writable_file(CIFS_I(inode), FIND_WR_ANY, &cfile);
/* in case of an error store it to return later */
if (rc)
@@ -3835,7 +3841,7 @@ again:
if (rc == -ENODATA)
rc = 0;
- ctx->rc = (rc == 0) ? ctx->total_len : rc;
+ ctx->rc = (rc == 0) ? (ssize_t)ctx->total_len : rc;
mutex_unlock(&ctx->aio_mutex);
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 9ba623b601ec..8d01ec2dca66 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -653,8 +653,8 @@ cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info,
*/
if ((fattr->cf_nlink < 1) && !tcon->unix_ext &&
!info->DeletePending) {
- cifs_dbg(1, "bogus file nlink value %u\n",
- fattr->cf_nlink);
+ cifs_dbg(VFS, "bogus file nlink value %u\n",
+ fattr->cf_nlink);
fattr->cf_flags |= CIFS_FATTR_UNKNOWN_NLINK;
}
}
@@ -1648,7 +1648,7 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, umode_t mode)
struct TCP_Server_Info *server;
char *full_path;
- cifs_dbg(FYI, "In cifs_mkdir, mode = 0x%hx inode = 0x%p\n",
+ cifs_dbg(FYI, "In cifs_mkdir, mode = %04ho inode = 0x%p\n",
mode, inode);
cifs_sb = CIFS_SB(inode->i_sb);
@@ -1835,6 +1835,8 @@ cifs_do_rename(const unsigned int xid, struct dentry *from_dentry,
CIFSSMBClose(xid, tcon, fid.netfid);
}
do_rename_exit:
+ if (rc == 0)
+ d_move(from_dentry, to_dentry);
cifs_put_tlink(tlink);
return rc;
}
@@ -2073,6 +2075,7 @@ int cifs_revalidate_dentry_attr(struct dentry *dentry)
struct inode *inode = d_inode(dentry);
struct super_block *sb = dentry->d_sb;
char *full_path = NULL;
+ int count = 0;
if (inode == NULL)
return -ENOENT;
@@ -2094,15 +2097,18 @@ int cifs_revalidate_dentry_attr(struct dentry *dentry)
full_path, inode, inode->i_count.counter,
dentry, cifs_get_time(dentry), jiffies);
+again:
if (cifs_sb_master_tcon(CIFS_SB(sb))->unix_ext)
rc = cifs_get_inode_info_unix(&inode, full_path, sb, xid);
else
rc = cifs_get_inode_info(&inode, full_path, NULL, sb,
xid, NULL);
-
+ if (rc == -EAGAIN && count++ < 10)
+ goto again;
out:
kfree(full_path);
free_xid(xid);
+
return rc;
}
@@ -2144,8 +2150,9 @@ int cifs_getattr(const struct path *path, struct kstat *stat,
* We need to be sure that all dirty pages are written and the server
* has actual ctime, mtime and file length.
*/
- if (!CIFS_CACHE_READ(CIFS_I(inode)) && inode->i_mapping &&
- inode->i_mapping->nrpages != 0) {
+ if ((request_mask & (STATX_CTIME | STATX_MTIME | STATX_SIZE)) &&
+ !CIFS_CACHE_READ(CIFS_I(inode)) &&
+ inode->i_mapping && inode->i_mapping->nrpages != 0) {
rc = filemap_fdatawait(inode->i_mapping);
if (rc) {
mapping_set_error(inode->i_mapping, rc);
@@ -2153,9 +2160,20 @@ int cifs_getattr(const struct path *path, struct kstat *stat,
}
}
- rc = cifs_revalidate_dentry_attr(dentry);
- if (rc)
- return rc;
+ if ((flags & AT_STATX_SYNC_TYPE) == AT_STATX_FORCE_SYNC)
+ CIFS_I(inode)->time = 0; /* force revalidate */
+
+ /*
+ * If the caller doesn't require syncing, only sync if
+ * necessary (e.g. due to earlier truncate or setattr
+ * invalidating the cached metadata)
+ */
+ if (((flags & AT_STATX_SYNC_TYPE) != AT_STATX_DONT_SYNC) ||
+ (CIFS_I(inode)->time == 0)) {
+ rc = cifs_revalidate_dentry_attr(dentry);
+ if (rc)
+ return rc;
+ }
generic_fillattr(inode, stat);
stat->blksize = cifs_sb->bsize;
@@ -2187,7 +2205,7 @@ int cifs_getattr(const struct path *path, struct kstat *stat,
if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_GID))
stat->gid = current_fsgid();
}
- return rc;
+ return 0;
}
int cifs_fiemap(struct inode *inode, struct fiemap_extent_info *fei, u64 start,
@@ -2278,7 +2296,7 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs,
* writebehind data than the SMB timeout for the SetPathInfo
* request would allow
*/
- open_file = find_writable_file(cifsInode, true);
+ open_file = find_writable_file(cifsInode, FIND_WR_FSUID_ONLY);
if (open_file) {
tcon = tlink_tcon(open_file->tlink);
server = tcon->ses->server;
@@ -2428,7 +2446,7 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs)
args->ctime = NO_CHANGE_64;
args->device = 0;
- open_file = find_writable_file(cifsInode, true);
+ open_file = find_writable_file(cifsInode, FIND_WR_FSUID_ONLY);
if (open_file) {
u16 nfid = open_file->fid.netfid;
u32 npid = open_file->pid;
@@ -2512,26 +2530,27 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs)
/*
* Attempt to flush data before changing attributes. We need to do
- * this for ATTR_SIZE and ATTR_MTIME for sure, and if we change the
- * ownership or mode then we may also need to do this. Here, we take
- * the safe way out and just do the flush on all setattr requests. If
- * the flush returns error, store it to report later and continue.
+ * this for ATTR_SIZE and ATTR_MTIME. If the flush of the data
+ * returns error, store it to report later and continue.
*
* BB: This should be smarter. Why bother flushing pages that
* will be truncated anyway? Also, should we error out here if
- * the flush returns error?
+ * the flush returns error? Do we need to check for ATTR_MTIME_SET flag?
*/
- rc = filemap_write_and_wait(inode->i_mapping);
- if (is_interrupt_error(rc)) {
- rc = -ERESTARTSYS;
- goto cifs_setattr_exit;
+ if (attrs->ia_valid & (ATTR_MTIME | ATTR_SIZE | ATTR_CTIME)) {
+ rc = filemap_write_and_wait(inode->i_mapping);
+ if (is_interrupt_error(rc)) {
+ rc = -ERESTARTSYS;
+ goto cifs_setattr_exit;
+ }
+ mapping_set_error(inode->i_mapping, rc);
}
- mapping_set_error(inode->i_mapping, rc);
rc = 0;
- if (attrs->ia_valid & ATTR_MTIME) {
- rc = cifs_get_writable_file(cifsInode, false, &wfile);
+ if ((attrs->ia_valid & ATTR_MTIME) &&
+ !(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOSSYNC)) {
+ rc = cifs_get_writable_file(cifsInode, FIND_WR_ANY, &wfile);
if (!rc) {
tcon = tlink_tcon(wfile->tlink);
rc = tcon->ses->server->ops->flush(xid, tcon, &wfile->fid);
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index 852aa00ec729..a25ef35b023e 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -416,7 +416,7 @@ smb3_query_mf_symlink(unsigned int xid, struct cifs_tcon *tcon,
}
rc = SMB2_open(xid, &oparms, utf16_path, &oplock, pfile_info, NULL,
- NULL);
+ NULL, NULL);
if (rc)
goto qmf_out_open_fail;
@@ -470,7 +470,7 @@ smb3_create_mf_symlink(unsigned int xid, struct cifs_tcon *tcon,
oparms.reconnect = false;
rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, NULL,
- NULL);
+ NULL, NULL);
if (rc) {
kfree(utf16_path);
return rc;
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index 40ca394fd5de..a456febd4109 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -31,6 +31,7 @@
#include "nterr.h"
#include "cifs_unicode.h"
#include "smb2pdu.h"
+#include "cifsfs.h"
extern mempool_t *cifs_sm_req_poolp;
extern mempool_t *cifs_req_poolp;
@@ -1022,3 +1023,82 @@ int copy_path_name(char *dst, const char *src)
name_len++;
return name_len;
}
+
+struct super_cb_data { /* search state handed to super_cb() through iterate_supers_type() */
+ struct TCP_Server_Info *server; /* in: server that super_cb() compares each superblock against */
+ struct super_block *sb; /* out: first matching superblock; stays NULL until one is found */
+};
+
+/*
+ * iterate_supers_type() callback: remember the first superblock whose master
+ * tcon's session uses the server stored in @arg (a struct super_cb_data).
+ */
+static void super_cb(struct super_block *sb, void *arg)
+{
+	struct super_cb_data *data = arg;
+
+	/* a match was already recorded; keep the first one */
+	if (data->sb)
+		return;
+
+	if (cifs_sb_master_tcon(CIFS_SB(sb))->ses->server == data->server)
+		data->sb = sb;
+}
+
+/*
+ * Look up a CIFS superblock whose master tcon is connected through @server
+ * and take an active reference on it.
+ *
+ * Returns the superblock, or ERR_PTR(-ENOENT) when no superblock matches.
+ */
+struct super_block *cifs_get_tcp_super(struct TCP_Server_Info *server)
+{
+	struct super_cb_data search = {
+		.server = server,
+		.sb = NULL,
+	};
+	struct super_block *found;
+
+	iterate_supers_type(&cifs_fs_type, super_cb, &search);
+	found = search.sb;
+
+	if (likely(found)) {
+		/*
+		 * Hold an active reference so that an expiring automount
+		 * (DFS link) cannot free this cifs superblock while failover
+		 * is still using it.
+		 */
+		cifs_sb_active(found);
+		return found;
+	}
+
+	return ERR_PTR(-ENOENT);
+}
+
+/* Deactivate @sb via cifs_sb_deactive(); error pointers and NULL are ignored. */
+void cifs_put_tcp_super(struct super_block *sb)
+{
+	if (IS_ERR_OR_NULL(sb))
+		return;
+
+	cifs_sb_deactive(sb);
+}
+
+/*
+ * Replace the prefix path of the superblock found for @tcon's server.
+ *
+ * The old prepath is freed. When @prefix is non-empty and @prefix_len is
+ * non-zero, at most @prefix_len bytes of @prefix are duplicated and their
+ * delimiters converted to the mount's directory separator; otherwise the
+ * prepath is cleared. CIFS_MOUNT_USE_PREFIX_PATH is set on success.
+ *
+ * Returns 0 on success, -ENOMEM if the copy cannot be allocated (the prepath
+ * is then left NULL and the flag untouched), or the error reported by
+ * cifs_get_tcp_super().
+ */
+int update_super_prepath(struct cifs_tcon *tcon, const char *prefix,
+			 size_t prefix_len)
+{
+	struct cifs_sb_info *cifs_sb;
+	struct super_block *sb;
+
+	sb = cifs_get_tcp_super(tcon->ses->server);
+	if (IS_ERR(sb))
+		return PTR_ERR(sb);
+
+	cifs_sb = CIFS_SB(sb);
+	kfree(cifs_sb->prepath);
+
+	if (!*prefix || !prefix_len) {
+		cifs_sb->prepath = NULL;
+	} else {
+		cifs_sb->prepath = kstrndup(prefix, prefix_len, GFP_ATOMIC);
+		if (!cifs_sb->prepath) {
+			cifs_put_tcp_super(sb);
+			return -ENOMEM;
+		}
+
+		convert_delimiter(cifs_sb->prepath, CIFS_DIR_SEP(cifs_sb));
+	}
+
+	cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_USE_PREFIX_PATH;
+
+	cifs_put_tcp_super(sb);
+	return 0;
+}
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index ba9dadf3be24..19e4a5d3b4ca 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -32,6 +32,7 @@
#include "cifs_debug.h"
#include "cifs_fs_sb.h"
#include "cifsfs.h"
+#include "smb2proto.h"
/*
* To be safe - for UCS to UTF-8 with strings loaded with the rare long
@@ -217,6 +218,60 @@ cifs_fill_common_info(struct cifs_fattr *fattr, struct cifs_sb_info *cifs_sb)
}
}
+/*
+ * Fill a cifs_fattr struct with info from SMB_FIND_FILE_POSIX_INFO.
+ *
+ * @fattr is zeroed and then populated from the little-endian fields of @info.
+ * Only directories are told apart here: every entry without ATTR_DIRECTORY is
+ * reported as a regular file. Ownership comes from the mount's uid/gid because
+ * the SIDs are not mapped yet.
+ */
+static void
+cifs_posix_to_fattr(struct cifs_fattr *fattr, struct smb2_posix_info *info,
+		    struct cifs_sb_info *cifs_sb)
+{
+	struct smb2_posix_info_parsed parsed;
+
+	/*
+	 * NOTE(review): parsed is not read below yet; presumably kept for the
+	 * SID mapping TODO at the end of this function.
+	 */
+	posix_info_parse(info, NULL, &parsed);
+
+	memset(fattr, 0, sizeof(*fattr));
+	fattr->cf_uniqueid = le64_to_cpu(info->Inode);
+	fattr->cf_bytes = le64_to_cpu(info->AllocationSize);
+	fattr->cf_eof = le64_to_cpu(info->EndOfFile);
+
+	fattr->cf_atime = cifs_NTtimeToUnix(info->LastAccessTime);
+	fattr->cf_mtime = cifs_NTtimeToUnix(info->LastWriteTime);
+	fattr->cf_ctime = cifs_NTtimeToUnix(info->CreationTime);
+
+	fattr->cf_nlink = le32_to_cpu(info->HardLinks);
+	fattr->cf_cifsattrs = le32_to_cpu(info->DosAttributes);
+
+	/*
+	 * Since we set the inode type below we need to mask off
+	 * to avoid strange results if bits set above.
+	 * XXX: why not make server&client use the type bits?
+	 */
+	fattr->cf_mode = le32_to_cpu(info->Mode) & ~S_IFMT;
+
+	/*
+	 * Runs once per directory entry: keep it at FYI level and terminate
+	 * the message with a newline.
+	 */
+	cifs_dbg(FYI, "posix fattr: dev %d, reparse %d, mode %o\n",
+		 le32_to_cpu(info->DeviceId),
+		 le32_to_cpu(info->ReparseTag),
+		 le32_to_cpu(info->Mode));
+
+	if (fattr->cf_cifsattrs & ATTR_DIRECTORY) {
+		fattr->cf_mode |= S_IFDIR;
+		fattr->cf_dtype = DT_DIR;
+	} else {
+		/*
+		 * mark anything that is not a dir as regular
+		 * file. special files should have the REPARSE
+		 * attribute and will be marked as needing revaluation
+		 */
+		fattr->cf_mode |= S_IFREG;
+		fattr->cf_dtype = DT_REG;
+	}
+
+	if (reparse_file_needs_reval(fattr))
+		fattr->cf_flags |= CIFS_FATTR_NEED_REVAL;
+
+	/* TODO map SIDs */
+	fattr->cf_uid = cifs_sb->mnt_uid;
+	fattr->cf_gid = cifs_sb->mnt_gid;
+}
+
static void __dir_info_to_fattr(struct cifs_fattr *fattr, const void *info)
{
const FILE_DIRECTORY_INFO *fi = info;
@@ -359,6 +414,8 @@ ffirst_retry:
/* if (cap_unix(tcon->ses) { */
if (tcon->unix_ext)
cifsFile->srch_inf.info_level = SMB_FIND_FILE_UNIX;
+ else if (tcon->posix_extensions)
+ cifsFile->srch_inf.info_level = SMB_FIND_FILE_POSIX_INFO;
else if ((tcon->ses->capabilities &
tcon->ses->server->vals->cap_nt_find) == 0) {
cifsFile->srch_inf.info_level = SMB_FIND_FILE_INFO_STANDARD;
@@ -451,6 +508,23 @@ struct cifs_dirent {
u64 ino;
};
+/*
+ * Fill @de (name, name length, resume key and inode number) from an
+ * SMB_FIND_FILE_POSIX_INFO entry. @de is left untouched when the entry does
+ * not parse.
+ */
+static void cifs_fill_dirent_posix(struct cifs_dirent *de,
+				   const struct smb2_posix_info *info)
+{
+	struct smb2_posix_info_parsed parsed;
+
+	/* payload should have already been checked at this point */
+	if (posix_info_parse(info, NULL, &parsed) < 0) {
+		/* terminate the message so it does not run into the next log line */
+		cifs_dbg(VFS, "invalid POSIX info payload\n");
+		return;
+	}
+
+	de->name = parsed.name;
+	de->namelen = parsed.name_len;
+	de->resume_key = info->Ignored;
+	de->ino = le64_to_cpu(info->Inode);
+}
+
static void cifs_fill_dirent_unix(struct cifs_dirent *de,
const FILE_UNIX_INFO *info, bool is_unicode)
{
@@ -511,6 +585,9 @@ static int cifs_fill_dirent(struct cifs_dirent *de, const void *info,
memset(de, 0, sizeof(*de));
switch (level) {
+ case SMB_FIND_FILE_POSIX_INFO:
+ cifs_fill_dirent_posix(de, info);
+ break;
case SMB_FIND_FILE_UNIX:
cifs_fill_dirent_unix(de, info, is_unicode);
break;
@@ -786,6 +863,11 @@ static int cifs_filldir(char *find_entry, struct file *file,
}
switch (file_info->srch_inf.info_level) {
+ case SMB_FIND_FILE_POSIX_INFO:
+ cifs_posix_to_fattr(&fattr,
+ (struct smb2_posix_info *)find_entry,
+ cifs_sb);
+ break;
case SMB_FIND_FILE_UNIX:
cifs_unix_basic_to_fattr(&fattr,
&((FILE_UNIX_INFO *)find_entry)->basic,
diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c
index eb994e313c6a..b130efaf8feb 100644
--- a/fs/cifs/smb1ops.c
+++ b/fs/cifs/smb1ops.c
@@ -766,7 +766,7 @@ smb_set_file_info(struct inode *inode, const char *full_path,
struct cifs_tcon *tcon;
/* if the file is already open for write, just use that fileid */
- open_file = find_writable_file(cinode, true);
+ open_file = find_writable_file(cinode, FIND_WR_FSUID_ONLY);
if (open_file) {
fid.netfid = open_file->fid.netfid;
netpid = open_file->pid;
diff --git a/fs/cifs/smb2file.c b/fs/cifs/smb2file.c
index afe1f03aabe3..2fa3ba354cc9 100644
--- a/fs/cifs/smb2file.c
+++ b/fs/cifs/smb2file.c
@@ -62,7 +62,7 @@ smb2_open_file(const unsigned int xid, struct cifs_open_parms *oparms,
smb2_oplock = SMB2_OPLOCK_LEVEL_BATCH;
rc = SMB2_open(xid, oparms, smb2_path, &smb2_oplock, smb2_data, NULL,
- NULL);
+ NULL, NULL);
if (rc)
goto out;
@@ -152,7 +152,12 @@ smb2_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock,
(li->offset + li->length))
continue;
if (current->tgid != li->pid)
- continue;
+ /*
+ * flock and OFD lock are associated with an open
+ * file description, not the process.
+ */
+ if (!(flock->fl_flags & (FL_FLOCK | FL_OFDLCK)))
+ continue;
if (cinode->can_cache_brlcks) {
/*
* We can cache brlock requests - simply remove a lock
diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c
index 1cf207564ff9..a8c301ae00ed 100644
--- a/fs/cifs/smb2inode.c
+++ b/fs/cifs/smb2inode.c
@@ -521,7 +521,7 @@ smb2_mkdir_setinfo(struct inode *inode, const char *name,
cifs_i = CIFS_I(inode);
dosattrs = cifs_i->cifsAttrs | ATTR_READONLY;
data.Attributes = cpu_to_le32(dosattrs);
- cifs_get_writable_path(tcon, name, &cfile);
+ cifs_get_writable_path(tcon, name, FIND_WR_ANY, &cfile);
tmprc = smb2_compound_op(xid, tcon, cifs_sb, name,
FILE_WRITE_ATTRIBUTES, FILE_CREATE,
CREATE_NOT_FILE, ACL_NO_MODE,
@@ -577,7 +577,7 @@ smb2_rename_path(const unsigned int xid, struct cifs_tcon *tcon,
{
struct cifsFileInfo *cfile;
- cifs_get_writable_path(tcon, from_name, &cfile);
+ cifs_get_writable_path(tcon, from_name, FIND_WR_WITH_DELETE, &cfile);
return smb2_set_path_attr(xid, tcon, from_name, to_name,
cifs_sb, DELETE, SMB2_OP_RENAME, cfile);
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index baa825f4cec0..b36c46f48705 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -328,16 +328,6 @@ smb2_negotiate_wsize(struct cifs_tcon *tcon, struct smb_vol *volume_info)
/* start with specified wsize, or default */
wsize = volume_info->wsize ? volume_info->wsize : CIFS_DEFAULT_IOSIZE;
wsize = min_t(unsigned int, wsize, server->max_write);
-#ifdef CONFIG_CIFS_SMB_DIRECT
- if (server->rdma) {
- if (server->sign)
- wsize = min_t(unsigned int,
- wsize, server->smbd_conn->max_fragmented_send_size);
- else
- wsize = min_t(unsigned int,
- wsize, server->smbd_conn->max_readwrite_size);
- }
-#endif
if (!(server->capabilities & SMB2_GLOBAL_CAP_LARGE_MTU))
wsize = min_t(unsigned int, wsize, SMB2_MAX_BUFFER_SIZE);
@@ -356,8 +346,15 @@ smb3_negotiate_wsize(struct cifs_tcon *tcon, struct smb_vol *volume_info)
#ifdef CONFIG_CIFS_SMB_DIRECT
if (server->rdma) {
if (server->sign)
+ /*
+ * Account for SMB2 data transfer packet header and
+ * possible encryption header
+ */
wsize = min_t(unsigned int,
- wsize, server->smbd_conn->max_fragmented_send_size);
+ wsize,
+ server->smbd_conn->max_fragmented_send_size -
+ SMB2_READWRITE_PDU_HEADER_SIZE -
+ sizeof(struct smb2_transform_hdr));
else
wsize = min_t(unsigned int,
wsize, server->smbd_conn->max_readwrite_size);
@@ -378,16 +375,6 @@ smb2_negotiate_rsize(struct cifs_tcon *tcon, struct smb_vol *volume_info)
/* start with specified rsize, or default */
rsize = volume_info->rsize ? volume_info->rsize : CIFS_DEFAULT_IOSIZE;
rsize = min_t(unsigned int, rsize, server->max_read);
-#ifdef CONFIG_CIFS_SMB_DIRECT
- if (server->rdma) {
- if (server->sign)
- rsize = min_t(unsigned int,
- rsize, server->smbd_conn->max_fragmented_recv_size);
- else
- rsize = min_t(unsigned int,
- rsize, server->smbd_conn->max_readwrite_size);
- }
-#endif
if (!(server->capabilities & SMB2_GLOBAL_CAP_LARGE_MTU))
rsize = min_t(unsigned int, rsize, SMB2_MAX_BUFFER_SIZE);
@@ -407,8 +394,15 @@ smb3_negotiate_rsize(struct cifs_tcon *tcon, struct smb_vol *volume_info)
#ifdef CONFIG_CIFS_SMB_DIRECT
if (server->rdma) {
if (server->sign)
+ /*
+ * Account for SMB2 data transfer packet header and
+ * possible encryption header
+ */
rsize = min_t(unsigned int,
- rsize, server->smbd_conn->max_fragmented_recv_size);
+ rsize,
+ server->smbd_conn->max_fragmented_recv_size -
+ SMB2_READWRITE_PDU_HEADER_SIZE -
+ sizeof(struct smb2_transform_hdr));
else
rsize = min_t(unsigned int,
rsize, server->smbd_conn->max_readwrite_size);
@@ -794,7 +788,8 @@ int open_shroot(unsigned int xid, struct cifs_tcon *tcon,
tcon->crfid.has_lease = true;
smb2_parse_contexts(server, o_rsp,
&oparms.fid->epoch,
- oparms.fid->lease_key, &oplock, NULL);
+ oparms.fid->lease_key, &oplock,
+ NULL, NULL);
} else
goto oshr_exit;
@@ -838,7 +833,7 @@ smb3_qfs_tcon(const unsigned int xid, struct cifs_tcon *tcon,
if (no_cached_open)
rc = SMB2_open(xid, &oparms, &srch_path, &oplock, NULL, NULL,
- NULL);
+ NULL, NULL);
else
rc = open_shroot(xid, tcon, cifs_sb, &fid);
@@ -878,7 +873,8 @@ smb2_qfs_tcon(const unsigned int xid, struct cifs_tcon *tcon,
oparms.fid = &fid;
oparms.reconnect = false;
- rc = SMB2_open(xid, &oparms, &srch_path, &oplock, NULL, NULL, NULL);
+ rc = SMB2_open(xid, &oparms, &srch_path, &oplock, NULL, NULL,
+ NULL, NULL);
if (rc)
return;
@@ -913,7 +909,8 @@ smb2_is_path_accessible(const unsigned int xid, struct cifs_tcon *tcon,
oparms.fid = &fid;
oparms.reconnect = false;
- rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, NULL, NULL);
+ rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, NULL, NULL,
+ NULL);
if (rc) {
kfree(utf16_path);
return rc;
@@ -1116,7 +1113,8 @@ smb2_set_ea(const unsigned int xid, struct cifs_tcon *tcon,
void *data[1];
struct smb2_file_full_ea_info *ea = NULL;
struct kvec close_iov[1];
- int rc;
+ struct smb2_query_info_rsp *rsp;
+ int rc, used_len = 0;
if (smb3_encryption_required(tcon))
flags |= CIFS_TRANSFORM_REQ;
@@ -1139,6 +1137,38 @@ smb2_set_ea(const unsigned int xid, struct cifs_tcon *tcon,
cifs_sb);
if (rc == -ENODATA)
goto sea_exit;
+ } else {
+ /* If we are adding a attribute we should first check
+ * if there will be enough space available to store
+ * the new EA. If not we should not add it since we
+ * would not be able to even read the EAs back.
+ */
+ rc = smb2_query_info_compound(xid, tcon, utf16_path,
+ FILE_READ_EA,
+ FILE_FULL_EA_INFORMATION,
+ SMB2_O_INFO_FILE,
+ CIFSMaxBufSize -
+ MAX_SMB2_CREATE_RESPONSE_SIZE -
+ MAX_SMB2_CLOSE_RESPONSE_SIZE,
+ &rsp_iov[1], &resp_buftype[1], cifs_sb);
+ if (rc == 0) {
+ rsp = (struct smb2_query_info_rsp *)rsp_iov[1].iov_base;
+ used_len = le32_to_cpu(rsp->OutputBufferLength);
+ }
+ free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base);
+ resp_buftype[1] = CIFS_NO_BUFFER;
+ memset(&rsp_iov[1], 0, sizeof(rsp_iov[1]));
+ rc = 0;
+
+ /* Use a fudge factor of 256 bytes in case we collide
+ * with a different set_EAs command.
+ */
+ if(CIFSMaxBufSize - MAX_SMB2_CREATE_RESPONSE_SIZE -
+ MAX_SMB2_CLOSE_RESPONSE_SIZE - 256 <
+ used_len + ea_name_len + ea_value_len + 1) {
+ rc = -ENOSPC;
+ goto sea_exit;
+ }
}
}
@@ -1331,6 +1361,7 @@ smb2_set_fid(struct cifsFileInfo *cfile, struct cifs_fid *fid, __u32 oplock)
cfile->fid.persistent_fid = fid->persistent_fid;
cfile->fid.volatile_fid = fid->volatile_fid;
+ cfile->fid.access = fid->access;
#ifdef CONFIG_CIFS_DEBUG2
cfile->fid.mid = fid->mid;
#endif /* CIFS_DEBUG2 */
@@ -2088,7 +2119,8 @@ smb3_notify(const unsigned int xid, struct file *pfile,
oparms.fid = &fid;
oparms.reconnect = false;
- rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, NULL, NULL);
+ rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, NULL, NULL,
+ NULL);
if (rc)
goto notify_exit;
@@ -2188,6 +2220,8 @@ smb2_query_dir_first(const unsigned int xid, struct cifs_tcon *tcon,
goto qdf_free;
}
+ atomic_inc(&tcon->num_remote_opens);
+
qd_rsp = (struct smb2_query_directory_rsp *)rsp_iov[1].iov_base;
if (qd_rsp->sync_hdr.Status == STATUS_NO_MORE_FILES) {
trace_smb3_query_dir_done(xid, fid->persistent_fid,
@@ -2507,7 +2541,8 @@ smb311_queryfs(const unsigned int xid, struct cifs_tcon *tcon,
oparms.fid = &fid;
oparms.reconnect = false;
- rc = SMB2_open(xid, &oparms, &srch_path, &oplock, NULL, NULL, NULL);
+ rc = SMB2_open(xid, &oparms, &srch_path, &oplock, NULL, NULL,
+ NULL, NULL);
if (rc)
return rc;
@@ -2992,7 +3027,8 @@ get_smb2_acl_by_path(struct cifs_sb_info *cifs_sb,
oparms.fid = &fid;
oparms.reconnect = false;
- rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, NULL, NULL);
+ rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, NULL, NULL,
+ NULL);
kfree(utf16_path);
if (!rc) {
rc = SMB2_query_acl(xid, tlink_tcon(tlink), fid.persistent_fid,
@@ -3050,7 +3086,8 @@ set_smb2_acl(struct cifs_ntsd *pnntsd, __u32 acllen,
oparms.fid = &fid;
oparms.reconnect = false;
- rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, NULL, NULL);
+ rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, NULL,
+ NULL, NULL);
kfree(utf16_path);
if (!rc) {
rc = SMB2_set_acl(xid, tlink_tcon(tlink), fid.persistent_fid,
@@ -3212,6 +3249,10 @@ static long smb3_simple_falloc(struct file *file, struct cifs_tcon *tcon,
* Extending the file
*/
if ((keep_size == false) && i_size_read(inode) < off + len) {
+ rc = inode_newsize_ok(inode, off + len);
+ if (rc)
+ goto out;
+
if ((cifsi->cifsAttrs & FILE_ATTRIBUTE_SPARSE_FILE) == 0)
smb2_set_sparse(xid, tcon, cfile, inode, false);
@@ -3294,7 +3335,7 @@ static loff_t smb3_llseek(struct file *file, struct cifs_tcon *tcon, loff_t offs
* some servers (Windows2016) will not reflect recent writes in
* QUERY_ALLOCATED_RANGES until SMB2_flush is called.
*/
- wrcfile = find_writable_file(cifsi, false);
+ wrcfile = find_writable_file(cifsi, FIND_WR_ANY);
if (wrcfile) {
filemap_write_and_wait(inode->i_mapping);
smb2_flush_file(xid, tcon, &wrcfile->fid);
@@ -3383,7 +3424,7 @@ static int smb3_fiemap(struct cifs_tcon *tcon,
if (rc)
goto out;
- if (out_data_len < sizeof(struct file_allocated_range_buffer)) {
+ if (out_data_len && out_data_len < sizeof(struct file_allocated_range_buffer)) {
rc = -EINVAL;
goto out;
}
@@ -4115,7 +4156,6 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid,
if (server->ops->is_session_expired &&
server->ops->is_session_expired(buf)) {
cifs_reconnect(server);
- wake_up(&server->response_q);
return -1;
}
@@ -4479,14 +4519,12 @@ smb3_receive_transform(struct TCP_Server_Info *server,
cifs_server_dbg(VFS, "Transform message is too small (%u)\n",
pdu_length);
cifs_reconnect(server);
- wake_up(&server->response_q);
return -ECONNABORTED;
}
if (pdu_length < orig_len + sizeof(struct smb2_transform_hdr)) {
cifs_server_dbg(VFS, "Transform message is broken\n");
cifs_reconnect(server);
- wake_up(&server->response_q);
return -ECONNABORTED;
}
@@ -4795,6 +4833,7 @@ struct smb_version_operations smb21_operations = {
.wp_retry_size = smb2_wp_retry_size,
.dir_needs_close = smb2_dir_needs_close,
.enum_snapshots = smb3_enum_snapshots,
+ .notify = smb3_notify,
.get_dfs_refer = smb2_get_dfs_refer,
.select_sectype = smb2_select_sectype,
#ifdef CONFIG_CIFS_XATTR
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index 1234f9ccab03..47d3e382ecaa 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -193,9 +193,18 @@ static int __smb2_reconnect(const struct nls_table *nlsc,
for (it = dfs_cache_get_tgt_iterator(&tl); it;
it = dfs_cache_get_next_tgt(&tl, it)) {
- const char *tgt = dfs_cache_get_tgt_name(it);
+ const char *share, *prefix;
+ size_t share_len, prefix_len;
- extract_unc_hostname(tgt, &dfs_host, &dfs_host_len);
+ rc = dfs_cache_get_tgt_share(it, &share, &share_len, &prefix,
+ &prefix_len);
+ if (rc) {
+ cifs_dbg(VFS, "%s: failed to parse target share %d\n",
+ __func__, rc);
+ continue;
+ }
+
+ extract_unc_hostname(share, &dfs_host, &dfs_host_len);
if (dfs_host_len != tcp_host_len
|| strncasecmp(dfs_host, tcp_host, dfs_host_len) != 0) {
@@ -206,11 +215,13 @@ static int __smb2_reconnect(const struct nls_table *nlsc,
continue;
}
- scnprintf(tree, MAX_TREE_SIZE, "\\%s", tgt);
+ scnprintf(tree, MAX_TREE_SIZE, "\\%.*s", (int)share_len, share);
rc = SMB2_tcon(0, tcon->ses, tree, tcon, nlsc);
- if (!rc)
+ if (!rc) {
+ rc = update_super_prepath(tcon, prefix, prefix_len);
break;
+ }
if (rc == -EREMOTE)
break;
}
@@ -378,7 +389,7 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon)
}
if (smb2_command != SMB2_INTERNAL_CMD)
- queue_delayed_work(cifsiod_wq, &server->reconnect, 0);
+ mod_delayed_work(cifsiod_wq, &server->reconnect, 0);
atomic_inc(&tconInfoReconnectCount);
out:
@@ -1940,20 +1951,46 @@ parse_query_id_ctxt(struct create_context *cc, struct smb2_file_all_info *buf)
}
+/*
+ * Decode a POSIX create response context into @posix.
+ *
+ * @posix is zeroed first. The context data holds three little-endian 32-bit
+ * values (nlink, reparse tag, mode) followed by the owner SID and the group
+ * SID. Parsing stops, leaving the remaining fields zeroed, as soon as the
+ * data is too short or a SID does not validate.
+ */
static void
-parse_posix_ctxt(struct create_context *cc, struct smb_posix_info *pposix_inf)
+parse_posix_ctxt(struct create_context *cc, struct smb2_file_all_info *info,
+		 struct create_posix_rsp *posix)
{
- /* struct smb_posix_info *ppinf = (struct smb_posix_info *)cc; */
+	int sid_len;
+	u8 *beg = (u8 *)cc + le16_to_cpu(cc->DataOffset);
+	u8 *end = beg + le32_to_cpu(cc->DataLength);
+	u8 *sid;
+
+	memset(posix, 0, sizeof(*posix));
+
+	/* nlink, reparse tag and mode occupy the first 12 bytes of the data */
+	if (le32_to_cpu(cc->DataLength) < 12) {
+		cifs_dbg(VFS, "posix create response context too short\n");
+		return;
+	}
+
+	posix->nlink = le32_to_cpu(*(__le32 *)(beg + 0));
+	posix->reparse_tag = le32_to_cpu(*(__le32 *)(beg + 4));
+	posix->mode = le32_to_cpu(*(__le32 *)(beg + 8));
- /* TODO: Need to add parsing for the context and return */
- printk_once(KERN_WARNING
- "SMB3 3.11 POSIX response context not completed yet\n");
+	sid = beg + 12;
+	sid_len = posix_info_sid_size(sid, end);
+	if (sid_len < 0) {
+		cifs_dbg(VFS, "bad owner sid in posix create response\n");
+		return;
+	}
+	memcpy(&posix->owner, sid, sid_len);
+
+	/* the group SID follows the owner SID directly */
+	sid = sid + sid_len;
+	sid_len = posix_info_sid_size(sid, end);
+	if (sid_len < 0) {
+		cifs_dbg(VFS, "bad group sid in posix create response\n");
+		return;
+	}
+	memcpy(&posix->group, sid, sid_len);
+
+	cifs_dbg(FYI, "nlink=%d mode=%o reparse_tag=%x\n",
+		 posix->nlink, posix->mode, posix->reparse_tag);
}
void
smb2_parse_contexts(struct TCP_Server_Info *server,
- struct smb2_create_rsp *rsp,
- unsigned int *epoch, char *lease_key, __u8 *oplock,
- struct smb2_file_all_info *buf)
+ struct smb2_create_rsp *rsp,
+ unsigned int *epoch, char *lease_key, __u8 *oplock,
+ struct smb2_file_all_info *buf,
+ struct create_posix_rsp *posix)
{
char *data_offset;
struct create_context *cc;
@@ -1983,8 +2020,9 @@ smb2_parse_contexts(struct TCP_Server_Info *server,
strncmp(name, SMB2_CREATE_QUERY_ON_DISK_ID, 4) == 0)
parse_query_id_ctxt(cc, buf);
else if ((le16_to_cpu(cc->NameLength) == 16)) {
- if (memcmp(name, smb3_create_tag_posix, 16) == 0)
- parse_posix_ctxt(cc, NULL);
+ if (posix &&
+ memcmp(name, smb3_create_tag_posix, 16) == 0)
+ parse_posix_ctxt(cc, buf, posix);
}
/* else {
cifs_dbg(FYI, "Context not matched with len %d\n",
@@ -2709,6 +2747,7 @@ SMB2_open_free(struct smb_rqst *rqst)
int
SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path,
__u8 *oplock, struct smb2_file_all_info *buf,
+ struct create_posix_rsp *posix,
struct kvec *err_iov, int *buftype)
{
struct smb_rqst rqst;
@@ -2771,6 +2810,7 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path,
atomic_inc(&tcon->num_remote_opens);
oparms->fid->persistent_fid = rsp->PersistentFileId;
oparms->fid->volatile_fid = rsp->VolatileFileId;
+ oparms->fid->access = oparms->desired_access;
#ifdef CONFIG_CIFS_DEBUG2
oparms->fid->mid = le64_to_cpu(rsp->sync_hdr.MessageId);
#endif /* CIFS_DEBUG2 */
@@ -2786,7 +2826,7 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path,
smb2_parse_contexts(server, rsp, &oparms->fid->epoch,
- oparms->fid->lease_key, oplock, buf);
+ oparms->fid->lease_key, oplock, buf, posix);
creat_exit:
SMB2_open_free(&rqst);
free_rsp_buf(resp_buftype, rsp);
@@ -3558,7 +3598,7 @@ SMB2_echo(struct TCP_Server_Info *server)
if (server->tcpStatus == CifsNeedNegotiate) {
/* No need to send echo on newly established connections */
- queue_delayed_work(cifsiod_wq, &server->reconnect, 0);
+ mod_delayed_work(cifsiod_wq, &server->reconnect, 0);
return rc;
}
@@ -4285,8 +4325,104 @@ SMB2_write(const unsigned int xid, struct cifs_io_parms *io_parms,
return rc;
}
+int posix_info_sid_size(const void *beg, const void *end)
+{
+ size_t subauth;
+ int total;
+
+ if (beg + 1 > end)
+ return -1;
+
+ subauth = *(u8 *)(beg+1);
+ if (subauth < 1 || subauth > 15)
+ return -1;
+
+ total = 1 + 1 + 6 + 4*subauth;
+ if (beg + total > end)
+ return -1;
+
+ return total;
+}
+
+int posix_info_parse(const void *beg, const void *end,
+ struct smb2_posix_info_parsed *out)
+
+{
+ int total_len = 0;
+ int sid_len;
+ int name_len;
+ const void *owner_sid;
+ const void *group_sid;
+ const void *name;
+
+ /* if no end bound given, assume payload to be correct */
+ if (!end) {
+ const struct smb2_posix_info *p = beg;
+
+ end = beg + le32_to_cpu(p->NextEntryOffset);
+ /* last element will have a 0 offset, pick a sensible bound */
+ if (end == beg)
+ end += 0xFFFF;
+ }
+
+ /* check base buf */
+ if (beg + sizeof(struct smb2_posix_info) > end)
+ return -1;
+ total_len = sizeof(struct smb2_posix_info);
+
+ /* check owner sid */
+ owner_sid = beg + total_len;
+ sid_len = posix_info_sid_size(owner_sid, end);
+ if (sid_len < 0)
+ return -1;
+ total_len += sid_len;
+
+ /* check group sid */
+ group_sid = beg + total_len;
+ sid_len = posix_info_sid_size(group_sid, end);
+ if (sid_len < 0)
+ return -1;
+ total_len += sid_len;
+
+ /* check name len */
+ if (beg + total_len + 4 > end)
+ return -1;
+ name_len = le32_to_cpu(*(__le32 *)(beg + total_len));
+ if (name_len < 1 || name_len > 0xFFFF)
+ return -1;
+ total_len += 4;
+
+ /* check name */
+ name = beg + total_len;
+ if (name + name_len > end)
+ return -1;
+ total_len += name_len;
+
+ if (out) {
+ out->base = beg;
+ out->size = total_len;
+ out->name_len = name_len;
+ out->name = name;
+ memcpy(&out->owner, owner_sid,
+ posix_info_sid_size(owner_sid, end));
+ memcpy(&out->group, group_sid,
+ posix_info_sid_size(group_sid, end));
+ }
+ return total_len;
+}
+
+static int posix_info_extra_size(const void *beg, const void *end)
+{
+ int len = posix_info_parse(beg, end, NULL);
+
+ if (len < 0)
+ return -1;
+ return len - sizeof(struct smb2_posix_info);
+}
+
static unsigned int
-num_entries(char *bufstart, char *end_of_buf, char **lastentry, size_t size)
+num_entries(int infotype, char *bufstart, char *end_of_buf, char **lastentry,
+ size_t size)
{
int len;
unsigned int entrycount = 0;
@@ -4310,8 +4446,13 @@ num_entries(char *bufstart, char *end_of_buf, char **lastentry, size_t size)
entryptr = entryptr + next_offset;
dir_info = (FILE_DIRECTORY_INFO *)entryptr;
- len = le32_to_cpu(dir_info->FileNameLength);
- if (entryptr + len < entryptr ||
+ if (infotype == SMB_FIND_FILE_POSIX_INFO)
+ len = posix_info_extra_size(entryptr, end_of_buf);
+ else
+ len = le32_to_cpu(dir_info->FileNameLength);
+
+ if (len < 0 ||
+ entryptr + len < entryptr ||
entryptr + len > end_of_buf ||
entryptr + len + size > end_of_buf) {
cifs_dbg(VFS, "directory entry name would overflow frame end of buf %p\n",
@@ -4361,6 +4502,9 @@ int SMB2_query_directory_init(const unsigned int xid,
case SMB_FIND_FILE_ID_FULL_DIR_INFO:
req->FileInformationClass = FILEID_FULL_DIRECTORY_INFORMATION;
break;
+ case SMB_FIND_FILE_POSIX_INFO:
+ req->FileInformationClass = SMB_FIND_FILE_POSIX_INFO;
+ break;
default:
cifs_tcon_dbg(VFS, "info level %u isn't supported\n",
info_level);
@@ -4426,6 +4570,10 @@ smb2_parse_query_directory(struct cifs_tcon *tcon,
case SMB_FIND_FILE_ID_FULL_DIR_INFO:
info_buf_size = sizeof(SEARCH_ID_FULL_DIR_INFO) - 1;
break;
+ case SMB_FIND_FILE_POSIX_INFO:
+ /* note that posix payload are variable size */
+ info_buf_size = sizeof(struct smb2_posix_info);
+ break;
default:
cifs_tcon_dbg(VFS, "info level %u isn't supported\n",
srch_inf->info_level);
@@ -4435,8 +4583,10 @@ smb2_parse_query_directory(struct cifs_tcon *tcon,
rc = smb2_validate_iov(le16_to_cpu(rsp->OutputBufferOffset),
le32_to_cpu(rsp->OutputBufferLength), rsp_iov,
info_buf_size);
- if (rc)
+ if (rc) {
+ cifs_tcon_dbg(VFS, "bad info payload");
return rc;
+ }
srch_inf->unicode = true;
@@ -4450,9 +4600,14 @@ smb2_parse_query_directory(struct cifs_tcon *tcon,
srch_inf->srch_entries_start = srch_inf->last_entry =
(char *)rsp + le16_to_cpu(rsp->OutputBufferOffset);
end_of_smb = rsp_iov->iov_len + (char *)rsp;
- srch_inf->entries_in_buffer =
- num_entries(srch_inf->srch_entries_start, end_of_smb,
- &srch_inf->last_entry, info_buf_size);
+
+ srch_inf->entries_in_buffer = num_entries(
+ srch_inf->info_level,
+ srch_inf->srch_entries_start,
+ end_of_smb,
+ &srch_inf->last_entry,
+ info_buf_size);
+
srch_inf->index_of_last_entry += srch_inf->entries_in_buffer;
cifs_dbg(FYI, "num entries %d last_index %lld srch start %p srch end %p\n",
srch_inf->entries_in_buffer, srch_inf->index_of_last_entry,
diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h
index fa03df130f1a..10acf90f858d 100644
--- a/fs/cifs/smb2pdu.h
+++ b/fs/cifs/smb2pdu.h
@@ -91,6 +91,7 @@
#define SMB2_PROTO_NUMBER cpu_to_le32(0x424d53fe)
#define SMB2_TRANSFORM_PROTO_NUM cpu_to_le32(0x424d53fd)
+#define SMB2_COMPRESSION_TRANSFORM_ID cpu_to_le32(0x424d53fc)
/*
* SMB2 Header Definition
@@ -119,6 +120,9 @@ struct smb2_sync_hdr {
__u8 Signature[16];
} __packed;
+/* The total header size for SMB2 read and write */
+#define SMB2_READWRITE_PDU_HEADER_SIZE (48 + sizeof(struct smb2_sync_hdr))
+
struct smb2_sync_pdu {
struct smb2_sync_hdr sync_hdr;
__le16 StructureSize2; /* size of wct area (varies, request specific) */
@@ -127,16 +131,33 @@ struct smb2_sync_pdu {
#define SMB3_AES128CCM_NONCE 11
#define SMB3_AES128GCM_NONCE 12
+/* Transform flags (for 3.0 dialect this flag indicates CCM */
+#define TRANSFORM_FLAG_ENCRYPTED 0x0001
struct smb2_transform_hdr {
__le32 ProtocolId; /* 0xFD 'S' 'M' 'B' */
__u8 Signature[16];
__u8 Nonce[16];
__le32 OriginalMessageSize;
__u16 Reserved1;
- __le16 Flags; /* EncryptionAlgorithm */
+ __le16 Flags; /* EncryptionAlgorithm for 3.0, enc enabled for 3.1.1 */
__u64 SessionId;
} __packed;
+/* See MS-SMB2 2.2.42.1 */
+struct compression_playload_header {
+ __le16 AlgorithmId;
+ __le16 Reserved;
+ __le32 Length;
+} __packed;
+
+/* See MS-SMB2 2.2.42.2 */
+struct compression_pattern_payload_v1 {
+ __le16 Pattern;
+ __le16 Reserved1;
+ __le16 Reserved2;
+ __le32 Repetitions;
+} __packed;
+
/*
* SMB2 flag definitions
*/
@@ -182,7 +203,7 @@ struct smb2_symlink_err_rsp {
__le16 PrintNameOffset;
__le16 PrintNameLength;
__le32 Flags;
- __u8 PathBuffer[0];
+ __u8 PathBuffer[];
} __packed;
/* SMB 3.1.1 and later dialects. See MS-SMB2 section 2.2.2.1 */
@@ -192,6 +213,10 @@ struct smb2_error_context_rsp {
__u8 ErrorContextData; /* ErrorDataLength long array */
} __packed;
+/* ErrorId values */
+#define SMB2_ERROR_ID_DEFAULT 0x00000000
+#define SMB2_ERROR_ID_SHARE_REDIRECT cpu_to_le32(0x72645253) /* "rdRS" */
+
/* Defines for Type field below (see MS-SMB2 2.2.2.2.2.1) */
#define MOVE_DST_IPADDR_V4 cpu_to_le32(0x00000001)
#define MOVE_DST_IPADDR_V6 cpu_to_le32(0x00000002)
@@ -210,7 +235,7 @@ struct share_redirect_error_context_rsp {
__le16 Flags;
__le16 TargetType;
__le32 IPAddrCount;
- struct move_dst_ipaddr IpAddrMoveList[0];
+ struct move_dst_ipaddr IpAddrMoveList[];
/* __u8 ResourceName[] */ /* Name of share as counted Unicode string */
} __packed;
@@ -307,11 +332,17 @@ struct smb2_encryption_neg_context {
#define SMB3_COMPRESS_LZNT1 cpu_to_le16(0x0001)
#define SMB3_COMPRESS_LZ77 cpu_to_le16(0x0002)
#define SMB3_COMPRESS_LZ77_HUFF cpu_to_le16(0x0003)
+/* Pattern scanning algorithm See MS-SMB2 3.1.4.4.1 */
+#define SMB3_COMPRESS_PATTERN cpu_to_le16(0x0004)
+
+/* Compression Flags */
+#define SMB2_COMPRESSION_CAPABILITIES_FLAG_NONE cpu_to_le32(0x00000000)
+#define SMB2_COMPRESSION_CAPABILITIES_FLAG_CHAINED cpu_to_le32(0x00000001)
struct smb2_compression_capabilities_context {
__le16 ContextType; /* 3 */
__le16 DataLength;
- __u32 Reserved;
+ __u32 Flags;
__le16 CompressionAlgorithmCount;
__u16 Padding;
__u32 Reserved1;
@@ -326,7 +357,7 @@ struct smb2_netname_neg_context {
__le16 ContextType; /* 0x100 */
__le16 DataLength;
__le32 Reserved;
- __le16 NetName[0]; /* hostname of target converted to UCS-2 */
+ __le16 NetName[]; /* hostname of target converted to UCS-2 */
} __packed;
#define POSIX_CTXT_DATA_LEN 16
@@ -406,7 +437,7 @@ struct smb2_logoff_rsp {
struct smb2_tree_connect_req {
struct smb2_sync_hdr sync_hdr;
__le16 StructureSize; /* Must be 9 */
- __le16 Reserved; /* Flags in SMB3.1.1 */
+ __le16 Flags; /* Reserved MBZ for dialects prior to SMB3.1.1 */
__le16 PathOffset;
__le16 PathLength;
__u8 Buffer[1]; /* variable length */
@@ -421,13 +452,13 @@ struct tree_connect_contexts {
__le16 ContextType;
__le16 DataLength;
__le32 Reserved;
- __u8 Data[0];
+ __u8 Data[];
} __packed;
/* Remoted identity tree connect context structures - see MS-SMB2 2.2.9.2.1 */
struct smb3_blob_data {
__le16 BlobSize;
- __u8 BlobData[0];
+ __u8 BlobData[];
} __packed;
/* Valid values for Attr */
@@ -477,14 +508,14 @@ struct remoted_identity_tcon_context {
__le16 DeviceGroups; /* offset to SID_ARRAY_DATA struct */
__le16 UserClaims; /* offset to BLOB_DATA struct */
__le16 DeviceClaims; /* offset to BLOB_DATA struct */
- __u8 TicketInfo[0]; /* variable length buf - remoted identity data */
+ __u8 TicketInfo[]; /* variable length buf - remoted identity data */
} __packed;
struct smb2_tree_connect_req_extension {
__le32 TreeConnectContextOffset;
__le16 TreeConnectContextCount;
__u8 Reserved[10];
- __u8 PathName[0]; /* variable sized array */
+ __u8 PathName[]; /* variable sized array */
/* followed by array of TreeConnectContexts */
} __packed;
@@ -633,7 +664,7 @@ struct smb2_tree_disconnect_rsp {
| FILE_WRITE_EA_LE | FILE_WRITE_ATTRIBUTES_LE)
#define FILE_EXEC_RIGHTS_LE (FILE_EXECUTE_LE)
-/* Impersonation Levels */
+/* Impersonation Levels. See MS-WPO section 9.7 and MSDN-IMPERS */
#define IL_ANONYMOUS cpu_to_le32(0x00000000)
#define IL_IDENTIFICATION cpu_to_le32(0x00000001)
#define IL_IMPERSONATION cpu_to_le32(0x00000002)
@@ -689,7 +720,7 @@ struct smb2_create_req {
__le16 NameLength;
__le32 CreateContextsOffset;
__le32 CreateContextsLength;
- __u8 Buffer[0];
+ __u8 Buffer[];
} __packed;
/*
@@ -727,7 +758,7 @@ struct create_context {
__le16 Reserved;
__le16 DataOffset;
__le32 DataLength;
- __u8 Buffer[0];
+ __u8 Buffer[];
} __packed;
#define SMB2_LEASE_READ_CACHING_HE 0x01
@@ -739,7 +770,7 @@ struct create_context {
#define SMB2_LEASE_HANDLE_CACHING cpu_to_le32(0x02)
#define SMB2_LEASE_WRITE_CACHING cpu_to_le32(0x04)
-#define SMB2_LEASE_FLAG_BREAK_IN_PROGRESS cpu_to_le32(0x02)
+#define SMB2_LEASE_FLAG_BREAK_IN_PROGRESS cpu_to_le32(0x00000002)
#define SMB2_LEASE_FLAG_PARENT_LEASE_KEY_SET cpu_to_le32(0x00000004)
#define SMB2_LEASE_KEY_SIZE 16
@@ -869,7 +900,7 @@ struct crt_sd_ctxt {
struct resume_key_req {
char ResumeKey[COPY_CHUNK_RES_KEY_SIZE];
__le32 ContextLength; /* MBZ */
- char Context[0]; /* ignored, Windows sets to 4 bytes of zero */
+ char Context[]; /* ignored, Windows sets to 4 bytes of zero */
} __packed;
/* this goes in the ioctl buffer when doing a copychunk request */
@@ -931,7 +962,7 @@ struct reparse_data_buffer {
__le32 ReparseTag;
__le16 ReparseDataLength;
__u16 Reserved;
- __u8 DataBuffer[0]; /* Variable Length */
+ __u8 DataBuffer[]; /* Variable Length */
} __packed;
struct reparse_guid_data_buffer {
@@ -939,7 +970,7 @@ struct reparse_guid_data_buffer {
__le16 ReparseDataLength;
__u16 Reserved;
__u8 ReparseGuid[16];
- __u8 DataBuffer[0]; /* Variable Length */
+ __u8 DataBuffer[]; /* Variable Length */
} __packed;
struct reparse_mount_point_data_buffer {
@@ -950,7 +981,7 @@ struct reparse_mount_point_data_buffer {
__le16 SubstituteNameLength;
__le16 PrintNameOffset;
__le16 PrintNameLength;
- __u8 PathBuffer[0]; /* Variable Length */
+ __u8 PathBuffer[]; /* Variable Length */
} __packed;
#define SYMLINK_FLAG_RELATIVE 0x00000001
@@ -964,7 +995,7 @@ struct reparse_symlink_data_buffer {
__le16 PrintNameOffset;
__le16 PrintNameLength;
__le32 Flags;
- __u8 PathBuffer[0]; /* Variable Length */
+ __u8 PathBuffer[]; /* Variable Length */
} __packed;
/* See MS-FSCC 2.1.2.6 and cifspdu.h for struct reparse_posix_data */
@@ -1066,7 +1097,7 @@ struct smb2_ioctl_req {
__le32 MaxOutputResponse;
__le32 Flags;
__u32 Reserved2;
- __u8 Buffer[0];
+ __u8 Buffer[];
} __packed;
struct smb2_ioctl_rsp {
@@ -1180,7 +1211,7 @@ struct smb2_write_req {
__le64 Offset;
__u64 PersistentFileId; /* opaque endianness */
__u64 VolatileFileId; /* opaque endianness */
- __le32 Channel; /* Reserved MBZ */
+ __le32 Channel; /* MBZ unless SMB3.02 or later */
__le32 RemainingBytes;
__le16 WriteChannelInfoOffset;
__le16 WriteChannelInfoLength;
@@ -1469,7 +1500,7 @@ struct smb3_fs_vol_info {
__le32 VolumeLabelLength; /* includes trailing null */
__u8 SupportsObjects; /* True if eg like NTFS, supports objects */
__u8 Reserved;
- __u8 VolumeLabel[0]; /* variable len */
+ __u8 VolumeLabel[]; /* variable len */
} __packed;
/* partial list of QUERY INFO levels */
@@ -1531,7 +1562,7 @@ struct smb2_file_rename_info { /* encoding of request for level 10 */
__u8 Reserved[7];
__u64 RootDirectory; /* MBZ for network operations (why says spec?) */
__le32 FileNameLength;
- char FileName[0]; /* New name to be assigned */
+ char FileName[]; /* New name to be assigned */
} __packed; /* level 10 Set */
struct smb2_file_link_info { /* encoding of request for level 11 */
@@ -1540,7 +1571,7 @@ struct smb2_file_link_info { /* encoding of request for level 11 */
__u8 Reserved[7];
__u64 RootDirectory; /* MBZ for network operations (why says spec?) */
__le32 FileNameLength;
- char FileName[0]; /* Name to be assigned to new link */
+ char FileName[]; /* Name to be assigned to new link */
} __packed; /* level 11 Set */
struct smb2_file_full_ea_info { /* encoding of response for level 15 */
@@ -1548,7 +1579,7 @@ struct smb2_file_full_ea_info { /* encoding of response for level 15 */
__u8 flags;
__u8 ea_name_length;
__le16 ea_value_length;
- char ea_data[0]; /* \0 terminated name plus value */
+ char ea_data[]; /* \0 terminated name plus value */
} __packed; /* level 15 Set */
/*
@@ -1604,11 +1635,56 @@ struct smb2_file_id_information {
extern char smb2_padding[7];
/* equivalent of the contents of SMB3.1.1 POSIX open context response */
-struct smb_posix_info {
- __le32 nlink;
- __le32 reparse_tag;
- __le32 mode;
- kuid_t uid;
- kuid_t gid;
+struct create_posix_rsp {
+ u32 nlink;
+ u32 reparse_tag;
+ u32 mode;
+ struct cifs_sid owner; /* var-sized on the wire */
+ struct cifs_sid group; /* var-sized on the wire */
+} __packed;
+
+/*
+ * SMB2-only POSIX info level
+ *
+ * See posix_info_sid_size(), posix_info_extra_size() and
+ * posix_info_parse() to help with the handling of this struct.
+ */
+struct smb2_posix_info {
+ __le32 NextEntryOffset;
+ __u32 Ignored;
+ __le64 CreationTime;
+ __le64 LastAccessTime;
+ __le64 LastWriteTime;
+ __le64 ChangeTime;
+ __le64 EndOfFile;
+ __le64 AllocationSize;
+ __le32 DosAttributes;
+ __le64 Inode;
+ __le32 DeviceId;
+ __le32 Zero;
+ /* beginning of POSIX Create Context Response */
+ __le32 HardLinks;
+ __le32 ReparseTag;
+ __le32 Mode;
+ /*
+ * var sized owner SID
+ * var sized group SID
+ * le32 filenamelength
+ * u8 filename[]
+ */
+} __packed;
+
+/*
+ * Parsed version of the above struct. Allows direct access to the
+ * variable length fields
+ */
+struct smb2_posix_info_parsed {
+ const struct smb2_posix_info *base;
+ size_t size;
+ struct cifs_sid owner;
+ struct cifs_sid group;
+ int name_len;
+ const u8 *name;
};
+
#endif /* _SMB2PDU_H */
diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h
index de6388ef344f..4d1ff7b66fdc 100644
--- a/fs/cifs/smb2proto.h
+++ b/fs/cifs/smb2proto.h
@@ -139,6 +139,7 @@ extern int SMB2_tdis(const unsigned int xid, struct cifs_tcon *tcon);
extern int SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms,
__le16 *path, __u8 *oplock,
struct smb2_file_all_info *buf,
+ struct create_posix_rsp *posix,
struct kvec *err_iov, int *resp_buftype);
extern int SMB2_open_init(struct cifs_tcon *tcon, struct smb_rqst *rqst,
__u8 *oplock, struct cifs_open_parms *oparms,
@@ -252,7 +253,8 @@ extern enum securityEnum smb2_select_sectype(struct TCP_Server_Info *,
extern void smb2_parse_contexts(struct TCP_Server_Info *server,
struct smb2_create_rsp *rsp,
unsigned int *epoch, char *lease_key,
- __u8 *oplock, struct smb2_file_all_info *buf);
+ __u8 *oplock, struct smb2_file_all_info *buf,
+ struct create_posix_rsp *posix);
extern int smb3_encryption_required(const struct cifs_tcon *tcon);
extern int smb2_validate_iov(unsigned int offset, unsigned int buffer_length,
struct kvec *iov, unsigned int min_buf_size);
@@ -272,4 +274,7 @@ extern int smb2_query_info_compound(const unsigned int xid,
u32 class, u32 type, u32 output_len,
struct kvec *rsp, int *buftype,
struct cifs_sb_info *cifs_sb);
+int posix_info_parse(const void *beg, const void *end,
+ struct smb2_posix_info_parsed *out);
+int posix_info_sid_size(const void *beg, const void *end);
#endif /* _SMB2PROTO_H */
diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c
index 08b703b7a15e..20cc79e5c15d 100644
--- a/fs/cifs/smb2transport.c
+++ b/fs/cifs/smb2transport.c
@@ -602,7 +602,7 @@ int
smb2_verify_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
{
unsigned int rc;
- char server_response_sig[16];
+ char server_response_sig[SMB2_SIGNATURE_SIZE];
struct smb2_sync_hdr *shdr =
(struct smb2_sync_hdr *)rqst->rq_iov[0].iov_base;
@@ -638,9 +638,11 @@ smb2_verify_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
if (rc)
return rc;
- if (memcmp(server_response_sig, shdr->Signature, SMB2_SIGNATURE_SIZE))
+ if (memcmp(server_response_sig, shdr->Signature, SMB2_SIGNATURE_SIZE)) {
+ dump_stack();
+ cifs_dbg(VFS, "sign fail cmd 0x%x message id 0x%llx\n", shdr->Command, shdr->MessageId);
return -EACCES;
- else
+ } else
return 0;
}
diff --git a/fs/cifs/smbdirect.c b/fs/cifs/smbdirect.c
index 5b1b97e9e0c9..8da43a500686 100644
--- a/fs/cifs/smbdirect.c
+++ b/fs/cifs/smbdirect.c
@@ -459,25 +459,6 @@ static void smbd_post_send_credits(struct work_struct *work)
check_and_send_immediate(info);
}
-static void smbd_recv_done_work(struct work_struct *work)
-{
- struct smbd_connection *info =
- container_of(work, struct smbd_connection, recv_done_work);
-
- /*
- * We may have new send credits granted from remote peer
- * If any sender is blcoked on lack of credets, unblock it
- */
- if (atomic_read(&info->send_credits))
- wake_up_interruptible(&info->wait_send_queue);
-
- /*
- * Check if we need to send something to remote peer to
- * grant more credits or respond to KEEP_ALIVE packet
- */
- check_and_send_immediate(info);
-}
-
/* Called from softirq, when recv is done */
static void recv_done(struct ib_cq *cq, struct ib_wc *wc)
{
@@ -546,8 +527,15 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc)
atomic_dec(&info->receive_credits);
info->receive_credit_target =
le16_to_cpu(data_transfer->credits_requested);
- atomic_add(le16_to_cpu(data_transfer->credits_granted),
- &info->send_credits);
+ if (le16_to_cpu(data_transfer->credits_granted)) {
+ atomic_add(le16_to_cpu(data_transfer->credits_granted),
+ &info->send_credits);
+ /*
+ * We have new send credits granted from remote peer
+ * If any sender is waiting for credits, unblock it
+ */
+ wake_up_interruptible(&info->wait_send_queue);
+ }
log_incoming(INFO, "data flags %d data_offset %d "
"data_length %d remaining_data_length %d\n",
@@ -563,7 +551,12 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc)
info->keep_alive_requested = KEEP_ALIVE_PENDING;
}
- queue_work(info->workqueue, &info->recv_done_work);
+ /*
+ * Check if we need to send something to remote peer to
+ * grant more credits or respond to KEEP_ALIVE packet
+ */
+ check_and_send_immediate(info);
+
return;
default:
@@ -1762,7 +1755,6 @@ static struct smbd_connection *_smbd_get_connection(
atomic_set(&info->send_payload_pending, 0);
INIT_WORK(&info->disconnect_work, smbd_disconnect_rdma_work);
- INIT_WORK(&info->recv_done_work, smbd_recv_done_work);
INIT_WORK(&info->post_send_credits_work, smbd_post_send_credits);
info->new_credits_offered = 0;
spin_lock_init(&info->lock_new_credits_offered);
@@ -2097,8 +2089,7 @@ int smbd_send(struct TCP_Server_Info *server,
for (i = 0; i < num_rqst; i++)
remaining_data_length += smb_rqst_len(server, &rqst_array[i]);
- if (remaining_data_length + sizeof(struct smbd_data_transfer) >
- info->max_fragmented_send_size) {
+ if (remaining_data_length > info->max_fragmented_send_size) {
log_write(ERR, "payload size %d > max size %d\n",
remaining_data_length, info->max_fragmented_send_size);
rc = -EINVAL;
diff --git a/fs/cifs/smbdirect.h b/fs/cifs/smbdirect.h
index 6ff880a1e186..8ede915f2b24 100644
--- a/fs/cifs/smbdirect.h
+++ b/fs/cifs/smbdirect.h
@@ -67,7 +67,6 @@ struct smbd_connection {
bool negotiate_done;
struct work_struct disconnect_work;
- struct work_struct recv_done_work;
struct work_struct post_send_credits_work;
spinlock_t lock_new_credits_offered;
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index cb3ee916f527..c97570eb2c18 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -466,7 +466,7 @@ smb_send_rqst(struct TCP_Server_Info *server, int num_rqst,
struct smb_rqst *rqst, int flags)
{
struct kvec iov;
- struct smb2_transform_hdr tr_hdr;
+ struct smb2_transform_hdr *tr_hdr;
struct smb_rqst cur_rqst[MAX_COMPOUND];
int rc;
@@ -476,28 +476,34 @@ smb_send_rqst(struct TCP_Server_Info *server, int num_rqst,
if (num_rqst > MAX_COMPOUND - 1)
return -ENOMEM;
- memset(&cur_rqst[0], 0, sizeof(cur_rqst));
- memset(&iov, 0, sizeof(iov));
- memset(&tr_hdr, 0, sizeof(tr_hdr));
-
- iov.iov_base = &tr_hdr;
- iov.iov_len = sizeof(tr_hdr);
- cur_rqst[0].rq_iov = &iov;
- cur_rqst[0].rq_nvec = 1;
-
if (!server->ops->init_transform_rq) {
cifs_server_dbg(VFS, "Encryption requested but transform "
"callback is missing\n");
return -EIO;
}
+ tr_hdr = kmalloc(sizeof(*tr_hdr), GFP_NOFS);
+ if (!tr_hdr)
+ return -ENOMEM;
+
+ memset(&cur_rqst[0], 0, sizeof(cur_rqst));
+ memset(&iov, 0, sizeof(iov));
+ memset(tr_hdr, 0, sizeof(*tr_hdr));
+
+ iov.iov_base = tr_hdr;
+ iov.iov_len = sizeof(*tr_hdr);
+ cur_rqst[0].rq_iov = &iov;
+ cur_rqst[0].rq_nvec = 1;
+
rc = server->ops->init_transform_rq(server, num_rqst + 1,
&cur_rqst[0], rqst);
if (rc)
- return rc;
+ goto out;
rc = __smb_send_rqst(server, num_rqst + 1, &cur_rqst[0]);
smb3_free_compound_rqst(num_rqst, &cur_rqst[1]);
+out:
+ kfree(tr_hdr);
return rc;
}
diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h
index 9aae851409e5..dbced2937ec8 100644
--- a/fs/crypto/fscrypt_private.h
+++ b/fs/crypto/fscrypt_private.h
@@ -76,6 +76,26 @@ static inline int fscrypt_context_size(const union fscrypt_context *ctx)
return 0;
}
+/* Check whether an fscrypt_context has a recognized version number and size */
+static inline bool fscrypt_context_is_valid(const union fscrypt_context *ctx,
+ int ctx_size)
+{
+ return ctx_size >= 1 && ctx_size == fscrypt_context_size(ctx);
+}
+
+/* Retrieve the context's nonce, assuming the context was already validated */
+static inline const u8 *fscrypt_context_nonce(const union fscrypt_context *ctx)
+{
+ switch (ctx->version) {
+ case FSCRYPT_CONTEXT_V1:
+ return ctx->v1.nonce;
+ case FSCRYPT_CONTEXT_V2:
+ return ctx->v2.nonce;
+ }
+ WARN_ON(1);
+ return NULL;
+}
+
#undef fscrypt_policy
union fscrypt_policy {
u8 version;
diff --git a/fs/crypto/keysetup.c b/fs/crypto/keysetup.c
index 65cb09fa6ead..302375e9f719 100644
--- a/fs/crypto/keysetup.c
+++ b/fs/crypto/keysetup.c
@@ -425,20 +425,8 @@ int fscrypt_get_encryption_info(struct inode *inode)
goto out;
}
- switch (ctx.version) {
- case FSCRYPT_CONTEXT_V1:
- memcpy(crypt_info->ci_nonce, ctx.v1.nonce,
- FS_KEY_DERIVATION_NONCE_SIZE);
- break;
- case FSCRYPT_CONTEXT_V2:
- memcpy(crypt_info->ci_nonce, ctx.v2.nonce,
- FS_KEY_DERIVATION_NONCE_SIZE);
- break;
- default:
- WARN_ON(1);
- res = -EINVAL;
- goto out;
- }
+ memcpy(crypt_info->ci_nonce, fscrypt_context_nonce(&ctx),
+ FS_KEY_DERIVATION_NONCE_SIZE);
if (!fscrypt_supported_policy(&crypt_info->ci_policy, inode)) {
res = -EINVAL;
@@ -539,6 +527,15 @@ int fscrypt_drop_inode(struct inode *inode)
mk = ci->ci_master_key->payload.data[0];
/*
+ * With proper, non-racy use of FS_IOC_REMOVE_ENCRYPTION_KEY, all inodes
+ * protected by the key were cleaned by sync_filesystem(). But if
+ * userspace is still using the files, inodes can be dirtied between
+ * then and now. We mustn't lose any writes, so skip dirty inodes here.
+ */
+ if (inode->i_state & I_DIRTY_ALL)
+ return 0;
+
+ /*
* Note: since we aren't holding ->mk_secret_sem, the result here can
* immediately become outdated. But there's no correctness problem with
* unnecessarily evicting. Nor is there a correctness problem with not
diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c
index cf2a9d26ef7d..10ccf945020c 100644
--- a/fs/crypto/policy.c
+++ b/fs/crypto/policy.c
@@ -258,7 +258,7 @@ int fscrypt_policy_from_context(union fscrypt_policy *policy_u,
{
memset(policy_u, 0, sizeof(*policy_u));
- if (ctx_size <= 0 || ctx_size != fscrypt_context_size(ctx_u))
+ if (!fscrypt_context_is_valid(ctx_u, ctx_size))
return -EINVAL;
switch (ctx_u->version) {
@@ -481,6 +481,25 @@ int fscrypt_ioctl_get_policy_ex(struct file *filp, void __user *uarg)
}
EXPORT_SYMBOL_GPL(fscrypt_ioctl_get_policy_ex);
+/* FS_IOC_GET_ENCRYPTION_NONCE: retrieve file's encryption nonce for testing */
+int fscrypt_ioctl_get_nonce(struct file *filp, void __user *arg)
+{
+ struct inode *inode = file_inode(filp);
+ union fscrypt_context ctx;
+ int ret;
+
+ ret = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx));
+ if (ret < 0)
+ return ret;
+ if (!fscrypt_context_is_valid(&ctx, ret))
+ return -EINVAL;
+ if (copy_to_user(arg, fscrypt_context_nonce(&ctx),
+ FS_KEY_DERIVATION_NONCE_SIZE))
+ return -EFAULT;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(fscrypt_ioctl_get_nonce);
+
/**
* fscrypt_has_permitted_context() - is a file's encryption policy permitted
* within its directory?
diff --git a/fs/dax.c b/fs/dax.c
index 1f1f0201cad1..35da144375a0 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -937,12 +937,11 @@ static int dax_writeback_one(struct xa_state *xas, struct dax_device *dax_dev,
* on persistent storage prior to completion of the operation.
*/
int dax_writeback_mapping_range(struct address_space *mapping,
- struct block_device *bdev, struct writeback_control *wbc)
+ struct dax_device *dax_dev, struct writeback_control *wbc)
{
XA_STATE(xas, &mapping->i_pages, wbc->range_start >> PAGE_SHIFT);
struct inode *inode = mapping->host;
pgoff_t end_index = wbc->range_end >> PAGE_SHIFT;
- struct dax_device *dax_dev;
void *entry;
int ret = 0;
unsigned int scanned = 0;
@@ -953,10 +952,6 @@ int dax_writeback_mapping_range(struct address_space *mapping,
if (!mapping->nrexceptional || wbc->sync_mode != WB_SYNC_ALL)
return 0;
- dax_dev = dax_get_by_host(bdev->bd_disk->disk_name);
- if (!dax_dev)
- return -EIO;
-
trace_dax_writeback_range(inode, xas.xa_index, end_index);
tag_pages_for_writeback(mapping, xas.xa_index, end_index);
@@ -977,7 +972,6 @@ int dax_writeback_mapping_range(struct address_space *mapping,
xas_lock_irq(&xas);
}
xas_unlock_irq(&xas);
- put_dax(dax_dev);
trace_dax_writeback_range_done(inode, xas.xa_index, end_index);
return ret;
}
@@ -1207,6 +1201,9 @@ dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
lockdep_assert_held(&inode->i_rwsem);
}
+ if (iocb->ki_flags & IOCB_NOWAIT)
+ flags |= IOMAP_NOWAIT;
+
while (iov_iter_count(iter)) {
ret = iomap_apply(inode, pos, iov_iter_count(iter), flags, ops,
iter, dax_iomap_actor);
diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index 634b09d18b77..2d357680094c 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -18,6 +18,7 @@
#include <linux/slab.h>
#include <linux/atomic.h>
#include <linux/device.h>
+#include <linux/pm_runtime.h>
#include <linux/poll.h>
#include <linux/security.h>
@@ -175,8 +176,13 @@ static int open_proxy_open(struct inode *inode, struct file *filp)
if (r)
goto out;
- real_fops = fops_get(real_fops);
- if (!real_fops) {
+ if (!fops_get(real_fops)) {
+#ifdef MODULE
+ if (real_fops->owner &&
+ real_fops->owner->state == MODULE_STATE_GOING)
+ goto out;
+#endif
+
/* Huh? Module did not clean up after itself at exit? */
WARN(1, "debugfs file owner did not clean up at exit: %pd",
dentry);
@@ -305,8 +311,13 @@ static int full_proxy_open(struct inode *inode, struct file *filp)
if (r)
goto out;
- real_fops = fops_get(real_fops);
- if (!real_fops) {
+ if (!fops_get(real_fops)) {
+#ifdef MODULE
+ if (real_fops->owner &&
+ real_fops->owner->state == MODULE_STATE_GOING)
+ goto out;
+#endif
+
/* Huh? Module did not cleanup after itself at exit? */
WARN(1, "debugfs file owner did not clean up at exit: %pd",
dentry);
@@ -1060,7 +1071,14 @@ static int debugfs_show_regset32(struct seq_file *s, void *data)
{
struct debugfs_regset32 *regset = s->private;
+ if (regset->dev)
+ pm_runtime_get_sync(regset->dev);
+
debugfs_print_regs32(s, regset->regs, regset->nregs, regset->base, "");
+
+ if (regset->dev)
+ pm_runtime_put(regset->dev);
+
return 0;
}
@@ -1090,21 +1108,12 @@ static const struct file_operations fops_regset32 = {
* This function creates a file in debugfs with the given name that reports
* the names and values of a set of 32-bit registers. If the @mode variable
* is so set it can be read from. Writing is not supported.
- *
- * This function will return a pointer to a dentry if it succeeds. This
- * pointer must be passed to the debugfs_remove() function when the file is
- * to be removed (no automatic cleanup happens if your module is unloaded,
- * you are responsible here.) If an error occurs, ERR_PTR(-ERROR) will be
- * returned.
- *
- * If debugfs is not enabled in the kernel, the value ERR_PTR(-ENODEV) will
- * be returned.
*/
-struct dentry *debugfs_create_regset32(const char *name, umode_t mode,
- struct dentry *parent,
- struct debugfs_regset32 *regset)
+void debugfs_create_regset32(const char *name, umode_t mode,
+ struct dentry *parent,
+ struct debugfs_regset32 *regset)
{
- return debugfs_create_file(name, mode, parent, regset, &fops_regset32);
+ debugfs_create_file(name, mode, parent, regset, &fops_regset32);
}
EXPORT_SYMBOL_GPL(debugfs_create_regset32);
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index e742dfc66933..b7f2e971ecbc 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -501,26 +501,16 @@ EXPORT_SYMBOL_GPL(debugfs_create_file_unsafe);
* wide range of flexibility in creating a file, or a directory (if you want
* to create a directory, the debugfs_create_dir() function is
* recommended to be used instead.)
- *
- * This function will return a pointer to a dentry if it succeeds. This
- * pointer must be passed to the debugfs_remove() function when the file is
- * to be removed (no automatic cleanup happens if your module is unloaded,
- * you are responsible here.) If an error occurs, ERR_PTR(-ERROR) will be
- * returned.
- *
- * If debugfs is not enabled in the kernel, the value -%ENODEV will be
- * returned.
*/
-struct dentry *debugfs_create_file_size(const char *name, umode_t mode,
- struct dentry *parent, void *data,
- const struct file_operations *fops,
- loff_t file_size)
+void debugfs_create_file_size(const char *name, umode_t mode,
+ struct dentry *parent, void *data,
+ const struct file_operations *fops,
+ loff_t file_size)
{
struct dentry *de = debugfs_create_file(name, mode, parent, data, fops);
if (de)
d_inode(de)->i_size = file_size;
- return de;
}
EXPORT_SYMBOL_GPL(debugfs_create_file_size);
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index db1ef144c63a..2c449aed1b92 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -311,8 +311,10 @@ static int crypt_scatterlist(struct ecryptfs_crypt_stat *crypt_stat,
struct extent_crypt_result ecr;
int rc = 0;
- BUG_ON(!crypt_stat || !crypt_stat->tfm
- || !(crypt_stat->flags & ECRYPTFS_STRUCT_INITIALIZED));
+ if (!crypt_stat || !crypt_stat->tfm
+ || !(crypt_stat->flags & ECRYPTFS_STRUCT_INITIALIZED))
+ return -EINVAL;
+
if (unlikely(ecryptfs_verbosity > 0)) {
ecryptfs_printk(KERN_DEBUG, "Key size [%zd]; key:\n",
crypt_stat->key_size);
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index 1c1a56be7ea2..e6ac78c62ca4 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -8,7 +8,7 @@
* Copyright (C) 2004-2008 International Business Machines Corp.
* Author(s): Michael A. Halcrow <mahalcro@us.ibm.com>
* Trevor S. Highland <trevor.highland@gmail.com>
- * Tyler Hicks <tyhicks@ou.edu>
+ * Tyler Hicks <code@tyhicks.com>
*/
#ifndef ECRYPTFS_KERNEL_H
diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
index 7d326aa0308e..af3eb02bbca1 100644
--- a/fs/ecryptfs/keystore.c
+++ b/fs/ecryptfs/keystore.c
@@ -1304,7 +1304,7 @@ parse_tag_1_packet(struct ecryptfs_crypt_stat *crypt_stat,
printk(KERN_WARNING "Tag 1 packet contains key larger "
"than ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES\n");
rc = -EINVAL;
- goto out;
+ goto out_free;
}
memcpy((*new_auth_tok)->session_key.encrypted_key,
&data[(*packet_size)], (body_size - (ECRYPTFS_SIG_SIZE + 2)));
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index b8a7ce379ffe..e63259fdef28 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -7,7 +7,7 @@
* Copyright (C) 2004-2007 International Business Machines Corp.
* Author(s): Michael A. Halcrow <mahalcro@us.ibm.com>
* Michael C. Thompson <mcthomps@us.ibm.com>
- * Tyler Hicks <tyhicks@ou.edu>
+ * Tyler Hicks <code@tyhicks.com>
*/
#include <linux/dcache.h>
diff --git a/fs/ecryptfs/messaging.c b/fs/ecryptfs/messaging.c
index d668e60b85b5..8646ba76def3 100644
--- a/fs/ecryptfs/messaging.c
+++ b/fs/ecryptfs/messaging.c
@@ -4,7 +4,7 @@
*
* Copyright (C) 2004-2008 International Business Machines Corp.
* Author(s): Michael A. Halcrow <mhalcrow@us.ibm.com>
- * Tyler Hicks <tyhicks@ou.edu>
+ * Tyler Hicks <code@tyhicks.com>
*/
#include <linux/sched.h>
#include <linux/slab.h>
@@ -379,6 +379,7 @@ int __init ecryptfs_init_messaging(void)
* ecryptfs_message_buf_len),
GFP_KERNEL);
if (!ecryptfs_msg_ctx_arr) {
+ kfree(ecryptfs_daemon_hash);
rc = -ENOMEM;
goto out;
}
diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c
index fa4f6447ddad..12c66f5d92dd 100644
--- a/fs/efivarfs/super.c
+++ b/fs/efivarfs/super.c
@@ -252,7 +252,7 @@ static struct file_system_type efivarfs_type = {
static __init int efivarfs_init(void)
{
- if (!efi_enabled(EFI_RUNTIME_SERVICES))
+ if (!efi_rt_services_supported(EFI_RT_SUPPORTED_VARIABLE_SERVICES))
return -ENODEV;
if (!efivars_kobject())
diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c
index 5779a15c2cd6..5d2d81940679 100644
--- a/fs/erofs/decompressor.c
+++ b/fs/erofs/decompressor.c
@@ -157,17 +157,27 @@ static int z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq, u8 *out)
}
}
- ret = LZ4_decompress_safe_partial(src + inputmargin, out,
- inlen, rq->outputsize,
- rq->outputsize);
- if (ret < 0) {
- erofs_err(rq->sb, "failed to decompress, in[%u, %u] out[%u]",
- inlen, inputmargin, rq->outputsize);
+ /* legacy format could compress extra data in a pcluster. */
+ if (rq->partial_decoding || !support_0padding)
+ ret = LZ4_decompress_safe_partial(src + inputmargin, out,
+ inlen, rq->outputsize,
+ rq->outputsize);
+ else
+ ret = LZ4_decompress_safe(src + inputmargin, out,
+ inlen, rq->outputsize);
+
+ if (ret != rq->outputsize) {
+ erofs_err(rq->sb, "failed to decompress %d in[%u, %u] out[%u]",
+ ret, inlen, inputmargin, rq->outputsize);
+
WARN_ON(1);
print_hex_dump(KERN_DEBUG, "[ in]: ", DUMP_PREFIX_OFFSET,
16, 1, src + inputmargin, inlen, true);
print_hex_dump(KERN_DEBUG, "[out]: ", DUMP_PREFIX_OFFSET,
16, 1, out, rq->outputsize, true);
+
+ if (ret >= 0)
+ memset(out + ret, 0, rq->outputsize - ret);
ret = -EIO;
}
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index c4c6dcdc89ad..5eead7fdc7a6 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -52,8 +52,8 @@ struct erofs_sb_info {
struct list_head list;
struct mutex umount_mutex;
- /* the dedicated workstation for compression */
- struct radix_tree_root workstn_tree;
+ /* managed XArray arranged in physical block number */
+ struct xarray managed_pslots;
/* threshold for decompression synchronously */
unsigned int max_sync_decompress_pages;
@@ -402,8 +402,8 @@ static inline void *erofs_get_pcpubuf(unsigned int pagenr)
int erofs_workgroup_put(struct erofs_workgroup *grp);
struct erofs_workgroup *erofs_find_workgroup(struct super_block *sb,
pgoff_t index);
-int erofs_register_workgroup(struct super_block *sb,
- struct erofs_workgroup *grp);
+struct erofs_workgroup *erofs_insert_workgroup(struct super_block *sb,
+ struct erofs_workgroup *grp);
void erofs_workgroup_free_rcu(struct erofs_workgroup *grp);
void erofs_shrinker_register(struct super_block *sb);
void erofs_shrinker_unregister(struct super_block *sb);
diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index 057e6d7b5b7f..b514c67e5fc2 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -425,7 +425,7 @@ static int erofs_fill_super(struct super_block *sb, void *data, int silent)
sb->s_flags &= ~SB_POSIXACL;
#ifdef CONFIG_EROFS_FS_ZIP
- INIT_RADIX_TREE(&sbi->workstn_tree, GFP_ATOMIC);
+ xa_init(&sbi->managed_pslots);
#endif
/* get the root inode */
diff --git a/fs/erofs/utils.c b/fs/erofs/utils.c
index fddc5059c930..52d0be10f1aa 100644
--- a/fs/erofs/utils.c
+++ b/fs/erofs/utils.c
@@ -37,9 +37,6 @@ void *erofs_get_pcpubuf(unsigned int pagenr)
/* global shrink count (for all mounted EROFS instances) */
static atomic_long_t erofs_global_shrink_cnt;
-#define __erofs_workgroup_get(grp) atomic_inc(&(grp)->refcount)
-#define __erofs_workgroup_put(grp) atomic_dec(&(grp)->refcount)
-
static int erofs_workgroup_get(struct erofs_workgroup *grp)
{
int o;
@@ -66,7 +63,7 @@ struct erofs_workgroup *erofs_find_workgroup(struct super_block *sb,
repeat:
rcu_read_lock();
- grp = radix_tree_lookup(&sbi->workstn_tree, index);
+ grp = xa_load(&sbi->managed_pslots, index);
if (grp) {
if (erofs_workgroup_get(grp)) {
/* prefer to relax rcu read side */
@@ -80,43 +77,37 @@ repeat:
return grp;
}
-int erofs_register_workgroup(struct super_block *sb,
- struct erofs_workgroup *grp)
+struct erofs_workgroup *erofs_insert_workgroup(struct super_block *sb,
+ struct erofs_workgroup *grp)
{
- struct erofs_sb_info *sbi;
- int err;
-
- /* grp shouldn't be broken or used before */
- if (atomic_read(&grp->refcount) != 1) {
- DBG_BUGON(1);
- return -EINVAL;
- }
-
- err = radix_tree_preload(GFP_NOFS);
- if (err)
- return err;
-
- sbi = EROFS_SB(sb);
- xa_lock(&sbi->workstn_tree);
+ struct erofs_sb_info *const sbi = EROFS_SB(sb);
+ struct erofs_workgroup *pre;
/*
- * Bump up reference count before making this workgroup
- * visible to other users in order to avoid potential UAF
- * without serialized by workstn_lock.
+ * Bump up a reference count before making this visible
+ * to others for the XArray in order to avoid potential
+ * UAF without serialized by xa_lock.
*/
- __erofs_workgroup_get(grp);
-
- err = radix_tree_insert(&sbi->workstn_tree, grp->index, grp);
- if (err)
- /*
- * it's safe to decrease since the workgroup isn't visible
- * and refcount >= 2 (cannot be freezed).
- */
- __erofs_workgroup_put(grp);
+ atomic_inc(&grp->refcount);
- xa_unlock(&sbi->workstn_tree);
- radix_tree_preload_end();
- return err;
+repeat:
+ xa_lock(&sbi->managed_pslots);
+ pre = __xa_cmpxchg(&sbi->managed_pslots, grp->index,
+ NULL, grp, GFP_NOFS);
+ if (pre) {
+ if (xa_is_err(pre)) {
+ pre = ERR_PTR(xa_err(pre));
+ } else if (erofs_workgroup_get(pre)) {
+ /* try to legitimize the current in-tree one */
+ xa_unlock(&sbi->managed_pslots);
+ cond_resched();
+ goto repeat;
+ }
+ atomic_dec(&grp->refcount);
+ grp = pre;
+ }
+ xa_unlock(&sbi->managed_pslots);
+ return grp;
}
static void __erofs_workgroup_free(struct erofs_workgroup *grp)
@@ -155,7 +146,7 @@ static bool erofs_try_to_release_workgroup(struct erofs_sb_info *sbi,
/*
* Note that all cached pages should be unattached
- * before deleted from the radix tree. Otherwise some
+ * before deleted from the XArray. Otherwise some
* cached pages could be still attached to the orphan
* old workgroup when the new one is available in the tree.
*/
@@ -169,7 +160,7 @@ static bool erofs_try_to_release_workgroup(struct erofs_sb_info *sbi,
* however in order to avoid some race conditions, add a
* DBG_BUGON to observe this in advance.
*/
- DBG_BUGON(radix_tree_delete(&sbi->workstn_tree, grp->index) != grp);
+ DBG_BUGON(xa_erase(&sbi->managed_pslots, grp->index) != grp);
/*
* If managed cache is on, last refcount should indicate
@@ -182,22 +173,11 @@ static bool erofs_try_to_release_workgroup(struct erofs_sb_info *sbi,
static unsigned long erofs_shrink_workstation(struct erofs_sb_info *sbi,
unsigned long nr_shrink)
{
- pgoff_t first_index = 0;
- void *batch[PAGEVEC_SIZE];
+ struct erofs_workgroup *grp;
unsigned int freed = 0;
+ unsigned long index;
- int i, found;
-repeat:
- xa_lock(&sbi->workstn_tree);
-
- found = radix_tree_gang_lookup(&sbi->workstn_tree,
- batch, first_index, PAGEVEC_SIZE);
-
- for (i = 0; i < found; ++i) {
- struct erofs_workgroup *grp = batch[i];
-
- first_index = grp->index + 1;
-
+ xa_for_each(&sbi->managed_pslots, index, grp) {
/* try to shrink each valid workgroup */
if (!erofs_try_to_release_workgroup(sbi, grp))
continue;
@@ -206,10 +186,6 @@ repeat:
if (!--nr_shrink)
break;
}
- xa_unlock(&sbi->workstn_tree);
-
- if (i && nr_shrink)
- goto repeat;
return freed;
}
@@ -286,7 +262,7 @@ static unsigned long erofs_shrink_scan(struct shrinker *shrink,
spin_unlock(&erofs_sb_list_lock);
sbi->shrinker_run_no = run_no;
- freed += erofs_shrink_workstation(sbi, nr);
+ freed += erofs_shrink_workstation(sbi, nr - freed);
spin_lock(&erofs_sb_list_lock);
/* Get the next list element before we move this one */
diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
index 80e47f07d946..c4b6c9aa87ec 100644
--- a/fs/erofs/zdata.c
+++ b/fs/erofs/zdata.c
@@ -67,16 +67,6 @@ static void z_erofs_pcluster_init_once(void *ptr)
pcl->compressed_pages[i] = NULL;
}
-static void z_erofs_pcluster_init_always(struct z_erofs_pcluster *pcl)
-{
- struct z_erofs_collection *cl = z_erofs_primarycollection(pcl);
-
- atomic_set(&pcl->obj.refcount, 1);
-
- DBG_BUGON(cl->nr_pages);
- DBG_BUGON(cl->vcnt);
-}
-
int __init z_erofs_init_zip_subsystem(void)
{
pcluster_cachep = kmem_cache_create("erofs_compress",
@@ -341,26 +331,19 @@ static int z_erofs_lookup_collection(struct z_erofs_collector *clt,
struct inode *inode,
struct erofs_map_blocks *map)
{
- struct erofs_workgroup *grp;
- struct z_erofs_pcluster *pcl;
+ struct z_erofs_pcluster *pcl = clt->pcl;
struct z_erofs_collection *cl;
unsigned int length;
- grp = erofs_find_workgroup(inode->i_sb, map->m_pa >> PAGE_SHIFT);
- if (!grp)
- return -ENOENT;
-
- pcl = container_of(grp, struct z_erofs_pcluster, obj);
+ /* to avoid unexpected loop formed by corrupted images */
if (clt->owned_head == &pcl->next || pcl == clt->tailpcl) {
DBG_BUGON(1);
- erofs_workgroup_put(grp);
return -EFSCORRUPTED;
}
cl = z_erofs_primarycollection(pcl);
if (cl->pageofs != (map->m_la & ~PAGE_MASK)) {
DBG_BUGON(1);
- erofs_workgroup_put(grp);
return -EFSCORRUPTED;
}
@@ -368,7 +351,6 @@ static int z_erofs_lookup_collection(struct z_erofs_collector *clt,
if (length & Z_EROFS_PCLUSTER_FULL_LENGTH) {
if ((map->m_llen << Z_EROFS_PCLUSTER_LENGTH_BIT) > length) {
DBG_BUGON(1);
- erofs_workgroup_put(grp);
return -EFSCORRUPTED;
}
} else {
@@ -391,7 +373,6 @@ static int z_erofs_lookup_collection(struct z_erofs_collector *clt,
/* clean tailpcl if the current owned_head is Z_EROFS_PCLUSTER_TAIL */
if (clt->owned_head == Z_EROFS_PCLUSTER_TAIL)
clt->tailpcl = NULL;
- clt->pcl = pcl;
clt->cl = cl;
return 0;
}
@@ -402,6 +383,7 @@ static int z_erofs_register_collection(struct z_erofs_collector *clt,
{
struct z_erofs_pcluster *pcl;
struct z_erofs_collection *cl;
+ struct erofs_workgroup *grp;
int err;
/* no available workgroup, let's allocate one */
@@ -409,7 +391,7 @@ static int z_erofs_register_collection(struct z_erofs_collector *clt,
if (!pcl)
return -ENOMEM;
- z_erofs_pcluster_init_always(pcl);
+ atomic_set(&pcl->obj.refcount, 1);
pcl->obj.index = map->m_pa >> PAGE_SHIFT;
pcl->length = (map->m_llen << Z_EROFS_PCLUSTER_LENGTH_BIT) |
@@ -429,19 +411,29 @@ static int z_erofs_register_collection(struct z_erofs_collector *clt,
clt->mode = COLLECT_PRIMARY_FOLLOWED;
cl = z_erofs_primarycollection(pcl);
+
+ /* must be cleaned before freeing to slab */
+ DBG_BUGON(cl->nr_pages);
+ DBG_BUGON(cl->vcnt);
+
cl->pageofs = map->m_la & ~PAGE_MASK;
/*
* lock all primary followed works before visible to others
* and mutex_trylock *never* fails for a new pcluster.
*/
- mutex_trylock(&cl->lock);
+ DBG_BUGON(!mutex_trylock(&cl->lock));
- err = erofs_register_workgroup(inode->i_sb, &pcl->obj);
- if (err) {
- mutex_unlock(&cl->lock);
- kmem_cache_free(pcluster_cachep, pcl);
- return -EAGAIN;
+ grp = erofs_insert_workgroup(inode->i_sb, &pcl->obj);
+ if (IS_ERR(grp)) {
+ err = PTR_ERR(grp);
+ goto err_out;
+ }
+
+ if (grp != &pcl->obj) {
+ clt->pcl = container_of(grp, struct z_erofs_pcluster, obj);
+ err = -EEXIST;
+ goto err_out;
}
/* used to check tail merging loop due to corrupted images */
if (clt->owned_head == Z_EROFS_PCLUSTER_TAIL)
@@ -450,12 +442,18 @@ static int z_erofs_register_collection(struct z_erofs_collector *clt,
clt->pcl = pcl;
clt->cl = cl;
return 0;
+
+err_out:
+ mutex_unlock(&cl->lock);
+ kmem_cache_free(pcluster_cachep, pcl);
+ return err;
}
static int z_erofs_collector_begin(struct z_erofs_collector *clt,
struct inode *inode,
struct erofs_map_blocks *map)
{
+ struct erofs_workgroup *grp;
int ret;
DBG_BUGON(clt->cl);
@@ -469,21 +467,25 @@ static int z_erofs_collector_begin(struct z_erofs_collector *clt,
return -EINVAL;
}
-repeat:
- ret = z_erofs_lookup_collection(clt, inode, map);
- if (ret == -ENOENT) {
+ grp = erofs_find_workgroup(inode->i_sb, map->m_pa >> PAGE_SHIFT);
+ if (grp) {
+ clt->pcl = container_of(grp, struct z_erofs_pcluster, obj);
+ } else {
ret = z_erofs_register_collection(clt, inode, map);
- /* someone registered at the same time, give another try */
- if (ret == -EAGAIN) {
- cond_resched();
- goto repeat;
- }
+ if (!ret)
+ goto out;
+ if (ret != -EEXIST)
+ return ret;
}
- if (ret)
+ ret = z_erofs_lookup_collection(clt, inode, map);
+ if (ret) {
+ erofs_workgroup_put(&clt->pcl->obj);
return ret;
+ }
+out:
z_erofs_pagevec_ctor_init(&clt->vector, Z_EROFS_NR_INLINE_PAGEVECS,
clt->cl->pagevec, clt->cl->vcnt);
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index b041b66002db..eee3c92a9ebf 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -1854,9 +1854,9 @@ fetch_events:
waiter = true;
init_waitqueue_entry(&wait, current);
- spin_lock_irq(&ep->wq.lock);
+ write_lock_irq(&ep->lock);
__add_wait_queue_exclusive(&ep->wq, &wait);
- spin_unlock_irq(&ep->wq.lock);
+ write_unlock_irq(&ep->lock);
}
for (;;) {
@@ -1904,9 +1904,9 @@ send_events:
goto fetch_events;
if (waiter) {
- spin_lock_irq(&ep->wq.lock);
+ write_lock_irq(&ep->lock);
__remove_wait_queue(&ep->wq, &wait);
- spin_unlock_irq(&ep->wq.lock);
+ write_unlock_irq(&ep->lock);
}
return res;
diff --git a/fs/exec.c b/fs/exec.c
index db17be51b112..06b4c550af5d 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -985,6 +985,32 @@ int kernel_read_file_from_path(const char *path, void **buf, loff_t *size,
}
EXPORT_SYMBOL_GPL(kernel_read_file_from_path);
+int kernel_read_file_from_path_initns(const char *path, void **buf,
+ loff_t *size, loff_t max_size,
+ enum kernel_read_file_id id)
+{
+ struct file *file;
+ struct path root;
+ int ret;
+
+ if (!path || !*path)
+ return -EINVAL;
+
+ task_lock(&init_task);
+ get_fs_root(init_task.fs, &root);
+ task_unlock(&init_task);
+
+ file = file_open_root(root.dentry, root.mnt, path, O_RDONLY, 0);
+ path_put(&root);
+ if (IS_ERR(file))
+ return PTR_ERR(file);
+
+ ret = kernel_read_file(file, buf, size, max_size, id);
+ fput(file);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(kernel_read_file_from_path_initns);
+
int kernel_read_file_from_fd(int fd, void **buf, loff_t *size, loff_t max_size,
enum kernel_read_file_id id)
{
@@ -1010,16 +1036,26 @@ ssize_t read_code(struct file *file, unsigned long addr, loff_t pos, size_t len)
}
EXPORT_SYMBOL(read_code);
+/*
+ * Maps the mm_struct mm into the current task struct.
+ * On success, this function returns with the mutex
+ * exec_update_mutex locked.
+ */
static int exec_mmap(struct mm_struct *mm)
{
struct task_struct *tsk;
struct mm_struct *old_mm, *active_mm;
+ int ret;
/* Notify parent that we're no longer interested in the old VM */
tsk = current;
old_mm = current->mm;
exec_mm_release(tsk, old_mm);
+ ret = mutex_lock_killable(&tsk->signal->exec_update_mutex);
+ if (ret)
+ return ret;
+
if (old_mm) {
sync_mm_rss(old_mm);
/*
@@ -1031,9 +1067,11 @@ static int exec_mmap(struct mm_struct *mm)
down_read(&old_mm->mmap_sem);
if (unlikely(old_mm->core_state)) {
up_read(&old_mm->mmap_sem);
+ mutex_unlock(&tsk->signal->exec_update_mutex);
return -EINTR;
}
}
+
task_lock(tsk);
active_mm = tsk->active_mm;
membarrier_exec_mmap(mm);
@@ -1189,10 +1227,22 @@ no_thread_group:
/* we have changed execution domain */
tsk->exit_signal = SIGCHLD;
-#ifdef CONFIG_POSIX_TIMERS
- exit_itimers(sig);
- flush_itimer_signals();
-#endif
+ BUG_ON(!thread_group_leader(tsk));
+ return 0;
+
+killed:
+ /* protects against exit_notify() and __exit_signal() */
+ read_lock(&tasklist_lock);
+ sig->group_exit_task = NULL;
+ sig->notify_count = 0;
+ read_unlock(&tasklist_lock);
+ return -EAGAIN;
+}
+
+
+static int unshare_sighand(struct task_struct *me)
+{
+ struct sighand_struct *oldsighand = me->sighand;
if (refcount_read(&oldsighand->count) != 1) {
struct sighand_struct *newsighand;
@@ -1210,23 +1260,13 @@ no_thread_group:
write_lock_irq(&tasklist_lock);
spin_lock(&oldsighand->siglock);
- rcu_assign_pointer(tsk->sighand, newsighand);
+ rcu_assign_pointer(me->sighand, newsighand);
spin_unlock(&oldsighand->siglock);
write_unlock_irq(&tasklist_lock);
__cleanup_sighand(oldsighand);
}
-
- BUG_ON(!thread_group_leader(tsk));
return 0;
-
-killed:
- /* protects against exit_notify() and __exit_signal() */
- read_lock(&tasklist_lock);
- sig->group_exit_task = NULL;
- sig->notify_count = 0;
- read_unlock(&tasklist_lock);
- return -EAGAIN;
}
char *__get_task_comm(char *buf, size_t buf_size, struct task_struct *tsk)
@@ -1260,13 +1300,13 @@ void __set_task_comm(struct task_struct *tsk, const char *buf, bool exec)
*/
int flush_old_exec(struct linux_binprm * bprm)
{
+ struct task_struct *me = current;
int retval;
/*
- * Make sure we have a private signal table and that
- * we are unassociated from the previous thread group.
+ * Make this the only thread in the thread group.
*/
- retval = de_thread(current);
+ retval = de_thread(me);
if (retval)
goto out;
@@ -1286,18 +1326,31 @@ int flush_old_exec(struct linux_binprm * bprm)
goto out;
/*
- * After clearing bprm->mm (to mark that current is using the
- * prepared mm now), we have nothing left of the original
+ * After setting bprm->called_exec_mmap (to mark that current is
+ * using the prepared mm now), we have nothing left of the original
* process. If anything from here on returns an error, the check
* in search_binary_handler() will SEGV current.
*/
+ bprm->called_exec_mmap = 1;
bprm->mm = NULL;
+#ifdef CONFIG_POSIX_TIMERS
+ exit_itimers(me->signal);
+ flush_itimer_signals();
+#endif
+
+ /*
+ * Make the signal table private.
+ */
+ retval = unshare_sighand(me);
+ if (retval)
+ goto out;
+
set_fs(USER_DS);
- current->flags &= ~(PF_RANDOMIZE | PF_FORKNOEXEC | PF_KTHREAD |
+ me->flags &= ~(PF_RANDOMIZE | PF_FORKNOEXEC | PF_KTHREAD |
PF_NOFREEZE | PF_NO_SETAFFINITY);
flush_thread();
- current->personality &= ~bprm->per_clear;
+ me->personality &= ~bprm->per_clear;
/*
* We have to apply CLOEXEC before we change whether the process is
@@ -1305,7 +1358,7 @@ int flush_old_exec(struct linux_binprm * bprm)
* trying to access the should-be-closed file descriptors of a process
* undergoing exec(2).
*/
- do_close_on_exec(current->files);
+ do_close_on_exec(me->files);
return 0;
out:
@@ -1386,7 +1439,7 @@ void setup_new_exec(struct linux_binprm * bprm)
/* An exec changes our domain. We are no longer part of the thread
group */
- current->self_exec_id++;
+ WRITE_ONCE(current->self_exec_id, current->self_exec_id + 1);
flush_signal_handlers(current, 0);
}
EXPORT_SYMBOL(setup_new_exec);
@@ -1424,6 +1477,8 @@ static void free_bprm(struct linux_binprm *bprm)
{
free_arg_pages(bprm);
if (bprm->cred) {
+ if (bprm->called_exec_mmap)
+ mutex_unlock(&current->signal->exec_update_mutex);
mutex_unlock(&current->signal->cred_guard_mutex);
abort_creds(bprm->cred);
}
@@ -1473,6 +1528,7 @@ void install_exec_creds(struct linux_binprm *bprm)
* credentials; any time after this it may be unlocked.
*/
security_bprm_committed_creds(bprm);
+ mutex_unlock(&current->signal->exec_update_mutex);
mutex_unlock(&current->signal->cred_guard_mutex);
}
EXPORT_SYMBOL(install_exec_creds);
@@ -1664,7 +1720,7 @@ int search_binary_handler(struct linux_binprm *bprm)
read_lock(&binfmt_lock);
put_binfmt(fmt);
- if (retval < 0 && !bprm->mm) {
+ if (retval < 0 && bprm->called_exec_mmap) {
/* we got to flush_old_exec() and failed after it */
read_unlock(&binfmt_lock);
force_sigsegv(SIGSEGV);
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 119667e65890..c885cf7d724b 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -960,8 +960,9 @@ ext2_writepages(struct address_space *mapping, struct writeback_control *wbc)
static int
ext2_dax_writepages(struct address_space *mapping, struct writeback_control *wbc)
{
- return dax_writeback_mapping_range(mapping,
- mapping->host->i_sb->s_bdev, wbc);
+ struct ext2_sb_info *sbi = EXT2_SB(mapping->host->i_sb);
+
+ return dax_writeback_mapping_range(mapping, sbi->s_daxdev, wbc);
}
const struct address_space_operations ext2_aops = {
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 5f993a411251..8fd0b3cdab4c 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -270,6 +270,7 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb,
ext4_group_t ngroups = ext4_get_groups_count(sb);
struct ext4_group_desc *desc;
struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct buffer_head *bh_p;
if (block_group >= ngroups) {
ext4_error(sb, "block_group >= groups_count - block_group = %u,"
@@ -280,7 +281,14 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb,
group_desc = block_group >> EXT4_DESC_PER_BLOCK_BITS(sb);
offset = block_group & (EXT4_DESC_PER_BLOCK(sb) - 1);
- if (!sbi->s_group_desc[group_desc]) {
+ bh_p = sbi_array_rcu_deref(sbi, s_group_desc, group_desc);
+ /*
+ * sbi_array_rcu_deref returns with rcu unlocked, this is ok since
+ * the pointer being dereferenced won't be dereferenced again. By
+ * looking at the usage in add_new_gdb() the value isn't modified,
+ * just the pointer, and so it remains valid.
+ */
+ if (!bh_p) {
ext4_error(sb, "Group descriptor not loaded - "
"block_group = %u, group_desc = %u, desc = %u",
block_group, group_desc, offset);
@@ -288,10 +296,10 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb,
}
desc = (struct ext4_group_desc *)(
- (__u8 *)sbi->s_group_desc[group_desc]->b_data +
+ (__u8 *)bh_p->b_data +
offset * EXT4_DESC_SIZE(sb));
if (bh)
- *bh = sbi->s_group_desc[group_desc];
+ *bh = bh_p;
return desc;
}
diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c
index 1ee04e76bbe0..0a734ffb4310 100644
--- a/fs/ext4/block_validity.c
+++ b/fs/ext4/block_validity.c
@@ -207,6 +207,7 @@ static int ext4_protect_reserved_inode(struct super_block *sb,
return PTR_ERR(inode);
num = (inode->i_size + sb->s_blocksize - 1) >> sb->s_blocksize_bits;
while (i < num) {
+ cond_resched();
map.m_lblk = i;
map.m_len = num - i;
n = ext4_map_blocks(NULL, inode, &map, 0);
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 1f340743c9a8..9aa1f75409b0 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -129,12 +129,14 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx)
if (err != ERR_BAD_DX_DIR) {
return err;
}
- /*
- * We don't set the inode dirty flag since it's not
- * critical that it get flushed back to the disk.
- */
- ext4_clear_inode_flag(file_inode(file),
- EXT4_INODE_INDEX);
+ /* Can we just clear INDEX flag to ignore htree information? */
+ if (!ext4_has_metadata_csum(sb)) {
+ /*
+ * We don't set the inode dirty flag since it's not
+ * critical that it gets flushed back to the disk.
+ */
+ ext4_clear_inode_flag(inode, EXT4_INODE_INDEX);
+ }
}
if (ext4_has_inline_data(inode)) {
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 9a2ee2428ecc..61b37a052052 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1400,7 +1400,7 @@ struct ext4_sb_info {
loff_t s_bitmap_maxbytes; /* max bytes for bitmap files */
struct buffer_head * s_sbh; /* Buffer containing the super block */
struct ext4_super_block *s_es; /* Pointer to the super block in the buffer */
- struct buffer_head **s_group_desc;
+ struct buffer_head * __rcu *s_group_desc;
unsigned int s_mount_opt;
unsigned int s_mount_opt2;
unsigned int s_mount_flags;
@@ -1462,7 +1462,7 @@ struct ext4_sb_info {
#endif
/* for buddy allocator */
- struct ext4_group_info ***s_group_info;
+ struct ext4_group_info ** __rcu *s_group_info;
struct inode *s_buddy_cache;
spinlock_t s_md_lock;
unsigned short *s_mb_offsets;
@@ -1512,7 +1512,7 @@ struct ext4_sb_info {
unsigned int s_extent_max_zeroout_kb;
unsigned int s_log_groups_per_flex;
- struct flex_groups *s_flex_groups;
+ struct flex_groups * __rcu *s_flex_groups;
ext4_group_t s_flex_groups_allocated;
/* workqueue for reserved extent conversions (buffered io) */
@@ -1552,8 +1552,11 @@ struct ext4_sb_info {
struct ratelimit_state s_warning_ratelimit_state;
struct ratelimit_state s_msg_ratelimit_state;
- /* Barrier between changing inodes' journal flags and writepages ops. */
- struct percpu_rw_semaphore s_journal_flag_rwsem;
+ /*
+ * Barrier between writepages ops and changing any inode's JOURNAL_DATA
+ * or EXTENTS flag.
+ */
+ struct percpu_rw_semaphore s_writepages_rwsem;
struct dax_device *s_daxdev;
#ifdef CONFIG_EXT4_DEBUG
unsigned long s_simulate_fail;
@@ -1577,6 +1580,23 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
}
/*
+ * Returns: sbi->field[index]
+ * Used to access an array element from the following sbi fields which require
+ * rcu protection to avoid dereferencing an invalid pointer due to reassignment
+ * - s_group_desc
+ * - s_group_info
+ * - s_flex_group
+ */
+#define sbi_array_rcu_deref(sbi, field, index) \
+({ \
+ typeof(*((sbi)->field)) _v; \
+ rcu_read_lock(); \
+ _v = ((typeof(_v)*)rcu_dereference((sbi)->field))[index]; \
+ rcu_read_unlock(); \
+ _v; \
+})
+
+/*
* Simulate_fail codes
*/
#define EXT4_SIM_BBITMAP_EIO 1
@@ -2544,8 +2564,11 @@ void ext4_insert_dentry(struct inode *inode,
struct ext4_filename *fname);
static inline void ext4_update_dx_flag(struct inode *inode)
{
- if (!ext4_has_feature_dir_index(inode->i_sb))
+ if (!ext4_has_feature_dir_index(inode->i_sb)) {
+ /* ext4_iget() should have caught this... */
+ WARN_ON_ONCE(ext4_has_feature_metadata_csum(inode->i_sb));
ext4_clear_inode_flag(inode, EXT4_INODE_INDEX);
+ }
}
static const unsigned char ext4_filetype_table[] = {
DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
@@ -2727,6 +2750,7 @@ extern int ext4_generic_delete_entry(handle_t *handle,
extern bool ext4_empty_dir(struct inode *inode);
/* resize.c */
+extern void ext4_kvfree_array_rcu(void *to_free);
extern int ext4_group_add(struct super_block *sb,
struct ext4_new_group_data *input);
extern int ext4_group_extend(struct super_block *sb,
@@ -2973,13 +2997,13 @@ static inline
struct ext4_group_info *ext4_get_group_info(struct super_block *sb,
ext4_group_t group)
{
- struct ext4_group_info ***grp_info;
+ struct ext4_group_info **grp_info;
long indexv, indexh;
BUG_ON(group >= EXT4_SB(sb)->s_groups_count);
- grp_info = EXT4_SB(sb)->s_group_info;
indexv = group >> (EXT4_DESC_PER_BLOCK_BITS(sb));
indexh = group & ((EXT4_DESC_PER_BLOCK(sb)) - 1);
- return grp_info[indexv][indexh];
+ grp_info = sbi_array_rcu_deref(EXT4_SB(sb), s_group_info, indexv);
+ return grp_info[indexh];
}
/*
@@ -3029,7 +3053,7 @@ static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize)
!inode_is_locked(inode));
down_write(&EXT4_I(inode)->i_data_sem);
if (newsize > EXT4_I(inode)->i_disksize)
- EXT4_I(inode)->i_disksize = newsize;
+ WRITE_ONCE(EXT4_I(inode)->i_disksize, newsize);
up_write(&EXT4_I(inode)->i_data_sem);
}
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index c66e8f9451a2..f95ee99091e4 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -328,11 +328,13 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
percpu_counter_inc(&sbi->s_freeinodes_counter);
if (sbi->s_log_groups_per_flex) {
- ext4_group_t f = ext4_flex_group(sbi, block_group);
+ struct flex_groups *fg;
- atomic_inc(&sbi->s_flex_groups[f].free_inodes);
+ fg = sbi_array_rcu_deref(sbi, s_flex_groups,
+ ext4_flex_group(sbi, block_group));
+ atomic_inc(&fg->free_inodes);
if (is_directory)
- atomic_dec(&sbi->s_flex_groups[f].used_dirs);
+ atomic_dec(&fg->used_dirs);
}
BUFFER_TRACE(bh2, "call ext4_handle_dirty_metadata");
fatal = ext4_handle_dirty_metadata(handle, NULL, bh2);
@@ -368,12 +370,13 @@ static void get_orlov_stats(struct super_block *sb, ext4_group_t g,
int flex_size, struct orlov_stats *stats)
{
struct ext4_group_desc *desc;
- struct flex_groups *flex_group = EXT4_SB(sb)->s_flex_groups;
if (flex_size > 1) {
- stats->free_inodes = atomic_read(&flex_group[g].free_inodes);
- stats->free_clusters = atomic64_read(&flex_group[g].free_clusters);
- stats->used_dirs = atomic_read(&flex_group[g].used_dirs);
+ struct flex_groups *fg = sbi_array_rcu_deref(EXT4_SB(sb),
+ s_flex_groups, g);
+ stats->free_inodes = atomic_read(&fg->free_inodes);
+ stats->free_clusters = atomic64_read(&fg->free_clusters);
+ stats->used_dirs = atomic_read(&fg->used_dirs);
return;
}
@@ -1054,7 +1057,8 @@ got:
if (sbi->s_log_groups_per_flex) {
ext4_group_t f = ext4_flex_group(sbi, group);
- atomic_inc(&sbi->s_flex_groups[f].used_dirs);
+ atomic_inc(&sbi_array_rcu_deref(sbi, s_flex_groups,
+ f)->used_dirs);
}
}
if (ext4_has_group_desc_csum(sb)) {
@@ -1077,7 +1081,8 @@ got:
if (sbi->s_log_groups_per_flex) {
flex_group = ext4_flex_group(sbi, group);
- atomic_dec(&sbi->s_flex_groups[flex_group].free_inodes);
+ atomic_dec(&sbi_array_rcu_deref(sbi, s_flex_groups,
+ flex_group)->free_inodes);
}
inode->i_ino = ino + group * EXT4_INODES_PER_GROUP(sb);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 3313168b680f..fa0ff78dc033 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2465,7 +2465,7 @@ update_disksize:
* truncate are avoided by checking i_size under i_data_sem.
*/
disksize = ((loff_t)mpd->first_page) << PAGE_SHIFT;
- if (disksize > EXT4_I(inode)->i_disksize) {
+ if (disksize > READ_ONCE(EXT4_I(inode)->i_disksize)) {
int err2;
loff_t i_size;
@@ -2628,7 +2628,7 @@ static int ext4_writepages(struct address_space *mapping,
if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
return -EIO;
- percpu_down_read(&sbi->s_journal_flag_rwsem);
+ percpu_down_read(&sbi->s_writepages_rwsem);
trace_ext4_writepages(inode, wbc);
/*
@@ -2849,7 +2849,7 @@ unplug:
out_writepages:
trace_ext4_writepages_result(inode, wbc, ret,
nr_to_write - wbc->nr_to_write);
- percpu_up_read(&sbi->s_journal_flag_rwsem);
+ percpu_up_read(&sbi->s_writepages_rwsem);
return ret;
}
@@ -2864,13 +2864,13 @@ static int ext4_dax_writepages(struct address_space *mapping,
if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
return -EIO;
- percpu_down_read(&sbi->s_journal_flag_rwsem);
+ percpu_down_read(&sbi->s_writepages_rwsem);
trace_ext4_writepages(inode, wbc);
- ret = dax_writeback_mapping_range(mapping, inode->i_sb->s_bdev, wbc);
+ ret = dax_writeback_mapping_range(mapping, sbi->s_daxdev, wbc);
trace_ext4_writepages_result(inode, wbc, ret,
nr_to_write - wbc->nr_to_write);
- percpu_up_read(&sbi->s_journal_flag_rwsem);
+ percpu_up_read(&sbi->s_writepages_rwsem);
return ret;
}
@@ -4644,6 +4644,18 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
ret = -EFSCORRUPTED;
goto bad_inode;
}
+ /*
+ * If dir_index is not enabled but there's dir with INDEX flag set,
+ * we'd normally treat htree data as empty space. But with metadata
+ * checksumming that corrupts checksums so forbid that.
+ */
+ if (!ext4_has_feature_dir_index(sb) && ext4_has_metadata_csum(sb) &&
+ ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) {
+ ext4_error_inode(inode, function, line, 0,
+ "iget: Dir with htree data on filesystem without dir_index feature.");
+ ret = -EFSCORRUPTED;
+ goto bad_inode;
+ }
ei->i_disksize = inode->i_size;
#ifdef CONFIG_QUOTA
ei->i_reserved_quota = 0;
@@ -5849,7 +5861,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
}
}
- percpu_down_write(&sbi->s_journal_flag_rwsem);
+ percpu_down_write(&sbi->s_writepages_rwsem);
jbd2_journal_lock_updates(journal);
/*
@@ -5866,7 +5878,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
err = jbd2_journal_flush(journal);
if (err < 0) {
jbd2_journal_unlock_updates(journal);
- percpu_up_write(&sbi->s_journal_flag_rwsem);
+ percpu_up_write(&sbi->s_writepages_rwsem);
return err;
}
ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
@@ -5874,7 +5886,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
ext4_set_aops(inode);
jbd2_journal_unlock_updates(journal);
- percpu_up_write(&sbi->s_journal_flag_rwsem);
+ percpu_up_write(&sbi->s_writepages_rwsem);
if (val)
up_write(&EXT4_I(inode)->i_mmap_sem);
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index a0ec750018dd..0c1d1720cf1a 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -1210,6 +1210,11 @@ resizefs_out:
return -EOPNOTSUPP;
return fscrypt_ioctl_get_key_status(filp, (void __user *)arg);
+ case FS_IOC_GET_ENCRYPTION_NONCE:
+ if (!ext4_has_feature_encrypt(sb))
+ return -EOPNOTSUPP;
+ return fscrypt_ioctl_get_nonce(filp, (void __user *)arg);
+
case EXT4_IOC_CLEAR_ES_CACHE:
{
if (!inode_owner_or_capable(inode))
@@ -1370,6 +1375,7 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
case FS_IOC_REMOVE_ENCRYPTION_KEY:
case FS_IOC_REMOVE_ENCRYPTION_KEY_ALL_USERS:
case FS_IOC_GET_ENCRYPTION_KEY_STATUS:
+ case FS_IOC_GET_ENCRYPTION_NONCE:
case EXT4_IOC_SHUTDOWN:
case FS_IOC_GETFSMAP:
case FS_IOC_ENABLE_VERITY:
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index f64838187559..51a78eb65f3c 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2356,7 +2356,7 @@ int ext4_mb_alloc_groupinfo(struct super_block *sb, ext4_group_t ngroups)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
unsigned size;
- struct ext4_group_info ***new_groupinfo;
+ struct ext4_group_info ***old_groupinfo, ***new_groupinfo;
size = (ngroups + EXT4_DESC_PER_BLOCK(sb) - 1) >>
EXT4_DESC_PER_BLOCK_BITS(sb);
@@ -2369,13 +2369,16 @@ int ext4_mb_alloc_groupinfo(struct super_block *sb, ext4_group_t ngroups)
ext4_msg(sb, KERN_ERR, "can't allocate buddy meta group");
return -ENOMEM;
}
- if (sbi->s_group_info) {
- memcpy(new_groupinfo, sbi->s_group_info,
+ rcu_read_lock();
+ old_groupinfo = rcu_dereference(sbi->s_group_info);
+ if (old_groupinfo)
+ memcpy(new_groupinfo, old_groupinfo,
sbi->s_group_info_size * sizeof(*sbi->s_group_info));
- kvfree(sbi->s_group_info);
- }
- sbi->s_group_info = new_groupinfo;
+ rcu_read_unlock();
+ rcu_assign_pointer(sbi->s_group_info, new_groupinfo);
sbi->s_group_info_size = size / sizeof(*sbi->s_group_info);
+ if (old_groupinfo)
+ ext4_kvfree_array_rcu(old_groupinfo);
ext4_debug("allocated s_groupinfo array for %d meta_bg's\n",
sbi->s_group_info_size);
return 0;
@@ -2387,6 +2390,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
{
int i;
int metalen = 0;
+ int idx = group >> EXT4_DESC_PER_BLOCK_BITS(sb);
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct ext4_group_info **meta_group_info;
struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits);
@@ -2405,12 +2409,12 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
"for a buddy group");
goto exit_meta_group_info;
}
- sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)] =
- meta_group_info;
+ rcu_read_lock();
+ rcu_dereference(sbi->s_group_info)[idx] = meta_group_info;
+ rcu_read_unlock();
}
- meta_group_info =
- sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)];
+ meta_group_info = sbi_array_rcu_deref(sbi, s_group_info, idx);
i = group & (EXT4_DESC_PER_BLOCK(sb) - 1);
meta_group_info[i] = kmem_cache_zalloc(cachep, GFP_NOFS);
@@ -2458,8 +2462,13 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
exit_group_info:
/* If a meta_group_info table has been allocated, release it now */
if (group % EXT4_DESC_PER_BLOCK(sb) == 0) {
- kfree(sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)]);
- sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)] = NULL;
+ struct ext4_group_info ***group_info;
+
+ rcu_read_lock();
+ group_info = rcu_dereference(sbi->s_group_info);
+ kfree(group_info[idx]);
+ group_info[idx] = NULL;
+ rcu_read_unlock();
}
exit_meta_group_info:
return -ENOMEM;
@@ -2472,6 +2481,7 @@ static int ext4_mb_init_backend(struct super_block *sb)
struct ext4_sb_info *sbi = EXT4_SB(sb);
int err;
struct ext4_group_desc *desc;
+ struct ext4_group_info ***group_info;
struct kmem_cache *cachep;
err = ext4_mb_alloc_groupinfo(sb, ngroups);
@@ -2507,11 +2517,16 @@ err_freebuddy:
while (i-- > 0)
kmem_cache_free(cachep, ext4_get_group_info(sb, i));
i = sbi->s_group_info_size;
+ rcu_read_lock();
+ group_info = rcu_dereference(sbi->s_group_info);
while (i-- > 0)
- kfree(sbi->s_group_info[i]);
+ kfree(group_info[i]);
+ rcu_read_unlock();
iput(sbi->s_buddy_cache);
err_freesgi:
- kvfree(sbi->s_group_info);
+ rcu_read_lock();
+ kvfree(rcu_dereference(sbi->s_group_info));
+ rcu_read_unlock();
return -ENOMEM;
}
@@ -2700,7 +2715,7 @@ int ext4_mb_release(struct super_block *sb)
ext4_group_t ngroups = ext4_get_groups_count(sb);
ext4_group_t i;
int num_meta_group_infos;
- struct ext4_group_info *grinfo;
+ struct ext4_group_info *grinfo, ***group_info;
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits);
@@ -2719,9 +2734,12 @@ int ext4_mb_release(struct super_block *sb)
num_meta_group_infos = (ngroups +
EXT4_DESC_PER_BLOCK(sb) - 1) >>
EXT4_DESC_PER_BLOCK_BITS(sb);
+ rcu_read_lock();
+ group_info = rcu_dereference(sbi->s_group_info);
for (i = 0; i < num_meta_group_infos; i++)
- kfree(sbi->s_group_info[i]);
- kvfree(sbi->s_group_info);
+ kfree(group_info[i]);
+ kvfree(group_info);
+ rcu_read_unlock();
}
kfree(sbi->s_mb_offsets);
kfree(sbi->s_mb_maxs);
@@ -3020,7 +3038,8 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
ext4_group_t flex_group = ext4_flex_group(sbi,
ac->ac_b_ex.fe_group);
atomic64_sub(ac->ac_b_ex.fe_len,
- &sbi->s_flex_groups[flex_group].free_clusters);
+ &sbi_array_rcu_deref(sbi, s_flex_groups,
+ flex_group)->free_clusters);
}
err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
@@ -4918,7 +4937,8 @@ do_more:
if (sbi->s_log_groups_per_flex) {
ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
atomic64_add(count_clusters,
- &sbi->s_flex_groups[flex_group].free_clusters);
+ &sbi_array_rcu_deref(sbi, s_flex_groups,
+ flex_group)->free_clusters);
}
/*
@@ -5075,7 +5095,8 @@ int ext4_group_add_blocks(handle_t *handle, struct super_block *sb,
if (sbi->s_log_groups_per_flex) {
ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
atomic64_add(clusters_freed,
- &sbi->s_flex_groups[flex_group].free_clusters);
+ &sbi_array_rcu_deref(sbi, s_flex_groups,
+ flex_group)->free_clusters);
}
ext4_mb_unload_buddy(&e4b);
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index 89725fa42573..fb6520f37135 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -407,6 +407,7 @@ static int free_ext_block(handle_t *handle, struct inode *inode)
int ext4_ext_migrate(struct inode *inode)
{
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
handle_t *handle;
int retval = 0, i;
__le32 *i_data;
@@ -431,6 +432,8 @@ int ext4_ext_migrate(struct inode *inode)
*/
return retval;
+ percpu_down_write(&sbi->s_writepages_rwsem);
+
/*
* Worst case we can touch the allocation bitmaps, a bgd
* block, and a block to link in the orphan list. We do need
@@ -441,7 +444,7 @@ int ext4_ext_migrate(struct inode *inode)
if (IS_ERR(handle)) {
retval = PTR_ERR(handle);
- return retval;
+ goto out_unlock;
}
goal = (((inode->i_ino - 1) / EXT4_INODES_PER_GROUP(inode->i_sb)) *
EXT4_INODES_PER_GROUP(inode->i_sb)) + 1;
@@ -452,7 +455,7 @@ int ext4_ext_migrate(struct inode *inode)
if (IS_ERR(tmp_inode)) {
retval = PTR_ERR(tmp_inode);
ext4_journal_stop(handle);
- return retval;
+ goto out_unlock;
}
i_size_write(tmp_inode, i_size_read(inode));
/*
@@ -494,7 +497,7 @@ int ext4_ext_migrate(struct inode *inode)
*/
ext4_orphan_del(NULL, tmp_inode);
retval = PTR_ERR(handle);
- goto out;
+ goto out_tmp_inode;
}
ei = EXT4_I(inode);
@@ -576,10 +579,11 @@ err_out:
ext4_ext_tree_init(handle, tmp_inode);
out_stop:
ext4_journal_stop(handle);
-out:
+out_tmp_inode:
unlock_new_inode(tmp_inode);
iput(tmp_inode);
-
+out_unlock:
+ percpu_up_write(&sbi->s_writepages_rwsem);
return retval;
}
@@ -589,7 +593,8 @@ out:
int ext4_ind_migrate(struct inode *inode)
{
struct ext4_extent_header *eh;
- struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ struct ext4_super_block *es = sbi->s_es;
struct ext4_inode_info *ei = EXT4_I(inode);
struct ext4_extent *ex;
unsigned int i, len;
@@ -613,9 +618,13 @@ int ext4_ind_migrate(struct inode *inode)
if (test_opt(inode->i_sb, DELALLOC))
ext4_alloc_da_blocks(inode);
+ percpu_down_write(&sbi->s_writepages_rwsem);
+
handle = ext4_journal_start(inode, EXT4_HT_MIGRATE, 1);
- if (IS_ERR(handle))
- return PTR_ERR(handle);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ goto out_unlock;
+ }
down_write(&EXT4_I(inode)->i_data_sem);
ret = ext4_ext_check_inode(inode);
@@ -650,5 +659,7 @@ int ext4_ind_migrate(struct inode *inode)
errout:
ext4_journal_stop(handle);
up_write(&EXT4_I(inode)->i_data_sem);
+out_unlock:
+ percpu_up_write(&sbi->s_writepages_rwsem);
return ret;
}
diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c
index 1c44b1a32001..87f7551c5132 100644
--- a/fs/ext4/mmp.c
+++ b/fs/ext4/mmp.c
@@ -120,10 +120,10 @@ void __dump_mmp_msg(struct super_block *sb, struct mmp_struct *mmp,
{
__ext4_warning(sb, function, line, "%s", msg);
__ext4_warning(sb, function, line,
- "MMP failure info: last update time: %llu, last update "
- "node: %s, last update device: %s",
- (long long unsigned int) le64_to_cpu(mmp->mmp_time),
- mmp->mmp_nodename, mmp->mmp_bdevname);
+ "MMP failure info: last update time: %llu, last update node: %.*s, last update device: %.*s",
+ (unsigned long long)le64_to_cpu(mmp->mmp_time),
+ (int)sizeof(mmp->mmp_nodename), mmp->mmp_nodename,
+ (int)sizeof(mmp->mmp_bdevname), mmp->mmp_bdevname);
}
/*
@@ -154,6 +154,7 @@ static int kmmpd(void *data)
mmp_check_interval = max(EXT4_MMP_CHECK_MULT * mmp_update_interval,
EXT4_MMP_MIN_CHECK_INTERVAL);
mmp->mmp_check_interval = cpu_to_le16(mmp_check_interval);
+ BUILD_BUG_ON(sizeof(mmp->mmp_bdevname) < BDEVNAME_SIZE);
bdevname(bh->b_bdev, mmp->mmp_bdevname);
memcpy(mmp->mmp_nodename, init_utsname()->nodename,
@@ -379,7 +380,8 @@ skip:
/*
* Start a kernel thread to update the MMP block periodically.
*/
- EXT4_SB(sb)->s_mmp_tsk = kthread_run(kmmpd, mmpd_data, "kmmpd-%s",
+ EXT4_SB(sb)->s_mmp_tsk = kthread_run(kmmpd, mmpd_data, "kmmpd-%.*s",
+ (int)sizeof(mmp->mmp_bdevname),
bdevname(bh->b_bdev,
mmp->mmp_bdevname));
if (IS_ERR(EXT4_SB(sb)->s_mmp_tsk)) {
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 129d2ebae00d..b05ea72f38fd 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1511,6 +1511,7 @@ restart:
/*
* We deal with the read-ahead logic here.
*/
+ cond_resched();
if (ra_ptr >= ra_max) {
/* Refill the readahead buffer */
ra_ptr = 0;
@@ -2213,6 +2214,13 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
retval = ext4_dx_add_entry(handle, &fname, dir, inode);
if (!retval || (retval != ERR_BAD_DX_DIR))
goto out;
+ /* Can we just ignore htree data? */
+ if (ext4_has_metadata_csum(sb)) {
+ EXT4_ERROR_INODE(dir,
+ "Directory has corrupted htree index.");
+ retval = -EFSCORRUPTED;
+ goto out;
+ }
ext4_clear_inode_flag(dir, EXT4_INODE_INDEX);
dx_fallback++;
ext4_mark_inode_dirty(handle, dir);
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 68b39e75446a..de6fe969f773 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -125,11 +125,10 @@ static void ext4_finish_bio(struct bio *bio)
}
bh = head = page_buffers(page);
/*
- * We check all buffers in the page under BH_Uptodate_Lock
+ * We check all buffers in the page under b_uptodate_lock
* to avoid races with other end io clearing async_write flags
*/
- local_irq_save(flags);
- bit_spin_lock(BH_Uptodate_Lock, &head->b_state);
+ spin_lock_irqsave(&head->b_uptodate_lock, flags);
do {
if (bh_offset(bh) < bio_start ||
bh_offset(bh) + bh->b_size > bio_end) {
@@ -141,8 +140,7 @@ static void ext4_finish_bio(struct bio *bio)
if (bio->bi_status)
buffer_io_error(bh);
} while ((bh = bh->b_this_page) != head);
- bit_spin_unlock(BH_Uptodate_Lock, &head->b_state);
- local_irq_restore(flags);
+ spin_unlock_irqrestore(&head->b_uptodate_lock, flags);
if (!under_io) {
fscrypt_free_bounce_page(bounce_page);
end_page_writeback(page);
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 86a2500ed292..a50b51270ea9 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -17,6 +17,33 @@
#include "ext4_jbd2.h"
+struct ext4_rcu_ptr {
+ struct rcu_head rcu;
+ void *ptr;
+};
+
+static void ext4_rcu_ptr_callback(struct rcu_head *head)
+{
+ struct ext4_rcu_ptr *ptr;
+
+ ptr = container_of(head, struct ext4_rcu_ptr, rcu);
+ kvfree(ptr->ptr);
+ kfree(ptr);
+}
+
+void ext4_kvfree_array_rcu(void *to_free)
+{
+ struct ext4_rcu_ptr *ptr = kzalloc(sizeof(*ptr), GFP_KERNEL);
+
+ if (ptr) {
+ ptr->ptr = to_free;
+ call_rcu(&ptr->rcu, ext4_rcu_ptr_callback);
+ return;
+ }
+ synchronize_rcu();
+ kvfree(to_free);
+}
+
int ext4_resize_begin(struct super_block *sb)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -542,8 +569,8 @@ static int setup_new_flex_group_blocks(struct super_block *sb,
brelse(gdb);
goto out;
}
- memcpy(gdb->b_data, sbi->s_group_desc[j]->b_data,
- gdb->b_size);
+ memcpy(gdb->b_data, sbi_array_rcu_deref(sbi,
+ s_group_desc, j)->b_data, gdb->b_size);
set_buffer_uptodate(gdb);
err = ext4_handle_dirty_metadata(handle, NULL, gdb);
@@ -860,13 +887,15 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
}
brelse(dind);
- o_group_desc = EXT4_SB(sb)->s_group_desc;
+ rcu_read_lock();
+ o_group_desc = rcu_dereference(EXT4_SB(sb)->s_group_desc);
memcpy(n_group_desc, o_group_desc,
EXT4_SB(sb)->s_gdb_count * sizeof(struct buffer_head *));
+ rcu_read_unlock();
n_group_desc[gdb_num] = gdb_bh;
- EXT4_SB(sb)->s_group_desc = n_group_desc;
+ rcu_assign_pointer(EXT4_SB(sb)->s_group_desc, n_group_desc);
EXT4_SB(sb)->s_gdb_count++;
- kvfree(o_group_desc);
+ ext4_kvfree_array_rcu(o_group_desc);
le16_add_cpu(&es->s_reserved_gdt_blocks, -1);
err = ext4_handle_dirty_super(handle, sb);
@@ -909,9 +938,11 @@ static int add_new_gdb_meta_bg(struct super_block *sb,
return err;
}
- o_group_desc = EXT4_SB(sb)->s_group_desc;
+ rcu_read_lock();
+ o_group_desc = rcu_dereference(EXT4_SB(sb)->s_group_desc);
memcpy(n_group_desc, o_group_desc,
EXT4_SB(sb)->s_gdb_count * sizeof(struct buffer_head *));
+ rcu_read_unlock();
n_group_desc[gdb_num] = gdb_bh;
BUFFER_TRACE(gdb_bh, "get_write_access");
@@ -922,9 +953,9 @@ static int add_new_gdb_meta_bg(struct super_block *sb,
return err;
}
- EXT4_SB(sb)->s_group_desc = n_group_desc;
+ rcu_assign_pointer(EXT4_SB(sb)->s_group_desc, n_group_desc);
EXT4_SB(sb)->s_gdb_count++;
- kvfree(o_group_desc);
+ ext4_kvfree_array_rcu(o_group_desc);
return err;
}
@@ -1188,7 +1219,8 @@ static int ext4_add_new_descs(handle_t *handle, struct super_block *sb,
* use non-sparse filesystems anymore. This is already checked above.
*/
if (gdb_off) {
- gdb_bh = sbi->s_group_desc[gdb_num];
+ gdb_bh = sbi_array_rcu_deref(sbi, s_group_desc,
+ gdb_num);
BUFFER_TRACE(gdb_bh, "get_write_access");
err = ext4_journal_get_write_access(handle, gdb_bh);
@@ -1270,7 +1302,7 @@ static int ext4_setup_new_descs(handle_t *handle, struct super_block *sb,
/*
* get_write_access() has been called on gdb_bh by ext4_add_new_desc().
*/
- gdb_bh = sbi->s_group_desc[gdb_num];
+ gdb_bh = sbi_array_rcu_deref(sbi, s_group_desc, gdb_num);
/* Update group descriptor block for new group */
gdp = (struct ext4_group_desc *)(gdb_bh->b_data +
gdb_off * EXT4_DESC_SIZE(sb));
@@ -1398,11 +1430,14 @@ static void ext4_update_super(struct super_block *sb,
percpu_counter_read(&sbi->s_freeclusters_counter));
if (ext4_has_feature_flex_bg(sb) && sbi->s_log_groups_per_flex) {
ext4_group_t flex_group;
+ struct flex_groups *fg;
+
flex_group = ext4_flex_group(sbi, group_data[0].group);
+ fg = sbi_array_rcu_deref(sbi, s_flex_groups, flex_group);
atomic64_add(EXT4_NUM_B2C(sbi, free_blocks),
- &sbi->s_flex_groups[flex_group].free_clusters);
+ &fg->free_clusters);
atomic_add(EXT4_INODES_PER_GROUP(sb) * flex_gd->count,
- &sbi->s_flex_groups[flex_group].free_inodes);
+ &fg->free_inodes);
}
/*
@@ -1497,7 +1532,8 @@ exit_journal:
for (; gdb_num <= gdb_num_end; gdb_num++) {
struct buffer_head *gdb_bh;
- gdb_bh = sbi->s_group_desc[gdb_num];
+ gdb_bh = sbi_array_rcu_deref(sbi, s_group_desc,
+ gdb_num);
if (old_gdb == gdb_bh->b_blocknr)
continue;
update_backups(sb, gdb_bh->b_blocknr, gdb_bh->b_data,
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 8434217549b3..c8dff4c68141 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -43,7 +43,7 @@
#include <linux/uaccess.h>
#include <linux/iversion.h>
#include <linux/unicode.h>
-
+#include <linux/part_stat.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
@@ -927,7 +927,6 @@ void ext4_update_dynamic_rev(struct super_block *sb)
static struct block_device *ext4_blkdev_get(dev_t dev, struct super_block *sb)
{
struct block_device *bdev;
- char b[BDEVNAME_SIZE];
bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, sb);
if (IS_ERR(bdev))
@@ -935,8 +934,9 @@ static struct block_device *ext4_blkdev_get(dev_t dev, struct super_block *sb)
return bdev;
fail:
- ext4_msg(sb, KERN_ERR, "failed to open journal device %s: %ld",
- __bdevname(dev, b), PTR_ERR(bdev));
+ ext4_msg(sb, KERN_ERR,
+ "failed to open journal device unknown-block(%u,%u) %ld",
+ MAJOR(dev), MINOR(dev), PTR_ERR(bdev));
return NULL;
}
@@ -1014,6 +1014,8 @@ static void ext4_put_super(struct super_block *sb)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct ext4_super_block *es = sbi->s_es;
+ struct buffer_head **group_desc;
+ struct flex_groups **flex_groups;
int aborted = 0;
int i, err;
@@ -1046,15 +1048,23 @@ static void ext4_put_super(struct super_block *sb)
if (!sb_rdonly(sb))
ext4_commit_super(sb, 1);
+ rcu_read_lock();
+ group_desc = rcu_dereference(sbi->s_group_desc);
for (i = 0; i < sbi->s_gdb_count; i++)
- brelse(sbi->s_group_desc[i]);
- kvfree(sbi->s_group_desc);
- kvfree(sbi->s_flex_groups);
+ brelse(group_desc[i]);
+ kvfree(group_desc);
+ flex_groups = rcu_dereference(sbi->s_flex_groups);
+ if (flex_groups) {
+ for (i = 0; i < sbi->s_flex_groups_allocated; i++)
+ kvfree(flex_groups[i]);
+ kvfree(flex_groups);
+ }
+ rcu_read_unlock();
percpu_counter_destroy(&sbi->s_freeclusters_counter);
percpu_counter_destroy(&sbi->s_freeinodes_counter);
percpu_counter_destroy(&sbi->s_dirs_counter);
percpu_counter_destroy(&sbi->s_dirtyclusters_counter);
- percpu_free_rwsem(&sbi->s_journal_flag_rwsem);
+ percpu_free_rwsem(&sbi->s_writepages_rwsem);
#ifdef CONFIG_QUOTA
for (i = 0; i < EXT4_MAXQUOTAS; i++)
kfree(get_qf_name(sb, sbi, i));
@@ -2380,8 +2390,8 @@ done:
int ext4_alloc_flex_bg_array(struct super_block *sb, ext4_group_t ngroup)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
- struct flex_groups *new_groups;
- int size;
+ struct flex_groups **old_groups, **new_groups;
+ int size, i, j;
if (!sbi->s_log_groups_per_flex)
return 0;
@@ -2390,22 +2400,37 @@ int ext4_alloc_flex_bg_array(struct super_block *sb, ext4_group_t ngroup)
if (size <= sbi->s_flex_groups_allocated)
return 0;
- size = roundup_pow_of_two(size * sizeof(struct flex_groups));
- new_groups = kvzalloc(size, GFP_KERNEL);
+ new_groups = kvzalloc(roundup_pow_of_two(size *
+ sizeof(*sbi->s_flex_groups)), GFP_KERNEL);
if (!new_groups) {
- ext4_msg(sb, KERN_ERR, "not enough memory for %d flex groups",
- size / (int) sizeof(struct flex_groups));
+ ext4_msg(sb, KERN_ERR,
+ "not enough memory for %d flex group pointers", size);
return -ENOMEM;
}
-
- if (sbi->s_flex_groups) {
- memcpy(new_groups, sbi->s_flex_groups,
- (sbi->s_flex_groups_allocated *
- sizeof(struct flex_groups)));
- kvfree(sbi->s_flex_groups);
+ for (i = sbi->s_flex_groups_allocated; i < size; i++) {
+ new_groups[i] = kvzalloc(roundup_pow_of_two(
+ sizeof(struct flex_groups)),
+ GFP_KERNEL);
+ if (!new_groups[i]) {
+ for (j = sbi->s_flex_groups_allocated; j < i; j++)
+ kvfree(new_groups[j]);
+ kvfree(new_groups);
+ ext4_msg(sb, KERN_ERR,
+ "not enough memory for %d flex groups", size);
+ return -ENOMEM;
+ }
}
- sbi->s_flex_groups = new_groups;
- sbi->s_flex_groups_allocated = size / sizeof(struct flex_groups);
+ rcu_read_lock();
+ old_groups = rcu_dereference(sbi->s_flex_groups);
+ if (old_groups)
+ memcpy(new_groups, old_groups,
+ (sbi->s_flex_groups_allocated *
+ sizeof(struct flex_groups *)));
+ rcu_read_unlock();
+ rcu_assign_pointer(sbi->s_flex_groups, new_groups);
+ sbi->s_flex_groups_allocated = size;
+ if (old_groups)
+ ext4_kvfree_array_rcu(old_groups);
return 0;
}
@@ -2413,6 +2438,7 @@ static int ext4_fill_flex_info(struct super_block *sb)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct ext4_group_desc *gdp = NULL;
+ struct flex_groups *fg;
ext4_group_t flex_group;
int i, err;
@@ -2430,12 +2456,11 @@ static int ext4_fill_flex_info(struct super_block *sb)
gdp = ext4_get_group_desc(sb, i, NULL);
flex_group = ext4_flex_group(sbi, i);
- atomic_add(ext4_free_inodes_count(sb, gdp),
- &sbi->s_flex_groups[flex_group].free_inodes);
+ fg = sbi_array_rcu_deref(sbi, s_flex_groups, flex_group);
+ atomic_add(ext4_free_inodes_count(sb, gdp), &fg->free_inodes);
atomic64_add(ext4_free_group_clusters(sb, gdp),
- &sbi->s_flex_groups[flex_group].free_clusters);
- atomic_add(ext4_used_dirs_count(sb, gdp),
- &sbi->s_flex_groups[flex_group].used_dirs);
+ &fg->free_clusters);
+ atomic_add(ext4_used_dirs_count(sb, gdp), &fg->used_dirs);
}
return 1;
@@ -3009,17 +3034,11 @@ static int ext4_feature_set_ok(struct super_block *sb, int readonly)
return 0;
}
-#ifndef CONFIG_QUOTA
- if (ext4_has_feature_quota(sb) && !readonly) {
- ext4_msg(sb, KERN_ERR,
- "Filesystem with quota feature cannot be mounted RDWR "
- "without CONFIG_QUOTA");
- return 0;
- }
- if (ext4_has_feature_project(sb) && !readonly) {
+#if !IS_ENABLED(CONFIG_QUOTA) || !IS_ENABLED(CONFIG_QFMT_V2)
+ if (!readonly && (ext4_has_feature_quota(sb) ||
+ ext4_has_feature_project(sb))) {
ext4_msg(sb, KERN_ERR,
- "Filesystem with project quota feature cannot be mounted RDWR "
- "without CONFIG_QUOTA");
+ "The kernel was not built with CONFIG_QUOTA and CONFIG_QFMT_V2");
return 0;
}
#endif /* CONFIG_QUOTA */
@@ -3640,9 +3659,10 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
{
struct dax_device *dax_dev = fs_dax_get_by_bdev(sb->s_bdev);
char *orig_data = kstrdup(data, GFP_KERNEL);
- struct buffer_head *bh;
+ struct buffer_head *bh, **group_desc;
struct ext4_super_block *es = NULL;
struct ext4_sb_info *sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
+ struct flex_groups **flex_groups;
ext4_fsblk_t block;
ext4_fsblk_t sb_block = get_sb_block(&data);
ext4_fsblk_t logical_sb_block;
@@ -3814,6 +3834,15 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
*/
sbi->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT;
+ blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size);
+ if (blocksize < EXT4_MIN_BLOCK_SIZE ||
+ blocksize > EXT4_MAX_BLOCK_SIZE) {
+ ext4_msg(sb, KERN_ERR,
+ "Unsupported filesystem blocksize %d (%d log_block_size)",
+ blocksize, le32_to_cpu(es->s_log_block_size));
+ goto failed_mount;
+ }
+
if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV) {
sbi->s_inode_size = EXT4_GOOD_OLD_INODE_SIZE;
sbi->s_first_ino = EXT4_GOOD_OLD_FIRST_INO;
@@ -3831,6 +3860,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
ext4_msg(sb, KERN_ERR,
"unsupported inode size: %d",
sbi->s_inode_size);
+ ext4_msg(sb, KERN_ERR, "blocksize: %d", blocksize);
goto failed_mount;
}
/*
@@ -4033,14 +4063,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
if (!ext4_feature_set_ok(sb, (sb_rdonly(sb))))
goto failed_mount;
- blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size);
- if (blocksize < EXT4_MIN_BLOCK_SIZE ||
- blocksize > EXT4_MAX_BLOCK_SIZE) {
- ext4_msg(sb, KERN_ERR,
- "Unsupported filesystem blocksize %d (%d log_block_size)",
- blocksize, le32_to_cpu(es->s_log_block_size));
- goto failed_mount;
- }
if (le32_to_cpu(es->s_log_block_size) >
(EXT4_MAX_BLOCK_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE)) {
ext4_msg(sb, KERN_ERR,
@@ -4294,9 +4316,10 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
goto failed_mount;
}
}
- sbi->s_group_desc = kvmalloc_array(db_count,
- sizeof(struct buffer_head *),
- GFP_KERNEL);
+ rcu_assign_pointer(sbi->s_group_desc,
+ kvmalloc_array(db_count,
+ sizeof(struct buffer_head *),
+ GFP_KERNEL));
if (sbi->s_group_desc == NULL) {
ext4_msg(sb, KERN_ERR, "not enough memory");
ret = -ENOMEM;
@@ -4312,14 +4335,19 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
}
for (i = 0; i < db_count; i++) {
+ struct buffer_head *bh;
+
block = descriptor_loc(sb, logical_sb_block, i);
- sbi->s_group_desc[i] = sb_bread_unmovable(sb, block);
- if (!sbi->s_group_desc[i]) {
+ bh = sb_bread_unmovable(sb, block);
+ if (!bh) {
ext4_msg(sb, KERN_ERR,
"can't read group descriptor %d", i);
db_count = i;
goto failed_mount2;
}
+ rcu_read_lock();
+ rcu_dereference(sbi->s_group_desc)[i] = bh;
+ rcu_read_unlock();
}
sbi->s_gdb_count = db_count;
if (!ext4_check_descriptors(sb, logical_sb_block, &first_not_zeroed)) {
@@ -4598,7 +4626,7 @@ no_journal:
err = percpu_counter_init(&sbi->s_dirtyclusters_counter, 0,
GFP_KERNEL);
if (!err)
- err = percpu_init_rwsem(&sbi->s_journal_flag_rwsem);
+ err = percpu_init_rwsem(&sbi->s_writepages_rwsem);
if (err) {
ext4_msg(sb, KERN_ERR, "insufficient memory");
@@ -4686,13 +4714,19 @@ failed_mount7:
ext4_unregister_li_request(sb);
failed_mount6:
ext4_mb_release(sb);
- if (sbi->s_flex_groups)
- kvfree(sbi->s_flex_groups);
+ rcu_read_lock();
+ flex_groups = rcu_dereference(sbi->s_flex_groups);
+ if (flex_groups) {
+ for (i = 0; i < sbi->s_flex_groups_allocated; i++)
+ kvfree(flex_groups[i]);
+ kvfree(flex_groups);
+ }
+ rcu_read_unlock();
percpu_counter_destroy(&sbi->s_freeclusters_counter);
percpu_counter_destroy(&sbi->s_freeinodes_counter);
percpu_counter_destroy(&sbi->s_dirs_counter);
percpu_counter_destroy(&sbi->s_dirtyclusters_counter);
- percpu_free_rwsem(&sbi->s_journal_flag_rwsem);
+ percpu_free_rwsem(&sbi->s_writepages_rwsem);
failed_mount5:
ext4_ext_release(sb);
ext4_release_system_zone(sb);
@@ -4721,9 +4755,12 @@ failed_mount3:
if (sbi->s_mmp_tsk)
kthread_stop(sbi->s_mmp_tsk);
failed_mount2:
+ rcu_read_lock();
+ group_desc = rcu_dereference(sbi->s_group_desc);
for (i = 0; i < db_count; i++)
- brelse(sbi->s_group_desc[i]);
- kvfree(sbi->s_group_desc);
+ brelse(group_desc[i]);
+ kvfree(group_desc);
+ rcu_read_unlock();
failed_mount:
if (sbi->s_chksum_driver)
crypto_free_shash(sbi->s_chksum_driver);
@@ -5585,10 +5622,7 @@ static int ext4_statfs_project(struct super_block *sb,
return PTR_ERR(dquot);
spin_lock(&dquot->dq_dqb_lock);
- limit = 0;
- if (dquot->dq_dqb.dqb_bsoftlimit &&
- (!limit || dquot->dq_dqb.dqb_bsoftlimit < limit))
- limit = dquot->dq_dqb.dqb_bsoftlimit;
+ limit = dquot->dq_dqb.dqb_bsoftlimit;
if (dquot->dq_dqb.dqb_bhardlimit &&
(!limit || dquot->dq_dqb.dqb_bhardlimit < limit))
limit = dquot->dq_dqb.dqb_bhardlimit;
@@ -5603,10 +5637,7 @@ static int ext4_statfs_project(struct super_block *sb,
(buf->f_blocks - curblock) : 0;
}
- limit = 0;
- if (dquot->dq_dqb.dqb_isoftlimit &&
- (!limit || dquot->dq_dqb.dqb_isoftlimit < limit))
- limit = dquot->dq_dqb.dqb_isoftlimit;
+ limit = dquot->dq_dqb.dqb_isoftlimit;
if (dquot->dq_dqb.dqb_ihardlimit &&
(!limit || dquot->dq_dqb.dqb_ihardlimit < limit))
limit = dquot->dq_dqb.dqb_ihardlimit;
diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c
index d218ebdafa4a..04bfaf63752c 100644
--- a/fs/ext4/sysfs.c
+++ b/fs/ext4/sysfs.c
@@ -13,6 +13,7 @@
#include <linux/seq_file.h>
#include <linux/slab.h>
#include <linux/proc_fs.h>
+#include <linux/part_stat.h>
#include "ext4.h"
#include "ext4_jbd2.h"
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 5355be6b6755..088c3e7a1080 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -22,6 +22,7 @@
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/quotaops.h>
+#include <linux/part_stat.h>
#include <crypto/hash.h>
#include <linux/fscrypt.h>
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 0d4da644df3b..351762f77840 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -2423,6 +2423,14 @@ static int f2fs_ioc_get_encryption_key_status(struct file *filp,
return fscrypt_ioctl_get_key_status(filp, (void __user *)arg);
}
+static int f2fs_ioc_get_encryption_nonce(struct file *filp, unsigned long arg)
+{
+ if (!f2fs_sb_has_encrypt(F2FS_I_SB(file_inode(filp))))
+ return -EOPNOTSUPP;
+
+ return fscrypt_ioctl_get_nonce(filp, (void __user *)arg);
+}
+
static int f2fs_ioc_gc(struct file *filp, unsigned long arg)
{
struct inode *inode = file_inode(filp);
@@ -3437,6 +3445,8 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
return f2fs_ioc_remove_encryption_key_all_users(filp, arg);
case FS_IOC_GET_ENCRYPTION_KEY_STATUS:
return f2fs_ioc_get_encryption_key_status(filp, arg);
+ case FS_IOC_GET_ENCRYPTION_NONCE:
+ return f2fs_ioc_get_encryption_nonce(filp, arg);
case F2FS_IOC_GARBAGE_COLLECT:
return f2fs_ioc_gc(filp, arg);
case F2FS_IOC_GARBAGE_COLLECT_RANGE:
@@ -3611,6 +3621,7 @@ long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
case FS_IOC_REMOVE_ENCRYPTION_KEY:
case FS_IOC_REMOVE_ENCRYPTION_KEY_ALL_USERS:
case FS_IOC_GET_ENCRYPTION_KEY_STATUS:
+ case FS_IOC_GET_ENCRYPTION_NONCE:
case F2FS_IOC_GARBAGE_COLLECT:
case F2FS_IOC_GARBAGE_COLLECT_RANGE:
case F2FS_IOC_WRITE_CHECKPOINT:
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 65a7a432dfee..d398b2d90c6c 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -24,6 +24,7 @@
#include <linux/sysfs.h>
#include <linux/quota.h>
#include <linux/unicode.h>
+#include <linux/part_stat.h>
#include "f2fs.h"
#include "node.h"
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 594b05ae16c9..71946da84388 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -750,6 +750,13 @@ static struct inode *fat_alloc_inode(struct super_block *sb)
return NULL;
init_rwsem(&ei->truncate_lock);
+ /* Zeroing to allow iput() even if partial initialized inode. */
+ ei->mmu_private = 0;
+ ei->i_start = 0;
+ ei->i_logstart = 0;
+ ei->i_attrs = 0;
+ ei->i_pos = 0;
+
return &ei->vfs_inode;
}
@@ -1374,16 +1381,6 @@ out:
return 0;
}
-static void fat_dummy_inode_init(struct inode *inode)
-{
- /* Initialize this dummy inode to work as no-op. */
- MSDOS_I(inode)->mmu_private = 0;
- MSDOS_I(inode)->i_start = 0;
- MSDOS_I(inode)->i_logstart = 0;
- MSDOS_I(inode)->i_attrs = 0;
- MSDOS_I(inode)->i_pos = 0;
-}
-
static int fat_read_root(struct inode *inode)
{
struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb);
@@ -1844,13 +1841,11 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat,
fat_inode = new_inode(sb);
if (!fat_inode)
goto out_fail;
- fat_dummy_inode_init(fat_inode);
sbi->fat_inode = fat_inode;
fsinfo_inode = new_inode(sb);
if (!fsinfo_inode)
goto out_fail;
- fat_dummy_inode_init(fsinfo_inode);
fsinfo_inode->i_ino = MSDOS_FSINFO_INO;
sbi->fsinfo_inode = fsinfo_inode;
insert_inode_hash(fsinfo_inode);
diff --git a/fs/fcntl.c b/fs/fcntl.c
index 9bc167562ee8..2e4c0fa2074b 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -735,8 +735,9 @@ static void send_sigio_to_task(struct task_struct *p,
return;
switch (signum) {
- kernel_siginfo_t si;
- default:
+ default: {
+ kernel_siginfo_t si;
+
/* Queue a rt signal with the appropriate fd as its
value. We use SI_SIGIO as the source, not
SI_KERNEL, since kernel signals always get
@@ -769,6 +770,7 @@ static void send_sigio_to_task(struct task_struct *p,
si.si_fd = fd;
if (!do_send_sig_info(signum, &si, p, type))
break;
+ }
/* fall-through - fall back on the old plain SIGIO signal */
case 0:
do_send_sig_info(SIGIO, SEND_SIG_PRIV, p, type);
diff --git a/fs/file.c b/fs/file.c
index a364e1a9b7e8..c8a4e4c86e55 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -540,9 +540,14 @@ static int alloc_fd(unsigned start, unsigned flags)
return __alloc_fd(current->files, start, rlimit(RLIMIT_NOFILE), flags);
}
+int __get_unused_fd_flags(unsigned flags, unsigned long nofile)
+{
+ return __alloc_fd(current->files, 0, nofile, flags);
+}
+
int get_unused_fd_flags(unsigned flags)
{
- return __alloc_fd(current->files, 0, rlimit(RLIMIT_NOFILE), flags);
+ return __get_unused_fd_flags(flags, rlimit(RLIMIT_NOFILE));
}
EXPORT_SYMBOL(get_unused_fd_flags);
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 8e02d76fe104..97eec7522bf2 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -276,12 +276,10 @@ static void flush_bg_queue(struct fuse_conn *fc)
void fuse_request_end(struct fuse_conn *fc, struct fuse_req *req)
{
struct fuse_iqueue *fiq = &fc->iq;
- bool async;
if (test_and_set_bit(FR_FINISHED, &req->flags))
goto put_request;
- async = req->args->end;
/*
* test_and_set_bit() implies smp_mb() between bit
* changing and below intr_entry check. Pairs with
@@ -324,7 +322,7 @@ void fuse_request_end(struct fuse_conn *fc, struct fuse_req *req)
wake_up(&req->waitq);
}
- if (async)
+ if (test_bit(FR_ASYNC, &req->flags))
req->args->end(fc, req->args, req->out.h.error);
put_request:
fuse_put_request(fc, req);
@@ -471,6 +469,8 @@ static void fuse_args_to_req(struct fuse_req *req, struct fuse_args *args)
req->in.h.opcode = args->opcode;
req->in.h.nodeid = args->nodeid;
req->args = args;
+ if (args->end)
+ __set_bit(FR_ASYNC, &req->flags);
}
ssize_t fuse_simple_request(struct fuse_conn *fc, struct fuse_args *args)
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index aa75e2305b75..ca344bf71404 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -301,6 +301,7 @@ struct fuse_io_priv {
* FR_SENT: request is in userspace, waiting for an answer
* FR_FINISHED: request is finished
* FR_PRIVATE: request is on private list
+ * FR_ASYNC: request is asynchronous
*/
enum fuse_req_flag {
FR_ISREPLY,
@@ -314,6 +315,7 @@ enum fuse_req_flag {
FR_SENT,
FR_FINISHED,
FR_PRIVATE,
+ FR_ASYNC,
};
/**
diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c
index 09e6be8aa036..2e939f5fe751 100644
--- a/fs/gfs2/acl.c
+++ b/fs/gfs2/acl.c
@@ -21,6 +21,7 @@
#include "glock.h"
#include "inode.h"
#include "meta_io.h"
+#include "quota.h"
#include "rgrp.h"
#include "trans.h"
#include "util.h"
@@ -116,14 +117,14 @@ int gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type)
if (acl && acl->a_count > GFS2_ACL_MAX_ENTRIES(GFS2_SB(inode)))
return -E2BIG;
- ret = gfs2_rsqa_alloc(ip);
+ ret = gfs2_qa_get(ip);
if (ret)
return ret;
if (!gfs2_glock_is_locked_by_me(ip->i_gl)) {
ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
if (ret)
- return ret;
+ goto out;
need_unlock = true;
}
@@ -143,5 +144,7 @@ int gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type)
unlock:
if (need_unlock)
gfs2_glock_dq_uninit(&gh);
+out:
+ gfs2_qa_put(ip);
return ret;
}
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index ba83b49ce18c..786c1ce8f030 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -805,11 +805,16 @@ int gfs2_releasepage(struct page *page, gfp_t gfp_mask)
bd = bh->b_private;
if (bd) {
gfs2_assert_warn(sdp, bd->bd_bh == bh);
- if (!list_empty(&bd->bd_list))
- list_del_init(&bd->bd_list);
bd->bd_bh = NULL;
bh->b_private = NULL;
- kmem_cache_free(gfs2_bufdata_cachep, bd);
+ /*
+ * The bd may still be queued as a revoke, in which
+ * case we must neither dequeue nor free it.
+ */
+ if (!bd->bd_blkno && !list_empty(&bd->bd_list))
+ list_del_init(&bd->bd_list);
+ if (list_empty(&bd->bd_list))
+ kmem_cache_free(gfs2_bufdata_cachep, bd);
}
bh = bh->b_this_page;
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 08f6fbb3655e..936a8ec6b48e 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -2183,7 +2183,7 @@ int gfs2_setattr_size(struct inode *inode, u64 newsize)
inode_dio_wait(inode);
- ret = gfs2_rsqa_alloc(ip);
+ ret = gfs2_qa_get(ip);
if (ret)
goto out;
@@ -2194,7 +2194,8 @@ int gfs2_setattr_size(struct inode *inode, u64 newsize)
ret = do_shrink(inode, newsize);
out:
- gfs2_rsqa_delete(ip, NULL);
+ gfs2_rs_delete(ip, NULL);
+ gfs2_qa_put(ip);
return ret;
}
@@ -2223,7 +2224,7 @@ void gfs2_free_journal_extents(struct gfs2_jdesc *jd)
struct gfs2_journal_extent *jext;
while(!list_empty(&jd->extent_list)) {
- jext = list_entry(jd->extent_list.next, struct gfs2_journal_extent, list);
+ jext = list_first_entry(&jd->extent_list, struct gfs2_journal_extent, list);
list_del(&jext->list);
kfree(jext);
}
@@ -2244,7 +2245,7 @@ static int gfs2_add_jextent(struct gfs2_jdesc *jd, u64 lblock, u64 dblock, u64 b
struct gfs2_journal_extent *jext;
if (!list_empty(&jd->extent_list)) {
- jext = list_entry(jd->extent_list.prev, struct gfs2_journal_extent, list);
+ jext = list_last_entry(&jd->extent_list, struct gfs2_journal_extent, list);
if ((jext->dblock + jext->blocks) == dblock) {
jext->blocks += blocks;
return 0;
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index c8b62577e2f2..c3f7732415be 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -2028,7 +2028,8 @@ static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len,
error = gfs2_trans_begin(sdp,
rg_blocks + (DIV_ROUND_UP(size, sdp->sd_jbsize) + 1) +
- RES_DINODE + RES_STATFS + RES_QUOTA, l_blocks);
+ RES_DINODE + RES_STATFS + RES_QUOTA, RES_DINODE +
+ l_blocks);
if (error)
goto out_rg_gunlock;
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index cb26be6f4351..fe305e4bfd37 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -458,10 +458,6 @@ static vm_fault_t gfs2_page_mkwrite(struct vm_fault *vmf)
sb_start_pagefault(inode->i_sb);
- ret = gfs2_rsqa_alloc(ip);
- if (ret)
- goto out;
-
gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
ret = gfs2_glock_nq(&gh);
if (ret)
@@ -558,7 +554,6 @@ out_uninit:
set_page_dirty(page);
wait_for_stable_page(page);
}
-out:
sb_end_pagefault(inode->i_sb);
return block_page_mkwrite_return(ret);
}
@@ -635,7 +630,17 @@ int gfs2_open_common(struct inode *inode, struct file *file)
gfs2_assert_warn(GFS2_SB(inode), !file->private_data);
file->private_data = fp;
+ if (file->f_mode & FMODE_WRITE) {
+ ret = gfs2_qa_get(GFS2_I(inode));
+ if (ret)
+ goto fail;
+ }
return 0;
+
+fail:
+ kfree(file->private_data);
+ file->private_data = NULL;
+ return ret;
}
/**
@@ -690,10 +695,10 @@ static int gfs2_release(struct inode *inode, struct file *file)
kfree(file->private_data);
file->private_data = NULL;
- if (!(file->f_mode & FMODE_WRITE))
- return 0;
-
- gfs2_rsqa_delete(ip, &inode->i_writecount);
+ if (file->f_mode & FMODE_WRITE) {
+ gfs2_rs_delete(ip, &inode->i_writecount);
+ gfs2_qa_put(ip);
+ }
return 0;
}
@@ -849,10 +854,6 @@ static ssize_t gfs2_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
struct gfs2_inode *ip = GFS2_I(inode);
ssize_t ret;
- ret = gfs2_rsqa_alloc(ip);
- if (ret)
- return ret;
-
gfs2_size_hint(file, iocb->ki_pos, iov_iter_count(from));
if (iocb->ki_flags & IOCB_APPEND) {
@@ -1149,17 +1150,11 @@ static long gfs2_fallocate(struct file *file, int mode, loff_t offset, loff_t le
if (mode & FALLOC_FL_PUNCH_HOLE) {
ret = __gfs2_punch_hole(file, offset, len);
} else {
- ret = gfs2_rsqa_alloc(ip);
- if (ret)
- goto out_putw;
-
ret = __gfs2_fallocate(file, mode, offset, len);
-
if (ret)
gfs2_rs_deltree(&ip->i_res);
}
-out_putw:
put_write_access(inode);
out_unlock:
gfs2_glock_dq(&gh);
@@ -1173,16 +1168,12 @@ static ssize_t gfs2_file_splice_write(struct pipe_inode_info *pipe,
struct file *out, loff_t *ppos,
size_t len, unsigned int flags)
{
- int error;
- struct gfs2_inode *ip = GFS2_I(out->f_mapping->host);
-
- error = gfs2_rsqa_alloc(ip);
- if (error)
- return (ssize_t)error;
+ ssize_t ret;
gfs2_size_hint(out, *ppos, len);
- return iter_file_splice_write(pipe, out, ppos, len, flags);
+ ret = iter_file_splice_write(pipe, out, ppos, len, flags);
+ return ret;
}
#ifdef CONFIG_GFS2_FS_LOCKING_DLM
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index d0eceaff3cea..29f9b6684b74 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -133,6 +133,33 @@ static void gfs2_glock_dealloc(struct rcu_head *rcu)
}
}
+/**
+ * glock_blocked_by_withdraw - determine if we can still use a glock
+ * @gl: the glock
+ *
+ * We need to allow some glocks to be enqueued, dequeued, promoted, and demoted
+ * when we're withdrawn. For example, to maintain metadata integrity, we should
+ * disallow the use of inode and rgrp glocks when withdrawn. Other glocks, like
+ * iopen or the transaction glocks may be safely used because none of their
+ * metadata goes through the journal. So in general, we should disallow all
+ * glocks that are journaled, and allow all the others. One exception is:
+ * we need to allow our active journal to be promoted and demoted so others
+ * may recover it and we can reacquire it when they're done.
+ */
+static bool glock_blocked_by_withdraw(struct gfs2_glock *gl)
+{
+ struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
+
+ if (likely(!gfs2_withdrawn(sdp)))
+ return false;
+ if (gl->gl_ops->go_flags & GLOF_NONDISK)
+ return false;
+ if (!sdp->sd_jdesc ||
+ gl->gl_name.ln_number == sdp->sd_jdesc->jd_no_addr)
+ return false;
+ return true;
+}
+
void gfs2_glock_free(struct gfs2_glock *gl)
{
struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
@@ -244,7 +271,7 @@ static void __gfs2_glock_put(struct gfs2_glock *gl)
gfs2_glock_remove_from_lru(gl);
spin_unlock(&gl->gl_lockref.lock);
GLOCK_BUG_ON(gl, !list_empty(&gl->gl_holders));
- GLOCK_BUG_ON(gl, mapping && mapping->nrpages);
+ GLOCK_BUG_ON(gl, mapping && mapping->nrpages && !gfs2_withdrawn(sdp));
trace_gfs2_glock_put(gl);
sdp->sd_lockstruct.ls_ops->lm_put_lock(gl);
}
@@ -281,7 +308,7 @@ void gfs2_glock_put(struct gfs2_glock *gl)
static inline int may_grant(const struct gfs2_glock *gl, const struct gfs2_holder *gh)
{
- const struct gfs2_holder *gh_head = list_entry(gl->gl_holders.next, const struct gfs2_holder, gh_list);
+ const struct gfs2_holder *gh_head = list_first_entry(&gl->gl_holders, const struct gfs2_holder, gh_list);
if ((gh->gh_state == LM_ST_EXCLUSIVE ||
gh_head->gh_state == LM_ST_EXCLUSIVE) && gh != gh_head)
return 0;
@@ -549,8 +576,8 @@ __acquires(&gl->gl_lockref.lock)
unsigned int lck_flags = (unsigned int)(gh ? gh->gh_flags : 0);
int ret;
- if (unlikely(gfs2_withdrawn(sdp)) &&
- target != LM_ST_UNLOCKED)
+ if (target != LM_ST_UNLOCKED && glock_blocked_by_withdraw(gl) &&
+ gh && !(gh->gh_flags & LM_FLAG_NOEXP))
return;
lck_flags &= (LM_FLAG_TRY | LM_FLAG_TRY_1CB | LM_FLAG_NOEXP |
LM_FLAG_PRIORITY);
@@ -575,13 +602,64 @@ __acquires(&gl->gl_lockref.lock)
(lck_flags & (LM_FLAG_TRY|LM_FLAG_TRY_1CB)))
clear_bit(GLF_BLOCKING, &gl->gl_flags);
spin_unlock(&gl->gl_lockref.lock);
- if (glops->go_sync)
- glops->go_sync(gl);
- if (test_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags))
+ if (glops->go_sync) {
+ ret = glops->go_sync(gl);
+ /* If we had a problem syncing (due to io errors or whatever),
+ * we should not invalidate the metadata or tell dlm to
+ * release the glock to other nodes.
+ */
+ if (ret) {
+ if (cmpxchg(&sdp->sd_log_error, 0, ret)) {
+ fs_err(sdp, "Error %d syncing glock \n", ret);
+ gfs2_dump_glock(NULL, gl, true);
+ }
+ return;
+ }
+ }
+ if (test_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags)) {
+ /*
+ * The call to go_sync should have cleared out the ail list.
+ * If there are still items, we have a problem. We ought to
+ * withdraw, but we can't because the withdraw code also uses
+ * glocks. Warn about the error, dump the glock, then fall
+ * through and wait for logd to do the withdraw for us.
+ */
+ if ((atomic_read(&gl->gl_ail_count) != 0) &&
+ (!cmpxchg(&sdp->sd_log_error, 0, -EIO))) {
+ gfs2_assert_warn(sdp, !atomic_read(&gl->gl_ail_count));
+ gfs2_dump_glock(NULL, gl, true);
+ }
glops->go_inval(gl, target == LM_ST_DEFERRED ? 0 : DIO_METADATA);
- clear_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags);
+ clear_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags);
+ }
gfs2_glock_hold(gl);
+ /*
+ * Check for an error encountered since we called go_sync and go_inval.
+ * If so, we can't withdraw from the glock code because the withdraw
+ * code itself uses glocks (see function signal_our_withdraw) to
+ * change the mount to read-only. Most importantly, we must not call
+ * dlm to unlock the glock until the journal is in a known good state
+ * (after journal replay) otherwise other nodes may use the object
+ * (rgrp or dinode) and then later, journal replay will corrupt the
+ * file system. The best we can do here is wait for the logd daemon
+ * to see sd_log_error and withdraw, and in the meantime, requeue the
+ * work for later.
+ *
+ * However, if we're just unlocking the lock (say, for unmount, when
+ * gfs2_gl_hash_clear calls clear_glock) and recovery is complete
+ * then it's okay to tell dlm to unlock it.
+ */
+ if (unlikely(sdp->sd_log_error && !gfs2_withdrawn(sdp)))
+ gfs2_withdraw_delayed(sdp);
+ if (glock_blocked_by_withdraw(gl)) {
+ if (target != LM_ST_UNLOCKED ||
+ test_bit(SDF_WITHDRAW_RECOVERY, &sdp->sd_flags)) {
+ gfs2_glock_queue_work(gl, GL_GLOCK_DFT_HOLD);
+ goto out;
+ }
+ }
+
if (sdp->sd_lockstruct.ls_ops->lm_lock) {
/* lock_dlm */
ret = sdp->sd_lockstruct.ls_ops->lm_lock(gl, target, lck_flags);
@@ -590,8 +668,7 @@ __acquires(&gl->gl_lockref.lock)
test_bit(SDF_SKIP_DLM_UNLOCK, &sdp->sd_flags)) {
finish_xmote(gl, target);
gfs2_glock_queue_work(gl, 0);
- }
- else if (ret) {
+ } else if (ret) {
fs_err(sdp, "lm_lock ret %d\n", ret);
GLOCK_BUG_ON(gl, !gfs2_withdrawn(sdp));
}
@@ -599,7 +676,7 @@ __acquires(&gl->gl_lockref.lock)
finish_xmote(gl, target);
gfs2_glock_queue_work(gl, 0);
}
-
+out:
spin_lock(&gl->gl_lockref.lock);
}
@@ -613,7 +690,7 @@ static inline struct gfs2_holder *find_first_holder(const struct gfs2_glock *gl)
struct gfs2_holder *gh;
if (!list_empty(&gl->gl_holders)) {
- gh = list_entry(gl->gl_holders.next, struct gfs2_holder, gh_list);
+ gh = list_first_entry(&gl->gl_holders, struct gfs2_holder, gh_list);
if (test_bit(HIF_HOLDER, &gh->gh_iflags))
return gh;
}
@@ -645,6 +722,9 @@ __acquires(&gl->gl_lockref.lock)
goto out_unlock;
if (nonblock)
goto out_sched;
+ smp_mb();
+ if (atomic_read(&gl->gl_revokes) != 0)
+ goto out_sched;
set_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags);
GLOCK_BUG_ON(gl, gl->gl_demote_state == LM_ST_EXCLUSIVE);
gl->gl_target = gl->gl_demote_state;
@@ -1160,7 +1240,7 @@ fail:
}
list_add_tail(&gh->gh_list, insert_pt);
do_cancel:
- gh = list_entry(gl->gl_holders.next, struct gfs2_holder, gh_list);
+ gh = list_first_entry(&gl->gl_holders, struct gfs2_holder, gh_list);
if (!(gh->gh_flags & LM_FLAG_PRIORITY)) {
spin_unlock(&gl->gl_lockref.lock);
if (sdp->sd_lockstruct.ls_ops->lm_cancel)
@@ -1194,10 +1274,9 @@ trap_recursive:
int gfs2_glock_nq(struct gfs2_holder *gh)
{
struct gfs2_glock *gl = gh->gh_gl;
- struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
int error = 0;
- if (unlikely(gfs2_withdrawn(sdp)))
+ if (glock_blocked_by_withdraw(gl) && !(gh->gh_flags & LM_FLAG_NOEXP))
return -EIO;
if (test_bit(GLF_LRU, &gl->gl_flags))
@@ -1241,24 +1320,32 @@ int gfs2_glock_poll(struct gfs2_holder *gh)
void gfs2_glock_dq(struct gfs2_holder *gh)
{
struct gfs2_glock *gl = gh->gh_gl;
- const struct gfs2_glock_operations *glops = gl->gl_ops;
+ struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
unsigned delay = 0;
int fast_path = 0;
spin_lock(&gl->gl_lockref.lock);
+ /*
+ * If we're in the process of file system withdraw, we cannot just
+ * dequeue any glocks until our journal is recovered, lest we
+ * introduce file system corruption. We need two exceptions to this
+ * rule: We need to allow unlocking of nondisk glocks and the glock
+ * for our own journal that needs recovery.
+ */
+ if (test_bit(SDF_WITHDRAW_RECOVERY, &sdp->sd_flags) &&
+ glock_blocked_by_withdraw(gl) &&
+ gh->gh_gl != sdp->sd_jinode_gl) {
+ sdp->sd_glock_dqs_held++;
+ might_sleep();
+ wait_on_bit(&sdp->sd_flags, SDF_WITHDRAW_RECOVERY,
+ TASK_UNINTERRUPTIBLE);
+ }
if (gh->gh_flags & GL_NOCACHE)
handle_callback(gl, LM_ST_UNLOCKED, 0, false);
list_del_init(&gh->gh_list);
clear_bit(HIF_HOLDER, &gh->gh_iflags);
if (find_first_holder(gl) == NULL) {
- if (glops->go_unlock) {
- GLOCK_BUG_ON(gl, test_and_set_bit(GLF_LOCK, &gl->gl_flags));
- spin_unlock(&gl->gl_lockref.lock);
- glops->go_unlock(gh);
- spin_lock(&gl->gl_lockref.lock);
- clear_bit(GLF_LOCK, &gl->gl_flags);
- }
if (list_empty(&gl->gl_holders) &&
!test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) &&
!test_bit(GLF_DEMOTE, &gl->gl_flags))
@@ -1555,7 +1642,7 @@ __acquires(&lru_lock)
list_sort(NULL, list, glock_cmp);
while(!list_empty(list)) {
- gl = list_entry(list->next, struct gfs2_glock, gl_lru);
+ gl = list_first_entry(list, struct gfs2_glock, gl_lru);
list_del_init(&gl->gl_lru);
if (!spin_trylock(&gl->gl_lockref.lock)) {
add_back_to_lru:
@@ -1596,7 +1683,7 @@ static long gfs2_scan_glock_lru(int nr)
spin_lock(&lru_lock);
while ((nr-- >= 0) && !list_empty(&lru_list)) {
- gl = list_entry(lru_list.next, struct gfs2_glock, gl_lru);
+ gl = list_first_entry(&lru_list, struct gfs2_glock, gl_lru);
/* Test for being demotable */
if (!test_bit(GLF_LOCK, &gl->gl_flags)) {
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 061d22e1ceb6..9e9c7a4b8c66 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -29,6 +29,8 @@
struct workqueue_struct *gfs2_freeze_wq;
+extern struct workqueue_struct *gfs2_control_wq;
+
static void gfs2_ail_error(struct gfs2_glock *gl, const struct buffer_head *bh)
{
fs_err(gl->gl_name.ln_sbd,
@@ -39,7 +41,8 @@ static void gfs2_ail_error(struct gfs2_glock *gl, const struct buffer_head *bh)
fs_err(gl->gl_name.ln_sbd, "AIL glock %u:%llu mapping %p\n",
gl->gl_name.ln_type, gl->gl_name.ln_number,
gfs2_glock2aspace(gl));
- gfs2_lm_withdraw(gl->gl_name.ln_sbd, "AIL error\n");
+ gfs2_lm(gl->gl_name.ln_sbd, "AIL error\n");
+ gfs2_withdraw(gl->gl_name.ln_sbd);
}
/**
@@ -79,34 +82,62 @@ static void __gfs2_ail_flush(struct gfs2_glock *gl, bool fsync,
}
-static void gfs2_ail_empty_gl(struct gfs2_glock *gl)
+static int gfs2_ail_empty_gl(struct gfs2_glock *gl)
{
struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
struct gfs2_trans tr;
+ int ret;
memset(&tr, 0, sizeof(tr));
INIT_LIST_HEAD(&tr.tr_buf);
INIT_LIST_HEAD(&tr.tr_databuf);
tr.tr_revokes = atomic_read(&gl->gl_ail_count);
- if (!tr.tr_revokes)
- return;
+ if (!tr.tr_revokes) {
+ bool have_revokes;
+ bool log_in_flight;
+
+ /*
+ * We have nothing on the ail, but there could be revokes on
+ * the sdp revoke queue, in which case, we still want to flush
+ * the log and wait for it to finish.
+ *
+ * If the sdp revoke list is empty too, we might still have an
+ * io outstanding for writing revokes, so we should wait for
+ * it before returning.
+ *
+ * If none of these conditions are true, our revokes are all
+ * flushed and we can return.
+ */
+ gfs2_log_lock(sdp);
+ have_revokes = !list_empty(&sdp->sd_log_revokes);
+ log_in_flight = atomic_read(&sdp->sd_log_in_flight);
+ gfs2_log_unlock(sdp);
+ if (have_revokes)
+ goto flush;
+ if (log_in_flight)
+ log_flush_wait(sdp);
+ return 0;
+ }
/* A shortened, inline version of gfs2_trans_begin()
* tr->alloced is not set since the transaction structure is
* on the stack */
tr.tr_reserved = 1 + gfs2_struct2blk(sdp, tr.tr_revokes);
tr.tr_ip = _RET_IP_;
- if (gfs2_log_reserve(sdp, tr.tr_reserved) < 0)
- return;
+ ret = gfs2_log_reserve(sdp, tr.tr_reserved);
+ if (ret < 0)
+ return ret;
WARN_ON_ONCE(current->journal_info);
current->journal_info = &tr;
__gfs2_ail_flush(gl, 0, tr.tr_revokes);
gfs2_trans_end(sdp);
+flush:
gfs2_log_flush(sdp, NULL, GFS2_LOG_HEAD_FLUSH_NORMAL |
GFS2_LFC_AIL_EMPTY_GL);
+ return 0;
}
void gfs2_ail_flush(struct gfs2_glock *gl, bool fsync)
@@ -140,35 +171,32 @@ void gfs2_ail_flush(struct gfs2_glock *gl, bool fsync)
* return to caller to demote/unlock the glock until I/O is complete.
*/
-static void rgrp_go_sync(struct gfs2_glock *gl)
+static int rgrp_go_sync(struct gfs2_glock *gl)
{
struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
struct address_space *mapping = &sdp->sd_aspace;
- struct gfs2_rgrpd *rgd;
+ struct gfs2_rgrpd *rgd = gfs2_glock2rgrp(gl);
int error;
- spin_lock(&gl->gl_lockref.lock);
- rgd = gl->gl_object;
- if (rgd)
- gfs2_rgrp_brelse(rgd);
- spin_unlock(&gl->gl_lockref.lock);
-
if (!test_and_clear_bit(GLF_DIRTY, &gl->gl_flags))
- return;
+ return 0;
GLOCK_BUG_ON(gl, gl->gl_state != LM_ST_EXCLUSIVE);
gfs2_log_flush(sdp, gl, GFS2_LOG_HEAD_FLUSH_NORMAL |
GFS2_LFC_RGRP_GO_SYNC);
filemap_fdatawrite_range(mapping, gl->gl_vm.start, gl->gl_vm.end);
error = filemap_fdatawait_range(mapping, gl->gl_vm.start, gl->gl_vm.end);
+ WARN_ON_ONCE(error);
mapping_set_error(mapping, error);
- gfs2_ail_empty_gl(gl);
+ if (!error)
+ error = gfs2_ail_empty_gl(gl);
spin_lock(&gl->gl_lockref.lock);
rgd = gl->gl_object;
if (rgd)
gfs2_free_clones(rgd);
spin_unlock(&gl->gl_lockref.lock);
+ return error;
}
/**
@@ -191,7 +219,6 @@ static void rgrp_go_inval(struct gfs2_glock *gl, int flags)
gfs2_rgrp_brelse(rgd);
WARN_ON_ONCE(!(flags & DIO_METADATA));
- gfs2_assert_withdraw(sdp, !atomic_read(&gl->gl_ail_count));
truncate_inode_pages_range(mapping, gl->gl_vm.start, gl->gl_vm.end);
if (rgd)
@@ -236,12 +263,12 @@ static void gfs2_clear_glop_pending(struct gfs2_inode *ip)
*
*/
-static void inode_go_sync(struct gfs2_glock *gl)
+static int inode_go_sync(struct gfs2_glock *gl)
{
struct gfs2_inode *ip = gfs2_glock2inode(gl);
int isreg = ip && S_ISREG(ip->i_inode.i_mode);
struct address_space *metamapping = gfs2_glock2aspace(gl);
- int error;
+ int error = 0;
if (isreg) {
if (test_and_clear_bit(GIF_SW_PAGED, &ip->i_flags))
@@ -274,6 +301,7 @@ static void inode_go_sync(struct gfs2_glock *gl)
out:
gfs2_clear_glop_pending(ip);
+ return error;
}
/**
@@ -291,8 +319,6 @@ static void inode_go_inval(struct gfs2_glock *gl, int flags)
{
struct gfs2_inode *ip = gfs2_glock2inode(gl);
- gfs2_assert_withdraw(gl->gl_name.ln_sbd, !atomic_read(&gl->gl_ail_count));
-
if (flags & DIO_METADATA) {
struct address_space *mapping = gfs2_glock2aspace(gl);
truncate_inode_pages(mapping, 0);
@@ -496,24 +522,29 @@ static void inode_go_dump(struct seq_file *seq, struct gfs2_glock *gl,
*
*/
-static void freeze_go_sync(struct gfs2_glock *gl)
+static int freeze_go_sync(struct gfs2_glock *gl)
{
int error = 0;
struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
- if (gl->gl_state == LM_ST_SHARED &&
+ if (gl->gl_state == LM_ST_SHARED && !gfs2_withdrawn(sdp) &&
test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) {
atomic_set(&sdp->sd_freeze_state, SFS_STARTING_FREEZE);
error = freeze_super(sdp->sd_vfs);
if (error) {
fs_info(sdp, "GFS2: couldn't freeze filesystem: %d\n",
error);
+ if (gfs2_withdrawn(sdp)) {
+ atomic_set(&sdp->sd_freeze_state, SFS_UNFROZEN);
+ return 0;
+ }
gfs2_assert_withdraw(sdp, 0);
}
queue_work(gfs2_freeze_wq, &sdp->sd_freeze_work);
gfs2_log_flush(sdp, NULL, GFS2_LOG_HEAD_FLUSH_FREEZE |
GFS2_LFC_FREEZE_GO_SYNC);
}
+ return 0;
}
/**
@@ -582,8 +613,76 @@ static void iopen_go_callback(struct gfs2_glock *gl, bool remote)
}
}
+/**
+ * inode_go_free - wake up anyone waiting for dlm's unlock ast to free it
+ * @gl: glock being freed
+ *
+ * For now, this is only used for the journal inode glock. In withdraw
+ * situations, we need to wait for the glock to be freed so that we know
+ * other nodes may proceed with recovery / journal replay.
+ */
+static void inode_go_free(struct gfs2_glock *gl)
+{
+ /* Note that we cannot reference gl_object because it's already set
+ * to NULL by this point in its lifecycle. */
+ if (!test_bit(GLF_FREEING, &gl->gl_flags))
+ return;
+ clear_bit_unlock(GLF_FREEING, &gl->gl_flags);
+ wake_up_bit(&gl->gl_flags, GLF_FREEING);
+}
+
+/**
+ * nondisk_go_callback - used to signal when a node did a withdraw
+ * @gl: the nondisk glock
+ * @remote: true if this came from a different cluster node
+ *
+ */
+static void nondisk_go_callback(struct gfs2_glock *gl, bool remote)
+{
+ struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
+
+ /* Ignore the callback unless it's from another node, and it's the
+ live lock. */
+ if (!remote || gl->gl_name.ln_number != GFS2_LIVE_LOCK)
+ return;
+
+ /* First order of business is to cancel the demote request. We don't
+ * really want to demote a nondisk glock. At best it's just to inform
+ * us of another node's withdraw. We'll keep it in SH mode. */
+ clear_bit(GLF_DEMOTE, &gl->gl_flags);
+ clear_bit(GLF_PENDING_DEMOTE, &gl->gl_flags);
+
+ /* Ignore the unlock if we're withdrawn, unmounting, or in recovery. */
+ if (test_bit(SDF_NORECOVERY, &sdp->sd_flags) ||
+ test_bit(SDF_WITHDRAWN, &sdp->sd_flags) ||
+ test_bit(SDF_REMOTE_WITHDRAW, &sdp->sd_flags))
+ return;
+
+ /* We only care when a node wants us to unlock, because that means
+ * they want a journal recovered. */
+ if (gl->gl_demote_state != LM_ST_UNLOCKED)
+ return;
+
+ if (sdp->sd_args.ar_spectator) {
+ fs_warn(sdp, "Spectator node cannot recover journals.\n");
+ return;
+ }
+
+ fs_warn(sdp, "Some node has withdrawn; checking for recovery.\n");
+ set_bit(SDF_REMOTE_WITHDRAW, &sdp->sd_flags);
+ /*
+ * We can't call remote_withdraw or gfs2_recover_journal directly here
+ * because this is called from the glock unlock function and the
+ * remote_withdraw needs to enqueue and dequeue the same "live" glock
+ * we were called from. So we queue it to the control work queue in
+ * lock_dlm.
+ */
+ queue_delayed_work(gfs2_control_wq, &sdp->sd_control_work, 0);
+}
+
const struct gfs2_glock_operations gfs2_meta_glops = {
.go_type = LM_TYPE_META,
+ .go_flags = GLOF_NONDISK,
};
const struct gfs2_glock_operations gfs2_inode_glops = {
@@ -594,13 +693,13 @@ const struct gfs2_glock_operations gfs2_inode_glops = {
.go_dump = inode_go_dump,
.go_type = LM_TYPE_INODE,
.go_flags = GLOF_ASPACE | GLOF_LRU,
+ .go_free = inode_go_free,
};
const struct gfs2_glock_operations gfs2_rgrp_glops = {
.go_sync = rgrp_go_sync,
.go_inval = rgrp_go_inval,
.go_lock = gfs2_rgrp_go_lock,
- .go_unlock = gfs2_rgrp_go_unlock,
.go_dump = gfs2_rgrp_dump,
.go_type = LM_TYPE_RGRP,
.go_flags = GLOF_LVB,
@@ -611,30 +710,34 @@ const struct gfs2_glock_operations gfs2_freeze_glops = {
.go_xmote_bh = freeze_go_xmote_bh,
.go_demote_ok = freeze_go_demote_ok,
.go_type = LM_TYPE_NONDISK,
+ .go_flags = GLOF_NONDISK,
};
const struct gfs2_glock_operations gfs2_iopen_glops = {
.go_type = LM_TYPE_IOPEN,
.go_callback = iopen_go_callback,
- .go_flags = GLOF_LRU,
+ .go_flags = GLOF_LRU | GLOF_NONDISK,
};
const struct gfs2_glock_operations gfs2_flock_glops = {
.go_type = LM_TYPE_FLOCK,
- .go_flags = GLOF_LRU,
+ .go_flags = GLOF_LRU | GLOF_NONDISK,
};
const struct gfs2_glock_operations gfs2_nondisk_glops = {
.go_type = LM_TYPE_NONDISK,
+ .go_flags = GLOF_NONDISK,
+ .go_callback = nondisk_go_callback,
};
const struct gfs2_glock_operations gfs2_quota_glops = {
.go_type = LM_TYPE_QUOTA,
- .go_flags = GLOF_LVB | GLOF_LRU,
+ .go_flags = GLOF_LVB | GLOF_LRU | GLOF_NONDISK,
};
const struct gfs2_glock_operations gfs2_journal_glops = {
.go_type = LM_TYPE_JOURNAL,
+ .go_flags = GLOF_NONDISK,
};
const struct gfs2_glock_operations *gfs2_glops_list[] = {
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 9fd88ed18807..84a824293a78 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -234,20 +234,21 @@ struct lm_lockname {
struct gfs2_glock_operations {
- void (*go_sync) (struct gfs2_glock *gl);
+ int (*go_sync) (struct gfs2_glock *gl);
int (*go_xmote_bh) (struct gfs2_glock *gl, struct gfs2_holder *gh);
void (*go_inval) (struct gfs2_glock *gl, int flags);
int (*go_demote_ok) (const struct gfs2_glock *gl);
int (*go_lock) (struct gfs2_holder *gh);
- void (*go_unlock) (struct gfs2_holder *gh);
void (*go_dump)(struct seq_file *seq, struct gfs2_glock *gl,
const char *fs_id_buf);
void (*go_callback)(struct gfs2_glock *gl, bool remote);
+ void (*go_free)(struct gfs2_glock *gl);
const int go_type;
const unsigned long go_flags;
-#define GLOF_ASPACE 1
-#define GLOF_LVB 2
-#define GLOF_LRU 4
+#define GLOF_ASPACE 1 /* address space attached */
+#define GLOF_LVB 2 /* Lock Value Block attached */
+#define GLOF_LRU 4 /* LRU managed */
+#define GLOF_NONDISK 8 /* not I/O related */
};
enum {
@@ -294,6 +295,7 @@ struct gfs2_qadata { /* quota allocation data */
struct gfs2_quota_data *qa_qd[2 * GFS2_MAXQUOTAS];
struct gfs2_holder qa_qd_ghs[2 * GFS2_MAXQUOTAS];
unsigned int qa_qd_num;
+ int qa_ref;
};
/* Resource group multi-block reservation, in order of appearance:
@@ -343,6 +345,7 @@ enum {
GLF_OBJECT = 14, /* Used only for tracing */
GLF_BLOCKING = 15,
GLF_INODE_CREATING = 16, /* Inode creation occurring */
+ GLF_FREEING = 18, /* Wait for glock to be freed */
};
struct gfs2_glock {
@@ -542,6 +545,7 @@ struct gfs2_jdesc {
struct list_head jd_revoke_list;
unsigned int jd_replay_tail;
+ u64 jd_no_addr;
};
struct gfs2_statfs_change_host {
@@ -616,8 +620,12 @@ enum {
SDF_RORECOVERY = 7, /* read only recovery */
SDF_SKIP_DLM_UNLOCK = 8,
SDF_FORCE_AIL_FLUSH = 9,
- SDF_AIL1_IO_ERROR = 10,
- SDF_FS_FROZEN = 11,
+ SDF_FS_FROZEN = 10,
+ SDF_WITHDRAWING = 11, /* Will withdraw eventually */
+ SDF_WITHDRAW_IN_PROG = 12, /* Withdraw is in progress */
+ SDF_REMOTE_WITHDRAW = 13, /* Performing remote recovery */
+ SDF_WITHDRAW_RECOVERY = 14, /* Wait for journal recovery when we are
+ withdrawing */
};
enum gfs2_freeze_state {
@@ -768,6 +776,7 @@ struct gfs2_sbd {
struct gfs2_jdesc *sd_jdesc;
struct gfs2_holder sd_journal_gh;
struct gfs2_holder sd_jinode_gh;
+ struct gfs2_glock *sd_jinode_gl;
struct gfs2_holder sd_sc_gh;
struct gfs2_holder sd_qc_gh;
@@ -828,7 +837,8 @@ struct gfs2_sbd {
atomic_t sd_log_in_flight;
struct bio *sd_log_bio;
wait_queue_head_t sd_log_flush_wait;
- int sd_log_error;
+ int sd_log_error; /* First log error */
+ wait_queue_head_t sd_withdraw_wait;
atomic_t sd_reserving_log;
wait_queue_head_t sd_reserving_log_wait;
@@ -852,6 +862,7 @@ struct gfs2_sbd {
unsigned long sd_last_warning;
struct dentry *debugfs_dir; /* debugfs directory */
+ unsigned long sd_glock_dqs_held;
};
static inline void gfs2_glstats_inc(struct gfs2_glock *gl, int which)
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 2716d56ed0a0..70b2d3a1e866 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -144,7 +144,7 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type,
error = gfs2_glock_get(sdp, no_addr, &gfs2_iopen_glops, CREATE, &io_gl);
if (unlikely(error))
- goto fail_put;
+ goto fail;
if (type == DT_UNKNOWN || blktype != GFS2_BLKST_FREE) {
/*
@@ -155,13 +155,13 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type,
error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE,
GL_SKIP, &i_gh);
if (error)
- goto fail_put;
+ goto fail;
if (blktype != GFS2_BLKST_FREE) {
error = gfs2_check_blk_type(sdp, no_addr,
blktype);
if (error)
- goto fail_put;
+ goto fail;
}
}
@@ -169,7 +169,7 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type,
set_bit(GIF_INVALID, &ip->i_flags);
error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, GL_EXACT, &ip->i_iopen_gh);
if (unlikely(error))
- goto fail_put;
+ goto fail;
glock_set_object(ip->i_iopen_gh.gh_gl, ip);
gfs2_glock_put(io_gl);
io_gl = NULL;
@@ -182,7 +182,7 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type,
/* Inode glock must be locked already */
error = gfs2_inode_refresh(GFS2_I(inode));
if (error)
- goto fail_refresh;
+ goto fail;
} else {
ip->i_no_formal_ino = no_formal_ino;
inode->i_mode = DT2IF(type);
@@ -197,17 +197,11 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type,
gfs2_glock_dq_uninit(&i_gh);
return inode;
-fail_refresh:
- ip->i_iopen_gh.gh_flags |= GL_NOCACHE;
- glock_clear_object(ip->i_iopen_gh.gh_gl, ip);
- gfs2_glock_dq_uninit(&ip->i_iopen_gh);
-fail_put:
+fail:
if (io_gl)
gfs2_glock_put(io_gl);
- glock_clear_object(ip->i_gl, ip);
if (gfs2_holder_initialized(&i_gh))
gfs2_glock_dq_uninit(&i_gh);
-fail:
iget_failed(inode);
return ERR_PTR(error);
}
@@ -594,13 +588,13 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
if (!name->len || name->len > GFS2_FNAMESIZE)
return -ENAMETOOLONG;
- error = gfs2_rsqa_alloc(dip);
+ error = gfs2_qa_get(dip);
if (error)
return error;
error = gfs2_rindex_update(sdp);
if (error)
- return error;
+ goto fail;
error = gfs2_glock_nq_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs);
if (error)
@@ -647,7 +641,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
goto fail_gunlock;
ip = GFS2_I(inode);
- error = gfs2_rsqa_alloc(ip);
+ error = gfs2_qa_get(ip);
if (error)
goto fail_free_acls;
@@ -782,11 +776,13 @@ fail_gunlock2:
clear_bit(GLF_INODE_CREATING, &io_gl->gl_flags);
gfs2_glock_put(io_gl);
fail_free_inode:
+ gfs2_qa_put(ip);
if (ip->i_gl) {
glock_clear_object(ip->i_gl, ip);
gfs2_glock_put(ip->i_gl);
}
- gfs2_rsqa_delete(ip, NULL);
+ gfs2_rs_delete(ip, NULL);
+ gfs2_qa_put(ip);
fail_free_acls:
posix_acl_release(default_acl);
posix_acl_release(acl);
@@ -804,6 +800,7 @@ fail_gunlock:
if (gfs2_holder_initialized(ghs + 1))
gfs2_glock_dq_uninit(ghs + 1);
fail:
+ gfs2_qa_put(dip);
return error;
}
@@ -905,7 +902,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
if (S_ISDIR(inode->i_mode))
return -EPERM;
- error = gfs2_rsqa_alloc(dip);
+ error = gfs2_qa_get(dip);
if (error)
return error;
@@ -1008,6 +1005,7 @@ out_gunlock:
out_child:
gfs2_glock_dq(ghs);
out_parent:
+ gfs2_qa_put(ip);
gfs2_holder_uninit(ghs);
gfs2_holder_uninit(ghs + 1);
return error;
@@ -1248,7 +1246,7 @@ static int gfs2_atomic_open(struct inode *dir, struct dentry *dentry,
if (!(file->f_mode & FMODE_OPENED))
return finish_no_open(file, d);
dput(d);
- return 0;
+ return excl && (flags & O_CREAT) ? -EEXIST : 0;
}
BUG_ON(d != NULL);
@@ -1368,7 +1366,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
if (error)
return error;
- error = gfs2_rsqa_alloc(ndip);
+ error = gfs2_qa_get(ndip);
if (error)
return error;
@@ -1568,6 +1566,7 @@ out_gunlock_r:
if (gfs2_holder_initialized(&r_gh))
gfs2_glock_dq_uninit(&r_gh);
out:
+ gfs2_qa_put(ndip);
return error;
}
@@ -1879,10 +1878,9 @@ static int setattr_chown(struct inode *inode, struct iattr *attr)
ouid = nuid = NO_UID_QUOTA_CHANGE;
if (!(attr->ia_valid & ATTR_GID) || gid_eq(ogid, ngid))
ogid = ngid = NO_GID_QUOTA_CHANGE;
-
- error = gfs2_rsqa_alloc(ip);
+ error = gfs2_qa_get(ip);
if (error)
- goto out;
+ return error;
error = gfs2_rindex_update(sdp);
if (error)
@@ -1920,6 +1918,7 @@ out_end_trans:
out_gunlock_q:
gfs2_quota_unlock(ip);
out:
+ gfs2_qa_put(ip);
return error;
}
@@ -1941,21 +1940,21 @@ static int gfs2_setattr(struct dentry *dentry, struct iattr *attr)
struct gfs2_holder i_gh;
int error;
- error = gfs2_rsqa_alloc(ip);
+ error = gfs2_qa_get(ip);
if (error)
return error;
error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh);
if (error)
- return error;
+ goto out;
error = -EPERM;
if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
- goto out;
+ goto error;
error = setattr_prepare(dentry, attr);
if (error)
- goto out;
+ goto error;
if (attr->ia_valid & ATTR_SIZE)
error = gfs2_setattr_size(inode, attr->ia_size);
@@ -1967,10 +1966,12 @@ static int gfs2_setattr(struct dentry *dentry, struct iattr *attr)
error = posix_acl_chmod(inode, inode->i_mode);
}
-out:
+error:
if (!error)
mark_inode_dirty(inode);
gfs2_glock_dq_uninit(&i_gh);
+out:
+ gfs2_qa_put(ip);
return error;
}
diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c
index 7c7197343ee2..9f2b5609f225 100644
--- a/fs/gfs2/lock_dlm.c
+++ b/fs/gfs2/lock_dlm.c
@@ -16,6 +16,8 @@
#include "incore.h"
#include "glock.h"
+#include "glops.h"
+#include "recovery.h"
#include "util.h"
#include "sys.h"
#include "trace_gfs2.h"
@@ -124,6 +126,8 @@ static void gdlm_ast(void *arg)
switch (gl->gl_lksb.sb_status) {
case -DLM_EUNLOCK: /* Unlocked, so glock can be freed */
+ if (gl->gl_ops->go_free)
+ gl->gl_ops->go_free(gl);
gfs2_glock_free(gl);
return;
case -DLM_ECANCEL: /* Cancel while getting lock */
@@ -323,6 +327,7 @@ static void gdlm_cancel(struct gfs2_glock *gl)
/*
* dlm/gfs2 recovery coordination using dlm_recover callbacks
*
+ * 0. gfs2 checks for another cluster node withdraw, needing journal replay
* 1. dlm_controld sees lockspace members change
* 2. dlm_controld blocks dlm-kernel locking activity
* 3. dlm_controld within dlm-kernel notifies gfs2 (recover_prep)
@@ -571,6 +576,28 @@ static int control_lock(struct gfs2_sbd *sdp, int mode, uint32_t flags)
&ls->ls_control_lksb, "control_lock");
}
+/**
+ * remote_withdraw - react to a node withdrawing from the file system
+ * @sdp: The superblock
+ */
+static void remote_withdraw(struct gfs2_sbd *sdp)
+{
+ struct gfs2_jdesc *jd;
+ int ret = 0, count = 0;
+ /* Run gfs2_recover_journal() on every journal except the one whose id is ls_jid; stop at the first error. */
+ list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) {
+ if (jd->jd_jid == sdp->sd_lockstruct.ls_jid)
+ continue;
+ ret = gfs2_recover_journal(jd, true);
+ if (ret)
+ break;
+ count++;
+ }
+
+ /* NOTE(review): no reference is dropped in this function; this only logs the success tally and last status. */
+ fs_err(sdp, "Journals checked: %d, ret = %d.\n", count, ret);
+}
+
static void gfs2_control_func(struct work_struct *work)
{
struct gfs2_sbd *sdp = container_of(work, struct gfs2_sbd, sd_control_work.work);
@@ -581,6 +608,13 @@ static void gfs2_control_func(struct work_struct *work)
int recover_size;
int i, error;
+ /* First check for other nodes that may have done a withdraw. */
+ if (test_bit(SDF_REMOTE_WITHDRAW, &sdp->sd_flags)) {
+ remote_withdraw(sdp);
+ clear_bit(SDF_REMOTE_WITHDRAW, &sdp->sd_flags);
+ return;
+ }
+
spin_lock(&ls->ls_recover_spin);
/*
* No MOUNT_DONE means we're still mounting; control_mount()
@@ -1079,6 +1113,10 @@ static void gdlm_recover_prep(void *arg)
struct gfs2_sbd *sdp = arg;
struct lm_lockstruct *ls = &sdp->sd_lockstruct;
+ if (gfs2_withdrawn(sdp)) {
+ fs_err(sdp, "recover_prep ignored due to withdraw.\n");
+ return;
+ }
spin_lock(&ls->ls_recover_spin);
ls->ls_recover_block = ls->ls_recover_start;
set_bit(DFL_DLM_RECOVERY, &ls->ls_recover_flags);
@@ -1101,6 +1139,11 @@ static void gdlm_recover_slot(void *arg, struct dlm_slot *slot)
struct lm_lockstruct *ls = &sdp->sd_lockstruct;
int jid = slot->slot - 1;
+ if (gfs2_withdrawn(sdp)) {
+ fs_err(sdp, "recover_slot jid %d ignored due to withdraw.\n",
+ jid);
+ return;
+ }
spin_lock(&ls->ls_recover_spin);
if (ls->ls_recover_size < jid + 1) {
fs_err(sdp, "recover_slot jid %d gen %u short size %d\n",
@@ -1125,6 +1168,10 @@ static void gdlm_recover_done(void *arg, struct dlm_slot *slots, int num_slots,
struct gfs2_sbd *sdp = arg;
struct lm_lockstruct *ls = &sdp->sd_lockstruct;
+ if (gfs2_withdrawn(sdp)) {
+ fs_err(sdp, "recover_done ignored due to withdraw.\n");
+ return;
+ }
/* ensure the ls jid arrays are large enough */
set_recover_size(sdp, slots, num_slots);
@@ -1152,6 +1199,11 @@ static void gdlm_recovery_result(struct gfs2_sbd *sdp, unsigned int jid,
{
struct lm_lockstruct *ls = &sdp->sd_lockstruct;
+ if (gfs2_withdrawn(sdp)) {
+ fs_err(sdp, "recovery_result jid %d ignored due to withdraw.\n",
+ jid);
+ return;
+ }
if (test_bit(DFL_NO_DLM_OPS, &ls->ls_recover_flags))
return;
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index 00a2e721a374..3a75843ae580 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -88,8 +88,7 @@ static void gfs2_remove_from_ail(struct gfs2_bufdata *bd)
static int gfs2_ail1_start_one(struct gfs2_sbd *sdp,
struct writeback_control *wbc,
- struct gfs2_trans *tr,
- bool *withdraw)
+ struct gfs2_trans *tr)
__releases(&sdp->sd_ail_lock)
__acquires(&sdp->sd_ail_lock)
{
@@ -97,6 +96,7 @@ __acquires(&sdp->sd_ail_lock)
struct address_space *mapping;
struct gfs2_bufdata *bd, *s;
struct buffer_head *bh;
+ int ret = 0;
list_for_each_entry_safe_reverse(bd, s, &tr->tr_ail1_list, bd_ail_st_list) {
bh = bd->bd_bh;
@@ -104,16 +104,21 @@ __acquires(&sdp->sd_ail_lock)
gfs2_assert(sdp, bd->bd_tr == tr);
if (!buffer_busy(bh)) {
- if (!buffer_uptodate(bh) &&
- !test_and_set_bit(SDF_AIL1_IO_ERROR,
- &sdp->sd_flags)) {
+ if (buffer_uptodate(bh)) {
+ list_move(&bd->bd_ail_st_list,
+ &tr->tr_ail2_list);
+ continue;
+ }
+ if (!cmpxchg(&sdp->sd_log_error, 0, -EIO)) {
gfs2_io_error_bh(sdp, bh);
- *withdraw = true;
+ gfs2_withdraw_delayed(sdp);
}
- list_move(&bd->bd_ail_st_list, &tr->tr_ail2_list);
- continue;
}
+ if (gfs2_withdrawn(sdp)) {
+ gfs2_remove_from_ail(bd);
+ continue;
+ }
if (!buffer_dirty(bh))
continue;
if (gl == bd->bd_gl)
@@ -124,16 +129,50 @@ __acquires(&sdp->sd_ail_lock)
if (!mapping)
continue;
spin_unlock(&sdp->sd_ail_lock);
- generic_writepages(mapping, wbc);
+ ret = generic_writepages(mapping, wbc);
spin_lock(&sdp->sd_ail_lock);
- if (wbc->nr_to_write <= 0)
+ if (ret || wbc->nr_to_write <= 0)
break;
- return 1;
+ return -EBUSY;
}
- return 0;
+ return ret;
}
+static void dump_ail_list(struct gfs2_sbd *sdp)
+{
+ struct gfs2_trans *tr;
+ struct gfs2_bufdata *bd;
+ struct buffer_head *bh;
+ /* Diagnostic dump invoked from gfs2_ail1_flush (with sd_ail_lock held there); t= shows whether current->journal_info is set. */
+ fs_err(sdp, "Error: In gfs2_ail1_flush for ten minutes! t=%d\n",
+ current->journal_info ? 1 : 0);
+ /* Walk sd_ail1_list and each transaction's tr_ail1_list in reverse, logging every bd and, when present, its bh flags. */
+ list_for_each_entry_reverse(tr, &sdp->sd_ail1_list, tr_list) {
+ list_for_each_entry_reverse(bd, &tr->tr_ail1_list,
+ bd_ail_st_list) {
+ bh = bd->bd_bh;
+ fs_err(sdp, "bd %p: blk:0x%llx bh=%p ", bd,
+ (unsigned long long)bd->bd_blkno, bh);
+ if (!bh) { /* no buffer head attached: just terminate the line */
+ fs_err(sdp, "\n");
+ continue;
+ }
+ fs_err(sdp, "0x%llx up2:%d dirt:%d lkd:%d req:%d "
+ "map:%d new:%d ar:%d aw:%d delay:%d "
+ "io err:%d unwritten:%d dfr:%d pin:%d esc:%d\n",
+ (unsigned long long)bh->b_blocknr,
+ buffer_uptodate(bh), buffer_dirty(bh),
+ buffer_locked(bh), buffer_req(bh),
+ buffer_mapped(bh), buffer_new(bh),
+ buffer_async_read(bh), buffer_async_write(bh),
+ buffer_delay(bh), buffer_write_io_error(bh),
+ buffer_unwritten(bh),
+ buffer_defer_completion(bh),
+ buffer_pinned(bh), buffer_escaped(bh));
+ }
+ }
+}
/**
* gfs2_ail1_flush - start writeback of some ail1 entries
@@ -149,23 +188,36 @@ void gfs2_ail1_flush(struct gfs2_sbd *sdp, struct writeback_control *wbc)
struct list_head *head = &sdp->sd_ail1_list;
struct gfs2_trans *tr;
struct blk_plug plug;
- bool withdraw = false;
+ int ret;
+ unsigned long flush_start = jiffies;
trace_gfs2_ail_flush(sdp, wbc, 1);
blk_start_plug(&plug);
spin_lock(&sdp->sd_ail_lock);
restart:
+ ret = 0;
+ if (time_after(jiffies, flush_start + (HZ * 600))) {
+ dump_ail_list(sdp);
+ goto out;
+ }
list_for_each_entry_reverse(tr, head, tr_list) {
if (wbc->nr_to_write <= 0)
break;
- if (gfs2_ail1_start_one(sdp, wbc, tr, &withdraw) &&
- !gfs2_withdrawn(sdp))
- goto restart;
+ ret = gfs2_ail1_start_one(sdp, wbc, tr);
+ if (ret) {
+ if (ret == -EBUSY)
+ goto restart;
+ break;
+ }
}
+out:
spin_unlock(&sdp->sd_ail_lock);
blk_finish_plug(&plug);
- if (withdraw)
- gfs2_lm_withdraw(sdp, NULL);
+ if (ret) {
+ gfs2_lm(sdp, "gfs2_ail1_start_one (generic_writepages) "
+ "returned: %d\n", ret);
+ gfs2_withdraw(sdp);
+ }
trace_gfs2_ail_flush(sdp, wbc, 0);
}
@@ -189,12 +241,13 @@ static void gfs2_ail1_start(struct gfs2_sbd *sdp)
/**
* gfs2_ail1_empty_one - Check whether or not a trans in the AIL has been synced
* @sdp: the filesystem
- * @ai: the AIL entry
+ * @tr: the transaction
+ * @max_revokes: If nonzero, issue revokes for the bd items for written buffers
*
*/
static void gfs2_ail1_empty_one(struct gfs2_sbd *sdp, struct gfs2_trans *tr,
- bool *withdraw)
+ int *max_revokes)
{
struct gfs2_bufdata *bd, *s;
struct buffer_head *bh;
@@ -203,12 +256,32 @@ static void gfs2_ail1_empty_one(struct gfs2_sbd *sdp, struct gfs2_trans *tr,
bd_ail_st_list) {
bh = bd->bd_bh;
gfs2_assert(sdp, bd->bd_tr == tr);
- if (buffer_busy(bh))
+ /*
+ * If another process flagged an io error, e.g. writing to the
+ * journal, error all other bhs and move them off the ail1 to
+ * prevent a tight loop when unmount tries to flush ail1,
+ * regardless of whether they're still busy. If no outside
+ * errors were found and the buffer is busy, move to the next.
+ * If the ail buffer is not busy and caught an error, flag it
+ * for others.
+ */
+ if (!sdp->sd_log_error && buffer_busy(bh))
continue;
if (!buffer_uptodate(bh) &&
- !test_and_set_bit(SDF_AIL1_IO_ERROR, &sdp->sd_flags)) {
+ !cmpxchg(&sdp->sd_log_error, 0, -EIO)) {
gfs2_io_error_bh(sdp, bh);
- *withdraw = true;
+ gfs2_withdraw_delayed(sdp);
+ }
+ /*
+ * If we have space for revokes and the bd is no longer on any
+ * buf list, we can just add a revoke for it immediately and
+ * avoid having to put it on the ail2 list, where it would need
+ * to be revoked later.
+ */
+ if (*max_revokes && list_empty(&bd->bd_list)) {
+ gfs2_add_revoke(sdp, bd);
+ (*max_revokes)--;
+ continue;
}
list_move(&bd->bd_ail_st_list, &tr->tr_ail2_list);
}
@@ -217,20 +290,20 @@ static void gfs2_ail1_empty_one(struct gfs2_sbd *sdp, struct gfs2_trans *tr,
/**
* gfs2_ail1_empty - Try to empty the ail1 lists
* @sdp: The superblock
+ * @max_revokes: If non-zero, add revokes where appropriate
*
* Tries to empty the ail1 lists, starting with the oldest first
*/
-static int gfs2_ail1_empty(struct gfs2_sbd *sdp)
+static int gfs2_ail1_empty(struct gfs2_sbd *sdp, int max_revokes)
{
struct gfs2_trans *tr, *s;
int oldest_tr = 1;
int ret;
- bool withdraw = false;
spin_lock(&sdp->sd_ail_lock);
list_for_each_entry_safe_reverse(tr, s, &sdp->sd_ail1_list, tr_list) {
- gfs2_ail1_empty_one(sdp, tr, &withdraw);
+ gfs2_ail1_empty_one(sdp, tr, &max_revokes);
if (list_empty(&tr->tr_ail1_list) && oldest_tr)
list_move(&tr->tr_list, &sdp->sd_ail2_list);
else
@@ -239,8 +312,10 @@ static int gfs2_ail1_empty(struct gfs2_sbd *sdp)
ret = list_empty(&sdp->sd_ail1_list);
spin_unlock(&sdp->sd_ail_lock);
- if (withdraw)
- gfs2_lm_withdraw(sdp, "fatal: I/O error(s)\n");
+ if (test_bit(SDF_WITHDRAWING, &sdp->sd_flags)) {
+ gfs2_lm(sdp, "fatal: I/O error(s)\n");
+ gfs2_withdraw(sdp);
+ }
return ret;
}
@@ -268,20 +343,17 @@ static void gfs2_ail1_wait(struct gfs2_sbd *sdp)
}
/**
- * gfs2_ail2_empty_one - Check whether or not a trans in the AIL has been synced
- * @sdp: the filesystem
- * @ai: the AIL entry
- *
+ * gfs2_ail_empty_tr - empty one of the ail lists for a transaction
*/
-static void gfs2_ail2_empty_one(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
+static void gfs2_ail_empty_tr(struct gfs2_sbd *sdp, struct gfs2_trans *tr,
+ struct list_head *head)
{
- struct list_head *head = &tr->tr_ail2_list;
struct gfs2_bufdata *bd;
while (!list_empty(head)) {
- bd = list_entry(head->prev, struct gfs2_bufdata,
- bd_ail_st_list);
+ bd = list_first_entry(head, struct gfs2_bufdata,
+ bd_ail_st_list);
gfs2_assert(sdp, bd->bd_tr == tr);
gfs2_remove_from_ail(bd);
}
@@ -303,7 +375,7 @@ static void ail2_empty(struct gfs2_sbd *sdp, unsigned int new_tail)
if (!rm)
continue;
- gfs2_ail2_empty_one(sdp, tr);
+ gfs2_ail_empty_tr(sdp, tr, &tr->tr_ail2_list);
list_del(&tr->tr_list);
gfs2_assert_warn(sdp, list_empty(&tr->tr_ail1_list));
gfs2_assert_warn(sdp, list_empty(&tr->tr_ail2_list));
@@ -487,7 +559,7 @@ static unsigned int current_tail(struct gfs2_sbd *sdp)
if (list_empty(&sdp->sd_ail1_list)) {
tail = sdp->sd_log_head;
} else {
- tr = list_entry(sdp->sd_ail1_list.prev, struct gfs2_trans,
+ tr = list_last_entry(&sdp->sd_ail1_list, struct gfs2_trans,
tr_list);
tail = tr->tr_first;
}
@@ -512,7 +584,7 @@ static void log_pull_tail(struct gfs2_sbd *sdp, unsigned int new_tail)
}
-static void log_flush_wait(struct gfs2_sbd *sdp)
+void log_flush_wait(struct gfs2_sbd *sdp)
{
DEFINE_WAIT(wait);
@@ -549,7 +621,7 @@ static void gfs2_ordered_write(struct gfs2_sbd *sdp)
spin_lock(&sdp->sd_ordered_lock);
list_sort(NULL, &sdp->sd_log_ordered, &ip_cmp);
while (!list_empty(&sdp->sd_log_ordered)) {
- ip = list_entry(sdp->sd_log_ordered.next, struct gfs2_inode, i_ordered);
+ ip = list_first_entry(&sdp->sd_log_ordered, struct gfs2_inode, i_ordered);
if (ip->i_inode.i_mapping->nrpages == 0) {
test_and_clear_bit(GIF_ORDERED, &ip->i_flags);
list_del(&ip->i_ordered);
@@ -570,7 +642,7 @@ static void gfs2_ordered_wait(struct gfs2_sbd *sdp)
spin_lock(&sdp->sd_ordered_lock);
while (!list_empty(&sdp->sd_log_ordered)) {
- ip = list_entry(sdp->sd_log_ordered.next, struct gfs2_inode, i_ordered);
+ ip = list_first_entry(&sdp->sd_log_ordered, struct gfs2_inode, i_ordered);
list_del(&ip->i_ordered);
WARN_ON(!test_and_clear_bit(GIF_ORDERED, &ip->i_flags));
if (ip->i_inode.i_mapping->nrpages == 0)
@@ -616,27 +688,24 @@ void gfs2_glock_remove_revoke(struct gfs2_glock *gl)
}
}
+/**
+ * gfs2_write_revokes - Add as many revokes to the system transaction as we can
+ * @sdp: The GFS2 superblock
+ *
+ * Our usual strategy is to defer writing revokes as much as we can in the hope
+ * that we'll eventually overwrite the journal, which will make those revokes
+ * go away. This changes when we flush the log: at that point, there will
+ * likely be some left-over space in the last revoke block of that transaction.
+ * We can fill that space with additional revokes for blocks that have already
+ * been written back. This will basically come at no cost now, and will save
+ * us from having to keep track of those blocks on the AIL2 list later.
+ */
void gfs2_write_revokes(struct gfs2_sbd *sdp)
{
- struct gfs2_trans *tr;
- struct gfs2_bufdata *bd, *tmp;
- int have_revokes = 0;
+ /* number of revokes we still have room for */
int max_revokes = (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_log_descriptor)) / sizeof(u64);
- gfs2_ail1_empty(sdp);
- spin_lock(&sdp->sd_ail_lock);
- list_for_each_entry_reverse(tr, &sdp->sd_ail1_list, tr_list) {
- list_for_each_entry(bd, &tr->tr_ail2_list, bd_ail_st_list) {
- if (list_empty(&bd->bd_list)) {
- have_revokes = 1;
- goto done;
- }
- }
- }
-done:
- spin_unlock(&sdp->sd_ail_lock);
- if (have_revokes == 0)
- return;
+ gfs2_log_lock(sdp);
while (sdp->sd_log_num_revoke > max_revokes)
max_revokes += (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_meta_header)) / sizeof(u64);
max_revokes -= sdp->sd_log_num_revoke;
@@ -647,20 +716,7 @@ done:
if (!sdp->sd_log_blks_reserved)
atomic_dec(&sdp->sd_log_blks_free);
}
- gfs2_log_lock(sdp);
- spin_lock(&sdp->sd_ail_lock);
- list_for_each_entry_reverse(tr, &sdp->sd_ail1_list, tr_list) {
- list_for_each_entry_safe(bd, tmp, &tr->tr_ail2_list, bd_ail_st_list) {
- if (max_revokes == 0)
- goto out_of_blocks;
- if (!list_empty(&bd->bd_list))
- continue;
- gfs2_add_revoke(sdp, bd);
- max_revokes--;
- }
- }
-out_of_blocks:
- spin_unlock(&sdp->sd_ail_lock);
+ gfs2_ail1_empty(sdp, max_revokes);
gfs2_log_unlock(sdp);
if (!sdp->sd_log_num_revoke) {
@@ -787,6 +843,40 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags)
}
/**
+ * ail_drain - empty the ail lists and free their transactions after a withdraw
+ * @sdp: Pointer to GFS2 superblock
+ */
+static void ail_drain(struct gfs2_sbd *sdp)
+{
+ struct gfs2_trans *tr;
+ /* Every transaction found is unlinked and kfree()d below, so callers must not use a tr pointer afterwards. */
+ spin_lock(&sdp->sd_ail_lock);
+ /*
+ * For transactions on the sd_ail1_list we need to drain both the
+ * ail1 and ail2 lists. That's because function gfs2_ail1_start_one
+ * (temporarily) moves items from its tr_ail1 list to tr_ail2 list
+ * before revokes are sent for that block. Items on the sd_ail2_list
+ * should have already gotten beyond that point, so no need.
+ */
+ while (!list_empty(&sdp->sd_ail1_list)) {
+ tr = list_first_entry(&sdp->sd_ail1_list, struct gfs2_trans,
+ tr_list);
+ gfs2_ail_empty_tr(sdp, tr, &tr->tr_ail1_list);
+ gfs2_ail_empty_tr(sdp, tr, &tr->tr_ail2_list);
+ list_del(&tr->tr_list);
+ kfree(tr);
+ }
+ while (!list_empty(&sdp->sd_ail2_list)) {
+ tr = list_first_entry(&sdp->sd_ail2_list, struct gfs2_trans,
+ tr_list);
+ gfs2_ail_empty_tr(sdp, tr, &tr->tr_ail2_list);
+ list_del(&tr->tr_list);
+ kfree(tr);
+ }
+ spin_unlock(&sdp->sd_ail_lock);
+}
+
+/**
* gfs2_log_flush - flush incore transaction(s)
* @sdp: the filesystem
* @gl: The glock structure to flush. If NULL, flush the whole incore log
@@ -796,11 +886,18 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags)
void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl, u32 flags)
{
- struct gfs2_trans *tr;
+ struct gfs2_trans *tr = NULL;
enum gfs2_freeze_state state = atomic_read(&sdp->sd_freeze_state);
down_write(&sdp->sd_log_flush_lock);
+ /*
+ * Do this check while holding the log_flush_lock to prevent new
+ * buffers from being added to the ail via gfs2_pin()
+ */
+ if (gfs2_withdrawn(sdp))
+ goto out;
+
/* Log might have been flushed while we waited for the flush lock */
if (gl && !test_bit(GLF_LFLUSH, &gl->gl_flags)) {
up_write(&sdp->sd_log_flush_lock);
@@ -819,17 +916,27 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl, u32 flags)
INIT_LIST_HEAD(&tr->tr_ail2_list);
tr->tr_first = sdp->sd_log_flush_head;
if (unlikely (state == SFS_FROZEN))
- gfs2_assert_withdraw(sdp, !tr->tr_num_buf_new && !tr->tr_num_databuf_new);
+ if (gfs2_assert_withdraw_delayed(sdp,
+ !tr->tr_num_buf_new && !tr->tr_num_databuf_new))
+ goto out;
}
if (unlikely(state == SFS_FROZEN))
- gfs2_assert_withdraw(sdp, !sdp->sd_log_num_revoke);
- gfs2_assert_withdraw(sdp,
- sdp->sd_log_num_revoke == sdp->sd_log_committed_revoke);
+ if (gfs2_assert_withdraw_delayed(sdp, !sdp->sd_log_num_revoke))
+ goto out;
+ if (gfs2_assert_withdraw_delayed(sdp,
+ sdp->sd_log_num_revoke == sdp->sd_log_committed_revoke))
+ goto out;
gfs2_ordered_write(sdp);
+ if (gfs2_withdrawn(sdp))
+ goto out;
lops_before_commit(sdp, tr);
+ if (gfs2_withdrawn(sdp))
+ goto out;
gfs2_log_submit_bio(&sdp->sd_log_bio, REQ_OP_WRITE);
+ if (gfs2_withdrawn(sdp))
+ goto out;
if (sdp->sd_log_head != sdp->sd_log_flush_head) {
log_flush_wait(sdp);
@@ -839,6 +946,8 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl, u32 flags)
trace_gfs2_log_blocks(sdp, -1);
log_write_header(sdp, flags);
}
+ if (gfs2_withdrawn(sdp))
+ goto out;
lops_after_commit(sdp, tr);
gfs2_log_lock(sdp);
@@ -859,9 +968,11 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl, u32 flags)
for (;;) {
gfs2_ail1_start(sdp);
gfs2_ail1_wait(sdp);
- if (gfs2_ail1_empty(sdp))
+ if (gfs2_ail1_empty(sdp, 0))
break;
}
+ if (gfs2_withdrawn(sdp))
+ goto out;
atomic_dec(&sdp->sd_log_blks_free); /* Adjust for unreserved buffer */
trace_gfs2_log_blocks(sdp, -1);
log_write_header(sdp, flags);
@@ -874,6 +985,12 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl, u32 flags)
atomic_set(&sdp->sd_freeze_state, SFS_FROZEN);
}
+out:
+ if (gfs2_withdrawn(sdp)) {
+ ail_drain(sdp); /* frees all transactions */
+ tr = NULL;
+ }
+
trace_gfs2_log_flush(sdp, 0, flags);
up_write(&sdp->sd_log_flush_lock);
@@ -1016,16 +1133,17 @@ int gfs2_logd(void *data)
/* Check for errors writing to the journal */
if (sdp->sd_log_error) {
- gfs2_lm_withdraw(sdp,
- "GFS2: fsid=%s: error %d: "
- "withdrawing the file system to "
- "prevent further damage.\n",
- sdp->sd_fsname, sdp->sd_log_error);
+ gfs2_lm(sdp,
+ "GFS2: fsid=%s: error %d: "
+ "withdrawing the file system to "
+ "prevent further damage.\n",
+ sdp->sd_fsname, sdp->sd_log_error);
+ gfs2_withdraw(sdp);
}
did_flush = false;
if (gfs2_jrnl_flush_reqd(sdp) || t == 0) {
- gfs2_ail1_empty(sdp);
+ gfs2_ail1_empty(sdp, 0);
gfs2_log_flush(sdp, NULL, GFS2_LOG_HEAD_FLUSH_NORMAL |
GFS2_LFC_LOGD_JFLUSH_REQD);
did_flush = true;
@@ -1034,7 +1152,7 @@ int gfs2_logd(void *data)
if (gfs2_ail_flush_reqd(sdp)) {
gfs2_ail1_start(sdp);
gfs2_ail1_wait(sdp);
- gfs2_ail1_empty(sdp);
+ gfs2_ail1_empty(sdp, 0);
gfs2_log_flush(sdp, NULL, GFS2_LOG_HEAD_FLUSH_NORMAL |
GFS2_LFC_LOGD_AIL_FLUSH_REQD);
did_flush = true;
diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h
index c0a65e5a126b..c1cd6ae17659 100644
--- a/fs/gfs2/log.h
+++ b/fs/gfs2/log.h
@@ -73,6 +73,7 @@ extern void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl,
u32 type);
extern void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *trans);
extern void gfs2_ail1_flush(struct gfs2_sbd *sdp, struct writeback_control *wbc);
+extern void log_flush_wait(struct gfs2_sbd *sdp);
extern int gfs2_logd(void *data);
extern void gfs2_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd);
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index c090d5ad3f22..5ea96757afc4 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -203,8 +203,12 @@ static void gfs2_end_log_write(struct bio *bio)
struct bvec_iter_all iter_all;
if (bio->bi_status) {
- fs_err(sdp, "Error %d writing to journal, jid=%u\n",
- bio->bi_status, sdp->sd_jdesc->jd_jid);
+ if (!cmpxchg(&sdp->sd_log_error, 0, (int)bio->bi_status))
+ fs_err(sdp, "Error %d writing to journal, jid=%u\n",
+ bio->bi_status, sdp->sd_jdesc->jd_jid);
+ gfs2_withdraw_delayed(sdp);
+ /* prevent more writes to the journal */
+ clear_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags);
wake_up(&sdp->sd_logd_waitq);
}
@@ -730,7 +734,7 @@ static void buf_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
head = &tr->tr_buf;
while (!list_empty(head)) {
- bd = list_entry(head->next, struct gfs2_bufdata, bd_list);
+ bd = list_first_entry(head, struct gfs2_bufdata, bd_list);
list_del_init(&bd->bd_list);
gfs2_unpin(sdp, bd->bd_bh, tr);
}
@@ -900,7 +904,7 @@ static void revoke_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
struct gfs2_glock *gl;
while (!list_empty(head)) {
- bd = list_entry(head->next, struct gfs2_bufdata, bd_list);
+ bd = list_first_entry(head, struct gfs2_bufdata, bd_list);
list_del_init(&bd->bd_list);
gl = bd->bd_gl;
gfs2_glock_remove_revoke(gl);
@@ -1079,7 +1083,7 @@ static void databuf_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
head = &tr->tr_databuf;
while (!list_empty(head)) {
- bd = list_entry(head->next, struct gfs2_bufdata, bd_list);
+ bd = list_first_entry(head, struct gfs2_bufdata, bd_list);
list_del_init(&bd->bd_list);
gfs2_unpin(sdp, bd->bd_bh, tr);
}
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 0c3772974030..4b72abcf83b2 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -251,7 +251,8 @@ int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags,
struct buffer_head *bh, *bhs[2];
int num = 0;
- if (unlikely(gfs2_withdrawn(sdp))) {
+ if (unlikely(gfs2_withdrawn(sdp)) &&
+ (!sdp->sd_jdesc || (blkno != sdp->sd_jdesc->jd_no_addr))) {
*bhp = NULL;
return -EIO;
}
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index a1a8ef7ed3fd..e2b69ffcc6a8 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -552,6 +552,8 @@ static int gfs2_jindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ji_gh)
mutex_lock(&sdp->sd_jindex_mutex);
for (;;) {
+ struct gfs2_inode *jip;
+
error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, ji_gh);
if (error)
break;
@@ -591,6 +593,8 @@ static int gfs2_jindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ji_gh)
spin_lock(&sdp->sd_jindex_spin);
jd->jd_jid = sdp->sd_journals++;
+ jip = GFS2_I(jd->jd_inode);
+ jd->jd_no_addr = jip->i_no_addr;
list_add_tail(&jd->jd_list, &sdp->sd_jindex_list);
spin_unlock(&sdp->sd_jindex_spin);
}
@@ -600,48 +604,6 @@ static int gfs2_jindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ji_gh)
return error;
}
-/**
- * check_journal_clean - Make sure a journal is clean for a spectator mount
- * @sdp: The GFS2 superblock
- * @jd: The journal descriptor
- *
- * Returns: 0 if the journal is clean or locked, else an error
- */
-static int check_journal_clean(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd)
-{
- int error;
- struct gfs2_holder j_gh;
- struct gfs2_log_header_host head;
- struct gfs2_inode *ip;
-
- ip = GFS2_I(jd->jd_inode);
- error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_NOEXP |
- GL_EXACT | GL_NOCACHE, &j_gh);
- if (error) {
- fs_err(sdp, "Error locking journal for spectator mount.\n");
- return -EPERM;
- }
- error = gfs2_jdesc_check(jd);
- if (error) {
- fs_err(sdp, "Error checking journal for spectator mount.\n");
- goto out_unlock;
- }
- error = gfs2_find_jhead(jd, &head, false);
- if (error) {
- fs_err(sdp, "Error parsing journal for spectator mount.\n");
- goto out_unlock;
- }
- if (!(head.lh_flags & GFS2_LOG_HEAD_UNMOUNT)) {
- error = -EPERM;
- fs_err(sdp, "jid=%u: Journal is dirty, so the first mounter "
- "must not be a spectator.\n", jd->jd_jid);
- }
-
-out_unlock:
- gfs2_glock_dq_uninit(&j_gh);
- return error;
-}
-
static int init_journal(struct gfs2_sbd *sdp, int undo)
{
struct inode *master = d_inode(sdp->sd_master_dir);
@@ -694,7 +656,8 @@ static int init_journal(struct gfs2_sbd *sdp, int undo)
error = gfs2_glock_nq_num(sdp, sdp->sd_lockstruct.ls_jid,
&gfs2_journal_glops,
- LM_ST_EXCLUSIVE, LM_FLAG_NOEXP,
+ LM_ST_EXCLUSIVE,
+ LM_FLAG_NOEXP | GL_NOCACHE,
&sdp->sd_journal_gh);
if (error) {
fs_err(sdp, "can't acquire journal glock: %d\n", error);
@@ -702,6 +665,7 @@ static int init_journal(struct gfs2_sbd *sdp, int undo)
}
ip = GFS2_I(sdp->sd_jdesc->jd_inode);
+ sdp->sd_jinode_gl = ip->i_gl;
error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED,
LM_FLAG_NOEXP | GL_EXACT | GL_NOCACHE,
&sdp->sd_jinode_gh);
@@ -732,7 +696,7 @@ static int init_journal(struct gfs2_sbd *sdp, int undo)
struct gfs2_jdesc *jd = gfs2_jdesc_find(sdp, x);
if (sdp->sd_args.ar_spectator) {
- error = check_journal_clean(sdp, jd);
+ error = check_journal_clean(sdp, jd, true);
if (error)
goto fail_jinode_gh;
continue;
@@ -762,10 +726,13 @@ static int init_journal(struct gfs2_sbd *sdp, int undo)
return 0;
fail_jinode_gh:
- if (!sdp->sd_args.ar_spectator)
+ /* A withdraw may have done dq/uninit so now we need to check it */
+ if (!sdp->sd_args.ar_spectator &&
+ gfs2_holder_initialized(&sdp->sd_jinode_gh))
gfs2_glock_dq_uninit(&sdp->sd_jinode_gh);
fail_journal_gh:
- if (!sdp->sd_args.ar_spectator)
+ if (!sdp->sd_args.ar_spectator &&
+ gfs2_holder_initialized(&sdp->sd_journal_gh))
gfs2_glock_dq_uninit(&sdp->sd_journal_gh);
fail_jindex:
gfs2_jindex_free(sdp);
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index e9f93045eb01..cc0c4b5800be 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -115,7 +115,7 @@ static void gfs2_qd_dispose(struct list_head *list)
struct gfs2_sbd *sdp;
while (!list_empty(list)) {
- qd = list_entry(list->next, struct gfs2_quota_data, qd_lru);
+ qd = list_first_entry(list, struct gfs2_quota_data, qd_lru);
sdp = qd->qd_gl->gl_name.ln_sbd;
list_del(&qd->qd_lru);
@@ -525,11 +525,11 @@ static void qdsb_put(struct gfs2_quota_data *qd)
}
/**
- * gfs2_qa_alloc - make sure we have a quota allocations data structure,
- * if necessary
+ * gfs2_qa_get - make sure we have a quota allocations data structure,
+ * if necessary
* @ip: the inode for this reservation
*/
-int gfs2_qa_alloc(struct gfs2_inode *ip)
+int gfs2_qa_get(struct gfs2_inode *ip)
{
int error = 0;
struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
@@ -540,17 +540,21 @@ int gfs2_qa_alloc(struct gfs2_inode *ip)
down_write(&ip->i_rw_mutex);
if (ip->i_qadata == NULL) {
ip->i_qadata = kmem_cache_zalloc(gfs2_qadata_cachep, GFP_NOFS);
- if (!ip->i_qadata)
+ if (!ip->i_qadata) {
error = -ENOMEM;
+ goto out;
+ }
}
+ ip->i_qadata->qa_ref++;
+out:
up_write(&ip->i_rw_mutex);
return error;
}
-void gfs2_qa_delete(struct gfs2_inode *ip, atomic_t *wcount)
+void gfs2_qa_put(struct gfs2_inode *ip)
{
down_write(&ip->i_rw_mutex);
- if (ip->i_qadata && ((wcount == NULL) || (atomic_read(wcount) <= 1))) {
+ if (ip->i_qadata && --ip->i_qadata->qa_ref == 0) {
kmem_cache_free(gfs2_qadata_cachep, ip->i_qadata);
ip->i_qadata = NULL;
}
@@ -566,27 +570,27 @@ int gfs2_quota_hold(struct gfs2_inode *ip, kuid_t uid, kgid_t gid)
if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF)
return 0;
- if (ip->i_qadata == NULL) {
- error = gfs2_rsqa_alloc(ip);
- if (error)
- return error;
- }
+ error = gfs2_qa_get(ip);
+ if (error)
+ return error;
qd = ip->i_qadata->qa_qd;
if (gfs2_assert_warn(sdp, !ip->i_qadata->qa_qd_num) ||
- gfs2_assert_warn(sdp, !test_bit(GIF_QD_LOCKED, &ip->i_flags)))
- return -EIO;
+ gfs2_assert_warn(sdp, !test_bit(GIF_QD_LOCKED, &ip->i_flags))) {
+ error = -EIO;
+ goto out;
+ }
error = qdsb_get(sdp, make_kqid_uid(ip->i_inode.i_uid), qd);
if (error)
- goto out;
+ goto out_unhold;
ip->i_qadata->qa_qd_num++;
qd++;
error = qdsb_get(sdp, make_kqid_gid(ip->i_inode.i_gid), qd);
if (error)
- goto out;
+ goto out_unhold;
ip->i_qadata->qa_qd_num++;
qd++;
@@ -594,7 +598,7 @@ int gfs2_quota_hold(struct gfs2_inode *ip, kuid_t uid, kgid_t gid)
!uid_eq(uid, ip->i_inode.i_uid)) {
error = qdsb_get(sdp, make_kqid_uid(uid), qd);
if (error)
- goto out;
+ goto out_unhold;
ip->i_qadata->qa_qd_num++;
qd++;
}
@@ -603,14 +607,15 @@ int gfs2_quota_hold(struct gfs2_inode *ip, kuid_t uid, kgid_t gid)
!gid_eq(gid, ip->i_inode.i_gid)) {
error = qdsb_get(sdp, make_kqid_gid(gid), qd);
if (error)
- goto out;
+ goto out_unhold;
ip->i_qadata->qa_qd_num++;
qd++;
}
-out:
+out_unhold:
if (error)
gfs2_quota_unhold(ip);
+out:
return error;
}
@@ -621,6 +626,7 @@ void gfs2_quota_unhold(struct gfs2_inode *ip)
if (ip->i_qadata == NULL)
return;
+
gfs2_assert_warn(sdp, !test_bit(GIF_QD_LOCKED, &ip->i_flags));
for (x = 0; x < ip->i_qadata->qa_qd_num; x++) {
@@ -628,6 +634,7 @@ void gfs2_quota_unhold(struct gfs2_inode *ip)
ip->i_qadata->qa_qd[x] = NULL;
}
ip->i_qadata->qa_qd_num = 0;
+ gfs2_qa_put(ip);
}
static int sort_qd(const void *a, const void *b)
@@ -876,7 +883,7 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda)
unsigned int nalloc = 0, blocks;
int error;
- error = gfs2_rsqa_alloc(ip);
+ error = gfs2_qa_get(ip);
if (error)
return error;
@@ -884,8 +891,10 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda)
&data_blocks, &ind_blocks);
ghs = kmalloc_array(num_qd, sizeof(struct gfs2_holder), GFP_NOFS);
- if (!ghs)
- return -ENOMEM;
+ if (!ghs) {
+ error = -ENOMEM;
+ goto out;
+ }
sort(qda, num_qd, sizeof(struct gfs2_quota_data *), sort_qd, NULL);
inode_lock(&ip->i_inode);
@@ -893,12 +902,12 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda)
error = gfs2_glock_nq_init(qda[qx]->qd_gl, LM_ST_EXCLUSIVE,
GL_NOCACHE, &ghs[qx]);
if (error)
- goto out;
+ goto out_dq;
}
error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh);
if (error)
- goto out;
+ goto out_dq;
for (x = 0; x < num_qd; x++) {
offset = qd2offset(qda[x]);
@@ -950,13 +959,15 @@ out_ipres:
gfs2_inplace_release(ip);
out_alloc:
gfs2_glock_dq_uninit(&i_gh);
-out:
+out_dq:
while (qx--)
gfs2_glock_dq_uninit(&ghs[qx]);
inode_unlock(&ip->i_inode);
kfree(ghs);
gfs2_log_flush(ip->i_gl->gl_name.ln_sbd, ip->i_gl,
GFS2_LOG_HEAD_FLUSH_NORMAL | GFS2_LFC_DO_SYNC);
+out:
+ gfs2_qa_put(ip);
return error;
}
@@ -1259,6 +1270,7 @@ void gfs2_quota_change(struct gfs2_inode *ip, s64 change,
if (ip->i_diskflags & GFS2_DIF_SYSTEM)
return;
+ BUG_ON(ip->i_qadata->qa_ref <= 0);
for (x = 0; x < ip->i_qadata->qa_qd_num; x++) {
qd = ip->i_qadata->qa_qd[x];
@@ -1441,7 +1453,7 @@ void gfs2_quota_cleanup(struct gfs2_sbd *sdp)
spin_lock(&qd_lock);
while (!list_empty(head)) {
- qd = list_entry(head->prev, struct gfs2_quota_data, qd_list);
+ qd = list_last_entry(head, struct gfs2_quota_data, qd_list);
list_del(&qd->qd_list);
@@ -1476,8 +1488,8 @@ static void quotad_error(struct gfs2_sbd *sdp, const char *msg, int error)
if (error == 0 || error == -EROFS)
return;
if (!gfs2_withdrawn(sdp)) {
- fs_err(sdp, "gfs2_quotad: %s error %d\n", msg, error);
- sdp->sd_log_error = error;
+ if (!cmpxchg(&sdp->sd_log_error, 0, error))
+ fs_err(sdp, "gfs2_quotad: %s error %d\n", msg, error);
wake_up(&sdp->sd_logd_waitq);
}
}
@@ -1504,7 +1516,7 @@ static void quotad_check_trunc_list(struct gfs2_sbd *sdp)
ip = NULL;
spin_lock(&sdp->sd_trunc_lock);
if (!list_empty(&sdp->sd_trunc_list)) {
- ip = list_entry(sdp->sd_trunc_list.next,
+ ip = list_first_entry(&sdp->sd_trunc_list,
struct gfs2_inode, i_trunc_list);
list_del_init(&ip->i_trunc_list);
}
@@ -1541,6 +1553,8 @@ int gfs2_quotad(void *data)
while (!kthread_should_stop()) {
+ if (gfs2_withdrawn(sdp))
+ goto bypass;
/* Update the master statfs file */
if (sdp->sd_statfs_force_sync) {
int error = gfs2_statfs_sync(sdp->sd_vfs, 0);
@@ -1561,6 +1575,7 @@ int gfs2_quotad(void *data)
try_to_freeze();
+bypass:
t = min(quotad_timeo, statfs_timeo);
prepare_to_wait(&sdp->sd_quota_wait, &wait, TASK_INTERRUPTIBLE);
@@ -1674,7 +1689,7 @@ static int gfs2_set_dqblk(struct super_block *sb, struct kqid qid,
if (error)
return error;
- error = gfs2_rsqa_alloc(ip);
+ error = gfs2_qa_get(ip);
if (error)
goto out_put;
@@ -1743,6 +1758,7 @@ out_i:
out_q:
gfs2_glock_dq_uninit(&q_gh);
out_unlockput:
+ gfs2_qa_put(ip);
inode_unlock(&ip->i_inode);
out_put:
qd_put(qd);
diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h
index 765627d9a91e..7f9ca8ef40fc 100644
--- a/fs/gfs2/quota.h
+++ b/fs/gfs2/quota.h
@@ -15,8 +15,8 @@ struct gfs2_sbd;
#define NO_UID_QUOTA_CHANGE INVALID_UID
#define NO_GID_QUOTA_CHANGE INVALID_GID
-extern int gfs2_qa_alloc(struct gfs2_inode *ip);
-extern void gfs2_qa_delete(struct gfs2_inode *ip, atomic_t *wcount);
+extern int gfs2_qa_get(struct gfs2_inode *ip);
+extern void gfs2_qa_put(struct gfs2_inode *ip);
extern int gfs2_quota_hold(struct gfs2_inode *ip, kuid_t uid, kgid_t gid);
extern void gfs2_quota_unhold(struct gfs2_inode *ip);
diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c
index 85f830e56945..96c345f49273 100644
--- a/fs/gfs2/recovery.c
+++ b/fs/gfs2/recovery.c
@@ -111,7 +111,7 @@ void gfs2_revoke_clean(struct gfs2_jdesc *jd)
struct gfs2_revoke_replay *rr;
while (!list_empty(head)) {
- rr = list_entry(head->next, struct gfs2_revoke_replay, rr_list);
+ rr = list_first_entry(head, struct gfs2_revoke_replay, rr_list);
list_del(&rr->rr_list);
kfree(rr);
}
@@ -305,6 +305,11 @@ void gfs2_recover_func(struct work_struct *work)
int error = 0;
int jlocked = 0;
+ if (gfs2_withdrawn(sdp)) {
+ fs_err(sdp, "jid=%u: Recovery not attempted due to withdraw.\n",
+ jd->jd_jid);
+ goto fail;
+ }
t_start = ktime_get();
if (sdp->sd_args.ar_spectator)
goto fail;
@@ -393,6 +398,10 @@ void gfs2_recover_func(struct work_struct *work)
fs_info(sdp, "jid=%u: Replaying journal...0x%x to 0x%x\n",
jd->jd_jid, head.lh_tail, head.lh_blkno);
+ /* We take the sd_log_flush_lock here primarily to prevent log
+ * flushes and simultaneous journal replays from stomping on
+ * each other wrt sd_log_bio. */
+ down_read(&sdp->sd_log_flush_lock);
for (pass = 0; pass < 2; pass++) {
lops_before_scan(jd, &head, pass);
error = foreach_descriptor(jd, head.lh_tail,
@@ -403,6 +412,7 @@ void gfs2_recover_func(struct work_struct *work)
}
clean_journal(jd, &head);
+ up_read(&sdp->sd_log_flush_lock);
gfs2_glock_dq_uninit(&thaw_gh);
t_rep = ktime_get();
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index e7bf91ec231c..a321c34e3d6e 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -457,24 +457,24 @@ void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd)
}
if (count[0] != rgd->rd_free) {
- if (gfs2_consist_rgrpd(rgd))
- fs_err(sdp, "free data mismatch: %u != %u\n",
- count[0], rgd->rd_free);
+ gfs2_lm(sdp, "free data mismatch: %u != %u\n",
+ count[0], rgd->rd_free);
+ gfs2_consist_rgrpd(rgd);
return;
}
tmp = rgd->rd_data - rgd->rd_free - rgd->rd_dinodes;
if (count[1] != tmp) {
- if (gfs2_consist_rgrpd(rgd))
- fs_err(sdp, "used data mismatch: %u != %u\n",
- count[1], tmp);
+ gfs2_lm(sdp, "used data mismatch: %u != %u\n",
+ count[1], tmp);
+ gfs2_consist_rgrpd(rgd);
return;
}
if (count[2] + count[3] != rgd->rd_dinodes) {
- if (gfs2_consist_rgrpd(rgd))
- fs_err(sdp, "used metadata mismatch: %u != %u\n",
- count[2] + count[3], rgd->rd_dinodes);
+ gfs2_lm(sdp, "used metadata mismatch: %u != %u\n",
+ count[2] + count[3], rgd->rd_dinodes);
+ gfs2_consist_rgrpd(rgd);
return;
}
}
@@ -590,16 +590,6 @@ void gfs2_free_clones(struct gfs2_rgrpd *rgd)
}
}
-/**
- * gfs2_rsqa_alloc - make sure we have a reservation assigned to the inode
- * plus a quota allocations data structure, if necessary
- * @ip: the inode for this reservation
- */
-int gfs2_rsqa_alloc(struct gfs2_inode *ip)
-{
- return gfs2_qa_alloc(ip);
-}
-
static void dump_rs(struct seq_file *seq, const struct gfs2_blkreserv *rs,
const char *fs_id_buf)
{
@@ -672,18 +662,17 @@ void gfs2_rs_deltree(struct gfs2_blkreserv *rs)
}
/**
- * gfs2_rsqa_delete - delete a multi-block reservation and quota allocation
+ * gfs2_rs_delete - delete a multi-block reservation
* @ip: The inode for this reservation
* @wcount: The inode's write count, or NULL
*
*/
-void gfs2_rsqa_delete(struct gfs2_inode *ip, atomic_t *wcount)
+void gfs2_rs_delete(struct gfs2_inode *ip, atomic_t *wcount)
{
down_write(&ip->i_rw_mutex);
if ((wcount == NULL) || (atomic_read(wcount) <= 1))
gfs2_rs_deltree(&ip->i_res);
up_write(&ip->i_rw_mutex);
- gfs2_qa_delete(ip, wcount);
}
/**
@@ -720,8 +709,12 @@ void gfs2_clear_rgrpd(struct gfs2_sbd *sdp)
rb_erase(n, &sdp->sd_rindex_tree);
if (gl) {
- glock_clear_object(gl, rgd);
+ if (gl->gl_state != LM_ST_UNLOCKED) {
+ gfs2_glock_cb(gl, LM_ST_UNLOCKED);
+ flush_delayed_work(&gl->gl_work);
+ }
gfs2_rgrp_brelse(rgd);
+ glock_clear_object(gl, rgd);
gfs2_glock_put(gl);
}
@@ -733,17 +726,6 @@ void gfs2_clear_rgrpd(struct gfs2_sbd *sdp)
}
}
-static void gfs2_rindex_print(const struct gfs2_rgrpd *rgd)
-{
- struct gfs2_sbd *sdp = rgd->rd_sbd;
-
- fs_info(sdp, "ri_addr = %llu\n", (unsigned long long)rgd->rd_addr);
- fs_info(sdp, "ri_length = %u\n", rgd->rd_length);
- fs_info(sdp, "ri_data0 = %llu\n", (unsigned long long)rgd->rd_data0);
- fs_info(sdp, "ri_data = %u\n", rgd->rd_data);
- fs_info(sdp, "ri_bitbytes = %u\n", rgd->rd_bitbytes);
-}
-
/**
* gfs2_compute_bitstructs - Compute the bitmap sizes
* @rgd: The resource group descriptor
@@ -814,11 +796,20 @@ static int compute_bitstructs(struct gfs2_rgrpd *rgd)
}
bi = rgd->rd_bits + (length - 1);
if ((bi->bi_start + bi->bi_bytes) * GFS2_NBBY != rgd->rd_data) {
- if (gfs2_consist_rgrpd(rgd)) {
- gfs2_rindex_print(rgd);
- fs_err(sdp, "start=%u len=%u offset=%u\n",
- bi->bi_start, bi->bi_bytes, bi->bi_offset);
- }
+ gfs2_lm(sdp,
+ "ri_addr = %llu\n"
+ "ri_length = %u\n"
+ "ri_data0 = %llu\n"
+ "ri_data = %u\n"
+ "ri_bitbytes = %u\n"
+ "start=%u len=%u offset=%u\n",
+ (unsigned long long)rgd->rd_addr,
+ rgd->rd_length,
+ (unsigned long long)rgd->rd_data0,
+ rgd->rd_data,
+ rgd->rd_bitbytes,
+ bi->bi_start, bi->bi_bytes, bi->bi_offset);
+ gfs2_consist_rgrpd(rgd);
return -EIO;
}
@@ -1286,23 +1277,6 @@ void gfs2_rgrp_brelse(struct gfs2_rgrpd *rgd)
bi->bi_bh = NULL;
}
}
-
-}
-
-/**
- * gfs2_rgrp_go_unlock - Unlock a rgrp glock
- * @gh: The glock holder for the resource group
- *
- */
-
-void gfs2_rgrp_go_unlock(struct gfs2_holder *gh)
-{
- struct gfs2_rgrpd *rgd = gh->gh_gl->gl_object;
- int demote_requested = test_bit(GLF_DEMOTE, &gh->gh_gl->gl_flags) |
- test_bit(GLF_PENDING_DEMOTE, &gh->gh_gl->gl_flags);
-
- if (rgd && demote_requested)
- gfs2_rgrp_brelse(rgd);
}
int gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
@@ -1832,10 +1806,8 @@ static void try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, u64 skip
struct gfs2_rbm rbm = { .rgd = rgd, .bii = 0, .offset = 0 };
while (1) {
- down_write(&sdp->sd_log_flush_lock);
error = gfs2_rbm_find(&rbm, GFS2_BLKST_UNLINKED, NULL, NULL,
true);
- up_write(&sdp->sd_log_flush_lock);
if (error == -ENOSPC)
break;
if (WARN_ON_ONCE(error))
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
index c14a673ae36f..a1d7e14fc55b 100644
--- a/fs/gfs2/rgrp.h
+++ b/fs/gfs2/rgrp.h
@@ -33,7 +33,6 @@ extern int gfs2_rindex_update(struct gfs2_sbd *sdp);
extern void gfs2_free_clones(struct gfs2_rgrpd *rgd);
extern int gfs2_rgrp_go_lock(struct gfs2_holder *gh);
extern void gfs2_rgrp_brelse(struct gfs2_rgrpd *rgd);
-extern void gfs2_rgrp_go_unlock(struct gfs2_holder *gh);
extern struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip);
@@ -45,9 +44,8 @@ extern void gfs2_inplace_release(struct gfs2_inode *ip);
extern int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *n,
bool dinode, u64 *generation);
-extern int gfs2_rsqa_alloc(struct gfs2_inode *ip);
extern void gfs2_rs_deltree(struct gfs2_blkreserv *rs);
-extern void gfs2_rsqa_delete(struct gfs2_inode *ip, atomic_t *wcount);
+extern void gfs2_rs_delete(struct gfs2_inode *ip, atomic_t *wcount);
extern void __gfs2_free_blocks(struct gfs2_inode *ip, struct gfs2_rgrpd *rgd,
u64 bstart, u32 blen, int meta);
extern void gfs2_free_meta(struct gfs2_inode *ip, struct gfs2_rgrpd *rgd,
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 68cc7c291a81..37fc41632aa2 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -61,11 +61,13 @@ void gfs2_jindex_free(struct gfs2_sbd *sdp)
sdp->sd_journals = 0;
spin_unlock(&sdp->sd_jindex_spin);
+ sdp->sd_jdesc = NULL;
while (!list_empty(&list)) {
- jd = list_entry(list.next, struct gfs2_jdesc, jd_list);
+ jd = list_first_entry(&list, struct gfs2_jdesc, jd_list);
gfs2_free_journal_extents(jd);
list_del(&jd->jd_list);
iput(jd->jd_inode);
+ jd->jd_inode = NULL;
kfree(jd);
}
}
@@ -171,9 +173,13 @@ int gfs2_make_fs_rw(struct gfs2_sbd *sdp)
goto fail_threads;
j_gl->gl_ops->go_inval(j_gl, DIO_METADATA);
+ if (gfs2_withdrawn(sdp)) {
+ error = -EIO;
+ goto fail;
+ }
error = gfs2_find_jhead(sdp->sd_jdesc, &head, false);
- if (error)
+ if (error || gfs2_withdrawn(sdp))
goto fail;
if (!(head.lh_flags & GFS2_LOG_HEAD_UNMOUNT)) {
@@ -187,7 +193,7 @@ int gfs2_make_fs_rw(struct gfs2_sbd *sdp)
gfs2_log_pointers_init(sdp, head.lh_blkno);
error = gfs2_quota_init(sdp);
- if (error)
+ if (error || gfs2_withdrawn(sdp))
goto fail;
set_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags);
@@ -446,7 +452,7 @@ static int gfs2_lock_fs_check_clean(struct gfs2_sbd *sdp)
out:
while (!list_empty(&list)) {
- lfcc = list_entry(list.next, struct lfcc, list);
+ lfcc = list_first_entry(&list, struct lfcc, list);
list_del(&lfcc->list);
gfs2_glock_dq_uninit(&lfcc->gh);
kfree(lfcc);
@@ -599,34 +605,63 @@ out:
int gfs2_make_fs_ro(struct gfs2_sbd *sdp)
{
struct gfs2_holder freeze_gh;
- int error;
-
- error = gfs2_glock_nq_init(sdp->sd_freeze_gl, LM_ST_SHARED, GL_NOCACHE,
- &freeze_gh);
- if (error && !gfs2_withdrawn(sdp))
- return error;
+ int error = 0;
+ int log_write_allowed = test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags);
+
+ gfs2_holder_mark_uninitialized(&freeze_gh);
+ if (sdp->sd_freeze_gl &&
+ !gfs2_glock_is_locked_by_me(sdp->sd_freeze_gl)) {
+ if (!log_write_allowed) {
+ error = gfs2_glock_nq_init(sdp->sd_freeze_gl,
+ LM_ST_SHARED, GL_NOCACHE |
+ LM_FLAG_TRY, &freeze_gh);
+ if (error == GLR_TRYFAILED)
+ error = 0;
+ } else {
+ error = gfs2_glock_nq_init(sdp->sd_freeze_gl,
+ LM_ST_SHARED, GL_NOCACHE,
+ &freeze_gh);
+ if (error && !gfs2_withdrawn(sdp))
+ return error;
+ }
+ }
flush_workqueue(gfs2_delete_workqueue);
- if (sdp->sd_quotad_process)
+ if (!log_write_allowed && current == sdp->sd_quotad_process)
+ fs_warn(sdp, "The quotad daemon is withdrawing.\n");
+ else if (sdp->sd_quotad_process)
kthread_stop(sdp->sd_quotad_process);
sdp->sd_quotad_process = NULL;
- if (sdp->sd_logd_process)
+
+ if (!log_write_allowed && current == sdp->sd_logd_process)
+ fs_warn(sdp, "The logd daemon is withdrawing.\n");
+ else if (sdp->sd_logd_process)
kthread_stop(sdp->sd_logd_process);
sdp->sd_logd_process = NULL;
- gfs2_quota_sync(sdp->sd_vfs, 0);
- gfs2_statfs_sync(sdp->sd_vfs, 0);
-
- gfs2_log_flush(sdp, NULL, GFS2_LOG_HEAD_FLUSH_SHUTDOWN |
- GFS2_LFC_MAKE_FS_RO);
- wait_event(sdp->sd_reserving_log_wait, atomic_read(&sdp->sd_reserving_log) == 0);
- gfs2_assert_warn(sdp, atomic_read(&sdp->sd_log_blks_free) == sdp->sd_jdesc->jd_blocks);
+ if (log_write_allowed) {
+ gfs2_quota_sync(sdp->sd_vfs, 0);
+ gfs2_statfs_sync(sdp->sd_vfs, 0);
+ gfs2_log_flush(sdp, NULL, GFS2_LOG_HEAD_FLUSH_SHUTDOWN |
+ GFS2_LFC_MAKE_FS_RO);
+ wait_event(sdp->sd_reserving_log_wait,
+ atomic_read(&sdp->sd_reserving_log) == 0);
+ gfs2_assert_warn(sdp, atomic_read(&sdp->sd_log_blks_free) ==
+ sdp->sd_jdesc->jd_blocks);
+ } else {
+ wait_event_timeout(sdp->sd_reserving_log_wait,
+ atomic_read(&sdp->sd_reserving_log) == 0,
+ HZ * 5);
+ }
if (gfs2_holder_initialized(&freeze_gh))
gfs2_glock_dq_uninit(&freeze_gh);
gfs2_quota_cleanup(sdp);
+ if (!log_write_allowed)
+ sdp->sd_vfs->s_flags |= SB_RDONLY;
+
return error;
}
@@ -677,8 +712,10 @@ restart:
gfs2_glock_put(sdp->sd_freeze_gl);
if (!sdp->sd_args.ar_spectator) {
- gfs2_glock_dq_uninit(&sdp->sd_journal_gh);
- gfs2_glock_dq_uninit(&sdp->sd_jinode_gh);
+ if (gfs2_holder_initialized(&sdp->sd_journal_gh))
+ gfs2_glock_dq_uninit(&sdp->sd_journal_gh);
+ if (gfs2_holder_initialized(&sdp->sd_jinode_gh))
+ gfs2_glock_dq_uninit(&sdp->sd_jinode_gh);
gfs2_glock_dq_uninit(&sdp->sd_sc_gh);
gfs2_glock_dq_uninit(&sdp->sd_qc_gh);
iput(sdp->sd_sc_inode);
@@ -1356,14 +1393,6 @@ out_unlock:
if (gfs2_rs_active(&ip->i_res))
gfs2_rs_deltree(&ip->i_res);
- if (gfs2_holder_initialized(&ip->i_iopen_gh)) {
- glock_clear_object(ip->i_iopen_gh.gh_gl, ip);
- if (test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags)) {
- ip->i_iopen_gh.gh_flags |= GL_NOCACHE;
- gfs2_glock_dq(&ip->i_iopen_gh);
- }
- gfs2_holder_uninit(&ip->i_iopen_gh);
- }
if (gfs2_holder_initialized(&gh)) {
glock_clear_object(ip->i_gl, ip);
gfs2_glock_dq_uninit(&gh);
@@ -1372,22 +1401,30 @@ out_unlock:
fs_warn(sdp, "gfs2_evict_inode: %d\n", error);
out:
truncate_inode_pages_final(&inode->i_data);
- gfs2_rsqa_delete(ip, NULL);
+ if (ip->i_qadata)
+ gfs2_assert_warn(sdp, ip->i_qadata->qa_ref == 0);
+ gfs2_rs_delete(ip, NULL);
+ gfs2_qa_put(ip);
gfs2_ordered_del_inode(ip);
clear_inode(inode);
gfs2_dir_hash_inval(ip);
- glock_clear_object(ip->i_gl, ip);
- wait_on_bit_io(&ip->i_flags, GIF_GLOP_PENDING, TASK_UNINTERRUPTIBLE);
- gfs2_glock_add_to_lru(ip->i_gl);
- gfs2_glock_put_eventually(ip->i_gl);
- ip->i_gl = NULL;
+ if (ip->i_gl) {
+ glock_clear_object(ip->i_gl, ip);
+ wait_on_bit_io(&ip->i_flags, GIF_GLOP_PENDING, TASK_UNINTERRUPTIBLE);
+ gfs2_glock_add_to_lru(ip->i_gl);
+ gfs2_glock_put_eventually(ip->i_gl);
+ ip->i_gl = NULL;
+ }
if (gfs2_holder_initialized(&ip->i_iopen_gh)) {
struct gfs2_glock *gl = ip->i_iopen_gh.gh_gl;
glock_clear_object(gl, ip);
- ip->i_iopen_gh.gh_flags |= GL_NOCACHE;
+ if (test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags)) {
+ ip->i_iopen_gh.gh_flags |= GL_NOCACHE;
+ gfs2_glock_dq(&ip->i_iopen_gh);
+ }
gfs2_glock_hold(gl);
- gfs2_glock_dq_uninit(&ip->i_iopen_gh);
+ gfs2_holder_uninit(&ip->i_iopen_gh);
gfs2_glock_put_eventually(gl);
}
}
@@ -1401,6 +1438,7 @@ static struct inode *gfs2_alloc_inode(struct super_block *sb)
return NULL;
ip->i_flags = 0;
ip->i_gl = NULL;
+ gfs2_holder_mark_uninitialized(&ip->i_iopen_gh);
memset(&ip->i_res, 0, sizeof(ip->i_res));
RB_CLEAR_NODE(&ip->i_res.rs_node);
ip->i_rahead = 0;
diff --git a/fs/gfs2/super.h b/fs/gfs2/super.h
index b8bf811a1305..51900554ed81 100644
--- a/fs/gfs2/super.h
+++ b/fs/gfs2/super.h
@@ -26,7 +26,6 @@ extern void gfs2_jindex_free(struct gfs2_sbd *sdp);
extern struct gfs2_jdesc *gfs2_jdesc_find(struct gfs2_sbd *sdp, unsigned int jid);
extern int gfs2_jdesc_check(struct gfs2_jdesc *jd);
-
extern int gfs2_lookup_in_master_dir(struct gfs2_sbd *sdp, char *filename,
struct gfs2_inode **ipp);
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index 8ccb68f4ed16..d28c41bd69b0 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -136,7 +136,8 @@ static ssize_t withdraw_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
if (val != 1)
return -EINVAL;
- gfs2_lm_withdraw(sdp, "withdrawing from cluster at user's request\n");
+ gfs2_lm(sdp, "withdrawing from cluster at user's request\n");
+ gfs2_withdraw(sdp);
return len;
}
@@ -434,6 +435,8 @@ int gfs2_recover_set(struct gfs2_sbd *sdp, unsigned jid)
* never clear the DFL_BLOCK_LOCKS flag, so all our locks would
* permanently stop working.
*/
+ if (!sdp->sd_jdesc)
+ goto out;
if (sdp->sd_jdesc->jd_jid == jid && !sdp->sd_args.ar_spectator)
goto out;
rv = -ENOENT;
diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c
index a685637a5b55..ffe840505082 100644
--- a/fs/gfs2/trans.c
+++ b/fs/gfs2/trans.c
@@ -228,6 +228,10 @@ void gfs2_trans_add_meta(struct gfs2_glock *gl, struct buffer_head *bh)
fs_info(sdp, "GFS2:adding buf while frozen\n");
gfs2_assert_withdraw(sdp, 0);
}
+ if (unlikely(gfs2_withdrawn(sdp))) {
+ fs_info(sdp, "GFS2:adding buf while withdrawn! 0x%llx\n",
+ (unsigned long long)bd->bd_bh->b_blocknr);
+ }
gfs2_pin(sdp, bd->bd_bh);
mh->__pad0 = cpu_to_be64(0);
mh->mh_jid = cpu_to_be32(sdp->sd_jdesc->jd_jid);
diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c
index ec600b487498..9b64d40ab379 100644
--- a/fs/gfs2/util.c
+++ b/fs/gfs2/util.c
@@ -11,12 +11,18 @@
#include <linux/buffer_head.h>
#include <linux/crc32.h>
#include <linux/gfs2_ondisk.h>
+#include <linux/delay.h>
#include <linux/uaccess.h>
#include "gfs2.h"
#include "incore.h"
#include "glock.h"
+#include "glops.h"
+#include "log.h"
+#include "lops.h"
+#include "recovery.h"
#include "rgrp.h"
+#include "super.h"
#include "util.h"
struct kmem_cache *gfs2_glock_cachep __read_mostly;
@@ -33,32 +39,257 @@ void gfs2_assert_i(struct gfs2_sbd *sdp)
fs_emerg(sdp, "fatal assertion failed\n");
}
-int gfs2_lm_withdraw(struct gfs2_sbd *sdp, const char *fmt, ...)
+/**
+ * check_journal_clean - Make sure a journal is clean for a spectator mount
+ * @sdp: The GFS2 superblock
+ * @jd: The journal descriptor
+ *
+ * Returns: 0 if the journal is clean or locked, else an error
+ */
+int check_journal_clean(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd,
+ bool verbose)
+{
+ int error;
+ struct gfs2_holder j_gh;
+ struct gfs2_log_header_host head;
+ struct gfs2_inode *ip;
+
+ ip = GFS2_I(jd->jd_inode);
+ error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_NOEXP |
+ GL_EXACT | GL_NOCACHE, &j_gh);
+ if (error) {
+ if (verbose)
+ fs_err(sdp, "Error %d locking journal for spectator "
+ "mount.\n", error);
+ return -EPERM;
+ }
+ error = gfs2_jdesc_check(jd);
+ if (error) {
+ if (verbose)
+ fs_err(sdp, "Error checking journal for spectator "
+ "mount.\n");
+ goto out_unlock;
+ }
+ error = gfs2_find_jhead(jd, &head, false);
+ if (error) {
+ if (verbose)
+ fs_err(sdp, "Error parsing journal for spectator "
+ "mount.\n");
+ goto out_unlock;
+ }
+ if (!(head.lh_flags & GFS2_LOG_HEAD_UNMOUNT)) {
+ error = -EPERM;
+ if (verbose)
+ fs_err(sdp, "jid=%u: Journal is dirty, so the first "
+ "mounter must not be a spectator.\n",
+ jd->jd_jid);
+ }
+
+out_unlock:
+ gfs2_glock_dq_uninit(&j_gh);
+ return error;
+}
+
+static void signal_our_withdraw(struct gfs2_sbd *sdp)
+{
+ struct gfs2_glock *gl = sdp->sd_live_gh.gh_gl;
+ struct inode *inode = sdp->sd_jdesc->jd_inode;
+ struct gfs2_inode *ip = GFS2_I(inode);
+ u64 no_formal_ino = ip->i_no_formal_ino;
+ int ret = 0;
+ int tries;
+
+ if (test_bit(SDF_NORECOVERY, &sdp->sd_flags))
+ return;
+
+ /* Prevent any glock dq until withdraw recovery is complete */
+ set_bit(SDF_WITHDRAW_RECOVERY, &sdp->sd_flags);
+ /*
+ * Don't tell dlm we're bailing until we have no more buffers in the
+ * wind. If journal had an IO error, the log code should just purge
+ * the outstanding buffers rather than submitting new IO. Making the
+ * file system read-only will flush the journal, etc.
+ *
+ * During a normal unmount, gfs2_make_fs_ro calls gfs2_log_shutdown
+ * which clears SDF_JOURNAL_LIVE. In a withdraw, we must not write
+ * any UNMOUNT log header, so we can't call gfs2_log_shutdown, and
+ * therefore we need to clear SDF_JOURNAL_LIVE manually.
+ */
+ clear_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags);
+ if (!sb_rdonly(sdp->sd_vfs))
+ ret = gfs2_make_fs_ro(sdp);
+
+ /*
+ * Drop the glock for our journal so another node can recover it.
+ */
+ if (gfs2_holder_initialized(&sdp->sd_journal_gh)) {
+ gfs2_glock_dq_wait(&sdp->sd_journal_gh);
+ gfs2_holder_uninit(&sdp->sd_journal_gh);
+ }
+ sdp->sd_jinode_gh.gh_flags |= GL_NOCACHE;
+ gfs2_glock_dq(&sdp->sd_jinode_gh);
+ if (test_bit(SDF_FS_FROZEN, &sdp->sd_flags)) {
+ /* Make sure gfs2_unfreeze works if partially-frozen */
+ flush_workqueue(gfs2_freeze_wq);
+ atomic_set(&sdp->sd_freeze_state, SFS_FROZEN);
+ thaw_super(sdp->sd_vfs);
+ } else {
+ wait_on_bit(&gl->gl_flags, GLF_DEMOTE, TASK_UNINTERRUPTIBLE);
+ }
+
+ /*
+ * holder_uninit to force glock_put, to force dlm to let go
+ */
+ gfs2_holder_uninit(&sdp->sd_jinode_gh);
+
+ /*
+ * Note: We need to be careful here:
+ * Our iput of jd_inode will evict it. The evict will dequeue its
+ * glock, but the glock dq will wait for the withdraw unless we have
+ * exception code in glock_dq.
+ */
+ iput(inode);
+ /*
+ * Wait until the journal inode's glock is freed. This allows try locks
+ * on other nodes to be successful, otherwise we remain the owner of
+ * the glock as far as dlm is concerned.
+ */
+ if (gl->gl_ops->go_free) {
+ set_bit(GLF_FREEING, &gl->gl_flags);
+ wait_on_bit(&gl->gl_flags, GLF_FREEING, TASK_UNINTERRUPTIBLE);
+ }
+
+ if (sdp->sd_lockstruct.ls_ops->lm_lock == NULL) { /* lock_nolock */
+ clear_bit(SDF_WITHDRAW_RECOVERY, &sdp->sd_flags);
+ goto skip_recovery;
+ }
+ /*
+ * Dequeue the "live" glock, but keep a reference so it's never freed.
+ */
+ gfs2_glock_hold(gl);
+ gfs2_glock_dq_wait(&sdp->sd_live_gh);
+ /*
+ * We enqueue the "live" glock in EX so that all other nodes
+ * get a demote request and act on it. We don't really want the
+ * lock in EX, so we send a "try" lock with 1CB to produce a callback.
+ */
+ fs_warn(sdp, "Requesting recovery of jid %d.\n",
+ sdp->sd_lockstruct.ls_jid);
+ gfs2_holder_reinit(LM_ST_EXCLUSIVE, LM_FLAG_TRY_1CB | LM_FLAG_NOEXP,
+ &sdp->sd_live_gh);
+ msleep(GL_GLOCK_MAX_HOLD);
+ /*
+ * This will likely fail in a cluster, but succeed standalone:
+ */
+ ret = gfs2_glock_nq(&sdp->sd_live_gh);
+
+ /*
+ * If we actually got the "live" lock in EX mode, there are no other
+ * nodes available to replay our journal. So we try to replay it
+ * ourselves. We hold the "live" glock to prevent other mounters
+ * during recovery, then just dequeue it and reacquire it in our
+ * normal SH mode. Just in case the problem that caused us to
+ * withdraw prevents us from recovering our journal (e.g. io errors
+ * and such) we still check if the journal is clean before proceeding
+ * but we may wait forever until another mounter does the recovery.
+ */
+ if (ret == 0) {
+ fs_warn(sdp, "No other mounters found. Trying to recover our "
+ "own journal jid %d.\n", sdp->sd_lockstruct.ls_jid);
+ if (gfs2_recover_journal(sdp->sd_jdesc, 1))
+ fs_warn(sdp, "Unable to recover our journal jid %d.\n",
+ sdp->sd_lockstruct.ls_jid);
+ gfs2_glock_dq_wait(&sdp->sd_live_gh);
+ gfs2_holder_reinit(LM_ST_SHARED, LM_FLAG_NOEXP | GL_EXACT,
+ &sdp->sd_live_gh);
+ gfs2_glock_nq(&sdp->sd_live_gh);
+ }
+
+ gfs2_glock_queue_put(gl); /* drop the extra reference we acquired */
+ clear_bit(SDF_WITHDRAW_RECOVERY, &sdp->sd_flags);
+
+ /*
+ * At this point our journal is evicted, so we need to get a new inode
+ * for it. Once done, we need to call gfs2_find_jhead which
+ * calls gfs2_map_journal_extents to map it for us again.
+ *
+ * Note that we don't really want it to look up a FREE block. The
+ * GFS2_BLKST_FREE simply overrides a block check in gfs2_inode_lookup
+ * which would otherwise fail because it requires grabbing an rgrp
+ * glock, which would fail with -EIO because we're withdrawing.
+ */
+ inode = gfs2_inode_lookup(sdp->sd_vfs, DT_UNKNOWN,
+ sdp->sd_jdesc->jd_no_addr, no_formal_ino,
+ GFS2_BLKST_FREE);
+ if (IS_ERR(inode)) {
+ fs_warn(sdp, "Reprocessing of jid %d failed with %ld.\n",
+ sdp->sd_lockstruct.ls_jid, PTR_ERR(inode));
+ goto skip_recovery;
+ }
+ sdp->sd_jdesc->jd_inode = inode;
+
+ /*
+ * Now wait until recovery is complete.
+ */
+ for (tries = 0; tries < 10; tries++) {
+ ret = check_journal_clean(sdp, sdp->sd_jdesc, false);
+ if (!ret)
+ break;
+ msleep(HZ);
+ fs_warn(sdp, "Waiting for journal recovery jid %d.\n",
+ sdp->sd_lockstruct.ls_jid);
+ }
+skip_recovery:
+ if (!ret)
+ fs_warn(sdp, "Journal recovery complete for jid %d.\n",
+ sdp->sd_lockstruct.ls_jid);
+ else
+ fs_warn(sdp, "Journal recovery skipped for %d until next "
+ "mount.\n", sdp->sd_lockstruct.ls_jid);
+ fs_warn(sdp, "Glock dequeues delayed: %lu\n", sdp->sd_glock_dqs_held);
+ sdp->sd_glock_dqs_held = 0;
+ wake_up_bit(&sdp->sd_flags, SDF_WITHDRAW_RECOVERY);
+}
+
+void gfs2_lm(struct gfs2_sbd *sdp, const char *fmt, ...)
{
- struct lm_lockstruct *ls = &sdp->sd_lockstruct;
- const struct lm_lockops *lm = ls->ls_ops;
- va_list args;
struct va_format vaf;
+ va_list args;
if (sdp->sd_args.ar_errors == GFS2_ERRORS_WITHDRAW &&
- test_and_set_bit(SDF_WITHDRAWN, &sdp->sd_flags))
- return 0;
-
- if (fmt) {
- va_start(args, fmt);
+ test_bit(SDF_WITHDRAWN, &sdp->sd_flags))
+ return;
+
+ va_start(args, fmt);
+ vaf.fmt = fmt;
+ vaf.va = &args;
+ fs_err(sdp, "%pV", &vaf);
+ va_end(args);
+}
- vaf.fmt = fmt;
- vaf.va = &args;
+int gfs2_withdraw(struct gfs2_sbd *sdp)
+{
+ struct lm_lockstruct *ls = &sdp->sd_lockstruct;
+ const struct lm_lockops *lm = ls->ls_ops;
- fs_err(sdp, "%pV", &vaf);
+ if (sdp->sd_args.ar_errors == GFS2_ERRORS_WITHDRAW &&
+ test_and_set_bit(SDF_WITHDRAWN, &sdp->sd_flags)) {
+ if (!test_bit(SDF_WITHDRAW_IN_PROG, &sdp->sd_flags))
+ return -1;
- va_end(args);
+ wait_on_bit(&sdp->sd_flags, SDF_WITHDRAW_IN_PROG,
+ TASK_UNINTERRUPTIBLE);
+ return -1;
}
+ set_bit(SDF_WITHDRAW_IN_PROG, &sdp->sd_flags);
+
if (sdp->sd_args.ar_errors == GFS2_ERRORS_WITHDRAW) {
fs_err(sdp, "about to withdraw this file system\n");
BUG_ON(sdp->sd_args.ar_debug);
+ signal_our_withdraw(sdp);
+
kobject_uevent(&sdp->sd_kobj, KOBJ_OFFLINE);
if (!strcmp(sdp->sd_lockstruct.ls_ops->lm_proto_name, "lock_dlm"))
@@ -69,8 +300,11 @@ int gfs2_lm_withdraw(struct gfs2_sbd *sdp, const char *fmt, ...)
lm->lm_unmount(sdp);
}
set_bit(SDF_SKIP_DLM_UNLOCK, &sdp->sd_flags);
- fs_err(sdp, "withdrawn\n");
+ fs_err(sdp, "File system withdrawn\n");
dump_stack();
+ clear_bit(SDF_WITHDRAW_IN_PROG, &sdp->sd_flags);
+ smp_mb__after_atomic();
+ wake_up_bit(&sdp->sd_flags, SDF_WITHDRAW_IN_PROG);
}
if (sdp->sd_args.ar_errors == GFS2_ERRORS_PANIC)
@@ -81,35 +315,45 @@ int gfs2_lm_withdraw(struct gfs2_sbd *sdp, const char *fmt, ...)
/**
* gfs2_assert_withdraw_i - Cause the machine to withdraw if @assertion is false
- * Returns: -1 if this call withdrew the machine,
- * -2 if it was already withdrawn
*/
-int gfs2_assert_withdraw_i(struct gfs2_sbd *sdp, char *assertion,
- const char *function, char *file, unsigned int line)
+void gfs2_assert_withdraw_i(struct gfs2_sbd *sdp, char *assertion,
+ const char *function, char *file, unsigned int line,
+ bool delayed)
{
- int me;
- me = gfs2_lm_withdraw(sdp,
- "fatal: assertion \"%s\" failed\n"
- " function = %s, file = %s, line = %u\n",
- assertion, function, file, line);
+ if (gfs2_withdrawn(sdp))
+ return;
+
+ fs_err(sdp,
+ "fatal: assertion \"%s\" failed\n"
+ " function = %s, file = %s, line = %u\n",
+ assertion, function, file, line);
+
+ /*
+ * If errors=panic was specified on mount, it won't help to delay the
+ * withdraw.
+ */
+ if (sdp->sd_args.ar_errors == GFS2_ERRORS_PANIC)
+ delayed = false;
+
+ if (delayed)
+ gfs2_withdraw_delayed(sdp);
+ else
+ gfs2_withdraw(sdp);
dump_stack();
- return (me) ? -1 : -2;
}
/**
* gfs2_assert_warn_i - Print a message to the console if @assertion is false
- * Returns: -1 if we printed something
- * -2 if we didn't
*/
-int gfs2_assert_warn_i(struct gfs2_sbd *sdp, char *assertion,
- const char *function, char *file, unsigned int line)
+void gfs2_assert_warn_i(struct gfs2_sbd *sdp, char *assertion,
+ const char *function, char *file, unsigned int line)
{
if (time_before(jiffies,
sdp->sd_last_warning +
gfs2_tune_get(sdp, gt_complain_secs) * HZ))
- return -2;
+ return;
if (sdp->sd_args.ar_errors == GFS2_ERRORS_WITHDRAW)
fs_warn(sdp, "warning: assertion \"%s\" failed at function = %s, file = %s, line = %u\n",
@@ -127,69 +371,59 @@ int gfs2_assert_warn_i(struct gfs2_sbd *sdp, char *assertion,
sdp->sd_fsname, function, file, line);
sdp->sd_last_warning = jiffies;
-
- return -1;
}
/**
* gfs2_consist_i - Flag a filesystem consistency error and withdraw
- * Returns: -1 if this call withdrew the machine,
- * 0 if it was already withdrawn
*/
-int gfs2_consist_i(struct gfs2_sbd *sdp, int cluster_wide, const char *function,
- char *file, unsigned int line)
+void gfs2_consist_i(struct gfs2_sbd *sdp, const char *function,
+ char *file, unsigned int line)
{
- int rv;
- rv = gfs2_lm_withdraw(sdp,
- "fatal: filesystem consistency error - function = %s, file = %s, line = %u\n",
- function, file, line);
- return rv;
+ gfs2_lm(sdp,
+ "fatal: filesystem consistency error - function = %s, file = %s, line = %u\n",
+ function, file, line);
+ gfs2_withdraw(sdp);
}
/**
* gfs2_consist_inode_i - Flag an inode consistency error and withdraw
- * Returns: -1 if this call withdrew the machine,
- * 0 if it was already withdrawn
*/
-int gfs2_consist_inode_i(struct gfs2_inode *ip, int cluster_wide,
- const char *function, char *file, unsigned int line)
+void gfs2_consist_inode_i(struct gfs2_inode *ip,
+ const char *function, char *file, unsigned int line)
{
struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
- int rv;
- rv = gfs2_lm_withdraw(sdp,
- "fatal: filesystem consistency error\n"
- " inode = %llu %llu\n"
- " function = %s, file = %s, line = %u\n",
- (unsigned long long)ip->i_no_formal_ino,
- (unsigned long long)ip->i_no_addr,
- function, file, line);
- return rv;
+
+ gfs2_lm(sdp,
+ "fatal: filesystem consistency error\n"
+ " inode = %llu %llu\n"
+ " function = %s, file = %s, line = %u\n",
+ (unsigned long long)ip->i_no_formal_ino,
+ (unsigned long long)ip->i_no_addr,
+ function, file, line);
+ gfs2_withdraw(sdp);
}
/**
* gfs2_consist_rgrpd_i - Flag a RG consistency error and withdraw
- * Returns: -1 if this call withdrew the machine,
- * 0 if it was already withdrawn
*/
-int gfs2_consist_rgrpd_i(struct gfs2_rgrpd *rgd, int cluster_wide,
- const char *function, char *file, unsigned int line)
+void gfs2_consist_rgrpd_i(struct gfs2_rgrpd *rgd,
+ const char *function, char *file, unsigned int line)
{
struct gfs2_sbd *sdp = rgd->rd_sbd;
char fs_id_buf[sizeof(sdp->sd_fsname) + 7];
- int rv;
sprintf(fs_id_buf, "fsid=%s: ", sdp->sd_fsname);
gfs2_rgrp_dump(NULL, rgd->rd_gl, fs_id_buf);
- rv = gfs2_lm_withdraw(sdp,
- "fatal: filesystem consistency error\n"
- " RG = %llu\n"
- " function = %s, file = %s, line = %u\n",
- (unsigned long long)rgd->rd_addr,
- function, file, line);
- return rv;
+ gfs2_lm(sdp,
+ "fatal: filesystem consistency error\n"
+ " RG = %llu\n"
+ " function = %s, file = %s, line = %u\n",
+ (unsigned long long)rgd->rd_addr,
+ function, file, line);
+ gfs2_withdraw(sdp);
}
/**
@@ -203,12 +437,14 @@ int gfs2_meta_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh,
unsigned int line)
{
int me;
- me = gfs2_lm_withdraw(sdp,
- "fatal: invalid metadata block\n"
- " bh = %llu (%s)\n"
- " function = %s, file = %s, line = %u\n",
- (unsigned long long)bh->b_blocknr, type,
- function, file, line);
+
+ gfs2_lm(sdp,
+ "fatal: invalid metadata block\n"
+ " bh = %llu (%s)\n"
+ " function = %s, file = %s, line = %u\n",
+ (unsigned long long)bh->b_blocknr, type,
+ function, file, line);
+ me = gfs2_withdraw(sdp);
return (me) ? -1 : -2;
}
@@ -223,12 +459,14 @@ int gfs2_metatype_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh,
char *file, unsigned int line)
{
int me;
- me = gfs2_lm_withdraw(sdp,
- "fatal: invalid metadata block\n"
- " bh = %llu (type: exp=%u, found=%u)\n"
- " function = %s, file = %s, line = %u\n",
- (unsigned long long)bh->b_blocknr, type, t,
- function, file, line);
+
+ gfs2_lm(sdp,
+ "fatal: invalid metadata block\n"
+ " bh = %llu (type: exp=%u, found=%u)\n"
+ " function = %s, file = %s, line = %u\n",
+ (unsigned long long)bh->b_blocknr, type, t,
+ function, file, line);
+ me = gfs2_withdraw(sdp);
return (me) ? -1 : -2;
}
@@ -241,12 +479,11 @@ int gfs2_metatype_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh,
int gfs2_io_error_i(struct gfs2_sbd *sdp, const char *function, char *file,
unsigned int line)
{
- int rv;
- rv = gfs2_lm_withdraw(sdp,
- "fatal: I/O error\n"
- " function = %s, file = %s, line = %u\n",
- function, file, line);
- return rv;
+ gfs2_lm(sdp,
+ "fatal: I/O error\n"
+ " function = %s, file = %s, line = %u\n",
+ function, file, line);
+ return gfs2_withdraw(sdp);
}
/**
@@ -258,14 +495,14 @@ void gfs2_io_error_bh_i(struct gfs2_sbd *sdp, struct buffer_head *bh,
const char *function, char *file, unsigned int line,
bool withdraw)
{
- if (!gfs2_withdrawn(sdp))
- fs_err(sdp,
- "fatal: I/O error\n"
- " block = %llu\n"
- " function = %s, file = %s, line = %u\n",
- (unsigned long long)bh->b_blocknr,
- function, file, line);
+ if (gfs2_withdrawn(sdp))
+ return;
+
+ fs_err(sdp, "fatal: I/O error\n"
+ " block = %llu\n"
+ " function = %s, file = %s, line = %u\n",
+ (unsigned long long)bh->b_blocknr, function, file, line);
if (withdraw)
- gfs2_lm_withdraw(sdp, NULL);
+ gfs2_withdraw(sdp);
}
diff --git a/fs/gfs2/util.h b/fs/gfs2/util.h
index f2702bc9837c..a3542560da6f 100644
--- a/fs/gfs2/util.h
+++ b/fs/gfs2/util.h
@@ -36,41 +36,59 @@ do { \
} while (0)
-int gfs2_assert_withdraw_i(struct gfs2_sbd *sdp, char *assertion,
- const char *function, char *file, unsigned int line);
+void gfs2_assert_withdraw_i(struct gfs2_sbd *sdp, char *assertion,
+ const char *function, char *file, unsigned int line,
+ bool delayed);
#define gfs2_assert_withdraw(sdp, assertion) \
-((likely(assertion)) ? 0 : gfs2_assert_withdraw_i((sdp), #assertion, \
- __func__, __FILE__, __LINE__))
-
-
-int gfs2_assert_warn_i(struct gfs2_sbd *sdp, char *assertion,
- const char *function, char *file, unsigned int line);
+ ({ \
+ bool _bool = (assertion); \
+ if (unlikely(!_bool)) \
+ gfs2_assert_withdraw_i((sdp), #assertion, \
+ __func__, __FILE__, __LINE__, false); \
+ !_bool; \
+ })
+
+#define gfs2_assert_withdraw_delayed(sdp, assertion) \
+ ({ \
+ bool _bool = (assertion); \
+ if (unlikely(!_bool)) \
+ gfs2_assert_withdraw_i((sdp), #assertion, \
+ __func__, __FILE__, __LINE__, true); \
+ !_bool; \
+ })
+
+void gfs2_assert_warn_i(struct gfs2_sbd *sdp, char *assertion,
+ const char *function, char *file, unsigned int line);
#define gfs2_assert_warn(sdp, assertion) \
-((likely(assertion)) ? 0 : gfs2_assert_warn_i((sdp), #assertion, \
- __func__, __FILE__, __LINE__))
-
+ ({ \
+ bool _bool = (assertion); \
+ if (unlikely(!_bool)) \
+ gfs2_assert_warn_i((sdp), #assertion, \
+ __func__, __FILE__, __LINE__); \
+ !_bool; \
+ })
-int gfs2_consist_i(struct gfs2_sbd *sdp, int cluster_wide,
- const char *function, char *file, unsigned int line);
+void gfs2_consist_i(struct gfs2_sbd *sdp,
+ const char *function, char *file, unsigned int line);
#define gfs2_consist(sdp) \
-gfs2_consist_i((sdp), 0, __func__, __FILE__, __LINE__)
+gfs2_consist_i((sdp), __func__, __FILE__, __LINE__)
-int gfs2_consist_inode_i(struct gfs2_inode *ip, int cluster_wide,
- const char *function, char *file, unsigned int line);
+void gfs2_consist_inode_i(struct gfs2_inode *ip,
+ const char *function, char *file, unsigned int line);
#define gfs2_consist_inode(ip) \
-gfs2_consist_inode_i((ip), 0, __func__, __FILE__, __LINE__)
+gfs2_consist_inode_i((ip), __func__, __FILE__, __LINE__)
-int gfs2_consist_rgrpd_i(struct gfs2_rgrpd *rgd, int cluster_wide,
- const char *function, char *file, unsigned int line);
+void gfs2_consist_rgrpd_i(struct gfs2_rgrpd *rgd,
+ const char *function, char *file, unsigned int line);
#define gfs2_consist_rgrpd(rgd) \
-gfs2_consist_rgrpd_i((rgd), 0, __func__, __FILE__, __LINE__)
+gfs2_consist_rgrpd_i((rgd), __func__, __FILE__, __LINE__)
int gfs2_meta_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh,
@@ -129,6 +147,9 @@ static inline void gfs2_metatype_set(struct buffer_head *bh, u16 type,
int gfs2_io_error_i(struct gfs2_sbd *sdp, const char *function,
char *file, unsigned int line);
+extern int check_journal_clean(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd,
+ bool verbose);
+
#define gfs2_io_error(sdp) \
gfs2_io_error_i((sdp), __func__, __FILE__, __LINE__);
@@ -165,18 +186,29 @@ static inline unsigned int gfs2_tune_get_i(struct gfs2_tune *gt,
}
/**
+ * gfs2_withdraw_delayed - withdraw as soon as possible without deadlocks
+ * @sdp: the superblock
+ */
+static inline void gfs2_withdraw_delayed(struct gfs2_sbd *sdp)
+{
+ set_bit(SDF_WITHDRAWING, &sdp->sd_flags);
+}
+
+/**
* gfs2_withdrawn - test whether the file system is withdrawing or withdrawn
* @sdp: the superblock
*/
static inline bool gfs2_withdrawn(struct gfs2_sbd *sdp)
{
- return test_bit(SDF_WITHDRAWN, &sdp->sd_flags);
+ return test_bit(SDF_WITHDRAWN, &sdp->sd_flags) ||
+ test_bit(SDF_WITHDRAWING, &sdp->sd_flags);
}
#define gfs2_tune_get(sdp, field) \
gfs2_tune_get_i(&(sdp)->sd_tune, &(sdp)->sd_tune.field)
__printf(2, 3)
-int gfs2_lm_withdraw(struct gfs2_sbd *sdp, const char *fmt, ...);
+void gfs2_lm(struct gfs2_sbd *sdp, const char *fmt, ...);
+int gfs2_withdraw(struct gfs2_sbd *sdp);
#endif /* __UTIL_DOT_H__ */
diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c
index bbe593d16bea..9d7667bc4292 100644
--- a/fs/gfs2/xattr.c
+++ b/fs/gfs2/xattr.c
@@ -1222,7 +1222,7 @@ static int gfs2_xattr_set(const struct xattr_handler *handler,
struct gfs2_holder gh;
int ret;
- ret = gfs2_rsqa_alloc(ip);
+ ret = gfs2_qa_get(ip);
if (ret)
return ret;
@@ -1231,15 +1231,19 @@ static int gfs2_xattr_set(const struct xattr_handler *handler,
if (!gfs2_glock_is_locked_by_me(ip->i_gl)) {
ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
if (ret)
- return ret;
+ goto out;
} else {
- if (WARN_ON_ONCE(ip->i_gl->gl_state != LM_ST_EXCLUSIVE))
- return -EIO;
+ if (WARN_ON_ONCE(ip->i_gl->gl_state != LM_ST_EXCLUSIVE)) {
+ ret = -EIO;
+ goto out;
+ }
gfs2_holder_mark_uninitialized(&gh);
}
ret = __gfs2_xattr_set(inode, name, value, size, flags, handler->flags);
if (gfs2_holder_initialized(&gh))
gfs2_glock_dq_uninit(&gh);
+out:
+ gfs2_qa_put(ip);
return ret;
}
diff --git a/fs/inode.c b/fs/inode.c
index 7d57068b6b7a..93d9252a00ab 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -138,6 +138,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
inode->i_sb = sb;
inode->i_blkbits = sb->s_blocksize_bits;
inode->i_flags = 0;
+ atomic64_set(&inode->i_sequence, 0);
atomic_set(&inode->i_count, 1);
inode->i_op = &empty_iops;
inode->i_fop = &no_open_fops;
diff --git a/fs/internal.h b/fs/internal.h
index b108a8eb75ca..aa5d45524e87 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -38,7 +38,6 @@ static inline int __sync_blockdev(struct block_device *bdev, int wait)
/*
* buffer.c
*/
-extern void guard_bio_eod(struct bio *bio);
extern int __block_write_begin_int(struct page *page, loff_t pos, unsigned len,
get_block_t *get_block, struct iomap *iomap);
diff --git a/fs/io-wq.c b/fs/io-wq.c
index cb60a42b9fdf..cc5cf2209fb0 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -16,6 +16,7 @@
#include <linux/slab.h>
#include <linux/kthread.h>
#include <linux/rculist_nulls.h>
+#include <linux/fs_struct.h>
#include "io-wq.h"
@@ -59,6 +60,7 @@ struct io_worker {
const struct cred *cur_creds;
const struct cred *saved_creds;
struct files_struct *restore_files;
+ struct fs_struct *restore_fs;
};
#if BITS_PER_LONG == 64
@@ -67,6 +69,8 @@ struct io_worker {
#define IO_WQ_HASH_ORDER 5
#endif
+#define IO_WQ_NR_HASH_BUCKETS (1u << IO_WQ_HASH_ORDER)
+
struct io_wqe_acct {
unsigned nr_workers;
unsigned max_workers;
@@ -96,6 +100,7 @@ struct io_wqe {
struct list_head all_list;
struct io_wq *wq;
+ struct io_wq_work *hash_tail[IO_WQ_NR_HASH_BUCKETS];
};
/*
@@ -105,8 +110,7 @@ struct io_wq {
struct io_wqe **wqes;
unsigned long state;
- get_work_fn *get_work;
- put_work_fn *put_work;
+ free_work_fn *free_work;
struct task_struct *manager;
struct user_struct *user;
@@ -151,6 +155,9 @@ static bool __io_worker_unuse(struct io_wqe *wqe, struct io_worker *worker)
task_unlock(current);
}
+ if (current->fs != worker->restore_fs)
+ current->fs = worker->restore_fs;
+
/*
* If we have an active mm, we need to drop the wq lock before unusing
* it. If we do, return true and let the caller retry the idle loop.
@@ -311,6 +318,7 @@ static void io_worker_start(struct io_wqe *wqe, struct io_worker *worker)
worker->flags |= (IO_WORKER_F_UP | IO_WORKER_F_RUNNING);
worker->restore_files = current->files;
+ worker->restore_fs = current->fs;
io_wqe_inc_running(wqe, worker);
}
@@ -370,26 +378,35 @@ static bool __io_worker_idle(struct io_wqe *wqe, struct io_worker *worker)
return __io_worker_unuse(wqe, worker);
}
-static struct io_wq_work *io_get_next_work(struct io_wqe *wqe, unsigned *hash)
+static inline unsigned int io_get_work_hash(struct io_wq_work *work)
+{
+ return work->flags >> IO_WQ_HASH_SHIFT;
+}
+
+static struct io_wq_work *io_get_next_work(struct io_wqe *wqe)
__must_hold(wqe->lock)
{
struct io_wq_work_node *node, *prev;
- struct io_wq_work *work;
+ struct io_wq_work *work, *tail;
+ unsigned int hash;
wq_list_for_each(node, prev, &wqe->work_list) {
work = container_of(node, struct io_wq_work, list);
/* not hashed, can run anytime */
- if (!(work->flags & IO_WQ_WORK_HASHED)) {
- wq_node_del(&wqe->work_list, node, prev);
+ if (!io_wq_is_hashed(work)) {
+ wq_list_del(&wqe->work_list, node, prev);
return work;
}
/* hashed, can run if not already running */
- *hash = work->flags >> IO_WQ_HASH_SHIFT;
- if (!(wqe->hash_map & BIT_ULL(*hash))) {
- wqe->hash_map |= BIT_ULL(*hash);
- wq_node_del(&wqe->work_list, node, prev);
+ hash = io_get_work_hash(work);
+ if (!(wqe->hash_map & BIT(hash))) {
+ wqe->hash_map |= BIT(hash);
+ /* all items with this hash lie in [work, tail] */
+ tail = wqe->hash_tail[hash];
+ wqe->hash_tail[hash] = NULL;
+ wq_list_cut(&wqe->work_list, &tail->list, prev);
return work;
}
}
@@ -434,16 +451,49 @@ static void io_wq_switch_creds(struct io_worker *worker,
worker->saved_creds = old_creds;
}
+static void io_impersonate_work(struct io_worker *worker,
+ struct io_wq_work *work)
+{
+ if (work->files && current->files != work->files) {
+ task_lock(current);
+ current->files = work->files;
+ task_unlock(current);
+ }
+ if (work->fs && current->fs != work->fs)
+ current->fs = work->fs;
+ if (work->mm != worker->mm)
+ io_wq_switch_mm(worker, work);
+ if (worker->cur_creds != work->creds)
+ io_wq_switch_creds(worker, work);
+}
+
+static void io_assign_current_work(struct io_worker *worker,
+ struct io_wq_work *work)
+{
+ if (work) {
+ /* flush pending signals before assigning new work */
+ if (signal_pending(current))
+ flush_signals(current);
+ cond_resched();
+ }
+
+ spin_lock_irq(&worker->lock);
+ worker->cur_work = work;
+ spin_unlock_irq(&worker->lock);
+}
+
+static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work);
+
static void io_worker_handle_work(struct io_worker *worker)
__releases(wqe->lock)
{
- struct io_wq_work *work, *old_work = NULL, *put_work = NULL;
struct io_wqe *wqe = worker->wqe;
struct io_wq *wq = wqe->wq;
do {
- unsigned hash = -1U;
-
+ struct io_wq_work *work;
+ unsigned int hash;
+get_next:
/*
* If we got some work, mark us as busy. If we didn't, but
* the list isn't empty, it means we stalled on hashed work.
@@ -451,118 +501,80 @@ static void io_worker_handle_work(struct io_worker *worker)
* can't make progress, any work completion or insertion will
* clear the stalled flag.
*/
- work = io_get_next_work(wqe, &hash);
+ work = io_get_next_work(wqe);
if (work)
__io_worker_busy(wqe, worker, work);
else if (!wq_list_empty(&wqe->work_list))
wqe->flags |= IO_WQE_FLAG_STALLED;
spin_unlock_irq(&wqe->lock);
- if (put_work && wq->put_work)
- wq->put_work(old_work);
if (!work)
break;
-next:
- /* flush any pending signals before assigning new work */
- if (signal_pending(current))
- flush_signals(current);
-
- cond_resched();
-
- spin_lock_irq(&worker->lock);
- worker->cur_work = work;
- spin_unlock_irq(&worker->lock);
-
- if (work->flags & IO_WQ_WORK_CB)
- work->func(&work);
-
- if (work->files && current->files != work->files) {
- task_lock(current);
- current->files = work->files;
- task_unlock(current);
- }
- if (work->mm != worker->mm)
- io_wq_switch_mm(worker, work);
- if (worker->cur_creds != work->creds)
- io_wq_switch_creds(worker, work);
- /*
- * OK to set IO_WQ_WORK_CANCEL even for uncancellable work,
- * the worker function will do the right thing.
- */
- if (test_bit(IO_WQ_BIT_CANCEL, &wq->state))
- work->flags |= IO_WQ_WORK_CANCEL;
- if (worker->mm)
- work->flags |= IO_WQ_WORK_HAS_MM;
-
- if (wq->get_work && !(work->flags & IO_WQ_WORK_INTERNAL)) {
- put_work = work;
- wq->get_work(work);
- }
-
- old_work = work;
- work->func(&work);
-
- spin_lock_irq(&worker->lock);
- worker->cur_work = NULL;
- spin_unlock_irq(&worker->lock);
-
- spin_lock_irq(&wqe->lock);
-
- if (hash != -1U) {
- wqe->hash_map &= ~BIT_ULL(hash);
- wqe->flags &= ~IO_WQE_FLAG_STALLED;
- }
- if (work && work != old_work) {
- spin_unlock_irq(&wqe->lock);
-
- if (put_work && wq->put_work) {
- wq->put_work(put_work);
- put_work = NULL;
+ io_assign_current_work(worker, work);
+
+ /* handle a whole dependent link */
+ do {
+ struct io_wq_work *old_work, *next_hashed, *linked;
+
+ next_hashed = wq_next_work(work);
+ io_impersonate_work(worker, work);
+ /*
+ * OK to set IO_WQ_WORK_CANCEL even for uncancellable
+ * work, the worker function will do the right thing.
+ */
+ if (test_bit(IO_WQ_BIT_CANCEL, &wq->state))
+ work->flags |= IO_WQ_WORK_CANCEL;
+
+ hash = io_get_work_hash(work);
+ linked = old_work = work;
+ linked->func(&linked);
+ linked = (old_work == linked) ? NULL : linked;
+
+ work = next_hashed;
+ if (!work && linked && !io_wq_is_hashed(linked)) {
+ work = linked;
+ linked = NULL;
+ }
+ io_assign_current_work(worker, work);
+ wq->free_work(old_work);
+
+ if (linked)
+ io_wqe_enqueue(wqe, linked);
+
+ if (hash != -1U && !next_hashed) {
+ spin_lock_irq(&wqe->lock);
+ wqe->hash_map &= ~BIT_ULL(hash);
+ wqe->flags &= ~IO_WQE_FLAG_STALLED;
+ /* dependent work is not hashed */
+ hash = -1U;
+ /* skip unnecessary unlock-lock wqe->lock */
+ if (!work)
+ goto get_next;
+ spin_unlock_irq(&wqe->lock);
}
+ } while (work);
- /* dependent work not hashed */
- hash = -1U;
- goto next;
- }
+ spin_lock_irq(&wqe->lock);
} while (1);
}
-static inline void io_worker_spin_for_work(struct io_wqe *wqe)
-{
- int i = 0;
-
- while (++i < 1000) {
- if (io_wqe_run_queue(wqe))
- break;
- if (need_resched())
- break;
- cpu_relax();
- }
-}
-
static int io_wqe_worker(void *data)
{
struct io_worker *worker = data;
struct io_wqe *wqe = worker->wqe;
struct io_wq *wq = wqe->wq;
- bool did_work;
io_worker_start(wqe, worker);
- did_work = false;
while (!test_bit(IO_WQ_BIT_EXIT, &wq->state)) {
set_current_state(TASK_INTERRUPTIBLE);
loop:
- if (did_work)
- io_worker_spin_for_work(wqe);
spin_lock_irq(&wqe->lock);
if (io_wqe_run_queue(wqe)) {
__set_current_state(TASK_RUNNING);
io_worker_handle_work(worker);
- did_work = true;
goto loop;
}
- did_work = false;
/* drops the lock on success, retry */
if (__io_worker_idle(wqe, worker)) {
__release(&wqe->lock);
@@ -691,11 +703,16 @@ static int io_wq_manager(void *data)
/* create fixed workers */
refcount_set(&wq->refs, workers_to_create);
for_each_node(node) {
+ if (!node_online(node))
+ continue;
if (!create_io_worker(wq, wq->wqes[node], IO_WQ_ACCT_BOUND))
goto err;
workers_to_create--;
}
+ while (workers_to_create--)
+ refcount_dec(&wq->refs);
+
complete(&wq->done);
while (!kthread_should_stop()) {
@@ -703,6 +720,9 @@ static int io_wq_manager(void *data)
struct io_wqe *wqe = wq->wqes[node];
bool fork_worker[2] = { false, false };
+ if (!node_online(node))
+ continue;
+
spin_lock_irq(&wqe->lock);
if (io_wqe_need_worker(wqe, IO_WQ_ACCT_BOUND))
fork_worker[IO_WQ_ACCT_BOUND] = true;
@@ -750,6 +770,40 @@ static bool io_wq_can_queue(struct io_wqe *wqe, struct io_wqe_acct *acct,
return true;
}
+static void io_run_cancel(struct io_wq_work *work, struct io_wqe *wqe)
+{
+ struct io_wq *wq = wqe->wq;
+
+ do {
+ struct io_wq_work *old_work = work;
+
+ work->flags |= IO_WQ_WORK_CANCEL;
+ work->func(&work);
+ work = (work == old_work) ? NULL : work;
+ wq->free_work(old_work);
+ } while (work);
+}
+
+static void io_wqe_insert_work(struct io_wqe *wqe, struct io_wq_work *work)
+{
+ unsigned int hash;
+ struct io_wq_work *tail;
+
+ if (!io_wq_is_hashed(work)) {
+append:
+ wq_list_add_tail(&work->list, &wqe->work_list);
+ return;
+ }
+
+ hash = io_get_work_hash(work);
+ tail = wqe->hash_tail[hash];
+ wqe->hash_tail[hash] = work;
+ if (!tail)
+ goto append;
+
+ wq_list_add_after(&work->list, &tail->list, &wqe->work_list);
+}
+
static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work)
{
struct io_wqe_acct *acct = io_work_get_acct(wqe, work);
@@ -763,14 +817,13 @@ static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work)
* It's close enough to not be an issue, fork() has the same delay.
*/
if (unlikely(!io_wq_can_queue(wqe, acct, work))) {
- work->flags |= IO_WQ_WORK_CANCEL;
- work->func(&work);
+ io_run_cancel(work, wqe);
return;
}
work_flags = work->flags;
spin_lock_irqsave(&wqe->lock, flags);
- wq_list_add_tail(&work->list, &wqe->work_list);
+ io_wqe_insert_work(wqe, work);
wqe->flags &= ~IO_WQE_FLAG_STALLED;
spin_unlock_irqrestore(&wqe->lock, flags);
@@ -787,19 +840,15 @@ void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work)
}
/*
- * Enqueue work, hashed by some key. Work items that hash to the same value
- * will not be done in parallel. Used to limit concurrent writes, generally
- * hashed by inode.
+ * Work items that hash to the same value will not be done in parallel.
+ * Used to limit concurrent writes, generally hashed by inode.
*/
-void io_wq_enqueue_hashed(struct io_wq *wq, struct io_wq_work *work, void *val)
+void io_wq_hash_work(struct io_wq_work *work, void *val)
{
- struct io_wqe *wqe = wq->wqes[numa_node_id()];
- unsigned bit;
-
+ unsigned int bit;
bit = hash_ptr(val, IO_WQ_HASH_ORDER);
work->flags |= (IO_WQ_WORK_HASHED | (bit << IO_WQ_HASH_SHIFT));
- io_wqe_enqueue(wqe, work);
}
static bool io_wqe_worker_send_sig(struct io_worker *worker, void *data)
@@ -821,7 +870,9 @@ static bool io_wq_for_each_worker(struct io_wqe *wqe,
list_for_each_entry_rcu(worker, &wqe->all_list, all_list) {
if (io_worker_get(worker)) {
- ret = func(worker, data);
+ /* no task if node is/was offline */
+ if (worker->task)
+ ret = func(worker, data);
io_worker_release(worker);
if (ret)
break;
@@ -847,14 +898,13 @@ void io_wq_cancel_all(struct io_wq *wq)
}
struct io_cb_cancel_data {
- struct io_wqe *wqe;
- work_cancel_fn *cancel;
- void *caller_data;
+ work_cancel_fn *fn;
+ void *data;
};
-static bool io_work_cancel(struct io_worker *worker, void *cancel_data)
+static bool io_wq_worker_cancel(struct io_worker *worker, void *data)
{
- struct io_cb_cancel_data *data = cancel_data;
+ struct io_cb_cancel_data *match = data;
unsigned long flags;
bool ret = false;
@@ -865,82 +915,7 @@ static bool io_work_cancel(struct io_worker *worker, void *cancel_data)
spin_lock_irqsave(&worker->lock, flags);
if (worker->cur_work &&
!(worker->cur_work->flags & IO_WQ_WORK_NO_CANCEL) &&
- data->cancel(worker->cur_work, data->caller_data)) {
- send_sig(SIGINT, worker->task, 1);
- ret = true;
- }
- spin_unlock_irqrestore(&worker->lock, flags);
-
- return ret;
-}
-
-static enum io_wq_cancel io_wqe_cancel_cb_work(struct io_wqe *wqe,
- work_cancel_fn *cancel,
- void *cancel_data)
-{
- struct io_cb_cancel_data data = {
- .wqe = wqe,
- .cancel = cancel,
- .caller_data = cancel_data,
- };
- struct io_wq_work_node *node, *prev;
- struct io_wq_work *work;
- unsigned long flags;
- bool found = false;
-
- spin_lock_irqsave(&wqe->lock, flags);
- wq_list_for_each(node, prev, &wqe->work_list) {
- work = container_of(node, struct io_wq_work, list);
-
- if (cancel(work, cancel_data)) {
- wq_node_del(&wqe->work_list, node, prev);
- found = true;
- break;
- }
- }
- spin_unlock_irqrestore(&wqe->lock, flags);
-
- if (found) {
- work->flags |= IO_WQ_WORK_CANCEL;
- work->func(&work);
- return IO_WQ_CANCEL_OK;
- }
-
- rcu_read_lock();
- found = io_wq_for_each_worker(wqe, io_work_cancel, &data);
- rcu_read_unlock();
- return found ? IO_WQ_CANCEL_RUNNING : IO_WQ_CANCEL_NOTFOUND;
-}
-
-enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel,
- void *data)
-{
- enum io_wq_cancel ret = IO_WQ_CANCEL_NOTFOUND;
- int node;
-
- for_each_node(node) {
- struct io_wqe *wqe = wq->wqes[node];
-
- ret = io_wqe_cancel_cb_work(wqe, cancel, data);
- if (ret != IO_WQ_CANCEL_NOTFOUND)
- break;
- }
-
- return ret;
-}
-
-static bool io_wq_worker_cancel(struct io_worker *worker, void *data)
-{
- struct io_wq_work *work = data;
- unsigned long flags;
- bool ret = false;
-
- if (worker->cur_work != work)
- return false;
-
- spin_lock_irqsave(&worker->lock, flags);
- if (worker->cur_work == work &&
- !(worker->cur_work->flags & IO_WQ_WORK_NO_CANCEL)) {
+ match->fn(worker->cur_work, match->data)) {
send_sig(SIGINT, worker->task, 1);
ret = true;
}
@@ -950,15 +925,13 @@ static bool io_wq_worker_cancel(struct io_worker *worker, void *data)
}
static enum io_wq_cancel io_wqe_cancel_work(struct io_wqe *wqe,
- struct io_wq_work *cwork)
+ struct io_cb_cancel_data *match)
{
struct io_wq_work_node *node, *prev;
struct io_wq_work *work;
unsigned long flags;
bool found = false;
- cwork->flags |= IO_WQ_WORK_CANCEL;
-
/*
* First check pending list, if we're lucky we can just remove it
* from there. CANCEL_OK means that the work is returned as-new,
@@ -968,8 +941,8 @@ static enum io_wq_cancel io_wqe_cancel_work(struct io_wqe *wqe,
wq_list_for_each(node, prev, &wqe->work_list) {
work = container_of(node, struct io_wq_work, list);
- if (work == cwork) {
- wq_node_del(&wqe->work_list, node, prev);
+ if (match->fn(work, match->data)) {
+ wq_list_del(&wqe->work_list, node, prev);
found = true;
break;
}
@@ -977,8 +950,7 @@ static enum io_wq_cancel io_wqe_cancel_work(struct io_wqe *wqe,
spin_unlock_irqrestore(&wqe->lock, flags);
if (found) {
- work->flags |= IO_WQ_WORK_CANCEL;
- work->func(&work);
+ io_run_cancel(work, wqe);
return IO_WQ_CANCEL_OK;
}
@@ -989,20 +961,25 @@ static enum io_wq_cancel io_wqe_cancel_work(struct io_wqe *wqe,
* completion will run normally in this case.
*/
rcu_read_lock();
- found = io_wq_for_each_worker(wqe, io_wq_worker_cancel, cwork);
+ found = io_wq_for_each_worker(wqe, io_wq_worker_cancel, match);
rcu_read_unlock();
return found ? IO_WQ_CANCEL_RUNNING : IO_WQ_CANCEL_NOTFOUND;
}
-enum io_wq_cancel io_wq_cancel_work(struct io_wq *wq, struct io_wq_work *cwork)
+enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel,
+ void *data)
{
+ struct io_cb_cancel_data match = {
+ .fn = cancel,
+ .data = data,
+ };
enum io_wq_cancel ret = IO_WQ_CANCEL_NOTFOUND;
int node;
for_each_node(node) {
struct io_wqe *wqe = wq->wqes[node];
- ret = io_wqe_cancel_work(wqe, cwork);
+ ret = io_wqe_cancel_work(wqe, &match);
if (ret != IO_WQ_CANCEL_NOTFOUND)
break;
}
@@ -1010,38 +987,28 @@ enum io_wq_cancel io_wq_cancel_work(struct io_wq *wq, struct io_wq_work *cwork)
return ret;
}
-struct io_wq_flush_data {
- struct io_wq_work work;
- struct completion done;
-};
-
-static void io_wq_flush_func(struct io_wq_work **workptr)
+static bool io_wq_io_cb_cancel_data(struct io_wq_work *work, void *data)
{
- struct io_wq_work *work = *workptr;
- struct io_wq_flush_data *data;
+ return work == data;
+}
- data = container_of(work, struct io_wq_flush_data, work);
- complete(&data->done);
+enum io_wq_cancel io_wq_cancel_work(struct io_wq *wq, struct io_wq_work *cwork)
+{
+ return io_wq_cancel_cb(wq, io_wq_io_cb_cancel_data, (void *)cwork);
}
-/*
- * Doesn't wait for previously queued work to finish. When this completes,
- * it just means that previously queued work was started.
- */
-void io_wq_flush(struct io_wq *wq)
+static bool io_wq_pid_match(struct io_wq_work *work, void *data)
{
- struct io_wq_flush_data data;
- int node;
+ pid_t pid = (pid_t) (unsigned long) data;
- for_each_node(node) {
- struct io_wqe *wqe = wq->wqes[node];
+ return work->task_pid == pid;
+}
- init_completion(&data.done);
- INIT_IO_WORK(&data.work, io_wq_flush_func);
- data.work.flags |= IO_WQ_WORK_INTERNAL;
- io_wqe_enqueue(wqe, &data.work);
- wait_for_completion(&data.done);
- }
+enum io_wq_cancel io_wq_cancel_pid(struct io_wq *wq, pid_t pid)
+{
+ void *data = (void *) (unsigned long) pid;
+
+ return io_wq_cancel_cb(wq, io_wq_pid_match, data);
}
struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
@@ -1049,6 +1016,9 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
int ret = -ENOMEM, node;
struct io_wq *wq;
+ if (WARN_ON_ONCE(!data->free_work))
+ return ERR_PTR(-EINVAL);
+
wq = kzalloc(sizeof(*wq), GFP_KERNEL);
if (!wq)
return ERR_PTR(-ENOMEM);
@@ -1059,20 +1029,22 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
return ERR_PTR(-ENOMEM);
}
- wq->get_work = data->get_work;
- wq->put_work = data->put_work;
+ wq->free_work = data->free_work;
/* caller must already hold a reference to this */
wq->user = data->user;
for_each_node(node) {
struct io_wqe *wqe;
+ int alloc_node = node;
- wqe = kzalloc_node(sizeof(struct io_wqe), GFP_KERNEL, node);
+ if (!node_online(alloc_node))
+ alloc_node = NUMA_NO_NODE;
+ wqe = kzalloc_node(sizeof(struct io_wqe), GFP_KERNEL, alloc_node);
if (!wqe)
goto err;
wq->wqes[node] = wqe;
- wqe->node = node;
+ wqe->node = alloc_node;
wqe->acct[IO_WQ_ACCT_BOUND].max_workers = bounded;
atomic_set(&wqe->acct[IO_WQ_ACCT_BOUND].nr_running, 0);
if (wq->user) {
@@ -1080,7 +1052,6 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
task_rlimit(current, RLIMIT_NPROC);
}
atomic_set(&wqe->acct[IO_WQ_ACCT_UNBOUND].nr_running, 0);
- wqe->node = node;
wqe->wq = wq;
spin_lock_init(&wqe->lock);
INIT_WQ_LIST(&wqe->work_list);
@@ -1115,7 +1086,7 @@ err:
bool io_wq_get(struct io_wq *wq, struct io_wq_data *data)
{
- if (data->get_work != wq->get_work || data->put_work != wq->put_work)
+ if (data->free_work != wq->free_work)
return false;
return refcount_inc_not_zero(&wq->use_refs);
diff --git a/fs/io-wq.h b/fs/io-wq.h
index 50b3378febf2..3ee7356d6be5 100644
--- a/fs/io-wq.h
+++ b/fs/io-wq.h
@@ -5,11 +5,8 @@ struct io_wq;
enum {
IO_WQ_WORK_CANCEL = 1,
- IO_WQ_WORK_HAS_MM = 2,
IO_WQ_WORK_HASHED = 4,
IO_WQ_WORK_UNBOUND = 32,
- IO_WQ_WORK_INTERNAL = 64,
- IO_WQ_WORK_CB = 128,
IO_WQ_WORK_NO_CANCEL = 256,
IO_WQ_WORK_CONCURRENT = 512,
@@ -31,6 +28,18 @@ struct io_wq_work_list {
struct io_wq_work_node *last;
};
+static inline void wq_list_add_after(struct io_wq_work_node *node,
+ struct io_wq_work_node *pos,
+ struct io_wq_work_list *list)
+{
+ struct io_wq_work_node *next = pos->next;
+
+ pos->next = node;
+ node->next = next;
+ if (!next)
+ list->last = node;
+}
+
static inline void wq_list_add_tail(struct io_wq_work_node *node,
struct io_wq_work_list *list)
{
@@ -43,17 +52,26 @@ static inline void wq_list_add_tail(struct io_wq_work_node *node,
}
}
-static inline void wq_node_del(struct io_wq_work_list *list,
- struct io_wq_work_node *node,
+static inline void wq_list_cut(struct io_wq_work_list *list,
+ struct io_wq_work_node *last,
struct io_wq_work_node *prev)
{
- if (node == list->first)
- WRITE_ONCE(list->first, node->next);
- if (node == list->last)
+ /* first in the list, if prev==NULL */
+ if (!prev)
+ WRITE_ONCE(list->first, last->next);
+ else
+ prev->next = last->next;
+
+ if (last == list->last)
list->last = prev;
- if (prev)
- prev->next = node->next;
- node->next = NULL;
+ last->next = NULL;
+}
+
+static inline void wq_list_del(struct io_wq_work_list *list,
+ struct io_wq_work_node *node,
+ struct io_wq_work_node *prev)
+{
+ wq_list_cut(list, node, prev);
}
#define wq_list_for_each(pos, prv, head) \
@@ -66,35 +84,35 @@ static inline void wq_node_del(struct io_wq_work_list *list,
} while (0)
struct io_wq_work {
- union {
- struct io_wq_work_node list;
- void *data;
- };
+ struct io_wq_work_node list;
void (*func)(struct io_wq_work **);
struct files_struct *files;
struct mm_struct *mm;
const struct cred *creds;
+ struct fs_struct *fs;
unsigned flags;
+ pid_t task_pid;
};
-#define INIT_IO_WORK(work, _func) \
- do { \
- (work)->list.next = NULL; \
- (work)->func = _func; \
- (work)->flags = 0; \
- (work)->files = NULL; \
- (work)->mm = NULL; \
- (work)->creds = NULL; \
- } while (0) \
+#define INIT_IO_WORK(work, _func) \
+ do { \
+ *(work) = (struct io_wq_work){ .func = _func }; \
+ } while (0) \
-typedef void (get_work_fn)(struct io_wq_work *);
-typedef void (put_work_fn)(struct io_wq_work *);
+static inline struct io_wq_work *wq_next_work(struct io_wq_work *work)
+{
+ if (!work->list.next)
+ return NULL;
+
+ return container_of(work->list.next, struct io_wq_work, list);
+}
+
+typedef void (free_work_fn)(struct io_wq_work *);
struct io_wq_data {
struct user_struct *user;
- get_work_fn *get_work;
- put_work_fn *put_work;
+ free_work_fn *free_work;
};
struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data);
@@ -102,11 +120,16 @@ bool io_wq_get(struct io_wq *wq, struct io_wq_data *data);
void io_wq_destroy(struct io_wq *wq);
void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work);
-void io_wq_enqueue_hashed(struct io_wq *wq, struct io_wq_work *work, void *val);
-void io_wq_flush(struct io_wq *wq);
+void io_wq_hash_work(struct io_wq_work *work, void *val);
+
+static inline bool io_wq_is_hashed(struct io_wq_work *work)
+{
+ return work->flags & IO_WQ_WORK_HASHED;
+}
void io_wq_cancel_all(struct io_wq *wq);
enum io_wq_cancel io_wq_cancel_work(struct io_wq *wq, struct io_wq_work *cwork);
+enum io_wq_cancel io_wq_cancel_pid(struct io_wq *wq, pid_t pid);
typedef bool (work_cancel_fn)(struct io_wq_work *, void *);
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 77f22c3da30f..358f97be9c7b 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -44,6 +44,7 @@
#include <linux/errno.h>
#include <linux/syscalls.h>
#include <linux/compat.h>
+#include <net/compat.h>
#include <linux/refcount.h>
#include <linux/uio.h>
#include <linux/bits.h>
@@ -75,6 +76,9 @@
#include <linux/fsnotify.h>
#include <linux/fadvise.h>
#include <linux/eventpoll.h>
+#include <linux/fs_struct.h>
+#include <linux/splice.h>
+#include <linux/task_work.h>
#define CREATE_TRACE_POINTS
#include <trace/events/io_uring.h>
@@ -182,21 +186,23 @@ struct fixed_file_table {
struct file **files;
};
-enum {
- FFD_F_ATOMIC,
-};
-
struct fixed_file_data {
struct fixed_file_table *table;
struct io_ring_ctx *ctx;
struct percpu_ref refs;
struct llist_head put_llist;
- unsigned long state;
struct work_struct ref_work;
struct completion done;
};
+struct io_buffer {
+ struct list_head list;
+ __u64 addr;
+ __s32 len;
+ __u16 bid;
+};
+
struct io_ring_ctx {
struct {
struct percpu_ref refs;
@@ -204,11 +210,11 @@ struct io_ring_ctx {
struct {
unsigned int flags;
- int compat: 1;
- int account_mem: 1;
- int cq_overflow_flushed: 1;
- int drain_next: 1;
- int eventfd_async: 1;
+ unsigned int compat: 1;
+ unsigned int account_mem: 1;
+ unsigned int cq_overflow_flushed: 1;
+ unsigned int drain_next: 1;
+ unsigned int eventfd_async: 1;
/*
* Ring buffer of indices into array of io_uring_sqe, which is
@@ -274,6 +280,8 @@ struct io_ring_ctx {
struct socket *ring_sock;
#endif
+ struct idr io_buffer_idr;
+
struct idr personality_idr;
struct {
@@ -294,7 +302,6 @@ struct io_ring_ctx {
struct {
spinlock_t completion_lock;
- struct llist_head poll_llist;
/*
* ->poll_list is protected by the ctx->uring_lock for
@@ -347,6 +354,7 @@ struct io_accept {
struct sockaddr __user *addr;
int __user *addr_len;
int flags;
+ unsigned long nofile;
};
struct io_sync {
@@ -389,7 +397,9 @@ struct io_sr_msg {
void __user *buf;
};
int msg_flags;
+ int bgid;
size_t len;
+ struct io_buffer *kbuf;
};
struct io_open {
@@ -401,6 +411,7 @@ struct io_open {
struct filename *filename;
struct statx __user *buffer;
struct open_how how;
+ unsigned long nofile;
};
struct io_files_update {
@@ -432,6 +443,24 @@ struct io_epoll {
struct epoll_event event;
};
+struct io_splice {
+ struct file *file_out;
+ struct file *file_in;
+ loff_t off_out;
+ loff_t off_in;
+ u64 len;
+ unsigned int flags;
+};
+
+struct io_provide_buf {
+ struct file *file;
+ __u64 addr;
+ __s32 len;
+ __u32 bgid;
+ __u16 nbufs;
+ __u16 bid;
+};
+
struct io_async_connect {
struct sockaddr_storage address;
};
@@ -441,6 +470,7 @@ struct io_async_msghdr {
struct iovec *iov;
struct sockaddr __user *uaddr;
struct msghdr msg;
+ struct sockaddr_storage addr;
};
struct io_async_rw {
@@ -450,17 +480,12 @@ struct io_async_rw {
ssize_t size;
};
-struct io_async_open {
- struct filename *filename;
-};
-
struct io_async_ctx {
union {
struct io_async_rw rw;
struct io_async_msghdr msg;
struct io_async_connect connect;
struct io_timeout_data timeout;
- struct io_async_open open;
};
};
@@ -470,6 +495,7 @@ enum {
REQ_F_LINK_BIT = IOSQE_IO_LINK_BIT,
REQ_F_HARDLINK_BIT = IOSQE_IO_HARDLINK_BIT,
REQ_F_FORCE_ASYNC_BIT = IOSQE_ASYNC_BIT,
+ REQ_F_BUFFER_SELECT_BIT = IOSQE_BUFFER_SELECT_BIT,
REQ_F_LINK_NEXT_BIT,
REQ_F_FAIL_LINK_BIT,
@@ -483,6 +509,13 @@ enum {
REQ_F_MUST_PUNT_BIT,
REQ_F_TIMEOUT_NOSEQ_BIT,
REQ_F_COMP_LOCKED_BIT,
+ REQ_F_NEED_CLEANUP_BIT,
+ REQ_F_OVERFLOW_BIT,
+ REQ_F_POLLED_BIT,
+ REQ_F_BUFFER_SELECTED_BIT,
+
+ /* not a real bit, just to check we're not overflowing the space */
+ __REQ_F_LAST_BIT,
};
enum {
@@ -496,6 +529,8 @@ enum {
REQ_F_HARDLINK = BIT(REQ_F_HARDLINK_BIT),
/* IOSQE_ASYNC */
REQ_F_FORCE_ASYNC = BIT(REQ_F_FORCE_ASYNC_BIT),
+ /* IOSQE_BUFFER_SELECT */
+ REQ_F_BUFFER_SELECT = BIT(REQ_F_BUFFER_SELECT_BIT),
/* already grabbed next link */
REQ_F_LINK_NEXT = BIT(REQ_F_LINK_NEXT_BIT),
@@ -521,6 +556,19 @@ enum {
REQ_F_TIMEOUT_NOSEQ = BIT(REQ_F_TIMEOUT_NOSEQ_BIT),
/* completion under lock */
REQ_F_COMP_LOCKED = BIT(REQ_F_COMP_LOCKED_BIT),
+ /* needs cleanup */
+ REQ_F_NEED_CLEANUP = BIT(REQ_F_NEED_CLEANUP_BIT),
+ /* in overflow list */
+ REQ_F_OVERFLOW = BIT(REQ_F_OVERFLOW_BIT),
+ /* already went through poll handler */
+ REQ_F_POLLED = BIT(REQ_F_POLLED_BIT),
+ /* buffer already selected */
+ REQ_F_BUFFER_SELECTED = BIT(REQ_F_BUFFER_SELECTED_BIT),
+};
+
+struct async_poll {
+ struct io_poll_iocb poll;
+ struct io_wq_work work;
};
/*
@@ -546,33 +594,45 @@ struct io_kiocb {
struct io_fadvise fadvise;
struct io_madvise madvise;
struct io_epoll epoll;
+ struct io_splice splice;
+ struct io_provide_buf pbuf;
};
struct io_async_ctx *io;
- /*
- * llist_node is only used for poll deferred completions
- */
- struct llist_node llist_node;
- bool has_user;
- bool in_async;
bool needs_fixed_file;
u8 opcode;
struct io_ring_ctx *ctx;
- union {
- struct list_head list;
- struct hlist_node hash_node;
- };
- struct list_head link_list;
+ struct list_head list;
unsigned int flags;
refcount_t refs;
+ union {
+ struct task_struct *task;
+ unsigned long fsize;
+ };
u64 user_data;
u32 result;
u32 sequence;
+ struct list_head link_list;
+
struct list_head inflight_entry;
- struct io_wq_work work;
+ union {
+ /*
+ * Only commands that never go async can use the below fields,
+ * obviously. Right now only IORING_OP_POLL_ADD uses them, and
+ * async armed poll handlers for regular commands. The latter
+ * restore the work, if needed.
+ */
+ struct {
+ struct callback_head task_work;
+ struct hlist_node hash_node;
+ struct async_poll *apoll;
+ int cflags;
+ };
+ struct io_wq_work work;
+ };
};
#define IO_PLUG_THRESHOLD 2
@@ -614,6 +674,13 @@ struct io_op_def {
unsigned not_supported : 1;
/* needs file table */
unsigned file_table : 1;
+ /* needs ->fs */
+ unsigned needs_fs : 1;
+ /* set if opcode supports polled "wait" */
+ unsigned pollin : 1;
+ unsigned pollout : 1;
+ /* op supports buffer selection */
+ unsigned buffer_select : 1;
};
static const struct io_op_def io_op_defs[] = {
@@ -623,6 +690,8 @@ static const struct io_op_def io_op_defs[] = {
.needs_mm = 1,
.needs_file = 1,
.unbound_nonreg_file = 1,
+ .pollin = 1,
+ .buffer_select = 1,
},
[IORING_OP_WRITEV] = {
.async_ctx = 1,
@@ -630,6 +699,7 @@ static const struct io_op_def io_op_defs[] = {
.needs_file = 1,
.hash_reg_file = 1,
.unbound_nonreg_file = 1,
+ .pollout = 1,
},
[IORING_OP_FSYNC] = {
.needs_file = 1,
@@ -637,11 +707,13 @@ static const struct io_op_def io_op_defs[] = {
[IORING_OP_READ_FIXED] = {
.needs_file = 1,
.unbound_nonreg_file = 1,
+ .pollin = 1,
},
[IORING_OP_WRITE_FIXED] = {
.needs_file = 1,
.hash_reg_file = 1,
.unbound_nonreg_file = 1,
+ .pollout = 1,
},
[IORING_OP_POLL_ADD] = {
.needs_file = 1,
@@ -656,12 +728,17 @@ static const struct io_op_def io_op_defs[] = {
.needs_mm = 1,
.needs_file = 1,
.unbound_nonreg_file = 1,
+ .needs_fs = 1,
+ .pollout = 1,
},
[IORING_OP_RECVMSG] = {
.async_ctx = 1,
.needs_mm = 1,
.needs_file = 1,
.unbound_nonreg_file = 1,
+ .needs_fs = 1,
+ .pollin = 1,
+ .buffer_select = 1,
},
[IORING_OP_TIMEOUT] = {
.async_ctx = 1,
@@ -673,6 +750,7 @@ static const struct io_op_def io_op_defs[] = {
.needs_file = 1,
.unbound_nonreg_file = 1,
.file_table = 1,
+ .pollin = 1,
},
[IORING_OP_ASYNC_CANCEL] = {},
[IORING_OP_LINK_TIMEOUT] = {
@@ -684,6 +762,7 @@ static const struct io_op_def io_op_defs[] = {
.needs_mm = 1,
.needs_file = 1,
.unbound_nonreg_file = 1,
+ .pollout = 1,
},
[IORING_OP_FALLOCATE] = {
.needs_file = 1,
@@ -692,6 +771,7 @@ static const struct io_op_def io_op_defs[] = {
.needs_file = 1,
.fd_non_neg = 1,
.file_table = 1,
+ .needs_fs = 1,
},
[IORING_OP_CLOSE] = {
.needs_file = 1,
@@ -705,16 +785,20 @@ static const struct io_op_def io_op_defs[] = {
.needs_mm = 1,
.needs_file = 1,
.fd_non_neg = 1,
+ .needs_fs = 1,
},
[IORING_OP_READ] = {
.needs_mm = 1,
.needs_file = 1,
.unbound_nonreg_file = 1,
+ .pollin = 1,
+ .buffer_select = 1,
},
[IORING_OP_WRITE] = {
.needs_mm = 1,
.needs_file = 1,
.unbound_nonreg_file = 1,
+ .pollout = 1,
},
[IORING_OP_FADVISE] = {
.needs_file = 1,
@@ -726,21 +810,32 @@ static const struct io_op_def io_op_defs[] = {
.needs_mm = 1,
.needs_file = 1,
.unbound_nonreg_file = 1,
+ .pollout = 1,
},
[IORING_OP_RECV] = {
.needs_mm = 1,
.needs_file = 1,
.unbound_nonreg_file = 1,
+ .pollin = 1,
+ .buffer_select = 1,
},
[IORING_OP_OPENAT2] = {
.needs_file = 1,
.fd_non_neg = 1,
.file_table = 1,
+ .needs_fs = 1,
},
[IORING_OP_EPOLL_CTL] = {
.unbound_nonreg_file = 1,
.file_table = 1,
},
+ [IORING_OP_SPLICE] = {
+ .needs_file = 1,
+ .hash_reg_file = 1,
+ .unbound_nonreg_file = 1,
+ },
+ [IORING_OP_PROVIDE_BUFFERS] = {},
+ [IORING_OP_REMOVE_BUFFERS] = {},
};
static void io_wq_submit_work(struct io_wq_work **workptr);
@@ -754,6 +849,11 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
unsigned nr_args);
static int io_grab_files(struct io_kiocb *req);
static void io_ring_file_ref_flush(struct fixed_file_data *data);
+static void io_cleanup_req(struct io_kiocb *req);
+static int io_file_get(struct io_submit_state *state, struct io_kiocb *req,
+ int fd, struct file **out_file, bool fixed);
+static void __io_queue_sqe(struct io_kiocb *req,
+ const struct io_uring_sqe *sqe);
static struct kmem_cache *req_cachep;
@@ -820,11 +920,11 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
INIT_LIST_HEAD(&ctx->cq_overflow_list);
init_completion(&ctx->completions[0]);
init_completion(&ctx->completions[1]);
+ idr_init(&ctx->io_buffer_idr);
idr_init(&ctx->personality_idr);
mutex_init(&ctx->uring_lock);
init_waitqueue_head(&ctx->wait);
spin_lock_init(&ctx->completion_lock);
- init_llist_head(&ctx->poll_llist);
INIT_LIST_HEAD(&ctx->poll_list);
INIT_LIST_HEAD(&ctx->defer_list);
INIT_LIST_HEAD(&ctx->timeout_list);
@@ -909,6 +1009,18 @@ static inline void io_req_work_grab_env(struct io_kiocb *req,
}
if (!req->work.creds)
req->work.creds = get_current_cred();
+ if (!req->work.fs && def->needs_fs) {
+ spin_lock(&current->fs->lock);
+ if (!current->fs->in_exec) {
+ req->work.fs = current->fs;
+ req->work.fs->users++;
+ } else {
+ req->work.flags |= IO_WQ_WORK_CANCEL;
+ }
+ spin_unlock(&current->fs->lock);
+ }
+ if (!req->work.task_pid)
+ req->work.task_pid = task_pid_vnr(current);
}
static inline void io_req_work_drop_env(struct io_kiocb *req)
@@ -921,17 +1033,26 @@ static inline void io_req_work_drop_env(struct io_kiocb *req)
put_cred(req->work.creds);
req->work.creds = NULL;
}
+ if (req->work.fs) {
+ struct fs_struct *fs = req->work.fs;
+
+ spin_lock(&req->work.fs->lock);
+ if (--fs->users)
+ fs = NULL;
+ spin_unlock(&req->work.fs->lock);
+ if (fs)
+ free_fs_struct(fs);
+ }
}
-static inline bool io_prep_async_work(struct io_kiocb *req,
+static inline void io_prep_async_work(struct io_kiocb *req,
struct io_kiocb **link)
{
const struct io_op_def *def = &io_op_defs[req->opcode];
- bool do_hashed = false;
if (req->flags & REQ_F_ISREG) {
if (def->hash_reg_file)
- do_hashed = true;
+ io_wq_hash_work(&req->work, file_inode(req->file));
} else {
if (def->unbound_nonreg_file)
req->work.flags |= IO_WQ_WORK_UNBOUND;
@@ -940,25 +1061,18 @@ static inline bool io_prep_async_work(struct io_kiocb *req,
io_req_work_grab_env(req, def);
*link = io_prep_linked_timeout(req);
- return do_hashed;
}
static inline void io_queue_async_work(struct io_kiocb *req)
{
struct io_ring_ctx *ctx = req->ctx;
struct io_kiocb *link;
- bool do_hashed;
- do_hashed = io_prep_async_work(req, &link);
+ io_prep_async_work(req, &link);
- trace_io_uring_queue_async_work(ctx, do_hashed, req, &req->work,
- req->flags);
- if (!do_hashed) {
- io_wq_enqueue(ctx->io_wq, &req->work);
- } else {
- io_wq_enqueue_hashed(ctx->io_wq, &req->work,
- file_inode(req->file));
- }
+ trace_io_uring_queue_async_work(ctx, io_wq_is_hashed(&req->work), req,
+ &req->work, req->flags);
+ io_wq_enqueue(ctx->io_wq, &req->work);
if (link)
io_queue_linked_timeout(link);
@@ -972,6 +1086,7 @@ static void io_kill_timeout(struct io_kiocb *req)
if (ret != -1) {
atomic_inc(&req->ctx->cq_timeouts);
list_del_init(&req->list);
+ req->flags |= REQ_F_COMP_LOCKED;
io_cqring_fill_event(req, 0);
io_put_req(req);
}
@@ -1024,24 +1139,19 @@ static inline bool io_should_trigger_evfd(struct io_ring_ctx *ctx)
return false;
if (!ctx->eventfd_async)
return true;
- return io_wq_current_is_worker() || in_interrupt();
+ return io_wq_current_is_worker();
}
-static void __io_cqring_ev_posted(struct io_ring_ctx *ctx, bool trigger_ev)
+static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
{
if (waitqueue_active(&ctx->wait))
wake_up(&ctx->wait);
if (waitqueue_active(&ctx->sqo_wait))
wake_up(&ctx->sqo_wait);
- if (trigger_ev)
+ if (io_should_trigger_evfd(ctx))
eventfd_signal(ctx->cq_ev_fd, 1);
}
-static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
-{
- __io_cqring_ev_posted(ctx, io_should_trigger_evfd(ctx));
-}
-
/* Returns true if there are no backlogged entries after the flush */
static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
{
@@ -1074,10 +1184,11 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
req = list_first_entry(&ctx->cq_overflow_list, struct io_kiocb,
list);
list_move(&req->list, &list);
+ req->flags &= ~REQ_F_OVERFLOW;
if (cqe) {
WRITE_ONCE(cqe->user_data, req->user_data);
WRITE_ONCE(cqe->res, req->result);
- WRITE_ONCE(cqe->flags, 0);
+ WRITE_ONCE(cqe->flags, req->cflags);
} else {
WRITE_ONCE(ctx->rings->cq_overflow,
atomic_inc_return(&ctx->cached_cq_overflow));
@@ -1101,7 +1212,7 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
return cqe != NULL;
}
-static void io_cqring_fill_event(struct io_kiocb *req, long res)
+static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags)
{
struct io_ring_ctx *ctx = req->ctx;
struct io_uring_cqe *cqe;
@@ -1117,7 +1228,7 @@ static void io_cqring_fill_event(struct io_kiocb *req, long res)
if (likely(cqe)) {
WRITE_ONCE(cqe->user_data, req->user_data);
WRITE_ONCE(cqe->res, res);
- WRITE_ONCE(cqe->flags, 0);
+ WRITE_ONCE(cqe->flags, cflags);
} else if (ctx->cq_overflow_flushed) {
WRITE_ONCE(ctx->rings->cq_overflow,
atomic_inc_return(&ctx->cached_cq_overflow));
@@ -1126,25 +1237,37 @@ static void io_cqring_fill_event(struct io_kiocb *req, long res)
set_bit(0, &ctx->sq_check_overflow);
set_bit(0, &ctx->cq_check_overflow);
}
+ req->flags |= REQ_F_OVERFLOW;
refcount_inc(&req->refs);
req->result = res;
+ req->cflags = cflags;
list_add_tail(&req->list, &ctx->cq_overflow_list);
}
}
-static void io_cqring_add_event(struct io_kiocb *req, long res)
+static void io_cqring_fill_event(struct io_kiocb *req, long res)
+{
+ __io_cqring_fill_event(req, res, 0);
+}
+
+static void __io_cqring_add_event(struct io_kiocb *req, long res, long cflags)
{
struct io_ring_ctx *ctx = req->ctx;
unsigned long flags;
spin_lock_irqsave(&ctx->completion_lock, flags);
- io_cqring_fill_event(req, res);
+ __io_cqring_fill_event(req, res, cflags);
io_commit_cqring(ctx);
spin_unlock_irqrestore(&ctx->completion_lock, flags);
io_cqring_ev_posted(ctx);
}
+static void io_cqring_add_event(struct io_kiocb *req, long res)
+{
+ __io_cqring_add_event(req, res, 0);
+}
+
static inline bool io_is_fallback_req(struct io_kiocb *req)
{
return req == (struct io_kiocb *)
@@ -1214,6 +1337,15 @@ fallback:
return NULL;
}
+static inline void io_put_file(struct io_kiocb *req, struct file *file,
+ bool fixed)
+{
+ if (fixed)
+ percpu_ref_put(&req->ctx->file_data->refs);
+ else
+ fput(file);
+}
+
static void __io_req_do_free(struct io_kiocb *req)
{
if (likely(!io_is_fallback_req(req)))
@@ -1224,15 +1356,12 @@ static void __io_req_do_free(struct io_kiocb *req)
static void __io_req_aux_free(struct io_kiocb *req)
{
- struct io_ring_ctx *ctx = req->ctx;
+ if (req->flags & REQ_F_NEED_CLEANUP)
+ io_cleanup_req(req);
kfree(req->io);
- if (req->file) {
- if (req->flags & REQ_F_FIXED_FILE)
- percpu_ref_put(&ctx->file_data->refs);
- else
- fput(req->file);
- }
+ if (req->file)
+ io_put_file(req, req->file, (req->flags & REQ_F_FIXED_FILE));
io_req_work_drop_env(req);
}
@@ -1439,6 +1568,30 @@ static void io_free_req(struct io_kiocb *req)
io_queue_async_work(nxt);
}
+static void io_link_work_cb(struct io_wq_work **workptr)
+{
+ struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
+ struct io_kiocb *link;
+
+ link = list_first_entry(&req->link_list, struct io_kiocb, link_list);
+ io_queue_linked_timeout(link);
+ io_wq_submit_work(workptr);
+}
+
+static void io_wq_assign_next(struct io_wq_work **workptr, struct io_kiocb *nxt)
+{
+ struct io_kiocb *link;
+ const struct io_op_def *def = &io_op_defs[nxt->opcode];
+
+ if ((nxt->flags & REQ_F_ISREG) && def->hash_reg_file)
+ io_wq_hash_work(&nxt->work, file_inode(nxt->file));
+
+ *workptr = &nxt->work;
+ link = io_prep_linked_timeout(nxt);
+ if (link)
+ nxt->work.func = io_link_work_cb;
+}
+
/*
* Drop reference to request, return next in chain (if there is one) if this
* was the last reference to this request.
@@ -1446,10 +1599,10 @@ static void io_free_req(struct io_kiocb *req)
__attribute__((nonnull))
static void io_put_req_find_next(struct io_kiocb *req, struct io_kiocb **nxtptr)
{
- io_req_find_next(req, nxtptr);
-
- if (refcount_dec_and_test(&req->refs))
+ if (refcount_dec_and_test(&req->refs)) {
+ io_req_find_next(req, nxtptr);
__io_free_req(req);
+ }
}
static void io_put_req(struct io_kiocb *req)
@@ -1458,6 +1611,26 @@ static void io_put_req(struct io_kiocb *req)
io_free_req(req);
}
+static void io_steal_work(struct io_kiocb *req,
+ struct io_wq_work **workptr)
+{
+ /*
+ * It's in an io-wq worker, so there always should be at least
+ * one reference, which will be dropped in io_put_work() just
+ * after the current handler returns.
+ *
+ * It also means, that if the counter dropped to 1, then there is
+ * no asynchronous users left, so it's safe to steal the next work.
+ */
+ if (refcount_read(&req->refs) == 1) {
+ struct io_kiocb *nxt = NULL;
+
+ io_req_find_next(req, &nxt);
+ if (nxt)
+ io_wq_assign_next(workptr, nxt);
+ }
+}
+
/*
* Must only be used if we don't need to care about links, usually from
* within the completion handling itself.
@@ -1519,6 +1692,19 @@ static inline bool io_req_multi_free(struct req_batch *rb, struct io_kiocb *req)
return true;
}
+static int io_put_kbuf(struct io_kiocb *req)
+{
+ struct io_buffer *kbuf;
+ int cflags;
+
+ kbuf = (struct io_buffer *) (unsigned long) req->rw.addr;
+ cflags = kbuf->bid << IORING_CQE_BUFFER_SHIFT;
+ cflags |= IORING_CQE_F_BUFFER;
+ req->rw.addr = 0;
+ kfree(kbuf);
+ return cflags;
+}
+
/*
* Find and free completed poll iocbs
*/
@@ -1530,10 +1716,15 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
rb.to_free = rb.need_iter = 0;
while (!list_empty(done)) {
+ int cflags = 0;
+
req = list_first_entry(done, struct io_kiocb, list);
list_del(&req->list);
- io_cqring_fill_event(req, req->result);
+ if (req->flags & REQ_F_BUFFER_SELECTED)
+ cflags = io_put_kbuf(req);
+
+ __io_cqring_fill_event(req, req->result, cflags);
(*nr_events)++;
if (refcount_dec_and_test(&req->refs) &&
@@ -1542,6 +1733,8 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
}
io_commit_cqring(ctx);
+ if (ctx->flags & IORING_SETUP_SQPOLL)
+ io_cqring_ev_posted(ctx);
io_free_req_many(ctx, &rb);
}
@@ -1635,11 +1828,17 @@ static void io_iopoll_reap_events(struct io_ring_ctx *ctx)
mutex_unlock(&ctx->uring_lock);
}
-static int __io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events,
- long min)
+static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events,
+ long min)
{
int iters = 0, ret = 0;
+ /*
+ * We disallow the app entering submit/complete with polling, but we
+ * still need to lock the ring to prevent racing with polled issue
+ * that got punted to a workqueue.
+ */
+ mutex_lock(&ctx->uring_lock);
do {
int tmin = 0;
@@ -1675,21 +1874,6 @@ static int __io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events,
ret = 0;
} while (min && !*nr_events && !need_resched());
- return ret;
-}
-
-static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events,
- long min)
-{
- int ret;
-
- /*
- * We disallow the app entering submit/complete with polling, but we
- * still need to lock the ring to prevent racing with polled issue
- * that got punted to a workqueue.
- */
- mutex_lock(&ctx->uring_lock);
- ret = __io_iopoll_check(ctx, nr_events, min);
mutex_unlock(&ctx->uring_lock);
return ret;
}
@@ -1717,13 +1901,16 @@ static inline void req_set_fail_links(struct io_kiocb *req)
static void io_complete_rw_common(struct kiocb *kiocb, long res)
{
struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
+ int cflags = 0;
if (kiocb->ki_flags & IOCB_WRITE)
kiocb_end_write(req);
if (res != req->result)
req_set_fail_links(req);
- io_cqring_add_event(req, res);
+ if (req->flags & REQ_F_BUFFER_SELECTED)
+ cflags = io_put_kbuf(req);
+ __io_cqring_add_event(req, res, cflags);
}
static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
@@ -1734,17 +1921,6 @@ static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
io_put_req(req);
}
-static struct io_kiocb *__io_complete_rw(struct kiocb *kiocb, long res)
-{
- struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
- struct io_kiocb *nxt = NULL;
-
- io_complete_rw_common(kiocb, res);
- io_put_req_find_next(req, &nxt);
-
- return nxt;
-}
-
static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
{
struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
@@ -1793,6 +1969,10 @@ static void io_iopoll_req_issued(struct io_kiocb *req)
list_add(&req->list, &ctx->poll_list);
else
list_add_tail(&req->list, &ctx->poll_list);
+
+ if ((ctx->flags & IORING_SETUP_SQPOLL) &&
+ wq_has_sleeper(&ctx->sqo_wait))
+ wake_up(&ctx->sqo_wait);
}
static void io_file_put(struct io_submit_state *state)
@@ -1811,7 +1991,7 @@ static void io_file_put(struct io_submit_state *state)
* assuming most submissions are for one file, or at least that each file
* has more than one submission.
*/
-static struct file *io_file_get(struct io_submit_state *state, int fd)
+static struct file *__io_file_get(struct io_submit_state *state, int fd)
{
if (!state)
return fget(fd);
@@ -1908,7 +2088,7 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
req->rw.addr = READ_ONCE(sqe->addr);
req->rw.len = READ_ONCE(sqe->len);
- /* we own ->private, reuse it for the buffer index */
+ /* we own ->private, reuse it for the buffer index / buffer ID */
req->rw.kiocb.private = (void *) (unsigned long)
READ_ONCE(sqe->buf_index);
return 0;
@@ -1935,15 +2115,14 @@ static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret)
}
}
-static void kiocb_done(struct kiocb *kiocb, ssize_t ret, struct io_kiocb **nxt,
- bool in_async)
+static void kiocb_done(struct kiocb *kiocb, ssize_t ret)
{
struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
if (req->flags & REQ_F_CUR_POS)
req->file->f_pos = kiocb->ki_pos;
- if (in_async && ret >= 0 && kiocb->ki_complete == io_complete_rw)
- *nxt = __io_complete_rw(kiocb, ret);
+ if (ret >= 0 && kiocb->ki_complete == io_complete_rw)
+ io_complete_rw(kiocb, ret, 0);
else
io_rw_done(kiocb, ret);
}
@@ -2022,11 +2201,147 @@ static ssize_t io_import_fixed(struct io_kiocb *req, int rw,
return len;
}
+static void io_ring_submit_unlock(struct io_ring_ctx *ctx, bool needs_lock)
+{
+ if (needs_lock)
+ mutex_unlock(&ctx->uring_lock);
+}
+
+static void io_ring_submit_lock(struct io_ring_ctx *ctx, bool needs_lock)
+{
+ /*
+ * "Normal" inline submissions always hold the uring_lock, since we
+ * grab it from the system call. Same is true for the SQPOLL offload.
+ * The only exception is when we've detached the request and issue it
+ * from an async worker thread, grab the lock for that case.
+ */
+ if (needs_lock)
+ mutex_lock(&ctx->uring_lock);
+}
+
+static struct io_buffer *io_buffer_select(struct io_kiocb *req, size_t *len,
+ int bgid, struct io_buffer *kbuf,
+ bool needs_lock)
+{
+ struct io_buffer *head;
+
+ if (req->flags & REQ_F_BUFFER_SELECTED)
+ return kbuf;
+
+ io_ring_submit_lock(req->ctx, needs_lock);
+
+ lockdep_assert_held(&req->ctx->uring_lock);
+
+ head = idr_find(&req->ctx->io_buffer_idr, bgid);
+ if (head) {
+ if (!list_empty(&head->list)) {
+ kbuf = list_last_entry(&head->list, struct io_buffer,
+ list);
+ list_del(&kbuf->list);
+ } else {
+ kbuf = head;
+ idr_remove(&req->ctx->io_buffer_idr, bgid);
+ }
+ if (*len > kbuf->len)
+ *len = kbuf->len;
+ } else {
+ kbuf = ERR_PTR(-ENOBUFS);
+ }
+
+ io_ring_submit_unlock(req->ctx, needs_lock);
+
+ return kbuf;
+}
+
+static void __user *io_rw_buffer_select(struct io_kiocb *req, size_t *len,
+ bool needs_lock)
+{
+ struct io_buffer *kbuf;
+ int bgid;
+
+ kbuf = (struct io_buffer *) (unsigned long) req->rw.addr;
+ bgid = (int) (unsigned long) req->rw.kiocb.private;
+ kbuf = io_buffer_select(req, len, bgid, kbuf, needs_lock);
+ if (IS_ERR(kbuf))
+ return kbuf;
+ req->rw.addr = (u64) (unsigned long) kbuf;
+ req->flags |= REQ_F_BUFFER_SELECTED;
+ return u64_to_user_ptr(kbuf->addr);
+}
+
+#ifdef CONFIG_COMPAT
+static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov,
+ bool needs_lock)
+{
+ struct compat_iovec __user *uiov;
+ compat_ssize_t clen;
+ void __user *buf;
+ ssize_t len;
+
+ uiov = u64_to_user_ptr(req->rw.addr);
+ if (!access_ok(uiov, sizeof(*uiov)))
+ return -EFAULT;
+ if (__get_user(clen, &uiov->iov_len))
+ return -EFAULT;
+ if (clen < 0)
+ return -EINVAL;
+
+ len = clen;
+ buf = io_rw_buffer_select(req, &len, needs_lock);
+ if (IS_ERR(buf))
+ return PTR_ERR(buf);
+ iov[0].iov_base = buf;
+ iov[0].iov_len = (compat_size_t) len;
+ return 0;
+}
+#endif
+
+static ssize_t __io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
+ bool needs_lock)
+{
+ struct iovec __user *uiov = u64_to_user_ptr(req->rw.addr);
+ void __user *buf;
+ ssize_t len;
+
+ if (copy_from_user(iov, uiov, sizeof(*uiov)))
+ return -EFAULT;
+
+ len = iov[0].iov_len;
+ if (len < 0)
+ return -EINVAL;
+ buf = io_rw_buffer_select(req, &len, needs_lock);
+ if (IS_ERR(buf))
+ return PTR_ERR(buf);
+ iov[0].iov_base = buf;
+ iov[0].iov_len = len;
+ return 0;
+}
+
+static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
+ bool needs_lock)
+{
+ if (req->flags & REQ_F_BUFFER_SELECTED)
+ return 0;
+ if (!req->rw.len)
+ return 0;
+ else if (req->rw.len > 1)
+ return -EINVAL;
+
+#ifdef CONFIG_COMPAT
+ if (req->ctx->compat)
+ return io_compat_import(req, iov, needs_lock);
+#endif
+
+ return __io_iov_buffer_select(req, iov, needs_lock);
+}
+
static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
- struct iovec **iovec, struct iov_iter *iter)
+ struct iovec **iovec, struct iov_iter *iter,
+ bool needs_lock)
{
void __user *buf = u64_to_user_ptr(req->rw.addr);
size_t sqe_len = req->rw.len;
+ ssize_t ret;
u8 opcode;
opcode = req->opcode;
@@ -2035,15 +2350,23 @@ static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
return io_import_fixed(req, rw, iter);
}
- /* buffer index only valid with fixed read/write */
- if (req->rw.kiocb.private)
+ /* buffer index only valid with fixed read/write, or buffer select */
+ if (req->rw.kiocb.private && !(req->flags & REQ_F_BUFFER_SELECT))
return -EINVAL;
if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE) {
- ssize_t ret;
+ if (req->flags & REQ_F_BUFFER_SELECT) {
+ buf = io_rw_buffer_select(req, &sqe_len, needs_lock);
+ if (IS_ERR(buf)) {
+ *iovec = NULL;
+ return PTR_ERR(buf);
+ }
+ req->rw.len = sqe_len;
+ }
+
ret = import_single_range(rw, buf, sqe_len, *iovec, iter);
*iovec = NULL;
- return ret;
+ return ret < 0 ? ret : sqe_len;
}
if (req->io) {
@@ -2056,8 +2379,15 @@ static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
return iorw->size;
}
- if (!req->has_user)
- return -EFAULT;
+ if (req->flags & REQ_F_BUFFER_SELECT) {
+ ret = io_iov_buffer_select(req, *iovec, needs_lock);
+ if (!ret) {
+ ret = (*iovec)->iov_len;
+ iov_iter_init(iter, rw, *iovec, 1, ret);
+ }
+ *iovec = NULL;
+ return ret;
+ }
#ifdef CONFIG_COMPAT
if (req->ctx->compat)
@@ -2137,26 +2467,23 @@ static void io_req_map_rw(struct io_kiocb *req, ssize_t io_size,
req->io->rw.iov = req->io->rw.fast_iov;
memcpy(req->io->rw.iov, fast_iov,
sizeof(struct iovec) * iter->nr_segs);
+ } else {
+ req->flags |= REQ_F_NEED_CLEANUP;
}
}
-static int io_alloc_async_ctx(struct io_kiocb *req)
+static inline int __io_alloc_async_ctx(struct io_kiocb *req)
{
- if (!io_op_defs[req->opcode].async_ctx)
- return 0;
req->io = kmalloc(sizeof(*req->io), GFP_KERNEL);
return req->io == NULL;
}
-static void io_rw_async(struct io_wq_work **workptr)
+static int io_alloc_async_ctx(struct io_kiocb *req)
{
- struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
- struct iovec *iov = NULL;
+ if (!io_op_defs[req->opcode].async_ctx)
+ return 0;
- if (req->io->rw.iov != req->io->rw.fast_iov)
- iov = req->io->rw.iov;
- io_wq_submit_work(workptr);
- kfree(iov);
+ return __io_alloc_async_ctx(req);
}
static int io_setup_async_rw(struct io_kiocb *req, ssize_t io_size,
@@ -2166,12 +2493,11 @@ static int io_setup_async_rw(struct io_kiocb *req, ssize_t io_size,
if (!io_op_defs[req->opcode].async_ctx)
return 0;
if (!req->io) {
- if (io_alloc_async_ctx(req))
+ if (__io_alloc_async_ctx(req))
return -ENOMEM;
io_req_map_rw(req, io_size, iovec, fast_iov, iter);
}
- req->work.func = io_rw_async;
return 0;
}
@@ -2189,13 +2515,14 @@ static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
if (unlikely(!(req->file->f_mode & FMODE_READ)))
return -EBADF;
- if (!req->io)
+ /* either don't need iovec imported or already have it */
+ if (!req->io || req->flags & REQ_F_NEED_CLEANUP)
return 0;
io = req->io;
io->rw.iov = io->rw.fast_iov;
req->io = NULL;
- ret = io_import_iovec(READ, req, &io->rw.iov, &iter);
+ ret = io_import_iovec(READ, req, &io->rw.iov, &iter, !force_nonblock);
req->io = io;
if (ret < 0)
return ret;
@@ -2204,8 +2531,7 @@ static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
return 0;
}
-static int io_read(struct io_kiocb *req, struct io_kiocb **nxt,
- bool force_nonblock)
+static int io_read(struct io_kiocb *req, bool force_nonblock)
{
struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
struct kiocb *kiocb = &req->rw.kiocb;
@@ -2213,13 +2539,13 @@ static int io_read(struct io_kiocb *req, struct io_kiocb **nxt,
size_t iov_count;
ssize_t io_size, ret;
- ret = io_import_iovec(READ, req, &iovec, &iter);
+ ret = io_import_iovec(READ, req, &iovec, &iter, !force_nonblock);
if (ret < 0)
return ret;
/* Ensure we clear previously set non-block flag */
if (!force_nonblock)
- req->rw.kiocb.ki_flags &= ~IOCB_NOWAIT;
+ kiocb->ki_flags &= ~IOCB_NOWAIT;
req->result = 0;
io_size = ret;
@@ -2230,10 +2556,8 @@ static int io_read(struct io_kiocb *req, struct io_kiocb **nxt,
* If the file doesn't support async, mark it as REQ_F_MUST_PUNT so
* we know to async punt it even if it was opened O_NONBLOCK
*/
- if (force_nonblock && !io_file_supports_async(req->file)) {
- req->flags |= REQ_F_MUST_PUNT;
+ if (force_nonblock && !io_file_supports_async(req->file))
goto copy_iov;
- }
iov_count = iov_iter_count(&iter);
ret = rw_verify_area(READ, req->file, &kiocb->ki_pos, iov_count);
@@ -2247,19 +2571,22 @@ static int io_read(struct io_kiocb *req, struct io_kiocb **nxt,
/* Catch -EAGAIN return for forced non-blocking submission */
if (!force_nonblock || ret2 != -EAGAIN) {
- kiocb_done(kiocb, ret2, nxt, req->in_async);
+ kiocb_done(kiocb, ret2);
} else {
copy_iov:
ret = io_setup_async_rw(req, io_size, iovec,
inline_vecs, &iter);
if (ret)
goto out_free;
+ /* any defer here is final, must blocking retry */
+ if (!(req->flags & REQ_F_NOWAIT))
+ req->flags |= REQ_F_MUST_PUNT;
return -EAGAIN;
}
}
out_free:
- if (!io_wq_current_is_worker())
- kfree(iovec);
+ kfree(iovec);
+ req->flags &= ~REQ_F_NEED_CLEANUP;
return ret;
}
@@ -2277,13 +2604,16 @@ static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
if (unlikely(!(req->file->f_mode & FMODE_WRITE)))
return -EBADF;
- if (!req->io)
+ req->fsize = rlimit(RLIMIT_FSIZE);
+
+ /* either don't need iovec imported or already have it */
+ if (!req->io || req->flags & REQ_F_NEED_CLEANUP)
return 0;
io = req->io;
io->rw.iov = io->rw.fast_iov;
req->io = NULL;
- ret = io_import_iovec(WRITE, req, &io->rw.iov, &iter);
+ ret = io_import_iovec(WRITE, req, &io->rw.iov, &iter, !force_nonblock);
req->io = io;
if (ret < 0)
return ret;
@@ -2292,8 +2622,7 @@ static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
return 0;
}
-static int io_write(struct io_kiocb *req, struct io_kiocb **nxt,
- bool force_nonblock)
+static int io_write(struct io_kiocb *req, bool force_nonblock)
{
struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
struct kiocb *kiocb = &req->rw.kiocb;
@@ -2301,7 +2630,7 @@ static int io_write(struct io_kiocb *req, struct io_kiocb **nxt,
size_t iov_count;
ssize_t ret, io_size;
- ret = io_import_iovec(WRITE, req, &iovec, &iter);
+ ret = io_import_iovec(WRITE, req, &iovec, &iter, !force_nonblock);
if (ret < 0)
return ret;
@@ -2318,10 +2647,8 @@ static int io_write(struct io_kiocb *req, struct io_kiocb **nxt,
* If the file doesn't support async, mark it as REQ_F_MUST_PUNT so
* we know to async punt it even if it was opened O_NONBLOCK
*/
- if (force_nonblock && !io_file_supports_async(req->file)) {
- req->flags |= REQ_F_MUST_PUNT;
+ if (force_nonblock && !io_file_supports_async(req->file))
goto copy_iov;
- }
/* file path doesn't support NOWAIT for non-direct_IO */
if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT) &&
@@ -2348,27 +2675,112 @@ static int io_write(struct io_kiocb *req, struct io_kiocb **nxt,
}
kiocb->ki_flags |= IOCB_WRITE;
+ if (!force_nonblock)
+ current->signal->rlim[RLIMIT_FSIZE].rlim_cur = req->fsize;
+
if (req->file->f_op->write_iter)
ret2 = call_write_iter(req->file, kiocb, &iter);
else
ret2 = loop_rw_iter(WRITE, req->file, kiocb, &iter);
+
+ if (!force_nonblock)
+ current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
+
+ /*
+ * Raw bdev writes will return -EOPNOTSUPP for IOCB_NOWAIT. Just
+ * retry them without IOCB_NOWAIT.
+ */
+ if (ret2 == -EOPNOTSUPP && (kiocb->ki_flags & IOCB_NOWAIT))
+ ret2 = -EAGAIN;
if (!force_nonblock || ret2 != -EAGAIN) {
- kiocb_done(kiocb, ret2, nxt, req->in_async);
+ kiocb_done(kiocb, ret2);
} else {
copy_iov:
ret = io_setup_async_rw(req, io_size, iovec,
inline_vecs, &iter);
if (ret)
goto out_free;
+ /* any defer here is final, must blocking retry */
+ req->flags |= REQ_F_MUST_PUNT;
return -EAGAIN;
}
}
out_free:
- if (!io_wq_current_is_worker())
- kfree(iovec);
+ req->flags &= ~REQ_F_NEED_CLEANUP;
+ kfree(iovec);
return ret;
}
+static int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
+ struct io_splice* sp = &req->splice;
+ unsigned int valid_flags = SPLICE_F_FD_IN_FIXED | SPLICE_F_ALL;
+ int ret;
+
+ if (req->flags & REQ_F_NEED_CLEANUP)
+ return 0;
+
+ sp->file_in = NULL;
+ sp->off_in = READ_ONCE(sqe->splice_off_in);
+ sp->off_out = READ_ONCE(sqe->off);
+ sp->len = READ_ONCE(sqe->len);
+ sp->flags = READ_ONCE(sqe->splice_flags);
+
+ if (unlikely(sp->flags & ~valid_flags))
+ return -EINVAL;
+
+ ret = io_file_get(NULL, req, READ_ONCE(sqe->splice_fd_in), &sp->file_in,
+ (sp->flags & SPLICE_F_FD_IN_FIXED));
+ if (ret)
+ return ret;
+ req->flags |= REQ_F_NEED_CLEANUP;
+
+ if (!S_ISREG(file_inode(sp->file_in)->i_mode))
+ req->work.flags |= IO_WQ_WORK_UNBOUND;
+
+ return 0;
+}
+
+static bool io_splice_punt(struct file *file)
+{
+ if (get_pipe_info(file))
+ return false;
+ if (!io_file_supports_async(file))
+ return true;
+ return !(file->f_mode & O_NONBLOCK);
+}
+
+static int io_splice(struct io_kiocb *req, bool force_nonblock)
+{
+ struct io_splice *sp = &req->splice;
+ struct file *in = sp->file_in;
+ struct file *out = sp->file_out;
+ unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED;
+ loff_t *poff_in, *poff_out;
+ long ret;
+
+ if (force_nonblock) {
+ if (io_splice_punt(in) || io_splice_punt(out))
+ return -EAGAIN;
+ flags |= SPLICE_F_NONBLOCK;
+ }
+
+ poff_in = (sp->off_in == -1) ? NULL : &sp->off_in;
+ poff_out = (sp->off_out == -1) ? NULL : &sp->off_out;
+ ret = do_splice(in, poff_in, out, poff_out, sp->len, flags);
+ if (force_nonblock && ret == -EAGAIN)
+ return -EAGAIN;
+
+ io_put_file(req, in, (sp->flags & SPLICE_F_FD_IN_FIXED));
+ req->flags &= ~REQ_F_NEED_CLEANUP;
+
+ io_cqring_add_event(req, ret);
+ if (ret != sp->len)
+ req_set_fail_links(req);
+ io_put_req(req);
+ return 0;
+}
+
/*
* IORING_OP_NOP just posts a completion event, nothing else.
*/
@@ -2417,82 +2829,63 @@ static bool io_req_cancelled(struct io_kiocb *req)
return false;
}
-static void io_link_work_cb(struct io_wq_work **workptr)
-{
- struct io_wq_work *work = *workptr;
- struct io_kiocb *link = work->data;
-
- io_queue_linked_timeout(link);
- work->func = io_wq_submit_work;
-}
-
-static void io_wq_assign_next(struct io_wq_work **workptr, struct io_kiocb *nxt)
-{
- struct io_kiocb *link;
-
- io_prep_async_work(nxt, &link);
- *workptr = &nxt->work;
- if (link) {
- nxt->work.flags |= IO_WQ_WORK_CB;
- nxt->work.func = io_link_work_cb;
- nxt->work.data = link;
- }
-}
-
-static void io_fsync_finish(struct io_wq_work **workptr)
+static void __io_fsync(struct io_kiocb *req)
{
- struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
loff_t end = req->sync.off + req->sync.len;
- struct io_kiocb *nxt = NULL;
int ret;
- if (io_req_cancelled(req))
- return;
-
ret = vfs_fsync_range(req->file, req->sync.off,
end > 0 ? end : LLONG_MAX,
req->sync.flags & IORING_FSYNC_DATASYNC);
if (ret < 0)
req_set_fail_links(req);
io_cqring_add_event(req, ret);
- io_put_req_find_next(req, &nxt);
- if (nxt)
- io_wq_assign_next(workptr, nxt);
+ io_put_req(req);
}
-static int io_fsync(struct io_kiocb *req, struct io_kiocb **nxt,
- bool force_nonblock)
+static void io_fsync_finish(struct io_wq_work **workptr)
{
- struct io_wq_work *work, *old_work;
+ struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
+ if (io_req_cancelled(req))
+ return;
+ __io_fsync(req);
+ io_steal_work(req, workptr);
+}
+
+static int io_fsync(struct io_kiocb *req, bool force_nonblock)
+{
/* fsync always requires a blocking context */
if (force_nonblock) {
- io_put_req(req);
req->work.func = io_fsync_finish;
return -EAGAIN;
}
-
- work = old_work = &req->work;
- io_fsync_finish(&work);
- if (work && work != old_work)
- *nxt = container_of(work, struct io_kiocb, work);
+ __io_fsync(req);
return 0;
}
-static void io_fallocate_finish(struct io_wq_work **workptr)
+static void __io_fallocate(struct io_kiocb *req)
{
- struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
- struct io_kiocb *nxt = NULL;
int ret;
+ current->signal->rlim[RLIMIT_FSIZE].rlim_cur = req->fsize;
ret = vfs_fallocate(req->file, req->sync.mode, req->sync.off,
req->sync.len);
+ current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
if (ret < 0)
req_set_fail_links(req);
io_cqring_add_event(req, ret);
- io_put_req_find_next(req, &nxt);
- if (nxt)
- io_wq_assign_next(workptr, nxt);
+ io_put_req(req);
+}
+
+static void io_fallocate_finish(struct io_wq_work **workptr)
+{
+ struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
+
+ if (io_req_cancelled(req))
+ return;
+ __io_fallocate(req);
+ io_steal_work(req, workptr);
}
static int io_fallocate_prep(struct io_kiocb *req,
@@ -2504,26 +2897,19 @@ static int io_fallocate_prep(struct io_kiocb *req,
req->sync.off = READ_ONCE(sqe->off);
req->sync.len = READ_ONCE(sqe->addr);
req->sync.mode = READ_ONCE(sqe->len);
+ req->fsize = rlimit(RLIMIT_FSIZE);
return 0;
}
-static int io_fallocate(struct io_kiocb *req, struct io_kiocb **nxt,
- bool force_nonblock)
+static int io_fallocate(struct io_kiocb *req, bool force_nonblock)
{
- struct io_wq_work *work, *old_work;
-
/* fallocate always requiring blocking context */
if (force_nonblock) {
- io_put_req(req);
req->work.func = io_fallocate_finish;
return -EAGAIN;
}
- work = old_work = &req->work;
- io_fallocate_finish(&work);
- if (work && work != old_work)
- *nxt = container_of(work, struct io_kiocb, work);
-
+ __io_fallocate(req);
return 0;
}
@@ -2534,6 +2920,10 @@ static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
if (sqe->ioprio || sqe->buf_index)
return -EINVAL;
+ if (sqe->flags & IOSQE_FIXED_FILE)
+ return -EBADF;
+ if (req->flags & REQ_F_NEED_CLEANUP)
+ return 0;
req->open.dfd = READ_ONCE(sqe->fd);
req->open.how.mode = READ_ONCE(sqe->len);
@@ -2547,6 +2937,8 @@ static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
return ret;
}
+ req->open.nofile = rlimit(RLIMIT_NOFILE);
+ req->flags |= REQ_F_NEED_CLEANUP;
return 0;
}
@@ -2559,6 +2951,10 @@ static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
if (sqe->ioprio || sqe->buf_index)
return -EINVAL;
+ if (sqe->flags & IOSQE_FIXED_FILE)
+ return -EBADF;
+ if (req->flags & REQ_F_NEED_CLEANUP)
+ return 0;
req->open.dfd = READ_ONCE(sqe->fd);
fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
@@ -2583,11 +2979,12 @@ static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
return ret;
}
+ req->open.nofile = rlimit(RLIMIT_NOFILE);
+ req->flags |= REQ_F_NEED_CLEANUP;
return 0;
}
-static int io_openat2(struct io_kiocb *req, struct io_kiocb **nxt,
- bool force_nonblock)
+static int io_openat2(struct io_kiocb *req, bool force_nonblock)
{
struct open_flags op;
struct file *file;
@@ -2600,7 +2997,7 @@ static int io_openat2(struct io_kiocb *req, struct io_kiocb **nxt,
if (ret)
goto err;
- ret = get_unused_fd_flags(req->open.how.flags);
+ ret = __get_unused_fd_flags(req->open.how.flags, req->open.nofile);
if (ret < 0)
goto err;
@@ -2614,18 +3011,175 @@ static int io_openat2(struct io_kiocb *req, struct io_kiocb **nxt,
}
err:
putname(req->open.filename);
+ req->flags &= ~REQ_F_NEED_CLEANUP;
if (ret < 0)
req_set_fail_links(req);
io_cqring_add_event(req, ret);
- io_put_req_find_next(req, nxt);
+ io_put_req(req);
return 0;
}
-static int io_openat(struct io_kiocb *req, struct io_kiocb **nxt,
- bool force_nonblock)
+static int io_openat(struct io_kiocb *req, bool force_nonblock)
{
req->open.how = build_open_how(req->open.how.flags, req->open.how.mode);
- return io_openat2(req, nxt, force_nonblock);
+ return io_openat2(req, force_nonblock);
+}
+
+static int io_remove_buffers_prep(struct io_kiocb *req,
+ const struct io_uring_sqe *sqe)
+{
+ struct io_provide_buf *p = &req->pbuf;
+ u64 tmp;
+
+ if (sqe->ioprio || sqe->rw_flags || sqe->addr || sqe->len || sqe->off)
+ return -EINVAL;
+
+ tmp = READ_ONCE(sqe->fd);
+ if (!tmp || tmp > USHRT_MAX)
+ return -EINVAL;
+
+ memset(p, 0, sizeof(*p));
+ p->nbufs = tmp;
+ p->bgid = READ_ONCE(sqe->buf_group);
+ return 0;
+}
+
+static int __io_remove_buffers(struct io_ring_ctx *ctx, struct io_buffer *buf,
+ int bgid, unsigned nbufs)
+{
+ unsigned i = 0;
+
+ /* shouldn't happen */
+ if (!nbufs)
+ return 0;
+
+ /* the head kbuf is the list itself */
+ while (!list_empty(&buf->list)) {
+ struct io_buffer *nxt;
+
+ nxt = list_first_entry(&buf->list, struct io_buffer, list);
+ list_del(&nxt->list);
+ kfree(nxt);
+ if (++i == nbufs)
+ return i;
+ }
+ i++;
+ kfree(buf);
+ idr_remove(&ctx->io_buffer_idr, bgid);
+
+ return i;
+}
+
+static int io_remove_buffers(struct io_kiocb *req, bool force_nonblock)
+{
+ struct io_provide_buf *p = &req->pbuf;
+ struct io_ring_ctx *ctx = req->ctx;
+ struct io_buffer *head;
+ int ret = 0;
+
+ io_ring_submit_lock(ctx, !force_nonblock);
+
+ lockdep_assert_held(&ctx->uring_lock);
+
+ ret = -ENOENT;
+ head = idr_find(&ctx->io_buffer_idr, p->bgid);
+ if (head)
+ ret = __io_remove_buffers(ctx, head, p->bgid, p->nbufs);
+
+ io_ring_submit_lock(ctx, !force_nonblock);
+ if (ret < 0)
+ req_set_fail_links(req);
+ io_cqring_add_event(req, ret);
+ io_put_req(req);
+ return 0;
+}
+
+static int io_provide_buffers_prep(struct io_kiocb *req,
+ const struct io_uring_sqe *sqe)
+{
+ struct io_provide_buf *p = &req->pbuf;
+ u64 tmp;
+
+ if (sqe->ioprio || sqe->rw_flags)
+ return -EINVAL;
+
+ tmp = READ_ONCE(sqe->fd);
+ if (!tmp || tmp > USHRT_MAX)
+ return -E2BIG;
+ p->nbufs = tmp;
+ p->addr = READ_ONCE(sqe->addr);
+ p->len = READ_ONCE(sqe->len);
+
+ if (!access_ok(u64_to_user_ptr(p->addr), p->len))
+ return -EFAULT;
+
+ p->bgid = READ_ONCE(sqe->buf_group);
+ tmp = READ_ONCE(sqe->off);
+ if (tmp > USHRT_MAX)
+ return -E2BIG;
+ p->bid = tmp;
+ return 0;
+}
+
+static int io_add_buffers(struct io_provide_buf *pbuf, struct io_buffer **head)
+{
+ struct io_buffer *buf;
+ u64 addr = pbuf->addr;
+ int i, bid = pbuf->bid;
+
+ for (i = 0; i < pbuf->nbufs; i++) {
+ buf = kmalloc(sizeof(*buf), GFP_KERNEL);
+ if (!buf)
+ break;
+
+ buf->addr = addr;
+ buf->len = pbuf->len;
+ buf->bid = bid;
+ addr += pbuf->len;
+ bid++;
+ if (!*head) {
+ INIT_LIST_HEAD(&buf->list);
+ *head = buf;
+ } else {
+ list_add_tail(&buf->list, &(*head)->list);
+ }
+ }
+
+ return i ? i : -ENOMEM;
+}
+
+static int io_provide_buffers(struct io_kiocb *req, bool force_nonblock)
+{
+ struct io_provide_buf *p = &req->pbuf;
+ struct io_ring_ctx *ctx = req->ctx;
+ struct io_buffer *head, *list;
+ int ret = 0;
+
+ io_ring_submit_lock(ctx, !force_nonblock);
+
+ lockdep_assert_held(&ctx->uring_lock);
+
+ list = head = idr_find(&ctx->io_buffer_idr, p->bgid);
+
+ ret = io_add_buffers(p, &head);
+ if (ret < 0)
+ goto out;
+
+ if (!list) {
+ ret = idr_alloc(&ctx->io_buffer_idr, head, p->bgid, p->bgid + 1,
+ GFP_KERNEL);
+ if (ret < 0) {
+ __io_remove_buffers(ctx, head, p->bgid, -1U);
+ goto out;
+ }
+ }
+out:
+ io_ring_submit_unlock(ctx, !force_nonblock);
+ if (ret < 0)
+ req_set_fail_links(req);
+ io_cqring_add_event(req, ret);
+ io_put_req(req);
+ return 0;
}
static int io_epoll_ctl_prep(struct io_kiocb *req,
@@ -2653,8 +3207,7 @@ static int io_epoll_ctl_prep(struct io_kiocb *req,
#endif
}
-static int io_epoll_ctl(struct io_kiocb *req, struct io_kiocb **nxt,
- bool force_nonblock)
+static int io_epoll_ctl(struct io_kiocb *req, bool force_nonblock)
{
#if defined(CONFIG_EPOLL)
struct io_epoll *ie = &req->epoll;
@@ -2667,7 +3220,7 @@ static int io_epoll_ctl(struct io_kiocb *req, struct io_kiocb **nxt,
if (ret < 0)
req_set_fail_links(req);
io_cqring_add_event(req, ret);
- io_put_req_find_next(req, nxt);
+ io_put_req(req);
return 0;
#else
return -EOPNOTSUPP;
@@ -2689,8 +3242,7 @@ static int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
#endif
}
-static int io_madvise(struct io_kiocb *req, struct io_kiocb **nxt,
- bool force_nonblock)
+static int io_madvise(struct io_kiocb *req, bool force_nonblock)
{
#if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU)
struct io_madvise *ma = &req->madvise;
@@ -2703,7 +3255,7 @@ static int io_madvise(struct io_kiocb *req, struct io_kiocb **nxt,
if (ret < 0)
req_set_fail_links(req);
io_cqring_add_event(req, ret);
- io_put_req_find_next(req, nxt);
+ io_put_req(req);
return 0;
#else
return -EOPNOTSUPP;
@@ -2721,8 +3273,7 @@ static int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
return 0;
}
-static int io_fadvise(struct io_kiocb *req, struct io_kiocb **nxt,
- bool force_nonblock)
+static int io_fadvise(struct io_kiocb *req, bool force_nonblock)
{
struct io_fadvise *fa = &req->fadvise;
int ret;
@@ -2742,7 +3293,7 @@ static int io_fadvise(struct io_kiocb *req, struct io_kiocb **nxt,
if (ret < 0)
req_set_fail_links(req);
io_cqring_add_event(req, ret);
- io_put_req_find_next(req, nxt);
+ io_put_req(req);
return 0;
}
@@ -2754,6 +3305,10 @@ static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
if (sqe->ioprio || sqe->buf_index)
return -EINVAL;
+ if (sqe->flags & IOSQE_FIXED_FILE)
+ return -EBADF;
+ if (req->flags & REQ_F_NEED_CLEANUP)
+ return 0;
req->open.dfd = READ_ONCE(sqe->fd);
req->open.mask = READ_ONCE(sqe->len);
@@ -2771,11 +3326,11 @@ static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
return ret;
}
+ req->flags |= REQ_F_NEED_CLEANUP;
return 0;
}
-static int io_statx(struct io_kiocb *req, struct io_kiocb **nxt,
- bool force_nonblock)
+static int io_statx(struct io_kiocb *req, bool force_nonblock)
{
struct io_open *ctx = &req->open;
unsigned lookup_flags;
@@ -2808,10 +3363,11 @@ retry:
ret = cp_statx(&stat, ctx->buffer);
err:
putname(ctx->filename);
+ req->flags &= ~REQ_F_NEED_CLEANUP;
if (ret < 0)
req_set_fail_links(req);
io_cqring_add_event(req, ret);
- io_put_req_find_next(req, nxt);
+ io_put_req(req);
return 0;
}
@@ -2827,7 +3383,7 @@ static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
sqe->rw_flags || sqe->buf_index)
return -EINVAL;
if (sqe->flags & IOSQE_FIXED_FILE)
- return -EINVAL;
+ return -EBADF;
req->close.fd = READ_ONCE(sqe->fd);
if (req->file->f_op == &io_uring_fops ||
@@ -2837,30 +3393,29 @@ static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
return 0;
}
-static void io_close_finish(struct io_wq_work **workptr)
+/* only called when __close_fd_get_file() is done */
+static void __io_close_finish(struct io_kiocb *req)
{
- struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
- struct io_kiocb *nxt = NULL;
-
- /* Invoked with files, we need to do the close */
- if (req->work.files) {
- int ret;
-
- ret = filp_close(req->close.put_file, req->work.files);
- if (ret < 0)
- req_set_fail_links(req);
- io_cqring_add_event(req, ret);
- }
+ int ret;
+ ret = filp_close(req->close.put_file, req->work.files);
+ if (ret < 0)
+ req_set_fail_links(req);
+ io_cqring_add_event(req, ret);
fput(req->close.put_file);
+ io_put_req(req);
+}
- io_put_req_find_next(req, &nxt);
- if (nxt)
- io_wq_assign_next(workptr, nxt);
+static void io_close_finish(struct io_wq_work **workptr)
+{
+ struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
+
+ /* not cancellable, don't do io_req_cancelled() */
+ __io_close_finish(req);
+ io_steal_work(req, workptr);
}
-static int io_close(struct io_kiocb *req, struct io_kiocb **nxt,
- bool force_nonblock)
+static int io_close(struct io_kiocb *req, bool force_nonblock)
{
int ret;
@@ -2870,37 +3425,25 @@ static int io_close(struct io_kiocb *req, struct io_kiocb **nxt,
return ret;
/* if the file has a flush method, be safe and punt to async */
- if (req->close.put_file->f_op->flush && !io_wq_current_is_worker())
- goto eagain;
-
- /*
- * No ->flush(), safely close from here and just punt the
- * fput() to async context.
- */
- ret = filp_close(req->close.put_file, current->files);
-
- if (ret < 0)
- req_set_fail_links(req);
- io_cqring_add_event(req, ret);
-
- if (io_wq_current_is_worker()) {
- struct io_wq_work *old_work, *work;
+ if (req->close.put_file->f_op->flush && force_nonblock) {
+ /* submission ref will be dropped, take it for async */
+ refcount_inc(&req->refs);
- old_work = work = &req->work;
- io_close_finish(&work);
- if (work && work != old_work)
- *nxt = container_of(work, struct io_kiocb, work);
+ req->work.func = io_close_finish;
+ /*
+ * Do manual async queue here to avoid grabbing files - we don't
+ * need the files, and it'll cause io_close_finish() to close
+ * the file again and cause a double CQE entry for this request
+ */
+ io_queue_async_work(req);
return 0;
}
-eagain:
- req->work.func = io_close_finish;
/*
- * Do manual async queue here to avoid grabbing files - we don't
- * need the files, and it'll cause io_close_finish() to close
- * the file again and cause a double CQE entry for this request
+ * No ->flush(), safely close from here and just punt the
+ * fput() to async context.
*/
- io_queue_async_work(req);
+ __io_close_finish(req);
return 0;
}
@@ -2922,82 +3465,91 @@ static int io_prep_sfr(struct io_kiocb *req, const struct io_uring_sqe *sqe)
return 0;
}
-static void io_sync_file_range_finish(struct io_wq_work **workptr)
+static void __io_sync_file_range(struct io_kiocb *req)
{
- struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
- struct io_kiocb *nxt = NULL;
int ret;
- if (io_req_cancelled(req))
- return;
-
ret = sync_file_range(req->file, req->sync.off, req->sync.len,
req->sync.flags);
if (ret < 0)
req_set_fail_links(req);
io_cqring_add_event(req, ret);
- io_put_req_find_next(req, &nxt);
+ io_put_req(req);
+}
+
+
+static void io_sync_file_range_finish(struct io_wq_work **workptr)
+{
+ struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
+ struct io_kiocb *nxt = NULL;
+
+ if (io_req_cancelled(req))
+ return;
+ __io_sync_file_range(req);
+ io_put_req(req); /* put submission ref */
if (nxt)
io_wq_assign_next(workptr, nxt);
}
-static int io_sync_file_range(struct io_kiocb *req, struct io_kiocb **nxt,
- bool force_nonblock)
+static int io_sync_file_range(struct io_kiocb *req, bool force_nonblock)
{
- struct io_wq_work *work, *old_work;
-
/* sync_file_range always requires a blocking context */
if (force_nonblock) {
- io_put_req(req);
req->work.func = io_sync_file_range_finish;
return -EAGAIN;
}
- work = old_work = &req->work;
- io_sync_file_range_finish(&work);
- if (work && work != old_work)
- *nxt = container_of(work, struct io_kiocb, work);
+ __io_sync_file_range(req);
return 0;
}
#if defined(CONFIG_NET)
-static void io_sendrecv_async(struct io_wq_work **workptr)
+static int io_setup_async_msg(struct io_kiocb *req,
+ struct io_async_msghdr *kmsg)
{
- struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
- struct iovec *iov = NULL;
-
- if (req->io->rw.iov != req->io->rw.fast_iov)
- iov = req->io->msg.iov;
- io_wq_submit_work(workptr);
- kfree(iov);
+ if (req->io)
+ return -EAGAIN;
+ if (io_alloc_async_ctx(req)) {
+ if (kmsg->iov != kmsg->fast_iov)
+ kfree(kmsg->iov);
+ return -ENOMEM;
+ }
+ req->flags |= REQ_F_NEED_CLEANUP;
+ memcpy(&req->io->msg, kmsg, sizeof(*kmsg));
+ return -EAGAIN;
}
-#endif
static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
-#if defined(CONFIG_NET)
struct io_sr_msg *sr = &req->sr_msg;
struct io_async_ctx *io = req->io;
+ int ret;
sr->msg_flags = READ_ONCE(sqe->msg_flags);
sr->msg = u64_to_user_ptr(READ_ONCE(sqe->addr));
sr->len = READ_ONCE(sqe->len);
+#ifdef CONFIG_COMPAT
+ if (req->ctx->compat)
+ sr->msg_flags |= MSG_CMSG_COMPAT;
+#endif
+
if (!io || req->opcode == IORING_OP_SEND)
return 0;
+ /* iovec is already imported */
+ if (req->flags & REQ_F_NEED_CLEANUP)
+ return 0;
io->msg.iov = io->msg.fast_iov;
- return sendmsg_copy_msghdr(&io->msg.msg, sr->msg, sr->msg_flags,
+ ret = sendmsg_copy_msghdr(&io->msg.msg, sr->msg, sr->msg_flags,
&io->msg.iov);
-#else
- return -EOPNOTSUPP;
-#endif
+ if (!ret)
+ req->flags |= REQ_F_NEED_CLEANUP;
+ return ret;
}
-static int io_sendmsg(struct io_kiocb *req, struct io_kiocb **nxt,
- bool force_nonblock)
+static int io_sendmsg(struct io_kiocb *req, bool force_nonblock)
{
-#if defined(CONFIG_NET)
struct io_async_msghdr *kmsg = NULL;
struct socket *sock;
int ret;
@@ -3008,12 +3560,11 @@ static int io_sendmsg(struct io_kiocb *req, struct io_kiocb **nxt,
sock = sock_from_file(req->file, &ret);
if (sock) {
struct io_async_ctx io;
- struct sockaddr_storage addr;
unsigned flags;
if (req->io) {
kmsg = &req->io->msg;
- kmsg->msg.msg_name = &addr;
+ kmsg->msg.msg_name = &req->io->msg.addr;
/* if iov is set, it's allocated already */
if (!kmsg->iov)
kmsg->iov = kmsg->fast_iov;
@@ -3022,7 +3573,7 @@ static int io_sendmsg(struct io_kiocb *req, struct io_kiocb **nxt,
struct io_sr_msg *sr = &req->sr_msg;
kmsg = &io.msg;
- kmsg->msg.msg_name = &addr;
+ kmsg->msg.msg_name = &io.msg.addr;
io.msg.iov = io.msg.fast_iov;
ret = sendmsg_copy_msghdr(&io.msg.msg, sr->msg,
@@ -3038,35 +3589,24 @@ static int io_sendmsg(struct io_kiocb *req, struct io_kiocb **nxt,
flags |= MSG_DONTWAIT;
ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags);
- if (force_nonblock && ret == -EAGAIN) {
- if (req->io)
- return -EAGAIN;
- if (io_alloc_async_ctx(req))
- return -ENOMEM;
- memcpy(&req->io->msg, &io.msg, sizeof(io.msg));
- req->work.func = io_sendrecv_async;
- return -EAGAIN;
- }
+ if (force_nonblock && ret == -EAGAIN)
+ return io_setup_async_msg(req, kmsg);
if (ret == -ERESTARTSYS)
ret = -EINTR;
}
- if (!io_wq_current_is_worker() && kmsg && kmsg->iov != kmsg->fast_iov)
+ if (kmsg && kmsg->iov != kmsg->fast_iov)
kfree(kmsg->iov);
+ req->flags &= ~REQ_F_NEED_CLEANUP;
io_cqring_add_event(req, ret);
if (ret < 0)
req_set_fail_links(req);
- io_put_req_find_next(req, nxt);
+ io_put_req(req);
return 0;
-#else
- return -EOPNOTSUPP;
-#endif
}
-static int io_send(struct io_kiocb *req, struct io_kiocb **nxt,
- bool force_nonblock)
+static int io_send(struct io_kiocb *req, bool force_nonblock)
{
-#if defined(CONFIG_NET)
struct socket *sock;
int ret;
@@ -3107,73 +3647,186 @@ static int io_send(struct io_kiocb *req, struct io_kiocb **nxt,
io_cqring_add_event(req, ret);
if (ret < 0)
req_set_fail_links(req);
- io_put_req_find_next(req, nxt);
+ io_put_req(req);
return 0;
-#else
- return -EOPNOTSUPP;
+}
+
+static int __io_recvmsg_copy_hdr(struct io_kiocb *req, struct io_async_ctx *io)
+{
+ struct io_sr_msg *sr = &req->sr_msg;
+ struct iovec __user *uiov;
+ size_t iov_len;
+ int ret;
+
+ ret = __copy_msghdr_from_user(&io->msg.msg, sr->msg, &io->msg.uaddr,
+ &uiov, &iov_len);
+ if (ret)
+ return ret;
+
+ if (req->flags & REQ_F_BUFFER_SELECT) {
+ if (iov_len > 1)
+ return -EINVAL;
+ if (copy_from_user(io->msg.iov, uiov, sizeof(*uiov)))
+ return -EFAULT;
+ sr->len = io->msg.iov[0].iov_len;
+ iov_iter_init(&io->msg.msg.msg_iter, READ, io->msg.iov, 1,
+ sr->len);
+ io->msg.iov = NULL;
+ } else {
+ ret = import_iovec(READ, uiov, iov_len, UIO_FASTIOV,
+ &io->msg.iov, &io->msg.msg.msg_iter);
+ if (ret > 0)
+ ret = 0;
+ }
+
+ return ret;
+}
+
+#ifdef CONFIG_COMPAT
+static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req,
+ struct io_async_ctx *io)
+{
+ struct compat_msghdr __user *msg_compat;
+ struct io_sr_msg *sr = &req->sr_msg;
+ struct compat_iovec __user *uiov;
+ compat_uptr_t ptr;
+ compat_size_t len;
+ int ret;
+
+ msg_compat = (struct compat_msghdr __user *) sr->msg;
+ ret = __get_compat_msghdr(&io->msg.msg, msg_compat, &io->msg.uaddr,
+ &ptr, &len);
+ if (ret)
+ return ret;
+
+ uiov = compat_ptr(ptr);
+ if (req->flags & REQ_F_BUFFER_SELECT) {
+ compat_ssize_t clen;
+
+ if (len > 1)
+ return -EINVAL;
+ if (!access_ok(uiov, sizeof(*uiov)))
+ return -EFAULT;
+ if (__get_user(clen, &uiov->iov_len))
+ return -EFAULT;
+ if (clen < 0)
+ return -EINVAL;
+ sr->len = io->msg.iov[0].iov_len;
+ io->msg.iov = NULL;
+ } else {
+ ret = compat_import_iovec(READ, uiov, len, UIO_FASTIOV,
+ &io->msg.iov,
+ &io->msg.msg.msg_iter);
+ if (ret < 0)
+ return ret;
+ }
+
+ return 0;
+}
#endif
+
+static int io_recvmsg_copy_hdr(struct io_kiocb *req, struct io_async_ctx *io)
+{
+ io->msg.iov = io->msg.fast_iov;
+
+#ifdef CONFIG_COMPAT
+ if (req->ctx->compat)
+ return __io_compat_recvmsg_copy_hdr(req, io);
+#endif
+
+ return __io_recvmsg_copy_hdr(req, io);
+}
+
+static struct io_buffer *io_recv_buffer_select(struct io_kiocb *req,
+ int *cflags, bool needs_lock)
+{
+ struct io_sr_msg *sr = &req->sr_msg;
+ struct io_buffer *kbuf;
+
+ if (!(req->flags & REQ_F_BUFFER_SELECT))
+ return NULL;
+
+ kbuf = io_buffer_select(req, &sr->len, sr->bgid, sr->kbuf, needs_lock);
+ if (IS_ERR(kbuf))
+ return kbuf;
+
+ sr->kbuf = kbuf;
+ req->flags |= REQ_F_BUFFER_SELECTED;
+
+ *cflags = kbuf->bid << IORING_CQE_BUFFER_SHIFT;
+ *cflags |= IORING_CQE_F_BUFFER;
+ return kbuf;
}
static int io_recvmsg_prep(struct io_kiocb *req,
const struct io_uring_sqe *sqe)
{
-#if defined(CONFIG_NET)
struct io_sr_msg *sr = &req->sr_msg;
struct io_async_ctx *io = req->io;
+ int ret;
sr->msg_flags = READ_ONCE(sqe->msg_flags);
sr->msg = u64_to_user_ptr(READ_ONCE(sqe->addr));
sr->len = READ_ONCE(sqe->len);
+ sr->bgid = READ_ONCE(sqe->buf_group);
+
+#ifdef CONFIG_COMPAT
+ if (req->ctx->compat)
+ sr->msg_flags |= MSG_CMSG_COMPAT;
+#endif
if (!io || req->opcode == IORING_OP_RECV)
return 0;
+ /* iovec is already imported */
+ if (req->flags & REQ_F_NEED_CLEANUP)
+ return 0;
- io->msg.iov = io->msg.fast_iov;
- return recvmsg_copy_msghdr(&io->msg.msg, sr->msg, sr->msg_flags,
- &io->msg.uaddr, &io->msg.iov);
-#else
- return -EOPNOTSUPP;
-#endif
+ ret = io_recvmsg_copy_hdr(req, io);
+ if (!ret)
+ req->flags |= REQ_F_NEED_CLEANUP;
+ return ret;
}
-static int io_recvmsg(struct io_kiocb *req, struct io_kiocb **nxt,
- bool force_nonblock)
+static int io_recvmsg(struct io_kiocb *req, bool force_nonblock)
{
-#if defined(CONFIG_NET)
struct io_async_msghdr *kmsg = NULL;
struct socket *sock;
- int ret;
+ int ret, cflags = 0;
if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
sock = sock_from_file(req->file, &ret);
if (sock) {
+ struct io_buffer *kbuf;
struct io_async_ctx io;
- struct sockaddr_storage addr;
unsigned flags;
if (req->io) {
kmsg = &req->io->msg;
- kmsg->msg.msg_name = &addr;
+ kmsg->msg.msg_name = &req->io->msg.addr;
/* if iov is set, it's allocated already */
if (!kmsg->iov)
kmsg->iov = kmsg->fast_iov;
kmsg->msg.msg_iter.iov = kmsg->iov;
} else {
- struct io_sr_msg *sr = &req->sr_msg;
-
kmsg = &io.msg;
- kmsg->msg.msg_name = &addr;
+ kmsg->msg.msg_name = &io.msg.addr;
- io.msg.iov = io.msg.fast_iov;
- ret = recvmsg_copy_msghdr(&io.msg.msg, sr->msg,
- sr->msg_flags, &io.msg.uaddr,
- &io.msg.iov);
+ ret = io_recvmsg_copy_hdr(req, &io);
if (ret)
return ret;
}
+ kbuf = io_recv_buffer_select(req, &cflags, !force_nonblock);
+ if (IS_ERR(kbuf)) {
+ return PTR_ERR(kbuf);
+ } else if (kbuf) {
+ kmsg->fast_iov[0].iov_base = u64_to_user_ptr(kbuf->addr);
+ iov_iter_init(&kmsg->msg.msg_iter, READ, kmsg->iov,
+ 1, req->sr_msg.len);
+ }
+
flags = req->sr_msg.msg_flags;
if (flags & MSG_DONTWAIT)
req->flags |= REQ_F_NOWAIT;
@@ -3182,37 +3835,27 @@ static int io_recvmsg(struct io_kiocb *req, struct io_kiocb **nxt,
ret = __sys_recvmsg_sock(sock, &kmsg->msg, req->sr_msg.msg,
kmsg->uaddr, flags);
- if (force_nonblock && ret == -EAGAIN) {
- if (req->io)
- return -EAGAIN;
- if (io_alloc_async_ctx(req))
- return -ENOMEM;
- memcpy(&req->io->msg, &io.msg, sizeof(io.msg));
- req->work.func = io_sendrecv_async;
- return -EAGAIN;
- }
+ if (force_nonblock && ret == -EAGAIN)
+ return io_setup_async_msg(req, kmsg);
if (ret == -ERESTARTSYS)
ret = -EINTR;
}
- if (!io_wq_current_is_worker() && kmsg && kmsg->iov != kmsg->fast_iov)
+ if (kmsg && kmsg->iov != kmsg->fast_iov)
kfree(kmsg->iov);
- io_cqring_add_event(req, ret);
+ req->flags &= ~REQ_F_NEED_CLEANUP;
+ __io_cqring_add_event(req, ret, cflags);
if (ret < 0)
req_set_fail_links(req);
- io_put_req_find_next(req, nxt);
+ io_put_req(req);
return 0;
-#else
- return -EOPNOTSUPP;
-#endif
}
-static int io_recv(struct io_kiocb *req, struct io_kiocb **nxt,
- bool force_nonblock)
+static int io_recv(struct io_kiocb *req, bool force_nonblock)
{
-#if defined(CONFIG_NET)
+ struct io_buffer *kbuf = NULL;
struct socket *sock;
- int ret;
+ int ret, cflags = 0;
if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
@@ -3220,15 +3863,25 @@ static int io_recv(struct io_kiocb *req, struct io_kiocb **nxt,
sock = sock_from_file(req->file, &ret);
if (sock) {
struct io_sr_msg *sr = &req->sr_msg;
+ void __user *buf = sr->buf;
struct msghdr msg;
struct iovec iov;
unsigned flags;
- ret = import_single_range(READ, sr->buf, sr->len, &iov,
+ kbuf = io_recv_buffer_select(req, &cflags, !force_nonblock);
+ if (IS_ERR(kbuf))
+ return PTR_ERR(kbuf);
+ else if (kbuf)
+ buf = u64_to_user_ptr(kbuf->addr);
+
+ ret = import_single_range(READ, buf, sr->len, &iov,
&msg.msg_iter);
- if (ret)
+ if (ret) {
+ kfree(kbuf);
return ret;
+ }
+ req->flags |= REQ_F_NEED_CLEANUP;
msg.msg_name = NULL;
msg.msg_control = NULL;
msg.msg_controllen = 0;
@@ -3249,20 +3902,17 @@ static int io_recv(struct io_kiocb *req, struct io_kiocb **nxt,
ret = -EINTR;
}
- io_cqring_add_event(req, ret);
+ kfree(kbuf);
+ req->flags &= ~REQ_F_NEED_CLEANUP;
+ __io_cqring_add_event(req, ret, cflags);
if (ret < 0)
req_set_fail_links(req);
- io_put_req_find_next(req, nxt);
+ io_put_req(req);
return 0;
-#else
- return -EOPNOTSUPP;
-#endif
}
-
static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
-#if defined(CONFIG_NET)
struct io_accept *accept = &req->accept;
if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL)))
@@ -3273,15 +3923,11 @@ static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
accept->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
accept->addr_len = u64_to_user_ptr(READ_ONCE(sqe->addr2));
accept->flags = READ_ONCE(sqe->accept_flags);
+ accept->nofile = rlimit(RLIMIT_NOFILE);
return 0;
-#else
- return -EOPNOTSUPP;
-#endif
}
-#if defined(CONFIG_NET)
-static int __io_accept(struct io_kiocb *req, struct io_kiocb **nxt,
- bool force_nonblock)
+static int __io_accept(struct io_kiocb *req, bool force_nonblock)
{
struct io_accept *accept = &req->accept;
unsigned file_flags;
@@ -3289,7 +3935,8 @@ static int __io_accept(struct io_kiocb *req, struct io_kiocb **nxt,
file_flags = force_nonblock ? O_NONBLOCK : 0;
ret = __sys_accept4_file(req->file, file_flags, accept->addr,
- accept->addr_len, accept->flags);
+ accept->addr_len, accept->flags,
+ accept->nofile);
if (ret == -EAGAIN && force_nonblock)
return -EAGAIN;
if (ret == -ERESTARTSYS)
@@ -3297,44 +3944,34 @@ static int __io_accept(struct io_kiocb *req, struct io_kiocb **nxt,
if (ret < 0)
req_set_fail_links(req);
io_cqring_add_event(req, ret);
- io_put_req_find_next(req, nxt);
+ io_put_req(req);
return 0;
}
static void io_accept_finish(struct io_wq_work **workptr)
{
struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
- struct io_kiocb *nxt = NULL;
if (io_req_cancelled(req))
return;
- __io_accept(req, &nxt, false);
- if (nxt)
- io_wq_assign_next(workptr, nxt);
+ __io_accept(req, false);
+ io_steal_work(req, workptr);
}
-#endif
-static int io_accept(struct io_kiocb *req, struct io_kiocb **nxt,
- bool force_nonblock)
+static int io_accept(struct io_kiocb *req, bool force_nonblock)
{
-#if defined(CONFIG_NET)
int ret;
- ret = __io_accept(req, nxt, force_nonblock);
+ ret = __io_accept(req, force_nonblock);
if (ret == -EAGAIN && force_nonblock) {
req->work.func = io_accept_finish;
- io_put_req(req);
return -EAGAIN;
}
return 0;
-#else
- return -EOPNOTSUPP;
-#endif
}
static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
-#if defined(CONFIG_NET)
struct io_connect *conn = &req->connect;
struct io_async_ctx *io = req->io;
@@ -3351,15 +3988,10 @@ static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
return move_addr_to_kernel(conn->addr, conn->addr_len,
&io->connect.address);
-#else
- return -EOPNOTSUPP;
-#endif
}
-static int io_connect(struct io_kiocb *req, struct io_kiocb **nxt,
- bool force_nonblock)
+static int io_connect(struct io_kiocb *req, bool force_nonblock)
{
-#if defined(CONFIG_NET)
struct io_async_ctx __io, *io;
unsigned file_flags;
int ret;
@@ -3395,25 +4027,301 @@ out:
if (ret < 0)
req_set_fail_links(req);
io_cqring_add_event(req, ret);
- io_put_req_find_next(req, nxt);
+ io_put_req(req);
return 0;
-#else
+}
+#else /* !CONFIG_NET */
+static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
return -EOPNOTSUPP;
-#endif
}
-static void io_poll_remove_one(struct io_kiocb *req)
+static int io_sendmsg(struct io_kiocb *req, bool force_nonblock)
{
- struct io_poll_iocb *poll = &req->poll;
+ return -EOPNOTSUPP;
+}
+
+static int io_send(struct io_kiocb *req, bool force_nonblock)
+{
+ return -EOPNOTSUPP;
+}
+
+static int io_recvmsg_prep(struct io_kiocb *req,
+ const struct io_uring_sqe *sqe)
+{
+ return -EOPNOTSUPP;
+}
+
+static int io_recvmsg(struct io_kiocb *req, bool force_nonblock)
+{
+ return -EOPNOTSUPP;
+}
+
+static int io_recv(struct io_kiocb *req, bool force_nonblock)
+{
+ return -EOPNOTSUPP;
+}
+
+static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
+ return -EOPNOTSUPP;
+}
+
+static int io_accept(struct io_kiocb *req, bool force_nonblock)
+{
+ return -EOPNOTSUPP;
+}
+
+static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
+ return -EOPNOTSUPP;
+}
+
+static int io_connect(struct io_kiocb *req, bool force_nonblock)
+{
+ return -EOPNOTSUPP;
+}
+#endif /* CONFIG_NET */
+
+struct io_poll_table {
+ struct poll_table_struct pt;
+ struct io_kiocb *req;
+ int error;
+};
+
+static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt,
+ struct wait_queue_head *head)
+{
+ if (unlikely(poll->head)) {
+ pt->error = -EINVAL;
+ return;
+ }
+
+ pt->error = 0;
+ poll->head = head;
+ add_wait_queue(head, &poll->wait);
+}
+
+static void io_async_queue_proc(struct file *file, struct wait_queue_head *head,
+ struct poll_table_struct *p)
+{
+ struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
+
+ __io_queue_proc(&pt->req->apoll->poll, pt, head);
+}
+
+static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll,
+ __poll_t mask, task_work_func_t func)
+{
+ struct task_struct *tsk;
+
+ /* for instances that support it check for an event match first: */
+ if (mask && !(mask & poll->events))
+ return 0;
+
+ trace_io_uring_task_add(req->ctx, req->opcode, req->user_data, mask);
+
+ list_del_init(&poll->wait.entry);
+
+ tsk = req->task;
+ req->result = mask;
+ init_task_work(&req->task_work, func);
+ /*
+ * If this fails, then the task is exiting. If that is the case, then
+ * the exit check will ultimately cancel these work items. Hence we
+ * don't need to check here and handle it specifically.
+ */
+ task_work_add(tsk, &req->task_work, true);
+ wake_up_process(tsk);
+ return 1;
+}
+
+static void io_async_task_func(struct callback_head *cb)
+{
+ struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
+ struct async_poll *apoll = req->apoll;
+ struct io_ring_ctx *ctx = req->ctx;
+
+ trace_io_uring_task_run(req->ctx, req->opcode, req->user_data);
+
+ WARN_ON_ONCE(!list_empty(&req->apoll->poll.wait.entry));
+
+ if (hash_hashed(&req->hash_node)) {
+ spin_lock_irq(&ctx->completion_lock);
+ hash_del(&req->hash_node);
+ spin_unlock_irq(&ctx->completion_lock);
+ }
+
+ /* restore ->work in case we need to retry again */
+ memcpy(&req->work, &apoll->work, sizeof(req->work));
+
+ __set_current_state(TASK_RUNNING);
+ mutex_lock(&ctx->uring_lock);
+ __io_queue_sqe(req, NULL);
+ mutex_unlock(&ctx->uring_lock);
+
+ kfree(apoll);
+}
+
+static int io_async_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
+ void *key)
+{
+ struct io_kiocb *req = wait->private;
+ struct io_poll_iocb *poll = &req->apoll->poll;
+
+ trace_io_uring_poll_wake(req->ctx, req->opcode, req->user_data,
+ key_to_poll(key));
+
+ return __io_async_wake(req, poll, key_to_poll(key), io_async_task_func);
+}
+
+static void io_poll_req_insert(struct io_kiocb *req)
+{
+ struct io_ring_ctx *ctx = req->ctx;
+ struct hlist_head *list;
+
+ list = &ctx->cancel_hash[hash_long(req->user_data, ctx->cancel_hash_bits)];
+ hlist_add_head(&req->hash_node, list);
+}
+
+static __poll_t __io_arm_poll_handler(struct io_kiocb *req,
+ struct io_poll_iocb *poll,
+ struct io_poll_table *ipt, __poll_t mask,
+ wait_queue_func_t wake_func)
+ __acquires(&ctx->completion_lock)
+{
+ struct io_ring_ctx *ctx = req->ctx;
+ bool cancel = false;
+
+ poll->file = req->file;
+ poll->head = NULL;
+ poll->done = poll->canceled = false;
+ poll->events = mask;
+
+ ipt->pt._key = mask;
+ ipt->req = req;
+ ipt->error = -EINVAL;
+
+ INIT_LIST_HEAD(&poll->wait.entry);
+ init_waitqueue_func_entry(&poll->wait, wake_func);
+ poll->wait.private = req;
+
+ mask = vfs_poll(req->file, &ipt->pt) & poll->events;
+
+ spin_lock_irq(&ctx->completion_lock);
+ if (likely(poll->head)) {
+ spin_lock(&poll->head->lock);
+ if (unlikely(list_empty(&poll->wait.entry))) {
+ if (ipt->error)
+ cancel = true;
+ ipt->error = 0;
+ mask = 0;
+ }
+ if (mask || ipt->error)
+ list_del_init(&poll->wait.entry);
+ else if (cancel)
+ WRITE_ONCE(poll->canceled, true);
+ else if (!poll->done) /* actually waiting for an event */
+ io_poll_req_insert(req);
+ spin_unlock(&poll->head->lock);
+ }
+
+ return mask;
+}
+
+static bool io_arm_poll_handler(struct io_kiocb *req)
+{
+ const struct io_op_def *def = &io_op_defs[req->opcode];
+ struct io_ring_ctx *ctx = req->ctx;
+ struct async_poll *apoll;
+ struct io_poll_table ipt;
+ __poll_t mask, ret;
+
+ if (!req->file || !file_can_poll(req->file))
+ return false;
+ if (req->flags & (REQ_F_MUST_PUNT | REQ_F_POLLED))
+ return false;
+ if (!def->pollin && !def->pollout)
+ return false;
+
+ apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC);
+ if (unlikely(!apoll))
+ return false;
+
+ req->flags |= REQ_F_POLLED;
+ memcpy(&apoll->work, &req->work, sizeof(req->work));
+
+ /*
+ * Don't need a reference here, as we're adding it to the task
+ * task_works list. If the task exits, the list is pruned.
+ */
+ req->task = current;
+ req->apoll = apoll;
+ INIT_HLIST_NODE(&req->hash_node);
+
+ mask = 0;
+ if (def->pollin)
+ mask |= POLLIN | POLLRDNORM;
+ if (def->pollout)
+ mask |= POLLOUT | POLLWRNORM;
+ mask |= POLLERR | POLLPRI;
+
+ ipt.pt._qproc = io_async_queue_proc;
+
+ ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask,
+ io_async_wake);
+ if (ret) {
+ ipt.error = 0;
+ apoll->poll.done = true;
+ spin_unlock_irq(&ctx->completion_lock);
+ memcpy(&req->work, &apoll->work, sizeof(req->work));
+ kfree(apoll);
+ return false;
+ }
+ spin_unlock_irq(&ctx->completion_lock);
+ trace_io_uring_poll_arm(ctx, req->opcode, req->user_data, mask,
+ apoll->poll.events);
+ return true;
+}
+
+static bool __io_poll_remove_one(struct io_kiocb *req,
+ struct io_poll_iocb *poll)
+{
+ bool do_complete = false;
spin_lock(&poll->head->lock);
WRITE_ONCE(poll->canceled, true);
if (!list_empty(&poll->wait.entry)) {
list_del_init(&poll->wait.entry);
- io_queue_async_work(req);
+ do_complete = true;
}
spin_unlock(&poll->head->lock);
+ return do_complete;
+}
+
+static bool io_poll_remove_one(struct io_kiocb *req)
+{
+ bool do_complete;
+
+ if (req->opcode == IORING_OP_POLL_ADD) {
+ do_complete = __io_poll_remove_one(req, &req->poll);
+ } else {
+ /* non-poll requests have submit ref still */
+ do_complete = __io_poll_remove_one(req, &req->apoll->poll);
+ if (do_complete)
+ io_put_req(req);
+ }
+
hash_del(&req->hash_node);
+
+ if (do_complete) {
+ io_cqring_fill_event(req, -ECANCELED);
+ io_commit_cqring(req->ctx);
+ req->flags |= REQ_F_COMP_LOCKED;
+ io_put_req(req);
+ }
+
+ return do_complete;
}
static void io_poll_remove_all(struct io_ring_ctx *ctx)
@@ -3431,6 +4339,8 @@ static void io_poll_remove_all(struct io_ring_ctx *ctx)
io_poll_remove_one(req);
}
spin_unlock_irq(&ctx->completion_lock);
+
+ io_cqring_ev_posted(ctx);
}
static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr)
@@ -3440,10 +4350,11 @@ static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr)
list = &ctx->cancel_hash[hash_long(sqe_addr, ctx->cancel_hash_bits)];
hlist_for_each_entry(req, list, hash_node) {
- if (sqe_addr == req->user_data) {
- io_poll_remove_one(req);
+ if (sqe_addr != req->user_data)
+ continue;
+ if (io_poll_remove_one(req))
return 0;
- }
+ return -EALREADY;
}
return -ENOENT;
@@ -3489,186 +4400,54 @@ static void io_poll_complete(struct io_kiocb *req, __poll_t mask, int error)
struct io_ring_ctx *ctx = req->ctx;
req->poll.done = true;
- if (error)
- io_cqring_fill_event(req, error);
- else
- io_cqring_fill_event(req, mangle_poll(mask));
+ io_cqring_fill_event(req, error ? error : mangle_poll(mask));
io_commit_cqring(ctx);
}
-static void io_poll_complete_work(struct io_wq_work **workptr)
+static void io_poll_task_handler(struct io_kiocb *req, struct io_kiocb **nxt)
{
- struct io_wq_work *work = *workptr;
- struct io_kiocb *req = container_of(work, struct io_kiocb, work);
- struct io_poll_iocb *poll = &req->poll;
- struct poll_table_struct pt = { ._key = poll->events };
struct io_ring_ctx *ctx = req->ctx;
- struct io_kiocb *nxt = NULL;
- __poll_t mask = 0;
- int ret = 0;
-
- if (work->flags & IO_WQ_WORK_CANCEL) {
- WRITE_ONCE(poll->canceled, true);
- ret = -ECANCELED;
- } else if (READ_ONCE(poll->canceled)) {
- ret = -ECANCELED;
- }
-
- if (ret != -ECANCELED)
- mask = vfs_poll(poll->file, &pt) & poll->events;
- /*
- * Note that ->ki_cancel callers also delete iocb from active_reqs after
- * calling ->ki_cancel. We need the ctx_lock roundtrip here to
- * synchronize with them. In the cancellation case the list_del_init
- * itself is not actually needed, but harmless so we keep it in to
- * avoid further branches in the fast path.
- */
spin_lock_irq(&ctx->completion_lock);
- if (!mask && ret != -ECANCELED) {
- add_wait_queue(poll->head, &poll->wait);
- spin_unlock_irq(&ctx->completion_lock);
- return;
- }
hash_del(&req->hash_node);
- io_poll_complete(req, mask, ret);
+ io_poll_complete(req, req->result, 0);
+ req->flags |= REQ_F_COMP_LOCKED;
+ io_put_req_find_next(req, nxt);
spin_unlock_irq(&ctx->completion_lock);
io_cqring_ev_posted(ctx);
-
- if (ret < 0)
- req_set_fail_links(req);
- io_put_req_find_next(req, &nxt);
- if (nxt)
- io_wq_assign_next(workptr, nxt);
}
-static void __io_poll_flush(struct io_ring_ctx *ctx, struct llist_node *nodes)
+static void io_poll_task_func(struct callback_head *cb)
{
- struct io_kiocb *req, *tmp;
- struct req_batch rb;
+ struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
+ struct io_kiocb *nxt = NULL;
- rb.to_free = rb.need_iter = 0;
- spin_lock_irq(&ctx->completion_lock);
- llist_for_each_entry_safe(req, tmp, nodes, llist_node) {
- hash_del(&req->hash_node);
- io_poll_complete(req, req->result, 0);
+ io_poll_task_handler(req, &nxt);
+ if (nxt) {
+ struct io_ring_ctx *ctx = nxt->ctx;
- if (refcount_dec_and_test(&req->refs) &&
- !io_req_multi_free(&rb, req)) {
- req->flags |= REQ_F_COMP_LOCKED;
- io_free_req(req);
- }
+ mutex_lock(&ctx->uring_lock);
+ __io_queue_sqe(nxt, NULL);
+ mutex_unlock(&ctx->uring_lock);
}
- spin_unlock_irq(&ctx->completion_lock);
-
- io_cqring_ev_posted(ctx);
- io_free_req_many(ctx, &rb);
-}
-
-static void io_poll_flush(struct io_wq_work **workptr)
-{
- struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
- struct llist_node *nodes;
-
- nodes = llist_del_all(&req->ctx->poll_llist);
- if (nodes)
- __io_poll_flush(req->ctx, nodes);
-}
-
-static void io_poll_trigger_evfd(struct io_wq_work **workptr)
-{
- struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
-
- eventfd_signal(req->ctx->cq_ev_fd, 1);
- io_put_req(req);
}
static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
void *key)
{
- struct io_poll_iocb *poll = wait->private;
- struct io_kiocb *req = container_of(poll, struct io_kiocb, poll);
- struct io_ring_ctx *ctx = req->ctx;
- __poll_t mask = key_to_poll(key);
-
- /* for instances that support it check for an event match first: */
- if (mask && !(mask & poll->events))
- return 0;
-
- list_del_init(&poll->wait.entry);
-
- /*
- * Run completion inline if we can. We're using trylock here because
- * we are violating the completion_lock -> poll wq lock ordering.
- * If we have a link timeout we're going to need the completion_lock
- * for finalizing the request, mark us as having grabbed that already.
- */
- if (mask) {
- unsigned long flags;
-
- if (llist_empty(&ctx->poll_llist) &&
- spin_trylock_irqsave(&ctx->completion_lock, flags)) {
- bool trigger_ev;
-
- hash_del(&req->hash_node);
- io_poll_complete(req, mask, 0);
-
- trigger_ev = io_should_trigger_evfd(ctx);
- if (trigger_ev && eventfd_signal_count()) {
- trigger_ev = false;
- req->work.func = io_poll_trigger_evfd;
- } else {
- req->flags |= REQ_F_COMP_LOCKED;
- io_put_req(req);
- req = NULL;
- }
- spin_unlock_irqrestore(&ctx->completion_lock, flags);
- __io_cqring_ev_posted(ctx, trigger_ev);
- } else {
- req->result = mask;
- req->llist_node.next = NULL;
- /* if the list wasn't empty, we're done */
- if (!llist_add(&req->llist_node, &ctx->poll_llist))
- req = NULL;
- else
- req->work.func = io_poll_flush;
- }
- }
- if (req)
- io_queue_async_work(req);
+ struct io_kiocb *req = wait->private;
+ struct io_poll_iocb *poll = &req->poll;
- return 1;
+ return __io_async_wake(req, poll, key_to_poll(key), io_poll_task_func);
}
-struct io_poll_table {
- struct poll_table_struct pt;
- struct io_kiocb *req;
- int error;
-};
-
static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head,
struct poll_table_struct *p)
{
struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
- if (unlikely(pt->req->poll.head)) {
- pt->error = -EINVAL;
- return;
- }
-
- pt->error = 0;
- pt->req->poll.head = head;
- add_wait_queue(head, &pt->req->poll.wait);
-}
-
-static void io_poll_req_insert(struct io_kiocb *req)
-{
- struct io_ring_ctx *ctx = req->ctx;
- struct hlist_head *list;
-
- list = &ctx->cancel_hash[hash_long(req->user_data, ctx->cancel_hash_bits)];
- hlist_add_head(&req->hash_node, list);
+ __io_queue_proc(&pt->req->poll, pt, head);
}
static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
@@ -3685,55 +4464,29 @@ static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe
events = READ_ONCE(sqe->poll_events);
poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP;
+
+ /*
+ * Don't need a reference here, as we're adding it to the task
+ * task_works list. If the task exits, the list is pruned.
+ */
+ req->task = current;
return 0;
}
-static int io_poll_add(struct io_kiocb *req, struct io_kiocb **nxt)
+static int io_poll_add(struct io_kiocb *req)
{
struct io_poll_iocb *poll = &req->poll;
struct io_ring_ctx *ctx = req->ctx;
struct io_poll_table ipt;
- bool cancel = false;
__poll_t mask;
- INIT_IO_WORK(&req->work, io_poll_complete_work);
INIT_HLIST_NODE(&req->hash_node);
-
- poll->head = NULL;
- poll->done = false;
- poll->canceled = false;
-
- ipt.pt._qproc = io_poll_queue_proc;
- ipt.pt._key = poll->events;
- ipt.req = req;
- ipt.error = -EINVAL; /* same as no support for IOCB_CMD_POLL */
-
- /* initialized the list so that we can do list_empty checks */
- INIT_LIST_HEAD(&poll->wait.entry);
- init_waitqueue_func_entry(&poll->wait, io_poll_wake);
- poll->wait.private = poll;
-
INIT_LIST_HEAD(&req->list);
+ ipt.pt._qproc = io_poll_queue_proc;
- mask = vfs_poll(poll->file, &ipt.pt) & poll->events;
+ mask = __io_arm_poll_handler(req, &req->poll, &ipt, poll->events,
+ io_poll_wake);
- spin_lock_irq(&ctx->completion_lock);
- if (likely(poll->head)) {
- spin_lock(&poll->head->lock);
- if (unlikely(list_empty(&poll->wait.entry))) {
- if (ipt.error)
- cancel = true;
- ipt.error = 0;
- mask = 0;
- }
- if (mask || ipt.error)
- list_del_init(&poll->wait.entry);
- else if (cancel)
- WRITE_ONCE(poll->canceled, true);
- else if (!poll->done) /* actually waiting for an event */
- io_poll_req_insert(req);
- spin_unlock(&poll->head->lock);
- }
if (mask) { /* no async, we'd stolen it */
ipt.error = 0;
io_poll_complete(req, mask, 0);
@@ -3742,7 +4495,7 @@ static int io_poll_add(struct io_kiocb *req, struct io_kiocb **nxt)
if (mask) {
io_cqring_ev_posted(ctx);
- io_put_req_find_next(req, nxt);
+ io_put_req(req);
}
return ipt.error;
}
@@ -3991,7 +4744,7 @@ static int io_async_cancel_one(struct io_ring_ctx *ctx, void *sqe_addr)
static void io_async_find_and_cancel(struct io_ring_ctx *ctx,
struct io_kiocb *req, __u64 sqe_addr,
- struct io_kiocb **nxt, int success_ret)
+ int success_ret)
{
unsigned long flags;
int ret;
@@ -4017,7 +4770,7 @@ done:
if (ret < 0)
req_set_fail_links(req);
- io_put_req_find_next(req, nxt);
+ io_put_req(req);
}
static int io_async_cancel_prep(struct io_kiocb *req,
@@ -4033,11 +4786,11 @@ static int io_async_cancel_prep(struct io_kiocb *req,
return 0;
}
-static int io_async_cancel(struct io_kiocb *req, struct io_kiocb **nxt)
+static int io_async_cancel(struct io_kiocb *req)
{
struct io_ring_ctx *ctx = req->ctx;
- io_async_find_and_cancel(ctx, req, req->cancel.addr, nxt, 0);
+ io_async_find_and_cancel(ctx, req, req->cancel.addr, 0);
return 0;
}
@@ -4083,6 +4836,9 @@ static int io_req_defer_prep(struct io_kiocb *req,
{
ssize_t ret = 0;
+ if (!sqe)
+ return 0;
+
if (io_op_defs[req->opcode].file_table) {
ret = io_grab_files(req);
if (unlikely(ret))
@@ -4169,6 +4925,15 @@ static int io_req_defer_prep(struct io_kiocb *req,
case IORING_OP_EPOLL_CTL:
ret = io_epoll_ctl_prep(req, sqe);
break;
+ case IORING_OP_SPLICE:
+ ret = io_splice_prep(req, sqe);
+ break;
+ case IORING_OP_PROVIDE_BUFFERS:
+ ret = io_provide_buffers_prep(req, sqe);
+ break;
+ case IORING_OP_REMOVE_BUFFERS:
+ ret = io_remove_buffers_prep(req, sqe);
+ break;
default:
printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n",
req->opcode);
@@ -4207,8 +4972,51 @@ static int io_req_defer(struct io_kiocb *req, const struct io_uring_sqe *sqe)
return -EIOCBQUEUED;
}
+static void io_cleanup_req(struct io_kiocb *req)
+{
+ struct io_async_ctx *io = req->io;
+
+ switch (req->opcode) {
+ case IORING_OP_READV:
+ case IORING_OP_READ_FIXED:
+ case IORING_OP_READ:
+ if (req->flags & REQ_F_BUFFER_SELECTED)
+ kfree((void *)(unsigned long)req->rw.addr);
+ /* fallthrough */
+ case IORING_OP_WRITEV:
+ case IORING_OP_WRITE_FIXED:
+ case IORING_OP_WRITE:
+ if (io->rw.iov != io->rw.fast_iov)
+ kfree(io->rw.iov);
+ break;
+ case IORING_OP_RECVMSG:
+ if (req->flags & REQ_F_BUFFER_SELECTED)
+ kfree(req->sr_msg.kbuf);
+ /* fallthrough */
+ case IORING_OP_SENDMSG:
+ if (io->msg.iov != io->msg.fast_iov)
+ kfree(io->msg.iov);
+ break;
+ case IORING_OP_RECV:
+ if (req->flags & REQ_F_BUFFER_SELECTED)
+ kfree(req->sr_msg.kbuf);
+ break;
+ case IORING_OP_OPENAT:
+ case IORING_OP_OPENAT2:
+ case IORING_OP_STATX:
+ putname(req->open.filename);
+ break;
+ case IORING_OP_SPLICE:
+ io_put_file(req, req->splice.file_in,
+ (req->splice.flags & SPLICE_F_FD_IN_FIXED));
+ break;
+ }
+
+ req->flags &= ~REQ_F_NEED_CLEANUP;
+}
+
static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
- struct io_kiocb **nxt, bool force_nonblock)
+ bool force_nonblock)
{
struct io_ring_ctx *ctx = req->ctx;
int ret;
@@ -4225,7 +5033,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
if (ret < 0)
break;
}
- ret = io_read(req, nxt, force_nonblock);
+ ret = io_read(req, force_nonblock);
break;
case IORING_OP_WRITEV:
case IORING_OP_WRITE_FIXED:
@@ -4235,7 +5043,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
if (ret < 0)
break;
}
- ret = io_write(req, nxt, force_nonblock);
+ ret = io_write(req, force_nonblock);
break;
case IORING_OP_FSYNC:
if (sqe) {
@@ -4243,7 +5051,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
if (ret < 0)
break;
}
- ret = io_fsync(req, nxt, force_nonblock);
+ ret = io_fsync(req, force_nonblock);
break;
case IORING_OP_POLL_ADD:
if (sqe) {
@@ -4251,7 +5059,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
if (ret)
break;
}
- ret = io_poll_add(req, nxt);
+ ret = io_poll_add(req);
break;
case IORING_OP_POLL_REMOVE:
if (sqe) {
@@ -4267,7 +5075,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
if (ret < 0)
break;
}
- ret = io_sync_file_range(req, nxt, force_nonblock);
+ ret = io_sync_file_range(req, force_nonblock);
break;
case IORING_OP_SENDMSG:
case IORING_OP_SEND:
@@ -4277,9 +5085,9 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
break;
}
if (req->opcode == IORING_OP_SENDMSG)
- ret = io_sendmsg(req, nxt, force_nonblock);
+ ret = io_sendmsg(req, force_nonblock);
else
- ret = io_send(req, nxt, force_nonblock);
+ ret = io_send(req, force_nonblock);
break;
case IORING_OP_RECVMSG:
case IORING_OP_RECV:
@@ -4289,9 +5097,9 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
break;
}
if (req->opcode == IORING_OP_RECVMSG)
- ret = io_recvmsg(req, nxt, force_nonblock);
+ ret = io_recvmsg(req, force_nonblock);
else
- ret = io_recv(req, nxt, force_nonblock);
+ ret = io_recv(req, force_nonblock);
break;
case IORING_OP_TIMEOUT:
if (sqe) {
@@ -4315,7 +5123,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
if (ret)
break;
}
- ret = io_accept(req, nxt, force_nonblock);
+ ret = io_accept(req, force_nonblock);
break;
case IORING_OP_CONNECT:
if (sqe) {
@@ -4323,7 +5131,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
if (ret)
break;
}
- ret = io_connect(req, nxt, force_nonblock);
+ ret = io_connect(req, force_nonblock);
break;
case IORING_OP_ASYNC_CANCEL:
if (sqe) {
@@ -4331,7 +5139,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
if (ret)
break;
}
- ret = io_async_cancel(req, nxt);
+ ret = io_async_cancel(req);
break;
case IORING_OP_FALLOCATE:
if (sqe) {
@@ -4339,7 +5147,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
if (ret)
break;
}
- ret = io_fallocate(req, nxt, force_nonblock);
+ ret = io_fallocate(req, force_nonblock);
break;
case IORING_OP_OPENAT:
if (sqe) {
@@ -4347,7 +5155,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
if (ret)
break;
}
- ret = io_openat(req, nxt, force_nonblock);
+ ret = io_openat(req, force_nonblock);
break;
case IORING_OP_CLOSE:
if (sqe) {
@@ -4355,7 +5163,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
if (ret)
break;
}
- ret = io_close(req, nxt, force_nonblock);
+ ret = io_close(req, force_nonblock);
break;
case IORING_OP_FILES_UPDATE:
if (sqe) {
@@ -4371,7 +5179,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
if (ret)
break;
}
- ret = io_statx(req, nxt, force_nonblock);
+ ret = io_statx(req, force_nonblock);
break;
case IORING_OP_FADVISE:
if (sqe) {
@@ -4379,7 +5187,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
if (ret)
break;
}
- ret = io_fadvise(req, nxt, force_nonblock);
+ ret = io_fadvise(req, force_nonblock);
break;
case IORING_OP_MADVISE:
if (sqe) {
@@ -4387,7 +5195,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
if (ret)
break;
}
- ret = io_madvise(req, nxt, force_nonblock);
+ ret = io_madvise(req, force_nonblock);
break;
case IORING_OP_OPENAT2:
if (sqe) {
@@ -4395,7 +5203,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
if (ret)
break;
}
- ret = io_openat2(req, nxt, force_nonblock);
+ ret = io_openat2(req, force_nonblock);
break;
case IORING_OP_EPOLL_CTL:
if (sqe) {
@@ -4403,7 +5211,31 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
if (ret)
break;
}
- ret = io_epoll_ctl(req, nxt, force_nonblock);
+ ret = io_epoll_ctl(req, force_nonblock);
+ break;
+ case IORING_OP_SPLICE:
+ if (sqe) {
+ ret = io_splice_prep(req, sqe);
+ if (ret < 0)
+ break;
+ }
+ ret = io_splice(req, force_nonblock);
+ break;
+ case IORING_OP_PROVIDE_BUFFERS:
+ if (sqe) {
+ ret = io_provide_buffers_prep(req, sqe);
+ if (ret)
+ break;
+ }
+ ret = io_provide_buffers(req, force_nonblock);
+ break;
+ case IORING_OP_REMOVE_BUFFERS:
+ if (sqe) {
+ ret = io_remove_buffers_prep(req, sqe);
+ if (ret)
+ break;
+ }
+ ret = io_remove_buffers(req, force_nonblock);
break;
default:
ret = -EINVAL;
@@ -4436,7 +5268,6 @@ static void io_wq_submit_work(struct io_wq_work **workptr)
{
struct io_wq_work *work = *workptr;
struct io_kiocb *req = container_of(work, struct io_kiocb, work);
- struct io_kiocb *nxt = NULL;
int ret = 0;
/* if NO_CANCEL is set, we must still run the work */
@@ -4446,10 +5277,8 @@ static void io_wq_submit_work(struct io_wq_work **workptr)
}
if (!ret) {
- req->has_user = (work->flags & IO_WQ_WORK_HAS_MM) != 0;
- req->in_async = true;
do {
- ret = io_issue_sqe(req, NULL, &nxt, false);
+ ret = io_issue_sqe(req, NULL, false);
/*
* We can get EAGAIN for polled IO even though we're
* forcing a sync submission from here, since we can't
@@ -4461,25 +5290,20 @@ static void io_wq_submit_work(struct io_wq_work **workptr)
} while (1);
}
- /* drop submission reference */
- io_put_req(req);
-
if (ret) {
req_set_fail_links(req);
io_cqring_add_event(req, ret);
io_put_req(req);
}
- /* if a dependent link is ready, pass it back */
- if (!ret && nxt)
- io_wq_assign_next(workptr, nxt);
+ io_steal_work(req, workptr);
}
static int io_req_needs_file(struct io_kiocb *req, int fd)
{
if (!io_op_defs[req->opcode].needs_file)
return 0;
- if (fd == -1 && io_op_defs[req->opcode].fd_non_neg)
+ if ((fd == -1 || fd == AT_FDCWD) && io_op_defs[req->opcode].fd_non_neg)
return 0;
return 1;
}
@@ -4493,41 +5317,52 @@ static inline struct file *io_file_from_index(struct io_ring_ctx *ctx,
return table->files[index & IORING_FILE_TABLE_MASK];;
}
-static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req,
- const struct io_uring_sqe *sqe)
+static int io_file_get(struct io_submit_state *state, struct io_kiocb *req,
+ int fd, struct file **out_file, bool fixed)
{
struct io_ring_ctx *ctx = req->ctx;
- unsigned flags;
- int fd;
-
- flags = READ_ONCE(sqe->flags);
- fd = READ_ONCE(sqe->fd);
-
- if (!io_req_needs_file(req, fd))
- return 0;
+ struct file *file;
- if (flags & IOSQE_FIXED_FILE) {
+ if (fixed) {
if (unlikely(!ctx->file_data ||
(unsigned) fd >= ctx->nr_user_files))
return -EBADF;
fd = array_index_nospec(fd, ctx->nr_user_files);
- req->file = io_file_from_index(ctx, fd);
- if (!req->file)
+ file = io_file_from_index(ctx, fd);
+ if (!file)
return -EBADF;
- req->flags |= REQ_F_FIXED_FILE;
percpu_ref_get(&ctx->file_data->refs);
} else {
- if (req->needs_fixed_file)
- return -EBADF;
trace_io_uring_file_get(ctx, fd);
- req->file = io_file_get(state, fd);
- if (unlikely(!req->file))
+ file = __io_file_get(state, fd);
+ if (unlikely(!file))
return -EBADF;
}
+ *out_file = file;
return 0;
}
+static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req,
+ const struct io_uring_sqe *sqe)
+{
+ unsigned flags;
+ int fd;
+ bool fixed;
+
+ flags = READ_ONCE(sqe->flags);
+ fd = READ_ONCE(sqe->fd);
+
+ if (!io_req_needs_file(req, fd))
+ return 0;
+
+ fixed = (flags & IOSQE_FIXED_FILE);
+ if (unlikely(!fixed && req->needs_fixed_file))
+ return -EBADF;
+
+ return io_file_get(state, req, fd, &req->file, fixed);
+}
+
static int io_grab_files(struct io_kiocb *req)
{
int ret = -EBADF;
@@ -4587,8 +5422,7 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
if (prev) {
req_set_fail_links(prev);
- io_async_find_and_cancel(ctx, req, prev->user_data, NULL,
- -ETIME);
+ io_async_find_and_cancel(ctx, req, prev->user_data, -ETIME);
io_put_req(prev);
} else {
io_cqring_add_event(req, -ETIME);
@@ -4625,6 +5459,9 @@ static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req)
if (!(req->flags & REQ_F_LINK))
return NULL;
+ /* for polled retry, if flag is set, we already went through here */
+ if (req->flags & REQ_F_POLLED)
+ return NULL;
nxt = list_first_entry_or_null(&req->link_list, struct io_kiocb,
link_list);
@@ -4638,13 +5475,23 @@ static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req)
static void __io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_kiocb *linked_timeout;
- struct io_kiocb *nxt = NULL;
+ struct io_kiocb *nxt;
+ const struct cred *old_creds = NULL;
int ret;
again:
linked_timeout = io_prep_linked_timeout(req);
- ret = io_issue_sqe(req, sqe, &nxt, true);
+ if (req->work.creds && req->work.creds != current_cred()) {
+ if (old_creds)
+ revert_creds(old_creds);
+ if (old_creds == req->work.creds)
+ old_creds = NULL; /* restored original creds */
+ else
+ old_creds = override_creds(req->work.creds);
+ }
+
+ ret = io_issue_sqe(req, sqe, true);
/*
* We async punt it if the file wasn't marked NOWAIT, or if the file
@@ -4652,6 +5499,11 @@ again:
*/
if (ret == -EAGAIN && (!(req->flags & REQ_F_NOWAIT) ||
(req->flags & REQ_F_MUST_PUNT))) {
+ if (io_arm_poll_handler(req)) {
+ if (linked_timeout)
+ io_queue_linked_timeout(linked_timeout);
+ goto exit;
+ }
punt:
if (io_op_defs[req->opcode].file_table) {
ret = io_grab_files(req);
@@ -4664,12 +5516,13 @@ punt:
* submit reference when the iocb is actually submitted.
*/
io_queue_async_work(req);
- goto done_req;
+ goto exit;
}
err:
+ nxt = NULL;
/* drop submission reference */
- io_put_req(req);
+ io_put_req_find_next(req, &nxt);
if (linked_timeout) {
if (!ret)
@@ -4684,15 +5537,16 @@ err:
req_set_fail_links(req);
io_put_req(req);
}
-done_req:
if (nxt) {
req = nxt;
- nxt = NULL;
if (req->flags & REQ_F_FORCE_ASYNC)
goto punt;
goto again;
}
+exit:
+ if (old_creds)
+ revert_creds(old_creds);
}
static void io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe)
@@ -4732,12 +5586,12 @@ static inline void io_queue_link_head(struct io_kiocb *req)
}
#define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK| \
- IOSQE_IO_HARDLINK | IOSQE_ASYNC)
+ IOSQE_IO_HARDLINK | IOSQE_ASYNC | \
+ IOSQE_BUFFER_SELECT)
static bool io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
struct io_submit_state *state, struct io_kiocb **link)
{
- const struct cred *old_creds = NULL;
struct io_ring_ctx *ctx = req->ctx;
unsigned int sqe_flags;
int ret, id;
@@ -4750,29 +5604,32 @@ static bool io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
goto err_req;
}
+ if ((sqe_flags & IOSQE_BUFFER_SELECT) &&
+ !io_op_defs[req->opcode].buffer_select) {
+ ret = -EOPNOTSUPP;
+ goto err_req;
+ }
+
id = READ_ONCE(sqe->personality);
if (id) {
- const struct cred *personality_creds;
-
- personality_creds = idr_find(&ctx->personality_idr, id);
- if (unlikely(!personality_creds)) {
+ req->work.creds = idr_find(&ctx->personality_idr, id);
+ if (unlikely(!req->work.creds)) {
ret = -EINVAL;
goto err_req;
}
- old_creds = override_creds(personality_creds);
+ get_cred(req->work.creds);
}
/* same numerical values with corresponding REQ_F_*, safe to copy */
- req->flags |= sqe_flags & (IOSQE_IO_DRAIN|IOSQE_IO_HARDLINK|
- IOSQE_ASYNC);
+ req->flags |= sqe_flags & (IOSQE_IO_DRAIN | IOSQE_IO_HARDLINK |
+ IOSQE_ASYNC | IOSQE_FIXED_FILE |
+ IOSQE_BUFFER_SELECT);
ret = io_req_set_file(state, req, sqe);
if (unlikely(ret)) {
err_req:
io_cqring_add_event(req, ret);
io_double_put_req(req);
- if (old_creds)
- revert_creds(old_creds);
return false;
}
@@ -4824,6 +5681,11 @@ err_req:
if (sqe_flags & (IOSQE_IO_LINK|IOSQE_IO_HARDLINK)) {
req->flags |= REQ_F_LINK;
INIT_LIST_HEAD(&req->link_list);
+
+ if (io_alloc_async_ctx(req)) {
+ ret = -EAGAIN;
+ goto err_req;
+ }
ret = io_req_defer_prep(req, sqe);
if (ret)
req->flags |= REQ_F_FAIL_LINK;
@@ -4833,8 +5695,6 @@ err_req:
}
}
- if (old_creds)
- revert_creds(old_creds);
return true;
}
@@ -4950,6 +5810,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
for (i = 0; i < nr; i++) {
const struct io_uring_sqe *sqe;
struct io_kiocb *req;
+ int err;
req = io_get_req(ctx, statep);
if (unlikely(!req)) {
@@ -4966,21 +5827,23 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
submitted++;
if (unlikely(req->opcode >= IORING_OP_LAST)) {
- io_cqring_add_event(req, -EINVAL);
+ err = -EINVAL;
+fail_req:
+ io_cqring_add_event(req, err);
io_double_put_req(req);
break;
}
if (io_op_defs[req->opcode].needs_mm && !*mm) {
mm_fault = mm_fault || !mmget_not_zero(ctx->sqo_mm);
- if (!mm_fault) {
- use_mm(ctx->sqo_mm);
- *mm = ctx->sqo_mm;
+ if (unlikely(mm_fault)) {
+ err = -EFAULT;
+ goto fail_req;
}
+ use_mm(ctx->sqo_mm);
+ *mm = ctx->sqo_mm;
}
- req->has_user = *mm != NULL;
- req->in_async = async;
req->needs_fixed_file = async;
trace_io_uring_submit_sqe(ctx, req->opcode, req->user_data,
true, async);
@@ -5011,9 +5874,8 @@ static int io_sq_thread(void *data)
const struct cred *old_cred;
mm_segment_t old_fs;
DEFINE_WAIT(wait);
- unsigned inflight;
unsigned long timeout;
- int ret;
+ int ret = 0;
complete(&ctx->completions[1]);
@@ -5021,39 +5883,19 @@ static int io_sq_thread(void *data)
set_fs(USER_DS);
old_cred = override_creds(ctx->creds);
- ret = timeout = inflight = 0;
+ timeout = jiffies + ctx->sq_thread_idle;
while (!kthread_should_park()) {
unsigned int to_submit;
- if (inflight) {
+ if (!list_empty(&ctx->poll_list)) {
unsigned nr_events = 0;
- if (ctx->flags & IORING_SETUP_IOPOLL) {
- /*
- * inflight is the count of the maximum possible
- * entries we submitted, but it can be smaller
- * if we dropped some of them. If we don't have
- * poll entries available, then we know that we
- * have nothing left to poll for. Reset the
- * inflight count to zero in that case.
- */
- mutex_lock(&ctx->uring_lock);
- if (!list_empty(&ctx->poll_list))
- __io_iopoll_check(ctx, &nr_events, 0);
- else
- inflight = 0;
- mutex_unlock(&ctx->uring_lock);
- } else {
- /*
- * Normal IO, just pretend everything completed.
- * We don't have to poll completions for that.
- */
- nr_events = inflight;
- }
-
- inflight -= nr_events;
- if (!inflight)
+ mutex_lock(&ctx->uring_lock);
+ if (!list_empty(&ctx->poll_list))
+ io_iopoll_getevents(ctx, &nr_events, 0);
+ else
timeout = jiffies + ctx->sq_thread_idle;
+ mutex_unlock(&ctx->uring_lock);
}
to_submit = io_sqring_entries(ctx);
@@ -5064,34 +5906,49 @@ static int io_sq_thread(void *data)
*/
if (!to_submit || ret == -EBUSY) {
/*
+ * Drop cur_mm before scheduling, we can't hold it for
+ * long periods (or over schedule()). Do this before
+ * adding ourselves to the waitqueue, as the unuse/drop
+ * may sleep.
+ */
+ if (cur_mm) {
+ unuse_mm(cur_mm);
+ mmput(cur_mm);
+ cur_mm = NULL;
+ }
+
+ /*
* We're polling. If we're within the defined idle
* period, then let us spin without work before going
* to sleep. The exception is if we got EBUSY doing
* more IO, we should wait for the application to
* reap events and wake us up.
*/
- if (inflight ||
+ if (!list_empty(&ctx->poll_list) ||
(!time_after(jiffies, timeout) && ret != -EBUSY &&
!percpu_ref_is_dying(&ctx->refs))) {
+ if (current->task_works)
+ task_work_run();
cond_resched();
continue;
}
+ prepare_to_wait(&ctx->sqo_wait, &wait,
+ TASK_INTERRUPTIBLE);
+
/*
- * Drop cur_mm before scheduling, we can't hold it for
- * long periods (or over schedule()). Do this before
- * adding ourselves to the waitqueue, as the unuse/drop
- * may sleep.
+ * While doing polled IO, before going to sleep, we need
+ * to check whether new reqs were added to poll_list:
+ * reqs may have been punted to an io worker and only
+ * added to poll_list later, hence check the poll_list
+ * again.
*/
- if (cur_mm) {
- unuse_mm(cur_mm);
- mmput(cur_mm);
- cur_mm = NULL;
+ if ((ctx->flags & IORING_SETUP_IOPOLL) &&
+ !list_empty_careful(&ctx->poll_list)) {
+ finish_wait(&ctx->sqo_wait, &wait);
+ continue;
}
- prepare_to_wait(&ctx->sqo_wait, &wait,
- TASK_INTERRUPTIBLE);
-
/* Tell userspace we may need a wakeup call */
ctx->rings->sq_flags |= IORING_SQ_NEED_WAKEUP;
/* make sure to read SQ tail after writing flags */
@@ -5103,6 +5960,10 @@ static int io_sq_thread(void *data)
finish_wait(&ctx->sqo_wait, &wait);
break;
}
+ if (current->task_works) {
+ task_work_run();
+ continue;
+ }
if (signal_pending(current))
flush_signals(current);
schedule();
@@ -5119,10 +5980,12 @@ static int io_sq_thread(void *data)
mutex_lock(&ctx->uring_lock);
ret = io_submit_sqes(ctx, to_submit, NULL, -1, &cur_mm, true);
mutex_unlock(&ctx->uring_lock);
- if (ret > 0)
- inflight += ret;
+ timeout = jiffies + ctx->sq_thread_idle;
}
+ if (current->task_works)
+ task_work_run();
+
set_fs(old_fs);
if (cur_mm) {
unuse_mm(cur_mm);
@@ -5187,8 +6050,13 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
struct io_rings *rings = ctx->rings;
int ret = 0;
- if (io_cqring_events(ctx, false) >= min_events)
- return 0;
+ do {
+ if (io_cqring_events(ctx, false) >= min_events)
+ return 0;
+ if (!current->task_works)
+ break;
+ task_work_run();
+ } while (1);
if (sig) {
#ifdef CONFIG_COMPAT
@@ -5208,6 +6076,8 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
do {
prepare_to_wait_exclusive(&ctx->wait, &iowq.wq,
TASK_INTERRUPTIBLE);
+ if (current->task_works)
+ task_work_run();
if (io_should_wake(&iowq, false))
break;
schedule();
@@ -5254,6 +6124,23 @@ static void io_file_ref_kill(struct percpu_ref *ref)
complete(&data->done);
}
+static void io_file_ref_exit_and_free(struct work_struct *work)
+{
+ struct fixed_file_data *data;
+
+ data = container_of(work, struct fixed_file_data, ref_work);
+
+ /*
+ * Ensure any percpu-ref atomic switch callback has run, it could have
+ * been in progress when the files were being unregistered. Once
+ * that's done, we can safely exit and free the ref and containing
+ * data structure.
+ */
+ rcu_barrier();
+ percpu_ref_exit(&data->refs);
+ kfree(data);
+}
+
static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
{
struct fixed_file_data *data = ctx->file_data;
@@ -5266,14 +6153,14 @@ static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
flush_work(&data->ref_work);
wait_for_completion(&data->done);
io_ring_file_ref_flush(data);
- percpu_ref_exit(&data->refs);
__io_sqe_files_unregister(ctx);
nr_tables = DIV_ROUND_UP(ctx->nr_user_files, IORING_MAX_FILES_TABLE);
for (i = 0; i < nr_tables; i++)
kfree(data->table[i].files);
kfree(data->table);
- kfree(data);
+ INIT_WORK(&data->ref_work, io_file_ref_exit_and_free);
+ queue_work(system_wq, &data->ref_work);
ctx->file_data = NULL;
ctx->nr_user_files = 0;
return 0;
@@ -5500,7 +6387,6 @@ static void io_ring_file_put(struct io_ring_ctx *ctx, struct file *file)
struct io_file_put {
struct llist_node llist;
struct file *file;
- struct completion *done;
};
static void io_ring_file_ref_flush(struct fixed_file_data *data)
@@ -5511,10 +6397,7 @@ static void io_ring_file_ref_flush(struct fixed_file_data *data)
while ((node = llist_del_all(&data->put_llist)) != NULL) {
llist_for_each_entry_safe(pfile, tmp, node, llist) {
io_ring_file_put(data->ctx, pfile->file);
- if (pfile->done)
- complete(pfile->done);
- else
- kfree(pfile);
+ kfree(pfile);
}
}
}
@@ -5525,7 +6408,6 @@ static void io_ring_file_ref_switch(struct work_struct *work)
data = container_of(work, struct fixed_file_data, ref_work);
io_ring_file_ref_flush(data);
- percpu_ref_get(&data->refs);
percpu_ref_switch_to_percpu(&data->refs);
}
@@ -5701,41 +6583,27 @@ static void io_atomic_switch(struct percpu_ref *ref)
{
struct fixed_file_data *data;
+ /*
+ * Juggle reference to ensure we hit zero, if needed, so we can
+ * switch back to percpu mode
+ */
data = container_of(ref, struct fixed_file_data, refs);
- clear_bit(FFD_F_ATOMIC, &data->state);
+ percpu_ref_put(&data->refs);
+ percpu_ref_get(&data->refs);
}
-static bool io_queue_file_removal(struct fixed_file_data *data,
+static int io_queue_file_removal(struct fixed_file_data *data,
struct file *file)
{
- struct io_file_put *pfile, pfile_stack;
- DECLARE_COMPLETION_ONSTACK(done);
+ struct io_file_put *pfile;
- /*
- * If we fail allocating the struct we need for doing async reomval
- * of this file, just punt to sync and wait for it.
- */
pfile = kzalloc(sizeof(*pfile), GFP_KERNEL);
- if (!pfile) {
- pfile = &pfile_stack;
- pfile->done = &done;
- }
+ if (!pfile)
+ return -ENOMEM;
pfile->file = file;
llist_add(&pfile->llist, &data->put_llist);
-
- if (pfile == &pfile_stack) {
- if (!test_and_set_bit(FFD_F_ATOMIC, &data->state)) {
- percpu_ref_put(&data->refs);
- percpu_ref_switch_to_atomic(&data->refs,
- io_atomic_switch);
- }
- wait_for_completion(&done);
- flush_work(&data->ref_work);
- return false;
- }
-
- return true;
+ return 0;
}
static int __io_sqe_files_update(struct io_ring_ctx *ctx,
@@ -5770,9 +6638,11 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
index = i & IORING_FILE_TABLE_MASK;
if (table->files[index]) {
file = io_file_from_index(ctx, index);
+ err = io_queue_file_removal(data, file);
+ if (err)
+ break;
table->files[index] = NULL;
- if (io_queue_file_removal(data, file))
- ref_switch = true;
+ ref_switch = true;
}
if (fd != -1) {
file = fget(fd);
@@ -5803,10 +6673,8 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
up->offset++;
}
- if (ref_switch && !test_and_set_bit(FFD_F_ATOMIC, &data->state)) {
- percpu_ref_put(&data->refs);
+ if (ref_switch)
percpu_ref_switch_to_atomic(&data->refs, io_atomic_switch);
- }
return done ? done : err;
}
@@ -5827,20 +6695,14 @@ static int io_sqe_files_update(struct io_ring_ctx *ctx, void __user *arg,
return __io_sqe_files_update(ctx, &up, nr_args);
}
-static void io_put_work(struct io_wq_work *work)
+static void io_free_work(struct io_wq_work *work)
{
struct io_kiocb *req = container_of(work, struct io_kiocb, work);
+ /* Consider that io_steal_work() relies on this ref */
io_put_req(req);
}
-static void io_get_work(struct io_wq_work *work)
-{
- struct io_kiocb *req = container_of(work, struct io_kiocb, work);
-
- refcount_inc(&req->refs);
-}
-
static int io_init_wq_offload(struct io_ring_ctx *ctx,
struct io_uring_params *p)
{
@@ -5851,8 +6713,7 @@ static int io_init_wq_offload(struct io_ring_ctx *ctx,
int ret = 0;
data.user = ctx->user;
- data.get_work = io_get_work;
- data.put_work = io_put_work;
+ data.free_work = io_free_work;
if (!(p->flags & IORING_SETUP_ATTACH_WQ)) {
/* Do QD, or 4 * CPUS, whatever is smallest */
@@ -6254,6 +7115,21 @@ static int io_eventfd_unregister(struct io_ring_ctx *ctx)
return -ENXIO;
}
+static int __io_destroy_buffers(int id, void *p, void *data)
+{
+ struct io_ring_ctx *ctx = data;
+ struct io_buffer *buf = p;
+
+ __io_remove_buffers(ctx, buf, id, -1U);
+ return 0;
+}
+
+static void io_destroy_buffers(struct io_ring_ctx *ctx)
+{
+ idr_for_each(&ctx->io_buffer_idr, __io_destroy_buffers, ctx);
+ idr_destroy(&ctx->io_buffer_idr);
+}
+
static void io_ring_ctx_free(struct io_ring_ctx *ctx)
{
io_finish_async(ctx);
@@ -6264,6 +7140,8 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx)
io_sqe_buffer_unregister(ctx);
io_sqe_files_unregister(ctx);
io_eventfd_unregister(ctx);
+ io_destroy_buffers(ctx);
+ idr_destroy(&ctx->personality_idr);
#if defined(CONFIG_UNIX)
if (ctx->ring_sock) {
@@ -6301,7 +7179,7 @@ static __poll_t io_uring_poll(struct file *file, poll_table *wait)
if (READ_ONCE(ctx->rings->sq.tail) - ctx->cached_sq_head !=
ctx->rings->sq_ring_entries)
mask |= EPOLLOUT | EPOLLWRNORM;
- if (READ_ONCE(ctx->rings->cq.head) != ctx->cached_cq_tail)
+ if (io_cqring_events(ctx, false))
mask |= EPOLLIN | EPOLLRDNORM;
return mask;
@@ -6393,6 +7271,29 @@ static void io_uring_cancel_files(struct io_ring_ctx *ctx,
if (!cancel_req)
break;
+ if (cancel_req->flags & REQ_F_OVERFLOW) {
+ spin_lock_irq(&ctx->completion_lock);
+ list_del(&cancel_req->list);
+ cancel_req->flags &= ~REQ_F_OVERFLOW;
+ if (list_empty(&ctx->cq_overflow_list)) {
+ clear_bit(0, &ctx->sq_check_overflow);
+ clear_bit(0, &ctx->cq_check_overflow);
+ }
+ spin_unlock_irq(&ctx->completion_lock);
+
+ WRITE_ONCE(ctx->rings->cq_overflow,
+ atomic_inc_return(&ctx->cached_cq_overflow));
+
+ /*
+ * Put inflight ref and overflow ref. If that's
+ * all we had, then we're done with this request.
+ */
+ if (refcount_sub_and_test(2, &cancel_req->refs)) {
+ io_put_req(cancel_req);
+ continue;
+ }
+ }
+
io_wq_cancel_work(ctx->io_wq, &cancel_req->work);
io_put_req(cancel_req);
schedule();
@@ -6405,6 +7306,13 @@ static int io_uring_flush(struct file *file, void *data)
struct io_ring_ctx *ctx = file->private_data;
io_uring_cancel_files(ctx, data);
+
+ /*
+ * If the task is going away, cancel work it may have pending
+ */
+ if (fatal_signal_pending(current) || (current->flags & PF_EXITING))
+ io_wq_cancel_pid(ctx->io_wq, task_pid_vnr(current));
+
return 0;
}
@@ -6487,6 +7395,9 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
int submitted = 0;
struct fd f;
+ if (current->task_works)
+ task_work_run();
+
if (flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP))
return -EINVAL;
@@ -6533,7 +7444,14 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
min_complete = min(min_complete, ctx->cq_entries);
- if (ctx->flags & IORING_SETUP_IOPOLL) {
+ /*
+ * When SETUP_IOPOLL and SETUP_SQPOLL are both enabled, user
+ * space applications don't need to do io completion events
+ * polling again, they can rely on io_sq_thread to do polling
+ * work, which can reduce cpu usage and uring_lock contention.
+ */
+ if (ctx->flags & IORING_SETUP_IOPOLL &&
+ !(ctx->flags & IORING_SETUP_SQPOLL)) {
ret = io_iopoll_check(ctx, &nr_events, min_complete);
} else {
ret = io_cqring_wait(ctx, min_complete, sig, sigsz);
@@ -6547,6 +7465,7 @@ out_fput:
return submitted ? submitted : ret;
}
+#ifdef CONFIG_PROC_FS
static int io_uring_show_cred(int id, void *p, void *data)
{
const struct cred *cred = p;
@@ -6608,6 +7527,17 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
seq_printf(m, "Personalities:\n");
idr_for_each(&ctx->personality_idr, io_uring_show_cred, m);
}
+ seq_printf(m, "PollList:\n");
+ spin_lock_irq(&ctx->completion_lock);
+ for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) {
+ struct hlist_head *list = &ctx->cancel_hash[i];
+ struct io_kiocb *req;
+
+ hlist_for_each_entry(req, list, hash_node)
+ seq_printf(m, " op=%d, task_works=%d\n", req->opcode,
+ req->task->task_works != NULL);
+ }
+ spin_unlock_irq(&ctx->completion_lock);
mutex_unlock(&ctx->uring_lock);
}
@@ -6620,6 +7550,7 @@ static void io_uring_show_fdinfo(struct seq_file *m, struct file *f)
percpu_ref_put(&ctx->refs);
}
}
+#endif
static const struct file_operations io_uring_fops = {
.release = io_uring_release,
@@ -6631,7 +7562,9 @@ static const struct file_operations io_uring_fops = {
#endif
.poll = io_uring_poll,
.fasync = io_uring_fasync,
+#ifdef CONFIG_PROC_FS
.show_fdinfo = io_uring_show_fdinfo,
+#endif
};
static int io_allocate_scq_urings(struct io_ring_ctx *ctx,
@@ -6821,7 +7754,7 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p)
p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP |
IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS |
- IORING_FEAT_CUR_PERSONALITY;
+ IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL;
trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags);
return ret;
err:
@@ -7099,6 +8032,7 @@ static int __init io_uring_init(void)
BUILD_BUG_SQE_ELEM(8, __u64, off);
BUILD_BUG_SQE_ELEM(8, __u64, addr2);
BUILD_BUG_SQE_ELEM(16, __u64, addr);
+ BUILD_BUG_SQE_ELEM(16, __u64, splice_off_in);
BUILD_BUG_SQE_ELEM(24, __u32, len);
BUILD_BUG_SQE_ELEM(28, __kernel_rwf_t, rw_flags);
BUILD_BUG_SQE_ELEM(28, /* compat */ int, rw_flags);
@@ -7113,11 +8047,14 @@ static int __init io_uring_init(void)
BUILD_BUG_SQE_ELEM(28, __u32, open_flags);
BUILD_BUG_SQE_ELEM(28, __u32, statx_flags);
BUILD_BUG_SQE_ELEM(28, __u32, fadvise_advice);
+ BUILD_BUG_SQE_ELEM(28, __u32, splice_flags);
BUILD_BUG_SQE_ELEM(32, __u64, user_data);
BUILD_BUG_SQE_ELEM(40, __u16, buf_index);
BUILD_BUG_SQE_ELEM(42, __u16, personality);
+ BUILD_BUG_SQE_ELEM(44, __s32, splice_fd_in);
BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST);
+ BUILD_BUG_ON(__REQ_F_LAST_BIT >= 8 * sizeof(int));
req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC);
return 0;
};
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 2494095e0340..27373f5792a4 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -976,29 +976,33 @@ restart_loop:
* it. */
/*
- * A buffer which has been freed while still being journaled by
- * a previous transaction.
- */
- if (buffer_freed(bh)) {
+ * A buffer which has been freed while still being journaled
+ * by a previous transaction, refile the buffer to BJ_Forget of
+ * the running transaction. If the just committed transaction
+ * contains "add to orphan" operation, we can completely
+ * invalidate the buffer now. We are rather thorough in that
+ * since the buffer may be still accessible when blocksize <
+ * pagesize and it is attached to the last partial page.
+ */
+ if (buffer_freed(bh) && !jh->b_next_transaction) {
+ struct address_space *mapping;
+
+ clear_buffer_freed(bh);
+ clear_buffer_jbddirty(bh);
+
/*
- * If the running transaction is the one containing
- * "add to orphan" operation (b_next_transaction !=
- * NULL), we have to wait for that transaction to
- * commit before we can really get rid of the buffer.
- * So just clear b_modified to not confuse transaction
- * credit accounting and refile the buffer to
- * BJ_Forget of the running transaction. If the just
- * committed transaction contains "add to orphan"
- * operation, we can completely invalidate the buffer
- * now. We are rather through in that since the
- * buffer may be still accessible when blocksize <
- * pagesize and it is attached to the last partial
- * page.
+ * Block device buffers need to stay mapped all the
+ * time, so it is enough to clear buffer_jbddirty and
+ * buffer_freed bits. For the file mapping buffers (i.e.
+ * journalled data) we need to unmap buffer and clear
+ * more bits. We also need to be careful about the check
+ * because the data page mapping can get cleared under
+ * our hands, in which case we also need not clear more bits
+ * because the page and buffers will be freed and can
+ * never be reused once we are done with them.
*/
- jh->b_modified = 0;
- if (!jh->b_next_transaction) {
- clear_buffer_freed(bh);
- clear_buffer_jbddirty(bh);
+ mapping = READ_ONCE(bh->b_page->mapping);
+ if (mapping && !sb_is_blkdev_sb(mapping->host->i_sb)) {
clear_buffer_mapped(bh);
clear_buffer_new(bh);
clear_buffer_req(bh);
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index e77a5a0b4e46..3dccc23cf010 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -936,8 +936,6 @@ do_get_write_access(handle_t *handle, struct journal_head *jh,
char *frozen_buffer = NULL;
unsigned long start_lock, time_lock;
- if (is_handle_aborted(handle))
- return -EROFS;
journal = transaction->t_journal;
jbd_debug(5, "journal_head %p, force_copy %d\n", jh, force_copy);
@@ -1152,8 +1150,8 @@ static bool jbd2_write_access_granted(handle_t *handle, struct buffer_head *bh,
/* For undo access buffer must have data copied */
if (undo && !jh->b_committed_data)
goto out;
- if (jh->b_transaction != handle->h_transaction &&
- jh->b_next_transaction != handle->h_transaction)
+ if (READ_ONCE(jh->b_transaction) != handle->h_transaction &&
+ READ_ONCE(jh->b_next_transaction) != handle->h_transaction)
goto out;
/*
* There are two reasons for the barrier here:
@@ -1189,6 +1187,9 @@ int jbd2_journal_get_write_access(handle_t *handle, struct buffer_head *bh)
struct journal_head *jh;
int rc;
+ if (is_handle_aborted(handle))
+ return -EROFS;
+
if (jbd2_write_access_granted(handle, bh, false))
return 0;
@@ -1326,6 +1327,9 @@ int jbd2_journal_get_undo_access(handle_t *handle, struct buffer_head *bh)
struct journal_head *jh;
char *committed_data = NULL;
+ if (is_handle_aborted(handle))
+ return -EROFS;
+
if (jbd2_write_access_granted(handle, bh, true))
return 0;
@@ -2329,14 +2333,16 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh,
return -EBUSY;
}
/*
- * OK, buffer won't be reachable after truncate. We just set
- * j_next_transaction to the running transaction (if there is
- * one) and mark buffer as freed so that commit code knows it
- * should clear dirty bits when it is done with the buffer.
+ * OK, buffer won't be reachable after truncate. We just clear
+ * b_modified to not confuse transaction credit accounting, and
+ * set b_next_transaction to the running transaction (if there
+ * is one) and mark buffer as freed so that commit code knows
+ * it should clear dirty bits when it is done with the buffer.
*/
set_buffer_freed(bh);
if (journal->j_running_transaction && buffer_jbddirty(bh))
jh->b_next_transaction = journal->j_running_transaction;
+ jh->b_modified = 0;
spin_unlock(&journal->j_list_lock);
spin_unlock(&jh->b_state_lock);
write_unlock(&journal->j_state_lock);
@@ -2563,8 +2569,8 @@ bool __jbd2_journal_refile_buffer(struct journal_head *jh)
* our jh reference and thus __jbd2_journal_file_buffer() must not
* take a new one.
*/
- jh->b_transaction = jh->b_next_transaction;
- jh->b_next_transaction = NULL;
+ WRITE_ONCE(jh->b_transaction, jh->b_next_transaction);
+ WRITE_ONCE(jh->b_next_transaction, NULL);
if (buffer_freed(bh))
jlist = BJ_Forget;
else if (jh->b_modified)
diff --git a/fs/libfs.c b/fs/libfs.c
index c686bd9caac6..3759fbacf522 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -891,7 +891,7 @@ int simple_attr_open(struct inode *inode, struct file *file,
{
struct simple_attr *attr;
- attr = kmalloc(sizeof(*attr), GFP_KERNEL);
+ attr = kzalloc(sizeof(*attr), GFP_KERNEL);
if (!attr)
return -ENOMEM;
@@ -931,9 +931,11 @@ ssize_t simple_attr_read(struct file *file, char __user *buf,
if (ret)
return ret;
- if (*ppos) { /* continued read */
+ if (*ppos && attr->get_buf[0]) {
+ /* continued read */
size = strlen(attr->get_buf);
- } else { /* first read */
+ } else {
+ /* first read */
u64 val;
ret = attr->get(attr->data, &val);
if (ret)
diff --git a/fs/locks.c b/fs/locks.c
index 44b6da032842..b8a31c1c4fff 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -725,7 +725,6 @@ static void __locks_delete_block(struct file_lock *waiter)
{
locks_delete_global_blocked(waiter);
list_del_init(&waiter->fl_blocked_member);
- waiter->fl_blocker = NULL;
}
static void __locks_wake_up_blocks(struct file_lock *blocker)
@@ -740,6 +739,13 @@ static void __locks_wake_up_blocks(struct file_lock *blocker)
waiter->fl_lmops->lm_notify(waiter);
else
wake_up(&waiter->fl_wait);
+
+ /*
+ * The setting of fl_blocker to NULL marks the "done"
+ * point in deleting a block. Paired with acquire at the top
+ * of locks_delete_block().
+ */
+ smp_store_release(&waiter->fl_blocker, NULL);
}
}
@@ -754,24 +760,41 @@ int locks_delete_block(struct file_lock *waiter)
int status = -ENOENT;
/*
- * If fl_blocker is NULL, it won't be set again as this thread
- * "owns" the lock and is the only one that might try to claim
- * the lock. So it is safe to test fl_blocker locklessly.
- * Also if fl_blocker is NULL, this waiter is not listed on
- * fl_blocked_requests for some lock, so no other request can
- * be added to the list of fl_blocked_requests for this
- * request. So if fl_blocker is NULL, it is safe to
- * locklessly check if fl_blocked_requests is empty. If both
- * of these checks succeed, there is no need to take the lock.
+ * If fl_blocker is NULL, it won't be set again as this thread "owns"
+ * the lock and is the only one that might try to claim the lock.
+ *
+ * We use acquire/release to manage fl_blocker so that we can
+ * optimize away taking the blocked_lock_lock in many cases.
+ *
+ * The smp_load_acquire guarantees two things:
+ *
+ * 1/ that fl_blocked_requests can be tested locklessly. If something
+ * was recently added to that list it must have been in a locked region
+ * *before* the locked region when fl_blocker was set to NULL.
+ *
+ * 2/ that no other thread is accessing 'waiter', so it is safe to free
+ * it. __locks_wake_up_blocks is careful not to touch waiter after
+ * fl_blocker is released.
+ *
+ * If a lockless check of fl_blocker shows it to be NULL, we know that
+ * no new locks can be inserted into its fl_blocked_requests list, and
+ * can avoid doing anything further if the list is empty.
*/
- if (waiter->fl_blocker == NULL &&
+ if (!smp_load_acquire(&waiter->fl_blocker) &&
list_empty(&waiter->fl_blocked_requests))
return status;
+
spin_lock(&blocked_lock_lock);
if (waiter->fl_blocker)
status = 0;
__locks_wake_up_blocks(waiter);
__locks_delete_block(waiter);
+
+ /*
+ * The setting of fl_blocker to NULL marks the "done" point in deleting
+ * a block. Paired with acquire at the top of this function.
+ */
+ smp_store_release(&waiter->fl_blocker, NULL);
spin_unlock(&blocked_lock_lock);
return status;
}
@@ -1364,7 +1387,8 @@ static int posix_lock_inode_wait(struct inode *inode, struct file_lock *fl)
error = posix_lock_inode(inode, fl, NULL);
if (error != FILE_LOCK_DEFERRED)
break;
- error = wait_event_interruptible(fl->fl_wait, !fl->fl_blocker);
+ error = wait_event_interruptible(fl->fl_wait,
+ list_empty(&fl->fl_blocked_member));
if (error)
break;
}
@@ -1449,7 +1473,8 @@ int locks_mandatory_area(struct inode *inode, struct file *filp, loff_t start,
error = posix_lock_inode(inode, &fl, NULL);
if (error != FILE_LOCK_DEFERRED)
break;
- error = wait_event_interruptible(fl.fl_wait, !fl.fl_blocker);
+ error = wait_event_interruptible(fl.fl_wait,
+ list_empty(&fl.fl_blocked_member));
if (!error) {
/*
* If we've been sleeping someone might have
@@ -1652,7 +1677,8 @@ restart:
locks_dispose_list(&dispose);
error = wait_event_interruptible_timeout(new_fl->fl_wait,
- !new_fl->fl_blocker, break_time);
+ list_empty(&new_fl->fl_blocked_member),
+ break_time);
percpu_down_read(&file_rwsem);
spin_lock(&ctx->flc_lock);
@@ -2136,7 +2162,8 @@ static int flock_lock_inode_wait(struct inode *inode, struct file_lock *fl)
error = flock_lock_inode(inode, fl);
if (error != FILE_LOCK_DEFERRED)
break;
- error = wait_event_interruptible(fl->fl_wait, !fl->fl_blocker);
+ error = wait_event_interruptible(fl->fl_wait,
+ list_empty(&fl->fl_blocked_member));
if (error)
break;
}
@@ -2413,7 +2440,8 @@ static int do_lock_file_wait(struct file *filp, unsigned int cmd,
error = vfs_lock_file(filp, cmd, fl, NULL);
if (error != FILE_LOCK_DEFERRED)
break;
- error = wait_event_interruptible(fl->fl_wait, !fl->fl_blocker);
+ error = wait_event_interruptible(fl->fl_wait,
+ list_empty(&fl->fl_blocked_member));
if (error)
break;
}
diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
index 40b6c5ac46c0..88e1763e02f3 100644
--- a/fs/nfs/Kconfig
+++ b/fs/nfs/Kconfig
@@ -164,7 +164,7 @@ config ROOT_NFS
If you want your system to mount its root file system via NFS,
choose Y here. This is common practice for managing systems
without local permanent storage. For details, read
- <file:Documentation/filesystems/nfs/nfsroot.txt>.
+ <file:Documentation/admin-guide/nfs/nfsroot.rst>.
Most people say N here.
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 989c30c98511..f1ff3076e4a4 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -153,6 +153,7 @@ struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_init)
if ((clp = kzalloc(sizeof(*clp), GFP_KERNEL)) == NULL)
goto error_0;
+ clp->cl_minorversion = cl_init->minorversion;
clp->cl_nfs_mod = cl_init->nfs_mod;
if (!try_module_get(clp->cl_nfs_mod->owner))
goto error_dealloc;
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index 4a841071d8a7..1865322de142 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -42,13 +42,27 @@ static void nfs_mark_delegation_revoked(struct nfs_delegation *delegation)
if (!test_and_set_bit(NFS_DELEGATION_REVOKED, &delegation->flags)) {
delegation->stateid.type = NFS4_INVALID_STATEID_TYPE;
atomic_long_dec(&nfs_active_delegations);
+ if (!test_bit(NFS_DELEGATION_RETURNING, &delegation->flags))
+ nfs_clear_verifier_delegated(delegation->inode);
}
}
+static struct nfs_delegation *nfs_get_delegation(struct nfs_delegation *delegation)
+{
+ refcount_inc(&delegation->refcount);
+ return delegation;
+}
+
+static void nfs_put_delegation(struct nfs_delegation *delegation)
+{
+ if (refcount_dec_and_test(&delegation->refcount))
+ __nfs_free_delegation(delegation);
+}
+
static void nfs_free_delegation(struct nfs_delegation *delegation)
{
nfs_mark_delegation_revoked(delegation);
- __nfs_free_delegation(delegation);
+ nfs_put_delegation(delegation);
}
/**
@@ -241,13 +255,18 @@ void nfs_inode_reclaim_delegation(struct inode *inode, const struct cred *cred,
static int nfs_do_return_delegation(struct inode *inode, struct nfs_delegation *delegation, int issync)
{
+ const struct cred *cred;
int res = 0;
- if (!test_bit(NFS_DELEGATION_REVOKED, &delegation->flags))
- res = nfs4_proc_delegreturn(inode,
- delegation->cred,
+ if (!test_bit(NFS_DELEGATION_REVOKED, &delegation->flags)) {
+ spin_lock(&delegation->lock);
+ cred = get_cred(delegation->cred);
+ spin_unlock(&delegation->lock);
+ res = nfs4_proc_delegreturn(inode, cred,
&delegation->stateid,
issync);
+ put_cred(cred);
+ }
return res;
}
@@ -273,9 +292,13 @@ nfs_start_delegation_return_locked(struct nfs_inode *nfsi)
if (delegation == NULL)
goto out;
spin_lock(&delegation->lock);
- if (!test_and_set_bit(NFS_DELEGATION_RETURNING, &delegation->flags))
- ret = delegation;
+ if (!test_and_set_bit(NFS_DELEGATION_RETURNING, &delegation->flags)) {
+ /* Refcount matched in nfs_end_delegation_return() */
+ ret = nfs_get_delegation(delegation);
+ }
spin_unlock(&delegation->lock);
+ if (ret)
+ nfs_clear_verifier_delegated(&nfsi->vfs_inode);
out:
return ret;
}
@@ -393,6 +416,7 @@ int nfs_inode_set_delegation(struct inode *inode, const struct cred *cred,
if (delegation == NULL)
return -ENOMEM;
nfs4_stateid_copy(&delegation->stateid, stateid);
+ refcount_set(&delegation->refcount, 1);
delegation->type = type;
delegation->pagemod_limit = pagemod_limit;
delegation->change_attr = inode_peek_iversion_raw(inode);
@@ -492,6 +516,8 @@ static int nfs_end_delegation_return(struct inode *inode, struct nfs_delegation
err = nfs_do_return_delegation(inode, delegation, issync);
out:
+ /* Refcount matched in nfs_start_delegation_return_locked() */
+ nfs_put_delegation(delegation);
return err;
}
@@ -686,9 +712,12 @@ void nfs4_inode_return_delegation_on_close(struct inode *inode)
list_empty(&NFS_I(inode)->open_files) &&
!test_and_set_bit(NFS_DELEGATION_RETURNING, &delegation->flags)) {
clear_bit(NFS_DELEGATION_RETURN_IF_CLOSED, &delegation->flags);
- ret = delegation;
+ /* Refcount matched in nfs_end_delegation_return() */
+ ret = nfs_get_delegation(delegation);
}
spin_unlock(&delegation->lock);
+ if (ret)
+ nfs_clear_verifier_delegated(inode);
}
out:
rcu_read_unlock();
@@ -1088,10 +1117,11 @@ restart:
delegation = nfs_start_delegation_return_locked(NFS_I(inode));
rcu_read_unlock();
if (delegation != NULL) {
- delegation = nfs_detach_delegation(NFS_I(inode),
- delegation, server);
- if (delegation != NULL)
+ if (nfs_detach_delegation(NFS_I(inode), delegation,
+ server) != NULL)
nfs_free_delegation(delegation);
+ /* Match nfs_start_delegation_return_locked */
+ nfs_put_delegation(delegation);
}
iput(inode);
nfs_sb_deactive(server->super);
diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h
index 31b84604d383..9b00a0b7f832 100644
--- a/fs/nfs/delegation.h
+++ b/fs/nfs/delegation.h
@@ -22,6 +22,7 @@ struct nfs_delegation {
unsigned long pagemod_limit;
__u64 change_attr;
unsigned long flags;
+ refcount_t refcount;
spinlock_t lock;
struct rcu_head rcu;
};
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 1320288ff9ec..d4b839b6cf89 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -155,6 +155,7 @@ typedef struct {
loff_t current_index;
decode_dirent_t decode;
+ unsigned long dir_verifier;
unsigned long timestamp;
unsigned long gencount;
unsigned int cache_entry_index;
@@ -353,6 +354,7 @@ int nfs_readdir_xdr_filler(struct page **pages, nfs_readdir_descriptor_t *desc,
again:
timestamp = jiffies;
gencount = nfs_inc_attr_generation_counter();
+ desc->dir_verifier = nfs_save_change_attribute(inode);
error = NFS_PROTO(inode)->readdir(file_dentry(file), cred, entry->cookie, pages,
NFS_SERVER(inode)->dtsize, desc->plus);
if (error < 0) {
@@ -455,13 +457,13 @@ void nfs_force_use_readdirplus(struct inode *dir)
}
static
-void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry)
+void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry,
+ unsigned long dir_verifier)
{
struct qstr filename = QSTR_INIT(entry->name, entry->len);
DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
struct dentry *dentry;
struct dentry *alias;
- struct inode *dir = d_inode(parent);
struct inode *inode;
int status;
@@ -500,7 +502,7 @@ again:
if (nfs_same_file(dentry, entry)) {
if (!entry->fh->size)
goto out;
- nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
+ nfs_set_verifier(dentry, dir_verifier);
status = nfs_refresh_inode(d_inode(dentry), entry->fattr);
if (!status)
nfs_setsecurity(d_inode(dentry), entry->fattr, entry->label);
@@ -526,7 +528,7 @@ again:
dput(dentry);
dentry = alias;
}
- nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
+ nfs_set_verifier(dentry, dir_verifier);
out:
dput(dentry);
}
@@ -564,7 +566,8 @@ int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *en
count++;
if (desc->plus)
- nfs_prime_dcache(file_dentry(desc->file), entry);
+ nfs_prime_dcache(file_dentry(desc->file), entry,
+ desc->dir_verifier);
status = nfs_readdir_add_to_array(entry, page);
if (status != 0)
@@ -983,14 +986,113 @@ static int nfs_fsync_dir(struct file *filp, loff_t start, loff_t end,
* full lookup on all child dentries of 'dir' whenever a change occurs
* on the server that might have invalidated our dcache.
*
+ * Note that we reserve bit '0' as a tag to let us know when a dentry
+ * was revalidated while holding a delegation on its inode.
+ *
* The caller should be holding dir->i_lock
*/
void nfs_force_lookup_revalidate(struct inode *dir)
{
- NFS_I(dir)->cache_change_attribute++;
+ NFS_I(dir)->cache_change_attribute += 2;
}
EXPORT_SYMBOL_GPL(nfs_force_lookup_revalidate);
+/**
+ * nfs_verify_change_attribute - Detects NFS remote directory changes
+ * @dir: pointer to parent directory inode
+ * @verf: previously saved change attribute
+ *
+ * Return "false" if the verifier doesn't match the change attribute.
+ * This would usually indicate that the directory contents have changed on
+ * the server, and that any dentries need revalidating.
+ */
+static bool nfs_verify_change_attribute(struct inode *dir, unsigned long verf)
+{
+ return (verf & ~1UL) == nfs_save_change_attribute(dir);
+}
+
+static void nfs_set_verifier_delegated(unsigned long *verf)
+{
+ *verf |= 1UL;
+}
+
+#if IS_ENABLED(CONFIG_NFS_V4)
+static void nfs_unset_verifier_delegated(unsigned long *verf)
+{
+ *verf &= ~1UL;
+}
+#endif /* IS_ENABLED(CONFIG_NFS_V4) */
+
+static bool nfs_test_verifier_delegated(unsigned long verf)
+{
+ return verf & 1;
+}
+
+static bool nfs_verifier_is_delegated(struct dentry *dentry)
+{
+ return nfs_test_verifier_delegated(dentry->d_time);
+}
+
+static void nfs_set_verifier_locked(struct dentry *dentry, unsigned long verf)
+{
+ struct inode *inode = d_inode(dentry);
+
+ if (!nfs_verifier_is_delegated(dentry) &&
+ !nfs_verify_change_attribute(d_inode(dentry->d_parent), verf))
+ goto out;
+ if (inode && NFS_PROTO(inode)->have_delegation(inode, FMODE_READ))
+ nfs_set_verifier_delegated(&verf);
+out:
+ dentry->d_time = verf;
+}
+
+/**
+ * nfs_set_verifier - save a parent directory verifier in the dentry
+ * @dentry: pointer to dentry
+ * @verf: verifier to save
+ *
+ * Saves the parent directory verifier in @dentry. If the inode has
+ * a delegation, we also tag the dentry as having been revalidated
+ * while holding a delegation so that we know we don't have to
+ * look it up again after a directory change.
+ */
+void nfs_set_verifier(struct dentry *dentry, unsigned long verf)
+{
+
+ spin_lock(&dentry->d_lock);
+ nfs_set_verifier_locked(dentry, verf);
+ spin_unlock(&dentry->d_lock);
+}
+EXPORT_SYMBOL_GPL(nfs_set_verifier);
+
+#if IS_ENABLED(CONFIG_NFS_V4)
+/**
+ * nfs_clear_verifier_delegated - clear the dir verifier delegation tag
+ * @inode: pointer to inode
+ *
+ * Iterates through the dentries in the inode alias list and clears
+ * the tag used to indicate that the dentry has been revalidated
+ * while holding a delegation.
+ * This function is intended for use when the delegation is being
+ * returned or revoked.
+ */
+void nfs_clear_verifier_delegated(struct inode *inode)
+{
+ struct dentry *alias;
+
+ if (!inode)
+ return;
+ spin_lock(&inode->i_lock);
+ hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) {
+ spin_lock(&alias->d_lock);
+ nfs_unset_verifier_delegated(&alias->d_time);
+ spin_unlock(&alias->d_lock);
+ }
+ spin_unlock(&inode->i_lock);
+}
+EXPORT_SYMBOL_GPL(nfs_clear_verifier_delegated);
+#endif /* IS_ENABLED(CONFIG_NFS_V4) */
+
/*
* A check for whether or not the parent directory has changed.
* In the case it has, we assume that the dentries are untrustworthy
@@ -1159,6 +1261,7 @@ nfs_lookup_revalidate_dentry(struct inode *dir, struct dentry *dentry,
struct nfs_fh *fhandle;
struct nfs_fattr *fattr;
struct nfs4_label *label;
+ unsigned long dir_verifier;
int ret;
ret = -ENOMEM;
@@ -1168,6 +1271,7 @@ nfs_lookup_revalidate_dentry(struct inode *dir, struct dentry *dentry,
if (fhandle == NULL || fattr == NULL || IS_ERR(label))
goto out;
+ dir_verifier = nfs_save_change_attribute(dir);
ret = NFS_PROTO(dir)->lookup(dir, dentry, fhandle, fattr, label);
if (ret < 0) {
switch (ret) {
@@ -1188,7 +1292,7 @@ nfs_lookup_revalidate_dentry(struct inode *dir, struct dentry *dentry,
goto out;
nfs_setsecurity(inode, fattr, label);
- nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
+ nfs_set_verifier(dentry, dir_verifier);
/* set a readdirplus hint that we had a cache miss */
nfs_force_use_readdirplus(dir);
@@ -1230,7 +1334,7 @@ nfs_do_lookup_revalidate(struct inode *dir, struct dentry *dentry,
goto out_bad;
}
- if (NFS_PROTO(dir)->have_delegation(inode, FMODE_READ))
+ if (nfs_verifier_is_delegated(dentry))
return nfs_lookup_revalidate_delegated(dir, dentry, inode);
/* Force a full look up iff the parent directory has changed */
@@ -1415,6 +1519,7 @@ struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, unsigned in
struct nfs_fh *fhandle = NULL;
struct nfs_fattr *fattr = NULL;
struct nfs4_label *label = NULL;
+ unsigned long dir_verifier;
int error;
dfprintk(VFS, "NFS: lookup(%pd2)\n", dentry);
@@ -1440,6 +1545,7 @@ struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, unsigned in
if (IS_ERR(label))
goto out;
+ dir_verifier = nfs_save_change_attribute(dir);
trace_nfs_lookup_enter(dir, dentry, flags);
error = NFS_PROTO(dir)->lookup(dir, dentry, fhandle, fattr, label);
if (error == -ENOENT)
@@ -1463,7 +1569,7 @@ no_entry:
goto out_label;
dentry = res;
}
- nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
+ nfs_set_verifier(dentry, dir_verifier);
out_label:
trace_nfs_lookup_exit(dir, dentry, flags, error);
nfs4_label_free(label);
@@ -1668,7 +1774,7 @@ nfs4_do_lookup_revalidate(struct inode *dir, struct dentry *dentry,
if (inode == NULL)
goto full_reval;
- if (NFS_PROTO(dir)->have_delegation(inode, FMODE_READ))
+ if (nfs_verifier_is_delegated(dentry))
return nfs_lookup_revalidate_delegated(dir, dentry, inode);
/* NFS only supports OPEN on regular files */
@@ -2383,7 +2489,7 @@ static int nfs_access_get_cached_rcu(struct inode *inode, const struct cred *cre
rcu_read_lock();
if (nfsi->cache_validity & NFS_INO_INVALID_ACCESS)
goto out;
- lh = rcu_dereference(nfsi->access_cache_entry_lru.prev);
+ lh = rcu_dereference(list_tail_rcu(&nfsi->access_cache_entry_lru));
cache = list_entry(lh, struct nfs_access_entry, lru);
if (lh == &nfsi->access_cache_entry_lru ||
cred_fscmp(cred, cache->cred) != 0)
diff --git a/fs/nfs/fs_context.c b/fs/nfs/fs_context.c
index e1b938457ab9..e113fcb4bb4c 100644
--- a/fs/nfs/fs_context.c
+++ b/fs/nfs/fs_context.c
@@ -832,6 +832,8 @@ static int nfs_parse_source(struct fs_context *fc,
if (len > maxnamlen)
goto out_hostname;
+ kfree(ctx->nfs_server.hostname);
+
/* N.B. caller will free nfs_server.hostname in all cases */
ctx->nfs_server.hostname = kmemdup_nul(dev_name, len, GFP_KERNEL);
if (!ctx->nfs_server.hostname)
@@ -1240,6 +1242,13 @@ static int nfs_fs_context_validate(struct fs_context *fc)
}
ctx->nfs_mod = nfs_mod;
}
+
+ /* Ensure the filesystem context has the correct fs_type */
+ if (fc->fs_type != ctx->nfs_mod->nfs_fs) {
+ module_put(fc->fs_type->owner);
+ __module_get(ctx->nfs_mod->nfs_fs->owner);
+ fc->fs_type = ctx->nfs_mod->nfs_fs;
+ }
return 0;
out_no_device_name:
diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c
index 52270bfac120..1abf126c2df4 100644
--- a/fs/nfs/fscache.c
+++ b/fs/nfs/fscache.c
@@ -31,6 +31,7 @@ static DEFINE_SPINLOCK(nfs_fscache_keys_lock);
struct nfs_server_key {
struct {
uint16_t nfsversion; /* NFS protocol version */
+ uint32_t minorversion; /* NFSv4 minor version */
uint16_t family; /* address family */
__be16 port; /* IP port */
} hdr;
@@ -55,6 +56,7 @@ void nfs_fscache_get_client_cookie(struct nfs_client *clp)
memset(&key, 0, sizeof(key));
key.hdr.nfsversion = clp->rpc_ops->version;
+ key.hdr.minorversion = clp->cl_minorversion;
key.hdr.family = clp->cl_addr.ss_family;
switch (clp->cl_addr.ss_family) {
diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c
index b012c2668a1f..aaeeb4659bff 100644
--- a/fs/nfs/getroot.c
+++ b/fs/nfs/getroot.c
@@ -73,6 +73,7 @@ int nfs_get_root(struct super_block *s, struct fs_context *fc)
struct inode *inode;
char *name;
int error = -ENOMEM;
+ unsigned long kflags = 0, kflags_out = 0;
name = kstrdup(fc->source, GFP_KERNEL);
if (!name)
@@ -83,11 +84,14 @@ int nfs_get_root(struct super_block *s, struct fs_context *fc)
if (fsinfo.fattr == NULL)
goto out_name;
+ fsinfo.fattr->label = nfs4_label_alloc(server, GFP_KERNEL);
+ if (IS_ERR(fsinfo.fattr->label))
+ goto out_fattr;
error = server->nfs_client->rpc_ops->getroot(server, ctx->mntfh, &fsinfo);
if (error < 0) {
dprintk("nfs_get_root: getattr error = %d\n", -error);
nfs_errorf(fc, "NFS: Couldn't getattr on root");
- goto out_fattr;
+ goto out_label;
}
inode = nfs_fhget(s, ctx->mntfh, fsinfo.fattr, NULL);
@@ -95,12 +99,12 @@ int nfs_get_root(struct super_block *s, struct fs_context *fc)
dprintk("nfs_get_root: get root inode failed\n");
error = PTR_ERR(inode);
nfs_errorf(fc, "NFS: Couldn't get root inode");
- goto out_fattr;
+ goto out_label;
}
error = nfs_superblock_set_dummy_root(s, inode);
if (error != 0)
- goto out_fattr;
+ goto out_label;
/* root dentries normally start off anonymous and get spliced in later
* if the dentry tree reaches them; however if the dentry already
@@ -111,7 +115,7 @@ int nfs_get_root(struct super_block *s, struct fs_context *fc)
dprintk("nfs_get_root: get root dentry failed\n");
error = PTR_ERR(root);
nfs_errorf(fc, "NFS: Couldn't get root dentry");
- goto out_fattr;
+ goto out_label;
}
security_d_instantiate(root, inode);
@@ -123,12 +127,39 @@ int nfs_get_root(struct super_block *s, struct fs_context *fc)
}
spin_unlock(&root->d_lock);
fc->root = root;
+ if (NFS_SB(s)->caps & NFS_CAP_SECURITY_LABEL)
+ kflags |= SECURITY_LSM_NATIVE_LABELS;
+ if (ctx->clone_data.sb) {
+ if (d_inode(fc->root)->i_fop != &nfs_dir_operations) {
+ error = -ESTALE;
+ goto error_splat_root;
+ }
+ /* clone lsm security options from the parent to the new sb */
+ error = security_sb_clone_mnt_opts(ctx->clone_data.sb,
+ s, kflags, &kflags_out);
+ } else {
+ error = security_sb_set_mnt_opts(s, fc->security,
+ kflags, &kflags_out);
+ }
+ if (error)
+ goto error_splat_root;
+ if (NFS_SB(s)->caps & NFS_CAP_SECURITY_LABEL &&
+ !(kflags_out & SECURITY_LSM_NATIVE_LABELS))
+ NFS_SB(s)->caps &= ~NFS_CAP_SECURITY_LABEL;
+
+ nfs_setsecurity(inode, fsinfo.fattr, fsinfo.fattr->label);
error = 0;
+out_label:
+ nfs4_label_free(fsinfo.fattr->label);
out_fattr:
nfs_free_fattr(fsinfo.fattr);
out_name:
kfree(name);
out:
return error;
+error_splat_root:
+ dput(fc->root);
+ fc->root = NULL;
+ goto out_label;
}
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 1309e6f47f3d..11bf15800ac9 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -2114,6 +2114,7 @@ static void init_once(void *foo)
init_rwsem(&nfsi->rmdir_sem);
mutex_init(&nfsi->commit_mutex);
nfs4_init_once(nfsi);
+ nfsi->cache_change_attribute = 0;
}
static int __init nfs_init_inodecache(void)
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index ad6077404947..f3ece8ed3203 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -153,7 +153,7 @@ struct vfsmount *nfs_d_automount(struct path *path)
/* Open a new filesystem context, transferring parameters from the
* parent superblock, including the network namespace.
*/
- fc = fs_context_for_submount(&nfs_fs_type, path->dentry);
+ fc = fs_context_for_submount(path->mnt->mnt_sb->s_type, path->dentry);
if (IS_ERR(fc))
return ERR_CAST(fc);
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index 0cd767e5c977..0bd77cc1f639 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -216,7 +216,6 @@ struct nfs_client *nfs4_alloc_client(const struct nfs_client_initdata *cl_init)
INIT_LIST_HEAD(&clp->cl_ds_clients);
rpc_init_wait_queue(&clp->cl_rpcwaitq, "NFS client");
clp->cl_state = 1 << NFS4CLNT_LEASE_EXPIRED;
- clp->cl_minorversion = cl_init->minorversion;
clp->cl_mvops = nfs_v4_minor_ops[cl_init->minorversion];
clp->cl_mig_gen = 1;
#if IS_ENABLED(CONFIG_NFS_V4_1)
diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c
index be4eb720d5b6..1297919e0fce 100644
--- a/fs/nfs/nfs4file.c
+++ b/fs/nfs/nfs4file.c
@@ -87,7 +87,6 @@ nfs4_file_open(struct inode *inode, struct file *filp)
if (inode != d_inode(dentry))
goto out_drop;
- nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
nfs_file_set_open_context(filp, ctx);
nfs_fscache_open_file(inode, filp);
err = 0;
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 95d07a3dc5d1..cb34e840e4fb 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -2974,10 +2974,13 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata,
struct dentry *dentry;
struct nfs4_state *state;
fmode_t acc_mode = _nfs4_ctx_to_accessmode(ctx);
+ struct inode *dir = d_inode(opendata->dir);
+ unsigned long dir_verifier;
unsigned int seq;
int ret;
seq = raw_seqcount_begin(&sp->so_reclaim_seqcount);
+ dir_verifier = nfs_save_change_attribute(dir);
ret = _nfs4_proc_open(opendata, ctx);
if (ret != 0)
@@ -3005,8 +3008,19 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata,
dput(ctx->dentry);
ctx->dentry = dentry = alias;
}
- nfs_set_verifier(dentry,
- nfs_save_change_attribute(d_inode(opendata->dir)));
+ }
+
+ switch(opendata->o_arg.claim) {
+ default:
+ break;
+ case NFS4_OPEN_CLAIM_NULL:
+ case NFS4_OPEN_CLAIM_DELEGATE_CUR:
+ case NFS4_OPEN_CLAIM_DELEGATE_PREV:
+ if (!opendata->rpc_done)
+ break;
+ if (opendata->o_res.delegation_type != 0)
+ dir_verifier = nfs_save_change_attribute(dir);
+ nfs_set_verifier(dentry, dir_verifier);
}
/* Parse layoutget results before we check for access */
@@ -3988,7 +4002,7 @@ static int nfs4_proc_get_root(struct nfs_server *server, struct nfs_fh *mntfh,
{
int error;
struct nfs_fattr *fattr = info->fattr;
- struct nfs4_label *label = NULL;
+ struct nfs4_label *label = fattr->label;
error = nfs4_server_capabilities(server, mntfh);
if (error < 0) {
@@ -3996,23 +4010,17 @@ static int nfs4_proc_get_root(struct nfs_server *server, struct nfs_fh *mntfh,
return error;
}
- label = nfs4_label_alloc(server, GFP_KERNEL);
- if (IS_ERR(label))
- return PTR_ERR(label);
-
error = nfs4_proc_getattr(server, mntfh, fattr, label, NULL);
if (error < 0) {
dprintk("nfs4_get_root: getattr error = %d\n", -error);
- goto err_free_label;
+ goto out;
}
if (fattr->valid & NFS_ATTR_FATTR_FSID &&
!nfs_fsid_equal(&server->fsid, &fattr->fsid))
memcpy(&server->fsid, &fattr->fsid, sizeof(server->fsid));
-err_free_label:
- nfs4_label_free(label);
-
+out:
return error;
}
@@ -5322,7 +5330,7 @@ static void nfs4_proc_write_setup(struct nfs_pgio_header *hdr,
hdr->timestamp = jiffies;
msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_WRITE];
- nfs4_init_sequence(&hdr->args.seq_args, &hdr->res.seq_res, 1, 0);
+ nfs4_init_sequence(&hdr->args.seq_args, &hdr->res.seq_res, 0, 0);
nfs4_state_protect_write(server->nfs_client, clnt, msg, hdr);
}
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index dada09b391c6..bb14bede6da5 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -1179,7 +1179,6 @@ int nfs_get_tree_common(struct fs_context *fc)
struct super_block *s;
int (*compare_super)(struct super_block *, struct fs_context *) = nfs_compare_super;
struct nfs_server *server = ctx->server;
- unsigned long kflags = 0, kflags_out = 0;
int error;
ctx->server = NULL;
@@ -1239,26 +1238,6 @@ int nfs_get_tree_common(struct fs_context *fc)
goto error_splat_super;
}
- if (NFS_SB(s)->caps & NFS_CAP_SECURITY_LABEL)
- kflags |= SECURITY_LSM_NATIVE_LABELS;
- if (ctx->clone_data.sb) {
- if (d_inode(fc->root)->i_fop != &nfs_dir_operations) {
- error = -ESTALE;
- goto error_splat_root;
- }
- /* clone any lsm security options from the parent to the new sb */
- error = security_sb_clone_mnt_opts(ctx->clone_data.sb, s, kflags,
- &kflags_out);
- } else {
- error = security_sb_set_mnt_opts(s, fc->security,
- kflags, &kflags_out);
- }
- if (error)
- goto error_splat_root;
- if (NFS_SB(s)->caps & NFS_CAP_SECURITY_LABEL &&
- !(kflags_out & SECURITY_LSM_NATIVE_LABELS))
- NFS_SB(s)->caps &= ~NFS_CAP_SECURITY_LABEL;
-
s->s_flags |= SB_ACTIVE;
error = 0;
@@ -1268,10 +1247,6 @@ out:
out_err_nosb:
nfs_free_server(server);
goto out;
-
-error_splat_root:
- dput(fc->root);
- fc->root = NULL;
error_splat_super:
deactivate_locked_super(s);
goto out;
diff --git a/fs/nsfs.c b/fs/nsfs.c
index b13bfd406820..4f1205725cfe 100644
--- a/fs/nsfs.c
+++ b/fs/nsfs.c
@@ -247,6 +247,20 @@ out_invalid:
return ERR_PTR(-EINVAL);
}
+/**
+ * ns_match() - Returns true if current namespace matches dev/ino provided.
+ * @ns: current ns
+ * @dev: dev_t from nsfs that will be matched against current nsfs
+ * @ino: ino_t from nsfs that will be matched against current nsfs
+ *
+ * Return: true if dev and ino match the current nsfs.
+ */
+bool ns_match(const struct ns_common *ns, dev_t dev, ino_t ino)
+{
+ return (ns->inum == ino) && (nsfs_mnt->mnt_sb->s_dev == dev);
+}
+
+
static int nsfs_show_path(struct seq_file *seq, struct dentry *dentry)
{
struct inode *inode = d_inode(dentry);
diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c
index 7202a1e39d70..554b744f41bf 100644
--- a/fs/ntfs/aops.c
+++ b/fs/ntfs/aops.c
@@ -92,8 +92,7 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
"0x%llx.", (unsigned long long)bh->b_blocknr);
}
first = page_buffers(page);
- local_irq_save(flags);
- bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
+ spin_lock_irqsave(&first->b_uptodate_lock, flags);
clear_buffer_async_read(bh);
unlock_buffer(bh);
tmp = bh;
@@ -108,8 +107,7 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
}
tmp = tmp->b_this_page;
} while (tmp != bh);
- bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
- local_irq_restore(flags);
+ spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
/*
* If none of the buffers had errors then we can set the page uptodate,
* but we first have to perform the post read mst fixups, if the
@@ -142,8 +140,7 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
unlock_page(page);
return;
still_busy:
- bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
- local_irq_restore(flags);
+ spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
return;
}
diff --git a/fs/open.c b/fs/open.c
index e5227cd533f4..719b320ede52 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -860,9 +860,6 @@ cleanup_file:
* the return value of d_splice_alias(), then the caller needs to perform dput()
* on it after finish_open().
*
- * On successful return @file is a fully instantiated open file. After this, if
- * an error occurs in ->atomic_open(), it needs to clean up with fput().
- *
* Returns zero on success or -errno if the open failed.
*/
int finish_open(struct file *file, struct dentry *dentry,
diff --git a/fs/overlayfs/Kconfig b/fs/overlayfs/Kconfig
index 444e2da4f60e..714c14c47ca5 100644
--- a/fs/overlayfs/Kconfig
+++ b/fs/overlayfs/Kconfig
@@ -93,6 +93,7 @@ config OVERLAY_FS_XINO_AUTO
bool "Overlayfs: auto enable inode number mapping"
default n
depends on OVERLAY_FS
+ depends on 64BIT
help
If this config option is enabled then overlay filesystems will use
unused high bits in undelying filesystem inode numbers to map all
diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c
index a5317216de73..87c362f65448 100644
--- a/fs/overlayfs/file.c
+++ b/fs/overlayfs/file.c
@@ -244,6 +244,9 @@ static void ovl_aio_cleanup_handler(struct ovl_aio_req *aio_req)
if (iocb->ki_flags & IOCB_WRITE) {
struct inode *inode = file_inode(orig_iocb->ki_filp);
+ /* Actually acquired in ovl_write_iter() */
+ __sb_writers_acquired(file_inode(iocb->ki_filp)->i_sb,
+ SB_FREEZE_WRITE);
file_end_write(iocb->ki_filp);
ovl_copyattr(ovl_inode_real(inode), inode);
}
@@ -346,6 +349,9 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter)
goto out;
file_start_write(real.file);
+ /* Pacify lockdep, same trick as done in aio_write() */
+ __sb_writers_release(file_inode(real.file)->i_sb,
+ SB_FREEZE_WRITE);
aio_req->fd = real;
real.flags = 0;
aio_req->orig_iocb = iocb;
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index 3623d28aa4fa..3d3f2b8bdae5 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -318,7 +318,12 @@ static inline unsigned int ovl_xino_bits(struct super_block *sb)
return ovl_same_dev(sb) ? OVL_FS(sb)->xino_mode : 0;
}
-static inline int ovl_inode_lock(struct inode *inode)
+static inline void ovl_inode_lock(struct inode *inode)
+{
+ mutex_lock(&OVL_I(inode)->lock);
+}
+
+static inline int ovl_inode_lock_interruptible(struct inode *inode)
{
return mutex_lock_interruptible(&OVL_I(inode)->lock);
}
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index 319fe0d355b0..ac967f1cb6e5 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -1411,6 +1411,8 @@ static int ovl_get_layers(struct super_block *sb, struct ovl_fs *ofs,
if (ofs->config.xino == OVL_XINO_ON)
pr_info("\"xino=on\" is useless with all layers on same fs, ignore.\n");
ofs->xino_mode = 0;
+ } else if (ofs->config.xino == OVL_XINO_OFF) {
+ ofs->xino_mode = -1;
} else if (ofs->config.xino == OVL_XINO_ON && ofs->xino_mode < 0) {
/*
* This is a roundup of number of bits needed for encoding
@@ -1623,8 +1625,13 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
sb->s_stack_depth = 0;
sb->s_maxbytes = MAX_LFS_FILESIZE;
/* Assume underlaying fs uses 32bit inodes unless proven otherwise */
- if (ofs->config.xino != OVL_XINO_OFF)
+ if (ofs->config.xino != OVL_XINO_OFF) {
ofs->xino_mode = BITS_PER_LONG - 32;
+ if (!ofs->xino_mode) {
+ pr_warn("xino not supported on 32bit kernel, falling back to xino=off.\n");
+ ofs->config.xino = OVL_XINO_OFF;
+ }
+ }
/* alloc/destroy_inode needed for setting up traps in inode cache */
sb->s_op = &ovl_super_operations;
diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c
index ea005085803f..042f7eb4f7f4 100644
--- a/fs/overlayfs/util.c
+++ b/fs/overlayfs/util.c
@@ -509,7 +509,7 @@ int ovl_copy_up_start(struct dentry *dentry, int flags)
struct inode *inode = d_inode(dentry);
int err;
- err = ovl_inode_lock(inode);
+ err = ovl_inode_lock_interruptible(inode);
if (!err && ovl_already_copied_up_locked(dentry, flags)) {
err = 1; /* Already copied up */
ovl_inode_unlock(inode);
@@ -764,7 +764,7 @@ int ovl_nlink_start(struct dentry *dentry)
return err;
}
- err = ovl_inode_lock(inode);
+ err = ovl_inode_lock_interruptible(inode);
if (err)
return err;
diff --git a/fs/pipe.c b/fs/pipe.c
index 5a34d6c22d4c..2144507447c5 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -722,9 +722,10 @@ pipe_release(struct inode *inode, struct file *file)
if (file->f_mode & FMODE_WRITE)
pipe->writers--;
- if (pipe->readers || pipe->writers) {
- wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM | EPOLLERR | EPOLLHUP);
- wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM | EPOLLERR | EPOLLHUP);
+ /* Was that the last reader or writer, but not the other side? */
+ if (!pipe->readers != !pipe->writers) {
+ wake_up_interruptible_all(&pipe->rd_wait);
+ wake_up_interruptible_all(&pipe->wr_wait);
kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
}
@@ -1026,8 +1027,8 @@ static int wait_for_partner(struct pipe_inode_info *pipe, unsigned int *cnt)
static void wake_up_partner(struct pipe_inode_info *pipe)
{
- wake_up_interruptible(&pipe->rd_wait);
- wake_up_interruptible(&pipe->wr_wait);
+ wake_up_interruptible_all(&pipe->rd_wait);
+ wake_up_interruptible_all(&pipe->wr_wait);
}
static int fifo_open(struct inode *inode, struct file *filp)
@@ -1144,7 +1145,7 @@ err_rd:
err_wr:
if (!--pipe->writers)
- wake_up_interruptible(&pipe->rd_wait);
+ wake_up_interruptible_all(&pipe->rd_wait);
ret = -ERESTARTSYS;
goto err;
@@ -1271,8 +1272,9 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg)
pipe->max_usage = nr_slots;
pipe->tail = tail;
pipe->head = head;
- wake_up_interruptible_all(&pipe->rd_wait);
- wake_up_interruptible_all(&pipe->wr_wait);
+
+ /* This might have made more room for writers */
+ wake_up_interruptible(&pipe->wr_wait);
return pipe->max_usage * PAGE_SIZE;
out_revert_acct:
diff --git a/fs/proc/base.c b/fs/proc/base.c
index c7c64272b0fa..74f948a6b621 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -405,11 +405,11 @@ print0:
static int lock_trace(struct task_struct *task)
{
- int err = mutex_lock_killable(&task->signal->cred_guard_mutex);
+ int err = mutex_lock_killable(&task->signal->exec_update_mutex);
if (err)
return err;
if (!ptrace_may_access(task, PTRACE_MODE_ATTACH_FSCREDS)) {
- mutex_unlock(&task->signal->cred_guard_mutex);
+ mutex_unlock(&task->signal->exec_update_mutex);
return -EPERM;
}
return 0;
@@ -417,7 +417,7 @@ static int lock_trace(struct task_struct *task)
static void unlock_trace(struct task_struct *task)
{
- mutex_unlock(&task->signal->cred_guard_mutex);
+ mutex_unlock(&task->signal->exec_update_mutex);
}
#ifdef CONFIG_STACKTRACE
@@ -1834,11 +1834,25 @@ void task_dump_owner(struct task_struct *task, umode_t mode,
*rgid = gid;
}
+void proc_pid_evict_inode(struct proc_inode *ei)
+{
+ struct pid *pid = ei->pid;
+
+ if (S_ISDIR(ei->vfs_inode.i_mode)) {
+ spin_lock(&pid->wait_pidfd.lock);
+ hlist_del_init_rcu(&ei->sibling_inodes);
+ spin_unlock(&pid->wait_pidfd.lock);
+ }
+
+ put_pid(pid);
+}
+
struct inode *proc_pid_make_inode(struct super_block * sb,
struct task_struct *task, umode_t mode)
{
struct inode * inode;
struct proc_inode *ei;
+ struct pid *pid;
/* We need a new inode */
@@ -1856,10 +1870,18 @@ struct inode *proc_pid_make_inode(struct super_block * sb,
/*
* grab the reference to task.
*/
- ei->pid = get_task_pid(task, PIDTYPE_PID);
- if (!ei->pid)
+ pid = get_task_pid(task, PIDTYPE_PID);
+ if (!pid)
goto out_unlock;
+ /* Let the pid remember us for quick removal */
+ ei->pid = pid;
+ if (S_ISDIR(mode)) {
+ spin_lock(&pid->wait_pidfd.lock);
+ hlist_add_head_rcu(&ei->sibling_inodes, &pid->inodes);
+ spin_unlock(&pid->wait_pidfd.lock);
+ }
+
task_dump_owner(task, 0, &inode->i_uid, &inode->i_gid);
security_task_to_inode(task, inode);
@@ -2861,7 +2883,7 @@ static int do_io_accounting(struct task_struct *task, struct seq_file *m, int wh
unsigned long flags;
int result;
- result = mutex_lock_killable(&task->signal->cred_guard_mutex);
+ result = mutex_lock_killable(&task->signal->exec_update_mutex);
if (result)
return result;
@@ -2897,7 +2919,7 @@ static int do_io_accounting(struct task_struct *task, struct seq_file *m, int wh
result = 0;
out_unlock:
- mutex_unlock(&task->signal->cred_guard_mutex);
+ mutex_unlock(&task->signal->exec_update_mutex);
return result;
}
@@ -3230,90 +3252,29 @@ static const struct inode_operations proc_tgid_base_inode_operations = {
.permission = proc_pid_permission,
};
-static void proc_flush_task_mnt(struct vfsmount *mnt, pid_t pid, pid_t tgid)
-{
- struct dentry *dentry, *leader, *dir;
- char buf[10 + 1];
- struct qstr name;
-
- name.name = buf;
- name.len = snprintf(buf, sizeof(buf), "%u", pid);
- /* no ->d_hash() rejects on procfs */
- dentry = d_hash_and_lookup(mnt->mnt_root, &name);
- if (dentry) {
- d_invalidate(dentry);
- dput(dentry);
- }
-
- if (pid == tgid)
- return;
-
- name.name = buf;
- name.len = snprintf(buf, sizeof(buf), "%u", tgid);
- leader = d_hash_and_lookup(mnt->mnt_root, &name);
- if (!leader)
- goto out;
-
- name.name = "task";
- name.len = strlen(name.name);
- dir = d_hash_and_lookup(leader, &name);
- if (!dir)
- goto out_put_leader;
-
- name.name = buf;
- name.len = snprintf(buf, sizeof(buf), "%u", pid);
- dentry = d_hash_and_lookup(dir, &name);
- if (dentry) {
- d_invalidate(dentry);
- dput(dentry);
- }
-
- dput(dir);
-out_put_leader:
- dput(leader);
-out:
- return;
-}
-
/**
- * proc_flush_task - Remove dcache entries for @task from the /proc dcache.
- * @task: task that should be flushed.
+ * proc_flush_pid - Remove dcache entries for @pid from the /proc dcache.
+ * @pid: pid that should be flushed.
*
- * When flushing dentries from proc, one needs to flush them from global
- * proc (proc_mnt) and from all the namespaces' procs this task was seen
- * in. This call is supposed to do all of this job.
- *
- * Looks in the dcache for
- * /proc/@pid
- * /proc/@tgid/task/@pid
- * if either directory is present flushes it and all of it'ts children
- * from the dcache.
+ * This function walks a list of inodes (that belong to any proc
+ * filesystem) that are attached to the pid and flushes them from
+ * the dentry cache.
*
* It is safe and reasonable to cache /proc entries for a task until
* that task exits. After that they just clog up the dcache with
* useless entries, possibly causing useful dcache entries to be
- * flushed instead. This routine is proved to flush those useless
- * dcache entries at process exit time.
+ * flushed instead. This routine is provided to flush those useless
+ * dcache entries when a process is reaped.
*
* NOTE: This routine is just an optimization so it does not guarantee
- * that no dcache entries will exist at process exit time it
- * just makes it very unlikely that any will persist.
+ * that no dcache entries will exist after a process is reaped;
+ * it just makes it very unlikely that any will persist.
*/
-void proc_flush_task(struct task_struct *task)
+void proc_flush_pid(struct pid *pid)
{
- int i;
- struct pid *pid, *tgid;
- struct upid *upid;
-
- pid = task_pid(task);
- tgid = task_tgid(task);
-
- for (i = 0; i <= pid->level; i++) {
- upid = &pid->numbers[i];
- proc_flush_task_mnt(upid->ns->proc_mnt, upid->nr,
- tgid->numbers[i].nr);
- }
+ proc_invalidate_siblings_dcache(&pid->inodes, &pid->wait_pidfd.lock);
+ put_pid(pid);
}
static struct dentry *proc_pid_instantiate(struct dentry * dentry,
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 6da18316d209..1e730ea1dcd6 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -33,21 +33,27 @@ static void proc_evict_inode(struct inode *inode)
{
struct proc_dir_entry *de;
struct ctl_table_header *head;
+ struct proc_inode *ei = PROC_I(inode);
truncate_inode_pages_final(&inode->i_data);
clear_inode(inode);
/* Stop tracking associated processes */
- put_pid(PROC_I(inode)->pid);
+ if (ei->pid) {
+ proc_pid_evict_inode(ei);
+ ei->pid = NULL;
+ }
/* Let go of any associated proc directory entry */
- de = PDE(inode);
- if (de)
+ de = ei->pde;
+ if (de) {
pde_put(de);
+ ei->pde = NULL;
+ }
- head = PROC_I(inode)->sysctl;
+ head = ei->sysctl;
if (head) {
- RCU_INIT_POINTER(PROC_I(inode)->sysctl, NULL);
+ RCU_INIT_POINTER(ei->sysctl, NULL);
proc_sys_evict_inode(inode, head);
}
}
@@ -68,6 +74,7 @@ static struct inode *proc_alloc_inode(struct super_block *sb)
ei->pde = NULL;
ei->sysctl = NULL;
ei->sysctl_entry = NULL;
+ INIT_HLIST_NODE(&ei->sibling_inodes);
ei->ns_ops = NULL;
return &ei->vfs_inode;
}
@@ -102,6 +109,62 @@ void __init proc_init_kmemcache(void)
BUILD_BUG_ON(sizeof(struct proc_dir_entry) >= SIZEOF_PDE);
}
+void proc_invalidate_siblings_dcache(struct hlist_head *inodes, spinlock_t *lock)
+{
+ struct inode *inode;
+ struct proc_inode *ei;
+ struct hlist_node *node;
+ struct super_block *old_sb = NULL;
+
+ rcu_read_lock();
+ for (;;) {
+ struct super_block *sb;
+ node = hlist_first_rcu(inodes);
+ if (!node)
+ break;
+ ei = hlist_entry(node, struct proc_inode, sibling_inodes);
+ spin_lock(lock);
+ hlist_del_init_rcu(&ei->sibling_inodes);
+ spin_unlock(lock);
+
+ inode = &ei->vfs_inode;
+ sb = inode->i_sb;
+ if ((sb != old_sb) && !atomic_inc_not_zero(&sb->s_active))
+ continue;
+ inode = igrab(inode);
+ rcu_read_unlock();
+ if (sb != old_sb) {
+ if (old_sb)
+ deactivate_super(old_sb);
+ old_sb = sb;
+ }
+ if (unlikely(!inode)) {
+ rcu_read_lock();
+ continue;
+ }
+
+ if (S_ISDIR(inode->i_mode)) {
+ struct dentry *dir = d_find_any_alias(inode);
+ if (dir) {
+ d_invalidate(dir);
+ dput(dir);
+ }
+ } else {
+ struct dentry *dentry;
+ while ((dentry = d_find_alias(inode))) {
+ d_invalidate(dentry);
+ dput(dentry);
+ }
+ }
+ iput(inode);
+
+ rcu_read_lock();
+ }
+ rcu_read_unlock();
+ if (old_sb)
+ deactivate_super(old_sb);
+}
+
static int proc_show_options(struct seq_file *seq, struct dentry *root)
{
struct super_block *sb = root->d_sb;
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 41587276798e..9e294f0290e5 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -91,7 +91,7 @@ struct proc_inode {
struct proc_dir_entry *pde;
struct ctl_table_header *sysctl;
struct ctl_table *sysctl_entry;
- struct hlist_node sysctl_inodes;
+ struct hlist_node sibling_inodes;
const struct proc_ns_operations *ns_ops;
struct inode vfs_inode;
} __randomize_layout;
@@ -158,6 +158,7 @@ extern int proc_pid_statm(struct seq_file *, struct pid_namespace *,
extern const struct dentry_operations pid_dentry_operations;
extern int pid_getattr(const struct path *, struct kstat *, u32, unsigned int);
extern int proc_setattr(struct dentry *, struct iattr *);
+extern void proc_pid_evict_inode(struct proc_inode *);
extern struct inode *proc_pid_make_inode(struct super_block *, struct task_struct *, umode_t);
extern void pid_update_inode(struct task_struct *, struct inode *);
extern int pid_delete_dentry(const struct dentry *);
@@ -210,6 +211,7 @@ extern const struct inode_operations proc_pid_link_inode_operations;
extern const struct super_operations proc_sops;
void proc_init_kmemcache(void);
+void proc_invalidate_siblings_dcache(struct hlist_head *inodes, spinlock_t *lock);
void set_proc_pid_nlink(void);
extern struct inode *proc_get_inode(struct super_block *, struct proc_dir_entry *);
extern void proc_entry_rundown(struct proc_dir_entry *);
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index c75bb4632ed1..b6f5d459b087 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -267,42 +267,9 @@ static void unuse_table(struct ctl_table_header *p)
complete(p->unregistering);
}
-static void proc_sys_prune_dcache(struct ctl_table_header *head)
+static void proc_sys_invalidate_dcache(struct ctl_table_header *head)
{
- struct inode *inode;
- struct proc_inode *ei;
- struct hlist_node *node;
- struct super_block *sb;
-
- rcu_read_lock();
- for (;;) {
- node = hlist_first_rcu(&head->inodes);
- if (!node)
- break;
- ei = hlist_entry(node, struct proc_inode, sysctl_inodes);
- spin_lock(&sysctl_lock);
- hlist_del_init_rcu(&ei->sysctl_inodes);
- spin_unlock(&sysctl_lock);
-
- inode = &ei->vfs_inode;
- sb = inode->i_sb;
- if (!atomic_inc_not_zero(&sb->s_active))
- continue;
- inode = igrab(inode);
- rcu_read_unlock();
- if (unlikely(!inode)) {
- deactivate_super(sb);
- rcu_read_lock();
- continue;
- }
-
- d_prune_aliases(inode);
- iput(inode);
- deactivate_super(sb);
-
- rcu_read_lock();
- }
- rcu_read_unlock();
+ proc_invalidate_siblings_dcache(&head->inodes, &sysctl_lock);
}
/* called under sysctl_lock, will reacquire if has to wait */
@@ -324,10 +291,10 @@ static void start_unregistering(struct ctl_table_header *p)
spin_unlock(&sysctl_lock);
}
/*
- * Prune dentries for unregistered sysctls: namespaced sysctls
+ * Invalidate dentries for unregistered sysctls: namespaced sysctls
* can have duplicate names and contaminate dcache very badly.
*/
- proc_sys_prune_dcache(p);
+ proc_sys_invalidate_dcache(p);
/*
* do not remove from the list until nobody holds it; walking the
* list in do_sysctl() relies on that.
@@ -483,7 +450,7 @@ static struct inode *proc_sys_make_inode(struct super_block *sb,
}
ei->sysctl = head;
ei->sysctl_entry = table;
- hlist_add_head_rcu(&ei->sysctl_inodes, &head->inodes);
+ hlist_add_head_rcu(&ei->sibling_inodes, &head->inodes);
head->count++;
spin_unlock(&sysctl_lock);
@@ -514,7 +481,7 @@ static struct inode *proc_sys_make_inode(struct super_block *sb,
void proc_sys_evict_inode(struct inode *inode, struct ctl_table_header *head)
{
spin_lock(&sysctl_lock);
- hlist_del_init_rcu(&PROC_I(inode)->sysctl_inodes);
+ hlist_del_init_rcu(&PROC_I(inode)->sibling_inodes);
if (!--head->count)
kfree_rcu(head, rcu);
spin_unlock(&sysctl_lock);
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 608233dfd29c..2633f10446c3 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -292,39 +292,3 @@ struct proc_dir_entry proc_root = {
.subdir = RB_ROOT,
.name = "/proc",
};
-
-int pid_ns_prepare_proc(struct pid_namespace *ns)
-{
- struct proc_fs_context *ctx;
- struct fs_context *fc;
- struct vfsmount *mnt;
-
- fc = fs_context_for_mount(&proc_fs_type, SB_KERNMOUNT);
- if (IS_ERR(fc))
- return PTR_ERR(fc);
-
- if (fc->user_ns != ns->user_ns) {
- put_user_ns(fc->user_ns);
- fc->user_ns = get_user_ns(ns->user_ns);
- }
-
- ctx = fc->fs_private;
- if (ctx->pid_ns != ns) {
- put_pid_ns(ctx->pid_ns);
- get_pid_ns(ns);
- ctx->pid_ns = ns;
- }
-
- mnt = fc_mount(fc);
- put_fs_context(fc);
- if (IS_ERR(mnt))
- return PTR_ERR(mnt);
-
- ns->proc_mnt = mnt;
- return 0;
-}
-
-void pid_ns_release_proc(struct pid_namespace *ns)
-{
- kern_unmount(ns->proc_mnt);
-}
diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c
index 7fbe8f058220..d99b5d39aa90 100644
--- a/fs/pstore/inode.c
+++ b/fs/pstore/inode.c
@@ -87,11 +87,11 @@ static void *pstore_ftrace_seq_next(struct seq_file *s, void *v, loff_t *pos)
struct pstore_private *ps = s->private;
struct pstore_ftrace_seq_data *data = v;
+ (*pos)++;
data->off += REC_SIZE;
if (data->off + REC_SIZE > ps->total_size)
return NULL;
- (*pos)++;
return data;
}
@@ -101,6 +101,9 @@ static int pstore_ftrace_seq_show(struct seq_file *s, void *v)
struct pstore_ftrace_seq_data *data = v;
struct pstore_ftrace_record *rec;
+ if (!data)
+ return 0;
+
rec = (struct pstore_ftrace_record *)(ps->record->buf + data->off);
seq_printf(s, "CPU:%d ts:%llu %08lx %08lx %ps <- %pS\n",
diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
index d896457e7c11..408277ee3cdb 100644
--- a/fs/pstore/platform.c
+++ b/fs/pstore/platform.c
@@ -823,9 +823,9 @@ static int __init pstore_init(void)
ret = pstore_init_fs();
if (ret)
- return ret;
+ free_buf_for_compression();
- return 0;
+ return ret;
}
late_initcall(pstore_init);
diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c
index 013486b5125e..795622190c01 100644
--- a/fs/pstore/ram.c
+++ b/fs/pstore/ram.c
@@ -963,7 +963,6 @@ static void __init ramoops_register_dummy(void)
pr_info("could not create platform device: %ld\n",
PTR_ERR(dummy));
dummy = NULL;
- ramoops_unregister_dummy();
}
}
diff --git a/fs/pstore/ram_core.c b/fs/pstore/ram_core.c
index 1f4d8c06f9be..c917c191e78c 100644
--- a/fs/pstore/ram_core.c
+++ b/fs/pstore/ram_core.c
@@ -34,7 +34,7 @@ struct persistent_ram_buffer {
uint32_t sig;
atomic_t start;
atomic_t size;
- uint8_t data[0];
+ uint8_t data[];
};
#define PERSISTENT_RAM_SIG (0x43474244) /* DBGC */
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index 072156c4f895..5c766330e493 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -2599,7 +2599,6 @@ static int journal_init_dev(struct super_block *super,
int result;
dev_t jdev;
fmode_t blkdev_mode = FMODE_READ | FMODE_WRITE | FMODE_EXCL;
- char b[BDEVNAME_SIZE];
result = 0;
@@ -2621,8 +2620,8 @@ static int journal_init_dev(struct super_block *super,
result = PTR_ERR(journal->j_dev_bd);
journal->j_dev_bd = NULL;
reiserfs_warning(super, "sh-458",
- "cannot init journal device '%s': %i",
- __bdevname(jdev, b), result);
+ "cannot init journal device unknown-block(%u,%u): %i",
+ MAJOR(jdev), MINOR(jdev), result);
return result;
} else if (jdev != super->s_dev)
set_blocksize(journal->j_dev_bd, super->s_blocksize);
diff --git a/fs/splice.c b/fs/splice.c
index d671936d0aad..4735defc46ee 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -1109,9 +1109,9 @@ static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
/*
* Determine where to splice to/from.
*/
-static long do_splice(struct file *in, loff_t __user *off_in,
- struct file *out, loff_t __user *off_out,
- size_t len, unsigned int flags)
+long do_splice(struct file *in, loff_t __user *off_in,
+ struct file *out, loff_t __user *off_out,
+ size_t len, unsigned int flags)
{
struct pipe_inode_info *ipipe;
struct pipe_inode_info *opipe;
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 130fc6fbcc03..26bbf960e2a2 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -558,3 +558,151 @@ void sysfs_remove_bin_file(struct kobject *kobj,
kernfs_remove_by_name(kobj->sd, attr->attr.name);
}
EXPORT_SYMBOL_GPL(sysfs_remove_bin_file);
+
+static int internal_change_owner(struct kernfs_node *kn, kuid_t kuid,
+ kgid_t kgid)
+{
+ struct iattr newattrs = {
+ .ia_valid = ATTR_UID | ATTR_GID,
+ .ia_uid = kuid,
+ .ia_gid = kgid,
+ };
+ return kernfs_setattr(kn, &newattrs);
+}
+
+/**
+ * sysfs_link_change_owner - change owner of a sysfs symlink.
+ * @kobj: object of the kernfs_node the symlink is located in.
+ * @targ: object of the kernfs_node the symlink points to.
+ * @name: name of the link.
+ * @kuid: new owner's kuid
+ * @kgid: new owner's kgid
+ *
+ * This function looks up the sysfs symlink entry @name under @kobj and changes
+ * the ownership to @kuid/@kgid. The symlink is looked up in the namespace of
+ * @targ.
+ *
+ * Returns 0 on success or error code on failure.
+ */
+int sysfs_link_change_owner(struct kobject *kobj, struct kobject *targ,
+ const char *name, kuid_t kuid, kgid_t kgid)
+{
+ struct kernfs_node *kn = NULL;
+ int error;
+
+ if (!name || !kobj->state_in_sysfs || !targ->state_in_sysfs)
+ return -EINVAL;
+
+ error = -ENOENT;
+ kn = kernfs_find_and_get_ns(kobj->sd, name, targ->sd->ns);
+ if (!kn)
+ goto out;
+
+ error = -EINVAL;
+ if (kernfs_type(kn) != KERNFS_LINK)
+ goto out;
+ if (kn->symlink.target_kn->priv != targ)
+ goto out;
+
+ error = internal_change_owner(kn, kuid, kgid);
+
+out:
+ kernfs_put(kn);
+ return error;
+}
+
+/**
+ * sysfs_file_change_owner - change owner of a sysfs file.
+ * @kobj: object.
+ * @name: name of the file to change.
+ * @kuid: new owner's kuid
+ * @kgid: new owner's kgid
+ *
+ * This function looks up the sysfs entry @name under @kobj and changes the
+ * ownership to @kuid/@kgid.
+ *
+ * Returns 0 on success or error code on failure.
+ */
+int sysfs_file_change_owner(struct kobject *kobj, const char *name, kuid_t kuid,
+ kgid_t kgid)
+{
+ struct kernfs_node *kn;
+ int error;
+
+ if (!name)
+ return -EINVAL;
+
+ if (!kobj->state_in_sysfs)
+ return -EINVAL;
+
+ kn = kernfs_find_and_get(kobj->sd, name);
+ if (!kn)
+ return -ENOENT;
+
+ error = internal_change_owner(kn, kuid, kgid);
+
+ kernfs_put(kn);
+
+ return error;
+}
+EXPORT_SYMBOL_GPL(sysfs_file_change_owner);
+
+/**
+ * sysfs_change_owner - change owner of the given object.
+ * @kobj: object.
+ * @kuid: new owner's kuid
+ * @kgid: new owner's kgid
+ *
+ * Change the owner of the default directory, files, groups, and attributes of
+ * @kobj to @kuid/@kgid. Note that sysfs_change_owner mirrors how the sysfs
+ * entries for a kobject are added by driver core. In summary,
+ * sysfs_change_owner() takes care of the default directory entry for @kobj,
+ * the default attributes associated with the ktype of @kobj and the default
+ * groups associated with the ktype of @kobj.
+ * Additional properties not added by driver core have to be changed by the
+ * driver or subsystem which created them. This is similar to how
+ * driver/subsystem specific entries are removed.
+ *
+ * Returns 0 on success or error code on failure.
+ */
+int sysfs_change_owner(struct kobject *kobj, kuid_t kuid, kgid_t kgid)
+{
+ int error;
+ const struct kobj_type *ktype;
+
+ if (!kobj->state_in_sysfs)
+ return -EINVAL;
+
+ /* Change the owner of the kobject itself. */
+ error = internal_change_owner(kobj->sd, kuid, kgid);
+ if (error)
+ return error;
+
+ ktype = get_ktype(kobj);
+ if (ktype) {
+ struct attribute **kattr;
+
+ /*
+ * Change owner of the default attributes associated with the
+ * ktype of @kobj.
+ */
+ for (kattr = ktype->default_attrs; kattr && *kattr; kattr++) {
+ error = sysfs_file_change_owner(kobj, (*kattr)->name,
+ kuid, kgid);
+ if (error)
+ return error;
+ }
+
+ /*
+ * Change owner of the default groups associated with the
+ * ktype of @kobj.
+ */
+ error = sysfs_groups_change_owner(kobj, ktype->default_groups,
+ kuid, kgid);
+ if (error)
+ return error;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(sysfs_change_owner);
diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c
index c4ab045926b7..5afe0e7ff7cd 100644
--- a/fs/sysfs/group.c
+++ b/fs/sysfs/group.c
@@ -13,6 +13,7 @@
#include <linux/dcache.h>
#include <linux/namei.h>
#include <linux/err.h>
+#include <linux/fs.h>
#include "sysfs.h"
@@ -457,3 +458,117 @@ int __compat_only_sysfs_link_entry_to_kobj(struct kobject *kobj,
return PTR_ERR_OR_ZERO(link);
}
EXPORT_SYMBOL_GPL(__compat_only_sysfs_link_entry_to_kobj);
+
+static int sysfs_group_attrs_change_owner(struct kernfs_node *grp_kn,
+ const struct attribute_group *grp,
+ struct iattr *newattrs)
+{
+ struct kernfs_node *kn;
+ int error;
+
+ if (grp->attrs) {
+ struct attribute *const *attr;
+
+ for (attr = grp->attrs; *attr; attr++) {
+ kn = kernfs_find_and_get(grp_kn, (*attr)->name);
+ if (!kn)
+ return -ENOENT;
+
+ error = kernfs_setattr(kn, newattrs);
+ kernfs_put(kn);
+ if (error)
+ return error;
+ }
+ }
+
+ if (grp->bin_attrs) {
+ struct bin_attribute *const *bin_attr;
+
+ for (bin_attr = grp->bin_attrs; *bin_attr; bin_attr++) {
+ kn = kernfs_find_and_get(grp_kn, (*bin_attr)->attr.name);
+ if (!kn)
+ return -ENOENT;
+
+ error = kernfs_setattr(kn, newattrs);
+ kernfs_put(kn);
+ if (error)
+ return error;
+ }
+ }
+
+ return 0;
+}
+
+/**
+ * sysfs_group_change_owner - change owner of an attribute group.
+ * @kobj: The kobject containing the group.
+ * @grp: The attribute group.
+ * @kuid: new owner's kuid
+ * @kgid: new owner's kgid
+ *
+ * Returns 0 on success or error code on failure.
+ */
+int sysfs_group_change_owner(struct kobject *kobj,
+ const struct attribute_group *grp, kuid_t kuid,
+ kgid_t kgid)
+{
+ struct kernfs_node *grp_kn;
+ int error;
+ struct iattr newattrs = {
+ .ia_valid = ATTR_UID | ATTR_GID,
+ .ia_uid = kuid,
+ .ia_gid = kgid,
+ };
+
+ if (!kobj->state_in_sysfs)
+ return -EINVAL;
+
+ if (grp->name) {
+ grp_kn = kernfs_find_and_get(kobj->sd, grp->name);
+ } else {
+ kernfs_get(kobj->sd);
+ grp_kn = kobj->sd;
+ }
+ if (!grp_kn)
+ return -ENOENT;
+
+ error = kernfs_setattr(grp_kn, &newattrs);
+ if (!error)
+ error = sysfs_group_attrs_change_owner(grp_kn, grp, &newattrs);
+
+ kernfs_put(grp_kn);
+
+ return error;
+}
+EXPORT_SYMBOL_GPL(sysfs_group_change_owner);
+
+/**
+ * sysfs_groups_change_owner - change owner of a set of attribute groups.
+ * @kobj: The kobject containing the groups.
+ * @groups: The attribute groups.
+ * @kuid: new owner's kuid
+ * @kgid: new owner's kgid
+ *
+ * Returns 0 on success or error code on failure.
+ */
+int sysfs_groups_change_owner(struct kobject *kobj,
+ const struct attribute_group **groups,
+ kuid_t kuid, kgid_t kgid)
+{
+ int error = 0, i;
+
+ if (!kobj->state_in_sysfs)
+ return -EINVAL;
+
+ if (!groups)
+ return 0;
+
+ for (i = 0; groups[i]; i++) {
+ error = sysfs_group_change_owner(kobj, groups[i], kuid, kgid);
+ if (error)
+ break;
+ }
+
+ return error;
+}
+EXPORT_SYMBOL_GPL(sysfs_groups_change_owner);
diff --git a/fs/ubifs/ioctl.c b/fs/ubifs/ioctl.c
index d49fc04f2d7d..3df9be2c684c 100644
--- a/fs/ubifs/ioctl.c
+++ b/fs/ubifs/ioctl.c
@@ -208,6 +208,9 @@ long ubifs_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
case FS_IOC_GET_ENCRYPTION_KEY_STATUS:
return fscrypt_ioctl_get_key_status(file, (void __user *)arg);
+ case FS_IOC_GET_ENCRYPTION_NONCE:
+ return fscrypt_ioctl_get_nonce(file, (void __user *)arg);
+
default:
return -ENOTTY;
}
@@ -230,6 +233,7 @@ long ubifs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
case FS_IOC_REMOVE_ENCRYPTION_KEY:
case FS_IOC_REMOVE_ENCRYPTION_KEY_ALL_USERS:
case FS_IOC_GET_ENCRYPTION_KEY_STATUS:
+ case FS_IOC_GET_ENCRYPTION_NONCE:
break;
default:
return -ENOIOCTLCMD;
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 3a688eb5c5ae..58e937be24ce 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -587,7 +587,7 @@ xfs_dax_writepages(
xfs_iflags_clear(ip, XFS_ITRUNCATED);
return dax_writeback_mapping_range(mapping,
- xfs_inode_buftarg(ip)->bt_bdev, wbc);
+ xfs_inode_buftarg(ip)->bt_daxdev, wbc);
}
STATIC sector_t
diff --git a/fs/zonefs/Kconfig b/fs/zonefs/Kconfig
index fb87ad372e29..ef2697b78820 100644
--- a/fs/zonefs/Kconfig
+++ b/fs/zonefs/Kconfig
@@ -2,6 +2,7 @@ config ZONEFS_FS
tristate "zonefs filesystem support"
depends on BLOCK
depends on BLK_DEV_ZONED
+ select FS_IOMAP
help
zonefs is a simple file system which exposes zones of a zoned block
device (e.g. host-managed or host-aware SMR disk drives) as files.
diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c
index 8bc6ef82d693..3ce9829a6936 100644
--- a/fs/zonefs/super.c
+++ b/fs/zonefs/super.c
@@ -178,7 +178,8 @@ static void zonefs_update_stats(struct inode *inode, loff_t new_isize)
* amount of readable data in the zone.
*/
static loff_t zonefs_check_zone_condition(struct inode *inode,
- struct blk_zone *zone, bool warn)
+ struct blk_zone *zone, bool warn,
+ bool mount)
{
struct zonefs_inode_info *zi = ZONEFS_I(inode);
@@ -196,13 +197,26 @@ static loff_t zonefs_check_zone_condition(struct inode *inode,
zone->wp = zone->start;
return 0;
case BLK_ZONE_COND_READONLY:
- /* Do not allow writes in read-only zones */
+ /*
+ * The write pointer of read-only zones is invalid. If such a
+ * zone is found during mount, the file size cannot be retrieved
+ * so we treat the zone as offline (mount == true case).
+ * Otherwise, keep the file size as it was when last updated
+ * so that the user can recover data. In both cases, writes are
+ * always disabled for the zone.
+ */
if (warn)
zonefs_warn(inode->i_sb, "inode %lu: read-only zone\n",
inode->i_ino);
inode->i_flags |= S_IMMUTABLE;
+ if (mount) {
+ zone->cond = BLK_ZONE_COND_OFFLINE;
+ inode->i_mode &= ~0777;
+ zone->wp = zone->start;
+ return 0;
+ }
inode->i_mode &= ~0222;
- /* fallthrough */
+ return i_size_read(inode);
default:
if (zi->i_ztype == ZONEFS_ZTYPE_CNV)
return zi->i_max_size;
@@ -231,7 +245,7 @@ static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx,
* as there is no inconsistency between the inode size and the amount of
* data writen in the zone (data_size).
*/
- data_size = zonefs_check_zone_condition(inode, zone, true);
+ data_size = zonefs_check_zone_condition(inode, zone, true, false);
isize = i_size_read(inode);
if (zone->cond != BLK_ZONE_COND_OFFLINE &&
zone->cond != BLK_ZONE_COND_READONLY &&
@@ -274,7 +288,7 @@ static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx,
if (zone->cond != BLK_ZONE_COND_OFFLINE) {
zone->cond = BLK_ZONE_COND_OFFLINE;
data_size = zonefs_check_zone_condition(inode, zone,
- false);
+ false, false);
}
} else if (zone->cond == BLK_ZONE_COND_READONLY ||
sbi->s_mount_opts & ZONEFS_MNTOPT_ERRORS_ZRO) {
@@ -283,7 +297,7 @@ static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx,
if (zone->cond != BLK_ZONE_COND_READONLY) {
zone->cond = BLK_ZONE_COND_READONLY;
data_size = zonefs_check_zone_condition(inode, zone,
- false);
+ false, false);
}
}
@@ -601,13 +615,13 @@ static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from)
ssize_t ret;
/*
- * For async direct IOs to sequential zone files, ignore IOCB_NOWAIT
+ * For async direct IOs to sequential zone files, refuse IOCB_NOWAIT
* as this can cause write reordering (e.g. the first aio gets EAGAIN
* on the inode lock but the second goes through but is now unaligned).
*/
- if (zi->i_ztype == ZONEFS_ZTYPE_SEQ && !is_sync_kiocb(iocb)
- && (iocb->ki_flags & IOCB_NOWAIT))
- iocb->ki_flags &= ~IOCB_NOWAIT;
+ if (zi->i_ztype == ZONEFS_ZTYPE_SEQ && !is_sync_kiocb(iocb) &&
+ (iocb->ki_flags & IOCB_NOWAIT))
+ return -EOPNOTSUPP;
if (iocb->ki_flags & IOCB_NOWAIT) {
if (!inode_trylock(inode))
@@ -975,7 +989,7 @@ static void zonefs_init_file_inode(struct inode *inode, struct blk_zone *zone,
zi->i_zsector = zone->start;
zi->i_max_size = min_t(loff_t, MAX_LFS_FILESIZE,
zone->len << SECTOR_SHIFT);
- zi->i_wpoffset = zonefs_check_zone_condition(inode, zone, true);
+ zi->i_wpoffset = zonefs_check_zone_condition(inode, zone, true, true);
inode->i_uid = sbi->s_uid;
inode->i_gid = sbi->s_gid;