summaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/afs/cmservice.c6
-rw-r--r--fs/afs/fsclient.c4
-rw-r--r--fs/afs/internal.h2
-rw-r--r--fs/afs/rxrpc.c3
-rw-r--r--fs/btrfs/extent-tree.c3
-rw-r--r--fs/btrfs/extent_io.c8
-rw-r--r--fs/btrfs/inode.c13
-rw-r--r--fs/btrfs/ioctl.c5
-rw-r--r--fs/btrfs/relocation.c9
-rw-r--r--fs/btrfs/send.c58
-rw-r--r--fs/btrfs/tree-log.c20
-rw-r--r--fs/exofs/dir.c2
-rw-r--r--fs/iomap.c5
-rw-r--r--fs/kernfs/file.c1
-rw-r--r--fs/nfsd/netns.h5
-rw-r--r--fs/nfsd/nfs4state.c38
-rw-r--r--fs/orangefs/dcache.c5
-rw-r--r--fs/orangefs/file.c14
-rw-r--r--fs/orangefs/namei.c8
-rw-r--r--fs/orangefs/orangefs-debugfs.c147
-rw-r--r--fs/orangefs/orangefs-kernel.h7
-rw-r--r--fs/orangefs/orangefs-mod.c6
-rw-r--r--fs/overlayfs/copy_up.c2
-rw-r--r--fs/overlayfs/inode.c3
-rw-r--r--fs/overlayfs/super.c15
-rw-r--r--fs/proc/base.c3
-rw-r--r--fs/ubifs/dir.c8
-rw-r--r--fs/xfs/libxfs/xfs_bmap.c418
-rw-r--r--fs/xfs/libxfs/xfs_bmap.h8
-rw-r--r--fs/xfs/libxfs/xfs_btree.c2
-rw-r--r--fs/xfs/libxfs/xfs_dquot_buf.c3
-rw-r--r--fs/xfs/libxfs/xfs_format.h1
-rw-r--r--fs/xfs/libxfs/xfs_inode_buf.c13
-rw-r--r--fs/xfs/libxfs/xfs_inode_buf.h2
-rw-r--r--fs/xfs/xfs_file.c232
-rw-r--r--fs/xfs/xfs_icache.c8
-rw-r--r--fs/xfs/xfs_iomap.c57
-rw-r--r--fs/xfs/xfs_mount.c1
-rw-r--r--fs/xfs/xfs_reflink.c499
-rw-r--r--fs/xfs/xfs_reflink.h11
-rw-r--r--fs/xfs/xfs_sysfs.c4
-rw-r--r--fs/xfs/xfs_trace.h4
42 files changed, 872 insertions, 791 deletions
diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c
index 2037e7a77a37..d764236072b1 100644
--- a/fs/afs/cmservice.c
+++ b/fs/afs/cmservice.c
@@ -91,11 +91,9 @@ static const struct afs_call_type afs_SRXCBTellMeAboutYourself = {
*/
bool afs_cm_incoming_call(struct afs_call *call)
{
- u32 operation_id = ntohl(call->operation_ID);
+ _enter("{CB.OP %u}", call->operation_ID);
- _enter("{CB.OP %u}", operation_id);
-
- switch (operation_id) {
+ switch (call->operation_ID) {
case CBCallBack:
call->type = &afs_SRXCBCallBack;
return true;
diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c
index 96f4d764d1a6..31c616ab9b40 100644
--- a/fs/afs/fsclient.c
+++ b/fs/afs/fsclient.c
@@ -364,7 +364,7 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call)
buffer = kmap(page);
ret = afs_extract_data(call, buffer,
call->count, true);
- kunmap(buffer);
+ kunmap(page);
if (ret < 0)
return ret;
}
@@ -397,7 +397,7 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call)
page = call->reply3;
buffer = kmap(page);
memset(buffer + call->count, 0, PAGE_SIZE - call->count);
- kunmap(buffer);
+ kunmap(page);
}
_leave(" = 0 [done]");
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 5497c8496055..535a38d2c1d0 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -112,7 +112,7 @@ struct afs_call {
bool need_attention; /* T if RxRPC poked us */
u16 service_id; /* RxRPC service ID to call */
__be16 port; /* target UDP port */
- __be32 operation_ID; /* operation ID for an incoming call */
+ u32 operation_ID; /* operation ID for an incoming call */
u32 count; /* count for use in unmarshalling */
__be32 tmp; /* place to extract temporary data */
afs_dataversion_t store_version; /* updated version expected from store */
diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
index 477928b25940..25f05a8d21b1 100644
--- a/fs/afs/rxrpc.c
+++ b/fs/afs/rxrpc.c
@@ -676,10 +676,11 @@ static int afs_deliver_cm_op_id(struct afs_call *call)
ASSERTCMP(call->offset, <, 4);
/* the operation ID forms the first four bytes of the request data */
- ret = afs_extract_data(call, &call->operation_ID, 4, true);
+ ret = afs_extract_data(call, &call->tmp, 4, true);
if (ret < 0)
return ret;
+ call->operation_ID = ntohl(call->tmp);
call->state = AFS_CALL_AWAIT_REQUEST;
call->offset = 0;
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 210c94ac8818..4607af38c72e 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2647,7 +2647,10 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
btrfs_free_delayed_extent_op(extent_op);
if (ret) {
+ spin_lock(&delayed_refs->lock);
locked_ref->processing = 0;
+ delayed_refs->num_heads_ready++;
+ spin_unlock(&delayed_refs->lock);
btrfs_delayed_ref_unlock(locked_ref);
btrfs_put_delayed_ref(ref);
btrfs_debug(fs_info, "run_one_delayed_ref returned %d",
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 66a755150056..8ed05d95584a 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -5569,7 +5569,7 @@ void le_bitmap_set(u8 *map, unsigned int start, int len)
*p |= mask_to_set;
len -= bits_to_set;
bits_to_set = BITS_PER_BYTE;
- mask_to_set = ~(u8)0;
+ mask_to_set = ~0;
p++;
}
if (len) {
@@ -5589,7 +5589,7 @@ void le_bitmap_clear(u8 *map, unsigned int start, int len)
*p &= ~mask_to_clear;
len -= bits_to_clear;
bits_to_clear = BITS_PER_BYTE;
- mask_to_clear = ~(u8)0;
+ mask_to_clear = ~0;
p++;
}
if (len) {
@@ -5679,7 +5679,7 @@ void extent_buffer_bitmap_set(struct extent_buffer *eb, unsigned long start,
kaddr[offset] |= mask_to_set;
len -= bits_to_set;
bits_to_set = BITS_PER_BYTE;
- mask_to_set = ~(u8)0;
+ mask_to_set = ~0;
if (++offset >= PAGE_SIZE && len > 0) {
offset = 0;
page = eb->pages[++i];
@@ -5721,7 +5721,7 @@ void extent_buffer_bitmap_clear(struct extent_buffer *eb, unsigned long start,
kaddr[offset] &= ~mask_to_clear;
len -= bits_to_clear;
bits_to_clear = BITS_PER_BYTE;
- mask_to_clear = ~(u8)0;
+ mask_to_clear = ~0;
if (++offset >= PAGE_SIZE && len > 0) {
offset = 0;
page = eb->pages[++i];
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 2b790bda7998..8e3a5a266917 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -4605,8 +4605,8 @@ delete:
BUG_ON(ret);
if (btrfs_should_throttle_delayed_refs(trans, root))
btrfs_async_run_delayed_refs(root,
- trans->transid,
- trans->delayed_ref_updates * 2, 0);
+ trans->delayed_ref_updates * 2,
+ trans->transid, 0);
if (be_nice) {
if (truncate_space_check(trans, root,
extent_num_bytes)) {
@@ -8931,9 +8931,14 @@ again:
* So even we call qgroup_free_data(), it won't decrease reserved
* space.
* 2) Not written to disk
- * This means the reserved space should be freed here.
+ * This means the reserved space should be freed here. However,
+ * if a truncate invalidates the page (by clearing PageDirty)
+ * and the page is accounted for while allocating extent
+ * in btrfs_check_data_free_space() we let delayed_ref to
+ * free the entire extent.
*/
- btrfs_qgroup_free_data(inode, page_start, PAGE_SIZE);
+ if (PageDirty(page))
+ btrfs_qgroup_free_data(inode, page_start, PAGE_SIZE);
if (!inode_evicting) {
clear_extent_bit(tree, page_start, page_end,
EXTENT_LOCKED | EXTENT_DIRTY |
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 18e1aa0f85f5..7acbd2cf6192 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -3814,6 +3814,11 @@ process_slot:
}
btrfs_release_path(path);
key.offset = next_key_min_offset;
+
+ if (fatal_signal_pending(current)) {
+ ret = -EINTR;
+ goto out;
+ }
}
ret = 0;
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 0ec8ffa37ab0..c4af0cdb783d 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -2728,7 +2728,14 @@ static int do_relocation(struct btrfs_trans_handle *trans,
bytenr = btrfs_node_blockptr(upper->eb, slot);
if (lowest) {
- BUG_ON(bytenr != node->bytenr);
+ if (bytenr != node->bytenr) {
+ btrfs_err(root->fs_info,
+ "lowest leaf/node mismatch: bytenr %llu node->bytenr %llu slot %d upper %llu",
+ bytenr, node->bytenr, slot,
+ upper->eb->start);
+ err = -EIO;
+ goto next;
+ }
} else {
if (node->eb->start == bytenr)
goto next;
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 01bc36cec26e..71261b459863 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -5805,6 +5805,64 @@ static int changed_extent(struct send_ctx *sctx,
int ret = 0;
if (sctx->cur_ino != sctx->cmp_key->objectid) {
+
+ if (result == BTRFS_COMPARE_TREE_CHANGED) {
+ struct extent_buffer *leaf_l;
+ struct extent_buffer *leaf_r;
+ struct btrfs_file_extent_item *ei_l;
+ struct btrfs_file_extent_item *ei_r;
+
+ leaf_l = sctx->left_path->nodes[0];
+ leaf_r = sctx->right_path->nodes[0];
+ ei_l = btrfs_item_ptr(leaf_l,
+ sctx->left_path->slots[0],
+ struct btrfs_file_extent_item);
+ ei_r = btrfs_item_ptr(leaf_r,
+ sctx->right_path->slots[0],
+ struct btrfs_file_extent_item);
+
+ /*
+ * We may have found an extent item that has changed
+ * only its disk_bytenr field and the corresponding
+ * inode item was not updated. This case happens due to
+ * very specific timings during relocation when a leaf
+ * that contains file extent items is COWed while
+ * relocation is ongoing and its in the stage where it
+ * updates data pointers. So when this happens we can
+ * safely ignore it since we know it's the same extent,
+ * but just at different logical and physical locations
+ * (when an extent is fully replaced with a new one, we
+ * know the generation number must have changed too,
+ * since snapshot creation implies committing the current
+ * transaction, and the inode item must have been updated
+ * as well).
+ * This replacement of the disk_bytenr happens at
+ * relocation.c:replace_file_extents() through
+ * relocation.c:btrfs_reloc_cow_block().
+ */
+ if (btrfs_file_extent_generation(leaf_l, ei_l) ==
+ btrfs_file_extent_generation(leaf_r, ei_r) &&
+ btrfs_file_extent_ram_bytes(leaf_l, ei_l) ==
+ btrfs_file_extent_ram_bytes(leaf_r, ei_r) &&
+ btrfs_file_extent_compression(leaf_l, ei_l) ==
+ btrfs_file_extent_compression(leaf_r, ei_r) &&
+ btrfs_file_extent_encryption(leaf_l, ei_l) ==
+ btrfs_file_extent_encryption(leaf_r, ei_r) &&
+ btrfs_file_extent_other_encoding(leaf_l, ei_l) ==
+ btrfs_file_extent_other_encoding(leaf_r, ei_r) &&
+ btrfs_file_extent_type(leaf_l, ei_l) ==
+ btrfs_file_extent_type(leaf_r, ei_r) &&
+ btrfs_file_extent_disk_bytenr(leaf_l, ei_l) !=
+ btrfs_file_extent_disk_bytenr(leaf_r, ei_r) &&
+ btrfs_file_extent_disk_num_bytes(leaf_l, ei_l) ==
+ btrfs_file_extent_disk_num_bytes(leaf_r, ei_r) &&
+ btrfs_file_extent_offset(leaf_l, ei_l) ==
+ btrfs_file_extent_offset(leaf_r, ei_r) &&
+ btrfs_file_extent_num_bytes(leaf_l, ei_l) ==
+ btrfs_file_extent_num_bytes(leaf_r, ei_r))
+ return 0;
+ }
+
inconsistent_snapshot_error(sctx, result, "extent");
return -EIO;
}
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 528cae123dc9..3d33c4e41e5f 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -2713,14 +2713,12 @@ static inline void btrfs_remove_all_log_ctxs(struct btrfs_root *root,
int index, int error)
{
struct btrfs_log_ctx *ctx;
+ struct btrfs_log_ctx *safe;
- if (!error) {
- INIT_LIST_HEAD(&root->log_ctxs[index]);
- return;
- }
-
- list_for_each_entry(ctx, &root->log_ctxs[index], list)
+ list_for_each_entry_safe(ctx, safe, &root->log_ctxs[index], list) {
+ list_del_init(&ctx->list);
ctx->log_ret = error;
+ }
INIT_LIST_HEAD(&root->log_ctxs[index]);
}
@@ -2961,13 +2959,9 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
mutex_unlock(&root->log_mutex);
out_wake_log_root:
- /*
- * We needn't get log_mutex here because we are sure all
- * the other tasks are blocked.
- */
+ mutex_lock(&log_root_tree->log_mutex);
btrfs_remove_all_log_ctxs(log_root_tree, index2, ret);
- mutex_lock(&log_root_tree->log_mutex);
log_root_tree->log_transid_committed++;
atomic_set(&log_root_tree->log_commit[index2], 0);
mutex_unlock(&log_root_tree->log_mutex);
@@ -2978,10 +2972,8 @@ out_wake_log_root:
if (waitqueue_active(&log_root_tree->log_commit_wait[index2]))
wake_up(&log_root_tree->log_commit_wait[index2]);
out:
- /* See above. */
- btrfs_remove_all_log_ctxs(root, index1, ret);
-
mutex_lock(&root->log_mutex);
+ btrfs_remove_all_log_ctxs(root, index1, ret);
root->log_transid_committed++;
atomic_set(&root->log_commit[index1], 0);
mutex_unlock(&root->log_mutex);
diff --git a/fs/exofs/dir.c b/fs/exofs/dir.c
index 79101651fe9e..42f9a0a0c4ca 100644
--- a/fs/exofs/dir.c
+++ b/fs/exofs/dir.c
@@ -137,7 +137,7 @@ Espan:
bad_entry:
EXOFS_ERR(
"ERROR [exofs_check_page]: bad entry in directory(0x%lx): %s - "
- "offset=%lu, inode=0x%llu, rec_len=%d, name_len=%d\n",
+ "offset=%lu, inode=0x%llx, rec_len=%d, name_len=%d\n",
dir->i_ino, error, (page->index<<PAGE_SHIFT)+offs,
_LLU(le64_to_cpu(p->inode_no)),
rec_len, p->name_len);
diff --git a/fs/iomap.c b/fs/iomap.c
index 013d1d36fbbf..a8ee8c33ca78 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -433,8 +433,7 @@ iomap_page_mkwrite_actor(struct inode *inode, loff_t pos, loff_t length,
struct page *page = data;
int ret;
- ret = __block_write_begin_int(page, pos & ~PAGE_MASK, length,
- NULL, iomap);
+ ret = __block_write_begin_int(page, pos, length, NULL, iomap);
if (ret)
return ret;
@@ -561,7 +560,7 @@ int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fi,
}
while (len > 0) {
- ret = iomap_apply(inode, start, len, 0, ops, &ctx,
+ ret = iomap_apply(inode, start, len, IOMAP_REPORT, ops, &ctx,
iomap_fiemap_actor);
/* inode with no (attribute) mapping will give ENOENT */
if (ret == -ENOENT)
diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c
index 2bcb86e6e6ca..78219d5644e9 100644
--- a/fs/kernfs/file.c
+++ b/fs/kernfs/file.c
@@ -911,6 +911,7 @@ const struct file_operations kernfs_file_fops = {
.open = kernfs_fop_open,
.release = kernfs_fop_release,
.poll = kernfs_fop_poll,
+ .fsync = noop_fsync,
};
/**
diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h
index b10d557f9c9e..ee36efd5aece 100644
--- a/fs/nfsd/netns.h
+++ b/fs/nfsd/netns.h
@@ -84,6 +84,8 @@ struct nfsd_net {
struct list_head client_lru;
struct list_head close_lru;
struct list_head del_recall_lru;
+
+ /* protected by blocked_locks_lock */
struct list_head blocked_locks_lru;
struct delayed_work laundromat_work;
@@ -91,6 +93,9 @@ struct nfsd_net {
/* client_lock protects the client lru list and session hash table */
spinlock_t client_lock;
+ /* protects blocked_locks_lru */
+ spinlock_t blocked_locks_lock;
+
struct file *rec_file;
bool in_grace;
const struct nfsd4_client_tracking_ops *client_tracking_ops;
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 9752beb78659..4b4beaaa4eaa 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -217,7 +217,7 @@ find_blocked_lock(struct nfs4_lockowner *lo, struct knfsd_fh *fh,
{
struct nfsd4_blocked_lock *cur, *found = NULL;
- spin_lock(&nn->client_lock);
+ spin_lock(&nn->blocked_locks_lock);
list_for_each_entry(cur, &lo->lo_blocked, nbl_list) {
if (fh_match(fh, &cur->nbl_fh)) {
list_del_init(&cur->nbl_list);
@@ -226,7 +226,7 @@ find_blocked_lock(struct nfs4_lockowner *lo, struct knfsd_fh *fh,
break;
}
}
- spin_unlock(&nn->client_lock);
+ spin_unlock(&nn->blocked_locks_lock);
if (found)
posix_unblock_lock(&found->nbl_lock);
return found;
@@ -1227,9 +1227,7 @@ static void put_ol_stateid_locked(struct nfs4_ol_stateid *stp,
static bool unhash_lock_stateid(struct nfs4_ol_stateid *stp)
{
- struct nfs4_openowner *oo = openowner(stp->st_openstp->st_stateowner);
-
- lockdep_assert_held(&oo->oo_owner.so_client->cl_lock);
+ lockdep_assert_held(&stp->st_stid.sc_client->cl_lock);
list_del_init(&stp->st_locks);
nfs4_unhash_stid(&stp->st_stid);
@@ -1238,12 +1236,12 @@ static bool unhash_lock_stateid(struct nfs4_ol_stateid *stp)
static void release_lock_stateid(struct nfs4_ol_stateid *stp)
{
- struct nfs4_openowner *oo = openowner(stp->st_openstp->st_stateowner);
+ struct nfs4_client *clp = stp->st_stid.sc_client;
bool unhashed;
- spin_lock(&oo->oo_owner.so_client->cl_lock);
+ spin_lock(&clp->cl_lock);
unhashed = unhash_lock_stateid(stp);
- spin_unlock(&oo->oo_owner.so_client->cl_lock);
+ spin_unlock(&clp->cl_lock);
if (unhashed)
nfs4_put_stid(&stp->st_stid);
}
@@ -4665,7 +4663,7 @@ nfs4_laundromat(struct nfsd_net *nn)
* indefinitely once the lock does become free.
*/
BUG_ON(!list_empty(&reaplist));
- spin_lock(&nn->client_lock);
+ spin_lock(&nn->blocked_locks_lock);
while (!list_empty(&nn->blocked_locks_lru)) {
nbl = list_first_entry(&nn->blocked_locks_lru,
struct nfsd4_blocked_lock, nbl_lru);
@@ -4678,7 +4676,7 @@ nfs4_laundromat(struct nfsd_net *nn)
list_move(&nbl->nbl_lru, &reaplist);
list_del_init(&nbl->nbl_list);
}
- spin_unlock(&nn->client_lock);
+ spin_unlock(&nn->blocked_locks_lock);
while (!list_empty(&reaplist)) {
nbl = list_first_entry(&nn->blocked_locks_lru,
@@ -5439,13 +5437,13 @@ nfsd4_lm_notify(struct file_lock *fl)
bool queue = false;
/* An empty list means that something else is going to be using it */
- spin_lock(&nn->client_lock);
+ spin_lock(&nn->blocked_locks_lock);
if (!list_empty(&nbl->nbl_list)) {
list_del_init(&nbl->nbl_list);
list_del_init(&nbl->nbl_lru);
queue = true;
}
- spin_unlock(&nn->client_lock);
+ spin_unlock(&nn->blocked_locks_lock);
if (queue)
nfsd4_run_cb(&nbl->nbl_cb);
@@ -5868,10 +5866,10 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
if (fl_flags & FL_SLEEP) {
nbl->nbl_time = jiffies;
- spin_lock(&nn->client_lock);
+ spin_lock(&nn->blocked_locks_lock);
list_add_tail(&nbl->nbl_list, &lock_sop->lo_blocked);
list_add_tail(&nbl->nbl_lru, &nn->blocked_locks_lru);
- spin_unlock(&nn->client_lock);
+ spin_unlock(&nn->blocked_locks_lock);
}
err = vfs_lock_file(filp, F_SETLK, file_lock, conflock);
@@ -5900,10 +5898,10 @@ out:
if (nbl) {
/* dequeue it if we queued it before */
if (fl_flags & FL_SLEEP) {
- spin_lock(&nn->client_lock);
+ spin_lock(&nn->blocked_locks_lock);
list_del_init(&nbl->nbl_list);
list_del_init(&nbl->nbl_lru);
- spin_unlock(&nn->client_lock);
+ spin_unlock(&nn->blocked_locks_lock);
}
free_blocked_lock(nbl);
}
@@ -6943,9 +6941,11 @@ static int nfs4_state_create_net(struct net *net)
INIT_LIST_HEAD(&nn->client_lru);
INIT_LIST_HEAD(&nn->close_lru);
INIT_LIST_HEAD(&nn->del_recall_lru);
- INIT_LIST_HEAD(&nn->blocked_locks_lru);
spin_lock_init(&nn->client_lock);
+ spin_lock_init(&nn->blocked_locks_lock);
+ INIT_LIST_HEAD(&nn->blocked_locks_lru);
+
INIT_DELAYED_WORK(&nn->laundromat_work, laundromat_main);
get_net(net);
@@ -7063,14 +7063,14 @@ nfs4_state_shutdown_net(struct net *net)
}
BUG_ON(!list_empty(&reaplist));
- spin_lock(&nn->client_lock);
+ spin_lock(&nn->blocked_locks_lock);
while (!list_empty(&nn->blocked_locks_lru)) {
nbl = list_first_entry(&nn->blocked_locks_lru,
struct nfsd4_blocked_lock, nbl_lru);
list_move(&nbl->nbl_lru, &reaplist);
list_del_init(&nbl->nbl_list);
}
- spin_unlock(&nn->client_lock);
+ spin_unlock(&nn->blocked_locks_lock);
while (!list_empty(&reaplist)) {
nbl = list_first_entry(&nn->blocked_locks_lru,
diff --git a/fs/orangefs/dcache.c b/fs/orangefs/dcache.c
index 1e8fe844e69f..5355efba4bc8 100644
--- a/fs/orangefs/dcache.c
+++ b/fs/orangefs/dcache.c
@@ -73,7 +73,7 @@ static int orangefs_revalidate_lookup(struct dentry *dentry)
}
}
- dentry->d_time = jiffies + orangefs_dcache_timeout_msecs*HZ/1000;
+ orangefs_set_timeout(dentry);
ret = 1;
out_release_op:
op_release(new_op);
@@ -94,8 +94,9 @@ out_drop:
static int orangefs_d_revalidate(struct dentry *dentry, unsigned int flags)
{
int ret;
+ unsigned long time = (unsigned long) dentry->d_fsdata;
- if (time_before(jiffies, dentry->d_time))
+ if (time_before(jiffies, time))
return 1;
if (flags & LOOKUP_RCU)
diff --git a/fs/orangefs/file.c b/fs/orangefs/file.c
index 66ea0cc37b18..02cc6139ec90 100644
--- a/fs/orangefs/file.c
+++ b/fs/orangefs/file.c
@@ -621,9 +621,9 @@ static int orangefs_file_release(struct inode *inode, struct file *file)
* readahead cache (if any); this forces an expensive refresh of
* data for the next caller of mmap (or 'get_block' accesses)
*/
- if (file->f_path.dentry->d_inode &&
- file->f_path.dentry->d_inode->i_mapping &&
- mapping_nrpages(&file->f_path.dentry->d_inode->i_data)) {
+ if (file_inode(file) &&
+ file_inode(file)->i_mapping &&
+ mapping_nrpages(&file_inode(file)->i_data)) {
if (orangefs_features & ORANGEFS_FEATURE_READAHEAD) {
gossip_debug(GOSSIP_INODE_DEBUG,
"calling flush_racache on %pU\n",
@@ -632,7 +632,7 @@ static int orangefs_file_release(struct inode *inode, struct file *file)
gossip_debug(GOSSIP_INODE_DEBUG,
"flush_racache finished\n");
}
- truncate_inode_pages(file->f_path.dentry->d_inode->i_mapping,
+ truncate_inode_pages(file_inode(file)->i_mapping,
0);
}
return 0;
@@ -648,7 +648,7 @@ static int orangefs_fsync(struct file *file,
{
int ret = -EINVAL;
struct orangefs_inode_s *orangefs_inode =
- ORANGEFS_I(file->f_path.dentry->d_inode);
+ ORANGEFS_I(file_inode(file));
struct orangefs_kernel_op_s *new_op = NULL;
/* required call */
@@ -661,7 +661,7 @@ static int orangefs_fsync(struct file *file,
ret = service_operation(new_op,
"orangefs_fsync",
- get_interruptible_flag(file->f_path.dentry->d_inode));
+ get_interruptible_flag(file_inode(file)));
gossip_debug(GOSSIP_FILE_DEBUG,
"orangefs_fsync got return value of %d\n",
@@ -669,7 +669,7 @@ static int orangefs_fsync(struct file *file,
op_release(new_op);
- orangefs_flush_inode(file->f_path.dentry->d_inode);
+ orangefs_flush_inode(file_inode(file));
return ret;
}
diff --git a/fs/orangefs/namei.c b/fs/orangefs/namei.c
index d15d3d2dba62..a290ff6ec756 100644
--- a/fs/orangefs/namei.c
+++ b/fs/orangefs/namei.c
@@ -72,7 +72,7 @@ static int orangefs_create(struct inode *dir,
d_instantiate(dentry, inode);
unlock_new_inode(inode);
- dentry->d_time = jiffies + orangefs_dcache_timeout_msecs*HZ/1000;
+ orangefs_set_timeout(dentry);
ORANGEFS_I(inode)->getattr_time = jiffies - 1;
gossip_debug(GOSSIP_NAME_DEBUG,
@@ -183,7 +183,7 @@ static struct dentry *orangefs_lookup(struct inode *dir, struct dentry *dentry,
goto out;
}
- dentry->d_time = jiffies + orangefs_dcache_timeout_msecs*HZ/1000;
+ orangefs_set_timeout(dentry);
inode = orangefs_iget(dir->i_sb, &new_op->downcall.resp.lookup.refn);
if (IS_ERR(inode)) {
@@ -322,7 +322,7 @@ static int orangefs_symlink(struct inode *dir,
d_instantiate(dentry, inode);
unlock_new_inode(inode);
- dentry->d_time = jiffies + orangefs_dcache_timeout_msecs*HZ/1000;
+ orangefs_set_timeout(dentry);
ORANGEFS_I(inode)->getattr_time = jiffies - 1;
gossip_debug(GOSSIP_NAME_DEBUG,
@@ -386,7 +386,7 @@ static int orangefs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode
d_instantiate(dentry, inode);
unlock_new_inode(inode);
- dentry->d_time = jiffies + orangefs_dcache_timeout_msecs*HZ/1000;
+ orangefs_set_timeout(dentry);
ORANGEFS_I(inode)->getattr_time = jiffies - 1;
gossip_debug(GOSSIP_NAME_DEBUG,
diff --git a/fs/orangefs/orangefs-debugfs.c b/fs/orangefs/orangefs-debugfs.c
index eb09aa026723..d484068ca716 100644
--- a/fs/orangefs/orangefs-debugfs.c
+++ b/fs/orangefs/orangefs-debugfs.c
@@ -141,6 +141,9 @@ static struct client_debug_mask client_debug_mask;
*/
static DEFINE_MUTEX(orangefs_debug_lock);
+/* Used to protect data in ORANGEFS_KMOD_DEBUG_HELP_FILE */
+static DEFINE_MUTEX(orangefs_help_file_lock);
+
/*
* initialize kmod debug operations, create orangefs debugfs dir and
* ORANGEFS_KMOD_DEBUG_HELP_FILE.
@@ -289,6 +292,8 @@ static void *help_start(struct seq_file *m, loff_t *pos)
gossip_debug(GOSSIP_DEBUGFS_DEBUG, "help_start: start\n");
+ mutex_lock(&orangefs_help_file_lock);
+
if (*pos == 0)
payload = m->private;
@@ -305,6 +310,7 @@ static void *help_next(struct seq_file *m, void *v, loff_t *pos)
static void help_stop(struct seq_file *m, void *p)
{
gossip_debug(GOSSIP_DEBUGFS_DEBUG, "help_stop: start\n");
+ mutex_unlock(&orangefs_help_file_lock);
}
static int help_show(struct seq_file *m, void *v)
@@ -610,32 +616,54 @@ out:
* /sys/kernel/debug/orangefs/debug-help can be catted to
* see all the available kernel and client debug keywords.
*
- * When the kernel boots, we have no idea what keywords the
+ * When orangefs.ko initializes, we have no idea what keywords the
* client supports, nor their associated masks.
*
- * We pass through this function once at boot and stamp a
+ * We pass through this function once at module-load and stamp a
* boilerplate "we don't know" message for the client in the
* debug-help file. We pass through here again when the client
* starts and then we can fill out the debug-help file fully.
*
* The client might be restarted any number of times between
- * reboots, we only build the debug-help file the first time.
+ * module reloads, we only build the debug-help file the first time.
*/
int orangefs_prepare_debugfs_help_string(int at_boot)
{
- int rc = -EINVAL;
- int i;
- int byte_count = 0;
char *client_title = "Client Debug Keywords:\n";
char *kernel_title = "Kernel Debug Keywords:\n";
+ size_t string_size = DEBUG_HELP_STRING_SIZE;
+ size_t result_size;
+ size_t i;
+ char *new;
+ int rc = -EINVAL;
gossip_debug(GOSSIP_UTILS_DEBUG, "%s: start\n", __func__);
- if (at_boot) {
- byte_count += strlen(HELP_STRING_UNINITIALIZED);
+ if (at_boot)
client_title = HELP_STRING_UNINITIALIZED;
- } else {
- /*
+
+ /* build a new debug_help_string. */
+ new = kzalloc(DEBUG_HELP_STRING_SIZE, GFP_KERNEL);
+ if (!new) {
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ /*
+ * strlcat(dst, src, size) will append at most
+ * "size - strlen(dst) - 1" bytes of src onto dst,
+ * null terminating the result, and return the total
+ * length of the string it tried to create.
+ *
+ * We'll just plow through here building our new debug
+ * help string and let strlcat take care of assuring that
+ * dst doesn't overflow.
+ */
+ strlcat(new, client_title, string_size);
+
+ if (!at_boot) {
+
+ /*
* fill the client keyword/mask array and remember
* how many elements there were.
*/
@@ -644,64 +672,40 @@ int orangefs_prepare_debugfs_help_string(int at_boot)
if (cdm_element_count <= 0)
goto out;
- /* Count the bytes destined for debug_help_string. */
- byte_count += strlen(client_title);
-
for (i = 0; i < cdm_element_count; i++) {
- byte_count += strlen(cdm_array[i].keyword + 2);
- if (byte_count >= DEBUG_HELP_STRING_SIZE) {
- pr_info("%s: overflow 1!\n", __func__);
- goto out;
- }
+ strlcat(new, "\t", string_size);
+ strlcat(new, cdm_array[i].keyword, string_size);
+ strlcat(new, "\n", string_size);
}
-
- gossip_debug(GOSSIP_UTILS_DEBUG,
- "%s: cdm_element_count:%d:\n",
- __func__,
- cdm_element_count);
}
- byte_count += strlen(kernel_title);
+ strlcat(new, "\n", string_size);
+ strlcat(new, kernel_title, string_size);
+
for (i = 0; i < num_kmod_keyword_mask_map; i++) {
- byte_count +=
- strlen(s_kmod_keyword_mask_map[i].keyword + 2);
- if (byte_count >= DEBUG_HELP_STRING_SIZE) {
- pr_info("%s: overflow 2!\n", __func__);
- goto out;
- }
+ strlcat(new, "\t", string_size);
+ strlcat(new, s_kmod_keyword_mask_map[i].keyword, string_size);
+ result_size = strlcat(new, "\n", string_size);
}
- /* build debug_help_string. */
- debug_help_string = kzalloc(DEBUG_HELP_STRING_SIZE, GFP_KERNEL);
- if (!debug_help_string) {
- rc = -ENOMEM;
+ /* See if we tried to put too many bytes into "new"... */
+ if (result_size >= string_size) {
+ kfree(new);
goto out;
}
- strcat(debug_help_string, client_title);
-
- if (!at_boot) {
- for (i = 0; i < cdm_element_count; i++) {
- strcat(debug_help_string, "\t");
- strcat(debug_help_string, cdm_array[i].keyword);
- strcat(debug_help_string, "\n");
- }
- }
-
- strcat(debug_help_string, "\n");
- strcat(debug_help_string, kernel_title);
-
- for (i = 0; i < num_kmod_keyword_mask_map; i++) {
- strcat(debug_help_string, "\t");
- strcat(debug_help_string, s_kmod_keyword_mask_map[i].keyword);
- strcat(debug_help_string, "\n");
+ if (at_boot) {
+ debug_help_string = new;
+ } else {
+ mutex_lock(&orangefs_help_file_lock);
+ memset(debug_help_string, 0, DEBUG_HELP_STRING_SIZE);
+ strlcat(debug_help_string, new, string_size);
+ mutex_unlock(&orangefs_help_file_lock);
}
rc = 0;
-out:
-
- return rc;
+out: return rc;
}
@@ -959,8 +963,12 @@ int orangefs_debugfs_new_client_string(void __user *arg)
ret = copy_from_user(&client_debug_array_string,
(void __user *)arg,
ORANGEFS_MAX_DEBUG_STRING_LEN);
- if (ret != 0)
+
+ if (ret != 0) {
+ pr_info("%s: CLIENT_STRING: copy_from_user failed\n",
+ __func__);
return -EIO;
+ }
/*
* The real client-core makes an effort to ensure
@@ -975,45 +983,18 @@ int orangefs_debugfs_new_client_string(void __user *arg)
client_debug_array_string[ORANGEFS_MAX_DEBUG_STRING_LEN - 1] =
'\0';
- if (ret != 0) {
- pr_info("%s: CLIENT_STRING: copy_from_user failed\n",
- __func__);
- return -EIO;
- }
-
pr_info("%s: client debug array string has been received.\n",
__func__);
if (!help_string_initialized) {
- /* Free the "we don't know yet" default string... */
- kfree(debug_help_string);
-
- /* build a proper debug help string */
+ /* Build a proper debug help string. */
if (orangefs_prepare_debugfs_help_string(0)) {
gossip_err("%s: no debug help string \n",
__func__);
return -EIO;
}
- /* Replace the boilerplate boot-time debug-help file. */
- debugfs_remove(help_file_dentry);
-
- help_file_dentry =
- debugfs_create_file(
- ORANGEFS_KMOD_DEBUG_HELP_FILE,
- 0444,
- debug_dir,
- debug_help_string,
- &debug_help_fops);
-
- if (!help_file_dentry) {
- gossip_err("%s: debugfs_create_file failed for"
- " :%s:!\n",
- __func__,
- ORANGEFS_KMOD_DEBUG_HELP_FILE);
- return -EIO;
- }
}
debug_mask_to_string(&client_debug_mask, 1);
diff --git a/fs/orangefs/orangefs-kernel.h b/fs/orangefs/orangefs-kernel.h
index 0a82048f3aaf..3bf803d732c5 100644
--- a/fs/orangefs/orangefs-kernel.h
+++ b/fs/orangefs/orangefs-kernel.h
@@ -580,4 +580,11 @@ static inline void orangefs_i_size_write(struct inode *inode, loff_t i_size)
#endif
}
+static inline void orangefs_set_timeout(struct dentry *dentry)
+{
+ unsigned long time = jiffies + orangefs_dcache_timeout_msecs*HZ/1000;
+
+ dentry->d_fsdata = (void *) time;
+}
+
#endif /* __ORANGEFSKERNEL_H */
diff --git a/fs/orangefs/orangefs-mod.c b/fs/orangefs/orangefs-mod.c
index 2e5b03065f34..4113eb0495bf 100644
--- a/fs/orangefs/orangefs-mod.c
+++ b/fs/orangefs/orangefs-mod.c
@@ -124,7 +124,7 @@ static int __init orangefs_init(void)
* unknown at boot time.
*
* orangefs_prepare_debugfs_help_string will be used again
- * later to rebuild the debug-help file after the client starts
+ * later to rebuild the debug-help-string after the client starts
* and passes along the needed info. The argument signifies
* which time orangefs_prepare_debugfs_help_string is being
* called.
@@ -152,7 +152,9 @@ static int __init orangefs_init(void)
ret = register_filesystem(&orangefs_fs_type);
if (ret == 0) {
- pr_info("orangefs: module version %s loaded\n", ORANGEFS_VERSION);
+ pr_info("%s: module version %s loaded\n",
+ __func__,
+ ORANGEFS_VERSION);
ret = 0;
goto out;
}
diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c
index aeb60f791418..36795eed40b0 100644
--- a/fs/overlayfs/copy_up.c
+++ b/fs/overlayfs/copy_up.c
@@ -178,6 +178,8 @@ static int ovl_copy_up_data(struct path *old, struct path *new, loff_t len)
len -= bytes;
}
+ if (!error)
+ error = vfs_fsync(new_file, 0);
fput(new_file);
out_fput:
fput(old_file);
diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c
index c58f01babf30..7fb53d055537 100644
--- a/fs/overlayfs/inode.c
+++ b/fs/overlayfs/inode.c
@@ -270,9 +270,6 @@ struct posix_acl *ovl_get_acl(struct inode *inode, int type)
if (!IS_ENABLED(CONFIG_FS_POSIX_ACL) || !IS_POSIXACL(realinode))
return NULL;
- if (!realinode->i_op->get_acl)
- return NULL;
-
old_cred = ovl_override_creds(inode->i_sb);
acl = get_acl(realinode, type);
revert_creds(old_cred);
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index bcf3965be819..edd46a0e951d 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -1037,6 +1037,21 @@ ovl_posix_acl_xattr_set(const struct xattr_handler *handler,
posix_acl_release(acl);
+ /*
+ * Check if sgid bit needs to be cleared (actual setacl operation will
+ * be done with mounter's capabilities and so that won't do it for us).
+ */
+ if (unlikely(inode->i_mode & S_ISGID) &&
+ handler->flags == ACL_TYPE_ACCESS &&
+ !in_group_p(inode->i_gid) &&
+ !capable_wrt_inode_uidgid(inode, CAP_FSETID)) {
+ struct iattr iattr = { .ia_valid = ATTR_KILL_SGID };
+
+ err = ovl_setattr(dentry, &iattr);
+ if (err)
+ return err;
+ }
+
err = ovl_xattr_set(dentry, handler->name, value, size, flags);
if (!err)
ovl_copyattr(ovl_inode_real(inode, NULL), inode);
diff --git a/fs/proc/base.c b/fs/proc/base.c
index adfc5b4986f5..ca651ac00660 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1012,6 +1012,9 @@ static ssize_t auxv_read(struct file *file, char __user *buf,
{
struct mm_struct *mm = file->private_data;
unsigned int nwords = 0;
+
+ if (!mm)
+ return 0;
do {
nwords += 2;
} while (mm->saved_auxv[nwords - 2] != 0); /* AT_NULL */
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index bd4a5e8ce441..ca16c5d7bab1 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -543,6 +543,14 @@ out:
if (err != -ENOENT)
ubifs_err(c, "cannot find next direntry, error %d", err);
+ else
+ /*
+ * -ENOENT is a non-fatal error in this context, the TNC uses
+ * it to indicate that the cursor moved past the current directory
+ * and readdir() has to stop.
+ */
+ err = 0;
+
/* 2 is a special value indicating that there are no more direntries */
ctx->pos = 2;
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index c27344cf38e1..c6eb21940783 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -3974,9 +3974,6 @@ xfs_bmap_remap_alloc(
* allocating, so skip that check by pretending to be freeing.
*/
error = xfs_alloc_fix_freelist(&args, XFS_ALLOC_FLAG_FREEING);
- if (error)
- goto error0;
-error0:
xfs_perag_put(args.pag);
if (error)
trace_xfs_bmap_remap_alloc_error(ap->ip, error, _RET_IP_);
@@ -3999,6 +3996,39 @@ xfs_bmap_alloc(
return xfs_bmap_btalloc(ap);
}
+/* Trim extent to fit a logical block range. */
+void
+xfs_trim_extent(
+ struct xfs_bmbt_irec *irec,
+ xfs_fileoff_t bno,
+ xfs_filblks_t len)
+{
+ xfs_fileoff_t distance;
+ xfs_fileoff_t end = bno + len;
+
+ if (irec->br_startoff + irec->br_blockcount <= bno ||
+ irec->br_startoff >= end) {
+ irec->br_blockcount = 0;
+ return;
+ }
+
+ if (irec->br_startoff < bno) {
+ distance = bno - irec->br_startoff;
+ if (isnullstartblock(irec->br_startblock))
+ irec->br_startblock = DELAYSTARTBLOCK;
+ if (irec->br_startblock != DELAYSTARTBLOCK &&
+ irec->br_startblock != HOLESTARTBLOCK)
+ irec->br_startblock += distance;
+ irec->br_startoff += distance;
+ irec->br_blockcount -= distance;
+ }
+
+ if (end < irec->br_startoff + irec->br_blockcount) {
+ distance = irec->br_startoff + irec->br_blockcount - end;
+ irec->br_blockcount -= distance;
+ }
+}
+
/*
* Trim the returned map to the required bounds
*/
@@ -4829,6 +4859,219 @@ xfs_bmap_split_indlen(
return stolen;
}
+int
+xfs_bmap_del_extent_delay(
+ struct xfs_inode *ip,
+ int whichfork,
+ xfs_extnum_t *idx,
+ struct xfs_bmbt_irec *got,
+ struct xfs_bmbt_irec *del)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+ struct xfs_bmbt_irec new;
+ int64_t da_old, da_new, da_diff = 0;
+ xfs_fileoff_t del_endoff, got_endoff;
+ xfs_filblks_t got_indlen, new_indlen, stolen;
+ int error = 0, state = 0;
+ bool isrt;
+
+ XFS_STATS_INC(mp, xs_del_exlist);
+
+ isrt = (whichfork == XFS_DATA_FORK) && XFS_IS_REALTIME_INODE(ip);
+ del_endoff = del->br_startoff + del->br_blockcount;
+ got_endoff = got->br_startoff + got->br_blockcount;
+ da_old = startblockval(got->br_startblock);
+ da_new = 0;
+
+ ASSERT(*idx >= 0);
+ ASSERT(*idx < ifp->if_bytes / sizeof(struct xfs_bmbt_rec));
+ ASSERT(del->br_blockcount > 0);
+ ASSERT(got->br_startoff <= del->br_startoff);
+ ASSERT(got_endoff >= del_endoff);
+
+ if (isrt) {
+ int64_t rtexts = XFS_FSB_TO_B(mp, del->br_blockcount);
+
+ do_div(rtexts, mp->m_sb.sb_rextsize);
+ xfs_mod_frextents(mp, rtexts);
+ }
+
+ /*
+ * Update the inode delalloc counter now and wait to update the
+ * sb counters as we might have to borrow some blocks for the
+ * indirect block accounting.
+ */
+ xfs_trans_reserve_quota_nblks(NULL, ip, -((long)del->br_blockcount), 0,
+ isrt ? XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS);
+ ip->i_delayed_blks -= del->br_blockcount;
+
+ if (whichfork == XFS_COW_FORK)
+ state |= BMAP_COWFORK;
+
+ if (got->br_startoff == del->br_startoff)
+ state |= BMAP_LEFT_CONTIG;
+ if (got_endoff == del_endoff)
+ state |= BMAP_RIGHT_CONTIG;
+
+ switch (state & (BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG)) {
+ case BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
+ /*
+ * Matches the whole extent. Delete the entry.
+ */
+ xfs_iext_remove(ip, *idx, 1, state);
+ --*idx;
+ break;
+ case BMAP_LEFT_CONTIG:
+ /*
+ * Deleting the first part of the extent.
+ */
+ trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+ got->br_startoff = del_endoff;
+ got->br_blockcount -= del->br_blockcount;
+ da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip,
+ got->br_blockcount), da_old);
+ got->br_startblock = nullstartblock((int)da_new);
+ xfs_bmbt_set_all(xfs_iext_get_ext(ifp, *idx), got);
+ trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+ break;
+ case BMAP_RIGHT_CONTIG:
+ /*
+ * Deleting the last part of the extent.
+ */
+ trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+ got->br_blockcount = got->br_blockcount - del->br_blockcount;
+ da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip,
+ got->br_blockcount), da_old);
+ got->br_startblock = nullstartblock((int)da_new);
+ xfs_bmbt_set_all(xfs_iext_get_ext(ifp, *idx), got);
+ trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+ break;
+ case 0:
+ /*
+ * Deleting the middle of the extent.
+ *
+ * Distribute the original indlen reservation across the two new
+ * extents. Steal blocks from the deleted extent if necessary.
+ * Stealing blocks simply fudges the fdblocks accounting below.
+ * Warn if either of the new indlen reservations is zero as this
+ * can lead to delalloc problems.
+ */
+ trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+
+ got->br_blockcount = del->br_startoff - got->br_startoff;
+ got_indlen = xfs_bmap_worst_indlen(ip, got->br_blockcount);
+
+ new.br_blockcount = got_endoff - del_endoff;
+ new_indlen = xfs_bmap_worst_indlen(ip, new.br_blockcount);
+
+ WARN_ON_ONCE(!got_indlen || !new_indlen);
+ stolen = xfs_bmap_split_indlen(da_old, &got_indlen, &new_indlen,
+ del->br_blockcount);
+
+ got->br_startblock = nullstartblock((int)got_indlen);
+ xfs_bmbt_set_all(xfs_iext_get_ext(ifp, *idx), got);
+ trace_xfs_bmap_post_update(ip, *idx, 0, _THIS_IP_);
+
+ new.br_startoff = del_endoff;
+ new.br_state = got->br_state;
+ new.br_startblock = nullstartblock((int)new_indlen);
+
+ ++*idx;
+ xfs_iext_insert(ip, *idx, 1, &new, state);
+
+ da_new = got_indlen + new_indlen - stolen;
+ del->br_blockcount -= stolen;
+ break;
+ }
+
+ ASSERT(da_old >= da_new);
+ da_diff = da_old - da_new;
+ if (!isrt)
+ da_diff += del->br_blockcount;
+ if (da_diff)
+ xfs_mod_fdblocks(mp, da_diff, false);
+ return error;
+}
+
+void
+xfs_bmap_del_extent_cow(
+ struct xfs_inode *ip,
+ xfs_extnum_t *idx,
+ struct xfs_bmbt_irec *got,
+ struct xfs_bmbt_irec *del)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
+ struct xfs_bmbt_irec new;
+ xfs_fileoff_t del_endoff, got_endoff;
+ int state = BMAP_COWFORK;
+
+ XFS_STATS_INC(mp, xs_del_exlist);
+
+ del_endoff = del->br_startoff + del->br_blockcount;
+ got_endoff = got->br_startoff + got->br_blockcount;
+
+ ASSERT(*idx >= 0);
+ ASSERT(*idx < ifp->if_bytes / sizeof(struct xfs_bmbt_rec));
+ ASSERT(del->br_blockcount > 0);
+ ASSERT(got->br_startoff <= del->br_startoff);
+ ASSERT(got_endoff >= del_endoff);
+ ASSERT(!isnullstartblock(got->br_startblock));
+
+ if (got->br_startoff == del->br_startoff)
+ state |= BMAP_LEFT_CONTIG;
+ if (got_endoff == del_endoff)
+ state |= BMAP_RIGHT_CONTIG;
+
+ switch (state & (BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG)) {
+ case BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
+ /*
+ * Matches the whole extent. Delete the entry.
+ */
+ xfs_iext_remove(ip, *idx, 1, state);
+ --*idx;
+ break;
+ case BMAP_LEFT_CONTIG:
+ /*
+ * Deleting the first part of the extent.
+ */
+ trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+ got->br_startoff = del_endoff;
+ got->br_blockcount -= del->br_blockcount;
+ got->br_startblock = del->br_startblock + del->br_blockcount;
+ xfs_bmbt_set_all(xfs_iext_get_ext(ifp, *idx), got);
+ trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+ break;
+ case BMAP_RIGHT_CONTIG:
+ /*
+ * Deleting the last part of the extent.
+ */
+ trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+ got->br_blockcount -= del->br_blockcount;
+ xfs_bmbt_set_all(xfs_iext_get_ext(ifp, *idx), got);
+ trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+ break;
+ case 0:
+ /*
+ * Deleting the middle of the extent.
+ */
+ trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+ got->br_blockcount = del->br_startoff - got->br_startoff;
+ xfs_bmbt_set_all(xfs_iext_get_ext(ifp, *idx), got);
+ trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+
+ new.br_startoff = del_endoff;
+ new.br_blockcount = got_endoff - del_endoff;
+ new.br_state = got->br_state;
+ new.br_startblock = del->br_startblock + del->br_blockcount;
+
+ ++*idx;
+ xfs_iext_insert(ip, *idx, 1, &new, state);
+ break;
+ }
+}
+
/*
* Called by xfs_bmapi to update file extent records and the btree
* after removing space (or undoing a delayed allocation).
@@ -5171,175 +5414,6 @@ done:
return error;
}
-/* Remove an extent from the CoW fork. Similar to xfs_bmap_del_extent. */
-int
-xfs_bunmapi_cow(
- struct xfs_inode *ip,
- struct xfs_bmbt_irec *del)
-{
- xfs_filblks_t da_new;
- xfs_filblks_t da_old;
- xfs_fsblock_t del_endblock = 0;
- xfs_fileoff_t del_endoff;
- int delay;
- struct xfs_bmbt_rec_host *ep;
- int error;
- struct xfs_bmbt_irec got;
- xfs_fileoff_t got_endoff;
- struct xfs_ifork *ifp;
- struct xfs_mount *mp;
- xfs_filblks_t nblks;
- struct xfs_bmbt_irec new;
- /* REFERENCED */
- uint qfield;
- xfs_filblks_t temp;
- xfs_filblks_t temp2;
- int state = BMAP_COWFORK;
- int eof;
- xfs_extnum_t eidx;
-
- mp = ip->i_mount;
- XFS_STATS_INC(mp, xs_del_exlist);
-
- ep = xfs_bmap_search_extents(ip, del->br_startoff, XFS_COW_FORK, &eof,
- &eidx, &got, &new);
-
- ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); ifp = ifp;
- ASSERT((eidx >= 0) && (eidx < ifp->if_bytes /
- (uint)sizeof(xfs_bmbt_rec_t)));
- ASSERT(del->br_blockcount > 0);
- ASSERT(got.br_startoff <= del->br_startoff);
- del_endoff = del->br_startoff + del->br_blockcount;
- got_endoff = got.br_startoff + got.br_blockcount;
- ASSERT(got_endoff >= del_endoff);
- delay = isnullstartblock(got.br_startblock);
- ASSERT(isnullstartblock(del->br_startblock) == delay);
- qfield = 0;
- error = 0;
- /*
- * If deleting a real allocation, must free up the disk space.
- */
- if (!delay) {
- nblks = del->br_blockcount;
- qfield = XFS_TRANS_DQ_BCOUNT;
- /*
- * Set up del_endblock and cur for later.
- */
- del_endblock = del->br_startblock + del->br_blockcount;
- da_old = da_new = 0;
- } else {
- da_old = startblockval(got.br_startblock);
- da_new = 0;
- nblks = 0;
- }
- qfield = qfield;
- nblks = nblks;
-
- /*
- * Set flag value to use in switch statement.
- * Left-contig is 2, right-contig is 1.
- */
- switch (((got.br_startoff == del->br_startoff) << 1) |
- (got_endoff == del_endoff)) {
- case 3:
- /*
- * Matches the whole extent. Delete the entry.
- */
- xfs_iext_remove(ip, eidx, 1, BMAP_COWFORK);
- --eidx;
- break;
-
- case 2:
- /*
- * Deleting the first part of the extent.
- */
- trace_xfs_bmap_pre_update(ip, eidx, state, _THIS_IP_);
- xfs_bmbt_set_startoff(ep, del_endoff);
- temp = got.br_blockcount - del->br_blockcount;
- xfs_bmbt_set_blockcount(ep, temp);
- if (delay) {
- temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
- da_old);
- xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
- trace_xfs_bmap_post_update(ip, eidx, state, _THIS_IP_);
- da_new = temp;
- break;
- }
- xfs_bmbt_set_startblock(ep, del_endblock);
- trace_xfs_bmap_post_update(ip, eidx, state, _THIS_IP_);
- break;
-
- case 1:
- /*
- * Deleting the last part of the extent.
- */
- temp = got.br_blockcount - del->br_blockcount;
- trace_xfs_bmap_pre_update(ip, eidx, state, _THIS_IP_);
- xfs_bmbt_set_blockcount(ep, temp);
- if (delay) {
- temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
- da_old);
- xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
- trace_xfs_bmap_post_update(ip, eidx, state, _THIS_IP_);
- da_new = temp;
- break;
- }
- trace_xfs_bmap_post_update(ip, eidx, state, _THIS_IP_);
- break;
-
- case 0:
- /*
- * Deleting the middle of the extent.
- */
- temp = del->br_startoff - got.br_startoff;
- trace_xfs_bmap_pre_update(ip, eidx, state, _THIS_IP_);
- xfs_bmbt_set_blockcount(ep, temp);
- new.br_startoff = del_endoff;
- temp2 = got_endoff - del_endoff;
- new.br_blockcount = temp2;
- new.br_state = got.br_state;
- if (!delay) {
- new.br_startblock = del_endblock;
- } else {
- temp = xfs_bmap_worst_indlen(ip, temp);
- xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
- temp2 = xfs_bmap_worst_indlen(ip, temp2);
- new.br_startblock = nullstartblock((int)temp2);
- da_new = temp + temp2;
- while (da_new > da_old) {
- if (temp) {
- temp--;
- da_new--;
- xfs_bmbt_set_startblock(ep,
- nullstartblock((int)temp));
- }
- if (da_new == da_old)
- break;
- if (temp2) {
- temp2--;
- da_new--;
- new.br_startblock =
- nullstartblock((int)temp2);
- }
- }
- }
- trace_xfs_bmap_post_update(ip, eidx, state, _THIS_IP_);
- xfs_iext_insert(ip, eidx + 1, 1, &new, state);
- ++eidx;
- break;
- }
-
- /*
- * Account for change in delayed indirect blocks.
- * Nothing to do for disk quota accounting here.
- */
- ASSERT(da_old >= da_new);
- if (da_old > da_new)
- xfs_mod_fdblocks(mp, (int64_t)(da_old - da_new), false);
-
- return error;
-}
-
/*
* Unmap (remove) blocks from a file.
* If nexts is nonzero then the number of extents to remove is limited to
diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
index f97db7132564..7cae6ec27fa6 100644
--- a/fs/xfs/libxfs/xfs_bmap.h
+++ b/fs/xfs/libxfs/xfs_bmap.h
@@ -190,6 +190,8 @@ void xfs_bmap_trace_exlist(struct xfs_inode *ip, xfs_extnum_t cnt,
#define XFS_BMAP_TRACE_EXLIST(ip,c,w)
#endif
+void xfs_trim_extent(struct xfs_bmbt_irec *irec, xfs_fileoff_t bno,
+ xfs_filblks_t len);
int xfs_bmap_add_attrfork(struct xfs_inode *ip, int size, int rsvd);
void xfs_bmap_local_to_extents_empty(struct xfs_inode *ip, int whichfork);
void xfs_bmap_add_free(struct xfs_mount *mp, struct xfs_defer_ops *dfops,
@@ -221,7 +223,11 @@ int xfs_bunmapi(struct xfs_trans *tp, struct xfs_inode *ip,
xfs_fileoff_t bno, xfs_filblks_t len, int flags,
xfs_extnum_t nexts, xfs_fsblock_t *firstblock,
struct xfs_defer_ops *dfops, int *done);
-int xfs_bunmapi_cow(struct xfs_inode *ip, struct xfs_bmbt_irec *del);
+int xfs_bmap_del_extent_delay(struct xfs_inode *ip, int whichfork,
+ xfs_extnum_t *idx, struct xfs_bmbt_irec *got,
+ struct xfs_bmbt_irec *del);
+void xfs_bmap_del_extent_cow(struct xfs_inode *ip, xfs_extnum_t *idx,
+ struct xfs_bmbt_irec *got, struct xfs_bmbt_irec *del);
int xfs_check_nostate_extents(struct xfs_ifork *ifp, xfs_extnum_t idx,
xfs_extnum_t num);
uint xfs_default_attroffset(struct xfs_inode *ip);
diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index 5c8e6f2ce44f..0e80993c8a59 100644
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -4826,7 +4826,7 @@ xfs_btree_calc_size(
return rval;
}
-int
+static int
xfs_btree_count_blocks_helper(
struct xfs_btree_cur *cur,
int level,
diff --git a/fs/xfs/libxfs/xfs_dquot_buf.c b/fs/xfs/libxfs/xfs_dquot_buf.c
index 3cc3cf767474..ac9a003dd29a 100644
--- a/fs/xfs/libxfs/xfs_dquot_buf.c
+++ b/fs/xfs/libxfs/xfs_dquot_buf.c
@@ -191,8 +191,7 @@ xfs_dquot_buf_verify_crc(
if (mp->m_quotainfo)
ndquots = mp->m_quotainfo->qi_dqperchunk;
else
- ndquots = xfs_calc_dquots_per_chunk(
- XFS_BB_TO_FSB(mp, bp->b_length));
+ ndquots = xfs_calc_dquots_per_chunk(bp->b_length);
for (i = 0; i < ndquots; i++, d++) {
if (!xfs_verify_cksum((char *)d, sizeof(struct xfs_dqblk),
diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
index f6547fc5e016..6b7579e7b60a 100644
--- a/fs/xfs/libxfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -865,7 +865,6 @@ typedef struct xfs_timestamp {
* padding field for v3 inodes.
*/
#define XFS_DINODE_MAGIC 0x494e /* 'IN' */
-#define XFS_DINODE_GOOD_VERSION(v) ((v) >= 1 && (v) <= 3)
typedef struct xfs_dinode {
__be16 di_magic; /* inode magic # = XFS_DINODE_MAGIC */
__be16 di_mode; /* mode and type of file */
diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c
index 8de9a3a29589..134424fac434 100644
--- a/fs/xfs/libxfs/xfs_inode_buf.c
+++ b/fs/xfs/libxfs/xfs_inode_buf.c
@@ -57,6 +57,17 @@ xfs_inobp_check(
}
#endif
+bool
+xfs_dinode_good_version(
+ struct xfs_mount *mp,
+ __u8 version)
+{
+ if (xfs_sb_version_hascrc(&mp->m_sb))
+ return version == 3;
+
+ return version == 1 || version == 2;
+}
+
/*
* If we are doing readahead on an inode buffer, we might be in log recovery
* reading an inode allocation buffer that hasn't yet been replayed, and hence
@@ -91,7 +102,7 @@ xfs_inode_buf_verify(
dip = xfs_buf_offset(bp, (i << mp->m_sb.sb_inodelog));
di_ok = dip->di_magic == cpu_to_be16(XFS_DINODE_MAGIC) &&
- XFS_DINODE_GOOD_VERSION(dip->di_version);
+ xfs_dinode_good_version(mp, dip->di_version);
if (unlikely(XFS_TEST_ERROR(!di_ok, mp,
XFS_ERRTAG_ITOBP_INOTOBP,
XFS_RANDOM_ITOBP_INOTOBP))) {
diff --git a/fs/xfs/libxfs/xfs_inode_buf.h b/fs/xfs/libxfs/xfs_inode_buf.h
index 62d9d4681c8c..3cfe12a4f58a 100644
--- a/fs/xfs/libxfs/xfs_inode_buf.h
+++ b/fs/xfs/libxfs/xfs_inode_buf.h
@@ -74,6 +74,8 @@ void xfs_inode_from_disk(struct xfs_inode *ip, struct xfs_dinode *from);
void xfs_log_dinode_to_disk(struct xfs_log_dinode *from,
struct xfs_dinode *to);
+bool xfs_dinode_good_version(struct xfs_mount *mp, __u8 version);
+
#if defined(DEBUG)
void xfs_inobp_check(struct xfs_mount *, struct xfs_buf *);
#else
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index a314fc7b56fa..6e4f7f900fea 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -249,6 +249,7 @@ xfs_file_dio_aio_read(
struct xfs_inode *ip = XFS_I(inode);
loff_t isize = i_size_read(inode);
size_t count = iov_iter_count(to);
+ loff_t end = iocb->ki_pos + count - 1;
struct iov_iter data;
struct xfs_buftarg *target;
ssize_t ret = 0;
@@ -272,49 +273,21 @@ xfs_file_dio_aio_read(
file_accessed(iocb->ki_filp);
- /*
- * Locking is a bit tricky here. If we take an exclusive lock for direct
- * IO, we effectively serialise all new concurrent read IO to this file
- * and block it behind IO that is currently in progress because IO in
- * progress holds the IO lock shared. We only need to hold the lock
- * exclusive to blow away the page cache, so only take lock exclusively
- * if the page cache needs invalidation. This allows the normal direct
- * IO case of no page cache pages to proceeed concurrently without
- * serialisation.
- */
xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
if (mapping->nrpages) {
- xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
- xfs_rw_ilock(ip, XFS_IOLOCK_EXCL);
+ ret = filemap_write_and_wait_range(mapping, iocb->ki_pos, end);
+ if (ret)
+ goto out_unlock;
/*
- * The generic dio code only flushes the range of the particular
- * I/O. Because we take an exclusive lock here, this whole
- * sequence is considerably more expensive for us. This has a
- * noticeable performance impact for any file with cached pages,
- * even when outside of the range of the particular I/O.
- *
- * Hence, amortize the cost of the lock against a full file
- * flush and reduce the chances of repeated iolock cycles going
- * forward.
+ * Invalidate whole pages. This can return an error if we fail
+ * to invalidate a page, but this should never happen on XFS.
+ * Warn if it does fail.
*/
- if (mapping->nrpages) {
- ret = filemap_write_and_wait(mapping);
- if (ret) {
- xfs_rw_iunlock(ip, XFS_IOLOCK_EXCL);
- return ret;
- }
-
- /*
- * Invalidate whole pages. This can return an error if
- * we fail to invalidate a page, but this should never
- * happen on XFS. Warn if it does fail.
- */
- ret = invalidate_inode_pages2(mapping);
- WARN_ON_ONCE(ret);
- ret = 0;
- }
- xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
+ ret = invalidate_inode_pages2_range(mapping,
+ iocb->ki_pos >> PAGE_SHIFT, end >> PAGE_SHIFT);
+ WARN_ON_ONCE(ret);
+ ret = 0;
}
data = *to;
@@ -324,8 +297,9 @@ xfs_file_dio_aio_read(
iocb->ki_pos += ret;
iov_iter_advance(to, ret);
}
- xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
+out_unlock:
+ xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
return ret;
}
@@ -570,61 +544,49 @@ xfs_file_dio_aio_write(
if ((iocb->ki_pos | count) & target->bt_logical_sectormask)
return -EINVAL;
- /* "unaligned" here means not aligned to a filesystem block */
- if ((iocb->ki_pos & mp->m_blockmask) ||
- ((iocb->ki_pos + count) & mp->m_blockmask))
- unaligned_io = 1;
-
/*
- * We don't need to take an exclusive lock unless there page cache needs
- * to be invalidated or unaligned IO is being executed. We don't need to
- * consider the EOF extension case here because
- * xfs_file_aio_write_checks() will relock the inode as necessary for
- * EOF zeroing cases and fill out the new inode size as appropriate.
+ * Don't take the exclusive iolock here unless the I/O is unaligned to
+ * the file system block size. We don't need to consider the EOF
+ * extension case here because xfs_file_aio_write_checks() will relock
+ * the inode as necessary for EOF zeroing cases and fill out the new
+ * inode size as appropriate.
*/
- if (unaligned_io || mapping->nrpages)
+ if ((iocb->ki_pos & mp->m_blockmask) ||
+ ((iocb->ki_pos + count) & mp->m_blockmask)) {
+ unaligned_io = 1;
iolock = XFS_IOLOCK_EXCL;
- else
+ } else {
iolock = XFS_IOLOCK_SHARED;
- xfs_rw_ilock(ip, iolock);
-
- /*
- * Recheck if there are cached pages that need invalidate after we got
- * the iolock to protect against other threads adding new pages while
- * we were waiting for the iolock.
- */
- if (mapping->nrpages && iolock == XFS_IOLOCK_SHARED) {
- xfs_rw_iunlock(ip, iolock);
- iolock = XFS_IOLOCK_EXCL;
- xfs_rw_ilock(ip, iolock);
}
+ xfs_rw_ilock(ip, iolock);
+
ret = xfs_file_aio_write_checks(iocb, from, &iolock);
if (ret)
goto out;
count = iov_iter_count(from);
end = iocb->ki_pos + count - 1;
- /*
- * See xfs_file_dio_aio_read() for why we do a full-file flush here.
- */
if (mapping->nrpages) {
- ret = filemap_write_and_wait(VFS_I(ip)->i_mapping);
+ ret = filemap_write_and_wait_range(mapping, iocb->ki_pos, end);
if (ret)
goto out;
+
/*
* Invalidate whole pages. This can return an error if we fail
* to invalidate a page, but this should never happen on XFS.
* Warn if it does fail.
*/
- ret = invalidate_inode_pages2(VFS_I(ip)->i_mapping);
+ ret = invalidate_inode_pages2_range(mapping,
+ iocb->ki_pos >> PAGE_SHIFT, end >> PAGE_SHIFT);
WARN_ON_ONCE(ret);
ret = 0;
}
/*
* If we are doing unaligned IO, wait for all other IO to drain,
- * otherwise demote the lock if we had to flush cached pages
+ * otherwise demote the lock if we had to take the exclusive lock
+ * for other reasons in xfs_file_aio_write_checks.
*/
if (unaligned_io)
inode_dio_wait(inode);
@@ -947,134 +909,6 @@ out_unlock:
return error;
}
-/*
- * Flush all file writes out to disk.
- */
-static int
-xfs_file_wait_for_io(
- struct inode *inode,
- loff_t offset,
- size_t len)
-{
- loff_t rounding;
- loff_t ioffset;
- loff_t iendoffset;
- loff_t bs;
- int ret;
-
- bs = inode->i_sb->s_blocksize;
- inode_dio_wait(inode);
-
- rounding = max_t(xfs_off_t, bs, PAGE_SIZE);
- ioffset = round_down(offset, rounding);
- iendoffset = round_up(offset + len, rounding) - 1;
- ret = filemap_write_and_wait_range(inode->i_mapping, ioffset,
- iendoffset);
- return ret;
-}
-
-/* Hook up to the VFS reflink function */
-STATIC int
-xfs_file_share_range(
- struct file *file_in,
- loff_t pos_in,
- struct file *file_out,
- loff_t pos_out,
- u64 len,
- bool is_dedupe)
-{
- struct inode *inode_in;
- struct inode *inode_out;
- ssize_t ret;
- loff_t bs;
- loff_t isize;
- int same_inode;
- loff_t blen;
- unsigned int flags = 0;
-
- inode_in = file_inode(file_in);
- inode_out = file_inode(file_out);
- bs = inode_out->i_sb->s_blocksize;
-
- /* Don't touch certain kinds of inodes */
- if (IS_IMMUTABLE(inode_out))
- return -EPERM;
- if (IS_SWAPFILE(inode_in) ||
- IS_SWAPFILE(inode_out))
- return -ETXTBSY;
-
- /* Reflink only works within this filesystem. */
- if (inode_in->i_sb != inode_out->i_sb)
- return -EXDEV;
- same_inode = (inode_in->i_ino == inode_out->i_ino);
-
- /* Don't reflink dirs, pipes, sockets... */
- if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
- return -EISDIR;
- if (S_ISFIFO(inode_in->i_mode) || S_ISFIFO(inode_out->i_mode))
- return -EINVAL;
- if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
- return -EINVAL;
-
- /* Don't share DAX file data for now. */
- if (IS_DAX(inode_in) || IS_DAX(inode_out))
- return -EINVAL;
-
- /* Are we going all the way to the end? */
- isize = i_size_read(inode_in);
- if (isize == 0)
- return 0;
- if (len == 0)
- len = isize - pos_in;
-
- /* Ensure offsets don't wrap and the input is inside i_size */
- if (pos_in + len < pos_in || pos_out + len < pos_out ||
- pos_in + len > isize)
- return -EINVAL;
-
- /* Don't allow dedupe past EOF in the dest file */
- if (is_dedupe) {
- loff_t disize;
-
- disize = i_size_read(inode_out);
- if (pos_out >= disize || pos_out + len > disize)
- return -EINVAL;
- }
-
- /* If we're linking to EOF, continue to the block boundary. */
- if (pos_in + len == isize)
- blen = ALIGN(isize, bs) - pos_in;
- else
- blen = len;
-
- /* Only reflink if we're aligned to block boundaries */
- if (!IS_ALIGNED(pos_in, bs) || !IS_ALIGNED(pos_in + blen, bs) ||
- !IS_ALIGNED(pos_out, bs) || !IS_ALIGNED(pos_out + blen, bs))
- return -EINVAL;
-
- /* Don't allow overlapped reflink within the same file */
- if (same_inode && pos_out + blen > pos_in && pos_out < pos_in + blen)
- return -EINVAL;
-
- /* Wait for the completion of any pending IOs on srcfile */
- ret = xfs_file_wait_for_io(inode_in, pos_in, len);
- if (ret)
- goto out;
- ret = xfs_file_wait_for_io(inode_out, pos_out, len);
- if (ret)
- goto out;
-
- if (is_dedupe)
- flags |= XFS_REFLINK_DEDUPE;
- ret = xfs_reflink_remap_range(XFS_I(inode_in), pos_in, XFS_I(inode_out),
- pos_out, len, flags);
- if (ret < 0)
- goto out;
-
-out:
- return ret;
-}
-
STATIC ssize_t
xfs_file_copy_range(
struct file *file_in,
@@ -1086,7 +920,7 @@ xfs_file_copy_range(
{
int error;
- error = xfs_file_share_range(file_in, pos_in, file_out, pos_out,
+ error = xfs_reflink_remap_range(file_in, pos_in, file_out, pos_out,
len, false);
if (error)
return error;
@@ -1101,7 +935,7 @@ xfs_file_clone_range(
loff_t pos_out,
u64 len)
{
- return xfs_file_share_range(file_in, pos_in, file_out, pos_out,
+ return xfs_reflink_remap_range(file_in, pos_in, file_out, pos_out,
len, false);
}
@@ -1124,7 +958,7 @@ xfs_file_dedupe_range(
if (len > XFS_MAX_DEDUPE_LEN)
len = XFS_MAX_DEDUPE_LEN;
- error = xfs_file_share_range(src_file, loff, dst_file, dst_loff,
+ error = xfs_reflink_remap_range(src_file, loff, dst_file, dst_loff,
len, true);
if (error)
return error;
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 14796b744e0a..f295049db681 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -1656,9 +1656,9 @@ void
xfs_inode_set_cowblocks_tag(
xfs_inode_t *ip)
{
- trace_xfs_inode_set_eofblocks_tag(ip);
+ trace_xfs_inode_set_cowblocks_tag(ip);
return __xfs_inode_set_eofblocks_tag(ip, xfs_queue_cowblocks,
- trace_xfs_perag_set_eofblocks,
+ trace_xfs_perag_set_cowblocks,
XFS_ICI_COWBLOCKS_TAG);
}
@@ -1666,7 +1666,7 @@ void
xfs_inode_clear_cowblocks_tag(
xfs_inode_t *ip)
{
- trace_xfs_inode_clear_eofblocks_tag(ip);
+ trace_xfs_inode_clear_cowblocks_tag(ip);
return __xfs_inode_clear_eofblocks_tag(ip,
- trace_xfs_perag_clear_eofblocks, XFS_ICI_COWBLOCKS_TAG);
+ trace_xfs_perag_clear_cowblocks, XFS_ICI_COWBLOCKS_TAG);
}
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index d907eb9f8ef3..436e109bb01e 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -566,6 +566,17 @@ xfs_file_iomap_begin_delay(
xfs_bmap_search_extents(ip, offset_fsb, XFS_DATA_FORK, &eof, &idx,
&got, &prev);
if (!eof && got.br_startoff <= offset_fsb) {
+ if (xfs_is_reflink_inode(ip)) {
+ bool shared;
+
+ end_fsb = min(XFS_B_TO_FSB(mp, offset + count),
+ maxbytes_fsb);
+ xfs_trim_extent(&got, offset_fsb, end_fsb - offset_fsb);
+ error = xfs_reflink_reserve_cow(ip, &got, &shared);
+ if (error)
+ goto out_unlock;
+ }
+
trace_xfs_iomap_found(ip, offset, count, 0, &got);
goto done;
}
@@ -961,19 +972,13 @@ xfs_file_iomap_begin(
struct xfs_mount *mp = ip->i_mount;
struct xfs_bmbt_irec imap;
xfs_fileoff_t offset_fsb, end_fsb;
- bool shared, trimmed;
int nimaps = 1, error = 0;
+ bool shared = false, trimmed = false;
unsigned lockmode;
if (XFS_FORCED_SHUTDOWN(mp))
return -EIO;
- if ((flags & (IOMAP_WRITE | IOMAP_ZERO)) && xfs_is_reflink_inode(ip)) {
- error = xfs_reflink_reserve_cow_range(ip, offset, length);
- if (error < 0)
- return error;
- }
-
if ((flags & IOMAP_WRITE) && !IS_DAX(inode) &&
!xfs_get_extsz_hint(ip)) {
/* Reserve delalloc blocks for regular writeback. */
@@ -981,7 +986,16 @@ xfs_file_iomap_begin(
iomap);
}
- lockmode = xfs_ilock_data_map_shared(ip);
+ /*
+ * COW writes will allocate delalloc space, so we need to make sure
+ * to take the lock exclusively here.
+ */
+ if ((flags & (IOMAP_WRITE | IOMAP_ZERO)) && xfs_is_reflink_inode(ip)) {
+ lockmode = XFS_ILOCK_EXCL;
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ } else {
+ lockmode = xfs_ilock_data_map_shared(ip);
+ }
ASSERT(offset <= mp->m_super->s_maxbytes);
if ((xfs_fsize_t)offset + length > mp->m_super->s_maxbytes)
@@ -991,16 +1005,24 @@ xfs_file_iomap_begin(
error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap,
&nimaps, 0);
- if (error) {
- xfs_iunlock(ip, lockmode);
- return error;
+ if (error)
+ goto out_unlock;
+
+ if (flags & IOMAP_REPORT) {
+ /* Trim the mapping to the nearest shared extent boundary. */
+ error = xfs_reflink_trim_around_shared(ip, &imap, &shared,
+ &trimmed);
+ if (error)
+ goto out_unlock;
}
- /* Trim the mapping to the nearest shared extent boundary. */
- error = xfs_reflink_trim_around_shared(ip, &imap, &shared, &trimmed);
- if (error) {
- xfs_iunlock(ip, lockmode);
- return error;
+ if ((flags & (IOMAP_WRITE | IOMAP_ZERO)) && xfs_is_reflink_inode(ip)) {
+ error = xfs_reflink_reserve_cow(ip, &imap, &shared);
+ if (error)
+ goto out_unlock;
+
+ end_fsb = imap.br_startoff + imap.br_blockcount;
+ length = XFS_FSB_TO_B(mp, end_fsb) - offset;
}
if ((flags & IOMAP_WRITE) && imap_needs_alloc(inode, &imap, nimaps)) {
@@ -1039,6 +1061,9 @@ xfs_file_iomap_begin(
if (shared)
iomap->flags |= IOMAP_F_SHARED;
return 0;
+out_unlock:
+ xfs_iunlock(ip, lockmode);
+ return error;
}
static int
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index fc7873942bea..b341f10cf481 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -1009,6 +1009,7 @@ xfs_mountfs(
out_quota:
xfs_qm_unmount_quotas(mp);
out_rtunmount:
+ mp->m_super->s_flags &= ~MS_ACTIVE;
xfs_rtunmount_inodes(mp);
out_rele_rip:
IRELE(rip);
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index 5965e9455d91..a279b4e7f5fe 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -182,7 +182,8 @@ xfs_reflink_trim_around_shared(
if (!xfs_is_reflink_inode(ip) ||
ISUNWRITTEN(irec) ||
irec->br_startblock == HOLESTARTBLOCK ||
- irec->br_startblock == DELAYSTARTBLOCK) {
+ irec->br_startblock == DELAYSTARTBLOCK ||
+ isnullstartblock(irec->br_startblock)) {
*shared = false;
return 0;
}
@@ -227,50 +228,54 @@ xfs_reflink_trim_around_shared(
}
}
-/* Create a CoW reservation for a range of blocks within a file. */
-static int
-__xfs_reflink_reserve_cow(
+/*
+ * Trim the passed in imap to the next shared/unshared extent boundary, and
+ * if imap->br_startoff points to a shared extent reserve space for it in the
+ * COW fork. In this case *shared is set to true, else to false.
+ *
+ * Note that imap will always contain the block numbers for the existing blocks
+ * in the data fork, as the upper layers need them for read-modify-write
+ * operations.
+ */
+int
+xfs_reflink_reserve_cow(
struct xfs_inode *ip,
- xfs_fileoff_t *offset_fsb,
- xfs_fileoff_t end_fsb,
- bool *skipped)
+ struct xfs_bmbt_irec *imap,
+ bool *shared)
{
- struct xfs_bmbt_irec got, prev, imap;
- xfs_fileoff_t orig_end_fsb;
- int nimaps, eof = 0, error = 0;
- bool shared = false, trimmed = false;
+ struct xfs_bmbt_irec got, prev;
+ xfs_fileoff_t end_fsb, orig_end_fsb;
+ int eof = 0, error = 0;
+ bool trimmed;
xfs_extnum_t idx;
xfs_extlen_t align;
- /* Already reserved? Skip the refcount btree access. */
- xfs_bmap_search_extents(ip, *offset_fsb, XFS_COW_FORK, &eof, &idx,
+ /*
+ * Search the COW fork extent list first. This serves two purposes:
+ * first this implement the speculative preallocation using cowextisze,
+ * so that we also unshared block adjacent to shared blocks instead
+ * of just the shared blocks themselves. Second the lookup in the
+ * extent list is generally faster than going out to the shared extent
+ * tree.
+ */
+ xfs_bmap_search_extents(ip, imap->br_startoff, XFS_COW_FORK, &eof, &idx,
&got, &prev);
- if (!eof && got.br_startoff <= *offset_fsb) {
- end_fsb = orig_end_fsb = got.br_startoff + got.br_blockcount;
- trace_xfs_reflink_cow_found(ip, &got);
- goto done;
- }
+ if (!eof && got.br_startoff <= imap->br_startoff) {
+ trace_xfs_reflink_cow_found(ip, imap);
+ xfs_trim_extent(imap, got.br_startoff, got.br_blockcount);
- /* Read extent from the source file. */
- nimaps = 1;
- error = xfs_bmapi_read(ip, *offset_fsb, end_fsb - *offset_fsb,
- &imap, &nimaps, 0);
- if (error)
- goto out_unlock;
- ASSERT(nimaps == 1);
+ *shared = true;
+ return 0;
+ }
/* Trim the mapping to the nearest shared extent boundary. */
- error = xfs_reflink_trim_around_shared(ip, &imap, &shared, &trimmed);
+ error = xfs_reflink_trim_around_shared(ip, imap, shared, &trimmed);
if (error)
- goto out_unlock;
-
- end_fsb = orig_end_fsb = imap.br_startoff + imap.br_blockcount;
+ return error;
/* Not shared? Just report the (potentially capped) extent. */
- if (!shared) {
- *skipped = true;
- goto done;
- }
+ if (!*shared)
+ return 0;
/*
* Fork all the shared blocks from our write offset until the end of
@@ -278,72 +283,38 @@ __xfs_reflink_reserve_cow(
*/
error = xfs_qm_dqattach_locked(ip, 0);
if (error)
- goto out_unlock;
+ return error;
+
+ end_fsb = orig_end_fsb = imap->br_startoff + imap->br_blockcount;
align = xfs_eof_alignment(ip, xfs_get_cowextsz_hint(ip));
if (align)
end_fsb = roundup_64(end_fsb, align);
retry:
- error = xfs_bmapi_reserve_delalloc(ip, XFS_COW_FORK, *offset_fsb,
- end_fsb - *offset_fsb, &got,
- &prev, &idx, eof);
+ error = xfs_bmapi_reserve_delalloc(ip, XFS_COW_FORK, imap->br_startoff,
+ end_fsb - imap->br_startoff, &got, &prev, &idx, eof);
switch (error) {
case 0:
break;
case -ENOSPC:
case -EDQUOT:
/* retry without any preallocation */
- trace_xfs_reflink_cow_enospc(ip, &imap);
+ trace_xfs_reflink_cow_enospc(ip, imap);
if (end_fsb != orig_end_fsb) {
end_fsb = orig_end_fsb;
goto retry;
}
/*FALLTHRU*/
default:
- goto out_unlock;
+ return error;
}
if (end_fsb != orig_end_fsb)
xfs_inode_set_cowblocks_tag(ip);
trace_xfs_reflink_cow_alloc(ip, &got);
-done:
- *offset_fsb = end_fsb;
-out_unlock:
- return error;
-}
-
-/* Create a CoW reservation for part of a file. */
-int
-xfs_reflink_reserve_cow_range(
- struct xfs_inode *ip,
- xfs_off_t offset,
- xfs_off_t count)
-{
- struct xfs_mount *mp = ip->i_mount;
- xfs_fileoff_t offset_fsb, end_fsb;
- bool skipped = false;
- int error;
-
- trace_xfs_reflink_reserve_cow_range(ip, offset, count);
-
- offset_fsb = XFS_B_TO_FSBT(mp, offset);
- end_fsb = XFS_B_TO_FSB(mp, offset + count);
-
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- while (offset_fsb < end_fsb) {
- error = __xfs_reflink_reserve_cow(ip, &offset_fsb, end_fsb,
- &skipped);
- if (error) {
- trace_xfs_reflink_reserve_cow_range_error(ip, error,
- _RET_IP_);
- break;
- }
- }
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
-
- return error;
+ return 0;
}
/* Allocate all CoW reservations covering a range of blocks in a file. */
@@ -358,9 +329,8 @@ __xfs_reflink_allocate_cow(
struct xfs_defer_ops dfops;
struct xfs_trans *tp;
xfs_fsblock_t first_block;
- xfs_fileoff_t next_fsb;
int nimaps = 1, error;
- bool skipped = false;
+ bool shared;
xfs_defer_init(&dfops, &first_block);
@@ -371,33 +341,38 @@ __xfs_reflink_allocate_cow(
xfs_ilock(ip, XFS_ILOCK_EXCL);
- next_fsb = *offset_fsb;
- error = __xfs_reflink_reserve_cow(ip, &next_fsb, end_fsb, &skipped);
+ /* Read extent from the source file. */
+ nimaps = 1;
+ error = xfs_bmapi_read(ip, *offset_fsb, end_fsb - *offset_fsb,
+ &imap, &nimaps, 0);
+ if (error)
+ goto out_unlock;
+ ASSERT(nimaps == 1);
+
+ error = xfs_reflink_reserve_cow(ip, &imap, &shared);
if (error)
goto out_trans_cancel;
- if (skipped) {
- *offset_fsb = next_fsb;
+ if (!shared) {
+ *offset_fsb = imap.br_startoff + imap.br_blockcount;
goto out_trans_cancel;
}
xfs_trans_ijoin(tp, ip, 0);
- error = xfs_bmapi_write(tp, ip, *offset_fsb, next_fsb - *offset_fsb,
+ error = xfs_bmapi_write(tp, ip, imap.br_startoff, imap.br_blockcount,
XFS_BMAPI_COWFORK, &first_block,
XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK),
&imap, &nimaps, &dfops);
if (error)
goto out_trans_cancel;
- /* We might not have been able to map the whole delalloc extent */
- *offset_fsb = min(*offset_fsb + imap.br_blockcount, next_fsb);
-
error = xfs_defer_finish(&tp, &dfops, NULL);
if (error)
goto out_trans_cancel;
error = xfs_trans_commit(tp);
+ *offset_fsb = imap.br_startoff + imap.br_blockcount;
out_unlock:
xfs_iunlock(ip, XFS_ILOCK_EXCL);
return error;
@@ -536,58 +511,49 @@ xfs_reflink_cancel_cow_blocks(
xfs_fileoff_t offset_fsb,
xfs_fileoff_t end_fsb)
{
- struct xfs_bmbt_irec irec;
- xfs_filblks_t count_fsb;
+ struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
+ struct xfs_bmbt_irec got, prev, del;
+ xfs_extnum_t idx;
xfs_fsblock_t firstfsb;
struct xfs_defer_ops dfops;
- int error = 0;
- int nimaps;
+ int error = 0, eof = 0;
if (!xfs_is_reflink_inode(ip))
return 0;
- /* Go find the old extent in the CoW fork. */
- while (offset_fsb < end_fsb) {
- nimaps = 1;
- count_fsb = (xfs_filblks_t)(end_fsb - offset_fsb);
- error = xfs_bmapi_read(ip, offset_fsb, count_fsb, &irec,
- &nimaps, XFS_BMAPI_COWFORK);
- if (error)
- break;
- ASSERT(nimaps == 1);
-
- trace_xfs_reflink_cancel_cow(ip, &irec);
+ xfs_bmap_search_extents(ip, offset_fsb, XFS_COW_FORK, &eof, &idx,
+ &got, &prev);
+ if (eof)
+ return 0;
- if (irec.br_startblock == DELAYSTARTBLOCK) {
- /* Free a delayed allocation. */
- xfs_mod_fdblocks(ip->i_mount, irec.br_blockcount,
- false);
- ip->i_delayed_blks -= irec.br_blockcount;
+ while (got.br_startoff < end_fsb) {
+ del = got;
+ xfs_trim_extent(&del, offset_fsb, end_fsb - offset_fsb);
+ trace_xfs_reflink_cancel_cow(ip, &del);
- /* Remove the mapping from the CoW fork. */
- error = xfs_bunmapi_cow(ip, &irec);
+ if (isnullstartblock(del.br_startblock)) {
+ error = xfs_bmap_del_extent_delay(ip, XFS_COW_FORK,
+ &idx, &got, &del);
if (error)
break;
- } else if (irec.br_startblock == HOLESTARTBLOCK) {
- /* empty */
} else {
xfs_trans_ijoin(*tpp, ip, 0);
xfs_defer_init(&dfops, &firstfsb);
/* Free the CoW orphan record. */
error = xfs_refcount_free_cow_extent(ip->i_mount,
- &dfops, irec.br_startblock,
- irec.br_blockcount);
+ &dfops, del.br_startblock,
+ del.br_blockcount);
if (error)
break;
xfs_bmap_add_free(ip->i_mount, &dfops,
- irec.br_startblock, irec.br_blockcount,
+ del.br_startblock, del.br_blockcount,
NULL);
/* Update quota accounting */
xfs_trans_mod_dquot_byino(*tpp, ip, XFS_TRANS_DQ_BCOUNT,
- -(long)irec.br_blockcount);
+ -(long)del.br_blockcount);
/* Roll the transaction */
error = xfs_defer_finish(tpp, &dfops, ip);
@@ -597,15 +563,18 @@ xfs_reflink_cancel_cow_blocks(
}
/* Remove the mapping from the CoW fork. */
- error = xfs_bunmapi_cow(ip, &irec);
- if (error)
- break;
+ xfs_bmap_del_extent_cow(ip, &idx, &got, &del);
}
- /* Roll on... */
- offset_fsb = irec.br_startoff + irec.br_blockcount;
+ if (++idx >= ifp->if_bytes / sizeof(struct xfs_bmbt_rec))
+ break;
+ xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx), &got);
}
+ /* clear tag if cow fork is emptied */
+ if (!ifp->if_bytes)
+ xfs_inode_clear_cowblocks_tag(ip);
+
return error;
}
@@ -668,25 +637,26 @@ xfs_reflink_end_cow(
xfs_off_t offset,
xfs_off_t count)
{
- struct xfs_bmbt_irec irec;
- struct xfs_bmbt_irec uirec;
+ struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
+ struct xfs_bmbt_irec got, prev, del;
struct xfs_trans *tp;
xfs_fileoff_t offset_fsb;
xfs_fileoff_t end_fsb;
- xfs_filblks_t count_fsb;
xfs_fsblock_t firstfsb;
struct xfs_defer_ops dfops;
- int error;
+ int error, eof = 0;
unsigned int resblks;
- xfs_filblks_t ilen;
xfs_filblks_t rlen;
- int nimaps;
+ xfs_extnum_t idx;
trace_xfs_reflink_end_cow(ip, offset, count);
+ /* No COW extents? That's easy! */
+ if (ifp->if_bytes == 0)
+ return 0;
+
offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
end_fsb = XFS_B_TO_FSB(ip->i_mount, offset + count);
- count_fsb = (xfs_filblks_t)(end_fsb - offset_fsb);
/* Start a rolling transaction to switch the mappings */
resblks = XFS_EXTENTADD_SPACE_RES(ip->i_mount, XFS_DATA_FORK);
@@ -698,72 +668,65 @@ xfs_reflink_end_cow(
xfs_ilock(ip, XFS_ILOCK_EXCL);
xfs_trans_ijoin(tp, ip, 0);
- /* Go find the old extent in the CoW fork. */
- while (offset_fsb < end_fsb) {
- /* Read extent from the source file */
- nimaps = 1;
- count_fsb = (xfs_filblks_t)(end_fsb - offset_fsb);
- error = xfs_bmapi_read(ip, offset_fsb, count_fsb, &irec,
- &nimaps, XFS_BMAPI_COWFORK);
- if (error)
- goto out_cancel;
- ASSERT(nimaps == 1);
+ xfs_bmap_search_extents(ip, end_fsb - 1, XFS_COW_FORK, &eof, &idx,
+ &got, &prev);
- ASSERT(irec.br_startblock != DELAYSTARTBLOCK);
- trace_xfs_reflink_cow_remap(ip, &irec);
+ /* If there is a hole at end_fsb - 1 go to the previous extent */
+ if (eof || got.br_startoff > end_fsb) {
+ ASSERT(idx > 0);
+ xfs_bmbt_get_all(xfs_iext_get_ext(ifp, --idx), &got);
+ }
- /*
- * We can have a hole in the CoW fork if part of a directio
- * write is CoW but part of it isn't.
- */
- rlen = ilen = irec.br_blockcount;
- if (irec.br_startblock == HOLESTARTBLOCK)
+ /* Walk backwards until we're out of the I/O range... */
+ while (got.br_startoff + got.br_blockcount > offset_fsb) {
+ del = got;
+ xfs_trim_extent(&del, offset_fsb, end_fsb - offset_fsb);
+
+ /* Extent delete may have bumped idx forward */
+ if (!del.br_blockcount) {
+ idx--;
goto next_extent;
+ }
+
+ ASSERT(!isnullstartblock(got.br_startblock));
/* Unmap the old blocks in the data fork. */
- while (rlen) {
- xfs_defer_init(&dfops, &firstfsb);
- error = __xfs_bunmapi(tp, ip, irec.br_startoff,
- &rlen, 0, 1, &firstfsb, &dfops);
- if (error)
- goto out_defer;
-
- /*
- * Trim the extent to whatever got unmapped.
- * Remember, bunmapi works backwards.
- */
- uirec.br_startblock = irec.br_startblock + rlen;
- uirec.br_startoff = irec.br_startoff + rlen;
- uirec.br_blockcount = irec.br_blockcount - rlen;
- irec.br_blockcount = rlen;
- trace_xfs_reflink_cow_remap_piece(ip, &uirec);
+ xfs_defer_init(&dfops, &firstfsb);
+ rlen = del.br_blockcount;
+ error = __xfs_bunmapi(tp, ip, del.br_startoff, &rlen, 0, 1,
+ &firstfsb, &dfops);
+ if (error)
+ goto out_defer;
- /* Free the CoW orphan record. */
- error = xfs_refcount_free_cow_extent(tp->t_mountp,
- &dfops, uirec.br_startblock,
- uirec.br_blockcount);
- if (error)
- goto out_defer;
+ /* Trim the extent to whatever got unmapped. */
+ if (rlen) {
+ xfs_trim_extent(&del, del.br_startoff + rlen,
+ del.br_blockcount - rlen);
+ }
+ trace_xfs_reflink_cow_remap(ip, &del);
- /* Map the new blocks into the data fork. */
- error = xfs_bmap_map_extent(tp->t_mountp, &dfops,
- ip, &uirec);
- if (error)
- goto out_defer;
+ /* Free the CoW orphan record. */
+ error = xfs_refcount_free_cow_extent(tp->t_mountp, &dfops,
+ del.br_startblock, del.br_blockcount);
+ if (error)
+ goto out_defer;
- /* Remove the mapping from the CoW fork. */
- error = xfs_bunmapi_cow(ip, &uirec);
- if (error)
- goto out_defer;
+ /* Map the new blocks into the data fork. */
+ error = xfs_bmap_map_extent(tp->t_mountp, &dfops, ip, &del);
+ if (error)
+ goto out_defer;
- error = xfs_defer_finish(&tp, &dfops, ip);
- if (error)
- goto out_defer;
- }
+ /* Remove the mapping from the CoW fork. */
+ xfs_bmap_del_extent_cow(ip, &idx, &got, &del);
+
+ error = xfs_defer_finish(&tp, &dfops, ip);
+ if (error)
+ goto out_defer;
next_extent:
- /* Roll on... */
- offset_fsb = irec.br_startoff + ilen;
+ if (idx < 0)
+ break;
+ xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx), &got);
}
error = xfs_trans_commit(tp);
@@ -774,7 +737,6 @@ next_extent:
out_defer:
xfs_defer_cancel(&dfops);
-out_cancel:
xfs_trans_cancel(tp);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
out:
@@ -1312,19 +1274,26 @@ out_error:
*/
int
xfs_reflink_remap_range(
- struct xfs_inode *src,
- xfs_off_t srcoff,
- struct xfs_inode *dest,
- xfs_off_t destoff,
- xfs_off_t len,
- unsigned int flags)
+ struct file *file_in,
+ loff_t pos_in,
+ struct file *file_out,
+ loff_t pos_out,
+ u64 len,
+ bool is_dedupe)
{
+ struct inode *inode_in = file_inode(file_in);
+ struct xfs_inode *src = XFS_I(inode_in);
+ struct inode *inode_out = file_inode(file_out);
+ struct xfs_inode *dest = XFS_I(inode_out);
struct xfs_mount *mp = src->i_mount;
+ loff_t bs = inode_out->i_sb->s_blocksize;
+ bool same_inode = (inode_in == inode_out);
xfs_fileoff_t sfsbno, dfsbno;
xfs_filblks_t fsblen;
- int error;
xfs_extlen_t cowextsize;
- bool is_same;
+ loff_t isize;
+ ssize_t ret;
+ loff_t blen;
if (!xfs_sb_version_hasreflink(&mp->m_sb))
return -EOPNOTSUPP;
@@ -1332,17 +1301,8 @@ xfs_reflink_remap_range(
if (XFS_FORCED_SHUTDOWN(mp))
return -EIO;
- /* Don't reflink realtime inodes */
- if (XFS_IS_REALTIME_INODE(src) || XFS_IS_REALTIME_INODE(dest))
- return -EINVAL;
-
- if (flags & ~XFS_REFLINK_ALL)
- return -EINVAL;
-
- trace_xfs_reflink_remap_range(src, srcoff, len, dest, destoff);
-
/* Lock both files against IO */
- if (src->i_ino == dest->i_ino) {
+ if (same_inode) {
xfs_ilock(src, XFS_IOLOCK_EXCL);
xfs_ilock(src, XFS_MMAPLOCK_EXCL);
} else {
@@ -1350,39 +1310,126 @@ xfs_reflink_remap_range(
xfs_lock_two_inodes(src, dest, XFS_MMAPLOCK_EXCL);
}
+ /* Don't touch certain kinds of inodes */
+ ret = -EPERM;
+ if (IS_IMMUTABLE(inode_out))
+ goto out_unlock;
+
+ ret = -ETXTBSY;
+ if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out))
+ goto out_unlock;
+
+
+ /* Don't reflink dirs, pipes, sockets... */
+ ret = -EISDIR;
+ if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
+ goto out_unlock;
+ ret = -EINVAL;
+ if (S_ISFIFO(inode_in->i_mode) || S_ISFIFO(inode_out->i_mode))
+ goto out_unlock;
+ if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
+ goto out_unlock;
+
+ /* Don't reflink realtime inodes */
+ if (XFS_IS_REALTIME_INODE(src) || XFS_IS_REALTIME_INODE(dest))
+ goto out_unlock;
+
+ /* Don't share DAX file data for now. */
+ if (IS_DAX(inode_in) || IS_DAX(inode_out))
+ goto out_unlock;
+
+ /* Are we going all the way to the end? */
+ isize = i_size_read(inode_in);
+ if (isize == 0) {
+ ret = 0;
+ goto out_unlock;
+ }
+
+ if (len == 0)
+ len = isize - pos_in;
+
+ /* Ensure offsets don't wrap and the input is inside i_size */
+ if (pos_in + len < pos_in || pos_out + len < pos_out ||
+ pos_in + len > isize)
+ goto out_unlock;
+
+ /* Don't allow dedupe past EOF in the dest file */
+ if (is_dedupe) {
+ loff_t disize;
+
+ disize = i_size_read(inode_out);
+ if (pos_out >= disize || pos_out + len > disize)
+ goto out_unlock;
+ }
+
+ /* If we're linking to EOF, continue to the block boundary. */
+ if (pos_in + len == isize)
+ blen = ALIGN(isize, bs) - pos_in;
+ else
+ blen = len;
+
+ /* Only reflink if we're aligned to block boundaries */
+ if (!IS_ALIGNED(pos_in, bs) || !IS_ALIGNED(pos_in + blen, bs) ||
+ !IS_ALIGNED(pos_out, bs) || !IS_ALIGNED(pos_out + blen, bs))
+ goto out_unlock;
+
+ /* Don't allow overlapped reflink within the same file */
+ if (same_inode) {
+ if (pos_out + blen > pos_in && pos_out < pos_in + blen)
+ goto out_unlock;
+ }
+
+ /* Wait for the completion of any pending IOs on both files */
+ inode_dio_wait(inode_in);
+ if (!same_inode)
+ inode_dio_wait(inode_out);
+
+ ret = filemap_write_and_wait_range(inode_in->i_mapping,
+ pos_in, pos_in + len - 1);
+ if (ret)
+ goto out_unlock;
+
+ ret = filemap_write_and_wait_range(inode_out->i_mapping,
+ pos_out, pos_out + len - 1);
+ if (ret)
+ goto out_unlock;
+
+ trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out);
+
/*
* Check that the extents are the same.
*/
- if (flags & XFS_REFLINK_DEDUPE) {
- is_same = false;
- error = xfs_compare_extents(VFS_I(src), srcoff, VFS_I(dest),
- destoff, len, &is_same);
- if (error)
- goto out_error;
+ if (is_dedupe) {
+ bool is_same = false;
+
+ ret = xfs_compare_extents(inode_in, pos_in, inode_out, pos_out,
+ len, &is_same);
+ if (ret)
+ goto out_unlock;
if (!is_same) {
- error = -EBADE;
- goto out_error;
+ ret = -EBADE;
+ goto out_unlock;
}
}
- error = xfs_reflink_set_inode_flag(src, dest);
- if (error)
- goto out_error;
+ ret = xfs_reflink_set_inode_flag(src, dest);
+ if (ret)
+ goto out_unlock;
/*
* Invalidate the page cache so that we can clear any CoW mappings
* in the destination file.
*/
- truncate_inode_pages_range(&VFS_I(dest)->i_data, destoff,
- PAGE_ALIGN(destoff + len) - 1);
+ truncate_inode_pages_range(&inode_out->i_data, pos_out,
+ PAGE_ALIGN(pos_out + len) - 1);
- dfsbno = XFS_B_TO_FSBT(mp, destoff);
- sfsbno = XFS_B_TO_FSBT(mp, srcoff);
+ dfsbno = XFS_B_TO_FSBT(mp, pos_out);
+ sfsbno = XFS_B_TO_FSBT(mp, pos_in);
fsblen = XFS_B_TO_FSB(mp, len);
- error = xfs_reflink_remap_blocks(src, sfsbno, dest, dfsbno, fsblen,
- destoff + len);
- if (error)
- goto out_error;
+ ret = xfs_reflink_remap_blocks(src, sfsbno, dest, dfsbno, fsblen,
+ pos_out + len);
+ if (ret)
+ goto out_unlock;
/*
* Carry the cowextsize hint from src to dest if we're sharing the
@@ -1390,26 +1437,24 @@ xfs_reflink_remap_range(
* has a cowextsize hint, and the destination file does not.
*/
cowextsize = 0;
- if (srcoff == 0 && len == i_size_read(VFS_I(src)) &&
+ if (pos_in == 0 && len == i_size_read(inode_in) &&
(src->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE) &&
- destoff == 0 && len >= i_size_read(VFS_I(dest)) &&
+ pos_out == 0 && len >= i_size_read(inode_out) &&
!(dest->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE))
cowextsize = src->i_d.di_cowextsize;
- error = xfs_reflink_update_dest(dest, destoff + len, cowextsize);
- if (error)
- goto out_error;
+ ret = xfs_reflink_update_dest(dest, pos_out + len, cowextsize);
-out_error:
+out_unlock:
xfs_iunlock(src, XFS_MMAPLOCK_EXCL);
xfs_iunlock(src, XFS_IOLOCK_EXCL);
if (src->i_ino != dest->i_ino) {
xfs_iunlock(dest, XFS_MMAPLOCK_EXCL);
xfs_iunlock(dest, XFS_IOLOCK_EXCL);
}
- if (error)
- trace_xfs_reflink_remap_range_error(dest, error, _RET_IP_);
- return error;
+ if (ret)
+ trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_);
+ return ret;
}
/*
diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h
index 5dc3c8ac12aa..fad11607c9ad 100644
--- a/fs/xfs/xfs_reflink.h
+++ b/fs/xfs/xfs_reflink.h
@@ -26,8 +26,8 @@ extern int xfs_reflink_find_shared(struct xfs_mount *mp, xfs_agnumber_t agno,
extern int xfs_reflink_trim_around_shared(struct xfs_inode *ip,
struct xfs_bmbt_irec *irec, bool *shared, bool *trimmed);
-extern int xfs_reflink_reserve_cow_range(struct xfs_inode *ip,
- xfs_off_t offset, xfs_off_t count);
+extern int xfs_reflink_reserve_cow(struct xfs_inode *ip,
+ struct xfs_bmbt_irec *imap, bool *shared);
extern int xfs_reflink_allocate_cow_range(struct xfs_inode *ip,
xfs_off_t offset, xfs_off_t count);
extern bool xfs_reflink_find_cow_mapping(struct xfs_inode *ip, xfs_off_t offset,
@@ -43,11 +43,8 @@ extern int xfs_reflink_cancel_cow_range(struct xfs_inode *ip, xfs_off_t offset,
extern int xfs_reflink_end_cow(struct xfs_inode *ip, xfs_off_t offset,
xfs_off_t count);
extern int xfs_reflink_recover_cow(struct xfs_mount *mp);
-#define XFS_REFLINK_DEDUPE 1 /* only reflink if contents match */
-#define XFS_REFLINK_ALL (XFS_REFLINK_DEDUPE)
-extern int xfs_reflink_remap_range(struct xfs_inode *src, xfs_off_t srcoff,
- struct xfs_inode *dest, xfs_off_t destoff, xfs_off_t len,
- unsigned int flags);
+extern int xfs_reflink_remap_range(struct file *file_in, loff_t pos_in,
+ struct file *file_out, loff_t pos_out, u64 len, bool is_dedupe);
extern int xfs_reflink_clear_inode_flag(struct xfs_inode *ip,
struct xfs_trans **tpp);
extern int xfs_reflink_unshare(struct xfs_inode *ip, xfs_off_t offset,
diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c
index 5f8d55d29a11..276d3023d60f 100644
--- a/fs/xfs/xfs_sysfs.c
+++ b/fs/xfs/xfs_sysfs.c
@@ -512,13 +512,13 @@ static struct attribute *xfs_error_attrs[] = {
};
-struct kobj_type xfs_error_cfg_ktype = {
+static struct kobj_type xfs_error_cfg_ktype = {
.release = xfs_sysfs_release,
.sysfs_ops = &xfs_sysfs_ops,
.default_attrs = xfs_error_attrs,
};
-struct kobj_type xfs_error_ktype = {
+static struct kobj_type xfs_error_ktype = {
.release = xfs_sysfs_release,
.sysfs_ops = &xfs_sysfs_ops,
};
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index ad188d3a83f3..0907752be62d 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -3346,7 +3346,7 @@ DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_alloc);
DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_found);
DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_enospc);
-DEFINE_RW_EVENT(xfs_reflink_reserve_cow_range);
+DEFINE_RW_EVENT(xfs_reflink_reserve_cow);
DEFINE_RW_EVENT(xfs_reflink_allocate_cow_range);
DEFINE_INODE_IREC_EVENT(xfs_reflink_bounce_dio_write);
@@ -3356,9 +3356,7 @@ DEFINE_INODE_IREC_EVENT(xfs_reflink_trim_irec);
DEFINE_SIMPLE_IO_EVENT(xfs_reflink_cancel_cow_range);
DEFINE_SIMPLE_IO_EVENT(xfs_reflink_end_cow);
DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_remap);
-DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_remap_piece);
-DEFINE_INODE_ERROR_EVENT(xfs_reflink_reserve_cow_range_error);
DEFINE_INODE_ERROR_EVENT(xfs_reflink_allocate_cow_range_error);
DEFINE_INODE_ERROR_EVENT(xfs_reflink_cancel_cow_range_error);
DEFINE_INODE_ERROR_EVENT(xfs_reflink_end_cow_error);